From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@intel.com>
Date: Tue, 4 Aug 2015 12:17:53 -0700
Subject: Add the rt linux 4.1.3-rt3 as base

Import the rt linux 4.1.3-rt3 as OPNFV kvm base.

It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and
the base is:

commit 0917f823c59692d751951bf5ea699a2d1e2f26a2
Author: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date:   Sat Jul 25 12:13:34 2015 +0200

    Prepare v4.1.3-rt3

    Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

We lose all the git history this way and it's not good. We
should apply another opnfv project repo in future.

Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
---
 kernel/fs/9p/Kconfig                             |    46 +
 kernel/fs/9p/Makefile                            |    19 +
 kernel/fs/9p/acl.c                               |   381 +
 kernel/fs/9p/acl.h                               |    55 +
 kernel/fs/9p/cache.c                             |   414 +
 kernel/fs/9p/cache.h                             |   151 +
 kernel/fs/9p/fid.c                               |   306 +
 kernel/fs/9p/fid.h                               |    30 +
 kernel/fs/9p/v9fs.c                              |   685 +
 kernel/fs/9p/v9fs.h                              |   227 +
 kernel/fs/9p/v9fs_vfs.h                          |    86 +
 kernel/fs/9p/vfs_addr.c                          |   351 +
 kernel/fs/9p/vfs_dentry.c                        |   122 +
 kernel/fs/9p/vfs_dir.c                           |   261 +
 kernel/fs/9p/vfs_file.c                          |   705 ++
 kernel/fs/9p/vfs_inode.c                         |  1537 +++
 kernel/fs/9p/vfs_inode_dotl.c                    |  1016 ++
 kernel/fs/9p/vfs_super.c                         |   370 +
 kernel/fs/9p/xattr.c                             |   151 +
 kernel/fs/9p/xattr.h                             |    37 +
 kernel/fs/9p/xattr_security.c                    |    80 +
 kernel/fs/9p/xattr_trusted.c                     |    80 +
 kernel/fs/9p/xattr_user.c                        |    80 +
 kernel/fs/Kconfig                                |   281 +
 kernel/fs/Kconfig.binfmt                         |   181 +
 kernel/fs/Makefile                               |   129 +
 kernel/fs/adfs/Kconfig                           |    27 +
 kernel/fs/adfs/Makefile                          |     7 +
 kernel/fs/adfs/adfs.h                            |   208 +
 kernel/fs/adfs/dir.c                             |   288 +
 kernel/fs/adfs/dir_f.c                           |   486 +
 kernel/fs/adfs/dir_f.h                           |    65 +
 kernel/fs/adfs/dir_fplus.c                       |   265 +
 kernel/fs/adfs/dir_fplus.h                       |    45 +
 kernel/fs/adfs/file.c                            |    35 +
 kernel/fs/adfs/inode.c                           |   371 +
 kernel/fs/adfs/map.c                             |   290 +
 kernel/fs/adfs/super.c                           |   562 +
 kernel/fs/affs/Changes                           |   343 +
 kernel/fs/affs/Kconfig                           |    21 +
 kernel/fs/affs/Makefile                          |     9 +
 kernel/fs/affs/affs.h                            |   314 +
 kernel/fs/affs/amigaffs.c                        |   515 +
 kernel/fs/affs/bitmap.c                          |   364 +
 kernel/fs/affs/dir.c                             |   142 +
 kernel/fs/affs/file.c                            |   982 ++
 kernel/fs/affs/inode.c                           |   416 +
 kernel/fs/affs/namei.c                           |   462 +
 kernel/fs/affs/super.c                           |   649 +
 kernel/fs/affs/symlink.c                         |    81 +
 kernel/fs/afs/Kconfig                            |    29 +
 kernel/fs/afs/Makefile                           |    32 +
 kernel/fs/afs/afs.h                              |   170 +
 kernel/fs/afs/afs_cm.h                           |    33 +
 kernel/fs/afs/afs_fs.h                           |    56 +
 kernel/fs/afs/afs_vl.h                           |    84 +
 kernel/fs/afs/cache.c                            |   402 +
 kernel/fs/afs/callback.c                         |   475 +
 kernel/fs/afs/cell.c                             |   460 +
 kernel/fs/afs/cmservice.c                        |   604 +
 kernel/fs/afs/dir.c                              |  1123 ++
 kernel/fs/afs/file.c                             |   378 +
 kernel/fs/afs/flock.c                            |   585 +
 kernel/fs/afs/fsclient.c                         |  1911 +++
 kernel/fs/afs/inode.c                            |   498 +
 kernel/fs/afs/internal.h                         |   889 ++
 kernel/fs/afs/main.c                             |   184 +
 kernel/fs/afs/misc.c                             |    89 +
 kernel/fs/afs/mntpt.c                            |   270 +
 kernel/fs/afs/netdevices.c                       |    68 +
 kernel/fs/afs/proc.c                             |   666 +
 kernel/fs/afs/rxrpc.c                            |   862 ++
 kernel/fs/afs/security.c                         |   363 +
 kernel/fs/afs/server.c                           |   322 +
 kernel/fs/afs/super.c                            |   557 +
 kernel/fs/afs/vlclient.c                         |   219 +
 kernel/fs/afs/vlocation.c                        |   718 ++
 kernel/fs/afs/vnode.c                            |  1025 ++
 kernel/fs/afs/volume.c                           |   401 +
 kernel/fs/afs/write.c                            |   761 ++
 kernel/fs/aio.c                                  |  1742 +++
 kernel/fs/anon_inodes.c                          |   179 +
 kernel/fs/attr.c                                 |   278 +
 kernel/fs/autofs4/Kconfig                        |    20 +
 kernel/fs/autofs4/Makefile                       |     7 +
 kernel/fs/autofs4/autofs_i.h                     |   284 +
 kernel/fs/autofs4/dev-ioctl.c                    |   762 ++
 kernel/fs/autofs4/expire.c                       |   603 +
 kernel/fs/autofs4/init.c                         |    52 +
 kernel/fs/autofs4/inode.c                        |   370 +
 kernel/fs/autofs4/root.c                         |   923 ++
 kernel/fs/autofs4/symlink.c                      |    28 +
 kernel/fs/autofs4/waitq.c                        |   565 +
 kernel/fs/bad_inode.c                            |   214 +
 kernel/fs/befs/ChangeLog                         |   417 +
 kernel/fs/befs/Kconfig                           |    26 +
 kernel/fs/befs/Makefile                          |     7 +
 kernel/fs/befs/TODO                              |    14 +
 kernel/fs/befs/befs.h                            |   157 +
 kernel/fs/befs/befs_fs_types.h                   |   255 +
 kernel/fs/befs/btree.c                           |   793 ++
 kernel/fs/befs/btree.h                           |    13 +
 kernel/fs/befs/datastream.c                      |   529 +
 kernel/fs/befs/datastream.h                      |    19 +
 kernel/fs/befs/debug.c                           |   258 +
 kernel/fs/befs/endian.h                          |   125 +
 kernel/fs/befs/inode.c                           |    54 +
 kernel/fs/befs/inode.h                           |     8 +
 kernel/fs/befs/io.c                              |    85 +
 kernel/fs/befs/io.h                              |     9 +
 kernel/fs/befs/linuxvfs.c                        |   981 ++
 kernel/fs/befs/super.c                           |   112 +
 kernel/fs/befs/super.h                           |     8 +
 kernel/fs/bfs/Kconfig                            |    19 +
 kernel/fs/bfs/Makefile                           |     7 +
 kernel/fs/bfs/bfs.h                              |    60 +
 kernel/fs/bfs/dir.c                              |   365 +
 kernel/fs/bfs/file.c                             |   197 +
 kernel/fs/bfs/inode.c                            |   499 +
 kernel/fs/binfmt_aout.c                          |   423 +
 kernel/fs/binfmt_elf.c                           |  2326 ++++
 kernel/fs/binfmt_elf_fdpic.c                     |  1794 +++
 kernel/fs/binfmt_em86.c                          |   117 +
 kernel/fs/binfmt_flat.c                          |   953 ++
 kernel/fs/binfmt_misc.c                          |   835 ++
 kernel/fs/binfmt_script.c                        |   129 +
 kernel/fs/block_dev.c                            |  1795 +++
 kernel/fs/btrfs/Kconfig                          |    91 +
 kernel/fs/btrfs/Makefile                         |    19 +
 kernel/fs/btrfs/acl.c                            |   166 +
 kernel/fs/btrfs/async-thread.c                   |   365 +
 kernel/fs/btrfs/async-thread.h                   |    81 +
 kernel/fs/btrfs/backref.c                        |  1978 +++
 kernel/fs/btrfs/backref.h                        |    77 +
 kernel/fs/btrfs/btrfs_inode.h                    |   328 +
 kernel/fs/btrfs/check-integrity.c                |  3245 +++++
 kernel/fs/btrfs/check-integrity.h                |    38 +
 kernel/fs/btrfs/compression.c                    |  1091 ++
 kernel/fs/btrfs/compression.h                    |    83 +
 kernel/fs/btrfs/ctree.c                          |  5910 +++++++++
 kernel/fs/btrfs/ctree.h                          |  4247 +++++++
 kernel/fs/btrfs/delayed-inode.c                  |  1997 +++
 kernel/fs/btrfs/delayed-inode.h                  |   156 +
 kernel/fs/btrfs/delayed-ref.c                    |   962 ++
 kernel/fs/btrfs/delayed-ref.h                    |   276 +
 kernel/fs/btrfs/dev-replace.c                    |   937 ++
 kernel/fs/btrfs/dev-replace.h                    |    44 +
 kernel/fs/btrfs/dir-item.c                       |   482 +
 kernel/fs/btrfs/disk-io.c                        |  4338 +++++++
 kernel/fs/btrfs/disk-io.h                        |   159 +
 kernel/fs/btrfs/export.c                         |   304 +
 kernel/fs/btrfs/export.h                         |    19 +
 kernel/fs/btrfs/extent-tree.c                    | 10174 +++++++++++++++
 kernel/fs/btrfs/extent_io.c                      |  5632 +++++++++
 kernel/fs/btrfs/extent_io.h                      |   382 +
 kernel/fs/btrfs/extent_map.c                     |   455 +
 kernel/fs/btrfs/extent_map.h                     |    85 +
 kernel/fs/btrfs/file-item.c                      |   953 ++
 kernel/fs/btrfs/file.c                           |  2860 +++++
 kernel/fs/btrfs/free-space-cache.c               |  3653 ++++++
 kernel/fs/btrfs/free-space-cache.h               |   133 +
 kernel/fs/btrfs/hash.c                           |    46 +
 kernel/fs/btrfs/hash.h                           |    42 +
 kernel/fs/btrfs/inode-item.c                     |   440 +
 kernel/fs/btrfs/inode-map.c                      |   568 +
 kernel/fs/btrfs/inode-map.h                      |    13 +
 kernel/fs/btrfs/inode.c                          |  9907 +++++++++++++++
 kernel/fs/btrfs/ioctl.c                          |  5359 ++++++++
 kernel/fs/btrfs/locking.c                        |   300 +
 kernel/fs/btrfs/locking.h                        |    62 +
 kernel/fs/btrfs/lzo.c                            |   443 +
 kernel/fs/btrfs/math.h                           |    42 +
 kernel/fs/btrfs/ordered-data.c                   |  1051 ++
 kernel/fs/btrfs/ordered-data.h                   |   212 +
 kernel/fs/btrfs/orphan.c                         |    71 +
 kernel/fs/btrfs/print-tree.c                     |   349 +
 kernel/fs/btrfs/print-tree.h                     |    23 +
 kernel/fs/btrfs/props.c                          |   429 +
 kernel/fs/btrfs/props.h                          |    42 +
 kernel/fs/btrfs/qgroup.c                         |  2966 +++++
 kernel/fs/btrfs/qgroup.h                         |   107 +
 kernel/fs/btrfs/raid56.c                         |  2670 ++++
 kernel/fs/btrfs/raid56.h                         |    62 +
 kernel/fs/btrfs/rcu-string.h                     |    56 +
 kernel/fs/btrfs/reada.c                          |   992 ++
 kernel/fs/btrfs/relocation.c                     |  4658 +++++++
 kernel/fs/btrfs/root-tree.c                      |   497 +
 kernel/fs/btrfs/scrub.c                          |  4228 +++++++
 kernel/fs/btrfs/send.c                           |  5962 +++++++++
 kernel/fs/btrfs/send.h                           |   134 +
 kernel/fs/btrfs/struct-funcs.c                   |   142 +
 kernel/fs/btrfs/super.c                          |  2220 ++++
 kernel/fs/btrfs/sysfs.c                          |   758 ++
 kernel/fs/btrfs/sysfs.h                          |    89 +
 kernel/fs/btrfs/tests/btrfs-tests.c              |   171 +
 kernel/fs/btrfs/tests/btrfs-tests.h              |    68 +
 kernel/fs/btrfs/tests/extent-buffer-tests.c      |   229 +
 kernel/fs/btrfs/tests/extent-io-tests.c          |   275 +
 kernel/fs/btrfs/tests/free-space-tests.c         |   909 ++
 kernel/fs/btrfs/tests/inode-tests.c              |  1123 ++
 kernel/fs/btrfs/tests/qgroup-tests.c             |   469 +
 kernel/fs/btrfs/transaction.c                    |  2204 ++++
 kernel/fs/btrfs/transaction.h                    |   194 +
 kernel/fs/btrfs/tree-defrag.c                    |   139 +
 kernel/fs/btrfs/tree-log.c                       |  5089 ++++++++
 kernel/fs/btrfs/tree-log.h                       |    85 +
 kernel/fs/btrfs/ulist.c                          |   251 +
 kernel/fs/btrfs/ulist.h                          |    80 +
 kernel/fs/btrfs/uuid-tree.c                      |   354 +
 kernel/fs/btrfs/volumes.c                        |  6730 ++++++++++
 kernel/fs/btrfs/volumes.h                        |   541 +
 kernel/fs/btrfs/xattr.c                          |   517 +
 kernel/fs/btrfs/xattr.h                          |    41 +
 kernel/fs/btrfs/zlib.c                           |   412 +
 kernel/fs/buffer.c                               |  3422 +++++
 kernel/fs/cachefiles/Kconfig                     |    39 +
 kernel/fs/cachefiles/Makefile                    |    18 +
 kernel/fs/cachefiles/bind.c                      |   277 +
 kernel/fs/cachefiles/daemon.c                    |   751 ++
 kernel/fs/cachefiles/interface.c                 |   557 +
 kernel/fs/cachefiles/internal.h                  |   363 +
 kernel/fs/cachefiles/key.c                       |   159 +
 kernel/fs/cachefiles/main.c                      |   105 +
 kernel/fs/cachefiles/namei.c                     |   978 ++
 kernel/fs/cachefiles/proc.c                      |   134 +
 kernel/fs/cachefiles/rdwr.c                      |   967 ++
 kernel/fs/cachefiles/security.c                  |   116 +
 kernel/fs/cachefiles/xattr.c                     |   324 +
 kernel/fs/ceph/Kconfig                           |    40 +
 kernel/fs/ceph/Makefile                          |    13 +
 kernel/fs/ceph/acl.c                             |   263 +
 kernel/fs/ceph/addr.c                            |  1599 +++
 kernel/fs/ceph/cache.c                           |   402 +
 kernel/fs/ceph/cache.h                           |   182 +
 kernel/fs/ceph/caps.c                            |  3456 +++++
 kernel/fs/ceph/ceph_frag.c                       |    22 +
 kernel/fs/ceph/debugfs.c                         |   321 +
 kernel/fs/ceph/dir.c                             |  1411 +++
 kernel/fs/ceph/export.c                          |   250 +
 kernel/fs/ceph/file.c                            |  1344 ++
 kernel/fs/ceph/inode.c                           |  2016 +++
 kernel/fs/ceph/ioctl.c                           |   295 +
 kernel/fs/ceph/ioctl.h                           |   100 +
 kernel/fs/ceph/locks.c                           |   381 +
 kernel/fs/ceph/mds_client.c                      |  3871 ++++++
 kernel/fs/ceph/mds_client.h                      |   406 +
 kernel/fs/ceph/mdsmap.c                          |   189 +
 kernel/fs/ceph/snap.c                            |   977 ++
 kernel/fs/ceph/strings.c                         |   125 +
 kernel/fs/ceph/super.c                           |  1059 ++
 kernel/fs/ceph/super.h                           |   946 ++
 kernel/fs/ceph/xattr.c                           |  1117 ++
 kernel/fs/char_dev.c                             |   569 +
 kernel/fs/cifs/Kconfig                           |   202 +
 kernel/fs/cifs/Makefile                          |    20 +
 kernel/fs/cifs/asn1.c                            |   623 +
 kernel/fs/cifs/cache.c                           |   333 +
 kernel/fs/cifs/cifs_debug.c                      |   701 ++
 kernel/fs/cifs/cifs_debug.h                      |    77 +
 kernel/fs/cifs/cifs_dfs_ref.c                    |   379 +
 kernel/fs/cifs/cifs_fs_sb.h                      |    71 +
 kernel/fs/cifs/cifs_spnego.c                     |   179 +
 kernel/fs/cifs/cifs_spnego.h                     |    47 +
 kernel/fs/cifs/cifs_unicode.c                    |   611 +
 kernel/fs/cifs/cifs_unicode.h                    |   418 +
 kernel/fs/cifs/cifs_uniupr.h                     |   253 +
 kernel/fs/cifs/cifsacl.c                         |  1119 ++
 kernel/fs/cifs/cifsacl.h                         |   101 +
 kernel/fs/cifs/cifsencrypt.c                     |   818 ++
 kernel/fs/cifs/cifsfs.c                          |  1308 ++
 kernel/fs/cifs/cifsfs.h                          |   140 +
 kernel/fs/cifs/cifsglob.h                        |  1620 +++
 kernel/fs/cifs/cifspdu.h                         |  2739 ++++
 kernel/fs/cifs/cifsproto.h                       |   514 +
 kernel/fs/cifs/cifssmb.c                         |  6526 ++++++++++
 kernel/fs/cifs/connect.c                         |  4137 ++++++
 kernel/fs/cifs/dir.c                             |   924 ++
 kernel/fs/cifs/dns_resolve.c                     |    99 +
 kernel/fs/cifs/dns_resolve.h                     |    30 +
 kernel/fs/cifs/export.c                          |    67 +
 kernel/fs/cifs/file.c                            |  3896 ++++++
 kernel/fs/cifs/fscache.c                         |   242 +
 kernel/fs/cifs/fscache.h                         |   149 +
 kernel/fs/cifs/inode.c                           |  2454 ++++
 kernel/fs/cifs/ioctl.c                           |   217 +
 kernel/fs/cifs/link.c                            |   746 ++
 kernel/fs/cifs/misc.c                            |   641 +
 kernel/fs/cifs/netmisc.c                         |  1014 ++
 kernel/fs/cifs/nterr.c                           |   687 +
 kernel/fs/cifs/nterr.h                           |   563 +
 kernel/fs/cifs/ntlmssp.h                         |   138 +
 kernel/fs/cifs/readdir.c                         |   875 ++
 kernel/fs/cifs/rfc1002pdu.h                      |    74 +
 kernel/fs/cifs/sess.c                            |  1442 +++
 kernel/fs/cifs/smb1ops.c                         |  1116 ++
 kernel/fs/cifs/smb2file.c                        |   265 +
 kernel/fs/cifs/smb2glob.h                        |    63 +
 kernel/fs/cifs/smb2inode.c                       |   273 +
 kernel/fs/cifs/smb2maperror.c                    |  2481 ++++
 kernel/fs/cifs/smb2misc.c                        |   632 +
 kernel/fs/cifs/smb2ops.c                         |  1716 +++
 kernel/fs/cifs/smb2pdu.c                         |  2661 ++++
 kernel/fs/cifs/smb2pdu.h                         |  1080 ++
 kernel/fs/cifs/smb2proto.h                       |   174 +
 kernel/fs/cifs/smb2status.h                      |  1782 +++
 kernel/fs/cifs/smb2transport.c                   |   612 +
 kernel/fs/cifs/smbencrypt.c                      |   249 +
 kernel/fs/cifs/smberr.h                          |   184 +
 kernel/fs/cifs/smbfsctl.h                        |   121 +
 kernel/fs/cifs/transport.c                       |  1113 ++
 kernel/fs/cifs/winucase.c                        |   663 +
 kernel/fs/cifs/xattr.c                           |   424 +
 kernel/fs/coda/Kconfig                           |    21 +
 kernel/fs/coda/Makefile                          |    12 +
 kernel/fs/coda/cache.c                           |   118 +
 kernel/fs/coda/cnode.c                           |   168 +
 kernel/fs/coda/coda_cache.h                      |    22 +
 kernel/fs/coda/coda_fs_i.h                       |    58 +
 kernel/fs/coda/coda_int.h                        |    20 +
 kernel/fs/coda/coda_linux.c                      |   186 +
 kernel/fs/coda/coda_linux.h                      |   105 +
 kernel/fs/coda/dir.c                             |   579 +
 kernel/fs/coda/file.c                            |   230 +
 kernel/fs/coda/inode.c                           |   333 +
 kernel/fs/coda/pioctl.c                          |    90 +
 kernel/fs/coda/psdev.c                           |   437 +
 kernel/fs/coda/symlink.c                         |    50 +
 kernel/fs/coda/sysctl.c                          |    73 +
 kernel/fs/coda/upcall.c                          |   882 ++
 kernel/fs/compat.c                               |  1481 +++
 kernel/fs/compat_binfmt_elf.c                    |   145 +
 kernel/fs/compat_ioctl.c                         |  1638 +++
 kernel/fs/configfs/Kconfig                       |    11 +
 kernel/fs/configfs/Makefile                      |     7 +
 kernel/fs/configfs/configfs_internal.h           |   163 +
 kernel/fs/configfs/dir.c                         |  1724 +++
 kernel/fs/configfs/file.c                        |   336 +
 kernel/fs/configfs/inode.c                       |   272 +
 kernel/fs/configfs/item.c                        |   214 +
 kernel/fs/configfs/mount.c                       |   175 +
 kernel/fs/configfs/symlink.c                     |   314 +
 kernel/fs/coredump.c                             |   751 ++
 kernel/fs/cramfs/Kconfig                         |    22 +
 kernel/fs/cramfs/Makefile                        |     7 +
 kernel/fs/cramfs/README                          |   168 +
 kernel/fs/cramfs/inode.c                         |   611 +
 kernel/fs/cramfs/internal.h                      |     4 +
 kernel/fs/cramfs/uncompress.c                    |    79 +
 kernel/fs/dax.c                                  |   550 +
 kernel/fs/dcache.c                               |  3459 +++++
 kernel/fs/dcookies.c                             |   350 +
 kernel/fs/debugfs/Makefile                       |     4 +
 kernel/fs/debugfs/file.c                         |   818 ++
 kernel/fs/debugfs/inode.c                        |   736 ++
 kernel/fs/devpts/Makefile                        |     7 +
 kernel/fs/devpts/inode.c                         |   689 +
 kernel/fs/direct-io.c                            |  1332 ++
 kernel/fs/dlm/Kconfig                            |    16 +
 kernel/fs/dlm/Makefile                           |    21 +
 kernel/fs/dlm/ast.c                              |   314 +
 kernel/fs/dlm/ast.h                              |    32 +
 kernel/fs/dlm/config.c                           |  1040 ++
 kernel/fs/dlm/config.h                           |    53 +
 kernel/fs/dlm/debug_fs.c                         |   791 ++
 kernel/fs/dlm/dir.c                              |   308 +
 kernel/fs/dlm/dir.h                              |    25 +
 kernel/fs/dlm/dlm_internal.h                     |   729 ++
 kernel/fs/dlm/lock.c                             |  6304 ++++++++++
 kernel/fs/dlm/lock.h                             |    80 +
 kernel/fs/dlm/lockspace.c                        |   917 ++
 kernel/fs/dlm/lockspace.h                        |    26 +
 kernel/fs/dlm/lowcomms.c                         |  1830 +++
 kernel/fs/dlm/lowcomms.h                         |    27 +
 kernel/fs/dlm/lvb_table.h                        |    18 +
 kernel/fs/dlm/main.c                             |    97 +
 kernel/fs/dlm/member.c                           |   722 ++
 kernel/fs/dlm/member.h                           |    33 +
 kernel/fs/dlm/memory.c                           |    96 +
 kernel/fs/dlm/memory.h                           |    27 +
 kernel/fs/dlm/midcomms.c                         |   137 +
 kernel/fs/dlm/midcomms.h                         |    21 +
 kernel/fs/dlm/netlink.c                          |   136 +
 kernel/fs/dlm/plock.c                            |   515 +
 kernel/fs/dlm/rcom.c                             |   656 +
 kernel/fs/dlm/rcom.h                             |    26 +
 kernel/fs/dlm/recover.c                          |   955 ++
 kernel/fs/dlm/recover.h                          |    34 +
 kernel/fs/dlm/recoverd.c                         |   342 +
 kernel/fs/dlm/recoverd.h                         |    23 +
 kernel/fs/dlm/requestqueue.c                     |   171 +
 kernel/fs/dlm/requestqueue.h                     |    22 +
 kernel/fs/dlm/user.c                             |  1015 ++
 kernel/fs/dlm/user.h                             |    19 +
 kernel/fs/dlm/util.c                             |   154 +
 kernel/fs/dlm/util.h                             |    22 +
 kernel/fs/drop_caches.c                          |    67 +
 kernel/fs/ecryptfs/Kconfig                       |    22 +
 kernel/fs/ecryptfs/Makefile                      |    10 +
 kernel/fs/ecryptfs/crypto.c                      |  2166 ++++
 kernel/fs/ecryptfs/debug.c                       |   121 +
 kernel/fs/ecryptfs/dentry.c                      |    92 +
 kernel/fs/ecryptfs/ecryptfs_kernel.h             |   721 ++
 kernel/fs/ecryptfs/file.c                        |   375 +
 kernel/fs/ecryptfs/inode.c                       |  1138 ++
 kernel/fs/ecryptfs/keystore.c                    |  2529 ++++
 kernel/fs/ecryptfs/kthread.c                     |   172 +
 kernel/fs/ecryptfs/main.c                        |   906 ++
 kernel/fs/ecryptfs/messaging.c                   |   468 +
 kernel/fs/ecryptfs/miscdev.c                     |   511 +
 kernel/fs/ecryptfs/mmap.c                        |   561 +
 kernel/fs/ecryptfs/read_write.c                  |   273 +
 kernel/fs/ecryptfs/super.c                       |   191 +
 kernel/fs/efivarfs/Kconfig                       |    13 +
 kernel/fs/efivarfs/Makefile                      |     7 +
 kernel/fs/efivarfs/file.c                        |   111 +
 kernel/fs/efivarfs/inode.c                       |   162 +
 kernel/fs/efivarfs/internal.h                    |    22 +
 kernel/fs/efivarfs/super.c                       |   267 +
 kernel/fs/efs/Kconfig                            |    14 +
 kernel/fs/efs/Makefile                           |     7 +
 kernel/fs/efs/dir.c                              |   103 +
 kernel/fs/efs/efs.h                              |   146 +
 kernel/fs/efs/file.c                             |    56 +
 kernel/fs/efs/inode.c                            |   311 +
 kernel/fs/efs/namei.c                            |   119 +
 kernel/fs/efs/super.c                            |   353 +
 kernel/fs/efs/symlink.c                          |    54 +
 kernel/fs/eventfd.c                              |   449 +
 kernel/fs/eventpoll.c                            |  2133 ++++
 kernel/fs/exec.c                                 |  1747 +++
 kernel/fs/exofs/BUGS                             |     3 +
 kernel/fs/exofs/Kbuild                           |    20 +
 kernel/fs/exofs/Kconfig                          |    13 +
 kernel/fs/exofs/Kconfig.ore                      |    14 +
 kernel/fs/exofs/common.h                         |   262 +
 kernel/fs/exofs/dir.c                            |   667 +
 kernel/fs/exofs/exofs.h                          |   245 +
 kernel/fs/exofs/file.c                           |    83 +
 kernel/fs/exofs/inode.c                          |  1524 +++
 kernel/fs/exofs/namei.c                          |   320 +
 kernel/fs/exofs/ore.c                            |  1164 ++
 kernel/fs/exofs/ore_raid.c                       |   721 ++
 kernel/fs/exofs/ore_raid.h                       |    62 +
 kernel/fs/exofs/super.c                          |  1049 ++
 kernel/fs/exofs/symlink.c                        |    55 +
 kernel/fs/exofs/sys.c                            |   205 +
 kernel/fs/exportfs/Makefile                      |     6 +
 kernel/fs/exportfs/expfs.c                       |   544 +
 kernel/fs/ext2/Kconfig                           |    44 +
 kernel/fs/ext2/Makefile                          |    12 +
 kernel/fs/ext2/acl.c                             |   257 +
 kernel/fs/ext2/acl.h                             |    71 +
 kernel/fs/ext2/balloc.c                          |  1536 +++
 kernel/fs/ext2/dir.c                             |   730 ++
 kernel/fs/ext2/ext2.h                            |   820 ++
 kernel/fs/ext2/file.c                            |   121 +
 kernel/fs/ext2/ialloc.c                          |   675 +
 kernel/fs/ext2/inode.c                           |  1573 +++
 kernel/fs/ext2/ioctl.c                           |   188 +
 kernel/fs/ext2/namei.c                           |   431 +
 kernel/fs/ext2/super.c                           |  1580 +++
 kernel/fs/ext2/symlink.c                         |    54 +
 kernel/fs/ext2/xattr.c                           |  1029 ++
 kernel/fs/ext2/xattr.h                           |   125 +
 kernel/fs/ext2/xattr_security.c                  |    74 +
 kernel/fs/ext2/xattr_trusted.c                   |    54 +
 kernel/fs/ext2/xattr_user.c                      |    61 +
 kernel/fs/ext3/Kconfig                           |    89 +
 kernel/fs/ext3/Makefile                          |    12 +
 kernel/fs/ext3/acl.c                             |   281 +
 kernel/fs/ext3/acl.h                             |    72 +
 kernel/fs/ext3/balloc.c                          |  2158 ++++
 kernel/fs/ext3/bitmap.c                          |    20 +
 kernel/fs/ext3/dir.c                             |   537 +
 kernel/fs/ext3/ext3.h                            |  1332 ++
 kernel/fs/ext3/ext3_jbd.c                        |    59 +
 kernel/fs/ext3/file.c                            |    79 +
 kernel/fs/ext3/fsync.c                           |   109 +
 kernel/fs/ext3/hash.c                            |   206 +
 kernel/fs/ext3/ialloc.c                          |   706 ++
 kernel/fs/ext3/inode.c                           |  3573 ++++++
 kernel/fs/ext3/ioctl.c                           |   327 +
 kernel/fs/ext3/namei.c                           |  2585 ++++
 kernel/fs/ext3/namei.h                           |    27 +
 kernel/fs/ext3/resize.c                          |  1117 ++
 kernel/fs/ext3/super.c                           |  3165 +++++
 kernel/fs/ext3/symlink.c                         |    54 +
 kernel/fs/ext3/xattr.c                           |  1330 ++
 kernel/fs/ext3/xattr.h                           |   136 +
 kernel/fs/ext3/xattr_security.c                  |    78 +
 kernel/fs/ext3/xattr_trusted.c                   |    54 +
 kernel/fs/ext3/xattr_user.c                      |    58 +
 kernel/fs/ext4/Kconfig                           |    97 +
 kernel/fs/ext4/Makefile                          |    16 +
 kernel/fs/ext4/acl.c                             |   283 +
 kernel/fs/ext4/acl.h                             |    72 +
 kernel/fs/ext4/balloc.c                          |   880 ++
 kernel/fs/ext4/bitmap.c                          |    97 +
 kernel/fs/ext4/block_validity.c                  |   242 +
 kernel/fs/ext4/crypto.c                          |   558 +
 kernel/fs/ext4/crypto_fname.c                    |   719 ++
 kernel/fs/ext4/crypto_key.c                      |   166 +
 kernel/fs/ext4/crypto_policy.c                   |   198 +
 kernel/fs/ext4/dir.c                             |   644 +
 kernel/fs/ext4/ext4.h                            |  2998 +++++
 kernel/fs/ext4/ext4_crypto.h                     |   156 +
 kernel/fs/ext4/ext4_extents.h                    |   274 +
 kernel/fs/ext4/ext4_jbd2.c                       |   327 +
 kernel/fs/ext4/ext4_jbd2.h                       |   450 +
 kernel/fs/ext4/extents.c                         |  5707 +++++++++
 kernel/fs/ext4/extents_status.c                  |  1310 ++
 kernel/fs/ext4/extents_status.h                  |   175 +
 kernel/fs/ext4/file.c                            |   651 +
 kernel/fs/ext4/fsync.c                           |   150 +
 kernel/fs/ext4/hash.c                            |   207 +
 kernel/fs/ext4/ialloc.c                          |  1350 ++
 kernel/fs/ext4/indirect.c                        |  1558 +++
 kernel/fs/ext4/inline.c                          |  2024 +++
 kernel/fs/ext4/inode.c                           |  5279 ++++++++
 kernel/fs/ext4/ioctl.c                           |   773 ++
 kernel/fs/ext4/mballoc.c                         |  5238 ++++++++
 kernel/fs/ext4/mballoc.h                         |   215 +
 kernel/fs/ext4/migrate.c                         |   672 +
 kernel/fs/ext4/mmp.c                             |   393 +
 kernel/fs/ext4/move_extent.c                     |   684 +
 kernel/fs/ext4/namei.c                           |  3918 ++++++
 kernel/fs/ext4/page-io.c                         |   529 +
 kernel/fs/ext4/readpage.c                        |   328 +
 kernel/fs/ext4/resize.c                          |  2024 +++
 kernel/fs/ext4/super.c                           |  5668 +++++++++
 kernel/fs/ext4/symlink.c                         |   145 +
 kernel/fs/ext4/truncate.h                        |    43 +
 kernel/fs/ext4/xattr.c                           |  1727 +++
 kernel/fs/ext4/xattr.h                           |   139 +
 kernel/fs/ext4/xattr_security.c                  |    82 +
 kernel/fs/ext4/xattr_trusted.c                   |    58 +
 kernel/fs/ext4/xattr_user.c                      |    61 +
 kernel/fs/f2fs/Kconfig                           |    83 +
 kernel/fs/f2fs/Makefile                          |     8 +
 kernel/fs/f2fs/acl.c                             |   410 +
 kernel/fs/f2fs/acl.h                             |    54 +
 kernel/fs/f2fs/checkpoint.c                      |  1135 ++
 kernel/fs/f2fs/data.c                            |  1864 +++
 kernel/fs/f2fs/debug.c                           |   413 +
 kernel/fs/f2fs/dir.c                             |   811 ++
 kernel/fs/f2fs/f2fs.h                            |  1814 +++
 kernel/fs/f2fs/file.c                            |  1172 ++
 kernel/fs/f2fs/gc.c                              |   746 ++
 kernel/fs/f2fs/gc.h                              |   110 +
 kernel/fs/f2fs/hash.c                            |   104 +
 kernel/fs/f2fs/inline.c                          |   532 +
 kernel/fs/f2fs/inode.c                           |   387 +
 kernel/fs/f2fs/namei.c                           |   832 ++
 kernel/fs/f2fs/node.c                            |  2084 +++
 kernel/fs/f2fs/node.h                            |   416 +
 kernel/fs/f2fs/recovery.c                        |   575 +
 kernel/fs/f2fs/segment.c                         |  2307 ++++
 kernel/fs/f2fs/segment.h                         |   751 ++
 kernel/fs/f2fs/super.c                           |  1350 ++
 kernel/fs/f2fs/trace.c                           |   159 +
 kernel/fs/f2fs/trace.h                           |    46 +
 kernel/fs/f2fs/xattr.c                           |   618 +
 kernel/fs/f2fs/xattr.h                           |   152 +
 kernel/fs/fat/Kconfig                            |   100 +
 kernel/fs/fat/Makefile                           |    11 +
 kernel/fs/fat/cache.c                            |   351 +
 kernel/fs/fat/dir.c                              |  1402 +++
 kernel/fs/fat/fat.h                              |   420 +
 kernel/fs/fat/fatent.c                           |   692 +
 kernel/fs/fat/file.c                             |   459 +
 kernel/fs/fat/inode.c                            |  1850 +++
 kernel/fs/fat/misc.c                             |   278 +
 kernel/fs/fat/namei_msdos.c                      |   684 +
 kernel/fs/fat/namei_vfat.c                       |  1089 ++
 kernel/fs/fat/nfs.c                              |   301 +
 kernel/fs/fcntl.c                                |   759 ++
 kernel/fs/fhandle.c                              |   266 +
 kernel/fs/file.c                                 |   914 ++
 kernel/fs/file_table.c                           |   327 +
 kernel/fs/filesystems.c                          |   288 +
 kernel/fs/freevxfs/Kconfig                       |    16 +
 kernel/fs/freevxfs/Makefile                      |     8 +
 kernel/fs/freevxfs/vxfs.h                        |   263 +
 kernel/fs/freevxfs/vxfs_bmap.c                   |   281 +
 kernel/fs/freevxfs/vxfs_dir.h                    |    92 +
 kernel/fs/freevxfs/vxfs_extern.h                 |    81 +
 kernel/fs/freevxfs/vxfs_fshead.c                 |   203 +
 kernel/fs/freevxfs/vxfs_fshead.h                 |    67 +
 kernel/fs/freevxfs/vxfs_immed.c                  |   115 +
 kernel/fs/freevxfs/vxfs_inode.c                  |   360 +
 kernel/fs/freevxfs/vxfs_inode.h                  |   180 +
 kernel/fs/freevxfs/vxfs_lookup.c                 |   313 +
 kernel/fs/freevxfs/vxfs_olt.c                    |   130 +
 kernel/fs/freevxfs/vxfs_olt.h                    |   145 +
 kernel/fs/freevxfs/vxfs_subr.c                   |   183 +
 kernel/fs/freevxfs/vxfs_super.c                  |   293 +
 kernel/fs/fs-writeback.c                         |  1595 +++
 kernel/fs/fs_pin.c                               |   102 +
 kernel/fs/fs_struct.c                            |   166 +
 kernel/fs/fscache/Kconfig                        |    61 +
 kernel/fs/fscache/Makefile                       |    20 +
 kernel/fs/fscache/cache.c                        |   421 +
 kernel/fs/fscache/cookie.c                       |   722 ++
 kernel/fs/fscache/fsdef.c                        |   146 +
 kernel/fs/fscache/histogram.c                    |   107 +
 kernel/fs/fscache/internal.h                     |   464 +
 kernel/fs/fscache/main.c                         |   206 +
 kernel/fs/fscache/netfs.c                        |   104 +
 kernel/fs/fscache/object-list.c                  |   412 +
 kernel/fs/fscache/object.c                       |  1018 ++
 kernel/fs/fscache/operation.c                    |   529 +
 kernel/fs/fscache/page.c                         |  1195 ++
 kernel/fs/fscache/proc.c                         |    81 +
 kernel/fs/fscache/stats.c                        |   291 +
 kernel/fs/fuse/Kconfig                           |    27 +
 kernel/fs/fuse/Makefile                          |     8 +
 kernel/fs/fuse/control.c                         |   354 +
 kernel/fs/fuse/cuse.c                            |   636 +
 kernel/fs/fuse/dev.c                             |  2273 ++++
 kernel/fs/fuse/dir.c                             |  1952 +++
 kernel/fs/fuse/file.c                            |  2994 +++++
 kernel/fs/fuse/fuse_i.h                          |   912 ++
 kernel/fs/fuse/inode.c                           |  1320 ++
 kernel/fs/gfs2/Kconfig                           |    34 +
 kernel/fs/gfs2/Makefile                          |    10 +
 kernel/fs/gfs2/acl.c                             |   117 +
 kernel/fs/gfs2/acl.h                             |    22 +
 kernel/fs/gfs2/aops.c                            |  1229 ++
 kernel/fs/gfs2/bmap.c                            |  1495 +++
 kernel/fs/gfs2/bmap.h                            |    61 +
 kernel/fs/gfs2/dentry.c                          |   134 +
 kernel/fs/gfs2/dir.c                             |  2090 +++
 kernel/fs/gfs2/dir.h                             |    86 +
 kernel/fs/gfs2/export.c                          |   208 +
 kernel/fs/gfs2/file.c                            |  1159 ++
 kernel/fs/gfs2/gfs2.h                            |    26 +
 kernel/fs/gfs2/glock.c                           |  2119 ++++
 kernel/fs/gfs2/glock.h                           |   250 +
 kernel/fs/gfs2/glops.c                           |   616 +
 kernel/fs/gfs2/glops.h                           |    30 +
 kernel/fs/gfs2/incore.h                          |   843 ++
 kernel/fs/gfs2/inode.c                           |  1973 +++
 kernel/fs/gfs2/inode.h                           |   141 +
 kernel/fs/gfs2/lock_dlm.c                        |  1336 ++
 kernel/fs/gfs2/log.c                             |   950 ++
 kernel/fs/gfs2/log.h                             |    85 +
 kernel/fs/gfs2/lops.c                            |   886 ++
 kernel/fs/gfs2/lops.h                            |   104 +
 kernel/fs/gfs2/main.c                            |   258 +
 kernel/fs/gfs2/meta_io.c                         |   403 +
 kernel/fs/gfs2/meta_io.h                         |    78 +
 kernel/fs/gfs2/ops_fstype.c                      |  1409 +++
 kernel/fs/gfs2/quota.c                           |  1680 +++
 kernel/fs/gfs2/quota.h                           |    64 +
 kernel/fs/gfs2/recovery.c                        |   611 +
 kernel/fs/gfs2/recovery.h                        |    36 +
 kernel/fs/gfs2/rgrp.c                            |  2623 ++++
 kernel/fs/gfs2/rgrp.h                            |    86 +
 kernel/fs/gfs2/super.c                           |  1666 +++
 kernel/fs/gfs2/super.h                           |    58 +
 kernel/fs/gfs2/sys.c                             |   711 ++
 kernel/fs/gfs2/sys.h                             |    25 +
 kernel/fs/gfs2/trace_gfs2.h                      |   558 +
 kernel/fs/gfs2/trans.c                           |   279 +
 kernel/fs/gfs2/trans.h                           |    47 +
 kernel/fs/gfs2/util.c                            |   265 +
 kernel/fs/gfs2/util.h                            |   171 +
 kernel/fs/gfs2/xattr.c                           |  1508 +++
 kernel/fs/gfs2/xattr.h                           |    67 +
 kernel/fs/hfs/Kconfig                            |    12 +
 kernel/fs/hfs/Makefile                           |    10 +
 kernel/fs/hfs/attr.c                             |   121 +
 kernel/fs/hfs/bfind.c                            |   224 +
 kernel/fs/hfs/bitmap.c                           |   243 +
 kernel/fs/hfs/bnode.c                            |   485 +
 kernel/fs/hfs/brec.c                             |   520 +
 kernel/fs/hfs/btree.c                            |   369 +
 kernel/fs/hfs/btree.h                            |   163 +
 kernel/fs/hfs/catalog.c                          |   366 +
 kernel/fs/hfs/dir.c                              |   319 +
 kernel/fs/hfs/extent.c                           |   545 +
 kernel/fs/hfs/hfs.h                              |   289 +
 kernel/fs/hfs/hfs_fs.h                           |   290 +
 kernel/fs/hfs/inode.c                            |   692 +
 kernel/fs/hfs/mdb.c                              |   366 +
 kernel/fs/hfs/part_tbl.c                         |   117 +
 kernel/fs/hfs/string.c                           |   114 +
 kernel/fs/hfs/super.c                            |   508 +
 kernel/fs/hfs/sysdep.c                           |    45 +
 kernel/fs/hfs/trans.c                            |   150 +
 kernel/fs/hfsplus/Kconfig                        |    31 +
 kernel/fs/hfsplus/Makefile                       |    11 +
 kernel/fs/hfsplus/acl.h                          |    27 +
 kernel/fs/hfsplus/attributes.c                   |   371 +
 kernel/fs/hfsplus/bfind.c                        |   293 +
 kernel/fs/hfsplus/bitmap.c                       |   245 +
 kernel/fs/hfsplus/bnode.c                        |   671 +
 kernel/fs/hfsplus/brec.c                         |   527 +
 kernel/fs/hfsplus/btree.c                        |   497 +
 kernel/fs/hfsplus/catalog.c                      |   521 +
 kernel/fs/hfsplus/dir.c                          |   576 +
 kernel/fs/hfsplus/extents.c                      |   611 +
 kernel/fs/hfsplus/hfsplus_fs.h                   |   540 +
 kernel/fs/hfsplus/hfsplus_raw.h                  |   407 +
 kernel/fs/hfsplus/inode.c                        |   617 +
 kernel/fs/hfsplus/ioctl.c                        |   150 +
 kernel/fs/hfsplus/options.c                      |   238 +
 kernel/fs/hfsplus/part_tbl.c                     |   157 +
 kernel/fs/hfsplus/posix_acl.c                    |   140 +
 kernel/fs/hfsplus/super.c                        |   700 ++
 kernel/fs/hfsplus/tables.c                       |  3245 +++++
 kernel/fs/hfsplus/unicode.c                      |   469 +
 kernel/fs/hfsplus/wrapper.c                      |   261 +
 kernel/fs/hfsplus/xattr.c                        |   911 ++
 kernel/fs/hfsplus/xattr.h                        |    43 +
 kernel/fs/hfsplus/xattr_security.c               |    98 +
 kernel/fs/hfsplus/xattr_trusted.c                |    44 +
 kernel/fs/hfsplus/xattr_user.c                   |    44 +
 kernel/fs/hostfs/Makefile                        |    11 +
 kernel/fs/hostfs/hostfs.h                        |    98 +
 kernel/fs/hostfs/hostfs_kern.c                   |  1023 ++
 kernel/fs/hostfs/hostfs_user.c                   |   410 +
 kernel/fs/hpfs/Kconfig                           |    14 +
 kernel/fs/hpfs/Makefile                          |     8 +
 kernel/fs/hpfs/alloc.c                           |   486 +
 kernel/fs/hpfs/anode.c                           |   496 +
 kernel/fs/hpfs/buffer.c                          |   204 +
 kernel/fs/hpfs/dentry.c                          |    61 +
 kernel/fs/hpfs/dir.c                             |   330 +
 kernel/fs/hpfs/dnode.c                           |  1093 ++
 kernel/fs/hpfs/ea.c                              |   367 +
 kernel/fs/hpfs/file.c                            |   211 +
 kernel/fs/hpfs/hpfs.h                            |   559 +
 kernel/fs/hpfs/hpfs_fn.h                         |   363 +
 kernel/fs/hpfs/inode.c                           |   315 +
 kernel/fs/hpfs/map.c                             |   308 +
 kernel/fs/hpfs/name.c                            |   113 +
 kernel/fs/hpfs/namei.c                           |   628 +
 kernel/fs/hpfs/super.c                           |   753 ++
 kernel/fs/hppfs/Makefile                         |     6 +
 kernel/fs/hppfs/hppfs.c                          |   766 ++
 kernel/fs/hugetlbfs/Makefile                     |     7 +
 kernel/fs/hugetlbfs/inode.c                      |  1114 ++
 kernel/fs/inode.c                                |  1977 +++
 kernel/fs/internal.h                             |   153 +
 kernel/fs/ioctl.c                                |   625 +
 kernel/fs/isofs/Kconfig                          |    39 +
 kernel/fs/isofs/Makefile                         |    10 +
 kernel/fs/isofs/compress.c                       |   380 +
 kernel/fs/isofs/dir.c                            |   283 +
 kernel/fs/isofs/export.c                         |   192 +
 kernel/fs/isofs/inode.c                          |  1557 +++
 kernel/fs/isofs/isofs.h                          |   198 +
 kernel/fs/isofs/joliet.c                         |    69 +
 kernel/fs/isofs/namei.c                          |   172 +
 kernel/fs/isofs/rock.c                           |   801 ++
 kernel/fs/isofs/rock.h                           |   122 +
 kernel/fs/isofs/util.c                           |    70 +
 kernel/fs/isofs/zisofs.h                         |    21 +
 kernel/fs/jbd/Kconfig                            |    30 +
 kernel/fs/jbd/Makefile                           |     7 +
 kernel/fs/jbd/checkpoint.c                       |   784 ++
 kernel/fs/jbd/commit.c                           |  1021 ++
 kernel/fs/jbd/journal.c                          |  2145 ++++
 kernel/fs/jbd/recovery.c                         |   594 +
 kernel/fs/jbd/revoke.c                           |   733 ++
 kernel/fs/jbd/transaction.c                      |  2237 ++++
 kernel/fs/jbd2/Kconfig                           |    35 +
 kernel/fs/jbd2/Makefile                          |     7 +
 kernel/fs/jbd2/checkpoint.c                      |   647 +
 kernel/fs/jbd2/commit.c                          |  1164 ++
 kernel/fs/jbd2/journal.c                         |  2671 ++++
 kernel/fs/jbd2/recovery.c                        |   880 ++
 kernel/fs/jbd2/revoke.c                          |   764 ++
 kernel/fs/jbd2/transaction.c                     |  2487 ++++
 kernel/fs/jffs2/Kconfig                          |   188 +
 kernel/fs/jffs2/LICENCE                          |    30 +
 kernel/fs/jffs2/Makefile                         |    21 +
 kernel/fs/jffs2/README.Locking                   |   172 +
 kernel/fs/jffs2/TODO                             |    37 +
 kernel/fs/jffs2/acl.c                            |   309 +
 kernel/fs/jffs2/acl.h                            |    41 +
 kernel/fs/jffs2/background.c                     |   168 +
 kernel/fs/jffs2/build.c                          |   394 +
 kernel/fs/jffs2/compr.c                          |   418 +
 kernel/fs/jffs2/compr.h                          |   105 +
 kernel/fs/jffs2/compr_lzo.c                      |   110 +
 kernel/fs/jffs2/compr_rtime.c                    |   130 +
 kernel/fs/jffs2/compr_rubin.c                    |   452 +
 kernel/fs/jffs2/compr_zlib.c                     |   222 +
 kernel/fs/jffs2/debug.c                          |   866 ++
 kernel/fs/jffs2/debug.h                          |   275 +
 kernel/fs/jffs2/dir.c                            |   862 ++
 kernel/fs/jffs2/erase.c                          |   515 +
 kernel/fs/jffs2/file.c                           |   337 +
 kernel/fs/jffs2/fs.c                             |   766 ++
 kernel/fs/jffs2/gc.c                             |  1378 ++
 kernel/fs/jffs2/ioctl.c                          |    22 +
 kernel/fs/jffs2/jffs2_fs_i.h                     |    56 +
 kernel/fs/jffs2/jffs2_fs_sb.h                    |   162 +
 kernel/fs/jffs2/malloc.c                         |   324 +
 kernel/fs/jffs2/nodelist.c                       |   755 ++
 kernel/fs/jffs2/nodelist.h                       |   480 +
 kernel/fs/jffs2/nodemgmt.c                       |   883 ++
 kernel/fs/jffs2/os-linux.h                       |   199 +
 kernel/fs/jffs2/read.c                           |   228 +
 kernel/fs/jffs2/readinode.c                      |  1451 +++
 kernel/fs/jffs2/scan.c                           |  1176 ++
 kernel/fs/jffs2/security.c                       |    89 +
 kernel/fs/jffs2/summary.c                        |   874 ++
 kernel/fs/jffs2/summary.h                        |   213 +
 kernel/fs/jffs2/super.c                          |   442 +
 kernel/fs/jffs2/symlink.c                        |    66 +
 kernel/fs/jffs2/wbuf.c                           |  1353 ++
 kernel/fs/jffs2/write.c                          |   722 ++
 kernel/fs/jffs2/writev.c                         |    51 +
 kernel/fs/jffs2/xattr.c                          |  1340 ++
 kernel/fs/jffs2/xattr.h                          |   133 +
 kernel/fs/jffs2/xattr_trusted.c                  |    55 +
 kernel/fs/jffs2/xattr_user.c                     |    55 +
 kernel/fs/jfs/Kconfig                            |    50 +
 kernel/fs/jfs/Makefile                           |    16 +
 kernel/fs/jfs/acl.c                              |   161 +
 kernel/fs/jfs/file.c                             |   165 +
 kernel/fs/jfs/inode.c                            |   423 +
 kernel/fs/jfs/ioctl.c                            |   189 +
 kernel/fs/jfs/jfs_acl.h                          |    36 +
 kernel/fs/jfs/jfs_btree.h                        |   172 +
 kernel/fs/jfs/jfs_debug.c                        |   109 +
 kernel/fs/jfs/jfs_debug.h                        |   122 +
 kernel/fs/jfs/jfs_dinode.h                       |   176 +
 kernel/fs/jfs/jfs_discard.c                      |   121 +
 kernel/fs/jfs/jfs_discard.h                      |    26 +
 kernel/fs/jfs/jfs_dmap.c                         |  4092 ++++++
 kernel/fs/jfs/jfs_dmap.h                         |   316 +
 kernel/fs/jfs/jfs_dtree.c                        |  4576 +++++++
 kernel/fs/jfs/jfs_dtree.h                        |   269 +
 kernel/fs/jfs/jfs_extent.c                       |   651 +
 kernel/fs/jfs/jfs_extent.h                       |    31 +
 kernel/fs/jfs/jfs_filsys.h                       |   285 +
 kernel/fs/jfs/jfs_imap.c                         |  3178 +++++
 kernel/fs/jfs/jfs_imap.h                         |   175 +
 kernel/fs/jfs/jfs_incore.h                       |   228 +
 kernel/fs/jfs/jfs_inode.c                        |   165 +
 kernel/fs/jfs/jfs_inode.h                        |    53 +
 kernel/fs/jfs/jfs_lock.h                         |    52 +
 kernel/fs/jfs/jfs_logmgr.c                       |  2533 ++++
 kernel/fs/jfs/jfs_logmgr.h                       |   513 +
 kernel/fs/jfs/jfs_metapage.c                     |   837 ++
 kernel/fs/jfs/jfs_metapage.h                     |   154 +
 kernel/fs/jfs/jfs_mount.c                        |   507 +
 kernel/fs/jfs/jfs_superblock.h                   |   122 +
 kernel/fs/jfs/jfs_txnmgr.c                       |  3093 +++++
 kernel/fs/jfs/jfs_txnmgr.h                       |   311 +
 kernel/fs/jfs/jfs_types.h                        |   170 +
 kernel/fs/jfs/jfs_umount.c                       |   168 +
 kernel/fs/jfs/jfs_unicode.c                      |   138 +
 kernel/fs/jfs/jfs_unicode.h                      |   156 +
 kernel/fs/jfs/jfs_uniupr.c                       |   134 +
 kernel/fs/jfs/jfs_xattr.h                        |    77 +
 kernel/fs/jfs/jfs_xtree.c                        |  3903 ++++++
 kernel/fs/jfs/jfs_xtree.h                        |   125 +
 kernel/fs/jfs/namei.c                            |  1606 +++
 kernel/fs/jfs/resize.c                           |   543 +
 kernel/fs/jfs/super.c                            |  1012 ++
 kernel/fs/jfs/symlink.c                          |    52 +
 kernel/fs/jfs/xattr.c                            |  1106 ++
 kernel/fs/kernfs/Kconfig                         |     7 +
 kernel/fs/kernfs/Makefile                        |     5 +
 kernel/fs/kernfs/dir.c                           |  1464 +++
 kernel/fs/kernfs/file.c                          |   954 ++
 kernel/fs/kernfs/inode.c                         |   372 +
 kernel/fs/kernfs/kernfs-internal.h               |   119 +
 kernel/fs/kernfs/mount.c                         |   249 +
 kernel/fs/kernfs/symlink.c                       |   147 +
 kernel/fs/libfs.c                                |  1191 ++
 kernel/fs/lockd/Makefile                         |    11 +
 kernel/fs/lockd/clnt4xdr.c                       |   599 +
 kernel/fs/lockd/clntlock.c                       |   301 +
 kernel/fs/lockd/clntproc.c                       |   849 ++
 kernel/fs/lockd/clntxdr.c                        |   621 +
 kernel/fs/lockd/host.c                           |   675 +
 kernel/fs/lockd/mon.c                            |   622 +
 kernel/fs/lockd/netns.h                          |    22 +
 kernel/fs/lockd/procfs.c                         |    92 +
 kernel/fs/lockd/procfs.h                         |    28 +
 kernel/fs/lockd/svc.c                            |   695 +
 kernel/fs/lockd/svc4proc.c                       |   505 +
 kernel/fs/lockd/svclock.c                        |   939 ++
 kernel/fs/lockd/svcproc.c                        |   549 +
 kernel/fs/lockd/svcshare.c                       |   106 +
 kernel/fs/lockd/svcsubs.c                        |   457 +
 kernel/fs/lockd/xdr.c                            |   337 +
 kernel/fs/lockd/xdr4.c                           |   334 +
 kernel/fs/locks.c                                |  2703 ++++
 kernel/fs/logfs/Kconfig                          |    17 +
 kernel/fs/logfs/Makefile                         |    13 +
 kernel/fs/logfs/compr.c                          |    95 +
 kernel/fs/logfs/dev_bdev.c                       |   321 +
 kernel/fs/logfs/dev_mtd.c                        |   274 +
 kernel/fs/logfs/dir.c                            |   801 ++
 kernel/fs/logfs/file.c                           |   285 +
 kernel/fs/logfs/gc.c                             |   732 ++
 kernel/fs/logfs/inode.c                          |   426 +
 kernel/fs/logfs/journal.c                        |   894 ++
 kernel/fs/logfs/logfs.h                          |   736 ++
 kernel/fs/logfs/logfs_abi.h                      |   629 +
 kernel/fs/logfs/readwrite.c                      |  2298 ++++
 kernel/fs/logfs/segment.c                        |   961 ++
 kernel/fs/logfs/super.c                          |   653 +
 kernel/fs/mbcache.c                              |   858 ++
 kernel/fs/minix/Kconfig                          |    25 +
 kernel/fs/minix/Makefile                         |     7 +
 kernel/fs/minix/bitmap.c                         |   272 +
 kernel/fs/minix/dir.c                            |   473 +
 kernel/fs/minix/file.c                           |    51 +
 kernel/fs/minix/inode.c                          |   690 +
 kernel/fs/minix/itree_common.c                   |   364 +
 kernel/fs/minix/itree_v1.c                       |    67 +
 kernel/fs/minix/itree_v2.c                       |    76 +
 kernel/fs/minix/minix.h                          |   169 +
 kernel/fs/minix/namei.c                          |   270 +
 kernel/fs/mount.h                                |   140 +
 kernel/fs/mpage.c                                |   717 ++
 kernel/fs/namei.c                                |  4552 +++++++
 kernel/fs/namespace.c                            |  3298 +++++
 kernel/fs/ncpfs/Kconfig                          |   108 +
 kernel/fs/ncpfs/Makefile                         |    16 +
 kernel/fs/ncpfs/dir.c                            |  1237 ++
 kernel/fs/ncpfs/file.c                           |   262 +
 kernel/fs/ncpfs/getopt.c                         |    75 +
 kernel/fs/ncpfs/getopt.h                         |    16 +
 kernel/fs/ncpfs/inode.c                          |  1072 ++
 kernel/fs/ncpfs/ioctl.c                          |   919 ++
 kernel/fs/ncpfs/mmap.c                           |   125 +
 kernel/fs/ncpfs/ncp_fs.h                         |   100 +
 kernel/fs/ncpfs/ncp_fs_i.h                       |    30 +
 kernel/fs/ncpfs/ncp_fs_sb.h                      |   174 +
 kernel/fs/ncpfs/ncplib_kernel.c                  |  1321 ++
 kernel/fs/ncpfs/ncplib_kernel.h                  |   214 +
 kernel/fs/ncpfs/ncpsign_kernel.c                 |   127 +
 kernel/fs/ncpfs/ncpsign_kernel.h                 |    26 +
 kernel/fs/ncpfs/sock.c                           |   881 ++
 kernel/fs/ncpfs/symlink.c                        |   181 +
 kernel/fs/nfs/Kconfig                            |   202 +
 kernel/fs/nfs/Makefile                           |    36 +
 kernel/fs/nfs/blocklayout/Makefile               |     6 +
 kernel/fs/nfs/blocklayout/blocklayout.c          |   928 ++
 kernel/fs/nfs/blocklayout/blocklayout.h          |   204 +
 kernel/fs/nfs/blocklayout/dev.c                  |   363 +
 kernel/fs/nfs/blocklayout/extent_tree.c          |   602 +
 kernel/fs/nfs/blocklayout/rpc_pipefs.c           |   288 +
 kernel/fs/nfs/cache_lib.c                        |   158 +
 kernel/fs/nfs/cache_lib.h                        |    31 +
 kernel/fs/nfs/callback.c                         |   499 +
 kernel/fs/nfs/callback.h                         |   214 +
 kernel/fs/nfs/callback_proc.c                    |   554 +
 kernel/fs/nfs/callback_xdr.c                     |  1033 ++
 kernel/fs/nfs/client.c                           |  1472 +++
 kernel/fs/nfs/delegation.c                       |   902 ++
 kernel/fs/nfs/delegation.h                       |    73 +
 kernel/fs/nfs/dir.c                              |  2520 ++++
 kernel/fs/nfs/direct.c                           |  1066 ++
 kernel/fs/nfs/dns_resolve.c                      |   470 +
 kernel/fs/nfs/dns_resolve.h                      |    36 +
 kernel/fs/nfs/file.c                             |   947 ++
 kernel/fs/nfs/filelayout/Makefile                |     5 +
 kernel/fs/nfs/filelayout/filelayout.c            |  1162 ++
 kernel/fs/nfs/filelayout/filelayout.h            |   117 +
 kernel/fs/nfs/filelayout/filelayoutdev.c         |   299 +
 kernel/fs/nfs/flexfilelayout/Makefile            |     5 +
 kernel/fs/nfs/flexfilelayout/flexfilelayout.c    |  1535 +++
 kernel/fs/nfs/flexfilelayout/flexfilelayout.h    |   155 +
 kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c |   552 +
 kernel/fs/nfs/fscache-index.c                    |   336 +
 kernel/fs/nfs/fscache.c                          |   439 +
 kernel/fs/nfs/fscache.h                          |   229 +
 kernel/fs/nfs/getroot.c                          |   133 +
 kernel/fs/nfs/inode.c                            |  2066 +++
 kernel/fs/nfs/internal.h                         |   683 +
 kernel/fs/nfs/iostat.h                           |    76 +
 kernel/fs/nfs/mount_clnt.c                       |   535 +
 kernel/fs/nfs/namespace.c                        |   289 +
 kernel/fs/nfs/netns.h                            |    40 +
 kernel/fs/nfs/nfs.h                              |    29 +
 kernel/fs/nfs/nfs2super.c                        |    31 +
 kernel/fs/nfs/nfs2xdr.c                          |  1151 ++
 kernel/fs/nfs/nfs3_fs.h                          |    36 +
 kernel/fs/nfs/nfs3acl.c                          |   296 +
 kernel/fs/nfs/nfs3client.c                       |   107 +
 kernel/fs/nfs/nfs3proc.c                         |   974 ++
 kernel/fs/nfs/nfs3super.c                        |    35 +
 kernel/fs/nfs/nfs3xdr.c                          |  2564 ++++
 kernel/fs/nfs/nfs42.h                            |    16 +
 kernel/fs/nfs/nfs42proc.c                        |   167 +
 kernel/fs/nfs/nfs42xdr.c                         |   249 +
 kernel/fs/nfs/nfs4_fs.h                          |   529 +
 kernel/fs/nfs/nfs4client.c                       |  1232 ++
 kernel/fs/nfs/nfs4file.c                         |   189 +
 kernel/fs/nfs/nfs4getroot.c                      |    50 +
 kernel/fs/nfs/nfs4idmap.c                        |   792 ++
 kernel/fs/nfs/nfs4idmap.h                        |    68 +
 kernel/fs/nfs/nfs4namespace.c                    |   523 +
 kernel/fs/nfs/nfs4proc.c                         |  8687 +++++++++++++
 kernel/fs/nfs/nfs4renewd.c                       |   143 +
 kernel/fs/nfs/nfs4session.c                      |   565 +
 kernel/fs/nfs/nfs4session.h                      |   160 +
 kernel/fs/nfs/nfs4state.c                        |  2455 ++++
 kernel/fs/nfs/nfs4super.c                        |   361 +
 kernel/fs/nfs/nfs4sysctl.c                       |    69 +
 kernel/fs/nfs/nfs4trace.c                        |    17 +
 kernel/fs/nfs/nfs4trace.h                        |  1148 ++
 kernel/fs/nfs/nfs4xdr.c                          |  7443 +++++++++++
 kernel/fs/nfs/nfsroot.c                          |   309 +
 kernel/fs/nfs/nfstrace.c                         |    12 +
 kernel/fs/nfs/nfstrace.h                         |   730 ++
 kernel/fs/nfs/objlayout/Kbuild                   |     5 +
 kernel/fs/nfs/objlayout/objio_osd.c              |   678 +
 kernel/fs/nfs/objlayout/objlayout.c              |   706 ++
 kernel/fs/nfs/objlayout/objlayout.h              |   184 +
 kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c       |   415 +
 kernel/fs/nfs/pagelist.c                         |  1309 ++
 kernel/fs/nfs/pnfs.c                             |  2249 ++++
 kernel/fs/nfs/pnfs.h                             |   692 +
 kernel/fs/nfs/pnfs_dev.c                         |   365 +
 kernel/fs/nfs/pnfs_nfs.c                         |   880 ++
 kernel/fs/nfs/proc.c                             |   749 ++
 kernel/fs/nfs/read.c                             |   445 +
 kernel/fs/nfs/super.c                            |  2877 +++++
 kernel/fs/nfs/symlink.c                          |    78 +
 kernel/fs/nfs/sysctl.c                           |    64 +
 kernel/fs/nfs/unlink.c                           |   604 +
 kernel/fs/nfs/write.c                            |  2019 +++
 kernel/fs/nfs_common/Makefile                    |     8 +
 kernel/fs/nfs_common/grace.c                     |   113 +
 kernel/fs/nfs_common/nfsacl.c                    |   296 +
 kernel/fs/nfsd/Kconfig                           |   117 +
 kernel/fs/nfsd/Makefile                          |    20 +
 kernel/fs/nfsd/acl.h                             |    59 +
 kernel/fs/nfsd/auth.c                            |    90 +
 kernel/fs/nfsd/auth.h                            |    16 +
 kernel/fs/nfsd/blocklayout.c                     |   200 +
 kernel/fs/nfsd/blocklayoutxdr.c                  |   157 +
 kernel/fs/nfsd/blocklayoutxdr.h                  |    62 +
 kernel/fs/nfsd/cache.h                           |    86 +
 kernel/fs/nfsd/current_stateid.h                 |    28 +
 kernel/fs/nfsd/export.c                          |  1345 ++
 kernel/fs/nfsd/export.h                          |   113 +
 kernel/fs/nfsd/fault_inject.c                    |   150 +
 kernel/fs/nfsd/idmap.h                           |    62 +
 kernel/fs/nfsd/lockd.c                           |    77 +
 kernel/fs/nfsd/netns.h                           |   121 +
 kernel/fs/nfsd/nfs2acl.c                         |   382 +
 kernel/fs/nfsd/nfs3acl.c                         |   273 +
 kernel/fs/nfsd/nfs3proc.c                        |   891 ++
 kernel/fs/nfsd/nfs3xdr.c                         |  1114 ++
 kernel/fs/nfsd/nfs4acl.c                         |   896 ++
 kernel/fs/nfsd/nfs4callback.c                    |  1099 ++
 kernel/fs/nfsd/nfs4idmap.c                       |   666 +
 kernel/fs/nfsd/nfs4layouts.c                     |   723 ++
 kernel/fs/nfsd/nfs4proc.c                        |  2368 ++++
 kernel/fs/nfsd/nfs4recover.c                     |  1551 +++
 kernel/fs/nfsd/nfs4state.c                       |  6716 ++++++++++
 kernel/fs/nfsd/nfs4xdr.c                         |  4453 +++++++
 kernel/fs/nfsd/nfscache.c                        |   637 +
 kernel/fs/nfsd/nfsctl.c                          |  1318 ++
 kernel/fs/nfsd/nfsd.h                            |   422 +
 kernel/fs/nfsd/nfsfh.c                           |   692 +
 kernel/fs/nfsd/nfsfh.h                           |   292 +
 kernel/fs/nfsd/nfsproc.c                         |   759 ++
 kernel/fs/nfsd/nfssvc.c                          |   752 ++
 kernel/fs/nfsd/nfsxdr.c                          |   550 +
 kernel/fs/nfsd/pnfs.h                            |    86 +
 kernel/fs/nfsd/state.h                           |   662 +
 kernel/fs/nfsd/stats.c                           |   104 +
 kernel/fs/nfsd/stats.h                           |    43 +
 kernel/fs/nfsd/trace.c                           |     5 +
 kernel/fs/nfsd/trace.h                           |    54 +
 kernel/fs/nfsd/vfs.c                             |  2154 ++++
 kernel/fs/nfsd/vfs.h                             |   131 +
 kernel/fs/nfsd/xdr.h                             |   173 +
 kernel/fs/nfsd/xdr3.h                            |   349 +
 kernel/fs/nfsd/xdr4.h                            |   724 ++
 kernel/fs/nfsd/xdr4cb.h                          |    30 +
 kernel/fs/nilfs2/Kconfig                         |    24 +
 kernel/fs/nilfs2/Makefile                        |     5 +
 kernel/fs/nilfs2/alloc.c                         |   785 ++
 kernel/fs/nilfs2/alloc.h                         |   110 +
 kernel/fs/nilfs2/bmap.c                          |   593 +
 kernel/fs/nilfs2/bmap.h                          |   280 +
 kernel/fs/nilfs2/btnode.c                        |   297 +
 kernel/fs/nilfs2/btnode.h                        |    59 +
 kernel/fs/nilfs2/btree.c                         |  2414 ++++
 kernel/fs/nilfs2/btree.h                         |    77 +
 kernel/fs/nilfs2/cpfile.c                        |  1027 ++
 kernel/fs/nilfs2/cpfile.h                        |    46 +
 kernel/fs/nilfs2/dat.c                           |   529 +
 kernel/fs/nilfs2/dat.h                           |    59 +
 kernel/fs/nilfs2/dir.c                           |   676 +
 kernel/fs/nilfs2/direct.c                        |   386 +
 kernel/fs/nilfs2/direct.h                        |    51 +
 kernel/fs/nilfs2/export.h                        |    25 +
 kernel/fs/nilfs2/file.c                          |   165 +
 kernel/fs/nilfs2/gcinode.c                       |   197 +
 kernel/fs/nilfs2/ifile.c                         |   227 +
 kernel/fs/nilfs2/ifile.h                         |    58 +
 kernel/fs/nilfs2/inode.c                         |  1135 ++
 kernel/fs/nilfs2/ioctl.c                         |  1379 ++
 kernel/fs/nilfs2/mdt.c                           |   652 +
 kernel/fs/nilfs2/mdt.h                           |   123 +
 kernel/fs/nilfs2/namei.c                         |   585 +
 kernel/fs/nilfs2/nilfs.h                         |   354 +
 kernel/fs/nilfs2/page.c                          |   581 +
 kernel/fs/nilfs2/page.h                          |    80 +
 kernel/fs/nilfs2/recovery.c                      |   964 ++
 kernel/fs/nilfs2/segbuf.c                        |   536 +
 kernel/fs/nilfs2/segbuf.h                        |   184 +
 kernel/fs/nilfs2/segment.c                       |  2758 ++++
 kernel/fs/nilfs2/segment.h                       |   251 +
 kernel/fs/nilfs2/sufile.c                        |  1222 ++
 kernel/fs/nilfs2/sufile.h                        |   146 +
 kernel/fs/nilfs2/super.c                         |  1486 +++
 kernel/fs/nilfs2/sysfs.c                         |  1137 ++
 kernel/fs/nilfs2/sysfs.h                         |   176 +
 kernel/fs/nilfs2/the_nilfs.c                     |   815 ++
 kernel/fs/nilfs2/the_nilfs.h                     |   396 +
 kernel/fs/nls/Kconfig                            |   619 +
 kernel/fs/nls/Makefile                           |    55 +
 kernel/fs/nls/mac-celtic.c                       |   601 +
 kernel/fs/nls/mac-centeuro.c                     |   531 +
 kernel/fs/nls/mac-croatian.c                     |   601 +
 kernel/fs/nls/mac-cyrillic.c                     |   496 +
 kernel/fs/nls/mac-gaelic.c                       |   566 +
 kernel/fs/nls/mac-greek.c                        |   496 +
 kernel/fs/nls/mac-iceland.c                      |   601 +
 kernel/fs/nls/mac-inuit.c                        |   531 +
 kernel/fs/nls/mac-roman.c                        |   636 +
 kernel/fs/nls/mac-romanian.c                     |   601 +
 kernel/fs/nls/mac-turkish.c                      |   601 +
 kernel/fs/nls/nls_ascii.c                        |   166 +
 kernel/fs/nls/nls_base.c                         |   548 +
 kernel/fs/nls/nls_cp1250.c                       |   346 +
 kernel/fs/nls/nls_cp1251.c                       |   301 +
 kernel/fs/nls/nls_cp1255.c                       |   384 +
 kernel/fs/nls/nls_cp437.c                        |   387 +
 kernel/fs/nls/nls_cp737.c                        |   350 +
 kernel/fs/nls/nls_cp775.c                        |   319 +
 kernel/fs/nls/nls_cp850.c                        |   315 +
 kernel/fs/nls/nls_cp852.c                        |   337 +
 kernel/fs/nls/nls_cp855.c                        |   299 +
 kernel/fs/nls/nls_cp857.c                        |   301 +
 kernel/fs/nls/nls_cp860.c                        |   364 +
 kernel/fs/nls/nls_cp861.c                        |   387 +
 kernel/fs/nls/nls_cp862.c                        |   421 +
 kernel/fs/nls/nls_cp863.c                        |   381 +
 kernel/fs/nls/nls_cp864.c                        |   407 +
 kernel/fs/nls/nls_cp865.c                        |   387 +
 kernel/fs/nls/nls_cp866.c                        |   305 +
 kernel/fs/nls/nls_cp869.c                        |   315 +
 kernel/fs/nls/nls_cp874.c                        |   275 +
 kernel/fs/nls/nls_cp932.c                        |  7933 ++++++++++++
 kernel/fs/nls/nls_cp936.c                        | 11111 ++++++++++++++++
 kernel/fs/nls/nls_cp949.c                        | 13946 +++++++++++++++++++++
 kernel/fs/nls/nls_cp950.c                        |  9482 ++++++++++++++
 kernel/fs/nls/nls_euc-jp.c                       |   580 +
 kernel/fs/nls/nls_iso8859-1.c                    |   257 +
 kernel/fs/nls/nls_iso8859-13.c                   |   285 +
 kernel/fs/nls/nls_iso8859-14.c                   |   341 +
 kernel/fs/nls/nls_iso8859-15.c                   |   307 +
 kernel/fs/nls/nls_iso8859-2.c                    |   308 +
 kernel/fs/nls/nls_iso8859-3.c                    |   308 +
 kernel/fs/nls/nls_iso8859-4.c                    |   308 +
 kernel/fs/nls/nls_iso8859-5.c                    |   272 +
 kernel/fs/nls/nls_iso8859-6.c                    |   263 +
 kernel/fs/nls/nls_iso8859-7.c                    |   317 +
 kernel/fs/nls/nls_iso8859-9.c                    |   272 +
 kernel/fs/nls/nls_koi8-r.c                       |   323 +
 kernel/fs/nls/nls_koi8-ru.c                      |    82 +
 kernel/fs/nls/nls_koi8-u.c                       |   330 +
 kernel/fs/nls/nls_utf8.c                         |    67 +
 kernel/fs/no-block.c                             |    23 +
 kernel/fs/notify/Kconfig                         |     7 +
 kernel/fs/notify/Makefile                        |     6 +
 kernel/fs/notify/dnotify/Kconfig                 |    11 +
 kernel/fs/notify/dnotify/Makefile                |     1 +
 kernel/fs/notify/dnotify/dnotify.c               |   388 +
 kernel/fs/notify/fanotify/Kconfig                |    26 +
 kernel/fs/notify/fanotify/Makefile               |     1 +
 kernel/fs/notify/fanotify/fanotify.c             |   270 +
 kernel/fs/notify/fanotify/fanotify.h             |    50 +
 kernel/fs/notify/fanotify/fanotify_user.c        |   940 ++
 kernel/fs/notify/fdinfo.c                        |   165 +
 kernel/fs/notify/fdinfo.h                        |    27 +
 kernel/fs/notify/fsnotify.c                      |   299 +
 kernel/fs/notify/fsnotify.h                      |    60 +
 kernel/fs/notify/group.c                         |   118 +
 kernel/fs/notify/inode_mark.c                    |   247 +
 kernel/fs/notify/inotify/Kconfig                 |    17 +
 kernel/fs/notify/inotify/Makefile                |     1 +
 kernel/fs/notify/inotify/inotify.h               |    32 +
 kernel/fs/notify/inotify/inotify_fsnotify.c      |   184 +
 kernel/fs/notify/inotify/inotify_user.c          |   815 ++
 kernel/fs/notify/mark.c                          |   494 +
 kernel/fs/notify/notification.c                  |   213 +
 kernel/fs/notify/vfsmount_mark.c                 |   127 +
 kernel/fs/nsfs.c                                 |   161 +
 kernel/fs/ntfs/Kconfig                           |    78 +
 kernel/fs/ntfs/Makefile                          |    14 +
 kernel/fs/ntfs/aops.c                            |  1773 +++
 kernel/fs/ntfs/aops.h                            |   107 +
 kernel/fs/ntfs/attrib.c                          |  2614 ++++
 kernel/fs/ntfs/attrib.h                          |   116 +
 kernel/fs/ntfs/bitmap.c                          |   193 +
 kernel/fs/ntfs/bitmap.h                          |   118 +
 kernel/fs/ntfs/collate.c                         |   124 +
 kernel/fs/ntfs/collate.h                         |    50 +
 kernel/fs/ntfs/compress.c                        |   969 ++
 kernel/fs/ntfs/debug.c                           |   173 +
 kernel/fs/ntfs/debug.h                           |    71 +
 kernel/fs/ntfs/dir.c                             |  1553 +++
 kernel/fs/ntfs/dir.h                             |    48 +
 kernel/fs/ntfs/endian.h                          |    93 +
 kernel/fs/ntfs/file.c                            |  2043 +++
 kernel/fs/ntfs/index.c                           |   454 +
 kernel/fs/ntfs/index.h                           |   148 +
 kernel/fs/ntfs/inode.c                           |  3111 +++++
 kernel/fs/ntfs/inode.h                           |   325 +
 kernel/fs/ntfs/layout.h                          |  2435 ++++
 kernel/fs/ntfs/lcnalloc.c                        |  1014 ++
 kernel/fs/ntfs/lcnalloc.h                        |   145 +
 kernel/fs/ntfs/logfile.c                         |   863 ++
 kernel/fs/ntfs/logfile.h                         |   309 +
 kernel/fs/ntfs/malloc.h                          |    96 +
 kernel/fs/ntfs/mft.c                             |  2917 +++++
 kernel/fs/ntfs/mft.h                             |   124 +
 kernel/fs/ntfs/mst.c                             |   203 +
 kernel/fs/ntfs/namei.c                           |   405 +
 kernel/fs/ntfs/ntfs.h                            |   164 +
 kernel/fs/ntfs/quota.c                           |   117 +
 kernel/fs/ntfs/quota.h                           |    35 +
 kernel/fs/ntfs/runlist.c                         |  1907 +++
 kernel/fs/ntfs/runlist.h                         |   102 +
 kernel/fs/ntfs/super.c                           |  3220 +++++
 kernel/fs/ntfs/sysctl.c                          |    83 +
 kernel/fs/ntfs/sysctl.h                          |    41 +
 kernel/fs/ntfs/time.h                            |   100 +
 kernel/fs/ntfs/types.h                           |    69 +
 kernel/fs/ntfs/unistr.c                          |   398 +
 kernel/fs/ntfs/upcase.c                          |    87 +
 kernel/fs/ntfs/usnjrnl.c                         |    84 +
 kernel/fs/ntfs/usnjrnl.h                         |   205 +
 kernel/fs/ntfs/volume.h                          |   178 +
 kernel/fs/ocfs2/Kconfig                          |    76 +
 kernel/fs/ocfs2/Makefile                         |    53 +
 kernel/fs/ocfs2/acl.c                            |   310 +
 kernel/fs/ocfs2/acl.h                            |    39 +
 kernel/fs/ocfs2/alloc.c                          |  7405 +++++++++++
 kernel/fs/ocfs2/alloc.h                          |   324 +
 kernel/fs/ocfs2/aops.c                           |  2448 ++++
 kernel/fs/ocfs2/aops.h                           |   105 +
 kernel/fs/ocfs2/blockcheck.c                     |   647 +
 kernel/fs/ocfs2/blockcheck.h                     |   107 +
 kernel/fs/ocfs2/buffer_head_io.c                 |   427 +
 kernel/fs/ocfs2/buffer_head_io.h                 |    77 +
 kernel/fs/ocfs2/cluster/Makefile                 |     4 +
 kernel/fs/ocfs2/cluster/heartbeat.c              |  2678 ++++
 kernel/fs/ocfs2/cluster/heartbeat.h              |    90 +
 kernel/fs/ocfs2/cluster/masklog.c                |   155 +
 kernel/fs/ocfs2/cluster/masklog.h                |   221 +
 kernel/fs/ocfs2/cluster/netdebug.c               |   539 +
 kernel/fs/ocfs2/cluster/nodemanager.c            |   987 ++
 kernel/fs/ocfs2/cluster/nodemanager.h            |    88 +
 kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h        |    38 +
 kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h      |    45 +
 kernel/fs/ocfs2/cluster/quorum.c                 |   340 +
 kernel/fs/ocfs2/cluster/quorum.h                 |    36 +
 kernel/fs/ocfs2/cluster/sys.c                    |    82 +
 kernel/fs/ocfs2/cluster/sys.h                    |    33 +
 kernel/fs/ocfs2/cluster/tcp.c                    |  2219 ++++
 kernel/fs/ocfs2/cluster/tcp.h                    |   155 +
 kernel/fs/ocfs2/cluster/tcp_internal.h           |   242 +
 kernel/fs/ocfs2/dcache.c                         |   474 +
 kernel/fs/ocfs2/dcache.h                         |    59 +
 kernel/fs/ocfs2/dir.c                            |  4501 +++++++
 kernel/fs/ocfs2/dir.h                            |   116 +
 kernel/fs/ocfs2/dlm/Makefile                     |     7 +
 kernel/fs/ocfs2/dlm/dlmapi.h                     |   220 +
 kernel/fs/ocfs2/dlm/dlmast.c                     |   504 +
 kernel/fs/ocfs2/dlm/dlmcommon.h                  |  1150 ++
 kernel/fs/ocfs2/dlm/dlmconvert.c                 |   544 +
 kernel/fs/ocfs2/dlm/dlmconvert.h                 |    35 +
 kernel/fs/ocfs2/dlm/dlmdebug.c                   |  1000 ++
 kernel/fs/ocfs2/dlm/dlmdebug.h                   |    81 +
 kernel/fs/ocfs2/dlm/dlmdomain.c                  |  2375 ++++
 kernel/fs/ocfs2/dlm/dlmdomain.h                  |    35 +
 kernel/fs/ocfs2/dlm/dlmlock.c                    |   761 ++
 kernel/fs/ocfs2/dlm/dlmmaster.c                  |  3487 ++++++
 kernel/fs/ocfs2/dlm/dlmrecovery.c                |  2935 +++++
 kernel/fs/ocfs2/dlm/dlmthread.c                  |   756 ++
 kernel/fs/ocfs2/dlm/dlmunlock.c                  |   698 ++
 kernel/fs/ocfs2/dlmfs/Makefile                   |     5 +
 kernel/fs/ocfs2/dlmfs/dlmfs.c                    |   690 +
 kernel/fs/ocfs2/dlmfs/userdlm.c                  |   688 +
 kernel/fs/ocfs2/dlmfs/userdlm.h                  |   113 +
 kernel/fs/ocfs2/dlmglue.c                        |  4106 ++++++
 kernel/fs/ocfs2/dlmglue.h                        |   173 +
 kernel/fs/ocfs2/export.c                         |   271 +
 kernel/fs/ocfs2/export.h                         |    33 +
 kernel/fs/ocfs2/extent_map.c                     |   994 ++
 kernel/fs/ocfs2/extent_map.h                     |    92 +
 kernel/fs/ocfs2/file.c                           |  2702 ++++
 kernel/fs/ocfs2/file.h                           |    85 +
 kernel/fs/ocfs2/heartbeat.c                      |   134 +
 kernel/fs/ocfs2/heartbeat.h                      |    45 +
 kernel/fs/ocfs2/inode.c                          |  1460 +++
 kernel/fs/ocfs2/inode.h                          |   190 +
 kernel/fs/ocfs2/ioctl.c                          |  1005 ++
 kernel/fs/ocfs2/ioctl.h                          |    16 +
 kernel/fs/ocfs2/journal.c                        |  2323 ++++
 kernel/fs/ocfs2/journal.h                        |   645 +
 kernel/fs/ocfs2/localalloc.c                     |  1343 ++
 kernel/fs/ocfs2/localalloc.h                     |    68 +
 kernel/fs/ocfs2/locks.c                          |   141 +
 kernel/fs/ocfs2/locks.h                          |    32 +
 kernel/fs/ocfs2/mmap.c                           |   193 +
 kernel/fs/ocfs2/mmap.h                           |     6 +
 kernel/fs/ocfs2/move_extents.c                   |  1075 ++
 kernel/fs/ocfs2/move_extents.h                   |    22 +
 kernel/fs/ocfs2/namei.c                          |  2921 +++++
 kernel/fs/ocfs2/namei.h                          |    51 +
 kernel/fs/ocfs2/ocfs1_fs_compat.h                |   109 +
 kernel/fs/ocfs2/ocfs2.h                          |   919 ++
 kernel/fs/ocfs2/ocfs2_fs.h                       |  1651 +++
 kernel/fs/ocfs2/ocfs2_ioctl.h                    |   242 +
 kernel/fs/ocfs2/ocfs2_lockid.h                   |   128 +
 kernel/fs/ocfs2/ocfs2_lockingver.h               |    32 +
 kernel/fs/ocfs2/ocfs2_trace.h                    |  2768 ++++
 kernel/fs/ocfs2/quota.h                          |   123 +
 kernel/fs/ocfs2/quota_global.c                   |   971 ++
 kernel/fs/ocfs2/quota_local.c                    |  1315 ++
 kernel/fs/ocfs2/refcounttree.c                   |  4474 +++++++
 kernel/fs/ocfs2/refcounttree.h                   |   118 +
 kernel/fs/ocfs2/reservations.c                   |   839 ++
 kernel/fs/ocfs2/reservations.h                   |   159 +
 kernel/fs/ocfs2/resize.c                         |   589 +
 kernel/fs/ocfs2/resize.h                         |    32 +
 kernel/fs/ocfs2/slot_map.c                       |   538 +
 kernel/fs/ocfs2/slot_map.h                       |    44 +
 kernel/fs/ocfs2/stack_o2cb.c                     |   449 +
 kernel/fs/ocfs2/stack_user.c                     |  1134 ++
 kernel/fs/ocfs2/stackglue.c                      |   748 ++
 kernel/fs/ocfs2/stackglue.h                      |   301 +
 kernel/fs/ocfs2/suballoc.c                       |  2919 +++++
 kernel/fs/ocfs2/suballoc.h                       |   237 +
 kernel/fs/ocfs2/super.c                          |  2644 ++++
 kernel/fs/ocfs2/super.h                          |    53 +
 kernel/fs/ocfs2/symlink.c                        |   100 +
 kernel/fs/ocfs2/symlink.h                        |    42 +
 kernel/fs/ocfs2/sysfile.c                        |   182 +
 kernel/fs/ocfs2/sysfile.h                        |    33 +
 kernel/fs/ocfs2/uptodate.c                       |   638 +
 kernel/fs/ocfs2/uptodate.h                       |    84 +
 kernel/fs/ocfs2/xattr.c                          |  7425 +++++++++++
 kernel/fs/ocfs2/xattr.h                          |   100 +
 kernel/fs/omfs/Kconfig                           |    13 +
 kernel/fs/omfs/Makefile                          |     4 +
 kernel/fs/omfs/bitmap.c                          |   193 +
 kernel/fs/omfs/dir.c                             |   457 +
 kernel/fs/omfs/file.c                            |   383 +
 kernel/fs/omfs/inode.c                           |   596 +
 kernel/fs/omfs/omfs.h                            |    68 +
 kernel/fs/omfs/omfs_fs.h                         |    82 +
 kernel/fs/open.c                                 |  1143 ++
 kernel/fs/openpromfs/Makefile                    |     7 +
 kernel/fs/openpromfs/inode.c                     |   471 +
 kernel/fs/overlayfs/Kconfig                      |    10 +
 kernel/fs/overlayfs/Makefile                     |     7 +
 kernel/fs/overlayfs/copy_up.c                    |   416 +
 kernel/fs/overlayfs/dir.c                        |   951 ++
 kernel/fs/overlayfs/inode.c                      |   436 +
 kernel/fs/overlayfs/overlayfs.h                  |   199 +
 kernel/fs/overlayfs/readdir.c                    |   557 +
 kernel/fs/overlayfs/super.c                      |  1046 ++
 kernel/fs/pipe.c                                 |  1125 ++
 kernel/fs/pnode.c                                |   452 +
 kernel/fs/pnode.h                                |    57 +
 kernel/fs/posix_acl.c                            |   905 ++
 kernel/fs/proc/Kconfig                           |    73 +
 kernel/fs/proc/Makefile                          |    32 +
 kernel/fs/proc/array.c                           |   695 +
 kernel/fs/proc/base.c                            |  3214 +++++
 kernel/fs/proc/cmdline.c                         |    29 +
 kernel/fs/proc/consoles.c                        |   112 +
 kernel/fs/proc/cpuinfo.c                         |    24 +
 kernel/fs/proc/devices.c                         |    70 +
 kernel/fs/proc/fd.c                              |   357 +
 kernel/fs/proc/fd.h                              |    19 +
 kernel/fs/proc/generic.c                         |   645 +
 kernel/fs/proc/inode.c                           |   489 +
 kernel/fs/proc/internal.h                        |   305 +
 kernel/fs/proc/interrupts.c                      |    53 +
 kernel/fs/proc/kcore.c                           |   644 +
 kernel/fs/proc/kmsg.c                            |    64 +
 kernel/fs/proc/loadavg.c                         |    45 +
 kernel/fs/proc/meminfo.c                         |   234 +
 kernel/fs/proc/namespaces.c                      |   172 +
 kernel/fs/proc/nommu.c                           |   134 +
 kernel/fs/proc/page.c                            |   234 +
 kernel/fs/proc/proc_net.c                        |   233 +
 kernel/fs/proc/proc_sysctl.c                     |  1619 +++
 kernel/fs/proc/proc_tty.c                        |   189 +
 kernel/fs/proc/root.c                            |   270 +
 kernel/fs/proc/self.c                            |    84 +
 kernel/fs/proc/softirqs.c                        |    44 +
 kernel/fs/proc/stat.c                            |   206 +
 kernel/fs/proc/task_mmu.c                        |  1625 +++
 kernel/fs/proc/task_nommu.c                      |   345 +
 kernel/fs/proc/thread_self.c                     |    85 +
 kernel/fs/proc/uptime.c                          |    52 +
 kernel/fs/proc/version.c                         |    34 +
 kernel/fs/proc/vmcore.c                          |  1194 ++
 kernel/fs/proc_namespace.c                       |   335 +
 kernel/fs/pstore/Kconfig                         |    62 +
 kernel/fs/pstore/Makefile                        |    13 +
 kernel/fs/pstore/ftrace.c                        |   131 +
 kernel/fs/pstore/inode.c                         |   483 +
 kernel/fs/pstore/internal.h                      |    64 +
 kernel/fs/pstore/platform.c                      |   547 +
 kernel/fs/pstore/pmsg.c                          |   114 +
 kernel/fs/pstore/ram.c                           |   648 +
 kernel/fs/pstore/ram_core.c                      |   539 +
 kernel/fs/qnx4/Kconfig                           |    14 +
 kernel/fs/qnx4/Makefile                          |     7 +
 kernel/fs/qnx4/README                            |     9 +
 kernel/fs/qnx4/bitmap.c                          |    44 +
 kernel/fs/qnx4/dir.c                             |    81 +
 kernel/fs/qnx4/inode.c                           |   426 +
 kernel/fs/qnx4/namei.c                           |   125 +
 kernel/fs/qnx4/qnx4.h                            |    45 +
 kernel/fs/qnx6/Kconfig                           |    26 +
 kernel/fs/qnx6/Makefile                          |     8 +
 kernel/fs/qnx6/README                            |     8 +
 kernel/fs/qnx6/dir.c                             |   286 +
 kernel/fs/qnx6/inode.c                           |   685 +
 kernel/fs/qnx6/namei.c                           |    42 +
 kernel/fs/qnx6/qnx6.h                            |   135 +
 kernel/fs/qnx6/super_mmi.c                       |   148 +
 kernel/fs/quota/Kconfig                          |    76 +
 kernel/fs/quota/Makefile                         |     7 +
 kernel/fs/quota/compat.c                         |   118 +
 kernel/fs/quota/dquot.c                          |  2889 +++++
 kernel/fs/quota/kqid.c                           |   132 +
 kernel/fs/quota/netlink.c                        |   109 +
 kernel/fs/quota/quota.c                          |   808 ++
 kernel/fs/quota/quota_tree.c                     |   670 +
 kernel/fs/quota/quota_tree.h                     |    25 +
 kernel/fs/quota/quota_v1.c                       |   235 +
 kernel/fs/quota/quota_v2.c                       |   346 +
 kernel/fs/quota/quotaio_v1.h                     |    33 +
 kernel/fs/quota/quotaio_v2.h                     |    75 +
 kernel/fs/ramfs/Makefile                         |     9 +
 kernel/fs/ramfs/file-mmu.c                       |    46 +
 kernel/fs/ramfs/file-nommu.c                     |   271 +
 kernel/fs/ramfs/inode.c                          |   266 +
 kernel/fs/ramfs/internal.h                       |    13 +
 kernel/fs/read_write.c                           |  1329 ++
 kernel/fs/readdir.c                              |   309 +
 kernel/fs/reiserfs/Kconfig                       |    88 +
 kernel/fs/reiserfs/Makefile                      |    38 +
 kernel/fs/reiserfs/README                        |   161 +
 kernel/fs/reiserfs/acl.h                         |    76 +
 kernel/fs/reiserfs/bitmap.c                      |  1468 +++
 kernel/fs/reiserfs/dir.c                         |   346 +
 kernel/fs/reiserfs/do_balan.c                    |  1911 +++
 kernel/fs/reiserfs/file.c                        |   270 +
 kernel/fs/reiserfs/fix_node.c                    |  2825 +++++
 kernel/fs/reiserfs/hashes.c                      |   177 +
 kernel/fs/reiserfs/ibalance.c                    |  1160 ++
 kernel/fs/reiserfs/inode.c                       |  3461 +++++
 kernel/fs/reiserfs/ioctl.c                       |   230 +
 kernel/fs/reiserfs/item_ops.c                    |   752 ++
 kernel/fs/reiserfs/journal.c                     |  4403 +++++++
 kernel/fs/reiserfs/lbalance.c                    |  1427 +++
 kernel/fs/reiserfs/lock.c                        |   100 +
 kernel/fs/reiserfs/namei.c                       |  1659 +++
 kernel/fs/reiserfs/objectid.c                    |   217 +
 kernel/fs/reiserfs/prints.c                      |   777 ++
 kernel/fs/reiserfs/procfs.c                      |   508 +
 kernel/fs/reiserfs/reiserfs.h                    |  3411 +++++
 kernel/fs/reiserfs/resize.c                      |   229 +
 kernel/fs/reiserfs/stree.c                       |  2262 ++++
 kernel/fs/reiserfs/super.c                       |  2563 ++++
 kernel/fs/reiserfs/tail_conversion.c             |   317 +
 kernel/fs/reiserfs/xattr.c                       |  1064 ++
 kernel/fs/reiserfs/xattr.h                       |   122 +
 kernel/fs/reiserfs/xattr_acl.c                   |   407 +
 kernel/fs/reiserfs/xattr_security.c              |   120 +
 kernel/fs/reiserfs/xattr_trusted.c               |    56 +
 kernel/fs/reiserfs/xattr_user.c                  |    52 +
 kernel/fs/romfs/Kconfig                          |    62 +
 kernel/fs/romfs/Makefile                         |    12 +
 kernel/fs/romfs/internal.h                       |    47 +
 kernel/fs/romfs/mmap-nommu.c                     |    89 +
 kernel/fs/romfs/storage.c                        |   293 +
 kernel/fs/romfs/super.c                          |   659 +
 kernel/fs/select.c                               |  1040 ++
 kernel/fs/seq_file.c                             |   968 ++
 kernel/fs/signalfd.c                             |   342 +
 kernel/fs/splice.c                               |  2034 +++
 kernel/fs/squashfs/Kconfig                       |   210 +
 kernel/fs/squashfs/Makefile                      |    17 +
 kernel/fs/squashfs/block.c                       |   214 +
 kernel/fs/squashfs/cache.c                       |   458 +
 kernel/fs/squashfs/decompressor.c                |   148 +
 kernel/fs/squashfs/decompressor.h                |    61 +
 kernel/fs/squashfs/decompressor_multi.c          |   198 +
 kernel/fs/squashfs/decompressor_multi_percpu.c   |    97 +
 kernel/fs/squashfs/decompressor_single.c         |    85 +
 kernel/fs/squashfs/dir.c                         |   236 +
 kernel/fs/squashfs/export.c                      |   163 +
 kernel/fs/squashfs/file.c                        |   503 +
 kernel/fs/squashfs/file_cache.c                  |    38 +
 kernel/fs/squashfs/file_direct.c                 |   176 +
 kernel/fs/squashfs/fragment.c                    |    99 +
 kernel/fs/squashfs/id.c                          |   102 +
 kernel/fs/squashfs/inode.c                       |   429 +
 kernel/fs/squashfs/lz4_wrapper.c                 |   142 +
 kernel/fs/squashfs/lzo_wrapper.c                 |   130 +
 kernel/fs/squashfs/namei.c                       |   252 +
 kernel/fs/squashfs/page_actor.c                  |   100 +
 kernel/fs/squashfs/page_actor.h                  |    81 +
 kernel/fs/squashfs/squashfs.h                    |   113 +
 kernel/fs/squashfs/squashfs_fs.h                 |   457 +
 kernel/fs/squashfs/squashfs_fs_i.h               |    54 +
 kernel/fs/squashfs/squashfs_fs_sb.h              |    80 +
 kernel/fs/squashfs/super.c                       |   508 +
 kernel/fs/squashfs/symlink.c                     |   127 +
 kernel/fs/squashfs/xattr.c                       |   324 +
 kernel/fs/squashfs/xattr.h                       |    47 +
 kernel/fs/squashfs/xattr_id.c                    |    95 +
 kernel/fs/squashfs/xz_wrapper.c                  |   193 +
 kernel/fs/squashfs/zlib_wrapper.c                |   135 +
 kernel/fs/stack.c                                |    76 +
 kernel/fs/stat.c                                 |   511 +
 kernel/fs/statfs.c                               |   241 +
 kernel/fs/super.c                                |  1396 +++
 kernel/fs/sync.c                                 |   366 +
 kernel/fs/sysfs/Kconfig                          |    24 +
 kernel/fs/sysfs/Makefile                         |     5 +
 kernel/fs/sysfs/dir.c                            |   157 +
 kernel/fs/sysfs/file.c                           |   497 +
 kernel/fs/sysfs/group.c                          |   354 +
 kernel/fs/sysfs/mount.c                          |    79 +
 kernel/fs/sysfs/symlink.c                        |   197 +
 kernel/fs/sysfs/sysfs.h                          |    43 +
 kernel/fs/sysv/Kconfig                           |    36 +
 kernel/fs/sysv/Makefile                          |     8 +
 kernel/fs/sysv/balloc.c                          |   239 +
 kernel/fs/sysv/dir.c                             |   372 +
 kernel/fs/sysv/file.c                            |    57 +
 kernel/fs/sysv/ialloc.c                          |   234 +
 kernel/fs/sysv/inode.c                           |   370 +
 kernel/fs/sysv/itree.c                           |   501 +
 kernel/fs/sysv/namei.c                           |   290 +
 kernel/fs/sysv/super.c                           |   593 +
 kernel/fs/sysv/symlink.c                         |    20 +
 kernel/fs/sysv/sysv.h                            |   247 +
 kernel/fs/timerfd.c                              |   571 +
 kernel/fs/tracefs/Makefile                       |     4 +
 kernel/fs/tracefs/inode.c                        |   648 +
 kernel/fs/ubifs/Kconfig                          |    37 +
 kernel/fs/ubifs/Makefile                         |     6 +
 kernel/fs/ubifs/budget.c                         |   730 ++
 kernel/fs/ubifs/commit.c                         |   734 ++
 kernel/fs/ubifs/compress.c                       |   250 +
 kernel/fs/ubifs/debug.c                          |  3104 +++++
 kernel/fs/ubifs/debug.h                          |   315 +
 kernel/fs/ubifs/dir.c                            |  1202 ++
 kernel/fs/ubifs/file.c                           |  1594 +++
 kernel/fs/ubifs/find.c                           |   985 ++
 kernel/fs/ubifs/gc.c                             |   984 ++
 kernel/fs/ubifs/io.c                             |  1150 ++
 kernel/fs/ubifs/ioctl.c                          |   205 +
 kernel/fs/ubifs/journal.c                        |  1466 +++
 kernel/fs/ubifs/key.h                            |   548 +
 kernel/fs/ubifs/log.c                            |   753 ++
 kernel/fs/ubifs/lprops.c                         |  1321 ++
 kernel/fs/ubifs/lpt.c                            |  2278 ++++
 kernel/fs/ubifs/lpt_commit.c                     |  2037 +++
 kernel/fs/ubifs/master.c                         |   395 +
 kernel/fs/ubifs/misc.h                           |   303 +
 kernel/fs/ubifs/orphan.c                         |   956 ++
 kernel/fs/ubifs/recovery.c                       |  1547 +++
 kernel/fs/ubifs/replay.c                         |  1082 ++
 kernel/fs/ubifs/sb.c                             |   809 ++
 kernel/fs/ubifs/scan.c                           |   379 +
 kernel/fs/ubifs/shrinker.c                       |   331 +
 kernel/fs/ubifs/super.c                          |  2300 ++++
 kernel/fs/ubifs/tnc.c                            |  3327 +++++
 kernel/fs/ubifs/tnc_commit.c                     |  1071 ++
 kernel/fs/ubifs/tnc_misc.c                       |   494 +
 kernel/fs/ubifs/ubifs-media.h                    |   784 ++
 kernel/fs/ubifs/ubifs.h                          |  1803 +++
 kernel/fs/ubifs/xattr.c                          |   666 +
 kernel/fs/udf/Kconfig                            |    20 +
 kernel/fs/udf/Makefile                           |     9 +
 kernel/fs/udf/balloc.c                           |   821 ++
 kernel/fs/udf/dir.c                              |   199 +
 kernel/fs/udf/directory.c                        |   240 +
 kernel/fs/udf/ecma_167.h                         |   796 ++
 kernel/fs/udf/file.c                             |   273 +
 kernel/fs/udf/ialloc.c                           |   133 +
 kernel/fs/udf/inode.c                            |  2291 ++++
 kernel/fs/udf/lowlevel.c                         |    67 +
 kernel/fs/udf/misc.c                             |   298 +
 kernel/fs/udf/namei.c                            |  1300 ++
 kernel/fs/udf/osta_udf.h                         |   279 +
 kernel/fs/udf/partition.c                        |   338 +
 kernel/fs/udf/super.c                            |  2469 ++++
 kernel/fs/udf/symlink.c                          |   159 +
 kernel/fs/udf/truncate.c                         |   286 +
 kernel/fs/udf/udf_i.h                            |    62 +
 kernel/fs/udf/udf_sb.h                           |   184 +
 kernel/fs/udf/udfdecl.h                          |   255 +
 kernel/fs/udf/udfend.h                           |    77 +
 kernel/fs/udf/udftime.c                          |   170 +
 kernel/fs/udf/unicode.c                          |   496 +
 kernel/fs/ufs/Kconfig                            |    43 +
 kernel/fs/ufs/Makefile                           |     9 +
 kernel/fs/ufs/balloc.c                           |   938 ++
 kernel/fs/ufs/cylinder.c                         |   201 +
 kernel/fs/ufs/dir.c                              |   662 +
 kernel/fs/ufs/file.c                             |    44 +
 kernel/fs/ufs/ialloc.c                           |   353 +
 kernel/fs/ufs/inode.c                            |   910 ++
 kernel/fs/ufs/namei.c                            |   356 +
 kernel/fs/ufs/super.c                            |  1524 +++
 kernel/fs/ufs/swab.h                             |   115 +
 kernel/fs/ufs/symlink.c                          |    53 +
 kernel/fs/ufs/truncate.c                         |   523 +
 kernel/fs/ufs/ufs.h                              |   176 +
 kernel/fs/ufs/ufs_fs.h                           |   960 ++
 kernel/fs/ufs/util.c                             |   282 +
 kernel/fs/ufs/util.h                             |   592 +
 kernel/fs/utimes.c                               |   235 +
 kernel/fs/xattr.c                                |   972 ++
 kernel/fs/xfs/Kconfig                            |    97 +
 kernel/fs/xfs/Makefile                           |   124 +
 kernel/fs/xfs/kmem.c                             |   127 +
 kernel/fs/xfs/kmem.h                             |   125 +
 kernel/fs/xfs/libxfs/xfs_alloc.c                 |  2639 ++++
 kernel/fs/xfs/libxfs/xfs_alloc.h                 |   237 +
 kernel/fs/xfs/libxfs/xfs_alloc_btree.c           |   503 +
 kernel/fs/xfs/libxfs/xfs_alloc_btree.h           |    65 +
 kernel/fs/xfs/libxfs/xfs_attr.c                  |  1456 +++
 kernel/fs/xfs/libxfs/xfs_attr_leaf.c             |  2773 ++++
 kernel/fs/xfs/libxfs/xfs_attr_leaf.h             |   110 +
 kernel/fs/xfs/libxfs/xfs_attr_remote.c           |   626 +
 kernel/fs/xfs/libxfs/xfs_attr_remote.h           |    27 +
 kernel/fs/xfs/libxfs/xfs_attr_sf.h               |    70 +
 kernel/fs/xfs/libxfs/xfs_bit.h                   |    87 +
 kernel/fs/xfs/libxfs/xfs_bmap.c                  |  5945 +++++++++
 kernel/fs/xfs/libxfs/xfs_bmap.h                  |   225 +
 kernel/fs/xfs/libxfs/xfs_bmap_btree.c            |   883 ++
 kernel/fs/xfs/libxfs/xfs_bmap_btree.h            |   143 +
 kernel/fs/xfs/libxfs/xfs_btree.c                 |  4067 ++++++
 kernel/fs/xfs/libxfs/xfs_btree.h                 |   468 +
 kernel/fs/xfs/libxfs/xfs_cksum.h                 |    63 +
 kernel/fs/xfs/libxfs/xfs_da_btree.c              |  2660 ++++
 kernel/fs/xfs/libxfs/xfs_da_btree.h              |   221 +
 kernel/fs/xfs/libxfs/xfs_da_format.c             |   908 ++
 kernel/fs/xfs/libxfs/xfs_da_format.h             |   873 ++
 kernel/fs/xfs/libxfs/xfs_dir2.c                  |   731 ++
 kernel/fs/xfs/libxfs/xfs_dir2.h                  |   320 +
 kernel/fs/xfs/libxfs/xfs_dir2_block.c            |  1254 ++
 kernel/fs/xfs/libxfs/xfs_dir2_data.c             |  1049 ++
 kernel/fs/xfs/libxfs/xfs_dir2_leaf.c             |  1819 +++
 kernel/fs/xfs/libxfs/xfs_dir2_node.c             |  2270 ++++
 kernel/fs/xfs/libxfs/xfs_dir2_priv.h             |   134 +
 kernel/fs/xfs/libxfs/xfs_dir2_sf.c               |  1142 ++
 kernel/fs/xfs/libxfs/xfs_dquot_buf.c             |   288 +
 kernel/fs/xfs/libxfs/xfs_format.h                |  1461 +++
 kernel/fs/xfs/libxfs/xfs_fs.h                    |   576 +
 kernel/fs/xfs/libxfs/xfs_ialloc.c                |  2202 ++++
 kernel/fs/xfs/libxfs/xfs_ialloc.h                |   167 +
 kernel/fs/xfs/libxfs/xfs_ialloc_btree.c          |   420 +
 kernel/fs/xfs/libxfs/xfs_ialloc_btree.h          |    65 +
 kernel/fs/xfs/libxfs/xfs_inode_buf.c             |   476 +
 kernel/fs/xfs/libxfs/xfs_inode_buf.h             |    50 +
 kernel/fs/xfs/libxfs/xfs_inode_fork.c            |  1902 +++
 kernel/fs/xfs/libxfs/xfs_inode_fork.h            |   171 +
 kernel/fs/xfs/libxfs/xfs_log_format.h            |   679 +
 kernel/fs/xfs/libxfs/xfs_log_recover.h           |    66 +
 kernel/fs/xfs/libxfs/xfs_log_rlimit.c            |   148 +
 kernel/fs/xfs/libxfs/xfs_quota_defs.h            |   159 +
 kernel/fs/xfs/libxfs/xfs_rtbitmap.c              |   991 ++
 kernel/fs/xfs/libxfs/xfs_sb.c                    |   803 ++
 kernel/fs/xfs/libxfs/xfs_sb.h                    |    38 +
 kernel/fs/xfs/libxfs/xfs_shared.h                |   243 +
 kernel/fs/xfs/libxfs/xfs_symlink_remote.c        |   201 +
 kernel/fs/xfs/libxfs/xfs_trans_resv.c            |   878 ++
 kernel/fs/xfs/libxfs/xfs_trans_resv.h            |   116 +
 kernel/fs/xfs/libxfs/xfs_trans_space.h           |    92 +
 kernel/fs/xfs/libxfs/xfs_types.h                 |   137 +
 kernel/fs/xfs/mrlock.h                           |    90 +
 kernel/fs/xfs/uuid.c                             |    63 +
 kernel/fs/xfs/uuid.h                             |    35 +
 kernel/fs/xfs/xfs.h                              |    34 +
 kernel/fs/xfs/xfs_acl.c                          |   304 +
 kernel/fs/xfs/xfs_acl.h                          |    39 +
 kernel/fs/xfs/xfs_aops.c                         |  1931 +++
 kernel/fs/xfs/xfs_aops.h                         |    60 +
 kernel/fs/xfs/xfs_attr.h                         |   154 +
 kernel/fs/xfs/xfs_attr_inactive.c                |   465 +
 kernel/fs/xfs/xfs_attr_list.c                    |   653 +
 kernel/fs/xfs/xfs_bit.c                          |   118 +
 kernel/fs/xfs/xfs_bmap_util.c                    |  1920 +++
 kernel/fs/xfs/xfs_bmap_util.h                    |    79 +
 kernel/fs/xfs/xfs_buf.c                          |  1901 +++
 kernel/fs/xfs/xfs_buf.h                          |   393 +
 kernel/fs/xfs/xfs_buf_item.c                     |  1155 ++
 kernel/fs/xfs/xfs_buf_item.h                     |    76 +
 kernel/fs/xfs/xfs_dir2_readdir.c                 |   681 +
 kernel/fs/xfs/xfs_discard.c                      |   239 +
 kernel/fs/xfs/xfs_discard.h                      |    10 +
 kernel/fs/xfs/xfs_dquot.c                        |  1104 ++
 kernel/fs/xfs/xfs_dquot.h                        |   188 +
 kernel/fs/xfs/xfs_dquot_item.c                   |   443 +
 kernel/fs/xfs/xfs_dquot_item.h                   |    47 +
 kernel/fs/xfs/xfs_error.c                        |   180 +
 kernel/fs/xfs/xfs_error.h                        |   153 +
 kernel/fs/xfs/xfs_export.c                       |   254 +
 kernel/fs/xfs/xfs_export.h                       |    72 +
 kernel/fs/xfs/xfs_extent_busy.c                  |   604 +
 kernel/fs/xfs/xfs_extent_busy.h                  |    73 +
 kernel/fs/xfs/xfs_extfree_item.c                 |   507 +
 kernel/fs/xfs/xfs_extfree_item.h                 |    81 +
 kernel/fs/xfs/xfs_file.c                         |  1534 +++
 kernel/fs/xfs/xfs_filestream.c                   |   431 +
 kernel/fs/xfs/xfs_filestream.h                   |    40 +
 kernel/fs/xfs/xfs_fsops.c                        |   850 ++
 kernel/fs/xfs/xfs_fsops.h                        |    30 +
 kernel/fs/xfs/xfs_globals.c                      |    49 +
 kernel/fs/xfs/xfs_icache.c                       |  1418 +++
 kernel/fs/xfs/xfs_icache.h                       |   115 +
 kernel/fs/xfs/xfs_icreate_item.c                 |   188 +
 kernel/fs/xfs/xfs_icreate_item.h                 |    34 +
 kernel/fs/xfs/xfs_inode.c                        |  3606 ++++++
 kernel/fs/xfs/xfs_inode.h                        |   456 +
 kernel/fs/xfs/xfs_inode_item.c                   |   789 ++
 kernel/fs/xfs/xfs_inode_item.h                   |    54 +
 kernel/fs/xfs/xfs_ioctl.c                        |  1806 +++
 kernel/fs/xfs/xfs_ioctl.h                        |    95 +
 kernel/fs/xfs/xfs_ioctl32.c                      |   680 +
 kernel/fs/xfs/xfs_ioctl32.h                      |   238 +
 kernel/fs/xfs/xfs_iomap.c                        |   920 ++
 kernel/fs/xfs/xfs_iomap.h                        |    32 +
 kernel/fs/xfs/xfs_iops.c                         |  1305 ++
 kernel/fs/xfs/xfs_iops.h                         |    38 +
 kernel/fs/xfs/xfs_itable.c                       |   652 +
 kernel/fs/xfs/xfs_itable.h                       |    99 +
 kernel/fs/xfs/xfs_linux.h                        |   384 +
 kernel/fs/xfs/xfs_log.c                          |  4007 ++++++
 kernel/fs/xfs/xfs_log.h                          |   193 +
 kernel/fs/xfs/xfs_log_cil.c                      |   998 ++
 kernel/fs/xfs/xfs_log_priv.h                     |   561 +
 kernel/fs/xfs/xfs_log_recover.c                  |  4651 +++++++
 kernel/fs/xfs/xfs_message.c                      |   113 +
 kernel/fs/xfs/xfs_message.h                      |    64 +
 kernel/fs/xfs/xfs_mount.c                        |  1283 ++
 kernel/fs/xfs/xfs_mount.h                        |   335 +
 kernel/fs/xfs/xfs_mru_cache.c                    |   552 +
 kernel/fs/xfs/xfs_mru_cache.h                    |    46 +
 kernel/fs/xfs/xfs_pnfs.c                         |   329 +
 kernel/fs/xfs/xfs_pnfs.h                         |    19 +
 kernel/fs/xfs/xfs_qm.c                           |  1939 +++
 kernel/fs/xfs/xfs_qm.h                           |   174 +
 kernel/fs/xfs/xfs_qm_bhv.c                       |   149 +
 kernel/fs/xfs/xfs_qm_syscalls.c                  |   770 ++
 kernel/fs/xfs/xfs_quota.h                        |   155 +
 kernel/fs/xfs/xfs_quotaops.c                     |   271 +
 kernel/fs/xfs/xfs_rtalloc.c                      |  1302 ++
 kernel/fs/xfs/xfs_rtalloc.h                      |   145 +
 kernel/fs/xfs/xfs_stats.c                        |   198 +
 kernel/fs/xfs/xfs_stats.h                        |   249 +
 kernel/fs/xfs/xfs_super.c                        |  1889 +++
 kernel/fs/xfs/xfs_super.h                        |    79 +
 kernel/fs/xfs/xfs_symlink.c                      |   608 +
 kernel/fs/xfs/xfs_symlink.h                      |    27 +
 kernel/fs/xfs/xfs_sysctl.c                       |   243 +
 kernel/fs/xfs/xfs_sysctl.h                       |   108 +
 kernel/fs/xfs/xfs_sysfs.c                        |   239 +
 kernel/fs/xfs/xfs_sysfs.h                        |    60 +
 kernel/fs/xfs/xfs_trace.c                        |    54 +
 kernel/fs/xfs/xfs_trace.h                        |  2083 +++
 kernel/fs/xfs/xfs_trans.c                        |  1105 ++
 kernel/fs/xfs/xfs_trans.h                        |   245 +
 kernel/fs/xfs/xfs_trans_ail.c                    |   794 ++
 kernel/fs/xfs/xfs_trans_buf.c                    |   802 ++
 kernel/fs/xfs/xfs_trans_dquot.c                  |   887 ++
 kernel/fs/xfs/xfs_trans_extfree.c                |   133 +
 kernel/fs/xfs/xfs_trans_inode.c                  |   135 +
 kernel/fs/xfs/xfs_trans_priv.h                   |   161 +
 kernel/fs/xfs/xfs_xattr.c                        |   243 +
 1804 files changed, 1171684 insertions(+)
 create mode 100644 kernel/fs/9p/Kconfig
 create mode 100644 kernel/fs/9p/Makefile
 create mode 100644 kernel/fs/9p/acl.c
 create mode 100644 kernel/fs/9p/acl.h
 create mode 100644 kernel/fs/9p/cache.c
 create mode 100644 kernel/fs/9p/cache.h
 create mode 100644 kernel/fs/9p/fid.c
 create mode 100644 kernel/fs/9p/fid.h
 create mode 100644 kernel/fs/9p/v9fs.c
 create mode 100644 kernel/fs/9p/v9fs.h
 create mode 100644 kernel/fs/9p/v9fs_vfs.h
 create mode 100644 kernel/fs/9p/vfs_addr.c
 create mode 100644 kernel/fs/9p/vfs_dentry.c
 create mode 100644 kernel/fs/9p/vfs_dir.c
 create mode 100644 kernel/fs/9p/vfs_file.c
 create mode 100644 kernel/fs/9p/vfs_inode.c
 create mode 100644 kernel/fs/9p/vfs_inode_dotl.c
 create mode 100644 kernel/fs/9p/vfs_super.c
 create mode 100644 kernel/fs/9p/xattr.c
 create mode 100644 kernel/fs/9p/xattr.h
 create mode 100644 kernel/fs/9p/xattr_security.c
 create mode 100644 kernel/fs/9p/xattr_trusted.c
 create mode 100644 kernel/fs/9p/xattr_user.c
 create mode 100644 kernel/fs/Kconfig
 create mode 100644 kernel/fs/Kconfig.binfmt
 create mode 100644 kernel/fs/Makefile
 create mode 100644 kernel/fs/adfs/Kconfig
 create mode 100644 kernel/fs/adfs/Makefile
 create mode 100644 kernel/fs/adfs/adfs.h
 create mode 100644 kernel/fs/adfs/dir.c
 create mode 100644 kernel/fs/adfs/dir_f.c
 create mode 100644 kernel/fs/adfs/dir_f.h
 create mode 100644 kernel/fs/adfs/dir_fplus.c
 create mode 100644 kernel/fs/adfs/dir_fplus.h
 create mode 100644 kernel/fs/adfs/file.c
 create mode 100644 kernel/fs/adfs/inode.c
 create mode 100644 kernel/fs/adfs/map.c
 create mode 100644 kernel/fs/adfs/super.c
 create mode 100644 kernel/fs/affs/Changes
 create mode 100644 kernel/fs/affs/Kconfig
 create mode 100644 kernel/fs/affs/Makefile
 create mode 100644 kernel/fs/affs/affs.h
 create mode 100644 kernel/fs/affs/amigaffs.c
 create mode 100644 kernel/fs/affs/bitmap.c
 create mode 100644 kernel/fs/affs/dir.c
 create mode 100644 kernel/fs/affs/file.c
 create mode 100644 kernel/fs/affs/inode.c
 create mode 100644 kernel/fs/affs/namei.c
 create mode 100644 kernel/fs/affs/super.c
 create mode 100644 kernel/fs/affs/symlink.c
 create mode 100644 kernel/fs/afs/Kconfig
 create mode 100644 kernel/fs/afs/Makefile
 create mode 100644 kernel/fs/afs/afs.h
 create mode 100644 kernel/fs/afs/afs_cm.h
 create mode 100644 kernel/fs/afs/afs_fs.h
 create mode 100644 kernel/fs/afs/afs_vl.h
 create mode 100644 kernel/fs/afs/cache.c
 create mode 100644 kernel/fs/afs/callback.c
 create mode 100644 kernel/fs/afs/cell.c
 create mode 100644 kernel/fs/afs/cmservice.c
 create mode 100644 kernel/fs/afs/dir.c
 create mode 100644 kernel/fs/afs/file.c
 create mode 100644 kernel/fs/afs/flock.c
 create mode 100644 kernel/fs/afs/fsclient.c
 create mode 100644 kernel/fs/afs/inode.c
 create mode 100644 kernel/fs/afs/internal.h
 create mode 100644 kernel/fs/afs/main.c
 create mode 100644 kernel/fs/afs/misc.c
 create mode 100644 kernel/fs/afs/mntpt.c
 create mode 100644 kernel/fs/afs/netdevices.c
 create mode 100644 kernel/fs/afs/proc.c
 create mode 100644 kernel/fs/afs/rxrpc.c
 create mode 100644 kernel/fs/afs/security.c
 create mode 100644 kernel/fs/afs/server.c
 create mode 100644 kernel/fs/afs/super.c
 create mode 100644 kernel/fs/afs/vlclient.c
 create mode 100644 kernel/fs/afs/vlocation.c
 create mode 100644 kernel/fs/afs/vnode.c
 create mode 100644 kernel/fs/afs/volume.c
 create mode 100644 kernel/fs/afs/write.c
 create mode 100644 kernel/fs/aio.c
 create mode 100644 kernel/fs/anon_inodes.c
 create mode 100644 kernel/fs/attr.c
 create mode 100644 kernel/fs/autofs4/Kconfig
 create mode 100644 kernel/fs/autofs4/Makefile
 create mode 100644 kernel/fs/autofs4/autofs_i.h
 create mode 100644 kernel/fs/autofs4/dev-ioctl.c
 create mode 100644 kernel/fs/autofs4/expire.c
 create mode 100644 kernel/fs/autofs4/init.c
 create mode 100644 kernel/fs/autofs4/inode.c
 create mode 100644 kernel/fs/autofs4/root.c
 create mode 100644 kernel/fs/autofs4/symlink.c
 create mode 100644 kernel/fs/autofs4/waitq.c
 create mode 100644 kernel/fs/bad_inode.c
 create mode 100644 kernel/fs/befs/ChangeLog
 create mode 100644 kernel/fs/befs/Kconfig
 create mode 100644 kernel/fs/befs/Makefile
 create mode 100644 kernel/fs/befs/TODO
 create mode 100644 kernel/fs/befs/befs.h
 create mode 100644 kernel/fs/befs/befs_fs_types.h
 create mode 100644 kernel/fs/befs/btree.c
 create mode 100644 kernel/fs/befs/btree.h
 create mode 100644 kernel/fs/befs/datastream.c
 create mode 100644 kernel/fs/befs/datastream.h
 create mode 100644 kernel/fs/befs/debug.c
 create mode 100644 kernel/fs/befs/endian.h
 create mode 100644 kernel/fs/befs/inode.c
 create mode 100644 kernel/fs/befs/inode.h
 create mode 100644 kernel/fs/befs/io.c
 create mode 100644 kernel/fs/befs/io.h
 create mode 100644 kernel/fs/befs/linuxvfs.c
 create mode 100644 kernel/fs/befs/super.c
 create mode 100644 kernel/fs/befs/super.h
 create mode 100644 kernel/fs/bfs/Kconfig
 create mode 100644 kernel/fs/bfs/Makefile
 create mode 100644 kernel/fs/bfs/bfs.h
 create mode 100644 kernel/fs/bfs/dir.c
 create mode 100644 kernel/fs/bfs/file.c
 create mode 100644 kernel/fs/bfs/inode.c
 create mode 100644 kernel/fs/binfmt_aout.c
 create mode 100644 kernel/fs/binfmt_elf.c
 create mode 100644 kernel/fs/binfmt_elf_fdpic.c
 create mode 100644 kernel/fs/binfmt_em86.c
 create mode 100644 kernel/fs/binfmt_flat.c
 create mode 100644 kernel/fs/binfmt_misc.c
 create mode 100644 kernel/fs/binfmt_script.c
 create mode 100644 kernel/fs/block_dev.c
 create mode 100644 kernel/fs/btrfs/Kconfig
 create mode 100644 kernel/fs/btrfs/Makefile
 create mode 100644 kernel/fs/btrfs/acl.c
 create mode 100644 kernel/fs/btrfs/async-thread.c
 create mode 100644 kernel/fs/btrfs/async-thread.h
 create mode 100644 kernel/fs/btrfs/backref.c
 create mode 100644 kernel/fs/btrfs/backref.h
 create mode 100644 kernel/fs/btrfs/btrfs_inode.h
 create mode 100644 kernel/fs/btrfs/check-integrity.c
 create mode 100644 kernel/fs/btrfs/check-integrity.h
 create mode 100644 kernel/fs/btrfs/compression.c
 create mode 100644 kernel/fs/btrfs/compression.h
 create mode 100644 kernel/fs/btrfs/ctree.c
 create mode 100644 kernel/fs/btrfs/ctree.h
 create mode 100644 kernel/fs/btrfs/delayed-inode.c
 create mode 100644 kernel/fs/btrfs/delayed-inode.h
 create mode 100644 kernel/fs/btrfs/delayed-ref.c
 create mode 100644 kernel/fs/btrfs/delayed-ref.h
 create mode 100644 kernel/fs/btrfs/dev-replace.c
 create mode 100644 kernel/fs/btrfs/dev-replace.h
 create mode 100644 kernel/fs/btrfs/dir-item.c
 create mode 100644 kernel/fs/btrfs/disk-io.c
 create mode 100644 kernel/fs/btrfs/disk-io.h
 create mode 100644 kernel/fs/btrfs/export.c
 create mode 100644 kernel/fs/btrfs/export.h
 create mode 100644 kernel/fs/btrfs/extent-tree.c
 create mode 100644 kernel/fs/btrfs/extent_io.c
 create mode 100644 kernel/fs/btrfs/extent_io.h
 create mode 100644 kernel/fs/btrfs/extent_map.c
 create mode 100644 kernel/fs/btrfs/extent_map.h
 create mode 100644 kernel/fs/btrfs/file-item.c
 create mode 100644 kernel/fs/btrfs/file.c
 create mode 100644 kernel/fs/btrfs/free-space-cache.c
 create mode 100644 kernel/fs/btrfs/free-space-cache.h
 create mode 100644 kernel/fs/btrfs/hash.c
 create mode 100644 kernel/fs/btrfs/hash.h
 create mode 100644 kernel/fs/btrfs/inode-item.c
 create mode 100644 kernel/fs/btrfs/inode-map.c
 create mode 100644 kernel/fs/btrfs/inode-map.h
 create mode 100644 kernel/fs/btrfs/inode.c
 create mode 100644 kernel/fs/btrfs/ioctl.c
 create mode 100644 kernel/fs/btrfs/locking.c
 create mode 100644 kernel/fs/btrfs/locking.h
 create mode 100644 kernel/fs/btrfs/lzo.c
 create mode 100644 kernel/fs/btrfs/math.h
 create mode 100644 kernel/fs/btrfs/ordered-data.c
 create mode 100644 kernel/fs/btrfs/ordered-data.h
 create mode 100644 kernel/fs/btrfs/orphan.c
 create mode 100644 kernel/fs/btrfs/print-tree.c
 create mode 100644 kernel/fs/btrfs/print-tree.h
 create mode 100644 kernel/fs/btrfs/props.c
 create mode 100644 kernel/fs/btrfs/props.h
 create mode 100644 kernel/fs/btrfs/qgroup.c
 create mode 100644 kernel/fs/btrfs/qgroup.h
 create mode 100644 kernel/fs/btrfs/raid56.c
 create mode 100644 kernel/fs/btrfs/raid56.h
 create mode 100644 kernel/fs/btrfs/rcu-string.h
 create mode 100644 kernel/fs/btrfs/reada.c
 create mode 100644 kernel/fs/btrfs/relocation.c
 create mode 100644 kernel/fs/btrfs/root-tree.c
 create mode 100644 kernel/fs/btrfs/scrub.c
 create mode 100644 kernel/fs/btrfs/send.c
 create mode 100644 kernel/fs/btrfs/send.h
 create mode 100644 kernel/fs/btrfs/struct-funcs.c
 create mode 100644 kernel/fs/btrfs/super.c
 create mode 100644 kernel/fs/btrfs/sysfs.c
 create mode 100644 kernel/fs/btrfs/sysfs.h
 create mode 100644 kernel/fs/btrfs/tests/btrfs-tests.c
 create mode 100644 kernel/fs/btrfs/tests/btrfs-tests.h
 create mode 100644 kernel/fs/btrfs/tests/extent-buffer-tests.c
 create mode 100644 kernel/fs/btrfs/tests/extent-io-tests.c
 create mode 100644 kernel/fs/btrfs/tests/free-space-tests.c
 create mode 100644 kernel/fs/btrfs/tests/inode-tests.c
 create mode 100644 kernel/fs/btrfs/tests/qgroup-tests.c
 create mode 100644 kernel/fs/btrfs/transaction.c
 create mode 100644 kernel/fs/btrfs/transaction.h
 create mode 100644 kernel/fs/btrfs/tree-defrag.c
 create mode 100644 kernel/fs/btrfs/tree-log.c
 create mode 100644 kernel/fs/btrfs/tree-log.h
 create mode 100644 kernel/fs/btrfs/ulist.c
 create mode 100644 kernel/fs/btrfs/ulist.h
 create mode 100644 kernel/fs/btrfs/uuid-tree.c
 create mode 100644 kernel/fs/btrfs/volumes.c
 create mode 100644 kernel/fs/btrfs/volumes.h
 create mode 100644 kernel/fs/btrfs/xattr.c
 create mode 100644 kernel/fs/btrfs/xattr.h
 create mode 100644 kernel/fs/btrfs/zlib.c
 create mode 100644 kernel/fs/buffer.c
 create mode 100644 kernel/fs/cachefiles/Kconfig
 create mode 100644 kernel/fs/cachefiles/Makefile
 create mode 100644 kernel/fs/cachefiles/bind.c
 create mode 100644 kernel/fs/cachefiles/daemon.c
 create mode 100644 kernel/fs/cachefiles/interface.c
 create mode 100644 kernel/fs/cachefiles/internal.h
 create mode 100644 kernel/fs/cachefiles/key.c
 create mode 100644 kernel/fs/cachefiles/main.c
 create mode 100644 kernel/fs/cachefiles/namei.c
 create mode 100644 kernel/fs/cachefiles/proc.c
 create mode 100644 kernel/fs/cachefiles/rdwr.c
 create mode 100644 kernel/fs/cachefiles/security.c
 create mode 100644 kernel/fs/cachefiles/xattr.c
 create mode 100644 kernel/fs/ceph/Kconfig
 create mode 100644 kernel/fs/ceph/Makefile
 create mode 100644 kernel/fs/ceph/acl.c
 create mode 100644 kernel/fs/ceph/addr.c
 create mode 100644 kernel/fs/ceph/cache.c
 create mode 100644 kernel/fs/ceph/cache.h
 create mode 100644 kernel/fs/ceph/caps.c
 create mode 100644 kernel/fs/ceph/ceph_frag.c
 create mode 100644 kernel/fs/ceph/debugfs.c
 create mode 100644 kernel/fs/ceph/dir.c
 create mode 100644 kernel/fs/ceph/export.c
 create mode 100644 kernel/fs/ceph/file.c
 create mode 100644 kernel/fs/ceph/inode.c
 create mode 100644 kernel/fs/ceph/ioctl.c
 create mode 100644 kernel/fs/ceph/ioctl.h
 create mode 100644 kernel/fs/ceph/locks.c
 create mode 100644 kernel/fs/ceph/mds_client.c
 create mode 100644 kernel/fs/ceph/mds_client.h
 create mode 100644 kernel/fs/ceph/mdsmap.c
 create mode 100644 kernel/fs/ceph/snap.c
 create mode 100644 kernel/fs/ceph/strings.c
 create mode 100644 kernel/fs/ceph/super.c
 create mode 100644 kernel/fs/ceph/super.h
 create mode 100644 kernel/fs/ceph/xattr.c
 create mode 100644 kernel/fs/char_dev.c
 create mode 100644 kernel/fs/cifs/Kconfig
 create mode 100644 kernel/fs/cifs/Makefile
 create mode 100644 kernel/fs/cifs/asn1.c
 create mode 100644 kernel/fs/cifs/cache.c
 create mode 100644 kernel/fs/cifs/cifs_debug.c
 create mode 100644 kernel/fs/cifs/cifs_debug.h
 create mode 100644 kernel/fs/cifs/cifs_dfs_ref.c
 create mode 100644 kernel/fs/cifs/cifs_fs_sb.h
 create mode 100644 kernel/fs/cifs/cifs_spnego.c
 create mode 100644 kernel/fs/cifs/cifs_spnego.h
 create mode 100644 kernel/fs/cifs/cifs_unicode.c
 create mode 100644 kernel/fs/cifs/cifs_unicode.h
 create mode 100644 kernel/fs/cifs/cifs_uniupr.h
 create mode 100644 kernel/fs/cifs/cifsacl.c
 create mode 100644 kernel/fs/cifs/cifsacl.h
 create mode 100644 kernel/fs/cifs/cifsencrypt.c
 create mode 100644 kernel/fs/cifs/cifsfs.c
 create mode 100644 kernel/fs/cifs/cifsfs.h
 create mode 100644 kernel/fs/cifs/cifsglob.h
 create mode 100644 kernel/fs/cifs/cifspdu.h
 create mode 100644 kernel/fs/cifs/cifsproto.h
 create mode 100644 kernel/fs/cifs/cifssmb.c
 create mode 100644 kernel/fs/cifs/connect.c
 create mode 100644 kernel/fs/cifs/dir.c
 create mode 100644 kernel/fs/cifs/dns_resolve.c
 create mode 100644 kernel/fs/cifs/dns_resolve.h
 create mode 100644 kernel/fs/cifs/export.c
 create mode 100644 kernel/fs/cifs/file.c
 create mode 100644 kernel/fs/cifs/fscache.c
 create mode 100644 kernel/fs/cifs/fscache.h
 create mode 100644 kernel/fs/cifs/inode.c
 create mode 100644 kernel/fs/cifs/ioctl.c
 create mode 100644 kernel/fs/cifs/link.c
 create mode 100644 kernel/fs/cifs/misc.c
 create mode 100644 kernel/fs/cifs/netmisc.c
 create mode 100644 kernel/fs/cifs/nterr.c
 create mode 100644 kernel/fs/cifs/nterr.h
 create mode 100644 kernel/fs/cifs/ntlmssp.h
 create mode 100644 kernel/fs/cifs/readdir.c
 create mode 100644 kernel/fs/cifs/rfc1002pdu.h
 create mode 100644 kernel/fs/cifs/sess.c
 create mode 100644 kernel/fs/cifs/smb1ops.c
 create mode 100644 kernel/fs/cifs/smb2file.c
 create mode 100644 kernel/fs/cifs/smb2glob.h
 create mode 100644 kernel/fs/cifs/smb2inode.c
 create mode 100644 kernel/fs/cifs/smb2maperror.c
 create mode 100644 kernel/fs/cifs/smb2misc.c
 create mode 100644 kernel/fs/cifs/smb2ops.c
 create mode 100644 kernel/fs/cifs/smb2pdu.c
 create mode 100644 kernel/fs/cifs/smb2pdu.h
 create mode 100644 kernel/fs/cifs/smb2proto.h
 create mode 100644 kernel/fs/cifs/smb2status.h
 create mode 100644 kernel/fs/cifs/smb2transport.c
 create mode 100644 kernel/fs/cifs/smbencrypt.c
 create mode 100644 kernel/fs/cifs/smberr.h
 create mode 100644 kernel/fs/cifs/smbfsctl.h
 create mode 100644 kernel/fs/cifs/transport.c
 create mode 100644 kernel/fs/cifs/winucase.c
 create mode 100644 kernel/fs/cifs/xattr.c
 create mode 100644 kernel/fs/coda/Kconfig
 create mode 100644 kernel/fs/coda/Makefile
 create mode 100644 kernel/fs/coda/cache.c
 create mode 100644 kernel/fs/coda/cnode.c
 create mode 100644 kernel/fs/coda/coda_cache.h
 create mode 100644 kernel/fs/coda/coda_fs_i.h
 create mode 100644 kernel/fs/coda/coda_int.h
 create mode 100644 kernel/fs/coda/coda_linux.c
 create mode 100644 kernel/fs/coda/coda_linux.h
 create mode 100644 kernel/fs/coda/dir.c
 create mode 100644 kernel/fs/coda/file.c
 create mode 100644 kernel/fs/coda/inode.c
 create mode 100644 kernel/fs/coda/pioctl.c
 create mode 100644 kernel/fs/coda/psdev.c
 create mode 100644 kernel/fs/coda/symlink.c
 create mode 100644 kernel/fs/coda/sysctl.c
 create mode 100644 kernel/fs/coda/upcall.c
 create mode 100644 kernel/fs/compat.c
 create mode 100644 kernel/fs/compat_binfmt_elf.c
 create mode 100644 kernel/fs/compat_ioctl.c
 create mode 100644 kernel/fs/configfs/Kconfig
 create mode 100644 kernel/fs/configfs/Makefile
 create mode 100644 kernel/fs/configfs/configfs_internal.h
 create mode 100644 kernel/fs/configfs/dir.c
 create mode 100644 kernel/fs/configfs/file.c
 create mode 100644 kernel/fs/configfs/inode.c
 create mode 100644 kernel/fs/configfs/item.c
 create mode 100644 kernel/fs/configfs/mount.c
 create mode 100644 kernel/fs/configfs/symlink.c
 create mode 100644 kernel/fs/coredump.c
 create mode 100644 kernel/fs/cramfs/Kconfig
 create mode 100644 kernel/fs/cramfs/Makefile
 create mode 100644 kernel/fs/cramfs/README
 create mode 100644 kernel/fs/cramfs/inode.c
 create mode 100644 kernel/fs/cramfs/internal.h
 create mode 100644 kernel/fs/cramfs/uncompress.c
 create mode 100644 kernel/fs/dax.c
 create mode 100644 kernel/fs/dcache.c
 create mode 100644 kernel/fs/dcookies.c
 create mode 100644 kernel/fs/debugfs/Makefile
 create mode 100644 kernel/fs/debugfs/file.c
 create mode 100644 kernel/fs/debugfs/inode.c
 create mode 100644 kernel/fs/devpts/Makefile
 create mode 100644 kernel/fs/devpts/inode.c
 create mode 100644 kernel/fs/direct-io.c
 create mode 100644 kernel/fs/dlm/Kconfig
 create mode 100644 kernel/fs/dlm/Makefile
 create mode 100644 kernel/fs/dlm/ast.c
 create mode 100644 kernel/fs/dlm/ast.h
 create mode 100644 kernel/fs/dlm/config.c
 create mode 100644 kernel/fs/dlm/config.h
 create mode 100644 kernel/fs/dlm/debug_fs.c
 create mode 100644 kernel/fs/dlm/dir.c
 create mode 100644 kernel/fs/dlm/dir.h
 create mode 100644 kernel/fs/dlm/dlm_internal.h
 create mode 100644 kernel/fs/dlm/lock.c
 create mode 100644 kernel/fs/dlm/lock.h
 create mode 100644 kernel/fs/dlm/lockspace.c
 create mode 100644 kernel/fs/dlm/lockspace.h
 create mode 100644 kernel/fs/dlm/lowcomms.c
 create mode 100644 kernel/fs/dlm/lowcomms.h
 create mode 100644 kernel/fs/dlm/lvb_table.h
 create mode 100644 kernel/fs/dlm/main.c
 create mode 100644 kernel/fs/dlm/member.c
 create mode 100644 kernel/fs/dlm/member.h
 create mode 100644 kernel/fs/dlm/memory.c
 create mode 100644 kernel/fs/dlm/memory.h
 create mode 100644 kernel/fs/dlm/midcomms.c
 create mode 100644 kernel/fs/dlm/midcomms.h
 create mode 100644 kernel/fs/dlm/netlink.c
 create mode 100644 kernel/fs/dlm/plock.c
 create mode 100644 kernel/fs/dlm/rcom.c
 create mode 100644 kernel/fs/dlm/rcom.h
 create mode 100644 kernel/fs/dlm/recover.c
 create mode 100644 kernel/fs/dlm/recover.h
 create mode 100644 kernel/fs/dlm/recoverd.c
 create mode 100644 kernel/fs/dlm/recoverd.h
 create mode 100644 kernel/fs/dlm/requestqueue.c
 create mode 100644 kernel/fs/dlm/requestqueue.h
 create mode 100644 kernel/fs/dlm/user.c
 create mode 100644 kernel/fs/dlm/user.h
 create mode 100644 kernel/fs/dlm/util.c
 create mode 100644 kernel/fs/dlm/util.h
 create mode 100644 kernel/fs/drop_caches.c
 create mode 100644 kernel/fs/ecryptfs/Kconfig
 create mode 100644 kernel/fs/ecryptfs/Makefile
 create mode 100644 kernel/fs/ecryptfs/crypto.c
 create mode 100644 kernel/fs/ecryptfs/debug.c
 create mode 100644 kernel/fs/ecryptfs/dentry.c
 create mode 100644 kernel/fs/ecryptfs/ecryptfs_kernel.h
 create mode 100644 kernel/fs/ecryptfs/file.c
 create mode 100644 kernel/fs/ecryptfs/inode.c
 create mode 100644 kernel/fs/ecryptfs/keystore.c
 create mode 100644 kernel/fs/ecryptfs/kthread.c
 create mode 100644 kernel/fs/ecryptfs/main.c
 create mode 100644 kernel/fs/ecryptfs/messaging.c
 create mode 100644 kernel/fs/ecryptfs/miscdev.c
 create mode 100644 kernel/fs/ecryptfs/mmap.c
 create mode 100644 kernel/fs/ecryptfs/read_write.c
 create mode 100644 kernel/fs/ecryptfs/super.c
 create mode 100644 kernel/fs/efivarfs/Kconfig
 create mode 100644 kernel/fs/efivarfs/Makefile
 create mode 100644 kernel/fs/efivarfs/file.c
 create mode 100644 kernel/fs/efivarfs/inode.c
 create mode 100644 kernel/fs/efivarfs/internal.h
 create mode 100644 kernel/fs/efivarfs/super.c
 create mode 100644 kernel/fs/efs/Kconfig
 create mode 100644 kernel/fs/efs/Makefile
 create mode 100644 kernel/fs/efs/dir.c
 create mode 100644 kernel/fs/efs/efs.h
 create mode 100644 kernel/fs/efs/file.c
 create mode 100644 kernel/fs/efs/inode.c
 create mode 100644 kernel/fs/efs/namei.c
 create mode 100644 kernel/fs/efs/super.c
 create mode 100644 kernel/fs/efs/symlink.c
 create mode 100644 kernel/fs/eventfd.c
 create mode 100644 kernel/fs/eventpoll.c
 create mode 100644 kernel/fs/exec.c
 create mode 100644 kernel/fs/exofs/BUGS
 create mode 100644 kernel/fs/exofs/Kbuild
 create mode 100644 kernel/fs/exofs/Kconfig
 create mode 100644 kernel/fs/exofs/Kconfig.ore
 create mode 100644 kernel/fs/exofs/common.h
 create mode 100644 kernel/fs/exofs/dir.c
 create mode 100644 kernel/fs/exofs/exofs.h
 create mode 100644 kernel/fs/exofs/file.c
 create mode 100644 kernel/fs/exofs/inode.c
 create mode 100644 kernel/fs/exofs/namei.c
 create mode 100644 kernel/fs/exofs/ore.c
 create mode 100644 kernel/fs/exofs/ore_raid.c
 create mode 100644 kernel/fs/exofs/ore_raid.h
 create mode 100644 kernel/fs/exofs/super.c
 create mode 100644 kernel/fs/exofs/symlink.c
 create mode 100644 kernel/fs/exofs/sys.c
 create mode 100644 kernel/fs/exportfs/Makefile
 create mode 100644 kernel/fs/exportfs/expfs.c
 create mode 100644 kernel/fs/ext2/Kconfig
 create mode 100644 kernel/fs/ext2/Makefile
 create mode 100644 kernel/fs/ext2/acl.c
 create mode 100644 kernel/fs/ext2/acl.h
 create mode 100644 kernel/fs/ext2/balloc.c
 create mode 100644 kernel/fs/ext2/dir.c
 create mode 100644 kernel/fs/ext2/ext2.h
 create mode 100644 kernel/fs/ext2/file.c
 create mode 100644 kernel/fs/ext2/ialloc.c
 create mode 100644 kernel/fs/ext2/inode.c
 create mode 100644 kernel/fs/ext2/ioctl.c
 create mode 100644 kernel/fs/ext2/namei.c
 create mode 100644 kernel/fs/ext2/super.c
 create mode 100644 kernel/fs/ext2/symlink.c
 create mode 100644 kernel/fs/ext2/xattr.c
 create mode 100644 kernel/fs/ext2/xattr.h
 create mode 100644 kernel/fs/ext2/xattr_security.c
 create mode 100644 kernel/fs/ext2/xattr_trusted.c
 create mode 100644 kernel/fs/ext2/xattr_user.c
 create mode 100644 kernel/fs/ext3/Kconfig
 create mode 100644 kernel/fs/ext3/Makefile
 create mode 100644 kernel/fs/ext3/acl.c
 create mode 100644 kernel/fs/ext3/acl.h
 create mode 100644 kernel/fs/ext3/balloc.c
 create mode 100644 kernel/fs/ext3/bitmap.c
 create mode 100644 kernel/fs/ext3/dir.c
 create mode 100644 kernel/fs/ext3/ext3.h
 create mode 100644 kernel/fs/ext3/ext3_jbd.c
 create mode 100644 kernel/fs/ext3/file.c
 create mode 100644 kernel/fs/ext3/fsync.c
 create mode 100644 kernel/fs/ext3/hash.c
 create mode 100644 kernel/fs/ext3/ialloc.c
 create mode 100644 kernel/fs/ext3/inode.c
 create mode 100644 kernel/fs/ext3/ioctl.c
 create mode 100644 kernel/fs/ext3/namei.c
 create mode 100644 kernel/fs/ext3/namei.h
 create mode 100644 kernel/fs/ext3/resize.c
 create mode 100644 kernel/fs/ext3/super.c
 create mode 100644 kernel/fs/ext3/symlink.c
 create mode 100644 kernel/fs/ext3/xattr.c
 create mode 100644 kernel/fs/ext3/xattr.h
 create mode 100644 kernel/fs/ext3/xattr_security.c
 create mode 100644 kernel/fs/ext3/xattr_trusted.c
 create mode 100644 kernel/fs/ext3/xattr_user.c
 create mode 100644 kernel/fs/ext4/Kconfig
 create mode 100644 kernel/fs/ext4/Makefile
 create mode 100644 kernel/fs/ext4/acl.c
 create mode 100644 kernel/fs/ext4/acl.h
 create mode 100644 kernel/fs/ext4/balloc.c
 create mode 100644 kernel/fs/ext4/bitmap.c
 create mode 100644 kernel/fs/ext4/block_validity.c
 create mode 100644 kernel/fs/ext4/crypto.c
 create mode 100644 kernel/fs/ext4/crypto_fname.c
 create mode 100644 kernel/fs/ext4/crypto_key.c
 create mode 100644 kernel/fs/ext4/crypto_policy.c
 create mode 100644 kernel/fs/ext4/dir.c
 create mode 100644 kernel/fs/ext4/ext4.h
 create mode 100644 kernel/fs/ext4/ext4_crypto.h
 create mode 100644 kernel/fs/ext4/ext4_extents.h
 create mode 100644 kernel/fs/ext4/ext4_jbd2.c
 create mode 100644 kernel/fs/ext4/ext4_jbd2.h
 create mode 100644 kernel/fs/ext4/extents.c
 create mode 100644 kernel/fs/ext4/extents_status.c
 create mode 100644 kernel/fs/ext4/extents_status.h
 create mode 100644 kernel/fs/ext4/file.c
 create mode 100644 kernel/fs/ext4/fsync.c
 create mode 100644 kernel/fs/ext4/hash.c
 create mode 100644 kernel/fs/ext4/ialloc.c
 create mode 100644 kernel/fs/ext4/indirect.c
 create mode 100644 kernel/fs/ext4/inline.c
 create mode 100644 kernel/fs/ext4/inode.c
 create mode 100644 kernel/fs/ext4/ioctl.c
 create mode 100644 kernel/fs/ext4/mballoc.c
 create mode 100644 kernel/fs/ext4/mballoc.h
 create mode 100644 kernel/fs/ext4/migrate.c
 create mode 100644 kernel/fs/ext4/mmp.c
 create mode 100644 kernel/fs/ext4/move_extent.c
 create mode 100644 kernel/fs/ext4/namei.c
 create mode 100644 kernel/fs/ext4/page-io.c
 create mode 100644 kernel/fs/ext4/readpage.c
 create mode 100644 kernel/fs/ext4/resize.c
 create mode 100644 kernel/fs/ext4/super.c
 create mode 100644 kernel/fs/ext4/symlink.c
 create mode 100644 kernel/fs/ext4/truncate.h
 create mode 100644 kernel/fs/ext4/xattr.c
 create mode 100644 kernel/fs/ext4/xattr.h
 create mode 100644 kernel/fs/ext4/xattr_security.c
 create mode 100644 kernel/fs/ext4/xattr_trusted.c
 create mode 100644 kernel/fs/ext4/xattr_user.c
 create mode 100644 kernel/fs/f2fs/Kconfig
 create mode 100644 kernel/fs/f2fs/Makefile
 create mode 100644 kernel/fs/f2fs/acl.c
 create mode 100644 kernel/fs/f2fs/acl.h
 create mode 100644 kernel/fs/f2fs/checkpoint.c
 create mode 100644 kernel/fs/f2fs/data.c
 create mode 100644 kernel/fs/f2fs/debug.c
 create mode 100644 kernel/fs/f2fs/dir.c
 create mode 100644 kernel/fs/f2fs/f2fs.h
 create mode 100644 kernel/fs/f2fs/file.c
 create mode 100644 kernel/fs/f2fs/gc.c
 create mode 100644 kernel/fs/f2fs/gc.h
 create mode 100644 kernel/fs/f2fs/hash.c
 create mode 100644 kernel/fs/f2fs/inline.c
 create mode 100644 kernel/fs/f2fs/inode.c
 create mode 100644 kernel/fs/f2fs/namei.c
 create mode 100644 kernel/fs/f2fs/node.c
 create mode 100644 kernel/fs/f2fs/node.h
 create mode 100644 kernel/fs/f2fs/recovery.c
 create mode 100644 kernel/fs/f2fs/segment.c
 create mode 100644 kernel/fs/f2fs/segment.h
 create mode 100644 kernel/fs/f2fs/super.c
 create mode 100644 kernel/fs/f2fs/trace.c
 create mode 100644 kernel/fs/f2fs/trace.h
 create mode 100644 kernel/fs/f2fs/xattr.c
 create mode 100644 kernel/fs/f2fs/xattr.h
 create mode 100644 kernel/fs/fat/Kconfig
 create mode 100644 kernel/fs/fat/Makefile
 create mode 100644 kernel/fs/fat/cache.c
 create mode 100644 kernel/fs/fat/dir.c
 create mode 100644 kernel/fs/fat/fat.h
 create mode 100644 kernel/fs/fat/fatent.c
 create mode 100644 kernel/fs/fat/file.c
 create mode 100644 kernel/fs/fat/inode.c
 create mode 100644 kernel/fs/fat/misc.c
 create mode 100644 kernel/fs/fat/namei_msdos.c
 create mode 100644 kernel/fs/fat/namei_vfat.c
 create mode 100644 kernel/fs/fat/nfs.c
 create mode 100644 kernel/fs/fcntl.c
 create mode 100644 kernel/fs/fhandle.c
 create mode 100644 kernel/fs/file.c
 create mode 100644 kernel/fs/file_table.c
 create mode 100644 kernel/fs/filesystems.c
 create mode 100644 kernel/fs/freevxfs/Kconfig
 create mode 100644 kernel/fs/freevxfs/Makefile
 create mode 100644 kernel/fs/freevxfs/vxfs.h
 create mode 100644 kernel/fs/freevxfs/vxfs_bmap.c
 create mode 100644 kernel/fs/freevxfs/vxfs_dir.h
 create mode 100644 kernel/fs/freevxfs/vxfs_extern.h
 create mode 100644 kernel/fs/freevxfs/vxfs_fshead.c
 create mode 100644 kernel/fs/freevxfs/vxfs_fshead.h
 create mode 100644 kernel/fs/freevxfs/vxfs_immed.c
 create mode 100644 kernel/fs/freevxfs/vxfs_inode.c
 create mode 100644 kernel/fs/freevxfs/vxfs_inode.h
 create mode 100644 kernel/fs/freevxfs/vxfs_lookup.c
 create mode 100644 kernel/fs/freevxfs/vxfs_olt.c
 create mode 100644 kernel/fs/freevxfs/vxfs_olt.h
 create mode 100644 kernel/fs/freevxfs/vxfs_subr.c
 create mode 100644 kernel/fs/freevxfs/vxfs_super.c
 create mode 100644 kernel/fs/fs-writeback.c
 create mode 100644 kernel/fs/fs_pin.c
 create mode 100644 kernel/fs/fs_struct.c
 create mode 100644 kernel/fs/fscache/Kconfig
 create mode 100644 kernel/fs/fscache/Makefile
 create mode 100644 kernel/fs/fscache/cache.c
 create mode 100644 kernel/fs/fscache/cookie.c
 create mode 100644 kernel/fs/fscache/fsdef.c
 create mode 100644 kernel/fs/fscache/histogram.c
 create mode 100644 kernel/fs/fscache/internal.h
 create mode 100644 kernel/fs/fscache/main.c
 create mode 100644 kernel/fs/fscache/netfs.c
 create mode 100644 kernel/fs/fscache/object-list.c
 create mode 100644 kernel/fs/fscache/object.c
 create mode 100644 kernel/fs/fscache/operation.c
 create mode 100644 kernel/fs/fscache/page.c
 create mode 100644 kernel/fs/fscache/proc.c
 create mode 100644 kernel/fs/fscache/stats.c
 create mode 100644 kernel/fs/fuse/Kconfig
 create mode 100644 kernel/fs/fuse/Makefile
 create mode 100644 kernel/fs/fuse/control.c
 create mode 100644 kernel/fs/fuse/cuse.c
 create mode 100644 kernel/fs/fuse/dev.c
 create mode 100644 kernel/fs/fuse/dir.c
 create mode 100644 kernel/fs/fuse/file.c
 create mode 100644 kernel/fs/fuse/fuse_i.h
 create mode 100644 kernel/fs/fuse/inode.c
 create mode 100644 kernel/fs/gfs2/Kconfig
 create mode 100644 kernel/fs/gfs2/Makefile
 create mode 100644 kernel/fs/gfs2/acl.c
 create mode 100644 kernel/fs/gfs2/acl.h
 create mode 100644 kernel/fs/gfs2/aops.c
 create mode 100644 kernel/fs/gfs2/bmap.c
 create mode 100644 kernel/fs/gfs2/bmap.h
 create mode 100644 kernel/fs/gfs2/dentry.c
 create mode 100644 kernel/fs/gfs2/dir.c
 create mode 100644 kernel/fs/gfs2/dir.h
 create mode 100644 kernel/fs/gfs2/export.c
 create mode 100644 kernel/fs/gfs2/file.c
 create mode 100644 kernel/fs/gfs2/gfs2.h
 create mode 100644 kernel/fs/gfs2/glock.c
 create mode 100644 kernel/fs/gfs2/glock.h
 create mode 100644 kernel/fs/gfs2/glops.c
 create mode 100644 kernel/fs/gfs2/glops.h
 create mode 100644 kernel/fs/gfs2/incore.h
 create mode 100644 kernel/fs/gfs2/inode.c
 create mode 100644 kernel/fs/gfs2/inode.h
 create mode 100644 kernel/fs/gfs2/lock_dlm.c
 create mode 100644 kernel/fs/gfs2/log.c
 create mode 100644 kernel/fs/gfs2/log.h
 create mode 100644 kernel/fs/gfs2/lops.c
 create mode 100644 kernel/fs/gfs2/lops.h
 create mode 100644 kernel/fs/gfs2/main.c
 create mode 100644 kernel/fs/gfs2/meta_io.c
 create mode 100644 kernel/fs/gfs2/meta_io.h
 create mode 100644 kernel/fs/gfs2/ops_fstype.c
 create mode 100644 kernel/fs/gfs2/quota.c
 create mode 100644 kernel/fs/gfs2/quota.h
 create mode 100644 kernel/fs/gfs2/recovery.c
 create mode 100644 kernel/fs/gfs2/recovery.h
 create mode 100644 kernel/fs/gfs2/rgrp.c
 create mode 100644 kernel/fs/gfs2/rgrp.h
 create mode 100644 kernel/fs/gfs2/super.c
 create mode 100644 kernel/fs/gfs2/super.h
 create mode 100644 kernel/fs/gfs2/sys.c
 create mode 100644 kernel/fs/gfs2/sys.h
 create mode 100644 kernel/fs/gfs2/trace_gfs2.h
 create mode 100644 kernel/fs/gfs2/trans.c
 create mode 100644 kernel/fs/gfs2/trans.h
 create mode 100644 kernel/fs/gfs2/util.c
 create mode 100644 kernel/fs/gfs2/util.h
 create mode 100644 kernel/fs/gfs2/xattr.c
 create mode 100644 kernel/fs/gfs2/xattr.h
 create mode 100644 kernel/fs/hfs/Kconfig
 create mode 100644 kernel/fs/hfs/Makefile
 create mode 100644 kernel/fs/hfs/attr.c
 create mode 100644 kernel/fs/hfs/bfind.c
 create mode 100644 kernel/fs/hfs/bitmap.c
 create mode 100644 kernel/fs/hfs/bnode.c
 create mode 100644 kernel/fs/hfs/brec.c
 create mode 100644 kernel/fs/hfs/btree.c
 create mode 100644 kernel/fs/hfs/btree.h
 create mode 100644 kernel/fs/hfs/catalog.c
 create mode 100644 kernel/fs/hfs/dir.c
 create mode 100644 kernel/fs/hfs/extent.c
 create mode 100644 kernel/fs/hfs/hfs.h
 create mode 100644 kernel/fs/hfs/hfs_fs.h
 create mode 100644 kernel/fs/hfs/inode.c
 create mode 100644 kernel/fs/hfs/mdb.c
 create mode 100644 kernel/fs/hfs/part_tbl.c
 create mode 100644 kernel/fs/hfs/string.c
 create mode 100644 kernel/fs/hfs/super.c
 create mode 100644 kernel/fs/hfs/sysdep.c
 create mode 100644 kernel/fs/hfs/trans.c
 create mode 100644 kernel/fs/hfsplus/Kconfig
 create mode 100644 kernel/fs/hfsplus/Makefile
 create mode 100644 kernel/fs/hfsplus/acl.h
 create mode 100644 kernel/fs/hfsplus/attributes.c
 create mode 100644 kernel/fs/hfsplus/bfind.c
 create mode 100644 kernel/fs/hfsplus/bitmap.c
 create mode 100644 kernel/fs/hfsplus/bnode.c
 create mode 100644 kernel/fs/hfsplus/brec.c
 create mode 100644 kernel/fs/hfsplus/btree.c
 create mode 100644 kernel/fs/hfsplus/catalog.c
 create mode 100644 kernel/fs/hfsplus/dir.c
 create mode 100644 kernel/fs/hfsplus/extents.c
 create mode 100644 kernel/fs/hfsplus/hfsplus_fs.h
 create mode 100644 kernel/fs/hfsplus/hfsplus_raw.h
 create mode 100644 kernel/fs/hfsplus/inode.c
 create mode 100644 kernel/fs/hfsplus/ioctl.c
 create mode 100644 kernel/fs/hfsplus/options.c
 create mode 100644 kernel/fs/hfsplus/part_tbl.c
 create mode 100644 kernel/fs/hfsplus/posix_acl.c
 create mode 100644 kernel/fs/hfsplus/super.c
 create mode 100644 kernel/fs/hfsplus/tables.c
 create mode 100644 kernel/fs/hfsplus/unicode.c
 create mode 100644 kernel/fs/hfsplus/wrapper.c
 create mode 100644 kernel/fs/hfsplus/xattr.c
 create mode 100644 kernel/fs/hfsplus/xattr.h
 create mode 100644 kernel/fs/hfsplus/xattr_security.c
 create mode 100644 kernel/fs/hfsplus/xattr_trusted.c
 create mode 100644 kernel/fs/hfsplus/xattr_user.c
 create mode 100644 kernel/fs/hostfs/Makefile
 create mode 100644 kernel/fs/hostfs/hostfs.h
 create mode 100644 kernel/fs/hostfs/hostfs_kern.c
 create mode 100644 kernel/fs/hostfs/hostfs_user.c
 create mode 100644 kernel/fs/hpfs/Kconfig
 create mode 100644 kernel/fs/hpfs/Makefile
 create mode 100644 kernel/fs/hpfs/alloc.c
 create mode 100644 kernel/fs/hpfs/anode.c
 create mode 100644 kernel/fs/hpfs/buffer.c
 create mode 100644 kernel/fs/hpfs/dentry.c
 create mode 100644 kernel/fs/hpfs/dir.c
 create mode 100644 kernel/fs/hpfs/dnode.c
 create mode 100644 kernel/fs/hpfs/ea.c
 create mode 100644 kernel/fs/hpfs/file.c
 create mode 100644 kernel/fs/hpfs/hpfs.h
 create mode 100644 kernel/fs/hpfs/hpfs_fn.h
 create mode 100644 kernel/fs/hpfs/inode.c
 create mode 100644 kernel/fs/hpfs/map.c
 create mode 100644 kernel/fs/hpfs/name.c
 create mode 100644 kernel/fs/hpfs/namei.c
 create mode 100644 kernel/fs/hpfs/super.c
 create mode 100644 kernel/fs/hppfs/Makefile
 create mode 100644 kernel/fs/hppfs/hppfs.c
 create mode 100644 kernel/fs/hugetlbfs/Makefile
 create mode 100644 kernel/fs/hugetlbfs/inode.c
 create mode 100644 kernel/fs/inode.c
 create mode 100644 kernel/fs/internal.h
 create mode 100644 kernel/fs/ioctl.c
 create mode 100644 kernel/fs/isofs/Kconfig
 create mode 100644 kernel/fs/isofs/Makefile
 create mode 100644 kernel/fs/isofs/compress.c
 create mode 100644 kernel/fs/isofs/dir.c
 create mode 100644 kernel/fs/isofs/export.c
 create mode 100644 kernel/fs/isofs/inode.c
 create mode 100644 kernel/fs/isofs/isofs.h
 create mode 100644 kernel/fs/isofs/joliet.c
 create mode 100644 kernel/fs/isofs/namei.c
 create mode 100644 kernel/fs/isofs/rock.c
 create mode 100644 kernel/fs/isofs/rock.h
 create mode 100644 kernel/fs/isofs/util.c
 create mode 100644 kernel/fs/isofs/zisofs.h
 create mode 100644 kernel/fs/jbd/Kconfig
 create mode 100644 kernel/fs/jbd/Makefile
 create mode 100644 kernel/fs/jbd/checkpoint.c
 create mode 100644 kernel/fs/jbd/commit.c
 create mode 100644 kernel/fs/jbd/journal.c
 create mode 100644 kernel/fs/jbd/recovery.c
 create mode 100644 kernel/fs/jbd/revoke.c
 create mode 100644 kernel/fs/jbd/transaction.c
 create mode 100644 kernel/fs/jbd2/Kconfig
 create mode 100644 kernel/fs/jbd2/Makefile
 create mode 100644 kernel/fs/jbd2/checkpoint.c
 create mode 100644 kernel/fs/jbd2/commit.c
 create mode 100644 kernel/fs/jbd2/journal.c
 create mode 100644 kernel/fs/jbd2/recovery.c
 create mode 100644 kernel/fs/jbd2/revoke.c
 create mode 100644 kernel/fs/jbd2/transaction.c
 create mode 100644 kernel/fs/jffs2/Kconfig
 create mode 100644 kernel/fs/jffs2/LICENCE
 create mode 100644 kernel/fs/jffs2/Makefile
 create mode 100644 kernel/fs/jffs2/README.Locking
 create mode 100644 kernel/fs/jffs2/TODO
 create mode 100644 kernel/fs/jffs2/acl.c
 create mode 100644 kernel/fs/jffs2/acl.h
 create mode 100644 kernel/fs/jffs2/background.c
 create mode 100644 kernel/fs/jffs2/build.c
 create mode 100644 kernel/fs/jffs2/compr.c
 create mode 100644 kernel/fs/jffs2/compr.h
 create mode 100644 kernel/fs/jffs2/compr_lzo.c
 create mode 100644 kernel/fs/jffs2/compr_rtime.c
 create mode 100644 kernel/fs/jffs2/compr_rubin.c
 create mode 100644 kernel/fs/jffs2/compr_zlib.c
 create mode 100644 kernel/fs/jffs2/debug.c
 create mode 100644 kernel/fs/jffs2/debug.h
 create mode 100644 kernel/fs/jffs2/dir.c
 create mode 100644 kernel/fs/jffs2/erase.c
 create mode 100644 kernel/fs/jffs2/file.c
 create mode 100644 kernel/fs/jffs2/fs.c
 create mode 100644 kernel/fs/jffs2/gc.c
 create mode 100644 kernel/fs/jffs2/ioctl.c
 create mode 100644 kernel/fs/jffs2/jffs2_fs_i.h
 create mode 100644 kernel/fs/jffs2/jffs2_fs_sb.h
 create mode 100644 kernel/fs/jffs2/malloc.c
 create mode 100644 kernel/fs/jffs2/nodelist.c
 create mode 100644 kernel/fs/jffs2/nodelist.h
 create mode 100644 kernel/fs/jffs2/nodemgmt.c
 create mode 100644 kernel/fs/jffs2/os-linux.h
 create mode 100644 kernel/fs/jffs2/read.c
 create mode 100644 kernel/fs/jffs2/readinode.c
 create mode 100644 kernel/fs/jffs2/scan.c
 create mode 100644 kernel/fs/jffs2/security.c
 create mode 100644 kernel/fs/jffs2/summary.c
 create mode 100644 kernel/fs/jffs2/summary.h
 create mode 100644 kernel/fs/jffs2/super.c
 create mode 100644 kernel/fs/jffs2/symlink.c
 create mode 100644 kernel/fs/jffs2/wbuf.c
 create mode 100644 kernel/fs/jffs2/write.c
 create mode 100644 kernel/fs/jffs2/writev.c
 create mode 100644 kernel/fs/jffs2/xattr.c
 create mode 100644 kernel/fs/jffs2/xattr.h
 create mode 100644 kernel/fs/jffs2/xattr_trusted.c
 create mode 100644 kernel/fs/jffs2/xattr_user.c
 create mode 100644 kernel/fs/jfs/Kconfig
 create mode 100644 kernel/fs/jfs/Makefile
 create mode 100644 kernel/fs/jfs/acl.c
 create mode 100644 kernel/fs/jfs/file.c
 create mode 100644 kernel/fs/jfs/inode.c
 create mode 100644 kernel/fs/jfs/ioctl.c
 create mode 100644 kernel/fs/jfs/jfs_acl.h
 create mode 100644 kernel/fs/jfs/jfs_btree.h
 create mode 100644 kernel/fs/jfs/jfs_debug.c
 create mode 100644 kernel/fs/jfs/jfs_debug.h
 create mode 100644 kernel/fs/jfs/jfs_dinode.h
 create mode 100644 kernel/fs/jfs/jfs_discard.c
 create mode 100644 kernel/fs/jfs/jfs_discard.h
 create mode 100644 kernel/fs/jfs/jfs_dmap.c
 create mode 100644 kernel/fs/jfs/jfs_dmap.h
 create mode 100644 kernel/fs/jfs/jfs_dtree.c
 create mode 100644 kernel/fs/jfs/jfs_dtree.h
 create mode 100644 kernel/fs/jfs/jfs_extent.c
 create mode 100644 kernel/fs/jfs/jfs_extent.h
 create mode 100644 kernel/fs/jfs/jfs_filsys.h
 create mode 100644 kernel/fs/jfs/jfs_imap.c
 create mode 100644 kernel/fs/jfs/jfs_imap.h
 create mode 100644 kernel/fs/jfs/jfs_incore.h
 create mode 100644 kernel/fs/jfs/jfs_inode.c
 create mode 100644 kernel/fs/jfs/jfs_inode.h
 create mode 100644 kernel/fs/jfs/jfs_lock.h
 create mode 100644 kernel/fs/jfs/jfs_logmgr.c
 create mode 100644 kernel/fs/jfs/jfs_logmgr.h
 create mode 100644 kernel/fs/jfs/jfs_metapage.c
 create mode 100644 kernel/fs/jfs/jfs_metapage.h
 create mode 100644 kernel/fs/jfs/jfs_mount.c
 create mode 100644 kernel/fs/jfs/jfs_superblock.h
 create mode 100644 kernel/fs/jfs/jfs_txnmgr.c
 create mode 100644 kernel/fs/jfs/jfs_txnmgr.h
 create mode 100644 kernel/fs/jfs/jfs_types.h
 create mode 100644 kernel/fs/jfs/jfs_umount.c
 create mode 100644 kernel/fs/jfs/jfs_unicode.c
 create mode 100644 kernel/fs/jfs/jfs_unicode.h
 create mode 100644 kernel/fs/jfs/jfs_uniupr.c
 create mode 100644 kernel/fs/jfs/jfs_xattr.h
 create mode 100644 kernel/fs/jfs/jfs_xtree.c
 create mode 100644 kernel/fs/jfs/jfs_xtree.h
 create mode 100644 kernel/fs/jfs/namei.c
 create mode 100644 kernel/fs/jfs/resize.c
 create mode 100644 kernel/fs/jfs/super.c
 create mode 100644 kernel/fs/jfs/symlink.c
 create mode 100644 kernel/fs/jfs/xattr.c
 create mode 100644 kernel/fs/kernfs/Kconfig
 create mode 100644 kernel/fs/kernfs/Makefile
 create mode 100644 kernel/fs/kernfs/dir.c
 create mode 100644 kernel/fs/kernfs/file.c
 create mode 100644 kernel/fs/kernfs/inode.c
 create mode 100644 kernel/fs/kernfs/kernfs-internal.h
 create mode 100644 kernel/fs/kernfs/mount.c
 create mode 100644 kernel/fs/kernfs/symlink.c
 create mode 100644 kernel/fs/libfs.c
 create mode 100644 kernel/fs/lockd/Makefile
 create mode 100644 kernel/fs/lockd/clnt4xdr.c
 create mode 100644 kernel/fs/lockd/clntlock.c
 create mode 100644 kernel/fs/lockd/clntproc.c
 create mode 100644 kernel/fs/lockd/clntxdr.c
 create mode 100644 kernel/fs/lockd/host.c
 create mode 100644 kernel/fs/lockd/mon.c
 create mode 100644 kernel/fs/lockd/netns.h
 create mode 100644 kernel/fs/lockd/procfs.c
 create mode 100644 kernel/fs/lockd/procfs.h
 create mode 100644 kernel/fs/lockd/svc.c
 create mode 100644 kernel/fs/lockd/svc4proc.c
 create mode 100644 kernel/fs/lockd/svclock.c
 create mode 100644 kernel/fs/lockd/svcproc.c
 create mode 100644 kernel/fs/lockd/svcshare.c
 create mode 100644 kernel/fs/lockd/svcsubs.c
 create mode 100644 kernel/fs/lockd/xdr.c
 create mode 100644 kernel/fs/lockd/xdr4.c
 create mode 100644 kernel/fs/locks.c
 create mode 100644 kernel/fs/logfs/Kconfig
 create mode 100644 kernel/fs/logfs/Makefile
 create mode 100644 kernel/fs/logfs/compr.c
 create mode 100644 kernel/fs/logfs/dev_bdev.c
 create mode 100644 kernel/fs/logfs/dev_mtd.c
 create mode 100644 kernel/fs/logfs/dir.c
 create mode 100644 kernel/fs/logfs/file.c
 create mode 100644 kernel/fs/logfs/gc.c
 create mode 100644 kernel/fs/logfs/inode.c
 create mode 100644 kernel/fs/logfs/journal.c
 create mode 100644 kernel/fs/logfs/logfs.h
 create mode 100644 kernel/fs/logfs/logfs_abi.h
 create mode 100644 kernel/fs/logfs/readwrite.c
 create mode 100644 kernel/fs/logfs/segment.c
 create mode 100644 kernel/fs/logfs/super.c
 create mode 100644 kernel/fs/mbcache.c
 create mode 100644 kernel/fs/minix/Kconfig
 create mode 100644 kernel/fs/minix/Makefile
 create mode 100644 kernel/fs/minix/bitmap.c
 create mode 100644 kernel/fs/minix/dir.c
 create mode 100644 kernel/fs/minix/file.c
 create mode 100644 kernel/fs/minix/inode.c
 create mode 100644 kernel/fs/minix/itree_common.c
 create mode 100644 kernel/fs/minix/itree_v1.c
 create mode 100644 kernel/fs/minix/itree_v2.c
 create mode 100644 kernel/fs/minix/minix.h
 create mode 100644 kernel/fs/minix/namei.c
 create mode 100644 kernel/fs/mount.h
 create mode 100644 kernel/fs/mpage.c
 create mode 100644 kernel/fs/namei.c
 create mode 100644 kernel/fs/namespace.c
 create mode 100644 kernel/fs/ncpfs/Kconfig
 create mode 100644 kernel/fs/ncpfs/Makefile
 create mode 100644 kernel/fs/ncpfs/dir.c
 create mode 100644 kernel/fs/ncpfs/file.c
 create mode 100644 kernel/fs/ncpfs/getopt.c
 create mode 100644 kernel/fs/ncpfs/getopt.h
 create mode 100644 kernel/fs/ncpfs/inode.c
 create mode 100644 kernel/fs/ncpfs/ioctl.c
 create mode 100644 kernel/fs/ncpfs/mmap.c
 create mode 100644 kernel/fs/ncpfs/ncp_fs.h
 create mode 100644 kernel/fs/ncpfs/ncp_fs_i.h
 create mode 100644 kernel/fs/ncpfs/ncp_fs_sb.h
 create mode 100644 kernel/fs/ncpfs/ncplib_kernel.c
 create mode 100644 kernel/fs/ncpfs/ncplib_kernel.h
 create mode 100644 kernel/fs/ncpfs/ncpsign_kernel.c
 create mode 100644 kernel/fs/ncpfs/ncpsign_kernel.h
 create mode 100644 kernel/fs/ncpfs/sock.c
 create mode 100644 kernel/fs/ncpfs/symlink.c
 create mode 100644 kernel/fs/nfs/Kconfig
 create mode 100644 kernel/fs/nfs/Makefile
 create mode 100644 kernel/fs/nfs/blocklayout/Makefile
 create mode 100644 kernel/fs/nfs/blocklayout/blocklayout.c
 create mode 100644 kernel/fs/nfs/blocklayout/blocklayout.h
 create mode 100644 kernel/fs/nfs/blocklayout/dev.c
 create mode 100644 kernel/fs/nfs/blocklayout/extent_tree.c
 create mode 100644 kernel/fs/nfs/blocklayout/rpc_pipefs.c
 create mode 100644 kernel/fs/nfs/cache_lib.c
 create mode 100644 kernel/fs/nfs/cache_lib.h
 create mode 100644 kernel/fs/nfs/callback.c
 create mode 100644 kernel/fs/nfs/callback.h
 create mode 100644 kernel/fs/nfs/callback_proc.c
 create mode 100644 kernel/fs/nfs/callback_xdr.c
 create mode 100644 kernel/fs/nfs/client.c
 create mode 100644 kernel/fs/nfs/delegation.c
 create mode 100644 kernel/fs/nfs/delegation.h
 create mode 100644 kernel/fs/nfs/dir.c
 create mode 100644 kernel/fs/nfs/direct.c
 create mode 100644 kernel/fs/nfs/dns_resolve.c
 create mode 100644 kernel/fs/nfs/dns_resolve.h
 create mode 100644 kernel/fs/nfs/file.c
 create mode 100644 kernel/fs/nfs/filelayout/Makefile
 create mode 100644 kernel/fs/nfs/filelayout/filelayout.c
 create mode 100644 kernel/fs/nfs/filelayout/filelayout.h
 create mode 100644 kernel/fs/nfs/filelayout/filelayoutdev.c
 create mode 100644 kernel/fs/nfs/flexfilelayout/Makefile
 create mode 100644 kernel/fs/nfs/flexfilelayout/flexfilelayout.c
 create mode 100644 kernel/fs/nfs/flexfilelayout/flexfilelayout.h
 create mode 100644 kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c
 create mode 100644 kernel/fs/nfs/fscache-index.c
 create mode 100644 kernel/fs/nfs/fscache.c
 create mode 100644 kernel/fs/nfs/fscache.h
 create mode 100644 kernel/fs/nfs/getroot.c
 create mode 100644 kernel/fs/nfs/inode.c
 create mode 100644 kernel/fs/nfs/internal.h
 create mode 100644 kernel/fs/nfs/iostat.h
 create mode 100644 kernel/fs/nfs/mount_clnt.c
 create mode 100644 kernel/fs/nfs/namespace.c
 create mode 100644 kernel/fs/nfs/netns.h
 create mode 100644 kernel/fs/nfs/nfs.h
 create mode 100644 kernel/fs/nfs/nfs2super.c
 create mode 100644 kernel/fs/nfs/nfs2xdr.c
 create mode 100644 kernel/fs/nfs/nfs3_fs.h
 create mode 100644 kernel/fs/nfs/nfs3acl.c
 create mode 100644 kernel/fs/nfs/nfs3client.c
 create mode 100644 kernel/fs/nfs/nfs3proc.c
 create mode 100644 kernel/fs/nfs/nfs3super.c
 create mode 100644 kernel/fs/nfs/nfs3xdr.c
 create mode 100644 kernel/fs/nfs/nfs42.h
 create mode 100644 kernel/fs/nfs/nfs42proc.c
 create mode 100644 kernel/fs/nfs/nfs42xdr.c
 create mode 100644 kernel/fs/nfs/nfs4_fs.h
 create mode 100644 kernel/fs/nfs/nfs4client.c
 create mode 100644 kernel/fs/nfs/nfs4file.c
 create mode 100644 kernel/fs/nfs/nfs4getroot.c
 create mode 100644 kernel/fs/nfs/nfs4idmap.c
 create mode 100644 kernel/fs/nfs/nfs4idmap.h
 create mode 100644 kernel/fs/nfs/nfs4namespace.c
 create mode 100644 kernel/fs/nfs/nfs4proc.c
 create mode 100644 kernel/fs/nfs/nfs4renewd.c
 create mode 100644 kernel/fs/nfs/nfs4session.c
 create mode 100644 kernel/fs/nfs/nfs4session.h
 create mode 100644 kernel/fs/nfs/nfs4state.c
 create mode 100644 kernel/fs/nfs/nfs4super.c
 create mode 100644 kernel/fs/nfs/nfs4sysctl.c
 create mode 100644 kernel/fs/nfs/nfs4trace.c
 create mode 100644 kernel/fs/nfs/nfs4trace.h
 create mode 100644 kernel/fs/nfs/nfs4xdr.c
 create mode 100644 kernel/fs/nfs/nfsroot.c
 create mode 100644 kernel/fs/nfs/nfstrace.c
 create mode 100644 kernel/fs/nfs/nfstrace.h
 create mode 100644 kernel/fs/nfs/objlayout/Kbuild
 create mode 100644 kernel/fs/nfs/objlayout/objio_osd.c
 create mode 100644 kernel/fs/nfs/objlayout/objlayout.c
 create mode 100644 kernel/fs/nfs/objlayout/objlayout.h
 create mode 100644 kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
 create mode 100644 kernel/fs/nfs/pagelist.c
 create mode 100644 kernel/fs/nfs/pnfs.c
 create mode 100644 kernel/fs/nfs/pnfs.h
 create mode 100644 kernel/fs/nfs/pnfs_dev.c
 create mode 100644 kernel/fs/nfs/pnfs_nfs.c
 create mode 100644 kernel/fs/nfs/proc.c
 create mode 100644 kernel/fs/nfs/read.c
 create mode 100644 kernel/fs/nfs/super.c
 create mode 100644 kernel/fs/nfs/symlink.c
 create mode 100644 kernel/fs/nfs/sysctl.c
 create mode 100644 kernel/fs/nfs/unlink.c
 create mode 100644 kernel/fs/nfs/write.c
 create mode 100644 kernel/fs/nfs_common/Makefile
 create mode 100644 kernel/fs/nfs_common/grace.c
 create mode 100644 kernel/fs/nfs_common/nfsacl.c
 create mode 100644 kernel/fs/nfsd/Kconfig
 create mode 100644 kernel/fs/nfsd/Makefile
 create mode 100644 kernel/fs/nfsd/acl.h
 create mode 100644 kernel/fs/nfsd/auth.c
 create mode 100644 kernel/fs/nfsd/auth.h
 create mode 100644 kernel/fs/nfsd/blocklayout.c
 create mode 100644 kernel/fs/nfsd/blocklayoutxdr.c
 create mode 100644 kernel/fs/nfsd/blocklayoutxdr.h
 create mode 100644 kernel/fs/nfsd/cache.h
 create mode 100644 kernel/fs/nfsd/current_stateid.h
 create mode 100644 kernel/fs/nfsd/export.c
 create mode 100644 kernel/fs/nfsd/export.h
 create mode 100644 kernel/fs/nfsd/fault_inject.c
 create mode 100644 kernel/fs/nfsd/idmap.h
 create mode 100644 kernel/fs/nfsd/lockd.c
 create mode 100644 kernel/fs/nfsd/netns.h
 create mode 100644 kernel/fs/nfsd/nfs2acl.c
 create mode 100644 kernel/fs/nfsd/nfs3acl.c
 create mode 100644 kernel/fs/nfsd/nfs3proc.c
 create mode 100644 kernel/fs/nfsd/nfs3xdr.c
 create mode 100644 kernel/fs/nfsd/nfs4acl.c
 create mode 100644 kernel/fs/nfsd/nfs4callback.c
 create mode 100644 kernel/fs/nfsd/nfs4idmap.c
 create mode 100644 kernel/fs/nfsd/nfs4layouts.c
 create mode 100644 kernel/fs/nfsd/nfs4proc.c
 create mode 100644 kernel/fs/nfsd/nfs4recover.c
 create mode 100644 kernel/fs/nfsd/nfs4state.c
 create mode 100644 kernel/fs/nfsd/nfs4xdr.c
 create mode 100644 kernel/fs/nfsd/nfscache.c
 create mode 100644 kernel/fs/nfsd/nfsctl.c
 create mode 100644 kernel/fs/nfsd/nfsd.h
 create mode 100644 kernel/fs/nfsd/nfsfh.c
 create mode 100644 kernel/fs/nfsd/nfsfh.h
 create mode 100644 kernel/fs/nfsd/nfsproc.c
 create mode 100644 kernel/fs/nfsd/nfssvc.c
 create mode 100644 kernel/fs/nfsd/nfsxdr.c
 create mode 100644 kernel/fs/nfsd/pnfs.h
 create mode 100644 kernel/fs/nfsd/state.h
 create mode 100644 kernel/fs/nfsd/stats.c
 create mode 100644 kernel/fs/nfsd/stats.h
 create mode 100644 kernel/fs/nfsd/trace.c
 create mode 100644 kernel/fs/nfsd/trace.h
 create mode 100644 kernel/fs/nfsd/vfs.c
 create mode 100644 kernel/fs/nfsd/vfs.h
 create mode 100644 kernel/fs/nfsd/xdr.h
 create mode 100644 kernel/fs/nfsd/xdr3.h
 create mode 100644 kernel/fs/nfsd/xdr4.h
 create mode 100644 kernel/fs/nfsd/xdr4cb.h
 create mode 100644 kernel/fs/nilfs2/Kconfig
 create mode 100644 kernel/fs/nilfs2/Makefile
 create mode 100644 kernel/fs/nilfs2/alloc.c
 create mode 100644 kernel/fs/nilfs2/alloc.h
 create mode 100644 kernel/fs/nilfs2/bmap.c
 create mode 100644 kernel/fs/nilfs2/bmap.h
 create mode 100644 kernel/fs/nilfs2/btnode.c
 create mode 100644 kernel/fs/nilfs2/btnode.h
 create mode 100644 kernel/fs/nilfs2/btree.c
 create mode 100644 kernel/fs/nilfs2/btree.h
 create mode 100644 kernel/fs/nilfs2/cpfile.c
 create mode 100644 kernel/fs/nilfs2/cpfile.h
 create mode 100644 kernel/fs/nilfs2/dat.c
 create mode 100644 kernel/fs/nilfs2/dat.h
 create mode 100644 kernel/fs/nilfs2/dir.c
 create mode 100644 kernel/fs/nilfs2/direct.c
 create mode 100644 kernel/fs/nilfs2/direct.h
 create mode 100644 kernel/fs/nilfs2/export.h
 create mode 100644 kernel/fs/nilfs2/file.c
 create mode 100644 kernel/fs/nilfs2/gcinode.c
 create mode 100644 kernel/fs/nilfs2/ifile.c
 create mode 100644 kernel/fs/nilfs2/ifile.h
 create mode 100644 kernel/fs/nilfs2/inode.c
 create mode 100644 kernel/fs/nilfs2/ioctl.c
 create mode 100644 kernel/fs/nilfs2/mdt.c
 create mode 100644 kernel/fs/nilfs2/mdt.h
 create mode 100644 kernel/fs/nilfs2/namei.c
 create mode 100644 kernel/fs/nilfs2/nilfs.h
 create mode 100644 kernel/fs/nilfs2/page.c
 create mode 100644 kernel/fs/nilfs2/page.h
 create mode 100644 kernel/fs/nilfs2/recovery.c
 create mode 100644 kernel/fs/nilfs2/segbuf.c
 create mode 100644 kernel/fs/nilfs2/segbuf.h
 create mode 100644 kernel/fs/nilfs2/segment.c
 create mode 100644 kernel/fs/nilfs2/segment.h
 create mode 100644 kernel/fs/nilfs2/sufile.c
 create mode 100644 kernel/fs/nilfs2/sufile.h
 create mode 100644 kernel/fs/nilfs2/super.c
 create mode 100644 kernel/fs/nilfs2/sysfs.c
 create mode 100644 kernel/fs/nilfs2/sysfs.h
 create mode 100644 kernel/fs/nilfs2/the_nilfs.c
 create mode 100644 kernel/fs/nilfs2/the_nilfs.h
 create mode 100644 kernel/fs/nls/Kconfig
 create mode 100644 kernel/fs/nls/Makefile
 create mode 100644 kernel/fs/nls/mac-celtic.c
 create mode 100644 kernel/fs/nls/mac-centeuro.c
 create mode 100644 kernel/fs/nls/mac-croatian.c
 create mode 100644 kernel/fs/nls/mac-cyrillic.c
 create mode 100644 kernel/fs/nls/mac-gaelic.c
 create mode 100644 kernel/fs/nls/mac-greek.c
 create mode 100644 kernel/fs/nls/mac-iceland.c
 create mode 100644 kernel/fs/nls/mac-inuit.c
 create mode 100644 kernel/fs/nls/mac-roman.c
 create mode 100644 kernel/fs/nls/mac-romanian.c
 create mode 100644 kernel/fs/nls/mac-turkish.c
 create mode 100644 kernel/fs/nls/nls_ascii.c
 create mode 100644 kernel/fs/nls/nls_base.c
 create mode 100644 kernel/fs/nls/nls_cp1250.c
 create mode 100644 kernel/fs/nls/nls_cp1251.c
 create mode 100644 kernel/fs/nls/nls_cp1255.c
 create mode 100644 kernel/fs/nls/nls_cp437.c
 create mode 100644 kernel/fs/nls/nls_cp737.c
 create mode 100644 kernel/fs/nls/nls_cp775.c
 create mode 100644 kernel/fs/nls/nls_cp850.c
 create mode 100644 kernel/fs/nls/nls_cp852.c
 create mode 100644 kernel/fs/nls/nls_cp855.c
 create mode 100644 kernel/fs/nls/nls_cp857.c
 create mode 100644 kernel/fs/nls/nls_cp860.c
 create mode 100644 kernel/fs/nls/nls_cp861.c
 create mode 100644 kernel/fs/nls/nls_cp862.c
 create mode 100644 kernel/fs/nls/nls_cp863.c
 create mode 100644 kernel/fs/nls/nls_cp864.c
 create mode 100644 kernel/fs/nls/nls_cp865.c
 create mode 100644 kernel/fs/nls/nls_cp866.c
 create mode 100644 kernel/fs/nls/nls_cp869.c
 create mode 100644 kernel/fs/nls/nls_cp874.c
 create mode 100644 kernel/fs/nls/nls_cp932.c
 create mode 100644 kernel/fs/nls/nls_cp936.c
 create mode 100644 kernel/fs/nls/nls_cp949.c
 create mode 100644 kernel/fs/nls/nls_cp950.c
 create mode 100644 kernel/fs/nls/nls_euc-jp.c
 create mode 100644 kernel/fs/nls/nls_iso8859-1.c
 create mode 100644 kernel/fs/nls/nls_iso8859-13.c
 create mode 100644 kernel/fs/nls/nls_iso8859-14.c
 create mode 100644 kernel/fs/nls/nls_iso8859-15.c
 create mode 100644 kernel/fs/nls/nls_iso8859-2.c
 create mode 100644 kernel/fs/nls/nls_iso8859-3.c
 create mode 100644 kernel/fs/nls/nls_iso8859-4.c
 create mode 100644 kernel/fs/nls/nls_iso8859-5.c
 create mode 100644 kernel/fs/nls/nls_iso8859-6.c
 create mode 100644 kernel/fs/nls/nls_iso8859-7.c
 create mode 100644 kernel/fs/nls/nls_iso8859-9.c
 create mode 100644 kernel/fs/nls/nls_koi8-r.c
 create mode 100644 kernel/fs/nls/nls_koi8-ru.c
 create mode 100644 kernel/fs/nls/nls_koi8-u.c
 create mode 100644 kernel/fs/nls/nls_utf8.c
 create mode 100644 kernel/fs/no-block.c
 create mode 100644 kernel/fs/notify/Kconfig
 create mode 100644 kernel/fs/notify/Makefile
 create mode 100644 kernel/fs/notify/dnotify/Kconfig
 create mode 100644 kernel/fs/notify/dnotify/Makefile
 create mode 100644 kernel/fs/notify/dnotify/dnotify.c
 create mode 100644 kernel/fs/notify/fanotify/Kconfig
 create mode 100644 kernel/fs/notify/fanotify/Makefile
 create mode 100644 kernel/fs/notify/fanotify/fanotify.c
 create mode 100644 kernel/fs/notify/fanotify/fanotify.h
 create mode 100644 kernel/fs/notify/fanotify/fanotify_user.c
 create mode 100644 kernel/fs/notify/fdinfo.c
 create mode 100644 kernel/fs/notify/fdinfo.h
 create mode 100644 kernel/fs/notify/fsnotify.c
 create mode 100644 kernel/fs/notify/fsnotify.h
 create mode 100644 kernel/fs/notify/group.c
 create mode 100644 kernel/fs/notify/inode_mark.c
 create mode 100644 kernel/fs/notify/inotify/Kconfig
 create mode 100644 kernel/fs/notify/inotify/Makefile
 create mode 100644 kernel/fs/notify/inotify/inotify.h
 create mode 100644 kernel/fs/notify/inotify/inotify_fsnotify.c
 create mode 100644 kernel/fs/notify/inotify/inotify_user.c
 create mode 100644 kernel/fs/notify/mark.c
 create mode 100644 kernel/fs/notify/notification.c
 create mode 100644 kernel/fs/notify/vfsmount_mark.c
 create mode 100644 kernel/fs/nsfs.c
 create mode 100644 kernel/fs/ntfs/Kconfig
 create mode 100644 kernel/fs/ntfs/Makefile
 create mode 100644 kernel/fs/ntfs/aops.c
 create mode 100644 kernel/fs/ntfs/aops.h
 create mode 100644 kernel/fs/ntfs/attrib.c
 create mode 100644 kernel/fs/ntfs/attrib.h
 create mode 100644 kernel/fs/ntfs/bitmap.c
 create mode 100644 kernel/fs/ntfs/bitmap.h
 create mode 100644 kernel/fs/ntfs/collate.c
 create mode 100644 kernel/fs/ntfs/collate.h
 create mode 100644 kernel/fs/ntfs/compress.c
 create mode 100644 kernel/fs/ntfs/debug.c
 create mode 100644 kernel/fs/ntfs/debug.h
 create mode 100644 kernel/fs/ntfs/dir.c
 create mode 100644 kernel/fs/ntfs/dir.h
 create mode 100644 kernel/fs/ntfs/endian.h
 create mode 100644 kernel/fs/ntfs/file.c
 create mode 100644 kernel/fs/ntfs/index.c
 create mode 100644 kernel/fs/ntfs/index.h
 create mode 100644 kernel/fs/ntfs/inode.c
 create mode 100644 kernel/fs/ntfs/inode.h
 create mode 100644 kernel/fs/ntfs/layout.h
 create mode 100644 kernel/fs/ntfs/lcnalloc.c
 create mode 100644 kernel/fs/ntfs/lcnalloc.h
 create mode 100644 kernel/fs/ntfs/logfile.c
 create mode 100644 kernel/fs/ntfs/logfile.h
 create mode 100644 kernel/fs/ntfs/malloc.h
 create mode 100644 kernel/fs/ntfs/mft.c
 create mode 100644 kernel/fs/ntfs/mft.h
 create mode 100644 kernel/fs/ntfs/mst.c
 create mode 100644 kernel/fs/ntfs/namei.c
 create mode 100644 kernel/fs/ntfs/ntfs.h
 create mode 100644 kernel/fs/ntfs/quota.c
 create mode 100644 kernel/fs/ntfs/quota.h
 create mode 100644 kernel/fs/ntfs/runlist.c
 create mode 100644 kernel/fs/ntfs/runlist.h
 create mode 100644 kernel/fs/ntfs/super.c
 create mode 100644 kernel/fs/ntfs/sysctl.c
 create mode 100644 kernel/fs/ntfs/sysctl.h
 create mode 100644 kernel/fs/ntfs/time.h
 create mode 100644 kernel/fs/ntfs/types.h
 create mode 100644 kernel/fs/ntfs/unistr.c
 create mode 100644 kernel/fs/ntfs/upcase.c
 create mode 100644 kernel/fs/ntfs/usnjrnl.c
 create mode 100644 kernel/fs/ntfs/usnjrnl.h
 create mode 100644 kernel/fs/ntfs/volume.h
 create mode 100644 kernel/fs/ocfs2/Kconfig
 create mode 100644 kernel/fs/ocfs2/Makefile
 create mode 100644 kernel/fs/ocfs2/acl.c
 create mode 100644 kernel/fs/ocfs2/acl.h
 create mode 100644 kernel/fs/ocfs2/alloc.c
 create mode 100644 kernel/fs/ocfs2/alloc.h
 create mode 100644 kernel/fs/ocfs2/aops.c
 create mode 100644 kernel/fs/ocfs2/aops.h
 create mode 100644 kernel/fs/ocfs2/blockcheck.c
 create mode 100644 kernel/fs/ocfs2/blockcheck.h
 create mode 100644 kernel/fs/ocfs2/buffer_head_io.c
 create mode 100644 kernel/fs/ocfs2/buffer_head_io.h
 create mode 100644 kernel/fs/ocfs2/cluster/Makefile
 create mode 100644 kernel/fs/ocfs2/cluster/heartbeat.c
 create mode 100644 kernel/fs/ocfs2/cluster/heartbeat.h
 create mode 100644 kernel/fs/ocfs2/cluster/masklog.c
 create mode 100644 kernel/fs/ocfs2/cluster/masklog.h
 create mode 100644 kernel/fs/ocfs2/cluster/netdebug.c
 create mode 100644 kernel/fs/ocfs2/cluster/nodemanager.c
 create mode 100644 kernel/fs/ocfs2/cluster/nodemanager.h
 create mode 100644 kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h
 create mode 100644 kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h
 create mode 100644 kernel/fs/ocfs2/cluster/quorum.c
 create mode 100644 kernel/fs/ocfs2/cluster/quorum.h
 create mode 100644 kernel/fs/ocfs2/cluster/sys.c
 create mode 100644 kernel/fs/ocfs2/cluster/sys.h
 create mode 100644 kernel/fs/ocfs2/cluster/tcp.c
 create mode 100644 kernel/fs/ocfs2/cluster/tcp.h
 create mode 100644 kernel/fs/ocfs2/cluster/tcp_internal.h
 create mode 100644 kernel/fs/ocfs2/dcache.c
 create mode 100644 kernel/fs/ocfs2/dcache.h
 create mode 100644 kernel/fs/ocfs2/dir.c
 create mode 100644 kernel/fs/ocfs2/dir.h
 create mode 100644 kernel/fs/ocfs2/dlm/Makefile
 create mode 100644 kernel/fs/ocfs2/dlm/dlmapi.h
 create mode 100644 kernel/fs/ocfs2/dlm/dlmast.c
 create mode 100644 kernel/fs/ocfs2/dlm/dlmcommon.h
 create mode 100644 kernel/fs/ocfs2/dlm/dlmconvert.c
 create mode 100644 kernel/fs/ocfs2/dlm/dlmconvert.h
 create mode 100644 kernel/fs/ocfs2/dlm/dlmdebug.c
 create mode 100644 kernel/fs/ocfs2/dlm/dlmdebug.h
 create mode 100644 kernel/fs/ocfs2/dlm/dlmdomain.c
 create mode 100644 kernel/fs/ocfs2/dlm/dlmdomain.h
 create mode 100644 kernel/fs/ocfs2/dlm/dlmlock.c
 create mode 100644 kernel/fs/ocfs2/dlm/dlmmaster.c
 create mode 100644 kernel/fs/ocfs2/dlm/dlmrecovery.c
 create mode 100644 kernel/fs/ocfs2/dlm/dlmthread.c
 create mode 100644 kernel/fs/ocfs2/dlm/dlmunlock.c
 create mode 100644 kernel/fs/ocfs2/dlmfs/Makefile
 create mode 100644 kernel/fs/ocfs2/dlmfs/dlmfs.c
 create mode 100644 kernel/fs/ocfs2/dlmfs/userdlm.c
 create mode 100644 kernel/fs/ocfs2/dlmfs/userdlm.h
 create mode 100644 kernel/fs/ocfs2/dlmglue.c
 create mode 100644 kernel/fs/ocfs2/dlmglue.h
 create mode 100644 kernel/fs/ocfs2/export.c
 create mode 100644 kernel/fs/ocfs2/export.h
 create mode 100644 kernel/fs/ocfs2/extent_map.c
 create mode 100644 kernel/fs/ocfs2/extent_map.h
 create mode 100644 kernel/fs/ocfs2/file.c
 create mode 100644 kernel/fs/ocfs2/file.h
 create mode 100644 kernel/fs/ocfs2/heartbeat.c
 create mode 100644 kernel/fs/ocfs2/heartbeat.h
 create mode 100644 kernel/fs/ocfs2/inode.c
 create mode 100644 kernel/fs/ocfs2/inode.h
 create mode 100644 kernel/fs/ocfs2/ioctl.c
 create mode 100644 kernel/fs/ocfs2/ioctl.h
 create mode 100644 kernel/fs/ocfs2/journal.c
 create mode 100644 kernel/fs/ocfs2/journal.h
 create mode 100644 kernel/fs/ocfs2/localalloc.c
 create mode 100644 kernel/fs/ocfs2/localalloc.h
 create mode 100644 kernel/fs/ocfs2/locks.c
 create mode 100644 kernel/fs/ocfs2/locks.h
 create mode 100644 kernel/fs/ocfs2/mmap.c
 create mode 100644 kernel/fs/ocfs2/mmap.h
 create mode 100644 kernel/fs/ocfs2/move_extents.c
 create mode 100644 kernel/fs/ocfs2/move_extents.h
 create mode 100644 kernel/fs/ocfs2/namei.c
 create mode 100644 kernel/fs/ocfs2/namei.h
 create mode 100644 kernel/fs/ocfs2/ocfs1_fs_compat.h
 create mode 100644 kernel/fs/ocfs2/ocfs2.h
 create mode 100644 kernel/fs/ocfs2/ocfs2_fs.h
 create mode 100644 kernel/fs/ocfs2/ocfs2_ioctl.h
 create mode 100644 kernel/fs/ocfs2/ocfs2_lockid.h
 create mode 100644 kernel/fs/ocfs2/ocfs2_lockingver.h
 create mode 100644 kernel/fs/ocfs2/ocfs2_trace.h
 create mode 100644 kernel/fs/ocfs2/quota.h
 create mode 100644 kernel/fs/ocfs2/quota_global.c
 create mode 100644 kernel/fs/ocfs2/quota_local.c
 create mode 100644 kernel/fs/ocfs2/refcounttree.c
 create mode 100644 kernel/fs/ocfs2/refcounttree.h
 create mode 100644 kernel/fs/ocfs2/reservations.c
 create mode 100644 kernel/fs/ocfs2/reservations.h
 create mode 100644 kernel/fs/ocfs2/resize.c
 create mode 100644 kernel/fs/ocfs2/resize.h
 create mode 100644 kernel/fs/ocfs2/slot_map.c
 create mode 100644 kernel/fs/ocfs2/slot_map.h
 create mode 100644 kernel/fs/ocfs2/stack_o2cb.c
 create mode 100644 kernel/fs/ocfs2/stack_user.c
 create mode 100644 kernel/fs/ocfs2/stackglue.c
 create mode 100644 kernel/fs/ocfs2/stackglue.h
 create mode 100644 kernel/fs/ocfs2/suballoc.c
 create mode 100644 kernel/fs/ocfs2/suballoc.h
 create mode 100644 kernel/fs/ocfs2/super.c
 create mode 100644 kernel/fs/ocfs2/super.h
 create mode 100644 kernel/fs/ocfs2/symlink.c
 create mode 100644 kernel/fs/ocfs2/symlink.h
 create mode 100644 kernel/fs/ocfs2/sysfile.c
 create mode 100644 kernel/fs/ocfs2/sysfile.h
 create mode 100644 kernel/fs/ocfs2/uptodate.c
 create mode 100644 kernel/fs/ocfs2/uptodate.h
 create mode 100644 kernel/fs/ocfs2/xattr.c
 create mode 100644 kernel/fs/ocfs2/xattr.h
 create mode 100644 kernel/fs/omfs/Kconfig
 create mode 100644 kernel/fs/omfs/Makefile
 create mode 100644 kernel/fs/omfs/bitmap.c
 create mode 100644 kernel/fs/omfs/dir.c
 create mode 100644 kernel/fs/omfs/file.c
 create mode 100644 kernel/fs/omfs/inode.c
 create mode 100644 kernel/fs/omfs/omfs.h
 create mode 100644 kernel/fs/omfs/omfs_fs.h
 create mode 100644 kernel/fs/open.c
 create mode 100644 kernel/fs/openpromfs/Makefile
 create mode 100644 kernel/fs/openpromfs/inode.c
 create mode 100644 kernel/fs/overlayfs/Kconfig
 create mode 100644 kernel/fs/overlayfs/Makefile
 create mode 100644 kernel/fs/overlayfs/copy_up.c
 create mode 100644 kernel/fs/overlayfs/dir.c
 create mode 100644 kernel/fs/overlayfs/inode.c
 create mode 100644 kernel/fs/overlayfs/overlayfs.h
 create mode 100644 kernel/fs/overlayfs/readdir.c
 create mode 100644 kernel/fs/overlayfs/super.c
 create mode 100644 kernel/fs/pipe.c
 create mode 100644 kernel/fs/pnode.c
 create mode 100644 kernel/fs/pnode.h
 create mode 100644 kernel/fs/posix_acl.c
 create mode 100644 kernel/fs/proc/Kconfig
 create mode 100644 kernel/fs/proc/Makefile
 create mode 100644 kernel/fs/proc/array.c
 create mode 100644 kernel/fs/proc/base.c
 create mode 100644 kernel/fs/proc/cmdline.c
 create mode 100644 kernel/fs/proc/consoles.c
 create mode 100644 kernel/fs/proc/cpuinfo.c
 create mode 100644 kernel/fs/proc/devices.c
 create mode 100644 kernel/fs/proc/fd.c
 create mode 100644 kernel/fs/proc/fd.h
 create mode 100644 kernel/fs/proc/generic.c
 create mode 100644 kernel/fs/proc/inode.c
 create mode 100644 kernel/fs/proc/internal.h
 create mode 100644 kernel/fs/proc/interrupts.c
 create mode 100644 kernel/fs/proc/kcore.c
 create mode 100644 kernel/fs/proc/kmsg.c
 create mode 100644 kernel/fs/proc/loadavg.c
 create mode 100644 kernel/fs/proc/meminfo.c
 create mode 100644 kernel/fs/proc/namespaces.c
 create mode 100644 kernel/fs/proc/nommu.c
 create mode 100644 kernel/fs/proc/page.c
 create mode 100644 kernel/fs/proc/proc_net.c
 create mode 100644 kernel/fs/proc/proc_sysctl.c
 create mode 100644 kernel/fs/proc/proc_tty.c
 create mode 100644 kernel/fs/proc/root.c
 create mode 100644 kernel/fs/proc/self.c
 create mode 100644 kernel/fs/proc/softirqs.c
 create mode 100644 kernel/fs/proc/stat.c
 create mode 100644 kernel/fs/proc/task_mmu.c
 create mode 100644 kernel/fs/proc/task_nommu.c
 create mode 100644 kernel/fs/proc/thread_self.c
 create mode 100644 kernel/fs/proc/uptime.c
 create mode 100644 kernel/fs/proc/version.c
 create mode 100644 kernel/fs/proc/vmcore.c
 create mode 100644 kernel/fs/proc_namespace.c
 create mode 100644 kernel/fs/pstore/Kconfig
 create mode 100644 kernel/fs/pstore/Makefile
 create mode 100644 kernel/fs/pstore/ftrace.c
 create mode 100644 kernel/fs/pstore/inode.c
 create mode 100644 kernel/fs/pstore/internal.h
 create mode 100644 kernel/fs/pstore/platform.c
 create mode 100644 kernel/fs/pstore/pmsg.c
 create mode 100644 kernel/fs/pstore/ram.c
 create mode 100644 kernel/fs/pstore/ram_core.c
 create mode 100644 kernel/fs/qnx4/Kconfig
 create mode 100644 kernel/fs/qnx4/Makefile
 create mode 100644 kernel/fs/qnx4/README
 create mode 100644 kernel/fs/qnx4/bitmap.c
 create mode 100644 kernel/fs/qnx4/dir.c
 create mode 100644 kernel/fs/qnx4/inode.c
 create mode 100644 kernel/fs/qnx4/namei.c
 create mode 100644 kernel/fs/qnx4/qnx4.h
 create mode 100644 kernel/fs/qnx6/Kconfig
 create mode 100644 kernel/fs/qnx6/Makefile
 create mode 100644 kernel/fs/qnx6/README
 create mode 100644 kernel/fs/qnx6/dir.c
 create mode 100644 kernel/fs/qnx6/inode.c
 create mode 100644 kernel/fs/qnx6/namei.c
 create mode 100644 kernel/fs/qnx6/qnx6.h
 create mode 100644 kernel/fs/qnx6/super_mmi.c
 create mode 100644 kernel/fs/quota/Kconfig
 create mode 100644 kernel/fs/quota/Makefile
 create mode 100644 kernel/fs/quota/compat.c
 create mode 100644 kernel/fs/quota/dquot.c
 create mode 100644 kernel/fs/quota/kqid.c
 create mode 100644 kernel/fs/quota/netlink.c
 create mode 100644 kernel/fs/quota/quota.c
 create mode 100644 kernel/fs/quota/quota_tree.c
 create mode 100644 kernel/fs/quota/quota_tree.h
 create mode 100644 kernel/fs/quota/quota_v1.c
 create mode 100644 kernel/fs/quota/quota_v2.c
 create mode 100644 kernel/fs/quota/quotaio_v1.h
 create mode 100644 kernel/fs/quota/quotaio_v2.h
 create mode 100644 kernel/fs/ramfs/Makefile
 create mode 100644 kernel/fs/ramfs/file-mmu.c
 create mode 100644 kernel/fs/ramfs/file-nommu.c
 create mode 100644 kernel/fs/ramfs/inode.c
 create mode 100644 kernel/fs/ramfs/internal.h
 create mode 100644 kernel/fs/read_write.c
 create mode 100644 kernel/fs/readdir.c
 create mode 100644 kernel/fs/reiserfs/Kconfig
 create mode 100644 kernel/fs/reiserfs/Makefile
 create mode 100644 kernel/fs/reiserfs/README
 create mode 100644 kernel/fs/reiserfs/acl.h
 create mode 100644 kernel/fs/reiserfs/bitmap.c
 create mode 100644 kernel/fs/reiserfs/dir.c
 create mode 100644 kernel/fs/reiserfs/do_balan.c
 create mode 100644 kernel/fs/reiserfs/file.c
 create mode 100644 kernel/fs/reiserfs/fix_node.c
 create mode 100644 kernel/fs/reiserfs/hashes.c
 create mode 100644 kernel/fs/reiserfs/ibalance.c
 create mode 100644 kernel/fs/reiserfs/inode.c
 create mode 100644 kernel/fs/reiserfs/ioctl.c
 create mode 100644 kernel/fs/reiserfs/item_ops.c
 create mode 100644 kernel/fs/reiserfs/journal.c
 create mode 100644 kernel/fs/reiserfs/lbalance.c
 create mode 100644 kernel/fs/reiserfs/lock.c
 create mode 100644 kernel/fs/reiserfs/namei.c
 create mode 100644 kernel/fs/reiserfs/objectid.c
 create mode 100644 kernel/fs/reiserfs/prints.c
 create mode 100644 kernel/fs/reiserfs/procfs.c
 create mode 100644 kernel/fs/reiserfs/reiserfs.h
 create mode 100644 kernel/fs/reiserfs/resize.c
 create mode 100644 kernel/fs/reiserfs/stree.c
 create mode 100644 kernel/fs/reiserfs/super.c
 create mode 100644 kernel/fs/reiserfs/tail_conversion.c
 create mode 100644 kernel/fs/reiserfs/xattr.c
 create mode 100644 kernel/fs/reiserfs/xattr.h
 create mode 100644 kernel/fs/reiserfs/xattr_acl.c
 create mode 100644 kernel/fs/reiserfs/xattr_security.c
 create mode 100644 kernel/fs/reiserfs/xattr_trusted.c
 create mode 100644 kernel/fs/reiserfs/xattr_user.c
 create mode 100644 kernel/fs/romfs/Kconfig
 create mode 100644 kernel/fs/romfs/Makefile
 create mode 100644 kernel/fs/romfs/internal.h
 create mode 100644 kernel/fs/romfs/mmap-nommu.c
 create mode 100644 kernel/fs/romfs/storage.c
 create mode 100644 kernel/fs/romfs/super.c
 create mode 100644 kernel/fs/select.c
 create mode 100644 kernel/fs/seq_file.c
 create mode 100644 kernel/fs/signalfd.c
 create mode 100644 kernel/fs/splice.c
 create mode 100644 kernel/fs/squashfs/Kconfig
 create mode 100644 kernel/fs/squashfs/Makefile
 create mode 100644 kernel/fs/squashfs/block.c
 create mode 100644 kernel/fs/squashfs/cache.c
 create mode 100644 kernel/fs/squashfs/decompressor.c
 create mode 100644 kernel/fs/squashfs/decompressor.h
 create mode 100644 kernel/fs/squashfs/decompressor_multi.c
 create mode 100644 kernel/fs/squashfs/decompressor_multi_percpu.c
 create mode 100644 kernel/fs/squashfs/decompressor_single.c
 create mode 100644 kernel/fs/squashfs/dir.c
 create mode 100644 kernel/fs/squashfs/export.c
 create mode 100644 kernel/fs/squashfs/file.c
 create mode 100644 kernel/fs/squashfs/file_cache.c
 create mode 100644 kernel/fs/squashfs/file_direct.c
 create mode 100644 kernel/fs/squashfs/fragment.c
 create mode 100644 kernel/fs/squashfs/id.c
 create mode 100644 kernel/fs/squashfs/inode.c
 create mode 100644 kernel/fs/squashfs/lz4_wrapper.c
 create mode 100644 kernel/fs/squashfs/lzo_wrapper.c
 create mode 100644 kernel/fs/squashfs/namei.c
 create mode 100644 kernel/fs/squashfs/page_actor.c
 create mode 100644 kernel/fs/squashfs/page_actor.h
 create mode 100644 kernel/fs/squashfs/squashfs.h
 create mode 100644 kernel/fs/squashfs/squashfs_fs.h
 create mode 100644 kernel/fs/squashfs/squashfs_fs_i.h
 create mode 100644 kernel/fs/squashfs/squashfs_fs_sb.h
 create mode 100644 kernel/fs/squashfs/super.c
 create mode 100644 kernel/fs/squashfs/symlink.c
 create mode 100644 kernel/fs/squashfs/xattr.c
 create mode 100644 kernel/fs/squashfs/xattr.h
 create mode 100644 kernel/fs/squashfs/xattr_id.c
 create mode 100644 kernel/fs/squashfs/xz_wrapper.c
 create mode 100644 kernel/fs/squashfs/zlib_wrapper.c
 create mode 100644 kernel/fs/stack.c
 create mode 100644 kernel/fs/stat.c
 create mode 100644 kernel/fs/statfs.c
 create mode 100644 kernel/fs/super.c
 create mode 100644 kernel/fs/sync.c
 create mode 100644 kernel/fs/sysfs/Kconfig
 create mode 100644 kernel/fs/sysfs/Makefile
 create mode 100644 kernel/fs/sysfs/dir.c
 create mode 100644 kernel/fs/sysfs/file.c
 create mode 100644 kernel/fs/sysfs/group.c
 create mode 100644 kernel/fs/sysfs/mount.c
 create mode 100644 kernel/fs/sysfs/symlink.c
 create mode 100644 kernel/fs/sysfs/sysfs.h
 create mode 100644 kernel/fs/sysv/Kconfig
 create mode 100644 kernel/fs/sysv/Makefile
 create mode 100644 kernel/fs/sysv/balloc.c
 create mode 100644 kernel/fs/sysv/dir.c
 create mode 100644 kernel/fs/sysv/file.c
 create mode 100644 kernel/fs/sysv/ialloc.c
 create mode 100644 kernel/fs/sysv/inode.c
 create mode 100644 kernel/fs/sysv/itree.c
 create mode 100644 kernel/fs/sysv/namei.c
 create mode 100644 kernel/fs/sysv/super.c
 create mode 100644 kernel/fs/sysv/symlink.c
 create mode 100644 kernel/fs/sysv/sysv.h
 create mode 100644 kernel/fs/timerfd.c
 create mode 100644 kernel/fs/tracefs/Makefile
 create mode 100644 kernel/fs/tracefs/inode.c
 create mode 100644 kernel/fs/ubifs/Kconfig
 create mode 100644 kernel/fs/ubifs/Makefile
 create mode 100644 kernel/fs/ubifs/budget.c
 create mode 100644 kernel/fs/ubifs/commit.c
 create mode 100644 kernel/fs/ubifs/compress.c
 create mode 100644 kernel/fs/ubifs/debug.c
 create mode 100644 kernel/fs/ubifs/debug.h
 create mode 100644 kernel/fs/ubifs/dir.c
 create mode 100644 kernel/fs/ubifs/file.c
 create mode 100644 kernel/fs/ubifs/find.c
 create mode 100644 kernel/fs/ubifs/gc.c
 create mode 100644 kernel/fs/ubifs/io.c
 create mode 100644 kernel/fs/ubifs/ioctl.c
 create mode 100644 kernel/fs/ubifs/journal.c
 create mode 100644 kernel/fs/ubifs/key.h
 create mode 100644 kernel/fs/ubifs/log.c
 create mode 100644 kernel/fs/ubifs/lprops.c
 create mode 100644 kernel/fs/ubifs/lpt.c
 create mode 100644 kernel/fs/ubifs/lpt_commit.c
 create mode 100644 kernel/fs/ubifs/master.c
 create mode 100644 kernel/fs/ubifs/misc.h
 create mode 100644 kernel/fs/ubifs/orphan.c
 create mode 100644 kernel/fs/ubifs/recovery.c
 create mode 100644 kernel/fs/ubifs/replay.c
 create mode 100644 kernel/fs/ubifs/sb.c
 create mode 100644 kernel/fs/ubifs/scan.c
 create mode 100644 kernel/fs/ubifs/shrinker.c
 create mode 100644 kernel/fs/ubifs/super.c
 create mode 100644 kernel/fs/ubifs/tnc.c
 create mode 100644 kernel/fs/ubifs/tnc_commit.c
 create mode 100644 kernel/fs/ubifs/tnc_misc.c
 create mode 100644 kernel/fs/ubifs/ubifs-media.h
 create mode 100644 kernel/fs/ubifs/ubifs.h
 create mode 100644 kernel/fs/ubifs/xattr.c
 create mode 100644 kernel/fs/udf/Kconfig
 create mode 100644 kernel/fs/udf/Makefile
 create mode 100644 kernel/fs/udf/balloc.c
 create mode 100644 kernel/fs/udf/dir.c
 create mode 100644 kernel/fs/udf/directory.c
 create mode 100644 kernel/fs/udf/ecma_167.h
 create mode 100644 kernel/fs/udf/file.c
 create mode 100644 kernel/fs/udf/ialloc.c
 create mode 100644 kernel/fs/udf/inode.c
 create mode 100644 kernel/fs/udf/lowlevel.c
 create mode 100644 kernel/fs/udf/misc.c
 create mode 100644 kernel/fs/udf/namei.c
 create mode 100644 kernel/fs/udf/osta_udf.h
 create mode 100644 kernel/fs/udf/partition.c
 create mode 100644 kernel/fs/udf/super.c
 create mode 100644 kernel/fs/udf/symlink.c
 create mode 100644 kernel/fs/udf/truncate.c
 create mode 100644 kernel/fs/udf/udf_i.h
 create mode 100644 kernel/fs/udf/udf_sb.h
 create mode 100644 kernel/fs/udf/udfdecl.h
 create mode 100644 kernel/fs/udf/udfend.h
 create mode 100644 kernel/fs/udf/udftime.c
 create mode 100644 kernel/fs/udf/unicode.c
 create mode 100644 kernel/fs/ufs/Kconfig
 create mode 100644 kernel/fs/ufs/Makefile
 create mode 100644 kernel/fs/ufs/balloc.c
 create mode 100644 kernel/fs/ufs/cylinder.c
 create mode 100644 kernel/fs/ufs/dir.c
 create mode 100644 kernel/fs/ufs/file.c
 create mode 100644 kernel/fs/ufs/ialloc.c
 create mode 100644 kernel/fs/ufs/inode.c
 create mode 100644 kernel/fs/ufs/namei.c
 create mode 100644 kernel/fs/ufs/super.c
 create mode 100644 kernel/fs/ufs/swab.h
 create mode 100644 kernel/fs/ufs/symlink.c
 create mode 100644 kernel/fs/ufs/truncate.c
 create mode 100644 kernel/fs/ufs/ufs.h
 create mode 100644 kernel/fs/ufs/ufs_fs.h
 create mode 100644 kernel/fs/ufs/util.c
 create mode 100644 kernel/fs/ufs/util.h
 create mode 100644 kernel/fs/utimes.c
 create mode 100644 kernel/fs/xattr.c
 create mode 100644 kernel/fs/xfs/Kconfig
 create mode 100644 kernel/fs/xfs/Makefile
 create mode 100644 kernel/fs/xfs/kmem.c
 create mode 100644 kernel/fs/xfs/kmem.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc_btree.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc_btree.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_attr.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_leaf.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_leaf.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_remote.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_remote.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_sf.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_bit.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap_btree.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap_btree.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_btree.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_btree.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_cksum.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_da_btree.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_da_btree.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_da_format.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_da_format.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_block.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_data.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_leaf.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_node.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_priv.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_sf.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_dquot_buf.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_format.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_fs.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc_btree.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc_btree.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_buf.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_buf.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_fork.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_fork.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_log_format.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_log_recover.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_log_rlimit.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_quota_defs.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_rtbitmap.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_sb.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_sb.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_shared.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_symlink_remote.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_resv.c
 create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_resv.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_space.h
 create mode 100644 kernel/fs/xfs/libxfs/xfs_types.h
 create mode 100644 kernel/fs/xfs/mrlock.h
 create mode 100644 kernel/fs/xfs/uuid.c
 create mode 100644 kernel/fs/xfs/uuid.h
 create mode 100644 kernel/fs/xfs/xfs.h
 create mode 100644 kernel/fs/xfs/xfs_acl.c
 create mode 100644 kernel/fs/xfs/xfs_acl.h
 create mode 100644 kernel/fs/xfs/xfs_aops.c
 create mode 100644 kernel/fs/xfs/xfs_aops.h
 create mode 100644 kernel/fs/xfs/xfs_attr.h
 create mode 100644 kernel/fs/xfs/xfs_attr_inactive.c
 create mode 100644 kernel/fs/xfs/xfs_attr_list.c
 create mode 100644 kernel/fs/xfs/xfs_bit.c
 create mode 100644 kernel/fs/xfs/xfs_bmap_util.c
 create mode 100644 kernel/fs/xfs/xfs_bmap_util.h
 create mode 100644 kernel/fs/xfs/xfs_buf.c
 create mode 100644 kernel/fs/xfs/xfs_buf.h
 create mode 100644 kernel/fs/xfs/xfs_buf_item.c
 create mode 100644 kernel/fs/xfs/xfs_buf_item.h
 create mode 100644 kernel/fs/xfs/xfs_dir2_readdir.c
 create mode 100644 kernel/fs/xfs/xfs_discard.c
 create mode 100644 kernel/fs/xfs/xfs_discard.h
 create mode 100644 kernel/fs/xfs/xfs_dquot.c
 create mode 100644 kernel/fs/xfs/xfs_dquot.h
 create mode 100644 kernel/fs/xfs/xfs_dquot_item.c
 create mode 100644 kernel/fs/xfs/xfs_dquot_item.h
 create mode 100644 kernel/fs/xfs/xfs_error.c
 create mode 100644 kernel/fs/xfs/xfs_error.h
 create mode 100644 kernel/fs/xfs/xfs_export.c
 create mode 100644 kernel/fs/xfs/xfs_export.h
 create mode 100644 kernel/fs/xfs/xfs_extent_busy.c
 create mode 100644 kernel/fs/xfs/xfs_extent_busy.h
 create mode 100644 kernel/fs/xfs/xfs_extfree_item.c
 create mode 100644 kernel/fs/xfs/xfs_extfree_item.h
 create mode 100644 kernel/fs/xfs/xfs_file.c
 create mode 100644 kernel/fs/xfs/xfs_filestream.c
 create mode 100644 kernel/fs/xfs/xfs_filestream.h
 create mode 100644 kernel/fs/xfs/xfs_fsops.c
 create mode 100644 kernel/fs/xfs/xfs_fsops.h
 create mode 100644 kernel/fs/xfs/xfs_globals.c
 create mode 100644 kernel/fs/xfs/xfs_icache.c
 create mode 100644 kernel/fs/xfs/xfs_icache.h
 create mode 100644 kernel/fs/xfs/xfs_icreate_item.c
 create mode 100644 kernel/fs/xfs/xfs_icreate_item.h
 create mode 100644 kernel/fs/xfs/xfs_inode.c
 create mode 100644 kernel/fs/xfs/xfs_inode.h
 create mode 100644 kernel/fs/xfs/xfs_inode_item.c
 create mode 100644 kernel/fs/xfs/xfs_inode_item.h
 create mode 100644 kernel/fs/xfs/xfs_ioctl.c
 create mode 100644 kernel/fs/xfs/xfs_ioctl.h
 create mode 100644 kernel/fs/xfs/xfs_ioctl32.c
 create mode 100644 kernel/fs/xfs/xfs_ioctl32.h
 create mode 100644 kernel/fs/xfs/xfs_iomap.c
 create mode 100644 kernel/fs/xfs/xfs_iomap.h
 create mode 100644 kernel/fs/xfs/xfs_iops.c
 create mode 100644 kernel/fs/xfs/xfs_iops.h
 create mode 100644 kernel/fs/xfs/xfs_itable.c
 create mode 100644 kernel/fs/xfs/xfs_itable.h
 create mode 100644 kernel/fs/xfs/xfs_linux.h
 create mode 100644 kernel/fs/xfs/xfs_log.c
 create mode 100644 kernel/fs/xfs/xfs_log.h
 create mode 100644 kernel/fs/xfs/xfs_log_cil.c
 create mode 100644 kernel/fs/xfs/xfs_log_priv.h
 create mode 100644 kernel/fs/xfs/xfs_log_recover.c
 create mode 100644 kernel/fs/xfs/xfs_message.c
 create mode 100644 kernel/fs/xfs/xfs_message.h
 create mode 100644 kernel/fs/xfs/xfs_mount.c
 create mode 100644 kernel/fs/xfs/xfs_mount.h
 create mode 100644 kernel/fs/xfs/xfs_mru_cache.c
 create mode 100644 kernel/fs/xfs/xfs_mru_cache.h
 create mode 100644 kernel/fs/xfs/xfs_pnfs.c
 create mode 100644 kernel/fs/xfs/xfs_pnfs.h
 create mode 100644 kernel/fs/xfs/xfs_qm.c
 create mode 100644 kernel/fs/xfs/xfs_qm.h
 create mode 100644 kernel/fs/xfs/xfs_qm_bhv.c
 create mode 100644 kernel/fs/xfs/xfs_qm_syscalls.c
 create mode 100644 kernel/fs/xfs/xfs_quota.h
 create mode 100644 kernel/fs/xfs/xfs_quotaops.c
 create mode 100644 kernel/fs/xfs/xfs_rtalloc.c
 create mode 100644 kernel/fs/xfs/xfs_rtalloc.h
 create mode 100644 kernel/fs/xfs/xfs_stats.c
 create mode 100644 kernel/fs/xfs/xfs_stats.h
 create mode 100644 kernel/fs/xfs/xfs_super.c
 create mode 100644 kernel/fs/xfs/xfs_super.h
 create mode 100644 kernel/fs/xfs/xfs_symlink.c
 create mode 100644 kernel/fs/xfs/xfs_symlink.h
 create mode 100644 kernel/fs/xfs/xfs_sysctl.c
 create mode 100644 kernel/fs/xfs/xfs_sysctl.h
 create mode 100644 kernel/fs/xfs/xfs_sysfs.c
 create mode 100644 kernel/fs/xfs/xfs_sysfs.h
 create mode 100644 kernel/fs/xfs/xfs_trace.c
 create mode 100644 kernel/fs/xfs/xfs_trace.h
 create mode 100644 kernel/fs/xfs/xfs_trans.c
 create mode 100644 kernel/fs/xfs/xfs_trans.h
 create mode 100644 kernel/fs/xfs/xfs_trans_ail.c
 create mode 100644 kernel/fs/xfs/xfs_trans_buf.c
 create mode 100644 kernel/fs/xfs/xfs_trans_dquot.c
 create mode 100644 kernel/fs/xfs/xfs_trans_extfree.c
 create mode 100644 kernel/fs/xfs/xfs_trans_inode.c
 create mode 100644 kernel/fs/xfs/xfs_trans_priv.h
 create mode 100644 kernel/fs/xfs/xfs_xattr.c

(limited to 'kernel/fs')

diff --git a/kernel/fs/9p/Kconfig b/kernel/fs/9p/Kconfig
new file mode 100644
index 000000000..6489e1fc1
--- /dev/null
+++ b/kernel/fs/9p/Kconfig
@@ -0,0 +1,46 @@
+config 9P_FS
+	tristate "Plan 9 Resource Sharing Support (9P2000)"
+	depends on INET && NET_9P
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See <http://v9fs.sf.net> for more information.
+
+	  If unsure, say N.
+
+if 9P_FS
+config 9P_FSCACHE
+	bool "Enable 9P client caching support"
+	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
+	help
+	  Choose Y here to enable persistent, read-only local
+	  caching support for 9p clients using FS-Cache
+
+
+config 9P_FS_POSIX_ACL
+	bool "9P POSIX Access Control Lists"
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+endif
+
+
+config 9P_FS_SECURITY
+        bool "9P Security Labels"
+        depends on 9P_FS
+        help
+          Security labels support alternative access control models
+          implemented by security modules like SELinux.  This option
+          enables an extended attribute handler for file security
+          labels in the 9P filesystem.
+
+          If you are not using a security module that requires using
+          extended attributes for file security labels, say N.
diff --git a/kernel/fs/9p/Makefile b/kernel/fs/9p/Makefile
new file mode 100644
index 000000000..ff7be98f8
--- /dev/null
+++ b/kernel/fs/9p/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_9P_FS) := 9p.o
+
+9p-objs := \
+	vfs_super.o \
+	vfs_inode.o \
+	vfs_inode_dotl.o \
+	vfs_addr.o \
+	vfs_file.o \
+	vfs_dir.o \
+	vfs_dentry.o \
+	v9fs.o \
+	fid.o  \
+	xattr.o \
+	xattr_user.o \
+	xattr_trusted.o
+
+9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
+9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/kernel/fs/9p/acl.c b/kernel/fs/9p/acl.c
new file mode 100644
index 000000000..31c010372
--- /dev/null
+++ b/kernel/fs/9p/acl.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+	ssize_t size;
+	void *value = NULL;
+	struct posix_acl *acl = NULL;
+
+	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = v9fs_fid_xattr_get(fid, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(&init_user_ns, value, size);
+			if (IS_ERR(acl))
+				goto err_out;
+		}
+	} else if (size == -ENODATA || size == 0 ||
+		   size == -ENOSYS || size == -EOPNOTSUPP) {
+		acl = NULL;
+	} else
+		acl = ERR_PTR(-EIO);
+
+err_out:
+	kfree(value);
+	return acl;
+}
+
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *pacl, *dacl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+			((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+		return 0;
+	}
+	/* get the default/access acl values and cache them */
+	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+
+	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+	} else
+		retval = -EIO;
+
+	if (!IS_ERR(dacl))
+		posix_acl_release(dacl);
+
+	if (!IS_ERR(pacl))
+		posix_acl_release(pacl);
+
+	return retval;
+}
+
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	/*
+	 * 9p Always cache the acl value when
+	 * instantiating the inode (v9fs_inode_from_fid)
+	 */
+	acl = get_cached_acl(inode, type);
+	BUG_ON(acl == ACL_NOT_CACHED);
+	return acl;
+}
+
+struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type)
+{
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+			((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
+		/*
+		 * On access = client  and acl = on mode get the acl
+		 * values from the server
+		 */
+		return NULL;
+	}
+	return v9fs_get_cached_acl(inode, type);
+
+}
+
+static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
+{
+	int retval;
+	char *name;
+	size_t size;
+	void *buffer;
+	if (!acl)
+		return 0;
+
+	/* Set a setxattr request to server */
+	size = posix_acl_xattr_size(acl->a_count);
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+	if (retval < 0)
+		goto err_free_out;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0);
+err_free_out:
+	kfree(buffer);
+	return retval;
+}
+
+int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *acl;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+		if (retval)
+			return retval;
+		set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+		retval = v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);
+		posix_acl_release(acl);
+	}
+	return retval;
+}
+
+int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
+			struct posix_acl *dacl, struct posix_acl *acl)
+{
+	set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+	set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+	v9fs_set_acl(fid, ACL_TYPE_DEFAULT, dacl);
+	v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);
+	return 0;
+}
+
+void v9fs_put_acl(struct posix_acl *dacl,
+		  struct posix_acl *acl)
+{
+	posix_acl_release(dacl);
+	posix_acl_release(acl);
+}
+
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
+		  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	int retval = 0;
+	umode_t mode = *modep;
+	struct posix_acl *acl = NULL;
+
+	if (!S_ISLNK(mode)) {
+		acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			mode &= ~current_umask();
+	}
+	if (acl) {
+		if (S_ISDIR(mode))
+			*dpacl = posix_acl_dup(acl);
+		retval = __posix_acl_create(&acl, GFP_NOFS, &mode);
+		if (retval < 0)
+			return retval;
+		if (retval > 0)
+			*pacl = acl;
+		else
+			posix_acl_release(acl);
+	}
+	*modep  = mode;
+	return 0;
+}
+
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name =  POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size, int type)
+{
+	struct v9fs_session_info *v9ses;
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_dentry2v9ses(dentry);
+	/*
+	 * We allow set/get/list of acl when access=client is not specified
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+
+	acl = v9fs_get_cached_acl(d_inode(dentry), type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name =  POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+
+static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	int retval;
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+	struct inode *inode = d_inode(dentry);
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_dentry2v9ses(dentry);
+	/*
+	 * set the attribute on the remote. Without even looking at the
+	 * xattr value. We leave it to the server to validate
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_set_acl(dentry, name,
+					   value, size, flags, type);
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+	if (value) {
+		/* update the cached acl value */
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			retval = posix_acl_valid(acl);
+			if (retval)
+				goto err_out;
+		}
+	} else
+		acl = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			umode_t mode = inode->i_mode;
+			retval = posix_acl_equiv_mode(acl, &mode);
+			if (retval < 0)
+				goto err_out;
+			else {
+				struct iattr iattr;
+				if (retval == 0) {
+					/*
+					 * ACL can be represented
+					 * by the mode bits. So don't
+					 * update ACL.
+					 */
+					acl = NULL;
+					value = NULL;
+					size = 0;
+				}
+				/* Updte the mode bits */
+				iattr.ia_mode = ((mode & S_IALLUGO) |
+						 (inode->i_mode & ~S_IALLUGO));
+				iattr.ia_valid = ATTR_MODE;
+				/* FIXME should we update ctime ?
+				 * What is the following setxattr update the
+				 * mode ?
+				 */
+				v9fs_vfs_setattr_dotl(dentry, &iattr);
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode)) {
+			retval = acl ? -EINVAL : 0;
+			goto err_out;
+		}
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, value, size, flags);
+	if (!retval)
+		set_cached_acl(inode, type, acl);
+err_out:
+	posix_acl_release(acl);
+	return retval;
+}
+
+const struct xattr_handler v9fs_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
+
+const struct xattr_handler v9fs_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
diff --git a/kernel/fs/9p/acl.h b/kernel/fs/9p/acl.h
new file mode 100644
index 000000000..e4f7e8822
--- /dev/null
+++ b/kernel/fs/9p/acl.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_ACL_H
+#define FS_9P_ACL_H
+
+#ifdef CONFIG_9P_FS_POSIX_ACL
+extern int v9fs_get_acl(struct inode *, struct p9_fid *);
+extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
+extern int v9fs_acl_chmod(struct inode *, struct p9_fid *);
+extern int v9fs_set_create_acl(struct inode *, struct p9_fid *,
+			       struct posix_acl *, struct posix_acl *);
+extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
+			 struct posix_acl **dpacl, struct posix_acl **pacl);
+extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
+#else
+#define v9fs_iop_get_acl NULL
+static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	return 0;
+}
+static inline int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
+{
+	return 0;
+}
+static inline int v9fs_set_create_acl(struct inode *inode,
+				      struct p9_fid *fid,
+				      struct posix_acl *dacl,
+				      struct posix_acl *acl)
+{
+	return 0;
+}
+static inline void v9fs_put_acl(struct posix_acl *dacl,
+				struct posix_acl *acl)
+{
+}
+static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,
+				struct posix_acl **dpacl,
+				struct posix_acl **pacl)
+{
+	return 0;
+}
+
+#endif
+#endif /* FS_9P_XATTR_H */
diff --git a/kernel/fs/9p/cache.c b/kernel/fs/9p/cache.c
new file mode 100644
index 000000000..a69260f27
--- /dev/null
+++ b/kernel/fs/9p/cache.c
@@ -0,0 +1,414 @@
+/*
+ * V9FS cache definitions.
+ *
+ *  Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/jiffies.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+
+#include "v9fs.h"
+#include "cache.h"
+
+#define CACHETAG_LEN  11
+
+struct fscache_netfs v9fs_cache_netfs = {
+	.name 		= "9p",
+	.version 	= 0,
+};
+
+/**
+ * v9fs_random_cachetag - Generate a random tag to be associated
+ *			  with a new cache session.
+ *
+ * The value of jiffies is used for a fairly randomly cache tag.
+ */
+
+static
+int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
+{
+	v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
+	if (!v9ses->cachetag)
+		return -ENOMEM;
+
+	return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
+}
+
+static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
+					   void *buffer, uint16_t bufmax)
+{
+	struct v9fs_session_info *v9ses;
+	uint16_t klen = 0;
+
+	v9ses = (struct v9fs_session_info *)cookie_netfs_data;
+	p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n",
+		 v9ses, buffer, bufmax);
+
+	if (v9ses->cachetag)
+		klen = strlen(v9ses->cachetag);
+
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, v9ses->cachetag, klen);
+	p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag);
+	return klen;
+}
+
+const struct fscache_cookie_def v9fs_cache_session_index_def = {
+	.name		= "9P.session",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= v9fs_cache_session_get_key,
+};
+
+void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
+{
+	/* If no cache session tag was specified, we generate a random one. */
+	if (!v9ses->cachetag)
+		v9fs_random_cachetag(v9ses);
+
+	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
+						&v9fs_cache_session_index_def,
+						v9ses, true);
+	p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
+		 v9ses, v9ses->fscache);
+}
+
+void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
+{
+	p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
+		 v9ses, v9ses->fscache);
+	fscache_relinquish_cookie(v9ses->fscache, 0);
+	v9ses->fscache = NULL;
+}
+
+
+static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
+{
+	const struct v9fs_inode *v9inode = cookie_netfs_data;
+	memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path));
+	p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n",
+		 &v9inode->vfs_inode, v9inode->qid.path);
+	return sizeof(v9inode->qid.path);
+}
+
+static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
+				      uint64_t *size)
+{
+	const struct v9fs_inode *v9inode = cookie_netfs_data;
+	*size = i_size_read(&v9inode->vfs_inode);
+
+	p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n",
+		 &v9inode->vfs_inode, *size);
+}
+
+static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen)
+{
+	const struct v9fs_inode *v9inode = cookie_netfs_data;
+	memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version));
+	p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n",
+		 &v9inode->vfs_inode, v9inode->qid.version);
+	return sizeof(v9inode->qid.version);
+}
+
+static enum
+fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
+					    const void *buffer,
+					    uint16_t buflen)
+{
+	const struct v9fs_inode *v9inode = cookie_netfs_data;
+
+	if (buflen != sizeof(v9inode->qid.version))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	if (memcmp(buffer, &v9inode->qid.version,
+		   sizeof(v9inode->qid.version)))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
+{
+	struct v9fs_inode *v9inode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+const struct fscache_cookie_def v9fs_cache_inode_index_def = {
+	.name		= "9p.inode",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= v9fs_cache_inode_get_key,
+	.get_attr	= v9fs_cache_inode_get_attr,
+	.get_aux	= v9fs_cache_inode_get_aux,
+	.check_aux	= v9fs_cache_inode_check_aux,
+	.now_uncached	= v9fs_cache_inode_now_uncached,
+};
+
+void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+	struct v9fs_inode *v9inode;
+	struct v9fs_session_info *v9ses;
+
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	v9inode = V9FS_I(inode);
+	if (v9inode->fscache)
+		return;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
+						  &v9fs_cache_inode_index_def,
+						  v9inode, true);
+
+	p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
+		 inode, v9inode->fscache);
+}
+
+void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	if (!v9inode->fscache)
+		return;
+	p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
+		 inode, v9inode->fscache);
+
+	fscache_relinquish_cookie(v9inode->fscache, 0);
+	v9inode->fscache = NULL;
+}
+
+void v9fs_cache_inode_flush_cookie(struct inode *inode)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	if (!v9inode->fscache)
+		return;
+	p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
+		 inode, v9inode->fscache);
+
+	fscache_relinquish_cookie(v9inode->fscache, 1);
+	v9inode->fscache = NULL;
+}
+
+void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	if (!v9inode->fscache)
+		return;
+
+	spin_lock(&v9inode->fscache_lock);
+
+	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+		v9fs_cache_inode_flush_cookie(inode);
+	else
+		v9fs_cache_inode_get_cookie(inode);
+
+	spin_unlock(&v9inode->fscache_lock);
+}
+
+void v9fs_cache_inode_reset_cookie(struct inode *inode)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	struct v9fs_session_info *v9ses;
+	struct fscache_cookie *old;
+
+	if (!v9inode->fscache)
+		return;
+
+	old = v9inode->fscache;
+
+	spin_lock(&v9inode->fscache_lock);
+	fscache_relinquish_cookie(v9inode->fscache, 1);
+
+	v9ses = v9fs_inode2v9ses(inode);
+	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
+						  &v9fs_cache_inode_index_def,
+						  v9inode, true);
+	p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
+		 inode, old, v9inode->fscache);
+
+	spin_unlock(&v9inode->fscache_lock);
+}
+
+int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	BUG_ON(!v9inode->fscache);
+
+	return fscache_maybe_release_page(v9inode->fscache, page, gfp);
+}
+
+void __v9fs_fscache_invalidate_page(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	BUG_ON(!v9inode->fscache);
+
+	if (PageFsCache(page)) {
+		fscache_wait_on_page_write(v9inode->fscache, page);
+		BUG_ON(!PageLocked(page));
+		fscache_uncache_page(v9inode->fscache, page);
+	}
+}
+
+static void v9fs_vfs_readpage_complete(struct page *page, void *data,
+				       int error)
+{
+	if (!error)
+		SetPageUptodate(page);
+
+	unlock_page(page);
+}
+
+/**
+ * __v9fs_readpage_from_fscache - read a page from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+	const struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
+	if (!v9inode->fscache)
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_page(v9inode->fscache,
+					 page,
+					 v9fs_vfs_readpage_complete,
+					 NULL,
+					 GFP_KERNEL);
+	switch (ret) {
+	case -ENOBUFS:
+	case -ENODATA:
+		p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
+		return 1;
+	case 0:
+		p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
+		return ret;
+	default:
+		p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
+		return ret;
+	}
+}
+
+/**
+ * __v9fs_readpages_from_fscache - read multiple pages from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpages_from_fscache(struct inode *inode,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned *nr_pages)
+{
+	int ret;
+	const struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
+	if (!v9inode->fscache)
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_pages(v9inode->fscache,
+					  mapping, pages, nr_pages,
+					  v9fs_vfs_readpage_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+	switch (ret) {
+	case -ENOBUFS:
+	case -ENODATA:
+		p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
+		return 1;
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(*nr_pages != 0);
+		p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
+		return ret;
+	default:
+		p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
+		return ret;
+	}
+}
+
+/**
+ * __v9fs_readpage_to_fscache - write a page to the cache
+ *
+ */
+
+void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+	const struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
+	ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
+	p9_debug(P9_DEBUG_FSC, "ret =  %d\n", ret);
+	if (ret != 0)
+		v9fs_uncache_page(inode, page);
+}
+
+/*
+ * wait for a page to complete writing to the cache
+ */
+void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
+{
+	const struct v9fs_inode *v9inode = V9FS_I(inode);
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
+	if (PageFsCache(page))
+		fscache_wait_on_page_write(v9inode->fscache, page);
+}
diff --git a/kernel/fs/9p/cache.h b/kernel/fs/9p/cache.h
new file mode 100644
index 000000000..2f9675491
--- /dev/null
+++ b/kernel/fs/9p/cache.h
@@ -0,0 +1,151 @@
+/*
+ * V9FS cache definitions.
+ *
+ *  Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#ifndef _9P_CACHE_H
+#ifdef CONFIG_9P_FSCACHE
+#include <linux/fscache.h>
+#include <linux/spinlock.h>
+
+extern struct fscache_netfs v9fs_cache_netfs;
+extern const struct fscache_cookie_def v9fs_cache_session_index_def;
+extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
+
+extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
+extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
+
+extern void v9fs_cache_inode_get_cookie(struct inode *inode);
+extern void v9fs_cache_inode_put_cookie(struct inode *inode);
+extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
+extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
+extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
+
+extern int __v9fs_cache_register(void);
+extern void __v9fs_cache_unregister(void);
+
+extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
+extern void __v9fs_fscache_invalidate_page(struct page *page);
+extern int __v9fs_readpage_from_fscache(struct inode *inode,
+					struct page *page);
+extern int __v9fs_readpages_from_fscache(struct inode *inode,
+					 struct address_space *mapping,
+					 struct list_head *pages,
+					 unsigned *nr_pages);
+extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
+extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
+					      struct page *page);
+
+static inline int v9fs_fscache_release_page(struct page *page,
+					    gfp_t gfp)
+{
+	return __v9fs_fscache_release_page(page, gfp);
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page)
+{
+	__v9fs_fscache_invalidate_page(page);
+}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	return __v9fs_readpage_from_fscache(inode, page);
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return __v9fs_readpages_from_fscache(inode, mapping, pages,
+					     nr_pages);
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{
+	if (PageFsCache(page))
+		__v9fs_readpage_to_fscache(inode, page);
+}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	fscache_uncache_page(v9inode->fscache, page);
+	BUG_ON(PageFsCache(page));
+}
+
+static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
+						   struct page *page)
+{
+	return __v9fs_fscache_wait_on_page_write(inode, page);
+}
+
+#else /* CONFIG_9P_FSCACHE */
+
+static inline void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+}
+
+static inline void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+}
+
+static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *file)
+{
+}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+					    gfp_t gfp) {
+	return 1;
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page) {}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	return -ENOBUFS;
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{}
+
+static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
+						   struct page *page)
+{
+	return;
+}
+
+#endif /* CONFIG_9P_FSCACHE */
+#endif /* _9P_CACHE_H */
diff --git a/kernel/fs/9p/fid.c b/kernel/fs/9p/fid.c
new file mode 100644
index 000000000..47db55aee
--- /dev/null
+++ b/kernel/fs/9p/fid.c
@@ -0,0 +1,306 @@
+/*
+ * V9FS FID Management
+ *
+ *  Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2005, 2006 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+
+/**
+ * v9fs_fid_add - add a fid to a dentry
+ * @dentry: dentry that the fid is being added to
+ * @fid: fid to add
+ *
+ */
+
+static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
+{
+	hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata);
+}
+
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+{
+	spin_lock(&dentry->d_lock);
+	__add_fid(dentry, fid);
+	spin_unlock(&dentry->d_lock);
+}
+
+/**
+ * v9fs_fid_find - retrieve a fid that belongs to the specified uid
+ * @dentry: dentry to look for fid in
+ * @uid: return fid that belongs to the specified user
+ * @any: if non-zero, return any fid associated with the dentry
+ *
+ */
+
+static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
+{
+	struct p9_fid *fid, *ret;
+
+	p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p) uid %d any %d\n",
+		 dentry, dentry, from_kuid(&init_user_ns, uid),
+		 any);
+	ret = NULL;
+	/* we'll recheck under lock if there's anything to look in */
+	if (dentry->d_fsdata) {
+		struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
+		spin_lock(&dentry->d_lock);
+		hlist_for_each_entry(fid, h, dlist) {
+			if (any || uid_eq(fid->uid, uid)) {
+				ret = fid;
+				break;
+			}
+		}
+		spin_unlock(&dentry->d_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * We need to hold v9ses->rename_sem as long as we hold references
+ * to returned path array. Array element contain pointers to
+ * dentry names.
+ */
+static int build_path_from_dentry(struct v9fs_session_info *v9ses,
+				  struct dentry *dentry, char ***names)
+{
+	int n = 0, i;
+	char **wnames;
+	struct dentry *ds;
+
+	for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
+		n++;
+
+	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
+	if (!wnames)
+		goto err_out;
+
+	for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent)
+		wnames[i] = (char  *)ds->d_name.name;
+
+	*names = wnames;
+	return n;
+err_out:
+	return -ENOMEM;
+}
+
+static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
+					       kuid_t uid, int any)
+{
+	struct dentry *ds;
+	char **wnames, *uname;
+	int i, n, l, clone, access;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid, *old_fid = NULL;
+
+	v9ses = v9fs_dentry2v9ses(dentry);
+	access = v9ses->flags & V9FS_ACCESS_MASK;
+	fid = v9fs_fid_find(dentry, uid, any);
+	if (fid)
+		return fid;
+	/*
+	 * we don't have a matching fid. To do a TWALK we need
+	 * parent fid. We need to prevent rename when we want to
+	 * look at the parent.
+	 */
+	down_read(&v9ses->rename_sem);
+	ds = dentry->d_parent;
+	fid = v9fs_fid_find(ds, uid, any);
+	if (fid) {
+		/* Found the parent fid do a lookup with that */
+		fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1);
+		goto fid_out;
+	}
+	up_read(&v9ses->rename_sem);
+
+	/* start from the root and try to do a lookup */
+	fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+	if (!fid) {
+		/* the user is not attached to the fs yet */
+		if (access == V9FS_ACCESS_SINGLE)
+			return ERR_PTR(-EPERM);
+
+		if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
+				uname = NULL;
+		else
+			uname = v9ses->uname;
+
+		fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
+				       v9ses->aname);
+		if (IS_ERR(fid))
+			return fid;
+
+		v9fs_fid_add(dentry->d_sb->s_root, fid);
+	}
+	/* If we are root ourself just return that */
+	if (dentry->d_sb->s_root == dentry)
+		return fid;
+	/*
+	 * Do a multipath walk with attached root.
+	 * When walking parent we need to make sure we
+	 * don't have a parallel rename happening
+	 */
+	down_read(&v9ses->rename_sem);
+	n  = build_path_from_dentry(v9ses, dentry, &wnames);
+	if (n < 0) {
+		fid = ERR_PTR(n);
+		goto err_out;
+	}
+	clone = 1;
+	i = 0;
+	while (i < n) {
+		l = min(n - i, P9_MAXWELEM);
+		/*
+		 * We need to hold rename lock when doing a multipath
+		 * walk to ensure none of the patch component change
+		 */
+		fid = p9_client_walk(fid, l, &wnames[i], clone);
+		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
+			kfree(wnames);
+			goto err_out;
+		}
+		old_fid = fid;
+		i += l;
+		clone = 0;
+	}
+	kfree(wnames);
+fid_out:
+	if (!IS_ERR(fid)) {
+		spin_lock(&dentry->d_lock);
+		if (d_unhashed(dentry)) {
+			spin_unlock(&dentry->d_lock);
+			p9_client_clunk(fid);
+			fid = ERR_PTR(-ENOENT);
+		} else {
+			__add_fid(dentry, fid);
+			spin_unlock(&dentry->d_lock);
+		}
+	}
+err_out:
+	up_read(&v9ses->rename_sem);
+	return fid;
+}
+
+/**
+ * v9fs_fid_lookup - lookup for a fid, try to walk if not found
+ * @dentry: dentry to look for fid in
+ *
+ * Look for a fid in the specified dentry for the current user.
+ * If no fid is found, try to create one walking from a fid from the parent
+ * dentry (if it has one), or the root dentry. If the user haven't accessed
+ * the fs yet, attach now and walk from the root.
+ */
+
+struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
+{
+	kuid_t uid;
+	int  any, access;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_dentry2v9ses(dentry);
+	access = v9ses->flags & V9FS_ACCESS_MASK;
+	switch (access) {
+	case V9FS_ACCESS_SINGLE:
+	case V9FS_ACCESS_USER:
+	case V9FS_ACCESS_CLIENT:
+		uid = current_fsuid();
+		any = 0;
+		break;
+
+	case V9FS_ACCESS_ANY:
+		uid = v9ses->uid;
+		any = 1;
+		break;
+
+	default:
+		uid = INVALID_UID;
+		any = 0;
+		break;
+	}
+	return v9fs_fid_lookup_with_uid(dentry, uid, any);
+}
+
+struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
+{
+	struct p9_fid *fid, *ret;
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return fid;
+
+	ret = p9_client_walk(fid, 0, NULL, 1);
+	return ret;
+}
+
+static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid)
+{
+	struct p9_fid *fid, *ret;
+
+	fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
+	if (IS_ERR(fid))
+		return fid;
+
+	ret = p9_client_walk(fid, 0, NULL, 1);
+	return ret;
+}
+
+struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
+{
+	int err;
+	struct p9_fid *fid;
+
+	fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID);
+	if (IS_ERR(fid))
+		goto error_out;
+	/*
+	 * writeback fid will only be used to write back the
+	 * dirty pages. We always request for the open fid in read-write
+	 * mode so that a partial page write which result in page
+	 * read can work.
+	 */
+	err = p9_client_open(fid, O_RDWR);
+	if (err < 0) {
+		p9_client_clunk(fid);
+		fid = ERR_PTR(err);
+		goto error_out;
+	}
+error_out:
+	return fid;
+}
diff --git a/kernel/fs/9p/fid.h b/kernel/fs/9p/fid.h
new file mode 100644
index 000000000..2b6787fcb
--- /dev/null
+++ b/kernel/fs/9p/fid.h
@@ -0,0 +1,30 @@
+/*
+ * V9FS FID Management
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#ifndef FS_9P_FID_H
+#define FS_9P_FID_H
+#include <linux/list.h>
+
+struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
+struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
+#endif
diff --git a/kernel/fs/9p/v9fs.c b/kernel/fs/9p/v9fs.c
new file mode 100644
index 000000000..620d93489
--- /dev/null
+++ b/kernel/fs/9p/v9fs.c
@@ -0,0 +1,685 @@
+/*
+ *  linux/fs/9p/v9fs.c
+ *
+ *  This file contains functions assisting in mapping VFS to 9P2000
+ *
+ *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/parser.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "cache.h"
+
+static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
+static LIST_HEAD(v9fs_sessionlist);
+struct kmem_cache *v9fs_inode_cache;
+
+/*
+ * Option Parsing (code inspired by NFS code)
+ *  NOTE: each transport will parse its own options
+ */
+
+enum {
+	/* Options that take integer arguments */
+	Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
+	/* String options */
+	Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
+	/* Options that take no arguments */
+	Opt_nodevmap,
+	/* Cache options */
+	Opt_cache_loose, Opt_fscache, Opt_mmap,
+	/* Access options */
+	Opt_access, Opt_posixacl,
+	/* Error token */
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_debug, "debug=%x"},
+	{Opt_dfltuid, "dfltuid=%u"},
+	{Opt_dfltgid, "dfltgid=%u"},
+	{Opt_afid, "afid=%u"},
+	{Opt_uname, "uname=%s"},
+	{Opt_remotename, "aname=%s"},
+	{Opt_nodevmap, "nodevmap"},
+	{Opt_cache, "cache=%s"},
+	{Opt_cache_loose, "loose"},
+	{Opt_fscache, "fscache"},
+	{Opt_mmap, "mmap"},
+	{Opt_cachetag, "cachetag=%s"},
+	{Opt_access, "access=%s"},
+	{Opt_posixacl, "posixacl"},
+	{Opt_err, NULL}
+};
+
+/* Interpret mount options for cache mode */
+static int get_cache_mode(char *s)
+{
+	int version = -EINVAL;
+
+	if (!strcmp(s, "loose")) {
+		version = CACHE_LOOSE;
+		p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
+	} else if (!strcmp(s, "fscache")) {
+		version = CACHE_FSCACHE;
+		p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
+	} else if (!strcmp(s, "mmap")) {
+		version = CACHE_MMAP;
+		p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n");
+	} else if (!strcmp(s, "none")) {
+		version = CACHE_NONE;
+		p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
+	} else
+		pr_info("Unknown Cache mode %s\n", s);
+	return version;
+}
+
+/**
+ * v9fs_parse_options - parse mount options into session structure
+ * @v9ses: existing v9fs session information
+ *
+ * Return 0 upon success, -ERRNO upon failure.
+ */
+
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
+{
+	char *options, *tmp_options;
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+	int option = 0;
+	char *s, *e;
+	int ret = 0;
+
+	/* setup defaults */
+	v9ses->afid = ~0;
+	v9ses->debug = 0;
+	v9ses->cache = CACHE_NONE;
+#ifdef CONFIG_9P_FSCACHE
+	v9ses->cachetag = NULL;
+#endif
+
+	if (!opts)
+		return 0;
+
+	tmp_options = kstrdup(opts, GFP_KERNEL);
+	if (!tmp_options) {
+		ret = -ENOMEM;
+		goto fail_option_alloc;
+	}
+	options = tmp_options;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token, r;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_debug:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
+			v9ses->debug = option;
+#ifdef CONFIG_NET_9P_DEBUG
+			p9_debug_level = option;
+#endif
+			break;
+
+		case Opt_dfltuid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
+			v9ses->dfltuid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(v9ses->dfltuid)) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "uid field, but not a uid?\n");
+				ret = -EINVAL;
+				continue;
+			}
+			break;
+		case Opt_dfltgid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
+			v9ses->dfltgid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(v9ses->dfltgid)) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "gid field, but not a gid?\n");
+				ret = -EINVAL;
+				continue;
+			}
+			break;
+		case Opt_afid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
+			v9ses->afid = option;
+			break;
+		case Opt_uname:
+			kfree(v9ses->uname);
+			v9ses->uname = match_strdup(&args[0]);
+			if (!v9ses->uname) {
+				ret = -ENOMEM;
+				goto free_and_return;
+			}
+			break;
+		case Opt_remotename:
+			kfree(v9ses->aname);
+			v9ses->aname = match_strdup(&args[0]);
+			if (!v9ses->aname) {
+				ret = -ENOMEM;
+				goto free_and_return;
+			}
+			break;
+		case Opt_nodevmap:
+			v9ses->nodev = 1;
+			break;
+		case Opt_cache_loose:
+			v9ses->cache = CACHE_LOOSE;
+			break;
+		case Opt_fscache:
+			v9ses->cache = CACHE_FSCACHE;
+			break;
+		case Opt_mmap:
+			v9ses->cache = CACHE_MMAP;
+			break;
+		case Opt_cachetag:
+#ifdef CONFIG_9P_FSCACHE
+			v9ses->cachetag = match_strdup(&args[0]);
+#endif
+			break;
+		case Opt_cache:
+			s = match_strdup(&args[0]);
+			if (!s) {
+				ret = -ENOMEM;
+				p9_debug(P9_DEBUG_ERROR,
+					 "problem allocating copy of cache arg\n");
+				goto free_and_return;
+			}
+			ret = get_cache_mode(s);
+			if (ret == -EINVAL) {
+				kfree(s);
+				goto free_and_return;
+			}
+
+			v9ses->cache = ret;
+			kfree(s);
+			break;
+
+		case Opt_access:
+			s = match_strdup(&args[0]);
+			if (!s) {
+				ret = -ENOMEM;
+				p9_debug(P9_DEBUG_ERROR,
+					 "problem allocating copy of access arg\n");
+				goto free_and_return;
+			}
+
+			v9ses->flags &= ~V9FS_ACCESS_MASK;
+			if (strcmp(s, "user") == 0)
+				v9ses->flags |= V9FS_ACCESS_USER;
+			else if (strcmp(s, "any") == 0)
+				v9ses->flags |= V9FS_ACCESS_ANY;
+			else if (strcmp(s, "client") == 0) {
+				v9ses->flags |= V9FS_ACCESS_CLIENT;
+			} else {
+				uid_t uid;
+				v9ses->flags |= V9FS_ACCESS_SINGLE;
+				uid = simple_strtoul(s, &e, 10);
+				if (*e != '\0') {
+					ret = -EINVAL;
+					pr_info("Unknown access argument %s\n",
+						s);
+					kfree(s);
+					goto free_and_return;
+				}
+				v9ses->uid = make_kuid(current_user_ns(), uid);
+				if (!uid_valid(v9ses->uid)) {
+					ret = -EINVAL;
+					pr_info("Uknown uid %s\n", s);
+					kfree(s);
+					goto free_and_return;
+				}
+			}
+
+			kfree(s);
+			break;
+
+		case Opt_posixacl:
+#ifdef CONFIG_9P_FS_POSIX_ACL
+			v9ses->flags |= V9FS_POSIX_ACL;
+#else
+			p9_debug(P9_DEBUG_ERROR,
+				 "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
+#endif
+			break;
+
+		default:
+			continue;
+		}
+	}
+
+free_and_return:
+	kfree(tmp_options);
+fail_option_alloc:
+	return ret;
+}
+
+/**
+ * v9fs_session_init - initialize session
+ * @v9ses: session information structure
+ * @dev_name: device being mounted
+ * @data: options
+ *
+ */
+
+struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
+		  const char *dev_name, char *data)
+{
+	int retval = -EINVAL;
+	struct p9_fid *fid;
+	int rc;
+
+	v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
+	if (!v9ses->uname)
+		return ERR_PTR(-ENOMEM);
+
+	v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
+	if (!v9ses->aname) {
+		kfree(v9ses->uname);
+		return ERR_PTR(-ENOMEM);
+	}
+	init_rwsem(&v9ses->rename_sem);
+
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p");
+	if (rc) {
+		kfree(v9ses->aname);
+		kfree(v9ses->uname);
+		return ERR_PTR(rc);
+	}
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_add(&v9ses->slist, &v9fs_sessionlist);
+	spin_unlock(&v9fs_sessionlist_lock);
+
+	v9ses->uid = INVALID_UID;
+	v9ses->dfltuid = V9FS_DEFUID;
+	v9ses->dfltgid = V9FS_DEFGID;
+
+	v9ses->clnt = p9_client_create(dev_name, data);
+	if (IS_ERR(v9ses->clnt)) {
+		retval = PTR_ERR(v9ses->clnt);
+		v9ses->clnt = NULL;
+		p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
+		goto error;
+	}
+
+	v9ses->flags = V9FS_ACCESS_USER;
+
+	if (p9_is_proto_dotl(v9ses->clnt)) {
+		v9ses->flags = V9FS_ACCESS_CLIENT;
+		v9ses->flags |= V9FS_PROTO_2000L;
+	} else if (p9_is_proto_dotu(v9ses->clnt)) {
+		v9ses->flags |= V9FS_PROTO_2000U;
+	}
+
+	rc = v9fs_parse_options(v9ses, data);
+	if (rc < 0) {
+		retval = rc;
+		goto error;
+	}
+
+	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
+
+	if (!v9fs_proto_dotl(v9ses) &&
+	    ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+		/*
+		 * We support ACCESS_CLIENT only for dotl.
+		 * Fall back to ACCESS_USER
+		 */
+		v9ses->flags &= ~V9FS_ACCESS_MASK;
+		v9ses->flags |= V9FS_ACCESS_USER;
+	}
+	/*FIXME !! */
+	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
+	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
+		((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
+
+		v9ses->flags &= ~V9FS_ACCESS_MASK;
+		v9ses->flags |= V9FS_ACCESS_ANY;
+		v9ses->uid = INVALID_UID;
+	}
+	if (!v9fs_proto_dotl(v9ses) ||
+		!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+		/*
+		 * We support ACL checks on clinet only if the protocol is
+		 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
+		 */
+		v9ses->flags &= ~V9FS_ACL_MASK;
+	}
+
+	fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,
+							v9ses->aname);
+	if (IS_ERR(fid)) {
+		retval = PTR_ERR(fid);
+		fid = NULL;
+		p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
+		goto error;
+	}
+
+	if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)
+		fid->uid = v9ses->uid;
+	else
+		fid->uid = INVALID_UID;
+
+#ifdef CONFIG_9P_FSCACHE
+	/* register the session for caching */
+	v9fs_cache_session_get_cookie(v9ses);
+#endif
+
+	return fid;
+
+error:
+	bdi_destroy(&v9ses->bdi);
+	return ERR_PTR(retval);
+}
+
+/**
+ * v9fs_session_close - shutdown a session
+ * @v9ses: session information structure
+ *
+ */
+
+void v9fs_session_close(struct v9fs_session_info *v9ses)
+{
+	if (v9ses->clnt) {
+		p9_client_destroy(v9ses->clnt);
+		v9ses->clnt = NULL;
+	}
+
+#ifdef CONFIG_9P_FSCACHE
+	if (v9ses->fscache) {
+		v9fs_cache_session_put_cookie(v9ses);
+		kfree(v9ses->cachetag);
+	}
+#endif
+	kfree(v9ses->uname);
+	kfree(v9ses->aname);
+
+	bdi_destroy(&v9ses->bdi);
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_del(&v9ses->slist);
+	spin_unlock(&v9fs_sessionlist_lock);
+}
+
+/**
+ * v9fs_session_cancel - terminate a session
+ * @v9ses: session to terminate
+ *
+ * mark transport as disconnected and cancel all pending requests.
+ */
+
+void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
+	p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
+	p9_client_disconnect(v9ses->clnt);
+}
+
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
+extern int v9fs_error_init(void);
+
+static struct kobject *v9fs_kobj;
+
+#ifdef CONFIG_9P_FSCACHE
+/**
+ * caches_show - list caches associated with a session
+ *
+ * Returns the size of buffer written.
+ */
+
+static ssize_t caches_show(struct kobject *kobj,
+			   struct kobj_attribute *attr,
+			   char *buf)
+{
+	ssize_t n = 0, count = 0, limit = PAGE_SIZE;
+	struct v9fs_session_info *v9ses;
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
+		if (v9ses->cachetag) {
+			n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
+			if (n < 0) {
+				count = n;
+				break;
+			}
+
+			count += n;
+			limit -= n;
+		}
+	}
+
+	spin_unlock(&v9fs_sessionlist_lock);
+	return count;
+}
+
+static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches);
+#endif /* CONFIG_9P_FSCACHE */
+
+static struct attribute *v9fs_attrs[] = {
+#ifdef CONFIG_9P_FSCACHE
+	&v9fs_attr_cache.attr,
+#endif
+	NULL,
+};
+
+static struct attribute_group v9fs_attr_group = {
+	.attrs = v9fs_attrs,
+};
+
+/**
+ * v9fs_sysfs_init - Initialize the v9fs sysfs interface
+ *
+ */
+
+static int __init v9fs_sysfs_init(void)
+{
+	v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
+	if (!v9fs_kobj)
+		return -ENOMEM;
+
+	if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
+		kobject_put(v9fs_kobj);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface
+ *
+ */
+
+static void v9fs_sysfs_cleanup(void)
+{
+	sysfs_remove_group(v9fs_kobj, &v9fs_attr_group);
+	kobject_put(v9fs_kobj);
+}
+
+static void v9fs_inode_init_once(void *foo)
+{
+	struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
+#ifdef CONFIG_9P_FSCACHE
+	v9inode->fscache = NULL;
+#endif
+	memset(&v9inode->qid, 0, sizeof(v9inode->qid));
+	inode_init_once(&v9inode->vfs_inode);
+}
+
+/**
+ * v9fs_init_inode_cache - initialize a cache for 9P
+ * Returns 0 on success.
+ */
+static int v9fs_init_inode_cache(void)
+{
+	v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
+					  sizeof(struct v9fs_inode),
+					  0, (SLAB_RECLAIM_ACCOUNT|
+					      SLAB_MEM_SPREAD),
+					  v9fs_inode_init_once);
+	if (!v9fs_inode_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * v9fs_destroy_inode_cache - destroy the cache of 9P inode
+ *
+ */
+static void v9fs_destroy_inode_cache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(v9fs_inode_cache);
+}
+
+static int v9fs_cache_register(void)
+{
+	int ret;
+	ret = v9fs_init_inode_cache();
+	if (ret < 0)
+		return ret;
+#ifdef CONFIG_9P_FSCACHE
+	ret = fscache_register_netfs(&v9fs_cache_netfs);
+	if (ret < 0)
+		v9fs_destroy_inode_cache();
+#endif
+	return ret;
+}
+
+static void v9fs_cache_unregister(void)
+{
+	v9fs_destroy_inode_cache();
+#ifdef CONFIG_9P_FSCACHE
+	fscache_unregister_netfs(&v9fs_cache_netfs);
+#endif
+}
+
+/**
+ * init_v9fs - Initialize module
+ *
+ */
+
+static int __init init_v9fs(void)
+{
+	int err;
+	pr_info("Installing v9fs 9p2000 file system support\n");
+	/* TODO: Setup list of registered trasnport modules */
+
+	err = v9fs_cache_register();
+	if (err < 0) {
+		pr_err("Failed to register v9fs for caching\n");
+		return err;
+	}
+
+	err = v9fs_sysfs_init();
+	if (err < 0) {
+		pr_err("Failed to register with sysfs\n");
+		goto out_cache;
+	}
+	err = register_filesystem(&v9fs_fs_type);
+	if (err < 0) {
+		pr_err("Failed to register filesystem\n");
+		goto out_sysfs_cleanup;
+	}
+
+	return 0;
+
+out_sysfs_cleanup:
+	v9fs_sysfs_cleanup();
+
+out_cache:
+	v9fs_cache_unregister();
+
+	return err;
+}
+
+/**
+ * exit_v9fs - shutdown module
+ *
+ */
+
+static void __exit exit_v9fs(void)
+{
+	v9fs_sysfs_cleanup();
+	v9fs_cache_unregister();
+	unregister_filesystem(&v9fs_fs_type);
+}
+
+module_init(init_v9fs)
+module_exit(exit_v9fs)
+
+MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/9p/v9fs.h b/kernel/fs/9p/v9fs.h
new file mode 100644
index 000000000..fb9ffcb43
--- /dev/null
+++ b/kernel/fs/9p/v9fs.h
@@ -0,0 +1,227 @@
+/*
+ * V9FS definitions.
+ *
+ *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#ifndef FS_9P_V9FS_H
+#define FS_9P_V9FS_H
+
+#include <linux/backing-dev.h>
+
+/**
+ * enum p9_session_flags - option flags for each 9P session
+ * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
+ * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
+ * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
+ * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
+ * @V9FS_ACCESS_ANY: use a single attach for all users
+ * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
+ * @V9FS_POSIX_ACL: POSIX ACLs are enforced
+ *
+ * Session flags reflect options selected by users at mount time
+ */
+#define	V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
+			 V9FS_ACCESS_USER |   \
+			 V9FS_ACCESS_CLIENT)
+#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
+#define V9FS_ACL_MASK V9FS_POSIX_ACL
+
+enum p9_session_flags {
+	V9FS_PROTO_2000U	= 0x01,
+	V9FS_PROTO_2000L	= 0x02,
+	V9FS_ACCESS_SINGLE	= 0x04,
+	V9FS_ACCESS_USER	= 0x08,
+	V9FS_ACCESS_CLIENT	= 0x10,
+	V9FS_POSIX_ACL		= 0x20
+};
+
+/* possible values of ->cache */
+/**
+ * enum p9_cache_modes - user specified cache preferences
+ * @CACHE_NONE: do not cache data, dentries, or directory contents (default)
+ * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency
+ *
+ * eventually support loose, tight, time, session, default always none
+ */
+
+enum p9_cache_modes {
+	CACHE_NONE,
+	CACHE_MMAP,
+	CACHE_LOOSE,
+	CACHE_FSCACHE,
+};
+
+/**
+ * struct v9fs_session_info - per-instance session information
+ * @flags: session options of type &p9_session_flags
+ * @nodev: set to 1 to disable device mapping
+ * @debug: debug level
+ * @afid: authentication handle
+ * @cache: cache mode of type &p9_cache_modes
+ * @cachetag: the tag of the cache associated with this session
+ * @fscache: session cookie associated with FS-Cache
+ * @uname: string user name to mount hierarchy as
+ * @aname: mount specifier for remote hierarchy
+ * @maxdata: maximum data to be sent/recvd per protocol message
+ * @dfltuid: default numeric userid to mount hierarchy as
+ * @dfltgid: default numeric groupid to mount hierarchy as
+ * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
+ * @clnt: reference to 9P network client instantiated for this session
+ * @slist: reference to list of registered 9p sessions
+ *
+ * This structure holds state for each session instance established during
+ * a sys_mount() .
+ *
+ * Bugs: there seems to be a lot of state which could be condensed and/or
+ * removed.
+ */
+
+struct v9fs_session_info {
+	/* options */
+	unsigned char flags;
+	unsigned char nodev;
+	unsigned short debug;
+	unsigned int afid;
+	unsigned int cache;
+#ifdef CONFIG_9P_FSCACHE
+	char *cachetag;
+	struct fscache_cookie *fscache;
+#endif
+
+	char *uname;		/* user name to mount as */
+	char *aname;		/* name of remote hierarchy being mounted */
+	unsigned int maxdata;	/* max data for client interface */
+	kuid_t dfltuid;		/* default uid/muid for legacy support */
+	kgid_t dfltgid;		/* default gid for legacy support */
+	kuid_t uid;		/* if ACCESS_SINGLE, the uid that has access */
+	struct p9_client *clnt;	/* 9p client */
+	struct list_head slist; /* list of sessions registered with v9fs */
+	struct backing_dev_info bdi;
+	struct rw_semaphore rename_sem;
+};
+
+/* cache_validity flags */
+#define V9FS_INO_INVALID_ATTR 0x01
+
+struct v9fs_inode {
+#ifdef CONFIG_9P_FSCACHE
+	spinlock_t fscache_lock;
+	struct fscache_cookie *fscache;
+#endif
+	struct p9_qid qid;
+	unsigned int cache_validity;
+	struct p9_fid *writeback_fid;
+	struct mutex v_mutex;
+	struct inode vfs_inode;
+};
+
+static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
+{
+	return container_of(inode, struct v9fs_inode, vfs_inode);
+}
+
+struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
+									char *);
+extern void v9fs_session_close(struct v9fs_session_info *v9ses);
+extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+			unsigned int flags);
+extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry);
+extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			void *p);
+extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
+					 struct p9_fid *fid,
+					 struct super_block *sb, int new);
+extern const struct inode_operations v9fs_dir_inode_operations_dotl;
+extern const struct inode_operations v9fs_file_inode_operations_dotl;
+extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
+					      struct p9_fid *fid,
+					      struct super_block *sb, int new);
+
+/* other default globals */
+#define V9FS_PORT	564
+#define V9FS_DEFUSER	"nobody"
+#define V9FS_DEFANAME	""
+#define V9FS_DEFUID	KUIDT_INIT(-2)
+#define V9FS_DEFGID	KGIDT_INIT(-2)
+
+static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
+{
+	return (inode->i_sb->s_fs_info);
+}
+
+static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
+{
+	return dentry->d_sb->s_fs_info;
+}
+
+static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
+{
+	return v9ses->flags & V9FS_PROTO_2000U;
+}
+
+static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
+{
+	return v9ses->flags & V9FS_PROTO_2000L;
+}
+
+/**
+ * v9fs_get_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+			struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
+	else
+		return v9fs_inode_from_fid(v9ses, fid, sb, 0);
+}
+
+/**
+ * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+			    struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
+	else
+		return v9fs_inode_from_fid(v9ses, fid, sb, 1);
+}
+
+#endif
diff --git a/kernel/fs/9p/v9fs_vfs.h b/kernel/fs/9p/v9fs_vfs.h
new file mode 100644
index 000000000..5a0db6dec
--- /dev/null
+++ b/kernel/fs/9p/v9fs_vfs.h
@@ -0,0 +1,86 @@
+/*
+ * V9FS VFS extensions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#ifndef FS_9P_V9FS_VFS_H
+#define FS_9P_V9FS_VFS_H
+
+/* plan9 semantics are that created files are implicitly opened.
+ * But linux semantics are that you call create, then open.
+ * the plan9 approach is superior as it provides an atomic
+ * open.
+ * we track the create fid here. When the file is opened, if fidopen is
+ * non-zero, we use the fid and can skip some steps.
+ * there may be a better way to do this, but I don't know it.
+ * one BAD way is to clunk the fid on create, then open it again:
+ * you lose the atomicity of file open
+ */
+
+/* special case:
+ * unlink calls remove, which is an implicit clunk. So we have to track
+ * that kind of thing so that we don't try to clunk a dead fid.
+ */
+#define P9_LOCK_TIMEOUT (30*HZ)
+
+extern struct file_system_type v9fs_fs_type;
+extern const struct address_space_operations v9fs_addr_operations;
+extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
+extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
+extern const struct dentry_operations v9fs_dentry_operations;
+extern const struct dentry_operations v9fs_cached_dentry_operations;
+extern const struct file_operations v9fs_cached_file_operations;
+extern const struct file_operations v9fs_cached_file_operations_dotl;
+extern const struct file_operations v9fs_mmap_file_operations;
+extern const struct file_operations v9fs_mmap_file_operations_dotl;
+extern struct kmem_cache *v9fs_inode_cache;
+
+struct inode *v9fs_alloc_inode(struct super_block *sb);
+void v9fs_destroy_inode(struct inode *inode);
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t);
+int v9fs_init_inode(struct v9fs_session_info *v9ses,
+		    struct inode *inode, umode_t mode, dev_t);
+void v9fs_evict_inode(struct inode *inode);
+ino_t v9fs_qid2ino(struct p9_qid *qid);
+void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
+int v9fs_dir_release(struct inode *inode, struct file *filp);
+int v9fs_file_open(struct inode *inode, struct file *file);
+void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
+int v9fs_uflags2omode(int uflags, int extended);
+
+void v9fs_blank_wstat(struct p9_wstat *wstat);
+int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
+			 int datasync);
+int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
+int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
+static inline void v9fs_invalidate_inode_attr(struct inode *inode)
+{
+	struct v9fs_inode *v9inode;
+	v9inode = V9FS_I(inode);
+	v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
+	return;
+}
+
+int v9fs_open_to_dotl_flags(int flags);
+#endif
diff --git a/kernel/fs/9p/vfs_addr.c b/kernel/fs/9p/vfs_addr.c
new file mode 100644
index 000000000..e9e04376c
--- /dev/null
+++ b/kernel/fs/9p/vfs_addr.c
@@ -0,0 +1,351 @@
+/*
+ *  linux/fs/9p/vfs_addr.c
+ *
+ * This file contians vfs address (mmap) ops for 9P2000.
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/inet.h>
+#include <linux/pagemap.h>
+#include <linux/idr.h>
+#include <linux/sched.h>
+#include <linux/uio.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "cache.h"
+#include "fid.h"
+
+/**
+ * v9fs_fid_readpage - read an entire page in from 9P
+ *
+ * @fid: fid being read
+ * @page: structure to page
+ *
+ */
+static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE};
+	struct iov_iter to;
+	int retval, err;
+
+	p9_debug(P9_DEBUG_VFS, "\n");
+
+	BUG_ON(!PageLocked(page));
+
+	retval = v9fs_readpage_from_fscache(inode, page);
+	if (retval == 0)
+		return retval;
+
+	iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE);
+
+	retval = p9_client_read(fid, page_offset(page), &to, &err);
+	if (err) {
+		v9fs_uncache_page(inode, page);
+		retval = err;
+		goto done;
+	}
+
+	zero_user(page, retval, PAGE_SIZE - retval);
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+
+	v9fs_readpage_to_fscache(inode, page);
+	retval = 0;
+
+done:
+	unlock_page(page);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_readpage - read an entire page in from 9P
+ *
+ * @filp: file being read
+ * @page: structure to page
+ *
+ */
+
+static int v9fs_vfs_readpage(struct file *filp, struct page *page)
+{
+	return v9fs_fid_readpage(filp->private_data, page);
+}
+
+/**
+ * v9fs_vfs_readpages - read a set of pages from 9P
+ *
+ * @filp: file being read
+ * @mapping: the address space
+ * @pages: list of pages to read
+ * @nr_pages: count of pages to read
+ *
+ */
+
+static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
+			     struct list_head *pages, unsigned nr_pages)
+{
+	int ret = 0;
+	struct inode *inode;
+
+	inode = mapping->host;
+	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+
+	ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
+	if (ret == 0)
+		return ret;
+
+	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+	p9_debug(P9_DEBUG_VFS, "  = %d\n", ret);
+	return ret;
+}
+
+/**
+ * v9fs_release_page - release the private state associated with a page
+ *
+ * Returns 1 if the page can be released, false otherwise.
+ */
+
+static int v9fs_release_page(struct page *page, gfp_t gfp)
+{
+	if (PagePrivate(page))
+		return 0;
+	return v9fs_fscache_release_page(page, gfp);
+}
+
+/**
+ * v9fs_invalidate_page - Invalidate a page completely or partially
+ *
+ * @page: structure to page
+ * @offset: offset in the page
+ */
+
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
+{
+	/*
+	 * If called with zero offset, we should release
+	 * the private state assocated with the page
+	 */
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
+		v9fs_fscache_invalidate_page(page);
+}
+
+static int v9fs_vfs_writepage_locked(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	loff_t size = i_size_read(inode);
+	struct iov_iter from;
+	struct bio_vec bvec;
+	int err, len;
+
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	bvec.bv_page = page;
+	bvec.bv_offset = 0;
+	bvec.bv_len = len;
+	iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len);
+
+	/* We should have writeback_fid always set */
+	BUG_ON(!v9inode->writeback_fid);
+
+	set_page_writeback(page);
+
+	p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err);
+
+	end_page_writeback(page);
+	return err;
+}
+
+static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int retval;
+
+	p9_debug(P9_DEBUG_VFS, "page %p\n", page);
+
+	retval = v9fs_vfs_writepage_locked(page);
+	if (retval < 0) {
+		if (retval == -EAGAIN) {
+			redirty_page_for_writepage(wbc, page);
+			retval = 0;
+		} else {
+			SetPageError(page);
+			mapping_set_error(page->mapping, retval);
+		}
+	} else
+		retval = 0;
+
+	unlock_page(page);
+	return retval;
+}
+
+/**
+ * v9fs_launder_page - Writeback a dirty page
+ * Returns 0 on success.
+ */
+
+static int v9fs_launder_page(struct page *page)
+{
+	int retval;
+	struct inode *inode = page->mapping->host;
+
+	v9fs_fscache_wait_on_page_write(inode, page);
+	if (clear_page_dirty_for_io(page)) {
+		retval = v9fs_vfs_writepage_locked(page);
+		if (retval)
+			return retval;
+	}
+	return 0;
+}
+
+/**
+ * v9fs_direct_IO - 9P address space operation for direct I/O
+ * @iocb: target I/O control block
+ * @pos: offset in file to begin the operation
+ *
+ * The presence of v9fs_direct_IO() in the address space ops vector
+ * allowes open() O_DIRECT flags which would have failed otherwise.
+ *
+ * In the non-cached mode, we shunt off direct read and write requests before
+ * the VFS gets them, so this method should never be called.
+ *
+ * Direct IO is not 'yet' supported in the cached mode. Hence when
+ * this routine is called through generic_file_aio_read(), the read/write fails
+ * with an error.
+ *
+ */
+static ssize_t
+v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t n;
+	int err = 0;
+	if (iov_iter_rw(iter) == WRITE) {
+		n = p9_client_write(file->private_data, pos, iter, &err);
+		if (n) {
+			struct inode *inode = file_inode(file);
+			loff_t i_size = i_size_read(inode);
+			if (pos + n > i_size)
+				inode_add_bytes(inode, pos + n - i_size);
+		}
+	} else {
+		n = p9_client_read(file->private_data, pos, iter, &err);
+	}
+	return n ? n : err;
+}
+
+static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	int retval = 0;
+	struct page *page;
+	struct v9fs_inode *v9inode;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct inode *inode = mapping->host;
+
+
+	p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
+	v9inode = V9FS_I(inode);
+start:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page) {
+		retval = -ENOMEM;
+		goto out;
+	}
+	BUG_ON(!v9inode->writeback_fid);
+	if (PageUptodate(page))
+		goto out;
+
+	if (len == PAGE_CACHE_SIZE)
+		goto out;
+
+	retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
+	page_cache_release(page);
+	if (!retval)
+		goto start;
+out:
+	*pagep = page;
+	return retval;
+}
+
+static int v9fs_write_end(struct file *filp, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{
+	loff_t last_pos = pos + copied;
+	struct inode *inode = page->mapping->host;
+
+	p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
+	if (unlikely(copied < len)) {
+		/*
+		 * zero out the rest of the area
+		 */
+		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+		zero_user(page, from + copied, len - copied);
+		flush_dcache_page(page);
+	}
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold the i_mutex.
+	 */
+	if (last_pos > inode->i_size) {
+		inode_add_bytes(inode, last_pos - inode->i_size);
+		i_size_write(inode, last_pos);
+	}
+	set_page_dirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
+
+const struct address_space_operations v9fs_addr_operations = {
+	.readpage = v9fs_vfs_readpage,
+	.readpages = v9fs_vfs_readpages,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	.writepage = v9fs_vfs_writepage,
+	.write_begin = v9fs_write_begin,
+	.write_end = v9fs_write_end,
+	.releasepage = v9fs_release_page,
+	.invalidatepage = v9fs_invalidate_page,
+	.launder_page = v9fs_launder_page,
+	.direct_IO = v9fs_direct_IO,
+};
diff --git a/kernel/fs/9p/vfs_dentry.c b/kernel/fs/9p/vfs_dentry.c
new file mode 100644
index 000000000..bd456c668
--- /dev/null
+++ b/kernel/fs/9p/vfs_dentry.c
@@ -0,0 +1,122 @@
+/*
+ *  linux/fs/9p/vfs_dentry.c
+ *
+ * This file contians vfs dentry ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+
+/**
+ * v9fs_cached_dentry_delete - called when dentry refcount equals 0
+ * @dentry:  dentry in question
+ *
+ */
+static int v9fs_cached_dentry_delete(const struct dentry *dentry)
+{
+	p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
+		 dentry, dentry);
+
+	/* Don't cache negative dentries */
+	if (d_really_is_negative(dentry))
+		return 1;
+	return 0;
+}
+
+/**
+ * v9fs_dentry_release - called when dentry is going to be freed
+ * @dentry:  dentry that is being release
+ *
+ */
+
+static void v9fs_dentry_release(struct dentry *dentry)
+{
+	struct hlist_node *p, *n;
+	p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
+		 dentry, dentry);
+	hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
+		p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
+	dentry->d_fsdata = NULL;
+}
+
+static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct p9_fid *fid;
+	struct inode *inode;
+	struct v9fs_inode *v9inode;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = d_inode(dentry);
+	if (!inode)
+		goto out_valid;
+
+	v9inode = V9FS_I(inode);
+	if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+		int retval;
+		struct v9fs_session_info *v9ses;
+		fid = v9fs_fid_lookup(dentry);
+		if (IS_ERR(fid))
+			return PTR_ERR(fid);
+
+		v9ses = v9fs_inode2v9ses(inode);
+		if (v9fs_proto_dotl(v9ses))
+			retval = v9fs_refresh_inode_dotl(fid, inode);
+		else
+			retval = v9fs_refresh_inode(fid, inode);
+		if (retval == -ENOENT)
+			return 0;
+		if (retval < 0)
+			return retval;
+	}
+out_valid:
+	return 1;
+}
+
+const struct dentry_operations v9fs_cached_dentry_operations = {
+	.d_revalidate = v9fs_lookup_revalidate,
+	.d_weak_revalidate = v9fs_lookup_revalidate,
+	.d_delete = v9fs_cached_dentry_delete,
+	.d_release = v9fs_dentry_release,
+};
+
+const struct dentry_operations v9fs_dentry_operations = {
+	.d_delete = always_delete_dentry,
+	.d_release = v9fs_dentry_release,
+};
diff --git a/kernel/fs/9p/vfs_dir.c b/kernel/fs/9p/vfs_dir.c
new file mode 100644
index 000000000..5cc00e562
--- /dev/null
+++ b/kernel/fs/9p/vfs_dir.c
@@ -0,0 +1,261 @@
+/*
+ * linux/fs/9p/vfs_dir.c
+ *
+ * This file contains vfs directory ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+
+/**
+ * struct p9_rdir - readdir accounting
+ * @head: start offset of current dirread buffer
+ * @tail: end offset of current dirread buffer
+ * @buf: dirread buffer
+ *
+ * private structure for keeping track of readdir
+ * allocated on demand
+ */
+
+struct p9_rdir {
+	int head;
+	int tail;
+	uint8_t buf[];
+};
+
+/**
+ * dt_type - return file type
+ * @mistat: mistat structure
+ *
+ */
+
+static inline int dt_type(struct p9_wstat *mistat)
+{
+	unsigned long perm = mistat->mode;
+	int rettype = DT_REG;
+
+	if (perm & P9_DMDIR)
+		rettype = DT_DIR;
+	if (perm & P9_DMSYMLINK)
+		rettype = DT_LNK;
+
+	return rettype;
+}
+
+static void p9stat_init(struct p9_wstat *stbuf)
+{
+	stbuf->name  = NULL;
+	stbuf->uid   = NULL;
+	stbuf->gid   = NULL;
+	stbuf->muid  = NULL;
+	stbuf->extension = NULL;
+}
+
+/**
+ * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
+ * @filp: opened file structure
+ * @buflen: Length in bytes of buffer to allocate
+ *
+ */
+
+static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
+{
+	struct p9_fid *fid = filp->private_data;
+	if (!fid->rdir)
+		fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+	return fid->rdir;
+}
+
+/**
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
+ *
+ */
+
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+	bool over;
+	struct p9_wstat st;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	int reclen = 0;
+	struct p9_rdir *rdir;
+	struct kvec kvec;
+
+	p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
+	fid = file->private_data;
+
+	buflen = fid->clnt->msize - P9_IOHDRSZ;
+
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
+	if (!rdir)
+		return -ENOMEM;
+	kvec.iov_base = rdir->buf;
+	kvec.iov_len = buflen;
+
+	while (1) {
+		if (rdir->tail == rdir->head) {
+			struct iov_iter to;
+			int n;
+			iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen);
+			n = p9_client_read(file->private_data, ctx->pos, &to,
+					   &err);
+			if (err)
+				return err;
+			if (n == 0)
+				return 0;
+
+			rdir->head = 0;
+			rdir->tail = n;
+		}
+		while (rdir->head < rdir->tail) {
+			p9stat_init(&st);
+			err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
+					  rdir->tail - rdir->head, &st);
+			if (err) {
+				p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
+				p9stat_free(&st);
+				return -EIO;
+			}
+			reclen = st.size+2;
+
+			over = !dir_emit(ctx, st.name, strlen(st.name),
+					 v9fs_qid2ino(&st.qid), dt_type(&st));
+			p9stat_free(&st);
+			if (over)
+				return 0;
+
+			rdir->head += reclen;
+			ctx->pos += reclen;
+		}
+	}
+}
+
+/**
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
+ *
+ */
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
+{
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	struct p9_rdir *rdir;
+	struct p9_dirent curdirent;
+
+	p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
+	fid = file->private_data;
+
+	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
+
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
+	if (!rdir)
+		return -ENOMEM;
+
+	while (1) {
+		if (rdir->tail == rdir->head) {
+			err = p9_client_readdir(fid, rdir->buf, buflen,
+						ctx->pos);
+			if (err <= 0)
+				return err;
+
+			rdir->head = 0;
+			rdir->tail = err;
+		}
+
+		while (rdir->head < rdir->tail) {
+
+			err = p9dirent_read(fid->clnt, rdir->buf + rdir->head,
+					    rdir->tail - rdir->head,
+					    &curdirent);
+			if (err < 0) {
+				p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
+				return -EIO;
+			}
+
+			if (!dir_emit(ctx, curdirent.d_name,
+				      strlen(curdirent.d_name),
+				      v9fs_qid2ino(&curdirent.qid),
+				      curdirent.d_type))
+				return 0;
+
+			ctx->pos = curdirent.d_off;
+			rdir->head += err;
+		}
+	}
+}
+
+
+/**
+ * v9fs_dir_release - close a directory
+ * @inode: inode of the directory
+ * @filp: file pointer to a directory
+ *
+ */
+
+int v9fs_dir_release(struct inode *inode, struct file *filp)
+{
+	struct p9_fid *fid;
+
+	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
+		 inode, filp, fid ? fid->fid : -1);
+	if (fid)
+		p9_client_clunk(fid);
+	return 0;
+}
+
+const struct file_operations v9fs_dir_operations = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.iterate = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.iterate = v9fs_dir_readdir_dotl,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+        .fsync = v9fs_file_fsync_dotl,
+};
diff --git a/kernel/fs/9p/vfs_file.c b/kernel/fs/9p/vfs_file.c
new file mode 100644
index 000000000..1ef16bd82
--- /dev/null
+++ b/kernel/fs/9p/vfs_file.c
@@ -0,0 +1,705 @@
+/*
+ *  linux/fs/9p/vfs_file.c
+ *
+ * This file contians vfs file ops for 9P2000.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/inet.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/utsname.h>
+#include <asm/uaccess.h>
+#include <linux/idr.h>
+#include <linux/uio.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+#include "cache.h"
+
+static const struct vm_operations_struct v9fs_file_vm_ops;
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
+
+/**
+ * v9fs_file_open - open a file (or directory)
+ * @inode: inode to be opened
+ * @file: file being opened
+ *
+ */
+
+int v9fs_file_open(struct inode *inode, struct file *file)
+{
+	int err;
+	struct v9fs_inode *v9inode;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	int omode;
+
+	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
+	v9inode = V9FS_I(inode);
+	v9ses = v9fs_inode2v9ses(inode);
+	if (v9fs_proto_dotl(v9ses))
+		omode = v9fs_open_to_dotl_flags(file->f_flags);
+	else
+		omode = v9fs_uflags2omode(file->f_flags,
+					v9fs_proto_dotu(v9ses));
+	fid = file->private_data;
+	if (!fid) {
+		fid = v9fs_fid_clone(file->f_path.dentry);
+		if (IS_ERR(fid))
+			return PTR_ERR(fid);
+
+		err = p9_client_open(fid, omode);
+		if (err < 0) {
+			p9_client_clunk(fid);
+			return err;
+		}
+		if ((file->f_flags & O_APPEND) &&
+			(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
+			generic_file_llseek(file, 0, SEEK_END);
+	}
+
+	file->private_data = fid;
+	mutex_lock(&v9inode->v_mutex);
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
+	    ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
+		/*
+		 * clone a fid and add it to writeback_fid
+		 * we do it during open time instead of
+		 * page dirty time via write_begin/page_mkwrite
+		 * because we want write after unlink usecase
+		 * to work.
+		 */
+		fid = v9fs_writeback_fid(file->f_path.dentry);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			mutex_unlock(&v9inode->v_mutex);
+			goto out_error;
+		}
+		v9inode->writeback_fid = (void *) fid;
+	}
+	mutex_unlock(&v9inode->v_mutex);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		v9fs_cache_inode_set_cookie(inode, file);
+	return 0;
+out_error:
+	p9_client_clunk(file->private_data);
+	file->private_data = NULL;
+	return err;
+}
+
+/**
+ * v9fs_file_lock - lock a file (or directory)
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ * Bugs: this looks like a local only lock, we should extend into 9P
+ *       by using open exclusive
+ */
+
+static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	int res = 0;
+	struct inode *inode = file_inode(filp);
+
+	p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+
+	/* No mandatory locks */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_write_and_wait(inode->i_mapping);
+		invalidate_mapping_pages(&inode->i_data, 0, -1);
+	}
+
+	return res;
+}
+
+static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct p9_flock flock;
+	struct p9_fid *fid;
+	uint8_t status = P9_LOCK_ERROR;
+	int res = 0;
+	unsigned char fl_type;
+
+	fid = filp->private_data;
+	BUG_ON(fid == NULL);
+
+	if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
+		BUG();
+
+	res = posix_lock_file_wait(filp, fl);
+	if (res < 0)
+		goto out;
+
+	/* convert posix lock to p9 tlock args */
+	memset(&flock, 0, sizeof(flock));
+	/* map the lock type */
+	switch (fl->fl_type) {
+	case F_RDLCK:
+		flock.type = P9_LOCK_TYPE_RDLCK;
+		break;
+	case F_WRLCK:
+		flock.type = P9_LOCK_TYPE_WRLCK;
+		break;
+	case F_UNLCK:
+		flock.type = P9_LOCK_TYPE_UNLCK;
+		break;
+	}
+	flock.start = fl->fl_start;
+	if (fl->fl_end == OFFSET_MAX)
+		flock.length = 0;
+	else
+		flock.length = fl->fl_end - fl->fl_start + 1;
+	flock.proc_id = fl->fl_pid;
+	flock.client_id = fid->clnt->name;
+	if (IS_SETLKW(cmd))
+		flock.flags = P9_LOCK_FLAGS_BLOCK;
+
+	/*
+	 * if its a blocked request and we get P9_LOCK_BLOCKED as the status
+	 * for lock request, keep on trying
+	 */
+	for (;;) {
+		res = p9_client_lock_dotl(fid, &flock, &status);
+		if (res < 0)
+			goto out_unlock;
+
+		if (status != P9_LOCK_BLOCKED)
+			break;
+		if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
+			break;
+		if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+			break;
+	}
+
+	/* map 9p status to VFS status */
+	switch (status) {
+	case P9_LOCK_SUCCESS:
+		res = 0;
+		break;
+	case P9_LOCK_BLOCKED:
+		res = -EAGAIN;
+		break;
+	default:
+		WARN_ONCE(1, "unknown lock status code: %d\n", status);
+		/* fallthough */
+	case P9_LOCK_ERROR:
+	case P9_LOCK_GRACE:
+		res = -ENOLCK;
+		break;
+	}
+
+out_unlock:
+	/*
+	 * incase server returned error for lock request, revert
+	 * it locally
+	 */
+	if (res < 0 && fl->fl_type != F_UNLCK) {
+		fl_type = fl->fl_type;
+		fl->fl_type = F_UNLCK;
+		res = posix_lock_file_wait(filp, fl);
+		fl->fl_type = fl_type;
+	}
+out:
+	return res;
+}
+
+static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
+{
+	struct p9_getlock glock;
+	struct p9_fid *fid;
+	int res = 0;
+
+	fid = filp->private_data;
+	BUG_ON(fid == NULL);
+
+	posix_test_lock(filp, fl);
+	/*
+	 * if we have a conflicting lock locally, no need to validate
+	 * with server
+	 */
+	if (fl->fl_type != F_UNLCK)
+		return res;
+
+	/* convert posix lock to p9 tgetlock args */
+	memset(&glock, 0, sizeof(glock));
+	glock.type  = P9_LOCK_TYPE_UNLCK;
+	glock.start = fl->fl_start;
+	if (fl->fl_end == OFFSET_MAX)
+		glock.length = 0;
+	else
+		glock.length = fl->fl_end - fl->fl_start + 1;
+	glock.proc_id = fl->fl_pid;
+	glock.client_id = fid->clnt->name;
+
+	res = p9_client_getlock_dotl(fid, &glock);
+	if (res < 0)
+		return res;
+	/* map 9p lock type to os lock type */
+	switch (glock.type) {
+	case P9_LOCK_TYPE_RDLCK:
+		fl->fl_type = F_RDLCK;
+		break;
+	case P9_LOCK_TYPE_WRLCK:
+		fl->fl_type = F_WRLCK;
+		break;
+	case P9_LOCK_TYPE_UNLCK:
+		fl->fl_type = F_UNLCK;
+		break;
+	}
+	if (glock.type != P9_LOCK_TYPE_UNLCK) {
+		fl->fl_start = glock.start;
+		if (glock.length == 0)
+			fl->fl_end = OFFSET_MAX;
+		else
+			fl->fl_end = glock.start + glock.length - 1;
+		fl->fl_pid = glock.proc_id;
+	}
+	kfree(glock.client_id);
+	return res;
+}
+
+/**
+ * v9fs_file_lock_dotl - lock a file (or directory)
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ */
+
+static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(filp);
+	int ret = -ENOLCK;
+
+	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
+		 filp, cmd, fl, filp);
+
+	/* No mandatory locks */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		goto out_err;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_write_and_wait(inode->i_mapping);
+		invalidate_mapping_pages(&inode->i_data, 0, -1);
+	}
+
+	if (IS_SETLK(cmd) || IS_SETLKW(cmd))
+		ret = v9fs_file_do_lock(filp, cmd, fl);
+	else if (IS_GETLK(cmd))
+		ret = v9fs_file_getlock(filp, fl);
+	else
+		ret = -EINVAL;
+out_err:
+	return ret;
+}
+
+/**
+ * v9fs_file_flock_dotl - lock a file
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ */
+
+static int v9fs_file_flock_dotl(struct file *filp, int cmd,
+	struct file_lock *fl)
+{
+	struct inode *inode = file_inode(filp);
+	int ret = -ENOLCK;
+
+	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
+		 filp, cmd, fl, filp);
+
+	/* No mandatory locks */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		goto out_err;
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		goto out_err;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_write_and_wait(inode->i_mapping);
+		invalidate_mapping_pages(&inode->i_data, 0, -1);
+	}
+	/* Convert flock to posix lock */
+	fl->fl_flags |= FL_POSIX;
+	fl->fl_flags ^= FL_FLOCK;
+
+	if (IS_SETLK(cmd) | IS_SETLKW(cmd))
+		ret = v9fs_file_do_lock(filp, cmd, fl);
+	else
+		ret = -EINVAL;
+out_err:
+	return ret;
+}
+
+/**
+ * v9fs_file_read - read from a file
+ * @filp: file pointer to read
+ * @udata: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+
+static ssize_t
+v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct p9_fid *fid = iocb->ki_filp->private_data;
+	int ret, err;
+
+	p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
+		 iov_iter_count(to), iocb->ki_pos);
+
+	ret = p9_client_read(fid, iocb->ki_pos, to, &err);
+	if (!ret)
+		return err;
+
+	iocb->ki_pos += ret;
+	return ret;
+}
+
+/**
+ * v9fs_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+static ssize_t
+v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t retval;
+	loff_t origin;
+	int err = 0;
+
+	retval = generic_write_checks(iocb, from);
+	if (retval <= 0)
+		return retval;
+
+	origin = iocb->ki_pos;
+	retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err);
+	if (retval > 0) {
+		struct inode *inode = file_inode(file);
+		loff_t i_size;
+		unsigned long pg_start, pg_end;
+		pg_start = origin >> PAGE_CACHE_SHIFT;
+		pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT;
+		if (inode->i_mapping && inode->i_mapping->nrpages)
+			invalidate_inode_pages2_range(inode->i_mapping,
+						      pg_start, pg_end);
+		iocb->ki_pos += retval;
+		i_size = i_size_read(inode);
+		if (iocb->ki_pos > i_size) {
+			inode_add_bytes(inode, iocb->ki_pos - i_size);
+			i_size_write(inode, iocb->ki_pos);
+		}
+		return retval;
+	}
+	return err;
+}
+
+static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			   int datasync)
+{
+	struct p9_fid *fid;
+	struct inode *inode = filp->f_mapping->host;
+	struct p9_wstat wstat;
+	int retval;
+
+	retval = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (retval)
+		return retval;
+
+	mutex_lock(&inode->i_mutex);
+	p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
+
+	fid = filp->private_data;
+	v9fs_blank_wstat(&wstat);
+
+	retval = p9_client_wstat(fid, &wstat);
+	mutex_unlock(&inode->i_mutex);
+
+	return retval;
+}
+
+int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
+			 int datasync)
+{
+	struct p9_fid *fid;
+	struct inode *inode = filp->f_mapping->host;
+	int retval;
+
+	retval = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (retval)
+		return retval;
+
+	mutex_lock(&inode->i_mutex);
+	p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
+
+	fid = filp->private_data;
+
+	retval = p9_client_fsync(fid, datasync);
+	mutex_unlock(&inode->i_mutex);
+
+	return retval;
+}
+
+static int
+v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	int retval;
+
+
+	retval = generic_file_mmap(filp, vma);
+	if (!retval)
+		vma->vm_ops = &v9fs_file_vm_ops;
+
+	return retval;
+}
+
+static int
+v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	int retval;
+	struct inode *inode;
+	struct v9fs_inode *v9inode;
+	struct p9_fid *fid;
+
+	inode = file_inode(filp);
+	v9inode = V9FS_I(inode);
+	mutex_lock(&v9inode->v_mutex);
+	if (!v9inode->writeback_fid &&
+	    (vma->vm_flags & VM_WRITE)) {
+		/*
+		 * clone a fid and add it to writeback_fid
+		 * we do it during mmap instead of
+		 * page dirty time via write_begin/page_mkwrite
+		 * because we want write after unlink usecase
+		 * to work.
+		 */
+		fid = v9fs_writeback_fid(filp->f_path.dentry);
+		if (IS_ERR(fid)) {
+			retval = PTR_ERR(fid);
+			mutex_unlock(&v9inode->v_mutex);
+			return retval;
+		}
+		v9inode->writeback_fid = (void *) fid;
+	}
+	mutex_unlock(&v9inode->v_mutex);
+
+	retval = generic_file_mmap(filp, vma);
+	if (!retval)
+		vma->vm_ops = &v9fs_mmap_file_vm_ops;
+
+	return retval;
+}
+
+static int
+v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct v9fs_inode *v9inode;
+	struct page *page = vmf->page;
+	struct file *filp = vma->vm_file;
+	struct inode *inode = file_inode(filp);
+
+
+	p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
+		 page, (unsigned long)filp->private_data);
+
+	/* Update file times before taking page lock */
+	file_update_time(filp);
+
+	v9inode = V9FS_I(inode);
+	/* make sure the cache has finished storing the page */
+	v9fs_fscache_wait_on_page_write(inode, page);
+	BUG_ON(!v9inode->writeback_fid);
+	lock_page(page);
+	if (page->mapping != inode->i_mapping)
+		goto out_unlock;
+	wait_for_stable_page(page);
+
+	return VM_FAULT_LOCKED;
+out_unlock:
+	unlock_page(page);
+	return VM_FAULT_NOPAGE;
+}
+
+/**
+ * v9fs_mmap_file_read - read from a file
+ * @filp: file pointer to read
+ * @data: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	/* TODO: Check if there are dirty pages */
+	return v9fs_file_read_iter(iocb, to);
+}
+
+/**
+ * v9fs_mmap_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	/*
+	 * TODO: invalidate mmaps on filp's inode between
+	 * offset and offset+count
+	 */
+	return v9fs_file_write_iter(iocb, from);
+}
+
+static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
+{
+	struct inode *inode;
+
+	struct writeback_control wbc = {
+		.nr_to_write = LONG_MAX,
+		.sync_mode = WB_SYNC_ALL,
+		.range_start = vma->vm_pgoff * PAGE_SIZE,
+		 /* absolute end, byte at end included */
+		.range_end = vma->vm_pgoff * PAGE_SIZE +
+			(vma->vm_end - vma->vm_start - 1),
+	};
+
+
+	p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
+
+	inode = file_inode(vma->vm_file);
+
+	if (!mapping_cap_writeback_dirty(inode->i_mapping))
+		wbc.nr_to_write = 0;
+
+	might_sleep();
+	sync_inode(inode, &wbc);
+}
+
+
+static const struct vm_operations_struct v9fs_file_vm_ops = {
+	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
+	.page_mkwrite = v9fs_vm_page_mkwrite,
+};
+
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
+	.close = v9fs_mmap_vm_close,
+	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
+	.page_mkwrite = v9fs_vm_page_mkwrite,
+};
+
+
+const struct file_operations v9fs_cached_file_operations = {
+	.llseek = generic_file_llseek,
+	.read_iter = generic_file_read_iter,
+	.write_iter = generic_file_write_iter,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = v9fs_file_mmap,
+	.fsync = v9fs_file_fsync,
+};
+
+const struct file_operations v9fs_cached_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read_iter = generic_file_read_iter,
+	.write_iter = generic_file_write_iter,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
+	.mmap = v9fs_file_mmap,
+	.fsync = v9fs_file_fsync_dotl,
+};
+
+const struct file_operations v9fs_file_operations = {
+	.llseek = generic_file_llseek,
+	.read_iter = v9fs_file_read_iter,
+	.write_iter = v9fs_file_write_iter,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read_iter = v9fs_file_read_iter,
+	.write_iter = v9fs_file_write_iter,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync_dotl,
+};
+
+const struct file_operations v9fs_mmap_file_operations = {
+	.llseek = generic_file_llseek,
+	.read_iter = v9fs_mmap_file_read_iter,
+	.write_iter = v9fs_mmap_file_write_iter,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = v9fs_mmap_file_mmap,
+	.fsync = v9fs_file_fsync,
+};
+
+const struct file_operations v9fs_mmap_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read_iter = v9fs_mmap_file_read_iter,
+	.write_iter = v9fs_mmap_file_write_iter,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
+	.mmap = v9fs_mmap_file_mmap,
+	.fsync = v9fs_file_fsync_dotl,
+};
diff --git a/kernel/fs/9p/vfs_inode.c b/kernel/fs/9p/vfs_inode.c
new file mode 100644
index 000000000..703342e30
--- /dev/null
+++ b/kernel/fs/9p/vfs_inode.c
@@ -0,0 +1,1537 @@
+/*
+ *  linux/fs/9p/vfs_inode.c
+ *
+ * This file contains vfs inode ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+#include "cache.h"
+#include "xattr.h"
+#include "acl.h"
+
+static const struct inode_operations v9fs_dir_inode_operations;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations;
+
+/**
+ * unixmode2p9mode - convert unix mode bits to plan 9
+ * @v9ses: v9fs session information
+ * @mode: mode to convert
+ *
+ */
+
+static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
+{
+	int res;
+	res = mode & 0777;
+	if (S_ISDIR(mode))
+		res |= P9_DMDIR;
+	if (v9fs_proto_dotu(v9ses)) {
+		if (v9ses->nodev == 0) {
+			if (S_ISSOCK(mode))
+				res |= P9_DMSOCKET;
+			if (S_ISFIFO(mode))
+				res |= P9_DMNAMEDPIPE;
+			if (S_ISBLK(mode))
+				res |= P9_DMDEVICE;
+			if (S_ISCHR(mode))
+				res |= P9_DMDEVICE;
+		}
+
+		if ((mode & S_ISUID) == S_ISUID)
+			res |= P9_DMSETUID;
+		if ((mode & S_ISGID) == S_ISGID)
+			res |= P9_DMSETGID;
+		if ((mode & S_ISVTX) == S_ISVTX)
+			res |= P9_DMSETVTX;
+	}
+	return res;
+}
+
+/**
+ * p9mode2perm- convert plan9 mode bits to unix permission bits
+ * @v9ses: v9fs session information
+ * @stat: p9_wstat from which mode need to be derived
+ *
+ */
+static int p9mode2perm(struct v9fs_session_info *v9ses,
+		       struct p9_wstat *stat)
+{
+	int res;
+	int mode = stat->mode;
+
+	res = mode & S_IALLUGO;
+	if (v9fs_proto_dotu(v9ses)) {
+		if ((mode & P9_DMSETUID) == P9_DMSETUID)
+			res |= S_ISUID;
+
+		if ((mode & P9_DMSETGID) == P9_DMSETGID)
+			res |= S_ISGID;
+
+		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
+			res |= S_ISVTX;
+	}
+	return res;
+}
+
+/**
+ * p9mode2unixmode- convert plan9 mode bits to unix mode bits
+ * @v9ses: v9fs session information
+ * @stat: p9_wstat from which mode need to be derived
+ * @rdev: major number, minor number in case of device files.
+ *
+ */
+static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
+			       struct p9_wstat *stat, dev_t *rdev)
+{
+	int res;
+	u32 mode = stat->mode;
+
+	*rdev = 0;
+	res = p9mode2perm(v9ses, stat);
+
+	if ((mode & P9_DMDIR) == P9_DMDIR)
+		res |= S_IFDIR;
+	else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses)))
+		res |= S_IFLNK;
+	else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses))
+		 && (v9ses->nodev == 0))
+		res |= S_IFSOCK;
+	else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses))
+		 && (v9ses->nodev == 0))
+		res |= S_IFIFO;
+	else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
+		 && (v9ses->nodev == 0)) {
+		char type = 0, ext[32];
+		int major = -1, minor = -1;
+
+		strlcpy(ext, stat->extension, sizeof(ext));
+		sscanf(ext, "%c %i %i", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			res |= S_IFCHR;
+			break;
+		case 'b':
+			res |= S_IFBLK;
+			break;
+		default:
+			p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n",
+				 type, stat->extension);
+		};
+		*rdev = MKDEV(major, minor);
+	} else
+		res |= S_IFREG;
+
+	return res;
+}
+
+/**
+ * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits
+ * @uflags: flags to convert
+ * @extended: if .u extensions are active
+ */
+
+int v9fs_uflags2omode(int uflags, int extended)
+{
+	int ret;
+
+	ret = 0;
+	switch (uflags&3) {
+	default:
+	case O_RDONLY:
+		ret = P9_OREAD;
+		break;
+
+	case O_WRONLY:
+		ret = P9_OWRITE;
+		break;
+
+	case O_RDWR:
+		ret = P9_ORDWR;
+		break;
+	}
+
+	if (extended) {
+		if (uflags & O_EXCL)
+			ret |= P9_OEXCL;
+
+		if (uflags & O_APPEND)
+			ret |= P9_OAPPEND;
+	}
+
+	return ret;
+}
+
+/**
+ * v9fs_blank_wstat - helper function to setup a 9P stat structure
+ * @wstat: structure to initialize
+ *
+ */
+
+void
+v9fs_blank_wstat(struct p9_wstat *wstat)
+{
+	wstat->type = ~0;
+	wstat->dev = ~0;
+	wstat->qid.type = ~0;
+	wstat->qid.version = ~0;
+	*((long long *)&wstat->qid.path) = ~0;
+	wstat->mode = ~0;
+	wstat->atime = ~0;
+	wstat->mtime = ~0;
+	wstat->length = ~0;
+	wstat->name = NULL;
+	wstat->uid = NULL;
+	wstat->gid = NULL;
+	wstat->muid = NULL;
+	wstat->n_uid = INVALID_UID;
+	wstat->n_gid = INVALID_GID;
+	wstat->n_muid = INVALID_UID;
+	wstat->extension = NULL;
+}
+
+/**
+ * v9fs_alloc_inode - helper function to allocate an inode
+ *
+ */
+struct inode *v9fs_alloc_inode(struct super_block *sb)
+{
+	struct v9fs_inode *v9inode;
+	v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
+							GFP_KERNEL);
+	if (!v9inode)
+		return NULL;
+#ifdef CONFIG_9P_FSCACHE
+	v9inode->fscache = NULL;
+	spin_lock_init(&v9inode->fscache_lock);
+#endif
+	v9inode->writeback_fid = NULL;
+	v9inode->cache_validity = 0;
+	mutex_init(&v9inode->v_mutex);
+	return &v9inode->vfs_inode;
+}
+
+/**
+ * v9fs_destroy_inode - destroy an inode
+ *
+ */
+
+static void v9fs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
+}
+
+void v9fs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, v9fs_i_callback);
+}
+
+int v9fs_init_inode(struct v9fs_session_info *v9ses,
+		    struct inode *inode, umode_t mode, dev_t rdev)
+{
+	int err = 0;
+
+	inode_init_owner(inode, NULL, mode);
+	inode->i_blocks = 0;
+	inode->i_rdev = rdev;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mapping->a_ops = &v9fs_addr_operations;
+
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+		} else if (v9fs_proto_dotu(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations;
+		} else {
+			p9_debug(P9_DEBUG_ERROR,
+				 "special files without extended mode\n");
+			err = -EINVAL;
+			goto error;
+		}
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	case S_IFREG:
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			if (v9ses->cache == CACHE_LOOSE ||
+			    v9ses->cache == CACHE_FSCACHE)
+				inode->i_fop =
+					&v9fs_cached_file_operations_dotl;
+			else if (v9ses->cache == CACHE_MMAP)
+				inode->i_fop = &v9fs_mmap_file_operations_dotl;
+			else
+				inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			if (v9ses->cache == CACHE_LOOSE ||
+			    v9ses->cache == CACHE_FSCACHE)
+				inode->i_fop =
+					&v9fs_cached_file_operations;
+			else if (v9ses->cache == CACHE_MMAP)
+				inode->i_fop = &v9fs_mmap_file_operations;
+			else
+				inode->i_fop = &v9fs_file_operations;
+		}
+
+		break;
+	case S_IFLNK:
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			p9_debug(P9_DEBUG_ERROR,
+				 "extended modes used with legacy protocol\n");
+			err = -EINVAL;
+			goto error;
+		}
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
+		break;
+	case S_IFDIR:
+		inc_nlink(inode);
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
+		else
+			inode->i_op = &v9fs_dir_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
+		break;
+	default:
+		p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
+			 mode, mode & S_IFMT);
+		err = -EINVAL;
+		goto error;
+	}
+error:
+	return err;
+
+}
+
+/**
+ * v9fs_get_inode - helper function to setup an inode
+ * @sb: superblock
+ * @mode: mode to setup inode with
+ *
+ */
+
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
+{
+	int err;
+	struct inode *inode;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
+
+	inode = new_inode(sb);
+	if (!inode) {
+		pr_warn("%s (%d): Problem allocating inode\n",
+			__func__, task_pid_nr(current));
+		return ERR_PTR(-ENOMEM);
+	}
+	err = v9fs_init_inode(v9ses, inode, mode, rdev);
+	if (err) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	return inode;
+}
+
+/*
+static struct v9fs_fid*
+v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
+{
+	int err;
+	int nfid;
+	struct v9fs_fid *ret;
+	struct v9fs_fcall *fcall;
+
+	nfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (nfid < 0) {
+		eprintk(KERN_WARNING, "no free fids available\n");
+		return ERR_PTR(-ENOSPC);
+	}
+
+	err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name,
+		&fcall);
+
+	if (err < 0) {
+		if (fcall && fcall->id == RWALK)
+			goto clunk_fid;
+
+		PRINT_FCALL_ERROR("walk error", fcall);
+		v9fs_put_idpool(nfid, &v9ses->fidpool);
+		goto error;
+	}
+
+	kfree(fcall);
+	fcall = NULL;
+	ret = v9fs_fid_create(v9ses, nfid);
+	if (!ret) {
+		err = -ENOMEM;
+		goto clunk_fid;
+	}
+
+	err = v9fs_fid_insert(ret, dentry);
+	if (err < 0) {
+		v9fs_fid_destroy(ret);
+		goto clunk_fid;
+	}
+
+	return ret;
+
+clunk_fid:
+	v9fs_t_clunk(v9ses, nfid);
+
+error:
+	kfree(fcall);
+	return ERR_PTR(err);
+}
+*/
+
+
+/**
+ * v9fs_clear_inode - release an inode
+ * @inode: inode to release
+ *
+ */
+void v9fs_evict_inode(struct inode *inode)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	truncate_inode_pages_final(inode->i_mapping);
+	clear_inode(inode);
+	filemap_fdatawrite(inode->i_mapping);
+
+	v9fs_cache_inode_put_cookie(inode);
+	/* clunk the fid stashed in writeback_fid */
+	if (v9inode->writeback_fid) {
+		p9_client_clunk(v9inode->writeback_fid);
+		v9inode->writeback_fid = NULL;
+	}
+}
+
+static int v9fs_test_inode(struct inode *inode, void *data)
+{
+	int umode;
+	dev_t rdev;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	struct p9_wstat *st = (struct p9_wstat *)data;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	/* don't match inode of different type */
+	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+		return 0;
+
+	/* compare qid details */
+	if (memcmp(&v9inode->qid.version,
+		   &st->qid.version, sizeof(v9inode->qid.version)))
+		return 0;
+
+	if (v9inode->qid.type != st->qid.type)
+		return 0;
+	return 1;
+}
+
+static int v9fs_test_new_inode(struct inode *inode, void *data)
+{
+	return 0;
+}
+
+static int v9fs_set_inode(struct inode *inode,  void *data)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	struct p9_wstat *st = (struct p9_wstat *)data;
+
+	memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
+	return 0;
+}
+
+static struct inode *v9fs_qid_iget(struct super_block *sb,
+				   struct p9_qid *qid,
+				   struct p9_wstat *st,
+				   int new)
+{
+	dev_t rdev;
+	int retval;
+	umode_t umode;
+	unsigned long i_ino;
+	struct inode *inode;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	int (*test)(struct inode *, void *);
+
+	if (new)
+		test = v9fs_test_new_inode;
+	else
+		test = v9fs_test_inode;
+
+	i_ino = v9fs_qid2ino(qid);
+	inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	/*
+	 * initialize the inode with the stat info
+	 * FIXME!! we may need support for stale inodes
+	 * later.
+	 */
+	inode->i_ino = i_ino;
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	retval = v9fs_init_inode(v9ses, inode, umode, rdev);
+	if (retval)
+		goto error;
+
+	v9fs_stat2inode(st, inode, sb);
+	v9fs_cache_inode_get_cookie(inode);
+	unlock_new_inode(inode);
+	return inode;
+error:
+	unlock_new_inode(inode);
+	iput(inode);
+	return ERR_PTR(retval);
+
+}
+
+struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+		    struct super_block *sb, int new)
+{
+	struct p9_wstat *st;
+	struct inode *inode = NULL;
+
+	st = p9_client_stat(fid);
+	if (IS_ERR(st))
+		return ERR_CAST(st);
+
+	inode = v9fs_qid_iget(sb, &st->qid, st, new);
+	p9stat_free(st);
+	kfree(st);
+	return inode;
+}
+
+/**
+ * v9fs_at_to_dotl_flags- convert Linux specific AT flags to
+ * plan 9 AT flag.
+ * @flags: flags to convert
+ */
+static int v9fs_at_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+	if (flags & AT_REMOVEDIR)
+		rflags |= P9_DOTL_AT_REMOVEDIR;
+	return rflags;
+}
+
+/**
+ * v9fs_remove - helper function to remove files and directories
+ * @dir: directory inode that is being deleted
+ * @dentry:  dentry that is being deleted
+ * @flags: removing a directory
+ *
+ */
+
+static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
+{
+	struct inode *inode;
+	int retval = -EOPNOTSUPP;
+	struct p9_fid *v9fid, *dfid;
+	struct v9fs_session_info *v9ses;
+
+	p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
+		 dir, dentry, flags);
+
+	v9ses = v9fs_inode2v9ses(dir);
+	inode = d_inode(dentry);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	if (IS_ERR(dfid)) {
+		retval = PTR_ERR(dfid);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
+		return retval;
+	}
+	if (v9fs_proto_dotl(v9ses))
+		retval = p9_client_unlinkat(dfid, dentry->d_name.name,
+					    v9fs_at_to_dotl_flags(flags));
+	if (retval == -EOPNOTSUPP) {
+		/* Try the one based on path */
+		v9fid = v9fs_fid_clone(dentry);
+		if (IS_ERR(v9fid))
+			return PTR_ERR(v9fid);
+		retval = p9_client_remove(v9fid);
+	}
+	if (!retval) {
+		/*
+		 * directories on unlink should have zero
+		 * link count
+		 */
+		if (flags & AT_REMOVEDIR) {
+			clear_nlink(inode);
+			drop_nlink(dir);
+		} else
+			drop_nlink(inode);
+
+		v9fs_invalidate_inode_attr(inode);
+		v9fs_invalidate_inode_attr(dir);
+	}
+	return retval;
+}
+
+/**
+ * v9fs_create - Create a file
+ * @v9ses: session information
+ * @dir: directory that dentry is being created in
+ * @dentry:  dentry that is being created
+ * @extension: 9p2000.u extension string to support devices, etc.
+ * @perm: create permissions
+ * @mode: open mode
+ *
+ */
+static struct p9_fid *
+v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
+		struct dentry *dentry, char *extension, u32 perm, u8 mode)
+{
+	int err;
+	char *name;
+	struct p9_fid *dfid, *ofid, *fid;
+	struct inode *inode;
+
+	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
+
+	err = 0;
+	ofid = NULL;
+	fid = NULL;
+	name = (char *) dentry->d_name.name;
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return ERR_PTR(err);
+	}
+
+	/* clone a fid to use for creation */
+	ofid = p9_client_walk(dfid, 0, NULL, 1);
+	if (IS_ERR(ofid)) {
+		err = PTR_ERR(ofid);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+		return ERR_PTR(err);
+	}
+
+	err = p9_client_fcreate(ofid, name, perm, mode, extension);
+	if (err < 0) {
+		p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
+		goto error;
+	}
+
+	if (!(perm & P9_DMLINK)) {
+		/* now walk from the parent so we can get unopened fid */
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			p9_debug(P9_DEBUG_VFS,
+				   "p9_client_walk failed %d\n", err);
+			fid = NULL;
+			goto error;
+		}
+		/*
+		 * instantiate inode and assign the unopened fid to the dentry
+		 */
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			p9_debug(P9_DEBUG_VFS,
+				   "inode creation failed %d\n", err);
+			goto error;
+		}
+		v9fs_fid_add(dentry, fid);
+		d_instantiate(dentry, inode);
+	}
+	return ofid;
+error:
+	if (ofid)
+		p9_client_clunk(ofid);
+
+	if (fid)
+		p9_client_clunk(fid);
+
+	return ERR_PTR(err);
+}
+
+/**
+ * v9fs_vfs_create - VFS hook to create a regular file
+ *
+ * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open().  This is only called
+ * for mknod(2).
+ *
+ * @dir: directory inode that is being created
+ * @dentry:  dentry that is being deleted
+ * @mode: create permissions
+ *
+ */
+
+static int
+v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool excl)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	u32 perm = unixmode2p9mode(v9ses, mode);
+	struct p9_fid *fid;
+
+	/* P9_OEXCL? */
+	fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_ORDWR);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	v9fs_invalidate_inode_attr(dir);
+	p9_client_clunk(fid);
+
+	return 0;
+}
+
+/**
+ * v9fs_vfs_mkdir - VFS mkdir hook to create a directory
+ * @dir:  inode that is being unlinked
+ * @dentry: dentry that is being unlinked
+ * @mode: mode for new directory
+ *
+ */
+
+static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int err;
+	u32 perm;
+	struct p9_fid *fid;
+	struct v9fs_session_info *v9ses;
+
+	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
+	err = 0;
+	v9ses = v9fs_inode2v9ses(dir);
+	perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
+	fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_OREAD);
+	if (IS_ERR(fid)) {
+		err = PTR_ERR(fid);
+		fid = NULL;
+	} else {
+		inc_nlink(dir);
+		v9fs_invalidate_inode_attr(dir);
+	}
+
+	if (fid)
+		p9_client_clunk(fid);
+
+	return err;
+}
+
+/**
+ * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
+ * @dir:  inode that is being walked from
+ * @dentry: dentry that is being walked to?
+ * @flags: lookup flags (unused)
+ *
+ */
+
+struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+				      unsigned int flags)
+{
+	struct dentry *res;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *dfid, *fid;
+	struct inode *inode;
+	char *name;
+
+	p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%pd) %p flags: %x\n",
+		 dir, dentry, dentry, flags);
+
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	v9ses = v9fs_inode2v9ses(dir);
+	/* We can walk d_parent because we hold the dir->i_mutex */
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	if (IS_ERR(dfid))
+		return ERR_CAST(dfid);
+
+	name = (char *) dentry->d_name.name;
+	fid = p9_client_walk(dfid, 1, &name, 1);
+	if (IS_ERR(fid)) {
+		if (fid == ERR_PTR(-ENOENT)) {
+			d_add(dentry, NULL);
+			return NULL;
+		}
+		return ERR_CAST(fid);
+	}
+	/*
+	 * Make sure we don't use a wrong inode due to parallel
+	 * unlink. For cached mode create calls request for new
+	 * inode. But with cache disabled, lookup should do this.
+	 */
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	else
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+	if (IS_ERR(inode)) {
+		p9_client_clunk(fid);
+		return ERR_CAST(inode);
+	}
+	/*
+	 * If we had a rename on the server and a parallel lookup
+	 * for the new name, then make sure we instantiate with
+	 * the new name. ie look up for a/b, while on server somebody
+	 * moved b under k and client parallely did a lookup for
+	 * k/b.
+	 */
+	res = d_splice_alias(inode, dentry);
+	if (!res)
+		v9fs_fid_add(dentry, fid);
+	else if (!IS_ERR(res))
+		v9fs_fid_add(res, fid);
+	else
+		p9_client_clunk(fid);
+	return res;
+}
+
+static int
+v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
+		     struct file *file, unsigned flags, umode_t mode,
+		     int *opened)
+{
+	int err;
+	u32 perm;
+	struct v9fs_inode *v9inode;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid, *inode_fid;
+	struct dentry *res = NULL;
+
+	if (d_unhashed(dentry)) {
+		res = v9fs_vfs_lookup(dir, dentry, 0);
+		if (IS_ERR(res))
+			return PTR_ERR(res);
+
+		if (res)
+			dentry = res;
+	}
+
+	/* Only creates */
+	if (!(flags & O_CREAT) || d_really_is_positive(dentry))
+		return finish_no_open(file, res);
+
+	err = 0;
+
+	v9ses = v9fs_inode2v9ses(dir);
+	perm = unixmode2p9mode(v9ses, mode);
+	fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
+				v9fs_uflags2omode(flags,
+						v9fs_proto_dotu(v9ses)));
+	if (IS_ERR(fid)) {
+		err = PTR_ERR(fid);
+		fid = NULL;
+		goto error;
+	}
+
+	v9fs_invalidate_inode_attr(dir);
+	v9inode = V9FS_I(d_inode(dentry));
+	mutex_lock(&v9inode->v_mutex);
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
+	    ((flags & O_ACCMODE) != O_RDONLY)) {
+		/*
+		 * clone a fid and add it to writeback_fid
+		 * we do it during open time instead of
+		 * page dirty time via write_begin/page_mkwrite
+		 * because we want write after unlink usecase
+		 * to work.
+		 */
+		inode_fid = v9fs_writeback_fid(dentry);
+		if (IS_ERR(inode_fid)) {
+			err = PTR_ERR(inode_fid);
+			mutex_unlock(&v9inode->v_mutex);
+			goto error;
+		}
+		v9inode->writeback_fid = (void *) inode_fid;
+	}
+	mutex_unlock(&v9inode->v_mutex);
+	err = finish_open(file, dentry, generic_file_open, opened);
+	if (err)
+		goto error;
+
+	file->private_data = fid;
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		v9fs_cache_inode_set_cookie(d_inode(dentry), file);
+
+	*opened |= FILE_CREATED;
+out:
+	dput(res);
+	return err;
+
+error:
+	if (fid)
+		p9_client_clunk(fid);
+	goto out;
+}
+
+/**
+ * v9fs_vfs_unlink - VFS unlink hook to delete an inode
+ * @i:  inode that is being unlinked
+ * @d: dentry that is being unlinked
+ *
+ */
+
+int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+{
+	return v9fs_remove(i, d, 0);
+}
+
+/**
+ * v9fs_vfs_rmdir - VFS unlink hook to delete a directory
+ * @i:  inode that is being unlinked
+ * @d: dentry that is being unlinked
+ *
+ */
+
+int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+{
+	return v9fs_remove(i, d, AT_REMOVEDIR);
+}
+
+/**
+ * v9fs_vfs_rename - VFS hook to rename an inode
+ * @old_dir:  old dir inode
+ * @old_dentry: old dentry
+ * @new_dir: new dir inode
+ * @new_dentry: new dentry
+ *
+ */
+
+int
+v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	int retval;
+	struct inode *old_inode;
+	struct inode *new_inode;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *oldfid;
+	struct p9_fid *olddirfid;
+	struct p9_fid *newdirfid;
+	struct p9_wstat wstat;
+
+	p9_debug(P9_DEBUG_VFS, "\n");
+	retval = 0;
+	old_inode = d_inode(old_dentry);
+	new_inode = d_inode(new_dentry);
+	v9ses = v9fs_inode2v9ses(old_inode);
+	oldfid = v9fs_fid_lookup(old_dentry);
+	if (IS_ERR(oldfid))
+		return PTR_ERR(oldfid);
+
+	olddirfid = v9fs_fid_clone(old_dentry->d_parent);
+	if (IS_ERR(olddirfid)) {
+		retval = PTR_ERR(olddirfid);
+		goto done;
+	}
+
+	newdirfid = v9fs_fid_clone(new_dentry->d_parent);
+	if (IS_ERR(newdirfid)) {
+		retval = PTR_ERR(newdirfid);
+		goto clunk_olddir;
+	}
+
+	down_write(&v9ses->rename_sem);
+	if (v9fs_proto_dotl(v9ses)) {
+		retval = p9_client_renameat(olddirfid, old_dentry->d_name.name,
+					    newdirfid, new_dentry->d_name.name);
+		if (retval == -EOPNOTSUPP)
+			retval = p9_client_rename(oldfid, newdirfid,
+						  new_dentry->d_name.name);
+		if (retval != -EOPNOTSUPP)
+			goto clunk_newdir;
+	}
+	if (old_dentry->d_parent != new_dentry->d_parent) {
+		/*
+		 * 9P .u can only handle file rename in the same directory
+		 */
+
+		p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
+		retval = -EXDEV;
+		goto clunk_newdir;
+	}
+	v9fs_blank_wstat(&wstat);
+	wstat.muid = v9ses->uname;
+	wstat.name = (char *) new_dentry->d_name.name;
+	retval = p9_client_wstat(oldfid, &wstat);
+
+clunk_newdir:
+	if (!retval) {
+		if (new_inode) {
+			if (S_ISDIR(new_inode->i_mode))
+				clear_nlink(new_inode);
+			else
+				drop_nlink(new_inode);
+		}
+		if (S_ISDIR(old_inode->i_mode)) {
+			if (!new_inode)
+				inc_nlink(new_dir);
+			drop_nlink(old_dir);
+		}
+		v9fs_invalidate_inode_attr(old_inode);
+		v9fs_invalidate_inode_attr(old_dir);
+		v9fs_invalidate_inode_attr(new_dir);
+
+		/* successful rename */
+		d_move(old_dentry, new_dentry);
+	}
+	up_write(&v9ses->rename_sem);
+	p9_client_clunk(newdirfid);
+
+clunk_olddir:
+	p9_client_clunk(olddirfid);
+
+done:
+	return retval;
+}
+
+/**
+ * v9fs_vfs_getattr - retrieve file metadata
+ * @mnt: mount information
+ * @dentry: file to get attributes on
+ * @stat: metadata structure to populate
+ *
+ */
+
+static int
+v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_wstat *st;
+
+	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+	v9ses = v9fs_dentry2v9ses(dentry);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		generic_fillattr(d_inode(dentry), stat);
+		return 0;
+	}
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	st = p9_client_stat(fid);
+	if (IS_ERR(st))
+		return PTR_ERR(st);
+
+	v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb);
+	generic_fillattr(d_inode(dentry), stat);
+
+	p9stat_free(st);
+	kfree(st);
+	return 0;
+}
+
+/**
+ * v9fs_vfs_setattr - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+
+static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	int retval;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_wstat wstat;
+
+	p9_debug(P9_DEBUG_VFS, "\n");
+	retval = inode_change_ok(d_inode(dentry), iattr);
+	if (retval)
+		return retval;
+
+	retval = -EPERM;
+	v9ses = v9fs_dentry2v9ses(dentry);
+	fid = v9fs_fid_lookup(dentry);
+	if(IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	v9fs_blank_wstat(&wstat);
+	if (iattr->ia_valid & ATTR_MODE)
+		wstat.mode = unixmode2p9mode(v9ses, iattr->ia_mode);
+
+	if (iattr->ia_valid & ATTR_MTIME)
+		wstat.mtime = iattr->ia_mtime.tv_sec;
+
+	if (iattr->ia_valid & ATTR_ATIME)
+		wstat.atime = iattr->ia_atime.tv_sec;
+
+	if (iattr->ia_valid & ATTR_SIZE)
+		wstat.length = iattr->ia_size;
+
+	if (v9fs_proto_dotu(v9ses)) {
+		if (iattr->ia_valid & ATTR_UID)
+			wstat.n_uid = iattr->ia_uid;
+
+		if (iattr->ia_valid & ATTR_GID)
+			wstat.n_gid = iattr->ia_gid;
+	}
+
+	/* Write all dirty data */
+	if (d_is_reg(dentry))
+		filemap_write_and_wait(d_inode(dentry)->i_mapping);
+
+	retval = p9_client_wstat(fid, &wstat);
+	if (retval < 0)
+		return retval;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(d_inode(dentry)))
+		truncate_setsize(d_inode(dentry), iattr->ia_size);
+
+	v9fs_invalidate_inode_attr(d_inode(dentry));
+
+	setattr_copy(d_inode(dentry), iattr);
+	mark_inode_dirty(d_inode(dentry));
+	return 0;
+}
+
+/**
+ * v9fs_stat2inode - populate an inode structure with mistat info
+ * @stat: Plan 9 metadata (mistat) structure
+ * @inode: inode to populate
+ * @sb: superblock of filesystem
+ *
+ */
+
+void
+v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
+	struct super_block *sb)
+{
+	umode_t mode;
+	char ext[32];
+	char tag_name[14];
+	unsigned int i_nlink;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	set_nlink(inode, 1);
+
+	inode->i_atime.tv_sec = stat->atime;
+	inode->i_mtime.tv_sec = stat->mtime;
+	inode->i_ctime.tv_sec = stat->mtime;
+
+	inode->i_uid = v9ses->dfltuid;
+	inode->i_gid = v9ses->dfltgid;
+
+	if (v9fs_proto_dotu(v9ses)) {
+		inode->i_uid = stat->n_uid;
+		inode->i_gid = stat->n_gid;
+	}
+	if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
+		if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
+			/*
+			 * Hadlink support got added later to
+			 * to the .u extension. So there can be
+			 * server out there that doesn't support
+			 * this even with .u extension. So check
+			 * for non NULL stat->extension
+			 */
+			strlcpy(ext, stat->extension, sizeof(ext));
+			/* HARDLINKCOUNT %u */
+			sscanf(ext, "%13s %u", tag_name, &i_nlink);
+			if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
+				set_nlink(inode, i_nlink);
+		}
+	}
+	mode = p9mode2perm(v9ses, stat);
+	mode |= inode->i_mode & ~S_IALLUGO;
+	inode->i_mode = mode;
+	i_size_write(inode, stat->length);
+
+	/* not real number of blocks, but 512 byte ones ... */
+	inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+	v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
+}
+
+/**
+ * v9fs_qid2ino - convert qid into inode number
+ * @qid: qid to hash
+ *
+ * BUG: potential for inode number collisions?
+ */
+
+ino_t v9fs_qid2ino(struct p9_qid *qid)
+{
+	u64 path = qid->path + 2;
+	ino_t i = 0;
+
+	if (sizeof(ino_t) == sizeof(path))
+		memcpy(&i, &path, sizeof(ino_t));
+	else
+		i = (ino_t) (path ^ (path >> 32));
+
+	return i;
+}
+
+/**
+ * v9fs_readlink - read a symlink's location (internal version)
+ * @dentry: dentry for symlink
+ * @buffer: buffer to load symlink location into
+ * @buflen: length of buffer
+ *
+ */
+
+static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+	int retval;
+
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_wstat *st;
+
+	p9_debug(P9_DEBUG_VFS, " %pd\n", dentry);
+	retval = -EPERM;
+	v9ses = v9fs_dentry2v9ses(dentry);
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	if (!v9fs_proto_dotu(v9ses))
+		return -EBADF;
+
+	st = p9_client_stat(fid);
+	if (IS_ERR(st))
+		return PTR_ERR(st);
+
+	if (!(st->mode & P9_DMSYMLINK)) {
+		retval = -EINVAL;
+		goto done;
+	}
+
+	/* copy extension buffer into buffer */
+	retval = min(strlen(st->extension)+1, (size_t)buflen);
+	memcpy(buffer, st->extension, retval);
+
+	p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n",
+		 dentry, st->extension, buflen, buffer);
+
+done:
+	p9stat_free(st);
+	kfree(st);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_follow_link - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int len = 0;
+	char *link = __getname();
+
+	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+
+	if (!link)
+		link = ERR_PTR(-ENOMEM);
+	else {
+		len = v9fs_readlink(dentry, link, PATH_MAX);
+
+		if (len < 0) {
+			__putname(link);
+			link = ERR_PTR(len);
+		} else
+			link[min(len, PATH_MAX-1)] = 0;
+	}
+	nd_set_link(nd, link);
+
+	return NULL;
+}
+
+/**
+ * v9fs_vfs_put_link - release a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ * @p: unused
+ *
+ */
+
+void
+v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+	char *s = nd_get_link(nd);
+
+	p9_debug(P9_DEBUG_VFS, " %pd %s\n",
+		 dentry, IS_ERR(s) ? "<error>" : s);
+	if (!IS_ERR(s))
+		__putname(s);
+}
+
+/**
+ * v9fs_vfs_mkspecial - create a special file
+ * @dir: inode to create special file in
+ * @dentry: dentry to create
+ * @perm: mode to create special file
+ * @extension: 9p2000.u format extension string representing special file
+ *
+ */
+
+static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
+	u32 perm, const char *extension)
+{
+	struct p9_fid *fid;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(dir);
+	if (!v9fs_proto_dotu(v9ses)) {
+		p9_debug(P9_DEBUG_ERROR, "not extended\n");
+		return -EPERM;
+	}
+
+	fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm,
+								P9_OREAD);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	v9fs_invalidate_inode_attr(dir);
+	p9_client_clunk(fid);
+	return 0;
+}
+
+/**
+ * v9fs_vfs_symlink - helper function to create symlinks
+ * @dir: directory inode containing symlink
+ * @dentry: dentry for symlink
+ * @symname: symlink data
+ *
+ * See Also: 9P2000.u RFC for more information
+ *
+ */
+
+static int
+v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n",
+		 dir->i_ino, dentry, symname);
+
+	return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
+}
+
+/**
+ * v9fs_vfs_link - create a hardlink
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ */
+
+static int
+v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
+	      struct dentry *dentry)
+{
+	int retval;
+	char *name;
+	struct p9_fid *oldfid;
+
+	p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n",
+		 dir->i_ino, dentry, old_dentry);
+
+	oldfid = v9fs_fid_clone(old_dentry);
+	if (IS_ERR(oldfid))
+		return PTR_ERR(oldfid);
+
+	name = __getname();
+	if (unlikely(!name)) {
+		retval = -ENOMEM;
+		goto clunk_fid;
+	}
+
+	sprintf(name, "%d\n", oldfid->fid);
+	retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
+	__putname(name);
+	if (!retval) {
+		v9fs_refresh_inode(oldfid, d_inode(old_dentry));
+		v9fs_invalidate_inode_attr(dir);
+	}
+clunk_fid:
+	p9_client_clunk(oldfid);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_mknod - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @mode: mode for creation
+ * @rdev: device associated with special file
+ *
+ */
+
+static int
+v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	int retval;
+	char *name;
+	u32 perm;
+
+	p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+		 dir->i_ino, dentry, mode,
+		 MAJOR(rdev), MINOR(rdev));
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	name = __getname();
+	if (!name)
+		return -ENOMEM;
+	/* build extension */
+	if (S_ISBLK(mode))
+		sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
+	else if (S_ISCHR(mode))
+		sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
+	else if (S_ISFIFO(mode))
+		*name = 0;
+	else if (S_ISSOCK(mode))
+		*name = 0;
+	else {
+		__putname(name);
+		return -EINVAL;
+	}
+
+	perm = unixmode2p9mode(v9ses, mode);
+	retval = v9fs_vfs_mkspecial(dir, dentry, perm, name);
+	__putname(name);
+
+	return retval;
+}
+
+int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
+{
+	int umode;
+	dev_t rdev;
+	loff_t i_size;
+	struct p9_wstat *st;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	st = p9_client_stat(fid);
+	if (IS_ERR(st))
+		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+		goto out;
+
+	spin_lock(&inode->i_lock);
+	/*
+	 * We don't want to refresh inode->i_size,
+	 * because we may have cached data
+	 */
+	i_size = inode->i_size;
+	v9fs_stat2inode(st, inode, inode->i_sb);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		inode->i_size = i_size;
+	spin_unlock(&inode->i_lock);
+out:
+	p9stat_free(st);
+	kfree(st);
+	return 0;
+}
+
+static const struct inode_operations v9fs_dir_inode_operations_dotu = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.atomic_open = v9fs_vfs_atomic_open,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_dir_inode_operations = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.atomic_open = v9fs_vfs_atomic_open,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_file_inode_operations = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_symlink_inode_operations = {
+	.readlink = generic_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
diff --git a/kernel/fs/9p/vfs_inode_dotl.c b/kernel/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 000000000..9861c7c95
--- /dev/null
+++ b/kernel/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,1016 @@
+/*
+ *  linux/fs/9p/vfs_inode_dotl.c
+ *
+ * This file contains vfs inode ops for the 9P2000.L protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+#include "cache.h"
+#include "xattr.h"
+#include "acl.h"
+
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
+		    dev_t rdev);
+
+/**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+
+static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+	BUG_ON(dir_inode == NULL);
+
+	if (dir_inode->i_mode & S_ISGID) {
+		/* set_gid bit is set.*/
+		return dir_inode->i_gid;
+	}
+	return current_fsgid();
+}
+
+static int v9fs_test_inode_dotl(struct inode *inode, void *data)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
+
+	/* don't match inode of different type */
+	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+		return 0;
+
+	if (inode->i_generation != st->st_gen)
+		return 0;
+
+	/* compare qid details */
+	if (memcmp(&v9inode->qid.version,
+		   &st->qid.version, sizeof(v9inode->qid.version)))
+		return 0;
+
+	if (v9inode->qid.type != st->qid.type)
+		return 0;
+	return 1;
+}
+
+/* Always get a new inode */
+static int v9fs_test_new_inode_dotl(struct inode *inode, void *data)
+{
+	return 0;
+}
+
+static int v9fs_set_inode_dotl(struct inode *inode,  void *data)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
+
+	memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
+	inode->i_generation = st->st_gen;
+	return 0;
+}
+
+static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
+					struct p9_qid *qid,
+					struct p9_fid *fid,
+					struct p9_stat_dotl *st,
+					int new)
+{
+	int retval;
+	unsigned long i_ino;
+	struct inode *inode;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	int (*test)(struct inode *, void *);
+
+	if (new)
+		test = v9fs_test_new_inode_dotl;
+	else
+		test = v9fs_test_inode_dotl;
+
+	i_ino = v9fs_qid2ino(qid);
+	inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	/*
+	 * initialize the inode with the stat info
+	 * FIXME!! we may need support for stale inodes
+	 * later.
+	 */
+	inode->i_ino = i_ino;
+	retval = v9fs_init_inode(v9ses, inode,
+				 st->st_mode, new_decode_dev(st->st_rdev));
+	if (retval)
+		goto error;
+
+	v9fs_stat2inode_dotl(st, inode);
+	v9fs_cache_inode_get_cookie(inode);
+	retval = v9fs_get_acl(inode, fid);
+	if (retval)
+		goto error;
+
+	unlock_new_inode(inode);
+	return inode;
+error:
+	unlock_new_inode(inode);
+	iput(inode);
+	return ERR_PTR(retval);
+
+}
+
+struct inode *
+v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+			 struct super_block *sb, int new)
+{
+	struct p9_stat_dotl *st;
+	struct inode *inode = NULL;
+
+	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
+	if (IS_ERR(st))
+		return ERR_CAST(st);
+
+	inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new);
+	kfree(st);
+	return inode;
+}
+
+struct dotl_openflag_map {
+	int open_flag;
+	int dotl_flag;
+};
+
+static int v9fs_mapped_dotl_flags(int flags)
+{
+	int i;
+	int rflags = 0;
+	struct dotl_openflag_map dotl_oflag_map[] = {
+		{ O_CREAT,	P9_DOTL_CREATE },
+		{ O_EXCL,	P9_DOTL_EXCL },
+		{ O_NOCTTY,	P9_DOTL_NOCTTY },
+		{ O_APPEND,	P9_DOTL_APPEND },
+		{ O_NONBLOCK,	P9_DOTL_NONBLOCK },
+		{ O_DSYNC,	P9_DOTL_DSYNC },
+		{ FASYNC,	P9_DOTL_FASYNC },
+		{ O_DIRECT,	P9_DOTL_DIRECT },
+		{ O_LARGEFILE,	P9_DOTL_LARGEFILE },
+		{ O_DIRECTORY,	P9_DOTL_DIRECTORY },
+		{ O_NOFOLLOW,	P9_DOTL_NOFOLLOW },
+		{ O_NOATIME,	P9_DOTL_NOATIME },
+		{ O_CLOEXEC,	P9_DOTL_CLOEXEC },
+		{ O_SYNC,	P9_DOTL_SYNC},
+	};
+	for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) {
+		if (flags & dotl_oflag_map[i].open_flag)
+			rflags |= dotl_oflag_map[i].dotl_flag;
+	}
+	return rflags;
+}
+
+/**
+ * v9fs_open_to_dotl_flags- convert Linux specific open flags to
+ * plan 9 open flag.
+ * @flags: flags to convert
+ */
+int v9fs_open_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+
+	/*
+	 * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY
+	 * and P9_DOTL_NOACCESS
+	 */
+	rflags |= flags & O_ACCMODE;
+	rflags |= v9fs_mapped_dotl_flags(flags);
+
+	return rflags;
+}
+
+/**
+ * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @dir: directory inode that is being created
+ * @dentry:  dentry that is being deleted
+ * @omode: create permissions
+ *
+ */
+
+static int
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
+		bool excl)
+{
+	return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+}
+
+static int
+v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
+			  struct file *file, unsigned flags, umode_t omode,
+			  int *opened)
+{
+	int err = 0;
+	kgid_t gid;
+	umode_t mode;
+	char *name = NULL;
+	struct p9_qid qid;
+	struct inode *inode;
+	struct p9_fid *fid = NULL;
+	struct v9fs_inode *v9inode;
+	struct p9_fid *dfid, *ofid, *inode_fid;
+	struct v9fs_session_info *v9ses;
+	struct posix_acl *pacl = NULL, *dacl = NULL;
+	struct dentry *res = NULL;
+
+	if (d_unhashed(dentry)) {
+		res = v9fs_vfs_lookup(dir, dentry, 0);
+		if (IS_ERR(res))
+			return PTR_ERR(res);
+
+		if (res)
+			dentry = res;
+	}
+
+	/* Only creates */
+	if (!(flags & O_CREAT) || d_really_is_positive(dentry))
+		return	finish_no_open(file, res);
+
+	v9ses = v9fs_inode2v9ses(dir);
+
+	name = (char *) dentry->d_name.name;
+	p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
+		 name, flags, omode);
+
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		goto out;
+	}
+
+	/* clone a fid to use for creation */
+	ofid = p9_client_walk(dfid, 0, NULL, 1);
+	if (IS_ERR(ofid)) {
+		err = PTR_ERR(ofid);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+		goto out;
+	}
+
+	gid = v9fs_get_fsgid_for_create(dir);
+
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
+			 err);
+		goto error;
+	}
+	err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
+				    mode, gid, &qid);
+	if (err < 0) {
+		p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
+			 err);
+		goto error;
+	}
+	v9fs_invalidate_inode_attr(dir);
+
+	/* instantiate inode and assign the unopened fid to the dentry */
+	fid = p9_client_walk(dfid, 1, &name, 1);
+	if (IS_ERR(fid)) {
+		err = PTR_ERR(fid);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+		fid = NULL;
+		goto error;
+	}
+	inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
+		goto error;
+	}
+	/* Now set the ACL based on the default value */
+	v9fs_set_create_acl(inode, fid, dacl, pacl);
+
+	v9fs_fid_add(dentry, fid);
+	d_instantiate(dentry, inode);
+
+	v9inode = V9FS_I(inode);
+	mutex_lock(&v9inode->v_mutex);
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
+	    ((flags & O_ACCMODE) != O_RDONLY)) {
+		/*
+		 * clone a fid and add it to writeback_fid
+		 * we do it during open time instead of
+		 * page dirty time via write_begin/page_mkwrite
+		 * because we want write after unlink usecase
+		 * to work.
+		 */
+		inode_fid = v9fs_writeback_fid(dentry);
+		if (IS_ERR(inode_fid)) {
+			err = PTR_ERR(inode_fid);
+			mutex_unlock(&v9inode->v_mutex);
+			goto err_clunk_old_fid;
+		}
+		v9inode->writeback_fid = (void *) inode_fid;
+	}
+	mutex_unlock(&v9inode->v_mutex);
+	/* Since we are opening a file, assign the open fid to the file */
+	err = finish_open(file, dentry, generic_file_open, opened);
+	if (err)
+		goto err_clunk_old_fid;
+	file->private_data = ofid;
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		v9fs_cache_inode_set_cookie(inode, file);
+	*opened |= FILE_CREATED;
+out:
+	v9fs_put_acl(dacl, pacl);
+	dput(res);
+	return err;
+
+error:
+	if (fid)
+		p9_client_clunk(fid);
+err_clunk_old_fid:
+	if (ofid)
+		p9_client_clunk(ofid);
+	goto out;
+}
+
+/**
+ * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @dir:  inode that is being unlinked
+ * @dentry: dentry that is being unlinked
+ * @omode: mode for new directory
+ *
+ */
+
+static int v9fs_vfs_mkdir_dotl(struct inode *dir,
+			       struct dentry *dentry, umode_t omode)
+{
+	int err;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid = NULL, *dfid = NULL;
+	kgid_t gid;
+	char *name;
+	umode_t mode;
+	struct inode *inode;
+	struct p9_qid qid;
+	struct dentry *dir_dentry;
+	struct posix_acl *dacl = NULL, *pacl = NULL;
+
+	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
+	err = 0;
+	v9ses = v9fs_inode2v9ses(dir);
+
+	omode |= S_IFDIR;
+	if (dir->i_mode & S_ISGID)
+		omode |= S_ISGID;
+
+	dir_dentry = dentry->d_parent;
+	dfid = v9fs_fid_lookup(dir_dentry);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		dfid = NULL;
+		goto error;
+	}
+
+	gid = v9fs_get_fsgid_for_create(dir);
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n",
+			 err);
+		goto error;
+	}
+	name = (char *) dentry->d_name.name;
+	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
+	if (err < 0)
+		goto error;
+
+	fid = p9_client_walk(dfid, 1, &name, 1);
+	if (IS_ERR(fid)) {
+		err = PTR_ERR(fid);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+			 err);
+		fid = NULL;
+		goto error;
+	}
+
+	/* instantiate inode and assign the unopened fid to the dentry */
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+				 err);
+			goto error;
+		}
+		v9fs_fid_add(dentry, fid);
+		v9fs_set_create_acl(inode, fid, dacl, pacl);
+		d_instantiate(dentry, inode);
+		fid = NULL;
+		err = 0;
+	} else {
+		/*
+		 * Not in cached mode. No need to populate
+		 * inode with stat. We need to get an inode
+		 * so that we can set the acl with dentry
+		 */
+		inode = v9fs_get_inode(dir->i_sb, mode, 0);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
+		v9fs_set_create_acl(inode, fid, dacl, pacl);
+		d_instantiate(dentry, inode);
+	}
+	inc_nlink(dir);
+	v9fs_invalidate_inode_attr(dir);
+error:
+	if (fid)
+		p9_client_clunk(fid);
+	v9fs_put_acl(dacl, pacl);
+	return err;
+}
+
+static int
+v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_stat_dotl *st;
+
+	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+	v9ses = v9fs_dentry2v9ses(dentry);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		generic_fillattr(d_inode(dentry), stat);
+		return 0;
+	}
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	/* Ask for all the fields in stat structure. Server will return
+	 * whatever it supports
+	 */
+
+	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+	if (IS_ERR(st))
+		return PTR_ERR(st);
+
+	v9fs_stat2inode_dotl(st, d_inode(dentry));
+	generic_fillattr(d_inode(dentry), stat);
+	/* Change block size to what the server returned */
+	stat->blksize = st->st_blksize;
+
+	kfree(st);
+	return 0;
+}
+
+/*
+ * Attribute flags.
+ */
+#define P9_ATTR_MODE		(1 << 0)
+#define P9_ATTR_UID		(1 << 1)
+#define P9_ATTR_GID		(1 << 2)
+#define P9_ATTR_SIZE		(1 << 3)
+#define P9_ATTR_ATIME		(1 << 4)
+#define P9_ATTR_MTIME		(1 << 5)
+#define P9_ATTR_CTIME		(1 << 6)
+#define P9_ATTR_ATIME_SET	(1 << 7)
+#define P9_ATTR_MTIME_SET	(1 << 8)
+
+struct dotl_iattr_map {
+	int iattr_valid;
+	int p9_iattr_valid;
+};
+
+static int v9fs_mapped_iattr_valid(int iattr_valid)
+{
+	int i;
+	int p9_iattr_valid = 0;
+	struct dotl_iattr_map dotl_iattr_map[] = {
+		{ ATTR_MODE,		P9_ATTR_MODE },
+		{ ATTR_UID,		P9_ATTR_UID },
+		{ ATTR_GID,		P9_ATTR_GID },
+		{ ATTR_SIZE,		P9_ATTR_SIZE },
+		{ ATTR_ATIME,		P9_ATTR_ATIME },
+		{ ATTR_MTIME,		P9_ATTR_MTIME },
+		{ ATTR_CTIME,		P9_ATTR_CTIME },
+		{ ATTR_ATIME_SET,	P9_ATTR_ATIME_SET },
+		{ ATTR_MTIME_SET,	P9_ATTR_MTIME_SET },
+	};
+	for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) {
+		if (iattr_valid & dotl_iattr_map[i].iattr_valid)
+			p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid;
+	}
+	return p9_iattr_valid;
+}
+
+/**
+ * v9fs_vfs_setattr_dotl - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+
+int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+{
+	int retval;
+	struct p9_fid *fid;
+	struct p9_iattr_dotl p9attr;
+	struct inode *inode = d_inode(dentry);
+
+	p9_debug(P9_DEBUG_VFS, "\n");
+
+	retval = inode_change_ok(inode, iattr);
+	if (retval)
+		return retval;
+
+	p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
+	p9attr.mode = iattr->ia_mode;
+	p9attr.uid = iattr->ia_uid;
+	p9attr.gid = iattr->ia_gid;
+	p9attr.size = iattr->ia_size;
+	p9attr.atime_sec = iattr->ia_atime.tv_sec;
+	p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
+	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
+	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	/* Write all dirty data */
+	if (S_ISREG(inode->i_mode))
+		filemap_write_and_wait(inode->i_mapping);
+
+	retval = p9_client_setattr(fid, &p9attr);
+	if (retval < 0)
+		return retval;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode))
+		truncate_setsize(inode, iattr->ia_size);
+
+	v9fs_invalidate_inode_attr(inode);
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+	if (iattr->ia_valid & ATTR_MODE) {
+		/* We also want to update ACL when we update mode bits */
+		retval = v9fs_acl_chmod(inode, fid);
+		if (retval < 0)
+			return retval;
+	}
+	return 0;
+}
+
+/**
+ * v9fs_stat2inode_dotl - populate an inode structure with stat info
+ * @stat: stat structure
+ * @inode: inode to populate
+ *
+ */
+
+void
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+{
+	umode_t mode;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
+		inode->i_atime.tv_sec = stat->st_atime_sec;
+		inode->i_atime.tv_nsec = stat->st_atime_nsec;
+		inode->i_mtime.tv_sec = stat->st_mtime_sec;
+		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+		inode->i_ctime.tv_sec = stat->st_ctime_sec;
+		inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+		inode->i_uid = stat->st_uid;
+		inode->i_gid = stat->st_gid;
+		set_nlink(inode, stat->st_nlink);
+
+		mode = stat->st_mode & S_IALLUGO;
+		mode |= inode->i_mode & ~S_IALLUGO;
+		inode->i_mode = mode;
+
+		i_size_write(inode, stat->st_size);
+		inode->i_blocks = stat->st_blocks;
+	} else {
+		if (stat->st_result_mask & P9_STATS_ATIME) {
+			inode->i_atime.tv_sec = stat->st_atime_sec;
+			inode->i_atime.tv_nsec = stat->st_atime_nsec;
+		}
+		if (stat->st_result_mask & P9_STATS_MTIME) {
+			inode->i_mtime.tv_sec = stat->st_mtime_sec;
+			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+		}
+		if (stat->st_result_mask & P9_STATS_CTIME) {
+			inode->i_ctime.tv_sec = stat->st_ctime_sec;
+			inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+		}
+		if (stat->st_result_mask & P9_STATS_UID)
+			inode->i_uid = stat->st_uid;
+		if (stat->st_result_mask & P9_STATS_GID)
+			inode->i_gid = stat->st_gid;
+		if (stat->st_result_mask & P9_STATS_NLINK)
+			set_nlink(inode, stat->st_nlink);
+		if (stat->st_result_mask & P9_STATS_MODE) {
+			inode->i_mode = stat->st_mode;
+			if ((S_ISBLK(inode->i_mode)) ||
+						(S_ISCHR(inode->i_mode)))
+				init_special_inode(inode, inode->i_mode,
+								inode->i_rdev);
+		}
+		if (stat->st_result_mask & P9_STATS_RDEV)
+			inode->i_rdev = new_decode_dev(stat->st_rdev);
+		if (stat->st_result_mask & P9_STATS_SIZE)
+			i_size_write(inode, stat->st_size);
+		if (stat->st_result_mask & P9_STATS_BLOCKS)
+			inode->i_blocks = stat->st_blocks;
+	}
+	if (stat->st_result_mask & P9_STATS_GEN)
+		inode->i_generation = stat->st_gen;
+
+	/* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
+	 * because the inode structure does not have fields for them.
+	 */
+	v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
+}
+
+static int
+v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
+		const char *symname)
+{
+	int err;
+	kgid_t gid;
+	char *name;
+	struct p9_qid qid;
+	struct inode *inode;
+	struct p9_fid *dfid;
+	struct p9_fid *fid = NULL;
+	struct v9fs_session_info *v9ses;
+
+	name = (char *) dentry->d_name.name;
+	p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
+	v9ses = v9fs_inode2v9ses(dir);
+
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return err;
+	}
+
+	gid = v9fs_get_fsgid_for_create(dir);
+
+	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
+	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
+
+	if (err < 0) {
+		p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+		goto error;
+	}
+
+	v9fs_invalidate_inode_attr(dir);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		/* Now walk from the parent so we can get an unopened fid. */
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+				 err);
+			fid = NULL;
+			goto error;
+		}
+
+		/* instantiate inode and assign the unopened fid to dentry */
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+				 err);
+			goto error;
+		}
+		v9fs_fid_add(dentry, fid);
+		d_instantiate(dentry, inode);
+		fid = NULL;
+		err = 0;
+	} else {
+		/* Not in cached mode. No need to populate inode with stat */
+		inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
+		d_instantiate(dentry, inode);
+	}
+
+error:
+	if (fid)
+		p9_client_clunk(fid);
+
+	return err;
+}
+
+/**
+ * v9fs_vfs_link_dotl - create a hardlink for dotl
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ */
+
+static int
+v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	int err;
+	struct dentry *dir_dentry;
+	struct p9_fid *dfid, *oldfid;
+	struct v9fs_session_info *v9ses;
+
+	p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n",
+		 dir->i_ino, old_dentry, dentry);
+
+	v9ses = v9fs_inode2v9ses(dir);
+	dir_dentry = dentry->d_parent;
+	dfid = v9fs_fid_lookup(dir_dentry);
+	if (IS_ERR(dfid))
+		return PTR_ERR(dfid);
+
+	oldfid = v9fs_fid_lookup(old_dentry);
+	if (IS_ERR(oldfid))
+		return PTR_ERR(oldfid);
+
+	err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
+
+	if (err < 0) {
+		p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+		return err;
+	}
+
+	v9fs_invalidate_inode_attr(dir);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		/* Get the latest stat info from server. */
+		struct p9_fid *fid;
+		fid = v9fs_fid_lookup(old_dentry);
+		if (IS_ERR(fid))
+			return PTR_ERR(fid);
+
+		v9fs_refresh_inode_dotl(fid, d_inode(old_dentry));
+	}
+	ihold(d_inode(old_dentry));
+	d_instantiate(dentry, d_inode(old_dentry));
+
+	return err;
+}
+
+/**
+ * v9fs_vfs_mknod_dotl - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @omode: mode for creation
+ * @rdev: device associated with special file
+ *
+ */
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
+		dev_t rdev)
+{
+	int err;
+	kgid_t gid;
+	char *name;
+	umode_t mode;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid = NULL, *dfid = NULL;
+	struct inode *inode;
+	struct p9_qid qid;
+	struct dentry *dir_dentry;
+	struct posix_acl *dacl = NULL, *pacl = NULL;
+
+	p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+		 dir->i_ino, dentry, omode,
+		 MAJOR(rdev), MINOR(rdev));
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dir);
+	dir_dentry = dentry->d_parent;
+	dfid = v9fs_fid_lookup(dir_dentry);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		dfid = NULL;
+		goto error;
+	}
+
+	gid = v9fs_get_fsgid_for_create(dir);
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n",
+			 err);
+		goto error;
+	}
+	name = (char *) dentry->d_name.name;
+
+	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
+	if (err < 0)
+		goto error;
+
+	v9fs_invalidate_inode_attr(dir);
+	fid = p9_client_walk(dfid, 1, &name, 1);
+	if (IS_ERR(fid)) {
+		err = PTR_ERR(fid);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+			 err);
+		fid = NULL;
+		goto error;
+	}
+
+	/* instantiate inode and assign the unopened fid to the dentry */
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+				 err);
+			goto error;
+		}
+		v9fs_set_create_acl(inode, fid, dacl, pacl);
+		v9fs_fid_add(dentry, fid);
+		d_instantiate(dentry, inode);
+		fid = NULL;
+		err = 0;
+	} else {
+		/*
+		 * Not in cached mode. No need to populate inode with stat.
+		 * socket syscall returns a fd, so we need instantiate
+		 */
+		inode = v9fs_get_inode(dir->i_sb, mode, rdev);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
+		v9fs_set_create_acl(inode, fid, dacl, pacl);
+		d_instantiate(dentry, inode);
+	}
+error:
+	if (fid)
+		p9_client_clunk(fid);
+	v9fs_put_acl(dacl, pacl);
+	return err;
+}
+
+/**
+ * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+{
+	int retval;
+	struct p9_fid *fid;
+	char *link = __getname();
+	char *target;
+
+	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+
+	if (!link) {
+		link = ERR_PTR(-ENOMEM);
+		goto ndset;
+	}
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid)) {
+		__putname(link);
+		link = ERR_CAST(fid);
+		goto ndset;
+	}
+	retval = p9_client_readlink(fid, &target);
+	if (!retval) {
+		strcpy(link, target);
+		kfree(target);
+		goto ndset;
+	}
+	__putname(link);
+	link = ERR_PTR(retval);
+ndset:
+	nd_set_link(nd, link);
+	return NULL;
+}
+
+int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
+{
+	loff_t i_size;
+	struct p9_stat_dotl *st;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+	if (IS_ERR(st))
+		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+		goto out;
+
+	spin_lock(&inode->i_lock);
+	/*
+	 * We don't want to refresh inode->i_size,
+	 * because we may have cached data
+	 */
+	i_size = inode->i_size;
+	v9fs_stat2inode_dotl(st, inode);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		inode->i_size = i_size;
+	spin_unlock(&inode->i_lock);
+out:
+	kfree(st);
+	return 0;
+}
+
+const struct inode_operations v9fs_dir_inode_operations_dotl = {
+	.create = v9fs_vfs_create_dotl,
+	.atomic_open = v9fs_vfs_atomic_open_dotl,
+	.lookup = v9fs_vfs_lookup,
+	.link = v9fs_vfs_link_dotl,
+	.symlink = v9fs_vfs_symlink_dotl,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir_dotl,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod_dotl,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr_dotl,
+	.setattr = v9fs_vfs_setattr_dotl,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = v9fs_listxattr,
+	.get_acl = v9fs_iop_get_acl,
+};
+
+const struct inode_operations v9fs_file_inode_operations_dotl = {
+	.getattr = v9fs_vfs_getattr_dotl,
+	.setattr = v9fs_vfs_setattr_dotl,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = v9fs_listxattr,
+	.get_acl = v9fs_iop_get_acl,
+};
+
+const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+	.readlink = generic_readlink,
+	.follow_link = v9fs_vfs_follow_link_dotl,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr_dotl,
+	.setattr = v9fs_vfs_setattr_dotl,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = v9fs_listxattr,
+};
diff --git a/kernel/fs/9p/vfs_super.c b/kernel/fs/9p/vfs_super.c
new file mode 100644
index 000000000..e99a338a4
--- /dev/null
+++ b/kernel/fs/9p/vfs_super.c
@@ -0,0 +1,370 @@
+/*
+ *  linux/fs/9p/vfs_super.c
+ *
+ * This file contians superblock ops for 9P2000. It is intended that
+ * you mount this file system on directories.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/inet.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/idr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/magic.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+#include "xattr.h"
+#include "acl.h"
+
+static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
+
+/**
+ * v9fs_set_super - set the superblock
+ * @s: super block
+ * @data: file system specific data
+ *
+ */
+
+static int v9fs_set_super(struct super_block *s, void *data)
+{
+	s->s_fs_info = data;
+	return set_anon_super(s, data);
+}
+
+/**
+ * v9fs_fill_super - populate superblock with info
+ * @sb: superblock
+ * @v9ses: session information
+ * @flags: flags propagated from v9fs_mount()
+ *
+ */
+
+static void
+v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
+		int flags, void *data)
+{
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
+	sb->s_blocksize = 1 << sb->s_blocksize_bits;
+	sb->s_magic = V9FS_MAGIC;
+	if (v9fs_proto_dotl(v9ses)) {
+		sb->s_op = &v9fs_super_ops_dotl;
+		sb->s_xattr = v9fs_xattr_handlers;
+	} else
+		sb->s_op = &v9fs_super_ops;
+	sb->s_bdi = &v9ses->bdi;
+	if (v9ses->cache)
+		sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
+
+	sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
+	if (!v9ses->cache)
+		sb->s_flags |= MS_SYNCHRONOUS;
+
+#ifdef CONFIG_9P_FS_POSIX_ACL
+	if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
+		sb->s_flags |= MS_POSIXACL;
+#endif
+
+	save_mount_options(sb, data);
+}
+
+/**
+ * v9fs_mount - mount a superblock
+ * @fs_type: file system type
+ * @flags: mount flags
+ * @dev_name: device name that was mounted
+ * @data: mount options
+ *
+ */
+
+static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
+{
+	struct super_block *sb = NULL;
+	struct inode *inode = NULL;
+	struct dentry *root = NULL;
+	struct v9fs_session_info *v9ses = NULL;
+	umode_t mode = S_IRWXUGO | S_ISVTX;
+	struct p9_fid *fid;
+	int retval = 0;
+
+	p9_debug(P9_DEBUG_VFS, "\n");
+
+	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
+	if (!v9ses)
+		return ERR_PTR(-ENOMEM);
+
+	fid = v9fs_session_init(v9ses, dev_name, data);
+	if (IS_ERR(fid)) {
+		retval = PTR_ERR(fid);
+		/*
+		 * we need to call session_close to tear down some
+		 * of the data structure setup by session_init
+		 */
+		goto close_session;
+	}
+
+	sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
+	if (IS_ERR(sb)) {
+		retval = PTR_ERR(sb);
+		goto clunk_fid;
+	}
+	v9fs_fill_super(sb, v9ses, flags, data);
+
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		sb->s_d_op = &v9fs_cached_dentry_operations;
+	else
+		sb->s_d_op = &v9fs_dentry_operations;
+
+	inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
+	if (IS_ERR(inode)) {
+		retval = PTR_ERR(inode);
+		goto release_sb;
+	}
+
+	root = d_make_root(inode);
+	if (!root) {
+		retval = -ENOMEM;
+		goto release_sb;
+	}
+	sb->s_root = root;
+	if (v9fs_proto_dotl(v9ses)) {
+		struct p9_stat_dotl *st = NULL;
+		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+		if (IS_ERR(st)) {
+			retval = PTR_ERR(st);
+			goto release_sb;
+		}
+		d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
+		v9fs_stat2inode_dotl(st, d_inode(root));
+		kfree(st);
+	} else {
+		struct p9_wstat *st = NULL;
+		st = p9_client_stat(fid);
+		if (IS_ERR(st)) {
+			retval = PTR_ERR(st);
+			goto release_sb;
+		}
+
+		d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
+		v9fs_stat2inode(st, d_inode(root), sb);
+
+		p9stat_free(st);
+		kfree(st);
+	}
+	retval = v9fs_get_acl(inode, fid);
+	if (retval)
+		goto release_sb;
+	v9fs_fid_add(root, fid);
+
+	p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
+	return dget(sb->s_root);
+
+clunk_fid:
+	p9_client_clunk(fid);
+close_session:
+	v9fs_session_close(v9ses);
+	kfree(v9ses);
+	return ERR_PTR(retval);
+
+release_sb:
+	/*
+	 * we will do the session_close and root dentry release
+	 * in the below call. But we need to clunk fid, because we haven't
+	 * attached the fid to dentry so it won't get clunked
+	 * automatically.
+	 */
+	p9_client_clunk(fid);
+	deactivate_locked_super(sb);
+	return ERR_PTR(retval);
+}
+
+/**
+ * v9fs_kill_super - Kill Superblock
+ * @s: superblock
+ *
+ */
+
+static void v9fs_kill_super(struct super_block *s)
+{
+	struct v9fs_session_info *v9ses = s->s_fs_info;
+
+	p9_debug(P9_DEBUG_VFS, " %p\n", s);
+
+	kill_anon_super(s);
+
+	v9fs_session_cancel(v9ses);
+	v9fs_session_close(v9ses);
+	kfree(v9ses);
+	s->s_fs_info = NULL;
+	p9_debug(P9_DEBUG_VFS, "exiting kill_super\n");
+}
+
+static void
+v9fs_umount_begin(struct super_block *sb)
+{
+	struct v9fs_session_info *v9ses;
+
+	v9ses = sb->s_fs_info;
+	v9fs_session_begin_cancel(v9ses);
+}
+
+static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_rstatfs rs;
+	int res;
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid)) {
+		res = PTR_ERR(fid);
+		goto done;
+	}
+
+	v9ses = v9fs_dentry2v9ses(dentry);
+	if (v9fs_proto_dotl(v9ses)) {
+		res = p9_client_statfs(fid, &rs);
+		if (res == 0) {
+			buf->f_type = rs.type;
+			buf->f_bsize = rs.bsize;
+			buf->f_blocks = rs.blocks;
+			buf->f_bfree = rs.bfree;
+			buf->f_bavail = rs.bavail;
+			buf->f_files = rs.files;
+			buf->f_ffree = rs.ffree;
+			buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
+			buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+			buf->f_namelen = rs.namelen;
+		}
+		if (res != -ENOSYS)
+			goto done;
+	}
+	res = simple_statfs(dentry, buf);
+done:
+	return res;
+}
+
+static int v9fs_drop_inode(struct inode *inode)
+{
+	struct v9fs_session_info *v9ses;
+	v9ses = v9fs_inode2v9ses(inode);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		return generic_drop_inode(inode);
+	/*
+	 * in case of non cached mode always drop the
+	 * the inode because we want the inode attribute
+	 * to always match that on the server.
+	 */
+	return 1;
+}
+
+static int v9fs_write_inode(struct inode *inode,
+			    struct writeback_control *wbc)
+{
+	int ret;
+	struct p9_wstat wstat;
+	struct v9fs_inode *v9inode;
+	/*
+	 * send an fsync request to server irrespective of
+	 * wbc->sync_mode.
+	 */
+	p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+	v9inode = V9FS_I(inode);
+	if (!v9inode->writeback_fid)
+		return 0;
+	v9fs_blank_wstat(&wstat);
+
+	ret = p9_client_wstat(v9inode->writeback_fid, &wstat);
+	if (ret < 0) {
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		return ret;
+	}
+	return 0;
+}
+
+static int v9fs_write_inode_dotl(struct inode *inode,
+				 struct writeback_control *wbc)
+{
+	int ret;
+	struct v9fs_inode *v9inode;
+	/*
+	 * send an fsync request to server irrespective of
+	 * wbc->sync_mode.
+	 */
+	v9inode = V9FS_I(inode);
+	p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n",
+		 __func__, inode, v9inode->writeback_fid);
+	if (!v9inode->writeback_fid)
+		return 0;
+
+	ret = p9_client_fsync(v9inode->writeback_fid, 0);
+	if (ret < 0) {
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		return ret;
+	}
+	return 0;
+}
+
+static const struct super_operations v9fs_super_ops = {
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+	.statfs = simple_statfs,
+	.evict_inode = v9fs_evict_inode,
+	.show_options = generic_show_options,
+	.umount_begin = v9fs_umount_begin,
+	.write_inode = v9fs_write_inode,
+};
+
+static const struct super_operations v9fs_super_ops_dotl = {
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+	.statfs = v9fs_statfs,
+	.drop_inode = v9fs_drop_inode,
+	.evict_inode = v9fs_evict_inode,
+	.show_options = generic_show_options,
+	.umount_begin = v9fs_umount_begin,
+	.write_inode = v9fs_write_inode_dotl,
+};
+
+struct file_system_type v9fs_fs_type = {
+	.name = "9p",
+	.mount = v9fs_mount,
+	.kill_sb = v9fs_kill_super,
+	.owner = THIS_MODULE,
+	.fs_flags = FS_RENAME_DOES_D_MOVE,
+};
+MODULE_ALIAS_FS("9p");
diff --git a/kernel/fs/9p/xattr.c b/kernel/fs/9p/xattr.c
new file mode 100644
index 000000000..0cf44b6cc
--- /dev/null
+++ b/kernel/fs/9p/xattr.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/uio.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "fid.h"
+#include "xattr.h"
+
+ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
+			   void *buffer, size_t buffer_size)
+{
+	ssize_t retval;
+	u64 attr_size;
+	struct p9_fid *attr_fid;
+	struct kvec kvec = {.iov_base = buffer, .iov_len = buffer_size};
+	struct iov_iter to;
+	int err;
+
+	iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size);
+
+	attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
+	if (IS_ERR(attr_fid)) {
+		retval = PTR_ERR(attr_fid);
+		p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n",
+			 retval);
+		return retval;
+	}
+	if (attr_size > buffer_size) {
+		if (!buffer_size) /* request to get the attr_size */
+			retval = attr_size;
+		else
+			retval = -ERANGE;
+	} else {
+		iov_iter_truncate(&to, attr_size);
+		retval = p9_client_read(attr_fid, 0, &to, &err);
+		if (err)
+			retval = err;
+	}
+	p9_client_clunk(attr_fid);
+	return retval;
+}
+
+
+/*
+ * v9fs_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+		       void *buffer, size_t buffer_size)
+{
+	struct p9_fid *fid;
+
+	p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
+		 name, buffer_size);
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	return v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
+}
+
+/*
+ * v9fs_xattr_set()
+ *
+ * Create, replace or remove an extended attribute for this inode. Buffer
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int v9fs_xattr_set(struct dentry *dentry, const char *name,
+		   const void *value, size_t value_len, int flags)
+{
+	struct p9_fid *fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+	return v9fs_fid_xattr_set(fid, name, value, value_len, flags);
+}
+
+int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
+		   const void *value, size_t value_len, int flags)
+{
+	struct kvec kvec = {.iov_base = (void *)value, .iov_len = value_len};
+	struct iov_iter from;
+	int retval;
+
+	iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len);
+
+	p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
+		 name, value_len, flags);
+
+	/* Clone it */
+	fid = p9_client_walk(fid, 0, NULL, 1);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	/*
+	 * On success fid points to xattr
+	 */
+	retval = p9_client_xattrcreate(fid, name, value_len, flags);
+	if (retval < 0)
+		p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
+			 retval);
+	else
+		p9_client_write(fid, 0, &from, &retval);
+	p9_client_clunk(fid);
+	return retval;
+}
+
+ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
+}
+
+const struct xattr_handler *v9fs_xattr_handlers[] = {
+	&v9fs_xattr_user_handler,
+	&v9fs_xattr_trusted_handler,
+#ifdef CONFIG_9P_FS_POSIX_ACL
+	&v9fs_xattr_acl_access_handler,
+	&v9fs_xattr_acl_default_handler,
+#endif
+#ifdef CONFIG_9P_FS_SECURITY
+	&v9fs_xattr_security_handler,
+#endif
+	NULL
+};
diff --git a/kernel/fs/9p/xattr.h b/kernel/fs/9p/xattr.h
new file mode 100644
index 000000000..d3e2ea384
--- /dev/null
+++ b/kernel/fs/9p/xattr.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_XATTR_H
+#define FS_9P_XATTR_H
+
+#include <linux/xattr.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+extern const struct xattr_handler *v9fs_xattr_handlers[];
+extern struct xattr_handler v9fs_xattr_user_handler;
+extern struct xattr_handler v9fs_xattr_trusted_handler;
+extern struct xattr_handler v9fs_xattr_security_handler;
+extern const struct xattr_handler v9fs_xattr_acl_access_handler;
+extern const struct xattr_handler v9fs_xattr_acl_default_handler;
+
+extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
+				  void *, size_t);
+extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
+			      void *, size_t);
+extern int v9fs_fid_xattr_set(struct p9_fid *, const char *,
+			  const void *, size_t, int);
+extern int v9fs_xattr_set(struct dentry *, const char *,
+			  const void *, size_t, int);
+extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
+#endif /* FS_9P_XATTR_H */
diff --git a/kernel/fs/9p/xattr_security.c b/kernel/fs/9p/xattr_security.c
new file mode 100644
index 000000000..cb247a142
--- /dev/null
+++ b/kernel/fs/9p/xattr_security.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
+			void *buffer, size_t size, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+	memcpy(full_name+prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+	kfree(full_name);
+	return retval;
+}
+
+static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+	memcpy(full_name + prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+	kfree(full_name);
+	return retval;
+}
+
+struct xattr_handler v9fs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.get	= v9fs_xattr_security_get,
+	.set	= v9fs_xattr_security_set,
+};
diff --git a/kernel/fs/9p/xattr_trusted.c b/kernel/fs/9p/xattr_trusted.c
new file mode 100644
index 000000000..e30d33b8a
--- /dev/null
+++ b/kernel/fs/9p/xattr_trusted.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
+			void *buffer, size_t size, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+	memcpy(full_name+prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+	kfree(full_name);
+	return retval;
+}
+
+static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+	memcpy(full_name + prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+	kfree(full_name);
+	return retval;
+}
+
+struct xattr_handler v9fs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.get	= v9fs_xattr_trusted_get,
+	.set	= v9fs_xattr_trusted_set,
+};
diff --git a/kernel/fs/9p/xattr_user.c b/kernel/fs/9p/xattr_user.c
new file mode 100644
index 000000000..d0b701b72
--- /dev/null
+++ b/kernel/fs/9p/xattr_user.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
+			void *buffer, size_t size, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_USER_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
+	memcpy(full_name+prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+	kfree(full_name);
+	return retval;
+}
+
+static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_USER_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
+	memcpy(full_name + prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+	kfree(full_name);
+	return retval;
+}
+
+struct xattr_handler v9fs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.get	= v9fs_xattr_user_get,
+	.set	= v9fs_xattr_user_set,
+};
diff --git a/kernel/fs/Kconfig b/kernel/fs/Kconfig
new file mode 100644
index 000000000..011f43365
--- /dev/null
+++ b/kernel/fs/Kconfig
@@ -0,0 +1,281 @@
+#
+# File system configuration
+#
+
+menu "File systems"
+
+# Use unaligned word dcache accesses
+config DCACHE_WORD_ACCESS
+       bool
+
+if BLOCK
+
+source "fs/ext2/Kconfig"
+source "fs/ext3/Kconfig"
+source "fs/ext4/Kconfig"
+source "fs/jbd/Kconfig"
+source "fs/jbd2/Kconfig"
+
+config FS_MBCACHE
+# Meta block cache for Extended Attributes (ext2/ext3/ext4)
+	tristate
+	default y if EXT2_FS=y && EXT2_FS_XATTR
+	default y if EXT3_FS=y && EXT3_FS_XATTR
+	default y if EXT4_FS=y
+	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
+
+source "fs/reiserfs/Kconfig"
+source "fs/jfs/Kconfig"
+
+source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
+source "fs/ocfs2/Kconfig"
+source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
+source "fs/f2fs/Kconfig"
+
+config FS_DAX
+	bool "Direct Access (DAX) support"
+	depends on MMU
+	depends on !(ARM || MIPS || SPARC)
+	help
+	  Direct Access (DAX) can be used on memory-backed block devices.
+	  If the block device supports DAX and the filesystem supports DAX,
+	  then you can avoid using the pagecache to buffer I/Os.  Turning
+	  on this option will compile in support for DAX; you will need to
+	  mount the filesystem using the -o dax option.
+
+	  If you do not have a block device that is capable of using this,
+	  or if unsure, say N.  Saying Y will increase the size of the kernel
+	  by about 5kB.
+
+endif # BLOCK
+
+# Posix ACL utility routines
+#
+# Note: Posix ACLs can be implemented without these helpers.  Never use
+# this symbol for ifdefs in core code.
+#
+config FS_POSIX_ACL
+	def_bool n
+
+config EXPORTFS
+	tristate
+
+config FILE_LOCKING
+	bool "Enable POSIX file locking API" if EXPERT
+	default y
+	help
+	  This option enables standard file locking support, required
+          for filesystems like NFS and for the flock() system
+          call. Disabling this option saves about 11k.
+
+source "fs/notify/Kconfig"
+
+source "fs/quota/Kconfig"
+
+source "fs/autofs4/Kconfig"
+source "fs/fuse/Kconfig"
+source "fs/overlayfs/Kconfig"
+
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
+if BLOCK
+menu "CD-ROM/DVD Filesystems"
+
+source "fs/isofs/Kconfig"
+source "fs/udf/Kconfig"
+
+endmenu
+endif # BLOCK
+
+if BLOCK
+menu "DOS/FAT/NT Filesystems"
+
+source "fs/fat/Kconfig"
+source "fs/ntfs/Kconfig"
+
+endmenu
+endif # BLOCK
+
+menu "Pseudo filesystems"
+
+source "fs/proc/Kconfig"
+source "fs/kernfs/Kconfig"
+source "fs/sysfs/Kconfig"
+
+config TMPFS
+	bool "Tmpfs virtual memory file system support (former shm fs)"
+	depends on SHMEM
+	help
+	  Tmpfs is a file system which keeps all files in virtual memory.
+
+	  Everything in tmpfs is temporary in the sense that no files will be
+	  created on your hard drive. The files live in memory and swap
+	  space. If you unmount a tmpfs instance, everything stored therein is
+	  lost.
+
+	  See <file:Documentation/filesystems/tmpfs.txt> for details.
+
+config TMPFS_POSIX_ACL
+	bool "Tmpfs POSIX Access Control Lists"
+	depends on TMPFS
+	select TMPFS_XATTR
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support additional access rights
+	  for users and groups beyond the standard owner/group/world scheme,
+	  and this option selects support for ACLs specifically for tmpfs
+	  filesystems.
+
+	  If you've selected TMPFS, it's possible that you'll also need
+	  this option as there are a number of Linux distros that require
+	  POSIX ACL support under /dev for certain features to work properly.
+	  For example, some distros need this feature for ALSA-related /dev
+	  files for sound to work properly.  In short, if you're not sure,
+	  say Y.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+config TMPFS_XATTR
+	bool "Tmpfs extended attributes"
+	depends on TMPFS
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  Currently this enables support for the trusted.* and
+	  security.* namespaces.
+
+	  You need this for POSIX ACL support on tmpfs.
+
+	  If unsure, say N.
+
+config HUGETLBFS
+	bool "HugeTLB file system support"
+	depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
+		   SYS_SUPPORTS_HUGETLBFS || BROKEN
+	help
+	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
+	  ramfs. For architectures that support it, say Y here and read
+	  <file:Documentation/vm/hugetlbpage.txt> for details.
+
+	  If unsure, say N.
+
+config HUGETLB_PAGE
+	def_bool HUGETLBFS
+
+source "fs/configfs/Kconfig"
+source "fs/efivarfs/Kconfig"
+
+endmenu
+
+menuconfig MISC_FILESYSTEMS
+	bool "Miscellaneous filesystems"
+	default y
+	---help---
+	  Say Y here to get to see options for various miscellaneous
+	  filesystems, such as filesystems that came from other
+	  operating systems.
+
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled; if unsure, say Y here.
+
+if MISC_FILESYSTEMS
+
+source "fs/adfs/Kconfig"
+source "fs/affs/Kconfig"
+source "fs/ecryptfs/Kconfig"
+source "fs/hfs/Kconfig"
+source "fs/hfsplus/Kconfig"
+source "fs/befs/Kconfig"
+source "fs/bfs/Kconfig"
+source "fs/efs/Kconfig"
+source "fs/jffs2/Kconfig"
+# UBIFS File system configuration
+source "fs/ubifs/Kconfig"
+source "fs/logfs/Kconfig"
+source "fs/cramfs/Kconfig"
+source "fs/squashfs/Kconfig"
+source "fs/freevxfs/Kconfig"
+source "fs/minix/Kconfig"
+source "fs/omfs/Kconfig"
+source "fs/hpfs/Kconfig"
+source "fs/qnx4/Kconfig"
+source "fs/qnx6/Kconfig"
+source "fs/romfs/Kconfig"
+source "fs/pstore/Kconfig"
+source "fs/sysv/Kconfig"
+source "fs/ufs/Kconfig"
+source "fs/exofs/Kconfig"
+
+endif # MISC_FILESYSTEMS
+
+source "fs/exofs/Kconfig.ore"
+
+menuconfig NETWORK_FILESYSTEMS
+	bool "Network File Systems"
+	default y
+	depends on NET
+	---help---
+	  Say Y here to get to see options for network filesystems and
+	  filesystem-related networking code, such as NFS daemon and
+	  RPCSEC security modules.
+
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled; if unsure, say Y here.
+
+if NETWORK_FILESYSTEMS
+
+source "fs/nfs/Kconfig"
+source "fs/nfsd/Kconfig"
+
+config GRACE_PERIOD
+	tristate
+
+config LOCKD
+	tristate
+	depends on FILE_LOCKING
+	select GRACE_PERIOD
+
+config LOCKD_V4
+	bool
+	depends on NFSD_V3 || NFS_V3
+	depends on FILE_LOCKING
+	default y
+
+config NFS_ACL_SUPPORT
+	tristate
+	select FS_POSIX_ACL
+
+config NFS_COMMON
+	bool
+	depends on NFSD || NFS_FS || LOCKD
+	default y
+
+source "net/sunrpc/Kconfig"
+source "fs/ceph/Kconfig"
+source "fs/cifs/Kconfig"
+source "fs/ncpfs/Kconfig"
+source "fs/coda/Kconfig"
+source "fs/afs/Kconfig"
+source "fs/9p/Kconfig"
+
+endif # NETWORK_FILESYSTEMS
+
+source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
+
+endmenu
diff --git a/kernel/fs/Kconfig.binfmt b/kernel/fs/Kconfig.binfmt
new file mode 100644
index 000000000..2d0cbbd14
--- /dev/null
+++ b/kernel/fs/Kconfig.binfmt
@@ -0,0 +1,181 @@
+config BINFMT_ELF
+	bool "Kernel support for ELF binaries"
+	depends on MMU && (BROKEN || !FRV)
+	default y
+	---help---
+	  ELF (Executable and Linkable Format) is a format for libraries and
+	  executables used across different architectures and operating
+	  systems. Saying Y here will enable your kernel to run ELF binaries
+	  and enlarge it by about 13 KB. ELF support under Linux has now all
+	  but replaced the traditional Linux a.out formats (QMAGIC and ZMAGIC)
+	  because it is portable (this does *not* mean that you will be able
+	  to run executables from different architectures or operating systems
+	  however) and makes building run-time libraries very easy. Many new
+	  executables are distributed solely in ELF format. You definitely
+	  want to say Y here.
+
+	  Information about ELF is contained in the ELF HOWTO available from
+	  <http://www.tldp.org/docs.html#howto>.
+
+	  If you find that after upgrading from Linux kernel 1.2 and saying Y
+	  here, you still can't run any ELF binaries (they just crash), then
+	  you'll have to install the newest ELF runtime libraries, including
+	  ld.so (check the file <file:Documentation/Changes> for location and
+	  latest version).
+
+config COMPAT_BINFMT_ELF
+	bool
+	depends on COMPAT && BINFMT_ELF
+
+config ARCH_BINFMT_ELF_STATE
+	bool
+
+config BINFMT_ELF_FDPIC
+	bool "Kernel support for FDPIC ELF binaries"
+	default y
+	depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
+	help
+	  ELF FDPIC binaries are based on ELF, but allow the individual load
+	  segments of a binary to be located in memory independently of each
+	  other. This makes this format ideal for use in environments where no
+	  MMU is available as it still permits text segments to be shared,
+	  even if data segments are not.
+
+	  It is also possible to run FDPIC ELF binaries on MMU linux also.
+
+config CORE_DUMP_DEFAULT_ELF_HEADERS
+	bool "Write ELF core dumps with partial segments"
+	default y
+	depends on BINFMT_ELF && ELF_CORE
+	help
+	  ELF core dump files describe each memory mapping of the crashed
+	  process, and can contain or omit the memory contents of each one.
+	  The contents of an unmodified text mapping are omitted by default.
+
+	  For an unmodified text mapping of an ELF object, including just
+	  the first page of the file in a core dump makes it possible to
+	  identify the build ID bits in the file, without paying the i/o
+	  cost and disk space to dump all the text.  However, versions of
+	  GDB before 6.7 are confused by ELF core dump files in this format.
+
+	  The core dump behavior can be controlled per process using
+	  the /proc/PID/coredump_filter pseudo-file; this setting is
+	  inherited.  See Documentation/filesystems/proc.txt for details.
+
+	  This config option changes the default setting of coredump_filter
+	  seen at boot time.  If unsure, say Y.
+
+config BINFMT_SCRIPT
+	tristate "Kernel support for scripts starting with #!"
+	default y
+	help
+	  Say Y here if you want to execute interpreted scripts starting with
+	  #! followed by the path to an interpreter.
+
+	  You can build this support as a module; however, until that module
+	  gets loaded, you cannot run scripts.  Thus, if you want to load this
+	  module from an initramfs, the portion of the initramfs before loading
+	  this module must consist of compiled binaries only.
+
+	  Most systems will not boot if you say M or N here.  If unsure, say Y.
+
+config BINFMT_FLAT
+	bool "Kernel support for flat binaries"
+	depends on !MMU && (!FRV || BROKEN)
+	help
+	  Support uClinux FLAT format binaries.
+
+config BINFMT_ZFLAT
+	bool "Enable ZFLAT support"
+	depends on BINFMT_FLAT
+	select ZLIB_INFLATE
+	help
+	  Support FLAT format compressed binaries
+
+config BINFMT_SHARED_FLAT
+	bool "Enable shared FLAT support"
+	depends on BINFMT_FLAT
+	help
+	  Support FLAT shared libraries
+
+config HAVE_AOUT
+       def_bool n
+
+config BINFMT_AOUT
+	tristate "Kernel support for a.out and ECOFF binaries"
+	depends on HAVE_AOUT
+	---help---
+	  A.out (Assembler.OUTput) is a set of formats for libraries and
+	  executables used in the earliest versions of UNIX.  Linux used
+	  the a.out formats QMAGIC and ZMAGIC until they were replaced
+	  with the ELF format.
+
+	  The conversion to ELF started in 1995.  This option is primarily
+	  provided for historical interest and for the benefit of those
+	  who need to run binaries from that era.
+
+	  Most people should answer N here.  If you think you may have
+	  occasional use for this format, enable module support above
+	  and answer M here to compile this support as a module called
+	  binfmt_aout.
+
+	  If any crucial components of your system (such as /sbin/init
+	  or /lib/ld.so) are still in a.out format, you will have to
+	  say Y here.
+
+config OSF4_COMPAT
+	bool "OSF/1 v4 readv/writev compatibility"
+	depends on ALPHA && BINFMT_AOUT
+	help
+	  Say Y if you are using OSF/1 binaries (like Netscape and Acrobat)
+	  with v4 shared libraries freely available from Compaq. If you're
+	  going to use shared libraries from Tru64 version 5.0 or later, say N.
+
+config BINFMT_EM86
+	tristate "Kernel support for Linux/Intel ELF binaries"
+	depends on ALPHA
+	---help---
+	  Say Y here if you want to be able to execute Linux/Intel ELF
+	  binaries just like native Alpha binaries on your Alpha machine. For
+	  this to work, you need to have the emulator /usr/bin/em86 in place.
+
+	  You can get the same functionality by saying N here and saying Y to
+	  "Kernel support for MISC binaries".
+
+	  You may answer M to compile the emulation support as a module and
+	  later load the module when you want to use a Linux/Intel binary. The
+	  module will be called binfmt_em86. If unsure, say Y.
+
+config BINFMT_MISC
+	tristate "Kernel support for MISC binaries"
+	---help---
+	  If you say Y here, it will be possible to plug wrapper-driven binary
+	  formats into the kernel. You will like this especially when you use
+	  programs that need an interpreter to run like Java, Python, .NET or
+	  Emacs-Lisp. It's also useful if you often run DOS executables under
+	  the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>). Once you have
+	  registered such a binary class with the kernel, you can start one of
+	  those programs simply by typing in its name at a shell prompt; Linux
+	  will automatically feed it to the correct interpreter.
+
+	  You can do other nice things, too. Read the file
+	  <file:Documentation/binfmt_misc.txt> to learn how to use this
+	  feature, <file:Documentation/java.txt> for information about how
+	  to include Java support. and <file:Documentation/mono.txt> for
+          information about how to include Mono-based .NET support.
+
+          To use binfmt_misc, you will need to mount it:
+		mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc
+
+	  You may say M here for module support and later load the module when
+	  you have use for it; the module is called binfmt_misc. If you
+	  don't know what to answer at this point, say Y.
+
+config COREDUMP
+	bool "Enable core dump support" if EXPERT
+	default y
+	help
+	  This option enables support for performing core dumps. You almost
+	  certainly want to say Y here. Not necessary on systems that never
+	  need debugging or only ever run flawless code.
diff --git a/kernel/fs/Makefile b/kernel/fs/Makefile
new file mode 100644
index 000000000..cb92fd4c3
--- /dev/null
+++ b/kernel/fs/Makefile
@@ -0,0 +1,129 @@
+#
+# Makefile for the Linux filesystems.
+#
+# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+# 
+
+obj-y :=	open.o read_write.o file_table.o super.o \
+		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
+		ioctl.o readdir.o select.o dcache.o inode.o \
+		attr.o bad_inode.o file.o filesystems.o namespace.o \
+		seq_file.o xattr.o libfs.o fs-writeback.o \
+		pnode.o splice.o sync.o utimes.o \
+		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+
+ifeq ($(CONFIG_BLOCK),y)
+obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
+else
+obj-y +=	no-block.o
+endif
+
+obj-$(CONFIG_PROC_FS) += proc_namespace.o
+
+obj-y				+= notify/
+obj-$(CONFIG_EPOLL)		+= eventpoll.o
+obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
+obj-$(CONFIG_SIGNALFD)		+= signalfd.o
+obj-$(CONFIG_TIMERFD)		+= timerfd.o
+obj-$(CONFIG_EVENTFD)		+= eventfd.o
+obj-$(CONFIG_AIO)               += aio.o
+obj-$(CONFIG_FS_DAX)		+= dax.o
+obj-$(CONFIG_FILE_LOCKING)      += locks.o
+obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
+obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
+obj-$(CONFIG_BINFMT_EM86)	+= binfmt_em86.o
+obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
+obj-$(CONFIG_BINFMT_SCRIPT)	+= binfmt_script.o
+obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
+obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
+obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
+
+obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
+obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
+obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
+obj-$(CONFIG_COREDUMP)		+= coredump.o
+obj-$(CONFIG_SYSCTL)		+= drop_caches.o
+
+obj-$(CONFIG_FHANDLE)		+= fhandle.o
+
+obj-y				+= quota/
+
+obj-$(CONFIG_PROC_FS)		+= proc/
+obj-$(CONFIG_KERNFS)		+= kernfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
+obj-y				+= devpts/
+
+obj-$(CONFIG_PROFILING)		+= dcookies.o
+obj-$(CONFIG_DLM)		+= dlm/
+ 
+# Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE)		+= fscache/
+obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
+obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
+obj-$(CONFIG_EXT2_FS)		+= ext2/
+# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
+# unless explicitly requested by rootfstype
+obj-$(CONFIG_EXT4_FS)		+= ext4/
+obj-$(CONFIG_JBD)		+= jbd/
+obj-$(CONFIG_JBD2)		+= jbd2/
+obj-$(CONFIG_CRAMFS)		+= cramfs/
+obj-$(CONFIG_SQUASHFS)		+= squashfs/
+obj-y				+= ramfs/
+obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
+obj-$(CONFIG_CODA_FS)		+= coda/
+obj-$(CONFIG_MINIX_FS)		+= minix/
+obj-$(CONFIG_FAT_FS)		+= fat/
+obj-$(CONFIG_BFS_FS)		+= bfs/
+obj-$(CONFIG_ISO9660_FS)	+= isofs/
+obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
+obj-$(CONFIG_HFS_FS)		+= hfs/
+obj-$(CONFIG_ECRYPT_FS)		+= ecryptfs/
+obj-$(CONFIG_VXFS_FS)		+= freevxfs/
+obj-$(CONFIG_NFS_FS)		+= nfs/
+obj-$(CONFIG_EXPORTFS)		+= exportfs/
+obj-$(CONFIG_NFSD)		+= nfsd/
+obj-$(CONFIG_LOCKD)		+= lockd/
+obj-$(CONFIG_NLS)		+= nls/
+obj-$(CONFIG_SYSV_FS)		+= sysv/
+obj-$(CONFIG_CIFS)		+= cifs/
+obj-$(CONFIG_NCP_FS)		+= ncpfs/
+obj-$(CONFIG_HPFS_FS)		+= hpfs/
+obj-$(CONFIG_NTFS_FS)		+= ntfs/
+obj-$(CONFIG_UFS_FS)		+= ufs/
+obj-$(CONFIG_EFS_FS)		+= efs/
+obj-$(CONFIG_JFFS2_FS)		+= jffs2/
+obj-$(CONFIG_LOGFS)		+= logfs/
+obj-$(CONFIG_UBIFS_FS)		+= ubifs/
+obj-$(CONFIG_AFFS_FS)		+= affs/
+obj-$(CONFIG_ROMFS_FS)		+= romfs/
+obj-$(CONFIG_QNX4FS_FS)		+= qnx4/
+obj-$(CONFIG_QNX6FS_FS)		+= qnx6/
+obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
+obj-$(CONFIG_ADFS_FS)		+= adfs/
+obj-$(CONFIG_FUSE_FS)		+= fuse/
+obj-$(CONFIG_OVERLAY_FS)	+= overlayfs/
+obj-$(CONFIG_UDF_FS)		+= udf/
+obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
+obj-$(CONFIG_OMFS_FS)		+= omfs/
+obj-$(CONFIG_JFS_FS)		+= jfs/
+obj-$(CONFIG_XFS_FS)		+= xfs/
+obj-$(CONFIG_9P_FS)		+= 9p/
+obj-$(CONFIG_AFS_FS)		+= afs/
+obj-$(CONFIG_NILFS2_FS)		+= nilfs2/
+obj-$(CONFIG_BEFS_FS)		+= befs/
+obj-$(CONFIG_HOSTFS)		+= hostfs/
+obj-$(CONFIG_HPPFS)		+= hppfs/
+obj-$(CONFIG_CACHEFILES)	+= cachefiles/
+obj-$(CONFIG_DEBUG_FS)		+= debugfs/
+obj-$(CONFIG_TRACING)		+= tracefs/
+obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_BTRFS_FS)		+= btrfs/
+obj-$(CONFIG_GFS2_FS)           += gfs2/
+obj-$(CONFIG_F2FS_FS)		+= f2fs/
+obj-y				+= exofs/ # Multiple modules
+obj-$(CONFIG_CEPH_FS)		+= ceph/
+obj-$(CONFIG_PSTORE)		+= pstore/
+obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
diff --git a/kernel/fs/adfs/Kconfig b/kernel/fs/adfs/Kconfig
new file mode 100644
index 000000000..c5a7787dd
--- /dev/null
+++ b/kernel/fs/adfs/Kconfig
@@ -0,0 +1,27 @@
+config ADFS_FS
+	tristate "ADFS file system support"
+	depends on BLOCK
+	help
+	  The Acorn Disc Filing System is the standard file system of the
+	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
+	  systems and the Acorn Archimedes range of machines. If you say Y
+	  here, Linux will be able to read from ADFS partitions on hard drives
+	  and from ADFS-formatted floppy discs. If you also want to be able to
+	  write to those devices, say Y to "ADFS write support" below.
+
+	  The ADFS partition should be the first partition (i.e.,
+	  /dev/[hs]d?1) on each of your drives. Please read the file
+	  <file:Documentation/filesystems/adfs.txt> for further details.
+
+	  To compile this code as a module, choose M here: the module will be
+	  called adfs.
+
+	  If unsure, say N.
+
+config ADFS_FS_RW
+	bool "ADFS write support (DANGEROUS)"
+	depends on ADFS_FS
+	help
+	  If you say Y here, you will be able to write to ADFS partitions on
+	  hard drives and ADFS-formatted floppy disks. This is experimental
+	  codes, so if you're unsure, say N.
diff --git a/kernel/fs/adfs/Makefile b/kernel/fs/adfs/Makefile
new file mode 100644
index 000000000..9b2d71a9a
--- /dev/null
+++ b/kernel/fs/adfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux adfs filesystem routines.
+#
+
+obj-$(CONFIG_ADFS_FS) += adfs.o
+
+adfs-objs := dir.o dir_f.o dir_fplus.o file.o inode.o map.o super.o
diff --git a/kernel/fs/adfs/adfs.h b/kernel/fs/adfs/adfs.h
new file mode 100644
index 000000000..24575d9d8
--- /dev/null
+++ b/kernel/fs/adfs/adfs.h
@@ -0,0 +1,208 @@
+#include <linux/fs.h>
+#include <linux/adfs_fs.h>
+
+/* Internal data structures for ADFS */
+
+#define ADFS_FREE_FRAG		 0
+#define ADFS_BAD_FRAG		 1
+#define ADFS_ROOT_FRAG		 2
+
+#define ADFS_NDA_OWNER_READ	(1 << 0)
+#define ADFS_NDA_OWNER_WRITE	(1 << 1)
+#define ADFS_NDA_LOCKED		(1 << 2)
+#define ADFS_NDA_DIRECTORY	(1 << 3)
+#define ADFS_NDA_EXECUTE	(1 << 4)
+#define ADFS_NDA_PUBLIC_READ	(1 << 5)
+#define ADFS_NDA_PUBLIC_WRITE	(1 << 6)
+
+#include "dir_f.h"
+
+struct buffer_head;
+
+/*
+ * adfs file system inode data in memory
+ */
+struct adfs_inode_info {
+	loff_t		mmu_private;
+	unsigned long	parent_id;	/* object id of parent		*/
+	__u32		loadaddr;	/* RISC OS load address		*/
+	__u32		execaddr;	/* RISC OS exec address		*/
+	unsigned int	filetype;	/* RISC OS file type		*/
+	unsigned int	attr;		/* RISC OS permissions		*/
+	unsigned int	stamped:1;	/* RISC OS file has date/time	*/
+	struct inode vfs_inode;
+};
+
+/*
+ * Forward-declare this
+ */
+struct adfs_discmap;
+struct adfs_dir_ops;
+
+/*
+ * ADFS file system superblock data in memory
+ */
+struct adfs_sb_info {
+	union { struct {
+		struct adfs_discmap *s_map;	/* bh list containing map	 */
+		struct adfs_dir_ops *s_dir;	/* directory operations		 */
+		};
+		struct rcu_head rcu;		/* used only at shutdown time	 */
+	};
+	kuid_t		s_uid;		/* owner uid				 */
+	kgid_t		s_gid;		/* owner gid				 */
+	umode_t		s_owner_mask;	/* ADFS owner perm -> unix perm		 */
+	umode_t		s_other_mask;	/* ADFS other perm -> unix perm		 */
+	int		s_ftsuffix;	/* ,xyz hex filetype suffix option */
+
+	__u32		s_ids_per_zone;	/* max. no ids in one zone		 */
+	__u32		s_idlen;	/* length of ID in map			 */
+	__u32		s_map_size;	/* sector size of a map			 */
+	unsigned long	s_size;		/* total size (in blocks) of this fs	 */
+	signed int	s_map2blk;	/* shift left by this for map->sector	 */
+	unsigned int	s_log2sharesize;/* log2 share size			 */
+	__le32		s_version;	/* disc format version			 */
+	unsigned int	s_namelen;	/* maximum number of characters in name	 */
+};
+
+static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
+{
+	return container_of(inode, struct adfs_inode_info, vfs_inode);
+}
+
+/*
+ * Directory handling
+ */
+struct adfs_dir {
+	struct super_block	*sb;
+
+	int			nr_buffers;
+	struct buffer_head	*bh[4];
+
+	/* big directories need allocated buffers */
+	struct buffer_head	**bh_fplus;
+
+	unsigned int		pos;
+	unsigned int		parent_id;
+
+	struct adfs_dirheader	dirhead;
+	union  adfs_dirtail	dirtail;
+};
+
+/*
+ * This is the overall maximum name length
+ */
+#define ADFS_MAX_NAME_LEN	(256 + 4) /* +4 for ,xyz hex filetype suffix */
+struct object_info {
+	__u32		parent_id;		/* parent object id	*/
+	__u32		file_id;		/* object id		*/
+	__u32		loadaddr;		/* load address		*/
+	__u32		execaddr;		/* execution address	*/
+	__u32		size;			/* size			*/
+	__u8		attr;			/* RISC OS attributes	*/
+	unsigned int	name_len;		/* name length		*/
+	char		name[ADFS_MAX_NAME_LEN];/* file name		*/
+
+	/* RISC OS file type (12-bit: derived from loadaddr) */
+	__u16		filetype;
+};
+
+/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */
+static inline int append_filetype_suffix(char *buf, __u16 filetype)
+{
+	if (filetype == 0xffff)	/* no explicit 12-bit file type was set */
+		return 0;
+
+	*buf++ = ',';
+	*buf++ = hex_asc_lo(filetype >> 8);
+	*buf++ = hex_asc_lo(filetype >> 4);
+	*buf++ = hex_asc_lo(filetype >> 0);
+	return 4;
+}
+
+struct adfs_dir_ops {
+	int	(*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir);
+	int	(*setpos)(struct adfs_dir *dir, unsigned int fpos);
+	int	(*getnext)(struct adfs_dir *dir, struct object_info *obj);
+	int	(*update)(struct adfs_dir *dir, struct object_info *obj);
+	int	(*create)(struct adfs_dir *dir, struct object_info *obj);
+	int	(*remove)(struct adfs_dir *dir, struct object_info *obj);
+	int	(*sync)(struct adfs_dir *dir);
+	void	(*free)(struct adfs_dir *dir);
+};
+
+struct adfs_discmap {
+	struct buffer_head	*dm_bh;
+	__u32			dm_startblk;
+	unsigned int		dm_startbit;
+	unsigned int		dm_endbit;
+};
+
+/* Inode stuff */
+struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
+int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
+
+/* map.c */
+extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigned int offset);
+extern unsigned int adfs_map_free(struct super_block *sb);
+
+/* Misc */
+__printf(3, 4)
+void __adfs_error(struct super_block *sb, const char *function,
+		  const char *fmt, ...);
+#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt)
+
+/* super.c */
+
+/*
+ * Inodes and file operations
+ */
+
+/* dir_*.c */
+extern const struct inode_operations adfs_dir_inode_operations;
+extern const struct file_operations adfs_dir_operations;
+extern const struct dentry_operations adfs_dentry_operations;
+extern struct adfs_dir_ops adfs_f_dir_ops;
+extern struct adfs_dir_ops adfs_fplus_dir_ops;
+
+extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
+			   int wait);
+
+/* file.c */
+extern const struct inode_operations adfs_file_inode_operations;
+extern const struct file_operations adfs_file_operations;
+
+static inline __u32 signed_asl(__u32 val, signed int shift)
+{
+	if (shift >= 0)
+		val <<= shift;
+	else
+		val >>= -shift;
+	return val;
+}
+
+/*
+ * Calculate the address of a block in an object given the block offset
+ * and the object identity.
+ *
+ * The root directory ID should always be looked up in the map [3.4]
+ */
+static inline int
+__adfs_block_map(struct super_block *sb, unsigned int object_id,
+		 unsigned int block)
+{
+	if (object_id & 255) {
+		unsigned int off;
+
+		off = (object_id & 255) - 1;
+		block += off << ADFS_SB(sb)->s_log2sharesize;
+	}
+
+	return adfs_map_lookup(sb, object_id >> 8, block);
+}
diff --git a/kernel/fs/adfs/dir.c b/kernel/fs/adfs/dir.c
new file mode 100644
index 000000000..51c279a29
--- /dev/null
+++ b/kernel/fs/adfs/dir.c
@@ -0,0 +1,288 @@
+/*
+ *  linux/fs/adfs/dir.c
+ *
+ *  Copyright (C) 1999-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Common directory handling for ADFS
+ */
+#include "adfs.h"
+
+/*
+ * For future.  This should probably be per-directory.
+ */
+static DEFINE_RWLOCK(adfs_dir_lock);
+
+static int
+adfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+	struct object_info obj;
+	struct adfs_dir dir;
+	int ret = 0;
+
+	if (ctx->pos >> 32)
+		return 0;
+
+	ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
+	if (ret)
+		return ret;
+
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
+			goto free_out;
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
+			goto free_out;
+		ctx->pos = 2;
+	}
+
+	read_lock(&adfs_dir_lock);
+
+	ret = ops->setpos(&dir, ctx->pos - 2);
+	if (ret)
+		goto unlock_out;
+	while (ops->getnext(&dir, &obj) == 0) {
+		if (!dir_emit(ctx, obj.name, obj.name_len,
+			    obj.file_id, DT_UNKNOWN))
+			break;
+		ctx->pos++;
+	}
+
+unlock_out:
+	read_unlock(&adfs_dir_lock);
+
+free_out:
+	ops->free(&dir);
+	return ret;
+}
+
+int
+adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
+{
+	int ret = -EINVAL;
+#ifdef CONFIG_ADFS_FS_RW
+	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+	struct adfs_dir dir;
+
+	printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n",
+		 obj->file_id, obj->parent_id);
+
+	if (!ops->update) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = ops->read(sb, obj->parent_id, 0, &dir);
+	if (ret)
+		goto out;
+
+	write_lock(&adfs_dir_lock);
+	ret = ops->update(&dir, obj);
+	write_unlock(&adfs_dir_lock);
+
+	if (wait) {
+		int err = ops->sync(&dir);
+		if (!ret)
+			ret = err;
+	}
+
+	ops->free(&dir);
+out:
+#endif
+	return ret;
+}
+
+static int
+adfs_match(struct qstr *name, struct object_info *obj)
+{
+	int i;
+
+	if (name->len != obj->name_len)
+		return 0;
+
+	for (i = 0; i < name->len; i++) {
+		char c1, c2;
+
+		c1 = name->name[i];
+		c2 = obj->name[i];
+
+		if (c1 >= 'A' && c1 <= 'Z')
+			c1 += 'a' - 'A';
+		if (c2 >= 'A' && c2 <= 'Z')
+			c2 += 'a' - 'A';
+
+		if (c1 != c2)
+			return 0;
+	}
+	return 1;
+}
+
+static int
+adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj)
+{
+	struct super_block *sb = inode->i_sb;
+	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+	struct adfs_dir dir;
+	int ret;
+
+	ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
+	if (ret)
+		goto out;
+
+	if (ADFS_I(inode)->parent_id != dir.parent_id) {
+		adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n",
+			   ADFS_I(inode)->parent_id, dir.parent_id);
+		ret = -EIO;
+		goto free_out;
+	}
+
+	obj->parent_id = inode->i_ino;
+
+	/*
+	 * '.' is handled by reserved_lookup() in fs/namei.c
+	 */
+	if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') {
+		/*
+		 * Currently unable to fill in the rest of 'obj',
+		 * but this is better than nothing.  We need to
+		 * ascend one level to find it's parent.
+		 */
+		obj->name_len = 0;
+		obj->file_id  = obj->parent_id;
+		goto free_out;
+	}
+
+	read_lock(&adfs_dir_lock);
+
+	ret = ops->setpos(&dir, 0);
+	if (ret)
+		goto unlock_out;
+
+	ret = -ENOENT;
+	while (ops->getnext(&dir, obj) == 0) {
+		if (adfs_match(name, obj)) {
+			ret = 0;
+			break;
+		}
+	}
+
+unlock_out:
+	read_unlock(&adfs_dir_lock);
+
+free_out:
+	ops->free(&dir);
+out:
+	return ret;
+}
+
+const struct file_operations adfs_dir_operations = {
+	.read		= generic_read_dir,
+	.llseek		= generic_file_llseek,
+	.iterate	= adfs_readdir,
+	.fsync		= generic_file_fsync,
+};
+
+static int
+adfs_hash(const struct dentry *parent, struct qstr *qstr)
+{
+	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
+	const unsigned char *name;
+	unsigned long hash;
+	int i;
+
+	if (qstr->len < name_len)
+		return 0;
+
+	/*
+	 * Truncate the name in place, avoids
+	 * having to define a compare function.
+	 */
+	qstr->len = i = name_len;
+	name = qstr->name;
+	hash = init_name_hash();
+	while (i--) {
+		char c;
+
+		c = *name++;
+		if (c >= 'A' && c <= 'Z')
+			c += 'a' - 'A';
+
+		hash = partial_name_hash(c, hash);
+	}
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+/*
+ * Compare two names, taking note of the name length
+ * requirements of the underlying filesystem.
+ */
+static int
+adfs_compare(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	int i;
+
+	if (len != name->len)
+		return 1;
+
+	for (i = 0; i < name->len; i++) {
+		char a, b;
+
+		a = str[i];
+		b = name->name[i];
+
+		if (a >= 'A' && a <= 'Z')
+			a += 'a' - 'A';
+		if (b >= 'A' && b <= 'Z')
+			b += 'a' - 'A';
+
+		if (a != b)
+			return 1;
+	}
+	return 0;
+}
+
+const struct dentry_operations adfs_dentry_operations = {
+	.d_hash		= adfs_hash,
+	.d_compare	= adfs_compare,
+};
+
+static struct dentry *
+adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode = NULL;
+	struct object_info obj;
+	int error;
+
+	error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
+	if (error == 0) {
+		error = -EACCES;
+		/*
+		 * This only returns NULL if get_empty_inode
+		 * fails.
+		 */
+		inode = adfs_iget(dir->i_sb, &obj);
+		if (inode)
+			error = 0;
+	}
+	d_add(dentry, inode);
+	return ERR_PTR(error);
+}
+
+/*
+ * directories can handle most operations...
+ */
+const struct inode_operations adfs_dir_inode_operations = {
+	.lookup		= adfs_lookup,
+	.setattr	= adfs_notify_change,
+};
diff --git a/kernel/fs/adfs/dir_f.c b/kernel/fs/adfs/dir_f.c
new file mode 100644
index 000000000..4bbe853ee
--- /dev/null
+++ b/kernel/fs/adfs/dir_f.c
@@ -0,0 +1,486 @@
+/*
+ *  linux/fs/adfs/dir_f.c
+ *
+ * Copyright (C) 1997-1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  E and F format directory handling
+ */
+#include <linux/buffer_head.h>
+#include "adfs.h"
+#include "dir_f.h"
+
+static void adfs_f_free(struct adfs_dir *dir);
+
+/*
+ * Read an (unaligned) value of length 1..4 bytes
+ */
+static inline unsigned int adfs_readval(unsigned char *p, int len)
+{
+	unsigned int val = 0;
+
+	switch (len) {
+	case 4:		val |= p[3] << 24;
+	case 3:		val |= p[2] << 16;
+	case 2:		val |= p[1] << 8;
+	default:	val |= p[0];
+	}
+	return val;
+}
+
+static inline void adfs_writeval(unsigned char *p, int len, unsigned int val)
+{
+	switch (len) {
+	case 4:		p[3] = val >> 24;
+	case 3:		p[2] = val >> 16;
+	case 2:		p[1] = val >> 8;
+	default:	p[0] = val;
+	}
+}
+
+static inline int adfs_readname(char *buf, char *ptr, int maxlen)
+{
+	char *old_buf = buf;
+
+	while ((unsigned char)*ptr >= ' ' && maxlen--) {
+		if (*ptr == '/')
+			*buf++ = '.';
+		else
+			*buf++ = *ptr;
+		ptr++;
+	}
+
+	return buf - old_buf;
+}
+
+#define ror13(v) ((v >> 13) | (v << 19))
+
+#define dir_u8(idx)				\
+	({ int _buf = idx >> blocksize_bits;	\
+	   int _off = idx - (_buf << blocksize_bits);\
+	  *(u8 *)(bh[_buf]->b_data + _off);	\
+	})
+
+#define dir_u32(idx)				\
+	({ int _buf = idx >> blocksize_bits;	\
+	   int _off = idx - (_buf << blocksize_bits);\
+	  *(__le32 *)(bh[_buf]->b_data + _off);	\
+	})
+
+#define bufoff(_bh,_idx)			\
+	({ int _buf = _idx >> blocksize_bits;	\
+	   int _off = _idx - (_buf << blocksize_bits);\
+	  (u8 *)(_bh[_buf]->b_data + _off);	\
+	})
+
+/*
+ * There are some algorithms that are nice in
+ * assembler, but a bitch in C...  This is one
+ * of them.
+ */
+static u8
+adfs_dir_checkbyte(const struct adfs_dir *dir)
+{
+	struct buffer_head * const *bh = dir->bh;
+	const int blocksize_bits = dir->sb->s_blocksize_bits;
+	union { __le32 *ptr32; u8 *ptr8; } ptr, end;
+	u32 dircheck = 0;
+	int last = 5 - 26;
+	int i = 0;
+
+	/*
+	 * Accumulate each word up to the last whole
+	 * word of the last directory entry.  This
+	 * can spread across several buffer heads.
+	 */
+	do {
+		last += 26;
+		do {
+			dircheck = le32_to_cpu(dir_u32(i)) ^ ror13(dircheck);
+
+			i += sizeof(u32);
+		} while (i < (last & ~3));
+	} while (dir_u8(last) != 0);
+
+	/*
+	 * Accumulate the last few bytes.  These
+	 * bytes will be within the same bh.
+	 */
+	if (i != last) {
+		ptr.ptr8 = bufoff(bh, i);
+		end.ptr8 = ptr.ptr8 + last - i;
+
+		do {
+			dircheck = *ptr.ptr8++ ^ ror13(dircheck);
+		} while (ptr.ptr8 < end.ptr8);
+	}
+
+	/*
+	 * The directory tail is in the final bh
+	 * Note that contary to the RISC OS PRMs,
+	 * the first few bytes are NOT included
+	 * in the check.  All bytes are in the
+	 * same bh.
+	 */
+	ptr.ptr8 = bufoff(bh, 2008);
+	end.ptr8 = ptr.ptr8 + 36;
+
+	do {
+		__le32 v = *ptr.ptr32++;
+		dircheck = le32_to_cpu(v) ^ ror13(dircheck);
+	} while (ptr.ptr32 < end.ptr32);
+
+	return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff;
+}
+
+/*
+ * Read and check that a directory is valid
+ */
+static int
+adfs_dir_read(struct super_block *sb, unsigned long object_id,
+	      unsigned int size, struct adfs_dir *dir)
+{
+	const unsigned int blocksize_bits = sb->s_blocksize_bits;
+	int blk = 0;
+
+	/*
+	 * Directories which are not a multiple of 2048 bytes
+	 * are considered bad v2 [3.6]
+	 */
+	if (size & 2047)
+		goto bad_dir;
+
+	size >>= blocksize_bits;
+
+	dir->nr_buffers = 0;
+	dir->sb = sb;
+
+	for (blk = 0; blk < size; blk++) {
+		int phys;
+
+		phys = __adfs_block_map(sb, object_id, blk);
+		if (!phys) {
+			adfs_error(sb, "dir object %lX has a hole at offset %d",
+				   object_id, blk);
+			goto release_buffers;
+		}
+
+		dir->bh[blk] = sb_bread(sb, phys);
+		if (!dir->bh[blk])
+			goto release_buffers;
+	}
+
+	memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead));
+	memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail));
+
+	if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq ||
+	    memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4))
+		goto bad_dir;
+
+	if (memcmp(&dir->dirhead.startname, "Nick", 4) &&
+	    memcmp(&dir->dirhead.startname, "Hugo", 4))
+		goto bad_dir;
+
+	if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte)
+		goto bad_dir;
+
+	dir->nr_buffers = blk;
+
+	return 0;
+
+bad_dir:
+	adfs_error(sb, "corrupted directory fragment %lX",
+		   object_id);
+release_buffers:
+	for (blk -= 1; blk >= 0; blk -= 1)
+		brelse(dir->bh[blk]);
+
+	dir->sb = NULL;
+
+	return -EIO;
+}
+
+/*
+ * convert a disk-based directory entry to a Linux ADFS directory entry
+ */
+static inline void
+adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
+	struct adfs_direntry *de)
+{
+	obj->name_len =	adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN);
+	obj->file_id  = adfs_readval(de->dirinddiscadd, 3);
+	obj->loadaddr = adfs_readval(de->dirload, 4);
+	obj->execaddr = adfs_readval(de->direxec, 4);
+	obj->size     = adfs_readval(de->dirlen,  4);
+	obj->attr     = de->newdiratts;
+	obj->filetype = -1;
+
+	/*
+	 * object is a file and is filetyped and timestamped?
+	 * RISC OS 12-bit filetype is stored in load_address[19:8]
+	 */
+	if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
+		(0xfff00000 == (0xfff00000 & obj->loadaddr))) {
+		obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
+
+		/* optionally append the ,xyz hex filetype suffix */
+		if (ADFS_SB(dir->sb)->s_ftsuffix)
+			obj->name_len +=
+				append_filetype_suffix(
+					&obj->name[obj->name_len],
+					obj->filetype);
+	}
+}
+
+/*
+ * convert a Linux ADFS directory entry to a disk-based directory entry
+ */
+static inline void
+adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj)
+{
+	adfs_writeval(de->dirinddiscadd, 3, obj->file_id);
+	adfs_writeval(de->dirload, 4, obj->loadaddr);
+	adfs_writeval(de->direxec, 4, obj->execaddr);
+	adfs_writeval(de->dirlen,  4, obj->size);
+	de->newdiratts = obj->attr;
+}
+
+/*
+ * get a directory entry.  Note that the caller is responsible
+ * for holding the relevant locks.
+ */
+static int
+__adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj)
+{
+	struct super_block *sb = dir->sb;
+	struct adfs_direntry de;
+	int thissize, buffer, offset;
+
+	buffer = pos >> sb->s_blocksize_bits;
+
+	if (buffer > dir->nr_buffers)
+		return -EINVAL;
+
+	offset = pos & (sb->s_blocksize - 1);
+	thissize = sb->s_blocksize - offset;
+	if (thissize > 26)
+		thissize = 26;
+
+	memcpy(&de, dir->bh[buffer]->b_data + offset, thissize);
+	if (thissize != 26)
+		memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data,
+		       26 - thissize);
+
+	if (!de.dirobname[0])
+		return -ENOENT;
+
+	adfs_dir2obj(dir, obj, &de);
+
+	return 0;
+}
+
+static int
+__adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj)
+{
+	struct super_block *sb = dir->sb;
+	struct adfs_direntry de;
+	int thissize, buffer, offset;
+
+	buffer = pos >> sb->s_blocksize_bits;
+
+	if (buffer > dir->nr_buffers)
+		return -EINVAL;
+
+	offset = pos & (sb->s_blocksize - 1);
+	thissize = sb->s_blocksize - offset;
+	if (thissize > 26)
+		thissize = 26;
+
+	/*
+	 * Get the entry in total
+	 */
+	memcpy(&de, dir->bh[buffer]->b_data + offset, thissize);
+	if (thissize != 26)
+		memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data,
+		       26 - thissize);
+
+	/*
+	 * update it
+	 */
+	adfs_obj2dir(&de, obj);
+
+	/*
+	 * Put the new entry back
+	 */
+	memcpy(dir->bh[buffer]->b_data + offset, &de, thissize);
+	if (thissize != 26)
+		memcpy(dir->bh[buffer + 1]->b_data, ((char *)&de) + thissize,
+		       26 - thissize);
+
+	return 0;
+}
+
+/*
+ * the caller is responsible for holding the necessary
+ * locks.
+ */
+static int
+adfs_dir_find_entry(struct adfs_dir *dir, unsigned long object_id)
+{
+	int pos, ret;
+
+	ret = -ENOENT;
+
+	for (pos = 5; pos < ADFS_NUM_DIR_ENTRIES * 26 + 5; pos += 26) {
+		struct object_info obj;
+
+		if (!__adfs_dir_get(dir, pos, &obj))
+			break;
+
+		if (obj.file_id == object_id) {
+			ret = pos;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int
+adfs_f_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir)
+{
+	int ret;
+
+	if (sz != ADFS_NEWDIR_SIZE)
+		return -EIO;
+
+	ret = adfs_dir_read(sb, id, sz, dir);
+	if (ret)
+		adfs_error(sb, "unable to read directory");
+	else
+		dir->parent_id = adfs_readval(dir->dirtail.new.dirparent, 3);
+
+	return ret;
+}
+
+static int
+adfs_f_setpos(struct adfs_dir *dir, unsigned int fpos)
+{
+	if (fpos >= ADFS_NUM_DIR_ENTRIES)
+		return -ENOENT;
+
+	dir->pos = 5 + fpos * 26;
+	return 0;
+}
+
+static int
+adfs_f_getnext(struct adfs_dir *dir, struct object_info *obj)
+{
+	unsigned int ret;
+
+	ret = __adfs_dir_get(dir, dir->pos, obj);
+	if (ret == 0)
+		dir->pos += 26;
+
+	return ret;
+}
+
+static int
+adfs_f_update(struct adfs_dir *dir, struct object_info *obj)
+{
+	struct super_block *sb = dir->sb;
+	int ret, i;
+
+	ret = adfs_dir_find_entry(dir, obj->file_id);
+	if (ret < 0) {
+		adfs_error(dir->sb, "unable to locate entry to update");
+		goto out;
+	}
+
+	__adfs_dir_put(dir, ret, obj);
+ 
+	/*
+	 * Increment directory sequence number
+	 */
+	dir->bh[0]->b_data[0] += 1;
+	dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 6] += 1;
+
+	ret = adfs_dir_checkbyte(dir);
+	/*
+	 * Update directory check byte
+	 */
+	dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 1] = ret;
+
+#if 1
+	{
+	const unsigned int blocksize_bits = sb->s_blocksize_bits;
+
+	memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead));
+	memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail));
+
+	if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq ||
+	    memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4))
+		goto bad_dir;
+
+	if (memcmp(&dir->dirhead.startname, "Nick", 4) &&
+	    memcmp(&dir->dirhead.startname, "Hugo", 4))
+		goto bad_dir;
+
+	if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte)
+		goto bad_dir;
+	}
+#endif
+	for (i = dir->nr_buffers - 1; i >= 0; i--)
+		mark_buffer_dirty(dir->bh[i]);
+
+	ret = 0;
+out:
+	return ret;
+#if 1
+bad_dir:
+	adfs_error(dir->sb, "whoops!  I broke a directory!");
+	return -EIO;
+#endif
+}
+
+static int
+adfs_f_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
+static void
+adfs_f_free(struct adfs_dir *dir)
+{
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		brelse(dir->bh[i]);
+		dir->bh[i] = NULL;
+	}
+
+	dir->nr_buffers = 0;
+	dir->sb = NULL;
+}
+
+struct adfs_dir_ops adfs_f_dir_ops = {
+	.read		= adfs_f_read,
+	.setpos		= adfs_f_setpos,
+	.getnext	= adfs_f_getnext,
+	.update		= adfs_f_update,
+	.sync		= adfs_f_sync,
+	.free		= adfs_f_free
+};
diff --git a/kernel/fs/adfs/dir_f.h b/kernel/fs/adfs/dir_f.h
new file mode 100644
index 000000000..e47134040
--- /dev/null
+++ b/kernel/fs/adfs/dir_f.h
@@ -0,0 +1,65 @@
+/*
+ *  linux/fs/adfs/dir_f.h
+ *
+ *  Copyright (C) 1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Structures of directories on the F format disk
+ */
+#ifndef ADFS_DIR_F_H
+#define ADFS_DIR_F_H
+
+/*
+ * Directory header
+ */
+struct adfs_dirheader {
+	unsigned char startmasseq;
+	unsigned char startname[4];
+};
+
+#define ADFS_NEWDIR_SIZE	2048
+#define ADFS_NUM_DIR_ENTRIES	77
+
+/*
+ * Directory entries
+ */
+struct adfs_direntry {
+#define ADFS_F_NAME_LEN 10
+	char dirobname[ADFS_F_NAME_LEN];
+	__u8 dirload[4];
+	__u8 direxec[4];
+	__u8 dirlen[4];
+	__u8 dirinddiscadd[3];
+	__u8 newdiratts;
+};
+
+/*
+ * Directory tail
+ */
+union adfs_dirtail {
+	struct {
+		unsigned char dirlastmask;
+		char dirname[10];
+		unsigned char dirparent[3];
+		char dirtitle[19];
+		unsigned char reserved[14];
+		unsigned char endmasseq;
+		unsigned char endname[4];
+		unsigned char dircheckbyte;
+	} old;
+	struct {
+		unsigned char dirlastmask;
+		unsigned char reserved[2];
+		unsigned char dirparent[3];
+		char dirtitle[19];
+		char dirname[10];
+		unsigned char endmasseq;
+		unsigned char endname[4];
+		unsigned char dircheckbyte;
+	} new;
+};
+
+#endif
diff --git a/kernel/fs/adfs/dir_fplus.c b/kernel/fs/adfs/dir_fplus.c
new file mode 100644
index 000000000..82d14cdf7
--- /dev/null
+++ b/kernel/fs/adfs/dir_fplus.c
@@ -0,0 +1,265 @@
+/*
+ *  linux/fs/adfs/dir_fplus.c
+ *
+ *  Copyright (C) 1997-1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include "adfs.h"
+#include "dir_fplus.h"
+
+static int
+adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir)
+{
+	struct adfs_bigdirheader *h;
+	struct adfs_bigdirtail *t;
+	unsigned long block;
+	unsigned int blk, size;
+	int i, ret = -EIO;
+
+	dir->nr_buffers = 0;
+
+	/* start off using fixed bh set - only alloc for big dirs */
+	dir->bh_fplus = &dir->bh[0];
+
+	block = __adfs_block_map(sb, id, 0);
+	if (!block) {
+		adfs_error(sb, "dir object %X has a hole at offset 0", id);
+		goto out;
+	}
+
+	dir->bh_fplus[0] = sb_bread(sb, block);
+	if (!dir->bh_fplus[0])
+		goto out;
+	dir->nr_buffers += 1;
+
+	h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
+	size = le32_to_cpu(h->bigdirsize);
+	if (size != sz) {
+		printk(KERN_WARNING "adfs: adfs_fplus_read:"
+					" directory header size %X\n"
+					" does not match directory size %X\n",
+					size, sz);
+	}
+
+	if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
+	    h->bigdirversion[2] != 0 || size & 2047 ||
+	    h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
+		printk(KERN_WARNING "adfs: dir object %X has"
+					" malformed dir header\n", id);
+		goto out;
+	}
+
+	size >>= sb->s_blocksize_bits;
+	if (size > ARRAY_SIZE(dir->bh)) {
+		/* this directory is too big for fixed bh set, must allocate */
+		struct buffer_head **bh_fplus =
+			kcalloc(size, sizeof(struct buffer_head *),
+				GFP_KERNEL);
+		if (!bh_fplus) {
+			ret = -ENOMEM;
+			adfs_error(sb, "not enough memory for"
+					" dir object %X (%d blocks)", id, size);
+			goto out;
+		}
+		dir->bh_fplus = bh_fplus;
+		/* copy over the pointer to the block that we've already read */
+		dir->bh_fplus[0] = dir->bh[0];
+	}
+
+	for (blk = 1; blk < size; blk++) {
+		block = __adfs_block_map(sb, id, blk);
+		if (!block) {
+			adfs_error(sb, "dir object %X has a hole at offset %d", id, blk);
+			goto out;
+		}
+
+		dir->bh_fplus[blk] = sb_bread(sb, block);
+		if (!dir->bh_fplus[blk]) {
+			adfs_error(sb,	"dir object %x failed read for offset %d, mapped block %lX",
+				   id, blk, block);
+			goto out;
+		}
+
+		dir->nr_buffers += 1;
+	}
+
+	t = (struct adfs_bigdirtail *)
+		(dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8));
+
+	if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
+	    t->bigdirendmasseq != h->startmasseq ||
+	    t->reserved[0] != 0 || t->reserved[1] != 0) {
+		printk(KERN_WARNING "adfs: dir object %X has "
+					"malformed dir end\n", id);
+		goto out;
+	}
+
+	dir->parent_id = le32_to_cpu(h->bigdirparent);
+	dir->sb = sb;
+	return 0;
+
+out:
+	if (dir->bh_fplus) {
+		for (i = 0; i < dir->nr_buffers; i++)
+			brelse(dir->bh_fplus[i]);
+
+		if (&dir->bh[0] != dir->bh_fplus)
+			kfree(dir->bh_fplus);
+
+		dir->bh_fplus = NULL;
+	}
+
+	dir->nr_buffers = 0;
+	dir->sb = NULL;
+	return ret;
+}
+
+static int
+adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos)
+{
+	struct adfs_bigdirheader *h =
+		(struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
+	int ret = -ENOENT;
+
+	if (fpos <= le32_to_cpu(h->bigdirentries)) {
+		dir->pos = fpos;
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static void
+dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len)
+{
+	struct super_block *sb = dir->sb;
+	unsigned int buffer, partial, remainder;
+
+	buffer = offset >> sb->s_blocksize_bits;
+	offset &= sb->s_blocksize - 1;
+
+	partial = sb->s_blocksize - offset;
+
+	if (partial >= len)
+		memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len);
+	else {
+		char *c = (char *)to;
+
+		remainder = len - partial;
+
+		memcpy(c,
+			dir->bh_fplus[buffer]->b_data + offset,
+			partial);
+
+		memcpy(c + partial,
+			dir->bh_fplus[buffer + 1]->b_data,
+			remainder);
+	}
+}
+
+static int
+adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
+{
+	struct adfs_bigdirheader *h =
+		(struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
+	struct adfs_bigdirentry bde;
+	unsigned int offset;
+	int i, ret = -ENOENT;
+
+	if (dir->pos >= le32_to_cpu(h->bigdirentries))
+		goto out;
+
+	offset = offsetof(struct adfs_bigdirheader, bigdirname);
+	offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3);
+	offset += dir->pos * sizeof(struct adfs_bigdirentry);
+
+	dir_memcpy(dir, offset, &bde, sizeof(struct adfs_bigdirentry));
+
+	obj->loadaddr = le32_to_cpu(bde.bigdirload);
+	obj->execaddr = le32_to_cpu(bde.bigdirexec);
+	obj->size     = le32_to_cpu(bde.bigdirlen);
+	obj->file_id  = le32_to_cpu(bde.bigdirindaddr);
+	obj->attr     = le32_to_cpu(bde.bigdirattr);
+	obj->name_len = le32_to_cpu(bde.bigdirobnamelen);
+
+	offset = offsetof(struct adfs_bigdirheader, bigdirname);
+	offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3);
+	offset += le32_to_cpu(h->bigdirentries) * sizeof(struct adfs_bigdirentry);
+	offset += le32_to_cpu(bde.bigdirobnameptr);
+
+	dir_memcpy(dir, offset, obj->name, obj->name_len);
+	for (i = 0; i < obj->name_len; i++)
+		if (obj->name[i] == '/')
+			obj->name[i] = '.';
+
+	obj->filetype = -1;
+
+	/*
+	 * object is a file and is filetyped and timestamped?
+	 * RISC OS 12-bit filetype is stored in load_address[19:8]
+	 */
+	if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
+		(0xfff00000 == (0xfff00000 & obj->loadaddr))) {
+		obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
+
+		/* optionally append the ,xyz hex filetype suffix */
+		if (ADFS_SB(dir->sb)->s_ftsuffix)
+			obj->name_len +=
+				append_filetype_suffix(
+					&obj->name[obj->name_len],
+					obj->filetype);
+	}
+
+	dir->pos += 1;
+	ret = 0;
+out:
+	return ret;
+}
+
+static int
+adfs_fplus_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh_fplus[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
+static void
+adfs_fplus_free(struct adfs_dir *dir)
+{
+	int i;
+
+	if (dir->bh_fplus) {
+		for (i = 0; i < dir->nr_buffers; i++)
+			brelse(dir->bh_fplus[i]);
+
+		if (&dir->bh[0] != dir->bh_fplus)
+			kfree(dir->bh_fplus);
+
+		dir->bh_fplus = NULL;
+	}
+
+	dir->nr_buffers = 0;
+	dir->sb = NULL;
+}
+
+struct adfs_dir_ops adfs_fplus_dir_ops = {
+	.read		= adfs_fplus_read,
+	.setpos		= adfs_fplus_setpos,
+	.getnext	= adfs_fplus_getnext,
+	.sync		= adfs_fplus_sync,
+	.free		= adfs_fplus_free
+};
diff --git a/kernel/fs/adfs/dir_fplus.h b/kernel/fs/adfs/dir_fplus.h
new file mode 100644
index 000000000..b55aa41a6
--- /dev/null
+++ b/kernel/fs/adfs/dir_fplus.h
@@ -0,0 +1,45 @@
+/*
+ *  linux/fs/adfs/dir_fplus.h
+ *
+ *  Copyright (C) 1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Structures of directories on the F+ format disk
+ */
+
+#define ADFS_FPLUS_NAME_LEN	255
+
+#define BIGDIRSTARTNAME ('S' | 'B' << 8 | 'P' << 16 | 'r' << 24)
+#define BIGDIRENDNAME	('o' | 'v' << 8 | 'e' << 16 | 'n' << 24)
+
+struct adfs_bigdirheader {
+	__u8	startmasseq;
+	__u8	bigdirversion[3];
+	__le32	bigdirstartname;
+	__le32	bigdirnamelen;
+	__le32	bigdirsize;
+	__le32	bigdirentries;
+	__le32	bigdirnamesize;
+	__le32	bigdirparent;
+	char	bigdirname[1];
+};
+
+struct adfs_bigdirentry {
+	__le32	bigdirload;
+	__le32	bigdirexec;
+	__le32	bigdirlen;
+	__le32	bigdirindaddr;
+	__le32	bigdirattr;
+	__le32	bigdirobnamelen;
+	__le32	bigdirobnameptr;
+};
+
+struct adfs_bigdirtail {
+	__le32	bigdirendname;
+	__u8	bigdirendmasseq;
+	__u8	reserved[2];
+	__u8	bigdircheckbyte;
+};
diff --git a/kernel/fs/adfs/file.c b/kernel/fs/adfs/file.c
new file mode 100644
index 000000000..46c0d5671
--- /dev/null
+++ b/kernel/fs/adfs/file.c
@@ -0,0 +1,35 @@
+/*
+ *  linux/fs/adfs/file.c
+ *
+ * Copyright (C) 1997-1999 Russell King
+ * from:
+ *
+ *  linux/fs/ext2/file.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/file.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  adfs regular file handling primitives           
+ */
+#include "adfs.h"
+
+const struct file_operations adfs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.mmap		= generic_file_mmap,
+	.fsync		= generic_file_fsync,
+	.write_iter	= generic_file_write_iter,
+	.splice_read	= generic_file_splice_read,
+};
+
+const struct inode_operations adfs_file_inode_operations = {
+	.setattr	= adfs_notify_change,
+};
diff --git a/kernel/fs/adfs/inode.c b/kernel/fs/adfs/inode.c
new file mode 100644
index 000000000..335055d82
--- /dev/null
+++ b/kernel/fs/adfs/inode.c
@@ -0,0 +1,371 @@
+/*
+ *  linux/fs/adfs/inode.c
+ *
+ *  Copyright (C) 1997-1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include "adfs.h"
+
+/*
+ * Lookup/Create a block at offset 'block' into 'inode'.  We currently do
+ * not support creation of new blocks, so we return -EIO for this case.
+ */
+static int
+adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
+	       int create)
+{
+	if (!create) {
+		if (block >= inode->i_blocks)
+			goto abort_toobig;
+
+		block = __adfs_block_map(inode->i_sb, inode->i_ino, block);
+		if (block)
+			map_bh(bh, inode->i_sb, block);
+		return 0;
+	}
+	/* don't support allocation of blocks yet */
+	return -EIO;
+
+abort_toobig:
+	return 0;
+}
+
+static int adfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, adfs_get_block, wbc);
+}
+
+static int adfs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, adfs_get_block);
+}
+
+static void adfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size)
+		truncate_pagecache(inode, inode->i_size);
+}
+
+static int adfs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	*pagep = NULL;
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				adfs_get_block,
+				&ADFS_I(mapping->host)->mmu_private);
+	if (unlikely(ret))
+		adfs_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, adfs_get_block);
+}
+
+static const struct address_space_operations adfs_aops = {
+	.readpage	= adfs_readpage,
+	.writepage	= adfs_writepage,
+	.write_begin	= adfs_write_begin,
+	.write_end	= generic_write_end,
+	.bmap		= _adfs_bmap
+};
+
+/*
+ * Convert ADFS attributes and filetype to Linux permission.
+ */
+static umode_t
+adfs_atts2mode(struct super_block *sb, struct inode *inode)
+{
+	unsigned int attr = ADFS_I(inode)->attr;
+	umode_t mode, rmask;
+	struct adfs_sb_info *asb = ADFS_SB(sb);
+
+	if (attr & ADFS_NDA_DIRECTORY) {
+		mode = S_IRUGO & asb->s_owner_mask;
+		return S_IFDIR | S_IXUGO | mode;
+	}
+
+	switch (ADFS_I(inode)->filetype) {
+	case 0xfc0:	/* LinkFS */
+		return S_IFLNK|S_IRWXUGO;
+
+	case 0xfe6:	/* UnixExec */
+		rmask = S_IRUGO | S_IXUGO;
+		break;
+
+	default:
+		rmask = S_IRUGO;
+	}
+
+	mode = S_IFREG;
+
+	if (attr & ADFS_NDA_OWNER_READ)
+		mode |= rmask & asb->s_owner_mask;
+
+	if (attr & ADFS_NDA_OWNER_WRITE)
+		mode |= S_IWUGO & asb->s_owner_mask;
+
+	if (attr & ADFS_NDA_PUBLIC_READ)
+		mode |= rmask & asb->s_other_mask;
+
+	if (attr & ADFS_NDA_PUBLIC_WRITE)
+		mode |= S_IWUGO & asb->s_other_mask;
+	return mode;
+}
+
+/*
+ * Convert Linux permission to ADFS attribute.  We try to do the reverse
+ * of atts2mode, but there is not a 1:1 translation.
+ */
+static int
+adfs_mode2atts(struct super_block *sb, struct inode *inode)
+{
+	umode_t mode;
+	int attr;
+	struct adfs_sb_info *asb = ADFS_SB(sb);
+
+	/* FIXME: should we be able to alter a link? */
+	if (S_ISLNK(inode->i_mode))
+		return ADFS_I(inode)->attr;
+
+	if (S_ISDIR(inode->i_mode))
+		attr = ADFS_NDA_DIRECTORY;
+	else
+		attr = 0;
+
+	mode = inode->i_mode & asb->s_owner_mask;
+	if (mode & S_IRUGO)
+		attr |= ADFS_NDA_OWNER_READ;
+	if (mode & S_IWUGO)
+		attr |= ADFS_NDA_OWNER_WRITE;
+
+	mode = inode->i_mode & asb->s_other_mask;
+	mode &= ~asb->s_owner_mask;
+	if (mode & S_IRUGO)
+		attr |= ADFS_NDA_PUBLIC_READ;
+	if (mode & S_IWUGO)
+		attr |= ADFS_NDA_PUBLIC_WRITE;
+
+	return attr;
+}
+
+/*
+ * Convert an ADFS time to Unix time.  ADFS has a 40-bit centi-second time
+ * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds
+ * of time to convert from RISC OS epoch to Unix epoch.
+ */
+static void
+adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
+{
+	unsigned int high, low;
+	/* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
+	 * 01 Jan 1900 00:00:00 (RISC OS epoch)
+	 */
+	static const s64 nsec_unix_epoch_diff_risc_os_epoch =
+							2208988800000000000LL;
+	s64 nsec;
+
+	if (ADFS_I(inode)->stamped == 0)
+		goto cur_time;
+
+	high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */
+	low  = ADFS_I(inode)->execaddr;    /* bottom 32 bits of timestamp */
+
+	/* convert 40-bit centi-seconds to 32-bit seconds
+	 * going via nanoseconds to retain precision
+	 */
+	nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */
+
+	/* Files dated pre  01 Jan 1970 00:00:00. */
+	if (nsec < nsec_unix_epoch_diff_risc_os_epoch)
+		goto too_early;
+
+	/* convert from RISC OS to Unix epoch */
+	nsec -= nsec_unix_epoch_diff_risc_os_epoch;
+
+	*tv = ns_to_timespec(nsec);
+	return;
+
+ cur_time:
+	*tv = CURRENT_TIME;
+	return;
+
+ too_early:
+	tv->tv_sec = tv->tv_nsec = 0;
+	return;
+}
+
+/*
+ * Convert an Unix time to ADFS time.  We only do this if the entry has a
+ * time/date stamp already.
+ */
+static void
+adfs_unix2adfs_time(struct inode *inode, unsigned int secs)
+{
+	unsigned int high, low;
+
+	if (ADFS_I(inode)->stamped) {
+		/* convert 32-bit seconds to 40-bit centi-seconds */
+		low  = (secs & 255) * 100;
+		high = (secs / 256) * 100 + (low >> 8) + 0x336e996a;
+
+		ADFS_I(inode)->loadaddr = (high >> 24) |
+				(ADFS_I(inode)->loadaddr & ~0xff);
+		ADFS_I(inode)->execaddr = (low & 255) | (high << 8);
+	}
+}
+
+/*
+ * Fill in the inode information from the object information.
+ *
+ * Note that this is an inode-less filesystem, so we can't use the inode
+ * number to reference the metadata on the media.  Instead, we use the
+ * inode number to hold the object ID, which in turn will tell us where
+ * the data is held.  We also save the parent object ID, and with these
+ * two, we can locate the metadata.
+ *
+ * This does mean that we rely on an objects parent remaining the same at
+ * all times - we cannot cope with a cross-directory rename (yet).
+ */
+struct inode *
+adfs_iget(struct super_block *sb, struct object_info *obj)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		goto out;
+
+	inode->i_uid	 = ADFS_SB(sb)->s_uid;
+	inode->i_gid	 = ADFS_SB(sb)->s_gid;
+	inode->i_ino	 = obj->file_id;
+	inode->i_size	 = obj->size;
+	set_nlink(inode, 2);
+	inode->i_blocks	 = (inode->i_size + sb->s_blocksize - 1) >>
+			    sb->s_blocksize_bits;
+
+	/*
+	 * we need to save the parent directory ID so that
+	 * write_inode can update the directory information
+	 * for this file.  This will need special handling
+	 * for cross-directory renames.
+	 */
+	ADFS_I(inode)->parent_id = obj->parent_id;
+	ADFS_I(inode)->loadaddr  = obj->loadaddr;
+	ADFS_I(inode)->execaddr  = obj->execaddr;
+	ADFS_I(inode)->attr      = obj->attr;
+	ADFS_I(inode)->filetype  = obj->filetype;
+	ADFS_I(inode)->stamped   = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
+
+	inode->i_mode	 = adfs_atts2mode(sb, inode);
+	adfs_adfs2unix_time(&inode->i_mtime, inode);
+	inode->i_atime = inode->i_mtime;
+	inode->i_ctime = inode->i_mtime;
+
+	if (S_ISDIR(inode->i_mode)) {
+		inode->i_op	= &adfs_dir_inode_operations;
+		inode->i_fop	= &adfs_dir_operations;
+	} else if (S_ISREG(inode->i_mode)) {
+		inode->i_op	= &adfs_file_inode_operations;
+		inode->i_fop	= &adfs_file_operations;
+		inode->i_mapping->a_ops = &adfs_aops;
+		ADFS_I(inode)->mmu_private = inode->i_size;
+	}
+
+	insert_inode_hash(inode);
+
+out:
+	return inode;
+}
+
+/*
+ * Validate and convert a changed access mode/time to their ADFS equivalents.
+ * adfs_write_inode will actually write the information back to the directory
+ * later.
+ */
+int
+adfs_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct super_block *sb = inode->i_sb;
+	unsigned int ia_valid = attr->ia_valid;
+	int error;
+	
+	error = inode_change_ok(inode, attr);
+
+	/*
+	 * we can't change the UID or GID of any file -
+	 * we have a global UID/GID in the superblock
+	 */
+	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, ADFS_SB(sb)->s_uid)) ||
+	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, ADFS_SB(sb)->s_gid)))
+		error = -EPERM;
+
+	if (error)
+		goto out;
+
+	/* XXX: this is missing some actual on-disk truncation.. */
+	if (ia_valid & ATTR_SIZE)
+		truncate_setsize(inode, attr->ia_size);
+
+	if (ia_valid & ATTR_MTIME) {
+		inode->i_mtime = attr->ia_mtime;
+		adfs_unix2adfs_time(inode, attr->ia_mtime.tv_sec);
+	}
+	/*
+	 * FIXME: should we make these == to i_mtime since we don't
+	 * have the ability to represent them in our filesystem?
+	 */
+	if (ia_valid & ATTR_ATIME)
+		inode->i_atime = attr->ia_atime;
+	if (ia_valid & ATTR_CTIME)
+		inode->i_ctime = attr->ia_ctime;
+	if (ia_valid & ATTR_MODE) {
+		ADFS_I(inode)->attr = adfs_mode2atts(sb, inode);
+		inode->i_mode = adfs_atts2mode(sb, inode);
+	}
+
+	/*
+	 * FIXME: should we be marking this inode dirty even if
+	 * we don't have any metadata to write back?
+	 */
+	if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE))
+		mark_inode_dirty(inode);
+out:
+	return error;
+}
+
+/*
+ * write an existing inode back to the directory, and therefore the disk.
+ * The adfs-specific inode data has already been updated by
+ * adfs_notify_change()
+ */
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct super_block *sb = inode->i_sb;
+	struct object_info obj;
+	int ret;
+
+	obj.file_id	= inode->i_ino;
+	obj.name_len	= 0;
+	obj.parent_id	= ADFS_I(inode)->parent_id;
+	obj.loadaddr	= ADFS_I(inode)->loadaddr;
+	obj.execaddr	= ADFS_I(inode)->execaddr;
+	obj.attr	= ADFS_I(inode)->attr;
+	obj.size	= inode->i_size;
+
+	ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
+	return ret;
+}
diff --git a/kernel/fs/adfs/map.c b/kernel/fs/adfs/map.c
new file mode 100644
index 000000000..6935f0520
--- /dev/null
+++ b/kernel/fs/adfs/map.c
@@ -0,0 +1,290 @@
+/*
+ *  linux/fs/adfs/map.c
+ *
+ *  Copyright (C) 1997-2002 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/buffer_head.h>
+#include <asm/unaligned.h>
+#include "adfs.h"
+
+/*
+ * The ADFS map is basically a set of sectors.  Each sector is called a
+ * zone which contains a bitstream made up of variable sized fragments.
+ * Each bit refers to a set of bytes in the filesystem, defined by
+ * log2bpmb.  This may be larger or smaller than the sector size, but
+ * the overall size it describes will always be a round number of
+ * sectors.  A fragment id is always idlen bits long.
+ *
+ *  < idlen > <       n        > <1>
+ * +---------+-------//---------+---+
+ * | frag id |  0000....000000  | 1 |
+ * +---------+-------//---------+---+
+ *
+ * The physical disk space used by a fragment is taken from the start of
+ * the fragment id up to and including the '1' bit - ie, idlen + n + 1
+ * bits.
+ *
+ * A fragment id can be repeated multiple times in the whole map for
+ * large or fragmented files.  The first map zone a fragment starts in
+ * is given by fragment id / ids_per_zone - this allows objects to start
+ * from any zone on the disk.
+ *
+ * Free space is described by a linked list of fragments.  Each free
+ * fragment describes free space in the same way as the other fragments,
+ * however, the frag id specifies an offset (in map bits) from the end
+ * of this fragment to the start of the next free fragment.
+ *
+ * Objects stored on the disk are allocated object ids (we use these as
+ * our inode numbers.)  Object ids contain a fragment id and an optional
+ * offset.  This allows a directory fragment to contain small files
+ * associated with that directory.
+ */
+
+/*
+ * For the future...
+ */
+static DEFINE_RWLOCK(adfs_map_lock);
+
+/*
+ * This is fun.  We need to load up to 19 bits from the map at an
+ * arbitrary bit alignment.  (We're limited to 19 bits by F+ version 2).
+ */
+#define GET_FRAG_ID(_map,_start,_idmask)				\
+	({								\
+		unsigned char *_m = _map + (_start >> 3);		\
+		u32 _frag = get_unaligned_le32(_m);			\
+		_frag >>= (_start & 7);					\
+		_frag & _idmask;					\
+	})
+
+/*
+ * return the map bit offset of the fragment frag_id in the zone dm.
+ * Note that the loop is optimised for best asm code - look at the
+ * output of:
+ *  gcc -D__KERNEL__ -O2 -I../../include -o - -S map.c
+ */
+static int
+lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen,
+	    const unsigned int frag_id, unsigned int *offset)
+{
+	const unsigned int mapsize = dm->dm_endbit;
+	const u32 idmask = (1 << idlen) - 1;
+	unsigned char *map = dm->dm_bh->b_data + 4;
+	unsigned int start = dm->dm_startbit;
+	unsigned int mapptr;
+	u32 frag;
+
+	do {
+		frag = GET_FRAG_ID(map, start, idmask);
+		mapptr = start + idlen;
+
+		/*
+		 * find end of fragment
+		 */
+		{
+			__le32 *_map = (__le32 *)map;
+			u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31);
+			while (v == 0) {
+				mapptr = (mapptr & ~31) + 32;
+				if (mapptr >= mapsize)
+					goto error;
+				v = le32_to_cpu(_map[mapptr >> 5]);
+			}
+
+			mapptr += 1 + ffz(~v);
+		}
+
+		if (frag == frag_id)
+			goto found;
+again:
+		start = mapptr;
+	} while (mapptr < mapsize);
+	return -1;
+
+error:
+	printk(KERN_ERR "adfs: oversized fragment 0x%x at 0x%x-0x%x\n",
+		frag, start, mapptr);
+	return -1;
+
+found:
+	{
+		int length = mapptr - start;
+		if (*offset >= length) {
+			*offset -= length;
+			goto again;
+		}
+	}
+	return start + *offset;
+}
+
+/*
+ * Scan the free space map, for this zone, calculating the total
+ * number of map bits in each free space fragment.
+ *
+ * Note: idmask is limited to 15 bits [3.2]
+ */
+static unsigned int
+scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm)
+{
+	const unsigned int mapsize = dm->dm_endbit + 32;
+	const unsigned int idlen  = asb->s_idlen;
+	const unsigned int frag_idlen = idlen <= 15 ? idlen : 15;
+	const u32 idmask = (1 << frag_idlen) - 1;
+	unsigned char *map = dm->dm_bh->b_data;
+	unsigned int start = 8, mapptr;
+	u32 frag;
+	unsigned long total = 0;
+
+	/*
+	 * get fragment id
+	 */
+	frag = GET_FRAG_ID(map, start, idmask);
+
+	/*
+	 * If the freelink is null, then no free fragments
+	 * exist in this zone.
+	 */
+	if (frag == 0)
+		return 0;
+
+	do {
+		start += frag;
+
+		/*
+		 * get fragment id
+		 */
+		frag = GET_FRAG_ID(map, start, idmask);
+		mapptr = start + idlen;
+
+		/*
+		 * find end of fragment
+		 */
+		{
+			__le32 *_map = (__le32 *)map;
+			u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31);
+			while (v == 0) {
+				mapptr = (mapptr & ~31) + 32;
+				if (mapptr >= mapsize)
+					goto error;
+				v = le32_to_cpu(_map[mapptr >> 5]);
+			}
+
+			mapptr += 1 + ffz(~v);
+		}
+
+		total += mapptr - start;
+	} while (frag >= idlen + 1);
+
+	if (frag != 0)
+		printk(KERN_ERR "adfs: undersized free fragment\n");
+
+	return total;
+error:
+	printk(KERN_ERR "adfs: oversized free fragment\n");
+	return 0;
+}
+
+static int
+scan_map(struct adfs_sb_info *asb, unsigned int zone,
+	 const unsigned int frag_id, unsigned int mapoff)
+{
+	const unsigned int idlen = asb->s_idlen;
+	struct adfs_discmap *dm, *dm_end;
+	int result;
+
+	dm	= asb->s_map + zone;
+	zone	= asb->s_map_size;
+	dm_end	= asb->s_map + zone;
+
+	do {
+		result = lookup_zone(dm, idlen, frag_id, &mapoff);
+
+		if (result != -1)
+			goto found;
+
+		dm ++;
+		if (dm == dm_end)
+			dm = asb->s_map;
+	} while (--zone > 0);
+
+	return -1;
+found:
+	result -= dm->dm_startbit;
+	result += dm->dm_startblk;
+
+	return result;
+}
+
+/*
+ * calculate the amount of free blocks in the map.
+ *
+ *              n=1
+ *  total_free = E(free_in_zone_n)
+ *              nzones
+ */
+unsigned int
+adfs_map_free(struct super_block *sb)
+{
+	struct adfs_sb_info *asb = ADFS_SB(sb);
+	struct adfs_discmap *dm;
+	unsigned int total = 0;
+	unsigned int zone;
+
+	dm   = asb->s_map;
+	zone = asb->s_map_size;
+
+	do {
+		total += scan_free_map(asb, dm++);
+	} while (--zone > 0);
+
+	return signed_asl(total, asb->s_map2blk);
+}
+
+int
+adfs_map_lookup(struct super_block *sb, unsigned int frag_id,
+		unsigned int offset)
+{
+	struct adfs_sb_info *asb = ADFS_SB(sb);
+	unsigned int zone, mapoff;
+	int result;
+
+	/*
+	 * map & root fragment is special - it starts in the center of the
+	 * disk.  The other fragments start at zone (frag / ids_per_zone)
+	 */
+	if (frag_id == ADFS_ROOT_FRAG)
+		zone = asb->s_map_size >> 1;
+	else
+		zone = frag_id / asb->s_ids_per_zone;
+
+	if (zone >= asb->s_map_size)
+		goto bad_fragment;
+
+	/* Convert sector offset to map offset */
+	mapoff = signed_asl(offset, -asb->s_map2blk);
+
+	read_lock(&adfs_map_lock);
+	result = scan_map(asb, zone, frag_id, mapoff);
+	read_unlock(&adfs_map_lock);
+
+	if (result > 0) {
+		unsigned int secoff;
+
+		/* Calculate sector offset into map block */
+		secoff = offset - signed_asl(mapoff, asb->s_map2blk);
+		return secoff + signed_asl(result, asb->s_map2blk);
+	}
+
+	adfs_error(sb, "fragment 0x%04x at offset %d not found in map",
+		   frag_id, offset);
+	return 0;
+
+bad_fragment:
+	adfs_error(sb, "invalid fragment 0x%04x (zone = %d, max = %d)",
+		   frag_id, zone, asb->s_map_size);
+	return 0;
+}
diff --git a/kernel/fs/adfs/super.c b/kernel/fs/adfs/super.c
new file mode 100644
index 000000000..a19c31d3f
--- /dev/null
+++ b/kernel/fs/adfs/super.c
@@ -0,0 +1,562 @@
+/*
+ *  linux/fs/adfs/super.c
+ *
+ *  Copyright (C) 1997-1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/buffer_head.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/user_namespace.h>
+#include "adfs.h"
+#include "dir_f.h"
+#include "dir_fplus.h"
+
+#define ADFS_DEFAULT_OWNER_MASK S_IRWXU
+#define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO)
+
+void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...)
+{
+	char error_buf[128];
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
+	va_end(args);
+
+	printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %s\n",
+		sb->s_id, function ? ": " : "",
+		function ? function : "", error_buf);
+}
+
+static int adfs_checkdiscrecord(struct adfs_discrecord *dr)
+{
+	int i;
+
+	/* sector size must be 256, 512 or 1024 bytes */
+	if (dr->log2secsize != 8 &&
+	    dr->log2secsize != 9 &&
+	    dr->log2secsize != 10)
+		return 1;
+
+	/* idlen must be at least log2secsize + 3 */
+	if (dr->idlen < dr->log2secsize + 3)
+		return 1;
+
+	/* we cannot have such a large disc that we
+	 * are unable to represent sector offsets in
+	 * 32 bits.  This works out at 2.0 TB.
+	 */
+	if (le32_to_cpu(dr->disc_size_high) >> dr->log2secsize)
+		return 1;
+
+	/* idlen must be no greater than 19 v2 [1.0] */
+	if (dr->idlen > 19)
+		return 1;
+
+	/* reserved bytes should be zero */
+	for (i = 0; i < sizeof(dr->unused52); i++)
+		if (dr->unused52[i] != 0)
+			return 1;
+
+	return 0;
+}
+
+static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map)
+{
+	unsigned int v0, v1, v2, v3;
+	int i;
+
+	v0 = v1 = v2 = v3 = 0;
+	for (i = sb->s_blocksize - 4; i; i -= 4) {
+		v0 += map[i]     + (v3 >> 8);
+		v3 &= 0xff;
+		v1 += map[i + 1] + (v0 >> 8);
+		v0 &= 0xff;
+		v2 += map[i + 2] + (v1 >> 8);
+		v1 &= 0xff;
+		v3 += map[i + 3] + (v2 >> 8);
+		v2 &= 0xff;
+	}
+	v0 +=           v3 >> 8;
+	v1 += map[1] + (v0 >> 8);
+	v2 += map[2] + (v1 >> 8);
+	v3 += map[3] + (v2 >> 8);
+
+	return v0 ^ v1 ^ v2 ^ v3;
+}
+
+static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm)
+{
+	unsigned char crosscheck = 0, zonecheck = 1;
+	int i;
+
+	for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) {
+		unsigned char *map;
+
+		map = dm[i].dm_bh->b_data;
+
+		if (adfs_calczonecheck(sb, map) != map[0]) {
+			adfs_error(sb, "zone %d fails zonecheck", i);
+			zonecheck = 0;
+		}
+		crosscheck ^= map[3];
+	}
+	if (crosscheck != 0xff)
+		adfs_error(sb, "crosscheck != 0xff");
+	return crosscheck == 0xff && zonecheck;
+}
+
+static void adfs_put_super(struct super_block *sb)
+{
+	int i;
+	struct adfs_sb_info *asb = ADFS_SB(sb);
+
+	for (i = 0; i < asb->s_map_size; i++)
+		brelse(asb->s_map[i].dm_bh);
+	kfree(asb->s_map);
+	kfree_rcu(asb, rcu);
+}
+
+static int adfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct adfs_sb_info *asb = ADFS_SB(root->d_sb);
+
+	if (!uid_eq(asb->s_uid, GLOBAL_ROOT_UID))
+		seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, asb->s_uid));
+	if (!gid_eq(asb->s_gid, GLOBAL_ROOT_GID))
+		seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, asb->s_gid));
+	if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK)
+		seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
+	if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
+		seq_printf(seq, ",othmask=%o", asb->s_other_mask);
+	if (asb->s_ftsuffix != 0)
+		seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix);
+
+	return 0;
+}
+
+enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
+
+static const match_table_t tokens = {
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_ownmask, "ownmask=%o"},
+	{Opt_othmask, "othmask=%o"},
+	{Opt_ftsuffix, "ftsuffix=%u"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(struct super_block *sb, char *options)
+{
+	char *p;
+	struct adfs_sb_info *asb = ADFS_SB(sb);
+	int option;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(args, &option))
+				return -EINVAL;
+			asb->s_uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(asb->s_uid))
+				return -EINVAL;
+			break;
+		case Opt_gid:
+			if (match_int(args, &option))
+				return -EINVAL;
+			asb->s_gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(asb->s_gid))
+				return -EINVAL;
+			break;
+		case Opt_ownmask:
+			if (match_octal(args, &option))
+				return -EINVAL;
+			asb->s_owner_mask = option;
+			break;
+		case Opt_othmask:
+			if (match_octal(args, &option))
+				return -EINVAL;
+			asb->s_other_mask = option;
+			break;
+		case Opt_ftsuffix:
+			if (match_int(args, &option))
+				return -EINVAL;
+			asb->s_ftsuffix = option;
+			break;
+		default:
+			printk("ADFS-fs: unrecognised mount option \"%s\" "
+					"or missing value\n", p);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int adfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_NODIRATIME;
+	return parse_options(sb, data);
+}
+
+static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type    = ADFS_SUPER_MAGIC;
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = sbi->s_size;
+	buf->f_files   = sbi->s_ids_per_zone * sbi->s_map_size;
+	buf->f_bavail  =
+	buf->f_bfree   = adfs_map_free(sb);
+	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+static struct kmem_cache *adfs_inode_cachep;
+
+static struct inode *adfs_alloc_inode(struct super_block *sb)
+{
+	struct adfs_inode_info *ei;
+	ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void adfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
+}
+
+static void adfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, adfs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
+					     sizeof(struct adfs_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (adfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(adfs_inode_cachep);
+}
+
+static const struct super_operations adfs_sops = {
+	.alloc_inode	= adfs_alloc_inode,
+	.destroy_inode	= adfs_destroy_inode,
+	.write_inode	= adfs_write_inode,
+	.put_super	= adfs_put_super,
+	.statfs		= adfs_statfs,
+	.remount_fs	= adfs_remount,
+	.show_options	= adfs_show_options,
+};
+
+static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr)
+{
+	struct adfs_discmap *dm;
+	unsigned int map_addr, zone_size, nzones;
+	int i, zone;
+	struct adfs_sb_info *asb = ADFS_SB(sb);
+
+	nzones    = asb->s_map_size;
+	zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare);
+	map_addr  = (nzones >> 1) * zone_size -
+		     ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0);
+	map_addr  = signed_asl(map_addr, asb->s_map2blk);
+
+	asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1);
+
+	dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL);
+	if (dm == NULL) {
+		adfs_error(sb, "not enough memory");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (zone = 0; zone < nzones; zone++, map_addr++) {
+		dm[zone].dm_startbit = 0;
+		dm[zone].dm_endbit   = zone_size;
+		dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS;
+		dm[zone].dm_bh       = sb_bread(sb, map_addr);
+
+		if (!dm[zone].dm_bh) {
+			adfs_error(sb, "unable to read map");
+			goto error_free;
+		}
+	}
+
+	/* adjust the limits for the first and last map zones */
+	i = zone - 1;
+	dm[0].dm_startblk = 0;
+	dm[0].dm_startbit = ADFS_DR_SIZE_BITS;
+	dm[i].dm_endbit   = (le32_to_cpu(dr->disc_size_high) << (32 - dr->log2bpmb)) +
+			    (le32_to_cpu(dr->disc_size) >> dr->log2bpmb) +
+			    (ADFS_DR_SIZE_BITS - i * zone_size);
+
+	if (adfs_checkmap(sb, dm))
+		return dm;
+
+	adfs_error(sb, "map corrupted");
+
+error_free:
+	while (--zone >= 0)
+		brelse(dm[zone].dm_bh);
+
+	kfree(dm);
+	return ERR_PTR(-EIO);
+}
+
+static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits)
+{
+	unsigned long discsize;
+
+	discsize  = le32_to_cpu(dr->disc_size_high) << (32 - block_bits);
+	discsize |= le32_to_cpu(dr->disc_size) >> block_bits;
+
+	return discsize;
+}
+
+static int adfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct adfs_discrecord *dr;
+	struct buffer_head *bh;
+	struct object_info root_obj;
+	unsigned char *b_data;
+	struct adfs_sb_info *asb;
+	struct inode *root;
+	int ret = -EINVAL;
+
+	sb->s_flags |= MS_NODIRATIME;
+
+	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
+	if (!asb)
+		return -ENOMEM;
+	sb->s_fs_info = asb;
+
+	/* set default options */
+	asb->s_uid = GLOBAL_ROOT_UID;
+	asb->s_gid = GLOBAL_ROOT_GID;
+	asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
+	asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
+	asb->s_ftsuffix = 0;
+
+	if (parse_options(sb, data))
+		goto error;
+
+	sb_set_blocksize(sb, BLOCK_SIZE);
+	if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) {
+		adfs_error(sb, "unable to read superblock");
+		ret = -EIO;
+		goto error;
+	}
+
+	b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE);
+
+	if (adfs_checkbblk(b_data)) {
+		if (!silent)
+			printk("VFS: Can't find an adfs filesystem on dev "
+				"%s.\n", sb->s_id);
+		ret = -EINVAL;
+		goto error_free_bh;
+	}
+
+	dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
+
+	/*
+	 * Do some sanity checks on the ADFS disc record
+	 */
+	if (adfs_checkdiscrecord(dr)) {
+		if (!silent)
+			printk("VPS: Can't find an adfs filesystem on dev "
+				"%s.\n", sb->s_id);
+		ret = -EINVAL;
+		goto error_free_bh;
+	}
+
+	brelse(bh);
+	if (sb_set_blocksize(sb, 1 << dr->log2secsize)) {
+		bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize);
+		if (!bh) {
+			adfs_error(sb, "couldn't read superblock on "
+				"2nd try.");
+			ret = -EIO;
+			goto error;
+		}
+		b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize);
+		if (adfs_checkbblk(b_data)) {
+			adfs_error(sb, "disc record mismatch, very weird!");
+			ret = -EINVAL;
+			goto error_free_bh;
+		}
+		dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
+	} else {
+		if (!silent)
+			printk(KERN_ERR "VFS: Unsupported blocksize on dev "
+				"%s.\n", sb->s_id);
+		ret = -EINVAL;
+		goto error;
+	}
+
+	/*
+	 * blocksize on this device should now be set to the ADFS log2secsize
+	 */
+
+	sb->s_magic		= ADFS_SUPER_MAGIC;
+	asb->s_idlen		= dr->idlen;
+	asb->s_map_size		= dr->nzones | (dr->nzones_high << 8);
+	asb->s_map2blk		= dr->log2bpmb - dr->log2secsize;
+	asb->s_size    		= adfs_discsize(dr, sb->s_blocksize_bits);
+	asb->s_version 		= dr->format_version;
+	asb->s_log2sharesize	= dr->log2sharesize;
+
+	asb->s_map = adfs_read_map(sb, dr);
+	if (IS_ERR(asb->s_map)) {
+		ret =  PTR_ERR(asb->s_map);
+		goto error_free_bh;
+	}
+
+	brelse(bh);
+
+	/*
+	 * set up enough so that we can read an inode
+	 */
+	sb->s_op = &adfs_sops;
+
+	dr = (struct adfs_discrecord *)(asb->s_map[0].dm_bh->b_data + 4);
+
+	root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root);
+	root_obj.name_len  = 0;
+	/* Set root object date as 01 Jan 1987 00:00:00 */
+	root_obj.loadaddr  = 0xfff0003f;
+	root_obj.execaddr  = 0xec22c000;
+	root_obj.size	   = ADFS_NEWDIR_SIZE;
+	root_obj.attr	   = ADFS_NDA_DIRECTORY   | ADFS_NDA_OWNER_READ |
+			     ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ;
+	root_obj.filetype  = -1;
+
+	/*
+	 * If this is a F+ disk with variable length directories,
+	 * get the root_size from the disc record.
+	 */
+	if (asb->s_version) {
+		root_obj.size = le32_to_cpu(dr->root_size);
+		asb->s_dir     = &adfs_fplus_dir_ops;
+		asb->s_namelen = ADFS_FPLUS_NAME_LEN;
+	} else {
+		asb->s_dir     = &adfs_f_dir_ops;
+		asb->s_namelen = ADFS_F_NAME_LEN;
+	}
+	/*
+	 * ,xyz hex filetype suffix may be added by driver
+	 * to files that have valid RISC OS filetype
+	 */
+	if (asb->s_ftsuffix)
+		asb->s_namelen += 4;
+
+	sb->s_d_op = &adfs_dentry_operations;
+	root = adfs_iget(sb, &root_obj);
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		int i;
+		for (i = 0; i < asb->s_map_size; i++)
+			brelse(asb->s_map[i].dm_bh);
+		kfree(asb->s_map);
+		adfs_error(sb, "get root inode failed\n");
+		ret = -EIO;
+		goto error;
+	}
+	return 0;
+
+error_free_bh:
+	brelse(bh);
+error:
+	sb->s_fs_info = NULL;
+	kfree(asb);
+	return ret;
+}
+
+static struct dentry *adfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+}
+
+static struct file_system_type adfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "adfs",
+	.mount		= adfs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("adfs");
+
+static int __init init_adfs_fs(void)
+{
+	int err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&adfs_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	return err;
+}
+
+static void __exit exit_adfs_fs(void)
+{
+	unregister_filesystem(&adfs_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_adfs_fs)
+module_exit(exit_adfs_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/affs/Changes b/kernel/fs/affs/Changes
new file mode 100644
index 000000000..b41c2c979
--- /dev/null
+++ b/kernel/fs/affs/Changes
@@ -0,0 +1,343 @@
+(Note: I consider version numbers as cheap. That means
+that I do not like numbers like 0.1 and the like for
+things that can be used since quite some time. But
+then, 3.1 doesn't mean 'perfectly stable', too.)
+
+Known bugs:
+-----------
+
+- Doesn't work on the alpha. The only 64/32-bit
+  problem that I'm aware of (pointer/int conversion
+  in readdir()) gives compiler warnings but is
+  apparently not causing the failure, as directory
+  reads basically work (but all files are of size 0).
+  Alas, I've got no alpha to debug. :-(
+
+- The partition checker (drivers/block/genhd.c)
+  doesn't work with devices which have 256 byte
+  blocks (some very old SCSI drives). 
+
+- The feature to automatically make the fs clean
+  might leave a trashed file system with the
+  bitmap flag set valid.
+
+- When a file is truncated to a size that is not
+  a multiple of the blocksize, the rest of the
+  last allocated block is not cleared. Well,
+  this fs never claimed to be Posix conformant.
+
+Please direct bug reports to: zippel@linux-m68k.org
+
+Version 3.20
+------------
+- kill kernel lock
+- fix for a possible bitmap corruption
+
+Version 3.19
+------------
+
+- sizeof changes from Kernel Janitor Project
+- several bug fixes found with fsx
+
+Version 3.18
+------------
+
+- change to global min macro + warning fixes
+- add module tags
+
+Version 3.17
+------------
+
+- locking fixes
+- wrong sign in __affs_hash_dentry
+- remove unnecessary check in affs_new_inode
+- enable international mode for dircache fs
+
+Version 3.16
+------------
+
+- use mark_buffer_dirty_inode instead of mark_buffer_dirty.
+- introduce affs_lock_{link|dir|ext}.
+
+Version 3.15
+------------
+
+- disable link to directories until we can properly support them.
+- locking fixes for link creation/removal.
+
+Version 3.14
+------------
+
+- correctly cut off long file names for compares
+- correctly initialize s_last_bmap
+
+Version 3.13
+------------
+
+Major cleanup for 2.4 [Roman Zippel]
+- new extended block handling
+- new bitmap allocation functions
+- locking should be safe for the future
+- cleanup of some interfaces
+
+Version 3.12
+------------
+
+more 2.4 fixes: [Roman Zippel]
+- s_lock changes
+- increased getblock mess
+- clear meta blocks 
+
+Version 3.11
+------------
+
+- Converted to use 2.3.x page cache [Dave Jones]
+- Corruption in truncate() bugfix [Ken Tyler <kent@werple.net.au>]
+
+Version 3.10
+------------
+
+- Changed partition checker to allow devices
+  with physical blocks != 512 bytes.
+
+- The partition checker now also ignores the
+  word at 0xd0 that Windows likes to write to.
+
+Version 3.9
+-----------
+
+- Moved cleanup from release_file() to put_inode().
+  This makes the first one obsolete.
+
+- truncate() zeroes the unused remainder of a
+  partially used last block when a file is truncated.
+  It also marks the inode dirty now (which is not
+  really necessary as notify_change() will do
+  it anyway).
+
+- Added a few comments, fixed some typos (and
+  introduced some new ones), made the debug messages
+  more consistent. Changed a bad example in the
+  doc file (affs.txt).
+
+- Sets the NOEXEC flag in read_super() for old file
+  systems, since you can't run programs on them.
+
+Version 3.8
+-----------
+Bill Hawes kindly reviewed the affs and sent me the
+patches he did. They're marked (BH). Thanks, Bill!
+
+- Cleanup of error handling in read_super().
+  Didn't release all resources in case of an
+  error. (BH)
+
+- put_inode() releases the ext cache only if it's
+  no longer needed. (BH)
+
+- One set of dentry callbacks is enough. (BH)
+
+- Cleanup of error handling in namei.c. (BH)
+
+- Cleanup of error handling in file.c. (BH)
+
+- The original blocksize of the device is
+  restored when the fs is unmounted. (BH)
+
+- getblock() did not invalidate the key cache
+  when it allocated a new block.
+
+- Removed some unnecessary locks as Bill
+  suggested.
+
+- Simplified match_name(), changed all hashing
+  and case insensitive name comparisons to use
+  uppercase. This makes the tolower() routines
+  obsolete.
+
+- Added mount option 'mufs' to force muFS
+  uid/gid interpretation.
+
+- File mode changes were not updated on disk.
+  This was fixed before, but somehow got lost.
+
+Version 3.7
+-----------
+
+- Added dentry callbacks to allow the dcache to
+  operate case insensitive and length ignorant
+  like the affs itself.
+
+- getblock() didn't update the lastblock field in the
+  inode if the fs was not an OFS. This bug only shows
+  up if a file was enlarged via truncate() and there
+  was not enough space.
+
+- Remove some more superfluous code left over from
+  the old link days ...
+
+- Fixed some oversights which were in patch 2.1.78.
+
+- Fixed a few typos.
+
+Version 3.6
+-----------
+
+- dentry changes. (Thanks to Jes Sorensen for his help.)
+
+- Fixed bug in balloc(): Superblock was not set dirty after
+  the bitmap was changed, so the bitmap wasn't sync'd.
+
+- Fixed nasty bug in find_new_zone(): If the current
+  zone number was zero, the loop didn't terminate,
+  causing a solid lock-up.
+
+- Removed support for old-style directory reads.
+
+- Fixed bug in add_entry(): When doing a sorted insert,
+  the pointer to the next entry in the hash chain wasn't
+  correctly byte-swapped. Since most of the users of the
+  affs use it on a 68k, they didn't notice. But why did
+  I not find this during my tests?
+
+- Fixed some oversights (version wasn't updated on some
+  directory changes).
+
+- Handling of hard links rewritten. To the VFS
+  they appear now as normal Unix links. They are
+  now resolved only once in lookup(). The backside
+  is that unlink(), rename() and rmdir() have to
+  be smart about them, but the result is worth the
+  effort. This also led to some code cleanup.
+
+- Changed name type to unsigned char; the test for
+  invalid filenames didn't work correctly.
+  (Thanks to Michael Krause for pointing at this.)
+
+- Changed mapping of executable flag.
+
+- Changed all network byte-order macros to the
+  recommended ones.
+
+- Added a remount function, so attempts to remount
+  a dircache filesystem or one with errors read/write
+  can be trapped. Previously, ro remounts didn't
+  flush the super block, and rw remounts didn't
+  create allocation zones ...
+
+- Call shrink_dcache_parent() in rmdir().
+  (Thanks to Bill Hawes.)
+
+- Permission checks in unlink().
+
+- Allow mounting of volumes with superfluous
+  bitmap pointers read only, also allows them
+  to be remounted read/write.
+
+- Owner/Group defaults now to the fs user (i.e.
+  the one that mounted it) instead of root. This
+  obsoletes the mount options uid and gid.
+
+- Argument to volume option could overflow the
+  name buffer. It is now silently truncated to
+  30 characters. (Damn it! This kind of bug
+  is too embarrassing.)
+
+- Split inode.c into 2 files, the superblock
+  routines desperately wanted their own file.
+
+- truncate() didn't allocate an extension block
+  cache. If a file was extended by means of
+  truncate(), this led to an Oops.
+
+- fsuser is now checked last.
+
+- rename() will not ignore changes in filename
+  casing any more (though mv(1) still won't allow
+  you to do "mv oldname OldName").
+
+Version 3.5
+-----------
+
+- Extension block caches are now allocated on
+  demand instead of when a file is opened, as
+  files can be read and written without opening
+  them (e. g. the loopback device does this).
+
+- Removed an unused function.
+
+Version 3.4
+-----------
+
+- Hash chains are now sorted by block numbers.
+  (Thanks to Kars de Jong for finding this.)
+- Removed all unnecessary external symbols.
+
+Version 3.3
+-----------
+
+- Tried to make all types 'correct' and consistent.
+- Errors and warnings are now reported via a
+  function. They are all prefixed by a severity
+  and have the same appearance:
+    "AFFS: <function>: <error message>"
+  (There's one exception to this, as in that function
+  is no pointer to the super block available.)
+- The filesystem is remounted read-only after an
+  error.
+- The names of newly created filesystem objects are
+  now checked for validity.
+- Minor cleanups in comments.
+- Added this Changes file. At last!
+
+Version 3.2
+-----------
+
+- Extension block cache: Reading/writing of huge files
+  (several MB) is much faster (of course the added
+  overhead slows down opening, but this is hardly
+  noticeable).
+- The same get_block()-routine can now be used for
+  both OFS and FFS.
+- The super block is now searched in the block that
+  was calculated and in the one following. This
+  should remedy the round-off error introduced by
+  the 1-k blocks that Linux uses.
+- Minor changes to adhere to the new VFS interface.
+- The number of used blocks is now also calculated
+  if the filesystem is mounted read-only.
+- Prefixed some constants with AFFS_ to avoid name
+  clashes.
+- Removed 'EXPERIMENTAL' status.
+
+Version 3.1
+-----------
+
+- Fixed a nasty bug which didn't allow read-only
+  mounts.
+- Allow dir-cache filesystems to be mounted
+  read only.
+- OFS support.
+- Several other changes I just cannot remember
+  any more.
+
+Version 3.0
+-----------
+
+- Almost complete rewrite for the new VFS
+  interface in Linux 1.3.
+- Write support.
+- Support for hard and symbolic links.
+- Lots of things I remember even less ...
+
+Version 2.0
+-----------
+
+- Fixed a few things to get it compiled.
+- Automatic root block calculation.
+- Partition checker for genhd.c
+
+========================================
+
+Let's just call Ray Burr's original affs
+'Version 1.0'.
diff --git a/kernel/fs/affs/Kconfig b/kernel/fs/affs/Kconfig
new file mode 100644
index 000000000..a04d9e848
--- /dev/null
+++ b/kernel/fs/affs/Kconfig
@@ -0,0 +1,21 @@
+config AFFS_FS
+	tristate "Amiga FFS file system support"
+	depends on BLOCK
+	help
+	  The Fast File System (FFS) is the common file system used on hard
+	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
+	  if you want to be able to read and write files from and to an Amiga
+	  FFS partition on your hard drive.  Amiga floppies however cannot be
+	  read with this driver due to an incompatibility of the floppy
+	  controller used in an Amiga and the standard floppy controller in
+	  PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
+	  and <file:fs/affs/Changes>.
+
+	  With this driver you can also mount disk files used by Bernd
+	  Schmidt's Un*X Amiga Emulator
+	  (<http://www.freiburg.linux.de/~uae/>).
+	  If you want to do this, you will also need to say Y or M to "Loop
+	  device support", above.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called affs.  If unsure, say N.
diff --git a/kernel/fs/affs/Makefile b/kernel/fs/affs/Makefile
new file mode 100644
index 000000000..3988b4a78
--- /dev/null
+++ b/kernel/fs/affs/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux affs filesystem routines.
+#
+
+#ccflags-y := -DDEBUG=1
+
+obj-$(CONFIG_AFFS_FS) += affs.o
+
+affs-objs := super.o namei.o inode.o file.o dir.o amigaffs.o bitmap.o symlink.o
diff --git a/kernel/fs/affs/affs.h b/kernel/fs/affs/affs.h
new file mode 100644
index 000000000..cffe8370f
--- /dev/null
+++ b/kernel/fs/affs/affs.h
@@ -0,0 +1,314 @@
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/amigaffs.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+/* Ugly macros make the code more pretty. */
+
+#define GET_END_PTR(st,p,sz)		 ((st *)((char *)(p)+((sz)-sizeof(st))))
+#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
+#define AFFS_BLOCK(sb, bh, blk)		(AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
+
+#define AFFS_HEAD(bh)		((struct affs_head *)(bh)->b_data)
+#define AFFS_TAIL(sb, bh)	((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
+#define AFFS_ROOT_HEAD(bh)	((struct affs_root_head *)(bh)->b_data)
+#define AFFS_ROOT_TAIL(sb, bh)	((struct affs_root_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_root_tail)))
+#define AFFS_DATA_HEAD(bh)	((struct affs_data_head *)(bh)->b_data)
+#define AFFS_DATA(bh)		(((struct affs_data_head *)(bh)->b_data)->data)
+
+#define AFFS_CACHE_SIZE		PAGE_SIZE
+
+#define AFFS_LC_SIZE		(AFFS_CACHE_SIZE/sizeof(u32)/2)
+#define AFFS_AC_SIZE		(AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
+#define AFFS_AC_MASK		(AFFS_AC_SIZE-1)
+
+#define AFFSNAMEMAX 30U
+
+struct affs_ext_key {
+	u32	ext;				/* idx of the extended block */
+	u32	key;				/* block number */
+};
+
+/*
+ * affs fs inode data in memory
+ */
+struct affs_inode_info {
+	atomic_t i_opencnt;
+	struct semaphore i_link_lock;		/* Protects internal inode access. */
+	struct semaphore i_ext_lock;		/* Protects internal inode access. */
+#define i_hash_lock i_ext_lock
+	u32	 i_blkcnt;			/* block count */
+	u32	 i_extcnt;			/* extended block count */
+	u32	*i_lc;				/* linear cache of extended blocks */
+	u32	 i_lc_size;
+	u32	 i_lc_shift;
+	u32	 i_lc_mask;
+	struct affs_ext_key *i_ac;		/* associative cache of extended blocks */
+	u32	 i_ext_last;			/* last accessed extended block */
+	struct buffer_head *i_ext_bh;		/* bh of last extended block */
+	loff_t	 mmu_private;
+	u32	 i_protect;			/* unused attribute bits */
+	u32	 i_lastalloc;			/* last allocated block */
+	int	 i_pa_cnt;			/* number of preallocated blocks */
+	struct inode vfs_inode;
+};
+
+/* short cut to get to the affs specific inode data */
+static inline struct affs_inode_info *AFFS_I(struct inode *inode)
+{
+	return list_entry(inode, struct affs_inode_info, vfs_inode);
+}
+
+/*
+ * super-block data in memory
+ *
+ * Block numbers are adjusted for their actual size
+ *
+ */
+
+struct affs_bm_info {
+	u32 bm_key;			/* Disk block number */
+	u32 bm_free;			/* Free blocks in here */
+};
+
+struct affs_sb_info {
+	int s_partition_size;		/* Partition size in blocks. */
+	int s_reserved;			/* Number of reserved blocks. */
+	//u32 s_blksize;			/* Initial device blksize */
+	u32 s_data_blksize;		/* size of the data block w/o header */
+	u32 s_root_block;		/* FFS root block number. */
+	int s_hashsize;			/* Size of hash table. */
+	unsigned long s_flags;		/* See below. */
+	kuid_t s_uid;			/* uid to override */
+	kgid_t s_gid;			/* gid to override */
+	umode_t s_mode;			/* mode to override */
+	struct buffer_head *s_root_bh;	/* Cached root block. */
+	struct mutex s_bmlock;		/* Protects bitmap access. */
+	struct affs_bm_info *s_bitmap;	/* Bitmap infos. */
+	u32 s_bmap_count;		/* # of bitmap blocks. */
+	u32 s_bmap_bits;		/* # of bits in one bitmap blocks */
+	u32 s_last_bmap;
+	struct buffer_head *s_bmap_bh;
+	char *s_prefix;			/* Prefix for volumes and assigns. */
+	char s_volume[32];		/* Volume prefix for absolute symlinks. */
+	spinlock_t symlink_lock;	/* protects the previous two */
+	struct super_block *sb;		/* the VFS superblock object */
+	int work_queued;		/* non-zero delayed work is queued */
+	struct delayed_work sb_work;	/* superblock flush delayed work */
+	spinlock_t work_lock;		/* protects sb_work and work_queued */
+};
+
+#define AFFS_MOUNT_SF_INTL		0x0001 /* International filesystem. */
+#define AFFS_MOUNT_SF_BM_VALID		0x0002 /* Bitmap is valid. */
+#define AFFS_MOUNT_SF_IMMUTABLE		0x0004 /* Protection bits cannot be changed */
+#define AFFS_MOUNT_SF_QUIET		0x0008 /* chmod errors will be not reported */
+#define AFFS_MOUNT_SF_SETUID		0x0010 /* Ignore Amiga uid */
+#define AFFS_MOUNT_SF_SETGID		0x0020 /* Ignore Amiga gid */
+#define AFFS_MOUNT_SF_SETMODE		0x0040 /* Ignore Amiga protection bits */
+#define AFFS_MOUNT_SF_MUFS		0x0100 /* Use MUFS uid/gid mapping */
+#define AFFS_MOUNT_SF_OFS		0x0200 /* Old filesystem */
+#define AFFS_MOUNT_SF_PREFIX		0x0400 /* Buffer for prefix is allocated */
+#define AFFS_MOUNT_SF_VERBOSE		0x0800 /* Talk about fs when mounting */
+#define AFFS_MOUNT_SF_NO_TRUNCATE	0x1000 /* Don't truncate filenames */
+
+#define affs_clear_opt(o, opt)    (o &= ~AFFS_MOUNT_##opt)
+#define affs_set_opt(o, opt)      (o |= AFFS_MOUNT_##opt)
+#define affs_test_opt(o, opt)     ((o) & AFFS_MOUNT_##opt)
+
+/* short cut to get to the affs specific sb data */
+static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+void affs_mark_sb_dirty(struct super_block *sb);
+
+/* amigaffs.c */
+
+extern int	affs_insert_hash(struct inode *inode, struct buffer_head *bh);
+extern int	affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh);
+extern int	affs_remove_header(struct dentry *dentry);
+extern u32	affs_checksum_block(struct super_block *sb, struct buffer_head *bh);
+extern void	affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
+extern void	secs_to_datestamp(time_t secs, struct affs_date *ds);
+extern umode_t	prot_to_mode(u32 prot);
+extern void	mode_to_prot(struct inode *inode);
+__printf(3, 4)
+extern void	affs_error(struct super_block *sb, const char *function,
+			   const char *fmt, ...);
+__printf(3, 4)
+extern void	affs_warning(struct super_block *sb, const char *function,
+			     const char *fmt, ...);
+extern bool	affs_nofilenametruncate(const struct dentry *dentry);
+extern int	affs_check_name(const unsigned char *name, int len,
+				bool notruncate);
+extern int	affs_copy_name(unsigned char *bstr, struct dentry *dentry);
+
+/* bitmap. c */
+
+extern u32	affs_count_free_blocks(struct super_block *s);
+extern void	affs_free_block(struct super_block *sb, u32 block);
+extern u32	affs_alloc_block(struct inode *inode, u32 goal);
+extern int	affs_init_bitmap(struct super_block *sb, int *flags);
+extern void	affs_free_bitmap(struct super_block *sb);
+
+/* namei.c */
+
+extern int	affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len);
+extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int);
+extern int	affs_unlink(struct inode *dir, struct dentry *dentry);
+extern int	affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool);
+extern int	affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+extern int	affs_rmdir(struct inode *dir, struct dentry *dentry);
+extern int	affs_link(struct dentry *olddentry, struct inode *dir,
+			  struct dentry *dentry);
+extern int	affs_symlink(struct inode *dir, struct dentry *dentry,
+			     const char *symname);
+extern int	affs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			    struct inode *new_dir, struct dentry *new_dentry);
+
+/* inode.c */
+
+extern unsigned long		 affs_parent_ino(struct inode *dir);
+extern struct inode		*affs_new_inode(struct inode *dir);
+extern int			 affs_notify_change(struct dentry *dentry, struct iattr *attr);
+extern void			 affs_evict_inode(struct inode *inode);
+extern struct inode		*affs_iget(struct super_block *sb,
+					unsigned long ino);
+extern int			 affs_write_inode(struct inode *inode,
+					struct writeback_control *wbc);
+extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
+
+/* file.c */
+
+void		affs_free_prealloc(struct inode *inode);
+extern void	affs_truncate(struct inode *);
+int		affs_file_fsync(struct file *, loff_t, loff_t, int);
+
+/* dir.c */
+
+extern void   affs_dir_truncate(struct inode *);
+
+/* jump tables */
+
+extern const struct inode_operations	 affs_file_inode_operations;
+extern const struct inode_operations	 affs_dir_inode_operations;
+extern const struct inode_operations   affs_symlink_inode_operations;
+extern const struct file_operations	 affs_file_operations;
+extern const struct file_operations	 affs_file_operations_ofs;
+extern const struct file_operations	 affs_dir_operations;
+extern const struct address_space_operations	 affs_symlink_aops;
+extern const struct address_space_operations	 affs_aops;
+extern const struct address_space_operations	 affs_aops_ofs;
+
+extern const struct dentry_operations	 affs_dentry_operations;
+extern const struct dentry_operations	 affs_intl_dentry_operations;
+
+static inline void
+affs_set_blocksize(struct super_block *sb, int size)
+{
+	sb_set_blocksize(sb, size);
+}
+static inline struct buffer_head *
+affs_bread(struct super_block *sb, int block)
+{
+	pr_debug("%s: %d\n", __func__, block);
+	if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
+		return sb_bread(sb, block);
+	return NULL;
+}
+static inline struct buffer_head *
+affs_getblk(struct super_block *sb, int block)
+{
+	pr_debug("%s: %d\n", __func__, block);
+	if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
+		return sb_getblk(sb, block);
+	return NULL;
+}
+static inline struct buffer_head *
+affs_getzeroblk(struct super_block *sb, int block)
+{
+	struct buffer_head *bh;
+	pr_debug("%s: %d\n", __func__, block);
+	if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
+		bh = sb_getblk(sb, block);
+		lock_buffer(bh);
+		memset(bh->b_data, 0 , sb->s_blocksize);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		return bh;
+	}
+	return NULL;
+}
+static inline struct buffer_head *
+affs_getemptyblk(struct super_block *sb, int block)
+{
+	struct buffer_head *bh;
+	pr_debug("%s: %d\n", __func__, block);
+	if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
+		bh = sb_getblk(sb, block);
+		wait_on_buffer(bh);
+		set_buffer_uptodate(bh);
+		return bh;
+	}
+	return NULL;
+}
+static inline void
+affs_brelse(struct buffer_head *bh)
+{
+	if (bh)
+		pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr);
+	brelse(bh);
+}
+
+static inline void
+affs_adjust_checksum(struct buffer_head *bh, u32 val)
+{
+	u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[5]);
+	((__be32 *)bh->b_data)[5] = cpu_to_be32(tmp - val);
+}
+static inline void
+affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val)
+{
+	u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[0]);
+	((__be32 *)bh->b_data)[0] = cpu_to_be32(tmp - val);
+}
+
+static inline void
+affs_lock_link(struct inode *inode)
+{
+	down(&AFFS_I(inode)->i_link_lock);
+}
+static inline void
+affs_unlock_link(struct inode *inode)
+{
+	up(&AFFS_I(inode)->i_link_lock);
+}
+static inline void
+affs_lock_dir(struct inode *inode)
+{
+	down(&AFFS_I(inode)->i_hash_lock);
+}
+static inline void
+affs_unlock_dir(struct inode *inode)
+{
+	up(&AFFS_I(inode)->i_hash_lock);
+}
+static inline void
+affs_lock_ext(struct inode *inode)
+{
+	down(&AFFS_I(inode)->i_ext_lock);
+}
+static inline void
+affs_unlock_ext(struct inode *inode)
+{
+	up(&AFFS_I(inode)->i_ext_lock);
+}
diff --git a/kernel/fs/affs/amigaffs.c b/kernel/fs/affs/amigaffs.c
new file mode 100644
index 000000000..a8f463c02
--- /dev/null
+++ b/kernel/fs/affs/amigaffs.c
@@ -0,0 +1,515 @@
+/*
+ *  linux/fs/affs/amigaffs.c
+ *
+ *  (c) 1996  Hans-Joachim Widmaier - Rewritten
+ *
+ *  (C) 1993  Ray Burr - Amiga FFS filesystem.
+ *
+ *  Please send bug reports to: hjw@zvw.de
+ */
+
+#include "affs.h"
+
+/*
+ * Functions for accessing Amiga-FFS structures.
+ */
+
+
+/* Insert a header block bh into the directory dir
+ * caller must hold AFFS_DIR->i_hash_lock!
+ */
+
+int
+affs_insert_hash(struct inode *dir, struct buffer_head *bh)
+{
+	struct super_block *sb = dir->i_sb;
+	struct buffer_head *dir_bh;
+	u32 ino, hash_ino;
+	int offset;
+
+	ino = bh->b_blocknr;
+	offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
+
+	pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino);
+
+	dir_bh = affs_bread(sb, dir->i_ino);
+	if (!dir_bh)
+		return -EIO;
+
+	hash_ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[offset]);
+	while (hash_ino) {
+		affs_brelse(dir_bh);
+		dir_bh = affs_bread(sb, hash_ino);
+		if (!dir_bh)
+			return -EIO;
+		hash_ino = be32_to_cpu(AFFS_TAIL(sb, dir_bh)->hash_chain);
+	}
+	AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino);
+	AFFS_TAIL(sb, bh)->hash_chain = 0;
+	affs_fix_checksum(sb, bh);
+
+	if (dir->i_ino == dir_bh->b_blocknr)
+		AFFS_HEAD(dir_bh)->table[offset] = cpu_to_be32(ino);
+	else
+		AFFS_TAIL(sb, dir_bh)->hash_chain = cpu_to_be32(ino);
+
+	affs_adjust_checksum(dir_bh, ino);
+	mark_buffer_dirty_inode(dir_bh, dir);
+	affs_brelse(dir_bh);
+
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	dir->i_version++;
+	mark_inode_dirty(dir);
+
+	return 0;
+}
+
+/* Remove a header block from its directory.
+ * caller must hold AFFS_DIR->i_hash_lock!
+ */
+
+int
+affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
+{
+	struct super_block *sb;
+	struct buffer_head *bh;
+	u32 rem_ino, hash_ino;
+	__be32 ino;
+	int offset, retval;
+
+	sb = dir->i_sb;
+	rem_ino = rem_bh->b_blocknr;
+	offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
+	pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino,
+		 rem_ino, offset);
+
+	bh = affs_bread(sb, dir->i_ino);
+	if (!bh)
+		return -EIO;
+
+	retval = -ENOENT;
+	hash_ino = be32_to_cpu(AFFS_HEAD(bh)->table[offset]);
+	while (hash_ino) {
+		if (hash_ino == rem_ino) {
+			ino = AFFS_TAIL(sb, rem_bh)->hash_chain;
+			if (dir->i_ino == bh->b_blocknr)
+				AFFS_HEAD(bh)->table[offset] = ino;
+			else
+				AFFS_TAIL(sb, bh)->hash_chain = ino;
+			affs_adjust_checksum(bh, be32_to_cpu(ino) - hash_ino);
+			mark_buffer_dirty_inode(bh, dir);
+			AFFS_TAIL(sb, rem_bh)->parent = 0;
+			retval = 0;
+			break;
+		}
+		affs_brelse(bh);
+		bh = affs_bread(sb, hash_ino);
+		if (!bh)
+			return -EIO;
+		hash_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain);
+	}
+
+	affs_brelse(bh);
+
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	dir->i_version++;
+	mark_inode_dirty(dir);
+
+	return retval;
+}
+
+static void
+affs_fix_dcache(struct inode *inode, u32 entry_ino)
+{
+	struct dentry *dentry;
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
+		if (entry_ino == (u32)(long)dentry->d_fsdata) {
+			dentry->d_fsdata = (void *)inode->i_ino;
+			break;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+
+/* Remove header from link chain */
+
+static int
+affs_remove_link(struct dentry *dentry)
+{
+	struct inode *dir, *inode = d_inode(dentry);
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh = NULL, *link_bh = NULL;
+	u32 link_ino, ino;
+	int retval;
+
+	pr_debug("%s(key=%ld)\n", __func__, inode->i_ino);
+	retval = -EIO;
+	bh = affs_bread(sb, inode->i_ino);
+	if (!bh)
+		goto done;
+
+	link_ino = (u32)(long)dentry->d_fsdata;
+	if (inode->i_ino == link_ino) {
+		/* we can't remove the head of the link, as its blocknr is still used as ino,
+		 * so we remove the block of the first link instead.
+		 */ 
+		link_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain);
+		link_bh = affs_bread(sb, link_ino);
+		if (!link_bh)
+			goto done;
+
+		dir = affs_iget(sb, be32_to_cpu(AFFS_TAIL(sb, link_bh)->parent));
+		if (IS_ERR(dir)) {
+			retval = PTR_ERR(dir);
+			goto done;
+		}
+
+		affs_lock_dir(dir);
+		/*
+		 * if there's a dentry for that block, make it
+		 * refer to inode itself.
+		 */
+		affs_fix_dcache(inode, link_ino);
+		retval = affs_remove_hash(dir, link_bh);
+		if (retval) {
+			affs_unlock_dir(dir);
+			goto done;
+		}
+		mark_buffer_dirty_inode(link_bh, inode);
+
+		memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32);
+		retval = affs_insert_hash(dir, bh);
+		if (retval) {
+			affs_unlock_dir(dir);
+			goto done;
+		}
+		mark_buffer_dirty_inode(bh, inode);
+
+		affs_unlock_dir(dir);
+		iput(dir);
+	} else {
+		link_bh = affs_bread(sb, link_ino);
+		if (!link_bh)
+			goto done;
+	}
+
+	while ((ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain)) != 0) {
+		if (ino == link_ino) {
+			__be32 ino2 = AFFS_TAIL(sb, link_bh)->link_chain;
+			AFFS_TAIL(sb, bh)->link_chain = ino2;
+			affs_adjust_checksum(bh, be32_to_cpu(ino2) - link_ino);
+			mark_buffer_dirty_inode(bh, inode);
+			retval = 0;
+			/* Fix the link count, if bh is a normal header block without links */
+			switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) {
+			case ST_LINKDIR:
+			case ST_LINKFILE:
+				break;
+			default:
+				if (!AFFS_TAIL(sb, bh)->link_chain)
+					set_nlink(inode, 1);
+			}
+			affs_free_block(sb, link_ino);
+			goto done;
+		}
+		affs_brelse(bh);
+		bh = affs_bread(sb, ino);
+		if (!bh)
+			goto done;
+	}
+	retval = -ENOENT;
+done:
+	affs_brelse(link_bh);
+	affs_brelse(bh);
+	return retval;
+}
+
+
+static int
+affs_empty_dir(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh;
+	int retval, size;
+
+	retval = -EIO;
+	bh = affs_bread(sb, inode->i_ino);
+	if (!bh)
+		goto done;
+
+	retval = -ENOTEMPTY;
+	for (size = AFFS_SB(sb)->s_hashsize - 1; size >= 0; size--)
+		if (AFFS_HEAD(bh)->table[size])
+			goto not_empty;
+	retval = 0;
+not_empty:
+	affs_brelse(bh);
+done:
+	return retval;
+}
+
+
+/* Remove a filesystem object. If the object to be removed has
+ * links to it, one of the links must be changed to inherit
+ * the file or directory. As above, any inode will do.
+ * The buffer will not be freed. If the header is a link, the
+ * block will be marked as free.
+ * This function returns a negative error number in case of
+ * an error, else 0 if the inode is to be deleted or 1 if not.
+ */
+
+int
+affs_remove_header(struct dentry *dentry)
+{
+	struct super_block *sb;
+	struct inode *inode, *dir;
+	struct buffer_head *bh = NULL;
+	int retval;
+
+	dir = d_inode(dentry->d_parent);
+	sb = dir->i_sb;
+
+	retval = -ENOENT;
+	inode = d_inode(dentry);
+	if (!inode)
+		goto done;
+
+	pr_debug("%s(key=%ld)\n", __func__, inode->i_ino);
+	retval = -EIO;
+	bh = affs_bread(sb, (u32)(long)dentry->d_fsdata);
+	if (!bh)
+		goto done;
+
+	affs_lock_link(inode);
+	affs_lock_dir(dir);
+	switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) {
+	case ST_USERDIR:
+		/* if we ever want to support links to dirs
+		 * i_hash_lock of the inode must only be
+		 * taken after some checks
+		 */
+		affs_lock_dir(inode);
+		retval = affs_empty_dir(inode);
+		affs_unlock_dir(inode);
+		if (retval)
+			goto done_unlock;
+		break;
+	default:
+		break;
+	}
+
+	retval = affs_remove_hash(dir, bh);
+	if (retval)
+		goto done_unlock;
+	mark_buffer_dirty_inode(bh, inode);
+
+	affs_unlock_dir(dir);
+
+	if (inode->i_nlink > 1)
+		retval = affs_remove_link(dentry);
+	else
+		clear_nlink(inode);
+	affs_unlock_link(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+
+done:
+	affs_brelse(bh);
+	return retval;
+
+done_unlock:
+	affs_unlock_dir(dir);
+	affs_unlock_link(inode);
+	goto done;
+}
+
+/* Checksum a block, do various consistency checks and optionally return
+   the blocks type number.  DATA points to the block.  If their pointers
+   are non-null, *PTYPE and *STYPE are set to the primary and secondary
+   block types respectively, *HASHSIZE is set to the size of the hashtable
+   (which lets us calculate the block size).
+   Returns non-zero if the block is not consistent. */
+
+u32
+affs_checksum_block(struct super_block *sb, struct buffer_head *bh)
+{
+	__be32 *ptr = (__be32 *)bh->b_data;
+	u32 sum;
+	int bsize;
+
+	sum = 0;
+	for (bsize = sb->s_blocksize / sizeof(__be32); bsize > 0; bsize--)
+		sum += be32_to_cpu(*ptr++);
+	return sum;
+}
+
+/*
+ * Calculate the checksum of a disk block and store it
+ * at the indicated position.
+ */
+
+void
+affs_fix_checksum(struct super_block *sb, struct buffer_head *bh)
+{
+	int cnt = sb->s_blocksize / sizeof(__be32);
+	__be32 *ptr = (__be32 *)bh->b_data;
+	u32 checksum;
+	__be32 *checksumptr;
+
+	checksumptr = ptr + 5;
+	*checksumptr = 0;
+	for (checksum = 0; cnt > 0; ptr++, cnt--)
+		checksum += be32_to_cpu(*ptr);
+	*checksumptr = cpu_to_be32(-checksum);
+}
+
+void
+secs_to_datestamp(time_t secs, struct affs_date *ds)
+{
+	u32	 days;
+	u32	 minute;
+
+	secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60);
+	if (secs < 0)
+		secs = 0;
+	days    = secs / 86400;
+	secs   -= days * 86400;
+	minute  = secs / 60;
+	secs   -= minute * 60;
+
+	ds->days = cpu_to_be32(days);
+	ds->mins = cpu_to_be32(minute);
+	ds->ticks = cpu_to_be32(secs * 50);
+}
+
+umode_t
+prot_to_mode(u32 prot)
+{
+	umode_t mode = 0;
+
+	if (!(prot & FIBF_NOWRITE))
+		mode |= S_IWUSR;
+	if (!(prot & FIBF_NOREAD))
+		mode |= S_IRUSR;
+	if (!(prot & FIBF_NOEXECUTE))
+		mode |= S_IXUSR;
+	if (prot & FIBF_GRP_WRITE)
+		mode |= S_IWGRP;
+	if (prot & FIBF_GRP_READ)
+		mode |= S_IRGRP;
+	if (prot & FIBF_GRP_EXECUTE)
+		mode |= S_IXGRP;
+	if (prot & FIBF_OTR_WRITE)
+		mode |= S_IWOTH;
+	if (prot & FIBF_OTR_READ)
+		mode |= S_IROTH;
+	if (prot & FIBF_OTR_EXECUTE)
+		mode |= S_IXOTH;
+
+	return mode;
+}
+
+void
+mode_to_prot(struct inode *inode)
+{
+	u32 prot = AFFS_I(inode)->i_protect;
+	umode_t mode = inode->i_mode;
+
+	if (!(mode & S_IXUSR))
+		prot |= FIBF_NOEXECUTE;
+	if (!(mode & S_IRUSR))
+		prot |= FIBF_NOREAD;
+	if (!(mode & S_IWUSR))
+		prot |= FIBF_NOWRITE;
+	if (mode & S_IXGRP)
+		prot |= FIBF_GRP_EXECUTE;
+	if (mode & S_IRGRP)
+		prot |= FIBF_GRP_READ;
+	if (mode & S_IWGRP)
+		prot |= FIBF_GRP_WRITE;
+	if (mode & S_IXOTH)
+		prot |= FIBF_OTR_EXECUTE;
+	if (mode & S_IROTH)
+		prot |= FIBF_OTR_READ;
+	if (mode & S_IWOTH)
+		prot |= FIBF_OTR_WRITE;
+
+	AFFS_I(inode)->i_protect = prot;
+}
+
+void
+affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf);
+	if (!(sb->s_flags & MS_RDONLY))
+		pr_warn("Remounting filesystem read-only\n");
+	sb->s_flags |= MS_RDONLY;
+	va_end(args);
+}
+
+void
+affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf);
+	va_end(args);
+}
+
+bool
+affs_nofilenametruncate(const struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE);
+}
+
+/* Check if the name is valid for a affs object. */
+
+int
+affs_check_name(const unsigned char *name, int len, bool notruncate)
+{
+	int	 i;
+
+	if (len > AFFSNAMEMAX) {
+		if (notruncate)
+			return -ENAMETOOLONG;
+		len = AFFSNAMEMAX;
+	}
+	for (i = 0; i < len; i++) {
+		if (name[i] < ' ' || name[i] == ':'
+		    || (name[i] > 0x7e && name[i] < 0xa0))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* This function copies name to bstr, with at most 30
+ * characters length. The bstr will be prepended by
+ * a length byte.
+ * NOTE: The name will must be already checked by
+ *       affs_check_name()!
+ */
+
+int
+affs_copy_name(unsigned char *bstr, struct dentry *dentry)
+{
+	u32 len = min(dentry->d_name.len, AFFSNAMEMAX);
+
+	*bstr++ = len;
+	memcpy(bstr, dentry->d_name.name, len);
+	return len;
+}
diff --git a/kernel/fs/affs/bitmap.c b/kernel/fs/affs/bitmap.c
new file mode 100644
index 000000000..675148950
--- /dev/null
+++ b/kernel/fs/affs/bitmap.c
@@ -0,0 +1,364 @@
+/*
+ *  linux/fs/affs/bitmap.c
+ *
+ *  (c) 1996 Hans-Joachim Widmaier
+ *
+ *  bitmap.c contains the code that handles all bitmap related stuff -
+ *  block allocation, deallocation, calculation of free space.
+ */
+
+#include <linux/slab.h>
+#include "affs.h"
+
+u32
+affs_count_free_blocks(struct super_block *sb)
+{
+	struct affs_bm_info *bm;
+	u32 free;
+	int i;
+
+	pr_debug("%s()\n", __func__);
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	mutex_lock(&AFFS_SB(sb)->s_bmlock);
+
+	bm = AFFS_SB(sb)->s_bitmap;
+	free = 0;
+	for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--)
+		free += bm->bm_free;
+
+	mutex_unlock(&AFFS_SB(sb)->s_bmlock);
+
+	return free;
+}
+
+void
+affs_free_block(struct super_block *sb, u32 block)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	struct affs_bm_info *bm;
+	struct buffer_head *bh;
+	u32 blk, bmap, bit, mask, tmp;
+	__be32 *data;
+
+	pr_debug("%s(%u)\n", __func__, block);
+
+	if (block > sbi->s_partition_size)
+		goto err_range;
+
+	blk     = block - sbi->s_reserved;
+	bmap    = blk / sbi->s_bmap_bits;
+	bit     = blk % sbi->s_bmap_bits;
+	bm      = &sbi->s_bitmap[bmap];
+
+	mutex_lock(&sbi->s_bmlock);
+
+	bh = sbi->s_bmap_bh;
+	if (sbi->s_last_bmap != bmap) {
+		affs_brelse(bh);
+		bh = affs_bread(sb, bm->bm_key);
+		if (!bh)
+			goto err_bh_read;
+		sbi->s_bmap_bh = bh;
+		sbi->s_last_bmap = bmap;
+	}
+
+	mask = 1 << (bit & 31);
+	data = (__be32 *)bh->b_data + bit / 32 + 1;
+
+	/* mark block free */
+	tmp = be32_to_cpu(*data);
+	if (tmp & mask)
+		goto err_free;
+	*data = cpu_to_be32(tmp | mask);
+
+	/* fix checksum */
+	tmp = be32_to_cpu(*(__be32 *)bh->b_data);
+	*(__be32 *)bh->b_data = cpu_to_be32(tmp - mask);
+
+	mark_buffer_dirty(bh);
+	affs_mark_sb_dirty(sb);
+	bm->bm_free++;
+
+	mutex_unlock(&sbi->s_bmlock);
+	return;
+
+err_free:
+	affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block);
+	mutex_unlock(&sbi->s_bmlock);
+	return;
+
+err_bh_read:
+	affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key);
+	sbi->s_bmap_bh = NULL;
+	sbi->s_last_bmap = ~0;
+	mutex_unlock(&sbi->s_bmlock);
+	return;
+
+err_range:
+	affs_error(sb, "affs_free_block","Block %u outside partition", block);
+}
+
+/*
+ * Allocate a block in the given allocation zone.
+ * Since we have to byte-swap the bitmap on little-endian
+ * machines, this is rather expensive. Therefore we will
+ * preallocate up to 16 blocks from the same word, if
+ * possible. We are not doing preallocations in the
+ * header zone, though.
+ */
+
+u32
+affs_alloc_block(struct inode *inode, u32 goal)
+{
+	struct super_block *sb;
+	struct affs_sb_info *sbi;
+	struct affs_bm_info *bm;
+	struct buffer_head *bh;
+	__be32 *data, *enddata;
+	u32 blk, bmap, bit, mask, mask2, tmp;
+	int i;
+
+	sb = inode->i_sb;
+	sbi = AFFS_SB(sb);
+
+	pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal);
+
+	if (AFFS_I(inode)->i_pa_cnt) {
+		pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1);
+		AFFS_I(inode)->i_pa_cnt--;
+		return ++AFFS_I(inode)->i_lastalloc;
+	}
+
+	if (!goal || goal > sbi->s_partition_size) {
+		if (goal)
+			affs_warning(sb, "affs_balloc", "invalid goal %d", goal);
+		//if (!AFFS_I(inode)->i_last_block)
+		//	affs_warning(sb, "affs_balloc", "no last alloc block");
+		goal = sbi->s_reserved;
+	}
+
+	blk = goal - sbi->s_reserved;
+	bmap = blk / sbi->s_bmap_bits;
+	bm = &sbi->s_bitmap[bmap];
+
+	mutex_lock(&sbi->s_bmlock);
+
+	if (bm->bm_free)
+		goto find_bmap_bit;
+
+find_bmap:
+	/* search for the next bmap buffer with free bits */
+	i = sbi->s_bmap_count;
+	do {
+		if (--i < 0)
+			goto err_full;
+		bmap++;
+		bm++;
+		if (bmap < sbi->s_bmap_count)
+			continue;
+		/* restart search at zero */
+		bmap = 0;
+		bm = sbi->s_bitmap;
+	} while (!bm->bm_free);
+	blk = bmap * sbi->s_bmap_bits;
+
+find_bmap_bit:
+
+	bh = sbi->s_bmap_bh;
+	if (sbi->s_last_bmap != bmap) {
+		affs_brelse(bh);
+		bh = affs_bread(sb, bm->bm_key);
+		if (!bh)
+			goto err_bh_read;
+		sbi->s_bmap_bh = bh;
+		sbi->s_last_bmap = bmap;
+	}
+
+	/* find an unused block in this bitmap block */
+	bit = blk % sbi->s_bmap_bits;
+	data = (__be32 *)bh->b_data + bit / 32 + 1;
+	enddata = (__be32 *)((u8 *)bh->b_data + sb->s_blocksize);
+	mask = ~0UL << (bit & 31);
+	blk &= ~31UL;
+
+	tmp = be32_to_cpu(*data);
+	if (tmp & mask)
+		goto find_bit;
+
+	/* scan the rest of the buffer */
+	do {
+		blk += 32;
+		if (++data >= enddata)
+			/* didn't find something, can only happen
+			 * if scan didn't start at 0, try next bmap
+			 */
+			goto find_bmap;
+	} while (!*data);
+	tmp = be32_to_cpu(*data);
+	mask = ~0;
+
+find_bit:
+	/* finally look for a free bit in the word */
+	bit = ffs(tmp & mask) - 1;
+	blk += bit + sbi->s_reserved;
+	mask2 = mask = 1 << (bit & 31);
+	AFFS_I(inode)->i_lastalloc = blk;
+
+	/* prealloc as much as possible within this word */
+	while ((mask2 <<= 1)) {
+		if (!(tmp & mask2))
+			break;
+		AFFS_I(inode)->i_pa_cnt++;
+		mask |= mask2;
+	}
+	bm->bm_free -= AFFS_I(inode)->i_pa_cnt + 1;
+
+	*data = cpu_to_be32(tmp & ~mask);
+
+	/* fix checksum */
+	tmp = be32_to_cpu(*(__be32 *)bh->b_data);
+	*(__be32 *)bh->b_data = cpu_to_be32(tmp + mask);
+
+	mark_buffer_dirty(bh);
+	affs_mark_sb_dirty(sb);
+
+	mutex_unlock(&sbi->s_bmlock);
+
+	pr_debug("%d\n", blk);
+	return blk;
+
+err_bh_read:
+	affs_error(sb,"affs_read_block","Cannot read bitmap block %u", bm->bm_key);
+	sbi->s_bmap_bh = NULL;
+	sbi->s_last_bmap = ~0;
+err_full:
+	mutex_unlock(&sbi->s_bmlock);
+	pr_debug("failed\n");
+	return 0;
+}
+
+int affs_init_bitmap(struct super_block *sb, int *flags)
+{
+	struct affs_bm_info *bm;
+	struct buffer_head *bmap_bh = NULL, *bh = NULL;
+	__be32 *bmap_blk;
+	u32 size, blk, end, offset, mask;
+	int i, res = 0;
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+
+	if (*flags & MS_RDONLY)
+		return 0;
+
+	if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) {
+		pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id);
+		*flags |= MS_RDONLY;
+		return 0;
+	}
+
+	sbi->s_last_bmap = ~0;
+	sbi->s_bmap_bh = NULL;
+	sbi->s_bmap_bits = sb->s_blocksize * 8 - 32;
+	sbi->s_bmap_count = (sbi->s_partition_size - sbi->s_reserved +
+				 sbi->s_bmap_bits - 1) / sbi->s_bmap_bits;
+	size = sbi->s_bmap_count * sizeof(*bm);
+	bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL);
+	if (!sbi->s_bitmap) {
+		pr_err("Bitmap allocation failed\n");
+		return -ENOMEM;
+	}
+
+	bmap_blk = (__be32 *)sbi->s_root_bh->b_data;
+	blk = sb->s_blocksize / 4 - 49;
+	end = blk + 25;
+
+	for (i = sbi->s_bmap_count; i > 0; bm++, i--) {
+		affs_brelse(bh);
+
+		bm->bm_key = be32_to_cpu(bmap_blk[blk]);
+		bh = affs_bread(sb, bm->bm_key);
+		if (!bh) {
+			pr_err("Cannot read bitmap\n");
+			res = -EIO;
+			goto out;
+		}
+		if (affs_checksum_block(sb, bh)) {
+			pr_warn("Bitmap %u invalid - mounting %s read only.\n",
+				bm->bm_key, sb->s_id);
+			*flags |= MS_RDONLY;
+			goto out;
+		}
+		pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key);
+		bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4);
+
+		/* Don't try read the extension if this is the last block,
+		 * but we also need the right bm pointer below
+		 */
+		if (++blk < end || i == 1)
+			continue;
+		if (bmap_bh)
+			affs_brelse(bmap_bh);
+		bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk]));
+		if (!bmap_bh) {
+			pr_err("Cannot read bitmap extension\n");
+			res = -EIO;
+			goto out;
+		}
+		bmap_blk = (__be32 *)bmap_bh->b_data;
+		blk = 0;
+		end = sb->s_blocksize / 4 - 1;
+	}
+
+	offset = (sbi->s_partition_size - sbi->s_reserved) % sbi->s_bmap_bits;
+	mask = ~(0xFFFFFFFFU << (offset & 31));
+	pr_debug("last word: %d %d %d\n", offset, offset / 32 + 1, mask);
+	offset = offset / 32 + 1;
+
+	if (mask) {
+		u32 old, new;
+
+		/* Mark unused bits in the last word as allocated */
+		old = be32_to_cpu(((__be32 *)bh->b_data)[offset]);
+		new = old & mask;
+		//if (old != new) {
+			((__be32 *)bh->b_data)[offset] = cpu_to_be32(new);
+			/* fix checksum */
+			//new -= old;
+			//old = be32_to_cpu(*(__be32 *)bh->b_data);
+			//*(__be32 *)bh->b_data = cpu_to_be32(old - new);
+			//mark_buffer_dirty(bh);
+		//}
+		/* correct offset for the bitmap count below */
+		//offset++;
+	}
+	while (++offset < sb->s_blocksize / 4)
+		((__be32 *)bh->b_data)[offset] = 0;
+	((__be32 *)bh->b_data)[0] = 0;
+	((__be32 *)bh->b_data)[0] = cpu_to_be32(-affs_checksum_block(sb, bh));
+	mark_buffer_dirty(bh);
+
+	/* recalculate bitmap count for last block */
+	bm--;
+	bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4);
+
+out:
+	affs_brelse(bh);
+	affs_brelse(bmap_bh);
+	return res;
+}
+
+void affs_free_bitmap(struct super_block *sb)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+
+	if (!sbi->s_bitmap)
+		return;
+
+	affs_brelse(sbi->s_bmap_bh);
+	sbi->s_bmap_bh = NULL;
+	sbi->s_last_bmap = ~0;
+	kfree(sbi->s_bitmap);
+	sbi->s_bitmap = NULL;
+}
diff --git a/kernel/fs/affs/dir.c b/kernel/fs/affs/dir.c
new file mode 100644
index 000000000..ac4f318aa
--- /dev/null
+++ b/kernel/fs/affs/dir.c
@@ -0,0 +1,142 @@
+/*
+ *  linux/fs/affs/dir.c
+ *
+ *  (c) 1996  Hans-Joachim Widmaier - Rewritten
+ *
+ *  (C) 1993  Ray Burr - Modified for Amiga FFS filesystem.
+ *
+ *  (C) 1992  Eric Youngdale Modified for ISO 9660 filesystem.
+ *
+ *  (C) 1991  Linus Torvalds - minix filesystem
+ *
+ *  affs directory handling functions
+ *
+ */
+
+#include "affs.h"
+
+static int affs_readdir(struct file *, struct dir_context *);
+
+const struct file_operations affs_dir_operations = {
+	.read		= generic_read_dir,
+	.llseek		= generic_file_llseek,
+	.iterate	= affs_readdir,
+	.fsync		= affs_file_fsync,
+};
+
+/*
+ * directories can handle most operations...
+ */
+const struct inode_operations affs_dir_inode_operations = {
+	.create		= affs_create,
+	.lookup		= affs_lookup,
+	.link		= affs_link,
+	.unlink		= affs_unlink,
+	.symlink	= affs_symlink,
+	.mkdir		= affs_mkdir,
+	.rmdir		= affs_rmdir,
+	.rename		= affs_rename,
+	.setattr	= affs_notify_change,
+};
+
+static int
+affs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode		*inode = file_inode(file);
+	struct super_block	*sb = inode->i_sb;
+	struct buffer_head	*dir_bh = NULL;
+	struct buffer_head	*fh_bh = NULL;
+	unsigned char		*name;
+	int			 namelen;
+	u32			 i;
+	int			 hash_pos;
+	int			 chain_pos;
+	u32			 ino;
+	int			 error = 0;
+
+	pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
+
+	if (ctx->pos < 2) {
+		file->private_data = (void *)0;
+		if (!dir_emit_dots(file, ctx))
+			return 0;
+	}
+
+	affs_lock_dir(inode);
+	chain_pos = (ctx->pos - 2) & 0xffff;
+	hash_pos  = (ctx->pos - 2) >> 16;
+	if (chain_pos == 0xffff) {
+		affs_warning(sb, "readdir", "More than 65535 entries in chain");
+		chain_pos = 0;
+		hash_pos++;
+		ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
+	}
+	dir_bh = affs_bread(sb, inode->i_ino);
+	if (!dir_bh)
+		goto out_unlock_dir;
+
+	/* If the directory hasn't changed since the last call to readdir(),
+	 * we can jump directly to where we left off.
+	 */
+	ino = (u32)(long)file->private_data;
+	if (ino && file->f_version == inode->i_version) {
+		pr_debug("readdir() left off=%d\n", ino);
+		goto inside;
+	}
+
+	ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
+	for (i = 0; ino && i < chain_pos; i++) {
+		fh_bh = affs_bread(sb, ino);
+		if (!fh_bh) {
+			affs_error(sb, "readdir","Cannot read block %d", i);
+			error = -EIO;
+			goto out_brelse_dir;
+		}
+		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
+		affs_brelse(fh_bh);
+		fh_bh = NULL;
+	}
+	if (ino)
+		goto inside;
+	hash_pos++;
+
+	for (; hash_pos < AFFS_SB(sb)->s_hashsize; hash_pos++) {
+		ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
+		if (!ino)
+			continue;
+		ctx->pos = (hash_pos << 16) + 2;
+inside:
+		do {
+			fh_bh = affs_bread(sb, ino);
+			if (!fh_bh) {
+				affs_error(sb, "readdir",
+					   "Cannot read block %d", ino);
+				break;
+			}
+
+			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0],
+				      (u8)AFFSNAMEMAX);
+			name = AFFS_TAIL(sb, fh_bh)->name + 1;
+			pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n",
+				 namelen, name, ino, hash_pos, ctx->pos);
+
+			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
+				goto done;
+			ctx->pos++;
+			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
+			affs_brelse(fh_bh);
+			fh_bh = NULL;
+		} while (ino);
+	}
+done:
+	file->f_version = inode->i_version;
+	file->private_data = (void *)(long)ino;
+	affs_brelse(fh_bh);
+
+out_brelse_dir:
+	affs_brelse(dir_bh);
+
+out_unlock_dir:
+	affs_unlock_dir(inode);
+	return error;
+}
diff --git a/kernel/fs/affs/file.c b/kernel/fs/affs/file.c
new file mode 100644
index 000000000..659c579c4
--- /dev/null
+++ b/kernel/fs/affs/file.c
@@ -0,0 +1,982 @@
+/*
+ *  linux/fs/affs/file.c
+ *
+ *  (c) 1996  Hans-Joachim Widmaier - Rewritten
+ *
+ *  (C) 1993  Ray Burr - Modified for Amiga FFS filesystem.
+ *
+ *  (C) 1992  Eric Youngdale Modified for ISO 9660 filesystem.
+ *
+ *  (C) 1991  Linus Torvalds - minix filesystem
+ *
+ *  affs regular file handling primitives
+ */
+
+#include <linux/uio.h>
+#include "affs.h"
+
+static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
+
+static int
+affs_file_open(struct inode *inode, struct file *filp)
+{
+	pr_debug("open(%lu,%d)\n",
+		 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
+	atomic_inc(&AFFS_I(inode)->i_opencnt);
+	return 0;
+}
+
+static int
+affs_file_release(struct inode *inode, struct file *filp)
+{
+	pr_debug("release(%lu, %d)\n",
+		 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
+
+	if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
+		mutex_lock(&inode->i_mutex);
+		if (inode->i_size != AFFS_I(inode)->mmu_private)
+			affs_truncate(inode);
+		affs_free_prealloc(inode);
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	return 0;
+}
+
+static int
+affs_grow_extcache(struct inode *inode, u32 lc_idx)
+{
+	struct super_block	*sb = inode->i_sb;
+	struct buffer_head	*bh;
+	u32 lc_max;
+	int i, j, key;
+
+	if (!AFFS_I(inode)->i_lc) {
+		char *ptr = (char *)get_zeroed_page(GFP_NOFS);
+		if (!ptr)
+			return -ENOMEM;
+		AFFS_I(inode)->i_lc = (u32 *)ptr;
+		AFFS_I(inode)->i_ac = (struct affs_ext_key *)(ptr + AFFS_CACHE_SIZE / 2);
+	}
+
+	lc_max = AFFS_LC_SIZE << AFFS_I(inode)->i_lc_shift;
+
+	if (AFFS_I(inode)->i_extcnt > lc_max) {
+		u32 lc_shift, lc_mask, tmp, off;
+
+		/* need to recalculate linear cache, start from old size */
+		lc_shift = AFFS_I(inode)->i_lc_shift;
+		tmp = (AFFS_I(inode)->i_extcnt / AFFS_LC_SIZE) >> lc_shift;
+		for (; tmp; tmp >>= 1)
+			lc_shift++;
+		lc_mask = (1 << lc_shift) - 1;
+
+		/* fix idx and old size to new shift */
+		lc_idx >>= (lc_shift - AFFS_I(inode)->i_lc_shift);
+		AFFS_I(inode)->i_lc_size >>= (lc_shift - AFFS_I(inode)->i_lc_shift);
+
+		/* first shrink old cache to make more space */
+		off = 1 << (lc_shift - AFFS_I(inode)->i_lc_shift);
+		for (i = 1, j = off; j < AFFS_LC_SIZE; i++, j += off)
+			AFFS_I(inode)->i_ac[i] = AFFS_I(inode)->i_ac[j];
+
+		AFFS_I(inode)->i_lc_shift = lc_shift;
+		AFFS_I(inode)->i_lc_mask = lc_mask;
+	}
+
+	/* fill cache to the needed index */
+	i = AFFS_I(inode)->i_lc_size;
+	AFFS_I(inode)->i_lc_size = lc_idx + 1;
+	for (; i <= lc_idx; i++) {
+		if (!i) {
+			AFFS_I(inode)->i_lc[0] = inode->i_ino;
+			continue;
+		}
+		key = AFFS_I(inode)->i_lc[i - 1];
+		j = AFFS_I(inode)->i_lc_mask + 1;
+		// unlock cache
+		for (; j > 0; j--) {
+			bh = affs_bread(sb, key);
+			if (!bh)
+				goto err;
+			key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
+			affs_brelse(bh);
+		}
+		// lock cache
+		AFFS_I(inode)->i_lc[i] = key;
+	}
+
+	return 0;
+
+err:
+	// lock cache
+	return -EIO;
+}
+
+static struct buffer_head *
+affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *new_bh;
+	u32 blocknr, tmp;
+
+	blocknr = affs_alloc_block(inode, bh->b_blocknr);
+	if (!blocknr)
+		return ERR_PTR(-ENOSPC);
+
+	new_bh = affs_getzeroblk(sb, blocknr);
+	if (!new_bh) {
+		affs_free_block(sb, blocknr);
+		return ERR_PTR(-EIO);
+	}
+
+	AFFS_HEAD(new_bh)->ptype = cpu_to_be32(T_LIST);
+	AFFS_HEAD(new_bh)->key = cpu_to_be32(blocknr);
+	AFFS_TAIL(sb, new_bh)->stype = cpu_to_be32(ST_FILE);
+	AFFS_TAIL(sb, new_bh)->parent = cpu_to_be32(inode->i_ino);
+	affs_fix_checksum(sb, new_bh);
+
+	mark_buffer_dirty_inode(new_bh, inode);
+
+	tmp = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
+	if (tmp)
+		affs_warning(sb, "alloc_ext", "previous extension set (%x)", tmp);
+	AFFS_TAIL(sb, bh)->extension = cpu_to_be32(blocknr);
+	affs_adjust_checksum(bh, blocknr - tmp);
+	mark_buffer_dirty_inode(bh, inode);
+
+	AFFS_I(inode)->i_extcnt++;
+	mark_inode_dirty(inode);
+
+	return new_bh;
+}
+
+static inline struct buffer_head *
+affs_get_extblock(struct inode *inode, u32 ext)
+{
+	/* inline the simplest case: same extended block as last time */
+	struct buffer_head *bh = AFFS_I(inode)->i_ext_bh;
+	if (ext == AFFS_I(inode)->i_ext_last)
+		get_bh(bh);
+	else
+		/* we have to do more (not inlined) */
+		bh = affs_get_extblock_slow(inode, ext);
+
+	return bh;
+}
+
+static struct buffer_head *
+affs_get_extblock_slow(struct inode *inode, u32 ext)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh;
+	u32 ext_key;
+	u32 lc_idx, lc_off, ac_idx;
+	u32 tmp, idx;
+
+	if (ext == AFFS_I(inode)->i_ext_last + 1) {
+		/* read the next extended block from the current one */
+		bh = AFFS_I(inode)->i_ext_bh;
+		ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
+		if (ext < AFFS_I(inode)->i_extcnt)
+			goto read_ext;
+		BUG_ON(ext > AFFS_I(inode)->i_extcnt);
+		bh = affs_alloc_extblock(inode, bh, ext);
+		if (IS_ERR(bh))
+			return bh;
+		goto store_ext;
+	}
+
+	if (ext == 0) {
+		/* we seek back to the file header block */
+		ext_key = inode->i_ino;
+		goto read_ext;
+	}
+
+	if (ext >= AFFS_I(inode)->i_extcnt) {
+		struct buffer_head *prev_bh;
+
+		/* allocate a new extended block */
+		BUG_ON(ext > AFFS_I(inode)->i_extcnt);
+
+		/* get previous extended block */
+		prev_bh = affs_get_extblock(inode, ext - 1);
+		if (IS_ERR(prev_bh))
+			return prev_bh;
+		bh = affs_alloc_extblock(inode, prev_bh, ext);
+		affs_brelse(prev_bh);
+		if (IS_ERR(bh))
+			return bh;
+		goto store_ext;
+	}
+
+again:
+	/* check if there is an extended cache and whether it's large enough */
+	lc_idx = ext >> AFFS_I(inode)->i_lc_shift;
+	lc_off = ext & AFFS_I(inode)->i_lc_mask;
+
+	if (lc_idx >= AFFS_I(inode)->i_lc_size) {
+		int err;
+
+		err = affs_grow_extcache(inode, lc_idx);
+		if (err)
+			return ERR_PTR(err);
+		goto again;
+	}
+
+	/* every n'th key we find in the linear cache */
+	if (!lc_off) {
+		ext_key = AFFS_I(inode)->i_lc[lc_idx];
+		goto read_ext;
+	}
+
+	/* maybe it's still in the associative cache */
+	ac_idx = (ext - lc_idx - 1) & AFFS_AC_MASK;
+	if (AFFS_I(inode)->i_ac[ac_idx].ext == ext) {
+		ext_key = AFFS_I(inode)->i_ac[ac_idx].key;
+		goto read_ext;
+	}
+
+	/* try to find one of the previous extended blocks */
+	tmp = ext;
+	idx = ac_idx;
+	while (--tmp, --lc_off > 0) {
+		idx = (idx - 1) & AFFS_AC_MASK;
+		if (AFFS_I(inode)->i_ac[idx].ext == tmp) {
+			ext_key = AFFS_I(inode)->i_ac[idx].key;
+			goto find_ext;
+		}
+	}
+
+	/* fall back to the linear cache */
+	ext_key = AFFS_I(inode)->i_lc[lc_idx];
+find_ext:
+	/* read all extended blocks until we find the one we need */
+	//unlock cache
+	do {
+		bh = affs_bread(sb, ext_key);
+		if (!bh)
+			goto err_bread;
+		ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
+		affs_brelse(bh);
+		tmp++;
+	} while (tmp < ext);
+	//lock cache
+
+	/* store it in the associative cache */
+	// recalculate ac_idx?
+	AFFS_I(inode)->i_ac[ac_idx].ext = ext;
+	AFFS_I(inode)->i_ac[ac_idx].key = ext_key;
+
+read_ext:
+	/* finally read the right extended block */
+	//unlock cache
+	bh = affs_bread(sb, ext_key);
+	if (!bh)
+		goto err_bread;
+	//lock cache
+
+store_ext:
+	/* release old cached extended block and store the new one */
+	affs_brelse(AFFS_I(inode)->i_ext_bh);
+	AFFS_I(inode)->i_ext_last = ext;
+	AFFS_I(inode)->i_ext_bh = bh;
+	get_bh(bh);
+
+	return bh;
+
+err_bread:
+	affs_brelse(bh);
+	return ERR_PTR(-EIO);
+}
+
+static int
+affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create)
+{
+	struct super_block	*sb = inode->i_sb;
+	struct buffer_head	*ext_bh;
+	u32			 ext;
+
+	pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino,
+		 (unsigned long long)block);
+
+	BUG_ON(block > (sector_t)0x7fffffffUL);
+
+	if (block >= AFFS_I(inode)->i_blkcnt) {
+		if (block > AFFS_I(inode)->i_blkcnt || !create)
+			goto err_big;
+	} else
+		create = 0;
+
+	//lock cache
+	affs_lock_ext(inode);
+
+	ext = (u32)block / AFFS_SB(sb)->s_hashsize;
+	block -= ext * AFFS_SB(sb)->s_hashsize;
+	ext_bh = affs_get_extblock(inode, ext);
+	if (IS_ERR(ext_bh))
+		goto err_ext;
+	map_bh(bh_result, sb, (sector_t)be32_to_cpu(AFFS_BLOCK(sb, ext_bh, block)));
+
+	if (create) {
+		u32 blocknr = affs_alloc_block(inode, ext_bh->b_blocknr);
+		if (!blocknr)
+			goto err_alloc;
+		set_buffer_new(bh_result);
+		AFFS_I(inode)->mmu_private += AFFS_SB(sb)->s_data_blksize;
+		AFFS_I(inode)->i_blkcnt++;
+
+		/* store new block */
+		if (bh_result->b_blocknr)
+			affs_warning(sb, "get_block",
+				     "block already set (%llx)",
+				     (unsigned long long)bh_result->b_blocknr);
+		AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr);
+		AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1);
+		affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1);
+		bh_result->b_blocknr = blocknr;
+
+		if (!block) {
+			/* insert first block into header block */
+			u32 tmp = be32_to_cpu(AFFS_HEAD(ext_bh)->first_data);
+			if (tmp)
+				affs_warning(sb, "get_block", "first block already set (%d)", tmp);
+			AFFS_HEAD(ext_bh)->first_data = cpu_to_be32(blocknr);
+			affs_adjust_checksum(ext_bh, blocknr - tmp);
+		}
+	}
+
+	affs_brelse(ext_bh);
+	//unlock cache
+	affs_unlock_ext(inode);
+	return 0;
+
+err_big:
+	affs_error(inode->i_sb, "get_block", "strange block request %llu",
+		   (unsigned long long)block);
+	return -EIO;
+err_ext:
+	// unlock cache
+	affs_unlock_ext(inode);
+	return PTR_ERR(ext_bh);
+err_alloc:
+	brelse(ext_bh);
+	clear_buffer_mapped(bh_result);
+	bh_result->b_bdev = NULL;
+	// unlock cache
+	affs_unlock_ext(inode);
+	return -ENOSPC;
+}
+
+static int affs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, affs_get_block, wbc);
+}
+
+static int affs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, affs_get_block);
+}
+
+static void affs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		affs_truncate(inode);
+	}
+}
+
+static ssize_t
+affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	if (iov_iter_rw(iter) == WRITE) {
+		loff_t size = offset + count;
+
+		if (AFFS_I(inode)->mmu_private < size)
+			return 0;
+	}
+
+	ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block);
+	if (ret < 0 && iov_iter_rw(iter) == WRITE)
+		affs_write_failed(mapping, offset + count);
+	return ret;
+}
+
+static int affs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	*pagep = NULL;
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				affs_get_block,
+				&AFFS_I(mapping->host)->mmu_private);
+	if (unlikely(ret))
+		affs_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,affs_get_block);
+}
+
+const struct address_space_operations affs_aops = {
+	.readpage = affs_readpage,
+	.writepage = affs_writepage,
+	.write_begin = affs_write_begin,
+	.write_end = generic_write_end,
+	.direct_IO = affs_direct_IO,
+	.bmap = _affs_bmap
+};
+
+static inline struct buffer_head *
+affs_bread_ino(struct inode *inode, int block, int create)
+{
+	struct buffer_head *bh, tmp_bh;
+	int err;
+
+	tmp_bh.b_state = 0;
+	err = affs_get_block(inode, block, &tmp_bh, create);
+	if (!err) {
+		bh = affs_bread(inode->i_sb, tmp_bh.b_blocknr);
+		if (bh) {
+			bh->b_state |= tmp_bh.b_state;
+			return bh;
+		}
+		err = -EIO;
+	}
+	return ERR_PTR(err);
+}
+
+static inline struct buffer_head *
+affs_getzeroblk_ino(struct inode *inode, int block)
+{
+	struct buffer_head *bh, tmp_bh;
+	int err;
+
+	tmp_bh.b_state = 0;
+	err = affs_get_block(inode, block, &tmp_bh, 1);
+	if (!err) {
+		bh = affs_getzeroblk(inode->i_sb, tmp_bh.b_blocknr);
+		if (bh) {
+			bh->b_state |= tmp_bh.b_state;
+			return bh;
+		}
+		err = -EIO;
+	}
+	return ERR_PTR(err);
+}
+
+static inline struct buffer_head *
+affs_getemptyblk_ino(struct inode *inode, int block)
+{
+	struct buffer_head *bh, tmp_bh;
+	int err;
+
+	tmp_bh.b_state = 0;
+	err = affs_get_block(inode, block, &tmp_bh, 1);
+	if (!err) {
+		bh = affs_getemptyblk(inode->i_sb, tmp_bh.b_blocknr);
+		if (bh) {
+			bh->b_state |= tmp_bh.b_state;
+			return bh;
+		}
+		err = -EIO;
+	}
+	return ERR_PTR(err);
+}
+
+static int
+affs_do_readpage_ofs(struct page *page, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh;
+	char *data;
+	unsigned pos = 0;
+	u32 bidx, boff, bsize;
+	u32 tmp;
+
+	pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
+		 page->index, to);
+	BUG_ON(to > PAGE_CACHE_SIZE);
+	kmap(page);
+	data = page_address(page);
+	bsize = AFFS_SB(sb)->s_data_blksize;
+	tmp = page->index << PAGE_CACHE_SHIFT;
+	bidx = tmp / bsize;
+	boff = tmp % bsize;
+
+	while (pos < to) {
+		bh = affs_bread_ino(inode, bidx, 0);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
+		tmp = min(bsize - boff, to - pos);
+		BUG_ON(pos + tmp > to || tmp > bsize);
+		memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
+		affs_brelse(bh);
+		bidx++;
+		pos += tmp;
+		boff = 0;
+	}
+	flush_dcache_page(page);
+	kunmap(page);
+	return 0;
+}
+
+static int
+affs_extent_file_ofs(struct inode *inode, u32 newsize)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh, *prev_bh;
+	u32 bidx, boff;
+	u32 size, bsize;
+	u32 tmp;
+
+	pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize);
+	bsize = AFFS_SB(sb)->s_data_blksize;
+	bh = NULL;
+	size = AFFS_I(inode)->mmu_private;
+	bidx = size / bsize;
+	boff = size % bsize;
+	if (boff) {
+		bh = affs_bread_ino(inode, bidx, 0);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
+		tmp = min(bsize - boff, newsize - size);
+		BUG_ON(boff + tmp > bsize || tmp > bsize);
+		memset(AFFS_DATA(bh) + boff, 0, tmp);
+		be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp);
+		affs_fix_checksum(sb, bh);
+		mark_buffer_dirty_inode(bh, inode);
+		size += tmp;
+		bidx++;
+	} else if (bidx) {
+		bh = affs_bread_ino(inode, bidx - 1, 0);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
+	}
+
+	while (size < newsize) {
+		prev_bh = bh;
+		bh = affs_getzeroblk_ino(inode, bidx);
+		if (IS_ERR(bh))
+			goto out;
+		tmp = min(bsize, newsize - size);
+		BUG_ON(tmp > bsize);
+		AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
+		AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
+		AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
+		affs_fix_checksum(sb, bh);
+		bh->b_state &= ~(1UL << BH_New);
+		mark_buffer_dirty_inode(bh, inode);
+		if (prev_bh) {
+			u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+			if (tmp_next)
+				affs_warning(sb, "extent_file_ofs",
+					     "next block already set for %d (%d)",
+					     bidx, tmp_next);
+			AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
+			affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
+			mark_buffer_dirty_inode(prev_bh, inode);
+			affs_brelse(prev_bh);
+		}
+		size += bsize;
+		bidx++;
+	}
+	affs_brelse(bh);
+	inode->i_size = AFFS_I(inode)->mmu_private = newsize;
+	return 0;
+
+out:
+	inode->i_size = AFFS_I(inode)->mmu_private = newsize;
+	return PTR_ERR(bh);
+}
+
+static int
+affs_readpage_ofs(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	u32 to;
+	int err;
+
+	pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
+	to = PAGE_CACHE_SIZE;
+	if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
+		to = inode->i_size & ~PAGE_CACHE_MASK;
+		memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
+	}
+
+	err = affs_do_readpage_ofs(page, to);
+	if (!err)
+		SetPageUptodate(page);
+	unlock_page(page);
+	return err;
+}
+
+static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	pgoff_t index;
+	int err = 0;
+
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
+	if (pos > AFFS_I(inode)->mmu_private) {
+		/* XXX: this probably leaves a too-big i_size in case of
+		 * failure. Should really be updating i_size at write_end time
+		 */
+		err = affs_extent_file_ofs(inode, pos);
+		if (err)
+			return err;
+	}
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	if (PageUptodate(page))
+		return 0;
+
+	/* XXX: inefficient but safe in the face of short writes */
+	err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
+	if (err) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return err;
+}
+
+static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh, *prev_bh;
+	char *data;
+	u32 bidx, boff, bsize;
+	unsigned from, to;
+	u32 tmp;
+	int written;
+
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = pos + len;
+	/*
+	 * XXX: not sure if this can handle short copies (len < copied), but
+	 * we don't have to, because the page should always be uptodate here,
+	 * due to write_begin.
+	 */
+
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
+	bsize = AFFS_SB(sb)->s_data_blksize;
+	data = page_address(page);
+
+	bh = NULL;
+	written = 0;
+	tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+	bidx = tmp / bsize;
+	boff = tmp % bsize;
+	if (boff) {
+		bh = affs_bread_ino(inode, bidx, 0);
+		if (IS_ERR(bh)) {
+			written = PTR_ERR(bh);
+			goto err_first_bh;
+		}
+		tmp = min(bsize - boff, to - from);
+		BUG_ON(boff + tmp > bsize || tmp > bsize);
+		memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
+		be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp);
+		affs_fix_checksum(sb, bh);
+		mark_buffer_dirty_inode(bh, inode);
+		written += tmp;
+		from += tmp;
+		bidx++;
+	} else if (bidx) {
+		bh = affs_bread_ino(inode, bidx - 1, 0);
+		if (IS_ERR(bh)) {
+			written = PTR_ERR(bh);
+			goto err_first_bh;
+		}
+	}
+	while (from + bsize <= to) {
+		prev_bh = bh;
+		bh = affs_getemptyblk_ino(inode, bidx);
+		if (IS_ERR(bh))
+			goto err_bh;
+		memcpy(AFFS_DATA(bh), data + from, bsize);
+		if (buffer_new(bh)) {
+			AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
+			AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
+			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize);
+			AFFS_DATA_HEAD(bh)->next = 0;
+			bh->b_state &= ~(1UL << BH_New);
+			if (prev_bh) {
+				u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+				if (tmp_next)
+					affs_warning(sb, "commit_write_ofs",
+						     "next block already set for %d (%d)",
+						     bidx, tmp_next);
+				AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
+				affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
+				mark_buffer_dirty_inode(prev_bh, inode);
+			}
+		}
+		affs_brelse(prev_bh);
+		affs_fix_checksum(sb, bh);
+		mark_buffer_dirty_inode(bh, inode);
+		written += bsize;
+		from += bsize;
+		bidx++;
+	}
+	if (from < to) {
+		prev_bh = bh;
+		bh = affs_bread_ino(inode, bidx, 1);
+		if (IS_ERR(bh))
+			goto err_bh;
+		tmp = min(bsize, to - from);
+		BUG_ON(tmp > bsize);
+		memcpy(AFFS_DATA(bh), data + from, tmp);
+		if (buffer_new(bh)) {
+			AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
+			AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
+			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
+			AFFS_DATA_HEAD(bh)->next = 0;
+			bh->b_state &= ~(1UL << BH_New);
+			if (prev_bh) {
+				u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+				if (tmp_next)
+					affs_warning(sb, "commit_write_ofs",
+						     "next block already set for %d (%d)",
+						     bidx, tmp_next);
+				AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
+				affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
+				mark_buffer_dirty_inode(prev_bh, inode);
+			}
+		} else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp)
+			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
+		affs_brelse(prev_bh);
+		affs_fix_checksum(sb, bh);
+		mark_buffer_dirty_inode(bh, inode);
+		written += tmp;
+		from += tmp;
+		bidx++;
+	}
+	SetPageUptodate(page);
+
+done:
+	affs_brelse(bh);
+	tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+	if (tmp > inode->i_size)
+		inode->i_size = AFFS_I(inode)->mmu_private = tmp;
+
+err_first_bh:
+	unlock_page(page);
+	page_cache_release(page);
+
+	return written;
+
+err_bh:
+	bh = prev_bh;
+	if (!written)
+		written = PTR_ERR(bh);
+	goto done;
+}
+
+const struct address_space_operations affs_aops_ofs = {
+	.readpage = affs_readpage_ofs,
+	//.writepage = affs_writepage_ofs,
+	.write_begin = affs_write_begin_ofs,
+	.write_end = affs_write_end_ofs
+};
+
+/* Free any preallocated blocks. */
+
+void
+affs_free_prealloc(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino);
+
+	while (AFFS_I(inode)->i_pa_cnt) {
+		AFFS_I(inode)->i_pa_cnt--;
+		affs_free_block(sb, ++AFFS_I(inode)->i_lastalloc);
+	}
+}
+
+/* Truncate (or enlarge) a file to the requested size. */
+
+void
+affs_truncate(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	u32 ext, ext_key;
+	u32 last_blk, blkcnt, blk;
+	u32 size;
+	struct buffer_head *ext_bh;
+	int i;
+
+	pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n",
+		 inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size);
+
+	last_blk = 0;
+	ext = 0;
+	if (inode->i_size) {
+		last_blk = ((u32)inode->i_size - 1) / AFFS_SB(sb)->s_data_blksize;
+		ext = last_blk / AFFS_SB(sb)->s_hashsize;
+	}
+
+	if (inode->i_size > AFFS_I(inode)->mmu_private) {
+		struct address_space *mapping = inode->i_mapping;
+		struct page *page;
+		void *fsdata;
+		loff_t isize = inode->i_size;
+		int res;
+
+		res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata);
+		if (!res)
+			res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
+		else
+			inode->i_size = AFFS_I(inode)->mmu_private;
+		mark_inode_dirty(inode);
+		return;
+	} else if (inode->i_size == AFFS_I(inode)->mmu_private)
+		return;
+
+	// lock cache
+	ext_bh = affs_get_extblock(inode, ext);
+	if (IS_ERR(ext_bh)) {
+		affs_warning(sb, "truncate",
+			     "unexpected read error for ext block %u (%ld)",
+			     ext, PTR_ERR(ext_bh));
+		return;
+	}
+	if (AFFS_I(inode)->i_lc) {
+		/* clear linear cache */
+		i = (ext + 1) >> AFFS_I(inode)->i_lc_shift;
+		if (AFFS_I(inode)->i_lc_size > i) {
+			AFFS_I(inode)->i_lc_size = i;
+			for (; i < AFFS_LC_SIZE; i++)
+				AFFS_I(inode)->i_lc[i] = 0;
+		}
+		/* clear associative cache */
+		for (i = 0; i < AFFS_AC_SIZE; i++)
+			if (AFFS_I(inode)->i_ac[i].ext >= ext)
+				AFFS_I(inode)->i_ac[i].ext = 0;
+	}
+	ext_key = be32_to_cpu(AFFS_TAIL(sb, ext_bh)->extension);
+
+	blkcnt = AFFS_I(inode)->i_blkcnt;
+	i = 0;
+	blk = last_blk;
+	if (inode->i_size) {
+		i = last_blk % AFFS_SB(sb)->s_hashsize + 1;
+		blk++;
+	} else
+		AFFS_HEAD(ext_bh)->first_data = 0;
+	AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(i);
+	size = AFFS_SB(sb)->s_hashsize;
+	if (size > blkcnt - blk + i)
+		size = blkcnt - blk + i;
+	for (; i < size; i++, blk++) {
+		affs_free_block(sb, be32_to_cpu(AFFS_BLOCK(sb, ext_bh, i)));
+		AFFS_BLOCK(sb, ext_bh, i) = 0;
+	}
+	AFFS_TAIL(sb, ext_bh)->extension = 0;
+	affs_fix_checksum(sb, ext_bh);
+	mark_buffer_dirty_inode(ext_bh, inode);
+	affs_brelse(ext_bh);
+
+	if (inode->i_size) {
+		AFFS_I(inode)->i_blkcnt = last_blk + 1;
+		AFFS_I(inode)->i_extcnt = ext + 1;
+		if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS)) {
+			struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
+			u32 tmp;
+			if (IS_ERR(bh)) {
+				affs_warning(sb, "truncate",
+					     "unexpected read error for last block %u (%ld)",
+					     ext, PTR_ERR(bh));
+				return;
+			}
+			tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
+			AFFS_DATA_HEAD(bh)->next = 0;
+			affs_adjust_checksum(bh, -tmp);
+			affs_brelse(bh);
+		}
+	} else {
+		AFFS_I(inode)->i_blkcnt = 0;
+		AFFS_I(inode)->i_extcnt = 1;
+	}
+	AFFS_I(inode)->mmu_private = inode->i_size;
+	// unlock cache
+
+	while (ext_key) {
+		ext_bh = affs_bread(sb, ext_key);
+		size = AFFS_SB(sb)->s_hashsize;
+		if (size > blkcnt - blk)
+			size = blkcnt - blk;
+		for (i = 0; i < size; i++, blk++)
+			affs_free_block(sb, be32_to_cpu(AFFS_BLOCK(sb, ext_bh, i)));
+		affs_free_block(sb, ext_key);
+		ext_key = be32_to_cpu(AFFS_TAIL(sb, ext_bh)->extension);
+		affs_brelse(ext_bh);
+	}
+	affs_free_prealloc(inode);
+}
+
+int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int ret, err;
+
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
+	ret = write_inode_now(inode, 0);
+	err = sync_blockdev(inode->i_sb->s_bdev);
+	if (!ret)
+		ret = err;
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+const struct file_operations affs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.open		= affs_file_open,
+	.release	= affs_file_release,
+	.fsync		= affs_file_fsync,
+	.splice_read	= generic_file_splice_read,
+};
+
+const struct inode_operations affs_file_inode_operations = {
+	.setattr	= affs_notify_change,
+};
diff --git a/kernel/fs/affs/inode.c b/kernel/fs/affs/inode.c
new file mode 100644
index 000000000..a022f4acc
--- /dev/null
+++ b/kernel/fs/affs/inode.c
@@ -0,0 +1,416 @@
+/*
+ *  linux/fs/affs/inode.c
+ *
+ *  (c) 1996  Hans-Joachim Widmaier - Rewritten
+ *
+ *  (C) 1993  Ray Burr - Modified for Amiga FFS filesystem.
+ *
+ *  (C) 1992  Eric Youngdale Modified for ISO9660 filesystem.
+ *
+ *  (C) 1991  Linus Torvalds - minix filesystem
+ */
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include "affs.h"
+
+struct inode *affs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct affs_sb_info	*sbi = AFFS_SB(sb);
+	struct buffer_head	*bh;
+	struct affs_tail	*tail;
+	struct inode		*inode;
+	u32			 block;
+	u32			 size;
+	u32			 prot;
+	u16			 id;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	pr_debug("affs_iget(%lu)\n", inode->i_ino);
+
+	block = inode->i_ino;
+	bh = affs_bread(sb, block);
+	if (!bh) {
+		affs_warning(sb, "read_inode", "Cannot read block %d", block);
+		goto bad_inode;
+	}
+	if (affs_checksum_block(sb, bh) || be32_to_cpu(AFFS_HEAD(bh)->ptype) != T_SHORT) {
+		affs_warning(sb,"read_inode",
+			   "Checksum or type (ptype=%d) error on inode %d",
+			   AFFS_HEAD(bh)->ptype, block);
+		goto bad_inode;
+	}
+
+	tail = AFFS_TAIL(sb, bh);
+	prot = be32_to_cpu(tail->protect);
+
+	inode->i_size = 0;
+	set_nlink(inode, 1);
+	inode->i_mode = 0;
+	AFFS_I(inode)->i_extcnt = 1;
+	AFFS_I(inode)->i_ext_last = ~1;
+	AFFS_I(inode)->i_protect = prot;
+	atomic_set(&AFFS_I(inode)->i_opencnt, 0);
+	AFFS_I(inode)->i_blkcnt = 0;
+	AFFS_I(inode)->i_lc = NULL;
+	AFFS_I(inode)->i_lc_size = 0;
+	AFFS_I(inode)->i_lc_shift = 0;
+	AFFS_I(inode)->i_lc_mask = 0;
+	AFFS_I(inode)->i_ac = NULL;
+	AFFS_I(inode)->i_ext_bh = NULL;
+	AFFS_I(inode)->mmu_private = 0;
+	AFFS_I(inode)->i_lastalloc = 0;
+	AFFS_I(inode)->i_pa_cnt = 0;
+
+	if (affs_test_opt(sbi->s_flags, SF_SETMODE))
+		inode->i_mode = sbi->s_mode;
+	else
+		inode->i_mode = prot_to_mode(prot);
+
+	id = be16_to_cpu(tail->uid);
+	if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETUID))
+		inode->i_uid = sbi->s_uid;
+	else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS))
+		i_uid_write(inode, 0);
+	else
+		i_uid_write(inode, id);
+
+	id = be16_to_cpu(tail->gid);
+	if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETGID))
+		inode->i_gid = sbi->s_gid;
+	else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS))
+		i_gid_write(inode, 0);
+	else
+		i_gid_write(inode, id);
+
+	switch (be32_to_cpu(tail->stype)) {
+	case ST_ROOT:
+		inode->i_uid = sbi->s_uid;
+		inode->i_gid = sbi->s_gid;
+		/* fall through */
+	case ST_USERDIR:
+		if (be32_to_cpu(tail->stype) == ST_USERDIR ||
+		    affs_test_opt(sbi->s_flags, SF_SETMODE)) {
+			if (inode->i_mode & S_IRUSR)
+				inode->i_mode |= S_IXUSR;
+			if (inode->i_mode & S_IRGRP)
+				inode->i_mode |= S_IXGRP;
+			if (inode->i_mode & S_IROTH)
+				inode->i_mode |= S_IXOTH;
+			inode->i_mode |= S_IFDIR;
+		} else
+			inode->i_mode = S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR;
+		/* Maybe it should be controlled by mount parameter? */
+		//inode->i_mode |= S_ISVTX;
+		inode->i_op = &affs_dir_inode_operations;
+		inode->i_fop = &affs_dir_operations;
+		break;
+	case ST_LINKDIR:
+#if 0
+		affs_warning(sb, "read_inode", "inode is LINKDIR");
+		goto bad_inode;
+#else
+		inode->i_mode |= S_IFDIR;
+		/* ... and leave ->i_op and ->i_fop pointing to empty */
+		break;
+#endif
+	case ST_LINKFILE:
+		affs_warning(sb, "read_inode", "inode is LINKFILE");
+		goto bad_inode;
+	case ST_FILE:
+		size = be32_to_cpu(tail->size);
+		inode->i_mode |= S_IFREG;
+		AFFS_I(inode)->mmu_private = inode->i_size = size;
+		if (inode->i_size) {
+			AFFS_I(inode)->i_blkcnt = (size - 1) /
+					       sbi->s_data_blksize + 1;
+			AFFS_I(inode)->i_extcnt = (AFFS_I(inode)->i_blkcnt - 1) /
+					       sbi->s_hashsize + 1;
+		}
+		if (tail->link_chain)
+			set_nlink(inode, 2);
+		inode->i_mapping->a_ops = affs_test_opt(sbi->s_flags, SF_OFS) ?
+					  &affs_aops_ofs : &affs_aops;
+		inode->i_op = &affs_file_inode_operations;
+		inode->i_fop = &affs_file_operations;
+		break;
+	case ST_SOFTLINK:
+		inode->i_mode |= S_IFLNK;
+		inode->i_op = &affs_symlink_inode_operations;
+		inode->i_data.a_ops = &affs_symlink_aops;
+		break;
+	}
+
+	inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec
+		       = (be32_to_cpu(tail->change.days) * (24 * 60 * 60) +
+		         be32_to_cpu(tail->change.mins) * 60 +
+			 be32_to_cpu(tail->change.ticks) / 50 +
+			 ((8 * 365 + 2) * 24 * 60 * 60)) +
+			 sys_tz.tz_minuteswest * 60;
+	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0;
+	affs_brelse(bh);
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	affs_brelse(bh);
+	iget_failed(inode);
+	return ERR_PTR(-EIO);
+}
+
+int
+affs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct super_block	*sb = inode->i_sb;
+	struct buffer_head	*bh;
+	struct affs_tail	*tail;
+	uid_t			 uid;
+	gid_t			 gid;
+
+	pr_debug("write_inode(%lu)\n", inode->i_ino);
+
+	if (!inode->i_nlink)
+		// possibly free block
+		return 0;
+	bh = affs_bread(sb, inode->i_ino);
+	if (!bh) {
+		affs_error(sb,"write_inode","Cannot read block %lu",inode->i_ino);
+		return -EIO;
+	}
+	tail = AFFS_TAIL(sb, bh);
+	if (tail->stype == cpu_to_be32(ST_ROOT)) {
+		secs_to_datestamp(inode->i_mtime.tv_sec,&AFFS_ROOT_TAIL(sb, bh)->root_change);
+	} else {
+		tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect);
+		tail->size = cpu_to_be32(inode->i_size);
+		secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change);
+		if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
+			uid = i_uid_read(inode);
+			gid = i_gid_read(inode);
+			if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_MUFS)) {
+				if (uid == 0 || uid == 0xFFFF)
+					uid = uid ^ ~0;
+				if (gid == 0 || gid == 0xFFFF)
+					gid = gid ^ ~0;
+			}
+			if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETUID))
+				tail->uid = cpu_to_be16(uid);
+			if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETGID))
+				tail->gid = cpu_to_be16(gid);
+		}
+	}
+	affs_fix_checksum(sb, bh);
+	mark_buffer_dirty_inode(bh, inode);
+	affs_brelse(bh);
+	affs_free_prealloc(inode);
+	return 0;
+}
+
+int
+affs_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid);
+
+	error = inode_change_ok(inode,attr);
+	if (error)
+		goto out;
+
+	if (((attr->ia_valid & ATTR_UID) &&
+	      affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETUID)) ||
+	    ((attr->ia_valid & ATTR_GID) &&
+	      affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETGID)) ||
+	    ((attr->ia_valid & ATTR_MODE) &&
+	     (AFFS_SB(inode->i_sb)->s_flags &
+	      (AFFS_MOUNT_SF_SETMODE | AFFS_MOUNT_SF_IMMUTABLE)))) {
+		if (!affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_QUIET))
+			error = -EPERM;
+		goto out;
+	}
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+
+		truncate_setsize(inode, attr->ia_size);
+		affs_truncate(inode);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	if (attr->ia_valid & ATTR_MODE)
+		mode_to_prot(inode);
+out:
+	return error;
+}
+
+void
+affs_evict_inode(struct inode *inode)
+{
+	unsigned long cache_page;
+	pr_debug("evict_inode(ino=%lu, nlink=%u)\n",
+		 inode->i_ino, inode->i_nlink);
+	truncate_inode_pages_final(&inode->i_data);
+
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		affs_truncate(inode);
+	}
+
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+	affs_free_prealloc(inode);
+	cache_page = (unsigned long)AFFS_I(inode)->i_lc;
+	if (cache_page) {
+		pr_debug("freeing ext cache\n");
+		AFFS_I(inode)->i_lc = NULL;
+		AFFS_I(inode)->i_ac = NULL;
+		free_page(cache_page);
+	}
+	affs_brelse(AFFS_I(inode)->i_ext_bh);
+	AFFS_I(inode)->i_ext_last = ~1;
+	AFFS_I(inode)->i_ext_bh = NULL;
+
+	if (!inode->i_nlink)
+		affs_free_block(inode->i_sb, inode->i_ino);
+}
+
+struct inode *
+affs_new_inode(struct inode *dir)
+{
+	struct super_block	*sb = dir->i_sb;
+	struct inode		*inode;
+	u32			 block;
+	struct buffer_head	*bh;
+
+	if (!(inode = new_inode(sb)))
+		goto err_inode;
+
+	if (!(block = affs_alloc_block(dir, dir->i_ino)))
+		goto err_block;
+
+	bh = affs_getzeroblk(sb, block);
+	if (!bh)
+		goto err_bh;
+	mark_buffer_dirty_inode(bh, inode);
+	affs_brelse(bh);
+
+	inode->i_uid     = current_fsuid();
+	inode->i_gid     = current_fsgid();
+	inode->i_ino     = block;
+	set_nlink(inode, 1);
+	inode->i_mtime   = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	atomic_set(&AFFS_I(inode)->i_opencnt, 0);
+	AFFS_I(inode)->i_blkcnt = 0;
+	AFFS_I(inode)->i_lc = NULL;
+	AFFS_I(inode)->i_lc_size = 0;
+	AFFS_I(inode)->i_lc_shift = 0;
+	AFFS_I(inode)->i_lc_mask = 0;
+	AFFS_I(inode)->i_ac = NULL;
+	AFFS_I(inode)->i_ext_bh = NULL;
+	AFFS_I(inode)->mmu_private = 0;
+	AFFS_I(inode)->i_protect = 0;
+	AFFS_I(inode)->i_lastalloc = 0;
+	AFFS_I(inode)->i_pa_cnt = 0;
+	AFFS_I(inode)->i_extcnt = 1;
+	AFFS_I(inode)->i_ext_last = ~1;
+
+	insert_inode_hash(inode);
+
+	return inode;
+
+err_bh:
+	affs_free_block(sb, block);
+err_block:
+	iput(inode);
+err_inode:
+	return NULL;
+}
+
+/*
+ * Add an entry to a directory. Create the header block
+ * and insert it into the hash table.
+ */
+
+int
+affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type)
+{
+	struct super_block *sb = dir->i_sb;
+	struct buffer_head *inode_bh = NULL;
+	struct buffer_head *bh = NULL;
+	u32 block = 0;
+	int retval;
+
+	pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__,
+		 dir->i_ino, inode->i_ino, dentry, type);
+
+	retval = -EIO;
+	bh = affs_bread(sb, inode->i_ino);
+	if (!bh)
+		goto done;
+
+	affs_lock_link(inode);
+	switch (type) {
+	case ST_LINKFILE:
+	case ST_LINKDIR:
+		retval = -ENOSPC;
+		block = affs_alloc_block(dir, dir->i_ino);
+		if (!block)
+			goto err;
+		retval = -EIO;
+		inode_bh = bh;
+		bh = affs_getzeroblk(sb, block);
+		if (!bh)
+			goto err;
+		break;
+	default:
+		break;
+	}
+
+	AFFS_HEAD(bh)->ptype = cpu_to_be32(T_SHORT);
+	AFFS_HEAD(bh)->key = cpu_to_be32(bh->b_blocknr);
+	affs_copy_name(AFFS_TAIL(sb, bh)->name, dentry);
+	AFFS_TAIL(sb, bh)->stype = cpu_to_be32(type);
+	AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino);
+
+	if (inode_bh) {
+		__be32 chain;
+	       	chain = AFFS_TAIL(sb, inode_bh)->link_chain;
+		AFFS_TAIL(sb, bh)->original = cpu_to_be32(inode->i_ino);
+		AFFS_TAIL(sb, bh)->link_chain = chain;
+		AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block);
+		affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
+		mark_buffer_dirty_inode(inode_bh, inode);
+		set_nlink(inode, 2);
+		ihold(inode);
+	}
+	affs_fix_checksum(sb, bh);
+	mark_buffer_dirty_inode(bh, inode);
+	dentry->d_fsdata = (void *)(long)bh->b_blocknr;
+
+	affs_lock_dir(dir);
+	retval = affs_insert_hash(dir, bh);
+	mark_buffer_dirty_inode(bh, inode);
+	affs_unlock_dir(dir);
+	affs_unlock_link(inode);
+
+	d_instantiate(dentry, inode);
+done:
+	affs_brelse(inode_bh);
+	affs_brelse(bh);
+	return retval;
+err:
+	if (block)
+		affs_free_block(sb, block);
+	affs_unlock_link(inode);
+	goto done;
+}
diff --git a/kernel/fs/affs/namei.c b/kernel/fs/affs/namei.c
new file mode 100644
index 000000000..181e05b46
--- /dev/null
+++ b/kernel/fs/affs/namei.c
@@ -0,0 +1,462 @@
+/*
+ *  linux/fs/affs/namei.c
+ *
+ *  (c) 1996  Hans-Joachim Widmaier - Rewritten
+ *
+ *  (C) 1993  Ray Burr - Modified for Amiga FFS filesystem.
+ *
+ *  (C) 1991  Linus Torvalds - minix filesystem
+ */
+
+#include "affs.h"
+
+typedef int (*toupper_t)(int);
+
+static int	 affs_toupper(int ch);
+static int	 affs_hash_dentry(const struct dentry *, struct qstr *);
+static int       affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name);
+static int	 affs_intl_toupper(int ch);
+static int	 affs_intl_hash_dentry(const struct dentry *, struct qstr *);
+static int       affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name);
+
+const struct dentry_operations affs_dentry_operations = {
+	.d_hash		= affs_hash_dentry,
+	.d_compare	= affs_compare_dentry,
+};
+
+const struct dentry_operations affs_intl_dentry_operations = {
+	.d_hash		= affs_intl_hash_dentry,
+	.d_compare	= affs_intl_compare_dentry,
+};
+
+
+/* Simple toupper() for DOS\1 */
+
+static int
+affs_toupper(int ch)
+{
+	return ch >= 'a' && ch <= 'z' ? ch -= ('a' - 'A') : ch;
+}
+
+/* International toupper() for DOS\3 ("international") */
+
+static int
+affs_intl_toupper(int ch)
+{
+	return (ch >= 'a' && ch <= 'z') || (ch >= 0xE0
+		&& ch <= 0xFE && ch != 0xF7) ?
+		ch - ('a' - 'A') : ch;
+}
+
+static inline toupper_t
+affs_get_toupper(struct super_block *sb)
+{
+	return affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL) ?
+	       affs_intl_toupper : affs_toupper;
+}
+
+/*
+ * Note: the dentry argument is the parent dentry.
+ */
+static inline int
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
+{
+	const u8 *name = qstr->name;
+	unsigned long hash;
+	int retval;
+	u32 len;
+
+	retval = affs_check_name(qstr->name, qstr->len, notruncate);
+	if (retval)
+		return retval;
+
+	hash = init_name_hash();
+	len = min(qstr->len, AFFSNAMEMAX);
+	for (; len > 0; name++, len--)
+		hash = partial_name_hash(toupper(*name), hash);
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+static int
+affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
+{
+	return __affs_hash_dentry(qstr, affs_toupper,
+				  affs_nofilenametruncate(dentry));
+
+}
+
+static int
+affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
+{
+	return __affs_hash_dentry(qstr, affs_intl_toupper,
+				  affs_nofilenametruncate(dentry));
+
+}
+
+static inline int __affs_compare_dentry(unsigned int len,
+		const char *str, const struct qstr *name, toupper_t toupper,
+		bool notruncate)
+{
+	const u8 *aname = str;
+	const u8 *bname = name->name;
+
+	/*
+	 * 'str' is the name of an already existing dentry, so the name
+	 * must be valid. 'name' must be validated first.
+	 */
+
+	if (affs_check_name(name->name, name->len, notruncate))
+		return 1;
+
+	/*
+	 * If the names are longer than the allowed 30 chars,
+	 * the excess is ignored, so their length may differ.
+	 */
+	if (len >= AFFSNAMEMAX) {
+		if (name->len < AFFSNAMEMAX)
+			return 1;
+		len = AFFSNAMEMAX;
+	} else if (len != name->len)
+		return 1;
+
+	for (; len > 0; len--)
+		if (toupper(*aname++) != toupper(*bname++))
+			return 1;
+
+	return 0;
+}
+
+static int
+affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+
+	return __affs_compare_dentry(len, str, name, affs_toupper,
+				     affs_nofilenametruncate(parent));
+}
+
+static int
+affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	return __affs_compare_dentry(len, str, name, affs_intl_toupper,
+				     affs_nofilenametruncate(parent));
+
+}
+
+/*
+ * NOTE! unlike strncmp, affs_match returns 1 for success, 0 for failure.
+ */
+
+static inline int
+affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
+{
+	const u8 *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+
+	if (len >= AFFSNAMEMAX) {
+		if (*name2 < AFFSNAMEMAX)
+			return 0;
+		len = AFFSNAMEMAX;
+	} else if (len != *name2)
+		return 0;
+
+	for (name2++; len > 0; len--)
+		if (toupper(*name++) != toupper(*name2++))
+			return 0;
+	return 1;
+}
+
+int
+affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len)
+{
+	toupper_t toupper = affs_get_toupper(sb);
+	u32 hash;
+
+	hash = len = min(len, AFFSNAMEMAX);
+	for (; len > 0; len--)
+		hash = (hash * 13 + toupper(*name++)) & 0x7ff;
+
+	return hash % AFFS_SB(sb)->s_hashsize;
+}
+
+static struct buffer_head *
+affs_find_entry(struct inode *dir, struct dentry *dentry)
+{
+	struct super_block *sb = dir->i_sb;
+	struct buffer_head *bh;
+	toupper_t toupper = affs_get_toupper(sb);
+	u32 key;
+
+	pr_debug("%s(\"%pd\")\n", __func__, dentry);
+
+	bh = affs_bread(sb, dir->i_ino);
+	if (!bh)
+		return ERR_PTR(-EIO);
+
+	key = be32_to_cpu(AFFS_HEAD(bh)->table[affs_hash_name(sb, dentry->d_name.name, dentry->d_name.len)]);
+
+	for (;;) {
+		affs_brelse(bh);
+		if (key == 0)
+			return NULL;
+		bh = affs_bread(sb, key);
+		if (!bh)
+			return ERR_PTR(-EIO);
+		if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper))
+			return bh;
+		key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain);
+	}
+}
+
+struct dentry *
+affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	struct super_block *sb = dir->i_sb;
+	struct buffer_head *bh;
+	struct inode *inode = NULL;
+
+	pr_debug("%s(\"%pd\")\n", __func__, dentry);
+
+	affs_lock_dir(dir);
+	bh = affs_find_entry(dir, dentry);
+	affs_unlock_dir(dir);
+	if (IS_ERR(bh))
+		return ERR_CAST(bh);
+	if (bh) {
+		u32 ino = bh->b_blocknr;
+
+		/* store the real header ino in d_fsdata for faster lookups */
+		dentry->d_fsdata = (void *)(long)ino;
+		switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) {
+		//link to dirs disabled
+		//case ST_LINKDIR:
+		case ST_LINKFILE:
+			ino = be32_to_cpu(AFFS_TAIL(sb, bh)->original);
+		}
+		affs_brelse(bh);
+		inode = affs_iget(sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	d_add(dentry, inode);
+	return NULL;
+}
+
+int
+affs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 d_inode(dentry)->i_ino, dentry);
+
+	return affs_remove_header(dentry);
+}
+
+int
+affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode	*inode;
+	int		 error;
+
+	pr_debug("%s(%lu,\"%pd\",0%ho)\n",
+		 __func__, dir->i_ino, dentry, mode);
+
+	inode = affs_new_inode(dir);
+	if (!inode)
+		return -ENOSPC;
+
+	inode->i_mode = mode;
+	mode_to_prot(inode);
+	mark_inode_dirty(inode);
+
+	inode->i_op = &affs_file_inode_operations;
+	inode->i_fop = &affs_file_operations;
+	inode->i_mapping->a_ops = affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS) ?
+				  &affs_aops_ofs : &affs_aops;
+	error = affs_add_entry(dir, inode, dentry, ST_FILE);
+	if (error) {
+		clear_nlink(inode);
+		iput(inode);
+		return error;
+	}
+	return 0;
+}
+
+int
+affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode		*inode;
+	int			 error;
+
+	pr_debug("%s(%lu,\"%pd\",0%ho)\n",
+		 __func__, dir->i_ino, dentry, mode);
+
+	inode = affs_new_inode(dir);
+	if (!inode)
+		return -ENOSPC;
+
+	inode->i_mode = S_IFDIR | mode;
+	mode_to_prot(inode);
+
+	inode->i_op = &affs_dir_inode_operations;
+	inode->i_fop = &affs_dir_operations;
+
+	error = affs_add_entry(dir, inode, dentry, ST_USERDIR);
+	if (error) {
+		clear_nlink(inode);
+		mark_inode_dirty(inode);
+		iput(inode);
+		return error;
+	}
+	return 0;
+}
+
+int
+affs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 d_inode(dentry)->i_ino, dentry);
+
+	return affs_remove_header(dentry);
+}
+
+int
+affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	struct super_block	*sb = dir->i_sb;
+	struct buffer_head	*bh;
+	struct inode		*inode;
+	char			*p;
+	int			 i, maxlen, error;
+	char			 c, lc;
+
+	pr_debug("%s(%lu,\"%pd\" -> \"%s\")\n",
+		 __func__, dir->i_ino, dentry, symname);
+
+	maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1;
+	inode  = affs_new_inode(dir);
+	if (!inode)
+		return -ENOSPC;
+
+	inode->i_op = &affs_symlink_inode_operations;
+	inode->i_data.a_ops = &affs_symlink_aops;
+	inode->i_mode = S_IFLNK | 0777;
+	mode_to_prot(inode);
+
+	error = -EIO;
+	bh = affs_bread(sb, inode->i_ino);
+	if (!bh)
+		goto err;
+	i  = 0;
+	p  = (char *)AFFS_HEAD(bh)->table;
+	lc = '/';
+	if (*symname == '/') {
+		struct affs_sb_info *sbi = AFFS_SB(sb);
+		while (*symname == '/')
+			symname++;
+		spin_lock(&sbi->symlink_lock);
+		while (sbi->s_volume[i])	/* Cannot overflow */
+			*p++ = sbi->s_volume[i++];
+		spin_unlock(&sbi->symlink_lock);
+	}
+	while (i < maxlen && (c = *symname++)) {
+		if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') {
+			*p++ = '/';
+			i++;
+			symname += 2;
+			lc = '/';
+		} else if (c == '.' && lc == '/' && *symname == '/') {
+			symname++;
+			lc = '/';
+		} else {
+			*p++ = c;
+			lc   = c;
+			i++;
+		}
+		if (lc == '/')
+			while (*symname == '/')
+				symname++;
+	}
+	*p = 0;
+	mark_buffer_dirty_inode(bh, inode);
+	affs_brelse(bh);
+	mark_inode_dirty(inode);
+
+	error = affs_add_entry(dir, inode, dentry, ST_SOFTLINK);
+	if (error)
+		goto err;
+
+	return 0;
+
+err:
+	clear_nlink(inode);
+	mark_inode_dirty(inode);
+	iput(inode);
+	return error;
+}
+
+int
+affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+
+	pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino,
+		 dentry);
+
+	return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
+}
+
+int
+affs_rename(struct inode *old_dir, struct dentry *old_dentry,
+	    struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct super_block *sb = old_dir->i_sb;
+	struct buffer_head *bh = NULL;
+	int retval;
+
+	pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+		 old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
+
+	retval = affs_check_name(new_dentry->d_name.name,
+				 new_dentry->d_name.len,
+				 affs_nofilenametruncate(old_dentry));
+
+	if (retval)
+		return retval;
+
+	/* Unlink destination if it already exists */
+	if (d_really_is_positive(new_dentry)) {
+		retval = affs_remove_header(new_dentry);
+		if (retval)
+			return retval;
+	}
+
+	bh = affs_bread(sb, d_inode(old_dentry)->i_ino);
+	if (!bh)
+		return -EIO;
+
+	/* Remove header from its parent directory. */
+	affs_lock_dir(old_dir);
+	retval = affs_remove_hash(old_dir, bh);
+	affs_unlock_dir(old_dir);
+	if (retval)
+		goto done;
+
+	/* And insert it into the new directory with the new name. */
+	affs_copy_name(AFFS_TAIL(sb, bh)->name, new_dentry);
+	affs_fix_checksum(sb, bh);
+	affs_lock_dir(new_dir);
+	retval = affs_insert_hash(new_dir, bh);
+	affs_unlock_dir(new_dir);
+	/* TODO: move it back to old_dir, if error? */
+
+done:
+	mark_buffer_dirty_inode(bh, retval ? old_dir : new_dir);
+	affs_brelse(bh);
+	return retval;
+}
diff --git a/kernel/fs/affs/super.c b/kernel/fs/affs/super.c
new file mode 100644
index 000000000..3f89c9e05
--- /dev/null
+++ b/kernel/fs/affs/super.c
@@ -0,0 +1,649 @@
+/*
+ *  linux/fs/affs/inode.c
+ *
+ *  (c) 1996  Hans-Joachim Widmaier - Rewritten
+ *
+ *  (C) 1993  Ray Burr - Modified for Amiga FFS filesystem.
+ *
+ *  (C) 1992  Eric Youngdale Modified for ISO 9660 filesystem.
+ *
+ *  (C) 1991  Linus Torvalds - minix filesystem
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/statfs.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include "affs.h"
+
+static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int affs_remount (struct super_block *sb, int *flags, char *data);
+
+static void
+affs_commit_super(struct super_block *sb, int wait)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	struct buffer_head *bh = sbi->s_root_bh;
+	struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
+
+	lock_buffer(bh);
+	secs_to_datestamp(get_seconds(), &tail->disk_change);
+	affs_fix_checksum(sb, bh);
+	unlock_buffer(bh);
+
+	mark_buffer_dirty(bh);
+	if (wait)
+		sync_dirty_buffer(bh);
+}
+
+static void
+affs_put_super(struct super_block *sb)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	pr_debug("%s()\n", __func__);
+
+	cancel_delayed_work_sync(&sbi->sb_work);
+}
+
+static int
+affs_sync_fs(struct super_block *sb, int wait)
+{
+	affs_commit_super(sb, wait);
+	return 0;
+}
+
+static void flush_superblock(struct work_struct *work)
+{
+	struct affs_sb_info *sbi;
+	struct super_block *sb;
+
+	sbi = container_of(work, struct affs_sb_info, sb_work.work);
+	sb = sbi->sb;
+
+	spin_lock(&sbi->work_lock);
+	sbi->work_queued = 0;
+	spin_unlock(&sbi->work_lock);
+
+	affs_commit_super(sb, 1);
+}
+
+void affs_mark_sb_dirty(struct super_block *sb)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	unsigned long delay;
+
+	if (sb->s_flags & MS_RDONLY)
+	       return;
+
+	spin_lock(&sbi->work_lock);
+	if (!sbi->work_queued) {
+	       delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+	       queue_delayed_work(system_long_wq, &sbi->sb_work, delay);
+	       sbi->work_queued = 1;
+	}
+	spin_unlock(&sbi->work_lock);
+}
+
+static struct kmem_cache * affs_inode_cachep;
+
+static struct inode *affs_alloc_inode(struct super_block *sb)
+{
+	struct affs_inode_info *i;
+
+	i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
+	if (!i)
+		return NULL;
+
+	i->vfs_inode.i_version = 1;
+	i->i_lc = NULL;
+	i->i_ext_bh = NULL;
+	i->i_pa_cnt = 0;
+
+	return &i->vfs_inode;
+}
+
+static void affs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
+}
+
+static void affs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, affs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
+
+	sema_init(&ei->i_link_lock, 1);
+	sema_init(&ei->i_ext_lock, 1);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
+					     sizeof(struct affs_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (affs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(affs_inode_cachep);
+}
+
+static const struct super_operations affs_sops = {
+	.alloc_inode	= affs_alloc_inode,
+	.destroy_inode	= affs_destroy_inode,
+	.write_inode	= affs_write_inode,
+	.evict_inode	= affs_evict_inode,
+	.put_super	= affs_put_super,
+	.sync_fs	= affs_sync_fs,
+	.statfs		= affs_statfs,
+	.remount_fs	= affs_remount,
+	.show_options	= generic_show_options,
+};
+
+enum {
+	Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
+	Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
+	Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_bs, "bs=%u"},
+	{Opt_mode, "mode=%o"},
+	{Opt_mufs, "mufs"},
+	{Opt_notruncate, "nofilenametruncate"},
+	{Opt_prefix, "prefix=%s"},
+	{Opt_protect, "protect"},
+	{Opt_reserved, "reserved=%u"},
+	{Opt_root, "root=%u"},
+	{Opt_setgid, "setgid=%u"},
+	{Opt_setuid, "setuid=%u"},
+	{Opt_verbose, "verbose"},
+	{Opt_volume, "volume=%s"},
+	{Opt_ignore, "grpquota"},
+	{Opt_ignore, "noquota"},
+	{Opt_ignore, "quota"},
+	{Opt_ignore, "usrquota"},
+	{Opt_err, NULL},
+};
+
+static int
+parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root,
+		int *blocksize, char **prefix, char *volume, unsigned long *mount_opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+
+	/* Fill in defaults */
+
+	*uid        = current_uid();
+	*gid        = current_gid();
+	*reserved   = 2;
+	*root       = -1;
+	*blocksize  = -1;
+	volume[0]   = ':';
+	volume[1]   = 0;
+	*mount_opts = 0;
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token, n, option;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_bs:
+			if (match_int(&args[0], &n))
+				return 0;
+			if (n != 512 && n != 1024 && n != 2048
+			    && n != 4096) {
+				pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
+				return 0;
+			}
+			*blocksize = n;
+			break;
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				return 0;
+			*mode = option & 0777;
+			affs_set_opt(*mount_opts, SF_SETMODE);
+			break;
+		case Opt_mufs:
+			affs_set_opt(*mount_opts, SF_MUFS);
+			break;
+		case Opt_notruncate:
+			affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
+			break;
+		case Opt_prefix:
+			*prefix = match_strdup(&args[0]);
+			if (!*prefix)
+				return 0;
+			affs_set_opt(*mount_opts, SF_PREFIX);
+			break;
+		case Opt_protect:
+			affs_set_opt(*mount_opts, SF_IMMUTABLE);
+			break;
+		case Opt_reserved:
+			if (match_int(&args[0], reserved))
+				return 0;
+			break;
+		case Opt_root:
+			if (match_int(&args[0], root))
+				return 0;
+			break;
+		case Opt_setgid:
+			if (match_int(&args[0], &option))
+				return 0;
+			*gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(*gid))
+				return 0;
+			affs_set_opt(*mount_opts, SF_SETGID);
+			break;
+		case Opt_setuid:
+			if (match_int(&args[0], &option))
+				return 0;
+			*uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(*uid))
+				return 0;
+			affs_set_opt(*mount_opts, SF_SETUID);
+			break;
+		case Opt_verbose:
+			affs_set_opt(*mount_opts, SF_VERBOSE);
+			break;
+		case Opt_volume: {
+			char *vol = match_strdup(&args[0]);
+			if (!vol)
+				return 0;
+			strlcpy(volume, vol, 32);
+			kfree(vol);
+			break;
+		}
+		case Opt_ignore:
+		 	/* Silently ignore the quota options */
+			break;
+		default:
+			pr_warn("Unrecognized mount option \"%s\" or missing value\n",
+				p);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/* This function definitely needs to be split up. Some fine day I'll
+ * hopefully have the guts to do so. Until then: sorry for the mess.
+ */
+
+static int affs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct affs_sb_info	*sbi;
+	struct buffer_head	*root_bh = NULL;
+	struct buffer_head	*boot_bh;
+	struct inode		*root_inode = NULL;
+	s32			 root_block;
+	int			 size, blocksize;
+	u32			 chksum;
+	int			 num_bm;
+	int			 i, j;
+	kuid_t			 uid;
+	kgid_t			 gid;
+	int			 reserved;
+	unsigned long		 mount_flags;
+	int			 tmp_flags;	/* fix remount prototype... */
+	u8			 sig[4];
+	int			 ret;
+
+	save_mount_options(sb, data);
+
+	pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
+
+	sb->s_magic             = AFFS_SUPER_MAGIC;
+	sb->s_op                = &affs_sops;
+	sb->s_flags |= MS_NODIRATIME;
+
+	sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sb->s_fs_info = sbi;
+	sbi->sb = sb;
+	mutex_init(&sbi->s_bmlock);
+	spin_lock_init(&sbi->symlink_lock);
+	spin_lock_init(&sbi->work_lock);
+	INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock);
+
+	if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
+				&blocksize,&sbi->s_prefix,
+				sbi->s_volume, &mount_flags)) {
+		pr_err("Error parsing options\n");
+		return -EINVAL;
+	}
+	/* N.B. after this point s_prefix must be released */
+
+	sbi->s_flags   = mount_flags;
+	sbi->s_mode    = i;
+	sbi->s_uid     = uid;
+	sbi->s_gid     = gid;
+	sbi->s_reserved= reserved;
+
+	/* Get the size of the device in 512-byte blocks.
+	 * If we later see that the partition uses bigger
+	 * blocks, we will have to change it.
+	 */
+
+	size = sb->s_bdev->bd_inode->i_size >> 9;
+	pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
+
+	affs_set_blocksize(sb, PAGE_SIZE);
+	/* Try to find root block. Its location depends on the block size. */
+
+	i = 512;
+	j = 4096;
+	if (blocksize > 0) {
+		i = j = blocksize;
+		size = size / (blocksize / 512);
+	}
+	for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
+		sbi->s_root_block = root_block;
+		if (root_block < 0)
+			sbi->s_root_block = (reserved + size - 1) / 2;
+		pr_debug("setting blocksize to %d\n", blocksize);
+		affs_set_blocksize(sb, blocksize);
+		sbi->s_partition_size = size;
+
+		/* The root block location that was calculated above is not
+		 * correct if the partition size is an odd number of 512-
+		 * byte blocks, which will be rounded down to a number of
+		 * 1024-byte blocks, and if there were an even number of
+		 * reserved blocks. Ideally, all partition checkers should
+		 * report the real number of blocks of the real blocksize,
+		 * but since this just cannot be done, we have to try to
+		 * find the root block anyways. In the above case, it is one
+		 * block behind the calculated one. So we check this one, too.
+		 */
+		for (num_bm = 0; num_bm < 2; num_bm++) {
+			pr_debug("Dev %s, trying root=%u, bs=%d, "
+				"size=%d, reserved=%d\n",
+				sb->s_id,
+				sbi->s_root_block + num_bm,
+				blocksize, size, reserved);
+			root_bh = affs_bread(sb, sbi->s_root_block + num_bm);
+			if (!root_bh)
+				continue;
+			if (!affs_checksum_block(sb, root_bh) &&
+			    be32_to_cpu(AFFS_ROOT_HEAD(root_bh)->ptype) == T_SHORT &&
+			    be32_to_cpu(AFFS_ROOT_TAIL(sb, root_bh)->stype) == ST_ROOT) {
+				sbi->s_hashsize    = blocksize / 4 - 56;
+				sbi->s_root_block += num_bm;
+				goto got_root;
+			}
+			affs_brelse(root_bh);
+			root_bh = NULL;
+		}
+	}
+	if (!silent)
+		pr_err("No valid root block on device %s\n", sb->s_id);
+	return -EINVAL;
+
+	/* N.B. after this point bh must be released */
+got_root:
+	/* Keep super block in cache */
+	sbi->s_root_bh = root_bh;
+	root_block = sbi->s_root_block;
+
+	/* Find out which kind of FS we have */
+	boot_bh = sb_bread(sb, 0);
+	if (!boot_bh) {
+		pr_err("Cannot read boot block\n");
+		return -EINVAL;
+	}
+	memcpy(sig, boot_bh->b_data, 4);
+	brelse(boot_bh);
+	chksum = be32_to_cpu(*(__be32 *)sig);
+
+	/* Dircache filesystems are compatible with non-dircache ones
+	 * when reading. As long as they aren't supported, writing is
+	 * not recommended.
+	 */
+	if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS
+	     || chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) {
+		pr_notice("Dircache FS - mounting %s read only\n", sb->s_id);
+		sb->s_flags |= MS_RDONLY;
+	}
+	switch (chksum) {
+	case MUFS_FS:
+	case MUFS_INTLFFS:
+	case MUFS_DCFFS:
+		affs_set_opt(sbi->s_flags, SF_MUFS);
+		/* fall thru */
+	case FS_INTLFFS:
+	case FS_DCFFS:
+		affs_set_opt(sbi->s_flags, SF_INTL);
+		break;
+	case MUFS_FFS:
+		affs_set_opt(sbi->s_flags, SF_MUFS);
+		break;
+	case FS_FFS:
+		break;
+	case MUFS_OFS:
+		affs_set_opt(sbi->s_flags, SF_MUFS);
+		/* fall thru */
+	case FS_OFS:
+		affs_set_opt(sbi->s_flags, SF_OFS);
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	case MUFS_DCOFS:
+	case MUFS_INTLOFS:
+		affs_set_opt(sbi->s_flags, SF_MUFS);
+	case FS_DCOFS:
+	case FS_INTLOFS:
+		affs_set_opt(sbi->s_flags, SF_INTL);
+		affs_set_opt(sbi->s_flags, SF_OFS);
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	default:
+		pr_err("Unknown filesystem on device %s: %08X\n",
+		       sb->s_id, chksum);
+		return -EINVAL;
+	}
+
+	if (affs_test_opt(mount_flags, SF_VERBOSE)) {
+		u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
+		pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
+			len > 31 ? 31 : len,
+			AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
+			sig, sig[3] + '0', blocksize);
+	}
+
+	sb->s_flags |= MS_NODEV | MS_NOSUID;
+
+	sbi->s_data_blksize = sb->s_blocksize;
+	if (affs_test_opt(sbi->s_flags, SF_OFS))
+		sbi->s_data_blksize -= 24;
+
+	tmp_flags = sb->s_flags;
+	ret = affs_init_bitmap(sb, &tmp_flags);
+	if (ret)
+		return ret;
+	sb->s_flags = tmp_flags;
+
+	/* set up enough so that it can read an inode */
+
+	root_inode = affs_iget(sb, root_block);
+	if (IS_ERR(root_inode))
+		return PTR_ERR(root_inode);
+
+	if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL))
+		sb->s_d_op = &affs_intl_dentry_operations;
+	else
+		sb->s_d_op = &affs_dentry_operations;
+
+	sb->s_root = d_make_root(root_inode);
+	if (!sb->s_root) {
+		pr_err("AFFS: Get root inode failed\n");
+		return -ENOMEM;
+	}
+
+	pr_debug("s_flags=%lX\n", sb->s_flags);
+	return 0;
+}
+
+static int
+affs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct affs_sb_info	*sbi = AFFS_SB(sb);
+	int			 blocksize;
+	kuid_t			 uid;
+	kgid_t			 gid;
+	int			 mode;
+	int			 reserved;
+	int			 root_block;
+	unsigned long		 mount_flags;
+	int			 res = 0;
+	char			*new_opts;
+	char			 volume[32];
+	char			*prefix = NULL;
+
+	new_opts = kstrdup(data, GFP_KERNEL);
+	if (!new_opts)
+		return -ENOMEM;
+
+	pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
+
+	sync_filesystem(sb);
+	*flags |= MS_NODIRATIME;
+
+	memcpy(volume, sbi->s_volume, 32);
+	if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
+			   &blocksize, &prefix, volume,
+			   &mount_flags)) {
+		kfree(prefix);
+		kfree(new_opts);
+		return -EINVAL;
+	}
+
+	flush_delayed_work(&sbi->sb_work);
+	replace_mount_options(sb, new_opts);
+
+	sbi->s_flags = mount_flags;
+	sbi->s_mode  = mode;
+	sbi->s_uid   = uid;
+	sbi->s_gid   = gid;
+	/* protect against readers */
+	spin_lock(&sbi->symlink_lock);
+	if (prefix) {
+		kfree(sbi->s_prefix);
+		sbi->s_prefix = prefix;
+	}
+	memcpy(sbi->s_volume, volume, 32);
+	spin_unlock(&sbi->symlink_lock);
+
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		return 0;
+
+	if (*flags & MS_RDONLY)
+		affs_free_bitmap(sb);
+	else
+		res = affs_init_bitmap(sb, flags);
+
+	return res;
+}
+
+static int
+affs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	int		 free;
+	u64		 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	pr_debug("%s() partsize=%d, reserved=%d\n",
+		 __func__, AFFS_SB(sb)->s_partition_size,
+		 AFFS_SB(sb)->s_reserved);
+
+	free          = affs_count_free_blocks(sb);
+	buf->f_type    = AFFS_SUPER_MAGIC;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
+	buf->f_bfree   = free;
+	buf->f_bavail  = free;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = AFFSNAMEMAX;
+	return 0;
+}
+
+static struct dentry *affs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+}
+
+static void affs_kill_sb(struct super_block *sb)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	kill_block_super(sb);
+	if (sbi) {
+		affs_free_bitmap(sb);
+		affs_brelse(sbi->s_root_bh);
+		kfree(sbi->s_prefix);
+		mutex_destroy(&sbi->s_bmlock);
+		kfree(sbi);
+	}
+}
+
+static struct file_system_type affs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "affs",
+	.mount		= affs_mount,
+	.kill_sb	= affs_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("affs");
+
+static int __init init_affs_fs(void)
+{
+	int err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&affs_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	return err;
+}
+
+static void __exit exit_affs_fs(void)
+{
+	unregister_filesystem(&affs_fs_type);
+	destroy_inodecache();
+}
+
+MODULE_DESCRIPTION("Amiga filesystem support for Linux");
+MODULE_LICENSE("GPL");
+
+module_init(init_affs_fs)
+module_exit(exit_affs_fs)
diff --git a/kernel/fs/affs/symlink.c b/kernel/fs/affs/symlink.c
new file mode 100644
index 000000000..f39b71c39
--- /dev/null
+++ b/kernel/fs/affs/symlink.c
@@ -0,0 +1,81 @@
+/*
+ *  linux/fs/affs/symlink.c
+ *
+ *  1995  Hans-Joachim Widmaier - Modified for affs.
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  affs symlink handling code
+ */
+
+#include "affs.h"
+
+static int affs_symlink_readpage(struct file *file, struct page *page)
+{
+	struct buffer_head *bh;
+	struct inode *inode = page->mapping->host;
+	char *link = kmap(page);
+	struct slink_front *lf;
+	int err;
+	int			 i, j;
+	char			 c;
+	char			 lc;
+
+	pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
+
+	err = -EIO;
+	bh = affs_bread(inode->i_sb, inode->i_ino);
+	if (!bh)
+		goto fail;
+	i  = 0;
+	j  = 0;
+	lf = (struct slink_front *)bh->b_data;
+	lc = 0;
+
+	if (strchr(lf->symname,':')) {	/* Handle assign or volume name */
+		struct affs_sb_info *sbi = AFFS_SB(inode->i_sb);
+		char *pf;
+		spin_lock(&sbi->symlink_lock);
+		pf = sbi->s_prefix ? sbi->s_prefix : "/";
+		while (i < 1023 && (c = pf[i]))
+			link[i++] = c;
+		spin_unlock(&sbi->symlink_lock);
+		while (i < 1023 && lf->symname[j] != ':')
+			link[i++] = lf->symname[j++];
+		if (i < 1023)
+			link[i++] = '/';
+		j++;
+		lc = '/';
+	}
+	while (i < 1023 && (c = lf->symname[j])) {
+		if (c == '/' && lc == '/' && i < 1020) {	/* parent dir */
+			link[i++] = '.';
+			link[i++] = '.';
+		}
+		link[i++] = c;
+		lc = c;
+		j++;
+	}
+	link[i] = '\0';
+	affs_brelse(bh);
+	SetPageUptodate(page);
+	kunmap(page);
+	unlock_page(page);
+	return 0;
+fail:
+	SetPageError(page);
+	kunmap(page);
+	unlock_page(page);
+	return err;
+}
+
+const struct address_space_operations affs_symlink_aops = {
+	.readpage	= affs_symlink_readpage,
+};
+
+const struct inode_operations affs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= affs_notify_change,
+};
diff --git a/kernel/fs/afs/Kconfig b/kernel/fs/afs/Kconfig
new file mode 100644
index 000000000..ebba3b18e
--- /dev/null
+++ b/kernel/fs/afs/Kconfig
@@ -0,0 +1,29 @@
+config AFS_FS
+	tristate "Andrew File System support (AFS)"
+	depends on INET
+	select AF_RXRPC
+	select DNS_RESOLVER
+	help
+	  If you say Y here, you will get an experimental Andrew File System
+	  driver. It currently only supports unsecured read-only AFS access.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
+
+config AFS_DEBUG
+	bool "AFS dynamic debugging"
+	depends on AFS_FS
+	help
+	  Say Y here to make runtime controllable debugging messages appear.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
+
+config AFS_FSCACHE
+	bool "Provide AFS client caching support"
+	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want AFS data to be cached locally on disk through
+	  the generic filesystem cache manager
diff --git a/kernel/fs/afs/Makefile b/kernel/fs/afs/Makefile
new file mode 100644
index 000000000..4f64b95d5
--- /dev/null
+++ b/kernel/fs/afs/Makefile
@@ -0,0 +1,32 @@
+#
+# Makefile for Red Hat Linux AFS client.
+#
+
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
+kafs-objs := \
+	$(afs-cache-y) \
+	callback.o \
+	cell.o \
+	cmservice.o \
+	dir.o \
+	file.o \
+	flock.o \
+	fsclient.o \
+	inode.o \
+	main.o \
+	misc.o \
+	mntpt.o \
+	proc.o \
+	rxrpc.o \
+	security.o \
+	server.o \
+	super.o \
+	netdevices.o \
+	vlclient.o \
+	vlocation.o \
+	vnode.o \
+	volume.o \
+	write.o
+
+obj-$(CONFIG_AFS_FS)  := kafs.o
diff --git a/kernel/fs/afs/afs.h b/kernel/fs/afs/afs.h
new file mode 100644
index 000000000..3c462ff6d
--- /dev/null
+++ b/kernel/fs/afs/afs.h
@@ -0,0 +1,170 @@
+/* AFS common types
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_H
+#define AFS_H
+
+#include <linux/in.h>
+
+#define AFS_MAXCELLNAME	64		/* maximum length of a cell name */
+#define AFS_MAXVOLNAME	64		/* maximum length of a volume name */
+#define AFSNAMEMAX	256		/* maximum length of a filename plus NUL */
+#define AFSPATHMAX	1024		/* maximum length of a pathname plus NUL */
+#define AFSOPAQUEMAX	1024		/* maximum length of an opaque field */
+
+typedef unsigned			afs_volid_t;
+typedef unsigned			afs_vnodeid_t;
+typedef unsigned long long		afs_dataversion_t;
+
+typedef enum {
+	AFSVL_RWVOL,			/* read/write volume */
+	AFSVL_ROVOL,			/* read-only volume */
+	AFSVL_BACKVOL,			/* backup volume */
+} __attribute__((packed)) afs_voltype_t;
+
+typedef enum {
+	AFS_FTYPE_INVALID	= 0,
+	AFS_FTYPE_FILE		= 1,
+	AFS_FTYPE_DIR		= 2,
+	AFS_FTYPE_SYMLINK	= 3,
+} afs_file_type_t;
+
+typedef enum {
+	AFS_LOCK_READ		= 0,	/* read lock request */
+	AFS_LOCK_WRITE		= 1,	/* write lock request */
+} afs_lock_type_t;
+
+#define AFS_LOCKWAIT		(5 * 60) /* time until a lock times out (seconds) */
+
+/*
+ * AFS file identifier
+ */
+struct afs_fid {
+	afs_volid_t	vid;		/* volume ID */
+	afs_vnodeid_t	vnode;		/* file index within volume */
+	unsigned	unique;		/* unique ID number (file index version) */
+};
+
+/*
+ * AFS callback notification
+ */
+typedef enum {
+	AFSCM_CB_UNTYPED	= 0,	/* no type set on CB break */
+	AFSCM_CB_EXCLUSIVE	= 1,	/* CB exclusive to CM [not implemented] */
+	AFSCM_CB_SHARED		= 2,	/* CB shared by other CM's */
+	AFSCM_CB_DROPPED	= 3,	/* CB promise cancelled by file server */
+} afs_callback_type_t;
+
+struct afs_callback {
+	struct afs_fid		fid;		/* file identifier */
+	unsigned		version;	/* callback version */
+	unsigned		expiry;		/* time at which expires */
+	afs_callback_type_t	type;		/* type of callback */
+};
+
+#define AFSCBMAX 50	/* maximum callbacks transferred per bulk op */
+
+/*
+ * AFS volume information
+ */
+struct afs_volume_info {
+	afs_volid_t		vid;		/* volume ID */
+	afs_voltype_t		type;		/* type of this volume */
+	afs_volid_t		type_vids[5];	/* volume ID's for possible types for this vol */
+
+	/* list of fileservers serving this volume */
+	size_t			nservers;	/* number of entries used in servers[] */
+	struct {
+		struct in_addr	addr;		/* fileserver address */
+	} servers[8];
+};
+
+/*
+ * AFS security ACE access mask
+ */
+typedef u32 afs_access_t;
+#define AFS_ACE_READ		0x00000001U	/* - permission to read a file/dir */
+#define AFS_ACE_WRITE		0x00000002U	/* - permission to write/chmod a file */
+#define AFS_ACE_INSERT		0x00000004U	/* - permission to create dirent in a dir */
+#define AFS_ACE_LOOKUP		0x00000008U	/* - permission to lookup a file/dir in a dir */
+#define AFS_ACE_DELETE		0x00000010U	/* - permission to delete a dirent from a dir */
+#define AFS_ACE_LOCK		0x00000020U	/* - permission to lock a file */
+#define AFS_ACE_ADMINISTER	0x00000040U	/* - permission to change ACL */
+#define AFS_ACE_USER_A		0x01000000U	/* - 'A' user-defined permission */
+#define AFS_ACE_USER_B		0x02000000U	/* - 'B' user-defined permission */
+#define AFS_ACE_USER_C		0x04000000U	/* - 'C' user-defined permission */
+#define AFS_ACE_USER_D		0x08000000U	/* - 'D' user-defined permission */
+#define AFS_ACE_USER_E		0x10000000U	/* - 'E' user-defined permission */
+#define AFS_ACE_USER_F		0x20000000U	/* - 'F' user-defined permission */
+#define AFS_ACE_USER_G		0x40000000U	/* - 'G' user-defined permission */
+#define AFS_ACE_USER_H		0x80000000U	/* - 'H' user-defined permission */
+
+/*
+ * AFS file status information
+ */
+struct afs_file_status {
+	unsigned		if_version;	/* interface version */
+#define AFS_FSTATUS_VERSION	1
+
+	afs_file_type_t		type;		/* file type */
+	unsigned		nlink;		/* link count */
+	u64			size;		/* file size */
+	afs_dataversion_t	data_version;	/* current data version */
+	u32			author;		/* author ID */
+	kuid_t			owner;		/* owner ID */
+	kgid_t			group;		/* group ID */
+	afs_access_t		caller_access;	/* access rights for authenticated caller */
+	afs_access_t		anon_access;	/* access rights for unauthenticated caller */
+	umode_t			mode;		/* UNIX mode */
+	struct afs_fid		parent;		/* parent dir ID for non-dirs only */
+	time_t			mtime_client;	/* last time client changed data */
+	time_t			mtime_server;	/* last time server changed data */
+	s32			lock_count;	/* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */
+};
+
+/*
+ * AFS file status change request
+ */
+
+#define AFS_SET_MTIME		0x01		/* set the mtime */
+#define AFS_SET_OWNER		0x02		/* set the owner ID */
+#define AFS_SET_GROUP		0x04		/* set the group ID (unsupported?) */
+#define AFS_SET_MODE		0x08		/* set the UNIX mode */
+#define AFS_SET_SEG_SIZE	0x10		/* set the segment size (unsupported) */
+
+/*
+ * AFS volume synchronisation information
+ */
+struct afs_volsync {
+	time_t			creation;	/* volume creation time */
+};
+
+/*
+ * AFS volume status record
+ */
+struct afs_volume_status {
+	u32			vid;		/* volume ID */
+	u32			parent_id;	/* parent volume ID */
+	u8			online;		/* true if volume currently online and available */
+	u8			in_service;	/* true if volume currently in service */
+	u8			blessed;	/* same as in_service */
+	u8			needs_salvage;	/* true if consistency checking required */
+	u32			type;		/* volume type (afs_voltype_t) */
+	u32			min_quota;	/* minimum space set aside (blocks) */
+	u32			max_quota;	/* maximum space this volume may occupy (blocks) */
+	u32			blocks_in_use;	/* space this volume currently occupies (blocks) */
+	u32			part_blocks_avail; /* space available in volume's partition */
+	u32			part_max_blocks; /* size of volume's partition */
+};
+
+#define AFS_BLOCK_SIZE	1024
+
+#endif /* AFS_H */
diff --git a/kernel/fs/afs/afs_cm.h b/kernel/fs/afs/afs_cm.h
new file mode 100644
index 000000000..255f5dd60
--- /dev/null
+++ b/kernel/fs/afs/afs_cm.h
@@ -0,0 +1,33 @@
+/* AFS Cache Manager definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_CM_H
+#define AFS_CM_H
+
+#define AFS_CM_PORT		7001	/* AFS file server port */
+#define CM_SERVICE		1	/* AFS File Service ID */
+
+enum AFS_CM_Operations {
+	CBCallBack		= 204,	/* break callback promises */
+	CBInitCallBackState	= 205,	/* initialise callback state */
+	CBProbe			= 206,	/* probe client */
+	CBGetLock		= 207,	/* get contents of CM lock table */
+	CBGetCE			= 208,	/* get cache file description */
+	CBGetXStatsVersion	= 209,	/* get version of extended statistics */
+	CBGetXStats		= 210,	/* get contents of extended statistics data */
+	CBInitCallBackState3	= 213,	/* initialise callback state, version 3 */
+	CBProbeUuid		= 214,	/* check the client hasn't rebooted */
+	CBTellMeAboutYourself	= 65538, /* get client capabilities */
+};
+
+#define AFS_CAP_ERROR_TRANSLATION	0x1
+
+#endif /* AFS_FS_H */
diff --git a/kernel/fs/afs/afs_fs.h b/kernel/fs/afs/afs_fs.h
new file mode 100644
index 000000000..eb647323d
--- /dev/null
+++ b/kernel/fs/afs/afs_fs.h
@@ -0,0 +1,56 @@
+/* AFS File Service definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_FS_H
+#define AFS_FS_H
+
+#define AFS_FS_PORT		7000	/* AFS file server port */
+#define FS_SERVICE		1	/* AFS File Service ID */
+
+enum AFS_FS_Operations {
+	FSFETCHDATA		= 130,	/* AFS Fetch file data */
+	FSFETCHSTATUS		= 132,	/* AFS Fetch file status */
+	FSSTOREDATA		= 133,	/* AFS Store file data */
+	FSSTORESTATUS		= 135,	/* AFS Store file status */
+	FSREMOVEFILE		= 136,	/* AFS Remove a file */
+	FSCREATEFILE		= 137,	/* AFS Create a file */
+	FSRENAME		= 138,	/* AFS Rename or move a file or directory */
+	FSSYMLINK		= 139,	/* AFS Create a symbolic link */
+	FSLINK			= 140,	/* AFS Create a hard link */
+	FSMAKEDIR		= 141,	/* AFS Create a directory */
+	FSREMOVEDIR		= 142,	/* AFS Remove a directory */
+	FSGIVEUPCALLBACKS	= 147,	/* AFS Discard callback promises */
+	FSGETVOLUMEINFO		= 148,	/* AFS Get information about a volume */
+	FSGETVOLUMESTATUS	= 149,	/* AFS Get volume status information */
+	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
+	FSSETLOCK		= 156,	/* AFS Request a file lock */
+	FSEXTENDLOCK		= 157,	/* AFS Extend a file lock */
+	FSRELEASELOCK		= 158,	/* AFS Release a file lock */
+	FSLOOKUP		= 161,	/* AFS lookup file in directory */
+	FSFETCHDATA64		= 65537, /* AFS Fetch file data */
+	FSSTOREDATA64		= 65538, /* AFS Store file data */
+};
+
+enum AFS_FS_Errors {
+	VSALVAGE	= 101,	/* volume needs salvaging */
+	VNOVNODE	= 102,	/* no such file/dir (vnode) */
+	VNOVOL		= 103,	/* no such volume or volume unavailable */
+	VVOLEXISTS	= 104,	/* volume name already exists */
+	VNOSERVICE	= 105,	/* volume not currently in service */
+	VOFFLINE	= 106,	/* volume is currently offline (more info available [VVL-spec]) */
+	VONLINE		= 107,	/* volume is already online */
+	VDISKFULL	= 108,	/* disk partition is full */
+	VOVERQUOTA	= 109,	/* volume's maximum quota exceeded */
+	VBUSY		= 110,	/* volume is temporarily unavailable */
+	VMOVED		= 111,	/* volume moved to new server - ask this FS where */
+};
+
+#endif /* AFS_FS_H */
diff --git a/kernel/fs/afs/afs_vl.h b/kernel/fs/afs/afs_vl.h
new file mode 100644
index 000000000..800f607ff
--- /dev/null
+++ b/kernel/fs/afs/afs_vl.h
@@ -0,0 +1,84 @@
+/* AFS Volume Location Service client interface
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_VL_H
+#define AFS_VL_H
+
+#include "afs.h"
+
+#define AFS_VL_PORT		7003	/* volume location service port */
+#define VL_SERVICE		52	/* RxRPC service ID for the Volume Location service */
+
+enum AFSVL_Operations {
+	VLGETENTRYBYID		= 503,	/* AFS Get Cache Entry By ID operation ID */
+	VLGETENTRYBYNAME	= 504,	/* AFS Get Cache Entry By Name operation ID */
+	VLPROBE			= 514,	/* AFS Probe Volume Location Service operation ID */
+};
+
+enum AFSVL_Errors {
+	AFSVL_IDEXIST 		= 363520,	/* Volume Id entry exists in vl database */
+	AFSVL_IO 		= 363521,	/* I/O related error */
+	AFSVL_NAMEEXIST 	= 363522,	/* Volume name entry exists in vl database */
+	AFSVL_CREATEFAIL 	= 363523,	/* Internal creation failure */
+	AFSVL_NOENT 		= 363524,	/* No such entry */
+	AFSVL_EMPTY 		= 363525,	/* Vl database is empty */
+	AFSVL_ENTDELETED 	= 363526,	/* Entry is deleted (soft delete) */
+	AFSVL_BADNAME 		= 363527,	/* Volume name is illegal */
+	AFSVL_BADINDEX 		= 363528,	/* Index is out of range */
+	AFSVL_BADVOLTYPE 	= 363529,	/* Bad volume type */
+	AFSVL_BADSERVER 	= 363530,	/* Illegal server number (out of range) */
+	AFSVL_BADPARTITION 	= 363531,	/* Bad partition number */
+	AFSVL_REPSFULL 		= 363532,	/* Run out of space for Replication sites */
+	AFSVL_NOREPSERVER 	= 363533,	/* No such Replication server site exists */
+	AFSVL_DUPREPSERVER 	= 363534,	/* Replication site already exists */
+	AFSVL_RWNOTFOUND 	= 363535,	/* Parent R/W entry not found */
+	AFSVL_BADREFCOUNT 	= 363536,	/* Illegal Reference Count number */
+	AFSVL_SIZEEXCEEDED 	= 363537,	/* Vl size for attributes exceeded */
+	AFSVL_BADENTRY 		= 363538,	/* Bad incoming vl entry */
+	AFSVL_BADVOLIDBUMP 	= 363539,	/* Illegal max volid increment */
+	AFSVL_IDALREADYHASHED 	= 363540,	/* RO/BACK id already hashed */
+	AFSVL_ENTRYLOCKED 	= 363541,	/* Vl entry is already locked */
+	AFSVL_BADVOLOPER 	= 363542,	/* Bad volume operation code */
+	AFSVL_BADRELLOCKTYPE 	= 363543,	/* Bad release lock type */
+	AFSVL_RERELEASE 	= 363544,	/* Status report: last release was aborted */
+	AFSVL_BADSERVERFLAG 	= 363545,	/* Invalid replication site server flag */
+	AFSVL_PERM 		= 363546,	/* No permission access */
+	AFSVL_NOMEM 		= 363547,	/* malloc/realloc failed to alloc enough memory */
+};
+
+/*
+ * maps to "struct vldbentry" in vvl-spec.pdf
+ */
+struct afs_vldbentry {
+	char		name[65];		/* name of volume (with NUL char) */
+	afs_voltype_t	type;			/* volume type */
+	unsigned	num_servers;		/* num servers that hold instances of this vol */
+	unsigned	clone_id;		/* cloning ID */
+
+	unsigned	flags;
+#define AFS_VLF_RWEXISTS	0x1000		/* R/W volume exists */
+#define AFS_VLF_ROEXISTS	0x2000		/* R/O volume exists */
+#define AFS_VLF_BACKEXISTS	0x4000		/* backup volume exists */
+
+	afs_volid_t	volume_ids[3];		/* volume IDs */
+
+	struct {
+		struct in_addr	addr;		/* server address */
+		unsigned	partition;	/* partition ID on this server */
+		unsigned	flags;		/* server specific flags */
+#define AFS_VLSF_NEWREPSITE	0x0001	/* unused */
+#define AFS_VLSF_ROVOL		0x0002	/* this server holds a R/O instance of the volume */
+#define AFS_VLSF_RWVOL		0x0004	/* this server holds a R/W instance of the volume */
+#define AFS_VLSF_BACKVOL	0x0008	/* this server holds a backup instance of the volume */
+	} servers[8];
+};
+
+#endif /* AFS_VL_H */
diff --git a/kernel/fs/afs/cache.c b/kernel/fs/afs/cache.c
new file mode 100644
index 000000000..577763c3d
--- /dev/null
+++ b/kernel/fs/afs/cache.c
@@ -0,0 +1,402 @@
+/* AFS caching stuff
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+	.name			= "afs",
+	.version		= 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+	.name		= "AFS.cell",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_cell_cache_get_key,
+	.get_aux	= afs_cell_cache_get_aux,
+	.check_aux	= afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+	.name			= "AFS.vldb",
+	.type			= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key		= afs_vlocation_cache_get_key,
+	.get_aux		= afs_vlocation_cache_get_aux,
+	.check_aux		= afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+	.name		= "AFS.volume",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+	.name			= "AFS.vnode",
+	.type			= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key		= afs_vnode_cache_get_key,
+	.get_attr		= afs_vnode_cache_get_attr,
+	.get_aux		= afs_vnode_cache_get_aux,
+	.check_aux		= afs_vnode_cache_check_aux,
+	.now_uncached		= afs_vnode_cache_now_uncached,
+};
+
+/*
+ * set the key for the index entry
+ */
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
+{
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("%p,%p,%u", cell, buffer, bufmax);
+
+	klen = strlen(cell->name);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, cell->name, klen);
+	return klen;
+}
+
+/*
+ * provide new auxiliary cache data
+ */
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
+{
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("%p,%p,%u", cell, buffer, bufmax);
+
+	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+	dlen = min(dlen, bufmax);
+	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
+
+	memcpy(buffer, cell->vl_addrs, dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxiliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen)
+{
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*****************************************************************************/
+/*
+ * set the key for the index entry
+ */
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, vlocation->vldb.name, klen);
+
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide new auxiliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxiliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+						    const void *buffer,
+						    uint16_t buflen)
+{
+	const struct afs_cache_vlocation *cvldb;
+	struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen != buflen)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+	/* if what's on disk is more valid than what's in memory, then use the
+	 * VL record from the cache */
+	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+		vlocation->valid = 1;
+		_leave(" = SUCCESS [c->m]");
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	/* need to update the cache if the cached info differs */
+	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+		/* delete if the volume IDs for this name differ */
+		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+			   sizeof(cvldb->vid)) != 0
+		    ) {
+			_leave(" = OBSOLETE");
+			return FSCACHE_CHECKAUX_OBSOLETE;
+		}
+
+		_leave(" = UPDATE");
+		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
+	}
+
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*****************************************************************************/
+/*
+ * set the key for the volume index entry
+ */
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_volume *volume = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+	klen = sizeof(volume->type);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &volume->type, sizeof(volume->type));
+
+	_leave(" = %u", klen);
+	return klen;
+
+}
+
+/*****************************************************************************/
+/*
+ * set the key for the index entry
+ */
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	klen = sizeof(vnode->fid.vnode);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
+
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide updated file attributes
+ */
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+
+	_enter("{%x,%x,%llx},",
+	       vnode->fid.vnode, vnode->fid.unique,
+	       vnode->status.data_version);
+
+	*size = vnode->status.size;
+}
+
+/*
+ * provide new auxiliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%Lx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+	buffer += sizeof(vnode->fid.unique);
+	memcpy(buffer, &vnode->status.data_version,
+	       sizeof(vnode->status.data_version));
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxiliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen)
+{
+	struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen != buflen) {
+		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer,
+		   &vnode->fid.unique,
+		   sizeof(vnode->fid.unique)
+		   ) != 0) {
+		unsigned unique;
+
+		memcpy(&unique, buffer, sizeof(unique));
+
+		_leave(" = OBSOLETE [uniq %x != %x]",
+		       unique, vnode->fid.unique);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer + sizeof(vnode->fid.unique),
+		   &vnode->status.data_version,
+		   sizeof(vnode->status.data_version)
+		   ) != 0) {
+		afs_dataversion_t version;
+
+		memcpy(&version, buffer + sizeof(vnode->fid.unique),
+		       sizeof(version));
+
+		_leave(" = OBSOLETE [vers %llx != %llx]",
+		       version, vnode->status.data_version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	_leave(" = SUCCESS");
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
+ */
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
+{
+	struct afs_vnode *vnode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	_enter("{%x,%x,%Lx}",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		/* grab a bunch of pages to clean */
+		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	_leave("");
+}
diff --git a/kernel/fs/afs/callback.c b/kernel/fs/afs/callback.c
new file mode 100644
index 000000000..7ef637d7f
--- /dev/null
+++ b/kernel/fs/afs/callback.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
+ *
+ * This software may be freely redistributed under the terms of the
+ * GNU General Public License.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *          David Howells <dhowells@redhat.com>
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/circ_buf.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+#if 0
+unsigned afs_vnode_update_timeout = 10;
+#endif  /*  0  */
+
+#define afs_breakring_space(server) \
+	CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail,	\
+		   ARRAY_SIZE((server)->cb_break))
+
+//static void afs_callback_updater(struct work_struct *);
+
+static struct workqueue_struct *afs_callback_update_worker;
+
+/*
+ * allow the fileserver to request callback state (re-)initialisation
+ */
+void afs_init_callback_state(struct afs_server *server)
+{
+	struct afs_vnode *vnode;
+
+	_enter("{%p}", server);
+
+	spin_lock(&server->cb_lock);
+
+	/* kill all the promises on record from this server */
+	while (!RB_EMPTY_ROOT(&server->cb_promises)) {
+		vnode = rb_entry(server->cb_promises.rb_node,
+				 struct afs_vnode, cb_promise);
+		_debug("UNPROMISE { vid=%x:%u uq=%u}",
+		       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
+
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * handle the data invalidation side of a callback being broken
+ */
+void afs_broken_callback_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode =
+		container_of(work, struct afs_vnode, cb_broken_work);
+
+	_enter("");
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		return;
+
+	/* we're only interested in dealing with a broken callback on *this*
+	 * vnode and only if no-one else has dealt with it yet */
+	if (!mutex_trylock(&vnode->validate_lock))
+		return; /* someone else is dealing with it */
+
+	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		if (S_ISDIR(vnode->vfs_inode.i_mode))
+			afs_clear_permits(vnode);
+
+		if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0)
+			goto out;
+
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+			goto out;
+
+		/* if the vnode's data version number changed then its contents
+		 * are different */
+		if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+			afs_zap_data(vnode);
+	}
+
+out:
+	mutex_unlock(&vnode->validate_lock);
+
+	/* avoid the potential race whereby the mutex_trylock() in this
+	 * function happens again between the clear_bit() and the
+	 * mutex_unlock() */
+	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		_debug("requeue");
+		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+	}
+	_leave("");
+}
+
+/*
+ * actually break a callback
+ */
+static void afs_break_callback(struct afs_server *server,
+			       struct afs_vnode *vnode)
+{
+	_enter("");
+
+	set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+
+	if (vnode->cb_promised) {
+		spin_lock(&vnode->lock);
+
+		_debug("break callback");
+
+		spin_lock(&server->cb_lock);
+		if (vnode->cb_promised) {
+			rb_erase(&vnode->cb_promise, &server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&server->cb_lock);
+
+		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+		if (list_empty(&vnode->granted_locks) &&
+		    !list_empty(&vnode->pending_locks))
+			afs_lock_may_be_available(vnode);
+		spin_unlock(&vnode->lock);
+	}
+}
+
+/*
+ * allow the fileserver to explicitly break one callback
+ * - happens when
+ *   - the backing file is changed
+ *   - a lock is released
+ */
+static void afs_break_one_callback(struct afs_server *server,
+				   struct afs_fid *fid)
+{
+	struct afs_vnode *vnode;
+	struct rb_node *p;
+
+	_debug("find");
+	spin_lock(&server->fs_lock);
+	p = server->fs_vnodes.rb_node;
+	while (p) {
+		vnode = rb_entry(p, struct afs_vnode, server_rb);
+		if (fid->vid < vnode->fid.vid)
+			p = p->rb_left;
+		else if (fid->vid > vnode->fid.vid)
+			p = p->rb_right;
+		else if (fid->vnode < vnode->fid.vnode)
+			p = p->rb_left;
+		else if (fid->vnode > vnode->fid.vnode)
+			p = p->rb_right;
+		else if (fid->unique < vnode->fid.unique)
+			p = p->rb_left;
+		else if (fid->unique > vnode->fid.unique)
+			p = p->rb_right;
+		else
+			goto found;
+	}
+
+	/* not found so we just ignore it (it may have moved to another
+	 * server) */
+not_available:
+	_debug("not avail");
+	spin_unlock(&server->fs_lock);
+	_leave("");
+	return;
+
+found:
+	_debug("found");
+	ASSERTCMP(server, ==, vnode->server);
+
+	if (!igrab(AFS_VNODE_TO_I(vnode)))
+		goto not_available;
+	spin_unlock(&server->fs_lock);
+
+	afs_break_callback(server, vnode);
+	iput(&vnode->vfs_inode);
+	_leave("");
+}
+
+/*
+ * allow the fileserver to break callback promises
+ */
+void afs_break_callbacks(struct afs_server *server, size_t count,
+			 struct afs_callback callbacks[])
+{
+	_enter("%p,%zu,", server, count);
+
+	ASSERT(server != NULL);
+	ASSERTCMP(count, <=, AFSCBMAX);
+
+	for (; count > 0; callbacks++, count--) {
+		_debug("- Fid { vl=%08x n=%u u=%u }  CB { v=%u x=%u t=%u }",
+		       callbacks->fid.vid,
+		       callbacks->fid.vnode,
+		       callbacks->fid.unique,
+		       callbacks->version,
+		       callbacks->expiry,
+		       callbacks->type
+		       );
+		afs_break_one_callback(server, &callbacks->fid);
+	}
+
+	_leave("");
+	return;
+}
+
+/*
+ * record the callback for breaking
+ * - the caller must hold server->cb_lock
+ */
+static void afs_do_give_up_callback(struct afs_server *server,
+				    struct afs_vnode *vnode)
+{
+	struct afs_callback *cb;
+
+	_enter("%p,%p", server, vnode);
+
+	cb = &server->cb_break[server->cb_break_head];
+	cb->fid		= vnode->fid;
+	cb->version	= vnode->cb_version;
+	cb->expiry	= vnode->cb_expiry;
+	cb->type	= vnode->cb_type;
+	smp_wmb();
+	server->cb_break_head =
+		(server->cb_break_head + 1) &
+		(ARRAY_SIZE(server->cb_break) - 1);
+
+	/* defer the breaking of callbacks to try and collect as many as
+	 * possible to ship in one operation */
+	switch (atomic_inc_return(&server->cb_break_n)) {
+	case 1 ... AFSCBMAX - 1:
+		queue_delayed_work(afs_callback_update_worker,
+				   &server->cb_break_work, HZ * 2);
+		break;
+	case AFSCBMAX:
+		afs_flush_callback_breaks(server);
+		break;
+	default:
+		break;
+	}
+
+	ASSERT(server->cb_promises.rb_node != NULL);
+	rb_erase(&vnode->cb_promise, &server->cb_promises);
+	vnode->cb_promised = false;
+	_leave("");
+}
+
+/*
+ * discard the callback on a deleted item
+ */
+void afs_discard_callback_on_delete(struct afs_vnode *vnode)
+{
+	struct afs_server *server = vnode->server;
+
+	_enter("%d", vnode->cb_promised);
+
+	if (!vnode->cb_promised) {
+		_leave(" [not promised]");
+		return;
+	}
+
+	ASSERT(server != NULL);
+
+	spin_lock(&server->cb_lock);
+	if (vnode->cb_promised) {
+		ASSERT(server->cb_promises.rb_node != NULL);
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * give up the callback registered for a vnode on the file server when the
+ * inode is being cleared
+ */
+void afs_give_up_callback(struct afs_vnode *vnode)
+{
+	struct afs_server *server = vnode->server;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("%d", vnode->cb_promised);
+
+	_debug("GIVE UP INODE %p", &vnode->vfs_inode);
+
+	if (!vnode->cb_promised) {
+		_leave(" [not promised]");
+		return;
+	}
+
+	ASSERT(server != NULL);
+
+	spin_lock(&server->cb_lock);
+	if (vnode->cb_promised && afs_breakring_space(server) == 0) {
+		add_wait_queue(&server->cb_break_waitq, &myself);
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!vnode->cb_promised ||
+			    afs_breakring_space(server) != 0)
+				break;
+			spin_unlock(&server->cb_lock);
+			schedule();
+			spin_lock(&server->cb_lock);
+		}
+		remove_wait_queue(&server->cb_break_waitq, &myself);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	/* of course, it's always possible for the server to break this vnode's
+	 * callback first... */
+	if (vnode->cb_promised)
+		afs_do_give_up_callback(server, vnode);
+
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * dispatch a deferred give up callbacks operation
+ */
+void afs_dispatch_give_up_callbacks(struct work_struct *work)
+{
+	struct afs_server *server =
+		container_of(work, struct afs_server, cb_break_work.work);
+
+	_enter("");
+
+	/* tell the fileserver to discard the callback promises it has
+	 * - in the event of ENOMEM or some other error, we just forget that we
+	 *   had callbacks entirely, and the server will call us later to break
+	 *   them
+	 */
+	afs_fs_give_up_callbacks(server, &afs_async_call);
+}
+
+/*
+ * flush the outstanding callback breaks on a server
+ */
+void afs_flush_callback_breaks(struct afs_server *server)
+{
+	mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0);
+}
+
+#if 0
+/*
+ * update a bunch of callbacks
+ */
+static void afs_callback_updater(struct work_struct *work)
+{
+	struct afs_server *server;
+	struct afs_vnode *vnode, *xvnode;
+	time_t now;
+	long timeout;
+	int ret;
+
+	server = container_of(work, struct afs_server, updater);
+
+	_enter("");
+
+	now = get_seconds();
+
+	/* find the first vnode to update */
+	spin_lock(&server->cb_lock);
+	for (;;) {
+		if (RB_EMPTY_ROOT(&server->cb_promises)) {
+			spin_unlock(&server->cb_lock);
+			_leave(" [nothing]");
+			return;
+		}
+
+		vnode = rb_entry(rb_first(&server->cb_promises),
+				 struct afs_vnode, cb_promise);
+		if (atomic_read(&vnode->usage) > 0)
+			break;
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
+
+	timeout = vnode->update_at - now;
+	if (timeout > 0) {
+		queue_delayed_work(afs_vnode_update_worker,
+				   &afs_vnode_update, timeout * HZ);
+		spin_unlock(&server->cb_lock);
+		_leave(" [nothing]");
+		return;
+	}
+
+	list_del_init(&vnode->update);
+	atomic_inc(&vnode->usage);
+	spin_unlock(&server->cb_lock);
+
+	/* we can now perform the update */
+	_debug("update %s", vnode->vldb.name);
+	vnode->state = AFS_VL_UPDATING;
+	vnode->upd_rej_cnt = 0;
+	vnode->upd_busy_cnt = 0;
+
+	ret = afs_vnode_update_record(vl, &vldb);
+	switch (ret) {
+	case 0:
+		afs_vnode_apply_update(vl, &vldb);
+		vnode->state = AFS_VL_UPDATING;
+		break;
+	case -ENOMEDIUM:
+		vnode->state = AFS_VL_VOLUME_DELETED;
+		break;
+	default:
+		vnode->state = AFS_VL_UNCERTAIN;
+		break;
+	}
+
+	/* and then reschedule */
+	_debug("reschedule");
+	vnode->update_at = get_seconds() + afs_vnode_update_timeout;
+
+	spin_lock(&server->cb_lock);
+
+	if (!list_empty(&server->cb_promises)) {
+		/* next update in 10 minutes, but wait at least 1 second more
+		 * than the newest record already queued so that we don't spam
+		 * the VL server suddenly with lots of requests
+		 */
+		xvnode = list_entry(server->cb_promises.prev,
+				    struct afs_vnode, update);
+		if (vnode->update_at <= xvnode->update_at)
+			vnode->update_at = xvnode->update_at + 1;
+		xvnode = list_entry(server->cb_promises.next,
+				    struct afs_vnode, update);
+		timeout = xvnode->update_at - now;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		timeout = afs_vnode_update_timeout;
+	}
+
+	list_add_tail(&vnode->update, &server->cb_promises);
+
+	_debug("timeout %ld", timeout);
+	queue_delayed_work(afs_vnode_update_worker,
+			   &afs_vnode_update, timeout * HZ);
+	spin_unlock(&server->cb_lock);
+	afs_put_vnode(vl);
+}
+#endif
+
+/*
+ * initialise the callback update process
+ */
+int __init afs_callback_update_init(void)
+{
+	afs_callback_update_worker =
+		create_singlethread_workqueue("kafs_callbackd");
+	return afs_callback_update_worker ? 0 : -ENOMEM;
+}
+
+/*
+ * shut down the callback update process
+ */
+void afs_callback_update_kill(void)
+{
+	destroy_workqueue(afs_callback_update_worker);
+}
diff --git a/kernel/fs/afs/cell.c b/kernel/fs/afs/cell.c
new file mode 100644
index 000000000..ca0a3cf93
--- /dev/null
+++ b/kernel/fs/afs/cell.c
@@ -0,0 +1,460 @@
+/* AFS cell and server record management
+ *
+ * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/key.h>
+#include <linux/ctype.h>
+#include <linux/dns_resolver.h>
+#include <linux/sched.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+
+DECLARE_RWSEM(afs_proc_cells_sem);
+LIST_HEAD(afs_proc_cells);
+
+static LIST_HEAD(afs_cells);
+static DEFINE_RWLOCK(afs_cells_lock);
+static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */
+static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq);
+static struct afs_cell *afs_cell_root;
+
+/*
+ * allocate a cell record and fill in its name, VL server address list and
+ * allocate an anonymous key
+ */
+static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
+				       char *vllist)
+{
+	struct afs_cell *cell;
+	struct key *key;
+	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
+	char  *dvllist = NULL, *_vllist = NULL;
+	char  delimiter = ':';
+	int ret;
+
+	_enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
+
+	BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
+
+	if (namelen > AFS_MAXCELLNAME) {
+		_leave(" = -ENAMETOOLONG");
+		return ERR_PTR(-ENAMETOOLONG);
+	}
+
+	/* allocate and initialise a cell record */
+	cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
+	if (!cell) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	memcpy(cell->name, name, namelen);
+	cell->name[namelen] = 0;
+
+	atomic_set(&cell->usage, 1);
+	INIT_LIST_HEAD(&cell->link);
+	rwlock_init(&cell->servers_lock);
+	INIT_LIST_HEAD(&cell->servers);
+	init_rwsem(&cell->vl_sem);
+	INIT_LIST_HEAD(&cell->vl_list);
+	spin_lock_init(&cell->vl_lock);
+
+	/* if the ip address is invalid, try dns query */
+	if (!vllist || strlen(vllist) < 7) {
+		ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
+		if (ret < 0) {
+			if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
+				/* translate these errors into something
+				 * userspace might understand */
+				ret = -EDESTADDRREQ;
+			_leave(" = %d", ret);
+			return ERR_PTR(ret);
+		}
+		_vllist = dvllist;
+
+		/* change the delimiter for user-space reply */
+		delimiter = ',';
+
+	} else {
+		_vllist = vllist;
+	}
+
+	/* fill in the VL server list from the rest of the string */
+	do {
+		unsigned a, b, c, d;
+
+		next = strchr(_vllist, delimiter);
+		if (next)
+			*next++ = 0;
+
+		if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
+			goto bad_address;
+
+		if (a > 255 || b > 255 || c > 255 || d > 255)
+			goto bad_address;
+
+		cell->vl_addrs[cell->vl_naddrs++].s_addr =
+			htonl((a << 24) | (b << 16) | (c << 8) | d);
+
+	} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next));
+
+	/* create a key to represent an anonymous user */
+	memcpy(keyname, "afs@", 4);
+	dp = keyname + 4;
+	cp = cell->name;
+	do {
+		*dp++ = toupper(*cp);
+	} while (*cp++);
+
+	key = rxrpc_get_null_key(keyname);
+	if (IS_ERR(key)) {
+		_debug("no key");
+		ret = PTR_ERR(key);
+		goto error;
+	}
+	cell->anonymous_key = key;
+
+	_debug("anon key %p{%x}",
+	       cell->anonymous_key, key_serial(cell->anonymous_key));
+
+	_leave(" = %p", cell);
+	return cell;
+
+bad_address:
+	printk(KERN_ERR "kAFS: bad VL server IP address\n");
+	ret = -EINVAL;
+error:
+	key_put(cell->anonymous_key);
+	kfree(dvllist);
+	kfree(cell);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * afs_cell_crate() - create a cell record
+ * @name:	is the name of the cell.
+ * @namsesz:	is the strlen of the cell name.
+ * @vllist:	is a colon separated list of IP addresses in "a.b.c.d" format.
+ * @retref:	is T to return the cell reference when the cell exists.
+ */
+struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
+				 char *vllist, bool retref)
+{
+	struct afs_cell *cell;
+	int ret;
+
+	_enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
+
+	down_write(&afs_cells_sem);
+	read_lock(&afs_cells_lock);
+	list_for_each_entry(cell, &afs_cells, link) {
+		if (strncasecmp(cell->name, name, namesz) == 0)
+			goto duplicate_name;
+	}
+	read_unlock(&afs_cells_lock);
+
+	cell = afs_cell_alloc(name, namesz, vllist);
+	if (IS_ERR(cell)) {
+		_leave(" = %ld", PTR_ERR(cell));
+		up_write(&afs_cells_sem);
+		return cell;
+	}
+
+	/* add a proc directory for this cell */
+	ret = afs_proc_cell_setup(cell);
+	if (ret < 0)
+		goto error;
+
+#ifdef CONFIG_AFS_FSCACHE
+	/* put it up for caching (this never returns an error) */
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell, true);
+#endif
+
+	/* add to the cell lists */
+	write_lock(&afs_cells_lock);
+	list_add_tail(&cell->link, &afs_cells);
+	write_unlock(&afs_cells_lock);
+
+	down_write(&afs_proc_cells_sem);
+	list_add_tail(&cell->proc_link, &afs_proc_cells);
+	up_write(&afs_proc_cells_sem);
+	up_write(&afs_cells_sem);
+
+	_leave(" = %p", cell);
+	return cell;
+
+error:
+	up_write(&afs_cells_sem);
+	key_put(cell->anonymous_key);
+	kfree(cell);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+
+duplicate_name:
+	if (retref && !IS_ERR(cell))
+		afs_get_cell(cell);
+
+	read_unlock(&afs_cells_lock);
+	up_write(&afs_cells_sem);
+
+	if (retref) {
+		_leave(" = %p", cell);
+		return cell;
+	}
+
+	_leave(" = -EEXIST");
+	return ERR_PTR(-EEXIST);
+}
+
+/*
+ * set the root cell information
+ * - can be called with a module parameter string
+ * - can be called from a write to /proc/fs/afs/rootcell
+ */
+int afs_cell_init(char *rootcell)
+{
+	struct afs_cell *old_root, *new_root;
+	char *cp;
+
+	_enter("");
+
+	if (!rootcell) {
+		/* module is loaded with no parameters, or built statically.
+		 * - in the future we might initialize cell DB here.
+		 */
+		_leave(" = 0 [no root]");
+		return 0;
+	}
+
+	cp = strchr(rootcell, ':');
+	if (!cp)
+		_debug("kAFS: no VL server IP addresses specified");
+	else
+		*cp++ = 0;
+
+	/* allocate a cell record for the root cell */
+	new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
+	if (IS_ERR(new_root)) {
+		_leave(" = %ld", PTR_ERR(new_root));
+		return PTR_ERR(new_root);
+	}
+
+	/* install the new cell */
+	write_lock(&afs_cells_lock);
+	old_root = afs_cell_root;
+	afs_cell_root = new_root;
+	write_unlock(&afs_cells_lock);
+	afs_put_cell(old_root);
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * lookup a cell record
+ */
+struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
+				 bool dns_cell)
+{
+	struct afs_cell *cell;
+
+	_enter("\"%*.*s\",", namesz, namesz, name ?: "");
+
+	down_read(&afs_cells_sem);
+	read_lock(&afs_cells_lock);
+
+	if (name) {
+		/* if the cell was named, look for it in the cell record list */
+		list_for_each_entry(cell, &afs_cells, link) {
+			if (strncmp(cell->name, name, namesz) == 0) {
+				afs_get_cell(cell);
+				goto found;
+			}
+		}
+		cell = ERR_PTR(-ENOENT);
+		if (dns_cell)
+			goto create_cell;
+	found:
+		;
+	} else {
+		cell = afs_cell_root;
+		if (!cell) {
+			/* this should not happen unless user tries to mount
+			 * when root cell is not set. Return an impossibly
+			 * bizarre errno to alert the user. Things like
+			 * ENOENT might be "more appropriate" but they happen
+			 * for other reasons.
+			 */
+			cell = ERR_PTR(-EDESTADDRREQ);
+		} else {
+			afs_get_cell(cell);
+		}
+
+	}
+
+	read_unlock(&afs_cells_lock);
+	up_read(&afs_cells_sem);
+	_leave(" = %p", cell);
+	return cell;
+
+create_cell:
+	read_unlock(&afs_cells_lock);
+	up_read(&afs_cells_sem);
+
+	cell = afs_cell_create(name, namesz, NULL, true);
+
+	_leave(" = %p", cell);
+	return cell;
+}
+
+#if 0
+/*
+ * try and get a cell record
+ */
+struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell)
+{
+	write_lock(&afs_cells_lock);
+
+	if (cell && !list_empty(&cell->link))
+		afs_get_cell(cell);
+	else
+		cell = NULL;
+
+	write_unlock(&afs_cells_lock);
+	return cell;
+}
+#endif  /*  0  */
+
+/*
+ * destroy a cell record
+ */
+void afs_put_cell(struct afs_cell *cell)
+{
+	if (!cell)
+		return;
+
+	_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
+
+	ASSERTCMP(atomic_read(&cell->usage), >, 0);
+
+	/* to prevent a race, the decrement and the dequeue must be effectively
+	 * atomic */
+	write_lock(&afs_cells_lock);
+
+	if (likely(!atomic_dec_and_test(&cell->usage))) {
+		write_unlock(&afs_cells_lock);
+		_leave("");
+		return;
+	}
+
+	ASSERT(list_empty(&cell->servers));
+	ASSERT(list_empty(&cell->vl_list));
+
+	write_unlock(&afs_cells_lock);
+
+	wake_up(&afs_cells_freeable_wq);
+
+	_leave(" [unused]");
+}
+
+/*
+ * destroy a cell record
+ * - must be called with the afs_cells_sem write-locked
+ * - cell->link should have been broken by the caller
+ */
+static void afs_cell_destroy(struct afs_cell *cell)
+{
+	_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
+
+	ASSERTCMP(atomic_read(&cell->usage), >=, 0);
+	ASSERT(list_empty(&cell->link));
+
+	/* wait for everyone to stop using the cell */
+	if (atomic_read(&cell->usage) > 0) {
+		DECLARE_WAITQUEUE(myself, current);
+
+		_debug("wait for cell %s", cell->name);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		add_wait_queue(&afs_cells_freeable_wq, &myself);
+
+		while (atomic_read(&cell->usage) > 0) {
+			schedule();
+			set_current_state(TASK_UNINTERRUPTIBLE);
+		}
+
+		remove_wait_queue(&afs_cells_freeable_wq, &myself);
+		set_current_state(TASK_RUNNING);
+	}
+
+	_debug("cell dead");
+	ASSERTCMP(atomic_read(&cell->usage), ==, 0);
+	ASSERT(list_empty(&cell->servers));
+	ASSERT(list_empty(&cell->vl_list));
+
+	afs_proc_cell_remove(cell);
+
+	down_write(&afs_proc_cells_sem);
+	list_del_init(&cell->proc_link);
+	up_write(&afs_proc_cells_sem);
+
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(cell->cache, 0);
+#endif
+	key_put(cell->anonymous_key);
+	kfree(cell);
+
+	_leave(" [destroyed]");
+}
+
+/*
+ * purge in-memory cell database on module unload or afs_init() failure
+ * - the timeout daemon is stopped before calling this
+ */
+void afs_cell_purge(void)
+{
+	struct afs_cell *cell;
+
+	_enter("");
+
+	afs_put_cell(afs_cell_root);
+
+	down_write(&afs_cells_sem);
+
+	while (!list_empty(&afs_cells)) {
+		cell = NULL;
+
+		/* remove the next cell from the front of the list */
+		write_lock(&afs_cells_lock);
+
+		if (!list_empty(&afs_cells)) {
+			cell = list_entry(afs_cells.next,
+					  struct afs_cell, link);
+			list_del_init(&cell->link);
+		}
+
+		write_unlock(&afs_cells_lock);
+
+		if (cell) {
+			_debug("PURGING CELL %s (%d)",
+			       cell->name, atomic_read(&cell->usage));
+
+			/* now the cell should be left with no references */
+			afs_cell_destroy(cell);
+		}
+	}
+
+	up_write(&afs_cells_sem);
+	_leave("");
+}
diff --git a/kernel/fs/afs/cmservice.c b/kernel/fs/afs/cmservice.c
new file mode 100644
index 000000000..4b0eff6da
--- /dev/null
+++ b/kernel/fs/afs/cmservice.c
@@ -0,0 +1,604 @@
+/* AFS Cache Manager Service
+ *
+ * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/ip.h>
+#include "internal.h"
+#include "afs_cm.h"
+
+#if 0
+struct workqueue_struct *afs_cm_workqueue;
+#endif  /*  0  */
+
+static int afs_deliver_cb_init_call_back_state(struct afs_call *,
+					       struct sk_buff *, bool);
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *,
+						struct sk_buff *, bool);
+static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool);
+static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool);
+static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool);
+static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *,
+						 struct sk_buff *, bool);
+static void afs_cm_destructor(struct afs_call *);
+
+/*
+ * CB.CallBack operation type
+ */
+static const struct afs_call_type afs_SRXCBCallBack = {
+	.name		= "CB.CallBack",
+	.deliver	= afs_deliver_cb_callback,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
+
+/*
+ * CB.InitCallBackState operation type
+ */
+static const struct afs_call_type afs_SRXCBInitCallBackState = {
+	.name		= "CB.InitCallBackState",
+	.deliver	= afs_deliver_cb_init_call_back_state,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
+
+/*
+ * CB.InitCallBackState3 operation type
+ */
+static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
+	.name		= "CB.InitCallBackState3",
+	.deliver	= afs_deliver_cb_init_call_back_state3,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
+
+/*
+ * CB.Probe operation type
+ */
+static const struct afs_call_type afs_SRXCBProbe = {
+	.name		= "CB.Probe",
+	.deliver	= afs_deliver_cb_probe,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
+
+/*
+ * CB.ProbeUuid operation type
+ */
+static const struct afs_call_type afs_SRXCBProbeUuid = {
+	.name		= "CB.ProbeUuid",
+	.deliver	= afs_deliver_cb_probe_uuid,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
+
+/*
+ * CB.TellMeAboutYourself operation type
+ */
+static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
+	.name		= "CB.TellMeAboutYourself",
+	.deliver	= afs_deliver_cb_tell_me_about_yourself,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
+
+/*
+ * route an incoming cache manager call
+ * - return T if supported, F if not
+ */
+bool afs_cm_incoming_call(struct afs_call *call)
+{
+	u32 operation_id = ntohl(call->operation_ID);
+
+	_enter("{CB.OP %u}", operation_id);
+
+	switch (operation_id) {
+	case CBCallBack:
+		call->type = &afs_SRXCBCallBack;
+		return true;
+	case CBInitCallBackState:
+		call->type = &afs_SRXCBInitCallBackState;
+		return true;
+	case CBInitCallBackState3:
+		call->type = &afs_SRXCBInitCallBackState3;
+		return true;
+	case CBProbe:
+		call->type = &afs_SRXCBProbe;
+		return true;
+	case CBTellMeAboutYourself:
+		call->type = &afs_SRXCBTellMeAboutYourself;
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * clean up a cache manager call
+ */
+static void afs_cm_destructor(struct afs_call *call)
+{
+	_enter("");
+
+	/* Break the callbacks here so that we do it after the final ACK is
+	 * received.  The step number here must match the final number in
+	 * afs_deliver_cb_callback().
+	 */
+	if (call->unmarshall == 6) {
+		ASSERT(call->server && call->count && call->request);
+		afs_break_callbacks(call->server, call->count, call->request);
+	}
+
+	afs_put_server(call->server);
+	call->server = NULL;
+	kfree(call->buffer);
+	call->buffer = NULL;
+}
+
+/*
+ * allow the fileserver to see if the cache manager is still alive
+ */
+static void SRXAFSCB_CallBack(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, work);
+
+	_enter("");
+
+	/* be sure to send the reply *before* attempting to spam the AFS server
+	 * with FSFetchStatus requests on the vnodes with broken callbacks lest
+	 * the AFS server get into a vicious cycle of trying to break further
+	 * callbacks because it hadn't received completion of the CBCallBack op
+	 * yet */
+	afs_send_empty_reply(call);
+
+	afs_break_callbacks(call->server, call->count, call->request);
+	_leave("");
+}
+
+/*
+ * deliver request data to a CB.CallBack call
+ */
+static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
+				   bool last)
+{
+	struct afs_callback *cb;
+	struct afs_server *server;
+	struct in_addr addr;
+	__be32 *bp;
+	u32 tmp;
+	int ret, loop;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the FID array and its count in two steps */
+	case 1:
+		_debug("extract FID count");
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("FID count: %u", call->count);
+		if (call->count > AFSCBMAX)
+			return -EBADMSG;
+
+		call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL);
+		if (!call->buffer)
+			return -ENOMEM;
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 2:
+		_debug("extract FID array");
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count * 3 * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		_debug("unmarshall FID array");
+		call->request = kcalloc(call->count,
+					sizeof(struct afs_callback),
+					GFP_KERNEL);
+		if (!call->request)
+			return -ENOMEM;
+
+		cb = call->request;
+		bp = call->buffer;
+		for (loop = call->count; loop > 0; loop--, cb++) {
+			cb->fid.vid	= ntohl(*bp++);
+			cb->fid.vnode	= ntohl(*bp++);
+			cb->fid.unique	= ntohl(*bp++);
+			cb->type	= AFSCM_CB_UNTYPED;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the callback array and its count in two steps */
+	case 3:
+		_debug("extract CB count");
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		tmp = ntohl(call->tmp);
+		_debug("CB count: %u", tmp);
+		if (tmp != call->count && tmp != 0)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+		if (tmp == 0)
+			goto empty_cb_array;
+
+	case 4:
+		_debug("extract CB array");
+		ret = afs_extract_data(call, skb, last, call->request,
+				       call->count * 3 * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		_debug("unmarshall CB array");
+		cb = call->request;
+		bp = call->buffer;
+		for (loop = call->count; loop > 0; loop--, cb++) {
+			cb->version	= ntohl(*bp++);
+			cb->expiry	= ntohl(*bp++);
+			cb->type	= ntohl(*bp++);
+		}
+
+	empty_cb_array:
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 5:
+		_debug("trailer");
+		if (skb->len != 0)
+			return -EBADMSG;
+
+		/* Record that the message was unmarshalled successfully so
+		 * that the call destructor can know do the callback breaking
+		 * work, even if the final ACK isn't received.
+		 *
+		 * If the step number changes, then afs_cm_destructor() must be
+		 * updated also.
+		 */
+		call->unmarshall++;
+	case 6:
+		break;
+	}
+
+	if (!last)
+		return 0;
+
+	call->state = AFS_CALL_REPLYING;
+
+	/* we'll need the file server record as that tells us which set of
+	 * vnodes to operate upon */
+	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+	server = afs_find_server(&addr);
+	if (!server)
+		return -ENOTCONN;
+	call->server = server;
+
+	INIT_WORK(&call->work, SRXAFSCB_CallBack);
+	queue_work(afs_wq, &call->work);
+	return 0;
+}
+
+/*
+ * allow the fileserver to request callback state (re-)initialisation
+ */
+static void SRXAFSCB_InitCallBackState(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, work);
+
+	_enter("{%p}", call->server);
+
+	afs_init_callback_state(call->server);
+	afs_send_empty_reply(call);
+	_leave("");
+}
+
+/*
+ * deliver request data to a CB.InitCallBackState call
+ */
+static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
+					       struct sk_buff *skb,
+					       bool last)
+{
+	struct afs_server *server;
+	struct in_addr addr;
+
+	_enter(",{%u},%d", skb->len, last);
+
+	if (skb->len > 0)
+		return -EBADMSG;
+	if (!last)
+		return 0;
+
+	/* no unmarshalling required */
+	call->state = AFS_CALL_REPLYING;
+
+	/* we'll need the file server record as that tells us which set of
+	 * vnodes to operate upon */
+	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+	server = afs_find_server(&addr);
+	if (!server)
+		return -ENOTCONN;
+	call->server = server;
+
+	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
+	queue_work(afs_wq, &call->work);
+	return 0;
+}
+
+/*
+ * deliver request data to a CB.InitCallBackState3 call
+ */
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
+						struct sk_buff *skb,
+						bool last)
+{
+	struct afs_server *server;
+	struct in_addr addr;
+
+	_enter(",{%u},%d", skb->len, last);
+
+	if (!last)
+		return 0;
+
+	/* no unmarshalling required */
+	call->state = AFS_CALL_REPLYING;
+
+	/* we'll need the file server record as that tells us which set of
+	 * vnodes to operate upon */
+	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+	server = afs_find_server(&addr);
+	if (!server)
+		return -ENOTCONN;
+	call->server = server;
+
+	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
+	queue_work(afs_wq, &call->work);
+	return 0;
+}
+
+/*
+ * allow the fileserver to see if the cache manager is still alive
+ */
+static void SRXAFSCB_Probe(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, work);
+
+	_enter("");
+	afs_send_empty_reply(call);
+	_leave("");
+}
+
+/*
+ * deliver request data to a CB.Probe call
+ */
+static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
+				bool last)
+{
+	_enter(",{%u},%d", skb->len, last);
+
+	if (skb->len > 0)
+		return -EBADMSG;
+	if (!last)
+		return 0;
+
+	/* no unmarshalling required */
+	call->state = AFS_CALL_REPLYING;
+
+	INIT_WORK(&call->work, SRXAFSCB_Probe);
+	queue_work(afs_wq, &call->work);
+	return 0;
+}
+
+/*
+ * allow the fileserver to quickly find out if the fileserver has been rebooted
+ */
+static void SRXAFSCB_ProbeUuid(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, work);
+	struct afs_uuid *r = call->request;
+
+	struct {
+		__be32	match;
+	} reply;
+
+	_enter("");
+
+
+	if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0)
+		reply.match = htonl(0);
+	else
+		reply.match = htonl(1);
+
+	afs_send_simple_reply(call, &reply, sizeof(reply));
+	_leave("");
+}
+
+/*
+ * deliver request data to a CB.ProbeUuid call
+ */
+static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
+				     bool last)
+{
+	struct afs_uuid *r;
+	unsigned loop;
+	__be32 *b;
+	int ret;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	if (skb->len > 0)
+		return -EBADMSG;
+	if (!last)
+		return 0;
+
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL);
+		if (!call->buffer)
+			return -ENOMEM;
+		call->unmarshall++;
+
+	case 1:
+		_debug("extract UUID");
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       11 * sizeof(__be32));
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		_debug("unmarshall UUID");
+		call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
+		if (!call->request)
+			return -ENOMEM;
+
+		b = call->buffer;
+		r = call->request;
+		r->time_low			= ntohl(b[0]);
+		r->time_mid			= ntohl(b[1]);
+		r->time_hi_and_version		= ntohl(b[2]);
+		r->clock_seq_hi_and_reserved 	= ntohl(b[3]);
+		r->clock_seq_low		= ntohl(b[4]);
+
+		for (loop = 0; loop < 6; loop++)
+			r->node[loop] = ntohl(b[loop + 5]);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 2:
+		_debug("trailer");
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
+	}
+
+	if (!last)
+		return 0;
+
+	call->state = AFS_CALL_REPLYING;
+
+	INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
+	queue_work(afs_wq, &call->work);
+	return 0;
+}
+
+/*
+ * allow the fileserver to ask about the cache manager's capabilities
+ */
+static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
+{
+	struct afs_interface *ifs;
+	struct afs_call *call = container_of(work, struct afs_call, work);
+	int loop, nifs;
+
+	struct {
+		struct /* InterfaceAddr */ {
+			__be32 nifs;
+			__be32 uuid[11];
+			__be32 ifaddr[32];
+			__be32 netmask[32];
+			__be32 mtu[32];
+		} ia;
+		struct /* Capabilities */ {
+			__be32 capcount;
+			__be32 caps[1];
+		} cap;
+	} reply;
+
+	_enter("");
+
+	nifs = 0;
+	ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
+	if (ifs) {
+		nifs = afs_get_ipv4_interfaces(ifs, 32, false);
+		if (nifs < 0) {
+			kfree(ifs);
+			ifs = NULL;
+			nifs = 0;
+		}
+	}
+
+	memset(&reply, 0, sizeof(reply));
+	reply.ia.nifs = htonl(nifs);
+
+	reply.ia.uuid[0] = htonl(afs_uuid.time_low);
+	reply.ia.uuid[1] = htonl(afs_uuid.time_mid);
+	reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version);
+	reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
+	reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
+	for (loop = 0; loop < 6; loop++)
+		reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]);
+
+	if (ifs) {
+		for (loop = 0; loop < nifs; loop++) {
+			reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
+			reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
+			reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
+		}
+		kfree(ifs);
+	}
+
+	reply.cap.capcount = htonl(1);
+	reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
+	afs_send_simple_reply(call, &reply, sizeof(reply));
+
+	_leave("");
+}
+
+/*
+ * deliver request data to a CB.TellMeAboutYourself call
+ */
+static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
+						 struct sk_buff *skb, bool last)
+{
+	_enter(",{%u},%d", skb->len, last);
+
+	if (skb->len > 0)
+		return -EBADMSG;
+	if (!last)
+		return 0;
+
+	/* no unmarshalling required */
+	call->state = AFS_CALL_REPLYING;
+
+	INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
+	queue_work(afs_wq, &call->work);
+	return 0;
+}
diff --git a/kernel/fs/afs/dir.c b/kernel/fs/afs/dir.c
new file mode 100644
index 000000000..e10e17788
--- /dev/null
+++ b/kernel/fs/afs/dir.c
@@ -0,0 +1,1123 @@
+/* dir.c: AFS filesystem directory handling
+ *
+ * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
+				 unsigned int flags);
+static int afs_dir_open(struct inode *inode, struct file *file);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
+static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
+static int afs_d_delete(const struct dentry *dentry);
+static void afs_d_release(struct dentry *dentry);
+static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
+				  loff_t fpos, u64 ino, unsigned dtype);
+static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl);
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+static int afs_rmdir(struct inode *dir, struct dentry *dentry);
+static int afs_unlink(struct inode *dir, struct dentry *dentry);
+static int afs_link(struct dentry *from, struct inode *dir,
+		    struct dentry *dentry);
+static int afs_symlink(struct inode *dir, struct dentry *dentry,
+		       const char *content);
+static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry);
+
+const struct file_operations afs_dir_file_operations = {
+	.open		= afs_dir_open,
+	.release	= afs_release,
+	.iterate	= afs_readdir,
+	.lock		= afs_lock,
+	.llseek		= generic_file_llseek,
+};
+
+const struct inode_operations afs_dir_inode_operations = {
+	.create		= afs_create,
+	.lookup		= afs_lookup,
+	.link		= afs_link,
+	.unlink		= afs_unlink,
+	.symlink	= afs_symlink,
+	.mkdir		= afs_mkdir,
+	.rmdir		= afs_rmdir,
+	.rename		= afs_rename,
+	.permission	= afs_permission,
+	.getattr	= afs_getattr,
+	.setattr	= afs_setattr,
+};
+
+const struct dentry_operations afs_fs_dentry_operations = {
+	.d_revalidate	= afs_d_revalidate,
+	.d_delete	= afs_d_delete,
+	.d_release	= afs_d_release,
+	.d_automount	= afs_d_automount,
+};
+
+#define AFS_DIR_HASHTBL_SIZE	128
+#define AFS_DIR_DIRENT_SIZE	32
+#define AFS_DIRENT_PER_BLOCK	64
+
+union afs_dirent {
+	struct {
+		uint8_t		valid;
+		uint8_t		unused[1];
+		__be16		hash_next;
+		__be32		vnode;
+		__be32		unique;
+		uint8_t		name[16];
+		uint8_t		overflow[4];	/* if any char of the name (inc
+						 * NUL) reaches here, consume
+						 * the next dirent too */
+	} u;
+	uint8_t	extended_name[32];
+};
+
+/* AFS directory page header (one at the beginning of every 2048-byte chunk) */
+struct afs_dir_pagehdr {
+	__be16		npages;
+	__be16		magic;
+#define AFS_DIR_MAGIC htons(1234)
+	uint8_t		nentries;
+	uint8_t		bitmap[8];
+	uint8_t		pad[19];
+};
+
+/* directory block layout */
+union afs_dir_block {
+
+	struct afs_dir_pagehdr pagehdr;
+
+	struct {
+		struct afs_dir_pagehdr	pagehdr;
+		uint8_t			alloc_ctrs[128];
+		/* dir hash table */
+		uint16_t		hashtable[AFS_DIR_HASHTBL_SIZE];
+	} hdr;
+
+	union afs_dirent dirents[AFS_DIRENT_PER_BLOCK];
+};
+
+/* layout on a linux VM page */
+struct afs_dir_page {
+	union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)];
+};
+
+struct afs_lookup_cookie {
+	struct dir_context ctx;
+	struct afs_fid	fid;
+	struct qstr name;
+	int		found;
+};
+
+/*
+ * check that a directory page is valid
+ */
+static inline void afs_dir_check_page(struct inode *dir, struct page *page)
+{
+	struct afs_dir_page *dbuf;
+	loff_t latter;
+	int tmp, qty;
+
+#if 0
+	/* check the page count */
+	qty = desc.size / sizeof(dbuf->blocks[0]);
+	if (qty == 0)
+		goto error;
+
+	if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) {
+		printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n",
+		       __func__, dir->i_ino, qty,
+		       ntohs(dbuf->blocks[0].pagehdr.npages));
+		goto error;
+	}
+#endif
+
+	/* determine how many magic numbers there should be in this page */
+	latter = dir->i_size - page_offset(page);
+	if (latter >= PAGE_SIZE)
+		qty = PAGE_SIZE;
+	else
+		qty = latter;
+	qty /= sizeof(union afs_dir_block);
+
+	/* check them */
+	dbuf = page_address(page);
+	for (tmp = 0; tmp < qty; tmp++) {
+		if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) {
+			printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n",
+			       __func__, dir->i_ino, tmp, qty,
+			       ntohs(dbuf->blocks[tmp].pagehdr.magic));
+			goto error;
+		}
+	}
+
+	SetPageChecked(page);
+	return;
+
+error:
+	SetPageChecked(page);
+	SetPageError(page);
+}
+
+/*
+ * discard a page cached in the pagecache
+ */
+static inline void afs_dir_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+/*
+ * get a page into the pagecache
+ */
+static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
+				     struct key *key)
+{
+	struct page *page;
+	_enter("{%lu},%lu", dir->i_ino, index);
+
+	page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageChecked(page))
+			afs_dir_check_page(dir, page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	afs_dir_put_page(page);
+	_leave(" = -EIO");
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * open an AFS directory file
+ */
+static int afs_dir_open(struct inode *inode, struct file *file)
+{
+	_enter("{%lu}", inode->i_ino);
+
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
+
+	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags))
+		return -ENOENT;
+
+	return afs_open(inode, file);
+}
+
+/*
+ * deal with one block in an AFS directory
+ */
+static int afs_dir_iterate_block(struct dir_context *ctx,
+				 union afs_dir_block *block,
+				 unsigned blkoff)
+{
+	union afs_dirent *dire;
+	unsigned offset, next, curr;
+	size_t nlen;
+	int tmp;
+
+	_enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
+
+	curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);
+
+	/* walk through the block, an entry at a time */
+	for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
+	     offset < AFS_DIRENT_PER_BLOCK;
+	     offset = next
+	     ) {
+		next = offset + 1;
+
+		/* skip entries marked unused in the bitmap */
+		if (!(block->pagehdr.bitmap[offset / 8] &
+		      (1 << (offset % 8)))) {
+			_debug("ENT[%Zu.%u]: unused",
+			       blkoff / sizeof(union afs_dir_block), offset);
+			if (offset >= curr)
+				ctx->pos = blkoff +
+					next * sizeof(union afs_dirent);
+			continue;
+		}
+
+		/* got a valid entry */
+		dire = &block->dirents[offset];
+		nlen = strnlen(dire->u.name,
+			       sizeof(*block) -
+			       offset * sizeof(union afs_dirent));
+
+		_debug("ENT[%Zu.%u]: %s %Zu \"%s\"",
+		       blkoff / sizeof(union afs_dir_block), offset,
+		       (offset < curr ? "skip" : "fill"),
+		       nlen, dire->u.name);
+
+		/* work out where the next possible entry is */
+		for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) {
+			if (next >= AFS_DIRENT_PER_BLOCK) {
+				_debug("ENT[%Zu.%u]:"
+				       " %u travelled beyond end dir block"
+				       " (len %u/%Zu)",
+				       blkoff / sizeof(union afs_dir_block),
+				       offset, next, tmp, nlen);
+				return -EIO;
+			}
+			if (!(block->pagehdr.bitmap[next / 8] &
+			      (1 << (next % 8)))) {
+				_debug("ENT[%Zu.%u]:"
+				       " %u unmarked extension (len %u/%Zu)",
+				       blkoff / sizeof(union afs_dir_block),
+				       offset, next, tmp, nlen);
+				return -EIO;
+			}
+
+			_debug("ENT[%Zu.%u]: ext %u/%Zu",
+			       blkoff / sizeof(union afs_dir_block),
+			       next, tmp, nlen);
+			next++;
+		}
+
+		/* skip if starts before the current position */
+		if (offset < curr)
+			continue;
+
+		/* found the next entry */
+		if (!dir_emit(ctx, dire->u.name, nlen,
+			      ntohl(dire->u.vnode),
+			      ctx->actor == afs_lookup_filldir ?
+			      ntohl(dire->u.unique) : DT_UNKNOWN)) {
+			_leave(" = 0 [full]");
+			return 0;
+		}
+
+		ctx->pos = blkoff + next * sizeof(union afs_dirent);
+	}
+
+	_leave(" = 1 [more]");
+	return 1;
+}
+
+/*
+ * iterate through the data blob that lists the contents of an AFS directory
+ */
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+			   struct key *key)
+{
+	union afs_dir_block *dblock;
+	struct afs_dir_page *dbuf;
+	struct page *page;
+	unsigned blkoff, limit;
+	int ret;
+
+	_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
+
+	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
+
+	/* round the file position up to the next entry boundary */
+	ctx->pos += sizeof(union afs_dirent) - 1;
+	ctx->pos &= ~(sizeof(union afs_dirent) - 1);
+
+	/* walk through the blocks in sequence */
+	ret = 0;
+	while (ctx->pos < dir->i_size) {
+		blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);
+
+		/* fetch the appropriate page from the directory */
+		page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			break;
+		}
+
+		limit = blkoff & ~(PAGE_SIZE - 1);
+
+		dbuf = page_address(page);
+
+		/* deal with the individual blocks stashed on this page */
+		do {
+			dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
+					       sizeof(union afs_dir_block)];
+			ret = afs_dir_iterate_block(ctx, dblock, blkoff);
+			if (ret != 1) {
+				afs_dir_put_page(page);
+				goto out;
+			}
+
+			blkoff += sizeof(union afs_dir_block);
+
+		} while (ctx->pos < dir->i_size && blkoff < limit);
+
+		afs_dir_put_page(page);
+		ret = 0;
+	}
+
+out:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * read an AFS directory
+ */
+static int afs_readdir(struct file *file, struct dir_context *ctx)
+{
+	return afs_dir_iterate(file_inode(file), 
+			      ctx, file->private_data);
+}
+
+/*
+ * search the directory for a name
+ * - if afs_dir_iterate_block() spots this function, it'll pass the FID
+ *   uniquifier through dtype
+ */
+static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
+			      int nlen, loff_t fpos, u64 ino, unsigned dtype)
+{
+	struct afs_lookup_cookie *cookie =
+		container_of(ctx, struct afs_lookup_cookie, ctx);
+
+	_enter("{%s,%u},%s,%u,,%llu,%u",
+	       cookie->name.name, cookie->name.len, name, nlen,
+	       (unsigned long long) ino, dtype);
+
+	/* insanity checks first */
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
+
+	if (cookie->name.len != nlen ||
+	    memcmp(cookie->name.name, name, nlen) != 0) {
+		_leave(" = 0 [no]");
+		return 0;
+	}
+
+	cookie->fid.vnode = ino;
+	cookie->fid.unique = dtype;
+	cookie->found = 1;
+
+	_leave(" = -1 [found]");
+	return -1;
+}
+
+/*
+ * do a lookup in a directory
+ * - just returns the FID the dentry name maps to if found
+ */
+static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
+			 struct afs_fid *fid, struct key *key)
+{
+	struct afs_super_info *as = dir->i_sb->s_fs_info;
+	struct afs_lookup_cookie cookie = {
+		.ctx.actor = afs_lookup_filldir,
+		.name = dentry->d_name,
+		.fid.vid = as->volume->vid
+	};
+	int ret;
+
+	_enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry);
+
+	/* search the directory */
+	ret = afs_dir_iterate(dir, &cookie.ctx, key);
+	if (ret < 0) {
+		_leave(" = %d [iter]", ret);
+		return ret;
+	}
+
+	ret = -ENOENT;
+	if (!cookie.found) {
+		_leave(" = -ENOENT [not found]");
+		return -ENOENT;
+	}
+
+	*fid = cookie.fid;
+	_leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
+	return 0;
+}
+
+/*
+ * Try to auto mount the mountpoint with pseudo directory, if the autocell
+ * operation is setted.
+ */
+static struct inode *afs_try_auto_mntpt(
+	int ret, struct dentry *dentry, struct inode *dir, struct key *key,
+	struct afs_fid *fid)
+{
+	const char *devname = dentry->d_name.name;
+	struct afs_vnode *vnode = AFS_FS_I(dir);
+	struct inode *inode;
+
+	_enter("%d, %p{%pd}, {%x:%u}, %p",
+	       ret, dentry, dentry, vnode->fid.vid, vnode->fid.vnode, key);
+
+	if (ret != -ENOENT ||
+	    !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
+		goto out;
+
+	inode = afs_iget_autocell(dir, devname, strlen(devname), key);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	*fid = AFS_FS_I(inode)->fid;
+	_leave("= %p", inode);
+	return inode;
+
+out:
+	_leave("= %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * look up an entry in a directory
+ */
+static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
+				 unsigned int flags)
+{
+	struct afs_vnode *vnode;
+	struct afs_fid fid;
+	struct inode *inode;
+	struct key *key;
+	int ret;
+
+	vnode = AFS_FS_I(dir);
+
+	_enter("{%x:%u},%p{%pd},",
+	       vnode->fid.vid, vnode->fid.vnode, dentry, dentry);
+
+	ASSERTCMP(d_inode(dentry), ==, NULL);
+
+	if (dentry->d_name.len >= AFSNAMEMAX) {
+		_leave(" = -ENAMETOOLONG");
+		return ERR_PTR(-ENAMETOOLONG);
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		_leave(" = -ESTALE");
+		return ERR_PTR(-ESTALE);
+	}
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		return ERR_CAST(key);
+	}
+
+	ret = afs_validate(vnode, key);
+	if (ret < 0) {
+		key_put(key);
+		_leave(" = %d [val]", ret);
+		return ERR_PTR(ret);
+	}
+
+	ret = afs_do_lookup(dir, dentry, &fid, key);
+	if (ret < 0) {
+		inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid);
+		if (!IS_ERR(inode)) {
+			key_put(key);
+			goto success;
+		}
+
+		ret = PTR_ERR(inode);
+		key_put(key);
+		if (ret == -ENOENT) {
+			d_add(dentry, NULL);
+			_leave(" = NULL [negative]");
+			return NULL;
+		}
+		_leave(" = %d [do]", ret);
+		return ERR_PTR(ret);
+	}
+	dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
+
+	/* instantiate the dentry */
+	inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL);
+	key_put(key);
+	if (IS_ERR(inode)) {
+		_leave(" = %ld", PTR_ERR(inode));
+		return ERR_CAST(inode);
+	}
+
+success:
+	d_add(dentry, inode);
+	_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }",
+	       fid.vnode,
+	       fid.unique,
+	       d_inode(dentry)->i_ino,
+	       d_inode(dentry)->i_generation);
+
+	return NULL;
+}
+
+/*
+ * check that a dentry lookup hit has found a valid entry
+ * - NOTE! the hit can be a negative hit too, so we can't assume we have an
+ *   inode
+ */
+static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct afs_vnode *vnode, *dir;
+	struct afs_fid uninitialized_var(fid);
+	struct dentry *parent;
+	struct key *key;
+	void *dir_version;
+	int ret;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	vnode = AFS_FS_I(d_inode(dentry));
+
+	if (d_really_is_positive(dentry))
+		_enter("{v={%x:%u} n=%pd fl=%lx},",
+		       vnode->fid.vid, vnode->fid.vnode, dentry,
+		       vnode->flags);
+	else
+		_enter("{neg n=%pd}", dentry);
+
+	key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
+	if (IS_ERR(key))
+		key = NULL;
+
+	/* lock down the parent dentry so we can peer at it */
+	parent = dget_parent(dentry);
+	dir = AFS_FS_I(d_inode(parent));
+
+	/* validate the parent directory */
+	if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
+		afs_validate(dir, key);
+
+	if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
+		_debug("%pd: parent dir deleted", dentry);
+		goto out_bad;
+	}
+
+	dir_version = (void *) (unsigned long) dir->status.data_version;
+	if (dentry->d_fsdata == dir_version)
+		goto out_valid; /* the dir contents are unchanged */
+
+	_debug("dir modified");
+
+	/* search the directory for this vnode */
+	ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key);
+	switch (ret) {
+	case 0:
+		/* the filename maps to something */
+		if (d_really_is_negative(dentry))
+			goto out_bad;
+		if (is_bad_inode(d_inode(dentry))) {
+			printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
+			       dentry);
+			goto out_bad;
+		}
+
+		/* if the vnode ID has changed, then the dirent points to a
+		 * different file */
+		if (fid.vnode != vnode->fid.vnode) {
+			_debug("%pd: dirent changed [%u != %u]",
+			       dentry, fid.vnode,
+			       vnode->fid.vnode);
+			goto not_found;
+		}
+
+		/* if the vnode ID uniqifier has changed, then the file has
+		 * been deleted and replaced, and the original vnode ID has
+		 * been reused */
+		if (fid.unique != vnode->fid.unique) {
+			_debug("%pd: file deleted (uq %u -> %u I:%u)",
+			       dentry, fid.unique,
+			       vnode->fid.unique,
+			       d_inode(dentry)->i_generation);
+			spin_lock(&vnode->lock);
+			set_bit(AFS_VNODE_DELETED, &vnode->flags);
+			spin_unlock(&vnode->lock);
+			goto not_found;
+		}
+		goto out_valid;
+
+	case -ENOENT:
+		/* the filename is unknown */
+		_debug("%pd: dirent not found", dentry);
+		if (d_really_is_positive(dentry))
+			goto not_found;
+		goto out_valid;
+
+	default:
+		_debug("failed to iterate dir %pd: %d",
+		       parent, ret);
+		goto out_bad;
+	}
+
+out_valid:
+	dentry->d_fsdata = dir_version;
+	dput(parent);
+	key_put(key);
+	_leave(" = 1 [valid]");
+	return 1;
+
+	/* the dirent, if it exists, now points to a different vnode */
+not_found:
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
+	spin_unlock(&dentry->d_lock);
+
+out_bad:
+	_debug("dropping dentry %pd2", dentry);
+	dput(parent);
+	key_put(key);
+
+	_leave(" = 0 [bad]");
+	return 0;
+}
+
+/*
+ * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
+ * sleep)
+ * - called from dput() when d_count is going to 0.
+ * - return 1 to request dentry be unhashed, 0 otherwise
+ */
+static int afs_d_delete(const struct dentry *dentry)
+{
+	_enter("%pd", dentry);
+
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		goto zap;
+
+	if (d_really_is_positive(dentry) &&
+	    (test_bit(AFS_VNODE_DELETED,   &AFS_FS_I(d_inode(dentry))->flags) ||
+	     test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(d_inode(dentry))->flags)))
+		goto zap;
+
+	_leave(" = 0 [keep]");
+	return 0;
+
+zap:
+	_leave(" = 1 [zap]");
+	return 1;
+}
+
+/*
+ * handle dentry release
+ */
+static void afs_d_release(struct dentry *dentry)
+{
+	_enter("%pd", dentry);
+}
+
+/*
+ * create a directory on an AFS filesystem
+ */
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct afs_file_status status;
+	struct afs_callback cb;
+	struct afs_server *server;
+	struct afs_vnode *dvnode, *vnode;
+	struct afs_fid fid;
+	struct inode *inode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%u},{%pd},%ho",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	mode |= S_IFDIR;
+	ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
+			       mode, &fid, &status, &cb, &server);
+	if (ret < 0)
+		goto mkdir_error;
+
+	inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
+	if (IS_ERR(inode)) {
+		/* ENOMEM at a really inconvenient time - just abandon the new
+		 * directory on the server */
+		ret = PTR_ERR(inode);
+		goto iget_error;
+	}
+
+	/* apply the status report we've got for the new vnode */
+	vnode = AFS_FS_I(inode);
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	afs_vnode_finalise_status_update(vnode, server);
+	afs_put_server(server);
+
+	d_instantiate(dentry, inode);
+	if (d_unhashed(dentry)) {
+		_debug("not hashed");
+		d_rehash(dentry);
+	}
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+iget_error:
+	afs_put_server(server);
+mkdir_error:
+	key_put(key);
+error:
+	d_drop(dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * remove a directory from an AFS filesystem
+ */
+static int afs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct afs_vnode *dvnode, *vnode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%u},{%pd}",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry);
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true);
+	if (ret < 0)
+		goto rmdir_error;
+
+	if (d_really_is_positive(dentry)) {
+		vnode = AFS_FS_I(d_inode(dentry));
+		clear_nlink(&vnode->vfs_inode);
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		afs_discard_callback_on_delete(vnode);
+	}
+
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+rmdir_error:
+	key_put(key);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * remove a file from an AFS filesystem
+ */
+static int afs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct afs_vnode *dvnode, *vnode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%u},{%pd}",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry);
+
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len >= AFSNAMEMAX)
+		goto error;
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	if (d_really_is_positive(dentry)) {
+		vnode = AFS_FS_I(d_inode(dentry));
+
+		/* make sure we have a callback promise on the victim */
+		ret = afs_validate(vnode, key);
+		if (ret < 0)
+			goto error;
+	}
+
+	ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false);
+	if (ret < 0)
+		goto remove_error;
+
+	if (d_really_is_positive(dentry)) {
+		/* if the file wasn't deleted due to excess hard links, the
+		 * fileserver will break the callback promise on the file - if
+		 * it had one - before it returns to us, and if it was deleted,
+		 * it won't
+		 *
+		 * however, if we didn't have a callback promise outstanding,
+		 * or it was outstanding on a different server, then it won't
+		 * break it either...
+		 */
+		vnode = AFS_FS_I(d_inode(dentry));
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+			_debug("AFS_VNODE_DELETED");
+		if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
+			_debug("AFS_VNODE_CB_BROKEN");
+		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		ret = afs_validate(vnode, key);
+		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
+	}
+
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+remove_error:
+	key_put(key);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * create a regular file on an AFS filesystem
+ */
+static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl)
+{
+	struct afs_file_status status;
+	struct afs_callback cb;
+	struct afs_server *server;
+	struct afs_vnode *dvnode, *vnode;
+	struct afs_fid fid;
+	struct inode *inode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%u},{%pd},%ho,",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	mode |= S_IFREG;
+	ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
+			       mode, &fid, &status, &cb, &server);
+	if (ret < 0)
+		goto create_error;
+
+	inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
+	if (IS_ERR(inode)) {
+		/* ENOMEM at a really inconvenient time - just abandon the new
+		 * directory on the server */
+		ret = PTR_ERR(inode);
+		goto iget_error;
+	}
+
+	/* apply the status report we've got for the new vnode */
+	vnode = AFS_FS_I(inode);
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	afs_vnode_finalise_status_update(vnode, server);
+	afs_put_server(server);
+
+	d_instantiate(dentry, inode);
+	if (d_unhashed(dentry)) {
+		_debug("not hashed");
+		d_rehash(dentry);
+	}
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+iget_error:
+	afs_put_server(server);
+create_error:
+	key_put(key);
+error:
+	d_drop(dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * create a hard link between files in an AFS filesystem
+ */
+static int afs_link(struct dentry *from, struct inode *dir,
+		    struct dentry *dentry)
+{
+	struct afs_vnode *dvnode, *vnode;
+	struct key *key;
+	int ret;
+
+	vnode = AFS_FS_I(d_inode(from));
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%u},{%x:%u},{%pd}",
+	       vnode->fid.vid, vnode->fid.vnode,
+	       dvnode->fid.vid, dvnode->fid.vnode,
+	       dentry);
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name);
+	if (ret < 0)
+		goto link_error;
+
+	ihold(&vnode->vfs_inode);
+	d_instantiate(dentry, &vnode->vfs_inode);
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+link_error:
+	key_put(key);
+error:
+	d_drop(dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * create a symlink in an AFS filesystem
+ */
+static int afs_symlink(struct inode *dir, struct dentry *dentry,
+		       const char *content)
+{
+	struct afs_file_status status;
+	struct afs_server *server;
+	struct afs_vnode *dvnode, *vnode;
+	struct afs_fid fid;
+	struct inode *inode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%u},{%pd},%s",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry,
+	       content);
+
+	ret = -EINVAL;
+	if (strlen(content) >= AFSPATHMAX)
+		goto error;
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content,
+				&fid, &status, &server);
+	if (ret < 0)
+		goto create_error;
+
+	inode = afs_iget(dir->i_sb, key, &fid, &status, NULL);
+	if (IS_ERR(inode)) {
+		/* ENOMEM at a really inconvenient time - just abandon the new
+		 * directory on the server */
+		ret = PTR_ERR(inode);
+		goto iget_error;
+	}
+
+	/* apply the status report we've got for the new vnode */
+	vnode = AFS_FS_I(inode);
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	afs_vnode_finalise_status_update(vnode, server);
+	afs_put_server(server);
+
+	d_instantiate(dentry, inode);
+	if (d_unhashed(dentry)) {
+		_debug("not hashed");
+		d_rehash(dentry);
+	}
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+iget_error:
+	afs_put_server(server);
+create_error:
+	key_put(key);
+error:
+	d_drop(dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * rename a file in an AFS filesystem and/or move it between directories
+ */
+static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+	struct key *key;
+	int ret;
+
+	vnode = AFS_FS_I(d_inode(old_dentry));
+	orig_dvnode = AFS_FS_I(old_dir);
+	new_dvnode = AFS_FS_I(new_dir);
+
+	_enter("{%x:%u},{%x:%u},{%x:%u},{%pd}",
+	       orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
+	       vnode->fid.vid, vnode->fid.vnode,
+	       new_dvnode->fid.vid, new_dvnode->fid.vnode,
+	       new_dentry);
+
+	key = afs_request_key(orig_dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	ret = afs_vnode_rename(orig_dvnode, new_dvnode, key,
+			       old_dentry->d_name.name,
+			       new_dentry->d_name.name);
+	if (ret < 0)
+		goto rename_error;
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+rename_error:
+	key_put(key);
+error:
+	d_drop(new_dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/kernel/fs/afs/file.c b/kernel/fs/afs/file.c
new file mode 100644
index 000000000..999bc3cae
--- /dev/null
+++ b/kernel/fs/afs/file.c
@@ -0,0 +1,378 @@
+/* AFS filesystem file handling
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/gfp.h>
+#include "internal.h"
+
+static int afs_readpage(struct file *file, struct page *page);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length);
+static int afs_releasepage(struct page *page, gfp_t gfp_flags);
+static int afs_launder_page(struct page *page);
+
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages);
+
+const struct file_operations afs_file_operations = {
+	.open		= afs_open,
+	.release	= afs_release,
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= afs_file_write,
+	.mmap		= generic_file_readonly_mmap,
+	.splice_read	= generic_file_splice_read,
+	.fsync		= afs_fsync,
+	.lock		= afs_lock,
+	.flock		= afs_flock,
+};
+
+const struct inode_operations afs_file_inode_operations = {
+	.getattr	= afs_getattr,
+	.setattr	= afs_setattr,
+	.permission	= afs_permission,
+};
+
+const struct address_space_operations afs_fs_aops = {
+	.readpage	= afs_readpage,
+	.readpages	= afs_readpages,
+	.set_page_dirty	= afs_set_page_dirty,
+	.launder_page	= afs_launder_page,
+	.releasepage	= afs_releasepage,
+	.invalidatepage	= afs_invalidatepage,
+	.write_begin	= afs_write_begin,
+	.write_end	= afs_write_end,
+	.writepage	= afs_writepage,
+	.writepages	= afs_writepages,
+};
+
+/*
+ * open an AFS file or directory and attach a key to it
+ */
+int afs_open(struct inode *inode, struct file *file)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct key *key;
+	int ret;
+
+	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		return PTR_ERR(key);
+	}
+
+	ret = afs_validate(vnode, key);
+	if (ret < 0) {
+		_leave(" = %d [val]", ret);
+		return ret;
+	}
+
+	file->private_data = key;
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * release an AFS file or directory and discard its key
+ */
+int afs_release(struct inode *inode, struct file *file)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+
+	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+
+	key_put(file->private_data);
+	_leave(" = 0");
+	return 0;
+}
+
+#ifdef CONFIG_AFS_FSCACHE
+/*
+ * deal with notification that a page was read from the cache
+ */
+static void afs_file_readpage_read_complete(struct page *page,
+					    void *data,
+					    int error)
+{
+	_enter("%p,%p,%d", page, data, error);
+
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error)
+		SetPageUptodate(page);
+	unlock_page(page);
+}
+#endif
+
+/*
+ * read page from file, directory or symlink, given a key to use
+ */
+int afs_page_filler(void *data, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct key *key = data;
+	size_t len;
+	off_t offset;
+	int ret;
+
+	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
+
+	BUG_ON(!PageLocked(page));
+
+	ret = -ESTALE;
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		goto error;
+
+	/* is it cached? */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_page(vnode->cache,
+					 page,
+					 afs_file_readpage_read_complete,
+					 NULL,
+					 GFP_KERNEL);
+#else
+	ret = -ENOBUFS;
+#endif
+	switch (ret) {
+		/* read BIO submitted (page in cache) */
+	case 0:
+		break;
+
+		/* page not yet cached */
+	case -ENODATA:
+		_debug("cache said ENODATA");
+		goto go_on;
+
+		/* page will not be cached */
+	case -ENOBUFS:
+		_debug("cache said ENOBUFS");
+	default:
+	go_on:
+		offset = page->index << PAGE_CACHE_SHIFT;
+		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
+
+		/* read the contents of the file from the server into the
+		 * page */
+		ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				_debug("got NOENT from server"
+				       " - marking file deleted and stale");
+				set_bit(AFS_VNODE_DELETED, &vnode->flags);
+				ret = -ESTALE;
+			}
+
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
+#endif
+			BUG_ON(PageFsCache(page));
+			goto error;
+		}
+
+		SetPageUptodate(page);
+
+		/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page) &&
+		    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+			fscache_uncache_page(vnode->cache, page);
+			BUG_ON(PageFsCache(page));
+		}
+#endif
+		unlock_page(page);
+	}
+
+	_leave(" = 0");
+	return 0;
+
+error:
+	SetPageError(page);
+	unlock_page(page);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * read page from file, directory or symlink, given a file to nominate the key
+ * to be used
+ */
+static int afs_readpage(struct file *file, struct page *page)
+{
+	struct key *key;
+	int ret;
+
+	if (file) {
+		key = file->private_data;
+		ASSERT(key != NULL);
+		ret = afs_page_filler(key, page);
+	} else {
+		struct inode *inode = page->mapping->host;
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+		} else {
+			ret = afs_page_filler(key, page);
+			key_put(key);
+		}
+	}
+	return ret;
+}
+
+/*
+ * read a set of pages
+ */
+static int afs_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
+{
+	struct key *key = file->private_data;
+	struct afs_vnode *vnode;
+	int ret = 0;
+
+	_enter("{%d},{%lu},,%d",
+	       key_serial(key), mapping->host->i_ino, nr_pages);
+
+	ASSERT(key != NULL);
+
+	vnode = AFS_FS_I(mapping->host);
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
+
+	/* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_pages(vnode->cache,
+					  mapping,
+					  pages,
+					  &nr_pages,
+					  afs_file_readpage_read_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+#else
+	ret = -ENOBUFS;
+#endif
+
+	switch (ret) {
+		/* all pages are being read from the cache */
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(nr_pages != 0);
+		_leave(" = 0 [reading all]");
+		return 0;
+
+		/* there were pages that couldn't be read from the cache */
+	case -ENODATA:
+	case -ENOBUFS:
+		break;
+
+		/* other error */
+	default:
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* load the missing pages from the network */
+	ret = read_cache_pages(mapping, pages, afs_page_filler, key);
+
+	_leave(" = %d [netting]", ret);
+	return ret;
+}
+
+/*
+ * write back a dirty page
+ */
+static int afs_launder_page(struct page *page)
+{
+	_enter("{%lu}", page->index);
+
+	return 0;
+}
+
+/*
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+	_enter("{%lu},%u,%u", page->index, offset, length);
+
+	BUG_ON(!PageLocked(page));
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page)) {
+			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+			fscache_wait_on_page_write(vnode->cache, page);
+			fscache_uncache_page(vnode->cache, page);
+		}
+#endif
+
+		if (PagePrivate(page)) {
+			if (wb && !PageWriteback(page)) {
+				set_page_private(page, 0);
+				afs_put_writeback(wb);
+			}
+
+			if (!page_private(page))
+				ClearPagePrivate(page);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
+ */
+static int afs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+
+	_enter("{{%x:%u}[%lu],%lx},%x",
+	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
+	       gfp_flags);
+
+	/* deny if page is being written to the cache and the caller hasn't
+	 * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+	if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
+		_leave(" = F [cache busy]");
+		return 0;
+	}
+#endif
+
+	if (PagePrivate(page)) {
+		if (wb) {
+			set_page_private(page, 0);
+			afs_put_writeback(wb);
+		}
+		ClearPagePrivate(page);
+	}
+
+	/* indicate that the page can be released */
+	_leave(" = T");
+	return 1;
+}
diff --git a/kernel/fs/afs/flock.c b/kernel/fs/afs/flock.c
new file mode 100644
index 000000000..4baf1d2b3
--- /dev/null
+++ b/kernel/fs/afs/flock.c
@@ -0,0 +1,585 @@
+/* AFS file locking support
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "internal.h"
+
+#define AFS_LOCK_GRANTED	0
+#define AFS_LOCK_PENDING	1
+
+static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
+static void afs_fl_release_private(struct file_lock *fl);
+
+static struct workqueue_struct *afs_lock_manager;
+static DEFINE_MUTEX(afs_lock_manager_mutex);
+
+static const struct file_lock_operations afs_lock_ops = {
+	.fl_copy_lock		= afs_fl_copy_lock,
+	.fl_release_private	= afs_fl_release_private,
+};
+
+/*
+ * initialise the lock manager thread if it isn't already running
+ */
+static int afs_init_lock_manager(void)
+{
+	int ret;
+
+	ret = 0;
+	if (!afs_lock_manager) {
+		mutex_lock(&afs_lock_manager_mutex);
+		if (!afs_lock_manager) {
+			afs_lock_manager =
+				create_singlethread_workqueue("kafs_lockd");
+			if (!afs_lock_manager)
+				ret = -ENOMEM;
+		}
+		mutex_unlock(&afs_lock_manager_mutex);
+	}
+	return ret;
+}
+
+/*
+ * destroy the lock manager thread if it's running
+ */
+void __exit afs_kill_lock_manager(void)
+{
+	if (afs_lock_manager)
+		destroy_workqueue(afs_lock_manager);
+}
+
+/*
+ * if the callback is broken on this vnode, then the lock may now be available
+ */
+void afs_lock_may_be_available(struct afs_vnode *vnode)
+{
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
+}
+
+/*
+ * the lock will time out in 5 minutes unless we extend it, so schedule
+ * extension in a bit less than that time
+ */
+static void afs_schedule_lock_extension(struct afs_vnode *vnode)
+{
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work,
+			   AFS_LOCKWAIT * HZ / 2);
+}
+
+/*
+ * grant one or more locks (readlocks are allowed to jump the queue if the
+ * first lock in the queue is itself a readlock)
+ * - the caller must hold the vnode lock
+ */
+static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl)
+{
+	struct file_lock *p, *_p;
+
+	list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+	if (fl->fl_type == F_RDLCK) {
+		list_for_each_entry_safe(p, _p, &vnode->pending_locks,
+					 fl_u.afs.link) {
+			if (p->fl_type == F_RDLCK) {
+				p->fl_u.afs.state = AFS_LOCK_GRANTED;
+				list_move_tail(&p->fl_u.afs.link,
+					       &vnode->granted_locks);
+				wake_up(&p->fl_wait);
+			}
+		}
+	}
+}
+
+/*
+ * do work for a lock, including:
+ * - probing for a lock we're waiting on but didn't get immediately
+ * - extending a lock that's close to timing out
+ */
+void afs_lock_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode =
+		container_of(work, struct afs_vnode, lock_work.work);
+	struct file_lock *fl;
+	afs_lock_type_t type;
+	struct key *key;
+	int ret;
+
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	spin_lock(&vnode->lock);
+
+	if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) {
+		_debug("unlock");
+		spin_unlock(&vnode->lock);
+
+		/* attempt to release the server lock; if it fails, we just
+		 * wait 5 minutes and it'll time out anyway */
+		ret = afs_vnode_release_lock(vnode, vnode->unlock_key);
+		if (ret < 0)
+			printk(KERN_WARNING "AFS:"
+			       " Failed to release lock on {%x:%x} error %d\n",
+			       vnode->fid.vid, vnode->fid.vnode, ret);
+
+		spin_lock(&vnode->lock);
+		key_put(vnode->unlock_key);
+		vnode->unlock_key = NULL;
+		clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags);
+	}
+
+	/* if we've got a lock, then it must be time to extend that lock as AFS
+	 * locks time out after 5 minutes */
+	if (!list_empty(&vnode->granted_locks)) {
+		_debug("extend");
+
+		if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
+			BUG();
+		fl = list_entry(vnode->granted_locks.next,
+				struct file_lock, fl_u.afs.link);
+		key = key_get(fl->fl_file->private_data);
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_extend_lock(vnode, key);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		key_put(key);
+		switch (ret) {
+		case 0:
+			afs_schedule_lock_extension(vnode);
+			break;
+		default:
+			/* ummm... we failed to extend the lock - retry
+			 * extension shortly */
+			printk(KERN_WARNING "AFS:"
+			       " Failed to extend lock on {%x:%x} error %d\n",
+			       vnode->fid.vid, vnode->fid.vnode, ret);
+			queue_delayed_work(afs_lock_manager, &vnode->lock_work,
+					   HZ * 10);
+			break;
+		}
+		_leave(" [extend]");
+		return;
+	}
+
+	/* if we don't have a granted lock, then we must've been called back by
+	 * the server, and so if might be possible to get a lock we're
+	 * currently waiting for */
+	if (!list_empty(&vnode->pending_locks)) {
+		_debug("get");
+
+		if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
+			BUG();
+		fl = list_entry(vnode->pending_locks.next,
+				struct file_lock, fl_u.afs.link);
+		key = key_get(fl->fl_file->private_data);
+		type = (fl->fl_type == F_RDLCK) ?
+			AFS_LOCK_READ : AFS_LOCK_WRITE;
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_set_lock(vnode, key, type);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		switch (ret) {
+		case -EWOULDBLOCK:
+			_debug("blocked");
+			break;
+		case 0:
+			_debug("acquired");
+			if (type == AFS_LOCK_READ)
+				set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+			else
+				set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+			ret = AFS_LOCK_GRANTED;
+		default:
+			spin_lock(&vnode->lock);
+			/* the pending lock may have been withdrawn due to a
+			 * signal */
+			if (list_entry(vnode->pending_locks.next,
+				       struct file_lock, fl_u.afs.link) == fl) {
+				fl->fl_u.afs.state = ret;
+				if (ret == AFS_LOCK_GRANTED)
+					afs_grant_locks(vnode, fl);
+				else
+					list_del_init(&fl->fl_u.afs.link);
+				wake_up(&fl->fl_wait);
+				spin_unlock(&vnode->lock);
+			} else {
+				_debug("withdrawn");
+				clear_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+				clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+				spin_unlock(&vnode->lock);
+				afs_vnode_release_lock(vnode, key);
+				if (!list_empty(&vnode->pending_locks))
+					afs_lock_may_be_available(vnode);
+			}
+			break;
+		}
+		key_put(key);
+		_leave(" [pend]");
+		return;
+	}
+
+	/* looks like the lock request was withdrawn on a signal */
+	spin_unlock(&vnode->lock);
+	_leave(" [no locks]");
+}
+
+/*
+ * pass responsibility for the unlocking of a vnode on the server to the
+ * manager thread, lest a pending signal in the calling thread interrupt
+ * AF_RXRPC
+ * - the caller must hold the vnode lock
+ */
+static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
+{
+	cancel_delayed_work(&vnode->lock_work);
+	if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) &&
+	    !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags))
+		BUG();
+	if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags))
+		BUG();
+	vnode->unlock_key = key_get(key);
+	afs_lock_may_be_available(vnode);
+}
+
+/*
+ * request a lock on a file on the server
+ */
+static int afs_do_setlk(struct file *file, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(file);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	afs_lock_type_t type;
+	struct key *key = file->private_data;
+	int ret;
+
+	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	/* only whole-file locks are supported */
+	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
+		return -EINVAL;
+
+	ret = afs_init_lock_manager();
+	if (ret < 0)
+		return ret;
+
+	fl->fl_ops = &afs_lock_ops;
+	INIT_LIST_HEAD(&fl->fl_u.afs.link);
+	fl->fl_u.afs.state = AFS_LOCK_PENDING;
+
+	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+
+	spin_lock(&inode->i_lock);
+
+	/* make sure we've got a callback on this file and that our view of the
+	 * data version is up to date */
+	ret = afs_vnode_fetch_status(vnode, NULL, key);
+	if (ret < 0)
+		goto error;
+
+	if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) {
+		ret = -EAGAIN;
+		goto error;
+	}
+
+	spin_lock(&vnode->lock);
+
+	/* if we've already got a readlock on the server then we can instantly
+	 * grant another readlock, irrespective of whether there are any
+	 * pending writelocks */
+	if (type == AFS_LOCK_READ &&
+	    vnode->flags & (1 << AFS_VNODE_READLOCKED)) {
+		_debug("instant readlock");
+		ASSERTCMP(vnode->flags &
+			  ((1 << AFS_VNODE_LOCKING) |
+			   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
+		ASSERT(!list_empty(&vnode->granted_locks));
+		goto sharing_existing_lock;
+	}
+
+	/* if there's no-one else with a lock on this vnode, then we need to
+	 * ask the server for a lock */
+	if (list_empty(&vnode->pending_locks) &&
+	    list_empty(&vnode->granted_locks)) {
+		_debug("not locked");
+		ASSERTCMP(vnode->flags &
+			  ((1 << AFS_VNODE_LOCKING) |
+			   (1 << AFS_VNODE_READLOCKED) |
+			   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
+		list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
+		set_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_set_lock(vnode, key, type);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		switch (ret) {
+		case 0:
+			_debug("acquired");
+			goto acquired_server_lock;
+		case -EWOULDBLOCK:
+			_debug("would block");
+			spin_lock(&vnode->lock);
+			ASSERT(list_empty(&vnode->granted_locks));
+			ASSERTCMP(vnode->pending_locks.next, ==,
+				  &fl->fl_u.afs.link);
+			goto wait;
+		default:
+			spin_lock(&vnode->lock);
+			list_del_init(&fl->fl_u.afs.link);
+			spin_unlock(&vnode->lock);
+			goto error;
+		}
+	}
+
+	/* otherwise, we need to wait for a local lock to become available */
+	_debug("wait local");
+	list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
+wait:
+	if (!(fl->fl_flags & FL_SLEEP)) {
+		_debug("noblock");
+		ret = -EAGAIN;
+		goto abort_attempt;
+	}
+	spin_unlock(&vnode->lock);
+
+	/* now we need to sleep and wait for the lock manager thread to get the
+	 * lock from the server */
+	_debug("sleep");
+	ret = wait_event_interruptible(fl->fl_wait,
+				       fl->fl_u.afs.state <= AFS_LOCK_GRANTED);
+	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
+		ret = fl->fl_u.afs.state;
+		if (ret < 0)
+			goto error;
+		spin_lock(&vnode->lock);
+		goto given_lock;
+	}
+
+	/* we were interrupted, but someone may still be in the throes of
+	 * giving us the lock */
+	_debug("intr");
+	ASSERTCMP(ret, ==, -ERESTARTSYS);
+
+	spin_lock(&vnode->lock);
+	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
+		ret = fl->fl_u.afs.state;
+		if (ret < 0) {
+			spin_unlock(&vnode->lock);
+			goto error;
+		}
+		goto given_lock;
+	}
+
+abort_attempt:
+	/* we aren't going to get the lock, either because we're unwilling to
+	 * wait, or because some signal happened */
+	_debug("abort");
+	if (list_empty(&vnode->granted_locks) &&
+	    vnode->pending_locks.next == &fl->fl_u.afs.link) {
+		if (vnode->pending_locks.prev != &fl->fl_u.afs.link) {
+			/* kick the next pending lock into having a go */
+			list_del_init(&fl->fl_u.afs.link);
+			afs_lock_may_be_available(vnode);
+		}
+	} else {
+		list_del_init(&fl->fl_u.afs.link);
+	}
+	spin_unlock(&vnode->lock);
+	goto error;
+
+acquired_server_lock:
+	/* we've acquired a server lock, but it needs to be renewed after 5
+	 * mins */
+	spin_lock(&vnode->lock);
+	afs_schedule_lock_extension(vnode);
+	if (type == AFS_LOCK_READ)
+		set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+	else
+		set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+sharing_existing_lock:
+	/* the lock has been granted as far as we're concerned... */
+	fl->fl_u.afs.state = AFS_LOCK_GRANTED;
+	list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+given_lock:
+	/* ... but we do still need to get the VFS's blessing */
+	ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING)));
+	ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) |
+				(1 << AFS_VNODE_WRITELOCKED))) != 0);
+	ret = posix_lock_file(file, fl, NULL);
+	if (ret < 0)
+		goto vfs_rejected_lock;
+	spin_unlock(&vnode->lock);
+
+	/* again, make sure we've got a callback on this file and, again, make
+	 * sure that our view of the data version is up to date (we ignore
+	 * errors incurred here and deal with the consequences elsewhere) */
+	afs_vnode_fetch_status(vnode, NULL, key);
+
+error:
+	spin_unlock(&inode->i_lock);
+	_leave(" = %d", ret);
+	return ret;
+
+vfs_rejected_lock:
+	/* the VFS rejected the lock we just obtained, so we have to discard
+	 * what we just got */
+	_debug("vfs refused %d", ret);
+	list_del_init(&fl->fl_u.afs.link);
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode, key);
+	goto abort_attempt;
+}
+
+/*
+ * unlock on a file on the server
+ */
+static int afs_do_unlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct key *key = file->private_data;
+	int ret;
+
+	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	/* only whole-file unlocks are supported */
+	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
+		return -EINVAL;
+
+	fl->fl_ops = &afs_lock_ops;
+	INIT_LIST_HEAD(&fl->fl_u.afs.link);
+	fl->fl_u.afs.state = AFS_LOCK_PENDING;
+
+	spin_lock(&vnode->lock);
+	ret = posix_lock_file(file, fl, NULL);
+	if (ret < 0) {
+		spin_unlock(&vnode->lock);
+		_leave(" = %d [vfs]", ret);
+		return ret;
+	}
+
+	/* discard the server lock only if all granted locks are gone */
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode, key);
+	spin_unlock(&vnode->lock);
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * return information about a lock we currently hold, if indeed we hold one
+ */
+static int afs_do_getlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct key *key = file->private_data;
+	int ret, lock_count;
+
+	_enter("");
+
+	fl->fl_type = F_UNLCK;
+
+	mutex_lock(&vnode->vfs_inode.i_mutex);
+
+	/* check local lock records first */
+	ret = 0;
+	posix_test_lock(file, fl);
+	if (fl->fl_type == F_UNLCK) {
+		/* no local locks; consult the server */
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error;
+		lock_count = vnode->status.lock_count;
+		if (lock_count) {
+			if (lock_count > 0)
+				fl->fl_type = F_RDLCK;
+			else
+				fl->fl_type = F_WRLCK;
+			fl->fl_start = 0;
+			fl->fl_end = OFFSET_MAX;
+		}
+	}
+
+error:
+	mutex_unlock(&vnode->vfs_inode.i_mutex);
+	_leave(" = %d [%hd]", ret, fl->fl_type);
+	return ret;
+}
+
+/*
+ * manage POSIX locks on a file
+ */
+int afs_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+
+	_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
+	       vnode->fid.vid, vnode->fid.vnode, cmd,
+	       fl->fl_type, fl->fl_flags,
+	       (long long) fl->fl_start, (long long) fl->fl_end);
+
+	/* AFS doesn't support mandatory locks */
+	if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	if (IS_GETLK(cmd))
+		return afs_do_getlk(file, fl);
+	if (fl->fl_type == F_UNLCK)
+		return afs_do_unlk(file, fl);
+	return afs_do_setlk(file, fl);
+}
+
+/*
+ * manage FLOCK locks on a file
+ */
+int afs_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+
+	_enter("{%x:%u},%d,{t=%x,fl=%x}",
+	       vnode->fid.vid, vnode->fid.vnode, cmd,
+	       fl->fl_type, fl->fl_flags);
+
+	/*
+	 * No BSD flocks over NFS allowed.
+	 * Note: we could try to fake a POSIX lock request here by
+	 * using ((u32) filp | 0x80000000) or some such as the pid.
+	 * Not sure whether that would be unique, though, or whether
+	 * that would break in other places.
+	 */
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+
+	/* we're simulating flock() locks using posix locks on the server */
+	if (fl->fl_type == F_UNLCK)
+		return afs_do_unlk(file, fl);
+	return afs_do_setlk(file, fl);
+}
+
+/*
+ * the POSIX lock management core VFS code copies the lock record and adds the
+ * copy into its own list, so we need to add that copy to the vnode's lock
+ * queue in the same place as the original (which will be deleted shortly
+ * after)
+ */
+static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
+{
+	_enter("");
+
+	list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link);
+}
+
+/*
+ * need to remove this lock from the vnode queue when it's removed from the
+ * VFS's list
+ */
+static void afs_fl_release_private(struct file_lock *fl)
+{
+	_enter("");
+
+	list_del_init(&fl->fl_u.afs.link);
+}
diff --git a/kernel/fs/afs/fsclient.c b/kernel/fs/afs/fsclient.c
new file mode 100644
index 000000000..c2e930ec2
--- /dev/null
+++ b/kernel/fs/afs/fsclient.c
@@ -0,0 +1,1911 @@
+/* AFS File Server client stubs
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/circ_buf.h>
+#include "internal.h"
+#include "afs_fs.h"
+
+/*
+ * decode an AFSFid block
+ */
+static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
+{
+	const __be32 *bp = *_bp;
+
+	fid->vid		= ntohl(*bp++);
+	fid->vnode		= ntohl(*bp++);
+	fid->unique		= ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
+ * decode an AFSFetchStatus block
+ */
+static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
+				      struct afs_file_status *status,
+				      struct afs_vnode *vnode,
+				      afs_dataversion_t *store_version)
+{
+	afs_dataversion_t expected_version;
+	const __be32 *bp = *_bp;
+	umode_t mode;
+	u64 data_version, size;
+	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+	kuid_t owner;
+	kgid_t group;
+
+#define EXTRACT(DST)				\
+	do {					\
+		u32 x = ntohl(*bp++);		\
+		changed |= DST - x;		\
+		DST = x;			\
+	} while (0)
+
+	status->if_version = ntohl(*bp++);
+	EXTRACT(status->type);
+	EXTRACT(status->nlink);
+	size = ntohl(*bp++);
+	data_version = ntohl(*bp++);
+	EXTRACT(status->author);
+	owner = make_kuid(&init_user_ns, ntohl(*bp++));
+	changed |= !uid_eq(owner, status->owner);
+	status->owner = owner;
+	EXTRACT(status->caller_access); /* call ticket dependent */
+	EXTRACT(status->anon_access);
+	EXTRACT(status->mode);
+	EXTRACT(status->parent.vnode);
+	EXTRACT(status->parent.unique);
+	bp++; /* seg size */
+	status->mtime_client = ntohl(*bp++);
+	status->mtime_server = ntohl(*bp++);
+	group = make_kgid(&init_user_ns, ntohl(*bp++));
+	changed |= !gid_eq(group, status->group);
+	status->group = group;
+	bp++; /* sync counter */
+	data_version |= (u64) ntohl(*bp++) << 32;
+	EXTRACT(status->lock_count);
+	size |= (u64) ntohl(*bp++) << 32;
+	bp++; /* spare 4 */
+	*_bp = bp;
+
+	if (size != status->size) {
+		status->size = size;
+		changed |= true;
+	}
+	status->mode &= S_IALLUGO;
+
+	_debug("vnode time %lx, %lx",
+	       status->mtime_client, status->mtime_server);
+
+	if (vnode) {
+		status->parent.vid = vnode->fid.vid;
+		if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			_debug("vnode changed");
+			i_size_write(&vnode->vfs_inode, size);
+			vnode->vfs_inode.i_uid = status->owner;
+			vnode->vfs_inode.i_gid = status->group;
+			vnode->vfs_inode.i_generation = vnode->fid.unique;
+			set_nlink(&vnode->vfs_inode, status->nlink);
+
+			mode = vnode->vfs_inode.i_mode;
+			mode &= ~S_IALLUGO;
+			mode |= status->mode;
+			barrier();
+			vnode->vfs_inode.i_mode = mode;
+		}
+
+		vnode->vfs_inode.i_ctime.tv_sec	= status->mtime_server;
+		vnode->vfs_inode.i_mtime	= vnode->vfs_inode.i_ctime;
+		vnode->vfs_inode.i_atime	= vnode->vfs_inode.i_ctime;
+		vnode->vfs_inode.i_version	= data_version;
+	}
+
+	expected_version = status->data_version;
+	if (store_version)
+		expected_version = *store_version;
+
+	if (expected_version != data_version) {
+		status->data_version = data_version;
+		if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			_debug("vnode modified %llx on {%x:%u}",
+			       (unsigned long long) data_version,
+			       vnode->fid.vid, vnode->fid.vnode);
+			set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+		}
+	} else if (store_version) {
+		status->data_version = data_version;
+	}
+}
+
+/*
+ * decode an AFSCallBack block
+ */
+static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+{
+	const __be32 *bp = *_bp;
+
+	vnode->cb_version	= ntohl(*bp++);
+	vnode->cb_expiry	= ntohl(*bp++);
+	vnode->cb_type		= ntohl(*bp++);
+	vnode->cb_expires	= vnode->cb_expiry + get_seconds();
+	*_bp = bp;
+}
+
+static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
+				       struct afs_callback *cb)
+{
+	const __be32 *bp = *_bp;
+
+	cb->version	= ntohl(*bp++);
+	cb->expiry	= ntohl(*bp++);
+	cb->type	= ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
+ * decode an AFSVolSync block
+ */
+static void xdr_decode_AFSVolSync(const __be32 **_bp,
+				  struct afs_volsync *volsync)
+{
+	const __be32 *bp = *_bp;
+
+	volsync->creation = ntohl(*bp++);
+	bp++; /* spare2 */
+	bp++; /* spare3 */
+	bp++; /* spare4 */
+	bp++; /* spare5 */
+	bp++; /* spare6 */
+	*_bp = bp;
+}
+
+/*
+ * encode the requested attributes into an AFSStoreStatus block
+ */
+static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
+{
+	__be32 *bp = *_bp;
+	u32 mask = 0, mtime = 0, owner = 0, group = 0, mode = 0;
+
+	mask = 0;
+	if (attr->ia_valid & ATTR_MTIME) {
+		mask |= AFS_SET_MTIME;
+		mtime = attr->ia_mtime.tv_sec;
+	}
+
+	if (attr->ia_valid & ATTR_UID) {
+		mask |= AFS_SET_OWNER;
+		owner = from_kuid(&init_user_ns, attr->ia_uid);
+	}
+
+	if (attr->ia_valid & ATTR_GID) {
+		mask |= AFS_SET_GROUP;
+		group = from_kgid(&init_user_ns, attr->ia_gid);
+	}
+
+	if (attr->ia_valid & ATTR_MODE) {
+		mask |= AFS_SET_MODE;
+		mode = attr->ia_mode & S_IALLUGO;
+	}
+
+	*bp++ = htonl(mask);
+	*bp++ = htonl(mtime);
+	*bp++ = htonl(owner);
+	*bp++ = htonl(group);
+	*bp++ = htonl(mode);
+	*bp++ = 0;		/* segment size */
+	*_bp = bp;
+}
+
+/*
+ * decode an AFSFetchVolumeStatus block
+ */
+static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
+					    struct afs_volume_status *vs)
+{
+	const __be32 *bp = *_bp;
+
+	vs->vid			= ntohl(*bp++);
+	vs->parent_id		= ntohl(*bp++);
+	vs->online		= ntohl(*bp++);
+	vs->in_service		= ntohl(*bp++);
+	vs->blessed		= ntohl(*bp++);
+	vs->needs_salvage	= ntohl(*bp++);
+	vs->type		= ntohl(*bp++);
+	vs->min_quota		= ntohl(*bp++);
+	vs->max_quota		= ntohl(*bp++);
+	vs->blocks_in_use	= ntohl(*bp++);
+	vs->part_blocks_avail	= ntohl(*bp++);
+	vs->part_max_blocks	= ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
+ * deliver reply data to an FS.FetchStatus
+ */
+static int afs_deliver_fs_fetch_status(struct afs_call *call,
+				       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter(",,%u", last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+	xdr_decode_AFSCallBack(&bp, vnode);
+	if (call->reply2)
+		xdr_decode_AFSVolSync(&bp, call->reply2);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.FetchStatus operation type
+ */
+static const struct afs_call_type afs_RXFSFetchStatus = {
+	.name		= "FS.FetchStatus",
+	.deliver	= afs_deliver_fs_fetch_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * fetch the status information for a file
+ */
+int afs_fs_fetch_file_status(struct afs_server *server,
+			     struct key *key,
+			     struct afs_vnode *vnode,
+			     struct afs_volsync *volsync,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = volsync;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSFETCHSTATUS);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.FetchData
+ */
+static int afs_deliver_fs_fetch_data(struct afs_call *call,
+				     struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+	struct page *page;
+	void *buffer;
+	int ret;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+		if (call->operation_ID != FSFETCHDATA64) {
+			call->unmarshall++;
+			goto no_msw;
+		}
+
+		/* extract the upper part of the returned data length of an
+		 * FSFETCHDATA64 op (which should always be 0 using this
+		 * client) */
+	case 1:
+		_debug("extract data length (MSW)");
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("DATA length MSW: %u", call->count);
+		if (call->count > 0)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+	no_msw:
+		/* extract the returned data length */
+	case 2:
+		_debug("extract data length");
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("DATA length: %u", call->count);
+		if (call->count > PAGE_SIZE)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the returned data */
+	case 3:
+		_debug("extract data");
+		if (call->count > 0) {
+			page = call->reply3;
+			buffer = kmap_atomic(page);
+			ret = afs_extract_data(call, skb, last, buffer,
+					       call->count);
+			kunmap_atomic(buffer);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the metadata */
+	case 4:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       (21 + 3 + 6) * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		bp = call->buffer;
+		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+		xdr_decode_AFSCallBack(&bp, vnode);
+		if (call->reply2)
+			xdr_decode_AFSVolSync(&bp, call->reply2);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 5:
+		_debug("trailer");
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
+	}
+
+	if (!last)
+		return 0;
+
+	if (call->count < PAGE_SIZE) {
+		_debug("clear");
+		page = call->reply3;
+		buffer = kmap_atomic(page);
+		memset(buffer + call->count, 0, PAGE_SIZE - call->count);
+		kunmap_atomic(buffer);
+	}
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.FetchData operation type
+ */
+static const struct afs_call_type afs_RXFSFetchData = {
+	.name		= "FS.FetchData",
+	.deliver	= afs_deliver_fs_fetch_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSFetchData64 = {
+	.name		= "FS.FetchData64",
+	.deliver	= afs_deliver_fs_fetch_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * fetch data from a very large file
+ */
+static int afs_fs_fetch_data64(struct afs_server *server,
+			       struct key *key,
+			       struct afs_vnode *vnode,
+			       off_t offset, size_t length,
+			       struct page *buffer,
+			       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	ASSERTCMP(length, <, ULONG_MAX);
+
+	call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = NULL; /* volsync */
+	call->reply3 = buffer;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->operation_ID = FSFETCHDATA64;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSFETCHDATA64);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+	bp[4] = htonl(upper_32_bits(offset));
+	bp[5] = htonl((u32) offset);
+	bp[6] = 0;
+	bp[7] = htonl((u32) length);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * fetch data from a file
+ */
+int afs_fs_fetch_data(struct afs_server *server,
+		      struct key *key,
+		      struct afs_vnode *vnode,
+		      off_t offset, size_t length,
+		      struct page *buffer,
+		      const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	if (upper_32_bits(offset) || upper_32_bits(offset + length))
+		return afs_fs_fetch_data64(server, key, vnode, offset, length,
+					   buffer, wait_mode);
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = NULL; /* volsync */
+	call->reply3 = buffer;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->operation_ID = FSFETCHDATA;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSFETCHDATA);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+	bp[4] = htonl(offset);
+	bp[5] = htonl(length);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.GiveUpCallBacks
+ */
+static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
+					    struct sk_buff *skb, bool last)
+{
+	_enter(",{%u},%d", skb->len, last);
+
+	if (skb->len > 0)
+		return -EBADMSG; /* shouldn't be any reply data */
+	return 0;
+}
+
+/*
+ * FS.GiveUpCallBacks operation type
+ */
+static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
+	.name		= "FS.GiveUpCallBacks",
+	.deliver	= afs_deliver_fs_give_up_callbacks,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * give up a set of callbacks
+ * - the callbacks are held in the server->cb_break ring
+ */
+int afs_fs_give_up_callbacks(struct afs_server *server,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t ncallbacks;
+	__be32 *bp, *tp;
+	int loop;
+
+	ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
+			      ARRAY_SIZE(server->cb_break));
+
+	_enter("{%zu},", ncallbacks);
+
+	if (ncallbacks == 0)
+		return 0;
+	if (ncallbacks > AFSCBMAX)
+		ncallbacks = AFSCBMAX;
+
+	_debug("break %zu callbacks", ncallbacks);
+
+	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
+				   12 + ncallbacks * 6 * 4, 0);
+	if (!call)
+		return -ENOMEM;
+
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	tp = bp + 2 + ncallbacks * 3;
+	*bp++ = htonl(FSGIVEUPCALLBACKS);
+	*bp++ = htonl(ncallbacks);
+	*tp++ = htonl(ncallbacks);
+
+	atomic_sub(ncallbacks, &server->cb_break_n);
+	for (loop = ncallbacks; loop > 0; loop--) {
+		struct afs_callback *cb =
+			&server->cb_break[server->cb_break_tail];
+
+		*bp++ = htonl(cb->fid.vid);
+		*bp++ = htonl(cb->fid.vnode);
+		*bp++ = htonl(cb->fid.unique);
+		*tp++ = htonl(cb->version);
+		*tp++ = htonl(cb->expiry);
+		*tp++ = htonl(cb->type);
+		smp_mb();
+		server->cb_break_tail =
+			(server->cb_break_tail + 1) &
+			(ARRAY_SIZE(server->cb_break) - 1);
+	}
+
+	ASSERT(ncallbacks > 0);
+	wake_up_nr(&server->cb_break_waitq, ncallbacks);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.CreateFile or an FS.MakeDir
+ */
+static int afs_deliver_fs_create_vnode(struct afs_call *call,
+				       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFid(&bp, call->reply2);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+	xdr_decode_AFSCallBack_raw(&bp, call->reply4);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.CreateFile and FS.MakeDir operation type
+ */
+static const struct afs_call_type afs_RXFSCreateXXXX = {
+	.name		= "FS.CreateXXXX",
+	.deliver	= afs_deliver_fs_create_vnode,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * create a file or make a directory
+ */
+int afs_fs_create(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *vnode,
+		  const char *name,
+		  umode_t mode,
+		  struct afs_fid *newfid,
+		  struct afs_file_status *newstatus,
+		  struct afs_callback *newcb,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
+				   (3 + 21 + 21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->reply4 = newcb;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.RemoveFile or FS.RemoveDir
+ */
+static int afs_deliver_fs_remove(struct afs_call *call,
+				 struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.RemoveDir/FS.RemoveFile operation type
+ */
+static const struct afs_call_type afs_RXFSRemoveXXXX = {
+	.name		= "FS.RemoveXXXX",
+	.deliver	= afs_deliver_fs_remove,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * remove a file or directory
+ */
+int afs_fs_remove(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *vnode,
+		  const char *name,
+		  bool isdir,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.Link
+ */
+static int afs_deliver_fs_link(struct afs_call *call,
+			       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Link operation type
+ */
+static const struct afs_call_type afs_RXFSLink = {
+	.name		= "FS.Link",
+	.deliver	= afs_deliver_fs_link,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * make a hard link
+ */
+int afs_fs_link(struct afs_server *server,
+		struct key *key,
+		struct afs_vnode *dvnode,
+		struct afs_vnode *vnode,
+		const char *name,
+		const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz + (3 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = dvnode;
+	call->reply2 = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSLINK);
+	*bp++ = htonl(dvnode->fid.vid);
+	*bp++ = htonl(dvnode->fid.vnode);
+	*bp++ = htonl(dvnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.Symlink
+ */
+static int afs_deliver_fs_symlink(struct afs_call *call,
+				  struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFid(&bp, call->reply2);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Symlink operation type
+ */
+static const struct afs_call_type afs_RXFSSymlink = {
+	.name		= "FS.Symlink",
+	.deliver	= afs_deliver_fs_symlink,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * create a symbolic link
+ */
+int afs_fs_symlink(struct afs_server *server,
+		   struct key *key,
+		   struct afs_vnode *vnode,
+		   const char *name,
+		   const char *contents,
+		   struct afs_fid *newfid,
+		   struct afs_file_status *newstatus,
+		   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+
+	c_namesz = strlen(contents);
+	c_padsz = (4 - (c_namesz & 3)) & 3;
+
+	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+				   (3 + 21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSYMLINK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(c_namesz);
+	memcpy(bp, contents, c_namesz);
+	bp = (void *) bp + c_namesz;
+	if (c_padsz > 0) {
+		memset(bp, 0, c_padsz);
+		bp = (void *) bp + c_padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(S_IRWXUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.Rename
+ */
+static int afs_deliver_fs_rename(struct afs_call *call,
+				  struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL);
+	if (new_dvnode != orig_dvnode)
+		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
+					  NULL);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Rename operation type
+ */
+static const struct afs_call_type afs_RXFSRename = {
+	.name		= "FS.Rename",
+	.deliver	= afs_deliver_fs_rename,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * create a symbolic link
+ */
+int afs_fs_rename(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *orig_dvnode,
+		  const char *orig_name,
+		  struct afs_vnode *new_dvnode,
+		  const char *new_name,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	o_namesz = strlen(orig_name);
+	o_padsz = (4 - (o_namesz & 3)) & 3;
+
+	n_namesz = strlen(new_name);
+	n_padsz = (4 - (n_namesz & 3)) & 3;
+
+	reqsz = (4 * 4) +
+		4 + o_namesz + o_padsz +
+		(3 * 4) +
+		4 + n_namesz + n_padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = orig_dvnode;
+	call->reply2 = new_dvnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRENAME);
+	*bp++ = htonl(orig_dvnode->fid.vid);
+	*bp++ = htonl(orig_dvnode->fid.vnode);
+	*bp++ = htonl(orig_dvnode->fid.unique);
+	*bp++ = htonl(o_namesz);
+	memcpy(bp, orig_name, o_namesz);
+	bp = (void *) bp + o_namesz;
+	if (o_padsz > 0) {
+		memset(bp, 0, o_padsz);
+		bp = (void *) bp + o_padsz;
+	}
+
+	*bp++ = htonl(new_dvnode->fid.vid);
+	*bp++ = htonl(new_dvnode->fid.vnode);
+	*bp++ = htonl(new_dvnode->fid.unique);
+	*bp++ = htonl(n_namesz);
+	memcpy(bp, new_name, n_namesz);
+	bp = (void *) bp + n_namesz;
+	if (n_padsz > 0) {
+		memset(bp, 0, n_padsz);
+		bp = (void *) bp + n_padsz;
+	}
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.StoreData
+ */
+static int afs_deliver_fs_store_data(struct afs_call *call,
+				     struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter(",,%u", last);
+
+	afs_transfer_reply(call, skb);
+	if (!last) {
+		_leave(" = 0 [more]");
+		return 0;
+	}
+
+	if (call->reply_size != call->reply_max) {
+		_leave(" = -EBADMSG [%u != %u]",
+		       call->reply_size, call->reply_max);
+		return -EBADMSG;
+	}
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode,
+				  &call->store_version);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	afs_pages_written_back(vnode, call);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.StoreData operation type
+ */
+static const struct afs_call_type afs_RXFSStoreData = {
+	.name		= "FS.StoreData",
+	.deliver	= afs_deliver_fs_store_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSStoreData64 = {
+	.name		= "FS.StoreData64",
+	.deliver	= afs_deliver_fs_store_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * store a set of pages to a very large file
+ */
+static int afs_fs_store_data64(struct afs_server *server,
+			       struct afs_writeback *wb,
+			       pgoff_t first, pgoff_t last,
+			       unsigned offset, unsigned to,
+			       loff_t size, loff_t pos, loff_t i_size,
+			       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_vnode *vnode = wb->vnode;
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreData64,
+				   (4 + 6 + 3 * 2) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->wb = wb;
+	call->key = wb->key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->mapping = vnode->vfs_inode.i_mapping;
+	call->first = first;
+	call->last = last;
+	call->first_offset = offset;
+	call->last_to = to;
+	call->send_pages = true;
+	call->store_version = vnode->status.data_version + 1;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTOREDATA64);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	*bp++ = 0; /* mask */
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = 0; /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	*bp++ = htonl(pos >> 32);
+	*bp++ = htonl((u32) pos);
+	*bp++ = htonl(size >> 32);
+	*bp++ = htonl((u32) size);
+	*bp++ = htonl(i_size >> 32);
+	*bp++ = htonl((u32) i_size);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * store a set of pages
+ */
+int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
+		      pgoff_t first, pgoff_t last,
+		      unsigned offset, unsigned to,
+		      const struct afs_wait_mode *wait_mode)
+{
+	struct afs_vnode *vnode = wb->vnode;
+	struct afs_call *call;
+	loff_t size, pos, i_size;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+
+	size = to - offset;
+	if (first != last)
+		size += (loff_t)(last - first) << PAGE_SHIFT;
+	pos = (loff_t)first << PAGE_SHIFT;
+	pos += offset;
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (pos + size > i_size)
+		i_size = size + pos;
+
+	_debug("size %llx, at %llx, i_size %llx",
+	       (unsigned long long) size, (unsigned long long) pos,
+	       (unsigned long long) i_size);
+
+	if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32)
+		return afs_fs_store_data64(server, wb, first, last, offset, to,
+					   size, pos, i_size, wait_mode);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreData,
+				   (4 + 6 + 3) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->wb = wb;
+	call->key = wb->key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->mapping = vnode->vfs_inode.i_mapping;
+	call->first = first;
+	call->last = last;
+	call->first_offset = offset;
+	call->last_to = to;
+	call->send_pages = true;
+	call->store_version = vnode->status.data_version + 1;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTOREDATA);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	*bp++ = 0; /* mask */
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = 0; /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	*bp++ = htonl(pos);
+	*bp++ = htonl(size);
+	*bp++ = htonl(i_size);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.StoreStatus
+ */
+static int afs_deliver_fs_store_status(struct afs_call *call,
+				       struct sk_buff *skb, bool last)
+{
+	afs_dataversion_t *store_version;
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter(",,%u", last);
+
+	afs_transfer_reply(call, skb);
+	if (!last) {
+		_leave(" = 0 [more]");
+		return 0;
+	}
+
+	if (call->reply_size != call->reply_max) {
+		_leave(" = -EBADMSG [%u != %u]",
+		       call->reply_size, call->reply_max);
+		return -EBADMSG;
+	}
+
+	/* unmarshall the reply once we've received all of it */
+	store_version = NULL;
+	if (call->operation_ID == FSSTOREDATA)
+		store_version = &call->store_version;
+
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.StoreStatus operation type
+ */
+static const struct afs_call_type afs_RXFSStoreStatus = {
+	.name		= "FS.StoreStatus",
+	.deliver	= afs_deliver_fs_store_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSStoreData_as_Status = {
+	.name		= "FS.StoreData",
+	.deliver	= afs_deliver_fs_store_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSStoreData64_as_Status = {
+	.name		= "FS.StoreData64",
+	.deliver	= afs_deliver_fs_store_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * set the attributes on a very large file, using FS.StoreData rather than
+ * FS.StoreStatus so as to alter the file size also
+ */
+static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
+				 struct afs_vnode *vnode, struct iattr *attr,
+				 const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+	ASSERT(attr->ia_valid & ATTR_SIZE);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status,
+				   (4 + 6 + 3 * 2) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->store_version = vnode->status.data_version + 1;
+	call->operation_ID = FSSTOREDATA;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTOREDATA64);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	xdr_encode_AFS_StoreStatus(&bp, attr);
+
+	*bp++ = 0;				/* position of start of write */
+	*bp++ = 0;
+	*bp++ = 0;				/* size of write */
+	*bp++ = 0;
+	*bp++ = htonl(attr->ia_size >> 32);	/* new file length */
+	*bp++ = htonl((u32) attr->ia_size);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus
+ * so as to alter the file size also
+ */
+static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
+			       struct afs_vnode *vnode, struct iattr *attr,
+			       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+	ASSERT(attr->ia_valid & ATTR_SIZE);
+	if (attr->ia_size >> 32)
+		return afs_fs_setattr_size64(server, key, vnode, attr,
+					     wait_mode);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status,
+				   (4 + 6 + 3) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->store_version = vnode->status.data_version + 1;
+	call->operation_ID = FSSTOREDATA;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTOREDATA);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	xdr_encode_AFS_StoreStatus(&bp, attr);
+
+	*bp++ = 0;				/* position of start of write */
+	*bp++ = 0;				/* size of write */
+	*bp++ = htonl(attr->ia_size);		/* new file length */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * set the attributes on a file, using FS.StoreData if there's a change in file
+ * size, and FS.StoreStatus otherwise
+ */
+int afs_fs_setattr(struct afs_server *server, struct key *key,
+		   struct afs_vnode *vnode, struct iattr *attr,
+		   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		return afs_fs_setattr_size(server, key, vnode, attr,
+					   wait_mode);
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreStatus,
+				   (4 + 6) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->operation_ID = FSSTORESTATUS;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTORESTATUS);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	xdr_encode_AFS_StoreStatus(&bp, attr);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.GetVolumeStatus
+ */
+static int afs_deliver_fs_get_volume_status(struct afs_call *call,
+					    struct sk_buff *skb, bool last)
+{
+	const __be32 *bp;
+	char *p;
+	int ret;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the returned status record */
+	case 1:
+		_debug("extract status");
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       12 * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		bp = call->buffer;
+		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name length */
+	case 2:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("volname length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name */
+	case 3:
+		_debug("extract volname");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("volname '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_volname_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 4:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_volname_padding:
+
+		/* extract the offline message length */
+	case 5:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("offline msg length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the offline message */
+	case 6:
+		_debug("extract offline");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("offline '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the offline message padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_offline_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 7:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_offline_padding:
+
+		/* extract the message of the day length */
+	case 8:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("motd length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the message of the day */
+	case 9:
+		_debug("extract motd");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("motd '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the message of the day padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_motd_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 10:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_motd_padding:
+
+	case 11:
+		_debug("trailer %d", skb->len);
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
+	}
+
+	if (!last)
+		return 0;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * destroy an FS.GetVolumeStatus call
+ */
+static void afs_get_volume_status_call_destructor(struct afs_call *call)
+{
+	kfree(call->reply3);
+	call->reply3 = NULL;
+	afs_flat_call_destructor(call);
+}
+
+/*
+ * FS.GetVolumeStatus operation type
+ */
+static const struct afs_call_type afs_RXFSGetVolumeStatus = {
+	.name		= "FS.GetVolumeStatus",
+	.deliver	= afs_deliver_fs_get_volume_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_get_volume_status_call_destructor,
+};
+
+/*
+ * fetch the status of a volume
+ */
+int afs_fs_get_volume_status(struct afs_server *server,
+			     struct key *key,
+			     struct afs_vnode *vnode,
+			     struct afs_volume_status *vs,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+	void *tmpbuf;
+
+	_enter("");
+
+	tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
+	if (!tmpbuf)
+		return -ENOMEM;
+
+	call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
+	if (!call) {
+		kfree(tmpbuf);
+		return -ENOMEM;
+	}
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = vs;
+	call->reply3 = tmpbuf;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSGETVOLUMESTATUS);
+	bp[1] = htonl(vnode->fid.vid);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock
+ */
+static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
+				    struct sk_buff *skb, bool last)
+{
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.SetLock operation type
+ */
+static const struct afs_call_type afs_RXFSSetLock = {
+	.name		= "FS.SetLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * FS.ExtendLock operation type
+ */
+static const struct afs_call_type afs_RXFSExtendLock = {
+	.name		= "FS.ExtendLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * FS.ReleaseLock operation type
+ */
+static const struct afs_call_type afs_RXFSReleaseLock = {
+	.name		= "FS.ReleaseLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * get a lock on a file
+ */
+int afs_fs_set_lock(struct afs_server *server,
+		    struct key *key,
+		    struct afs_vnode *vnode,
+		    afs_lock_type_t type,
+		    const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSETLOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(type);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * extend a lock on a file
+ */
+int afs_fs_extend_lock(struct afs_server *server,
+		       struct key *key,
+		       struct afs_vnode *vnode,
+		       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSEXTENDLOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * release a lock on a file
+ */
+int afs_fs_release_lock(struct afs_server *server,
+			struct key *key,
+			struct afs_vnode *vnode,
+			const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRELEASELOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
diff --git a/kernel/fs/afs/inode.c b/kernel/fs/afs/inode.c
new file mode 100644
index 000000000..e06f5a233
--- /dev/null
+++ b/kernel/fs/afs/inode.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+ *
+ * This software may be freely redistributed under the terms of the
+ * GNU General Public License.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *          David Howells <dhowells@redhat.com>
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+struct afs_iget_data {
+	struct afs_fid		fid;
+	struct afs_volume	*volume;	/* volume on which resides */
+};
+
+/*
+ * map the AFS file status to the inode member variables
+ */
+static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
+{
+	struct inode *inode = AFS_VNODE_TO_I(vnode);
+
+	_debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
+	       vnode->status.type,
+	       vnode->status.nlink,
+	       (unsigned long long) vnode->status.size,
+	       vnode->status.data_version,
+	       vnode->status.mode);
+
+	switch (vnode->status.type) {
+	case AFS_FTYPE_FILE:
+		inode->i_mode	= S_IFREG | vnode->status.mode;
+		inode->i_op	= &afs_file_inode_operations;
+		inode->i_fop	= &afs_file_operations;
+		break;
+	case AFS_FTYPE_DIR:
+		inode->i_mode	= S_IFDIR | vnode->status.mode;
+		inode->i_op	= &afs_dir_inode_operations;
+		inode->i_fop	= &afs_dir_file_operations;
+		break;
+	case AFS_FTYPE_SYMLINK:
+		inode->i_mode	= S_IFLNK | vnode->status.mode;
+		inode->i_op	= &page_symlink_inode_operations;
+		break;
+	default:
+		printk("kAFS: AFS vnode with undefined type\n");
+		return -EBADMSG;
+	}
+
+#ifdef CONFIG_AFS_FSCACHE
+	if (vnode->status.size != inode->i_size)
+		fscache_attr_changed(vnode->cache);
+#endif
+
+	set_nlink(inode, vnode->status.nlink);
+	inode->i_uid		= vnode->status.owner;
+	inode->i_gid		= GLOBAL_ROOT_GID;
+	inode->i_size		= vnode->status.size;
+	inode->i_ctime.tv_sec	= vnode->status.mtime_server;
+	inode->i_ctime.tv_nsec	= 0;
+	inode->i_atime		= inode->i_mtime = inode->i_ctime;
+	inode->i_blocks		= 0;
+	inode->i_generation	= vnode->fid.unique;
+	inode->i_version	= vnode->status.data_version;
+	inode->i_mapping->a_ops	= &afs_fs_aops;
+
+	/* check to see whether a symbolic link is really a mountpoint */
+	if (vnode->status.type == AFS_FTYPE_SYMLINK) {
+		afs_mntpt_check_symlink(vnode, key);
+
+		if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) {
+			inode->i_mode	= S_IFDIR | vnode->status.mode;
+			inode->i_op	= &afs_mntpt_inode_operations;
+			inode->i_fop	= &afs_mntpt_file_operations;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * iget5() comparator
+ */
+static int afs_iget5_test(struct inode *inode, void *opaque)
+{
+	struct afs_iget_data *data = opaque;
+
+	return inode->i_ino == data->fid.vnode &&
+		inode->i_generation == data->fid.unique;
+}
+
+/*
+ * iget5() comparator for inode created by autocell operations
+ *
+ * These pseudo inodes don't match anything.
+ */
+static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
+{
+	return 0;
+}
+
+/*
+ * iget5() inode initialiser
+ */
+static int afs_iget5_set(struct inode *inode, void *opaque)
+{
+	struct afs_iget_data *data = opaque;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+
+	inode->i_ino = data->fid.vnode;
+	inode->i_generation = data->fid.unique;
+	vnode->fid = data->fid;
+	vnode->volume = data->volume;
+
+	return 0;
+}
+
+/*
+ * inode retrieval for autocell
+ */
+struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
+				int namesz, struct key *key)
+{
+	struct afs_iget_data data;
+	struct afs_super_info *as;
+	struct afs_vnode *vnode;
+	struct super_block *sb;
+	struct inode *inode;
+	static atomic_t afs_autocell_ino;
+
+	_enter("{%x:%u},%*.*s,",
+	       AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode,
+	       namesz, namesz, dev_name ?: "");
+
+	sb = dir->i_sb;
+	as = sb->s_fs_info;
+	data.volume = as->volume;
+	data.fid.vid = as->volume->vid;
+	data.fid.unique = 0;
+	data.fid.vnode = 0;
+
+	inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
+			     afs_iget5_autocell_test, afs_iget5_set,
+			     &data);
+	if (!inode) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	_debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
+	       inode, inode->i_ino, data.fid.vid, data.fid.vnode,
+	       data.fid.unique);
+
+	vnode = AFS_FS_I(inode);
+
+	/* there shouldn't be an existing inode */
+	BUG_ON(!(inode->i_state & I_NEW));
+
+	inode->i_size		= 0;
+	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
+	inode->i_op		= &afs_autocell_inode_operations;
+	set_nlink(inode, 2);
+	inode->i_uid		= GLOBAL_ROOT_UID;
+	inode->i_gid		= GLOBAL_ROOT_GID;
+	inode->i_ctime.tv_sec	= get_seconds();
+	inode->i_ctime.tv_nsec	= 0;
+	inode->i_atime		= inode->i_mtime = inode->i_ctime;
+	inode->i_blocks		= 0;
+	inode->i_version	= 0;
+	inode->i_generation	= 0;
+
+	set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
+	set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+	inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
+	unlock_new_inode(inode);
+	_leave(" = %p", inode);
+	return inode;
+}
+
+/*
+ * inode retrieval
+ */
+struct inode *afs_iget(struct super_block *sb, struct key *key,
+		       struct afs_fid *fid, struct afs_file_status *status,
+		       struct afs_callback *cb)
+{
+	struct afs_iget_data data = { .fid = *fid };
+	struct afs_super_info *as;
+	struct afs_vnode *vnode;
+	struct inode *inode;
+	int ret;
+
+	_enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique);
+
+	as = sb->s_fs_info;
+	data.volume = as->volume;
+
+	inode = iget5_locked(sb, fid->vnode, afs_iget5_test, afs_iget5_set,
+			     &data);
+	if (!inode) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	_debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+	       inode, fid->vid, fid->vnode, fid->unique);
+
+	vnode = AFS_FS_I(inode);
+
+	/* deal with an existing inode */
+	if (!(inode->i_state & I_NEW)) {
+		_leave(" = %p", inode);
+		return inode;
+	}
+
+	if (!status) {
+		/* it's a remotely extant inode */
+		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto bad_inode;
+	} else {
+		/* it's an inode we just created */
+		memcpy(&vnode->status, status, sizeof(vnode->status));
+
+		if (!cb) {
+			/* it's a symlink we just created (the fileserver
+			 * didn't give us a callback) */
+			vnode->cb_version = 0;
+			vnode->cb_expiry = 0;
+			vnode->cb_type = 0;
+			vnode->cb_expires = get_seconds();
+		} else {
+			vnode->cb_version = cb->version;
+			vnode->cb_expiry = cb->expiry;
+			vnode->cb_type = cb->type;
+			vnode->cb_expires = vnode->cb_expiry + get_seconds();
+		}
+	}
+
+	/* set up caching before mapping the status, as map-status reads the
+	 * first page of symlinks to see if they're really mountpoints */
+	inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+					      &afs_vnode_cache_index_def,
+					      vnode, true);
+#endif
+
+	ret = afs_inode_map_status(vnode, key);
+	if (ret < 0)
+		goto bad_inode;
+
+	/* success */
+	clear_bit(AFS_VNODE_UNSET, &vnode->flags);
+	inode->i_flags |= S_NOATIME;
+	unlock_new_inode(inode);
+	_leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type);
+	return inode;
+
+	/* failure */
+bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
+	iget_failed(inode);
+	_leave(" = %d [bad]", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * mark the data attached to an inode as obsolete due to a write on the server
+ * - might also want to ditch all the outstanding writes and dirty pages
+ */
+void afs_zap_data(struct afs_vnode *vnode)
+{
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	/* nuke all the non-dirty pages that aren't locked, mapped or being
+	 * written back in a regular file and completely discard the pages in a
+	 * directory or symlink */
+	if (S_ISREG(vnode->vfs_inode.i_mode))
+		invalidate_remote_inode(&vnode->vfs_inode);
+	else
+		invalidate_inode_pages2(vnode->vfs_inode.i_mapping);
+}
+
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ *     symlink)
+ *   - parent dir metadata changed (security changes)
+ *   - dentry data changed (write, truncate)
+ *   - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+	int ret;
+
+	_enter("{v={%x:%u} fl=%lx},%x",
+	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+	       key_serial(key));
+
+	if (vnode->cb_promised &&
+	    !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) &&
+	    !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+		if (vnode->cb_expires < get_seconds() + 10) {
+			_debug("callback expired");
+			set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		} else {
+			goto valid;
+		}
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		goto valid;
+
+	mutex_lock(&vnode->validate_lock);
+
+	/* if the promise has expired, we need to check the server again to get
+	 * a new promise - note that if the (parent) directory's metadata was
+	 * changed then the security may be different and we may no longer have
+	 * access */
+	if (!vnode->cb_promised ||
+	    test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		_debug("not promised");
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error_unlock;
+		_debug("new promise [fl=%lx]", vnode->flags);
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		_debug("file already deleted");
+		ret = -ESTALE;
+		goto error_unlock;
+	}
+
+	/* if the vnode's data version number changed then its contents are
+	 * different */
+	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+		afs_zap_data(vnode);
+
+	clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+	mutex_unlock(&vnode->validate_lock);
+valid:
+	_leave(" = 0");
+	return 0;
+
+error_unlock:
+	mutex_unlock(&vnode->validate_lock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * read the attributes of an inode
+ */
+int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		      struct kstat *stat)
+{
+	struct inode *inode;
+
+	inode = d_inode(dentry);
+
+	_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+/*
+ * discard an AFS inode
+ */
+int afs_drop_inode(struct inode *inode)
+{
+	_enter("");
+
+	if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
+		return generic_delete_inode(inode);
+	else
+		return generic_drop_inode(inode);
+}
+
+/*
+ * clear an AFS inode
+ */
+void afs_evict_inode(struct inode *inode)
+{
+	struct afs_permits *permits;
+	struct afs_vnode *vnode;
+
+	vnode = AFS_FS_I(inode);
+
+	_enter("{%x:%u.%d} v=%u x=%u t=%u }",
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       vnode->cb_version,
+	       vnode->cb_expiry,
+	       vnode->cb_type);
+
+	_debug("CLEAR INODE %p", inode);
+
+	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+
+	afs_give_up_callback(vnode);
+
+	if (vnode->server) {
+		spin_lock(&vnode->server->fs_lock);
+		rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+		spin_unlock(&vnode->server->fs_lock);
+		afs_put_server(vnode->server);
+		vnode->server = NULL;
+	}
+
+	ASSERT(list_empty(&vnode->writebacks));
+	ASSERT(!vnode->cb_promised);
+
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
+
+	mutex_lock(&vnode->permits_lock);
+	permits = vnode->permits;
+	rcu_assign_pointer(vnode->permits, NULL);
+	mutex_unlock(&vnode->permits_lock);
+	if (permits)
+		call_rcu(&permits->rcu, afs_zap_permits);
+
+	_leave("");
+}
+
+/*
+ * set the attributes of an inode
+ */
+int afs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+	struct key *key;
+	int ret;
+
+	_enter("{%x:%u},{n=%pd},%x",
+	       vnode->fid.vid, vnode->fid.vnode, dentry,
+	       attr->ia_valid);
+
+	if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
+				ATTR_MTIME))) {
+		_leave(" = 0 [unsupported]");
+		return 0;
+	}
+
+	/* flush any dirty data outstanding on a regular file */
+	if (S_ISREG(vnode->vfs_inode.i_mode)) {
+		filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+		afs_writeback_all(vnode);
+	}
+
+	if (attr->ia_valid & ATTR_FILE) {
+		key = attr->ia_file->private_data;
+	} else {
+		key = afs_request_key(vnode->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+			goto error;
+		}
+	}
+
+	ret = afs_vnode_setattr(vnode, key, attr);
+	if (!(attr->ia_valid & ATTR_FILE))
+		key_put(key);
+
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/kernel/fs/afs/internal.h b/kernel/fs/afs/internal.h
new file mode 100644
index 000000000..71d598231
--- /dev/null
+++ b/kernel/fs/afs/internal.h
@@ -0,0 +1,889 @@
+/* internal AFS stuff
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/skbuff.h>
+#include <linux/rxrpc.h>
+#include <linux/key.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/fscache.h>
+#include <linux/backing-dev.h>
+
+#include "afs.h"
+#include "afs_vl.h"
+
+#define AFS_CELL_MAX_ADDRS 15
+
+struct pagevec;
+struct afs_call;
+
+typedef enum {
+	AFS_VL_NEW,			/* new, uninitialised record */
+	AFS_VL_CREATING,		/* creating record */
+	AFS_VL_VALID,			/* record is pending */
+	AFS_VL_NO_VOLUME,		/* no such volume available */
+	AFS_VL_UPDATING,		/* update in progress */
+	AFS_VL_VOLUME_DELETED,		/* volume was deleted */
+	AFS_VL_UNCERTAIN,		/* uncertain state (update failed) */
+} __attribute__((packed)) afs_vlocation_state_t;
+
+struct afs_mount_params {
+	bool			rwpath;		/* T if the parent should be considered R/W */
+	bool			force;		/* T to force cell type */
+	bool			autocell;	/* T if set auto mount operation */
+	afs_voltype_t		type;		/* type of volume requested */
+	int			volnamesz;	/* size of volume name */
+	const char		*volname;	/* name of volume to mount */
+	struct afs_cell		*cell;		/* cell in which to find volume */
+	struct afs_volume	*volume;	/* volume record */
+	struct key		*key;		/* key to use for secure mounting */
+};
+
+/*
+ * definition of how to wait for the completion of an operation
+ */
+struct afs_wait_mode {
+	/* RxRPC received message notification */
+	void (*rx_wakeup)(struct afs_call *call);
+
+	/* synchronous call waiter and call dispatched notification */
+	int (*wait)(struct afs_call *call);
+
+	/* asynchronous call completion */
+	void (*async_complete)(void *reply, int error);
+};
+
+extern const struct afs_wait_mode afs_sync_call;
+extern const struct afs_wait_mode afs_async_call;
+
+/*
+ * a record of an in-progress RxRPC call
+ */
+struct afs_call {
+	const struct afs_call_type *type;	/* type of call */
+	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
+	wait_queue_head_t	waitq;		/* processes awaiting completion */
+	void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
+	struct work_struct	async_work;	/* asynchronous work processor */
+	struct work_struct	work;		/* actual work processor */
+	struct sk_buff_head	rx_queue;	/* received packets */
+	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
+	struct key		*key;		/* security for this call */
+	struct afs_server	*server;	/* server affected by incoming CM call */
+	void			*request;	/* request data (first part) */
+	struct address_space	*mapping;	/* page set */
+	struct afs_writeback	*wb;		/* writeback being performed */
+	void			*buffer;	/* reply receive buffer */
+	void			*reply;		/* reply buffer (first part) */
+	void			*reply2;	/* reply buffer (second part) */
+	void			*reply3;	/* reply buffer (third part) */
+	void			*reply4;	/* reply buffer (fourth part) */
+	pgoff_t			first;		/* first page in mapping to deal with */
+	pgoff_t			last;		/* last page in mapping to deal with */
+	enum {					/* call state */
+		AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
+		AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
+		AFS_CALL_AWAIT_OP_ID,	/* awaiting op ID on incoming call */
+		AFS_CALL_AWAIT_REQUEST,	/* awaiting request data on incoming call */
+		AFS_CALL_REPLYING,	/* replying to incoming call */
+		AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
+		AFS_CALL_COMPLETE,	/* successfully completed */
+		AFS_CALL_BUSY,		/* server was busy */
+		AFS_CALL_ABORTED,	/* call was aborted */
+		AFS_CALL_ERROR,		/* call failed due to error */
+	}			state;
+	int			error;		/* error code */
+	unsigned		request_size;	/* size of request data */
+	unsigned		reply_max;	/* maximum size of reply */
+	unsigned		reply_size;	/* current size of reply */
+	unsigned		first_offset;	/* offset into mapping[first] */
+	unsigned		last_to;	/* amount of mapping[last] */
+	unsigned		offset;		/* offset into received data store */
+	unsigned char		unmarshall;	/* unmarshalling phase */
+	bool			incoming;	/* T if incoming call */
+	bool			send_pages;	/* T if data from mapping should be sent */
+	u16			service_id;	/* RxRPC service ID to call */
+	__be16			port;		/* target UDP port */
+	__be32			operation_ID;	/* operation ID for an incoming call */
+	u32			count;		/* count for use in unmarshalling */
+	__be32			tmp;		/* place to extract temporary data */
+	afs_dataversion_t	store_version;	/* updated version expected from store */
+};
+
+struct afs_call_type {
+	const char *name;
+
+	/* deliver request or reply data to an call
+	 * - returning an error will cause the call to be aborted
+	 */
+	int (*deliver)(struct afs_call *call, struct sk_buff *skb,
+		       bool last);
+
+	/* map an abort code to an error number */
+	int (*abort_to_error)(u32 abort_code);
+
+	/* clean up a call */
+	void (*destructor)(struct afs_call *call);
+};
+
+/*
+ * record of an outstanding writeback on a vnode
+ */
+struct afs_writeback {
+	struct list_head	link;		/* link in vnode->writebacks */
+	struct work_struct	writer;		/* work item to perform the writeback */
+	struct afs_vnode	*vnode;		/* vnode to which this write applies */
+	struct key		*key;		/* owner of this write */
+	wait_queue_head_t	waitq;		/* completion and ready wait queue */
+	pgoff_t			first;		/* first page in batch */
+	pgoff_t			point;		/* last page in current store op */
+	pgoff_t			last;		/* last page in batch (inclusive) */
+	unsigned		offset_first;	/* offset into first page of start of write */
+	unsigned		to_last;	/* offset into last page of end of write */
+	int			num_conflicts;	/* count of conflicting writes in list */
+	int			usage;
+	bool			conflicts;	/* T if has dependent conflicts */
+	enum {
+		AFS_WBACK_SYNCING,		/* synchronisation being performed */
+		AFS_WBACK_PENDING,		/* write pending */
+		AFS_WBACK_CONFLICTING,		/* conflicting writes posted */
+		AFS_WBACK_WRITING,		/* writing back */
+		AFS_WBACK_COMPLETE		/* the writeback record has been unlinked */
+	} state __attribute__((packed));
+};
+
+/*
+ * AFS superblock private data
+ * - there's one superblock per volume
+ */
+struct afs_super_info {
+	struct afs_volume	*volume;	/* volume record */
+	char			rwparent;	/* T if parent is R/W AFS volume */
+};
+
+static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+extern struct file_system_type afs_fs_type;
+
+/*
+ * entry in the cached cell catalogue
+ */
+struct afs_cache_cell {
+	char		name[AFS_MAXCELLNAME];	/* cell name (padded with NULs) */
+	struct in_addr	vl_servers[15];		/* cached cell VL servers */
+};
+
+/*
+ * AFS cell record
+ */
+struct afs_cell {
+	atomic_t		usage;
+	struct list_head	link;		/* main cell list link */
+	struct key		*anonymous_key;	/* anonymous user key for this cell */
+	struct list_head	proc_link;	/* /proc cell list link */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
+#endif
+
+	/* server record management */
+	rwlock_t		servers_lock;	/* active server list lock */
+	struct list_head	servers;	/* active server list */
+
+	/* volume location record management */
+	struct rw_semaphore	vl_sem;		/* volume management serialisation semaphore */
+	struct list_head	vl_list;	/* cell's active VL record list */
+	spinlock_t		vl_lock;	/* vl_list lock */
+	unsigned short		vl_naddrs;	/* number of VL servers in addr list */
+	unsigned short		vl_curr_svix;	/* current server index */
+	struct in_addr		vl_addrs[AFS_CELL_MAX_ADDRS];	/* cell VL server addresses */
+
+	char			name[0];	/* cell name - must go last */
+};
+
+/*
+ * entry in the cached volume location catalogue
+ */
+struct afs_cache_vlocation {
+	/* volume name (lowercase, padded with NULs) */
+	uint8_t			name[AFS_MAXVOLNAME + 1];
+
+	uint8_t			nservers;	/* number of entries used in servers[] */
+	uint8_t			vidmask;	/* voltype mask for vid[] */
+	uint8_t			srvtmask[8];	/* voltype masks for servers[] */
+#define AFS_VOL_VTM_RW	0x01 /* R/W version of the volume is available (on this server) */
+#define AFS_VOL_VTM_RO	0x02 /* R/O version of the volume is available (on this server) */
+#define AFS_VOL_VTM_BAK	0x04 /* backup version of the volume is available (on this server) */
+
+	afs_volid_t		vid[3];		/* volume IDs for R/W, R/O and Bak volumes */
+	struct in_addr		servers[8];	/* fileserver addresses */
+	time_t			rtime;		/* last retrieval time */
+};
+
+/*
+ * volume -> vnode hash table entry
+ */
+struct afs_cache_vhash {
+	afs_voltype_t		vtype;		/* which volume variation */
+	uint8_t			hash_bucket;	/* which hash bucket this represents */
+} __attribute__((packed));
+
+/*
+ * AFS volume location record
+ */
+struct afs_vlocation {
+	atomic_t		usage;
+	time_t			time_of_death;	/* time at which put reduced usage to 0 */
+	struct list_head	link;		/* link in cell volume location list */
+	struct list_head	grave;		/* link in master graveyard list */
+	struct list_head	update;		/* link in master update list */
+	struct afs_cell		*cell;		/* cell to which volume belongs */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
+#endif
+	struct afs_cache_vlocation vldb;	/* volume information DB record */
+	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
+	wait_queue_head_t	waitq;		/* status change waitqueue */
+	time_t			update_at;	/* time at which record should be updated */
+	spinlock_t		lock;		/* access lock */
+	afs_vlocation_state_t	state;		/* volume location state */
+	unsigned short		upd_rej_cnt;	/* ENOMEDIUM count during update */
+	unsigned short		upd_busy_cnt;	/* EBUSY count during update */
+	bool			valid;		/* T if valid */
+};
+
+/*
+ * AFS fileserver record
+ */
+struct afs_server {
+	atomic_t		usage;
+	time_t			time_of_death;	/* time at which put reduced usage to 0 */
+	struct in_addr		addr;		/* server address */
+	struct afs_cell		*cell;		/* cell in which server resides */
+	struct list_head	link;		/* link in cell's server list */
+	struct list_head	grave;		/* link in master graveyard list */
+	struct rb_node		master_rb;	/* link in master by-addr tree */
+	struct rw_semaphore	sem;		/* access lock */
+
+	/* file service access */
+	struct rb_root		fs_vnodes;	/* vnodes backed by this server (ordered by FID) */
+	unsigned long		fs_act_jif;	/* time at which last activity occurred */
+	unsigned long		fs_dead_jif;	/* time at which no longer to be considered dead */
+	spinlock_t		fs_lock;	/* access lock */
+	int			fs_state;      	/* 0 or reason FS currently marked dead (-errno) */
+
+	/* callback promise management */
+	struct rb_root		cb_promises;	/* vnode expiration list (ordered earliest first) */
+	struct delayed_work	cb_updater;	/* callback updater */
+	struct delayed_work	cb_break_work;	/* collected break dispatcher */
+	wait_queue_head_t	cb_break_waitq;	/* space available in cb_break waitqueue */
+	spinlock_t		cb_lock;	/* access lock */
+	struct afs_callback	cb_break[64];	/* ring of callbacks awaiting breaking */
+	atomic_t		cb_break_n;	/* number of pending breaks */
+	u8			cb_break_head;	/* head of callback breaking ring */
+	u8			cb_break_tail;	/* tail of callback breaking ring */
+};
+
+/*
+ * AFS volume access record
+ */
+struct afs_volume {
+	atomic_t		usage;
+	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
+	struct afs_vlocation	*vlocation;	/* volume location */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
+#endif
+	afs_volid_t		vid;		/* volume ID */
+	afs_voltype_t		type;		/* type of volume */
+	char			type_force;	/* force volume type (suppress R/O -> R/W) */
+	unsigned short		nservers;	/* number of server slots filled */
+	unsigned short		rjservers;	/* number of servers discarded due to -ENOMEDIUM */
+	struct afs_server	*servers[8];	/* servers on which volume resides (ordered) */
+	struct rw_semaphore	server_sem;	/* lock for accessing current server */
+	struct backing_dev_info	bdi;
+};
+
+/*
+ * vnode catalogue entry
+ */
+struct afs_cache_vnode {
+	afs_vnodeid_t		vnode_id;	/* vnode ID */
+	unsigned		vnode_unique;	/* vnode ID uniquifier */
+	afs_dataversion_t	data_version;	/* data version */
+};
+
+/*
+ * AFS inode private data
+ */
+struct afs_vnode {
+	struct inode		vfs_inode;	/* the VFS's inode record */
+
+	struct afs_volume	*volume;	/* volume on which vnode resides */
+	struct afs_server	*server;	/* server currently supplying this file */
+	struct afs_fid		fid;		/* the file identifier for this inode */
+	struct afs_file_status	status;		/* AFS status info for this file */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
+#endif
+	struct afs_permits	*permits;	/* cache of permits so far obtained */
+	struct mutex		permits_lock;	/* lock for altering permits list */
+	struct mutex		validate_lock;	/* lock for validating this vnode */
+	wait_queue_head_t	update_waitq;	/* status fetch waitqueue */
+	int			update_cnt;	/* number of outstanding ops that will update the
+						 * status */
+	spinlock_t		writeback_lock;	/* lock for writebacks */
+	spinlock_t		lock;		/* waitqueue/flags lock */
+	unsigned long		flags;
+#define AFS_VNODE_CB_BROKEN	0		/* set if vnode's callback was broken */
+#define AFS_VNODE_UNSET		1		/* set if vnode attributes not yet set */
+#define AFS_VNODE_MODIFIED	2		/* set if vnode's data modified */
+#define AFS_VNODE_ZAP_DATA	3		/* set if vnode's data should be invalidated */
+#define AFS_VNODE_DELETED	4		/* set if vnode deleted on server */
+#define AFS_VNODE_MOUNTPOINT	5		/* set if vnode is a mountpoint symlink */
+#define AFS_VNODE_LOCKING	6		/* set if waiting for lock on vnode */
+#define AFS_VNODE_READLOCKED	7		/* set if vnode is read-locked on the server */
+#define AFS_VNODE_WRITELOCKED	8		/* set if vnode is write-locked on the server */
+#define AFS_VNODE_UNLOCKING	9		/* set if vnode is being unlocked on the server */
+#define AFS_VNODE_AUTOCELL	10		/* set if Vnode is an auto mount point */
+#define AFS_VNODE_PSEUDODIR	11		/* set if Vnode is a pseudo directory */
+
+	long			acl_order;	/* ACL check count (callback break count) */
+
+	struct list_head	writebacks;	/* alterations in pagecache that need writing */
+	struct list_head	pending_locks;	/* locks waiting to be granted */
+	struct list_head	granted_locks;	/* locks granted on this file */
+	struct delayed_work	lock_work;	/* work to be done in locking */
+	struct key		*unlock_key;	/* key to be used in unlocking */
+
+	/* outstanding callback notification on this file */
+	struct rb_node		server_rb;	/* link in server->fs_vnodes */
+	struct rb_node		cb_promise;	/* link in server->cb_promises */
+	struct work_struct	cb_broken_work;	/* work to be done on callback break */
+	time_t			cb_expires;	/* time at which callback expires */
+	time_t			cb_expires_at;	/* time used to order cb_promise */
+	unsigned		cb_version;	/* callback version */
+	unsigned		cb_expiry;	/* callback expiry time */
+	afs_callback_type_t	cb_type;	/* type of callback */
+	bool			cb_promised;	/* true if promise still holds */
+};
+
+/*
+ * cached security record for one user's attempt to access a vnode
+ */
+struct afs_permit {
+	struct key		*key;		/* RxRPC ticket holding a security context */
+	afs_access_t		access_mask;	/* access mask for this key */
+};
+
+/*
+ * cache of security records from attempts to access a vnode
+ */
+struct afs_permits {
+	struct rcu_head		rcu;		/* disposal procedure */
+	int			count;		/* number of records */
+	struct afs_permit	permits[0];	/* the permits so far examined */
+};
+
+/*
+ * record of one of a system's set of network interfaces
+ */
+struct afs_interface {
+	struct in_addr	address;	/* IPv4 address bound to interface */
+	struct in_addr	netmask;	/* netmask applied to address */
+	unsigned	mtu;		/* MTU of interface */
+};
+
+/*
+ * UUID definition [internet draft]
+ * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
+ *   increments since midnight 15th October 1582
+ *   - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
+ *     time
+ * - the clock sequence is a 14-bit counter to avoid duplicate times
+ */
+struct afs_uuid {
+	u32		time_low;			/* low part of timestamp */
+	u16		time_mid;			/* mid part of timestamp */
+	u16		time_hi_and_version;		/* high part of timestamp and version  */
+#define AFS_UUID_TO_UNIX_TIME	0x01b21dd213814000ULL
+#define AFS_UUID_TIMEHI_MASK	0x0fff
+#define AFS_UUID_VERSION_TIME	0x1000	/* time-based UUID */
+#define AFS_UUID_VERSION_NAME	0x3000	/* name-based UUID */
+#define AFS_UUID_VERSION_RANDOM	0x4000	/* (pseudo-)random generated UUID */
+	u8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
+#define AFS_UUID_CLOCKHI_MASK	0x3f
+#define AFS_UUID_VARIANT_STD	0x80
+	u8		clock_seq_low;			/* clock seq low */
+	u8		node[6];			/* spatially unique node ID (MAC addr) */
+};
+
+/*****************************************************************************/
+/*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
+ * callback.c
+ */
+extern void afs_init_callback_state(struct afs_server *);
+extern void afs_broken_callback_work(struct work_struct *);
+extern void afs_break_callbacks(struct afs_server *, size_t,
+				struct afs_callback[]);
+extern void afs_discard_callback_on_delete(struct afs_vnode *);
+extern void afs_give_up_callback(struct afs_vnode *);
+extern void afs_dispatch_give_up_callbacks(struct work_struct *);
+extern void afs_flush_callback_breaks(struct afs_server *);
+extern int __init afs_callback_update_init(void);
+extern void afs_callback_update_kill(void);
+
+/*
+ * cell.c
+ */
+extern struct rw_semaphore afs_proc_cells_sem;
+extern struct list_head afs_proc_cells;
+
+#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
+extern int afs_cell_init(char *);
+extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
+extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
+extern struct afs_cell *afs_grab_cell(struct afs_cell *);
+extern void afs_put_cell(struct afs_cell *);
+extern void afs_cell_purge(void);
+
+/*
+ * cmservice.c
+ */
+extern bool afs_cm_incoming_call(struct afs_call *);
+
+/*
+ * dir.c
+ */
+extern const struct inode_operations afs_dir_inode_operations;
+extern const struct dentry_operations afs_fs_dentry_operations;
+extern const struct file_operations afs_dir_file_operations;
+
+/*
+ * file.c
+ */
+extern const struct address_space_operations afs_fs_aops;
+extern const struct inode_operations afs_file_inode_operations;
+extern const struct file_operations afs_file_operations;
+
+extern int afs_open(struct inode *, struct file *);
+extern int afs_release(struct inode *, struct file *);
+extern int afs_page_filler(void *, struct page *);
+
+/*
+ * flock.c
+ */
+extern void __exit afs_kill_lock_manager(void);
+extern void afs_lock_work(struct work_struct *);
+extern void afs_lock_may_be_available(struct afs_vnode *);
+extern int afs_lock(struct file *, int, struct file_lock *);
+extern int afs_flock(struct file *, int, struct file_lock *);
+
+/*
+ * fsclient.c
+ */
+extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
+				    struct afs_vnode *, struct afs_volsync *,
+				    const struct afs_wait_mode *);
+extern int afs_fs_give_up_callbacks(struct afs_server *,
+				    const struct afs_wait_mode *);
+extern int afs_fs_fetch_data(struct afs_server *, struct key *,
+			     struct afs_vnode *, off_t, size_t, struct page *,
+			     const struct afs_wait_mode *);
+extern int afs_fs_create(struct afs_server *, struct key *,
+			 struct afs_vnode *, const char *, umode_t,
+			 struct afs_fid *, struct afs_file_status *,
+			 struct afs_callback *,
+			 const struct afs_wait_mode *);
+extern int afs_fs_remove(struct afs_server *, struct key *,
+			 struct afs_vnode *, const char *, bool,
+			 const struct afs_wait_mode *);
+extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *,
+		       struct afs_vnode *, const char *,
+		       const struct afs_wait_mode *);
+extern int afs_fs_symlink(struct afs_server *, struct key *,
+			  struct afs_vnode *, const char *, const char *,
+			  struct afs_fid *, struct afs_file_status *,
+			  const struct afs_wait_mode *);
+extern int afs_fs_rename(struct afs_server *, struct key *,
+			 struct afs_vnode *, const char *,
+			 struct afs_vnode *, const char *,
+			 const struct afs_wait_mode *);
+extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *,
+			     pgoff_t, pgoff_t, unsigned, unsigned,
+			     const struct afs_wait_mode *);
+extern int afs_fs_setattr(struct afs_server *, struct key *,
+			  struct afs_vnode *, struct iattr *,
+			  const struct afs_wait_mode *);
+extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
+				    struct afs_vnode *,
+				    struct afs_volume_status *,
+				    const struct afs_wait_mode *);
+extern int afs_fs_set_lock(struct afs_server *, struct key *,
+			   struct afs_vnode *, afs_lock_type_t,
+			   const struct afs_wait_mode *);
+extern int afs_fs_extend_lock(struct afs_server *, struct key *,
+			      struct afs_vnode *,
+			      const struct afs_wait_mode *);
+extern int afs_fs_release_lock(struct afs_server *, struct key *,
+			       struct afs_vnode *,
+			       const struct afs_wait_mode *);
+
+/*
+ * inode.c
+ */
+extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
+				       struct key *);
+extern struct inode *afs_iget(struct super_block *, struct key *,
+			      struct afs_fid *, struct afs_file_status *,
+			      struct afs_callback *);
+extern void afs_zap_data(struct afs_vnode *);
+extern int afs_validate(struct afs_vnode *, struct key *);
+extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int afs_setattr(struct dentry *, struct iattr *);
+extern void afs_evict_inode(struct inode *);
+extern int afs_drop_inode(struct inode *);
+
+/*
+ * main.c
+ */
+extern struct workqueue_struct *afs_wq;
+extern struct afs_uuid afs_uuid;
+
+/*
+ * misc.c
+ */
+extern int afs_abort_to_error(u32);
+
+/*
+ * mntpt.c
+ */
+extern const struct inode_operations afs_mntpt_inode_operations;
+extern const struct inode_operations afs_autocell_inode_operations;
+extern const struct file_operations afs_mntpt_file_operations;
+
+extern struct vfsmount *afs_d_automount(struct path *);
+extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
+extern void afs_mntpt_kill_timer(void);
+
+/*
+ * proc.c
+ */
+extern int afs_proc_init(void);
+extern void afs_proc_cleanup(void);
+extern int afs_proc_cell_setup(struct afs_cell *);
+extern void afs_proc_cell_remove(struct afs_cell *);
+
+/*
+ * rxrpc.c
+ */
+extern int afs_open_socket(void);
+extern void afs_close_socket(void);
+extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
+			 const struct afs_wait_mode *);
+extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
+					    size_t, size_t);
+extern void afs_flat_call_destructor(struct afs_call *);
+extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
+extern void afs_send_empty_reply(struct afs_call *);
+extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
+extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
+			    size_t);
+
+/*
+ * security.c
+ */
+extern void afs_clear_permits(struct afs_vnode *);
+extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
+extern void afs_zap_permits(struct rcu_head *);
+extern struct key *afs_request_key(struct afs_cell *);
+extern int afs_permission(struct inode *, int);
+
+/*
+ * server.c
+ */
+extern spinlock_t afs_server_peer_lock;
+
+#define afs_get_server(S)					\
+do {								\
+	_debug("GET SERVER %d", atomic_read(&(S)->usage));	\
+	atomic_inc(&(S)->usage);				\
+} while(0)
+
+extern struct afs_server *afs_lookup_server(struct afs_cell *,
+					    const struct in_addr *);
+extern struct afs_server *afs_find_server(const struct in_addr *);
+extern void afs_put_server(struct afs_server *);
+extern void __exit afs_purge_servers(void);
+
+/*
+ * super.c
+ */
+extern int afs_fs_init(void);
+extern void afs_fs_exit(void);
+
+/*
+ * use-rtnetlink.c
+ */
+extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
+extern int afs_get_MAC_address(u8 *, size_t);
+
+/*
+ * vlclient.c
+ */
+extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
+				    const char *, struct afs_cache_vlocation *,
+				    const struct afs_wait_mode *);
+extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
+				  afs_volid_t, afs_voltype_t,
+				  struct afs_cache_vlocation *,
+				  const struct afs_wait_mode *);
+
+/*
+ * vlocation.c
+ */
+#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
+
+extern int __init afs_vlocation_update_init(void);
+extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
+						  struct key *,
+						  const char *, size_t);
+extern void afs_put_vlocation(struct afs_vlocation *);
+extern void afs_vlocation_purge(void);
+
+/*
+ * vnode.c
+ */
+static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
+{
+	return container_of(inode, struct afs_vnode, vfs_inode);
+}
+
+static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
+{
+	return &vnode->vfs_inode;
+}
+
+extern void afs_vnode_finalise_status_update(struct afs_vnode *,
+					     struct afs_server *);
+extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
+				  struct key *);
+extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
+				off_t, size_t, struct page *);
+extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
+			    umode_t, struct afs_fid *, struct afs_file_status *,
+			    struct afs_callback *, struct afs_server **);
+extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *,
+			    bool);
+extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *,
+			  const char *);
+extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
+			     const char *, struct afs_fid *,
+			     struct afs_file_status *, struct afs_server **);
+extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
+			    struct key *, const char *, const char *);
+extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
+				unsigned, unsigned);
+extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
+extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *,
+				       struct afs_volume_status *);
+extern int afs_vnode_set_lock(struct afs_vnode *, struct key *,
+			      afs_lock_type_t);
+extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *);
+extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
+
+/*
+ * volume.c
+ */
+#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
+
+extern void afs_put_volume(struct afs_volume *);
+extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *);
+extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
+extern int afs_volume_release_fileserver(struct afs_vnode *,
+					 struct afs_server *, int);
+
+/*
+ * write.c
+ */
+extern int afs_set_page_dirty(struct page *);
+extern void afs_put_writeback(struct afs_writeback *);
+extern int afs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata);
+extern int afs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata);
+extern int afs_writepage(struct page *, struct writeback_control *);
+extern int afs_writepages(struct address_space *, struct writeback_control *);
+extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
+extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
+extern int afs_writeback_all(struct afs_vnode *);
+extern int afs_fsync(struct file *, loff_t, loff_t, int);
+
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+extern unsigned afs_debug;
+
+#define dbgprintk(FMT,...) \
+	printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
+
+#define kenter(FMT,...)	dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define kleave(FMT,...)	dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define kdebug(FMT,...)	dbgprintk("    "FMT ,##__VA_ARGS__)
+
+
+#if defined(__KDEBUG)
+#define _enter(FMT,...)	kenter(FMT,##__VA_ARGS__)
+#define _leave(FMT,...)	kleave(FMT,##__VA_ARGS__)
+#define _debug(FMT,...)	kdebug(FMT,##__VA_ARGS__)
+
+#elif defined(CONFIG_AFS_DEBUG)
+#define AFS_DEBUG_KENTER	0x01
+#define AFS_DEBUG_KLEAVE	0x02
+#define AFS_DEBUG_KDEBUG	0x04
+
+#define _enter(FMT,...)					\
+do {							\
+	if (unlikely(afs_debug & AFS_DEBUG_KENTER))	\
+		kenter(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#define _leave(FMT,...)					\
+do {							\
+	if (unlikely(afs_debug & AFS_DEBUG_KLEAVE))	\
+		kleave(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#define _debug(FMT,...)					\
+do {							\
+	if (unlikely(afs_debug & AFS_DEBUG_KDEBUG))	\
+		kdebug(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#else
+#define _enter(FMT,...)	no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...)	no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...)	no_printk("    "FMT ,##__VA_ARGS__)
+#endif
+
+/*
+ * debug assertion checking
+ */
+#if 1 // defined(__KDEBUGALL)
+
+#define ASSERT(X)						\
+do {								\
+	if (unlikely(!(X))) {					\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "AFS: Assertion failed\n");	\
+		BUG();						\
+	}							\
+} while(0)
+
+#define ASSERTCMP(X, OP, Y)						\
+do {									\
+	if (unlikely(!((X) OP (Y)))) {					\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "AFS: Assertion failed\n");		\
+		printk(KERN_ERR "%lu " #OP " %lu is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n",	\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while(0)
+
+#define ASSERTRANGE(L, OP1, N, OP2, H)					\
+do {									\
+	if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) {		\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "AFS: Assertion failed\n");		\
+		printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n",	\
+		       (unsigned long)(L), (unsigned long)(N),		\
+		       (unsigned long)(H));				\
+		printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \
+		       (unsigned long)(L), (unsigned long)(N),		\
+		       (unsigned long)(H));				\
+		BUG();							\
+	}								\
+} while(0)
+
+#define ASSERTIF(C, X)						\
+do {								\
+	if (unlikely((C) && !(X))) {				\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "AFS: Assertion failed\n");	\
+		BUG();						\
+	}							\
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y)					\
+do {									\
+	if (unlikely((C) && !((X) OP (Y)))) {				\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "AFS: Assertion failed\n");		\
+		printk(KERN_ERR "%lu " #OP " %lu is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n",	\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while(0)
+
+#else
+
+#define ASSERT(X)				\
+do {						\
+} while(0)
+
+#define ASSERTCMP(X, OP, Y)			\
+do {						\
+} while(0)
+
+#define ASSERTRANGE(L, OP1, N, OP2, H)		\
+do {						\
+} while(0)
+
+#define ASSERTIF(C, X)				\
+do {						\
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y)		\
+do {						\
+} while(0)
+
+#endif /* __KDEBUGALL */
diff --git a/kernel/fs/afs/main.c b/kernel/fs/afs/main.c
new file mode 100644
index 000000000..35de0c047
--- /dev/null
+++ b/kernel/fs/afs/main.c
@@ -0,0 +1,184 @@
+/* AFS client file system
+ *
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/completion.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("AFS Client File System");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+unsigned afs_debug;
+module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "AFS debugging mask");
+
+static char *rootcell;
+
+module_param(rootcell, charp, 0);
+MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
+
+struct afs_uuid afs_uuid;
+struct workqueue_struct *afs_wq;
+
+/*
+ * get a client UUID
+ */
+static int __init afs_get_client_UUID(void)
+{
+	struct timespec ts;
+	u64 uuidtime;
+	u16 clockseq;
+	int ret;
+
+	/* read the MAC address of one of the external interfaces and construct
+	 * a UUID from it */
+	ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node));
+	if (ret < 0)
+		return ret;
+
+	getnstimeofday(&ts);
+	uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10;
+	uuidtime += ts.tv_nsec / 100;
+	uuidtime += AFS_UUID_TO_UNIX_TIME;
+	afs_uuid.time_low = uuidtime;
+	afs_uuid.time_mid = uuidtime >> 32;
+	afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
+	afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
+
+	get_random_bytes(&clockseq, 2);
+	afs_uuid.clock_seq_low = clockseq;
+	afs_uuid.clock_seq_hi_and_reserved =
+		(clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
+	afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
+
+	_debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+	       afs_uuid.time_low,
+	       afs_uuid.time_mid,
+	       afs_uuid.time_hi_and_version,
+	       afs_uuid.clock_seq_hi_and_reserved,
+	       afs_uuid.clock_seq_low,
+	       afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2],
+	       afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]);
+
+	return 0;
+}
+
+/*
+ * initialise the AFS client FS module
+ */
+static int __init afs_init(void)
+{
+	int ret;
+
+	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
+
+	ret = afs_get_client_UUID();
+	if (ret < 0)
+		return ret;
+
+	/* create workqueue */
+	ret = -ENOMEM;
+	afs_wq = alloc_workqueue("afs", 0, 0);
+	if (!afs_wq)
+		return ret;
+
+	/* register the /proc stuff */
+	ret = afs_proc_init();
+	if (ret < 0)
+		goto error_proc;
+
+#ifdef CONFIG_AFS_FSCACHE
+	/* we want to be able to cache */
+	ret = fscache_register_netfs(&afs_cache_netfs);
+	if (ret < 0)
+		goto error_cache;
+#endif
+
+	/* initialise the cell DB */
+	ret = afs_cell_init(rootcell);
+	if (ret < 0)
+		goto error_cell_init;
+
+	/* initialise the VL update process */
+	ret = afs_vlocation_update_init();
+	if (ret < 0)
+		goto error_vl_update_init;
+
+	/* initialise the callback update process */
+	ret = afs_callback_update_init();
+	if (ret < 0)
+		goto error_callback_update_init;
+
+	/* create the RxRPC transport */
+	ret = afs_open_socket();
+	if (ret < 0)
+		goto error_open_socket;
+
+	/* register the filesystems */
+	ret = afs_fs_init();
+	if (ret < 0)
+		goto error_fs;
+
+	return ret;
+
+error_fs:
+	afs_close_socket();
+error_open_socket:
+	afs_callback_update_kill();
+error_callback_update_init:
+	afs_vlocation_purge();
+error_vl_update_init:
+	afs_cell_purge();
+error_cell_init:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
+error_cache:
+#endif
+	afs_proc_cleanup();
+error_proc:
+	destroy_workqueue(afs_wq);
+	rcu_barrier();
+	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
+	return ret;
+}
+
+/* XXX late_initcall is kludgy, but the only alternative seems to create
+ * a transport upon the first mount, which is worse. Or is it?
+ */
+late_initcall(afs_init);	/* must be called after net/ to create socket */
+
+/*
+ * clean up on module removal
+ */
+static void __exit afs_exit(void)
+{
+	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
+
+	afs_fs_exit();
+	afs_kill_lock_manager();
+	afs_close_socket();
+	afs_purge_servers();
+	afs_callback_update_kill();
+	afs_vlocation_purge();
+	destroy_workqueue(afs_wq);
+	afs_cell_purge();
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
+#endif
+	afs_proc_cleanup();
+	rcu_barrier();
+}
+
+module_exit(afs_exit);
diff --git a/kernel/fs/afs/misc.c b/kernel/fs/afs/misc.c
new file mode 100644
index 000000000..91ea1aa0d
--- /dev/null
+++ b/kernel/fs/afs/misc.c
@@ -0,0 +1,89 @@
+/* miscellaneous bits
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <rxrpc/packet.h>
+#include "internal.h"
+#include "afs_fs.h"
+
+/*
+ * convert an AFS abort code to a Linux error number
+ */
+int afs_abort_to_error(u32 abort_code)
+{
+	switch (abort_code) {
+	/* low errno codes inserted into abort namespace */
+	case 13:		return -EACCES;
+	case 27:		return -EFBIG;
+	case 30:		return -EROFS;
+
+	/* VICE "special error" codes; 101 - 111 */
+	case VSALVAGE:		return -EIO;
+	case VNOVNODE:		return -ENOENT;
+	case VNOVOL:		return -ENOMEDIUM;
+	case VVOLEXISTS:	return -EEXIST;
+	case VNOSERVICE:	return -EIO;
+	case VOFFLINE:		return -ENOENT;
+	case VONLINE:		return -EEXIST;
+	case VDISKFULL:		return -ENOSPC;
+	case VOVERQUOTA:	return -EDQUOT;
+	case VBUSY:		return -EBUSY;
+	case VMOVED:		return -ENXIO;
+
+	/* Unified AFS error table; ET "uae" == 0x2f6df00 */
+	case 0x2f6df00:		return -EPERM;
+	case 0x2f6df01:		return -ENOENT;
+	case 0x2f6df04:		return -EIO;
+	case 0x2f6df0a:		return -EAGAIN;
+	case 0x2f6df0b:		return -ENOMEM;
+	case 0x2f6df0c:		return -EACCES;
+	case 0x2f6df0f:		return -EBUSY;
+	case 0x2f6df10:		return -EEXIST;
+	case 0x2f6df11:		return -EXDEV;
+	case 0x2f6df12:		return -ENODEV;
+	case 0x2f6df13:		return -ENOTDIR;
+	case 0x2f6df14:		return -EISDIR;
+	case 0x2f6df15:		return -EINVAL;
+	case 0x2f6df1a:		return -EFBIG;
+	case 0x2f6df1b:		return -ENOSPC;
+	case 0x2f6df1d:		return -EROFS;
+	case 0x2f6df1e:		return -EMLINK;
+	case 0x2f6df20:		return -EDOM;
+	case 0x2f6df21:		return -ERANGE;
+	case 0x2f6df22:		return -EDEADLK;
+	case 0x2f6df23:		return -ENAMETOOLONG;
+	case 0x2f6df24:		return -ENOLCK;
+	case 0x2f6df26:		return -ENOTEMPTY;
+	case 0x2f6df28:		return -EWOULDBLOCK;
+	case 0x2f6df69:		return -ENOTCONN;
+	case 0x2f6df6c:		return -ETIMEDOUT;
+	case 0x2f6df78:		return -EDQUOT;
+
+	/* RXKAD abort codes; from include/rxrpc/packet.h.  ET "RXK" == 0x1260B00 */
+	case RXKADINCONSISTENCY: return -EPROTO;
+	case RXKADPACKETSHORT:	return -EPROTO;
+	case RXKADLEVELFAIL:	return -EKEYREJECTED;
+	case RXKADTICKETLEN:	return -EKEYREJECTED;
+	case RXKADOUTOFSEQUENCE: return -EPROTO;
+	case RXKADNOAUTH:	return -EKEYREJECTED;
+	case RXKADBADKEY:	return -EKEYREJECTED;
+	case RXKADBADTICKET:	return -EKEYREJECTED;
+	case RXKADUNKNOWNKEY:	return -EKEYREJECTED;
+	case RXKADEXPIRED:	return -EKEYEXPIRED;
+	case RXKADSEALEDINCON:	return -EKEYREJECTED;
+	case RXKADDATALEN:	return -EKEYREJECTED;
+	case RXKADILLEGALLEVEL:	return -EKEYREJECTED;
+
+	default:		return -EREMOTEIO;
+	}
+}
diff --git a/kernel/fs/afs/mntpt.c b/kernel/fs/afs/mntpt.c
new file mode 100644
index 000000000..ccd0b212e
--- /dev/null
+++ b/kernel/fs/afs/mntpt.c
@@ -0,0 +1,270 @@
+/* mountpoint management
+ *
+ * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/gfp.h>
+#include "internal.h"
+
+
+static struct dentry *afs_mntpt_lookup(struct inode *dir,
+				       struct dentry *dentry,
+				       unsigned int flags);
+static int afs_mntpt_open(struct inode *inode, struct file *file);
+static void afs_mntpt_expiry_timed_out(struct work_struct *work);
+
+const struct file_operations afs_mntpt_file_operations = {
+	.open		= afs_mntpt_open,
+	.llseek		= noop_llseek,
+};
+
+const struct inode_operations afs_mntpt_inode_operations = {
+	.lookup		= afs_mntpt_lookup,
+	.readlink	= page_readlink,
+	.getattr	= afs_getattr,
+};
+
+const struct inode_operations afs_autocell_inode_operations = {
+	.getattr	= afs_getattr,
+};
+
+static LIST_HEAD(afs_vfsmounts);
+static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
+
+static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
+
+/*
+ * check a symbolic link to see whether it actually encodes a mountpoint
+ * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately
+ */
+int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
+{
+	struct page *page;
+	size_t size;
+	char *buf;
+	int ret;
+
+	_enter("{%x:%u,%u}",
+	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+
+	/* read the contents of the symlink into the pagecache */
+	page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
+			       afs_page_filler, key);
+	if (IS_ERR(page)) {
+		ret = PTR_ERR(page);
+		goto out;
+	}
+
+	ret = -EIO;
+	if (PageError(page))
+		goto out_free;
+
+	buf = kmap(page);
+
+	/* examine the symlink's contents */
+	size = vnode->status.size;
+	_debug("symlink to %*.*s", (int) size, (int) size, buf);
+
+	if (size > 2 &&
+	    (buf[0] == '%' || buf[0] == '#') &&
+	    buf[size - 1] == '.'
+	    ) {
+		_debug("symlink is a mountpoint");
+		spin_lock(&vnode->lock);
+		set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+		vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
+		spin_unlock(&vnode->lock);
+	}
+
+	ret = 0;
+
+	kunmap(page);
+out_free:
+	page_cache_release(page);
+out:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * no valid lookup procedure on this sort of dir
+ */
+static struct dentry *afs_mntpt_lookup(struct inode *dir,
+				       struct dentry *dentry,
+				       unsigned int flags)
+{
+	_enter("%p,%p{%pd2}", dir, dentry, dentry);
+	return ERR_PTR(-EREMOTE);
+}
+
+/*
+ * no valid open procedure on this sort of dir
+ */
+static int afs_mntpt_open(struct inode *inode, struct file *file)
+{
+	_enter("%p,%p{%pD2}", inode, file, file);
+	return -EREMOTE;
+}
+
+/*
+ * create a vfsmount to be automounted
+ */
+static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+{
+	struct afs_super_info *super;
+	struct vfsmount *mnt;
+	struct afs_vnode *vnode;
+	struct page *page;
+	char *devname, *options;
+	bool rwpath = false;
+	int ret;
+
+	_enter("{%pd}", mntpt);
+
+	BUG_ON(!d_inode(mntpt));
+
+	ret = -ENOMEM;
+	devname = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!devname)
+		goto error_no_devname;
+
+	options = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!options)
+		goto error_no_options;
+
+	vnode = AFS_FS_I(d_inode(mntpt));
+	if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
+		/* if the directory is a pseudo directory, use the d_name */
+		static const char afs_root_cell[] = ":root.cell.";
+		unsigned size = mntpt->d_name.len;
+
+		ret = -ENOENT;
+		if (size < 2 || size > AFS_MAXCELLNAME)
+			goto error_no_page;
+
+		if (mntpt->d_name.name[0] == '.') {
+			devname[0] = '#';
+			memcpy(devname + 1, mntpt->d_name.name, size - 1);
+			memcpy(devname + size, afs_root_cell,
+			       sizeof(afs_root_cell));
+			rwpath = true;
+		} else {
+			devname[0] = '%';
+			memcpy(devname + 1, mntpt->d_name.name, size);
+			memcpy(devname + size + 1, afs_root_cell,
+			       sizeof(afs_root_cell));
+		}
+	} else {
+		/* read the contents of the AFS special symlink */
+		loff_t size = i_size_read(d_inode(mntpt));
+		char *buf;
+
+		ret = -EINVAL;
+		if (size > PAGE_SIZE - 1)
+			goto error_no_page;
+
+		page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			goto error_no_page;
+		}
+
+		ret = -EIO;
+		if (PageError(page))
+			goto error;
+
+		buf = kmap_atomic(page);
+		memcpy(devname, buf, size);
+		kunmap_atomic(buf);
+		page_cache_release(page);
+		page = NULL;
+	}
+
+	/* work out what options we want */
+	super = AFS_FS_S(mntpt->d_sb);
+	memcpy(options, "cell=", 5);
+	strcpy(options + 5, super->volume->cell->name);
+	if (super->volume->type == AFSVL_RWVOL || rwpath)
+		strcat(options, ",rwpath");
+
+	/* try and do the mount */
+	_debug("--- attempting mount %s -o %s ---", devname, options);
+	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
+	_debug("--- mount result %p ---", mnt);
+
+	free_page((unsigned long) devname);
+	free_page((unsigned long) options);
+	_leave(" = %p", mnt);
+	return mnt;
+
+error:
+	page_cache_release(page);
+error_no_page:
+	free_page((unsigned long) options);
+error_no_options:
+	free_page((unsigned long) devname);
+error_no_devname:
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * handle an automount point
+ */
+struct vfsmount *afs_d_automount(struct path *path)
+{
+	struct vfsmount *newmnt;
+
+	_enter("{%pd}", path->dentry);
+
+	newmnt = afs_mntpt_do_automount(path->dentry);
+	if (IS_ERR(newmnt))
+		return newmnt;
+
+	mntget(newmnt); /* prevent immediate expiration */
+	mnt_set_expiry(newmnt, &afs_vfsmounts);
+	queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+			   afs_mntpt_expiry_timeout * HZ);
+	_leave(" = %p", newmnt);
+	return newmnt;
+}
+
+/*
+ * handle mountpoint expiry timer going off
+ */
+static void afs_mntpt_expiry_timed_out(struct work_struct *work)
+{
+	_enter("");
+
+	if (!list_empty(&afs_vfsmounts)) {
+		mark_mounts_for_expiry(&afs_vfsmounts);
+		queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+				   afs_mntpt_expiry_timeout * HZ);
+	}
+
+	_leave("");
+}
+
+/*
+ * kill the AFS mountpoint timer if it's still running
+ */
+void afs_mntpt_kill_timer(void)
+{
+	_enter("");
+
+	ASSERT(list_empty(&afs_vfsmounts));
+	cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
+}
diff --git a/kernel/fs/afs/netdevices.c b/kernel/fs/afs/netdevices.c
new file mode 100644
index 000000000..7ad36506c
--- /dev/null
+++ b/kernel/fs/afs/netdevices.c
@@ -0,0 +1,68 @@
+/* AFS network device helpers
+ *
+ * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/string.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <net/net_namespace.h>
+#include "internal.h"
+
+/*
+ * get a MAC address from a random ethernet interface that has a real one
+ * - the buffer will normally be 6 bytes in size
+ */
+int afs_get_MAC_address(u8 *mac, size_t maclen)
+{
+	struct net_device *dev;
+	int ret = -ENODEV;
+
+	BUG_ON(maclen != ETH_ALEN);
+
+	rtnl_lock();
+	dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
+	if (dev) {
+		memcpy(mac, dev->dev_addr, maclen);
+		ret = 0;
+	}
+	rtnl_unlock();
+	return ret;
+}
+
+/*
+ * get a list of this system's interface IPv4 addresses, netmasks and MTUs
+ * - maxbufs must be at least 1
+ * - returns the number of interface records in the buffer
+ */
+int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
+			    bool wantloopback)
+{
+	struct net_device *dev;
+	struct in_device *idev;
+	int n = 0;
+
+	ASSERT(maxbufs > 0);
+
+	rtnl_lock();
+	for_each_netdev(&init_net, dev) {
+		if (dev->type == ARPHRD_LOOPBACK && !wantloopback)
+			continue;
+		idev = __in_dev_get_rtnl(dev);
+		if (!idev)
+			continue;
+		for_primary_ifa(idev) {
+			bufs[n].address.s_addr = ifa->ifa_address;
+			bufs[n].netmask.s_addr = ifa->ifa_mask;
+			bufs[n].mtu = dev->mtu;
+			n++;
+			if (n >= maxbufs)
+				goto out;
+		} endfor_ifa(idev);
+	}
+out:
+	rtnl_unlock();
+	return n;
+}
diff --git a/kernel/fs/afs/proc.c b/kernel/fs/afs/proc.c
new file mode 100644
index 000000000..24a905b07
--- /dev/null
+++ b/kernel/fs/afs/proc.c
@@ -0,0 +1,666 @@
+/* /proc interface for AFS
+ *
+ * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static struct proc_dir_entry *proc_afs;
+
+
+static int afs_proc_cells_open(struct inode *inode, struct file *file);
+static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos);
+static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos);
+static void afs_proc_cells_stop(struct seq_file *p, void *v);
+static int afs_proc_cells_show(struct seq_file *m, void *v);
+static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
+				    size_t size, loff_t *_pos);
+
+static const struct seq_operations afs_proc_cells_ops = {
+	.start	= afs_proc_cells_start,
+	.next	= afs_proc_cells_next,
+	.stop	= afs_proc_cells_stop,
+	.show	= afs_proc_cells_show,
+};
+
+static const struct file_operations afs_proc_cells_fops = {
+	.open		= afs_proc_cells_open,
+	.read		= seq_read,
+	.write		= afs_proc_cells_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
+				      size_t size, loff_t *_pos);
+static ssize_t afs_proc_rootcell_write(struct file *file,
+				       const char __user *buf,
+				       size_t size, loff_t *_pos);
+
+static const struct file_operations afs_proc_rootcell_fops = {
+	.read		= afs_proc_rootcell_read,
+	.write		= afs_proc_rootcell_write,
+	.llseek		= no_llseek,
+};
+
+static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file);
+static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos);
+static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
+					loff_t *pos);
+static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v);
+static int afs_proc_cell_volumes_show(struct seq_file *m, void *v);
+
+static const struct seq_operations afs_proc_cell_volumes_ops = {
+	.start	= afs_proc_cell_volumes_start,
+	.next	= afs_proc_cell_volumes_next,
+	.stop	= afs_proc_cell_volumes_stop,
+	.show	= afs_proc_cell_volumes_show,
+};
+
+static const struct file_operations afs_proc_cell_volumes_fops = {
+	.open		= afs_proc_cell_volumes_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int afs_proc_cell_vlservers_open(struct inode *inode,
+					struct file *file);
+static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos);
+static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
+					  loff_t *pos);
+static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v);
+static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v);
+
+static const struct seq_operations afs_proc_cell_vlservers_ops = {
+	.start	= afs_proc_cell_vlservers_start,
+	.next	= afs_proc_cell_vlservers_next,
+	.stop	= afs_proc_cell_vlservers_stop,
+	.show	= afs_proc_cell_vlservers_show,
+};
+
+static const struct file_operations afs_proc_cell_vlservers_fops = {
+	.open		= afs_proc_cell_vlservers_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int afs_proc_cell_servers_open(struct inode *inode, struct file *file);
+static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos);
+static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
+					loff_t *pos);
+static void afs_proc_cell_servers_stop(struct seq_file *p, void *v);
+static int afs_proc_cell_servers_show(struct seq_file *m, void *v);
+
+static const struct seq_operations afs_proc_cell_servers_ops = {
+	.start	= afs_proc_cell_servers_start,
+	.next	= afs_proc_cell_servers_next,
+	.stop	= afs_proc_cell_servers_stop,
+	.show	= afs_proc_cell_servers_show,
+};
+
+static const struct file_operations afs_proc_cell_servers_fops = {
+	.open		= afs_proc_cell_servers_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * initialise the /proc/fs/afs/ directory
+ */
+int afs_proc_init(void)
+{
+	_enter("");
+
+	proc_afs = proc_mkdir("fs/afs", NULL);
+	if (!proc_afs)
+		goto error_dir;
+
+	if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) ||
+	    !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops))
+		goto error_tree;
+
+	_leave(" = 0");
+	return 0;
+
+error_tree:
+	remove_proc_subtree("fs/afs", NULL);
+error_dir:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/afs/ directory
+ */
+void afs_proc_cleanup(void)
+{
+	remove_proc_subtree("fs/afs", NULL);
+}
+
+/*
+ * open "/proc/fs/afs/cells" which provides a summary of extant cells
+ */
+static int afs_proc_cells_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open(file, &afs_proc_cells_ops);
+	if (ret < 0)
+		return ret;
+
+	m = file->private_data;
+	m->private = PDE_DATA(inode);
+
+	return 0;
+}
+
+/*
+ * set up the iterator to start reading from the cells list and return the
+ * first item
+ */
+static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
+{
+	/* lock the list against modification */
+	down_read(&afs_proc_cells_sem);
+	return seq_list_start_head(&afs_proc_cells, *_pos);
+}
+
+/*
+ * move to next cell in cells list
+ */
+static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &afs_proc_cells, pos);
+}
+
+/*
+ * clean up after reading from the cells list
+ */
+static void afs_proc_cells_stop(struct seq_file *p, void *v)
+{
+	up_read(&afs_proc_cells_sem);
+}
+
+/*
+ * display a header line followed by a load of cell lines
+ */
+static int afs_proc_cells_show(struct seq_file *m, void *v)
+{
+	struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
+
+	if (v == &afs_proc_cells) {
+		/* display header on line 1 */
+		seq_puts(m, "USE NAME\n");
+		return 0;
+	}
+
+	/* display one cell per line on subsequent lines */
+	seq_printf(m, "%3d %s\n",
+		   atomic_read(&cell->usage), cell->name);
+	return 0;
+}
+
+/*
+ * handle writes to /proc/fs/afs/cells
+ * - to add cells: echo "add <cellname> <IP>[:<IP>][:<IP>]"
+ */
+static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
+				    size_t size, loff_t *_pos)
+{
+	char *kbuf, *name, *args;
+	int ret;
+
+	/* start by dragging the command into memory */
+	if (size <= 1 || size >= PAGE_SIZE)
+		return -EINVAL;
+
+	kbuf = kmalloc(size + 1, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	ret = -EFAULT;
+	if (copy_from_user(kbuf, buf, size) != 0)
+		goto done;
+	kbuf[size] = 0;
+
+	/* trim to first NL */
+	name = memchr(kbuf, '\n', size);
+	if (name)
+		*name = 0;
+
+	/* split into command, name and argslist */
+	name = strchr(kbuf, ' ');
+	if (!name)
+		goto inval;
+	do {
+		*name++ = 0;
+	} while(*name == ' ');
+	if (!*name)
+		goto inval;
+
+	args = strchr(name, ' ');
+	if (!args)
+		goto inval;
+	do {
+		*args++ = 0;
+	} while(*args == ' ');
+	if (!*args)
+		goto inval;
+
+	/* determine command to perform */
+	_debug("cmd=%s name=%s args=%s", kbuf, name, args);
+
+	if (strcmp(kbuf, "add") == 0) {
+		struct afs_cell *cell;
+
+		cell = afs_cell_create(name, strlen(name), args, false);
+		if (IS_ERR(cell)) {
+			ret = PTR_ERR(cell);
+			goto done;
+		}
+
+		afs_put_cell(cell);
+		printk("kAFS: Added new cell '%s'\n", name);
+	} else {
+		goto inval;
+	}
+
+	ret = size;
+
+done:
+	kfree(kbuf);
+	_leave(" = %d", ret);
+	return ret;
+
+inval:
+	ret = -EINVAL;
+	printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n");
+	goto done;
+}
+
+static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
+				      size_t size, loff_t *_pos)
+{
+	return 0;
+}
+
+/*
+ * handle writes to /proc/fs/afs/rootcell
+ * - to initialize rootcell: echo "cell.name:192.168.231.14"
+ */
+static ssize_t afs_proc_rootcell_write(struct file *file,
+				       const char __user *buf,
+				       size_t size, loff_t *_pos)
+{
+	char *kbuf, *s;
+	int ret;
+
+	/* start by dragging the command into memory */
+	if (size <= 1 || size >= PAGE_SIZE)
+		return -EINVAL;
+
+	ret = -ENOMEM;
+	kbuf = kmalloc(size + 1, GFP_KERNEL);
+	if (!kbuf)
+		goto nomem;
+
+	ret = -EFAULT;
+	if (copy_from_user(kbuf, buf, size) != 0)
+		goto infault;
+	kbuf[size] = 0;
+
+	/* trim to first NL */
+	s = memchr(kbuf, '\n', size);
+	if (s)
+		*s = 0;
+
+	/* determine command to perform */
+	_debug("rootcell=%s", kbuf);
+
+	ret = afs_cell_init(kbuf);
+	if (ret >= 0)
+		ret = size;	/* consume everything, always */
+
+infault:
+	kfree(kbuf);
+nomem:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * initialise /proc/fs/afs/<cell>/
+ */
+int afs_proc_cell_setup(struct afs_cell *cell)
+{
+	struct proc_dir_entry *dir;
+
+	_enter("%p{%s}", cell, cell->name);
+
+	dir = proc_mkdir(cell->name, proc_afs);
+	if (!dir)
+		goto error_dir;
+
+	if (!proc_create_data("servers", 0, dir,
+			     &afs_proc_cell_servers_fops, cell) ||
+	    !proc_create_data("vlservers", 0, dir,
+			     &afs_proc_cell_vlservers_fops, cell) ||
+	    !proc_create_data("volumes", 0, dir,
+			     &afs_proc_cell_volumes_fops, cell))
+		goto error_tree;
+
+	_leave(" = 0");
+	return 0;
+
+error_tree:
+	remove_proc_subtree(cell->name, proc_afs);
+error_dir:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * remove /proc/fs/afs/<cell>/
+ */
+void afs_proc_cell_remove(struct afs_cell *cell)
+{
+	_enter("");
+
+	remove_proc_subtree(cell->name, proc_afs);
+
+	_leave("");
+}
+
+/*
+ * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells
+ */
+static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
+{
+	struct afs_cell *cell;
+	struct seq_file *m;
+	int ret;
+
+	cell = PDE_DATA(inode);
+	if (!cell)
+		return -ENOENT;
+
+	ret = seq_open(file, &afs_proc_cell_volumes_ops);
+	if (ret < 0)
+		return ret;
+
+	m = file->private_data;
+	m->private = cell;
+
+	return 0;
+}
+
+/*
+ * set up the iterator to start reading from the cells list and return the
+ * first item
+ */
+static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
+{
+	struct afs_cell *cell = m->private;
+
+	_enter("cell=%p pos=%Ld", cell, *_pos);
+
+	/* lock the list against modification */
+	down_read(&cell->vl_sem);
+	return seq_list_start_head(&cell->vl_list, *_pos);
+}
+
+/*
+ * move to next cell in cells list
+ */
+static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
+					loff_t *_pos)
+{
+	struct afs_cell *cell = p->private;
+
+	_enter("cell=%p pos=%Ld", cell, *_pos);
+	return seq_list_next(v, &cell->vl_list, _pos);
+}
+
+/*
+ * clean up after reading from the cells list
+ */
+static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
+{
+	struct afs_cell *cell = p->private;
+
+	up_read(&cell->vl_sem);
+}
+
+static const char afs_vlocation_states[][4] = {
+	[AFS_VL_NEW]			= "New",
+	[AFS_VL_CREATING]		= "Crt",
+	[AFS_VL_VALID]			= "Val",
+	[AFS_VL_NO_VOLUME]		= "NoV",
+	[AFS_VL_UPDATING]		= "Upd",
+	[AFS_VL_VOLUME_DELETED]		= "Del",
+	[AFS_VL_UNCERTAIN]		= "Unc",
+};
+
+/*
+ * display a header line followed by a load of volume lines
+ */
+static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
+{
+	struct afs_cell *cell = m->private;
+	struct afs_vlocation *vlocation =
+		list_entry(v, struct afs_vlocation, link);
+
+	/* display header on line 1 */
+	if (v == &cell->vl_list) {
+		seq_puts(m, "USE STT VLID[0]  VLID[1]  VLID[2]  NAME\n");
+		return 0;
+	}
+
+	/* display one cell per line on subsequent lines */
+	seq_printf(m, "%3d %s %08x %08x %08x %s\n",
+		   atomic_read(&vlocation->usage),
+		   afs_vlocation_states[vlocation->state],
+		   vlocation->vldb.vid[0],
+		   vlocation->vldb.vid[1],
+		   vlocation->vldb.vid[2],
+		   vlocation->vldb.name);
+
+	return 0;
+}
+
+/*
+ * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume
+ * location server
+ */
+static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
+{
+	struct afs_cell *cell;
+	struct seq_file *m;
+	int ret;
+
+	cell = PDE_DATA(inode);
+	if (!cell)
+		return -ENOENT;
+
+	ret = seq_open(file, &afs_proc_cell_vlservers_ops);
+	if (ret<0)
+		return ret;
+
+	m = file->private_data;
+	m->private = cell;
+
+	return 0;
+}
+
+/*
+ * set up the iterator to start reading from the cells list and return the
+ * first item
+ */
+static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
+{
+	struct afs_cell *cell = m->private;
+	loff_t pos = *_pos;
+
+	_enter("cell=%p pos=%Ld", cell, *_pos);
+
+	/* lock the list against modification */
+	down_read(&cell->vl_sem);
+
+	/* allow for the header line */
+	if (!pos)
+		return (void *) 1;
+	pos--;
+
+	if (pos >= cell->vl_naddrs)
+		return NULL;
+
+	return &cell->vl_addrs[pos];
+}
+
+/*
+ * move to next cell in cells list
+ */
+static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
+					  loff_t *_pos)
+{
+	struct afs_cell *cell = p->private;
+	loff_t pos;
+
+	_enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos);
+
+	pos = *_pos;
+	(*_pos)++;
+	if (pos >= cell->vl_naddrs)
+		return NULL;
+
+	return &cell->vl_addrs[pos];
+}
+
+/*
+ * clean up after reading from the cells list
+ */
+static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
+{
+	struct afs_cell *cell = p->private;
+
+	up_read(&cell->vl_sem);
+}
+
+/*
+ * display a header line followed by a load of volume lines
+ */
+static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
+{
+	struct in_addr *addr = v;
+
+	/* display header on line 1 */
+	if (v == (struct in_addr *) 1) {
+		seq_puts(m, "ADDRESS\n");
+		return 0;
+	}
+
+	/* display one cell per line on subsequent lines */
+	seq_printf(m, "%pI4\n", &addr->s_addr);
+	return 0;
+}
+
+/*
+ * open "/proc/fs/afs/<cell>/servers" which provides a summary of active
+ * servers
+ */
+static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
+{
+	struct afs_cell *cell;
+	struct seq_file *m;
+	int ret;
+
+	cell = PDE_DATA(inode);
+	if (!cell)
+		return -ENOENT;
+
+	ret = seq_open(file, &afs_proc_cell_servers_ops);
+	if (ret < 0)
+		return ret;
+
+	m = file->private_data;
+	m->private = cell;
+	return 0;
+}
+
+/*
+ * set up the iterator to start reading from the cells list and return the
+ * first item
+ */
+static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
+	__acquires(m->private->servers_lock)
+{
+	struct afs_cell *cell = m->private;
+
+	_enter("cell=%p pos=%Ld", cell, *_pos);
+
+	/* lock the list against modification */
+	read_lock(&cell->servers_lock);
+	return seq_list_start_head(&cell->servers, *_pos);
+}
+
+/*
+ * move to next cell in cells list
+ */
+static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
+					loff_t *_pos)
+{
+	struct afs_cell *cell = p->private;
+
+	_enter("cell=%p pos=%Ld", cell, *_pos);
+	return seq_list_next(v, &cell->servers, _pos);
+}
+
+/*
+ * clean up after reading from the cells list
+ */
+static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
+	__releases(p->private->servers_lock)
+{
+	struct afs_cell *cell = p->private;
+
+	read_unlock(&cell->servers_lock);
+}
+
+/*
+ * display a header line followed by a load of volume lines
+ */
+static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
+{
+	struct afs_cell *cell = m->private;
+	struct afs_server *server = list_entry(v, struct afs_server, link);
+	char ipaddr[20];
+
+	/* display header on line 1 */
+	if (v == &cell->servers) {
+		seq_puts(m, "USE ADDR            STATE\n");
+		return 0;
+	}
+
+	/* display one cell per line on subsequent lines */
+	sprintf(ipaddr, "%pI4", &server->addr);
+	seq_printf(m, "%3d %-15.15s %5d\n",
+		   atomic_read(&server->usage), ipaddr, server->fs_state);
+
+	return 0;
+}
diff --git a/kernel/fs/afs/rxrpc.c b/kernel/fs/afs/rxrpc.c
new file mode 100644
index 000000000..3a57a1b0f
--- /dev/null
+++ b/kernel/fs/afs/rxrpc.c
@@ -0,0 +1,862 @@
+/* Maintain an RxRPC server socket to do AFS communications through
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <rxrpc/packet.h>
+#include "internal.h"
+#include "afs_cm.h"
+
+static struct socket *afs_socket; /* my RxRPC socket */
+static struct workqueue_struct *afs_async_calls;
+static atomic_t afs_outstanding_calls;
+static atomic_t afs_outstanding_skbs;
+
+static void afs_wake_up_call_waiter(struct afs_call *);
+static int afs_wait_for_call_to_complete(struct afs_call *);
+static void afs_wake_up_async_call(struct afs_call *);
+static int afs_dont_wait_for_call_to_complete(struct afs_call *);
+static void afs_process_async_call(struct afs_call *);
+static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
+static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
+
+/* synchronous call management */
+const struct afs_wait_mode afs_sync_call = {
+	.rx_wakeup	= afs_wake_up_call_waiter,
+	.wait		= afs_wait_for_call_to_complete,
+};
+
+/* asynchronous call management */
+const struct afs_wait_mode afs_async_call = {
+	.rx_wakeup	= afs_wake_up_async_call,
+	.wait		= afs_dont_wait_for_call_to_complete,
+};
+
+/* asynchronous incoming call management */
+static const struct afs_wait_mode afs_async_incoming_call = {
+	.rx_wakeup	= afs_wake_up_async_call,
+};
+
+/* asynchronous incoming call initial processing */
+static const struct afs_call_type afs_RXCMxxxx = {
+	.name		= "CB.xxxx",
+	.deliver	= afs_deliver_cm_op_id,
+	.abort_to_error	= afs_abort_to_error,
+};
+
+static void afs_collect_incoming_call(struct work_struct *);
+
+static struct sk_buff_head afs_incoming_calls;
+static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
+
+static void afs_async_workfn(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+	call->async_workfn(call);
+}
+
+/*
+ * open an RxRPC socket and bind it to be a server for callback notifications
+ * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
+ */
+int afs_open_socket(void)
+{
+	struct sockaddr_rxrpc srx;
+	struct socket *socket;
+	int ret;
+
+	_enter("");
+
+	skb_queue_head_init(&afs_incoming_calls);
+
+	afs_async_calls = create_singlethread_workqueue("kafsd");
+	if (!afs_async_calls) {
+		_leave(" = -ENOMEM [wq]");
+		return -ENOMEM;
+	}
+
+	ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
+	if (ret < 0) {
+		destroy_workqueue(afs_async_calls);
+		_leave(" = %d [socket]", ret);
+		return ret;
+	}
+
+	socket->sk->sk_allocation = GFP_NOFS;
+
+	/* bind the callback manager's address to make this a server socket */
+	srx.srx_family			= AF_RXRPC;
+	srx.srx_service			= CM_SERVICE;
+	srx.transport_type		= SOCK_DGRAM;
+	srx.transport_len		= sizeof(srx.transport.sin);
+	srx.transport.sin.sin_family	= AF_INET;
+	srx.transport.sin.sin_port	= htons(AFS_CM_PORT);
+	memset(&srx.transport.sin.sin_addr, 0,
+	       sizeof(srx.transport.sin.sin_addr));
+
+	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	if (ret < 0) {
+		sock_release(socket);
+		destroy_workqueue(afs_async_calls);
+		_leave(" = %d [bind]", ret);
+		return ret;
+	}
+
+	rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
+
+	afs_socket = socket;
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * close the RxRPC socket AFS was using
+ */
+void afs_close_socket(void)
+{
+	_enter("");
+
+	sock_release(afs_socket);
+
+	_debug("dework");
+	destroy_workqueue(afs_async_calls);
+
+	ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
+	ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0);
+	_leave("");
+}
+
+/*
+ * note that the data in a socket buffer is now delivered and that the buffer
+ * should be freed
+ */
+static void afs_data_delivered(struct sk_buff *skb)
+{
+	if (!skb) {
+		_debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
+		dump_stack();
+	} else {
+		_debug("DLVR %p{%u} [%d]",
+		       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+		if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+			BUG();
+		rxrpc_kernel_data_delivered(skb);
+	}
+}
+
+/*
+ * free a socket buffer
+ */
+static void afs_free_skb(struct sk_buff *skb)
+{
+	if (!skb) {
+		_debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs));
+		dump_stack();
+	} else {
+		_debug("FREE %p{%u} [%d]",
+		       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+		if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+			BUG();
+		rxrpc_kernel_free_skb(skb);
+	}
+}
+
+/*
+ * free a call
+ */
+static void afs_free_call(struct afs_call *call)
+{
+	_debug("DONE %p{%s} [%d]",
+	       call, call->type->name, atomic_read(&afs_outstanding_calls));
+	if (atomic_dec_return(&afs_outstanding_calls) == -1)
+		BUG();
+
+	ASSERTCMP(call->rxcall, ==, NULL);
+	ASSERT(!work_pending(&call->async_work));
+	ASSERT(skb_queue_empty(&call->rx_queue));
+	ASSERT(call->type->name != NULL);
+
+	kfree(call->request);
+	kfree(call);
+}
+
+/*
+ * End a call but do not free it
+ */
+static void afs_end_call_nofree(struct afs_call *call)
+{
+	if (call->rxcall) {
+		rxrpc_kernel_end_call(call->rxcall);
+		call->rxcall = NULL;
+	}
+	if (call->type->destructor)
+		call->type->destructor(call);
+}
+
+/*
+ * End a call and free it
+ */
+static void afs_end_call(struct afs_call *call)
+{
+	afs_end_call_nofree(call);
+	afs_free_call(call);
+}
+
+/*
+ * allocate a call with flat request and reply buffers
+ */
+struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
+				     size_t request_size, size_t reply_size)
+{
+	struct afs_call *call;
+
+	call = kzalloc(sizeof(*call), GFP_NOFS);
+	if (!call)
+		goto nomem_call;
+
+	_debug("CALL %p{%s} [%d]",
+	       call, type->name, atomic_read(&afs_outstanding_calls));
+	atomic_inc(&afs_outstanding_calls);
+
+	call->type = type;
+	call->request_size = request_size;
+	call->reply_max = reply_size;
+
+	if (request_size) {
+		call->request = kmalloc(request_size, GFP_NOFS);
+		if (!call->request)
+			goto nomem_free;
+	}
+
+	if (reply_size) {
+		call->buffer = kmalloc(reply_size, GFP_NOFS);
+		if (!call->buffer)
+			goto nomem_free;
+	}
+
+	init_waitqueue_head(&call->waitq);
+	skb_queue_head_init(&call->rx_queue);
+	return call;
+
+nomem_free:
+	afs_free_call(call);
+nomem_call:
+	return NULL;
+}
+
+/*
+ * clean up a call with flat buffer
+ */
+void afs_flat_call_destructor(struct afs_call *call)
+{
+	_enter("");
+
+	kfree(call->request);
+	call->request = NULL;
+	kfree(call->buffer);
+	call->buffer = NULL;
+}
+
+/*
+ * attach the data from a bunch of pages on an inode to a call
+ */
+static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
+			  struct kvec *iov)
+{
+	struct page *pages[8];
+	unsigned count, n, loop, offset, to;
+	pgoff_t first = call->first, last = call->last;
+	int ret;
+
+	_enter("");
+
+	offset = call->first_offset;
+	call->first_offset = 0;
+
+	do {
+		_debug("attach %lx-%lx", first, last);
+
+		count = last - first + 1;
+		if (count > ARRAY_SIZE(pages))
+			count = ARRAY_SIZE(pages);
+		n = find_get_pages_contig(call->mapping, first, count, pages);
+		ASSERTCMP(n, ==, count);
+
+		loop = 0;
+		do {
+			msg->msg_flags = 0;
+			to = PAGE_SIZE;
+			if (first + loop >= last)
+				to = call->last_to;
+			else
+				msg->msg_flags = MSG_MORE;
+			iov->iov_base = kmap(pages[loop]) + offset;
+			iov->iov_len = to - offset;
+			offset = 0;
+
+			_debug("- range %u-%u%s",
+			       offset, to, msg->msg_flags ? " [more]" : "");
+			iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC,
+				      iov, 1, to - offset);
+
+			/* have to change the state *before* sending the last
+			 * packet as RxRPC might give us the reply before it
+			 * returns from sending the request */
+			if (first + loop >= last)
+				call->state = AFS_CALL_AWAIT_REPLY;
+			ret = rxrpc_kernel_send_data(call->rxcall, msg,
+						     to - offset);
+			kunmap(pages[loop]);
+			if (ret < 0)
+				break;
+		} while (++loop < count);
+		first += count;
+
+		for (loop = 0; loop < count; loop++)
+			put_page(pages[loop]);
+		if (ret < 0)
+			break;
+	} while (first <= last);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * initiate a call
+ */
+int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct sockaddr_rxrpc srx;
+	struct rxrpc_call *rxcall;
+	struct msghdr msg;
+	struct kvec iov[1];
+	int ret;
+	struct sk_buff *skb;
+
+	_enter("%x,{%d},", addr->s_addr, ntohs(call->port));
+
+	ASSERT(call->type != NULL);
+	ASSERT(call->type->name != NULL);
+
+	_debug("____MAKE %p{%s,%x} [%d]____",
+	       call, call->type->name, key_serial(call->key),
+	       atomic_read(&afs_outstanding_calls));
+
+	call->wait_mode = wait_mode;
+	call->async_workfn = afs_process_async_call;
+	INIT_WORK(&call->async_work, afs_async_workfn);
+
+	memset(&srx, 0, sizeof(srx));
+	srx.srx_family = AF_RXRPC;
+	srx.srx_service = call->service_id;
+	srx.transport_type = SOCK_DGRAM;
+	srx.transport_len = sizeof(srx.transport.sin);
+	srx.transport.sin.sin_family = AF_INET;
+	srx.transport.sin.sin_port = call->port;
+	memcpy(&srx.transport.sin.sin_addr, addr, 4);
+
+	/* create a call */
+	rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
+					 (unsigned long) call, gfp);
+	call->key = NULL;
+	if (IS_ERR(rxcall)) {
+		ret = PTR_ERR(rxcall);
+		goto error_kill_call;
+	}
+
+	call->rxcall = rxcall;
+
+	/* send the request */
+	iov[0].iov_base	= call->request;
+	iov[0].iov_len	= call->request_size;
+
+	msg.msg_name		= NULL;
+	msg.msg_namelen		= 0;
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
+		      call->request_size);
+	msg.msg_control		= NULL;
+	msg.msg_controllen	= 0;
+	msg.msg_flags		= (call->send_pages ? MSG_MORE : 0);
+
+	/* have to change the state *before* sending the last packet as RxRPC
+	 * might give us the reply before it returns from sending the
+	 * request */
+	if (!call->send_pages)
+		call->state = AFS_CALL_AWAIT_REPLY;
+	ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
+	if (ret < 0)
+		goto error_do_abort;
+
+	if (call->send_pages) {
+		ret = afs_send_pages(call, &msg, iov);
+		if (ret < 0)
+			goto error_do_abort;
+	}
+
+	/* at this point, an async call may no longer exist as it may have
+	 * already completed */
+	return wait_mode->wait(call);
+
+error_do_abort:
+	rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
+	while ((skb = skb_dequeue(&call->rx_queue)))
+		afs_free_skb(skb);
+error_kill_call:
+	afs_end_call(call);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * handles intercepted messages that were arriving in the socket's Rx queue
+ * - called with the socket receive queue lock held to ensure message ordering
+ * - called with softirqs disabled
+ */
+static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
+			       struct sk_buff *skb)
+{
+	struct afs_call *call = (struct afs_call *) user_call_ID;
+
+	_enter("%p,,%u", call, skb->mark);
+
+	_debug("ICPT %p{%u} [%d]",
+	       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+
+	ASSERTCMP(sk, ==, afs_socket->sk);
+	atomic_inc(&afs_outstanding_skbs);
+
+	if (!call) {
+		/* its an incoming call for our callback service */
+		skb_queue_tail(&afs_incoming_calls, skb);
+		queue_work(afs_wq, &afs_collect_incoming_call_work);
+	} else {
+		/* route the messages directly to the appropriate call */
+		skb_queue_tail(&call->rx_queue, skb);
+		call->wait_mode->rx_wakeup(call);
+	}
+
+	_leave("");
+}
+
+/*
+ * deliver messages to a call
+ */
+static void afs_deliver_to_call(struct afs_call *call)
+{
+	struct sk_buff *skb;
+	bool last;
+	u32 abort_code;
+	int ret;
+
+	_enter("");
+
+	while ((call->state == AFS_CALL_AWAIT_REPLY ||
+		call->state == AFS_CALL_AWAIT_OP_ID ||
+		call->state == AFS_CALL_AWAIT_REQUEST ||
+		call->state == AFS_CALL_AWAIT_ACK) &&
+	       (skb = skb_dequeue(&call->rx_queue))) {
+		switch (skb->mark) {
+		case RXRPC_SKB_MARK_DATA:
+			_debug("Rcv DATA");
+			last = rxrpc_kernel_is_data_last(skb);
+			ret = call->type->deliver(call, skb, last);
+			switch (ret) {
+			case 0:
+				if (last &&
+				    call->state == AFS_CALL_AWAIT_REPLY)
+					call->state = AFS_CALL_COMPLETE;
+				break;
+			case -ENOTCONN:
+				abort_code = RX_CALL_DEAD;
+				goto do_abort;
+			case -ENOTSUPP:
+				abort_code = RX_INVALID_OPERATION;
+				goto do_abort;
+			default:
+				abort_code = RXGEN_CC_UNMARSHAL;
+				if (call->state != AFS_CALL_AWAIT_REPLY)
+					abort_code = RXGEN_SS_UNMARSHAL;
+			do_abort:
+				rxrpc_kernel_abort_call(call->rxcall,
+							abort_code);
+				call->error = ret;
+				call->state = AFS_CALL_ERROR;
+				break;
+			}
+			afs_data_delivered(skb);
+			skb = NULL;
+			continue;
+		case RXRPC_SKB_MARK_FINAL_ACK:
+			_debug("Rcv ACK");
+			call->state = AFS_CALL_COMPLETE;
+			break;
+		case RXRPC_SKB_MARK_BUSY:
+			_debug("Rcv BUSY");
+			call->error = -EBUSY;
+			call->state = AFS_CALL_BUSY;
+			break;
+		case RXRPC_SKB_MARK_REMOTE_ABORT:
+			abort_code = rxrpc_kernel_get_abort_code(skb);
+			call->error = call->type->abort_to_error(abort_code);
+			call->state = AFS_CALL_ABORTED;
+			_debug("Rcv ABORT %u -> %d", abort_code, call->error);
+			break;
+		case RXRPC_SKB_MARK_NET_ERROR:
+			call->error = -rxrpc_kernel_get_error_number(skb);
+			call->state = AFS_CALL_ERROR;
+			_debug("Rcv NET ERROR %d", call->error);
+			break;
+		case RXRPC_SKB_MARK_LOCAL_ERROR:
+			call->error = -rxrpc_kernel_get_error_number(skb);
+			call->state = AFS_CALL_ERROR;
+			_debug("Rcv LOCAL ERROR %d", call->error);
+			break;
+		default:
+			BUG();
+			break;
+		}
+
+		afs_free_skb(skb);
+	}
+
+	/* make sure the queue is empty if the call is done with (we might have
+	 * aborted the call early because of an unmarshalling error) */
+	if (call->state >= AFS_CALL_COMPLETE) {
+		while ((skb = skb_dequeue(&call->rx_queue)))
+			afs_free_skb(skb);
+		if (call->incoming)
+			afs_end_call(call);
+	}
+
+	_leave("");
+}
+
+/*
+ * wait synchronously for a call to complete
+ */
+static int afs_wait_for_call_to_complete(struct afs_call *call)
+{
+	struct sk_buff *skb;
+	int ret;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("");
+
+	add_wait_queue(&call->waitq, &myself);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* deliver any messages that are in the queue */
+		if (!skb_queue_empty(&call->rx_queue)) {
+			__set_current_state(TASK_RUNNING);
+			afs_deliver_to_call(call);
+			continue;
+		}
+
+		ret = call->error;
+		if (call->state >= AFS_CALL_COMPLETE)
+			break;
+		ret = -EINTR;
+		if (signal_pending(current))
+			break;
+		schedule();
+	}
+
+	remove_wait_queue(&call->waitq, &myself);
+	__set_current_state(TASK_RUNNING);
+
+	/* kill the call */
+	if (call->state < AFS_CALL_COMPLETE) {
+		_debug("call incomplete");
+		rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD);
+		while ((skb = skb_dequeue(&call->rx_queue)))
+			afs_free_skb(skb);
+	}
+
+	_debug("call complete");
+	afs_end_call(call);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * wake up a waiting call
+ */
+static void afs_wake_up_call_waiter(struct afs_call *call)
+{
+	wake_up(&call->waitq);
+}
+
+/*
+ * wake up an asynchronous call
+ */
+static void afs_wake_up_async_call(struct afs_call *call)
+{
+	_enter("");
+	queue_work(afs_async_calls, &call->async_work);
+}
+
+/*
+ * put a call into asynchronous mode
+ * - mustn't touch the call descriptor as the call my have completed by the
+ *   time we get here
+ */
+static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
+{
+	_enter("");
+	return -EINPROGRESS;
+}
+
+/*
+ * delete an asynchronous call
+ */
+static void afs_delete_async_call(struct afs_call *call)
+{
+	_enter("");
+
+	afs_free_call(call);
+
+	_leave("");
+}
+
+/*
+ * perform processing on an asynchronous call
+ * - on a multiple-thread workqueue this work item may try to run on several
+ *   CPUs at the same time
+ */
+static void afs_process_async_call(struct afs_call *call)
+{
+	_enter("");
+
+	if (!skb_queue_empty(&call->rx_queue))
+		afs_deliver_to_call(call);
+
+	if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) {
+		if (call->wait_mode->async_complete)
+			call->wait_mode->async_complete(call->reply,
+							call->error);
+		call->reply = NULL;
+
+		/* kill the call */
+		afs_end_call_nofree(call);
+
+		/* we can't just delete the call because the work item may be
+		 * queued */
+		call->async_workfn = afs_delete_async_call;
+		queue_work(afs_async_calls, &call->async_work);
+	}
+
+	_leave("");
+}
+
+/*
+ * empty a socket buffer into a flat reply buffer
+ */
+void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
+{
+	size_t len = skb->len;
+
+	if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
+		BUG();
+	call->reply_size += len;
+}
+
+/*
+ * accept the backlog of incoming calls
+ */
+static void afs_collect_incoming_call(struct work_struct *work)
+{
+	struct rxrpc_call *rxcall;
+	struct afs_call *call = NULL;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&afs_incoming_calls))) {
+		_debug("new call");
+
+		/* don't need the notification */
+		afs_free_skb(skb);
+
+		if (!call) {
+			call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
+			if (!call) {
+				rxrpc_kernel_reject_call(afs_socket);
+				return;
+			}
+
+			call->async_workfn = afs_process_async_call;
+			INIT_WORK(&call->async_work, afs_async_workfn);
+			call->wait_mode = &afs_async_incoming_call;
+			call->type = &afs_RXCMxxxx;
+			init_waitqueue_head(&call->waitq);
+			skb_queue_head_init(&call->rx_queue);
+			call->state = AFS_CALL_AWAIT_OP_ID;
+
+			_debug("CALL %p{%s} [%d]",
+			       call, call->type->name,
+			       atomic_read(&afs_outstanding_calls));
+			atomic_inc(&afs_outstanding_calls);
+		}
+
+		rxcall = rxrpc_kernel_accept_call(afs_socket,
+						  (unsigned long) call);
+		if (!IS_ERR(rxcall)) {
+			call->rxcall = rxcall;
+			call = NULL;
+		}
+	}
+
+	if (call)
+		afs_free_call(call);
+}
+
+/*
+ * grab the operation ID from an incoming cache manager call
+ */
+static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
+				bool last)
+{
+	size_t len = skb->len;
+	void *oibuf = (void *) &call->operation_ID;
+
+	_enter("{%u},{%zu},%d", call->offset, len, last);
+
+	ASSERTCMP(call->offset, <, 4);
+
+	/* the operation ID forms the first four bytes of the request data */
+	len = min_t(size_t, len, 4 - call->offset);
+	if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0)
+		BUG();
+	if (!pskb_pull(skb, len))
+		BUG();
+	call->offset += len;
+
+	if (call->offset < 4) {
+		if (last) {
+			_leave(" = -EBADMSG [op ID short]");
+			return -EBADMSG;
+		}
+		_leave(" = 0 [incomplete]");
+		return 0;
+	}
+
+	call->state = AFS_CALL_AWAIT_REQUEST;
+
+	/* ask the cache manager to route the call (it'll change the call type
+	 * if successful) */
+	if (!afs_cm_incoming_call(call))
+		return -ENOTSUPP;
+
+	/* pass responsibility for the remainer of this message off to the
+	 * cache manager op */
+	return call->type->deliver(call, skb, last);
+}
+
+/*
+ * send an empty reply
+ */
+void afs_send_empty_reply(struct afs_call *call)
+{
+	struct msghdr msg;
+
+	_enter("");
+
+	msg.msg_name		= NULL;
+	msg.msg_namelen		= 0;
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
+	msg.msg_control		= NULL;
+	msg.msg_controllen	= 0;
+	msg.msg_flags		= 0;
+
+	call->state = AFS_CALL_AWAIT_ACK;
+	switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) {
+	case 0:
+		_leave(" [replied]");
+		return;
+
+	case -ENOMEM:
+		_debug("oom");
+		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+	default:
+		afs_end_call(call);
+		_leave(" [error]");
+		return;
+	}
+}
+
+/*
+ * send a simple reply
+ */
+void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
+{
+	struct msghdr msg;
+	struct kvec iov[1];
+	int n;
+
+	_enter("");
+
+	iov[0].iov_base		= (void *) buf;
+	iov[0].iov_len		= len;
+	msg.msg_name		= NULL;
+	msg.msg_namelen		= 0;
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
+	msg.msg_control		= NULL;
+	msg.msg_controllen	= 0;
+	msg.msg_flags		= 0;
+
+	call->state = AFS_CALL_AWAIT_ACK;
+	n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
+	if (n >= 0) {
+		/* Success */
+		_leave(" [replied]");
+		return;
+	}
+
+	if (n == -ENOMEM) {
+		_debug("oom");
+		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+	}
+	afs_end_call(call);
+	_leave(" [error]");
+}
+
+/*
+ * extract a piece of data from the received data socket buffers
+ */
+int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
+		     bool last, void *buf, size_t count)
+{
+	size_t len = skb->len;
+
+	_enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count);
+
+	ASSERTCMP(call->offset, <, count);
+
+	len = min_t(size_t, len, count - call->offset);
+	if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 ||
+	    !pskb_pull(skb, len))
+		BUG();
+	call->offset += len;
+
+	if (call->offset < count) {
+		if (last) {
+			_leave(" = -EBADMSG [%d < %zu]", call->offset, count);
+			return -EBADMSG;
+		}
+		_leave(" = -EAGAIN");
+		return -EAGAIN;
+	}
+	return 0;
+}
diff --git a/kernel/fs/afs/security.c b/kernel/fs/afs/security.c
new file mode 100644
index 000000000..8d010422d
--- /dev/null
+++ b/kernel/fs/afs/security.c
@@ -0,0 +1,363 @@
+/* AFS security handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+
+/*
+ * get a key
+ */
+struct key *afs_request_key(struct afs_cell *cell)
+{
+	struct key *key;
+
+	_enter("{%x}", key_serial(cell->anonymous_key));
+
+	_debug("key %s", cell->anonymous_key->description);
+	key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
+			  NULL);
+	if (IS_ERR(key)) {
+		if (PTR_ERR(key) != -ENOKEY) {
+			_leave(" = %ld", PTR_ERR(key));
+			return key;
+		}
+
+		/* act as anonymous user */
+		_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
+		return key_get(cell->anonymous_key);
+	} else {
+		/* act as authorised user */
+		_leave(" = {%x} [auth]", key_serial(key));
+		return key;
+	}
+}
+
+/*
+ * dispose of a permits list
+ */
+void afs_zap_permits(struct rcu_head *rcu)
+{
+	struct afs_permits *permits =
+		container_of(rcu, struct afs_permits, rcu);
+	int loop;
+
+	_enter("{%d}", permits->count);
+
+	for (loop = permits->count - 1; loop >= 0; loop--)
+		key_put(permits->permits[loop].key);
+	kfree(permits);
+}
+
+/*
+ * dispose of a permits list in which all the key pointers have been copied
+ */
+static void afs_dispose_of_permits(struct rcu_head *rcu)
+{
+	struct afs_permits *permits =
+		container_of(rcu, struct afs_permits, rcu);
+
+	_enter("{%d}", permits->count);
+
+	kfree(permits);
+}
+
+/*
+ * get the authorising vnode - this is the specified inode itself if it's a
+ * directory or it's the parent directory if the specified inode is a file or
+ * symlink
+ * - the caller must release the ref on the inode
+ */
+static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode,
+					    struct key *key)
+{
+	struct afs_vnode *auth_vnode;
+	struct inode *auth_inode;
+
+	_enter("");
+
+	if (S_ISDIR(vnode->vfs_inode.i_mode)) {
+		auth_inode = igrab(&vnode->vfs_inode);
+		ASSERT(auth_inode != NULL);
+	} else {
+		auth_inode = afs_iget(vnode->vfs_inode.i_sb, key,
+				      &vnode->status.parent, NULL, NULL);
+		if (IS_ERR(auth_inode))
+			return ERR_CAST(auth_inode);
+	}
+
+	auth_vnode = AFS_FS_I(auth_inode);
+	_leave(" = {%x}", auth_vnode->fid.vnode);
+	return auth_vnode;
+}
+
+/*
+ * clear the permit cache on a directory vnode
+ */
+void afs_clear_permits(struct afs_vnode *vnode)
+{
+	struct afs_permits *permits;
+
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	mutex_lock(&vnode->permits_lock);
+	permits = vnode->permits;
+	rcu_assign_pointer(vnode->permits, NULL);
+	mutex_unlock(&vnode->permits_lock);
+
+	if (permits)
+		call_rcu(&permits->rcu, afs_zap_permits);
+	_leave("");
+}
+
+/*
+ * add the result obtained for a vnode to its or its parent directory's cache
+ * for the key used to access it
+ */
+void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
+{
+	struct afs_permits *permits, *xpermits;
+	struct afs_permit *permit;
+	struct afs_vnode *auth_vnode;
+	int count, loop;
+
+	_enter("{%x:%u},%x,%lx",
+	       vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order);
+
+	auth_vnode = afs_get_auth_inode(vnode, key);
+	if (IS_ERR(auth_vnode)) {
+		_leave(" [get error %ld]", PTR_ERR(auth_vnode));
+		return;
+	}
+
+	mutex_lock(&auth_vnode->permits_lock);
+
+	/* guard against a rename being detected whilst we waited for the
+	 * lock */
+	if (memcmp(&auth_vnode->fid, &vnode->status.parent,
+		   sizeof(struct afs_fid)) != 0) {
+		_debug("renamed");
+		goto out_unlock;
+	}
+
+	/* have to be careful as the directory's callback may be broken between
+	 * us receiving the status we're trying to cache and us getting the
+	 * lock to update the cache for the status */
+	if (auth_vnode->acl_order - acl_order > 0) {
+		_debug("ACL changed?");
+		goto out_unlock;
+	}
+
+	/* always update the anonymous mask */
+	_debug("anon access %x", vnode->status.anon_access);
+	auth_vnode->status.anon_access = vnode->status.anon_access;
+	if (key == vnode->volume->cell->anonymous_key)
+		goto out_unlock;
+
+	xpermits = auth_vnode->permits;
+	count = 0;
+	if (xpermits) {
+		/* see if the permit is already in the list
+		 * - if it is then we just amend the list
+		 */
+		count = xpermits->count;
+		permit = xpermits->permits;
+		for (loop = count; loop > 0; loop--) {
+			if (permit->key == key) {
+				permit->access_mask =
+					vnode->status.caller_access;
+				goto out_unlock;
+			}
+			permit++;
+		}
+	}
+
+	permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1),
+			  GFP_NOFS);
+	if (!permits)
+		goto out_unlock;
+
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+			count * sizeof(struct afs_permit));
+
+	_debug("key %x access %x",
+	       key_serial(key), vnode->status.caller_access);
+	permits->permits[count].access_mask = vnode->status.caller_access;
+	permits->permits[count].key = key_get(key);
+	permits->count = count + 1;
+
+	rcu_assign_pointer(auth_vnode->permits, permits);
+	if (xpermits)
+		call_rcu(&xpermits->rcu, afs_dispose_of_permits);
+
+out_unlock:
+	mutex_unlock(&auth_vnode->permits_lock);
+	iput(&auth_vnode->vfs_inode);
+	_leave("");
+}
+
+/*
+ * check with the fileserver to see if the directory or parent directory is
+ * permitted to be accessed with this authorisation, and if so, what access it
+ * is granted
+ */
+static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
+			    afs_access_t *_access)
+{
+	struct afs_permits *permits;
+	struct afs_permit *permit;
+	struct afs_vnode *auth_vnode;
+	bool valid;
+	int loop, ret;
+
+	_enter("{%x:%u},%x",
+	       vnode->fid.vid, vnode->fid.vnode, key_serial(key));
+
+	auth_vnode = afs_get_auth_inode(vnode, key);
+	if (IS_ERR(auth_vnode)) {
+		*_access = 0;
+		_leave(" = %ld", PTR_ERR(auth_vnode));
+		return PTR_ERR(auth_vnode);
+	}
+
+	ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode));
+
+	/* check the permits to see if we've got one yet */
+	if (key == auth_vnode->volume->cell->anonymous_key) {
+		_debug("anon");
+		*_access = auth_vnode->status.anon_access;
+		valid = true;
+	} else {
+		valid = false;
+		rcu_read_lock();
+		permits = rcu_dereference(auth_vnode->permits);
+		if (permits) {
+			permit = permits->permits;
+			for (loop = permits->count; loop > 0; loop--) {
+				if (permit->key == key) {
+					_debug("found in cache");
+					*_access = permit->access_mask;
+					valid = true;
+					break;
+				}
+				permit++;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	if (!valid) {
+		/* check the status on the file we're actually interested in
+		 * (the post-processing will cache the result on auth_vnode) */
+		_debug("no valid permit");
+
+		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		ret = afs_vnode_fetch_status(vnode, auth_vnode, key);
+		if (ret < 0) {
+			iput(&auth_vnode->vfs_inode);
+			*_access = 0;
+			_leave(" = %d", ret);
+			return ret;
+		}
+		*_access = vnode->status.caller_access;
+	}
+
+	iput(&auth_vnode->vfs_inode);
+	_leave(" = 0 [access %x]", *_access);
+	return 0;
+}
+
+/*
+ * check the permissions on an AFS file
+ * - AFS ACLs are attached to directories only, and a file is controlled by its
+ *   parent directory's ACL
+ */
+int afs_permission(struct inode *inode, int mask)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	afs_access_t uninitialized_var(access);
+	struct key *key;
+	int ret;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	_enter("{{%x:%u},%lx},%x,",
+	       vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		return PTR_ERR(key);
+	}
+
+	/* if the promise has expired, we need to check the server again */
+	if (!vnode->cb_promised) {
+		_debug("not promised");
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error;
+		_debug("new promise [fl=%lx]", vnode->flags);
+	}
+
+	/* check the permits to see if we've got one yet */
+	ret = afs_check_permit(vnode, key, &access);
+	if (ret < 0)
+		goto error;
+
+	/* interpret the access mask */
+	_debug("REQ %x ACC %x on %s",
+	       mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file");
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (mask & MAY_EXEC) {
+			if (!(access & AFS_ACE_LOOKUP))
+				goto permission_denied;
+		} else if (mask & MAY_READ) {
+			if (!(access & AFS_ACE_READ))
+				goto permission_denied;
+		} else if (mask & MAY_WRITE) {
+			if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
+					AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */
+					AFS_ACE_WRITE))) /* chmod */
+				goto permission_denied;
+		} else {
+			BUG();
+		}
+	} else {
+		if (!(access & AFS_ACE_LOOKUP))
+			goto permission_denied;
+		if (mask & (MAY_EXEC | MAY_READ)) {
+			if (!(access & AFS_ACE_READ))
+				goto permission_denied;
+		} else if (mask & MAY_WRITE) {
+			if (!(access & AFS_ACE_WRITE))
+				goto permission_denied;
+		}
+	}
+
+	key_put(key);
+	ret = generic_permission(inode, mask);
+	_leave(" = %d", ret);
+	return ret;
+
+permission_denied:
+	ret = -EACCES;
+error:
+	key_put(key);
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/kernel/fs/afs/server.c b/kernel/fs/afs/server.c
new file mode 100644
index 000000000..f342acf35
--- /dev/null
+++ b/kernel/fs/afs/server.c
@@ -0,0 +1,322 @@
+/* AFS server record management
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static unsigned afs_server_timeout = 10;	/* server timeout in seconds */
+
+static void afs_reap_server(struct work_struct *);
+
+/* tree of all the servers, indexed by IP address */
+static struct rb_root afs_servers = RB_ROOT;
+static DEFINE_RWLOCK(afs_servers_lock);
+
+/* LRU list of all the servers not currently in use */
+static LIST_HEAD(afs_server_graveyard);
+static DEFINE_SPINLOCK(afs_server_graveyard_lock);
+static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server);
+
+/*
+ * install a server record in the master tree
+ */
+static int afs_install_server(struct afs_server *server)
+{
+	struct afs_server *xserver;
+	struct rb_node **pp, *p;
+	int ret;
+
+	_enter("%p", server);
+
+	write_lock(&afs_servers_lock);
+
+	ret = -EEXIST;
+	pp = &afs_servers.rb_node;
+	p = NULL;
+	while (*pp) {
+		p = *pp;
+		_debug("- consider %p", p);
+		xserver = rb_entry(p, struct afs_server, master_rb);
+		if (server->addr.s_addr < xserver->addr.s_addr)
+			pp = &(*pp)->rb_left;
+		else if (server->addr.s_addr > xserver->addr.s_addr)
+			pp = &(*pp)->rb_right;
+		else
+			goto error;
+	}
+
+	rb_link_node(&server->master_rb, p, pp);
+	rb_insert_color(&server->master_rb, &afs_servers);
+	ret = 0;
+
+error:
+	write_unlock(&afs_servers_lock);
+	return ret;
+}
+
+/*
+ * allocate a new server record
+ */
+static struct afs_server *afs_alloc_server(struct afs_cell *cell,
+					   const struct in_addr *addr)
+{
+	struct afs_server *server;
+
+	_enter("");
+
+	server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
+	if (server) {
+		atomic_set(&server->usage, 1);
+		server->cell = cell;
+
+		INIT_LIST_HEAD(&server->link);
+		INIT_LIST_HEAD(&server->grave);
+		init_rwsem(&server->sem);
+		spin_lock_init(&server->fs_lock);
+		server->fs_vnodes = RB_ROOT;
+		server->cb_promises = RB_ROOT;
+		spin_lock_init(&server->cb_lock);
+		init_waitqueue_head(&server->cb_break_waitq);
+		INIT_DELAYED_WORK(&server->cb_break_work,
+				  afs_dispatch_give_up_callbacks);
+
+		memcpy(&server->addr, addr, sizeof(struct in_addr));
+		server->addr.s_addr = addr->s_addr;
+		_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	} else {
+		_leave(" = NULL [nomem]");
+	}
+	return server;
+}
+
+/*
+ * get an FS-server record for a cell
+ */
+struct afs_server *afs_lookup_server(struct afs_cell *cell,
+				     const struct in_addr *addr)
+{
+	struct afs_server *server, *candidate;
+
+	_enter("%p,%pI4", cell, &addr->s_addr);
+
+	/* quick scan of the list to see if we already have the server */
+	read_lock(&cell->servers_lock);
+
+	list_for_each_entry(server, &cell->servers, link) {
+		if (server->addr.s_addr == addr->s_addr)
+			goto found_server_quickly;
+	}
+	read_unlock(&cell->servers_lock);
+
+	candidate = afs_alloc_server(cell, addr);
+	if (!candidate) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	write_lock(&cell->servers_lock);
+
+	/* check the cell's server list again */
+	list_for_each_entry(server, &cell->servers, link) {
+		if (server->addr.s_addr == addr->s_addr)
+			goto found_server;
+	}
+
+	_debug("new");
+	server = candidate;
+	if (afs_install_server(server) < 0)
+		goto server_in_two_cells;
+
+	afs_get_cell(cell);
+	list_add_tail(&server->link, &cell->servers);
+
+	write_unlock(&cell->servers_lock);
+	_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	return server;
+
+	/* found a matching server quickly */
+found_server_quickly:
+	_debug("found quickly");
+	afs_get_server(server);
+	read_unlock(&cell->servers_lock);
+no_longer_unused:
+	if (!list_empty(&server->grave)) {
+		spin_lock(&afs_server_graveyard_lock);
+		list_del_init(&server->grave);
+		spin_unlock(&afs_server_graveyard_lock);
+	}
+	_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	return server;
+
+	/* found a matching server on the second pass */
+found_server:
+	_debug("found");
+	afs_get_server(server);
+	write_unlock(&cell->servers_lock);
+	kfree(candidate);
+	goto no_longer_unused;
+
+	/* found a server that seems to be in two cells */
+server_in_two_cells:
+	write_unlock(&cell->servers_lock);
+	kfree(candidate);
+	printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n",
+	       addr);
+	_leave(" = -EEXIST");
+	return ERR_PTR(-EEXIST);
+}
+
+/*
+ * look up a server by its IP address
+ */
+struct afs_server *afs_find_server(const struct in_addr *_addr)
+{
+	struct afs_server *server = NULL;
+	struct rb_node *p;
+	struct in_addr addr = *_addr;
+
+	_enter("%pI4", &addr.s_addr);
+
+	read_lock(&afs_servers_lock);
+
+	p = afs_servers.rb_node;
+	while (p) {
+		server = rb_entry(p, struct afs_server, master_rb);
+
+		_debug("- consider %p", p);
+
+		if (addr.s_addr < server->addr.s_addr) {
+			p = p->rb_left;
+		} else if (addr.s_addr > server->addr.s_addr) {
+			p = p->rb_right;
+		} else {
+			afs_get_server(server);
+			goto found;
+		}
+	}
+
+	server = NULL;
+found:
+	read_unlock(&afs_servers_lock);
+	ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr);
+	_leave(" = %p", server);
+	return server;
+}
+
+/*
+ * destroy a server record
+ * - removes from the cell list
+ */
+void afs_put_server(struct afs_server *server)
+{
+	if (!server)
+		return;
+
+	_enter("%p{%d}", server, atomic_read(&server->usage));
+
+	_debug("PUT SERVER %d", atomic_read(&server->usage));
+
+	ASSERTCMP(atomic_read(&server->usage), >, 0);
+
+	if (likely(!atomic_dec_and_test(&server->usage))) {
+		_leave("");
+		return;
+	}
+
+	afs_flush_callback_breaks(server);
+
+	spin_lock(&afs_server_graveyard_lock);
+	if (atomic_read(&server->usage) == 0) {
+		list_move_tail(&server->grave, &afs_server_graveyard);
+		server->time_of_death = get_seconds();
+		queue_delayed_work(afs_wq, &afs_server_reaper,
+				   afs_server_timeout * HZ);
+	}
+	spin_unlock(&afs_server_graveyard_lock);
+	_leave(" [dead]");
+}
+
+/*
+ * destroy a dead server
+ */
+static void afs_destroy_server(struct afs_server *server)
+{
+	_enter("%p", server);
+
+	ASSERTIF(server->cb_break_head != server->cb_break_tail,
+		 delayed_work_pending(&server->cb_break_work));
+
+	ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
+	ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
+	ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
+	ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
+
+	afs_put_cell(server->cell);
+	kfree(server);
+}
+
+/*
+ * reap dead server records
+ */
+static void afs_reap_server(struct work_struct *work)
+{
+	LIST_HEAD(corpses);
+	struct afs_server *server;
+	unsigned long delay, expiry;
+	time_t now;
+
+	now = get_seconds();
+	spin_lock(&afs_server_graveyard_lock);
+
+	while (!list_empty(&afs_server_graveyard)) {
+		server = list_entry(afs_server_graveyard.next,
+				    struct afs_server, grave);
+
+		/* the queue is ordered most dead first */
+		expiry = server->time_of_death + afs_server_timeout;
+		if (expiry > now) {
+			delay = (expiry - now) * HZ;
+			mod_delayed_work(afs_wq, &afs_server_reaper, delay);
+			break;
+		}
+
+		write_lock(&server->cell->servers_lock);
+		write_lock(&afs_servers_lock);
+		if (atomic_read(&server->usage) > 0) {
+			list_del_init(&server->grave);
+		} else {
+			list_move_tail(&server->grave, &corpses);
+			list_del_init(&server->link);
+			rb_erase(&server->master_rb, &afs_servers);
+		}
+		write_unlock(&afs_servers_lock);
+		write_unlock(&server->cell->servers_lock);
+	}
+
+	spin_unlock(&afs_server_graveyard_lock);
+
+	/* now reap the corpses we've extracted */
+	while (!list_empty(&corpses)) {
+		server = list_entry(corpses.next, struct afs_server, grave);
+		list_del(&server->grave);
+		afs_destroy_server(server);
+	}
+}
+
+/*
+ * discard all the server records for rmmod
+ */
+void __exit afs_purge_servers(void)
+{
+	afs_server_timeout = 0;
+	mod_delayed_work(afs_wq, &afs_server_reaper, 0);
+}
diff --git a/kernel/fs/afs/super.c b/kernel/fs/afs/super.c
new file mode 100644
index 000000000..1fb4a5129
--- /dev/null
+++ b/kernel/fs/afs/super.c
@@ -0,0 +1,557 @@
+/* AFS superblock handling
+ *
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
+ *
+ * This software may be freely redistributed under the terms of the
+ * GNU General Public License.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Authors: David Howells <dhowells@redhat.com>
+ *          David Woodhouse <dwmw2@infradead.org>
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/statfs.h>
+#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+#include "internal.h"
+
+#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
+
+static void afs_i_init_once(void *foo);
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data);
+static void afs_kill_super(struct super_block *sb);
+static struct inode *afs_alloc_inode(struct super_block *sb);
+static void afs_destroy_inode(struct inode *inode);
+static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
+
+struct file_system_type afs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "afs",
+	.mount		= afs_mount,
+	.kill_sb	= afs_kill_super,
+	.fs_flags	= 0,
+};
+MODULE_ALIAS_FS("afs");
+
+static const struct super_operations afs_super_ops = {
+	.statfs		= afs_statfs,
+	.alloc_inode	= afs_alloc_inode,
+	.drop_inode	= afs_drop_inode,
+	.destroy_inode	= afs_destroy_inode,
+	.evict_inode	= afs_evict_inode,
+	.show_options	= generic_show_options,
+};
+
+static struct kmem_cache *afs_inode_cachep;
+static atomic_t afs_count_active_inodes;
+
+enum {
+	afs_no_opt,
+	afs_opt_cell,
+	afs_opt_rwpath,
+	afs_opt_vol,
+	afs_opt_autocell,
+};
+
+static const match_table_t afs_options_list = {
+	{ afs_opt_cell,		"cell=%s"	},
+	{ afs_opt_rwpath,	"rwpath"	},
+	{ afs_opt_vol,		"vol=%s"	},
+	{ afs_opt_autocell,	"autocell"	},
+	{ afs_no_opt,		NULL		},
+};
+
+/*
+ * initialise the filesystem
+ */
+int __init afs_fs_init(void)
+{
+	int ret;
+
+	_enter("");
+
+	/* create ourselves an inode cache */
+	atomic_set(&afs_count_active_inodes, 0);
+
+	ret = -ENOMEM;
+	afs_inode_cachep = kmem_cache_create("afs_inode_cache",
+					     sizeof(struct afs_vnode),
+					     0,
+					     SLAB_HWCACHE_ALIGN,
+					     afs_i_init_once);
+	if (!afs_inode_cachep) {
+		printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
+		return ret;
+	}
+
+	/* now export our filesystem to lesser mortals */
+	ret = register_filesystem(&afs_fs_type);
+	if (ret < 0) {
+		kmem_cache_destroy(afs_inode_cachep);
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * clean up the filesystem
+ */
+void __exit afs_fs_exit(void)
+{
+	_enter("");
+
+	afs_mntpt_kill_timer();
+	unregister_filesystem(&afs_fs_type);
+
+	if (atomic_read(&afs_count_active_inodes) != 0) {
+		printk("kAFS: %d active inode objects still present\n",
+		       atomic_read(&afs_count_active_inodes));
+		BUG();
+	}
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(afs_inode_cachep);
+	_leave("");
+}
+
+/*
+ * parse the mount options
+ * - this function has been shamelessly adapted from the ext3 fs which
+ *   shamelessly adapted it from the msdos fs
+ */
+static int afs_parse_options(struct afs_mount_params *params,
+			     char *options, const char **devname)
+{
+	struct afs_cell *cell;
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+	int token;
+
+	_enter("%s", options);
+
+	options[PAGE_SIZE - 1] = 0;
+
+	while ((p = strsep(&options, ","))) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, afs_options_list, args);
+		switch (token) {
+		case afs_opt_cell:
+			cell = afs_cell_lookup(args[0].from,
+					       args[0].to - args[0].from,
+					       false);
+			if (IS_ERR(cell))
+				return PTR_ERR(cell);
+			afs_put_cell(params->cell);
+			params->cell = cell;
+			break;
+
+		case afs_opt_rwpath:
+			params->rwpath = 1;
+			break;
+
+		case afs_opt_vol:
+			*devname = args[0].from;
+			break;
+
+		case afs_opt_autocell:
+			params->autocell = 1;
+			break;
+
+		default:
+			printk(KERN_ERR "kAFS:"
+			       " Unknown or invalid mount option: '%s'\n", p);
+			return -EINVAL;
+		}
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * parse a device name to get cell name, volume name, volume type and R/W
+ * selector
+ * - this can be one of the following:
+ *	"%[cell:]volume[.]"		R/W volume
+ *	"#[cell:]volume[.]"		R/O or R/W volume (rwpath=0),
+ *					 or R/W (rwpath=1) volume
+ *	"%[cell:]volume.readonly"	R/O volume
+ *	"#[cell:]volume.readonly"	R/O volume
+ *	"%[cell:]volume.backup"		Backup volume
+ *	"#[cell:]volume.backup"		Backup volume
+ */
+static int afs_parse_device_name(struct afs_mount_params *params,
+				 const char *name)
+{
+	struct afs_cell *cell;
+	const char *cellname, *suffix;
+	int cellnamesz;
+
+	_enter(",%s", name);
+
+	if (!name) {
+		printk(KERN_ERR "kAFS: no volume name specified\n");
+		return -EINVAL;
+	}
+
+	if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+		printk(KERN_ERR "kAFS: unparsable volume name\n");
+		return -EINVAL;
+	}
+
+	/* determine the type of volume we're looking for */
+	params->type = AFSVL_ROVOL;
+	params->force = false;
+	if (params->rwpath || name[0] == '%') {
+		params->type = AFSVL_RWVOL;
+		params->force = true;
+	}
+	name++;
+
+	/* split the cell name out if there is one */
+	params->volname = strchr(name, ':');
+	if (params->volname) {
+		cellname = name;
+		cellnamesz = params->volname - name;
+		params->volname++;
+	} else {
+		params->volname = name;
+		cellname = NULL;
+		cellnamesz = 0;
+	}
+
+	/* the volume type is further affected by a possible suffix */
+	suffix = strrchr(params->volname, '.');
+	if (suffix) {
+		if (strcmp(suffix, ".readonly") == 0) {
+			params->type = AFSVL_ROVOL;
+			params->force = true;
+		} else if (strcmp(suffix, ".backup") == 0) {
+			params->type = AFSVL_BACKVOL;
+			params->force = true;
+		} else if (suffix[1] == 0) {
+		} else {
+			suffix = NULL;
+		}
+	}
+
+	params->volnamesz = suffix ?
+		suffix - params->volname : strlen(params->volname);
+
+	_debug("cell %*.*s [%p]",
+	       cellnamesz, cellnamesz, cellname ?: "", params->cell);
+
+	/* lookup the cell record */
+	if (cellname || !params->cell) {
+		cell = afs_cell_lookup(cellname, cellnamesz, true);
+		if (IS_ERR(cell)) {
+			printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+			       cellnamesz, cellnamesz, cellname ?: "");
+			return PTR_ERR(cell);
+		}
+		afs_put_cell(params->cell);
+		params->cell = cell;
+	}
+
+	_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
+	       params->cell->name, params->cell,
+	       params->volnamesz, params->volnamesz, params->volname,
+	       suffix ?: "-", params->type, params->force ? " FORCE" : "");
+
+	return 0;
+}
+
+/*
+ * check a superblock to see if it's the one we're looking for
+ */
+static int afs_test_super(struct super_block *sb, void *data)
+{
+	struct afs_super_info *as1 = data;
+	struct afs_super_info *as = sb->s_fs_info;
+
+	return as->volume == as1->volume;
+}
+
+static int afs_set_super(struct super_block *sb, void *data)
+{
+	sb->s_fs_info = data;
+	return set_anon_super(sb, NULL);
+}
+
+/*
+ * fill in the superblock
+ */
+static int afs_fill_super(struct super_block *sb,
+			  struct afs_mount_params *params)
+{
+	struct afs_super_info *as = sb->s_fs_info;
+	struct afs_fid fid;
+	struct inode *inode = NULL;
+	int ret;
+
+	_enter("");
+
+	/* fill in the superblock */
+	sb->s_blocksize		= PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_magic		= AFS_FS_MAGIC;
+	sb->s_op		= &afs_super_ops;
+	sb->s_bdi		= &as->volume->bdi;
+	strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
+
+	/* allocate the root inode and dentry */
+	fid.vid		= as->volume->vid;
+	fid.vnode	= 1;
+	fid.unique	= 1;
+	inode = afs_iget(sb, params->key, &fid, NULL, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (params->autocell)
+		set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
+
+	ret = -ENOMEM;
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		goto error;
+
+	sb->s_d_op = &afs_fs_dentry_operations;
+
+	_leave(" = 0");
+	return 0;
+
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * get an AFS superblock
+ */
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *options)
+{
+	struct afs_mount_params params;
+	struct super_block *sb;
+	struct afs_volume *vol;
+	struct key *key;
+	char *new_opts = kstrdup(options, GFP_KERNEL);
+	struct afs_super_info *as;
+	int ret;
+
+	_enter(",,%s,%p", dev_name, options);
+
+	memset(&params, 0, sizeof(params));
+
+	ret = -EINVAL;
+	if (current->nsproxy->net_ns != &init_net)
+		goto error;
+
+	/* parse the options and device name */
+	if (options) {
+		ret = afs_parse_options(&params, options, &dev_name);
+		if (ret < 0)
+			goto error;
+	}
+
+	ret = afs_parse_device_name(&params, dev_name);
+	if (ret < 0)
+		goto error;
+
+	/* try and do the mount securely */
+	key = afs_request_key(params.cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		ret = PTR_ERR(key);
+		goto error;
+	}
+	params.key = key;
+
+	/* parse the device name */
+	vol = afs_volume_lookup(&params);
+	if (IS_ERR(vol)) {
+		ret = PTR_ERR(vol);
+		goto error;
+	}
+
+	/* allocate a superblock info record */
+	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
+	if (!as) {
+		ret = -ENOMEM;
+		afs_put_volume(vol);
+		goto error;
+	}
+	as->volume = vol;
+
+	/* allocate a deviceless superblock */
+	sb = sget(fs_type, afs_test_super, afs_set_super, flags, as);
+	if (IS_ERR(sb)) {
+		ret = PTR_ERR(sb);
+		afs_put_volume(vol);
+		kfree(as);
+		goto error;
+	}
+
+	if (!sb->s_root) {
+		/* initial superblock/root creation */
+		_debug("create");
+		ret = afs_fill_super(sb, &params);
+		if (ret < 0) {
+			deactivate_locked_super(sb);
+			goto error;
+		}
+		save_mount_options(sb, new_opts);
+		sb->s_flags |= MS_ACTIVE;
+	} else {
+		_debug("reuse");
+		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+		afs_put_volume(vol);
+		kfree(as);
+	}
+
+	afs_put_cell(params.cell);
+	kfree(new_opts);
+	_leave(" = 0 [%p]", sb);
+	return dget(sb->s_root);
+
+error:
+	afs_put_cell(params.cell);
+	key_put(params.key);
+	kfree(new_opts);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+static void afs_kill_super(struct super_block *sb)
+{
+	struct afs_super_info *as = sb->s_fs_info;
+	kill_anon_super(sb);
+	afs_put_volume(as->volume);
+	kfree(as);
+}
+
+/*
+ * initialise an inode cache slab element prior to any use
+ */
+static void afs_i_init_once(void *_vnode)
+{
+	struct afs_vnode *vnode = _vnode;
+
+	memset(vnode, 0, sizeof(*vnode));
+	inode_init_once(&vnode->vfs_inode);
+	init_waitqueue_head(&vnode->update_waitq);
+	mutex_init(&vnode->permits_lock);
+	mutex_init(&vnode->validate_lock);
+	spin_lock_init(&vnode->writeback_lock);
+	spin_lock_init(&vnode->lock);
+	INIT_LIST_HEAD(&vnode->writebacks);
+	INIT_LIST_HEAD(&vnode->pending_locks);
+	INIT_LIST_HEAD(&vnode->granted_locks);
+	INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
+	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
+}
+
+/*
+ * allocate an AFS inode struct from our slab cache
+ */
+static struct inode *afs_alloc_inode(struct super_block *sb)
+{
+	struct afs_vnode *vnode;
+
+	vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+	if (!vnode)
+		return NULL;
+
+	atomic_inc(&afs_count_active_inodes);
+
+	memset(&vnode->fid, 0, sizeof(vnode->fid));
+	memset(&vnode->status, 0, sizeof(vnode->status));
+
+	vnode->volume		= NULL;
+	vnode->update_cnt	= 0;
+	vnode->flags		= 1 << AFS_VNODE_UNSET;
+	vnode->cb_promised	= false;
+
+	_leave(" = %p", &vnode->vfs_inode);
+	return &vnode->vfs_inode;
+}
+
+static void afs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	kmem_cache_free(afs_inode_cachep, vnode);
+}
+
+/*
+ * destroy an AFS inode struct
+ */
+static void afs_destroy_inode(struct inode *inode)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+
+	_enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode);
+
+	_debug("DESTROY INODE %p", inode);
+
+	ASSERTCMP(vnode->server, ==, NULL);
+
+	call_rcu(&inode->i_rcu, afs_i_callback);
+	atomic_dec(&afs_count_active_inodes);
+}
+
+/*
+ * return information about an AFS volume
+ */
+static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct afs_volume_status vs;
+	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+	struct key *key;
+	int ret;
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	ret = afs_vnode_get_volume_status(vnode, key, &vs);
+	key_put(key);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	buf->f_type	= dentry->d_sb->s_magic;
+	buf->f_bsize	= AFS_BLOCK_SIZE;
+	buf->f_namelen	= AFSNAMEMAX - 1;
+
+	if (vs.max_quota == 0)
+		buf->f_blocks = vs.part_max_blocks;
+	else
+		buf->f_blocks = vs.max_quota;
+	buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use;
+	return 0;
+}
diff --git a/kernel/fs/afs/vlclient.c b/kernel/fs/afs/vlclient.c
new file mode 100644
index 000000000..340afd0cd
--- /dev/null
+++ b/kernel/fs/afs/vlclient.c
@@ -0,0 +1,219 @@
+/* AFS Volume Location Service client
+ *
+ * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+/*
+ * map volume locator abort codes to error codes
+ */
+static int afs_vl_abort_to_error(u32 abort_code)
+{
+	_enter("%u", abort_code);
+
+	switch (abort_code) {
+	case AFSVL_IDEXIST:		return -EEXIST;
+	case AFSVL_IO:			return -EREMOTEIO;
+	case AFSVL_NAMEEXIST:		return -EEXIST;
+	case AFSVL_CREATEFAIL:		return -EREMOTEIO;
+	case AFSVL_NOENT:		return -ENOMEDIUM;
+	case AFSVL_EMPTY:		return -ENOMEDIUM;
+	case AFSVL_ENTDELETED:		return -ENOMEDIUM;
+	case AFSVL_BADNAME:		return -EINVAL;
+	case AFSVL_BADINDEX:		return -EINVAL;
+	case AFSVL_BADVOLTYPE:		return -EINVAL;
+	case AFSVL_BADSERVER:		return -EINVAL;
+	case AFSVL_BADPARTITION:	return -EINVAL;
+	case AFSVL_REPSFULL:		return -EFBIG;
+	case AFSVL_NOREPSERVER:		return -ENOENT;
+	case AFSVL_DUPREPSERVER:	return -EEXIST;
+	case AFSVL_RWNOTFOUND:		return -ENOENT;
+	case AFSVL_BADREFCOUNT:		return -EINVAL;
+	case AFSVL_SIZEEXCEEDED:	return -EINVAL;
+	case AFSVL_BADENTRY:		return -EINVAL;
+	case AFSVL_BADVOLIDBUMP:	return -EINVAL;
+	case AFSVL_IDALREADYHASHED:	return -EINVAL;
+	case AFSVL_ENTRYLOCKED:		return -EBUSY;
+	case AFSVL_BADVOLOPER:		return -EBADRQC;
+	case AFSVL_BADRELLOCKTYPE:	return -EINVAL;
+	case AFSVL_RERELEASE:		return -EREMOTEIO;
+	case AFSVL_BADSERVERFLAG:	return -EINVAL;
+	case AFSVL_PERM:		return -EACCES;
+	case AFSVL_NOMEM:		return -EREMOTEIO;
+	default:
+		return afs_abort_to_error(abort_code);
+	}
+}
+
+/*
+ * deliver reply data to a VL.GetEntryByXXX call
+ */
+static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
+					   struct sk_buff *skb, bool last)
+{
+	struct afs_cache_vlocation *entry;
+	__be32 *bp;
+	u32 tmp;
+	int loop;
+
+	_enter(",,%u", last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	entry = call->reply;
+	bp = call->buffer;
+
+	for (loop = 0; loop < 64; loop++)
+		entry->name[loop] = ntohl(*bp++);
+	entry->name[loop] = 0;
+	bp++; /* final NUL */
+
+	bp++; /* type */
+	entry->nservers = ntohl(*bp++);
+
+	for (loop = 0; loop < 8; loop++)
+		entry->servers[loop].s_addr = *bp++;
+
+	bp += 8; /* partition IDs */
+
+	for (loop = 0; loop < 8; loop++) {
+		tmp = ntohl(*bp++);
+		entry->srvtmask[loop] = 0;
+		if (tmp & AFS_VLSF_RWVOL)
+			entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
+		if (tmp & AFS_VLSF_ROVOL)
+			entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
+		if (tmp & AFS_VLSF_BACKVOL)
+			entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
+	}
+
+	entry->vid[0] = ntohl(*bp++);
+	entry->vid[1] = ntohl(*bp++);
+	entry->vid[2] = ntohl(*bp++);
+
+	bp++; /* clone ID */
+
+	tmp = ntohl(*bp++); /* flags */
+	entry->vidmask = 0;
+	if (tmp & AFS_VLF_RWEXISTS)
+		entry->vidmask |= AFS_VOL_VTM_RW;
+	if (tmp & AFS_VLF_ROEXISTS)
+		entry->vidmask |= AFS_VOL_VTM_RO;
+	if (tmp & AFS_VLF_BACKEXISTS)
+		entry->vidmask |= AFS_VOL_VTM_BAK;
+	if (!entry->vidmask)
+		return -EBADMSG;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * VL.GetEntryByName operation type
+ */
+static const struct afs_call_type afs_RXVLGetEntryByName = {
+	.name		= "VL.GetEntryByName",
+	.deliver	= afs_deliver_vl_get_entry_by_xxx,
+	.abort_to_error	= afs_vl_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * VL.GetEntryById operation type
+ */
+static const struct afs_call_type afs_RXVLGetEntryById = {
+	.name		= "VL.GetEntryById",
+	.deliver	= afs_deliver_vl_get_entry_by_xxx,
+	.abort_to_error	= afs_vl_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * dispatch a get volume entry by name operation
+ */
+int afs_vl_get_entry_by_name(struct in_addr *addr,
+			     struct key *key,
+			     const char *volname,
+			     struct afs_cache_vlocation *entry,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t volnamesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	volnamesz = strlen(volname);
+	padsz = (4 - (volnamesz & 3)) & 3;
+	reqsz = 8 + volnamesz + padsz;
+
+	call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = entry;
+	call->service_id = VL_SERVICE;
+	call->port = htons(AFS_VL_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(VLGETENTRYBYNAME);
+	*bp++ = htonl(volnamesz);
+	memcpy(bp, volname, volnamesz);
+	if (padsz > 0)
+		memset((void *) bp + volnamesz, 0, padsz);
+
+	/* initiate the call */
+	return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+}
+
+/*
+ * dispatch a get volume entry by ID operation
+ */
+int afs_vl_get_entry_by_id(struct in_addr *addr,
+			   struct key *key,
+			   afs_volid_t volid,
+			   afs_voltype_t voltype,
+			   struct afs_cache_vlocation *entry,
+			   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = entry;
+	call->service_id = VL_SERVICE;
+	call->port = htons(AFS_VL_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(VLGETENTRYBYID);
+	*bp++ = htonl(volid);
+	*bp   = htonl(voltype);
+
+	/* initiate the call */
+	return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+}
diff --git a/kernel/fs/afs/vlocation.c b/kernel/fs/afs/vlocation.c
new file mode 100644
index 000000000..52976785a
--- /dev/null
+++ b/kernel/fs/afs/vlocation.c
@@ -0,0 +1,718 @@
+/* AFS volume location management
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static unsigned afs_vlocation_timeout = 10;	/* volume location timeout in seconds */
+static unsigned afs_vlocation_update_timeout = 10 * 60;
+
+static void afs_vlocation_reaper(struct work_struct *);
+static void afs_vlocation_updater(struct work_struct *);
+
+static LIST_HEAD(afs_vlocation_updates);
+static LIST_HEAD(afs_vlocation_graveyard);
+static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
+static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
+static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
+static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
+static struct workqueue_struct *afs_vlocation_update_worker;
+
+/*
+ * iterate through the VL servers in a cell until one of them admits knowing
+ * about the volume in question
+ */
+static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
+					   struct key *key,
+					   struct afs_cache_vlocation *vldb)
+{
+	struct afs_cell *cell = vl->cell;
+	struct in_addr addr;
+	int count, ret;
+
+	_enter("%s,%s", cell->name, vl->vldb.name);
+
+	down_write(&vl->cell->vl_sem);
+	ret = -ENOMEDIUM;
+	for (count = cell->vl_naddrs; count > 0; count--) {
+		addr = cell->vl_addrs[cell->vl_curr_svix];
+
+		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
+
+		/* attempt to access the VL server */
+		ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
+					       &afs_sync_call);
+		switch (ret) {
+		case 0:
+			goto out;
+		case -ENOMEM:
+		case -ENONET:
+		case -ENETUNREACH:
+		case -EHOSTUNREACH:
+		case -ECONNREFUSED:
+			if (ret == -ENOMEM || ret == -ENONET)
+				goto out;
+			goto rotate;
+		case -ENOMEDIUM:
+		case -EKEYREJECTED:
+		case -EKEYEXPIRED:
+			goto out;
+		default:
+			ret = -EIO;
+			goto rotate;
+		}
+
+		/* rotate the server records upon lookup failure */
+	rotate:
+		cell->vl_curr_svix++;
+		cell->vl_curr_svix %= cell->vl_naddrs;
+	}
+
+out:
+	up_write(&vl->cell->vl_sem);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * iterate through the VL servers in a cell until one of them admits knowing
+ * about the volume in question
+ */
+static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
+					 struct key *key,
+					 afs_volid_t volid,
+					 afs_voltype_t voltype,
+					 struct afs_cache_vlocation *vldb)
+{
+	struct afs_cell *cell = vl->cell;
+	struct in_addr addr;
+	int count, ret;
+
+	_enter("%s,%x,%d,", cell->name, volid, voltype);
+
+	down_write(&vl->cell->vl_sem);
+	ret = -ENOMEDIUM;
+	for (count = cell->vl_naddrs; count > 0; count--) {
+		addr = cell->vl_addrs[cell->vl_curr_svix];
+
+		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
+
+		/* attempt to access the VL server */
+		ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
+					     &afs_sync_call);
+		switch (ret) {
+		case 0:
+			goto out;
+		case -ENOMEM:
+		case -ENONET:
+		case -ENETUNREACH:
+		case -EHOSTUNREACH:
+		case -ECONNREFUSED:
+			if (ret == -ENOMEM || ret == -ENONET)
+				goto out;
+			goto rotate;
+		case -EBUSY:
+			vl->upd_busy_cnt++;
+			if (vl->upd_busy_cnt <= 3) {
+				if (vl->upd_busy_cnt > 1) {
+					/* second+ BUSY - sleep a little bit */
+					set_current_state(TASK_UNINTERRUPTIBLE);
+					schedule_timeout(1);
+				}
+				continue;
+			}
+			break;
+		case -ENOMEDIUM:
+			vl->upd_rej_cnt++;
+			goto rotate;
+		default:
+			ret = -EIO;
+			goto rotate;
+		}
+
+		/* rotate the server records upon lookup failure */
+	rotate:
+		cell->vl_curr_svix++;
+		cell->vl_curr_svix %= cell->vl_naddrs;
+		vl->upd_busy_cnt = 0;
+	}
+
+out:
+	if (ret < 0 && vl->upd_rej_cnt > 0) {
+		printk(KERN_NOTICE "kAFS:"
+		       " Active volume no longer valid '%s'\n",
+		       vl->vldb.name);
+		vl->valid = 0;
+		ret = -ENOMEDIUM;
+	}
+
+	up_write(&vl->cell->vl_sem);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * allocate a volume location record
+ */
+static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
+						 const char *name,
+						 size_t namesz)
+{
+	struct afs_vlocation *vl;
+
+	vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
+	if (vl) {
+		vl->cell = cell;
+		vl->state = AFS_VL_NEW;
+		atomic_set(&vl->usage, 1);
+		INIT_LIST_HEAD(&vl->link);
+		INIT_LIST_HEAD(&vl->grave);
+		INIT_LIST_HEAD(&vl->update);
+		init_waitqueue_head(&vl->waitq);
+		spin_lock_init(&vl->lock);
+		memcpy(vl->vldb.name, name, namesz);
+	}
+
+	_leave(" = %p", vl);
+	return vl;
+}
+
+/*
+ * update record if we found it in the cache
+ */
+static int afs_vlocation_update_record(struct afs_vlocation *vl,
+				       struct key *key,
+				       struct afs_cache_vlocation *vldb)
+{
+	afs_voltype_t voltype;
+	afs_volid_t vid;
+	int ret;
+
+	/* try to look up a cached volume in the cell VL databases by ID */
+	_debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
+	       vl->vldb.name,
+	       vl->vldb.vidmask,
+	       ntohl(vl->vldb.servers[0].s_addr),
+	       vl->vldb.srvtmask[0],
+	       ntohl(vl->vldb.servers[1].s_addr),
+	       vl->vldb.srvtmask[1],
+	       ntohl(vl->vldb.servers[2].s_addr),
+	       vl->vldb.srvtmask[2]);
+
+	_debug("Vids: %08x %08x %08x",
+	       vl->vldb.vid[0],
+	       vl->vldb.vid[1],
+	       vl->vldb.vid[2]);
+
+	if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
+		vid = vl->vldb.vid[0];
+		voltype = AFSVL_RWVOL;
+	} else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) {
+		vid = vl->vldb.vid[1];
+		voltype = AFSVL_ROVOL;
+	} else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) {
+		vid = vl->vldb.vid[2];
+		voltype = AFSVL_BACKVOL;
+	} else {
+		BUG();
+		vid = 0;
+		voltype = 0;
+	}
+
+	/* contact the server to make sure the volume is still available
+	 * - TODO: need to handle disconnected operation here
+	 */
+	ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
+	switch (ret) {
+		/* net error */
+	default:
+		printk(KERN_WARNING "kAFS:"
+		       " failed to update volume '%s' (%x) up in '%s': %d\n",
+		       vl->vldb.name, vid, vl->cell->name, ret);
+		_leave(" = %d", ret);
+		return ret;
+
+		/* pulled from local cache into memory */
+	case 0:
+		_leave(" = 0");
+		return 0;
+
+		/* uh oh... looks like the volume got deleted */
+	case -ENOMEDIUM:
+		printk(KERN_ERR "kAFS:"
+		       " volume '%s' (%x) does not exist '%s'\n",
+		       vl->vldb.name, vid, vl->cell->name);
+
+		/* TODO: make existing record unavailable */
+		_leave(" = %d", ret);
+		return ret;
+	}
+}
+
+/*
+ * apply the update to a VL record
+ */
+static void afs_vlocation_apply_update(struct afs_vlocation *vl,
+				       struct afs_cache_vlocation *vldb)
+{
+	_debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
+	       vldb->name, vldb->vidmask,
+	       ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0],
+	       ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1],
+	       ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]);
+
+	_debug("Vids: %08x %08x %08x",
+	       vldb->vid[0], vldb->vid[1], vldb->vid[2]);
+
+	if (strcmp(vldb->name, vl->vldb.name) != 0)
+		printk(KERN_NOTICE "kAFS:"
+		       " name of volume '%s' changed to '%s' on server\n",
+		       vl->vldb.name, vldb->name);
+
+	vl->vldb = *vldb;
+
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
+#endif
+}
+
+/*
+ * fill in a volume location record, consulting the cache and the VL server
+ * both
+ */
+static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
+					struct key *key)
+{
+	struct afs_cache_vlocation vldb;
+	int ret;
+
+	_enter("");
+
+	ASSERTCMP(vl->valid, ==, 0);
+
+	memset(&vldb, 0, sizeof(vldb));
+
+	/* see if we have an in-cache copy (will set vl->valid if there is) */
+#ifdef CONFIG_AFS_FSCACHE
+	vl->cache = fscache_acquire_cookie(vl->cell->cache,
+					   &afs_vlocation_cache_index_def, vl,
+					   true);
+#endif
+
+	if (vl->valid) {
+		/* try to update a known volume in the cell VL databases by
+		 * ID as the name may have changed */
+		_debug("found in cache");
+		ret = afs_vlocation_update_record(vl, key, &vldb);
+	} else {
+		/* try to look up an unknown volume in the cell VL databases by
+		 * name */
+		ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
+		if (ret < 0) {
+			printk("kAFS: failed to locate '%s' in cell '%s'\n",
+			       vl->vldb.name, vl->cell->name);
+			return ret;
+		}
+	}
+
+	afs_vlocation_apply_update(vl, &vldb);
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * queue a vlocation record for updates
+ */
+static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
+{
+	struct afs_vlocation *xvl;
+
+	/* wait at least 10 minutes before updating... */
+	vl->update_at = get_seconds() + afs_vlocation_update_timeout;
+
+	spin_lock(&afs_vlocation_updates_lock);
+
+	if (!list_empty(&afs_vlocation_updates)) {
+		/* ... but wait at least 1 second more than the newest record
+		 * already queued so that we don't spam the VL server suddenly
+		 * with lots of requests
+		 */
+		xvl = list_entry(afs_vlocation_updates.prev,
+				 struct afs_vlocation, update);
+		if (vl->update_at <= xvl->update_at)
+			vl->update_at = xvl->update_at + 1;
+	} else {
+		queue_delayed_work(afs_vlocation_update_worker,
+				   &afs_vlocation_update,
+				   afs_vlocation_update_timeout * HZ);
+	}
+
+	list_add_tail(&vl->update, &afs_vlocation_updates);
+	spin_unlock(&afs_vlocation_updates_lock);
+}
+
+/*
+ * lookup volume location
+ * - iterate through the VL servers in a cell until one of them admits knowing
+ *   about the volume in question
+ * - lookup in the local cache if not able to find on the VL server
+ * - insert/update in the local cache if did get a VL response
+ */
+struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
+					   struct key *key,
+					   const char *name,
+					   size_t namesz)
+{
+	struct afs_vlocation *vl;
+	int ret;
+
+	_enter("{%s},{%x},%*.*s,%zu",
+	       cell->name, key_serial(key),
+	       (int) namesz, (int) namesz, name, namesz);
+
+	if (namesz >= sizeof(vl->vldb.name)) {
+		_leave(" = -ENAMETOOLONG");
+		return ERR_PTR(-ENAMETOOLONG);
+	}
+
+	/* see if we have an in-memory copy first */
+	down_write(&cell->vl_sem);
+	spin_lock(&cell->vl_lock);
+	list_for_each_entry(vl, &cell->vl_list, link) {
+		if (vl->vldb.name[namesz] != '\0')
+			continue;
+		if (memcmp(vl->vldb.name, name, namesz) == 0)
+			goto found_in_memory;
+	}
+	spin_unlock(&cell->vl_lock);
+
+	/* not in the cell's in-memory lists - create a new record */
+	vl = afs_vlocation_alloc(cell, name, namesz);
+	if (!vl) {
+		up_write(&cell->vl_sem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	afs_get_cell(cell);
+
+	list_add_tail(&vl->link, &cell->vl_list);
+	vl->state = AFS_VL_CREATING;
+	up_write(&cell->vl_sem);
+
+fill_in_record:
+	ret = afs_vlocation_fill_in_record(vl, key);
+	if (ret < 0)
+		goto error_abandon;
+	spin_lock(&vl->lock);
+	vl->state = AFS_VL_VALID;
+	spin_unlock(&vl->lock);
+	wake_up(&vl->waitq);
+
+	/* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
+#endif
+
+	/* schedule for regular updates */
+	afs_vlocation_queue_for_updates(vl);
+	goto success;
+
+found_in_memory:
+	/* found in memory */
+	_debug("found in memory");
+	atomic_inc(&vl->usage);
+	spin_unlock(&cell->vl_lock);
+	if (!list_empty(&vl->grave)) {
+		spin_lock(&afs_vlocation_graveyard_lock);
+		list_del_init(&vl->grave);
+		spin_unlock(&afs_vlocation_graveyard_lock);
+	}
+	up_write(&cell->vl_sem);
+
+	/* see if it was an abandoned record that we might try filling in */
+	spin_lock(&vl->lock);
+	while (vl->state != AFS_VL_VALID) {
+		afs_vlocation_state_t state = vl->state;
+
+		_debug("invalid [state %d]", state);
+
+		if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) {
+			vl->state = AFS_VL_CREATING;
+			spin_unlock(&vl->lock);
+			goto fill_in_record;
+		}
+
+		/* must now wait for creation or update by someone else to
+		 * complete */
+		_debug("wait");
+
+		spin_unlock(&vl->lock);
+		ret = wait_event_interruptible(vl->waitq,
+					       vl->state == AFS_VL_NEW ||
+					       vl->state == AFS_VL_VALID ||
+					       vl->state == AFS_VL_NO_VOLUME);
+		if (ret < 0)
+			goto error;
+		spin_lock(&vl->lock);
+	}
+	spin_unlock(&vl->lock);
+
+success:
+	_leave(" = %p", vl);
+	return vl;
+
+error_abandon:
+	spin_lock(&vl->lock);
+	vl->state = AFS_VL_NEW;
+	spin_unlock(&vl->lock);
+	wake_up(&vl->waitq);
+error:
+	ASSERT(vl != NULL);
+	afs_put_vlocation(vl);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * finish using a volume location record
+ */
+void afs_put_vlocation(struct afs_vlocation *vl)
+{
+	if (!vl)
+		return;
+
+	_enter("%s", vl->vldb.name);
+
+	ASSERTCMP(atomic_read(&vl->usage), >, 0);
+
+	if (likely(!atomic_dec_and_test(&vl->usage))) {
+		_leave("");
+		return;
+	}
+
+	spin_lock(&afs_vlocation_graveyard_lock);
+	if (atomic_read(&vl->usage) == 0) {
+		_debug("buried");
+		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
+		vl->time_of_death = get_seconds();
+		queue_delayed_work(afs_wq, &afs_vlocation_reap,
+				   afs_vlocation_timeout * HZ);
+
+		/* suspend updates on this record */
+		if (!list_empty(&vl->update)) {
+			spin_lock(&afs_vlocation_updates_lock);
+			list_del_init(&vl->update);
+			spin_unlock(&afs_vlocation_updates_lock);
+		}
+	}
+	spin_unlock(&afs_vlocation_graveyard_lock);
+	_leave(" [killed?]");
+}
+
+/*
+ * destroy a dead volume location record
+ */
+static void afs_vlocation_destroy(struct afs_vlocation *vl)
+{
+	_enter("%p", vl);
+
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vl->cache, 0);
+#endif
+	afs_put_cell(vl->cell);
+	kfree(vl);
+}
+
+/*
+ * reap dead volume location records
+ */
+static void afs_vlocation_reaper(struct work_struct *work)
+{
+	LIST_HEAD(corpses);
+	struct afs_vlocation *vl;
+	unsigned long delay, expiry;
+	time_t now;
+
+	_enter("");
+
+	now = get_seconds();
+	spin_lock(&afs_vlocation_graveyard_lock);
+
+	while (!list_empty(&afs_vlocation_graveyard)) {
+		vl = list_entry(afs_vlocation_graveyard.next,
+				struct afs_vlocation, grave);
+
+		_debug("check %p", vl);
+
+		/* the queue is ordered most dead first */
+		expiry = vl->time_of_death + afs_vlocation_timeout;
+		if (expiry > now) {
+			delay = (expiry - now) * HZ;
+			_debug("delay %lu", delay);
+			mod_delayed_work(afs_wq, &afs_vlocation_reap, delay);
+			break;
+		}
+
+		spin_lock(&vl->cell->vl_lock);
+		if (atomic_read(&vl->usage) > 0) {
+			_debug("no reap");
+			list_del_init(&vl->grave);
+		} else {
+			_debug("reap");
+			list_move_tail(&vl->grave, &corpses);
+			list_del_init(&vl->link);
+		}
+		spin_unlock(&vl->cell->vl_lock);
+	}
+
+	spin_unlock(&afs_vlocation_graveyard_lock);
+
+	/* now reap the corpses we've extracted */
+	while (!list_empty(&corpses)) {
+		vl = list_entry(corpses.next, struct afs_vlocation, grave);
+		list_del(&vl->grave);
+		afs_vlocation_destroy(vl);
+	}
+
+	_leave("");
+}
+
+/*
+ * initialise the VL update process
+ */
+int __init afs_vlocation_update_init(void)
+{
+	afs_vlocation_update_worker =
+		create_singlethread_workqueue("kafs_vlupdated");
+	return afs_vlocation_update_worker ? 0 : -ENOMEM;
+}
+
+/*
+ * discard all the volume location records for rmmod
+ */
+void afs_vlocation_purge(void)
+{
+	afs_vlocation_timeout = 0;
+
+	spin_lock(&afs_vlocation_updates_lock);
+	list_del_init(&afs_vlocation_updates);
+	spin_unlock(&afs_vlocation_updates_lock);
+	mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0);
+	destroy_workqueue(afs_vlocation_update_worker);
+
+	mod_delayed_work(afs_wq, &afs_vlocation_reap, 0);
+}
+
+/*
+ * update a volume location
+ */
+static void afs_vlocation_updater(struct work_struct *work)
+{
+	struct afs_cache_vlocation vldb;
+	struct afs_vlocation *vl, *xvl;
+	time_t now;
+	long timeout;
+	int ret;
+
+	_enter("");
+
+	now = get_seconds();
+
+	/* find a record to update */
+	spin_lock(&afs_vlocation_updates_lock);
+	for (;;) {
+		if (list_empty(&afs_vlocation_updates)) {
+			spin_unlock(&afs_vlocation_updates_lock);
+			_leave(" [nothing]");
+			return;
+		}
+
+		vl = list_entry(afs_vlocation_updates.next,
+				struct afs_vlocation, update);
+		if (atomic_read(&vl->usage) > 0)
+			break;
+		list_del_init(&vl->update);
+	}
+
+	timeout = vl->update_at - now;
+	if (timeout > 0) {
+		queue_delayed_work(afs_vlocation_update_worker,
+				   &afs_vlocation_update, timeout * HZ);
+		spin_unlock(&afs_vlocation_updates_lock);
+		_leave(" [nothing]");
+		return;
+	}
+
+	list_del_init(&vl->update);
+	atomic_inc(&vl->usage);
+	spin_unlock(&afs_vlocation_updates_lock);
+
+	/* we can now perform the update */
+	_debug("update %s", vl->vldb.name);
+	vl->state = AFS_VL_UPDATING;
+	vl->upd_rej_cnt = 0;
+	vl->upd_busy_cnt = 0;
+
+	ret = afs_vlocation_update_record(vl, NULL, &vldb);
+	spin_lock(&vl->lock);
+	switch (ret) {
+	case 0:
+		afs_vlocation_apply_update(vl, &vldb);
+		vl->state = AFS_VL_VALID;
+		break;
+	case -ENOMEDIUM:
+		vl->state = AFS_VL_VOLUME_DELETED;
+		break;
+	default:
+		vl->state = AFS_VL_UNCERTAIN;
+		break;
+	}
+	spin_unlock(&vl->lock);
+	wake_up(&vl->waitq);
+
+	/* and then reschedule */
+	_debug("reschedule");
+	vl->update_at = get_seconds() + afs_vlocation_update_timeout;
+
+	spin_lock(&afs_vlocation_updates_lock);
+
+	if (!list_empty(&afs_vlocation_updates)) {
+		/* next update in 10 minutes, but wait at least 1 second more
+		 * than the newest record already queued so that we don't spam
+		 * the VL server suddenly with lots of requests
+		 */
+		xvl = list_entry(afs_vlocation_updates.prev,
+				 struct afs_vlocation, update);
+		if (vl->update_at <= xvl->update_at)
+			vl->update_at = xvl->update_at + 1;
+		xvl = list_entry(afs_vlocation_updates.next,
+				 struct afs_vlocation, update);
+		timeout = xvl->update_at - now;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		timeout = afs_vlocation_update_timeout;
+	}
+
+	ASSERT(list_empty(&vl->update));
+
+	list_add_tail(&vl->update, &afs_vlocation_updates);
+
+	_debug("timeout %ld", timeout);
+	queue_delayed_work(afs_vlocation_update_worker,
+			   &afs_vlocation_update, timeout * HZ);
+	spin_unlock(&afs_vlocation_updates_lock);
+	afs_put_vlocation(vl);
+}
diff --git a/kernel/fs/afs/vnode.c b/kernel/fs/afs/vnode.c
new file mode 100644
index 000000000..25cf4c3f4
--- /dev/null
+++ b/kernel/fs/afs/vnode.c
@@ -0,0 +1,1025 @@
+/* AFS vnode management
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+#if 0
+static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent,
+				   int depth, char lr)
+{
+	struct afs_vnode *vnode;
+	bool bad = false;
+
+	if (!node)
+		return false;
+
+	if (node->rb_left)
+		bad = dump_tree_aux(node->rb_left, node, depth + 2, '/');
+
+	vnode = rb_entry(node, struct afs_vnode, cb_promise);
+	_debug("%c %*.*s%c%p {%d}",
+	       rb_is_red(node) ? 'R' : 'B',
+	       depth, depth, "", lr,
+	       vnode, vnode->cb_expires_at);
+	if (rb_parent(node) != parent) {
+		printk("BAD: %p != %p\n", rb_parent(node), parent);
+		bad = true;
+	}
+
+	if (node->rb_right)
+		bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\');
+
+	return bad;
+}
+
+static noinline void dump_tree(const char *name, struct afs_server *server)
+{
+	_enter("%s", name);
+	if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-'))
+		BUG();
+}
+#endif
+
+/*
+ * insert a vnode into the backing server's vnode tree
+ */
+static void afs_install_vnode(struct afs_vnode *vnode,
+			      struct afs_server *server)
+{
+	struct afs_server *old_server = vnode->server;
+	struct afs_vnode *xvnode;
+	struct rb_node *parent, **p;
+
+	_enter("%p,%p", vnode, server);
+
+	if (old_server) {
+		spin_lock(&old_server->fs_lock);
+		rb_erase(&vnode->server_rb, &old_server->fs_vnodes);
+		spin_unlock(&old_server->fs_lock);
+	}
+
+	afs_get_server(server);
+	vnode->server = server;
+	afs_put_server(old_server);
+
+	/* insert into the server's vnode tree in FID order */
+	spin_lock(&server->fs_lock);
+
+	parent = NULL;
+	p = &server->fs_vnodes.rb_node;
+	while (*p) {
+		parent = *p;
+		xvnode = rb_entry(parent, struct afs_vnode, server_rb);
+		if (vnode->fid.vid < xvnode->fid.vid)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.vid > xvnode->fid.vid)
+			p = &(*p)->rb_right;
+		else if (vnode->fid.vnode < xvnode->fid.vnode)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.vnode > xvnode->fid.vnode)
+			p = &(*p)->rb_right;
+		else if (vnode->fid.unique < xvnode->fid.unique)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.unique > xvnode->fid.unique)
+			p = &(*p)->rb_right;
+		else
+			BUG(); /* can't happen unless afs_iget() malfunctions */
+	}
+
+	rb_link_node(&vnode->server_rb, parent, p);
+	rb_insert_color(&vnode->server_rb, &server->fs_vnodes);
+
+	spin_unlock(&server->fs_lock);
+	_leave("");
+}
+
+/*
+ * insert a vnode into the promising server's update/expiration tree
+ * - caller must hold vnode->lock
+ */
+static void afs_vnode_note_promise(struct afs_vnode *vnode,
+				   struct afs_server *server)
+{
+	struct afs_server *old_server;
+	struct afs_vnode *xvnode;
+	struct rb_node *parent, **p;
+
+	_enter("%p,%p", vnode, server);
+
+	ASSERT(server != NULL);
+
+	old_server = vnode->server;
+	if (vnode->cb_promised) {
+		if (server == old_server &&
+		    vnode->cb_expires == vnode->cb_expires_at) {
+			_leave(" [no change]");
+			return;
+		}
+
+		spin_lock(&old_server->cb_lock);
+		if (vnode->cb_promised) {
+			_debug("delete");
+			rb_erase(&vnode->cb_promise, &old_server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&old_server->cb_lock);
+	}
+
+	if (vnode->server != server)
+		afs_install_vnode(vnode, server);
+
+	vnode->cb_expires_at = vnode->cb_expires;
+	_debug("PROMISE on %p {%lu}",
+	       vnode, (unsigned long) vnode->cb_expires_at);
+
+	/* abuse an RB-tree to hold the expiration order (we may have multiple
+	 * items with the same expiration time) */
+	spin_lock(&server->cb_lock);
+
+	parent = NULL;
+	p = &server->cb_promises.rb_node;
+	while (*p) {
+		parent = *p;
+		xvnode = rb_entry(parent, struct afs_vnode, cb_promise);
+		if (vnode->cb_expires_at < xvnode->cb_expires_at)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&vnode->cb_promise, parent, p);
+	rb_insert_color(&vnode->cb_promise, &server->cb_promises);
+	vnode->cb_promised = true;
+
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * handle remote file deletion by discarding the callback promise
+ */
+static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
+{
+	struct afs_server *server;
+
+	_enter("{%p}", vnode->server);
+
+	set_bit(AFS_VNODE_DELETED, &vnode->flags);
+
+	server = vnode->server;
+	if (server) {
+		if (vnode->cb_promised) {
+			spin_lock(&server->cb_lock);
+			if (vnode->cb_promised) {
+				rb_erase(&vnode->cb_promise,
+					 &server->cb_promises);
+				vnode->cb_promised = false;
+			}
+			spin_unlock(&server->cb_lock);
+		}
+
+		spin_lock(&server->fs_lock);
+		rb_erase(&vnode->server_rb, &server->fs_vnodes);
+		spin_unlock(&server->fs_lock);
+
+		vnode->server = NULL;
+		afs_put_server(server);
+	} else {
+		ASSERT(!vnode->cb_promised);
+	}
+
+	_leave("");
+}
+
+/*
+ * finish off updating the recorded status of a file after a successful
+ * operation completion
+ * - starts callback expiry timer
+ * - adds to server's callback list
+ */
+void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
+				      struct afs_server *server)
+{
+	struct afs_server *oldserver = NULL;
+
+	_enter("%p,%p", vnode, server);
+
+	spin_lock(&vnode->lock);
+	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+	afs_vnode_note_promise(vnode, server);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+
+	wake_up_all(&vnode->update_waitq);
+	afs_put_server(oldserver);
+	_leave("");
+}
+
+/*
+ * finish off updating the recorded status of a file after an operation failed
+ */
+static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
+{
+	_enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret);
+
+	spin_lock(&vnode->lock);
+
+	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+
+	if (ret == -ENOENT) {
+		/* the file was deleted on the server */
+		_debug("got NOENT from server - marking file deleted");
+		afs_vnode_deleted_remotely(vnode);
+	}
+
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+
+	wake_up_all(&vnode->update_waitq);
+	_leave("");
+}
+
+/*
+ * fetch file status from the volume
+ * - don't issue a fetch if:
+ *   - the changed bit is not set and there's a valid callback
+ *   - there are any outstanding ops that will fetch the status
+ * - TODO implement local caching
+ */
+int afs_vnode_fetch_status(struct afs_vnode *vnode,
+			   struct afs_vnode *auth_vnode, struct key *key)
+{
+	struct afs_server *server;
+	unsigned long acl_order;
+	int ret;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("%s,{%x:%u.%u}",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+
+	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    vnode->cb_promised) {
+		_leave(" [unchanged]");
+		return 0;
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		_leave(" [deleted]");
+		return -ENOENT;
+	}
+
+	acl_order = 0;
+	if (auth_vnode)
+		acl_order = auth_vnode->acl_order;
+
+	spin_lock(&vnode->lock);
+
+	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    vnode->cb_promised) {
+		spin_unlock(&vnode->lock);
+		_leave(" [unchanged]");
+		return 0;
+	}
+
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+
+	if (vnode->update_cnt > 0) {
+		/* someone else started a fetch */
+		_debug("wait on fetch %d", vnode->update_cnt);
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		ASSERT(myself.func != NULL);
+		add_wait_queue(&vnode->update_waitq, &myself);
+
+		/* wait for the status to be updated */
+		for (;;) {
+			if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
+				break;
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+				break;
+
+			/* check to see if it got updated and invalidated all
+			 * before we saw it */
+			if (vnode->update_cnt == 0) {
+				remove_wait_queue(&vnode->update_waitq,
+						  &myself);
+				set_current_state(TASK_RUNNING);
+				goto get_anyway;
+			}
+
+			spin_unlock(&vnode->lock);
+
+			schedule();
+			set_current_state(TASK_UNINTERRUPTIBLE);
+
+			spin_lock(&vnode->lock);
+		}
+
+		remove_wait_queue(&vnode->update_waitq, &myself);
+		spin_unlock(&vnode->lock);
+		set_current_state(TASK_RUNNING);
+
+		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ?
+			-ENOENT : 0;
+	}
+
+get_anyway:
+	/* okay... we're going to have to initiate the op */
+	vnode->update_cnt++;
+
+	spin_unlock(&vnode->lock);
+
+	/* merge AFS status fetches and clear outstanding callback on this
+	 * vnode */
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %p{%08x}",
+		       server, ntohl(server->addr.s_addr));
+
+		ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
+					       &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		_debug("adjust");
+		if (auth_vnode)
+			afs_cache_permit(vnode, key, acl_order);
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		_debug("failed [%d]", ret);
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * fetch file data from the volume
+ * - TODO implement caching
+ */
+int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
+			 off_t offset, size_t length, struct page *page)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,,,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	/* this op will fetch the status */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	/* merge in AFS status fetches and clear outstanding callback on this
+	 * vnode */
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_fetch_data(server, key, vnode, offset, length,
+					page, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
+
+/*
+ * make a file or a directory
+ */
+int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
+		     const char *name, umode_t mode, struct afs_fid *newfid,
+		     struct afs_file_status *newstatus,
+		     struct afs_callback *newcb, struct afs_server **_server)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%s,,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're creating in */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_create(server, key, vnode, name, mode, newfid,
+				    newstatus, newcb, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		*_server = server;
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		*_server = NULL;
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * remove a file or directory
+ */
+int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
+		     bool isdir)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%s",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're removing from */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_remove(server, key, vnode, name, isdir,
+				    &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * create a hard link
+ */
+int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+			  struct key *key, const char *name)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s",
+	       dvnode->volume->vlocation->vldb.name,
+	       dvnode->fid.vid,
+	       dvnode->fid.vnode,
+	       dvnode->fid.unique,
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're removing from */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	spin_lock(&dvnode->lock);
+	dvnode->update_cnt++;
+	spin_unlock(&dvnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(dvnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_link(server, key, dvnode, vnode, name,
+				  &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(dvnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_vnode_finalise_status_update(dvnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		afs_vnode_status_update_failed(dvnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	spin_lock(&dvnode->lock);
+	dvnode->update_cnt--;
+	ASSERTCMP(dvnode->update_cnt, >=, 0);
+	spin_unlock(&dvnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * create a symbolic link
+ */
+int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
+		      const char *name, const char *content,
+		      struct afs_fid *newfid,
+		      struct afs_file_status *newstatus,
+		      struct afs_server **_server)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%s,%s,,,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name, content);
+
+	/* this op will fetch the status on the directory we're creating in */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_symlink(server, key, vnode, name, content,
+				     newfid, newstatus, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		*_server = server;
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		*_server = NULL;
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * rename a file
+ */
+int afs_vnode_rename(struct afs_vnode *orig_dvnode,
+		     struct afs_vnode *new_dvnode,
+		     struct key *key,
+		     const char *orig_name,
+		     const char *new_name)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s",
+	       orig_dvnode->volume->vlocation->vldb.name,
+	       orig_dvnode->fid.vid,
+	       orig_dvnode->fid.vnode,
+	       orig_dvnode->fid.unique,
+	       new_dvnode->volume->vlocation->vldb.name,
+	       new_dvnode->fid.vid,
+	       new_dvnode->fid.vnode,
+	       new_dvnode->fid.unique,
+	       key_serial(key),
+	       orig_name,
+	       new_name);
+
+	/* this op will fetch the status on both the directories we're dealing
+	 * with */
+	spin_lock(&orig_dvnode->lock);
+	orig_dvnode->update_cnt++;
+	spin_unlock(&orig_dvnode->lock);
+	if (new_dvnode != orig_dvnode) {
+		spin_lock(&new_dvnode->lock);
+		new_dvnode->update_cnt++;
+		spin_unlock(&new_dvnode->lock);
+	}
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(orig_dvnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
+				    new_dvnode, new_name, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(orig_dvnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(orig_dvnode, server);
+		if (new_dvnode != orig_dvnode)
+			afs_vnode_finalise_status_update(new_dvnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(orig_dvnode, ret);
+		if (new_dvnode != orig_dvnode)
+			afs_vnode_status_update_failed(new_dvnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&orig_dvnode->lock);
+	orig_dvnode->update_cnt--;
+	ASSERTCMP(orig_dvnode->update_cnt, >=, 0);
+	spin_unlock(&orig_dvnode->lock);
+	if (new_dvnode != orig_dvnode) {
+		spin_lock(&new_dvnode->lock);
+		new_dvnode->update_cnt--;
+		ASSERTCMP(new_dvnode->update_cnt, >=, 0);
+		spin_unlock(&new_dvnode->lock);
+	}
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * write to a file
+ */
+int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last,
+			 unsigned offset, unsigned to)
+{
+	struct afs_server *server;
+	struct afs_vnode *vnode = wb->vnode;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(wb->key),
+	       first, last, offset, to);
+
+	/* this op will fetch the status */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_store_data(server, wb, first, last, offset, to,
+					&afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
+
+/*
+ * set the attributes on a file
+ */
+int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key,
+		      struct iattr *attr)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	/* this op will fetch the status */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
+
+/*
+ * get the status of a volume
+ */
+int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
+				struct afs_volume_status *vs)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * get a lock on a file
+ */
+int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key,
+		       afs_lock_type_t type)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%u",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key), type);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * extend a lock on a file
+ */
+int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * release a lock on a file
+ */
+int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
diff --git a/kernel/fs/afs/volume.c b/kernel/fs/afs/volume.c
new file mode 100644
index 000000000..d142a2449
--- /dev/null
+++ b/kernel/fs/afs/volume.c
@@ -0,0 +1,401 @@
+/* AFS volume management
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
+
+/*
+ * lookup a volume by name
+ * - this can be one of the following:
+ *	"%[cell:]volume[.]"		R/W volume
+ *	"#[cell:]volume[.]"		R/O or R/W volume (rwparent=0),
+ *					 or R/W (rwparent=1) volume
+ *	"%[cell:]volume.readonly"	R/O volume
+ *	"#[cell:]volume.readonly"	R/O volume
+ *	"%[cell:]volume.backup"		Backup volume
+ *	"#[cell:]volume.backup"		Backup volume
+ *
+ * The cell name is optional, and defaults to the current cell.
+ *
+ * See "The Rules of Mount Point Traversal" in Chapter 5 of the AFS SysAdmin
+ * Guide
+ * - Rule 1: Explicit type suffix forces access of that type or nothing
+ *           (no suffix, then use Rule 2 & 3)
+ * - Rule 2: If parent volume is R/O, then mount R/O volume by preference, R/W
+ *           if not available
+ * - Rule 3: If parent volume is R/W, then only mount R/W volume unless
+ *           explicitly told otherwise
+ */
+struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
+{
+	struct afs_vlocation *vlocation = NULL;
+	struct afs_volume *volume = NULL;
+	struct afs_server *server = NULL;
+	char srvtmask;
+	int ret, loop;
+
+	_enter("{%*.*s,%d}",
+	       params->volnamesz, params->volnamesz, params->volname, params->rwpath);
+
+	/* lookup the volume location record */
+	vlocation = afs_vlocation_lookup(params->cell, params->key,
+					 params->volname, params->volnamesz);
+	if (IS_ERR(vlocation)) {
+		ret = PTR_ERR(vlocation);
+		vlocation = NULL;
+		goto error;
+	}
+
+	/* make the final decision on the type we want */
+	ret = -ENOMEDIUM;
+	if (params->force && !(vlocation->vldb.vidmask & (1 << params->type)))
+		goto error;
+
+	srvtmask = 0;
+	for (loop = 0; loop < vlocation->vldb.nservers; loop++)
+		srvtmask |= vlocation->vldb.srvtmask[loop];
+
+	if (params->force) {
+		if (!(srvtmask & (1 << params->type)))
+			goto error;
+	} else if (srvtmask & AFS_VOL_VTM_RO) {
+		params->type = AFSVL_ROVOL;
+	} else if (srvtmask & AFS_VOL_VTM_RW) {
+		params->type = AFSVL_RWVOL;
+	} else {
+		goto error;
+	}
+
+	down_write(&params->cell->vl_sem);
+
+	/* is the volume already active? */
+	if (vlocation->vols[params->type]) {
+		/* yes - re-use it */
+		volume = vlocation->vols[params->type];
+		afs_get_volume(volume);
+		goto success;
+	}
+
+	/* create a new volume record */
+	_debug("creating new volume record");
+
+	ret = -ENOMEM;
+	volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
+	if (!volume)
+		goto error_up;
+
+	atomic_set(&volume->usage, 1);
+	volume->type		= params->type;
+	volume->type_force	= params->force;
+	volume->cell		= params->cell;
+	volume->vid		= vlocation->vldb.vid[params->type];
+
+	ret = bdi_setup_and_register(&volume->bdi, "afs");
+	if (ret)
+		goto error_bdi;
+
+	init_rwsem(&volume->server_sem);
+
+	/* look up all the applicable server records */
+	for (loop = 0; loop < 8; loop++) {
+		if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) {
+			server = afs_lookup_server(
+			       volume->cell, &vlocation->vldb.servers[loop]);
+			if (IS_ERR(server)) {
+				ret = PTR_ERR(server);
+				goto error_discard;
+			}
+
+			volume->servers[volume->nservers] = server;
+			volume->nservers++;
+		}
+	}
+
+	/* attach the cache and volume location */
+#ifdef CONFIG_AFS_FSCACHE
+	volume->cache = fscache_acquire_cookie(vlocation->cache,
+					       &afs_volume_cache_index_def,
+					       volume, true);
+#endif
+	afs_get_vlocation(vlocation);
+	volume->vlocation = vlocation;
+
+	vlocation->vols[volume->type] = volume;
+
+success:
+	_debug("kAFS selected %s volume %08x",
+	       afs_voltypes[volume->type], volume->vid);
+	up_write(&params->cell->vl_sem);
+	afs_put_vlocation(vlocation);
+	_leave(" = %p", volume);
+	return volume;
+
+	/* clean up */
+error_up:
+	up_write(&params->cell->vl_sem);
+error:
+	afs_put_vlocation(vlocation);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+
+error_discard:
+	bdi_destroy(&volume->bdi);
+error_bdi:
+	up_write(&params->cell->vl_sem);
+
+	for (loop = volume->nservers - 1; loop >= 0; loop--)
+		afs_put_server(volume->servers[loop]);
+
+	kfree(volume);
+	goto error;
+}
+
+/*
+ * destroy a volume record
+ */
+void afs_put_volume(struct afs_volume *volume)
+{
+	struct afs_vlocation *vlocation;
+	int loop;
+
+	if (!volume)
+		return;
+
+	_enter("%p", volume);
+
+	ASSERTCMP(atomic_read(&volume->usage), >, 0);
+
+	vlocation = volume->vlocation;
+
+	/* to prevent a race, the decrement and the dequeue must be effectively
+	 * atomic */
+	down_write(&vlocation->cell->vl_sem);
+
+	if (likely(!atomic_dec_and_test(&volume->usage))) {
+		up_write(&vlocation->cell->vl_sem);
+		_leave("");
+		return;
+	}
+
+	vlocation->vols[volume->type] = NULL;
+
+	up_write(&vlocation->cell->vl_sem);
+
+	/* finish cleaning up the volume */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(volume->cache, 0);
+#endif
+	afs_put_vlocation(vlocation);
+
+	for (loop = volume->nservers - 1; loop >= 0; loop--)
+		afs_put_server(volume->servers[loop]);
+
+	bdi_destroy(&volume->bdi);
+	kfree(volume);
+
+	_leave(" [destroyed]");
+}
+
+/*
+ * pick a server to use to try accessing this volume
+ * - returns with an elevated usage count on the server chosen
+ */
+struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
+{
+	struct afs_volume *volume = vnode->volume;
+	struct afs_server *server;
+	int ret, state, loop;
+
+	_enter("%s", volume->vlocation->vldb.name);
+
+	/* stick with the server we're already using if we can */
+	if (vnode->server && vnode->server->fs_state == 0) {
+		afs_get_server(vnode->server);
+		_leave(" = %p [current]", vnode->server);
+		return vnode->server;
+	}
+
+	down_read(&volume->server_sem);
+
+	/* handle the no-server case */
+	if (volume->nservers == 0) {
+		ret = volume->rjservers ? -ENOMEDIUM : -ESTALE;
+		up_read(&volume->server_sem);
+		_leave(" = %d [no servers]", ret);
+		return ERR_PTR(ret);
+	}
+
+	/* basically, just search the list for the first live server and use
+	 * that */
+	ret = 0;
+	for (loop = 0; loop < volume->nservers; loop++) {
+		server = volume->servers[loop];
+		state = server->fs_state;
+
+		_debug("consider %d [%d]", loop, state);
+
+		switch (state) {
+			/* found an apparently healthy server */
+		case 0:
+			afs_get_server(server);
+			up_read(&volume->server_sem);
+			_leave(" = %p (picked %08x)",
+			       server, ntohl(server->addr.s_addr));
+			return server;
+
+		case -ENETUNREACH:
+			if (ret == 0)
+				ret = state;
+			break;
+
+		case -EHOSTUNREACH:
+			if (ret == 0 ||
+			    ret == -ENETUNREACH)
+				ret = state;
+			break;
+
+		case -ECONNREFUSED:
+			if (ret == 0 ||
+			    ret == -ENETUNREACH ||
+			    ret == -EHOSTUNREACH)
+				ret = state;
+			break;
+
+		default:
+		case -EREMOTEIO:
+			if (ret == 0 ||
+			    ret == -ENETUNREACH ||
+			    ret == -EHOSTUNREACH ||
+			    ret == -ECONNREFUSED)
+				ret = state;
+			break;
+		}
+	}
+
+	/* no available servers
+	 * - TODO: handle the no active servers case better
+	 */
+	up_read(&volume->server_sem);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * release a server after use
+ * - releases the ref on the server struct that was acquired by picking
+ * - records result of using a particular server to access a volume
+ * - return 0 to try again, 1 if okay or to issue error
+ * - the caller must release the server struct if result was 0
+ */
+int afs_volume_release_fileserver(struct afs_vnode *vnode,
+				  struct afs_server *server,
+				  int result)
+{
+	struct afs_volume *volume = vnode->volume;
+	unsigned loop;
+
+	_enter("%s,%08x,%d",
+	       volume->vlocation->vldb.name, ntohl(server->addr.s_addr),
+	       result);
+
+	switch (result) {
+		/* success */
+	case 0:
+		server->fs_act_jif = jiffies;
+		server->fs_state = 0;
+		_leave("");
+		return 1;
+
+		/* the fileserver denied all knowledge of the volume */
+	case -ENOMEDIUM:
+		server->fs_act_jif = jiffies;
+		down_write(&volume->server_sem);
+
+		/* firstly, find where the server is in the active list (if it
+		 * is) */
+		for (loop = 0; loop < volume->nservers; loop++)
+			if (volume->servers[loop] == server)
+				goto present;
+
+		/* no longer there - may have been discarded by another op */
+		goto try_next_server_upw;
+
+	present:
+		volume->nservers--;
+		memmove(&volume->servers[loop],
+			&volume->servers[loop + 1],
+			sizeof(volume->servers[loop]) *
+			(volume->nservers - loop));
+		volume->servers[volume->nservers] = NULL;
+		afs_put_server(server);
+		volume->rjservers++;
+
+		if (volume->nservers > 0)
+			/* another server might acknowledge its existence */
+			goto try_next_server_upw;
+
+		/* handle the case where all the fileservers have rejected the
+		 * volume
+		 * - TODO: try asking the fileservers for volume information
+		 * - TODO: contact the VL server again to see if the volume is
+		 *         no longer registered
+		 */
+		up_write(&volume->server_sem);
+		afs_put_server(server);
+		_leave(" [completely rejected]");
+		return 1;
+
+		/* problem reaching the server */
+	case -ENETUNREACH:
+	case -EHOSTUNREACH:
+	case -ECONNREFUSED:
+	case -ETIME:
+	case -ETIMEDOUT:
+	case -EREMOTEIO:
+		/* mark the server as dead
+		 * TODO: vary dead timeout depending on error
+		 */
+		spin_lock(&server->fs_lock);
+		if (!server->fs_state) {
+			server->fs_dead_jif = jiffies + HZ * 10;
+			server->fs_state = result;
+			printk("kAFS: SERVER DEAD state=%d\n", result);
+		}
+		spin_unlock(&server->fs_lock);
+		goto try_next_server;
+
+		/* miscellaneous error */
+	default:
+		server->fs_act_jif = jiffies;
+	case -ENOMEM:
+	case -ENONET:
+		/* tell the caller to accept the result */
+		afs_put_server(server);
+		_leave(" [local failure]");
+		return 1;
+	}
+
+	/* tell the caller to loop around and try the next server */
+try_next_server_upw:
+	up_write(&volume->server_sem);
+try_next_server:
+	afs_put_server(server);
+	_leave(" [try next server]");
+	return 0;
+}
diff --git a/kernel/fs/afs/write.c b/kernel/fs/afs/write.c
new file mode 100644
index 000000000..0714abcd7
--- /dev/null
+++ b/kernel/fs/afs/write.c
@@ -0,0 +1,761 @@
+/* handling of writes to regular files and writing back to the server
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/backing-dev.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+static int afs_write_back_from_locked_page(struct afs_writeback *wb,
+					   struct page *page);
+
+/*
+ * mark a page as having been made dirty and thus needing writeback
+ */
+int afs_set_page_dirty(struct page *page)
+{
+	_enter("");
+	return __set_page_dirty_nobuffers(page);
+}
+
+/*
+ * unlink a writeback record because its usage has reached zero
+ * - must be called with the wb->vnode->writeback_lock held
+ */
+static void afs_unlink_writeback(struct afs_writeback *wb)
+{
+	struct afs_writeback *front;
+	struct afs_vnode *vnode = wb->vnode;
+
+	list_del_init(&wb->link);
+	if (!list_empty(&vnode->writebacks)) {
+		/* if an fsync rises to the front of the queue then wake it
+		 * up */
+		front = list_entry(vnode->writebacks.next,
+				   struct afs_writeback, link);
+		if (front->state == AFS_WBACK_SYNCING) {
+			_debug("wake up sync");
+			front->state = AFS_WBACK_COMPLETE;
+			wake_up(&front->waitq);
+		}
+	}
+}
+
+/*
+ * free a writeback record
+ */
+static void afs_free_writeback(struct afs_writeback *wb)
+{
+	_enter("");
+	key_put(wb->key);
+	kfree(wb);
+}
+
+/*
+ * dispose of a reference to a writeback record
+ */
+void afs_put_writeback(struct afs_writeback *wb)
+{
+	struct afs_vnode *vnode = wb->vnode;
+
+	_enter("{%d}", wb->usage);
+
+	spin_lock(&vnode->writeback_lock);
+	if (--wb->usage == 0)
+		afs_unlink_writeback(wb);
+	else
+		wb = NULL;
+	spin_unlock(&vnode->writeback_lock);
+	if (wb)
+		afs_free_writeback(wb);
+}
+
+/*
+ * partly or wholly fill a page that's under preparation for writing
+ */
+static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
+			 loff_t pos, struct page *page)
+{
+	loff_t i_size;
+	int ret;
+	int len;
+
+	_enter(",,%llu", (unsigned long long)pos);
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (pos + PAGE_CACHE_SIZE > i_size)
+		len = i_size - pos;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
+	if (ret < 0) {
+		if (ret == -ENOENT) {
+			_debug("got NOENT from server"
+			       " - marking file deleted and stale");
+			set_bit(AFS_VNODE_DELETED, &vnode->flags);
+			ret = -ESTALE;
+		}
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * prepare to perform part of a write to a page
+ */
+int afs_write_begin(struct file *file, struct address_space *mapping,
+		    loff_t pos, unsigned len, unsigned flags,
+		    struct page **pagep, void **fsdata)
+{
+	struct afs_writeback *candidate, *wb;
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+	struct page *page;
+	struct key *key = file->private_data;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	int ret;
+
+	_enter("{%x:%u},{%lx},%u,%u",
+	       vnode->fid.vid, vnode->fid.vnode, index, from, to);
+
+	candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
+	if (!candidate)
+		return -ENOMEM;
+	candidate->vnode = vnode;
+	candidate->first = candidate->last = index;
+	candidate->offset_first = from;
+	candidate->to_last = to;
+	INIT_LIST_HEAD(&candidate->link);
+	candidate->usage = 1;
+	candidate->state = AFS_WBACK_PENDING;
+	init_waitqueue_head(&candidate->waitq);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page) {
+		kfree(candidate);
+		return -ENOMEM;
+	}
+	*pagep = page;
+	/* page won't leak in error case: it eventually gets cleaned off LRU */
+
+	if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) {
+		ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page);
+		if (ret < 0) {
+			kfree(candidate);
+			_leave(" = %d [prep]", ret);
+			return ret;
+		}
+		SetPageUptodate(page);
+	}
+
+try_again:
+	spin_lock(&vnode->writeback_lock);
+
+	/* see if this page is already pending a writeback under a suitable key
+	 * - if so we can just join onto that one */
+	wb = (struct afs_writeback *) page_private(page);
+	if (wb) {
+		if (wb->key == key && wb->state == AFS_WBACK_PENDING)
+			goto subsume_in_current_wb;
+		goto flush_conflicting_wb;
+	}
+
+	if (index > 0) {
+		/* see if we can find an already pending writeback that we can
+		 * append this page to */
+		list_for_each_entry(wb, &vnode->writebacks, link) {
+			if (wb->last == index - 1 && wb->key == key &&
+			    wb->state == AFS_WBACK_PENDING)
+				goto append_to_previous_wb;
+		}
+	}
+
+	list_add_tail(&candidate->link, &vnode->writebacks);
+	candidate->key = key_get(key);
+	spin_unlock(&vnode->writeback_lock);
+	SetPagePrivate(page);
+	set_page_private(page, (unsigned long) candidate);
+	_leave(" = 0 [new]");
+	return 0;
+
+subsume_in_current_wb:
+	_debug("subsume");
+	ASSERTRANGE(wb->first, <=, index, <=, wb->last);
+	if (index == wb->first && from < wb->offset_first)
+		wb->offset_first = from;
+	if (index == wb->last && to > wb->to_last)
+		wb->to_last = to;
+	spin_unlock(&vnode->writeback_lock);
+	kfree(candidate);
+	_leave(" = 0 [sub]");
+	return 0;
+
+append_to_previous_wb:
+	_debug("append into %lx-%lx", wb->first, wb->last);
+	wb->usage++;
+	wb->last++;
+	wb->to_last = to;
+	spin_unlock(&vnode->writeback_lock);
+	SetPagePrivate(page);
+	set_page_private(page, (unsigned long) wb);
+	kfree(candidate);
+	_leave(" = 0 [app]");
+	return 0;
+
+	/* the page is currently bound to another context, so if it's dirty we
+	 * need to flush it before we can use the new context */
+flush_conflicting_wb:
+	_debug("flush conflict");
+	if (wb->state == AFS_WBACK_PENDING)
+		wb->state = AFS_WBACK_CONFLICTING;
+	spin_unlock(&vnode->writeback_lock);
+	if (PageDirty(page)) {
+		ret = afs_write_back_from_locked_page(wb, page);
+		if (ret < 0) {
+			afs_put_writeback(candidate);
+			_leave(" = %d", ret);
+			return ret;
+		}
+	}
+
+	/* the page holds a ref on the writeback record */
+	afs_put_writeback(wb);
+	set_page_private(page, 0);
+	ClearPagePrivate(page);
+	goto try_again;
+}
+
+/*
+ * finalise part of a write to a page
+ */
+int afs_write_end(struct file *file, struct address_space *mapping,
+		  loff_t pos, unsigned len, unsigned copied,
+		  struct page *page, void *fsdata)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+	loff_t i_size, maybe_i_size;
+
+	_enter("{%x:%u},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
+
+	maybe_i_size = pos + copied;
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (maybe_i_size > i_size) {
+		spin_lock(&vnode->writeback_lock);
+		i_size = i_size_read(&vnode->vfs_inode);
+		if (maybe_i_size > i_size)
+			i_size_write(&vnode->vfs_inode, maybe_i_size);
+		spin_unlock(&vnode->writeback_lock);
+	}
+
+	set_page_dirty(page);
+	if (PageDirty(page))
+		_debug("dirtied");
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
+/*
+ * kill all the pages in the given range
+ */
+static void afs_kill_pages(struct afs_vnode *vnode, bool error,
+			   pgoff_t first, pgoff_t last)
+{
+	struct pagevec pv;
+	unsigned count, loop;
+
+	_enter("{%x:%u},%lx-%lx",
+	       vnode->fid.vid, vnode->fid.vnode, first, last);
+
+	pagevec_init(&pv, 0);
+
+	do {
+		_debug("kill %lx-%lx", first, last);
+
+		count = last - first + 1;
+		if (count > PAGEVEC_SIZE)
+			count = PAGEVEC_SIZE;
+		pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
+					      first, count, pv.pages);
+		ASSERTCMP(pv.nr, ==, count);
+
+		for (loop = 0; loop < count; loop++) {
+			ClearPageUptodate(pv.pages[loop]);
+			if (error)
+				SetPageError(pv.pages[loop]);
+			end_page_writeback(pv.pages[loop]);
+		}
+
+		__pagevec_release(&pv);
+	} while (first < last);
+
+	_leave("");
+}
+
+/*
+ * synchronously write back the locked page and any subsequent non-locked dirty
+ * pages also covered by the same writeback record
+ */
+static int afs_write_back_from_locked_page(struct afs_writeback *wb,
+					   struct page *primary_page)
+{
+	struct page *pages[8], *page;
+	unsigned long count;
+	unsigned n, offset, to;
+	pgoff_t start, first, last;
+	int loop, ret;
+
+	_enter(",%lx", primary_page->index);
+
+	count = 1;
+	if (!clear_page_dirty_for_io(primary_page))
+		BUG();
+	if (test_set_page_writeback(primary_page))
+		BUG();
+
+	/* find all consecutive lockable dirty pages, stopping when we find a
+	 * page that is not immediately lockable, is not dirty or is missing,
+	 * or we reach the end of the range */
+	start = primary_page->index;
+	if (start >= wb->last)
+		goto no_more;
+	start++;
+	do {
+		_debug("more %lx [%lx]", start, count);
+		n = wb->last - start + 1;
+		if (n > ARRAY_SIZE(pages))
+			n = ARRAY_SIZE(pages);
+		n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping,
+					  start, n, pages);
+		_debug("fgpc %u", n);
+		if (n == 0)
+			goto no_more;
+		if (pages[0]->index != start) {
+			do {
+				put_page(pages[--n]);
+			} while (n > 0);
+			goto no_more;
+		}
+
+		for (loop = 0; loop < n; loop++) {
+			page = pages[loop];
+			if (page->index > wb->last)
+				break;
+			if (!trylock_page(page))
+				break;
+			if (!PageDirty(page) ||
+			    page_private(page) != (unsigned long) wb) {
+				unlock_page(page);
+				break;
+			}
+			if (!clear_page_dirty_for_io(page))
+				BUG();
+			if (test_set_page_writeback(page))
+				BUG();
+			unlock_page(page);
+			put_page(page);
+		}
+		count += loop;
+		if (loop < n) {
+			for (; loop < n; loop++)
+				put_page(pages[loop]);
+			goto no_more;
+		}
+
+		start += loop;
+	} while (start <= wb->last && count < 65536);
+
+no_more:
+	/* we now have a contiguous set of dirty pages, each with writeback set
+	 * and the dirty mark cleared; the first page is locked and must remain
+	 * so, all the rest are unlocked */
+	first = primary_page->index;
+	last = first + count - 1;
+
+	offset = (first == wb->first) ? wb->offset_first : 0;
+	to = (last == wb->last) ? wb->to_last : PAGE_SIZE;
+
+	_debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
+
+	ret = afs_vnode_store_data(wb, first, last, offset, to);
+	if (ret < 0) {
+		switch (ret) {
+		case -EDQUOT:
+		case -ENOSPC:
+			set_bit(AS_ENOSPC,
+				&wb->vnode->vfs_inode.i_mapping->flags);
+			break;
+		case -EROFS:
+		case -EIO:
+		case -EREMOTEIO:
+		case -EFBIG:
+		case -ENOENT:
+		case -ENOMEDIUM:
+		case -ENXIO:
+			afs_kill_pages(wb->vnode, true, first, last);
+			set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags);
+			break;
+		case -EACCES:
+		case -EPERM:
+		case -ENOKEY:
+		case -EKEYEXPIRED:
+		case -EKEYREJECTED:
+		case -EKEYREVOKED:
+			afs_kill_pages(wb->vnode, false, first, last);
+			break;
+		default:
+			break;
+		}
+	} else {
+		ret = count;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * write a page back to the server
+ * - the caller locked the page for us
+ */
+int afs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct afs_writeback *wb;
+	int ret;
+
+	_enter("{%lx},", page->index);
+
+	wb = (struct afs_writeback *) page_private(page);
+	ASSERT(wb != NULL);
+
+	ret = afs_write_back_from_locked_page(wb, page);
+	unlock_page(page);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return 0;
+	}
+
+	wbc->nr_to_write -= ret;
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * write a region of pages back to the server
+ */
+static int afs_writepages_region(struct address_space *mapping,
+				 struct writeback_control *wbc,
+				 pgoff_t index, pgoff_t end, pgoff_t *_next)
+{
+	struct afs_writeback *wb;
+	struct page *page;
+	int ret, n;
+
+	_enter(",,%lx,%lx,", index, end);
+
+	do {
+		n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
+				       1, &page);
+		if (!n)
+			break;
+
+		_debug("wback %lx", page->index);
+
+		if (page->index > end) {
+			*_next = index;
+			page_cache_release(page);
+			_leave(" = 0 [%lx]", *_next);
+			return 0;
+		}
+
+		/* at this point we hold neither mapping->tree_lock nor lock on
+		 * the page itself: the page may be truncated or invalidated
+		 * (changing page->mapping to NULL), or even swizzled back from
+		 * swapper_space to tmpfs file mapping
+		 */
+		lock_page(page);
+
+		if (page->mapping != mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			continue;
+		}
+
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(page);
+
+		if (PageWriteback(page) || !PageDirty(page)) {
+			unlock_page(page);
+			continue;
+		}
+
+		wb = (struct afs_writeback *) page_private(page);
+		ASSERT(wb != NULL);
+
+		spin_lock(&wb->vnode->writeback_lock);
+		wb->state = AFS_WBACK_WRITING;
+		spin_unlock(&wb->vnode->writeback_lock);
+
+		ret = afs_write_back_from_locked_page(wb, page);
+		unlock_page(page);
+		page_cache_release(page);
+		if (ret < 0) {
+			_leave(" = %d", ret);
+			return ret;
+		}
+
+		wbc->nr_to_write -= ret;
+
+		cond_resched();
+	} while (index < end && wbc->nr_to_write > 0);
+
+	*_next = index;
+	_leave(" = 0 [%lx]", *_next);
+	return 0;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int afs_writepages(struct address_space *mapping,
+		   struct writeback_control *wbc)
+{
+	pgoff_t start, end, next;
+	int ret;
+
+	_enter("");
+
+	if (wbc->range_cyclic) {
+		start = mapping->writeback_index;
+		end = -1;
+		ret = afs_writepages_region(mapping, wbc, start, end, &next);
+		if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
+			ret = afs_writepages_region(mapping, wbc, 0, start,
+						    &next);
+		mapping->writeback_index = next;
+	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+		end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT);
+		ret = afs_writepages_region(mapping, wbc, 0, end, &next);
+		if (wbc->nr_to_write > 0)
+			mapping->writeback_index = next;
+	} else {
+		start = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		ret = afs_writepages_region(mapping, wbc, start, end, &next);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * completion of write to server
+ */
+void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
+{
+	struct afs_writeback *wb = call->wb;
+	struct pagevec pv;
+	unsigned count, loop;
+	pgoff_t first = call->first, last = call->last;
+	bool free_wb;
+
+	_enter("{%x:%u},{%lx-%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, first, last);
+
+	ASSERT(wb != NULL);
+
+	pagevec_init(&pv, 0);
+
+	do {
+		_debug("done %lx-%lx", first, last);
+
+		count = last - first + 1;
+		if (count > PAGEVEC_SIZE)
+			count = PAGEVEC_SIZE;
+		pv.nr = find_get_pages_contig(call->mapping, first, count,
+					      pv.pages);
+		ASSERTCMP(pv.nr, ==, count);
+
+		spin_lock(&vnode->writeback_lock);
+		for (loop = 0; loop < count; loop++) {
+			struct page *page = pv.pages[loop];
+			end_page_writeback(page);
+			if (page_private(page) == (unsigned long) wb) {
+				set_page_private(page, 0);
+				ClearPagePrivate(page);
+				wb->usage--;
+			}
+		}
+		free_wb = false;
+		if (wb->usage == 0) {
+			afs_unlink_writeback(wb);
+			free_wb = true;
+		}
+		spin_unlock(&vnode->writeback_lock);
+		first += count;
+		if (free_wb) {
+			afs_free_writeback(wb);
+			wb = NULL;
+		}
+
+		__pagevec_release(&pv);
+	} while (first <= last);
+
+	_leave("");
+}
+
+/*
+ * write to an AFS file
+ */
+ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+	ssize_t result;
+	size_t count = iov_iter_count(from);
+
+	_enter("{%x.%u},{%zu},",
+	       vnode->fid.vid, vnode->fid.vnode, count);
+
+	if (IS_SWAPFILE(&vnode->vfs_inode)) {
+		printk(KERN_INFO
+		       "AFS: Attempt to write to active swap file!\n");
+		return -EBUSY;
+	}
+
+	if (!count)
+		return 0;
+
+	result = generic_file_write_iter(iocb, from);
+	if (IS_ERR_VALUE(result)) {
+		_leave(" = %zd", result);
+		return result;
+	}
+
+	_leave(" = %zd", result);
+	return result;
+}
+
+/*
+ * flush the vnode to the fileserver
+ */
+int afs_writeback_all(struct afs_vnode *vnode)
+{
+	struct address_space *mapping = vnode->vfs_inode.i_mapping;
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_to_write	= LONG_MAX,
+		.range_cyclic	= 1,
+	};
+	int ret;
+
+	_enter("");
+
+	ret = mapping->a_ops->writepages(mapping, &wbc);
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * flush any dirty pages for this process, and check for write errors.
+ * - the return status from this call provides a reliable indication of
+ *   whether any write errors occurred for this process.
+ */
+int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file_inode(file);
+	struct afs_writeback *wb, *xwb;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	int ret;
+
+	_enter("{%x:%u},{n=%pD},%d",
+	       vnode->fid.vid, vnode->fid.vnode, file,
+	       datasync);
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
+	/* use a writeback record as a marker in the queue - when this reaches
+	 * the front of the queue, all the outstanding writes are either
+	 * completed or rejected */
+	wb = kzalloc(sizeof(*wb), GFP_KERNEL);
+	if (!wb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	wb->vnode = vnode;
+	wb->first = 0;
+	wb->last = -1;
+	wb->offset_first = 0;
+	wb->to_last = PAGE_SIZE;
+	wb->usage = 1;
+	wb->state = AFS_WBACK_SYNCING;
+	init_waitqueue_head(&wb->waitq);
+
+	spin_lock(&vnode->writeback_lock);
+	list_for_each_entry(xwb, &vnode->writebacks, link) {
+		if (xwb->state == AFS_WBACK_PENDING)
+			xwb->state = AFS_WBACK_CONFLICTING;
+	}
+	list_add_tail(&wb->link, &vnode->writebacks);
+	spin_unlock(&vnode->writeback_lock);
+
+	/* push all the outstanding writebacks to the server */
+	ret = afs_writeback_all(vnode);
+	if (ret < 0) {
+		afs_put_writeback(wb);
+		_leave(" = %d [wb]", ret);
+		goto out;
+	}
+
+	/* wait for the preceding writes to actually complete */
+	ret = wait_event_interruptible(wb->waitq,
+				       wb->state == AFS_WBACK_COMPLETE ||
+				       vnode->writebacks.next == &wb->link);
+	afs_put_writeback(wb);
+	_leave(" = %d", ret);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+	_enter("{{%x:%u}},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
+
+	/* wait for the page to be written to the cache before we allow it to
+	 * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+	_leave(" = 0");
+	return 0;
+}
diff --git a/kernel/fs/aio.c b/kernel/fs/aio.c
new file mode 100644
index 000000000..5a2380de4
--- /dev/null
+++ b/kernel/fs/aio.c
@@ -0,0 +1,1742 @@
+/*
+ *	An async IO implementation for Linux
+ *	Written by Benjamin LaHaise <bcrl@kvack.org>
+ *
+ *	Implements an efficient asynchronous io interface.
+ *
+ *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
+ *
+ *	See ../COPYING for licensing terms.
+ */
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/aio_abi.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/backing-dev.h>
+#include <linux/uio.h>
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_context.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/aio.h>
+#include <linux/highmem.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>
+#include <linux/percpu-refcount.h>
+#include <linux/mount.h>
+#include <linux/work-simple.h>
+
+#include <asm/kmap_types.h>
+#include <asm/uaccess.h>
+
+#include "internal.h"
+
+#define AIO_RING_MAGIC			0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES	1
+#define AIO_RING_INCOMPAT_FEATURES	0
+struct aio_ring {
+	unsigned	id;	/* kernel internal index number */
+	unsigned	nr;	/* number of io_events */
+	unsigned	head;	/* Written to by userland or under ring_lock
+				 * mutex by aio_read_events_ring(). */
+	unsigned	tail;
+
+	unsigned	magic;
+	unsigned	compat_features;
+	unsigned	incompat_features;
+	unsigned	header_length;	/* size of aio_ring */
+
+
+	struct io_event		io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES	8
+
+struct kioctx_table {
+	struct rcu_head	rcu;
+	unsigned	nr;
+	struct kioctx	*table[];
+};
+
+struct kioctx_cpu {
+	unsigned		reqs_available;
+};
+
+struct ctx_rq_wait {
+	struct completion comp;
+	atomic_t count;
+};
+
+struct kioctx {
+	struct percpu_ref	users;
+	atomic_t		dead;
+
+	struct percpu_ref	reqs;
+
+	unsigned long		user_id;
+
+	struct __percpu kioctx_cpu *cpu;
+
+	/*
+	 * For percpu reqs_available, number of slots we move to/from global
+	 * counter at a time:
+	 */
+	unsigned		req_batch;
+	/*
+	 * This is what userspace passed to io_setup(), it's not used for
+	 * anything but counting against the global max_reqs quota.
+	 *
+	 * The real limit is nr_events - 1, which will be larger (see
+	 * aio_setup_ring())
+	 */
+	unsigned		max_reqs;
+
+	/* Size of ringbuffer, in units of struct io_event */
+	unsigned		nr_events;
+
+	unsigned long		mmap_base;
+	unsigned long		mmap_size;
+
+	struct page		**ring_pages;
+	long			nr_pages;
+
+	struct swork_event	free_work;
+
+	/*
+	 * signals when all in-flight requests are done
+	 */
+	struct ctx_rq_wait	*rq_wait;
+
+	struct {
+		/*
+		 * This counts the number of available slots in the ringbuffer,
+		 * so we avoid overflowing it: it's decremented (if positive)
+		 * when allocating a kiocb and incremented when the resulting
+		 * io_event is pulled off the ringbuffer.
+		 *
+		 * We batch accesses to it with a percpu version.
+		 */
+		atomic_t	reqs_available;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		spinlock_t	ctx_lock;
+		struct list_head active_reqs;	/* used for cancellation */
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		struct mutex	ring_lock;
+		wait_queue_head_t wait;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		unsigned	tail;
+		unsigned	completed_events;
+		spinlock_t	completion_lock;
+	} ____cacheline_aligned_in_smp;
+
+	struct page		*internal_pages[AIO_RING_PAGES];
+	struct file		*aio_ring_file;
+
+	unsigned		id;
+};
+
+/*
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
+ */
+#define KIOCB_CANCELLED		((void *) (~0ULL))
+
+struct aio_kiocb {
+	struct kiocb		common;
+
+	struct kioctx		*ki_ctx;
+	kiocb_cancel_fn		*ki_cancel;
+
+	struct iocb __user	*ki_user_iocb;	/* user's aiocb */
+	__u64			ki_user_data;	/* user's data for completion */
+
+	struct list_head	ki_list;	/* the aio core uses this
+						 * for cancellation */
+
+	/*
+	 * If the aio_resfd field of the userspace iocb is not zero,
+	 * this is the underlying eventfd context to deliver events to.
+	 */
+	struct eventfd_ctx	*ki_eventfd;
+};
+
+/*------ sysctl variables----*/
+static DEFINE_SPINLOCK(aio_nr_lock);
+unsigned long aio_nr;		/* current system wide number of aio requests */
+unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
+/*----end sysctl variables---*/
+
+static struct kmem_cache	*kiocb_cachep;
+static struct kmem_cache	*kioctx_cachep;
+
+static struct vfsmount *aio_mnt;
+
+static const struct file_operations aio_ring_fops;
+static const struct address_space_operations aio_ctx_aops;
+
+static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
+{
+	struct qstr this = QSTR_INIT("[aio]", 5);
+	struct file *file;
+	struct path path;
+	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	inode->i_mapping->a_ops = &aio_ctx_aops;
+	inode->i_mapping->private_data = ctx;
+	inode->i_size = PAGE_SIZE * nr_pages;
+
+	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
+	if (!path.dentry) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	path.mnt = mntget(aio_mnt);
+
+	d_instantiate(path.dentry, inode);
+	file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
+	if (IS_ERR(file)) {
+		path_put(&path);
+		return file;
+	}
+
+	file->f_flags = O_RDWR;
+	return file;
+}
+
+static struct dentry *aio_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
+{
+	static const struct dentry_operations ops = {
+		.d_dname	= simple_dname,
+	};
+	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
+}
+
+/* aio_setup
+ *	Creates the slab caches used by the aio routines, panic on
+ *	failure as this is done early during the boot sequence.
+ */
+static int __init aio_setup(void)
+{
+	static struct file_system_type aio_fs = {
+		.name		= "aio",
+		.mount		= aio_mount,
+		.kill_sb	= kill_anon_super,
+	};
+	BUG_ON(swork_get());
+	aio_mnt = kern_mount(&aio_fs);
+	if (IS_ERR(aio_mnt))
+		panic("Failed to create aio fs mount.");
+
+	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+
+	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
+
+	return 0;
+}
+__initcall(aio_setup);
+
+static void put_aio_ring_file(struct kioctx *ctx)
+{
+	struct file *aio_ring_file = ctx->aio_ring_file;
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+
+		/* Prevent further access to the kioctx from migratepages */
+		spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
+		aio_ring_file->f_inode->i_mapping->private_data = NULL;
+		ctx->aio_ring_file = NULL;
+		spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
+
+		fput(aio_ring_file);
+	}
+}
+
+static void aio_free_ring(struct kioctx *ctx)
+{
+	int i;
+
+	/* Disconnect the kiotx from the ring file.  This prevents future
+	 * accesses to the kioctx from page migration.
+	 */
+	put_aio_ring_file(ctx);
+
+	for (i = 0; i < ctx->nr_pages; i++) {
+		struct page *page;
+		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+				page_count(ctx->ring_pages[i]));
+		page = ctx->ring_pages[i];
+		if (!page)
+			continue;
+		ctx->ring_pages[i] = NULL;
+		put_page(page);
+	}
+
+	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
+		kfree(ctx->ring_pages);
+		ctx->ring_pages = NULL;
+	}
+}
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_DONTEXPAND;
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct kioctx_table *table;
+	int i, res = -EINVAL;
+
+	spin_lock(&mm->ioctx_lock);
+	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
+	for (i = 0; i < table->nr; i++) {
+		struct kioctx *ctx;
+
+		ctx = table->table[i];
+		if (ctx && ctx->aio_ring_file == file) {
+			if (!atomic_read(&ctx->dead)) {
+				ctx->user_id = ctx->mmap_base = vma->vm_start;
+				res = 0;
+			}
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+	spin_unlock(&mm->ioctx_lock);
+	return res;
+}
+
+static const struct file_operations aio_ring_fops = {
+	.mmap = aio_ring_mmap,
+	.mremap = aio_ring_remap,
+};
+
+#if IS_ENABLED(CONFIG_MIGRATION)
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+			struct page *old, enum migrate_mode mode)
+{
+	struct kioctx *ctx;
+	unsigned long flags;
+	pgoff_t idx;
+	int rc;
+
+	rc = 0;
+
+	/* mapping->private_lock here protects against the kioctx teardown.  */
+	spin_lock(&mapping->private_lock);
+	ctx = mapping->private_data;
+	if (!ctx) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* The ring_lock mutex.  The prevents aio_read_events() from writing
+	 * to the ring's head, and prevents page migration from mucking in
+	 * a partially initialized kiotx.
+	 */
+	if (!mutex_trylock(&ctx->ring_lock)) {
+		rc = -EAGAIN;
+		goto out;
+	}
+
+	idx = old->index;
+	if (idx < (pgoff_t)ctx->nr_pages) {
+		/* Make sure the old page hasn't already been changed */
+		if (ctx->ring_pages[idx] != old)
+			rc = -EAGAIN;
+	} else
+		rc = -EINVAL;
+
+	if (rc != 0)
+		goto out_unlock;
+
+	/* Writeback must be complete */
+	BUG_ON(PageWriteback(old));
+	get_page(new);
+
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		put_page(new);
+		goto out_unlock;
+	}
+
+	/* Take completion_lock to prevent other writes to the ring buffer
+	 * while the old page is copied to the new.  This prevents new
+	 * events from being lost.
+	 */
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	BUG_ON(ctx->ring_pages[idx] != old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	/* The old page is no longer accessible. */
+	put_page(old);
+
+out_unlock:
+	mutex_unlock(&ctx->ring_lock);
+out:
+	spin_unlock(&mapping->private_lock);
+	return rc;
+}
+#endif
+
+static const struct address_space_operations aio_ctx_aops = {
+	.set_page_dirty = __set_page_dirty_no_writeback,
+#if IS_ENABLED(CONFIG_MIGRATION)
+	.migratepage	= aio_migratepage,
+#endif
+};
+
+static int aio_setup_ring(struct kioctx *ctx)
+{
+	struct aio_ring *ring;
+	unsigned nr_events = ctx->max_reqs;
+	struct mm_struct *mm = current->mm;
+	unsigned long size, unused;
+	int nr_pages;
+	int i;
+	struct file *file;
+
+	/* Compensate for the ring buffer's head/tail overlap entry */
+	nr_events += 2;	/* 1 is required, 2 for good luck */
+
+	size = sizeof(struct aio_ring);
+	size += sizeof(struct io_event) * nr_events;
+
+	nr_pages = PFN_UP(size);
+	if (nr_pages < 0)
+		return -EINVAL;
+
+	file = aio_private_file(ctx, nr_pages);
+	if (IS_ERR(file)) {
+		ctx->aio_ring_file = NULL;
+		return -ENOMEM;
+	}
+
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
+
+	ctx->ring_pages = ctx->internal_pages;
+	if (nr_pages > AIO_RING_PAGES) {
+		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!ctx->ring_pages) {
+			put_aio_ring_file(ctx);
+			return -ENOMEM;
+		}
+	}
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+		page = find_or_create_page(file->f_inode->i_mapping,
+					   i, GFP_HIGHUSER | __GFP_ZERO);
+		if (!page)
+			break;
+		pr_debug("pid(%d) page[%d]->count=%d\n",
+			 current->pid, i, page_count(page));
+		SetPageUptodate(page);
+		unlock_page(page);
+
+		ctx->ring_pages[i] = page;
+	}
+	ctx->nr_pages = i;
+
+	if (unlikely(i != nr_pages)) {
+		aio_free_ring(ctx);
+		return -ENOMEM;
+	}
+
+	ctx->mmap_size = nr_pages * PAGE_SIZE;
+	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+
+	down_write(&mm->mmap_sem);
+	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+				       PROT_READ | PROT_WRITE,
+				       MAP_SHARED, 0, &unused);
+	up_write(&mm->mmap_sem);
+	if (IS_ERR((void *)ctx->mmap_base)) {
+		ctx->mmap_size = 0;
+		aio_free_ring(ctx);
+		return -ENOMEM;
+	}
+
+	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+
+	ctx->user_id = ctx->mmap_base;
+	ctx->nr_events = nr_events; /* trusted copy */
+
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	ring->nr = nr_events;	/* user copy */
+	ring->id = ~0U;
+	ring->head = ring->tail = 0;
+	ring->magic = AIO_RING_MAGIC;
+	ring->compat_features = AIO_RING_COMPAT_FEATURES;
+	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
+	ring->header_length = sizeof(struct aio_ring);
+	kunmap_atomic(ring);
+	flush_dcache_page(ctx->ring_pages[0]);
+
+	return 0;
+}
+
+#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
+#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
+#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
+
+void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
+{
+	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
+	struct kioctx *ctx = req->ki_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+
+	if (!req->ki_list.next)
+		list_add(&req->ki_list, &ctx->active_reqs);
+
+	req->ki_cancel = cancel;
+
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+EXPORT_SYMBOL(kiocb_set_cancel_fn);
+
+static int kiocb_cancel(struct aio_kiocb *kiocb)
+{
+	kiocb_cancel_fn *old, *cancel;
+
+	/*
+	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
+	 * actually has a cancel function, hence the cmpxchg()
+	 */
+
+	cancel = ACCESS_ONCE(kiocb->ki_cancel);
+	do {
+		if (!cancel || cancel == KIOCB_CANCELLED)
+			return -EINVAL;
+
+		old = cancel;
+		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
+	} while (cancel != old);
+
+	return cancel(&kiocb->common);
+}
+
+static void free_ioctx(struct swork_event *sev)
+{
+	struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
+
+	pr_debug("freeing %p\n", ctx);
+
+	aio_free_ring(ctx);
+	free_percpu(ctx->cpu);
+	percpu_ref_exit(&ctx->reqs);
+	percpu_ref_exit(&ctx->users);
+	kmem_cache_free(kioctx_cachep, ctx);
+}
+
+static void free_ioctx_reqs(struct percpu_ref *ref)
+{
+	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+
+	/* At this point we know that there are no any in-flight requests */
+	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
+		complete(&ctx->rq_wait->comp);
+
+	INIT_SWORK(&ctx->free_work, free_ioctx);
+	swork_queue(&ctx->free_work);
+}
+
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
+ */
+static void free_ioctx_users_work(struct swork_event *sev)
+{
+	struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
+	struct aio_kiocb *req;
+
+	spin_lock_irq(&ctx->ctx_lock);
+
+	while (!list_empty(&ctx->active_reqs)) {
+		req = list_first_entry(&ctx->active_reqs,
+				       struct aio_kiocb, ki_list);
+
+		list_del_init(&req->ki_list);
+		kiocb_cancel(req);
+	}
+
+	spin_unlock_irq(&ctx->ctx_lock);
+
+	percpu_ref_kill(&ctx->reqs);
+	percpu_ref_put(&ctx->reqs);
+}
+
+static void free_ioctx_users(struct percpu_ref *ref)
+{
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+
+	INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
+	swork_queue(&ctx->free_work);
+}
+
+static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
+{
+	unsigned i, new_nr;
+	struct kioctx_table *table, *old;
+	struct aio_ring *ring;
+
+	spin_lock(&mm->ioctx_lock);
+	table = rcu_dereference_raw(mm->ioctx_table);
+
+	while (1) {
+		if (table)
+			for (i = 0; i < table->nr; i++)
+				if (!table->table[i]) {
+					ctx->id = i;
+					table->table[i] = ctx;
+					spin_unlock(&mm->ioctx_lock);
+
+					/* While kioctx setup is in progress,
+					 * we are protected from page migration
+					 * changes ring_pages by ->ring_lock.
+					 */
+					ring = kmap_atomic(ctx->ring_pages[0]);
+					ring->id = ctx->id;
+					kunmap_atomic(ring);
+					return 0;
+				}
+
+		new_nr = (table ? table->nr : 1) * 4;
+		spin_unlock(&mm->ioctx_lock);
+
+		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
+				new_nr, GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+
+		table->nr = new_nr;
+
+		spin_lock(&mm->ioctx_lock);
+		old = rcu_dereference_raw(mm->ioctx_table);
+
+		if (!old) {
+			rcu_assign_pointer(mm->ioctx_table, table);
+		} else if (table->nr > old->nr) {
+			memcpy(table->table, old->table,
+			       old->nr * sizeof(struct kioctx *));
+
+			rcu_assign_pointer(mm->ioctx_table, table);
+			kfree_rcu(old, rcu);
+		} else {
+			kfree(table);
+			table = old;
+		}
+	}
+}
+
+static void aio_nr_sub(unsigned nr)
+{
+	spin_lock(&aio_nr_lock);
+	if (WARN_ON(aio_nr - nr > aio_nr))
+		aio_nr = 0;
+	else
+		aio_nr -= nr;
+	spin_unlock(&aio_nr_lock);
+}
+
+/* ioctx_alloc
+ *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
+ */
+static struct kioctx *ioctx_alloc(unsigned nr_events)
+{
+	struct mm_struct *mm = current->mm;
+	struct kioctx *ctx;
+	int err = -ENOMEM;
+
+	/*
+	 * We keep track of the number of available ringbuffer slots, to prevent
+	 * overflow (reqs_available), and we also use percpu counters for this.
+	 *
+	 * So since up to half the slots might be on other cpu's percpu counters
+	 * and unavailable, double nr_events so userspace sees what they
+	 * expected: additionally, we move req_batch slots to/from percpu
+	 * counters at a time, so make sure that isn't 0:
+	 */
+	nr_events = max(nr_events, num_possible_cpus() * 4);
+	nr_events *= 2;
+
+	/* Prevent overflows */
+	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
+		pr_debug("ENOMEM: nr_events too high\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
+		return ERR_PTR(-EAGAIN);
+
+	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	ctx->max_reqs = nr_events;
+
+	spin_lock_init(&ctx->ctx_lock);
+	spin_lock_init(&ctx->completion_lock);
+	mutex_init(&ctx->ring_lock);
+	/* Protect against page migration throughout kiotx setup by keeping
+	 * the ring_lock mutex held until setup is complete. */
+	mutex_lock(&ctx->ring_lock);
+	init_waitqueue_head(&ctx->wait);
+
+	INIT_LIST_HEAD(&ctx->active_reqs);
+
+	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
+		goto err;
+
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
+		goto err;
+
+	ctx->cpu = alloc_percpu(struct kioctx_cpu);
+	if (!ctx->cpu)
+		goto err;
+
+	err = aio_setup_ring(ctx);
+	if (err < 0)
+		goto err;
+
+	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
+	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
+	if (ctx->req_batch < 1)
+		ctx->req_batch = 1;
+
+	/* limit the number of system wide aios */
+	spin_lock(&aio_nr_lock);
+	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
+	    aio_nr + nr_events < aio_nr) {
+		spin_unlock(&aio_nr_lock);
+		err = -EAGAIN;
+		goto err_ctx;
+	}
+	aio_nr += ctx->max_reqs;
+	spin_unlock(&aio_nr_lock);
+
+	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
+	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
+
+	err = ioctx_add_table(ctx, mm);
+	if (err)
+		goto err_cleanup;
+
+	/* Release the ring_lock mutex now that all setup is complete. */
+	mutex_unlock(&ctx->ring_lock);
+
+	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+		 ctx, ctx->user_id, mm, ctx->nr_events);
+	return ctx;
+
+err_cleanup:
+	aio_nr_sub(ctx->max_reqs);
+err_ctx:
+	atomic_set(&ctx->dead, 1);
+	if (ctx->mmap_size)
+		vm_munmap(ctx->mmap_base, ctx->mmap_size);
+	aio_free_ring(ctx);
+err:
+	mutex_unlock(&ctx->ring_lock);
+	free_percpu(ctx->cpu);
+	percpu_ref_exit(&ctx->reqs);
+	percpu_ref_exit(&ctx->users);
+	kmem_cache_free(kioctx_cachep, ctx);
+	pr_debug("error allocating ioctx %d\n", err);
+	return ERR_PTR(err);
+}
+
+/* kill_ioctx
+ *	Cancels all outstanding aio requests on an aio context.  Used
+ *	when the processes owning a context have all exited to encourage
+ *	the rapid destruction of the kioctx.
+ */
+static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+		      struct ctx_rq_wait *wait)
+{
+	struct kioctx_table *table;
+
+	spin_lock(&mm->ioctx_lock);
+	if (atomic_xchg(&ctx->dead, 1)) {
+		spin_unlock(&mm->ioctx_lock);
+		return -EINVAL;
+	}
+
+	table = rcu_dereference_raw(mm->ioctx_table);
+	WARN_ON(ctx != table->table[ctx->id]);
+	table->table[ctx->id] = NULL;
+	spin_unlock(&mm->ioctx_lock);
+
+	/* percpu_ref_kill() will do the necessary call_rcu() */
+	wake_up_all(&ctx->wait);
+
+	/*
+	 * It'd be more correct to do this in free_ioctx(), after all
+	 * the outstanding kiocbs have finished - but by then io_destroy
+	 * has already returned, so io_setup() could potentially return
+	 * -EAGAIN with no ioctxs actually in use (as far as userspace
+	 *  could tell).
+	 */
+	aio_nr_sub(ctx->max_reqs);
+
+	if (ctx->mmap_size)
+		vm_munmap(ctx->mmap_base, ctx->mmap_size);
+
+	ctx->rq_wait = wait;
+	percpu_ref_kill(&ctx->users);
+	return 0;
+}
+
+/*
+ * exit_aio: called when the last user of mm goes away.  At this point, there is
+ * no way for any new requests to be submited or any of the io_* syscalls to be
+ * called on the context.
+ *
+ * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
+ * them.
+ */
+void exit_aio(struct mm_struct *mm)
+{
+	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	struct ctx_rq_wait wait;
+	int i, skipped;
+
+	if (!table)
+		return;
+
+	atomic_set(&wait.count, table->nr);
+	init_completion(&wait.comp);
+
+	skipped = 0;
+	for (i = 0; i < table->nr; ++i) {
+		struct kioctx *ctx = table->table[i];
+
+		if (!ctx) {
+			skipped++;
+			continue;
+		}
+
+		/*
+		 * We don't need to bother with munmap() here - exit_mmap(mm)
+		 * is coming and it'll unmap everything. And we simply can't,
+		 * this is not necessarily our ->mm.
+		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
+		 * that it needs to unmap the area, just set it to 0.
+		 */
+		ctx->mmap_size = 0;
+		kill_ioctx(mm, ctx, &wait);
+	}
+
+	if (!atomic_sub_and_test(skipped, &wait.count)) {
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&wait.comp);
+	}
+
+	RCU_INIT_POINTER(mm->ioctx_table, NULL);
+	kfree(table);
+}
+
+static void put_reqs_available(struct kioctx *ctx, unsigned nr)
+{
+	struct kioctx_cpu *kcpu;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
+	kcpu->reqs_available += nr;
+
+	while (kcpu->reqs_available >= ctx->req_batch * 2) {
+		kcpu->reqs_available -= ctx->req_batch;
+		atomic_add(ctx->req_batch, &ctx->reqs_available);
+	}
+
+	local_irq_restore(flags);
+}
+
+static bool get_reqs_available(struct kioctx *ctx)
+{
+	struct kioctx_cpu *kcpu;
+	bool ret = false;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
+	if (!kcpu->reqs_available) {
+		int old, avail = atomic_read(&ctx->reqs_available);
+
+		do {
+			if (avail < ctx->req_batch)
+				goto out;
+
+			old = avail;
+			avail = atomic_cmpxchg(&ctx->reqs_available,
+					       avail, avail - ctx->req_batch);
+		} while (avail != old);
+
+		kcpu->reqs_available += ctx->req_batch;
+	}
+
+	ret = true;
+	kcpu->reqs_available--;
+out:
+	local_irq_restore(flags);
+	return ret;
+}
+
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring.  This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case).  It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+                                  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid.  The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock.  Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+/* aio_get_req
+ *	Allocate a slot for an aio request.
+ * Returns NULL if no requests are free.
+ */
+static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
+{
+	struct aio_kiocb *req;
+
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
+
+	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+	if (unlikely(!req))
+		goto out_put;
+
+	percpu_ref_get(&ctx->reqs);
+
+	req->ki_ctx = ctx;
+	return req;
+out_put:
+	put_reqs_available(ctx, 1);
+	return NULL;
+}
+
+static void kiocb_free(struct aio_kiocb *req)
+{
+	if (req->common.ki_filp)
+		fput(req->common.ki_filp);
+	if (req->ki_eventfd != NULL)
+		eventfd_ctx_put(req->ki_eventfd);
+	kmem_cache_free(kiocb_cachep, req);
+}
+
+static struct kioctx *lookup_ioctx(unsigned long ctx_id)
+{
+	struct aio_ring __user *ring  = (void __user *)ctx_id;
+	struct mm_struct *mm = current->mm;
+	struct kioctx *ctx, *ret = NULL;
+	struct kioctx_table *table;
+	unsigned id;
+
+	if (get_user(id, &ring->id))
+		return NULL;
+
+	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
+
+	if (!table || id >= table->nr)
+		goto out;
+
+	ctx = table->table[id];
+	if (ctx && ctx->user_id == ctx_id) {
+		percpu_ref_get(&ctx->users);
+		ret = ctx;
+	}
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+/* aio_complete
+ *	Called when the io request on the given iocb is complete.
+ */
+static void aio_complete(struct kiocb *kiocb, long res, long res2)
+{
+	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
+	struct kioctx	*ctx = iocb->ki_ctx;
+	struct aio_ring	*ring;
+	struct io_event	*ev_page, *event;
+	unsigned tail, pos, head;
+	unsigned long	flags;
+
+	/*
+	 * Special case handling for sync iocbs:
+	 *  - events go directly into the iocb for fast handling
+	 *  - the sync task with the iocb in its stack holds the single iocb
+	 *    ref, no other paths have a way to get another ref
+	 *  - the sync task helpfully left a reference to itself in the iocb
+	 */
+	BUG_ON(is_sync_kiocb(kiocb));
+
+	if (iocb->ki_list.next) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->ctx_lock, flags);
+		list_del(&iocb->ki_list);
+		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+	}
+
+	/*
+	 * Add a completion event to the ring buffer. Must be done holding
+	 * ctx->completion_lock to prevent other code from messing with the tail
+	 * pointer since we might be called from irq context.
+	 */
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+
+	tail = ctx->tail;
+	pos = tail + AIO_EVENTS_OFFSET;
+
+	if (++tail >= ctx->nr_events)
+		tail = 0;
+
+	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+
+	event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
+	event->data = iocb->ki_user_data;
+	event->res = res;
+	event->res2 = res2;
+
+	kunmap_atomic(ev_page);
+	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+
+	pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+		 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
+		 res, res2);
+
+	/* after flagging the request as done, we
+	 * must never even look at it again
+	 */
+	smp_wmb();	/* make event visible before updating tail */
+
+	ctx->tail = tail;
+
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
+	ring->tail = tail;
+	kunmap_atomic(ring);
+	flush_dcache_page(ctx->ring_pages[0]);
+
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	pr_debug("added to ring %p at [%u]\n", iocb, tail);
+
+	/*
+	 * Check if the user asked us to deliver the result through an
+	 * eventfd. The eventfd_signal() function is safe to be called
+	 * from IRQ context.
+	 */
+	if (iocb->ki_eventfd != NULL)
+		eventfd_signal(iocb->ki_eventfd, 1);
+
+	/* everything turned out well, dispose of the aiocb. */
+	kiocb_free(iocb);
+
+	/*
+	 * We have to order our ring_info tail store above and test
+	 * of the wait list below outside the wait lock.  This is
+	 * like in wake_up_bit() where clearing a bit has to be
+	 * ordered with the unlocked test.
+	 */
+	smp_mb();
+
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+
+	percpu_ref_put(&ctx->reqs);
+}
+
+/* aio_read_events_ring
+ *	Pull an event off of the ioctx's event ring.  Returns the number of
+ *	events fetched
+ */
+static long aio_read_events_ring(struct kioctx *ctx,
+				 struct io_event __user *event, long nr)
+{
+	struct aio_ring *ring;
+	unsigned head, tail, pos;
+	long ret = 0;
+	int copy_ret;
+
+	/*
+	 * The mutex can block and wake us up and that will cause
+	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
+	 * and repeat. This should be rare enough that it doesn't cause
+	 * peformance issues. See the comment in read_events() for more detail.
+	 */
+	sched_annotate_sleep();
+	mutex_lock(&ctx->ring_lock);
+
+	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
+	tail = ring->tail;
+	kunmap_atomic(ring);
+
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
+	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
+
+	if (head == tail)
+		goto out;
+
+	head %= ctx->nr_events;
+	tail %= ctx->nr_events;
+
+	while (ret < nr) {
+		long avail;
+		struct io_event *ev;
+		struct page *page;
+
+		avail = (head <= tail ?  tail : ctx->nr_events) - head;
+		if (head == tail)
+			break;
+
+		avail = min(avail, nr - ret);
+		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
+
+		pos = head + AIO_EVENTS_OFFSET;
+		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+		pos %= AIO_EVENTS_PER_PAGE;
+
+		ev = kmap(page);
+		copy_ret = copy_to_user(event + ret, ev + pos,
+					sizeof(*ev) * avail);
+		kunmap(page);
+
+		if (unlikely(copy_ret)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		ret += avail;
+		head += avail;
+		head %= ctx->nr_events;
+	}
+
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	ring->head = head;
+	kunmap_atomic(ring);
+	flush_dcache_page(ctx->ring_pages[0]);
+
+	pr_debug("%li  h%u t%u\n", ret, head, tail);
+out:
+	mutex_unlock(&ctx->ring_lock);
+
+	return ret;
+}
+
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+			    struct io_event __user *event, long *i)
+{
+	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
+
+	if (ret > 0)
+		*i += ret;
+
+	if (unlikely(atomic_read(&ctx->dead)))
+		ret = -EINVAL;
+
+	if (!*i)
+		*i = ret;
+
+	return ret < 0 || *i >= min_nr;
+}
+
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
+			struct io_event __user *event,
+			struct timespec __user *timeout)
+{
+	ktime_t until = { .tv64 = KTIME_MAX };
+	long ret = 0;
+
+	if (timeout) {
+		struct timespec	ts;
+
+		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+			return -EFAULT;
+
+		until = timespec_to_ktime(ts);
+	}
+
+	/*
+	 * Note that aio_read_events() is being called as the conditional - i.e.
+	 * we're calling it after prepare_to_wait() has set task state to
+	 * TASK_INTERRUPTIBLE.
+	 *
+	 * But aio_read_events() can block, and if it blocks it's going to flip
+	 * the task state back to TASK_RUNNING.
+	 *
+	 * This should be ok, provided it doesn't flip the state back to
+	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
+	 * will only happen if the mutex_lock() call blocks, and we then find
+	 * the ringbuffer empty. So in practice we should be ok, but it's
+	 * something to be aware of when touching this code.
+	 */
+	if (until.tv64 == 0)
+		aio_read_events(ctx, min_nr, nr, event, &ret);
+	else
+		wait_event_interruptible_hrtimeout(ctx->wait,
+				aio_read_events(ctx, min_nr, nr, event, &ret),
+				until);
+
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
+
+	return ret;
+}
+
+/* sys_io_setup:
+ *	Create an aio_context capable of receiving at least nr_events.
+ *	ctxp must not point to an aio_context that already exists, and
+ *	must be initialized to 0 prior to the call.  On successful
+ *	creation of the aio_context, *ctxp is filled in with the resulting 
+ *	handle.  May fail with -EINVAL if *ctxp is not initialized,
+ *	if the specified nr_events exceeds internal limits.  May fail 
+ *	with -EAGAIN if the specified nr_events exceeds the user's limit 
+ *	of available events.  May fail with -ENOMEM if insufficient kernel
+ *	resources are available.  May fail with -EFAULT if an invalid
+ *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
+ *	implemented.
+ */
+SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
+{
+	struct kioctx *ioctx = NULL;
+	unsigned long ctx;
+	long ret;
+
+	ret = get_user(ctx, ctxp);
+	if (unlikely(ret))
+		goto out;
+
+	ret = -EINVAL;
+	if (unlikely(ctx || nr_events == 0)) {
+		pr_debug("EINVAL: ctx %lu nr_events %u\n",
+		         ctx, nr_events);
+		goto out;
+	}
+
+	ioctx = ioctx_alloc(nr_events);
+	ret = PTR_ERR(ioctx);
+	if (!IS_ERR(ioctx)) {
+		ret = put_user(ioctx->user_id, ctxp);
+		if (ret)
+			kill_ioctx(current->mm, ioctx, NULL);
+		percpu_ref_put(&ioctx->users);
+	}
+
+out:
+	return ret;
+}
+
+/* sys_io_destroy:
+ *	Destroy the aio_context specified.  May cancel any outstanding 
+ *	AIOs and block on completion.  Will fail with -ENOSYS if not
+ *	implemented.  May fail with -EINVAL if the context pointed to
+ *	is invalid.
+ */
+SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
+{
+	struct kioctx *ioctx = lookup_ioctx(ctx);
+	if (likely(NULL != ioctx)) {
+		struct ctx_rq_wait wait;
+		int ret;
+
+		init_completion(&wait.comp);
+		atomic_set(&wait.count, 1);
+
+		/* Pass requests_done to kill_ioctx() where it can be set
+		 * in a thread-safe way. If we try to set it here then we have
+		 * a race condition if two io_destroy() called simultaneously.
+		 */
+		ret = kill_ioctx(current->mm, ioctx, &wait);
+		percpu_ref_put(&ioctx->users);
+
+		/* Wait until all IO for the context are done. Otherwise kernel
+		 * keep using user-space buffers even if user thinks the context
+		 * is destroyed.
+		 */
+		if (!ret)
+			wait_for_completion(&wait.comp);
+
+		return ret;
+	}
+	pr_debug("EINVAL: invalid context id\n");
+	return -EINVAL;
+}
+
+typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
+
+static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len,
+				 struct iovec **iovec,
+				 bool compat,
+				 struct iov_iter *iter)
+{
+#ifdef CONFIG_COMPAT
+	if (compat)
+		return compat_import_iovec(rw,
+				(struct compat_iovec __user *)buf,
+				len, UIO_FASTIOV, iovec, iter);
+#endif
+	return import_iovec(rw, (struct iovec __user *)buf,
+				len, UIO_FASTIOV, iovec, iter);
+}
+
+/*
+ * aio_run_iocb:
+ *	Performs the initial checks and io submission.
+ */
+static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
+			    char __user *buf, size_t len, bool compat)
+{
+	struct file *file = req->ki_filp;
+	ssize_t ret;
+	int rw;
+	fmode_t mode;
+	rw_iter_op *iter_op;
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct iov_iter iter;
+
+	switch (opcode) {
+	case IOCB_CMD_PREAD:
+	case IOCB_CMD_PREADV:
+		mode	= FMODE_READ;
+		rw	= READ;
+		iter_op	= file->f_op->read_iter;
+		goto rw_common;
+
+	case IOCB_CMD_PWRITE:
+	case IOCB_CMD_PWRITEV:
+		mode	= FMODE_WRITE;
+		rw	= WRITE;
+		iter_op	= file->f_op->write_iter;
+		goto rw_common;
+rw_common:
+		if (unlikely(!(file->f_mode & mode)))
+			return -EBADF;
+
+		if (!iter_op)
+			return -EINVAL;
+
+		if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
+			ret = aio_setup_vectored_rw(rw, buf, len,
+						&iovec, compat, &iter);
+		else {
+			ret = import_single_range(rw, buf, len, iovec, &iter);
+			iovec = NULL;
+		}
+		if (!ret)
+			ret = rw_verify_area(rw, file, &req->ki_pos,
+					     iov_iter_count(&iter));
+		if (ret < 0) {
+			kfree(iovec);
+			return ret;
+		}
+
+		len = ret;
+
+		if (rw == WRITE)
+			file_start_write(file);
+
+		ret = iter_op(req, &iter);
+
+		if (rw == WRITE)
+			file_end_write(file);
+		kfree(iovec);
+		break;
+
+	case IOCB_CMD_FDSYNC:
+		if (!file->f_op->aio_fsync)
+			return -EINVAL;
+
+		ret = file->f_op->aio_fsync(req, 1);
+		break;
+
+	case IOCB_CMD_FSYNC:
+		if (!file->f_op->aio_fsync)
+			return -EINVAL;
+
+		ret = file->f_op->aio_fsync(req, 0);
+		break;
+
+	default:
+		pr_debug("EINVAL: no operation provided\n");
+		return -EINVAL;
+	}
+
+	if (ret != -EIOCBQUEUED) {
+		/*
+		 * There's no easy way to restart the syscall since other AIO's
+		 * may be already running. Just fail this IO with EINTR.
+		 */
+		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+			     ret == -ERESTARTNOHAND ||
+			     ret == -ERESTART_RESTARTBLOCK))
+			ret = -EINTR;
+		aio_complete(req, ret, 0);
+	}
+
+	return 0;
+}
+
+static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
+			 struct iocb *iocb, bool compat)
+{
+	struct aio_kiocb *req;
+	ssize_t ret;
+
+	/* enforce forwards compatibility on users */
+	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
+		pr_debug("EINVAL: reserve field set\n");
+		return -EINVAL;
+	}
+
+	/* prevent overflows */
+	if (unlikely(
+	    (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
+	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+	    ((ssize_t)iocb->aio_nbytes < 0)
+	   )) {
+		pr_debug("EINVAL: overflow check\n");
+		return -EINVAL;
+	}
+
+	req = aio_get_req(ctx);
+	if (unlikely(!req))
+		return -EAGAIN;
+
+	req->common.ki_filp = fget(iocb->aio_fildes);
+	if (unlikely(!req->common.ki_filp)) {
+		ret = -EBADF;
+		goto out_put_req;
+	}
+	req->common.ki_pos = iocb->aio_offset;
+	req->common.ki_complete = aio_complete;
+	req->common.ki_flags = iocb_flags(req->common.ki_filp);
+
+	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+		/*
+		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
+		 * instance of the file* now. The file descriptor must be
+		 * an eventfd() fd, and will be signaled for each completed
+		 * event using the eventfd_signal() function.
+		 */
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
+		if (IS_ERR(req->ki_eventfd)) {
+			ret = PTR_ERR(req->ki_eventfd);
+			req->ki_eventfd = NULL;
+			goto out_put_req;
+		}
+
+		req->common.ki_flags |= IOCB_EVENTFD;
+	}
+
+	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
+	if (unlikely(ret)) {
+		pr_debug("EFAULT: aio_key\n");
+		goto out_put_req;
+	}
+
+	req->ki_user_iocb = user_iocb;
+	req->ki_user_data = iocb->aio_data;
+
+	ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
+			   (char __user *)(unsigned long)iocb->aio_buf,
+			   iocb->aio_nbytes,
+			   compat);
+	if (ret)
+		goto out_put_req;
+
+	return 0;
+out_put_req:
+	put_reqs_available(ctx, 1);
+	percpu_ref_put(&ctx->reqs);
+	kiocb_free(req);
+	return ret;
+}
+
+long do_io_submit(aio_context_t ctx_id, long nr,
+		  struct iocb __user *__user *iocbpp, bool compat)
+{
+	struct kioctx *ctx;
+	long ret = 0;
+	int i = 0;
+	struct blk_plug plug;
+
+	if (unlikely(nr < 0))
+		return -EINVAL;
+
+	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
+		nr = LONG_MAX/sizeof(*iocbpp);
+
+	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
+		return -EFAULT;
+
+	ctx = lookup_ioctx(ctx_id);
+	if (unlikely(!ctx)) {
+		pr_debug("EINVAL: invalid context id\n");
+		return -EINVAL;
+	}
+
+	blk_start_plug(&plug);
+
+	/*
+	 * AKPM: should this return a partial result if some of the IOs were
+	 * successfully submitted?
+	 */
+	for (i=0; i<nr; i++) {
+		struct iocb __user *user_iocb;
+		struct iocb tmp;
+
+		if (unlikely(__get_user(user_iocb, iocbpp + i))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
+		if (ret)
+			break;
+	}
+	blk_finish_plug(&plug);
+
+	percpu_ref_put(&ctx->users);
+	return i ? i : ret;
+}
+
+/* sys_io_submit:
+ *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *	the number of iocbs queued.  May return -EINVAL if the aio_context
+ *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ *	*iocbpp[0] is not properly initialized, if the operation specified
+ *	is invalid for the file descriptor in the iocb.  May fail with
+ *	-EFAULT if any of the data structures point to invalid data.  May
+ *	fail with -EBADF if the file descriptor specified in the first
+ *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *	fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
+{
+	return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
+
+/* lookup_kiocb
+ *	Finds a given iocb for cancellation.
+ */
+static struct aio_kiocb *
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
+{
+	struct aio_kiocb *kiocb;
+
+	assert_spin_locked(&ctx->ctx_lock);
+
+	if (key != KIOCB_KEY)
+		return NULL;
+
+	/* TODO: use a hash or array, this sucks. */
+	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+		if (kiocb->ki_user_iocb == iocb)
+			return kiocb;
+	}
+	return NULL;
+}
+
+/* sys_io_cancel:
+ *	Attempts to cancel an iocb previously passed to io_submit.  If
+ *	the operation is successfully cancelled, the resulting event is
+ *	copied into the memory pointed to by result without being placed
+ *	into the completion queue and 0 is returned.  May fail with
+ *	-EFAULT if any of the data structures pointed to are invalid.
+ *	May fail with -EINVAL if aio_context specified by ctx_id is
+ *	invalid.  May fail with -EAGAIN if the iocb specified was not
+ *	cancelled.  Will fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
+		struct io_event __user *, result)
+{
+	struct kioctx *ctx;
+	struct aio_kiocb *kiocb;
+	u32 key;
+	int ret;
+
+	ret = get_user(key, &iocb->aio_key);
+	if (unlikely(ret))
+		return -EFAULT;
+
+	ctx = lookup_ioctx(ctx_id);
+	if (unlikely(!ctx))
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->ctx_lock);
+
+	kiocb = lookup_kiocb(ctx, iocb, key);
+	if (kiocb)
+		ret = kiocb_cancel(kiocb);
+	else
+		ret = -EINVAL;
+
+	spin_unlock_irq(&ctx->ctx_lock);
+
+	if (!ret) {
+		/*
+		 * The result argument is no longer used - the io_event is
+		 * always delivered via the ring buffer. -EINPROGRESS indicates
+		 * cancellation is progress:
+		 */
+		ret = -EINPROGRESS;
+	}
+
+	percpu_ref_put(&ctx->users);
+
+	return ret;
+}
+
+/* io_getevents:
+ *	Attempts to read at least min_nr events and up to nr events from
+ *	the completion queue for the aio_context specified by ctx_id. If
+ *	it succeeds, the number of read events is returned. May fail with
+ *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
+ *	out of range, if timeout is out of range.  May fail with -EFAULT
+ *	if any of the memory specified is invalid.  May return 0 or
+ *	< min_nr if the timeout specified by timeout has elapsed
+ *	before sufficient events are available, where timeout == NULL
+ *	specifies an infinite timeout. Note that the timeout pointed to by
+ *	timeout is relative.  Will fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
+		long, min_nr,
+		long, nr,
+		struct io_event __user *, events,
+		struct timespec __user *, timeout)
+{
+	struct kioctx *ioctx = lookup_ioctx(ctx_id);
+	long ret = -EINVAL;
+
+	if (likely(ioctx)) {
+		if (likely(min_nr <= nr && min_nr >= 0))
+			ret = read_events(ioctx, min_nr, nr, events, timeout);
+		percpu_ref_put(&ioctx->users);
+	}
+	return ret;
+}
diff --git a/kernel/fs/anon_inodes.c b/kernel/fs/anon_inodes.c
new file mode 100644
index 000000000..80ef38c73
--- /dev/null
+++ b/kernel/fs/anon_inodes.c
@@ -0,0 +1,179 @@
+/*
+ *  fs/anon_inodes.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ *  Thanks to Arnd Bergmann for code review and suggestions.
+ *  More changes for Thomas Gleixner suggestions.
+ *
+ */
+
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/magic.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/uaccess.h>
+
+static struct vfsmount *anon_inode_mnt __read_mostly;
+static struct inode *anon_inode_inode;
+
+/*
+ * anon_inodefs_dname() is called from d_path().
+ */
+static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
+				dentry->d_name.name);
+}
+
+static const struct dentry_operations anon_inodefs_dentry_operations = {
+	.d_dname	= anon_inodefs_dname,
+};
+
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
+{
+	return mount_pseudo(fs_type, "anon_inode:", NULL,
+			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+}
+
+static struct file_system_type anon_inode_fs_type = {
+	.name		= "anon_inodefs",
+	.mount		= anon_inodefs_mount,
+	.kill_sb	= kill_anon_super,
+};
+
+/**
+ * anon_inode_getfile - creates a new file instance by hooking it up to an
+ *                      anonymous inode, and a dentry that describe the "class"
+ *                      of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfile() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup.  Returns the newly created file* or an error pointer.
+ */
+struct file *anon_inode_getfile(const char *name,
+				const struct file_operations *fops,
+				void *priv, int flags)
+{
+	struct qstr this;
+	struct path path;
+	struct file *file;
+
+	if (IS_ERR(anon_inode_inode))
+		return ERR_PTR(-ENODEV);
+
+	if (fops->owner && !try_module_get(fops->owner))
+		return ERR_PTR(-ENOENT);
+
+	/*
+	 * Link the inode to a directory entry by creating a unique name
+	 * using the inode sequence number.
+	 */
+	file = ERR_PTR(-ENOMEM);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0;
+	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
+	if (!path.dentry)
+		goto err_module;
+
+	path.mnt = mntget(anon_inode_mnt);
+	/*
+	 * We know the anon_inode inode count is always greater than zero,
+	 * so ihold() is safe.
+	 */
+	ihold(anon_inode_inode);
+
+	d_instantiate(path.dentry, anon_inode_inode);
+
+	file = alloc_file(&path, OPEN_FMODE(flags), fops);
+	if (IS_ERR(file))
+		goto err_dput;
+	file->f_mapping = anon_inode_inode->i_mapping;
+
+	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
+	file->private_data = priv;
+
+	return file;
+
+err_dput:
+	path_put(&path);
+err_module:
+	module_put(fops->owner);
+	return file;
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile);
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
+ *                    anonymous inode, and a dentry that describe the "class"
+ *                    of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfd() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup.  Returns new descriptor or an error code.
+ */
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags)
+{
+	int error, fd;
+	struct file *file;
+
+	error = get_unused_fd_flags(flags);
+	if (error < 0)
+		return error;
+	fd = error;
+
+	file = anon_inode_getfile(name, fops, priv, flags);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
+	fd_install(fd, file);
+
+	return fd;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+	return error;
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfd);
+
+static int __init anon_inode_init(void)
+{
+	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
+	if (IS_ERR(anon_inode_mnt))
+		panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
+
+	anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+	if (IS_ERR(anon_inode_inode))
+		panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+
+	return 0;
+}
+
+fs_initcall(anon_inode_init);
+
diff --git a/kernel/fs/attr.c b/kernel/fs/attr.c
new file mode 100644
index 000000000..6530ced19
--- /dev/null
+++ b/kernel/fs/attr.c
@@ -0,0 +1,278 @@
+/*
+ *  linux/fs/attr.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  changes by Thomas Schoebel-Theuer
+ */
+
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/capability.h>
+#include <linux/fsnotify.h>
+#include <linux/fcntl.h>
+#include <linux/security.h>
+#include <linux/evm.h>
+#include <linux/ima.h>
+
+/**
+ * inode_change_ok - check if attribute changes to an inode are allowed
+ * @inode:	inode to check
+ * @attr:	attributes to change
+ *
+ * Check if we are allowed to change the attributes contained in @attr
+ * in the given inode.  This includes the normal unix access permission
+ * checks, as well as checks for rlimits and others.
+ *
+ * Should be called as the first thing in ->setattr implementations,
+ * possibly after taking additional locks.
+ */
+int inode_change_ok(const struct inode *inode, struct iattr *attr)
+{
+	unsigned int ia_valid = attr->ia_valid;
+
+	/*
+	 * First check size constraints.  These can't be overriden using
+	 * ATTR_FORCE.
+	 */
+	if (ia_valid & ATTR_SIZE) {
+		int error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	/* If force is set do it anyway. */
+	if (ia_valid & ATTR_FORCE)
+		return 0;
+
+	/* Make sure a caller can chown. */
+	if ((ia_valid & ATTR_UID) &&
+	    (!uid_eq(current_fsuid(), inode->i_uid) ||
+	     !uid_eq(attr->ia_uid, inode->i_uid)) &&
+	    !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+		return -EPERM;
+
+	/* Make sure caller can chgrp. */
+	if ((ia_valid & ATTR_GID) &&
+	    (!uid_eq(current_fsuid(), inode->i_uid) ||
+	    (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
+	    !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+		return -EPERM;
+
+	/* Make sure a caller can chmod. */
+	if (ia_valid & ATTR_MODE) {
+		if (!inode_owner_or_capable(inode))
+			return -EPERM;
+		/* Also check the setgid bit! */
+		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
+				inode->i_gid) &&
+		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+			attr->ia_mode &= ~S_ISGID;
+	}
+
+	/* Check for setting the inode time. */
+	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
+		if (!inode_owner_or_capable(inode))
+			return -EPERM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(inode_change_ok);
+
+/**
+ * inode_newsize_ok - may this inode be truncated to a given size
+ * @inode:	the inode to be truncated
+ * @offset:	the new size to assign to the inode
+ * @Returns:	0 on success, -ve errno on failure
+ *
+ * inode_newsize_ok must be called with i_mutex held.
+ *
+ * inode_newsize_ok will check filesystem limits and ulimits to check that the
+ * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
+ * when necessary. Caller must not proceed with inode size change if failure is
+ * returned. @inode must be a file (not directory), with appropriate
+ * permissions to allow truncate (inode_newsize_ok does NOT check these
+ * conditions).
+ */
+int inode_newsize_ok(const struct inode *inode, loff_t offset)
+{
+	if (inode->i_size < offset) {
+		unsigned long limit;
+
+		limit = rlimit(RLIMIT_FSIZE);
+		if (limit != RLIM_INFINITY && offset > limit)
+			goto out_sig;
+		if (offset > inode->i_sb->s_maxbytes)
+			goto out_big;
+	} else {
+		/*
+		 * truncation of in-use swapfiles is disallowed - it would
+		 * cause subsequent swapout to scribble on the now-freed
+		 * blocks.
+		 */
+		if (IS_SWAPFILE(inode))
+			return -ETXTBSY;
+	}
+
+	return 0;
+out_sig:
+	send_sig(SIGXFSZ, current, 0);
+out_big:
+	return -EFBIG;
+}
+EXPORT_SYMBOL(inode_newsize_ok);
+
+/**
+ * setattr_copy - copy simple metadata updates into the generic inode
+ * @inode:	the inode to be updated
+ * @attr:	the new attributes
+ *
+ * setattr_copy must be called with i_mutex held.
+ *
+ * setattr_copy updates the inode's metadata with that specified
+ * in attr. Noticeably missing is inode size update, which is more complex
+ * as it requires pagecache updates.
+ *
+ * The inode is not marked as dirty after this operation. The rationale is
+ * that for "simple" filesystems, the struct inode is the inode storage.
+ * The caller is free to mark the inode dirty afterwards if needed.
+ */
+void setattr_copy(struct inode *inode, const struct iattr *attr)
+{
+	unsigned int ia_valid = attr->ia_valid;
+
+	if (ia_valid & ATTR_UID)
+		inode->i_uid = attr->ia_uid;
+	if (ia_valid & ATTR_GID)
+		inode->i_gid = attr->ia_gid;
+	if (ia_valid & ATTR_ATIME)
+		inode->i_atime = timespec_trunc(attr->ia_atime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MTIME)
+		inode->i_mtime = timespec_trunc(attr->ia_mtime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_CTIME)
+		inode->i_ctime = timespec_trunc(attr->ia_ctime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = attr->ia_mode;
+
+		if (!in_group_p(inode->i_gid) &&
+		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+			mode &= ~S_ISGID;
+		inode->i_mode = mode;
+	}
+}
+EXPORT_SYMBOL(setattr_copy);
+
+/**
+ * notify_change - modify attributes of a filesytem object
+ * @dentry:	object affected
+ * @iattr:	new attributes
+ * @delegated_inode: returns inode, if the inode is delegated
+ *
+ * The caller must hold the i_mutex on the affected object.
+ *
+ * If notify_change discovers a delegation in need of breaking,
+ * it will return -EWOULDBLOCK and return a reference to the inode in
+ * delegated_inode.  The caller should then break the delegation and
+ * retry.  Because breaking a delegation may take a long time, the
+ * caller should drop the i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.  Also, passing NULL is fine for callers holding
+ * the file open for write, as there can be no conflicting delegation in
+ * that case.
+ */
+int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+{
+	struct inode *inode = dentry->d_inode;
+	umode_t mode = inode->i_mode;
+	int error;
+	struct timespec now;
+	unsigned int ia_valid = attr->ia_valid;
+
+	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+
+	if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
+		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+			return -EPERM;
+	}
+
+	if ((ia_valid & ATTR_MODE)) {
+		umode_t amode = attr->ia_mode;
+		/* Flag setting protected by i_mutex */
+		if (is_sxid(amode))
+			inode->i_flags &= ~S_NOSEC;
+	}
+
+	now = current_fs_time(inode->i_sb);
+
+	attr->ia_ctime = now;
+	if (!(ia_valid & ATTR_ATIME_SET))
+		attr->ia_atime = now;
+	if (!(ia_valid & ATTR_MTIME_SET))
+		attr->ia_mtime = now;
+	if (ia_valid & ATTR_KILL_PRIV) {
+		attr->ia_valid &= ~ATTR_KILL_PRIV;
+		ia_valid &= ~ATTR_KILL_PRIV;
+		error = security_inode_need_killpriv(dentry);
+		if (error > 0)
+			error = security_inode_killpriv(dentry);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * We now pass ATTR_KILL_S*ID to the lower level setattr function so
+	 * that the function has the ability to reinterpret a mode change
+	 * that's due to these bits. This adds an implicit restriction that
+	 * no function will ever call notify_change with both ATTR_MODE and
+	 * ATTR_KILL_S*ID set.
+	 */
+	if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) &&
+	    (ia_valid & ATTR_MODE))
+		BUG();
+
+	if (ia_valid & ATTR_KILL_SUID) {
+		if (mode & S_ISUID) {
+			ia_valid = attr->ia_valid |= ATTR_MODE;
+			attr->ia_mode = (inode->i_mode & ~S_ISUID);
+		}
+	}
+	if (ia_valid & ATTR_KILL_SGID) {
+		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+			if (!(ia_valid & ATTR_MODE)) {
+				ia_valid = attr->ia_valid |= ATTR_MODE;
+				attr->ia_mode = inode->i_mode;
+			}
+			attr->ia_mode &= ~S_ISGID;
+		}
+	}
+	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
+		return 0;
+
+	error = security_inode_setattr(dentry, attr);
+	if (error)
+		return error;
+	error = try_break_deleg(inode, delegated_inode);
+	if (error)
+		return error;
+
+	if (inode->i_op->setattr)
+		error = inode->i_op->setattr(dentry, attr);
+	else
+		error = simple_setattr(dentry, attr);
+
+	if (!error) {
+		fsnotify_change(dentry, ia_valid);
+		ima_inode_post_setattr(dentry);
+		evm_inode_post_setattr(dentry, ia_valid);
+	}
+
+	return error;
+}
+EXPORT_SYMBOL(notify_change);
diff --git a/kernel/fs/autofs4/Kconfig b/kernel/fs/autofs4/Kconfig
new file mode 100644
index 000000000..1204d6384
--- /dev/null
+++ b/kernel/fs/autofs4/Kconfig
@@ -0,0 +1,20 @@
+config AUTOFS4_FS
+	tristate "Kernel automounter version 4 support (also supports v3)"
+	help
+	  The automounter is a tool to automatically mount remote file systems
+	  on demand. This implementation is partially kernel-based to reduce
+	  overhead in the already-mounted case; this is unlike the BSD
+	  automounter (amd), which is a pure user space daemon.
+
+	  To use the automounter you need the user-space tools from
+	  <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
+	  want to answer Y to "NFS file system support", below.
+
+	  To compile this support as a module, choose M here: the module will be
+	  called autofs4.  You will need to add "alias autofs autofs4" to your
+	  modules configuration file.
+
+	  If you are not a part of a fairly large, distributed network or
+	  don't have a laptop which needs to dynamically reconfigure to the
+	  local network, you probably do not need an automounter, and can say
+	  N here.
diff --git a/kernel/fs/autofs4/Makefile b/kernel/fs/autofs4/Makefile
new file mode 100644
index 000000000..a811c1f7d
--- /dev/null
+++ b/kernel/fs/autofs4/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux autofs-filesystem routines.
+#
+
+obj-$(CONFIG_AUTOFS4_FS) += autofs4.o
+
+autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o
diff --git a/kernel/fs/autofs4/autofs_i.h b/kernel/fs/autofs4/autofs_i.h
new file mode 100644
index 000000000..63cf85daa
--- /dev/null
+++ b/kernel/fs/autofs4/autofs_i.h
@@ -0,0 +1,284 @@
+/* -*- c -*- ------------------------------------------------------------- *
+ *   
+ * linux/fs/autofs/autofs_i.h
+ *
+ *   Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
+ *   Copyright 2005-2006 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/* Internal header file for autofs */
+
+#include <linux/auto_fs4.h>
+#include <linux/auto_dev-ioctl.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+
+/* This is the range of ioctl() numbers we claim as ours */
+#define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
+#define AUTOFS_IOC_COUNT     32
+
+#define AUTOFS_DEV_IOCTL_IOC_FIRST	(AUTOFS_DEV_IOCTL_VERSION)
+#define AUTOFS_DEV_IOCTL_IOC_COUNT	(AUTOFS_IOC_COUNT - 11)
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/delay.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+
+/* #define DEBUG */
+
+#define DPRINTK(fmt, ...)				\
+	pr_debug("pid %d: %s: " fmt "\n",		\
+		current->pid, __func__, ##__VA_ARGS__)
+
+#define AUTOFS_WARN(fmt, ...)				\
+	printk(KERN_WARNING "pid %d: %s: " fmt "\n",	\
+		current->pid, __func__, ##__VA_ARGS__)
+
+#define AUTOFS_ERROR(fmt, ...)				\
+	printk(KERN_ERR "pid %d: %s: " fmt "\n",	\
+		current->pid, __func__, ##__VA_ARGS__)
+
+/* Unified info structure.  This is pointed to by both the dentry and
+   inode structures.  Each file in the filesystem has an instance of this
+   structure.  It holds a reference to the dentry, so dentries are never
+   flushed while the file exists.  All name lookups are dealt with at the
+   dentry level, although the filesystem can interfere in the validation
+   process.  Readdir is implemented by traversing the dentry lists. */
+struct autofs_info {
+	struct dentry	*dentry;
+	struct inode	*inode;
+
+	int		flags;
+
+	struct completion expire_complete;
+
+	struct list_head active;
+	int active_count;
+
+	struct list_head expiring;
+
+	struct autofs_sb_info *sbi;
+	unsigned long last_used;
+	atomic_t count;
+
+	kuid_t uid;
+	kgid_t gid;
+};
+
+#define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_NO_RCU	(1<<1) /* the dentry is being considered
+					* for expiry, so RCU_walk is
+					* not permitted
+					*/
+#define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
+
+struct autofs_wait_queue {
+	wait_queue_head_t queue;
+	struct autofs_wait_queue *next;
+	autofs_wqt_t wait_queue_token;
+	/* We use the following to see what we are waiting for */
+	struct qstr name;
+	u32 dev;
+	u64 ino;
+	kuid_t uid;
+	kgid_t gid;
+	pid_t pid;
+	pid_t tgid;
+	/* This is for status reporting upon return */
+	int status;
+	unsigned int wait_ctr;
+};
+
+#define AUTOFS_SBI_MAGIC 0x6d4a556d
+
+struct autofs_sb_info {
+	u32 magic;
+	int pipefd;
+	struct file *pipe;
+	struct pid *oz_pgrp;
+	int catatonic;
+	int version;
+	int sub_version;
+	int min_proto;
+	int max_proto;
+	unsigned long exp_timeout;
+	unsigned int type;
+	int reghost_enabled;
+	int needs_reghost;
+	struct super_block *sb;
+	struct mutex wq_mutex;
+	struct mutex pipe_mutex;
+	spinlock_t fs_lock;
+	struct autofs_wait_queue *queues; /* Wait queue pointer */
+	spinlock_t lookup_lock;
+	struct list_head active_list;
+	struct list_head expiring_list;
+	struct rcu_head rcu;
+};
+
+static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
+{
+	return (struct autofs_sb_info *)(sb->s_fs_info);
+}
+
+static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
+{
+	return (struct autofs_info *)(dentry->d_fsdata);
+}
+
+/* autofs4_oz_mode(): do we see the man behind the curtain?  (The
+   processes which do manipulations for us in user space sees the raw
+   filesystem without "magic".) */
+
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
+}
+
+struct inode *autofs4_get_inode(struct super_block *, umode_t);
+void autofs4_free_ino(struct autofs_info *);
+
+/* Expiration */
+int is_autofs4_dentry(struct dentry *);
+int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
+int autofs4_expire_run(struct super_block *, struct vfsmount *,
+			struct autofs_sb_info *,
+			struct autofs_packet_expire __user *);
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when);
+int autofs4_expire_multi(struct super_block *, struct vfsmount *,
+			struct autofs_sb_info *, int __user *);
+struct dentry *autofs4_expire_direct(struct super_block *sb,
+				     struct vfsmount *mnt,
+				     struct autofs_sb_info *sbi, int how);
+struct dentry *autofs4_expire_indirect(struct super_block *sb,
+				       struct vfsmount *mnt,
+				       struct autofs_sb_info *sbi, int how);
+
+/* Device node initialization */
+
+int autofs_dev_ioctl_init(void);
+void autofs_dev_ioctl_exit(void);
+
+/* Operations structures */
+
+extern const struct inode_operations autofs4_symlink_inode_operations;
+extern const struct inode_operations autofs4_dir_inode_operations;
+extern const struct file_operations autofs4_dir_operations;
+extern const struct file_operations autofs4_root_operations;
+extern const struct dentry_operations autofs4_dentry_operations;
+
+/* VFS automount flags management functions */
+static inline void __managed_dentry_set_managed(struct dentry *dentry)
+{
+	dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_set_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_managed(struct dentry *dentry)
+{
+	dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_clear_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+/* Initializing function */
+
+int autofs4_fill_super(struct super_block *, void *, int);
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
+void autofs4_clean_ino(struct autofs_info *);
+
+static inline int autofs_prepare_pipe(struct file *pipe)
+{
+	if (!(pipe->f_mode & FMODE_CAN_WRITE))
+		return -EINVAL;
+	if (!S_ISFIFO(file_inode(pipe)->i_mode))
+		return -EINVAL;
+	/* We want a packet pipe */
+	pipe->f_flags |= O_DIRECT;
+	return 0;
+}
+
+/* Queue management functions */
+
+int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+void autofs4_catatonic_mode(struct autofs_sb_info *);
+
+static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
+{
+	return new_encode_dev(sbi->sb->s_dev);
+}
+
+static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
+{
+	return d_inode(sbi->sb->s_root)->i_ino;
+}
+
+static inline int simple_positive(struct dentry *dentry)
+{
+	return d_really_is_positive(dentry) && !d_unhashed(dentry);
+}
+
+static inline void __autofs4_add_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+	}
+	return;
+}
+
+static inline void autofs4_add_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static inline void autofs4_del_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (!list_empty(&ino->expiring))
+			list_del_init(&ino->expiring);
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+extern void autofs4_kill_sb(struct super_block *);
diff --git a/kernel/fs/autofs4/dev-ioctl.c b/kernel/fs/autofs4/dev-ioctl.c
new file mode 100644
index 000000000..ac7d921ed
--- /dev/null
+++ b/kernel/fs/autofs4/dev-ioctl.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright 2008 Red Hat, Inc. All rights reserved.
+ * Copyright 2008 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/namei.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/magic.h>
+#include <linux/dcache.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+#include "autofs_i.h"
+
+/*
+ * This module implements an interface for routing autofs ioctl control
+ * commands via a miscellaneous device file.
+ *
+ * The alternate interface is needed because we need to be able open
+ * an ioctl file descriptor on an autofs mount that may be covered by
+ * another mount. This situation arises when starting automount(8)
+ * or other user space daemon which uses direct mounts or offset
+ * mounts (used for autofs lazy mount/umount of nested mount trees),
+ * which have been left busy at at service shutdown.
+ */
+
+#define AUTOFS_DEV_IOCTL_SIZE	sizeof(struct autofs_dev_ioctl)
+
+typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *,
+			struct autofs_dev_ioctl *);
+
+static int check_name(const char *name)
+{
+	if (!strchr(name, '/'))
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Check a string doesn't overrun the chunk of
+ * memory we copied from user land.
+ */
+static int invalid_str(char *str, size_t size)
+{
+	if (memchr(str, 0, size))
+		return 0;
+	return -EINVAL;
+}
+
+/*
+ * Check that the user compiled against correct version of autofs
+ * misc device code.
+ *
+ * As well as checking the version compatibility this always copies
+ * the kernel interface version out.
+ */
+static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
+{
+	int err = 0;
+
+	if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
+	    (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
+		AUTOFS_WARN("ioctl control interface version mismatch: "
+		     "kernel(%u.%u), user(%u.%u), cmd(%d)",
+		     AUTOFS_DEV_IOCTL_VERSION_MAJOR,
+		     AUTOFS_DEV_IOCTL_VERSION_MINOR,
+		     param->ver_major, param->ver_minor, cmd);
+		err = -EINVAL;
+	}
+
+	/* Fill in the kernel version. */
+	param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+	param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+
+	return err;
+}
+
+/*
+ * Copy parameter control struct, including a possible path allocated
+ * at the end of the struct.
+ */
+static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+{
+	struct autofs_dev_ioctl tmp, *res;
+
+	if (copy_from_user(&tmp, in, sizeof(tmp)))
+		return ERR_PTR(-EFAULT);
+
+	if (tmp.size < sizeof(tmp))
+		return ERR_PTR(-EINVAL);
+
+	if (tmp.size > (PATH_MAX + sizeof(tmp)))
+		return ERR_PTR(-ENAMETOOLONG);
+
+	res = memdup_user(in, tmp.size);
+	if (!IS_ERR(res))
+		res->size = tmp.size;
+
+	return res;
+}
+
+static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
+{
+	kfree(param);
+	return;
+}
+
+/*
+ * Check sanity of parameter control fields and if a path is present
+ * check that it is terminated and contains at least one "/".
+ */
+static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
+{
+	int err;
+
+	err = check_dev_ioctl_version(cmd, param);
+	if (err) {
+		AUTOFS_WARN("invalid device control module version "
+		     "supplied for cmd(0x%08x)", cmd);
+		goto out;
+	}
+
+	if (param->size > sizeof(*param)) {
+		err = invalid_str(param->path, param->size - sizeof(*param));
+		if (err) {
+			AUTOFS_WARN(
+			  "path string terminator missing for cmd(0x%08x)",
+			  cmd);
+			goto out;
+		}
+
+		err = check_name(param->path);
+		if (err) {
+			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
+				    cmd);
+			goto out;
+		}
+	}
+
+	err = 0;
+out:
+	return err;
+}
+
+/*
+ * Get the autofs super block info struct from the file opened on
+ * the autofs mount point.
+ */
+static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
+{
+	struct autofs_sb_info *sbi = NULL;
+	struct inode *inode;
+
+	if (f) {
+		inode = file_inode(f);
+		sbi = autofs4_sbi(inode->i_sb);
+	}
+	return sbi;
+}
+
+/* Return autofs module protocol version */
+static int autofs_dev_ioctl_protover(struct file *fp,
+				     struct autofs_sb_info *sbi,
+				     struct autofs_dev_ioctl *param)
+{
+	param->protover.version = sbi->version;
+	return 0;
+}
+
+/* Return autofs module protocol sub version */
+static int autofs_dev_ioctl_protosubver(struct file *fp,
+					struct autofs_sb_info *sbi,
+					struct autofs_dev_ioctl *param)
+{
+	param->protosubver.sub_version = sbi->sub_version;
+	return 0;
+}
+
+/* Find the topmost mount satisfying test() */
+static int find_autofs_mount(const char *pathname,
+			     struct path *res,
+			     int test(struct path *path, void *data),
+			     void *data)
+{
+	struct path path;
+	int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+	if (err)
+		return err;
+	err = -ENOENT;
+	while (path.dentry == path.mnt->mnt_root) {
+		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
+			if (test(&path, data)) {
+				path_get(&path);
+				*res = path;
+				err = 0;
+				break;
+			}
+		}
+		if (!follow_up(&path))
+			break;
+	}
+	path_put(&path);
+	return err;
+}
+
+static int test_by_dev(struct path *path, void *p)
+{
+	return path->dentry->d_sb->s_dev == *(dev_t *)p;
+}
+
+static int test_by_type(struct path *path, void *p)
+{
+	struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+	return ino && ino->sbi->type & *(unsigned *)p;
+}
+
+/*
+ * Open a file descriptor on the autofs mount point corresponding
+ * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
+ */
+static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
+{
+	int err, fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (likely(fd >= 0)) {
+		struct file *filp;
+		struct path path;
+
+		err = find_autofs_mount(name, &path, test_by_dev, &devid);
+		if (err)
+			goto out;
+
+		/*
+		 * Find autofs super block that has the device number
+		 * corresponding to the autofs fs we want to open.
+		 */
+
+		filp = dentry_open(&path, O_RDONLY, current_cred());
+		path_put(&path);
+		if (IS_ERR(filp)) {
+			err = PTR_ERR(filp);
+			goto out;
+		}
+
+		fd_install(fd, filp);
+	}
+
+	return fd;
+
+out:
+	put_unused_fd(fd);
+	return err;
+}
+
+/* Open a file descriptor on an autofs mount point */
+static int autofs_dev_ioctl_openmount(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	const char *path;
+	dev_t devid;
+	int err, fd;
+
+	/* param->path has already been checked */
+	if (!param->openmount.devid)
+		return -EINVAL;
+
+	param->ioctlfd = -1;
+
+	path = param->path;
+	devid = new_decode_dev(param->openmount.devid);
+
+	err = 0;
+	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
+	if (unlikely(fd < 0)) {
+		err = fd;
+		goto out;
+	}
+
+	param->ioctlfd = fd;
+out:
+	return err;
+}
+
+/* Close file descriptor allocated above (user can also use close(2)). */
+static int autofs_dev_ioctl_closemount(struct file *fp,
+				       struct autofs_sb_info *sbi,
+				       struct autofs_dev_ioctl *param)
+{
+	return sys_close(param->ioctlfd);
+}
+
+/*
+ * Send "ready" status for an existing wait (either a mount or an expire
+ * request).
+ */
+static int autofs_dev_ioctl_ready(struct file *fp,
+				  struct autofs_sb_info *sbi,
+				  struct autofs_dev_ioctl *param)
+{
+	autofs_wqt_t token;
+
+	token = (autofs_wqt_t) param->ready.token;
+	return autofs4_wait_release(sbi, token, 0);
+}
+
+/*
+ * Send "fail" status for an existing wait (either a mount or an expire
+ * request).
+ */
+static int autofs_dev_ioctl_fail(struct file *fp,
+				 struct autofs_sb_info *sbi,
+				 struct autofs_dev_ioctl *param)
+{
+	autofs_wqt_t token;
+	int status;
+
+	token = (autofs_wqt_t) param->fail.token;
+	status = param->fail.status ? param->fail.status : -ENOENT;
+	return autofs4_wait_release(sbi, token, status);
+}
+
+/*
+ * Set the pipe fd for kernel communication to the daemon.
+ *
+ * Normally this is set at mount using an option but if we
+ * are reconnecting to a busy mount then we need to use this
+ * to tell the autofs mount about the new kernel pipe fd. In
+ * order to protect mounts against incorrectly setting the
+ * pipefd we also require that the autofs mount be catatonic.
+ *
+ * This also sets the process group id used to identify the
+ * controlling process (eg. the owning automount(8) daemon).
+ */
+static int autofs_dev_ioctl_setpipefd(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	int pipefd;
+	int err = 0;
+	struct pid *new_pid = NULL;
+
+	if (param->setpipefd.pipefd == -1)
+		return -EINVAL;
+
+	pipefd = param->setpipefd.pipefd;
+
+	mutex_lock(&sbi->wq_mutex);
+	if (!sbi->catatonic) {
+		mutex_unlock(&sbi->wq_mutex);
+		return -EBUSY;
+	} else {
+		struct file *pipe;
+
+		new_pid = get_task_pid(current, PIDTYPE_PGID);
+
+		if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
+			AUTOFS_WARN("Not allowed to change PID namespace");
+			err = -EINVAL;
+			goto out;
+		}
+
+		pipe = fget(pipefd);
+		if (!pipe) {
+			err = -EBADF;
+			goto out;
+		}
+		if (autofs_prepare_pipe(pipe) < 0) {
+			err = -EPIPE;
+			fput(pipe);
+			goto out;
+		}
+		swap(sbi->oz_pgrp, new_pid);
+		sbi->pipefd = pipefd;
+		sbi->pipe = pipe;
+		sbi->catatonic = 0;
+	}
+out:
+	put_pid(new_pid);
+	mutex_unlock(&sbi->wq_mutex);
+	return err;
+}
+
+/*
+ * Make the autofs mount point catatonic, no longer responsive to
+ * mount requests. Also closes the kernel pipe file descriptor.
+ */
+static int autofs_dev_ioctl_catatonic(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	autofs4_catatonic_mode(sbi);
+	return 0;
+}
+
+/* Set the autofs mount timeout */
+static int autofs_dev_ioctl_timeout(struct file *fp,
+				    struct autofs_sb_info *sbi,
+				    struct autofs_dev_ioctl *param)
+{
+	unsigned long timeout;
+
+	timeout = param->timeout.timeout;
+	param->timeout.timeout = sbi->exp_timeout / HZ;
+	sbi->exp_timeout = timeout * HZ;
+	return 0;
+}
+
+/*
+ * Return the uid and gid of the last request for the mount
+ *
+ * When reconstructing an autofs mount tree with active mounts
+ * we need to re-connect to mounts that may have used the original
+ * process uid and gid (or string variations of them) for mount
+ * lookups within the map entry.
+ */
+static int autofs_dev_ioctl_requester(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	struct autofs_info *ino;
+	struct path path;
+	dev_t devid;
+	int err = -ENOENT;
+
+	if (param->size <= sizeof(*param)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	devid = sbi->sb->s_dev;
+
+	param->requester.uid = param->requester.gid = -1;
+
+	err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
+	if (err)
+		goto out;
+
+	ino = autofs4_dentry_ino(path.dentry);
+	if (ino) {
+		err = 0;
+		autofs4_expire_wait(path.dentry, 0);
+		spin_lock(&sbi->fs_lock);
+		param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
+		param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+		spin_unlock(&sbi->fs_lock);
+	}
+	path_put(&path);
+out:
+	return err;
+}
+
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more that can be done.
+ */
+static int autofs_dev_ioctl_expire(struct file *fp,
+				   struct autofs_sb_info *sbi,
+				   struct autofs_dev_ioctl *param)
+{
+	struct vfsmount *mnt;
+	int how;
+
+	how = param->expire.how;
+	mnt = fp->f_path.mnt;
+
+	return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
+}
+
+/* Check if autofs mount point is in use */
+static int autofs_dev_ioctl_askumount(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	param->askumount.may_umount = 0;
+	if (may_umount(fp->f_path.mnt))
+		param->askumount.may_umount = 1;
+	return 0;
+}
+
+/*
+ * Check if the given path is a mountpoint.
+ *
+ * If we are supplied with the file descriptor of an autofs
+ * mount we're looking for a specific mount. In this case
+ * the path is considered a mountpoint if it is itself a
+ * mountpoint or contains a mount, such as a multi-mount
+ * without a root mount. In this case we return 1 if the
+ * path is a mount point and the super magic of the covering
+ * mount if there is one or 0 if it isn't a mountpoint.
+ *
+ * If we aren't supplied with a file descriptor then we
+ * lookup the path and check if it is the root of a mount.
+ * If a type is given we are looking for a particular autofs
+ * mount and if we don't find a match we return fail. If the
+ * located path is the root of a mount we return 1 along with
+ * the super magic of the mount or 0 otherwise.
+ *
+ * In both cases the the device number (as returned by
+ * new_encode_dev()) is also returned.
+ */
+static int autofs_dev_ioctl_ismountpoint(struct file *fp,
+					 struct autofs_sb_info *sbi,
+					 struct autofs_dev_ioctl *param)
+{
+	struct path path;
+	const char *name;
+	unsigned int type;
+	unsigned int devid, magic;
+	int err = -ENOENT;
+
+	if (param->size <= sizeof(*param)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	name = param->path;
+	type = param->ismountpoint.in.type;
+
+	param->ismountpoint.out.devid = devid = 0;
+	param->ismountpoint.out.magic = magic = 0;
+
+	if (!fp || param->ioctlfd == -1) {
+		if (autofs_type_any(type))
+			err = kern_path_mountpoint(AT_FDCWD,
+						   name, &path, LOOKUP_FOLLOW);
+		else
+			err = find_autofs_mount(name, &path,
+						test_by_type, &type);
+		if (err)
+			goto out;
+		devid = new_encode_dev(path.dentry->d_sb->s_dev);
+		err = 0;
+		if (path.mnt->mnt_root == path.dentry) {
+			err = 1;
+			magic = path.dentry->d_sb->s_magic;
+		}
+	} else {
+		dev_t dev = sbi->sb->s_dev;
+
+		err = find_autofs_mount(name, &path, test_by_dev, &dev);
+		if (err)
+			goto out;
+
+		devid = new_encode_dev(dev);
+
+		err = have_submounts(path.dentry);
+
+		if (follow_down_one(&path))
+			magic = path.dentry->d_sb->s_magic;
+	}
+
+	param->ismountpoint.out.devid = devid;
+	param->ismountpoint.out.magic = magic;
+	path_put(&path);
+out:
+	return err;
+}
+
+/*
+ * Our range of ioctl numbers isn't 0 based so we need to shift
+ * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table
+ * lookup.
+ */
+#define cmd_idx(cmd)	(cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST))
+
+static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
+{
+	static struct {
+		int cmd;
+		ioctl_fn fn;
+	} _ioctls[] = {
+		{cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL},
+		{cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD),
+			 autofs_dev_ioctl_protover},
+		{cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD),
+			 autofs_dev_ioctl_protosubver},
+		{cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD),
+			 autofs_dev_ioctl_openmount},
+		{cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD),
+			 autofs_dev_ioctl_closemount},
+		{cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD),
+			 autofs_dev_ioctl_ready},
+		{cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD),
+			 autofs_dev_ioctl_fail},
+		{cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD),
+			 autofs_dev_ioctl_setpipefd},
+		{cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD),
+			 autofs_dev_ioctl_catatonic},
+		{cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD),
+			 autofs_dev_ioctl_timeout},
+		{cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD),
+			 autofs_dev_ioctl_requester},
+		{cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD),
+			 autofs_dev_ioctl_expire},
+		{cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD),
+			 autofs_dev_ioctl_askumount},
+		{cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD),
+			 autofs_dev_ioctl_ismountpoint}
+	};
+	unsigned int idx = cmd_idx(cmd);
+
+	return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn;
+}
+
+/* ioctl dispatcher */
+static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+{
+	struct autofs_dev_ioctl *param;
+	struct file *fp;
+	struct autofs_sb_info *sbi;
+	unsigned int cmd_first, cmd;
+	ioctl_fn fn = NULL;
+	int err = 0;
+
+	/* only root can play with this */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
+	cmd = _IOC_NR(command);
+
+	if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) ||
+	    cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) {
+		return -ENOTTY;
+	}
+
+	/* Copy the parameters into kernel space. */
+	param = copy_dev_ioctl(user);
+	if (IS_ERR(param))
+		return PTR_ERR(param);
+
+	err = validate_dev_ioctl(command, param);
+	if (err)
+		goto out;
+
+	/* The validate routine above always sets the version */
+	if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD)
+		goto done;
+
+	fn = lookup_dev_ioctl(cmd);
+	if (!fn) {
+		AUTOFS_WARN("unknown command 0x%08x", command);
+		return -ENOTTY;
+	}
+
+	fp = NULL;
+	sbi = NULL;
+
+	/*
+	 * For obvious reasons the openmount can't have a file
+	 * descriptor yet. We don't take a reference to the
+	 * file during close to allow for immediate release.
+	 */
+	if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
+	    cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
+		fp = fget(param->ioctlfd);
+		if (!fp) {
+			if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
+				goto cont;
+			err = -EBADF;
+			goto out;
+		}
+
+		sbi = autofs_dev_ioctl_sbi(fp);
+		if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
+			err = -EINVAL;
+			fput(fp);
+			goto out;
+		}
+
+		/*
+		 * Admin needs to be able to set the mount catatonic in
+		 * order to be able to perform the re-open.
+		 */
+		if (!autofs4_oz_mode(sbi) &&
+		    cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) {
+			err = -EACCES;
+			fput(fp);
+			goto out;
+		}
+	}
+cont:
+	err = fn(fp, sbi, param);
+
+	if (fp)
+		fput(fp);
+done:
+	if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE))
+		err = -EFAULT;
+out:
+	free_dev_ioctl(param);
+	return err;
+}
+
+static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
+{
+	int err;
+	err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
+	return (long) err;
+}
+
+#ifdef CONFIG_COMPAT
+static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u)
+{
+	return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u));
+}
+#else
+#define autofs_dev_ioctl_compat NULL
+#endif
+
+static const struct file_operations _dev_ioctl_fops = {
+	.unlocked_ioctl	 = autofs_dev_ioctl,
+	.compat_ioctl = autofs_dev_ioctl_compat,
+	.owner	 = THIS_MODULE,
+	.llseek = noop_llseek,
+};
+
+static struct miscdevice _autofs_dev_ioctl_misc = {
+	.minor		= AUTOFS_MINOR,
+	.name  		= AUTOFS_DEVICE_NAME,
+	.fops  		= &_dev_ioctl_fops
+};
+
+MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
+MODULE_ALIAS("devname:autofs");
+
+/* Register/deregister misc character device */
+int __init autofs_dev_ioctl_init(void)
+{
+	int r;
+
+	r = misc_register(&_autofs_dev_ioctl_misc);
+	if (r) {
+		AUTOFS_ERROR("misc_register failed for control device");
+		return r;
+	}
+
+	return 0;
+}
+
+void autofs_dev_ioctl_exit(void)
+{
+	misc_deregister(&_autofs_dev_ioctl_misc);
+	return;
+}
+
diff --git a/kernel/fs/autofs4/expire.c b/kernel/fs/autofs4/expire.c
new file mode 100644
index 000000000..d487fa27a
--- /dev/null
+++ b/kernel/fs/autofs4/expire.c
@@ -0,0 +1,603 @@
+/* -*- c -*- --------------------------------------------------------------- *
+ *
+ * linux/fs/autofs/expire.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *  Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ *  Copyright 2001-2006 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include "autofs_i.h"
+
+static unsigned long now;
+
+/* Check if a dentry can be expired */
+static inline int autofs4_can_expire(struct dentry *dentry,
+					unsigned long timeout, int do_now)
+{
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
+	/* dentry in the process of being deleted */
+	if (ino == NULL)
+		return 0;
+
+	if (!do_now) {
+		/* Too young to die */
+		if (!timeout || time_after(ino->last_used + timeout, now))
+			return 0;
+	}
+	return 1;
+}
+
+/* Check a mount point for busyness */
+static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct dentry *top = dentry;
+	struct path path = {.mnt = mnt, .dentry = dentry};
+	int status = 1;
+
+	DPRINTK("dentry %p %pd", dentry, dentry);
+
+	path_get(&path);
+
+	if (!follow_down_one(&path))
+		goto done;
+
+	if (is_autofs4_dentry(path.dentry)) {
+		struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb);
+
+		/* This is an autofs submount, we can't expire it */
+		if (autofs_type_indirect(sbi->type))
+			goto done;
+	}
+
+	/* Update the expiry counter if fs is busy */
+	if (!may_umount_tree(path.mnt)) {
+		struct autofs_info *ino = autofs4_dentry_ino(top);
+		ino->last_used = jiffies;
+		goto done;
+	}
+
+	status = 0;
+done:
+	DPRINTK("returning = %d", status);
+	path_put(&path);
+	return status;
+}
+
+/*
+ * Calculate and dget next entry in the subdirs list under root.
+ */
+static struct dentry *get_next_positive_subdir(struct dentry *prev,
+						struct dentry *root)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+	struct list_head *next;
+	struct dentry *q;
+
+	spin_lock(&sbi->lookup_lock);
+	spin_lock(&root->d_lock);
+
+	if (prev)
+		next = prev->d_child.next;
+	else {
+		prev = dget_dlock(root);
+		next = prev->d_subdirs.next;
+	}
+
+cont:
+	if (next == &root->d_subdirs) {
+		spin_unlock(&root->d_lock);
+		spin_unlock(&sbi->lookup_lock);
+		dput(prev);
+		return NULL;
+	}
+
+	q = list_entry(next, struct dentry, d_child);
+
+	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Already gone or negative dentry (under construction) - try next */
+	if (!d_count(q) || !simple_positive(q)) {
+		spin_unlock(&q->d_lock);
+		next = q->d_child.next;
+		goto cont;
+	}
+	dget_dlock(q);
+	spin_unlock(&q->d_lock);
+	spin_unlock(&root->d_lock);
+	spin_unlock(&sbi->lookup_lock);
+
+	dput(prev);
+
+	return q;
+}
+
+/*
+ * Calculate and dget next entry in top down tree traversal.
+ */
+static struct dentry *get_next_positive_dentry(struct dentry *prev,
+						struct dentry *root)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+	struct list_head *next;
+	struct dentry *p, *ret;
+
+	if (prev == NULL)
+		return dget(root);
+
+	spin_lock(&sbi->lookup_lock);
+relock:
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_subdirs.next;
+	if (next == &p->d_subdirs) {
+		while (1) {
+			struct dentry *parent;
+
+			if (p == root) {
+				spin_unlock(&p->d_lock);
+				spin_unlock(&sbi->lookup_lock);
+				dput(prev);
+				return NULL;
+			}
+
+			parent = p->d_parent;
+			if (!spin_trylock(&parent->d_lock)) {
+				spin_unlock(&p->d_lock);
+				cpu_chill();
+				goto relock;
+			}
+			spin_unlock(&p->d_lock);
+			next = p->d_child.next;
+			p = parent;
+			if (next != &parent->d_subdirs)
+				break;
+		}
+	}
+	ret = list_entry(next, struct dentry, d_child);
+
+	spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(ret)) {
+		spin_unlock(&p->d_lock);
+		lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_);
+		p = ret;
+		goto again;
+	}
+	dget_dlock(ret);
+	spin_unlock(&ret->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&sbi->lookup_lock);
+
+	dput(prev);
+
+	return ret;
+}
+
+/*
+ * Check a direct mount point for busyness.
+ * Direct mounts have similar expiry semantics to tree mounts.
+ * The tree is not busy iff no mountpoints are busy and there are no
+ * autofs submounts.
+ */
+static int autofs4_direct_busy(struct vfsmount *mnt,
+				struct dentry *top,
+				unsigned long timeout,
+				int do_now)
+{
+	DPRINTK("top %p %pd", top, top);
+
+	/* If it's busy update the expiry counters */
+	if (!may_umount_tree(mnt)) {
+		struct autofs_info *ino = autofs4_dentry_ino(top);
+		if (ino)
+			ino->last_used = jiffies;
+		return 1;
+	}
+
+	/* Timeout of a direct mount is determined by its top dentry */
+	if (!autofs4_can_expire(top, timeout, do_now))
+		return 1;
+
+	return 0;
+}
+
+/* Check a directory tree of mount points for busyness
+ * The tree is not busy iff no mountpoints are busy
+ */
+static int autofs4_tree_busy(struct vfsmount *mnt,
+	       		     struct dentry *top,
+			     unsigned long timeout,
+			     int do_now)
+{
+	struct autofs_info *top_ino = autofs4_dentry_ino(top);
+	struct dentry *p;
+
+	DPRINTK("top %p %pd", top, top);
+
+	/* Negative dentry - give up */
+	if (!simple_positive(top))
+		return 1;
+
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, top))) {
+		DPRINTK("dentry %p %pd", p, p);
+
+		/*
+		 * Is someone visiting anywhere in the subtree ?
+		 * If there's no mount we need to check the usage
+		 * count for the autofs dentry.
+		 * If the fs is busy update the expiry counter.
+		 */
+		if (d_mountpoint(p)) {
+			if (autofs4_mount_busy(mnt, p)) {
+				top_ino->last_used = jiffies;
+				dput(p);
+				return 1;
+			}
+		} else {
+			struct autofs_info *ino = autofs4_dentry_ino(p);
+			unsigned int ino_count = atomic_read(&ino->count);
+
+			/* allow for dget above and top is already dgot */
+			if (p == top)
+				ino_count += 2;
+			else
+				ino_count++;
+
+			if (d_count(p) > ino_count) {
+				top_ino->last_used = jiffies;
+				dput(p);
+				return 1;
+			}
+		}
+	}
+
+	/* Timeout of a tree mount is ultimately determined by its top dentry */
+	if (!autofs4_can_expire(top, timeout, do_now))
+		return 1;
+
+	return 0;
+}
+
+static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
+					   struct dentry *parent,
+					   unsigned long timeout,
+					   int do_now)
+{
+	struct dentry *p;
+
+	DPRINTK("parent %p %pd", parent, parent);
+
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, parent))) {
+		DPRINTK("dentry %p %pd", p, p);
+
+		if (d_mountpoint(p)) {
+			/* Can we umount this guy */
+			if (autofs4_mount_busy(mnt, p))
+				continue;
+
+			/* Can we expire this guy */
+			if (autofs4_can_expire(p, timeout, do_now))
+				return p;
+		}
+	}
+	return NULL;
+}
+
+/* Check if we can expire a direct mount (possibly a tree) */
+struct dentry *autofs4_expire_direct(struct super_block *sb,
+				     struct vfsmount *mnt,
+				     struct autofs_sb_info *sbi,
+				     int how)
+{
+	unsigned long timeout;
+	struct dentry *root = dget(sb->s_root);
+	int do_now = how & AUTOFS_EXP_IMMEDIATE;
+	struct autofs_info *ino;
+
+	if (!root)
+		return NULL;
+
+	now = jiffies;
+	timeout = sbi->exp_timeout;
+
+	spin_lock(&sbi->fs_lock);
+	ino = autofs4_dentry_ino(root);
+	/* No point expiring a pending mount */
+	if (ino->flags & AUTOFS_INF_PENDING)
+		goto out;
+	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+		ino->flags |= AUTOFS_INF_NO_RCU;
+		spin_unlock(&sbi->fs_lock);
+		synchronize_rcu();
+		spin_lock(&sbi->fs_lock);
+		if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+			ino->flags |= AUTOFS_INF_EXPIRING;
+			smp_mb();
+			ino->flags &= ~AUTOFS_INF_NO_RCU;
+			init_completion(&ino->expire_complete);
+			spin_unlock(&sbi->fs_lock);
+			return root;
+		}
+		ino->flags &= ~AUTOFS_INF_NO_RCU;
+	}
+out:
+	spin_unlock(&sbi->fs_lock);
+	dput(root);
+
+	return NULL;
+}
+
+/* Check if 'dentry' should expire, or return a nearby
+ * dentry that is suitable.
+ * If returned dentry is different from arg dentry,
+ * then a dget() reference was taken, else not.
+ */
+static struct dentry *should_expire(struct dentry *dentry,
+				    struct vfsmount *mnt,
+				    unsigned long timeout,
+				    int how)
+{
+	int do_now = how & AUTOFS_EXP_IMMEDIATE;
+	int exp_leaves = how & AUTOFS_EXP_LEAVES;
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	unsigned int ino_count;
+
+	/* No point expiring a pending mount */
+	if (ino->flags & AUTOFS_INF_PENDING)
+		return NULL;
+
+	/*
+	 * Case 1: (i) indirect mount or top level pseudo direct mount
+	 *	   (autofs-4.1).
+	 *	   (ii) indirect mount with offset mount, check the "/"
+	 *	   offset (autofs-5.0+).
+	 */
+	if (d_mountpoint(dentry)) {
+		DPRINTK("checking mountpoint %p %pd", dentry, dentry);
+
+		/* Can we umount this guy */
+		if (autofs4_mount_busy(mnt, dentry))
+			return NULL;
+
+		/* Can we expire this guy */
+		if (autofs4_can_expire(dentry, timeout, do_now))
+			return dentry;
+		return NULL;
+	}
+
+	if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
+		DPRINTK("checking symlink %p %pd", dentry, dentry);
+		/*
+		 * A symlink can't be "busy" in the usual sense so
+		 * just check last used for expire timeout.
+		 */
+		if (autofs4_can_expire(dentry, timeout, do_now))
+			return dentry;
+		return NULL;
+	}
+
+	if (simple_empty(dentry))
+		return NULL;
+
+	/* Case 2: tree mount, expire iff entire tree is not busy */
+	if (!exp_leaves) {
+		/* Path walk currently on this dentry? */
+		ino_count = atomic_read(&ino->count) + 1;
+		if (d_count(dentry) > ino_count)
+			return NULL;
+
+		if (!autofs4_tree_busy(mnt, dentry, timeout, do_now))
+			return dentry;
+	/*
+	 * Case 3: pseudo direct mount, expire individual leaves
+	 *	   (autofs-4.1).
+	 */
+	} else {
+		/* Path walk currently on this dentry? */
+		struct dentry *expired;
+		ino_count = atomic_read(&ino->count) + 1;
+		if (d_count(dentry) > ino_count)
+			return NULL;
+
+		expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
+		if (expired) {
+			if (expired == dentry)
+				dput(dentry);
+			return expired;
+		}
+	}
+	return NULL;
+}
+/*
+ * Find an eligible tree to time-out
+ * A tree is eligible if :-
+ *  - it is unused by any user process
+ *  - it has been unused for exp_timeout time
+ */
+struct dentry *autofs4_expire_indirect(struct super_block *sb,
+				       struct vfsmount *mnt,
+				       struct autofs_sb_info *sbi,
+				       int how)
+{
+	unsigned long timeout;
+	struct dentry *root = sb->s_root;
+	struct dentry *dentry;
+	struct dentry *expired;
+	struct autofs_info *ino;
+
+	if (!root)
+		return NULL;
+
+	now = jiffies;
+	timeout = sbi->exp_timeout;
+
+	dentry = NULL;
+	while ((dentry = get_next_positive_subdir(dentry, root))) {
+		spin_lock(&sbi->fs_lock);
+		ino = autofs4_dentry_ino(dentry);
+		if (ino->flags & AUTOFS_INF_NO_RCU)
+			expired = NULL;
+		else
+			expired = should_expire(dentry, mnt, timeout, how);
+		if (!expired) {
+			spin_unlock(&sbi->fs_lock);
+			continue;
+		}
+		ino = autofs4_dentry_ino(expired);
+		ino->flags |= AUTOFS_INF_NO_RCU;
+		spin_unlock(&sbi->fs_lock);
+		synchronize_rcu();
+		spin_lock(&sbi->fs_lock);
+		if (should_expire(expired, mnt, timeout, how)) {
+			if (expired != dentry)
+				dput(dentry);
+			goto found;
+		}
+
+		ino->flags &= ~AUTOFS_INF_NO_RCU;
+		if (expired != dentry)
+			dput(expired);
+		spin_unlock(&sbi->fs_lock);
+	}
+	return NULL;
+
+found:
+	DPRINTK("returning %p %pd", expired, expired);
+	ino->flags |= AUTOFS_INF_EXPIRING;
+	smp_mb();
+	ino->flags &= ~AUTOFS_INF_NO_RCU;
+	init_completion(&ino->expire_complete);
+	spin_unlock(&sbi->fs_lock);
+	spin_lock(&sbi->lookup_lock);
+	spin_lock(&expired->d_parent->d_lock);
+	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
+	list_move(&expired->d_parent->d_subdirs, &expired->d_child);
+	spin_unlock(&expired->d_lock);
+	spin_unlock(&expired->d_parent->d_lock);
+	spin_unlock(&sbi->lookup_lock);
+	return expired;
+}
+
+int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status;
+
+	/* Block on any pending expire */
+	if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)))
+		return 0;
+	if (rcu_walk)
+		return -ECHILD;
+
+	spin_lock(&sbi->fs_lock);
+	if (ino->flags & AUTOFS_INF_EXPIRING) {
+		spin_unlock(&sbi->fs_lock);
+
+		DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
+
+		status = autofs4_wait(sbi, dentry, NFY_NONE);
+		wait_for_completion(&ino->expire_complete);
+
+		DPRINTK("expire done status=%d", status);
+
+		if (d_unhashed(dentry))
+			return -EAGAIN;
+
+		return status;
+	}
+	spin_unlock(&sbi->fs_lock);
+
+	return 0;
+}
+
+/* Perform an expiry operation */
+int autofs4_expire_run(struct super_block *sb,
+		      struct vfsmount *mnt,
+		      struct autofs_sb_info *sbi,
+		      struct autofs_packet_expire __user *pkt_p)
+{
+	struct autofs_packet_expire pkt;
+	struct autofs_info *ino;
+	struct dentry *dentry;
+	int ret = 0;
+
+	memset(&pkt,0,sizeof pkt);
+
+	pkt.hdr.proto_version = sbi->version;
+	pkt.hdr.type = autofs_ptype_expire;
+
+	if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+		return -EAGAIN;
+
+	pkt.len = dentry->d_name.len;
+	memcpy(pkt.name, dentry->d_name.name, pkt.len);
+	pkt.name[pkt.len] = '\0';
+	dput(dentry);
+
+	if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+		ret = -EFAULT;
+
+	spin_lock(&sbi->fs_lock);
+	ino = autofs4_dentry_ino(dentry);
+	/* avoid rapid-fire expire attempts if expiry fails */
+	ino->last_used = now;
+	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	complete_all(&ino->expire_complete);
+	spin_unlock(&sbi->fs_lock);
+
+	return ret;
+}
+
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when)
+{
+	struct dentry *dentry;
+	int ret = -EAGAIN;
+
+	if (autofs_type_trigger(sbi->type))
+		dentry = autofs4_expire_direct(sb, mnt, sbi, when);
+	else
+		dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
+
+	if (dentry) {
+		struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
+		/* This is synchronous because it makes the daemon a
+                   little easier */
+		ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
+
+		spin_lock(&sbi->fs_lock);
+		/* avoid rapid-fire expire attempts if expiry fails */
+		ino->last_used = now;
+		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		complete_all(&ino->expire_complete);
+		spin_unlock(&sbi->fs_lock);
+		dput(dentry);
+	}
+
+	return ret;
+}
+
+/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+   more to be done */
+int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			struct autofs_sb_info *sbi, int __user *arg)
+{
+	int do_now = 0;
+
+	if (arg && get_user(do_now, arg))
+		return -EFAULT;
+
+	return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+}
+
diff --git a/kernel/fs/autofs4/init.c b/kernel/fs/autofs4/init.c
new file mode 100644
index 000000000..b3db517e8
--- /dev/null
+++ b/kernel/fs/autofs4/init.c
@@ -0,0 +1,52 @@
+/* -*- c -*- --------------------------------------------------------------- *
+ *
+ * linux/fs/autofs/init.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include "autofs_i.h"
+
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, autofs4_fill_super);
+}
+
+static struct file_system_type autofs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "autofs",
+	.mount		= autofs_mount,
+	.kill_sb	= autofs4_kill_sb,
+};
+MODULE_ALIAS_FS("autofs");
+
+static int __init init_autofs4_fs(void)
+{
+	int err;
+
+	autofs_dev_ioctl_init();
+
+	err = register_filesystem(&autofs_fs_type);
+	if (err)
+		autofs_dev_ioctl_exit();
+
+	return err;
+}
+
+static void __exit exit_autofs4_fs(void)
+{
+	autofs_dev_ioctl_exit();
+	unregister_filesystem(&autofs_fs_type);
+}
+
+module_init(init_autofs4_fs) 
+module_exit(exit_autofs4_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/autofs4/inode.c b/kernel/fs/autofs4/inode.c
new file mode 100644
index 000000000..a3ae0b2ae
--- /dev/null
+++ b/kernel/fs/autofs4/inode.c
@@ -0,0 +1,370 @@
+/* -*- c -*- --------------------------------------------------------------- *
+ *
+ * linux/fs/autofs/inode.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *  Copyright 2005-2006 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/bitops.h>
+#include <linux/magic.h>
+#include "autofs_i.h"
+#include <linux/module.h>
+
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
+{
+	struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+	if (ino) {
+		INIT_LIST_HEAD(&ino->active);
+		INIT_LIST_HEAD(&ino->expiring);
+		ino->last_used = jiffies;
+		ino->sbi = sbi;
+	}
+	return ino;
+}
+
+void autofs4_clean_ino(struct autofs_info *ino)
+{
+	ino->uid = GLOBAL_ROOT_UID;
+	ino->gid = GLOBAL_ROOT_GID;
+	ino->last_used = jiffies;
+}
+
+void autofs4_free_ino(struct autofs_info *ino)
+{
+	kfree(ino);
+}
+
+void autofs4_kill_sb(struct super_block *sb)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(sb);
+
+	/*
+	 * In the event of a failure in get_sb_nodev the superblock
+	 * info is not present so nothing else has been setup, so
+	 * just call kill_anon_super when we are called from
+	 * deactivate_super.
+	 */
+	if (sbi) {
+		/* Free wait queues, close pipe */
+		autofs4_catatonic_mode(sbi);
+		put_pid(sbi->oz_pgrp);
+	}
+
+	DPRINTK("shutting down");
+	kill_litter_super(sb);
+	if (sbi)
+		kfree_rcu(sbi, rcu);
+}
+
+static int autofs4_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+	struct inode *root_inode = d_inode(root->d_sb->s_root);
+
+	if (!sbi)
+		return 0;
+
+	seq_printf(m, ",fd=%d", sbi->pipefd);
+	if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+			from_kuid_munged(&init_user_ns, root_inode->i_uid));
+	if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+			from_kgid_munged(&init_user_ns, root_inode->i_gid));
+	seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
+	seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
+	seq_printf(m, ",minproto=%d", sbi->min_proto);
+	seq_printf(m, ",maxproto=%d", sbi->max_proto);
+
+	if (autofs_type_offset(sbi->type))
+		seq_printf(m, ",offset");
+	else if (autofs_type_direct(sbi->type))
+		seq_printf(m, ",direct");
+	else
+		seq_printf(m, ",indirect");
+
+	return 0;
+}
+
+static void autofs4_evict_inode(struct inode *inode)
+{
+	clear_inode(inode);
+	kfree(inode->i_private);
+}
+
+static const struct super_operations autofs4_sops = {
+	.statfs		= simple_statfs,
+	.show_options	= autofs4_show_options,
+	.evict_inode	= autofs4_evict_inode,
+};
+
+enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
+	Opt_indirect, Opt_direct, Opt_offset};
+
+static const match_table_t tokens = {
+	{Opt_fd, "fd=%u"},
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_pgrp, "pgrp=%u"},
+	{Opt_minproto, "minproto=%u"},
+	{Opt_maxproto, "maxproto=%u"},
+	{Opt_indirect, "indirect"},
+	{Opt_direct, "direct"},
+	{Opt_offset, "offset"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
+			 int *pgrp, bool *pgrp_set, unsigned int *type,
+			 int *minproto, int *maxproto)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	*uid = current_uid();
+	*gid = current_gid();
+
+	*minproto = AUTOFS_MIN_PROTO_VERSION;
+	*maxproto = AUTOFS_MAX_PROTO_VERSION;
+
+	*pipefd = -1;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_fd:
+			if (match_int(args, pipefd))
+				return 1;
+			break;
+		case Opt_uid:
+			if (match_int(args, &option))
+				return 1;
+			*uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(*uid))
+				return 1;
+			break;
+		case Opt_gid:
+			if (match_int(args, &option))
+				return 1;
+			*gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(*gid))
+				return 1;
+			break;
+		case Opt_pgrp:
+			if (match_int(args, &option))
+				return 1;
+			*pgrp = option;
+			*pgrp_set = true;
+			break;
+		case Opt_minproto:
+			if (match_int(args, &option))
+				return 1;
+			*minproto = option;
+			break;
+		case Opt_maxproto:
+			if (match_int(args, &option))
+				return 1;
+			*maxproto = option;
+			break;
+		case Opt_indirect:
+			set_autofs_type_indirect(type);
+			break;
+		case Opt_direct:
+			set_autofs_type_direct(type);
+			break;
+		case Opt_offset:
+			set_autofs_type_offset(type);
+			break;
+		default:
+			return 1;
+		}
+	}
+	return (*pipefd < 0);
+}
+
+int autofs4_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct inode * root_inode;
+	struct dentry * root;
+	struct file * pipe;
+	int pipefd;
+	struct autofs_sb_info *sbi;
+	struct autofs_info *ino;
+	int pgrp = 0;
+	bool pgrp_set = false;
+	int ret = -EINVAL;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	DPRINTK("starting up, sbi = %p",sbi);
+
+	s->s_fs_info = sbi;
+	sbi->magic = AUTOFS_SBI_MAGIC;
+	sbi->pipefd = -1;
+	sbi->pipe = NULL;
+	sbi->catatonic = 1;
+	sbi->exp_timeout = 0;
+	sbi->oz_pgrp = NULL;
+	sbi->sb = s;
+	sbi->version = 0;
+	sbi->sub_version = 0;
+	set_autofs_type_indirect(&sbi->type);
+	sbi->min_proto = 0;
+	sbi->max_proto = 0;
+	mutex_init(&sbi->wq_mutex);
+	mutex_init(&sbi->pipe_mutex);
+	spin_lock_init(&sbi->fs_lock);
+	sbi->queues = NULL;
+	spin_lock_init(&sbi->lookup_lock);
+	INIT_LIST_HEAD(&sbi->active_list);
+	INIT_LIST_HEAD(&sbi->expiring_list);
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = AUTOFS_SUPER_MAGIC;
+	s->s_op = &autofs4_sops;
+	s->s_d_op = &autofs4_dentry_operations;
+	s->s_time_gran = 1;
+
+	/*
+	 * Get the root inode and dentry, but defer checking for errors.
+	 */
+	ino = autofs4_new_ino(sbi);
+	if (!ino) {
+		ret = -ENOMEM;
+		goto fail_free;
+	}
+	root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
+	root = d_make_root(root_inode);
+	if (!root)
+		goto fail_ino;
+	pipe = NULL;
+
+	root->d_fsdata = ino;
+
+	/* Can this call block? */
+	if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
+			  &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
+			  &sbi->max_proto)) {
+		printk("autofs: called with bogus options\n");
+		goto fail_dput;
+	}
+
+	if (pgrp_set) {
+		sbi->oz_pgrp = find_get_pid(pgrp);
+		if (!sbi->oz_pgrp) {
+			pr_warn("autofs: could not find process group %d\n",
+				pgrp);
+			goto fail_dput;
+		}
+	} else {
+		sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+	}
+
+	if (autofs_type_trigger(sbi->type))
+		__managed_dentry_set_managed(root);
+
+	root_inode->i_fop = &autofs4_root_operations;
+	root_inode->i_op = &autofs4_dir_inode_operations;
+
+	/* Couldn't this be tested earlier? */
+	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
+	    sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
+		printk("autofs: kernel does not match daemon version "
+		       "daemon (%d, %d) kernel (%d, %d)\n",
+			sbi->min_proto, sbi->max_proto,
+			AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+		goto fail_dput;
+	}
+
+	/* Establish highest kernel protocol version */
+	if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION)
+		sbi->version = AUTOFS_MAX_PROTO_VERSION;
+	else
+		sbi->version = sbi->max_proto;
+	sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
+
+	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
+	pipe = fget(pipefd);
+
+	if (!pipe) {
+		printk("autofs: could not open pipe file descriptor\n");
+		goto fail_dput;
+	}
+	ret = autofs_prepare_pipe(pipe);
+	if (ret < 0)
+		goto fail_fput;
+	sbi->pipe = pipe;
+	sbi->pipefd = pipefd;
+	sbi->catatonic = 0;
+
+	/*
+	 * Success! Install the root dentry now to indicate completion.
+	 */
+	s->s_root = root;
+	return 0;
+	
+	/*
+	 * Failure ... clean up.
+	 */
+fail_fput:
+	printk("autofs: pipe file descriptor does not contain proper ops\n");
+	fput(pipe);
+	/* fall through */
+fail_dput:
+	dput(root);
+	goto fail_free;
+fail_ino:
+	kfree(ino);
+fail_free:
+	put_pid(sbi->oz_pgrp);
+	kfree(sbi);
+	s->s_fs_info = NULL;
+	return ret;
+}
+
+struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode == NULL)
+		return NULL;
+
+	inode->i_mode = mode;
+	if (sb->s_root) {
+		inode->i_uid = d_inode(sb->s_root)->i_uid;
+		inode->i_gid = d_inode(sb->s_root)->i_gid;
+	}
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = get_next_ino();
+
+	if (S_ISDIR(mode)) {
+		set_nlink(inode, 2);
+		inode->i_op = &autofs4_dir_inode_operations;
+		inode->i_fop = &autofs4_dir_operations;
+	} else if (S_ISLNK(mode)) {
+		inode->i_op = &autofs4_symlink_inode_operations;
+	}
+
+	return inode;
+}
diff --git a/kernel/fs/autofs4/root.c b/kernel/fs/autofs4/root.c
new file mode 100644
index 000000000..c6d7d3dbd
--- /dev/null
+++ b/kernel/fs/autofs4/root.c
@@ -0,0 +1,923 @@
+/* -*- c -*- --------------------------------------------------------------- *
+ *
+ * linux/fs/autofs/root.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *  Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ *  Copyright 2001-2006 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/compat.h>
+#include <linux/mutex.h>
+
+#include "autofs_i.h"
+
+static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
+static int autofs4_dir_unlink(struct inode *,struct dentry *);
+static int autofs4_dir_rmdir(struct inode *,struct dentry *);
+static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
+static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+#ifdef CONFIG_COMPAT
+static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+#endif
+static int autofs4_dir_open(struct inode *inode, struct file *file);
+static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct vfsmount *autofs4_d_automount(struct path *);
+static int autofs4_d_manage(struct dentry *, bool);
+static void autofs4_dentry_release(struct dentry *);
+
+const struct file_operations autofs4_root_operations = {
+	.open		= dcache_dir_open,
+	.release	= dcache_dir_close,
+	.read		= generic_read_dir,
+	.iterate	= dcache_readdir,
+	.llseek		= dcache_dir_lseek,
+	.unlocked_ioctl	= autofs4_root_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= autofs4_root_compat_ioctl,
+#endif
+};
+
+const struct file_operations autofs4_dir_operations = {
+	.open		= autofs4_dir_open,
+	.release	= dcache_dir_close,
+	.read		= generic_read_dir,
+	.iterate	= dcache_readdir,
+	.llseek		= dcache_dir_lseek,
+};
+
+const struct inode_operations autofs4_dir_inode_operations = {
+	.lookup		= autofs4_lookup,
+	.unlink		= autofs4_dir_unlink,
+	.symlink	= autofs4_dir_symlink,
+	.mkdir		= autofs4_dir_mkdir,
+	.rmdir		= autofs4_dir_rmdir,
+};
+
+const struct dentry_operations autofs4_dentry_operations = {
+	.d_automount	= autofs4_d_automount,
+	.d_manage	= autofs4_d_manage,
+	.d_release	= autofs4_dentry_release,
+};
+
+static void autofs4_add_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (!ino->active_count) {
+			if (list_empty(&ino->active))
+				list_add(&ino->active, &sbi->active_list);
+		}
+		ino->active_count++;
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static void autofs4_del_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		ino->active_count--;
+		if (!ino->active_count) {
+			if (!list_empty(&ino->active))
+				list_del_init(&ino->active);
+		}
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static int autofs4_dir_open(struct inode *inode, struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+	DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
+
+	if (autofs4_oz_mode(sbi))
+		goto out;
+
+	/*
+	 * An empty directory in an autofs file system is always a
+	 * mount point. The daemon must have failed to mount this
+	 * during lookup so it doesn't exist. This can happen, for
+	 * example, if user space returns an incorrect status for a
+	 * mount request. Otherwise we're doing a readdir on the
+	 * autofs file system so just let the libfs routines handle
+	 * it.
+	 */
+	spin_lock(&sbi->lookup_lock);
+	if (!d_mountpoint(dentry) && simple_empty(dentry)) {
+		spin_unlock(&sbi->lookup_lock);
+		return -ENOENT;
+	}
+	spin_unlock(&sbi->lookup_lock);
+
+out:
+	return dcache_dir_open(inode, file);
+}
+
+static void autofs4_dentry_release(struct dentry *de)
+{
+	struct autofs_info *ino = autofs4_dentry_ino(de);
+	struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
+
+	DPRINTK("releasing %p", de);
+
+	if (!ino)
+		return;
+
+	if (sbi) {
+		spin_lock(&sbi->lookup_lock);
+		if (!list_empty(&ino->active))
+			list_del(&ino->active);
+		if (!list_empty(&ino->expiring))
+			list_del(&ino->expiring);
+		spin_unlock(&sbi->lookup_lock);
+	}
+
+	autofs4_free_ino(ino);
+}
+
+static struct dentry *autofs4_lookup_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct dentry *parent = dentry->d_parent;
+	struct qstr *name = &dentry->d_name;
+	unsigned int len = name->len;
+	unsigned int hash = name->hash;
+	const unsigned char *str = name->name;
+	struct list_head *p, *head;
+
+	head = &sbi->active_list;
+	if (list_empty(head))
+		return NULL;
+	spin_lock(&sbi->lookup_lock);
+	list_for_each(p, head) {
+		struct autofs_info *ino;
+		struct dentry *active;
+		struct qstr *qstr;
+
+		ino = list_entry(p, struct autofs_info, active);
+		active = ino->dentry;
+
+		spin_lock(&active->d_lock);
+
+		/* Already gone? */
+		if ((int) d_count(active) <= 0)
+			goto next;
+
+		qstr = &active->d_name;
+
+		if (active->d_name.hash != hash)
+			goto next;
+		if (active->d_parent != parent)
+			goto next;
+
+		if (qstr->len != len)
+			goto next;
+		if (memcmp(qstr->name, str, len))
+			goto next;
+
+		if (d_unhashed(active)) {
+			dget_dlock(active);
+			spin_unlock(&active->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			return active;
+		}
+next:
+		spin_unlock(&active->d_lock);
+	}
+	spin_unlock(&sbi->lookup_lock);
+
+	return NULL;
+}
+
+static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
+					      bool rcu_walk)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct dentry *parent = dentry->d_parent;
+	struct qstr *name = &dentry->d_name;
+	unsigned int len = name->len;
+	unsigned int hash = name->hash;
+	const unsigned char *str = name->name;
+	struct list_head *p, *head;
+
+	head = &sbi->expiring_list;
+	if (list_empty(head))
+		return NULL;
+	spin_lock(&sbi->lookup_lock);
+	list_for_each(p, head) {
+		struct autofs_info *ino;
+		struct dentry *expiring;
+		struct qstr *qstr;
+
+		if (rcu_walk) {
+			spin_unlock(&sbi->lookup_lock);
+			return ERR_PTR(-ECHILD);
+		}
+
+		ino = list_entry(p, struct autofs_info, expiring);
+		expiring = ino->dentry;
+
+		spin_lock(&expiring->d_lock);
+
+		/* We've already been dentry_iput or unlinked */
+		if (d_really_is_negative(expiring))
+			goto next;
+
+		qstr = &expiring->d_name;
+
+		if (expiring->d_name.hash != hash)
+			goto next;
+		if (expiring->d_parent != parent)
+			goto next;
+
+		if (qstr->len != len)
+			goto next;
+		if (memcmp(qstr->name, str, len))
+			goto next;
+
+		if (d_unhashed(expiring)) {
+			dget_dlock(expiring);
+			spin_unlock(&expiring->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			return expiring;
+		}
+next:
+		spin_unlock(&expiring->d_lock);
+	}
+	spin_unlock(&sbi->lookup_lock);
+
+	return NULL;
+}
+
+static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status = 0;
+
+	if (ino->flags & AUTOFS_INF_PENDING) {
+		if (rcu_walk)
+			return -ECHILD;
+		DPRINTK("waiting for mount name=%pd", dentry);
+		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+		DPRINTK("mount wait done status=%d", status);
+	}
+	ino->last_used = jiffies;
+	return status;
+}
+
+static int do_expire_wait(struct dentry *dentry, bool rcu_walk)
+{
+	struct dentry *expiring;
+
+	expiring = autofs4_lookup_expiring(dentry, rcu_walk);
+	if (IS_ERR(expiring))
+		return PTR_ERR(expiring);
+	if (!expiring)
+		return autofs4_expire_wait(dentry, rcu_walk);
+	else {
+		/*
+		 * If we are racing with expire the request might not
+		 * be quite complete, but the directory has been removed
+		 * so it must have been successful, just wait for it.
+		 */
+		autofs4_expire_wait(expiring, 0);
+		autofs4_del_expiring(expiring);
+		dput(expiring);
+	}
+	return 0;
+}
+
+static struct dentry *autofs4_mountpoint_changed(struct path *path)
+{
+	struct dentry *dentry = path->dentry;
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+	/*
+	 * If this is an indirect mount the dentry could have gone away
+	 * as a result of an expire and a new one created.
+	 */
+	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
+		struct dentry *parent = dentry->d_parent;
+		struct autofs_info *ino;
+		struct dentry *new = d_lookup(parent, &dentry->d_name);
+		if (!new)
+			return NULL;
+		ino = autofs4_dentry_ino(new);
+		ino->last_used = jiffies;
+		dput(path->dentry);
+		path->dentry = new;
+	}
+	return path->dentry;
+}
+
+static struct vfsmount *autofs4_d_automount(struct path *path)
+{
+	struct dentry *dentry = path->dentry;
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status;
+
+	DPRINTK("dentry=%p %pd", dentry, dentry);
+
+	/* The daemon never triggers a mount. */
+	if (autofs4_oz_mode(sbi))
+		return NULL;
+
+	/*
+	 * If an expire request is pending everyone must wait.
+	 * If the expire fails we're still mounted so continue
+	 * the follow and return. A return of -EAGAIN (which only
+	 * happens with indirect mounts) means the expire completed
+	 * and the directory was removed, so just go ahead and try
+	 * the mount.
+	 */
+	status = do_expire_wait(dentry, 0);
+	if (status && status != -EAGAIN)
+		return NULL;
+
+	/* Callback to the daemon to perform the mount or wait */
+	spin_lock(&sbi->fs_lock);
+	if (ino->flags & AUTOFS_INF_PENDING) {
+		spin_unlock(&sbi->fs_lock);
+		status = autofs4_mount_wait(dentry, 0);
+		if (status)
+			return ERR_PTR(status);
+		goto done;
+	}
+
+	/*
+	 * If the dentry is a symlink it's equivalent to a directory
+	 * having d_mountpoint() true, so there's no need to call back
+	 * to the daemon.
+	 */
+	if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
+		spin_unlock(&sbi->fs_lock);
+		goto done;
+	}
+
+	if (!d_mountpoint(dentry)) {
+		/*
+		 * It's possible that user space hasn't removed directories
+		 * after umounting a rootless multi-mount, although it
+		 * should. For v5 have_submounts() is sufficient to handle
+		 * this because the leaves of the directory tree under the
+		 * mount never trigger mounts themselves (they have an autofs
+		 * trigger mount mounted on them). But v4 pseudo direct mounts
+		 * do need the leaves to trigger mounts. In this case we
+		 * have no choice but to use the list_empty() check and
+		 * require user space behave.
+		 */
+		if (sbi->version > 4) {
+			if (have_submounts(dentry)) {
+				spin_unlock(&sbi->fs_lock);
+				goto done;
+			}
+		} else {
+			if (!simple_empty(dentry)) {
+				spin_unlock(&sbi->fs_lock);
+				goto done;
+			}
+		}
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
+		status = autofs4_mount_wait(dentry, 0);
+		spin_lock(&sbi->fs_lock);
+		ino->flags &= ~AUTOFS_INF_PENDING;
+		if (status) {
+			spin_unlock(&sbi->fs_lock);
+			return ERR_PTR(status);
+		}
+	}
+	spin_unlock(&sbi->fs_lock);
+done:
+	/* Mount succeeded, check if we ended up with a new dentry */
+	dentry = autofs4_mountpoint_changed(path);
+	if (!dentry)
+		return ERR_PTR(-ENOENT);
+
+	return NULL;
+}
+
+static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status;
+
+	DPRINTK("dentry=%p %pd", dentry, dentry);
+
+	/* The daemon never waits. */
+	if (autofs4_oz_mode(sbi)) {
+		if (!d_mountpoint(dentry))
+			return -EISDIR;
+		return 0;
+	}
+
+	/* Wait for pending expires */
+	if (do_expire_wait(dentry, rcu_walk) == -ECHILD)
+		return -ECHILD;
+
+	/*
+	 * This dentry may be under construction so wait on mount
+	 * completion.
+	 */
+	status = autofs4_mount_wait(dentry, rcu_walk);
+	if (status)
+		return status;
+
+	if (rcu_walk) {
+		/* We don't need fs_lock in rcu_walk mode,
+		 * just testing 'AUTOFS_INFO_NO_RCU' is enough.
+		 * simple_empty() takes a spinlock, so leave it
+		 * to last.
+		 * We only return -EISDIR when certain this isn't
+		 * a mount-trap.
+		 */
+		struct inode *inode;
+		if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
+			return 0;
+		if (d_mountpoint(dentry))
+			return 0;
+		inode = d_inode_rcu(dentry);
+		if (inode && S_ISLNK(inode->i_mode))
+			return -EISDIR;
+		if (list_empty(&dentry->d_subdirs))
+			return 0;
+		if (!simple_empty(dentry))
+			return -EISDIR;
+		return 0;
+	}
+
+	spin_lock(&sbi->fs_lock);
+	/*
+	 * If the dentry has been selected for expire while we slept
+	 * on the lock then it might go away. We'll deal with that in
+	 * ->d_automount() and wait on a new mount if the expire
+	 * succeeds or return here if it doesn't (since there's no
+	 * mount to follow with a rootless multi-mount).
+	 */
+	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
+		/*
+		 * Any needed mounting has been completed and the path
+		 * updated so check if this is a rootless multi-mount so
+		 * we can avoid needless calls ->d_automount() and avoid
+		 * an incorrect ELOOP error return.
+		 */
+		if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
+		    (d_really_is_positive(dentry) && d_is_symlink(dentry)))
+			status = -EISDIR;
+	}
+	spin_unlock(&sbi->fs_lock);
+
+	return status;
+}
+
+/* Lookups in the root directory */
+static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	struct autofs_sb_info *sbi;
+	struct autofs_info *ino;
+	struct dentry *active;
+
+	DPRINTK("name = %pd", dentry);
+
+	/* File name too long to exist */
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	sbi = autofs4_sbi(dir->i_sb);
+
+	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
+		current->pid, task_pgrp_nr(current), sbi->catatonic,
+		autofs4_oz_mode(sbi));
+
+	active = autofs4_lookup_active(dentry);
+	if (active) {
+		return active;
+	} else {
+		/*
+		 * A dentry that is not within the root can never trigger a
+		 * mount operation, unless the directory already exists, so we
+		 * can return fail immediately.  The daemon however does need
+		 * to create directories within the file system.
+		 */
+		if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
+			return ERR_PTR(-ENOENT);
+
+		/* Mark entries in the root as mount triggers */
+		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+			__managed_dentry_set_managed(dentry);
+
+		ino = autofs4_new_ino(sbi);
+		if (!ino)
+			return ERR_PTR(-ENOMEM);
+
+		dentry->d_fsdata = ino;
+		ino->dentry = dentry;
+
+		autofs4_add_active(dentry);
+
+		d_instantiate(dentry, NULL);
+	}
+	return NULL;
+}
+
+static int autofs4_dir_symlink(struct inode *dir, 
+			       struct dentry *dentry,
+			       const char *symname)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *p_ino;
+	struct inode *inode;
+	size_t size = strlen(symname);
+	char *cp;
+
+	DPRINTK("%s <- %pd", symname, dentry);
+
+	if (!autofs4_oz_mode(sbi))
+		return -EACCES;
+
+	BUG_ON(!ino);
+
+	autofs4_clean_ino(ino);
+
+	autofs4_del_active(dentry);
+
+	cp = kmalloc(size + 1, GFP_KERNEL);
+	if (!cp)
+		return -ENOMEM;
+
+	strcpy(cp, symname);
+
+	inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
+	if (!inode) {
+		kfree(cp);
+		if (!dentry->d_fsdata)
+			kfree(ino);
+		return -ENOMEM;
+	}
+	inode->i_private = cp;
+	inode->i_size = size;
+	d_add(dentry, inode);
+
+	dget(dentry);
+	atomic_inc(&ino->count);
+	p_ino = autofs4_dentry_ino(dentry->d_parent);
+	if (p_ino && !IS_ROOT(dentry))
+		atomic_inc(&p_ino->count);
+
+	dir->i_mtime = CURRENT_TIME;
+
+	return 0;
+}
+
+/*
+ * NOTE!
+ *
+ * Normal filesystems would do a "d_delete()" to tell the VFS dcache
+ * that the file no longer exists. However, doing that means that the
+ * VFS layer can turn the dentry into a negative dentry.  We don't want
+ * this, because the unlink is probably the result of an expire.
+ * We simply d_drop it and add it to a expiring list in the super block,
+ * which allows the dentry lookup to check for an incomplete expire.
+ *
+ * If a process is blocked on the dentry waiting for the expire to finish,
+ * it will invalidate the dentry and try to mount with a new one.
+ *
+ * Also see autofs4_dir_rmdir()..
+ */
+static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *p_ino;
+	
+	/* This allows root to remove symlinks */
+	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (atomic_dec_and_test(&ino->count)) {
+		p_ino = autofs4_dentry_ino(dentry->d_parent);
+		if (p_ino && !IS_ROOT(dentry))
+			atomic_dec(&p_ino->count);
+	}
+	dput(ino->dentry);
+
+	d_inode(dentry)->i_size = 0;
+	clear_nlink(d_inode(dentry));
+
+	dir->i_mtime = CURRENT_TIME;
+
+	spin_lock(&sbi->lookup_lock);
+	__autofs4_add_expiring(dentry);
+	d_drop(dentry);
+	spin_unlock(&sbi->lookup_lock);
+
+	return 0;
+}
+
+/*
+ * Version 4 of autofs provides a pseudo direct mount implementation
+ * that relies on directories at the leaves of a directory tree under
+ * an indirect mount to trigger mounts. To allow for this we need to
+ * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
+ * of the directory tree. There is no need to clear the automount flag
+ * following a mount or restore it after an expire because these mounts
+ * are always covered. However, it is necessary to ensure that these
+ * flags are clear on non-empty directories to avoid unnecessary calls
+ * during path walks.
+ */
+static void autofs_set_leaf_automount_flags(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	/* root and dentrys in the root are already handled */
+	if (IS_ROOT(dentry->d_parent))
+		return;
+
+	managed_dentry_set_managed(dentry);
+
+	parent = dentry->d_parent;
+	/* only consider parents below dentrys in the root */
+	if (IS_ROOT(parent->d_parent))
+		return;
+	managed_dentry_clear_managed(parent);
+	return;
+}
+
+static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
+{
+	struct list_head *d_child;
+	struct dentry *parent;
+
+	/* flags for dentrys in the root are handled elsewhere */
+	if (IS_ROOT(dentry->d_parent))
+		return;
+
+	managed_dentry_clear_managed(dentry);
+
+	parent = dentry->d_parent;
+	/* only consider parents below dentrys in the root */
+	if (IS_ROOT(parent->d_parent))
+		return;
+	d_child = &dentry->d_child;
+	/* Set parent managed if it's becoming empty */
+	if (d_child->next == &parent->d_subdirs &&
+	    d_child->prev == &parent->d_subdirs)
+		managed_dentry_set_managed(parent);
+	return;
+}
+
+static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *p_ino;
+	
+	DPRINTK("dentry %p, removing %pd", dentry, dentry);
+
+	if (!autofs4_oz_mode(sbi))
+		return -EACCES;
+
+	spin_lock(&sbi->lookup_lock);
+	if (!simple_empty(dentry)) {
+		spin_unlock(&sbi->lookup_lock);
+		return -ENOTEMPTY;
+	}
+	__autofs4_add_expiring(dentry);
+	d_drop(dentry);
+	spin_unlock(&sbi->lookup_lock);
+
+	if (sbi->version < 5)
+		autofs_clear_leaf_automount_flags(dentry);
+
+	if (atomic_dec_and_test(&ino->count)) {
+		p_ino = autofs4_dentry_ino(dentry->d_parent);
+		if (p_ino && dentry->d_parent != dentry)
+			atomic_dec(&p_ino->count);
+	}
+	dput(ino->dentry);
+	d_inode(dentry)->i_size = 0;
+	clear_nlink(d_inode(dentry));
+
+	if (dir->i_nlink)
+		drop_nlink(dir);
+
+	return 0;
+}
+
+static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *p_ino;
+	struct inode *inode;
+
+	if (!autofs4_oz_mode(sbi))
+		return -EACCES;
+
+	DPRINTK("dentry %p, creating %pd", dentry, dentry);
+
+	BUG_ON(!ino);
+
+	autofs4_clean_ino(ino);
+
+	autofs4_del_active(dentry);
+
+	inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
+	if (!inode)
+		return -ENOMEM;
+	d_add(dentry, inode);
+
+	if (sbi->version < 5)
+		autofs_set_leaf_automount_flags(dentry);
+
+	dget(dentry);
+	atomic_inc(&ino->count);
+	p_ino = autofs4_dentry_ino(dentry->d_parent);
+	if (p_ino && !IS_ROOT(dentry))
+		atomic_inc(&p_ino->count);
+	inc_nlink(dir);
+	dir->i_mtime = CURRENT_TIME;
+
+	return 0;
+}
+
+/* Get/set timeout ioctl() operation */
+#ifdef CONFIG_COMPAT
+static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
+					 compat_ulong_t __user *p)
+{
+	int rv;
+	unsigned long ntimeout;
+
+	if ((rv = get_user(ntimeout, p)) ||
+	     (rv = put_user(sbi->exp_timeout/HZ, p)))
+		return rv;
+
+	if (ntimeout > UINT_MAX/HZ)
+		sbi->exp_timeout = 0;
+	else
+		sbi->exp_timeout = ntimeout * HZ;
+
+	return 0;
+}
+#endif
+
+static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
+					 unsigned long __user *p)
+{
+	int rv;
+	unsigned long ntimeout;
+
+	if ((rv = get_user(ntimeout, p)) ||
+	     (rv = put_user(sbi->exp_timeout/HZ, p)))
+		return rv;
+
+	if (ntimeout > ULONG_MAX/HZ)
+		sbi->exp_timeout = 0;
+	else
+		sbi->exp_timeout = ntimeout * HZ;
+
+	return 0;
+}
+
+/* Return protocol version */
+static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p)
+{
+	return put_user(sbi->version, p);
+}
+
+/* Return protocol sub version */
+static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p)
+{
+	return put_user(sbi->sub_version, p);
+}
+
+/*
+* Tells the daemon whether it can umount the autofs mount.
+*/
+static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
+{
+	int status = 0;
+
+	if (may_umount(mnt))
+		status = 1;
+
+	DPRINTK("returning %d", status);
+
+	status = put_user(status, p);
+
+	return status;
+}
+
+/* Identify autofs4_dentries - this is so we can tell if there's
+   an extra dentry refcount or not.  We only hold a refcount on the
+   dentry if its non-negative (ie, d_inode != NULL)
+*/
+int is_autofs4_dentry(struct dentry *dentry)
+{
+	return dentry && d_really_is_positive(dentry) &&
+		dentry->d_op == &autofs4_dentry_operations &&
+		dentry->d_fsdata != NULL;
+}
+
+/*
+ * ioctl()'s on the root directory is the chief method for the daemon to
+ * generate kernel reactions
+ */
+static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
+				       unsigned int cmd, unsigned long arg)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
+	void __user *p = (void __user *)arg;
+
+	DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
+		cmd,arg,sbi,task_pgrp_nr(current));
+
+	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
+	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
+		return -ENOTTY;
+	
+	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	
+	switch(cmd) {
+	case AUTOFS_IOC_READY:	/* Wait queue: go ahead and retry */
+		return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0);
+	case AUTOFS_IOC_FAIL:	/* Wait queue: fail with ENOENT */
+		return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
+	case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
+		autofs4_catatonic_mode(sbi);
+		return 0;
+	case AUTOFS_IOC_PROTOVER: /* Get protocol version */
+		return autofs4_get_protover(sbi, p);
+	case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */
+		return autofs4_get_protosubver(sbi, p);
+	case AUTOFS_IOC_SETTIMEOUT:
+		return autofs4_get_set_timeout(sbi, p);
+#ifdef CONFIG_COMPAT
+	case AUTOFS_IOC_SETTIMEOUT32:
+		return autofs4_compat_get_set_timeout(sbi, p);
+#endif
+
+	case AUTOFS_IOC_ASKUMOUNT:
+		return autofs4_ask_umount(filp->f_path.mnt, p);
+
+	/* return a single thing to expire */
+	case AUTOFS_IOC_EXPIRE:
+		return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p);
+	/* same as above, but can send multiple expires through pipe */
+	case AUTOFS_IOC_EXPIRE_MULTI:
+		return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p);
+
+	default:
+		return -ENOSYS;
+	}
+}
+
+static long autofs4_root_ioctl(struct file *filp,
+			       unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+}
+
+#ifdef CONFIG_COMPAT
+static long autofs4_root_compat_ioctl(struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	int ret;
+
+	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
+		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+	else
+		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
+			(unsigned long)compat_ptr(arg));
+
+	return ret;
+}
+#endif
diff --git a/kernel/fs/autofs4/symlink.c b/kernel/fs/autofs4/symlink.c
new file mode 100644
index 000000000..de58cc7b8
--- /dev/null
+++ b/kernel/fs/autofs4/symlink.c
@@ -0,0 +1,28 @@
+/* -*- c -*- --------------------------------------------------------------- *
+ *
+ * linux/fs/autofs/symlink.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include "autofs_i.h"
+
+static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino && !autofs4_oz_mode(sbi))
+		ino->last_used = jiffies;
+	nd_set_link(nd, d_inode(dentry)->i_private);
+	return NULL;
+}
+
+const struct inode_operations autofs4_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= autofs4_follow_link
+};
diff --git a/kernel/fs/autofs4/waitq.c b/kernel/fs/autofs4/waitq.c
new file mode 100644
index 000000000..35b755e79
--- /dev/null
+++ b/kernel/fs/autofs4/waitq.c
@@ -0,0 +1,565 @@
+/* -*- c -*- --------------------------------------------------------------- *
+ *
+ * linux/fs/autofs/waitq.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *  Copyright 2001-2006 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/signal.h>
+#include <linux/file.h>
+#include "autofs_i.h"
+
+/* We make this a static variable rather than a part of the superblock; it
+   is better if we don't reassign numbers easily even across filesystems */
+static autofs_wqt_t autofs4_next_wait_queue = 1;
+
+/* These are the signals we allow interrupting a pending mount */
+#define SHUTDOWN_SIGS	(sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
+
+void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
+{
+	struct autofs_wait_queue *wq, *nwq;
+
+	mutex_lock(&sbi->wq_mutex);
+	if (sbi->catatonic) {
+		mutex_unlock(&sbi->wq_mutex);
+		return;
+	}
+
+	DPRINTK("entering catatonic mode");
+
+	sbi->catatonic = 1;
+	wq = sbi->queues;
+	sbi->queues = NULL;	/* Erase all wait queues */
+	while (wq) {
+		nwq = wq->next;
+		wq->status = -ENOENT; /* Magic is gone - report failure */
+		kfree(wq->name.name);
+		wq->name.name = NULL;
+		wq->wait_ctr--;
+		wake_up_interruptible(&wq->queue);
+		wq = nwq;
+	}
+	fput(sbi->pipe);	/* Close the pipe */
+	sbi->pipe = NULL;
+	sbi->pipefd = -1;
+	mutex_unlock(&sbi->wq_mutex);
+}
+
+static int autofs4_write(struct autofs_sb_info *sbi,
+			 struct file *file, const void *addr, int bytes)
+{
+	unsigned long sigpipe, flags;
+	mm_segment_t fs;
+	const char *data = (const char *)addr;
+	ssize_t wr = 0;
+
+	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
+
+	/* Save pointer to user space and point back to kernel space */
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	mutex_lock(&sbi->pipe_mutex);
+	while (bytes &&
+	       (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) {
+		data += wr;
+		bytes -= wr;
+	}
+	mutex_unlock(&sbi->pipe_mutex);
+
+	set_fs(fs);
+
+	/* Keep the currently executing process from receiving a
+	   SIGPIPE unless it was already supposed to get one */
+	if (wr == -EPIPE && !sigpipe) {
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		sigdelset(&current->pending.signal, SIGPIPE);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	}
+
+	return (bytes > 0);
+}
+	
+static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
+				 struct autofs_wait_queue *wq,
+				 int type)
+{
+	union {
+		struct autofs_packet_hdr hdr;
+		union autofs_packet_union v4_pkt;
+		union autofs_v5_packet_union v5_pkt;
+	} pkt;
+	struct file *pipe = NULL;
+	size_t pktsz;
+
+	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
+		(unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
+
+	memset(&pkt,0,sizeof pkt); /* For security reasons */
+
+	pkt.hdr.proto_version = sbi->version;
+	pkt.hdr.type = type;
+
+	switch (type) {
+	/* Kernel protocol v4 missing and expire packets */
+	case autofs_ptype_missing:
+	{
+		struct autofs_packet_missing *mp = &pkt.v4_pkt.missing;
+
+		pktsz = sizeof(*mp);
+
+		mp->wait_queue_token = wq->wait_queue_token;
+		mp->len = wq->name.len;
+		memcpy(mp->name, wq->name.name, wq->name.len);
+		mp->name[wq->name.len] = '\0';
+		break;
+	}
+	case autofs_ptype_expire_multi:
+	{
+		struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+
+		pktsz = sizeof(*ep);
+
+		ep->wait_queue_token = wq->wait_queue_token;
+		ep->len = wq->name.len;
+		memcpy(ep->name, wq->name.name, wq->name.len);
+		ep->name[wq->name.len] = '\0';
+		break;
+	}
+	/*
+	 * Kernel protocol v5 packet for handling indirect and direct
+	 * mount missing and expire requests
+	 */
+	case autofs_ptype_missing_indirect:
+	case autofs_ptype_expire_indirect:
+	case autofs_ptype_missing_direct:
+	case autofs_ptype_expire_direct:
+	{
+		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+		struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
+
+		pktsz = sizeof(*packet);
+
+		packet->wait_queue_token = wq->wait_queue_token;
+		packet->len = wq->name.len;
+		memcpy(packet->name, wq->name.name, wq->name.len);
+		packet->name[wq->name.len] = '\0';
+		packet->dev = wq->dev;
+		packet->ino = wq->ino;
+		packet->uid = from_kuid_munged(user_ns, wq->uid);
+		packet->gid = from_kgid_munged(user_ns, wq->gid);
+		packet->pid = wq->pid;
+		packet->tgid = wq->tgid;
+		break;
+	}
+	default:
+		printk("autofs4_notify_daemon: bad type %d!\n", type);
+		mutex_unlock(&sbi->wq_mutex);
+		return;
+	}
+
+	pipe = get_file(sbi->pipe);
+
+	mutex_unlock(&sbi->wq_mutex);
+
+	if (autofs4_write(sbi, pipe, &pkt, pktsz))
+		autofs4_catatonic_mode(sbi);
+	fput(pipe);
+}
+
+static int autofs4_getpath(struct autofs_sb_info *sbi,
+			   struct dentry *dentry, char **name)
+{
+	struct dentry *root = sbi->sb->s_root;
+	struct dentry *tmp;
+	char *buf;
+	char *p;
+	int len;
+	unsigned seq;
+
+rename_retry:
+	buf = *name;
+	len = 0;
+
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
+	spin_lock(&sbi->fs_lock);
+	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
+		len += tmp->d_name.len + 1;
+
+	if (!len || --len > NAME_MAX) {
+		spin_unlock(&sbi->fs_lock);
+		rcu_read_unlock();
+		if (read_seqretry(&rename_lock, seq))
+			goto rename_retry;
+		return 0;
+	}
+
+	*(buf + len) = '\0';
+	p = buf + len - dentry->d_name.len;
+	strncpy(p, dentry->d_name.name, dentry->d_name.len);
+
+	for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) {
+		*(--p) = '/';
+		p -= tmp->d_name.len;
+		strncpy(p, tmp->d_name.name, tmp->d_name.len);
+	}
+	spin_unlock(&sbi->fs_lock);
+	rcu_read_unlock();
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
+
+	return len;
+}
+
+static struct autofs_wait_queue *
+autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
+{
+	struct autofs_wait_queue *wq;
+
+	for (wq = sbi->queues; wq; wq = wq->next) {
+		if (wq->name.hash == qstr->hash &&
+		    wq->name.len == qstr->len &&
+		    wq->name.name &&
+			 !memcmp(wq->name.name, qstr->name, qstr->len))
+			break;
+	}
+	return wq;
+}
+
+/*
+ * Check if we have a valid request.
+ * Returns
+ * 1 if the request should continue.
+ *   In this case we can return an autofs_wait_queue entry if one is
+ *   found or NULL to idicate a new wait needs to be created.
+ * 0 or a negative errno if the request shouldn't continue.
+ */
+static int validate_request(struct autofs_wait_queue **wait,
+			    struct autofs_sb_info *sbi,
+			    struct qstr *qstr,
+			    struct dentry*dentry, enum autofs_notify notify)
+{
+	struct autofs_wait_queue *wq;
+	struct autofs_info *ino;
+
+	if (sbi->catatonic)
+		return -ENOENT;
+
+	/* Wait in progress, continue; */
+	wq = autofs4_find_wait(sbi, qstr);
+	if (wq) {
+		*wait = wq;
+		return 1;
+	}
+
+	*wait = NULL;
+
+	/* If we don't yet have any info this is a new request */
+	ino = autofs4_dentry_ino(dentry);
+	if (!ino)
+		return 1;
+
+	/*
+	 * If we've been asked to wait on an existing expire (NFY_NONE)
+	 * but there is no wait in the queue ...
+	 */
+	if (notify == NFY_NONE) {
+		/*
+		 * Either we've betean the pending expire to post it's
+		 * wait or it finished while we waited on the mutex.
+		 * So we need to wait till either, the wait appears
+		 * or the expire finishes.
+		 */
+
+		while (ino->flags & AUTOFS_INF_EXPIRING) {
+			mutex_unlock(&sbi->wq_mutex);
+			schedule_timeout_interruptible(HZ/10);
+			if (mutex_lock_interruptible(&sbi->wq_mutex))
+				return -EINTR;
+
+			if (sbi->catatonic)
+				return -ENOENT;
+
+			wq = autofs4_find_wait(sbi, qstr);
+			if (wq) {
+				*wait = wq;
+				return 1;
+			}
+		}
+
+		/*
+		 * Not ideal but the status has already gone. Of the two
+		 * cases where we wait on NFY_NONE neither depend on the
+		 * return status of the wait.
+		 */
+		return 0;
+	}
+
+	/*
+	 * If we've been asked to trigger a mount and the request
+	 * completed while we waited on the mutex ...
+	 */
+	if (notify == NFY_MOUNT) {
+		struct dentry *new = NULL;
+		int valid = 1;
+
+		/*
+		 * If the dentry was successfully mounted while we slept
+		 * on the wait queue mutex we can return success. If it
+		 * isn't mounted (doesn't have submounts for the case of
+		 * a multi-mount with no mount at it's base) we can
+		 * continue on and create a new request.
+		 */
+		if (!IS_ROOT(dentry)) {
+			if (d_really_is_positive(dentry) && d_unhashed(dentry)) {
+				struct dentry *parent = dentry->d_parent;
+				new = d_lookup(parent, &dentry->d_name);
+				if (new)
+					dentry = new;
+			}
+		}
+		if (have_submounts(dentry))
+			valid = 0;
+
+		if (new)
+			dput(new);
+		return valid;
+	}
+
+	return 1;
+}
+
+int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
+		enum autofs_notify notify)
+{
+	struct autofs_wait_queue *wq;
+	struct qstr qstr;
+	char *name;
+	int status, ret, type;
+	pid_t pid;
+	pid_t tgid;
+
+	/* In catatonic mode, we don't wait for nobody */
+	if (sbi->catatonic)
+		return -ENOENT;
+
+	/*
+	 * Try translating pids to the namespace of the daemon.
+	 *
+	 * Zero means failure: we are in an unrelated pid namespace.
+	 */
+	pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+	tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+	if (pid == 0 || tgid == 0)
+		return -ENOENT;
+
+	if (d_really_is_negative(dentry)) {
+		/*
+		 * A wait for a negative dentry is invalid for certain
+		 * cases. A direct or offset mount "always" has its mount
+		 * point directory created and so the request dentry must
+		 * be positive or the map key doesn't exist. The situation
+		 * is very similar for indirect mounts except only dentrys
+		 * in the root of the autofs file system may be negative.
+		 */
+		if (autofs_type_trigger(sbi->type))
+			return -ENOENT;
+		else if (!IS_ROOT(dentry->d_parent))
+			return -ENOENT;
+	}
+
+	name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	/* If this is a direct mount request create a dummy name */
+	if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
+		qstr.len = sprintf(name, "%p", dentry);
+	else {
+		qstr.len = autofs4_getpath(sbi, dentry, &name);
+		if (!qstr.len) {
+			kfree(name);
+			return -ENOENT;
+		}
+	}
+	qstr.name = name;
+	qstr.hash = full_name_hash(name, qstr.len);
+
+	if (mutex_lock_interruptible(&sbi->wq_mutex)) {
+		kfree(qstr.name);
+		return -EINTR;
+	}
+
+	ret = validate_request(&wq, sbi, &qstr, dentry, notify);
+	if (ret <= 0) {
+		if (ret != -EINTR)
+			mutex_unlock(&sbi->wq_mutex);
+		kfree(qstr.name);
+		return ret;
+	}
+
+	if (!wq) {
+		/* Create a new wait queue */
+		wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
+		if (!wq) {
+			kfree(qstr.name);
+			mutex_unlock(&sbi->wq_mutex);
+			return -ENOMEM;
+		}
+
+		wq->wait_queue_token = autofs4_next_wait_queue;
+		if (++autofs4_next_wait_queue == 0)
+			autofs4_next_wait_queue = 1;
+		wq->next = sbi->queues;
+		sbi->queues = wq;
+		init_waitqueue_head(&wq->queue);
+		memcpy(&wq->name, &qstr, sizeof(struct qstr));
+		wq->dev = autofs4_get_dev(sbi);
+		wq->ino = autofs4_get_ino(sbi);
+		wq->uid = current_uid();
+		wq->gid = current_gid();
+		wq->pid = pid;
+		wq->tgid = tgid;
+		wq->status = -EINTR; /* Status return if interrupted */
+		wq->wait_ctr = 2;
+
+		if (sbi->version < 5) {
+			if (notify == NFY_MOUNT)
+				type = autofs_ptype_missing;
+			else
+				type = autofs_ptype_expire_multi;
+		} else {
+			if (notify == NFY_MOUNT)
+				type = autofs_type_trigger(sbi->type) ?
+					autofs_ptype_missing_direct :
+					 autofs_ptype_missing_indirect;
+			else
+				type = autofs_type_trigger(sbi->type) ?
+					autofs_ptype_expire_direct :
+					autofs_ptype_expire_indirect;
+		}
+
+		DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+			(unsigned long) wq->wait_queue_token, wq->name.len,
+			wq->name.name, notify);
+
+		/* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
+		autofs4_notify_daemon(sbi, wq, type);
+	} else {
+		wq->wait_ctr++;
+		DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
+			(unsigned long) wq->wait_queue_token, wq->name.len,
+			wq->name.name, notify);
+		mutex_unlock(&sbi->wq_mutex);
+		kfree(qstr.name);
+	}
+
+	/*
+	 * wq->name.name is NULL iff the lock is already released
+	 * or the mount has been made catatonic.
+	 */
+	if (wq->name.name) {
+		/* Block all but "shutdown" signals while waiting */
+		sigset_t oldset;
+		unsigned long irqflags;
+
+		spin_lock_irqsave(&current->sighand->siglock, irqflags);
+		oldset = current->blocked;
+		siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+
+		wait_event_interruptible(wq->queue, wq->name.name == NULL);
+
+		spin_lock_irqsave(&current->sighand->siglock, irqflags);
+		current->blocked = oldset;
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+	} else {
+		DPRINTK("skipped sleeping");
+	}
+
+	status = wq->status;
+
+	/*
+	 * For direct and offset mounts we need to track the requester's
+	 * uid and gid in the dentry info struct. This is so it can be
+	 * supplied, on request, by the misc device ioctl interface.
+	 * This is needed during daemon resatart when reconnecting
+	 * to existing, active, autofs mounts. The uid and gid (and
+	 * related string values) may be used for macro substitution
+	 * in autofs mount maps.
+	 */
+	if (!status) {
+		struct autofs_info *ino;
+		struct dentry *de = NULL;
+
+		/* direct mount or browsable map */
+		ino = autofs4_dentry_ino(dentry);
+		if (!ino) {
+			/* If not lookup actual dentry used */
+			de = d_lookup(dentry->d_parent, &dentry->d_name);
+			if (de)
+				ino = autofs4_dentry_ino(de);
+		}
+
+		/* Set mount requester */
+		if (ino) {
+			spin_lock(&sbi->fs_lock);
+			ino->uid = wq->uid;
+			ino->gid = wq->gid;
+			spin_unlock(&sbi->fs_lock);
+		}
+
+		if (de)
+			dput(de);
+	}
+
+	/* Are we the last process to need status? */
+	mutex_lock(&sbi->wq_mutex);
+	if (!--wq->wait_ctr)
+		kfree(wq);
+	mutex_unlock(&sbi->wq_mutex);
+
+	return status;
+}
+
+
+int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
+{
+	struct autofs_wait_queue *wq, **wql;
+
+	mutex_lock(&sbi->wq_mutex);
+	for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
+		if (wq->wait_queue_token == wait_queue_token)
+			break;
+	}
+
+	if (!wq) {
+		mutex_unlock(&sbi->wq_mutex);
+		return -EINVAL;
+	}
+
+	*wql = wq->next;	/* Unlink from chain */
+	kfree(wq->name.name);
+	wq->name.name = NULL;	/* Do not wait on this queue */
+	wq->status = status;
+	wake_up_interruptible(&wq->queue);
+	if (!--wq->wait_ctr)
+		kfree(wq);
+	mutex_unlock(&sbi->wq_mutex);
+
+	return 0;
+}
+
diff --git a/kernel/fs/bad_inode.c b/kernel/fs/bad_inode.c
new file mode 100644
index 000000000..861b1e1c4
--- /dev/null
+++ b/kernel/fs/bad_inode.c
@@ -0,0 +1,214 @@
+/*
+ *  linux/fs/bad_inode.c
+ *
+ *  Copyright (C) 1997, Stephen Tweedie
+ *
+ *  Provide stub functions for unreadable inodes
+ *
+ *  Fabian Frederick : August 2003 - All file operations assigned to EIO
+ */
+
+#include <linux/fs.h>
+#include <linux/export.h>
+#include <linux/stat.h>
+#include <linux/time.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+
+static int bad_file_open(struct inode *inode, struct file *filp)
+{
+	return -EIO;
+}
+
+static const struct file_operations bad_file_ops =
+{
+	.open		= bad_file_open,
+};
+
+static int bad_inode_create (struct inode *dir, struct dentry *dentry,
+		umode_t mode, bool excl)
+{
+	return -EIO;
+}
+
+static struct dentry *bad_inode_lookup(struct inode *dir,
+			struct dentry *dentry, unsigned int flags)
+{
+	return ERR_PTR(-EIO);
+}
+
+static int bad_inode_link (struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	return -EIO;
+}
+
+static int bad_inode_unlink(struct inode *dir, struct dentry *dentry)
+{
+	return -EIO;
+}
+
+static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
+		const char *symname)
+{
+	return -EIO;
+}
+
+static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
+			umode_t mode)
+{
+	return -EIO;
+}
+
+static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
+{
+	return -EIO;
+}
+
+static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
+			umode_t mode, dev_t rdev)
+{
+	return -EIO;
+}
+
+static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry,
+			     unsigned int flags)
+{
+	return -EIO;
+}
+
+static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
+		int buflen)
+{
+	return -EIO;
+}
+
+static int bad_inode_permission(struct inode *inode, int mask)
+{
+	return -EIO;
+}
+
+static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat)
+{
+	return -EIO;
+}
+
+static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
+{
+	return -EIO;
+}
+
+static int bad_inode_setxattr(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags)
+{
+	return -EIO;
+}
+
+static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name,
+			void *buffer, size_t size)
+{
+	return -EIO;
+}
+
+static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer,
+			size_t buffer_size)
+{
+	return -EIO;
+}
+
+static int bad_inode_removexattr(struct dentry *dentry, const char *name)
+{
+	return -EIO;
+}
+
+static const struct inode_operations bad_inode_ops =
+{
+	.create		= bad_inode_create,
+	.lookup		= bad_inode_lookup,
+	.link		= bad_inode_link,
+	.unlink		= bad_inode_unlink,
+	.symlink	= bad_inode_symlink,
+	.mkdir		= bad_inode_mkdir,
+	.rmdir		= bad_inode_rmdir,
+	.mknod		= bad_inode_mknod,
+	.rename2	= bad_inode_rename2,
+	.readlink	= bad_inode_readlink,
+	/* follow_link must be no-op, otherwise unmounting this inode
+	   won't work */
+	/* put_link returns void */
+	/* truncate returns void */
+	.permission	= bad_inode_permission,
+	.getattr	= bad_inode_getattr,
+	.setattr	= bad_inode_setattr,
+	.setxattr	= bad_inode_setxattr,
+	.getxattr	= bad_inode_getxattr,
+	.listxattr	= bad_inode_listxattr,
+	.removexattr	= bad_inode_removexattr,
+};
+
+
+/*
+ * When a filesystem is unable to read an inode due to an I/O error in
+ * its read_inode() function, it can call make_bad_inode() to return a
+ * set of stubs which will return EIO errors as required. 
+ *
+ * We only need to do limited initialisation: all other fields are
+ * preinitialised to zero automatically.
+ */
+ 
+/**
+ *	make_bad_inode - mark an inode bad due to an I/O error
+ *	@inode: Inode to mark bad
+ *
+ *	When an inode cannot be read due to a media or remote network
+ *	failure this function makes the inode "bad" and causes I/O operations
+ *	on it to fail from this point on.
+ */
+ 
+void make_bad_inode(struct inode *inode)
+{
+	remove_inode_hash(inode);
+
+	inode->i_mode = S_IFREG;
+	inode->i_atime = inode->i_mtime = inode->i_ctime =
+		current_fs_time(inode->i_sb);
+	inode->i_op = &bad_inode_ops;	
+	inode->i_fop = &bad_file_ops;	
+}
+EXPORT_SYMBOL(make_bad_inode);
+
+/*
+ * This tests whether an inode has been flagged as bad. The test uses
+ * &bad_inode_ops to cover the case of invalidated inodes as well as
+ * those created by make_bad_inode() above.
+ */
+ 
+/**
+ *	is_bad_inode - is an inode errored
+ *	@inode: inode to test
+ *
+ *	Returns true if the inode in question has been marked as bad.
+ */
+ 
+int is_bad_inode(struct inode *inode)
+{
+	return (inode->i_op == &bad_inode_ops);	
+}
+
+EXPORT_SYMBOL(is_bad_inode);
+
+/**
+ * iget_failed - Mark an under-construction inode as dead and release it
+ * @inode: The inode to discard
+ *
+ * Mark an under-construction inode as dead and release it.
+ */
+void iget_failed(struct inode *inode)
+{
+	make_bad_inode(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+}
+EXPORT_SYMBOL(iget_failed);
diff --git a/kernel/fs/befs/ChangeLog b/kernel/fs/befs/ChangeLog
new file mode 100644
index 000000000..75a461cfa
--- /dev/null
+++ b/kernel/fs/befs/ChangeLog
@@ -0,0 +1,417 @@
+Version 0.92 (2002-03-29)
+==========
+* Minor cleanup. Ran Lindent on the sources.
+
+Version 0.92 (2002-03-27)
+==========
+* Fixed module makefile problem. It was not compiling all the correct 
+    source files!
+* Removed duplicated function definition
+* Fixed potential null pointer dereference when reporting an error
+
+Version 0.91 (2002-03-26)
+==========
+* Oy! Fixed stupid bug that would cause an unresolved symbol error.
+	Thanks to Laszlo Boszormenyi for pointing this out to me.
+
+Version 0.9 (2002-03-14)
+==========
+* Added Sergey S. Kostyliov's patch to eliminate memcpy() overhead
+	from b+tree operations. Changes the befs_read_datastream() interface.
+
+* Segregated the functions that interface directly with the linux  vfs 
+	interface into their own file called linuxvfs.c. [WD]
+
+Version 0.64 (2002-02-07)
+==========
+* Did the string comparison really right this time (btree.c) [WD]
+
+* Fixed up some places where I assumed that a long int could hold
+	a pointer value. (btree.c) [WD]
+
+* Andrew Farnham <andrewfarnham@uq.net.au> pointed out that the module
+	wouldn't work on older (<2.4.10) kernels due to an unresolved symbol.
+	This is bad, since 2.4.9 is still the current RedHat kernel. I added
+	a workaround for this problem (compatibility.h) [WD]
+
+* Sergey S. Kostyliov made befs_find_key() use a binary search to find 
+	keys within btree nodes, rather than the linear search we were using 
+	before. (btree.c) [Sergey S. Kostyliov <rathamahata@php4.ru>]
+
+* Made a debian package of the source for use with kernel-package. [WD]
+
+
+Version 0.63 (2002-01-31)
+==========
+* Fixed bug in befs_find_brun_indirect() that would result in the wrong
+	block being read. It was introduced when adding byteswapping in 
+	0.61. (datastream.c) [WD]
+
+* Fixed a longstanding bug in befs_find_key() that would result in it 
+	finding the first key that is a substring of the string it is searching
+	for. For example, this would cause files in the same directory with 
+	names like file1 and file2 to mysteriously be duplicates of each other 
+	(because they have the same inode number). Many thanks to Pavel Roskin 
+	for reporting this serious bug!!!
+	(btree.c) [WD]
+
+* Added support for long symlinks, after Axel Dorfler explained up how 
+	they work. I had forgotten all about them. (inode.c, symlink.c) [WD]
+
+* Documentation improvements in source. [WD]
+
+* Makefile fix for independent module when CONFIG_MODVERSION is set in 
+	kernel config [Pavel Roskin <proski@gnu.org>]
+
+* Compile warning fix for namei.c. [Sergey S. Kostyliov <rathamahata@php4.ru>]
+
+
+Version 0.62
+==========
+* Fixed makefile for module install [WD]
+
+
+Version 0.61 (2002-01-20)
+==========
+* Made functions in endian.h to do the correct byteswapping, no matter
+	the arch. [WD]
+
+* Abbandoned silly checks for a NULL superblock pointer in debug.c. [WD]
+
+* Misc code cleanups. Also cleanup of this changelog file. [WD]
+
+* Added byteswapping to all metadata reads from disk.
+	Uses the functions from endian.h [WD]
+
+* Remove the typedef of struct super_block to vfs_sb, as it offended
+	certain peoples' aesthetic sense. [WD]
+
+* Ditto with the befs_read_block() interface. [WD]
+ 
+
+Version 0.6 (2001-12-15)
+==========
+* Cleanup of NLS functions (util.c) [WD]
+
+* Make directory lookup/read use the NLS if an iocharset is provided. [WD]
+
+* Fixed stupid bug where specifying the uid or gid mount options as '0' 
+	would result in the filesystem using the on-disk uid and gid. [WD]
+
+* Added mount option to control debug printing. 
+	The option is, simply enough, 'debug'. 
+	(super.c, debug.c) [WD]
+
+* Removed notion of btree handle from btree.c. It was unnecessary, as the
+	linux VFS doesn't allow us to keep any state between calls. Updated 
+	dir.c, namei.c befs_fs.h to account for it. [WD]
+
+* Improved handleing of overflow nodes when listing directories. 
+	Now works for overflow nodes hanging off of nodes other than the root 
+	node. This is the cleaner solution to Brent Miszalaski's problem. [WD]
+
+* Added new debug/warning/error print functions in debug.c. 
+	More flexible. Will soon be controllable at mount time 
+	(see TODO). [WD]
+
+* Rewrote datastream position lookups.
+	(datastream.c) [WD]
+
+* Moved the TODO list to its own file.
+
+
+Version 0.50 (2001-11-13)
+==========
+* Added workaround for mis-understanding of the nature of the b+trees used 
+	in directories. A cleaner solution will come after I've thought about it 
+	for a while. Thanks to Brent Miszalaski for finding and reporting this bug. 
+	(btree.c) [WD]
+
+* Minor cleanups
+
+* Added test for "impossible" condition of empty internal nodes in 
+	seekleaf() in btree.c [WD]
+
+* Implemented the abstracted read_block() in io.c [WD]
+
+* Cleaned up the inode validation in inode.c [WD]
+
+* Anton Altaparmakov figured out (by asking Linus :) ) what was causing the 
+ 	hanging disk io problem. It turns out you need to have the sync_pages 
+	callback defined in your address_space_ops, even if it just uses the 
+	default linux-supplied implementation. Fixed. Works now.
+	(file.c) [WD]
+
+* Anton Altaparmakov and Christoph Hellwig alerted me to the fact that 
+	filesystem code should be using GFP_NOFS instead of GFP_KERNEL as the 
+	priority parameter to kmalloc(). Fixed. 
+	(datastream.c, btree.c super.c inode.c) [WD]
+
+* Anton also told me that the blocksize is not allowed to be larger than 
+	the page size in linux, which is 4k i386. Oops. Added a test for 
+	(blocksize > PAGE_SIZE), and refuse to mount in that case. What this 
+	practically means is that 8k blocksize volumes won't work without a major
+	restructuring of the driver (or an alpha or other 64bit hardware). [WD]
+
+* Cleaned up the befs_count_blocks() function. Much smarter now. 
+	And somewhat smaller too. [WD]
+
+* Made inode allocations use a slab cache 
+	(super.c inode.c) [WD]
+
+* Moved the freeing of the private inode section from put_inode() to 
+	clear_inode(). This fixes a potential free twice type bug. Put_inode() 
+	can be called multiple times for each inode struct. [WD]
+
+* Converted all non vfs-callback functions to use befs_sb_info as the 
+	superblock type, rather than struct super_block. This is for 
+	portablity. [WD]
+
+* Fixed a couple of compile warnings due to use of malloc.h, when slab.h 
+	is the new way. (inode.c, super.c) [WD]
+
+* Fixed erronous includes of linux/befs_fs_i.h and linux/befs_fs_sb.h 
+	in inode.c [WD]
+
+Version 0.45 (2001-10-29)
+==========
+* Added functions to get the private superblock and inode structures from 
+	their enclosing public structures. Switched all references to the 
+	private portions to use them. (many files) [WD]
+
+* Made read_super and read_inode allocate the private portions of those 
+	structures into the generic pointer fields of the public structures 
+	with kmalloc(). put_super and put_inode free them. This allows us not 
+	to have to touch the definitions of the public structures in 
+	include/linux/fs.h. Also, befs_inode_info is huge (because of the 
+	symlink string). (super.c, inode.c, befs_fs.h) [WD]
+
+* Fixed a thinko that was corrupting file reads after the first block_run 
+	is done being read. (datastream.c) [WD]
+
+* Removed fsync() hooks, since a read-only filesystem doesn't need them. 
+	[Christoph Hellwig].
+
+* Fixed befs_readlink() (symlink.c) [Christoph Hellwig].
+
+* Removed all the Read-Write stuff. I'll redo it when it is time to add 
+	write support (various files) [WD].
+
+* Removed prototypes for functions who's definitions have been removed 
+	(befs_fs.h) [WD].
+
+
+Version 0.4 (2001-10-28)
+==========
+* Made it an option to use the old non-pagecache befs_file_read() for 
+	testing purposes. (fs/Config.in)
+
+* Fixed unused variable warnings when compiling without debugging.
+
+* Fixed a bug where the inode and super_block didn't get their blockbits 
+	fields set (inode.c and super.c). 
+
+* Release patch version 11. AKA befs-driver version 0.4.
+
+* Thats right. New versioning scheme. 
+	I've done some serious testing on it now (on my box anyhow), and it 
+	seems stable and not outragously slow. Existing features are more-or-less 
+	correct (see TODO list). But it isn't 1.0 yet. I think 0.4 gives me some 
+	headroom before the big 1.0.
+
+
+2001-10-26
+==========
+* Fixed date format in this file. Was I smoking crack?
+
+* Removed old datastream code from file.c, since it is nolonger used.
+
+* Generic_read_file() is now used to read regular file data. 
+	It doesn't chew up the buffer cache (it does page io instead), and seems 
+	to be about as fast (even though it has to look up each file block 
+	indivdualy). And it knows about doing readahead, which is a major plus. 
+	So it does i/o in much larger chunks. It is the correct linux way. It 
+	uses befs_get_block() by way of befs_readpage() to find the disk offsets 
+	of blocks, which in turn calls befs_fpos2brun() in datastream.c to do 
+	the hard work of finding the disk block number.
+
+* Changed method of checking for a dirty filesystem in befs_read_super 
+	(super.c). Now we check to see if log_start and log_end differ. If so, 
+	the journal needs to be replayed, and the filesystem cannot be mounted.
+
+* Fixed an extra instance of MOD_DEC_USE_COUNT in super.c
+
+* Fixed a problem with reading the superblock on devices with large sector 
+	sizes (such as cdroms) on linux 2.4.10 and up.
+
+2001-10-24
+==========
+* Fix nasty bug in converting block numbers to struct befs_inode_addr. 
+	Subtle, because the old version was only sometimes wrong. 
+	Probably responsible for lots of problems. (inode.c)
+
+* Fix bug with reading an empty directory. (btree.c and dir.c)
+
+* This one looks good. Release patch version 10
+
+2001-10-23
+==========
+* Added btree searching function.
+
+* Use befs_btree_find in befs_lookup (namei.c)
+
+* Additional comments in btree.c
+
+2001-10-22
+==========
+* Added B+tree reading functions (in btree.c). 
+	Made befs_readdir() use them them instead of the cruft in index.c.
+
+2001-09-11
+==========
+* Converted befs_read_file() to use the new datastream code.
+
+* Finally updated the README file.
+
+* Added many comments.
+
+* Posted version 6
+
+* Removed byte-order conversion code. 
+	I have no intention of supporting it, and it was very ugly. 
+	Flow control with #ifdef (ugh). Maybe I'll redo it once 
+	native byteorder works 100%.
+
+2001-09-10
+==========
+* Finished implementing read_datastream()
+
+* made befs_read_brun() more general
+	Supports an offset to start at and a max bytes to read
+	Added a wrapper function to give the old call
+
+2001-09-30
+==========
+* Discovered that the datastream handleing code in file.c is quite deficient 
+	in several respects. For one thing, it doesn't deal with indirect blocks
+
+* Rewrote datastream handleing.
+
+* Created io.c, for io related functions.
+	Previously, the befs_bread() funtions lived in file.c
+	Created the befs_read_brun() function.
+
+
+2001-09-07
+==========
+* Made a function to actually count the number of fs blocks used by a file.
+	And helper functions.
+	(fs/befs/inode.c)
+
+2001-09-05
+==========
+* Fixed a misunderstanding of the inode fields. 
+	This fixed the problmem with wrong file sizes from du and others.
+	The i_blocks field of the inode struct is not the number of blocks for the
+	inode, it is the number of blocks for the file.	Also, i_blksize is not
+	necessarily the size of the inode, although in  practice it works out.
+	Changed to blocksize of filesystem.
+	(fs/befs/inode.c)
+
+* Permanently removed code that had been provisionally ifdefed out of befs_fs.h
+
+* Since we don't support access time, make that field zero, instead of 
+	copying m_time.
+	(fs/befs/inode.c)
+
+* Added sanity check for inode reading
+	Make sure inode we got was the one we asked for. 
+	(fs/befs/inode.c)
+
+* Code cleanup
+	Local pointers to commonly used structures in inode.c.
+	Got rid of abominations befs_iaddr2inode() and befs_inode2ino(). 
+	Replaced with single function iaddr2blockno().
+	(fs/befs/super.c) (fs/befs/inode.c)
+
+2001-09-01
+==========
+* Fixed the problem with statfs where it would always claim the disk was 
+	half full, due to improper understanding of the statfs fields.
+	(fs/befs/super.c)
+
+* Posted verion 4 of the patch
+
+2001-09-01
+==========
+* Changed the macros in befs_fs.h to inline functions.
+	More readable. Typesafe. Better
+	(include/linux/befs_fs.h)
+
+* Moved type definitions from befs_fs.h to a new file, befs_fs_types.h 
+	Because befs_fs_i.h and befs_fs_sb.h were including befs_fs.h for the 
+	typedefs, and they are inlcuded in <linux/fs.h>, which has definitions 
+	that I want the inline functions in befs_fs.h to be able to see. Nasty
+	circularity.
+	(include/linux/befs_fs.h)
+
+2001-08-30
+==========
+* Cleaned up some wording.
+
+* Added additional consitency checks on mount
+	Check block_size agrees with block_shift
+	Check flags == BEFS_CLEAN
+	(fs/befs/super.c)
+
+* Tell the kernel to only mount befs read-only. 
+	By setting the MS_RDONLY flag in befs_read_super().
+	Not that it was possible to write before. But now the kernel won't even try.
+	(fs/befs/super.c)
+
+* Got rid of kernel warning on mount.
+	The kernel doesn't like it if you call set_blocksize() on a device when 
+	you have some of its blocks open. Moved the second set_blocksize() to the
+	very end of befs_read_super(), after we are done with the disk superblock.
+	(fs/befs/super.c)
+	
+* Fixed wrong number of args bug in befs_dump_inode
+	(fs/befs/debug.c)
+
+* Solved lots of type mismatches in kprint()s
+	(everwhere)
+
+2001-08-27
+==========
+* Cleaned up the fs/Config.in entries a bit, now slightly more descriptive.
+
+* BeFS depends on NLS, so I made activating BeFS enable the NLS questions
+	(fs/nls/Config.in)
+
+* Added Configure.help entries for CONFIG_BEFS_FS and CONFIG_DEBUG_BEFS
+	(Documentation/Configure.help)
+
+2001-08-??
+==========
+* Removed superblock locking calls in befs_read_super(). In 2.4, the VFS 
+	hands us a super_block struct that is already locked.
+
+2001-08-13
+==========
+* Will Dyson <will_dyson@pobox.com> is now attempting to maintain this module
+	Makoto Kato <m_kato@ga2.so-net.ne.jp> is original author.Daniel Berlin 
+	also did some work on it (fixing it up for the later 2.3.x kernels, IIRC).
+
+* Fixed compile errors on 2.4.1 kernel (WD)
+	Resolve rejected patches
+	Accommodate changed NLS interface (util.h)
+	Needed to include <linux/slab.h> in most files
+	Makefile changes
+	fs/Config.in changes
+
+* Tried to niceify the code using the ext2 fs as a guide
+	Declare befs_fs_type using the DECLARE_FSTYPE_DEV() macro
+
+* Made it a configure option to turn on debugging (fs/Config.in)
+
+* Compiles on 2.4.7
diff --git a/kernel/fs/befs/Kconfig b/kernel/fs/befs/Kconfig
new file mode 100644
index 000000000..edc5cc2ae
--- /dev/null
+++ b/kernel/fs/befs/Kconfig
@@ -0,0 +1,26 @@
+config BEFS_FS
+	tristate "BeOS file system (BeFS) support (read only)"
+	depends on BLOCK
+	select NLS
+	help
+	  The BeOS File System (BeFS) is the native file system of Be, Inc's
+	  BeOS. Notable features include support for arbitrary attributes
+	  on files and directories, and database-like indices on selected
+	  attributes. (Also note that this driver doesn't make those features
+	  available at this time). It is a 64 bit filesystem, so it supports
+	  extremely large volumes and files.
+
+	  If you use this filesystem, you should also say Y to at least one
+	  of the NLS (native language support) options below.
+
+	  If you don't know what this is about, say N.
+
+	  To compile this as a module, choose M here: the module will be
+	  called befs.
+
+config BEFS_DEBUG
+	bool "Debug BeFS"
+	depends on BEFS_FS
+	help
+	  If you say Y here, you can use the 'debug' mount option to enable
+	  debugging output from the driver.
diff --git a/kernel/fs/befs/Makefile b/kernel/fs/befs/Makefile
new file mode 100644
index 000000000..8b9f66642
--- /dev/null
+++ b/kernel/fs/befs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux BeOS filesystem routines.
+#
+ 
+obj-$(CONFIG_BEFS_FS) += befs.o
+ccflags-$(CONFIG_BEFS_DEBUG)    += -DDEBUG
+befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o
diff --git a/kernel/fs/befs/TODO b/kernel/fs/befs/TODO
new file mode 100644
index 000000000..3250921aa
--- /dev/null
+++ b/kernel/fs/befs/TODO
@@ -0,0 +1,14 @@
+TODO
+==========
+
+* Convert comments to the Kernel-Doc format.
+
+* Befs_fs.h has gotten big and messy. No reason not to break it up into 
+	smaller peices.
+
+* See if Alexander Viro's option parser made it into the kernel tree. 
+	Use that if we can. (include/linux/parser.h)
+
+* See if we really need separate types for on-disk and in-memory 
+	representations of the superblock and inode.
+
diff --git a/kernel/fs/befs/befs.h b/kernel/fs/befs/befs.h
new file mode 100644
index 000000000..1fead8d56
--- /dev/null
+++ b/kernel/fs/befs/befs.h
@@ -0,0 +1,157 @@
+/*
+ * befs.h
+ *
+ * Copyright (C) 2001-2002 Will Dyson <will_dyson@pobox.com>
+ * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp)
+ */
+
+#ifndef _LINUX_BEFS_H
+#define _LINUX_BEFS_H
+
+#include "befs_fs_types.h"
+
+/* used in debug.c */
+#define BEFS_VERSION "0.9.3"
+
+
+typedef u64 befs_blocknr_t;
+/*
+ * BeFS in memory structures
+ */
+
+struct befs_mount_options {
+	kgid_t gid;
+	kuid_t uid;
+	int use_gid;
+	int use_uid;
+	int debug;
+	char *iocharset;
+};
+
+struct befs_sb_info {
+	u32 magic1;
+	u32 block_size;
+	u32 block_shift;
+	int byte_order;
+	befs_off_t num_blocks;
+	befs_off_t used_blocks;
+	u32 inode_size;
+	u32 magic2;
+
+	/* Allocation group information */
+	u32 blocks_per_ag;
+	u32 ag_shift;
+	u32 num_ags;
+
+	/* jornal log entry */
+	befs_block_run log_blocks;
+	befs_off_t log_start;
+	befs_off_t log_end;
+
+	befs_inode_addr root_dir;
+	befs_inode_addr indices;
+	u32 magic3;
+
+	struct befs_mount_options mount_opts;
+	struct nls_table *nls;
+};
+
+struct befs_inode_info {
+	u32 i_flags;
+	u32 i_type;
+
+	befs_inode_addr i_inode_num;
+	befs_inode_addr i_parent;
+	befs_inode_addr i_attribute;
+
+	union {
+		befs_data_stream ds;
+		char symlink[BEFS_SYMLINK_LEN];
+	} i_data;
+
+	struct inode vfs_inode;
+};
+
+enum befs_err {
+	BEFS_OK,
+	BEFS_ERR,
+	BEFS_BAD_INODE,
+	BEFS_BT_END,
+	BEFS_BT_EMPTY,
+	BEFS_BT_MATCH,
+	BEFS_BT_PARMATCH,
+	BEFS_BT_NOT_FOUND
+};
+
+
+/****************************/
+/* debug.c */
+__printf(2, 3)
+void befs_error(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
+void befs_warning(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
+void befs_debug(const struct super_block *sb, const char *fmt, ...);
+
+void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
+void befs_dump_inode(const struct super_block *sb, befs_inode *);
+void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *);
+void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *);
+/****************************/
+
+
+/* Gets a pointer to the private portion of the super_block
+ * structure from the public part
+ */
+static inline struct befs_sb_info *
+BEFS_SB(const struct super_block *super)
+{
+	return (struct befs_sb_info *) super->s_fs_info;
+}
+
+static inline struct befs_inode_info *
+BEFS_I(const struct inode *inode)
+{
+	return list_entry(inode, struct befs_inode_info, vfs_inode);
+}
+
+static inline befs_blocknr_t
+iaddr2blockno(struct super_block *sb, befs_inode_addr * iaddr)
+{
+	return ((iaddr->allocation_group << BEFS_SB(sb)->ag_shift) +
+		iaddr->start);
+}
+
+static inline befs_inode_addr
+blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
+{
+	befs_inode_addr iaddr;
+	iaddr.allocation_group = blockno >> BEFS_SB(sb)->ag_shift;
+	iaddr.start =
+	    blockno - (iaddr.allocation_group << BEFS_SB(sb)->ag_shift);
+	iaddr.len = 1;
+
+	return iaddr;
+}
+
+static inline unsigned int
+befs_iaddrs_per_block(struct super_block *sb)
+{
+	return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr);
+}
+
+static inline int
+befs_iaddr_is_empty(befs_inode_addr * iaddr)
+{
+	return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len);
+}
+
+static inline size_t
+befs_brun_size(struct super_block *sb, befs_block_run run)
+{
+	return BEFS_SB(sb)->block_size * run.len;
+}
+
+#include "endian.h"
+
+#endif				/* _LINUX_BEFS_H */
diff --git a/kernel/fs/befs/befs_fs_types.h b/kernel/fs/befs/befs_fs_types.h
new file mode 100644
index 000000000..eb557d9dc
--- /dev/null
+++ b/kernel/fs/befs/befs_fs_types.h
@@ -0,0 +1,255 @@
+/*
+ * fs/befs/befs_fs_types.h
+ *
+ * Copyright (C) 2001 Will Dyson (will@cs.earlham.edu)
+ *
+ *
+ *
+ * from linux/include/linux/befs_fs.h
+ *
+ * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp)
+ *
+ */
+
+#ifndef _LINUX_BEFS_FS_TYPES
+#define _LINUX_BEFS_FS_TYPES
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#endif /*__KERNEL__*/
+
+#define PACKED __attribute__ ((__packed__))
+
+/*
+ * Max name lengths of BFS
+ */
+
+#define BEFS_NAME_LEN 255
+
+#define BEFS_SYMLINK_LEN 144
+#define BEFS_NUM_DIRECT_BLOCKS 12
+#define B_OS_NAME_LENGTH 32
+
+/* The datastream blocks mapped by the double-indirect
+ * block are always 4 fs blocks long.
+ * This eliminates the need for linear searches among
+ * the potentially huge number of indirect blocks
+ *
+ * Err. Should that be 4 fs blocks or 4k???
+ * It matters on large blocksize volumes
+ */
+#define BEFS_DBLINDIR_BRUN_LEN 4
+
+/*
+ * Flags of superblock
+ */
+
+enum super_flags {
+	BEFS_BYTESEX_BE,
+	BEFS_BYTESEX_LE,
+	BEFS_CLEAN = 0x434c454e,
+	BEFS_DIRTY = 0x44495254,
+	BEFS_SUPER_MAGIC1 = 0x42465331,	/* BFS1 */
+	BEFS_SUPER_MAGIC2 = 0xdd121031,
+	BEFS_SUPER_MAGIC3 = 0x15b6830e,
+};
+
+#define BEFS_BYTEORDER_NATIVE 0x42494745
+#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE)
+#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE)
+
+#define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1
+#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1)
+#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1)
+
+/*
+ * Flags of inode
+ */
+
+#define BEFS_INODE_MAGIC1 0x3bbe0ad9
+
+enum inode_flags {
+	BEFS_INODE_IN_USE = 0x00000001,
+	BEFS_ATTR_INODE = 0x00000004,
+	BEFS_INODE_LOGGED = 0x00000008,
+	BEFS_INODE_DELETED = 0x00000010,
+	BEFS_LONG_SYMLINK = 0x00000040,
+	BEFS_PERMANENT_FLAG = 0x0000ffff,
+	BEFS_INODE_NO_CREATE = 0x00010000,
+	BEFS_INODE_WAS_WRITTEN = 0x00020000,
+	BEFS_NO_TRANSACTION = 0x00040000,
+};
+/* 
+ * On-Disk datastructures of BeFS
+ */
+
+typedef u64 __bitwise fs64;
+typedef u32 __bitwise fs32;
+typedef u16 __bitwise fs16;
+
+typedef u64 befs_off_t;
+typedef fs64 befs_time_t;
+
+/* Block runs */
+typedef struct {
+	fs32 allocation_group;
+	fs16 start;
+	fs16 len;
+} PACKED befs_disk_block_run;
+
+typedef struct {
+	u32 allocation_group;
+	u16 start;
+	u16 len;
+} PACKED befs_block_run;
+
+typedef befs_disk_block_run befs_disk_inode_addr;
+typedef befs_block_run befs_inode_addr;
+
+/*
+ * The Superblock Structure
+ */
+typedef struct {
+	char name[B_OS_NAME_LENGTH];
+	fs32 magic1;
+	fs32 fs_byte_order;
+
+	fs32 block_size;
+	fs32 block_shift;
+
+	fs64 num_blocks;
+	fs64 used_blocks;
+
+	fs32 inode_size;
+
+	fs32 magic2;
+	fs32 blocks_per_ag;
+	fs32 ag_shift;
+	fs32 num_ags;
+
+	fs32 flags;
+
+	befs_disk_block_run log_blocks;
+	fs64 log_start;
+	fs64 log_end;
+
+	fs32 magic3;
+	befs_disk_inode_addr root_dir;
+	befs_disk_inode_addr indices;
+
+} PACKED befs_super_block;
+
+/* 
+ * Note: the indirect and dbl_indir block_runs may
+ * be longer than one block!
+ */
+typedef struct {
+	befs_disk_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
+	fs64 max_direct_range;
+	befs_disk_block_run indirect;
+	fs64 max_indirect_range;
+	befs_disk_block_run double_indirect;
+	fs64 max_double_indirect_range;
+	fs64 size;
+} PACKED befs_disk_data_stream;
+
+typedef struct {
+	befs_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
+	befs_off_t max_direct_range;
+	befs_block_run indirect;
+	befs_off_t max_indirect_range;
+	befs_block_run double_indirect;
+	befs_off_t max_double_indirect_range;
+	befs_off_t size;
+} PACKED befs_data_stream;
+
+/* Attribute */
+typedef struct {
+	fs32 type;
+	fs16 name_size;
+	fs16 data_size;
+	char name[1];
+} PACKED befs_small_data;
+
+/* Inode structure */
+typedef struct {
+	fs32 magic1;
+	befs_disk_inode_addr inode_num;
+	fs32 uid;
+	fs32 gid;
+	fs32 mode;
+	fs32 flags;
+	befs_time_t create_time;
+	befs_time_t last_modified_time;
+	befs_disk_inode_addr parent;
+	befs_disk_inode_addr attributes;
+	fs32 type;
+
+	fs32 inode_size;
+	fs32 etc;		/* not use */
+
+	union {
+		befs_disk_data_stream datastream;
+		char symlink[BEFS_SYMLINK_LEN];
+	} data;
+
+	fs32 pad[4];		/* not use */
+	befs_small_data small_data[1];
+} PACKED befs_inode;
+
+/*
+ * B+tree superblock
+ */
+
+#define BEFS_BTREE_MAGIC 0x69f6c2e8
+
+enum btree_types {
+	BTREE_STRING_TYPE = 0,
+	BTREE_INT32_TYPE = 1,
+	BTREE_UINT32_TYPE = 2,
+	BTREE_INT64_TYPE = 3,
+	BTREE_UINT64_TYPE = 4,
+	BTREE_FLOAT_TYPE = 5,
+	BTREE_DOUBLE_TYPE = 6
+};
+
+typedef struct {
+	fs32 magic;
+	fs32 node_size;
+	fs32 max_depth;
+	fs32 data_type;
+	fs64 root_node_ptr;
+	fs64 free_node_ptr;
+	fs64 max_size;
+} PACKED befs_disk_btree_super;
+
+typedef struct {
+	u32 magic;
+	u32 node_size;
+	u32 max_depth;
+	u32 data_type;
+	befs_off_t root_node_ptr;
+	befs_off_t free_node_ptr;
+	befs_off_t max_size;
+} PACKED befs_btree_super;
+
+/*
+ * Header structure of each btree node
+ */
+typedef struct {
+	fs64 left;
+	fs64 right;
+	fs64 overflow;
+	fs16 all_key_count;
+	fs16 all_key_length;
+} PACKED befs_btree_nodehead;
+
+typedef struct {
+	befs_off_t left;
+	befs_off_t right;
+	befs_off_t overflow;
+	u16 all_key_count;
+	u16 all_key_length;
+} PACKED befs_host_btree_nodehead;
+
+#endif				/* _LINUX_BEFS_FS_TYPES */
diff --git a/kernel/fs/befs/btree.c b/kernel/fs/befs/btree.c
new file mode 100644
index 000000000..0826e91da
--- /dev/null
+++ b/kernel/fs/befs/btree.c
@@ -0,0 +1,793 @@
+/*
+ * linux/fs/befs/btree.c
+ *
+ * Copyright (C) 2001-2002 Will Dyson <will_dyson@pobox.com>
+ *
+ * Licensed under the GNU GPL. See the file COPYING for details.
+ *
+ * 2002-02-05: Sergey S. Kostyliov added binary search within
+ * 		btree nodes.
+ *
+ * Many thanks to:
+ *
+ * Dominic Giampaolo, author of "Practical File System
+ * Design with the Be File System", for such a helpful book.
+ * 
+ * Marcus J. Ranum, author of the b+tree package in 
+ * comp.sources.misc volume 10. This code is not copied from that
+ * work, but it is partially based on it.
+ *
+ * Makoto Kato, author of the original BeFS for linux filesystem
+ * driver.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+
+#include "befs.h"
+#include "btree.h"
+#include "datastream.h"
+
+/*
+ * The btree functions in this file are built on top of the
+ * datastream.c interface, which is in turn built on top of the
+ * io.c interface.
+ */
+
+/* Befs B+tree structure:
+ * 
+ * The first thing in the tree is the tree superblock. It tells you
+ * all kinds of useful things about the tree, like where the rootnode
+ * is located, and the size of the nodes (always 1024 with current version
+ * of BeOS).
+ *
+ * The rest of the tree consists of a series of nodes. Nodes contain a header
+ * (struct befs_btree_nodehead), the packed key data, an array of shorts 
+ * containing the ending offsets for each of the keys, and an array of
+ * befs_off_t values. In interior nodes, the keys are the ending keys for 
+ * the childnode they point to, and the values are offsets into the 
+ * datastream containing the tree. 
+ */
+
+/* Note:
+ * 
+ * The book states 2 confusing things about befs b+trees. First, 
+ * it states that the overflow field of node headers is used by internal nodes
+ * to point to another node that "effectively continues this one". Here is what
+ * I believe that means. Each key in internal nodes points to another node that
+ * contains key values less than itself. Inspection reveals that the last key 
+ * in the internal node is not the last key in the index. Keys that are 
+ * greater than the last key in the internal node go into the overflow node. 
+ * I imagine there is a performance reason for this.
+ *
+ * Second, it states that the header of a btree node is sufficient to 
+ * distinguish internal nodes from leaf nodes. Without saying exactly how. 
+ * After figuring out the first, it becomes obvious that internal nodes have
+ * overflow nodes and leafnodes do not.
+ */
+
+/* 
+ * Currently, this code is only good for directory B+trees.
+ * In order to be used for other BFS indexes, it needs to be extended to handle
+ * duplicate keys and non-string keytypes (int32, int64, float, double).
+ */
+
+/*
+ * In memory structure of each btree node
+ */
+struct befs_btree_node {
+	befs_host_btree_nodehead head;	/* head of node converted to cpu byteorder */
+	struct buffer_head *bh;
+	befs_btree_nodehead *od_node;	/* on disk node */
+};
+
+/* local constants */
+static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
+
+/* local functions */
+static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
+			       befs_btree_super * bt_super,
+			       struct befs_btree_node *this_node,
+			       befs_off_t * node_off);
+
+static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
+			      befs_btree_super * sup);
+
+static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
+			     struct befs_btree_node *node,
+			     befs_off_t node_off);
+
+static int befs_leafnode(struct befs_btree_node *node);
+
+static fs16 *befs_bt_keylen_index(struct befs_btree_node *node);
+
+static fs64 *befs_bt_valarray(struct befs_btree_node *node);
+
+static char *befs_bt_keydata(struct befs_btree_node *node);
+
+static int befs_find_key(struct super_block *sb,
+			 struct befs_btree_node *node,
+			 const char *findkey, befs_off_t * value);
+
+static char *befs_bt_get_key(struct super_block *sb,
+			     struct befs_btree_node *node,
+			     int index, u16 * keylen);
+
+static int befs_compare_strings(const void *key1, int keylen1,
+				const void *key2, int keylen2);
+
+/**
+ * befs_bt_read_super - read in btree superblock convert to cpu byteorder
+ * @sb: Filesystem superblock
+ * @ds: Datastream to read from
+ * @sup: Buffer in which to place the btree superblock
+ *
+ * Calls befs_read_datastream to read in the btree superblock and
+ * makes sure it is in cpu byteorder, byteswapping if necessary.
+ *
+ * On success, returns BEFS_OK and *@sup contains the btree superblock,
+ * in cpu byte order.
+ *
+ * On failure, BEFS_ERR is returned.
+ */
+static int
+befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
+		   befs_btree_super * sup)
+{
+	struct buffer_head *bh = NULL;
+	befs_disk_btree_super *od_sup = NULL;
+
+	befs_debug(sb, "---> %s", __func__);
+
+	bh = befs_read_datastream(sb, ds, 0, NULL);
+
+	if (!bh) {
+		befs_error(sb, "Couldn't read index header.");
+		goto error;
+	}
+	od_sup = (befs_disk_btree_super *) bh->b_data;
+	befs_dump_index_entry(sb, od_sup);
+
+	sup->magic = fs32_to_cpu(sb, od_sup->magic);
+	sup->node_size = fs32_to_cpu(sb, od_sup->node_size);
+	sup->max_depth = fs32_to_cpu(sb, od_sup->max_depth);
+	sup->data_type = fs32_to_cpu(sb, od_sup->data_type);
+	sup->root_node_ptr = fs64_to_cpu(sb, od_sup->root_node_ptr);
+	sup->free_node_ptr = fs64_to_cpu(sb, od_sup->free_node_ptr);
+	sup->max_size = fs64_to_cpu(sb, od_sup->max_size);
+
+	brelse(bh);
+	if (sup->magic != BEFS_BTREE_MAGIC) {
+		befs_error(sb, "Index header has bad magic.");
+		goto error;
+	}
+
+	befs_debug(sb, "<--- %s", __func__);
+	return BEFS_OK;
+
+      error:
+	befs_debug(sb, "<--- %s ERROR", __func__);
+	return BEFS_ERR;
+}
+
+/**
+ * befs_bt_read_node - read in btree node and convert to cpu byteorder
+ * @sb: Filesystem superblock
+ * @ds: Datastream to read from
+ * @node: Buffer in which to place the btree node
+ * @node_off: Starting offset (in bytes) of the node in @ds
+ *
+ * Calls befs_read_datastream to read in the indicated btree node and
+ * makes sure its header fields are in cpu byteorder, byteswapping if
+ * necessary.
+ * Note: node->bh must be NULL when this function called first
+ * time. Don't forget brelse(node->bh) after last call.
+ *
+ * On success, returns BEFS_OK and *@node contains the btree node that
+ * starts at @node_off, with the node->head fields in cpu byte order.
+ *
+ * On failure, BEFS_ERR is returned.
+ */
+
+static int
+befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
+		  struct befs_btree_node *node, befs_off_t node_off)
+{
+	uint off = 0;
+
+	befs_debug(sb, "---> %s", __func__);
+
+	if (node->bh)
+		brelse(node->bh);
+
+	node->bh = befs_read_datastream(sb, ds, node_off, &off);
+	if (!node->bh) {
+		befs_error(sb, "%s failed to read "
+			   "node at %llu", __func__, node_off);
+		befs_debug(sb, "<--- %s ERROR", __func__);
+
+		return BEFS_ERR;
+	}
+	node->od_node =
+	    (befs_btree_nodehead *) ((void *) node->bh->b_data + off);
+
+	befs_dump_index_node(sb, node->od_node);
+
+	node->head.left = fs64_to_cpu(sb, node->od_node->left);
+	node->head.right = fs64_to_cpu(sb, node->od_node->right);
+	node->head.overflow = fs64_to_cpu(sb, node->od_node->overflow);
+	node->head.all_key_count =
+	    fs16_to_cpu(sb, node->od_node->all_key_count);
+	node->head.all_key_length =
+	    fs16_to_cpu(sb, node->od_node->all_key_length);
+
+	befs_debug(sb, "<--- %s", __func__);
+	return BEFS_OK;
+}
+
+/**
+ * befs_btree_find - Find a key in a befs B+tree
+ * @sb: Filesystem superblock
+ * @ds: Datastream containing btree
+ * @key: Key string to lookup in btree
+ * @value: Value stored with @key
+ *
+ * On success, returns BEFS_OK and sets *@value to the value stored
+ * with @key (usually the disk block number of an inode).
+ *
+ * On failure, returns BEFS_ERR or BEFS_BT_NOT_FOUND.
+ * 
+ * Algorithm: 
+ *   Read the superblock and rootnode of the b+tree.
+ *   Drill down through the interior nodes using befs_find_key().
+ *   Once at the correct leaf node, use befs_find_key() again to get the
+ *   actuall value stored with the key.
+ */
+int
+befs_btree_find(struct super_block *sb, befs_data_stream * ds,
+		const char *key, befs_off_t * value)
+{
+	struct befs_btree_node *this_node = NULL;
+	befs_btree_super bt_super;
+	befs_off_t node_off;
+	int res;
+
+	befs_debug(sb, "---> %s Key: %s", __func__, key);
+
+	if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
+		befs_error(sb,
+			   "befs_btree_find() failed to read index superblock");
+		goto error;
+	}
+
+	this_node = kmalloc(sizeof(struct befs_btree_node),
+						GFP_NOFS);
+	if (!this_node) {
+		befs_error(sb, "befs_btree_find() failed to allocate %zu "
+			   "bytes of memory", sizeof(struct befs_btree_node));
+		goto error;
+	}
+
+	this_node->bh = NULL;
+
+	/* read in root node */
+	node_off = bt_super.root_node_ptr;
+	if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
+		befs_error(sb, "befs_btree_find() failed to read "
+			   "node at %llu", node_off);
+		goto error_alloc;
+	}
+
+	while (!befs_leafnode(this_node)) {
+		res = befs_find_key(sb, this_node, key, &node_off);
+		if (res == BEFS_BT_NOT_FOUND)
+			node_off = this_node->head.overflow;
+		/* if no match, go to overflow node */
+		if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
+			befs_error(sb, "befs_btree_find() failed to read "
+				   "node at %llu", node_off);
+			goto error_alloc;
+		}
+	}
+
+	/* at the correct leaf node now */
+
+	res = befs_find_key(sb, this_node, key, value);
+
+	brelse(this_node->bh);
+	kfree(this_node);
+
+	if (res != BEFS_BT_MATCH) {
+		befs_debug(sb, "<--- %s Key %s not found", __func__, key);
+		*value = 0;
+		return BEFS_BT_NOT_FOUND;
+	}
+	befs_debug(sb, "<--- %s Found key %s, value %llu", __func__,
+		   key, *value);
+	return BEFS_OK;
+
+      error_alloc:
+	kfree(this_node);
+      error:
+	*value = 0;
+	befs_debug(sb, "<--- %s ERROR", __func__);
+	return BEFS_ERR;
+}
+
+/**
+ * befs_find_key - Search for a key within a node
+ * @sb: Filesystem superblock
+ * @node: Node to find the key within
+ * @findkey: Keystring to search for
+ * @value: If key is found, the value stored with the key is put here
+ *
+ * finds exact match if one exists, and returns BEFS_BT_MATCH
+ * If no exact match, finds first key in node that is greater
+ * (alphabetically) than the search key and returns BEFS_BT_PARMATCH
+ * (for partial match, I guess). Can you think of something better to
+ * call it?
+ *
+ * If no key was a match or greater than the search key, return
+ * BEFS_BT_NOT_FOUND.
+ *
+ * Use binary search instead of a linear.
+ */
+static int
+befs_find_key(struct super_block *sb, struct befs_btree_node *node,
+	      const char *findkey, befs_off_t * value)
+{
+	int first, last, mid;
+	int eq;
+	u16 keylen;
+	int findkey_len;
+	char *thiskey;
+	fs64 *valarray;
+
+	befs_debug(sb, "---> %s %s", __func__, findkey);
+
+	*value = 0;
+
+	findkey_len = strlen(findkey);
+
+	/* if node can not contain key, just skeep this node */
+	last = node->head.all_key_count - 1;
+	thiskey = befs_bt_get_key(sb, node, last, &keylen);
+
+	eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len);
+	if (eq < 0) {
+		befs_debug(sb, "<--- %s %s not found", __func__, findkey);
+		return BEFS_BT_NOT_FOUND;
+	}
+
+	valarray = befs_bt_valarray(node);
+
+	/* simple binary search */
+	first = 0;
+	mid = 0;
+	while (last >= first) {
+		mid = (last + first) / 2;
+		befs_debug(sb, "first: %d, last: %d, mid: %d", first, last,
+			   mid);
+		thiskey = befs_bt_get_key(sb, node, mid, &keylen);
+		eq = befs_compare_strings(thiskey, keylen, findkey,
+					  findkey_len);
+
+		if (eq == 0) {
+			befs_debug(sb, "<--- %s found %s at %d",
+				   __func__, thiskey, mid);
+
+			*value = fs64_to_cpu(sb, valarray[mid]);
+			return BEFS_BT_MATCH;
+		}
+		if (eq > 0)
+			last = mid - 1;
+		else
+			first = mid + 1;
+	}
+	if (eq < 0)
+		*value = fs64_to_cpu(sb, valarray[mid + 1]);
+	else
+		*value = fs64_to_cpu(sb, valarray[mid]);
+	befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid);
+	return BEFS_BT_PARMATCH;
+}
+
+/**
+ * befs_btree_read - Traverse leafnodes of a btree
+ * @sb: Filesystem superblock
+ * @ds: Datastream containing btree
+ * @key_no: Key number (alphabetical order) of key to read
+ * @bufsize: Size of the buffer to return key in
+ * @keybuf: Pointer to a buffer to put the key in
+ * @keysize: Length of the returned key
+ * @value: Value stored with the returned key
+ *
+ * Heres how it works: Key_no is the index of the key/value pair to 
+ * return in keybuf/value.
+ * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is 
+ * the number of characters in the key (just a convenience).
+ *
+ * Algorithm:
+ *   Get the first leafnode of the tree. See if the requested key is in that
+ *   node. If not, follow the node->right link to the next leafnode. Repeat 
+ *   until the (key_no)th key is found or the tree is out of keys.
+ */
+int
+befs_btree_read(struct super_block *sb, befs_data_stream * ds,
+		loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize,
+		befs_off_t * value)
+{
+	struct befs_btree_node *this_node;
+	befs_btree_super bt_super;
+	befs_off_t node_off = 0;
+	int cur_key;
+	fs64 *valarray;
+	char *keystart;
+	u16 keylen;
+	int res;
+
+	uint key_sum = 0;
+
+	befs_debug(sb, "---> %s", __func__);
+
+	if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
+		befs_error(sb,
+			   "befs_btree_read() failed to read index superblock");
+		goto error;
+	}
+
+	this_node = kmalloc(sizeof(struct befs_btree_node), GFP_NOFS);
+	if (this_node == NULL) {
+		befs_error(sb, "befs_btree_read() failed to allocate %zu "
+			   "bytes of memory", sizeof(struct befs_btree_node));
+		goto error;
+	}
+
+	node_off = bt_super.root_node_ptr;
+	this_node->bh = NULL;
+
+	/* seeks down to first leafnode, reads it into this_node */
+	res = befs_btree_seekleaf(sb, ds, &bt_super, this_node, &node_off);
+	if (res == BEFS_BT_EMPTY) {
+		brelse(this_node->bh);
+		kfree(this_node);
+		*value = 0;
+		*keysize = 0;
+		befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
+		return BEFS_BT_EMPTY;
+	} else if (res == BEFS_ERR) {
+		goto error_alloc;
+	}
+
+	/* find the leaf node containing the key_no key */
+
+	while (key_sum + this_node->head.all_key_count <= key_no) {
+
+		/* no more nodes to look in: key_no is too large */
+		if (this_node->head.right == befs_bt_inval) {
+			*keysize = 0;
+			*value = 0;
+			befs_debug(sb,
+				   "<--- %s END of keys at %llu", __func__,
+				   (unsigned long long)
+				   key_sum + this_node->head.all_key_count);
+			brelse(this_node->bh);
+			kfree(this_node);
+			return BEFS_BT_END;
+		}
+
+		key_sum += this_node->head.all_key_count;
+		node_off = this_node->head.right;
+
+		if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
+			befs_error(sb, "%s failed to read node at %llu",
+				  __func__, (unsigned long long)node_off);
+			goto error_alloc;
+		}
+	}
+
+	/* how many keys into this_node is key_no */
+	cur_key = key_no - key_sum;
+
+	/* get pointers to datastructures within the node body */
+	valarray = befs_bt_valarray(this_node);
+
+	keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen);
+
+	befs_debug(sb, "Read [%llu,%d]: keysize %d",
+		   (long long unsigned int)node_off, (int)cur_key,
+		   (int)keylen);
+
+	if (bufsize < keylen + 1) {
+		befs_error(sb, "%s keybuf too small (%zu) "
+			   "for key of size %d", __func__, bufsize, keylen);
+		brelse(this_node->bh);
+		goto error_alloc;
+	}
+
+	strlcpy(keybuf, keystart, keylen + 1);
+	*value = fs64_to_cpu(sb, valarray[cur_key]);
+	*keysize = keylen;
+
+	befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off,
+		   cur_key, keylen, keybuf, *value);
+
+	brelse(this_node->bh);
+	kfree(this_node);
+
+	befs_debug(sb, "<--- %s", __func__);
+
+	return BEFS_OK;
+
+      error_alloc:
+	kfree(this_node);
+
+      error:
+	*keysize = 0;
+	*value = 0;
+	befs_debug(sb, "<--- %s ERROR", __func__);
+	return BEFS_ERR;
+}
+
+/**
+ * befs_btree_seekleaf - Find the first leafnode in the btree
+ * @sb: Filesystem superblock
+ * @ds: Datastream containing btree
+ * @bt_super: Pointer to the superblock of the btree
+ * @this_node: Buffer to return the leafnode in
+ * @node_off: Pointer to offset of current node within datastream. Modified
+ * 		by the function.
+ *
+ *
+ * Helper function for btree traverse. Moves the current position to the 
+ * start of the first leaf node.
+ *
+ * Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY.
+ */
+static int
+befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
+		    befs_btree_super *bt_super,
+		    struct befs_btree_node *this_node,
+		    befs_off_t * node_off)
+{
+
+	befs_debug(sb, "---> %s", __func__);
+
+	if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
+		befs_error(sb, "%s failed to read "
+			   "node at %llu", __func__, *node_off);
+		goto error;
+	}
+	befs_debug(sb, "Seekleaf to root node %llu", *node_off);
+
+	if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) {
+		befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
+		return BEFS_BT_EMPTY;
+	}
+
+	while (!befs_leafnode(this_node)) {
+
+		if (this_node->head.all_key_count == 0) {
+			befs_debug(sb, "%s encountered "
+				   "an empty interior node: %llu. Using Overflow "
+				   "node: %llu", __func__, *node_off,
+				   this_node->head.overflow);
+			*node_off = this_node->head.overflow;
+		} else {
+			fs64 *valarray = befs_bt_valarray(this_node);
+			*node_off = fs64_to_cpu(sb, valarray[0]);
+		}
+		if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
+			befs_error(sb, "%s failed to read "
+				   "node at %llu", __func__, *node_off);
+			goto error;
+		}
+
+		befs_debug(sb, "Seekleaf to child node %llu", *node_off);
+	}
+	befs_debug(sb, "Node %llu is a leaf node", *node_off);
+
+	return BEFS_OK;
+
+      error:
+	befs_debug(sb, "<--- %s ERROR", __func__);
+	return BEFS_ERR;
+}
+
+/**
+ * befs_leafnode - Determine if the btree node is a leaf node or an 
+ * interior node
+ * @node: Pointer to node structure to test
+ * 
+ * Return 1 if leaf, 0 if interior
+ */
+static int
+befs_leafnode(struct befs_btree_node *node)
+{
+	/* all interior nodes (and only interior nodes) have an overflow node */
+	if (node->head.overflow == befs_bt_inval)
+		return 1;
+	else
+		return 0;
+}
+
+/**
+ * befs_bt_keylen_index - Finds start of keylen index in a node
+ * @node: Pointer to the node structure to find the keylen index within
+ *
+ * Returns a pointer to the start of the key length index array
+ * of the B+tree node *@node
+ *
+ * "The length of all the keys in the node is added to the size of the
+ * header and then rounded up to a multiple of four to get the beginning
+ * of the key length index" (p.88, practical filesystem design).
+ *
+ * Except that rounding up to 8 works, and rounding up to 4 doesn't.
+ */
+static fs16 *
+befs_bt_keylen_index(struct befs_btree_node *node)
+{
+	const int keylen_align = 8;
+	unsigned long int off =
+	    (sizeof (befs_btree_nodehead) + node->head.all_key_length);
+	ulong tmp = off % keylen_align;
+
+	if (tmp)
+		off += keylen_align - tmp;
+
+	return (fs16 *) ((void *) node->od_node + off);
+}
+
+/**
+ * befs_bt_valarray - Finds the start of value array in a node
+ * @node: Pointer to the node structure to find the value array within
+ *
+ * Returns a pointer to the start of the value array
+ * of the node pointed to by the node header
+ */
+static fs64 *
+befs_bt_valarray(struct befs_btree_node *node)
+{
+	void *keylen_index_start = (void *) befs_bt_keylen_index(node);
+	size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
+
+	return (fs64 *) (keylen_index_start + keylen_index_size);
+}
+
+/**
+ * befs_bt_keydata - Finds start of keydata array in a node
+ * @node: Pointer to the node structure to find the keydata array within
+ *
+ * Returns a pointer to the start of the keydata array
+ * of the node pointed to by the node header 
+ */
+static char *
+befs_bt_keydata(struct befs_btree_node *node)
+{
+	return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead));
+}
+
+/**
+ * befs_bt_get_key - returns a pointer to the start of a key
+ * @sb: filesystem superblock
+ * @node: node in which to look for the key
+ * @index: the index of the key to get
+ * @keylen: modified to be the length of the key at @index
+ *
+ * Returns a valid pointer into @node on success.
+ * Returns NULL on failure (bad input) and sets *@keylen = 0
+ */
+static char *
+befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node,
+		int index, u16 * keylen)
+{
+	int prev_key_end;
+	char *keystart;
+	fs16 *keylen_index;
+
+	if (index < 0 || index > node->head.all_key_count) {
+		*keylen = 0;
+		return NULL;
+	}
+
+	keystart = befs_bt_keydata(node);
+	keylen_index = befs_bt_keylen_index(node);
+
+	if (index == 0)
+		prev_key_end = 0;
+	else
+		prev_key_end = fs16_to_cpu(sb, keylen_index[index - 1]);
+
+	*keylen = fs16_to_cpu(sb, keylen_index[index]) - prev_key_end;
+
+	return keystart + prev_key_end;
+}
+
+/**
+ * befs_compare_strings - compare two strings
+ * @key1: pointer to the first key to be compared 
+ * @keylen1: length in bytes of key1
+ * @key2: pointer to the second key to be compared
+ * @keylen2: length in bytes of key2
+ *
+ * Returns 0 if @key1 and @key2 are equal.
+ * Returns >0 if @key1 is greater.
+ * Returns <0 if @key2 is greater..
+ */
+static int
+befs_compare_strings(const void *key1, int keylen1,
+		     const void *key2, int keylen2)
+{
+	int len = min_t(int, keylen1, keylen2);
+	int result = strncmp(key1, key2, len);
+	if (result == 0)
+		result = keylen1 - keylen2;
+	return result;
+}
+
+/* These will be used for non-string keyed btrees */
+#if 0
+static int
+btree_compare_int32(cont void *key1, int keylen1, const void *key2, int keylen2)
+{
+	return *(int32_t *) key1 - *(int32_t *) key2;
+}
+
+static int
+btree_compare_uint32(cont void *key1, int keylen1,
+		     const void *key2, int keylen2)
+{
+	if (*(u_int32_t *) key1 == *(u_int32_t *) key2)
+		return 0;
+	else if (*(u_int32_t *) key1 > *(u_int32_t *) key2)
+		return 1;
+
+	return -1;
+}
+static int
+btree_compare_int64(cont void *key1, int keylen1, const void *key2, int keylen2)
+{
+	if (*(int64_t *) key1 == *(int64_t *) key2)
+		return 0;
+	else if (*(int64_t *) key1 > *(int64_t *) key2)
+		return 1;
+
+	return -1;
+}
+
+static int
+btree_compare_uint64(cont void *key1, int keylen1,
+		     const void *key2, int keylen2)
+{
+	if (*(u_int64_t *) key1 == *(u_int64_t *) key2)
+		return 0;
+	else if (*(u_int64_t *) key1 > *(u_int64_t *) key2)
+		return 1;
+
+	return -1;
+}
+
+static int
+btree_compare_float(cont void *key1, int keylen1, const void *key2, int keylen2)
+{
+	float result = *(float *) key1 - *(float *) key2;
+	if (result == 0.0f)
+		return 0;
+
+	return (result < 0.0f) ? -1 : 1;
+}
+
+static int
+btree_compare_double(cont void *key1, int keylen1,
+		     const void *key2, int keylen2)
+{
+	double result = *(double *) key1 - *(double *) key2;
+	if (result == 0.0)
+		return 0;
+
+	return (result < 0.0) ? -1 : 1;
+}
+#endif				//0
diff --git a/kernel/fs/befs/btree.h b/kernel/fs/befs/btree.h
new file mode 100644
index 000000000..92e781e5f
--- /dev/null
+++ b/kernel/fs/befs/btree.h
@@ -0,0 +1,13 @@
+/*
+ * btree.h
+ * 
+ */
+
+
+int befs_btree_find(struct super_block *sb, befs_data_stream * ds,
+		    const char *key, befs_off_t * value);
+
+int befs_btree_read(struct super_block *sb, befs_data_stream * ds,
+		    loff_t key_no, size_t bufsize, char *keybuf,
+		    size_t * keysize, befs_off_t * value);
+
diff --git a/kernel/fs/befs/datastream.c b/kernel/fs/befs/datastream.c
new file mode 100644
index 000000000..ebd507186
--- /dev/null
+++ b/kernel/fs/befs/datastream.c
@@ -0,0 +1,529 @@
+/*
+ * linux/fs/befs/datastream.c
+ *
+ * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com>
+ *
+ * Based on portions of file.c by Makoto Kato <m_kato@ga2.so-net.ne.jp>
+ *
+ * Many thanks to Dominic Giampaolo, author of "Practical File System
+ * Design with the Be File System", for such a helpful book.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+
+#include "befs.h"
+#include "datastream.h"
+#include "io.h"
+
+const befs_inode_addr BAD_IADDR = { 0, 0, 0 };
+
+static int befs_find_brun_direct(struct super_block *sb,
+				 befs_data_stream * data,
+				 befs_blocknr_t blockno, befs_block_run * run);
+
+static int befs_find_brun_indirect(struct super_block *sb,
+				   befs_data_stream * data,
+				   befs_blocknr_t blockno,
+				   befs_block_run * run);
+
+static int befs_find_brun_dblindirect(struct super_block *sb,
+				      befs_data_stream * data,
+				      befs_blocknr_t blockno,
+				      befs_block_run * run);
+
+/**
+ * befs_read_datastream - get buffer_head containing data, starting from pos.
+ * @sb: Filesystem superblock
+ * @ds: datastrem to find data with
+ * @pos: start of data
+ * @off: offset of data in buffer_head->b_data
+ *
+ * Returns pointer to buffer_head containing data starting with offset @off,
+ * if you don't need to know offset just set @off = NULL.
+ */
+struct buffer_head *
+befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
+		     befs_off_t pos, uint * off)
+{
+	struct buffer_head *bh = NULL;
+	befs_block_run run;
+	befs_blocknr_t block;	/* block coresponding to pos */
+
+	befs_debug(sb, "---> %s %llu", __func__, pos);
+	block = pos >> BEFS_SB(sb)->block_shift;
+	if (off)
+		*off = pos - (block << BEFS_SB(sb)->block_shift);
+
+	if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) {
+		befs_error(sb, "BeFS: Error finding disk addr of block %lu",
+			   (unsigned long)block);
+		befs_debug(sb, "<--- %s ERROR", __func__);
+		return NULL;
+	}
+	bh = befs_bread_iaddr(sb, run);
+	if (!bh) {
+		befs_error(sb, "BeFS: Error reading block %lu from datastream",
+			   (unsigned long)block);
+		return NULL;
+	}
+
+	befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos);
+
+	return bh;
+}
+
+/*
+ * Takes a file position and gives back a brun who's starting block
+ * is block number fblock of the file.
+ * 
+ * Returns BEFS_OK or BEFS_ERR.
+ * 
+ * Calls specialized functions for each of the three possible
+ * datastream regions.
+ *
+ * 2001-11-15 Will Dyson
+ */
+int
+befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
+		 befs_blocknr_t fblock, befs_block_run * run)
+{
+	int err;
+	befs_off_t pos = fblock << BEFS_SB(sb)->block_shift;
+
+	if (pos < data->max_direct_range) {
+		err = befs_find_brun_direct(sb, data, fblock, run);
+
+	} else if (pos < data->max_indirect_range) {
+		err = befs_find_brun_indirect(sb, data, fblock, run);
+
+	} else if (pos < data->max_double_indirect_range) {
+		err = befs_find_brun_dblindirect(sb, data, fblock, run);
+
+	} else {
+		befs_error(sb,
+			   "befs_fblock2brun() was asked to find block %lu, "
+			   "which is not mapped by the datastream\n",
+			   (unsigned long)fblock);
+		err = BEFS_ERR;
+	}
+	return err;
+}
+
+/**
+ * befs_read_lsmylink - read long symlink from datastream.
+ * @sb: Filesystem superblock 
+ * @ds: Datastrem to read from
+ * @buff: Buffer in which to place long symlink data
+ * @len: Length of the long symlink in bytes
+ *
+ * Returns the number of bytes read
+ */
+size_t
+befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
+		   befs_off_t len)
+{
+	befs_off_t bytes_read = 0;	/* bytes readed */
+	u16 plen;
+	struct buffer_head *bh = NULL;
+	befs_debug(sb, "---> %s length: %llu", __func__, len);
+
+	while (bytes_read < len) {
+		bh = befs_read_datastream(sb, ds, bytes_read, NULL);
+		if (!bh) {
+			befs_error(sb, "BeFS: Error reading datastream block "
+				   "starting from %llu", bytes_read);
+			befs_debug(sb, "<--- %s ERROR", __func__);
+			return bytes_read;
+
+		}
+		plen = ((bytes_read + BEFS_SB(sb)->block_size) < len) ?
+		    BEFS_SB(sb)->block_size : len - bytes_read;
+		memcpy(buff + bytes_read, bh->b_data, plen);
+		brelse(bh);
+		bytes_read += plen;
+	}
+
+	befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int)
+		   bytes_read);
+	return bytes_read;
+}
+
+/**
+ * befs_count_blocks - blocks used by a file
+ * @sb: Filesystem superblock
+ * @ds: Datastream of the file
+ *
+ * Counts the number of fs blocks that the file represented by
+ * inode occupies on the filesystem, counting both regular file
+ * data and filesystem metadata (and eventually attribute data
+ * when we support attributes)
+*/
+
+befs_blocknr_t
+befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
+{
+	befs_blocknr_t blocks;
+	befs_blocknr_t datablocks;	/* File data blocks */
+	befs_blocknr_t metablocks;	/* FS metadata blocks */
+	struct befs_sb_info *befs_sb = BEFS_SB(sb);
+
+	befs_debug(sb, "---> %s", __func__);
+
+	datablocks = ds->size >> befs_sb->block_shift;
+	if (ds->size & (befs_sb->block_size - 1))
+		datablocks += 1;
+
+	metablocks = 1;		/* Start with 1 block for inode */
+
+	/* Size of indirect block */
+	if (ds->size > ds->max_direct_range)
+		metablocks += ds->indirect.len;
+
+	/*
+	   Double indir block, plus all the indirect blocks it mapps
+	   In the double-indirect range, all block runs of data are
+	   BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know 
+	   how many data block runs are in the double-indirect region,
+	   and from that we know how many indirect blocks it takes to
+	   map them. We assume that the indirect blocks are also
+	   BEFS_DBLINDIR_BRUN_LEN blocks long.
+	 */
+	if (ds->size > ds->max_indirect_range && ds->max_indirect_range != 0) {
+		uint dbl_bytes;
+		uint dbl_bruns;
+		uint indirblocks;
+
+		dbl_bytes =
+		    ds->max_double_indirect_range - ds->max_indirect_range;
+		dbl_bruns =
+		    dbl_bytes / (befs_sb->block_size * BEFS_DBLINDIR_BRUN_LEN);
+		indirblocks = dbl_bruns / befs_iaddrs_per_block(sb);
+
+		metablocks += ds->double_indirect.len;
+		metablocks += indirblocks;
+	}
+
+	blocks = datablocks + metablocks;
+	befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks);
+
+	return blocks;
+}
+
+/*
+	Finds the block run that starts at file block number blockno
+	in the file represented by the datastream data, if that 
+	blockno is in the direct region of the datastream.
+	
+	sb: the superblock
+	data: the datastream
+	blockno: the blocknumber to find
+	run: The found run is passed back through this pointer
+	
+	Return value is BEFS_OK if the blockrun is found, BEFS_ERR
+	otherwise.
+	
+	Algorithm:
+	Linear search. Checks each element of array[] to see if it
+	contains the blockno-th filesystem block. This is necessary
+	because the block runs map variable amounts of data. Simply
+	keeps a count of the number of blocks searched so far (sum),
+	incrementing this by the length of each block run as we come
+	across it. Adds sum to *count before returning (this is so
+	you can search multiple arrays that are logicaly one array,
+	as in the indirect region code).
+	
+	When/if blockno is found, if blockno is inside of a block 
+	run as stored on disk, we offset the start and length members
+	of the block run, so that blockno is the start and len is
+	still valid (the run ends in the same place).
+	
+	2001-11-15 Will Dyson
+*/
+static int
+befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
+		      befs_blocknr_t blockno, befs_block_run * run)
+{
+	int i;
+	befs_block_run *array = data->direct;
+	befs_blocknr_t sum;
+	befs_blocknr_t max_block =
+	    data->max_direct_range >> BEFS_SB(sb)->block_shift;
+
+	befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
+
+	if (blockno > max_block) {
+		befs_error(sb, "%s passed block outside of direct region",
+			   __func__);
+		return BEFS_ERR;
+	}
+
+	for (i = 0, sum = 0; i < BEFS_NUM_DIRECT_BLOCKS;
+	     sum += array[i].len, i++) {
+		if (blockno >= sum && blockno < sum + (array[i].len)) {
+			int offset = blockno - sum;
+			run->allocation_group = array[i].allocation_group;
+			run->start = array[i].start + offset;
+			run->len = array[i].len - offset;
+
+			befs_debug(sb, "---> %s, "
+				   "found %lu at direct[%d]", __func__,
+				   (unsigned long)blockno, i);
+			return BEFS_OK;
+		}
+	}
+
+	befs_debug(sb, "---> %s ERROR", __func__);
+	return BEFS_ERR;
+}
+
+/*
+	Finds the block run that starts at file block number blockno
+	in the file represented by the datastream data, if that 
+	blockno is in the indirect region of the datastream.
+	
+	sb: the superblock
+	data: the datastream
+	blockno: the blocknumber to find
+	run: The found run is passed back through this pointer
+	
+	Return value is BEFS_OK if the blockrun is found, BEFS_ERR
+	otherwise.
+	
+	Algorithm:
+	For each block in the indirect run of the datastream, read
+	it in and search through it for	search_blk.
+	
+	XXX:
+	Really should check to make sure blockno is inside indirect
+	region.
+	
+	2001-11-15 Will Dyson
+*/
+static int
+befs_find_brun_indirect(struct super_block *sb,
+			befs_data_stream * data, befs_blocknr_t blockno,
+			befs_block_run * run)
+{
+	int i, j;
+	befs_blocknr_t sum = 0;
+	befs_blocknr_t indir_start_blk;
+	befs_blocknr_t search_blk;
+	struct buffer_head *indirblock;
+	befs_disk_block_run *array;
+
+	befs_block_run indirect = data->indirect;
+	befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
+	int arraylen = befs_iaddrs_per_block(sb);
+
+	befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
+
+	indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift;
+	search_blk = blockno - indir_start_blk;
+
+	/* Examine blocks of the indirect run one at a time */
+	for (i = 0; i < indirect.len; i++) {
+		indirblock = befs_bread(sb, indirblockno + i);
+		if (indirblock == NULL) {
+			befs_debug(sb, "---> %s failed to read "
+				   "disk block %lu from the indirect brun",
+				   __func__, (unsigned long)indirblockno + i);
+			return BEFS_ERR;
+		}
+
+		array = (befs_disk_block_run *) indirblock->b_data;
+
+		for (j = 0; j < arraylen; ++j) {
+			int len = fs16_to_cpu(sb, array[j].len);
+
+			if (search_blk >= sum && search_blk < sum + len) {
+				int offset = search_blk - sum;
+				run->allocation_group =
+				    fs32_to_cpu(sb, array[j].allocation_group);
+				run->start =
+				    fs16_to_cpu(sb, array[j].start) + offset;
+				run->len =
+				    fs16_to_cpu(sb, array[j].len) - offset;
+
+				brelse(indirblock);
+				befs_debug(sb,
+					   "<--- %s found file block "
+					   "%lu at indirect[%d]", __func__,
+					   (unsigned long)blockno,
+					   j + (i * arraylen));
+				return BEFS_OK;
+			}
+			sum += len;
+		}
+
+		brelse(indirblock);
+	}
+
+	/* Only fallthrough is an error */
+	befs_error(sb, "BeFS: %s failed to find "
+		   "file block %lu", __func__, (unsigned long)blockno);
+
+	befs_debug(sb, "<--- %s ERROR", __func__);
+	return BEFS_ERR;
+}
+
+/*
+	Finds the block run that starts at file block number blockno
+	in the file represented by the datastream data, if that 
+	blockno is in the double-indirect region of the datastream.
+	
+	sb: the superblock
+	data: the datastream
+	blockno: the blocknumber to find
+	run: The found run is passed back through this pointer
+	
+	Return value is BEFS_OK if the blockrun is found, BEFS_ERR
+	otherwise.
+	
+	Algorithm:
+	The block runs in the double-indirect region are different.
+	They are always allocated 4 fs blocks at a time, so each
+	block run maps a constant amount of file data. This means
+	that we can directly calculate how many block runs into the
+	double-indirect region we need to go to get to the one that
+	maps a particular filesystem block.
+	
+	We do this in two stages. First we calculate which of the
+	inode addresses in the double-indirect block will point us
+	to the indirect block that contains the mapping for the data,
+	then we calculate which of the inode addresses in that 
+	indirect block maps the data block we are after.
+	
+	Oh, and once we've done that, we actually read in the blocks 
+	that contain the inode addresses we calculated above. Even 
+	though the double-indirect run may be several blocks long, 
+	we can calculate which of those blocks will contain the index
+	we are after and only read that one. We then follow it to 
+	the indirect block and perform a  similar process to find
+	the actual block run that maps the data block we are interested
+	in.
+	
+	Then we offset the run as in befs_find_brun_array() and we are 
+	done.
+	
+	2001-11-15 Will Dyson
+*/
+static int
+befs_find_brun_dblindirect(struct super_block *sb,
+			   befs_data_stream * data, befs_blocknr_t blockno,
+			   befs_block_run * run)
+{
+	int dblindir_indx;
+	int indir_indx;
+	int offset;
+	int dbl_which_block;
+	int which_block;
+	int dbl_block_indx;
+	int block_indx;
+	off_t dblindir_leftover;
+	befs_blocknr_t blockno_at_run_start;
+	struct buffer_head *dbl_indir_block;
+	struct buffer_head *indir_block;
+	befs_block_run indir_run;
+	befs_disk_inode_addr *iaddr_array = NULL;
+	struct befs_sb_info *befs_sb = BEFS_SB(sb);
+
+	befs_blocknr_t indir_start_blk =
+	    data->max_indirect_range >> befs_sb->block_shift;
+
+	off_t dbl_indir_off = blockno - indir_start_blk;
+
+	/* number of data blocks mapped by each of the iaddrs in
+	 * the indirect block pointed to by the double indirect block
+	 */
+	size_t iblklen = BEFS_DBLINDIR_BRUN_LEN;
+
+	/* number of data blocks mapped by each of the iaddrs in
+	 * the double indirect block
+	 */
+	size_t diblklen = iblklen * befs_iaddrs_per_block(sb)
+	    * BEFS_DBLINDIR_BRUN_LEN;
+
+	befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno);
+
+	/* First, discover which of the double_indir->indir blocks
+	 * contains pos. Then figure out how much of pos that
+	 * accounted for. Then discover which of the iaddrs in
+	 * the indirect block contains pos.
+	 */
+
+	dblindir_indx = dbl_indir_off / diblklen;
+	dblindir_leftover = dbl_indir_off % diblklen;
+	indir_indx = dblindir_leftover / diblklen;
+
+	/* Read double indirect block */
+	dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb);
+	if (dbl_which_block > data->double_indirect.len) {
+		befs_error(sb, "The double-indirect index calculated by "
+			   "%s, %d, is outside the range "
+			   "of the double-indirect block", __func__,
+			   dblindir_indx);
+		return BEFS_ERR;
+	}
+
+	dbl_indir_block =
+	    befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
+					dbl_which_block);
+	if (dbl_indir_block == NULL) {
+		befs_error(sb, "%s couldn't read the "
+			   "double-indirect block at blockno %lu", __func__,
+			   (unsigned long)
+			   iaddr2blockno(sb, &data->double_indirect) +
+			   dbl_which_block);
+		brelse(dbl_indir_block);
+		return BEFS_ERR;
+	}
+
+	dbl_block_indx =
+	    dblindir_indx - (dbl_which_block * befs_iaddrs_per_block(sb));
+	iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
+	indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
+	brelse(dbl_indir_block);
+	iaddr_array = NULL;
+
+	/* Read indirect block */
+	which_block = indir_indx / befs_iaddrs_per_block(sb);
+	if (which_block > indir_run.len) {
+		befs_error(sb, "The indirect index calculated by "
+			   "%s, %d, is outside the range "
+			   "of the indirect block", __func__, indir_indx);
+		return BEFS_ERR;
+	}
+
+	indir_block =
+	    befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
+	if (indir_block == NULL) {
+		befs_error(sb, "%s couldn't read the indirect block "
+			   "at blockno %lu", __func__, (unsigned long)
+			   iaddr2blockno(sb, &indir_run) + which_block);
+		brelse(indir_block);
+		return BEFS_ERR;
+	}
+
+	block_indx = indir_indx - (which_block * befs_iaddrs_per_block(sb));
+	iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
+	*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
+	brelse(indir_block);
+	iaddr_array = NULL;
+
+	blockno_at_run_start = indir_start_blk;
+	blockno_at_run_start += diblklen * dblindir_indx;
+	blockno_at_run_start += iblklen * indir_indx;
+	offset = blockno - blockno_at_run_start;
+
+	run->start += offset;
+	run->len -= offset;
+
+	befs_debug(sb, "Found file block %lu in double_indirect[%d][%d],"
+		   " double_indirect_leftover = %lu", (unsigned long)
+		   blockno, dblindir_indx, indir_indx, dblindir_leftover);
+
+	return BEFS_OK;
+}
diff --git a/kernel/fs/befs/datastream.h b/kernel/fs/befs/datastream.h
new file mode 100644
index 000000000..45e8a3c98
--- /dev/null
+++ b/kernel/fs/befs/datastream.h
@@ -0,0 +1,19 @@
+/*
+ * datastream.h
+ *
+ */
+
+struct buffer_head *befs_read_datastream(struct super_block *sb,
+					 befs_data_stream * ds, befs_off_t pos,
+					 uint * off);
+
+int befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
+		     befs_blocknr_t fblock, befs_block_run * run);
+
+size_t befs_read_lsymlink(struct super_block *sb, befs_data_stream * data,
+			  void *buff, befs_off_t len);
+
+befs_blocknr_t befs_count_blocks(struct super_block *sb, befs_data_stream * ds);
+
+extern const befs_inode_addr BAD_IADDR;
+
diff --git a/kernel/fs/befs/debug.c b/kernel/fs/befs/debug.c
new file mode 100644
index 000000000..4de7cffcd
--- /dev/null
+++ b/kernel/fs/befs/debug.c
@@ -0,0 +1,258 @@
+/*
+ *  linux/fs/befs/debug.c
+ * 
+ * Copyright (C) 2001 Will Dyson (will_dyson at pobox.com)
+ *
+ * With help from the ntfs-tng driver by Anton Altparmakov
+ *
+ * Copyright (C) 1999  Makoto Kato (m_kato@ga2.so-net.ne.jp)
+ *
+ * debug functions
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#ifdef __KERNEL__
+
+#include <stdarg.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#endif				/* __KERNEL__ */
+
+#include "befs.h"
+
+void
+befs_error(const struct super_block *sb, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_err("(%s): %pV\n", sb->s_id, &vaf);
+	va_end(args);
+}
+
+void
+befs_warning(const struct super_block *sb, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_warn("(%s): %pV\n", sb->s_id, &vaf);
+	va_end(args);
+}
+
+void
+befs_debug(const struct super_block *sb, const char *fmt, ...)
+{
+#ifdef CONFIG_BEFS_DEBUG
+
+	struct va_format vaf;
+	va_list args;
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_debug("(%s): %pV\n", sb->s_id, &vaf);
+	va_end(args);
+
+#endif				//CONFIG_BEFS_DEBUG
+}
+
+void
+befs_dump_inode(const struct super_block *sb, befs_inode * inode)
+{
+#ifdef CONFIG_BEFS_DEBUG
+
+	befs_block_run tmp_run;
+
+	befs_debug(sb, "befs_inode information");
+
+	befs_debug(sb, "  magic1 %08x", fs32_to_cpu(sb, inode->magic1));
+
+	tmp_run = fsrun_to_cpu(sb, inode->inode_num);
+	befs_debug(sb, "  inode_num %u, %hu, %hu",
+		   tmp_run.allocation_group, tmp_run.start, tmp_run.len);
+
+	befs_debug(sb, "  uid %u", fs32_to_cpu(sb, inode->uid));
+	befs_debug(sb, "  gid %u", fs32_to_cpu(sb, inode->gid));
+	befs_debug(sb, "  mode %08x", fs32_to_cpu(sb, inode->mode));
+	befs_debug(sb, "  flags %08x", fs32_to_cpu(sb, inode->flags));
+	befs_debug(sb, "  create_time %llu",
+		   fs64_to_cpu(sb, inode->create_time));
+	befs_debug(sb, "  last_modified_time %llu",
+		   fs64_to_cpu(sb, inode->last_modified_time));
+
+	tmp_run = fsrun_to_cpu(sb, inode->parent);
+	befs_debug(sb, "  parent [%u, %hu, %hu]",
+		   tmp_run.allocation_group, tmp_run.start, tmp_run.len);
+
+	tmp_run = fsrun_to_cpu(sb, inode->attributes);
+	befs_debug(sb, "  attributes [%u, %hu, %hu]",
+		   tmp_run.allocation_group, tmp_run.start, tmp_run.len);
+
+	befs_debug(sb, "  type %08x", fs32_to_cpu(sb, inode->type));
+	befs_debug(sb, "  inode_size %u", fs32_to_cpu(sb, inode->inode_size));
+
+	if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) {
+		befs_debug(sb, "  Symbolic link [%s]", inode->data.symlink);
+	} else {
+		int i;
+
+		for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; i++) {
+			tmp_run =
+			    fsrun_to_cpu(sb, inode->data.datastream.direct[i]);
+			befs_debug(sb, "  direct %d [%u, %hu, %hu]", i,
+				   tmp_run.allocation_group, tmp_run.start,
+				   tmp_run.len);
+		}
+		befs_debug(sb, "  max_direct_range %llu",
+			   fs64_to_cpu(sb,
+				       inode->data.datastream.
+				       max_direct_range));
+
+		tmp_run = fsrun_to_cpu(sb, inode->data.datastream.indirect);
+		befs_debug(sb, "  indirect [%u, %hu, %hu]",
+			   tmp_run.allocation_group,
+			   tmp_run.start, tmp_run.len);
+
+		befs_debug(sb, "  max_indirect_range %llu",
+			   fs64_to_cpu(sb,
+				       inode->data.datastream.
+				       max_indirect_range));
+
+		tmp_run =
+		    fsrun_to_cpu(sb, inode->data.datastream.double_indirect);
+		befs_debug(sb, "  double indirect [%u, %hu, %hu]",
+			   tmp_run.allocation_group, tmp_run.start,
+			   tmp_run.len);
+
+		befs_debug(sb, "  max_double_indirect_range %llu",
+			   fs64_to_cpu(sb,
+				       inode->data.datastream.
+				       max_double_indirect_range));
+
+		befs_debug(sb, "  size %llu",
+			   fs64_to_cpu(sb, inode->data.datastream.size));
+	}
+
+#endif				//CONFIG_BEFS_DEBUG
+}
+
+/*
+ * Display super block structure for debug.
+ */
+
+void
+befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
+{
+#ifdef CONFIG_BEFS_DEBUG
+
+	befs_block_run tmp_run;
+
+	befs_debug(sb, "befs_super_block information");
+
+	befs_debug(sb, "  name %s", sup->name);
+	befs_debug(sb, "  magic1 %08x", fs32_to_cpu(sb, sup->magic1));
+	befs_debug(sb, "  fs_byte_order %08x",
+		   fs32_to_cpu(sb, sup->fs_byte_order));
+
+	befs_debug(sb, "  block_size %u", fs32_to_cpu(sb, sup->block_size));
+	befs_debug(sb, "  block_shift %u", fs32_to_cpu(sb, sup->block_shift));
+
+	befs_debug(sb, "  num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks));
+	befs_debug(sb, "  used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks));
+
+	befs_debug(sb, "  magic2 %08x", fs32_to_cpu(sb, sup->magic2));
+	befs_debug(sb, "  blocks_per_ag %u",
+		   fs32_to_cpu(sb, sup->blocks_per_ag));
+	befs_debug(sb, "  ag_shift %u", fs32_to_cpu(sb, sup->ag_shift));
+	befs_debug(sb, "  num_ags %u", fs32_to_cpu(sb, sup->num_ags));
+
+	befs_debug(sb, "  flags %08x", fs32_to_cpu(sb, sup->flags));
+
+	tmp_run = fsrun_to_cpu(sb, sup->log_blocks);
+	befs_debug(sb, "  log_blocks %u, %hu, %hu",
+		   tmp_run.allocation_group, tmp_run.start, tmp_run.len);
+
+	befs_debug(sb, "  log_start %lld", fs64_to_cpu(sb, sup->log_start));
+	befs_debug(sb, "  log_end %lld", fs64_to_cpu(sb, sup->log_end));
+
+	befs_debug(sb, "  magic3 %08x", fs32_to_cpu(sb, sup->magic3));
+
+	tmp_run = fsrun_to_cpu(sb, sup->root_dir);
+	befs_debug(sb, "  root_dir %u, %hu, %hu",
+		   tmp_run.allocation_group, tmp_run.start, tmp_run.len);
+
+	tmp_run = fsrun_to_cpu(sb, sup->indices);
+	befs_debug(sb, "  indices %u, %hu, %hu",
+		   tmp_run.allocation_group, tmp_run.start, tmp_run.len);
+
+#endif				//CONFIG_BEFS_DEBUG
+}
+
+#if 0
+/* unused */
+void
+befs_dump_small_data(const struct super_block *sb, befs_small_data * sd)
+{
+}
+
+/* unused */
+void
+befs_dump_run(const struct super_block *sb, befs_disk_block_run run)
+{
+#ifdef CONFIG_BEFS_DEBUG
+
+	befs_block_run n = fsrun_to_cpu(sb, run);
+
+	befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len);
+
+#endif				//CONFIG_BEFS_DEBUG
+}
+#endif  /*  0  */
+
+void
+befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super)
+{
+#ifdef CONFIG_BEFS_DEBUG
+
+	befs_debug(sb, "Btree super structure");
+	befs_debug(sb, "  magic %08x", fs32_to_cpu(sb, super->magic));
+	befs_debug(sb, "  node_size %u", fs32_to_cpu(sb, super->node_size));
+	befs_debug(sb, "  max_depth %08x", fs32_to_cpu(sb, super->max_depth));
+
+	befs_debug(sb, "  data_type %08x", fs32_to_cpu(sb, super->data_type));
+	befs_debug(sb, "  root_node_pointer %016LX",
+		   fs64_to_cpu(sb, super->root_node_ptr));
+	befs_debug(sb, "  free_node_pointer %016LX",
+		   fs64_to_cpu(sb, super->free_node_ptr));
+	befs_debug(sb, "  maximum size %016LX",
+		   fs64_to_cpu(sb, super->max_size));
+
+#endif				//CONFIG_BEFS_DEBUG
+}
+
+void
+befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead * node)
+{
+#ifdef CONFIG_BEFS_DEBUG
+
+	befs_debug(sb, "Btree node structure");
+	befs_debug(sb, "  left %016LX", fs64_to_cpu(sb, node->left));
+	befs_debug(sb, "  right %016LX", fs64_to_cpu(sb, node->right));
+	befs_debug(sb, "  overflow %016LX", fs64_to_cpu(sb, node->overflow));
+	befs_debug(sb, "  all_key_count %hu",
+		   fs16_to_cpu(sb, node->all_key_count));
+	befs_debug(sb, "  all_key_length %hu",
+		   fs16_to_cpu(sb, node->all_key_length));
+
+#endif				//CONFIG_BEFS_DEBUG
+}
diff --git a/kernel/fs/befs/endian.h b/kernel/fs/befs/endian.h
new file mode 100644
index 000000000..27223878b
--- /dev/null
+++ b/kernel/fs/befs/endian.h
@@ -0,0 +1,125 @@
+/*
+ * linux/fs/befs/endian.h
+ *
+ * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com>
+ *
+ * Partially based on similar funtions in the sysv driver.
+ */
+
+#ifndef LINUX_BEFS_ENDIAN
+#define LINUX_BEFS_ENDIAN
+
+#include <asm/byteorder.h>
+
+static inline u64
+fs64_to_cpu(const struct super_block *sb, fs64 n)
+{
+	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
+		return le64_to_cpu((__force __le64)n);
+	else
+		return be64_to_cpu((__force __be64)n);
+}
+
+static inline fs64
+cpu_to_fs64(const struct super_block *sb, u64 n)
+{
+	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
+		return (__force fs64)cpu_to_le64(n);
+	else
+		return (__force fs64)cpu_to_be64(n);
+}
+
+static inline u32
+fs32_to_cpu(const struct super_block *sb, fs32 n)
+{
+	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
+		return le32_to_cpu((__force __le32)n);
+	else
+		return be32_to_cpu((__force __be32)n);
+}
+
+static inline fs32
+cpu_to_fs32(const struct super_block *sb, u32 n)
+{
+	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
+		return (__force fs32)cpu_to_le32(n);
+	else
+		return (__force fs32)cpu_to_be32(n);
+}
+
+static inline u16
+fs16_to_cpu(const struct super_block *sb, fs16 n)
+{
+	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
+		return le16_to_cpu((__force __le16)n);
+	else
+		return be16_to_cpu((__force __be16)n);
+}
+
+static inline fs16
+cpu_to_fs16(const struct super_block *sb, u16 n)
+{
+	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
+		return (__force fs16)cpu_to_le16(n);
+	else
+		return (__force fs16)cpu_to_be16(n);
+}
+
+/* Composite types below here */
+
+static inline befs_block_run
+fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n)
+{
+	befs_block_run run;
+
+	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
+		run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group);
+		run.start = le16_to_cpu((__force __le16)n.start);
+		run.len = le16_to_cpu((__force __le16)n.len);
+	} else {
+		run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group);
+		run.start = be16_to_cpu((__force __be16)n.start);
+		run.len = be16_to_cpu((__force __be16)n.len);
+	}
+	return run;
+}
+
+static inline befs_disk_block_run
+cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
+{
+	befs_disk_block_run run;
+
+	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
+		run.allocation_group = cpu_to_le32(n.allocation_group);
+		run.start = cpu_to_le16(n.start);
+		run.len = cpu_to_le16(n.len);
+	} else {
+		run.allocation_group = cpu_to_be32(n.allocation_group);
+		run.start = cpu_to_be16(n.start);
+		run.len = cpu_to_be16(n.len);
+	}
+	return run;
+}
+
+static inline befs_data_stream
+fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n)
+{
+	befs_data_stream data;
+	int i;
+
+	for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
+		data.direct[i] = fsrun_to_cpu(sb, n->direct[i]);
+
+	data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
+	data.indirect = fsrun_to_cpu(sb, n->indirect);
+	data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
+	data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
+	data.max_double_indirect_range = fs64_to_cpu(sb,
+						     n->
+						     max_double_indirect_range);
+	data.size = fs64_to_cpu(sb, n->size);
+
+	return data;
+}
+
+#endif				//LINUX_BEFS_ENDIAN
diff --git a/kernel/fs/befs/inode.c b/kernel/fs/befs/inode.c
new file mode 100644
index 000000000..fa4b718de
--- /dev/null
+++ b/kernel/fs/befs/inode.c
@@ -0,0 +1,54 @@
+/*
+ * inode.c
+ * 
+ * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com>
+ */
+
+#include <linux/fs.h>
+
+#include "befs.h"
+#include "inode.h"
+
+/*
+	Validates the correctness of the befs inode
+	Returns BEFS_OK if the inode should be used, otherwise
+	returns BEFS_BAD_INODE
+*/
+int
+befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
+		 befs_blocknr_t inode)
+{
+	u32 magic1 = fs32_to_cpu(sb, raw_inode->magic1);
+	befs_inode_addr ino_num = fsrun_to_cpu(sb, raw_inode->inode_num);
+	u32 flags = fs32_to_cpu(sb, raw_inode->flags);
+
+	/* check magic header. */
+	if (magic1 != BEFS_INODE_MAGIC1) {
+		befs_error(sb,
+			   "Inode has a bad magic header - inode = %lu",
+			   (unsigned long)inode);
+		return BEFS_BAD_INODE;
+	}
+
+	/*
+	 * Sanity check2: inodes store their own block address. Check it.
+	 */
+	if (inode != iaddr2blockno(sb, &ino_num)) {
+		befs_error(sb, "inode blocknr field disagrees with vfs "
+			   "VFS: %lu, Inode %lu", (unsigned long)
+			   inode, (unsigned long)iaddr2blockno(sb, &ino_num));
+		return BEFS_BAD_INODE;
+	}
+
+	/*
+	 * check flag
+	 */
+
+	if (!(flags & BEFS_INODE_IN_USE)) {
+		befs_error(sb, "inode is not used - inode = %lu",
+			   (unsigned long)inode);
+		return BEFS_BAD_INODE;
+	}
+
+	return BEFS_OK;
+}
diff --git a/kernel/fs/befs/inode.h b/kernel/fs/befs/inode.h
new file mode 100644
index 000000000..9dc7fd9b7
--- /dev/null
+++ b/kernel/fs/befs/inode.h
@@ -0,0 +1,8 @@
+/*
+ * inode.h
+ * 
+ */
+
+int befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
+		     befs_blocknr_t inode);
+
diff --git a/kernel/fs/befs/io.c b/kernel/fs/befs/io.c
new file mode 100644
index 000000000..7a5b4ec21
--- /dev/null
+++ b/kernel/fs/befs/io.c
@@ -0,0 +1,85 @@
+/*
+ * linux/fs/befs/io.c
+ *
+ * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com
+ *
+ * Based on portions of file.c and inode.c 
+ * by Makoto Kato (m_kato@ga2.so-net.ne.jp)
+ *
+ * Many thanks to Dominic Giampaolo, author of Practical File System
+ * Design with the Be File System, for such a helpful book.
+ *
+ */
+
+#include <linux/buffer_head.h>
+
+#include "befs.h"
+#include "io.h"
+
+/*
+ * Converts befs notion of disk addr to a disk offset and uses
+ * linux kernel function sb_bread() to get the buffer containing
+ * the offset. -Will Dyson
+ *
+ */
+
+struct buffer_head *
+befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
+{
+	struct buffer_head *bh = NULL;
+	befs_blocknr_t block = 0;
+	struct befs_sb_info *befs_sb = BEFS_SB(sb);
+
+	befs_debug(sb, "---> Enter %s "
+		   "[%u, %hu, %hu]", __func__, iaddr.allocation_group,
+		   iaddr.start, iaddr.len);
+
+	if (iaddr.allocation_group > befs_sb->num_ags) {
+		befs_error(sb, "BEFS: Invalid allocation group %u, max is %u",
+			   iaddr.allocation_group, befs_sb->num_ags);
+		goto error;
+	}
+
+	block = iaddr2blockno(sb, &iaddr);
+
+	befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block);
+
+	bh = sb_bread(sb, block);
+
+	if (bh == NULL) {
+		befs_error(sb, "Failed to read block %lu",
+			   (unsigned long)block);
+		goto error;
+	}
+
+	befs_debug(sb, "<--- %s", __func__);
+	return bh;
+
+      error:
+	befs_debug(sb, "<--- %s ERROR", __func__);
+	return NULL;
+}
+
+struct buffer_head *
+befs_bread(struct super_block *sb, befs_blocknr_t block)
+{
+	struct buffer_head *bh = NULL;
+
+	befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
+
+	bh = sb_bread(sb, block);
+
+	if (bh == NULL) {
+		befs_error(sb, "Failed to read block %lu",
+			   (unsigned long)block);
+		goto error;
+	}
+
+	befs_debug(sb, "<--- %s", __func__);
+
+	return bh;
+
+      error:
+	befs_debug(sb, "<--- %s ERROR", __func__);
+	return NULL;
+}
diff --git a/kernel/fs/befs/io.h b/kernel/fs/befs/io.h
new file mode 100644
index 000000000..9b78266b6
--- /dev/null
+++ b/kernel/fs/befs/io.h
@@ -0,0 +1,9 @@
+/*
+ * io.h
+ */
+
+struct buffer_head *befs_bread_iaddr(struct super_block *sb,
+				     befs_inode_addr iaddr);
+
+struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block);
+
diff --git a/kernel/fs/befs/linuxvfs.c b/kernel/fs/befs/linuxvfs.c
new file mode 100644
index 000000000..7943533c3
--- /dev/null
+++ b/kernel/fs/befs/linuxvfs.c
@@ -0,0 +1,981 @@
+/*
+ * linux/fs/befs/linuxvfs.c
+ *
+ * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/nls.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/parser.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include "befs.h"
+#include "btree.h"
+#include "inode.h"
+#include "datastream.h"
+#include "super.h"
+#include "io.h"
+
+MODULE_DESCRIPTION("BeOS File System (BeFS) driver");
+MODULE_AUTHOR("Will Dyson");
+MODULE_LICENSE("GPL");
+
+/* The units the vfs expects inode->i_blocks to be in */
+#define VFS_BLOCK_SIZE 512
+
+static int befs_readdir(struct file *, struct dir_context *);
+static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+static int befs_readpage(struct file *file, struct page *page);
+static sector_t befs_bmap(struct address_space *mapping, sector_t block);
+static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int);
+static struct inode *befs_iget(struct super_block *, unsigned long);
+static struct inode *befs_alloc_inode(struct super_block *sb);
+static void befs_destroy_inode(struct inode *inode);
+static void befs_destroy_inodecache(void);
+static void *befs_follow_link(struct dentry *, struct nameidata *);
+static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
+static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
+			char **out, int *out_len);
+static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
+			char **out, int *out_len);
+static void befs_put_super(struct super_block *);
+static int befs_remount(struct super_block *, int *, char *);
+static int befs_statfs(struct dentry *, struct kstatfs *);
+static int parse_options(char *, struct befs_mount_options *);
+
+static const struct super_operations befs_sops = {
+	.alloc_inode	= befs_alloc_inode,	/* allocate a new inode */
+	.destroy_inode	= befs_destroy_inode, /* deallocate an inode */
+	.put_super	= befs_put_super,	/* uninit super */
+	.statfs		= befs_statfs,	/* statfs */
+	.remount_fs	= befs_remount,
+	.show_options	= generic_show_options,
+};
+
+/* slab cache for befs_inode_info objects */
+static struct kmem_cache *befs_inode_cachep;
+
+static const struct file_operations befs_dir_operations = {
+	.read		= generic_read_dir,
+	.iterate	= befs_readdir,
+	.llseek		= generic_file_llseek,
+};
+
+static const struct inode_operations befs_dir_inode_operations = {
+	.lookup		= befs_lookup,
+};
+
+static const struct address_space_operations befs_aops = {
+	.readpage	= befs_readpage,
+	.bmap		= befs_bmap,
+};
+
+static const struct inode_operations befs_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= befs_fast_follow_link,
+};
+
+static const struct inode_operations befs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= befs_follow_link,
+	.put_link	= kfree_put_link,
+};
+
+/* 
+ * Called by generic_file_read() to read a page of data
+ * 
+ * In turn, simply calls a generic block read function and
+ * passes it the address of befs_get_block, for mapping file
+ * positions to disk blocks.
+ */
+static int
+befs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, befs_get_block);
+}
+
+static sector_t
+befs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, befs_get_block);
+}
+
+/* 
+ * Generic function to map a file position (block) to a 
+ * disk offset (passed back in bh_result).
+ *
+ * Used by many higher level functions.
+ *
+ * Calls befs_fblock2brun() in datastream.c to do the real work.
+ *
+ * -WD 10-26-01
+ */
+
+static int
+befs_get_block(struct inode *inode, sector_t block,
+	       struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
+	befs_block_run run = BAD_IADDR;
+	int res = 0;
+	ulong disk_off;
+
+	befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld",
+		   (unsigned long)inode->i_ino, (long)block);
+	if (create) {
+		befs_error(sb, "befs_get_block() was asked to write to "
+			   "block %ld in inode %lu", (long)block,
+			   (unsigned long)inode->i_ino);
+		return -EPERM;
+	}
+
+	res = befs_fblock2brun(sb, ds, block, &run);
+	if (res != BEFS_OK) {
+		befs_error(sb,
+			   "<--- %s for inode %lu, block %ld ERROR",
+			   __func__, (unsigned long)inode->i_ino,
+			   (long)block);
+		return -EFBIG;
+	}
+
+	disk_off = (ulong) iaddr2blockno(sb, &run);
+
+	map_bh(bh_result, inode->i_sb, disk_off);
+
+	befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu",
+		  __func__, (unsigned long)inode->i_ino, (long)block,
+		  (unsigned long)disk_off);
+
+	return 0;
+}
+
+static struct dentry *
+befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode = NULL;
+	struct super_block *sb = dir->i_sb;
+	befs_data_stream *ds = &BEFS_I(dir)->i_data.ds;
+	befs_off_t offset;
+	int ret;
+	int utfnamelen;
+	char *utfname;
+	const char *name = dentry->d_name.name;
+
+	befs_debug(sb, "---> %s name %pd inode %ld", __func__,
+		   dentry, dir->i_ino);
+
+	/* Convert to UTF-8 */
+	if (BEFS_SB(sb)->nls) {
+		ret =
+		    befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen);
+		if (ret < 0) {
+			befs_debug(sb, "<--- %s ERROR", __func__);
+			return ERR_PTR(ret);
+		}
+		ret = befs_btree_find(sb, ds, utfname, &offset);
+		kfree(utfname);
+
+	} else {
+		ret = befs_btree_find(sb, ds, dentry->d_name.name, &offset);
+	}
+
+	if (ret == BEFS_BT_NOT_FOUND) {
+		befs_debug(sb, "<--- %s %pd not found", __func__, dentry);
+		return ERR_PTR(-ENOENT);
+
+	} else if (ret != BEFS_OK || offset == 0) {
+		befs_warning(sb, "<--- %s Error", __func__);
+		return ERR_PTR(-ENODATA);
+	}
+
+	inode = befs_iget(dir->i_sb, (ino_t) offset);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	d_add(dentry, inode);
+
+	befs_debug(sb, "<--- %s", __func__);
+
+	return NULL;
+}
+
+static int
+befs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
+	befs_off_t value;
+	int result;
+	size_t keysize;
+	unsigned char d_type;
+	char keybuf[BEFS_NAME_LEN + 1];
+
+	befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld",
+		  __func__, file, inode->i_ino, ctx->pos);
+
+more:
+	result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
+				 keybuf, &keysize, &value);
+
+	if (result == BEFS_ERR) {
+		befs_debug(sb, "<--- %s ERROR", __func__);
+		befs_error(sb, "IO error reading %pD (inode %lu)",
+			   file, inode->i_ino);
+		return -EIO;
+
+	} else if (result == BEFS_BT_END) {
+		befs_debug(sb, "<--- %s END", __func__);
+		return 0;
+
+	} else if (result == BEFS_BT_EMPTY) {
+		befs_debug(sb, "<--- %s Empty directory", __func__);
+		return 0;
+	}
+
+	d_type = DT_UNKNOWN;
+
+	/* Convert to NLS */
+	if (BEFS_SB(sb)->nls) {
+		char *nlsname;
+		int nlsnamelen;
+		result =
+		    befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
+		if (result < 0) {
+			befs_debug(sb, "<--- %s ERROR", __func__);
+			return result;
+		}
+		if (!dir_emit(ctx, nlsname, nlsnamelen,
+				 (ino_t) value, d_type)) {
+			kfree(nlsname);
+			return 0;
+		}
+		kfree(nlsname);
+	} else {
+		if (!dir_emit(ctx, keybuf, keysize,
+				 (ino_t) value, d_type))
+			return 0;
+	}
+	ctx->pos++;
+	goto more;
+}
+
+static struct inode *
+befs_alloc_inode(struct super_block *sb)
+{
+	struct befs_inode_info *bi;
+
+	bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
+        if (!bi)
+                return NULL;
+        return &bi->vfs_inode;
+}
+
+static void befs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+        kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
+}
+
+static void befs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, befs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+        struct befs_inode_info *bi = (struct befs_inode_info *) foo;
+
+	inode_init_once(&bi->vfs_inode);
+}
+
+static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct buffer_head *bh = NULL;
+	befs_inode *raw_inode = NULL;
+	struct befs_sb_info *befs_sb = BEFS_SB(sb);
+	struct befs_inode_info *befs_ino = NULL;
+	struct inode *inode;
+	long ret = -EIO;
+
+	befs_debug(sb, "---> %s inode = %lu", __func__, ino);
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	befs_ino = BEFS_I(inode);
+
+	/* convert from vfs's inode number to befs's inode number */
+	befs_ino->i_inode_num = blockno2iaddr(sb, inode->i_ino);
+
+	befs_debug(sb, "  real inode number [%u, %hu, %hu]",
+		   befs_ino->i_inode_num.allocation_group,
+		   befs_ino->i_inode_num.start, befs_ino->i_inode_num.len);
+
+	bh = befs_bread(sb, inode->i_ino);
+	if (!bh) {
+		befs_error(sb, "unable to read inode block - "
+			   "inode = %lu", inode->i_ino);
+		goto unacquire_none;
+	}
+
+	raw_inode = (befs_inode *) bh->b_data;
+
+	befs_dump_inode(sb, raw_inode);
+
+	if (befs_check_inode(sb, raw_inode, inode->i_ino) != BEFS_OK) {
+		befs_error(sb, "Bad inode: %lu", inode->i_ino);
+		goto unacquire_bh;
+	}
+
+	inode->i_mode = (umode_t) fs32_to_cpu(sb, raw_inode->mode);
+
+	/*
+	 * set uid and gid.  But since current BeOS is single user OS, so
+	 * you can change by "uid" or "gid" options.
+	 */   
+
+	inode->i_uid = befs_sb->mount_opts.use_uid ?
+		befs_sb->mount_opts.uid :
+		make_kuid(&init_user_ns, fs32_to_cpu(sb, raw_inode->uid));
+	inode->i_gid = befs_sb->mount_opts.use_gid ?
+		befs_sb->mount_opts.gid :
+		make_kgid(&init_user_ns, fs32_to_cpu(sb, raw_inode->gid));
+
+	set_nlink(inode, 1);
+
+	/*
+	 * BEFS's time is 64 bits, but current VFS is 32 bits...
+	 * BEFS don't have access time. Nor inode change time. VFS
+	 * doesn't have creation time.
+	 * Also, the lower 16 bits of the last_modified_time and 
+	 * create_time are just a counter to help ensure uniqueness
+	 * for indexing purposes. (PFD, page 54)
+	 */
+
+	inode->i_mtime.tv_sec =
+	    fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
+	inode->i_mtime.tv_nsec = 0;   /* lower 16 bits are not a time */	
+	inode->i_ctime = inode->i_mtime;
+	inode->i_atime = inode->i_mtime;
+
+	befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
+	befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent);
+	befs_ino->i_attribute = fsrun_to_cpu(sb, raw_inode->attributes);
+	befs_ino->i_flags = fs32_to_cpu(sb, raw_inode->flags);
+
+	if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){
+		inode->i_size = 0;
+		inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
+		strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
+			BEFS_SYMLINK_LEN);
+	} else {
+		int num_blks;
+
+		befs_ino->i_data.ds =
+		    fsds_to_cpu(sb, &raw_inode->data.datastream);
+
+		num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
+		inode->i_blocks =
+		    num_blks * (befs_sb->block_size / VFS_BLOCK_SIZE);
+		inode->i_size = befs_ino->i_data.ds.size;
+	}
+
+	inode->i_mapping->a_ops = &befs_aops;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_fop = &generic_ro_fops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &befs_dir_inode_operations;
+		inode->i_fop = &befs_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (befs_ino->i_flags & BEFS_LONG_SYMLINK)
+			inode->i_op = &befs_symlink_inode_operations;
+		else
+			inode->i_op = &befs_fast_symlink_inode_operations;
+	} else {
+		befs_error(sb, "Inode %lu is not a regular file, "
+			   "directory or symlink. THAT IS WRONG! BeFS has no "
+			   "on disk special files", inode->i_ino);
+		goto unacquire_bh;
+	}
+
+	brelse(bh);
+	befs_debug(sb, "<--- %s", __func__);
+	unlock_new_inode(inode);
+	return inode;
+
+      unacquire_bh:
+	brelse(bh);
+
+      unacquire_none:
+	iget_failed(inode);
+	befs_debug(sb, "<--- %s - Bad inode", __func__);
+	return ERR_PTR(ret);
+}
+
+/* Initialize the inode cache. Called at fs setup.
+ *
+ * Taken from NFS implementation by Al Viro.
+ */
+static int __init
+befs_init_inodecache(void)
+{
+	befs_inode_cachep = kmem_cache_create("befs_inode_cache",
+					      sizeof (struct befs_inode_info),
+					      0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					      init_once);
+	if (befs_inode_cachep == NULL) {
+		pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/* Called at fs teardown.
+ * 
+ * Taken from NFS implementation by Al Viro.
+ */
+static void
+befs_destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(befs_inode_cachep);
+}
+
+/*
+ * The inode of symbolic link is different to data stream.
+ * The data stream become link name. Unless the LONG_SYMLINK
+ * flag is set.
+ */
+static void *
+befs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+	befs_data_stream *data = &befs_ino->i_data.ds;
+	befs_off_t len = data->size;
+	char *link;
+
+	if (len == 0) {
+		befs_error(sb, "Long symlink with illegal length");
+		link = ERR_PTR(-EIO);
+	} else {
+		befs_debug(sb, "Follow long symlink");
+
+		link = kmalloc(len, GFP_NOFS);
+		if (!link) {
+			link = ERR_PTR(-ENOMEM);
+		} else if (befs_read_lsymlink(sb, data, link, len) != len) {
+			kfree(link);
+			befs_error(sb, "Failed to read entire long symlink");
+			link = ERR_PTR(-EIO);
+		} else {
+			link[len - 1] = '\0';
+		}
+	}
+	nd_set_link(nd, link);
+	return NULL;
+}
+
+
+static void *
+befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+
+	nd_set_link(nd, befs_ino->i_data.symlink);
+	return NULL;
+}
+
+/*
+ * UTF-8 to NLS charset  convert routine
+ * 
+ *
+ * Changed 8/10/01 by Will Dyson. Now use uni2char() / char2uni() rather than
+ * the nls tables directly
+ */
+
+static int
+befs_utf2nls(struct super_block *sb, const char *in,
+	     int in_len, char **out, int *out_len)
+{
+	struct nls_table *nls = BEFS_SB(sb)->nls;
+	int i, o;
+	unicode_t uni;
+	int unilen, utflen;
+	char *result;
+	/* The utf8->nls conversion won't make the final nls string bigger
+	 * than the utf one, but if the string is pure ascii they'll have the
+	 * same width and an extra char is needed to save the additional \0
+	 */
+	int maxlen = in_len + 1;
+
+	befs_debug(sb, "---> %s", __func__);
+
+	if (!nls) {
+		befs_error(sb, "%s called with no NLS table loaded", __func__);
+		return -EINVAL;
+	}
+
+	*out = result = kmalloc(maxlen, GFP_NOFS);
+	if (!*out) {
+		befs_error(sb, "%s cannot allocate memory", __func__);
+		*out_len = 0;
+		return -ENOMEM;
+	}
+
+	for (i = o = 0; i < in_len; i += utflen, o += unilen) {
+
+		/* convert from UTF-8 to Unicode */
+		utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
+		if (utflen < 0)
+			goto conv_err;
+
+		/* convert from Unicode to nls */
+		if (uni > MAX_WCHAR_T)
+			goto conv_err;
+		unilen = nls->uni2char(uni, &result[o], in_len - o);
+		if (unilen < 0)
+			goto conv_err;
+	}
+	result[o] = '\0';
+	*out_len = o;
+
+	befs_debug(sb, "<--- %s", __func__);
+
+	return o;
+
+      conv_err:
+	befs_error(sb, "Name using character set %s contains a character that "
+		   "cannot be converted to unicode.", nls->charset);
+	befs_debug(sb, "<--- %s", __func__);
+	kfree(result);
+	return -EILSEQ;
+}
+
+/**
+ * befs_nls2utf - Convert NLS string to utf8 encodeing
+ * @sb: Superblock
+ * @in: Input string buffer in NLS format
+ * @in_len: Length of input string in bytes
+ * @out: The output string in UTF-8 format
+ * @out_len: Length of the output buffer
+ * 
+ * Converts input string @in, which is in the format of the loaded NLS map,
+ * into a utf8 string.
+ * 
+ * The destination string @out is allocated by this function and the caller is
+ * responsible for freeing it with kfree()
+ * 
+ * On return, *@out_len is the length of @out in bytes.
+ *
+ * On success, the return value is the number of utf8 characters written to
+ * the output buffer @out.
+ *  
+ * On Failure, a negative number coresponding to the error code is returned.
+ */
+
+static int
+befs_nls2utf(struct super_block *sb, const char *in,
+	     int in_len, char **out, int *out_len)
+{
+	struct nls_table *nls = BEFS_SB(sb)->nls;
+	int i, o;
+	wchar_t uni;
+	int unilen, utflen;
+	char *result;
+	/* There're nls characters that will translate to 3-chars-wide UTF-8
+	 * characters, a additional byte is needed to save the final \0
+	 * in special cases */
+	int maxlen = (3 * in_len) + 1;
+
+	befs_debug(sb, "---> %s\n", __func__);
+
+	if (!nls) {
+		befs_error(sb, "%s called with no NLS table loaded.",
+			   __func__);
+		return -EINVAL;
+	}
+
+	*out = result = kmalloc(maxlen, GFP_NOFS);
+	if (!*out) {
+		befs_error(sb, "%s cannot allocate memory", __func__);
+		*out_len = 0;
+		return -ENOMEM;
+	}
+
+	for (i = o = 0; i < in_len; i += unilen, o += utflen) {
+
+		/* convert from nls to unicode */
+		unilen = nls->char2uni(&in[i], in_len - i, &uni);
+		if (unilen < 0)
+			goto conv_err;
+
+		/* convert from unicode to UTF-8 */
+		utflen = utf32_to_utf8(uni, &result[o], 3);
+		if (utflen <= 0)
+			goto conv_err;
+	}
+
+	result[o] = '\0';
+	*out_len = o;
+
+	befs_debug(sb, "<--- %s", __func__);
+
+	return i;
+
+      conv_err:
+	befs_error(sb, "Name using charecter set %s contains a charecter that "
+		   "cannot be converted to unicode.", nls->charset);
+	befs_debug(sb, "<--- %s", __func__);
+	kfree(result);
+	return -EILSEQ;
+}
+
+/**
+ * Use the
+ *
+ */
+enum {
+	Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
+};
+
+static const match_table_t befs_tokens = {
+	{Opt_uid, "uid=%d"},
+	{Opt_gid, "gid=%d"},
+	{Opt_charset, "iocharset=%s"},
+	{Opt_debug, "debug"},
+	{Opt_err, NULL}
+};
+
+static int
+parse_options(char *options, struct befs_mount_options *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	kuid_t uid;
+	kgid_t gid;
+
+	/* Initialize options */
+	opts->uid = GLOBAL_ROOT_UID;
+	opts->gid = GLOBAL_ROOT_GID;
+	opts->use_uid = 0;
+	opts->use_gid = 0;
+	opts->iocharset = NULL;
+	opts->debug = 0;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, befs_tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return 0;
+			uid = INVALID_UID;
+			if (option >= 0)
+				uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(uid)) {
+				pr_err("Invalid uid %d, "
+				       "using default\n", option);
+				break;
+			}
+			opts->uid = uid;
+			opts->use_uid = 1;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+			gid = INVALID_GID;
+			if (option >= 0)
+				gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(gid)) {
+				pr_err("Invalid gid %d, "
+				       "using default\n", option);
+				break;
+			}
+			opts->gid = gid;
+			opts->use_gid = 1;
+			break;
+		case Opt_charset:
+			kfree(opts->iocharset);
+			opts->iocharset = match_strdup(&args[0]);
+			if (!opts->iocharset) {
+				pr_err("allocation failure for "
+				       "iocharset string\n");
+				return 0;
+			}
+			break;
+		case Opt_debug:
+			opts->debug = 1;
+			break;
+		default:
+			pr_err("Unrecognized mount option \"%s\" "
+			       "or missing value\n", p);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/* This function has the responsibiltiy of getting the
+ * filesystem ready for unmounting. 
+ * Basically, we free everything that we allocated in
+ * befs_read_inode
+ */
+static void
+befs_put_super(struct super_block *sb)
+{
+	kfree(BEFS_SB(sb)->mount_opts.iocharset);
+	BEFS_SB(sb)->mount_opts.iocharset = NULL;
+	unload_nls(BEFS_SB(sb)->nls);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+}
+
+/* Allocate private field of the superblock, fill it.
+ *
+ * Finish filling the public superblock fields
+ * Make the root directory
+ * Load a set of NLS translations if needed.
+ */
+static int
+befs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct buffer_head *bh;
+	struct befs_sb_info *befs_sb;
+	befs_super_block *disk_sb;
+	struct inode *root;
+	long ret = -EINVAL;
+	const unsigned long sb_block = 0;
+	const off_t x86_sb_off = 512;
+
+	save_mount_options(sb, data);
+
+	sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
+	if (sb->s_fs_info == NULL) {
+		pr_err("(%s): Unable to allocate memory for private "
+		       "portion of superblock. Bailing.\n", sb->s_id);
+		goto unacquire_none;
+	}
+	befs_sb = BEFS_SB(sb);
+
+	if (!parse_options((char *) data, &befs_sb->mount_opts)) {
+		befs_error(sb, "cannot parse mount options");
+		goto unacquire_priv_sbp;
+	}
+
+	befs_debug(sb, "---> %s", __func__);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		befs_warning(sb,
+			     "No write support. Marking filesystem read-only");
+		sb->s_flags |= MS_RDONLY;
+	}
+
+	/*
+	 * Set dummy blocksize to read super block.
+	 * Will be set to real fs blocksize later.
+	 *
+	 * Linux 2.4.10 and later refuse to read blocks smaller than
+	 * the hardsect size for the device. But we also need to read at 
+	 * least 1k to get the second 512 bytes of the volume.
+	 * -WD 10-26-01
+	 */ 
+	sb_min_blocksize(sb, 1024);
+
+	if (!(bh = sb_bread(sb, sb_block))) {
+		befs_error(sb, "unable to read superblock");
+		goto unacquire_priv_sbp;
+	}
+
+	/* account for offset of super block on x86 */
+	disk_sb = (befs_super_block *) bh->b_data;
+	if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) ||
+	    (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) {
+		befs_debug(sb, "Using PPC superblock location");
+	} else {
+		befs_debug(sb, "Using x86 superblock location");
+		disk_sb =
+		    (befs_super_block *) ((void *) bh->b_data + x86_sb_off);
+	}
+
+	if ((befs_load_sb(sb, disk_sb) != BEFS_OK) ||
+	    (befs_check_sb(sb) != BEFS_OK))
+		goto unacquire_bh;
+
+	befs_dump_super_block(sb, disk_sb);
+
+	brelse(bh);
+
+	if( befs_sb->num_blocks > ~((sector_t)0) ) {
+		befs_error(sb, "blocks count: %llu "
+			"is larger than the host can use",
+			befs_sb->num_blocks);
+		goto unacquire_priv_sbp;
+	}
+
+	/*
+	 * set up enough so that it can read an inode
+	 * Fill in kernel superblock fields from private sb
+	 */
+	sb->s_magic = BEFS_SUPER_MAGIC;
+	/* Set real blocksize of fs */
+	sb_set_blocksize(sb, (ulong) befs_sb->block_size);
+	sb->s_op = &befs_sops;
+	root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto unacquire_priv_sbp;
+	}
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		befs_error(sb, "get root inode failed");
+		goto unacquire_priv_sbp;
+	}
+
+	/* load nls library */
+	if (befs_sb->mount_opts.iocharset) {
+		befs_debug(sb, "Loading nls: %s",
+			   befs_sb->mount_opts.iocharset);
+		befs_sb->nls = load_nls(befs_sb->mount_opts.iocharset);
+		if (!befs_sb->nls) {
+			befs_warning(sb, "Cannot load nls %s"
+					" loading default nls",
+					befs_sb->mount_opts.iocharset);
+			befs_sb->nls = load_nls_default();
+		}
+	/* load default nls if none is specified  in mount options */
+	} else {
+		befs_debug(sb, "Loading default nls");
+		befs_sb->nls = load_nls_default();
+	}
+
+	return 0;
+/*****************/
+      unacquire_bh:
+	brelse(bh);
+
+      unacquire_priv_sbp:
+	kfree(befs_sb->mount_opts.iocharset);
+	kfree(sb->s_fs_info);
+
+      unacquire_none:
+	sb->s_fs_info = NULL;
+	return ret;
+}
+
+static int
+befs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	if (!(*flags & MS_RDONLY))
+		return -EINVAL;
+	return 0;
+}
+
+static int
+befs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	befs_debug(sb, "---> %s", __func__);
+
+	buf->f_type = BEFS_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = BEFS_SB(sb)->num_blocks;
+	buf->f_bfree = BEFS_SB(sb)->num_blocks - BEFS_SB(sb)->used_blocks;
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = 0;	/* UNKNOWN */
+	buf->f_ffree = 0;	/* UNKNOWN */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = BEFS_NAME_LEN;
+
+	befs_debug(sb, "<--- %s", __func__);
+
+	return 0;
+}
+
+static struct dentry *
+befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
+	    void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+}
+
+static struct file_system_type befs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "befs",
+	.mount		= befs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,	
+};
+MODULE_ALIAS_FS("befs");
+
+static int __init
+init_befs_fs(void)
+{
+	int err;
+
+	pr_info("version: %s\n", BEFS_VERSION);
+
+	err = befs_init_inodecache();
+	if (err)
+		goto unacquire_none;
+
+	err = register_filesystem(&befs_fs_type);
+	if (err)
+		goto unacquire_inodecache;
+
+	return 0;
+
+unacquire_inodecache:
+	befs_destroy_inodecache();
+
+unacquire_none:
+	return err;
+}
+
+static void __exit
+exit_befs_fs(void)
+{
+	befs_destroy_inodecache();
+
+	unregister_filesystem(&befs_fs_type);
+}
+
+/*
+Macros that typecheck the init and exit functions,
+ensures that they are called at init and cleanup,
+and eliminates warnings about unused functions.
+*/
+module_init(init_befs_fs)
+module_exit(exit_befs_fs)
diff --git a/kernel/fs/befs/super.c b/kernel/fs/befs/super.c
new file mode 100644
index 000000000..aeafc4d84
--- /dev/null
+++ b/kernel/fs/befs/super.c
@@ -0,0 +1,112 @@
+/*
+ * super.c
+ *
+ * Copyright (C) 2001-2002 Will Dyson <will_dyson@pobox.com>
+ *
+ * Licensed under the GNU GPL. See the file COPYING for details.
+ *
+ */
+
+#include <linux/fs.h>
+#include <asm/page.h> /* for PAGE_SIZE */
+
+#include "befs.h"
+#include "super.h"
+
+/**
+ * load_befs_sb -- Read from disk and properly byteswap all the fields
+ * of the befs superblock
+ *
+ *
+ *
+ *
+ */
+int
+befs_load_sb(struct super_block *sb, befs_super_block * disk_sb)
+{
+	struct befs_sb_info *befs_sb = BEFS_SB(sb);
+
+	/* Check the byte order of the filesystem */
+	if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE)
+	    befs_sb->byte_order = BEFS_BYTESEX_LE;
+	else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE)
+	    befs_sb->byte_order = BEFS_BYTESEX_BE;
+
+	befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1);
+	befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2);
+	befs_sb->magic3 = fs32_to_cpu(sb, disk_sb->magic3);
+	befs_sb->block_size = fs32_to_cpu(sb, disk_sb->block_size);
+	befs_sb->block_shift = fs32_to_cpu(sb, disk_sb->block_shift);
+	befs_sb->num_blocks = fs64_to_cpu(sb, disk_sb->num_blocks);
+	befs_sb->used_blocks = fs64_to_cpu(sb, disk_sb->used_blocks);
+	befs_sb->inode_size = fs32_to_cpu(sb, disk_sb->inode_size);
+
+	befs_sb->blocks_per_ag = fs32_to_cpu(sb, disk_sb->blocks_per_ag);
+	befs_sb->ag_shift = fs32_to_cpu(sb, disk_sb->ag_shift);
+	befs_sb->num_ags = fs32_to_cpu(sb, disk_sb->num_ags);
+
+	befs_sb->log_blocks = fsrun_to_cpu(sb, disk_sb->log_blocks);
+	befs_sb->log_start = fs64_to_cpu(sb, disk_sb->log_start);
+	befs_sb->log_end = fs64_to_cpu(sb, disk_sb->log_end);
+
+	befs_sb->root_dir = fsrun_to_cpu(sb, disk_sb->root_dir);
+	befs_sb->indices = fsrun_to_cpu(sb, disk_sb->indices);
+	befs_sb->nls = NULL;
+
+	return BEFS_OK;
+}
+
+int
+befs_check_sb(struct super_block *sb)
+{
+	struct befs_sb_info *befs_sb = BEFS_SB(sb);
+
+	/* Check magic headers of super block */
+	if ((befs_sb->magic1 != BEFS_SUPER_MAGIC1)
+	    || (befs_sb->magic2 != BEFS_SUPER_MAGIC2)
+	    || (befs_sb->magic3 != BEFS_SUPER_MAGIC3)) {
+		befs_error(sb, "invalid magic header");
+		return BEFS_ERR;
+	}
+
+	/*
+	 * Check blocksize of BEFS.
+	 *
+	 * Blocksize of BEFS is 1024, 2048, 4096 or 8192.
+	 */
+
+	if ((befs_sb->block_size != 1024)
+	    && (befs_sb->block_size != 2048)
+	    && (befs_sb->block_size != 4096)
+	    && (befs_sb->block_size != 8192)) {
+		befs_error(sb, "invalid blocksize: %u", befs_sb->block_size);
+		return BEFS_ERR;
+	}
+
+	if (befs_sb->block_size > PAGE_SIZE) {
+		befs_error(sb, "blocksize(%u) cannot be larger"
+			   "than system pagesize(%lu)", befs_sb->block_size,
+			   PAGE_SIZE);
+		return BEFS_ERR;
+	}
+
+	/*
+	   * block_shift and block_size encode the same information
+	   * in different ways as a consistency check.
+	 */
+
+	if ((1 << befs_sb->block_shift) != befs_sb->block_size) {
+		befs_error(sb, "block_shift disagrees with block_size. "
+			   "Corruption likely.");
+		return BEFS_ERR;
+	}
+
+	if (befs_sb->log_start != befs_sb->log_end) {
+		befs_error(sb, "Filesystem not clean! There are blocks in the "
+			   "journal. You must boot into BeOS and mount this volume "
+			   "to make it clean.");
+		return BEFS_ERR;
+	}
+
+	return BEFS_OK;
+}
diff --git a/kernel/fs/befs/super.h b/kernel/fs/befs/super.h
new file mode 100644
index 000000000..dc4556376
--- /dev/null
+++ b/kernel/fs/befs/super.h
@@ -0,0 +1,8 @@
+/*
+ * super.h
+ */
+
+int befs_load_sb(struct super_block *sb, befs_super_block * disk_sb);
+
+int befs_check_sb(struct super_block *sb);
+
diff --git a/kernel/fs/bfs/Kconfig b/kernel/fs/bfs/Kconfig
new file mode 100644
index 000000000..3728a6479
--- /dev/null
+++ b/kernel/fs/bfs/Kconfig
@@ -0,0 +1,19 @@
+config BFS_FS
+	tristate "BFS file system support"
+	depends on BLOCK
+	help
+	  Boot File System (BFS) is a file system used under SCO UnixWare to
+	  allow the bootloader access to the kernel image and other important
+	  files during the boot process.  It is usually mounted under /stand
+	  and corresponds to the slice marked as "STAND" in the UnixWare
+	  partition.  You should say Y if you want to read or write the files
+	  on your /stand slice from within Linux.  You then also need to say Y
+	  to "UnixWare slices support", below.  More information about the BFS
+	  file system is contained in the file
+	  <file:Documentation/filesystems/bfs.txt>.
+
+	  If you don't know what this is about, say N.
+
+	  To compile this as a module, choose M here: the module will be called
+	  bfs.  Note that the file system of your root partition (the one
+	  containing the directory /) cannot be compiled as a module.
diff --git a/kernel/fs/bfs/Makefile b/kernel/fs/bfs/Makefile
new file mode 100644
index 000000000..c787b36d9
--- /dev/null
+++ b/kernel/fs/bfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for BFS filesystem.
+#
+
+obj-$(CONFIG_BFS_FS) += bfs.o
+
+bfs-objs := inode.o file.o dir.o
diff --git a/kernel/fs/bfs/bfs.h b/kernel/fs/bfs/bfs.h
new file mode 100644
index 000000000..f40006db3
--- /dev/null
+++ b/kernel/fs/bfs/bfs.h
@@ -0,0 +1,60 @@
+/*
+ *	fs/bfs/bfs.h
+ *	Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com>
+ */
+#ifndef _FS_BFS_BFS_H
+#define _FS_BFS_BFS_H
+
+#include <linux/bfs_fs.h>
+
+/*
+ * BFS file system in-core superblock info
+ */
+struct bfs_sb_info {
+	unsigned long si_blocks;
+	unsigned long si_freeb;
+	unsigned long si_freei;
+	unsigned long si_lf_eblk;
+	unsigned long si_lasti;
+	unsigned long *si_imap;
+	struct mutex bfs_lock;
+};
+
+/*
+ * BFS file system in-core inode info
+ */
+struct bfs_inode_info {
+	unsigned long i_dsk_ino; /* inode number from the disk, can be 0 */
+	unsigned long i_sblock;
+	unsigned long i_eblock;
+	struct inode vfs_inode;
+};
+
+static inline struct bfs_sb_info *BFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct bfs_inode_info *BFS_I(struct inode *inode)
+{
+	return container_of(inode, struct bfs_inode_info, vfs_inode);
+}
+
+
+#define printf(format, args...) \
+	printk(KERN_ERR "BFS-fs: %s(): " format, __func__, ## args)
+
+/* inode.c */
+extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino);
+extern void bfs_dump_imap(const char *, struct super_block *);
+
+/* file.c */
+extern const struct inode_operations bfs_file_inops;
+extern const struct file_operations bfs_file_operations;
+extern const struct address_space_operations bfs_aops;
+
+/* dir.c */
+extern const struct inode_operations bfs_dir_inops;
+extern const struct file_operations bfs_dir_operations;
+
+#endif /* _FS_BFS_BFS_H */
diff --git a/kernel/fs/bfs/dir.c b/kernel/fs/bfs/dir.c
new file mode 100644
index 000000000..3ec611314
--- /dev/null
+++ b/kernel/fs/bfs/dir.c
@@ -0,0 +1,365 @@
+/*
+ *	fs/bfs/dir.c
+ *	BFS directory operations.
+ *	Copyright (C) 1999,2000  Tigran Aivazian <tigran@veritas.com>
+ *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
+ */
+
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include "bfs.h"
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define dprintf(x...)	printf(x)
+#else
+#define dprintf(x...)
+#endif
+
+static int bfs_add_entry(struct inode *dir, const unsigned char *name,
+						int namelen, int ino);
+static struct buffer_head *bfs_find_entry(struct inode *dir,
+				const unsigned char *name, int namelen,
+				struct bfs_dirent **res_dir);
+
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
+{
+	struct inode *dir = file_inode(f);
+	struct buffer_head *bh;
+	struct bfs_dirent *de;
+	unsigned int offset;
+	int block;
+
+	if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
+		printf("Bad f_pos=%08lx for %s:%08lx\n",
+					(unsigned long)ctx->pos,
+					dir->i_sb->s_id, dir->i_ino);
+		return -EINVAL;
+	}
+
+	while (ctx->pos < dir->i_size) {
+		offset = ctx->pos & (BFS_BSIZE - 1);
+		block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
+		bh = sb_bread(dir->i_sb, block);
+		if (!bh) {
+			ctx->pos += BFS_BSIZE - offset;
+			continue;
+		}
+		do {
+			de = (struct bfs_dirent *)(bh->b_data + offset);
+			if (de->ino) {
+				int size = strnlen(de->name, BFS_NAMELEN);
+				if (!dir_emit(ctx, de->name, size,
+						le16_to_cpu(de->ino),
+						DT_UNKNOWN)) {
+					brelse(bh);
+					return 0;
+				}
+			}
+			offset += BFS_DIRENT_SIZE;
+			ctx->pos += BFS_DIRENT_SIZE;
+		} while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
+		brelse(bh);
+	}
+	return 0;
+}
+
+const struct file_operations bfs_dir_operations = {
+	.read		= generic_read_dir,
+	.iterate	= bfs_readdir,
+	.fsync		= generic_file_fsync,
+	.llseek		= generic_file_llseek,
+};
+
+static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+						bool excl)
+{
+	int err;
+	struct inode *inode;
+	struct super_block *s = dir->i_sb;
+	struct bfs_sb_info *info = BFS_SB(s);
+	unsigned long ino;
+
+	inode = new_inode(s);
+	if (!inode)
+		return -ENOMEM;
+	mutex_lock(&info->bfs_lock);
+	ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1);
+	if (ino > info->si_lasti) {
+		mutex_unlock(&info->bfs_lock);
+		iput(inode);
+		return -ENOSPC;
+	}
+	set_bit(ino, info->si_imap);
+	info->si_freei--;
+	inode_init_owner(inode, dir, mode);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_blocks = 0;
+	inode->i_op = &bfs_file_inops;
+	inode->i_fop = &bfs_file_operations;
+	inode->i_mapping->a_ops = &bfs_aops;
+	inode->i_ino = ino;
+	BFS_I(inode)->i_dsk_ino = ino;
+	BFS_I(inode)->i_sblock = 0;
+	BFS_I(inode)->i_eblock = 0;
+	insert_inode_hash(inode);
+        mark_inode_dirty(inode);
+	bfs_dump_imap("create", s);
+
+	err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
+							inode->i_ino);
+	if (err) {
+		inode_dec_link_count(inode);
+		mutex_unlock(&info->bfs_lock);
+		iput(inode);
+		return err;
+	}
+	mutex_unlock(&info->bfs_lock);
+	d_instantiate(dentry, inode);
+	return 0;
+}
+
+static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
+						unsigned int flags)
+{
+	struct inode *inode = NULL;
+	struct buffer_head *bh;
+	struct bfs_dirent *de;
+	struct bfs_sb_info *info = BFS_SB(dir->i_sb);
+
+	if (dentry->d_name.len > BFS_NAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	mutex_lock(&info->bfs_lock);
+	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
+	if (bh) {
+		unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
+		brelse(bh);
+		inode = bfs_iget(dir->i_sb, ino);
+		if (IS_ERR(inode)) {
+			mutex_unlock(&info->bfs_lock);
+			return ERR_CAST(inode);
+		}
+	}
+	mutex_unlock(&info->bfs_lock);
+	d_add(dentry, inode);
+	return NULL;
+}
+
+static int bfs_link(struct dentry *old, struct inode *dir,
+						struct dentry *new)
+{
+	struct inode *inode = d_inode(old);
+	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
+	int err;
+
+	mutex_lock(&info->bfs_lock);
+	err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
+							inode->i_ino);
+	if (err) {
+		mutex_unlock(&info->bfs_lock);
+		return err;
+	}
+	inc_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+	ihold(inode);
+	d_instantiate(new, inode);
+	mutex_unlock(&info->bfs_lock);
+	return 0;
+}
+
+static int bfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int error = -ENOENT;
+	struct inode *inode = d_inode(dentry);
+	struct buffer_head *bh;
+	struct bfs_dirent *de;
+	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
+
+	mutex_lock(&info->bfs_lock);
+	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
+	if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
+		goto out_brelse;
+
+	if (!inode->i_nlink) {
+		printf("unlinking non-existent file %s:%lu (nlink=%d)\n",
+					inode->i_sb->s_id, inode->i_ino,
+					inode->i_nlink);
+		set_nlink(inode, 1);
+	}
+	de->ino = 0;
+	mark_buffer_dirty_inode(bh, dir);
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+	error = 0;
+
+out_brelse:
+	brelse(bh);
+	mutex_unlock(&info->bfs_lock);
+	return error;
+}
+
+static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh;
+	struct bfs_dirent *old_de, *new_de;
+	struct bfs_sb_info *info;
+	int error = -ENOENT;
+
+	old_bh = new_bh = NULL;
+	old_inode = d_inode(old_dentry);
+	if (S_ISDIR(old_inode->i_mode))
+		return -EINVAL;
+
+	info = BFS_SB(old_inode->i_sb);
+
+	mutex_lock(&info->bfs_lock);
+	old_bh = bfs_find_entry(old_dir, 
+				old_dentry->d_name.name, 
+				old_dentry->d_name.len, &old_de);
+
+	if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino))
+		goto end_rename;
+
+	error = -EPERM;
+	new_inode = d_inode(new_dentry);
+	new_bh = bfs_find_entry(new_dir, 
+				new_dentry->d_name.name, 
+				new_dentry->d_name.len, &new_de);
+
+	if (new_bh && !new_inode) {
+		brelse(new_bh);
+		new_bh = NULL;
+	}
+	if (!new_bh) {
+		error = bfs_add_entry(new_dir, 
+					new_dentry->d_name.name,
+					new_dentry->d_name.len,
+					old_inode->i_ino);
+		if (error)
+			goto end_rename;
+	}
+	old_de->ino = 0;
+	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(old_dir);
+	if (new_inode) {
+		new_inode->i_ctime = CURRENT_TIME_SEC;
+		inode_dec_link_count(new_inode);
+	}
+	mark_buffer_dirty_inode(old_bh, old_dir);
+	error = 0;
+
+end_rename:
+	mutex_unlock(&info->bfs_lock);
+	brelse(old_bh);
+	brelse(new_bh);
+	return error;
+}
+
+const struct inode_operations bfs_dir_inops = {
+	.create			= bfs_create,
+	.lookup			= bfs_lookup,
+	.link			= bfs_link,
+	.unlink			= bfs_unlink,
+	.rename			= bfs_rename,
+};
+
+static int bfs_add_entry(struct inode *dir, const unsigned char *name,
+							int namelen, int ino)
+{
+	struct buffer_head *bh;
+	struct bfs_dirent *de;
+	int block, sblock, eblock, off, pos;
+	int i;
+
+	dprintf("name=%s, namelen=%d\n", name, namelen);
+
+	if (!namelen)
+		return -ENOENT;
+	if (namelen > BFS_NAMELEN)
+		return -ENAMETOOLONG;
+
+	sblock = BFS_I(dir)->i_sblock;
+	eblock = BFS_I(dir)->i_eblock;
+	for (block = sblock; block <= eblock; block++) {
+		bh = sb_bread(dir->i_sb, block);
+		if (!bh)
+			return -EIO;
+		for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) {
+			de = (struct bfs_dirent *)(bh->b_data + off);
+			if (!de->ino) {
+				pos = (block - sblock) * BFS_BSIZE + off;
+				if (pos >= dir->i_size) {
+					dir->i_size += BFS_DIRENT_SIZE;
+					dir->i_ctime = CURRENT_TIME_SEC;
+				}
+				dir->i_mtime = CURRENT_TIME_SEC;
+				mark_inode_dirty(dir);
+				de->ino = cpu_to_le16((u16)ino);
+				for (i = 0; i < BFS_NAMELEN; i++)
+					de->name[i] =
+						(i < namelen) ? name[i] : 0;
+				mark_buffer_dirty_inode(bh, dir);
+				brelse(bh);
+				return 0;
+			}
+		}
+		brelse(bh);
+	}
+	return -ENOSPC;
+}
+
+static inline int bfs_namecmp(int len, const unsigned char *name,
+							const char *buffer)
+{
+	if ((len < BFS_NAMELEN) && buffer[len])
+		return 0;
+	return !memcmp(name, buffer, len);
+}
+
+static struct buffer_head *bfs_find_entry(struct inode *dir,
+			const unsigned char *name, int namelen,
+			struct bfs_dirent **res_dir)
+{
+	unsigned long block = 0, offset = 0;
+	struct buffer_head *bh = NULL;
+	struct bfs_dirent *de;
+
+	*res_dir = NULL;
+	if (namelen > BFS_NAMELEN)
+		return NULL;
+
+	while (block * BFS_BSIZE + offset < dir->i_size) {
+		if (!bh) {
+			bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block);
+			if (!bh) {
+				block++;
+				continue;
+			}
+		}
+		de = (struct bfs_dirent *)(bh->b_data + offset);
+		offset += BFS_DIRENT_SIZE;
+		if (le16_to_cpu(de->ino) &&
+				bfs_namecmp(namelen, name, de->name)) {
+			*res_dir = de;
+			return bh;
+		}
+		if (offset < bh->b_size)
+			continue;
+		brelse(bh);
+		bh = NULL;
+		offset = 0;
+		block++;
+	}
+	brelse(bh);
+	return NULL;
+}
diff --git a/kernel/fs/bfs/file.c b/kernel/fs/bfs/file.c
new file mode 100644
index 000000000..97f1b5160
--- /dev/null
+++ b/kernel/fs/bfs/file.c
@@ -0,0 +1,197 @@
+/*
+ *	fs/bfs/file.c
+ *	BFS file operations.
+ *	Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
+ *
+ *	Make the file block allocation algorithm understand the size
+ *	of the underlying block device.
+ *	Copyright (C) 2007 Dmitri Vorobiev <dmitri.vorobiev@gmail.com>
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include "bfs.h"
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define dprintf(x...)	printf(x)
+#else
+#define dprintf(x...)
+#endif
+
+const struct file_operations bfs_file_operations = {
+	.llseek 	= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.splice_read	= generic_file_splice_read,
+};
+
+static int bfs_move_block(unsigned long from, unsigned long to,
+					struct super_block *sb)
+{
+	struct buffer_head *bh, *new;
+
+	bh = sb_bread(sb, from);
+	if (!bh)
+		return -EIO;
+	new = sb_getblk(sb, to);
+	memcpy(new->b_data, bh->b_data, bh->b_size);
+	mark_buffer_dirty(new);
+	bforget(bh);
+	brelse(new);
+	return 0;
+}
+
+static int bfs_move_blocks(struct super_block *sb, unsigned long start,
+				unsigned long end, unsigned long where)
+{
+	unsigned long i;
+
+	dprintf("%08lx-%08lx->%08lx\n", start, end, where);
+	for (i = start; i <= end; i++)
+		if(bfs_move_block(i, where + i, sb)) {
+			dprintf("failed to move block %08lx -> %08lx\n", i,
+								where + i);
+			return -EIO;
+		}
+	return 0;
+}
+
+static int bfs_get_block(struct inode *inode, sector_t block,
+			struct buffer_head *bh_result, int create)
+{
+	unsigned long phys;
+	int err;
+	struct super_block *sb = inode->i_sb;
+	struct bfs_sb_info *info = BFS_SB(sb);
+	struct bfs_inode_info *bi = BFS_I(inode);
+
+	phys = bi->i_sblock + block;
+	if (!create) {
+		if (phys <= bi->i_eblock) {
+			dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n",
+                                create, (unsigned long)block, phys);
+			map_bh(bh_result, sb, phys);
+		}
+		return 0;
+	}
+
+	/*
+	 * If the file is not empty and the requested block is within the
+	 * range of blocks allocated for this file, we can grant it.
+	 */
+	if (bi->i_sblock && (phys <= bi->i_eblock)) {
+		dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", 
+				create, (unsigned long)block, phys);
+		map_bh(bh_result, sb, phys);
+		return 0;
+	}
+
+	/* The file will be extended, so let's see if there is enough space. */
+	if (phys >= info->si_blocks)
+		return -ENOSPC;
+
+	/* The rest has to be protected against itself. */
+	mutex_lock(&info->bfs_lock);
+
+	/*
+	 * If the last data block for this file is the last allocated
+	 * block, we can extend the file trivially, without moving it
+	 * anywhere.
+	 */
+	if (bi->i_eblock == info->si_lf_eblk) {
+		dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", 
+				create, (unsigned long)block, phys);
+		map_bh(bh_result, sb, phys);
+		info->si_freeb -= phys - bi->i_eblock;
+		info->si_lf_eblk = bi->i_eblock = phys;
+		mark_inode_dirty(inode);
+		err = 0;
+		goto out;
+	}
+
+	/* Ok, we have to move this entire file to the next free block. */
+	phys = info->si_lf_eblk + 1;
+	if (phys + block >= info->si_blocks) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	if (bi->i_sblock) {
+		err = bfs_move_blocks(inode->i_sb, bi->i_sblock, 
+						bi->i_eblock, phys);
+		if (err) {
+			dprintf("failed to move ino=%08lx -> fs corruption\n",
+								inode->i_ino);
+			goto out;
+		}
+	} else
+		err = 0;
+
+	dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n",
+                create, (unsigned long)block, phys);
+	bi->i_sblock = phys;
+	phys += block;
+	info->si_lf_eblk = bi->i_eblock = phys;
+
+	/*
+	 * This assumes nothing can write the inode back while we are here
+	 * and thus update inode->i_blocks! (XXX)
+	 */
+	info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks;
+	mark_inode_dirty(inode);
+	map_bh(bh_result, sb, phys);
+out:
+	mutex_unlock(&info->bfs_lock);
+	return err;
+}
+
+static int bfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, bfs_get_block, wbc);
+}
+
+static int bfs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, bfs_get_block);
+}
+
+static void bfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size)
+		truncate_pagecache(inode, inode->i_size);
+}
+
+static int bfs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				bfs_get_block);
+	if (unlikely(ret))
+		bfs_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, bfs_get_block);
+}
+
+const struct address_space_operations bfs_aops = {
+	.readpage	= bfs_readpage,
+	.writepage	= bfs_writepage,
+	.write_begin	= bfs_write_begin,
+	.write_end	= generic_write_end,
+	.bmap		= bfs_bmap,
+};
+
+const struct inode_operations bfs_file_inops;
diff --git a/kernel/fs/bfs/inode.c b/kernel/fs/bfs/inode.c
new file mode 100644
index 000000000..fdcb4d69f
--- /dev/null
+++ b/kernel/fs/bfs/inode.c
@@ -0,0 +1,499 @@
+/*
+ *	fs/bfs/inode.c
+ *	BFS superblock and inode operations.
+ *	Copyright (C) 1999-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *	From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
+ *
+ *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/writeback.h>
+#include <linux/uio.h>
+#include <asm/uaccess.h>
+#include "bfs.h"
+
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux");
+MODULE_LICENSE("GPL");
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define dprintf(x...)	printf(x)
+#else
+#define dprintf(x...)
+#endif
+
+struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct bfs_inode *di;
+	struct inode *inode;
+	struct buffer_head *bh;
+	int block, off;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
+		printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino);
+		goto error;
+	}
+
+	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
+	bh = sb_bread(inode->i_sb, block);
+	if (!bh) {
+		printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id,
+									ino);
+		goto error;
+	}
+
+	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
+	di = (struct bfs_inode *)bh->b_data + off;
+
+	inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode);
+	if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
+		inode->i_mode |= S_IFDIR;
+		inode->i_op = &bfs_dir_inops;
+		inode->i_fop = &bfs_dir_operations;
+	} else if (le32_to_cpu(di->i_vtype) == BFS_VREG) {
+		inode->i_mode |= S_IFREG;
+		inode->i_op = &bfs_file_inops;
+		inode->i_fop = &bfs_file_operations;
+		inode->i_mapping->a_ops = &bfs_aops;
+	}
+
+	BFS_I(inode)->i_sblock =  le32_to_cpu(di->i_sblock);
+	BFS_I(inode)->i_eblock =  le32_to_cpu(di->i_eblock);
+	BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
+	i_uid_write(inode, le32_to_cpu(di->i_uid));
+	i_gid_write(inode,  le32_to_cpu(di->i_gid));
+	set_nlink(inode, le32_to_cpu(di->i_nlink));
+	inode->i_size = BFS_FILESIZE(di);
+	inode->i_blocks = BFS_FILEBLOCKS(di);
+	inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
+	inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
+	inode->i_ctime.tv_sec =  le32_to_cpu(di->i_ctime);
+	inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+
+	brelse(bh);
+	unlock_new_inode(inode);
+	return inode;
+
+error:
+	iget_failed(inode);
+	return ERR_PTR(-EIO);
+}
+
+static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p)
+{
+	if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) {
+		printf("Bad inode number %s:%08x\n", sb->s_id, ino);
+		return ERR_PTR(-EIO);
+	}
+
+	ino -= BFS_ROOT_INO;
+
+	*p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK);
+	if (!*p) {
+		printf("Unable to read inode %s:%08x\n", sb->s_id, ino);
+		return ERR_PTR(-EIO);
+	}
+
+	return (struct bfs_inode *)(*p)->b_data +  ino % BFS_INODES_PER_BLOCK;
+}
+
+static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
+	unsigned int ino = (u16)inode->i_ino;
+        unsigned long i_sblock;
+	struct bfs_inode *di;
+	struct buffer_head *bh;
+	int err = 0;
+
+        dprintf("ino=%08x\n", ino);
+
+	di = find_inode(inode->i_sb, ino, &bh);
+	if (IS_ERR(di))
+		return PTR_ERR(di);
+
+	mutex_lock(&info->bfs_lock);
+
+	if (ino == BFS_ROOT_INO)
+		di->i_vtype = cpu_to_le32(BFS_VDIR);
+	else
+		di->i_vtype = cpu_to_le32(BFS_VREG);
+
+	di->i_ino = cpu_to_le16(ino);
+	di->i_mode = cpu_to_le32(inode->i_mode);
+	di->i_uid = cpu_to_le32(i_uid_read(inode));
+	di->i_gid = cpu_to_le32(i_gid_read(inode));
+	di->i_nlink = cpu_to_le32(inode->i_nlink);
+	di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+        i_sblock = BFS_I(inode)->i_sblock;
+	di->i_sblock = cpu_to_le32(i_sblock);
+	di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
+	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
+
+	mark_buffer_dirty(bh);
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+	brelse(bh);
+	mutex_unlock(&info->bfs_lock);
+	return err;
+}
+
+static void bfs_evict_inode(struct inode *inode)
+{
+	unsigned long ino = inode->i_ino;
+	struct bfs_inode *di;
+	struct buffer_head *bh;
+	struct super_block *s = inode->i_sb;
+	struct bfs_sb_info *info = BFS_SB(s);
+	struct bfs_inode_info *bi = BFS_I(inode);
+
+	dprintf("ino=%08lx\n", ino);
+
+	truncate_inode_pages_final(&inode->i_data);
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+
+	if (inode->i_nlink)
+		return;
+
+	di = find_inode(s, inode->i_ino, &bh);
+	if (IS_ERR(di))
+		return;
+
+	mutex_lock(&info->bfs_lock);
+	/* clear on-disk inode */
+	memset(di, 0, sizeof(struct bfs_inode));
+	mark_buffer_dirty(bh);
+	brelse(bh);
+
+        if (bi->i_dsk_ino) {
+		if (bi->i_sblock)
+			info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
+		info->si_freei++;
+		clear_bit(ino, info->si_imap);
+		bfs_dump_imap("delete_inode", s);
+        }
+
+	/*
+	 * If this was the last file, make the previous block
+	 * "last block of the last file" even if there is no
+	 * real file there, saves us 1 gap.
+	 */
+	if (info->si_lf_eblk == bi->i_eblock)
+		info->si_lf_eblk = bi->i_sblock - 1;
+	mutex_unlock(&info->bfs_lock);
+}
+
+static void bfs_put_super(struct super_block *s)
+{
+	struct bfs_sb_info *info = BFS_SB(s);
+
+	if (!info)
+		return;
+
+	mutex_destroy(&info->bfs_lock);
+	kfree(info->si_imap);
+	kfree(info);
+	s->s_fs_info = NULL;
+}
+
+static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *s = dentry->d_sb;
+	struct bfs_sb_info *info = BFS_SB(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
+	buf->f_type = BFS_MAGIC;
+	buf->f_bsize = s->s_blocksize;
+	buf->f_blocks = info->si_blocks;
+	buf->f_bfree = buf->f_bavail = info->si_freeb;
+	buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO;
+	buf->f_ffree = info->si_freei;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = BFS_NAMELEN;
+	return 0;
+}
+
+static struct kmem_cache *bfs_inode_cachep;
+
+static struct inode *bfs_alloc_inode(struct super_block *sb)
+{
+	struct bfs_inode_info *bi;
+	bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL);
+	if (!bi)
+		return NULL;
+	return &bi->vfs_inode;
+}
+
+static void bfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
+}
+
+static void bfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, bfs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct bfs_inode_info *bi = foo;
+
+	inode_init_once(&bi->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
+					     sizeof(struct bfs_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (bfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(bfs_inode_cachep);
+}
+
+static const struct super_operations bfs_sops = {
+	.alloc_inode	= bfs_alloc_inode,
+	.destroy_inode	= bfs_destroy_inode,
+	.write_inode	= bfs_write_inode,
+	.evict_inode	= bfs_evict_inode,
+	.put_super	= bfs_put_super,
+	.statfs		= bfs_statfs,
+};
+
+void bfs_dump_imap(const char *prefix, struct super_block *s)
+{
+#ifdef DEBUG
+	int i;
+	char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL);
+
+	if (!tmpbuf)
+		return;
+	for (i = BFS_SB(s)->si_lasti; i >= 0; i--) {
+		if (i > PAGE_SIZE - 100) break;
+		if (test_bit(i, BFS_SB(s)->si_imap))
+			strcat(tmpbuf, "1");
+		else
+			strcat(tmpbuf, "0");
+	}
+	printf("BFS-fs: %s: lasti=%08lx <%s>\n",
+				prefix, BFS_SB(s)->si_lasti, tmpbuf);
+	free_page((unsigned long)tmpbuf);
+#endif
+}
+
+static int bfs_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct buffer_head *bh, *sbh;
+	struct bfs_super_block *bfs_sb;
+	struct inode *inode;
+	unsigned i, imap_len;
+	struct bfs_sb_info *info;
+	int ret = -EINVAL;
+	unsigned long i_sblock, i_eblock, i_eoff, s_size;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+	mutex_init(&info->bfs_lock);
+	s->s_fs_info = info;
+
+	sb_set_blocksize(s, BFS_BSIZE);
+
+	sbh = sb_bread(s, 0);
+	if (!sbh)
+		goto out;
+	bfs_sb = (struct bfs_super_block *)sbh->b_data;
+	if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
+		if (!silent)
+			printf("No BFS filesystem on %s (magic=%08x)\n", 
+				s->s_id,  le32_to_cpu(bfs_sb->s_magic));
+		goto out1;
+	}
+	if (BFS_UNCLEAN(bfs_sb, s) && !silent)
+		printf("%s is unclean, continuing\n", s->s_id);
+
+	s->s_magic = BFS_MAGIC;
+
+	if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
+		printf("Superblock is corrupted\n");
+		goto out1;
+	}
+
+	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
+					sizeof(struct bfs_inode)
+					+ BFS_ROOT_INO - 1;
+	imap_len = (info->si_lasti / 8) + 1;
+	info->si_imap = kzalloc(imap_len, GFP_KERNEL);
+	if (!info->si_imap)
+		goto out1;
+	for (i = 0; i < BFS_ROOT_INO; i++)
+		set_bit(i, info->si_imap);
+
+	s->s_op = &bfs_sops;
+	inode = bfs_iget(s, BFS_ROOT_INO);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto out2;
+	}
+	s->s_root = d_make_root(inode);
+	if (!s->s_root) {
+		ret = -ENOMEM;
+		goto out2;
+	}
+
+	info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
+	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1
+			- le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
+	info->si_freei = 0;
+	info->si_lf_eblk = 0;
+
+	/* can we read the last block? */
+	bh = sb_bread(s, info->si_blocks - 1);
+	if (!bh) {
+		printf("Last block not available: %lu\n", info->si_blocks - 1);
+		ret = -EIO;
+		goto out3;
+	}
+	brelse(bh);
+
+	bh = NULL;
+	for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
+		struct bfs_inode *di;
+		int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
+		int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
+		unsigned long eblock;
+
+		if (!off) {
+			brelse(bh);
+			bh = sb_bread(s, block);
+		}
+
+		if (!bh)
+			continue;
+
+		di = (struct bfs_inode *)bh->b_data + off;
+
+		/* test if filesystem is not corrupted */
+
+		i_eoff = le32_to_cpu(di->i_eoffset);
+		i_sblock = le32_to_cpu(di->i_sblock);
+		i_eblock = le32_to_cpu(di->i_eblock);
+		s_size = le32_to_cpu(bfs_sb->s_end);
+
+		if (i_sblock > info->si_blocks ||
+			i_eblock > info->si_blocks ||
+			i_sblock > i_eblock ||
+			i_eoff > s_size ||
+			i_sblock * BFS_BSIZE > i_eoff) {
+
+			printf("Inode 0x%08x corrupted\n", i);
+
+			brelse(bh);
+			ret = -EIO;
+			goto out3;
+		}
+
+		if (!di->i_ino) {
+			info->si_freei++;
+			continue;
+		}
+		set_bit(i, info->si_imap);
+		info->si_freeb -= BFS_FILEBLOCKS(di);
+
+		eblock =  le32_to_cpu(di->i_eblock);
+		if (eblock > info->si_lf_eblk)
+			info->si_lf_eblk = eblock;
+	}
+	brelse(bh);
+	brelse(sbh);
+	bfs_dump_imap("read_super", s);
+	return 0;
+
+out3:
+	dput(s->s_root);
+	s->s_root = NULL;
+out2:
+	kfree(info->si_imap);
+out1:
+	brelse(sbh);
+out:
+	mutex_destroy(&info->bfs_lock);
+	kfree(info);
+	s->s_fs_info = NULL;
+	return ret;
+}
+
+static struct dentry *bfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+}
+
+static struct file_system_type bfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "bfs",
+	.mount		= bfs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("bfs");
+
+static int __init init_bfs_fs(void)
+{
+	int err = init_inodecache();
+	if (err)
+		goto out1;
+        err = register_filesystem(&bfs_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	return err;
+}
+
+static void __exit exit_bfs_fs(void)
+{
+	unregister_filesystem(&bfs_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_bfs_fs)
+module_exit(exit_bfs_fs)
diff --git a/kernel/fs/binfmt_aout.c b/kernel/fs/binfmt_aout.c
new file mode 100644
index 000000000..4c556680f
--- /dev/null
+++ b/kernel/fs/binfmt_aout.c
@@ -0,0 +1,423 @@
+/*
+ *  linux/fs/binfmt_aout.c
+ *
+ *  Copyright (C) 1991, 1992, 1996  Linus Torvalds
+ */
+
+#include <linux/module.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/a.out.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/binfmts.h>
+#include <linux/personality.h>
+#include <linux/init.h>
+#include <linux/coredump.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/a.out-core.h>
+
+static int load_aout_binary(struct linux_binprm *);
+static int load_aout_library(struct file*);
+
+#ifdef CONFIG_COREDUMP
+/*
+ * Routine writes a core dump image in the current directory.
+ * Currently only a stub-function.
+ *
+ * Note that setuid/setgid files won't make a core-dump if the uid/gid
+ * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
+ * field, which also makes sure the core-dumps won't be recursive if the
+ * dumping of the process results in another error..
+ */
+static int aout_core_dump(struct coredump_params *cprm)
+{
+	mm_segment_t fs;
+	int has_dumped = 0;
+	void __user *dump_start;
+	int dump_size;
+	struct user dump;
+#ifdef __alpha__
+#       define START_DATA(u)	((void __user *)u.start_data)
+#else
+#	define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+				 u.start_code))
+#endif
+#       define START_STACK(u)   ((void __user *)u.start_stack)
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	has_dumped = 1;
+       	strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
+	dump.u_ar0 = offsetof(struct user, regs);
+	dump.signal = cprm->siginfo->si_signo;
+	aout_dump_thread(cprm->regs, &dump);
+
+/* If the size of the dump file exceeds the rlimit, then see what would happen
+   if we wrote the stack, but not the data area.  */
+	if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
+		dump.u_dsize = 0;
+
+/* Make sure we have enough room to write the stack and data areas. */
+	if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
+		dump.u_ssize = 0;
+
+/* make sure we actually have a data and stack area to dump */
+	set_fs(USER_DS);
+	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+		dump.u_dsize = 0;
+	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+		dump.u_ssize = 0;
+
+	set_fs(KERNEL_DS);
+/* struct user */
+	if (!dump_emit(cprm, &dump, sizeof(dump)))
+		goto end_coredump;
+/* Now dump all of the user data.  Include malloced stuff as well */
+	if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
+		goto end_coredump;
+/* now we start writing out the user space info */
+	set_fs(USER_DS);
+/* Dump the data area */
+	if (dump.u_dsize != 0) {
+		dump_start = START_DATA(dump);
+		dump_size = dump.u_dsize << PAGE_SHIFT;
+		if (!dump_emit(cprm, dump_start, dump_size))
+			goto end_coredump;
+	}
+/* Now prepare to dump the stack area */
+	if (dump.u_ssize != 0) {
+		dump_start = START_STACK(dump);
+		dump_size = dump.u_ssize << PAGE_SHIFT;
+		if (!dump_emit(cprm, dump_start, dump_size))
+			goto end_coredump;
+	}
+end_coredump:
+	set_fs(fs);
+	return has_dumped;
+}
+#else
+#define aout_core_dump NULL
+#endif
+
+static struct linux_binfmt aout_format = {
+	.module		= THIS_MODULE,
+	.load_binary	= load_aout_binary,
+	.load_shlib	= load_aout_library,
+	.core_dump	= aout_core_dump,
+	.min_coredump	= PAGE_SIZE
+};
+
+#define BAD_ADDR(x)	((unsigned long)(x) >= TASK_SIZE)
+
+static int set_brk(unsigned long start, unsigned long end)
+{
+	start = PAGE_ALIGN(start);
+	end = PAGE_ALIGN(end);
+	if (end > start) {
+		unsigned long addr;
+		addr = vm_brk(start, end - start);
+		if (BAD_ADDR(addr))
+			return addr;
+	}
+	return 0;
+}
+
+/*
+ * create_aout_tables() parses the env- and arg-strings in new user
+ * memory and creates the pointer tables from them, and puts their
+ * addresses on the "stack", returning the new stack pointer value.
+ */
+static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm)
+{
+	char __user * __user *argv;
+	char __user * __user *envp;
+	unsigned long __user *sp;
+	int argc = bprm->argc;
+	int envc = bprm->envc;
+
+	sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
+#ifdef __alpha__
+/* whee.. test-programs are so much fun. */
+	put_user(0, --sp);
+	put_user(0, --sp);
+	if (bprm->loader) {
+		put_user(0, --sp);
+		put_user(1003, --sp);
+		put_user(bprm->loader, --sp);
+		put_user(1002, --sp);
+	}
+	put_user(bprm->exec, --sp);
+	put_user(1001, --sp);
+#endif
+	sp -= envc+1;
+	envp = (char __user * __user *) sp;
+	sp -= argc+1;
+	argv = (char __user * __user *) sp;
+#ifndef __alpha__
+	put_user((unsigned long) envp,--sp);
+	put_user((unsigned long) argv,--sp);
+#endif
+	put_user(argc,--sp);
+	current->mm->arg_start = (unsigned long) p;
+	while (argc-->0) {
+		char c;
+		put_user(p,argv++);
+		do {
+			get_user(c,p++);
+		} while (c);
+	}
+	put_user(NULL,argv);
+	current->mm->arg_end = current->mm->env_start = (unsigned long) p;
+	while (envc-->0) {
+		char c;
+		put_user(p,envp++);
+		do {
+			get_user(c,p++);
+		} while (c);
+	}
+	put_user(NULL,envp);
+	current->mm->env_end = (unsigned long) p;
+	return sp;
+}
+
+/*
+ * These are the functions used to load a.out style executables and shared
+ * libraries.  There is no binary dependent code anywhere else.
+ */
+
+static int load_aout_binary(struct linux_binprm * bprm)
+{
+	struct pt_regs *regs = current_pt_regs();
+	struct exec ex;
+	unsigned long error;
+	unsigned long fd_offset;
+	unsigned long rlim;
+	int retval;
+
+	ex = *((struct exec *) bprm->buf);		/* exec-header */
+	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
+	     N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
+	    N_TRSIZE(ex) || N_DRSIZE(ex) ||
+	    i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+		return -ENOEXEC;
+	}
+
+	/*
+	 * Requires a mmap handler. This prevents people from using a.out
+	 * as part of an exploit attack against /proc-related vulnerabilities.
+	 */
+	if (!bprm->file->f_op->mmap)
+		return -ENOEXEC;
+
+	fd_offset = N_TXTOFF(ex);
+
+	/* Check initial limits. This avoids letting people circumvent
+	 * size limits imposed on them by creating programs with large
+	 * arrays in the data or bss.
+	 */
+	rlim = rlimit(RLIMIT_DATA);
+	if (rlim >= RLIM_INFINITY)
+		rlim = ~0;
+	if (ex.a_data + ex.a_bss > rlim)
+		return -ENOMEM;
+
+	/* Flush all traces of the currently running executable */
+	retval = flush_old_exec(bprm);
+	if (retval)
+		return retval;
+
+	/* OK, This is the point of no return */
+#ifdef __alpha__
+	SET_AOUT_PERSONALITY(bprm, ex);
+#else
+	set_personality(PER_LINUX);
+#endif
+	setup_new_exec(bprm);
+
+	current->mm->end_code = ex.a_text +
+		(current->mm->start_code = N_TXTADDR(ex));
+	current->mm->end_data = ex.a_data +
+		(current->mm->start_data = N_DATADDR(ex));
+	current->mm->brk = ex.a_bss +
+		(current->mm->start_brk = N_BSSADDR(ex));
+
+	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+	if (retval < 0)
+		return retval;
+
+	install_exec_creds(bprm);
+
+	if (N_MAGIC(ex) == OMAGIC) {
+		unsigned long text_addr, map_size;
+		loff_t pos;
+
+		text_addr = N_TXTADDR(ex);
+
+#ifdef __alpha__
+		pos = fd_offset;
+		map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
+#else
+		pos = 32;
+		map_size = ex.a_text+ex.a_data;
+#endif
+		error = vm_brk(text_addr & PAGE_MASK, map_size);
+		if (error != (text_addr & PAGE_MASK))
+			return error;
+
+		error = read_code(bprm->file, text_addr, pos,
+				  ex.a_text+ex.a_data);
+		if ((signed long)error < 0)
+			return error;
+	} else {
+		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
+		    (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
+		{
+			printk(KERN_NOTICE "executable not page aligned\n");
+		}
+
+		if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
+		{
+			printk(KERN_WARNING 
+			       "fd_offset is not page aligned. Please convert program: %pD\n",
+			       bprm->file);
+		}
+
+		if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
+			vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+			read_code(bprm->file, N_TXTADDR(ex), fd_offset,
+				  ex.a_text + ex.a_data);
+			goto beyond_if;
+		}
+
+		error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
+			PROT_READ | PROT_EXEC,
+			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
+			fd_offset);
+
+		if (error != N_TXTADDR(ex))
+			return error;
+
+		error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
+				PROT_READ | PROT_WRITE | PROT_EXEC,
+				MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
+				fd_offset + ex.a_text);
+		if (error != N_DATADDR(ex))
+			return error;
+	}
+beyond_if:
+	set_binfmt(&aout_format);
+
+	retval = set_brk(current->mm->start_brk, current->mm->brk);
+	if (retval < 0)
+		return retval;
+
+	current->mm->start_stack =
+		(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
+#ifdef __alpha__
+	regs->gp = ex.a_gpvalue;
+#endif
+	start_thread(regs, ex.a_entry, current->mm->start_stack);
+	return 0;
+}
+
+static int load_aout_library(struct file *file)
+{
+	struct inode * inode;
+	unsigned long bss, start_addr, len;
+	unsigned long error;
+	int retval;
+	struct exec ex;
+
+	inode = file_inode(file);
+
+	retval = -ENOEXEC;
+	error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
+	if (error != sizeof(ex))
+		goto out;
+
+	/* We come in here for the regular a.out style of shared libraries */
+	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
+	    N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
+	    i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+		goto out;
+	}
+
+	/*
+	 * Requires a mmap handler. This prevents people from using a.out
+	 * as part of an exploit attack against /proc-related vulnerabilities.
+	 */
+	if (!file->f_op->mmap)
+		goto out;
+
+	if (N_FLAGS(ex))
+		goto out;
+
+	/* For  QMAGIC, the starting address is 0x20 into the page.  We mask
+	   this off to get the starting address for the page */
+
+	start_addr =  ex.a_entry & 0xfffff000;
+
+	if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
+		if (printk_ratelimit())
+		{
+			printk(KERN_WARNING 
+			       "N_TXTOFF is not page aligned. Please convert library: %pD\n",
+			       file);
+		}
+		vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
+		
+		read_code(file, start_addr, N_TXTOFF(ex),
+			  ex.a_text + ex.a_data);
+		retval = 0;
+		goto out;
+	}
+	/* Now use mmap to map the library into memory. */
+	error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
+			PROT_READ | PROT_WRITE | PROT_EXEC,
+			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
+			N_TXTOFF(ex));
+	retval = error;
+	if (error != start_addr)
+		goto out;
+
+	len = PAGE_ALIGN(ex.a_text + ex.a_data);
+	bss = ex.a_text + ex.a_data + ex.a_bss;
+	if (bss > len) {
+		error = vm_brk(start_addr + len, bss - len);
+		retval = error;
+		if (error != start_addr + len)
+			goto out;
+	}
+	retval = 0;
+out:
+	return retval;
+}
+
+static int __init init_aout_binfmt(void)
+{
+	register_binfmt(&aout_format);
+	return 0;
+}
+
+static void __exit exit_aout_binfmt(void)
+{
+	unregister_binfmt(&aout_format);
+}
+
+core_initcall(init_aout_binfmt);
+module_exit(exit_aout_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/binfmt_elf.c b/kernel/fs/binfmt_elf.c
new file mode 100644
index 000000000..cd46e4158
--- /dev/null
+++ b/kernel/fs/binfmt_elf.c
@@ -0,0 +1,2326 @@
+/*
+ * linux/fs/binfmt_elf.c
+ *
+ * These are the functions used to load ELF format executables as used
+ * on SVr4 machines.  Information on the format may be found in the book
+ * "UNIX SYSTEM V RELEASE 4 Programmers Guide: Ansi C and Programming Support
+ * Tools".
+ *
+ * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com).
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/binfmts.h>
+#include <linux/string.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/personality.h>
+#include <linux/elfcore.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/security.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/utsname.h>
+#include <linux/coredump.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+#include <asm/param.h>
+#include <asm/page.h>
+
+#ifndef user_long_t
+#define user_long_t long
+#endif
+#ifndef user_siginfo_t
+#define user_siginfo_t siginfo_t
+#endif
+
+static int load_elf_binary(struct linux_binprm *bprm);
+static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
+				int, int, unsigned long);
+
+#ifdef CONFIG_USELIB
+static int load_elf_library(struct file *);
+#else
+#define load_elf_library NULL
+#endif
+
+/*
+ * If we don't support core dumping, then supply a NULL so we
+ * don't even try.
+ */
+#ifdef CONFIG_ELF_CORE
+static int elf_core_dump(struct coredump_params *cprm);
+#else
+#define elf_core_dump	NULL
+#endif
+
+#if ELF_EXEC_PAGESIZE > PAGE_SIZE
+#define ELF_MIN_ALIGN	ELF_EXEC_PAGESIZE
+#else
+#define ELF_MIN_ALIGN	PAGE_SIZE
+#endif
+
+#ifndef ELF_CORE_EFLAGS
+#define ELF_CORE_EFLAGS	0
+#endif
+
+#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
+#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
+
+static struct linux_binfmt elf_format = {
+	.module		= THIS_MODULE,
+	.load_binary	= load_elf_binary,
+	.load_shlib	= load_elf_library,
+	.core_dump	= elf_core_dump,
+	.min_coredump	= ELF_EXEC_PAGESIZE,
+};
+
+#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+
+static int set_brk(unsigned long start, unsigned long end)
+{
+	start = ELF_PAGEALIGN(start);
+	end = ELF_PAGEALIGN(end);
+	if (end > start) {
+		unsigned long addr;
+		addr = vm_brk(start, end - start);
+		if (BAD_ADDR(addr))
+			return addr;
+	}
+	current->mm->start_brk = current->mm->brk = end;
+	return 0;
+}
+
+/* We need to explicitly zero any fractional pages
+   after the data section (i.e. bss).  This would
+   contain the junk from the file that should not
+   be in memory
+ */
+static int padzero(unsigned long elf_bss)
+{
+	unsigned long nbyte;
+
+	nbyte = ELF_PAGEOFFSET(elf_bss);
+	if (nbyte) {
+		nbyte = ELF_MIN_ALIGN - nbyte;
+		if (clear_user((void __user *) elf_bss, nbyte))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+/* Let's use some macros to make this stack manipulation a little clearer */
+#ifdef CONFIG_STACK_GROWSUP
+#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items))
+#define STACK_ROUND(sp, items) \
+	((15 + (unsigned long) ((sp) + (items))) &~ 15UL)
+#define STACK_ALLOC(sp, len) ({ \
+	elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \
+	old_sp; })
+#else
+#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items))
+#define STACK_ROUND(sp, items) \
+	(((unsigned long) (sp - items)) &~ 15UL)
+#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
+#endif
+
+#ifndef ELF_BASE_PLATFORM
+/*
+ * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+ * will be copied to the user stack in the same manner as AT_PLATFORM.
+ */
+#define ELF_BASE_PLATFORM NULL
+#endif
+
+static int
+create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
+		unsigned long load_addr, unsigned long interp_load_addr)
+{
+	unsigned long p = bprm->p;
+	int argc = bprm->argc;
+	int envc = bprm->envc;
+	elf_addr_t __user *argv;
+	elf_addr_t __user *envp;
+	elf_addr_t __user *sp;
+	elf_addr_t __user *u_platform;
+	elf_addr_t __user *u_base_platform;
+	elf_addr_t __user *u_rand_bytes;
+	const char *k_platform = ELF_PLATFORM;
+	const char *k_base_platform = ELF_BASE_PLATFORM;
+	unsigned char k_rand_bytes[16];
+	int items;
+	elf_addr_t *elf_info;
+	int ei_index = 0;
+	const struct cred *cred = current_cred();
+	struct vm_area_struct *vma;
+
+	/*
+	 * In some cases (e.g. Hyper-Threading), we want to avoid L1
+	 * evictions by the processes running on the same package. One
+	 * thing we can do is to shuffle the initial stack for them.
+	 */
+
+	p = arch_align_stack(p);
+
+	/*
+	 * If this architecture has a platform capability string, copy it
+	 * to userspace.  In some cases (Sparc), this info is impossible
+	 * for userspace to get any other way, in others (i386) it is
+	 * merely difficult.
+	 */
+	u_platform = NULL;
+	if (k_platform) {
+		size_t len = strlen(k_platform) + 1;
+
+		u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
+		if (__copy_to_user(u_platform, k_platform, len))
+			return -EFAULT;
+	}
+
+	/*
+	 * If this architecture has a "base" platform capability
+	 * string, copy it to userspace.
+	 */
+	u_base_platform = NULL;
+	if (k_base_platform) {
+		size_t len = strlen(k_base_platform) + 1;
+
+		u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
+		if (__copy_to_user(u_base_platform, k_base_platform, len))
+			return -EFAULT;
+	}
+
+	/*
+	 * Generate 16 random bytes for userspace PRNG seeding.
+	 */
+	get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+	u_rand_bytes = (elf_addr_t __user *)
+		       STACK_ALLOC(p, sizeof(k_rand_bytes));
+	if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+		return -EFAULT;
+
+	/* Create the ELF interpreter info */
+	elf_info = (elf_addr_t *)current->mm->saved_auxv;
+	/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
+#define NEW_AUX_ENT(id, val) \
+	do { \
+		elf_info[ei_index++] = id; \
+		elf_info[ei_index++] = val; \
+	} while (0)
+
+#ifdef ARCH_DLINFO
+	/* 
+	 * ARCH_DLINFO must come first so PPC can do its special alignment of
+	 * AUXV.
+	 * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
+	 * ARCH_DLINFO changes
+	 */
+	ARCH_DLINFO;
+#endif
+	NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
+	NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
+	NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
+	NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+	NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
+	NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
+	NEW_AUX_ENT(AT_BASE, interp_load_addr);
+	NEW_AUX_ENT(AT_FLAGS, 0);
+	NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
+	NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
+	NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
+	NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid));
+	NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
+ 	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
+#ifdef ELF_HWCAP2
+	NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+#endif
+	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
+	if (k_platform) {
+		NEW_AUX_ENT(AT_PLATFORM,
+			    (elf_addr_t)(unsigned long)u_platform);
+	}
+	if (k_base_platform) {
+		NEW_AUX_ENT(AT_BASE_PLATFORM,
+			    (elf_addr_t)(unsigned long)u_base_platform);
+	}
+	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
+		NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
+	}
+#undef NEW_AUX_ENT
+	/* AT_NULL is zero; clear the rest too */
+	memset(&elf_info[ei_index], 0,
+	       sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]);
+
+	/* And advance past the AT_NULL entry.  */
+	ei_index += 2;
+
+	sp = STACK_ADD(p, ei_index);
+
+	items = (argc + 1) + (envc + 1) + 1;
+	bprm->p = STACK_ROUND(sp, items);
+
+	/* Point sp at the lowest address on the stack */
+#ifdef CONFIG_STACK_GROWSUP
+	sp = (elf_addr_t __user *)bprm->p - items - ei_index;
+	bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */
+#else
+	sp = (elf_addr_t __user *)bprm->p;
+#endif
+
+
+	/*
+	 * Grow the stack manually; some architectures have a limit on how
+	 * far ahead a user-space access may be in order to grow the stack.
+	 */
+	vma = find_extend_vma(current->mm, bprm->p);
+	if (!vma)
+		return -EFAULT;
+
+	/* Now, let's put argc (and argv, envp if appropriate) on the stack */
+	if (__put_user(argc, sp++))
+		return -EFAULT;
+	argv = sp;
+	envp = argv + argc + 1;
+
+	/* Populate argv and envp */
+	p = current->mm->arg_end = current->mm->arg_start;
+	while (argc-- > 0) {
+		size_t len;
+		if (__put_user((elf_addr_t)p, argv++))
+			return -EFAULT;
+		len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
+		if (!len || len > MAX_ARG_STRLEN)
+			return -EINVAL;
+		p += len;
+	}
+	if (__put_user(0, argv))
+		return -EFAULT;
+	current->mm->arg_end = current->mm->env_start = p;
+	while (envc-- > 0) {
+		size_t len;
+		if (__put_user((elf_addr_t)p, envp++))
+			return -EFAULT;
+		len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
+		if (!len || len > MAX_ARG_STRLEN)
+			return -EINVAL;
+		p += len;
+	}
+	if (__put_user(0, envp))
+		return -EFAULT;
+	current->mm->env_end = p;
+
+	/* Put the elf_info on the stack in the right place.  */
+	sp = (elf_addr_t __user *)envp + 1;
+	if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
+		return -EFAULT;
+	return 0;
+}
+
+#ifndef elf_map
+
+static unsigned long elf_map(struct file *filep, unsigned long addr,
+		struct elf_phdr *eppnt, int prot, int type,
+		unsigned long total_size)
+{
+	unsigned long map_addr;
+	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
+	addr = ELF_PAGESTART(addr);
+	size = ELF_PAGEALIGN(size);
+
+	/* mmap() will return -EINVAL if given a zero size, but a
+	 * segment with zero filesize is perfectly valid */
+	if (!size)
+		return addr;
+
+	/*
+	* total_size is the size of the ELF (interpreter) image.
+	* The _first_ mmap needs to know the full size, otherwise
+	* randomization might put this image into an overlapping
+	* position with the ELF binary image. (since size < total_size)
+	* So we first map the 'big' image - and unmap the remainder at
+	* the end. (which unmap is needed for ELF images with holes.)
+	*/
+	if (total_size) {
+		total_size = ELF_PAGEALIGN(total_size);
+		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+		if (!BAD_ADDR(map_addr))
+			vm_munmap(map_addr+size, total_size-size);
+	} else
+		map_addr = vm_mmap(filep, addr, size, prot, type, off);
+
+	return(map_addr);
+}
+
+#endif /* !elf_map */
+
+static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+{
+	int i, first_idx = -1, last_idx = -1;
+
+	for (i = 0; i < nr; i++) {
+		if (cmds[i].p_type == PT_LOAD) {
+			last_idx = i;
+			if (first_idx == -1)
+				first_idx = i;
+		}
+	}
+	if (first_idx == -1)
+		return 0;
+
+	return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
+				ELF_PAGESTART(cmds[first_idx].p_vaddr);
+}
+
+/**
+ * load_elf_phdrs() - load ELF program headers
+ * @elf_ex:   ELF header of the binary whose program headers should be loaded
+ * @elf_file: the opened ELF binary file
+ *
+ * Loads ELF program headers from the binary file elf_file, which has the ELF
+ * header pointed to by elf_ex, into a newly allocated array. The caller is
+ * responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
+ */
+static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+				       struct file *elf_file)
+{
+	struct elf_phdr *elf_phdata = NULL;
+	int retval, size, err = -1;
+
+	/*
+	 * If the size of this structure has changed, then punt, since
+	 * we will be doing the wrong thing.
+	 */
+	if (elf_ex->e_phentsize != sizeof(struct elf_phdr))
+		goto out;
+
+	/* Sanity check the number of program headers... */
+	if (elf_ex->e_phnum < 1 ||
+		elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
+		goto out;
+
+	/* ...and their total size. */
+	size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
+	if (size > ELF_MIN_ALIGN)
+		goto out;
+
+	elf_phdata = kmalloc(size, GFP_KERNEL);
+	if (!elf_phdata)
+		goto out;
+
+	/* Read in the program headers */
+	retval = kernel_read(elf_file, elf_ex->e_phoff,
+			     (char *)elf_phdata, size);
+	if (retval != size) {
+		err = (retval < 0) ? retval : -EIO;
+		goto out;
+	}
+
+	/* Success! */
+	err = 0;
+out:
+	if (err) {
+		kfree(elf_phdata);
+		elf_phdata = NULL;
+	}
+	return elf_phdata;
+}
+
+#ifndef CONFIG_ARCH_BINFMT_ELF_STATE
+
+/**
+ * struct arch_elf_state - arch-specific ELF loading state
+ *
+ * This structure is used to preserve architecture specific data during
+ * the loading of an ELF file, throughout the checking of architecture
+ * specific ELF headers & through to the point where the ELF load is
+ * known to be proceeding (ie. SET_PERSONALITY).
+ *
+ * This implementation is a dummy for architectures which require no
+ * specific state.
+ */
+struct arch_elf_state {
+};
+
+#define INIT_ARCH_ELF_STATE {}
+
+/**
+ * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * @ehdr:	The main ELF header
+ * @phdr:	The program header to check
+ * @elf:	The open ELF file
+ * @is_interp:	True if the phdr is from the interpreter of the ELF being
+ *		loaded, else false.
+ * @state:	Architecture-specific state preserved throughout the process
+ *		of loading the ELF.
+ *
+ * Inspects the program header phdr to validate its correctness and/or
+ * suitability for the system. Called once per ELF program header in the
+ * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its
+ * interpreter.
+ *
+ * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
+ *         with that return code.
+ */
+static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
+				   struct elf_phdr *phdr,
+				   struct file *elf, bool is_interp,
+				   struct arch_elf_state *state)
+{
+	/* Dummy implementation, always proceed */
+	return 0;
+}
+
+/**
+ * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * @ehdr:	The main ELF header
+ * @has_interp:	True if the ELF has an interpreter, else false.
+ * @state:	Architecture-specific state preserved throughout the process
+ *		of loading the ELF.
+ *
+ * Provides a final opportunity for architecture code to reject the loading
+ * of the ELF & cause an exec syscall to return an error. This is called after
+ * all program headers to be checked by arch_elf_pt_proc have been.
+ *
+ * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
+ *         with that return code.
+ */
+static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
+				 struct arch_elf_state *state)
+{
+	/* Dummy implementation, always proceed */
+	return 0;
+}
+
+#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
+
+/* This is much more generalized than the library routine read function,
+   so we keep this separate.  Technically the library read function
+   is only provided so that we can read a.out libraries that have
+   an ELF header */
+
+static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
+		struct file *interpreter, unsigned long *interp_map_addr,
+		unsigned long no_base, struct elf_phdr *interp_elf_phdata)
+{
+	struct elf_phdr *eppnt;
+	unsigned long load_addr = 0;
+	int load_addr_set = 0;
+	unsigned long last_bss = 0, elf_bss = 0;
+	unsigned long error = ~0UL;
+	unsigned long total_size;
+	int i;
+
+	/* First of all, some simple consistency checks */
+	if (interp_elf_ex->e_type != ET_EXEC &&
+	    interp_elf_ex->e_type != ET_DYN)
+		goto out;
+	if (!elf_check_arch(interp_elf_ex))
+		goto out;
+	if (!interpreter->f_op->mmap)
+		goto out;
+
+	total_size = total_mapping_size(interp_elf_phdata,
+					interp_elf_ex->e_phnum);
+	if (!total_size) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	eppnt = interp_elf_phdata;
+	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+		if (eppnt->p_type == PT_LOAD) {
+			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+			int elf_prot = 0;
+			unsigned long vaddr = 0;
+			unsigned long k, map_addr;
+
+			if (eppnt->p_flags & PF_R)
+		    		elf_prot = PROT_READ;
+			if (eppnt->p_flags & PF_W)
+				elf_prot |= PROT_WRITE;
+			if (eppnt->p_flags & PF_X)
+				elf_prot |= PROT_EXEC;
+			vaddr = eppnt->p_vaddr;
+			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
+				elf_type |= MAP_FIXED;
+			else if (no_base && interp_elf_ex->e_type == ET_DYN)
+				load_addr = -vaddr;
+
+			map_addr = elf_map(interpreter, load_addr + vaddr,
+					eppnt, elf_prot, elf_type, total_size);
+			total_size = 0;
+			if (!*interp_map_addr)
+				*interp_map_addr = map_addr;
+			error = map_addr;
+			if (BAD_ADDR(map_addr))
+				goto out;
+
+			if (!load_addr_set &&
+			    interp_elf_ex->e_type == ET_DYN) {
+				load_addr = map_addr - ELF_PAGESTART(vaddr);
+				load_addr_set = 1;
+			}
+
+			/*
+			 * Check to see if the section's size will overflow the
+			 * allowed task size. Note that p_filesz must always be
+			 * <= p_memsize so it's only necessary to check p_memsz.
+			 */
+			k = load_addr + eppnt->p_vaddr;
+			if (BAD_ADDR(k) ||
+			    eppnt->p_filesz > eppnt->p_memsz ||
+			    eppnt->p_memsz > TASK_SIZE ||
+			    TASK_SIZE - eppnt->p_memsz < k) {
+				error = -ENOMEM;
+				goto out;
+			}
+
+			/*
+			 * Find the end of the file mapping for this phdr, and
+			 * keep track of the largest address we see for this.
+			 */
+			k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
+			if (k > elf_bss)
+				elf_bss = k;
+
+			/*
+			 * Do the same thing for the memory mapping - between
+			 * elf_bss and last_bss is the bss section.
+			 */
+			k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+			if (k > last_bss)
+				last_bss = k;
+		}
+	}
+
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out;
+		}
+
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+
+		/* Map the last of the bss segment */
+		error = vm_brk(elf_bss, last_bss - elf_bss);
+		if (BAD_ADDR(error))
+			goto out;
+	}
+
+	error = load_addr;
+out:
+	return error;
+}
+
+/*
+ * These are the functions used to load ELF style executables and shared
+ * libraries.  There is no binary dependent code anywhere else.
+ */
+
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
+#endif
+
+static unsigned long randomize_stack_top(unsigned long stack_top)
+{
+	unsigned long random_variable = 0;
+
+	if ((current->flags & PF_RANDOMIZE) &&
+		!(current->personality & ADDR_NO_RANDOMIZE)) {
+		random_variable = (unsigned long) get_random_int();
+		random_variable &= STACK_RND_MASK;
+		random_variable <<= PAGE_SHIFT;
+	}
+#ifdef CONFIG_STACK_GROWSUP
+	return PAGE_ALIGN(stack_top) + random_variable;
+#else
+	return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+static int load_elf_binary(struct linux_binprm *bprm)
+{
+	struct file *interpreter = NULL; /* to shut gcc up */
+ 	unsigned long load_addr = 0, load_bias = 0;
+	int load_addr_set = 0;
+	char * elf_interpreter = NULL;
+	unsigned long error;
+	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
+	unsigned long elf_bss, elf_brk;
+	int retval, i;
+	unsigned long elf_entry;
+	unsigned long interp_load_addr = 0;
+	unsigned long start_code, end_code, start_data, end_data;
+	unsigned long reloc_func_desc __maybe_unused = 0;
+	int executable_stack = EXSTACK_DEFAULT;
+	struct pt_regs *regs = current_pt_regs();
+	struct {
+		struct elfhdr elf_ex;
+		struct elfhdr interp_elf_ex;
+	} *loc;
+	struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
+
+	loc = kmalloc(sizeof(*loc), GFP_KERNEL);
+	if (!loc) {
+		retval = -ENOMEM;
+		goto out_ret;
+	}
+	
+	/* Get the exec-header */
+	loc->elf_ex = *((struct elfhdr *)bprm->buf);
+
+	retval = -ENOEXEC;
+	/* First of all, some simple consistency checks */
+	if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
+		goto out;
+	if (!elf_check_arch(&loc->elf_ex))
+		goto out;
+	if (!bprm->file->f_op->mmap)
+		goto out;
+
+	elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file);
+	if (!elf_phdata)
+		goto out;
+
+	elf_ppnt = elf_phdata;
+	elf_bss = 0;
+	elf_brk = 0;
+
+	start_code = ~0UL;
+	end_code = 0;
+	start_data = 0;
+	end_data = 0;
+
+	for (i = 0; i < loc->elf_ex.e_phnum; i++) {
+		if (elf_ppnt->p_type == PT_INTERP) {
+			/* This is the program interpreter used for
+			 * shared libraries - for now assume that this
+			 * is an a.out format binary
+			 */
+			retval = -ENOEXEC;
+			if (elf_ppnt->p_filesz > PATH_MAX || 
+			    elf_ppnt->p_filesz < 2)
+				goto out_free_ph;
+
+			retval = -ENOMEM;
+			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
+						  GFP_KERNEL);
+			if (!elf_interpreter)
+				goto out_free_ph;
+
+			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
+					     elf_interpreter,
+					     elf_ppnt->p_filesz);
+			if (retval != elf_ppnt->p_filesz) {
+				if (retval >= 0)
+					retval = -EIO;
+				goto out_free_interp;
+			}
+			/* make sure path is NULL terminated */
+			retval = -ENOEXEC;
+			if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
+				goto out_free_interp;
+
+			interpreter = open_exec(elf_interpreter);
+			retval = PTR_ERR(interpreter);
+			if (IS_ERR(interpreter))
+				goto out_free_interp;
+
+			/*
+			 * If the binary is not readable then enforce
+			 * mm->dumpable = 0 regardless of the interpreter's
+			 * permissions.
+			 */
+			would_dump(bprm, interpreter);
+
+			retval = kernel_read(interpreter, 0, bprm->buf,
+					     BINPRM_BUF_SIZE);
+			if (retval != BINPRM_BUF_SIZE) {
+				if (retval >= 0)
+					retval = -EIO;
+				goto out_free_dentry;
+			}
+
+			/* Get the exec headers */
+			loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
+			break;
+		}
+		elf_ppnt++;
+	}
+
+	elf_ppnt = elf_phdata;
+	for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
+		switch (elf_ppnt->p_type) {
+		case PT_GNU_STACK:
+			if (elf_ppnt->p_flags & PF_X)
+				executable_stack = EXSTACK_ENABLE_X;
+			else
+				executable_stack = EXSTACK_DISABLE_X;
+			break;
+
+		case PT_LOPROC ... PT_HIPROC:
+			retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt,
+						  bprm->file, false,
+						  &arch_state);
+			if (retval)
+				goto out_free_dentry;
+			break;
+		}
+
+	/* Some simple consistency checks for the interpreter */
+	if (elf_interpreter) {
+		retval = -ELIBBAD;
+		/* Not an ELF interpreter */
+		if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+			goto out_free_dentry;
+		/* Verify the interpreter has a valid arch */
+		if (!elf_check_arch(&loc->interp_elf_ex))
+			goto out_free_dentry;
+
+		/* Load the interpreter program headers */
+		interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex,
+						   interpreter);
+		if (!interp_elf_phdata)
+			goto out_free_dentry;
+
+		/* Pass PT_LOPROC..PT_HIPROC headers to arch code */
+		elf_ppnt = interp_elf_phdata;
+		for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++)
+			switch (elf_ppnt->p_type) {
+			case PT_LOPROC ... PT_HIPROC:
+				retval = arch_elf_pt_proc(&loc->interp_elf_ex,
+							  elf_ppnt, interpreter,
+							  true, &arch_state);
+				if (retval)
+					goto out_free_dentry;
+				break;
+			}
+	}
+
+	/*
+	 * Allow arch code to reject the ELF at this point, whilst it's
+	 * still possible to return an error to the code that invoked
+	 * the exec syscall.
+	 */
+	retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state);
+	if (retval)
+		goto out_free_dentry;
+
+	/* Flush all traces of the currently running executable */
+	retval = flush_old_exec(bprm);
+	if (retval)
+		goto out_free_dentry;
+
+	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
+	   may depend on the personality.  */
+	SET_PERSONALITY2(loc->elf_ex, &arch_state);
+	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
+		current->personality |= READ_IMPLIES_EXEC;
+
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+		current->flags |= PF_RANDOMIZE;
+
+	setup_new_exec(bprm);
+
+	/* Do this so that we can load the interpreter, if need be.  We will
+	   change some of these later */
+	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
+				 executable_stack);
+	if (retval < 0)
+		goto out_free_dentry;
+	
+	current->mm->start_stack = bprm->p;
+
+	/* Now we do a little grungy work by mmapping the ELF image into
+	   the correct location in memory. */
+	for(i = 0, elf_ppnt = elf_phdata;
+	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+		int elf_prot = 0, elf_flags;
+		unsigned long k, vaddr;
+		unsigned long total_size = 0;
+
+		if (elf_ppnt->p_type != PT_LOAD)
+			continue;
+
+		if (unlikely (elf_brk > elf_bss)) {
+			unsigned long nbyte;
+	            
+			/* There was a PT_LOAD segment with p_memsz > p_filesz
+			   before this one. Map anonymous pages, if needed,
+			   and clear the area.  */
+			retval = set_brk(elf_bss + load_bias,
+					 elf_brk + load_bias);
+			if (retval)
+				goto out_free_dentry;
+			nbyte = ELF_PAGEOFFSET(elf_bss);
+			if (nbyte) {
+				nbyte = ELF_MIN_ALIGN - nbyte;
+				if (nbyte > elf_brk - elf_bss)
+					nbyte = elf_brk - elf_bss;
+				if (clear_user((void __user *)elf_bss +
+							load_bias, nbyte)) {
+					/*
+					 * This bss-zeroing can fail if the ELF
+					 * file specifies odd protections. So
+					 * we don't check the return value
+					 */
+				}
+			}
+		}
+
+		if (elf_ppnt->p_flags & PF_R)
+			elf_prot |= PROT_READ;
+		if (elf_ppnt->p_flags & PF_W)
+			elf_prot |= PROT_WRITE;
+		if (elf_ppnt->p_flags & PF_X)
+			elf_prot |= PROT_EXEC;
+
+		elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
+
+		vaddr = elf_ppnt->p_vaddr;
+		if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
+			elf_flags |= MAP_FIXED;
+		} else if (loc->elf_ex.e_type == ET_DYN) {
+			/* Try and get dynamic programs out of the way of the
+			 * default mmap base, as well as whatever program they
+			 * might try to exec.  This is because the brk will
+			 * follow the loader, and is not movable.  */
+			load_bias = ELF_ET_DYN_BASE - vaddr;
+			if (current->flags & PF_RANDOMIZE)
+				load_bias += arch_mmap_rnd();
+			load_bias = ELF_PAGESTART(load_bias);
+			total_size = total_mapping_size(elf_phdata,
+							loc->elf_ex.e_phnum);
+			if (!total_size) {
+				retval = -EINVAL;
+				goto out_free_dentry;
+			}
+		}
+
+		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+				elf_prot, elf_flags, total_size);
+		if (BAD_ADDR(error)) {
+			retval = IS_ERR((void *)error) ?
+				PTR_ERR((void*)error) : -EINVAL;
+			goto out_free_dentry;
+		}
+
+		if (!load_addr_set) {
+			load_addr_set = 1;
+			load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
+			if (loc->elf_ex.e_type == ET_DYN) {
+				load_bias += error -
+				             ELF_PAGESTART(load_bias + vaddr);
+				load_addr += load_bias;
+				reloc_func_desc = load_bias;
+			}
+		}
+		k = elf_ppnt->p_vaddr;
+		if (k < start_code)
+			start_code = k;
+		if (start_data < k)
+			start_data = k;
+
+		/*
+		 * Check to see if the section's size will overflow the
+		 * allowed task size. Note that p_filesz must always be
+		 * <= p_memsz so it is only necessary to check p_memsz.
+		 */
+		if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
+		    elf_ppnt->p_memsz > TASK_SIZE ||
+		    TASK_SIZE - elf_ppnt->p_memsz < k) {
+			/* set_brk can never work. Avoid overflows. */
+			retval = -EINVAL;
+			goto out_free_dentry;
+		}
+
+		k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
+
+		if (k > elf_bss)
+			elf_bss = k;
+		if ((elf_ppnt->p_flags & PF_X) && end_code < k)
+			end_code = k;
+		if (end_data < k)
+			end_data = k;
+		k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
+		if (k > elf_brk)
+			elf_brk = k;
+	}
+
+	loc->elf_ex.e_entry += load_bias;
+	elf_bss += load_bias;
+	elf_brk += load_bias;
+	start_code += load_bias;
+	end_code += load_bias;
+	start_data += load_bias;
+	end_data += load_bias;
+
+	/* Calling set_brk effectively mmaps the pages that we need
+	 * for the bss and break sections.  We must do this before
+	 * mapping in the interpreter, to make sure it doesn't wind
+	 * up getting placed where the bss needs to go.
+	 */
+	retval = set_brk(elf_bss, elf_brk);
+	if (retval)
+		goto out_free_dentry;
+	if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
+		retval = -EFAULT; /* Nobody gets to see this, but.. */
+		goto out_free_dentry;
+	}
+
+	if (elf_interpreter) {
+		unsigned long interp_map_addr = 0;
+
+		elf_entry = load_elf_interp(&loc->interp_elf_ex,
+					    interpreter,
+					    &interp_map_addr,
+					    load_bias, interp_elf_phdata);
+		if (!IS_ERR((void *)elf_entry)) {
+			/*
+			 * load_elf_interp() returns relocation
+			 * adjustment
+			 */
+			interp_load_addr = elf_entry;
+			elf_entry += loc->interp_elf_ex.e_entry;
+		}
+		if (BAD_ADDR(elf_entry)) {
+			retval = IS_ERR((void *)elf_entry) ?
+					(int)elf_entry : -EINVAL;
+			goto out_free_dentry;
+		}
+		reloc_func_desc = interp_load_addr;
+
+		allow_write_access(interpreter);
+		fput(interpreter);
+		kfree(elf_interpreter);
+	} else {
+		elf_entry = loc->elf_ex.e_entry;
+		if (BAD_ADDR(elf_entry)) {
+			retval = -EINVAL;
+			goto out_free_dentry;
+		}
+	}
+
+	kfree(interp_elf_phdata);
+	kfree(elf_phdata);
+
+	set_binfmt(&elf_format);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+	if (retval < 0)
+		goto out;
+#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
+
+	install_exec_creds(bprm);
+	retval = create_elf_tables(bprm, &loc->elf_ex,
+			  load_addr, interp_load_addr);
+	if (retval < 0)
+		goto out;
+	/* N.B. passed_fileno might not be initialized? */
+	current->mm->end_code = end_code;
+	current->mm->start_code = start_code;
+	current->mm->start_data = start_data;
+	current->mm->end_data = end_data;
+	current->mm->start_stack = bprm->p;
+
+	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+		current->mm->brk = current->mm->start_brk =
+			arch_randomize_brk(current->mm);
+#ifdef compat_brk_randomized
+		current->brk_randomized = 1;
+#endif
+	}
+
+	if (current->personality & MMAP_PAGE_ZERO) {
+		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
+		   and some applications "depend" upon this behavior.
+		   Since we do not have the power to recompile these, we
+		   emulate the SVr4 behavior. Sigh. */
+		error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
+				MAP_FIXED | MAP_PRIVATE, 0);
+	}
+
+#ifdef ELF_PLAT_INIT
+	/*
+	 * The ABI may specify that certain registers be set up in special
+	 * ways (on i386 %edx is the address of a DT_FINI function, for
+	 * example.  In addition, it may also specify (eg, PowerPC64 ELF)
+	 * that the e_entry field is the address of the function descriptor
+	 * for the startup routine, rather than the address of the startup
+	 * routine itself.  This macro performs whatever initialization to
+	 * the regs structure is required as well as any relocations to the
+	 * function descriptor entries when executing dynamically links apps.
+	 */
+	ELF_PLAT_INIT(regs, reloc_func_desc);
+#endif
+
+	start_thread(regs, elf_entry, bprm->p);
+	retval = 0;
+out:
+	kfree(loc);
+out_ret:
+	return retval;
+
+	/* error cleanup */
+out_free_dentry:
+	kfree(interp_elf_phdata);
+	allow_write_access(interpreter);
+	if (interpreter)
+		fput(interpreter);
+out_free_interp:
+	kfree(elf_interpreter);
+out_free_ph:
+	kfree(elf_phdata);
+	goto out;
+}
+
+#ifdef CONFIG_USELIB
+/* This is really simpleminded and specialized - we are loading an
+   a.out library that is given an ELF header. */
+static int load_elf_library(struct file *file)
+{
+	struct elf_phdr *elf_phdata;
+	struct elf_phdr *eppnt;
+	unsigned long elf_bss, bss, len;
+	int retval, error, i, j;
+	struct elfhdr elf_ex;
+
+	error = -ENOEXEC;
+	retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex));
+	if (retval != sizeof(elf_ex))
+		goto out;
+
+	if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* First of all, some simple consistency checks */
+	if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
+	    !elf_check_arch(&elf_ex) || !file->f_op->mmap)
+		goto out;
+
+	/* Now read in all of the header information */
+
+	j = sizeof(struct elf_phdr) * elf_ex.e_phnum;
+	/* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */
+
+	error = -ENOMEM;
+	elf_phdata = kmalloc(j, GFP_KERNEL);
+	if (!elf_phdata)
+		goto out;
+
+	eppnt = elf_phdata;
+	error = -ENOEXEC;
+	retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j);
+	if (retval != j)
+		goto out_free_ph;
+
+	for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
+		if ((eppnt + i)->p_type == PT_LOAD)
+			j++;
+	if (j != 1)
+		goto out_free_ph;
+
+	while (eppnt->p_type != PT_LOAD)
+		eppnt++;
+
+	/* Now use mmap to map the library into memory. */
+	error = vm_mmap(file,
+			ELF_PAGESTART(eppnt->p_vaddr),
+			(eppnt->p_filesz +
+			 ELF_PAGEOFFSET(eppnt->p_vaddr)),
+			PROT_READ | PROT_WRITE | PROT_EXEC,
+			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
+			(eppnt->p_offset -
+			 ELF_PAGEOFFSET(eppnt->p_vaddr)));
+	if (error != ELF_PAGESTART(eppnt->p_vaddr))
+		goto out_free_ph;
+
+	elf_bss = eppnt->p_vaddr + eppnt->p_filesz;
+	if (padzero(elf_bss)) {
+		error = -EFAULT;
+		goto out_free_ph;
+	}
+
+	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
+			    ELF_MIN_ALIGN - 1);
+	bss = eppnt->p_memsz + eppnt->p_vaddr;
+	if (bss > len)
+		vm_brk(len, bss - len);
+	error = 0;
+
+out_free_ph:
+	kfree(elf_phdata);
+out:
+	return error;
+}
+#endif /* #ifdef CONFIG_USELIB */
+
+#ifdef CONFIG_ELF_CORE
+/*
+ * ELF core dumper
+ *
+ * Modelled on fs/exec.c:aout_core_dump()
+ * Jeremy Fitzhardinge <jeremy@sw.oz.au>
+ */
+
+/*
+ * The purpose of always_dump_vma() is to make sure that special kernel mappings
+ * that are useful for post-mortem analysis are included in every core dump.
+ * In that way we ensure that the core dump is fully interpretable later
+ * without matching up the same kernel and hardware config to see what PC values
+ * meant. These special mappings include - vDSO, vsyscall, and other
+ * architecture specific mappings
+ */
+static bool always_dump_vma(struct vm_area_struct *vma)
+{
+	/* Any vsyscall mappings? */
+	if (vma == get_gate_vma(vma->vm_mm))
+		return true;
+
+	/*
+	 * Assume that all vmas with a .name op should always be dumped.
+	 * If this changes, a new vm_ops field can easily be added.
+	 */
+	if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
+		return true;
+
+	/*
+	 * arch_vma_name() returns non-NULL for special architecture mappings,
+	 * such as vDSO sections.
+	 */
+	if (arch_vma_name(vma))
+		return true;
+
+	return false;
+}
+
+/*
+ * Decide what to dump of a segment, part, all or none.
+ */
+static unsigned long vma_dump_size(struct vm_area_struct *vma,
+				   unsigned long mm_flags)
+{
+#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
+
+	/* always dump the vdso and vsyscall sections */
+	if (always_dump_vma(vma))
+		goto whole;
+
+	if (vma->vm_flags & VM_DONTDUMP)
+		return 0;
+
+	/* Hugetlb memory check */
+	if (vma->vm_flags & VM_HUGETLB) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+			goto whole;
+		return 0;
+	}
+
+	/* Do not dump I/O mapped devices or special mappings */
+	if (vma->vm_flags & VM_IO)
+		return 0;
+
+	/* By default, dump shared memory if mapped from an anonymous file. */
+	if (vma->vm_flags & VM_SHARED) {
+		if (file_inode(vma->vm_file)->i_nlink == 0 ?
+		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
+			goto whole;
+		return 0;
+	}
+
+	/* Dump segments that have been written to.  */
+	if (vma->anon_vma && FILTER(ANON_PRIVATE))
+		goto whole;
+	if (vma->vm_file == NULL)
+		return 0;
+
+	if (FILTER(MAPPED_PRIVATE))
+		goto whole;
+
+	/*
+	 * If this looks like the beginning of a DSO or executable mapping,
+	 * check for an ELF header.  If we find one, dump the first page to
+	 * aid in determining what was mapped here.
+	 */
+	if (FILTER(ELF_HEADERS) &&
+	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
+		u32 __user *header = (u32 __user *) vma->vm_start;
+		u32 word;
+		mm_segment_t fs = get_fs();
+		/*
+		 * Doing it this way gets the constant folded by GCC.
+		 */
+		union {
+			u32 cmp;
+			char elfmag[SELFMAG];
+		} magic;
+		BUILD_BUG_ON(SELFMAG != sizeof word);
+		magic.elfmag[EI_MAG0] = ELFMAG0;
+		magic.elfmag[EI_MAG1] = ELFMAG1;
+		magic.elfmag[EI_MAG2] = ELFMAG2;
+		magic.elfmag[EI_MAG3] = ELFMAG3;
+		/*
+		 * Switch to the user "segment" for get_user(),
+		 * then put back what elf_core_dump() had in place.
+		 */
+		set_fs(USER_DS);
+		if (unlikely(get_user(word, header)))
+			word = 0;
+		set_fs(fs);
+		if (word == magic.cmp)
+			return PAGE_SIZE;
+	}
+
+#undef	FILTER
+
+	return 0;
+
+whole:
+	return vma->vm_end - vma->vm_start;
+}
+
+/* An ELF note in memory */
+struct memelfnote
+{
+	const char *name;
+	int type;
+	unsigned int datasz;
+	void *data;
+};
+
+static int notesize(struct memelfnote *en)
+{
+	int sz;
+
+	sz = sizeof(struct elf_note);
+	sz += roundup(strlen(en->name) + 1, 4);
+	sz += roundup(en->datasz, 4);
+
+	return sz;
+}
+
+static int writenote(struct memelfnote *men, struct coredump_params *cprm)
+{
+	struct elf_note en;
+	en.n_namesz = strlen(men->name) + 1;
+	en.n_descsz = men->datasz;
+	en.n_type = men->type;
+
+	return dump_emit(cprm, &en, sizeof(en)) &&
+	    dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
+	    dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
+}
+
+static void fill_elf_header(struct elfhdr *elf, int segs,
+			    u16 machine, u32 flags)
+{
+	memset(elf, 0, sizeof(*elf));
+
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELF_DATA;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+
+	elf->e_type = ET_CORE;
+	elf->e_machine = machine;
+	elf->e_version = EV_CURRENT;
+	elf->e_phoff = sizeof(struct elfhdr);
+	elf->e_flags = flags;
+	elf->e_ehsize = sizeof(struct elfhdr);
+	elf->e_phentsize = sizeof(struct elf_phdr);
+	elf->e_phnum = segs;
+
+	return;
+}
+
+static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
+{
+	phdr->p_type = PT_NOTE;
+	phdr->p_offset = offset;
+	phdr->p_vaddr = 0;
+	phdr->p_paddr = 0;
+	phdr->p_filesz = sz;
+	phdr->p_memsz = 0;
+	phdr->p_flags = 0;
+	phdr->p_align = 0;
+	return;
+}
+
+static void fill_note(struct memelfnote *note, const char *name, int type, 
+		unsigned int sz, void *data)
+{
+	note->name = name;
+	note->type = type;
+	note->datasz = sz;
+	note->data = data;
+	return;
+}
+
+/*
+ * fill up all the fields in prstatus from the given task struct, except
+ * registers which need to be filled up separately.
+ */
+static void fill_prstatus(struct elf_prstatus *prstatus,
+		struct task_struct *p, long signr)
+{
+	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
+	prstatus->pr_sigpend = p->pending.signal.sig[0];
+	prstatus->pr_sighold = p->blocked.sig[0];
+	rcu_read_lock();
+	prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
+	prstatus->pr_pid = task_pid_vnr(p);
+	prstatus->pr_pgrp = task_pgrp_vnr(p);
+	prstatus->pr_sid = task_session_vnr(p);
+	if (thread_group_leader(p)) {
+		struct task_cputime cputime;
+
+		/*
+		 * This is the record for the group leader.  It shows the
+		 * group-wide total, not its individual thread total.
+		 */
+		thread_group_cputime(p, &cputime);
+		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
+	} else {
+		cputime_t utime, stime;
+
+		task_cputime(p, &utime, &stime);
+		cputime_to_timeval(utime, &prstatus->pr_utime);
+		cputime_to_timeval(stime, &prstatus->pr_stime);
+	}
+	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
+	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
+}
+
+static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
+		       struct mm_struct *mm)
+{
+	const struct cred *cred;
+	unsigned int i, len;
+	
+	/* first copy the parameters from user space */
+	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
+
+	len = mm->arg_end - mm->arg_start;
+	if (len >= ELF_PRARGSZ)
+		len = ELF_PRARGSZ-1;
+	if (copy_from_user(&psinfo->pr_psargs,
+		           (const char __user *)mm->arg_start, len))
+		return -EFAULT;
+	for(i = 0; i < len; i++)
+		if (psinfo->pr_psargs[i] == 0)
+			psinfo->pr_psargs[i] = ' ';
+	psinfo->pr_psargs[len] = 0;
+
+	rcu_read_lock();
+	psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
+	psinfo->pr_pid = task_pid_vnr(p);
+	psinfo->pr_pgrp = task_pgrp_vnr(p);
+	psinfo->pr_sid = task_session_vnr(p);
+
+	i = p->state ? ffz(~p->state) + 1 : 0;
+	psinfo->pr_state = i;
+	psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
+	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
+	psinfo->pr_nice = task_nice(p);
+	psinfo->pr_flag = p->flags;
+	rcu_read_lock();
+	cred = __task_cred(p);
+	SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
+	rcu_read_unlock();
+	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+	
+	return 0;
+}
+
+static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
+{
+	elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv;
+	int i = 0;
+	do
+		i += 2;
+	while (auxv[i - 2] != AT_NULL);
+	fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+}
+
+static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
+		const siginfo_t *siginfo)
+{
+	mm_segment_t old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo);
+	set_fs(old_fs);
+	fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+}
+
+#define MAX_FILE_NOTE_SIZE (4*1024*1024)
+/*
+ * Format of NT_FILE note:
+ *
+ * long count     -- how many files are mapped
+ * long page_size -- units for file_ofs
+ * array of [COUNT] elements of
+ *   long start
+ *   long end
+ *   long file_ofs
+ * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
+ */
+static int fill_files_note(struct memelfnote *note)
+{
+	struct vm_area_struct *vma;
+	unsigned count, size, names_ofs, remaining, n;
+	user_long_t *data;
+	user_long_t *start_end_ofs;
+	char *name_base, *name_curpos;
+
+	/* *Estimated* file count and total data size needed */
+	count = current->mm->map_count;
+	size = count * 64;
+
+	names_ofs = (2 + 3 * count) * sizeof(data[0]);
+ alloc:
+	if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
+		return -EINVAL;
+	size = round_up(size, PAGE_SIZE);
+	data = vmalloc(size);
+	if (!data)
+		return -ENOMEM;
+
+	start_end_ofs = data + 2;
+	name_base = name_curpos = ((char *)data) + names_ofs;
+	remaining = size - names_ofs;
+	count = 0;
+	for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+		struct file *file;
+		const char *filename;
+
+		file = vma->vm_file;
+		if (!file)
+			continue;
+		filename = d_path(&file->f_path, name_curpos, remaining);
+		if (IS_ERR(filename)) {
+			if (PTR_ERR(filename) == -ENAMETOOLONG) {
+				vfree(data);
+				size = size * 5 / 4;
+				goto alloc;
+			}
+			continue;
+		}
+
+		/* d_path() fills at the end, move name down */
+		/* n = strlen(filename) + 1: */
+		n = (name_curpos + remaining) - filename;
+		remaining = filename - name_curpos;
+		memmove(name_curpos, filename, n);
+		name_curpos += n;
+
+		*start_end_ofs++ = vma->vm_start;
+		*start_end_ofs++ = vma->vm_end;
+		*start_end_ofs++ = vma->vm_pgoff;
+		count++;
+	}
+
+	/* Now we know exact count of files, can store it */
+	data[0] = count;
+	data[1] = PAGE_SIZE;
+	/*
+	 * Count usually is less than current->mm->map_count,
+	 * we need to move filenames down.
+	 */
+	n = current->mm->map_count - count;
+	if (n != 0) {
+		unsigned shift_bytes = n * 3 * sizeof(data[0]);
+		memmove(name_base - shift_bytes, name_base,
+			name_curpos - name_base);
+		name_curpos -= shift_bytes;
+	}
+
+	size = name_curpos - (char *)data;
+	fill_note(note, "CORE", NT_FILE, size, data);
+	return 0;
+}
+
+#ifdef CORE_DUMP_USE_REGSET
+#include <linux/regset.h>
+
+struct elf_thread_core_info {
+	struct elf_thread_core_info *next;
+	struct task_struct *task;
+	struct elf_prstatus prstatus;
+	struct memelfnote notes[0];
+};
+
+struct elf_note_info {
+	struct elf_thread_core_info *thread;
+	struct memelfnote psinfo;
+	struct memelfnote signote;
+	struct memelfnote auxv;
+	struct memelfnote files;
+	user_siginfo_t csigdata;
+	size_t size;
+	int thread_notes;
+};
+
+/*
+ * When a regset has a writeback hook, we call it on each thread before
+ * dumping user memory.  On register window machines, this makes sure the
+ * user memory backing the register data is up to date before we read it.
+ */
+static void do_thread_regset_writeback(struct task_struct *task,
+				       const struct user_regset *regset)
+{
+	if (regset->writeback)
+		regset->writeback(task, regset, 1);
+}
+
+#ifndef PR_REG_SIZE
+#define PR_REG_SIZE(S) sizeof(S)
+#endif
+
+#ifndef PRSTATUS_SIZE
+#define PRSTATUS_SIZE(S) sizeof(S)
+#endif
+
+#ifndef PR_REG_PTR
+#define PR_REG_PTR(S) (&((S)->pr_reg))
+#endif
+
+#ifndef SET_PR_FPVALID
+#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V))
+#endif
+
+static int fill_thread_core_info(struct elf_thread_core_info *t,
+				 const struct user_regset_view *view,
+				 long signr, size_t *total)
+{
+	unsigned int i;
+
+	/*
+	 * NT_PRSTATUS is the one special case, because the regset data
+	 * goes into the pr_reg field inside the note contents, rather
+	 * than being the whole note contents.  We fill the reset in here.
+	 * We assume that regset 0 is NT_PRSTATUS.
+	 */
+	fill_prstatus(&t->prstatus, t->task, signr);
+	(void) view->regsets[0].get(t->task, &view->regsets[0],
+				    0, PR_REG_SIZE(t->prstatus.pr_reg),
+				    PR_REG_PTR(&t->prstatus), NULL);
+
+	fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
+		  PRSTATUS_SIZE(t->prstatus), &t->prstatus);
+	*total += notesize(&t->notes[0]);
+
+	do_thread_regset_writeback(t->task, &view->regsets[0]);
+
+	/*
+	 * Each other regset might generate a note too.  For each regset
+	 * that has no core_note_type or is inactive, we leave t->notes[i]
+	 * all zero and we'll know to skip writing it later.
+	 */
+	for (i = 1; i < view->n; ++i) {
+		const struct user_regset *regset = &view->regsets[i];
+		do_thread_regset_writeback(t->task, regset);
+		if (regset->core_note_type && regset->get &&
+		    (!regset->active || regset->active(t->task, regset))) {
+			int ret;
+			size_t size = regset->n * regset->size;
+			void *data = kmalloc(size, GFP_KERNEL);
+			if (unlikely(!data))
+				return 0;
+			ret = regset->get(t->task, regset,
+					  0, size, data, NULL);
+			if (unlikely(ret))
+				kfree(data);
+			else {
+				if (regset->core_note_type != NT_PRFPREG)
+					fill_note(&t->notes[i], "LINUX",
+						  regset->core_note_type,
+						  size, data);
+				else {
+					SET_PR_FPVALID(&t->prstatus, 1);
+					fill_note(&t->notes[i], "CORE",
+						  NT_PRFPREG, size, data);
+				}
+				*total += notesize(&t->notes[i]);
+			}
+		}
+	}
+
+	return 1;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+			  struct elf_note_info *info,
+			  const siginfo_t *siginfo, struct pt_regs *regs)
+{
+	struct task_struct *dump_task = current;
+	const struct user_regset_view *view = task_user_regset_view(dump_task);
+	struct elf_thread_core_info *t;
+	struct elf_prpsinfo *psinfo;
+	struct core_thread *ct;
+	unsigned int i;
+
+	info->size = 0;
+	info->thread = NULL;
+
+	psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
+	if (psinfo == NULL) {
+		info->psinfo.data = NULL; /* So we don't free this wrongly */
+		return 0;
+	}
+
+	fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+
+	/*
+	 * Figure out how many notes we're going to need for each thread.
+	 */
+	info->thread_notes = 0;
+	for (i = 0; i < view->n; ++i)
+		if (view->regsets[i].core_note_type != 0)
+			++info->thread_notes;
+
+	/*
+	 * Sanity check.  We rely on regset 0 being in NT_PRSTATUS,
+	 * since it is our one special case.
+	 */
+	if (unlikely(info->thread_notes == 0) ||
+	    unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) {
+		WARN_ON(1);
+		return 0;
+	}
+
+	/*
+	 * Initialize the ELF file header.
+	 */
+	fill_elf_header(elf, phdrs,
+			view->e_machine, view->e_flags);
+
+	/*
+	 * Allocate a structure for each thread.
+	 */
+	for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+		t = kzalloc(offsetof(struct elf_thread_core_info,
+				     notes[info->thread_notes]),
+			    GFP_KERNEL);
+		if (unlikely(!t))
+			return 0;
+
+		t->task = ct->task;
+		if (ct->task == dump_task || !info->thread) {
+			t->next = info->thread;
+			info->thread = t;
+		} else {
+			/*
+			 * Make sure to keep the original task at
+			 * the head of the list.
+			 */
+			t->next = info->thread->next;
+			info->thread->next = t;
+		}
+	}
+
+	/*
+	 * Now fill in each thread's information.
+	 */
+	for (t = info->thread; t != NULL; t = t->next)
+		if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
+			return 0;
+
+	/*
+	 * Fill in the two process-wide notes.
+	 */
+	fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
+	info->size += notesize(&info->psinfo);
+
+	fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+	info->size += notesize(&info->signote);
+
+	fill_auxv_note(&info->auxv, current->mm);
+	info->size += notesize(&info->auxv);
+
+	if (fill_files_note(&info->files) == 0)
+		info->size += notesize(&info->files);
+
+	return 1;
+}
+
+static size_t get_note_info_size(struct elf_note_info *info)
+{
+	return info->size;
+}
+
+/*
+ * Write all the notes for each thread.  When writing the first thread, the
+ * process-wide notes are interleaved after the first thread-specific note.
+ */
+static int write_note_info(struct elf_note_info *info,
+			   struct coredump_params *cprm)
+{
+	bool first = true;
+	struct elf_thread_core_info *t = info->thread;
+
+	do {
+		int i;
+
+		if (!writenote(&t->notes[0], cprm))
+			return 0;
+
+		if (first && !writenote(&info->psinfo, cprm))
+			return 0;
+		if (first && !writenote(&info->signote, cprm))
+			return 0;
+		if (first && !writenote(&info->auxv, cprm))
+			return 0;
+		if (first && info->files.data &&
+				!writenote(&info->files, cprm))
+			return 0;
+
+		for (i = 1; i < info->thread_notes; ++i)
+			if (t->notes[i].data &&
+			    !writenote(&t->notes[i], cprm))
+				return 0;
+
+		first = false;
+		t = t->next;
+	} while (t);
+
+	return 1;
+}
+
+static void free_note_info(struct elf_note_info *info)
+{
+	struct elf_thread_core_info *threads = info->thread;
+	while (threads) {
+		unsigned int i;
+		struct elf_thread_core_info *t = threads;
+		threads = t->next;
+		WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus);
+		for (i = 1; i < info->thread_notes; ++i)
+			kfree(t->notes[i].data);
+		kfree(t);
+	}
+	kfree(info->psinfo.data);
+	vfree(info->files.data);
+}
+
+#else
+
+/* Here is the structure in which status of each thread is captured. */
+struct elf_thread_status
+{
+	struct list_head list;
+	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
+	elf_fpregset_t fpu;		/* NT_PRFPREG */
+	struct task_struct *thread;
+#ifdef ELF_CORE_COPY_XFPREGS
+	elf_fpxregset_t xfpu;		/* ELF_CORE_XFPREG_TYPE */
+#endif
+	struct memelfnote notes[3];
+	int num_notes;
+};
+
+/*
+ * In order to add the specific thread information for the elf file format,
+ * we need to keep a linked list of every threads pr_status and then create
+ * a single section for them in the final core file.
+ */
+static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
+{
+	int sz = 0;
+	struct task_struct *p = t->thread;
+	t->num_notes = 0;
+
+	fill_prstatus(&t->prstatus, p, signr);
+	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);	
+	
+	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+		  &(t->prstatus));
+	t->num_notes++;
+	sz += notesize(&t->notes[0]);
+
+	if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL,
+								&t->fpu))) {
+		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
+			  &(t->fpu));
+		t->num_notes++;
+		sz += notesize(&t->notes[1]);
+	}
+
+#ifdef ELF_CORE_COPY_XFPREGS
+	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
+		fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE,
+			  sizeof(t->xfpu), &t->xfpu);
+		t->num_notes++;
+		sz += notesize(&t->notes[2]);
+	}
+#endif	
+	return sz;
+}
+
+struct elf_note_info {
+	struct memelfnote *notes;
+	struct memelfnote *notes_files;
+	struct elf_prstatus *prstatus;	/* NT_PRSTATUS */
+	struct elf_prpsinfo *psinfo;	/* NT_PRPSINFO */
+	struct list_head thread_list;
+	elf_fpregset_t *fpu;
+#ifdef ELF_CORE_COPY_XFPREGS
+	elf_fpxregset_t *xfpu;
+#endif
+	user_siginfo_t csigdata;
+	int thread_status_size;
+	int numnote;
+};
+
+static int elf_note_info_init(struct elf_note_info *info)
+{
+	memset(info, 0, sizeof(*info));
+	INIT_LIST_HEAD(&info->thread_list);
+
+	/* Allocate space for ELF notes */
+	info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL);
+	if (!info->notes)
+		return 0;
+	info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
+	if (!info->psinfo)
+		return 0;
+	info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
+	if (!info->prstatus)
+		return 0;
+	info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
+	if (!info->fpu)
+		return 0;
+#ifdef ELF_CORE_COPY_XFPREGS
+	info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
+	if (!info->xfpu)
+		return 0;
+#endif
+	return 1;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+			  struct elf_note_info *info,
+			  const siginfo_t *siginfo, struct pt_regs *regs)
+{
+	struct list_head *t;
+	struct core_thread *ct;
+	struct elf_thread_status *ets;
+
+	if (!elf_note_info_init(info))
+		return 0;
+
+	for (ct = current->mm->core_state->dumper.next;
+					ct; ct = ct->next) {
+		ets = kzalloc(sizeof(*ets), GFP_KERNEL);
+		if (!ets)
+			return 0;
+
+		ets->thread = ct->task;
+		list_add(&ets->list, &info->thread_list);
+	}
+
+	list_for_each(t, &info->thread_list) {
+		int sz;
+
+		ets = list_entry(t, struct elf_thread_status, list);
+		sz = elf_dump_thread_status(siginfo->si_signo, ets);
+		info->thread_status_size += sz;
+	}
+	/* now collect the dump for the current */
+	memset(info->prstatus, 0, sizeof(*info->prstatus));
+	fill_prstatus(info->prstatus, current, siginfo->si_signo);
+	elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+
+	/* Set up header */
+	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
+
+	/*
+	 * Set up the notes in similar form to SVR4 core dumps made
+	 * with info from their /proc.
+	 */
+
+	fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
+		  sizeof(*info->prstatus), info->prstatus);
+	fill_psinfo(info->psinfo, current->group_leader, current->mm);
+	fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
+		  sizeof(*info->psinfo), info->psinfo);
+
+	fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+	fill_auxv_note(info->notes + 3, current->mm);
+	info->numnote = 4;
+
+	if (fill_files_note(info->notes + info->numnote) == 0) {
+		info->notes_files = info->notes + info->numnote;
+		info->numnote++;
+	}
+
+	/* Try to dump the FPU. */
+	info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
+							       info->fpu);
+	if (info->prstatus->pr_fpvalid)
+		fill_note(info->notes + info->numnote++,
+			  "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+	if (elf_core_copy_task_xfpregs(current, info->xfpu))
+		fill_note(info->notes + info->numnote++,
+			  "LINUX", ELF_CORE_XFPREG_TYPE,
+			  sizeof(*info->xfpu), info->xfpu);
+#endif
+
+	return 1;
+}
+
+static size_t get_note_info_size(struct elf_note_info *info)
+{
+	int sz = 0;
+	int i;
+
+	for (i = 0; i < info->numnote; i++)
+		sz += notesize(info->notes + i);
+
+	sz += info->thread_status_size;
+
+	return sz;
+}
+
+static int write_note_info(struct elf_note_info *info,
+			   struct coredump_params *cprm)
+{
+	int i;
+	struct list_head *t;
+
+	for (i = 0; i < info->numnote; i++)
+		if (!writenote(info->notes + i, cprm))
+			return 0;
+
+	/* write out the thread status notes section */
+	list_for_each(t, &info->thread_list) {
+		struct elf_thread_status *tmp =
+				list_entry(t, struct elf_thread_status, list);
+
+		for (i = 0; i < tmp->num_notes; i++)
+			if (!writenote(&tmp->notes[i], cprm))
+				return 0;
+	}
+
+	return 1;
+}
+
+static void free_note_info(struct elf_note_info *info)
+{
+	while (!list_empty(&info->thread_list)) {
+		struct list_head *tmp = info->thread_list.next;
+		list_del(tmp);
+		kfree(list_entry(tmp, struct elf_thread_status, list));
+	}
+
+	/* Free data possibly allocated by fill_files_note(): */
+	if (info->notes_files)
+		vfree(info->notes_files->data);
+
+	kfree(info->prstatus);
+	kfree(info->psinfo);
+	kfree(info->notes);
+	kfree(info->fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+	kfree(info->xfpu);
+#endif
+}
+
+#endif
+
+static struct vm_area_struct *first_vma(struct task_struct *tsk,
+					struct vm_area_struct *gate_vma)
+{
+	struct vm_area_struct *ret = tsk->mm->mmap;
+
+	if (ret)
+		return ret;
+	return gate_vma;
+}
+/*
+ * Helper function for iterating across a vma list.  It ensures that the caller
+ * will visit `gate_vma' prior to terminating the search.
+ */
+static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
+					struct vm_area_struct *gate_vma)
+{
+	struct vm_area_struct *ret;
+
+	ret = this_vma->vm_next;
+	if (ret)
+		return ret;
+	if (this_vma == gate_vma)
+		return NULL;
+	return gate_vma;
+}
+
+static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+			     elf_addr_t e_shoff, int segs)
+{
+	elf->e_shoff = e_shoff;
+	elf->e_shentsize = sizeof(*shdr4extnum);
+	elf->e_shnum = 1;
+	elf->e_shstrndx = SHN_UNDEF;
+
+	memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+
+	shdr4extnum->sh_type = SHT_NULL;
+	shdr4extnum->sh_size = elf->e_shnum;
+	shdr4extnum->sh_link = elf->e_shstrndx;
+	shdr4extnum->sh_info = segs;
+}
+
+/*
+ * Actual dumper
+ *
+ * This is a two-pass process; first we find the offsets of the bits,
+ * and then they are actually written out.  If we run out of core limit
+ * we just truncate.
+ */
+static int elf_core_dump(struct coredump_params *cprm)
+{
+	int has_dumped = 0;
+	mm_segment_t fs;
+	int segs, i;
+	size_t vma_data_size = 0;
+	struct vm_area_struct *vma, *gate_vma;
+	struct elfhdr *elf = NULL;
+	loff_t offset = 0, dataoff;
+	struct elf_note_info info = { };
+	struct elf_phdr *phdr4note = NULL;
+	struct elf_shdr *shdr4extnum = NULL;
+	Elf_Half e_phnum;
+	elf_addr_t e_shoff;
+	elf_addr_t *vma_filesz = NULL;
+
+	/*
+	 * We no longer stop all VM operations.
+	 * 
+	 * This is because those proceses that could possibly change map_count
+	 * or the mmap / vma pages are now blocked in do_exit on current
+	 * finishing this core dump.
+	 *
+	 * Only ptrace can touch these memory addresses, but it doesn't change
+	 * the map_count or the pages allocated. So no possibility of crashing
+	 * exists while dumping the mm->vm_next areas to the core file.
+	 */
+  
+	/* alloc memory for large data structures: too large to be on stack */
+	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
+	if (!elf)
+		goto out;
+	/*
+	 * The number of segs are recored into ELF header as 16bit value.
+	 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
+	 */
+	segs = current->mm->map_count;
+	segs += elf_core_extra_phdrs();
+
+	gate_vma = get_gate_vma(current->mm);
+	if (gate_vma != NULL)
+		segs++;
+
+	/* for notes section */
+	segs++;
+
+	/* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+	 * this, kernel supports extended numbering. Have a look at
+	 * include/linux/elf.h for further information. */
+	e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
+
+	/*
+	 * Collect all the non-memory information about the process for the
+	 * notes.  This also sets up the file header.
+	 */
+	if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+		goto cleanup;
+
+	has_dumped = 1;
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	offset += sizeof(*elf);				/* Elf header */
+	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
+
+	/* Write notes phdr entry */
+	{
+		size_t sz = get_note_info_size(&info);
+
+		sz += elf_coredump_extra_notes_size();
+
+		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+		if (!phdr4note)
+			goto end_coredump;
+
+		fill_elf_note_phdr(phdr4note, sz, offset);
+		offset += sz;
+	}
+
+	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
+
+	vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL);
+	if (!vma_filesz)
+		goto end_coredump;
+
+	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+			vma = next_vma(vma, gate_vma)) {
+		unsigned long dump_size;
+
+		dump_size = vma_dump_size(vma, cprm->mm_flags);
+		vma_filesz[i++] = dump_size;
+		vma_data_size += dump_size;
+	}
+
+	offset += vma_data_size;
+	offset += elf_core_extra_data_size();
+	e_shoff = offset;
+
+	if (e_phnum == PN_XNUM) {
+		shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+		if (!shdr4extnum)
+			goto end_coredump;
+		fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+	}
+
+	offset = dataoff;
+
+	if (!dump_emit(cprm, elf, sizeof(*elf)))
+		goto end_coredump;
+
+	if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
+		goto end_coredump;
+
+	/* Write program headers for segments dump */
+	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+			vma = next_vma(vma, gate_vma)) {
+		struct elf_phdr phdr;
+
+		phdr.p_type = PT_LOAD;
+		phdr.p_offset = offset;
+		phdr.p_vaddr = vma->vm_start;
+		phdr.p_paddr = 0;
+		phdr.p_filesz = vma_filesz[i++];
+		phdr.p_memsz = vma->vm_end - vma->vm_start;
+		offset += phdr.p_filesz;
+		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
+		if (vma->vm_flags & VM_WRITE)
+			phdr.p_flags |= PF_W;
+		if (vma->vm_flags & VM_EXEC)
+			phdr.p_flags |= PF_X;
+		phdr.p_align = ELF_EXEC_PAGESIZE;
+
+		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
+			goto end_coredump;
+	}
+
+	if (!elf_core_write_extra_phdrs(cprm, offset))
+		goto end_coredump;
+
+ 	/* write out the notes section */
+	if (!write_note_info(&info, cprm))
+		goto end_coredump;
+
+	if (elf_coredump_extra_notes_write(cprm))
+		goto end_coredump;
+
+	/* Align to page */
+	if (!dump_skip(cprm, dataoff - cprm->written))
+		goto end_coredump;
+
+	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+			vma = next_vma(vma, gate_vma)) {
+		unsigned long addr;
+		unsigned long end;
+
+		end = vma->vm_start + vma_filesz[i++];
+
+		for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
+			struct page *page;
+			int stop;
+
+			page = get_dump_page(addr);
+			if (page) {
+				void *kaddr = kmap(page);
+				stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
+				kunmap(page);
+				page_cache_release(page);
+			} else
+				stop = !dump_skip(cprm, PAGE_SIZE);
+			if (stop)
+				goto end_coredump;
+		}
+	}
+
+	if (!elf_core_write_extra_data(cprm))
+		goto end_coredump;
+
+	if (e_phnum == PN_XNUM) {
+		if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
+			goto end_coredump;
+	}
+
+end_coredump:
+	set_fs(fs);
+
+cleanup:
+	free_note_info(&info);
+	kfree(shdr4extnum);
+	kfree(vma_filesz);
+	kfree(phdr4note);
+	kfree(elf);
+out:
+	return has_dumped;
+}
+
+#endif		/* CONFIG_ELF_CORE */
+
+static int __init init_elf_binfmt(void)
+{
+	register_binfmt(&elf_format);
+	return 0;
+}
+
+static void __exit exit_elf_binfmt(void)
+{
+	/* Remove the COFF and ELF loaders. */
+	unregister_binfmt(&elf_format);
+}
+
+core_initcall(init_elf_binfmt);
+module_exit(exit_elf_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/binfmt_elf_fdpic.c b/kernel/fs/binfmt_elf_fdpic.c
new file mode 100644
index 000000000..d3634bfb7
--- /dev/null
+++ b/kernel/fs/binfmt_elf_fdpic.c
@@ -0,0 +1,1794 @@
+/* binfmt_elf_fdpic.c: FDPIC ELF binary format
+ *
+ * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * Derived from binfmt_elf.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/binfmts.h>
+#include <linux/string.h>
+#include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#include <linux/highmem.h>
+#include <linux/highuid.h>
+#include <linux/personality.h>
+#include <linux/ptrace.h>
+#include <linux/init.h>
+#include <linux/elf.h>
+#include <linux/elf-fdpic.h>
+#include <linux/elfcore.h>
+#include <linux/coredump.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+#include <asm/pgalloc.h>
+
+typedef char *elf_caddr_t;
+
+#if 0
+#define kdebug(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ )
+#else
+#define kdebug(fmt, ...) do {} while(0)
+#endif
+
+#if 0
+#define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ )
+#else
+#define kdcore(fmt, ...) do {} while(0)
+#endif
+
+MODULE_LICENSE("GPL");
+
+static int load_elf_fdpic_binary(struct linux_binprm *);
+static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
+static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
+			      struct mm_struct *, const char *);
+
+static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *,
+				   struct elf_fdpic_params *,
+				   struct elf_fdpic_params *);
+
+#ifndef CONFIG_MMU
+static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *,
+					    unsigned long *);
+static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
+						   struct file *,
+						   struct mm_struct *);
+#endif
+
+static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *,
+					     struct file *, struct mm_struct *);
+
+#ifdef CONFIG_ELF_CORE
+static int elf_fdpic_core_dump(struct coredump_params *cprm);
+#endif
+
+static struct linux_binfmt elf_fdpic_format = {
+	.module		= THIS_MODULE,
+	.load_binary	= load_elf_fdpic_binary,
+#ifdef CONFIG_ELF_CORE
+	.core_dump	= elf_fdpic_core_dump,
+#endif
+	.min_coredump	= ELF_EXEC_PAGESIZE,
+};
+
+static int __init init_elf_fdpic_binfmt(void)
+{
+	register_binfmt(&elf_fdpic_format);
+	return 0;
+}
+
+static void __exit exit_elf_fdpic_binfmt(void)
+{
+	unregister_binfmt(&elf_fdpic_format);
+}
+
+core_initcall(init_elf_fdpic_binfmt);
+module_exit(exit_elf_fdpic_binfmt);
+
+static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
+{
+	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0)
+		return 0;
+	if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)
+		return 0;
+	if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
+		return 0;
+	if (!file->f_op->mmap)
+		return 0;
+	return 1;
+}
+
+/*****************************************************************************/
+/*
+ * read the program headers table into memory
+ */
+static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
+				 struct file *file)
+{
+	struct elf32_phdr *phdr;
+	unsigned long size;
+	int retval, loop;
+
+	if (params->hdr.e_phentsize != sizeof(struct elf_phdr))
+		return -ENOMEM;
+	if (params->hdr.e_phnum > 65536U / sizeof(struct elf_phdr))
+		return -ENOMEM;
+
+	size = params->hdr.e_phnum * sizeof(struct elf_phdr);
+	params->phdrs = kmalloc(size, GFP_KERNEL);
+	if (!params->phdrs)
+		return -ENOMEM;
+
+	retval = kernel_read(file, params->hdr.e_phoff,
+			     (char *) params->phdrs, size);
+	if (unlikely(retval != size))
+		return retval < 0 ? retval : -ENOEXEC;
+
+	/* determine stack size for this binary */
+	phdr = params->phdrs;
+	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
+		if (phdr->p_type != PT_GNU_STACK)
+			continue;
+
+		if (phdr->p_flags & PF_X)
+			params->flags |= ELF_FDPIC_FLAG_EXEC_STACK;
+		else
+			params->flags |= ELF_FDPIC_FLAG_NOEXEC_STACK;
+
+		params->stack_size = phdr->p_memsz;
+		break;
+	}
+
+	return 0;
+}
+
+/*****************************************************************************/
+/*
+ * load an fdpic binary into various bits of memory
+ */
+static int load_elf_fdpic_binary(struct linux_binprm *bprm)
+{
+	struct elf_fdpic_params exec_params, interp_params;
+	struct pt_regs *regs = current_pt_regs();
+	struct elf_phdr *phdr;
+	unsigned long stack_size, entryaddr;
+#ifdef ELF_FDPIC_PLAT_INIT
+	unsigned long dynaddr;
+#endif
+#ifndef CONFIG_MMU
+	unsigned long stack_prot;
+#endif
+	struct file *interpreter = NULL; /* to shut gcc up */
+	char *interpreter_name = NULL;
+	int executable_stack;
+	int retval, i;
+
+	kdebug("____ LOAD %d ____", current->pid);
+
+	memset(&exec_params, 0, sizeof(exec_params));
+	memset(&interp_params, 0, sizeof(interp_params));
+
+	exec_params.hdr = *(struct elfhdr *) bprm->buf;
+	exec_params.flags = ELF_FDPIC_FLAG_PRESENT | ELF_FDPIC_FLAG_EXECUTABLE;
+
+	/* check that this is a binary we know how to deal with */
+	retval = -ENOEXEC;
+	if (!is_elf_fdpic(&exec_params.hdr, bprm->file))
+		goto error;
+
+	/* read the program header table */
+	retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file);
+	if (retval < 0)
+		goto error;
+
+	/* scan for a program header that specifies an interpreter */
+	phdr = exec_params.phdrs;
+
+	for (i = 0; i < exec_params.hdr.e_phnum; i++, phdr++) {
+		switch (phdr->p_type) {
+		case PT_INTERP:
+			retval = -ENOMEM;
+			if (phdr->p_filesz > PATH_MAX)
+				goto error;
+			retval = -ENOENT;
+			if (phdr->p_filesz < 2)
+				goto error;
+
+			/* read the name of the interpreter into memory */
+			interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL);
+			if (!interpreter_name)
+				goto error;
+
+			retval = kernel_read(bprm->file,
+					     phdr->p_offset,
+					     interpreter_name,
+					     phdr->p_filesz);
+			if (unlikely(retval != phdr->p_filesz)) {
+				if (retval >= 0)
+					retval = -ENOEXEC;
+				goto error;
+			}
+
+			retval = -ENOENT;
+			if (interpreter_name[phdr->p_filesz - 1] != '\0')
+				goto error;
+
+			kdebug("Using ELF interpreter %s", interpreter_name);
+
+			/* replace the program with the interpreter */
+			interpreter = open_exec(interpreter_name);
+			retval = PTR_ERR(interpreter);
+			if (IS_ERR(interpreter)) {
+				interpreter = NULL;
+				goto error;
+			}
+
+			/*
+			 * If the binary is not readable then enforce
+			 * mm->dumpable = 0 regardless of the interpreter's
+			 * permissions.
+			 */
+			would_dump(bprm, interpreter);
+
+			retval = kernel_read(interpreter, 0, bprm->buf,
+					     BINPRM_BUF_SIZE);
+			if (unlikely(retval != BINPRM_BUF_SIZE)) {
+				if (retval >= 0)
+					retval = -ENOEXEC;
+				goto error;
+			}
+
+			interp_params.hdr = *((struct elfhdr *) bprm->buf);
+			break;
+
+		case PT_LOAD:
+#ifdef CONFIG_MMU
+			if (exec_params.load_addr == 0)
+				exec_params.load_addr = phdr->p_vaddr;
+#endif
+			break;
+		}
+
+	}
+
+	if (elf_check_const_displacement(&exec_params.hdr))
+		exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
+
+	/* perform insanity checks on the interpreter */
+	if (interpreter_name) {
+		retval = -ELIBBAD;
+		if (!is_elf_fdpic(&interp_params.hdr, interpreter))
+			goto error;
+
+		interp_params.flags = ELF_FDPIC_FLAG_PRESENT;
+
+		/* read the interpreter's program header table */
+		retval = elf_fdpic_fetch_phdrs(&interp_params, interpreter);
+		if (retval < 0)
+			goto error;
+	}
+
+	stack_size = exec_params.stack_size;
+	if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
+		executable_stack = EXSTACK_ENABLE_X;
+	else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
+		executable_stack = EXSTACK_DISABLE_X;
+	else
+		executable_stack = EXSTACK_DEFAULT;
+
+	if (stack_size == 0) {
+		stack_size = interp_params.stack_size;
+		if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
+			executable_stack = EXSTACK_ENABLE_X;
+		else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
+			executable_stack = EXSTACK_DISABLE_X;
+		else
+			executable_stack = EXSTACK_DEFAULT;
+	}
+
+	retval = -ENOEXEC;
+	if (stack_size == 0)
+		goto error;
+
+	if (elf_check_const_displacement(&interp_params.hdr))
+		interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
+
+	/* flush all traces of the currently running executable */
+	retval = flush_old_exec(bprm);
+	if (retval)
+		goto error;
+
+	/* there's now no turning back... the old userspace image is dead,
+	 * defunct, deceased, etc.
+	 */
+	set_personality(PER_LINUX_FDPIC);
+	if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
+		current->personality |= READ_IMPLIES_EXEC;
+
+	setup_new_exec(bprm);
+
+	set_binfmt(&elf_fdpic_format);
+
+	current->mm->start_code = 0;
+	current->mm->end_code = 0;
+	current->mm->start_stack = 0;
+	current->mm->start_data = 0;
+	current->mm->end_data = 0;
+	current->mm->context.exec_fdpic_loadmap = 0;
+	current->mm->context.interp_fdpic_loadmap = 0;
+
+#ifdef CONFIG_MMU
+	elf_fdpic_arch_lay_out_mm(&exec_params,
+				  &interp_params,
+				  &current->mm->start_stack,
+				  &current->mm->start_brk);
+
+	retval = setup_arg_pages(bprm, current->mm->start_stack,
+				 executable_stack);
+	if (retval < 0)
+		goto error;
+#endif
+
+	/* load the executable and interpreter into memory */
+	retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm,
+				    "executable");
+	if (retval < 0)
+		goto error;
+
+	if (interpreter_name) {
+		retval = elf_fdpic_map_file(&interp_params, interpreter,
+					    current->mm, "interpreter");
+		if (retval < 0) {
+			printk(KERN_ERR "Unable to load interpreter\n");
+			goto error;
+		}
+
+		allow_write_access(interpreter);
+		fput(interpreter);
+		interpreter = NULL;
+	}
+
+#ifdef CONFIG_MMU
+	if (!current->mm->start_brk)
+		current->mm->start_brk = current->mm->end_data;
+
+	current->mm->brk = current->mm->start_brk =
+		PAGE_ALIGN(current->mm->start_brk);
+
+#else
+	/* create a stack and brk area big enough for everyone
+	 * - the brk heap starts at the bottom and works up
+	 * - the stack starts at the top and works down
+	 */
+	stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK;
+	if (stack_size < PAGE_SIZE * 2)
+		stack_size = PAGE_SIZE * 2;
+
+	stack_prot = PROT_READ | PROT_WRITE;
+	if (executable_stack == EXSTACK_ENABLE_X ||
+	    (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
+		stack_prot |= PROT_EXEC;
+
+	current->mm->start_brk = vm_mmap(NULL, 0, stack_size, stack_prot,
+					 MAP_PRIVATE | MAP_ANONYMOUS |
+					 MAP_UNINITIALIZED | MAP_GROWSDOWN,
+					 0);
+
+	if (IS_ERR_VALUE(current->mm->start_brk)) {
+		retval = current->mm->start_brk;
+		current->mm->start_brk = 0;
+		goto error;
+	}
+
+	current->mm->brk = current->mm->start_brk;
+	current->mm->context.end_brk = current->mm->start_brk;
+	current->mm->context.end_brk +=
+		(stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
+	current->mm->start_stack = current->mm->start_brk + stack_size;
+#endif
+
+	install_exec_creds(bprm);
+	if (create_elf_fdpic_tables(bprm, current->mm,
+				    &exec_params, &interp_params) < 0)
+		goto error;
+
+	kdebug("- start_code  %lx", current->mm->start_code);
+	kdebug("- end_code    %lx", current->mm->end_code);
+	kdebug("- start_data  %lx", current->mm->start_data);
+	kdebug("- end_data    %lx", current->mm->end_data);
+	kdebug("- start_brk   %lx", current->mm->start_brk);
+	kdebug("- brk         %lx", current->mm->brk);
+	kdebug("- start_stack %lx", current->mm->start_stack);
+
+#ifdef ELF_FDPIC_PLAT_INIT
+	/*
+	 * The ABI may specify that certain registers be set up in special
+	 * ways (on i386 %edx is the address of a DT_FINI function, for
+	 * example.  This macro performs whatever initialization to
+	 * the regs structure is required.
+	 */
+	dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr;
+	ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr,
+			    dynaddr);
+#endif
+
+	/* everything is now ready... get the userspace context ready to roll */
+	entryaddr = interp_params.entry_addr ?: exec_params.entry_addr;
+	start_thread(regs, entryaddr, current->mm->start_stack);
+
+	retval = 0;
+
+error:
+	if (interpreter) {
+		allow_write_access(interpreter);
+		fput(interpreter);
+	}
+	kfree(interpreter_name);
+	kfree(exec_params.phdrs);
+	kfree(exec_params.loadmap);
+	kfree(interp_params.phdrs);
+	kfree(interp_params.loadmap);
+	return retval;
+}
+
+/*****************************************************************************/
+
+#ifndef ELF_BASE_PLATFORM
+/*
+ * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+ * will be copied to the user stack in the same manner as AT_PLATFORM.
+ */
+#define ELF_BASE_PLATFORM NULL
+#endif
+
+/*
+ * present useful information to the program by shovelling it onto the new
+ * process's stack
+ */
+static int create_elf_fdpic_tables(struct linux_binprm *bprm,
+				   struct mm_struct *mm,
+				   struct elf_fdpic_params *exec_params,
+				   struct elf_fdpic_params *interp_params)
+{
+	const struct cred *cred = current_cred();
+	unsigned long sp, csp, nitems;
+	elf_caddr_t __user *argv, *envp;
+	size_t platform_len = 0, len;
+	char *k_platform, *k_base_platform;
+	char __user *u_platform, *u_base_platform, *p;
+	int loop;
+	int nr;	/* reset for each csp adjustment */
+
+#ifdef CONFIG_MMU
+	/* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
+	 * by the processes running on the same package. One thing we can do is
+	 * to shuffle the initial stack for them, so we give the architecture
+	 * an opportunity to do so here.
+	 */
+	sp = arch_align_stack(bprm->p);
+#else
+	sp = mm->start_stack;
+
+	/* stack the program arguments and environment */
+	if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0)
+		return -EFAULT;
+#endif
+
+	/*
+	 * If this architecture has a platform capability string, copy it
+	 * to userspace.  In some cases (Sparc), this info is impossible
+	 * for userspace to get any other way, in others (i386) it is
+	 * merely difficult.
+	 */
+	k_platform = ELF_PLATFORM;
+	u_platform = NULL;
+
+	if (k_platform) {
+		platform_len = strlen(k_platform) + 1;
+		sp -= platform_len;
+		u_platform = (char __user *) sp;
+		if (__copy_to_user(u_platform, k_platform, platform_len) != 0)
+			return -EFAULT;
+	}
+
+	/*
+	 * If this architecture has a "base" platform capability
+	 * string, copy it to userspace.
+	 */
+	k_base_platform = ELF_BASE_PLATFORM;
+	u_base_platform = NULL;
+
+	if (k_base_platform) {
+		platform_len = strlen(k_base_platform) + 1;
+		sp -= platform_len;
+		u_base_platform = (char __user *) sp;
+		if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0)
+			return -EFAULT;
+	}
+
+	sp &= ~7UL;
+
+	/* stack the load map(s) */
+	len = sizeof(struct elf32_fdpic_loadmap);
+	len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs;
+	sp = (sp - len) & ~7UL;
+	exec_params->map_addr = sp;
+
+	if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0)
+		return -EFAULT;
+
+	current->mm->context.exec_fdpic_loadmap = (unsigned long) sp;
+
+	if (interp_params->loadmap) {
+		len = sizeof(struct elf32_fdpic_loadmap);
+		len += sizeof(struct elf32_fdpic_loadseg) *
+			interp_params->loadmap->nsegs;
+		sp = (sp - len) & ~7UL;
+		interp_params->map_addr = sp;
+
+		if (copy_to_user((void __user *) sp, interp_params->loadmap,
+				 len) != 0)
+			return -EFAULT;
+
+		current->mm->context.interp_fdpic_loadmap = (unsigned long) sp;
+	}
+
+	/* force 16 byte _final_ alignment here for generality */
+#define DLINFO_ITEMS 15
+
+	nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) +
+		(k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH;
+
+	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD)
+		nitems++;
+
+	csp = sp;
+	sp -= nitems * 2 * sizeof(unsigned long);
+	sp -= (bprm->envc + 1) * sizeof(char *);	/* envv[] */
+	sp -= (bprm->argc + 1) * sizeof(char *);	/* argv[] */
+	sp -= 1 * sizeof(unsigned long);		/* argc */
+
+	csp -= sp & 15UL;
+	sp -= sp & 15UL;
+
+	/* put the ELF interpreter info on the stack */
+#define NEW_AUX_ENT(id, val)						\
+	do {								\
+		struct { unsigned long _id, _val; } __user *ent;	\
+									\
+		ent = (void __user *) csp;				\
+		__put_user((id), &ent[nr]._id);				\
+		__put_user((val), &ent[nr]._val);			\
+		nr++;							\
+	} while (0)
+
+	nr = 0;
+	csp -= 2 * sizeof(unsigned long);
+	NEW_AUX_ENT(AT_NULL, 0);
+	if (k_platform) {
+		nr = 0;
+		csp -= 2 * sizeof(unsigned long);
+		NEW_AUX_ENT(AT_PLATFORM,
+			    (elf_addr_t) (unsigned long) u_platform);
+	}
+
+	if (k_base_platform) {
+		nr = 0;
+		csp -= 2 * sizeof(unsigned long);
+		NEW_AUX_ENT(AT_BASE_PLATFORM,
+			    (elf_addr_t) (unsigned long) u_base_platform);
+	}
+
+	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
+		nr = 0;
+		csp -= 2 * sizeof(unsigned long);
+		NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
+	}
+
+	nr = 0;
+	csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
+	NEW_AUX_ENT(AT_HWCAP,	ELF_HWCAP);
+#ifdef ELF_HWCAP2
+	NEW_AUX_ENT(AT_HWCAP2,	ELF_HWCAP2);
+#endif
+	NEW_AUX_ENT(AT_PAGESZ,	PAGE_SIZE);
+	NEW_AUX_ENT(AT_CLKTCK,	CLOCKS_PER_SEC);
+	NEW_AUX_ENT(AT_PHDR,	exec_params->ph_addr);
+	NEW_AUX_ENT(AT_PHENT,	sizeof(struct elf_phdr));
+	NEW_AUX_ENT(AT_PHNUM,	exec_params->hdr.e_phnum);
+	NEW_AUX_ENT(AT_BASE,	interp_params->elfhdr_addr);
+	NEW_AUX_ENT(AT_FLAGS,	0);
+	NEW_AUX_ENT(AT_ENTRY,	exec_params->entry_addr);
+	NEW_AUX_ENT(AT_UID,	(elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid));
+	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid));
+	NEW_AUX_ENT(AT_GID,	(elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid));
+	NEW_AUX_ENT(AT_EGID,	(elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid));
+	NEW_AUX_ENT(AT_SECURE,	security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_EXECFN,	bprm->exec);
+
+#ifdef ARCH_DLINFO
+	nr = 0;
+	csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long);
+
+	/* ARCH_DLINFO must come last so platform specific code can enforce
+	 * special alignment requirements on the AUXV if necessary (eg. PPC).
+	 */
+	ARCH_DLINFO;
+#endif
+#undef NEW_AUX_ENT
+
+	/* allocate room for argv[] and envv[] */
+	csp -= (bprm->envc + 1) * sizeof(elf_caddr_t);
+	envp = (elf_caddr_t __user *) csp;
+	csp -= (bprm->argc + 1) * sizeof(elf_caddr_t);
+	argv = (elf_caddr_t __user *) csp;
+
+	/* stack argc */
+	csp -= sizeof(unsigned long);
+	__put_user(bprm->argc, (unsigned long __user *) csp);
+
+	BUG_ON(csp != sp);
+
+	/* fill in the argv[] array */
+#ifdef CONFIG_MMU
+	current->mm->arg_start = bprm->p;
+#else
+	current->mm->arg_start = current->mm->start_stack -
+		(MAX_ARG_PAGES * PAGE_SIZE - bprm->p);
+#endif
+
+	p = (char __user *) current->mm->arg_start;
+	for (loop = bprm->argc; loop > 0; loop--) {
+		__put_user((elf_caddr_t) p, argv++);
+		len = strnlen_user(p, MAX_ARG_STRLEN);
+		if (!len || len > MAX_ARG_STRLEN)
+			return -EINVAL;
+		p += len;
+	}
+	__put_user(NULL, argv);
+	current->mm->arg_end = (unsigned long) p;
+
+	/* fill in the envv[] array */
+	current->mm->env_start = (unsigned long) p;
+	for (loop = bprm->envc; loop > 0; loop--) {
+		__put_user((elf_caddr_t)(unsigned long) p, envp++);
+		len = strnlen_user(p, MAX_ARG_STRLEN);
+		if (!len || len > MAX_ARG_STRLEN)
+			return -EINVAL;
+		p += len;
+	}
+	__put_user(NULL, envp);
+	current->mm->env_end = (unsigned long) p;
+
+	mm->start_stack = (unsigned long) sp;
+	return 0;
+}
+
+/*****************************************************************************/
+/*
+ * transfer the program arguments and environment from the holding pages onto
+ * the stack
+ */
+#ifndef CONFIG_MMU
+static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm,
+					    unsigned long *_sp)
+{
+	unsigned long index, stop, sp;
+	char *src;
+	int ret = 0;
+
+	stop = bprm->p >> PAGE_SHIFT;
+	sp = *_sp;
+
+	for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
+		src = kmap(bprm->page[index]);
+		sp -= PAGE_SIZE;
+		if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0)
+			ret = -EFAULT;
+		kunmap(bprm->page[index]);
+		if (ret < 0)
+			goto out;
+	}
+
+	*_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15;
+
+out:
+	return ret;
+}
+#endif
+
+/*****************************************************************************/
+/*
+ * load the appropriate binary image (executable or interpreter) into memory
+ * - we assume no MMU is available
+ * - if no other PIC bits are set in params->hdr->e_flags
+ *   - we assume that the LOADable segments in the binary are independently relocatable
+ *   - we assume R/O executable segments are shareable
+ * - else
+ *   - we assume the loadable parts of the image to require fixed displacement
+ *   - the image is not shareable
+ */
+static int elf_fdpic_map_file(struct elf_fdpic_params *params,
+			      struct file *file,
+			      struct mm_struct *mm,
+			      const char *what)
+{
+	struct elf32_fdpic_loadmap *loadmap;
+#ifdef CONFIG_MMU
+	struct elf32_fdpic_loadseg *mseg;
+#endif
+	struct elf32_fdpic_loadseg *seg;
+	struct elf32_phdr *phdr;
+	unsigned long load_addr, stop;
+	unsigned nloads, tmp;
+	size_t size;
+	int loop, ret;
+
+	/* allocate a load map table */
+	nloads = 0;
+	for (loop = 0; loop < params->hdr.e_phnum; loop++)
+		if (params->phdrs[loop].p_type == PT_LOAD)
+			nloads++;
+
+	if (nloads == 0)
+		return -ELIBBAD;
+
+	size = sizeof(*loadmap) + nloads * sizeof(*seg);
+	loadmap = kzalloc(size, GFP_KERNEL);
+	if (!loadmap)
+		return -ENOMEM;
+
+	params->loadmap = loadmap;
+
+	loadmap->version = ELF32_FDPIC_LOADMAP_VERSION;
+	loadmap->nsegs = nloads;
+
+	load_addr = params->load_addr;
+	seg = loadmap->segs;
+
+	/* map the requested LOADs into the memory space */
+	switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) {
+	case ELF_FDPIC_FLAG_CONSTDISP:
+	case ELF_FDPIC_FLAG_CONTIGUOUS:
+#ifndef CONFIG_MMU
+		ret = elf_fdpic_map_file_constdisp_on_uclinux(params, file, mm);
+		if (ret < 0)
+			return ret;
+		break;
+#endif
+	default:
+		ret = elf_fdpic_map_file_by_direct_mmap(params, file, mm);
+		if (ret < 0)
+			return ret;
+		break;
+	}
+
+	/* map the entry point */
+	if (params->hdr.e_entry) {
+		seg = loadmap->segs;
+		for (loop = loadmap->nsegs; loop > 0; loop--, seg++) {
+			if (params->hdr.e_entry >= seg->p_vaddr &&
+			    params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) {
+				params->entry_addr =
+					(params->hdr.e_entry - seg->p_vaddr) +
+					seg->addr;
+				break;
+			}
+		}
+	}
+
+	/* determine where the program header table has wound up if mapped */
+	stop = params->hdr.e_phoff;
+	stop += params->hdr.e_phnum * sizeof (struct elf_phdr);
+	phdr = params->phdrs;
+
+	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
+		if (phdr->p_type != PT_LOAD)
+			continue;
+
+		if (phdr->p_offset > params->hdr.e_phoff ||
+		    phdr->p_offset + phdr->p_filesz < stop)
+			continue;
+
+		seg = loadmap->segs;
+		for (loop = loadmap->nsegs; loop > 0; loop--, seg++) {
+			if (phdr->p_vaddr >= seg->p_vaddr &&
+			    phdr->p_vaddr + phdr->p_filesz <=
+			    seg->p_vaddr + seg->p_memsz) {
+				params->ph_addr =
+					(phdr->p_vaddr - seg->p_vaddr) +
+					seg->addr +
+					params->hdr.e_phoff - phdr->p_offset;
+				break;
+			}
+		}
+		break;
+	}
+
+	/* determine where the dynamic section has wound up if there is one */
+	phdr = params->phdrs;
+	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
+		if (phdr->p_type != PT_DYNAMIC)
+			continue;
+
+		seg = loadmap->segs;
+		for (loop = loadmap->nsegs; loop > 0; loop--, seg++) {
+			if (phdr->p_vaddr >= seg->p_vaddr &&
+			    phdr->p_vaddr + phdr->p_memsz <=
+			    seg->p_vaddr + seg->p_memsz) {
+				params->dynamic_addr =
+					(phdr->p_vaddr - seg->p_vaddr) +
+					seg->addr;
+
+				/* check the dynamic section contains at least
+				 * one item, and that the last item is a NULL
+				 * entry */
+				if (phdr->p_memsz == 0 ||
+				    phdr->p_memsz % sizeof(Elf32_Dyn) != 0)
+					goto dynamic_error;
+
+				tmp = phdr->p_memsz / sizeof(Elf32_Dyn);
+				if (((Elf32_Dyn *)
+				     params->dynamic_addr)[tmp - 1].d_tag != 0)
+					goto dynamic_error;
+				break;
+			}
+		}
+		break;
+	}
+
+	/* now elide adjacent segments in the load map on MMU linux
+	 * - on uClinux the holes between may actually be filled with system
+	 *   stuff or stuff from other processes
+	 */
+#ifdef CONFIG_MMU
+	nloads = loadmap->nsegs;
+	mseg = loadmap->segs;
+	seg = mseg + 1;
+	for (loop = 1; loop < nloads; loop++) {
+		/* see if we have a candidate for merging */
+		if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) {
+			load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz);
+			if (load_addr == (seg->addr & PAGE_MASK)) {
+				mseg->p_memsz +=
+					load_addr -
+					(mseg->addr + mseg->p_memsz);
+				mseg->p_memsz += seg->addr & ~PAGE_MASK;
+				mseg->p_memsz += seg->p_memsz;
+				loadmap->nsegs--;
+				continue;
+			}
+		}
+
+		mseg++;
+		if (mseg != seg)
+			*mseg = *seg;
+	}
+#endif
+
+	kdebug("Mapped Object [%s]:", what);
+	kdebug("- elfhdr   : %lx", params->elfhdr_addr);
+	kdebug("- entry    : %lx", params->entry_addr);
+	kdebug("- PHDR[]   : %lx", params->ph_addr);
+	kdebug("- DYNAMIC[]: %lx", params->dynamic_addr);
+	seg = loadmap->segs;
+	for (loop = 0; loop < loadmap->nsegs; loop++, seg++)
+		kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]",
+		       loop,
+		       seg->addr, seg->addr + seg->p_memsz - 1,
+		       seg->p_vaddr, seg->p_memsz);
+
+	return 0;
+
+dynamic_error:
+	printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n",
+	       what, file_inode(file)->i_ino);
+	return -ELIBBAD;
+}
+
+/*****************************************************************************/
+/*
+ * map a file with constant displacement under uClinux
+ */
+#ifndef CONFIG_MMU
+static int elf_fdpic_map_file_constdisp_on_uclinux(
+	struct elf_fdpic_params *params,
+	struct file *file,
+	struct mm_struct *mm)
+{
+	struct elf32_fdpic_loadseg *seg;
+	struct elf32_phdr *phdr;
+	unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags;
+	int loop, ret;
+
+	load_addr = params->load_addr;
+	seg = params->loadmap->segs;
+
+	/* determine the bounds of the contiguous overall allocation we must
+	 * make */
+	phdr = params->phdrs;
+	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
+		if (params->phdrs[loop].p_type != PT_LOAD)
+			continue;
+
+		if (base > phdr->p_vaddr)
+			base = phdr->p_vaddr;
+		if (top < phdr->p_vaddr + phdr->p_memsz)
+			top = phdr->p_vaddr + phdr->p_memsz;
+	}
+
+	/* allocate one big anon block for everything */
+	mflags = MAP_PRIVATE;
+	if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
+		mflags |= MAP_EXECUTABLE;
+
+	maddr = vm_mmap(NULL, load_addr, top - base,
+			PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0);
+	if (IS_ERR_VALUE(maddr))
+		return (int) maddr;
+
+	if (load_addr != 0)
+		load_addr += PAGE_ALIGN(top - base);
+
+	/* and then load the file segments into it */
+	phdr = params->phdrs;
+	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
+		if (params->phdrs[loop].p_type != PT_LOAD)
+			continue;
+
+		seg->addr = maddr + (phdr->p_vaddr - base);
+		seg->p_vaddr = phdr->p_vaddr;
+		seg->p_memsz = phdr->p_memsz;
+
+		ret = read_code(file, seg->addr, phdr->p_offset,
+				       phdr->p_filesz);
+		if (ret < 0)
+			return ret;
+
+		/* map the ELF header address if in this segment */
+		if (phdr->p_offset == 0)
+			params->elfhdr_addr = seg->addr;
+
+		/* clear any space allocated but not loaded */
+		if (phdr->p_filesz < phdr->p_memsz) {
+			if (clear_user((void *) (seg->addr + phdr->p_filesz),
+				       phdr->p_memsz - phdr->p_filesz))
+				return -EFAULT;
+		}
+
+		if (mm) {
+			if (phdr->p_flags & PF_X) {
+				if (!mm->start_code) {
+					mm->start_code = seg->addr;
+					mm->end_code = seg->addr +
+						phdr->p_memsz;
+				}
+			} else if (!mm->start_data) {
+				mm->start_data = seg->addr;
+				mm->end_data = seg->addr + phdr->p_memsz;
+			}
+		}
+
+		seg++;
+	}
+
+	return 0;
+}
+#endif
+
+/*****************************************************************************/
+/*
+ * map a binary by direct mmap() of the individual PT_LOAD segments
+ */
+static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
+					     struct file *file,
+					     struct mm_struct *mm)
+{
+	struct elf32_fdpic_loadseg *seg;
+	struct elf32_phdr *phdr;
+	unsigned long load_addr, delta_vaddr;
+	int loop, dvset;
+
+	load_addr = params->load_addr;
+	delta_vaddr = 0;
+	dvset = 0;
+
+	seg = params->loadmap->segs;
+
+	/* deal with each load segment separately */
+	phdr = params->phdrs;
+	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
+		unsigned long maddr, disp, excess, excess1;
+		int prot = 0, flags;
+
+		if (phdr->p_type != PT_LOAD)
+			continue;
+
+		kdebug("[LOAD] va=%lx of=%lx fs=%lx ms=%lx",
+		       (unsigned long) phdr->p_vaddr,
+		       (unsigned long) phdr->p_offset,
+		       (unsigned long) phdr->p_filesz,
+		       (unsigned long) phdr->p_memsz);
+
+		/* determine the mapping parameters */
+		if (phdr->p_flags & PF_R) prot |= PROT_READ;
+		if (phdr->p_flags & PF_W) prot |= PROT_WRITE;
+		if (phdr->p_flags & PF_X) prot |= PROT_EXEC;
+
+		flags = MAP_PRIVATE | MAP_DENYWRITE;
+		if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
+			flags |= MAP_EXECUTABLE;
+
+		maddr = 0;
+
+		switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) {
+		case ELF_FDPIC_FLAG_INDEPENDENT:
+			/* PT_LOADs are independently locatable */
+			break;
+
+		case ELF_FDPIC_FLAG_HONOURVADDR:
+			/* the specified virtual address must be honoured */
+			maddr = phdr->p_vaddr;
+			flags |= MAP_FIXED;
+			break;
+
+		case ELF_FDPIC_FLAG_CONSTDISP:
+			/* constant displacement
+			 * - can be mapped anywhere, but must be mapped as a
+			 *   unit
+			 */
+			if (!dvset) {
+				maddr = load_addr;
+				delta_vaddr = phdr->p_vaddr;
+				dvset = 1;
+			} else {
+				maddr = load_addr + phdr->p_vaddr - delta_vaddr;
+				flags |= MAP_FIXED;
+			}
+			break;
+
+		case ELF_FDPIC_FLAG_CONTIGUOUS:
+			/* contiguity handled later */
+			break;
+
+		default:
+			BUG();
+		}
+
+		maddr &= PAGE_MASK;
+
+		/* create the mapping */
+		disp = phdr->p_vaddr & ~PAGE_MASK;
+		maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
+				phdr->p_offset - disp);
+
+		kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
+		       loop, phdr->p_memsz + disp, prot, flags,
+		       phdr->p_offset - disp, maddr);
+
+		if (IS_ERR_VALUE(maddr))
+			return (int) maddr;
+
+		if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) ==
+		    ELF_FDPIC_FLAG_CONTIGUOUS)
+			load_addr += PAGE_ALIGN(phdr->p_memsz + disp);
+
+		seg->addr = maddr + disp;
+		seg->p_vaddr = phdr->p_vaddr;
+		seg->p_memsz = phdr->p_memsz;
+
+		/* map the ELF header address if in this segment */
+		if (phdr->p_offset == 0)
+			params->elfhdr_addr = seg->addr;
+
+		/* clear the bit between beginning of mapping and beginning of
+		 * PT_LOAD */
+		if (prot & PROT_WRITE && disp > 0) {
+			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
+			if (clear_user((void __user *) maddr, disp))
+				return -EFAULT;
+			maddr += disp;
+		}
+
+		/* clear any space allocated but not loaded
+		 * - on uClinux we can just clear the lot
+		 * - on MMU linux we'll get a SIGBUS beyond the last page
+		 *   extant in the file
+		 */
+		excess = phdr->p_memsz - phdr->p_filesz;
+		excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK);
+
+#ifdef CONFIG_MMU
+		if (excess > excess1) {
+			unsigned long xaddr = maddr + phdr->p_filesz + excess1;
+			unsigned long xmaddr;
+
+			flags |= MAP_FIXED | MAP_ANONYMOUS;
+			xmaddr = vm_mmap(NULL, xaddr, excess - excess1,
+					 prot, flags, 0);
+
+			kdebug("mmap[%d] <anon>"
+			       " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx",
+			       loop, xaddr, excess - excess1, prot, flags,
+			       xmaddr);
+
+			if (xmaddr != xaddr)
+				return -ENOMEM;
+		}
+
+		if (prot & PROT_WRITE && excess1 > 0) {
+			kdebug("clear[%d] ad=%lx sz=%lx",
+			       loop, maddr + phdr->p_filesz, excess1);
+			if (clear_user((void __user *) maddr + phdr->p_filesz,
+				       excess1))
+				return -EFAULT;
+		}
+
+#else
+		if (excess > 0) {
+			kdebug("clear[%d] ad=%lx sz=%lx",
+			       loop, maddr + phdr->p_filesz, excess);
+			if (clear_user((void *) maddr + phdr->p_filesz, excess))
+				return -EFAULT;
+		}
+#endif
+
+		if (mm) {
+			if (phdr->p_flags & PF_X) {
+				if (!mm->start_code) {
+					mm->start_code = maddr;
+					mm->end_code = maddr + phdr->p_memsz;
+				}
+			} else if (!mm->start_data) {
+				mm->start_data = maddr;
+				mm->end_data = maddr + phdr->p_memsz;
+			}
+		}
+
+		seg++;
+	}
+
+	return 0;
+}
+
+/*****************************************************************************/
+/*
+ * ELF-FDPIC core dumper
+ *
+ * Modelled on fs/exec.c:aout_core_dump()
+ * Jeremy Fitzhardinge <jeremy@sw.oz.au>
+ *
+ * Modelled on fs/binfmt_elf.c core dumper
+ */
+#ifdef CONFIG_ELF_CORE
+
+/*
+ * Decide whether a segment is worth dumping; default is yes to be
+ * sure (missing info is worse than too much; etc).
+ * Personally I'd include everything, and use the coredump limit...
+ *
+ * I think we should skip something. But I am not sure how. H.J.
+ */
+static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
+{
+	int dump_ok;
+
+	/* Do not dump I/O mapped devices or special mappings */
+	if (vma->vm_flags & VM_IO) {
+		kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
+		return 0;
+	}
+
+	/* If we may not read the contents, don't allow us to dump
+	 * them either. "dump_write()" can't handle it anyway.
+	 */
+	if (!(vma->vm_flags & VM_READ)) {
+		kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags);
+		return 0;
+	}
+
+	/* By default, dump shared memory if mapped from an anonymous file. */
+	if (vma->vm_flags & VM_SHARED) {
+		if (file_inode(vma->vm_file)->i_nlink == 0) {
+			dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags);
+			kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
+			       vma->vm_flags, dump_ok ? "yes" : "no");
+			return dump_ok;
+		}
+
+		dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags);
+		kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
+		       vma->vm_flags, dump_ok ? "yes" : "no");
+		return dump_ok;
+	}
+
+#ifdef CONFIG_MMU
+	/* By default, if it hasn't been written to, don't write it out */
+	if (!vma->anon_vma) {
+		dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags);
+		kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start,
+		       vma->vm_flags, dump_ok ? "yes" : "no");
+		return dump_ok;
+	}
+#endif
+
+	dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags);
+	kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags,
+	       dump_ok ? "yes" : "no");
+	return dump_ok;
+}
+
+/* An ELF note in memory */
+struct memelfnote
+{
+	const char *name;
+	int type;
+	unsigned int datasz;
+	void *data;
+};
+
+static int notesize(struct memelfnote *en)
+{
+	int sz;
+
+	sz = sizeof(struct elf_note);
+	sz += roundup(strlen(en->name) + 1, 4);
+	sz += roundup(en->datasz, 4);
+
+	return sz;
+}
+
+/* #define DEBUG */
+
+static int writenote(struct memelfnote *men, struct coredump_params *cprm)
+{
+	struct elf_note en;
+	en.n_namesz = strlen(men->name) + 1;
+	en.n_descsz = men->datasz;
+	en.n_type = men->type;
+
+	return dump_emit(cprm, &en, sizeof(en)) &&
+		dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
+		dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
+}
+
+static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
+{
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELF_DATA;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+
+	elf->e_type = ET_CORE;
+	elf->e_machine = ELF_ARCH;
+	elf->e_version = EV_CURRENT;
+	elf->e_entry = 0;
+	elf->e_phoff = sizeof(struct elfhdr);
+	elf->e_shoff = 0;
+	elf->e_flags = ELF_FDPIC_CORE_EFLAGS;
+	elf->e_ehsize = sizeof(struct elfhdr);
+	elf->e_phentsize = sizeof(struct elf_phdr);
+	elf->e_phnum = segs;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+	return;
+}
+
+static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
+{
+	phdr->p_type = PT_NOTE;
+	phdr->p_offset = offset;
+	phdr->p_vaddr = 0;
+	phdr->p_paddr = 0;
+	phdr->p_filesz = sz;
+	phdr->p_memsz = 0;
+	phdr->p_flags = 0;
+	phdr->p_align = 0;
+	return;
+}
+
+static inline void fill_note(struct memelfnote *note, const char *name, int type,
+		unsigned int sz, void *data)
+{
+	note->name = name;
+	note->type = type;
+	note->datasz = sz;
+	note->data = data;
+	return;
+}
+
+/*
+ * fill up all the fields in prstatus from the given task struct, except
+ * registers which need to be filled up separately.
+ */
+static void fill_prstatus(struct elf_prstatus *prstatus,
+			  struct task_struct *p, long signr)
+{
+	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
+	prstatus->pr_sigpend = p->pending.signal.sig[0];
+	prstatus->pr_sighold = p->blocked.sig[0];
+	rcu_read_lock();
+	prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
+	prstatus->pr_pid = task_pid_vnr(p);
+	prstatus->pr_pgrp = task_pgrp_vnr(p);
+	prstatus->pr_sid = task_session_vnr(p);
+	if (thread_group_leader(p)) {
+		struct task_cputime cputime;
+
+		/*
+		 * This is the record for the group leader.  It shows the
+		 * group-wide total, not its individual thread total.
+		 */
+		thread_group_cputime(p, &cputime);
+		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
+	} else {
+		cputime_t utime, stime;
+
+		task_cputime(p, &utime, &stime);
+		cputime_to_timeval(utime, &prstatus->pr_utime);
+		cputime_to_timeval(stime, &prstatus->pr_stime);
+	}
+	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
+	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
+
+	prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
+	prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
+}
+
+static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
+		       struct mm_struct *mm)
+{
+	const struct cred *cred;
+	unsigned int i, len;
+
+	/* first copy the parameters from user space */
+	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
+
+	len = mm->arg_end - mm->arg_start;
+	if (len >= ELF_PRARGSZ)
+		len = ELF_PRARGSZ - 1;
+	if (copy_from_user(&psinfo->pr_psargs,
+		           (const char __user *) mm->arg_start, len))
+		return -EFAULT;
+	for (i = 0; i < len; i++)
+		if (psinfo->pr_psargs[i] == 0)
+			psinfo->pr_psargs[i] = ' ';
+	psinfo->pr_psargs[len] = 0;
+
+	rcu_read_lock();
+	psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
+	psinfo->pr_pid = task_pid_vnr(p);
+	psinfo->pr_pgrp = task_pgrp_vnr(p);
+	psinfo->pr_sid = task_session_vnr(p);
+
+	i = p->state ? ffz(~p->state) + 1 : 0;
+	psinfo->pr_state = i;
+	psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
+	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
+	psinfo->pr_nice = task_nice(p);
+	psinfo->pr_flag = p->flags;
+	rcu_read_lock();
+	cred = __task_cred(p);
+	SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
+	rcu_read_unlock();
+	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+
+	return 0;
+}
+
+/* Here is the structure in which status of each thread is captured. */
+struct elf_thread_status
+{
+	struct list_head list;
+	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
+	elf_fpregset_t fpu;		/* NT_PRFPREG */
+	struct task_struct *thread;
+#ifdef ELF_CORE_COPY_XFPREGS
+	elf_fpxregset_t xfpu;		/* ELF_CORE_XFPREG_TYPE */
+#endif
+	struct memelfnote notes[3];
+	int num_notes;
+};
+
+/*
+ * In order to add the specific thread information for the elf file format,
+ * we need to keep a linked list of every thread's pr_status and then create
+ * a single section for them in the final core file.
+ */
+static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
+{
+	struct task_struct *p = t->thread;
+	int sz = 0;
+
+	t->num_notes = 0;
+
+	fill_prstatus(&t->prstatus, p, signr);
+	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
+
+	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+		  &t->prstatus);
+	t->num_notes++;
+	sz += notesize(&t->notes[0]);
+
+	t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu);
+	if (t->prstatus.pr_fpvalid) {
+		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
+			  &t->fpu);
+		t->num_notes++;
+		sz += notesize(&t->notes[1]);
+	}
+
+#ifdef ELF_CORE_COPY_XFPREGS
+	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
+		fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE,
+			  sizeof(t->xfpu), &t->xfpu);
+		t->num_notes++;
+		sz += notesize(&t->notes[2]);
+	}
+#endif
+	return sz;
+}
+
+static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+			     elf_addr_t e_shoff, int segs)
+{
+	elf->e_shoff = e_shoff;
+	elf->e_shentsize = sizeof(*shdr4extnum);
+	elf->e_shnum = 1;
+	elf->e_shstrndx = SHN_UNDEF;
+
+	memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+
+	shdr4extnum->sh_type = SHT_NULL;
+	shdr4extnum->sh_size = elf->e_shnum;
+	shdr4extnum->sh_link = elf->e_shstrndx;
+	shdr4extnum->sh_info = segs;
+}
+
+/*
+ * dump the segments for an MMU process
+ */
+static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
+{
+	struct vm_area_struct *vma;
+
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+		unsigned long addr;
+
+		if (!maydump(vma, cprm->mm_flags))
+			continue;
+
+#ifdef CONFIG_MMU
+		for (addr = vma->vm_start; addr < vma->vm_end;
+							addr += PAGE_SIZE) {
+			bool res;
+			struct page *page = get_dump_page(addr);
+			if (page) {
+				void *kaddr = kmap(page);
+				res = dump_emit(cprm, kaddr, PAGE_SIZE);
+				kunmap(page);
+				page_cache_release(page);
+			} else {
+				res = dump_skip(cprm, PAGE_SIZE);
+			}
+			if (!res)
+				return false;
+		}
+#else
+		if (!dump_emit(cprm, (void *) vma->vm_start,
+				vma->vm_end - vma->vm_start))
+			return false;
+#endif
+	}
+	return true;
+}
+
+static size_t elf_core_vma_data_size(unsigned long mm_flags)
+{
+	struct vm_area_struct *vma;
+	size_t size = 0;
+
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
+		if (maydump(vma, mm_flags))
+			size += vma->vm_end - vma->vm_start;
+	return size;
+}
+
+/*
+ * Actual dumper
+ *
+ * This is a two-pass process; first we find the offsets of the bits,
+ * and then they are actually written out.  If we run out of core limit
+ * we just truncate.
+ */
+static int elf_fdpic_core_dump(struct coredump_params *cprm)
+{
+#define	NUM_NOTES	6
+	int has_dumped = 0;
+	mm_segment_t fs;
+	int segs;
+	int i;
+	struct vm_area_struct *vma;
+	struct elfhdr *elf = NULL;
+	loff_t offset = 0, dataoff;
+	int numnote;
+	struct memelfnote *notes = NULL;
+	struct elf_prstatus *prstatus = NULL;	/* NT_PRSTATUS */
+	struct elf_prpsinfo *psinfo = NULL;	/* NT_PRPSINFO */
+ 	LIST_HEAD(thread_list);
+ 	struct list_head *t;
+	elf_fpregset_t *fpu = NULL;
+#ifdef ELF_CORE_COPY_XFPREGS
+	elf_fpxregset_t *xfpu = NULL;
+#endif
+	int thread_status_size = 0;
+	elf_addr_t *auxv;
+	struct elf_phdr *phdr4note = NULL;
+	struct elf_shdr *shdr4extnum = NULL;
+	Elf_Half e_phnum;
+	elf_addr_t e_shoff;
+	struct core_thread *ct;
+	struct elf_thread_status *tmp;
+
+	/*
+	 * We no longer stop all VM operations.
+	 *
+	 * This is because those proceses that could possibly change map_count
+	 * or the mmap / vma pages are now blocked in do_exit on current
+	 * finishing this core dump.
+	 *
+	 * Only ptrace can touch these memory addresses, but it doesn't change
+	 * the map_count or the pages allocated. So no possibility of crashing
+	 * exists while dumping the mm->vm_next areas to the core file.
+	 */
+
+	/* alloc memory for large data structures: too large to be on stack */
+	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
+	if (!elf)
+		goto cleanup;
+	prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL);
+	if (!prstatus)
+		goto cleanup;
+	psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
+	if (!psinfo)
+		goto cleanup;
+	notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL);
+	if (!notes)
+		goto cleanup;
+	fpu = kmalloc(sizeof(*fpu), GFP_KERNEL);
+	if (!fpu)
+		goto cleanup;
+#ifdef ELF_CORE_COPY_XFPREGS
+	xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL);
+	if (!xfpu)
+		goto cleanup;
+#endif
+
+	for (ct = current->mm->core_state->dumper.next;
+					ct; ct = ct->next) {
+		tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+		if (!tmp)
+			goto cleanup;
+
+		tmp->thread = ct->task;
+		list_add(&tmp->list, &thread_list);
+	}
+
+	list_for_each(t, &thread_list) {
+		struct elf_thread_status *tmp;
+		int sz;
+
+		tmp = list_entry(t, struct elf_thread_status, list);
+		sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
+		thread_status_size += sz;
+	}
+
+	/* now collect the dump for the current */
+	fill_prstatus(prstatus, current, cprm->siginfo->si_signo);
+	elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
+
+	segs = current->mm->map_count;
+	segs += elf_core_extra_phdrs();
+
+	/* for notes section */
+	segs++;
+
+	/* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+	 * this, kernel supports extended numbering. Have a look at
+	 * include/linux/elf.h for further information. */
+	e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
+
+	/* Set up header */
+	fill_elf_fdpic_header(elf, e_phnum);
+
+	has_dumped = 1;
+	/*
+	 * Set up the notes in similar form to SVR4 core dumps made
+	 * with info from their /proc.
+	 */
+
+	fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
+	fill_psinfo(psinfo, current->group_leader, current->mm);
+	fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+
+	numnote = 2;
+
+	auxv = (elf_addr_t *) current->mm->saved_auxv;
+
+	i = 0;
+	do
+		i += 2;
+	while (auxv[i - 2] != AT_NULL);
+	fill_note(&notes[numnote++], "CORE", NT_AUXV,
+		  i * sizeof(elf_addr_t), auxv);
+
+  	/* Try to dump the FPU. */
+	if ((prstatus->pr_fpvalid =
+	     elf_core_copy_task_fpregs(current, cprm->regs, fpu)))
+		fill_note(notes + numnote++,
+			  "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+	if (elf_core_copy_task_xfpregs(current, xfpu))
+		fill_note(notes + numnote++,
+			  "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
+#endif
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	offset += sizeof(*elf);				/* Elf header */
+	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
+
+	/* Write notes phdr entry */
+	{
+		int sz = 0;
+
+		for (i = 0; i < numnote; i++)
+			sz += notesize(notes + i);
+
+		sz += thread_status_size;
+
+		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+		if (!phdr4note)
+			goto end_coredump;
+
+		fill_elf_note_phdr(phdr4note, sz, offset);
+		offset += sz;
+	}
+
+	/* Page-align dumped data */
+	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
+
+	offset += elf_core_vma_data_size(cprm->mm_flags);
+	offset += elf_core_extra_data_size();
+	e_shoff = offset;
+
+	if (e_phnum == PN_XNUM) {
+		shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+		if (!shdr4extnum)
+			goto end_coredump;
+		fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+	}
+
+	offset = dataoff;
+
+	if (!dump_emit(cprm, elf, sizeof(*elf)))
+		goto end_coredump;
+
+	if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
+		goto end_coredump;
+
+	/* write program headers for segments dump */
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+		struct elf_phdr phdr;
+		size_t sz;
+
+		sz = vma->vm_end - vma->vm_start;
+
+		phdr.p_type = PT_LOAD;
+		phdr.p_offset = offset;
+		phdr.p_vaddr = vma->vm_start;
+		phdr.p_paddr = 0;
+		phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
+		phdr.p_memsz = sz;
+		offset += phdr.p_filesz;
+		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
+		if (vma->vm_flags & VM_WRITE)
+			phdr.p_flags |= PF_W;
+		if (vma->vm_flags & VM_EXEC)
+			phdr.p_flags |= PF_X;
+		phdr.p_align = ELF_EXEC_PAGESIZE;
+
+		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
+			goto end_coredump;
+	}
+
+	if (!elf_core_write_extra_phdrs(cprm, offset))
+		goto end_coredump;
+
+ 	/* write out the notes section */
+	for (i = 0; i < numnote; i++)
+		if (!writenote(notes + i, cprm))
+			goto end_coredump;
+
+	/* write out the thread status notes section */
+	list_for_each(t, &thread_list) {
+		struct elf_thread_status *tmp =
+				list_entry(t, struct elf_thread_status, list);
+
+		for (i = 0; i < tmp->num_notes; i++)
+			if (!writenote(&tmp->notes[i], cprm))
+				goto end_coredump;
+	}
+
+	if (!dump_skip(cprm, dataoff - cprm->written))
+		goto end_coredump;
+
+	if (!elf_fdpic_dump_segments(cprm))
+		goto end_coredump;
+
+	if (!elf_core_write_extra_data(cprm))
+		goto end_coredump;
+
+	if (e_phnum == PN_XNUM) {
+		if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
+			goto end_coredump;
+	}
+
+	if (cprm->file->f_pos != offset) {
+		/* Sanity check */
+		printk(KERN_WARNING
+		       "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n",
+		       cprm->file->f_pos, offset);
+	}
+
+end_coredump:
+	set_fs(fs);
+
+cleanup:
+	while (!list_empty(&thread_list)) {
+		struct list_head *tmp = thread_list.next;
+		list_del(tmp);
+		kfree(list_entry(tmp, struct elf_thread_status, list));
+	}
+	kfree(phdr4note);
+	kfree(elf);
+	kfree(prstatus);
+	kfree(psinfo);
+	kfree(notes);
+	kfree(fpu);
+	kfree(shdr4extnum);
+#ifdef ELF_CORE_COPY_XFPREGS
+	kfree(xfpu);
+#endif
+	return has_dumped;
+#undef NUM_NOTES
+}
+
+#endif		/* CONFIG_ELF_CORE */
diff --git a/kernel/fs/binfmt_em86.c b/kernel/fs/binfmt_em86.c
new file mode 100644
index 000000000..490538536
--- /dev/null
+++ b/kernel/fs/binfmt_em86.c
@@ -0,0 +1,117 @@
+/*
+ *  linux/fs/binfmt_em86.c
+ *
+ *  Based on linux/fs/binfmt_script.c
+ *  Copyright (C) 1996  Martin von Löwis
+ *  original #!-checking implemented by tytso.
+ *
+ *  em86 changes Copyright (C) 1997  Jim Paradis
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/binfmts.h>
+#include <linux/elf.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/errno.h>
+
+
+#define EM86_INTERP	"/usr/bin/em86"
+#define EM86_I_NAME	"em86"
+
+static int load_em86(struct linux_binprm *bprm)
+{
+	char *interp, *i_name, *i_arg;
+	struct file * file;
+	int retval;
+	struct elfhdr	elf_ex;
+
+	/* Make sure this is a Linux/Intel ELF executable... */
+	elf_ex = *((struct elfhdr *)bprm->buf);
+
+	if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+		return  -ENOEXEC;
+
+	/* First of all, some simple consistency checks */
+	if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) ||
+		(!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) ||
+		!bprm->file->f_op->mmap) {
+			return -ENOEXEC;
+	}
+
+	/* Need to be able to load the file after exec */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
+	allow_write_access(bprm->file);
+	fput(bprm->file);
+	bprm->file = NULL;
+
+	/* Unlike in the script case, we don't have to do any hairy
+	 * parsing to find our interpreter... it's hardcoded!
+	 */
+	interp = EM86_INTERP;
+	i_name = EM86_I_NAME;
+	i_arg = NULL;		/* We reserve the right to add an arg later */
+
+	/*
+	 * Splice in (1) the interpreter's name for argv[0]
+	 *           (2) (optional) argument to interpreter
+	 *           (3) filename of emulated file (replace argv[0])
+	 *
+	 * This is done in reverse order, because of how the
+	 * user environment and arguments are stored.
+	 */
+	remove_arg_zero(bprm);
+	retval = copy_strings_kernel(1, &bprm->filename, bprm);
+	if (retval < 0) return retval; 
+	bprm->argc++;
+	if (i_arg) {
+		retval = copy_strings_kernel(1, &i_arg, bprm);
+		if (retval < 0) return retval; 
+		bprm->argc++;
+	}
+	retval = copy_strings_kernel(1, &i_name, bprm);
+	if (retval < 0)	return retval;
+	bprm->argc++;
+
+	/*
+	 * OK, now restart the process with the interpreter's inode.
+	 * Note that we use open_exec() as the name is now in kernel
+	 * space, and we don't need to copy it.
+	 */
+	file = open_exec(interp);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	bprm->file = file;
+
+	retval = prepare_binprm(bprm);
+	if (retval < 0)
+		return retval;
+
+	return search_binary_handler(bprm);
+}
+
+static struct linux_binfmt em86_format = {
+	.module		= THIS_MODULE,
+	.load_binary	= load_em86,
+};
+
+static int __init init_em86_binfmt(void)
+{
+	register_binfmt(&em86_format);
+	return 0;
+}
+
+static void __exit exit_em86_binfmt(void)
+{
+	unregister_binfmt(&em86_format);
+}
+
+core_initcall(init_em86_binfmt);
+module_exit(exit_em86_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/binfmt_flat.c b/kernel/fs/binfmt_flat.c
new file mode 100644
index 000000000..f723cd3a4
--- /dev/null
+++ b/kernel/fs/binfmt_flat.c
@@ -0,0 +1,953 @@
+/****************************************************************************/
+/*
+ *  linux/fs/binfmt_flat.c
+ *
+ *	Copyright (C) 2000-2003 David McCullough <davidm@snapgear.com>
+ *	Copyright (C) 2002 Greg Ungerer <gerg@snapgear.com>
+ *	Copyright (C) 2002 SnapGear, by Paul Dale <pauli@snapgear.com>
+ *	Copyright (C) 2000, 2001 Lineo, by David McCullough <davidm@lineo.com>
+ *  based heavily on:
+ *
+ *  linux/fs/binfmt_aout.c:
+ *      Copyright (C) 1991, 1992, 1996  Linus Torvalds
+ *  linux/fs/binfmt_flat.c for 2.0 kernel
+ *	    Copyright (C) 1998  Kenneth Albanowski <kjahds@kjahds.com>
+ *	JAN/99 -- coded full program relocation (gerg@snapgear.com)
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/slab.h>
+#include <linux/binfmts.h>
+#include <linux/personality.h>
+#include <linux/init.h>
+#include <linux/flat.h>
+#include <linux/syscalls.h>
+
+#include <asm/byteorder.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <asm/cacheflush.h>
+#include <asm/page.h>
+
+/****************************************************************************/
+
+#if 0
+#define DEBUG 1
+#endif
+
+#ifdef DEBUG
+#define	DBG_FLT(a...)	printk(a)
+#else
+#define	DBG_FLT(a...)
+#endif
+
+/*
+ * User data (data section and bss) needs to be aligned.
+ * We pick 0x20 here because it is the max value elf2flt has always
+ * used in producing FLAT files, and because it seems to be large
+ * enough to make all the gcc alignment related tests happy.
+ */
+#define FLAT_DATA_ALIGN	(0x20)
+
+/*
+ * User data (stack) also needs to be aligned.
+ * Here we can be a bit looser than the data sections since this
+ * needs to only meet arch ABI requirements.
+ */
+#define FLAT_STACK_ALIGN	max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN)
+
+#define RELOC_FAILED 0xff00ff01		/* Relocation incorrect somewhere */
+#define UNLOADED_LIB 0x7ff000ff		/* Placeholder for unused library */
+
+struct lib_info {
+	struct {
+		unsigned long start_code;		/* Start of text segment */
+		unsigned long start_data;		/* Start of data segment */
+		unsigned long start_brk;		/* End of data segment */
+		unsigned long text_len;			/* Length of text segment */
+		unsigned long entry;			/* Start address for this module */
+		unsigned long build_date;		/* When this one was compiled */
+		short loaded;				/* Has this library been loaded? */
+	} lib_list[MAX_SHARED_LIBS];
+};
+
+#ifdef CONFIG_BINFMT_SHARED_FLAT
+static int load_flat_shared_library(int id, struct lib_info *p);
+#endif
+
+static int load_flat_binary(struct linux_binprm *);
+static int flat_core_dump(struct coredump_params *cprm);
+
+static struct linux_binfmt flat_format = {
+	.module		= THIS_MODULE,
+	.load_binary	= load_flat_binary,
+	.core_dump	= flat_core_dump,
+	.min_coredump	= PAGE_SIZE
+};
+
+/****************************************************************************/
+/*
+ * Routine writes a core dump image in the current directory.
+ * Currently only a stub-function.
+ */
+
+static int flat_core_dump(struct coredump_params *cprm)
+{
+	printk("Process %s:%d received signr %d and should have core dumped\n",
+			current->comm, current->pid, (int) cprm->siginfo->si_signo);
+	return(1);
+}
+
+/****************************************************************************/
+/*
+ * create_flat_tables() parses the env- and arg-strings in new user
+ * memory and creates the pointer tables from them, and puts their
+ * addresses on the "stack", returning the new stack pointer value.
+ */
+
+static unsigned long create_flat_tables(
+	unsigned long pp,
+	struct linux_binprm * bprm)
+{
+	unsigned long *argv,*envp;
+	unsigned long * sp;
+	char * p = (char*)pp;
+	int argc = bprm->argc;
+	int envc = bprm->envc;
+	char uninitialized_var(dummy);
+
+	sp = (unsigned long *)p;
+	sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
+	sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
+	argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
+	envp = argv + (argc + 1);
+
+	if (flat_argvp_envp_on_stack()) {
+		put_user((unsigned long) envp, sp + 2);
+		put_user((unsigned long) argv, sp + 1);
+	}
+
+	put_user(argc, sp);
+	current->mm->arg_start = (unsigned long) p;
+	while (argc-->0) {
+		put_user((unsigned long) p, argv++);
+		do {
+			get_user(dummy, p); p++;
+		} while (dummy);
+	}
+	put_user((unsigned long) NULL, argv);
+	current->mm->arg_end = current->mm->env_start = (unsigned long) p;
+	while (envc-->0) {
+		put_user((unsigned long)p, envp); envp++;
+		do {
+			get_user(dummy, p); p++;
+		} while (dummy);
+	}
+	put_user((unsigned long) NULL, envp);
+	current->mm->env_end = (unsigned long) p;
+	return (unsigned long)sp;
+}
+
+/****************************************************************************/
+
+#ifdef CONFIG_BINFMT_ZFLAT
+
+#include <linux/zlib.h>
+
+#define LBUFSIZE	4000
+
+/* gzip flag byte */
+#define ASCII_FLAG   0x01 /* bit 0 set: file probably ASCII text */
+#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
+#define EXTRA_FIELD  0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME    0x08 /* bit 3 set: original file name present */
+#define COMMENT      0x10 /* bit 4 set: file comment present */
+#define ENCRYPTED    0x20 /* bit 5 set: file is encrypted */
+#define RESERVED     0xC0 /* bit 6,7:   reserved */
+
+static int decompress_exec(
+	struct linux_binprm *bprm,
+	unsigned long offset,
+	char *dst,
+	long len,
+	int fd)
+{
+	unsigned char *buf;
+	z_stream strm;
+	loff_t fpos;
+	int ret, retval;
+
+	DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len);
+
+	memset(&strm, 0, sizeof(strm));
+	strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+	if (strm.workspace == NULL) {
+		DBG_FLT("binfmt_flat: no memory for decompress workspace\n");
+		return -ENOMEM;
+	}
+	buf = kmalloc(LBUFSIZE, GFP_KERNEL);
+	if (buf == NULL) {
+		DBG_FLT("binfmt_flat: no memory for read buffer\n");
+		retval = -ENOMEM;
+		goto out_free;
+	}
+
+	/* Read in first chunk of data and parse gzip header. */
+	fpos = offset;
+	ret = kernel_read(bprm->file, offset, buf, LBUFSIZE);
+
+	strm.next_in = buf;
+	strm.avail_in = ret;
+	strm.total_in = 0;
+	fpos += ret;
+
+	retval = -ENOEXEC;
+
+	/* Check minimum size -- gzip header */
+	if (ret < 10) {
+		DBG_FLT("binfmt_flat: file too small?\n");
+		goto out_free_buf;
+	}
+
+	/* Check gzip magic number */
+	if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) {
+		DBG_FLT("binfmt_flat: unknown compression magic?\n");
+		goto out_free_buf;
+	}
+
+	/* Check gzip method */
+	if (buf[2] != 8) {
+		DBG_FLT("binfmt_flat: unknown compression method?\n");
+		goto out_free_buf;
+	}
+	/* Check gzip flags */
+	if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) ||
+	    (buf[3] & RESERVED)) {
+		DBG_FLT("binfmt_flat: unknown flags?\n");
+		goto out_free_buf;
+	}
+
+	ret = 10;
+	if (buf[3] & EXTRA_FIELD) {
+		ret += 2 + buf[10] + (buf[11] << 8);
+		if (unlikely(LBUFSIZE <= ret)) {
+			DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n");
+			goto out_free_buf;
+		}
+	}
+	if (buf[3] & ORIG_NAME) {
+		while (ret < LBUFSIZE && buf[ret++] != 0)
+			;
+		if (unlikely(LBUFSIZE == ret)) {
+			DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n");
+			goto out_free_buf;
+		}
+	}
+	if (buf[3] & COMMENT) {
+		while (ret < LBUFSIZE && buf[ret++] != 0)
+			;
+		if (unlikely(LBUFSIZE == ret)) {
+			DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n");
+			goto out_free_buf;
+		}
+	}
+
+	strm.next_in += ret;
+	strm.avail_in -= ret;
+
+	strm.next_out = dst;
+	strm.avail_out = len;
+	strm.total_out = 0;
+
+	if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) {
+		DBG_FLT("binfmt_flat: zlib init failed?\n");
+		goto out_free_buf;
+	}
+
+	while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) {
+		ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE);
+		if (ret <= 0)
+			break;
+		len -= ret;
+
+		strm.next_in = buf;
+		strm.avail_in = ret;
+		strm.total_in = 0;
+		fpos += ret;
+	}
+
+	if (ret < 0) {
+		DBG_FLT("binfmt_flat: decompression failed (%d), %s\n",
+			ret, strm.msg);
+		goto out_zlib;
+	}
+
+	retval = 0;
+out_zlib:
+	zlib_inflateEnd(&strm);
+out_free_buf:
+	kfree(buf);
+out_free:
+	kfree(strm.workspace);
+	return retval;
+}
+
+#endif /* CONFIG_BINFMT_ZFLAT */
+
+/****************************************************************************/
+
+static unsigned long
+calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
+{
+	unsigned long addr;
+	int id;
+	unsigned long start_brk;
+	unsigned long start_data;
+	unsigned long text_len;
+	unsigned long start_code;
+
+#ifdef CONFIG_BINFMT_SHARED_FLAT
+	if (r == 0)
+		id = curid;	/* Relocs of 0 are always self referring */
+	else {
+		id = (r >> 24) & 0xff;	/* Find ID for this reloc */
+		r &= 0x00ffffff;	/* Trim ID off here */
+	}
+	if (id >= MAX_SHARED_LIBS) {
+		printk("BINFMT_FLAT: reference 0x%x to shared library %d",
+				(unsigned) r, id);
+		goto failed;
+	}
+	if (curid != id) {
+		if (internalp) {
+			printk("BINFMT_FLAT: reloc address 0x%x not in same module "
+					"(%d != %d)", (unsigned) r, curid, id);
+			goto failed;
+		} else if ( ! p->lib_list[id].loaded &&
+				IS_ERR_VALUE(load_flat_shared_library(id, p))) {
+			printk("BINFMT_FLAT: failed to load library %d", id);
+			goto failed;
+		}
+		/* Check versioning information (i.e. time stamps) */
+		if (p->lib_list[id].build_date && p->lib_list[curid].build_date &&
+				p->lib_list[curid].build_date < p->lib_list[id].build_date) {
+			printk("BINFMT_FLAT: library %d is younger than %d", id, curid);
+			goto failed;
+		}
+	}
+#else
+	id = 0;
+#endif
+
+	start_brk = p->lib_list[id].start_brk;
+	start_data = p->lib_list[id].start_data;
+	start_code = p->lib_list[id].start_code;
+	text_len = p->lib_list[id].text_len;
+
+	if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
+		printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
+		       (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
+		goto failed;
+	}
+
+	if (r < text_len)			/* In text segment */
+		addr = r + start_code;
+	else					/* In data segment */
+		addr = r - text_len + start_data;
+
+	/* Range checked already above so doing the range tests is redundant...*/
+	return(addr);
+
+failed:
+	printk(", killing %s!\n", current->comm);
+	send_sig(SIGSEGV, current, 0);
+
+	return RELOC_FAILED;
+}
+
+/****************************************************************************/
+
+static void old_reloc(unsigned long rl)
+{
+#ifdef DEBUG
+	char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
+#endif
+	flat_v2_reloc_t	r;
+	unsigned long *ptr;
+	
+	r.value = rl;
+#if defined(CONFIG_COLDFIRE)
+	ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset);
+#else
+	ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset);
+#endif
+
+#ifdef DEBUG
+	printk("Relocation of variable at DATASEG+%x "
+		"(address %p, currently %x) into segment %s\n",
+		r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]);
+#endif
+	
+	switch (r.reloc.type) {
+	case OLD_FLAT_RELOC_TYPE_TEXT:
+		*ptr += current->mm->start_code;
+		break;
+	case OLD_FLAT_RELOC_TYPE_DATA:
+		*ptr += current->mm->start_data;
+		break;
+	case OLD_FLAT_RELOC_TYPE_BSS:
+		*ptr += current->mm->end_data;
+		break;
+	default:
+		printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type);
+		break;
+	}
+
+#ifdef DEBUG
+	printk("Relocation became %x\n", (int)*ptr);
+#endif
+}		
+
+/****************************************************************************/
+
+static int load_flat_file(struct linux_binprm * bprm,
+		struct lib_info *libinfo, int id, unsigned long *extra_stack)
+{
+	struct flat_hdr * hdr;
+	unsigned long textpos = 0, datapos = 0, result;
+	unsigned long realdatastart = 0;
+	unsigned long text_len, data_len, bss_len, stack_len, flags;
+	unsigned long full_data;
+	unsigned long len, memp = 0;
+	unsigned long memp_size, extra, rlim;
+	unsigned long *reloc = 0, *rp;
+	struct inode *inode;
+	int i, rev, relocs = 0;
+	loff_t fpos;
+	unsigned long start_code, end_code;
+	int ret;
+
+	hdr = ((struct flat_hdr *) bprm->buf);		/* exec-header */
+	inode = file_inode(bprm->file);
+
+	text_len  = ntohl(hdr->data_start);
+	data_len  = ntohl(hdr->data_end) - ntohl(hdr->data_start);
+	bss_len   = ntohl(hdr->bss_end) - ntohl(hdr->data_end);
+	stack_len = ntohl(hdr->stack_size);
+	if (extra_stack) {
+		stack_len += *extra_stack;
+		*extra_stack = stack_len;
+	}
+	relocs    = ntohl(hdr->reloc_count);
+	flags     = ntohl(hdr->flags);
+	rev       = ntohl(hdr->rev);
+	full_data = data_len + relocs * sizeof(unsigned long);
+
+	if (strncmp(hdr->magic, "bFLT", 4)) {
+		/*
+		 * Previously, here was a printk to tell people
+		 *   "BINFMT_FLAT: bad header magic".
+		 * But for the kernel which also use ELF FD-PIC format, this
+		 * error message is confusing.
+		 * because a lot of people do not manage to produce good
+		 */
+		ret = -ENOEXEC;
+		goto err;
+	}
+
+	if (flags & FLAT_FLAG_KTRACE)
+		printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename);
+
+	if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) {
+		printk("BINFMT_FLAT: bad flat file version 0x%x (supported "
+			"0x%lx and 0x%lx)\n",
+			rev, FLAT_VERSION, OLD_FLAT_VERSION);
+		ret = -ENOEXEC;
+		goto err;
+	}
+	
+	/* Don't allow old format executables to use shared libraries */
+	if (rev == OLD_FLAT_VERSION && id != 0) {
+		printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n",
+				(int) FLAT_VERSION);
+		ret = -ENOEXEC;
+		goto err;
+	}
+
+	/*
+	 * fix up the flags for the older format,  there were all kinds
+	 * of endian hacks,  this only works for the simple cases
+	 */
+	if (rev == OLD_FLAT_VERSION && flat_old_ram_flag(flags))
+		flags = FLAT_FLAG_RAM;
+
+#ifndef CONFIG_BINFMT_ZFLAT
+	if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) {
+		printk("Support for ZFLAT executables is not enabled.\n");
+		ret = -ENOEXEC;
+		goto err;
+	}
+#endif
+
+	/*
+	 * Check initial limits. This avoids letting people circumvent
+	 * size limits imposed on them by creating programs with large
+	 * arrays in the data or bss.
+	 */
+	rlim = rlimit(RLIMIT_DATA);
+	if (rlim >= RLIM_INFINITY)
+		rlim = ~0;
+	if (data_len + bss_len > rlim) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* Flush all traces of the currently running executable */
+	if (id == 0) {
+		result = flush_old_exec(bprm);
+		if (result) {
+			ret = result;
+			goto err;
+		}
+
+		/* OK, This is the point of no return */
+		set_personality(PER_LINUX_32BIT);
+		setup_new_exec(bprm);
+	}
+
+	/*
+	 * calculate the extra space we need to map in
+	 */
+	extra = max_t(unsigned long, bss_len + stack_len,
+			relocs * sizeof(unsigned long));
+
+	/*
+	 * there are a couple of cases here,  the separate code/data
+	 * case,  and then the fully copied to RAM case which lumps
+	 * it all together.
+	 */
+	if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) {
+		/*
+		 * this should give us a ROM ptr,  but if it doesn't we don't
+		 * really care
+		 */
+		DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
+
+		textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
+				  MAP_PRIVATE|MAP_EXECUTABLE, 0);
+		if (!textpos || IS_ERR_VALUE(textpos)) {
+			if (!textpos)
+				textpos = (unsigned long) -ENOMEM;
+			printk("Unable to mmap process text, errno %d\n", (int)-textpos);
+			ret = textpos;
+			goto err;
+		}
+
+		len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+		len = PAGE_ALIGN(len);
+		realdatastart = vm_mmap(0, 0, len,
+			PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
+
+		if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
+			if (!realdatastart)
+				realdatastart = (unsigned long) -ENOMEM;
+			printk("Unable to allocate RAM for process data, errno %d\n",
+					(int)-realdatastart);
+			vm_munmap(textpos, text_len);
+			ret = realdatastart;
+			goto err;
+		}
+		datapos = ALIGN(realdatastart +
+				MAX_SHARED_LIBS * sizeof(unsigned long),
+				FLAT_DATA_ALIGN);
+
+		DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n",
+				(int)(data_len + bss_len + stack_len), (int)datapos);
+
+		fpos = ntohl(hdr->data_start);
+#ifdef CONFIG_BINFMT_ZFLAT
+		if (flags & FLAT_FLAG_GZDATA) {
+			result = decompress_exec(bprm, fpos, (char *) datapos, 
+						 full_data, 0);
+		} else
+#endif
+		{
+			result = read_code(bprm->file, datapos, fpos,
+					full_data);
+		}
+		if (IS_ERR_VALUE(result)) {
+			printk("Unable to read data+bss, errno %d\n", (int)-result);
+			vm_munmap(textpos, text_len);
+			vm_munmap(realdatastart, len);
+			ret = result;
+			goto err;
+		}
+
+		reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
+		memp = realdatastart;
+		memp_size = len;
+	} else {
+
+		len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+		len = PAGE_ALIGN(len);
+		textpos = vm_mmap(0, 0, len,
+			PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
+
+		if (!textpos || IS_ERR_VALUE(textpos)) {
+			if (!textpos)
+				textpos = (unsigned long) -ENOMEM;
+			printk("Unable to allocate RAM for process text/data, errno %d\n",
+					(int)-textpos);
+			ret = textpos;
+			goto err;
+		}
+
+		realdatastart = textpos + ntohl(hdr->data_start);
+		datapos = ALIGN(realdatastart +
+				MAX_SHARED_LIBS * sizeof(unsigned long),
+				FLAT_DATA_ALIGN);
+
+		reloc = (unsigned long *)
+			(datapos + (ntohl(hdr->reloc_start) - text_len));
+		memp = textpos;
+		memp_size = len;
+#ifdef CONFIG_BINFMT_ZFLAT
+		/*
+		 * load it all in and treat it like a RAM load from now on
+		 */
+		if (flags & FLAT_FLAG_GZIP) {
+			result = decompress_exec(bprm, sizeof (struct flat_hdr),
+					 (((char *) textpos) + sizeof (struct flat_hdr)),
+					 (text_len + full_data
+						  - sizeof (struct flat_hdr)),
+					 0);
+			memmove((void *) datapos, (void *) realdatastart,
+					full_data);
+		} else if (flags & FLAT_FLAG_GZDATA) {
+			result = read_code(bprm->file, textpos, 0, text_len);
+			if (!IS_ERR_VALUE(result))
+				result = decompress_exec(bprm, text_len, (char *) datapos,
+						 full_data, 0);
+		}
+		else
+#endif
+		{
+			result = read_code(bprm->file, textpos, 0, text_len);
+			if (!IS_ERR_VALUE(result))
+				result = read_code(bprm->file, datapos,
+						   ntohl(hdr->data_start),
+						   full_data);
+		}
+		if (IS_ERR_VALUE(result)) {
+			printk("Unable to read code+data+bss, errno %d\n",(int)-result);
+			vm_munmap(textpos, text_len + data_len + extra +
+				MAX_SHARED_LIBS * sizeof(unsigned long));
+			ret = result;
+			goto err;
+		}
+	}
+
+	if (flags & FLAT_FLAG_KTRACE)
+		printk("Mapping is %x, Entry point is %x, data_start is %x\n",
+			(int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
+
+	/* The main program needs a little extra setup in the task structure */
+	start_code = textpos + sizeof (struct flat_hdr);
+	end_code = textpos + text_len;
+	if (id == 0) {
+		current->mm->start_code = start_code;
+		current->mm->end_code = end_code;
+		current->mm->start_data = datapos;
+		current->mm->end_data = datapos + data_len;
+		/*
+		 * set up the brk stuff, uses any slack left in data/bss/stack
+		 * allocation.  We put the brk after the bss (between the bss
+		 * and stack) like other platforms.
+		 * Userspace code relies on the stack pointer starting out at
+		 * an address right at the end of a page.
+		 */
+		current->mm->start_brk = datapos + data_len + bss_len;
+		current->mm->brk = (current->mm->start_brk + 3) & ~3;
+		current->mm->context.end_brk = memp + memp_size - stack_len;
+	}
+
+	if (flags & FLAT_FLAG_KTRACE)
+		printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n",
+			id ? "Lib" : "Load", bprm->filename,
+			(int) start_code, (int) end_code,
+			(int) datapos,
+			(int) (datapos + data_len),
+			(int) (datapos + data_len),
+			(int) (((datapos + data_len + bss_len) + 3) & ~3));
+
+	text_len -= sizeof(struct flat_hdr); /* the real code len */
+
+	/* Store the current module values into the global library structure */
+	libinfo->lib_list[id].start_code = start_code;
+	libinfo->lib_list[id].start_data = datapos;
+	libinfo->lib_list[id].start_brk = datapos + data_len + bss_len;
+	libinfo->lib_list[id].text_len = text_len;
+	libinfo->lib_list[id].loaded = 1;
+	libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
+	libinfo->lib_list[id].build_date = ntohl(hdr->build_date);
+	
+	/*
+	 * We just load the allocations into some temporary memory to
+	 * help simplify all this mumbo jumbo
+	 *
+	 * We've got two different sections of relocation entries.
+	 * The first is the GOT which resides at the beginning of the data segment
+	 * and is terminated with a -1.  This one can be relocated in place.
+	 * The second is the extra relocation entries tacked after the image's
+	 * data segment. These require a little more processing as the entry is
+	 * really an offset into the image which contains an offset into the
+	 * image.
+	 */
+	if (flags & FLAT_FLAG_GOTPIC) {
+		for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) {
+			unsigned long addr;
+			if (*rp) {
+				addr = calc_reloc(*rp, libinfo, id, 0);
+				if (addr == RELOC_FAILED) {
+					ret = -ENOEXEC;
+					goto err;
+				}
+				*rp = addr;
+			}
+		}
+	}
+
+	/*
+	 * Now run through the relocation entries.
+	 * We've got to be careful here as C++ produces relocatable zero
+	 * entries in the constructor and destructor tables which are then
+	 * tested for being not zero (which will always occur unless we're
+	 * based from address zero).  This causes an endless loop as __start
+	 * is at zero.  The solution used is to not relocate zero addresses.
+	 * This has the negative side effect of not allowing a global data
+	 * reference to be statically initialised to _stext (I've moved
+	 * __start to address 4 so that is okay).
+	 */
+	if (rev > OLD_FLAT_VERSION) {
+		unsigned long persistent = 0;
+		for (i=0; i < relocs; i++) {
+			unsigned long addr, relval;
+
+			/* Get the address of the pointer to be
+			   relocated (of course, the address has to be
+			   relocated first).  */
+			relval = ntohl(reloc[i]);
+			if (flat_set_persistent (relval, &persistent))
+				continue;
+			addr = flat_get_relocate_addr(relval);
+			rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1);
+			if (rp == (unsigned long *)RELOC_FAILED) {
+				ret = -ENOEXEC;
+				goto err;
+			}
+
+			/* Get the pointer's value.  */
+			addr = flat_get_addr_from_rp(rp, relval, flags,
+							&persistent);
+			if (addr != 0) {
+				/*
+				 * Do the relocation.  PIC relocs in the data section are
+				 * already in target order
+				 */
+				if ((flags & FLAT_FLAG_GOTPIC) == 0)
+					addr = ntohl(addr);
+				addr = calc_reloc(addr, libinfo, id, 0);
+				if (addr == RELOC_FAILED) {
+					ret = -ENOEXEC;
+					goto err;
+				}
+
+				/* Write back the relocated pointer.  */
+				flat_put_addr_at_rp(rp, addr, relval);
+			}
+		}
+	} else {
+		for (i=0; i < relocs; i++)
+			old_reloc(ntohl(reloc[i]));
+	}
+	
+	flush_icache_range(start_code, end_code);
+
+	/* zero the BSS,  BRK and stack areas */
+	memset((void*)(datapos + data_len), 0, bss_len + 
+			(memp + memp_size - stack_len -		/* end brk */
+			libinfo->lib_list[id].start_brk) +	/* start brk */
+			stack_len);
+
+	return 0;
+err:
+	return ret;
+}
+
+
+/****************************************************************************/
+#ifdef CONFIG_BINFMT_SHARED_FLAT
+
+/*
+ * Load a shared library into memory.  The library gets its own data
+ * segment (including bss) but not argv/argc/environ.
+ */
+
+static int load_flat_shared_library(int id, struct lib_info *libs)
+{
+	struct linux_binprm bprm;
+	int res;
+	char buf[16];
+
+	memset(&bprm, 0, sizeof(bprm));
+
+	/* Create the file name */
+	sprintf(buf, "/lib/lib%d.so", id);
+
+	/* Open the file up */
+	bprm.filename = buf;
+	bprm.file = open_exec(bprm.filename);
+	res = PTR_ERR(bprm.file);
+	if (IS_ERR(bprm.file))
+		return res;
+
+	bprm.cred = prepare_exec_creds();
+	res = -ENOMEM;
+	if (!bprm.cred)
+		goto out;
+
+	/* We don't really care about recalculating credentials at this point
+	 * as we're past the point of no return and are dealing with shared
+	 * libraries.
+	 */
+	bprm.cred_prepared = 1;
+
+	res = prepare_binprm(&bprm);
+
+	if (!IS_ERR_VALUE(res))
+		res = load_flat_file(&bprm, libs, id, NULL);
+
+	abort_creds(bprm.cred);
+
+out:
+	allow_write_access(bprm.file);
+	fput(bprm.file);
+
+	return(res);
+}
+
+#endif /* CONFIG_BINFMT_SHARED_FLAT */
+/****************************************************************************/
+
+/*
+ * These are the functions used to load flat style executables and shared
+ * libraries.  There is no binary dependent code anywhere else.
+ */
+
+static int load_flat_binary(struct linux_binprm * bprm)
+{
+	struct lib_info libinfo;
+	struct pt_regs *regs = current_pt_regs();
+	unsigned long p = bprm->p;
+	unsigned long stack_len;
+	unsigned long start_addr;
+	unsigned long *sp;
+	int res;
+	int i, j;
+
+	memset(&libinfo, 0, sizeof(libinfo));
+	/*
+	 * We have to add the size of our arguments to our stack size
+	 * otherwise it's too easy for users to create stack overflows
+	 * by passing in a huge argument list.  And yes,  we have to be
+	 * pedantic and include space for the argv/envp array as it may have
+	 * a lot of entries.
+	 */
+#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *))
+	stack_len = TOP_OF_ARGS - bprm->p;             /* the strings */
+	stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
+	stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
+	stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
+	
+	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
+	if (IS_ERR_VALUE(res))
+		return res;
+	
+	/* Update data segment pointers for all libraries */
+	for (i=0; i<MAX_SHARED_LIBS; i++)
+		if (libinfo.lib_list[i].loaded)
+			for (j=0; j<MAX_SHARED_LIBS; j++)
+				(-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] =
+					(libinfo.lib_list[j].loaded)?
+						libinfo.lib_list[j].start_data:UNLOADED_LIB;
+
+	install_exec_creds(bprm);
+
+	set_binfmt(&flat_format);
+
+	p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4;
+	DBG_FLT("p=%x\n", (int)p);
+
+	/* copy the arg pages onto the stack, this could be more efficient :-) */
+	for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--)
+		* (char *) --p =
+			((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE];
+
+	sp = (unsigned long *) create_flat_tables(p, bprm);
+	
+	/* Fake some return addresses to ensure the call chain will
+	 * initialise library in order for us.  We are required to call
+	 * lib 1 first, then 2, ... and finally the main program (id 0).
+	 */
+	start_addr = libinfo.lib_list[0].entry;
+
+#ifdef CONFIG_BINFMT_SHARED_FLAT
+	for (i = MAX_SHARED_LIBS-1; i>0; i--) {
+		if (libinfo.lib_list[i].loaded) {
+			/* Push previos first to call address */
+			--sp;	put_user(start_addr, sp);
+			start_addr = libinfo.lib_list[i].entry;
+		}
+	}
+#endif
+	
+	/* Stash our initial stack pointer into the mm structure */
+	current->mm->start_stack = (unsigned long )sp;
+
+#ifdef FLAT_PLAT_INIT
+	FLAT_PLAT_INIT(regs);
+#endif
+	DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
+		(int)regs, (int)start_addr, (int)current->mm->start_stack);
+	
+	start_thread(regs, start_addr, current->mm->start_stack);
+
+	return 0;
+}
+
+/****************************************************************************/
+
+static int __init init_flat_binfmt(void)
+{
+	register_binfmt(&flat_format);
+	return 0;
+}
+
+/****************************************************************************/
+
+core_initcall(init_flat_binfmt);
+
+/****************************************************************************/
diff --git a/kernel/fs/binfmt_misc.c b/kernel/fs/binfmt_misc.c
new file mode 100644
index 000000000..78f005f37
--- /dev/null
+++ b/kernel/fs/binfmt_misc.c
@@ -0,0 +1,835 @@
+/*
+ * binfmt_misc.c
+ *
+ * Copyright (C) 1997 Richard Günther
+ *
+ * binfmt_misc detects binaries via a magic or filename extension and invokes
+ * a specified wrapper. See Documentation/binfmt_misc.txt for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/magic.h>
+#include <linux/binfmts.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/string_helpers.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/syscalls.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+
+#ifdef DEBUG
+# define USE_DEBUG 1
+#else
+# define USE_DEBUG 0
+#endif
+
+enum {
+	VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
+};
+
+static LIST_HEAD(entries);
+static int enabled = 1;
+
+enum {Enabled, Magic};
+#define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
+#define MISC_FMT_OPEN_BINARY (1 << 30)
+#define MISC_FMT_CREDENTIALS (1 << 29)
+
+typedef struct {
+	struct list_head list;
+	unsigned long flags;		/* type, status, etc. */
+	int offset;			/* offset of magic */
+	int size;			/* size of magic/mask */
+	char *magic;			/* magic or filename extension */
+	char *mask;			/* mask, NULL for exact match */
+	char *interpreter;		/* filename of interpreter */
+	char *name;
+	struct dentry *dentry;
+} Node;
+
+static DEFINE_RWLOCK(entries_lock);
+static struct file_system_type bm_fs_type;
+static struct vfsmount *bm_mnt;
+static int entry_count;
+
+/*
+ * Max length of the register string.  Determined by:
+ *  - 7 delimiters
+ *  - name:   ~50 bytes
+ *  - type:   1 byte
+ *  - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE)
+ *  - magic:  128 bytes (512 in escaped form)
+ *  - mask:   128 bytes (512 in escaped form)
+ *  - interp: ~50 bytes
+ *  - flags:  5 bytes
+ * Round that up a bit, and then back off to hold the internal data
+ * (like struct Node).
+ */
+#define MAX_REGISTER_LENGTH 1920
+
+/*
+ * Check if we support the binfmt
+ * if we do, return the node, else NULL
+ * locking is done in load_misc_binary
+ */
+static Node *check_file(struct linux_binprm *bprm)
+{
+	char *p = strrchr(bprm->interp, '.');
+	struct list_head *l;
+
+	/* Walk all the registered handlers. */
+	list_for_each(l, &entries) {
+		Node *e = list_entry(l, Node, list);
+		char *s;
+		int j;
+
+		/* Make sure this one is currently enabled. */
+		if (!test_bit(Enabled, &e->flags))
+			continue;
+
+		/* Do matching based on extension if applicable. */
+		if (!test_bit(Magic, &e->flags)) {
+			if (p && !strcmp(e->magic, p + 1))
+				return e;
+			continue;
+		}
+
+		/* Do matching based on magic & mask. */
+		s = bprm->buf + e->offset;
+		if (e->mask) {
+			for (j = 0; j < e->size; j++)
+				if ((*s++ ^ e->magic[j]) & e->mask[j])
+					break;
+		} else {
+			for (j = 0; j < e->size; j++)
+				if ((*s++ ^ e->magic[j]))
+					break;
+		}
+		if (j == e->size)
+			return e;
+	}
+	return NULL;
+}
+
+/*
+ * the loader itself
+ */
+static int load_misc_binary(struct linux_binprm *bprm)
+{
+	Node *fmt;
+	struct file *interp_file = NULL;
+	char iname[BINPRM_BUF_SIZE];
+	const char *iname_addr = iname;
+	int retval;
+	int fd_binary = -1;
+
+	retval = -ENOEXEC;
+	if (!enabled)
+		goto ret;
+
+	/* to keep locking time low, we copy the interpreter string */
+	read_lock(&entries_lock);
+	fmt = check_file(bprm);
+	if (fmt)
+		strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
+	read_unlock(&entries_lock);
+	if (!fmt)
+		goto ret;
+
+	/* Need to be able to load the file after exec */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
+	if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
+		retval = remove_arg_zero(bprm);
+		if (retval)
+			goto ret;
+	}
+
+	if (fmt->flags & MISC_FMT_OPEN_BINARY) {
+
+		/* if the binary should be opened on behalf of the
+		 * interpreter than keep it open and assign descriptor
+		 * to it
+		 */
+		fd_binary = get_unused_fd_flags(0);
+		if (fd_binary < 0) {
+			retval = fd_binary;
+			goto ret;
+		}
+		fd_install(fd_binary, bprm->file);
+
+		/* if the binary is not readable than enforce mm->dumpable=0
+		   regardless of the interpreter's permissions */
+		would_dump(bprm, bprm->file);
+
+		allow_write_access(bprm->file);
+		bprm->file = NULL;
+
+		/* mark the bprm that fd should be passed to interp */
+		bprm->interp_flags |= BINPRM_FLAGS_EXECFD;
+		bprm->interp_data = fd_binary;
+
+	} else {
+		allow_write_access(bprm->file);
+		fput(bprm->file);
+		bprm->file = NULL;
+	}
+	/* make argv[1] be the path to the binary */
+	retval = copy_strings_kernel(1, &bprm->interp, bprm);
+	if (retval < 0)
+		goto error;
+	bprm->argc++;
+
+	/* add the interp as argv[0] */
+	retval = copy_strings_kernel(1, &iname_addr, bprm);
+	if (retval < 0)
+		goto error;
+	bprm->argc++;
+
+	/* Update interp in case binfmt_script needs it. */
+	retval = bprm_change_interp(iname, bprm);
+	if (retval < 0)
+		goto error;
+
+	interp_file = open_exec(iname);
+	retval = PTR_ERR(interp_file);
+	if (IS_ERR(interp_file))
+		goto error;
+
+	bprm->file = interp_file;
+	if (fmt->flags & MISC_FMT_CREDENTIALS) {
+		/*
+		 * No need to call prepare_binprm(), it's already been
+		 * done.  bprm->buf is stale, update from interp_file.
+		 */
+		memset(bprm->buf, 0, BINPRM_BUF_SIZE);
+		retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
+	} else
+		retval = prepare_binprm(bprm);
+
+	if (retval < 0)
+		goto error;
+
+	retval = search_binary_handler(bprm);
+	if (retval < 0)
+		goto error;
+
+ret:
+	return retval;
+error:
+	if (fd_binary > 0)
+		sys_close(fd_binary);
+	bprm->interp_flags = 0;
+	bprm->interp_data = 0;
+	goto ret;
+}
+
+/* Command parsers */
+
+/*
+ * parses and copies one argument enclosed in del from *sp to *dp,
+ * recognising the \x special.
+ * returns pointer to the copied argument or NULL in case of an
+ * error (and sets err) or null argument length.
+ */
+static char *scanarg(char *s, char del)
+{
+	char c;
+
+	while ((c = *s++) != del) {
+		if (c == '\\' && *s == 'x') {
+			s++;
+			if (!isxdigit(*s++))
+				return NULL;
+			if (!isxdigit(*s++))
+				return NULL;
+		}
+	}
+	s[-1] ='\0';
+	return s;
+}
+
+static char *check_special_flags(char *sfs, Node *e)
+{
+	char *p = sfs;
+	int cont = 1;
+
+	/* special flags */
+	while (cont) {
+		switch (*p) {
+		case 'P':
+			pr_debug("register: flag: P (preserve argv0)\n");
+			p++;
+			e->flags |= MISC_FMT_PRESERVE_ARGV0;
+			break;
+		case 'O':
+			pr_debug("register: flag: O (open binary)\n");
+			p++;
+			e->flags |= MISC_FMT_OPEN_BINARY;
+			break;
+		case 'C':
+			pr_debug("register: flag: C (preserve creds)\n");
+			p++;
+			/* this flags also implies the
+			   open-binary flag */
+			e->flags |= (MISC_FMT_CREDENTIALS |
+					MISC_FMT_OPEN_BINARY);
+			break;
+		default:
+			cont = 0;
+		}
+	}
+
+	return p;
+}
+
+/*
+ * This registers a new binary format, it recognises the syntax
+ * ':name:type:offset:magic:mask:interpreter:flags'
+ * where the ':' is the IFS, that can be chosen with the first char
+ */
+static Node *create_entry(const char __user *buffer, size_t count)
+{
+	Node *e;
+	int memsize, err;
+	char *buf, *p;
+	char del;
+
+	pr_debug("register: received %zu bytes\n", count);
+
+	/* some sanity checks */
+	err = -EINVAL;
+	if ((count < 11) || (count > MAX_REGISTER_LENGTH))
+		goto out;
+
+	err = -ENOMEM;
+	memsize = sizeof(Node) + count + 8;
+	e = kmalloc(memsize, GFP_KERNEL);
+	if (!e)
+		goto out;
+
+	p = buf = (char *)e + sizeof(Node);
+
+	memset(e, 0, sizeof(Node));
+	if (copy_from_user(buf, buffer, count))
+		goto efault;
+
+	del = *p++;	/* delimeter */
+
+	pr_debug("register: delim: %#x {%c}\n", del, del);
+
+	/* Pad the buffer with the delim to simplify parsing below. */
+	memset(buf + count, del, 8);
+
+	/* Parse the 'name' field. */
+	e->name = p;
+	p = strchr(p, del);
+	if (!p)
+		goto einval;
+	*p++ = '\0';
+	if (!e->name[0] ||
+	    !strcmp(e->name, ".") ||
+	    !strcmp(e->name, "..") ||
+	    strchr(e->name, '/'))
+		goto einval;
+
+	pr_debug("register: name: {%s}\n", e->name);
+
+	/* Parse the 'type' field. */
+	switch (*p++) {
+	case 'E':
+		pr_debug("register: type: E (extension)\n");
+		e->flags = 1 << Enabled;
+		break;
+	case 'M':
+		pr_debug("register: type: M (magic)\n");
+		e->flags = (1 << Enabled) | (1 << Magic);
+		break;
+	default:
+		goto einval;
+	}
+	if (*p++ != del)
+		goto einval;
+
+	if (test_bit(Magic, &e->flags)) {
+		/* Handle the 'M' (magic) format. */
+		char *s;
+
+		/* Parse the 'offset' field. */
+		s = strchr(p, del);
+		if (!s)
+			goto einval;
+		*s++ = '\0';
+		e->offset = simple_strtoul(p, &p, 10);
+		if (*p++)
+			goto einval;
+		pr_debug("register: offset: %#x\n", e->offset);
+
+		/* Parse the 'magic' field. */
+		e->magic = p;
+		p = scanarg(p, del);
+		if (!p)
+			goto einval;
+		if (!e->magic[0])
+			goto einval;
+		if (USE_DEBUG)
+			print_hex_dump_bytes(
+				KBUILD_MODNAME ": register: magic[raw]: ",
+				DUMP_PREFIX_NONE, e->magic, p - e->magic);
+
+		/* Parse the 'mask' field. */
+		e->mask = p;
+		p = scanarg(p, del);
+		if (!p)
+			goto einval;
+		if (!e->mask[0]) {
+			e->mask = NULL;
+			pr_debug("register:  mask[raw]: none\n");
+		} else if (USE_DEBUG)
+			print_hex_dump_bytes(
+				KBUILD_MODNAME ": register:  mask[raw]: ",
+				DUMP_PREFIX_NONE, e->mask, p - e->mask);
+
+		/*
+		 * Decode the magic & mask fields.
+		 * Note: while we might have accepted embedded NUL bytes from
+		 * above, the unescape helpers here will stop at the first one
+		 * it encounters.
+		 */
+		e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX);
+		if (e->mask &&
+		    string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
+			goto einval;
+		if (e->size + e->offset > BINPRM_BUF_SIZE)
+			goto einval;
+		pr_debug("register: magic/mask length: %i\n", e->size);
+		if (USE_DEBUG) {
+			print_hex_dump_bytes(
+				KBUILD_MODNAME ": register: magic[decoded]: ",
+				DUMP_PREFIX_NONE, e->magic, e->size);
+
+			if (e->mask) {
+				int i;
+				char *masked = kmalloc(e->size, GFP_KERNEL);
+
+				print_hex_dump_bytes(
+					KBUILD_MODNAME ": register:  mask[decoded]: ",
+					DUMP_PREFIX_NONE, e->mask, e->size);
+
+				if (masked) {
+					for (i = 0; i < e->size; ++i)
+						masked[i] = e->magic[i] & e->mask[i];
+					print_hex_dump_bytes(
+						KBUILD_MODNAME ": register:  magic[masked]: ",
+						DUMP_PREFIX_NONE, masked, e->size);
+
+					kfree(masked);
+				}
+			}
+		}
+	} else {
+		/* Handle the 'E' (extension) format. */
+
+		/* Skip the 'offset' field. */
+		p = strchr(p, del);
+		if (!p)
+			goto einval;
+		*p++ = '\0';
+
+		/* Parse the 'magic' field. */
+		e->magic = p;
+		p = strchr(p, del);
+		if (!p)
+			goto einval;
+		*p++ = '\0';
+		if (!e->magic[0] || strchr(e->magic, '/'))
+			goto einval;
+		pr_debug("register: extension: {%s}\n", e->magic);
+
+		/* Skip the 'mask' field. */
+		p = strchr(p, del);
+		if (!p)
+			goto einval;
+		*p++ = '\0';
+	}
+
+	/* Parse the 'interpreter' field. */
+	e->interpreter = p;
+	p = strchr(p, del);
+	if (!p)
+		goto einval;
+	*p++ = '\0';
+	if (!e->interpreter[0])
+		goto einval;
+	pr_debug("register: interpreter: {%s}\n", e->interpreter);
+
+	/* Parse the 'flags' field. */
+	p = check_special_flags(p, e);
+	if (*p == '\n')
+		p++;
+	if (p != buf + count)
+		goto einval;
+
+	return e;
+
+out:
+	return ERR_PTR(err);
+
+efault:
+	kfree(e);
+	return ERR_PTR(-EFAULT);
+einval:
+	kfree(e);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Set status of entry/binfmt_misc:
+ * '1' enables, '0' disables and '-1' clears entry/binfmt_misc
+ */
+static int parse_command(const char __user *buffer, size_t count)
+{
+	char s[4];
+
+	if (count > 3)
+		return -EINVAL;
+	if (copy_from_user(s, buffer, count))
+		return -EFAULT;
+	if (!count)
+		return 0;
+	if (s[count - 1] == '\n')
+		count--;
+	if (count == 1 && s[0] == '0')
+		return 1;
+	if (count == 1 && s[0] == '1')
+		return 2;
+	if (count == 2 && s[0] == '-' && s[1] == '1')
+		return 3;
+	return -EINVAL;
+}
+
+/* generic stuff */
+
+static void entry_status(Node *e, char *page)
+{
+	char *dp = page;
+	const char *status = "disabled";
+
+	if (test_bit(Enabled, &e->flags))
+		status = "enabled";
+
+	if (!VERBOSE_STATUS) {
+		sprintf(page, "%s\n", status);
+		return;
+	}
+
+	dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter);
+
+	/* print the special flags */
+	dp += sprintf(dp, "flags: ");
+	if (e->flags & MISC_FMT_PRESERVE_ARGV0)
+		*dp++ = 'P';
+	if (e->flags & MISC_FMT_OPEN_BINARY)
+		*dp++ = 'O';
+	if (e->flags & MISC_FMT_CREDENTIALS)
+		*dp++ = 'C';
+	*dp++ = '\n';
+
+	if (!test_bit(Magic, &e->flags)) {
+		sprintf(dp, "extension .%s\n", e->magic);
+	} else {
+		dp += sprintf(dp, "offset %i\nmagic ", e->offset);
+		dp = bin2hex(dp, e->magic, e->size);
+		if (e->mask) {
+			dp += sprintf(dp, "\nmask ");
+			dp = bin2hex(dp, e->mask, e->size);
+		}
+		*dp++ = '\n';
+		*dp = '\0';
+	}
+}
+
+static struct inode *bm_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_mode = mode;
+		inode->i_atime = inode->i_mtime = inode->i_ctime =
+			current_fs_time(inode->i_sb);
+	}
+	return inode;
+}
+
+static void bm_evict_inode(struct inode *inode)
+{
+	clear_inode(inode);
+	kfree(inode->i_private);
+}
+
+static void kill_node(Node *e)
+{
+	struct dentry *dentry;
+
+	write_lock(&entries_lock);
+	dentry = e->dentry;
+	if (dentry) {
+		list_del_init(&e->list);
+		e->dentry = NULL;
+	}
+	write_unlock(&entries_lock);
+
+	if (dentry) {
+		drop_nlink(d_inode(dentry));
+		d_drop(dentry);
+		dput(dentry);
+		simple_release_fs(&bm_mnt, &entry_count);
+	}
+}
+
+/* /<entry> */
+
+static ssize_t
+bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
+{
+	Node *e = file_inode(file)->i_private;
+	ssize_t res;
+	char *page;
+
+	page = (char *) __get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	entry_status(e, page);
+
+	res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page));
+
+	free_page((unsigned long) page);
+	return res;
+}
+
+static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct dentry *root;
+	Node *e = file_inode(file)->i_private;
+	int res = parse_command(buffer, count);
+
+	switch (res) {
+	case 1:
+		/* Disable this handler. */
+		clear_bit(Enabled, &e->flags);
+		break;
+	case 2:
+		/* Enable this handler. */
+		set_bit(Enabled, &e->flags);
+		break;
+	case 3:
+		/* Delete this handler. */
+		root = dget(file->f_path.dentry->d_sb->s_root);
+		mutex_lock(&d_inode(root)->i_mutex);
+
+		kill_node(e);
+
+		mutex_unlock(&d_inode(root)->i_mutex);
+		dput(root);
+		break;
+	default:
+		return res;
+	}
+
+	return count;
+}
+
+static const struct file_operations bm_entry_operations = {
+	.read		= bm_entry_read,
+	.write		= bm_entry_write,
+	.llseek		= default_llseek,
+};
+
+/* /register */
+
+static ssize_t bm_register_write(struct file *file, const char __user *buffer,
+			       size_t count, loff_t *ppos)
+{
+	Node *e;
+	struct inode *inode;
+	struct dentry *root, *dentry;
+	struct super_block *sb = file->f_path.dentry->d_sb;
+	int err = 0;
+
+	e = create_entry(buffer, count);
+
+	if (IS_ERR(e))
+		return PTR_ERR(e);
+
+	root = dget(sb->s_root);
+	mutex_lock(&d_inode(root)->i_mutex);
+	dentry = lookup_one_len(e->name, root, strlen(e->name));
+	err = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out;
+
+	err = -EEXIST;
+	if (d_really_is_positive(dentry))
+		goto out2;
+
+	inode = bm_get_inode(sb, S_IFREG | 0644);
+
+	err = -ENOMEM;
+	if (!inode)
+		goto out2;
+
+	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
+	if (err) {
+		iput(inode);
+		inode = NULL;
+		goto out2;
+	}
+
+	e->dentry = dget(dentry);
+	inode->i_private = e;
+	inode->i_fop = &bm_entry_operations;
+
+	d_instantiate(dentry, inode);
+	write_lock(&entries_lock);
+	list_add(&e->list, &entries);
+	write_unlock(&entries_lock);
+
+	err = 0;
+out2:
+	dput(dentry);
+out:
+	mutex_unlock(&d_inode(root)->i_mutex);
+	dput(root);
+
+	if (err) {
+		kfree(e);
+		return -EINVAL;
+	}
+	return count;
+}
+
+static const struct file_operations bm_register_operations = {
+	.write		= bm_register_write,
+	.llseek		= noop_llseek,
+};
+
+/* /status */
+
+static ssize_t
+bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
+{
+	char *s = enabled ? "enabled\n" : "disabled\n";
+
+	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
+}
+
+static ssize_t bm_status_write(struct file *file, const char __user *buffer,
+		size_t count, loff_t *ppos)
+{
+	int res = parse_command(buffer, count);
+	struct dentry *root;
+
+	switch (res) {
+	case 1:
+		/* Disable all handlers. */
+		enabled = 0;
+		break;
+	case 2:
+		/* Enable all handlers. */
+		enabled = 1;
+		break;
+	case 3:
+		/* Delete all handlers. */
+		root = dget(file->f_path.dentry->d_sb->s_root);
+		mutex_lock(&d_inode(root)->i_mutex);
+
+		while (!list_empty(&entries))
+			kill_node(list_entry(entries.next, Node, list));
+
+		mutex_unlock(&d_inode(root)->i_mutex);
+		dput(root);
+		break;
+	default:
+		return res;
+	}
+
+	return count;
+}
+
+static const struct file_operations bm_status_operations = {
+	.read		= bm_status_read,
+	.write		= bm_status_write,
+	.llseek		= default_llseek,
+};
+
+/* Superblock handling */
+
+static const struct super_operations s_ops = {
+	.statfs		= simple_statfs,
+	.evict_inode	= bm_evict_inode,
+};
+
+static int bm_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int err;
+	static struct tree_descr bm_files[] = {
+		[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
+		[3] = {"register", &bm_register_operations, S_IWUSR},
+		/* last one */ {""}
+	};
+
+	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
+	if (!err)
+		sb->s_op = &s_ops;
+	return err;
+}
+
+static struct dentry *bm_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_single(fs_type, flags, data, bm_fill_super);
+}
+
+static struct linux_binfmt misc_format = {
+	.module = THIS_MODULE,
+	.load_binary = load_misc_binary,
+};
+
+static struct file_system_type bm_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "binfmt_misc",
+	.mount		= bm_mount,
+	.kill_sb	= kill_litter_super,
+};
+MODULE_ALIAS_FS("binfmt_misc");
+
+static int __init init_misc_binfmt(void)
+{
+	int err = register_filesystem(&bm_fs_type);
+	if (!err)
+		insert_binfmt(&misc_format);
+	return err;
+}
+
+static void __exit exit_misc_binfmt(void)
+{
+	unregister_binfmt(&misc_format);
+	unregister_filesystem(&bm_fs_type);
+}
+
+core_initcall(init_misc_binfmt);
+module_exit(exit_misc_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/binfmt_script.c b/kernel/fs/binfmt_script.c
new file mode 100644
index 000000000..afdf4e3ca
--- /dev/null
+++ b/kernel/fs/binfmt_script.c
@@ -0,0 +1,129 @@
+/*
+ *  linux/fs/binfmt_script.c
+ *
+ *  Copyright (C) 1996  Martin von Löwis
+ *  original #!-checking implemented by tytso.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/binfmts.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+
+static int load_script(struct linux_binprm *bprm)
+{
+	const char *i_arg, *i_name;
+	char *cp;
+	struct file *file;
+	char interp[BINPRM_BUF_SIZE];
+	int retval;
+
+	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
+		return -ENOEXEC;
+
+	/*
+	 * If the script filename will be inaccessible after exec, typically
+	 * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give
+	 * up now (on the assumption that the interpreter will want to load
+	 * this file).
+	 */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
+	/*
+	 * This section does the #! interpretation.
+	 * Sorta complicated, but hopefully it will work.  -TYT
+	 */
+
+	allow_write_access(bprm->file);
+	fput(bprm->file);
+	bprm->file = NULL;
+
+	bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
+	if ((cp = strchr(bprm->buf, '\n')) == NULL)
+		cp = bprm->buf+BINPRM_BUF_SIZE-1;
+	*cp = '\0';
+	while (cp > bprm->buf) {
+		cp--;
+		if ((*cp == ' ') || (*cp == '\t'))
+			*cp = '\0';
+		else
+			break;
+	}
+	for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++);
+	if (*cp == '\0') 
+		return -ENOEXEC; /* No interpreter name found */
+	i_name = cp;
+	i_arg = NULL;
+	for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++)
+		/* nothing */ ;
+	while ((*cp == ' ') || (*cp == '\t'))
+		*cp++ = '\0';
+	if (*cp)
+		i_arg = cp;
+	strcpy (interp, i_name);
+	/*
+	 * OK, we've parsed out the interpreter name and
+	 * (optional) argument.
+	 * Splice in (1) the interpreter's name for argv[0]
+	 *           (2) (optional) argument to interpreter
+	 *           (3) filename of shell script (replace argv[0])
+	 *
+	 * This is done in reverse order, because of how the
+	 * user environment and arguments are stored.
+	 */
+	retval = remove_arg_zero(bprm);
+	if (retval)
+		return retval;
+	retval = copy_strings_kernel(1, &bprm->interp, bprm);
+	if (retval < 0) return retval; 
+	bprm->argc++;
+	if (i_arg) {
+		retval = copy_strings_kernel(1, &i_arg, bprm);
+		if (retval < 0) return retval; 
+		bprm->argc++;
+	}
+	retval = copy_strings_kernel(1, &i_name, bprm);
+	if (retval) return retval; 
+	bprm->argc++;
+	retval = bprm_change_interp(interp, bprm);
+	if (retval < 0)
+		return retval;
+
+	/*
+	 * OK, now restart the process with the interpreter's dentry.
+	 */
+	file = open_exec(interp);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	bprm->file = file;
+	retval = prepare_binprm(bprm);
+	if (retval < 0)
+		return retval;
+	return search_binary_handler(bprm);
+}
+
+static struct linux_binfmt script_format = {
+	.module		= THIS_MODULE,
+	.load_binary	= load_script,
+};
+
+static int __init init_script_binfmt(void)
+{
+	register_binfmt(&script_format);
+	return 0;
+}
+
+static void __exit exit_script_binfmt(void)
+{
+	unregister_binfmt(&script_format);
+}
+
+core_initcall(init_script_binfmt);
+module_exit(exit_script_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/block_dev.c b/kernel/fs/block_dev.c
new file mode 100644
index 000000000..c7e4163ed
--- /dev/null
+++ b/kernel/fs/block_dev.c
@@ -0,0 +1,1795 @@
+/*
+ *  linux/fs/block_dev.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/major.h>
+#include <linux/device_cgroup.h>
+#include <linux/highmem.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/blkpg.h>
+#include <linux/magic.h>
+#include <linux/buffer_head.h>
+#include <linux/swap.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/mount.h>
+#include <linux/uio.h>
+#include <linux/namei.h>
+#include <linux/log2.h>
+#include <linux/cleancache.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+struct bdev_inode {
+	struct block_device bdev;
+	struct inode vfs_inode;
+};
+
+static const struct address_space_operations def_blk_aops;
+
+static inline struct bdev_inode *BDEV_I(struct inode *inode)
+{
+	return container_of(inode, struct bdev_inode, vfs_inode);
+}
+
+inline struct block_device *I_BDEV(struct inode *inode)
+{
+	return &BDEV_I(inode)->bdev;
+}
+EXPORT_SYMBOL(I_BDEV);
+
+static void bdev_write_inode(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	while (inode->i_state & I_DIRTY) {
+		spin_unlock(&inode->i_lock);
+		WARN_ON_ONCE(write_inode_now(inode, true));
+		spin_lock(&inode->i_lock);
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+/* Kill _all_ buffers and pagecache , dirty or not.. */
+void kill_bdev(struct block_device *bdev)
+{
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+	if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+		return;
+
+	invalidate_bh_lrus();
+	truncate_inode_pages(mapping, 0);
+}	
+EXPORT_SYMBOL(kill_bdev);
+
+/* Invalidate clean unused buffers and pagecache. */
+void invalidate_bdev(struct block_device *bdev)
+{
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+	if (mapping->nrpages == 0)
+		return;
+
+	invalidate_bh_lrus();
+	lru_add_drain_all();	/* make sure all lru add caches are flushed */
+	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_invalidate_inode(mapping);
+}
+EXPORT_SYMBOL(invalidate_bdev);
+
+int set_blocksize(struct block_device *bdev, int size)
+{
+	/* Size must be a power of two, and between 512 and PAGE_SIZE */
+	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
+		return -EINVAL;
+
+	/* Size cannot be smaller than the size supported by the device */
+	if (size < bdev_logical_block_size(bdev))
+		return -EINVAL;
+
+	/* Don't change the size if it is same as current */
+	if (bdev->bd_block_size != size) {
+		sync_blockdev(bdev);
+		bdev->bd_block_size = size;
+		bdev->bd_inode->i_blkbits = blksize_bits(size);
+		kill_bdev(bdev);
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(set_blocksize);
+
+int sb_set_blocksize(struct super_block *sb, int size)
+{
+	if (set_blocksize(sb->s_bdev, size))
+		return 0;
+	/* If we get here, we know size is power of two
+	 * and it's value is between 512 and PAGE_SIZE */
+	sb->s_blocksize = size;
+	sb->s_blocksize_bits = blksize_bits(size);
+	return sb->s_blocksize;
+}
+
+EXPORT_SYMBOL(sb_set_blocksize);
+
+int sb_min_blocksize(struct super_block *sb, int size)
+{
+	int minsize = bdev_logical_block_size(sb->s_bdev);
+	if (size < minsize)
+		size = minsize;
+	return sb_set_blocksize(sb, size);
+}
+
+EXPORT_SYMBOL(sb_min_blocksize);
+
+static int
+blkdev_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh, int create)
+{
+	bh->b_bdev = I_BDEV(inode);
+	bh->b_blocknr = iblock;
+	set_buffer_mapped(bh);
+	return 0;
+}
+
+static ssize_t
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+
+	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
+				    blkdev_get_block, NULL, NULL,
+				    DIO_SKIP_DIO_COUNT);
+}
+
+int __sync_blockdev(struct block_device *bdev, int wait)
+{
+	if (!bdev)
+		return 0;
+	if (!wait)
+		return filemap_flush(bdev->bd_inode->i_mapping);
+	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+}
+
+/*
+ * Write out and wait upon all the dirty data associated with a block
+ * device via its mapping.  Does not take the superblock lock.
+ */
+int sync_blockdev(struct block_device *bdev)
+{
+	return __sync_blockdev(bdev, 1);
+}
+EXPORT_SYMBOL(sync_blockdev);
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * device.   Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int fsync_bdev(struct block_device *bdev)
+{
+	struct super_block *sb = get_super(bdev);
+	if (sb) {
+		int res = sync_filesystem(sb);
+		drop_super(sb);
+		return res;
+	}
+	return sync_blockdev(bdev);
+}
+EXPORT_SYMBOL(fsync_bdev);
+
+/**
+ * freeze_bdev  --  lock a filesystem and force it into a consistent state
+ * @bdev:	blockdevice to lock
+ *
+ * If a superblock is found on this device, we take the s_umount semaphore
+ * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process can unfreeze the frozen filesystem actually when multiple
+ * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
+ * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * actually.
+ */
+struct super_block *freeze_bdev(struct block_device *bdev)
+{
+	struct super_block *sb;
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (++bdev->bd_fsfreeze_count > 1) {
+		/*
+		 * We don't even need to grab a reference - the first call
+		 * to freeze_bdev grab an active reference and only the last
+		 * thaw_bdev drops it.
+		 */
+		sb = get_super(bdev);
+		drop_super(sb);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return sb;
+	}
+
+	sb = get_active_super(bdev);
+	if (!sb)
+		goto out;
+	if (sb->s_op->freeze_super)
+		error = sb->s_op->freeze_super(sb);
+	else
+		error = freeze_super(sb);
+	if (error) {
+		deactivate_super(sb);
+		bdev->bd_fsfreeze_count--;
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return ERR_PTR(error);
+	}
+	deactivate_super(sb);
+ out:
+	sync_blockdev(bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	return sb;	/* thaw_bdev releases s->s_umount */
+}
+EXPORT_SYMBOL(freeze_bdev);
+
+/**
+ * thaw_bdev  -- unlock filesystem
+ * @bdev:	blockdevice to unlock
+ * @sb:		associated superblock
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ */
+int thaw_bdev(struct block_device *bdev, struct super_block *sb)
+{
+	int error = -EINVAL;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (!bdev->bd_fsfreeze_count)
+		goto out;
+
+	error = 0;
+	if (--bdev->bd_fsfreeze_count > 0)
+		goto out;
+
+	if (!sb)
+		goto out;
+
+	if (sb->s_op->thaw_super)
+		error = sb->s_op->thaw_super(sb);
+	else
+		error = thaw_super(sb);
+	if (error) {
+		bdev->bd_fsfreeze_count++;
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return error;
+	}
+out:
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(thaw_bdev);
+
+static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, blkdev_get_block, wbc);
+}
+
+static int blkdev_readpage(struct file * file, struct page * page)
+{
+	return block_read_full_page(page, blkdev_get_block);
+}
+
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
+static int blkdev_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	return block_write_begin(mapping, pos, len, flags, pagep,
+				 blkdev_get_block);
+}
+
+static int blkdev_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	int ret;
+	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return ret;
+}
+
+/*
+ * private llseek:
+ * for a block special file file_inode(file)->i_size is zero
+ * so we compute the size by hand (just as in block_read/write above)
+ */
+static loff_t block_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *bd_inode = file->f_mapping->host;
+	loff_t retval;
+
+	mutex_lock(&bd_inode->i_mutex);
+	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
+	mutex_unlock(&bd_inode->i_mutex);
+	return retval;
+}
+	
+int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+	struct inode *bd_inode = filp->f_mapping->host;
+	struct block_device *bdev = I_BDEV(bd_inode);
+	int error;
+	
+	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+	if (error)
+		return error;
+
+	/*
+	 * There is no need to serialise calls to blkdev_issue_flush with
+	 * i_mutex and doing so causes performance issues with concurrent
+	 * O_SYNC writers to a block device.
+	 */
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
+	if (error == -EOPNOTSUPP)
+		error = 0;
+
+	return error;
+}
+EXPORT_SYMBOL(blkdev_fsync);
+
+/**
+ * bdev_read_page() - Start reading a page from a block device
+ * @bdev: The device to read the page from
+ * @sector: The offset on the device to read the page to (need not be aligned)
+ * @page: The page to read
+ *
+ * On entry, the page should be locked.  It will be unlocked when the page
+ * has been read.  If the block driver implements rw_page synchronously,
+ * that will be true on exit from this function, but it need not be.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to read this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_read_page(struct block_device *bdev, sector_t sector,
+			struct page *page)
+{
+	const struct block_device_operations *ops = bdev->bd_disk->fops;
+	if (!ops->rw_page)
+		return -EOPNOTSUPP;
+	return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+}
+EXPORT_SYMBOL_GPL(bdev_read_page);
+
+/**
+ * bdev_write_page() - Start writing a page to a block device
+ * @bdev: The device to write the page to
+ * @sector: The offset on the device to write the page to (need not be aligned)
+ * @page: The page to write
+ * @wbc: The writeback_control for the write
+ *
+ * On entry, the page should be locked and not currently under writeback.
+ * On exit, if the write started successfully, the page will be unlocked and
+ * under writeback.  If the write failed already (eg the driver failed to
+ * queue the page to the device), the page will still be locked.  If the
+ * caller is a ->writepage implementation, it will need to unlock the page.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to write this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_write_page(struct block_device *bdev, sector_t sector,
+			struct page *page, struct writeback_control *wbc)
+{
+	int result;
+	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
+	const struct block_device_operations *ops = bdev->bd_disk->fops;
+	if (!ops->rw_page)
+		return -EOPNOTSUPP;
+	set_page_writeback(page);
+	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
+	if (result)
+		end_page_writeback(page);
+	else
+		unlock_page(page);
+	return result;
+}
+EXPORT_SYMBOL_GPL(bdev_write_page);
+
+/**
+ * bdev_direct_access() - Get the address for directly-accessibly memory
+ * @bdev: The device containing the memory
+ * @sector: The offset within the device
+ * @addr: Where to put the address of the memory
+ * @pfn: The Page Frame Number for the memory
+ * @size: The number of bytes requested
+ *
+ * If a block device is made up of directly addressable memory, this function
+ * will tell the caller the PFN and the address of the memory.  The address
+ * may be directly dereferenced within the kernel without the need to call
+ * ioremap(), kmap() or similar.  The PFN is suitable for inserting into
+ * page tables.
+ *
+ * Return: negative errno if an error occurs, otherwise the number of bytes
+ * accessible at this address.
+ */
+long bdev_direct_access(struct block_device *bdev, sector_t sector,
+			void **addr, unsigned long *pfn, long size)
+{
+	long avail;
+	const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+	if (size < 0)
+		return size;
+	if (!ops->direct_access)
+		return -EOPNOTSUPP;
+	if ((sector + DIV_ROUND_UP(size, 512)) >
+					part_nr_sects_read(bdev->bd_part))
+		return -ERANGE;
+	sector += get_start_sect(bdev);
+	if (sector % (PAGE_SIZE / 512))
+		return -EINVAL;
+	avail = ops->direct_access(bdev, sector, addr, pfn, size);
+	if (!avail)
+		return -ERANGE;
+	return min(avail, size);
+}
+EXPORT_SYMBOL_GPL(bdev_direct_access);
+
+/*
+ * pseudo-fs
+ */
+
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
+static struct kmem_cache * bdev_cachep __read_mostly;
+
+static struct inode *bdev_alloc_inode(struct super_block *sb)
+{
+	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void bdev_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct bdev_inode *bdi = BDEV_I(inode);
+
+	kmem_cache_free(bdev_cachep, bdi);
+}
+
+static void bdev_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, bdev_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct bdev_inode *ei = (struct bdev_inode *) foo;
+	struct block_device *bdev = &ei->bdev;
+
+	memset(bdev, 0, sizeof(*bdev));
+	mutex_init(&bdev->bd_mutex);
+	INIT_LIST_HEAD(&bdev->bd_inodes);
+	INIT_LIST_HEAD(&bdev->bd_list);
+#ifdef CONFIG_SYSFS
+	INIT_LIST_HEAD(&bdev->bd_holder_disks);
+#endif
+	inode_init_once(&ei->vfs_inode);
+	/* Initialize mutex for freeze. */
+	mutex_init(&bdev->bd_fsfreeze_mutex);
+}
+
+static inline void __bd_forget(struct inode *inode)
+{
+	list_del_init(&inode->i_devices);
+	inode->i_bdev = NULL;
+	inode->i_mapping = &inode->i_data;
+}
+
+static void bdev_evict_inode(struct inode *inode)
+{
+	struct block_device *bdev = &BDEV_I(inode)->bdev;
+	struct list_head *p;
+	truncate_inode_pages_final(&inode->i_data);
+	invalidate_inode_buffers(inode); /* is it needed here? */
+	clear_inode(inode);
+	spin_lock(&bdev_lock);
+	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
+		__bd_forget(list_entry(p, struct inode, i_devices));
+	}
+	list_del_init(&bdev->bd_list);
+	spin_unlock(&bdev_lock);
+}
+
+static const struct super_operations bdev_sops = {
+	.statfs = simple_statfs,
+	.alloc_inode = bdev_alloc_inode,
+	.destroy_inode = bdev_destroy_inode,
+	.drop_inode = generic_delete_inode,
+	.evict_inode = bdev_evict_inode,
+};
+
+static struct dentry *bd_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+}
+
+static struct file_system_type bd_type = {
+	.name		= "bdev",
+	.mount		= bd_mount,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct super_block *blockdev_superblock __read_mostly;
+
+void __init bdev_cache_init(void)
+{
+	int err;
+	static struct vfsmount *bd_mnt;
+
+	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
+			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+				SLAB_MEM_SPREAD|SLAB_PANIC),
+			init_once);
+	err = register_filesystem(&bd_type);
+	if (err)
+		panic("Cannot register bdev pseudo-fs");
+	bd_mnt = kern_mount(&bd_type);
+	if (IS_ERR(bd_mnt))
+		panic("Cannot create bdev pseudo-fs");
+	blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
+}
+
+/*
+ * Most likely _very_ bad one - but then it's hardly critical for small
+ * /dev and can be fixed when somebody will need really large one.
+ * Keep in mind that it will be fed through icache hash function too.
+ */
+static inline unsigned long hash(dev_t dev)
+{
+	return MAJOR(dev)+MINOR(dev);
+}
+
+static int bdev_test(struct inode *inode, void *data)
+{
+	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
+}
+
+static int bdev_set(struct inode *inode, void *data)
+{
+	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
+	return 0;
+}
+
+static LIST_HEAD(all_bdevs);
+
+struct block_device *bdget(dev_t dev)
+{
+	struct block_device *bdev;
+	struct inode *inode;
+
+	inode = iget5_locked(blockdev_superblock, hash(dev),
+			bdev_test, bdev_set, &dev);
+
+	if (!inode)
+		return NULL;
+
+	bdev = &BDEV_I(inode)->bdev;
+
+	if (inode->i_state & I_NEW) {
+		bdev->bd_contains = NULL;
+		bdev->bd_super = NULL;
+		bdev->bd_inode = inode;
+		bdev->bd_block_size = (1 << inode->i_blkbits);
+		bdev->bd_part_count = 0;
+		bdev->bd_invalidated = 0;
+		inode->i_mode = S_IFBLK;
+		inode->i_rdev = dev;
+		inode->i_bdev = bdev;
+		inode->i_data.a_ops = &def_blk_aops;
+		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
+		spin_lock(&bdev_lock);
+		list_add(&bdev->bd_list, &all_bdevs);
+		spin_unlock(&bdev_lock);
+		unlock_new_inode(inode);
+	}
+	return bdev;
+}
+
+EXPORT_SYMBOL(bdget);
+
+/**
+ * bdgrab -- Grab a reference to an already referenced block device
+ * @bdev:	Block device to grab a reference to.
+ */
+struct block_device *bdgrab(struct block_device *bdev)
+{
+	ihold(bdev->bd_inode);
+	return bdev;
+}
+EXPORT_SYMBOL(bdgrab);
+
+long nr_blockdev_pages(void)
+{
+	struct block_device *bdev;
+	long ret = 0;
+	spin_lock(&bdev_lock);
+	list_for_each_entry(bdev, &all_bdevs, bd_list) {
+		ret += bdev->bd_inode->i_mapping->nrpages;
+	}
+	spin_unlock(&bdev_lock);
+	return ret;
+}
+
+void bdput(struct block_device *bdev)
+{
+	iput(bdev->bd_inode);
+}
+
+EXPORT_SYMBOL(bdput);
+ 
+static struct block_device *bd_acquire(struct inode *inode)
+{
+	struct block_device *bdev;
+
+	spin_lock(&bdev_lock);
+	bdev = inode->i_bdev;
+	if (bdev) {
+		ihold(bdev->bd_inode);
+		spin_unlock(&bdev_lock);
+		return bdev;
+	}
+	spin_unlock(&bdev_lock);
+
+	bdev = bdget(inode->i_rdev);
+	if (bdev) {
+		spin_lock(&bdev_lock);
+		if (!inode->i_bdev) {
+			/*
+			 * We take an additional reference to bd_inode,
+			 * and it's released in clear_inode() of inode.
+			 * So, we can access it via ->i_mapping always
+			 * without igrab().
+			 */
+			ihold(bdev->bd_inode);
+			inode->i_bdev = bdev;
+			inode->i_mapping = bdev->bd_inode->i_mapping;
+			list_add(&inode->i_devices, &bdev->bd_inodes);
+		}
+		spin_unlock(&bdev_lock);
+	}
+	return bdev;
+}
+
+int sb_is_blkdev_sb(struct super_block *sb)
+{
+	return sb == blockdev_superblock;
+}
+
+/* Call when you free inode */
+
+void bd_forget(struct inode *inode)
+{
+	struct block_device *bdev = NULL;
+
+	spin_lock(&bdev_lock);
+	if (!sb_is_blkdev_sb(inode->i_sb))
+		bdev = inode->i_bdev;
+	__bd_forget(inode);
+	spin_unlock(&bdev_lock);
+
+	if (bdev)
+		iput(bdev->bd_inode);
+}
+
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whether @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+			 void *holder)
+{
+	if (bdev->bd_holder == holder)
+		return true;	 /* already a holder */
+	else if (bdev->bd_holder != NULL)
+		return false; 	 /* held by someone else */
+	else if (bdev->bd_contains == bdev)
+		return true;  	 /* is a whole device which isn't held */
+
+	else if (whole->bd_holder == bd_may_claim)
+		return true; 	 /* is a partition of a device that is being partitioned */
+	else if (whole->bd_holder != NULL)
+		return false;	 /* is a partition of a held device */
+	else
+		return true;	 /* is a partition of an un-held device */
+}
+
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev.  This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress.  This function doesn't actually claim.  On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+			       struct block_device *whole, void *holder)
+{
+retry:
+	/* if someone else claimed, fail */
+	if (!bd_may_claim(bdev, whole, holder))
+		return -EBUSY;
+
+	/* if claiming is already in progress, wait for it to finish */
+	if (whole->bd_claiming) {
+		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&bdev_lock);
+		schedule();
+		finish_wait(wq, &wait);
+		spin_lock(&bdev_lock);
+		goto retry;
+	}
+
+	/* yay, all mine */
+	return 0;
+}
+
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively.  Check @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress.  Each
+ * successful call to this function must be matched with a call to
+ * either bd_finish_claiming() or bd_abort_claiming() (which do not
+ * fail).
+ *
+ * This function is used to gain exclusive access to the block device
+ * without actually causing other exclusive open attempts to fail. It
+ * should be used when the open sequence itself requires exclusive
+ * access but may subsequently fail.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+					      void *holder)
+{
+	struct gendisk *disk;
+	struct block_device *whole;
+	int partno, err;
+
+	might_sleep();
+
+	/*
+	 * @bdev might not have been initialized properly yet, look up
+	 * and grab the outer block device the hard way.
+	 */
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	if (!disk)
+		return ERR_PTR(-ENXIO);
+
+	/*
+	 * Normally, @bdev should equal what's returned from bdget_disk()
+	 * if partno is 0; however, some drivers (floppy) use multiple
+	 * bdev's for the same physical device and @bdev may be one of the
+	 * aliases.  Keep @bdev if partno is 0.  This means claimer
+	 * tracking is broken for those devices but it has always been that
+	 * way.
+	 */
+	if (partno)
+		whole = bdget_disk(disk, 0);
+	else
+		whole = bdgrab(bdev);
+
+	module_put(disk->fops->owner);
+	put_disk(disk);
+	if (!whole)
+		return ERR_PTR(-ENOMEM);
+
+	/* prepare to claim, if successful, mark claiming in progress */
+	spin_lock(&bdev_lock);
+
+	err = bd_prepare_to_claim(bdev, whole, holder);
+	if (err == 0) {
+		whole->bd_claiming = holder;
+		spin_unlock(&bdev_lock);
+		return whole;
+	} else {
+		spin_unlock(&bdev_lock);
+		bdput(whole);
+		return ERR_PTR(err);
+	}
+}
+
+#ifdef CONFIG_SYSFS
+struct bd_holder_disk {
+	struct list_head	list;
+	struct gendisk		*disk;
+	int			refcnt;
+};
+
+static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
+						  struct gendisk *disk)
+{
+	struct bd_holder_disk *holder;
+
+	list_for_each_entry(holder, &bdev->bd_holder_disks, list)
+		if (holder->disk == disk)
+			return holder;
+	return NULL;
+}
+
+static int add_symlink(struct kobject *from, struct kobject *to)
+{
+	return sysfs_create_link(from, to, kobject_name(to));
+}
+
+static void del_symlink(struct kobject *from, struct kobject *to)
+{
+	sysfs_remove_link(from, kobject_name(to));
+}
+
+/**
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * This functions creates the following sysfs symlinks.
+ *
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
+ *
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
+ *
+ *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
+ *
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+	struct bd_holder_disk *holder;
+	int ret = 0;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	WARN_ON_ONCE(!bdev->bd_holder);
+
+	/* FIXME: remove the following once add_disk() handles errors */
+	if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
+		goto out_unlock;
+
+	holder = bd_find_holder_disk(bdev, disk);
+	if (holder) {
+		holder->refcnt++;
+		goto out_unlock;
+	}
+
+	holder = kzalloc(sizeof(*holder), GFP_KERNEL);
+	if (!holder) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	INIT_LIST_HEAD(&holder->list);
+	holder->disk = disk;
+	holder->refcnt = 1;
+
+	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+	if (ret)
+		goto out_free;
+
+	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+	if (ret)
+		goto out_del;
+	/*
+	 * bdev could be deleted beneath us which would implicitly destroy
+	 * the holder directory.  Hold on to it.
+	 */
+	kobject_get(bdev->bd_part->holder_dir);
+
+	list_add(&holder->list, &bdev->bd_holder_disks);
+	goto out_unlock;
+
+out_del:
+	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+out_free:
+	kfree(holder);
+out_unlock:
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
+
+/**
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the calimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+	struct bd_holder_disk *holder;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	holder = bd_find_holder_disk(bdev, disk);
+
+	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+		del_symlink(bdev->bd_part->holder_dir,
+			    &disk_to_dev(disk)->kobj);
+		kobject_put(bdev->bd_part->holder_dir);
+		list_del_init(&holder->list);
+		kfree(holder);
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+}
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+#endif
+
+/**
+ * flush_disk - invalidates all buffer-cache entries on a disk
+ *
+ * @bdev:      struct block device to be flushed
+ * @kill_dirty: flag to guide handling of dirty inodes
+ *
+ * Invalidates all buffer-cache entries on a disk. It should be called
+ * when a disk has been changed -- either by a media change or online
+ * resize.
+ */
+static void flush_disk(struct block_device *bdev, bool kill_dirty)
+{
+	if (__invalidate_device(bdev, kill_dirty)) {
+		char name[BDEVNAME_SIZE] = "";
+
+		if (bdev->bd_disk)
+			disk_name(bdev->bd_disk, 0, name);
+		printk(KERN_WARNING "VFS: busy inodes on changed media or "
+		       "resized disk %s\n", name);
+	}
+
+	if (!bdev->bd_disk)
+		return;
+	if (disk_part_scan_enabled(bdev->bd_disk))
+		bdev->bd_invalidated = 1;
+}
+
+/**
+ * check_disk_size_change - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @bdev: struct bdev to adjust.
+ *
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs.
+ */
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+{
+	loff_t disk_size, bdev_size;
+
+	disk_size = (loff_t)get_capacity(disk) << 9;
+	bdev_size = i_size_read(bdev->bd_inode);
+	if (disk_size != bdev_size) {
+		char name[BDEVNAME_SIZE];
+
+		disk_name(disk, 0, name);
+		printk(KERN_INFO
+		       "%s: detected capacity change from %lld to %lld\n",
+		       name, bdev_size, disk_size);
+		i_size_write(bdev->bd_inode, disk_size);
+		flush_disk(bdev, false);
+	}
+}
+EXPORT_SYMBOL(check_disk_size_change);
+
+/**
+ * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
+ * @disk: struct gendisk to be revalidated
+ *
+ * This routine is a wrapper for lower-level driver's revalidate_disk
+ * call-backs.  It is used to do common pre and post operations needed
+ * for all revalidate_disk operations.
+ */
+int revalidate_disk(struct gendisk *disk)
+{
+	struct block_device *bdev;
+	int ret = 0;
+
+	if (disk->fops->revalidate_disk)
+		ret = disk->fops->revalidate_disk(disk);
+
+	bdev = bdget_disk(disk, 0);
+	if (!bdev)
+		return ret;
+
+	mutex_lock(&bdev->bd_mutex);
+	check_disk_size_change(disk, bdev);
+	bdev->bd_invalidated = 0;
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
+	return ret;
+}
+EXPORT_SYMBOL(revalidate_disk);
+
+/*
+ * This routine checks whether a removable media has been changed,
+ * and invalidates all buffer-cache-entries in that case. This
+ * is a relatively slow routine, so we have to try to minimize using
+ * it. Thus it is called only upon a 'mount' or 'open'. This
+ * is the best way of combining speed and utility, I think.
+ * People changing diskettes in the middle of an operation deserve
+ * to lose :-)
+ */
+int check_disk_change(struct block_device *bdev)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	const struct block_device_operations *bdops = disk->fops;
+	unsigned int events;
+
+	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
+				   DISK_EVENT_EJECT_REQUEST);
+	if (!(events & DISK_EVENT_MEDIA_CHANGE))
+		return 0;
+
+	flush_disk(bdev, true);
+	if (bdops->revalidate_disk)
+		bdops->revalidate_disk(bdev->bd_disk);
+	return 1;
+}
+
+EXPORT_SYMBOL(check_disk_change);
+
+void bd_set_size(struct block_device *bdev, loff_t size)
+{
+	unsigned bsize = bdev_logical_block_size(bdev);
+
+	mutex_lock(&bdev->bd_inode->i_mutex);
+	i_size_write(bdev->bd_inode, size);
+	mutex_unlock(&bdev->bd_inode->i_mutex);
+	while (bsize < PAGE_CACHE_SIZE) {
+		if (size & bsize)
+			break;
+		bsize <<= 1;
+	}
+	bdev->bd_block_size = bsize;
+	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+EXPORT_SYMBOL(bd_set_size);
+
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
+
+/*
+ * bd_mutex locking:
+ *
+ *  mutex_lock(part->bd_mutex)
+ *    mutex_lock_nested(whole->bd_mutex, 1)
+ */
+
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
+{
+	struct gendisk *disk;
+	struct module *owner;
+	int ret;
+	int partno;
+	int perm = 0;
+
+	if (mode & FMODE_READ)
+		perm |= MAY_READ;
+	if (mode & FMODE_WRITE)
+		perm |= MAY_WRITE;
+	/*
+	 * hooks: /n/, see "layering violations".
+	 */
+	if (!for_part) {
+		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+		if (ret != 0) {
+			bdput(bdev);
+			return ret;
+		}
+	}
+
+ restart:
+
+	ret = -ENXIO;
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	if (!disk)
+		goto out;
+	owner = disk->fops->owner;
+
+	disk_block_events(disk);
+	mutex_lock_nested(&bdev->bd_mutex, for_part);
+	if (!bdev->bd_openers) {
+		bdev->bd_disk = disk;
+		bdev->bd_queue = disk->queue;
+		bdev->bd_contains = bdev;
+		if (!partno) {
+			ret = -ENXIO;
+			bdev->bd_part = disk_get_part(disk, partno);
+			if (!bdev->bd_part)
+				goto out_clear;
+
+			ret = 0;
+			if (disk->fops->open) {
+				ret = disk->fops->open(bdev, mode);
+				if (ret == -ERESTARTSYS) {
+					/* Lost a race with 'disk' being
+					 * deleted, try again.
+					 * See md.c
+					 */
+					disk_put_part(bdev->bd_part);
+					bdev->bd_part = NULL;
+					bdev->bd_disk = NULL;
+					bdev->bd_queue = NULL;
+					mutex_unlock(&bdev->bd_mutex);
+					disk_unblock_events(disk);
+					put_disk(disk);
+					module_put(owner);
+					goto restart;
+				}
+			}
+
+			if (!ret)
+				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+
+			/*
+			 * If the device is invalidated, rescan partition
+			 * if open succeeded or failed with -ENOMEDIUM.
+			 * The latter is necessary to prevent ghost
+			 * partitions on a removed medium.
+			 */
+			if (bdev->bd_invalidated) {
+				if (!ret)
+					rescan_partitions(disk, bdev);
+				else if (ret == -ENOMEDIUM)
+					invalidate_partitions(disk, bdev);
+			}
+			if (ret)
+				goto out_clear;
+		} else {
+			struct block_device *whole;
+			whole = bdget_disk(disk, 0);
+			ret = -ENOMEM;
+			if (!whole)
+				goto out_clear;
+			BUG_ON(for_part);
+			ret = __blkdev_get(whole, mode, 1);
+			if (ret)
+				goto out_clear;
+			bdev->bd_contains = whole;
+			bdev->bd_part = disk_get_part(disk, partno);
+			if (!(disk->flags & GENHD_FL_UP) ||
+			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
+				ret = -ENXIO;
+				goto out_clear;
+			}
+			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+		}
+	} else {
+		if (bdev->bd_contains == bdev) {
+			ret = 0;
+			if (bdev->bd_disk->fops->open)
+				ret = bdev->bd_disk->fops->open(bdev, mode);
+			/* the same as first opener case, read comment there */
+			if (bdev->bd_invalidated) {
+				if (!ret)
+					rescan_partitions(bdev->bd_disk, bdev);
+				else if (ret == -ENOMEDIUM)
+					invalidate_partitions(bdev->bd_disk, bdev);
+			}
+			if (ret)
+				goto out_unlock_bdev;
+		}
+		/* only one opener holds refs to the module and disk */
+		put_disk(disk);
+		module_put(owner);
+	}
+	bdev->bd_openers++;
+	if (for_part)
+		bdev->bd_part_count++;
+	mutex_unlock(&bdev->bd_mutex);
+	disk_unblock_events(disk);
+	return 0;
+
+ out_clear:
+	disk_put_part(bdev->bd_part);
+	bdev->bd_disk = NULL;
+	bdev->bd_part = NULL;
+	bdev->bd_queue = NULL;
+	if (bdev != bdev->bd_contains)
+		__blkdev_put(bdev->bd_contains, mode, 1);
+	bdev->bd_contains = NULL;
+ out_unlock_bdev:
+	mutex_unlock(&bdev->bd_mutex);
+	disk_unblock_events(disk);
+	put_disk(disk);
+	module_put(owner);
+ out:
+	bdput(bdev);
+
+	return ret;
+}
+
+/**
+ * blkdev_get - open a block device
+ * @bdev: block_device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
+ * open with exclusive access.  Specifying %FMODE_EXCL with %NULL
+ * @holder is invalid.  Exclusive opens may nest for the same @holder.
+ *
+ * On success, the reference count of @bdev is unchanged.  On failure,
+ * @bdev is put.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+{
+	struct block_device *whole = NULL;
+	int res;
+
+	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+
+	if ((mode & FMODE_EXCL) && holder) {
+		whole = bd_start_claiming(bdev, holder);
+		if (IS_ERR(whole)) {
+			bdput(bdev);
+			return PTR_ERR(whole);
+		}
+	}
+
+	res = __blkdev_get(bdev, mode, 0);
+
+	if (whole) {
+		struct gendisk *disk = whole->bd_disk;
+
+		/* finish claiming */
+		mutex_lock(&bdev->bd_mutex);
+		spin_lock(&bdev_lock);
+
+		if (!res) {
+			BUG_ON(!bd_may_claim(bdev, whole, holder));
+			/*
+			 * Note that for a whole device bd_holders
+			 * will be incremented twice, and bd_holder
+			 * will be set to bd_may_claim before being
+			 * set to holder
+			 */
+			whole->bd_holders++;
+			whole->bd_holder = bd_may_claim;
+			bdev->bd_holders++;
+			bdev->bd_holder = holder;
+		}
+
+		/* tell others that we're done */
+		BUG_ON(whole->bd_claiming != holder);
+		whole->bd_claiming = NULL;
+		wake_up_bit(&whole->bd_claiming, 0);
+
+		spin_unlock(&bdev_lock);
+
+		/*
+		 * Block event polling for write claims if requested.  Any
+		 * write holder makes the write_holder state stick until
+		 * all are released.  This is good enough and tracking
+		 * individual writeable reference is too fragile given the
+		 * way @mode is used in blkdev_get/put().
+		 */
+		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+			bdev->bd_write_holder = true;
+			disk_block_events(disk);
+		}
+
+		mutex_unlock(&bdev->bd_mutex);
+		bdput(whole);
+	}
+
+	return res;
+}
+EXPORT_SYMBOL(blkdev_get);
+
+/**
+ * blkdev_get_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by the device file at @path.  @mode
+ * and @holder are identical to blkdev_get().
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+					void *holder)
+{
+	struct block_device *bdev;
+	int err;
+
+	bdev = lookup_bdev(path);
+	if (IS_ERR(bdev))
+		return bdev;
+
+	err = blkdev_get(bdev, mode, holder);
+	if (err)
+		return ERR_PTR(err);
+
+	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+		blkdev_put(bdev, mode);
+		return ERR_PTR(-EACCES);
+	}
+
+	return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_path);
+
+/**
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by device number @dev.  @mode and
+ * @holder are identical to blkdev_get().
+ *
+ * Use it ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a
+ * device number.  _Never_ to be used for internal purposes.  If you
+ * ever need it - reconsider your API.
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
+{
+	struct block_device *bdev;
+	int err;
+
+	bdev = bdget(dev);
+	if (!bdev)
+		return ERR_PTR(-ENOMEM);
+
+	err = blkdev_get(bdev, mode, holder);
+	if (err)
+		return ERR_PTR(err);
+
+	return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
+
+static int blkdev_open(struct inode * inode, struct file * filp)
+{
+	struct block_device *bdev;
+
+	/*
+	 * Preserve backwards compatibility and allow large file access
+	 * even if userspace doesn't ask for it explicitly. Some mkfs
+	 * binary needs it. We might want to drop this workaround
+	 * during an unstable branch.
+	 */
+	filp->f_flags |= O_LARGEFILE;
+
+	if (filp->f_flags & O_NDELAY)
+		filp->f_mode |= FMODE_NDELAY;
+	if (filp->f_flags & O_EXCL)
+		filp->f_mode |= FMODE_EXCL;
+	if ((filp->f_flags & O_ACCMODE) == 3)
+		filp->f_mode |= FMODE_WRITE_IOCTL;
+
+	bdev = bd_acquire(inode);
+	if (bdev == NULL)
+		return -ENOMEM;
+
+	filp->f_mapping = bdev->bd_inode->i_mapping;
+
+	return blkdev_get(bdev, filp->f_mode, filp);
+}
+
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct block_device *victim = NULL;
+
+	mutex_lock_nested(&bdev->bd_mutex, for_part);
+	if (for_part)
+		bdev->bd_part_count--;
+
+	if (!--bdev->bd_openers) {
+		WARN_ON_ONCE(bdev->bd_holders);
+		sync_blockdev(bdev);
+		kill_bdev(bdev);
+		/*
+		 * ->release can cause the queue to disappear, so flush all
+		 * dirty data before.
+		 */
+		bdev_write_inode(bdev->bd_inode);
+	}
+	if (bdev->bd_contains == bdev) {
+		if (disk->fops->release)
+			disk->fops->release(disk, mode);
+	}
+	if (!bdev->bd_openers) {
+		struct module *owner = disk->fops->owner;
+
+		disk_put_part(bdev->bd_part);
+		bdev->bd_part = NULL;
+		bdev->bd_disk = NULL;
+		if (bdev != bdev->bd_contains)
+			victim = bdev->bd_contains;
+		bdev->bd_contains = NULL;
+
+		put_disk(disk);
+		module_put(owner);
+	}
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
+	if (victim)
+		__blkdev_put(victim, mode, 1);
+}
+
+void blkdev_put(struct block_device *bdev, fmode_t mode)
+{
+	mutex_lock(&bdev->bd_mutex);
+
+	if (mode & FMODE_EXCL) {
+		bool bdev_free;
+
+		/*
+		 * Release a claim on the device.  The holder fields
+		 * are protected with bdev_lock.  bd_mutex is to
+		 * synchronize disk_holder unlinking.
+		 */
+		spin_lock(&bdev_lock);
+
+		WARN_ON_ONCE(--bdev->bd_holders < 0);
+		WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
+
+		/* bd_contains might point to self, check in a separate step */
+		if ((bdev_free = !bdev->bd_holders))
+			bdev->bd_holder = NULL;
+		if (!bdev->bd_contains->bd_holders)
+			bdev->bd_contains->bd_holder = NULL;
+
+		spin_unlock(&bdev_lock);
+
+		/*
+		 * If this was the last claim, remove holder link and
+		 * unblock evpoll if it was a write holder.
+		 */
+		if (bdev_free && bdev->bd_write_holder) {
+			disk_unblock_events(bdev->bd_disk);
+			bdev->bd_write_holder = false;
+		}
+	}
+
+	/*
+	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
+	 * event.  This is to ensure detection of media removal commanded
+	 * from userland - e.g. eject(1).
+	 */
+	disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
+
+	mutex_unlock(&bdev->bd_mutex);
+
+	__blkdev_put(bdev, mode, 0);
+}
+EXPORT_SYMBOL(blkdev_put);
+
+static int blkdev_close(struct inode * inode, struct file * filp)
+{
+	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+	blkdev_put(bdev, filp->f_mode);
+	return 0;
+}
+
+static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+	fmode_t mode = file->f_mode;
+
+	/*
+	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+	 * to updated it before every ioctl.
+	 */
+	if (file->f_flags & O_NDELAY)
+		mode |= FMODE_NDELAY;
+	else
+		mode &= ~FMODE_NDELAY;
+
+	return blkdev_ioctl(bdev, mode, cmd, arg);
+}
+
+/*
+ * Write data to the block device.  Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *bd_inode = file->f_mapping->host;
+	loff_t size = i_size_read(bd_inode);
+	struct blk_plug plug;
+	ssize_t ret;
+
+	if (bdev_read_only(I_BDEV(bd_inode)))
+		return -EPERM;
+
+	if (!iov_iter_count(from))
+		return 0;
+
+	if (iocb->ki_pos >= size)
+		return -ENOSPC;
+
+	iov_iter_truncate(from, size - iocb->ki_pos);
+
+	blk_start_plug(&plug);
+	ret = __generic_file_write_iter(iocb, from);
+	if (ret > 0) {
+		ssize_t err;
+		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	blk_finish_plug(&plug);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_write_iter);
+
+ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *bd_inode = file->f_mapping->host;
+	loff_t size = i_size_read(bd_inode);
+	loff_t pos = iocb->ki_pos;
+
+	if (pos >= size)
+		return 0;
+
+	size -= pos;
+	iov_iter_truncate(to, size);
+	return generic_file_read_iter(iocb, to);
+}
+EXPORT_SYMBOL_GPL(blkdev_read_iter);
+
+/*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+ */
+static int blkdev_releasepage(struct page *page, gfp_t wait)
+{
+	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
+
+	if (super && super->s_op->bdev_try_to_free_page)
+		return super->s_op->bdev_try_to_free_page(super, page, wait);
+
+	return try_to_free_buffers(page);
+}
+
+static const struct address_space_operations def_blk_aops = {
+	.readpage	= blkdev_readpage,
+	.readpages	= blkdev_readpages,
+	.writepage	= blkdev_writepage,
+	.write_begin	= blkdev_write_begin,
+	.write_end	= blkdev_write_end,
+	.writepages	= generic_writepages,
+	.releasepage	= blkdev_releasepage,
+	.direct_IO	= blkdev_direct_IO,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
+};
+
+const struct file_operations def_blk_fops = {
+	.open		= blkdev_open,
+	.release	= blkdev_close,
+	.llseek		= block_llseek,
+	.read_iter	= blkdev_read_iter,
+	.write_iter	= blkdev_write_iter,
+	.mmap		= generic_file_mmap,
+	.fsync		= blkdev_fsync,
+	.unlocked_ioctl	= block_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_blkdev_ioctl,
+#endif
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+};
+
+int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
+{
+	int res;
+	mm_segment_t old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	res = blkdev_ioctl(bdev, 0, cmd, arg);
+	set_fs(old_fs);
+	return res;
+}
+
+EXPORT_SYMBOL(ioctl_by_bdev);
+
+/**
+ * lookup_bdev  - lookup a struct block_device by name
+ * @pathname:	special file representing the block device
+ *
+ * Get a reference to the blockdevice at @pathname in the current
+ * namespace if possible and return it.  Return ERR_PTR(error)
+ * otherwise.
+ */
+struct block_device *lookup_bdev(const char *pathname)
+{
+	struct block_device *bdev;
+	struct inode *inode;
+	struct path path;
+	int error;
+
+	if (!pathname || !*pathname)
+		return ERR_PTR(-EINVAL);
+
+	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
+	if (error)
+		return ERR_PTR(error);
+
+	inode = d_backing_inode(path.dentry);
+	error = -ENOTBLK;
+	if (!S_ISBLK(inode->i_mode))
+		goto fail;
+	error = -EACCES;
+	if (path.mnt->mnt_flags & MNT_NODEV)
+		goto fail;
+	error = -ENOMEM;
+	bdev = bd_acquire(inode);
+	if (!bdev)
+		goto fail;
+out:
+	path_put(&path);
+	return bdev;
+fail:
+	bdev = ERR_PTR(error);
+	goto out;
+}
+EXPORT_SYMBOL(lookup_bdev);
+
+int __invalidate_device(struct block_device *bdev, bool kill_dirty)
+{
+	struct super_block *sb = get_super(bdev);
+	int res = 0;
+
+	if (sb) {
+		/*
+		 * no need to lock the super, get_super holds the
+		 * read mutex so the filesystem cannot go away
+		 * under us (->put_super runs with the write lock
+		 * hold).
+		 */
+		shrink_dcache_sb(sb);
+		res = invalidate_inodes(sb, kill_dirty);
+		drop_super(sb);
+	}
+	invalidate_bdev(bdev);
+	return res;
+}
+EXPORT_SYMBOL(__invalidate_device);
+
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+{
+	struct inode *inode, *old_inode = NULL;
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+		struct address_space *mapping = inode->i_mapping;
+
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+		    mapping->nrpages == 0) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
+		/*
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
+		 */
+		iput(old_inode);
+		old_inode = inode;
+
+		func(I_BDEV(inode), arg);
+
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+	iput(old_inode);
+}
diff --git a/kernel/fs/btrfs/Kconfig b/kernel/fs/btrfs/Kconfig
new file mode 100644
index 000000000..80e9c18ea
--- /dev/null
+++ b/kernel/fs/btrfs/Kconfig
@@ -0,0 +1,91 @@
+config BTRFS_FS
+	tristate "Btrfs filesystem support"
+	select CRYPTO
+	select CRYPTO_CRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+	select SRCU
+
+	help
+	  Btrfs is a general purpose copy-on-write filesystem with extents,
+	  writable snapshotting, support for multiple devices and many more
+	  features focused on fault tolerance, repair and easy administration.
+
+	  The filesystem disk format is no longer unstable, and it's not
+	  expected to change unless there are strong reasons to do so. If there
+	  is a format change, file systems with a unchanged format will
+	  continue to be mountable and usable by newer kernels.
+
+	  For more information, please see the web pages at
+	  http://btrfs.wiki.kernel.org.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called btrfs.
+
+	  If unsure, say N.
+
+config BTRFS_FS_POSIX_ACL
+	bool "Btrfs POSIX Access Control Lists"
+	depends on BTRFS_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+	bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+	depends on BTRFS_FS
+	help
+	  Adds code that examines all block write requests (including
+	  writes of the super block). The goal is to verify that the
+	  state of the filesystem on disk is always consistent, i.e.,
+	  after a power-loss or kernel panic event the filesystem is
+	  in a consistent state.
+
+	  If the integrity check tool is included and activated in
+	  the mount options, plenty of kernel memory is used, and
+	  plenty of additional CPU cycles are spent. Enabling this
+	  functionality is not intended for normal use.
+
+	  In most cases, unless you are a btrfs developer who needs
+	  to verify the integrity of (super)-block write requests
+	  during the run of a regression test, say N
+
+config BTRFS_FS_RUN_SANITY_TESTS
+	bool "Btrfs will run sanity tests upon loading"
+	depends on BTRFS_FS
+	help
+	  This will run some basic sanity tests on the free space cache
+	  code to make sure it is acting as it should.  These are mostly
+	  regression tests and are only really interesting to btrfs
+	  developers.
+
+	  If unsure, say N.
+
+config BTRFS_DEBUG
+	bool "Btrfs debugging support"
+	depends on BTRFS_FS
+	help
+	  Enable run-time debugging support for the btrfs filesystem. This may
+	  enable additional and expensive checks with negative impact on
+	  performance, or export extra information via sysfs.
+
+	  If unsure, say N.
+
+config BTRFS_ASSERT
+	bool "Btrfs assert support"
+	depends on BTRFS_FS
+	help
+	  Enable run-time assertion checking.  This will result in panics if
+	  any of the assertions trip.  This is meant for btrfs developers only.
+
+	  If unsure, say N.
diff --git a/kernel/fs/btrfs/Makefile b/kernel/fs/btrfs/Makefile
new file mode 100644
index 000000000..6d1d0b93b
--- /dev/null
+++ b/kernel/fs/btrfs/Makefile
@@ -0,0 +1,19 @@
+
+obj-$(CONFIG_BTRFS_FS) := btrfs.o
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+	   file-item.o inode-item.o inode-map.o disk-io.o \
+	   transaction.o inode.o file.o tree-defrag.o \
+	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+	   uuid-tree.o props.o hash.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
+	tests/extent-buffer-tests.o tests/btrfs-tests.o \
+	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
diff --git a/kernel/fs/btrfs/acl.c b/kernel/fs/btrfs/acl.c
new file mode 100644
index 000000000..9a0124a95
--- /dev/null
+++ b/kernel/fs/btrfs/acl.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "xattr.h"
+
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+{
+	int size;
+	const char *name;
+	char *value = NULL;
+	struct posix_acl *acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+
+	size = __btrfs_getxattr(inode, name, "", 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = __btrfs_getxattr(inode, name, value, size);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
+		/* FIXME, who returns -ENOENT?  I think nobody */
+		acl = NULL;
+	} else {
+		acl = ERR_PTR(-EIO);
+	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+/*
+ * Needs to be called with fs_mutex held
+ */
+static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+			 struct inode *inode, struct posix_acl *acl, int type)
+{
+	int ret, size = 0;
+	const char *name;
+	char *value = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (ret < 0)
+				return ret;
+			if (ret == 0)
+				acl = NULL;
+		}
+		ret = 0;
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EINVAL : 0;
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_NOFS);
+		if (!value) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
+out:
+	kfree(value);
+
+	if (!ret)
+		set_cached_acl(inode, type, acl);
+
+	return ret;
+}
+
+int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	return __btrfs_set_acl(NULL, inode, acl, type);
+}
+
+/*
+ * btrfs_init_acl is already generally called under fs_mutex, so the locking
+ * stuff has been fixed to work with that.  If the locking stuff changes, we
+ * need to re-evaluate the acl locking stuff.
+ */
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+		   struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *default_acl, *acl;
+	int ret = 0;
+
+	/* this happens with subvols */
+	if (!dir)
+		return 0;
+
+	ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (ret)
+		return ret;
+
+	if (default_acl) {
+		ret = __btrfs_set_acl(trans, inode, default_acl,
+				      ACL_TYPE_DEFAULT);
+		posix_acl_release(default_acl);
+	}
+
+	if (acl) {
+		if (!ret)
+			ret = __btrfs_set_acl(trans, inode, acl,
+					      ACL_TYPE_ACCESS);
+		posix_acl_release(acl);
+	}
+
+	if (!default_acl && !acl)
+		cache_no_acl(inode);
+	return ret;
+}
diff --git a/kernel/fs/btrfs/async-thread.c b/kernel/fs/btrfs/async-thread.c
new file mode 100644
index 000000000..df9932b00
--- /dev/null
+++ b/kernel/fs/btrfs/async-thread.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2014 Fujitsu.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/freezer.h>
+#include "async-thread.h"
+#include "ctree.h"
+
+#define WORK_DONE_BIT 0
+#define WORK_ORDER_DONE_BIT 1
+#define WORK_HIGH_PRIO_BIT 2
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct __btrfs_workqueue {
+	struct workqueue_struct *normal_wq;
+	/* List head pointing to ordered work list */
+	struct list_head ordered_list;
+
+	/* Spinlock for ordered_list */
+	spinlock_t list_lock;
+
+	/* Thresholding related variants */
+	atomic_t pending;
+	int max_active;
+	int current_max;
+	int thresh;
+	unsigned int count;
+	spinlock_t thres_lock;
+};
+
+struct btrfs_workqueue {
+	struct __btrfs_workqueue *normal;
+	struct __btrfs_workqueue *high;
+};
+
+static void normal_work_helper(struct btrfs_work *work);
+
+#define BTRFS_WORK_HELPER(name)					\
+void btrfs_##name(struct work_struct *arg)				\
+{									\
+	struct btrfs_work *work = container_of(arg, struct btrfs_work,	\
+					       normal_work);		\
+	normal_work_helper(work);					\
+}
+
+BTRFS_WORK_HELPER(worker_helper);
+BTRFS_WORK_HELPER(delalloc_helper);
+BTRFS_WORK_HELPER(flush_delalloc_helper);
+BTRFS_WORK_HELPER(cache_helper);
+BTRFS_WORK_HELPER(submit_helper);
+BTRFS_WORK_HELPER(fixup_helper);
+BTRFS_WORK_HELPER(endio_helper);
+BTRFS_WORK_HELPER(endio_meta_helper);
+BTRFS_WORK_HELPER(endio_meta_write_helper);
+BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
+BTRFS_WORK_HELPER(rmw_helper);
+BTRFS_WORK_HELPER(endio_write_helper);
+BTRFS_WORK_HELPER(freespace_write_helper);
+BTRFS_WORK_HELPER(delayed_meta_helper);
+BTRFS_WORK_HELPER(readahead_helper);
+BTRFS_WORK_HELPER(qgroup_rescan_helper);
+BTRFS_WORK_HELPER(extent_refs_helper);
+BTRFS_WORK_HELPER(scrub_helper);
+BTRFS_WORK_HELPER(scrubwrc_helper);
+BTRFS_WORK_HELPER(scrubnc_helper);
+
+static struct __btrfs_workqueue *
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
+			 int thresh)
+{
+	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+
+	if (!ret)
+		return NULL;
+
+	ret->max_active = max_active;
+	atomic_set(&ret->pending, 0);
+	if (thresh == 0)
+		thresh = DFT_THRESHOLD;
+	/* For low threshold, disabling threshold is a better choice */
+	if (thresh < DFT_THRESHOLD) {
+		ret->current_max = max_active;
+		ret->thresh = NO_THRESHOLD;
+	} else {
+		ret->current_max = 1;
+		ret->thresh = thresh;
+	}
+
+	if (flags & WQ_HIGHPRI)
+		ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
+						 ret->max_active,
+						 "btrfs", name);
+	else
+		ret->normal_wq = alloc_workqueue("%s-%s", flags,
+						 ret->max_active, "btrfs",
+						 name);
+	if (!ret->normal_wq) {
+		kfree(ret);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&ret->ordered_list);
+	spin_lock_init(&ret->list_lock);
+	spin_lock_init(&ret->thres_lock);
+	trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
+	return ret;
+}
+
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
+
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+					      unsigned int flags,
+					      int max_active,
+					      int thresh)
+{
+	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+
+	if (!ret)
+		return NULL;
+
+	ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+					      max_active, thresh);
+	if (!ret->normal) {
+		kfree(ret);
+		return NULL;
+	}
+
+	if (flags & WQ_HIGHPRI) {
+		ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+						    thresh);
+		if (!ret->high) {
+			__btrfs_destroy_workqueue(ret->normal);
+			kfree(ret);
+			return NULL;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
+ */
+static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
+{
+	if (wq->thresh == NO_THRESHOLD)
+		return;
+	atomic_inc(&wq->pending);
+}
+
+/*
+ * Hook for threshold which will be called before executing the work,
+ * This hook is called in kthread content.
+ * So workqueue_set_max_active is called here.
+ */
+static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
+{
+	int new_max_active;
+	long pending;
+	int need_change = 0;
+
+	if (wq->thresh == NO_THRESHOLD)
+		return;
+
+	atomic_dec(&wq->pending);
+	spin_lock(&wq->thres_lock);
+	/*
+	 * Use wq->count to limit the calling frequency of
+	 * workqueue_set_max_active.
+	 */
+	wq->count++;
+	wq->count %= (wq->thresh / 4);
+	if (!wq->count)
+		goto  out;
+	new_max_active = wq->current_max;
+
+	/*
+	 * pending may be changed later, but it's OK since we really
+	 * don't need it so accurate to calculate new_max_active.
+	 */
+	pending = atomic_read(&wq->pending);
+	if (pending > wq->thresh)
+		new_max_active++;
+	if (pending < wq->thresh / 2)
+		new_max_active--;
+	new_max_active = clamp_val(new_max_active, 1, wq->max_active);
+	if (new_max_active != wq->current_max)  {
+		need_change = 1;
+		wq->current_max = new_max_active;
+	}
+out:
+	spin_unlock(&wq->thres_lock);
+
+	if (need_change) {
+		workqueue_set_max_active(wq->normal_wq, wq->current_max);
+	}
+}
+
+static void run_ordered_work(struct __btrfs_workqueue *wq)
+{
+	struct list_head *list = &wq->ordered_list;
+	struct btrfs_work *work;
+	spinlock_t *lock = &wq->list_lock;
+	unsigned long flags;
+
+	while (1) {
+		spin_lock_irqsave(lock, flags);
+		if (list_empty(list))
+			break;
+		work = list_entry(list->next, struct btrfs_work,
+				  ordered_list);
+		if (!test_bit(WORK_DONE_BIT, &work->flags))
+			break;
+
+		/*
+		 * we are going to call the ordered done function, but
+		 * we leave the work item on the list as a barrier so
+		 * that later work items that are done don't have their
+		 * functions called before this one returns
+		 */
+		if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+			break;
+		trace_btrfs_ordered_sched(work);
+		spin_unlock_irqrestore(lock, flags);
+		work->ordered_func(work);
+
+		/* now take the lock again and drop our item from the list */
+		spin_lock_irqsave(lock, flags);
+		list_del(&work->ordered_list);
+		spin_unlock_irqrestore(lock, flags);
+
+		/*
+		 * we don't want to call the ordered free functions
+		 * with the lock held though
+		 */
+		work->ordered_free(work);
+		trace_btrfs_all_work_done(work);
+	}
+	spin_unlock_irqrestore(lock, flags);
+}
+
+static void normal_work_helper(struct btrfs_work *work)
+{
+	struct __btrfs_workqueue *wq;
+	int need_order = 0;
+
+	/*
+	 * We should not touch things inside work in the following cases:
+	 * 1) after work->func() if it has no ordered_free
+	 *    Since the struct is freed in work->func().
+	 * 2) after setting WORK_DONE_BIT
+	 *    The work may be freed in other threads almost instantly.
+	 * So we save the needed things here.
+	 */
+	if (work->ordered_func)
+		need_order = 1;
+	wq = work->wq;
+
+	trace_btrfs_work_sched(work);
+	thresh_exec_hook(wq);
+	work->func(work);
+	if (need_order) {
+		set_bit(WORK_DONE_BIT, &work->flags);
+		run_ordered_work(wq);
+	}
+	if (!need_order)
+		trace_btrfs_all_work_done(work);
+}
+
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
+		     btrfs_func_t func,
+		     btrfs_func_t ordered_func,
+		     btrfs_func_t ordered_free)
+{
+	work->func = func;
+	work->ordered_func = ordered_func;
+	work->ordered_free = ordered_free;
+	INIT_WORK(&work->normal_work, uniq_func);
+	INIT_LIST_HEAD(&work->ordered_list);
+	work->flags = 0;
+}
+
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+				      struct btrfs_work *work)
+{
+	unsigned long flags;
+
+	work->wq = wq;
+	thresh_queue_hook(wq);
+	if (work->ordered_func) {
+		spin_lock_irqsave(&wq->list_lock, flags);
+		list_add_tail(&work->ordered_list, &wq->ordered_list);
+		spin_unlock_irqrestore(&wq->list_lock, flags);
+	}
+	queue_work(wq->normal_wq, &work->normal_work);
+	trace_btrfs_work_queued(work);
+}
+
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+		      struct btrfs_work *work)
+{
+	struct __btrfs_workqueue *dest_wq;
+
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+		dest_wq = wq->high;
+	else
+		dest_wq = wq->normal;
+	__btrfs_queue_work(dest_wq, work);
+}
+
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
+{
+	destroy_workqueue(wq->normal_wq);
+	trace_btrfs_workqueue_destroy(wq);
+	kfree(wq);
+}
+
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
+{
+	if (!wq)
+		return;
+	if (wq->high)
+		__btrfs_destroy_workqueue(wq->high);
+	__btrfs_destroy_workqueue(wq->normal);
+	kfree(wq);
+}
+
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
+{
+	if (!wq)
+		return;
+	wq->normal->max_active = max;
+	if (wq->high)
+		wq->high->max_active = max;
+}
+
+void btrfs_set_work_high_priority(struct btrfs_work *work)
+{
+	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
diff --git a/kernel/fs/btrfs/async-thread.h b/kernel/fs/btrfs/async-thread.h
new file mode 100644
index 000000000..ec2ee477f
--- /dev/null
+++ b/kernel/fs/btrfs/async-thread.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2014 Fujitsu.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ASYNC_THREAD_
+#define __BTRFS_ASYNC_THREAD_
+#include <linux/workqueue.h>
+
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_work_func_t)(struct work_struct *arg);
+
+struct btrfs_work {
+	btrfs_func_t func;
+	btrfs_func_t ordered_func;
+	btrfs_func_t ordered_free;
+
+	/* Don't touch things below */
+	struct work_struct normal_work;
+	struct list_head ordered_list;
+	struct __btrfs_workqueue *wq;
+	unsigned long flags;
+};
+
+#define BTRFS_WORK_HELPER_PROTO(name)					\
+void btrfs_##name(struct work_struct *arg)
+
+BTRFS_WORK_HELPER_PROTO(worker_helper);
+BTRFS_WORK_HELPER_PROTO(delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(cache_helper);
+BTRFS_WORK_HELPER_PROTO(submit_helper);
+BTRFS_WORK_HELPER_PROTO(fixup_helper);
+BTRFS_WORK_HELPER_PROTO(endio_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
+BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
+BTRFS_WORK_HELPER_PROTO(rmw_helper);
+BTRFS_WORK_HELPER_PROTO(endio_write_helper);
+BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
+BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
+BTRFS_WORK_HELPER_PROTO(readahead_helper);
+BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
+BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
+BTRFS_WORK_HELPER_PROTO(scrub_helper);
+BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+					      unsigned int flags,
+					      int max_active,
+					      int thresh);
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
+		     btrfs_func_t func,
+		     btrfs_func_t ordered_func,
+		     btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+		      struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
+#endif
diff --git a/kernel/fs/btrfs/backref.c b/kernel/fs/btrfs/backref.c
new file mode 100644
index 000000000..614aaa196
--- /dev/null
+++ b/kernel/fs/btrfs/backref.c
@@ -0,0 +1,1978 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
+#include "locking.h"
+
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
+struct extent_inode_elem {
+	u64 inum;
+	u64 offset;
+	struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+				struct btrfs_file_extent_item *fi,
+				u64 extent_item_pos,
+				struct extent_inode_elem **eie)
+{
+	u64 offset = 0;
+	struct extent_inode_elem *e;
+
+	if (!btrfs_file_extent_compression(eb, fi) &&
+	    !btrfs_file_extent_encryption(eb, fi) &&
+	    !btrfs_file_extent_other_encoding(eb, fi)) {
+		u64 data_offset;
+		u64 data_len;
+
+		data_offset = btrfs_file_extent_offset(eb, fi);
+		data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+		if (extent_item_pos < data_offset ||
+		    extent_item_pos >= data_offset + data_len)
+			return 1;
+		offset = extent_item_pos - data_offset;
+	}
+
+	e = kmalloc(sizeof(*e), GFP_NOFS);
+	if (!e)
+		return -ENOMEM;
+
+	e->next = *eie;
+	e->inum = key->objectid;
+	e->offset = key->offset + offset;
+	*eie = e;
+
+	return 0;
+}
+
+static void free_inode_elem_list(struct extent_inode_elem *eie)
+{
+	struct extent_inode_elem *eie_next;
+
+	for (; eie; eie = eie_next) {
+		eie_next = eie->next;
+		kfree(eie);
+	}
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+				u64 extent_item_pos,
+				struct extent_inode_elem **eie)
+{
+	u64 disk_byte;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int slot;
+	int nritems;
+	int extent_type;
+	int ret;
+
+	/*
+	 * from the shared data ref, we only have the leaf but we need
+	 * the key. thus, we must look into all items and see that we
+	 * find one (some) with a reference to our extent item.
+	 */
+	nritems = btrfs_header_nritems(eb);
+	for (slot = 0; slot < nritems; ++slot) {
+		btrfs_item_key_to_cpu(eb, &key, slot);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (disk_byte != wanted_disk_byte)
+			continue;
+
+		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
+	struct list_head list;
+	u64 root_id;
+	struct btrfs_key key_for_search;
+	int level;
+	int count;
+	struct extent_inode_elem *inode_list;
+	u64 parent;
+	u64 wanted_disk_byte;
+};
+
+static struct kmem_cache *btrfs_prelim_ref_cache;
+
+int __init btrfs_prelim_ref_init(void)
+{
+	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
+					sizeof(struct __prelim_ref),
+					0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!btrfs_prelim_ref_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void btrfs_prelim_ref_exit(void)
+{
+	if (btrfs_prelim_ref_cache)
+		kmem_cache_destroy(btrfs_prelim_ref_cache);
+}
+
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ *   block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    -   |     -
+ *      key to resolve |    -   |     y    |    y   |     y
+ *  tree block logical |    -   |     -    |    -   |     -
+ *  root for resolving |    y   |     y    |    y   |     y
+ *
+ * - column 1:       we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    y   |     -
+ *      key to resolve |    -   |     -    |    -   |     y
+ *  tree block logical |    y   |     y    |    y   |     y
+ *  root for resolving |    -   |     y    |    y   |     y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2:    we take the first key from the block to find the parent
+ *                (see __add_missing_keys)
+ * - column 4:    we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+			    struct btrfs_key *key, int level,
+			    u64 parent, u64 wanted_disk_byte, int count,
+			    gfp_t gfp_mask)
+{
+	struct __prelim_ref *ref;
+
+	if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
+		return 0;
+
+	ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->root_id = root_id;
+	if (key)
+		ref->key_for_search = *key;
+	else
+		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+
+	ref->inode_list = NULL;
+	ref->level = level;
+	ref->count = count;
+	ref->parent = parent;
+	ref->wanted_disk_byte = wanted_disk_byte;
+	list_add_tail(&ref->list, head);
+
+	return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+			   struct ulist *parents, struct __prelim_ref *ref,
+			   int level, u64 time_seq, const u64 *extent_item_pos,
+			   u64 total_refs)
+{
+	int ret = 0;
+	int slot;
+	struct extent_buffer *eb;
+	struct btrfs_key key;
+	struct btrfs_key *key_for_search = &ref->key_for_search;
+	struct btrfs_file_extent_item *fi;
+	struct extent_inode_elem *eie = NULL, *old = NULL;
+	u64 disk_byte;
+	u64 wanted_disk_byte = ref->wanted_disk_byte;
+	u64 count = 0;
+
+	if (level != 0) {
+		eb = path->nodes[level];
+		ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+		if (ret < 0)
+			return ret;
+		return 0;
+	}
+
+	/*
+	 * We normally enter this function with the path already pointing to
+	 * the first item to check. But sometimes, we may enter it with
+	 * slot==nritems. In that case, go to the next leaf before we continue.
+	 */
+	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+		ret = btrfs_next_old_leaf(root, path, time_seq);
+
+	while (!ret && count < total_refs) {
+		eb = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(eb, &key, slot);
+
+		if (key.objectid != key_for_search->objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+
+		if (disk_byte == wanted_disk_byte) {
+			eie = NULL;
+			old = NULL;
+			count++;
+			if (extent_item_pos) {
+				ret = check_extent_in_eb(&key, eb, fi,
+						*extent_item_pos,
+						&eie);
+				if (ret < 0)
+					break;
+			}
+			if (ret > 0)
+				goto next;
+			ret = ulist_add_merge_ptr(parents, eb->start,
+						  eie, (void **)&old, GFP_NOFS);
+			if (ret < 0)
+				break;
+			if (!ret && extent_item_pos) {
+				while (old->next)
+					old = old->next;
+				old->next = eie;
+			}
+			eie = NULL;
+		}
+next:
+		ret = btrfs_next_old_item(root, path, time_seq);
+	}
+
+	if (ret > 0)
+		ret = 0;
+	else if (ret < 0)
+		free_inode_elem_list(eie);
+	return ret;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+				  struct btrfs_path *path, u64 time_seq,
+				  struct __prelim_ref *ref,
+				  struct ulist *parents,
+				  const u64 *extent_item_pos, u64 total_refs)
+{
+	struct btrfs_root *root;
+	struct btrfs_key root_key;
+	struct extent_buffer *eb;
+	int ret = 0;
+	int root_level;
+	int level = ref->level;
+	int index;
+
+	root_key.objectid = ref->root_id;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		ret = PTR_ERR(root);
+		goto out;
+	}
+
+	if (path->search_commit_root)
+		root_level = btrfs_header_level(root->commit_root);
+	else
+		root_level = btrfs_old_root_level(root, time_seq);
+
+	if (root_level + 1 == level) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		goto out;
+	}
+
+	path->lowest_level = level;
+	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+
+	/* root node has been locked, we can release @subvol_srcu safely here */
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+		 "%d for key (%llu %u %llu)\n",
+		 ref->root_id, level, ref->count, ret,
+		 ref->key_for_search.objectid, ref->key_for_search.type,
+		 ref->key_for_search.offset);
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[level];
+	while (!eb) {
+		if (WARN_ON(!level)) {
+			ret = 1;
+			goto out;
+		}
+		level--;
+		eb = path->nodes[level];
+	}
+
+	ret = add_all_parents(root, path, parents, ref, level, time_seq,
+			      extent_item_pos, total_refs);
+out:
+	path->lowest_level = 0;
+	btrfs_release_path(path);
+	return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+				   struct btrfs_path *path, u64 time_seq,
+				   struct list_head *head,
+				   const u64 *extent_item_pos, u64 total_refs,
+				   u64 root_objectid)
+{
+	int err;
+	int ret = 0;
+	struct __prelim_ref *ref;
+	struct __prelim_ref *ref_safe;
+	struct __prelim_ref *new_ref;
+	struct ulist *parents;
+	struct ulist_node *node;
+	struct ulist_iterator uiter;
+
+	parents = ulist_alloc(GFP_NOFS);
+	if (!parents)
+		return -ENOMEM;
+
+	/*
+	 * _safe allows us to insert directly after the current item without
+	 * iterating over the newly inserted items.
+	 * we're also allowed to re-assign ref during iteration.
+	 */
+	list_for_each_entry_safe(ref, ref_safe, head, list) {
+		if (ref->parent)	/* already direct */
+			continue;
+		if (ref->count == 0)
+			continue;
+		if (root_objectid && ref->root_id != root_objectid) {
+			ret = BACKREF_FOUND_SHARED;
+			goto out;
+		}
+		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+					     parents, extent_item_pos,
+					     total_refs);
+		/*
+		 * we can only tolerate ENOENT,otherwise,we should catch error
+		 * and return directly.
+		 */
+		if (err == -ENOENT) {
+			continue;
+		} else if (err) {
+			ret = err;
+			goto out;
+		}
+
+		/* we put the first parent into the ref at hand */
+		ULIST_ITER_INIT(&uiter);
+		node = ulist_next(parents, &uiter);
+		ref->parent = node ? node->val : 0;
+		ref->inode_list = node ?
+			(struct extent_inode_elem *)(uintptr_t)node->aux : NULL;
+
+		/* additional parents require new refs being added here */
+		while ((node = ulist_next(parents, &uiter))) {
+			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
+						   GFP_NOFS);
+			if (!new_ref) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			memcpy(new_ref, ref, sizeof(*ref));
+			new_ref->parent = node->val;
+			new_ref->inode_list = (struct extent_inode_elem *)
+							(uintptr_t)node->aux;
+			list_add(&new_ref->list, &ref->list);
+		}
+		ulist_reinit(parents);
+	}
+out:
+	ulist_free(parents);
+	return ret;
+}
+
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+				     struct __prelim_ref *ref2)
+{
+	if (ref1->level != ref2->level)
+		return 0;
+	if (ref1->root_id != ref2->root_id)
+		return 0;
+	if (ref1->key_for_search.type != ref2->key_for_search.type)
+		return 0;
+	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+		return 0;
+	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+		return 0;
+	if (ref1->parent != ref2->parent)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+			      struct list_head *head)
+{
+	struct list_head *pos;
+	struct extent_buffer *eb;
+
+	list_for_each(pos, head) {
+		struct __prelim_ref *ref;
+		ref = list_entry(pos, struct __prelim_ref, list);
+
+		if (ref->parent)
+			continue;
+		if (ref->key_for_search.type)
+			continue;
+		BUG_ON(!ref->wanted_disk_byte);
+		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+				     0);
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			return -EIO;
+		}
+		btrfs_tree_read_lock(eb);
+		if (btrfs_header_level(eb) == 0)
+			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+		else
+			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return 0;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ *           additionally, we could even add a key range for the blocks we
+ *           looked into to merge even more (-> replace unresolved refs by those
+ *           having a parent).
+ * mode = 2: merge identical parents
+ */
+static void __merge_refs(struct list_head *head, int mode)
+{
+	struct list_head *pos1;
+
+	list_for_each(pos1, head) {
+		struct list_head *n2;
+		struct list_head *pos2;
+		struct __prelim_ref *ref1;
+
+		ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+		     pos2 = n2, n2 = pos2->next) {
+			struct __prelim_ref *ref2;
+			struct __prelim_ref *xchg;
+			struct extent_inode_elem *eie;
+
+			ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+			if (mode == 1) {
+				if (!ref_for_same_block(ref1, ref2))
+					continue;
+				if (!ref1->parent && ref2->parent) {
+					xchg = ref1;
+					ref1 = ref2;
+					ref2 = xchg;
+				}
+			} else {
+				if (ref1->parent != ref2->parent)
+					continue;
+			}
+
+			eie = ref1->inode_list;
+			while (eie && eie->next)
+				eie = eie->next;
+			if (eie)
+				eie->next = ref2->inode_list;
+			else
+				ref1->inode_list = ref2->inode_list;
+			ref1->count += ref2->count;
+
+			list_del(&ref2->list);
+			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
+		}
+
+	}
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+			      struct list_head *prefs, u64 *total_refs,
+			      u64 inum)
+{
+	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+	struct rb_node *n = &head->node.rb_node;
+	struct btrfs_key key;
+	struct btrfs_key op_key = {0};
+	int sgn;
+	int ret = 0;
+
+	if (extent_op && extent_op->update_key)
+		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
+
+	spin_lock(&head->lock);
+	n = rb_first(&head->ref_root);
+	while (n) {
+		struct btrfs_delayed_ref_node *node;
+		node = rb_entry(n, struct btrfs_delayed_ref_node,
+				rb_node);
+		n = rb_next(n);
+		if (node->seq > seq)
+			continue;
+
+		switch (node->action) {
+		case BTRFS_ADD_DELAYED_EXTENT:
+		case BTRFS_UPDATE_DELAYED_HEAD:
+			WARN_ON(1);
+			continue;
+		case BTRFS_ADD_DELAYED_REF:
+			sgn = 1;
+			break;
+		case BTRFS_DROP_DELAYED_REF:
+			sgn = -1;
+			break;
+		default:
+			BUG_ON(1);
+		}
+		*total_refs += (node->ref_mod * sgn);
+		switch (node->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, &op_key,
+					       ref->level + 1, 0, node->bytenr,
+					       node->ref_mod * sgn, GFP_ATOMIC);
+			break;
+		}
+		case BTRFS_SHARED_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, NULL,
+					       ref->level + 1, ref->parent,
+					       node->bytenr,
+					       node->ref_mod * sgn, GFP_ATOMIC);
+			break;
+		}
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+
+			/*
+			 * Found a inum that doesn't match our known inum, we
+			 * know it's shared.
+			 */
+			if (inum && ref->objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+					       node->bytenr,
+					       node->ref_mod * sgn, GFP_ATOMIC);
+			break;
+		}
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+					       ref->parent, node->bytenr,
+					       node->ref_mod * sgn, GFP_ATOMIC);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		if (ret)
+			break;
+	}
+	spin_unlock(&head->lock);
+	return ret;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path, u64 bytenr,
+			     int *info_level, struct list_head *prefs,
+			     u64 *total_refs, u64 inum)
+{
+	int ret = 0;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	unsigned long ptr;
+	unsigned long end;
+	struct btrfs_extent_item *ei;
+	u64 flags;
+	u64 item_size;
+
+	/*
+	 * enumerate all inline refs
+	 */
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+
+	item_size = btrfs_item_size_nr(leaf, slot);
+	BUG_ON(item_size < sizeof(*ei));
+
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+	*total_refs += btrfs_extent_refs(leaf, ei);
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+
+	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		struct btrfs_tree_block_info *info;
+
+		info = (struct btrfs_tree_block_info *)ptr;
+		*info_level = btrfs_tree_block_level(leaf, info);
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
+	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+		*info_level = found_key.offset;
+	} else {
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+	}
+
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		u64 offset;
+		int type;
+
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(leaf, iref);
+		offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+		switch (type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, NULL,
+						*info_level + 1, offset,
+						bytenr, 1, GFP_NOFS);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+					       bytenr, count, GFP_NOFS);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, offset, NULL,
+					       *info_level + 1, 0,
+					       bytenr, 1, GFP_NOFS);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+					       bytenr, count, GFP_NOFS);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		if (ret)
+			return ret;
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+
+	return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+			    struct btrfs_path *path, u64 bytenr,
+			    int info_level, struct list_head *prefs, u64 inum)
+{
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	while (1) {
+		ret = btrfs_next_item(extent_root, path);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = 0;
+			break;
+		}
+
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid != bytenr)
+			break;
+		if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+			continue;
+		if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+			break;
+
+		switch (key.type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, NULL,
+						info_level + 1, key.offset,
+						bytenr, 1, GFP_NOFS);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = btrfs_item_ptr(leaf, slot,
+					      struct btrfs_shared_data_ref);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+						bytenr, count, GFP_NOFS);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, key.offset, NULL,
+					       info_level + 1, 0,
+					       bytenr, 1, GFP_NOFS);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = btrfs_item_ptr(leaf, slot,
+					      struct btrfs_extent_data_ref);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+					       bytenr, count, GFP_NOFS);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		if (ret)
+			return ret;
+
+	}
+
+	return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * NOTE: This can return values > 0
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info, u64 bytenr,
+			     u64 time_seq, struct ulist *refs,
+			     struct ulist *roots, const u64 *extent_item_pos,
+			     u64 root_objectid, u64 inum)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_delayed_ref_root *delayed_refs = NULL;
+	struct btrfs_delayed_ref_head *head;
+	int info_level = 0;
+	int ret;
+	struct list_head prefs_delayed;
+	struct list_head prefs;
+	struct __prelim_ref *ref;
+	struct extent_inode_elem *eie = NULL;
+	u64 total_refs = 0;
+
+	INIT_LIST_HEAD(&prefs);
+	INIT_LIST_HEAD(&prefs_delayed);
+
+	key.objectid = bytenr;
+	key.offset = (u64)-1;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	if (!trans) {
+		path->search_commit_root = 1;
+		path->skip_locking = 1;
+	}
+
+	/*
+	 * grab both a lock on the path and a lock on the delayed ref head.
+	 * We need both to get a consistent picture of how the refs look
+	 * at a specified point in time
+	 */
+again:
+	head = NULL;
+
+	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	if (trans && likely(trans->type != __TRANS_DUMMY)) {
+#else
+	if (trans) {
+#endif
+		/*
+		 * look if there are updates for this ref queued and lock the
+		 * head
+		 */
+		delayed_refs = &trans->transaction->delayed_refs;
+		spin_lock(&delayed_refs->lock);
+		head = btrfs_find_delayed_ref_head(trans, bytenr);
+		if (head) {
+			if (!mutex_trylock(&head->mutex)) {
+				atomic_inc(&head->node.refs);
+				spin_unlock(&delayed_refs->lock);
+
+				btrfs_release_path(path);
+
+				/*
+				 * Mutex was contended, block until it's
+				 * released and try again
+				 */
+				mutex_lock(&head->mutex);
+				mutex_unlock(&head->mutex);
+				btrfs_put_delayed_ref(&head->node);
+				goto again;
+			}
+			spin_unlock(&delayed_refs->lock);
+			ret = __add_delayed_refs(head, time_seq,
+						 &prefs_delayed, &total_refs,
+						 inum);
+			mutex_unlock(&head->mutex);
+			if (ret)
+				goto out;
+		} else {
+			spin_unlock(&delayed_refs->lock);
+		}
+	}
+
+	if (path->slots[0]) {
+		struct extent_buffer *leaf;
+		int slot;
+
+		path->slots[0]--;
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid == bytenr &&
+		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		     key.type == BTRFS_METADATA_ITEM_KEY)) {
+			ret = __add_inline_refs(fs_info, path, bytenr,
+						&info_level, &prefs,
+						&total_refs, inum);
+			if (ret)
+				goto out;
+			ret = __add_keyed_refs(fs_info, path, bytenr,
+					       info_level, &prefs, inum);
+			if (ret)
+				goto out;
+		}
+	}
+	btrfs_release_path(path);
+
+	list_splice_init(&prefs_delayed, &prefs);
+
+	ret = __add_missing_keys(fs_info, &prefs);
+	if (ret)
+		goto out;
+
+	__merge_refs(&prefs, 1);
+
+	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+				      extent_item_pos, total_refs,
+				      root_objectid);
+	if (ret)
+		goto out;
+
+	__merge_refs(&prefs, 2);
+
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		WARN_ON(ref->count < 0);
+		if (roots && ref->count && ref->root_id && ref->parent == 0) {
+			if (root_objectid && ref->root_id != root_objectid) {
+				ret = BACKREF_FOUND_SHARED;
+				goto out;
+			}
+
+			/* no parent == root of tree */
+			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+			if (ret < 0)
+				goto out;
+		}
+		if (ref->count && ref->parent) {
+			if (extent_item_pos && !ref->inode_list &&
+			    ref->level == 0) {
+				struct extent_buffer *eb;
+
+				eb = read_tree_block(fs_info->extent_root,
+							   ref->parent, 0);
+				if (!eb || !extent_buffer_uptodate(eb)) {
+					free_extent_buffer(eb);
+					ret = -EIO;
+					goto out;
+				}
+				btrfs_tree_read_lock(eb);
+				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+				ret = find_extent_in_eb(eb, bytenr,
+							*extent_item_pos, &eie);
+				btrfs_tree_read_unlock_blocking(eb);
+				free_extent_buffer(eb);
+				if (ret < 0)
+					goto out;
+				ref->inode_list = eie;
+			}
+			ret = ulist_add_merge_ptr(refs, ref->parent,
+						  ref->inode_list,
+						  (void **)&eie, GFP_NOFS);
+			if (ret < 0)
+				goto out;
+			if (!ret && extent_item_pos) {
+				/*
+				 * we've recorded that parent, so we must extend
+				 * its inode list here
+				 */
+				BUG_ON(!eie);
+				while (eie->next)
+					eie = eie->next;
+				eie->next = ref->inode_list;
+			}
+			eie = NULL;
+		}
+		list_del(&ref->list);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
+	}
+
+out:
+	btrfs_free_path(path);
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		list_del(&ref->list);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
+	}
+	while (!list_empty(&prefs_delayed)) {
+		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+				       list);
+		list_del(&ref->list);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
+	}
+	if (ret < 0)
+		free_inode_elem_list(eie);
+	return ret;
+}
+
+static void free_leaf_list(struct ulist *blocks)
+{
+	struct ulist_node *node = NULL;
+	struct extent_inode_elem *eie;
+	struct ulist_iterator uiter;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((node = ulist_next(blocks, &uiter))) {
+		if (!node->aux)
+			continue;
+		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
+		free_inode_elem_list(eie);
+		node->aux = 0;
+	}
+
+	ulist_free(blocks);
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 time_seq, struct ulist **leafs,
+				const u64 *extent_item_pos)
+{
+	int ret;
+
+	*leafs = ulist_alloc(GFP_NOFS);
+	if (!*leafs)
+		return -ENOMEM;
+
+	ret = find_parent_nodes(trans, fs_info, bytenr,
+				time_seq, *leafs, NULL, extent_item_pos, 0, 0);
+	if (ret < 0 && ret != -ENOENT) {
+		free_leaf_list(*leafs);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info, u64 bytenr,
+				  u64 time_seq, struct ulist **roots)
+{
+	struct ulist *tmp;
+	struct ulist_node *node = NULL;
+	struct ulist_iterator uiter;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*roots = ulist_alloc(GFP_NOFS);
+	if (!*roots) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	ULIST_ITER_INIT(&uiter);
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr,
+					time_seq, tmp, *roots, NULL, 0, 0);
+		if (ret < 0 && ret != -ENOENT) {
+			ulist_free(tmp);
+			ulist_free(*roots);
+			return ret;
+		}
+		node = ulist_next(tmp, &uiter);
+		if (!node)
+			break;
+		bytenr = node->val;
+		cond_resched();
+	}
+
+	ulist_free(tmp);
+	return 0;
+}
+
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 time_seq, struct ulist **roots)
+{
+	int ret;
+
+	if (!trans)
+		down_read(&fs_info->commit_root_sem);
+	ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+	if (!trans)
+		up_read(&fs_info->commit_root_sem);
+	return ret;
+}
+
+/**
+ * btrfs_check_shared - tell us whether an extent is shared
+ *
+ * @trans: optional trans handle
+ *
+ * btrfs_check_shared uses the backref walking code but will short
+ * circuit as soon as it finds a root or inode that doesn't match the
+ * one passed in. This provides a significant performance benefit for
+ * callers (such as fiemap) which want to know whether the extent is
+ * shared but do not need a ref count.
+ *
+ * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
+ */
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr)
+{
+	struct ulist *tmp = NULL;
+	struct ulist *roots = NULL;
+	struct ulist_iterator uiter;
+	struct ulist_node *node;
+	struct seq_list elem = SEQ_LIST_INIT(elem);
+	int ret = 0;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	roots = ulist_alloc(GFP_NOFS);
+	if (!tmp || !roots) {
+		ulist_free(tmp);
+		ulist_free(roots);
+		return -ENOMEM;
+	}
+
+	if (trans)
+		btrfs_get_tree_mod_seq(fs_info, &elem);
+	else
+		down_read(&fs_info->commit_root_sem);
+	ULIST_ITER_INIT(&uiter);
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+					roots, NULL, root_objectid, inum);
+		if (ret == BACKREF_FOUND_SHARED) {
+			/* this is the only condition under which we return 1 */
+			ret = 1;
+			break;
+		}
+		if (ret < 0 && ret != -ENOENT)
+			break;
+		ret = 0;
+		node = ulist_next(tmp, &uiter);
+		if (!node)
+			break;
+		bytenr = node->val;
+		cond_resched();
+	}
+	if (trans)
+		btrfs_put_tree_mod_seq(fs_info, &elem);
+	else
+		up_read(&fs_info->commit_root_sem);
+	ulist_free(tmp);
+	ulist_free(roots);
+	return ret;
+}
+
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off)
+{
+	int ret, slot;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = start_off;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * If the item at offset is not found,
+			 * btrfs_search_slot will point us to the slot
+			 * where it should be inserted. In our case
+			 * that will be the slot directly before the
+			 * next INODE_REF_KEY_V2 item. In the case
+			 * that we're pointing to the last slot in a
+			 * leaf, we must move one leaf over.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret) {
+				if (ret >= 1)
+					ret = -ENOENT;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/*
+		 * Check that we're still looking at an extended ref key for
+		 * this particular objectid. If we have different
+		 * objectid or type then there are no more to be found
+		 * in the tree and we can exit.
+		 */
+		ret = -ENOENT;
+		if (found_key.objectid != inode_objectid)
+			break;
+		if (found_key.type != BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		ret = 0;
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		extref = (struct btrfs_inode_extref *)ptr;
+		*ret_extref = extref;
+		if (found_off)
+			*found_off = found_key.offset;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * this iterates to turn a name (from iref/extref) into a full filesystem path.
+ * Elements of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			u32 name_len, unsigned long name_off,
+			struct extent_buffer *eb_in, u64 parent,
+			char *dest, u32 size)
+{
+	int slot;
+	u64 next_inum;
+	int ret;
+	s64 bytes_left = ((s64)size) - 1;
+	struct extent_buffer *eb = eb_in;
+	struct btrfs_key found_key;
+	int leave_spinning = path->leave_spinning;
+	struct btrfs_inode_ref *iref;
+
+	if (bytes_left >= 0)
+		dest[bytes_left] = '\0';
+
+	path->leave_spinning = 1;
+	while (1) {
+		bytes_left -= name_len;
+		if (bytes_left >= 0)
+			read_extent_buffer(eb, dest + bytes_left,
+					   name_off, name_len);
+		if (eb != eb_in) {
+			btrfs_tree_read_unlock_blocking(eb);
+			free_extent_buffer(eb);
+		}
+		ret = btrfs_find_item(fs_root, path, parent, 0,
+				BTRFS_INODE_REF_KEY, &found_key);
+		if (ret > 0)
+			ret = -ENOENT;
+		if (ret)
+			break;
+
+		next_inum = found_key.offset;
+
+		/* regular exit ahead */
+		if (parent == next_inum)
+			break;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		if (eb != eb_in) {
+			atomic_inc(&eb->refs);
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		}
+		btrfs_release_path(path);
+		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		name_len = btrfs_inode_ref_name_len(eb, iref);
+		name_off = (unsigned long)(iref + 1);
+
+		parent = next_inum;
+		--bytes_left;
+		if (bytes_left >= 0)
+			dest[bytes_left] = '/';
+	}
+
+	btrfs_release_path(path);
+	path->leave_spinning = leave_spinning;
+
+	if (ret)
+		return ERR_PTR(ret);
+
+	return dest + bytes_left;
+}
+
+/*
+ * this makes the path point to (logical EXTENT_ITEM *)
+ * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
+ * tree blocks and <0 on error.
+ */
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags_ret)
+{
+	int ret;
+	u64 flags;
+	u64 size = 0;
+	u32 item_size;
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct btrfs_key key;
+
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.objectid = logical;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		return ret;
+	}
+	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
+	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+		size = fs_info->extent_root->nodesize;
+	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+		size = found_key->offset;
+
+	if (found_key->objectid > logical ||
+	    found_key->objectid + size <= logical) {
+		pr_debug("logical %llu is not within any extent\n", logical);
+		return -ENOENT;
+	}
+
+	eb = path->nodes[0];
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+	BUG_ON(item_size < sizeof(*ei));
+
+	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+	flags = btrfs_extent_flags(eb, ei);
+
+	pr_debug("logical %llu is at position %llu within the extent (%llu "
+		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
+		 logical, logical - found_key->objectid, found_key->objectid,
+		 found_key->offset, flags, item_size);
+
+	WARN_ON(!flags_ret);
+	if (flags_ret) {
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+		else if (flags & BTRFS_EXTENT_FLAG_DATA)
+			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
+		else
+			BUG_ON(1);
+		return 0;
+	}
+
+	return -EIO;
+}
+
+/*
+ * helper function to iterate extent inline refs. ptr must point to a 0 value
+ * for the first call and may be modified. it is used to track state.
+ * if more refs exist, 0 is returned and the next call to
+ * __get_extent_inline_ref must pass the modified ptr parameter to get the
+ * next ref. after the last ref was processed, 1 is returned.
+ * returns <0 on error
+ */
+static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
+				   struct btrfs_key *key,
+				   struct btrfs_extent_item *ei, u32 item_size,
+				   struct btrfs_extent_inline_ref **out_eiref,
+				   int *out_type)
+{
+	unsigned long end;
+	u64 flags;
+	struct btrfs_tree_block_info *info;
+
+	if (!*ptr) {
+		/* first call */
+		flags = btrfs_extent_flags(eb, ei);
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+			if (key->type == BTRFS_METADATA_ITEM_KEY) {
+				/* a skinny metadata extent */
+				*out_eiref =
+				     (struct btrfs_extent_inline_ref *)(ei + 1);
+			} else {
+				WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+				info = (struct btrfs_tree_block_info *)(ei + 1);
+				*out_eiref =
+				   (struct btrfs_extent_inline_ref *)(info + 1);
+			}
+		} else {
+			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
+		}
+		*ptr = (unsigned long)*out_eiref;
+		if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
+			return -ENOENT;
+	}
+
+	end = (unsigned long)ei + item_size;
+	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
+	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
+
+	*ptr += btrfs_extent_inline_ref_size(*out_type);
+	WARN_ON(*ptr > end);
+	if (*ptr == end)
+		return 1; /* last */
+
+	return 0;
+}
+
+/*
+ * reads the tree block backref for an extent. tree level and root are returned
+ * through out_level and out_root. ptr must point to a 0 value for the first
+ * call and may be modified (see __get_extent_inline_ref comment).
+ * returns 0 if data was provided, 1 if there was no more data to provide or
+ * <0 on error.
+ */
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+			    struct btrfs_key *key, struct btrfs_extent_item *ei,
+			    u32 item_size, u64 *out_root, u8 *out_level)
+{
+	int ret;
+	int type;
+	struct btrfs_extent_inline_ref *eiref;
+
+	if (*ptr == (unsigned long)-1)
+		return 1;
+
+	while (1) {
+		ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
+					      &eiref, &type);
+		if (ret < 0)
+			return ret;
+
+		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+		    type == BTRFS_SHARED_BLOCK_REF_KEY)
+			break;
+
+		if (ret == 1)
+			return 1;
+	}
+
+	/* we can treat both ref types equally here */
+	*out_root = btrfs_extent_inline_ref_offset(eb, eiref);
+
+	if (key->type == BTRFS_EXTENT_ITEM_KEY) {
+		struct btrfs_tree_block_info *info;
+
+		info = (struct btrfs_tree_block_info *)(ei + 1);
+		*out_level = btrfs_tree_block_level(eb, info);
+	} else {
+		ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
+		*out_level = (u8)key->offset;
+	}
+
+	if (ret == 1)
+		*ptr = (unsigned long)-1;
+
+	return 0;
+}
+
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+				u64 root, u64 extent_item_objectid,
+				iterate_extent_inodes_t *iterate, void *ctx)
+{
+	struct extent_inode_elem *eie;
+	int ret = 0;
+
+	for (eie = inode_list; eie; eie = eie->next) {
+		pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+			 "root %llu\n", extent_item_objectid,
+			 eie->inum, eie->offset, root);
+		ret = iterate(eie->inum, eie->offset, root, ctx);
+		if (ret) {
+			pr_debug("stopping iteration for %llu due to ret=%d\n",
+				 extent_item_objectid, ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * calls iterate() for every inode that references the extent identified by
+ * the given parameters.
+ * when the iterator function returns a non-zero value, iteration stops.
+ */
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+				u64 extent_item_objectid, u64 extent_item_pos,
+				int search_commit_root,
+				iterate_extent_inodes_t *iterate, void *ctx)
+{
+	int ret;
+	struct btrfs_trans_handle *trans = NULL;
+	struct ulist *refs = NULL;
+	struct ulist *roots = NULL;
+	struct ulist_node *ref_node = NULL;
+	struct ulist_node *root_node = NULL;
+	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+	struct ulist_iterator ref_uiter;
+	struct ulist_iterator root_uiter;
+
+	pr_debug("resolving all inodes for extent %llu\n",
+			extent_item_objectid);
+
+	if (!search_commit_root) {
+		trans = btrfs_join_transaction(fs_info->extent_root);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+	} else {
+		down_read(&fs_info->commit_root_sem);
+	}
+
+	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+				   tree_mod_seq_elem.seq, &refs,
+				   &extent_item_pos);
+	if (ret)
+		goto out;
+
+	ULIST_ITER_INIT(&ref_uiter);
+	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+		ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
+					     tree_mod_seq_elem.seq, &roots);
+		if (ret)
+			break;
+		ULIST_ITER_INIT(&root_uiter);
+		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+			pr_debug("root %llu references leaf %llu, data list "
+				 "%#llx\n", root_node->val, ref_node->val,
+				 ref_node->aux);
+			ret = iterate_leaf_refs((struct extent_inode_elem *)
+						(uintptr_t)ref_node->aux,
+						root_node->val,
+						extent_item_objectid,
+						iterate, ctx);
+		}
+		ulist_free(roots);
+	}
+
+	free_leaf_list(refs);
+out:
+	if (!search_commit_root) {
+		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+		btrfs_end_transaction(trans, fs_info->extent_root);
+	} else {
+		up_read(&fs_info->commit_root_sem);
+	}
+
+	return ret;
+}
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+				struct btrfs_path *path,
+				iterate_extent_inodes_t *iterate, void *ctx)
+{
+	int ret;
+	u64 extent_item_pos;
+	u64 flags = 0;
+	struct btrfs_key found_key;
+	int search_commit_root = path->search_commit_root;
+
+	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
+	btrfs_release_path(path);
+	if (ret < 0)
+		return ret;
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		return -EINVAL;
+
+	extent_item_pos = logical - found_key.objectid;
+	ret = iterate_extent_inodes(fs_info, found_key.objectid,
+					extent_item_pos, search_commit_root,
+					iterate, ctx);
+
+	return ret;
+}
+
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+			      struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+			      struct btrfs_path *path,
+			      iterate_irefs_t *iterate, void *ctx)
+{
+	int ret = 0;
+	int slot;
+	u32 cur;
+	u32 len;
+	u32 name_len;
+	u64 parent = 0;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_item *item;
+	struct btrfs_inode_ref *iref;
+	struct btrfs_key found_key;
+
+	while (!ret) {
+		ret = btrfs_find_item(fs_root, path, inum,
+				parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
+				&found_key);
+
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		parent = found_key.offset;
+		slot = path->slots[0];
+		eb = btrfs_clone_extent_buffer(path->nodes[0]);
+		if (!eb) {
+			ret = -ENOMEM;
+			break;
+		}
+		extent_buffer_get(eb);
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		item = btrfs_item_nr(slot);
+		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+			name_len = btrfs_inode_ref_name_len(eb, iref);
+			/* path must be released before calling iterate()! */
+			pr_debug("following ref at offset %u for inode %llu in "
+				 "tree %llu\n", cur, found_key.objectid,
+				 fs_root->objectid);
+			ret = iterate(parent, name_len,
+				      (unsigned long)(iref + 1), eb, ctx);
+			if (ret)
+				break;
+			len = sizeof(*iref) + name_len;
+			iref = (struct btrfs_inode_ref *)((char *)iref + len);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+				 struct btrfs_path *path,
+				 iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u64 offset = 0;
+	u64 parent;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u32 item_size;
+	u32 cur_offset;
+	unsigned long ptr;
+
+	while (1) {
+		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+					    &offset);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		slot = path->slots[0];
+		eb = btrfs_clone_extent_buffer(path->nodes[0]);
+		if (!eb) {
+			ret = -ENOMEM;
+			break;
+		}
+		extent_buffer_get(eb);
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, slot);
+		ptr = btrfs_item_ptr_offset(leaf, slot);
+		cur_offset = 0;
+
+		while (cur_offset < item_size) {
+			u32 name_len;
+
+			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			ret = iterate(parent, name_len,
+				      (unsigned long)&extref->name, eb, ctx);
+			if (ret)
+				break;
+
+			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += sizeof(*extref);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+
+		offset++;
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+			 struct btrfs_path *path, iterate_irefs_t *iterate,
+			 void *ctx)
+{
+	int ret;
+	int found_refs = 0;
+
+	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+	if (!ret)
+		++found_refs;
+	else if (ret != -ENOENT)
+		return ret;
+
+	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+	if (ret == -ENOENT && found_refs)
+		return 0;
+
+	return ret;
+}
+
+/*
+ * returns 0 if the path could be dumped (probably truncated)
+ * returns <0 in case of an error
+ */
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+			 struct extent_buffer *eb, void *ctx)
+{
+	struct inode_fs_paths *ipath = ctx;
+	char *fspath;
+	char *fspath_min;
+	int i = ipath->fspath->elem_cnt;
+	const int s_ptr = sizeof(char *);
+	u32 bytes_left;
+
+	bytes_left = ipath->fspath->bytes_left > s_ptr ?
+					ipath->fspath->bytes_left - s_ptr : 0;
+
+	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
+	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+				   name_off, eb, inum, fspath_min, bytes_left);
+	if (IS_ERR(fspath))
+		return PTR_ERR(fspath);
+
+	if (fspath > fspath_min) {
+		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
+		++ipath->fspath->elem_cnt;
+		ipath->fspath->bytes_left = fspath - fspath_min;
+	} else {
+		++ipath->fspath->elem_missed;
+		ipath->fspath->bytes_missing += fspath_min - fspath;
+		ipath->fspath->bytes_left = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * this dumps all file system paths to the inode into the ipath struct, provided
+ * is has been created large enough. each path is zero-terminated and accessed
+ * from ipath->fspath->val[i].
+ * when it returns, there are ipath->fspath->elem_cnt number of paths available
+ * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
+ * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
+ * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
+ * have been needed to return all paths.
+ */
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
+{
+	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
+			     inode_to_path, ipath);
+}
+
+struct btrfs_data_container *init_data_container(u32 total_bytes)
+{
+	struct btrfs_data_container *data;
+	size_t alloc_bytes;
+
+	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
+	data = vmalloc(alloc_bytes);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	if (total_bytes >= sizeof(*data)) {
+		data->bytes_left = total_bytes - sizeof(*data);
+		data->bytes_missing = 0;
+	} else {
+		data->bytes_missing = sizeof(*data) - total_bytes;
+		data->bytes_left = 0;
+	}
+
+	data->elem_cnt = 0;
+	data->elem_missed = 0;
+
+	return data;
+}
+
+/*
+ * allocates space to return multiple file system paths for an inode.
+ * total_bytes to allocate are passed, note that space usable for actual path
+ * information will be total_bytes - sizeof(struct inode_fs_paths).
+ * the returned pointer must be freed with free_ipath() in the end.
+ */
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+					struct btrfs_path *path)
+{
+	struct inode_fs_paths *ifp;
+	struct btrfs_data_container *fspath;
+
+	fspath = init_data_container(total_bytes);
+	if (IS_ERR(fspath))
+		return (void *)fspath;
+
+	ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
+	if (!ifp) {
+		kfree(fspath);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ifp->btrfs_path = path;
+	ifp->fspath = fspath;
+	ifp->fs_root = fs_root;
+
+	return ifp;
+}
+
+void free_ipath(struct inode_fs_paths *ipath)
+{
+	if (!ipath)
+		return;
+	vfree(ipath->fspath);
+	kfree(ipath);
+}
diff --git a/kernel/fs/btrfs/backref.h b/kernel/fs/btrfs/backref.h
new file mode 100644
index 000000000..9c41fbac3
--- /dev/null
+++ b/kernel/fs/btrfs/backref.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_BACKREF__
+#define __BTRFS_BACKREF__
+
+#include <linux/btrfs.h>
+#include "ulist.h"
+#include "extent_io.h"
+
+struct inode_fs_paths {
+	struct btrfs_path		*btrfs_path;
+	struct btrfs_root		*fs_root;
+	struct btrfs_data_container	*fspath;
+};
+
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
+		void *ctx);
+
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags);
+
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+			    struct btrfs_key *key, struct btrfs_extent_item *ei,
+			    u32 item_size, u64 *out_root, u8 *out_level);
+
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+				u64 extent_item_objectid,
+				u64 extent_offset, int search_commit_root,
+				iterate_extent_inodes_t *iterate, void *ctx);
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+				struct btrfs_path *path,
+				iterate_extent_inodes_t *iterate, void *ctx);
+
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 time_seq, struct ulist **roots);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			u32 name_len, unsigned long name_off,
+			struct extent_buffer *eb_in, u64 parent,
+			char *dest, u32 size);
+
+struct btrfs_data_container *init_data_container(u32 total_bytes);
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+					struct btrfs_path *path);
+void free_ipath(struct inode_fs_paths *ipath);
+
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr);
+
+int __init btrfs_prelim_ref_init(void);
+void btrfs_prelim_ref_exit(void);
+#endif
diff --git a/kernel/fs/btrfs/btrfs_inode.h b/kernel/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000..0ef5cc13f
--- /dev/null
+++ b/kernel/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,328 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_I__
+#define __BTRFS_I__
+
+#include <linux/hash.h>
+#include "extent_map.h"
+#include "extent_io.h"
+#include "ordered-data.h"
+#include "delayed-inode.h"
+
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero.  When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE		0
+#define BTRFS_INODE_ORPHAN_META_RESERVED	1
+#define BTRFS_INODE_DUMMY			2
+#define BTRFS_INODE_IN_DEFRAG			3
+#define BTRFS_INODE_DELALLOC_META_RESERVED	4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
+#define BTRFS_INODE_HAS_ASYNC_EXTENT		6
+#define BTRFS_INODE_NEEDS_FULL_SYNC		7
+#define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
+#define BTRFS_INODE_READDIO_NEED_LOCK		10
+#define BTRFS_INODE_HAS_PROPS		        11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR		        12
+#define BTRFS_INODE_BTREE_LOG1_ERR		13
+#define BTRFS_INODE_BTREE_LOG2_ERR		14
+
+/* in memory btrfs inode */
+struct btrfs_inode {
+	/* which subvolume this inode belongs to */
+	struct btrfs_root *root;
+
+	/* key used to find this inode on disk.  This is used by the code
+	 * to read in roots of subvolumes
+	 */
+	struct btrfs_key location;
+
+	/*
+	 * Lock for counters and all fields used to determine if the inode is in
+	 * the log or not (last_trans, last_sub_trans, last_log_commit,
+	 * logged_trans).
+	 */
+	spinlock_t lock;
+
+	/* the extent_tree has caches of all the extent mappings to disk */
+	struct extent_map_tree extent_tree;
+
+	/* the io_tree does range state (DIRTY, LOCKED etc) */
+	struct extent_io_tree io_tree;
+
+	/* special utility tree used to record which mirrors have already been
+	 * tried when checksums fail for a given block
+	 */
+	struct extent_io_tree io_failure_tree;
+
+	/* held while logging the inode in tree-log.c */
+	struct mutex log_mutex;
+
+	/* held while doing delalloc reservations */
+	struct mutex delalloc_mutex;
+
+	/* used to order data wrt metadata */
+	struct btrfs_ordered_inode_tree ordered_tree;
+
+	/* list of all the delalloc inodes in the FS.  There are times we need
+	 * to write all the delalloc pages to disk, and this list is used
+	 * to walk them all.
+	 */
+	struct list_head delalloc_inodes;
+
+	/* node for the red-black tree that links inodes in subvolume root */
+	struct rb_node rb_node;
+
+	unsigned long runtime_flags;
+
+	/* Keep track of who's O_SYNC/fsyncing currently */
+	atomic_t sync_writers;
+
+	/* full 64 bit generation number, struct vfs_inode doesn't have a big
+	 * enough field for this.
+	 */
+	u64 generation;
+
+	/*
+	 * transid of the trans_handle that last modified this inode
+	 */
+	u64 last_trans;
+
+	/*
+	 * transid that last logged this inode
+	 */
+	u64 logged_trans;
+
+	/*
+	 * log transid when this inode was last modified
+	 */
+	int last_sub_trans;
+
+	/* a local copy of root's last_log_commit */
+	int last_log_commit;
+
+	/* total number of bytes pending delalloc, used by stat to calc the
+	 * real block usage of the file
+	 */
+	u64 delalloc_bytes;
+
+	/*
+	 * total number of bytes pending defrag, used by stat to check whether
+	 * it needs COW.
+	 */
+	u64 defrag_bytes;
+
+	/*
+	 * the size of the file stored in the metadata on disk.  data=ordered
+	 * means the in-memory i_size might be larger than the size on disk
+	 * because not all the blocks are written yet.
+	 */
+	u64 disk_i_size;
+
+	/*
+	 * if this is a directory then index_cnt is the counter for the index
+	 * number for new files that are created
+	 */
+	u64 index_cnt;
+
+	/* Cache the directory index number to speed the dir/file remove */
+	u64 dir_index;
+
+	/* the fsync log has some corner cases that mean we have to check
+	 * directories to see if any unlinks have been done before
+	 * the directory was logged.  See tree-log.c for all the
+	 * details
+	 */
+	u64 last_unlink_trans;
+
+	/*
+	 * Number of bytes outstanding that are going to need csums.  This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
+	 * Counters to keep track of the number of extent item's we may use due
+	 * to delalloc and such.  outstanding_extents is the number of extent
+	 * items we think we'll end up using, and reserved_extents is the number
+	 * of extent items we've reserved metadata for.
+	 */
+	unsigned outstanding_extents;
+	unsigned reserved_extents;
+
+	/*
+	 * always compress this one file
+	 */
+	unsigned force_compress;
+
+	struct btrfs_delayed_node *delayed_node;
+
+	/* File creation time. */
+	struct timespec i_otime;
+
+	struct inode vfs_inode;
+};
+
+extern unsigned char btrfs_filetype_table[];
+
+static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+{
+	return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+
+static inline unsigned long btrfs_inode_hash(u64 objectid,
+					     const struct btrfs_root *root)
+{
+	u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+
+#if BITS_PER_LONG == 32
+	h = (h >> 32) ^ (h & 0xffffffff);
+#endif
+
+	return (unsigned long)h;
+}
+
+static inline void btrfs_insert_inode_hash(struct inode *inode)
+{
+	unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
+
+	__insert_inode_hash(inode, h);
+}
+
+static inline u64 btrfs_ino(struct inode *inode)
+{
+	u64 ino = BTRFS_I(inode)->location.objectid;
+
+	/*
+	 * !ino: btree_inode
+	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
+	 */
+	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
+		ino = inode->i_ino;
+	return ino;
+}
+
+static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+{
+	i_size_write(inode, size);
+	BTRFS_I(inode)->disk_i_size = size;
+}
+
+static inline bool btrfs_is_free_space_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (root == root->fs_info->tree_root &&
+	    btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
+		return true;
+	if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+		return true;
+	return false;
+}
+
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+	int ret = 0;
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	if (BTRFS_I(inode)->logged_trans == generation &&
+	    BTRFS_I(inode)->last_sub_trans <=
+	    BTRFS_I(inode)->last_log_commit &&
+	    BTRFS_I(inode)->last_sub_trans <=
+	    BTRFS_I(inode)->root->last_log_commit) {
+		/*
+		 * After a ranged fsync we might have left some extent maps
+		 * (that fall outside the fsync's range). So return false
+		 * here if the list isn't empty, to make sure btrfs_log_inode()
+		 * will be called and process those extent maps.
+		 */
+		smp_mb();
+		if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+			ret = 1;
+	}
+	spin_unlock(&BTRFS_I(inode)->lock);
+	return ret;
+}
+
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	unsigned long flags;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	void *private;
+
+	/* number of bios pending for this dio */
+	atomic_t pending_bios;
+
+	/* IO errors */
+	int errors;
+
+	/* orig_bio is our btrfs_io_bio */
+	struct bio *orig_bio;
+
+	/* dio_bio came from fs/direct-io.c */
+	struct bio *dio_bio;
+
+	/*
+	 * The original bio may be splited to several sub-bios, this is
+	 * done during endio of sub-bios
+	 */
+	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
+};
+
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_atomic();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+
+#endif
diff --git a/kernel/fs/btrfs/check-integrity.c b/kernel/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000..ce7dec88f
--- /dev/null
+++ b/kernel/fs/btrfs/check-integrity.c
@@ -0,0 +1,3245 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause that the data on disk is
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ *    currently referenced by the super block (either directly
+ *    or indirectly).
+ * 2. When a super block is written, it is verified that all
+ *    referenced (directly or indirectly) blocks fulfill the
+ *    following requirements:
+ *    2a. All referenced blocks have either been present when
+ *        the file system was mounted, (i.e., they have been
+ *        referenced by the super block) or they have been
+ *        written since then and the write completion callback
+ *        was called and no write error was indicated and a
+ *        FLUSH request to the device where these blocks are
+ *        located was received and completed.
+ *    2b. All referenced blocks need to have a generation
+ *        number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ *
+ * Expect millions of lines of information in the kernel log with an
+ * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
+ * kernel config to at least 26 (which is 64MB). Usually the value is
+ * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
+ * changed like this before LOG_BUF_SHIFT can be set to a high value:
+ * config LOG_BUF_SHIFT
+ *       int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
+ *       range 12 30
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)	/* in characters,
+							 * excluding " [...]" */
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE			0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION		0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE			0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE			0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH			0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH			0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE				0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE				0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE				0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES			0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE			0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES				0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS		0x00001000
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE		0x00002000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+	u32 magic_num;		/* only used for debug purposes */
+	unsigned int is_metadata:1;	/* if it is meta-data, not data-data */
+	unsigned int is_superblock:1;	/* if it is one of the superblocks */
+	unsigned int is_iodone:1;	/* if is done by lower subsystem */
+	unsigned int iodone_w_error:1;	/* error was indicated to endio */
+	unsigned int never_written:1;	/* block was added because it was
+					 * referenced, not because it was
+					 * written */
+	unsigned int mirror_num;	/* large enough to hold
+					 * BTRFS_SUPER_MIRROR_MAX */
+	struct btrfsic_dev_state *dev_state;
+	u64 dev_bytenr;		/* key, physical byte num on disk */
+	u64 logical_bytenr;	/* logical byte num on disk */
+	u64 generation;
+	struct btrfs_disk_key disk_key;	/* extra info to print in case of
+					 * issues, will not always be correct */
+	struct list_head collision_resolving_node;	/* list node */
+	struct list_head all_blocks_node;	/* list node */
+
+	/* the following two lists contain block_link items */
+	struct list_head ref_to_list;	/* list */
+	struct list_head ref_from_list;	/* list */
+	struct btrfsic_block *next_in_same_bio;
+	void *orig_bio_bh_private;
+	union {
+		bio_end_io_t *bio;
+		bh_end_io_t *bh;
+	} orig_bio_bh_end_io;
+	int submit_bio_bh_rw;
+	u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be ref from multiple blocks.
+ * The key to lookup them in the hashtable is the dev_bytenr of
+ * the block ref to plus the one from the block refered from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+	u32 magic_num;		/* only used for debug purposes */
+	u32 ref_cnt;
+	struct list_head node_ref_to;	/* list node */
+	struct list_head node_ref_from;	/* list node */
+	struct list_head collision_resolving_node;	/* list node */
+	struct btrfsic_block *block_ref_to;
+	struct btrfsic_block *block_ref_from;
+	u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+	u32 magic_num;		/* only used for debug purposes */
+	struct block_device *bdev;
+	struct btrfsic_state *state;
+	struct list_head collision_resolving_node;	/* list node */
+	struct btrfsic_block dummy_block_for_bio_bh_flush;
+	u64 last_flush_gen;
+	char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+	struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+	struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+	struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+	u64 start;		/* virtual bytenr */
+	u64 dev_bytenr;		/* physical bytenr on device */
+	u32 len;
+	struct btrfsic_dev_state *dev;
+	char **datav;
+	struct page **pagev;
+	void *mem_to_free;
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+	u32 magic;
+	u32 nr;
+	int error;
+	int i;
+	int limit_nesting;
+	int num_copies;
+	int mirror_num;
+	struct btrfsic_block *block;
+	struct btrfsic_block_data_ctx *block_ctx;
+	struct btrfsic_block *next_block;
+	struct btrfsic_block_data_ctx next_block_ctx;
+	struct btrfs_header *hdr;
+	struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+	u32 print_mask;
+	int include_extent_data;
+	int csum_size;
+	struct list_head all_blocks_list;
+	struct btrfsic_block_hashtable block_hashtable;
+	struct btrfsic_block_link_hashtable block_link_hashtable;
+	struct btrfs_root *root;
+	u64 max_superblock_generation;
+	struct btrfsic_block *latest_superblock;
+	u32 metablock_size;
+	u32 datablock_size;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+					struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+		struct block_device *bdev,
+		u64 dev_bytenr,
+		struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+		struct btrfsic_block_link *l,
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+		struct block_device *bdev_ref_to,
+		u64 dev_bytenr_ref_to,
+		struct block_device *bdev_ref_from,
+		u64 dev_bytenr_ref_from,
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+		struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+		struct btrfsic_dev_state *ds,
+		struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+		struct block_device *bdev,
+		struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+				      struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+				     struct btrfsic_block *block,
+				     struct btrfsic_block_data_ctx *block_ctx,
+				     int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+	struct btrfsic_block_data_ctx *block_ctx,
+	void *dst, u32 offset, size_t len);
+static int btrfsic_create_link_to_next_block(
+		struct btrfsic_state *state,
+		struct btrfsic_block *block,
+		struct btrfsic_block_data_ctx
+		*block_ctx, u64 next_bytenr,
+		int limit_nesting,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block **next_blockp,
+		int force_iodone_flag,
+		int *num_copiesp, int *mirror_nump,
+		struct btrfs_disk_key *disk_key,
+		u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+				      struct btrfsic_block *block,
+				      struct btrfsic_block_data_ctx *block_ctx,
+				      u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+			     struct btrfsic_block_data_ctx *block_ctx_out,
+			     int mirror_num);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+			      struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+				     char **datav, unsigned int num_pages);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+					  u64 dev_bytenr, char **mapped_datav,
+					  unsigned int num_pages,
+					  struct bio *bio, int *bio_is_patched,
+					  struct buffer_head *bh,
+					  int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+		struct btrfsic_state *state,
+		struct btrfsic_block *const block,
+		struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+					      const struct btrfsic_block *block,
+					      int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+					struct btrfsic_block *const block,
+					int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+				   const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+				  const struct btrfsic_block *block,
+				  int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block *next_block,
+		struct btrfsic_block *from_block,
+		u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *block_ctx,
+		const char *additional_string,
+		int is_metadata,
+		int is_iodone,
+		int never_written,
+		int mirror_num,
+		int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+		struct btrfsic_state *state,
+		struct btrfsic_dev_state *dev_state,
+		struct btrfs_device *device,
+		int superblock_mirror_num,
+		struct btrfsic_dev_state **selected_dev_state,
+		struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+		struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+					   u64 bytenr,
+					   struct btrfsic_dev_state *dev_state,
+					   u64 dev_bytenr);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+	b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+	b->dev_state = NULL;
+	b->dev_bytenr = 0;
+	b->logical_bytenr = 0;
+	b->generation = BTRFSIC_GENERATION_UNKNOWN;
+	b->disk_key.objectid = 0;
+	b->disk_key.type = 0;
+	b->disk_key.offset = 0;
+	b->is_metadata = 0;
+	b->is_superblock = 0;
+	b->is_iodone = 0;
+	b->iodone_w_error = 0;
+	b->never_written = 0;
+	b->mirror_num = 0;
+	b->next_in_same_bio = NULL;
+	b->orig_bio_bh_private = NULL;
+	b->orig_bio_bh_end_io.bio = NULL;
+	INIT_LIST_HEAD(&b->collision_resolving_node);
+	INIT_LIST_HEAD(&b->all_blocks_node);
+	INIT_LIST_HEAD(&b->ref_to_list);
+	INIT_LIST_HEAD(&b->ref_from_list);
+	b->submit_bio_bh_rw = 0;
+	b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+	struct btrfsic_block *b;
+
+	b = kzalloc(sizeof(*b), GFP_NOFS);
+	if (NULL != b)
+		btrfsic_block_init(b);
+
+	return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+	BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+	kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+	l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+	l->ref_cnt = 1;
+	INIT_LIST_HEAD(&l->node_ref_to);
+	INIT_LIST_HEAD(&l->node_ref_from);
+	INIT_LIST_HEAD(&l->collision_resolving_node);
+	l->block_ref_to = NULL;
+	l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+	struct btrfsic_block_link *l;
+
+	l = kzalloc(sizeof(*l), GFP_NOFS);
+	if (NULL != l)
+		btrfsic_block_link_init(l);
+
+	return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+	BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+	kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+	ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+	ds->bdev = NULL;
+	ds->state = NULL;
+	ds->name[0] = '\0';
+	INIT_LIST_HEAD(&ds->collision_resolving_node);
+	ds->last_flush_gen = 0;
+	btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+	ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+	ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+	struct btrfsic_dev_state *ds;
+
+	ds = kzalloc(sizeof(*ds), GFP_NOFS);
+	if (NULL != ds)
+		btrfsic_dev_state_init(ds);
+
+	return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+	BUG_ON(!(NULL == ds ||
+		 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+	kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+	int i;
+
+	for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+		INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+					struct btrfsic_block_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)(b->dev_bytenr >> 16)) ^
+	     ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+	list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+	list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+		struct block_device *bdev,
+		u64 dev_bytenr,
+		struct btrfsic_block_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)(dev_bytenr >> 16)) ^
+	     ((unsigned int)((uintptr_t)bdev))) &
+	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+	struct list_head *elem;
+
+	list_for_each(elem, h->table + hashval) {
+		struct btrfsic_block *const b =
+		    list_entry(elem, struct btrfsic_block,
+			       collision_resolving_node);
+
+		if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+			return b;
+	}
+
+	return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+		struct btrfsic_block_link_hashtable *h)
+{
+	int i;
+
+	for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+		INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+		struct btrfsic_block_link *l,
+		struct btrfsic_block_link_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+	     ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+	     ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+	     ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+	     & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+	BUG_ON(NULL == l->block_ref_to);
+	BUG_ON(NULL == l->block_ref_from);
+	list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+	list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+		struct block_device *bdev_ref_to,
+		u64 dev_bytenr_ref_to,
+		struct block_device *bdev_ref_from,
+		u64 dev_bytenr_ref_from,
+		struct btrfsic_block_link_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+	     ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+	     ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+	     ((unsigned int)((uintptr_t)bdev_ref_from))) &
+	     (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+	struct list_head *elem;
+
+	list_for_each(elem, h->table + hashval) {
+		struct btrfsic_block_link *const l =
+		    list_entry(elem, struct btrfsic_block_link,
+			       collision_resolving_node);
+
+		BUG_ON(NULL == l->block_ref_to);
+		BUG_ON(NULL == l->block_ref_from);
+		if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+		    l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+		    l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+		    l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+			return l;
+	}
+
+	return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+		struct btrfsic_dev_state_hashtable *h)
+{
+	int i;
+
+	for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+		INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+		struct btrfsic_dev_state *ds,
+		struct btrfsic_dev_state_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)((uintptr_t)ds->bdev)) &
+	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+	list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+	list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+		struct block_device *bdev,
+		struct btrfsic_dev_state_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)((uintptr_t)bdev)) &
+	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+	struct list_head *elem;
+
+	list_for_each(elem, h->table + hashval) {
+		struct btrfsic_dev_state *const ds =
+		    list_entry(elem, struct btrfsic_dev_state,
+			       collision_resolving_node);
+
+		if (ds->bdev == bdev)
+			return ds;
+	}
+
+	return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+				      struct btrfs_fs_devices *fs_devices)
+{
+	int ret = 0;
+	struct btrfs_super_block *selected_super;
+	struct list_head *dev_head = &fs_devices->devices;
+	struct btrfs_device *device;
+	struct btrfsic_dev_state *selected_dev_state = NULL;
+	int pass;
+
+	BUG_ON(NULL == state);
+	selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
+	if (NULL == selected_super) {
+		printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+		return -1;
+	}
+
+	list_for_each_entry(device, dev_head, dev_list) {
+		int i;
+		struct btrfsic_dev_state *dev_state;
+
+		if (!device->bdev || !device->name)
+			continue;
+
+		dev_state = btrfsic_dev_state_lookup(device->bdev);
+		BUG_ON(NULL == dev_state);
+		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+			ret = btrfsic_process_superblock_dev_mirror(
+					state, dev_state, device, i,
+					&selected_dev_state, selected_super);
+			if (0 != ret && 0 == i) {
+				kfree(selected_super);
+				return ret;
+			}
+		}
+	}
+
+	if (NULL == state->latest_superblock) {
+		printk(KERN_INFO "btrfsic: no superblock found!\n");
+		kfree(selected_super);
+		return -1;
+	}
+
+	state->csum_size = btrfs_super_csum_size(selected_super);
+
+	for (pass = 0; pass < 3; pass++) {
+		int num_copies;
+		int mirror_num;
+		u64 next_bytenr;
+
+		switch (pass) {
+		case 0:
+			next_bytenr = btrfs_super_root(selected_super);
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "root@%llu\n", next_bytenr);
+			break;
+		case 1:
+			next_bytenr = btrfs_super_chunk_root(selected_super);
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "chunk@%llu\n", next_bytenr);
+			break;
+		case 2:
+			next_bytenr = btrfs_super_log_root(selected_super);
+			if (0 == next_bytenr)
+				continue;
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "log@%llu\n", next_bytenr);
+			break;
+		}
+
+		num_copies =
+		    btrfs_num_copies(state->root->fs_info,
+				     next_bytenr, state->metablock_size);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       next_bytenr, num_copies);
+
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			struct btrfsic_block *next_block;
+			struct btrfsic_block_data_ctx tmp_next_block_ctx;
+			struct btrfsic_block_link *l;
+
+			ret = btrfsic_map_block(state, next_bytenr,
+						state->metablock_size,
+						&tmp_next_block_ctx,
+						mirror_num);
+			if (ret) {
+				printk(KERN_INFO "btrfsic:"
+				       " btrfsic_map_block(root @%llu,"
+				       " mirror %d) failed!\n",
+				       next_bytenr, mirror_num);
+				kfree(selected_super);
+				return -1;
+			}
+
+			next_block = btrfsic_block_hashtable_lookup(
+					tmp_next_block_ctx.dev->bdev,
+					tmp_next_block_ctx.dev_bytenr,
+					&state->block_hashtable);
+			BUG_ON(NULL == next_block);
+
+			l = btrfsic_block_link_hashtable_lookup(
+					tmp_next_block_ctx.dev->bdev,
+					tmp_next_block_ctx.dev_bytenr,
+					state->latest_superblock->dev_state->
+					bdev,
+					state->latest_superblock->dev_bytenr,
+					&state->block_link_hashtable);
+			BUG_ON(NULL == l);
+
+			ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+			if (ret < (int)PAGE_CACHE_SIZE) {
+				printk(KERN_INFO
+				       "btrfsic: read @logical %llu failed!\n",
+				       tmp_next_block_ctx.start);
+				btrfsic_release_block_ctx(&tmp_next_block_ctx);
+				kfree(selected_super);
+				return -1;
+			}
+
+			ret = btrfsic_process_metablock(state,
+							next_block,
+							&tmp_next_block_ctx,
+							BTRFS_MAX_LEVEL + 3, 1);
+			btrfsic_release_block_ctx(&tmp_next_block_ctx);
+		}
+	}
+
+	kfree(selected_super);
+	return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+		struct btrfsic_state *state,
+		struct btrfsic_dev_state *dev_state,
+		struct btrfs_device *device,
+		int superblock_mirror_num,
+		struct btrfsic_dev_state **selected_dev_state,
+		struct btrfs_super_block *selected_super)
+{
+	struct btrfs_super_block *super_tmp;
+	u64 dev_bytenr;
+	struct buffer_head *bh;
+	struct btrfsic_block *superblock_tmp;
+	int pass;
+	struct block_device *const superblock_bdev = device->bdev;
+
+	/* super block bytenr is always the unmapped device bytenr */
+	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
+		return -1;
+	bh = __bread(superblock_bdev, dev_bytenr / 4096,
+		     BTRFS_SUPER_INFO_SIZE);
+	if (NULL == bh)
+		return -1;
+	super_tmp = (struct btrfs_super_block *)
+	    (bh->b_data + (dev_bytenr & 4095));
+
+	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
+	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
+		brelse(bh);
+		return 0;
+	}
+
+	superblock_tmp =
+	    btrfsic_block_hashtable_lookup(superblock_bdev,
+					   dev_bytenr,
+					   &state->block_hashtable);
+	if (NULL == superblock_tmp) {
+		superblock_tmp = btrfsic_block_alloc();
+		if (NULL == superblock_tmp) {
+			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+			brelse(bh);
+			return -1;
+		}
+		/* for superblock, only the dev_bytenr makes sense */
+		superblock_tmp->dev_bytenr = dev_bytenr;
+		superblock_tmp->dev_state = dev_state;
+		superblock_tmp->logical_bytenr = dev_bytenr;
+		superblock_tmp->generation = btrfs_super_generation(super_tmp);
+		superblock_tmp->is_metadata = 1;
+		superblock_tmp->is_superblock = 1;
+		superblock_tmp->is_iodone = 1;
+		superblock_tmp->never_written = 0;
+		superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+			printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
+				     " @%llu (%s/%llu/%d)\n",
+				     superblock_bdev,
+				     rcu_str_deref(device->name), dev_bytenr,
+				     dev_state->name, dev_bytenr,
+				     superblock_mirror_num);
+		list_add(&superblock_tmp->all_blocks_node,
+			 &state->all_blocks_list);
+		btrfsic_block_hashtable_add(superblock_tmp,
+					    &state->block_hashtable);
+	}
+
+	/* select the one with the highest generation field */
+	if (btrfs_super_generation(super_tmp) >
+	    state->max_superblock_generation ||
+	    0 == state->max_superblock_generation) {
+		memcpy(selected_super, super_tmp, sizeof(*selected_super));
+		*selected_dev_state = dev_state;
+		state->max_superblock_generation =
+		    btrfs_super_generation(super_tmp);
+		state->latest_superblock = superblock_tmp;
+	}
+
+	for (pass = 0; pass < 3; pass++) {
+		u64 next_bytenr;
+		int num_copies;
+		int mirror_num;
+		const char *additional_string = NULL;
+		struct btrfs_disk_key tmp_disk_key;
+
+		tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+		tmp_disk_key.offset = 0;
+		switch (pass) {
+		case 0:
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_ROOT_TREE_OBJECTID);
+			additional_string = "initial root ";
+			next_bytenr = btrfs_super_root(super_tmp);
+			break;
+		case 1:
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_CHUNK_TREE_OBJECTID);
+			additional_string = "initial chunk ";
+			next_bytenr = btrfs_super_chunk_root(super_tmp);
+			break;
+		case 2:
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_TREE_LOG_OBJECTID);
+			additional_string = "initial log ";
+			next_bytenr = btrfs_super_log_root(super_tmp);
+			if (0 == next_bytenr)
+				continue;
+			break;
+		}
+
+		num_copies =
+		    btrfs_num_copies(state->root->fs_info,
+				     next_bytenr, state->metablock_size);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       next_bytenr, num_copies);
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			struct btrfsic_block *next_block;
+			struct btrfsic_block_data_ctx tmp_next_block_ctx;
+			struct btrfsic_block_link *l;
+
+			if (btrfsic_map_block(state, next_bytenr,
+					      state->metablock_size,
+					      &tmp_next_block_ctx,
+					      mirror_num)) {
+				printk(KERN_INFO "btrfsic: btrfsic_map_block("
+				       "bytenr @%llu, mirror %d) failed!\n",
+				       next_bytenr, mirror_num);
+				brelse(bh);
+				return -1;
+			}
+
+			next_block = btrfsic_block_lookup_or_add(
+					state, &tmp_next_block_ctx,
+					additional_string, 1, 1, 0,
+					mirror_num, NULL);
+			if (NULL == next_block) {
+				btrfsic_release_block_ctx(&tmp_next_block_ctx);
+				brelse(bh);
+				return -1;
+			}
+
+			next_block->disk_key = tmp_disk_key;
+			next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+			l = btrfsic_block_link_lookup_or_add(
+					state, &tmp_next_block_ctx,
+					next_block, superblock_tmp,
+					BTRFSIC_GENERATION_UNKNOWN);
+			btrfsic_release_block_ctx(&tmp_next_block_ctx);
+			if (NULL == l) {
+				brelse(bh);
+				return -1;
+			}
+		}
+	}
+	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+		btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+	brelse(bh);
+	return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+	struct btrfsic_stack_frame *sf;
+
+	sf = kzalloc(sizeof(*sf), GFP_NOFS);
+	if (NULL == sf)
+		printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+	else
+		sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+	return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+	BUG_ON(!(NULL == sf ||
+		 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+	kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+		struct btrfsic_state *state,
+		struct btrfsic_block *const first_block,
+		struct btrfsic_block_data_ctx *const first_block_ctx,
+		int first_limit_nesting, int force_iodone_flag)
+{
+	struct btrfsic_stack_frame initial_stack_frame = { 0 };
+	struct btrfsic_stack_frame *sf;
+	struct btrfsic_stack_frame *next_stack;
+	struct btrfs_header *const first_hdr =
+		(struct btrfs_header *)first_block_ctx->datav[0];
+
+	BUG_ON(!first_hdr);
+	sf = &initial_stack_frame;
+	sf->error = 0;
+	sf->i = -1;
+	sf->limit_nesting = first_limit_nesting;
+	sf->block = first_block;
+	sf->block_ctx = first_block_ctx;
+	sf->next_block = NULL;
+	sf->hdr = first_hdr;
+	sf->prev = NULL;
+
+continue_with_new_stack_frame:
+	sf->block->generation = le64_to_cpu(sf->hdr->generation);
+	if (0 == sf->hdr->level) {
+		struct btrfs_leaf *const leafhdr =
+		    (struct btrfs_leaf *)sf->hdr;
+
+		if (-1 == sf->i) {
+			sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "leaf %llu items %d generation %llu"
+				       " owner %llu\n",
+				       sf->block_ctx->start, sf->nr,
+				       btrfs_stack_header_generation(
+					       &leafhdr->header),
+				       btrfs_stack_header_owner(
+					       &leafhdr->header));
+		}
+
+continue_with_current_leaf_stack_frame:
+		if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+			sf->i++;
+			sf->num_copies = 0;
+		}
+
+		if (sf->i < sf->nr) {
+			struct btrfs_item disk_item;
+			u32 disk_item_offset =
+				(uintptr_t)(leafhdr->items + sf->i) -
+				(uintptr_t)leafhdr;
+			struct btrfs_disk_key *disk_key;
+			u8 type;
+			u32 item_offset;
+			u32 item_size;
+
+			if (disk_item_offset + sizeof(struct btrfs_item) >
+			    sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+				printk(KERN_INFO
+				       "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+				       sf->block_ctx->start,
+				       sf->block_ctx->dev->name);
+				goto one_stack_frame_backwards;
+			}
+			btrfsic_read_from_block_data(sf->block_ctx,
+						     &disk_item,
+						     disk_item_offset,
+						     sizeof(struct btrfs_item));
+			item_offset = btrfs_stack_item_offset(&disk_item);
+			item_size = btrfs_stack_item_size(&disk_item);
+			disk_key = &disk_item.key;
+			type = btrfs_disk_key_type(disk_key);
+
+			if (BTRFS_ROOT_ITEM_KEY == type) {
+				struct btrfs_root_item root_item;
+				u32 root_item_offset;
+				u64 next_bytenr;
+
+				root_item_offset = item_offset +
+					offsetof(struct btrfs_leaf, items);
+				if (root_item_offset + item_size >
+				    sf->block_ctx->len)
+					goto leaf_item_out_of_bounce_error;
+				btrfsic_read_from_block_data(
+					sf->block_ctx, &root_item,
+					root_item_offset,
+					item_size);
+				next_bytenr = btrfs_root_bytenr(&root_item);
+
+				sf->error =
+				    btrfsic_create_link_to_next_block(
+						state,
+						sf->block,
+						sf->block_ctx,
+						next_bytenr,
+						sf->limit_nesting,
+						&sf->next_block_ctx,
+						&sf->next_block,
+						force_iodone_flag,
+						&sf->num_copies,
+						&sf->mirror_num,
+						disk_key,
+						btrfs_root_generation(
+						&root_item));
+				if (sf->error)
+					goto one_stack_frame_backwards;
+
+				if (NULL != sf->next_block) {
+					struct btrfs_header *const next_hdr =
+					    (struct btrfs_header *)
+					    sf->next_block_ctx.datav[0];
+
+					next_stack =
+					    btrfsic_stack_frame_alloc();
+					if (NULL == next_stack) {
+						sf->error = -1;
+						btrfsic_release_block_ctx(
+								&sf->
+								next_block_ctx);
+						goto one_stack_frame_backwards;
+					}
+
+					next_stack->i = -1;
+					next_stack->block = sf->next_block;
+					next_stack->block_ctx =
+					    &sf->next_block_ctx;
+					next_stack->next_block = NULL;
+					next_stack->hdr = next_hdr;
+					next_stack->limit_nesting =
+					    sf->limit_nesting - 1;
+					next_stack->prev = sf;
+					sf = next_stack;
+					goto continue_with_new_stack_frame;
+				}
+			} else if (BTRFS_EXTENT_DATA_KEY == type &&
+				   state->include_extent_data) {
+				sf->error = btrfsic_handle_extent_data(
+						state,
+						sf->block,
+						sf->block_ctx,
+						item_offset,
+						force_iodone_flag);
+				if (sf->error)
+					goto one_stack_frame_backwards;
+			}
+
+			goto continue_with_current_leaf_stack_frame;
+		}
+	} else {
+		struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+		if (-1 == sf->i) {
+			sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO "node %llu level %d items %d"
+				       " generation %llu owner %llu\n",
+				       sf->block_ctx->start,
+				       nodehdr->header.level, sf->nr,
+				       btrfs_stack_header_generation(
+				       &nodehdr->header),
+				       btrfs_stack_header_owner(
+				       &nodehdr->header));
+		}
+
+continue_with_current_node_stack_frame:
+		if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+			sf->i++;
+			sf->num_copies = 0;
+		}
+
+		if (sf->i < sf->nr) {
+			struct btrfs_key_ptr key_ptr;
+			u32 key_ptr_offset;
+			u64 next_bytenr;
+
+			key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+					  (uintptr_t)nodehdr;
+			if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+			    sf->block_ctx->len) {
+				printk(KERN_INFO
+				       "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+				       sf->block_ctx->start,
+				       sf->block_ctx->dev->name);
+				goto one_stack_frame_backwards;
+			}
+			btrfsic_read_from_block_data(
+				sf->block_ctx, &key_ptr, key_ptr_offset,
+				sizeof(struct btrfs_key_ptr));
+			next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
+
+			sf->error = btrfsic_create_link_to_next_block(
+					state,
+					sf->block,
+					sf->block_ctx,
+					next_bytenr,
+					sf->limit_nesting,
+					&sf->next_block_ctx,
+					&sf->next_block,
+					force_iodone_flag,
+					&sf->num_copies,
+					&sf->mirror_num,
+					&key_ptr.key,
+					btrfs_stack_key_generation(&key_ptr));
+			if (sf->error)
+				goto one_stack_frame_backwards;
+
+			if (NULL != sf->next_block) {
+				struct btrfs_header *const next_hdr =
+				    (struct btrfs_header *)
+				    sf->next_block_ctx.datav[0];
+
+				next_stack = btrfsic_stack_frame_alloc();
+				if (NULL == next_stack) {
+					sf->error = -1;
+					goto one_stack_frame_backwards;
+				}
+
+				next_stack->i = -1;
+				next_stack->block = sf->next_block;
+				next_stack->block_ctx = &sf->next_block_ctx;
+				next_stack->next_block = NULL;
+				next_stack->hdr = next_hdr;
+				next_stack->limit_nesting =
+				    sf->limit_nesting - 1;
+				next_stack->prev = sf;
+				sf = next_stack;
+				goto continue_with_new_stack_frame;
+			}
+
+			goto continue_with_current_node_stack_frame;
+		}
+	}
+
+one_stack_frame_backwards:
+	if (NULL != sf->prev) {
+		struct btrfsic_stack_frame *const prev = sf->prev;
+
+		/* the one for the initial block is freed in the caller */
+		btrfsic_release_block_ctx(sf->block_ctx);
+
+		if (sf->error) {
+			prev->error = sf->error;
+			btrfsic_stack_frame_free(sf);
+			sf = prev;
+			goto one_stack_frame_backwards;
+		}
+
+		btrfsic_stack_frame_free(sf);
+		sf = prev;
+		goto continue_with_new_stack_frame;
+	} else {
+		BUG_ON(&initial_stack_frame != sf);
+	}
+
+	return sf->error;
+}
+
+static void btrfsic_read_from_block_data(
+	struct btrfsic_block_data_ctx *block_ctx,
+	void *dstv, u32 offset, size_t len)
+{
+	size_t cur;
+	size_t offset_in_page;
+	char *kaddr;
+	char *dst = (char *)dstv;
+	size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(offset + len > block_ctx->len);
+	offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
+		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
+		kaddr = block_ctx->datav[i];
+		memcpy(dst, kaddr + offset_in_page, cur);
+
+		dst += cur;
+		len -= cur;
+		offset_in_page = 0;
+		i++;
+	}
+}
+
+static int btrfsic_create_link_to_next_block(
+		struct btrfsic_state *state,
+		struct btrfsic_block *block,
+		struct btrfsic_block_data_ctx *block_ctx,
+		u64 next_bytenr,
+		int limit_nesting,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block **next_blockp,
+		int force_iodone_flag,
+		int *num_copiesp, int *mirror_nump,
+		struct btrfs_disk_key *disk_key,
+		u64 parent_generation)
+{
+	struct btrfsic_block *next_block = NULL;
+	int ret;
+	struct btrfsic_block_link *l;
+	int did_alloc_block_link;
+	int block_was_created;
+
+	*next_blockp = NULL;
+	if (0 == *num_copiesp) {
+		*num_copiesp =
+		    btrfs_num_copies(state->root->fs_info,
+				     next_bytenr, state->metablock_size);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       next_bytenr, *num_copiesp);
+		*mirror_nump = 1;
+	}
+
+	if (*mirror_nump > *num_copiesp)
+		return 0;
+
+	if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+		printk(KERN_INFO
+		       "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+		       *mirror_nump);
+	ret = btrfsic_map_block(state, next_bytenr,
+				state->metablock_size,
+				next_block_ctx, *mirror_nump);
+	if (ret) {
+		printk(KERN_INFO
+		       "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+		       next_bytenr, *mirror_nump);
+		btrfsic_release_block_ctx(next_block_ctx);
+		*next_blockp = NULL;
+		return -1;
+	}
+
+	next_block = btrfsic_block_lookup_or_add(state,
+						 next_block_ctx, "referenced ",
+						 1, force_iodone_flag,
+						 !force_iodone_flag,
+						 *mirror_nump,
+						 &block_was_created);
+	if (NULL == next_block) {
+		btrfsic_release_block_ctx(next_block_ctx);
+		*next_blockp = NULL;
+		return -1;
+	}
+	if (block_was_created) {
+		l = NULL;
+		next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+	} else {
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+			if (next_block->logical_bytenr != next_bytenr &&
+			    !(!next_block->is_metadata &&
+			      0 == next_block->logical_bytenr))
+				printk(KERN_INFO
+				       "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+				       next_bytenr, next_block_ctx->dev->name,
+				       next_block_ctx->dev_bytenr, *mirror_nump,
+				       btrfsic_get_block_type(state,
+							      next_block),
+				       next_block->logical_bytenr);
+			else
+				printk(KERN_INFO
+				       "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+				       next_bytenr, next_block_ctx->dev->name,
+				       next_block_ctx->dev_bytenr, *mirror_nump,
+				       btrfsic_get_block_type(state,
+							      next_block));
+		}
+		next_block->logical_bytenr = next_bytenr;
+
+		next_block->mirror_num = *mirror_nump;
+		l = btrfsic_block_link_hashtable_lookup(
+				next_block_ctx->dev->bdev,
+				next_block_ctx->dev_bytenr,
+				block_ctx->dev->bdev,
+				block_ctx->dev_bytenr,
+				&state->block_link_hashtable);
+	}
+
+	next_block->disk_key = *disk_key;
+	if (NULL == l) {
+		l = btrfsic_block_link_alloc();
+		if (NULL == l) {
+			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+			btrfsic_release_block_ctx(next_block_ctx);
+			*next_blockp = NULL;
+			return -1;
+		}
+
+		did_alloc_block_link = 1;
+		l->block_ref_to = next_block;
+		l->block_ref_from = block;
+		l->ref_cnt = 1;
+		l->parent_generation = parent_generation;
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			btrfsic_print_add_link(state, l);
+
+		list_add(&l->node_ref_to, &block->ref_to_list);
+		list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+		btrfsic_block_link_hashtable_add(l,
+						 &state->block_link_hashtable);
+	} else {
+		did_alloc_block_link = 0;
+		if (0 == limit_nesting) {
+			l->ref_cnt++;
+			l->parent_generation = parent_generation;
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				btrfsic_print_add_link(state, l);
+		}
+	}
+
+	if (limit_nesting > 0 && did_alloc_block_link) {
+		ret = btrfsic_read_block(state, next_block_ctx);
+		if (ret < (int)next_block_ctx->len) {
+			printk(KERN_INFO
+			       "btrfsic: read block @logical %llu failed!\n",
+			       next_bytenr);
+			btrfsic_release_block_ctx(next_block_ctx);
+			*next_blockp = NULL;
+			return -1;
+		}
+
+		*next_blockp = next_block;
+	} else {
+		*next_blockp = NULL;
+	}
+	(*mirror_nump)++;
+
+	return 0;
+}
+
+static int btrfsic_handle_extent_data(
+		struct btrfsic_state *state,
+		struct btrfsic_block *block,
+		struct btrfsic_block_data_ctx *block_ctx,
+		u32 item_offset, int force_iodone_flag)
+{
+	int ret;
+	struct btrfs_file_extent_item file_extent_item;
+	u64 file_extent_item_offset;
+	u64 next_bytenr;
+	u64 num_bytes;
+	u64 generation;
+	struct btrfsic_block_link *l;
+
+	file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+				  item_offset;
+	if (file_extent_item_offset +
+	    offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+	    block_ctx->len) {
+		printk(KERN_INFO
+		       "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+		       block_ctx->start, block_ctx->dev->name);
+		return -1;
+	}
+
+	btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+		file_extent_item_offset,
+		offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+	if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+	    btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+			printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+			       file_extent_item.type,
+			       btrfs_stack_file_extent_disk_bytenr(
+			       &file_extent_item));
+		return 0;
+	}
+
+	if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+	    block_ctx->len) {
+		printk(KERN_INFO
+		       "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+		       block_ctx->start, block_ctx->dev->name);
+		return -1;
+	}
+	btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+				     file_extent_item_offset,
+				     sizeof(struct btrfs_file_extent_item));
+	next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
+	if (btrfs_stack_file_extent_compression(&file_extent_item) ==
+	    BTRFS_COMPRESS_NONE) {
+		next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
+		num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
+	} else {
+		num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
+	}
+	generation = btrfs_stack_file_extent_generation(&file_extent_item);
+
+	if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+		printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+		       " offset = %llu, num_bytes = %llu\n",
+		       file_extent_item.type,
+		       btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
+		       btrfs_stack_file_extent_offset(&file_extent_item),
+		       num_bytes);
+	while (num_bytes > 0) {
+		u32 chunk_len;
+		int num_copies;
+		int mirror_num;
+
+		if (num_bytes > state->datablock_size)
+			chunk_len = state->datablock_size;
+		else
+			chunk_len = num_bytes;
+
+		num_copies =
+		    btrfs_num_copies(state->root->fs_info,
+				     next_bytenr, state->datablock_size);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       next_bytenr, num_copies);
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			struct btrfsic_block_data_ctx next_block_ctx;
+			struct btrfsic_block *next_block;
+			int block_was_created;
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO "btrfsic_handle_extent_data("
+				       "mirror_num=%d)\n", mirror_num);
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+				printk(KERN_INFO
+				       "\tdisk_bytenr = %llu, num_bytes %u\n",
+				       next_bytenr, chunk_len);
+			ret = btrfsic_map_block(state, next_bytenr,
+						chunk_len, &next_block_ctx,
+						mirror_num);
+			if (ret) {
+				printk(KERN_INFO
+				       "btrfsic: btrfsic_map_block(@%llu,"
+				       " mirror=%d) failed!\n",
+				       next_bytenr, mirror_num);
+				return -1;
+			}
+
+			next_block = btrfsic_block_lookup_or_add(
+					state,
+					&next_block_ctx,
+					"referenced ",
+					0,
+					force_iodone_flag,
+					!force_iodone_flag,
+					mirror_num,
+					&block_was_created);
+			if (NULL == next_block) {
+				printk(KERN_INFO
+				       "btrfsic: error, kmalloc failed!\n");
+				btrfsic_release_block_ctx(&next_block_ctx);
+				return -1;
+			}
+			if (!block_was_created) {
+				if ((state->print_mask &
+				     BTRFSIC_PRINT_MASK_VERBOSE) &&
+				    next_block->logical_bytenr != next_bytenr &&
+				    !(!next_block->is_metadata &&
+				      0 == next_block->logical_bytenr)) {
+					printk(KERN_INFO
+					       "Referenced block"
+					       " @%llu (%s/%llu/%d)"
+					       " found in hash table, D,"
+					       " bytenr mismatch"
+					       " (!= stored %llu).\n",
+					       next_bytenr,
+					       next_block_ctx.dev->name,
+					       next_block_ctx.dev_bytenr,
+					       mirror_num,
+					       next_block->logical_bytenr);
+				}
+				next_block->logical_bytenr = next_bytenr;
+				next_block->mirror_num = mirror_num;
+			}
+
+			l = btrfsic_block_link_lookup_or_add(state,
+							     &next_block_ctx,
+							     next_block, block,
+							     generation);
+			btrfsic_release_block_ctx(&next_block_ctx);
+			if (NULL == l)
+				return -1;
+		}
+
+		next_bytenr += chunk_len;
+		num_bytes -= chunk_len;
+	}
+
+	return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+			     struct btrfsic_block_data_ctx *block_ctx_out,
+			     int mirror_num)
+{
+	int ret;
+	u64 length;
+	struct btrfs_bio *multi = NULL;
+	struct btrfs_device *device;
+
+	length = len;
+	ret = btrfs_map_block(state->root->fs_info, READ,
+			      bytenr, &length, &multi, mirror_num);
+
+	if (ret) {
+		block_ctx_out->start = 0;
+		block_ctx_out->dev_bytenr = 0;
+		block_ctx_out->len = 0;
+		block_ctx_out->dev = NULL;
+		block_ctx_out->datav = NULL;
+		block_ctx_out->pagev = NULL;
+		block_ctx_out->mem_to_free = NULL;
+
+		return ret;
+	}
+
+	device = multi->stripes[0].dev;
+	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+	block_ctx_out->start = bytenr;
+	block_ctx_out->len = len;
+	block_ctx_out->datav = NULL;
+	block_ctx_out->pagev = NULL;
+	block_ctx_out->mem_to_free = NULL;
+
+	kfree(multi);
+	if (NULL == block_ctx_out->dev) {
+		ret = -ENXIO;
+		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+	}
+
+	return ret;
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+	if (block_ctx->mem_to_free) {
+		unsigned int num_pages;
+
+		BUG_ON(!block_ctx->datav);
+		BUG_ON(!block_ctx->pagev);
+		num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+			    PAGE_CACHE_SHIFT;
+		while (num_pages > 0) {
+			num_pages--;
+			if (block_ctx->datav[num_pages]) {
+				kunmap(block_ctx->pagev[num_pages]);
+				block_ctx->datav[num_pages] = NULL;
+			}
+			if (block_ctx->pagev[num_pages]) {
+				__free_page(block_ctx->pagev[num_pages]);
+				block_ctx->pagev[num_pages] = NULL;
+			}
+		}
+
+		kfree(block_ctx->mem_to_free);
+		block_ctx->mem_to_free = NULL;
+		block_ctx->pagev = NULL;
+		block_ctx->datav = NULL;
+	}
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+			      struct btrfsic_block_data_ctx *block_ctx)
+{
+	unsigned int num_pages;
+	unsigned int i;
+	u64 dev_bytenr;
+	int ret;
+
+	BUG_ON(block_ctx->datav);
+	BUG_ON(block_ctx->pagev);
+	BUG_ON(block_ctx->mem_to_free);
+	if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
+		printk(KERN_INFO
+		       "btrfsic: read_block() with unaligned bytenr %llu\n",
+		       block_ctx->dev_bytenr);
+		return -1;
+	}
+
+	num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+		    PAGE_CACHE_SHIFT;
+	block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+					  sizeof(*block_ctx->pagev)) *
+					 num_pages, GFP_NOFS);
+	if (!block_ctx->mem_to_free)
+		return -1;
+	block_ctx->datav = block_ctx->mem_to_free;
+	block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+	for (i = 0; i < num_pages; i++) {
+		block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+		if (!block_ctx->pagev[i])
+			return -1;
+	}
+
+	dev_bytenr = block_ctx->dev_bytenr;
+	for (i = 0; i < num_pages;) {
+		struct bio *bio;
+		unsigned int j;
+
+		bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
+		if (!bio) {
+			printk(KERN_INFO
+			       "btrfsic: bio_alloc() for %u pages failed!\n",
+			       num_pages - i);
+			return -1;
+		}
+		bio->bi_bdev = block_ctx->dev->bdev;
+		bio->bi_iter.bi_sector = dev_bytenr >> 9;
+
+		for (j = i; j < num_pages; j++) {
+			ret = bio_add_page(bio, block_ctx->pagev[j],
+					   PAGE_CACHE_SIZE, 0);
+			if (PAGE_CACHE_SIZE != ret)
+				break;
+		}
+		if (j == i) {
+			printk(KERN_INFO
+			       "btrfsic: error, failed to add a single page!\n");
+			return -1;
+		}
+		if (submit_bio_wait(READ, bio)) {
+			printk(KERN_INFO
+			       "btrfsic: read error at logical %llu dev %s!\n",
+			       block_ctx->start, block_ctx->dev->name);
+			bio_put(bio);
+			return -1;
+		}
+		bio_put(bio);
+		dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+		i = j;
+	}
+	for (i = 0; i < num_pages; i++) {
+		block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+		if (!block_ctx->datav[i]) {
+			printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+			       block_ctx->dev->name);
+			return -1;
+		}
+	}
+
+	return block_ctx->len;
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+	struct list_head *elem_all;
+
+	BUG_ON(NULL == state);
+
+	printk(KERN_INFO "all_blocks_list:\n");
+	list_for_each(elem_all, &state->all_blocks_list) {
+		const struct btrfsic_block *const b_all =
+		    list_entry(elem_all, struct btrfsic_block,
+			       all_blocks_node);
+		struct list_head *elem_ref_to;
+		struct list_head *elem_ref_from;
+
+		printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+		       btrfsic_get_block_type(state, b_all),
+		       b_all->logical_bytenr, b_all->dev_state->name,
+		       b_all->dev_bytenr, b_all->mirror_num);
+
+		list_for_each(elem_ref_to, &b_all->ref_to_list) {
+			const struct btrfsic_block_link *const l =
+			    list_entry(elem_ref_to,
+				       struct btrfsic_block_link,
+				       node_ref_to);
+
+			printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+			       " refers %u* to"
+			       " %c @%llu (%s/%llu/%d)\n",
+			       btrfsic_get_block_type(state, b_all),
+			       b_all->logical_bytenr, b_all->dev_state->name,
+			       b_all->dev_bytenr, b_all->mirror_num,
+			       l->ref_cnt,
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+		}
+
+		list_for_each(elem_ref_from, &b_all->ref_from_list) {
+			const struct btrfsic_block_link *const l =
+			    list_entry(elem_ref_from,
+				       struct btrfsic_block_link,
+				       node_ref_from);
+
+			printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+			       " is ref %u* from"
+			       " %c @%llu (%s/%llu/%d)\n",
+			       btrfsic_get_block_type(state, b_all),
+			       b_all->logical_bytenr, b_all->dev_state->name,
+			       b_all->dev_bytenr, b_all->mirror_num,
+			       l->ref_cnt,
+			       btrfsic_get_block_type(state, l->block_ref_from),
+			       l->block_ref_from->logical_bytenr,
+			       l->block_ref_from->dev_state->name,
+			       l->block_ref_from->dev_bytenr,
+			       l->block_ref_from->mirror_num);
+		}
+
+		printk(KERN_INFO "\n");
+	}
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+				     char **datav, unsigned int num_pages)
+{
+	struct btrfs_header *h;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u32 crc = ~(u32)0;
+	unsigned int i;
+
+	if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+		return 1; /* not metadata */
+	num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+	h = (struct btrfs_header *)datav[0];
+
+	if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+		return 1;
+
+	for (i = 0; i < num_pages; i++) {
+		u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+		size_t sublen = i ? PAGE_CACHE_SIZE :
+				    (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
+
+		crc = btrfs_crc32c(crc, data, sublen);
+	}
+	btrfs_csum_final(crc, csum);
+	if (memcmp(csum, h->csum, state->csum_size))
+		return 1;
+
+	return 0; /* is metadata */
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+					  u64 dev_bytenr, char **mapped_datav,
+					  unsigned int num_pages,
+					  struct bio *bio, int *bio_is_patched,
+					  struct buffer_head *bh,
+					  int submit_bio_bh_rw)
+{
+	int is_metadata;
+	struct btrfsic_block *block;
+	struct btrfsic_block_data_ctx block_ctx;
+	int ret;
+	struct btrfsic_state *state = dev_state->state;
+	struct block_device *bdev = dev_state->bdev;
+	unsigned int processed_len;
+
+	if (NULL != bio_is_patched)
+		*bio_is_patched = 0;
+
+again:
+	if (num_pages == 0)
+		return;
+
+	processed_len = 0;
+	is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+						      num_pages));
+
+	block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+					       &state->block_hashtable);
+	if (NULL != block) {
+		u64 bytenr = 0;
+		struct list_head *elem_ref_to;
+		struct list_head *tmp_ref_to;
+
+		if (block->is_superblock) {
+			bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
+						    mapped_datav[0]);
+			if (num_pages * PAGE_CACHE_SIZE <
+			    BTRFS_SUPER_INFO_SIZE) {
+				printk(KERN_INFO
+				       "btrfsic: cannot work with too short bios!\n");
+				return;
+			}
+			is_metadata = 1;
+			BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+			processed_len = BTRFS_SUPER_INFO_SIZE;
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+				printk(KERN_INFO
+				       "[before new superblock is written]:\n");
+				btrfsic_dump_tree_sub(state, block, 0);
+			}
+		}
+		if (is_metadata) {
+			if (!block->is_superblock) {
+				if (num_pages * PAGE_CACHE_SIZE <
+				    state->metablock_size) {
+					printk(KERN_INFO
+					       "btrfsic: cannot work with too short bios!\n");
+					return;
+				}
+				processed_len = state->metablock_size;
+				bytenr = btrfs_stack_header_bytenr(
+						(struct btrfs_header *)
+						mapped_datav[0]);
+				btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+							       dev_state,
+							       dev_bytenr);
+			}
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+				if (block->logical_bytenr != bytenr &&
+				    !(!block->is_metadata &&
+				      block->logical_bytenr == 0))
+					printk(KERN_INFO
+					       "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+					       bytenr, dev_state->name,
+					       dev_bytenr,
+					       block->mirror_num,
+					       btrfsic_get_block_type(state,
+								      block),
+					       block->logical_bytenr);
+				else
+					printk(KERN_INFO
+					       "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+					       bytenr, dev_state->name,
+					       dev_bytenr, block->mirror_num,
+					       btrfsic_get_block_type(state,
+								      block));
+			}
+			block->logical_bytenr = bytenr;
+		} else {
+			if (num_pages * PAGE_CACHE_SIZE <
+			    state->datablock_size) {
+				printk(KERN_INFO
+				       "btrfsic: cannot work with too short bios!\n");
+				return;
+			}
+			processed_len = state->datablock_size;
+			bytenr = block->logical_bytenr;
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "Written block @%llu (%s/%llu/%d)"
+				       " found in hash table, %c.\n",
+				       bytenr, dev_state->name, dev_bytenr,
+				       block->mirror_num,
+				       btrfsic_get_block_type(state, block));
+		}
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "ref_to_list: %cE, ref_from_list: %cE\n",
+			       list_empty(&block->ref_to_list) ? ' ' : '!',
+			       list_empty(&block->ref_from_list) ? ' ' : '!');
+		if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+			printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+			       " @%llu (%s/%llu/%d), old(gen=%llu,"
+			       " objectid=%llu, type=%d, offset=%llu),"
+			       " new(gen=%llu),"
+			       " which is referenced by most recent superblock"
+			       " (superblockgen=%llu)!\n",
+			       btrfsic_get_block_type(state, block), bytenr,
+			       dev_state->name, dev_bytenr, block->mirror_num,
+			       block->generation,
+			       btrfs_disk_key_objectid(&block->disk_key),
+			       block->disk_key.type,
+			       btrfs_disk_key_offset(&block->disk_key),
+			       btrfs_stack_header_generation(
+				       (struct btrfs_header *) mapped_datav[0]),
+			       state->max_superblock_generation);
+			btrfsic_dump_tree(state);
+		}
+
+		if (!block->is_iodone && !block->never_written) {
+			printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+			       " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+			       " which is not yet iodone!\n",
+			       btrfsic_get_block_type(state, block), bytenr,
+			       dev_state->name, dev_bytenr, block->mirror_num,
+			       block->generation,
+			       btrfs_stack_header_generation(
+				       (struct btrfs_header *)
+				       mapped_datav[0]));
+			/* it would not be safe to go on */
+			btrfsic_dump_tree(state);
+			goto continue_loop;
+		}
+
+		/*
+		 * Clear all references of this block. Do not free
+		 * the block itself even if is not referenced anymore
+		 * because it still carries valueable information
+		 * like whether it was ever written and IO completed.
+		 */
+		list_for_each_safe(elem_ref_to, tmp_ref_to,
+				   &block->ref_to_list) {
+			struct btrfsic_block_link *const l =
+			    list_entry(elem_ref_to,
+				       struct btrfsic_block_link,
+				       node_ref_to);
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				btrfsic_print_rem_link(state, l);
+			l->ref_cnt--;
+			if (0 == l->ref_cnt) {
+				list_del(&l->node_ref_to);
+				list_del(&l->node_ref_from);
+				btrfsic_block_link_hashtable_remove(l);
+				btrfsic_block_link_free(l);
+			}
+		}
+
+		block_ctx.dev = dev_state;
+		block_ctx.dev_bytenr = dev_bytenr;
+		block_ctx.start = bytenr;
+		block_ctx.len = processed_len;
+		block_ctx.pagev = NULL;
+		block_ctx.mem_to_free = NULL;
+		block_ctx.datav = mapped_datav;
+
+		if (is_metadata || state->include_extent_data) {
+			block->never_written = 0;
+			block->iodone_w_error = 0;
+			if (NULL != bio) {
+				block->is_iodone = 0;
+				BUG_ON(NULL == bio_is_patched);
+				if (!*bio_is_patched) {
+					block->orig_bio_bh_private =
+					    bio->bi_private;
+					block->orig_bio_bh_end_io.bio =
+					    bio->bi_end_io;
+					block->next_in_same_bio = NULL;
+					bio->bi_private = block;
+					bio->bi_end_io = btrfsic_bio_end_io;
+					*bio_is_patched = 1;
+				} else {
+					struct btrfsic_block *chained_block =
+					    (struct btrfsic_block *)
+					    bio->bi_private;
+
+					BUG_ON(NULL == chained_block);
+					block->orig_bio_bh_private =
+					    chained_block->orig_bio_bh_private;
+					block->orig_bio_bh_end_io.bio =
+					    chained_block->orig_bio_bh_end_io.
+					    bio;
+					block->next_in_same_bio = chained_block;
+					bio->bi_private = block;
+				}
+			} else if (NULL != bh) {
+				block->is_iodone = 0;
+				block->orig_bio_bh_private = bh->b_private;
+				block->orig_bio_bh_end_io.bh = bh->b_end_io;
+				block->next_in_same_bio = NULL;
+				bh->b_private = block;
+				bh->b_end_io = btrfsic_bh_end_io;
+			} else {
+				block->is_iodone = 1;
+				block->orig_bio_bh_private = NULL;
+				block->orig_bio_bh_end_io.bio = NULL;
+				block->next_in_same_bio = NULL;
+			}
+		}
+
+		block->flush_gen = dev_state->last_flush_gen + 1;
+		block->submit_bio_bh_rw = submit_bio_bh_rw;
+		if (is_metadata) {
+			block->logical_bytenr = bytenr;
+			block->is_metadata = 1;
+			if (block->is_superblock) {
+				BUG_ON(PAGE_CACHE_SIZE !=
+				       BTRFS_SUPER_INFO_SIZE);
+				ret = btrfsic_process_written_superblock(
+						state,
+						block,
+						(struct btrfs_super_block *)
+						mapped_datav[0]);
+				if (state->print_mask &
+				    BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+					printk(KERN_INFO
+					"[after new superblock is written]:\n");
+					btrfsic_dump_tree_sub(state, block, 0);
+				}
+			} else {
+				block->mirror_num = 0;	/* unknown */
+				ret = btrfsic_process_metablock(
+						state,
+						block,
+						&block_ctx,
+						0, 0);
+			}
+			if (ret)
+				printk(KERN_INFO
+				       "btrfsic: btrfsic_process_metablock"
+				       "(root @%llu) failed!\n",
+				       dev_bytenr);
+		} else {
+			block->is_metadata = 0;
+			block->mirror_num = 0;	/* unknown */
+			block->generation = BTRFSIC_GENERATION_UNKNOWN;
+			if (!state->include_extent_data
+			    && list_empty(&block->ref_from_list)) {
+				/*
+				 * disk block is overwritten with extent
+				 * data (not meta data) and we are configured
+				 * to not include extent data: take the
+				 * chance and free the block's memory
+				 */
+				btrfsic_block_hashtable_remove(block);
+				list_del(&block->all_blocks_node);
+				btrfsic_block_free(block);
+			}
+		}
+		btrfsic_release_block_ctx(&block_ctx);
+	} else {
+		/* block has not been found in hash table */
+		u64 bytenr;
+
+		if (!is_metadata) {
+			processed_len = state->datablock_size;
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO "Written block (%s/%llu/?)"
+				       " !found in hash table, D.\n",
+				       dev_state->name, dev_bytenr);
+			if (!state->include_extent_data) {
+				/* ignore that written D block */
+				goto continue_loop;
+			}
+
+			/* this is getting ugly for the
+			 * include_extent_data case... */
+			bytenr = 0;	/* unknown */
+		} else {
+			processed_len = state->metablock_size;
+			bytenr = btrfs_stack_header_bytenr(
+					(struct btrfs_header *)
+					mapped_datav[0]);
+			btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+						       dev_bytenr);
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "Written block @%llu (%s/%llu/?)"
+				       " !found in hash table, M.\n",
+				       bytenr, dev_state->name, dev_bytenr);
+		}
+
+		block_ctx.dev = dev_state;
+		block_ctx.dev_bytenr = dev_bytenr;
+		block_ctx.start = bytenr;
+		block_ctx.len = processed_len;
+		block_ctx.pagev = NULL;
+		block_ctx.mem_to_free = NULL;
+		block_ctx.datav = mapped_datav;
+
+		block = btrfsic_block_alloc();
+		if (NULL == block) {
+			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+			btrfsic_release_block_ctx(&block_ctx);
+			goto continue_loop;
+		}
+		block->dev_state = dev_state;
+		block->dev_bytenr = dev_bytenr;
+		block->logical_bytenr = bytenr;
+		block->is_metadata = is_metadata;
+		block->never_written = 0;
+		block->iodone_w_error = 0;
+		block->mirror_num = 0;	/* unknown */
+		block->flush_gen = dev_state->last_flush_gen + 1;
+		block->submit_bio_bh_rw = submit_bio_bh_rw;
+		if (NULL != bio) {
+			block->is_iodone = 0;
+			BUG_ON(NULL == bio_is_patched);
+			if (!*bio_is_patched) {
+				block->orig_bio_bh_private = bio->bi_private;
+				block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+				block->next_in_same_bio = NULL;
+				bio->bi_private = block;
+				bio->bi_end_io = btrfsic_bio_end_io;
+				*bio_is_patched = 1;
+			} else {
+				struct btrfsic_block *chained_block =
+				    (struct btrfsic_block *)
+				    bio->bi_private;
+
+				BUG_ON(NULL == chained_block);
+				block->orig_bio_bh_private =
+				    chained_block->orig_bio_bh_private;
+				block->orig_bio_bh_end_io.bio =
+				    chained_block->orig_bio_bh_end_io.bio;
+				block->next_in_same_bio = chained_block;
+				bio->bi_private = block;
+			}
+		} else if (NULL != bh) {
+			block->is_iodone = 0;
+			block->orig_bio_bh_private = bh->b_private;
+			block->orig_bio_bh_end_io.bh = bh->b_end_io;
+			block->next_in_same_bio = NULL;
+			bh->b_private = block;
+			bh->b_end_io = btrfsic_bh_end_io;
+		} else {
+			block->is_iodone = 1;
+			block->orig_bio_bh_private = NULL;
+			block->orig_bio_bh_end_io.bio = NULL;
+			block->next_in_same_bio = NULL;
+		}
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "New written %c-block @%llu (%s/%llu/%d)\n",
+			       is_metadata ? 'M' : 'D',
+			       block->logical_bytenr, block->dev_state->name,
+			       block->dev_bytenr, block->mirror_num);
+		list_add(&block->all_blocks_node, &state->all_blocks_list);
+		btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+		if (is_metadata) {
+			ret = btrfsic_process_metablock(state, block,
+							&block_ctx, 0, 0);
+			if (ret)
+				printk(KERN_INFO
+				       "btrfsic: process_metablock(root @%llu)"
+				       " failed!\n",
+				       dev_bytenr);
+		}
+		btrfsic_release_block_ctx(&block_ctx);
+	}
+
+continue_loop:
+	BUG_ON(!processed_len);
+	dev_bytenr += processed_len;
+	mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+	num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+	goto again;
+}
+
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+	struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+	int iodone_w_error;
+
+	/* mutex is not held! This is not save if IO is not yet completed
+	 * on umount */
+	iodone_w_error = 0;
+	if (bio_error_status)
+		iodone_w_error = 1;
+
+	BUG_ON(NULL == block);
+	bp->bi_private = block->orig_bio_bh_private;
+	bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+	do {
+		struct btrfsic_block *next_block;
+		struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+		if ((dev_state->state->print_mask &
+		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+			printk(KERN_INFO
+			       "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+			       bio_error_status,
+			       btrfsic_get_block_type(dev_state->state, block),
+			       block->logical_bytenr, dev_state->name,
+			       block->dev_bytenr, block->mirror_num);
+		next_block = block->next_in_same_bio;
+		block->iodone_w_error = iodone_w_error;
+		if (block->submit_bio_bh_rw & REQ_FLUSH) {
+			dev_state->last_flush_gen++;
+			if ((dev_state->state->print_mask &
+			     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+				printk(KERN_INFO
+				       "bio_end_io() new %s flush_gen=%llu\n",
+				       dev_state->name,
+				       dev_state->last_flush_gen);
+		}
+		if (block->submit_bio_bh_rw & REQ_FUA)
+			block->flush_gen = 0; /* FUA completed means block is
+					       * on disk */
+		block->is_iodone = 1; /* for FLUSH, this releases the block */
+		block = next_block;
+	} while (NULL != block);
+
+	bp->bi_end_io(bp, bio_error_status);
+}
+
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+	struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+	int iodone_w_error = !uptodate;
+	struct btrfsic_dev_state *dev_state;
+
+	BUG_ON(NULL == block);
+	dev_state = block->dev_state;
+	if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+		printk(KERN_INFO
+		       "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+		       iodone_w_error,
+		       btrfsic_get_block_type(dev_state->state, block),
+		       block->logical_bytenr, block->dev_state->name,
+		       block->dev_bytenr, block->mirror_num);
+
+	block->iodone_w_error = iodone_w_error;
+	if (block->submit_bio_bh_rw & REQ_FLUSH) {
+		dev_state->last_flush_gen++;
+		if ((dev_state->state->print_mask &
+		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+			printk(KERN_INFO
+			       "bh_end_io() new %s flush_gen=%llu\n",
+			       dev_state->name, dev_state->last_flush_gen);
+	}
+	if (block->submit_bio_bh_rw & REQ_FUA)
+		block->flush_gen = 0; /* FUA completed means block is on disk */
+
+	bh->b_private = block->orig_bio_bh_private;
+	bh->b_end_io = block->orig_bio_bh_end_io.bh;
+	block->is_iodone = 1; /* for FLUSH, this releases the block */
+	bh->b_end_io(bh, uptodate);
+}
+
+static int btrfsic_process_written_superblock(
+		struct btrfsic_state *state,
+		struct btrfsic_block *const superblock,
+		struct btrfs_super_block *const super_hdr)
+{
+	int pass;
+
+	superblock->generation = btrfs_super_generation(super_hdr);
+	if (!(superblock->generation > state->max_superblock_generation ||
+	      0 == state->max_superblock_generation)) {
+		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+			printk(KERN_INFO
+			       "btrfsic: superblock @%llu (%s/%llu/%d)"
+			       " with old gen %llu <= %llu\n",
+			       superblock->logical_bytenr,
+			       superblock->dev_state->name,
+			       superblock->dev_bytenr, superblock->mirror_num,
+			       btrfs_super_generation(super_hdr),
+			       state->max_superblock_generation);
+	} else {
+		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+			printk(KERN_INFO
+			       "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+			       " with new gen %llu > %llu\n",
+			       superblock->logical_bytenr,
+			       superblock->dev_state->name,
+			       superblock->dev_bytenr, superblock->mirror_num,
+			       btrfs_super_generation(super_hdr),
+			       state->max_superblock_generation);
+
+		state->max_superblock_generation =
+		    btrfs_super_generation(super_hdr);
+		state->latest_superblock = superblock;
+	}
+
+	for (pass = 0; pass < 3; pass++) {
+		int ret;
+		u64 next_bytenr;
+		struct btrfsic_block *next_block;
+		struct btrfsic_block_data_ctx tmp_next_block_ctx;
+		struct btrfsic_block_link *l;
+		int num_copies;
+		int mirror_num;
+		const char *additional_string = NULL;
+		struct btrfs_disk_key tmp_disk_key = {0};
+
+		btrfs_set_disk_key_objectid(&tmp_disk_key,
+					    BTRFS_ROOT_ITEM_KEY);
+		btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
+
+		switch (pass) {
+		case 0:
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_ROOT_TREE_OBJECTID);
+			additional_string = "root ";
+			next_bytenr = btrfs_super_root(super_hdr);
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "root@%llu\n", next_bytenr);
+			break;
+		case 1:
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_CHUNK_TREE_OBJECTID);
+			additional_string = "chunk ";
+			next_bytenr = btrfs_super_chunk_root(super_hdr);
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "chunk@%llu\n", next_bytenr);
+			break;
+		case 2:
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_TREE_LOG_OBJECTID);
+			additional_string = "log ";
+			next_bytenr = btrfs_super_log_root(super_hdr);
+			if (0 == next_bytenr)
+				continue;
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "log@%llu\n", next_bytenr);
+			break;
+		}
+
+		num_copies =
+		    btrfs_num_copies(state->root->fs_info,
+				     next_bytenr, BTRFS_SUPER_INFO_SIZE);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       next_bytenr, num_copies);
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			int was_created;
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "btrfsic_process_written_superblock("
+				       "mirror_num=%d)\n", mirror_num);
+			ret = btrfsic_map_block(state, next_bytenr,
+						BTRFS_SUPER_INFO_SIZE,
+						&tmp_next_block_ctx,
+						mirror_num);
+			if (ret) {
+				printk(KERN_INFO
+				       "btrfsic: btrfsic_map_block(@%llu,"
+				       " mirror=%d) failed!\n",
+				       next_bytenr, mirror_num);
+				return -1;
+			}
+
+			next_block = btrfsic_block_lookup_or_add(
+					state,
+					&tmp_next_block_ctx,
+					additional_string,
+					1, 0, 1,
+					mirror_num,
+					&was_created);
+			if (NULL == next_block) {
+				printk(KERN_INFO
+				       "btrfsic: error, kmalloc failed!\n");
+				btrfsic_release_block_ctx(&tmp_next_block_ctx);
+				return -1;
+			}
+
+			next_block->disk_key = tmp_disk_key;
+			if (was_created)
+				next_block->generation =
+				    BTRFSIC_GENERATION_UNKNOWN;
+			l = btrfsic_block_link_lookup_or_add(
+					state,
+					&tmp_next_block_ctx,
+					next_block,
+					superblock,
+					BTRFSIC_GENERATION_UNKNOWN);
+			btrfsic_release_block_ctx(&tmp_next_block_ctx);
+			if (NULL == l)
+				return -1;
+		}
+	}
+
+	if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
+		btrfsic_dump_tree(state);
+
+	return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+					struct btrfsic_block *const block,
+					int recursion_level)
+{
+	struct list_head *elem_ref_to;
+	int ret = 0;
+
+	if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+		/*
+		 * Note that this situation can happen and does not
+		 * indicate an error in regular cases. It happens
+		 * when disk blocks are freed and later reused.
+		 * The check-integrity module is not aware of any
+		 * block free operations, it just recognizes block
+		 * write operations. Therefore it keeps the linkage
+		 * information for a block until a block is
+		 * rewritten. This can temporarily cause incorrect
+		 * and even circular linkage informations. This
+		 * causes no harm unless such blocks are referenced
+		 * by the most recent super block.
+		 */
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "btrfsic: abort cyclic linkage (case 1).\n");
+
+		return ret;
+	}
+
+	/*
+	 * This algorithm is recursive because the amount of used stack
+	 * space is very small and the max recursion depth is limited.
+	 */
+	list_for_each(elem_ref_to, &block->ref_to_list) {
+		const struct btrfsic_block_link *const l =
+		    list_entry(elem_ref_to, struct btrfsic_block_link,
+			       node_ref_to);
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "rl=%d, %c @%llu (%s/%llu/%d)"
+			       " %u* refers to %c @%llu (%s/%llu/%d)\n",
+			       recursion_level,
+			       btrfsic_get_block_type(state, block),
+			       block->logical_bytenr, block->dev_state->name,
+			       block->dev_bytenr, block->mirror_num,
+			       l->ref_cnt,
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+		if (l->block_ref_to->never_written) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which is never written!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+			ret = -1;
+		} else if (!l->block_ref_to->is_iodone) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which is not yet iodone!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+			ret = -1;
+		} else if (l->block_ref_to->iodone_w_error) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which has write error!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+			ret = -1;
+		} else if (l->parent_generation !=
+			   l->block_ref_to->generation &&
+			   BTRFSIC_GENERATION_UNKNOWN !=
+			   l->parent_generation &&
+			   BTRFSIC_GENERATION_UNKNOWN !=
+			   l->block_ref_to->generation) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " with generation %llu !="
+			       " parent generation %llu!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num,
+			       l->block_ref_to->generation,
+			       l->parent_generation);
+			ret = -1;
+		} else if (l->block_ref_to->flush_gen >
+			   l->block_ref_to->dev_state->last_flush_gen) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which is not flushed out of disk's write cache"
+			       " (block flush_gen=%llu,"
+			       " dev->flush_gen=%llu)!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num, block->flush_gen,
+			       l->block_ref_to->dev_state->last_flush_gen);
+			ret = -1;
+		} else if (-1 == btrfsic_check_all_ref_blocks(state,
+							      l->block_ref_to,
+							      recursion_level +
+							      1)) {
+			ret = -1;
+		}
+	}
+
+	return ret;
+}
+
+static int btrfsic_is_block_ref_by_superblock(
+		const struct btrfsic_state *state,
+		const struct btrfsic_block *block,
+		int recursion_level)
+{
+	struct list_head *elem_ref_from;
+
+	if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+		/* refer to comment at "abort cyclic linkage (case 1)" */
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "btrfsic: abort cyclic linkage (case 2).\n");
+
+		return 0;
+	}
+
+	/*
+	 * This algorithm is recursive because the amount of used stack space
+	 * is very small and the max recursion depth is limited.
+	 */
+	list_for_each(elem_ref_from, &block->ref_from_list) {
+		const struct btrfsic_block_link *const l =
+		    list_entry(elem_ref_from, struct btrfsic_block_link,
+			       node_ref_from);
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "rl=%d, %c @%llu (%s/%llu/%d)"
+			       " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+			       recursion_level,
+			       btrfsic_get_block_type(state, block),
+			       block->logical_bytenr, block->dev_state->name,
+			       block->dev_bytenr, block->mirror_num,
+			       l->ref_cnt,
+			       btrfsic_get_block_type(state, l->block_ref_from),
+			       l->block_ref_from->logical_bytenr,
+			       l->block_ref_from->dev_state->name,
+			       l->block_ref_from->dev_bytenr,
+			       l->block_ref_from->mirror_num);
+		if (l->block_ref_from->is_superblock &&
+		    state->latest_superblock->dev_bytenr ==
+		    l->block_ref_from->dev_bytenr &&
+		    state->latest_superblock->dev_state->bdev ==
+		    l->block_ref_from->dev_state->bdev)
+			return 1;
+		else if (btrfsic_is_block_ref_by_superblock(state,
+							    l->block_ref_from,
+							    recursion_level +
+							    1))
+			return 1;
+	}
+
+	return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l)
+{
+	printk(KERN_INFO
+	       "Add %u* link from %c @%llu (%s/%llu/%d)"
+	       " to %c @%llu (%s/%llu/%d).\n",
+	       l->ref_cnt,
+	       btrfsic_get_block_type(state, l->block_ref_from),
+	       l->block_ref_from->logical_bytenr,
+	       l->block_ref_from->dev_state->name,
+	       l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
+	       btrfsic_get_block_type(state, l->block_ref_to),
+	       l->block_ref_to->logical_bytenr,
+	       l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+	       l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l)
+{
+	printk(KERN_INFO
+	       "Rem %u* link from %c @%llu (%s/%llu/%d)"
+	       " to %c @%llu (%s/%llu/%d).\n",
+	       l->ref_cnt,
+	       btrfsic_get_block_type(state, l->block_ref_from),
+	       l->block_ref_from->logical_bytenr,
+	       l->block_ref_from->dev_state->name,
+	       l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
+	       btrfsic_get_block_type(state, l->block_ref_to),
+	       l->block_ref_to->logical_bytenr,
+	       l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+	       l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+				   const struct btrfsic_block *block)
+{
+	if (block->is_superblock &&
+	    state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+	    state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+		return 'S';
+	else if (block->is_superblock)
+		return 's';
+	else if (block->is_metadata)
+		return 'M';
+	else
+		return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+	btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+				  const struct btrfsic_block *block,
+				  int indent_level)
+{
+	struct list_head *elem_ref_to;
+	int indent_add;
+	static char buf[80];
+	int cursor_position;
+
+	/*
+	 * Should better fill an on-stack buffer with a complete line and
+	 * dump it at once when it is time to print a newline character.
+	 */
+
+	/*
+	 * This algorithm is recursive because the amount of used stack space
+	 * is very small and the max recursion depth is limited.
+	 */
+	indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+			     btrfsic_get_block_type(state, block),
+			     block->logical_bytenr, block->dev_state->name,
+			     block->dev_bytenr, block->mirror_num);
+	if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+		printk("[...]\n");
+		return;
+	}
+	printk(buf);
+	indent_level += indent_add;
+	if (list_empty(&block->ref_to_list)) {
+		printk("\n");
+		return;
+	}
+	if (block->mirror_num > 1 &&
+	    !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+		printk(" [...]\n");
+		return;
+	}
+
+	cursor_position = indent_level;
+	list_for_each(elem_ref_to, &block->ref_to_list) {
+		const struct btrfsic_block_link *const l =
+		    list_entry(elem_ref_to, struct btrfsic_block_link,
+			       node_ref_to);
+
+		while (cursor_position < indent_level) {
+			printk(" ");
+			cursor_position++;
+		}
+		if (l->ref_cnt > 1)
+			indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+		else
+			indent_add = sprintf(buf, " --> ");
+		if (indent_level + indent_add >
+		    BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+			printk("[...]\n");
+			cursor_position = 0;
+			continue;
+		}
+
+		printk(buf);
+
+		btrfsic_dump_tree_sub(state, l->block_ref_to,
+				      indent_level + indent_add);
+		cursor_position = 0;
+	}
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block *next_block,
+		struct btrfsic_block *from_block,
+		u64 parent_generation)
+{
+	struct btrfsic_block_link *l;
+
+	l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+						next_block_ctx->dev_bytenr,
+						from_block->dev_state->bdev,
+						from_block->dev_bytenr,
+						&state->block_link_hashtable);
+	if (NULL == l) {
+		l = btrfsic_block_link_alloc();
+		if (NULL == l) {
+			printk(KERN_INFO
+			       "btrfsic: error, kmalloc" " failed!\n");
+			return NULL;
+		}
+
+		l->block_ref_to = next_block;
+		l->block_ref_from = from_block;
+		l->ref_cnt = 1;
+		l->parent_generation = parent_generation;
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			btrfsic_print_add_link(state, l);
+
+		list_add(&l->node_ref_to, &from_block->ref_to_list);
+		list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+		btrfsic_block_link_hashtable_add(l,
+						 &state->block_link_hashtable);
+	} else {
+		l->ref_cnt++;
+		l->parent_generation = parent_generation;
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			btrfsic_print_add_link(state, l);
+	}
+
+	return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *block_ctx,
+		const char *additional_string,
+		int is_metadata,
+		int is_iodone,
+		int never_written,
+		int mirror_num,
+		int *was_created)
+{
+	struct btrfsic_block *block;
+
+	block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+					       block_ctx->dev_bytenr,
+					       &state->block_hashtable);
+	if (NULL == block) {
+		struct btrfsic_dev_state *dev_state;
+
+		block = btrfsic_block_alloc();
+		if (NULL == block) {
+			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+			return NULL;
+		}
+		dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+		if (NULL == dev_state) {
+			printk(KERN_INFO
+			       "btrfsic: error, lookup dev_state failed!\n");
+			btrfsic_block_free(block);
+			return NULL;
+		}
+		block->dev_state = dev_state;
+		block->dev_bytenr = block_ctx->dev_bytenr;
+		block->logical_bytenr = block_ctx->start;
+		block->is_metadata = is_metadata;
+		block->is_iodone = is_iodone;
+		block->never_written = never_written;
+		block->mirror_num = mirror_num;
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "New %s%c-block @%llu (%s/%llu/%d)\n",
+			       additional_string,
+			       btrfsic_get_block_type(state, block),
+			       block->logical_bytenr, dev_state->name,
+			       block->dev_bytenr, mirror_num);
+		list_add(&block->all_blocks_node, &state->all_blocks_list);
+		btrfsic_block_hashtable_add(block, &state->block_hashtable);
+		if (NULL != was_created)
+			*was_created = 1;
+	} else {
+		if (NULL != was_created)
+			*was_created = 0;
+	}
+
+	return block;
+}
+
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+					   u64 bytenr,
+					   struct btrfsic_dev_state *dev_state,
+					   u64 dev_bytenr)
+{
+	int num_copies;
+	int mirror_num;
+	int ret;
+	struct btrfsic_block_data_ctx block_ctx;
+	int match = 0;
+
+	num_copies = btrfs_num_copies(state->root->fs_info,
+				      bytenr, state->metablock_size);
+
+	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+		ret = btrfsic_map_block(state, bytenr, state->metablock_size,
+					&block_ctx, mirror_num);
+		if (ret) {
+			printk(KERN_INFO "btrfsic:"
+			       " btrfsic_map_block(logical @%llu,"
+			       " mirror %d) failed!\n",
+			       bytenr, mirror_num);
+			continue;
+		}
+
+		if (dev_state->bdev == block_ctx.dev->bdev &&
+		    dev_bytenr == block_ctx.dev_bytenr) {
+			match++;
+			btrfsic_release_block_ctx(&block_ctx);
+			break;
+		}
+		btrfsic_release_block_ctx(&block_ctx);
+	}
+
+	if (WARN_ON(!match)) {
+		printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+		       " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+		       " phys_bytenr=%llu)!\n",
+		       bytenr, dev_state->name, dev_bytenr);
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			ret = btrfsic_map_block(state, bytenr,
+						state->metablock_size,
+						&block_ctx, mirror_num);
+			if (ret)
+				continue;
+
+			printk(KERN_INFO "Read logical bytenr @%llu maps to"
+			       " (%s/%llu/%d)\n",
+			       bytenr, block_ctx.dev->name,
+			       block_ctx.dev_bytenr, mirror_num);
+		}
+	}
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+		struct block_device *bdev)
+{
+	struct btrfsic_dev_state *ds;
+
+	ds = btrfsic_dev_state_hashtable_lookup(bdev,
+						&btrfsic_dev_state_hashtable);
+	return ds;
+}
+
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+	struct btrfsic_dev_state *dev_state;
+
+	if (!btrfsic_is_initialized)
+		return submit_bh(rw, bh);
+
+	mutex_lock(&btrfsic_mutex);
+	/* since btrfsic_submit_bh() might also be called before
+	 * btrfsic_mount(), this might return NULL */
+	dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+	/* Only called to write the superblock (incl. FLUSH/FUA) */
+	if (NULL != dev_state &&
+	    (rw & WRITE) && bh->b_size > 0) {
+		u64 dev_bytenr;
+
+		dev_bytenr = 4096 * bh->b_blocknr;
+		if (dev_state->state->print_mask &
+		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+			printk(KERN_INFO
+			       "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu),"
+			       " size=%zu, data=%p, bdev=%p)\n",
+			       rw, (unsigned long long)bh->b_blocknr,
+			       dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
+		btrfsic_process_written_block(dev_state, dev_bytenr,
+					      &bh->b_data, 1, NULL,
+					      NULL, bh, rw);
+	} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+		if (dev_state->state->print_mask &
+		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+			printk(KERN_INFO
+			       "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
+			       rw, bh->b_bdev);
+		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+			if ((dev_state->state->print_mask &
+			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+			      BTRFSIC_PRINT_MASK_VERBOSE)))
+				printk(KERN_INFO
+				       "btrfsic_submit_bh(%s) with FLUSH"
+				       " but dummy block already in use"
+				       " (ignored)!\n",
+				       dev_state->name);
+		} else {
+			struct btrfsic_block *const block =
+				&dev_state->dummy_block_for_bio_bh_flush;
+
+			block->is_iodone = 0;
+			block->never_written = 0;
+			block->iodone_w_error = 0;
+			block->flush_gen = dev_state->last_flush_gen + 1;
+			block->submit_bio_bh_rw = rw;
+			block->orig_bio_bh_private = bh->b_private;
+			block->orig_bio_bh_end_io.bh = bh->b_end_io;
+			block->next_in_same_bio = NULL;
+			bh->b_private = block;
+			bh->b_end_io = btrfsic_bh_end_io;
+		}
+	}
+	mutex_unlock(&btrfsic_mutex);
+	return submit_bh(rw, bh);
+}
+
+static void __btrfsic_submit_bio(int rw, struct bio *bio)
+{
+	struct btrfsic_dev_state *dev_state;
+
+	if (!btrfsic_is_initialized)
+		return;
+
+	mutex_lock(&btrfsic_mutex);
+	/* since btrfsic_submit_bio() is also called before
+	 * btrfsic_mount(), this might return NULL */
+	dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+	if (NULL != dev_state &&
+	    (rw & WRITE) && NULL != bio->bi_io_vec) {
+		unsigned int i;
+		u64 dev_bytenr;
+		u64 cur_bytenr;
+		int bio_is_patched;
+		char **mapped_datav;
+
+		dev_bytenr = 512 * bio->bi_iter.bi_sector;
+		bio_is_patched = 0;
+		if (dev_state->state->print_mask &
+		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+			printk(KERN_INFO
+			       "submit_bio(rw=0x%x, bi_vcnt=%u,"
+			       " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+			       rw, bio->bi_vcnt,
+			       (unsigned long long)bio->bi_iter.bi_sector,
+			       dev_bytenr, bio->bi_bdev);
+
+		mapped_datav = kmalloc_array(bio->bi_vcnt,
+					     sizeof(*mapped_datav), GFP_NOFS);
+		if (!mapped_datav)
+			goto leave;
+		cur_bytenr = dev_bytenr;
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+			mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+			if (!mapped_datav[i]) {
+				while (i > 0) {
+					i--;
+					kunmap(bio->bi_io_vec[i].bv_page);
+				}
+				kfree(mapped_datav);
+				goto leave;
+			}
+			if (dev_state->state->print_mask &
+			    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+				printk(KERN_INFO
+				       "#%u: bytenr=%llu, len=%u, offset=%u\n",
+				       i, cur_bytenr, bio->bi_io_vec[i].bv_len,
+				       bio->bi_io_vec[i].bv_offset);
+			cur_bytenr += bio->bi_io_vec[i].bv_len;
+		}
+		btrfsic_process_written_block(dev_state, dev_bytenr,
+					      mapped_datav, bio->bi_vcnt,
+					      bio, &bio_is_patched,
+					      NULL, rw);
+		while (i > 0) {
+			i--;
+			kunmap(bio->bi_io_vec[i].bv_page);
+		}
+		kfree(mapped_datav);
+	} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+		if (dev_state->state->print_mask &
+		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+			printk(KERN_INFO
+			       "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
+			       rw, bio->bi_bdev);
+		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+			if ((dev_state->state->print_mask &
+			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+			      BTRFSIC_PRINT_MASK_VERBOSE)))
+				printk(KERN_INFO
+				       "btrfsic_submit_bio(%s) with FLUSH"
+				       " but dummy block already in use"
+				       " (ignored)!\n",
+				       dev_state->name);
+		} else {
+			struct btrfsic_block *const block =
+				&dev_state->dummy_block_for_bio_bh_flush;
+
+			block->is_iodone = 0;
+			block->never_written = 0;
+			block->iodone_w_error = 0;
+			block->flush_gen = dev_state->last_flush_gen + 1;
+			block->submit_bio_bh_rw = rw;
+			block->orig_bio_bh_private = bio->bi_private;
+			block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+			block->next_in_same_bio = NULL;
+			bio->bi_private = block;
+			bio->bi_end_io = btrfsic_bio_end_io;
+		}
+	}
+leave:
+	mutex_unlock(&btrfsic_mutex);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+	__btrfsic_submit_bio(rw, bio);
+	submit_bio(rw, bio);
+}
+
+int btrfsic_submit_bio_wait(int rw, struct bio *bio)
+{
+	__btrfsic_submit_bio(rw, bio);
+	return submit_bio_wait(rw, bio);
+}
+
+int btrfsic_mount(struct btrfs_root *root,
+		  struct btrfs_fs_devices *fs_devices,
+		  int including_extent_data, u32 print_mask)
+{
+	int ret;
+	struct btrfsic_state *state;
+	struct list_head *dev_head = &fs_devices->devices;
+	struct btrfs_device *device;
+
+	if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+		printk(KERN_INFO
+		       "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+		       root->nodesize, PAGE_CACHE_SIZE);
+		return -1;
+	}
+	if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+		printk(KERN_INFO
+		       "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+		       root->sectorsize, PAGE_CACHE_SIZE);
+		return -1;
+	}
+	state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!state) {
+		state = vzalloc(sizeof(*state));
+		if (!state) {
+			printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+			return -1;
+		}
+	}
+
+	if (!btrfsic_is_initialized) {
+		mutex_init(&btrfsic_mutex);
+		btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+		btrfsic_is_initialized = 1;
+	}
+	mutex_lock(&btrfsic_mutex);
+	state->root = root;
+	state->print_mask = print_mask;
+	state->include_extent_data = including_extent_data;
+	state->csum_size = 0;
+	state->metablock_size = root->nodesize;
+	state->datablock_size = root->sectorsize;
+	INIT_LIST_HEAD(&state->all_blocks_list);
+	btrfsic_block_hashtable_init(&state->block_hashtable);
+	btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+	state->max_superblock_generation = 0;
+	state->latest_superblock = NULL;
+
+	list_for_each_entry(device, dev_head, dev_list) {
+		struct btrfsic_dev_state *ds;
+		char *p;
+
+		if (!device->bdev || !device->name)
+			continue;
+
+		ds = btrfsic_dev_state_alloc();
+		if (NULL == ds) {
+			printk(KERN_INFO
+			       "btrfs check-integrity: kmalloc() failed!\n");
+			mutex_unlock(&btrfsic_mutex);
+			return -1;
+		}
+		ds->bdev = device->bdev;
+		ds->state = state;
+		bdevname(ds->bdev, ds->name);
+		ds->name[BDEVNAME_SIZE - 1] = '\0';
+		for (p = ds->name; *p != '\0'; p++);
+		while (p > ds->name && *p != '/')
+			p--;
+		if (*p == '/')
+			p++;
+		strlcpy(ds->name, p, sizeof(ds->name));
+		btrfsic_dev_state_hashtable_add(ds,
+						&btrfsic_dev_state_hashtable);
+	}
+
+	ret = btrfsic_process_superblock(state, fs_devices);
+	if (0 != ret) {
+		mutex_unlock(&btrfsic_mutex);
+		btrfsic_unmount(root, fs_devices);
+		return ret;
+	}
+
+	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+		btrfsic_dump_database(state);
+	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+		btrfsic_dump_tree(state);
+
+	mutex_unlock(&btrfsic_mutex);
+	return 0;
+}
+
+void btrfsic_unmount(struct btrfs_root *root,
+		     struct btrfs_fs_devices *fs_devices)
+{
+	struct list_head *elem_all;
+	struct list_head *tmp_all;
+	struct btrfsic_state *state;
+	struct list_head *dev_head = &fs_devices->devices;
+	struct btrfs_device *device;
+
+	if (!btrfsic_is_initialized)
+		return;
+
+	mutex_lock(&btrfsic_mutex);
+
+	state = NULL;
+	list_for_each_entry(device, dev_head, dev_list) {
+		struct btrfsic_dev_state *ds;
+
+		if (!device->bdev || !device->name)
+			continue;
+
+		ds = btrfsic_dev_state_hashtable_lookup(
+				device->bdev,
+				&btrfsic_dev_state_hashtable);
+		if (NULL != ds) {
+			state = ds->state;
+			btrfsic_dev_state_hashtable_remove(ds);
+			btrfsic_dev_state_free(ds);
+		}
+	}
+
+	if (NULL == state) {
+		printk(KERN_INFO
+		       "btrfsic: error, cannot find state information"
+		       " on umount!\n");
+		mutex_unlock(&btrfsic_mutex);
+		return;
+	}
+
+	/*
+	 * Don't care about keeping the lists' state up to date,
+	 * just free all memory that was allocated dynamically.
+	 * Free the blocks and the block_links.
+	 */
+	list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+		struct btrfsic_block *const b_all =
+		    list_entry(elem_all, struct btrfsic_block,
+			       all_blocks_node);
+		struct list_head *elem_ref_to;
+		struct list_head *tmp_ref_to;
+
+		list_for_each_safe(elem_ref_to, tmp_ref_to,
+				   &b_all->ref_to_list) {
+			struct btrfsic_block_link *const l =
+			    list_entry(elem_ref_to,
+				       struct btrfsic_block_link,
+				       node_ref_to);
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				btrfsic_print_rem_link(state, l);
+
+			l->ref_cnt--;
+			if (0 == l->ref_cnt)
+				btrfsic_block_link_free(l);
+		}
+
+		if (b_all->is_iodone || b_all->never_written)
+			btrfsic_block_free(b_all);
+		else
+			printk(KERN_INFO "btrfs: attempt to free %c-block"
+			       " @%llu (%s/%llu/%d) on umount which is"
+			       " not yet iodone!\n",
+			       btrfsic_get_block_type(state, b_all),
+			       b_all->logical_bytenr, b_all->dev_state->name,
+			       b_all->dev_bytenr, b_all->mirror_num);
+	}
+
+	mutex_unlock(&btrfsic_mutex);
+
+	kvfree(state);
+}
diff --git a/kernel/fs/btrfs/check-integrity.h b/kernel/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000..13b8566c9
--- /dev/null
+++ b/kernel/fs/btrfs/check-integrity.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+int btrfsic_submit_bio_wait(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#define btrfsic_submit_bio_wait submit_bio_wait
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+		  struct btrfs_fs_devices *fs_devices,
+		  int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+		     struct btrfs_fs_devices *fs_devices);
+
+#endif
diff --git a/kernel/fs/btrfs/compression.c b/kernel/fs/btrfs/compression.c
new file mode 100644
index 000000000..ce62324c7
--- /dev/null
+++ b/kernel/fs/btrfs/compression.c
@@ -0,0 +1,1091 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+	/* number of bios pending for this compressed extent */
+	atomic_t pending_bios;
+
+	/* the pages with the compressed data on them */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes on disk */
+	unsigned long compressed_len;
+
+	/* the compression algorithm for this bio */
+	int compress_type;
+
+	/* number of compressed pages in the array */
+	unsigned long nr_pages;
+
+	/* IO errors */
+	int errors;
+	int mirror_num;
+
+	/* for reads, this is the bio we are copying the data into */
+	struct bio *orig_bio;
+
+	/*
+	 * the start of a variable length array of checksums only
+	 * used by reads
+	 */
+	u32 sums;
+};
+
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+				   u64 disk_start, struct bio_vec *bvec,
+				   int vcnt, size_t srclen);
+
+static inline int compressed_bio_size(struct btrfs_root *root,
+				      unsigned long disk_size)
+{
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+	return sizeof(struct compressed_bio) +
+		(DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
+}
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+					u64 first_byte, gfp_t gfp_flags)
+{
+	int nr_vecs;
+
+	nr_vecs = bio_get_nr_vecs(bdev);
+	return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
+}
+
+static int check_compressed_csum(struct inode *inode,
+				 struct compressed_bio *cb,
+				 u64 disk_start)
+{
+	int ret;
+	struct page *page;
+	unsigned long i;
+	char *kaddr;
+	u32 csum;
+	u32 *cb_sum = &cb->sums;
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+		return 0;
+
+	for (i = 0; i < cb->nr_pages; i++) {
+		page = cb->compressed_pages[i];
+		csum = ~(u32)0;
+
+		kaddr = kmap_atomic(page);
+		csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
+		btrfs_csum_final(csum, (char *)&csum);
+		kunmap_atomic(kaddr);
+
+		if (csum != *cb_sum) {
+			btrfs_info(BTRFS_I(inode)->root->fs_info,
+			   "csum failed ino %llu extent %llu csum %u wanted %u mirror %d",
+			   btrfs_ino(inode), disk_start, csum, *cb_sum,
+			   cb->mirror_num);
+			ret = -EIO;
+			goto fail;
+		}
+		cb_sum++;
+
+	}
+	ret = 0;
+fail:
+	return ret;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+	int ret;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	inode = cb->inode;
+	ret = check_compressed_csum(inode, cb,
+				    (u64)bio->bi_iter.bi_sector << 9);
+	if (ret)
+		goto csum_failed;
+
+	/* ok, we're the last bio for this extent, lets start
+	 * the decompression.
+	 */
+	ret = btrfs_decompress_biovec(cb->compress_type,
+				      cb->compressed_pages,
+				      cb->start,
+				      cb->orig_bio->bi_io_vec,
+				      cb->orig_bio->bi_vcnt,
+				      cb->compressed_len);
+csum_failed:
+	if (ret)
+		cb->errors = 1;
+
+	/* release the compressed pages */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* do io completion on the original bio */
+	if (cb->errors) {
+		bio_io_error(cb->orig_bio);
+	} else {
+		int i;
+		struct bio_vec *bvec;
+
+		/*
+		 * we have verified the checksum already, set page
+		 * checked so the end_io handlers know about it
+		 */
+		bio_for_each_segment_all(bvec, cb->orig_bio, i)
+			SetPageChecked(bvec->bv_page);
+
+		bio_endio(cb->orig_bio, 0);
+	}
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline void end_compressed_writeback(struct inode *inode,
+					      const struct compressed_bio *cb)
+{
+	unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
+	struct page *pages[16];
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int ret;
+
+	if (cb->errors)
+		mapping_set_error(inode->i_mapping, -EIO);
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long,
+				     nr_pages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			nr_pages -= 1;
+			index += 1;
+			continue;
+		}
+		for (i = 0; i < ret; i++) {
+			if (cb->errors)
+				SetPageError(pages[i]);
+			end_page_writeback(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+	}
+	/* the inode may be gone now */
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, step one is to
+	 * call back into the FS and do all the end_io operations
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
+	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+					 cb->start,
+					 cb->start + cb->len - 1,
+					 NULL,
+					 err ? 0 : 1);
+	cb->compressed_pages[0]->mapping = NULL;
+
+	end_compressed_writeback(inode, cb);
+	/* note, our inode could be gone now */
+
+	/*
+	 * release the compressed pages, these came from alloc_page and
+	 * are not attached to the inode at all
+	 */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				 unsigned long len, u64 disk_start,
+				 unsigned long compressed_len,
+				 struct page **compressed_pages,
+				 unsigned long nr_pages)
+{
+	struct bio *bio = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct compressed_bio *cb;
+	unsigned long bytes_left;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int pg_index = 0;
+	struct page *page;
+	u64 first_byte = disk_start;
+	struct block_device *bdev;
+	int ret;
+	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+	if (!cb)
+		return -ENOMEM;
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->start = start;
+	cb->len = len;
+	cb->mirror_num = 0;
+	cb->compressed_pages = compressed_pages;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = NULL;
+	cb->nr_pages = nr_pages;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	if (!bio) {
+		kfree(cb);
+		return -ENOMEM;
+	}
+	bio->bi_private = cb;
+	bio->bi_end_io = end_compressed_bio_write;
+	atomic_inc(&cb->pending_bios);
+
+	/* create and submit bios for the compressed pages */
+	bytes_left = compressed_len;
+	for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+		page = compressed_pages[pg_index];
+		page->mapping = inode->i_mapping;
+		if (bio->bi_iter.bi_size)
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
+							   PAGE_CACHE_SIZE,
+							   bio, 0);
+		else
+			ret = 0;
+
+		page->mapping = NULL;
+		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(bio);
+
+			/*
+			 * inc the count before we submit the bio so
+			 * we know the end IO handler won't happen before
+			 * we inc the count.  Otherwise, the cb might get
+			 * freed before we're done setting it up
+			 */
+			atomic_inc(&cb->pending_bios);
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+					BTRFS_WQ_ENDIO_DATA);
+			BUG_ON(ret); /* -ENOMEM */
+
+			if (!skip_sum) {
+				ret = btrfs_csum_one_bio(root, inode, bio,
+							 start, 1);
+				BUG_ON(ret); /* -ENOMEM */
+			}
+
+			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+			BUG_ON(ret); /* -ENOMEM */
+
+			bio_put(bio);
+
+			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			BUG_ON(!bio);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		if (bytes_left < PAGE_CACHE_SIZE) {
+			btrfs_info(BTRFS_I(inode)->root->fs_info,
+					"bytes left %lu compress len %lu nr %lu",
+			       bytes_left, cb->compressed_len, cb->nr_pages);
+		}
+		bytes_left -= PAGE_CACHE_SIZE;
+		first_byte += PAGE_CACHE_SIZE;
+		cond_resched();
+	}
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+	BUG_ON(ret); /* -ENOMEM */
+
+	if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+		BUG_ON(ret); /* -ENOMEM */
+	}
+
+	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	BUG_ON(ret); /* -ENOMEM */
+
+	bio_put(bio);
+	return 0;
+}
+
+static noinline int add_ra_bio_pages(struct inode *inode,
+				     u64 compressed_end,
+				     struct compressed_bio *cb)
+{
+	unsigned long end_index;
+	unsigned long pg_index;
+	u64 last_offset;
+	u64 isize = i_size_read(inode);
+	int ret;
+	struct page *page;
+	unsigned long nr_pages = 0;
+	struct extent_map *em;
+	struct address_space *mapping = inode->i_mapping;
+	struct extent_map_tree *em_tree;
+	struct extent_io_tree *tree;
+	u64 end;
+	int misses = 0;
+
+	page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
+	last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	tree = &BTRFS_I(inode)->io_tree;
+
+	if (isize == 0)
+		return 0;
+
+	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+	while (last_offset < compressed_end) {
+		pg_index = last_offset >> PAGE_CACHE_SHIFT;
+
+		if (pg_index > end_index)
+			break;
+
+		rcu_read_lock();
+		page = radix_tree_lookup(&mapping->page_tree, pg_index);
+		rcu_read_unlock();
+		if (page && !radix_tree_exceptional_entry(page)) {
+			misses++;
+			if (misses > 4)
+				break;
+			goto next;
+		}
+
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+								~__GFP_FS);
+		if (!page)
+			break;
+
+		if (add_to_page_cache_lru(page, mapping, pg_index,
+								GFP_NOFS)) {
+			page_cache_release(page);
+			goto next;
+		}
+
+		end = last_offset + PAGE_CACHE_SIZE - 1;
+		/*
+		 * at this point, we have a locked page in the page cache
+		 * for these bytes in the file.  But, we have to make
+		 * sure they map to this compressed extent on disk.
+		 */
+		set_page_extent_mapped(page);
+		lock_extent(tree, last_offset, end);
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, last_offset,
+					   PAGE_CACHE_SIZE);
+		read_unlock(&em_tree->lock);
+
+		if (!em || last_offset < em->start ||
+		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
+			free_extent_map(em);
+			unlock_extent(tree, last_offset, end);
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+		free_extent_map(em);
+
+		if (page->index == end_index) {
+			char *userpage;
+			size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+
+			if (zero_offset) {
+				int zeros;
+				zeros = PAGE_CACHE_SIZE - zero_offset;
+				userpage = kmap_atomic(page);
+				memset(userpage + zero_offset, 0, zeros);
+				flush_dcache_page(page);
+				kunmap_atomic(userpage);
+			}
+		}
+
+		ret = bio_add_page(cb->orig_bio, page,
+				   PAGE_CACHE_SIZE, 0);
+
+		if (ret == PAGE_CACHE_SIZE) {
+			nr_pages++;
+			page_cache_release(page);
+		} else {
+			unlock_extent(tree, last_offset, end);
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+next:
+		last_offset += PAGE_CACHE_SIZE;
+	}
+	return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_iter.bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *em_tree;
+	struct compressed_bio *cb;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long compressed_len;
+	unsigned long nr_pages;
+	unsigned long pg_index;
+	struct page *page;
+	struct block_device *bdev;
+	struct bio *comp_bio;
+	u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
+	u64 em_len;
+	u64 em_start;
+	struct extent_map *em;
+	int ret = -ENOMEM;
+	int faili = 0;
+	u32 *sums;
+
+	tree = &BTRFS_I(inode)->io_tree;
+	em_tree = &BTRFS_I(inode)->extent_tree;
+
+	/* we need the actual starting offset of this extent in the file */
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree,
+				   page_offset(bio->bi_io_vec->bv_page),
+				   PAGE_CACHE_SIZE);
+	read_unlock(&em_tree->lock);
+	if (!em)
+		return -EIO;
+
+	compressed_len = em->block_len;
+	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+	if (!cb)
+		goto out;
+
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->mirror_num = mirror_num;
+	sums = &cb->sums;
+
+	cb->start = em->orig_start;
+	em_len = em->len;
+	em_start = em->start;
+
+	free_extent_map(em);
+	em = NULL;
+
+	cb->len = uncompressed_len;
+	cb->compressed_len = compressed_len;
+	cb->compress_type = extent_compress_type(bio_flags);
+	cb->orig_bio = bio;
+
+	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
+	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
+				       GFP_NOFS);
+	if (!cb->compressed_pages)
+		goto fail1;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
+							      __GFP_HIGHMEM);
+		if (!cb->compressed_pages[pg_index]) {
+			faili = pg_index - 1;
+			ret = -ENOMEM;
+			goto fail2;
+		}
+	}
+	faili = nr_pages - 1;
+	cb->nr_pages = nr_pages;
+
+	/* In the parent-locked case, we only locked the range we are
+	 * interested in.  In all other cases, we can opportunistically
+	 * cache decompressed data that goes beyond the requested range. */
+	if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
+		add_ra_bio_pages(inode, em_start + em_len, cb);
+
+	/* include any pages we added in add_ra-bio_pages */
+	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	cb->len = uncompressed_len;
+
+	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	if (!comp_bio)
+		goto fail2;
+	comp_bio->bi_private = cb;
+	comp_bio->bi_end_io = end_compressed_bio_read;
+	atomic_inc(&cb->pending_bios);
+
+	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+		page = cb->compressed_pages[pg_index];
+		page->mapping = inode->i_mapping;
+		page->index = em_start >> PAGE_CACHE_SHIFT;
+
+		if (comp_bio->bi_iter.bi_size)
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
+							PAGE_CACHE_SIZE,
+							comp_bio, 0);
+		else
+			ret = 0;
+
+		page->mapping = NULL;
+		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(comp_bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+					BTRFS_WQ_ENDIO_DATA);
+			BUG_ON(ret); /* -ENOMEM */
+
+			/*
+			 * inc the count before we submit the bio so
+			 * we know the end IO handler won't happen before
+			 * we inc the count.  Otherwise, the cb might get
+			 * freed before we're done setting it up
+			 */
+			atomic_inc(&cb->pending_bios);
+
+			if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+				ret = btrfs_lookup_bio_sums(root, inode,
+							comp_bio, sums);
+				BUG_ON(ret); /* -ENOMEM */
+			}
+			sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+					     root->sectorsize);
+
+			ret = btrfs_map_bio(root, READ, comp_bio,
+					    mirror_num, 0);
+			if (ret)
+				bio_endio(comp_bio, ret);
+
+			bio_put(comp_bio);
+
+			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+							GFP_NOFS);
+			BUG_ON(!comp_bio);
+			comp_bio->bi_private = cb;
+			comp_bio->bi_end_io = end_compressed_bio_read;
+
+			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		cur_disk_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(comp_bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+			BTRFS_WQ_ENDIO_DATA);
+	BUG_ON(ret); /* -ENOMEM */
+
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+		ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+		BUG_ON(ret); /* -ENOMEM */
+	}
+
+	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
+	if (ret)
+		bio_endio(comp_bio, ret);
+
+	bio_put(comp_bio);
+	return 0;
+
+fail2:
+	while (faili >= 0) {
+		__free_page(cb->compressed_pages[faili]);
+		faili--;
+	}
+
+	kfree(cb->compressed_pages);
+fail1:
+	kfree(cb);
+out:
+	free_extent_map(em);
+	return ret;
+}
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+	&btrfs_zlib_compress,
+	&btrfs_lzo_compress,
+};
+
+void __init btrfs_init_compress(void)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		INIT_LIST_HEAD(&comp_idle_workspace[i]);
+		spin_lock_init(&comp_workspace_lock[i]);
+		atomic_set(&comp_alloc_workspace[i], 0);
+		init_waitqueue_head(&comp_workspace_wait[i]);
+	}
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+	struct list_head *workspace;
+	int cpus = num_online_cpus();
+	int idx = type - 1;
+
+	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
+	int *num_workspace			= &comp_num_workspace[idx];
+again:
+	spin_lock(workspace_lock);
+	if (!list_empty(idle_workspace)) {
+		workspace = idle_workspace->next;
+		list_del(workspace);
+		(*num_workspace)--;
+		spin_unlock(workspace_lock);
+		return workspace;
+
+	}
+	if (atomic_read(alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+
+		spin_unlock(workspace_lock);
+		prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+			schedule();
+		finish_wait(workspace_wait, &wait);
+		goto again;
+	}
+	atomic_inc(alloc_workspace);
+	spin_unlock(workspace_lock);
+
+	workspace = btrfs_compress_op[idx]->alloc_workspace();
+	if (IS_ERR(workspace)) {
+		atomic_dec(alloc_workspace);
+		wake_up(workspace_wait);
+	}
+	return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+	int idx = type - 1;
+	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
+	int *num_workspace			= &comp_num_workspace[idx];
+
+	spin_lock(workspace_lock);
+	if (*num_workspace < num_online_cpus()) {
+		list_add(workspace, idle_workspace);
+		(*num_workspace)++;
+		spin_unlock(workspace_lock);
+		goto wake;
+	}
+	spin_unlock(workspace_lock);
+
+	btrfs_compress_op[idx]->free_workspace(workspace);
+	atomic_dec(alloc_workspace);
+wake:
+	smp_mb();
+	if (waitqueue_active(workspace_wait))
+		wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+	struct list_head *workspace;
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		while (!list_empty(&comp_idle_workspace[i])) {
+			workspace = comp_idle_workspace[i].next;
+			list_del(workspace);
+			btrfs_compress_op[i]->free_workspace(workspace);
+			atomic_dec(&comp_alloc_workspace[i]);
+		}
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller then len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return PTR_ERR(workspace);
+
+	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+						      start, len, pages,
+						      nr_dest_pages, out_pages,
+						      total_in, total_out,
+						      max_out);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+				   u64 disk_start, struct bio_vec *bvec,
+				   int vcnt, size_t srclen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return PTR_ERR(workspace);
+
+	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+							 disk_start,
+							 bvec, vcnt, srclen);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return PTR_ERR(workspace);
+
+	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+						  dest_page, start_byte,
+						  srclen, destlen);
+
+	free_workspace(type, workspace);
+	return ret;
+}
+
+void btrfs_exit_compress(void)
+{
+	free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset we're of the start of our workspace buffer.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *pg_index,
+			      unsigned long *pg_offset)
+{
+	unsigned long buf_offset;
+	unsigned long current_buf_start;
+	unsigned long start_byte;
+	unsigned long working_bytes = total_out - buf_start;
+	unsigned long bytes;
+	char *kaddr;
+	struct page *page_out = bvec[*pg_index].bv_page;
+
+	/*
+	 * start byte is the first byte of the page we're currently
+	 * copying into relative to the start of the compressed data.
+	 */
+	start_byte = page_offset(page_out) - disk_start;
+
+	/* we haven't yet hit data corresponding to this page */
+	if (total_out <= start_byte)
+		return 1;
+
+	/*
+	 * the start of the data we care about is offset into
+	 * the middle of our working buffer
+	 */
+	if (total_out > start_byte && buf_start < start_byte) {
+		buf_offset = start_byte - buf_start;
+		working_bytes -= buf_offset;
+	} else {
+		buf_offset = 0;
+	}
+	current_buf_start = buf_start;
+
+	/* copy bytes from the working buffer into the pages */
+	while (working_bytes > 0) {
+		bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, working_bytes);
+		kaddr = kmap_atomic(page_out);
+		memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+		kunmap_atomic(kaddr);
+		flush_dcache_page(page_out);
+
+		*pg_offset += bytes;
+		buf_offset += bytes;
+		working_bytes -= bytes;
+		current_buf_start += bytes;
+
+		/* check if we need to pick another page */
+		if (*pg_offset == PAGE_CACHE_SIZE) {
+			(*pg_index)++;
+			if (*pg_index >= vcnt)
+				return 0;
+
+			page_out = bvec[*pg_index].bv_page;
+			*pg_offset = 0;
+			start_byte = page_offset(page_out) - disk_start;
+
+			/*
+			 * make sure our new page is covered by this
+			 * working buffer
+			 */
+			if (total_out <= start_byte)
+				return 1;
+
+			/*
+			 * the next page in the biovec might not be adjacent
+			 * to the last page, but it might still be found
+			 * inside this working buffer. bump our offset pointer
+			 */
+			if (total_out > start_byte &&
+			    current_buf_start < start_byte) {
+				buf_offset = start_byte - buf_start;
+				working_bytes = total_out - start_byte;
+				current_buf_start = buf_start + buf_offset;
+			}
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * When uncompressing data, we need to make sure and zero any parts of
+ * the biovec that were not filled in by the decompression code.  pg_index
+ * and pg_offset indicate the last page and the last offset of that page
+ * that have been filled in.  This will zero everything remaining in the
+ * biovec.
+ */
+void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
+				   unsigned long pg_index,
+				   unsigned long pg_offset)
+{
+	while (pg_index < vcnt) {
+		struct page *page = bvec[pg_index].bv_page;
+		unsigned long off = bvec[pg_index].bv_offset;
+		unsigned long len = bvec[pg_index].bv_len;
+
+		if (pg_offset < off)
+			pg_offset = off;
+		if (pg_offset < off + len) {
+			unsigned long bytes = off + len - pg_offset;
+			char *kaddr;
+
+			kaddr = kmap_atomic(page);
+			memset(kaddr + pg_offset, 0, bytes);
+			kunmap_atomic(kaddr);
+		}
+		pg_index++;
+		pg_offset = 0;
+	}
+}
diff --git a/kernel/fs/btrfs/compression.h b/kernel/fs/btrfs/compression.h
new file mode 100644
index 000000000..13a4dc043
--- /dev/null
+++ b/kernel/fs/btrfs/compression.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+void btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *pg_index,
+			      unsigned long *pg_offset);
+
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags);
+void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
+				   unsigned long pg_index,
+				   unsigned long pg_offset);
+struct btrfs_compress_op {
+	struct list_head *(*alloc_workspace)(void);
+
+	void (*free_workspace)(struct list_head *workspace);
+
+	int (*compress_pages)(struct list_head *workspace,
+			      struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+
+	int (*decompress_biovec)(struct list_head *workspace,
+				 struct page **pages_in,
+				 u64 disk_start,
+				 struct bio_vec *bvec,
+				 int vcnt,
+				 size_t srclen);
+
+	int (*decompress)(struct list_head *workspace,
+			  unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+};
+
+extern const struct btrfs_compress_op btrfs_zlib_compress;
+extern const struct btrfs_compress_op btrfs_lzo_compress;
+
+#endif
diff --git a/kernel/fs/btrfs/ctree.c b/kernel/fs/btrfs/ctree.c
new file mode 100644
index 000000000..0f11ebc92
--- /dev/null
+++ b/kernel/fs/btrfs/ctree.c
@@ -0,0 +1,5910 @@
+/*
+ * Copyright (C) 2007,2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "locking.h"
+
+static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_path *path, int level);
+static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *ins_key,
+		      struct btrfs_path *path, int data_size, int extend);
+static int push_node_left(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *dst,
+			  struct extent_buffer *src, int empty);
+static int balance_node_right(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct extent_buffer *dst_buf,
+			      struct extent_buffer *src_buf);
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+		    int level, int slot);
+static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+				 struct extent_buffer *eb);
+
+struct btrfs_path *btrfs_alloc_path(void)
+{
+	struct btrfs_path *path;
+	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+	return path;
+}
+
+/*
+ * set all locked nodes in the path to blocking locks.  This should
+ * be done before scheduling
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+	int i;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (!p->nodes[i] || !p->locks[i])
+			continue;
+		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
+		if (p->locks[i] == BTRFS_READ_LOCK)
+			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+		else if (p->locks[i] == BTRFS_WRITE_LOCK)
+			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+	}
+}
+
+/*
+ * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy, when lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path.  You can safely use NULL
+ * for held
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+					struct extent_buffer *held, int held_rw)
+{
+	int i;
+
+	if (held) {
+		btrfs_set_lock_blocking_rw(held, held_rw);
+		if (held_rw == BTRFS_WRITE_LOCK)
+			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
+		else if (held_rw == BTRFS_READ_LOCK)
+			held_rw = BTRFS_READ_LOCK_BLOCKING;
+	}
+	btrfs_set_path_blocking(p);
+
+	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
+		if (p->nodes[i] && p->locks[i]) {
+			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
+			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_WRITE_LOCK;
+			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_READ_LOCK;
+		}
+	}
+
+	if (held)
+		btrfs_clear_lock_blocking_rw(held, held_rw);
+}
+
+/* this also releases the path */
+void btrfs_free_path(struct btrfs_path *p)
+{
+	if (!p)
+		return;
+	btrfs_release_path(p);
+	kmem_cache_free(btrfs_path_cachep, p);
+}
+
+/*
+ * path release drops references on the extent buffers in the path
+ * and it drops any locks held by this path
+ *
+ * It is safe to call this on paths that no locks or extent buffers held.
+ */
+noinline void btrfs_release_path(struct btrfs_path *p)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		p->slots[i] = 0;
+		if (!p->nodes[i])
+			continue;
+		if (p->locks[i]) {
+			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
+			p->locks[i] = 0;
+		}
+		free_extent_buffer(p->nodes[i]);
+		p->nodes[i] = NULL;
+	}
+}
+
+/*
+ * safely gets a reference on the root node of a tree.  A lock
+ * is not taken, so a concurrent writer may put a different node
+ * at the root of the tree.  See btrfs_lock_root_node for the
+ * looping required.
+ *
+ * The extent buffer returned by this has a reference taken, so
+ * it won't disappear.  It may stop being the root of the tree
+ * at any time because there are no locks held.
+ */
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+
+	while (1) {
+		rcu_read_lock();
+		eb = rcu_dereference(root->node);
+
+		/*
+		 * RCU really hurts here, we could free up the root node because
+		 * it was cow'ed but we may not get the new root node yet so do
+		 * the inc_not_zero dance and if it doesn't work then
+		 * synchronize_rcu and try again.
+		 */
+		if (atomic_inc_not_zero(&eb->refs)) {
+			rcu_read_unlock();
+			break;
+		}
+		rcu_read_unlock();
+		synchronize_rcu();
+	}
+	return eb;
+}
+
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root.  A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+
+	while (1) {
+		eb = btrfs_root_node(root);
+		btrfs_tree_lock(eb);
+		if (eb == root->node)
+			break;
+		btrfs_tree_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return eb;
+}
+
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root.  A locked buffer
+ * is returned, with a reference held.
+ */
+static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+
+	while (1) {
+		eb = btrfs_root_node(root);
+		btrfs_tree_read_lock(eb);
+		if (eb == root->node)
+			break;
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return eb;
+}
+
+/* cowonly root (everything not a reference counted cow subvolume), just get
+ * put onto a simple dirty list.  transaction.c walks this to make sure they
+ * get properly updated on disk.
+ */
+static void add_root_to_dirty_list(struct btrfs_root *root)
+{
+	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+		return;
+
+	spin_lock(&root->fs_info->trans_lock);
+	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+		/* Want the extent tree to be the last on the list */
+		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+			list_move_tail(&root->dirty_list,
+				       &root->fs_info->dirty_cowonly_roots);
+		else
+			list_move(&root->dirty_list,
+				  &root->fs_info->dirty_cowonly_roots);
+	}
+	spin_unlock(&root->fs_info->trans_lock);
+}
+
+/*
+ * used by snapshot creation to make a copy of a root for a tree with
+ * a given objectid.  The buffer with the new root node is returned in
+ * cow_ret, and this func returns zero on success or a negative error code.
+ */
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      struct extent_buffer *buf,
+		      struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+	struct extent_buffer *cow;
+	int ret = 0;
+	int level;
+	struct btrfs_disk_key disk_key;
+
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->fs_info->running_transaction->transid);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->last_trans);
+
+	level = btrfs_header_level(buf);
+	if (level == 0)
+		btrfs_item_key(buf, &disk_key, 0);
+	else
+		btrfs_node_key(buf, &disk_key, 0);
+
+	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
+			&disk_key, level, buf->start, 0);
+	if (IS_ERR(cow))
+		return PTR_ERR(cow);
+
+	copy_extent_buffer(cow, buf, 0, 0, cow->len);
+	btrfs_set_header_bytenr(cow, cow->start);
+	btrfs_set_header_generation(cow, trans->transid);
+	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+				     BTRFS_HEADER_FLAG_RELOC);
+	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+	else
+		btrfs_set_header_owner(cow, new_root_objectid);
+
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
+			    BTRFS_FSID_SIZE);
+
+	WARN_ON(btrfs_header_generation(buf) > trans->transid);
+	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+		ret = btrfs_inc_ref(trans, root, cow, 1);
+	else
+		ret = btrfs_inc_ref(trans, root, cow, 0);
+
+	if (ret)
+		return ret;
+
+	btrfs_mark_buffer_dirty(cow);
+	*cow_ret = cow;
+	return 0;
+}
+
+enum mod_log_op {
+	MOD_LOG_KEY_REPLACE,
+	MOD_LOG_KEY_ADD,
+	MOD_LOG_KEY_REMOVE,
+	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+	MOD_LOG_MOVE_KEYS,
+	MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+	int dst_slot;
+	int nr_items;
+};
+
+struct tree_mod_root {
+	u64 logical;
+	u8 level;
+};
+
+struct tree_mod_elem {
+	struct rb_node node;
+	u64 index;		/* shifted logical */
+	u64 seq;
+	enum mod_log_op op;
+
+	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+	int slot;
+
+	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+	u64 generation;
+
+	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+	struct btrfs_disk_key key;
+	u64 blockptr;
+
+	/* this is used for op == MOD_LOG_MOVE_KEYS */
+	struct tree_mod_move move;
+
+	/* this is used for op == MOD_LOG_ROOT_REPLACE */
+	struct tree_mod_root old_root;
+};
+
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
+{
+	read_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
+{
+	read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+	write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
+{
+	write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * Pull a new tree mod seq number for our operation.
+ */
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+	return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			   struct seq_list *elem)
+{
+	tree_mod_log_write_lock(fs_info);
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!elem->seq) {
+		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tree_mod_log_write_unlock(fs_info);
+
+	return elem->seq;
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem)
+{
+	struct rb_root *tm_root;
+	struct rb_node *node;
+	struct rb_node *next;
+	struct seq_list *cur_elem;
+	struct tree_mod_elem *tm;
+	u64 min_seq = (u64)-1;
+	u64 seq_putting = elem->seq;
+
+	if (!seq_putting)
+		return;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	list_del(&elem->list);
+	elem->seq = 0;
+
+	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+		if (cur_elem->seq < min_seq) {
+			if (seq_putting > cur_elem->seq) {
+				/*
+				 * blocker with lower sequence number exists, we
+				 * cannot remove anything from the log
+				 */
+				spin_unlock(&fs_info->tree_mod_seq_lock);
+				return;
+			}
+			min_seq = cur_elem->seq;
+		}
+	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	/*
+	 * anything that's lower than the lowest existing (read: blocked)
+	 * sequence number can be removed from the tree.
+	 */
+	tree_mod_log_write_lock(fs_info);
+	tm_root = &fs_info->tree_mod_log;
+	for (node = rb_first(tm_root); node; node = next) {
+		next = rb_next(node);
+		tm = container_of(node, struct tree_mod_elem, node);
+		if (tm->seq > min_seq)
+			continue;
+		rb_erase(node, tm_root);
+		kfree(tm);
+	}
+	tree_mod_log_write_unlock(fs_info);
+}
+
+/*
+ * key order of the log:
+ *       index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ *
+ * Note: must be called with write lock (tree_mod_log_write_lock).
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+	struct rb_root *tm_root;
+	struct rb_node **new;
+	struct rb_node *parent = NULL;
+	struct tree_mod_elem *cur;
+
+	BUG_ON(!tm);
+
+	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+
+	tm_root = &fs_info->tree_mod_log;
+	new = &tm_root->rb_node;
+	while (*new) {
+		cur = container_of(*new, struct tree_mod_elem, node);
+		parent = *new;
+		if (cur->index < tm->index)
+			new = &((*new)->rb_left);
+		else if (cur->index > tm->index)
+			new = &((*new)->rb_right);
+		else if (cur->seq < tm->seq)
+			new = &((*new)->rb_left);
+		else if (cur->seq > tm->seq)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&tm->node, parent, new);
+	rb_insert_color(&tm->node, tm_root);
+	return 0;
+}
+
+/*
+ * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
+ * returns zero with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * call tree_mod_log_write_unlock() to release.
+ */
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+				    struct extent_buffer *eb) {
+	smp_mb();
+	if (list_empty(&(fs_info)->tree_mod_seq_list))
+		return 1;
+	if (eb && btrfs_header_level(eb) == 0)
+		return 1;
+
+	tree_mod_log_write_lock(fs_info);
+	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
+		tree_mod_log_write_unlock(fs_info);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+				    struct extent_buffer *eb)
+{
+	smp_mb();
+	if (list_empty(&(fs_info)->tree_mod_seq_list))
+		return 0;
+	if (eb && btrfs_header_level(eb) == 0)
+		return 0;
+
+	return 1;
+}
+
+static struct tree_mod_elem *
+alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
+		    enum mod_log_op op, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return NULL;
+
+	tm->index = eb->start >> PAGE_CACHE_SHIFT;
+	if (op != MOD_LOG_KEY_ADD) {
+		btrfs_node_key(eb, &tm->key, slot);
+		tm->blockptr = btrfs_node_blockptr(eb, slot);
+	}
+	tm->op = op;
+	tm->slot = slot;
+	tm->generation = btrfs_node_ptr_generation(eb, slot);
+	RB_CLEAR_NODE(&tm->node);
+
+	return tm;
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+			struct extent_buffer *eb, int slot,
+			enum mod_log_op op, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+	int ret;
+
+	if (!tree_mod_need_log(fs_info, eb))
+		return 0;
+
+	tm = alloc_tree_mod_elem(eb, slot, op, flags);
+	if (!tm)
+		return -ENOMEM;
+
+	if (tree_mod_dont_log(fs_info, eb)) {
+		kfree(tm);
+		return 0;
+	}
+
+	ret = __tree_mod_log_insert(fs_info, tm);
+	tree_mod_log_write_unlock(fs_info);
+	if (ret)
+		kfree(tm);
+
+	return ret;
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *eb, int dst_slot, int src_slot,
+			 int nr_items, gfp_t flags)
+{
+	struct tree_mod_elem *tm = NULL;
+	struct tree_mod_elem **tm_list = NULL;
+	int ret = 0;
+	int i;
+	int locked = 0;
+
+	if (!tree_mod_need_log(fs_info, eb))
+		return 0;
+
+	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
+	if (!tm_list)
+		return -ENOMEM;
+
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm) {
+		ret = -ENOMEM;
+		goto free_tms;
+	}
+
+	tm->index = eb->start >> PAGE_CACHE_SHIFT;
+	tm->slot = src_slot;
+	tm->move.dst_slot = dst_slot;
+	tm->move.nr_items = nr_items;
+	tm->op = MOD_LOG_MOVE_KEYS;
+
+	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+		if (!tm_list[i]) {
+			ret = -ENOMEM;
+			goto free_tms;
+		}
+	}
+
+	if (tree_mod_dont_log(fs_info, eb))
+		goto free_tms;
+	locked = 1;
+
+	/*
+	 * When we override something during the move, we log these removals.
+	 * This can only happen when we move towards the beginning of the
+	 * buffer, i.e. dst_slot < src_slot.
+	 */
+	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+		if (ret)
+			goto free_tms;
+	}
+
+	ret = __tree_mod_log_insert(fs_info, tm);
+	if (ret)
+		goto free_tms;
+	tree_mod_log_write_unlock(fs_info);
+	kfree(tm_list);
+
+	return 0;
+free_tms:
+	for (i = 0; i < nr_items; i++) {
+		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+		kfree(tm_list[i]);
+	}
+	if (locked)
+		tree_mod_log_write_unlock(fs_info);
+	kfree(tm_list);
+	kfree(tm);
+
+	return ret;
+}
+
+static inline int
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+		       struct tree_mod_elem **tm_list,
+		       int nritems)
+{
+	int i, j;
+	int ret;
+
+	for (i = nritems - 1; i >= 0; i--) {
+		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+		if (ret) {
+			for (j = nritems - 1; j > i; j--)
+				rb_erase(&tm_list[j]->node,
+					 &fs_info->tree_mod_log);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *old_root,
+			 struct extent_buffer *new_root, gfp_t flags,
+			 int log_removal)
+{
+	struct tree_mod_elem *tm = NULL;
+	struct tree_mod_elem **tm_list = NULL;
+	int nritems = 0;
+	int ret = 0;
+	int i;
+
+	if (!tree_mod_need_log(fs_info, NULL))
+		return 0;
+
+	if (log_removal && btrfs_header_level(old_root) > 0) {
+		nritems = btrfs_header_nritems(old_root);
+		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
+				  flags);
+		if (!tm_list) {
+			ret = -ENOMEM;
+			goto free_tms;
+		}
+		for (i = 0; i < nritems; i++) {
+			tm_list[i] = alloc_tree_mod_elem(old_root, i,
+			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+			if (!tm_list[i]) {
+				ret = -ENOMEM;
+				goto free_tms;
+			}
+		}
+	}
+
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm) {
+		ret = -ENOMEM;
+		goto free_tms;
+	}
+
+	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+	tm->old_root.logical = old_root->start;
+	tm->old_root.level = btrfs_header_level(old_root);
+	tm->generation = btrfs_header_generation(old_root);
+	tm->op = MOD_LOG_ROOT_REPLACE;
+
+	if (tree_mod_dont_log(fs_info, NULL))
+		goto free_tms;
+
+	if (tm_list)
+		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+	if (!ret)
+		ret = __tree_mod_log_insert(fs_info, tm);
+
+	tree_mod_log_write_unlock(fs_info);
+	if (ret)
+		goto free_tms;
+	kfree(tm_list);
+
+	return ret;
+
+free_tms:
+	if (tm_list) {
+		for (i = 0; i < nritems; i++)
+			kfree(tm_list[i]);
+		kfree(tm_list);
+	}
+	kfree(tm);
+
+	return ret;
+}
+
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+		      int smallest)
+{
+	struct rb_root *tm_root;
+	struct rb_node *node;
+	struct tree_mod_elem *cur = NULL;
+	struct tree_mod_elem *found = NULL;
+	u64 index = start >> PAGE_CACHE_SHIFT;
+
+	tree_mod_log_read_lock(fs_info);
+	tm_root = &fs_info->tree_mod_log;
+	node = tm_root->rb_node;
+	while (node) {
+		cur = container_of(node, struct tree_mod_elem, node);
+		if (cur->index < index) {
+			node = node->rb_left;
+		} else if (cur->index > index) {
+			node = node->rb_right;
+		} else if (cur->seq < min_seq) {
+			node = node->rb_left;
+		} else if (!smallest) {
+			/* we want the node with the highest seq */
+			if (found)
+				BUG_ON(found->seq > cur->seq);
+			found = cur;
+			node = node->rb_left;
+		} else if (cur->seq > min_seq) {
+			/* we want the node with the smallest seq */
+			if (found)
+				BUG_ON(found->seq < cur->seq);
+			found = cur;
+			node = node->rb_right;
+		} else {
+			found = cur;
+			break;
+		}
+	}
+	tree_mod_log_read_unlock(fs_info);
+
+	return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+			   u64 min_seq)
+{
+	return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+	return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+static noinline int
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+		     struct extent_buffer *src, unsigned long dst_offset,
+		     unsigned long src_offset, int nr_items)
+{
+	int ret = 0;
+	struct tree_mod_elem **tm_list = NULL;
+	struct tree_mod_elem **tm_list_add, **tm_list_rem;
+	int i;
+	int locked = 0;
+
+	if (!tree_mod_need_log(fs_info, NULL))
+		return 0;
+
+	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+		return 0;
+
+	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
+			  GFP_NOFS);
+	if (!tm_list)
+		return -ENOMEM;
+
+	tm_list_add = tm_list;
+	tm_list_rem = tm_list + nr_items;
+	for (i = 0; i < nr_items; i++) {
+		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
+		if (!tm_list_rem[i]) {
+			ret = -ENOMEM;
+			goto free_tms;
+		}
+
+		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+		    MOD_LOG_KEY_ADD, GFP_NOFS);
+		if (!tm_list_add[i]) {
+			ret = -ENOMEM;
+			goto free_tms;
+		}
+	}
+
+	if (tree_mod_dont_log(fs_info, NULL))
+		goto free_tms;
+	locked = 1;
+
+	for (i = 0; i < nr_items; i++) {
+		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
+		if (ret)
+			goto free_tms;
+		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
+		if (ret)
+			goto free_tms;
+	}
+
+	tree_mod_log_write_unlock(fs_info);
+	kfree(tm_list);
+
+	return 0;
+
+free_tms:
+	for (i = 0; i < nr_items * 2; i++) {
+		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+		kfree(tm_list[i]);
+	}
+	if (locked)
+		tree_mod_log_write_unlock(fs_info);
+	kfree(tm_list);
+
+	return ret;
+}
+
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+		     int dst_offset, int src_offset, int nr_items)
+{
+	int ret;
+	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+				       nr_items, GFP_NOFS);
+	BUG_ON(ret < 0);
+}
+
+static noinline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+			  struct extent_buffer *eb, int slot, int atomic)
+{
+	int ret;
+
+	ret = tree_mod_log_insert_key(fs_info, eb, slot,
+					MOD_LOG_KEY_REPLACE,
+					atomic ? GFP_ATOMIC : GFP_NOFS);
+	BUG_ON(ret < 0);
+}
+
+static noinline int
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+	struct tree_mod_elem **tm_list = NULL;
+	int nritems = 0;
+	int i;
+	int ret = 0;
+
+	if (btrfs_header_level(eb) == 0)
+		return 0;
+
+	if (!tree_mod_need_log(fs_info, NULL))
+		return 0;
+
+	nritems = btrfs_header_nritems(eb);
+	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
+	if (!tm_list)
+		return -ENOMEM;
+
+	for (i = 0; i < nritems; i++) {
+		tm_list[i] = alloc_tree_mod_elem(eb, i,
+		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+		if (!tm_list[i]) {
+			ret = -ENOMEM;
+			goto free_tms;
+		}
+	}
+
+	if (tree_mod_dont_log(fs_info, eb))
+		goto free_tms;
+
+	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+	tree_mod_log_write_unlock(fs_info);
+	if (ret)
+		goto free_tms;
+	kfree(tm_list);
+
+	return 0;
+
+free_tms:
+	for (i = 0; i < nritems; i++)
+		kfree(tm_list[i]);
+	kfree(tm_list);
+
+	return ret;
+}
+
+static noinline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+			      struct extent_buffer *new_root_node,
+			      int log_removal)
+{
+	int ret;
+	ret = tree_mod_log_insert_root(root->fs_info, root->node,
+				       new_root_node, GFP_NOFS, log_removal);
+	BUG_ON(ret < 0);
+}
+
+/*
+ * check if the tree block can be shared by multiple trees
+ */
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+			      struct extent_buffer *buf)
+{
+	/*
+	 * Tree blocks not in refernece counted trees and tree roots
+	 * are never shared. If a block was allocated after the last
+	 * snapshot and the block was not allocated by tree relocation,
+	 * we know the block is not shared.
+	 */
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	    buf != root->node && buf != root->commit_root &&
+	    (btrfs_header_generation(buf) <=
+	     btrfs_root_last_snapshot(&root->root_item) ||
+	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+		return 1;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+		return 1;
+#endif
+	return 0;
+}
+
+static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct extent_buffer *buf,
+				       struct extent_buffer *cow,
+				       int *last_ref)
+{
+	u64 refs;
+	u64 owner;
+	u64 flags;
+	u64 new_flags = 0;
+	int ret;
+
+	/*
+	 * Backrefs update rules:
+	 *
+	 * Always use full backrefs for extent pointers in tree block
+	 * allocated by tree relocation.
+	 *
+	 * If a shared tree block is no longer referenced by its owner
+	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
+	 * use full backrefs for extent pointers in tree block.
+	 *
+	 * If a tree block is been relocating
+	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
+	 * use full backrefs for extent pointers in tree block.
+	 * The reason for this is some operations (such as drop tree)
+	 * are only allowed for blocks use full backrefs.
+	 */
+
+	if (btrfs_block_can_be_shared(root, buf)) {
+		ret = btrfs_lookup_extent_info(trans, root, buf->start,
+					       btrfs_header_level(buf), 1,
+					       &refs, &flags);
+		if (ret)
+			return ret;
+		if (refs == 0) {
+			ret = -EROFS;
+			btrfs_std_error(root->fs_info, ret);
+			return ret;
+		}
+	} else {
+		refs = 1;
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+		else
+			flags = 0;
+	}
+
+	owner = btrfs_header_owner(buf);
+	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
+	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
+	if (refs > 1) {
+		if ((owner == root->root_key.objectid ||
+		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+			ret = btrfs_inc_ref(trans, root, buf, 1);
+			BUG_ON(ret); /* -ENOMEM */
+
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID) {
+				ret = btrfs_dec_ref(trans, root, buf, 0);
+				BUG_ON(ret); /* -ENOMEM */
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+				BUG_ON(ret); /* -ENOMEM */
+			}
+			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+		} else {
+
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID)
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+			else
+				ret = btrfs_inc_ref(trans, root, cow, 0);
+			BUG_ON(ret); /* -ENOMEM */
+		}
+		if (new_flags != 0) {
+			int level = btrfs_header_level(buf);
+
+			ret = btrfs_set_disk_extent_flags(trans, root,
+							  buf->start,
+							  buf->len,
+							  new_flags, level, 0);
+			if (ret)
+				return ret;
+		}
+	} else {
+		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID)
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+			else
+				ret = btrfs_inc_ref(trans, root, cow, 0);
+			BUG_ON(ret); /* -ENOMEM */
+			ret = btrfs_dec_ref(trans, root, buf, 1);
+			BUG_ON(ret); /* -ENOMEM */
+		}
+		clean_tree_block(trans, root->fs_info, buf);
+		*last_ref = 1;
+	}
+	return 0;
+}
+
+/*
+ * does the dirty work in cow of a single block.  The parent block (if
+ * supplied) is updated to point to the new cow copy.  The new buffer is marked
+ * dirty and returned locked.  If you modify the block it needs to be marked
+ * dirty again.
+ *
+ * search_start -- an allocation hint for the new block
+ *
+ * empty_size -- a hint that you plan on doing more cow.  This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
+ */
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct extent_buffer *buf,
+			     struct extent_buffer *parent, int parent_slot,
+			     struct extent_buffer **cow_ret,
+			     u64 search_start, u64 empty_size)
+{
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *cow;
+	int level, ret;
+	int last_ref = 0;
+	int unlock_orig = 0;
+	u64 parent_start;
+
+	if (*cow_ret == buf)
+		unlock_orig = 1;
+
+	btrfs_assert_tree_locked(buf);
+
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->fs_info->running_transaction->transid);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->last_trans);
+
+	level = btrfs_header_level(buf);
+
+	if (level == 0)
+		btrfs_item_key(buf, &disk_key, 0);
+	else
+		btrfs_node_key(buf, &disk_key, 0);
+
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (parent)
+			parent_start = parent->start;
+		else
+			parent_start = 0;
+	} else
+		parent_start = 0;
+
+	cow = btrfs_alloc_tree_block(trans, root, parent_start,
+			root->root_key.objectid, &disk_key, level,
+			search_start, empty_size);
+	if (IS_ERR(cow))
+		return PTR_ERR(cow);
+
+	/* cow is set to blocking by btrfs_init_new_buffer */
+
+	copy_extent_buffer(cow, buf, 0, 0, cow->len);
+	btrfs_set_header_bytenr(cow, cow->start);
+	btrfs_set_header_generation(cow, trans->transid);
+	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+				     BTRFS_HEADER_FLAG_RELOC);
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+	else
+		btrfs_set_header_owner(cow, root->root_key.objectid);
+
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
+			    BTRFS_FSID_SIZE);
+
+	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+		if (ret)
+			return ret;
+	}
+
+	if (buf == root->node) {
+		WARN_ON(parent && parent != buf);
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+			parent_start = buf->start;
+		else
+			parent_start = 0;
+
+		extent_buffer_get(cow);
+		tree_mod_log_set_root_pointer(root, cow, 1);
+		rcu_assign_pointer(root->node, cow);
+
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
+		free_extent_buffer(buf);
+		add_root_to_dirty_list(root);
+	} else {
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+			parent_start = parent->start;
+		else
+			parent_start = 0;
+
+		WARN_ON(trans->transid != btrfs_header_generation(parent));
+		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+					MOD_LOG_KEY_REPLACE, GFP_NOFS);
+		btrfs_set_node_blockptr(parent, parent_slot,
+					cow->start);
+		btrfs_set_node_ptr_generation(parent, parent_slot,
+					      trans->transid);
+		btrfs_mark_buffer_dirty(parent);
+		if (last_ref) {
+			ret = tree_mod_log_free_eb(root->fs_info, buf);
+			if (ret) {
+				btrfs_abort_transaction(trans, root, ret);
+				return ret;
+			}
+		}
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
+	}
+	if (unlock_orig)
+		btrfs_tree_unlock(buf);
+	free_extent_buffer_stale(buf);
+	btrfs_mark_buffer_dirty(cow);
+	*cow_ret = cow;
+	return 0;
+}
+
+/*
+ * returns the logical address of the oldest predecessor of the given root.
+ * entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *eb_root, u64 time_seq)
+{
+	struct tree_mod_elem *tm;
+	struct tree_mod_elem *found = NULL;
+	u64 root_logical = eb_root->start;
+	int looped = 0;
+
+	if (!time_seq)
+		return NULL;
+
+	/*
+	 * the very last operation that's logged for a root is the replacement
+	 * operation (if it is replaced at all). this has the index of the *new*
+	 * root, making it the very first operation that's logged for this root.
+	 */
+	while (1) {
+		tm = tree_mod_log_search_oldest(fs_info, root_logical,
+						time_seq);
+		if (!looped && !tm)
+			return NULL;
+		/*
+		 * if there are no tree operation for the oldest root, we simply
+		 * return it. this should only happen if that (old) root is at
+		 * level 0.
+		 */
+		if (!tm)
+			break;
+
+		/*
+		 * if there's an operation that's not a root replacement, we
+		 * found the oldest version of our root. normally, we'll find a
+		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+		 */
+		if (tm->op != MOD_LOG_ROOT_REPLACE)
+			break;
+
+		found = tm;
+		root_logical = tm->old_root.logical;
+		looped = 1;
+	}
+
+	/* if there's no old root to return, return what we found instead */
+	if (!found)
+		found = tm;
+
+	return found;
+}
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewinded (until we reach something older than
+ * time_seq).
+ */
+static void
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+		      u64 time_seq, struct tree_mod_elem *first_tm)
+{
+	u32 n;
+	struct rb_node *next;
+	struct tree_mod_elem *tm = first_tm;
+	unsigned long o_dst;
+	unsigned long o_src;
+	unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+	n = btrfs_header_nritems(eb);
+	tree_mod_log_read_lock(fs_info);
+	while (tm && tm->seq >= time_seq) {
+		/*
+		 * all the operations are recorded with the operator used for
+		 * the modification. as we're going backwards, we do the
+		 * opposite of each operation here.
+		 */
+		switch (tm->op) {
+		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+			BUG_ON(tm->slot < n);
+			/* Fallthrough */
+		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+		case MOD_LOG_KEY_REMOVE:
+			btrfs_set_node_key(eb, &tm->key, tm->slot);
+			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+			btrfs_set_node_ptr_generation(eb, tm->slot,
+						      tm->generation);
+			n++;
+			break;
+		case MOD_LOG_KEY_REPLACE:
+			BUG_ON(tm->slot >= n);
+			btrfs_set_node_key(eb, &tm->key, tm->slot);
+			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+			btrfs_set_node_ptr_generation(eb, tm->slot,
+						      tm->generation);
+			break;
+		case MOD_LOG_KEY_ADD:
+			/* if a move operation is needed it's in the log */
+			n--;
+			break;
+		case MOD_LOG_MOVE_KEYS:
+			o_dst = btrfs_node_key_ptr_offset(tm->slot);
+			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+			memmove_extent_buffer(eb, o_dst, o_src,
+					      tm->move.nr_items * p_size);
+			break;
+		case MOD_LOG_ROOT_REPLACE:
+			/*
+			 * this operation is special. for roots, this must be
+			 * handled explicitly before rewinding.
+			 * for non-roots, this operation may exist if the node
+			 * was a root: root A -> child B; then A gets empty and
+			 * B is promoted to the new root. in the mod log, we'll
+			 * have a root-replace operation for B, a tree block
+			 * that is no root. we simply ignore that operation.
+			 */
+			break;
+		}
+		next = rb_next(&tm->node);
+		if (!next)
+			break;
+		tm = container_of(next, struct tree_mod_elem, node);
+		if (tm->index != first_tm->index)
+			break;
+	}
+	tree_mod_log_read_unlock(fs_info);
+	btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+		    struct extent_buffer *eb, u64 time_seq)
+{
+	struct extent_buffer *eb_rewin;
+	struct tree_mod_elem *tm;
+
+	if (!time_seq)
+		return eb;
+
+	if (btrfs_header_level(eb) == 0)
+		return eb;
+
+	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+	if (!tm)
+		return eb;
+
+	btrfs_set_path_blocking(path);
+	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+		BUG_ON(tm->slot != 0);
+		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+		if (!eb_rewin) {
+			btrfs_tree_read_unlock_blocking(eb);
+			free_extent_buffer(eb);
+			return NULL;
+		}
+		btrfs_set_header_bytenr(eb_rewin, eb->start);
+		btrfs_set_header_backref_rev(eb_rewin,
+					     btrfs_header_backref_rev(eb));
+		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+	} else {
+		eb_rewin = btrfs_clone_extent_buffer(eb);
+		if (!eb_rewin) {
+			btrfs_tree_read_unlock_blocking(eb);
+			free_extent_buffer(eb);
+			return NULL;
+		}
+	}
+
+	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
+	btrfs_tree_read_unlock_blocking(eb);
+	free_extent_buffer(eb);
+
+	extent_buffer_get(eb_rewin);
+	btrfs_tree_read_lock(eb_rewin);
+	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
+	WARN_ON(btrfs_header_nritems(eb_rewin) >
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
+
+	return eb_rewin;
+}
+
+/*
+ * get_old_root() rewinds the state of @root's root node to the given @time_seq
+ * value. If there are no changes, the current root->root_node is returned. If
+ * anything changed in between, there's a fresh buffer allocated on which the
+ * rewind operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+	struct tree_mod_elem *tm;
+	struct extent_buffer *eb = NULL;
+	struct extent_buffer *eb_root;
+	struct extent_buffer *old;
+	struct tree_mod_root *old_root = NULL;
+	u64 old_generation = 0;
+	u64 logical;
+
+	eb_root = btrfs_read_lock_root_node(root);
+	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+	if (!tm)
+		return eb_root;
+
+	if (tm->op == MOD_LOG_ROOT_REPLACE) {
+		old_root = &tm->old_root;
+		old_generation = tm->generation;
+		logical = old_root->logical;
+	} else {
+		logical = eb_root->start;
+	}
+
+	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
+	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+		btrfs_tree_read_unlock(eb_root);
+		free_extent_buffer(eb_root);
+		old = read_tree_block(root, logical, 0);
+		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
+			free_extent_buffer(old);
+			btrfs_warn(root->fs_info,
+				"failed to read tree block %llu from get_old_root", logical);
+		} else {
+			eb = btrfs_clone_extent_buffer(old);
+			free_extent_buffer(old);
+		}
+	} else if (old_root) {
+		btrfs_tree_read_unlock(eb_root);
+		free_extent_buffer(eb_root);
+		eb = alloc_dummy_extent_buffer(root->fs_info, logical);
+	} else {
+		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+		eb = btrfs_clone_extent_buffer(eb_root);
+		btrfs_tree_read_unlock_blocking(eb_root);
+		free_extent_buffer(eb_root);
+	}
+
+	if (!eb)
+		return NULL;
+	extent_buffer_get(eb);
+	btrfs_tree_read_lock(eb);
+	if (old_root) {
+		btrfs_set_header_bytenr(eb, eb->start);
+		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+		btrfs_set_header_level(eb, old_root->level);
+		btrfs_set_header_generation(eb, old_generation);
+	}
+	if (tm)
+		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
+	else
+		WARN_ON(btrfs_header_level(eb) != 0);
+	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
+
+	return eb;
+}
+
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+	struct tree_mod_elem *tm;
+	int level;
+	struct extent_buffer *eb_root = btrfs_root_node(root);
+
+	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
+		level = tm->old_root.level;
+	} else {
+		level = btrfs_header_level(eb_root);
+	}
+	free_extent_buffer(eb_root);
+
+	return level;
+}
+
+static inline int should_cow_block(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct extent_buffer *buf)
+{
+	if (btrfs_test_is_dummy_root(root))
+		return 0;
+
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
+	if (btrfs_header_generation(buf) == trans->transid &&
+	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
+	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
+		return 0;
+	return 1;
+}
+
+/*
+ * cows a single block, see __btrfs_cow_block for the real work.
+ * This version of it has extra checks so that a block isn't cow'd more than
+ * once per transaction, as long as it hasn't been written yet
+ */
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct extent_buffer *buf,
+		    struct extent_buffer *parent, int parent_slot,
+		    struct extent_buffer **cow_ret)
+{
+	u64 search_start;
+	int ret;
+
+	if (trans->transaction != root->fs_info->running_transaction)
+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
+		       trans->transid,
+		       root->fs_info->running_transaction->transid);
+
+	if (trans->transid != root->fs_info->generation)
+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
+		       trans->transid, root->fs_info->generation);
+
+	if (!should_cow_block(trans, root, buf)) {
+		*cow_ret = buf;
+		return 0;
+	}
+
+	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+	if (parent)
+		btrfs_set_lock_blocking(parent);
+	btrfs_set_lock_blocking(buf);
+
+	ret = __btrfs_cow_block(trans, root, buf, parent,
+				 parent_slot, cow_ret, search_start, 0);
+
+	trace_btrfs_cow_block(root, buf, *cow_ret);
+
+	return ret;
+}
+
+/*
+ * helper function for defrag to decide if two blocks pointed to by a
+ * node are actually close by
+ */
+static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+	if (blocknr < other && other - (blocknr + blocksize) < 32768)
+		return 1;
+	if (blocknr > other && blocknr - (other + blocksize) < 32768)
+		return 1;
+	return 0;
+}
+
+/*
+ * compare two keys in a memcmp fashion
+ */
+static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
+{
+	struct btrfs_key k1;
+
+	btrfs_disk_key_to_cpu(&k1, disk);
+
+	return btrfs_comp_cpu_keys(&k1, k2);
+}
+
+/*
+ * same as comp_keys only with two btrfs_key's
+ */
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{
+	if (k1->objectid > k2->objectid)
+		return 1;
+	if (k1->objectid < k2->objectid)
+		return -1;
+	if (k1->type > k2->type)
+		return 1;
+	if (k1->type < k2->type)
+		return -1;
+	if (k1->offset > k2->offset)
+		return 1;
+	if (k1->offset < k2->offset)
+		return -1;
+	return 0;
+}
+
+/*
+ * this is used by the defrag code to go through all the
+ * leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order
+ */
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct extent_buffer *parent,
+		       int start_slot, u64 *last_ret,
+		       struct btrfs_key *progress)
+{
+	struct extent_buffer *cur;
+	u64 blocknr;
+	u64 gen;
+	u64 search_start = *last_ret;
+	u64 last_block = 0;
+	u64 other;
+	u32 parent_nritems;
+	int end_slot;
+	int i;
+	int err = 0;
+	int parent_level;
+	int uptodate;
+	u32 blocksize;
+	int progress_passed = 0;
+	struct btrfs_disk_key disk_key;
+
+	parent_level = btrfs_header_level(parent);
+
+	WARN_ON(trans->transaction != root->fs_info->running_transaction);
+	WARN_ON(trans->transid != root->fs_info->generation);
+
+	parent_nritems = btrfs_header_nritems(parent);
+	blocksize = root->nodesize;
+	end_slot = parent_nritems - 1;
+
+	if (parent_nritems <= 1)
+		return 0;
+
+	btrfs_set_lock_blocking(parent);
+
+	for (i = start_slot; i <= end_slot; i++) {
+		int close = 1;
+
+		btrfs_node_key(parent, &disk_key, i);
+		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
+			continue;
+
+		progress_passed = 1;
+		blocknr = btrfs_node_blockptr(parent, i);
+		gen = btrfs_node_ptr_generation(parent, i);
+		if (last_block == 0)
+			last_block = blocknr;
+
+		if (i > 0) {
+			other = btrfs_node_blockptr(parent, i - 1);
+			close = close_blocks(blocknr, other, blocksize);
+		}
+		if (!close && i < end_slot) {
+			other = btrfs_node_blockptr(parent, i + 1);
+			close = close_blocks(blocknr, other, blocksize);
+		}
+		if (close) {
+			last_block = blocknr;
+			continue;
+		}
+
+		cur = btrfs_find_tree_block(root->fs_info, blocknr);
+		if (cur)
+			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
+		else
+			uptodate = 0;
+		if (!cur || !uptodate) {
+			if (!cur) {
+				cur = read_tree_block(root, blocknr, gen);
+				if (!cur || !extent_buffer_uptodate(cur)) {
+					free_extent_buffer(cur);
+					return -EIO;
+				}
+			} else if (!uptodate) {
+				err = btrfs_read_buffer(cur, gen);
+				if (err) {
+					free_extent_buffer(cur);
+					return err;
+				}
+			}
+		}
+		if (search_start == 0)
+			search_start = last_block;
+
+		btrfs_tree_lock(cur);
+		btrfs_set_lock_blocking(cur);
+		err = __btrfs_cow_block(trans, root, cur, parent, i,
+					&cur, search_start,
+					min(16 * blocksize,
+					    (end_slot - i) * blocksize));
+		if (err) {
+			btrfs_tree_unlock(cur);
+			free_extent_buffer(cur);
+			break;
+		}
+		search_start = cur->start;
+		last_block = cur->start;
+		*last_ret = search_start;
+		btrfs_tree_unlock(cur);
+		free_extent_buffer(cur);
+	}
+	return err;
+}
+
+/*
+ * The leaf data grows from end-to-front in the node.
+ * this returns the address of the start of the last item,
+ * which is the stop of the leaf data stack
+ */
+static inline unsigned int leaf_data_end(struct btrfs_root *root,
+					 struct extent_buffer *leaf)
+{
+	u32 nr = btrfs_header_nritems(leaf);
+	if (nr == 0)
+		return BTRFS_LEAF_DATA_SIZE(root);
+	return btrfs_item_offset_nr(leaf, nr - 1);
+}
+
+
+/*
+ * search for key in the extent_buffer.  The items start at offset p,
+ * and they are item_size apart.  There are 'max' items in p.
+ *
+ * the slot in the array is returned via slot, and it points to
+ * the place where you would insert key if it is not found in
+ * the array.
+ *
+ * slot may point to max if the key is bigger than all of the keys
+ */
+static noinline int generic_bin_search(struct extent_buffer *eb,
+				       unsigned long p,
+				       int item_size, struct btrfs_key *key,
+				       int max, int *slot)
+{
+	int low = 0;
+	int high = max;
+	int mid;
+	int ret;
+	struct btrfs_disk_key *tmp = NULL;
+	struct btrfs_disk_key unaligned;
+	unsigned long offset;
+	char *kaddr = NULL;
+	unsigned long map_start = 0;
+	unsigned long map_len = 0;
+	int err;
+
+	while (low < high) {
+		mid = (low + high) / 2;
+		offset = p + mid * item_size;
+
+		if (!kaddr || offset < map_start ||
+		    (offset + sizeof(struct btrfs_disk_key)) >
+		    map_start + map_len) {
+
+			err = map_private_extent_buffer(eb, offset,
+						sizeof(struct btrfs_disk_key),
+						&kaddr, &map_start, &map_len);
+
+			if (!err) {
+				tmp = (struct btrfs_disk_key *)(kaddr + offset -
+							map_start);
+			} else {
+				read_extent_buffer(eb, &unaligned,
+						   offset, sizeof(unaligned));
+				tmp = &unaligned;
+			}
+
+		} else {
+			tmp = (struct btrfs_disk_key *)(kaddr + offset -
+							map_start);
+		}
+		ret = comp_keys(tmp, key);
+
+		if (ret < 0)
+			low = mid + 1;
+		else if (ret > 0)
+			high = mid;
+		else {
+			*slot = mid;
+			return 0;
+		}
+	}
+	*slot = low;
+	return 1;
+}
+
+/*
+ * simple bin_search frontend that does the right thing for
+ * leaves vs nodes
+ */
+static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+		      int level, int *slot)
+{
+	if (level == 0)
+		return generic_bin_search(eb,
+					  offsetof(struct btrfs_leaf, items),
+					  sizeof(struct btrfs_item),
+					  key, btrfs_header_nritems(eb),
+					  slot);
+	else
+		return generic_bin_search(eb,
+					  offsetof(struct btrfs_node, ptrs),
+					  sizeof(struct btrfs_key_ptr),
+					  key, btrfs_header_nritems(eb),
+					  slot);
+}
+
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+		     int level, int *slot)
+{
+	return bin_search(eb, key, level, slot);
+}
+
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) + size);
+	spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) - size);
+	spin_unlock(&root->accounting_lock);
+}
+
+/* given a node and slot number, this reads the blocks it points to.  The
+ * extent buffer is returned with a reference taken (but unlocked).
+ * NULL is returned on error.
+ */
+static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
+				   struct extent_buffer *parent, int slot)
+{
+	int level = btrfs_header_level(parent);
+	struct extent_buffer *eb;
+
+	if (slot < 0)
+		return NULL;
+	if (slot >= btrfs_header_nritems(parent))
+		return NULL;
+
+	BUG_ON(level == 0);
+
+	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
+			     btrfs_node_ptr_generation(parent, slot));
+	if (eb && !extent_buffer_uptodate(eb)) {
+		free_extent_buffer(eb);
+		eb = NULL;
+	}
+
+	return eb;
+}
+
+/*
+ * node level balancing, used to make sure nodes are in proper order for
+ * item deletion.  We balance from the top down, so we have to make sure
+ * that a deletion won't leave an node completely empty later on.
+ */
+static noinline int balance_level(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 struct btrfs_path *path, int level)
+{
+	struct extent_buffer *right = NULL;
+	struct extent_buffer *mid;
+	struct extent_buffer *left = NULL;
+	struct extent_buffer *parent = NULL;
+	int ret = 0;
+	int wret;
+	int pslot;
+	int orig_slot = path->slots[level];
+	u64 orig_ptr;
+
+	if (level == 0)
+		return 0;
+
+	mid = path->nodes[level];
+
+	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
+		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
+	WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
+	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+
+	if (level < BTRFS_MAX_LEVEL - 1) {
+		parent = path->nodes[level + 1];
+		pslot = path->slots[level + 1];
+	}
+
+	/*
+	 * deal with the case where there is only one pointer in the root
+	 * by promoting the node below to a root
+	 */
+	if (!parent) {
+		struct extent_buffer *child;
+
+		if (btrfs_header_nritems(mid) != 1)
+			return 0;
+
+		/* promote the child to a root */
+		child = read_node_slot(root, mid, 0);
+		if (!child) {
+			ret = -EROFS;
+			btrfs_std_error(root->fs_info, ret);
+			goto enospc;
+		}
+
+		btrfs_tree_lock(child);
+		btrfs_set_lock_blocking(child);
+		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+		if (ret) {
+			btrfs_tree_unlock(child);
+			free_extent_buffer(child);
+			goto enospc;
+		}
+
+		tree_mod_log_set_root_pointer(root, child, 1);
+		rcu_assign_pointer(root->node, child);
+
+		add_root_to_dirty_list(root);
+		btrfs_tree_unlock(child);
+
+		path->locks[level] = 0;
+		path->nodes[level] = NULL;
+		clean_tree_block(trans, root->fs_info, mid);
+		btrfs_tree_unlock(mid);
+		/* once for the path */
+		free_extent_buffer(mid);
+
+		root_sub_used(root, mid->len);
+		btrfs_free_tree_block(trans, root, mid, 0, 1);
+		/* once for the root ptr */
+		free_extent_buffer_stale(mid);
+		return 0;
+	}
+	if (btrfs_header_nritems(mid) >
+	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
+		return 0;
+
+	left = read_node_slot(root, parent, pslot - 1);
+	if (left) {
+		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
+		wret = btrfs_cow_block(trans, root, left,
+				       parent, pslot - 1, &left);
+		if (wret) {
+			ret = wret;
+			goto enospc;
+		}
+	}
+	right = read_node_slot(root, parent, pslot + 1);
+	if (right) {
+		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
+		wret = btrfs_cow_block(trans, root, right,
+				       parent, pslot + 1, &right);
+		if (wret) {
+			ret = wret;
+			goto enospc;
+		}
+	}
+
+	/* first, try to make some room in the middle buffer */
+	if (left) {
+		orig_slot += btrfs_header_nritems(left);
+		wret = push_node_left(trans, root, left, mid, 1);
+		if (wret < 0)
+			ret = wret;
+	}
+
+	/*
+	 * then try to empty the right most buffer into the middle
+	 */
+	if (right) {
+		wret = push_node_left(trans, root, mid, right, 1);
+		if (wret < 0 && wret != -ENOSPC)
+			ret = wret;
+		if (btrfs_header_nritems(right) == 0) {
+			clean_tree_block(trans, root->fs_info, right);
+			btrfs_tree_unlock(right);
+			del_ptr(root, path, level + 1, pslot + 1);
+			root_sub_used(root, right->len);
+			btrfs_free_tree_block(trans, root, right, 0, 1);
+			free_extent_buffer_stale(right);
+			right = NULL;
+		} else {
+			struct btrfs_disk_key right_key;
+			btrfs_node_key(right, &right_key, 0);
+			tree_mod_log_set_node_key(root->fs_info, parent,
+						  pslot + 1, 0);
+			btrfs_set_node_key(parent, &right_key, pslot + 1);
+			btrfs_mark_buffer_dirty(parent);
+		}
+	}
+	if (btrfs_header_nritems(mid) == 1) {
+		/*
+		 * we're not allowed to leave a node with one item in the
+		 * tree during a delete.  A deletion from lower in the tree
+		 * could try to delete the only pointer in this node.
+		 * So, pull some keys from the left.
+		 * There has to be a left pointer at this point because
+		 * otherwise we would have pulled some pointers from the
+		 * right
+		 */
+		if (!left) {
+			ret = -EROFS;
+			btrfs_std_error(root->fs_info, ret);
+			goto enospc;
+		}
+		wret = balance_node_right(trans, root, mid, left);
+		if (wret < 0) {
+			ret = wret;
+			goto enospc;
+		}
+		if (wret == 1) {
+			wret = push_node_left(trans, root, left, mid, 1);
+			if (wret < 0)
+				ret = wret;
+		}
+		BUG_ON(wret == 1);
+	}
+	if (btrfs_header_nritems(mid) == 0) {
+		clean_tree_block(trans, root->fs_info, mid);
+		btrfs_tree_unlock(mid);
+		del_ptr(root, path, level + 1, pslot);
+		root_sub_used(root, mid->len);
+		btrfs_free_tree_block(trans, root, mid, 0, 1);
+		free_extent_buffer_stale(mid);
+		mid = NULL;
+	} else {
+		/* update the parent key to reflect our changes */
+		struct btrfs_disk_key mid_key;
+		btrfs_node_key(mid, &mid_key, 0);
+		tree_mod_log_set_node_key(root->fs_info, parent,
+					  pslot, 0);
+		btrfs_set_node_key(parent, &mid_key, pslot);
+		btrfs_mark_buffer_dirty(parent);
+	}
+
+	/* update the path */
+	if (left) {
+		if (btrfs_header_nritems(left) > orig_slot) {
+			extent_buffer_get(left);
+			/* left was locked after cow */
+			path->nodes[level] = left;
+			path->slots[level + 1] -= 1;
+			path->slots[level] = orig_slot;
+			if (mid) {
+				btrfs_tree_unlock(mid);
+				free_extent_buffer(mid);
+			}
+		} else {
+			orig_slot -= btrfs_header_nritems(left);
+			path->slots[level] = orig_slot;
+		}
+	}
+	/* double check we haven't messed things up */
+	if (orig_ptr !=
+	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
+		BUG();
+enospc:
+	if (right) {
+		btrfs_tree_unlock(right);
+		free_extent_buffer(right);
+	}
+	if (left) {
+		if (path->nodes[level] != left)
+			btrfs_tree_unlock(left);
+		free_extent_buffer(left);
+	}
+	return ret;
+}
+
+/* Node balancing for insertion.  Here we only split or push nodes around
+ * when they are completely full.  This is also done top down, so we
+ * have to be pessimistic.
+ */
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, int level)
+{
+	struct extent_buffer *right = NULL;
+	struct extent_buffer *mid;
+	struct extent_buffer *left = NULL;
+	struct extent_buffer *parent = NULL;
+	int ret = 0;
+	int wret;
+	int pslot;
+	int orig_slot = path->slots[level];
+
+	if (level == 0)
+		return 1;
+
+	mid = path->nodes[level];
+	WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
+	if (level < BTRFS_MAX_LEVEL - 1) {
+		parent = path->nodes[level + 1];
+		pslot = path->slots[level + 1];
+	}
+
+	if (!parent)
+		return 1;
+
+	left = read_node_slot(root, parent, pslot - 1);
+
+	/* first, try to make some room in the middle buffer */
+	if (left) {
+		u32 left_nr;
+
+		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
+
+		left_nr = btrfs_header_nritems(left);
+		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+			wret = 1;
+		} else {
+			ret = btrfs_cow_block(trans, root, left, parent,
+					      pslot - 1, &left);
+			if (ret)
+				wret = 1;
+			else {
+				wret = push_node_left(trans, root,
+						      left, mid, 0);
+			}
+		}
+		if (wret < 0)
+			ret = wret;
+		if (wret == 0) {
+			struct btrfs_disk_key disk_key;
+			orig_slot += left_nr;
+			btrfs_node_key(mid, &disk_key, 0);
+			tree_mod_log_set_node_key(root->fs_info, parent,
+						  pslot, 0);
+			btrfs_set_node_key(parent, &disk_key, pslot);
+			btrfs_mark_buffer_dirty(parent);
+			if (btrfs_header_nritems(left) > orig_slot) {
+				path->nodes[level] = left;
+				path->slots[level + 1] -= 1;
+				path->slots[level] = orig_slot;
+				btrfs_tree_unlock(mid);
+				free_extent_buffer(mid);
+			} else {
+				orig_slot -=
+					btrfs_header_nritems(left);
+				path->slots[level] = orig_slot;
+				btrfs_tree_unlock(left);
+				free_extent_buffer(left);
+			}
+			return 0;
+		}
+		btrfs_tree_unlock(left);
+		free_extent_buffer(left);
+	}
+	right = read_node_slot(root, parent, pslot + 1);
+
+	/*
+	 * then try to empty the right most buffer into the middle
+	 */
+	if (right) {
+		u32 right_nr;
+
+		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
+
+		right_nr = btrfs_header_nritems(right);
+		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+			wret = 1;
+		} else {
+			ret = btrfs_cow_block(trans, root, right,
+					      parent, pslot + 1,
+					      &right);
+			if (ret)
+				wret = 1;
+			else {
+				wret = balance_node_right(trans, root,
+							  right, mid);
+			}
+		}
+		if (wret < 0)
+			ret = wret;
+		if (wret == 0) {
+			struct btrfs_disk_key disk_key;
+
+			btrfs_node_key(right, &disk_key, 0);
+			tree_mod_log_set_node_key(root->fs_info, parent,
+						  pslot + 1, 0);
+			btrfs_set_node_key(parent, &disk_key, pslot + 1);
+			btrfs_mark_buffer_dirty(parent);
+
+			if (btrfs_header_nritems(mid) <= orig_slot) {
+				path->nodes[level] = right;
+				path->slots[level + 1] += 1;
+				path->slots[level] = orig_slot -
+					btrfs_header_nritems(mid);
+				btrfs_tree_unlock(mid);
+				free_extent_buffer(mid);
+			} else {
+				btrfs_tree_unlock(right);
+				free_extent_buffer(right);
+			}
+			return 0;
+		}
+		btrfs_tree_unlock(right);
+		free_extent_buffer(right);
+	}
+	return 1;
+}
+
+/*
+ * readahead one full node of leaves, finding things that are close
+ * to the block in 'slot', and triggering ra on them.
+ */
+static void reada_for_search(struct btrfs_root *root,
+			     struct btrfs_path *path,
+			     int level, int slot, u64 objectid)
+{
+	struct extent_buffer *node;
+	struct btrfs_disk_key disk_key;
+	u32 nritems;
+	u64 search;
+	u64 target;
+	u64 nread = 0;
+	u64 gen;
+	int direction = path->reada;
+	struct extent_buffer *eb;
+	u32 nr;
+	u32 blocksize;
+	u32 nscan = 0;
+
+	if (level != 1)
+		return;
+
+	if (!path->nodes[level])
+		return;
+
+	node = path->nodes[level];
+
+	search = btrfs_node_blockptr(node, slot);
+	blocksize = root->nodesize;
+	eb = btrfs_find_tree_block(root->fs_info, search);
+	if (eb) {
+		free_extent_buffer(eb);
+		return;
+	}
+
+	target = search;
+
+	nritems = btrfs_header_nritems(node);
+	nr = slot;
+
+	while (1) {
+		if (direction < 0) {
+			if (nr == 0)
+				break;
+			nr--;
+		} else if (direction > 0) {
+			nr++;
+			if (nr >= nritems)
+				break;
+		}
+		if (path->reada < 0 && objectid) {
+			btrfs_node_key(node, &disk_key, nr);
+			if (btrfs_disk_key_objectid(&disk_key) != objectid)
+				break;
+		}
+		search = btrfs_node_blockptr(node, nr);
+		if ((search <= target && target - search <= 65536) ||
+		    (search > target && search - target <= 65536)) {
+			gen = btrfs_node_ptr_generation(node, nr);
+			readahead_tree_block(root, search);
+			nread += blocksize;
+		}
+		nscan++;
+		if ((nread > 65536 || nscan > 32))
+			break;
+	}
+}
+
+static noinline void reada_for_balance(struct btrfs_root *root,
+				       struct btrfs_path *path, int level)
+{
+	int slot;
+	int nritems;
+	struct extent_buffer *parent;
+	struct extent_buffer *eb;
+	u64 gen;
+	u64 block1 = 0;
+	u64 block2 = 0;
+
+	parent = path->nodes[level + 1];
+	if (!parent)
+		return;
+
+	nritems = btrfs_header_nritems(parent);
+	slot = path->slots[level + 1];
+
+	if (slot > 0) {
+		block1 = btrfs_node_blockptr(parent, slot - 1);
+		gen = btrfs_node_ptr_generation(parent, slot - 1);
+		eb = btrfs_find_tree_block(root->fs_info, block1);
+		/*
+		 * if we get -eagain from btrfs_buffer_uptodate, we
+		 * don't want to return eagain here.  That will loop
+		 * forever
+		 */
+		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
+			block1 = 0;
+		free_extent_buffer(eb);
+	}
+	if (slot + 1 < nritems) {
+		block2 = btrfs_node_blockptr(parent, slot + 1);
+		gen = btrfs_node_ptr_generation(parent, slot + 1);
+		eb = btrfs_find_tree_block(root->fs_info, block2);
+		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
+			block2 = 0;
+		free_extent_buffer(eb);
+	}
+
+	if (block1)
+		readahead_tree_block(root, block1);
+	if (block2)
+		readahead_tree_block(root, block2);
+}
+
+
+/*
+ * when we walk down the tree, it is usually safe to unlock the higher layers
+ * in the tree.  The exceptions are when our path goes through slot 0, because
+ * operations on the tree might require changing key pointers higher up in the
+ * tree.
+ *
+ * callers might also have set path->keep_locks, which tells this code to keep
+ * the lock if the path points to the last slot in the block.  This is part of
+ * walking through the tree, and selecting the next slot in the higher block.
+ *
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
+ * if lowest_unlock is 1, level 0 won't be unlocked
+ */
+static noinline void unlock_up(struct btrfs_path *path, int level,
+			       int lowest_unlock, int min_write_lock_level,
+			       int *write_lock_level)
+{
+	int i;
+	int skip_level = level;
+	int no_skips = 0;
+	struct extent_buffer *t;
+
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		if (!path->nodes[i])
+			break;
+		if (!path->locks[i])
+			break;
+		if (!no_skips && path->slots[i] == 0) {
+			skip_level = i + 1;
+			continue;
+		}
+		if (!no_skips && path->keep_locks) {
+			u32 nritems;
+			t = path->nodes[i];
+			nritems = btrfs_header_nritems(t);
+			if (nritems < 1 || path->slots[i] >= nritems - 1) {
+				skip_level = i + 1;
+				continue;
+			}
+		}
+		if (skip_level < i && i >= lowest_unlock)
+			no_skips = 1;
+
+		t = path->nodes[i];
+		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+			btrfs_tree_unlock_rw(t, path->locks[i]);
+			path->locks[i] = 0;
+			if (write_lock_level &&
+			    i > min_write_lock_level &&
+			    i <= *write_lock_level) {
+				*write_lock_level = i - 1;
+			}
+		}
+	}
+}
+
+/*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node.  This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+	int i;
+
+	if (path->keep_locks)
+		return;
+
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		if (!path->nodes[i])
+			continue;
+		if (!path->locks[i])
+			continue;
+		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+		path->locks[i] = 0;
+	}
+}
+
+/*
+ * helper function for btrfs_search_slot.  The goal is to find a block
+ * in cache without setting the path to blocking.  If we find the block
+ * we return zero and the path is unchanged.
+ *
+ * If we can't find the block, we set the path blocking and do some
+ * reada.  -EAGAIN is returned and the search must be repeated.
+ */
+static int
+read_block_for_search(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct btrfs_path *p,
+		       struct extent_buffer **eb_ret, int level, int slot,
+		       struct btrfs_key *key, u64 time_seq)
+{
+	u64 blocknr;
+	u64 gen;
+	struct extent_buffer *b = *eb_ret;
+	struct extent_buffer *tmp;
+	int ret;
+
+	blocknr = btrfs_node_blockptr(b, slot);
+	gen = btrfs_node_ptr_generation(b, slot);
+
+	tmp = btrfs_find_tree_block(root->fs_info, blocknr);
+	if (tmp) {
+		/* first we do an atomic uptodate check */
+		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+			*eb_ret = tmp;
+			return 0;
+		}
+
+		/* the pages were up to date, but we failed
+		 * the generation number check.  Do a full
+		 * read for the generation number that is correct.
+		 * We must do this without dropping locks so
+		 * we can trust our generation number
+		 */
+		btrfs_set_path_blocking(p);
+
+		/* now we're allowed to do a blocking uptodate check */
+		ret = btrfs_read_buffer(tmp, gen);
+		if (!ret) {
+			*eb_ret = tmp;
+			return 0;
+		}
+		free_extent_buffer(tmp);
+		btrfs_release_path(p);
+		return -EIO;
+	}
+
+	/*
+	 * reduce lock contention at high levels
+	 * of the btree by dropping locks before
+	 * we read.  Don't release the lock on the current
+	 * level because we need to walk this node to figure
+	 * out which blocks to read.
+	 */
+	btrfs_unlock_up_safe(p, level + 1);
+	btrfs_set_path_blocking(p);
+
+	free_extent_buffer(tmp);
+	if (p->reada)
+		reada_for_search(root, p, level, slot, key->objectid);
+
+	btrfs_release_path(p);
+
+	ret = -EAGAIN;
+	tmp = read_tree_block(root, blocknr, 0);
+	if (tmp) {
+		/*
+		 * If the read above didn't mark this buffer up to date,
+		 * it will never end up being up to date.  Set ret to EIO now
+		 * and give up so that our caller doesn't loop forever
+		 * on our EAGAINs.
+		 */
+		if (!btrfs_buffer_uptodate(tmp, 0, 0))
+			ret = -EIO;
+		free_extent_buffer(tmp);
+	}
+	return ret;
+}
+
+/*
+ * helper function for btrfs_search_slot.  This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned.  If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct btrfs_path *p,
+		       struct extent_buffer *b, int level, int ins_len,
+		       int *write_lock_level)
+{
+	int ret;
+	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+		int sret;
+
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
+		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
+		sret = split_node(trans, root, p, level);
+		btrfs_clear_path_blocking(p, NULL, 0);
+
+		BUG_ON(sret > 0);
+		if (sret) {
+			ret = sret;
+			goto done;
+		}
+		b = p->nodes[level];
+	} else if (ins_len < 0 && btrfs_header_nritems(b) <
+		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
+		int sret;
+
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
+		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
+		sret = balance_level(trans, root, p, level);
+		btrfs_clear_path_blocking(p, NULL, 0);
+
+		if (sret) {
+			ret = sret;
+			goto done;
+		}
+		b = p->nodes[level];
+		if (!b) {
+			btrfs_release_path(p);
+			goto again;
+		}
+		BUG_ON(btrfs_header_nritems(b) == 1);
+	}
+	return 0;
+
+again:
+	ret = -EAGAIN;
+done:
+	return ret;
+}
+
+static void key_search_validate(struct extent_buffer *b,
+				struct btrfs_key *key,
+				int level)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+	struct btrfs_disk_key disk_key;
+
+	btrfs_cpu_key_to_disk(&disk_key, key);
+
+	if (level == 0)
+		ASSERT(!memcmp_extent_buffer(b, &disk_key,
+		    offsetof(struct btrfs_leaf, items[0].key),
+		    sizeof(disk_key)));
+	else
+		ASSERT(!memcmp_extent_buffer(b, &disk_key,
+		    offsetof(struct btrfs_node, ptrs[0].key),
+		    sizeof(disk_key)));
+#endif
+}
+
+static int key_search(struct extent_buffer *b, struct btrfs_key *key,
+		      int level, int *prev_cmp, int *slot)
+{
+	if (*prev_cmp != 0) {
+		*prev_cmp = bin_search(b, key, level, slot);
+		return *prev_cmp;
+	}
+
+	key_search_validate(b, key, level);
+	*slot = 0;
+
+	return 0;
+}
+
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+		u64 iobjectid, u64 ioff, u8 key_type,
+		struct btrfs_key *found_key)
+{
+	int ret;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+
+	ASSERT(path);
+	ASSERT(found_key);
+
+	key.type = key_type;
+	key.objectid = iobjectid;
+	key.offset = ioff;
+
+	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	eb = path->nodes[0];
+	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
+		ret = btrfs_next_leaf(fs_root, path);
+		if (ret)
+			return ret;
+		eb = path->nodes[0];
+	}
+
+	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
+	if (found_key->type != key.type ||
+			found_key->objectid != key.objectid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * look for key in the tree.  path is filled in with nodes along the way
+ * if key is found, we return zero and you can find the item in the leaf
+ * level of the path (level 0)
+ *
+ * If the key isn't found, the path points to the slot where it should
+ * be inserted, and 1 is returned.  If there are other errors during the
+ * search a negative error number is returned.
+ *
+ * if ins_len > 0, nodes and leaves will be split as we walk down the
+ * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
+ * possible)
+ */
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_path *p, int
+		      ins_len, int cow)
+{
+	struct extent_buffer *b;
+	int slot;
+	int ret;
+	int err;
+	int level;
+	int lowest_unlock = 1;
+	int root_lock;
+	/* everything at write_lock_level or lower must be write locked */
+	int write_lock_level = 0;
+	u8 lowest_level = 0;
+	int min_write_lock_level;
+	int prev_cmp;
+
+	lowest_level = p->lowest_level;
+	WARN_ON(lowest_level && ins_len > 0);
+	WARN_ON(p->nodes[0] != NULL);
+	BUG_ON(!cow && ins_len);
+
+	if (ins_len < 0) {
+		lowest_unlock = 2;
+
+		/* when we are removing items, we might have to go up to level
+		 * two as we update tree pointers  Make sure we keep write
+		 * for those levels as well
+		 */
+		write_lock_level = 2;
+	} else if (ins_len > 0) {
+		/*
+		 * for inserting items, make sure we have a write lock on
+		 * level 1 so we can update keys
+		 */
+		write_lock_level = 1;
+	}
+
+	if (!cow)
+		write_lock_level = -1;
+
+	if (cow && (p->keep_locks || p->lowest_level))
+		write_lock_level = BTRFS_MAX_LEVEL;
+
+	min_write_lock_level = write_lock_level;
+
+again:
+	prev_cmp = -1;
+	/*
+	 * we try very hard to do read locks on the root
+	 */
+	root_lock = BTRFS_READ_LOCK;
+	level = 0;
+	if (p->search_commit_root) {
+		/*
+		 * the commit roots are read only
+		 * so we always do read locks
+		 */
+		if (p->need_commit_sem)
+			down_read(&root->fs_info->commit_root_sem);
+		b = root->commit_root;
+		extent_buffer_get(b);
+		level = btrfs_header_level(b);
+		if (p->need_commit_sem)
+			up_read(&root->fs_info->commit_root_sem);
+		if (!p->skip_locking)
+			btrfs_tree_read_lock(b);
+	} else {
+		if (p->skip_locking) {
+			b = btrfs_root_node(root);
+			level = btrfs_header_level(b);
+		} else {
+			/* we don't know the level of the root node
+			 * until we actually have it read locked
+			 */
+			b = btrfs_read_lock_root_node(root);
+			level = btrfs_header_level(b);
+			if (level <= write_lock_level) {
+				/* whoops, must trade for write lock */
+				btrfs_tree_read_unlock(b);
+				free_extent_buffer(b);
+				b = btrfs_lock_root_node(root);
+				root_lock = BTRFS_WRITE_LOCK;
+
+				/* the level might have changed, check again */
+				level = btrfs_header_level(b);
+			}
+		}
+	}
+	p->nodes[level] = b;
+	if (!p->skip_locking)
+		p->locks[level] = root_lock;
+
+	while (b) {
+		level = btrfs_header_level(b);
+
+		/*
+		 * setup the path here so we can release it under lock
+		 * contention with the cow code
+		 */
+		if (cow) {
+			/*
+			 * if we don't really need to cow this block
+			 * then we don't want to set the path blocking,
+			 * so we test it here
+			 */
+			if (!should_cow_block(trans, root, b))
+				goto cow_done;
+
+			/*
+			 * must have write locks on this node and the
+			 * parent
+			 */
+			if (level > write_lock_level ||
+			    (level + 1 > write_lock_level &&
+			    level + 1 < BTRFS_MAX_LEVEL &&
+			    p->nodes[level + 1])) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
+			btrfs_set_path_blocking(p);
+			err = btrfs_cow_block(trans, root, b,
+					      p->nodes[level + 1],
+					      p->slots[level + 1], &b);
+			if (err) {
+				ret = err;
+				goto done;
+			}
+		}
+cow_done:
+		p->nodes[level] = b;
+		btrfs_clear_path_blocking(p, NULL, 0);
+
+		/*
+		 * we have a lock on b and as long as we aren't changing
+		 * the tree, there is no way to for the items in b to change.
+		 * It is safe to drop the lock on our parent before we
+		 * go through the expensive btree search on b.
+		 *
+		 * If we're inserting or deleting (ins_len != 0), then we might
+		 * be changing slot zero, which may require changing the parent.
+		 * So, we can't drop the lock until after we know which slot
+		 * we're operating on.
+		 */
+		if (!ins_len && !p->keep_locks) {
+			int u = level + 1;
+
+			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
+				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
+				p->locks[u] = 0;
+			}
+		}
+
+		ret = key_search(b, key, level, &prev_cmp, &slot);
+
+		if (level != 0) {
+			int dec = 0;
+			if (ret && slot > 0) {
+				dec = 1;
+				slot -= 1;
+			}
+			p->slots[level] = slot;
+			err = setup_nodes_for_search(trans, root, p, b, level,
+					     ins_len, &write_lock_level);
+			if (err == -EAGAIN)
+				goto again;
+			if (err) {
+				ret = err;
+				goto done;
+			}
+			b = p->nodes[level];
+			slot = p->slots[level];
+
+			/*
+			 * slot 0 is special, if we change the key
+			 * we have to update the parent pointer
+			 * which means we must have a write lock
+			 * on the parent
+			 */
+			if (slot == 0 && ins_len &&
+			    write_lock_level < level + 1) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
+			unlock_up(p, level, lowest_unlock,
+				  min_write_lock_level, &write_lock_level);
+
+			if (level == lowest_level) {
+				if (dec)
+					p->slots[level]++;
+				goto done;
+			}
+
+			err = read_block_for_search(trans, root, p,
+						    &b, level, slot, key, 0);
+			if (err == -EAGAIN)
+				goto again;
+			if (err) {
+				ret = err;
+				goto done;
+			}
+
+			if (!p->skip_locking) {
+				level = btrfs_header_level(b);
+				if (level <= write_lock_level) {
+					err = btrfs_try_tree_write_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_WRITE_LOCK);
+					}
+					p->locks[level] = BTRFS_WRITE_LOCK;
+				} else {
+					err = btrfs_tree_read_lock_atomic(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_read_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_READ_LOCK);
+					}
+					p->locks[level] = BTRFS_READ_LOCK;
+				}
+				p->nodes[level] = b;
+			}
+		} else {
+			p->slots[level] = slot;
+			if (ins_len > 0 &&
+			    btrfs_leaf_free_space(root, b) < ins_len) {
+				if (write_lock_level < 1) {
+					write_lock_level = 1;
+					btrfs_release_path(p);
+					goto again;
+				}
+
+				btrfs_set_path_blocking(p);
+				err = split_leaf(trans, root, key,
+						 p, ins_len, ret == 0);
+				btrfs_clear_path_blocking(p, NULL, 0);
+
+				BUG_ON(err > 0);
+				if (err) {
+					ret = err;
+					goto done;
+				}
+			}
+			if (!p->search_for_split)
+				unlock_up(p, level, lowest_unlock,
+					  min_write_lock_level, &write_lock_level);
+			goto done;
+		}
+	}
+	ret = 1;
+done:
+	/*
+	 * we don't really know what they plan on doing with the path
+	 * from here on, so for now just mark it as blocking
+	 */
+	if (!p->leave_spinning)
+		btrfs_set_path_blocking(p);
+	if (ret < 0 && !p->skip_release_on_error)
+		btrfs_release_path(p);
+	return ret;
+}
+
+/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+			  struct btrfs_path *p, u64 time_seq)
+{
+	struct extent_buffer *b;
+	int slot;
+	int ret;
+	int err;
+	int level;
+	int lowest_unlock = 1;
+	u8 lowest_level = 0;
+	int prev_cmp = -1;
+
+	lowest_level = p->lowest_level;
+	WARN_ON(p->nodes[0] != NULL);
+
+	if (p->search_commit_root) {
+		BUG_ON(time_seq);
+		return btrfs_search_slot(NULL, root, key, p, 0, 0);
+	}
+
+again:
+	b = get_old_root(root, time_seq);
+	level = btrfs_header_level(b);
+	p->locks[level] = BTRFS_READ_LOCK;
+
+	while (b) {
+		level = btrfs_header_level(b);
+		p->nodes[level] = b;
+		btrfs_clear_path_blocking(p, NULL, 0);
+
+		/*
+		 * we have a lock on b and as long as we aren't changing
+		 * the tree, there is no way to for the items in b to change.
+		 * It is safe to drop the lock on our parent before we
+		 * go through the expensive btree search on b.
+		 */
+		btrfs_unlock_up_safe(p, level + 1);
+
+		/*
+		 * Since we can unwind eb's we want to do a real search every
+		 * time.
+		 */
+		prev_cmp = -1;
+		ret = key_search(b, key, level, &prev_cmp, &slot);
+
+		if (level != 0) {
+			int dec = 0;
+			if (ret && slot > 0) {
+				dec = 1;
+				slot -= 1;
+			}
+			p->slots[level] = slot;
+			unlock_up(p, level, lowest_unlock, 0, NULL);
+
+			if (level == lowest_level) {
+				if (dec)
+					p->slots[level]++;
+				goto done;
+			}
+
+			err = read_block_for_search(NULL, root, p, &b, level,
+						    slot, key, time_seq);
+			if (err == -EAGAIN)
+				goto again;
+			if (err) {
+				ret = err;
+				goto done;
+			}
+
+			level = btrfs_header_level(b);
+			err = btrfs_tree_read_lock_atomic(b);
+			if (!err) {
+				btrfs_set_path_blocking(p);
+				btrfs_tree_read_lock(b);
+				btrfs_clear_path_blocking(p, b,
+							  BTRFS_READ_LOCK);
+			}
+			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
+			if (!b) {
+				ret = -ENOMEM;
+				goto done;
+			}
+			p->locks[level] = BTRFS_READ_LOCK;
+			p->nodes[level] = b;
+		} else {
+			p->slots[level] = slot;
+			unlock_up(p, level, lowest_unlock, 0, NULL);
+			goto done;
+		}
+	}
+	ret = 1;
+done:
+	if (!p->leave_spinning)
+		btrfs_set_path_blocking(p);
+	if (ret < 0)
+		btrfs_release_path(p);
+
+	return ret;
+}
+
+/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+			       struct btrfs_key *key, struct btrfs_path *p,
+			       int find_higher, int return_any)
+{
+	int ret;
+	struct extent_buffer *leaf;
+
+again:
+	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+	if (ret <= 0)
+		return ret;
+	/*
+	 * a return value of 1 means the path is at the position where the
+	 * item should be inserted. Normally this is the next bigger item,
+	 * but in case the previous item is the last in a leaf, path points
+	 * to the first free slot in the previous leaf, i.e. at an invalid
+	 * item.
+	 */
+	leaf = p->nodes[0];
+
+	if (find_higher) {
+		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, p);
+			if (ret <= 0)
+				return ret;
+			if (!return_any)
+				return 1;
+			/*
+			 * no higher item found, return the next
+			 * lower instead
+			 */
+			return_any = 0;
+			find_higher = 0;
+			btrfs_release_path(p);
+			goto again;
+		}
+	} else {
+		if (p->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, p);
+			if (ret < 0)
+				return ret;
+			if (!ret) {
+				leaf = p->nodes[0];
+				if (p->slots[0] == btrfs_header_nritems(leaf))
+					p->slots[0]--;
+				return 0;
+			}
+			if (!return_any)
+				return 1;
+			/*
+			 * no lower item found, return the next
+			 * higher instead
+			 */
+			return_any = 0;
+			find_higher = 1;
+			btrfs_release_path(p);
+			goto again;
+		} else {
+			--p->slots[0];
+		}
+	}
+	return 0;
+}
+
+/*
+ * adjust the pointers going up the tree, starting at level
+ * making sure the right key of each node is points to 'key'.
+ * This is used after shifting pointers to the left, so it stops
+ * fixing up pointers when a given leaf/node is not in slot 0 of the
+ * higher levels
+ *
+ */
+static void fixup_low_keys(struct btrfs_fs_info *fs_info,
+			   struct btrfs_path *path,
+			   struct btrfs_disk_key *key, int level)
+{
+	int i;
+	struct extent_buffer *t;
+
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		int tslot = path->slots[i];
+		if (!path->nodes[i])
+			break;
+		t = path->nodes[i];
+		tree_mod_log_set_node_key(fs_info, t, tslot, 1);
+		btrfs_set_node_key(t, key, tslot);
+		btrfs_mark_buffer_dirty(path->nodes[i]);
+		if (tslot != 0)
+			break;
+	}
+}
+
+/*
+ * update item key.
+ *
+ * This function isn't completely safe. It's the caller's responsibility
+ * that the new key won't break the order
+ */
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path,
+			     struct btrfs_key *new_key)
+{
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *eb;
+	int slot;
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	if (slot > 0) {
+		btrfs_item_key(eb, &disk_key, slot - 1);
+		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
+	}
+	if (slot < btrfs_header_nritems(eb) - 1) {
+		btrfs_item_key(eb, &disk_key, slot + 1);
+		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
+	}
+
+	btrfs_cpu_key_to_disk(&disk_key, new_key);
+	btrfs_set_item_key(eb, &disk_key, slot);
+	btrfs_mark_buffer_dirty(eb);
+	if (slot == 0)
+		fixup_low_keys(fs_info, path, &disk_key, 1);
+}
+
+/*
+ * try to push data from one node into the next node left in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the left hand block.
+ */
+static int push_node_left(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *dst,
+			  struct extent_buffer *src, int empty)
+{
+	int push_items = 0;
+	int src_nritems;
+	int dst_nritems;
+	int ret = 0;
+
+	src_nritems = btrfs_header_nritems(src);
+	dst_nritems = btrfs_header_nritems(dst);
+	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+	WARN_ON(btrfs_header_generation(src) != trans->transid);
+	WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+	if (!empty && src_nritems <= 8)
+		return 1;
+
+	if (push_items <= 0)
+		return 1;
+
+	if (empty) {
+		push_items = min(src_nritems, push_items);
+		if (push_items < src_nritems) {
+			/* leave at least 8 pointers in the node if
+			 * we aren't going to empty it
+			 */
+			if (src_nritems - push_items < 8) {
+				if (push_items <= 8)
+					return 1;
+				push_items -= 8;
+			}
+		}
+	} else
+		push_items = min(src_nritems - 8, push_items);
+
+	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+				   push_items);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+	copy_extent_buffer(dst, src,
+			   btrfs_node_key_ptr_offset(dst_nritems),
+			   btrfs_node_key_ptr_offset(0),
+			   push_items * sizeof(struct btrfs_key_ptr));
+
+	if (push_items < src_nritems) {
+		/*
+		 * don't call tree_mod_log_eb_move here, key removal was already
+		 * fully logged by tree_mod_log_eb_copy above.
+		 */
+		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
+				      btrfs_node_key_ptr_offset(push_items),
+				      (src_nritems - push_items) *
+				      sizeof(struct btrfs_key_ptr));
+	}
+	btrfs_set_header_nritems(src, src_nritems - push_items);
+	btrfs_set_header_nritems(dst, dst_nritems + push_items);
+	btrfs_mark_buffer_dirty(src);
+	btrfs_mark_buffer_dirty(dst);
+
+	return ret;
+}
+
+/*
+ * try to push data from one node into the next node right in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the right hand block.
+ *
+ * this will  only push up to 1/2 the contents of the left node over
+ */
+static int balance_node_right(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct extent_buffer *dst,
+			      struct extent_buffer *src)
+{
+	int push_items = 0;
+	int max_push;
+	int src_nritems;
+	int dst_nritems;
+	int ret = 0;
+
+	WARN_ON(btrfs_header_generation(src) != trans->transid);
+	WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+	src_nritems = btrfs_header_nritems(src);
+	dst_nritems = btrfs_header_nritems(dst);
+	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+	if (push_items <= 0)
+		return 1;
+
+	if (src_nritems < 4)
+		return 1;
+
+	max_push = src_nritems / 2 + 1;
+	/* don't try to empty the node */
+	if (max_push >= src_nritems)
+		return 1;
+
+	if (max_push < push_items)
+		push_items = max_push;
+
+	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
+	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
+				      btrfs_node_key_ptr_offset(0),
+				      (dst_nritems) *
+				      sizeof(struct btrfs_key_ptr));
+
+	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+				   src_nritems - push_items, push_items);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+	copy_extent_buffer(dst, src,
+			   btrfs_node_key_ptr_offset(0),
+			   btrfs_node_key_ptr_offset(src_nritems - push_items),
+			   push_items * sizeof(struct btrfs_key_ptr));
+
+	btrfs_set_header_nritems(src, src_nritems - push_items);
+	btrfs_set_header_nritems(dst, dst_nritems + push_items);
+
+	btrfs_mark_buffer_dirty(src);
+	btrfs_mark_buffer_dirty(dst);
+
+	return ret;
+}
+
+/*
+ * helper function to insert a new root level in the tree.
+ * A new node is allocated, and a single item is inserted to
+ * point to the existing root
+ *
+ * returns zero on success or < 0 on failure.
+ */
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_path *path, int level)
+{
+	u64 lower_gen;
+	struct extent_buffer *lower;
+	struct extent_buffer *c;
+	struct extent_buffer *old;
+	struct btrfs_disk_key lower_key;
+
+	BUG_ON(path->nodes[level]);
+	BUG_ON(path->nodes[level-1] != root->node);
+
+	lower = path->nodes[level-1];
+	if (level == 1)
+		btrfs_item_key(lower, &lower_key, 0);
+	else
+		btrfs_node_key(lower, &lower_key, 0);
+
+	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+				   &lower_key, level, root->node->start, 0);
+	if (IS_ERR(c))
+		return PTR_ERR(c);
+
+	root_add_used(root, root->nodesize);
+
+	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_nritems(c, 1);
+	btrfs_set_header_level(c, level);
+	btrfs_set_header_bytenr(c, c->start);
+	btrfs_set_header_generation(c, trans->transid);
+	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(c, root->root_key.objectid);
+
+	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
+			    BTRFS_FSID_SIZE);
+
+	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
+			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
+
+	btrfs_set_node_key(c, &lower_key, 0);
+	btrfs_set_node_blockptr(c, 0, lower->start);
+	lower_gen = btrfs_header_generation(lower);
+	WARN_ON(lower_gen != trans->transid);
+
+	btrfs_set_node_ptr_generation(c, 0, lower_gen);
+
+	btrfs_mark_buffer_dirty(c);
+
+	old = root->node;
+	tree_mod_log_set_root_pointer(root, c, 0);
+	rcu_assign_pointer(root->node, c);
+
+	/* the super has an extra ref to root->node */
+	free_extent_buffer(old);
+
+	add_root_to_dirty_list(root);
+	extent_buffer_get(c);
+	path->nodes[level] = c;
+	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+	path->slots[level] = 0;
+	return 0;
+}
+
+/*
+ * worker function to insert a single pointer in a node.
+ * the node should have enough room for the pointer already
+ *
+ * slot and level indicate where you want the key to go, and
+ * blocknr is the block the key points to.
+ */
+static void insert_ptr(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct btrfs_path *path,
+		       struct btrfs_disk_key *key, u64 bytenr,
+		       int slot, int level)
+{
+	struct extent_buffer *lower;
+	int nritems;
+	int ret;
+
+	BUG_ON(!path->nodes[level]);
+	btrfs_assert_tree_locked(path->nodes[level]);
+	lower = path->nodes[level];
+	nritems = btrfs_header_nritems(lower);
+	BUG_ON(slot > nritems);
+	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
+	if (slot != nritems) {
+		if (level)
+			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+					     slot, nritems - slot);
+		memmove_extent_buffer(lower,
+			      btrfs_node_key_ptr_offset(slot + 1),
+			      btrfs_node_key_ptr_offset(slot),
+			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
+	}
+	if (level) {
+		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+					      MOD_LOG_KEY_ADD, GFP_NOFS);
+		BUG_ON(ret < 0);
+	}
+	btrfs_set_node_key(lower, key, slot);
+	btrfs_set_node_blockptr(lower, slot, bytenr);
+	WARN_ON(trans->transid == 0);
+	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
+	btrfs_set_header_nritems(lower, nritems + 1);
+	btrfs_mark_buffer_dirty(lower);
+}
+
+/*
+ * split the node at the specified level in path in two.
+ * The path is corrected to point to the appropriate node after the split
+ *
+ * Before splitting this tries to make some room in the node by pushing
+ * left and right, if either one works, it returns right away.
+ *
+ * returns 0 on success and < 0 on failure
+ */
+static noinline int split_node(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_path *path, int level)
+{
+	struct extent_buffer *c;
+	struct extent_buffer *split;
+	struct btrfs_disk_key disk_key;
+	int mid;
+	int ret;
+	u32 c_nritems;
+
+	c = path->nodes[level];
+	WARN_ON(btrfs_header_generation(c) != trans->transid);
+	if (c == root->node) {
+		/*
+		 * trying to split the root, lets make a new one
+		 *
+		 * tree mod log: We don't log_removal old root in
+		 * insert_new_root, because that root buffer will be kept as a
+		 * normal node. We are going to log removal of half of the
+		 * elements below with tree_mod_log_eb_copy. We're holding a
+		 * tree lock on the buffer, which is why we cannot race with
+		 * other tree_mod_log users.
+		 */
+		ret = insert_new_root(trans, root, path, level + 1);
+		if (ret)
+			return ret;
+	} else {
+		ret = push_nodes_for_insert(trans, root, path, level);
+		c = path->nodes[level];
+		if (!ret && btrfs_header_nritems(c) <
+		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
+			return 0;
+		if (ret < 0)
+			return ret;
+	}
+
+	c_nritems = btrfs_header_nritems(c);
+	mid = (c_nritems + 1) / 2;
+	btrfs_node_key(c, &disk_key, mid);
+
+	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+			&disk_key, level, c->start, 0);
+	if (IS_ERR(split))
+		return PTR_ERR(split);
+
+	root_add_used(root, root->nodesize);
+
+	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_level(split, btrfs_header_level(c));
+	btrfs_set_header_bytenr(split, split->start);
+	btrfs_set_header_generation(split, trans->transid);
+	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(split, root->root_key.objectid);
+	write_extent_buffer(split, root->fs_info->fsid,
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
+	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
+			    btrfs_header_chunk_tree_uuid(split),
+			    BTRFS_UUID_SIZE);
+
+	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
+				   mid, c_nritems - mid);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+	copy_extent_buffer(split, c,
+			   btrfs_node_key_ptr_offset(0),
+			   btrfs_node_key_ptr_offset(mid),
+			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
+	btrfs_set_header_nritems(split, c_nritems - mid);
+	btrfs_set_header_nritems(c, mid);
+	ret = 0;
+
+	btrfs_mark_buffer_dirty(c);
+	btrfs_mark_buffer_dirty(split);
+
+	insert_ptr(trans, root, path, &disk_key, split->start,
+		   path->slots[level + 1] + 1, level + 1);
+
+	if (path->slots[level] >= mid) {
+		path->slots[level] -= mid;
+		btrfs_tree_unlock(c);
+		free_extent_buffer(c);
+		path->nodes[level] = split;
+		path->slots[level + 1] += 1;
+	} else {
+		btrfs_tree_unlock(split);
+		free_extent_buffer(split);
+	}
+	return ret;
+}
+
+/*
+ * how many bytes are required to store the items in a leaf.  start
+ * and nr indicate which items in the leaf to check.  This totals up the
+ * space used both by the item structs and the item data
+ */
+static int leaf_space_used(struct extent_buffer *l, int start, int nr)
+{
+	struct btrfs_item *start_item;
+	struct btrfs_item *end_item;
+	struct btrfs_map_token token;
+	int data_len;
+	int nritems = btrfs_header_nritems(l);
+	int end = min(nritems, start + nr) - 1;
+
+	if (!nr)
+		return 0;
+	btrfs_init_map_token(&token);
+	start_item = btrfs_item_nr(start);
+	end_item = btrfs_item_nr(end);
+	data_len = btrfs_token_item_offset(l, start_item, &token) +
+		btrfs_token_item_size(l, start_item, &token);
+	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
+	data_len += sizeof(struct btrfs_item) * nr;
+	WARN_ON(data_len < 0);
+	return data_len;
+}
+
+/*
+ * The space between the end of the leaf items and
+ * the start of the leaf data.  IOW, how much room
+ * the leaf has left for both items and data
+ */
+noinline int btrfs_leaf_free_space(struct btrfs_root *root,
+				   struct extent_buffer *leaf)
+{
+	int nritems = btrfs_header_nritems(leaf);
+	int ret;
+	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+	if (ret < 0) {
+		btrfs_crit(root->fs_info,
+			"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
+		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
+		       leaf_space_used(leaf, 0, nritems), nritems);
+	}
+	return ret;
+}
+
+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right.  We'll push up to and including min_slot, but no lower
+ */
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      int data_size, int empty,
+				      struct extent_buffer *right,
+				      int free_space, u32 left_nritems,
+				      u32 min_slot)
+{
+	struct extent_buffer *left = path->nodes[0];
+	struct extent_buffer *upper = path->nodes[1];
+	struct btrfs_map_token token;
+	struct btrfs_disk_key disk_key;
+	int slot;
+	u32 i;
+	int push_space = 0;
+	int push_items = 0;
+	struct btrfs_item *item;
+	u32 nr;
+	u32 right_nritems;
+	u32 data_end;
+	u32 this_item_size;
+
+	btrfs_init_map_token(&token);
+
+	if (empty)
+		nr = 0;
+	else
+		nr = max_t(u32, 1, min_slot);
+
+	if (path->slots[0] >= left_nritems)
+		push_space += data_size;
+
+	slot = path->slots[1];
+	i = left_nritems - 1;
+	while (i >= nr) {
+		item = btrfs_item_nr(i);
+
+		if (!empty && push_items > 0) {
+			if (path->slots[0] > i)
+				break;
+			if (path->slots[0] == i) {
+				int space = btrfs_leaf_free_space(root, left);
+				if (space + push_space * 2 > free_space)
+					break;
+			}
+		}
+
+		if (path->slots[0] == i)
+			push_space += data_size;
+
+		this_item_size = btrfs_item_size(left, item);
+		if (this_item_size + sizeof(*item) + push_space > free_space)
+			break;
+
+		push_items++;
+		push_space += this_item_size + sizeof(*item);
+		if (i == 0)
+			break;
+		i--;
+	}
+
+	if (push_items == 0)
+		goto out_unlock;
+
+	WARN_ON(!empty && push_items == left_nritems);
+
+	/* push left to right */
+	right_nritems = btrfs_header_nritems(right);
+
+	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+	push_space -= leaf_data_end(root, left);
+
+	/* make room in the right data area */
+	data_end = leaf_data_end(root, right);
+	memmove_extent_buffer(right,
+			      btrfs_leaf_data(right) + data_end - push_space,
+			      btrfs_leaf_data(right) + data_end,
+			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
+
+	/* copy from the left data area */
+	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
+		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
+		     btrfs_leaf_data(left) + leaf_data_end(root, left),
+		     push_space);
+
+	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
+			      btrfs_item_nr_offset(0),
+			      right_nritems * sizeof(struct btrfs_item));
+
+	/* copy the items from left to right */
+	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
+		   btrfs_item_nr_offset(left_nritems - push_items),
+		   push_items * sizeof(struct btrfs_item));
+
+	/* update the item pointers */
+	right_nritems += push_items;
+	btrfs_set_header_nritems(right, right_nritems);
+	push_space = BTRFS_LEAF_DATA_SIZE(root);
+	for (i = 0; i < right_nritems; i++) {
+		item = btrfs_item_nr(i);
+		push_space -= btrfs_token_item_size(right, item, &token);
+		btrfs_set_token_item_offset(right, item, push_space, &token);
+	}
+
+	left_nritems -= push_items;
+	btrfs_set_header_nritems(left, left_nritems);
+
+	if (left_nritems)
+		btrfs_mark_buffer_dirty(left);
+	else
+		clean_tree_block(trans, root->fs_info, left);
+
+	btrfs_mark_buffer_dirty(right);
+
+	btrfs_item_key(right, &disk_key, 0);
+	btrfs_set_node_key(upper, &disk_key, slot + 1);
+	btrfs_mark_buffer_dirty(upper);
+
+	/* then fixup the leaf pointer in the path */
+	if (path->slots[0] >= left_nritems) {
+		path->slots[0] -= left_nritems;
+		if (btrfs_header_nritems(path->nodes[0]) == 0)
+			clean_tree_block(trans, root->fs_info, path->nodes[0]);
+		btrfs_tree_unlock(path->nodes[0]);
+		free_extent_buffer(path->nodes[0]);
+		path->nodes[0] = right;
+		path->slots[1] += 1;
+	} else {
+		btrfs_tree_unlock(right);
+		free_extent_buffer(right);
+	}
+	return 0;
+
+out_unlock:
+	btrfs_tree_unlock(right);
+	free_extent_buffer(right);
+	return 1;
+}
+
+/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf.  It won't
+ * push any slot lower than min_slot
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct btrfs_path *path,
+			   int min_data_size, int data_size,
+			   int empty, u32 min_slot)
+{
+	struct extent_buffer *left = path->nodes[0];
+	struct extent_buffer *right;
+	struct extent_buffer *upper;
+	int slot;
+	int free_space;
+	u32 left_nritems;
+	int ret;
+
+	if (!path->nodes[1])
+		return 1;
+
+	slot = path->slots[1];
+	upper = path->nodes[1];
+	if (slot >= btrfs_header_nritems(upper) - 1)
+		return 1;
+
+	btrfs_assert_tree_locked(path->nodes[1]);
+
+	right = read_node_slot(root, upper, slot + 1);
+	if (right == NULL)
+		return 1;
+
+	btrfs_tree_lock(right);
+	btrfs_set_lock_blocking(right);
+
+	free_space = btrfs_leaf_free_space(root, right);
+	if (free_space < data_size)
+		goto out_unlock;
+
+	/* cow and double check */
+	ret = btrfs_cow_block(trans, root, right, upper,
+			      slot + 1, &right);
+	if (ret)
+		goto out_unlock;
+
+	free_space = btrfs_leaf_free_space(root, right);
+	if (free_space < data_size)
+		goto out_unlock;
+
+	left_nritems = btrfs_header_nritems(left);
+	if (left_nritems == 0)
+		goto out_unlock;
+
+	if (path->slots[0] == left_nritems && !empty) {
+		/* Key greater than all keys in the leaf, right neighbor has
+		 * enough room for it and we're not emptying our leaf to delete
+		 * it, therefore use right neighbor to insert the new item and
+		 * no need to touch/dirty our left leaft. */
+		btrfs_tree_unlock(left);
+		free_extent_buffer(left);
+		path->nodes[0] = right;
+		path->slots[0] = 0;
+		path->slots[1]++;
+		return 0;
+	}
+
+	return __push_leaf_right(trans, root, path, min_data_size, empty,
+				right, free_space, left_nritems, min_slot);
+out_unlock:
+	btrfs_tree_unlock(right);
+	free_extent_buffer(right);
+	return 1;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
+ * items
+ */
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct btrfs_path *path, int data_size,
+				     int empty, struct extent_buffer *left,
+				     int free_space, u32 right_nritems,
+				     u32 max_slot)
+{
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *right = path->nodes[0];
+	int i;
+	int push_space = 0;
+	int push_items = 0;
+	struct btrfs_item *item;
+	u32 old_left_nritems;
+	u32 nr;
+	int ret = 0;
+	u32 this_item_size;
+	u32 old_left_item_size;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	if (empty)
+		nr = min(right_nritems, max_slot);
+	else
+		nr = min(right_nritems - 1, max_slot);
+
+	for (i = 0; i < nr; i++) {
+		item = btrfs_item_nr(i);
+
+		if (!empty && push_items > 0) {
+			if (path->slots[0] < i)
+				break;
+			if (path->slots[0] == i) {
+				int space = btrfs_leaf_free_space(root, right);
+				if (space + push_space * 2 > free_space)
+					break;
+			}
+		}
+
+		if (path->slots[0] == i)
+			push_space += data_size;
+
+		this_item_size = btrfs_item_size(right, item);
+		if (this_item_size + sizeof(*item) + push_space > free_space)
+			break;
+
+		push_items++;
+		push_space += this_item_size + sizeof(*item);
+	}
+
+	if (push_items == 0) {
+		ret = 1;
+		goto out;
+	}
+	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
+
+	/* push data from right to left */
+	copy_extent_buffer(left, right,
+			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
+			   btrfs_item_nr_offset(0),
+			   push_items * sizeof(struct btrfs_item));
+
+	push_space = BTRFS_LEAF_DATA_SIZE(root) -
+		     btrfs_item_offset_nr(right, push_items - 1);
+
+	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
+		     leaf_data_end(root, left) - push_space,
+		     btrfs_leaf_data(right) +
+		     btrfs_item_offset_nr(right, push_items - 1),
+		     push_space);
+	old_left_nritems = btrfs_header_nritems(left);
+	BUG_ON(old_left_nritems <= 0);
+
+	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
+		u32 ioff;
+
+		item = btrfs_item_nr(i);
+
+		ioff = btrfs_token_item_offset(left, item, &token);
+		btrfs_set_token_item_offset(left, item,
+		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+		      &token);
+	}
+	btrfs_set_header_nritems(left, old_left_nritems + push_items);
+
+	/* fixup right node */
+	if (push_items > right_nritems)
+		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
+		       right_nritems);
+
+	if (push_items < right_nritems) {
+		push_space = btrfs_item_offset_nr(right, push_items - 1) -
+						  leaf_data_end(root, right);
+		memmove_extent_buffer(right, btrfs_leaf_data(right) +
+				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
+				      btrfs_leaf_data(right) +
+				      leaf_data_end(root, right), push_space);
+
+		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
+			      btrfs_item_nr_offset(push_items),
+			     (btrfs_header_nritems(right) - push_items) *
+			     sizeof(struct btrfs_item));
+	}
+	right_nritems -= push_items;
+	btrfs_set_header_nritems(right, right_nritems);
+	push_space = BTRFS_LEAF_DATA_SIZE(root);
+	for (i = 0; i < right_nritems; i++) {
+		item = btrfs_item_nr(i);
+
+		push_space = push_space - btrfs_token_item_size(right,
+								item, &token);
+		btrfs_set_token_item_offset(right, item, push_space, &token);
+	}
+
+	btrfs_mark_buffer_dirty(left);
+	if (right_nritems)
+		btrfs_mark_buffer_dirty(right);
+	else
+		clean_tree_block(trans, root->fs_info, right);
+
+	btrfs_item_key(right, &disk_key, 0);
+	fixup_low_keys(root->fs_info, path, &disk_key, 1);
+
+	/* then fixup the leaf pointer in the path */
+	if (path->slots[0] < push_items) {
+		path->slots[0] += old_left_nritems;
+		btrfs_tree_unlock(path->nodes[0]);
+		free_extent_buffer(path->nodes[0]);
+		path->nodes[0] = left;
+		path->slots[1] -= 1;
+	} else {
+		btrfs_tree_unlock(left);
+		free_extent_buffer(left);
+		path->slots[0] -= push_items;
+	}
+	BUG_ON(path->slots[0] < 0);
+	return ret;
+out:
+	btrfs_tree_unlock(left);
+	free_extent_buffer(left);
+	return ret;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
+ * items
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_path *path, int min_data_size,
+			  int data_size, int empty, u32 max_slot)
+{
+	struct extent_buffer *right = path->nodes[0];
+	struct extent_buffer *left;
+	int slot;
+	int free_space;
+	u32 right_nritems;
+	int ret = 0;
+
+	slot = path->slots[1];
+	if (slot == 0)
+		return 1;
+	if (!path->nodes[1])
+		return 1;
+
+	right_nritems = btrfs_header_nritems(right);
+	if (right_nritems == 0)
+		return 1;
+
+	btrfs_assert_tree_locked(path->nodes[1]);
+
+	left = read_node_slot(root, path->nodes[1], slot - 1);
+	if (left == NULL)
+		return 1;
+
+	btrfs_tree_lock(left);
+	btrfs_set_lock_blocking(left);
+
+	free_space = btrfs_leaf_free_space(root, left);
+	if (free_space < data_size) {
+		ret = 1;
+		goto out;
+	}
+
+	/* cow and double check */
+	ret = btrfs_cow_block(trans, root, left,
+			      path->nodes[1], slot - 1, &left);
+	if (ret) {
+		/* we hit -ENOSPC, but it isn't fatal here */
+		if (ret == -ENOSPC)
+			ret = 1;
+		goto out;
+	}
+
+	free_space = btrfs_leaf_free_space(root, left);
+	if (free_space < data_size) {
+		ret = 1;
+		goto out;
+	}
+
+	return __push_leaf_left(trans, root, path, min_data_size,
+			       empty, left, free_space, right_nritems,
+			       max_slot);
+out:
+	btrfs_tree_unlock(left);
+	free_extent_buffer(left);
+	return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ */
+static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *l,
+				    struct extent_buffer *right,
+				    int slot, int mid, int nritems)
+{
+	int data_copy_size;
+	int rt_data_off;
+	int i;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	nritems = nritems - mid;
+	btrfs_set_header_nritems(right, nritems);
+	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+			   btrfs_item_nr_offset(mid),
+			   nritems * sizeof(struct btrfs_item));
+
+	copy_extent_buffer(right, l,
+		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+		     data_copy_size, btrfs_leaf_data(l) +
+		     leaf_data_end(root, l), data_copy_size);
+
+	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+		      btrfs_item_end_nr(l, mid);
+
+	for (i = 0; i < nritems; i++) {
+		struct btrfs_item *item = btrfs_item_nr(i);
+		u32 ioff;
+
+		ioff = btrfs_token_item_offset(right, item, &token);
+		btrfs_set_token_item_offset(right, item,
+					    ioff + rt_data_off, &token);
+	}
+
+	btrfs_set_header_nritems(l, mid);
+	btrfs_item_key(right, &disk_key, 0);
+	insert_ptr(trans, root, path, &disk_key, right->start,
+		   path->slots[1] + 1, 1);
+
+	btrfs_mark_buffer_dirty(right);
+	btrfs_mark_buffer_dirty(l);
+	BUG_ON(path->slots[0] != slot);
+
+	if (mid <= slot) {
+		btrfs_tree_unlock(path->nodes[0]);
+		free_extent_buffer(path->nodes[0]);
+		path->nodes[0] = right;
+		path->slots[0] -= mid;
+		path->slots[1] += 1;
+	} else {
+		btrfs_tree_unlock(right);
+		free_extent_buffer(right);
+	}
+
+	BUG_ON(path->slots[0] < 0);
+}
+
+/*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf.  A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ *          A                 B                 C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves.  If all goes well we can avoid the double split
+ * completely.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  int data_size)
+{
+	int ret;
+	int progress = 0;
+	int slot;
+	u32 nritems;
+	int space_needed = data_size;
+
+	slot = path->slots[0];
+	if (slot < btrfs_header_nritems(path->nodes[0]))
+		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
+
+	/*
+	 * try to push all the items after our slot into the
+	 * right leaf
+	 */
+	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+		progress++;
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	/*
+	 * our goal is to get our slot at the start or end of a leaf.  If
+	 * we've done so we're done
+	 */
+	if (path->slots[0] == 0 || path->slots[0] == nritems)
+		return 0;
+
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+
+	/* try to push all the items before our slot into the next leaf */
+	slot = path->slots[0];
+	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+		progress++;
+
+	if (progress)
+		return 0;
+	return 1;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int split_leaf(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_key *ins_key,
+			       struct btrfs_path *path, int data_size,
+			       int extend)
+{
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *l;
+	u32 nritems;
+	int mid;
+	int slot;
+	struct extent_buffer *right;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret = 0;
+	int wret;
+	int split;
+	int num_doubles = 0;
+	int tried_avoid_double = 0;
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+	if (extend && data_size + btrfs_item_size_nr(l, slot) +
+	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+		return -EOVERFLOW;
+
+	/* first try to make some room by pushing left and right */
+	if (data_size && path->nodes[1]) {
+		int space_needed = data_size;
+
+		if (slot < btrfs_header_nritems(l))
+			space_needed -= btrfs_leaf_free_space(root, l);
+
+		wret = push_leaf_right(trans, root, path, space_needed,
+				       space_needed, 0, 0);
+		if (wret < 0)
+			return wret;
+		if (wret) {
+			wret = push_leaf_left(trans, root, path, space_needed,
+					      space_needed, 0, (u32)-1);
+			if (wret < 0)
+				return wret;
+		}
+		l = path->nodes[0];
+
+		/* did the pushes work? */
+		if (btrfs_leaf_free_space(root, l) >= data_size)
+			return 0;
+	}
+
+	if (!path->nodes[1]) {
+		ret = insert_new_root(trans, root, path, 1);
+		if (ret)
+			return ret;
+	}
+again:
+	split = 1;
+	l = path->nodes[0];
+	slot = path->slots[0];
+	nritems = btrfs_header_nritems(l);
+	mid = (nritems + 1) / 2;
+
+	if (mid <= slot) {
+		if (nritems == 1 ||
+		    leaf_space_used(l, mid, nritems - mid) + data_size >
+			BTRFS_LEAF_DATA_SIZE(root)) {
+			if (slot >= nritems) {
+				split = 0;
+			} else {
+				mid = slot;
+				if (mid != nritems &&
+				    leaf_space_used(l, mid, nritems - mid) +
+				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					if (data_size && !tried_avoid_double)
+						goto push_for_double;
+					split = 2;
+				}
+			}
+		}
+	} else {
+		if (leaf_space_used(l, 0, mid) + data_size >
+			BTRFS_LEAF_DATA_SIZE(root)) {
+			if (!extend && data_size && slot == 0) {
+				split = 0;
+			} else if ((extend || !data_size) && slot == 0) {
+				mid = 1;
+			} else {
+				mid = slot;
+				if (mid != nritems &&
+				    leaf_space_used(l, mid, nritems - mid) +
+				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					if (data_size && !tried_avoid_double)
+						goto push_for_double;
+					split = 2;
+				}
+			}
+		}
+	}
+
+	if (split == 0)
+		btrfs_cpu_key_to_disk(&disk_key, ins_key);
+	else
+		btrfs_item_key(l, &disk_key, mid);
+
+	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+			&disk_key, 0, l->start, 0);
+	if (IS_ERR(right))
+		return PTR_ERR(right);
+
+	root_add_used(root, root->nodesize);
+
+	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(right, right->start);
+	btrfs_set_header_generation(right, trans->transid);
+	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(right, root->root_key.objectid);
+	btrfs_set_header_level(right, 0);
+	write_extent_buffer(right, fs_info->fsid,
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
+
+	write_extent_buffer(right, fs_info->chunk_tree_uuid,
+			    btrfs_header_chunk_tree_uuid(right),
+			    BTRFS_UUID_SIZE);
+
+	if (split == 0) {
+		if (mid <= slot) {
+			btrfs_set_header_nritems(right, 0);
+			insert_ptr(trans, root, path, &disk_key, right->start,
+				   path->slots[1] + 1, 1);
+			btrfs_tree_unlock(path->nodes[0]);
+			free_extent_buffer(path->nodes[0]);
+			path->nodes[0] = right;
+			path->slots[0] = 0;
+			path->slots[1] += 1;
+		} else {
+			btrfs_set_header_nritems(right, 0);
+			insert_ptr(trans, root, path, &disk_key, right->start,
+					  path->slots[1], 1);
+			btrfs_tree_unlock(path->nodes[0]);
+			free_extent_buffer(path->nodes[0]);
+			path->nodes[0] = right;
+			path->slots[0] = 0;
+			if (path->slots[1] == 0)
+				fixup_low_keys(fs_info, path, &disk_key, 1);
+		}
+		btrfs_mark_buffer_dirty(right);
+		return ret;
+	}
+
+	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
+
+	if (split == 2) {
+		BUG_ON(num_doubles != 0);
+		num_doubles++;
+		goto again;
+	}
+
+	return 0;
+
+push_for_double:
+	push_for_double_split(trans, root, path, data_size);
+	tried_avoid_double = 1;
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+	goto again;
+}
+
+static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct btrfs_path *path, int ins_len)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	u64 extent_len = 0;
+	u32 item_size;
+	int ret;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+	       key.type != BTRFS_EXTENT_CSUM_KEY);
+
+	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
+		return 0;
+
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (key.type == BTRFS_EXTENT_DATA_KEY) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+	}
+	btrfs_release_path(path);
+
+	path->keep_locks = 1;
+	path->search_for_split = 1;
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	path->search_for_split = 0;
+	if (ret > 0)
+		ret = -EAGAIN;
+	if (ret < 0)
+		goto err;
+
+	ret = -EAGAIN;
+	leaf = path->nodes[0];
+	/* if our item isn't there, return now */
+	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+		goto err;
+
+	/* the leaf has  changed, it now has room.  return now */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+		goto err;
+
+	if (key.type == BTRFS_EXTENT_DATA_KEY) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
+			goto err;
+	}
+
+	btrfs_set_path_blocking(path);
+	ret = split_leaf(trans, root, &key, path, ins_len, 1);
+	if (ret)
+		goto err;
+
+	path->keep_locks = 0;
+	btrfs_unlock_up_safe(path, 1);
+	return 0;
+err:
+	path->keep_locks = 0;
+	return ret;
+}
+
+static noinline int split_item(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_path *path,
+			       struct btrfs_key *new_key,
+			       unsigned long split_offset)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+	struct btrfs_item *new_item;
+	int slot;
+	char *buf;
+	u32 nritems;
+	u32 item_size;
+	u32 orig_offset;
+	struct btrfs_disk_key disk_key;
+
+	leaf = path->nodes[0];
+	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+	btrfs_set_path_blocking(path);
+
+	item = btrfs_item_nr(path->slots[0]);
+	orig_offset = btrfs_item_offset(leaf, item);
+	item_size = btrfs_item_size(leaf, item);
+
+	buf = kmalloc(item_size, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
+	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
+			    path->slots[0]), item_size);
+
+	slot = path->slots[0] + 1;
+	nritems = btrfs_header_nritems(leaf);
+	if (slot != nritems) {
+		/* shift the items */
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+				btrfs_item_nr_offset(slot),
+				(nritems - slot) * sizeof(struct btrfs_item));
+	}
+
+	btrfs_cpu_key_to_disk(&disk_key, new_key);
+	btrfs_set_item_key(leaf, &disk_key, slot);
+
+	new_item = btrfs_item_nr(slot);
+
+	btrfs_set_item_offset(leaf, new_item, orig_offset);
+	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+
+	btrfs_set_item_offset(leaf, item,
+			      orig_offset + item_size - split_offset);
+	btrfs_set_item_size(leaf, item, split_offset);
+
+	btrfs_set_header_nritems(leaf, nritems + 1);
+
+	/* write the data for the start of the original item */
+	write_extent_buffer(leaf, buf,
+			    btrfs_item_ptr_offset(leaf, path->slots[0]),
+			    split_offset);
+
+	/* write the data for the new item */
+	write_extent_buffer(leaf, buf + split_offset,
+			    btrfs_item_ptr_offset(leaf, slot),
+			    item_size - split_offset);
+	btrfs_mark_buffer_dirty(leaf);
+
+	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
+	kfree(buf);
+	return 0;
+}
+
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation.  After
+ * the split, the path is pointing to the old item.  The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be smaller enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_path *path,
+		     struct btrfs_key *new_key,
+		     unsigned long split_offset)
+{
+	int ret;
+	ret = setup_leaf_for_split(trans, root, path,
+				   sizeof(struct btrfs_item));
+	if (ret)
+		return ret;
+
+	ret = split_item(trans, root, path, new_key, split_offset);
+	return ret;
+}
+
+/*
+ * This function duplicate a item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item
+ * is contiguous with the original item.
+ *
+ * This allows us to split file extent in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 struct btrfs_path *path,
+			 struct btrfs_key *new_key)
+{
+	struct extent_buffer *leaf;
+	int ret;
+	u32 item_size;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ret = setup_leaf_for_split(trans, root, path,
+				   item_size + sizeof(struct btrfs_item));
+	if (ret)
+		return ret;
+
+	path->slots[0]++;
+	setup_items_for_insert(root, path, new_key, &item_size,
+			       item_size, item_size +
+			       sizeof(struct btrfs_item), 1);
+	leaf = path->nodes[0];
+	memcpy_extent_buffer(leaf,
+			     btrfs_item_ptr_offset(leaf, path->slots[0]),
+			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+			     item_size);
+	return 0;
+}
+
+/*
+ * make the item pointed to by the path smaller.  new_size indicates
+ * how small to make it, and from_end tells us if we just chop bytes
+ * off the end of the item or if we shift the item to chop bytes off
+ * the front.
+ */
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
+			 u32 new_size, int from_end)
+{
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+	u32 nritems;
+	unsigned int data_end;
+	unsigned int old_data_start;
+	unsigned int old_size;
+	unsigned int size_diff;
+	int i;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+
+	old_size = btrfs_item_size_nr(leaf, slot);
+	if (old_size == new_size)
+		return;
+
+	nritems = btrfs_header_nritems(leaf);
+	data_end = leaf_data_end(root, leaf);
+
+	old_data_start = btrfs_item_offset_nr(leaf, slot);
+
+	size_diff = old_size - new_size;
+
+	BUG_ON(slot < 0);
+	BUG_ON(slot >= nritems);
+
+	/*
+	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
+	 */
+	/* first correct the data pointers */
+	for (i = slot; i < nritems; i++) {
+		u32 ioff;
+		item = btrfs_item_nr(i);
+
+		ioff = btrfs_token_item_offset(leaf, item, &token);
+		btrfs_set_token_item_offset(leaf, item,
+					    ioff + size_diff, &token);
+	}
+
+	/* shift the data */
+	if (from_end) {
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+			      data_end + size_diff, btrfs_leaf_data(leaf) +
+			      data_end, old_data_start + new_size - data_end);
+	} else {
+		struct btrfs_disk_key disk_key;
+		u64 offset;
+
+		btrfs_item_key(leaf, &disk_key, slot);
+
+		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
+			unsigned long ptr;
+			struct btrfs_file_extent_item *fi;
+
+			fi = btrfs_item_ptr(leaf, slot,
+					    struct btrfs_file_extent_item);
+			fi = (struct btrfs_file_extent_item *)(
+			     (unsigned long)fi - size_diff);
+
+			if (btrfs_file_extent_type(leaf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE) {
+				ptr = btrfs_item_ptr_offset(leaf, slot);
+				memmove_extent_buffer(leaf, ptr,
+				      (unsigned long)fi,
+				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
+			}
+		}
+
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+			      data_end + size_diff, btrfs_leaf_data(leaf) +
+			      data_end, old_data_start - data_end);
+
+		offset = btrfs_disk_key_offset(&disk_key);
+		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
+		btrfs_set_item_key(leaf, &disk_key, slot);
+		if (slot == 0)
+			fixup_low_keys(root->fs_info, path, &disk_key, 1);
+	}
+
+	item = btrfs_item_nr(slot);
+	btrfs_set_item_size(leaf, item, new_size);
+	btrfs_mark_buffer_dirty(leaf);
+
+	if (btrfs_leaf_free_space(root, leaf) < 0) {
+		btrfs_print_leaf(root, leaf);
+		BUG();
+	}
+}
+
+/*
+ * make the item pointed to by the path bigger, data_size is the added size.
+ */
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+		       u32 data_size)
+{
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+	u32 nritems;
+	unsigned int data_end;
+	unsigned int old_data;
+	unsigned int old_size;
+	int i;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	leaf = path->nodes[0];
+
+	nritems = btrfs_header_nritems(leaf);
+	data_end = leaf_data_end(root, leaf);
+
+	if (btrfs_leaf_free_space(root, leaf) < data_size) {
+		btrfs_print_leaf(root, leaf);
+		BUG();
+	}
+	slot = path->slots[0];
+	old_data = btrfs_item_end_nr(leaf, slot);
+
+	BUG_ON(slot < 0);
+	if (slot >= nritems) {
+		btrfs_print_leaf(root, leaf);
+		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
+		       slot, nritems);
+		BUG_ON(1);
+	}
+
+	/*
+	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
+	 */
+	/* first correct the data pointers */
+	for (i = slot; i < nritems; i++) {
+		u32 ioff;
+		item = btrfs_item_nr(i);
+
+		ioff = btrfs_token_item_offset(leaf, item, &token);
+		btrfs_set_token_item_offset(leaf, item,
+					    ioff - data_size, &token);
+	}
+
+	/* shift the data */
+	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+		      data_end - data_size, btrfs_leaf_data(leaf) +
+		      data_end, old_data - data_end);
+
+	data_end = old_data;
+	old_size = btrfs_item_size_nr(leaf, slot);
+	item = btrfs_item_nr(slot);
+	btrfs_set_item_size(leaf, item, old_size + data_size);
+	btrfs_mark_buffer_dirty(leaf);
+
+	if (btrfs_leaf_free_space(root, leaf) < 0) {
+		btrfs_print_leaf(root, leaf);
+		BUG();
+	}
+}
+
+/*
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
+ */
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    u32 total_data, u32 total_size, int nr)
+{
+	struct btrfs_item *item;
+	int i;
+	u32 nritems;
+	unsigned int data_end;
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *leaf;
+	int slot;
+	struct btrfs_map_token token;
+
+	if (path->slots[0] == 0) {
+		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+		fixup_low_keys(root->fs_info, path, &disk_key, 1);
+	}
+	btrfs_unlock_up_safe(path, 1);
+
+	btrfs_init_map_token(&token);
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+
+	nritems = btrfs_header_nritems(leaf);
+	data_end = leaf_data_end(root, leaf);
+
+	if (btrfs_leaf_free_space(root, leaf) < total_size) {
+		btrfs_print_leaf(root, leaf);
+		btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
+		       total_size, btrfs_leaf_free_space(root, leaf));
+		BUG();
+	}
+
+	if (slot != nritems) {
+		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+
+		if (old_data < data_end) {
+			btrfs_print_leaf(root, leaf);
+			btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
+			       slot, old_data, data_end);
+			BUG_ON(1);
+		}
+		/*
+		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
+		 */
+		/* first correct the data pointers */
+		for (i = slot; i < nritems; i++) {
+			u32 ioff;
+
+			item = btrfs_item_nr( i);
+			ioff = btrfs_token_item_offset(leaf, item, &token);
+			btrfs_set_token_item_offset(leaf, item,
+						    ioff - total_data, &token);
+		}
+		/* shift the items */
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+			      btrfs_item_nr_offset(slot),
+			      (nritems - slot) * sizeof(struct btrfs_item));
+
+		/* shift the data */
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+			      data_end - total_data, btrfs_leaf_data(leaf) +
+			      data_end, old_data - data_end);
+		data_end = old_data;
+	}
+
+	/* setup the item for the new data */
+	for (i = 0; i < nr; i++) {
+		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+		btrfs_set_item_key(leaf, &disk_key, slot + i);
+		item = btrfs_item_nr(slot + i);
+		btrfs_set_token_item_offset(leaf, item,
+					    data_end - data_size[i], &token);
+		data_end -= data_size[i];
+		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
+	}
+
+	btrfs_set_header_nritems(leaf, nritems + nr);
+	btrfs_mark_buffer_dirty(leaf);
+
+	if (btrfs_leaf_free_space(root, leaf) < 0) {
+		btrfs_print_leaf(root, leaf);
+		BUG();
+	}
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    int nr)
+{
+	int ret = 0;
+	int slot;
+	int i;
+	u32 total_size = 0;
+	u32 total_data = 0;
+
+	for (i = 0; i < nr; i++)
+		total_data += data_size[i];
+
+	total_size = total_data + (nr * sizeof(struct btrfs_item));
+	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+	if (ret == 0)
+		return -EEXIST;
+	if (ret < 0)
+		return ret;
+
+	slot = path->slots[0];
+	BUG_ON(slot < 0);
+
+	setup_items_for_insert(root, path, cpu_key, data_size,
+			       total_data, total_size, nr);
+	return 0;
+}
+
+/*
+ * Given a key and some data, insert an item into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *cpu_key, void *data, u32
+		      data_size)
+{
+	int ret = 0;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+	if (!ret) {
+		leaf = path->nodes[0];
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		write_extent_buffer(leaf, data, ptr, data_size);
+		btrfs_mark_buffer_dirty(leaf);
+	}
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * delete the pointer from a given node.
+ *
+ * the tree should have been previously balanced so the deletion does not
+ * empty a node.
+ */
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+		    int level, int slot)
+{
+	struct extent_buffer *parent = path->nodes[level];
+	u32 nritems;
+	int ret;
+
+	nritems = btrfs_header_nritems(parent);
+	if (slot != nritems - 1) {
+		if (level)
+			tree_mod_log_eb_move(root->fs_info, parent, slot,
+					     slot + 1, nritems - slot - 1);
+		memmove_extent_buffer(parent,
+			      btrfs_node_key_ptr_offset(slot),
+			      btrfs_node_key_ptr_offset(slot + 1),
+			      sizeof(struct btrfs_key_ptr) *
+			      (nritems - slot - 1));
+	} else if (level) {
+		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
+		BUG_ON(ret < 0);
+	}
+
+	nritems--;
+	btrfs_set_header_nritems(parent, nritems);
+	if (nritems == 0 && parent == root->node) {
+		BUG_ON(btrfs_header_level(root->node) != 1);
+		/* just turn the root into a leaf and break */
+		btrfs_set_header_level(root->node, 0);
+	} else if (slot == 0) {
+		struct btrfs_disk_key disk_key;
+
+		btrfs_node_key(parent, &disk_key, 0);
+		fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
+	}
+	btrfs_mark_buffer_dirty(parent);
+}
+
+/*
+ * a helper function to delete the leaf pointed to by path->slots[1] and
+ * path->nodes[1].
+ *
+ * This deletes the pointer in path->nodes[1] and frees the leaf
+ * block extent.  zero is returned if it all worked out, < 0 otherwise.
+ *
+ * The path must have already been setup for deleting the leaf, including
+ * all the proper balancing.  path->nodes[1] must be locked.
+ */
+static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *leaf)
+{
+	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
+	del_ptr(root, path, 1, path->slots[1]);
+
+	/*
+	 * btrfs_free_extent is expensive, we want to make sure we
+	 * aren't holding any locks when we call it
+	 */
+	btrfs_unlock_up_safe(path, 0);
+
+	root_sub_used(root, leaf->len);
+
+	extent_buffer_get(leaf);
+	btrfs_free_tree_block(trans, root, leaf, 0, 1);
+	free_extent_buffer_stale(leaf);
+}
+/*
+ * delete the item at the leaf level in path.  If that empties
+ * the leaf, remove it from the tree
+ */
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct btrfs_path *path, int slot, int nr)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+	int last_off;
+	int dsize = 0;
+	int ret = 0;
+	int wret;
+	int i;
+	u32 nritems;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	leaf = path->nodes[0];
+	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+
+	for (i = 0; i < nr; i++)
+		dsize += btrfs_item_size_nr(leaf, slot + i);
+
+	nritems = btrfs_header_nritems(leaf);
+
+	if (slot + nr != nritems) {
+		int data_end = leaf_data_end(root, leaf);
+
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+			      data_end + dsize,
+			      btrfs_leaf_data(leaf) + data_end,
+			      last_off - data_end);
+
+		for (i = slot + nr; i < nritems; i++) {
+			u32 ioff;
+
+			item = btrfs_item_nr(i);
+			ioff = btrfs_token_item_offset(leaf, item, &token);
+			btrfs_set_token_item_offset(leaf, item,
+						    ioff + dsize, &token);
+		}
+
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
+			      btrfs_item_nr_offset(slot + nr),
+			      sizeof(struct btrfs_item) *
+			      (nritems - slot - nr));
+	}
+	btrfs_set_header_nritems(leaf, nritems - nr);
+	nritems -= nr;
+
+	/* delete the leaf if we've emptied it */
+	if (nritems == 0) {
+		if (leaf == root->node) {
+			btrfs_set_header_level(leaf, 0);
+		} else {
+			btrfs_set_path_blocking(path);
+			clean_tree_block(trans, root->fs_info, leaf);
+			btrfs_del_leaf(trans, root, path, leaf);
+		}
+	} else {
+		int used = leaf_space_used(leaf, 0, nritems);
+		if (slot == 0) {
+			struct btrfs_disk_key disk_key;
+
+			btrfs_item_key(leaf, &disk_key, 0);
+			fixup_low_keys(root->fs_info, path, &disk_key, 1);
+		}
+
+		/* delete the leaf if it is mostly empty */
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
+			/* push_leaf_left fixes the path.
+			 * make sure the path still points to our leaf
+			 * for possible call to del_ptr below
+			 */
+			slot = path->slots[1];
+			extent_buffer_get(leaf);
+
+			btrfs_set_path_blocking(path);
+			wret = push_leaf_left(trans, root, path, 1, 1,
+					      1, (u32)-1);
+			if (wret < 0 && wret != -ENOSPC)
+				ret = wret;
+
+			if (path->nodes[0] == leaf &&
+			    btrfs_header_nritems(leaf)) {
+				wret = push_leaf_right(trans, root, path, 1,
+						       1, 1, 0);
+				if (wret < 0 && wret != -ENOSPC)
+					ret = wret;
+			}
+
+			if (btrfs_header_nritems(leaf) == 0) {
+				path->slots[1] = slot;
+				btrfs_del_leaf(trans, root, path, leaf);
+				free_extent_buffer(leaf);
+				ret = 0;
+			} else {
+				/* if we're still in the path, make sure
+				 * we're dirty.  Otherwise, one of the
+				 * push_leaf functions must have already
+				 * dirtied this buffer
+				 */
+				if (path->nodes[0] == leaf)
+					btrfs_mark_buffer_dirty(leaf);
+				free_extent_buffer(leaf);
+			}
+		} else {
+			btrfs_mark_buffer_dirty(leaf);
+		}
+	}
+	return ret;
+}
+
+/*
+ * search the tree again to find a leaf with lesser keys
+ * returns 0 if it found something or 1 if there are no lesser leaves.
+ * returns < 0 on io errors.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
+ */
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	struct btrfs_disk_key found_key;
+	int ret;
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+
+	if (key.offset > 0) {
+		key.offset--;
+	} else if (key.type > 0) {
+		key.type--;
+		key.offset = (u64)-1;
+	} else if (key.objectid > 0) {
+		key.objectid--;
+		key.type = (u8)-1;
+		key.offset = (u64)-1;
+	} else {
+		return 1;
+	}
+
+	btrfs_release_path(path);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+	btrfs_item_key(path->nodes[0], &found_key, 0);
+	ret = comp_keys(&found_key, &key);
+	/*
+	 * We might have had an item with the previous key in the tree right
+	 * before we released our path. And after we released our path, that
+	 * item might have been pushed to the first slot (0) of the leaf we
+	 * were holding due to a tree balance. Alternatively, an item with the
+	 * previous key can exist as the only element of a leaf (big fat item).
+	 * Therefore account for these 2 cases, so that our callers (like
+	 * btrfs_previous_item) don't miss an existing item with a key matching
+	 * the previous key we computed above.
+	 */
+	if (ret <= 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * A helper function to walk down the tree starting at min_key, and looking
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
+ *
+ * This does not cow, but it does stuff the starting key it finds back
+ * into min_key, so you can call btrfs_search_slot with cow=1 on the
+ * key and get a writable path.
+ *
+ * This does lock as it descends, and path->keep_locks should be set
+ * to 1 by the caller.
+ *
+ * This honors path->lowest_level to prevent descent past a given level
+ * of the tree.
+ *
+ * min_trans indicates the oldest transaction that you are interested
+ * in walking through.  Any nodes or leaves older than min_trans are
+ * skipped over (without reading them).
+ *
+ * returns zero if something useful was found, < 0 on error and 1 if there
+ * was nothing in the tree that matched the search criteria.
+ */
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+			 struct btrfs_path *path,
+			 u64 min_trans)
+{
+	struct extent_buffer *cur;
+	struct btrfs_key found_key;
+	int slot;
+	int sret;
+	u32 nritems;
+	int level;
+	int ret = 1;
+	int keep_locks = path->keep_locks;
+
+	path->keep_locks = 1;
+again:
+	cur = btrfs_read_lock_root_node(root);
+	level = btrfs_header_level(cur);
+	WARN_ON(path->nodes[level]);
+	path->nodes[level] = cur;
+	path->locks[level] = BTRFS_READ_LOCK;
+
+	if (btrfs_header_generation(cur) < min_trans) {
+		ret = 1;
+		goto out;
+	}
+	while (1) {
+		nritems = btrfs_header_nritems(cur);
+		level = btrfs_header_level(cur);
+		sret = bin_search(cur, min_key, level, &slot);
+
+		/* at the lowest level, we're done, setup the path and exit */
+		if (level == path->lowest_level) {
+			if (slot >= nritems)
+				goto find_next_key;
+			ret = 0;
+			path->slots[level] = slot;
+			btrfs_item_key_to_cpu(cur, &found_key, slot);
+			goto out;
+		}
+		if (sret && slot > 0)
+			slot--;
+		/*
+		 * check this node pointer against the min_trans parameters.
+		 * If it is too old, old, skip to the next one.
+		 */
+		while (slot < nritems) {
+			u64 gen;
+
+			gen = btrfs_node_ptr_generation(cur, slot);
+			if (gen < min_trans) {
+				slot++;
+				continue;
+			}
+			break;
+		}
+find_next_key:
+		/*
+		 * we didn't find a candidate key in this node, walk forward
+		 * and find another one
+		 */
+		if (slot >= nritems) {
+			path->slots[level] = slot;
+			btrfs_set_path_blocking(path);
+			sret = btrfs_find_next_key(root, path, min_key, level,
+						  min_trans);
+			if (sret == 0) {
+				btrfs_release_path(path);
+				goto again;
+			} else {
+				goto out;
+			}
+		}
+		/* save our key for returning back */
+		btrfs_node_key_to_cpu(cur, &found_key, slot);
+		path->slots[level] = slot;
+		if (level == path->lowest_level) {
+			ret = 0;
+			goto out;
+		}
+		btrfs_set_path_blocking(path);
+		cur = read_node_slot(root, cur, slot);
+		BUG_ON(!cur); /* -ENOMEM */
+
+		btrfs_tree_read_lock(cur);
+
+		path->locks[level - 1] = BTRFS_READ_LOCK;
+		path->nodes[level - 1] = cur;
+		unlock_up(path, level, 1, 0, NULL);
+		btrfs_clear_path_blocking(path, NULL, 0);
+	}
+out:
+	path->keep_locks = keep_locks;
+	if (ret == 0) {
+		btrfs_unlock_up_safe(path, path->lowest_level + 1);
+		btrfs_set_path_blocking(path);
+		memcpy(min_key, &found_key, sizeof(found_key));
+	}
+	return ret;
+}
+
+static void tree_move_down(struct btrfs_root *root,
+			   struct btrfs_path *path,
+			   int *level, int root_level)
+{
+	BUG_ON(*level == 0);
+	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
+					path->slots[*level]);
+	path->slots[*level - 1] = 0;
+	(*level)--;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    int *level, int root_level)
+{
+	int ret = 0;
+	int nritems;
+	nritems = btrfs_header_nritems(path->nodes[*level]);
+
+	path->slots[*level]++;
+
+	while (path->slots[*level] >= nritems) {
+		if (*level == root_level)
+			return -1;
+
+		/* move upnext */
+		path->slots[*level] = 0;
+		free_extent_buffer(path->nodes[*level]);
+		path->nodes[*level] = NULL;
+		(*level)++;
+		path->slots[*level]++;
+
+		nritems = btrfs_header_nritems(path->nodes[*level]);
+		ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_root *root,
+			struct btrfs_path *path,
+			int *level, int root_level,
+			int allow_down,
+			struct btrfs_key *key)
+{
+	int ret;
+
+	if (*level == 0 || !allow_down) {
+		ret = tree_move_next_or_upnext(root, path, level, root_level);
+	} else {
+		tree_move_down(root, path, level, root_level);
+		ret = 0;
+	}
+	if (ret >= 0) {
+		if (*level == 0)
+			btrfs_item_key_to_cpu(path->nodes[*level], key,
+					path->slots[*level]);
+		else
+			btrfs_node_key_to_cpu(path->nodes[*level], key,
+					path->slots[*level]);
+	}
+	return ret;
+}
+
+static int tree_compare_item(struct btrfs_root *left_root,
+			     struct btrfs_path *left_path,
+			     struct btrfs_path *right_path,
+			     char *tmp_buf)
+{
+	int cmp;
+	int len1, len2;
+	unsigned long off1, off2;
+
+	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+	if (len1 != len2)
+		return 1;
+
+	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+				right_path->slots[0]);
+
+	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+	if (cmp)
+		return 1;
+	return 0;
+}
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+int btrfs_compare_trees(struct btrfs_root *left_root,
+			struct btrfs_root *right_root,
+			btrfs_changed_cb_t changed_cb, void *ctx)
+{
+	int ret;
+	int cmp;
+	struct btrfs_path *left_path = NULL;
+	struct btrfs_path *right_path = NULL;
+	struct btrfs_key left_key;
+	struct btrfs_key right_key;
+	char *tmp_buf = NULL;
+	int left_root_level;
+	int right_root_level;
+	int left_level;
+	int right_level;
+	int left_end_reached;
+	int right_end_reached;
+	int advance_left;
+	int advance_right;
+	u64 left_blockptr;
+	u64 right_blockptr;
+	u64 left_gen;
+	u64 right_gen;
+
+	left_path = btrfs_alloc_path();
+	if (!left_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	right_path = btrfs_alloc_path();
+	if (!right_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
+	if (!tmp_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	left_path->search_commit_root = 1;
+	left_path->skip_locking = 1;
+	right_path->search_commit_root = 1;
+	right_path->skip_locking = 1;
+
+	/*
+	 * Strategy: Go to the first items of both trees. Then do
+	 *
+	 * If both trees are at level 0
+	 *   Compare keys of current items
+	 *     If left < right treat left item as new, advance left tree
+	 *       and repeat
+	 *     If left > right treat right item as deleted, advance right tree
+	 *       and repeat
+	 *     If left == right do deep compare of items, treat as changed if
+	 *       needed, advance both trees and repeat
+	 * If both trees are at the same level but not at level 0
+	 *   Compare keys of current nodes/leafs
+	 *     If left < right advance left tree and repeat
+	 *     If left > right advance right tree and repeat
+	 *     If left == right compare blockptrs of the next nodes/leafs
+	 *       If they match advance both trees but stay at the same level
+	 *         and repeat
+	 *       If they don't match advance both trees while allowing to go
+	 *         deeper and repeat
+	 * If tree levels are different
+	 *   Advance the tree that needs it and repeat
+	 *
+	 * Advancing a tree means:
+	 *   If we are at level 0, try to go to the next slot. If that's not
+	 *   possible, go one level up and repeat. Stop when we found a level
+	 *   where we could go to the next slot. We may at this point be on a
+	 *   node or a leaf.
+	 *
+	 *   If we are not at level 0 and not on shared tree blocks, go one
+	 *   level deeper.
+	 *
+	 *   If we are not at level 0 and on shared tree blocks, go one slot to
+	 *   the right if possible or go up and right.
+	 */
+
+	down_read(&left_root->fs_info->commit_root_sem);
+	left_level = btrfs_header_level(left_root->commit_root);
+	left_root_level = left_level;
+	left_path->nodes[left_level] = left_root->commit_root;
+	extent_buffer_get(left_path->nodes[left_level]);
+
+	right_level = btrfs_header_level(right_root->commit_root);
+	right_root_level = right_level;
+	right_path->nodes[right_level] = right_root->commit_root;
+	extent_buffer_get(right_path->nodes[right_level]);
+	up_read(&left_root->fs_info->commit_root_sem);
+
+	if (left_level == 0)
+		btrfs_item_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	else
+		btrfs_node_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	if (right_level == 0)
+		btrfs_item_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+	else
+		btrfs_node_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+
+	left_end_reached = right_end_reached = 0;
+	advance_left = advance_right = 0;
+
+	while (1) {
+		if (advance_left && !left_end_reached) {
+			ret = tree_advance(left_root, left_path, &left_level,
+					left_root_level,
+					advance_left != ADVANCE_ONLY_NEXT,
+					&left_key);
+			if (ret < 0)
+				left_end_reached = ADVANCE;
+			advance_left = 0;
+		}
+		if (advance_right && !right_end_reached) {
+			ret = tree_advance(right_root, right_path, &right_level,
+					right_root_level,
+					advance_right != ADVANCE_ONLY_NEXT,
+					&right_key);
+			if (ret < 0)
+				right_end_reached = ADVANCE;
+			advance_right = 0;
+		}
+
+		if (left_end_reached && right_end_reached) {
+			ret = 0;
+			goto out;
+		} else if (left_end_reached) {
+			if (right_level == 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_right = ADVANCE;
+			continue;
+		} else if (right_end_reached) {
+			if (left_level == 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_left = ADVANCE;
+			continue;
+		}
+
+		if (left_level == 0 && right_level == 0) {
+			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+			if (cmp < 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if (ret < 0)
+					goto out;
+				advance_left = ADVANCE;
+			} else if (cmp > 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+				advance_right = ADVANCE;
+			} else {
+				enum btrfs_compare_tree_result result;
+
+				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+				ret = tree_compare_item(left_root, left_path,
+						right_path, tmp_buf);
+				if (ret)
+					result = BTRFS_COMPARE_TREE_CHANGED;
+				else
+					result = BTRFS_COMPARE_TREE_SAME;
+				ret = changed_cb(left_root, right_root,
+						 left_path, right_path,
+						 &left_key, result, ctx);
+				if (ret < 0)
+					goto out;
+				advance_left = ADVANCE;
+				advance_right = ADVANCE;
+			}
+		} else if (left_level == right_level) {
+			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+			if (cmp < 0) {
+				advance_left = ADVANCE;
+			} else if (cmp > 0) {
+				advance_right = ADVANCE;
+			} else {
+				left_blockptr = btrfs_node_blockptr(
+						left_path->nodes[left_level],
+						left_path->slots[left_level]);
+				right_blockptr = btrfs_node_blockptr(
+						right_path->nodes[right_level],
+						right_path->slots[right_level]);
+				left_gen = btrfs_node_ptr_generation(
+						left_path->nodes[left_level],
+						left_path->slots[left_level]);
+				right_gen = btrfs_node_ptr_generation(
+						right_path->nodes[right_level],
+						right_path->slots[right_level]);
+				if (left_blockptr == right_blockptr &&
+				    left_gen == right_gen) {
+					/*
+					 * As we're on a shared block, don't
+					 * allow to go deeper.
+					 */
+					advance_left = ADVANCE_ONLY_NEXT;
+					advance_right = ADVANCE_ONLY_NEXT;
+				} else {
+					advance_left = ADVANCE;
+					advance_right = ADVANCE;
+				}
+			}
+		} else if (left_level < right_level) {
+			advance_right = ADVANCE;
+		} else {
+			advance_left = ADVANCE;
+		}
+	}
+
+out:
+	btrfs_free_path(left_path);
+	btrfs_free_path(right_path);
+	kfree(tmp_buf);
+	return ret;
+}
+
+/*
+ * this is similar to btrfs_next_leaf, but does not try to preserve
+ * and fixup the path.  It looks for and returns the next key in the
+ * tree based on the current path and the min_trans parameters.
+ *
+ * 0 is returned if another key is found, < 0 if there are any errors
+ * and 1 is returned if there are no higher keys in the tree
+ *
+ * path->keep_locks should be set to 1 on the search made before
+ * calling this function.
+ */
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+			struct btrfs_key *key, int level, u64 min_trans)
+{
+	int slot;
+	struct extent_buffer *c;
+
+	WARN_ON(!path->keep_locks);
+	while (level < BTRFS_MAX_LEVEL) {
+		if (!path->nodes[level])
+			return 1;
+
+		slot = path->slots[level] + 1;
+		c = path->nodes[level];
+next:
+		if (slot >= btrfs_header_nritems(c)) {
+			int ret;
+			int orig_lowest;
+			struct btrfs_key cur_key;
+			if (level + 1 >= BTRFS_MAX_LEVEL ||
+			    !path->nodes[level + 1])
+				return 1;
+
+			if (path->locks[level + 1]) {
+				level++;
+				continue;
+			}
+
+			slot = btrfs_header_nritems(c) - 1;
+			if (level == 0)
+				btrfs_item_key_to_cpu(c, &cur_key, slot);
+			else
+				btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+			orig_lowest = path->lowest_level;
+			btrfs_release_path(path);
+			path->lowest_level = level;
+			ret = btrfs_search_slot(NULL, root, &cur_key, path,
+						0, 0);
+			path->lowest_level = orig_lowest;
+			if (ret < 0)
+				return ret;
+
+			c = path->nodes[level];
+			slot = path->slots[level];
+			if (ret == 0)
+				slot++;
+			goto next;
+		}
+
+		if (level == 0)
+			btrfs_item_key_to_cpu(c, key, slot);
+		else {
+			u64 gen = btrfs_node_ptr_generation(c, slot);
+
+			if (gen < min_trans) {
+				slot++;
+				goto next;
+			}
+			btrfs_node_key_to_cpu(c, key, slot);
+		}
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * search the tree again to find a leaf with greater keys
+ * returns 0 if it found something or 1 if there are no greater leaves.
+ * returns < 0 on io errors.
+ */
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+	return btrfs_next_old_leaf(root, path, 0);
+}
+
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+			u64 time_seq)
+{
+	int slot;
+	int level;
+	struct extent_buffer *c;
+	struct extent_buffer *next;
+	struct btrfs_key key;
+	u32 nritems;
+	int ret;
+	int old_spinning = path->leave_spinning;
+	int next_rw_lock = 0;
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	if (nritems == 0)
+		return 1;
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+	level = 1;
+	next = NULL;
+	next_rw_lock = 0;
+	btrfs_release_path(path);
+
+	path->keep_locks = 1;
+	path->leave_spinning = 1;
+
+	if (time_seq)
+		ret = btrfs_search_old_slot(root, &key, path, time_seq);
+	else
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	path->keep_locks = 0;
+
+	if (ret < 0)
+		return ret;
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	/*
+	 * by releasing the path above we dropped all our locks.  A balance
+	 * could have added more items next to the key that used to be
+	 * at the very end of the block.  So, check again here and
+	 * advance the path if there are now more items available.
+	 */
+	if (nritems > 0 && path->slots[0] < nritems - 1) {
+		if (ret == 0)
+			path->slots[0]++;
+		ret = 0;
+		goto done;
+	}
+	/*
+	 * So the above check misses one case:
+	 * - after releasing the path above, someone has removed the item that
+	 *   used to be at the very end of the block, and balance between leafs
+	 *   gets another one with bigger key.offset to replace it.
+	 *
+	 * This one should be returned as well, or we can get leaf corruption
+	 * later(esp. in __btrfs_drop_extents()).
+	 *
+	 * And a bit more explanation about this check,
+	 * with ret > 0, the key isn't found, the path points to the slot
+	 * where it should be inserted, so the path->slots[0] item must be the
+	 * bigger one.
+	 */
+	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+		ret = 0;
+		goto done;
+	}
+
+	while (level < BTRFS_MAX_LEVEL) {
+		if (!path->nodes[level]) {
+			ret = 1;
+			goto done;
+		}
+
+		slot = path->slots[level] + 1;
+		c = path->nodes[level];
+		if (slot >= btrfs_header_nritems(c)) {
+			level++;
+			if (level == BTRFS_MAX_LEVEL) {
+				ret = 1;
+				goto done;
+			}
+			continue;
+		}
+
+		if (next) {
+			btrfs_tree_unlock_rw(next, next_rw_lock);
+			free_extent_buffer(next);
+		}
+
+		next = c;
+		next_rw_lock = path->locks[level];
+		ret = read_block_for_search(NULL, root, path, &next, level,
+					    slot, &key, 0);
+		if (ret == -EAGAIN)
+			goto again;
+
+		if (ret < 0) {
+			btrfs_release_path(path);
+			goto done;
+		}
+
+		if (!path->skip_locking) {
+			ret = btrfs_try_tree_read_lock(next);
+			if (!ret && time_seq) {
+				/*
+				 * If we don't get the lock, we may be racing
+				 * with push_leaf_left, holding that lock while
+				 * itself waiting for the leaf we've currently
+				 * locked. To solve this situation, we give up
+				 * on our lock and cycle.
+				 */
+				free_extent_buffer(next);
+				btrfs_release_path(path);
+				cond_resched();
+				goto again;
+			}
+			if (!ret) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_read_lock(next);
+				btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
+			}
+			next_rw_lock = BTRFS_READ_LOCK;
+		}
+		break;
+	}
+	path->slots[level] = slot;
+	while (1) {
+		level--;
+		c = path->nodes[level];
+		if (path->locks[level])
+			btrfs_tree_unlock_rw(c, path->locks[level]);
+
+		free_extent_buffer(c);
+		path->nodes[level] = next;
+		path->slots[level] = 0;
+		if (!path->skip_locking)
+			path->locks[level] = next_rw_lock;
+		if (!level)
+			break;
+
+		ret = read_block_for_search(NULL, root, path, &next, level,
+					    0, &key, 0);
+		if (ret == -EAGAIN)
+			goto again;
+
+		if (ret < 0) {
+			btrfs_release_path(path);
+			goto done;
+		}
+
+		if (!path->skip_locking) {
+			ret = btrfs_try_tree_read_lock(next);
+			if (!ret) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_read_lock(next);
+				btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
+			}
+			next_rw_lock = BTRFS_READ_LOCK;
+		}
+	}
+	ret = 0;
+done:
+	unlock_up(path, 0, 1, 0, NULL);
+	path->leave_spinning = old_spinning;
+	if (!old_spinning)
+		btrfs_set_path_blocking(path);
+
+	return ret;
+}
+
+/*
+ * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
+ * searching until it gets past min_objectid or finds an item of 'type'
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid,
+			int type)
+{
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int ret;
+
+	while (1) {
+		if (path->slots[0] == 0) {
+			btrfs_set_path_blocking(path);
+			ret = btrfs_prev_leaf(root, path);
+			if (ret != 0)
+				return ret;
+		} else {
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (nritems == 0)
+			return 1;
+		if (path->slots[0] == nritems)
+			path->slots[0]--;
+
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid < min_objectid)
+			break;
+		if (found_key.type == type)
+			return 0;
+		if (found_key.objectid == min_objectid &&
+		    found_key.type < type)
+			break;
+	}
+	return 1;
+}
+
+/*
+ * search in extent tree to find a previous Metadata/Data extent item with
+ * min objecitd.
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_extent_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid)
+{
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int ret;
+
+	while (1) {
+		if (path->slots[0] == 0) {
+			btrfs_set_path_blocking(path);
+			ret = btrfs_prev_leaf(root, path);
+			if (ret != 0)
+				return ret;
+		} else {
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (nritems == 0)
+			return 1;
+		if (path->slots[0] == nritems)
+			path->slots[0]--;
+
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid < min_objectid)
+			break;
+		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
+		    found_key.type == BTRFS_METADATA_ITEM_KEY)
+			return 0;
+		if (found_key.objectid == min_objectid &&
+		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
+			break;
+	}
+	return 1;
+}
diff --git a/kernel/fs/btrfs/ctree.h b/kernel/fs/btrfs/ctree.h
new file mode 100644
index 000000000..6f364e1d8
--- /dev/null
+++ b/kernel/fs/btrfs/ctree.h
@@ -0,0 +1,4247 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/completion.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/kobject.h>
+#include <trace/events/btrfs.h>
+#include <asm/kmap_types.h>
+#include <linux/pagemap.h>
+#include <linux/btrfs.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "async-thread.h"
+
+struct btrfs_trans_handle;
+struct btrfs_transaction;
+struct btrfs_pending_snapshot;
+extern struct kmem_cache *btrfs_trans_handle_cachep;
+extern struct kmem_cache *btrfs_transaction_cachep;
+extern struct kmem_cache *btrfs_bit_radix_cachep;
+extern struct kmem_cache *btrfs_path_cachep;
+extern struct kmem_cache *btrfs_free_space_cachep;
+struct btrfs_ordered_sum;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define STATIC noinline
+#else
+#define STATIC static noinline
+#endif
+
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
+
+#define BTRFS_MAX_MIRRORS 3
+
+#define BTRFS_MAX_LEVEL 8
+
+#define BTRFS_COMPAT_EXTENT_TREE_V0
+
+/* holds pointers to all of the tree roots */
+#define BTRFS_ROOT_TREE_OBJECTID 1ULL
+
+/* stores information about which extents are in use, and reference counts */
+#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
+
+/*
+ * stores information about which areas of a given device are in use.
+ * one per device.  The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 4ULL
+
+/* one per subvolume, storing files and directories */
+#define BTRFS_FS_TREE_OBJECTID 5ULL
+
+/* directory objectid inside the root tree */
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
+
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
+/* for storing items that use the BTRFS_UUID_KEY* types */
+#define BTRFS_UUID_TREE_OBJECTID 9ULL
+
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
+/* orhpan objectid for tracking unlinked/truncated files */
+#define BTRFS_ORPHAN_OBJECTID -5ULL
+
+/* does write ahead logging to speed up fsyncs */
+#define BTRFS_TREE_LOG_OBJECTID -6ULL
+#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
+
+/* for space balancing */
+#define BTRFS_TREE_RELOC_OBJECTID -8ULL
+#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
+
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+
+/*
+ * The inode number assigned to the special inode for storing
+ * free ino cache
+ */
+#define BTRFS_FREE_INO_OBJECTID -12ULL
+
+/* dummy objectid represents multiple objectids */
+#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
+
+/*
+ * All files have objectids in this range.
+ */
+#define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_LAST_FREE_OBJECTID -256ULL
+#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
+
+
+/*
+ * the device items go into the chunk tree.  The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
+#define BTRFS_DEV_REPLACE_DEVID 0ULL
+
+/*
+ * the max metadata block size.  This limit is somewhat artificial,
+ * but the memmove costs go through the roof for larger blocks.
+ */
+#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
+
+/*
+ * we can actually store much bigger names, but lets not confuse the rest
+ * of linux
+ */
+#define BTRFS_NAME_LEN 255
+
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
+/* 32 bytes in various csum fields */
+#define BTRFS_CSUM_SIZE 32
+
+/* csum types */
+#define BTRFS_CSUM_TYPE_CRC32	0
+
+static int btrfs_csum_sizes[] = { 4, 0 };
+
+/* four bytes for CRC32 */
+#define BTRFS_EMPTY_DIR_SIZE 0
+
+/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS	(1 << 30)
+
+#define BTRFS_FT_UNKNOWN	0
+#define BTRFS_FT_REG_FILE	1
+#define BTRFS_FT_DIR		2
+#define BTRFS_FT_CHRDEV		3
+#define BTRFS_FT_BLKDEV		4
+#define BTRFS_FT_FIFO		5
+#define BTRFS_FT_SOCK		6
+#define BTRFS_FT_SYMLINK	7
+#define BTRFS_FT_XATTR		8
+#define BTRFS_FT_MAX		9
+
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
+#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+
+/*
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
+ *
+ * offset is the starting byte offset for this key in the stream.
+ *
+ * btrfs_disk_key is in disk byte order.  struct btrfs_key is always
+ * in cpu native order.  Otherwise they are identical and their sizes
+ * should be the same (ie both packed)
+ */
+struct btrfs_disk_key {
+	__le64 objectid;
+	u8 type;
+	__le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_key {
+	u64 objectid;
+	u8 type;
+	u64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_mapping_tree {
+	struct extent_map_tree map_tree;
+};
+
+struct btrfs_dev_item {
+	/* the internal btrfs device id */
+	__le64 devid;
+
+	/* size of the device */
+	__le64 total_bytes;
+
+	/* bytes used */
+	__le64 bytes_used;
+
+	/* optimal io alignment for this device */
+	__le32 io_align;
+
+	/* optimal io width for this device */
+	__le32 io_width;
+
+	/* minimal io size for this device */
+	__le32 sector_size;
+
+	/* type and info about this device */
+	__le64 type;
+
+	/* expected generation for this device */
+	__le64 generation;
+
+	/*
+	 * starting byte of this partition on the device,
+	 * to allow for stripe alignment in the future
+	 */
+	__le64 start_offset;
+
+	/* grouping information for allocation decisions */
+	__le32 dev_group;
+
+	/* seek speed 0-100 where 100 is fastest */
+	u8 seek_speed;
+
+	/* bandwidth 0-100 where 100 is fastest */
+	u8 bandwidth;
+
+	/* btrfs generated uuid for this device */
+	u8 uuid[BTRFS_UUID_SIZE];
+
+	/* uuid of FS who owns this device */
+	u8 fsid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_stripe {
+	__le64 devid;
+	__le64 offset;
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_chunk {
+	/* size of this chunk in bytes */
+	__le64 length;
+
+	/* objectid of the root referencing this chunk */
+	__le64 owner;
+
+	__le64 stripe_len;
+	__le64 type;
+
+	/* optimal io alignment for this chunk */
+	__le32 io_align;
+
+	/* optimal io width for this chunk */
+	__le32 io_width;
+
+	/* minimal io size for this chunk */
+	__le32 sector_size;
+
+	/* 2^16 stripes is quite a lot, a second limit is the size of a single
+	 * item in the btree
+	 */
+	__le16 num_stripes;
+
+	/* sub stripes only matter for raid10 */
+	__le16 sub_stripes;
+	struct btrfs_stripe stripe;
+	/* additional stripes go here */
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_EXTENT	1
+#define BTRFS_FREE_SPACE_BITMAP	2
+
+struct btrfs_free_space_entry {
+	__le64 offset;
+	__le64 bytes;
+	u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_header {
+	struct btrfs_disk_key location;
+	__le64 generation;
+	__le64 num_entries;
+	__le64 num_bitmaps;
+} __attribute__ ((__packed__));
+
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+	BUG_ON(num_stripes == 0);
+	return sizeof(struct btrfs_chunk) +
+		sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
+#define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
+#define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
+
+/*
+ * File system states
+ */
+#define BTRFS_FS_STATE_ERROR		0
+#define BTRFS_FS_STATE_REMOUNTING	1
+#define BTRFS_FS_STATE_TRANS_ABORTED	2
+#define BTRFS_FS_STATE_DEV_REPLACING	3
+
+/* Super block flags */
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
+
+#define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
+#define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
+
+#define BTRFS_BACKREF_REV_MAX		256
+#define BTRFS_BACKREF_REV_SHIFT		56
+#define BTRFS_BACKREF_REV_MASK		(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
+					 BTRFS_BACKREF_REV_SHIFT)
+
+#define BTRFS_OLD_BACKREF_REV		0
+#define BTRFS_MIXED_BACKREF_REV		1
+
+/*
+ * every tree block (leaf or node) starts with this header.
+ */
+struct btrfs_header {
+	/* these first four must match the super block */
+	u8 csum[BTRFS_CSUM_SIZE];
+	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+	__le64 bytenr; /* which block this node is supposed to live in */
+	__le64 flags;
+
+	/* allowed to be different from the super from here on down */
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+	__le64 generation;
+	__le64 owner;
+	__le32 nritems;
+	u8 level;
+} __attribute__ ((__packed__));
+
+#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
+				      sizeof(struct btrfs_header)) / \
+				     sizeof(struct btrfs_key_ptr))
+#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START		\
+		(offsetof(struct btrfs_file_extent_item, disk_bytenr))
+#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+					sizeof(struct btrfs_item) - \
+					BTRFS_FILE_EXTENT_INLINE_DATA_START)
+#define BTRFS_MAX_XATTR_SIZE(r)	(BTRFS_LEAF_DATA_SIZE(r) - \
+				 sizeof(struct btrfs_item) -\
+				 sizeof(struct btrfs_dir_item))
+
+
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+#define BTRFS_LABEL_SIZE 256
+
+/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+	__le64 tree_root;
+	__le64 tree_root_gen;
+
+	__le64 chunk_root;
+	__le64 chunk_root_gen;
+
+	__le64 extent_root;
+	__le64 extent_root_gen;
+
+	__le64 fs_root;
+	__le64 fs_root_gen;
+
+	__le64 dev_root;
+	__le64 dev_root_gen;
+
+	__le64 csum_root;
+	__le64 csum_root_gen;
+
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 num_devices;
+	/* future */
+	__le64 unused_64[4];
+
+	u8 tree_root_level;
+	u8 chunk_root_level;
+	u8 extent_root_level;
+	u8 fs_root_level;
+	u8 dev_root_level;
+	u8 csum_root_level;
+	/* future and to align */
+	u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
+ * the super block basically lists the main trees of the FS
+ * it currently lacks any block count etc etc
+ */
+struct btrfs_super_block {
+	u8 csum[BTRFS_CSUM_SIZE];
+	/* the first 4 fields must match struct btrfs_header */
+	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
+	__le64 bytenr; /* this block number */
+	__le64 flags;
+
+	/* allowed to be different from the btrfs_header from here own down */
+	__le64 magic;
+	__le64 generation;
+	__le64 root;
+	__le64 chunk_root;
+	__le64 log_root;
+
+	/* this will help find the new super based on the log root */
+	__le64 log_root_transid;
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 root_dir_objectid;
+	__le64 num_devices;
+	__le32 sectorsize;
+	__le32 nodesize;
+	__le32 __unused_leafsize;
+	__le32 stripesize;
+	__le32 sys_chunk_array_size;
+	__le64 chunk_root_generation;
+	__le64 compat_flags;
+	__le64 compat_ro_flags;
+	__le64 incompat_flags;
+	__le16 csum_type;
+	u8 root_level;
+	u8 chunk_root_level;
+	u8 log_root_level;
+	struct btrfs_dev_item dev_item;
+
+	char label[BTRFS_LABEL_SIZE];
+
+	__le64 cache_generation;
+	__le64 uuid_tree_generation;
+
+	/* future expansion */
+	__le64 reserved[30];
+	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+} __attribute__ ((__packed__));
+
+/*
+ * Compat flags that we support.  If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
+#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
+/*
+ * some patches floated around with a second compression method
+ * lets save that incompat here for when they do get in
+ * Note we don't actually support it, we're just reserving the
+ * number
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2	(1ULL << 4)
+
+/*
+ * older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy.  Lets not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
+
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
+#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA	(1ULL << 8)
+#define BTRFS_FEATURE_INCOMPAT_NO_HOLES		(1ULL << 9)
+
+#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_SET		0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR		0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
+
+#define BTRFS_FEATURE_INCOMPAT_SUPP			\
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
+	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
+	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
+	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
+	 BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+
+#define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\
+	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR		0ULL
+
+/*
+ * A leaf is full of items. offset and size tell us where to find
+ * the item in the leaf (relative to the start of the data area)
+ */
+struct btrfs_item {
+	struct btrfs_disk_key key;
+	__le32 offset;
+	__le32 size;
+} __attribute__ ((__packed__));
+
+/*
+ * leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together
+ * during searches.
+ */
+struct btrfs_leaf {
+	struct btrfs_header header;
+	struct btrfs_item items[];
+} __attribute__ ((__packed__));
+
+/*
+ * all non-leaf blocks are nodes, they hold only keys and pointers to
+ * other blocks
+ */
+struct btrfs_key_ptr {
+	struct btrfs_disk_key key;
+	__le64 blockptr;
+	__le64 generation;
+} __attribute__ ((__packed__));
+
+struct btrfs_node {
+	struct btrfs_header header;
+	struct btrfs_key_ptr ptrs[];
+} __attribute__ ((__packed__));
+
+/*
+ * btrfs_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
+ * to any other levels that are present.
+ *
+ * The slots array records the index of the item or block pointer
+ * used while walking the tree.
+ */
+struct btrfs_path {
+	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
+	int slots[BTRFS_MAX_LEVEL];
+	/* if there is real range locking, this locks field will change */
+	int locks[BTRFS_MAX_LEVEL];
+	int reada;
+	/* keep some upper locks as we walk down */
+	int lowest_level;
+
+	/*
+	 * set by btrfs_split_item, tells search_slot to keep all locks
+	 * and to force calls to keep space in the nodes
+	 */
+	unsigned int search_for_split:1;
+	unsigned int keep_locks:1;
+	unsigned int skip_locking:1;
+	unsigned int leave_spinning:1;
+	unsigned int search_commit_root:1;
+	unsigned int need_commit_sem:1;
+	unsigned int skip_release_on_error:1;
+};
+
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
+
+struct btrfs_extent_item {
+	__le64 refs;
+	__le64 generation;
+	__le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_item_v0 {
+	__le32 refs;
+} __attribute__ ((__packed__));
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
+					sizeof(struct btrfs_item))
+
+#define BTRFS_EXTENT_FLAG_DATA		(1ULL << 0)
+#define BTRFS_EXTENT_FLAG_TREE_BLOCK	(1ULL << 1)
+
+/* following flags only apply to tree blocks */
+
+/* use full backrefs for extent pointers in the block */
+#define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)
+
+/*
+ * this flag is only used internally by scrub and may be changed at any time
+ * it is only declared here to avoid collisions
+ */
+#define BTRFS_EXTENT_FLAG_SUPER		(1ULL << 48)
+
+struct btrfs_tree_block_info {
+	struct btrfs_disk_key key;
+	u8 level;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_data_ref {
+	__le64 root;
+	__le64 objectid;
+	__le64 offset;
+	__le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_shared_data_ref {
+	__le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_inline_ref {
+	u8 type;
+	__le64 offset;
+} __attribute__ ((__packed__));
+
+/* old style backrefs item */
+struct btrfs_extent_ref_v0 {
+	__le64 root;
+	__le64 generation;
+	__le64 objectid;
+	__le32 count;
+} __attribute__ ((__packed__));
+
+
+/* dev extents record free space on individual devices.  The owner
+ * field points back to the chunk allocation mapping tree that allocated
+ * the extent.  The chunk tree uuid field is a way to double check the owner
+ */
+struct btrfs_dev_extent {
+	__le64 chunk_tree;
+	__le64 chunk_objectid;
+	__le64 chunk_offset;
+	__le64 length;
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_ref {
+	__le64 index;
+	__le16 name_len;
+	/* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_extref {
+	__le64 parent_objectid;
+	__le64 index;
+	__le16 name_len;
+	__u8   name[0];
+	/* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_timespec {
+	__le64 sec;
+	__le32 nsec;
+} __attribute__ ((__packed__));
+
+enum btrfs_compression_type {
+	BTRFS_COMPRESS_NONE  = 0,
+	BTRFS_COMPRESS_ZLIB  = 1,
+	BTRFS_COMPRESS_LZO   = 2,
+	BTRFS_COMPRESS_TYPES = 2,
+	BTRFS_COMPRESS_LAST  = 3,
+};
+
+struct btrfs_inode_item {
+	/* nfs style generation number */
+	__le64 generation;
+	/* transid that last touched this inode */
+	__le64 transid;
+	__le64 size;
+	__le64 nbytes;
+	__le64 block_group;
+	__le32 nlink;
+	__le32 uid;
+	__le32 gid;
+	__le32 mode;
+	__le64 rdev;
+	__le64 flags;
+
+	/* modification sequence number for NFS */
+	__le64 sequence;
+
+	/*
+	 * a little future expansion, for more than this we can
+	 * just grow the inode item and version it
+	 */
+	__le64 reserved[4];
+	struct btrfs_timespec atime;
+	struct btrfs_timespec ctime;
+	struct btrfs_timespec mtime;
+	struct btrfs_timespec otime;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_log_item {
+	__le64 end;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_item {
+	struct btrfs_disk_key location;
+	__le64 transid;
+	__le16 data_len;
+	__le16 name_len;
+	u8 type;
+} __attribute__ ((__packed__));
+
+#define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
+
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD		(1ULL << 48)
+
+struct btrfs_root_item {
+	struct btrfs_inode_item inode;
+	__le64 generation;
+	__le64 root_dirid;
+	__le64 bytenr;
+	__le64 byte_limit;
+	__le64 bytes_used;
+	__le64 last_snapshot;
+	__le64 flags;
+	__le32 refs;
+	struct btrfs_disk_key drop_progress;
+	u8 drop_level;
+	u8 level;
+
+	/*
+	 * The following fields appear after subvol_uuids+subvol_times
+	 * were introduced.
+	 */
+
+	/*
+	 * This generation number is used to test if the new fields are valid
+	 * and up to date while reading the root item. Everytime the root item
+	 * is written out, the "generation" field is copied into this field. If
+	 * anyone ever mounted the fs with an older kernel, we will have
+	 * mismatching generation values here and thus must invalidate the
+	 * new fields. See btrfs_update_root and btrfs_find_last_root for
+	 * details.
+	 * the offset of generation_v2 is also used as the start for the memset
+	 * when invalidating the fields.
+	 */
+	__le64 generation_v2;
+	u8 uuid[BTRFS_UUID_SIZE];
+	u8 parent_uuid[BTRFS_UUID_SIZE];
+	u8 received_uuid[BTRFS_UUID_SIZE];
+	__le64 ctransid; /* updated when an inode changes */
+	__le64 otransid; /* trans when created */
+	__le64 stransid; /* trans when sent. non-zero for received subvol */
+	__le64 rtransid; /* trans when received. non-zero for received subvol */
+	struct btrfs_timespec ctime;
+	struct btrfs_timespec otime;
+	struct btrfs_timespec stime;
+	struct btrfs_timespec rtime;
+	__le64 reserved[8]; /* for future */
+} __attribute__ ((__packed__));
+
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+	__le64 dirid;
+	__le64 sequence;
+	__le16 name_len;
+} __attribute__ ((__packed__));
+
+struct btrfs_disk_balance_args {
+	/*
+	 * profiles to operate on, single is denoted by
+	 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+	 */
+	__le64 profiles;
+
+	/* usage filter */
+	__le64 usage;
+
+	/* devid filter */
+	__le64 devid;
+
+	/* devid subset filter [pstart..pend) */
+	__le64 pstart;
+	__le64 pend;
+
+	/* btrfs virtual address space subset filter [vstart..vend) */
+	__le64 vstart;
+	__le64 vend;
+
+	/*
+	 * profile to convert to, single is denoted by
+	 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+	 */
+	__le64 target;
+
+	/* BTRFS_BALANCE_ARGS_* */
+	__le64 flags;
+
+	/* BTRFS_BALANCE_ARGS_LIMIT value */
+	__le64 limit;
+
+	__le64 unused[7];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+	/* BTRFS_BALANCE_* */
+	__le64 flags;
+
+	struct btrfs_disk_balance_args data;
+	struct btrfs_disk_balance_args meta;
+	struct btrfs_disk_balance_args sys;
+
+	__le64 unused[4];
+} __attribute__ ((__packed__));
+
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
+
+struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
+	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be.  So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption.  If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
+	u8 type;
+
+	/*
+	 * disk space consumed by the extent, checksum blocks are included
+	 * in these numbers
+	 *
+	 * At this offset in the structure, the inline extent data start.
+	 */
+	__le64 disk_bytenr;
+	__le64 disk_num_bytes;
+	/*
+	 * the logical offset in file blocks (no csums)
+	 * this extent record is for.  This allows a file extent to point
+	 * into the middle of an existing extent on disk, sharing it
+	 * between two snapshots (useful if some bytes in the middle of the
+	 * extent have changed
+	 */
+	__le64 offset;
+	/*
+	 * the logical number of file blocks (no csums included).  This
+	 * always reflects the size uncompressed and without encoding.
+	 */
+	__le64 num_bytes;
+
+} __attribute__ ((__packed__));
+
+struct btrfs_csum_item {
+	u8 csum;
+} __attribute__ ((__packed__));
+
+struct btrfs_dev_stats_item {
+	/*
+	 * grow this item struct at the end for future enhancements and keep
+	 * the existing values unchanged
+	 */
+	__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED	0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED		1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED		2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED		3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED		4
+
+struct btrfs_dev_replace {
+	u64 replace_state;	/* see #define above */
+	u64 time_started;	/* seconds since 1-Jan-1970 */
+	u64 time_stopped;	/* seconds since 1-Jan-1970 */
+	atomic64_t num_write_errors;
+	atomic64_t num_uncorrectable_read_errors;
+
+	u64 cursor_left;
+	u64 committed_cursor_left;
+	u64 cursor_left_last_write_of_item;
+	u64 cursor_right;
+
+	u64 cont_reading_from_srcdev_mode;	/* see #define above */
+
+	int is_valid;
+	int item_needs_writeback;
+	struct btrfs_device *srcdev;
+	struct btrfs_device *tgtdev;
+
+	pid_t lock_owner;
+	atomic_t nesting_level;
+	struct mutex lock_finishing_cancel_unmount;
+	struct mutex lock_management_lock;
+	struct mutex lock;
+
+	struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+	/*
+	 * grow this item struct at the end for future enhancements and keep
+	 * the existing values unchanged
+	 */
+	__le64 src_devid;
+	__le64 cursor_left;
+	__le64 cursor_right;
+	__le64 cont_reading_from_srcdev_mode;
+
+	__le64 replace_state;
+	__le64 time_started;
+	__le64 time_stopped;
+	__le64 num_write_errors;
+	__le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA		(1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM	(1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA	(1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0		(1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5         (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6         (1ULL << 8)
+#define BTRFS_BLOCK_GROUP_RESERVED	(BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
+					 BTRFS_SPACE_INFO_GLOBAL_RSV)
+
+enum btrfs_raid_types {
+	BTRFS_RAID_RAID10,
+	BTRFS_RAID_RAID1,
+	BTRFS_RAID_DUP,
+	BTRFS_RAID_RAID0,
+	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
+	BTRFS_NR_RAID_TYPES
+};
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
+					 BTRFS_BLOCK_GROUP_SYSTEM |  \
+					 BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
+					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
+					 BTRFS_BLOCK_GROUP_DUP |     \
+					 BTRFS_BLOCK_GROUP_RAID10)
+#define BTRFS_BLOCK_GROUP_RAID56_MASK	(BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6)
+
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available.  This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE	(1ULL << 48)
+
+/*
+ * A fake block group type that is used to communicate global block reserve
+ * size to userspace via the SPACE_INFO ioctl.
+ */
+#define BTRFS_SPACE_INFO_GLOBAL_RSV	(1ULL << 49)
+
+#define BTRFS_EXTENDED_PROFILE_MASK	(BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+					 BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+
+static inline u64 chunk_to_extended(u64 flags)
+{
+	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
+		flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+	return flags;
+}
+static inline u64 extended_to_chunk(u64 flags)
+{
+	return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+}
+
+struct btrfs_block_group_item {
+	__le64 used;
+	__le64 chunk_objectid;
+	__le64 flags;
+} __attribute__ ((__packed__));
+
+#define BTRFS_QGROUP_LEVEL_SHIFT		48
+static inline u64 btrfs_qgroup_level(u64 qgroupid)
+{
+	return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+}
+
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON		(1ULL << 0)
+/*
+ * RESCAN is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_RESCAN		(1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning qouta off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT	(1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION        1
+
+struct btrfs_qgroup_status_item {
+	__le64 version;
+	/*
+	 * the generation is updated during every commit. As older
+	 * versions of btrfs are not aware of qgroups, it will be
+	 * possible to detect inconsistencies by checking the
+	 * generation on mount time
+	 */
+	__le64 generation;
+
+	/* flag definitions see above */
+	__le64 flags;
+
+	/*
+	 * only used during scanning to record the progress
+	 * of the scan. It contains a logical address
+	 */
+	__le64 rescan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+	__le64 generation;
+	__le64 rfer;
+	__le64 rfer_cmpr;
+	__le64 excl;
+	__le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+/* flags definition for qgroup limits */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER	(1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL	(1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER	(1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL	(1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR	(1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR	(1ULL << 5)
+
+struct btrfs_qgroup_limit_item {
+	/*
+	 * only updated when any of the other values change
+	 */
+	__le64 flags;
+	__le64 max_rfer;
+	__le64 max_excl;
+	__le64 rsv_rfer;
+	__le64 rsv_excl;
+} __attribute__ ((__packed__));
+
+/* For raid type sysfs entries */
+struct raid_kobject {
+	int raid_type;
+	struct kobject kobj;
+};
+
+struct btrfs_space_info {
+	spinlock_t lock;
+
+	u64 total_bytes;	/* total bytes in the space,
+				   this doesn't take mirrors into account */
+	u64 bytes_used;		/* total bytes used,
+				   this doesn't take mirrors into account */
+	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
+				   transaction finishes */
+	u64 bytes_reserved;	/* total bytes the allocator has reserved for
+				   current allocations */
+	u64 bytes_may_use;	/* number of bytes that may be used for
+				   delalloc/allocations */
+	u64 bytes_readonly;	/* total bytes that are read only */
+
+	unsigned int full:1;	/* indicates that we cannot allocate any more
+				   chunks for this space */
+	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
+
+	unsigned int flush:1;		/* set if we are trying to make space */
+
+	unsigned int force_alloc;	/* set if we need to force a chunk
+					   alloc for this space */
+
+	u64 disk_used;		/* total bytes used on disk */
+	u64 disk_total;		/* total bytes on disk, takes mirrors into
+				   account */
+
+	u64 flags;
+
+	/*
+	 * bytes_pinned is kept in line with what is actually pinned, as in
+	 * we've called update_block_group and dropped the bytes_used counter
+	 * and increased the bytes_pinned counter.  However this means that
+	 * bytes_pinned does not reflect the bytes that will be pinned once the
+	 * delayed refs are flushed, so this counter is inc'ed everytime we call
+	 * btrfs_free_extent so it is a realtime count of what will be freed
+	 * once the transaction is committed.  It will be zero'ed everytime the
+	 * transaction commits.
+	 */
+	struct percpu_counter total_bytes_pinned;
+
+	struct list_head list;
+	/* Protected by the spinlock 'lock'. */
+	struct list_head ro_bgs;
+
+	struct rw_semaphore groups_sem;
+	/* for block groups in our same type */
+	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
+	wait_queue_head_t wait;
+
+	struct kobject kobj;
+	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
+};
+
+#define	BTRFS_BLOCK_RSV_GLOBAL		1
+#define	BTRFS_BLOCK_RSV_DELALLOC	2
+#define	BTRFS_BLOCK_RSV_TRANS		3
+#define	BTRFS_BLOCK_RSV_CHUNK		4
+#define	BTRFS_BLOCK_RSV_DELOPS		5
+#define	BTRFS_BLOCK_RSV_EMPTY		6
+#define	BTRFS_BLOCK_RSV_TEMP		7
+
+struct btrfs_block_rsv {
+	u64 size;
+	u64 reserved;
+	struct btrfs_space_info *space_info;
+	spinlock_t lock;
+	unsigned short full;
+	unsigned short type;
+	unsigned short failfast;
+};
+
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes.  They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+	spinlock_t lock;
+	spinlock_t refill_lock;
+	struct rb_root root;
+
+	/* largest extent in this cluster */
+	u64 max_size;
+
+	/* first extent starting offset */
+	u64 window_start;
+
+	struct btrfs_block_group_cache *block_group;
+	/*
+	 * when a cluster is allocated from a block group, we put the
+	 * cluster onto a list in the block group so that it can
+	 * be freed before the block group is freed.
+	 */
+	struct list_head block_group_list;
+};
+
+enum btrfs_caching_type {
+	BTRFS_CACHE_NO		= 0,
+	BTRFS_CACHE_STARTED	= 1,
+	BTRFS_CACHE_FAST	= 2,
+	BTRFS_CACHE_FINISHED	= 3,
+	BTRFS_CACHE_ERROR	= 4,
+};
+
+enum btrfs_disk_cache_state {
+	BTRFS_DC_WRITTEN	= 0,
+	BTRFS_DC_ERROR		= 1,
+	BTRFS_DC_CLEAR		= 2,
+	BTRFS_DC_SETUP		= 3,
+};
+
+struct btrfs_caching_control {
+	struct list_head list;
+	struct mutex mutex;
+	wait_queue_head_t wait;
+	struct btrfs_work work;
+	struct btrfs_block_group_cache *block_group;
+	u64 progress;
+	atomic_t count;
+};
+
+struct btrfs_io_ctl {
+	void *cur, *orig;
+	struct page *page;
+	struct page **pages;
+	struct btrfs_root *root;
+	struct inode *inode;
+	unsigned long size;
+	int index;
+	int num_pages;
+	int entries;
+	int bitmaps;
+	unsigned check_crcs:1;
+};
+
+struct btrfs_block_group_cache {
+	struct btrfs_key key;
+	struct btrfs_block_group_item item;
+	struct btrfs_fs_info *fs_info;
+	struct inode *inode;
+	spinlock_t lock;
+	u64 pinned;
+	u64 reserved;
+	u64 delalloc_bytes;
+	u64 bytes_super;
+	u64 flags;
+	u64 sectorsize;
+	u64 cache_generation;
+
+	/*
+	 * It is just used for the delayed data space allocation because
+	 * only the data space allocation and the relative metadata update
+	 * can be done cross the transaction.
+	 */
+	struct rw_semaphore data_rwsem;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
+	unsigned int ro:1;
+	unsigned int iref:1;
+	unsigned int has_caching_ctl:1;
+	unsigned int removed:1;
+
+	int disk_cache_state;
+
+	/* cache tracking stuff */
+	int cached;
+	struct btrfs_caching_control *caching_ctl;
+	u64 last_byte_to_unpin;
+
+	struct btrfs_space_info *space_info;
+
+	/* free space cache stuff */
+	struct btrfs_free_space_ctl *free_space_ctl;
+
+	/* block group cache stuff */
+	struct rb_node cache_node;
+
+	/* for block groups in the same raid type */
+	struct list_head list;
+
+	/* usage count */
+	atomic_t count;
+
+	/* List of struct btrfs_free_clusters for this block group.
+	 * Today it will only have one thing on it, but that may change
+	 */
+	struct list_head cluster_list;
+
+	/* For delayed block group creation or deletion of empty block groups */
+	struct list_head bg_list;
+
+	/* For read-only block groups */
+	struct list_head ro_list;
+
+	atomic_t trimming;
+
+	/* For dirty block groups */
+	struct list_head dirty_list;
+	struct list_head io_list;
+
+	struct btrfs_io_ctl io_ctl;
+};
+
+/* delayed seq elem */
+struct seq_list {
+	struct list_head list;
+	u64 seq;
+};
+
+#define SEQ_LIST_INIT(name)	{ .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
+void btrfs_init_async_reclaim_work(struct work_struct *work);
+
+/* fs_info */
+struct reloc_control;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct btrfs_fs_info {
+	u8 fsid[BTRFS_FSID_SIZE];
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+	struct btrfs_root *extent_root;
+	struct btrfs_root *tree_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
+	struct btrfs_root *fs_root;
+	struct btrfs_root *csum_root;
+	struct btrfs_root *quota_root;
+	struct btrfs_root *uuid_root;
+
+	/* the log root tree is a directory of all the other log roots */
+	struct btrfs_root *log_root_tree;
+
+	spinlock_t fs_roots_radix_lock;
+	struct radix_tree_root fs_roots_radix;
+
+	/* block group cache stuff */
+	spinlock_t block_group_cache_lock;
+	u64 first_logical_byte;
+	struct rb_root block_group_cache_tree;
+
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
+	struct extent_io_tree freed_extents[2];
+	struct extent_io_tree *pinned_extents;
+
+	/* logical->physical extent mapping */
+	struct btrfs_mapping_tree mapping_tree;
+
+	/*
+	 * block reservation for extent, checksum, root tree and
+	 * delayed dir index item
+	 */
+	struct btrfs_block_rsv global_block_rsv;
+	/* block reservation for delay allocation */
+	struct btrfs_block_rsv delalloc_block_rsv;
+	/* block reservation for metadata operations */
+	struct btrfs_block_rsv trans_block_rsv;
+	/* block reservation for chunk tree */
+	struct btrfs_block_rsv chunk_block_rsv;
+	/* block reservation for delayed operations */
+	struct btrfs_block_rsv delayed_block_rsv;
+
+	struct btrfs_block_rsv empty_block_rsv;
+
+	u64 generation;
+	u64 last_trans_committed;
+	u64 avg_delayed_ref_runtime;
+
+	/*
+	 * this is updated to the current trans every time a full commit
+	 * is required instead of the faster short fsync log commits
+	 */
+	u64 last_trans_log_full_commit;
+	unsigned long mount_opt;
+	/*
+	 * Track requests for actions that need to be done during transaction
+	 * commit (like for some mount options).
+	 */
+	unsigned long pending_changes;
+	unsigned long compress_type:4;
+	int commit_interval;
+	/*
+	 * It is a suggestive number, the read side is safe even it gets a
+	 * wrong number because we will write out the data into a regular
+	 * extent. The write side(mount/remount) is under ->s_umount lock,
+	 * so it is also safe.
+	 */
+	u64 max_inline;
+	/*
+	 * Protected by ->chunk_mutex and sb->s_umount.
+	 *
+	 * The reason that we use two lock to protect it is because only
+	 * remount and mount operations can change it and these two operations
+	 * are under sb->s_umount, but the read side (chunk allocation) can not
+	 * acquire sb->s_umount or the deadlock would happen. So we use two
+	 * locks to protect it. On the write side, we must acquire two locks,
+	 * and on the read side, we just need acquire one of them.
+	 */
+	u64 alloc_start;
+	struct btrfs_transaction *running_transaction;
+	wait_queue_head_t transaction_throttle;
+	wait_queue_head_t transaction_wait;
+	wait_queue_head_t transaction_blocked_wait;
+	wait_queue_head_t async_submit_wait;
+
+	/*
+	 * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+	 * when they are updated.
+	 *
+	 * Because we do not clear the flags for ever, so we needn't use
+	 * the lock on the read side.
+	 *
+	 * We also needn't use the lock when we mount the fs, because
+	 * there is no other task which will update the flag.
+	 */
+	spinlock_t super_lock;
+	struct btrfs_super_block *super_copy;
+	struct btrfs_super_block *super_for_commit;
+	struct block_device *__bdev;
+	struct super_block *sb;
+	struct inode *btree_inode;
+	struct backing_dev_info bdi;
+	struct mutex tree_log_mutex;
+	struct mutex transaction_kthread_mutex;
+	struct mutex cleaner_mutex;
+	struct mutex chunk_mutex;
+	struct mutex volume_mutex;
+
+	/*
+	 * this is taken to make sure we don't set block groups ro after
+	 * the free space cache has been allocated on them
+	 */
+	struct mutex ro_block_group_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
+	/*
+	 * this protects the ordered operations list only while we are
+	 * processing all of the entries on it.  This way we make
+	 * sure the commit code doesn't find the list temporarily empty
+	 * because another function happens to be doing non-waiting preflush
+	 * before jumping into the main commit.
+	 */
+	struct mutex ordered_operations_mutex;
+
+	/*
+	 * Same as ordered_operations_mutex except this is for ordered extents
+	 * and not the operations.
+	 */
+	struct mutex ordered_extent_flush_mutex;
+
+	struct rw_semaphore commit_root_sem;
+
+	struct rw_semaphore cleanup_work_sem;
+
+	struct rw_semaphore subvol_sem;
+	struct srcu_struct subvol_srcu;
+
+	spinlock_t trans_lock;
+	/*
+	 * the reloc mutex goes with the trans lock, it is taken
+	 * during commit to protect us from the relocation code
+	 */
+	struct mutex reloc_mutex;
+
+	struct list_head trans_list;
+	struct list_head dead_roots;
+	struct list_head caching_block_groups;
+
+	spinlock_t delayed_iput_lock;
+	struct list_head delayed_iputs;
+	struct rw_semaphore delayed_iput_sem;
+
+	/* this protects tree_mod_seq_list */
+	spinlock_t tree_mod_seq_lock;
+	atomic64_t tree_mod_seq;
+	struct list_head tree_mod_seq_list;
+
+	/* this protects tree_mod_log */
+	rwlock_t tree_mod_log_lock;
+	struct rb_root tree_mod_log;
+
+	atomic_t nr_async_submits;
+	atomic_t async_submit_draining;
+	atomic_t nr_async_bios;
+	atomic_t async_delalloc_pages;
+	atomic_t open_ioctl_trans;
+
+	/*
+	 * this is used to protect the following list -- ordered_roots.
+	 */
+	spinlock_t ordered_root_lock;
+
+	/*
+	 * all fs/file tree roots in which there are data=ordered extents
+	 * pending writeback are added into this list.
+	 *
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
+	struct list_head ordered_roots;
+
+	struct mutex delalloc_root_mutex;
+	spinlock_t delalloc_root_lock;
+	/* all fs/file tree roots that have delalloc inodes. */
+	struct list_head delalloc_roots;
+
+	/*
+	 * there is a pool of worker threads for checksumming during writes
+	 * and a pool for checksumming after reads.  This is because readers
+	 * can run with FS locks held, and the writers may be waiting for
+	 * those locks.  We don't want ordering in the pending list to cause
+	 * deadlocks, and so the two are serviced separately.
+	 *
+	 * A third pool does submit_bio to avoid deadlocking with the other
+	 * two
+	 */
+	struct btrfs_workqueue *workers;
+	struct btrfs_workqueue *delalloc_workers;
+	struct btrfs_workqueue *flush_workers;
+	struct btrfs_workqueue *endio_workers;
+	struct btrfs_workqueue *endio_meta_workers;
+	struct btrfs_workqueue *endio_raid56_workers;
+	struct btrfs_workqueue *endio_repair_workers;
+	struct btrfs_workqueue *rmw_workers;
+	struct btrfs_workqueue *endio_meta_write_workers;
+	struct btrfs_workqueue *endio_write_workers;
+	struct btrfs_workqueue *endio_freespace_worker;
+	struct btrfs_workqueue *submit_workers;
+	struct btrfs_workqueue *caching_workers;
+	struct btrfs_workqueue *readahead_workers;
+
+	/*
+	 * fixup workers take dirty pages that didn't properly go through
+	 * the cow mechanism and make them safe to write.  It happens
+	 * for the sys_munmap function call path
+	 */
+	struct btrfs_workqueue *fixup_workers;
+	struct btrfs_workqueue *delayed_workers;
+
+	/* the extent workers do delayed refs on the extent allocation tree */
+	struct btrfs_workqueue *extent_workers;
+	struct task_struct *transaction_kthread;
+	struct task_struct *cleaner_kthread;
+	int thread_pool_size;
+
+	struct kobject super_kobj;
+	struct kobject *space_info_kobj;
+	struct kobject *device_dir_kobj;
+	struct completion kobj_unregister;
+	int do_barriers;
+	int closing;
+	int log_root_recovering;
+	int open;
+
+	u64 total_pinned;
+
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
+	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
+	struct list_head dirty_cowonly_roots;
+
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * the space_info list is almost entirely read only.  It only changes
+	 * when we add a new raid type to the FS, and that happens
+	 * very rarely.  RCU is used to protect it.
+	 */
+	struct list_head space_info;
+
+	struct btrfs_space_info *data_sinfo;
+
+	struct reloc_control *reloc_ctl;
+
+	/* data_alloc_cluster is only used in ssd mode */
+	struct btrfs_free_cluster data_alloc_cluster;
+
+	/* all metadata allocations go through this cluster */
+	struct btrfs_free_cluster meta_alloc_cluster;
+
+	/* auto defrag inodes go here */
+	spinlock_t defrag_inodes_lock;
+	struct rb_root defrag_inodes;
+	atomic_t defrag_running;
+
+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
+	/*
+	 * these three are in extended format (availability of single
+	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+	 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+	 */
+	u64 avail_data_alloc_bits;
+	u64 avail_metadata_alloc_bits;
+	u64 avail_system_alloc_bits;
+
+	/* restriper state */
+	spinlock_t balance_lock;
+	struct mutex balance_mutex;
+	atomic_t balance_running;
+	atomic_t balance_pause_req;
+	atomic_t balance_cancel_req;
+	struct btrfs_balance_control *balance_ctl;
+	wait_queue_head_t balance_wait_q;
+
+	unsigned data_chunk_allocations;
+	unsigned metadata_ratio;
+
+	void *bdev_holder;
+
+	/* private scrub information */
+	struct mutex scrub_lock;
+	atomic_t scrubs_running;
+	atomic_t scrub_pause_req;
+	atomic_t scrubs_paused;
+	atomic_t scrub_cancel_req;
+	wait_queue_head_t scrub_pause_wait;
+	int scrub_workers_refcnt;
+	struct btrfs_workqueue *scrub_workers;
+	struct btrfs_workqueue *scrub_wr_completion_workers;
+	struct btrfs_workqueue *scrub_nocow_workers;
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	u32 check_integrity_print_mask;
+#endif
+	/*
+	 * quota information
+	 */
+	unsigned int quota_enabled:1;
+
+	/*
+	 * quota_enabled only changes state after a commit. This holds the
+	 * next state.
+	 */
+	unsigned int pending_quota_state:1;
+
+	/* is qgroup tracking in a consistent state? */
+	u64 qgroup_flags;
+
+	/* holds configuration and tracking. Protected by qgroup_lock */
+	struct rb_root qgroup_tree;
+	struct rb_root qgroup_op_tree;
+	spinlock_t qgroup_lock;
+	spinlock_t qgroup_op_lock;
+	atomic_t qgroup_op_seq;
+
+	/*
+	 * used to avoid frequently calling ulist_alloc()/ulist_free()
+	 * when doing qgroup accounting, it must be protected by qgroup_lock.
+	 */
+	struct ulist *qgroup_ulist;
+
+	/* protect user change for quota operations */
+	struct mutex qgroup_ioctl_lock;
+
+	/* list of dirty qgroups to be written at next commit */
+	struct list_head dirty_qgroups;
+
+	/* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+	u64 qgroup_seq;
+
+	/* qgroup rescan items */
+	struct mutex qgroup_rescan_lock; /* protects the progress item */
+	struct btrfs_key qgroup_rescan_progress;
+	struct btrfs_workqueue *qgroup_rescan_workers;
+	struct completion qgroup_rescan_completion;
+	struct btrfs_work qgroup_rescan_work;
+
+	/* filesystem state */
+	unsigned long fs_state;
+
+	struct btrfs_delayed_root *delayed_root;
+
+	/* readahead tree */
+	spinlock_t reada_lock;
+	struct radix_tree_root reada_tree;
+
+	/* Extent buffer radix tree */
+	spinlock_t buffer_lock;
+	struct radix_tree_root buffer_radix;
+
+	/* next backup root to be overwritten */
+	int backup_root_index;
+
+	int num_tolerated_disk_barrier_failures;
+
+	/* device replace state */
+	struct btrfs_dev_replace dev_replace;
+
+	atomic_t mutually_exclusive_operation_running;
+
+	struct percpu_counter bio_counter;
+	wait_queue_head_t replace_wait;
+
+	struct semaphore uuid_tree_rescan_sem;
+	unsigned int update_uuid_tree_gen:1;
+
+	/* Used to reclaim the metadata space in the background. */
+	struct work_struct async_reclaim_work;
+
+	spinlock_t unused_bgs_lock;
+	struct list_head unused_bgs;
+	struct mutex unused_bg_unpin_mutex;
+
+	/* For btrfs to record security options */
+	struct security_mnt_opts security_opts;
+
+	/*
+	 * Chunks that can't be freed yet (under a trim/discard operation)
+	 * and will be latter freed. Protected by fs_info->chunk_mutex.
+	 */
+	struct list_head pinned_chunks;
+};
+
+struct btrfs_subvolume_writers {
+	struct percpu_counter	counter;
+	wait_queue_head_t	wait;
+};
+
+/*
+ * The state of btrfs root
+ */
+/*
+ * btrfs_record_root_in_trans is a multi-step process,
+ * and it can race with the balancing code.   But the
+ * race is very small, and only the first time the root
+ * is added to each transaction.  So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+#define BTRFS_ROOT_IN_TRANS_SETUP	0
+#define BTRFS_ROOT_REF_COWS		1
+#define BTRFS_ROOT_TRACK_DIRTY		2
+#define BTRFS_ROOT_IN_RADIX		3
+#define BTRFS_ROOT_DUMMY_ROOT		4
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED	5
+#define BTRFS_ROOT_DEFRAG_RUNNING	6
+#define BTRFS_ROOT_FORCE_COW		7
+#define BTRFS_ROOT_MULTI_LOG_TASKS	8
+#define BTRFS_ROOT_DIRTY		9
+
+/*
+ * in ram representation of the tree.  extent_root is used for all allocations
+ * and for the extent tree extent_root root.
+ */
+struct btrfs_root {
+	struct extent_buffer *node;
+
+	struct extent_buffer *commit_root;
+	struct btrfs_root *log_root;
+	struct btrfs_root *reloc_root;
+
+	unsigned long state;
+	struct btrfs_root_item root_item;
+	struct btrfs_key root_key;
+	struct btrfs_fs_info *fs_info;
+	struct extent_io_tree dirty_log_pages;
+
+	struct mutex objectid_mutex;
+
+	spinlock_t accounting_lock;
+	struct btrfs_block_rsv *block_rsv;
+
+	/* free ino cache stuff */
+	struct btrfs_free_space_ctl *free_ino_ctl;
+	enum btrfs_caching_type ino_cache_state;
+	spinlock_t ino_cache_lock;
+	wait_queue_head_t ino_cache_wait;
+	struct btrfs_free_space_ctl *free_ino_pinned;
+	u64 ino_cache_progress;
+	struct inode *ino_cache_inode;
+
+	struct mutex log_mutex;
+	wait_queue_head_t log_writer_wait;
+	wait_queue_head_t log_commit_wait[2];
+	struct list_head log_ctxs[2];
+	atomic_t log_writers;
+	atomic_t log_commit[2];
+	atomic_t log_batch;
+	int log_transid;
+	/* No matter the commit succeeds or not*/
+	int log_transid_committed;
+	/* Just be updated when the commit succeeds. */
+	int last_log_commit;
+	pid_t log_start_pid;
+
+	u64 objectid;
+	u64 last_trans;
+
+	/* data allocations are done in sectorsize units */
+	u32 sectorsize;
+
+	/* node allocations are done in nodesize units */
+	u32 nodesize;
+
+	u32 stripesize;
+
+	u32 type;
+
+	u64 highest_objectid;
+
+	/* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
+	u64 alloc_bytenr;
+
+	u64 defrag_trans_start;
+	struct btrfs_key defrag_progress;
+	struct btrfs_key defrag_max;
+	char *name;
+
+	/* the dirty list is only used by non-reference counted roots */
+	struct list_head dirty_list;
+
+	struct list_head root_list;
+
+	spinlock_t log_extents_lock[2];
+	struct list_head logged_list[2];
+
+	spinlock_t orphan_lock;
+	atomic_t orphan_inodes;
+	struct btrfs_block_rsv *orphan_block_rsv;
+	int orphan_cleanup_state;
+
+	spinlock_t inode_lock;
+	/* red-black tree that keeps track of in-memory inodes */
+	struct rb_root inode_tree;
+
+	/*
+	 * radix tree that keeps track of delayed nodes of every inode,
+	 * protected by inode_lock
+	 */
+	struct radix_tree_root delayed_nodes_tree;
+	/*
+	 * right now this just gets used so that a root has its own devid
+	 * for stat.  It may be used for more later
+	 */
+	dev_t anon_dev;
+
+	spinlock_t root_item_lock;
+	atomic_t refs;
+
+	struct mutex delalloc_mutex;
+	spinlock_t delalloc_lock;
+	/*
+	 * all of the inodes that have delalloc bytes.  It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
+	struct list_head delalloc_inodes;
+	struct list_head delalloc_root;
+	u64 nr_delalloc_inodes;
+
+	struct mutex ordered_extent_mutex;
+	/*
+	 * this is used by the balancing code to wait for all the pending
+	 * ordered extents
+	 */
+	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
+	struct list_head ordered_extents;
+	struct list_head ordered_root;
+	u64 nr_ordered_extents;
+
+	/*
+	 * Number of currently running SEND ioctls to prevent
+	 * manipulation with the read-only status via SUBVOL_SETFLAGS
+	 */
+	int send_in_progress;
+	struct btrfs_subvolume_writers *subv_writers;
+	atomic_t will_be_snapshoted;
+};
+
+struct btrfs_ioctl_defrag_range_args {
+	/* start of the defrag operation */
+	__u64 start;
+
+	/* number of bytes to defrag, use (u64)-1 to say all */
+	__u64 len;
+
+	/*
+	 * flags for the operation, which can include turning
+	 * on compression for this one defrag
+	 */
+	__u64 flags;
+
+	/*
+	 * any extent bigger than this will be considered
+	 * already defragged.  Use 0 to take the kernel default
+	 * Use 1 to say every single extent must be rewritten
+	 */
+	__u32 extent_thresh;
+
+	/*
+	 * which compression method to use if turning on compression
+	 * for this defrag operation.  If unspecified, zlib will
+	 * be used
+	 */
+	__u32 compress_type;
+
+	/* spare for later */
+	__u32 unused[4];
+};
+
+
+/*
+ * inode items have the data typically returned from stat and store other
+ * info about object characteristics.  There is one for every file and dir in
+ * the FS
+ */
+#define BTRFS_INODE_ITEM_KEY		1
+#define BTRFS_INODE_REF_KEY		12
+#define BTRFS_INODE_EXTREF_KEY		13
+#define BTRFS_XATTR_ITEM_KEY		24
+#define BTRFS_ORPHAN_ITEM_KEY		48
+/* reserve 2-15 close to the inode for later flexibility */
+
+/*
+ * dir items are the name -> inode pointers in a directory.  There is one
+ * for every name in a directory.
+ */
+#define BTRFS_DIR_LOG_ITEM_KEY  60
+#define BTRFS_DIR_LOG_INDEX_KEY 72
+#define BTRFS_DIR_ITEM_KEY	84
+#define BTRFS_DIR_INDEX_KEY	96
+/*
+ * extent data is for file data
+ */
+#define BTRFS_EXTENT_DATA_KEY	108
+
+/*
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
+ */
+#define BTRFS_EXTENT_CSUM_KEY	128
+
+/*
+ * root items point to tree roots.  They are typically in the root
+ * tree used by the super block to find all the other trees
+ */
+#define BTRFS_ROOT_ITEM_KEY	132
+
+/*
+ * root backrefs tie subvols and snapshots to the directory entries that
+ * reference them
+ */
+#define BTRFS_ROOT_BACKREF_KEY	144
+
+/*
+ * root refs make a fast index for listing all of the snapshots and
+ * subvolumes referenced by a given root.  They point directly to the
+ * directory item in the root that references the subvol
+ */
+#define BTRFS_ROOT_REF_KEY	156
+
+/*
+ * extent items are in the extent map tree.  These record which blocks
+ * are used, and how many references there are to each block
+ */
+#define BTRFS_EXTENT_ITEM_KEY	168
+
+/*
+ * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
+ * the length, so we save the level in key->offset instead of the length.
+ */
+#define BTRFS_METADATA_ITEM_KEY	169
+
+#define BTRFS_TREE_BLOCK_REF_KEY	176
+
+#define BTRFS_EXTENT_DATA_REF_KEY	178
+
+#define BTRFS_EXTENT_REF_V0_KEY		180
+
+#define BTRFS_SHARED_BLOCK_REF_KEY	182
+
+#define BTRFS_SHARED_DATA_REF_KEY	184
+
+/*
+ * block groups give us hints into the extent allocation trees.  Which
+ * blocks are free etc etc
+ */
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+
+#define BTRFS_DEV_EXTENT_KEY	204
+#define BTRFS_DEV_ITEM_KEY	216
+#define BTRFS_CHUNK_ITEM_KEY	228
+
+/*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY         240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY           242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY          244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY       246
+
+#define BTRFS_BALANCE_ITEM_KEY	248
+
+/*
+ * Persistantly stores the io stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY	249
+
+/*
+ * Persistantly stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY	250
+
+/*
+ * Stores items that allow to quickly map UUIDs to something else.
+ * These items are part of the filesystem UUID tree.
+ * The key is built like this:
+ * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
+ */
+#if BTRFS_UUID_SIZE != 16
+#error "UUID items require BTRFS_UUID_SIZE == 16!"
+#endif
+#define BTRFS_UUID_KEY_SUBVOL	251	/* for UUIDs assigned to subvols */
+#define BTRFS_UUID_KEY_RECEIVED_SUBVOL	252	/* for UUIDs assigned to
+						 * received subvols */
+
+/*
+ * string items are for debugging.  They just store a short string of
+ * data in the FS
+ */
+#define BTRFS_STRING_ITEM_KEY	253
+
+/*
+ * Flags for mount options.
+ *
+ * Note: don't forget to add new options to btrfs_show_options()
+ */
+#define BTRFS_MOUNT_NODATASUM		(1 << 0)
+#define BTRFS_MOUNT_NODATACOW		(1 << 1)
+#define BTRFS_MOUNT_NOBARRIER		(1 << 2)
+#define BTRFS_MOUNT_SSD			(1 << 3)
+#define BTRFS_MOUNT_DEGRADED		(1 << 4)
+#define BTRFS_MOUNT_COMPRESS		(1 << 5)
+#define BTRFS_MOUNT_NOTREELOG           (1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
+#define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
+#define BTRFS_MOUNT_NOSSD		(1 << 9)
+#define BTRFS_MOUNT_DISCARD		(1 << 10)
+#define BTRFS_MOUNT_FORCE_COMPRESS      (1 << 11)
+#define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
+#define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
+#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
+#define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
+#define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
+#define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
+#define BTRFS_MOUNT_RECOVERY		(1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE	(1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY	(1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
+#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
+#define BTRFS_MOUNT_RESCAN_UUID_TREE	(1 << 23)
+
+#define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
+#define BTRFS_DEFAULT_MAX_INLINE	(8192)
+
+#define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
+					 BTRFS_MOUNT_##opt)
+
+#define btrfs_set_and_info(root, opt, fmt, args...)			\
+{									\
+	if (!btrfs_test_opt(root, opt))					\
+		btrfs_info(root->fs_info, fmt, ##args);			\
+	btrfs_set_opt(root->fs_info->mount_opt, opt);			\
+}
+
+#define btrfs_clear_and_info(root, opt, fmt, args...)			\
+{									\
+	if (btrfs_test_opt(root, opt))					\
+		btrfs_info(root->fs_info, fmt, ##args);			\
+	btrfs_clear_opt(root->fs_info->mount_opt, opt);			\
+}
+
+/*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_SET_INODE_MAP_CACHE	(0)
+#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE	(1)
+#define BTRFS_PENDING_COMMIT			(2)
+
+#define btrfs_test_pending(info, opt)	\
+	test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt)	\
+	set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt)	\
+	clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...)            \
+do {                                                                   \
+       if (!btrfs_raw_test_opt((info)->mount_opt, opt)) {              \
+               btrfs_info((info), fmt, ##args);                        \
+               btrfs_set_pending((info), SET_##opt);                   \
+               btrfs_clear_pending((info), CLEAR_##opt);               \
+       }                                                               \
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...)          \
+do {                                                                   \
+       if (btrfs_raw_test_opt((info)->mount_opt, opt)) {               \
+               btrfs_info((info), fmt, ##args);                        \
+               btrfs_set_pending((info), CLEAR_##opt);                 \
+               btrfs_clear_pending((info), SET_##opt);                 \
+       }                                                               \
+} while(0)
+
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM		(1 << 0)
+#define BTRFS_INODE_NODATACOW		(1 << 1)
+#define BTRFS_INODE_READONLY		(1 << 2)
+#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
+#define BTRFS_INODE_PREALLOC		(1 << 4)
+#define BTRFS_INODE_SYNC		(1 << 5)
+#define BTRFS_INODE_IMMUTABLE		(1 << 6)
+#define BTRFS_INODE_APPEND		(1 << 7)
+#define BTRFS_INODE_NODUMP		(1 << 8)
+#define BTRFS_INODE_NOATIME		(1 << 9)
+#define BTRFS_INODE_DIRSYNC		(1 << 10)
+#define BTRFS_INODE_COMPRESS		(1 << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
+
+struct btrfs_map_token {
+	struct extent_buffer *eb;
+	char *kaddr;
+	unsigned long offset;
+};
+
+static inline void btrfs_init_map_token (struct btrfs_map_token *token)
+{
+	token->kaddr = NULL;
+}
+
+/* some macros to generate set/get funcs for the struct fields.  This
+ * assumes there is a lefoo_to_cpu for every type, so lets make a simple
+ * one for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+
+#define read_eb_member(eb, ptr, type, member, result) (			\
+	read_extent_buffer(eb, (char *)(result),			\
+			   ((unsigned long)(ptr)) +			\
+			    offsetof(type, member),			\
+			   sizeof(((type *)0)->member)))
+
+#define write_eb_member(eb, ptr, type, member, result) (		\
+	write_extent_buffer(eb, (char *)(result),			\
+			   ((unsigned long)(ptr)) +			\
+			    offsetof(type, member),			\
+			   sizeof(((type *)0)->member)))
+
+#define DECLARE_BTRFS_SETGET_BITS(bits)					\
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr,	\
+			       unsigned long off,			\
+                              struct btrfs_map_token *token);		\
+void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr,	\
+			    unsigned long off, u##bits val,		\
+			    struct btrfs_map_token *token);		\
+static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+				       unsigned long off)		\
+{									\
+	return btrfs_get_token_##bits(eb, ptr, off, NULL);		\
+}									\
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+				    unsigned long off, u##bits val)	\
+{									\
+       btrfs_set_token_##bits(eb, ptr, off, val, NULL);			\
+}
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
+#define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
+static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s)	\
+{									\
+	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
+	return btrfs_get_##bits(eb, s, offsetof(type, member));		\
+}									\
+static inline void btrfs_set_##name(struct extent_buffer *eb, type *s,	\
+				    u##bits val)			\
+{									\
+	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
+	btrfs_set_##bits(eb, s, offsetof(type, member), val);		\
+}									\
+static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+					 struct btrfs_map_token *token)	\
+{									\
+	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
+	return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
+}									\
+static inline void btrfs_set_token_##name(struct extent_buffer *eb,	\
+					  type *s, u##bits val,		\
+                                         struct btrfs_map_token *token)	\
+{									\
+	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
+	btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
+}
+
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
+static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
+{									\
+	type *p = page_address(eb->pages[0]);				\
+	u##bits res = le##bits##_to_cpu(p->member);			\
+	return res;							\
+}									\
+static inline void btrfs_set_##name(struct extent_buffer *eb,		\
+				    u##bits val)			\
+{									\
+	type *p = page_address(eb->pages[0]);				\
+	p->member = cpu_to_le##bits(val);				\
+}
+
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)		\
+static inline u##bits btrfs_##name(type *s)				\
+{									\
+	return le##bits##_to_cpu(s->member);				\
+}									\
+static inline void btrfs_set_##name(type *s, u##bits val)		\
+{									\
+	s->member = cpu_to_le##bits(val);				\
+}
+
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+		   start_offset, 64);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+			 total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+			 bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+			 io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+			 io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+			 sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
+			 dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+			 seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+			 bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+			 generation, 64);
+
+static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+	return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+	return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+	return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+			 stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+			 io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+			 io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+			 sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+			 num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+			 sub_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+						   int nr)
+{
+	unsigned long offset = (unsigned long)c;
+	offset += offsetof(struct btrfs_chunk, stripe);
+	offset += nr * sizeof(struct btrfs_stripe);
+	return (struct btrfs_stripe *)offset;
+}
+
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+					 struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+					 struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+			 used, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+			 used, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+			struct btrfs_block_group_item, chunk_objectid, 64);
+
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+		   struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags,
+		   struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+			struct btrfs_block_group_item, flags, 64);
+
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+		   parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+		   name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
+/* struct btrfs_inode_item */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+			 sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+			 transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
+			 nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+			 block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
+		   chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+		   chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+		   chunk_offset, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+
+static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+{
+	unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
+	return (unsigned long)dev + ptr;
+}
+
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
+BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
+
+BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
+
+
+BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
+
+static inline void btrfs_tree_block_key(struct extent_buffer *eb,
+					struct btrfs_tree_block_info *item,
+					struct btrfs_disk_key *key)
+{
+	read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
+					    struct btrfs_tree_block_info *item,
+					    struct btrfs_disk_key *key)
+{
+	write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
+		   root, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
+		   objectid, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
+		   offset, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
+		   count, 32);
+
+BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
+		   count, 32);
+
+BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
+		   type, 8);
+BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
+		   offset, 64);
+
+static inline u32 btrfs_extent_inline_ref_size(int type)
+{
+	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    type == BTRFS_SHARED_BLOCK_REF_KEY)
+		return sizeof(struct btrfs_extent_inline_ref);
+	if (type == BTRFS_SHARED_DATA_REF_KEY)
+		return sizeof(struct btrfs_shared_data_ref) +
+		       sizeof(struct btrfs_extent_inline_ref);
+	if (type == BTRFS_EXTENT_DATA_REF_KEY)
+		return sizeof(struct btrfs_extent_data_ref) +
+		       offsetof(struct btrfs_extent_inline_ref, offset);
+	BUG();
+	return 0;
+}
+
+BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
+
+/* struct btrfs_node */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
+			 blockptr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
+			 generation, 64);
+
+static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
+{
+	unsigned long ptr;
+	ptr = offsetof(struct btrfs_node, ptrs) +
+		sizeof(struct btrfs_key_ptr) * nr;
+	return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
+					   int nr, u64 val)
+{
+	unsigned long ptr;
+	ptr = offsetof(struct btrfs_node, ptrs) +
+		sizeof(struct btrfs_key_ptr) * nr;
+	btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
+{
+	unsigned long ptr;
+	ptr = offsetof(struct btrfs_node, ptrs) +
+		sizeof(struct btrfs_key_ptr) * nr;
+	return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
+						 int nr, u64 val)
+{
+	unsigned long ptr;
+	ptr = offsetof(struct btrfs_node, ptrs) +
+		sizeof(struct btrfs_key_ptr) * nr;
+	btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline unsigned long btrfs_node_key_ptr_offset(int nr)
+{
+	return offsetof(struct btrfs_node, ptrs) +
+		sizeof(struct btrfs_key_ptr) * nr;
+}
+
+void btrfs_node_key(struct extent_buffer *eb,
+		    struct btrfs_disk_key *disk_key, int nr);
+
+static inline void btrfs_set_node_key(struct extent_buffer *eb,
+				      struct btrfs_disk_key *disk_key, int nr)
+{
+	unsigned long ptr;
+	ptr = btrfs_node_key_ptr_offset(nr);
+	write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+		       struct btrfs_key_ptr, key, disk_key);
+}
+
+/* struct btrfs_item */
+BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
+
+static inline unsigned long btrfs_item_nr_offset(int nr)
+{
+	return offsetof(struct btrfs_leaf, items) +
+		sizeof(struct btrfs_item) * nr;
+}
+
+static inline struct btrfs_item *btrfs_item_nr(int nr)
+{
+	return (struct btrfs_item *)btrfs_item_nr_offset(nr);
+}
+
+static inline u32 btrfs_item_end(struct extent_buffer *eb,
+				 struct btrfs_item *item)
+{
+	return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+}
+
+static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
+{
+	return btrfs_item_end(eb, btrfs_item_nr(nr));
+}
+
+static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
+{
+	return btrfs_item_offset(eb, btrfs_item_nr(nr));
+}
+
+static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
+{
+	return btrfs_item_size(eb, btrfs_item_nr(nr));
+}
+
+static inline void btrfs_item_key(struct extent_buffer *eb,
+			   struct btrfs_disk_key *disk_key, int nr)
+{
+	struct btrfs_item *item = btrfs_item_nr(nr);
+	read_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+			       struct btrfs_disk_key *disk_key, int nr)
+{
+	struct btrfs_item *item = btrfs_item_nr(nr);
+	write_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
+/*
+ * struct btrfs_root_ref
+ */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+
+/* struct btrfs_dir_item */
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item,
+			 data_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
+			 name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
+			 transid, 64);
+
+static inline void btrfs_dir_item_key(struct extent_buffer *eb,
+				      struct btrfs_dir_item *item,
+				      struct btrfs_disk_key *key)
+{
+	read_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+					  struct btrfs_dir_item *item,
+					  struct btrfs_disk_key *key)
+{
+	write_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+		   num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+		   num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+		   generation, 64);
+
+static inline void btrfs_free_space_key(struct extent_buffer *eb,
+					struct btrfs_free_space_header *h,
+					struct btrfs_disk_key *key)
+{
+	read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+					    struct btrfs_free_space_header *h,
+					    struct btrfs_disk_key *key)
+{
+	write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+/* struct btrfs_disk_key */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
+			 objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+					 struct btrfs_disk_key *disk)
+{
+	cpu->offset = le64_to_cpu(disk->offset);
+	cpu->type = disk->type;
+	cpu->objectid = le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+					 struct btrfs_key *cpu)
+{
+	disk->offset = cpu_to_le64(cpu->offset);
+	disk->type = cpu->type;
+	disk->objectid = cpu_to_le64(cpu->objectid);
+}
+
+static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
+				  struct btrfs_key *key, int nr)
+{
+	struct btrfs_disk_key disk_key;
+	btrfs_node_key(eb, &disk_key, nr);
+	btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
+				  struct btrfs_key *key, int nr)
+{
+	struct btrfs_disk_key disk_key;
+	btrfs_item_key(eb, &disk_key, nr);
+	btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
+				      struct btrfs_dir_item *item,
+				      struct btrfs_key *key)
+{
+	struct btrfs_disk_key disk_key;
+	btrfs_dir_item_key(eb, item, &disk_key);
+	btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+
+static inline u8 btrfs_key_type(struct btrfs_key *key)
+{
+	return key->type;
+}
+
+static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
+{
+	key->type = val;
+}
+
+/* struct btrfs_header */
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
+			  generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
+			 nritems, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
+
+static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+{
+	return (btrfs_header_flags(eb) & flag) == flag;
+}
+
+static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+	u64 flags = btrfs_header_flags(eb);
+	btrfs_set_header_flags(eb, flags | flag);
+	return (flags & flag) == flag;
+}
+
+static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+	u64 flags = btrfs_header_flags(eb);
+	btrfs_set_header_flags(eb, flags & ~flag);
+	return (flags & flag) == flag;
+}
+
+static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
+{
+	u64 flags = btrfs_header_flags(eb);
+	return flags >> BTRFS_BACKREF_REV_SHIFT;
+}
+
+static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
+						int rev)
+{
+	u64 flags = btrfs_header_flags(eb);
+	flags &= ~BTRFS_BACKREF_REV_MASK;
+	flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
+	btrfs_set_header_flags(eb, flags);
+}
+
+static inline unsigned long btrfs_header_fsid(void)
+{
+	return offsetof(struct btrfs_header, fsid);
+}
+
+static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+{
+	return offsetof(struct btrfs_header, chunk_tree_uuid);
+}
+
+static inline int btrfs_is_leaf(struct extent_buffer *eb)
+{
+	return btrfs_header_level(eb) == 0;
+}
+
+/* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+			 last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+			 generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+			 ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+			 otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+			 stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+			 rtransid, 64);
+
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
+}
+
+static inline bool btrfs_root_dead(struct btrfs_root *root)
+{
+	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+		   tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+		   tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+		   tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+		   chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+		   chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+		   chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+		   extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+		   extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+		   extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+		   fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+		   fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+		   fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+		   dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+		   dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+		   dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+		   csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+		   csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+		   csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+		   total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+		   bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+		   num_devices, 64);
+
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+				      struct btrfs_balance_item *bi,
+				      struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+					  struct btrfs_balance_item *bi,
+					  struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+				      struct btrfs_balance_item *bi,
+				      struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+					  struct btrfs_balance_item *bi,
+					  struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+				     struct btrfs_balance_item *bi,
+				     struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+					 struct btrfs_balance_item *bi,
+					 struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+			       struct btrfs_disk_balance_args *disk)
+{
+	memset(cpu, 0, sizeof(*cpu));
+
+	cpu->profiles = le64_to_cpu(disk->profiles);
+	cpu->usage = le64_to_cpu(disk->usage);
+	cpu->devid = le64_to_cpu(disk->devid);
+	cpu->pstart = le64_to_cpu(disk->pstart);
+	cpu->pend = le64_to_cpu(disk->pend);
+	cpu->vstart = le64_to_cpu(disk->vstart);
+	cpu->vend = le64_to_cpu(disk->vend);
+	cpu->target = le64_to_cpu(disk->target);
+	cpu->flags = le64_to_cpu(disk->flags);
+	cpu->limit = le64_to_cpu(disk->limit);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+			       struct btrfs_balance_args *cpu)
+{
+	memset(disk, 0, sizeof(*disk));
+
+	disk->profiles = cpu_to_le64(cpu->profiles);
+	disk->usage = cpu_to_le64(cpu->usage);
+	disk->devid = cpu_to_le64(cpu->devid);
+	disk->pstart = cpu_to_le64(cpu->pstart);
+	disk->pend = cpu_to_le64(cpu->pend);
+	disk->vstart = cpu_to_le64(cpu->vstart);
+	disk->vend = cpu_to_le64(cpu->vend);
+	disk->target = cpu_to_le64(cpu->target);
+	disk->flags = cpu_to_le64(cpu->flags);
+	disk->limit = cpu_to_le64(cpu->limit);
+}
+
+/* struct btrfs_super_block */
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+			 struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+			 struct btrfs_super_block, chunk_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+			 root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+			 chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+			 chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
+			 log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
+			 log_root_transid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+			 log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+			 total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+			 bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+			 sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+			 nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+			 stripesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+			 root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+			 num_devices, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+			 compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+			 compat_ro_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+			 incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+			 csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+			 cache_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
+BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
+			 uuid_tree_generation, 64);
+
+static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+{
+	u16 t = btrfs_super_csum_type(s);
+	/*
+	 * csum type is validated at mount time
+	 */
+	return btrfs_csum_sizes[t];
+}
+
+static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
+{
+	return offsetof(struct btrfs_leaf, items);
+}
+
+/* struct btrfs_file_extent_item */
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
+			 struct btrfs_file_extent_item, disk_bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
+			 struct btrfs_file_extent_item, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
+			 struct btrfs_file_extent_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
+			 struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
+			 struct btrfs_file_extent_item, disk_num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
+			 struct btrfs_file_extent_item, compression, 8);
+
+static inline unsigned long
+btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
+{
+	return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+	return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
+}
+
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+		   disk_bytenr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+		   disk_num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+		  offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+		   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+		   ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+		   compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+		   encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+		   other_encoding, 16);
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers.  If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+						    struct btrfs_item *e)
+{
+	return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+					       int slot,
+					       struct btrfs_file_extent_item *fi)
+{
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+	/*
+	 * return the space used on disk if this item isn't
+	 * compressed or encoded
+	 */
+	if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 &&
+	    btrfs_token_file_extent_encryption(eb, fi, &token) == 0 &&
+	    btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) {
+		return btrfs_file_extent_inline_item_len(eb,
+							 btrfs_item_nr(slot));
+	}
+
+	/* otherwise use the ram bytes field */
+	return btrfs_token_file_extent_ram_bytes(eb, fi, &token);
+}
+
+
+/* btrfs_dev_stats_item */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+					struct btrfs_dev_stats_item *ptr,
+					int index)
+{
+	u64 val;
+
+	read_extent_buffer(eb, &val,
+			   offsetof(struct btrfs_dev_stats_item, values) +
+			    ((unsigned long)ptr) + (index * sizeof(u64)),
+			   sizeof(val));
+	return val;
+}
+
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+					     struct btrfs_dev_stats_item *ptr,
+					     int index, u64 val)
+{
+	write_extent_buffer(eb, &val,
+			    offsetof(struct btrfs_dev_stats_item, values) +
+			     ((unsigned long)ptr) + (index * sizeof(u64)),
+			    sizeof(val));
+}
+
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+		   version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+		   flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+		   rescan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+		   rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+		   excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+			 struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+			 rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+			 struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+			 excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+			 struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+		   flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+		   max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+		   max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+		   rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+		   rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+		   struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+		   struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+		   replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+		   time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+		   time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+		   num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+		   struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+		   cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+		   cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+			 struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+			 struct btrfs_dev_replace_item,
+			 cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+			 struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+			 struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+			 struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+			 struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+			 struct btrfs_dev_replace_item,
+			 num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+			 struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+			 struct btrfs_dev_replace_item, cursor_right, 64);
+
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* helper function to cast into the data area of the leaf. */
+#define btrfs_item_ptr(leaf, slot, type) \
+	((type *)(btrfs_leaf_data(leaf) + \
+	btrfs_item_offset_nr(leaf, slot)))
+
+#define btrfs_item_ptr_offset(leaf, slot) \
+	((unsigned long)(btrfs_leaf_data(leaf) + \
+	btrfs_item_offset_nr(leaf, slot)))
+
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
+/* extent-tree.c */
+
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
+
+static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
+						 unsigned num_items)
+{
+	return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+		2 * num_items;
+}
+
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+						 unsigned num_items)
+{
+	return root->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+				 unsigned long count, int wait);
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 offset, int metadata, u64 *refs, u64 *flags);
+int btrfs_pin_extent(struct btrfs_root *root,
+		     u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+				 struct extent_buffer *eb);
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 objectid, u64 offset, u64 bytenr);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+						 struct btrfs_fs_info *info,
+						 u64 bytenr);
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int get_block_group_index(struct btrfs_block_group_cache *cache);
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root, u64 parent,
+					u64 root_objectid,
+					struct btrfs_disk_key *key, int level,
+					u64 hint, u64 empty_size);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *buf,
+			   u64 parent, int last_ref);
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 root_objectid, u64 owner,
+				     u64 offset, struct btrfs_key *ins);
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   u64 root_objectid, u64 owner, u64 offset,
+				   struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
+			 struct btrfs_key *ins, int is_data, int delalloc);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref);
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref);
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 flags,
+				int level, int is_data);
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+		      u64 owner, u64 offset, int no_quota);
+
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+			       int delalloc);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len);
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
+
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root);
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_objectid, u64 chunk_offset,
+			   u64 size);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 group_start,
+			     struct extent_map *em);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+	/* If we are in the transaction, we can't flush anything.*/
+	BTRFS_RESERVE_NO_FLUSH,
+	/*
+	 * Flushing delalloc may cause deadlock somewhere, in this
+	 * case, use FLUSH LIMIT
+	 */
+	BTRFS_RESERVE_FLUSH_LIMIT,
+	BTRFS_RESERVE_FLUSH_ALL,
+};
+
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+				  struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int nitems,
+				     u64 *qgroup_reserved, bool use_global_rsv);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+					      unsigned short type);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+			  struct btrfs_block_rsv *rsv);
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_check(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv,
+			    u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+			     struct btrfs_block_rsv *block_rsv,
+			     u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+			      struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+				   u64 start, u64 end);
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+			 u64 num_bytes, u64 *actual_bytes);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 type);
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+/* ctree.c */
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+		     int level, int *slot);
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
+int btrfs_previous_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid,
+			int type);
+int btrfs_previous_extent_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid);
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path,
+			     struct btrfs_key *new_key);
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+			struct btrfs_key *key, int lowest_level,
+			u64 min_trans);
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+			 struct btrfs_path *path,
+			 u64 min_trans);
+enum btrfs_compare_tree_result {
+	BTRFS_COMPARE_TREE_NEW,
+	BTRFS_COMPARE_TREE_DELETED,
+	BTRFS_COMPARE_TREE_CHANGED,
+	BTRFS_COMPARE_TREE_SAME,
+};
+typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
+				  struct btrfs_root *right_root,
+				  struct btrfs_path *left_path,
+				  struct btrfs_path *right_path,
+				  struct btrfs_key *key,
+				  enum btrfs_compare_tree_result result,
+				  void *ctx);
+int btrfs_compare_trees(struct btrfs_root *left_root,
+			struct btrfs_root *right_root,
+			btrfs_changed_cb_t cb, void *ctx);
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct extent_buffer *buf,
+		    struct extent_buffer *parent, int parent_slot,
+		    struct extent_buffer **cow_ret);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      struct extent_buffer *buf,
+		      struct extent_buffer **cow_ret, u64 new_root_objectid);
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+			      struct extent_buffer *buf);
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+		       u32 data_size);
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
+			 u32 new_size, int from_end);
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_path *path,
+		     struct btrfs_key *new_key,
+		     unsigned long split_offset);
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 struct btrfs_path *path,
+			 struct btrfs_key *new_key);
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+		u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key);
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_path *p, int
+		      ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+			  struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+			       struct btrfs_key *key, struct btrfs_path *p,
+			       int find_higher, int return_any);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct extent_buffer *parent,
+		       int start_slot, u64 *last_ret,
+		       struct btrfs_key *progress);
+void btrfs_release_path(struct btrfs_path *p);
+struct btrfs_path *btrfs_alloc_path(void);
+void btrfs_free_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p,
+			       struct extent_buffer *held, int held_rw);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_path *path, int slot, int nr);
+static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path)
+{
+	return btrfs_del_items(trans, root, path, path->slots[0], 1);
+}
+
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    u32 total_data, u32 total_size, int nr);
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, void *data, u32 data_size);
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path,
+			     struct btrfs_key *cpu_key, u32 *data_size, int nr);
+
+static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  struct btrfs_key *key,
+					  u32 data_size)
+{
+	return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+}
+
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+			u64 time_seq);
+static inline int btrfs_next_old_item(struct btrfs_root *root,
+				      struct btrfs_path *p, u64 time_seq)
+{
+	++p->slots[0];
+	if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+		return btrfs_next_old_leaf(root, p, time_seq);
+	return 0;
+}
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+	return btrfs_next_old_item(root, p, 0);
+}
+int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
+				     struct btrfs_block_rsv *block_rsv,
+				     int update_ref, int for_reloc);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct extent_buffer *node,
+			struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * Get synced with close_ctree()
+	 */
+	smp_mb();
+	return fs_info->closing;
+}
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+	return (root->fs_info->sb->s_flags & MS_RDONLY ||
+		btrfs_fs_closing(root->fs_info));
+}
+
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+	kfree(fs_info->balance_ctl);
+	kfree(fs_info->delayed_root);
+	kfree(fs_info->extent_root);
+	kfree(fs_info->tree_root);
+	kfree(fs_info->chunk_root);
+	kfree(fs_info->dev_root);
+	kfree(fs_info->csum_root);
+	kfree(fs_info->quota_root);
+	kfree(fs_info->uuid_root);
+	kfree(fs_info->super_copy);
+	kfree(fs_info->super_for_commit);
+	security_free_mnt_opts(&fs_info->security_opts);
+	kfree(fs_info);
+}
+
+/* tree mod log functions from ctree.c */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			   struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem);
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+
+/* root-item.c */
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+			struct btrfs_path *path,
+			u64 root_id, u64 ref_id);
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *tree_root,
+		       u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+		       const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *tree_root,
+		       u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+		       const char *name, int name_len);
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_root_item
+		      *item);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_key *key,
+				   struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+			 struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root);
+
+/* uuid-tree.c */
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid);
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid);
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+			    int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+					      u64));
+
+/* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+			  const char *name, int name_len);
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, const char *name,
+			  int name_len, struct inode *dir,
+			  struct btrfs_key *location, u8 type, u64 index);
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path, u64 dir,
+					     const char *name, int name_len,
+					     int mod);
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dir,
+			    u64 objectid, const char *name, int name_len,
+			    int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dirid,
+			    const char *name, int name_len);
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      struct btrfs_dir_item *di);
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 objectid,
+			    const char *name, u16 name_len,
+			    const void *data, u16 data_len);
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, u64 dir,
+					  const char *name, u16 name_len,
+					  int mod);
+int verify_dir_item(struct btrfs_root *root,
+		    struct extent_buffer *leaf,
+		    struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name,
+						 int name_len);
+
+/* orphan.c */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
+
+/* inode-item.c */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, struct btrfs_path *path,
+		       struct btrfs_key *location, int mod);
+
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+				   u64 ref_objectid, const char *name,
+				   int name_len,
+				   struct btrfs_inode_extref **extref_ret);
+
+/* file-item.c */
+struct btrfs_dio_private;
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, u64 bytenr, u64 len);
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 logical_offset);
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid,
+			     u64 bytenr, int mod);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_ordered_sum *sums);
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+		       struct bio *bio, u64 file_start, int contig);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+			     struct list_head *list, int search_commit);
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+				     const struct btrfs_path *path,
+				     struct btrfs_file_extent_item *fi,
+				     const bool new_inline,
+				     struct extent_map *em);
+
+/* inode.c */
+struct btrfs_delalloc_work {
+	struct inode *inode;
+	int wait;
+	int delay_iput;
+	struct completion completion;
+	struct list_head list;
+	struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						    int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+					   size_t pg_offset, u64 start, u64 len,
+					   int create);
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes);
+
+/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
+#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
+#define ClearPageChecked ClearPageFsMisc
+#define SetPageChecked SetPageFsMisc
+#define PageChecked PageFsMisc
+#endif
+
+/* This forces readahead on a given range of bytes in an inode */
+static inline void btrfs_force_ra(struct address_space *mapping,
+				  struct file_ra_state *ra, struct file *file,
+				  pgoff_t offset, unsigned long req_size)
+{
+	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+}
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct inode *dir, u64 *index);
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct inode *dir, struct inode *inode,
+		       const char *name, int name_len);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+		   struct inode *parent_inode, struct inode *inode,
+		   const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct inode *dir, u64 objectid,
+			const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front);
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct inode *inode, u64 new_size,
+			       u32 min_type);
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+			       int nr);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state);
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *new_root,
+			     struct btrfs_root *parent_root,
+			     u64 new_dirid);
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+int btrfs_readpage(struct file *file, struct page *page);
+void btrfs_evict_inode(struct inode *inode);
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
+int btrfs_init_cachep(void);
+void btrfs_destroy_cachep(void);
+long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+			 struct btrfs_root *root, int *was_new);
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+				    size_t pg_offset, u64 start, u64 end,
+				    int create);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root);
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
+void btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_add_delayed_iput(struct inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+				    struct btrfs_trans_handle *trans, int mode,
+				    u64 start, u64 num_bytes, u64 min_size,
+				    loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
+extern const struct dentry_operations btrfs_dentry_operations;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
+#endif
+
+/* ioctl.c */
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+void btrfs_update_iflags(struct inode *inode);
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
+int btrfs_is_empty_uuid(u8 *uuid);
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+		      struct btrfs_ioctl_defrag_range_args *range,
+		      u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space);
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+			       struct btrfs_ioctl_balance_args *bargs);
+
+
+/* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+			   struct inode *inode);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			     int skip_pinned);
+extern const struct file_operations btrfs_file_operations;
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct inode *inode,
+			 struct btrfs_path *path, u64 start, u64 end,
+			 u64 *drop_end, int drop_cache,
+			 int replace_extent,
+			 u32 extent_item_size,
+			 int *key_inserted);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode, u64 start,
+		       u64 end, int drop_cache);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 start, u64 end);
+int btrfs_release_file(struct inode *inode, struct file *file);
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+		      struct page **pages, size_t num_pages,
+		      loff_t pos, size_t write_bytes,
+		      struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root);
+
+/* sysfs.c */
+int btrfs_init_sysfs(void);
+void btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+
+/* xattr.c */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+/* super.c */
+int btrfs_parse_options(struct btrfs_root *root, char *options);
+int btrfs_sync_fs(struct super_block *sb, int wait);
+
+#ifdef CONFIG_PRINTK
+__printf(2, 3)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+#else
+static inline __printf(2, 3)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+#endif
+
+#define btrfs_emerg(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+
+#ifdef DEBUG
+#define btrfs_debug(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#else
+#define btrfs_debug(fs_info, fmt, args...) \
+    no_printk(KERN_DEBUG fmt, ##args)
+#endif
+
+#ifdef CONFIG_BTRFS_ASSERT
+
+static inline void assfail(char *expr, char *file, int line)
+{
+	pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
+	       expr, file, line);
+	BUG();
+}
+
+#define ASSERT(expr)	\
+	(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+#else
+#define ASSERT(expr)	((void)0)
+#endif
+
+#define btrfs_assert()
+__printf(5, 6)
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		     unsigned int line, int errno, const char *fmt, ...);
+
+
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, const char *function,
+			       unsigned int line, int errno);
+
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+	__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
+					   u64 flag)
+{
+	struct btrfs_super_block *disk_super;
+	u64 features;
+
+	disk_super = fs_info->super_copy;
+	features = btrfs_super_incompat_flags(disk_super);
+	if (!(features & flag)) {
+		spin_lock(&fs_info->super_lock);
+		features = btrfs_super_incompat_flags(disk_super);
+		if (!(features & flag)) {
+			features |= flag;
+			btrfs_set_super_incompat_flags(disk_super, features);
+			btrfs_info(fs_info, "setting %llu feature flag",
+					 flag);
+		}
+		spin_unlock(&fs_info->super_lock);
+	}
+}
+
+#define btrfs_fs_incompat(fs_info, opt) \
+	__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+{
+	struct btrfs_super_block *disk_super;
+	disk_super = fs_info->super_copy;
+	return !!(btrfs_super_incompat_flags(disk_super) & flag);
+}
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
+#define btrfs_abort_transaction(trans, root, errno)		\
+do {								\
+	__btrfs_abort_transaction(trans, root, __func__,	\
+				  __LINE__, errno);		\
+} while (0)
+
+#define btrfs_std_error(fs_info, errno)				\
+do {								\
+	if ((errno))						\
+		__btrfs_std_error((fs_info), __func__,		\
+				   __LINE__, (errno), NULL);	\
+} while (0)
+
+#define btrfs_error(fs_info, errno, fmt, args...)		\
+do {								\
+	__btrfs_std_error((fs_info), __func__, __LINE__,	\
+			  (errno), fmt, ##args);		\
+} while (0)
+
+__printf(5, 6)
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+		   unsigned int line, int errno, const char *fmt, ...);
+
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic().  Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...)			\
+do {									\
+	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	BUG();								\
+} while (0)
+
+/* acl.c */
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
+int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+		   struct inode *inode, struct inode *dir);
+#else
+#define btrfs_get_acl NULL
+#define btrfs_set_acl NULL
+static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
+				 struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif
+
+/* relocation.c */
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root);
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *buf,
+			  struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+			      struct btrfs_pending_snapshot *pending,
+			      u64 *bytes_to_reserve);
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+			      struct btrfs_pending_snapshot *pending);
+
+/* scrub.c */
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+		    u64 end, struct btrfs_scrub_progress *progress,
+		    int readonly, int is_dev_replace);
+void btrfs_scrub_pause(struct btrfs_root *root);
+void btrfs_scrub_continue(struct btrfs_root *root);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+			   struct btrfs_device *dev);
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+			 struct btrfs_scrub_progress *progress);
+
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+	btrfs_bio_counter_sub(fs_info, 1);
+}
+
+/* reada.c */
+struct reada_control {
+	struct btrfs_root	*root;		/* tree to prefetch */
+	struct btrfs_key	key_start;
+	struct btrfs_key	key_end;	/* exclusive */
+	atomic_t		elems;
+	struct kref		refcnt;
+	wait_queue_head_t	wait;
+};
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+			      struct btrfs_key *start, struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+			 u64 start, int err);
+
+static inline int is_fstree(u64 rootid)
+{
+	if (rootid == BTRFS_FS_TREE_OBJECTID ||
+	    ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
+	      !btrfs_qgroup_level(rootid)))
+		return 1;
+	return 0;
+}
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+	return signal_pending(current);
+}
+
+/* Sanity test specific functions */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode);
+#endif
+
+static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+		return 1;
+#endif
+	return 0;
+}
+
+#endif
diff --git a/kernel/fs/btrfs/delayed-inode.c b/kernel/fs/btrfs/delayed-inode.c
new file mode 100644
index 000000000..a2ae42720
--- /dev/null
+++ b/kernel/fs/btrfs/delayed-inode.c
@@ -0,0 +1,1997 @@
+/*
+ * Copyright (C) 2011 Fujitsu.  All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "delayed-inode.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "ctree.h"
+
+#define BTRFS_DELAYED_WRITEBACK		512
+#define BTRFS_DELAYED_BACKGROUND	128
+#define BTRFS_DELAYED_BATCH		16
+
+static struct kmem_cache *delayed_node_cache;
+
+int __init btrfs_delayed_inode_init(void)
+{
+	delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
+					sizeof(struct btrfs_delayed_node),
+					0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!delayed_node_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void btrfs_delayed_inode_exit(void)
+{
+	if (delayed_node_cache)
+		kmem_cache_destroy(delayed_node_cache);
+}
+
+static inline void btrfs_init_delayed_node(
+				struct btrfs_delayed_node *delayed_node,
+				struct btrfs_root *root, u64 inode_id)
+{
+	delayed_node->root = root;
+	delayed_node->inode_id = inode_id;
+	atomic_set(&delayed_node->refs, 0);
+	delayed_node->count = 0;
+	delayed_node->flags = 0;
+	delayed_node->ins_root = RB_ROOT;
+	delayed_node->del_root = RB_ROOT;
+	mutex_init(&delayed_node->mutex);
+	delayed_node->index_cnt = 0;
+	INIT_LIST_HEAD(&delayed_node->n_list);
+	INIT_LIST_HEAD(&delayed_node->p_list);
+	delayed_node->bytes_reserved = 0;
+	memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
+}
+
+static inline int btrfs_is_continuous_delayed_item(
+					struct btrfs_delayed_item *item1,
+					struct btrfs_delayed_item *item2)
+{
+	if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
+	    item1->key.objectid == item2->key.objectid &&
+	    item1->key.type == item2->key.type &&
+	    item1->key.offset + 1 == item2->key.offset)
+		return 1;
+	return 0;
+}
+
+static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
+							struct btrfs_root *root)
+{
+	return root->fs_info->delayed_root;
+}
+
+static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
+{
+	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+	struct btrfs_root *root = btrfs_inode->root;
+	u64 ino = btrfs_ino(inode);
+	struct btrfs_delayed_node *node;
+
+	node = ACCESS_ONCE(btrfs_inode->delayed_node);
+	if (node) {
+		atomic_inc(&node->refs);
+		return node;
+	}
+
+	spin_lock(&root->inode_lock);
+	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+	if (node) {
+		if (btrfs_inode->delayed_node) {
+			atomic_inc(&node->refs);	/* can be accessed */
+			BUG_ON(btrfs_inode->delayed_node != node);
+			spin_unlock(&root->inode_lock);
+			return node;
+		}
+		btrfs_inode->delayed_node = node;
+		/* can be accessed and cached in the inode */
+		atomic_add(2, &node->refs);
+		spin_unlock(&root->inode_lock);
+		return node;
+	}
+	spin_unlock(&root->inode_lock);
+
+	return NULL;
+}
+
+/* Will return either the node or PTR_ERR(-ENOMEM) */
+static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
+							struct inode *inode)
+{
+	struct btrfs_delayed_node *node;
+	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+	struct btrfs_root *root = btrfs_inode->root;
+	u64 ino = btrfs_ino(inode);
+	int ret;
+
+again:
+	node = btrfs_get_delayed_node(inode);
+	if (node)
+		return node;
+
+	node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+	btrfs_init_delayed_node(node, root, ino);
+
+	/* cached in the btrfs inode and can be accessed */
+	atomic_add(2, &node->refs);
+
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret) {
+		kmem_cache_free(delayed_node_cache, node);
+		return ERR_PTR(ret);
+	}
+
+	spin_lock(&root->inode_lock);
+	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+	if (ret == -EEXIST) {
+		spin_unlock(&root->inode_lock);
+		kmem_cache_free(delayed_node_cache, node);
+		radix_tree_preload_end();
+		goto again;
+	}
+	btrfs_inode->delayed_node = node;
+	spin_unlock(&root->inode_lock);
+	radix_tree_preload_end();
+
+	return node;
+}
+
+/*
+ * Call it when holding delayed_node->mutex
+ *
+ * If mod = 1, add this node into the prepared list.
+ */
+static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
+				     struct btrfs_delayed_node *node,
+				     int mod)
+{
+	spin_lock(&root->lock);
+	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+		if (!list_empty(&node->p_list))
+			list_move_tail(&node->p_list, &root->prepare_list);
+		else if (mod)
+			list_add_tail(&node->p_list, &root->prepare_list);
+	} else {
+		list_add_tail(&node->n_list, &root->node_list);
+		list_add_tail(&node->p_list, &root->prepare_list);
+		atomic_inc(&node->refs);	/* inserted into list */
+		root->nodes++;
+		set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
+	}
+	spin_unlock(&root->lock);
+}
+
+/* Call it when holding delayed_node->mutex */
+static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
+				       struct btrfs_delayed_node *node)
+{
+	spin_lock(&root->lock);
+	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+		root->nodes--;
+		atomic_dec(&node->refs);	/* not in the list */
+		list_del_init(&node->n_list);
+		if (!list_empty(&node->p_list))
+			list_del_init(&node->p_list);
+		clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
+	}
+	spin_unlock(&root->lock);
+}
+
+static struct btrfs_delayed_node *btrfs_first_delayed_node(
+			struct btrfs_delayed_root *delayed_root)
+{
+	struct list_head *p;
+	struct btrfs_delayed_node *node = NULL;
+
+	spin_lock(&delayed_root->lock);
+	if (list_empty(&delayed_root->node_list))
+		goto out;
+
+	p = delayed_root->node_list.next;
+	node = list_entry(p, struct btrfs_delayed_node, n_list);
+	atomic_inc(&node->refs);
+out:
+	spin_unlock(&delayed_root->lock);
+
+	return node;
+}
+
+static struct btrfs_delayed_node *btrfs_next_delayed_node(
+						struct btrfs_delayed_node *node)
+{
+	struct btrfs_delayed_root *delayed_root;
+	struct list_head *p;
+	struct btrfs_delayed_node *next = NULL;
+
+	delayed_root = node->root->fs_info->delayed_root;
+	spin_lock(&delayed_root->lock);
+	if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+		/* not in the list */
+		if (list_empty(&delayed_root->node_list))
+			goto out;
+		p = delayed_root->node_list.next;
+	} else if (list_is_last(&node->n_list, &delayed_root->node_list))
+		goto out;
+	else
+		p = node->n_list.next;
+
+	next = list_entry(p, struct btrfs_delayed_node, n_list);
+	atomic_inc(&next->refs);
+out:
+	spin_unlock(&delayed_root->lock);
+
+	return next;
+}
+
+static void __btrfs_release_delayed_node(
+				struct btrfs_delayed_node *delayed_node,
+				int mod)
+{
+	struct btrfs_delayed_root *delayed_root;
+
+	if (!delayed_node)
+		return;
+
+	delayed_root = delayed_node->root->fs_info->delayed_root;
+
+	mutex_lock(&delayed_node->mutex);
+	if (delayed_node->count)
+		btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
+	else
+		btrfs_dequeue_delayed_node(delayed_root, delayed_node);
+	mutex_unlock(&delayed_node->mutex);
+
+	if (atomic_dec_and_test(&delayed_node->refs)) {
+		bool free = false;
+		struct btrfs_root *root = delayed_node->root;
+		spin_lock(&root->inode_lock);
+		if (atomic_read(&delayed_node->refs) == 0) {
+			radix_tree_delete(&root->delayed_nodes_tree,
+					  delayed_node->inode_id);
+			free = true;
+		}
+		spin_unlock(&root->inode_lock);
+		if (free)
+			kmem_cache_free(delayed_node_cache, delayed_node);
+	}
+}
+
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+{
+	__btrfs_release_delayed_node(node, 0);
+}
+
+static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+					struct btrfs_delayed_root *delayed_root)
+{
+	struct list_head *p;
+	struct btrfs_delayed_node *node = NULL;
+
+	spin_lock(&delayed_root->lock);
+	if (list_empty(&delayed_root->prepare_list))
+		goto out;
+
+	p = delayed_root->prepare_list.next;
+	list_del_init(p);
+	node = list_entry(p, struct btrfs_delayed_node, p_list);
+	atomic_inc(&node->refs);
+out:
+	spin_unlock(&delayed_root->lock);
+
+	return node;
+}
+
+static inline void btrfs_release_prepared_delayed_node(
+					struct btrfs_delayed_node *node)
+{
+	__btrfs_release_delayed_node(node, 1);
+}
+
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+{
+	struct btrfs_delayed_item *item;
+	item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
+	if (item) {
+		item->data_len = data_len;
+		item->ins_or_del = 0;
+		item->bytes_reserved = 0;
+		item->delayed_node = NULL;
+		atomic_set(&item->refs, 1);
+	}
+	return item;
+}
+
+/*
+ * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * @delayed_node: pointer to the delayed node
+ * @key:	  the key to look up
+ * @prev:	  used to store the prev item if the right item isn't found
+ * @next:	  used to store the next item if the right item isn't found
+ *
+ * Note: if we don't find the right item, we will return the prev item and
+ * the next item.
+ */
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
+				struct rb_root *root,
+				struct btrfs_key *key,
+				struct btrfs_delayed_item **prev,
+				struct btrfs_delayed_item **next)
+{
+	struct rb_node *node, *prev_node = NULL;
+	struct btrfs_delayed_item *delayed_item = NULL;
+	int ret = 0;
+
+	node = root->rb_node;
+
+	while (node) {
+		delayed_item = rb_entry(node, struct btrfs_delayed_item,
+					rb_node);
+		prev_node = node;
+		ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
+		if (ret < 0)
+			node = node->rb_right;
+		else if (ret > 0)
+			node = node->rb_left;
+		else
+			return delayed_item;
+	}
+
+	if (prev) {
+		if (!prev_node)
+			*prev = NULL;
+		else if (ret < 0)
+			*prev = delayed_item;
+		else if ((node = rb_prev(prev_node)) != NULL) {
+			*prev = rb_entry(node, struct btrfs_delayed_item,
+					 rb_node);
+		} else
+			*prev = NULL;
+	}
+
+	if (next) {
+		if (!prev_node)
+			*next = NULL;
+		else if (ret > 0)
+			*next = delayed_item;
+		else if ((node = rb_next(prev_node)) != NULL) {
+			*next = rb_entry(node, struct btrfs_delayed_item,
+					 rb_node);
+		} else
+			*next = NULL;
+	}
+	return NULL;
+}
+
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
+					struct btrfs_delayed_node *delayed_node,
+					struct btrfs_key *key)
+{
+	struct btrfs_delayed_item *item;
+
+	item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+					   NULL, NULL);
+	return item;
+}
+
+static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
+				    struct btrfs_delayed_item *ins,
+				    int action)
+{
+	struct rb_node **p, *node;
+	struct rb_node *parent_node = NULL;
+	struct rb_root *root;
+	struct btrfs_delayed_item *item;
+	int cmp;
+
+	if (action == BTRFS_DELAYED_INSERTION_ITEM)
+		root = &delayed_node->ins_root;
+	else if (action == BTRFS_DELAYED_DELETION_ITEM)
+		root = &delayed_node->del_root;
+	else
+		BUG();
+	p = &root->rb_node;
+	node = &ins->rb_node;
+
+	while (*p) {
+		parent_node = *p;
+		item = rb_entry(parent_node, struct btrfs_delayed_item,
+				 rb_node);
+
+		cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
+		if (cmp < 0)
+			p = &(*p)->rb_right;
+		else if (cmp > 0)
+			p = &(*p)->rb_left;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(node, parent_node, p);
+	rb_insert_color(node, root);
+	ins->delayed_node = delayed_node;
+	ins->ins_or_del = action;
+
+	if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
+	    action == BTRFS_DELAYED_INSERTION_ITEM &&
+	    ins->key.offset >= delayed_node->index_cnt)
+			delayed_node->index_cnt = ins->key.offset + 1;
+
+	delayed_node->count++;
+	atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
+	return 0;
+}
+
+static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
+					      struct btrfs_delayed_item *item)
+{
+	return __btrfs_add_delayed_item(node, item,
+					BTRFS_DELAYED_INSERTION_ITEM);
+}
+
+static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
+					     struct btrfs_delayed_item *item)
+{
+	return __btrfs_add_delayed_item(node, item,
+					BTRFS_DELAYED_DELETION_ITEM);
+}
+
+static void finish_one_item(struct btrfs_delayed_root *delayed_root)
+{
+	int seq = atomic_inc_return(&delayed_root->items_seq);
+	if ((atomic_dec_return(&delayed_root->items) <
+	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
+	    waitqueue_active(&delayed_root->wait))
+		wake_up(&delayed_root->wait);
+}
+
+static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
+{
+	struct rb_root *root;
+	struct btrfs_delayed_root *delayed_root;
+
+	delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
+
+	BUG_ON(!delayed_root);
+	BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
+	       delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
+
+	if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+		root = &delayed_item->delayed_node->ins_root;
+	else
+		root = &delayed_item->delayed_node->del_root;
+
+	rb_erase(&delayed_item->rb_node, root);
+	delayed_item->delayed_node->count--;
+
+	finish_one_item(delayed_root);
+}
+
+static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
+{
+	if (item) {
+		__btrfs_remove_delayed_item(item);
+		if (atomic_dec_and_test(&item->refs))
+			kfree(item);
+	}
+}
+
+static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+					struct btrfs_delayed_node *delayed_node)
+{
+	struct rb_node *p;
+	struct btrfs_delayed_item *item = NULL;
+
+	p = rb_first(&delayed_node->ins_root);
+	if (p)
+		item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+	return item;
+}
+
+static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+					struct btrfs_delayed_node *delayed_node)
+{
+	struct rb_node *p;
+	struct btrfs_delayed_item *item = NULL;
+
+	p = rb_first(&delayed_node->del_root);
+	if (p)
+		item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+	return item;
+}
+
+static struct btrfs_delayed_item *__btrfs_next_delayed_item(
+						struct btrfs_delayed_item *item)
+{
+	struct rb_node *p;
+	struct btrfs_delayed_item *next = NULL;
+
+	p = rb_next(&item->rb_node);
+	if (p)
+		next = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+	return next;
+}
+
+static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
+					       struct btrfs_root *root,
+					       struct btrfs_delayed_item *item)
+{
+	struct btrfs_block_rsv *src_rsv;
+	struct btrfs_block_rsv *dst_rsv;
+	u64 num_bytes;
+	int ret;
+
+	if (!trans->bytes_reserved)
+		return 0;
+
+	src_rsv = trans->block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+	if (!ret) {
+		trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+					      item->key.objectid,
+					      num_bytes, 1);
+		item->bytes_reserved = num_bytes;
+	}
+
+	return ret;
+}
+
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
+						struct btrfs_delayed_item *item)
+{
+	struct btrfs_block_rsv *rsv;
+
+	if (!item->bytes_reserved)
+		return;
+
+	rsv = &root->fs_info->delayed_block_rsv;
+	trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+				      item->key.objectid, item->bytes_reserved,
+				      0);
+	btrfs_block_rsv_release(root, rsv,
+				item->bytes_reserved);
+}
+
+static int btrfs_delayed_inode_reserve_metadata(
+					struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct inode *inode,
+					struct btrfs_delayed_node *node)
+{
+	struct btrfs_block_rsv *src_rsv;
+	struct btrfs_block_rsv *dst_rsv;
+	u64 num_bytes;
+	int ret;
+	bool release = false;
+
+	src_rsv = trans->block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+	/*
+	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+	 * which doesn't reserve space for speed.  This is a problem since we
+	 * still need to reserve space for this update, so try to reserve the
+	 * space.
+	 *
+	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+	 * we're accounted for.
+	 */
+	if (!src_rsv || (!trans->bytes_reserved &&
+			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
+		/*
+		 * Since we're under a transaction reserve_metadata_bytes could
+		 * try to commit the transaction which will make it return
+		 * EAGAIN to make us stop the transaction we have, so return
+		 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
+		 */
+		if (ret == -EAGAIN)
+			ret = -ENOSPC;
+		if (!ret) {
+			node->bytes_reserved = num_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+						      "delayed_inode",
+						      btrfs_ino(inode),
+						      num_bytes, 1);
+		}
+		return ret;
+	} else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+				       &BTRFS_I(inode)->runtime_flags)) {
+			spin_unlock(&BTRFS_I(inode)->lock);
+			release = true;
+			goto migrate;
+		}
+		spin_unlock(&BTRFS_I(inode)->lock);
+
+		/* Ok we didn't have space pre-reserved.  This shouldn't happen
+		 * too often but it can happen if we do delalloc to an existing
+		 * inode which gets dirtied because of the time update, and then
+		 * isn't touched again until after the transaction commits and
+		 * then we try to write out the data.  First try to be nice and
+		 * reserve something strictly for us.  If not be a pain and try
+		 * to steal from the delalloc block rsv.
+		 */
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
+		if (!ret)
+			goto out;
+
+		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+		if (!WARN_ON(ret))
+			goto out;
+
+		/*
+		 * Ok this is a problem, let's just steal from the global rsv
+		 * since this really shouldn't happen that often.
+		 */
+		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+					      dst_rsv, num_bytes);
+		goto out;
+	}
+
+migrate:
+	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+
+out:
+	/*
+	 * Migrate only takes a reservation, it doesn't touch the size of the
+	 * block_rsv.  This is to simplify people who don't normally have things
+	 * migrated from their block rsv.  If they go to release their
+	 * reservation, that will decrease the size as well, so if migrate
+	 * reduced size we'd end up with a negative size.  But for the
+	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+	 * but we could in fact do this reserve/migrate dance several times
+	 * between the time we did the original reservation and we'd clean it
+	 * up.  So to take care of this, release the space for the meta
+	 * reservation here.  I think it may be time for a documentation page on
+	 * how block rsvs. work.
+	 */
+	if (!ret) {
+		trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+					      btrfs_ino(inode), num_bytes, 1);
+		node->bytes_reserved = num_bytes;
+	}
+
+	if (release) {
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), num_bytes, 0);
+		btrfs_block_rsv_release(root, src_rsv, num_bytes);
+	}
+
+	return ret;
+}
+
+static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
+						struct btrfs_delayed_node *node)
+{
+	struct btrfs_block_rsv *rsv;
+
+	if (!node->bytes_reserved)
+		return;
+
+	rsv = &root->fs_info->delayed_block_rsv;
+	trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+				      node->inode_id, node->bytes_reserved, 0);
+	btrfs_block_rsv_release(root, rsv,
+				node->bytes_reserved);
+	node->bytes_reserved = 0;
+}
+
+/*
+ * This helper will insert some continuous items into the same leaf according
+ * to the free space of the leaf.
+ */
+static int btrfs_batch_insert_items(struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct btrfs_delayed_item *item)
+{
+	struct btrfs_delayed_item *curr, *next;
+	int free_space;
+	int total_data_size = 0, total_size = 0;
+	struct extent_buffer *leaf;
+	char *data_ptr;
+	struct btrfs_key *keys;
+	u32 *data_size;
+	struct list_head head;
+	int slot;
+	int nitems;
+	int i;
+	int ret = 0;
+
+	BUG_ON(!path->nodes[0]);
+
+	leaf = path->nodes[0];
+	free_space = btrfs_leaf_free_space(root, leaf);
+	INIT_LIST_HEAD(&head);
+
+	next = item;
+	nitems = 0;
+
+	/*
+	 * count the number of the continuous items that we can insert in batch
+	 */
+	while (total_size + next->data_len + sizeof(struct btrfs_item) <=
+	       free_space) {
+		total_data_size += next->data_len;
+		total_size += next->data_len + sizeof(struct btrfs_item);
+		list_add_tail(&next->tree_list, &head);
+		nitems++;
+
+		curr = next;
+		next = __btrfs_next_delayed_item(curr);
+		if (!next)
+			break;
+
+		if (!btrfs_is_continuous_delayed_item(curr, next))
+			break;
+	}
+
+	if (!nitems) {
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * we need allocate some memory space, but it might cause the task
+	 * to sleep, so we set all locked nodes in the path to blocking locks
+	 * first.
+	 */
+	btrfs_set_path_blocking(path);
+
+	keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
+	if (!keys) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
+	if (!data_size) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/* get keys of all the delayed items */
+	i = 0;
+	list_for_each_entry(next, &head, tree_list) {
+		keys[i] = next->key;
+		data_size[i] = next->data_len;
+		i++;
+	}
+
+	/* reset all the locked nodes in the patch to spinning locks. */
+	btrfs_clear_path_blocking(path, NULL, 0);
+
+	/* insert the keys of the items */
+	setup_items_for_insert(root, path, keys, data_size,
+			       total_data_size, total_size, nitems);
+
+	/* insert the dir index items */
+	slot = path->slots[0];
+	list_for_each_entry_safe(curr, next, &head, tree_list) {
+		data_ptr = btrfs_item_ptr(leaf, slot, char);
+		write_extent_buffer(leaf, &curr->data,
+				    (unsigned long)data_ptr,
+				    curr->data_len);
+		slot++;
+
+		btrfs_delayed_item_release_metadata(root, curr);
+
+		list_del(&curr->tree_list);
+		btrfs_release_delayed_item(curr);
+	}
+
+error:
+	kfree(data_size);
+	kfree(keys);
+out:
+	return ret;
+}
+
+/*
+ * This helper can just do simple insertion that needn't extend item for new
+ * data, such as directory name index insertion, inode insertion.
+ */
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct btrfs_path *path,
+				     struct btrfs_delayed_item *delayed_item)
+{
+	struct extent_buffer *leaf;
+	char *ptr;
+	int ret;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
+				      delayed_item->data_len);
+	if (ret < 0 && ret != -EEXIST)
+		return ret;
+
+	leaf = path->nodes[0];
+
+	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+
+	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
+			    delayed_item->data_len);
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_delayed_item_release_metadata(root, delayed_item);
+	return 0;
+}
+
+/*
+ * we insert an item first, then if there are some continuous items, we try
+ * to insert those items into the same leaf.
+ */
+static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
+				      struct btrfs_path *path,
+				      struct btrfs_root *root,
+				      struct btrfs_delayed_node *node)
+{
+	struct btrfs_delayed_item *curr, *prev;
+	int ret = 0;
+
+do_again:
+	mutex_lock(&node->mutex);
+	curr = __btrfs_first_delayed_insertion_item(node);
+	if (!curr)
+		goto insert_end;
+
+	ret = btrfs_insert_delayed_item(trans, root, path, curr);
+	if (ret < 0) {
+		btrfs_release_path(path);
+		goto insert_end;
+	}
+
+	prev = curr;
+	curr = __btrfs_next_delayed_item(prev);
+	if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
+		/* insert the continuous items into the same leaf */
+		path->slots[0]++;
+		btrfs_batch_insert_items(root, path, curr);
+	}
+	btrfs_release_delayed_item(prev);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+	btrfs_release_path(path);
+	mutex_unlock(&node->mutex);
+	goto do_again;
+
+insert_end:
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct btrfs_delayed_item *item)
+{
+	struct btrfs_delayed_item *curr, *next;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct list_head head;
+	int nitems, i, last_item;
+	int ret = 0;
+
+	BUG_ON(!path->nodes[0]);
+
+	leaf = path->nodes[0];
+
+	i = path->slots[0];
+	last_item = btrfs_header_nritems(leaf) - 1;
+	if (i > last_item)
+		return -ENOENT;	/* FIXME: Is errno suitable? */
+
+	next = item;
+	INIT_LIST_HEAD(&head);
+	btrfs_item_key_to_cpu(leaf, &key, i);
+	nitems = 0;
+	/*
+	 * count the number of the dir index items that we can delete in batch
+	 */
+	while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
+		list_add_tail(&next->tree_list, &head);
+		nitems++;
+
+		curr = next;
+		next = __btrfs_next_delayed_item(curr);
+		if (!next)
+			break;
+
+		if (!btrfs_is_continuous_delayed_item(curr, next))
+			break;
+
+		i++;
+		if (i > last_item)
+			break;
+		btrfs_item_key_to_cpu(leaf, &key, i);
+	}
+
+	if (!nitems)
+		return 0;
+
+	ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
+	if (ret)
+		goto out;
+
+	list_for_each_entry_safe(curr, next, &head, tree_list) {
+		btrfs_delayed_item_release_metadata(root, curr);
+		list_del(&curr->tree_list);
+		btrfs_release_delayed_item(curr);
+	}
+
+out:
+	return ret;
+}
+
+static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
+				      struct btrfs_path *path,
+				      struct btrfs_root *root,
+				      struct btrfs_delayed_node *node)
+{
+	struct btrfs_delayed_item *curr, *prev;
+	int ret = 0;
+
+do_again:
+	mutex_lock(&node->mutex);
+	curr = __btrfs_first_delayed_deletion_item(node);
+	if (!curr)
+		goto delete_fail;
+
+	ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
+	if (ret < 0)
+		goto delete_fail;
+	else if (ret > 0) {
+		/*
+		 * can't find the item which the node points to, so this node
+		 * is invalid, just drop it.
+		 */
+		prev = curr;
+		curr = __btrfs_next_delayed_item(prev);
+		btrfs_release_delayed_item(prev);
+		ret = 0;
+		btrfs_release_path(path);
+		if (curr) {
+			mutex_unlock(&node->mutex);
+			goto do_again;
+		} else
+			goto delete_fail;
+	}
+
+	btrfs_batch_delete_items(trans, root, path, curr);
+	btrfs_release_path(path);
+	mutex_unlock(&node->mutex);
+	goto do_again;
+
+delete_fail:
+	btrfs_release_path(path);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
+{
+	struct btrfs_delayed_root *delayed_root;
+
+	if (delayed_node &&
+	    test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+		BUG_ON(!delayed_node->root);
+		clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
+		delayed_node->count--;
+
+		delayed_root = delayed_node->root->fs_info->delayed_root;
+		finish_one_item(delayed_root);
+	}
+}
+
+static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
+{
+	struct btrfs_delayed_root *delayed_root;
+
+	ASSERT(delayed_node->root);
+	clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+	delayed_node->count--;
+
+	delayed_root = delayed_node->root->fs_info->delayed_root;
+	finish_one_item(delayed_root);
+}
+
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_delayed_node *node)
+{
+	struct btrfs_key key;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	int mod;
+	int ret;
+
+	key.objectid = node->inode_id;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+		mod = -1;
+	else
+		mod = 1;
+
+	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
+	if (ret > 0) {
+		btrfs_release_path(path);
+		return -ENOENT;
+	} else if (ret < 0) {
+		return ret;
+	}
+
+	leaf = path->nodes[0];
+	inode_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_inode_item);
+	write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
+			    sizeof(struct btrfs_inode_item));
+	btrfs_mark_buffer_dirty(leaf);
+
+	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+		goto no_iref;
+
+	path->slots[0]++;
+	if (path->slots[0] >= btrfs_header_nritems(leaf))
+		goto search;
+again:
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	if (key.objectid != node->inode_id)
+		goto out;
+
+	if (key.type != BTRFS_INODE_REF_KEY &&
+	    key.type != BTRFS_INODE_EXTREF_KEY)
+		goto out;
+
+	/*
+	 * Delayed iref deletion is for the inode who has only one link,
+	 * so there is only one iref. The case that several irefs are
+	 * in the same item doesn't exist.
+	 */
+	btrfs_del_item(trans, root, path);
+out:
+	btrfs_release_delayed_iref(node);
+no_iref:
+	btrfs_release_path(path);
+err_out:
+	btrfs_delayed_inode_release_metadata(root, node);
+	btrfs_release_delayed_inode(node);
+
+	return ret;
+
+search:
+	btrfs_release_path(path);
+
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = -1;
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto err_out;
+	ASSERT(ret);
+
+	ret = 0;
+	leaf = path->nodes[0];
+	path->slots[0]--;
+	goto again;
+}
+
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path,
+					     struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	mutex_lock(&node->mutex);
+	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
+		mutex_unlock(&node->mutex);
+		return 0;
+	}
+
+	ret = __btrfs_update_delayed_inode(trans, root, path, node);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path,
+				   struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+	return ret;
+}
+
+/*
+ * Called when committing the transaction.
+ * Returns 0 on success.
+ * Returns < 0 on error and returns with an aborted transaction with any
+ * outstanding delayed items cleaned up.
+ */
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root, int nr)
+{
+	struct btrfs_delayed_root *delayed_root;
+	struct btrfs_delayed_node *curr_node, *prev_node;
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
+	int ret = 0;
+	bool count = (nr > 0);
+
+	if (trans->aborted)
+		return -EIO;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
+
+	delayed_root = btrfs_get_delayed_root(root);
+
+	curr_node = btrfs_first_delayed_node(delayed_root);
+	while (curr_node && (!count || (count && nr--))) {
+		ret = __btrfs_commit_inode_delayed_items(trans, path,
+							 curr_node);
+		if (ret) {
+			btrfs_release_delayed_node(curr_node);
+			curr_node = NULL;
+			btrfs_abort_transaction(trans, root, ret);
+			break;
+		}
+
+		prev_node = curr_node;
+		curr_node = btrfs_next_delayed_node(curr_node);
+		btrfs_release_delayed_node(prev_node);
+	}
+
+	if (curr_node)
+		btrfs_release_delayed_node(curr_node);
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+
+	return ret;
+}
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	return __btrfs_run_delayed_items(trans, root, -1);
+}
+
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, int nr)
+{
+	return __btrfs_run_delayed_items(trans, root, nr);
+}
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode)
+{
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
+	int ret;
+
+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!delayed_node->count) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		btrfs_release_delayed_node(delayed_node);
+		return -ENOMEM;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+
+	btrfs_release_delayed_node(delayed_node);
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+
+	return ret;
+}
+
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
+	int ret;
+
+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
+	trans = btrfs_join_transaction(delayed_node->root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto trans_out;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	mutex_lock(&delayed_node->mutex);
+	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
+		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+						   path, delayed_node);
+	else
+		ret = 0;
+	mutex_unlock(&delayed_node->mutex);
+
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+trans_out:
+	btrfs_end_transaction(trans, delayed_node->root);
+	btrfs_btree_balance_dirty(delayed_node->root);
+out:
+	btrfs_release_delayed_node(delayed_node);
+
+	return ret;
+}
+
+void btrfs_remove_delayed_node(struct inode *inode)
+{
+	struct btrfs_delayed_node *delayed_node;
+
+	delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
+	if (!delayed_node)
+		return;
+
+	BTRFS_I(inode)->delayed_node = NULL;
+	btrfs_release_delayed_node(delayed_node);
+}
+
+struct btrfs_async_delayed_work {
+	struct btrfs_delayed_root *delayed_root;
+	int nr;
+	struct btrfs_work work;
+};
+
+static void btrfs_async_run_delayed_root(struct btrfs_work *work)
+{
+	struct btrfs_async_delayed_work *async_work;
+	struct btrfs_delayed_root *delayed_root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct btrfs_delayed_node *delayed_node = NULL;
+	struct btrfs_root *root;
+	struct btrfs_block_rsv *block_rsv;
+	int total_done = 0;
+
+	async_work = container_of(work, struct btrfs_async_delayed_work, work);
+	delayed_root = async_work->delayed_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto out;
+
+again:
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
+		goto free_path;
+
+	delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+	if (!delayed_node)
+		goto free_path;
+
+	path->leave_spinning = 1;
+	root = delayed_node->root;
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		goto release_path;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
+
+	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+
+	trans->block_rsv = block_rsv;
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty_nodelay(root);
+
+release_path:
+	btrfs_release_path(path);
+	total_done++;
+
+	btrfs_release_prepared_delayed_node(delayed_node);
+	if (async_work->nr == 0 || total_done < async_work->nr)
+		goto again;
+
+free_path:
+	btrfs_free_path(path);
+out:
+	wake_up(&delayed_root->wait);
+	kfree(async_work);
+}
+
+
+static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
+				     struct btrfs_fs_info *fs_info, int nr)
+{
+	struct btrfs_async_delayed_work *async_work;
+
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+		return 0;
+
+	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
+	if (!async_work)
+		return -ENOMEM;
+
+	async_work->delayed_root = delayed_root;
+	btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
+			btrfs_async_run_delayed_root, NULL, NULL);
+	async_work->nr = nr;
+
+	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
+	return 0;
+}
+
+void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
+{
+	struct btrfs_delayed_root *delayed_root;
+	delayed_root = btrfs_get_delayed_root(root);
+	WARN_ON(btrfs_first_delayed_node(delayed_root));
+}
+
+static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
+{
+	int val = atomic_read(&delayed_root->items_seq);
+
+	if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
+		return 1;
+
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+		return 1;
+
+	return 0;
+}
+
+void btrfs_balance_delayed_items(struct btrfs_root *root)
+{
+	struct btrfs_delayed_root *delayed_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	delayed_root = btrfs_get_delayed_root(root);
+
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+		return;
+
+	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
+		int seq;
+		int ret;
+
+		seq = atomic_read(&delayed_root->items_seq);
+
+		ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
+		if (ret)
+			return;
+
+		wait_event_interruptible(delayed_root->wait,
+					 could_end_wait(delayed_root, seq));
+		return;
+	}
+
+	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
+}
+
+/* Will return 0 or -ENOMEM */
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root, const char *name,
+				   int name_len, struct inode *dir,
+				   struct btrfs_disk_key *disk_key, u8 type,
+				   u64 index)
+{
+	struct btrfs_delayed_node *delayed_node;
+	struct btrfs_delayed_item *delayed_item;
+	struct btrfs_dir_item *dir_item;
+	int ret;
+
+	delayed_node = btrfs_get_or_create_delayed_node(dir);
+	if (IS_ERR(delayed_node))
+		return PTR_ERR(delayed_node);
+
+	delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+	if (!delayed_item) {
+		ret = -ENOMEM;
+		goto release_node;
+	}
+
+	delayed_item->key.objectid = btrfs_ino(dir);
+	delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
+	delayed_item->key.offset = index;
+
+	dir_item = (struct btrfs_dir_item *)delayed_item->data;
+	dir_item->location = *disk_key;
+	btrfs_set_stack_dir_transid(dir_item, trans->transid);
+	btrfs_set_stack_dir_data_len(dir_item, 0);
+	btrfs_set_stack_dir_name_len(dir_item, name_len);
+	btrfs_set_stack_dir_type(dir_item, type);
+	memcpy((char *)(dir_item + 1), name, name_len);
+
+	ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+	/*
+	 * we have reserved enough space when we start a new transaction,
+	 * so reserving metadata failure is impossible
+	 */
+	BUG_ON(ret);
+
+
+	mutex_lock(&delayed_node->mutex);
+	ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+	if (unlikely(ret)) {
+		btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) "
+				"into the insertion tree of the delayed node"
+				"(root id: %llu, inode id: %llu, errno: %d)",
+				name_len, name, delayed_node->root->objectid,
+				delayed_node->inode_id, ret);
+		BUG();
+	}
+	mutex_unlock(&delayed_node->mutex);
+
+release_node:
+	btrfs_release_delayed_node(delayed_node);
+	return ret;
+}
+
+static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
+					       struct btrfs_delayed_node *node,
+					       struct btrfs_key *key)
+{
+	struct btrfs_delayed_item *item;
+
+	mutex_lock(&node->mutex);
+	item = __btrfs_lookup_delayed_insertion_item(node, key);
+	if (!item) {
+		mutex_unlock(&node->mutex);
+		return 1;
+	}
+
+	btrfs_delayed_item_release_metadata(root, item);
+	btrfs_release_delayed_item(item);
+	mutex_unlock(&node->mutex);
+	return 0;
+}
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root, struct inode *dir,
+				   u64 index)
+{
+	struct btrfs_delayed_node *node;
+	struct btrfs_delayed_item *item;
+	struct btrfs_key item_key;
+	int ret;
+
+	node = btrfs_get_or_create_delayed_node(dir);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+
+	item_key.objectid = btrfs_ino(dir);
+	item_key.type = BTRFS_DIR_INDEX_KEY;
+	item_key.offset = index;
+
+	ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
+	if (!ret)
+		goto end;
+
+	item = btrfs_alloc_delayed_item(0);
+	if (!item) {
+		ret = -ENOMEM;
+		goto end;
+	}
+
+	item->key = item_key;
+
+	ret = btrfs_delayed_item_reserve_metadata(trans, root, item);
+	/*
+	 * we have reserved enough space when we start a new transaction,
+	 * so reserving metadata failure is impossible.
+	 */
+	BUG_ON(ret);
+
+	mutex_lock(&node->mutex);
+	ret = __btrfs_add_delayed_deletion_item(node, item);
+	if (unlikely(ret)) {
+		btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) "
+				"into the deletion tree of the delayed node"
+				"(root id: %llu, inode id: %llu, errno: %d)",
+				index, node->root->objectid, node->inode_id,
+				ret);
+		BUG();
+	}
+	mutex_unlock(&node->mutex);
+end:
+	btrfs_release_delayed_node(node);
+	return ret;
+}
+
+int btrfs_inode_delayed_dir_index_count(struct inode *inode)
+{
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+
+	if (!delayed_node)
+		return -ENOENT;
+
+	/*
+	 * Since we have held i_mutex of this directory, it is impossible that
+	 * a new directory index is added into the delayed node and index_cnt
+	 * is updated now. So we needn't lock the delayed node.
+	 */
+	if (!delayed_node->index_cnt) {
+		btrfs_release_delayed_node(delayed_node);
+		return -EINVAL;
+	}
+
+	BTRFS_I(inode)->index_cnt = delayed_node->index_cnt;
+	btrfs_release_delayed_node(delayed_node);
+	return 0;
+}
+
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+			     struct list_head *del_list)
+{
+	struct btrfs_delayed_node *delayed_node;
+	struct btrfs_delayed_item *item;
+
+	delayed_node = btrfs_get_delayed_node(inode);
+	if (!delayed_node)
+		return;
+
+	mutex_lock(&delayed_node->mutex);
+	item = __btrfs_first_delayed_insertion_item(delayed_node);
+	while (item) {
+		atomic_inc(&item->refs);
+		list_add_tail(&item->readdir_list, ins_list);
+		item = __btrfs_next_delayed_item(item);
+	}
+
+	item = __btrfs_first_delayed_deletion_item(delayed_node);
+	while (item) {
+		atomic_inc(&item->refs);
+		list_add_tail(&item->readdir_list, del_list);
+		item = __btrfs_next_delayed_item(item);
+	}
+	mutex_unlock(&delayed_node->mutex);
+	/*
+	 * This delayed node is still cached in the btrfs inode, so refs
+	 * must be > 1 now, and we needn't check it is going to be freed
+	 * or not.
+	 *
+	 * Besides that, this function is used to read dir, we do not
+	 * insert/delete delayed items in this period. So we also needn't
+	 * requeue or dequeue this delayed node.
+	 */
+	atomic_dec(&delayed_node->refs);
+}
+
+void btrfs_put_delayed_items(struct list_head *ins_list,
+			     struct list_head *del_list)
+{
+	struct btrfs_delayed_item *curr, *next;
+
+	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+		list_del(&curr->readdir_list);
+		if (atomic_dec_and_test(&curr->refs))
+			kfree(curr);
+	}
+
+	list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+		list_del(&curr->readdir_list);
+		if (atomic_dec_and_test(&curr->refs))
+			kfree(curr);
+	}
+}
+
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+				  u64 index)
+{
+	struct btrfs_delayed_item *curr, *next;
+	int ret;
+
+	if (list_empty(del_list))
+		return 0;
+
+	list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+		if (curr->key.offset > index)
+			break;
+
+		list_del(&curr->readdir_list);
+		ret = (curr->key.offset == index);
+
+		if (atomic_dec_and_test(&curr->refs))
+			kfree(curr);
+
+		if (ret)
+			return 1;
+		else
+			continue;
+	}
+	return 0;
+}
+
+/*
+ * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
+ *
+ */
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+				    struct list_head *ins_list)
+{
+	struct btrfs_dir_item *di;
+	struct btrfs_delayed_item *curr, *next;
+	struct btrfs_key location;
+	char *name;
+	int name_len;
+	int over = 0;
+	unsigned char d_type;
+
+	if (list_empty(ins_list))
+		return 0;
+
+	/*
+	 * Changing the data of the delayed item is impossible. So
+	 * we needn't lock them. And we have held i_mutex of the
+	 * directory, nobody can delete any directory indexes now.
+	 */
+	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+		list_del(&curr->readdir_list);
+
+		if (curr->key.offset < ctx->pos) {
+			if (atomic_dec_and_test(&curr->refs))
+				kfree(curr);
+			continue;
+		}
+
+		ctx->pos = curr->key.offset;
+
+		di = (struct btrfs_dir_item *)curr->data;
+		name = (char *)(di + 1);
+		name_len = btrfs_stack_dir_name_len(di);
+
+		d_type = btrfs_filetype_table[di->type];
+		btrfs_disk_key_to_cpu(&location, &di->location);
+
+		over = !dir_emit(ctx, name, name_len,
+			       location.objectid, d_type);
+
+		if (atomic_dec_and_test(&curr->refs))
+			kfree(curr);
+
+		if (over)
+			return 1;
+	}
+	return 0;
+}
+
+static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
+				  struct btrfs_inode_item *inode_item,
+				  struct inode *inode)
+{
+	btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
+	btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
+	btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
+	btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
+	btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
+	btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
+	btrfs_set_stack_inode_generation(inode_item,
+					 BTRFS_I(inode)->generation);
+	btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
+	btrfs_set_stack_inode_transid(inode_item, trans->transid);
+	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
+	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+	btrfs_set_stack_inode_block_group(inode_item, 0);
+
+	btrfs_set_stack_timespec_sec(&inode_item->atime,
+				     inode->i_atime.tv_sec);
+	btrfs_set_stack_timespec_nsec(&inode_item->atime,
+				      inode->i_atime.tv_nsec);
+
+	btrfs_set_stack_timespec_sec(&inode_item->mtime,
+				     inode->i_mtime.tv_sec);
+	btrfs_set_stack_timespec_nsec(&inode_item->mtime,
+				      inode->i_mtime.tv_nsec);
+
+	btrfs_set_stack_timespec_sec(&inode_item->ctime,
+				     inode->i_ctime.tv_sec);
+	btrfs_set_stack_timespec_nsec(&inode_item->ctime,
+				      inode->i_ctime.tv_nsec);
+
+	btrfs_set_stack_timespec_sec(&inode_item->otime,
+				     BTRFS_I(inode)->i_otime.tv_sec);
+	btrfs_set_stack_timespec_nsec(&inode_item->otime,
+				     BTRFS_I(inode)->i_otime.tv_nsec);
+}
+
+int btrfs_fill_inode(struct inode *inode, u32 *rdev)
+{
+	struct btrfs_delayed_node *delayed_node;
+	struct btrfs_inode_item *inode_item;
+
+	delayed_node = btrfs_get_delayed_node(inode);
+	if (!delayed_node)
+		return -ENOENT;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return -ENOENT;
+	}
+
+	inode_item = &delayed_node->inode_item;
+
+	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
+	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
+	btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
+	inode->i_mode = btrfs_stack_inode_mode(inode_item);
+	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
+	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
+	BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
+        BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
+
+	inode->i_version = btrfs_stack_inode_sequence(inode_item);
+	inode->i_rdev = 0;
+	*rdev = btrfs_stack_inode_rdev(inode_item);
+	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+
+	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
+	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+
+	inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
+	inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
+
+	inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
+	inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
+
+	BTRFS_I(inode)->i_otime.tv_sec =
+		btrfs_stack_timespec_sec(&inode_item->otime);
+	BTRFS_I(inode)->i_otime.tv_nsec =
+		btrfs_stack_timespec_nsec(&inode_item->otime);
+
+	inode->i_generation = BTRFS_I(inode)->generation;
+	BTRFS_I(inode)->index_cnt = (u64)-1;
+
+	mutex_unlock(&delayed_node->mutex);
+	btrfs_release_delayed_node(delayed_node);
+	return 0;
+}
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, struct inode *inode)
+{
+	struct btrfs_delayed_node *delayed_node;
+	int ret = 0;
+
+	delayed_node = btrfs_get_or_create_delayed_node(inode);
+	if (IS_ERR(delayed_node))
+		return PTR_ERR(delayed_node);
+
+	mutex_lock(&delayed_node->mutex);
+	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+		fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+		goto release_node;
+	}
+
+	ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+						   delayed_node);
+	if (ret)
+		goto release_node;
+
+	fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+	set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
+	delayed_node->count++;
+	atomic_inc(&root->fs_info->delayed_root->items);
+release_node:
+	mutex_unlock(&delayed_node->mutex);
+	btrfs_release_delayed_node(delayed_node);
+	return ret;
+}
+
+int btrfs_delayed_delete_inode_ref(struct inode *inode)
+{
+	struct btrfs_delayed_node *delayed_node;
+
+	/*
+	 * we don't do delayed inode updates during log recovery because it
+	 * leads to enospc problems.  This means we also can't do
+	 * delayed inode refs
+	 */
+	if (BTRFS_I(inode)->root->fs_info->log_root_recovering)
+		return -EAGAIN;
+
+	delayed_node = btrfs_get_or_create_delayed_node(inode);
+	if (IS_ERR(delayed_node))
+		return PTR_ERR(delayed_node);
+
+	/*
+	 * We don't reserve space for inode ref deletion is because:
+	 * - We ONLY do async inode ref deletion for the inode who has only
+	 *   one link(i_nlink == 1), it means there is only one inode ref.
+	 *   And in most case, the inode ref and the inode item are in the
+	 *   same leaf, and we will deal with them at the same time.
+	 *   Since we are sure we will reserve the space for the inode item,
+	 *   it is unnecessary to reserve space for inode ref deletion.
+	 * - If the inode ref and the inode item are not in the same leaf,
+	 *   We also needn't worry about enospc problem, because we reserve
+	 *   much more space for the inode update than it needs.
+	 * - At the worst, we can steal some space from the global reservation.
+	 *   It is very rare.
+	 */
+	mutex_lock(&delayed_node->mutex);
+	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+		goto release_node;
+
+	set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+	delayed_node->count++;
+	atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items);
+release_node:
+	mutex_unlock(&delayed_node->mutex);
+	btrfs_release_delayed_node(delayed_node);
+	return 0;
+}
+
+static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
+{
+	struct btrfs_root *root = delayed_node->root;
+	struct btrfs_delayed_item *curr_item, *prev_item;
+
+	mutex_lock(&delayed_node->mutex);
+	curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
+	while (curr_item) {
+		btrfs_delayed_item_release_metadata(root, curr_item);
+		prev_item = curr_item;
+		curr_item = __btrfs_next_delayed_item(prev_item);
+		btrfs_release_delayed_item(prev_item);
+	}
+
+	curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
+	while (curr_item) {
+		btrfs_delayed_item_release_metadata(root, curr_item);
+		prev_item = curr_item;
+		curr_item = __btrfs_next_delayed_item(prev_item);
+		btrfs_release_delayed_item(prev_item);
+	}
+
+	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+		btrfs_release_delayed_iref(delayed_node);
+
+	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+		btrfs_delayed_inode_release_metadata(root, delayed_node);
+		btrfs_release_delayed_inode(delayed_node);
+	}
+	mutex_unlock(&delayed_node->mutex);
+}
+
+void btrfs_kill_delayed_inode_items(struct inode *inode)
+{
+	struct btrfs_delayed_node *delayed_node;
+
+	delayed_node = btrfs_get_delayed_node(inode);
+	if (!delayed_node)
+		return;
+
+	__btrfs_kill_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node);
+}
+
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
+{
+	u64 inode_id = 0;
+	struct btrfs_delayed_node *delayed_nodes[8];
+	int i, n;
+
+	while (1) {
+		spin_lock(&root->inode_lock);
+		n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+					   (void **)delayed_nodes, inode_id,
+					   ARRAY_SIZE(delayed_nodes));
+		if (!n) {
+			spin_unlock(&root->inode_lock);
+			break;
+		}
+
+		inode_id = delayed_nodes[n - 1]->inode_id + 1;
+
+		for (i = 0; i < n; i++)
+			atomic_inc(&delayed_nodes[i]->refs);
+		spin_unlock(&root->inode_lock);
+
+		for (i = 0; i < n; i++) {
+			__btrfs_kill_delayed_node(delayed_nodes[i]);
+			btrfs_release_delayed_node(delayed_nodes[i]);
+		}
+	}
+}
+
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
+{
+	struct btrfs_delayed_root *delayed_root;
+	struct btrfs_delayed_node *curr_node, *prev_node;
+
+	delayed_root = btrfs_get_delayed_root(root);
+
+	curr_node = btrfs_first_delayed_node(delayed_root);
+	while (curr_node) {
+		__btrfs_kill_delayed_node(curr_node);
+
+		prev_node = curr_node;
+		curr_node = btrfs_next_delayed_node(curr_node);
+		btrfs_release_delayed_node(prev_node);
+	}
+}
+
diff --git a/kernel/fs/btrfs/delayed-inode.h b/kernel/fs/btrfs/delayed-inode.h
new file mode 100644
index 000000000..f70119f25
--- /dev/null
+++ b/kernel/fs/btrfs/delayed-inode.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2011 Fujitsu.  All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __DELAYED_TREE_OPERATION_H
+#define __DELAYED_TREE_OPERATION_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/atomic.h>
+
+#include "ctree.h"
+
+/* types of the delayed item */
+#define BTRFS_DELAYED_INSERTION_ITEM	1
+#define BTRFS_DELAYED_DELETION_ITEM	2
+
+struct btrfs_delayed_root {
+	spinlock_t lock;
+	struct list_head node_list;
+	/*
+	 * Used for delayed nodes which is waiting to be dealt with by the
+	 * worker. If the delayed node is inserted into the work queue, we
+	 * drop it from this list.
+	 */
+	struct list_head prepare_list;
+	atomic_t items;		/* for delayed items */
+	atomic_t items_seq;	/* for delayed items */
+	int nodes;		/* for delayed nodes */
+	wait_queue_head_t wait;
+};
+
+#define BTRFS_DELAYED_NODE_IN_LIST	0
+#define BTRFS_DELAYED_NODE_INODE_DIRTY	1
+#define BTRFS_DELAYED_NODE_DEL_IREF	2
+
+struct btrfs_delayed_node {
+	u64 inode_id;
+	u64 bytes_reserved;
+	struct btrfs_root *root;
+	/* Used to add the node into the delayed root's node list. */
+	struct list_head n_list;
+	/*
+	 * Used to add the node into the prepare list, the nodes in this list
+	 * is waiting to be dealt with by the async worker.
+	 */
+	struct list_head p_list;
+	struct rb_root ins_root;
+	struct rb_root del_root;
+	struct mutex mutex;
+	struct btrfs_inode_item inode_item;
+	atomic_t refs;
+	u64 index_cnt;
+	unsigned long flags;
+	int count;
+};
+
+struct btrfs_delayed_item {
+	struct rb_node rb_node;
+	struct btrfs_key key;
+	struct list_head tree_list;	/* used for batch insert/delete items */
+	struct list_head readdir_list;	/* used for readdir items */
+	u64 bytes_reserved;
+	struct btrfs_delayed_node *delayed_node;
+	atomic_t refs;
+	int ins_or_del;
+	u32 data_len;
+	char data[0];
+};
+
+static inline void btrfs_init_delayed_root(
+				struct btrfs_delayed_root *delayed_root)
+{
+	atomic_set(&delayed_root->items, 0);
+	atomic_set(&delayed_root->items_seq, 0);
+	delayed_root->nodes = 0;
+	spin_lock_init(&delayed_root->lock);
+	init_waitqueue_head(&delayed_root->wait);
+	INIT_LIST_HEAD(&delayed_root->node_list);
+	INIT_LIST_HEAD(&delayed_root->prepare_list);
+}
+
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root, const char *name,
+				   int name_len, struct inode *dir,
+				   struct btrfs_disk_key *disk_key, u8 type,
+				   u64 index);
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root, struct inode *dir,
+				   u64 index);
+
+int btrfs_inode_delayed_dir_index_count(struct inode *inode);
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, int nr);
+
+void btrfs_balance_delayed_items(struct btrfs_root *root);
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode);
+/* Used for evicting the inode. */
+void btrfs_remove_delayed_node(struct inode *inode);
+void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
+
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, struct inode *inode);
+int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_delayed_delete_inode_ref(struct inode *inode);
+
+/* Used for drop dead root */
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+
+/* Used for clean the transaction */
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
+
+/* Used for readdir() */
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+			     struct list_head *del_list);
+void btrfs_put_delayed_items(struct list_head *ins_list,
+			     struct list_head *del_list);
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+				  u64 index);
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+				    struct list_head *ins_list);
+
+/* for init */
+int __init btrfs_delayed_inode_init(void);
+void btrfs_delayed_inode_exit(void);
+
+/* for debugging */
+void btrfs_assert_delayed_root_empty(struct btrfs_root *root);
+
+#endif
diff --git a/kernel/fs/btrfs/delayed-ref.c b/kernel/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000..8f8ed7d20
--- /dev/null
+++ b/kernel/fs/btrfs/delayed-ref.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
+/*
+ * delayed back reference update tracking.  For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing.   This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ */
+
+/*
+ * compare two delayed tree backrefs with same bytenr and type
+ */
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
+			  struct btrfs_delayed_tree_ref *ref1, int type)
+{
+	if (type == BTRFS_TREE_BLOCK_REF_KEY) {
+		if (ref1->root < ref2->root)
+			return -1;
+		if (ref1->root > ref2->root)
+			return 1;
+	} else {
+		if (ref1->parent < ref2->parent)
+			return -1;
+		if (ref1->parent > ref2->parent)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * compare two delayed data backrefs with same bytenr and type
+ */
+static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
+			  struct btrfs_delayed_data_ref *ref1)
+{
+	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		if (ref1->root < ref2->root)
+			return -1;
+		if (ref1->root > ref2->root)
+			return 1;
+		if (ref1->objectid < ref2->objectid)
+			return -1;
+		if (ref1->objectid > ref2->objectid)
+			return 1;
+		if (ref1->offset < ref2->offset)
+			return -1;
+		if (ref1->offset > ref2->offset)
+			return 1;
+	} else {
+		if (ref1->parent < ref2->parent)
+			return -1;
+		if (ref1->parent > ref2->parent)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent,
+ * type of the delayed backrefs and content of delayed backrefs.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref2,
+		      struct btrfs_delayed_ref_node *ref1,
+		      bool compare_seq)
+{
+	if (ref1->bytenr < ref2->bytenr)
+		return -1;
+	if (ref1->bytenr > ref2->bytenr)
+		return 1;
+	if (ref1->is_head && ref2->is_head)
+		return 0;
+	if (ref2->is_head)
+		return -1;
+	if (ref1->is_head)
+		return 1;
+	if (ref1->type < ref2->type)
+		return -1;
+	if (ref1->type > ref2->type)
+		return 1;
+	if (ref1->no_quota > ref2->no_quota)
+		return 1;
+	if (ref1->no_quota < ref2->no_quota)
+		return -1;
+	/* merging of sequenced refs is not allowed */
+	if (compare_seq) {
+		if (ref1->seq < ref2->seq)
+			return -1;
+		if (ref1->seq > ref2->seq)
+			return 1;
+	}
+	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
+				      btrfs_delayed_node_to_tree_ref(ref1),
+				      ref1->type);
+	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
+				      btrfs_delayed_node_to_data_ref(ref1));
+	}
+	BUG();
+	return 0;
+}
+
+/*
+ * insert a new ref into the rbtree.  This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+						  struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent_node = NULL;
+	struct btrfs_delayed_ref_node *entry;
+	struct btrfs_delayed_ref_node *ins;
+	int cmp;
+
+	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+	while (*p) {
+		parent_node = *p;
+		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+				 rb_node);
+
+		cmp = comp_entry(entry, ins, 1);
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else if (cmp > 0)
+			p = &(*p)->rb_right;
+		else
+			return entry;
+	}
+
+	rb_link_node(node, parent_node, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/* insert a new ref to head ref rbtree */
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+						   struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent_node = NULL;
+	struct btrfs_delayed_ref_head *entry;
+	struct btrfs_delayed_ref_head *ins;
+	u64 bytenr;
+
+	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
+	bytenr = ins->node.bytenr;
+	while (*p) {
+		parent_node = *p;
+		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
+				 href_node);
+
+		if (bytenr < entry->node.bytenr)
+			p = &(*p)->rb_left;
+		else if (bytenr > entry->node.bytenr)
+			p = &(*p)->rb_right;
+		else
+			return entry;
+	}
+
+	rb_link_node(node, parent_node, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/*
+ * find an head entry based on bytenr. This returns the delayed ref
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
+ */
+static struct btrfs_delayed_ref_head *
+find_ref_head(struct rb_root *root, u64 bytenr,
+	      int return_bigger)
+{
+	struct rb_node *n;
+	struct btrfs_delayed_ref_head *entry;
+
+	n = root->rb_node;
+	entry = NULL;
+	while (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+		if (bytenr < entry->node.bytenr)
+			n = n->rb_left;
+		else if (bytenr > entry->node.bytenr)
+			n = n->rb_right;
+		else
+			return entry;
+	}
+	if (entry && return_bigger) {
+		if (bytenr > entry->node.bytenr) {
+			n = rb_next(&entry->href_node);
+			if (!n)
+				n = rb_first(root);
+			entry = rb_entry(n, struct btrfs_delayed_ref_head,
+					 href_node);
+			return entry;
+		}
+		return entry;
+	}
+	return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_head *head)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	assert_spin_locked(&delayed_refs->lock);
+	if (mutex_trylock(&head->mutex))
+		return 0;
+
+	atomic_inc(&head->node.refs);
+	spin_unlock(&delayed_refs->lock);
+
+	mutex_lock(&head->mutex);
+	spin_lock(&delayed_refs->lock);
+	if (!head->node.in_tree) {
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref(&head->node);
+		return -EAGAIN;
+	}
+	btrfs_put_delayed_ref(&head->node);
+	return 0;
+}
+
+static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
+				    struct btrfs_delayed_ref_root *delayed_refs,
+				    struct btrfs_delayed_ref_head *head,
+				    struct btrfs_delayed_ref_node *ref)
+{
+	if (btrfs_delayed_ref_is_head(ref)) {
+		head = btrfs_delayed_node_to_head(ref);
+		rb_erase(&head->href_node, &delayed_refs->href_root);
+	} else {
+		assert_spin_locked(&head->lock);
+		rb_erase(&ref->rb_node, &head->ref_root);
+	}
+	ref->in_tree = 0;
+	btrfs_put_delayed_ref(ref);
+	atomic_dec(&delayed_refs->num_entries);
+	if (trans->delayed_ref_updates)
+		trans->delayed_ref_updates--;
+}
+
+static int merge_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_delayed_ref_root *delayed_refs,
+		     struct btrfs_delayed_ref_head *head,
+		     struct btrfs_delayed_ref_node *ref, u64 seq)
+{
+	struct rb_node *node;
+	int mod = 0;
+	int done = 0;
+
+	node = rb_next(&ref->rb_node);
+	while (!done && node) {
+		struct btrfs_delayed_ref_node *next;
+
+		next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
+		if (seq && next->seq >= seq)
+			break;
+		if (comp_entry(ref, next, 0))
+			continue;
+
+		if (ref->action == next->action) {
+			mod = next->ref_mod;
+		} else {
+			if (ref->ref_mod < next->ref_mod) {
+				struct btrfs_delayed_ref_node *tmp;
+
+				tmp = ref;
+				ref = next;
+				next = tmp;
+				done = 1;
+			}
+			mod = -next->ref_mod;
+		}
+
+		drop_delayed_ref(trans, delayed_refs, head, next);
+		ref->ref_mod += mod;
+		if (ref->ref_mod == 0) {
+			drop_delayed_ref(trans, delayed_refs, head, ref);
+			done = 1;
+		} else {
+			/*
+			 * You can't have multiples of the same ref on a tree
+			 * block.
+			 */
+			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+		}
+	}
+	return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_delayed_ref_root *delayed_refs,
+			      struct btrfs_delayed_ref_head *head)
+{
+	struct rb_node *node;
+	u64 seq = 0;
+
+	assert_spin_locked(&head->lock);
+	/*
+	 * We don't have too much refs to merge in the case of delayed data
+	 * refs.
+	 */
+	if (head->is_data)
+		return;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		struct seq_list *elem;
+
+		elem = list_first_entry(&fs_info->tree_mod_seq_list,
+					struct seq_list, list);
+		seq = elem->seq;
+	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	node = rb_first(&head->ref_root);
+	while (node) {
+		struct btrfs_delayed_ref_node *ref;
+
+		ref = rb_entry(node, struct btrfs_delayed_ref_node,
+			       rb_node);
+		/* We can't merge refs that are outside of our seq count */
+		if (seq && ref->seq >= seq)
+			break;
+		if (merge_ref(trans, delayed_refs, head, ref, seq))
+			node = rb_first(&head->ref_root);
+		else
+			node = rb_next(&ref->rb_node);
+	}
+}
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
+			    u64 seq)
+{
+	struct seq_list *elem;
+	int ret = 0;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		elem = list_first_entry(&fs_info->tree_mod_seq_list,
+					struct seq_list, list);
+		if (seq >= elem->seq) {
+			pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
+				 (u32)(seq >> 32), (u32)seq,
+				 (u32)(elem->seq >> 32), (u32)elem->seq,
+				 delayed_refs);
+			ret = 1;
+		}
+	}
+
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	return ret;
+}
+
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_head *head;
+	u64 start;
+	bool loop = false;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+again:
+	start = delayed_refs->run_delayed_start;
+	head = find_ref_head(&delayed_refs->href_root, start, 1);
+	if (!head && !loop) {
+		delayed_refs->run_delayed_start = 0;
+		start = 0;
+		loop = true;
+		head = find_ref_head(&delayed_refs->href_root, start, 1);
+		if (!head)
+			return NULL;
+	} else if (!head && loop) {
+		return NULL;
+	}
+
+	while (head->processing) {
+		struct rb_node *node;
+
+		node = rb_next(&head->href_node);
+		if (!node) {
+			if (loop)
+				return NULL;
+			delayed_refs->run_delayed_start = 0;
+			start = 0;
+			loop = true;
+			goto again;
+		}
+		head = rb_entry(node, struct btrfs_delayed_ref_head,
+				href_node);
+	}
+
+	head->processing = 1;
+	WARN_ON(delayed_refs->num_heads_ready == 0);
+	delayed_refs->num_heads_ready--;
+	delayed_refs->run_delayed_start = head->node.bytenr +
+		head->node.num_bytes;
+	return head;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree.  existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+		    struct btrfs_delayed_ref_root *delayed_refs,
+		    struct btrfs_delayed_ref_head *head,
+		    struct btrfs_delayed_ref_node *existing,
+		    struct btrfs_delayed_ref_node *update)
+{
+	if (update->action != existing->action) {
+		/*
+		 * this is effectively undoing either an add or a
+		 * drop.  We decrement the ref_mod, and if it goes
+		 * down to zero we just delete the entry without
+		 * every changing the extent allocation tree.
+		 */
+		existing->ref_mod--;
+		if (existing->ref_mod == 0)
+			drop_delayed_ref(trans, delayed_refs, head, existing);
+		else
+			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+	} else {
+		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+		/*
+		 * the action on the existing ref matches
+		 * the action on the ref we're trying to add.
+		 * Bump the ref_mod by one so the backref that
+		 * is eventually added/removed has the correct
+		 * reference count
+		 */
+		existing->ref_mod += update->ref_mod;
+	}
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+			 struct btrfs_delayed_ref_node *existing,
+			 struct btrfs_delayed_ref_node *update)
+{
+	struct btrfs_delayed_ref_head *existing_ref;
+	struct btrfs_delayed_ref_head *ref;
+	int old_ref_mod;
+
+	existing_ref = btrfs_delayed_node_to_head(existing);
+	ref = btrfs_delayed_node_to_head(update);
+	BUG_ON(existing_ref->is_data != ref->is_data);
+
+	spin_lock(&existing_ref->lock);
+	if (ref->must_insert_reserved) {
+		/* if the extent was freed and then
+		 * reallocated before the delayed ref
+		 * entries were processed, we can end up
+		 * with an existing head ref without
+		 * the must_insert_reserved flag set.
+		 * Set it again here
+		 */
+		existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+		/*
+		 * update the num_bytes so we make sure the accounting
+		 * is done correctly
+		 */
+		existing->num_bytes = update->num_bytes;
+
+	}
+
+	if (ref->extent_op) {
+		if (!existing_ref->extent_op) {
+			existing_ref->extent_op = ref->extent_op;
+		} else {
+			if (ref->extent_op->update_key) {
+				memcpy(&existing_ref->extent_op->key,
+				       &ref->extent_op->key,
+				       sizeof(ref->extent_op->key));
+				existing_ref->extent_op->update_key = 1;
+			}
+			if (ref->extent_op->update_flags) {
+				existing_ref->extent_op->flags_to_set |=
+					ref->extent_op->flags_to_set;
+				existing_ref->extent_op->update_flags = 1;
+			}
+			btrfs_free_delayed_extent_op(ref->extent_op);
+		}
+	}
+	/*
+	 * update the reference mod on the head to reflect this new operation,
+	 * only need the lock for this case cause we could be processing it
+	 * currently, for refs we just added we know we're a-ok.
+	 */
+	old_ref_mod = existing_ref->total_ref_mod;
+	existing->ref_mod += update->ref_mod;
+	existing_ref->total_ref_mod += update->ref_mod;
+
+	/*
+	 * If we are going to from a positive ref mod to a negative or vice
+	 * versa we need to make sure to adjust pending_csums accordingly.
+	 */
+	if (existing_ref->is_data) {
+		if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0)
+			delayed_refs->pending_csums -= existing->num_bytes;
+		if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0)
+			delayed_refs->pending_csums += existing->num_bytes;
+	}
+	spin_unlock(&existing_ref->lock);
+}
+
+/*
+ * helper function to actually insert a head node into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count.
+ */
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+		     struct btrfs_trans_handle *trans,
+		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
+		     u64 num_bytes, int action, int is_data)
+{
+	struct btrfs_delayed_ref_head *existing;
+	struct btrfs_delayed_ref_head *head_ref = NULL;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int count_mod = 1;
+	int must_insert_reserved = 0;
+
+	/*
+	 * the head node stores the sum of all the mods, so dropping a ref
+	 * should drop the sum in the head node by one.
+	 */
+	if (action == BTRFS_UPDATE_DELAYED_HEAD)
+		count_mod = 0;
+	else if (action == BTRFS_DROP_DELAYED_REF)
+		count_mod = -1;
+
+	/*
+	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+	 * the reserved accounting when the extent is finally added, or
+	 * if a later modification deletes the delayed ref without ever
+	 * inserting the extent into the extent allocation tree.
+	 * ref->must_insert_reserved is the flag used to record
+	 * that accounting mods are required.
+	 *
+	 * Once we record must_insert_reserved, switch the action to
+	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+	 */
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
+		must_insert_reserved = 1;
+	else
+		must_insert_reserved = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
+	ref->num_bytes = num_bytes;
+	ref->ref_mod = count_mod;
+	ref->type  = 0;
+	ref->action  = 0;
+	ref->is_head = 1;
+	ref->in_tree = 1;
+	ref->seq = 0;
+
+	head_ref = btrfs_delayed_node_to_head(ref);
+	head_ref->must_insert_reserved = must_insert_reserved;
+	head_ref->is_data = is_data;
+	head_ref->ref_root = RB_ROOT;
+	head_ref->processing = 0;
+	head_ref->total_ref_mod = count_mod;
+
+	spin_lock_init(&head_ref->lock);
+	mutex_init(&head_ref->mutex);
+
+	trace_add_delayed_ref_head(ref, head_ref, action);
+
+	existing = htree_insert(&delayed_refs->href_root,
+				&head_ref->href_node);
+	if (existing) {
+		update_existing_head_ref(delayed_refs, &existing->node, ref);
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+		head_ref = existing;
+	} else {
+		if (is_data && count_mod < 0)
+			delayed_refs->pending_csums += num_bytes;
+		delayed_refs->num_heads++;
+		delayed_refs->num_heads_ready++;
+		atomic_inc(&delayed_refs->num_entries);
+		trans->delayed_ref_updates++;
+	}
+	return head_ref;
+}
+
+/*
+ * helper to insert a delayed tree ref into the rbtree.
+ */
+static noinline void
+add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+		     struct btrfs_trans_handle *trans,
+		     struct btrfs_delayed_ref_head *head_ref,
+		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
+		     u64 num_bytes, u64 parent, u64 ref_root, int level,
+		     int action, int no_quota)
+{
+	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_tree_ref *full_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	u64 seq = 0;
+
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
+		action = BTRFS_ADD_DELAYED_REF;
+
+	if (is_fstree(ref_root))
+		seq = atomic64_read(&fs_info->tree_mod_seq);
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
+	ref->num_bytes = num_bytes;
+	ref->ref_mod = 1;
+	ref->action = action;
+	ref->is_head = 0;
+	ref->in_tree = 1;
+	ref->no_quota = no_quota;
+	ref->seq = seq;
+
+	full_ref = btrfs_delayed_node_to_tree_ref(ref);
+	full_ref->parent = parent;
+	full_ref->root = ref_root;
+	if (parent)
+		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
+	else
+		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
+	full_ref->level = level;
+
+	trace_add_delayed_tree_ref(ref, full_ref, action);
+
+	spin_lock(&head_ref->lock);
+	existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
+	if (existing) {
+		update_existing_ref(trans, delayed_refs, head_ref, existing,
+				    ref);
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
+	} else {
+		atomic_inc(&delayed_refs->num_entries);
+		trans->delayed_ref_updates++;
+	}
+	spin_unlock(&head_ref->lock);
+}
+
+/*
+ * helper to insert a delayed data ref into the rbtree.
+ */
+static noinline void
+add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+		     struct btrfs_trans_handle *trans,
+		     struct btrfs_delayed_ref_head *head_ref,
+		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
+		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
+		     u64 offset, int action, int no_quota)
+{
+	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_data_ref *full_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	u64 seq = 0;
+
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
+		action = BTRFS_ADD_DELAYED_REF;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	if (is_fstree(ref_root))
+		seq = atomic64_read(&fs_info->tree_mod_seq);
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
+	ref->num_bytes = num_bytes;
+	ref->ref_mod = 1;
+	ref->action = action;
+	ref->is_head = 0;
+	ref->in_tree = 1;
+	ref->no_quota = no_quota;
+	ref->seq = seq;
+
+	full_ref = btrfs_delayed_node_to_data_ref(ref);
+	full_ref->parent = parent;
+	full_ref->root = ref_root;
+	if (parent)
+		ref->type = BTRFS_SHARED_DATA_REF_KEY;
+	else
+		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+
+	full_ref->objectid = owner;
+	full_ref->offset = offset;
+
+	trace_add_delayed_data_ref(ref, full_ref, action);
+
+	spin_lock(&head_ref->lock);
+	existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
+	if (existing) {
+		update_existing_ref(trans, delayed_refs, head_ref, existing,
+				    ref);
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
+	} else {
+		atomic_inc(&delayed_refs->num_entries);
+		trans->delayed_ref_updates++;
+	}
+	spin_unlock(&head_ref->lock);
+}
+
+/*
+ * add a delayed tree ref.  This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 ref_root,  int level, int action,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int no_quota)
+{
+	struct btrfs_delayed_tree_ref *ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		no_quota = 0;
+
+	BUG_ON(extent_op && extent_op->is_data);
+	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+	if (!head_ref) {
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+		return -ENOMEM;
+	}
+
+	head_ref->extent_op = extent_op;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	/*
+	 * insert both the head node and the new ref without dropping
+	 * the spin lock
+	 */
+	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+					bytenr, num_bytes, action, 0);
+
+	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
+				   num_bytes, parent, ref_root, level, action,
+				   no_quota);
+	spin_unlock(&delayed_refs->lock);
+
+	return 0;
+}
+
+/*
+ * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
+ */
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes,
+			       u64 parent, u64 ref_root,
+			       u64 owner, u64 offset, int action,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int no_quota)
+{
+	struct btrfs_delayed_data_ref *ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		no_quota = 0;
+
+	BUG_ON(extent_op && !extent_op->is_data);
+	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+	if (!head_ref) {
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+		return -ENOMEM;
+	}
+
+	head_ref->extent_op = extent_op;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	/*
+	 * insert both the head node and the new ref without dropping
+	 * the spin lock
+	 */
+	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+					bytenr, num_bytes, action, 1);
+
+	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
+				   num_bytes, parent, ref_root, owner, offset,
+				   action, no_quota);
+	spin_unlock(&delayed_refs->lock);
+
+	return 0;
+}
+
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+				struct btrfs_trans_handle *trans,
+				u64 bytenr, u64 num_bytes,
+				struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+	if (!head_ref)
+		return -ENOMEM;
+
+	head_ref->extent_op = extent_op;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+				   extent_op->is_data);
+
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	return find_ref_head(&delayed_refs->href_root, bytenr, 0);
+}
+
+void btrfs_delayed_ref_exit(void)
+{
+	if (btrfs_delayed_ref_head_cachep)
+		kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+	if (btrfs_delayed_tree_ref_cachep)
+		kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+	if (btrfs_delayed_data_ref_cachep)
+		kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+	if (btrfs_delayed_extent_op_cachep)
+		kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+}
+
+int btrfs_delayed_ref_init(void)
+{
+	btrfs_delayed_ref_head_cachep = kmem_cache_create(
+				"btrfs_delayed_ref_head",
+				sizeof(struct btrfs_delayed_ref_head), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_ref_head_cachep)
+		goto fail;
+
+	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
+				"btrfs_delayed_tree_ref",
+				sizeof(struct btrfs_delayed_tree_ref), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_tree_ref_cachep)
+		goto fail;
+
+	btrfs_delayed_data_ref_cachep = kmem_cache_create(
+				"btrfs_delayed_data_ref",
+				sizeof(struct btrfs_delayed_data_ref), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_data_ref_cachep)
+		goto fail;
+
+	btrfs_delayed_extent_op_cachep = kmem_cache_create(
+				"btrfs_delayed_extent_op",
+				sizeof(struct btrfs_delayed_extent_op), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_extent_op_cachep)
+		goto fail;
+
+	return 0;
+fail:
+	btrfs_delayed_ref_exit();
+	return -ENOMEM;
+}
diff --git a/kernel/fs/btrfs/delayed-ref.h b/kernel/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000..5eb089239
--- /dev/null
+++ b/kernel/fs/btrfs/delayed-ref.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref_node->action */
+#define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+	struct rb_node rb_node;
+
+	/* the starting bytenr of the extent */
+	u64 bytenr;
+
+	/* the size of the extent */
+	u64 num_bytes;
+
+	/* seq number to keep track of insertion order */
+	u64 seq;
+
+	/* ref count on this data structure */
+	atomic_t refs;
+
+	/*
+	 * how many refs is this entry adding or deleting.  For
+	 * head refs, this may be a negative number because it is keeping
+	 * track of the total mods done to the reference count.
+	 * For individual refs, this will always be a positive number
+	 *
+	 * It may be more than one, since it is possible for a single
+	 * parent to have more than one ref on an extent
+	 */
+	int ref_mod;
+
+	unsigned int action:8;
+	unsigned int type:8;
+	unsigned int no_quota:1;
+	/* is this node still in the rbtree? */
+	unsigned int is_head:1;
+	unsigned int in_tree:1;
+};
+
+struct btrfs_delayed_extent_op {
+	struct btrfs_disk_key key;
+	u64 flags_to_set;
+	int level;
+	unsigned int update_key:1;
+	unsigned int update_flags:1;
+	unsigned int is_data:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent.  They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+	struct btrfs_delayed_ref_node node;
+
+	/*
+	 * the mutex is held while running the refs, and it is also
+	 * held when checking the sum of reference modifications.
+	 */
+	struct mutex mutex;
+
+	spinlock_t lock;
+	struct rb_root ref_root;
+
+	struct rb_node href_node;
+
+	struct btrfs_delayed_extent_op *extent_op;
+
+	/*
+	 * This is used to track the final ref_mod from all the refs associated
+	 * with this head ref, this is not adjusted as delayed refs are run,
+	 * this is meant to track if we need to do the csum accounting or not.
+	 */
+	int total_ref_mod;
+
+	/*
+	 * when a new extent is allocated, it is just reserved in memory
+	 * The actual extent isn't inserted into the extent allocation tree
+	 * until the delayed ref is processed.  must_insert_reserved is
+	 * used to flag a delayed ref so the accounting can be updated
+	 * when a full insert is done.
+	 *
+	 * It is possible the extent will be freed before it is ever
+	 * inserted into the extent allocation tree.  In this case
+	 * we need to update the in ram accounting to properly reflect
+	 * the free has happened.
+	 */
+	unsigned int must_insert_reserved:1;
+	unsigned int is_data:1;
+	unsigned int processing:1;
+};
+
+struct btrfs_delayed_tree_ref {
+	struct btrfs_delayed_ref_node node;
+	u64 root;
+	u64 parent;
+	int level;
+};
+
+struct btrfs_delayed_data_ref {
+	struct btrfs_delayed_ref_node node;
+	u64 root;
+	u64 parent;
+	u64 objectid;
+	u64 offset;
+};
+
+struct btrfs_delayed_ref_root {
+	/* head ref rbtree */
+	struct rb_root href_root;
+
+	/* this spin lock protects the rbtree and the entries inside */
+	spinlock_t lock;
+
+	/* how many delayed ref updates we've queued, used by the
+	 * throttling code
+	 */
+	atomic_t num_entries;
+
+	/* total number of head nodes in tree */
+	unsigned long num_heads;
+
+	/* total number of head nodes ready for processing */
+	unsigned long num_heads_ready;
+
+	u64 pending_csums;
+
+	/*
+	 * set when the tree is flushing before a transaction commit,
+	 * used by the throttling code to decide if new updates need
+	 * to be run right away
+	 */
+	int flushing;
+
+	u64 run_delayed_start;
+};
+
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+	return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+	if (op)
+		kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+	WARN_ON(atomic_read(&ref->refs) == 0);
+	if (atomic_dec_and_test(&ref->refs)) {
+		WARN_ON(ref->in_tree);
+		switch (ref->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY:
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+		case BTRFS_SHARED_DATA_REF_KEY:
+			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+			break;
+		case 0:
+			kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+			break;
+		default:
+			BUG();
+		}
+	}
+}
+
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 ref_root, int level, int action,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int no_quota);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes,
+			       u64 parent, u64 ref_root,
+			       u64 owner, u64 offset, int action,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int no_quota);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+				struct btrfs_trans_handle *trans,
+				u64 bytenr, u64 num_bytes,
+				struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_delayed_ref_root *delayed_refs,
+			      struct btrfs_delayed_ref_head *head);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+	mutex_unlock(&head->mutex);
+}
+
+
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
+			    u64 seq);
+
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+	return node->is_head;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_tree_ref *
+btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_tree_ref, node);
+}
+
+static inline struct btrfs_delayed_data_ref *
+btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_data_ref, node);
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(!btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_ref_head, node);
+}
+#endif
diff --git a/kernel/fs/btrfs/dev-replace.c b/kernel/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000..0573848c7
--- /dev/null
+++ b/kernel/fs/btrfs/dev-replace.c
@@ -0,0 +1,937 @@
+/*
+ * Copyright (C) STRATO AG 2012.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <asm/div64.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "sysfs.h"
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+				       int scrub_ret);
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+						struct btrfs_fs_info *fs_info,
+						struct btrfs_device *srcdev,
+						struct btrfs_device *tgtdev);
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+					 char *srcdev_name,
+					 struct btrfs_device **device);
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+static int btrfs_dev_replace_kthread(void *data);
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
+
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_key key;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct extent_buffer *eb;
+	int slot;
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	int item_size;
+	struct btrfs_dev_replace_item *ptr;
+	u64 src_devid;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = 0;
+	key.type = BTRFS_DEV_REPLACE_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+	if (ret) {
+no_valid_dev_replace_entry_found:
+		ret = 0;
+		dev_replace->replace_state =
+			BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+		dev_replace->cont_reading_from_srcdev_mode =
+		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+		dev_replace->replace_state = 0;
+		dev_replace->time_started = 0;
+		dev_replace->time_stopped = 0;
+		atomic64_set(&dev_replace->num_write_errors, 0);
+		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+		dev_replace->cursor_left = 0;
+		dev_replace->committed_cursor_left = 0;
+		dev_replace->cursor_left_last_write_of_item = 0;
+		dev_replace->cursor_right = 0;
+		dev_replace->srcdev = NULL;
+		dev_replace->tgtdev = NULL;
+		dev_replace->is_valid = 0;
+		dev_replace->item_needs_writeback = 0;
+		goto out;
+	}
+	slot = path->slots[0];
+	eb = path->nodes[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+		btrfs_warn(fs_info,
+			"dev_replace entry found has unexpected size, ignore entry");
+		goto no_valid_dev_replace_entry_found;
+	}
+
+	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+	dev_replace->cont_reading_from_srcdev_mode =
+		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+	dev_replace->time_stopped =
+		btrfs_dev_replace_time_stopped(eb, ptr);
+	atomic64_set(&dev_replace->num_write_errors,
+		     btrfs_dev_replace_num_write_errors(eb, ptr));
+	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+	dev_replace->committed_cursor_left = dev_replace->cursor_left;
+	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+	dev_replace->is_valid = 1;
+
+	dev_replace->item_needs_writeback = 0;
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		dev_replace->srcdev = NULL;
+		dev_replace->tgtdev = NULL;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
+							NULL, NULL);
+		dev_replace->tgtdev = btrfs_find_device(fs_info,
+							BTRFS_DEV_REPLACE_DEVID,
+							NULL, NULL);
+		/*
+		 * allow 'btrfs dev replace_cancel' if src/tgt device is
+		 * missing
+		 */
+		if (!dev_replace->srcdev &&
+		    !btrfs_test_opt(dev_root, DEGRADED)) {
+			ret = -EIO;
+			btrfs_warn(fs_info,
+			   "cannot mount because device replace operation is ongoing and");
+			btrfs_warn(fs_info,
+			   "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+			   src_devid);
+		}
+		if (!dev_replace->tgtdev &&
+		    !btrfs_test_opt(dev_root, DEGRADED)) {
+			ret = -EIO;
+			btrfs_warn(fs_info,
+			   "cannot mount because device replace operation is ongoing and");
+			btrfs_warn(fs_info,
+			   "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+				BTRFS_DEV_REPLACE_DEVID);
+		}
+		if (dev_replace->tgtdev) {
+			if (dev_replace->srcdev) {
+				dev_replace->tgtdev->total_bytes =
+					dev_replace->srcdev->total_bytes;
+				dev_replace->tgtdev->disk_total_bytes =
+					dev_replace->srcdev->disk_total_bytes;
+				dev_replace->tgtdev->commit_total_bytes =
+					dev_replace->srcdev->commit_total_bytes;
+				dev_replace->tgtdev->bytes_used =
+					dev_replace->srcdev->bytes_used;
+				dev_replace->tgtdev->commit_bytes_used =
+					dev_replace->srcdev->commit_bytes_used;
+			}
+			dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+			btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
+				dev_replace->tgtdev);
+		}
+		break;
+	}
+
+out:
+	if (path)
+		btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * called from commit_transaction. Writes changed device replace state to
+ * disk.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+			  struct btrfs_fs_info *fs_info)
+{
+	int ret;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_dev_replace_item *ptr;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+	btrfs_dev_replace_lock(dev_replace);
+	if (!dev_replace->is_valid ||
+	    !dev_replace->item_needs_writeback) {
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	}
+	btrfs_dev_replace_unlock(dev_replace);
+
+	key.objectid = 0;
+	key.type = BTRFS_DEV_REPLACE_KEY;
+	key.offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+	if (ret < 0) {
+		btrfs_warn(fs_info, "error %d while searching for dev_replace item!",
+			ret);
+		goto out;
+	}
+
+	if (ret == 0 &&
+	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+		/*
+		 * need to delete old one and insert a new one.
+		 * Since no attempt is made to recover any old state, if the
+		 * dev_replace state is 'running', the data on the target
+		 * drive is lost.
+		 * It would be possible to recover the state: just make sure
+		 * that the beginning of the item is never changed and always
+		 * contains all the essential information. Then read this
+		 * minimal set of information and use it as a base for the
+		 * new state.
+		 */
+		ret = btrfs_del_item(trans, dev_root, path);
+		if (ret != 0) {
+			btrfs_warn(fs_info, "delete too small dev_replace item failed %d!",
+				ret);
+			goto out;
+		}
+		ret = 1;
+	}
+
+	if (ret == 1) {
+		/* need to insert a new item */
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, dev_root, path,
+					      &key, sizeof(*ptr));
+		if (ret < 0) {
+			btrfs_warn(fs_info, "insert dev_replace item failed %d!",
+				ret);
+			goto out;
+		}
+	}
+
+	eb = path->nodes[0];
+	ptr = btrfs_item_ptr(eb, path->slots[0],
+			     struct btrfs_dev_replace_item);
+
+	btrfs_dev_replace_lock(dev_replace);
+	if (dev_replace->srcdev)
+		btrfs_set_dev_replace_src_devid(eb, ptr,
+			dev_replace->srcdev->devid);
+	else
+		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+		dev_replace->cont_reading_from_srcdev_mode);
+	btrfs_set_dev_replace_replace_state(eb, ptr,
+		dev_replace->replace_state);
+	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+	btrfs_set_dev_replace_num_write_errors(eb, ptr,
+		atomic64_read(&dev_replace->num_write_errors));
+	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+	dev_replace->cursor_left_last_write_of_item =
+		dev_replace->cursor_left;
+	btrfs_set_dev_replace_cursor_left(eb, ptr,
+		dev_replace->cursor_left_last_write_of_item);
+	btrfs_set_dev_replace_cursor_right(eb, ptr,
+		dev_replace->cursor_right);
+	dev_replace->item_needs_writeback = 0;
+	btrfs_dev_replace_unlock(dev_replace);
+
+	btrfs_mark_buffer_dirty(eb);
+
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+	dev_replace->committed_cursor_left =
+		dev_replace->cursor_left_last_write_of_item;
+}
+
+int btrfs_dev_replace_start(struct btrfs_root *root,
+			    struct btrfs_ioctl_dev_replace_args *args)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	int ret;
+	struct btrfs_device *tgt_device = NULL;
+	struct btrfs_device *src_device = NULL;
+
+	switch (args->start.cont_reading_from_srcdev_mode) {
+	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+	    args->start.tgtdev_name[0] == '\0')
+		return -EINVAL;
+
+	/*
+	 * Here we commit the transaction to make sure commit_total_bytes
+	 * of all the devices are updated.
+	 */
+	trans = btrfs_attach_transaction(root);
+	if (!IS_ERR(trans)) {
+		ret = btrfs_commit_transaction(trans, root);
+		if (ret)
+			return ret;
+	} else if (PTR_ERR(trans) != -ENOENT) {
+		return PTR_ERR(trans);
+	}
+
+	/* the disk copy procedure reuses the scrub code */
+	mutex_lock(&fs_info->volume_mutex);
+	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
+					    args->start.srcdev_name,
+					    &src_device);
+	if (ret) {
+		mutex_unlock(&fs_info->volume_mutex);
+		return ret;
+	}
+
+	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+					    src_device, &tgt_device);
+	mutex_unlock(&fs_info->volume_mutex);
+	if (ret)
+		return ret;
+
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+		goto leave;
+	}
+
+	dev_replace->cont_reading_from_srcdev_mode =
+		args->start.cont_reading_from_srcdev_mode;
+	WARN_ON(!src_device);
+	dev_replace->srcdev = src_device;
+	WARN_ON(!tgt_device);
+	dev_replace->tgtdev = tgt_device;
+
+	printk_in_rcu(KERN_INFO
+		      "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+		      src_device->missing ? "<missing disk>" :
+		        rcu_str_deref(src_device->name),
+		      src_device->devid,
+		      rcu_str_deref(tgt_device->name));
+
+	/*
+	 * from now on, the writes to the srcdev are all duplicated to
+	 * go to the tgtdev as well (refer to btrfs_map_block()).
+	 */
+	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+	dev_replace->time_started = get_seconds();
+	dev_replace->cursor_left = 0;
+	dev_replace->committed_cursor_left = 0;
+	dev_replace->cursor_left_last_write_of_item = 0;
+	dev_replace->cursor_right = 0;
+	dev_replace->is_valid = 1;
+	dev_replace->item_needs_writeback = 1;
+	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+	btrfs_dev_replace_unlock(dev_replace);
+
+	btrfs_wait_ordered_roots(root->fs_info, -1);
+
+	/* force writing the updated state information to disk */
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_dev_replace_lock(dev_replace);
+		goto leave;
+	}
+
+	ret = btrfs_commit_transaction(trans, root);
+	WARN_ON(ret);
+
+	/* the disk copy procedure reuses the scrub code */
+	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+			      btrfs_device_get_total_bytes(src_device),
+			      &dev_replace->scrub_progress, 0, 1);
+
+	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
+	/* don't warn if EINPROGRESS, someone else might be running scrub */
+	if (ret == -EINPROGRESS) {
+		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+		ret = 0;
+	} else {
+		WARN_ON(ret);
+	}
+
+	return ret;
+
+leave:
+	dev_replace->srcdev = NULL;
+	dev_replace->tgtdev = NULL;
+	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+	return ret;
+}
+
+/*
+ * blocked until all flighting bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+	wait_event(fs_info->replace_wait, !percpu_counter_sum(
+		   &fs_info->bio_counter));
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+	if (waitqueue_active(&fs_info->replace_wait))
+		wake_up(&fs_info->replace_wait);
+}
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+				       int scrub_ret)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_device *tgt_device;
+	struct btrfs_device *src_device;
+	struct btrfs_root *root = fs_info->tree_root;
+	u8 uuid_tmp[BTRFS_UUID_SIZE];
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	/* don't allow cancel or unmount to disturb the finishing procedure */
+	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+	btrfs_dev_replace_lock(dev_replace);
+	/* was the operation canceled, or is it finished? */
+	if (dev_replace->replace_state !=
+	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+		btrfs_dev_replace_unlock(dev_replace);
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+		return 0;
+	}
+
+	tgt_device = dev_replace->tgtdev;
+	src_device = dev_replace->srcdev;
+	btrfs_dev_replace_unlock(dev_replace);
+
+	/*
+	 * flush all outstanding I/O and inode extent mappings before the
+	 * copy operation is declared as being finished
+	 */
+	ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+	if (ret) {
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+		return ret;
+	}
+	btrfs_wait_ordered_roots(root->fs_info, -1);
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+		return PTR_ERR(trans);
+	}
+	ret = btrfs_commit_transaction(trans, root);
+	WARN_ON(ret);
+
+	mutex_lock(&uuid_mutex);
+	/* keep away write_all_supers() during the finishing procedure */
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
+	btrfs_dev_replace_lock(dev_replace);
+	dev_replace->replace_state =
+		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+	dev_replace->tgtdev = NULL;
+	dev_replace->srcdev = NULL;
+	dev_replace->time_stopped = get_seconds();
+	dev_replace->item_needs_writeback = 1;
+
+	/* replace old device with new one in mapping tree */
+	if (!scrub_ret) {
+		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+								src_device,
+								tgt_device);
+	} else {
+		printk_in_rcu(KERN_ERR
+			      "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+			      src_device->missing ? "<missing disk>" :
+			        rcu_str_deref(src_device->name),
+			      src_device->devid,
+			      rcu_str_deref(tgt_device->name), scrub_ret);
+		btrfs_dev_replace_unlock(dev_replace);
+		mutex_unlock(&root->fs_info->chunk_mutex);
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		mutex_unlock(&uuid_mutex);
+		if (tgt_device)
+			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+		return scrub_ret;
+	}
+
+	printk_in_rcu(KERN_INFO
+		      "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+		      src_device->missing ? "<missing disk>" :
+		        rcu_str_deref(src_device->name),
+		      src_device->devid,
+		      rcu_str_deref(tgt_device->name));
+	tgt_device->is_tgtdev_for_dev_replace = 0;
+	tgt_device->devid = src_device->devid;
+	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
+	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+	btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
+	btrfs_device_set_disk_total_bytes(tgt_device,
+					  src_device->disk_total_bytes);
+	btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
+	ASSERT(list_empty(&src_device->resized_list));
+	tgt_device->commit_total_bytes = src_device->commit_total_bytes;
+	tgt_device->commit_bytes_used = src_device->bytes_used;
+	if (fs_info->sb->s_bdev == src_device->bdev)
+		fs_info->sb->s_bdev = tgt_device->bdev;
+	if (fs_info->fs_devices->latest_bdev == src_device->bdev)
+		fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+	fs_info->fs_devices->rw_devices++;
+
+	btrfs_dev_replace_unlock(dev_replace);
+
+	btrfs_rm_dev_replace_blocked(fs_info);
+
+	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
+
+	btrfs_rm_dev_replace_unblocked(fs_info);
+
+	/*
+	 * this is again a consistent state where no dev_replace procedure
+	 * is running, the target device is part of the filesystem, the
+	 * source device is not part of the filesystem anymore and its 1st
+	 * superblock is scratched out so that it is no longer marked to
+	 * belong to this filesystem.
+	 */
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&uuid_mutex);
+
+	/* replace the sysfs entry */
+	btrfs_kobj_rm_device(fs_info, src_device);
+	btrfs_kobj_add_device(fs_info, tgt_device);
+	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+
+	/* write back the superblocks */
+	trans = btrfs_start_transaction(root, 0);
+	if (!IS_ERR(trans))
+		btrfs_commit_transaction(trans, root);
+
+	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+	return 0;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+						struct btrfs_fs_info *fs_info,
+						struct btrfs_device *srcdev,
+						struct btrfs_device *tgtdev)
+{
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 start = 0;
+	int i;
+
+	write_lock(&em_tree->lock);
+	do {
+		em = lookup_extent_mapping(em_tree, start, (u64)-1);
+		if (!em)
+			break;
+		map = (struct map_lookup *)em->bdev;
+		for (i = 0; i < map->num_stripes; i++)
+			if (srcdev == map->stripes[i].dev)
+				map->stripes[i].dev = tgtdev;
+		start = em->start + em->len;
+		free_extent_map(em);
+	} while (start);
+	write_unlock(&em_tree->lock);
+}
+
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+					 char *srcdev_name,
+					 struct btrfs_device **device)
+{
+	int ret;
+
+	if (srcdevid) {
+		ret = 0;
+		*device = btrfs_find_device(root->fs_info, srcdevid, NULL,
+					    NULL);
+		if (!*device)
+			ret = -ENOENT;
+	} else {
+		ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
+							   device);
+	}
+	return ret;
+}
+
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+			      struct btrfs_ioctl_dev_replace_args *args)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_device *srcdev;
+
+	btrfs_dev_replace_lock(dev_replace);
+	/* even if !dev_replace_is_valid, the values are good enough for
+	 * the replace_status ioctl */
+	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+	args->status.replace_state = dev_replace->replace_state;
+	args->status.time_started = dev_replace->time_started;
+	args->status.time_stopped = dev_replace->time_stopped;
+	args->status.num_write_errors =
+		atomic64_read(&dev_replace->num_write_errors);
+	args->status.num_uncorrectable_read_errors =
+		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		args->status.progress_1000 = 0;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+		args->status.progress_1000 = 1000;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		srcdev = dev_replace->srcdev;
+		args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
+		break;
+	}
+	btrfs_dev_replace_unlock(dev_replace);
+}
+
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+			     struct btrfs_ioctl_dev_replace_args *args)
+{
+	args->result = __btrfs_dev_replace_cancel(fs_info);
+	return 0;
+}
+
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_device *tgt_device = NULL;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = fs_info->tree_root;
+	u64 result;
+	int ret;
+
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+		btrfs_dev_replace_unlock(dev_replace);
+		goto leave;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+		tgt_device = dev_replace->tgtdev;
+		dev_replace->tgtdev = NULL;
+		dev_replace->srcdev = NULL;
+		break;
+	}
+	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+	dev_replace->time_stopped = get_seconds();
+	dev_replace->item_needs_writeback = 1;
+	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_scrub_cancel(fs_info);
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+		return PTR_ERR(trans);
+	}
+	ret = btrfs_commit_transaction(trans, root);
+	WARN_ON(ret);
+	if (tgt_device)
+		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+
+leave:
+	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+	return result;
+}
+
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+		dev_replace->replace_state =
+			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+		dev_replace->time_stopped = get_seconds();
+		dev_replace->item_needs_writeback = 1;
+		btrfs_info(fs_info, "suspending dev_replace for unmount");
+		break;
+	}
+
+	btrfs_dev_replace_unlock(dev_replace);
+	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+}
+
+/* resume dev_replace procedure that was interrupted by unmount */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *task;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		dev_replace->replace_state =
+			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+		break;
+	}
+	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+		btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
+		btrfs_info(fs_info,
+			"you may cancel the operation after 'mount -o degraded'");
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	}
+	btrfs_dev_replace_unlock(dev_replace);
+
+	WARN_ON(atomic_xchg(
+		&fs_info->mutually_exclusive_operation_running, 1));
+	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+	return PTR_ERR_OR_ZERO(task);
+}
+
+static int btrfs_dev_replace_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_ioctl_dev_replace_args *status_args;
+	u64 progress;
+
+	status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
+	if (status_args) {
+		btrfs_dev_replace_status(fs_info, status_args);
+		progress = status_args->status.progress_1000;
+		kfree(status_args);
+		progress = div_u64(progress, 10);
+		printk_in_rcu(KERN_INFO
+			"BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+			dev_replace->srcdev->missing ? "<missing disk>" :
+			rcu_str_deref(dev_replace->srcdev->name),
+			dev_replace->srcdev->devid,
+			dev_replace->tgtdev ?
+			rcu_str_deref(dev_replace->tgtdev->name) :
+			"<missing target disk>",
+			(unsigned int)progress);
+	}
+	btrfs_dev_replace_continue_on_mount(fs_info);
+	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+
+	return 0;
+}
+
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	int ret;
+
+	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
+			      dev_replace->committed_cursor_left,
+			      btrfs_device_get_total_bytes(dev_replace->srcdev),
+			      &dev_replace->scrub_progress, 0, 1);
+	ret = btrfs_dev_replace_finishing(fs_info, ret);
+	WARN_ON(ret);
+	return 0;
+}
+
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+	if (!dev_replace->is_valid)
+		return 0;
+
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		return 0;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		/*
+		 * return true even if tgtdev is missing (this is
+		 * something that can happen if the dev_replace
+		 * procedure is suspended by an umount and then
+		 * the tgtdev is missing (or "btrfs dev scan") was
+		 * not called and the the filesystem is remounted
+		 * in degraded state. This does not stop the
+		 * dev_replace procedure. It needs to be canceled
+		 * manually if the cancelation is wanted.
+		 */
+		break;
+	}
+	return 1;
+}
+
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+{
+	/* the beginning is just an optimization for the typical case */
+	if (atomic_read(&dev_replace->nesting_level) == 0) {
+acquire_lock:
+		/* this is not a nested case where the same thread
+		 * is trying to acqurire the same lock twice */
+		mutex_lock(&dev_replace->lock);
+		mutex_lock(&dev_replace->lock_management_lock);
+		dev_replace->lock_owner = current->pid;
+		atomic_inc(&dev_replace->nesting_level);
+		mutex_unlock(&dev_replace->lock_management_lock);
+		return;
+	}
+
+	mutex_lock(&dev_replace->lock_management_lock);
+	if (atomic_read(&dev_replace->nesting_level) > 0 &&
+	    dev_replace->lock_owner == current->pid) {
+		WARN_ON(!mutex_is_locked(&dev_replace->lock));
+		atomic_inc(&dev_replace->nesting_level);
+		mutex_unlock(&dev_replace->lock_management_lock);
+		return;
+	}
+
+	mutex_unlock(&dev_replace->lock_management_lock);
+	goto acquire_lock;
+}
+
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+{
+	WARN_ON(!mutex_is_locked(&dev_replace->lock));
+	mutex_lock(&dev_replace->lock_management_lock);
+	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
+	WARN_ON(dev_replace->lock_owner != current->pid);
+	atomic_dec(&dev_replace->nesting_level);
+	if (atomic_read(&dev_replace->nesting_level) == 0) {
+		dev_replace->lock_owner = 0;
+		mutex_unlock(&dev_replace->lock_management_lock);
+		mutex_unlock(&dev_replace->lock);
+	} else {
+		mutex_unlock(&dev_replace->lock_management_lock);
+	}
+}
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+	percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
+{
+	percpu_counter_sub(&fs_info->bio_counter, amount);
+
+	if (waitqueue_active(&fs_info->replace_wait))
+		wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+	while (1) {
+		percpu_counter_inc(&fs_info->bio_counter);
+		if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+				     &fs_info->fs_state)))
+			break;
+
+		btrfs_bio_counter_dec(fs_info);
+		wait_event(fs_info->replace_wait,
+			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+				     &fs_info->fs_state));
+	}
+}
diff --git a/kernel/fs/btrfs/dev-replace.h b/kernel/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000..20035cbbf
--- /dev/null
+++ b/kernel/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) STRATO AG 2012.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+struct btrfs_ioctl_dev_replace_args;
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+			  struct btrfs_fs_info *fs_info);
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_start(struct btrfs_root *root,
+			    struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+			      struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+			     struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+	atomic64_inc(stat_value);
+}
+#endif
diff --git a/kernel/fs/btrfs/dir-item.c b/kernel/fs/btrfs/dir-item.c
new file mode 100644
index 000000000..1752625fb
--- /dev/null
+++ b/kernel/fs/btrfs/dir-item.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+
+/*
+ * insert a name into a directory, doing overflow properly if there is a hash
+ * collision.  data_size indicates how big the item inserted should be.  On
+ * success a struct btrfs_dir_item pointer is returned, otherwise it is
+ * an ERR_PTR.
+ *
+ * The name is not copied into the dir item, you have to do that yourself.
+ */
+static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+						   *trans,
+						   struct btrfs_root *root,
+						   struct btrfs_path *path,
+						   struct btrfs_key *cpu_key,
+						   u32 data_size,
+						   const char *name,
+						   int name_len)
+{
+	int ret;
+	char *ptr;
+	struct btrfs_item *item;
+	struct extent_buffer *leaf;
+
+	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+	if (ret == -EEXIST) {
+		struct btrfs_dir_item *di;
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		if (di)
+			return ERR_PTR(-EEXIST);
+		btrfs_extend_item(root, path, data_size);
+	} else if (ret < 0)
+		return ERR_PTR(ret);
+	WARN_ON(ret > 0);
+	leaf = path->nodes[0];
+	item = btrfs_item_nr(path->slots[0]);
+	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+	BUG_ON(data_size > btrfs_item_size(leaf, item));
+	ptr += btrfs_item_size(leaf, item) - data_size;
+	return (struct btrfs_dir_item *)ptr;
+}
+
+/*
+ * xattrs work a lot like directories, this inserts an xattr item
+ * into the tree
+ */
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 objectid,
+			    const char *name, u16 name_len,
+			    const void *data, u16 data_len)
+{
+	int ret = 0;
+	struct btrfs_dir_item *dir_item;
+	unsigned long name_ptr, data_ptr;
+	struct btrfs_key key, location;
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *leaf;
+	u32 data_size;
+
+	BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
+
+	key.objectid = objectid;
+	key.type = BTRFS_XATTR_ITEM_KEY;
+	key.offset = btrfs_name_hash(name, name_len);
+
+	data_size = sizeof(*dir_item) + name_len + data_len;
+	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+					name, name_len);
+	if (IS_ERR(dir_item))
+		return PTR_ERR(dir_item);
+	memset(&location, 0, sizeof(location));
+
+	leaf = path->nodes[0];
+	btrfs_cpu_key_to_disk(&disk_key, &location);
+	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+	btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+	btrfs_set_dir_data_len(leaf, dir_item, data_len);
+	name_ptr = (unsigned long)(dir_item + 1);
+	data_ptr = (unsigned long)((char *)name_ptr + name_len);
+
+	write_extent_buffer(leaf, name, name_ptr, name_len);
+	write_extent_buffer(leaf, data, data_ptr, data_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+	return ret;
+}
+
+/*
+ * insert a directory item in the tree, doing all the magic for
+ * both indexes. 'dir' indicates which objectid to insert it into,
+ * 'location' is the key to stuff into the directory item, 'type' is the
+ * type of the inode we're pointing to, and 'index' is the sequence number
+ * to use for the second index (if one is created).
+ * Will return 0 or -ENOMEM
+ */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, const char *name, int name_len,
+			  struct inode *dir, struct btrfs_key *location,
+			  u8 type, u64 index)
+{
+	int ret = 0;
+	int ret2 = 0;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *dir_item;
+	struct extent_buffer *leaf;
+	unsigned long name_ptr;
+	struct btrfs_key key;
+	struct btrfs_disk_key disk_key;
+	u32 data_size;
+
+	key.objectid = btrfs_ino(dir);
+	key.type = BTRFS_DIR_ITEM_KEY;
+	key.offset = btrfs_name_hash(name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->leave_spinning = 1;
+
+	btrfs_cpu_key_to_disk(&disk_key, location);
+
+	data_size = sizeof(*dir_item) + name_len;
+	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+					name, name_len);
+	if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
+		if (ret == -EEXIST)
+			goto second_insert;
+		goto out_free;
+	}
+
+	leaf = path->nodes[0];
+	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+	btrfs_set_dir_type(leaf, dir_item, type);
+	btrfs_set_dir_data_len(leaf, dir_item, 0);
+	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+	name_ptr = (unsigned long)(dir_item + 1);
+
+	write_extent_buffer(leaf, name, name_ptr, name_len);
+	btrfs_mark_buffer_dirty(leaf);
+
+second_insert:
+	/* FIXME, use some real flag for selecting the extra index */
+	if (root == root->fs_info->tree_root) {
+		ret = 0;
+		goto out_free;
+	}
+	btrfs_release_path(path);
+
+	ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
+					      &disk_key, type, index);
+out_free:
+	btrfs_free_path(path);
+	if (ret)
+		return ret;
+	if (ret2)
+		return ret2;
+	return 0;
+}
+
+/*
+ * lookup a directory item based on name.  'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ */
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path, u64 dir,
+					     const char *name, int name_len,
+					     int mod)
+{
+	int ret;
+	struct btrfs_key key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_ITEM_KEY;
+
+	key.offset = btrfs_name_hash(name, name_len);
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+
+	return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+				   const char *name, int name_len)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_dir_item *di;
+	int data_size;
+	struct extent_buffer *leaf;
+	int slot;
+	struct btrfs_path *path;
+
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_ITEM_KEY;
+	key.offset = btrfs_name_hash(name, name_len);
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+	/* return back any errors */
+	if (ret < 0)
+		goto out;
+
+	/* nothing found, we're safe */
+	if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+
+	/* we found an item, look for our name in the item */
+	di = btrfs_match_dir_item_name(root, path, name, name_len);
+	if (di) {
+		/* our exact name was found */
+		ret = -EEXIST;
+		goto out;
+	}
+
+	/*
+	 * see if there is room in the item to insert this
+	 * name
+	 */
+	data_size = sizeof(*di) + name_len;
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	if (data_size + btrfs_item_size_nr(leaf, slot) +
+	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+		ret = -EOVERFLOW;
+	} else {
+		/* plenty of insertion room */
+		ret = 0;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * lookup a directory item based on index.  'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ *
+ * The name is used to make sure the index really points to the name you were
+ * looking for.
+ */
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dir,
+			    u64 objectid, const char *name, int name_len,
+			    int mod)
+{
+	int ret;
+	struct btrfs_key key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = objectid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return ERR_PTR(-ENOENT);
+	return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dirid,
+			    const char *name, int name_len)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u32 nritems;
+	int ret;
+
+	key.objectid = dirid;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	while (1) {
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ERR_PTR(ret);
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+			break;
+
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		if (di)
+			return di;
+
+		path->slots[0]++;
+	}
+	return NULL;
+}
+
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, u64 dir,
+					  const char *name, u16 name_len,
+					  int mod)
+{
+	int ret;
+	struct btrfs_key key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	key.objectid = dir;
+	key.type = BTRFS_XATTR_ITEM_KEY;
+	key.offset = btrfs_name_hash(name, name_len);
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+
+	return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+/*
+ * helper function to look at the directory item pointed to by 'path'
+ * this walks through all the entries in a dir item and finds one
+ * for a specific name.
+ */
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name, int name_len)
+{
+	struct btrfs_dir_item *dir_item;
+	unsigned long name_ptr;
+	u32 total_len;
+	u32 cur = 0;
+	u32 this_len;
+	struct extent_buffer *leaf;
+
+	leaf = path->nodes[0];
+	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+	if (verify_dir_item(root, leaf, dir_item))
+		return NULL;
+
+	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+	while (cur < total_len) {
+		this_len = sizeof(*dir_item) +
+			btrfs_dir_name_len(leaf, dir_item) +
+			btrfs_dir_data_len(leaf, dir_item);
+		name_ptr = (unsigned long)(dir_item + 1);
+
+		if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
+		    memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+			return dir_item;
+
+		cur += this_len;
+		dir_item = (struct btrfs_dir_item *)((char *)dir_item +
+						     this_len);
+	}
+	return NULL;
+}
+
+/*
+ * given a pointer into a directory item, delete it.  This
+ * handles items that have more than one entry in them.
+ */
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      struct btrfs_dir_item *di)
+{
+
+	struct extent_buffer *leaf;
+	u32 sub_item_len;
+	u32 item_len;
+	int ret = 0;
+
+	leaf = path->nodes[0];
+	sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
+		btrfs_dir_data_len(leaf, di);
+	item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (sub_item_len == item_len) {
+		ret = btrfs_del_item(trans, root, path);
+	} else {
+		/* MARKER */
+		unsigned long ptr = (unsigned long)di;
+		unsigned long start;
+
+		start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+			item_len - (ptr + sub_item_len - start));
+		btrfs_truncate_item(root, path, item_len - sub_item_len, 1);
+	}
+	return ret;
+}
+
+int verify_dir_item(struct btrfs_root *root,
+		    struct extent_buffer *leaf,
+		    struct btrfs_dir_item *dir_item)
+{
+	u16 namelen = BTRFS_NAME_LEN;
+	u8 type = btrfs_dir_type(leaf, dir_item);
+
+	if (type >= BTRFS_FT_MAX) {
+		btrfs_crit(root->fs_info, "invalid dir item type: %d",
+		       (int)type);
+		return 1;
+	}
+
+	if (type == BTRFS_FT_XATTR)
+		namelen = XATTR_NAME_MAX;
+
+	if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
+		btrfs_crit(root->fs_info, "invalid dir item name len: %u",
+		       (unsigned)btrfs_dir_data_len(leaf, dir_item));
+		return 1;
+	}
+
+	/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
+	if ((btrfs_dir_data_len(leaf, dir_item) +
+	     btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
+		btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u",
+		       (unsigned)btrfs_dir_name_len(leaf, dir_item),
+		       (unsigned)btrfs_dir_data_len(leaf, dir_item));
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/btrfs/disk-io.c b/kernel/fs/btrfs/disk-io.c
new file mode 100644
index 000000000..2ef9a4b72
--- /dev/null
+++ b/kernel/fs/btrfs/disk-io.c
@@ -0,0 +1,4338 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/radix-tree.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/slab.h>
+#include <linux/migrate.h>
+#include <linux/ratelimit.h>
+#include <linux/uuid.h>
+#include <linux/semaphore.h>
+#include <asm/unaligned.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "tree-log.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "raid56.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+
+static const struct extent_io_ops btree_extent_io_ops;
+static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only);
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root);
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
+static void btrfs_error_commit_super(struct btrfs_root *root);
+
+/*
+ * btrfs_end_io_wq structs are used to do processing in task context when an IO
+ * is complete.  This is used during reads to verify checksums, and it is used
+ * by writes to insert metadata for new file extents after IO is complete.
+ */
+struct btrfs_end_io_wq {
+	struct bio *bio;
+	bio_end_io_t *end_io;
+	void *private;
+	struct btrfs_fs_info *info;
+	int error;
+	enum btrfs_wq_endio_type metadata;
+	struct list_head list;
+	struct btrfs_work work;
+};
+
+static struct kmem_cache *btrfs_end_io_wq_cache;
+
+int __init btrfs_end_io_wq_init(void)
+{
+	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
+					sizeof(struct btrfs_end_io_wq),
+					0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!btrfs_end_io_wq_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void btrfs_end_io_wq_exit(void)
+{
+	if (btrfs_end_io_wq_cache)
+		kmem_cache_destroy(btrfs_end_io_wq_cache);
+}
+
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads.  They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ */
+struct async_submit_bio {
+	struct inode *inode;
+	struct bio *bio;
+	struct list_head list;
+	extent_submit_bio_hook_t *submit_bio_start;
+	extent_submit_bio_hook_t *submit_bio_done;
+	int rw;
+	int mirror_num;
+	unsigned long bio_flags;
+	/*
+	 * bio_offset is optional, can be used if the pages in the bio
+	 * can't tell us where in the file the bio should go
+	 */
+	u64 bio_offset;
+	struct btrfs_work work;
+	int error;
+};
+
+/*
+ * Lockdep class keys for extent_buffer->lock's in this root.  For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets.  As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->objectid.  This ensures that all special purpose roots
+ * have separate keysets.
+ *
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock.  As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
+ *
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked.  It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
+ *
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+#  error
+# endif
+
+static struct btrfs_lockdep_keyset {
+	u64			id;		/* root objectid */
+	const char		*name_stem;	/* lock name stem */
+	char			names[BTRFS_MAX_LEVEL + 1][20];
+	struct lock_class_key	keys[BTRFS_MAX_LEVEL + 1];
+} btrfs_lockdep_keysets[] = {
+	{ .id = BTRFS_ROOT_TREE_OBJECTID,	.name_stem = "root"	},
+	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	.name_stem = "extent"	},
+	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	.name_stem = "chunk"	},
+	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
+	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
+	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
+	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	.name_stem = "quota"	},
+	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
+	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
+	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
+	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
+	{ .id = 0,				.name_stem = "tree"	},
+};
+
+void __init btrfs_init_lockdep(void)
+{
+	int i, j;
+
+	/* initialize lockdep class names */
+	for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
+		struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
+
+		for (j = 0; j < ARRAY_SIZE(ks->names); j++)
+			snprintf(ks->names[j], sizeof(ks->names[j]),
+				 "btrfs-%s-%02d", ks->name_stem, j);
+	}
+}
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
+				    int level)
+{
+	struct btrfs_lockdep_keyset *ks;
+
+	BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+	/* find the matching keyset, id 0 is the default entry */
+	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+		if (ks->id == objectid)
+			break;
+
+	lockdep_set_class_and_name(&eb->lock,
+				   &ks->keys[level], ks->names[level]);
+}
+
+#endif
+
+/*
+ * extents on the btree inode are pretty simple, there's one extent
+ * that covers the entire device
+ */
+static struct extent_map *btree_get_extent(struct inode *inode,
+		struct page *page, size_t pg_offset, u64 start, u64 len,
+		int create)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	int ret;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	if (em) {
+		em->bdev =
+			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+		read_unlock(&em_tree->lock);
+		goto out;
+	}
+	read_unlock(&em_tree->lock);
+
+	em = alloc_extent_map();
+	if (!em) {
+		em = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	em->start = 0;
+	em->len = (u64)-1;
+	em->block_len = (u64)-1;
+	em->block_start = 0;
+	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	write_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em, 0);
+	if (ret == -EEXIST) {
+		free_extent_map(em);
+		em = lookup_extent_mapping(em_tree, start, len);
+		if (!em)
+			em = ERR_PTR(-EIO);
+	} else if (ret) {
+		free_extent_map(em);
+		em = ERR_PTR(ret);
+	}
+	write_unlock(&em_tree->lock);
+
+out:
+	return em;
+}
+
+u32 btrfs_csum_data(char *data, u32 seed, size_t len)
+{
+	return btrfs_crc32c(seed, data, len);
+}
+
+void btrfs_csum_final(u32 crc, char *result)
+{
+	put_unaligned_le32(~crc, result);
+}
+
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ */
+static int csum_tree_block(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *buf,
+			   int verify)
+{
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	char *result = NULL;
+	unsigned long len;
+	unsigned long cur_len;
+	unsigned long offset = BTRFS_CSUM_SIZE;
+	char *kaddr;
+	unsigned long map_start;
+	unsigned long map_len;
+	int err;
+	u32 crc = ~(u32)0;
+	unsigned long inline_result;
+
+	len = buf->len - offset;
+	while (len > 0) {
+		err = map_private_extent_buffer(buf, offset, 32,
+					&kaddr, &map_start, &map_len);
+		if (err)
+			return 1;
+		cur_len = min(len, map_len - (offset - map_start));
+		crc = btrfs_csum_data(kaddr + offset - map_start,
+				      crc, cur_len);
+		len -= cur_len;
+		offset += cur_len;
+	}
+	if (csum_size > sizeof(inline_result)) {
+		result = kzalloc(csum_size, GFP_NOFS);
+		if (!result)
+			return 1;
+	} else {
+		result = (char *)&inline_result;
+	}
+
+	btrfs_csum_final(crc, result);
+
+	if (verify) {
+		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
+			u32 val;
+			u32 found = 0;
+			memcpy(&found, result, csum_size);
+
+			read_extent_buffer(buf, &val, 0, csum_size);
+			printk_ratelimited(KERN_WARNING
+				"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
+				"level %d\n",
+				fs_info->sb->s_id, buf->start,
+				val, found, btrfs_header_level(buf));
+			if (result != (char *)&inline_result)
+				kfree(result);
+			return 1;
+		}
+	} else {
+		write_extent_buffer(buf, result, 0, csum_size);
+	}
+	if (result != (char *)&inline_result)
+		kfree(result);
+	return 0;
+}
+
+/*
+ * we can't consider a given block up to date unless the transid of the
+ * block matches the transid in the parent node's pointer.  This is how we
+ * detect blocks that either didn't get written at all or got written
+ * in the wrong place.
+ */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+				 struct extent_buffer *eb, u64 parent_transid,
+				 int atomic)
+{
+	struct extent_state *cached_state = NULL;
+	int ret;
+	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
+
+	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+		return 0;
+
+	if (atomic)
+		return -EAGAIN;
+
+	if (need_lock) {
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+	}
+
+	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+			 0, &cached_state);
+	if (extent_buffer_uptodate(eb) &&
+	    btrfs_header_generation(eb) == parent_transid) {
+		ret = 0;
+		goto out;
+	}
+	printk_ratelimited(KERN_ERR
+	    "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+			eb->fs_info->sb->s_id, eb->start,
+			parent_transid, btrfs_header_generation(eb));
+	ret = 1;
+
+	/*
+	 * Things reading via commit roots that don't have normal protection,
+	 * like send, can have a really old block in cache that may point at a
+	 * block that has been free'd and re-allocated.  So don't clear uptodate
+	 * if we find an eb that is under IO (dirty/writeback) because we could
+	 * end up reading in the stale data and then writing it back out and
+	 * making everybody very sad.
+	 */
+	if (!extent_buffer_under_io(eb))
+		clear_extent_buffer_uptodate(eb);
+out:
+	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+			     &cached_state, GFP_NOFS);
+	if (need_lock)
+		btrfs_tree_read_unlock_blocking(eb);
+	return ret;
+}
+
+/*
+ * Return 0 if the superblock checksum type matches the checksum value of that
+ * algorithm. Pass the raw disk superblock data.
+ */
+static int btrfs_check_super_csum(char *raw_disk_sb)
+{
+	struct btrfs_super_block *disk_sb =
+		(struct btrfs_super_block *)raw_disk_sb;
+	u16 csum_type = btrfs_super_csum_type(disk_sb);
+	int ret = 0;
+
+	if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
+		u32 crc = ~(u32)0;
+		const int csum_size = sizeof(crc);
+		char result[csum_size];
+
+		/*
+		 * The super_block structure does not span the whole
+		 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
+		 * is filled with zeros and is included in the checkum.
+		 */
+		crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
+				crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+		btrfs_csum_final(crc, result);
+
+		if (memcmp(raw_disk_sb, result, csum_size))
+			ret = 1;
+	}
+
+	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
+		printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n",
+				csum_type);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ */
+static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+					  struct extent_buffer *eb,
+					  u64 start, u64 parent_transid)
+{
+	struct extent_io_tree *io_tree;
+	int failed = 0;
+	int ret;
+	int num_copies = 0;
+	int mirror_num = 0;
+	int failed_mirror = 0;
+
+	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+	while (1) {
+		ret = read_extent_buffer_pages(io_tree, eb, start,
+					       WAIT_COMPLETE,
+					       btree_get_extent, mirror_num);
+		if (!ret) {
+			if (!verify_parent_transid(io_tree, eb,
+						   parent_transid, 0))
+				break;
+			else
+				ret = -EIO;
+		}
+
+		/*
+		 * This buffer's crc is fine, but its contents are corrupted, so
+		 * there is no reason to read the other copies, they won't be
+		 * any less wrong.
+		 */
+		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+			break;
+
+		num_copies = btrfs_num_copies(root->fs_info,
+					      eb->start, eb->len);
+		if (num_copies == 1)
+			break;
+
+		if (!failed_mirror) {
+			failed = 1;
+			failed_mirror = eb->read_mirror;
+		}
+
+		mirror_num++;
+		if (mirror_num == failed_mirror)
+			mirror_num++;
+
+		if (mirror_num > num_copies)
+			break;
+	}
+
+	if (failed && !ret && failed_mirror)
+		repair_eb_io_failure(root, eb, failed_mirror);
+
+	return ret;
+}
+
+/*
+ * checksum a dirty tree block before IO.  This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block
+ */
+
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
+{
+	u64 start = page_offset(page);
+	u64 found_start;
+	struct extent_buffer *eb;
+
+	eb = (struct extent_buffer *)page->private;
+	if (page != eb->pages[0])
+		return 0;
+	found_start = btrfs_header_bytenr(eb);
+	if (WARN_ON(found_start != start || !PageUptodate(page)))
+		return 0;
+	csum_tree_block(fs_info, eb, 0);
+	return 0;
+}
+
+static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
+				 struct extent_buffer *eb)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	u8 fsid[BTRFS_UUID_SIZE];
+	int ret = 1;
+
+	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
+	while (fs_devices) {
+		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+			ret = 0;
+			break;
+		}
+		fs_devices = fs_devices->seed;
+	}
+	return ret;
+}
+
+#define CORRUPT(reason, eb, root, slot)				\
+	btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu,"	\
+		   "root=%llu, slot=%d", reason,			\
+	       btrfs_header_bytenr(eb),	root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+			       struct extent_buffer *leaf)
+{
+	struct btrfs_key key;
+	struct btrfs_key leaf_key;
+	u32 nritems = btrfs_header_nritems(leaf);
+	int slot;
+
+	if (nritems == 0)
+		return 0;
+
+	/* Check the 0 item */
+	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+	    BTRFS_LEAF_DATA_SIZE(root)) {
+		CORRUPT("invalid item offset size pair", leaf, root, 0);
+		return -EIO;
+	}
+
+	/*
+	 * Check to make sure each items keys are in the correct order and their
+	 * offsets make sense.  We only have to loop through nritems-1 because
+	 * we check the current slot against the next slot, which verifies the
+	 * next slot's offset+size makes sense and that the current's slot
+	 * offset is correct.
+	 */
+	for (slot = 0; slot < nritems - 1; slot++) {
+		btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+		btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+		/* Make sure the keys are in the right order */
+		if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+			CORRUPT("bad key order", leaf, root, slot);
+			return -EIO;
+		}
+
+		/*
+		 * Make sure the offset and ends are right, remember that the
+		 * item data starts at the end of the leaf and grows towards the
+		 * front.
+		 */
+		if (btrfs_item_offset_nr(leaf, slot) !=
+			btrfs_item_end_nr(leaf, slot + 1)) {
+			CORRUPT("slot offset bad", leaf, root, slot);
+			return -EIO;
+		}
+
+		/*
+		 * Check to make sure that we don't point outside of the leaf,
+		 * just incase all the items are consistent to eachother, but
+		 * all point outside of the leaf.
+		 */
+		if (btrfs_item_end_nr(leaf, slot) >
+		    BTRFS_LEAF_DATA_SIZE(root)) {
+			CORRUPT("slot end outside of leaf", leaf, root, slot);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+				      u64 phy_offset, struct page *page,
+				      u64 start, u64 end, int mirror)
+{
+	u64 found_start;
+	int found_level;
+	struct extent_buffer *eb;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	int ret = 0;
+	int reads_done;
+
+	if (!page->private)
+		goto out;
+
+	eb = (struct extent_buffer *)page->private;
+
+	/* the pending IO might have been the only thing that kept this buffer
+	 * in memory.  Make sure we have a ref for all this other checks
+	 */
+	extent_buffer_get(eb);
+
+	reads_done = atomic_dec_and_test(&eb->io_pages);
+	if (!reads_done)
+		goto err;
+
+	eb->read_mirror = mirror;
+	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+		ret = -EIO;
+		goto err;
+	}
+
+	found_start = btrfs_header_bytenr(eb);
+	if (found_start != eb->start) {
+		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
+			       "%llu %llu\n",
+			       eb->fs_info->sb->s_id, found_start, eb->start);
+		ret = -EIO;
+		goto err;
+	}
+	if (check_tree_block_fsid(root->fs_info, eb)) {
+		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
+			       eb->fs_info->sb->s_id, eb->start);
+		ret = -EIO;
+		goto err;
+	}
+	found_level = btrfs_header_level(eb);
+	if (found_level >= BTRFS_MAX_LEVEL) {
+		btrfs_err(root->fs_info, "bad tree block level %d",
+			   (int)btrfs_header_level(eb));
+		ret = -EIO;
+		goto err;
+	}
+
+	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
+				       eb, found_level);
+
+	ret = csum_tree_block(root->fs_info, eb, 1);
+	if (ret) {
+		ret = -EIO;
+		goto err;
+	}
+
+	/*
+	 * If this is a leaf block and it is corrupt, set the corrupt bit so
+	 * that we don't try and read the other copies of this block, just
+	 * return -EIO.
+	 */
+	if (found_level == 0 && check_leaf(root, eb)) {
+		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+		ret = -EIO;
+	}
+
+	if (!ret)
+		set_extent_buffer_uptodate(eb);
+err:
+	if (reads_done &&
+	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+		btree_readahead_hook(root, eb, eb->start, ret);
+
+	if (ret) {
+		/*
+		 * our io error hook is going to dec the io pages
+		 * again, we have to make sure it has something
+		 * to decrement
+		 */
+		atomic_inc(&eb->io_pages);
+		clear_extent_buffer_uptodate(eb);
+	}
+	free_extent_buffer(eb);
+out:
+	return ret;
+}
+
+static int btree_io_failed_hook(struct page *page, int failed_mirror)
+{
+	struct extent_buffer *eb;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+	eb = (struct extent_buffer *)page->private;
+	set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+	eb->read_mirror = failed_mirror;
+	atomic_dec(&eb->io_pages);
+	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+		btree_readahead_hook(root, eb, eb->start, -EIO);
+	return -EIO;	/* we fixed nothing */
+}
+
+static void end_workqueue_bio(struct bio *bio, int err)
+{
+	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_workqueue *wq;
+	btrfs_work_func_t func;
+
+	fs_info = end_io_wq->info;
+	end_io_wq->error = err;
+
+	if (bio->bi_rw & REQ_WRITE) {
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
+			wq = fs_info->endio_meta_write_workers;
+			func = btrfs_endio_meta_write_helper;
+		} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
+			wq = fs_info->endio_freespace_worker;
+			func = btrfs_freespace_write_helper;
+		} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+			wq = fs_info->endio_raid56_workers;
+			func = btrfs_endio_raid56_helper;
+		} else {
+			wq = fs_info->endio_write_workers;
+			func = btrfs_endio_write_helper;
+		}
+	} else {
+		if (unlikely(end_io_wq->metadata ==
+			     BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+			wq = fs_info->endio_repair_workers;
+			func = btrfs_endio_repair_helper;
+		} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+			wq = fs_info->endio_raid56_workers;
+			func = btrfs_endio_raid56_helper;
+		} else if (end_io_wq->metadata) {
+			wq = fs_info->endio_meta_workers;
+			func = btrfs_endio_meta_helper;
+		} else {
+			wq = fs_info->endio_workers;
+			func = btrfs_endio_helper;
+		}
+	}
+
+	btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+	btrfs_queue_work(wq, &end_io_wq->work);
+}
+
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+			enum btrfs_wq_endio_type metadata)
+{
+	struct btrfs_end_io_wq *end_io_wq;
+
+	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
+	if (!end_io_wq)
+		return -ENOMEM;
+
+	end_io_wq->private = bio->bi_private;
+	end_io_wq->end_io = bio->bi_end_io;
+	end_io_wq->info = info;
+	end_io_wq->error = 0;
+	end_io_wq->bio = bio;
+	end_io_wq->metadata = metadata;
+
+	bio->bi_private = end_io_wq;
+	bio->bi_end_io = end_workqueue_bio;
+	return 0;
+}
+
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
+{
+	unsigned long limit = min_t(unsigned long,
+				    info->thread_pool_size,
+				    info->fs_devices->open_devices);
+	return 256 * limit;
+}
+
+static void run_one_async_start(struct btrfs_work *work)
+{
+	struct async_submit_bio *async;
+	int ret;
+
+	async = container_of(work, struct  async_submit_bio, work);
+	ret = async->submit_bio_start(async->inode, async->rw, async->bio,
+				      async->mirror_num, async->bio_flags,
+				      async->bio_offset);
+	if (ret)
+		async->error = ret;
+}
+
+static void run_one_async_done(struct btrfs_work *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct async_submit_bio *async;
+	int limit;
+
+	async = container_of(work, struct  async_submit_bio, work);
+	fs_info = BTRFS_I(async->inode)->root->fs_info;
+
+	limit = btrfs_async_submit_limit(fs_info);
+	limit = limit * 2 / 3;
+
+	if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
+	    waitqueue_active(&fs_info->async_submit_wait))
+		wake_up(&fs_info->async_submit_wait);
+
+	/* If an error occured we just want to clean up the bio and move on */
+	if (async->error) {
+		bio_endio(async->bio, async->error);
+		return;
+	}
+
+	async->submit_bio_done(async->inode, async->rw, async->bio,
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+	struct async_submit_bio *async;
+
+	async = container_of(work, struct  async_submit_bio, work);
+	kfree(async);
+}
+
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
+			u64 bio_offset,
+			extent_submit_bio_hook_t *submit_bio_start,
+			extent_submit_bio_hook_t *submit_bio_done)
+{
+	struct async_submit_bio *async;
+
+	async = kmalloc(sizeof(*async), GFP_NOFS);
+	if (!async)
+		return -ENOMEM;
+
+	async->inode = inode;
+	async->rw = rw;
+	async->bio = bio;
+	async->mirror_num = mirror_num;
+	async->submit_bio_start = submit_bio_start;
+	async->submit_bio_done = submit_bio_done;
+
+	btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
+			run_one_async_done, run_one_async_free);
+
+	async->bio_flags = bio_flags;
+	async->bio_offset = bio_offset;
+
+	async->error = 0;
+
+	atomic_inc(&fs_info->nr_async_submits);
+
+	if (rw & REQ_SYNC)
+		btrfs_set_work_high_priority(&async->work);
+
+	btrfs_queue_work(fs_info->workers, &async->work);
+
+	while (atomic_read(&fs_info->async_submit_draining) &&
+	      atomic_read(&fs_info->nr_async_submits)) {
+		wait_event(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_submits) == 0));
+	}
+
+	return 0;
+}
+
+static int btree_csum_one_bio(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	struct btrfs_root *root;
+	int i, ret = 0;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
+		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int __btree_submit_bio_start(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags,
+				    u64 bio_offset)
+{
+	/*
+	 * when we're called for a write, we're already in the async
+	 * submission context.  Just jump into btrfs_map_bio
+	 */
+	return btree_csum_one_bio(bio);
+}
+
+static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
+{
+	int ret;
+
+	/*
+	 * when we're called for a write, we're already in the async
+	 * submission context.  Just jump into btrfs_map_bio
+	 */
+	ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+	if (ret)
+		bio_endio(bio, ret);
+	return ret;
+}
+
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+	if (bio_flags & EXTENT_BIO_TREE_LOG)
+		return 0;
+#ifdef CONFIG_X86
+	if (cpu_has_xmm4_2)
+		return 0;
+#endif
+	return 1;
+}
+
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
+{
+	int async = check_async_write(inode, bio_flags);
+	int ret;
+
+	if (!(rw & REQ_WRITE)) {
+		/*
+		 * called for a read, do the setup so that checksum validation
+		 * can happen in the async kernel threads
+		 */
+		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+					  bio, BTRFS_WQ_ENDIO_METADATA);
+		if (ret)
+			goto out_w_error;
+		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				    mirror_num, 0);
+	} else if (!async) {
+		ret = btree_csum_one_bio(bio);
+		if (ret)
+			goto out_w_error;
+		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				    mirror_num, 0);
+	} else {
+		/*
+		 * kthread helpers are used to submit writes so that
+		 * checksumming can happen in parallel across all CPUs
+		 */
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+					  inode, rw, bio, mirror_num, 0,
+					  bio_offset,
+					  __btree_submit_bio_start,
+					  __btree_submit_bio_done);
+	}
+
+	if (ret) {
+out_w_error:
+		bio_endio(bio, ret);
+	}
+	return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+static int btree_migratepage(struct address_space *mapping,
+			struct page *newpage, struct page *page,
+			enum migrate_mode mode)
+{
+	/*
+	 * we can't safely write a btree page from here,
+	 * we haven't done the locking hook
+	 */
+	if (PageDirty(page))
+		return -EAGAIN;
+	/*
+	 * Buffers may be managed in a filesystem specific way.
+	 * We must have no buffers or drop them.
+	 */
+	if (page_has_private(page) &&
+	    !try_to_release_page(page, GFP_KERNEL))
+		return -EAGAIN;
+	return migrate_page(mapping, newpage, page, mode);
+}
+#endif
+
+
+static int btree_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct btrfs_fs_info *fs_info;
+	int ret;
+
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+
+		if (wbc->for_kupdate)
+			return 0;
+
+		fs_info = BTRFS_I(mapping->host)->root->fs_info;
+		/* this is a bit racy, but that's ok */
+		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+					     BTRFS_DIRTY_METADATA_THRESH);
+		if (ret < 0)
+			return 0;
+	}
+	return btree_write_cache_pages(mapping, wbc);
+}
+
+static int btree_readpage(struct file *file, struct page *page)
+{
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	return extent_read_full_page(tree, page, btree_get_extent, 0);
+}
+
+static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	if (PageWriteback(page) || PageDirty(page))
+		return 0;
+
+	return try_release_extent_buffer(page);
+}
+
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
+{
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	extent_invalidatepage(tree, page, offset);
+	btree_releasepage(page, GFP_NOFS);
+	if (PagePrivate(page)) {
+		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
+			   "page private not zero on page %llu",
+			   (unsigned long long)page_offset(page));
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
+	}
+}
+
+static int btree_set_page_dirty(struct page *page)
+{
+#ifdef DEBUG
+	struct extent_buffer *eb;
+
+	BUG_ON(!PagePrivate(page));
+	eb = (struct extent_buffer *)page->private;
+	BUG_ON(!eb);
+	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+	BUG_ON(!atomic_read(&eb->refs));
+	btrfs_assert_tree_locked(eb);
+#endif
+	return __set_page_dirty_nobuffers(page);
+}
+
+static const struct address_space_operations btree_aops = {
+	.readpage	= btree_readpage,
+	.writepages	= btree_writepages,
+	.releasepage	= btree_releasepage,
+	.invalidatepage = btree_invalidatepage,
+#ifdef CONFIG_MIGRATION
+	.migratepage	= btree_migratepage,
+#endif
+	.set_page_dirty = btree_set_page_dirty,
+};
+
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
+{
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+
+	buf = btrfs_find_create_tree_block(root, bytenr);
+	if (!buf)
+		return;
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+				 buf, 0, WAIT_NONE, btree_get_extent, 0);
+	free_extent_buffer(buf);
+}
+
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
+			 int mirror_num, struct extent_buffer **eb)
+{
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+	int ret;
+
+	buf = btrfs_find_create_tree_block(root, bytenr);
+	if (!buf)
+		return 0;
+
+	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+	ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+				       btree_get_extent, mirror_num);
+	if (ret) {
+		free_extent_buffer(buf);
+		return ret;
+	}
+
+	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+		free_extent_buffer(buf);
+		return -EIO;
+	} else if (extent_buffer_uptodate(buf)) {
+		*eb = buf;
+	} else {
+		free_extent_buffer(buf);
+	}
+	return 0;
+}
+
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
+					    u64 bytenr)
+{
+	return find_extent_buffer(fs_info, bytenr);
+}
+
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+						 u64 bytenr)
+{
+	if (btrfs_test_is_dummy_root(root))
+		return alloc_test_extent_buffer(root->fs_info, bytenr);
+	return alloc_extent_buffer(root->fs_info, bytenr);
+}
+
+
+int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
+					buf->start + buf->len - 1);
+}
+
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+	return filemap_fdatawait_range(buf->pages[0]->mapping,
+				       buf->start, buf->start + buf->len - 1);
+}
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+				      u64 parent_transid)
+{
+	struct extent_buffer *buf = NULL;
+	int ret;
+
+	buf = btrfs_find_create_tree_block(root, bytenr);
+	if (!buf)
+		return NULL;
+
+	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+	if (ret) {
+		free_extent_buffer(buf);
+		return NULL;
+	}
+	return buf;
+
+}
+
+void clean_tree_block(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info,
+		      struct extent_buffer *buf)
+{
+	if (btrfs_header_generation(buf) ==
+	    fs_info->running_transaction->transid) {
+		btrfs_assert_tree_locked(buf);
+
+		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
+					     -buf->len,
+					     fs_info->dirty_metadata_batch);
+			/* ugh, clear_extent_buffer_dirty needs to lock the page */
+			btrfs_set_lock_blocking(buf);
+			clear_extent_buffer_dirty(buf);
+		}
+	}
+}
+
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+	struct btrfs_subvolume_writers *writers;
+	int ret;
+
+	writers = kmalloc(sizeof(*writers), GFP_NOFS);
+	if (!writers)
+		return ERR_PTR(-ENOMEM);
+
+	ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
+	if (ret < 0) {
+		kfree(writers);
+		return ERR_PTR(ret);
+	}
+
+	init_waitqueue_head(&writers->wait);
+	return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+	percpu_counter_destroy(&writers->counter);
+	kfree(writers);
+}
+
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+			 struct btrfs_root *root, struct btrfs_fs_info *fs_info,
+			 u64 objectid)
+{
+	root->node = NULL;
+	root->commit_root = NULL;
+	root->sectorsize = sectorsize;
+	root->nodesize = nodesize;
+	root->stripesize = stripesize;
+	root->state = 0;
+	root->orphan_cleanup_state = 0;
+
+	root->objectid = objectid;
+	root->last_trans = 0;
+	root->highest_objectid = 0;
+	root->nr_delalloc_inodes = 0;
+	root->nr_ordered_extents = 0;
+	root->name = NULL;
+	root->inode_tree = RB_ROOT;
+	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+	root->block_rsv = NULL;
+	root->orphan_block_rsv = NULL;
+
+	INIT_LIST_HEAD(&root->dirty_list);
+	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->delalloc_inodes);
+	INIT_LIST_HEAD(&root->delalloc_root);
+	INIT_LIST_HEAD(&root->ordered_extents);
+	INIT_LIST_HEAD(&root->ordered_root);
+	INIT_LIST_HEAD(&root->logged_list[0]);
+	INIT_LIST_HEAD(&root->logged_list[1]);
+	spin_lock_init(&root->orphan_lock);
+	spin_lock_init(&root->inode_lock);
+	spin_lock_init(&root->delalloc_lock);
+	spin_lock_init(&root->ordered_extent_lock);
+	spin_lock_init(&root->accounting_lock);
+	spin_lock_init(&root->log_extents_lock[0]);
+	spin_lock_init(&root->log_extents_lock[1]);
+	mutex_init(&root->objectid_mutex);
+	mutex_init(&root->log_mutex);
+	mutex_init(&root->ordered_extent_mutex);
+	mutex_init(&root->delalloc_mutex);
+	init_waitqueue_head(&root->log_writer_wait);
+	init_waitqueue_head(&root->log_commit_wait[0]);
+	init_waitqueue_head(&root->log_commit_wait[1]);
+	INIT_LIST_HEAD(&root->log_ctxs[0]);
+	INIT_LIST_HEAD(&root->log_ctxs[1]);
+	atomic_set(&root->log_commit[0], 0);
+	atomic_set(&root->log_commit[1], 0);
+	atomic_set(&root->log_writers, 0);
+	atomic_set(&root->log_batch, 0);
+	atomic_set(&root->orphan_inodes, 0);
+	atomic_set(&root->refs, 1);
+	atomic_set(&root->will_be_snapshoted, 0);
+	root->log_transid = 0;
+	root->log_transid_committed = -1;
+	root->last_log_commit = 0;
+	if (fs_info)
+		extent_io_tree_init(&root->dirty_log_pages,
+				     fs_info->btree_inode->i_mapping);
+
+	memset(&root->root_key, 0, sizeof(root->root_key));
+	memset(&root->root_item, 0, sizeof(root->root_item));
+	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+	if (fs_info)
+		root->defrag_trans_start = fs_info->generation;
+	else
+		root->defrag_trans_start = 0;
+	root->root_key.objectid = objectid;
+	root->anon_dev = 0;
+
+	spin_lock_init(&root->root_item_lock);
+}
+
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+	if (root)
+		root->fs_info = fs_info;
+	return root;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/* Should only be used by the testing infrastructure */
+struct btrfs_root *btrfs_alloc_dummy_root(void)
+{
+	struct btrfs_root *root;
+
+	root = btrfs_alloc_root(NULL);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+	__setup_root(4096, 4096, 4096, root, NULL, 1);
+	set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
+	root->alloc_bytenr = 0;
+
+	return root;
+}
+#endif
+
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     u64 objectid)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	int ret = 0;
+	uuid_le uuid;
+
+	root = btrfs_alloc_root(fs_info);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		tree_root->stripesize, root, fs_info, objectid);
+	root->root_key.objectid = objectid;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = 0;
+
+	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		leaf = NULL;
+		goto fail;
+	}
+
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(leaf, objectid);
+	root->node = leaf;
+
+	write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
+			    BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+			    btrfs_header_chunk_tree_uuid(leaf),
+			    BTRFS_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	root->commit_root = btrfs_root_node(root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+
+	root->root_item.flags = 0;
+	root->root_item.byte_limit = 0;
+	btrfs_set_root_bytenr(&root->root_item, leaf->start);
+	btrfs_set_root_generation(&root->root_item, trans->transid);
+	btrfs_set_root_level(&root->root_item, 0);
+	btrfs_set_root_refs(&root->root_item, 1);
+	btrfs_set_root_used(&root->root_item, leaf->len);
+	btrfs_set_root_last_snapshot(&root->root_item, 0);
+	btrfs_set_root_dirid(&root->root_item, 0);
+	uuid_le_gen(&uuid);
+	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
+	root->root_item.drop_level = 0;
+
+	key.objectid = objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+	if (ret)
+		goto fail;
+
+	btrfs_tree_unlock(leaf);
+
+	return root;
+
+fail:
+	if (leaf) {
+		btrfs_tree_unlock(leaf);
+		free_extent_buffer(root->commit_root);
+		free_extent_buffer(leaf);
+	}
+	kfree(root);
+
+	return ERR_PTR(ret);
+}
+
+static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct extent_buffer *leaf;
+
+	root = btrfs_alloc_root(fs_info);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info,
+		     BTRFS_TREE_LOG_OBJECTID);
+
+	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+
+	/*
+	 * DON'T set REF_COWS for log trees
+	 *
+	 * log trees do not get reference counted because they go away
+	 * before a real commit is actually done.  They do store pointers
+	 * to file data extents, and those reference counts still get
+	 * updated (along with back refs to the log tree).
+	 */
+
+	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+			NULL, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		kfree(root);
+		return ERR_CAST(leaf);
+	}
+
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
+	root->node = leaf;
+
+	write_extent_buffer(root->node, root->fs_info->fsid,
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
+	btrfs_mark_buffer_dirty(root->node);
+	btrfs_tree_unlock(root->node);
+	return root;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *log_root;
+
+	log_root = alloc_log_tree(trans, fs_info);
+	if (IS_ERR(log_root))
+		return PTR_ERR(log_root);
+	WARN_ON(fs_info->log_root_tree);
+	fs_info->log_root_tree = log_root;
+	return 0;
+}
+
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root)
+{
+	struct btrfs_root *log_root;
+	struct btrfs_inode_item *inode_item;
+
+	log_root = alloc_log_tree(trans, root->fs_info);
+	if (IS_ERR(log_root))
+		return PTR_ERR(log_root);
+
+	log_root->last_trans = trans->transid;
+	log_root->root_key.offset = root->root_key.objectid;
+
+	inode_item = &log_root->root_item.inode;
+	btrfs_set_stack_inode_generation(inode_item, 1);
+	btrfs_set_stack_inode_size(inode_item, 3);
+	btrfs_set_stack_inode_nlink(inode_item, 1);
+	btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
+	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
+
+	btrfs_set_root_node(&log_root->root_item, log_root->node);
+
+	WARN_ON(root->log_root);
+	root->log_root = log_root;
+	root->log_transid = 0;
+	root->log_transid_committed = -1;
+	root->last_log_commit = 0;
+	return 0;
+}
+
+static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+					       struct btrfs_key *key)
+{
+	struct btrfs_root *root;
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_path *path;
+	u64 generation;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+
+	root = btrfs_alloc_root(fs_info);
+	if (!root) {
+		ret = -ENOMEM;
+		goto alloc_fail;
+	}
+
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		tree_root->stripesize, root, fs_info, key->objectid);
+
+	ret = btrfs_find_root(tree_root, key, path,
+			      &root->root_item, &root->root_key);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		goto find_fail;
+	}
+
+	generation = btrfs_root_generation(&root->root_item);
+	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+				     generation);
+	if (!root->node) {
+		ret = -ENOMEM;
+		goto find_fail;
+	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+		ret = -EIO;
+		goto read_fail;
+	}
+	root->commit_root = btrfs_root_node(root);
+out:
+	btrfs_free_path(path);
+	return root;
+
+read_fail:
+	free_extent_buffer(root->node);
+find_fail:
+	kfree(root);
+alloc_fail:
+	root = ERR_PTR(ret);
+	goto out;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+				      struct btrfs_key *location)
+{
+	struct btrfs_root *root;
+
+	root = btrfs_read_tree_root(tree_root, location);
+	if (IS_ERR(root))
+		return root;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
+		btrfs_check_and_init_root_item(&root->root_item);
+	}
+
+	return root;
+}
+
+int btrfs_init_fs_root(struct btrfs_root *root)
+{
+	int ret;
+	struct btrfs_subvolume_writers *writers;
+
+	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+					GFP_NOFS);
+	if (!root->free_ino_pinned || !root->free_ino_ctl) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	writers = btrfs_alloc_subvolume_writers();
+	if (IS_ERR(writers)) {
+		ret = PTR_ERR(writers);
+		goto fail;
+	}
+	root->subv_writers = writers;
+
+	btrfs_init_free_ino_ctl(root);
+	spin_lock_init(&root->ino_cache_lock);
+	init_waitqueue_head(&root->ino_cache_wait);
+
+	ret = get_anon_bdev(&root->anon_dev);
+	if (ret)
+		goto free_writers;
+	return 0;
+
+free_writers:
+	btrfs_free_subvolume_writers(root->subv_writers);
+fail:
+	kfree(root->free_ino_ctl);
+	kfree(root->free_ino_pinned);
+	return ret;
+}
+
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					       u64 root_id)
+{
+	struct btrfs_root *root;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	root = radix_tree_lookup(&fs_info->fs_roots_radix,
+				 (unsigned long)root_id);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	return root;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+			 struct btrfs_root *root)
+{
+	int ret;
+
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		return ret;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	ret = radix_tree_insert(&fs_info->fs_roots_radix,
+				(unsigned long)root->root_key.objectid,
+				root);
+	if (ret == 0)
+		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	radix_tree_preload_end();
+
+	return ret;
+}
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+				     struct btrfs_key *location,
+				     bool check_ref)
+{
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return fs_info->tree_root;
+	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		return fs_info->extent_root;
+	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+		return fs_info->chunk_root;
+	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+		return fs_info->dev_root;
+	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+		return fs_info->csum_root;
+	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+		return fs_info->quota_root ? fs_info->quota_root :
+					     ERR_PTR(-ENOENT);
+	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
+		return fs_info->uuid_root ? fs_info->uuid_root :
+					    ERR_PTR(-ENOENT);
+again:
+	root = btrfs_lookup_fs_root(fs_info, location->objectid);
+	if (root) {
+		if (check_ref && btrfs_root_refs(&root->root_item) == 0)
+			return ERR_PTR(-ENOENT);
+		return root;
+	}
+
+	root = btrfs_read_fs_root(fs_info->tree_root, location);
+	if (IS_ERR(root))
+		return root;
+
+	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	ret = btrfs_init_fs_root(root);
+	if (ret)
+		goto fail;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = location->objectid;
+
+	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+	btrfs_free_path(path);
+	if (ret < 0)
+		goto fail;
+	if (ret == 0)
+		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
+
+	ret = btrfs_insert_fs_root(fs_info, root);
+	if (ret) {
+		if (ret == -EEXIST) {
+			free_fs_root(root);
+			goto again;
+		}
+		goto fail;
+	}
+	return root;
+fail:
+	free_fs_root(root);
+	return ERR_PTR(ret);
+}
+
+static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+{
+	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
+	int ret = 0;
+	struct btrfs_device *device;
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
+		if (!device->bdev)
+			continue;
+		bdi = blk_get_backing_dev_info(device->bdev);
+		if (bdi_congested(bdi, bdi_bits)) {
+			ret = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
+{
+	int err;
+
+	err = bdi_setup_and_register(bdi, "btrfs");
+	if (err)
+		return err;
+
+	bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+	bdi->congested_fn	= btrfs_congested_fn;
+	bdi->congested_data	= info;
+	return 0;
+}
+
+/*
+ * called by the kthread helper functions to finally call the bio end_io
+ * functions.  This is where read checksum verification actually happens
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
+{
+	struct bio *bio;
+	struct btrfs_end_io_wq *end_io_wq;
+	int error;
+
+	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
+	bio = end_io_wq->bio;
+
+	error = end_io_wq->error;
+	bio->bi_private = end_io_wq->private;
+	bio->bi_end_io = end_io_wq->end_io;
+	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
+	bio_endio_nodec(bio, error);
+}
+
+static int cleaner_kthread(void *arg)
+{
+	struct btrfs_root *root = arg;
+	int again;
+
+	do {
+		again = 0;
+
+		/* Make the cleaner go to sleep early. */
+		if (btrfs_need_cleaner_sleep(root))
+			goto sleep;
+
+		if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+			goto sleep;
+
+		/*
+		 * Avoid the problem that we change the status of the fs
+		 * during the above check and trylock.
+		 */
+		if (btrfs_need_cleaner_sleep(root)) {
+			mutex_unlock(&root->fs_info->cleaner_mutex);
+			goto sleep;
+		}
+
+		btrfs_run_delayed_iputs(root);
+		btrfs_delete_unused_bgs(root->fs_info);
+		again = btrfs_clean_one_deleted_snapshot(root);
+		mutex_unlock(&root->fs_info->cleaner_mutex);
+
+		/*
+		 * The defragger has dealt with the R/O remount and umount,
+		 * needn't do anything special here.
+		 */
+		btrfs_run_defrag_inodes(root->fs_info);
+sleep:
+		if (!try_to_freeze() && !again) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+static int transaction_kthread(void *arg)
+{
+	struct btrfs_root *root = arg;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_transaction *cur;
+	u64 transid;
+	unsigned long now;
+	unsigned long delay;
+	bool cannot_commit;
+
+	do {
+		cannot_commit = false;
+		delay = HZ * root->fs_info->commit_interval;
+		mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+		spin_lock(&root->fs_info->trans_lock);
+		cur = root->fs_info->running_transaction;
+		if (!cur) {
+			spin_unlock(&root->fs_info->trans_lock);
+			goto sleep;
+		}
+
+		now = get_seconds();
+		if (cur->state < TRANS_STATE_BLOCKED &&
+		    (now < cur->start_time ||
+		     now - cur->start_time < root->fs_info->commit_interval)) {
+			spin_unlock(&root->fs_info->trans_lock);
+			delay = HZ * 5;
+			goto sleep;
+		}
+		transid = cur->transid;
+		spin_unlock(&root->fs_info->trans_lock);
+
+		/* If the file system is aborted, this will always fail. */
+		trans = btrfs_attach_transaction(root);
+		if (IS_ERR(trans)) {
+			if (PTR_ERR(trans) != -ENOENT)
+				cannot_commit = true;
+			goto sleep;
+		}
+		if (transid == trans->transid) {
+			btrfs_commit_transaction(trans, root);
+		} else {
+			btrfs_end_transaction(trans, root);
+		}
+sleep:
+		wake_up_process(root->fs_info->cleaner_kthread);
+		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+		if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
+				      &root->fs_info->fs_state)))
+			btrfs_cleanup_transaction(root);
+		if (!try_to_freeze()) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!kthread_should_stop() &&
+			    (!btrfs_transaction_blocked(root->fs_info) ||
+			     cannot_commit))
+				schedule_timeout(delay);
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+/*
+ * this will find the highest generation in the array of
+ * root backups.  The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest  root in the array with the generation
+ * in the super block.  If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+	u64 cur;
+	int newest_index = -1;
+	struct btrfs_root_backup *root_backup;
+	int i;
+
+	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+		root_backup = info->super_copy->super_roots + i;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = i;
+	}
+
+	/* check to see if we actually wrapped around */
+	if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+		root_backup = info->super_copy->super_roots;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = 0;
+	}
+	return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array.  This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+				     u64 newest_gen)
+{
+	int newest_index = -1;
+
+	newest_index = find_newest_super_backup(info, newest_gen);
+	/* if there was garbage in there, just move along */
+	if (newest_index == -1) {
+		info->backup_root_index = 0;
+	} else {
+		info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+	}
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+	int next_backup;
+	struct btrfs_root_backup *root_backup;
+	int last_backup;
+
+	next_backup = info->backup_root_index;
+	last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+		BTRFS_NUM_BACKUP_ROOTS;
+
+	/*
+	 * just overwrite the last backup if we're at the same generation
+	 * this happens only at umount
+	 */
+	root_backup = info->super_for_commit->super_roots + last_backup;
+	if (btrfs_backup_tree_root_gen(root_backup) ==
+	    btrfs_header_generation(info->tree_root->node))
+		next_backup = last_backup;
+
+	root_backup = info->super_for_commit->super_roots + next_backup;
+
+	/*
+	 * make sure all of our padding and empty slots get zero filled
+	 * regardless of which ones we use today
+	 */
+	memset(root_backup, 0, sizeof(*root_backup));
+
+	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+	btrfs_set_backup_tree_root_gen(root_backup,
+			       btrfs_header_generation(info->tree_root->node));
+
+	btrfs_set_backup_tree_root_level(root_backup,
+			       btrfs_header_level(info->tree_root->node));
+
+	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+	btrfs_set_backup_chunk_root_gen(root_backup,
+			       btrfs_header_generation(info->chunk_root->node));
+	btrfs_set_backup_chunk_root_level(root_backup,
+			       btrfs_header_level(info->chunk_root->node));
+
+	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+	btrfs_set_backup_extent_root_gen(root_backup,
+			       btrfs_header_generation(info->extent_root->node));
+	btrfs_set_backup_extent_root_level(root_backup,
+			       btrfs_header_level(info->extent_root->node));
+
+	/*
+	 * we might commit during log recovery, which happens before we set
+	 * the fs_root.  Make sure it is valid before we fill it in.
+	 */
+	if (info->fs_root && info->fs_root->node) {
+		btrfs_set_backup_fs_root(root_backup,
+					 info->fs_root->node->start);
+		btrfs_set_backup_fs_root_gen(root_backup,
+			       btrfs_header_generation(info->fs_root->node));
+		btrfs_set_backup_fs_root_level(root_backup,
+			       btrfs_header_level(info->fs_root->node));
+	}
+
+	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+	btrfs_set_backup_dev_root_gen(root_backup,
+			       btrfs_header_generation(info->dev_root->node));
+	btrfs_set_backup_dev_root_level(root_backup,
+				       btrfs_header_level(info->dev_root->node));
+
+	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+	btrfs_set_backup_csum_root_gen(root_backup,
+			       btrfs_header_generation(info->csum_root->node));
+	btrfs_set_backup_csum_root_level(root_backup,
+			       btrfs_header_level(info->csum_root->node));
+
+	btrfs_set_backup_total_bytes(root_backup,
+			     btrfs_super_total_bytes(info->super_copy));
+	btrfs_set_backup_bytes_used(root_backup,
+			     btrfs_super_bytes_used(info->super_copy));
+	btrfs_set_backup_num_devices(root_backup,
+			     btrfs_super_num_devices(info->super_copy));
+
+	/*
+	 * if we don't copy this out to the super_copy, it won't get remembered
+	 * for the next commit
+	 */
+	memcpy(&info->super_copy->super_roots,
+	       &info->super_for_commit->super_roots,
+	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block.  It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+				     struct btrfs_super_block *super,
+				     int *num_backups_tried, int *backup_index)
+{
+	struct btrfs_root_backup *root_backup;
+	int newest = *backup_index;
+
+	if (*num_backups_tried == 0) {
+		u64 gen = btrfs_super_generation(super);
+
+		newest = find_newest_super_backup(info, gen);
+		if (newest == -1)
+			return -1;
+
+		*backup_index = newest;
+		*num_backups_tried = 1;
+	} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+		/* we've tried all the backups, all done */
+		return -1;
+	} else {
+		/* jump to the next oldest backup */
+		newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+			BTRFS_NUM_BACKUP_ROOTS;
+		*backup_index = newest;
+		*num_backups_tried += 1;
+	}
+	root_backup = super->super_roots + newest;
+
+	btrfs_set_super_generation(super,
+				   btrfs_backup_tree_root_gen(root_backup));
+	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+	btrfs_set_super_root_level(super,
+				   btrfs_backup_tree_root_level(root_backup));
+	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+	/*
+	 * fixme: the total bytes and num_devices need to match or we should
+	 * need a fsck
+	 */
+	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+	return 0;
+}
+
+/* helper to cleanup workers */
+static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+{
+	btrfs_destroy_workqueue(fs_info->fixup_workers);
+	btrfs_destroy_workqueue(fs_info->delalloc_workers);
+	btrfs_destroy_workqueue(fs_info->workers);
+	btrfs_destroy_workqueue(fs_info->endio_workers);
+	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
+	btrfs_destroy_workqueue(fs_info->rmw_workers);
+	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+	btrfs_destroy_workqueue(fs_info->endio_write_workers);
+	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+	btrfs_destroy_workqueue(fs_info->submit_workers);
+	btrfs_destroy_workqueue(fs_info->delayed_workers);
+	btrfs_destroy_workqueue(fs_info->caching_workers);
+	btrfs_destroy_workqueue(fs_info->readahead_workers);
+	btrfs_destroy_workqueue(fs_info->flush_workers);
+	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+	btrfs_destroy_workqueue(fs_info->extent_workers);
+}
+
+static void free_root_extent_buffers(struct btrfs_root *root)
+{
+	if (root) {
+		free_extent_buffer(root->node);
+		free_extent_buffer(root->commit_root);
+		root->node = NULL;
+		root->commit_root = NULL;
+	}
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+	free_root_extent_buffers(info->tree_root);
+
+	free_root_extent_buffers(info->dev_root);
+	free_root_extent_buffers(info->extent_root);
+	free_root_extent_buffers(info->csum_root);
+	free_root_extent_buffers(info->quota_root);
+	free_root_extent_buffers(info->uuid_root);
+	if (chunk_root)
+		free_root_extent_buffers(info->chunk_root);
+}
+
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
+{
+	int ret;
+	struct btrfs_root *gang[8];
+	int i;
+
+	while (!list_empty(&fs_info->dead_roots)) {
+		gang[0] = list_entry(fs_info->dead_roots.next,
+				     struct btrfs_root, root_list);
+		list_del(&gang[0]->root_list);
+
+		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
+			btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+		} else {
+			free_extent_buffer(gang[0]->node);
+			free_extent_buffer(gang[0]->commit_root);
+			btrfs_put_fs_root(gang[0]);
+		}
+	}
+
+	while (1) {
+		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+					     (void **)gang, 0,
+					     ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++)
+			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+	}
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		btrfs_free_log_root_tree(NULL, fs_info);
+		btrfs_destroy_pinned_extent(fs_info->tree_root,
+					    fs_info->pinned_extents);
+	}
+}
+
+static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
+{
+	mutex_init(&fs_info->scrub_lock);
+	atomic_set(&fs_info->scrubs_running, 0);
+	atomic_set(&fs_info->scrub_pause_req, 0);
+	atomic_set(&fs_info->scrubs_paused, 0);
+	atomic_set(&fs_info->scrub_cancel_req, 0);
+	init_waitqueue_head(&fs_info->scrub_pause_wait);
+	fs_info->scrub_workers_refcnt = 0;
+}
+
+static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
+{
+	spin_lock_init(&fs_info->balance_lock);
+	mutex_init(&fs_info->balance_mutex);
+	atomic_set(&fs_info->balance_running, 0);
+	atomic_set(&fs_info->balance_pause_req, 0);
+	atomic_set(&fs_info->balance_cancel_req, 0);
+	fs_info->balance_ctl = NULL;
+	init_waitqueue_head(&fs_info->balance_wait_q);
+}
+
+static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
+				   struct btrfs_root *tree_root)
+{
+	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+	set_nlink(fs_info->btree_inode, 1);
+	/*
+	 * we set the i_size on the btree inode to the max possible int.
+	 * the real end of the address space is determined by all of
+	 * the devices in the system
+	 */
+	fs_info->btree_inode->i_size = OFFSET_MAX;
+	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+
+	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
+	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
+			     fs_info->btree_inode->i_mapping);
+	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
+	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+
+	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+
+	BTRFS_I(fs_info->btree_inode)->root = tree_root;
+	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+	       sizeof(struct btrfs_key));
+	set_bit(BTRFS_INODE_DUMMY,
+		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
+	btrfs_insert_inode_hash(fs_info->btree_inode);
+}
+
+static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
+{
+	fs_info->dev_replace.lock_owner = 0;
+	atomic_set(&fs_info->dev_replace.nesting_level, 0);
+	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+	mutex_init(&fs_info->dev_replace.lock_management_lock);
+	mutex_init(&fs_info->dev_replace.lock);
+	init_waitqueue_head(&fs_info->replace_wait);
+}
+
+static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
+{
+	spin_lock_init(&fs_info->qgroup_lock);
+	mutex_init(&fs_info->qgroup_ioctl_lock);
+	fs_info->qgroup_tree = RB_ROOT;
+	fs_info->qgroup_op_tree = RB_ROOT;
+	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+	fs_info->qgroup_seq = 1;
+	fs_info->quota_enabled = 0;
+	fs_info->pending_quota_state = 0;
+	fs_info->qgroup_ulist = NULL;
+	mutex_init(&fs_info->qgroup_rescan_lock);
+}
+
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
+		struct btrfs_fs_devices *fs_devices)
+{
+	int max_active = fs_info->thread_pool_size;
+	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+
+	fs_info->workers =
+		btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+				      max_active, 16);
+
+	fs_info->delalloc_workers =
+		btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+
+	fs_info->flush_workers =
+		btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+
+	fs_info->caching_workers =
+		btrfs_alloc_workqueue("cache", flags, max_active, 0);
+
+	/*
+	 * a higher idle thresh on the submit workers makes it much more
+	 * likely that bios will be send down in a sane order to the
+	 * devices
+	 */
+	fs_info->submit_workers =
+		btrfs_alloc_workqueue("submit", flags,
+				      min_t(u64, fs_devices->num_devices,
+					    max_active), 64);
+
+	fs_info->fixup_workers =
+		btrfs_alloc_workqueue("fixup", flags, 1, 0);
+
+	/*
+	 * endios are largely parallel and should have a very
+	 * low idle thresh
+	 */
+	fs_info->endio_workers =
+		btrfs_alloc_workqueue("endio", flags, max_active, 4);
+	fs_info->endio_meta_workers =
+		btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+	fs_info->endio_meta_write_workers =
+		btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+	fs_info->endio_raid56_workers =
+		btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+	fs_info->endio_repair_workers =
+		btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
+	fs_info->rmw_workers =
+		btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+	fs_info->endio_write_workers =
+		btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+	fs_info->endio_freespace_worker =
+		btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+	fs_info->delayed_workers =
+		btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+	fs_info->readahead_workers =
+		btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+	fs_info->qgroup_rescan_workers =
+		btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+	fs_info->extent_workers =
+		btrfs_alloc_workqueue("extent-refs", flags,
+				      min_t(u64, fs_devices->num_devices,
+					    max_active), 8);
+
+	if (!(fs_info->workers && fs_info->delalloc_workers &&
+	      fs_info->submit_workers && fs_info->flush_workers &&
+	      fs_info->endio_workers && fs_info->endio_meta_workers &&
+	      fs_info->endio_meta_write_workers &&
+	      fs_info->endio_repair_workers &&
+	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+	      fs_info->caching_workers && fs_info->readahead_workers &&
+	      fs_info->fixup_workers && fs_info->delayed_workers &&
+	      fs_info->extent_workers &&
+	      fs_info->qgroup_rescan_workers)) {
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+			    struct btrfs_fs_devices *fs_devices)
+{
+	int ret;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *log_tree_root;
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	u64 bytenr = btrfs_super_log_root(disk_super);
+
+	if (fs_devices->rw_devices == 0) {
+		printk(KERN_WARNING "BTRFS: log replay required "
+		       "on RO media\n");
+		return -EIO;
+	}
+
+	log_tree_root = btrfs_alloc_root(fs_info);
+	if (!log_tree_root)
+		return -ENOMEM;
+
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+			tree_root->stripesize, log_tree_root, fs_info,
+			BTRFS_TREE_LOG_OBJECTID);
+
+	log_tree_root->node = read_tree_block(tree_root, bytenr,
+			fs_info->generation + 1);
+	if (!log_tree_root->node ||
+	    !extent_buffer_uptodate(log_tree_root->node)) {
+		printk(KERN_ERR "BTRFS: failed to read log tree\n");
+		free_extent_buffer(log_tree_root->node);
+		kfree(log_tree_root);
+		return -EIO;
+	}
+	/* returns with log_tree_root freed on success */
+	ret = btrfs_recover_log_trees(log_tree_root);
+	if (ret) {
+		btrfs_error(tree_root->fs_info, ret,
+			    "Failed to recover log tree");
+		free_extent_buffer(log_tree_root->node);
+		kfree(log_tree_root);
+		return ret;
+	}
+
+	if (fs_info->sb->s_flags & MS_RDONLY) {
+		ret = btrfs_commit_super(tree_root);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
+			    struct btrfs_root *tree_root)
+{
+	struct btrfs_root *root;
+	struct btrfs_key location;
+	int ret;
+
+	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = 0;
+
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+	fs_info->extent_root = root;
+
+	location.objectid = BTRFS_DEV_TREE_OBJECTID;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+	fs_info->dev_root = root;
+	btrfs_init_devices_late(fs_info);
+
+	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+	fs_info->csum_root = root;
+
+	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (!IS_ERR(root)) {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->quota_enabled = 1;
+		fs_info->pending_quota_state = 1;
+		fs_info->quota_root = root;
+	}
+
+	location.objectid = BTRFS_UUID_TREE_OBJECTID;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		if (ret != -ENOENT)
+			return ret;
+	} else {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->uuid_root = root;
+	}
+
+	return 0;
+}
+
+int open_ctree(struct super_block *sb,
+	       struct btrfs_fs_devices *fs_devices,
+	       char *options)
+{
+	u32 sectorsize;
+	u32 nodesize;
+	u32 stripesize;
+	u64 generation;
+	u64 features;
+	struct btrfs_key location;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *tree_root;
+	struct btrfs_root *chunk_root;
+	int ret;
+	int err = -EINVAL;
+	int num_backups_tried = 0;
+	int backup_index = 0;
+	int max_active;
+
+	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+	if (!tree_root || !chunk_root) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	ret = init_srcu_struct(&fs_info->subvol_srcu);
+	if (ret) {
+		err = ret;
+		goto fail;
+	}
+
+	ret = setup_bdi(fs_info, &fs_info->bdi);
+	if (ret) {
+		err = ret;
+		goto fail_srcu;
+	}
+
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+	if (ret) {
+		err = ret;
+		goto fail_bdi;
+	}
+	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+					(1 + ilog2(nr_cpu_ids));
+
+	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
+	if (ret) {
+		err = ret;
+		goto fail_dirty_metadata_bytes;
+	}
+
+	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
+	if (ret) {
+		err = ret;
+		goto fail_delalloc_bytes;
+	}
+
+	fs_info->btree_inode = new_inode(sb);
+	if (!fs_info->btree_inode) {
+		err = -ENOMEM;
+		goto fail_bio_counter;
+	}
+
+	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+
+	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+	INIT_LIST_HEAD(&fs_info->trans_list);
+	INIT_LIST_HEAD(&fs_info->dead_roots);
+	INIT_LIST_HEAD(&fs_info->delayed_iputs);
+	INIT_LIST_HEAD(&fs_info->delalloc_roots);
+	INIT_LIST_HEAD(&fs_info->caching_block_groups);
+	spin_lock_init(&fs_info->delalloc_root_lock);
+	spin_lock_init(&fs_info->trans_lock);
+	spin_lock_init(&fs_info->fs_roots_radix_lock);
+	spin_lock_init(&fs_info->delayed_iput_lock);
+	spin_lock_init(&fs_info->defrag_inodes_lock);
+	spin_lock_init(&fs_info->free_chunk_lock);
+	spin_lock_init(&fs_info->tree_mod_seq_lock);
+	spin_lock_init(&fs_info->super_lock);
+	spin_lock_init(&fs_info->qgroup_op_lock);
+	spin_lock_init(&fs_info->buffer_lock);
+	spin_lock_init(&fs_info->unused_bgs_lock);
+	rwlock_init(&fs_info->tree_mod_log_lock);
+	mutex_init(&fs_info->unused_bg_unpin_mutex);
+	mutex_init(&fs_info->reloc_mutex);
+	mutex_init(&fs_info->delalloc_root_mutex);
+	seqlock_init(&fs_info->profiles_lock);
+	init_rwsem(&fs_info->delayed_iput_sem);
+
+	init_completion(&fs_info->kobj_unregister);
+	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
+	INIT_LIST_HEAD(&fs_info->space_info);
+	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+	INIT_LIST_HEAD(&fs_info->unused_bgs);
+	btrfs_mapping_init(&fs_info->mapping_tree);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv,
+			     BTRFS_BLOCK_RSV_GLOBAL);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+			     BTRFS_BLOCK_RSV_DELALLOC);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+			     BTRFS_BLOCK_RSV_DELOPS);
+	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->async_delalloc_pages, 0);
+	atomic_set(&fs_info->async_submit_draining, 0);
+	atomic_set(&fs_info->nr_async_bios, 0);
+	atomic_set(&fs_info->defrag_running, 0);
+	atomic_set(&fs_info->qgroup_op_seq, 0);
+	atomic64_set(&fs_info->tree_mod_seq, 0);
+	fs_info->sb = sb;
+	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+	fs_info->metadata_ratio = 0;
+	fs_info->defrag_inodes = RB_ROOT;
+	fs_info->free_chunk_space = 0;
+	fs_info->tree_mod_log = RB_ROOT;
+	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
+	/* readahead state */
+	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+	spin_lock_init(&fs_info->reada_lock);
+
+	fs_info->thread_pool_size = min_t(unsigned long,
+					  num_online_cpus() + 2, 8);
+
+	INIT_LIST_HEAD(&fs_info->ordered_roots);
+	spin_lock_init(&fs_info->ordered_root_lock);
+	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+					GFP_NOFS);
+	if (!fs_info->delayed_root) {
+		err = -ENOMEM;
+		goto fail_iput;
+	}
+	btrfs_init_delayed_root(fs_info->delayed_root);
+
+	btrfs_init_scrub(fs_info);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	fs_info->check_integrity_print_mask = 0;
+#endif
+	btrfs_init_balance(fs_info);
+	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
+
+	sb->s_blocksize = 4096;
+	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_bdi = &fs_info->bdi;
+
+	btrfs_init_btree_inode(fs_info, tree_root);
+
+	spin_lock_init(&fs_info->block_group_cache_lock);
+	fs_info->block_group_cache_tree = RB_ROOT;
+	fs_info->first_logical_byte = (u64)-1;
+
+	extent_io_tree_init(&fs_info->freed_extents[0],
+			     fs_info->btree_inode->i_mapping);
+	extent_io_tree_init(&fs_info->freed_extents[1],
+			     fs_info->btree_inode->i_mapping);
+	fs_info->pinned_extents = &fs_info->freed_extents[0];
+	fs_info->do_barriers = 1;
+
+
+	mutex_init(&fs_info->ordered_operations_mutex);
+	mutex_init(&fs_info->ordered_extent_flush_mutex);
+	mutex_init(&fs_info->tree_log_mutex);
+	mutex_init(&fs_info->chunk_mutex);
+	mutex_init(&fs_info->transaction_kthread_mutex);
+	mutex_init(&fs_info->cleaner_mutex);
+	mutex_init(&fs_info->volume_mutex);
+	mutex_init(&fs_info->ro_block_group_mutex);
+	init_rwsem(&fs_info->commit_root_sem);
+	init_rwsem(&fs_info->cleanup_work_sem);
+	init_rwsem(&fs_info->subvol_sem);
+	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
+
+	btrfs_init_dev_replace_locks(fs_info);
+	btrfs_init_qgroup(fs_info);
+
+	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
+	init_waitqueue_head(&fs_info->transaction_throttle);
+	init_waitqueue_head(&fs_info->transaction_wait);
+	init_waitqueue_head(&fs_info->transaction_blocked_wait);
+	init_waitqueue_head(&fs_info->async_submit_wait);
+
+	INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
+	ret = btrfs_alloc_stripe_hash_table(fs_info);
+	if (ret) {
+		err = ret;
+		goto fail_alloc;
+	}
+
+	__setup_root(4096, 4096, 4096, tree_root,
+		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
+
+	invalidate_bdev(fs_devices->latest_bdev);
+
+	/*
+	 * Read super block and check the signature bytes only
+	 */
+	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
+	if (!bh) {
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	/*
+	 * We want to check superblock checksum, the type is stored inside.
+	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
+	 */
+	if (btrfs_check_super_csum(bh->b_data)) {
+		printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	/*
+	 * super_copy is zeroed at allocation time and we never touch the
+	 * following bytes up to INFO_SIZE, the checksum is calculated from
+	 * the whole block of INFO_SIZE
+	 */
+	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+	memcpy(fs_info->super_for_commit, fs_info->super_copy,
+	       sizeof(*fs_info->super_for_commit));
+	brelse(bh);
+
+	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+
+	ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	disk_super = fs_info->super_copy;
+	if (!btrfs_super_root(disk_super))
+		goto fail_alloc;
+
+	/* check FS state, whether FS is broken. */
+	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
+	/*
+	 * run through our array of backup supers and setup
+	 * our ring pointer to the oldest one
+	 */
+	generation = btrfs_super_generation(disk_super);
+	find_oldest_super_backup(fs_info, generation);
+
+	/*
+	 * In the long term, we'll store the compression type in the super
+	 * block, and it'll be used for per file compression control.
+	 */
+	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
+	ret = btrfs_parse_options(tree_root, options);
+	if (ret) {
+		err = ret;
+		goto fail_alloc;
+	}
+
+	features = btrfs_super_incompat_flags(disk_super) &
+		~BTRFS_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		printk(KERN_ERR "BTRFS: couldn't mount because of "
+		       "unsupported optional features (%Lx).\n",
+		       features);
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	/*
+	 * Leafsize and nodesize were always equal, this is only a sanity check.
+	 */
+	if (le32_to_cpu(disk_super->__unused_leafsize) !=
+	    btrfs_super_nodesize(disk_super)) {
+		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+		       "blocksizes don't match.  node %d leaf %d\n",
+		       btrfs_super_nodesize(disk_super),
+		       le32_to_cpu(disk_super->__unused_leafsize));
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+	if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+		       "blocksize (%d) was too large\n",
+		       btrfs_super_nodesize(disk_super));
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	features = btrfs_super_incompat_flags(disk_super);
+	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+	if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+		printk(KERN_INFO "BTRFS: has skinny extents\n");
+
+	/*
+	 * flag our filesystem as having big metadata blocks if
+	 * they are bigger than the page size
+	 */
+	if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
+		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+			printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
+		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+	}
+
+	nodesize = btrfs_super_nodesize(disk_super);
+	sectorsize = btrfs_super_sectorsize(disk_super);
+	stripesize = btrfs_super_stripesize(disk_super);
+	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
+	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+	/*
+	 * mixed block groups end up with duplicate but slightly offset
+	 * extent buffers for the same range.  It leads to corruptions
+	 */
+	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+	    (sectorsize != nodesize)) {
+		printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
+				"are not allowed for mixed block groups on %s\n",
+				sb->s_id);
+		goto fail_alloc;
+	}
+
+	/*
+	 * Needn't use the lock because there is no other task which will
+	 * update the flag.
+	 */
+	btrfs_set_super_incompat_flags(disk_super, features);
+
+	features = btrfs_super_compat_ro_flags(disk_super) &
+		~BTRFS_FEATURE_COMPAT_RO_SUPP;
+	if (!(sb->s_flags & MS_RDONLY) && features) {
+		printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
+		       "unsupported option features (%Lx).\n",
+		       features);
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	max_active = fs_info->thread_pool_size;
+
+	ret = btrfs_init_workqueues(fs_info, fs_devices);
+	if (ret) {
+		err = ret;
+		goto fail_sb_buffer;
+	}
+
+	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+
+	tree_root->nodesize = nodesize;
+	tree_root->sectorsize = sectorsize;
+	tree_root->stripesize = stripesize;
+
+	sb->s_blocksize = sectorsize;
+	sb->s_blocksize_bits = blksize_bits(sectorsize);
+
+	if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+		printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
+		goto fail_sb_buffer;
+	}
+
+	if (sectorsize != PAGE_SIZE) {
+		printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
+		       "found on %s\n", (unsigned long)sectorsize, sb->s_id);
+		goto fail_sb_buffer;
+	}
+
+	mutex_lock(&fs_info->chunk_mutex);
+	ret = btrfs_read_sys_array(tree_root);
+	mutex_unlock(&fs_info->chunk_mutex);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: failed to read the system "
+		       "array on %s\n", sb->s_id);
+		goto fail_sb_buffer;
+	}
+
+	generation = btrfs_super_chunk_root_generation(disk_super);
+
+	__setup_root(nodesize, sectorsize, stripesize, chunk_root,
+		     fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+
+	chunk_root->node = read_tree_block(chunk_root,
+					   btrfs_super_chunk_root(disk_super),
+					   generation);
+	if (!chunk_root->node ||
+	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+		printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
+		       sb->s_id);
+		goto fail_tree_roots;
+	}
+	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
+	chunk_root->commit_root = btrfs_root_node(chunk_root);
+
+	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
+	   btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
+
+	ret = btrfs_read_chunk_tree(chunk_root);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
+		       sb->s_id);
+		goto fail_tree_roots;
+	}
+
+	/*
+	 * keep the device that is marked to be the target device for the
+	 * dev_replace procedure
+	 */
+	btrfs_close_extra_devices(fs_devices, 0);
+
+	if (!fs_devices->latest_bdev) {
+		printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
+		       sb->s_id);
+		goto fail_tree_roots;
+	}
+
+retry_root_backup:
+	generation = btrfs_super_generation(disk_super);
+
+	tree_root->node = read_tree_block(tree_root,
+					  btrfs_super_root(disk_super),
+					  generation);
+	if (!tree_root->node ||
+	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
+		       sb->s_id);
+
+		goto recovery_tree_root;
+	}
+
+	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+	tree_root->commit_root = btrfs_root_node(tree_root);
+	btrfs_set_root_refs(&tree_root->root_item, 1);
+
+	ret = btrfs_read_roots(fs_info, tree_root);
+	if (ret)
+		goto recovery_tree_root;
+
+	fs_info->generation = generation;
+	fs_info->last_trans_committed = generation;
+
+	ret = btrfs_recover_balance(fs_info);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: failed to recover balance\n");
+		goto fail_block_groups;
+	}
+
+	ret = btrfs_init_dev_stats(fs_info);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
+		       ret);
+		goto fail_block_groups;
+	}
+
+	ret = btrfs_init_dev_replace(fs_info);
+	if (ret) {
+		pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
+		goto fail_block_groups;
+	}
+
+	btrfs_close_extra_devices(fs_devices, 1);
+
+	ret = btrfs_sysfs_add_one(fs_info);
+	if (ret) {
+		pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
+		goto fail_block_groups;
+	}
+
+	ret = btrfs_init_space_info(fs_info);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+		goto fail_sysfs;
+	}
+
+	ret = btrfs_read_block_groups(fs_info->extent_root);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+		goto fail_sysfs;
+	}
+	fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	if (fs_info->fs_devices->missing_devices >
+	     fs_info->num_tolerated_disk_barrier_failures &&
+	    !(sb->s_flags & MS_RDONLY)) {
+		printk(KERN_WARNING "BTRFS: "
+			"too many missing devices, writeable mount is not allowed\n");
+		goto fail_sysfs;
+	}
+
+	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+					       "btrfs-cleaner");
+	if (IS_ERR(fs_info->cleaner_kthread))
+		goto fail_sysfs;
+
+	fs_info->transaction_kthread = kthread_run(transaction_kthread,
+						   tree_root,
+						   "btrfs-transaction");
+	if (IS_ERR(fs_info->transaction_kthread))
+		goto fail_cleaner;
+
+	if (!btrfs_test_opt(tree_root, SSD) &&
+	    !btrfs_test_opt(tree_root, NOSSD) &&
+	    !fs_info->fs_devices->rotating) {
+		printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
+		       "mode\n");
+		btrfs_set_opt(fs_info->mount_opt, SSD);
+	}
+
+	/*
+	 * Mount does not set all options immediatelly, we can do it now and do
+	 * not have to wait for transaction commit
+	 */
+	btrfs_apply_pending_changes(fs_info);
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+		ret = btrfsic_mount(tree_root, fs_devices,
+				    btrfs_test_opt(tree_root,
+					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+				    1 : 0,
+				    fs_info->check_integrity_print_mask);
+		if (ret)
+			printk(KERN_WARNING "BTRFS: failed to initialize"
+			       " integrity check module %s\n", sb->s_id);
+	}
+#endif
+	ret = btrfs_read_qgroup_config(fs_info);
+	if (ret)
+		goto fail_trans_kthread;
+
+	/* do not make disk changes in broken FS */
+	if (btrfs_super_log_root(disk_super) != 0) {
+		ret = btrfs_replay_log(fs_info, fs_devices);
+		if (ret) {
+			err = ret;
+			goto fail_qgroup;
+		}
+	}
+
+	ret = btrfs_find_orphan_roots(tree_root);
+	if (ret)
+		goto fail_qgroup;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		ret = btrfs_cleanup_fs_roots(fs_info);
+		if (ret)
+			goto fail_qgroup;
+
+		mutex_lock(&fs_info->cleaner_mutex);
+		ret = btrfs_recover_relocation(tree_root);
+		mutex_unlock(&fs_info->cleaner_mutex);
+		if (ret < 0) {
+			printk(KERN_WARNING
+			       "BTRFS: failed to recover relocation\n");
+			err = -EINVAL;
+			goto fail_qgroup;
+		}
+	}
+
+	location.objectid = BTRFS_FS_TREE_OBJECTID;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = 0;
+
+	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+	if (IS_ERR(fs_info->fs_root)) {
+		err = PTR_ERR(fs_info->fs_root);
+		goto fail_qgroup;
+	}
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	down_read(&fs_info->cleanup_work_sem);
+	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+		up_read(&fs_info->cleanup_work_sem);
+		close_ctree(tree_root);
+		return ret;
+	}
+	up_read(&fs_info->cleanup_work_sem);
+
+	ret = btrfs_resume_balance_async(fs_info);
+	if (ret) {
+		printk(KERN_WARNING "BTRFS: failed to resume balance\n");
+		close_ctree(tree_root);
+		return ret;
+	}
+
+	ret = btrfs_resume_dev_replace_async(fs_info);
+	if (ret) {
+		pr_warn("BTRFS: failed to resume dev_replace\n");
+		close_ctree(tree_root);
+		return ret;
+	}
+
+	btrfs_qgroup_rescan_resume(fs_info);
+
+	if (!fs_info->uuid_root) {
+		pr_info("BTRFS: creating UUID tree\n");
+		ret = btrfs_create_uuid_tree(fs_info);
+		if (ret) {
+			pr_warn("BTRFS: failed to create the UUID tree %d\n",
+				ret);
+			close_ctree(tree_root);
+			return ret;
+		}
+	} else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
+		   fs_info->generation !=
+				btrfs_super_uuid_tree_generation(disk_super)) {
+		pr_info("BTRFS: checking UUID tree\n");
+		ret = btrfs_check_uuid_tree(fs_info);
+		if (ret) {
+			pr_warn("BTRFS: failed to check the UUID tree %d\n",
+				ret);
+			close_ctree(tree_root);
+			return ret;
+		}
+	} else {
+		fs_info->update_uuid_tree_gen = 1;
+	}
+
+	fs_info->open = 1;
+
+	return 0;
+
+fail_qgroup:
+	btrfs_free_qgroup_config(fs_info);
+fail_trans_kthread:
+	kthread_stop(fs_info->transaction_kthread);
+	btrfs_cleanup_transaction(fs_info->tree_root);
+	btrfs_free_fs_roots(fs_info);
+fail_cleaner:
+	kthread_stop(fs_info->cleaner_kthread);
+
+	/*
+	 * make sure we're done with the btree inode before we stop our
+	 * kthreads
+	 */
+	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+
+fail_sysfs:
+	btrfs_sysfs_remove_one(fs_info);
+
+fail_block_groups:
+	btrfs_put_block_group_cache(fs_info);
+	btrfs_free_block_groups(fs_info);
+
+fail_tree_roots:
+	free_root_pointers(fs_info, 1);
+	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+
+fail_sb_buffer:
+	btrfs_stop_all_workers(fs_info);
+fail_alloc:
+fail_iput:
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+	iput(fs_info->btree_inode);
+fail_bio_counter:
+	percpu_counter_destroy(&fs_info->bio_counter);
+fail_delalloc_bytes:
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+fail_bdi:
+	bdi_destroy(&fs_info->bdi);
+fail_srcu:
+	cleanup_srcu_struct(&fs_info->subvol_srcu);
+fail:
+	btrfs_free_stripe_hash_table(fs_info);
+	btrfs_close_devices(fs_info->fs_devices);
+	return err;
+
+recovery_tree_root:
+	if (!btrfs_test_opt(tree_root, RECOVERY))
+		goto fail_tree_roots;
+
+	free_root_pointers(fs_info, 0);
+
+	/* don't use the log in recovery mode, it won't be valid */
+	btrfs_set_super_log_root(disk_super, 0);
+
+	/* we can't trust the free space cache either */
+	btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+	ret = next_root_backup(fs_info, fs_info->super_copy,
+			       &num_backups_tried, &backup_index);
+	if (ret == -1)
+		goto fail_block_groups;
+	goto retry_root_backup;
+}
+
+static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		struct btrfs_device *device = (struct btrfs_device *)
+			bh->b_private;
+
+		printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
+					  "I/O error on %s\n",
+					  rcu_str_deref(device->name));
+		/* note, we dont' set_buffer_write_io_error because we have
+		 * our own ways of dealing with the IO errors
+		 */
+		clear_buffer_uptodate(bh);
+		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+{
+	struct buffer_head *bh;
+	struct buffer_head *latest = NULL;
+	struct btrfs_super_block *super;
+	int i;
+	u64 transid = 0;
+	u64 bytenr;
+
+	/* we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	for (i = 0; i < 1; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+					i_size_read(bdev->bd_inode))
+			break;
+		bh = __bread(bdev, bytenr / 4096,
+					BTRFS_SUPER_INFO_SIZE);
+		if (!bh)
+			continue;
+
+		super = (struct btrfs_super_block *)bh->b_data;
+		if (btrfs_super_bytenr(super) != bytenr ||
+		    btrfs_super_magic(super) != BTRFS_MAGIC) {
+			brelse(bh);
+			continue;
+		}
+
+		if (!latest || btrfs_super_generation(super) > transid) {
+			brelse(latest);
+			latest = bh;
+			transid = btrfs_super_generation(super);
+		} else {
+			brelse(bh);
+		}
+	}
+	return latest;
+}
+
+/*
+ * this should be called twice, once with wait == 0 and
+ * once with wait == 1.  When wait == 0 is done, all the buffer heads
+ * we write are pinned.
+ *
+ * They are released when wait == 1 is done.
+ * max_mirrors must be the same for both runs, and it indicates how
+ * many supers on this one device should be written.
+ *
+ * max_mirrors == 0 means to write them all.
+ */
+static int write_dev_supers(struct btrfs_device *device,
+			    struct btrfs_super_block *sb,
+			    int do_barriers, int wait, int max_mirrors)
+{
+	struct buffer_head *bh;
+	int i;
+	int ret;
+	int errors = 0;
+	u32 crc;
+	u64 bytenr;
+
+	if (max_mirrors == 0)
+		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+	for (i = 0; i < max_mirrors; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+		    device->commit_total_bytes)
+			break;
+
+		if (wait) {
+			bh = __find_get_block(device->bdev, bytenr / 4096,
+					      BTRFS_SUPER_INFO_SIZE);
+			if (!bh) {
+				errors++;
+				continue;
+			}
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				errors++;
+
+			/* drop our reference */
+			brelse(bh);
+
+			/* drop the reference from the wait == 0 run */
+			brelse(bh);
+			continue;
+		} else {
+			btrfs_set_super_bytenr(sb, bytenr);
+
+			crc = ~(u32)0;
+			crc = btrfs_csum_data((char *)sb +
+					      BTRFS_CSUM_SIZE, crc,
+					      BTRFS_SUPER_INFO_SIZE -
+					      BTRFS_CSUM_SIZE);
+			btrfs_csum_final(crc, sb->csum);
+
+			/*
+			 * one reference for us, and we leave it for the
+			 * caller
+			 */
+			bh = __getblk(device->bdev, bytenr / 4096,
+				      BTRFS_SUPER_INFO_SIZE);
+			if (!bh) {
+				printk(KERN_ERR "BTRFS: couldn't get super "
+				       "buffer head for bytenr %Lu\n", bytenr);
+				errors++;
+				continue;
+			}
+
+			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+
+			/* one reference for submit_bh */
+			get_bh(bh);
+
+			set_buffer_uptodate(bh);
+			lock_buffer(bh);
+			bh->b_end_io = btrfs_end_buffer_write_sync;
+			bh->b_private = device;
+		}
+
+		/*
+		 * we fua the first super.  The others we allow
+		 * to go down lazy.
+		 */
+		if (i == 0)
+			ret = btrfsic_submit_bh(WRITE_FUA, bh);
+		else
+			ret = btrfsic_submit_bh(WRITE_SYNC, bh);
+		if (ret)
+			errors++;
+	}
+	return errors < i ? 0 : -1;
+}
+
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+	if (bio->bi_private)
+		complete(bio->bi_private);
+	bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
+ * sent down.  With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	if (device->nobarriers)
+		return 0;
+
+	if (wait) {
+		bio = device->flush_bio;
+		if (!bio)
+			return 0;
+
+		wait_for_completion(&device->flush_wait);
+
+		if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+			printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
+				      rcu_str_deref(device->name));
+			device->nobarriers = 1;
+		} else if (!bio_flagged(bio, BIO_UPTODATE)) {
+			ret = -EIO;
+			btrfs_dev_stat_inc_and_print(device,
+				BTRFS_DEV_STAT_FLUSH_ERRS);
+		}
+
+		/* drop the reference from the wait == 0 run */
+		bio_put(bio);
+		device->flush_bio = NULL;
+
+		return ret;
+	}
+
+	/*
+	 * one reference for us, and we leave it for the
+	 * caller
+	 */
+	device->flush_bio = NULL;
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_end_io = btrfs_end_empty_barrier;
+	bio->bi_bdev = device->bdev;
+	init_completion(&device->flush_wait);
+	bio->bi_private = &device->flush_wait;
+	device->flush_bio = bio;
+
+	bio_get(bio);
+	btrfsic_submit_bio(WRITE_FLUSH, bio);
+
+	return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+	struct list_head *head;
+	struct btrfs_device *dev;
+	int errors_send = 0;
+	int errors_wait = 0;
+	int ret;
+
+	/* send down all the barriers */
+	head = &info->fs_devices->devices;
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (dev->missing)
+			continue;
+		if (!dev->bdev) {
+			errors_send++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_flush(dev, 0);
+		if (ret)
+			errors_send++;
+	}
+
+	/* wait for all the barriers */
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (dev->missing)
+			continue;
+		if (!dev->bdev) {
+			errors_wait++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_flush(dev, 1);
+		if (ret)
+			errors_wait++;
+	}
+	if (errors_send > info->num_tolerated_disk_barrier_failures ||
+	    errors_wait > info->num_tolerated_disk_barrier_failures)
+		return -EIO;
+	return 0;
+}
+
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_space_info *sinfo;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
+	int i;
+	int c;
+	int num_tolerated_disk_barrier_failures =
+		(int)fs_info->fs_devices->num_devices;
+
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		sinfo = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+			if (tmp->flags == types[i]) {
+				sinfo = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!sinfo)
+			continue;
+
+		down_read(&sinfo->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&sinfo->block_groups[c])) {
+				u64 flags;
+
+				btrfs_get_block_group_info(
+					&sinfo->block_groups[c], &space);
+				if (space.total_bytes == 0 ||
+				    space.used_bytes == 0)
+					continue;
+				flags = space.flags;
+				/*
+				 * return
+				 * 0: if dup, single or RAID0 is configured for
+				 *    any of metadata, system or data, else
+				 * 1: if RAID5 is configured, or if RAID1 or
+				 *    RAID10 is configured and only two mirrors
+				 *    are used, else
+				 * 2: if RAID6 is configured, else
+				 * num_mirrors - 1: if RAID1 or RAID10 is
+				 *                  configured and more than
+				 *                  2 mirrors are used.
+				 */
+				if (num_tolerated_disk_barrier_failures > 0 &&
+				    ((flags & (BTRFS_BLOCK_GROUP_DUP |
+					       BTRFS_BLOCK_GROUP_RAID0)) ||
+				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+				      == 0)))
+					num_tolerated_disk_barrier_failures = 0;
+				else if (num_tolerated_disk_barrier_failures > 1) {
+					if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+					    BTRFS_BLOCK_GROUP_RAID5 |
+					    BTRFS_BLOCK_GROUP_RAID10)) {
+						num_tolerated_disk_barrier_failures = 1;
+					} else if (flags &
+						   BTRFS_BLOCK_GROUP_RAID6) {
+						num_tolerated_disk_barrier_failures = 2;
+					}
+				}
+			}
+		}
+		up_read(&sinfo->groups_sem);
+	}
+
+	return num_tolerated_disk_barrier_failures;
+}
+
+static int write_all_supers(struct btrfs_root *root, int max_mirrors)
+{
+	struct list_head *head;
+	struct btrfs_device *dev;
+	struct btrfs_super_block *sb;
+	struct btrfs_dev_item *dev_item;
+	int ret;
+	int do_barriers;
+	int max_errors;
+	int total_errors = 0;
+	u64 flags;
+
+	do_barriers = !btrfs_test_opt(root, NOBARRIER);
+	backup_super_roots(root->fs_info);
+
+	sb = root->fs_info->super_for_commit;
+	dev_item = &sb->dev_item;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	head = &root->fs_info->fs_devices->devices;
+	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+
+	if (do_barriers) {
+		ret = barrier_all_devices(root->fs_info);
+		if (ret) {
+			mutex_unlock(
+				&root->fs_info->fs_devices->device_list_mutex);
+			btrfs_error(root->fs_info, ret,
+				    "errors while submitting device barriers.");
+			return ret;
+		}
+	}
+
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (!dev->bdev) {
+			total_errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		btrfs_set_stack_device_generation(dev_item, 0);
+		btrfs_set_stack_device_type(dev_item, dev->type);
+		btrfs_set_stack_device_id(dev_item, dev->devid);
+		btrfs_set_stack_device_total_bytes(dev_item,
+						   dev->commit_total_bytes);
+		btrfs_set_stack_device_bytes_used(dev_item,
+						  dev->commit_bytes_used);
+		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+
+		flags = btrfs_super_flags(sb);
+		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+
+		ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
+		if (ret)
+			total_errors++;
+	}
+	if (total_errors > max_errors) {
+		btrfs_err(root->fs_info, "%d errors while writing supers",
+		       total_errors);
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+		/* FUA is masked off if unsupported and can't be the reason */
+		btrfs_error(root->fs_info, -EIO,
+			    "%d errors while writing supers", total_errors);
+		return -EIO;
+	}
+
+	total_errors = 0;
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (!dev->bdev)
+			continue;
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
+		if (ret)
+			total_errors++;
+	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+	if (total_errors > max_errors) {
+		btrfs_error(root->fs_info, -EIO,
+			    "%d errors while writing supers", total_errors);
+		return -EIO;
+	}
+	return 0;
+}
+
+int write_ctree_super(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, int max_mirrors)
+{
+	return write_all_supers(root, max_mirrors);
+}
+
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+				  struct btrfs_root *root)
+{
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	radix_tree_delete(&fs_info->fs_roots_radix,
+			  (unsigned long)root->root_key.objectid);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+
+	if (btrfs_root_refs(&root->root_item) == 0)
+		synchronize_srcu(&fs_info->subvol_srcu);
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+		btrfs_free_log(NULL, root);
+
+	if (root->free_ino_pinned)
+		__btrfs_remove_free_space_cache(root->free_ino_pinned);
+	if (root->free_ino_ctl)
+		__btrfs_remove_free_space_cache(root->free_ino_ctl);
+	free_fs_root(root);
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+	iput(root->ino_cache_inode);
+	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+	btrfs_free_block_rsv(root, root->orphan_block_rsv);
+	root->orphan_block_rsv = NULL;
+	if (root->anon_dev)
+		free_anon_bdev(root->anon_dev);
+	if (root->subv_writers)
+		btrfs_free_subvolume_writers(root->subv_writers);
+	free_extent_buffer(root->node);
+	free_extent_buffer(root->commit_root);
+	kfree(root->free_ino_ctl);
+	kfree(root->free_ino_pinned);
+	kfree(root->name);
+	btrfs_put_fs_root(root);
+}
+
+void btrfs_free_fs_root(struct btrfs_root *root)
+{
+	free_fs_root(root);
+}
+
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
+{
+	u64 root_objectid = 0;
+	struct btrfs_root *gang[8];
+	int i = 0;
+	int err = 0;
+	unsigned int ret = 0;
+	int index;
+
+	while (1) {
+		index = srcu_read_lock(&fs_info->subvol_srcu);
+		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+					     (void **)gang, root_objectid,
+					     ARRAY_SIZE(gang));
+		if (!ret) {
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
+			break;
+		}
+		root_objectid = gang[ret - 1]->root_key.objectid + 1;
+
+		for (i = 0; i < ret; i++) {
+			/* Avoid to grab roots in dead_roots */
+			if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+				gang[i] = NULL;
+				continue;
+			}
+			/* grab all the search result for later use */
+			gang[i] = btrfs_grab_fs_root(gang[i]);
+		}
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+		for (i = 0; i < ret; i++) {
+			if (!gang[i])
+				continue;
+			root_objectid = gang[i]->root_key.objectid;
+			err = btrfs_orphan_cleanup(gang[i]);
+			if (err)
+				break;
+			btrfs_put_fs_root(gang[i]);
+		}
+		root_objectid++;
+	}
+
+	/* release the uncleaned roots due to error */
+	for (; i < ret; i++) {
+		if (gang[i])
+			btrfs_put_fs_root(gang[i]);
+	}
+	return err;
+}
+
+int btrfs_commit_super(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+	wake_up_process(root->fs_info->cleaner_kthread);
+
+	/* wait until ongoing cleanup work done */
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	return btrfs_commit_transaction(trans, root);
+}
+
+void close_ctree(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret;
+
+	fs_info->closing = 1;
+	smp_mb();
+
+	/* wait for the uuid_scan task to finish */
+	down(&fs_info->uuid_tree_rescan_sem);
+	/* avoid complains from lockdep et al., set sem back to initial state */
+	up(&fs_info->uuid_tree_rescan_sem);
+
+	/* pause restriper - we want to resume on mount */
+	btrfs_pause_balance(fs_info);
+
+	btrfs_dev_replace_suspend_for_unmount(fs_info);
+
+	btrfs_scrub_cancel(fs_info);
+
+	/* wait for any defraggers to finish */
+	wait_event(fs_info->transaction_wait,
+		   (atomic_read(&fs_info->defrag_running) == 0));
+
+	/* clear out the rbtree of defraggable inodes */
+	btrfs_cleanup_defrag_inodes(fs_info);
+
+	cancel_work_sync(&fs_info->async_reclaim_work);
+
+	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+		ret = btrfs_commit_super(root);
+		if (ret)
+			btrfs_err(fs_info, "commit super ret %d", ret);
+	}
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+		btrfs_error_commit_super(root);
+
+	kthread_stop(fs_info->transaction_kthread);
+	kthread_stop(fs_info->cleaner_kthread);
+
+	fs_info->closing = 2;
+	smp_mb();
+
+	btrfs_free_qgroup_config(fs_info);
+
+	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+		btrfs_info(fs_info, "at unmount delalloc count %lld",
+		       percpu_counter_sum(&fs_info->delalloc_bytes));
+	}
+
+	btrfs_sysfs_remove_one(fs_info);
+
+	btrfs_free_fs_roots(fs_info);
+
+	btrfs_put_block_group_cache(fs_info);
+
+	btrfs_free_block_groups(fs_info);
+
+	/*
+	 * we must make sure there is not any read request to
+	 * submit after we stopping all workers.
+	 */
+	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+	btrfs_stop_all_workers(fs_info);
+
+	fs_info->open = 0;
+	free_root_pointers(fs_info, 1);
+
+	iput(fs_info->btree_inode);
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(root, CHECK_INTEGRITY))
+		btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
+	btrfs_close_devices(fs_info->fs_devices);
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
+	percpu_counter_destroy(&fs_info->bio_counter);
+	bdi_destroy(&fs_info->bdi);
+	cleanup_srcu_struct(&fs_info->subvol_srcu);
+
+	btrfs_free_stripe_hash_table(fs_info);
+
+	__btrfs_free_block_rsv(root->orphan_block_rsv);
+	root->orphan_block_rsv = NULL;
+
+	lock_chunks(root);
+	while (!list_empty(&fs_info->pinned_chunks)) {
+		struct extent_map *em;
+
+		em = list_first_entry(&fs_info->pinned_chunks,
+				      struct extent_map, list);
+		list_del_init(&em->list);
+		free_extent_map(em);
+	}
+	unlock_chunks(root);
+}
+
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+			  int atomic)
+{
+	int ret;
+	struct inode *btree_inode = buf->pages[0]->mapping->host;
+
+	ret = extent_buffer_uptodate(buf);
+	if (!ret)
+		return ret;
+
+	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
+				    parent_transid, atomic);
+	if (ret == -EAGAIN)
+		return ret;
+	return !ret;
+}
+
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
+{
+	return set_extent_buffer_uptodate(buf);
+}
+
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+{
+	struct btrfs_root *root;
+	u64 transid = btrfs_header_generation(buf);
+	int was_dirty;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	/*
+	 * This is a fast path so only do this check if we have sanity tests
+	 * enabled.  Normal people shouldn't be marking dummy buffers as dirty
+	 * outside of the sanity tests.
+	 */
+	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
+		return;
+#endif
+	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	btrfs_assert_tree_locked(buf);
+	if (transid != root->fs_info->generation)
+		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
+		       "found %llu running %llu\n",
+			buf->start, transid, root->fs_info->generation);
+	was_dirty = set_extent_buffer_dirty(buf);
+	if (!was_dirty)
+		__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+				     buf->len,
+				     root->fs_info->dirty_metadata_batch);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+		btrfs_print_leaf(root, buf);
+		ASSERT(0);
+	}
+#endif
+}
+
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+					int flush_delayed)
+{
+	/*
+	 * looks as though older kernels can get into trouble with
+	 * this code, they end up stuck in balance_dirty_pages forever
+	 */
+	int ret;
+
+	if (current->flags & PF_MEMALLOC)
+		return;
+
+	if (flush_delayed)
+		btrfs_balance_delayed_items(root);
+
+	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+				     BTRFS_DIRTY_METADATA_THRESH);
+	if (ret > 0) {
+		balance_dirty_pages_ratelimited(
+				   root->fs_info->btree_inode->i_mapping);
+	}
+	return;
+}
+
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
+{
+	__btrfs_btree_balance_dirty(root, 1);
+}
+
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+{
+	__btrfs_btree_balance_dirty(root, 0);
+}
+
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+}
+
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+			      int read_only)
+{
+	struct btrfs_super_block *sb = fs_info->super_copy;
+	int ret = 0;
+
+	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
+				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
+				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
+				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * The common minimum, we don't know if we can trust the nodesize/sectorsize
+	 * items yet, they'll be verified later. Issue just a warning.
+	 */
+	if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
+		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+				btrfs_super_root(sb));
+	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
+		printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+				btrfs_super_chunk_root(sb));
+	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
+		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
+				btrfs_super_log_root(sb));
+
+	/*
+	 * Check the lower bound, the alignment and other constraints are
+	 * checked later.
+	 */
+	if (btrfs_super_nodesize(sb) < 4096) {
+		printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
+				btrfs_super_nodesize(sb));
+		ret = -EINVAL;
+	}
+	if (btrfs_super_sectorsize(sb) < 4096) {
+		printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
+				btrfs_super_sectorsize(sb));
+		ret = -EINVAL;
+	}
+
+	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+		printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+				fs_info->fsid, sb->dev_item.fsid);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+	 * done later
+	 */
+	if (btrfs_super_num_devices(sb) > (1UL << 31))
+		printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+				btrfs_super_num_devices(sb));
+	if (btrfs_super_num_devices(sb) == 0) {
+		printk(KERN_ERR "BTRFS: number of devices is 0\n");
+		ret = -EINVAL;
+	}
+
+	if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
+		printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+				btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Obvious sys_chunk_array corruptions, it must hold at least one key
+	 * and one chunk
+	 */
+	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+		printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
+				btrfs_super_sys_array_size(sb),
+				BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+		ret = -EINVAL;
+	}
+	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+			+ sizeof(struct btrfs_chunk)) {
+		printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
+				btrfs_super_sys_array_size(sb),
+				sizeof(struct btrfs_disk_key)
+				+ sizeof(struct btrfs_chunk));
+		ret = -EINVAL;
+	}
+
+	/*
+	 * The generation is a global counter, we'll trust it more than the others
+	 * but it's still possible that it's the one that's wrong.
+	 */
+	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
+		printk(KERN_WARNING
+			"BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+			btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
+	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+	    && btrfs_super_cache_generation(sb) != (u64)-1)
+		printk(KERN_WARNING
+			"BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+			btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
+
+	return ret;
+}
+
+static void btrfs_error_commit_super(struct btrfs_root *root)
+{
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(root);
+}
+
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&root->ordered_extent_lock);
+	/*
+	 * This will just short circuit the ordered completion stuff which will
+	 * make sure the ordered extent gets properly cleaned up.
+	 */
+	list_for_each_entry(ordered, &root->ordered_extents,
+			    root_extent_list)
+		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+	spin_unlock(&root->ordered_extent_lock);
+}
+
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->ordered_root_lock);
+	list_splice_init(&fs_info->ordered_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					ordered_root);
+		list_move_tail(&root->ordered_root,
+			       &fs_info->ordered_roots);
+
+		spin_unlock(&fs_info->ordered_root_lock);
+		btrfs_destroy_ordered_extents(root);
+
+		cond_resched();
+		spin_lock(&fs_info->ordered_root_lock);
+	}
+	spin_unlock(&fs_info->ordered_root_lock);
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	int ret = 0;
+
+	delayed_refs = &trans->delayed_refs;
+
+	spin_lock(&delayed_refs->lock);
+	if (atomic_read(&delayed_refs->num_entries) == 0) {
+		spin_unlock(&delayed_refs->lock);
+		btrfs_info(root->fs_info, "delayed_refs has NO entry");
+		return ret;
+	}
+
+	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+		struct btrfs_delayed_ref_head *head;
+		bool pin_bytes = false;
+
+		head = rb_entry(node, struct btrfs_delayed_ref_head,
+				href_node);
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			spin_lock(&delayed_refs->lock);
+			continue;
+		}
+		spin_lock(&head->lock);
+		while ((node = rb_first(&head->ref_root)) != NULL) {
+			ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				       rb_node);
+			ref->in_tree = 0;
+			rb_erase(&ref->rb_node, &head->ref_root);
+			atomic_dec(&delayed_refs->num_entries);
+			btrfs_put_delayed_ref(ref);
+		}
+		if (head->must_insert_reserved)
+			pin_bytes = true;
+		btrfs_free_delayed_extent_op(head->extent_op);
+		delayed_refs->num_heads--;
+		if (head->processing == 0)
+			delayed_refs->num_heads_ready--;
+		atomic_dec(&delayed_refs->num_entries);
+		head->node.in_tree = 0;
+		rb_erase(&head->href_node, &delayed_refs->href_root);
+		spin_unlock(&head->lock);
+		spin_unlock(&delayed_refs->lock);
+		mutex_unlock(&head->mutex);
+
+		if (pin_bytes)
+			btrfs_pin_extent(root, head->node.bytenr,
+					 head->node.num_bytes, 1);
+		btrfs_put_delayed_ref(&head->node);
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+
+	spin_unlock(&delayed_refs->lock);
+
+	return ret;
+}
+
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->delalloc_lock);
+	list_splice_init(&root->delalloc_inodes, &splice);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+					       delalloc_inodes);
+
+		list_del_init(&btrfs_inode->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &btrfs_inode->runtime_flags);
+		spin_unlock(&root->delalloc_lock);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+
+		spin_lock(&root->delalloc_lock);
+	}
+
+	spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	list_splice_init(&fs_info->delalloc_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					 delalloc_root);
+		list_del_init(&root->delalloc_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		spin_unlock(&fs_info->delalloc_root_lock);
+
+		btrfs_destroy_delalloc_inodes(root);
+		btrfs_put_fs_root(root);
+
+		spin_lock(&fs_info->delalloc_root_lock);
+	}
+	spin_unlock(&fs_info->delalloc_root_lock);
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark)
+{
+	int ret;
+	struct extent_buffer *eb;
+	u64 start = 0;
+	u64 end;
+
+	while (1) {
+		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+					    mark, NULL);
+		if (ret)
+			break;
+
+		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+		while (start <= end) {
+			eb = btrfs_find_tree_block(root->fs_info, start);
+			start += root->nodesize;
+			if (!eb)
+				continue;
+			wait_on_extent_buffer_writeback(eb);
+
+			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+					       &eb->bflags))
+				clear_extent_buffer_dirty(eb);
+			free_extent_buffer_stale(eb);
+		}
+	}
+
+	return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents)
+{
+	struct extent_io_tree *unpin;
+	u64 start;
+	u64 end;
+	int ret;
+	bool loop = true;
+
+	unpin = pinned_extents;
+again:
+	while (1) {
+		ret = find_first_extent_bit(unpin, 0, &start, &end,
+					    EXTENT_DIRTY, NULL);
+		if (ret)
+			break;
+
+		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		btrfs_error_unpin_extent_range(root, start, end);
+		cond_resched();
+	}
+
+	if (loop) {
+		if (unpin == &root->fs_info->freed_extents[0])
+			unpin = &root->fs_info->freed_extents[1];
+		else
+			unpin = &root->fs_info->freed_extents[0];
+		loop = false;
+		goto again;
+	}
+
+	return 0;
+}
+
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+				       struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
+				   struct btrfs_root *root)
+{
+	btrfs_destroy_delayed_refs(cur_trans, root);
+
+	cur_trans->state = TRANS_STATE_COMMIT_START;
+	wake_up(&root->fs_info->transaction_blocked_wait);
+
+	cur_trans->state = TRANS_STATE_UNBLOCKED;
+	wake_up(&root->fs_info->transaction_wait);
+
+	btrfs_free_pending_ordered(cur_trans, root->fs_info);
+	btrfs_destroy_delayed_inodes(root);
+	btrfs_assert_delayed_root_empty(root);
+
+	btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+				     EXTENT_DIRTY);
+	btrfs_destroy_pinned_extent(root,
+				    root->fs_info->pinned_extents);
+
+	cur_trans->state =TRANS_STATE_COMPLETED;
+	wake_up(&cur_trans->commit_wait);
+
+	/*
+	memset(cur_trans, 0, sizeof(*cur_trans));
+	kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+	*/
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+	struct btrfs_transaction *t;
+
+	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+	spin_lock(&root->fs_info->trans_lock);
+	while (!list_empty(&root->fs_info->trans_list)) {
+		t = list_first_entry(&root->fs_info->trans_list,
+				     struct btrfs_transaction, list);
+		if (t->state >= TRANS_STATE_COMMIT_START) {
+			atomic_inc(&t->use_count);
+			spin_unlock(&root->fs_info->trans_lock);
+			btrfs_wait_for_commit(root, t->transid);
+			btrfs_put_transaction(t);
+			spin_lock(&root->fs_info->trans_lock);
+			continue;
+		}
+		if (t == root->fs_info->running_transaction) {
+			t->state = TRANS_STATE_COMMIT_DOING;
+			spin_unlock(&root->fs_info->trans_lock);
+			/*
+			 * We wait for 0 num_writers since we don't hold a trans
+			 * handle open currently for this transaction.
+			 */
+			wait_event(t->writer_wait,
+				   atomic_read(&t->num_writers) == 0);
+		} else {
+			spin_unlock(&root->fs_info->trans_lock);
+		}
+		btrfs_cleanup_one_transaction(t, root);
+
+		spin_lock(&root->fs_info->trans_lock);
+		if (t == root->fs_info->running_transaction)
+			root->fs_info->running_transaction = NULL;
+		list_del_init(&t->list);
+		spin_unlock(&root->fs_info->trans_lock);
+
+		btrfs_put_transaction(t);
+		trace_btrfs_transaction_commit(root);
+		spin_lock(&root->fs_info->trans_lock);
+	}
+	spin_unlock(&root->fs_info->trans_lock);
+	btrfs_destroy_all_ordered_extents(root->fs_info);
+	btrfs_destroy_delayed_inodes(root);
+	btrfs_assert_delayed_root_empty(root);
+	btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
+	btrfs_destroy_all_delalloc_inodes(root->fs_info);
+	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+	return 0;
+}
+
+static const struct extent_io_ops btree_extent_io_ops = {
+	.readpage_end_io_hook = btree_readpage_end_io_hook,
+	.readpage_io_failed_hook = btree_io_failed_hook,
+	.submit_bio_hook = btree_submit_bio_hook,
+	/* note we're sharing with inode.c for the merge bio hook */
+	.merge_bio_hook = btrfs_merge_bio_hook,
+};
diff --git a/kernel/fs/btrfs/disk-io.h b/kernel/fs/btrfs/disk-io.h
new file mode 100644
index 000000000..d4cbfeeee
--- /dev/null
+++ b/kernel/fs/btrfs/disk-io.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __DISKIO__
+#define __DISKIO__
+
+#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_SIZE 4096
+
+#define BTRFS_SUPER_MIRROR_MAX	 3
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+
+enum btrfs_wq_endio_type {
+	BTRFS_WQ_ENDIO_DATA = 0,
+	BTRFS_WQ_ENDIO_METADATA = 1,
+	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+	BTRFS_WQ_ENDIO_RAID56 = 3,
+	BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
+};
+
+static inline u64 btrfs_sb_offset(int mirror)
+{
+	u64 start = 16 * 1024;
+	if (mirror)
+		return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+	return BTRFS_SUPER_INFO_OFFSET;
+}
+
+struct btrfs_device;
+struct btrfs_fs_devices;
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+				      u64 parent_transid);
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
+			 int mirror_num, struct extent_buffer **eb);
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+						   u64 bytenr);
+void clean_tree_block(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
+int open_ctree(struct super_block *sb,
+	       struct btrfs_fs_devices *fs_devices,
+	       char *options);
+void close_ctree(struct btrfs_root *root);
+int write_ctree_super(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, int max_mirrors);
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_commit_super(struct btrfs_root *root);
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
+					    u64 bytenr);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+				      struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+			 struct btrfs_root *root);
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+				     struct btrfs_key *key,
+				     bool check_ref);
+static inline struct btrfs_root *
+btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+			   struct btrfs_key *location)
+{
+	return btrfs_get_fs_root(fs_info, location, true);
+}
+
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+				 struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_root *btrfs_alloc_dummy_root(void);
+#endif
+
+/*
+ * This function is used to grab the root, and avoid it is freed when we
+ * access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * 	fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+	if (atomic_inc_not_zero(&root->refs))
+		return root;
+	return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+	if (atomic_dec_and_test(&root->refs))
+		kfree(root);
+}
+
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+			  int atomic);
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
+u32 btrfs_csum_data(char *data, u32 seed, size_t len);
+void btrfs_csum_final(u32 crc, char *result);
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+			enum btrfs_wq_endio_type metadata);
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags, u64 bio_offset,
+			extent_submit_bio_hook_t *submit_bio_start,
+			extent_submit_bio_hook_t *submit_bio_done);
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+int btrfs_write_tree_block(struct extent_buffer *buf);
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info);
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
+				  struct btrfs_root *root);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+				void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info);
+int __init btrfs_end_io_wq_init(void);
+void btrfs_end_io_wq_exit(void);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_init_lockdep(void);
+void btrfs_set_buffer_lockdep_class(u64 objectid,
+			            struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_init_lockdep(void)
+{ }
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+					struct extent_buffer *eb, int level)
+{
+}
+#endif
+#endif
diff --git a/kernel/fs/btrfs/export.c b/kernel/fs/btrfs/export.c
new file mode 100644
index 000000000..8d052209f
--- /dev/null
+++ b/kernel/fs/btrfs/export.c
@@ -0,0 +1,304 @@
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "export.h"
+
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+						 parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+					     parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
+
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+			   struct inode *parent)
+{
+	struct btrfs_fid *fid = (struct btrfs_fid *)fh;
+	int len = *max_len;
+	int type;
+
+	if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+		*max_len = BTRFS_FID_SIZE_CONNECTABLE;
+		return FILEID_INVALID;
+	} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+		*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+		return FILEID_INVALID;
+	}
+
+	len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
+	type = FILEID_BTRFS_WITHOUT_PARENT;
+
+	fid->objectid = btrfs_ino(inode);
+	fid->root_objectid = BTRFS_I(inode)->root->objectid;
+	fid->gen = inode->i_generation;
+
+	if (parent) {
+		u64 parent_root_id;
+
+		fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+		fid->parent_gen = parent->i_generation;
+		parent_root_id = BTRFS_I(parent)->root->objectid;
+
+		if (parent_root_id != fid->root_objectid) {
+			fid->parent_root_objectid = parent_root_id;
+			len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+			type = FILEID_BTRFS_WITH_PARENT_ROOT;
+		} else {
+			len = BTRFS_FID_SIZE_CONNECTABLE;
+			type = FILEID_BTRFS_WITH_PARENT;
+		}
+	}
+
+	*max_len = len;
+	return type;
+}
+
+static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+				       u64 root_objectid, u32 generation,
+				       int check_generation)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root;
+	struct inode *inode;
+	struct btrfs_key key;
+	int index;
+	int err = 0;
+
+	if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+		return ERR_PTR(-ESTALE);
+
+	key.objectid = root_objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto fail;
+	}
+
+	key.objectid = objectid;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(sb, &key, root, NULL);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto fail;
+	}
+
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+	if (check_generation && generation != inode->i_generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return d_obtain_alias(inode);
+fail:
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+	return ERR_PTR(err);
+}
+
+static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+	u64 objectid, root_objectid;
+	u32 generation;
+
+	if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+		if (fh_len !=  BTRFS_FID_SIZE_CONNECTABLE)
+			return NULL;
+		root_objectid = fid->root_objectid;
+	} else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
+		if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+			return NULL;
+		root_objectid = fid->parent_root_objectid;
+	} else
+		return NULL;
+
+	objectid = fid->parent_objectid;
+	generation = fid->parent_gen;
+
+	return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+}
+
+static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+	u64 objectid, root_objectid;
+	u32 generation;
+
+	if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
+	     fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
+	    (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
+	     fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+	    (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
+	     fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
+		return NULL;
+
+	objectid = fid->objectid;
+	root_objectid = fid->root_objectid;
+	generation = fid->gen;
+
+	return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+}
+
+static struct dentry *btrfs_get_parent(struct dentry *child)
+{
+	struct inode *dir = d_inode(child);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_root_ref *ref;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+
+	if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
+		key.objectid = root->root_key.objectid;
+		key.type = BTRFS_ROOT_BACKREF_KEY;
+		key.offset = (u64)-1;
+		root = root->fs_info->tree_root;
+	} else {
+		key.objectid = btrfs_ino(dir);
+		key.type = BTRFS_INODE_REF_KEY;
+		key.offset = (u64)-1;
+	}
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto fail;
+
+	BUG_ON(ret == 0); /* Key with offset of -1 found */
+	if (path->slots[0] == 0) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	path->slots[0]--;
+	leaf = path->nodes[0];
+
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	if (found_key.objectid != key.objectid || found_key.type != key.type) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_root_ref);
+		key.objectid = btrfs_root_ref_dirid(leaf, ref);
+	} else {
+		key.objectid = found_key.offset;
+	}
+	btrfs_free_path(path);
+
+	if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+		return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+					found_key.offset, 0, 0);
+	}
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+fail:
+	btrfs_free_path(path);
+	return ERR_PTR(ret);
+}
+
+static int btrfs_get_name(struct dentry *parent, char *name,
+			  struct dentry *child)
+{
+	struct inode *inode = d_inode(child);
+	struct inode *dir = d_inode(parent);
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_inode_ref *iref;
+	struct btrfs_root_ref *rref;
+	struct extent_buffer *leaf;
+	unsigned long name_ptr;
+	struct btrfs_key key;
+	int name_len;
+	int ret;
+	u64 ino;
+
+	if (!dir || !inode)
+		return -EINVAL;
+
+	if (!S_ISDIR(dir->i_mode))
+		return -EINVAL;
+
+	ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->leave_spinning = 1;
+
+	if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+		key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+		key.type = BTRFS_ROOT_BACKREF_KEY;
+		key.offset = (u64)-1;
+		root = root->fs_info->tree_root;
+	} else {
+		key.objectid = ino;
+		key.offset = btrfs_ino(dir);
+		key.type = BTRFS_INODE_REF_KEY;
+	}
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		btrfs_free_path(path);
+		return ret;
+	} else if (ret > 0) {
+		if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+			path->slots[0]--;
+		} else {
+			btrfs_free_path(path);
+			return -ENOENT;
+		}
+	}
+	leaf = path->nodes[0];
+
+	if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+		rref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_root_ref);
+		name_ptr = (unsigned long)(rref + 1);
+		name_len = btrfs_root_ref_name_len(leaf, rref);
+	} else {
+		iref = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_inode_ref);
+		name_ptr = (unsigned long)(iref + 1);
+		name_len = btrfs_inode_ref_name_len(leaf, iref);
+	}
+
+	read_extent_buffer(leaf, name, name_ptr, name_len);
+	btrfs_free_path(path);
+
+	/*
+	 * have to add the null termination to make sure that reconnect_path
+	 * gets the right len for strlen
+	 */
+	name[name_len] = '\0';
+
+	return 0;
+}
+
+const struct export_operations btrfs_export_ops = {
+	.encode_fh	= btrfs_encode_fh,
+	.fh_to_dentry	= btrfs_fh_to_dentry,
+	.fh_to_parent	= btrfs_fh_to_parent,
+	.get_parent	= btrfs_get_parent,
+	.get_name	= btrfs_get_name,
+};
diff --git a/kernel/fs/btrfs/export.h b/kernel/fs/btrfs/export.h
new file mode 100644
index 000000000..074348a95
--- /dev/null
+++ b/kernel/fs/btrfs/export.h
@@ -0,0 +1,19 @@
+#ifndef BTRFS_EXPORT_H
+#define BTRFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations btrfs_export_ops;
+
+struct btrfs_fid {
+	u64 objectid;
+	u64 root_objectid;
+	u32 gen;
+
+	u64 parent_objectid;
+	u32 parent_gen;
+
+	u64 parent_root_objectid;
+} __attribute__ ((packed));
+
+#endif
diff --git a/kernel/fs/btrfs/extent-tree.c b/kernel/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000..0ec3acd14
--- /dev/null
+++ b/kernel/fs/btrfs/extent-tree.c
@@ -0,0 +1,10174 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/rcupdate.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
+#include "hash.h"
+#include "tree-log.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "locking.h"
+#include "free-space-cache.h"
+#include "math.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#undef SCRAMBLE_DELAYED_REFS
+
+/*
+ * control flags for do_chunk_alloc's force field
+ * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
+ * if we really need one.
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one
+ * if we have very few chunks already allocated.  This is
+ * used as part of the clustering code to help make sure
+ * we have a good pool of storage to cluster in, without
+ * filling the FS with empty chunks
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ */
+enum {
+	CHUNK_ALLOC_NO_FORCE = 0,
+	CHUNK_ALLOC_LIMITED = 1,
+	CHUNK_ALLOC_FORCE = 2,
+};
+
+/*
+ * Control how reservations are dealt with.
+ *
+ * RESERVE_FREE - freeing a reservation.
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
+ *   ENOSPC accounting
+ * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
+ *   bytes_may_use as the ENOSPC accounting is done elsewhere
+ */
+enum {
+	RESERVE_FREE = 0,
+	RESERVE_ALLOC = 1,
+	RESERVE_ALLOC_NO_ACCOUNT = 2,
+};
+
+static int update_block_group(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, u64 bytenr,
+			      u64 num_bytes, int alloc);
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 parent,
+				u64 root_objectid, u64 owner_objectid,
+				u64 owner_offset, int refs_to_drop,
+				struct btrfs_delayed_extent_op *extra_op,
+				int no_quota);
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+				    struct extent_buffer *leaf,
+				    struct btrfs_extent_item *ei);
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      u64 parent, u64 root_objectid,
+				      u64 flags, u64 owner, u64 offset,
+				      struct btrfs_key *ins, int ref_mod);
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 parent, u64 root_objectid,
+				     u64 flags, struct btrfs_disk_key *key,
+				     int level, struct btrfs_key *ins,
+				     int no_quota);
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *extent_root, u64 flags,
+			  int force);
+static int find_next_key(struct btrfs_path *path, int level,
+			 struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+			    int dump_block_groups);
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				       u64 num_bytes, int reserve,
+				       int delalloc);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes);
+int btrfs_pin_extent(struct btrfs_root *root,
+		     u64 bytenr, u64 num_bytes, int reserved);
+
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+	smp_mb();
+	return cache->cached == BTRFS_CACHE_FINISHED ||
+		cache->cached == BTRFS_CACHE_ERROR;
+}
+
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+	return (cache->flags & bits) == bits;
+}
+
+static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+	atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+	if (atomic_dec_and_test(&cache->count)) {
+		WARN_ON(cache->pinned > 0);
+		WARN_ON(cache->reserved > 0);
+		kfree(cache->free_space_ctl);
+		kfree(cache);
+	}
+}
+
+/*
+ * this adds the block group to the fs_info rb tree for the block group
+ * cache
+ */
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+				struct btrfs_block_group_cache *block_group)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct btrfs_block_group_cache *cache;
+
+	spin_lock(&info->block_group_cache_lock);
+	p = &info->block_group_cache_tree.rb_node;
+
+	while (*p) {
+		parent = *p;
+		cache = rb_entry(parent, struct btrfs_block_group_cache,
+				 cache_node);
+		if (block_group->key.objectid < cache->key.objectid) {
+			p = &(*p)->rb_left;
+		} else if (block_group->key.objectid > cache->key.objectid) {
+			p = &(*p)->rb_right;
+		} else {
+			spin_unlock(&info->block_group_cache_lock);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&block_group->cache_node, parent, p);
+	rb_insert_color(&block_group->cache_node,
+			&info->block_group_cache_tree);
+
+	if (info->first_logical_byte > block_group->key.objectid)
+		info->first_logical_byte = block_group->key.objectid;
+
+	spin_unlock(&info->block_group_cache_lock);
+
+	return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group_cache *
+block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
+			      int contains)
+{
+	struct btrfs_block_group_cache *cache, *ret = NULL;
+	struct rb_node *n;
+	u64 end, start;
+
+	spin_lock(&info->block_group_cache_lock);
+	n = info->block_group_cache_tree.rb_node;
+
+	while (n) {
+		cache = rb_entry(n, struct btrfs_block_group_cache,
+				 cache_node);
+		end = cache->key.objectid + cache->key.offset - 1;
+		start = cache->key.objectid;
+
+		if (bytenr < start) {
+			if (!contains && (!ret || start < ret->key.objectid))
+				ret = cache;
+			n = n->rb_left;
+		} else if (bytenr > start) {
+			if (contains && bytenr <= end) {
+				ret = cache;
+				break;
+			}
+			n = n->rb_right;
+		} else {
+			ret = cache;
+			break;
+		}
+	}
+	if (ret) {
+		btrfs_get_block_group(ret);
+		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+			info->first_logical_byte = ret->key.objectid;
+	}
+	spin_unlock(&info->block_group_cache_lock);
+
+	return ret;
+}
+
+static int add_excluded_extent(struct btrfs_root *root,
+			       u64 start, u64 num_bytes)
+{
+	u64 end = start + num_bytes - 1;
+	set_extent_bits(&root->fs_info->freed_extents[0],
+			start, end, EXTENT_UPTODATE, GFP_NOFS);
+	set_extent_bits(&root->fs_info->freed_extents[1],
+			start, end, EXTENT_UPTODATE, GFP_NOFS);
+	return 0;
+}
+
+static void free_excluded_extents(struct btrfs_root *root,
+				  struct btrfs_block_group_cache *cache)
+{
+	u64 start, end;
+
+	start = cache->key.objectid;
+	end = start + cache->key.offset - 1;
+
+	clear_extent_bits(&root->fs_info->freed_extents[0],
+			  start, end, EXTENT_UPTODATE, GFP_NOFS);
+	clear_extent_bits(&root->fs_info->freed_extents[1],
+			  start, end, EXTENT_UPTODATE, GFP_NOFS);
+}
+
+static int exclude_super_stripes(struct btrfs_root *root,
+				 struct btrfs_block_group_cache *cache)
+{
+	u64 bytenr;
+	u64 *logical;
+	int stripe_len;
+	int i, nr, ret;
+
+	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
+		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+		cache->bytes_super += stripe_len;
+		ret = add_excluded_extent(root, cache->key.objectid,
+					  stripe_len);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+				       cache->key.objectid, bytenr,
+				       0, &logical, &nr, &stripe_len);
+		if (ret)
+			return ret;
+
+		while (nr--) {
+			u64 start, len;
+
+			if (logical[nr] > cache->key.objectid +
+			    cache->key.offset)
+				continue;
+
+			if (logical[nr] + stripe_len <= cache->key.objectid)
+				continue;
+
+			start = logical[nr];
+			if (start < cache->key.objectid) {
+				start = cache->key.objectid;
+				len = (logical[nr] + stripe_len) - start;
+			} else {
+				len = min_t(u64, stripe_len,
+					    cache->key.objectid +
+					    cache->key.offset - start);
+			}
+
+			cache->bytes_super += len;
+			ret = add_excluded_extent(root, start, len);
+			if (ret) {
+				kfree(logical);
+				return ret;
+			}
+		}
+
+		kfree(logical);
+	}
+	return 0;
+}
+
+static struct btrfs_caching_control *
+get_caching_control(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_caching_control *ctl;
+
+	spin_lock(&cache->lock);
+	if (!cache->caching_ctl) {
+		spin_unlock(&cache->lock);
+		return NULL;
+	}
+
+	ctl = cache->caching_ctl;
+	atomic_inc(&ctl->count);
+	spin_unlock(&cache->lock);
+	return ctl;
+}
+
+static void put_caching_control(struct btrfs_caching_control *ctl)
+{
+	if (atomic_dec_and_test(&ctl->count))
+		kfree(ctl);
+}
+
+/*
+ * this is only called by cache_block_group, since we could have freed extents
+ * we need to check the pinned_extents for any extents that can't be used yet
+ * since their free space will be released as soon as the transaction commits.
+ */
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_fs_info *info, u64 start, u64 end)
+{
+	u64 extent_start, extent_end, size, total_added = 0;
+	int ret;
+
+	while (start < end) {
+		ret = find_first_extent_bit(info->pinned_extents, start,
+					    &extent_start, &extent_end,
+					    EXTENT_DIRTY | EXTENT_UPTODATE,
+					    NULL);
+		if (ret)
+			break;
+
+		if (extent_start <= start) {
+			start = extent_end + 1;
+		} else if (extent_start > start && extent_start < end) {
+			size = extent_start - start;
+			total_added += size;
+			ret = btrfs_add_free_space(block_group, start,
+						   size);
+			BUG_ON(ret); /* -ENOMEM or logic error */
+			start = extent_end + 1;
+		} else {
+			break;
+		}
+	}
+
+	if (start < end) {
+		size = end - start;
+		total_added += size;
+		ret = btrfs_add_free_space(block_group, start, size);
+		BUG_ON(ret); /* -ENOMEM or logic error */
+	}
+
+	return total_added;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_caching_control *caching_ctl;
+	struct btrfs_root *extent_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u64 total_found = 0;
+	u64 last = 0;
+	u32 nritems;
+	int ret = -ENOMEM;
+
+	caching_ctl = container_of(work, struct btrfs_caching_control, work);
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+	extent_root = fs_info->extent_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto out;
+
+	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
+	/*
+	 * We don't want to deadlock with somebody trying to allocate a new
+	 * extent for the extent root while also trying to search the extent
+	 * root to add free space.  So we skip locking and search the commit
+	 * root, since its read-only
+	 */
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+	path->reada = 1;
+
+	key.objectid = last;
+	key.offset = 0;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+again:
+	mutex_lock(&caching_ctl->mutex);
+	/* need to make sure the commit_root doesn't disappear */
+	down_read(&fs_info->commit_root_sem);
+
+next:
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto err;
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	while (1) {
+		if (btrfs_fs_closing(fs_info) > 1) {
+			last = (u64)-1;
+			break;
+		}
+
+		if (path->slots[0] < nritems) {
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		} else {
+			ret = find_next_key(path, 0, &key);
+			if (ret)
+				break;
+
+			if (need_resched() ||
+			    rwsem_is_contended(&fs_info->commit_root_sem)) {
+				caching_ctl->progress = last;
+				btrfs_release_path(path);
+				up_read(&fs_info->commit_root_sem);
+				mutex_unlock(&caching_ctl->mutex);
+				cond_resched();
+				goto again;
+			}
+
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto err;
+			if (ret)
+				break;
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			continue;
+		}
+
+		if (key.objectid < last) {
+			key.objectid = last;
+			key.offset = 0;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+
+			caching_ctl->progress = last;
+			btrfs_release_path(path);
+			goto next;
+		}
+
+		if (key.objectid < block_group->key.objectid) {
+			path->slots[0]++;
+			continue;
+		}
+
+		if (key.objectid >= block_group->key.objectid +
+		    block_group->key.offset)
+			break;
+
+		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		    key.type == BTRFS_METADATA_ITEM_KEY) {
+			total_found += add_new_free_space(block_group,
+							  fs_info, last,
+							  key.objectid);
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				last = key.objectid +
+					fs_info->tree_root->nodesize;
+			else
+				last = key.objectid + key.offset;
+
+			if (total_found > (1024 * 1024 * 2)) {
+				total_found = 0;
+				wake_up(&caching_ctl->wait);
+			}
+		}
+		path->slots[0]++;
+	}
+	ret = 0;
+
+	total_found += add_new_free_space(block_group, fs_info, last,
+					  block_group->key.objectid +
+					  block_group->key.offset);
+	caching_ctl->progress = (u64)-1;
+
+	spin_lock(&block_group->lock);
+	block_group->caching_ctl = NULL;
+	block_group->cached = BTRFS_CACHE_FINISHED;
+	spin_unlock(&block_group->lock);
+
+err:
+	btrfs_free_path(path);
+	up_read(&fs_info->commit_root_sem);
+
+	free_excluded_extents(extent_root, block_group);
+
+	mutex_unlock(&caching_ctl->mutex);
+out:
+	if (ret) {
+		spin_lock(&block_group->lock);
+		block_group->caching_ctl = NULL;
+		block_group->cached = BTRFS_CACHE_ERROR;
+		spin_unlock(&block_group->lock);
+	}
+	wake_up(&caching_ctl->wait);
+
+	put_caching_control(caching_ctl);
+	btrfs_put_block_group(block_group);
+}
+
+static int cache_block_group(struct btrfs_block_group_cache *cache,
+			     int load_cache_only)
+{
+	DEFINE_WAIT(wait);
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_caching_control *caching_ctl;
+	int ret = 0;
+
+	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+	if (!caching_ctl)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&caching_ctl->list);
+	mutex_init(&caching_ctl->mutex);
+	init_waitqueue_head(&caching_ctl->wait);
+	caching_ctl->block_group = cache;
+	caching_ctl->progress = cache->key.objectid;
+	atomic_set(&caching_ctl->count, 1);
+	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+			caching_thread, NULL, NULL);
+
+	spin_lock(&cache->lock);
+	/*
+	 * This should be a rare occasion, but this could happen I think in the
+	 * case where one thread starts to load the space cache info, and then
+	 * some other thread starts a transaction commit which tries to do an
+	 * allocation while the other thread is still loading the space cache
+	 * info.  The previous loop should have kept us from choosing this block
+	 * group, but if we've moved to the state where we will wait on caching
+	 * block groups we need to first check if we're doing a fast load here,
+	 * so we can wait for it to finish, otherwise we could end up allocating
+	 * from a block group who's cache gets evicted for one reason or
+	 * another.
+	 */
+	while (cache->cached == BTRFS_CACHE_FAST) {
+		struct btrfs_caching_control *ctl;
+
+		ctl = cache->caching_ctl;
+		atomic_inc(&ctl->count);
+		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&cache->lock);
+
+		schedule();
+
+		finish_wait(&ctl->wait, &wait);
+		put_caching_control(ctl);
+		spin_lock(&cache->lock);
+	}
+
+	if (cache->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&cache->lock);
+		kfree(caching_ctl);
+		return 0;
+	}
+	WARN_ON(cache->caching_ctl);
+	cache->caching_ctl = caching_ctl;
+	cache->cached = BTRFS_CACHE_FAST;
+	spin_unlock(&cache->lock);
+
+	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+		mutex_lock(&caching_ctl->mutex);
+		ret = load_free_space_cache(fs_info, cache);
+
+		spin_lock(&cache->lock);
+		if (ret == 1) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_FINISHED;
+			cache->last_byte_to_unpin = (u64)-1;
+			caching_ctl->progress = (u64)-1;
+		} else {
+			if (load_cache_only) {
+				cache->caching_ctl = NULL;
+				cache->cached = BTRFS_CACHE_NO;
+			} else {
+				cache->cached = BTRFS_CACHE_STARTED;
+				cache->has_caching_ctl = 1;
+			}
+		}
+		spin_unlock(&cache->lock);
+		mutex_unlock(&caching_ctl->mutex);
+
+		wake_up(&caching_ctl->wait);
+		if (ret == 1) {
+			put_caching_control(caching_ctl);
+			free_excluded_extents(fs_info->extent_root, cache);
+			return 0;
+		}
+	} else {
+		/*
+		 * We are not going to do the fast caching, set cached to the
+		 * appropriate value and wakeup any waiters.
+		 */
+		spin_lock(&cache->lock);
+		if (load_cache_only) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_NO;
+		} else {
+			cache->cached = BTRFS_CACHE_STARTED;
+			cache->has_caching_ctl = 1;
+		}
+		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
+	}
+
+	if (load_cache_only) {
+		put_caching_control(caching_ctl);
+		return 0;
+	}
+
+	down_write(&fs_info->commit_root_sem);
+	atomic_inc(&caching_ctl->count);
+	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+	up_write(&fs_info->commit_root_sem);
+
+	btrfs_get_block_group(cache);
+
+	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+
+	return ret;
+}
+
+/*
+ * return the block group that starts at or after bytenr
+ */
+static struct btrfs_block_group_cache *
+btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = block_group_cache_tree_search(info, bytenr, 0);
+
+	return cache;
+}
+
+/*
+ * return the block group that contains the given bytenr
+ */
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+						 struct btrfs_fs_info *info,
+						 u64 bytenr)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = block_group_cache_tree_search(info, bytenr, 1);
+
+	return cache;
+}
+
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+						  u64 flags)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & flags) {
+			rcu_read_unlock();
+			return found;
+		}
+	}
+	rcu_read_unlock();
+	return NULL;
+}
+
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list)
+		found->full = 0;
+	rcu_read_unlock();
+}
+
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = start;
+	key.offset = len;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+				0, 0);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper function to lookup reference count and flags of a tree block.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 offset, int metadata, u64 *refs, u64 *flags)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u32 item_size;
+	u64 num_refs;
+	u64 extent_flags;
+	int ret;
+
+	/*
+	 * If we don't have skinny metadata, don't bother doing anything
+	 * different
+	 */
+	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
+		offset = root->nodesize;
+		metadata = 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (!trans) {
+		path->skip_locking = 1;
+		path->search_commit_root = 1;
+	}
+
+search_again:
+	key.objectid = bytenr;
+	key.offset = offset;
+	if (metadata)
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto out_free;
+
+	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == root->nodesize)
+				ret = 0;
+		}
+	}
+
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			ei = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_extent_item);
+			num_refs = btrfs_extent_refs(leaf, ei);
+			extent_flags = btrfs_extent_flags(leaf, ei);
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			struct btrfs_extent_item_v0 *ei0;
+			BUG_ON(item_size != sizeof(*ei0));
+			ei0 = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_item_v0);
+			num_refs = btrfs_extent_refs_v0(leaf, ei0);
+			/* FIXME: this isn't correct for data */
+			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+			BUG();
+#endif
+		}
+		BUG_ON(num_refs == 0);
+	} else {
+		num_refs = 0;
+		extent_flags = 0;
+		ret = 0;
+	}
+
+	if (!trans)
+		goto out;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (head) {
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			btrfs_release_path(path);
+
+			/*
+			 * Mutex was contended, block until it's released and try
+			 * again
+			 */
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			goto search_again;
+		}
+		spin_lock(&head->lock);
+		if (head->extent_op && head->extent_op->update_flags)
+			extent_flags |= head->extent_op->flags_to_set;
+		else
+			BUG_ON(num_refs == 0);
+
+		num_refs += head->node.ref_mod;
+		spin_unlock(&head->lock);
+		mutex_unlock(&head->mutex);
+	}
+	spin_unlock(&delayed_refs->lock);
+out:
+	WARN_ON(num_refs == 0);
+	if (refs)
+		*refs = num_refs;
+	if (flags)
+		*flags = extent_flags;
+out_free:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Back reference rules.  Back refs have three main goals:
+ *
+ * 1) differentiate between all holders of references to an extent so that
+ *    when a reference is dropped we can make sure it was a valid reference
+ *    before freeing the extent.
+ *
+ * 2) Provide enough information to quickly find the holders of an extent
+ *    if we notice a given block is corrupted or bad.
+ *
+ * 3) Make it easy to migrate blocks for FS shrinking or storage pool
+ *    maintenance.  This is actually the same as #2, but with a slightly
+ *    different use case.
+ *
+ * There are two kinds of back refs. The implicit back refs is optimized
+ * for pointers in non-shared tree blocks. For a given pointer in a block,
+ * back refs of this kind provide information about the block's owner tree
+ * and the pointer's key. These information allow us to find the block by
+ * b-tree searching. The full back refs is for pointers in tree blocks not
+ * referenced by their owner trees. The location of tree block is recorded
+ * in the back refs. Actually the full back refs is generic, and can be
+ * used in all cases the implicit back refs is used. The major shortcoming
+ * of the full back refs is its overhead. Every time a tree block gets
+ * COWed, we have to update back refs entry for all pointers in it.
+ *
+ * For a newly allocated tree block, we use implicit back refs for
+ * pointers in it. This means most tree related operations only involve
+ * implicit back refs. For a tree block created in old transaction, the
+ * only way to drop a reference to it is COW it. So we can detect the
+ * event that tree block loses its owner tree's reference and do the
+ * back refs conversion.
+ *
+ * When a tree block is COW'd through a tree, there are four cases:
+ *
+ * The reference count of the block is one and the tree is the block's
+ * owner tree. Nothing to do in this case.
+ *
+ * The reference count of the block is one and the tree is not the
+ * block's owner tree. In this case, full back refs is used for pointers
+ * in the block. Remove these full back refs, add implicit back refs for
+ * every pointers in the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * the block's owner tree. In this case, implicit back refs is used for
+ * pointers in the block. Add full back refs for every pointers in the
+ * block, increase lower level extents' reference counts. The original
+ * implicit back refs are entailed to the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * not the block's owner tree. Add implicit back refs for every pointer in
+ * the new block, increase lower level extents' reference count.
+ *
+ * Back Reference Key composing:
+ *
+ * The key objectid corresponds to the first byte in the extent,
+ * The key type is used to differentiate between types of back refs.
+ * There are different meanings of the key offset for different types
+ * of back refs.
+ *
+ * File extents can be referenced by:
+ *
+ * - multiple snapshots, subvolumes, or different generations in one subvol
+ * - different files inside a single subvolume
+ * - different offsets inside a file (bookend extents in file.c)
+ *
+ * The extent ref structure for the implicit back refs has fields for:
+ *
+ * - Objectid of the subvolume root
+ * - objectid of the file holding the reference
+ * - original offset in the file
+ * - how many bookend extents
+ *
+ * The key offset for the implicit back refs is hash of the first
+ * three fields.
+ *
+ * The extent ref structure for the full back refs has field for:
+ *
+ * - number of pointers in the tree leaf
+ *
+ * The key offset for the implicit back refs is the first byte of
+ * the tree leaf
+ *
+ * When a file extent is allocated, The implicit back refs is used.
+ * the fields are filled in:
+ *
+ *     (root_key.objectid, inode objectid, offset in file, 1)
+ *
+ * When a file extent is removed file truncation, we find the
+ * corresponding implicit back refs and check the following fields:
+ *
+ *     (btrfs_header_owner(leaf), inode objectid, offset in file)
+ *
+ * Btree extents can be referenced by:
+ *
+ * - Different subvolumes
+ *
+ * Both the implicit back refs and the full back refs for tree blocks
+ * only consist of key. The key offset for the implicit back refs is
+ * objectid of block's owner tree. The key offset for the full back refs
+ * is the first byte of parent block.
+ *
+ * When implicit back refs is used, information about the lowest key and
+ * level of the tree block are required. These information are stored in
+ * tree block info structure.
+ */
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_path *path,
+				  u64 owner, u32 extra_size)
+{
+	struct btrfs_extent_item *item;
+	struct btrfs_extent_item_v0 *ei0;
+	struct btrfs_extent_ref_v0 *ref0;
+	struct btrfs_tree_block_info *bi;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u32 new_size = sizeof(*item);
+	u64 refs;
+	int ret;
+
+	leaf = path->nodes[0];
+	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
+
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	ei0 = btrfs_item_ptr(leaf, path->slots[0],
+			     struct btrfs_extent_item_v0);
+	refs = btrfs_extent_refs_v0(leaf, ei0);
+
+	if (owner == (u64)-1) {
+		while (1) {
+			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret < 0)
+					return ret;
+				BUG_ON(ret > 0); /* Corruption */
+				leaf = path->nodes[0];
+			}
+			btrfs_item_key_to_cpu(leaf, &found_key,
+					      path->slots[0]);
+			BUG_ON(key.objectid != found_key.objectid);
+			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
+				path->slots[0]++;
+				continue;
+			}
+			ref0 = btrfs_item_ptr(leaf, path->slots[0],
+					      struct btrfs_extent_ref_v0);
+			owner = btrfs_ref_objectid_v0(leaf, ref0);
+			break;
+		}
+	}
+	btrfs_release_path(path);
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID)
+		new_size += sizeof(*bi);
+
+	new_size -= sizeof(*ei0);
+	ret = btrfs_search_slot(trans, root, &key, path,
+				new_size + extra_size, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(ret); /* Corruption */
+
+	btrfs_extend_item(root, path, new_size);
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	btrfs_set_extent_refs(leaf, item, refs);
+	/* FIXME: get real generation */
+	btrfs_set_extent_generation(leaf, item, 0);
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		btrfs_set_extent_flags(leaf, item,
+				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
+				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
+		bi = (struct btrfs_tree_block_info *)(item + 1);
+		/* FIXME: get first key of the block */
+		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
+		btrfs_set_tree_block_level(leaf, bi, (int)owner);
+	} else {
+		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+#endif
+
+static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
+{
+	u32 high_crc = ~(u32)0;
+	u32 low_crc = ~(u32)0;
+	__le64 lenum;
+
+	lenum = cpu_to_le64(root_objectid);
+	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+	lenum = cpu_to_le64(owner);
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	lenum = cpu_to_le64(offset);
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+
+	return ((u64)high_crc << 31) ^ (u64)low_crc;
+}
+
+static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
+				     struct btrfs_extent_data_ref *ref)
+{
+	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
+				    btrfs_extent_data_ref_objectid(leaf, ref),
+				    btrfs_extent_data_ref_offset(leaf, ref));
+}
+
+static int match_extent_data_ref(struct extent_buffer *leaf,
+				 struct btrfs_extent_data_ref *ref,
+				 u64 root_objectid, u64 owner, u64 offset)
+{
+	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
+	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
+	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
+		return 0;
+	return 1;
+}
+
+static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   u64 bytenr, u64 parent,
+					   u64 root_objectid,
+					   u64 owner, u64 offset)
+{
+	struct btrfs_key key;
+	struct btrfs_extent_data_ref *ref;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int ret;
+	int recow;
+	int err = -ENOENT;
+
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_DATA_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_EXTENT_DATA_REF_KEY;
+		key.offset = hash_extent_data_ref(root_objectid,
+						  owner, offset);
+	}
+again:
+	recow = 0;
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0) {
+		err = ret;
+		goto fail;
+	}
+
+	if (parent) {
+		if (!ret)
+			return 0;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		key.type = BTRFS_EXTENT_REF_V0_KEY;
+		btrfs_release_path(path);
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0) {
+			err = ret;
+			goto fail;
+		}
+		if (!ret)
+			return 0;
+#endif
+		goto fail;
+	}
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+	while (1) {
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				err = ret;
+			if (ret)
+				goto fail;
+
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			recow = 1;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != bytenr ||
+		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
+			goto fail;
+
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_data_ref);
+
+		if (match_extent_data_ref(leaf, ref, root_objectid,
+					  owner, offset)) {
+			if (recow) {
+				btrfs_release_path(path);
+				goto again;
+			}
+			err = 0;
+			break;
+		}
+		path->slots[0]++;
+	}
+fail:
+	return err;
+}
+
+static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   u64 bytenr, u64 parent,
+					   u64 root_objectid, u64 owner,
+					   u64 offset, int refs_to_add)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u32 size;
+	u32 num_refs;
+	int ret;
+
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_DATA_REF_KEY;
+		key.offset = parent;
+		size = sizeof(struct btrfs_shared_data_ref);
+	} else {
+		key.type = BTRFS_EXTENT_DATA_REF_KEY;
+		key.offset = hash_extent_data_ref(root_objectid,
+						  owner, offset);
+		size = sizeof(struct btrfs_extent_data_ref);
+	}
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
+	if (ret && ret != -EEXIST)
+		goto fail;
+
+	leaf = path->nodes[0];
+	if (parent) {
+		struct btrfs_shared_data_ref *ref;
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_shared_data_ref);
+		if (ret == 0) {
+			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+		} else {
+			num_refs = btrfs_shared_data_ref_count(leaf, ref);
+			num_refs += refs_to_add;
+			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
+		}
+	} else {
+		struct btrfs_extent_data_ref *ref;
+		while (ret == -EEXIST) {
+			ref = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_data_ref);
+			if (match_extent_data_ref(leaf, ref, root_objectid,
+						  owner, offset))
+				break;
+			btrfs_release_path(path);
+			key.offset++;
+			ret = btrfs_insert_empty_item(trans, root, path, &key,
+						      size);
+			if (ret && ret != -EEXIST)
+				goto fail;
+
+			leaf = path->nodes[0];
+		}
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_data_ref);
+		if (ret == 0) {
+			btrfs_set_extent_data_ref_root(leaf, ref,
+						       root_objectid);
+			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+		} else {
+			num_refs = btrfs_extent_data_ref_count(leaf, ref);
+			num_refs += refs_to_add;
+			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
+		}
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	ret = 0;
+fail:
+	btrfs_release_path(path);
+	return ret;
+}
+
+static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   int refs_to_drop, int *last_ref)
+{
+	struct btrfs_key key;
+	struct btrfs_extent_data_ref *ref1 = NULL;
+	struct btrfs_shared_data_ref *ref2 = NULL;
+	struct extent_buffer *leaf;
+	u32 num_refs = 0;
+	int ret = 0;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		ref1 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_data_ref);
+		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+		ref2 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_shared_data_ref);
+		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+		struct btrfs_extent_ref_v0 *ref0;
+		ref0 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_ref_v0);
+		num_refs = btrfs_ref_count_v0(leaf, ref0);
+#endif
+	} else {
+		BUG();
+	}
+
+	BUG_ON(num_refs < refs_to_drop);
+	num_refs -= refs_to_drop;
+
+	if (num_refs == 0) {
+		ret = btrfs_del_item(trans, root, path);
+		*last_ref = 1;
+	} else {
+		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
+			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
+		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
+			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		else {
+			struct btrfs_extent_ref_v0 *ref0;
+			ref0 = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_extent_ref_v0);
+			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
+		}
+#endif
+		btrfs_mark_buffer_dirty(leaf);
+	}
+	return ret;
+}
+
+static noinline u32 extent_data_ref_count(struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  struct btrfs_extent_inline_ref *iref)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_data_ref *ref1;
+	struct btrfs_shared_data_ref *ref2;
+	u32 num_refs = 0;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	if (iref) {
+		if (btrfs_extent_inline_ref_type(leaf, iref) ==
+		    BTRFS_EXTENT_DATA_REF_KEY) {
+			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+		} else {
+			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+		}
+	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		ref1 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_data_ref);
+		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+		ref2 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_shared_data_ref);
+		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+		struct btrfs_extent_ref_v0 *ref0;
+		ref0 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_ref_v0);
+		num_refs = btrfs_ref_count_v0(leaf, ref0);
+#endif
+	} else {
+		WARN_ON(1);
+	}
+	return num_refs;
+}
+
+static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 root_objectid)
+{
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_TREE_BLOCK_REF_KEY;
+		key.offset = root_objectid;
+	}
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (ret == -ENOENT && parent) {
+		btrfs_release_path(path);
+		key.type = BTRFS_EXTENT_REF_V0_KEY;
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret > 0)
+			ret = -ENOENT;
+	}
+#endif
+	return ret;
+}
+
+static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 root_objectid)
+{
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_TREE_BLOCK_REF_KEY;
+		key.offset = root_objectid;
+	}
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+	btrfs_release_path(path);
+	return ret;
+}
+
+static inline int extent_ref_type(u64 parent, u64 owner)
+{
+	int type;
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		if (parent > 0)
+			type = BTRFS_SHARED_BLOCK_REF_KEY;
+		else
+			type = BTRFS_TREE_BLOCK_REF_KEY;
+	} else {
+		if (parent > 0)
+			type = BTRFS_SHARED_DATA_REF_KEY;
+		else
+			type = BTRFS_EXTENT_DATA_REF_KEY;
+	}
+	return type;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+			 struct btrfs_key *key)
+
+{
+	for (; level < BTRFS_MAX_LEVEL; level++) {
+		if (!path->nodes[level])
+			break;
+		if (path->slots[level] + 1 >=
+		    btrfs_header_nritems(path->nodes[level]))
+			continue;
+		if (level == 0)
+			btrfs_item_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+		else
+			btrfs_node_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * look for inline back ref. if back ref is found, *ref_ret is set
+ * to the address of inline back ref, and 0 is returned.
+ *
+ * if back ref isn't found, *ref_ret is set to the address where it
+ * should be inserted, and -ENOENT is returned.
+ *
+ * if insert is true and there are too many inline back refs, the path
+ * points to the extent item, and -EAGAIN is returned.
+ *
+ * NOTE: inline back refs are ordered in the same way that back ref
+ *	 items in the tree are ordered.
+ */
+static noinline_for_stack
+int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref **ref_ret,
+				 u64 bytenr, u64 num_bytes,
+				 u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int insert)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
+	u64 flags;
+	u64 item_size;
+	unsigned long ptr;
+	unsigned long end;
+	int extra_size;
+	int type;
+	int want;
+	int ret;
+	int err = 0;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+
+	want = extent_ref_type(parent, owner);
+	if (insert) {
+		extra_size = btrfs_extent_inline_ref_size(want);
+		path->keep_locks = 1;
+	} else
+		extra_size = -1;
+
+	/*
+	 * Owner is our parent level, so we can just add one to get the level
+	 * for the block we are interested in.
+	 */
+	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = owner;
+	}
+
+again:
+	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	/*
+	 * We may be a newly converted file system which still has the old fat
+	 * extent entries for metadata, so try and see if we have one of those.
+	 */
+	if (ret > 0 && skinny_metadata) {
+		skinny_metadata = false;
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == num_bytes)
+				ret = 0;
+		}
+		if (ret) {
+			key.objectid = bytenr;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			key.offset = num_bytes;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
+
+	if (ret && !insert) {
+		err = -ENOENT;
+		goto out;
+	} else if (WARN_ON(ret)) {
+		err = -EIO;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		if (!insert) {
+			err = -ENOENT;
+			goto out;
+		}
+		ret = convert_extent_item_v0(trans, root, path, owner,
+					     extra_size);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
+
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
+	}
+
+	err = -ENOENT;
+	while (1) {
+		if (ptr >= end) {
+			WARN_ON(ptr > end);
+			break;
+		}
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(leaf, iref);
+		if (want < type)
+			break;
+		if (want > type) {
+			ptr += btrfs_extent_inline_ref_size(type);
+			continue;
+		}
+
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			if (match_extent_data_ref(leaf, dref, root_objectid,
+						  owner, offset)) {
+				err = 0;
+				break;
+			}
+			if (hash_extent_data_ref_item(leaf, dref) <
+			    hash_extent_data_ref(root_objectid, owner, offset))
+				break;
+		} else {
+			u64 ref_offset;
+			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+			if (parent > 0) {
+				if (parent == ref_offset) {
+					err = 0;
+					break;
+				}
+				if (ref_offset < parent)
+					break;
+			} else {
+				if (root_objectid == ref_offset) {
+					err = 0;
+					break;
+				}
+				if (ref_offset < root_objectid)
+					break;
+			}
+		}
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+	if (err == -ENOENT && insert) {
+		if (item_size + extra_size >=
+		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+			err = -EAGAIN;
+			goto out;
+		}
+		/*
+		 * To add new inline back ref, we have to make sure
+		 * there is no corresponding back ref item.
+		 * For simplicity, we just do not add new inline back
+		 * ref if there is any kind of item for this block
+		 */
+		if (find_next_key(path, 0, &key) == 0 &&
+		    key.objectid == bytenr &&
+		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			err = -EAGAIN;
+			goto out;
+		}
+	}
+	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
+out:
+	if (insert) {
+		path->keep_locks = 0;
+		btrfs_unlock_up_safe(path, 1);
+	}
+	return err;
+}
+
+/*
+ * helper to add new inline back ref
+ */
+static noinline_for_stack
+void setup_inline_extent_backref(struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref *iref,
+				 u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int refs_to_add,
+				 struct btrfs_delayed_extent_op *extent_op)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	unsigned long ptr;
+	unsigned long end;
+	unsigned long item_offset;
+	u64 refs;
+	int size;
+	int type;
+
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	item_offset = (unsigned long)iref - (unsigned long)ei;
+
+	type = extent_ref_type(parent, owner);
+	size = btrfs_extent_inline_ref_size(type);
+
+	btrfs_extend_item(root, path, size);
+
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, ei);
+	refs += refs_to_add;
+	btrfs_set_extent_refs(leaf, ei, refs);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, ei);
+
+	ptr = (unsigned long)ei + item_offset;
+	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+	if (ptr < end - size)
+		memmove_extent_buffer(leaf, ptr + size, ptr,
+				      end - size - ptr);
+
+	iref = (struct btrfs_extent_inline_ref *)ptr;
+	btrfs_set_extent_inline_ref_type(leaf, iref, type);
+	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+		struct btrfs_extent_data_ref *dref;
+		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
+		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
+		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
+		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
+	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+		struct btrfs_shared_data_ref *sref;
+		sref = (struct btrfs_shared_data_ref *)(iref + 1);
+		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else {
+		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+}
+
+static int lookup_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref **ref_ret,
+				 u64 bytenr, u64 num_bytes, u64 parent,
+				 u64 root_objectid, u64 owner, u64 offset)
+{
+	int ret;
+
+	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
+					   bytenr, num_bytes, parent,
+					   root_objectid, owner, offset, 0);
+	if (ret != -ENOENT)
+		return ret;
+
+	btrfs_release_path(path);
+	*ref_ret = NULL;
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
+					    root_objectid);
+	} else {
+		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
+					     root_objectid, owner, offset);
+	}
+	return ret;
+}
+
+/*
+ * helper to update/remove inline back ref
+ */
+static noinline_for_stack
+void update_inline_extent_backref(struct btrfs_root *root,
+				  struct btrfs_path *path,
+				  struct btrfs_extent_inline_ref *iref,
+				  int refs_to_mod,
+				  struct btrfs_delayed_extent_op *extent_op,
+				  int *last_ref)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_data_ref *dref = NULL;
+	struct btrfs_shared_data_ref *sref = NULL;
+	unsigned long ptr;
+	unsigned long end;
+	u32 item_size;
+	int size;
+	int type;
+	u64 refs;
+
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, ei);
+	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
+	refs += refs_to_mod;
+	btrfs_set_extent_refs(leaf, ei, refs);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, ei);
+
+	type = btrfs_extent_inline_ref_type(leaf, iref);
+
+	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		refs = btrfs_extent_data_ref_count(leaf, dref);
+	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+		sref = (struct btrfs_shared_data_ref *)(iref + 1);
+		refs = btrfs_shared_data_ref_count(leaf, sref);
+	} else {
+		refs = 1;
+		BUG_ON(refs_to_mod != -1);
+	}
+
+	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
+	refs += refs_to_mod;
+
+	if (refs > 0) {
+		if (type == BTRFS_EXTENT_DATA_REF_KEY)
+			btrfs_set_extent_data_ref_count(leaf, dref, refs);
+		else
+			btrfs_set_shared_data_ref_count(leaf, sref, refs);
+	} else {
+		*last_ref = 1;
+		size =  btrfs_extent_inline_ref_size(type);
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = (unsigned long)iref;
+		end = (unsigned long)ei + item_size;
+		if (ptr + size < end)
+			memmove_extent_buffer(leaf, ptr, ptr + size,
+					      end - ptr - size);
+		item_size -= size;
+		btrfs_truncate_item(root, path, item_size, 1);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+}
+
+static noinline_for_stack
+int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 bytenr, u64 num_bytes, u64 parent,
+				 u64 root_objectid, u64 owner,
+				 u64 offset, int refs_to_add,
+				 struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_extent_inline_ref *iref;
+	int ret;
+
+	ret = lookup_inline_extent_backref(trans, root, path, &iref,
+					   bytenr, num_bytes, parent,
+					   root_objectid, owner, offset, 1);
+	if (ret == 0) {
+		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+		update_inline_extent_backref(root, path, iref,
+					     refs_to_add, extent_op, NULL);
+	} else if (ret == -ENOENT) {
+		setup_inline_extent_backref(root, path, iref, parent,
+					    root_objectid, owner, offset,
+					    refs_to_add, extent_op);
+		ret = 0;
+	}
+	return ret;
+}
+
+static int insert_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 bytenr, u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int refs_to_add)
+{
+	int ret;
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		BUG_ON(refs_to_add != 1);
+		ret = insert_tree_block_ref(trans, root, path, bytenr,
+					    parent, root_objectid);
+	} else {
+		ret = insert_extent_data_ref(trans, root, path, bytenr,
+					     parent, root_objectid,
+					     owner, offset, refs_to_add);
+	}
+	return ret;
+}
+
+static int remove_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref *iref,
+				 int refs_to_drop, int is_data, int *last_ref)
+{
+	int ret = 0;
+
+	BUG_ON(!is_data && refs_to_drop != 1);
+	if (iref) {
+		update_inline_extent_backref(root, path, iref,
+					     -refs_to_drop, NULL, last_ref);
+	} else if (is_data) {
+		ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
+					     last_ref);
+	} else {
+		*last_ref = 1;
+		ret = btrfs_del_item(trans, root, path);
+	}
+	return ret;
+}
+
+static int btrfs_issue_discard(struct block_device *bdev,
+				u64 start, u64 len)
+{
+	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+}
+
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+			 u64 num_bytes, u64 *actual_bytes)
+{
+	int ret;
+	u64 discarded_bytes = 0;
+	struct btrfs_bio *bbio = NULL;
+
+
+	/* Tell the block device(s) that the sectors can be discarded */
+	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
+			      bytenr, &num_bytes, &bbio, 0);
+	/* Error condition is -ENOMEM */
+	if (!ret) {
+		struct btrfs_bio_stripe *stripe = bbio->stripes;
+		int i;
+
+
+		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+			if (!stripe->dev->can_discard)
+				continue;
+
+			ret = btrfs_issue_discard(stripe->dev->bdev,
+						  stripe->physical,
+						  stripe->length);
+			if (!ret)
+				discarded_bytes += stripe->length;
+			else if (ret != -EOPNOTSUPP)
+				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
+
+			/*
+			 * Just in case we get back EOPNOTSUPP for some reason,
+			 * just ignore the return value so we don't screw up
+			 * people calling discard_extent.
+			 */
+			ret = 0;
+		}
+		btrfs_put_bbio(bbio);
+	}
+
+	if (actual_bytes)
+		*actual_bytes = discarded_bytes;
+
+
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
+	return ret;
+}
+
+/* Can return -ENOMEM */
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 root_objectid, u64 owner, u64 offset,
+			 int no_quota)
+{
+	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
+	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+					num_bytes,
+					parent, root_objectid, (int)owner,
+					BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+	} else {
+		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+					num_bytes,
+					parent, root_objectid, owner, offset,
+					BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+	}
+	return ret;
+}
+
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 bytenr, u64 num_bytes,
+				  u64 parent, u64 root_objectid,
+				  u64 owner, u64 offset, int refs_to_add,
+				  int no_quota,
+				  struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *item;
+	struct btrfs_key key;
+	u64 refs;
+	int ret;
+	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
+		no_quota = 1;
+
+	path->reada = 1;
+	path->leave_spinning = 1;
+	/* this will setup the path even if it fails to insert the back ref */
+	ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
+					   bytenr, num_bytes, parent,
+					   root_objectid, owner, offset,
+					   refs_to_add, extent_op);
+	if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+		goto out;
+	/*
+	 * Ok we were able to insert an inline extent and it appears to be a new
+	 * reference, deal with the qgroup accounting.
+	 */
+	if (!ret && !no_quota) {
+		ASSERT(root->fs_info->quota_enabled);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		item = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_item);
+		if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
+			type = BTRFS_QGROUP_OPER_ADD_SHARED;
+		btrfs_release_path(path);
+
+		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+					      bytenr, num_bytes, type, 0);
+		goto out;
+	}
+
+	/*
+	 * Ok we had -EAGAIN which means we didn't have space to insert and
+	 * inline extent ref, so just update the reference count and add a
+	 * normal backref.
+	 */
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, item);
+	if (refs)
+		type = BTRFS_QGROUP_OPER_ADD_SHARED;
+	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, item);
+
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	if (!no_quota) {
+		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+					      bytenr, num_bytes, type, 0);
+		if (ret)
+			goto out;
+	}
+
+	path->reada = 1;
+	path->leave_spinning = 1;
+	/* now insert the actual backref */
+	ret = insert_extent_backref(trans, root->fs_info->extent_root,
+				    path, bytenr, parent, root_objectid,
+				    owner, offset, refs_to_add);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_delayed_ref_node *node,
+				struct btrfs_delayed_extent_op *extent_op,
+				int insert_reserved)
+{
+	int ret = 0;
+	struct btrfs_delayed_data_ref *ref;
+	struct btrfs_key ins;
+	u64 parent = 0;
+	u64 ref_root = 0;
+	u64 flags = 0;
+
+	ins.objectid = node->bytenr;
+	ins.offset = node->num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+	ref = btrfs_delayed_node_to_data_ref(node);
+	trace_run_delayed_data_ref(node, ref, node->action);
+
+	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
+		parent = ref->parent;
+	ref_root = ref->root;
+
+	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+		if (extent_op)
+			flags |= extent_op->flags_to_set;
+		ret = alloc_reserved_file_extent(trans, root,
+						 parent, ref_root, flags,
+						 ref->objectid, ref->offset,
+						 &ins, node->ref_mod);
+	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
+		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+					     node->num_bytes, parent,
+					     ref_root, ref->objectid,
+					     ref->offset, node->ref_mod,
+					     node->no_quota, extent_op);
+	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
+		ret = __btrfs_free_extent(trans, root, node->bytenr,
+					  node->num_bytes, parent,
+					  ref_root, ref->objectid,
+					  ref->offset, node->ref_mod,
+					  extent_op, node->no_quota);
+	} else {
+		BUG();
+	}
+	return ret;
+}
+
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+				    struct extent_buffer *leaf,
+				    struct btrfs_extent_item *ei)
+{
+	u64 flags = btrfs_extent_flags(leaf, ei);
+	if (extent_op->update_flags) {
+		flags |= extent_op->flags_to_set;
+		btrfs_set_extent_flags(leaf, ei, flags);
+	}
+
+	if (extent_op->update_key) {
+		struct btrfs_tree_block_info *bi;
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
+	}
+}
+
+static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_delayed_ref_node *node,
+				 struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
+	u32 item_size;
+	int ret;
+	int err = 0;
+	int metadata = !extent_op->is_data;
+
+	if (trans->aborted)
+		return 0;
+
+	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+		metadata = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = node->bytenr;
+
+	if (metadata) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = extent_op->level;
+	} else {
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = node->num_bytes;
+	}
+
+again:
+	path->reada = 1;
+	path->leave_spinning = 1;
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+				path, 0, 1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret > 0) {
+		if (metadata) {
+			if (path->slots[0] > 0) {
+				path->slots[0]--;
+				btrfs_item_key_to_cpu(path->nodes[0], &key,
+						      path->slots[0]);
+				if (key.objectid == node->bytenr &&
+				    key.type == BTRFS_EXTENT_ITEM_KEY &&
+				    key.offset == node->num_bytes)
+					ret = 0;
+			}
+			if (ret > 0) {
+				btrfs_release_path(path);
+				metadata = 0;
+
+				key.objectid = node->bytenr;
+				key.offset = node->num_bytes;
+				key.type = BTRFS_EXTENT_ITEM_KEY;
+				goto again;
+			}
+		} else {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
+					     path, (u64)-1, 0);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	__run_delayed_extent_op(extent_op, leaf, ei);
+
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	return err;
+}
+
+static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_delayed_ref_node *node,
+				struct btrfs_delayed_extent_op *extent_op,
+				int insert_reserved)
+{
+	int ret = 0;
+	struct btrfs_delayed_tree_ref *ref;
+	struct btrfs_key ins;
+	u64 parent = 0;
+	u64 ref_root = 0;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
+
+	ref = btrfs_delayed_node_to_tree_ref(node);
+	trace_run_delayed_tree_ref(node, ref, node->action);
+
+	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+		parent = ref->parent;
+	ref_root = ref->root;
+
+	ins.objectid = node->bytenr;
+	if (skinny_metadata) {
+		ins.offset = ref->level;
+		ins.type = BTRFS_METADATA_ITEM_KEY;
+	} else {
+		ins.offset = node->num_bytes;
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+	}
+
+	BUG_ON(node->ref_mod != 1);
+	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+		BUG_ON(!extent_op || !extent_op->update_flags);
+		ret = alloc_reserved_tree_block(trans, root,
+						parent, ref_root,
+						extent_op->flags_to_set,
+						&extent_op->key,
+						ref->level, &ins,
+						node->no_quota);
+	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
+		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+					     node->num_bytes, parent, ref_root,
+					     ref->level, 0, 1, node->no_quota,
+					     extent_op);
+	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
+		ret = __btrfs_free_extent(trans, root, node->bytenr,
+					  node->num_bytes, parent, ref_root,
+					  ref->level, 0, 1, extent_op,
+					  node->no_quota);
+	} else {
+		BUG();
+	}
+	return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_delayed_ref_node *node,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int insert_reserved)
+{
+	int ret = 0;
+
+	if (trans->aborted) {
+		if (insert_reserved)
+			btrfs_pin_extent(root, node->bytenr,
+					 node->num_bytes, 1);
+		return 0;
+	}
+
+	if (btrfs_delayed_ref_is_head(node)) {
+		struct btrfs_delayed_ref_head *head;
+		/*
+		 * we've hit the end of the chain and we were supposed
+		 * to insert this extent into the tree.  But, it got
+		 * deleted before we ever needed to insert it, so all
+		 * we have to do is clean up the accounting
+		 */
+		BUG_ON(extent_op);
+		head = btrfs_delayed_node_to_head(node);
+		trace_run_delayed_ref_head(node, head, node->action);
+
+		if (insert_reserved) {
+			btrfs_pin_extent(root, node->bytenr,
+					 node->num_bytes, 1);
+			if (head->is_data) {
+				ret = btrfs_del_csums(trans, root,
+						      node->bytenr,
+						      node->num_bytes);
+			}
+		}
+		return ret;
+	}
+
+	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+		ret = run_delayed_tree_ref(trans, root, node, extent_op,
+					   insert_reserved);
+	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		 node->type == BTRFS_SHARED_DATA_REF_KEY)
+		ret = run_delayed_data_ref(trans, root, node, extent_op,
+					   insert_reserved);
+	else
+		BUG();
+	return ret;
+}
+
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_node *ref, *last = NULL;;
+
+	/*
+	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+	 * this prevents ref count from going down to zero when
+	 * there still are pending delayed ref.
+	 */
+	node = rb_first(&head->ref_root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				rb_node);
+		if (ref->action == BTRFS_ADD_DELAYED_REF)
+			return ref;
+		else if (last == NULL)
+			last = ref;
+		node = rb_next(node);
+	}
+	return last;
+}
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     unsigned long nr)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_head *locked_ref = NULL;
+	struct btrfs_delayed_extent_op *extent_op;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	ktime_t start = ktime_get();
+	int ret;
+	unsigned long count = 0;
+	unsigned long actual_count = 0;
+	int must_insert_reserved = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	while (1) {
+		if (!locked_ref) {
+			if (count >= nr)
+				break;
+
+			spin_lock(&delayed_refs->lock);
+			locked_ref = btrfs_select_ref_head(trans);
+			if (!locked_ref) {
+				spin_unlock(&delayed_refs->lock);
+				break;
+			}
+
+			/* grab the lock that says we are going to process
+			 * all the refs for this head */
+			ret = btrfs_delayed_ref_lock(trans, locked_ref);
+			spin_unlock(&delayed_refs->lock);
+			/*
+			 * we may have dropped the spin lock to get the head
+			 * mutex lock, and that might have given someone else
+			 * time to free the head.  If that's true, it has been
+			 * removed from our list and we can move on.
+			 */
+			if (ret == -EAGAIN) {
+				locked_ref = NULL;
+				count++;
+				continue;
+			}
+		}
+
+		/*
+		 * We need to try and merge add/drops of the same ref since we
+		 * can run into issues with relocate dropping the implicit ref
+		 * and then it being added back again before the drop can
+		 * finish.  If we merged anything we need to re-loop so we can
+		 * get a good ref.
+		 */
+		spin_lock(&locked_ref->lock);
+		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+					 locked_ref);
+
+		/*
+		 * locked_ref is the head node, so we have to go one
+		 * node back for any delayed ref updates
+		 */
+		ref = select_delayed_ref(locked_ref);
+
+		if (ref && ref->seq &&
+		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+			spin_unlock(&locked_ref->lock);
+			btrfs_delayed_ref_unlock(locked_ref);
+			spin_lock(&delayed_refs->lock);
+			locked_ref->processing = 0;
+			delayed_refs->num_heads_ready++;
+			spin_unlock(&delayed_refs->lock);
+			locked_ref = NULL;
+			cond_resched();
+			count++;
+			continue;
+		}
+
+		/*
+		 * record the must insert reserved flag before we
+		 * drop the spin lock.
+		 */
+		must_insert_reserved = locked_ref->must_insert_reserved;
+		locked_ref->must_insert_reserved = 0;
+
+		extent_op = locked_ref->extent_op;
+		locked_ref->extent_op = NULL;
+
+		if (!ref) {
+
+
+			/* All delayed refs have been processed, Go ahead
+			 * and send the head node to run_one_delayed_ref,
+			 * so that any accounting fixes can happen
+			 */
+			ref = &locked_ref->node;
+
+			if (extent_op && must_insert_reserved) {
+				btrfs_free_delayed_extent_op(extent_op);
+				extent_op = NULL;
+			}
+
+			if (extent_op) {
+				spin_unlock(&locked_ref->lock);
+				ret = run_delayed_extent_op(trans, root,
+							    ref, extent_op);
+				btrfs_free_delayed_extent_op(extent_op);
+
+				if (ret) {
+					/*
+					 * Need to reset must_insert_reserved if
+					 * there was an error so the abort stuff
+					 * can cleanup the reserved space
+					 * properly.
+					 */
+					if (must_insert_reserved)
+						locked_ref->must_insert_reserved = 1;
+					locked_ref->processing = 0;
+					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
+					btrfs_delayed_ref_unlock(locked_ref);
+					return ret;
+				}
+				continue;
+			}
+
+			/*
+			 * Need to drop our head ref lock and re-aqcuire the
+			 * delayed ref lock and then re-check to make sure
+			 * nobody got added.
+			 */
+			spin_unlock(&locked_ref->lock);
+			spin_lock(&delayed_refs->lock);
+			spin_lock(&locked_ref->lock);
+			if (rb_first(&locked_ref->ref_root) ||
+			    locked_ref->extent_op) {
+				spin_unlock(&locked_ref->lock);
+				spin_unlock(&delayed_refs->lock);
+				continue;
+			}
+			ref->in_tree = 0;
+			delayed_refs->num_heads--;
+			rb_erase(&locked_ref->href_node,
+				 &delayed_refs->href_root);
+			spin_unlock(&delayed_refs->lock);
+		} else {
+			actual_count++;
+			ref->in_tree = 0;
+			rb_erase(&ref->rb_node, &locked_ref->ref_root);
+		}
+		atomic_dec(&delayed_refs->num_entries);
+
+		if (!btrfs_delayed_ref_is_head(ref)) {
+			/*
+			 * when we play the delayed ref, also correct the
+			 * ref_mod on head
+			 */
+			switch (ref->action) {
+			case BTRFS_ADD_DELAYED_REF:
+			case BTRFS_ADD_DELAYED_EXTENT:
+				locked_ref->node.ref_mod -= ref->ref_mod;
+				break;
+			case BTRFS_DROP_DELAYED_REF:
+				locked_ref->node.ref_mod += ref->ref_mod;
+				break;
+			default:
+				WARN_ON(1);
+			}
+		}
+		spin_unlock(&locked_ref->lock);
+
+		ret = run_one_delayed_ref(trans, root, ref, extent_op,
+					  must_insert_reserved);
+
+		btrfs_free_delayed_extent_op(extent_op);
+		if (ret) {
+			locked_ref->processing = 0;
+			btrfs_delayed_ref_unlock(locked_ref);
+			btrfs_put_delayed_ref(ref);
+			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
+			return ret;
+		}
+
+		/*
+		 * If this node is a head, that means all the refs in this head
+		 * have been dealt with, and we will pick the next head to deal
+		 * with, so we must unlock the head and drop it from the cluster
+		 * list before we release it.
+		 */
+		if (btrfs_delayed_ref_is_head(ref)) {
+			if (locked_ref->is_data &&
+			    locked_ref->total_ref_mod < 0) {
+				spin_lock(&delayed_refs->lock);
+				delayed_refs->pending_csums -= ref->num_bytes;
+				spin_unlock(&delayed_refs->lock);
+			}
+			btrfs_delayed_ref_unlock(locked_ref);
+			locked_ref = NULL;
+		}
+		btrfs_put_delayed_ref(ref);
+		count++;
+		cond_resched();
+	}
+
+	/*
+	 * We don't want to include ref heads since we can have empty ref heads
+	 * and those will drastically skew our runtime down since we just do
+	 * accounting, no actual extent tree updates.
+	 */
+	if (actual_count > 0) {
+		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+		u64 avg;
+
+		/*
+		 * We weigh the current average higher than our current runtime
+		 * to avoid large swings in the average.
+		 */
+		spin_lock(&delayed_refs->lock);
+		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
+		spin_unlock(&delayed_refs->lock);
+	}
+	return 0;
+}
+
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_delayed_ref_node *entry;
+	int alt = 1;
+	u64 middle;
+	u64 first = 0, last = 0;
+
+	n = rb_first(root);
+	if (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		first = entry->bytenr;
+	}
+	n = rb_last(root);
+	if (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		last = entry->bytenr;
+	}
+	n = root->rb_node;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		middle = entry->bytenr;
+
+		if (alt)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+
+		alt = 1 - alt;
+	}
+	return middle;
+}
+#endif
+
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+	u64 num_bytes;
+
+	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+			     sizeof(struct btrfs_extent_inline_ref));
+	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+	/*
+	 * We don't ever fill up leaves all the way so multiply by 2 just to be
+	 * closer to what we're really going to want to ouse.
+	 */
+	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+/*
+ * Takes the number of bytes to be csumm'ed and figures out how many leaves it
+ * would require to store the csums for that many bytes.
+ */
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+{
+	u64 csum_size;
+	u64 num_csums_per_leaf;
+	u64 num_csums;
+
+	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+	num_csums_per_leaf = div64_u64(csum_size,
+			(u64)btrfs_super_csum_size(root->fs_info->super_copy));
+	num_csums = div64_u64(csum_bytes, root->sectorsize);
+	num_csums += num_csums_per_leaf - 1;
+	num_csums = div64_u64(num_csums, num_csums_per_leaf);
+	return num_csums;
+}
+
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_block_rsv *global_rsv;
+	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
+	u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+	u64 num_bytes, num_dirty_bgs_bytes;
+	int ret = 0;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	num_heads = heads_to_leaves(root, num_heads);
+	if (num_heads > 1)
+		num_bytes += (num_heads - 1) * root->nodesize;
+	num_bytes <<= 1;
+	num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+							     num_dirty_bgs);
+	global_rsv = &root->fs_info->global_block_rsv;
+
+	/*
+	 * If we can't allocate any more chunks lets make sure we have _lots_ of
+	 * wiggle room since running delayed refs can create more delayed refs.
+	 */
+	if (global_rsv->space_info->full) {
+		num_dirty_bgs_bytes <<= 1;
+		num_bytes <<= 1;
+	}
+
+	spin_lock(&global_rsv->lock);
+	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
+		ret = 1;
+	spin_unlock(&global_rsv->lock);
+	return ret;
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 num_entries =
+		atomic_read(&trans->transaction->delayed_refs.num_entries);
+	u64 avg_runtime;
+	u64 val;
+
+	smp_mb();
+	avg_runtime = fs_info->avg_delayed_ref_runtime;
+	val = num_entries * avg_runtime;
+	if (num_entries * avg_runtime >= NSEC_PER_SEC)
+		return 1;
+	if (val >= NSEC_PER_SEC / 2)
+		return 2;
+
+	return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
+struct async_delayed_refs {
+	struct btrfs_root *root;
+	int count;
+	int error;
+	int sync;
+	struct completion wait;
+	struct btrfs_work work;
+};
+
+static void delayed_ref_async_start(struct btrfs_work *work)
+{
+	struct async_delayed_refs *async;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	async = container_of(work, struct async_delayed_refs, work);
+
+	trans = btrfs_join_transaction(async->root);
+	if (IS_ERR(trans)) {
+		async->error = PTR_ERR(trans);
+		goto done;
+	}
+
+	/*
+	 * trans->sync means that when we call end_transaciton, we won't
+	 * wait on delayed refs
+	 */
+	trans->sync = true;
+	ret = btrfs_run_delayed_refs(trans, async->root, async->count);
+	if (ret)
+		async->error = ret;
+
+	ret = btrfs_end_transaction(trans, async->root);
+	if (ret && !async->error)
+		async->error = ret;
+done:
+	if (async->sync)
+		complete(&async->wait);
+	else
+		kfree(async);
+}
+
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+				 unsigned long count, int wait)
+{
+	struct async_delayed_refs *async;
+	int ret;
+
+	async = kmalloc(sizeof(*async), GFP_NOFS);
+	if (!async)
+		return -ENOMEM;
+
+	async->root = root->fs_info->tree_root;
+	async->count = count;
+	async->error = 0;
+	if (wait)
+		async->sync = 1;
+	else
+		async->sync = 0;
+	init_completion(&async->wait);
+
+	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
+			delayed_ref_async_start, NULL, NULL);
+
+	btrfs_queue_work(root->fs_info->extent_workers, &async->work);
+
+	if (wait) {
+		wait_for_completion(&async->wait);
+		ret = async->error;
+		kfree(async);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_head *head;
+	int ret;
+	int run_all = count == (unsigned long)-1;
+
+	/* We'll clean this up in btrfs_cleanup_transaction */
+	if (trans->aborted)
+		return 0;
+
+	if (root == root->fs_info->extent_root)
+		root = root->fs_info->tree_root;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	if (count == 0)
+		count = atomic_read(&delayed_refs->num_entries) * 2;
+
+again:
+#ifdef SCRAMBLE_DELAYED_REFS
+	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+	ret = __btrfs_run_delayed_refs(trans, root, count);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+
+	if (run_all) {
+		if (!list_empty(&trans->new_bgs))
+			btrfs_create_pending_block_groups(trans, root);
+
+		spin_lock(&delayed_refs->lock);
+		node = rb_first(&delayed_refs->href_root);
+		if (!node) {
+			spin_unlock(&delayed_refs->lock);
+			goto out;
+		}
+		count = (unsigned long)-1;
+
+		while (node) {
+			head = rb_entry(node, struct btrfs_delayed_ref_head,
+					href_node);
+			if (btrfs_delayed_ref_is_head(&head->node)) {
+				struct btrfs_delayed_ref_node *ref;
+
+				ref = &head->node;
+				atomic_inc(&ref->refs);
+
+				spin_unlock(&delayed_refs->lock);
+				/*
+				 * Mutex was contended, block until it's
+				 * released and try again
+				 */
+				mutex_lock(&head->mutex);
+				mutex_unlock(&head->mutex);
+
+				btrfs_put_delayed_ref(ref);
+				cond_resched();
+				goto again;
+			} else {
+				WARN_ON(1);
+			}
+			node = rb_next(node);
+		}
+		spin_unlock(&delayed_refs->lock);
+		cond_resched();
+		goto again;
+	}
+out:
+	ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
+	if (ret)
+		return ret;
+	assert_qgroups_uptodate(trans);
+	return 0;
+}
+
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 flags,
+				int level, int is_data)
+{
+	struct btrfs_delayed_extent_op *extent_op;
+	int ret;
+
+	extent_op = btrfs_alloc_delayed_extent_op();
+	if (!extent_op)
+		return -ENOMEM;
+
+	extent_op->flags_to_set = flags;
+	extent_op->update_flags = 1;
+	extent_op->update_key = 0;
+	extent_op->is_data = is_data ? 1 : 0;
+	extent_op->level = level;
+
+	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+					  num_bytes, extent_op);
+	if (ret)
+		btrfs_free_delayed_extent_op(extent_op);
+	return ret;
+}
+
+static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      u64 objectid, u64 offset, u64 bytenr)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_data_ref *data_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct rb_node *node;
+	int ret = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (!head) {
+		spin_unlock(&delayed_refs->lock);
+		return 0;
+	}
+
+	if (!mutex_trylock(&head->mutex)) {
+		atomic_inc(&head->node.refs);
+		spin_unlock(&delayed_refs->lock);
+
+		btrfs_release_path(path);
+
+		/*
+		 * Mutex was contended, block until it's released and let
+		 * caller try again
+		 */
+		mutex_lock(&head->mutex);
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref(&head->node);
+		return -EAGAIN;
+	}
+	spin_unlock(&delayed_refs->lock);
+
+	spin_lock(&head->lock);
+	node = rb_first(&head->ref_root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
+
+		/* If it's a shared ref we know a cross reference exists */
+		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
+			ret = 1;
+			break;
+		}
+
+		data_ref = btrfs_delayed_node_to_data_ref(ref);
+
+		/*
+		 * If our ref doesn't match the one we're currently looking at
+		 * then we have a cross reference.
+		 */
+		if (data_ref->root != root->root_key.objectid ||
+		    data_ref->objectid != objectid ||
+		    data_ref->offset != offset) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&head->lock);
+	mutex_unlock(&head->mutex);
+	return ret;
+}
+
+static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					u64 objectid, u64 offset, u64 bytenr)
+{
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_data_ref *ref;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_extent_item *ei;
+	struct btrfs_key key;
+	u32 item_size;
+	int ret;
+
+	key.objectid = bytenr;
+	key.offset = (u64)-1;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0); /* Corruption */
+
+	ret = -ENOENT;
+	if (path->slots[0] == 0)
+		goto out;
+
+	path->slots[0]--;
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
+		goto out;
+
+	ret = 1;
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		goto out;
+	}
+#endif
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+
+	if (item_size != sizeof(*ei) +
+	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+		goto out;
+
+	if (btrfs_extent_generation(leaf, ei) <=
+	    btrfs_root_last_snapshot(&root->root_item))
+		goto out;
+
+	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+	if (btrfs_extent_inline_ref_type(leaf, iref) !=
+	    BTRFS_EXTENT_DATA_REF_KEY)
+		goto out;
+
+	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+	if (btrfs_extent_refs(leaf, ei) !=
+	    btrfs_extent_data_ref_count(leaf, ref) ||
+	    btrfs_extent_data_ref_root(leaf, ref) !=
+	    root->root_key.objectid ||
+	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
+	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
+		goto out;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 objectid, u64 offset, u64 bytenr)
+{
+	struct btrfs_path *path;
+	int ret;
+	int ret2;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOENT;
+
+	do {
+		ret = check_committed_ref(trans, root, path, objectid,
+					  offset, bytenr);
+		if (ret && ret != -ENOENT)
+			goto out;
+
+		ret2 = check_delayed_ref(trans, root, path, objectid,
+					 offset, bytenr);
+	} while (ret2 == -EAGAIN);
+
+	if (ret2 && ret2 != -ENOENT) {
+		ret = ret2;
+		goto out;
+	}
+
+	if (ret != -ENOENT || ret2 != -ENOENT)
+		ret = 0;
+out:
+	btrfs_free_path(path);
+	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+		WARN_ON(ret > 0);
+	return ret;
+}
+
+static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *buf,
+			   int full_backref, int inc)
+{
+	u64 bytenr;
+	u64 num_bytes;
+	u64 parent;
+	u64 ref_root;
+	u32 nritems;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int i;
+	int level;
+	int ret = 0;
+	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+			    u64, u64, u64, u64, u64, u64, int);
+
+
+	if (btrfs_test_is_dummy_root(root))
+		return 0;
+
+	ref_root = btrfs_header_owner(buf);
+	nritems = btrfs_header_nritems(buf);
+	level = btrfs_header_level(buf);
+
+	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
+		return 0;
+
+	if (inc)
+		process_func = btrfs_inc_extent_ref;
+	else
+		process_func = btrfs_free_extent;
+
+	if (full_backref)
+		parent = buf->start;
+	else
+		parent = 0;
+
+	for (i = 0; i < nritems; i++) {
+		if (level == 0) {
+			btrfs_item_key_to_cpu(buf, &key, i);
+			if (key.type != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf, i,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(buf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
+			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (bytenr == 0)
+				continue;
+
+			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+			key.offset -= btrfs_file_extent_offset(buf, fi);
+			ret = process_func(trans, root, bytenr, num_bytes,
+					   parent, ref_root, key.objectid,
+					   key.offset, 1);
+			if (ret)
+				goto fail;
+		} else {
+			bytenr = btrfs_node_blockptr(buf, i);
+			num_bytes = root->nodesize;
+			ret = process_func(trans, root, bytenr, num_bytes,
+					   parent, ref_root, level - 1, 0,
+					   1);
+			if (ret)
+				goto fail;
+		}
+	}
+	return 0;
+fail:
+	return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref)
+{
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+}
+
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref)
+{
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+}
+
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_block_group_cache *cache)
+{
+	int ret;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	unsigned long bi;
+	struct extent_buffer *leaf;
+
+	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		goto fail;
+	}
+
+	leaf = path->nodes[0];
+	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
+	btrfs_mark_buffer_dirty(leaf);
+fail:
+	btrfs_release_path(path);
+	return ret;
+
+}
+
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+		 struct btrfs_block_group_cache *cache)
+{
+	struct rb_node *node;
+
+	spin_lock(&root->fs_info->block_group_cache_lock);
+
+	/* If our block group was removed, we need a full search. */
+	if (RB_EMPTY_NODE(&cache->cache_node)) {
+		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+		spin_unlock(&root->fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		cache = btrfs_lookup_first_block_group(root->fs_info,
+						       next_bytenr);
+		return cache;
+	}
+	node = rb_next(&cache->cache_node);
+	btrfs_put_block_group(cache);
+	if (node) {
+		cache = rb_entry(node, struct btrfs_block_group_cache,
+				 cache_node);
+		btrfs_get_block_group(cache);
+	} else
+		cache = NULL;
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+	return cache;
+}
+
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_path *path)
+{
+	struct btrfs_root *root = block_group->fs_info->tree_root;
+	struct inode *inode = NULL;
+	u64 alloc_hint = 0;
+	int dcs = BTRFS_DC_ERROR;
+	u64 num_pages = 0;
+	int retries = 0;
+	int ret = 0;
+
+	/*
+	 * If this block group is smaller than 100 megs don't bother caching the
+	 * block group.
+	 */
+	if (block_group->key.offset < (100 * 1024 * 1024)) {
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+
+	if (trans->aborted)
+		return 0;
+again:
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+		ret = PTR_ERR(inode);
+		btrfs_release_path(path);
+		goto out;
+	}
+
+	if (IS_ERR(inode)) {
+		BUG_ON(retries);
+		retries++;
+
+		if (block_group->ro)
+			goto out_free;
+
+		ret = create_free_space_inode(root, trans, block_group, path);
+		if (ret)
+			goto out_free;
+		goto again;
+	}
+
+	/* We've already setup this transaction, go ahead and exit */
+	if (block_group->cache_generation == trans->transid &&
+	    i_size_read(inode)) {
+		dcs = BTRFS_DC_SETUP;
+		goto out_put;
+	}
+
+	/*
+	 * We want to set the generation to 0, that way if anything goes wrong
+	 * from here on out we know not to trust this cache when we load up next
+	 * time.
+	 */
+	BTRFS_I(inode)->generation = 0;
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret) {
+		/*
+		 * So theoretically we could recover from this, simply set the
+		 * super cache generation to 0 so we know to invalidate the
+		 * cache, but then we'd have to keep track of the block groups
+		 * that fail this way so we know we _have_ to reset this cache
+		 * before the next commit or risk reading stale cache.  So to
+		 * limit our exposure to horrible edge cases lets just abort the
+		 * transaction, this only happens in really bad situations
+		 * anyway.
+		 */
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_put;
+	}
+	WARN_ON(ret);
+
+	if (i_size_read(inode) > 0) {
+		ret = btrfs_check_trunc_cache_free_space(root,
+					&root->fs_info->global_block_rsv);
+		if (ret)
+			goto out_put;
+
+		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
+		if (ret)
+			goto out_put;
+	}
+
+	spin_lock(&block_group->lock);
+	if (block_group->cached != BTRFS_CACHE_FINISHED ||
+	    !btrfs_test_opt(root, SPACE_CACHE)) {
+		/*
+		 * don't bother trying to write stuff out _if_
+		 * a) we're not cached,
+		 * b) we're with nospace_cache mount option.
+		 */
+		dcs = BTRFS_DC_WRITTEN;
+		spin_unlock(&block_group->lock);
+		goto out_put;
+	}
+	spin_unlock(&block_group->lock);
+
+	/*
+	 * Try to preallocate enough space based on how big the block group is.
+	 * Keep in mind this has to include any pinned space which could end up
+	 * taking up quite a bit since it's not folded into the other space
+	 * cache.
+	 */
+	num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
+	if (!num_pages)
+		num_pages = 1;
+
+	num_pages *= 16;
+	num_pages *= PAGE_CACHE_SIZE;
+
+	ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+	if (ret)
+		goto out_put;
+
+	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+					      num_pages, num_pages,
+					      &alloc_hint);
+	if (!ret)
+		dcs = BTRFS_DC_SETUP;
+	btrfs_free_reserved_data_space(inode, num_pages);
+
+out_put:
+	iput(inode);
+out_free:
+	btrfs_release_path(path);
+out:
+	spin_lock(&block_group->lock);
+	if (!ret && dcs == BTRFS_DC_SETUP)
+		block_group->cache_generation = trans->transid;
+	block_group->disk_cache_state = dcs;
+	spin_unlock(&block_group->lock);
+
+	return ret;
+}
+
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *cache, *tmp;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct btrfs_path *path;
+
+	if (list_empty(&cur_trans->dirty_bgs) ||
+	    !btrfs_test_opt(root, SPACE_CACHE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* Could add new block groups, use _safe just in case */
+	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+				 dirty_list) {
+		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+			cache_save_setup(cache, trans, path);
+	}
+
+	btrfs_free_path(path);
+	return 0;
+}
+
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS.  This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO.  There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int ret = 0;
+	int should_put;
+	struct btrfs_path *path = NULL;
+	LIST_HEAD(dirty);
+	struct list_head *io = &cur_trans->io_bgs;
+	int num_started = 0;
+	int loops = 0;
+
+	spin_lock(&cur_trans->dirty_bgs_lock);
+	if (list_empty(&cur_trans->dirty_bgs)) {
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+		return 0;
+	}
+	list_splice_init(&cur_trans->dirty_bgs, &dirty);
+	spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+	/*
+	 * make sure all the block groups on our dirty list actually
+	 * exist
+	 */
+	btrfs_create_pending_block_groups(trans, root);
+
+	if (!path) {
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+	}
+
+	/*
+	 * cache_write_mutex is here only to save us from balance or automatic
+	 * removal of empty block groups deleting this block group while we are
+	 * writing out the cache
+	 */
+	mutex_lock(&trans->transaction->cache_write_mutex);
+	while (!list_empty(&dirty)) {
+		cache = list_first_entry(&dirty,
+					 struct btrfs_block_group_cache,
+					 dirty_list);
+		/*
+		 * this can happen if something re-dirties a block
+		 * group that is already under IO.  Just wait for it to
+		 * finish and then do it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(root, trans, cache,
+					    &cache->io_ctl, path,
+					    cache->key.objectid);
+			btrfs_put_block_group(cache);
+		}
+
+
+		/*
+		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
+		 * if it should update the cache_state.  Don't delete
+		 * until after we wait.
+		 *
+		 * Since we're not running in the commit critical section
+		 * we need the dirty_bgs_lock to protect from update_block_group
+		 */
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+
+		should_put = 1;
+
+		cache_save_setup(cache, trans, path);
+
+		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(root, trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+
+				/*
+				 * the cache_write_mutex is protecting
+				 * the io_list
+				 */
+				list_add_tail(&cache->io_list, io);
+			} else {
+				/*
+				 * if we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
+		if (!ret) {
+			ret = write_one_cache_group(trans, root, path, cache);
+			/*
+			 * Our block group might still be attached to the list
+			 * of new block groups in the transaction handle of some
+			 * other task (struct btrfs_trans_handle->new_bgs). This
+			 * means its block group item isn't yet in the extent
+			 * tree. If this happens ignore the error, as we will
+			 * try again later in the critical section of the
+			 * transaction commit.
+			 */
+			if (ret == -ENOENT) {
+				ret = 0;
+				spin_lock(&cur_trans->dirty_bgs_lock);
+				if (list_empty(&cache->dirty_list)) {
+					list_add_tail(&cache->dirty_list,
+						      &cur_trans->dirty_bgs);
+					btrfs_get_block_group(cache);
+				}
+				spin_unlock(&cur_trans->dirty_bgs_lock);
+			} else if (ret) {
+				btrfs_abort_transaction(trans, root, ret);
+			}
+		}
+
+		/* if its not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+
+		if (ret)
+			break;
+
+		/*
+		 * Avoid blocking other tasks for too long. It might even save
+		 * us from writing caches for block groups that are going to be
+		 * removed.
+		 */
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+		mutex_lock(&trans->transaction->cache_write_mutex);
+	}
+	mutex_unlock(&trans->transaction->cache_write_mutex);
+
+	/*
+	 * go through delayed refs for all the stuff we've just kicked off
+	 * and then loop back (just once)
+	 */
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	if (!ret && loops == 0) {
+		loops++;
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_splice_init(&cur_trans->dirty_bgs, &dirty);
+		/*
+		 * dirty_bgs_lock protects us from concurrent block group
+		 * deletes too (not just cache_write_mutex).
+		 */
+		if (!list_empty(&dirty)) {
+			spin_unlock(&cur_trans->dirty_bgs_lock);
+			goto again;
+		}
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int ret = 0;
+	int should_put;
+	struct btrfs_path *path;
+	struct list_head *io = &cur_trans->io_bgs;
+	int num_started = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * We don't need the lock here since we are protected by the transaction
+	 * commit.  We want to do the cache_save_setup first and then run the
+	 * delayed refs to make sure we have the best chance at doing this all
+	 * in one shot.
+	 */
+	while (!list_empty(&cur_trans->dirty_bgs)) {
+		cache = list_first_entry(&cur_trans->dirty_bgs,
+					 struct btrfs_block_group_cache,
+					 dirty_list);
+
+		/*
+		 * this can happen if cache_save_setup re-dirties a block
+		 * group that is already under IO.  Just wait for it to
+		 * finish and then do it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(root, trans, cache,
+					    &cache->io_ctl, path,
+					    cache->key.objectid);
+			btrfs_put_block_group(cache);
+		}
+
+		/*
+		 * don't remove from the dirty list until after we've waited
+		 * on any pending IO
+		 */
+		list_del_init(&cache->dirty_list);
+		should_put = 1;
+
+		cache_save_setup(cache, trans, path);
+
+		if (!ret)
+			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(root, trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+				list_add_tail(&cache->io_list, io);
+			} else {
+				/*
+				 * if we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
+		if (!ret) {
+			ret = write_one_cache_group(trans, root, path, cache);
+			if (ret)
+				btrfs_abort_transaction(trans, root, ret);
+		}
+
+		/* if its not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+	}
+
+	while (!list_empty(io)) {
+		cache = list_first_entry(io, struct btrfs_block_group_cache,
+					 io_list);
+		list_del_init(&cache->io_list);
+		btrfs_wait_cache_io(root, trans, cache,
+				    &cache->io_ctl, path, cache->key.objectid);
+		btrfs_put_block_group(cache);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+{
+	struct btrfs_block_group_cache *block_group;
+	int readonly = 0;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+	if (!block_group || block_group->ro)
+		readonly = 1;
+	if (block_group)
+		btrfs_put_block_group(block_group);
+	return readonly;
+}
+
+static const char *alloc_name(u64 flags)
+{
+	switch (flags) {
+	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
+		return "mixed";
+	case BTRFS_BLOCK_GROUP_METADATA:
+		return "metadata";
+	case BTRFS_BLOCK_GROUP_DATA:
+		return "data";
+	case BTRFS_BLOCK_GROUP_SYSTEM:
+		return "system";
+	default:
+		WARN_ON(1);
+		return "invalid-combination";
+	};
+}
+
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+			     u64 total_bytes, u64 bytes_used,
+			     struct btrfs_space_info **space_info)
+{
+	struct btrfs_space_info *found;
+	int i;
+	int factor;
+	int ret;
+
+	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+		     BTRFS_BLOCK_GROUP_RAID10))
+		factor = 2;
+	else
+		factor = 1;
+
+	found = __find_space_info(info, flags);
+	if (found) {
+		spin_lock(&found->lock);
+		found->total_bytes += total_bytes;
+		found->disk_total += total_bytes * factor;
+		found->bytes_used += bytes_used;
+		found->disk_used += bytes_used * factor;
+		found->full = 0;
+		spin_unlock(&found->lock);
+		*space_info = found;
+		return 0;
+	}
+	found = kzalloc(sizeof(*found), GFP_NOFS);
+	if (!found)
+		return -ENOMEM;
+
+	ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
+	if (ret) {
+		kfree(found);
+		return ret;
+	}
+
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		INIT_LIST_HEAD(&found->block_groups[i]);
+	init_rwsem(&found->groups_sem);
+	spin_lock_init(&found->lock);
+	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+	found->total_bytes = total_bytes;
+	found->disk_total = total_bytes * factor;
+	found->bytes_used = bytes_used;
+	found->disk_used = bytes_used * factor;
+	found->bytes_pinned = 0;
+	found->bytes_reserved = 0;
+	found->bytes_readonly = 0;
+	found->bytes_may_use = 0;
+	found->full = 0;
+	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
+	found->chunk_alloc = 0;
+	found->flush = 0;
+	init_waitqueue_head(&found->wait);
+	INIT_LIST_HEAD(&found->ro_bgs);
+
+	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+				    info->space_info_kobj, "%s",
+				    alloc_name(found->flags));
+	if (ret) {
+		kfree(found);
+		return ret;
+	}
+
+	*space_info = found;
+	list_add_rcu(&found->list, &info->space_info);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		info->data_sinfo = found;
+
+	return ret;
+}
+
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = chunk_to_extended(flags) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	write_seqlock(&fs_info->profiles_lock);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits |= extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits |= extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits |= extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * returns target flags in extended format or 0 if restripe for this
+ * chunk_type is not in progress
+ *
+ * should be called with either volume_mutex or balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+	u64 target = 0;
+
+	if (!bctl)
+		return 0;
+
+	if (flags & BTRFS_BLOCK_GROUP_DATA &&
+	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+	}
+
+	return target;
+}
+
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format.  If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
+static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+{
+	u64 num_devices = root->fs_info->fs_devices->rw_devices;
+	u64 target;
+	u64 tmp;
+
+	/*
+	 * see if restripe for this chunk_type is in progress, if so
+	 * try to reduce to the target profile
+	 */
+	spin_lock(&root->fs_info->balance_lock);
+	target = get_restripe_target(root->fs_info, flags);
+	if (target) {
+		/* pick target profile only if it's already available */
+		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+			spin_unlock(&root->fs_info->balance_lock);
+			return extended_to_chunk(target);
+		}
+	}
+	spin_unlock(&root->fs_info->balance_lock);
+
+	/* First, mask out the RAID levels which aren't possible */
+	if (num_devices == 1)
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+			   BTRFS_BLOCK_GROUP_RAID5);
+	if (num_devices < 3)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
+	if (num_devices < 4)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
+
+	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+	flags &= ~tmp;
+
+	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+		tmp = BTRFS_BLOCK_GROUP_RAID6;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+		tmp = BTRFS_BLOCK_GROUP_RAID5;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+		tmp = BTRFS_BLOCK_GROUP_RAID10;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+		tmp = BTRFS_BLOCK_GROUP_RAID1;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+		tmp = BTRFS_BLOCK_GROUP_RAID0;
+
+	return extended_to_chunk(flags | tmp);
+}
+
+static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
+{
+	unsigned seq;
+	u64 flags;
+
+	do {
+		flags = orig_flags;
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		if (flags & BTRFS_BLOCK_GROUP_DATA)
+			flags |= root->fs_info->avail_data_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+			flags |= root->fs_info->avail_system_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+			flags |= root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
+
+	return btrfs_reduce_alloc_profile(root, flags);
+}
+
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+{
+	u64 flags;
+	u64 ret;
+
+	if (data)
+		flags = BTRFS_BLOCK_GROUP_DATA;
+	else if (root == root->fs_info->chunk_root)
+		flags = BTRFS_BLOCK_GROUP_SYSTEM;
+	else
+		flags = BTRFS_BLOCK_GROUP_METADATA;
+
+	ret = get_alloc_profile(root, flags);
+	return ret;
+}
+
+/*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 used;
+	int ret = 0;
+	int need_commit = 2;
+	int have_pinned_space;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = ALIGN(bytes, root->sectorsize);
+
+	if (btrfs_is_free_space_inode(inode)) {
+		need_commit = 0;
+		ASSERT(current->journal_info);
+	}
+
+	data_sinfo = fs_info->data_sinfo;
+	if (!data_sinfo)
+		goto alloc;
+
+again:
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+		data_sinfo->bytes_may_use;
+
+	if (used + bytes > data_sinfo->total_bytes) {
+		struct btrfs_trans_handle *trans;
+
+		/*
+		 * if we don't have enough free bytes in this space then we need
+		 * to alloc a new chunk.
+		 */
+		if (!data_sinfo->full) {
+			u64 alloc_target;
+
+			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
+			spin_unlock(&data_sinfo->lock);
+alloc:
+			alloc_target = btrfs_get_alloc_profile(root, 1);
+			/*
+			 * It is ugly that we don't call nolock join
+			 * transaction for the free space inode case here.
+			 * But it is safe because we only do the data space
+			 * reservation for the free space cache in the
+			 * transaction context, the common join transaction
+			 * just increase the counter of the current transaction
+			 * handler, doesn't try to acquire the trans_lock of
+			 * the fs.
+			 */
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     alloc_target,
+					     CHUNK_ALLOC_NO_FORCE);
+			btrfs_end_transaction(trans, root);
+			if (ret < 0) {
+				if (ret != -ENOSPC)
+					return ret;
+				else {
+					have_pinned_space = 1;
+					goto commit_trans;
+				}
+			}
+
+			if (!data_sinfo)
+				data_sinfo = fs_info->data_sinfo;
+
+			goto again;
+		}
+
+		/*
+		 * If we don't have enough pinned space to deal with this
+		 * allocation, and no removed chunk in current transaction,
+		 * don't bother committing the transaction.
+		 */
+		have_pinned_space = percpu_counter_compare(
+			&data_sinfo->total_bytes_pinned,
+			used + bytes - data_sinfo->total_bytes);
+		spin_unlock(&data_sinfo->lock);
+
+		/* commit the current transaction and try again */
+commit_trans:
+		if (need_commit &&
+		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
+			need_commit--;
+
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
+			if (have_pinned_space >= 0 ||
+			    trans->transaction->have_free_bgs ||
+			    need_commit > 0) {
+				ret = btrfs_commit_transaction(trans, root);
+				if (ret)
+					return ret;
+				/*
+				 * make sure that all running delayed iput are
+				 * done
+				 */
+				down_write(&root->fs_info->delayed_iput_sem);
+				up_write(&root->fs_info->delayed_iput_sem);
+				goto again;
+			} else {
+				btrfs_end_transaction(trans, root);
+			}
+		}
+
+		trace_btrfs_space_reservation(root->fs_info,
+					      "space_info:enospc",
+					      data_sinfo->flags, bytes, 1);
+		return -ENOSPC;
+	}
+	ret = btrfs_qgroup_reserve(root, write_bytes);
+	if (ret)
+		goto out;
+	data_sinfo->bytes_may_use += bytes;
+	trace_btrfs_space_reservation(root->fs_info, "space_info",
+				      data_sinfo->flags, bytes, 1);
+out:
+	spin_unlock(&data_sinfo->lock);
+
+	return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_space_info *data_sinfo;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = ALIGN(bytes, root->sectorsize);
+
+	data_sinfo = root->fs_info->data_sinfo;
+	spin_lock(&data_sinfo->lock);
+	WARN_ON(data_sinfo->bytes_may_use < bytes);
+	data_sinfo->bytes_may_use -= bytes;
+	trace_btrfs_space_reservation(root->fs_info, "space_info",
+				      data_sinfo->flags, bytes, 0);
+	spin_unlock(&data_sinfo->lock);
+}
+
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+			found->force_alloc = CHUNK_ALLOC_FORCE;
+	}
+	rcu_read_unlock();
+}
+
+static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
+{
+	return (global->size << 1);
+}
+
+static int should_alloc_chunk(struct btrfs_root *root,
+			      struct btrfs_space_info *sinfo, int force)
+{
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
+	u64 thresh;
+
+	if (force == CHUNK_ALLOC_FORCE)
+		return 1;
+
+	/*
+	 * We need to take into account the global rsv because for all intents
+	 * and purposes it's used space.  Don't worry about locking the
+	 * global_rsv, it doesn't change except when the transaction commits.
+	 */
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+		num_allocated += calc_global_rsv_need_space(global_rsv);
+
+	/*
+	 * in limited mode, we want to have some free space up to
+	 * about 1% of the FS size.
+	 */
+	if (force == CHUNK_ALLOC_LIMITED) {
+		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+		thresh = max_t(u64, 64 * 1024 * 1024,
+			       div_factor_fine(thresh, 1));
+
+		if (num_bytes - num_allocated < thresh)
+			return 1;
+	}
+
+	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+		return 0;
+	return 1;
+}
+
+static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+{
+	u64 num_dev;
+
+	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+		    BTRFS_BLOCK_GROUP_RAID0 |
+		    BTRFS_BLOCK_GROUP_RAID5 |
+		    BTRFS_BLOCK_GROUP_RAID6))
+		num_dev = root->fs_info->fs_devices->rw_devices;
+	else if (type & BTRFS_BLOCK_GROUP_RAID1)
+		num_dev = 2;
+	else
+		num_dev = 1;	/* DUP or single */
+
+	/* metadata for updaing devices and chunk tree */
+	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+}
+
+static void check_system_chunk(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, u64 type)
+{
+	struct btrfs_space_info *info;
+	u64 left;
+	u64 thresh;
+
+	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	spin_lock(&info->lock);
+	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
+		info->bytes_reserved - info->bytes_readonly;
+	spin_unlock(&info->lock);
+
+	thresh = get_system_chunk_thresh(root, type);
+	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
+			left, thresh, type);
+		dump_space_info(info, 0, 0);
+	}
+
+	if (left < thresh) {
+		u64 flags;
+
+		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+		btrfs_alloc_chunk(trans, root, flags);
+	}
+}
+
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *extent_root, u64 flags, int force)
+{
+	struct btrfs_space_info *space_info;
+	struct btrfs_fs_info *fs_info = extent_root->fs_info;
+	int wait_for_alloc = 0;
+	int ret = 0;
+
+	/* Don't re-enter if we're already allocating a chunk */
+	if (trans->allocating_chunk)
+		return -ENOSPC;
+
+	space_info = __find_space_info(extent_root->fs_info, flags);
+	if (!space_info) {
+		ret = update_space_info(extent_root->fs_info, flags,
+					0, 0, &space_info);
+		BUG_ON(ret); /* -ENOMEM */
+	}
+	BUG_ON(!space_info); /* Logic error */
+
+again:
+	spin_lock(&space_info->lock);
+	if (force < space_info->force_alloc)
+		force = space_info->force_alloc;
+	if (space_info->full) {
+		if (should_alloc_chunk(extent_root, space_info, force))
+			ret = -ENOSPC;
+		else
+			ret = 0;
+		spin_unlock(&space_info->lock);
+		return ret;
+	}
+
+	if (!should_alloc_chunk(extent_root, space_info, force)) {
+		spin_unlock(&space_info->lock);
+		return 0;
+	} else if (space_info->chunk_alloc) {
+		wait_for_alloc = 1;
+	} else {
+		space_info->chunk_alloc = 1;
+	}
+
+	spin_unlock(&space_info->lock);
+
+	mutex_lock(&fs_info->chunk_mutex);
+
+	/*
+	 * The chunk_mutex is held throughout the entirety of a chunk
+	 * allocation, so once we've acquired the chunk_mutex we know that the
+	 * other guy is done and we need to recheck and see if we should
+	 * allocate.
+	 */
+	if (wait_for_alloc) {
+		mutex_unlock(&fs_info->chunk_mutex);
+		wait_for_alloc = 0;
+		goto again;
+	}
+
+	trans->allocating_chunk = true;
+
+	/*
+	 * If we have mixed data/metadata chunks we want to make sure we keep
+	 * allocating mixed chunks instead of individual chunks.
+	 */
+	if (btrfs_mixed_space_info(space_info))
+		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
+	/*
+	 * if we're doing a data chunk, go ahead and make sure that
+	 * we keep a reasonable number of metadata chunks allocated in the
+	 * FS as well.
+	 */
+	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+		fs_info->data_chunk_allocations++;
+		if (!(fs_info->data_chunk_allocations %
+		      fs_info->metadata_ratio))
+			force_metadata_allocation(fs_info);
+	}
+
+	/*
+	 * Check if we have enough space in SYSTEM chunk because we may need
+	 * to update devices.
+	 */
+	check_system_chunk(trans, extent_root, flags);
+
+	ret = btrfs_alloc_chunk(trans, extent_root, flags);
+	trans->allocating_chunk = false;
+
+	spin_lock(&space_info->lock);
+	if (ret < 0 && ret != -ENOSPC)
+		goto out;
+	if (ret)
+		space_info->full = 1;
+	else
+		ret = 1;
+
+	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
+	space_info->chunk_alloc = 0;
+	spin_unlock(&space_info->lock);
+	mutex_unlock(&fs_info->chunk_mutex);
+	return ret;
+}
+
+static int can_overcommit(struct btrfs_root *root,
+			  struct btrfs_space_info *space_info, u64 bytes,
+			  enum btrfs_reserve_flush_enum flush)
+{
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	u64 profile = btrfs_get_alloc_profile(root, 0);
+	u64 space_size;
+	u64 avail;
+	u64 used;
+
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly;
+
+	/*
+	 * We only want to allow over committing if we have lots of actual space
+	 * free, but if we don't have enough space to handle the global reserve
+	 * space then we could end up having a real enospc problem when trying
+	 * to allocate a chunk or some other such important allocation.
+	 */
+	spin_lock(&global_rsv->lock);
+	space_size = calc_global_rsv_need_space(global_rsv);
+	spin_unlock(&global_rsv->lock);
+	if (used + space_size >= space_info->total_bytes)
+		return 0;
+
+	used += space_info->bytes_may_use;
+
+	spin_lock(&root->fs_info->free_chunk_lock);
+	avail = root->fs_info->free_chunk_space;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
+	/*
+	 * If we have dup, raid1 or raid10 then only half of the free
+	 * space is actually useable.  For raid56, the space info used
+	 * doesn't include the parity drive, so we don't have to
+	 * change the math
+	 */
+	if (profile & (BTRFS_BLOCK_GROUP_DUP |
+		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID10))
+		avail >>= 1;
+
+	/*
+	 * If we aren't flushing all things, let us overcommit up to
+	 * 1/2th of the space. If we can flush, don't let us overcommit
+	 * too much, let it overcommit up to 1/8 of the space.
+	 */
+	if (flush == BTRFS_RESERVE_FLUSH_ALL)
+		avail >>= 3;
+	else
+		avail >>= 1;
+
+	if (used + bytes < space_info->total_bytes + avail)
+		return 1;
+	return 0;
+}
+
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+					 unsigned long nr_pages, int nr_items)
+{
+	struct super_block *sb = root->fs_info->sb;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+		up_read(&sb->s_umount);
+	} else {
+		/*
+		 * We needn't worry the filesystem going from r/w to r/o though
+		 * we don't acquire ->s_umount mutex, because the filesystem
+		 * should guarantee the delalloc inodes list be empty after
+		 * the filesystem is readonly(all dirty pages are written to
+		 * the disk).
+		 */
+		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
+		if (!current->journal_info)
+			btrfs_wait_ordered_roots(root->fs_info, nr_items);
+	}
+}
+
+static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+{
+	u64 bytes;
+	int nr;
+
+	bytes = btrfs_calc_trans_metadata_size(root, 1);
+	nr = (int)div64_u64(to_reclaim, bytes);
+	if (!nr)
+		nr = 1;
+	return nr;
+}
+
+#define EXTENT_SIZE_PER_ITEM	(256 * 1024)
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+			    bool wait_ordered)
+{
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_space_info *space_info;
+	struct btrfs_trans_handle *trans;
+	u64 delalloc_bytes;
+	u64 max_reclaim;
+	long time_left;
+	unsigned long nr_pages;
+	int loops;
+	int items;
+	enum btrfs_reserve_flush_enum flush;
+
+	/* Calc the number of the pages we need flush for space reservation */
+	items = calc_reclaim_items_nr(root, to_reclaim);
+	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+
+	trans = (struct btrfs_trans_handle *)current->journal_info;
+	block_rsv = &root->fs_info->delalloc_block_rsv;
+	space_info = block_rsv->space_info;
+
+	delalloc_bytes = percpu_counter_sum_positive(
+						&root->fs_info->delalloc_bytes);
+	if (delalloc_bytes == 0) {
+		if (trans)
+			return;
+		if (wait_ordered)
+			btrfs_wait_ordered_roots(root->fs_info, items);
+		return;
+	}
+
+	loops = 0;
+	while (delalloc_bytes && loops < 3) {
+		max_reclaim = min(delalloc_bytes, to_reclaim);
+		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+		btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
+		/*
+		 * We need to wait for the async pages to actually start before
+		 * we do anything.
+		 */
+		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+		if (!max_reclaim)
+			goto skip_async;
+
+		if (max_reclaim <= nr_pages)
+			max_reclaim = 0;
+		else
+			max_reclaim -= nr_pages;
+
+		wait_event(root->fs_info->async_submit_wait,
+			   atomic_read(&root->fs_info->async_delalloc_pages) <=
+			   (int)max_reclaim);
+skip_async:
+		if (!trans)
+			flush = BTRFS_RESERVE_FLUSH_ALL;
+		else
+			flush = BTRFS_RESERVE_NO_FLUSH;
+		spin_lock(&space_info->lock);
+		if (can_overcommit(root, space_info, orig, flush)) {
+			spin_unlock(&space_info->lock);
+			break;
+		}
+		spin_unlock(&space_info->lock);
+
+		loops++;
+		if (wait_ordered && !trans) {
+			btrfs_wait_ordered_roots(root->fs_info, items);
+		} else {
+			time_left = schedule_timeout_killable(1);
+			if (time_left)
+				break;
+		}
+		delalloc_bytes = percpu_counter_sum_positive(
+						&root->fs_info->delalloc_bytes);
+	}
+}
+
+/**
+ * maybe_commit_transaction - possibly commit the transaction if its ok to
+ * @root - the root we're allocating for
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
+ *
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does.  Otherwise it
+ * will return -ENOSPC.
+ */
+static int may_commit_transaction(struct btrfs_root *root,
+				  struct btrfs_space_info *space_info,
+				  u64 bytes, int force)
+{
+	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+	struct btrfs_trans_handle *trans;
+
+	trans = (struct btrfs_trans_handle *)current->journal_info;
+	if (trans)
+		return -EAGAIN;
+
+	if (force)
+		goto commit;
+
+	/* See if there is enough pinned space to make this reservation */
+	if (percpu_counter_compare(&space_info->total_bytes_pinned,
+				   bytes) >= 0)
+		goto commit;
+
+	/*
+	 * See if there is some space in the delayed insertion reservation for
+	 * this reservation.
+	 */
+	if (space_info != delayed_rsv->space_info)
+		return -ENOSPC;
+
+	spin_lock(&delayed_rsv->lock);
+	if (percpu_counter_compare(&space_info->total_bytes_pinned,
+				   bytes - delayed_rsv->size) >= 0) {
+		spin_unlock(&delayed_rsv->lock);
+		return -ENOSPC;
+	}
+	spin_unlock(&delayed_rsv->lock);
+
+commit:
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return -ENOSPC;
+
+	return btrfs_commit_transaction(trans, root);
+}
+
+enum flush_state {
+	FLUSH_DELAYED_ITEMS_NR	=	1,
+	FLUSH_DELAYED_ITEMS	=	2,
+	FLUSH_DELALLOC		=	3,
+	FLUSH_DELALLOC_WAIT	=	4,
+	ALLOC_CHUNK		=	5,
+	COMMIT_TRANS		=	6,
+};
+
+static int flush_space(struct btrfs_root *root,
+		       struct btrfs_space_info *space_info, u64 num_bytes,
+		       u64 orig_bytes, int state)
+{
+	struct btrfs_trans_handle *trans;
+	int nr;
+	int ret = 0;
+
+	switch (state) {
+	case FLUSH_DELAYED_ITEMS_NR:
+	case FLUSH_DELAYED_ITEMS:
+		if (state == FLUSH_DELAYED_ITEMS_NR)
+			nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+		else
+			nr = -1;
+
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = btrfs_run_delayed_items_nr(trans, root, nr);
+		btrfs_end_transaction(trans, root);
+		break;
+	case FLUSH_DELALLOC:
+	case FLUSH_DELALLOC_WAIT:
+		shrink_delalloc(root, num_bytes * 2, orig_bytes,
+				state == FLUSH_DELALLOC_WAIT);
+		break;
+	case ALLOC_CHUNK:
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+				     btrfs_get_alloc_profile(root, 0),
+				     CHUNK_ALLOC_NO_FORCE);
+		btrfs_end_transaction(trans, root);
+		if (ret == -ENOSPC)
+			ret = 0;
+		break;
+	case COMMIT_TRANS:
+		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+		break;
+	default:
+		ret = -ENOSPC;
+		break;
+	}
+
+	return ret;
+}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
+				 struct btrfs_space_info *space_info)
+{
+	u64 used;
+	u64 expected;
+	u64 to_reclaim;
+
+	to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
+				16 * 1024 * 1024);
+	spin_lock(&space_info->lock);
+	if (can_overcommit(root, space_info, to_reclaim,
+			   BTRFS_RESERVE_FLUSH_ALL)) {
+		to_reclaim = 0;
+		goto out;
+	}
+
+	used = space_info->bytes_used + space_info->bytes_reserved +
+	       space_info->bytes_pinned + space_info->bytes_readonly +
+	       space_info->bytes_may_use;
+	if (can_overcommit(root, space_info, 1024 * 1024,
+			   BTRFS_RESERVE_FLUSH_ALL))
+		expected = div_factor_fine(space_info->total_bytes, 95);
+	else
+		expected = div_factor_fine(space_info->total_bytes, 90);
+
+	if (used > expected)
+		to_reclaim = used - expected;
+	else
+		to_reclaim = 0;
+	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+				     space_info->bytes_reserved);
+out:
+	spin_unlock(&space_info->lock);
+
+	return to_reclaim;
+}
+
+static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
+					struct btrfs_fs_info *fs_info, u64 used)
+{
+	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+	/* If we're just plain full then async reclaim just slows us down. */
+	if (space_info->bytes_used >= thresh)
+		return 0;
+
+	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
+		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
+				       struct btrfs_fs_info *fs_info,
+				       int flush_state)
+{
+	u64 used;
+
+	spin_lock(&space_info->lock);
+	/*
+	 * We run out of space and have not got any free space via flush_space,
+	 * so don't bother doing async reclaim.
+	 */
+	if (flush_state > COMMIT_TRANS && space_info->full) {
+		spin_unlock(&space_info->lock);
+		return 0;
+	}
+
+	used = space_info->bytes_used + space_info->bytes_reserved +
+	       space_info->bytes_pinned + space_info->bytes_readonly +
+	       space_info->bytes_may_use;
+	if (need_do_async_reclaim(space_info, fs_info, used)) {
+		spin_unlock(&space_info->lock);
+		return 1;
+	}
+	spin_unlock(&space_info->lock);
+
+	return 0;
+}
+
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_space_info *space_info;
+	u64 to_reclaim;
+	int flush_state;
+
+	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+						      space_info);
+	if (!to_reclaim)
+		return;
+
+	flush_state = FLUSH_DELAYED_ITEMS_NR;
+	do {
+		flush_space(fs_info->fs_root, space_info, to_reclaim,
+			    to_reclaim, flush_state);
+		flush_state++;
+		if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+						 flush_state))
+			return;
+	} while (flush_state < COMMIT_TRANS);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv.  If there is not enough space it will make an attempt to
+ * flush out space to make room.  It will do this by flushing delalloc if
+ * possible or committing the transaction.  If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 orig_bytes,
+				  enum btrfs_reserve_flush_enum flush)
+{
+	struct btrfs_space_info *space_info = block_rsv->space_info;
+	u64 used;
+	u64 num_bytes = orig_bytes;
+	int flush_state = FLUSH_DELAYED_ITEMS_NR;
+	int ret = 0;
+	bool flushing = false;
+
+again:
+	ret = 0;
+	spin_lock(&space_info->lock);
+	/*
+	 * We only want to wait if somebody other than us is flushing and we
+	 * are actually allowed to flush all things.
+	 */
+	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+	       space_info->flush) {
+		spin_unlock(&space_info->lock);
+		/*
+		 * If we have a trans handle we can't wait because the flusher
+		 * may have to commit the transaction, which would mean we would
+		 * deadlock since we are waiting for the flusher to finish, but
+		 * hold the current transaction open.
+		 */
+		if (current->journal_info)
+			return -EAGAIN;
+		ret = wait_event_killable(space_info->wait, !space_info->flush);
+		/* Must have been killed, return */
+		if (ret)
+			return -EINTR;
+
+		spin_lock(&space_info->lock);
+	}
+
+	ret = -ENOSPC;
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly +
+		space_info->bytes_may_use;
+
+	/*
+	 * The idea here is that we've not already over-reserved the block group
+	 * then we can go ahead and save our reservation first and then start
+	 * flushing if we need to.  Otherwise if we've already overcommitted
+	 * lets start flushing stuff first and then come back and try to make
+	 * our reservation.
+	 */
+	if (used <= space_info->total_bytes) {
+		if (used + orig_bytes <= space_info->total_bytes) {
+			space_info->bytes_may_use += orig_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+				"space_info", space_info->flags, orig_bytes, 1);
+			ret = 0;
+		} else {
+			/*
+			 * Ok set num_bytes to orig_bytes since we aren't
+			 * overocmmitted, this way we only try and reclaim what
+			 * we need.
+			 */
+			num_bytes = orig_bytes;
+		}
+	} else {
+		/*
+		 * Ok we're over committed, set num_bytes to the overcommitted
+		 * amount plus the amount of bytes that we need for this
+		 * reservation.
+		 */
+		num_bytes = used - space_info->total_bytes +
+			(orig_bytes * 2);
+	}
+
+	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+		space_info->bytes_may_use += orig_bytes;
+		trace_btrfs_space_reservation(root->fs_info, "space_info",
+					      space_info->flags, orig_bytes,
+					      1);
+		ret = 0;
+	}
+
+	/*
+	 * Couldn't make our reservation, save our place so while we're trying
+	 * to reclaim space we can actually use it instead of somebody else
+	 * stealing it from us.
+	 *
+	 * We make the other tasks wait for the flush only when we can flush
+	 * all things.
+	 */
+	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+		flushing = true;
+		space_info->flush = 1;
+	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+		used += orig_bytes;
+		/*
+		 * We will do the space reservation dance during log replay,
+		 * which means we won't have fs_info->fs_root set, so don't do
+		 * the async reclaim as we will panic.
+		 */
+		if (!root->fs_info->log_root_recovering &&
+		    need_do_async_reclaim(space_info, root->fs_info, used) &&
+		    !work_busy(&root->fs_info->async_reclaim_work))
+			queue_work(system_unbound_wq,
+				   &root->fs_info->async_reclaim_work);
+	}
+	spin_unlock(&space_info->lock);
+
+	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
+		goto out;
+
+	ret = flush_space(root, space_info, num_bytes, orig_bytes,
+			  flush_state);
+	flush_state++;
+
+	/*
+	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
+	 * would happen. So skip delalloc flush.
+	 */
+	if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+	    (flush_state == FLUSH_DELALLOC ||
+	     flush_state == FLUSH_DELALLOC_WAIT))
+		flush_state = ALLOC_CHUNK;
+
+	if (!ret)
+		goto again;
+	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+		 flush_state < COMMIT_TRANS)
+		goto again;
+	else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+		 flush_state <= COMMIT_TRANS)
+		goto again;
+
+out:
+	if (ret == -ENOSPC &&
+	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+		struct btrfs_block_rsv *global_rsv =
+			&root->fs_info->global_block_rsv;
+
+		if (block_rsv != global_rsv &&
+		    !block_rsv_use_bytes(global_rsv, orig_bytes))
+			ret = 0;
+	}
+	if (ret == -ENOSPC)
+		trace_btrfs_space_reservation(root->fs_info,
+					      "space_info:enospc",
+					      space_info->flags, orig_bytes, 1);
+	if (flushing) {
+		spin_lock(&space_info->lock);
+		space_info->flush = 0;
+		wake_up_all(&space_info->wait);
+		spin_unlock(&space_info->lock);
+	}
+	return ret;
+}
+
+static struct btrfs_block_rsv *get_block_rsv(
+					const struct btrfs_trans_handle *trans,
+					const struct btrfs_root *root)
+{
+	struct btrfs_block_rsv *block_rsv = NULL;
+
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		block_rsv = trans->block_rsv;
+
+	if (root == root->fs_info->csum_root && trans->adding_csums)
+		block_rsv = trans->block_rsv;
+
+	if (root == root->fs_info->uuid_root)
+		block_rsv = trans->block_rsv;
+
+	if (!block_rsv)
+		block_rsv = root->block_rsv;
+
+	if (!block_rsv)
+		block_rsv = &root->fs_info->empty_block_rsv;
+
+	return block_rsv;
+}
+
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes)
+{
+	int ret = -ENOSPC;
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved >= num_bytes) {
+		block_rsv->reserved -= num_bytes;
+		if (block_rsv->reserved < block_rsv->size)
+			block_rsv->full = 0;
+		ret = 0;
+	}
+	spin_unlock(&block_rsv->lock);
+	return ret;
+}
+
+static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+				u64 num_bytes, int update_size)
+{
+	spin_lock(&block_rsv->lock);
+	block_rsv->reserved += num_bytes;
+	if (update_size)
+		block_rsv->size += num_bytes;
+	else if (block_rsv->reserved >= block_rsv->size)
+		block_rsv->full = 1;
+	spin_unlock(&block_rsv->lock);
+}
+
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 min_bytes;
+
+	if (global_rsv->space_info != dest->space_info)
+		return -ENOSPC;
+
+	spin_lock(&global_rsv->lock);
+	min_bytes = div_factor(global_rsv->size, min_factor);
+	if (global_rsv->reserved < min_bytes + num_bytes) {
+		spin_unlock(&global_rsv->lock);
+		return -ENOSPC;
+	}
+	global_rsv->reserved -= num_bytes;
+	if (global_rsv->reserved < global_rsv->size)
+		global_rsv->full = 0;
+	spin_unlock(&global_rsv->lock);
+
+	block_rsv_add_bytes(dest, num_bytes, 1);
+	return 0;
+}
+
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_rsv *block_rsv,
+				    struct btrfs_block_rsv *dest, u64 num_bytes)
+{
+	struct btrfs_space_info *space_info = block_rsv->space_info;
+
+	spin_lock(&block_rsv->lock);
+	if (num_bytes == (u64)-1)
+		num_bytes = block_rsv->size;
+	block_rsv->size -= num_bytes;
+	if (block_rsv->reserved >= block_rsv->size) {
+		num_bytes = block_rsv->reserved - block_rsv->size;
+		block_rsv->reserved = block_rsv->size;
+		block_rsv->full = 1;
+	} else {
+		num_bytes = 0;
+	}
+	spin_unlock(&block_rsv->lock);
+
+	if (num_bytes > 0) {
+		if (dest) {
+			spin_lock(&dest->lock);
+			if (!dest->full) {
+				u64 bytes_to_add;
+
+				bytes_to_add = dest->size - dest->reserved;
+				bytes_to_add = min(num_bytes, bytes_to_add);
+				dest->reserved += bytes_to_add;
+				if (dest->reserved >= dest->size)
+					dest->full = 1;
+				num_bytes -= bytes_to_add;
+			}
+			spin_unlock(&dest->lock);
+		}
+		if (num_bytes) {
+			spin_lock(&space_info->lock);
+			space_info->bytes_may_use -= num_bytes;
+			trace_btrfs_space_reservation(fs_info, "space_info",
+					space_info->flags, num_bytes, 0);
+			spin_unlock(&space_info->lock);
+		}
+	}
+}
+
+static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+				   struct btrfs_block_rsv *dst, u64 num_bytes)
+{
+	int ret;
+
+	ret = block_rsv_use_bytes(src, num_bytes);
+	if (ret)
+		return ret;
+
+	block_rsv_add_bytes(dst, num_bytes, 1);
+	return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+{
+	memset(rsv, 0, sizeof(*rsv));
+	spin_lock_init(&rsv->lock);
+	rsv->type = type;
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+					      unsigned short type)
+{
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+	if (!block_rsv)
+		return NULL;
+
+	btrfs_init_block_rsv(block_rsv, type);
+	block_rsv->space_info = __find_space_info(fs_info,
+						  BTRFS_BLOCK_GROUP_METADATA);
+	return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_root *root,
+			  struct btrfs_block_rsv *rsv)
+{
+	if (!rsv)
+		return;
+	btrfs_block_rsv_release(root, rsv, (u64)-1);
+	kfree(rsv);
+}
+
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
+{
+	kfree(rsv);
+}
+
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush)
+{
+	int ret;
+
+	if (num_bytes == 0)
+		return 0;
+
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 1);
+		return 0;
+	}
+
+	return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv, int min_factor)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = div_factor(block_rsv->size, min_factor);
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	spin_unlock(&block_rsv->lock);
+
+	return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = min_reserved;
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	else
+		num_bytes -= block_rsv->reserved;
+	spin_unlock(&block_rsv->lock);
+
+	if (!ret)
+		return 0;
+
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 0);
+		return 0;
+	}
+
+	return ret;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv,
+			    u64 num_bytes)
+{
+	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_block_rsv_release(struct btrfs_root *root,
+			     struct btrfs_block_rsv *block_rsv,
+			     u64 num_bytes)
+{
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	if (global_rsv == block_rsv ||
+	    block_rsv->space_info != global_rsv->space_info)
+		global_rsv = NULL;
+	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+				num_bytes);
+}
+
+/*
+ * helper to calculate size of global block reservation.
+ * the desired value is sum of space used by extent tree,
+ * checksum tree and root tree
+ */
+static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *sinfo;
+	u64 num_bytes;
+	u64 meta_used;
+	u64 data_used;
+	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+	spin_lock(&sinfo->lock);
+	data_used = sinfo->bytes_used;
+	spin_unlock(&sinfo->lock);
+
+	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+	spin_lock(&sinfo->lock);
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+		data_used = 0;
+	meta_used = sinfo->bytes_used;
+	spin_unlock(&sinfo->lock);
+
+	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+		    csum_size * 2;
+	num_bytes += div_u64(data_used + meta_used, 50);
+
+	if (num_bytes * 3 > meta_used)
+		num_bytes = div_u64(meta_used, 3);
+
+	return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
+}
+
+static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+	struct btrfs_space_info *sinfo = block_rsv->space_info;
+	u64 num_bytes;
+
+	num_bytes = calc_global_metadata_size(fs_info);
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&block_rsv->lock);
+
+	block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
+
+	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+		    sinfo->bytes_reserved + sinfo->bytes_readonly +
+		    sinfo->bytes_may_use;
+
+	if (sinfo->total_bytes > num_bytes) {
+		num_bytes = sinfo->total_bytes - num_bytes;
+		block_rsv->reserved += num_bytes;
+		sinfo->bytes_may_use += num_bytes;
+		trace_btrfs_space_reservation(fs_info, "space_info",
+				      sinfo->flags, num_bytes, 1);
+	}
+
+	if (block_rsv->reserved >= block_rsv->size) {
+		num_bytes = block_rsv->reserved - block_rsv->size;
+		sinfo->bytes_may_use -= num_bytes;
+		trace_btrfs_space_reservation(fs_info, "space_info",
+				      sinfo->flags, num_bytes, 0);
+		block_rsv->reserved = block_rsv->size;
+		block_rsv->full = 1;
+	}
+
+	spin_unlock(&block_rsv->lock);
+	spin_unlock(&sinfo->lock);
+}
+
+static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *space_info;
+
+	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	fs_info->chunk_block_rsv.space_info = space_info;
+
+	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+	fs_info->global_block_rsv.space_info = space_info;
+	fs_info->delalloc_block_rsv.space_info = space_info;
+	fs_info->trans_block_rsv.space_info = space_info;
+	fs_info->empty_block_rsv.space_info = space_info;
+	fs_info->delayed_block_rsv.space_info = space_info;
+
+	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+	if (fs_info->quota_root)
+		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+	update_global_block_rsv(fs_info);
+}
+
+static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+				(u64)-1);
+	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
+	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+	WARN_ON(fs_info->trans_block_rsv.size > 0);
+	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+	WARN_ON(fs_info->chunk_block_rsv.size > 0);
+	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_block_rsv.size > 0);
+	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+}
+
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root)
+{
+	if (!trans->block_rsv)
+		return;
+
+	if (!trans->bytes_reserved)
+		return;
+
+	trace_btrfs_space_reservation(root->fs_info, "transaction",
+				      trans->transid, trans->bytes_reserved, 0);
+	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
+	trans->bytes_reserved = 0;
+}
+
+/* Can only return 0 or -ENOSPC */
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+				  struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
+
+	/*
+	 * We need to hold space in order to delete our orphan item once we've
+	 * added it, so this takes the reservation so we can release it later
+	 * when we are truly done with the orphan item.
+	 */
+	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	trace_btrfs_space_reservation(root->fs_info, "orphan",
+				      btrfs_ino(inode), num_bytes, 1);
+	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_orphan_release_metadata(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	trace_btrfs_space_reservation(root->fs_info, "orphan",
+				      btrfs_ino(inode), num_bytes, 0);
+	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+}
+
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need do reservation
+ * qgroup_reserved: used to return the reserved size in qgroup
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different with the
+ * common file/directory operations, they change two fs/file trees
+ * and root tree, the number of items that the qgroup reserves is
+ * different with the free space reservation. So we can not use
+ * the space reseravtion mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int items,
+				     u64 *qgroup_reserved,
+				     bool use_global_rsv)
+{
+	u64 num_bytes;
+	int ret;
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+
+	if (root->fs_info->quota_enabled) {
+		/* One for parent inode, two for dir entries */
+		num_bytes = 3 * root->nodesize;
+		ret = btrfs_qgroup_reserve(root, num_bytes);
+		if (ret)
+			return ret;
+	} else {
+		num_bytes = 0;
+	}
+
+	*qgroup_reserved = num_bytes;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, items);
+	rsv->space_info = __find_space_info(root->fs_info,
+					    BTRFS_BLOCK_GROUP_METADATA);
+	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+				  BTRFS_RESERVE_FLUSH_ALL);
+
+	if (ret == -ENOSPC && use_global_rsv)
+		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+
+	if (ret) {
+		if (*qgroup_reserved)
+			btrfs_qgroup_free(root, *qgroup_reserved);
+	}
+
+	return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved)
+{
+	btrfs_block_rsv_release(root, rsv, (u64)-1);
+}
+
+/**
+ * drop_outstanding_extent - drop an outstanding extent
+ * @inode: the inode we're dropping the extent for
+ * @num_bytes: the number of bytes we're relaseing.
+ *
+ * This is called when we are freeing up an outstanding extent, either called
+ * after an error or after an extent is written.  This will return the number of
+ * reserved extents that need to be freed.  This must be called with
+ * BTRFS_I(inode)->lock held.
+ */
+static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
+{
+	unsigned drop_inode_space = 0;
+	unsigned dropped_extents = 0;
+	unsigned num_extents = 0;
+
+	num_extents = (unsigned)div64_u64(num_bytes +
+					  BTRFS_MAX_EXTENT_SIZE - 1,
+					  BTRFS_MAX_EXTENT_SIZE);
+	ASSERT(num_extents);
+	ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
+	BTRFS_I(inode)->outstanding_extents -= num_extents;
+
+	if (BTRFS_I(inode)->outstanding_extents == 0 &&
+	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+			       &BTRFS_I(inode)->runtime_flags))
+		drop_inode_space = 1;
+
+	/*
+	 * If we have more or the same amount of outsanding extents than we have
+	 * reserved then we need to leave the reserved extents count alone.
+	 */
+	if (BTRFS_I(inode)->outstanding_extents >=
+	    BTRFS_I(inode)->reserved_extents)
+		return drop_inode_space;
+
+	dropped_extents = BTRFS_I(inode)->reserved_extents -
+		BTRFS_I(inode)->outstanding_extents;
+	BTRFS_I(inode)->reserved_extents -= dropped_extents;
+	return dropped_extents + drop_inode_space;
+}
+
+/**
+ * calc_csum_metadata_size - return the amount of metada space that must be
+ *	reserved/free'd for the given bytes.
+ * @inode: the inode we're manipulating
+ * @num_bytes: the number of bytes in question
+ * @reserve: 1 if we are reserving space, 0 if we are freeing space
+ *
+ * This adjusts the number of csum_bytes in the inode and then returns the
+ * correct amount of metadata that must either be reserved or freed.  We
+ * calculate how many checksums we can fit into one leaf and then divide the
+ * number of bytes that will need to be checksumed by this value to figure out
+ * how many checksums will be required.  If we are adding bytes then the number
+ * may go up and we will return the number of additional bytes that must be
+ * reserved.  If it is going down we will return the number of bytes that must
+ * be freed.
+ *
+ * This must be called with BTRFS_I(inode)->lock held.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
+				   int reserve)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 old_csums, num_csums;
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
+	    BTRFS_I(inode)->csum_bytes == 0)
+		return 0;
+
+	old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+	if (reserve)
+		BTRFS_I(inode)->csum_bytes += num_bytes;
+	else
+		BTRFS_I(inode)->csum_bytes -= num_bytes;
+	num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+
+	/* No change, no need to reserve more */
+	if (old_csums == num_csums)
+		return 0;
+
+	if (reserve)
+		return btrfs_calc_trans_metadata_size(root,
+						      num_csums - old_csums);
+
+	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
+}
+
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+	u64 to_reserve = 0;
+	u64 csum_bytes;
+	unsigned nr_extents = 0;
+	int extra_reserve = 0;
+	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+	int ret = 0;
+	bool delalloc_lock = true;
+	u64 to_free = 0;
+	unsigned dropped;
+
+	/* If we are a free space inode we need to not flush since we will be in
+	 * the middle of a transaction commit.  We also don't need the delalloc
+	 * mutex since we won't race with anybody.  We need this mostly to make
+	 * lockdep shut its filthy mouth.
+	 */
+	if (btrfs_is_free_space_inode(inode)) {
+		flush = BTRFS_RESERVE_NO_FLUSH;
+		delalloc_lock = false;
+	}
+
+	if (flush != BTRFS_RESERVE_NO_FLUSH &&
+	    btrfs_transaction_in_commit(root->fs_info))
+		schedule_timeout(1);
+
+	if (delalloc_lock)
+		mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
+	num_bytes = ALIGN(num_bytes, root->sectorsize);
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	nr_extents = (unsigned)div64_u64(num_bytes +
+					 BTRFS_MAX_EXTENT_SIZE - 1,
+					 BTRFS_MAX_EXTENT_SIZE);
+	BTRFS_I(inode)->outstanding_extents += nr_extents;
+	nr_extents = 0;
+
+	if (BTRFS_I(inode)->outstanding_extents >
+	    BTRFS_I(inode)->reserved_extents)
+		nr_extents = BTRFS_I(inode)->outstanding_extents -
+			BTRFS_I(inode)->reserved_extents;
+
+	/*
+	 * Add an item to reserve for updating the inode when we complete the
+	 * delalloc io.
+	 */
+	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+		      &BTRFS_I(inode)->runtime_flags)) {
+		nr_extents++;
+		extra_reserve = 1;
+	}
+
+	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+	csum_bytes = BTRFS_I(inode)->csum_bytes;
+	spin_unlock(&BTRFS_I(inode)->lock);
+
+	if (root->fs_info->quota_enabled) {
+		ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+		if (ret)
+			goto out_fail;
+	}
+
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+	if (unlikely(ret)) {
+		if (root->fs_info->quota_enabled)
+			btrfs_qgroup_free(root, nr_extents * root->nodesize);
+		goto out_fail;
+	}
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	if (extra_reserve) {
+		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+			&BTRFS_I(inode)->runtime_flags);
+		nr_extents--;
+	}
+	BTRFS_I(inode)->reserved_extents += nr_extents;
+	spin_unlock(&BTRFS_I(inode)->lock);
+
+	if (delalloc_lock)
+		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+	if (to_reserve)
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), to_reserve, 1);
+	block_rsv_add_bytes(block_rsv, to_reserve, 1);
+
+	return 0;
+
+out_fail:
+	spin_lock(&BTRFS_I(inode)->lock);
+	dropped = drop_outstanding_extent(inode, num_bytes);
+	/*
+	 * If the inodes csum_bytes is the same as the original
+	 * csum_bytes then we know we haven't raced with any free()ers
+	 * so we can just reduce our inodes csum bytes and carry on.
+	 */
+	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
+		calc_csum_metadata_size(inode, num_bytes, 0);
+	} else {
+		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
+		u64 bytes;
+
+		/*
+		 * This is tricky, but first we need to figure out how much we
+		 * free'd from any free-ers that occured during this
+		 * reservation, so we reset ->csum_bytes to the csum_bytes
+		 * before we dropped our lock, and then call the free for the
+		 * number of bytes that were freed while we were trying our
+		 * reservation.
+		 */
+		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
+		BTRFS_I(inode)->csum_bytes = csum_bytes;
+		to_free = calc_csum_metadata_size(inode, bytes, 0);
+
+
+		/*
+		 * Now we need to see how much we would have freed had we not
+		 * been making this reservation and our ->csum_bytes were not
+		 * artificially inflated.
+		 */
+		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
+		bytes = csum_bytes - orig_csum_bytes;
+		bytes = calc_csum_metadata_size(inode, bytes, 0);
+
+		/*
+		 * Now reset ->csum_bytes to what it should be.  If bytes is
+		 * more than to_free then we would have free'd more space had we
+		 * not had an artificially high ->csum_bytes, so we need to free
+		 * the remainder.  If bytes is the same or less then we don't
+		 * need to do anything, the other free-ers did the correct
+		 * thing.
+		 */
+		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
+		if (bytes > to_free)
+			to_free = bytes - to_free;
+		else
+			to_free = 0;
+	}
+	spin_unlock(&BTRFS_I(inode)->lock);
+	if (dropped)
+		to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+	if (to_free) {
+		btrfs_block_rsv_release(root, block_rsv, to_free);
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), to_free, 0);
+	}
+	if (delalloc_lock)
+		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+	return ret;
+}
+
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for
+ * @num_bytes: the number of bytes we're releasing
+ *
+ * This will release the metadata reservation for an inode.  This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations.
+ */
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 to_free = 0;
+	unsigned dropped;
+
+	num_bytes = ALIGN(num_bytes, root->sectorsize);
+	spin_lock(&BTRFS_I(inode)->lock);
+	dropped = drop_outstanding_extent(inode, num_bytes);
+
+	if (num_bytes)
+		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+	spin_unlock(&BTRFS_I(inode)->lock);
+	if (dropped > 0)
+		to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+	if (btrfs_test_is_dummy_root(root))
+		return;
+
+	trace_btrfs_space_reservation(root->fs_info, "delalloc",
+				      btrfs_ino(inode), to_free, 0);
+
+	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+				to_free);
+}
+
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * @inode: inode we're writing to
+ * @num_bytes: the number of bytes we want to allocate
+ *
+ * This will do the following things
+ *
+ * o reserve space in the data space info for num_bytes
+ * o reserve space in the metadata space info based on number of outstanding
+ *   extents and how much csums will be needed
+ * o add to the inodes ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *
+ * This will return 0 for success and -ENOSPC if there is no space left.
+ */
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+{
+	int ret;
+
+	ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+	if (ret) {
+		btrfs_free_reserved_data_space(inode, num_bytes);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @num_bytes: the number of bytes we want to free up
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore.  So if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ */
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+{
+	btrfs_delalloc_release_metadata(inode, num_bytes);
+	btrfs_free_reserved_data_space(inode, num_bytes);
+}
+
+static int update_block_group(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, u64 bytenr,
+			      u64 num_bytes, int alloc)
+{
+	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_fs_info *info = root->fs_info;
+	u64 total = num_bytes;
+	u64 old_val;
+	u64 byte_in_group;
+	int factor;
+
+	/* block accounting for super block */
+	spin_lock(&info->delalloc_root_lock);
+	old_val = btrfs_super_bytes_used(info->super_copy);
+	if (alloc)
+		old_val += num_bytes;
+	else
+		old_val -= num_bytes;
+	btrfs_set_super_bytes_used(info->super_copy, old_val);
+	spin_unlock(&info->delalloc_root_lock);
+
+	while (total) {
+		cache = btrfs_lookup_block_group(info, bytenr);
+		if (!cache)
+			return -ENOENT;
+		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+				    BTRFS_BLOCK_GROUP_RAID1 |
+				    BTRFS_BLOCK_GROUP_RAID10))
+			factor = 2;
+		else
+			factor = 1;
+		/*
+		 * If this block group has free space cache written out, we
+		 * need to make sure to load it if we are removing space.  This
+		 * is because we need the unpinning stage to actually add the
+		 * space back to the block group, otherwise we will leak space.
+		 */
+		if (!alloc && cache->cached == BTRFS_CACHE_NO)
+			cache_block_group(cache, 1);
+
+		byte_in_group = bytenr - cache->key.objectid;
+		WARN_ON(byte_in_group > cache->key.offset);
+
+		spin_lock(&cache->space_info->lock);
+		spin_lock(&cache->lock);
+
+		if (btrfs_test_opt(root, SPACE_CACHE) &&
+		    cache->disk_cache_state < BTRFS_DC_CLEAR)
+			cache->disk_cache_state = BTRFS_DC_CLEAR;
+
+		old_val = btrfs_block_group_used(&cache->item);
+		num_bytes = min(total, cache->key.offset - byte_in_group);
+		if (alloc) {
+			old_val += num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			cache->reserved -= num_bytes;
+			cache->space_info->bytes_reserved -= num_bytes;
+			cache->space_info->bytes_used += num_bytes;
+			cache->space_info->disk_used += num_bytes * factor;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
+		} else {
+			old_val -= num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			cache->pinned += num_bytes;
+			cache->space_info->bytes_pinned += num_bytes;
+			cache->space_info->bytes_used -= num_bytes;
+			cache->space_info->disk_used -= num_bytes * factor;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
+
+			set_extent_dirty(info->pinned_extents,
+					 bytenr, bytenr + num_bytes - 1,
+					 GFP_NOFS | __GFP_NOFAIL);
+			/*
+			 * No longer have used bytes in this block group, queue
+			 * it for deletion.
+			 */
+			if (old_val == 0) {
+				spin_lock(&info->unused_bgs_lock);
+				if (list_empty(&cache->bg_list)) {
+					btrfs_get_block_group(cache);
+					list_add_tail(&cache->bg_list,
+						      &info->unused_bgs);
+				}
+				spin_unlock(&info->unused_bgs_lock);
+			}
+		}
+
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+		if (list_empty(&cache->dirty_list)) {
+			list_add_tail(&cache->dirty_list,
+				      &trans->transaction->dirty_bgs);
+				trans->transaction->num_dirty_bgs++;
+			btrfs_get_block_group(cache);
+		}
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+		btrfs_put_block_group(cache);
+		total -= num_bytes;
+		bytenr += num_bytes;
+	}
+	return 0;
+}
+
+static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+{
+	struct btrfs_block_group_cache *cache;
+	u64 bytenr;
+
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	bytenr = root->fs_info->first_logical_byte;
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+
+	if (bytenr < (u64)-1)
+		return bytenr;
+
+	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
+	if (!cache)
+		return 0;
+
+	bytenr = cache->key.objectid;
+	btrfs_put_block_group(cache);
+
+	return bytenr;
+}
+
+static int pin_down_extent(struct btrfs_root *root,
+			   struct btrfs_block_group_cache *cache,
+			   u64 bytenr, u64 num_bytes, int reserved)
+{
+	spin_lock(&cache->space_info->lock);
+	spin_lock(&cache->lock);
+	cache->pinned += num_bytes;
+	cache->space_info->bytes_pinned += num_bytes;
+	if (reserved) {
+		cache->reserved -= num_bytes;
+		cache->space_info->bytes_reserved -= num_bytes;
+	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&cache->space_info->lock);
+
+	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+	if (reserved)
+		trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
+	return 0;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+		     u64 bytenr, u64 num_bytes, int reserved)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+	BUG_ON(!cache); /* Logic error */
+
+	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
+
+	btrfs_put_block_group(cache);
+	return 0;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_block_group_cache *cache;
+	int ret;
+
+	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+	if (!cache)
+		return -EINVAL;
+
+	/*
+	 * pull in the free space cache (if any) so that our pin
+	 * removes the free space from the cache.  We have load_only set
+	 * to one because the slow code to read in the free extents does check
+	 * the pinned extents.
+	 */
+	cache_block_group(cache, 1);
+
+	pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+	/* remove us from the free space cache (if we're there at all) */
+	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+	btrfs_put_block_group(cache);
+	return ret;
+}
+
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+	int ret;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_caching_control *caching_ctl;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, start);
+	if (!block_group)
+		return -EINVAL;
+
+	cache_block_group(block_group, 0);
+	caching_ctl = get_caching_control(block_group);
+
+	if (!caching_ctl) {
+		/* Logic error */
+		BUG_ON(!block_group_cache_done(block_group));
+		ret = btrfs_remove_free_space(block_group, start, num_bytes);
+	} else {
+		mutex_lock(&caching_ctl->mutex);
+
+		if (start >= caching_ctl->progress) {
+			ret = add_excluded_extent(root, start, num_bytes);
+		} else if (start + num_bytes <= caching_ctl->progress) {
+			ret = btrfs_remove_free_space(block_group,
+						      start, num_bytes);
+		} else {
+			num_bytes = caching_ctl->progress - start;
+			ret = btrfs_remove_free_space(block_group,
+						      start, num_bytes);
+			if (ret)
+				goto out_lock;
+
+			num_bytes = (start + num_bytes) -
+				caching_ctl->progress;
+			start = caching_ctl->progress;
+			ret = add_excluded_extent(root, start, num_bytes);
+		}
+out_lock:
+		mutex_unlock(&caching_ctl->mutex);
+		put_caching_control(caching_ctl);
+	}
+	btrfs_put_block_group(block_group);
+	return ret;
+}
+
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+				 struct extent_buffer *eb)
+{
+	struct btrfs_file_extent_item *item;
+	struct btrfs_key key;
+	int found_type;
+	int i;
+
+	if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+		return 0;
+
+	for (i = 0; i < btrfs_header_nritems(eb); i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		found_type = btrfs_file_extent_type(eb, item);
+		if (found_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+			continue;
+		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		__exclude_logged_extent(log, key.objectid, key.offset);
+	}
+
+	return 0;
+}
+
+/**
+ * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * @cache:	The cache we are manipulating
+ * @num_bytes:	The number of bytes in question
+ * @reserve:	One of the reservation enums
+ * @delalloc:   The blocks are allocated for the delalloc write
+ *
+ * This is called by the allocator when it reserves space, or by somebody who is
+ * freeing space that was never actually used on disk.  For example if you
+ * reserve some space for a new leaf in transaction A and before transaction A
+ * commits you free that leaf, you call this with reserve set to 0 in order to
+ * clear the reservation.
+ *
+ * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * ENOSPC accounting.  For data we handle the reservation through clearing the
+ * delalloc bits in the io_tree.  We have to do this since we could end up
+ * allocating less disk space for the amount of data we have reserved in the
+ * case of compression.
+ *
+ * If this is a reservation and the block group has become read only we cannot
+ * make the reservation and return -EAGAIN, otherwise this function always
+ * succeeds.
+ */
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				       u64 num_bytes, int reserve, int delalloc)
+{
+	struct btrfs_space_info *space_info = cache->space_info;
+	int ret = 0;
+
+	spin_lock(&space_info->lock);
+	spin_lock(&cache->lock);
+	if (reserve != RESERVE_FREE) {
+		if (cache->ro) {
+			ret = -EAGAIN;
+		} else {
+			cache->reserved += num_bytes;
+			space_info->bytes_reserved += num_bytes;
+			if (reserve == RESERVE_ALLOC) {
+				trace_btrfs_space_reservation(cache->fs_info,
+						"space_info", space_info->flags,
+						num_bytes, 0);
+				space_info->bytes_may_use -= num_bytes;
+			}
+
+			if (delalloc)
+				cache->delalloc_bytes += num_bytes;
+		}
+	} else {
+		if (cache->ro)
+			space_info->bytes_readonly += num_bytes;
+		cache->reserved -= num_bytes;
+		space_info->bytes_reserved -= num_bytes;
+
+		if (delalloc)
+			cache->delalloc_bytes -= num_bytes;
+	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&space_info->lock);
+	return ret;
+}
+
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_caching_control *next;
+	struct btrfs_caching_control *caching_ctl;
+	struct btrfs_block_group_cache *cache;
+
+	down_write(&fs_info->commit_root_sem);
+
+	list_for_each_entry_safe(caching_ctl, next,
+				 &fs_info->caching_block_groups, list) {
+		cache = caching_ctl->block_group;
+		if (block_group_cache_done(cache)) {
+			cache->last_byte_to_unpin = (u64)-1;
+			list_del_init(&caching_ctl->list);
+			put_caching_control(caching_ctl);
+		} else {
+			cache->last_byte_to_unpin = caching_ctl->progress;
+		}
+	}
+
+	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+		fs_info->pinned_extents = &fs_info->freed_extents[1];
+	else
+		fs_info->pinned_extents = &fs_info->freed_extents[0];
+
+	up_write(&fs_info->commit_root_sem);
+
+	update_global_block_rsv(fs_info);
+}
+
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
+			      const bool return_free_space)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_space_info *space_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 len;
+	bool readonly;
+
+	while (start <= end) {
+		readonly = false;
+		if (!cache ||
+		    start >= cache->key.objectid + cache->key.offset) {
+			if (cache)
+				btrfs_put_block_group(cache);
+			cache = btrfs_lookup_block_group(fs_info, start);
+			BUG_ON(!cache); /* Logic error */
+		}
+
+		len = cache->key.objectid + cache->key.offset - start;
+		len = min(len, end + 1 - start);
+
+		if (start < cache->last_byte_to_unpin) {
+			len = min(len, cache->last_byte_to_unpin - start);
+			if (return_free_space)
+				btrfs_add_free_space(cache, start, len);
+		}
+
+		start += len;
+		space_info = cache->space_info;
+
+		spin_lock(&space_info->lock);
+		spin_lock(&cache->lock);
+		cache->pinned -= len;
+		space_info->bytes_pinned -= len;
+		percpu_counter_add(&space_info->total_bytes_pinned, -len);
+		if (cache->ro) {
+			space_info->bytes_readonly += len;
+			readonly = true;
+		}
+		spin_unlock(&cache->lock);
+		if (!readonly && global_rsv->space_info == space_info) {
+			spin_lock(&global_rsv->lock);
+			if (!global_rsv->full) {
+				len = min(len, global_rsv->size -
+					  global_rsv->reserved);
+				global_rsv->reserved += len;
+				space_info->bytes_may_use += len;
+				if (global_rsv->reserved >= global_rsv->size)
+					global_rsv->full = 1;
+			}
+			spin_unlock(&global_rsv->lock);
+		}
+		spin_unlock(&space_info->lock);
+	}
+
+	if (cache)
+		btrfs_put_block_group(cache);
+	return 0;
+}
+
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct extent_io_tree *unpin;
+	u64 start;
+	u64 end;
+	int ret;
+
+	if (trans->aborted)
+		return 0;
+
+	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+		unpin = &fs_info->freed_extents[1];
+	else
+		unpin = &fs_info->freed_extents[0];
+
+	while (1) {
+		mutex_lock(&fs_info->unused_bg_unpin_mutex);
+		ret = find_first_extent_bit(unpin, 0, &start, &end,
+					    EXTENT_DIRTY, NULL);
+		if (ret) {
+			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+			break;
+		}
+
+		if (btrfs_test_opt(root, DISCARD))
+			ret = btrfs_discard_extent(root, start,
+						   end + 1 - start, NULL);
+
+		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		unpin_extent_range(root, start, end, true);
+		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+		cond_resched();
+	}
+
+	return 0;
+}
+
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+			     u64 owner, u64 root_objectid)
+{
+	struct btrfs_space_info *space_info;
+	u64 flags;
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+			flags = BTRFS_BLOCK_GROUP_SYSTEM;
+		else
+			flags = BTRFS_BLOCK_GROUP_METADATA;
+	} else {
+		flags = BTRFS_BLOCK_GROUP_DATA;
+	}
+
+	space_info = __find_space_info(fs_info, flags);
+	BUG_ON(!space_info); /* Logic bug */
+	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
+
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 parent,
+				u64 root_objectid, u64 owner_objectid,
+				u64 owner_offset, int refs_to_drop,
+				struct btrfs_delayed_extent_op *extent_op,
+				int no_quota)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_root *extent_root = info->extent_root;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
+	int ret;
+	int is_data;
+	int extent_slot = 0;
+	int found_extent = 0;
+	int num_to_del = 1;
+	u32 item_size;
+	u64 refs;
+	int last_ref = 0;
+	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
+
+	if (!info->quota_enabled || !is_fstree(root_objectid))
+		no_quota = 1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 1;
+	path->leave_spinning = 1;
+
+	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
+	BUG_ON(!is_data && refs_to_drop != 1);
+
+	if (is_data)
+		skinny_metadata = 0;
+
+	ret = lookup_extent_backref(trans, extent_root, path, &iref,
+				    bytenr, num_bytes, parent,
+				    root_objectid, owner_objectid,
+				    owner_offset);
+	if (ret == 0) {
+		extent_slot = path->slots[0];
+		while (extent_slot >= 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      extent_slot);
+			if (key.objectid != bytenr)
+				break;
+			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == num_bytes) {
+				found_extent = 1;
+				break;
+			}
+			if (key.type == BTRFS_METADATA_ITEM_KEY &&
+			    key.offset == owner_objectid) {
+				found_extent = 1;
+				break;
+			}
+			if (path->slots[0] - extent_slot > 5)
+				break;
+			extent_slot--;
+		}
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
+		if (found_extent && item_size < sizeof(*ei))
+			found_extent = 0;
+#endif
+		if (!found_extent) {
+			BUG_ON(iref);
+			ret = remove_extent_backref(trans, extent_root, path,
+						    NULL, refs_to_drop,
+						    is_data, &last_ref);
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
+			btrfs_release_path(path);
+			path->leave_spinning = 1;
+
+			key.objectid = bytenr;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			key.offset = num_bytes;
+
+			if (!is_data && skinny_metadata) {
+				key.type = BTRFS_METADATA_ITEM_KEY;
+				key.offset = owner_objectid;
+			}
+
+			ret = btrfs_search_slot(trans, extent_root,
+						&key, path, -1, 1);
+			if (ret > 0 && skinny_metadata && path->slots[0]) {
+				/*
+				 * Couldn't find our skinny metadata item,
+				 * see if we have ye olde extent item.
+				 */
+				path->slots[0]--;
+				btrfs_item_key_to_cpu(path->nodes[0], &key,
+						      path->slots[0]);
+				if (key.objectid == bytenr &&
+				    key.type == BTRFS_EXTENT_ITEM_KEY &&
+				    key.offset == num_bytes)
+					ret = 0;
+			}
+
+			if (ret > 0 && skinny_metadata) {
+				skinny_metadata = false;
+				key.objectid = bytenr;
+				key.type = BTRFS_EXTENT_ITEM_KEY;
+				key.offset = num_bytes;
+				btrfs_release_path(path);
+				ret = btrfs_search_slot(trans, extent_root,
+							&key, path, -1, 1);
+			}
+
+			if (ret) {
+				btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+					ret, bytenr);
+				if (ret > 0)
+					btrfs_print_leaf(extent_root,
+							 path->nodes[0]);
+			}
+			if (ret < 0) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
+			extent_slot = path->slots[0];
+		}
+	} else if (WARN_ON(ret == -ENOENT)) {
+		btrfs_print_leaf(extent_root, path->nodes[0]);
+		btrfs_err(info,
+			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
+			bytenr, parent, root_objectid, owner_objectid,
+			owner_offset);
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
+	} else {
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, extent_slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		BUG_ON(found_extent || extent_slot != path->slots[0]);
+		ret = convert_extent_item_v0(trans, extent_root, path,
+					     owner_objectid, 0);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
+
+		btrfs_release_path(path);
+		path->leave_spinning = 1;
+
+		key.objectid = bytenr;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = num_bytes;
+
+		ret = btrfs_search_slot(trans, extent_root, &key, path,
+					-1, 1);
+		if (ret) {
+			btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+				ret, bytenr);
+			btrfs_print_leaf(extent_root, path->nodes[0]);
+		}
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
+
+		extent_slot = path->slots[0];
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, extent_slot);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
+	ei = btrfs_item_ptr(leaf, extent_slot,
+			    struct btrfs_extent_item);
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
+	    key.type == BTRFS_EXTENT_ITEM_KEY) {
+		struct btrfs_tree_block_info *bi;
+		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
+	}
+
+	refs = btrfs_extent_refs(leaf, ei);
+	if (refs < refs_to_drop) {
+		btrfs_err(info, "trying to drop %d refs but we only have %Lu "
+			  "for bytenr %Lu", refs_to_drop, refs, bytenr);
+		ret = -EINVAL;
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
+	}
+	refs -= refs_to_drop;
+
+	if (refs > 0) {
+		type = BTRFS_QGROUP_OPER_SUB_SHARED;
+		if (extent_op)
+			__run_delayed_extent_op(extent_op, leaf, ei);
+		/*
+		 * In the case of inline back ref, reference count will
+		 * be updated by remove_extent_backref
+		 */
+		if (iref) {
+			BUG_ON(!found_extent);
+		} else {
+			btrfs_set_extent_refs(leaf, ei, refs);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+		if (found_extent) {
+			ret = remove_extent_backref(trans, extent_root, path,
+						    iref, refs_to_drop,
+						    is_data, &last_ref);
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
+		}
+		add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+				 root_objectid);
+	} else {
+		if (found_extent) {
+			BUG_ON(is_data && refs_to_drop !=
+			       extent_data_ref_count(root, path, iref));
+			if (iref) {
+				BUG_ON(path->slots[0] != extent_slot);
+			} else {
+				BUG_ON(path->slots[0] != extent_slot + 1);
+				path->slots[0] = extent_slot;
+				num_to_del = 2;
+			}
+		}
+
+		last_ref = 1;
+		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
+				      num_to_del);
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
+		btrfs_release_path(path);
+
+		if (is_data) {
+			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
+		}
+
+		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
+	}
+	btrfs_release_path(path);
+
+	/* Deal with the quota accounting */
+	if (!ret && last_ref && !no_quota) {
+		int mod_seq = 0;
+
+		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+		    type == BTRFS_QGROUP_OPER_SUB_SHARED)
+			mod_seq = 1;
+
+		ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
+					      bytenr, num_bytes, type,
+					      mod_seq);
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * when we free an block, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well.  This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
+ */
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root, u64 bytenr)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (!head)
+		goto out_delayed_unlock;
+
+	spin_lock(&head->lock);
+	if (rb_first(&head->ref_root))
+		goto out;
+
+	if (head->extent_op) {
+		if (!head->must_insert_reserved)
+			goto out;
+		btrfs_free_delayed_extent_op(head->extent_op);
+		head->extent_op = NULL;
+	}
+
+	/*
+	 * waiting for the lock here would deadlock.  If someone else has it
+	 * locked they are already in the process of dropping it anyway
+	 */
+	if (!mutex_trylock(&head->mutex))
+		goto out;
+
+	/*
+	 * at this point we have a head with no other entries.  Go
+	 * ahead and process it.
+	 */
+	head->node.in_tree = 0;
+	rb_erase(&head->href_node, &delayed_refs->href_root);
+
+	atomic_dec(&delayed_refs->num_entries);
+
+	/*
+	 * we don't take a ref on the node because we're removing it from the
+	 * tree, so we just steal the ref the tree was holding.
+	 */
+	delayed_refs->num_heads--;
+	if (head->processing == 0)
+		delayed_refs->num_heads_ready--;
+	head->processing = 0;
+	spin_unlock(&head->lock);
+	spin_unlock(&delayed_refs->lock);
+
+	BUG_ON(head->extent_op);
+	if (head->must_insert_reserved)
+		ret = 1;
+
+	mutex_unlock(&head->mutex);
+	btrfs_put_delayed_ref(&head->node);
+	return ret;
+out:
+	spin_unlock(&head->lock);
+
+out_delayed_unlock:
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *buf,
+			   u64 parent, int last_ref)
+{
+	int pin = 1;
+	int ret;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+					buf->start, buf->len,
+					parent, root->root_key.objectid,
+					btrfs_header_level(buf),
+					BTRFS_DROP_DELAYED_REF, NULL, 0);
+		BUG_ON(ret); /* -ENOMEM */
+	}
+
+	if (!last_ref)
+		return;
+
+	if (btrfs_header_generation(buf) == trans->transid) {
+		struct btrfs_block_group_cache *cache;
+
+		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+			ret = check_ref_cleanup(trans, root, buf->start);
+			if (!ret)
+				goto out;
+		}
+
+		cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+
+		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+			pin_down_extent(root, cache, buf->start, buf->len, 1);
+			btrfs_put_block_group(cache);
+			goto out;
+		}
+
+		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+		btrfs_add_free_space(cache, buf->start, buf->len);
+		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+		btrfs_put_block_group(cache);
+		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
+		pin = 0;
+	}
+out:
+	if (pin)
+		add_pinned_bytes(root->fs_info, buf->len,
+				 btrfs_header_level(buf),
+				 root->root_key.objectid);
+
+	/*
+	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
+	 * anymore.
+	 */
+	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+}
+
+/* Can return -ENOMEM */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+		      u64 owner, u64 offset, int no_quota)
+{
+	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	if (btrfs_test_is_dummy_root(root))
+		return 0;
+
+	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
+
+	/*
+	 * tree log blocks never actually go into the extent allocation
+	 * tree, just update pinning info and exit early.
+	 */
+	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
+		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
+		/* unlocks the pinned mutex */
+		btrfs_pin_extent(root, bytenr, num_bytes, 1);
+		ret = 0;
+	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+					num_bytes,
+					parent, root_objectid, (int)owner,
+					BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+	} else {
+		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+						num_bytes,
+						parent, root_objectid, owner,
+						offset, BTRFS_DROP_DELAYED_REF,
+						NULL, no_quota);
+	}
+	return ret;
+}
+
+/*
+ * when we wait for progress in the block group caching, its because
+ * our allocation attempt failed at least once.  So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes.  Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
+ */
+static noinline void
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+				u64 num_bytes)
+{
+	struct btrfs_caching_control *caching_ctl;
+
+	caching_ctl = get_caching_control(cache);
+	if (!caching_ctl)
+		return;
+
+	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
+		   (cache->free_space_ctl->free_space >= num_bytes));
+
+	put_caching_control(caching_ctl);
+}
+
+static noinline int
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_caching_control *caching_ctl;
+	int ret = 0;
+
+	caching_ctl = get_caching_control(cache);
+	if (!caching_ctl)
+		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
+
+	wait_event(caching_ctl->wait, block_group_cache_done(cache));
+	if (cache->cached == BTRFS_CACHE_ERROR)
+		ret = -EIO;
+	put_caching_control(caching_ctl);
+	return ret;
+}
+
+int __get_raid_index(u64 flags)
+{
+	if (flags & BTRFS_BLOCK_GROUP_RAID10)
+		return BTRFS_RAID_RAID10;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+		return BTRFS_RAID_RAID1;
+	else if (flags & BTRFS_BLOCK_GROUP_DUP)
+		return BTRFS_RAID_DUP;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+		return BTRFS_RAID_RAID0;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+		return BTRFS_RAID_RAID5;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+		return BTRFS_RAID_RAID6;
+
+	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
+int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+	return __get_raid_index(cache->flags);
+}
+
+static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
+	[BTRFS_RAID_RAID10]	= "raid10",
+	[BTRFS_RAID_RAID1]	= "raid1",
+	[BTRFS_RAID_DUP]	= "dup",
+	[BTRFS_RAID_RAID0]	= "raid0",
+	[BTRFS_RAID_SINGLE]	= "single",
+	[BTRFS_RAID_RAID5]	= "raid5",
+	[BTRFS_RAID_RAID6]	= "raid6",
+};
+
+static const char *get_raid_name(enum btrfs_raid_types type)
+{
+	if (type >= BTRFS_NR_RAID_TYPES)
+		return NULL;
+
+	return btrfs_raid_type_names[type];
+}
+
+enum btrfs_loop_type {
+	LOOP_CACHING_NOWAIT = 0,
+	LOOP_CACHING_WAIT = 1,
+	LOOP_ALLOC_CHUNK = 2,
+	LOOP_NO_EMPTY_SIZE = 3,
+};
+
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+		       int delalloc)
+{
+	if (delalloc)
+		down_read(&cache->data_rwsem);
+}
+
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+		       int delalloc)
+{
+	btrfs_get_block_group(cache);
+	if (delalloc)
+		down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+		   struct btrfs_free_cluster *cluster,
+		   int delalloc)
+{
+	struct btrfs_block_group_cache *used_bg;
+	bool locked = false;
+again:
+	spin_lock(&cluster->refill_lock);
+	if (locked) {
+		if (used_bg == cluster->block_group)
+			return used_bg;
+
+		up_read(&used_bg->data_rwsem);
+		btrfs_put_block_group(used_bg);
+	}
+
+	used_bg = cluster->block_group;
+	if (!used_bg)
+		return NULL;
+
+	if (used_bg == block_group)
+		return used_bg;
+
+	btrfs_get_block_group(used_bg);
+
+	if (!delalloc)
+		return used_bg;
+
+	if (down_read_trylock(&used_bg->data_rwsem))
+		return used_bg;
+
+	spin_unlock(&cluster->refill_lock);
+	down_read(&used_bg->data_rwsem);
+	locked = true;
+	goto again;
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+			 int delalloc)
+{
+	if (delalloc)
+		up_read(&cache->data_rwsem);
+	btrfs_put_block_group(cache);
+}
+
+/*
+ * walks the btree of allocated extents and find a hole of a given size.
+ * The key ins is changed to record the hole:
+ * ins->objectid == start position
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == the size of the hole.
+ * Any available blocks before search_start are skipped.
+ *
+ * If there is no suitable free space, we will record the max size of
+ * the free space extent currently.
+ */
+static noinline int find_free_extent(struct btrfs_root *orig_root,
+				     u64 num_bytes, u64 empty_size,
+				     u64 hint_byte, struct btrfs_key *ins,
+				     u64 flags, int delalloc)
+{
+	int ret = 0;
+	struct btrfs_root *root = orig_root->fs_info->extent_root;
+	struct btrfs_free_cluster *last_ptr = NULL;
+	struct btrfs_block_group_cache *block_group = NULL;
+	u64 search_start = 0;
+	u64 max_extent_size = 0;
+	int empty_cluster = 2 * 1024 * 1024;
+	struct btrfs_space_info *space_info;
+	int loop = 0;
+	int index = __get_raid_index(flags);
+	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
+		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
+	bool failed_cluster_refill = false;
+	bool failed_alloc = false;
+	bool use_cluster = true;
+	bool have_caching_bg = false;
+
+	WARN_ON(num_bytes < root->sectorsize);
+	ins->type = BTRFS_EXTENT_ITEM_KEY;
+	ins->objectid = 0;
+	ins->offset = 0;
+
+	trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
+
+	space_info = __find_space_info(root->fs_info, flags);
+	if (!space_info) {
+		btrfs_err(root->fs_info, "No space info for %llu", flags);
+		return -ENOSPC;
+	}
+
+	/*
+	 * If the space info is for both data and metadata it means we have a
+	 * small filesystem and we can't use the clustering stuff.
+	 */
+	if (btrfs_mixed_space_info(space_info))
+		use_cluster = false;
+
+	if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
+		last_ptr = &root->fs_info->meta_alloc_cluster;
+		if (!btrfs_test_opt(root, SSD))
+			empty_cluster = 64 * 1024;
+	}
+
+	if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+	    btrfs_test_opt(root, SSD)) {
+		last_ptr = &root->fs_info->data_alloc_cluster;
+	}
+
+	if (last_ptr) {
+		spin_lock(&last_ptr->lock);
+		if (last_ptr->block_group)
+			hint_byte = last_ptr->window_start;
+		spin_unlock(&last_ptr->lock);
+	}
+
+	search_start = max(search_start, first_logical_byte(root, 0));
+	search_start = max(search_start, hint_byte);
+
+	if (!last_ptr)
+		empty_cluster = 0;
+
+	if (search_start == hint_byte) {
+		block_group = btrfs_lookup_block_group(root->fs_info,
+						       search_start);
+		/*
+		 * we don't want to use the block group if it doesn't match our
+		 * allocation bits, or if its not cached.
+		 *
+		 * However if we are re-searching with an ideal block group
+		 * picked out then we don't care that the block group is cached.
+		 */
+		if (block_group && block_group_bits(block_group, flags) &&
+		    block_group->cached != BTRFS_CACHE_NO) {
+			down_read(&space_info->groups_sem);
+			if (list_empty(&block_group->list) ||
+			    block_group->ro) {
+				/*
+				 * someone is removing this block group,
+				 * we can't jump into the have_block_group
+				 * target because our list pointers are not
+				 * valid
+				 */
+				btrfs_put_block_group(block_group);
+				up_read(&space_info->groups_sem);
+			} else {
+				index = get_block_group_index(block_group);
+				btrfs_lock_block_group(block_group, delalloc);
+				goto have_block_group;
+			}
+		} else if (block_group) {
+			btrfs_put_block_group(block_group);
+		}
+	}
+search:
+	have_caching_bg = false;
+	down_read(&space_info->groups_sem);
+	list_for_each_entry(block_group, &space_info->block_groups[index],
+			    list) {
+		u64 offset;
+		int cached;
+
+		btrfs_grab_block_group(block_group, delalloc);
+		search_start = block_group->key.objectid;
+
+		/*
+		 * this can happen if we end up cycling through all the
+		 * raid types, but we want to make sure we only allocate
+		 * for the proper type.
+		 */
+		if (!block_group_bits(block_group, flags)) {
+		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
+				BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6 |
+				BTRFS_BLOCK_GROUP_RAID10;
+
+			/*
+			 * if they asked for extra copies and this block group
+			 * doesn't provide them, bail.  This does allow us to
+			 * fill raid0 from raid1.
+			 */
+			if ((flags & extra) && !(block_group->flags & extra))
+				goto loop;
+		}
+
+have_block_group:
+		cached = block_group_cache_done(block_group);
+		if (unlikely(!cached)) {
+			ret = cache_block_group(block_group, 0);
+			BUG_ON(ret < 0);
+			ret = 0;
+		}
+
+		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+			goto loop;
+		if (unlikely(block_group->ro))
+			goto loop;
+
+		/*
+		 * Ok we want to try and use the cluster allocator, so
+		 * lets look there
+		 */
+		if (last_ptr) {
+			struct btrfs_block_group_cache *used_block_group;
+			unsigned long aligned_cluster;
+			/*
+			 * the refill lock keeps out other
+			 * people trying to start a new cluster
+			 */
+			used_block_group = btrfs_lock_cluster(block_group,
+							      last_ptr,
+							      delalloc);
+			if (!used_block_group)
+				goto refill_cluster;
+
+			if (used_block_group != block_group &&
+			    (used_block_group->ro ||
+			     !block_group_bits(used_block_group, flags)))
+				goto release_cluster;
+
+			offset = btrfs_alloc_from_cluster(used_block_group,
+						last_ptr,
+						num_bytes,
+						used_block_group->key.objectid,
+						&max_extent_size);
+			if (offset) {
+				/* we have a block, we're done */
+				spin_unlock(&last_ptr->refill_lock);
+				trace_btrfs_reserve_extent_cluster(root,
+						used_block_group,
+						search_start, num_bytes);
+				if (used_block_group != block_group) {
+					btrfs_release_block_group(block_group,
+								  delalloc);
+					block_group = used_block_group;
+				}
+				goto checks;
+			}
+
+			WARN_ON(last_ptr->block_group != used_block_group);
+release_cluster:
+			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
+			 * set up a new clusters, so lets just skip it
+			 * and let the allocator find whatever block
+			 * it can find.  If we reach this point, we
+			 * will have tried the cluster allocator
+			 * plenty of times and not have found
+			 * anything, so we are likely way too
+			 * fragmented for the clustering stuff to find
+			 * anything.
+			 *
+			 * However, if the cluster is taken from the
+			 * current block group, release the cluster
+			 * first, so that we stand a better chance of
+			 * succeeding in the unclustered
+			 * allocation.  */
+			if (loop >= LOOP_NO_EMPTY_SIZE &&
+			    used_block_group != block_group) {
+				spin_unlock(&last_ptr->refill_lock);
+				btrfs_release_block_group(used_block_group,
+							  delalloc);
+				goto unclustered_alloc;
+			}
+
+			/*
+			 * this cluster didn't work out, free it and
+			 * start over
+			 */
+			btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+			if (used_block_group != block_group)
+				btrfs_release_block_group(used_block_group,
+							  delalloc);
+refill_cluster:
+			if (loop >= LOOP_NO_EMPTY_SIZE) {
+				spin_unlock(&last_ptr->refill_lock);
+				goto unclustered_alloc;
+			}
+
+			aligned_cluster = max_t(unsigned long,
+						empty_cluster + empty_size,
+					      block_group->full_stripe_len);
+
+			/* allocate a cluster in this block group */
+			ret = btrfs_find_space_cluster(root, block_group,
+						       last_ptr, search_start,
+						       num_bytes,
+						       aligned_cluster);
+			if (ret == 0) {
+				/*
+				 * now pull our allocation out of this
+				 * cluster
+				 */
+				offset = btrfs_alloc_from_cluster(block_group,
+							last_ptr,
+							num_bytes,
+							search_start,
+							&max_extent_size);
+				if (offset) {
+					/* we found one, proceed */
+					spin_unlock(&last_ptr->refill_lock);
+					trace_btrfs_reserve_extent_cluster(root,
+						block_group, search_start,
+						num_bytes);
+					goto checks;
+				}
+			} else if (!cached && loop > LOOP_CACHING_NOWAIT
+				   && !failed_cluster_refill) {
+				spin_unlock(&last_ptr->refill_lock);
+
+				failed_cluster_refill = true;
+				wait_block_group_cache_progress(block_group,
+				       num_bytes + empty_cluster + empty_size);
+				goto have_block_group;
+			}
+
+			/*
+			 * at this point we either didn't find a cluster
+			 * or we weren't able to allocate a block from our
+			 * cluster.  Free the cluster we've been trying
+			 * to use, and go to the next block group
+			 */
+			btrfs_return_cluster_to_free_space(NULL, last_ptr);
+			spin_unlock(&last_ptr->refill_lock);
+			goto loop;
+		}
+
+unclustered_alloc:
+		spin_lock(&block_group->free_space_ctl->tree_lock);
+		if (cached &&
+		    block_group->free_space_ctl->free_space <
+		    num_bytes + empty_cluster + empty_size) {
+			if (block_group->free_space_ctl->free_space >
+			    max_extent_size)
+				max_extent_size =
+					block_group->free_space_ctl->free_space;
+			spin_unlock(&block_group->free_space_ctl->tree_lock);
+			goto loop;
+		}
+		spin_unlock(&block_group->free_space_ctl->tree_lock);
+
+		offset = btrfs_find_space_for_alloc(block_group, search_start,
+						    num_bytes, empty_size,
+						    &max_extent_size);
+		/*
+		 * If we didn't find a chunk, and we haven't failed on this
+		 * block group before, and this block group is in the middle of
+		 * caching and we are ok with waiting, then go ahead and wait
+		 * for progress to be made, and set failed_alloc to true.
+		 *
+		 * If failed_alloc is true then we've already waited on this
+		 * block group once and should move on to the next block group.
+		 */
+		if (!offset && !failed_alloc && !cached &&
+		    loop > LOOP_CACHING_NOWAIT) {
+			wait_block_group_cache_progress(block_group,
+						num_bytes + empty_size);
+			failed_alloc = true;
+			goto have_block_group;
+		} else if (!offset) {
+			if (!cached)
+				have_caching_bg = true;
+			goto loop;
+		}
+checks:
+		search_start = ALIGN(offset, root->stripesize);
+
+		/* move on to the next group */
+		if (search_start + num_bytes >
+		    block_group->key.objectid + block_group->key.offset) {
+			btrfs_add_free_space(block_group, offset, num_bytes);
+			goto loop;
+		}
+
+		if (offset < search_start)
+			btrfs_add_free_space(block_group, offset,
+					     search_start - offset);
+		BUG_ON(offset > search_start);
+
+		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+						  alloc_type, delalloc);
+		if (ret == -EAGAIN) {
+			btrfs_add_free_space(block_group, offset, num_bytes);
+			goto loop;
+		}
+
+		/* we are all good, lets return */
+		ins->objectid = search_start;
+		ins->offset = num_bytes;
+
+		trace_btrfs_reserve_extent(orig_root, block_group,
+					   search_start, num_bytes);
+		btrfs_release_block_group(block_group, delalloc);
+		break;
+loop:
+		failed_cluster_refill = false;
+		failed_alloc = false;
+		BUG_ON(index != get_block_group_index(block_group));
+		btrfs_release_block_group(block_group, delalloc);
+	}
+	up_read(&space_info->groups_sem);
+
+	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+		goto search;
+
+	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+		goto search;
+
+	/*
+	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+	 *			caching kthreads as we move along
+	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+	 *			again
+	 */
+	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
+		index = 0;
+		loop++;
+		if (loop == LOOP_ALLOC_CHUNK) {
+			struct btrfs_trans_handle *trans;
+			int exist = 0;
+
+			trans = current->journal_info;
+			if (trans)
+				exist = 1;
+			else
+				trans = btrfs_join_transaction(root);
+
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+
+			ret = do_chunk_alloc(trans, root, flags,
+					     CHUNK_ALLOC_FORCE);
+			/*
+			 * Do not bail out on ENOSPC since we
+			 * can do more things.
+			 */
+			if (ret < 0 && ret != -ENOSPC)
+				btrfs_abort_transaction(trans,
+							root, ret);
+			else
+				ret = 0;
+			if (!exist)
+				btrfs_end_transaction(trans, root);
+			if (ret)
+				goto out;
+		}
+
+		if (loop == LOOP_NO_EMPTY_SIZE) {
+			empty_size = 0;
+			empty_cluster = 0;
+		}
+
+		goto search;
+	} else if (!ins->objectid) {
+		ret = -ENOSPC;
+	} else if (ins->objectid) {
+		ret = 0;
+	}
+out:
+	if (ret == -ENOSPC)
+		ins->offset = max_extent_size;
+	return ret;
+}
+
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+			    int dump_block_groups)
+{
+	struct btrfs_block_group_cache *cache;
+	int index = 0;
+
+	spin_lock(&info->lock);
+	printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
+	       info->flags,
+	       info->total_bytes - info->bytes_used - info->bytes_pinned -
+	       info->bytes_reserved - info->bytes_readonly,
+	       (info->full) ? "" : "not ");
+	printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
+	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
+	       info->total_bytes, info->bytes_used, info->bytes_pinned,
+	       info->bytes_reserved, info->bytes_may_use,
+	       info->bytes_readonly);
+	spin_unlock(&info->lock);
+
+	if (!dump_block_groups)
+		return;
+
+	down_read(&info->groups_sem);
+again:
+	list_for_each_entry(cache, &info->block_groups[index], list) {
+		spin_lock(&cache->lock);
+		printk(KERN_INFO "BTRFS: "
+			   "block group %llu has %llu bytes, "
+			   "%llu used %llu pinned %llu reserved %s\n",
+		       cache->key.objectid, cache->key.offset,
+		       btrfs_block_group_used(&cache->item), cache->pinned,
+		       cache->reserved, cache->ro ? "[readonly]" : "");
+		btrfs_dump_free_space(cache, bytes);
+		spin_unlock(&cache->lock);
+	}
+	if (++index < BTRFS_NR_RAID_TYPES)
+		goto again;
+	up_read(&info->groups_sem);
+}
+
+int btrfs_reserve_extent(struct btrfs_root *root,
+			 u64 num_bytes, u64 min_alloc_size,
+			 u64 empty_size, u64 hint_byte,
+			 struct btrfs_key *ins, int is_data, int delalloc)
+{
+	bool final_tried = false;
+	u64 flags;
+	int ret;
+
+	flags = btrfs_get_alloc_profile(root, is_data);
+again:
+	WARN_ON(num_bytes < root->sectorsize);
+	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
+			       flags, delalloc);
+
+	if (ret == -ENOSPC) {
+		if (!final_tried && ins->offset) {
+			num_bytes = min(num_bytes >> 1, ins->offset);
+			num_bytes = round_down(num_bytes, root->sectorsize);
+			num_bytes = max(num_bytes, min_alloc_size);
+			if (num_bytes == min_alloc_size)
+				final_tried = true;
+			goto again;
+		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+			struct btrfs_space_info *sinfo;
+
+			sinfo = __find_space_info(root->fs_info, flags);
+			btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
+				flags, num_bytes);
+			if (sinfo)
+				dump_space_info(sinfo, num_bytes, 1);
+		}
+	}
+
+	return ret;
+}
+
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+					u64 start, u64 len,
+					int pin, int delalloc)
+{
+	struct btrfs_block_group_cache *cache;
+	int ret = 0;
+
+	cache = btrfs_lookup_block_group(root->fs_info, start);
+	if (!cache) {
+		btrfs_err(root->fs_info, "Unable to find block group for %llu",
+			start);
+		return -ENOSPC;
+	}
+
+	if (pin)
+		pin_down_extent(root, cache, start, len, 1);
+	else {
+		if (btrfs_test_opt(root, DISCARD))
+			ret = btrfs_discard_extent(root, start, len, NULL);
+		btrfs_add_free_space(cache, start, len);
+		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+	}
+
+	btrfs_put_block_group(cache);
+
+	trace_btrfs_reserved_extent_free(root, start, len);
+
+	return ret;
+}
+
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+			       u64 start, u64 len, int delalloc)
+{
+	return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len)
+{
+	return __btrfs_free_reserved_extent(root, start, len, 1, 0);
+}
+
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      u64 parent, u64 root_objectid,
+				      u64 flags, u64 owner, u64 offset,
+				      struct btrfs_key *ins, int ref_mod)
+{
+	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_extent_item *extent_item;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int type;
+	u32 size;
+
+	if (parent > 0)
+		type = BTRFS_SHARED_DATA_REF_KEY;
+	else
+		type = BTRFS_EXTENT_DATA_REF_KEY;
+
+	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+				      ins, size);
+	if (ret) {
+		btrfs_free_path(path);
+		return ret;
+	}
+
+	leaf = path->nodes[0];
+	extent_item = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_item);
+	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
+	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+	btrfs_set_extent_flags(leaf, extent_item,
+			       flags | BTRFS_EXTENT_FLAG_DATA);
+
+	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+	btrfs_set_extent_inline_ref_type(leaf, iref, type);
+	if (parent > 0) {
+		struct btrfs_shared_data_ref *ref;
+		ref = (struct btrfs_shared_data_ref *)(iref + 1);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
+	} else {
+		struct btrfs_extent_data_ref *ref;
+		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
+		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
+	}
+
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_free_path(path);
+
+	/* Always set parent to 0 here since its exclusive anyway. */
+	ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+				      ins->objectid, ins->offset,
+				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+	if (ret)
+		return ret;
+
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+	if (ret) { /* -ENOENT, logic error */
+		btrfs_err(fs_info, "update block group failed for %llu %llu",
+			ins->objectid, ins->offset);
+		BUG();
+	}
+	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
+	return ret;
+}
+
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 parent, u64 root_objectid,
+				     u64 flags, struct btrfs_disk_key *key,
+				     int level, struct btrfs_key *ins,
+				     int no_quota)
+{
+	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_extent_item *extent_item;
+	struct btrfs_tree_block_info *block_info;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	u32 size = sizeof(*extent_item) + sizeof(*iref);
+	u64 num_bytes = ins->offset;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
+
+	if (!skinny_metadata)
+		size += sizeof(*block_info);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+						   root->nodesize);
+		return -ENOMEM;
+	}
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+				      ins, size);
+	if (ret) {
+		btrfs_free_path(path);
+		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+						   root->nodesize);
+		return ret;
+	}
+
+	leaf = path->nodes[0];
+	extent_item = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_item);
+	btrfs_set_extent_refs(leaf, extent_item, 1);
+	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+	btrfs_set_extent_flags(leaf, extent_item,
+			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
+
+	if (skinny_metadata) {
+		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+		num_bytes = root->nodesize;
+	} else {
+		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+		btrfs_set_tree_block_key(leaf, block_info, key);
+		btrfs_set_tree_block_level(leaf, block_info, level);
+		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+	}
+
+	if (parent > 0) {
+		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+		btrfs_set_extent_inline_ref_type(leaf, iref,
+						 BTRFS_SHARED_BLOCK_REF_KEY);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else {
+		btrfs_set_extent_inline_ref_type(leaf, iref,
+						 BTRFS_TREE_BLOCK_REF_KEY);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+	}
+
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_free_path(path);
+
+	if (!no_quota) {
+		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+					      ins->objectid, num_bytes,
+					      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+		if (ret)
+			return ret;
+	}
+
+	ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+				 1);
+	if (ret) { /* -ENOENT, logic error */
+		btrfs_err(fs_info, "update block group failed for %llu %llu",
+			ins->objectid, ins->offset);
+		BUG();
+	}
+
+	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
+	return ret;
+}
+
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 root_objectid, u64 owner,
+				     u64 offset, struct btrfs_key *ins)
+{
+	int ret;
+
+	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+					 ins->offset, 0,
+					 root_objectid, owner, offset,
+					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+	return ret;
+}
+
+/*
+ * this is used by the tree logging recovery code.  It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ */
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   u64 root_objectid, u64 owner, u64 offset,
+				   struct btrfs_key *ins)
+{
+	int ret;
+	struct btrfs_block_group_cache *block_group;
+
+	/*
+	 * Mixed block groups will exclude before processing the log so we only
+	 * need to do the exlude dance if this fs isn't mixed.
+	 */
+	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
+		if (ret)
+			return ret;
+	}
+
+	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	if (!block_group)
+		return -EINVAL;
+
+	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+					  RESERVE_ALLOC_NO_ACCOUNT, 0);
+	BUG_ON(ret); /* logic error */
+	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
+					 0, owner, offset, ins, 1);
+	btrfs_put_block_group(block_group);
+	return ret;
+}
+
+static struct extent_buffer *
+btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      u64 bytenr, int level)
+{
+	struct extent_buffer *buf;
+
+	buf = btrfs_find_create_tree_block(root, bytenr);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
+	btrfs_tree_lock(buf);
+	clean_tree_block(trans, root->fs_info, buf);
+	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
+
+	btrfs_set_lock_blocking(buf);
+	btrfs_set_buffer_uptodate(buf);
+
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		buf->log_index = root->log_transid % 2;
+		/*
+		 * we allow two log transactions at a time, use different
+		 * EXENT bit to differentiate dirty pages.
+		 */
+		if (buf->log_index == 0)
+			set_extent_dirty(&root->dirty_log_pages, buf->start,
+					buf->start + buf->len - 1, GFP_NOFS);
+		else
+			set_extent_new(&root->dirty_log_pages, buf->start,
+					buf->start + buf->len - 1, GFP_NOFS);
+	} else {
+		buf->log_index = -1;
+		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+			 buf->start + buf->len - 1, GFP_NOFS);
+	}
+	trans->blocks_used++;
+	/* this returns a buffer locked for blocking */
+	return buf;
+}
+
+static struct btrfs_block_rsv *
+use_block_rsv(struct btrfs_trans_handle *trans,
+	      struct btrfs_root *root, u32 blocksize)
+{
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	int ret;
+	bool global_updated = false;
+
+	block_rsv = get_block_rsv(trans, root);
+
+	if (unlikely(block_rsv->size == 0))
+		goto try_reserve;
+again:
+	ret = block_rsv_use_bytes(block_rsv, blocksize);
+	if (!ret)
+		return block_rsv;
+
+	if (block_rsv->failfast)
+		return ERR_PTR(ret);
+
+	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+		global_updated = true;
+		update_global_block_rsv(root->fs_info);
+		goto again;
+	}
+
+	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+				DEFAULT_RATELIMIT_INTERVAL * 10,
+				/*DEFAULT_RATELIMIT_BURST*/ 1);
+		if (__ratelimit(&_rs))
+			WARN(1, KERN_DEBUG
+				"BTRFS: block rsv returned %d\n", ret);
+	}
+try_reserve:
+	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+				     BTRFS_RESERVE_NO_FLUSH);
+	if (!ret)
+		return block_rsv;
+	/*
+	 * If we couldn't reserve metadata bytes try and use some from
+	 * the global reserve if its space type is the same as the global
+	 * reservation.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+	    block_rsv->space_info == global_rsv->space_info) {
+		ret = block_rsv_use_bytes(global_rsv, blocksize);
+		if (!ret)
+			return global_rsv;
+	}
+	return ERR_PTR(ret);
+}
+
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
+{
+	block_rsv_add_bytes(block_rsv, blocksize, 0);
+	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
+}
+
+/*
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
+ * returns the tree buffer or an ERR_PTR on error.
+ */
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					u64 parent, u64 root_objectid,
+					struct btrfs_disk_key *key, int level,
+					u64 hint, u64 empty_size)
+{
+	struct btrfs_key ins;
+	struct btrfs_block_rsv *block_rsv;
+	struct extent_buffer *buf;
+	struct btrfs_delayed_extent_op *extent_op;
+	u64 flags = 0;
+	int ret;
+	u32 blocksize = root->nodesize;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
+
+	if (btrfs_test_is_dummy_root(root)) {
+		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
+					    level);
+		if (!IS_ERR(buf))
+			root->alloc_bytenr += blocksize;
+		return buf;
+	}
+
+	block_rsv = use_block_rsv(trans, root, blocksize);
+	if (IS_ERR(block_rsv))
+		return ERR_CAST(block_rsv);
+
+	ret = btrfs_reserve_extent(root, blocksize, blocksize,
+				   empty_size, hint, &ins, 0, 0);
+	if (ret)
+		goto out_unuse;
+
+	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto out_free_reserved;
+	}
+
+	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (parent == 0)
+			parent = ins.objectid;
+		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+	} else
+		BUG_ON(parent > 0);
+
+	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+		extent_op = btrfs_alloc_delayed_extent_op();
+		if (!extent_op) {
+			ret = -ENOMEM;
+			goto out_free_buf;
+		}
+		if (key)
+			memcpy(&extent_op->key, key, sizeof(extent_op->key));
+		else
+			memset(&extent_op->key, 0, sizeof(extent_op->key));
+		extent_op->flags_to_set = flags;
+		if (skinny_metadata)
+			extent_op->update_key = 0;
+		else
+			extent_op->update_key = 1;
+		extent_op->update_flags = 1;
+		extent_op->is_data = 0;
+		extent_op->level = level;
+
+		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+						 ins.objectid, ins.offset,
+						 parent, root_objectid, level,
+						 BTRFS_ADD_DELAYED_EXTENT,
+						 extent_op, 0);
+		if (ret)
+			goto out_free_delayed;
+	}
+	return buf;
+
+out_free_delayed:
+	btrfs_free_delayed_extent_op(extent_op);
+out_free_buf:
+	free_extent_buffer(buf);
+out_free_reserved:
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
+out_unuse:
+	unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+	return ERR_PTR(ret);
+}
+
+struct walk_control {
+	u64 refs[BTRFS_MAX_LEVEL];
+	u64 flags[BTRFS_MAX_LEVEL];
+	struct btrfs_key update_progress;
+	int stage;
+	int level;
+	int shared_level;
+	int update_ref;
+	int keep_locks;
+	int reada_slot;
+	int reada_count;
+	int for_reloc;
+};
+
+#define DROP_REFERENCE	1
+#define UPDATE_BACKREF	2
+
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct walk_control *wc,
+				     struct btrfs_path *path)
+{
+	u64 bytenr;
+	u64 generation;
+	u64 refs;
+	u64 flags;
+	u32 nritems;
+	u32 blocksize;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	int ret;
+	int slot;
+	int nread = 0;
+
+	if (path->slots[wc->level] < wc->reada_slot) {
+		wc->reada_count = wc->reada_count * 2 / 3;
+		wc->reada_count = max(wc->reada_count, 2);
+	} else {
+		wc->reada_count = wc->reada_count * 3 / 2;
+		wc->reada_count = min_t(int, wc->reada_count,
+					BTRFS_NODEPTRS_PER_BLOCK(root));
+	}
+
+	eb = path->nodes[wc->level];
+	nritems = btrfs_header_nritems(eb);
+	blocksize = root->nodesize;
+
+	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+		if (nread >= wc->reada_count)
+			break;
+
+		cond_resched();
+		bytenr = btrfs_node_blockptr(eb, slot);
+		generation = btrfs_node_ptr_generation(eb, slot);
+
+		if (slot == path->slots[wc->level])
+			goto reada;
+
+		if (wc->stage == UPDATE_BACKREF &&
+		    generation <= root->root_key.offset)
+			continue;
+
+		/* We don't lock the tree block, it's OK to be racy here */
+		ret = btrfs_lookup_extent_info(trans, root, bytenr,
+					       wc->level - 1, 1, &refs,
+					       &flags);
+		/* We don't care about errors in readahead. */
+		if (ret < 0)
+			continue;
+		BUG_ON(refs == 0);
+
+		if (wc->stage == DROP_REFERENCE) {
+			if (refs == 1)
+				goto reada;
+
+			if (wc->level == 1 &&
+			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+				continue;
+			if (!wc->update_ref ||
+			    generation <= root->root_key.offset)
+				continue;
+			btrfs_node_key_to_cpu(eb, &key, slot);
+			ret = btrfs_comp_cpu_keys(&key,
+						  &wc->update_progress);
+			if (ret < 0)
+				continue;
+		} else {
+			if (wc->level == 1 &&
+			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+				continue;
+		}
+reada:
+		readahead_tree_block(root, bytenr);
+		nread++;
+	}
+	wc->reada_slot = slot;
+}
+
+static int account_leaf_items(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct extent_buffer *eb)
+{
+	int nr = btrfs_header_nritems(eb);
+	int i, extent_type, ret;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	u64 bytenr, num_bytes;
+
+	for (i = 0; i < nr; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		/* filter out non qgroup-accountable extents  */
+		extent_type = btrfs_file_extent_type(eb, fi);
+
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+
+		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (!bytenr)
+			continue;
+
+		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+		ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+					      root->objectid,
+					      bytenr, num_bytes,
+					      BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have it's slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root nodes slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+				struct btrfs_path *path, int root_level)
+{
+	int level = 0;
+	int nr, slot;
+	struct extent_buffer *eb;
+
+	if (root_level == 0)
+		return 1;
+
+	while (level <= root_level) {
+		eb = path->nodes[level];
+		nr = btrfs_header_nritems(eb);
+		path->slots[level]++;
+		slot = path->slots[level];
+		if (slot >= nr || level == 0) {
+			/*
+			 * Don't free the root -  we will detect this
+			 * condition after our loop and return a
+			 * positive value for caller to stop walking the tree.
+			 */
+			if (level != root_level) {
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
+				path->locks[level] = 0;
+
+				free_extent_buffer(eb);
+				path->nodes[level] = NULL;
+				path->slots[level] = 0;
+			}
+		} else {
+			/*
+			 * We have a valid slot to walk back down
+			 * from. Stop here so caller can process these
+			 * new nodes.
+			 */
+			break;
+		}
+
+		level++;
+	}
+
+	eb = path->nodes[root_level];
+	if (path->slots[root_level] >= btrfs_header_nritems(eb))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * root_eb is the subtree root and is locked before this function is called.
+ */
+static int account_shared_subtree(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct extent_buffer *root_eb,
+				  u64 root_gen,
+				  int root_level)
+{
+	int ret = 0;
+	int level;
+	struct extent_buffer *eb = root_eb;
+	struct btrfs_path *path = NULL;
+
+	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+	BUG_ON(root_eb == NULL);
+
+	if (!root->fs_info->quota_enabled)
+		return 0;
+
+	if (!extent_buffer_uptodate(root_eb)) {
+		ret = btrfs_read_buffer(root_eb, root_gen);
+		if (ret)
+			goto out;
+	}
+
+	if (root_level == 0) {
+		ret = account_leaf_items(trans, root, root_eb);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Walk down the tree.  Missing extent blocks are filled in as
+	 * we go. Metadata is accounted every time we read a new
+	 * extent block.
+	 *
+	 * When we reach a leaf, we account for file extent items in it,
+	 * walk back up the tree (adjusting slot pointers as we go)
+	 * and restart the search process.
+	 */
+	extent_buffer_get(root_eb); /* For path */
+	path->nodes[root_level] = root_eb;
+	path->slots[root_level] = 0;
+	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+	level = root_level;
+	while (level >= 0) {
+		if (path->nodes[level] == NULL) {
+			int parent_slot;
+			u64 child_gen;
+			u64 child_bytenr;
+
+			/* We need to get child blockptr/gen from
+			 * parent before we can read it. */
+			eb = path->nodes[level + 1];
+			parent_slot = path->slots[level + 1];
+			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+			eb = read_tree_block(root, child_bytenr, child_gen);
+			if (!eb || !extent_buffer_uptodate(eb)) {
+				ret = -EIO;
+				goto out;
+			}
+
+			path->nodes[level] = eb;
+			path->slots[level] = 0;
+
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+			ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+						root->objectid,
+						child_bytenr,
+						root->nodesize,
+						BTRFS_QGROUP_OPER_SUB_SUBTREE,
+						0);
+			if (ret)
+				goto out;
+
+		}
+
+		if (level == 0) {
+			ret = account_leaf_items(trans, root, path->nodes[level]);
+			if (ret)
+				goto out;
+
+			/* Nonzero return here means we completed our search */
+			ret = adjust_slots_upwards(root, path, root_level);
+			if (ret)
+				break;
+
+			/* Restart search with new slots */
+			goto walk_down;
+		}
+
+		level--;
+	}
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * helper to process tree block while walking down the tree.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct walk_control *wc, int lookup_info)
+{
+	int level = wc->level;
+	struct extent_buffer *eb = path->nodes[level];
+	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+	int ret;
+
+	if (wc->stage == UPDATE_BACKREF &&
+	    btrfs_header_owner(eb) != root->root_key.objectid)
+		return 1;
+
+	/*
+	 * when reference count of tree block is 1, it won't increase
+	 * again. once full backref flag is set, we never clear it.
+	 */
+	if (lookup_info &&
+	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
+		BUG_ON(!path->locks[level]);
+		ret = btrfs_lookup_extent_info(trans, root,
+					       eb->start, level, 1,
+					       &wc->refs[level],
+					       &wc->flags[level]);
+		BUG_ON(ret == -ENOMEM);
+		if (ret)
+			return ret;
+		BUG_ON(wc->refs[level] == 0);
+	}
+
+	if (wc->stage == DROP_REFERENCE) {
+		if (wc->refs[level] > 1)
+			return 1;
+
+		if (path->locks[level] && !wc->keep_locks) {
+			btrfs_tree_unlock_rw(eb, path->locks[level]);
+			path->locks[level] = 0;
+		}
+		return 0;
+	}
+
+	/* wc->stage == UPDATE_BACKREF */
+	if (!(wc->flags[level] & flag)) {
+		BUG_ON(!path->locks[level]);
+		ret = btrfs_inc_ref(trans, root, eb, 1);
+		BUG_ON(ret); /* -ENOMEM */
+		ret = btrfs_dec_ref(trans, root, eb, 0);
+		BUG_ON(ret); /* -ENOMEM */
+		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
+						  eb->len, flag,
+						  btrfs_header_level(eb), 0);
+		BUG_ON(ret); /* -ENOMEM */
+		wc->flags[level] |= flag;
+	}
+
+	/*
+	 * the block is shared by multiple trees, so it's not good to
+	 * keep the tree lock
+	 */
+	if (path->locks[level] && level > 0) {
+		btrfs_tree_unlock_rw(eb, path->locks[level]);
+		path->locks[level] = 0;
+	}
+	return 0;
+}
+
+/*
+ * helper to process tree block pointer.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back, this function drops the reference
+ * to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct walk_control *wc, int *lookup_info)
+{
+	u64 bytenr;
+	u64 generation;
+	u64 parent;
+	u32 blocksize;
+	struct btrfs_key key;
+	struct extent_buffer *next;
+	int level = wc->level;
+	int reada = 0;
+	int ret = 0;
+	bool need_account = false;
+
+	generation = btrfs_node_ptr_generation(path->nodes[level],
+					       path->slots[level]);
+	/*
+	 * if the lower level block was created before the snapshot
+	 * was created, we know there is no need to update back refs
+	 * for the subtree
+	 */
+	if (wc->stage == UPDATE_BACKREF &&
+	    generation <= root->root_key.offset) {
+		*lookup_info = 1;
+		return 1;
+	}
+
+	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+	blocksize = root->nodesize;
+
+	next = btrfs_find_tree_block(root->fs_info, bytenr);
+	if (!next) {
+		next = btrfs_find_create_tree_block(root, bytenr);
+		if (!next)
+			return -ENOMEM;
+		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
+					       level - 1);
+		reada = 1;
+	}
+	btrfs_tree_lock(next);
+	btrfs_set_lock_blocking(next);
+
+	ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
+				       &wc->refs[level - 1],
+				       &wc->flags[level - 1]);
+	if (ret < 0) {
+		btrfs_tree_unlock(next);
+		return ret;
+	}
+
+	if (unlikely(wc->refs[level - 1] == 0)) {
+		btrfs_err(root->fs_info, "Missing references.");
+		BUG();
+	}
+	*lookup_info = 0;
+
+	if (wc->stage == DROP_REFERENCE) {
+		if (wc->refs[level - 1] > 1) {
+			need_account = true;
+			if (level == 1 &&
+			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+				goto skip;
+
+			if (!wc->update_ref ||
+			    generation <= root->root_key.offset)
+				goto skip;
+
+			btrfs_node_key_to_cpu(path->nodes[level], &key,
+					      path->slots[level]);
+			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+			if (ret < 0)
+				goto skip;
+
+			wc->stage = UPDATE_BACKREF;
+			wc->shared_level = level - 1;
+		}
+	} else {
+		if (level == 1 &&
+		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+			goto skip;
+	}
+
+	if (!btrfs_buffer_uptodate(next, generation, 0)) {
+		btrfs_tree_unlock(next);
+		free_extent_buffer(next);
+		next = NULL;
+		*lookup_info = 1;
+	}
+
+	if (!next) {
+		if (reada && level == 1)
+			reada_walk_down(trans, root, wc, path);
+		next = read_tree_block(root, bytenr, generation);
+		if (!next || !extent_buffer_uptodate(next)) {
+			free_extent_buffer(next);
+			return -EIO;
+		}
+		btrfs_tree_lock(next);
+		btrfs_set_lock_blocking(next);
+	}
+
+	level--;
+	BUG_ON(level != btrfs_header_level(next));
+	path->nodes[level] = next;
+	path->slots[level] = 0;
+	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+	wc->level = level;
+	if (wc->level == 1)
+		wc->reada_slot = 0;
+	return 0;
+skip:
+	wc->refs[level - 1] = 0;
+	wc->flags[level - 1] = 0;
+	if (wc->stage == DROP_REFERENCE) {
+		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+			parent = path->nodes[level]->start;
+		} else {
+			BUG_ON(root->root_key.objectid !=
+			       btrfs_header_owner(path->nodes[level]));
+			parent = 0;
+		}
+
+		if (need_account) {
+			ret = account_shared_subtree(trans, root, next,
+						     generation, level - 1);
+			if (ret) {
+				printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+					"%d accounting shared subtree. Quota "
+					"is out of sync, rescan required.\n",
+					root->fs_info->sb->s_id, ret);
+			}
+		}
+		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+				root->root_key.objectid, level - 1, 0, 0);
+		BUG_ON(ret); /* -ENOMEM */
+	}
+	btrfs_tree_unlock(next);
+	free_extent_buffer(next);
+	*lookup_info = 1;
+	return 1;
+}
+
+/*
+ * helper to process tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+ * reference count on the block.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function changes
+ * wc->stage back to DROP_REFERENCE if we changed wc->stage
+ * to UPDATE_BACKREF previously while processing the block.
+ *
+ * NOTE: return value 1 means we should stop walking up.
+ */
+static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct walk_control *wc)
+{
+	int ret;
+	int level = wc->level;
+	struct extent_buffer *eb = path->nodes[level];
+	u64 parent = 0;
+
+	if (wc->stage == UPDATE_BACKREF) {
+		BUG_ON(wc->shared_level < level);
+		if (level < wc->shared_level)
+			goto out;
+
+		ret = find_next_key(path, level + 1, &wc->update_progress);
+		if (ret > 0)
+			wc->update_ref = 0;
+
+		wc->stage = DROP_REFERENCE;
+		wc->shared_level = -1;
+		path->slots[level] = 0;
+
+		/*
+		 * check reference count again if the block isn't locked.
+		 * we should start walking down the tree again if reference
+		 * count is one.
+		 */
+		if (!path->locks[level]) {
+			BUG_ON(level == 0);
+			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
+			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+
+			ret = btrfs_lookup_extent_info(trans, root,
+						       eb->start, level, 1,
+						       &wc->refs[level],
+						       &wc->flags[level]);
+			if (ret < 0) {
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
+				path->locks[level] = 0;
+				return ret;
+			}
+			BUG_ON(wc->refs[level] == 0);
+			if (wc->refs[level] == 1) {
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
+				path->locks[level] = 0;
+				return 1;
+			}
+		}
+	}
+
+	/* wc->stage == DROP_REFERENCE */
+	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+
+	if (wc->refs[level] == 1) {
+		if (level == 0) {
+			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+				ret = btrfs_dec_ref(trans, root, eb, 1);
+			else
+				ret = btrfs_dec_ref(trans, root, eb, 0);
+			BUG_ON(ret); /* -ENOMEM */
+			ret = account_leaf_items(trans, root, eb);
+			if (ret) {
+				printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+					"%d accounting leaf items. Quota "
+					"is out of sync, rescan required.\n",
+					root->fs_info->sb->s_id, ret);
+			}
+		}
+		/* make block locked assertion in clean_tree_block happy */
+		if (!path->locks[level] &&
+		    btrfs_header_generation(eb) == trans->transid) {
+			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
+			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+		}
+		clean_tree_block(trans, root->fs_info, eb);
+	}
+
+	if (eb == root->node) {
+		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+			parent = eb->start;
+		else
+			BUG_ON(root->root_key.objectid !=
+			       btrfs_header_owner(eb));
+	} else {
+		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+			parent = path->nodes[level + 1]->start;
+		else
+			BUG_ON(root->root_key.objectid !=
+			       btrfs_header_owner(path->nodes[level + 1]));
+	}
+
+	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+out:
+	wc->refs[level] = 0;
+	wc->flags[level] = 0;
+	return 0;
+}
+
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct walk_control *wc)
+{
+	int level = wc->level;
+	int lookup_info = 1;
+	int ret;
+
+	while (level >= 0) {
+		ret = walk_down_proc(trans, root, path, wc, lookup_info);
+		if (ret > 0)
+			break;
+
+		if (level == 0)
+			break;
+
+		if (path->slots[level] >=
+		    btrfs_header_nritems(path->nodes[level]))
+			break;
+
+		ret = do_walk_down(trans, root, path, wc, &lookup_info);
+		if (ret > 0) {
+			path->slots[level]++;
+			continue;
+		} else if (ret < 0)
+			return ret;
+		level = wc->level;
+	}
+	return 0;
+}
+
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct walk_control *wc, int max_level)
+{
+	int level = wc->level;
+	int ret;
+
+	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
+	while (level < max_level && path->nodes[level]) {
+		wc->level = level;
+		if (path->slots[level] + 1 <
+		    btrfs_header_nritems(path->nodes[level])) {
+			path->slots[level]++;
+			return 0;
+		} else {
+			ret = walk_up_proc(trans, root, path, wc);
+			if (ret > 0)
+				return 0;
+
+			if (path->locks[level]) {
+				btrfs_tree_unlock_rw(path->nodes[level],
+						     path->locks[level]);
+				path->locks[level] = 0;
+			}
+			free_extent_buffer(path->nodes[level]);
+			path->nodes[level] = NULL;
+			level++;
+		}
+	}
+	return 1;
+}
+
+/*
+ * drop a subvolume tree.
+ *
+ * this function traverses the tree freeing any blocks that only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found. this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also make sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
+ *
+ * If called with for_reloc == 0, may exit early with -EAGAIN
+ */
+int btrfs_drop_snapshot(struct btrfs_root *root,
+			 struct btrfs_block_rsv *block_rsv, int update_ref,
+			 int for_reloc)
+{
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
+	struct btrfs_root_item *root_item = &root->root_item;
+	struct walk_control *wc;
+	struct btrfs_key key;
+	int err = 0;
+	int ret;
+	int level;
+	bool root_dropped = false;
+
+	btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	wc = kzalloc(sizeof(*wc), GFP_NOFS);
+	if (!wc) {
+		btrfs_free_path(path);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	trans = btrfs_start_transaction(tree_root, 0);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
+
+	if (block_rsv)
+		trans->block_rsv = block_rsv;
+
+	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+		level = btrfs_header_level(root->node);
+		path->nodes[level] = btrfs_lock_root_node(root);
+		btrfs_set_lock_blocking(path->nodes[level]);
+		path->slots[level] = 0;
+		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+		memset(&wc->update_progress, 0,
+		       sizeof(wc->update_progress));
+	} else {
+		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+		memcpy(&wc->update_progress, &key,
+		       sizeof(wc->update_progress));
+
+		level = root_item->drop_level;
+		BUG_ON(level == 0);
+		path->lowest_level = level;
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		path->lowest_level = 0;
+		if (ret < 0) {
+			err = ret;
+			goto out_end_trans;
+		}
+		WARN_ON(ret > 0);
+
+		/*
+		 * unlock our path, this is safe because only this
+		 * function is allowed to delete this snapshot
+		 */
+		btrfs_unlock_up_safe(path, 0);
+
+		level = btrfs_header_level(root->node);
+		while (1) {
+			btrfs_tree_lock(path->nodes[level]);
+			btrfs_set_lock_blocking(path->nodes[level]);
+			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+
+			ret = btrfs_lookup_extent_info(trans, root,
+						path->nodes[level]->start,
+						level, 1, &wc->refs[level],
+						&wc->flags[level]);
+			if (ret < 0) {
+				err = ret;
+				goto out_end_trans;
+			}
+			BUG_ON(wc->refs[level] == 0);
+
+			if (level == root_item->drop_level)
+				break;
+
+			btrfs_tree_unlock(path->nodes[level]);
+			path->locks[level] = 0;
+			WARN_ON(wc->refs[level] != 1);
+			level--;
+		}
+	}
+
+	wc->level = level;
+	wc->shared_level = -1;
+	wc->stage = DROP_REFERENCE;
+	wc->update_ref = update_ref;
+	wc->keep_locks = 0;
+	wc->for_reloc = for_reloc;
+	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
+	while (1) {
+
+		ret = walk_down_tree(trans, root, path, wc);
+		if (ret < 0) {
+			err = ret;
+			break;
+		}
+
+		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
+		if (ret < 0) {
+			err = ret;
+			break;
+		}
+
+		if (ret > 0) {
+			BUG_ON(wc->stage != DROP_REFERENCE);
+			break;
+		}
+
+		if (wc->stage == DROP_REFERENCE) {
+			level = wc->level;
+			btrfs_node_key(path->nodes[level],
+				       &root_item->drop_progress,
+				       path->slots[level]);
+			root_item->drop_level = level;
+		}
+
+		BUG_ON(wc->level == 0);
+		if (btrfs_should_end_transaction(trans, tree_root) ||
+		    (!for_reloc && btrfs_need_cleaner_sleep(root))) {
+			ret = btrfs_update_root(trans, tree_root,
+						&root->root_key,
+						root_item);
+			if (ret) {
+				btrfs_abort_transaction(trans, tree_root, ret);
+				err = ret;
+				goto out_end_trans;
+			}
+
+			/*
+			 * Qgroup update accounting is run from
+			 * delayed ref handling. This usually works
+			 * out because delayed refs are normally the
+			 * only way qgroup updates are added. However,
+			 * we may have added updates during our tree
+			 * walk so run qgroups here to make sure we
+			 * don't lose any updates.
+			 */
+			ret = btrfs_delayed_qgroup_accounting(trans,
+							      root->fs_info);
+			if (ret)
+				printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+						   "running qgroup updates "
+						   "during snapshot delete. "
+						   "Quota is out of sync, "
+						   "rescan required.\n", ret);
+
+			btrfs_end_transaction_throttle(trans, tree_root);
+			if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+				pr_debug("BTRFS: drop snapshot early exit\n");
+				err = -EAGAIN;
+				goto out_free;
+			}
+
+			trans = btrfs_start_transaction(tree_root, 0);
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
+				goto out_free;
+			}
+			if (block_rsv)
+				trans->block_rsv = block_rsv;
+		}
+	}
+	btrfs_release_path(path);
+	if (err)
+		goto out_end_trans;
+
+	ret = btrfs_del_root(trans, tree_root, &root->root_key);
+	if (ret) {
+		btrfs_abort_transaction(trans, tree_root, ret);
+		goto out_end_trans;
+	}
+
+	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+		ret = btrfs_find_root(tree_root, &root->root_key, path,
+				      NULL, NULL);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, tree_root, ret);
+			err = ret;
+			goto out_end_trans;
+		} else if (ret > 0) {
+			/* if we fail to delete the orphan item this time
+			 * around, it'll get picked up the next time.
+			 *
+			 * The most common failure here is just -ENOENT.
+			 */
+			btrfs_del_orphan_item(trans, tree_root,
+					      root->root_key.objectid);
+		}
+	}
+
+	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
+		btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
+	} else {
+		free_extent_buffer(root->node);
+		free_extent_buffer(root->commit_root);
+		btrfs_put_fs_root(root);
+	}
+	root_dropped = true;
+out_end_trans:
+	ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
+	if (ret)
+		printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+				   "running qgroup updates "
+				   "during snapshot delete. "
+				   "Quota is out of sync, "
+				   "rescan required.\n", ret);
+
+	btrfs_end_transaction_throttle(trans, tree_root);
+out_free:
+	kfree(wc);
+	btrfs_free_path(path);
+out:
+	/*
+	 * So if we need to stop dropping the snapshot for whatever reason we
+	 * need to make sure to add it back to the dead root list so that we
+	 * keep trying to do the work later.  This also cleans up roots if we
+	 * don't have it in the radix (like when we recover after a power fail
+	 * or unmount) so we don't leak memory.
+	 */
+	if (!for_reloc && root_dropped == false)
+		btrfs_add_dead_root(root);
+	if (err && err != -EAGAIN)
+		btrfs_std_error(root->fs_info, err);
+	return err;
+}
+
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
+ */
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct extent_buffer *node,
+			struct extent_buffer *parent)
+{
+	struct btrfs_path *path;
+	struct walk_control *wc;
+	int level;
+	int parent_level;
+	int ret = 0;
+	int wret;
+
+	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	wc = kzalloc(sizeof(*wc), GFP_NOFS);
+	if (!wc) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	btrfs_assert_tree_locked(parent);
+	parent_level = btrfs_header_level(parent);
+	extent_buffer_get(parent);
+	path->nodes[parent_level] = parent;
+	path->slots[parent_level] = btrfs_header_nritems(parent);
+
+	btrfs_assert_tree_locked(node);
+	level = btrfs_header_level(node);
+	path->nodes[level] = node;
+	path->slots[level] = 0;
+	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+
+	wc->refs[parent_level] = 1;
+	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+	wc->level = level;
+	wc->shared_level = -1;
+	wc->stage = DROP_REFERENCE;
+	wc->update_ref = 0;
+	wc->keep_locks = 1;
+	wc->for_reloc = 1;
+	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
+	while (1) {
+		wret = walk_down_tree(trans, root, path, wc);
+		if (wret < 0) {
+			ret = wret;
+			break;
+		}
+
+		wret = walk_up_tree(trans, root, path, wc, parent_level);
+		if (wret < 0)
+			ret = wret;
+		if (wret != 0)
+			break;
+	}
+
+	kfree(wc);
+	btrfs_free_path(path);
+	return ret;
+}
+
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+{
+	u64 num_devices;
+	u64 stripped;
+
+	/*
+	 * if restripe for this chunk_type is on pick target profile and
+	 * return, otherwise do the usual balance
+	 */
+	stripped = get_restripe_target(root->fs_info, flags);
+	if (stripped)
+		return extended_to_chunk(stripped);
+
+	num_devices = root->fs_info->fs_devices->rw_devices;
+
+	stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
+		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
+	if (num_devices == 1) {
+		stripped |= BTRFS_BLOCK_GROUP_DUP;
+		stripped = flags & ~stripped;
+
+		/* turn raid0 into single device chunks */
+		if (flags & BTRFS_BLOCK_GROUP_RAID0)
+			return stripped;
+
+		/* turn mirroring into duplication */
+		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+			     BTRFS_BLOCK_GROUP_RAID10))
+			return stripped | BTRFS_BLOCK_GROUP_DUP;
+	} else {
+		/* they already had raid on here, just return */
+		if (flags & stripped)
+			return flags;
+
+		stripped |= BTRFS_BLOCK_GROUP_DUP;
+		stripped = flags & ~stripped;
+
+		/* switch duplicated blocks with raid1 */
+		if (flags & BTRFS_BLOCK_GROUP_DUP)
+			return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+		/* this is drive concat, leave it alone */
+	}
+
+	return flags;
+}
+
+static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+{
+	struct btrfs_space_info *sinfo = cache->space_info;
+	u64 num_bytes;
+	u64 min_allocable_bytes;
+	int ret = -ENOSPC;
+
+
+	/*
+	 * We need some metadata space and system metadata space for
+	 * allocating chunks in some corner cases until we force to set
+	 * it to be readonly.
+	 */
+	if ((sinfo->flags &
+	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
+	    !force)
+		min_allocable_bytes = 1 * 1024 * 1024;
+	else
+		min_allocable_bytes = 0;
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&cache->lock);
+
+	if (cache->ro) {
+		ret = 0;
+		goto out;
+	}
+
+	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+
+	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
+	    min_allocable_bytes <= sinfo->total_bytes) {
+		sinfo->bytes_readonly += num_bytes;
+		cache->ro = 1;
+		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
+		ret = 0;
+	}
+out:
+	spin_unlock(&cache->lock);
+	spin_unlock(&sinfo->lock);
+	return ret;
+}
+
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache)
+
+{
+	struct btrfs_trans_handle *trans;
+	u64 alloc_flags;
+	int ret;
+
+	BUG_ON(cache->ro);
+
+again:
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	/*
+	 * we're not allowed to set block groups readonly after the dirty
+	 * block groups cache has started writing.  If it already started,
+	 * back off and let this transaction commit
+	 */
+	mutex_lock(&root->fs_info->ro_block_group_mutex);
+	if (trans->transaction->dirty_bg_run) {
+		u64 transid = trans->transid;
+
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+		btrfs_end_transaction(trans, root);
+
+		ret = btrfs_wait_for_commit(root, transid);
+		if (ret)
+			return ret;
+		goto again;
+	}
+
+	/*
+	 * if we are changing raid levels, try to allocate a corresponding
+	 * block group with the new raid level.
+	 */
+	alloc_flags = update_block_group_flags(root, cache->flags);
+	if (alloc_flags != cache->flags) {
+		ret = do_chunk_alloc(trans, root, alloc_flags,
+				     CHUNK_ALLOC_FORCE);
+		/*
+		 * ENOSPC is allowed here, we may have enough space
+		 * already allocated at the new raid level to
+		 * carry on
+		 */
+		if (ret == -ENOSPC)
+			ret = 0;
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = set_block_group_ro(cache, 0);
+	if (!ret)
+		goto out;
+	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+	ret = do_chunk_alloc(trans, root, alloc_flags,
+			     CHUNK_ALLOC_FORCE);
+	if (ret < 0)
+		goto out;
+	ret = set_block_group_ro(cache, 0);
+out:
+	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+		alloc_flags = update_block_group_flags(root, cache->flags);
+		lock_chunks(root->fs_info->chunk_root);
+		check_system_chunk(trans, root, alloc_flags);
+		unlock_chunks(root->fs_info->chunk_root);
+	}
+	mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 type)
+{
+	u64 alloc_flags = get_alloc_profile(root, type);
+	return do_chunk_alloc(trans, root, alloc_flags,
+			      CHUNK_ALLOC_FORCE);
+}
+
+/*
+ * helper to account the unused space of all the readonly block group in the
+ * space_info. takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+	struct btrfs_block_group_cache *block_group;
+	u64 free_bytes = 0;
+	int factor;
+
+	/* It's df, we don't care if it's racey */
+	if (list_empty(&sinfo->ro_bgs))
+		return 0;
+
+	spin_lock(&sinfo->lock);
+	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
+		spin_lock(&block_group->lock);
+
+		if (!block_group->ro) {
+			spin_unlock(&block_group->lock);
+			continue;
+		}
+
+		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+					  BTRFS_BLOCK_GROUP_RAID10 |
+					  BTRFS_BLOCK_GROUP_DUP))
+			factor = 2;
+		else
+			factor = 1;
+
+		free_bytes += (block_group->key.offset -
+			       btrfs_block_group_used(&block_group->item)) *
+			       factor;
+
+		spin_unlock(&block_group->lock);
+	}
+	spin_unlock(&sinfo->lock);
+
+	return free_bytes;
+}
+
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+			      struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_space_info *sinfo = cache->space_info;
+	u64 num_bytes;
+
+	BUG_ON(!cache->ro);
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&cache->lock);
+	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+	sinfo->bytes_readonly -= num_bytes;
+	cache->ro = 0;
+	list_del_init(&cache->ro_list);
+	spin_unlock(&cache->lock);
+	spin_unlock(&sinfo->lock);
+}
+
+/*
+ * checks to see if its even possible to relocate this block group.
+ *
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if its
+ * ok to go ahead and try.
+ */
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_device *device;
+	struct btrfs_trans_handle *trans;
+	u64 min_free;
+	u64 dev_min = 1;
+	u64 dev_nr = 0;
+	u64 target;
+	int index;
+	int full = 0;
+	int ret = 0;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+
+	/* odd, couldn't find the block group, leave it alone */
+	if (!block_group)
+		return -1;
+
+	min_free = btrfs_block_group_used(&block_group->item);
+
+	/* no bytes used, we're good */
+	if (!min_free)
+		goto out;
+
+	space_info = block_group->space_info;
+	spin_lock(&space_info->lock);
+
+	full = space_info->full;
+
+	/*
+	 * if this is the last block group we have in this space, we can't
+	 * relocate it unless we're able to allocate a new chunk below.
+	 *
+	 * Otherwise, we need to make sure we have room in the space to handle
+	 * all of the extents from this block group.  If we can, we're good
+	 */
+	if ((space_info->total_bytes != block_group->key.offset) &&
+	    (space_info->bytes_used + space_info->bytes_reserved +
+	     space_info->bytes_pinned + space_info->bytes_readonly +
+	     min_free < space_info->total_bytes)) {
+		spin_unlock(&space_info->lock);
+		goto out;
+	}
+	spin_unlock(&space_info->lock);
+
+	/*
+	 * ok we don't have enough space, but maybe we have free space on our
+	 * devices to allocate new chunks for relocation, so loop through our
+	 * alloc devices and guess if we have enough space.  if this block
+	 * group is going to be restriped, run checks against the target
+	 * profile instead of the current one.
+	 */
+	ret = -1;
+
+	/*
+	 * index:
+	 *      0: raid10
+	 *      1: raid1
+	 *      2: dup
+	 *      3: raid0
+	 *      4: single
+	 */
+	target = get_restripe_target(root->fs_info, block_group->flags);
+	if (target) {
+		index = __get_raid_index(extended_to_chunk(target));
+	} else {
+		/*
+		 * this is just a balance, so if we were marked as full
+		 * we know there is no space for a new chunk
+		 */
+		if (full)
+			goto out;
+
+		index = get_block_group_index(block_group);
+	}
+
+	if (index == BTRFS_RAID_RAID10) {
+		dev_min = 4;
+		/* Divide by 2 */
+		min_free >>= 1;
+	} else if (index == BTRFS_RAID_RAID1) {
+		dev_min = 2;
+	} else if (index == BTRFS_RAID_DUP) {
+		/* Multiply by 2 */
+		min_free <<= 1;
+	} else if (index == BTRFS_RAID_RAID0) {
+		dev_min = fs_devices->rw_devices;
+		min_free = div64_u64(min_free, dev_min);
+	}
+
+	/* We need to do this so that we can look at pending chunks */
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	mutex_lock(&root->fs_info->chunk_mutex);
+	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+		u64 dev_offset;
+
+		/*
+		 * check to make sure we can actually find a chunk with enough
+		 * space to fit our block group in.
+		 */
+		if (device->total_bytes > device->bytes_used + min_free &&
+		    !device->is_tgtdev_for_dev_replace) {
+			ret = find_free_dev_extent(trans, device, min_free,
+						   &dev_offset, NULL);
+			if (!ret)
+				dev_nr++;
+
+			if (dev_nr >= dev_min)
+				break;
+
+			ret = -1;
+		}
+	}
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	btrfs_end_transaction(trans, root);
+out:
+	btrfs_put_block_group(block_group);
+	return ret;
+}
+
+static int find_first_block_group(struct btrfs_root *root,
+		struct btrfs_path *path, struct btrfs_key *key)
+{
+	int ret = 0;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int slot;
+
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto out;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.objectid >= key->objectid &&
+		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			ret = 0;
+			goto out;
+		}
+		path->slots[0]++;
+	}
+out:
+	return ret;
+}
+
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_block_group_cache *block_group;
+	u64 last = 0;
+
+	while (1) {
+		struct inode *inode;
+
+		block_group = btrfs_lookup_first_block_group(info, last);
+		while (block_group) {
+			spin_lock(&block_group->lock);
+			if (block_group->iref)
+				break;
+			spin_unlock(&block_group->lock);
+			block_group = next_block_group(info->tree_root,
+						       block_group);
+		}
+		if (!block_group) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+
+		inode = block_group->inode;
+		block_group->iref = 0;
+		block_group->inode = NULL;
+		spin_unlock(&block_group->lock);
+		iput(inode);
+		last = block_group->key.objectid + block_group->key.offset;
+		btrfs_put_block_group(block_group);
+	}
+}
+
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_caching_control *caching_ctl;
+	struct rb_node *n;
+
+	down_write(&info->commit_root_sem);
+	while (!list_empty(&info->caching_block_groups)) {
+		caching_ctl = list_entry(info->caching_block_groups.next,
+					 struct btrfs_caching_control, list);
+		list_del(&caching_ctl->list);
+		put_caching_control(caching_ctl);
+	}
+	up_write(&info->commit_root_sem);
+
+	spin_lock(&info->unused_bgs_lock);
+	while (!list_empty(&info->unused_bgs)) {
+		block_group = list_first_entry(&info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&info->unused_bgs_lock);
+
+	spin_lock(&info->block_group_cache_lock);
+	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+		block_group = rb_entry(n, struct btrfs_block_group_cache,
+				       cache_node);
+		rb_erase(&block_group->cache_node,
+			 &info->block_group_cache_tree);
+		RB_CLEAR_NODE(&block_group->cache_node);
+		spin_unlock(&info->block_group_cache_lock);
+
+		down_write(&block_group->space_info->groups_sem);
+		list_del(&block_group->list);
+		up_write(&block_group->space_info->groups_sem);
+
+		if (block_group->cached == BTRFS_CACHE_STARTED)
+			wait_block_group_cache_done(block_group);
+
+		/*
+		 * We haven't cached this block group, which means we could
+		 * possibly have excluded extents on this block group.
+		 */
+		if (block_group->cached == BTRFS_CACHE_NO ||
+		    block_group->cached == BTRFS_CACHE_ERROR)
+			free_excluded_extents(info->extent_root, block_group);
+
+		btrfs_remove_free_space_cache(block_group);
+		btrfs_put_block_group(block_group);
+
+		spin_lock(&info->block_group_cache_lock);
+	}
+	spin_unlock(&info->block_group_cache_lock);
+
+	/* now that all the block groups are freed, go through and
+	 * free all the space_info structs.  This is only called during
+	 * the final stages of unmount, and so we know nobody is
+	 * using them.  We call synchronize_rcu() once before we start,
+	 * just to be on the safe side.
+	 */
+	synchronize_rcu();
+
+	release_global_block_rsv(info);
+
+	while (!list_empty(&info->space_info)) {
+		int i;
+
+		space_info = list_entry(info->space_info.next,
+					struct btrfs_space_info,
+					list);
+		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+			if (WARN_ON(space_info->bytes_pinned > 0 ||
+			    space_info->bytes_reserved > 0 ||
+			    space_info->bytes_may_use > 0)) {
+				dump_space_info(space_info, 0, 0);
+			}
+		}
+		list_del(&space_info->list);
+		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+			struct kobject *kobj;
+			kobj = space_info->block_group_kobjs[i];
+			space_info->block_group_kobjs[i] = NULL;
+			if (kobj) {
+				kobject_del(kobj);
+				kobject_put(kobj);
+			}
+		}
+		kobject_del(&space_info->kobj);
+		kobject_put(&space_info->kobj);
+	}
+	return 0;
+}
+
+static void __link_block_group(struct btrfs_space_info *space_info,
+			       struct btrfs_block_group_cache *cache)
+{
+	int index = get_block_group_index(cache);
+	bool first = false;
+
+	down_write(&space_info->groups_sem);
+	if (list_empty(&space_info->block_groups[index]))
+		first = true;
+	list_add_tail(&cache->list, &space_info->block_groups[index]);
+	up_write(&space_info->groups_sem);
+
+	if (first) {
+		struct raid_kobject *rkobj;
+		int ret;
+
+		rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+		if (!rkobj)
+			goto out_err;
+		rkobj->raid_type = index;
+		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+				  "%s", get_raid_name(index));
+		if (ret) {
+			kobject_put(&rkobj->kobj);
+			goto out_err;
+		}
+		space_info->block_group_kobjs[index] = &rkobj->kobj;
+	}
+
+	return;
+out_err:
+	pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
+}
+
+static struct btrfs_block_group_cache *
+btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = kzalloc(sizeof(*cache), GFP_NOFS);
+	if (!cache)
+		return NULL;
+
+	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+					GFP_NOFS);
+	if (!cache->free_space_ctl) {
+		kfree(cache);
+		return NULL;
+	}
+
+	cache->key.objectid = start;
+	cache->key.offset = size;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+
+	cache->sectorsize = root->sectorsize;
+	cache->fs_info = root->fs_info;
+	cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       start);
+	atomic_set(&cache->count, 1);
+	spin_lock_init(&cache->lock);
+	init_rwsem(&cache->data_rwsem);
+	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->bg_list);
+	INIT_LIST_HEAD(&cache->ro_list);
+	INIT_LIST_HEAD(&cache->dirty_list);
+	INIT_LIST_HEAD(&cache->io_list);
+	btrfs_init_free_space_ctl(cache);
+	atomic_set(&cache->trimming, 0);
+
+	return cache;
+}
+
+int btrfs_read_block_groups(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *space_info;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int need_clear = 0;
+	u64 cache_gen;
+
+	root = info->extent_root;
+	key.objectid = 0;
+	key.offset = 0;
+	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 1;
+
+	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+	if (btrfs_test_opt(root, SPACE_CACHE) &&
+	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
+		need_clear = 1;
+	if (btrfs_test_opt(root, CLEAR_CACHE))
+		need_clear = 1;
+
+	while (1) {
+		ret = find_first_block_group(root, path, &key);
+		if (ret > 0)
+			break;
+		if (ret != 0)
+			goto error;
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		cache = btrfs_create_block_group_cache(root, found_key.objectid,
+						       found_key.offset);
+		if (!cache) {
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		if (need_clear) {
+			/*
+			 * When we mount with old space cache, we need to
+			 * set BTRFS_DC_CLEAR and set dirty flag.
+			 *
+			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+			 *    truncate the old free space cache inode and
+			 *    setup a new one.
+			 * b) Setting 'dirty flag' makes sure that we flush
+			 *    the new space cache info onto disk.
+			 */
+			if (btrfs_test_opt(root, SPACE_CACHE))
+				cache->disk_cache_state = BTRFS_DC_CLEAR;
+		}
+
+		read_extent_buffer(leaf, &cache->item,
+				   btrfs_item_ptr_offset(leaf, path->slots[0]),
+				   sizeof(cache->item));
+		cache->flags = btrfs_block_group_flags(&cache->item);
+
+		key.objectid = found_key.objectid + found_key.offset;
+		btrfs_release_path(path);
+
+		/*
+		 * We need to exclude the super stripes now so that the space
+		 * info has super bytes accounted for, otherwise we'll think
+		 * we have more space than we actually do.
+		 */
+		ret = exclude_super_stripes(root, cache);
+		if (ret) {
+			/*
+			 * We may have excluded something, so call this just in
+			 * case.
+			 */
+			free_excluded_extents(root, cache);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
+		/*
+		 * check for two cases, either we are full, and therefore
+		 * don't need to bother with the caching work since we won't
+		 * find any space, or we are empty, and we can just add all
+		 * the space in and be done with it.  This saves us _alot_ of
+		 * time, particularly in the full case.
+		 */
+		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+			cache->last_byte_to_unpin = (u64)-1;
+			cache->cached = BTRFS_CACHE_FINISHED;
+			free_excluded_extents(root, cache);
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			cache->last_byte_to_unpin = (u64)-1;
+			cache->cached = BTRFS_CACHE_FINISHED;
+			add_new_free_space(cache, root->fs_info,
+					   found_key.objectid,
+					   found_key.objectid +
+					   found_key.offset);
+			free_excluded_extents(root, cache);
+		}
+
+		ret = btrfs_add_block_group_cache(root->fs_info, cache);
+		if (ret) {
+			btrfs_remove_free_space_cache(cache);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
+		ret = update_space_info(info, cache->flags, found_key.offset,
+					btrfs_block_group_used(&cache->item),
+					&space_info);
+		if (ret) {
+			btrfs_remove_free_space_cache(cache);
+			spin_lock(&info->block_group_cache_lock);
+			rb_erase(&cache->cache_node,
+				 &info->block_group_cache_tree);
+			RB_CLEAR_NODE(&cache->cache_node);
+			spin_unlock(&info->block_group_cache_lock);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
+		cache->space_info = space_info;
+		spin_lock(&cache->space_info->lock);
+		cache->space_info->bytes_readonly += cache->bytes_super;
+		spin_unlock(&cache->space_info->lock);
+
+		__link_block_group(space_info, cache);
+
+		set_avail_alloc_bits(root->fs_info, cache->flags);
+		if (btrfs_chunk_readonly(root, cache->key.objectid)) {
+			set_block_group_ro(cache, 1);
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			spin_lock(&info->unused_bgs_lock);
+			/* Should always be true but just in case. */
+			if (list_empty(&cache->bg_list)) {
+				btrfs_get_block_group(cache);
+				list_add_tail(&cache->bg_list,
+					      &info->unused_bgs);
+			}
+			spin_unlock(&info->unused_bgs_lock);
+		}
+	}
+
+	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+		if (!(get_alloc_profile(root, space_info->flags) &
+		      (BTRFS_BLOCK_GROUP_RAID10 |
+		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 |
+		       BTRFS_BLOCK_GROUP_DUP)))
+			continue;
+		/*
+		 * avoid allocating from un-mirrored block group if there are
+		 * mirrored block groups.
+		 */
+		list_for_each_entry(cache,
+				&space_info->block_groups[BTRFS_RAID_RAID0],
+				list)
+			set_block_group_ro(cache, 1);
+		list_for_each_entry(cache,
+				&space_info->block_groups[BTRFS_RAID_SINGLE],
+				list)
+			set_block_group_ro(cache, 1);
+	}
+
+	init_global_block_rsv(info);
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *block_group, *tmp;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_block_group_item item;
+	struct btrfs_key key;
+	int ret = 0;
+
+	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+		if (ret)
+			goto next;
+
+		spin_lock(&block_group->lock);
+		memcpy(&item, &block_group->item, sizeof(item));
+		memcpy(&key, &block_group->key, sizeof(key));
+		spin_unlock(&block_group->lock);
+
+		ret = btrfs_insert_item(trans, extent_root, &key, &item,
+					sizeof(item));
+		if (ret)
+			btrfs_abort_transaction(trans, extent_root, ret);
+		ret = btrfs_finish_chunk_alloc(trans, extent_root,
+					       key.objectid, key.offset);
+		if (ret)
+			btrfs_abort_transaction(trans, extent_root, ret);
+next:
+		list_del_init(&block_group->bg_list);
+	}
+}
+
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_objectid, u64 chunk_offset,
+			   u64 size)
+{
+	int ret;
+	struct btrfs_root *extent_root;
+	struct btrfs_block_group_cache *cache;
+
+	extent_root = root->fs_info->extent_root;
+
+	btrfs_set_log_full_commit(root->fs_info, trans);
+
+	cache = btrfs_create_block_group_cache(root, chunk_offset, size);
+	if (!cache)
+		return -ENOMEM;
+
+	btrfs_set_block_group_used(&cache->item, bytes_used);
+	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
+	btrfs_set_block_group_flags(&cache->item, type);
+
+	cache->flags = type;
+	cache->last_byte_to_unpin = (u64)-1;
+	cache->cached = BTRFS_CACHE_FINISHED;
+	ret = exclude_super_stripes(root, cache);
+	if (ret) {
+		/*
+		 * We may have excluded something, so call this just in
+		 * case.
+		 */
+		free_excluded_extents(root, cache);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+
+	add_new_free_space(cache, root->fs_info, chunk_offset,
+			   chunk_offset + size);
+
+	free_excluded_extents(root, cache);
+
+	ret = btrfs_add_block_group_cache(root->fs_info, cache);
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+
+	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+				&cache->space_info);
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		spin_lock(&root->fs_info->block_group_cache_lock);
+		rb_erase(&cache->cache_node,
+			 &root->fs_info->block_group_cache_tree);
+		RB_CLEAR_NODE(&cache->cache_node);
+		spin_unlock(&root->fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+	update_global_block_rsv(root->fs_info);
+
+	spin_lock(&cache->space_info->lock);
+	cache->space_info->bytes_readonly += cache->bytes_super;
+	spin_unlock(&cache->space_info->lock);
+
+	__link_block_group(cache->space_info, cache);
+
+	list_add_tail(&cache->bg_list, &trans->new_bgs);
+
+	set_avail_alloc_bits(extent_root->fs_info, type);
+
+	return 0;
+}
+
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = chunk_to_extended(flags) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	write_seqlock(&fs_info->profiles_lock);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits &= ~extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 group_start,
+			     struct extent_map *em)
+{
+	struct btrfs_path *path;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_free_cluster *cluster;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
+	struct btrfs_key key;
+	struct inode *inode;
+	struct kobject *kobj = NULL;
+	int ret;
+	int index;
+	int factor;
+	struct btrfs_caching_control *caching_ctl = NULL;
+	bool remove_em;
+
+	root = root->fs_info->extent_root;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
+	BUG_ON(!block_group);
+	BUG_ON(!block_group->ro);
+
+	/*
+	 * Free the reserved super bytes from this block group before
+	 * remove it.
+	 */
+	free_excluded_extents(root, block_group);
+
+	memcpy(&key, &block_group->key, sizeof(key));
+	index = get_block_group_index(block_group);
+	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
+				  BTRFS_BLOCK_GROUP_RAID1 |
+				  BTRFS_BLOCK_GROUP_RAID10))
+		factor = 2;
+	else
+		factor = 1;
+
+	/* make sure this block group isn't part of an allocation cluster */
+	cluster = &root->fs_info->data_alloc_cluster;
+	spin_lock(&cluster->refill_lock);
+	btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&cluster->refill_lock);
+
+	/*
+	 * make sure this block group isn't part of a metadata
+	 * allocation cluster
+	 */
+	cluster = &root->fs_info->meta_alloc_cluster;
+	spin_lock(&cluster->refill_lock);
+	btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&cluster->refill_lock);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * get the inode first so any iput calls done for the io_list
+	 * aren't the final iput (no unlinks allowed now)
+	 */
+	inode = lookup_free_space_inode(tree_root, block_group, path);
+
+	mutex_lock(&trans->transaction->cache_write_mutex);
+	/*
+	 * make sure our free spache cache IO is done before remove the
+	 * free space inode
+	 */
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (!list_empty(&block_group->io_list)) {
+		list_del_init(&block_group->io_list);
+
+		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+		btrfs_wait_cache_io(root, trans, block_group,
+				    &block_group->io_ctl, path,
+				    block_group->key.objectid);
+		btrfs_put_block_group(block_group);
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+	}
+
+	if (!list_empty(&block_group->dirty_list)) {
+		list_del_init(&block_group->dirty_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+	mutex_unlock(&trans->transaction->cache_write_mutex);
+
+	if (!IS_ERR(inode)) {
+		ret = btrfs_orphan_add(trans, inode);
+		if (ret) {
+			btrfs_add_delayed_iput(inode);
+			goto out;
+		}
+		clear_nlink(inode);
+		/* One for the block groups ref */
+		spin_lock(&block_group->lock);
+		if (block_group->iref) {
+			block_group->iref = 0;
+			block_group->inode = NULL;
+			spin_unlock(&block_group->lock);
+			iput(inode);
+		} else {
+			spin_unlock(&block_group->lock);
+		}
+		/* One for our lookup ref */
+		btrfs_add_delayed_iput(inode);
+	}
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0)
+		btrfs_release_path(path);
+	if (ret == 0) {
+		ret = btrfs_del_item(trans, tree_root, path);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+	}
+
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	rb_erase(&block_group->cache_node,
+		 &root->fs_info->block_group_cache_tree);
+	RB_CLEAR_NODE(&block_group->cache_node);
+
+	if (root->fs_info->first_logical_byte == block_group->key.objectid)
+		root->fs_info->first_logical_byte = (u64)-1;
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+
+	down_write(&block_group->space_info->groups_sem);
+	/*
+	 * we must use list_del_init so people can check to see if they
+	 * are still on the list after taking the semaphore
+	 */
+	list_del_init(&block_group->list);
+	if (list_empty(&block_group->space_info->block_groups[index])) {
+		kobj = block_group->space_info->block_group_kobjs[index];
+		block_group->space_info->block_group_kobjs[index] = NULL;
+		clear_avail_alloc_bits(root->fs_info, block_group->flags);
+	}
+	up_write(&block_group->space_info->groups_sem);
+	if (kobj) {
+		kobject_del(kobj);
+		kobject_put(kobj);
+	}
+
+	if (block_group->has_caching_ctl)
+		caching_ctl = get_caching_control(block_group);
+	if (block_group->cached == BTRFS_CACHE_STARTED)
+		wait_block_group_cache_done(block_group);
+	if (block_group->has_caching_ctl) {
+		down_write(&root->fs_info->commit_root_sem);
+		if (!caching_ctl) {
+			struct btrfs_caching_control *ctl;
+
+			list_for_each_entry(ctl,
+				    &root->fs_info->caching_block_groups, list)
+				if (ctl->block_group == block_group) {
+					caching_ctl = ctl;
+					atomic_inc(&caching_ctl->count);
+					break;
+				}
+		}
+		if (caching_ctl)
+			list_del_init(&caching_ctl->list);
+		up_write(&root->fs_info->commit_root_sem);
+		if (caching_ctl) {
+			/* Once for the caching bgs list and once for us. */
+			put_caching_control(caching_ctl);
+			put_caching_control(caching_ctl);
+		}
+	}
+
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (!list_empty(&block_group->dirty_list)) {
+		WARN_ON(1);
+	}
+	if (!list_empty(&block_group->io_list)) {
+		WARN_ON(1);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+	btrfs_remove_free_space_cache(block_group);
+
+	spin_lock(&block_group->space_info->lock);
+	list_del_init(&block_group->ro_list);
+
+	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+		WARN_ON(block_group->space_info->total_bytes
+			< block_group->key.offset);
+		WARN_ON(block_group->space_info->bytes_readonly
+			< block_group->key.offset);
+		WARN_ON(block_group->space_info->disk_total
+			< block_group->key.offset * factor);
+	}
+	block_group->space_info->total_bytes -= block_group->key.offset;
+	block_group->space_info->bytes_readonly -= block_group->key.offset;
+	block_group->space_info->disk_total -= block_group->key.offset * factor;
+
+	spin_unlock(&block_group->space_info->lock);
+
+	memcpy(&key, &block_group->key, sizeof(key));
+
+	lock_chunks(root);
+	if (!list_empty(&em->list)) {
+		/* We're in the transaction->pending_chunks list. */
+		free_extent_map(em);
+	}
+	spin_lock(&block_group->lock);
+	block_group->removed = 1;
+	/*
+	 * At this point trimming can't start on this block group, because we
+	 * removed the block group from the tree fs_info->block_group_cache_tree
+	 * so no one can't find it anymore and even if someone already got this
+	 * block group before we removed it from the rbtree, they have already
+	 * incremented block_group->trimming - if they didn't, they won't find
+	 * any free space entries because we already removed them all when we
+	 * called btrfs_remove_free_space_cache().
+	 *
+	 * And we must not remove the extent map from the fs_info->mapping_tree
+	 * to prevent the same logical address range and physical device space
+	 * ranges from being reused for a new block group. This is because our
+	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+	 * completely transactionless, so while it is trimming a range the
+	 * currently running transaction might finish and a new one start,
+	 * allowing for new block groups to be created that can reuse the same
+	 * physical device locations unless we take this special care.
+	 */
+	remove_em = (atomic_read(&block_group->trimming) == 0);
+	/*
+	 * Make sure a trimmer task always sees the em in the pinned_chunks list
+	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
+	 * before checking block_group->removed).
+	 */
+	if (!remove_em) {
+		/*
+		 * Our em might be in trans->transaction->pending_chunks which
+		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+		 * and so is the fs_info->pinned_chunks list.
+		 *
+		 * So at this point we must be holding the chunk_mutex to avoid
+		 * any races with chunk allocation (more specifically at
+		 * volumes.c:contains_pending_extent()), to ensure it always
+		 * sees the em, either in the pending_chunks list or in the
+		 * pinned_chunks list.
+		 */
+		list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+	}
+	spin_unlock(&block_group->lock);
+
+	if (remove_em) {
+		struct extent_map_tree *em_tree;
+
+		em_tree = &root->fs_info->mapping_tree.map_tree;
+		write_lock(&em_tree->lock);
+		/*
+		 * The em might be in the pending_chunks list, so make sure the
+		 * chunk mutex is locked, since remove_extent_mapping() will
+		 * delete us from that list.
+		 */
+		remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+
+	unlock_chunks(root);
+
+	btrfs_put_block_group(block_group);
+	btrfs_put_block_group(block_group);
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -EIO;
+	if (ret < 0)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (!fs_info->open)
+		return;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	while (!list_empty(&fs_info->unused_bgs)) {
+		u64 start, end;
+
+		block_group = list_first_entry(&fs_info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		space_info = block_group->space_info;
+		list_del_init(&block_group->bg_list);
+		if (ret || btrfs_mixed_space_info(space_info)) {
+			btrfs_put_block_group(block_group);
+			continue;
+		}
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		/* Don't want to race with allocators so take the groups_sem */
+		down_write(&space_info->groups_sem);
+		spin_lock(&block_group->lock);
+		if (block_group->reserved ||
+		    btrfs_block_group_used(&block_group->item) ||
+		    block_group->ro) {
+			/*
+			 * We want to bail if we made new allocations or have
+			 * outstanding allocations in this block group.  We do
+			 * the ro check in case balance is currently acting on
+			 * this block group.
+			 */
+			spin_unlock(&block_group->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+		spin_unlock(&block_group->lock);
+
+		/* We don't want to force the issue, only flip if it's ok. */
+		ret = set_block_group_ro(block_group, 0);
+		up_write(&space_info->groups_sem);
+		if (ret < 0) {
+			ret = 0;
+			goto next;
+		}
+
+		/*
+		 * Want to do this before we do anything else so we can recover
+		 * properly if we fail to join the transaction.
+		 */
+		/* 1 for btrfs_orphan_reserve_metadata() */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			btrfs_set_block_group_rw(root, block_group);
+			ret = PTR_ERR(trans);
+			goto next;
+		}
+
+		/*
+		 * We could have pending pinned extents for this block group,
+		 * just delete them, we don't care about them anymore.
+		 */
+		start = block_group->key.objectid;
+		end = start + block_group->key.offset - 1;
+		/*
+		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
+		 * btrfs_finish_extent_commit(). If we are at transaction N,
+		 * another task might be running finish_extent_commit() for the
+		 * previous transaction N - 1, and have seen a range belonging
+		 * to the block group in freed_extents[] before we were able to
+		 * clear the whole block group range from freed_extents[]. This
+		 * means that task can lookup for the block group after we
+		 * unpinned it from freed_extents[] and removed it, leading to
+		 * a BUG_ON() at btrfs_unpin_extent_range().
+		 */
+		mutex_lock(&fs_info->unused_bg_unpin_mutex);
+		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
+				  EXTENT_DIRTY, GFP_NOFS);
+		if (ret) {
+			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+			btrfs_set_block_group_rw(root, block_group);
+			goto end_trans;
+		}
+		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
+				  EXTENT_DIRTY, GFP_NOFS);
+		if (ret) {
+			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+			btrfs_set_block_group_rw(root, block_group);
+			goto end_trans;
+		}
+		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+
+		/* Reset pinned so btrfs_put_block_group doesn't complain */
+		spin_lock(&space_info->lock);
+		spin_lock(&block_group->lock);
+
+		space_info->bytes_pinned -= block_group->pinned;
+		space_info->bytes_readonly += block_group->pinned;
+		percpu_counter_add(&space_info->total_bytes_pinned,
+				   -block_group->pinned);
+		block_group->pinned = 0;
+
+		spin_unlock(&block_group->lock);
+		spin_unlock(&space_info->lock);
+
+		/*
+		 * Btrfs_remove_chunk will abort the transaction if things go
+		 * horribly wrong.
+		 */
+		ret = btrfs_remove_chunk(trans, root,
+					 block_group->key.objectid);
+end_trans:
+		btrfs_end_transaction(trans, root);
+next:
+		btrfs_put_block_group(block_group);
+		spin_lock(&fs_info->unused_bgs_lock);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *space_info;
+	struct btrfs_super_block *disk_super;
+	u64 features;
+	u64 flags;
+	int mixed = 0;
+	int ret;
+
+	disk_super = fs_info->super_copy;
+	if (!btrfs_super_root(disk_super))
+		return 1;
+
+	features = btrfs_super_incompat_flags(disk_super);
+	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+		mixed = 1;
+
+	flags = BTRFS_BLOCK_GROUP_SYSTEM;
+	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+	if (ret)
+		goto out;
+
+	if (mixed) {
+		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
+		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+	} else {
+		flags = BTRFS_BLOCK_GROUP_METADATA;
+		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+		if (ret)
+			goto out;
+
+		flags = BTRFS_BLOCK_GROUP_DATA;
+		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+	}
+out:
+	return ret;
+}
+
+int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{
+	return unpin_extent_range(root, start, end, false);
+}
+
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_group_cache *cache = NULL;
+	u64 group_trimmed;
+	u64 start;
+	u64 end;
+	u64 trimmed = 0;
+	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+	int ret = 0;
+
+	/*
+	 * try to trim all FS space, our block group may start from non-zero.
+	 */
+	if (range->len == total_bytes)
+		cache = btrfs_lookup_first_block_group(fs_info, range->start);
+	else
+		cache = btrfs_lookup_block_group(fs_info, range->start);
+
+	while (cache) {
+		if (cache->key.objectid >= (range->start + range->len)) {
+			btrfs_put_block_group(cache);
+			break;
+		}
+
+		start = max(range->start, cache->key.objectid);
+		end = min(range->start + range->len,
+				cache->key.objectid + cache->key.offset);
+
+		if (end - start >= range->minlen) {
+			if (!block_group_cache_done(cache)) {
+				ret = cache_block_group(cache, 0);
+				if (ret) {
+					btrfs_put_block_group(cache);
+					break;
+				}
+				ret = wait_block_group_cache_done(cache);
+				if (ret) {
+					btrfs_put_block_group(cache);
+					break;
+				}
+			}
+			ret = btrfs_trim_block_group(cache,
+						     &group_trimmed,
+						     start,
+						     end,
+						     range->minlen);
+
+			trimmed += group_trimmed;
+			if (ret) {
+				btrfs_put_block_group(cache);
+				break;
+			}
+		}
+
+		cache = next_block_group(fs_info->tree_root, cache);
+	}
+
+	range->len = trimmed;
+	return ret;
+}
+
+/*
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data into disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing and that cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
+ */
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
+{
+	percpu_counter_dec(&root->subv_writers->counter);
+	/*
+	 * Make sure counter is updated before we wake up
+	 * waiters.
+	 */
+	smp_mb();
+	if (waitqueue_active(&root->subv_writers->wait))
+		wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
+{
+	if (atomic_read(&root->will_be_snapshoted))
+		return 0;
+
+	percpu_counter_inc(&root->subv_writers->counter);
+	/*
+	 * Make sure counter is updated before we check for snapshot creation.
+	 */
+	smp_mb();
+	if (atomic_read(&root->will_be_snapshoted)) {
+		btrfs_end_write_no_snapshoting(root);
+		return 0;
+	}
+	return 1;
+}
diff --git a/kernel/fs/btrfs/extent_io.c b/kernel/fs/btrfs/extent_io.c
new file mode 100644
index 000000000..c32d226bf
--- /dev/null
+++ b/kernel/fs/btrfs/extent_io.c
@@ -0,0 +1,5632 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/prefetch.h>
+#include <linux/cleancache.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "check-integrity.h"
+#include "locking.h"
+#include "rcu-string.h"
+#include "backref.h"
+
+static struct kmem_cache *extent_state_cache;
+static struct kmem_cache *extent_buffer_cache;
+static struct bio_set *btrfs_bioset;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+	return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline
+void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&leak_lock, flags);
+	list_add(new, head);
+	spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_del(struct list_head *entry)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&leak_lock, flags);
+	list_del(entry);
+	spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_check(void)
+{
+	struct extent_state *state;
+	struct extent_buffer *eb;
+
+	while (!list_empty(&states)) {
+		state = list_entry(states.next, struct extent_state, leak_list);
+		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+		       state->start, state->end, state->state,
+		       extent_state_in_tree(state),
+		       atomic_read(&state->refs));
+		list_del(&state->leak_list);
+		kmem_cache_free(extent_state_cache, state);
+	}
+
+	while (!list_empty(&buffers)) {
+		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
+		       "refs %d\n",
+		       eb->start, eb->len, atomic_read(&eb->refs));
+		list_del(&eb->leak_list);
+		kmem_cache_free(extent_buffer_cache, eb);
+	}
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end)		\
+	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+		struct extent_io_tree *tree, u64 start, u64 end)
+{
+	struct inode *inode;
+	u64 isize;
+
+	if (!tree->mapping)
+		return;
+
+	inode = tree->mapping->host;
+	isize = i_size_read(inode);
+	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+		printk_ratelimited(KERN_DEBUG
+		    "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+				caller, btrfs_ino(inode), isize, start, end);
+	}
+}
+#else
+#define btrfs_leak_debug_add(new, head)	do {} while (0)
+#define btrfs_leak_debug_del(entry)	do {} while (0)
+#define btrfs_leak_debug_check()	do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
+#endif
+
+#define BUFFER_LRU_MAX 64
+
+struct tree_entry {
+	u64 start;
+	u64 end;
+	struct rb_node rb_node;
+};
+
+struct extent_page_data {
+	struct bio *bio;
+	struct extent_io_tree *tree;
+	get_extent_t *get_extent;
+	unsigned long bio_flags;
+
+	/* tells writepage not to lock the state bits for this range
+	 * it still does the unlocking
+	 */
+	unsigned int extent_locked:1;
+
+	/* tells the submit_bio code to use a WRITE_SYNC */
+	unsigned int sync_io:1;
+};
+
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+	if (!tree->mapping)
+		return NULL;
+	return btrfs_sb(tree->mapping->host->i_sb);
+}
+
+int __init extent_io_init(void)
+{
+	extent_state_cache = kmem_cache_create("btrfs_extent_state",
+			sizeof(struct extent_state), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!extent_state_cache)
+		return -ENOMEM;
+
+	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
+			sizeof(struct extent_buffer), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!extent_buffer_cache)
+		goto free_state_cache;
+
+	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
+				     offsetof(struct btrfs_io_bio, bio));
+	if (!btrfs_bioset)
+		goto free_buffer_cache;
+
+	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
+		goto free_bioset;
+
+	return 0;
+
+free_bioset:
+	bioset_free(btrfs_bioset);
+	btrfs_bioset = NULL;
+
+free_buffer_cache:
+	kmem_cache_destroy(extent_buffer_cache);
+	extent_buffer_cache = NULL;
+
+free_state_cache:
+	kmem_cache_destroy(extent_state_cache);
+	extent_state_cache = NULL;
+	return -ENOMEM;
+}
+
+void extent_io_exit(void)
+{
+	btrfs_leak_debug_check();
+
+	/*
+	 * Make sure all delayed rcu free are flushed before we
+	 * destroy caches.
+	 */
+	rcu_barrier();
+	if (extent_state_cache)
+		kmem_cache_destroy(extent_state_cache);
+	if (extent_buffer_cache)
+		kmem_cache_destroy(extent_buffer_cache);
+	if (btrfs_bioset)
+		bioset_free(btrfs_bioset);
+}
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+			 struct address_space *mapping)
+{
+	tree->state = RB_ROOT;
+	tree->ops = NULL;
+	tree->dirty_bytes = 0;
+	spin_lock_init(&tree->lock);
+	tree->mapping = mapping;
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+	struct extent_state *state;
+
+	state = kmem_cache_alloc(extent_state_cache, mask);
+	if (!state)
+		return state;
+	state->state = 0;
+	state->private = 0;
+	RB_CLEAR_NODE(&state->rb_node);
+	btrfs_leak_debug_add(&state->leak_list, &states);
+	atomic_set(&state->refs, 1);
+	init_waitqueue_head(&state->wq);
+	trace_alloc_extent_state(state, mask, _RET_IP_);
+	return state;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+	if (!state)
+		return;
+	if (atomic_dec_and_test(&state->refs)) {
+		WARN_ON(extent_state_in_tree(state));
+		btrfs_leak_debug_del(&state->leak_list);
+		trace_free_extent_state(state, _RET_IP_);
+		kmem_cache_free(extent_state_cache, state);
+	}
+}
+
+static struct rb_node *tree_insert(struct rb_root *root,
+				   struct rb_node *search_start,
+				   u64 offset,
+				   struct rb_node *node,
+				   struct rb_node ***p_in,
+				   struct rb_node **parent_in)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct tree_entry *entry;
+
+	if (p_in && parent_in) {
+		p = *p_in;
+		parent = *parent_in;
+		goto do_insert;
+	}
+
+	p = search_start ? &search_start : &root->rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (offset < entry->start)
+			p = &(*p)->rb_left;
+		else if (offset > entry->end)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+do_insert:
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
+				      struct rb_node **prev_ret,
+				      struct rb_node **next_ret,
+				      struct rb_node ***p_ret,
+				      struct rb_node **parent_ret)
+{
+	struct rb_root *root = &tree->state;
+	struct rb_node **n = &root->rb_node;
+	struct rb_node *prev = NULL;
+	struct rb_node *orig_prev = NULL;
+	struct tree_entry *entry;
+	struct tree_entry *prev_entry = NULL;
+
+	while (*n) {
+		prev = *n;
+		entry = rb_entry(prev, struct tree_entry, rb_node);
+		prev_entry = entry;
+
+		if (offset < entry->start)
+			n = &(*n)->rb_left;
+		else if (offset > entry->end)
+			n = &(*n)->rb_right;
+		else
+			return *n;
+	}
+
+	if (p_ret)
+		*p_ret = n;
+	if (parent_ret)
+		*parent_ret = prev;
+
+	if (prev_ret) {
+		orig_prev = prev;
+		while (prev && offset > prev_entry->end) {
+			prev = rb_next(prev);
+			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		}
+		*prev_ret = prev;
+		prev = orig_prev;
+	}
+
+	if (next_ret) {
+		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		while (prev && offset < prev_entry->start) {
+			prev = rb_prev(prev);
+			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		}
+		*next_ret = prev;
+	}
+	return NULL;
+}
+
+static inline struct rb_node *
+tree_search_for_insert(struct extent_io_tree *tree,
+		       u64 offset,
+		       struct rb_node ***p_ret,
+		       struct rb_node **parent_ret)
+{
+	struct rb_node *prev = NULL;
+	struct rb_node *ret;
+
+	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+static inline struct rb_node *tree_search(struct extent_io_tree *tree,
+					  u64 offset)
+{
+	return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+		     struct extent_state *other)
+{
+	if (tree->ops && tree->ops->merge_extent_hook)
+		tree->ops->merge_extent_hook(tree->mapping->host, new,
+					     other);
+}
+
+/*
+ * utility function to look for merge candidates inside a given range.
+ * Any extents with matching state are merged together into a single
+ * extent in the tree.  Extents with EXTENT_IO in their state field
+ * are not merged because the end_io handlers need to be able to do
+ * operations on them without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree,
+		        struct extent_state *state)
+{
+	struct extent_state *other;
+	struct rb_node *other_node;
+
+	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		return;
+
+	other_node = rb_prev(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->end == state->start - 1 &&
+		    other->state == state->state) {
+			merge_cb(tree, state, other);
+			state->start = other->start;
+			rb_erase(&other->rb_node, &tree->state);
+			RB_CLEAR_NODE(&other->rb_node);
+			free_extent_state(other);
+		}
+	}
+	other_node = rb_next(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->start == state->end + 1 &&
+		    other->state == state->state) {
+			merge_cb(tree, state, other);
+			state->end = other->end;
+			rb_erase(&other->rb_node, &tree->state);
+			RB_CLEAR_NODE(&other->rb_node);
+			free_extent_state(other);
+		}
+	}
+}
+
+static void set_state_cb(struct extent_io_tree *tree,
+			 struct extent_state *state, unsigned *bits)
+{
+	if (tree->ops && tree->ops->set_bit_hook)
+		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
+}
+
+static void clear_state_cb(struct extent_io_tree *tree,
+			   struct extent_state *state, unsigned *bits)
+{
+	if (tree->ops && tree->ops->clear_bit_hook)
+		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+			   struct extent_state *state, unsigned *bits);
+
+/*
+ * insert an extent_state struct into the tree.  'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally.  This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+			struct extent_state *state, u64 start, u64 end,
+			struct rb_node ***p,
+			struct rb_node **parent,
+			unsigned *bits)
+{
+	struct rb_node *node;
+
+	if (end < start)
+		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
+		       end, start);
+	state->start = start;
+	state->end = end;
+
+	set_state_bits(tree, state, bits);
+
+	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
+	if (node) {
+		struct extent_state *found;
+		found = rb_entry(node, struct extent_state, rb_node);
+		printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
+		       "%llu %llu\n",
+		       found->start, found->end, start, end);
+		return -EEXIST;
+	}
+	merge_state(tree, state);
+	return 0;
+}
+
+static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+		     u64 split)
+{
+	if (tree->ops && tree->ops->split_extent_hook)
+		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
+}
+
+/*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half.  'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end].  After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+		       struct extent_state *prealloc, u64 split)
+{
+	struct rb_node *node;
+
+	split_cb(tree, orig, split);
+
+	prealloc->start = orig->start;
+	prealloc->end = split - 1;
+	prealloc->state = orig->state;
+	orig->start = split;
+
+	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+			   &prealloc->rb_node, NULL, NULL);
+	if (node) {
+		free_extent_state(prealloc);
+		return -EEXIST;
+	}
+	return 0;
+}
+
+static struct extent_state *next_state(struct extent_state *state)
+{
+	struct rb_node *next = rb_next(&state->rb_node);
+	if (next)
+		return rb_entry(next, struct extent_state, rb_node);
+	else
+		return NULL;
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up any one waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+					    struct extent_state *state,
+					    unsigned *bits, int wake)
+{
+	struct extent_state *next;
+	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+
+	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+		u64 range = state->end - state->start + 1;
+		WARN_ON(range > tree->dirty_bytes);
+		tree->dirty_bytes -= range;
+	}
+	clear_state_cb(tree, state, bits);
+	state->state &= ~bits_to_clear;
+	if (wake)
+		wake_up(&state->wq);
+	if (state->state == 0) {
+		next = next_state(state);
+		if (extent_state_in_tree(state)) {
+			rb_erase(&state->rb_node, &tree->state);
+			RB_CLEAR_NODE(&state->rb_node);
+			free_extent_state(state);
+		} else {
+			WARN_ON(1);
+		}
+	} else {
+		merge_state(tree, state);
+		next = next_state(state);
+	}
+	return next;
+}
+
+static struct extent_state *
+alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+	if (!prealloc)
+		prealloc = alloc_extent_state(GFP_ATOMIC);
+
+	return prealloc;
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+		    "Extent tree was modified by another "
+		    "thread while locked.");
+}
+
+/*
+ * clear some bits on a range in the tree.  This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     unsigned bits, int wake, int delete,
+		     struct extent_state **cached_state,
+		     gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *cached;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	u64 last_end;
+	int err;
+	int clear = 0;
+
+	btrfs_debug_check_extent_io_range(tree, start, end);
+
+	if (bits & EXTENT_DELALLOC)
+		bits |= EXTENT_NORESERVE;
+
+	if (delete)
+		bits |= ~EXTENT_CTLBITS;
+	bits |= EXTENT_FIRST_DELALLOC;
+
+	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		clear = 1;
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		/*
+		 * Don't care for allocation failure here because we might end
+		 * up not needing the pre-allocated extent state at all, which
+		 * is the case if we only have in the tree extent states that
+		 * cover our input range and don't cover too any other range.
+		 * If we end up needing a new extent state we allocate it later.
+		 */
+		prealloc = alloc_extent_state(mask);
+	}
+
+	spin_lock(&tree->lock);
+	if (cached_state) {
+		cached = *cached_state;
+
+		if (clear) {
+			*cached_state = NULL;
+			cached_state = NULL;
+		}
+
+		if (cached && extent_state_in_tree(cached) &&
+		    cached->start <= start && cached->end > start) {
+			if (clear)
+				atomic_dec(&cached->refs);
+			state = cached;
+			goto hit_next;
+		}
+		if (clear)
+			free_extent_state(cached);
+	}
+	/*
+	 * this search will find the extents that end after
+	 * our range starts
+	 */
+	node = tree_search(tree, start);
+	if (!node)
+		goto out;
+	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+	if (state->start > end)
+		goto out;
+	WARN_ON(state->end < start);
+	last_end = state->end;
+
+	/* the state doesn't have the wanted bits, go ahead */
+	if (!(state->state & bits)) {
+		state = next_state(state);
+		goto next;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 *  | state | or
+	 *  | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip
+	 * bits on second half.
+	 *
+	 * If the extent we found extends past our range, we
+	 * just split and search again.  It'll get split again
+	 * the next time though.
+	 *
+	 * If the extent we found is inside our range, we clear
+	 * the desired bit on it.
+	 */
+
+	if (state->start < start) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		BUG_ON(!prealloc);
+		err = split_state(tree, state, prealloc, start);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			state = clear_state_bit(tree, state, &bits, wake);
+			goto next;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and clear the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		BUG_ON(!prealloc);
+		err = split_state(tree, state, prealloc, end + 1);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
+		if (wake)
+			wake_up(&state->wq);
+
+		clear_state_bit(tree, prealloc, &bits, wake);
+
+		prealloc = NULL;
+		goto out;
+	}
+
+	state = clear_state_bit(tree, state, &bits, wake);
+next:
+	if (last_end == (u64)-1)
+		goto out;
+	start = last_end + 1;
+	if (start <= end && state && !need_resched())
+		goto hit_next;
+	goto search_again;
+
+out:
+	spin_unlock(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return 0;
+
+search_again:
+	if (start > end)
+		goto out;
+	spin_unlock(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+
+static void wait_on_state(struct extent_io_tree *tree,
+			  struct extent_state *state)
+		__releases(tree->lock)
+		__acquires(tree->lock)
+{
+	DEFINE_WAIT(wait);
+	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+	spin_unlock(&tree->lock);
+	schedule();
+	spin_lock(&tree->lock);
+	finish_wait(&state->wq, &wait);
+}
+
+/*
+ * waits for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+			    unsigned long bits)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+
+	btrfs_debug_check_extent_io_range(tree, start, end);
+
+	spin_lock(&tree->lock);
+again:
+	while (1) {
+		/*
+		 * this search will find all the extents that end after
+		 * our range starts
+		 */
+		node = tree_search(tree, start);
+process_node:
+		if (!node)
+			break;
+
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		if (state->start > end)
+			goto out;
+
+		if (state->state & bits) {
+			start = state->start;
+			atomic_inc(&state->refs);
+			wait_on_state(tree, state);
+			free_extent_state(state);
+			goto again;
+		}
+		start = state->end + 1;
+
+		if (start > end)
+			break;
+
+		if (!cond_resched_lock(&tree->lock)) {
+			node = rb_next(node);
+			goto process_node;
+		}
+	}
+out:
+	spin_unlock(&tree->lock);
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+			   struct extent_state *state,
+			   unsigned *bits)
+{
+	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+
+	set_state_cb(tree, state, bits);
+	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+		u64 range = state->end - state->start + 1;
+		tree->dirty_bytes += range;
+	}
+	state->state |= bits_to_set;
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+				 struct extent_state **cached_ptr,
+				 unsigned flags)
+{
+	if (cached_ptr && !(*cached_ptr)) {
+		if (!flags || (state->state & flags)) {
+			*cached_ptr = state;
+			atomic_inc(&state->refs);
+		}
+	}
+}
+
+static void cache_state(struct extent_state *state,
+			struct extent_state **cached_ptr)
+{
+	return cache_state_if_flags(state, cached_ptr,
+				    EXTENT_IOBITS | EXTENT_BOUNDARY);
+}
+
+/*
+ * set some bits on a range in the tree.  This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set.  The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive This takes the tree lock.
+ */
+
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		 unsigned bits, unsigned exclusive_bits,
+		 u64 *failed_start, struct extent_state **cached_state,
+		 gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	struct rb_node **p;
+	struct rb_node *parent;
+	int err = 0;
+	u64 last_start;
+	u64 last_end;
+
+	btrfs_debug_check_extent_io_range(tree, start, end);
+
+	bits |= EXTENT_FIRST_DELALLOC;
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		BUG_ON(!prealloc);
+	}
+
+	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->start <= start && state->end > start &&
+		    extent_state_in_tree(state)) {
+			node = &state->rb_node;
+			goto hit_next;
+		}
+	}
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search_for_insert(tree, start, &p, &parent);
+	if (!node) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		BUG_ON(!prealloc);
+		err = insert_state(tree, prealloc, start, end,
+				   &p, &parent, &bits);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
+		cache_state(prealloc, cached_state);
+		prealloc = NULL;
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		if (state->state & exclusive_bits) {
+			*failed_start = state->start;
+			err = -EEXIST;
+			goto out;
+		}
+
+		set_state_bits(tree, state, &bits);
+		cache_state(state, cached_state);
+		merge_state(tree, state);
+		if (last_end == (u64)-1)
+			goto out;
+		start = last_end + 1;
+		state = next_state(state);
+		if (start < end && state && state->start == start &&
+		    !need_resched())
+			goto hit_next;
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		if (state->state & exclusive_bits) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+
+		prealloc = alloc_extent_state_atomic(prealloc);
+		BUG_ON(!prealloc);
+		err = split_state(tree, state, prealloc, start);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			set_state_bits(tree, state, &bits);
+			cache_state(state, cached_state);
+			merge_state(tree, state);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
+			state = next_state(state);
+			if (start < end && state && state->start == start &&
+			    !need_resched())
+				goto hit_next;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start - 1;
+
+		prealloc = alloc_extent_state_atomic(prealloc);
+		BUG_ON(!prealloc);
+
+		/*
+		 * Avoid to free 'prealloc' if it can be merged with
+		 * the later extent.
+		 */
+		err = insert_state(tree, prealloc, start, this_end,
+				   NULL, NULL, &bits);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
+		cache_state(prealloc, cached_state);
+		prealloc = NULL;
+		start = this_end + 1;
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		if (state->state & exclusive_bits) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+
+		prealloc = alloc_extent_state_atomic(prealloc);
+		BUG_ON(!prealloc);
+		err = split_state(tree, state, prealloc, end + 1);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
+		set_state_bits(tree, prealloc, &bits);
+		cache_state(prealloc, cached_state);
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
+	goto search_again;
+
+out:
+	spin_unlock(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	spin_unlock(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   unsigned bits, u64 * failed_start,
+		   struct extent_state **cached_state, gfp_t mask)
+{
+	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+				cached_state, mask);
+}
+
+
+/**
+ * convert_extent_bit - convert all bits in a given range from one bit to
+ * 			another
+ * @tree:	the io tree to search
+ * @start:	the start offset in bytes
+ * @end:	the end offset in bytes (inclusive)
+ * @bits:	the bits to set in this range
+ * @clear_bits:	the bits to clear in this range
+ * @cached_state:	state that we're going to cache
+ * @mask:	the allocation mask
+ *
+ * This will go through and set bits for the given range.  If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits.  This is only meant to be used by things that are mergeable, ie
+ * converting from say DELALLOC to DIRTY.  This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		       unsigned bits, unsigned clear_bits,
+		       struct extent_state **cached_state, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	struct rb_node **p;
+	struct rb_node *parent;
+	int err = 0;
+	u64 last_start;
+	u64 last_end;
+	bool first_iteration = true;
+
+	btrfs_debug_check_extent_io_range(tree, start, end);
+
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		/*
+		 * Best effort, don't worry if extent state allocation fails
+		 * here for the first iteration. We might have a cached state
+		 * that matches exactly the target range, in which case no
+		 * extent state allocations are needed. We'll only know this
+		 * after locking the tree.
+		 */
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc && !first_iteration)
+			return -ENOMEM;
+	}
+
+	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->start <= start && state->end > start &&
+		    extent_state_in_tree(state)) {
+			node = &state->rb_node;
+			goto hit_next;
+		}
+	}
+
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search_for_insert(tree, start, &p, &parent);
+	if (!node) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = insert_state(tree, prealloc, start, end,
+				   &p, &parent, &bits);
+		if (err)
+			extent_io_tree_panic(tree, err);
+		cache_state(prealloc, cached_state);
+		prealloc = NULL;
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		set_state_bits(tree, state, &bits);
+		cache_state(state, cached_state);
+		state = clear_state_bit(tree, state, &clear_bits, 0);
+		if (last_end == (u64)-1)
+			goto out;
+		start = last_end + 1;
+		if (start < end && state && state->start == start &&
+		    !need_resched())
+			goto hit_next;
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, start);
+		if (err)
+			extent_io_tree_panic(tree, err);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			set_state_bits(tree, state, &bits);
+			cache_state(state, cached_state);
+			state = clear_state_bit(tree, state, &clear_bits, 0);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
+			if (start < end && state && state->start == start &&
+			    !need_resched())
+				goto hit_next;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start - 1;
+
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		/*
+		 * Avoid to free 'prealloc' if it can be merged with
+		 * the later extent.
+		 */
+		err = insert_state(tree, prealloc, start, this_end,
+				   NULL, NULL, &bits);
+		if (err)
+			extent_io_tree_panic(tree, err);
+		cache_state(prealloc, cached_state);
+		prealloc = NULL;
+		start = this_end + 1;
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = split_state(tree, state, prealloc, end + 1);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
+		set_state_bits(tree, prealloc, &bits);
+		cache_state(prealloc, cached_state);
+		clear_state_bit(tree, prealloc, &clear_bits, 0);
+		prealloc = NULL;
+		goto out;
+	}
+
+	goto search_again;
+
+out:
+	spin_unlock(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	spin_unlock(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	first_iteration = false;
+	goto again;
+}
+
+/* wrappers around set/clear extent bit */
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
+			      NULL, mask);
+}
+
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		    unsigned bits, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, bits, NULL,
+			      NULL, mask);
+}
+
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		      unsigned bits, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+}
+
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+			struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE,
+			      NULL, cached_state, mask);
+}
+
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+		      struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+			      NULL, cached_state, mask);
+}
+
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end,
+				EXTENT_DIRTY | EXTENT_DELALLOC |
+				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+}
+
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
+			      NULL, mask);
+}
+
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+			      cached_state, mask);
+}
+
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			  struct extent_state **cached_state, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+				cached_state, mask);
+}
+
+/*
+ * either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		     unsigned bits, struct extent_state **cached_state)
+{
+	int err;
+	u64 failed_start;
+
+	while (1) {
+		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+				       EXTENT_LOCKED, &failed_start,
+				       cached_state, GFP_NOFS);
+		if (err == -EEXIST) {
+			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+			start = failed_start;
+		} else
+			break;
+		WARN_ON(start > end);
+	}
+	return err;
+}
+
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	return lock_extent_bits(tree, start, end, 0, NULL);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	int err;
+	u64 failed_start;
+
+	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+			       &failed_start, NULL, GFP_NOFS);
+	if (err == -EEXIST) {
+		if (failed_start > start)
+			clear_extent_bit(tree, start, failed_start - 1,
+					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+		return 0;
+	}
+	return 1;
+}
+
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+				mask);
+}
+
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+				GFP_NOFS);
+}
+
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
+		clear_page_dirty_for_io(page);
+		page_cache_release(page);
+		index++;
+	}
+	return 0;
+}
+
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
+		__set_page_dirty_nobuffers(page);
+		account_page_redirty(page);
+		page_cache_release(page);
+		index++;
+	}
+	return 0;
+}
+
+/*
+ * helper function to set both pages and extents in the tree writeback
+ */
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
+		set_page_writeback(page);
+		page_cache_release(page);
+		index++;
+	}
+	return 0;
+}
+
+/* find the first state struct with 'bits' set after 'start', and
+ * return it.  tree->lock must be held.  NULL will returned if
+ * nothing was found after 'start'
+ */
+static struct extent_state *
+find_first_extent_bit_state(struct extent_io_tree *tree,
+			    u64 start, unsigned bits)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, start);
+	if (!node)
+		goto out;
+
+	while (1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->end >= start && (state->state & bits))
+			return state;
+
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	return NULL;
+}
+
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned. If found something, return 0.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, unsigned bits,
+			  struct extent_state **cached_state)
+{
+	struct extent_state *state;
+	struct rb_node *n;
+	int ret = 1;
+
+	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->end == start - 1 && extent_state_in_tree(state)) {
+			n = rb_next(&state->rb_node);
+			while (n) {
+				state = rb_entry(n, struct extent_state,
+						 rb_node);
+				if (state->state & bits)
+					goto got_it;
+				n = rb_next(n);
+			}
+			free_extent_state(*cached_state);
+			*cached_state = NULL;
+			goto out;
+		}
+		free_extent_state(*cached_state);
+		*cached_state = NULL;
+	}
+
+	state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+	if (state) {
+		cache_state_if_flags(state, cached_state, 0);
+		*start_ret = state->start;
+		*end_ret = state->end;
+		ret = 0;
+	}
+out:
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'.  start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+					u64 *start, u64 *end, u64 max_bytes,
+					struct extent_state **cached_state)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 cur_start = *start;
+	u64 found = 0;
+	u64 total_bytes = 0;
+
+	spin_lock(&tree->lock);
+
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, cur_start);
+	if (!node) {
+		if (!found)
+			*end = (u64)-1;
+		goto out;
+	}
+
+	while (1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (found && (state->start != cur_start ||
+			      (state->state & EXTENT_BOUNDARY))) {
+			goto out;
+		}
+		if (!(state->state & EXTENT_DELALLOC)) {
+			if (!found)
+				*end = state->end;
+			goto out;
+		}
+		if (!found) {
+			*start = state->start;
+			*cached_state = state;
+			atomic_inc(&state->refs);
+		}
+		found++;
+		*end = state->end;
+		cur_start = state->end + 1;
+		node = rb_next(node);
+		total_bytes += state->end - state->start + 1;
+		if (total_bytes >= max_bytes)
+			break;
+		if (!node)
+			break;
+	}
+out:
+	spin_unlock(&tree->lock);
+	return found;
+}
+
+static noinline void __unlock_for_delalloc(struct inode *inode,
+					   struct page *locked_page,
+					   u64 start, u64 end)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+
+	if (index == locked_page->index && end_index == index)
+		return;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+					struct page *locked_page,
+					u64 delalloc_start,
+					u64 delalloc_end)
+{
+	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long start_index = index;
+	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long pages_locked = 0;
+	struct page *pages[16];
+	unsigned long nrpages;
+	int ret;
+	int i;
+
+	/* the caller is responsible for locking the start index */
+	if (index == locked_page->index && index == end_index)
+		return 0;
+
+	/* skip the page at the start index */
+	nrpages = end_index - index + 1;
+	while (nrpages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long,
+				     nrpages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			ret = -EAGAIN;
+			goto done;
+		}
+		/* now we have an array of pages, lock them all */
+		for (i = 0; i < ret; i++) {
+			/*
+			 * the caller is taking responsibility for
+			 * locked_page
+			 */
+			if (pages[i] != locked_page) {
+				lock_page(pages[i]);
+				if (!PageDirty(pages[i]) ||
+				    pages[i]->mapping != inode->i_mapping) {
+					ret = -EAGAIN;
+					unlock_page(pages[i]);
+					page_cache_release(pages[i]);
+					goto done;
+				}
+			}
+			page_cache_release(pages[i]);
+			pages_locked++;
+		}
+		nrpages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	ret = 0;
+done:
+	if (ret && pages_locked) {
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start,
+			      ((u64)(start_index + pages_locked - 1)) <<
+			      PAGE_CACHE_SHIFT);
+	}
+	return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'.  start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+STATIC u64 find_lock_delalloc_range(struct inode *inode,
+				    struct extent_io_tree *tree,
+				    struct page *locked_page, u64 *start,
+				    u64 *end, u64 max_bytes)
+{
+	u64 delalloc_start;
+	u64 delalloc_end;
+	u64 found;
+	struct extent_state *cached_state = NULL;
+	int ret;
+	int loops = 0;
+
+again:
+	/* step one, find a bunch of delalloc bytes starting at start */
+	delalloc_start = *start;
+	delalloc_end = 0;
+	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+				    max_bytes, &cached_state);
+	if (!found || delalloc_end <= *start) {
+		*start = delalloc_start;
+		*end = delalloc_end;
+		free_extent_state(cached_state);
+		return 0;
+	}
+
+	/*
+	 * start comes from the offset of locked_page.  We have to lock
+	 * pages in order, so we can't process delalloc bytes before
+	 * locked_page
+	 */
+	if (delalloc_start < *start)
+		delalloc_start = *start;
+
+	/*
+	 * make sure to limit the number of pages we try to lock down
+	 */
+	if (delalloc_end + 1 - delalloc_start > max_bytes)
+		delalloc_end = delalloc_start + max_bytes - 1;
+
+	/* step two, lock all the pages after the page that has start */
+	ret = lock_delalloc_pages(inode, locked_page,
+				  delalloc_start, delalloc_end);
+	if (ret == -EAGAIN) {
+		/* some of the pages are gone, lets avoid looping by
+		 * shortening the size of the delalloc range we're searching
+		 */
+		free_extent_state(cached_state);
+		cached_state = NULL;
+		if (!loops) {
+			max_bytes = PAGE_CACHE_SIZE;
+			loops = 1;
+			goto again;
+		} else {
+			found = 0;
+			goto out_failed;
+		}
+	}
+	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
+
+	/* step three, lock the state bits for the whole range */
+	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
+
+	/* then test to make sure it is all still delalloc */
+	ret = test_range_bit(tree, delalloc_start, delalloc_end,
+			     EXTENT_DELALLOC, 1, cached_state);
+	if (!ret) {
+		unlock_extent_cached(tree, delalloc_start, delalloc_end,
+				     &cached_state, GFP_NOFS);
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start, delalloc_end);
+		cond_resched();
+		goto again;
+	}
+	free_extent_state(cached_state);
+	*start = delalloc_start;
+	*end = delalloc_end;
+out_failed:
+	return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+				 struct page *locked_page,
+				 unsigned clear_bits,
+				 unsigned long page_ops)
+{
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+	if (page_ops == 0)
+		return 0;
+
+	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+		mapping_set_error(inode->i_mapping, -EIO);
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long,
+				     nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+
+			if (page_ops & PAGE_SET_PRIVATE2)
+				SetPagePrivate2(pages[i]);
+
+			if (pages[i] == locked_page) {
+				page_cache_release(pages[i]);
+				continue;
+			}
+			if (page_ops & PAGE_CLEAR_DIRTY)
+				clear_page_dirty_for_io(pages[i]);
+			if (page_ops & PAGE_SET_WRITEBACK)
+				set_page_writeback(pages[i]);
+			if (page_ops & PAGE_SET_ERROR)
+				SetPageError(pages[i]);
+			if (page_ops & PAGE_END_WRITEBACK)
+				end_page_writeback(pages[i]);
+			if (page_ops & PAGE_UNLOCK)
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+
+/*
+ * count the number of bytes in the tree that have a given bit(s)
+ * set.  This can be fairly slow, except for EXTENT_DIRTY which is
+ * cached.  The total number found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+		     u64 *start, u64 search_end, u64 max_bytes,
+		     unsigned bits, int contig)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 cur_start = *start;
+	u64 total_bytes = 0;
+	u64 last = 0;
+	int found = 0;
+
+	if (WARN_ON(search_end <= cur_start))
+		return 0;
+
+	spin_lock(&tree->lock);
+	if (cur_start == 0 && bits == EXTENT_DIRTY) {
+		total_bytes = tree->dirty_bytes;
+		goto out;
+	}
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, cur_start);
+	if (!node)
+		goto out;
+
+	while (1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start > search_end)
+			break;
+		if (contig && found && state->start > last + 1)
+			break;
+		if (state->end >= cur_start && (state->state & bits) == bits) {
+			total_bytes += min(search_end, state->end) + 1 -
+				       max(cur_start, state->start);
+			if (total_bytes >= max_bytes)
+				break;
+			if (!found) {
+				*start = max(cur_start, state->start);
+				found = 1;
+			}
+			last = state->end;
+		} else if (contig && found) {
+			break;
+		}
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	spin_unlock(&tree->lock);
+	return total_bytes;
+}
+
+/*
+ * set the private field for a given byte offset in the tree.  If there isn't
+ * an extent_state there already, this does nothing.
+ */
+static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	int ret = 0;
+
+	spin_lock(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, start);
+	if (!node) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start != start) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state->private = private;
+out:
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	int ret = 0;
+
+	spin_lock(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, start);
+	if (!node) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start != start) {
+		ret = -ENOENT;
+		goto out;
+	}
+	*private = state->private;
+out:
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if every extent in the tree
+ * has the bits set.  Otherwise, 1 is returned if any bit in the
+ * range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   unsigned bits, int filled, struct extent_state *cached)
+{
+	struct extent_state *state = NULL;
+	struct rb_node *node;
+	int bitset = 0;
+
+	spin_lock(&tree->lock);
+	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+	    cached->end > start)
+		node = &cached->rb_node;
+	else
+		node = tree_search(tree, start);
+	while (node && start <= end) {
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		if (filled && state->start > start) {
+			bitset = 0;
+			break;
+		}
+
+		if (state->start > end)
+			break;
+
+		if (state->state & bits) {
+			bitset = 1;
+			if (!filled)
+				break;
+		} else if (filled) {
+			bitset = 0;
+			break;
+		}
+
+		if (state->end == (u64)-1)
+			break;
+
+		start = state->end + 1;
+		if (start > end)
+			break;
+		node = rb_next(node);
+		if (!node) {
+			if (filled)
+				bitset = 0;
+			break;
+		}
+	}
+	spin_unlock(&tree->lock);
+	return bitset;
+}
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date
+ */
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
+{
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
+		SetPageUptodate(page);
+}
+
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
+{
+	int ret;
+	int err = 0;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+	set_state_private(failure_tree, rec->start, 0);
+	ret = clear_extent_bits(failure_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+	if (ret)
+		err = ret;
+
+	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_DAMAGED, GFP_NOFS);
+	if (ret && !err)
+		err = ret;
+
+	kfree(rec);
+	return err;
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchronization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+		      struct page *page, unsigned int pg_offset, int mirror_num)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct bio *bio;
+	struct btrfs_device *dev;
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_bio *bbio = NULL;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	int ret;
+
+	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
+	BUG_ON(!mirror_num);
+
+	/* we can't repair anything in raid56 yet */
+	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+		return 0;
+
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return -EIO;
+	bio->bi_iter.bi_size = 0;
+	map_length = length;
+
+	ret = btrfs_map_block(fs_info, WRITE, logical,
+			      &map_length, &bbio, mirror_num);
+	if (ret) {
+		bio_put(bio);
+		return -EIO;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	sector = bbio->stripes[mirror_num-1].physical >> 9;
+	bio->bi_iter.bi_sector = sector;
+	dev = bbio->stripes[mirror_num-1].dev;
+	btrfs_put_bbio(bbio);
+	if (!dev || !dev->bdev || !dev->writeable) {
+		bio_put(bio);
+		return -EIO;
+	}
+	bio->bi_bdev = dev->bdev;
+	bio_add_page(bio, page, length, pg_offset);
+
+	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
+		/* try to remap that extent elsewhere? */
+		bio_put(bio);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		return -EIO;
+	}
+
+	printk_ratelimited_in_rcu(KERN_INFO
+				  "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+				  btrfs_ino(inode), start,
+				  rcu_str_deref(dev->name), sector);
+	bio_put(bio);
+	return 0;
+}
+
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+			 int mirror_num)
+{
+	u64 start = eb->start;
+	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+	int ret = 0;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = eb->pages[i];
+
+		ret = repair_io_failure(root->fs_info->btree_inode, start,
+					PAGE_CACHE_SIZE, start, p,
+					start - page_offset(p), mirror_num);
+		if (ret)
+			break;
+		start += PAGE_CACHE_SIZE;
+	}
+
+	return ret;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+		     unsigned int pg_offset)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failrec;
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct extent_state *state;
+	int num_copies;
+	int ret;
+
+	private = 0;
+	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+				(u64)-1, 1, EXTENT_DIRTY, 0);
+	if (!ret)
+		return 0;
+
+	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+				&private_failure);
+	if (ret)
+		return 0;
+
+	failrec = (struct io_failure_record *)(unsigned long) private_failure;
+	BUG_ON(!failrec->this_mirror);
+
+	if (failrec->in_validation) {
+		/* there was no real error, just free the record */
+		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+			 failrec->start);
+		goto out;
+	}
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		goto out;
+
+	spin_lock(&BTRFS_I(inode)->io_tree.lock);
+	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+					    failrec->start,
+					    EXTENT_LOCKED);
+	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+	if (state && state->start <= failrec->start &&
+	    state->end >= failrec->start + failrec->len - 1) {
+		num_copies = btrfs_num_copies(fs_info, failrec->logical,
+					      failrec->len);
+		if (num_copies > 1)  {
+			repair_io_failure(inode, start, failrec->len,
+					  failrec->logical, page,
+					  pg_offset, failrec->failed_mirror);
+		}
+	}
+
+out:
+	free_io_failure(inode, failrec);
+
+	return 0;
+}
+
+/*
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
+ */
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+{
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct io_failure_record *failrec;
+	struct extent_state *state, *next;
+
+	if (RB_EMPTY_ROOT(&failure_tree->state))
+		return;
+
+	spin_lock(&failure_tree->lock);
+	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
+	while (state) {
+		if (state->start > end)
+			break;
+
+		ASSERT(state->end <= end);
+
+		next = next_state(state);
+
+		failrec = (struct io_failure_record *)(unsigned long)state->private;
+		free_extent_state(state);
+		kfree(failrec);
+
+		state = next;
+	}
+	spin_unlock(&failure_tree->lock);
+}
+
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+				struct io_failure_record **failrec_ret)
+{
+	struct io_failure_record *failrec;
+	u64 private;
+	struct extent_map *em;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret;
+	u64 logical;
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->this_mirror = 0;
+		failrec->bio_flags = 0;
+		failrec->in_validation = 0;
+
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (!em) {
+			read_unlock(&em_tree->lock);
+			kfree(failrec);
+			return -EIO;
+		}
+
+		if (em->start > start || em->start + em->len <= start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		read_unlock(&em_tree->lock);
+		if (!em) {
+			kfree(failrec);
+			return -EIO;
+		}
+
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
+		}
+
+		pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
+			 logical, start, failrec->len);
+
+		failrec->logical = logical;
+		free_extent_map(em);
+
+		/* set the bits in the private failure tree */
+		ret = set_extent_bits(failure_tree, start, end,
+					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		if (ret >= 0)
+			ret = set_state_private(failure_tree, start,
+						(u64)(unsigned long)failrec);
+		/* set the bits in the inode's tree */
+		if (ret >= 0)
+			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+						GFP_NOFS);
+		if (ret < 0) {
+			kfree(failrec);
+			return ret;
+		}
+	} else {
+		failrec = (struct io_failure_record *)(unsigned long)private;
+		pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
+			 failrec->logical, failrec->start, failrec->len,
+			 failrec->in_validation);
+		/*
+		 * when data can be on disk more than twice, add to failrec here
+		 * (e.g. with a list for failed_mirror) to make
+		 * clean_io_failure() clean all those errors at once.
+		 */
+	}
+
+	*failrec_ret = failrec;
+
+	return 0;
+}
+
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+			   struct io_failure_record *failrec, int failed_mirror)
+{
+	int num_copies;
+
+	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+				      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
+		return 0;
+	}
+
+	/*
+	 * there are two premises:
+	 *	a) deliver good data to the caller
+	 *	b) correct the bad sectors on disk
+	 */
+	if (failed_bio->bi_vcnt > 1) {
+		/*
+		 * to fulfill b), we need to know the exact failing sectors, as
+		 * we don't want to rewrite any more than the failed ones. thus,
+		 * we need separate read requests for the failed bio
+		 *
+		 * if the following BUG_ON triggers, our validation request got
+		 * merged. we need separate requests for our algorithm to work.
+		 */
+		BUG_ON(failrec->in_validation);
+		failrec->in_validation = 1;
+		failrec->this_mirror = failed_mirror;
+	} else {
+		/*
+		 * we're ready to fulfill a) and b) alongside. get a good copy
+		 * of the failed sector and if we succeed, we have setup
+		 * everything for repair_io_failure to do the rest for us.
+		 */
+		if (failrec->in_validation) {
+			BUG_ON(failrec->this_mirror != failed_mirror);
+			failrec->in_validation = 0;
+			failrec->this_mirror = 0;
+		}
+		failrec->failed_mirror = failed_mirror;
+		failrec->this_mirror++;
+		if (failrec->this_mirror == failed_mirror)
+			failrec->this_mirror++;
+	}
+
+	if (failrec->this_mirror > num_copies) {
+		pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
+		return 0;
+	}
+
+	return 1;
+}
+
+
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+				    struct io_failure_record *failrec,
+				    struct page *page, int pg_offset, int icsum,
+				    bio_end_io_t *endio_func, void *data)
+{
+	struct bio *bio;
+	struct btrfs_io_bio *btrfs_failed_bio;
+	struct btrfs_io_bio *btrfs_bio;
+
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return NULL;
+
+	bio->bi_end_io = endio_func;
+	bio->bi_iter.bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	bio->bi_iter.bi_size = 0;
+	bio->bi_private = data;
+
+	btrfs_failed_bio = btrfs_io_bio(failed_bio);
+	if (btrfs_failed_bio->csum) {
+		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+		u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = btrfs_bio->csum_inline;
+		icsum *= csum_size;
+		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
+		       csum_size);
+	}
+
+	bio_add_page(bio, page, failrec->len, pg_offset);
+
+	return bio;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+			      struct page *page, u64 start, u64 end,
+			      int failed_mirror)
+{
+	struct io_failure_record *failrec;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct bio *bio;
+	int read_mode;
+	int ret;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+	if (ret)
+		return ret;
+
+	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
+	if (!ret) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	if (failed_bio->bi_vcnt > 1)
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	else
+		read_mode = READ_SYNC;
+
+	phy_offset >>= inode->i_sb->s_blocksize_bits;
+	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+				      start - page_offset(page),
+				      (int)phy_offset, failed_bio->bi_end_io,
+				      NULL);
+	if (!bio) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
+		 read_mode, failrec->this_mirror, failrec->in_validation);
+
+	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
+					 failrec->this_mirror,
+					 failrec->bio_flags, 0);
+	if (ret) {
+		free_io_failure(inode, failrec);
+		bio_put(bio);
+	}
+
+	return ret;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+{
+	int uptodate = (err == 0);
+	struct extent_io_tree *tree;
+	int ret = 0;
+
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+	if (tree->ops && tree->ops->writepage_end_io_hook) {
+		ret = tree->ops->writepage_end_io_hook(page, start,
+					       end, NULL, uptodate);
+		if (ret)
+			uptodate = 0;
+	}
+
+	if (!uptodate) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+		ret = ret < 0 ? ret : -EIO;
+		mapping_set_error(page->mapping, ret);
+	}
+	return 0;
+}
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_writepage(struct bio *bio, int err)
+{
+	struct bio_vec *bvec;
+	u64 start;
+	u64 end;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		/* We always issue full-page reads, but if some block
+		 * in a page fails to read, blk_update_request() will
+		 * advance bv_offset and adjust bv_len to compensate.
+		 * Print a warning for nonzero offsets, and an error
+		 * if they don't add up to a full page.  */
+		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
+			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+				   "partial page write in btrfs with offset %u and length %u",
+					bvec->bv_offset, bvec->bv_len);
+			else
+				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+				   "incomplete page write in btrfs with offset %u and "
+				   "length %u",
+					bvec->bv_offset, bvec->bv_len);
+		}
+
+		start = page_offset(page);
+		end = start + bvec->bv_offset + bvec->bv_len - 1;
+
+		if (end_extent_writepage(page, err, start, end))
+			continue;
+
+		end_page_writeback(page);
+	}
+
+	bio_put(bio);
+}
+
+static void
+endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
+			      int uptodate)
+{
+	struct extent_state *cached = NULL;
+	u64 end = start + len - 1;
+
+	if (uptodate && tree->track_uptodate)
+		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
+	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_readpage(struct bio *bio, int err)
+{
+	struct bio_vec *bvec;
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct extent_io_tree *tree;
+	u64 offset = 0;
+	u64 start;
+	u64 end;
+	u64 len;
+	u64 extent_start = 0;
+	u64 extent_len = 0;
+	int mirror;
+	int ret;
+	int i;
+
+	if (err)
+		uptodate = 0;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+		struct inode *inode = page->mapping->host;
+
+		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
+			 io_bio->mirror_num);
+		tree = &BTRFS_I(inode)->io_tree;
+
+		/* We always issue full-page reads, but if some block
+		 * in a page fails to read, blk_update_request() will
+		 * advance bv_offset and adjust bv_len to compensate.
+		 * Print a warning for nonzero offsets, and an error
+		 * if they don't add up to a full page.  */
+		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
+			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+				   "partial page read in btrfs with offset %u and length %u",
+					bvec->bv_offset, bvec->bv_len);
+			else
+				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+				   "incomplete page read in btrfs with offset %u and "
+				   "length %u",
+					bvec->bv_offset, bvec->bv_len);
+		}
+
+		start = page_offset(page);
+		end = start + bvec->bv_offset + bvec->bv_len - 1;
+		len = bvec->bv_len;
+
+		mirror = io_bio->mirror_num;
+		if (likely(uptodate && tree->ops &&
+			   tree->ops->readpage_end_io_hook)) {
+			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
+							      page, start, end,
+							      mirror);
+			if (ret)
+				uptodate = 0;
+			else
+				clean_io_failure(inode, start, page, 0);
+		}
+
+		if (likely(uptodate))
+			goto readpage_ok;
+
+		if (tree->ops && tree->ops->readpage_io_failed_hook) {
+			ret = tree->ops->readpage_io_failed_hook(page, mirror);
+			if (!ret && !err &&
+			    test_bit(BIO_UPTODATE, &bio->bi_flags))
+				uptodate = 1;
+		} else {
+			/*
+			 * The generic bio_readpage_error handles errors the
+			 * following way: If possible, new read requests are
+			 * created and submitted and will end up in
+			 * end_bio_extent_readpage as well (if we're lucky, not
+			 * in the !uptodate case). In that case it returns 0 and
+			 * we just go on with the next page in our bio. If it
+			 * can't handle the error it will return -EIO and we
+			 * remain responsible for that page.
+			 */
+			ret = bio_readpage_error(bio, offset, page, start, end,
+						 mirror);
+			if (ret == 0) {
+				uptodate =
+					test_bit(BIO_UPTODATE, &bio->bi_flags);
+				if (err)
+					uptodate = 0;
+				offset += len;
+				continue;
+			}
+		}
+readpage_ok:
+		if (likely(uptodate)) {
+			loff_t i_size = i_size_read(inode);
+			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+			unsigned off;
+
+			/* Zero out the end if this page straddles i_size */
+			off = i_size & (PAGE_CACHE_SIZE-1);
+			if (page->index == end_index && off)
+				zero_user_segment(page, off, PAGE_CACHE_SIZE);
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		unlock_page(page);
+		offset += len;
+
+		if (unlikely(!uptodate)) {
+			if (extent_len) {
+				endio_readpage_release_extent(tree,
+							      extent_start,
+							      extent_len, 1);
+				extent_start = 0;
+				extent_len = 0;
+			}
+			endio_readpage_release_extent(tree, start,
+						      end - start + 1, 0);
+		} else if (!extent_len) {
+			extent_start = start;
+			extent_len = end + 1 - start;
+		} else if (extent_start + extent_len == start) {
+			extent_len += end + 1 - start;
+		} else {
+			endio_readpage_release_extent(tree, extent_start,
+						      extent_len, uptodate);
+			extent_start = start;
+			extent_len = end + 1 - start;
+		}
+	}
+
+	if (extent_len)
+		endio_readpage_release_extent(tree, extent_start, extent_len,
+					      uptodate);
+	if (io_bio->end_io)
+		io_bio->end_io(io_bio, err);
+	bio_put(bio);
+}
+
+/*
+ * this allocates from the btrfs_bioset.  We're returning a bio right now
+ * but you can call btrfs_io_bio for the appropriate container_of magic
+ */
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+		gfp_t gfp_flags)
+{
+	struct btrfs_io_bio *btrfs_bio;
+	struct bio *bio;
+
+	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2)) {
+			bio = bio_alloc_bioset(gfp_flags,
+					       nr_vecs, btrfs_bioset);
+		}
+	}
+
+	if (bio) {
+		bio->bi_bdev = bdev;
+		bio->bi_iter.bi_sector = first_sector;
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
+	}
+	return bio;
+}
+
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+	struct btrfs_io_bio *btrfs_bio;
+	struct bio *new;
+
+	new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+	if (new) {
+		btrfs_bio = btrfs_io_bio(new);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
+	}
+	return new;
+}
+
+/* this also allocates from the btrfs_bioset */
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+	struct btrfs_io_bio *btrfs_bio;
+	struct bio *bio;
+
+	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+	if (bio) {
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
+	}
+	return bio;
+}
+
+
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+				       int mirror_num, unsigned long bio_flags)
+{
+	int ret = 0;
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page = bvec->bv_page;
+	struct extent_io_tree *tree = bio->bi_private;
+	u64 start;
+
+	start = page_offset(page) + bvec->bv_offset;
+
+	bio->bi_private = NULL;
+
+	bio_get(bio);
+
+	if (tree->ops && tree->ops->submit_bio_hook)
+		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+					   mirror_num, bio_flags, start);
+	else
+		btrfsic_submit_bio(rw, bio);
+
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+	bio_put(bio);
+	return ret;
+}
+
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
+		     unsigned long offset, size_t size, struct bio *bio,
+		     unsigned long bio_flags)
+{
+	int ret = 0;
+	if (tree->ops && tree->ops->merge_bio_hook)
+		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
+						bio_flags);
+	BUG_ON(ret < 0);
+	return ret;
+
+}
+
+static int submit_extent_page(int rw, struct extent_io_tree *tree,
+			      struct page *page, sector_t sector,
+			      size_t size, unsigned long offset,
+			      struct block_device *bdev,
+			      struct bio **bio_ret,
+			      unsigned long max_pages,
+			      bio_end_io_t end_io_func,
+			      int mirror_num,
+			      unsigned long prev_bio_flags,
+			      unsigned long bio_flags)
+{
+	int ret = 0;
+	struct bio *bio;
+	int nr;
+	int contig = 0;
+	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
+
+	if (bio_ret && *bio_ret) {
+		bio = *bio_ret;
+		if (old_compressed)
+			contig = bio->bi_iter.bi_sector == sector;
+		else
+			contig = bio_end_sector(bio) == sector;
+
+		if (prev_bio_flags != bio_flags || !contig ||
+		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
+		    bio_add_page(bio, page, page_size, offset) < page_size) {
+			ret = submit_one_bio(rw, bio, mirror_num,
+					     prev_bio_flags);
+			if (ret < 0) {
+				*bio_ret = NULL;
+				return ret;
+			}
+			bio = NULL;
+		} else {
+			return 0;
+		}
+	}
+	if (this_compressed)
+		nr = BIO_MAX_PAGES;
+	else
+		nr = bio_get_nr_vecs(bdev);
+
+	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+	if (!bio)
+		return -ENOMEM;
+
+	bio_add_page(bio, page, page_size, offset);
+	bio->bi_end_io = end_io_func;
+	bio->bi_private = tree;
+
+	if (bio_ret)
+		*bio_ret = bio;
+	else
+		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
+
+	return ret;
+}
+
+static void attach_extent_buffer_page(struct extent_buffer *eb,
+				      struct page *page)
+{
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		page_cache_get(page);
+		set_page_private(page, (unsigned long)eb);
+	} else {
+		WARN_ON(page->private != (unsigned long)eb);
+	}
+}
+
+void set_page_extent_mapped(struct page *page)
+{
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		page_cache_get(page);
+		set_page_private(page, EXTENT_PAGE_PRIVATE);
+	}
+}
+
+static struct extent_map *
+__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+		 u64 start, u64 len, get_extent_t *get_extent,
+		 struct extent_map **em_cached)
+{
+	struct extent_map *em;
+
+	if (em_cached && *em_cached) {
+		em = *em_cached;
+		if (extent_map_in_tree(em) && start >= em->start &&
+		    start < extent_map_end(em)) {
+			atomic_inc(&em->refs);
+			return em;
+		}
+
+		free_extent_map(em);
+		*em_cached = NULL;
+	}
+
+	em = get_extent(inode, page, pg_offset, start, len, 0);
+	if (em_cached && !IS_ERR_OR_NULL(em)) {
+		BUG_ON(*em_cached);
+		atomic_inc(&em->refs);
+		*em_cached = em;
+	}
+	return em;
+}
+/*
+ * basic readpage implementation.  Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
+ */
+static int __do_readpage(struct extent_io_tree *tree,
+			 struct page *page,
+			 get_extent_t *get_extent,
+			 struct extent_map **em_cached,
+			 struct bio **bio, int mirror_num,
+			 unsigned long *bio_flags, int rw)
+{
+	struct inode *inode = page->mapping->host;
+	u64 start = page_offset(page);
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 cur_end;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+	size_t pg_offset = 0;
+	size_t iosize;
+	size_t disk_io_size;
+	size_t blocksize = inode->i_sb->s_blocksize;
+	unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+
+	set_page_extent_mapped(page);
+
+	end = page_end;
+	if (!PageUptodate(page)) {
+		if (cleancache_get_page(page) == 0) {
+			BUG_ON(blocksize != PAGE_SIZE);
+			unlock_extent(tree, start, end);
+			goto out;
+		}
+	}
+
+	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+		char *userpage;
+		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+		if (zero_offset) {
+			iosize = PAGE_CACHE_SIZE - zero_offset;
+			userpage = kmap_atomic(page);
+			memset(userpage + zero_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage);
+		}
+	}
+	while (cur <= end) {
+		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
+		if (cur >= last_byte) {
+			char *userpage;
+			struct extent_state *cached = NULL;
+
+			iosize = PAGE_CACHE_SIZE - pg_offset;
+			userpage = kmap_atomic(page);
+			memset(userpage + pg_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    &cached, GFP_NOFS);
+			if (!parent_locked)
+				unlock_extent_cached(tree, cur,
+						     cur + iosize - 1,
+						     &cached, GFP_NOFS);
+			break;
+		}
+		em = __get_extent_map(inode, page, pg_offset, cur,
+				      end - cur + 1, get_extent, em_cached);
+		if (IS_ERR_OR_NULL(em)) {
+			SetPageError(page);
+			if (!parent_locked)
+				unlock_extent(tree, cur, end);
+			break;
+		}
+		extent_offset = cur - em->start;
+		BUG_ON(extent_map_end(em) <= cur);
+		BUG_ON(end < cur);
+
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			this_bio_flag |= EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&this_bio_flag,
+						 em->compress_type);
+		}
+
+		iosize = min(extent_map_end(em) - cur, end - cur + 1);
+		cur_end = min(extent_map_end(em) - 1, end);
+		iosize = ALIGN(iosize, blocksize);
+		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+			disk_io_size = em->block_len;
+			sector = em->block_start >> 9;
+		} else {
+			sector = (em->block_start + extent_offset) >> 9;
+			disk_io_size = iosize;
+		}
+		bdev = em->bdev;
+		block_start = em->block_start;
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			block_start = EXTENT_MAP_HOLE;
+		free_extent_map(em);
+		em = NULL;
+
+		/* we've found a hole, just zero and go on */
+		if (block_start == EXTENT_MAP_HOLE) {
+			char *userpage;
+			struct extent_state *cached = NULL;
+
+			userpage = kmap_atomic(page);
+			memset(userpage + pg_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage);
+
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    &cached, GFP_NOFS);
+			unlock_extent_cached(tree, cur, cur + iosize - 1,
+			                     &cached, GFP_NOFS);
+			cur = cur + iosize;
+			pg_offset += iosize;
+			continue;
+		}
+		/* the get_extent function already copied into the page */
+		if (test_range_bit(tree, cur, cur_end,
+				   EXTENT_UPTODATE, 1, NULL)) {
+			check_page_uptodate(tree, page);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
+			cur = cur + iosize;
+			pg_offset += iosize;
+			continue;
+		}
+		/* we have an inline extent but it didn't get marked up
+		 * to date.  Error out
+		 */
+		if (block_start == EXTENT_MAP_INLINE) {
+			SetPageError(page);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
+			cur = cur + iosize;
+			pg_offset += iosize;
+			continue;
+		}
+
+		pnr -= page->index;
+		ret = submit_extent_page(rw, tree, page,
+					 sector, disk_io_size, pg_offset,
+					 bdev, bio, pnr,
+					 end_bio_extent_readpage, mirror_num,
+					 *bio_flags,
+					 this_bio_flag);
+		if (!ret) {
+			nr++;
+			*bio_flags = this_bio_flag;
+		} else {
+			SetPageError(page);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
+		}
+		cur = cur + iosize;
+		pg_offset += iosize;
+	}
+out:
+	if (!nr) {
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return 0;
+}
+
+static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
+					     struct page *pages[], int nr_pages,
+					     u64 start, u64 end,
+					     get_extent_t *get_extent,
+					     struct extent_map **em_cached,
+					     struct bio **bio, int mirror_num,
+					     unsigned long *bio_flags, int rw)
+{
+	struct inode *inode;
+	struct btrfs_ordered_extent *ordered;
+	int index;
+
+	inode = pages[0]->mapping->host;
+	while (1) {
+		lock_extent(tree, start, end);
+		ordered = btrfs_lookup_ordered_range(inode, start,
+						     end - start + 1);
+		if (!ordered)
+			break;
+		unlock_extent(tree, start, end);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	for (index = 0; index < nr_pages; index++) {
+		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
+			      mirror_num, bio_flags, rw);
+		page_cache_release(pages[index]);
+	}
+}
+
+static void __extent_readpages(struct extent_io_tree *tree,
+			       struct page *pages[],
+			       int nr_pages, get_extent_t *get_extent,
+			       struct extent_map **em_cached,
+			       struct bio **bio, int mirror_num,
+			       unsigned long *bio_flags, int rw)
+{
+	u64 start = 0;
+	u64 end = 0;
+	u64 page_start;
+	int index;
+	int first_index = 0;
+
+	for (index = 0; index < nr_pages; index++) {
+		page_start = page_offset(pages[index]);
+		if (!end) {
+			start = page_start;
+			end = start + PAGE_CACHE_SIZE - 1;
+			first_index = index;
+		} else if (end + 1 == page_start) {
+			end += PAGE_CACHE_SIZE;
+		} else {
+			__do_contiguous_readpages(tree, &pages[first_index],
+						  index - first_index, start,
+						  end, get_extent, em_cached,
+						  bio, mirror_num, bio_flags,
+						  rw);
+			start = page_start;
+			end = start + PAGE_CACHE_SIZE - 1;
+			first_index = index;
+		}
+	}
+
+	if (end)
+		__do_contiguous_readpages(tree, &pages[first_index],
+					  index - first_index, start,
+					  end, get_extent, em_cached, bio,
+					  mirror_num, bio_flags, rw);
+}
+
+static int __extent_read_full_page(struct extent_io_tree *tree,
+				   struct page *page,
+				   get_extent_t *get_extent,
+				   struct bio **bio, int mirror_num,
+				   unsigned long *bio_flags, int rw)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	int ret;
+
+	while (1) {
+		lock_extent(tree, start, end);
+		ordered = btrfs_lookup_ordered_extent(inode, start);
+		if (!ordered)
+			break;
+		unlock_extent(tree, start, end);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
+			    bio_flags, rw);
+	return ret;
+}
+
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+			    get_extent_t *get_extent, int mirror_num)
+{
+	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
+	int ret;
+
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
+				      &bio_flags, READ);
+	if (bio)
+		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
+	return ret;
+}
+
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+				 get_extent_t *get_extent, int mirror_num)
+{
+	struct bio *bio = NULL;
+	unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
+	int ret;
+
+	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
+				      &bio_flags, READ);
+	if (bio)
+		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
+	return ret;
+}
+
+static noinline void update_nr_written(struct page *page,
+				      struct writeback_control *wbc,
+				      unsigned long nr_written)
+{
+	wbc->nr_to_write -= nr_written;
+	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+		page->mapping->writeback_index = page->index + nr_written;
+}
+
+/*
+ * helper for __extent_writepage, doing all of the delayed allocation setup.
+ *
+ * This returns 1 if our fill_delalloc function did all the work required
+ * to write the page (copy into inline extent).  In this case the IO has
+ * been started and the page is already unlocked.
+ *
+ * This returns 0 if all went well (page still locked)
+ * This returns < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int writepage_delalloc(struct inode *inode,
+			      struct page *page, struct writeback_control *wbc,
+			      struct extent_page_data *epd,
+			      u64 delalloc_start,
+			      unsigned long *nr_written)
+{
+	struct extent_io_tree *tree = epd->tree;
+	u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+	u64 nr_delalloc;
+	u64 delalloc_to_write = 0;
+	u64 delalloc_end = 0;
+	int ret;
+	int page_started = 0;
+
+	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+		return 0;
+
+	while (delalloc_end < page_end) {
+		nr_delalloc = find_lock_delalloc_range(inode, tree,
+					       page,
+					       &delalloc_start,
+					       &delalloc_end,
+					       BTRFS_MAX_EXTENT_SIZE);
+		if (nr_delalloc == 0) {
+			delalloc_start = delalloc_end + 1;
+			continue;
+		}
+		ret = tree->ops->fill_delalloc(inode, page,
+					       delalloc_start,
+					       delalloc_end,
+					       &page_started,
+					       nr_written);
+		/* File system has been set read-only */
+		if (ret) {
+			SetPageError(page);
+			/* fill_delalloc should be return < 0 for error
+			 * but just in case, we use > 0 here meaning the
+			 * IO is started, so we don't want to return > 0
+			 * unless things are going well.
+			 */
+			ret = ret < 0 ? ret : -EIO;
+			goto done;
+		}
+		/*
+		 * delalloc_end is already one less than the total
+		 * length, so we don't subtract one from
+		 * PAGE_CACHE_SIZE
+		 */
+		delalloc_to_write += (delalloc_end - delalloc_start +
+				      PAGE_CACHE_SIZE) >>
+				      PAGE_CACHE_SHIFT;
+		delalloc_start = delalloc_end + 1;
+	}
+	if (wbc->nr_to_write < delalloc_to_write) {
+		int thresh = 8192;
+
+		if (delalloc_to_write < thresh * 2)
+			thresh = delalloc_to_write;
+		wbc->nr_to_write = min_t(u64, delalloc_to_write,
+					 thresh);
+	}
+
+	/* did the fill delalloc function already unlock and start
+	 * the IO?
+	 */
+	if (page_started) {
+		/*
+		 * we've unlocked the page, so we can't update
+		 * the mapping's writeback index, just update
+		 * nr_to_write.
+		 */
+		wbc->nr_to_write -= *nr_written;
+		return 1;
+	}
+
+	ret = 0;
+
+done:
+	return ret;
+}
+
+/*
+ * helper for __extent_writepage.  This calls the writepage start hooks,
+ * and does the loop to map the page into extents and bios.
+ *
+ * We return 1 if the IO is started and the page is unlocked,
+ * 0 if all went well (page still locked)
+ * < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+				 struct page *page,
+				 struct writeback_control *wbc,
+				 struct extent_page_data *epd,
+				 loff_t i_size,
+				 unsigned long nr_written,
+				 int write_flags, int *nr_ret)
+{
+	struct extent_io_tree *tree = epd->tree;
+	u64 start = page_offset(page);
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 block_start;
+	u64 iosize;
+	sector_t sector;
+	struct extent_state *cached_state = NULL;
+	struct extent_map *em;
+	struct block_device *bdev;
+	size_t pg_offset = 0;
+	size_t blocksize;
+	int ret = 0;
+	int nr = 0;
+	bool compressed;
+
+	if (tree->ops && tree->ops->writepage_start_hook) {
+		ret = tree->ops->writepage_start_hook(page, start,
+						      page_end);
+		if (ret) {
+			/* Fixup worker will requeue */
+			if (ret == -EBUSY)
+				wbc->pages_skipped++;
+			else
+				redirty_page_for_writepage(wbc, page);
+
+			update_nr_written(page, wbc, nr_written);
+			unlock_page(page);
+			ret = 1;
+			goto done_unlocked;
+		}
+	}
+
+	/*
+	 * we don't want to touch the inode after unlocking the page,
+	 * so we update the mapping writeback index now
+	 */
+	update_nr_written(page, wbc, nr_written + 1);
+
+	end = page_end;
+	if (i_size <= start) {
+		if (tree->ops && tree->ops->writepage_end_io_hook)
+			tree->ops->writepage_end_io_hook(page, start,
+							 page_end, NULL, 1);
+		goto done;
+	}
+
+	blocksize = inode->i_sb->s_blocksize;
+
+	while (cur <= end) {
+		u64 em_end;
+		if (cur >= i_size) {
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, cur,
+							 page_end, NULL, 1);
+			break;
+		}
+		em = epd->get_extent(inode, page, pg_offset, cur,
+				     end - cur + 1, 1);
+		if (IS_ERR_OR_NULL(em)) {
+			SetPageError(page);
+			ret = PTR_ERR_OR_ZERO(em);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		em_end = extent_map_end(em);
+		BUG_ON(em_end <= cur);
+		BUG_ON(end < cur);
+		iosize = min(em_end - cur, end - cur + 1);
+		iosize = ALIGN(iosize, blocksize);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+		free_extent_map(em);
+		em = NULL;
+
+		/*
+		 * compressed and inline extents are written through other
+		 * paths in the FS
+		 */
+		if (compressed || block_start == EXTENT_MAP_HOLE ||
+		    block_start == EXTENT_MAP_INLINE) {
+			/*
+			 * end_io notification does not happen here for
+			 * compressed extents
+			 */
+			if (!compressed && tree->ops &&
+			    tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, cur,
+							 cur + iosize - 1,
+							 NULL, 1);
+			else if (compressed) {
+				/* we don't want to end_page_writeback on
+				 * a compressed extent.  this happens
+				 * elsewhere
+				 */
+				nr++;
+			}
+
+			cur += iosize;
+			pg_offset += iosize;
+			continue;
+		}
+
+		if (tree->ops && tree->ops->writepage_io_hook) {
+			ret = tree->ops->writepage_io_hook(page, cur,
+						cur + iosize - 1);
+		} else {
+			ret = 0;
+		}
+		if (ret) {
+			SetPageError(page);
+		} else {
+			unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
+
+			set_range_writeback(tree, cur, cur + iosize - 1);
+			if (!PageWriteback(page)) {
+				btrfs_err(BTRFS_I(inode)->root->fs_info,
+					   "page %lu not writeback, cur %llu end %llu",
+				       page->index, cur, end);
+			}
+
+			ret = submit_extent_page(write_flags, tree, page,
+						 sector, iosize, pg_offset,
+						 bdev, &epd->bio, max_nr,
+						 end_bio_extent_writepage,
+						 0, 0, 0);
+			if (ret)
+				SetPageError(page);
+		}
+		cur = cur + iosize;
+		pg_offset += iosize;
+		nr++;
+	}
+done:
+	*nr_ret = nr;
+
+done_unlocked:
+
+	/* drop our reference on any cached states */
+	free_extent_state(cached_state);
+	return ret;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage.  extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback.  Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+			      void *data)
+{
+	struct inode *inode = page->mapping->host;
+	struct extent_page_data *epd = data;
+	u64 start = page_offset(page);
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	int ret;
+	int nr = 0;
+	size_t pg_offset = 0;
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+	int write_flags;
+	unsigned long nr_written = 0;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		write_flags = WRITE_SYNC;
+	else
+		write_flags = WRITE;
+
+	trace___extent_writepage(page, inode, wbc);
+
+	WARN_ON(!PageLocked(page));
+
+	ClearPageError(page);
+
+	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+	if (page->index > end_index ||
+	   (page->index == end_index && !pg_offset)) {
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (page->index == end_index) {
+		char *userpage;
+
+		userpage = kmap_atomic(page);
+		memset(userpage + pg_offset, 0,
+		       PAGE_CACHE_SIZE - pg_offset);
+		kunmap_atomic(userpage);
+		flush_dcache_page(page);
+	}
+
+	pg_offset = 0;
+
+	set_page_extent_mapped(page);
+
+	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
+	if (ret == 1)
+		goto done_unlocked;
+	if (ret)
+		goto done;
+
+	ret = __extent_writepage_io(inode, page, wbc, epd,
+				    i_size, nr_written, write_flags, &nr);
+	if (ret == 1)
+		goto done_unlocked;
+
+done:
+	if (nr == 0) {
+		/* make sure the mapping tag for page dirty gets cleared */
+		set_page_writeback(page);
+		end_page_writeback(page);
+	}
+	if (PageError(page)) {
+		ret = ret < 0 ? ret : -EIO;
+		end_extent_writepage(page, ret, start, page_end);
+	}
+	unlock_page(page);
+	return ret;
+
+done_unlocked:
+	return 0;
+}
+
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
+		       TASK_UNINTERRUPTIBLE);
+}
+
+static noinline_for_stack int
+lock_extent_buffer_for_io(struct extent_buffer *eb,
+			  struct btrfs_fs_info *fs_info,
+			  struct extent_page_data *epd)
+{
+	unsigned long i, num_pages;
+	int flush = 0;
+	int ret = 0;
+
+	if (!btrfs_try_tree_write_lock(eb)) {
+		flush = 1;
+		flush_write_bio(epd);
+		btrfs_tree_lock(eb);
+	}
+
+	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+		btrfs_tree_unlock(eb);
+		if (!epd->sync_io)
+			return 0;
+		if (!flush) {
+			flush_write_bio(epd);
+			flush = 1;
+		}
+		while (1) {
+			wait_on_extent_buffer_writeback(eb);
+			btrfs_tree_lock(eb);
+			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+				break;
+			btrfs_tree_unlock(eb);
+		}
+	}
+
+	/*
+	 * We need to do this to prevent races in people who check if the eb is
+	 * under IO since we can end up having no IO bits set for a short period
+	 * of time.
+	 */
+	spin_lock(&eb->refs_lock);
+	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+		spin_unlock(&eb->refs_lock);
+		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
+				     -eb->len,
+				     fs_info->dirty_metadata_batch);
+		ret = 1;
+	} else {
+		spin_unlock(&eb->refs_lock);
+	}
+
+	btrfs_tree_unlock(eb);
+
+	if (!ret)
+		return ret;
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = eb->pages[i];
+
+		if (!trylock_page(p)) {
+			if (!flush) {
+				flush_write_bio(epd);
+				flush = 1;
+			}
+			lock_page(p);
+		}
+	}
+
+	return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+	smp_mb__after_atomic();
+	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void set_btree_ioerr(struct page *page)
+{
+	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
+
+	SetPageError(page);
+	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+		return;
+
+	/*
+	 * If writeback for a btree extent that doesn't belong to a log tree
+	 * failed, increment the counter transaction->eb_write_errors.
+	 * We do this because while the transaction is running and before it's
+	 * committing (when we call filemap_fdata[write|wait]_range against
+	 * the btree inode), we might have
+	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+	 * returns an error or an error happens during writeback, when we're
+	 * committing the transaction we wouldn't know about it, since the pages
+	 * can be no longer dirty nor marked anymore for writeback (if a
+	 * subsequent modification to the extent buffer didn't happen before the
+	 * transaction commit), which makes filemap_fdata[write|wait]_range not
+	 * able to find the pages tagged with SetPageError at transaction
+	 * commit time. So if this happens we must abort the transaction,
+	 * otherwise we commit a super block with btree roots that point to
+	 * btree nodes/leafs whose content on disk is invalid - either garbage
+	 * or the content of some node/leaf from a past generation that got
+	 * cowed or deleted and is no longer valid.
+	 *
+	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+	 * not be enough - we need to distinguish between log tree extents vs
+	 * non-log tree extents, and the next filemap_fdatawait_range() call
+	 * will catch and clear such errors in the mapping - and that call might
+	 * be from a log sync and not from a transaction commit. Also, checking
+	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+	 * not done and would not be reliable - the eb might have been released
+	 * from memory and reading it back again means that flag would not be
+	 * set (since it's a runtime flag, not persisted on disk).
+	 *
+	 * Using the flags below in the btree inode also makes us achieve the
+	 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+	 * writeback for all dirty pages and before filemap_fdatawait_range()
+	 * is called, the writeback for all dirty pages had already finished
+	 * with errors - because we were not using AS_EIO/AS_ENOSPC,
+	 * filemap_fdatawait_range() would return success, as it could not know
+	 * that writeback errors happened (the pages were no longer tagged for
+	 * writeback).
+	 */
+	switch (eb->log_index) {
+	case -1:
+		set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+		break;
+	case 0:
+		set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+		break;
+	case 1:
+		set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+		break;
+	default:
+		BUG(); /* unexpected, logic error */
+	}
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+	struct bio_vec *bvec;
+	struct extent_buffer *eb;
+	int i, done;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		eb = (struct extent_buffer *)page->private;
+		BUG_ON(!eb);
+		done = atomic_dec_and_test(&eb->io_pages);
+
+		if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+			ClearPageUptodate(page);
+			set_btree_ioerr(page);
+		}
+
+		end_page_writeback(page);
+
+		if (!done)
+			continue;
+
+		end_extent_buffer_writeback(eb);
+	}
+
+	bio_put(bio);
+}
+
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
+			struct btrfs_fs_info *fs_info,
+			struct writeback_control *wbc,
+			struct extent_page_data *epd)
+{
+	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+	u64 offset = eb->start;
+	unsigned long i, num_pages;
+	unsigned long bio_flags = 0;
+	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
+	int ret = 0;
+
+	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
+	num_pages = num_extent_pages(eb->start, eb->len);
+	atomic_set(&eb->io_pages, num_pages);
+	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+		bio_flags = EXTENT_BIO_TREE_LOG;
+
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = eb->pages[i];
+
+		clear_page_dirty_for_io(p);
+		set_page_writeback(p);
+		ret = submit_extent_page(rw, tree, p, offset >> 9,
+					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+					 -1, end_bio_extent_buffer_writepage,
+					 0, epd->bio_flags, bio_flags);
+		epd->bio_flags = bio_flags;
+		if (ret) {
+			set_btree_ioerr(p);
+			end_page_writeback(p);
+			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+				end_extent_buffer_writeback(eb);
+			ret = -EIO;
+			break;
+		}
+		offset += PAGE_CACHE_SIZE;
+		update_nr_written(p, wbc, 1);
+		unlock_page(p);
+	}
+
+	if (unlikely(ret)) {
+		for (; i < num_pages; i++) {
+			struct page *p = eb->pages[i];
+			clear_page_dirty_for_io(p);
+			unlock_page(p);
+		}
+	}
+
+	return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+				   struct writeback_control *wbc)
+{
+	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+	struct extent_buffer *eb, *prev_eb = NULL;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
+	};
+	int ret = 0;
+	int done = 0;
+	int nr_to_write_done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int tag;
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		scanned = 1;
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
+	while (!done && !nr_to_write_done && (index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (!PagePrivate(page))
+				continue;
+
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				break;
+			}
+
+			spin_lock(&mapping->private_lock);
+			if (!PagePrivate(page)) {
+				spin_unlock(&mapping->private_lock);
+				continue;
+			}
+
+			eb = (struct extent_buffer *)page->private;
+
+			/*
+			 * Shouldn't happen and normally this would be a BUG_ON
+			 * but no sense in crashing the users box for something
+			 * we can survive anyway.
+			 */
+			if (WARN_ON(!eb)) {
+				spin_unlock(&mapping->private_lock);
+				continue;
+			}
+
+			if (eb == prev_eb) {
+				spin_unlock(&mapping->private_lock);
+				continue;
+			}
+
+			ret = atomic_inc_not_zero(&eb->refs);
+			spin_unlock(&mapping->private_lock);
+			if (!ret)
+				continue;
+
+			prev_eb = eb;
+			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+			if (!ret) {
+				free_extent_buffer(eb);
+				continue;
+			}
+
+			ret = write_one_eb(eb, fs_info, wbc, &epd);
+			if (ret) {
+				done = 1;
+				free_extent_buffer(eb);
+				break;
+			}
+			free_extent_buffer(eb);
+
+			/*
+			 * the filesystem may choose to bump up nr_to_write.
+			 * We have to make sure to honor the new nr_to_write
+			 * at any time
+			 */
+			nr_to_write_done = wbc->nr_to_write <= 0;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	flush_write_bio(&epd);
+	return ret;
+}
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+static int extent_write_cache_pages(struct extent_io_tree *tree,
+			     struct address_space *mapping,
+			     struct writeback_control *wbc,
+			     writepage_t writepage, void *data,
+			     void (*flush_fn)(void *))
+{
+	struct inode *inode = mapping->host;
+	int ret = 0;
+	int done = 0;
+	int err = 0;
+	int nr_to_write_done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int tag;
+
+	/*
+	 * We have to hold onto the inode so that ordered extents can do their
+	 * work when the IO finishes.  The alternative to this is failing to add
+	 * an ordered extent if the igrab() fails there and that is a huge pain
+	 * to deal with, so instead just hold onto the inode throughout the
+	 * writepages operation.  If it fails here we are freeing up the inode
+	 * anyway and we'd rather not waste our time writing out stuff that is
+	 * going to be truncated anyway.
+	 */
+	if (!igrab(inode))
+		return 0;
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		scanned = 1;
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
+	while (!done && !nr_to_write_done && (index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point we hold neither mapping->tree_lock nor
+			 * lock on the page itself: the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or even
+			 * swizzled back from swapper_space to tmpfs file
+			 * mapping
+			 */
+			if (!trylock_page(page)) {
+				flush_fn(data);
+				lock_page(page);
+			}
+
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				unlock_page(page);
+				continue;
+			}
+
+			if (wbc->sync_mode != WB_SYNC_NONE) {
+				if (PageWriteback(page))
+					flush_fn(data);
+				wait_on_page_writeback(page);
+			}
+
+			if (PageWriteback(page) ||
+			    !clear_page_dirty_for_io(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			ret = (*writepage)(page, wbc, data);
+
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+				unlock_page(page);
+				ret = 0;
+			}
+			if (!err && ret < 0)
+				err = ret;
+
+			/*
+			 * the filesystem may choose to bump up nr_to_write.
+			 * We have to make sure to honor the new nr_to_write
+			 * at any time
+			 */
+			nr_to_write_done = wbc->nr_to_write <= 0;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done && !err) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	btrfs_add_delayed_iput(inode);
+	return err;
+}
+
+static void flush_epd_write_bio(struct extent_page_data *epd)
+{
+	if (epd->bio) {
+		int rw = WRITE;
+		int ret;
+
+		if (epd->sync_io)
+			rw = WRITE_SYNC;
+
+		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
+		BUG_ON(ret < 0); /* -ENOMEM */
+		epd->bio = NULL;
+	}
+}
+
+static noinline void flush_write_bio(void *data)
+{
+	struct extent_page_data *epd = data;
+	flush_epd_write_bio(epd);
+}
+
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc)
+{
+	int ret;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
+	};
+
+	ret = __extent_writepage(page, wbc, &epd);
+
+	flush_epd_write_bio(&epd);
+	return ret;
+}
+
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+			      u64 start, u64 end, get_extent_t *get_extent,
+			      int mode)
+{
+	int ret = 0;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
+		PAGE_CACHE_SHIFT;
+
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+		.extent_locked = 1,
+		.sync_io = mode == WB_SYNC_ALL,
+		.bio_flags = 0,
+	};
+	struct writeback_control wbc_writepages = {
+		.sync_mode	= mode,
+		.nr_to_write	= nr_pages * 2,
+		.range_start	= start,
+		.range_end	= end + 1,
+	};
+
+	while (start <= end) {
+		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+		if (clear_page_dirty_for_io(page))
+			ret = __extent_writepage(page, &wbc_writepages, &epd);
+		else {
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, start,
+						 start + PAGE_CACHE_SIZE - 1,
+						 NULL, 1);
+			unlock_page(page);
+		}
+		page_cache_release(page);
+		start += PAGE_CACHE_SIZE;
+	}
+
+	flush_epd_write_bio(&epd);
+	return ret;
+}
+
+int extent_writepages(struct extent_io_tree *tree,
+		      struct address_space *mapping,
+		      get_extent_t *get_extent,
+		      struct writeback_control *wbc)
+{
+	int ret = 0;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
+	};
+
+	ret = extent_write_cache_pages(tree, mapping, wbc,
+				       __extent_writepage, &epd,
+				       flush_write_bio);
+	flush_epd_write_bio(&epd);
+	return ret;
+}
+
+int extent_readpages(struct extent_io_tree *tree,
+		     struct address_space *mapping,
+		     struct list_head *pages, unsigned nr_pages,
+		     get_extent_t get_extent)
+{
+	struct bio *bio = NULL;
+	unsigned page_idx;
+	unsigned long bio_flags = 0;
+	struct page *pagepool[16];
+	struct page *page;
+	struct extent_map *em_cached = NULL;
+	int nr = 0;
+
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		page = list_entry(pages->prev, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+		if (add_to_page_cache_lru(page, mapping,
+					page->index, GFP_NOFS)) {
+			page_cache_release(page);
+			continue;
+		}
+
+		pagepool[nr++] = page;
+		if (nr < ARRAY_SIZE(pagepool))
+			continue;
+		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+				   &bio, 0, &bio_flags, READ);
+		nr = 0;
+	}
+	if (nr)
+		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+				   &bio, 0, &bio_flags, READ);
+
+	if (em_cached)
+		free_extent_map(em_cached);
+
+	BUG_ON(!list_empty(pages));
+	if (bio)
+		return submit_one_bio(READ, bio, 0, bio_flags);
+	return 0;
+}
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then deletes any extent state
+ * records from the tree
+ */
+int extent_invalidatepage(struct extent_io_tree *tree,
+			  struct page *page, unsigned long offset)
+{
+	struct extent_state *cached_state = NULL;
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+
+	start += ALIGN(offset, blocksize);
+	if (start > end)
+		return 0;
+
+	lock_extent_bits(tree, start, end, 0, &cached_state);
+	wait_on_page_writeback(page);
+	clear_extent_bit(tree, start, end,
+			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+			 EXTENT_DO_ACCOUNTING,
+			 1, 1, &cached_state, GFP_NOFS);
+	return 0;
+}
+
+/*
+ * a helper for releasepage, this tests for areas of the page that
+ * are locked or under IO and drops the related state bits if it is safe
+ * to drop the page.
+ */
+static int try_release_extent_state(struct extent_map_tree *map,
+				    struct extent_io_tree *tree,
+				    struct page *page, gfp_t mask)
+{
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	int ret = 1;
+
+	if (test_range_bit(tree, start, end,
+			   EXTENT_IOBITS, 0, NULL))
+		ret = 0;
+	else {
+		if ((mask & GFP_NOFS) == GFP_NOFS)
+			mask = GFP_NOFS;
+		/*
+		 * at this point we can safely clear everything except the
+		 * locked bit and the nodatasum bit
+		 */
+		ret = clear_extent_bit(tree, start, end,
+				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
+				 0, 0, NULL, mask);
+
+		/* if clear_extent_bit failed for enomem reasons,
+		 * we can't allow the release to continue.
+		 */
+		if (ret < 0)
+			ret = 0;
+		else
+			ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * a helper for releasepage.  As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ */
+int try_release_extent_mapping(struct extent_map_tree *map,
+			       struct extent_io_tree *tree, struct page *page,
+			       gfp_t mask)
+{
+	struct extent_map *em;
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+
+	if ((mask & __GFP_WAIT) &&
+	    page->mapping->host->i_size > 16 * 1024 * 1024) {
+		u64 len;
+		while (start <= end) {
+			len = end - start + 1;
+			write_lock(&map->lock);
+			em = lookup_extent_mapping(map, start, len);
+			if (!em) {
+				write_unlock(&map->lock);
+				break;
+			}
+			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+			    em->start != start) {
+				write_unlock(&map->lock);
+				free_extent_map(em);
+				break;
+			}
+			if (!test_range_bit(tree, em->start,
+					    extent_map_end(em) - 1,
+					    EXTENT_LOCKED | EXTENT_WRITEBACK,
+					    0, NULL)) {
+				remove_extent_mapping(map, em);
+				/* once for the rb tree */
+				free_extent_map(em);
+			}
+			start = extent_map_end(em);
+			write_unlock(&map->lock);
+
+			/* once for us */
+			free_extent_map(em);
+		}
+	}
+	return try_release_extent_state(map, tree, page, mask);
+}
+
+/*
+ * helper function for fiemap, which doesn't want to see any holes.
+ * This maps until we find something past 'last'
+ */
+static struct extent_map *get_extent_skip_holes(struct inode *inode,
+						u64 offset,
+						u64 last,
+						get_extent_t *get_extent)
+{
+	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
+	struct extent_map *em;
+	u64 len;
+
+	if (offset >= last)
+		return NULL;
+
+	while (1) {
+		len = last - offset;
+		if (len == 0)
+			break;
+		len = ALIGN(len, sectorsize);
+		em = get_extent(inode, NULL, 0, offset, len, 0);
+		if (IS_ERR_OR_NULL(em))
+			return em;
+
+		/* if this isn't a hole return it */
+		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
+		    em->block_start != EXTENT_MAP_HOLE) {
+			return em;
+		}
+
+		/* this is a hole, advance to the next extent */
+		offset = extent_map_end(em);
+		free_extent_map(em);
+		if (offset >= last)
+			break;
+	}
+	return NULL;
+}
+
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len, get_extent_t *get_extent)
+{
+	int ret = 0;
+	u64 off = start;
+	u64 max = start + len;
+	u32 flags = 0;
+	u32 found_type;
+	u64 last;
+	u64 last_for_get_extent = 0;
+	u64 disko = 0;
+	u64 isize = i_size_read(inode);
+	struct btrfs_key found_key;
+	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int end = 0;
+	u64 em_start = 0;
+	u64 em_len = 0;
+	u64 em_end = 0;
+
+	if (len == 0)
+		return -EINVAL;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->leave_spinning = 1;
+
+	start = round_down(start, BTRFS_I(inode)->root->sectorsize);
+	len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
+
+	/*
+	 * lookup the last file extent.  We're not using i_size here
+	 * because there might be preallocation past i_size
+	 */
+	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+				       0);
+	if (ret < 0) {
+		btrfs_free_path(path);
+		return ret;
+	}
+	WARN_ON(!ret);
+	path->slots[0]--;
+	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+	found_type = found_key.type;
+
+	/* No extents, but there might be delalloc bits */
+	if (found_key.objectid != btrfs_ino(inode) ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		/* have to trust i_size as the end */
+		last = (u64)-1;
+		last_for_get_extent = isize;
+	} else {
+		/*
+		 * remember the start of the last extent.  There are a
+		 * bunch of different factors that go into the length of the
+		 * extent, so its much less complex to remember where it started
+		 */
+		last = found_key.offset;
+		last_for_get_extent = last + 1;
+	}
+	btrfs_release_path(path);
+
+	/*
+	 * we might have some extents allocated but more delalloc past those
+	 * extents.  so, we trust isize unless the start of the last extent is
+	 * beyond isize
+	 */
+	if (last < isize) {
+		last = (u64)-1;
+		last_for_get_extent = isize;
+	}
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+			 &cached_state);
+
+	em = get_extent_skip_holes(inode, start, last_for_get_extent,
+				   get_extent);
+	if (!em)
+		goto out;
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+
+	while (!end) {
+		u64 offset_in_extent = 0;
+
+		/* break if the extent we found is outside the range */
+		if (em->start >= max || extent_map_end(em) < off)
+			break;
+
+		/*
+		 * get_extent may return an extent that starts before our
+		 * requested range.  We have to make sure the ranges
+		 * we return to fiemap always move forward and don't
+		 * overlap, so adjust the offsets here
+		 */
+		em_start = max(em->start, off);
+
+		/*
+		 * record the offset from the start of the extent
+		 * for adjusting the disk offset below.  Only do this if the
+		 * extent isn't compressed since our in ram offset may be past
+		 * what we have actually allocated on disk.
+		 */
+		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			offset_in_extent = em_start - em->start;
+		em_end = extent_map_end(em);
+		em_len = em_end - em_start;
+		disko = 0;
+		flags = 0;
+
+		/*
+		 * bump off for our next call to get_extent
+		 */
+		off = extent_map_end(em);
+		if (off >= max)
+			end = 1;
+
+		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
+			end = 1;
+			flags |= FIEMAP_EXTENT_LAST;
+		} else if (em->block_start == EXTENT_MAP_INLINE) {
+			flags |= (FIEMAP_EXTENT_DATA_INLINE |
+				  FIEMAP_EXTENT_NOT_ALIGNED);
+		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
+		} else if (fieinfo->fi_extents_max) {
+			u64 bytenr = em->block_start -
+				(em->start - em->orig_start);
+
+			disko = em->block_start + offset_in_extent;
+
+			/*
+			 * As btrfs supports shared space, this information
+			 * can be exported to userspace tools via
+			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
+			 * then we're just getting a count and we can skip the
+			 * lookup stuff.
+			 */
+			ret = btrfs_check_shared(NULL, root->fs_info,
+						 root->objectid,
+						 btrfs_ino(inode), bytenr);
+			if (ret < 0)
+				goto out_free;
+			if (ret)
+				flags |= FIEMAP_EXTENT_SHARED;
+			ret = 0;
+		}
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			flags |= FIEMAP_EXTENT_ENCODED;
+
+		free_extent_map(em);
+		em = NULL;
+		if ((em_start >= last) || em_len == (u64)-1 ||
+		   (last == (u64)-1 && isize <= em_end)) {
+			flags |= FIEMAP_EXTENT_LAST;
+			end = 1;
+		}
+
+		/* now scan forward to see if this is really the last extent. */
+		em = get_extent_skip_holes(inode, off, last_for_get_extent,
+					   get_extent);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+		if (!em) {
+			flags |= FIEMAP_EXTENT_LAST;
+			end = 1;
+		}
+		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+					      em_len, flags);
+		if (ret) {
+			if (ret == 1)
+				ret = 0;
+			goto out_free;
+		}
+	}
+out_free:
+	free_extent_map(em);
+out:
+	btrfs_free_path(path);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+			     &cached_state, GFP_NOFS);
+	return ret;
+}
+
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+	btrfs_leak_debug_del(&eb->leak_list);
+	kmem_cache_free(extent_buffer_cache, eb);
+}
+
+int extent_buffer_under_io(struct extent_buffer *eb)
+{
+	return (atomic_read(&eb->io_pages) ||
+		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+}
+
+/*
+ * Helper for releasing extent buffer page.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
+{
+	unsigned long index;
+	struct page *page;
+	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+	BUG_ON(extent_buffer_under_io(eb));
+
+	index = num_extent_pages(eb->start, eb->len);
+	if (index == 0)
+		return;
+
+	do {
+		index--;
+		page = eb->pages[index];
+		if (!page)
+			continue;
+		if (mapped)
+			spin_lock(&page->mapping->private_lock);
+		/*
+		 * We do this since we'll remove the pages after we've
+		 * removed the eb from the radix tree, so we could race
+		 * and have this page now attached to the new eb.  So
+		 * only clear page_private if it's still connected to
+		 * this eb.
+		 */
+		if (PagePrivate(page) &&
+		    page->private == (unsigned long)eb) {
+			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+			BUG_ON(PageDirty(page));
+			BUG_ON(PageWriteback(page));
+			/*
+			 * We need to make sure we haven't be attached
+			 * to a new eb.
+			 */
+			ClearPagePrivate(page);
+			set_page_private(page, 0);
+			/* One for the page private */
+			page_cache_release(page);
+		}
+
+		if (mapped)
+			spin_unlock(&page->mapping->private_lock);
+
+		/* One for when we alloced the page */
+		page_cache_release(page);
+	} while (index != 0);
+}
+
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+	btrfs_release_extent_buffer_page(eb);
+	__free_extent_buffer(eb);
+}
+
+static struct extent_buffer *
+__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
+		      unsigned long len)
+{
+	struct extent_buffer *eb = NULL;
+
+	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
+	if (eb == NULL)
+		return NULL;
+	eb->start = start;
+	eb->len = len;
+	eb->fs_info = fs_info;
+	eb->bflags = 0;
+	rwlock_init(&eb->lock);
+	atomic_set(&eb->write_locks, 0);
+	atomic_set(&eb->read_locks, 0);
+	atomic_set(&eb->blocking_readers, 0);
+	atomic_set(&eb->blocking_writers, 0);
+	atomic_set(&eb->spinning_readers, 0);
+	atomic_set(&eb->spinning_writers, 0);
+	eb->lock_nested = 0;
+	init_waitqueue_head(&eb->write_lock_wq);
+	init_waitqueue_head(&eb->read_lock_wq);
+
+	btrfs_leak_debug_add(&eb->leak_list, &buffers);
+
+	spin_lock_init(&eb->refs_lock);
+	atomic_set(&eb->refs, 1);
+	atomic_set(&eb->io_pages, 0);
+
+	/*
+	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
+	 */
+	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
+		> MAX_INLINE_EXTENT_BUFFER_SIZE);
+	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
+
+	return eb;
+}
+
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+{
+	unsigned long i;
+	struct page *p;
+	struct extent_buffer *new;
+	unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+	if (new == NULL)
+		return NULL;
+
+	for (i = 0; i < num_pages; i++) {
+		p = alloc_page(GFP_NOFS);
+		if (!p) {
+			btrfs_release_extent_buffer(new);
+			return NULL;
+		}
+		attach_extent_buffer_page(new, p);
+		WARN_ON(PageDirty(p));
+		SetPageUptodate(p);
+		new->pages[i] = p;
+	}
+
+	copy_extent_buffer(new, src, 0, 0, src->len);
+	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+	return new;
+}
+
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+						u64 start)
+{
+	struct extent_buffer *eb;
+	unsigned long len;
+	unsigned long num_pages;
+	unsigned long i;
+
+	if (!fs_info) {
+		/*
+		 * Called only from tests that don't always have a fs_info
+		 * available, but we know that nodesize is 4096
+		 */
+		len = 4096;
+	} else {
+		len = fs_info->tree_root->nodesize;
+	}
+	num_pages = num_extent_pages(0, len);
+
+	eb = __alloc_extent_buffer(fs_info, start, len);
+	if (!eb)
+		return NULL;
+
+	for (i = 0; i < num_pages; i++) {
+		eb->pages[i] = alloc_page(GFP_NOFS);
+		if (!eb->pages[i])
+			goto err;
+	}
+	set_extent_buffer_uptodate(eb);
+	btrfs_set_header_nritems(eb, 0);
+	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+	return eb;
+err:
+	for (; i > 0; i--)
+		__free_page(eb->pages[i - 1]);
+	__free_extent_buffer(eb);
+	return NULL;
+}
+
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+	int refs;
+	/* the ref bit is tricky.  We have to make sure it is set
+	 * if we have the buffer dirty.   Otherwise the
+	 * code to free a buffer can end up dropping a dirty
+	 * page
+	 *
+	 * Once the ref bit is set, it won't go away while the
+	 * buffer is dirty or in writeback, and it also won't
+	 * go away while we have the reference count on the
+	 * eb bumped.
+	 *
+	 * We can't just set the ref bit without bumping the
+	 * ref on the eb because free_extent_buffer might
+	 * see the ref bit and try to clear it.  If this happens
+	 * free_extent_buffer might end up dropping our original
+	 * ref by mistake and freeing the page before we are able
+	 * to add one more ref.
+	 *
+	 * So bump the ref count first, then set the bit.  If someone
+	 * beat us to it, drop the ref we added.
+	 */
+	refs = atomic_read(&eb->refs);
+	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		return;
+
+	spin_lock(&eb->refs_lock);
+	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		atomic_inc(&eb->refs);
+	spin_unlock(&eb->refs_lock);
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+		struct page *accessed)
+{
+	unsigned long num_pages, i;
+
+	check_buffer_tree_ref(eb);
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = eb->pages[i];
+
+		if (p != accessed)
+			mark_page_accessed(p);
+	}
+}
+
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+					 u64 start)
+{
+	struct extent_buffer *eb;
+
+	rcu_read_lock();
+	eb = radix_tree_lookup(&fs_info->buffer_radix,
+			       start >> PAGE_CACHE_SHIFT);
+	if (eb && atomic_inc_not_zero(&eb->refs)) {
+		rcu_read_unlock();
+		/*
+		 * Lock our eb's refs_lock to avoid races with
+		 * free_extent_buffer. When we get our eb it might be flagged
+		 * with EXTENT_BUFFER_STALE and another task running
+		 * free_extent_buffer might have seen that flag set,
+		 * eb->refs == 2, that the buffer isn't under IO (dirty and
+		 * writeback flags not set) and it's still in the tree (flag
+		 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
+		 * of decrementing the extent buffer's reference count twice.
+		 * So here we could race and increment the eb's reference count,
+		 * clear its stale flag, mark it as dirty and drop our reference
+		 * before the other task finishes executing free_extent_buffer,
+		 * which would later result in an attempt to free an extent
+		 * buffer that is dirty.
+		 */
+		if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+			spin_lock(&eb->refs_lock);
+			spin_unlock(&eb->refs_lock);
+		}
+		mark_extent_buffer_accessed(eb, NULL);
+		return eb;
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+					       u64 start)
+{
+	struct extent_buffer *eb, *exists = NULL;
+	int ret;
+
+	eb = find_extent_buffer(fs_info, start);
+	if (eb)
+		return eb;
+	eb = alloc_dummy_extent_buffer(fs_info, start);
+	if (!eb)
+		return NULL;
+	eb->fs_info = fs_info;
+again:
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		goto free_eb;
+	spin_lock(&fs_info->buffer_lock);
+	ret = radix_tree_insert(&fs_info->buffer_radix,
+				start >> PAGE_CACHE_SHIFT, eb);
+	spin_unlock(&fs_info->buffer_lock);
+	radix_tree_preload_end();
+	if (ret == -EEXIST) {
+		exists = find_extent_buffer(fs_info, start);
+		if (exists)
+			goto free_eb;
+		else
+			goto again;
+	}
+	check_buffer_tree_ref(eb);
+	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+	/*
+	 * We will free dummy extent buffer's if they come into
+	 * free_extent_buffer with a ref count of 2, but if we are using this we
+	 * want the buffers to stay in memory until we're done with them, so
+	 * bump the ref count again.
+	 */
+	atomic_inc(&eb->refs);
+	return eb;
+free_eb:
+	btrfs_release_extent_buffer(eb);
+	return exists;
+}
+#endif
+
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+					  u64 start)
+{
+	unsigned long len = fs_info->tree_root->nodesize;
+	unsigned long num_pages = num_extent_pages(start, len);
+	unsigned long i;
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	struct extent_buffer *eb;
+	struct extent_buffer *exists = NULL;
+	struct page *p;
+	struct address_space *mapping = fs_info->btree_inode->i_mapping;
+	int uptodate = 1;
+	int ret;
+
+	eb = find_extent_buffer(fs_info, start);
+	if (eb)
+		return eb;
+
+	eb = __alloc_extent_buffer(fs_info, start, len);
+	if (!eb)
+		return NULL;
+
+	for (i = 0; i < num_pages; i++, index++) {
+		p = find_or_create_page(mapping, index, GFP_NOFS);
+		if (!p)
+			goto free_eb;
+
+		spin_lock(&mapping->private_lock);
+		if (PagePrivate(p)) {
+			/*
+			 * We could have already allocated an eb for this page
+			 * and attached one so lets see if we can get a ref on
+			 * the existing eb, and if we can we know it's good and
+			 * we can just return that one, else we know we can just
+			 * overwrite page->private.
+			 */
+			exists = (struct extent_buffer *)p->private;
+			if (atomic_inc_not_zero(&exists->refs)) {
+				spin_unlock(&mapping->private_lock);
+				unlock_page(p);
+				page_cache_release(p);
+				mark_extent_buffer_accessed(exists, p);
+				goto free_eb;
+			}
+			exists = NULL;
+
+			/*
+			 * Do this so attach doesn't complain and we need to
+			 * drop the ref the old guy had.
+			 */
+			ClearPagePrivate(p);
+			WARN_ON(PageDirty(p));
+			page_cache_release(p);
+		}
+		attach_extent_buffer_page(eb, p);
+		spin_unlock(&mapping->private_lock);
+		WARN_ON(PageDirty(p));
+		eb->pages[i] = p;
+		if (!PageUptodate(p))
+			uptodate = 0;
+
+		/*
+		 * see below about how we avoid a nasty race with release page
+		 * and why we unlock later
+		 */
+	}
+	if (uptodate)
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+again:
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		goto free_eb;
+
+	spin_lock(&fs_info->buffer_lock);
+	ret = radix_tree_insert(&fs_info->buffer_radix,
+				start >> PAGE_CACHE_SHIFT, eb);
+	spin_unlock(&fs_info->buffer_lock);
+	radix_tree_preload_end();
+	if (ret == -EEXIST) {
+		exists = find_extent_buffer(fs_info, start);
+		if (exists)
+			goto free_eb;
+		else
+			goto again;
+	}
+	/* add one reference for the tree */
+	check_buffer_tree_ref(eb);
+	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+	/*
+	 * there is a race where release page may have
+	 * tried to find this extent buffer in the radix
+	 * but failed.  It will tell the VM it is safe to
+	 * reclaim the, and it will clear the page private bit.
+	 * We must make sure to set the page private bit properly
+	 * after the extent buffer is in the radix tree so
+	 * it doesn't get lost
+	 */
+	SetPageChecked(eb->pages[0]);
+	for (i = 1; i < num_pages; i++) {
+		p = eb->pages[i];
+		ClearPageChecked(p);
+		unlock_page(p);
+	}
+	unlock_page(eb->pages[0]);
+	return eb;
+
+free_eb:
+	WARN_ON(!atomic_dec_and_test(&eb->refs));
+	for (i = 0; i < num_pages; i++) {
+		if (eb->pages[i])
+			unlock_page(eb->pages[i]);
+	}
+
+	btrfs_release_extent_buffer(eb);
+	return exists;
+}
+
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+	struct extent_buffer *eb =
+			container_of(head, struct extent_buffer, rcu_head);
+
+	__free_extent_buffer(eb);
+}
+
+/* Expects to have eb->eb_lock already held */
+static int release_extent_buffer(struct extent_buffer *eb)
+{
+	WARN_ON(atomic_read(&eb->refs) == 0);
+	if (atomic_dec_and_test(&eb->refs)) {
+		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
+			struct btrfs_fs_info *fs_info = eb->fs_info;
+
+			spin_unlock(&eb->refs_lock);
+
+			spin_lock(&fs_info->buffer_lock);
+			radix_tree_delete(&fs_info->buffer_radix,
+					  eb->start >> PAGE_CACHE_SHIFT);
+			spin_unlock(&fs_info->buffer_lock);
+		} else {
+			spin_unlock(&eb->refs_lock);
+		}
+
+		/* Should be safe to release our pages at this point */
+		btrfs_release_extent_buffer_page(eb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+		if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
+			__free_extent_buffer(eb);
+			return 1;
+		}
+#endif
+		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+		return 1;
+	}
+	spin_unlock(&eb->refs_lock);
+
+	return 0;
+}
+
+void free_extent_buffer(struct extent_buffer *eb)
+{
+	int refs;
+	int old;
+	if (!eb)
+		return;
+
+	while (1) {
+		refs = atomic_read(&eb->refs);
+		if (refs <= 3)
+			break;
+		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+		if (old == refs)
+			return;
+	}
+
+	spin_lock(&eb->refs_lock);
+	if (atomic_read(&eb->refs) == 2 &&
+	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+		atomic_dec(&eb->refs);
+
+	if (atomic_read(&eb->refs) == 2 &&
+	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+	    !extent_buffer_under_io(eb) &&
+	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		atomic_dec(&eb->refs);
+
+	/*
+	 * I know this is terrible, but it's temporary until we stop tracking
+	 * the uptodate bits and such for the extent buffers.
+	 */
+	release_extent_buffer(eb);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+	if (!eb)
+		return;
+
+	spin_lock(&eb->refs_lock);
+	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		atomic_dec(&eb->refs);
+	release_extent_buffer(eb);
+}
+
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
+{
+	unsigned long i;
+	unsigned long num_pages;
+	struct page *page;
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+
+	for (i = 0; i < num_pages; i++) {
+		page = eb->pages[i];
+		if (!PageDirty(page))
+			continue;
+
+		lock_page(page);
+		WARN_ON(!PagePrivate(page));
+
+		clear_page_dirty_for_io(page);
+		spin_lock_irq(&page->mapping->tree_lock);
+		if (!PageDirty(page)) {
+			radix_tree_tag_clear(&page->mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_DIRTY);
+		}
+		spin_unlock_irq(&page->mapping->tree_lock);
+		ClearPageError(page);
+		unlock_page(page);
+	}
+	WARN_ON(atomic_read(&eb->refs) == 0);
+}
+
+int set_extent_buffer_dirty(struct extent_buffer *eb)
+{
+	unsigned long i;
+	unsigned long num_pages;
+	int was_dirty = 0;
+
+	check_buffer_tree_ref(eb);
+
+	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	WARN_ON(atomic_read(&eb->refs) == 0);
+	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
+	for (i = 0; i < num_pages; i++)
+		set_page_dirty(eb->pages[i]);
+	return was_dirty;
+}
+
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
+{
+	unsigned long i;
+	struct page *page;
+	unsigned long num_pages;
+
+	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		page = eb->pages[i];
+		if (page)
+			ClearPageUptodate(page);
+	}
+	return 0;
+}
+
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
+{
+	unsigned long i;
+	struct page *page;
+	unsigned long num_pages;
+
+	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		page = eb->pages[i];
+		SetPageUptodate(page);
+	}
+	return 0;
+}
+
+int extent_buffer_uptodate(struct extent_buffer *eb)
+{
+	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+}
+
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+			     struct extent_buffer *eb, u64 start, int wait,
+			     get_extent_t *get_extent, int mirror_num)
+{
+	unsigned long i;
+	unsigned long start_i;
+	struct page *page;
+	int err;
+	int ret = 0;
+	int locked_pages = 0;
+	int all_uptodate = 1;
+	unsigned long num_pages;
+	unsigned long num_reads = 0;
+	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
+
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+		return 0;
+
+	if (start) {
+		WARN_ON(start < eb->start);
+		start_i = (start >> PAGE_CACHE_SHIFT) -
+			(eb->start >> PAGE_CACHE_SHIFT);
+	} else {
+		start_i = 0;
+	}
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = start_i; i < num_pages; i++) {
+		page = eb->pages[i];
+		if (wait == WAIT_NONE) {
+			if (!trylock_page(page))
+				goto unlock_exit;
+		} else {
+			lock_page(page);
+		}
+		locked_pages++;
+		if (!PageUptodate(page)) {
+			num_reads++;
+			all_uptodate = 0;
+		}
+	}
+	if (all_uptodate) {
+		if (start_i == 0)
+			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+		goto unlock_exit;
+	}
+
+	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+	eb->read_mirror = 0;
+	atomic_set(&eb->io_pages, num_reads);
+	for (i = start_i; i < num_pages; i++) {
+		page = eb->pages[i];
+		if (!PageUptodate(page)) {
+			ClearPageError(page);
+			err = __extent_read_full_page(tree, page,
+						      get_extent, &bio,
+						      mirror_num, &bio_flags,
+						      READ | REQ_META);
+			if (err)
+				ret = err;
+		} else {
+			unlock_page(page);
+		}
+	}
+
+	if (bio) {
+		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
+				     bio_flags);
+		if (err)
+			return err;
+	}
+
+	if (ret || wait != WAIT_COMPLETE)
+		return ret;
+
+	for (i = start_i; i < num_pages; i++) {
+		page = eb->pages[i];
+		wait_on_page_locked(page);
+		if (!PageUptodate(page))
+			ret = -EIO;
+	}
+
+	return ret;
+
+unlock_exit:
+	i = start_i;
+	while (locked_pages > 0) {
+		page = eb->pages[i];
+		i++;
+		unlock_page(page);
+		locked_pages--;
+	}
+	return ret;
+}
+
+void read_extent_buffer(struct extent_buffer *eb, void *dstv,
+			unsigned long start,
+			unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *dst = (char *)dstv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = eb->pages[i];
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		kaddr = page_address(page);
+		memcpy(dst, kaddr + offset, cur);
+
+		dst += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
+			unsigned long start,
+			unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char __user *dst = (char __user *)dstv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = eb->pages[i];
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		kaddr = page_address(page);
+		if (copy_to_user(dst, kaddr + offset, cur)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		dst += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+
+	return ret;
+}
+
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
+			       unsigned long min_len, char **map,
+			       unsigned long *map_start,
+			       unsigned long *map_len)
+{
+	size_t offset = start & (PAGE_CACHE_SIZE - 1);
+	char *kaddr;
+	struct page *p;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	unsigned long end_i = (start_offset + start + min_len - 1) >>
+		PAGE_CACHE_SHIFT;
+
+	if (i != end_i)
+		return -EINVAL;
+
+	if (i == 0) {
+		offset = start_offset;
+		*map_start = 0;
+	} else {
+		offset = 0;
+		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+	}
+
+	if (start + min_len > eb->len) {
+		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+		       "wanted %lu %lu\n",
+		       eb->start, eb->len, start, min_len);
+		return -EINVAL;
+	}
+
+	p = eb->pages[i];
+	kaddr = page_address(p);
+	*map = kaddr + offset;
+	*map_len = PAGE_CACHE_SIZE - offset;
+	return 0;
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *ptr = (char *)ptrv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = eb->pages[i];
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+
+		kaddr = page_address(page);
+		ret = memcmp(ptr, kaddr + offset, cur);
+		if (ret)
+			break;
+
+		ptr += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+	return ret;
+}
+
+void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+			 unsigned long start, unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *src = (char *)srcv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = eb->pages[i];
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, PAGE_CACHE_SIZE - offset);
+		kaddr = page_address(page);
+		memcpy(kaddr + offset, src, cur);
+
+		src += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+			  unsigned long start, unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = eb->pages[i];
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, PAGE_CACHE_SIZE - offset);
+		kaddr = page_address(page);
+		memset(kaddr + offset, c, cur);
+
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+			unsigned long dst_offset, unsigned long src_offset,
+			unsigned long len)
+{
+	u64 dst_len = dst->len;
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(src->len != dst_len);
+
+	offset = (start_offset + dst_offset) &
+		(PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = dst->pages[i];
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+
+		kaddr = page_address(page);
+		read_extent_buffer(src, kaddr + offset, src_offset, cur);
+
+		src_offset += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+
+static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
+{
+	unsigned long distance = (src > dst) ? src - dst : dst - src;
+	return distance < len;
+}
+
+static void copy_pages(struct page *dst_page, struct page *src_page,
+		       unsigned long dst_off, unsigned long src_off,
+		       unsigned long len)
+{
+	char *dst_kaddr = page_address(dst_page);
+	char *src_kaddr;
+	int must_memmove = 0;
+
+	if (dst_page != src_page) {
+		src_kaddr = page_address(src_page);
+	} else {
+		src_kaddr = dst_kaddr;
+		if (areas_overlap(src_off, dst_off, len))
+			must_memmove = 1;
+	}
+
+	if (must_memmove)
+		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+	else
+		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+}
+
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len)
+{
+	size_t cur;
+	size_t dst_off_in_page;
+	size_t src_off_in_page;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long dst_i;
+	unsigned long src_i;
+
+	if (src_offset + len > dst->len) {
+		printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
+		       "len %lu dst len %lu\n", src_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset + len > dst->len) {
+		printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
+		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
+		BUG_ON(1);
+	}
+
+	while (len > 0) {
+		dst_off_in_page = (start_offset + dst_offset) &
+			(PAGE_CACHE_SIZE - 1);
+		src_off_in_page = (start_offset + src_offset) &
+			(PAGE_CACHE_SIZE - 1);
+
+		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+
+		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+					       src_off_in_page));
+		cur = min_t(unsigned long, cur,
+			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+
+		copy_pages(dst->pages[dst_i], dst->pages[src_i],
+			   dst_off_in_page, src_off_in_page, cur);
+
+		src_offset += cur;
+		dst_offset += cur;
+		len -= cur;
+	}
+}
+
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len)
+{
+	size_t cur;
+	size_t dst_off_in_page;
+	size_t src_off_in_page;
+	unsigned long dst_end = dst_offset + len - 1;
+	unsigned long src_end = src_offset + len - 1;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long dst_i;
+	unsigned long src_i;
+
+	if (src_offset + len > dst->len) {
+		printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
+		       "len %lu len %lu\n", src_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset + len > dst->len) {
+		printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
+		       "len %lu len %lu\n", dst_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset < src_offset) {
+		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+		return;
+	}
+	while (len > 0) {
+		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
+		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+
+		dst_off_in_page = (start_offset + dst_end) &
+			(PAGE_CACHE_SIZE - 1);
+		src_off_in_page = (start_offset + src_end) &
+			(PAGE_CACHE_SIZE - 1);
+
+		cur = min_t(unsigned long, len, src_off_in_page + 1);
+		cur = min(cur, dst_off_in_page + 1);
+		copy_pages(dst->pages[dst_i], dst->pages[src_i],
+			   dst_off_in_page - cur + 1,
+			   src_off_in_page - cur + 1, cur);
+
+		dst_end -= cur;
+		src_end -= cur;
+		len -= cur;
+	}
+}
+
+int try_release_extent_buffer(struct page *page)
+{
+	struct extent_buffer *eb;
+
+	/*
+	 * We need to make sure noboody is attaching this page to an eb right
+	 * now.
+	 */
+	spin_lock(&page->mapping->private_lock);
+	if (!PagePrivate(page)) {
+		spin_unlock(&page->mapping->private_lock);
+		return 1;
+	}
+
+	eb = (struct extent_buffer *)page->private;
+	BUG_ON(!eb);
+
+	/*
+	 * This is a little awful but should be ok, we need to make sure that
+	 * the eb doesn't disappear out from under us while we're looking at
+	 * this page.
+	 */
+	spin_lock(&eb->refs_lock);
+	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+		spin_unlock(&eb->refs_lock);
+		spin_unlock(&page->mapping->private_lock);
+		return 0;
+	}
+	spin_unlock(&page->mapping->private_lock);
+
+	/*
+	 * If tree ref isn't set then we know the ref on this eb is a real ref,
+	 * so just return, this page will likely be freed soon anyway.
+	 */
+	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+		spin_unlock(&eb->refs_lock);
+		return 0;
+	}
+
+	return release_extent_buffer(eb);
+}
diff --git a/kernel/fs/btrfs/extent_io.h b/kernel/fs/btrfs/extent_io.h
new file mode 100644
index 000000000..c668f3689
--- /dev/null
+++ b/kernel/fs/btrfs/extent_io.h
@@ -0,0 +1,382 @@
+#ifndef __EXTENTIO__
+#define __EXTENTIO__
+
+#include <linux/rbtree.h>
+
+/* bits for the extent state */
+#define EXTENT_DIRTY		(1U << 0)
+#define EXTENT_WRITEBACK	(1U << 1)
+#define EXTENT_UPTODATE		(1U << 2)
+#define EXTENT_LOCKED		(1U << 3)
+#define EXTENT_NEW		(1U << 4)
+#define EXTENT_DELALLOC		(1U << 5)
+#define EXTENT_DEFRAG		(1U << 6)
+#define EXTENT_BOUNDARY		(1U << 9)
+#define EXTENT_NODATASUM	(1U << 10)
+#define EXTENT_DO_ACCOUNTING	(1U << 11)
+#define EXTENT_FIRST_DELALLOC	(1U << 12)
+#define EXTENT_NEED_WAIT	(1U << 13)
+#define EXTENT_DAMAGED		(1U << 14)
+#define EXTENT_NORESERVE	(1U << 15)
+#define EXTENT_IOBITS		(EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS		(EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+
+/*
+ * flags for bio submission. The high bits indicate the compression
+ * type for this bio
+ */
+#define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
+#define EXTENT_BIO_PARENT_LOCKED 4
+#define EXTENT_BIO_FLAG_SHIFT 16
+
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_DIRTY 2
+#define EXTENT_BUFFER_CORRUPT 3
+#define EXTENT_BUFFER_READAHEAD 4	/* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
+#define EXTENT_BUFFER_WRITEBACK 7
+#define EXTENT_BUFFER_READ_ERR 8        /* read IO error */
+#define EXTENT_BUFFER_DUMMY 9
+#define EXTENT_BUFFER_IN_TREE 10
+#define EXTENT_BUFFER_WRITE_ERR 11    /* write IO error */
+
+/* these are flags for extent_clear_unlock_delalloc */
+#define PAGE_UNLOCK		(1 << 0)
+#define PAGE_CLEAR_DIRTY	(1 << 1)
+#define PAGE_SET_WRITEBACK	(1 << 2)
+#define PAGE_END_WRITEBACK	(1 << 3)
+#define PAGE_SET_PRIVATE2	(1 << 4)
+#define PAGE_SET_ERROR		(1 << 5)
+
+/*
+ * page->private values.  Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+#define EXTENT_PAGE_PRIVATE 1
+
+struct extent_state;
+struct btrfs_root;
+struct btrfs_io_bio;
+
+typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
+				       struct bio *bio, int mirror_num,
+				       unsigned long bio_flags, u64 bio_offset);
+struct extent_io_ops {
+	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+			     u64 start, u64 end, int *page_started,
+			     unsigned long *nr_written);
+	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
+	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+	extent_submit_bio_hook_t *submit_bio_hook;
+	int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
+			      size_t size, struct bio *bio,
+			      unsigned long bio_flags);
+	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
+	int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
+				    struct page *page, u64 start, u64 end,
+				    int mirror);
+	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+				      struct extent_state *state, int uptodate);
+	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+			     unsigned *bits);
+	void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+			       unsigned *bits);
+	void (*merge_extent_hook)(struct inode *inode,
+				  struct extent_state *new,
+				  struct extent_state *other);
+	void (*split_extent_hook)(struct inode *inode,
+				  struct extent_state *orig, u64 split);
+};
+
+struct extent_io_tree {
+	struct rb_root state;
+	struct address_space *mapping;
+	u64 dirty_bytes;
+	int track_uptodate;
+	spinlock_t lock;
+	const struct extent_io_ops *ops;
+};
+
+struct extent_state {
+	u64 start;
+	u64 end; /* inclusive */
+	struct rb_node rb_node;
+
+	/* ADD NEW ELEMENTS AFTER THIS */
+	wait_queue_head_t wq;
+	atomic_t refs;
+	unsigned state;
+
+	/* for use by the FS */
+	u64 private;
+
+#ifdef CONFIG_BTRFS_DEBUG
+	struct list_head leak_list;
+#endif
+};
+
+#define INLINE_EXTENT_BUFFER_PAGES 16
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
+struct extent_buffer {
+	u64 start;
+	unsigned long len;
+	unsigned long bflags;
+	struct btrfs_fs_info *fs_info;
+	spinlock_t refs_lock;
+	atomic_t refs;
+	atomic_t io_pages;
+	int read_mirror;
+	struct rcu_head rcu_head;
+	pid_t lock_owner;
+
+	/* count of read lock holders on the extent buffer */
+	atomic_t write_locks;
+	atomic_t read_locks;
+	atomic_t blocking_writers;
+	atomic_t blocking_readers;
+	atomic_t spinning_readers;
+	atomic_t spinning_writers;
+	short lock_nested;
+	/* >= 0 if eb belongs to a log tree, -1 otherwise */
+	short log_index;
+
+	/* protects write locks */
+	rwlock_t lock;
+
+	/* readers use lock_wq while they wait for the write
+	 * lock holders to unlock
+	 */
+	wait_queue_head_t write_lock_wq;
+
+	/* writers use read_lock_wq while they wait for readers
+	 * to unlock
+	 */
+	wait_queue_head_t read_lock_wq;
+	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+#ifdef CONFIG_BTRFS_DEBUG
+	struct list_head leak_list;
+#endif
+};
+
+static inline void extent_set_compress_type(unsigned long *bio_flags,
+					    int compress_type)
+{
+	*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
+}
+
+static inline int extent_compress_type(unsigned long bio_flags)
+{
+	return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
+}
+
+struct extent_map_tree;
+
+typedef struct extent_map *(get_extent_t)(struct inode *inode,
+					  struct page *page,
+					  size_t pg_offset,
+					  u64 start, u64 len,
+					  int create);
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+			 struct address_space *mapping);
+int try_release_extent_mapping(struct extent_map_tree *map,
+			       struct extent_io_tree *tree, struct page *page,
+			       gfp_t mask);
+int try_release_extent_buffer(struct page *page);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		     unsigned bits, struct extent_state **cached);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent, int mirror_num);
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+				 get_extent_t *get_extent, int mirror_num);
+int __init extent_io_init(void);
+void extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+		     u64 *start, u64 search_end,
+		     u64 max_bytes, unsigned bits, int contig);
+
+void free_extent_state(struct extent_state *state);
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   unsigned bits, int filled,
+		   struct extent_state *cached_state);
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		      unsigned bits, gfp_t mask);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     unsigned bits, int wake, int delete,
+		     struct extent_state **cached, gfp_t mask);
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		    unsigned bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   unsigned bits, u64 *failed_start,
+		   struct extent_state **cached_state, gfp_t mask);
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			  struct extent_state **cached_state, gfp_t mask);
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+		   gfp_t mask);
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask);
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		       unsigned bits, unsigned clear_bits,
+		       struct extent_state **cached_state, gfp_t mask);
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+			struct extent_state **cached_state, gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+		      struct extent_state **cached_state, gfp_t mask);
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, unsigned bits,
+			  struct extent_state **cached_state);
+int extent_invalidatepage(struct extent_io_tree *tree,
+			  struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc);
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+			      u64 start, u64 end, get_extent_t *get_extent,
+			      int mode);
+int extent_writepages(struct extent_io_tree *tree,
+		      struct address_space *mapping,
+		      get_extent_t *get_extent,
+		      struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+			    struct writeback_control *wbc);
+int extent_readpages(struct extent_io_tree *tree,
+		     struct address_space *mapping,
+		     struct list_head *pages, unsigned nr_pages,
+		     get_extent_t get_extent);
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len, get_extent_t *get_extent);
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
+void set_page_extent_mapped(struct page *page);
+
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+					  u64 start);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+		u64 start);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+					 u64 start);
+void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
+#define WAIT_NONE	0
+#define WAIT_COMPLETE	1
+#define WAIT_PAGE_LOCK	2
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+			     struct extent_buffer *eb, u64 start, int wait,
+			     get_extent_t *get_extent, int mirror_num);
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+		(start >> PAGE_CACHE_SHIFT);
+}
+
+static inline void extent_buffer_get(struct extent_buffer *eb)
+{
+	atomic_inc(&eb->refs);
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len);
+void read_extent_buffer(struct extent_buffer *eb, void *dst,
+			unsigned long start,
+			unsigned long len);
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
+			       unsigned long start,
+			       unsigned long len);
+void write_extent_buffer(struct extent_buffer *eb, const void *src,
+			 unsigned long start, unsigned long len);
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+			unsigned long dst_offset, unsigned long src_offset,
+			unsigned long len);
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len);
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len);
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+			  unsigned long start, unsigned long len);
+void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(struct extent_buffer *eb);
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+		      unsigned long min_len, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len);
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+				 struct page *locked_page,
+				 unsigned bits_to_clear,
+				 unsigned long page_ops);
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+		gfp_t gfp_flags);
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
+
+struct btrfs_fs_info;
+
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+		      struct page *page, unsigned int pg_offset,
+		      int mirror_num);
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+		     unsigned int pg_offset);
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+			 int mirror_num);
+
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data.  This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors.  If another mirror has good data, the page is set up to date
+ * and things continue.  If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	unsigned long bio_flags;
+	int this_mirror;
+	int failed_mirror;
+	int in_validation;
+};
+
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+				struct io_failure_record **failrec_ret);
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+			   struct io_failure_record *failrec, int fail_mirror);
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+				    struct io_failure_record *failrec,
+				    struct page *page, int pg_offset, int icsum,
+				    bio_end_io_t *endio_func, void *data);
+int free_io_failure(struct inode *inode, struct io_failure_record *rec);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+noinline u64 find_lock_delalloc_range(struct inode *inode,
+				      struct extent_io_tree *tree,
+				      struct page *locked_page, u64 *start,
+				      u64 *end, u64 max_bytes);
+#endif
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+					       u64 start);
+#endif
diff --git a/kernel/fs/btrfs/extent_map.c b/kernel/fs/btrfs/extent_map.c
new file mode 100644
index 000000000..6a98bddd8
--- /dev/null
+++ b/kernel/fs/btrfs/extent_map.c
@@ -0,0 +1,455 @@
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include "ctree.h"
+#include "extent_map.h"
+
+
+static struct kmem_cache *extent_map_cache;
+
+int __init extent_map_init(void)
+{
+	extent_map_cache = kmem_cache_create("btrfs_extent_map",
+			sizeof(struct extent_map), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!extent_map_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void extent_map_exit(void)
+{
+	if (extent_map_cache)
+		kmem_cache_destroy(extent_map_cache);
+}
+
+/**
+ * extent_map_tree_init - initialize extent map tree
+ * @tree:		tree to initialize
+ *
+ * Initialize the extent tree @tree.  Should be called for each new inode
+ * or other user of the extent_map interface.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree)
+{
+	tree->map = RB_ROOT;
+	INIT_LIST_HEAD(&tree->modified_extents);
+	rwlock_init(&tree->lock);
+}
+
+/**
+ * alloc_extent_map - allocate new extent map structure
+ *
+ * Allocate a new extent_map structure.  The new structure is
+ * returned with a reference count of one and needs to be
+ * freed using free_extent_map()
+ */
+struct extent_map *alloc_extent_map(void)
+{
+	struct extent_map *em;
+	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
+	if (!em)
+		return NULL;
+	RB_CLEAR_NODE(&em->rb_node);
+	em->flags = 0;
+	em->compress_type = BTRFS_COMPRESS_NONE;
+	em->generation = 0;
+	atomic_set(&em->refs, 1);
+	INIT_LIST_HEAD(&em->list);
+	return em;
+}
+
+/**
+ * free_extent_map - drop reference count of an extent_map
+ * @em:		extent map beeing releasead
+ *
+ * Drops the reference out on @em by one and free the structure
+ * if the reference count hits zero.
+ */
+void free_extent_map(struct extent_map *em)
+{
+	if (!em)
+		return;
+	WARN_ON(atomic_read(&em->refs) == 0);
+	if (atomic_dec_and_test(&em->refs)) {
+		WARN_ON(extent_map_in_tree(em));
+		WARN_ON(!list_empty(&em->list));
+		if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+			kfree(em->bdev);
+		kmem_cache_free(extent_map_cache, em);
+	}
+}
+
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+	if (start + len < start)
+		return (u64)-1;
+	return start + len;
+}
+
+static int tree_insert(struct rb_root *root, struct extent_map *em)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_map *entry = NULL;
+	struct rb_node *orig_parent = NULL;
+	u64 end = range_end(em->start, em->len);
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct extent_map, rb_node);
+
+		if (em->start < entry->start)
+			p = &(*p)->rb_left;
+		else if (em->start >= extent_map_end(entry))
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	orig_parent = parent;
+	while (parent && em->start >= extent_map_end(entry)) {
+		parent = rb_next(parent);
+		entry = rb_entry(parent, struct extent_map, rb_node);
+	}
+	if (parent)
+		if (end > entry->start && em->start < extent_map_end(entry))
+			return -EEXIST;
+
+	parent = orig_parent;
+	entry = rb_entry(parent, struct extent_map, rb_node);
+	while (parent && em->start < entry->start) {
+		parent = rb_prev(parent);
+		entry = rb_entry(parent, struct extent_map, rb_node);
+	}
+	if (parent)
+		if (end > entry->start && em->start < extent_map_end(entry))
+			return -EEXIST;
+
+	rb_link_node(&em->rb_node, orig_parent, p);
+	rb_insert_color(&em->rb_node, root);
+	return 0;
+}
+
+/*
+ * search through the tree for an extent_map with a given offset.  If
+ * it can't be found, try to find some neighboring extents
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+				     struct rb_node **prev_ret,
+				     struct rb_node **next_ret)
+{
+	struct rb_node *n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct rb_node *orig_prev = NULL;
+	struct extent_map *entry;
+	struct extent_map *prev_entry = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct extent_map, rb_node);
+		prev = n;
+		prev_entry = entry;
+
+		if (offset < entry->start)
+			n = n->rb_left;
+		else if (offset >= extent_map_end(entry))
+			n = n->rb_right;
+		else
+			return n;
+	}
+
+	if (prev_ret) {
+		orig_prev = prev;
+		while (prev && offset >= extent_map_end(prev_entry)) {
+			prev = rb_next(prev);
+			prev_entry = rb_entry(prev, struct extent_map, rb_node);
+		}
+		*prev_ret = prev;
+		prev = orig_prev;
+	}
+
+	if (next_ret) {
+		prev_entry = rb_entry(prev, struct extent_map, rb_node);
+		while (prev && offset < prev_entry->start) {
+			prev = rb_prev(prev);
+			prev_entry = rb_entry(prev, struct extent_map, rb_node);
+		}
+		*next_ret = prev;
+	}
+	return NULL;
+}
+
+/* check to see if two extent_map structs are adjacent and safe to merge */
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
+		return 0;
+
+	/*
+	 * don't merge compressed extents, we need to know their
+	 * actual size
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+		return 0;
+
+	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+		return 0;
+
+	/*
+	 * We don't want to merge stuff that hasn't been written to the log yet
+	 * since it may not reflect exactly what is on disk, and that would be
+	 * bad.
+	 */
+	if (!list_empty(&prev->list) || !list_empty(&next->list))
+		return 0;
+
+	if (extent_map_end(prev) == next->start &&
+	    prev->flags == next->flags &&
+	    prev->bdev == next->bdev &&
+	    ((next->block_start == EXTENT_MAP_HOLE &&
+	      prev->block_start == EXTENT_MAP_HOLE) ||
+	     (next->block_start == EXTENT_MAP_INLINE &&
+	      prev->block_start == EXTENT_MAP_INLINE) ||
+	     (next->block_start == EXTENT_MAP_DELALLOC &&
+	      prev->block_start == EXTENT_MAP_DELALLOC) ||
+	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
+	      next->block_start == extent_map_block_end(prev)))) {
+		return 1;
+	}
+	return 0;
+}
+
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
+{
+	struct extent_map *merge = NULL;
+	struct rb_node *rb;
+
+	if (em->start != 0) {
+		rb = rb_prev(&em->rb_node);
+		if (rb)
+			merge = rb_entry(rb, struct extent_map, rb_node);
+		if (rb && mergable_maps(merge, em)) {
+			em->start = merge->start;
+			em->orig_start = merge->orig_start;
+			em->len += merge->len;
+			em->block_len += merge->block_len;
+			em->block_start = merge->block_start;
+			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+			em->mod_start = merge->mod_start;
+			em->generation = max(em->generation, merge->generation);
+
+			rb_erase(&merge->rb_node, &tree->map);
+			RB_CLEAR_NODE(&merge->rb_node);
+			free_extent_map(merge);
+		}
+	}
+
+	rb = rb_next(&em->rb_node);
+	if (rb)
+		merge = rb_entry(rb, struct extent_map, rb_node);
+	if (rb && mergable_maps(em, merge)) {
+		em->len += merge->len;
+		em->block_len += merge->block_len;
+		rb_erase(&merge->rb_node, &tree->map);
+		RB_CLEAR_NODE(&merge->rb_node);
+		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+		em->generation = max(em->generation, merge->generation);
+		free_extent_map(merge);
+	}
+}
+
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree:	tree to unpin the extent in
+ * @start:	logical offset in the file
+ * @len:	length of the extent
+ * @gen:	generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly.  Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+		       u64 gen)
+{
+	int ret = 0;
+	struct extent_map *em;
+	bool prealloc = false;
+
+	write_lock(&tree->lock);
+	em = lookup_extent_mapping(tree, start, len);
+
+	WARN_ON(!em || em->start != start);
+
+	if (!em)
+		goto out;
+
+	em->generation = gen;
+	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+		prealloc = true;
+		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+	}
+
+	try_merge_map(tree, em);
+
+	if (prealloc) {
+		em->mod_start = em->start;
+		em->mod_len = em->len;
+	}
+
+	free_extent_map(em);
+out:
+	write_unlock(&tree->lock);
+	return ret;
+
+}
+
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+	if (extent_map_in_tree(em))
+		try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+					struct extent_map *em,
+					int modified)
+{
+	atomic_inc(&em->refs);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (modified)
+		list_move(&em->list, &tree->modified_extents);
+	else
+		try_merge_map(tree, em);
+}
+
+/**
+ * add_extent_mapping - add new extent map to the extent tree
+ * @tree:	tree to insert new map in
+ * @em:		map to insert
+ *
+ * Insert @em into @tree or perform a simple forward/backward merge with
+ * existing mappings.  The extent_map struct passed in will be inserted
+ * into the tree directly, with an additional reference taken, or a
+ * reference dropped if the merge attempt was successful.
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+		       struct extent_map *em, int modified)
+{
+	int ret = 0;
+
+	ret = tree_insert(&tree->map, em);
+	if (ret)
+		goto out;
+
+	setup_extent_mapping(tree, em, modified);
+out:
+	return ret;
+}
+
+static struct extent_map *
+__lookup_extent_mapping(struct extent_map_tree *tree,
+			u64 start, u64 len, int strict)
+{
+	struct extent_map *em;
+	struct rb_node *rb_node;
+	struct rb_node *prev = NULL;
+	struct rb_node *next = NULL;
+	u64 end = range_end(start, len);
+
+	rb_node = __tree_search(&tree->map, start, &prev, &next);
+	if (!rb_node) {
+		if (prev)
+			rb_node = prev;
+		else if (next)
+			rb_node = next;
+		else
+			return NULL;
+	}
+
+	em = rb_entry(rb_node, struct extent_map, rb_node);
+
+	if (strict && !(end > em->start && start < extent_map_end(em)))
+		return NULL;
+
+	atomic_inc(&em->refs);
+	return em;
+}
+
+/**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree:	tree to lookup in
+ * @start:	byte offset to start the search
+ * @len:	length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.  There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len)
+{
+	return __lookup_extent_mapping(tree, start, len, 1);
+}
+
+/**
+ * search_extent_mapping - find a nearby extent map
+ * @tree:	tree to lookup in
+ * @start:	byte offset to start the search
+ * @len:	length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len)
+{
+	return __lookup_extent_mapping(tree, start, len, 0);
+}
+
+/**
+ * remove_extent_mapping - removes an extent_map from the extent tree
+ * @tree:	extent tree to remove from
+ * @em:		extent map beeing removed
+ *
+ * Removes @em from @tree.  No reference counts are dropped, and no checks
+ * are done to see if the range is in use
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+	int ret = 0;
+
+	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+	rb_erase(&em->rb_node, &tree->map);
+	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+		list_del_init(&em->list);
+	RB_CLEAR_NODE(&em->rb_node);
+	return ret;
+}
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+			    struct extent_map *cur,
+			    struct extent_map *new,
+			    int modified)
+{
+	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+	ASSERT(extent_map_in_tree(cur));
+	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+		list_del_init(&cur->list);
+	rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+	RB_CLEAR_NODE(&cur->rb_node);
+
+	setup_extent_mapping(tree, new, modified);
+}
diff --git a/kernel/fs/btrfs/extent_map.h b/kernel/fs/btrfs/extent_map.h
new file mode 100644
index 000000000..b2991fd85
--- /dev/null
+++ b/kernel/fs/btrfs/extent_map.h
@@ -0,0 +1,85 @@
+#ifndef __EXTENTMAP__
+#define __EXTENTMAP__
+
+#include <linux/rbtree.h>
+
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+#define EXTENT_MAP_DELALLOC ((u64)-1)
+
+/* bits for the flags field */
+#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
+#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
+#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
+
+struct extent_map {
+	struct rb_node rb_node;
+
+	/* all of these are in bytes */
+	u64 start;
+	u64 len;
+	u64 mod_start;
+	u64 mod_len;
+	u64 orig_start;
+	u64 orig_block_len;
+	u64 ram_bytes;
+	u64 block_start;
+	u64 block_len;
+	u64 generation;
+	unsigned long flags;
+	struct block_device *bdev;
+	atomic_t refs;
+	unsigned int compress_type;
+	struct list_head list;
+};
+
+struct extent_map_tree {
+	struct rb_root map;
+	struct list_head modified_extents;
+	rwlock_t lock;
+};
+
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+	return !RB_EMPTY_NODE(&em->rb_node);
+}
+
+static inline u64 extent_map_end(struct extent_map *em)
+{
+	if (em->start + em->len < em->start)
+		return (u64)-1;
+	return em->start + em->len;
+}
+
+static inline u64 extent_map_block_end(struct extent_map *em)
+{
+	if (em->block_start + em->block_len < em->block_start)
+		return (u64)-1;
+	return em->block_start + em->block_len;
+}
+
+void extent_map_tree_init(struct extent_map_tree *tree);
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len);
+int add_extent_mapping(struct extent_map_tree *tree,
+		       struct extent_map *em, int modified);
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+			    struct extent_map *cur,
+			    struct extent_map *new,
+			    int modified);
+
+struct extent_map *alloc_extent_map(void);
+void free_extent_map(struct extent_map *em);
+int __init extent_map_init(void);
+void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len);
+#endif
diff --git a/kernel/fs/btrfs/file-item.c b/kernel/fs/btrfs/file-item.c
new file mode 100644
index 000000000..58ece6558
--- /dev/null
+++ b/kernel/fs/btrfs/file-item.c
@@ -0,0 +1,953 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "print-tree.h"
+
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
+				   sizeof(struct btrfs_item) * 2) / \
+				  size) - 1))
+
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+				       PAGE_CACHE_SIZE))
+
+#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
+				   sizeof(struct btrfs_ordered_sum)) / \
+				   sizeof(u32) * (r)->sectorsize)
+
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding)
+{
+	int ret = 0;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_key file_key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	file_key.objectid = objectid;
+	file_key.offset = pos;
+	file_key.type = BTRFS_EXTENT_DATA_KEY;
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+				      sizeof(*item));
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret); /* Can't happen */
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
+			      struct btrfs_file_extent_item);
+	btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
+	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
+	btrfs_set_file_extent_offset(leaf, item, offset);
+	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+	btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+	btrfs_set_file_extent_generation(leaf, item, trans->transid);
+	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_compression(leaf, item, compression);
+	btrfs_set_file_extent_encryption(leaf, item, encryption);
+	btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static struct btrfs_csum_item *
+btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+		  struct btrfs_root *root,
+		  struct btrfs_path *path,
+		  u64 bytenr, int cow)
+{
+	int ret;
+	struct btrfs_key file_key;
+	struct btrfs_key found_key;
+	struct btrfs_csum_item *item;
+	struct extent_buffer *leaf;
+	u64 csum_offset = 0;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	int csums_in_item;
+
+	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	file_key.offset = bytenr;
+	file_key.type = BTRFS_EXTENT_CSUM_KEY;
+	ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
+	if (ret < 0)
+		goto fail;
+	leaf = path->nodes[0];
+	if (ret > 0) {
+		ret = 1;
+		if (path->slots[0] == 0)
+			goto fail;
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
+			goto fail;
+
+		csum_offset = (bytenr - found_key.offset) >>
+				root->fs_info->sb->s_blocksize_bits;
+		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+		csums_in_item /= csum_size;
+
+		if (csum_offset == csums_in_item) {
+			ret = -EFBIG;
+			goto fail;
+		} else if (csum_offset > csums_in_item) {
+			goto fail;
+		}
+	}
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+	item = (struct btrfs_csum_item *)((unsigned char *)item +
+					  csum_offset * csum_size);
+	return item;
+fail:
+	if (ret > 0)
+		ret = -ENOENT;
+	return ERR_PTR(ret);
+}
+
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid,
+			     u64 offset, int mod)
+{
+	int ret;
+	struct btrfs_key file_key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	file_key.objectid = objectid;
+	file_key.offset = offset;
+	file_key.type = BTRFS_EXTENT_DATA_KEY;
+	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+	return ret;
+}
+
+static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
+{
+	kfree(bio->csum_allocated);
+}
+
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+				   struct inode *inode, struct bio *bio,
+				   u64 logical_offset, u32 *dst, int dio)
+{
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+	struct btrfs_csum_item *item = NULL;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_path *path;
+	u8 *csum;
+	u64 offset = 0;
+	u64 item_start_offset = 0;
+	u64 item_last_offset = 0;
+	u64 disk_bytenr;
+	u32 diff;
+	int nblocks;
+	int bio_index = 0;
+	int count;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
+	if (!dst) {
+		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
+			btrfs_bio->csum_allocated = kmalloc_array(nblocks,
+					csum_size, GFP_NOFS);
+			if (!btrfs_bio->csum_allocated) {
+				btrfs_free_path(path);
+				return -ENOMEM;
+			}
+			btrfs_bio->csum = btrfs_bio->csum_allocated;
+			btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
+		} else {
+			btrfs_bio->csum = btrfs_bio->csum_inline;
+		}
+		csum = btrfs_bio->csum;
+	} else {
+		csum = (u8 *)dst;
+	}
+
+	if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
+		path->reada = 2;
+
+	WARN_ON(bio->bi_vcnt <= 0);
+
+	/*
+	 * the free space stuff is only read when it hasn't been
+	 * updated in the current transaction.  So, we can safely
+	 * read from the commit root and sidestep a nasty deadlock
+	 * between reading the free space cache and updating the csum tree.
+	 */
+	if (btrfs_is_free_space_inode(inode)) {
+		path->search_commit_root = 1;
+		path->skip_locking = 1;
+	}
+
+	disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
+	if (dio)
+		offset = logical_offset;
+	while (bio_index < bio->bi_vcnt) {
+		if (!dio)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
+					       (u32 *)csum, nblocks);
+		if (count)
+			goto found;
+
+		if (!item || disk_bytenr < item_start_offset ||
+		    disk_bytenr >= item_last_offset) {
+			struct btrfs_key found_key;
+			u32 item_size;
+
+			if (item)
+				btrfs_release_path(path);
+			item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+						 path, disk_bytenr, 0);
+			if (IS_ERR(item)) {
+				count = 1;
+				memset(csum, 0, csum_size);
+				if (BTRFS_I(inode)->root->root_key.objectid ==
+				    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+					set_extent_bits(io_tree, offset,
+						offset + bvec->bv_len - 1,
+						EXTENT_NODATASUM, GFP_NOFS);
+				} else {
+					btrfs_info(BTRFS_I(inode)->root->fs_info,
+						   "no csum found for inode %llu start %llu",
+					       btrfs_ino(inode), offset);
+				}
+				item = NULL;
+				btrfs_release_path(path);
+				goto found;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+
+			item_start_offset = found_key.offset;
+			item_size = btrfs_item_size_nr(path->nodes[0],
+						       path->slots[0]);
+			item_last_offset = item_start_offset +
+				(item_size / csum_size) *
+				root->sectorsize;
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_csum_item);
+		}
+		/*
+		 * this byte range must be able to fit inside
+		 * a single leaf so it will also fit inside a u32
+		 */
+		diff = disk_bytenr - item_start_offset;
+		diff = diff / root->sectorsize;
+		diff = diff * csum_size;
+		count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
+					    inode->i_sb->s_blocksize_bits);
+		read_extent_buffer(path->nodes[0], csum,
+				   ((unsigned long)item) + diff,
+				   csum_size * count);
+found:
+		csum += count * csum_size;
+		nblocks -= count;
+		bio_index += count;
+		while (count--) {
+			disk_bytenr += bvec->bv_len;
+			offset += bvec->bv_len;
+			bvec++;
+		}
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 offset)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
+}
+
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+			     struct list_head *list, int search_commit)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_csum_item *item;
+	LIST_HEAD(tmplist);
+	unsigned long offset;
+	int ret;
+	size_t size;
+	u64 csum_end;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+	ASSERT(IS_ALIGNED(start, root->sectorsize) &&
+	       IS_ALIGNED(end + 1, root->sectorsize));
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (search_commit) {
+		path->skip_locking = 1;
+		path->reada = 2;
+		path->search_commit_root = 1;
+	}
+
+	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key.offset = start;
+	key.type = BTRFS_EXTENT_CSUM_KEY;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto fail;
+	if (ret > 0 && path->slots[0] > 0) {
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+		if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+		    key.type == BTRFS_EXTENT_CSUM_KEY) {
+			offset = (start - key.offset) >>
+				 root->fs_info->sb->s_blocksize_bits;
+			if (offset * csum_size <
+			    btrfs_item_size_nr(leaf, path->slots[0] - 1))
+				path->slots[0]--;
+		}
+	}
+
+	while (start <= end) {
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto fail;
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+		    key.type != BTRFS_EXTENT_CSUM_KEY ||
+		    key.offset > end)
+			break;
+
+		if (key.offset > start)
+			start = key.offset;
+
+		size = btrfs_item_size_nr(leaf, path->slots[0]);
+		csum_end = key.offset + (size / csum_size) * root->sectorsize;
+		if (csum_end <= start) {
+			path->slots[0]++;
+			continue;
+		}
+
+		csum_end = min(csum_end, end + 1);
+		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				      struct btrfs_csum_item);
+		while (start < csum_end) {
+			size = min_t(size_t, csum_end - start,
+				     MAX_ORDERED_SUM_BYTES(root));
+			sums = kzalloc(btrfs_ordered_sum_size(root, size),
+				       GFP_NOFS);
+			if (!sums) {
+				ret = -ENOMEM;
+				goto fail;
+			}
+
+			sums->bytenr = start;
+			sums->len = (int)size;
+
+			offset = (start - key.offset) >>
+				root->fs_info->sb->s_blocksize_bits;
+			offset *= csum_size;
+			size >>= root->fs_info->sb->s_blocksize_bits;
+
+			read_extent_buffer(path->nodes[0],
+					   sums->sums,
+					   ((unsigned long)item) + offset,
+					   csum_size * size);
+
+			start += root->sectorsize * size;
+			list_add_tail(&sums->list, &tmplist);
+		}
+		path->slots[0]++;
+	}
+	ret = 0;
+fail:
+	while (ret < 0 && !list_empty(&tmplist)) {
+		sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
+		list_del(&sums->list);
+		kfree(sums);
+	}
+	list_splice_tail(&tmplist, list);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+		       struct bio *bio, u64 file_start, int contig)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_ordered_extent *ordered;
+	char *data;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	int bio_index = 0;
+	int index;
+	unsigned long total_bytes = 0;
+	unsigned long this_sum_bytes = 0;
+	u64 offset;
+
+	WARN_ON(bio->bi_vcnt <= 0);
+	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size),
+		       GFP_NOFS);
+	if (!sums)
+		return -ENOMEM;
+
+	sums->len = bio->bi_iter.bi_size;
+	INIT_LIST_HEAD(&sums->list);
+
+	if (contig)
+		offset = file_start;
+	else
+		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+	ordered = btrfs_lookup_ordered_extent(inode, offset);
+	BUG_ON(!ordered); /* Logic error */
+	sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
+	index = 0;
+
+	while (bio_index < bio->bi_vcnt) {
+		if (!contig)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+		if (offset >= ordered->file_offset + ordered->len ||
+		    offset < ordered->file_offset) {
+			unsigned long bytes_left;
+			sums->len = this_sum_bytes;
+			this_sum_bytes = 0;
+			btrfs_add_ordered_sum(inode, ordered, sums);
+			btrfs_put_ordered_extent(ordered);
+
+			bytes_left = bio->bi_iter.bi_size - total_bytes;
+
+			sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
+				       GFP_NOFS);
+			BUG_ON(!sums); /* -ENOMEM */
+			sums->len = bytes_left;
+			ordered = btrfs_lookup_ordered_extent(inode, offset);
+			BUG_ON(!ordered); /* Logic error */
+			sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) +
+				       total_bytes;
+			index = 0;
+		}
+
+		data = kmap_atomic(bvec->bv_page);
+		sums->sums[index] = ~(u32)0;
+		sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+						    sums->sums[index],
+						    bvec->bv_len);
+		kunmap_atomic(data);
+		btrfs_csum_final(sums->sums[index],
+				 (char *)(sums->sums + index));
+
+		bio_index++;
+		index++;
+		total_bytes += bvec->bv_len;
+		this_sum_bytes += bvec->bv_len;
+		offset += bvec->bv_len;
+		bvec++;
+	}
+	this_sum_bytes = 0;
+	btrfs_add_ordered_sum(inode, ordered, sums);
+	btrfs_put_ordered_extent(ordered);
+	return 0;
+}
+
+/*
+ * helper function for csum removal, this expects the
+ * key to describe the csum pointed to by the path, and it expects
+ * the csum to overlap the range [bytenr, len]
+ *
+ * The csum should not be entirely contained in the range and the
+ * range should not be entirely contained in the csum.
+ *
+ * This calls btrfs_truncate_item with the correct args based on the
+ * overlap, and fixes up the key as required.
+ */
+static noinline void truncate_one_csum(struct btrfs_root *root,
+				       struct btrfs_path *path,
+				       struct btrfs_key *key,
+				       u64 bytenr, u64 len)
+{
+	struct extent_buffer *leaf;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	u64 csum_end;
+	u64 end_byte = bytenr + len;
+	u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+
+	leaf = path->nodes[0];
+	csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+	csum_end <<= root->fs_info->sb->s_blocksize_bits;
+	csum_end += key->offset;
+
+	if (key->offset < bytenr && csum_end <= end_byte) {
+		/*
+		 *         [ bytenr - len ]
+		 *         [   ]
+		 *   [csum     ]
+		 *   A simple truncate off the end of the item
+		 */
+		u32 new_size = (bytenr - key->offset) >> blocksize_bits;
+		new_size *= csum_size;
+		btrfs_truncate_item(root, path, new_size, 1);
+	} else if (key->offset >= bytenr && csum_end > end_byte &&
+		   end_byte > key->offset) {
+		/*
+		 *         [ bytenr - len ]
+		 *                 [ ]
+		 *                 [csum     ]
+		 * we need to truncate from the beginning of the csum
+		 */
+		u32 new_size = (csum_end - end_byte) >> blocksize_bits;
+		new_size *= csum_size;
+
+		btrfs_truncate_item(root, path, new_size, 0);
+
+		key->offset = end_byte;
+		btrfs_set_item_key_safe(root->fs_info, path, key);
+	} else {
+		BUG();
+	}
+}
+
+/*
+ * deletes the csum items from the csum tree for a given
+ * range of bytes.
+ */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, u64 bytenr, u64 len)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	u64 end_byte = bytenr + len;
+	u64 csum_end;
+	struct extent_buffer *leaf;
+	int ret;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+
+	root = root->fs_info->csum_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while (1) {
+		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+		key.offset = end_byte - 1;
+		key.type = BTRFS_EXTENT_CSUM_KEY;
+
+		path->leave_spinning = 1;
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		} else if (ret < 0) {
+			break;
+		}
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+		if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+		    key.type != BTRFS_EXTENT_CSUM_KEY) {
+			break;
+		}
+
+		if (key.offset >= end_byte)
+			break;
+
+		csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+		csum_end <<= blocksize_bits;
+		csum_end += key.offset;
+
+		/* this csum ends before we start, we're done */
+		if (csum_end <= bytenr)
+			break;
+
+		/* delete the entire item, it is inside our range */
+		if (key.offset >= bytenr && csum_end <= end_byte) {
+			ret = btrfs_del_item(trans, root, path);
+			if (ret)
+				goto out;
+			if (key.offset == bytenr)
+				break;
+		} else if (key.offset < bytenr && csum_end > end_byte) {
+			unsigned long offset;
+			unsigned long shift_len;
+			unsigned long item_offset;
+			/*
+			 *        [ bytenr - len ]
+			 *     [csum                ]
+			 *
+			 * Our bytes are in the middle of the csum,
+			 * we need to split this item and insert a new one.
+			 *
+			 * But we can't drop the path because the
+			 * csum could change, get removed, extended etc.
+			 *
+			 * The trick here is the max size of a csum item leaves
+			 * enough room in the tree block for a single
+			 * item header.  So, we split the item in place,
+			 * adding a new header pointing to the existing
+			 * bytes.  Then we loop around again and we have
+			 * a nicely formed csum item that we can neatly
+			 * truncate.
+			 */
+			offset = (bytenr - key.offset) >> blocksize_bits;
+			offset *= csum_size;
+
+			shift_len = (len >> blocksize_bits) * csum_size;
+
+			item_offset = btrfs_item_ptr_offset(leaf,
+							    path->slots[0]);
+
+			memset_extent_buffer(leaf, 0, item_offset + offset,
+					     shift_len);
+			key.offset = bytenr;
+
+			/*
+			 * btrfs_split_item returns -EAGAIN when the
+			 * item changed size or key
+			 */
+			ret = btrfs_split_item(trans, root, path, &key, offset);
+			if (ret && ret != -EAGAIN) {
+				btrfs_abort_transaction(trans, root, ret);
+				goto out;
+			}
+
+			key.offset = end_byte - 1;
+		} else {
+			truncate_one_csum(root, path, &key, bytenr, len);
+			if (key.offset < bytenr)
+				break;
+		}
+		btrfs_release_path(path);
+	}
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_ordered_sum *sums)
+{
+	struct btrfs_key file_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	struct btrfs_csum_item *item;
+	struct btrfs_csum_item *item_end;
+	struct extent_buffer *leaf = NULL;
+	u64 next_offset;
+	u64 total_bytes = 0;
+	u64 csum_offset;
+	u64 bytenr;
+	u32 nritems;
+	u32 ins_size;
+	int index = 0;
+	int found_next;
+	int ret;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+again:
+	next_offset = (u64)-1;
+	found_next = 0;
+	bytenr = sums->bytenr + total_bytes;
+	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	file_key.offset = bytenr;
+	file_key.type = BTRFS_EXTENT_CSUM_KEY;
+
+	item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
+	if (!IS_ERR(item)) {
+		ret = 0;
+		leaf = path->nodes[0];
+		item_end = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_csum_item);
+		item_end = (struct btrfs_csum_item *)((char *)item_end +
+			   btrfs_item_size_nr(leaf, path->slots[0]));
+		goto found;
+	}
+	ret = PTR_ERR(item);
+	if (ret != -EFBIG && ret != -ENOENT)
+		goto fail_unlock;
+
+	if (ret == -EFBIG) {
+		u32 item_size;
+		/* we found one, but it isn't big enough yet */
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		if ((item_size / csum_size) >=
+		    MAX_CSUM_ITEMS(root, csum_size)) {
+			/* already at max size, make a new one */
+			goto insert;
+		}
+	} else {
+		int slot = path->slots[0] + 1;
+		/* we didn't find a csum item, insert one */
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		if (!nritems || (path->slots[0] >= nritems - 1)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 1)
+				found_next = 1;
+			if (ret != 0)
+				goto insert;
+			slot = path->slots[0];
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+		if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+		    found_key.type != BTRFS_EXTENT_CSUM_KEY) {
+			found_next = 1;
+			goto insert;
+		}
+		next_offset = found_key.offset;
+		found_next = 1;
+		goto insert;
+	}
+
+	/*
+	 * at this point, we know the tree has an item, but it isn't big
+	 * enough yet to put our csum in.  Grow it
+	 */
+	btrfs_release_path(path);
+	ret = btrfs_search_slot(trans, root, &file_key, path,
+				csum_size, 1);
+	if (ret < 0)
+		goto fail_unlock;
+
+	if (ret > 0) {
+		if (path->slots[0] == 0)
+			goto insert;
+		path->slots[0]--;
+	}
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	csum_offset = (bytenr - found_key.offset) >>
+			root->fs_info->sb->s_blocksize_bits;
+
+	if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+	    csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
+		goto insert;
+	}
+
+	if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
+	    csum_size) {
+		int extend_nr;
+		u64 tmp;
+		u32 diff;
+		u32 free_space;
+
+		if (btrfs_leaf_free_space(root, leaf) <
+				 sizeof(struct btrfs_item) + csum_size * 2)
+			goto insert;
+
+		free_space = btrfs_leaf_free_space(root, leaf) -
+					 sizeof(struct btrfs_item) - csum_size;
+		tmp = sums->len - total_bytes;
+		tmp >>= root->fs_info->sb->s_blocksize_bits;
+		WARN_ON(tmp < 1);
+
+		extend_nr = max_t(int, 1, (int)tmp);
+		diff = (csum_offset + extend_nr) * csum_size;
+		diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
+
+		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
+		diff = min(free_space, diff);
+		diff /= csum_size;
+		diff *= csum_size;
+
+		btrfs_extend_item(root, path, diff);
+		ret = 0;
+		goto csum;
+	}
+
+insert:
+	btrfs_release_path(path);
+	csum_offset = 0;
+	if (found_next) {
+		u64 tmp;
+
+		tmp = sums->len - total_bytes;
+		tmp >>= root->fs_info->sb->s_blocksize_bits;
+		tmp = min(tmp, (next_offset - file_key.offset) >>
+					 root->fs_info->sb->s_blocksize_bits);
+
+		tmp = max((u64)1, tmp);
+		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
+		ins_size = csum_size * tmp;
+	} else {
+		ins_size = csum_size;
+	}
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+				      ins_size);
+	path->leave_spinning = 0;
+	if (ret < 0)
+		goto fail_unlock;
+	if (WARN_ON(ret != 0))
+		goto fail_unlock;
+	leaf = path->nodes[0];
+csum:
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+	item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+				      btrfs_item_size_nr(leaf, path->slots[0]));
+	item = (struct btrfs_csum_item *)((unsigned char *)item +
+					  csum_offset * csum_size);
+found:
+	ins_size = (u32)(sums->len - total_bytes) >>
+		   root->fs_info->sb->s_blocksize_bits;
+	ins_size *= csum_size;
+	ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+			      ins_size);
+	write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+			    ins_size);
+
+	ins_size /= csum_size;
+	total_bytes += ins_size * root->sectorsize;
+	index += ins_size;
+
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	if (total_bytes < sums->len) {
+		btrfs_release_path(path);
+		cond_resched();
+		goto again;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+
+fail_unlock:
+	goto out;
+}
+
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+				     const struct btrfs_path *path,
+				     struct btrfs_file_extent_item *fi,
+				     const bool new_inline,
+				     struct extent_map *em)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf = path->nodes[0];
+	const int slot = path->slots[0];
+	struct btrfs_key key;
+	u64 extent_start, extent_end;
+	u64 bytenr;
+	u8 type = btrfs_file_extent_type(leaf, fi);
+	int compress_type = btrfs_file_extent_compression(leaf, fi);
+
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	extent_start = key.offset;
+
+	if (type == BTRFS_FILE_EXTENT_REG ||
+	    type == BTRFS_FILE_EXTENT_PREALLOC) {
+		extent_end = extent_start +
+			btrfs_file_extent_num_bytes(leaf, fi);
+	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+		size_t size;
+		size = btrfs_file_extent_inline_len(leaf, slot, fi);
+		extent_end = ALIGN(extent_start + size, root->sectorsize);
+	}
+
+	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+	if (type == BTRFS_FILE_EXTENT_REG ||
+	    type == BTRFS_FILE_EXTENT_PREALLOC) {
+		em->start = extent_start;
+		em->len = extent_end - extent_start;
+		em->orig_start = extent_start -
+			btrfs_file_extent_offset(leaf, fi);
+		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		if (bytenr == 0) {
+			em->block_start = EXTENT_MAP_HOLE;
+			return;
+		}
+		if (compress_type != BTRFS_COMPRESS_NONE) {
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+			em->compress_type = compress_type;
+			em->block_start = bytenr;
+			em->block_len = em->orig_block_len;
+		} else {
+			bytenr += btrfs_file_extent_offset(leaf, fi);
+			em->block_start = bytenr;
+			em->block_len = em->len;
+			if (type == BTRFS_FILE_EXTENT_PREALLOC)
+				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		}
+	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+		em->block_start = EXTENT_MAP_INLINE;
+		em->start = extent_start;
+		em->len = extent_end - extent_start;
+		/*
+		 * Initialize orig_start and block_len with the same values
+		 * as in inode.c:btrfs_get_extent().
+		 */
+		em->orig_start = EXTENT_MAP_HOLE;
+		em->block_len = (u64)-1;
+		if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+			em->compress_type = compress_type;
+		}
+	} else {
+		btrfs_err(root->fs_info,
+			  "unknown file extent item type %d, inode %llu, offset %llu, root %llu",
+			  type, btrfs_ino(inode), extent_start,
+			  root->root_key.objectid);
+	}
+}
diff --git a/kernel/fs/btrfs/file.c b/kernel/fs/btrfs/file.c
new file mode 100644
index 000000000..b072e1747
--- /dev/null
+++ b/kernel/fs/btrfs/file.c
@@ -0,0 +1,2860 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/falloc.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/slab.h>
+#include <linux/btrfs.h>
+#include <linux/uio.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
+#include "volumes.h"
+#include "qgroup.h"
+
+static struct kmem_cache *btrfs_inode_defrag_cachep;
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ */
+struct inode_defrag {
+	struct rb_node rb_node;
+	/* objectid */
+	u64 ino;
+	/*
+	 * transid where the defrag was added, we search for
+	 * extents newer than this
+	 */
+	u64 transid;
+
+	/* root objectid */
+	u64 root;
+
+	/* last offset we were able to defrag */
+	u64 last_offset;
+
+	/* if we've wrapped around back to zero once already */
+	int cycled;
+};
+
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+				  struct inode_defrag *defrag2)
+{
+	if (defrag1->root > defrag2->root)
+		return 1;
+	else if (defrag1->root < defrag2->root)
+		return -1;
+	else if (defrag1->ino > defrag2->ino)
+		return 1;
+	else if (defrag1->ino < defrag2->ino)
+		return -1;
+	else
+		return 0;
+}
+
+/* pop a record for an inode into the defrag tree.  The lock
+ * must be held already
+ *
+ * If you're inserting a record for an older transid than an
+ * existing record, the transid already in the tree is lowered
+ *
+ * If an existing record is found the defrag item you
+ * pass in is freed
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+				    struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct inode_defrag *entry;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	int ret;
+
+	p = &root->fs_info->defrag_inodes.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+		ret = __compare_inode_defrag(defrag, entry);
+		if (ret < 0)
+			p = &parent->rb_left;
+		else if (ret > 0)
+			p = &parent->rb_right;
+		else {
+			/* if we're reinserting an entry for
+			 * an old defrag run, make sure to
+			 * lower the transid of our existing record
+			 */
+			if (defrag->transid < entry->transid)
+				entry->transid = defrag->transid;
+			if (defrag->last_offset > entry->last_offset)
+				entry->last_offset = defrag->last_offset;
+			return -EEXIST;
+		}
+	}
+	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+	rb_link_node(&defrag->rb_node, parent, p);
+	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+	return 0;
+}
+
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+		return 0;
+
+	if (btrfs_fs_closing(root->fs_info))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * insert a defrag record for this inode if auto defrag is
+ * enabled
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+			   struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct inode_defrag *defrag;
+	u64 transid;
+	int ret;
+
+	if (!__need_auto_defrag(root))
+		return 0;
+
+	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
+		return 0;
+
+	if (trans)
+		transid = trans->transid;
+	else
+		transid = BTRFS_I(inode)->root->last_trans;
+
+	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
+	if (!defrag)
+		return -ENOMEM;
+
+	defrag->ino = btrfs_ino(inode);
+	defrag->transid = transid;
+	defrag->root = root->root_key.objectid;
+
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+		/*
+		 * If we set IN_DEFRAG flag and evict the inode from memory,
+		 * and then re-read this inode, this new inode doesn't have
+		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
+		 */
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+		if (ret)
+			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	return 0;
+}
+
+/*
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
+ */
+static void btrfs_requeue_inode_defrag(struct inode *inode,
+				       struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	if (!__need_auto_defrag(root))
+		goto out;
+
+	/*
+	 * Here we don't check the IN_DEFRAG flag, because we need merge
+	 * them together.
+	 */
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	ret = __btrfs_add_inode_defrag(inode, defrag);
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	if (ret)
+		goto out;
+	return;
+out:
+	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want, if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
+{
+	struct inode_defrag *entry = NULL;
+	struct inode_defrag tmp;
+	struct rb_node *p;
+	struct rb_node *parent = NULL;
+	int ret;
+
+	tmp.ino = ino;
+	tmp.root = root;
+
+	spin_lock(&fs_info->defrag_inodes_lock);
+	p = fs_info->defrag_inodes.rb_node;
+	while (p) {
+		parent = p;
+		entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+		ret = __compare_inode_defrag(&tmp, entry);
+		if (ret < 0)
+			p = parent->rb_left;
+		else if (ret > 0)
+			p = parent->rb_right;
+		else
+			goto out;
+	}
+
+	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+		parent = rb_next(parent);
+		if (parent)
+			entry = rb_entry(parent, struct inode_defrag, rb_node);
+		else
+			entry = NULL;
+	}
+out:
+	if (entry)
+		rb_erase(parent, &fs_info->defrag_inodes);
+	spin_unlock(&fs_info->defrag_inodes_lock);
+	return entry;
+}
+
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct inode_defrag *defrag;
+	struct rb_node *node;
+
+	spin_lock(&fs_info->defrag_inodes_lock);
+	node = rb_first(&fs_info->defrag_inodes);
+	while (node) {
+		rb_erase(node, &fs_info->defrag_inodes);
+		defrag = rb_entry(node, struct inode_defrag, rb_node);
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+		cond_resched_lock(&fs_info->defrag_inodes_lock);
+
+		node = rb_first(&fs_info->defrag_inodes);
+	}
+	spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH	1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+				    struct inode_defrag *defrag)
+{
+	struct btrfs_root *inode_root;
+	struct inode *inode;
+	struct btrfs_key key;
+	struct btrfs_ioctl_defrag_range_args range;
+	int num_defrag;
+	int index;
+	int ret;
+
+	/* get the inode */
+	key.objectid = defrag->root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(inode_root)) {
+		ret = PTR_ERR(inode_root);
+		goto cleanup;
+	}
+
+	key.objectid = defrag->ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto cleanup;
+	}
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+	/* do a chunk of defrag */
+	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+	memset(&range, 0, sizeof(range));
+	range.len = (u64)-1;
+	range.start = defrag->last_offset;
+
+	sb_start_write(fs_info->sb);
+	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+				       BTRFS_DEFRAG_BATCH);
+	sb_end_write(fs_info->sb);
+	/*
+	 * if we filled the whole defrag batch, there
+	 * must be more work to do.  Queue this defrag
+	 * again
+	 */
+	if (num_defrag == BTRFS_DEFRAG_BATCH) {
+		defrag->last_offset = range.start;
+		btrfs_requeue_inode_defrag(inode, defrag);
+	} else if (defrag->last_offset && !defrag->cycled) {
+		/*
+		 * we didn't fill our defrag batch, but
+		 * we didn't start at zero.  Make sure we loop
+		 * around to the start of the file.
+		 */
+		defrag->last_offset = 0;
+		defrag->cycled = 1;
+		btrfs_requeue_inode_defrag(inode, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
+
+	iput(inode);
+	return 0;
+cleanup:
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	return ret;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct inode_defrag *defrag;
+	u64 first_ino = 0;
+	u64 root_objectid = 0;
+
+	atomic_inc(&fs_info->defrag_running);
+	while (1) {
+		/* Pause the auto defragger. */
+		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
+			     &fs_info->fs_state))
+			break;
+
+		if (!__need_auto_defrag(fs_info->tree_root))
+			break;
+
+		/* find an inode to defrag */
+		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+						 first_ino);
+		if (!defrag) {
+			if (root_objectid || first_ino) {
+				root_objectid = 0;
+				first_ino = 0;
+				continue;
+			} else {
+				break;
+			}
+		}
+
+		first_ino = defrag->ino + 1;
+		root_objectid = defrag->root;
+
+		__btrfs_run_defrag_inode(fs_info, defrag);
+	}
+	atomic_dec(&fs_info->defrag_running);
+
+	/*
+	 * during unmount, we use the transaction_wait queue to
+	 * wait for the defragger to stop
+	 */
+	wake_up(&fs_info->transaction_wait);
+	return 0;
+}
+
+/* simple helper to fault in pages and copy.  This should go away
+ * and be replaced with calls into generic code.
+ */
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
+					 size_t write_bytes,
+					 struct page **prepared_pages,
+					 struct iov_iter *i)
+{
+	size_t copied = 0;
+	size_t total_copied = 0;
+	int pg = 0;
+	int offset = pos & (PAGE_CACHE_SIZE - 1);
+
+	while (write_bytes > 0) {
+		size_t count = min_t(size_t,
+				     PAGE_CACHE_SIZE - offset, write_bytes);
+		struct page *page = prepared_pages[pg];
+		/*
+		 * Copy data from userspace to the current page
+		 */
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+
+		/* Flush processor's dcache for this page */
+		flush_dcache_page(page);
+
+		/*
+		 * if we get a partial write, we can end up with
+		 * partially up to date pages.  These add
+		 * a lot of complexity, so make sure they don't
+		 * happen by forcing this copy to be retried.
+		 *
+		 * The rest of the btrfs_file_write code will fall
+		 * back to page at a time copies after we return 0.
+		 */
+		if (!PageUptodate(page) && copied < count)
+			copied = 0;
+
+		iov_iter_advance(i, copied);
+		write_bytes -= copied;
+		total_copied += copied;
+
+		/* Return to btrfs_file_write_iter to fault page */
+		if (unlikely(copied == 0))
+			break;
+
+		if (copied < PAGE_CACHE_SIZE - offset) {
+			offset += copied;
+		} else {
+			pg++;
+			offset = 0;
+		}
+	}
+	return total_copied;
+}
+
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ */
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+{
+	size_t i;
+	for (i = 0; i < num_pages; i++) {
+		/* page checked is some magic around finding pages that
+		 * have been modified without going through btrfs_set_page_dirty
+		 * clear it here. There should be no need to mark the pages
+		 * accessed as prepare_pages should have marked them accessed
+		 * in prepare_pages via find_or_create_page()
+		 */
+		ClearPageChecked(pages[i]);
+		unlock_page(pages[i]);
+		page_cache_release(pages[i]);
+	}
+}
+
+/*
+ * after copy_from_user, pages need to be dirtied and we need to make
+ * sure holes are created between the current EOF and the start of
+ * any next extents (if required).
+ *
+ * this also makes the decision about creating an inline extent vs
+ * doing real data extents, marking pages dirty and delalloc as required.
+ */
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+			     struct page **pages, size_t num_pages,
+			     loff_t pos, size_t write_bytes,
+			     struct extent_state **cached)
+{
+	int err = 0;
+	int i;
+	u64 num_bytes;
+	u64 start_pos;
+	u64 end_of_last_block;
+	u64 end_pos = pos + write_bytes;
+	loff_t isize = i_size_read(inode);
+
+	start_pos = pos & ~((u64)root->sectorsize - 1);
+	num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
+
+	end_of_last_block = start_pos + num_bytes - 1;
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+					cached);
+	if (err)
+		return err;
+
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = pages[i];
+		SetPageUptodate(p);
+		ClearPageChecked(p);
+		set_page_dirty(p);
+	}
+
+	/*
+	 * we've only changed i_size in ram, and we haven't updated
+	 * the disk i_size.  There is no need to log the inode
+	 * at this time.
+	 */
+	if (end_pos > isize)
+		i_size_write(inode, end_pos);
+	return 0;
+}
+
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end].  Existing extents are split as required.
+ */
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			     int skip_pinned)
+{
+	struct extent_map *em;
+	struct extent_map *split = NULL;
+	struct extent_map *split2 = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 len = end - start + 1;
+	u64 gen;
+	int ret;
+	int testend = 1;
+	unsigned long flags;
+	int compressed = 0;
+	bool modified;
+
+	WARN_ON(end < start);
+	if (end == (u64)-1) {
+		len = (u64)-1;
+		testend = 0;
+	}
+	while (1) {
+		int no_splits = 0;
+
+		modified = false;
+		if (!split)
+			split = alloc_extent_map();
+		if (!split2)
+			split2 = alloc_extent_map();
+		if (!split || !split2)
+			no_splits = 1;
+
+		write_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, len);
+		if (!em) {
+			write_unlock(&em_tree->lock);
+			break;
+		}
+		flags = em->flags;
+		gen = em->generation;
+		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+			if (testend && em->start + em->len >= start + len) {
+				free_extent_map(em);
+				write_unlock(&em_tree->lock);
+				break;
+			}
+			start = em->start + em->len;
+			if (testend)
+				len = start + len - (em->start + em->len);
+			free_extent_map(em);
+			write_unlock(&em_tree->lock);
+			continue;
+		}
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+		clear_bit(EXTENT_FLAG_LOGGING, &flags);
+		modified = !list_empty(&em->list);
+		if (no_splits)
+			goto next;
+
+		if (em->start < start) {
+			split->start = em->start;
+			split->len = start - em->start;
+
+			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+				split->orig_start = em->orig_start;
+				split->block_start = em->block_start;
+
+				if (compressed)
+					split->block_len = em->block_len;
+				else
+					split->block_len = split->len;
+				split->orig_block_len = max(split->block_len,
+						em->orig_block_len);
+				split->ram_bytes = em->ram_bytes;
+			} else {
+				split->orig_start = split->start;
+				split->block_len = 0;
+				split->block_start = em->block_start;
+				split->orig_block_len = 0;
+				split->ram_bytes = split->len;
+			}
+
+			split->generation = gen;
+			split->bdev = em->bdev;
+			split->flags = flags;
+			split->compress_type = em->compress_type;
+			replace_extent_mapping(em_tree, em, split, modified);
+			free_extent_map(split);
+			split = split2;
+			split2 = NULL;
+		}
+		if (testend && em->start + em->len > start + len) {
+			u64 diff = start + len - em->start;
+
+			split->start = start + len;
+			split->len = em->start + em->len - (start + len);
+			split->bdev = em->bdev;
+			split->flags = flags;
+			split->compress_type = em->compress_type;
+			split->generation = gen;
+
+			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+				split->orig_block_len = max(em->block_len,
+						    em->orig_block_len);
+
+				split->ram_bytes = em->ram_bytes;
+				if (compressed) {
+					split->block_len = em->block_len;
+					split->block_start = em->block_start;
+					split->orig_start = em->orig_start;
+				} else {
+					split->block_len = split->len;
+					split->block_start = em->block_start
+						+ diff;
+					split->orig_start = em->orig_start;
+				}
+			} else {
+				split->ram_bytes = split->len;
+				split->orig_start = split->start;
+				split->block_len = 0;
+				split->block_start = em->block_start;
+				split->orig_block_len = 0;
+			}
+
+			if (extent_map_in_tree(em)) {
+				replace_extent_mapping(em_tree, em, split,
+						       modified);
+			} else {
+				ret = add_extent_mapping(em_tree, split,
+							 modified);
+				ASSERT(ret == 0); /* Logic error */
+			}
+			free_extent_map(split);
+			split = NULL;
+		}
+next:
+		if (extent_map_in_tree(em))
+			remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+
+		/* once for us */
+		free_extent_map(em);
+		/* once for the tree*/
+		free_extent_map(em);
+	}
+	if (split)
+		free_extent_map(split);
+	if (split2)
+		free_extent_map(split2);
+}
+
+/*
+ * this is very complex, but the basic idea is to drop all extents
+ * in the range start - end.  hint_block is filled in with a block number
+ * that would be a good hint to the block allocator for this file.
+ *
+ * If an extent intersects the range but is not entirely inside the range
+ * it is either truncated or split.  Anything entirely inside the range
+ * is deleted from the tree.
+ */
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct inode *inode,
+			 struct btrfs_path *path, u64 start, u64 end,
+			 u64 *drop_end, int drop_cache,
+			 int replace_extent,
+			 u32 extent_item_size,
+			 int *key_inserted)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	struct btrfs_key new_key;
+	u64 ino = btrfs_ino(inode);
+	u64 search_start = start;
+	u64 disk_bytenr = 0;
+	u64 num_bytes = 0;
+	u64 extent_offset = 0;
+	u64 extent_end = 0;
+	int del_nr = 0;
+	int del_slot = 0;
+	int extent_type;
+	int recow;
+	int ret;
+	int modify_tree = -1;
+	int update_refs;
+	int found = 0;
+	int leafs_visited = 0;
+
+	if (drop_cache)
+		btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+		modify_tree = 0;
+
+	update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+		       root == root->fs_info->tree_root);
+	while (1) {
+		recow = 0;
+		ret = btrfs_lookup_file_extent(trans, root, path, ino,
+					       search_start, modify_tree);
+		if (ret < 0)
+			break;
+		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
+			leaf = path->nodes[0];
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+			if (key.objectid == ino &&
+			    key.type == BTRFS_EXTENT_DATA_KEY)
+				path->slots[0]--;
+		}
+		ret = 0;
+		leafs_visited++;
+next_slot:
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			BUG_ON(del_nr > 0);
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				break;
+			if (ret > 0) {
+				ret = 0;
+				break;
+			}
+			leafs_visited++;
+			leaf = path->nodes[0];
+			recow = 1;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid > ino ||
+		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+			break;
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, fi);
+
+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+			extent_offset = btrfs_file_extent_offset(leaf, fi);
+			extent_end = key.offset +
+				btrfs_file_extent_num_bytes(leaf, fi);
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			extent_end = key.offset +
+				btrfs_file_extent_inline_len(leaf,
+						     path->slots[0], fi);
+		} else {
+			WARN_ON(1);
+			extent_end = search_start;
+		}
+
+		/*
+		 * Don't skip extent items representing 0 byte lengths. They
+		 * used to be created (bug) if while punching holes we hit
+		 * -ENOSPC condition. So if we find one here, just ensure we
+		 * delete it, otherwise we would insert a new file extent item
+		 * with the same key (offset) as that 0 bytes length file
+		 * extent item in the call to setup_items_for_insert() later
+		 * in this function.
+		 */
+		if (extent_end == key.offset && extent_end >= search_start)
+			goto delete_extent_item;
+
+		if (extent_end <= search_start) {
+			path->slots[0]++;
+			goto next_slot;
+		}
+
+		found = 1;
+		search_start = max(key.offset, start);
+		if (recow || !modify_tree) {
+			modify_tree = -1;
+			btrfs_release_path(path);
+			continue;
+		}
+
+		/*
+		 *     | - range to drop - |
+		 *  | -------- extent -------- |
+		 */
+		if (start > key.offset && end < extent_end) {
+			BUG_ON(del_nr > 0);
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.offset = start;
+			ret = btrfs_duplicate_item(trans, root, path,
+						   &new_key);
+			if (ret == -EAGAIN) {
+				btrfs_release_path(path);
+				continue;
+			}
+			if (ret < 0)
+				break;
+
+			leaf = path->nodes[0];
+			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							start - key.offset);
+
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+
+			extent_offset += start - key.offset;
+			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_end - start);
+			btrfs_mark_buffer_dirty(leaf);
+
+			if (update_refs && disk_bytenr > 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+						disk_bytenr, num_bytes, 0,
+						root->root_key.objectid,
+						new_key.objectid,
+						start - extent_offset, 1);
+				BUG_ON(ret); /* -ENOMEM */
+			}
+			key.offset = start;
+		}
+		/*
+		 *  | ---- range to drop ----- |
+		 *      | -------- extent -------- |
+		 */
+		if (start <= key.offset && end < extent_end) {
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.offset = end;
+			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+
+			extent_offset += end - key.offset;
+			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_end - end);
+			btrfs_mark_buffer_dirty(leaf);
+			if (update_refs && disk_bytenr > 0)
+				inode_sub_bytes(inode, end - key.offset);
+			break;
+		}
+
+		search_start = extent_end;
+		/*
+		 *       | ---- range to drop ----- |
+		 *  | -------- extent -------- |
+		 */
+		if (start > key.offset && end >= extent_end) {
+			BUG_ON(del_nr > 0);
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							start - key.offset);
+			btrfs_mark_buffer_dirty(leaf);
+			if (update_refs && disk_bytenr > 0)
+				inode_sub_bytes(inode, extent_end - start);
+			if (end == extent_end)
+				break;
+
+			path->slots[0]++;
+			goto next_slot;
+		}
+
+		/*
+		 *  | ---- range to drop ----- |
+		 *    | ------ extent ------ |
+		 */
+		if (start <= key.offset && end >= extent_end) {
+delete_extent_item:
+			if (del_nr == 0) {
+				del_slot = path->slots[0];
+				del_nr = 1;
+			} else {
+				BUG_ON(del_slot + del_nr != path->slots[0]);
+				del_nr++;
+			}
+
+			if (update_refs &&
+			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				inode_sub_bytes(inode,
+						extent_end - key.offset);
+				extent_end = ALIGN(extent_end,
+						   root->sectorsize);
+			} else if (update_refs && disk_bytenr > 0) {
+				ret = btrfs_free_extent(trans, root,
+						disk_bytenr, num_bytes, 0,
+						root->root_key.objectid,
+						key.objectid, key.offset -
+						extent_offset, 0);
+				BUG_ON(ret); /* -ENOMEM */
+				inode_sub_bytes(inode,
+						extent_end - key.offset);
+			}
+
+			if (end == extent_end)
+				break;
+
+			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
+				path->slots[0]++;
+				goto next_slot;
+			}
+
+			ret = btrfs_del_items(trans, root, path, del_slot,
+					      del_nr);
+			if (ret) {
+				btrfs_abort_transaction(trans, root, ret);
+				break;
+			}
+
+			del_nr = 0;
+			del_slot = 0;
+
+			btrfs_release_path(path);
+			continue;
+		}
+
+		BUG_ON(1);
+	}
+
+	if (!ret && del_nr > 0) {
+		/*
+		 * Set path->slots[0] to first slot, so that after the delete
+		 * if items are move off from our leaf to its immediate left or
+		 * right neighbor leafs, we end up with a correct and adjusted
+		 * path->slots[0] for our insertion (if replace_extent != 0).
+		 */
+		path->slots[0] = del_slot;
+		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
+	}
+
+	leaf = path->nodes[0];
+	/*
+	 * If btrfs_del_items() was called, it might have deleted a leaf, in
+	 * which case it unlocked our path, so check path->locks[0] matches a
+	 * write lock.
+	 */
+	if (!ret && replace_extent && leafs_visited == 1 &&
+	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+	     path->locks[0] == BTRFS_WRITE_LOCK) &&
+	    btrfs_leaf_free_space(root, leaf) >=
+	    sizeof(struct btrfs_item) + extent_item_size) {
+
+		key.objectid = ino;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = start;
+		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+			struct btrfs_key slot_key;
+
+			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+				path->slots[0]++;
+		}
+		setup_items_for_insert(root, path, &key,
+				       &extent_item_size,
+				       extent_item_size,
+				       sizeof(struct btrfs_item) +
+				       extent_item_size, 1);
+		*key_inserted = 1;
+	}
+
+	if (!replace_extent || !(*key_inserted))
+		btrfs_release_path(path);
+	if (drop_end)
+		*drop_end = found ? min(end, extent_end) : end;
+	return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode, u64 start,
+		       u64 end, int drop_cache)
+{
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
+				   drop_cache, 0, 0, NULL);
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+			    u64 objectid, u64 bytenr, u64 orig_offset,
+			    u64 *start, u64 *end)
+{
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 extent_end;
+
+	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+		return 0;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+		return 0;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
+	    btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		return 0;
+
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if ((*start && *start != key.offset) || (*end && *end != extent_end))
+		return 0;
+
+	*start = key.offset;
+	*end = extent_end;
+	return 1;
+}
+
+/*
+ * Mark extent in the range start - end as written.
+ *
+ * This changes extent type from 'pre-allocated' to 'regular'. If only
+ * part of extent is marked as written, the extent will be split into
+ * two or three.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 start, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	struct btrfs_key new_key;
+	u64 bytenr;
+	u64 num_bytes;
+	u64 extent_end;
+	u64 orig_offset;
+	u64 other_start;
+	u64 other_end;
+	u64 split;
+	int del_nr = 0;
+	int del_slot = 0;
+	int recow;
+	int ret;
+	u64 ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+again:
+	recow = 0;
+	split = start;
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = split;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0 && path->slots[0] > 0)
+		path->slots[0]--;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
+	       BTRFS_FILE_EXTENT_PREALLOC);
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	BUG_ON(key.offset > start || extent_end < end);
+
+	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+	memcpy(&new_key, &key, sizeof(new_key));
+
+	if (start == key.offset && end < extent_end) {
+		other_start = 0;
+		other_end = start;
+		if (extent_mergeable(leaf, path->slots[0] - 1,
+				     ino, bytenr, orig_offset,
+				     &other_start, &other_end)) {
+			new_key.offset = end;
+			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_end - end);
+			btrfs_set_file_extent_offset(leaf, fi,
+						     end - orig_offset);
+			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							end - other_start);
+			btrfs_mark_buffer_dirty(leaf);
+			goto out;
+		}
+	}
+
+	if (start > key.offset && end == extent_end) {
+		other_start = end;
+		other_end = 0;
+		if (extent_mergeable(leaf, path->slots[0] + 1,
+				     ino, bytenr, orig_offset,
+				     &other_start, &other_end)) {
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							start - key.offset);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
+			path->slots[0]++;
+			new_key.offset = start;
+			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							other_end - start);
+			btrfs_set_file_extent_offset(leaf, fi,
+						     start - orig_offset);
+			btrfs_mark_buffer_dirty(leaf);
+			goto out;
+		}
+	}
+
+	while (start > key.offset || end < extent_end) {
+		if (key.offset == start)
+			split = end;
+
+		new_key.offset = split;
+		ret = btrfs_duplicate_item(trans, root, path, &new_key);
+		if (ret == -EAGAIN) {
+			btrfs_release_path(path);
+			goto again;
+		}
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+
+		leaf = path->nodes[0];
+		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+				    struct btrfs_file_extent_item);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_num_bytes(leaf, fi,
+						split - key.offset);
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
+		btrfs_set_file_extent_num_bytes(leaf, fi,
+						extent_end - split);
+		btrfs_mark_buffer_dirty(leaf);
+
+		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+					   root->root_key.objectid,
+					   ino, orig_offset, 1);
+		BUG_ON(ret); /* -ENOMEM */
+
+		if (split == start) {
+			key.offset = start;
+		} else {
+			BUG_ON(start != key.offset);
+			path->slots[0]--;
+			extent_end = end;
+		}
+		recow = 1;
+	}
+
+	other_start = end;
+	other_end = 0;
+	if (extent_mergeable(leaf, path->slots[0] + 1,
+			     ino, bytenr, orig_offset,
+			     &other_start, &other_end)) {
+		if (recow) {
+			btrfs_release_path(path);
+			goto again;
+		}
+		extent_end = other_end;
+		del_slot = path->slots[0] + 1;
+		del_nr++;
+		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+					0, root->root_key.objectid,
+					ino, orig_offset, 0);
+		BUG_ON(ret); /* -ENOMEM */
+	}
+	other_start = 0;
+	other_end = start;
+	if (extent_mergeable(leaf, path->slots[0] - 1,
+			     ino, bytenr, orig_offset,
+			     &other_start, &other_end)) {
+		if (recow) {
+			btrfs_release_path(path);
+			goto again;
+		}
+		key.offset = other_start;
+		del_slot = path->slots[0];
+		del_nr++;
+		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+					0, root->root_key.objectid,
+					ino, orig_offset, 0);
+		BUG_ON(ret); /* -ENOMEM */
+	}
+	if (del_nr == 0) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+			   struct btrfs_file_extent_item);
+		btrfs_set_file_extent_type(leaf, fi,
+					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_mark_buffer_dirty(leaf);
+	} else {
+		fi = btrfs_item_ptr(leaf, del_slot - 1,
+			   struct btrfs_file_extent_item);
+		btrfs_set_file_extent_type(leaf, fi,
+					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_num_bytes(leaf, fi,
+						extent_end - key.offset);
+		btrfs_mark_buffer_dirty(leaf);
+
+		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+	}
+out:
+	btrfs_free_path(path);
+	return 0;
+}
+
+/*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos,
+				 bool force_uptodate)
+{
+	int ret = 0;
+
+	if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
+	    !PageUptodate(page)) {
+		ret = btrfs_readpage(NULL, page);
+		if (ret)
+			return ret;
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
+/*
+ * this just gets pages into the page cache and locks them down.
+ */
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
+				  size_t num_pages, loff_t pos,
+				  size_t write_bytes, bool force_uptodate)
+{
+	int i;
+	unsigned long index = pos >> PAGE_CACHE_SHIFT;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+	int err = 0;
+	int faili;
+
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = find_or_create_page(inode->i_mapping, index + i,
+					       mask | __GFP_WRITE);
+		if (!pages[i]) {
+			faili = i - 1;
+			err = -ENOMEM;
+			goto fail;
+		}
+
+		if (i == 0)
+			err = prepare_uptodate_page(pages[i], pos,
+						    force_uptodate);
+		if (i == num_pages - 1)
+			err = prepare_uptodate_page(pages[i],
+						    pos + write_bytes, false);
+		if (err) {
+			page_cache_release(pages[i]);
+			faili = i - 1;
+			goto fail;
+		}
+		wait_on_page_writeback(pages[i]);
+	}
+
+	return 0;
+fail:
+	while (faili >= 0) {
+		unlock_page(pages[faili]);
+		page_cache_release(pages[faili]);
+		faili--;
+	}
+	return err;
+
+}
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if need.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need re-prepare the pages
+ * the other < 0 number - Something wrong happens
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
+				size_t num_pages, loff_t pos,
+				u64 *lockstart, u64 *lockend,
+				struct extent_state **cached_state)
+{
+	u64 start_pos;
+	u64 last_pos;
+	int i;
+	int ret = 0;
+
+	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+
+	if (start_pos < inode->i_size) {
+		struct btrfs_ordered_extent *ordered;
+		lock_extent_bits(&BTRFS_I(inode)->io_tree,
+				 start_pos, last_pos, 0, cached_state);
+		ordered = btrfs_lookup_ordered_range(inode, start_pos,
+						     last_pos - start_pos + 1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > start_pos &&
+		    ordered->file_offset <= last_pos) {
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     start_pos, last_pos,
+					     cached_state, GFP_NOFS);
+			for (i = 0; i < num_pages; i++) {
+				unlock_page(pages[i]);
+				page_cache_release(pages[i]);
+			}
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			return -EAGAIN;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
+				  last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
+				  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+				  0, 0, cached_state, GFP_NOFS);
+		*lockstart = start_pos;
+		*lockend = last_pos;
+		ret = 1;
+	}
+
+	for (i = 0; i < num_pages; i++) {
+		if (clear_page_dirty_for_io(pages[i]))
+			account_page_redirty(pages[i]);
+		set_page_extent_mapped(pages[i]);
+		WARN_ON(!PageLocked(pages[i]));
+	}
+
+	return ret;
+}
+
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+				    size_t *write_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ordered_extent *ordered;
+	u64 lockstart, lockend;
+	u64 num_bytes;
+	int ret;
+
+	ret = btrfs_start_write_no_snapshoting(root);
+	if (!ret)
+		return -ENOSPC;
+
+	lockstart = round_down(pos, root->sectorsize);
+	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
+
+	while (1) {
+		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered) {
+			break;
+		}
+		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	num_bytes = lockend - lockstart + 1;
+	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
+	if (ret <= 0) {
+		ret = 0;
+		btrfs_end_write_no_snapshoting(root);
+	} else {
+		*write_bytes = min_t(size_t, *write_bytes ,
+				     num_bytes - pos + lockstart);
+	}
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+	return ret;
+}
+
+static noinline ssize_t __btrfs_buffered_write(struct file *file,
+					       struct iov_iter *i,
+					       loff_t pos)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page **pages = NULL;
+	struct extent_state *cached_state = NULL;
+	u64 release_bytes = 0;
+	u64 lockstart;
+	u64 lockend;
+	unsigned long first_index;
+	size_t num_written = 0;
+	int nrptrs;
+	int ret = 0;
+	bool only_release_metadata = false;
+	bool force_page_uptodate = false;
+	bool need_unlock;
+
+	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
+			PAGE_CACHE_SIZE / (sizeof(struct page *)));
+	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+	nrptrs = max(nrptrs, 8);
+	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	first_index = pos >> PAGE_CACHE_SHIFT;
+
+	while (iov_iter_count(i) > 0) {
+		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+		size_t write_bytes = min(iov_iter_count(i),
+					 nrptrs * (size_t)PAGE_CACHE_SIZE -
+					 offset);
+		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
+						PAGE_CACHE_SIZE);
+		size_t reserve_bytes;
+		size_t dirty_pages;
+		size_t copied;
+
+		WARN_ON(num_pages > nrptrs);
+
+		/*
+		 * Fault pages before locking them in prepare_pages
+		 * to avoid recursive lock
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+		ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
+		if (ret == -ENOSPC &&
+		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					      BTRFS_INODE_PREALLOC))) {
+			ret = check_can_nocow(inode, pos, &write_bytes);
+			if (ret > 0) {
+				only_release_metadata = true;
+				/*
+				 * our prealloc extent may be smaller than
+				 * write_bytes, so scale down.
+				 */
+				num_pages = DIV_ROUND_UP(write_bytes + offset,
+							 PAGE_CACHE_SIZE);
+				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+				ret = 0;
+			} else {
+				ret = -ENOSPC;
+			}
+		}
+
+		if (ret)
+			break;
+
+		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+		if (ret) {
+			if (!only_release_metadata)
+				btrfs_free_reserved_data_space(inode,
+							       reserve_bytes);
+			else
+				btrfs_end_write_no_snapshoting(root);
+			break;
+		}
+
+		release_bytes = reserve_bytes;
+		need_unlock = false;
+again:
+		/*
+		 * This is going to setup the pages array with the number of
+		 * pages we want, so we don't really need to worry about the
+		 * contents of pages from loop to loop
+		 */
+		ret = prepare_pages(inode, pages, num_pages,
+				    pos, write_bytes,
+				    force_page_uptodate);
+		if (ret)
+			break;
+
+		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
+						      pos, &lockstart, &lockend,
+						      &cached_state);
+		if (ret < 0) {
+			if (ret == -EAGAIN)
+				goto again;
+			break;
+		} else if (ret > 0) {
+			need_unlock = true;
+			ret = 0;
+		}
+
+		copied = btrfs_copy_from_user(pos, num_pages,
+					   write_bytes, pages, i);
+
+		/*
+		 * if we have trouble faulting in the pages, fall
+		 * back to one page at a time
+		 */
+		if (copied < write_bytes)
+			nrptrs = 1;
+
+		if (copied == 0) {
+			force_page_uptodate = true;
+			dirty_pages = 0;
+		} else {
+			force_page_uptodate = false;
+			dirty_pages = DIV_ROUND_UP(copied + offset,
+						   PAGE_CACHE_SIZE);
+		}
+
+		/*
+		 * If we had a short copy we need to release the excess delaloc
+		 * bytes we reserved.  We need to increment outstanding_extents
+		 * because btrfs_delalloc_release_space will decrement it, but
+		 * we still have an outstanding extent for the chunk we actually
+		 * managed to copy.
+		 */
+		if (num_pages > dirty_pages) {
+			release_bytes = (num_pages - dirty_pages) <<
+				PAGE_CACHE_SHIFT;
+			if (copied > 0) {
+				spin_lock(&BTRFS_I(inode)->lock);
+				BTRFS_I(inode)->outstanding_extents++;
+				spin_unlock(&BTRFS_I(inode)->lock);
+			}
+			if (only_release_metadata)
+				btrfs_delalloc_release_metadata(inode,
+								release_bytes);
+			else
+				btrfs_delalloc_release_space(inode,
+							     release_bytes);
+		}
+
+		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
+
+		if (copied > 0)
+			ret = btrfs_dirty_pages(root, inode, pages,
+						dirty_pages, pos, copied,
+						NULL);
+		if (need_unlock)
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     lockstart, lockend, &cached_state,
+					     GFP_NOFS);
+		if (ret) {
+			btrfs_drop_pages(pages, num_pages);
+			break;
+		}
+
+		release_bytes = 0;
+		if (only_release_metadata)
+			btrfs_end_write_no_snapshoting(root);
+
+		if (only_release_metadata && copied > 0) {
+			lockstart = round_down(pos, root->sectorsize);
+			lockend = lockstart +
+				(dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				       lockend, EXTENT_NORESERVE, NULL,
+				       NULL, GFP_NOFS);
+			only_release_metadata = false;
+		}
+
+		btrfs_drop_pages(pages, num_pages);
+
+		cond_resched();
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
+			btrfs_btree_balance_dirty(root);
+
+		pos += copied;
+		num_written += copied;
+	}
+
+	kfree(pages);
+
+	if (release_bytes) {
+		if (only_release_metadata) {
+			btrfs_end_write_no_snapshoting(root);
+			btrfs_delalloc_release_metadata(inode, release_bytes);
+		} else {
+			btrfs_delalloc_release_space(inode, release_bytes);
+		}
+	}
+
+	return num_written ? num_written : ret;
+}
+
+static ssize_t __btrfs_direct_write(struct kiocb *iocb,
+				    struct iov_iter *from,
+				    loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	ssize_t written;
+	ssize_t written_buffered;
+	loff_t endbyte;
+	int err;
+
+	written = generic_file_direct_write(iocb, from, pos);
+
+	if (written < 0 || !iov_iter_count(from))
+		return written;
+
+	pos += written;
+	written_buffered = __btrfs_buffered_write(file, from, pos);
+	if (written_buffered < 0) {
+		err = written_buffered;
+		goto out;
+	}
+	/*
+	 * Ensure all data is persisted. We want the next direct IO read to be
+	 * able to read what was just written.
+	 */
+	endbyte = pos + written_buffered - 1;
+	err = btrfs_fdatawrite_range(inode, pos, endbyte);
+	if (err)
+		goto out;
+	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+	if (err)
+		goto out;
+	written += written_buffered;
+	iocb->ki_pos = pos + written_buffered;
+	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
+				 endbyte >> PAGE_CACHE_SHIFT);
+out:
+	return written ? written : err;
+}
+
+static void update_time_for_write(struct inode *inode)
+{
+	struct timespec now;
+
+	if (IS_NOCMTIME(inode))
+		return;
+
+	now = current_fs_time(inode->i_sb);
+	if (!timespec_equal(&inode->i_mtime, &now))
+		inode->i_mtime = now;
+
+	if (!timespec_equal(&inode->i_ctime, &now))
+		inode->i_ctime = now;
+
+	if (IS_I_VERSION(inode))
+		inode_inc_iversion(inode);
+}
+
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+				    struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start_pos;
+	u64 end_pos;
+	ssize_t num_written = 0;
+	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+	ssize_t err;
+	loff_t pos;
+	size_t count;
+
+	mutex_lock(&inode->i_mutex);
+	err = generic_write_checks(iocb, from);
+	if (err <= 0) {
+		mutex_unlock(&inode->i_mutex);
+		return err;
+	}
+
+	current->backing_dev_info = inode_to_bdi(inode);
+	err = file_remove_suid(file);
+	if (err) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
+
+	/*
+	 * If BTRFS flips readonly due to some impossible error
+	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+	 * although we have opened a file as writable, we have
+	 * to stop this write operation to ensure FS consistency.
+	 */
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+		mutex_unlock(&inode->i_mutex);
+		err = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * We reserve space for updating the inode when we reserve space for the
+	 * extent we are going to write, so we will enospc out there.  We don't
+	 * need to start yet another transaction to update the inode as we will
+	 * update the inode when we finish writing whatever data we write.
+	 */
+	update_time_for_write(inode);
+
+	pos = iocb->ki_pos;
+	count = iov_iter_count(from);
+	start_pos = round_down(pos, root->sectorsize);
+	if (start_pos > i_size_read(inode)) {
+		/* Expand hole size to cover write data, preventing empty gap */
+		end_pos = round_up(pos + count, root->sectorsize);
+		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
+		if (err) {
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
+
+	if (sync)
+		atomic_inc(&BTRFS_I(inode)->sync_writers);
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		num_written = __btrfs_direct_write(iocb, from, pos);
+	} else {
+		num_written = __btrfs_buffered_write(file, from, pos);
+		if (num_written > 0)
+			iocb->ki_pos = pos + num_written;
+	}
+
+	mutex_unlock(&inode->i_mutex);
+
+	/*
+	 * We also have to set last_sub_trans to the current log transid,
+	 * otherwise subsequent syncs to a file that's been synced in this
+	 * transaction will appear to have already occured.
+	 */
+	spin_lock(&BTRFS_I(inode)->lock);
+	BTRFS_I(inode)->last_sub_trans = root->log_transid;
+	spin_unlock(&BTRFS_I(inode)->lock);
+	if (num_written > 0) {
+		err = generic_write_sync(file, pos, num_written);
+		if (err < 0)
+			num_written = err;
+	}
+
+	if (sync)
+		atomic_dec(&BTRFS_I(inode)->sync_writers);
+out:
+	current->backing_dev_info = NULL;
+	return num_written ? num_written : err;
+}
+
+int btrfs_release_file(struct inode *inode, struct file *filp)
+{
+	if (filp->private_data)
+		btrfs_ioctl_trans_end(filp);
+	/*
+	 * ordered_data_close is set by settattr when we are about to truncate
+	 * a file from a non-zero size to a zero size.  This tries to
+	 * flush down new bytes that may have been written if the
+	 * application were using truncate to replace a file in place.
+	 */
+	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+			       &BTRFS_I(inode)->runtime_flags))
+			filemap_flush(inode->i_mapping);
+	return 0;
+}
+
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+	int ret;
+
+	atomic_inc(&BTRFS_I(inode)->sync_writers);
+	ret = btrfs_fdatawrite_range(inode, start, end);
+	atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+	return ret;
+}
+
+/*
+ * fsync call for both files and directories.  This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to call filemap_fdatawait so that all ordered extent updates are
+ * in the metadata btree are up to date for copying to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit.  This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ */
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = d_inode(dentry);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_log_ctx ctx;
+	int ret = 0;
+	bool full_sync = 0;
+
+	trace_btrfs_sync_file(file, datasync);
+
+	/*
+	 * We write the dirty pages in the range and wait until they complete
+	 * out of the ->i_mutex. If so, we can flush the dirty pages by
+	 * multi-task, and make the performance up.  See
+	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
+	 */
+	ret = start_ordered_ops(inode, start, end);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+	atomic_inc(&root->log_batch);
+	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			     &BTRFS_I(inode)->runtime_flags);
+	/*
+	 * We might have have had more pages made dirty after calling
+	 * start_ordered_ops and before acquiring the inode's i_mutex.
+	 */
+	if (full_sync) {
+		/*
+		 * For a full sync, we need to make sure any ordered operations
+		 * start and finish before we start logging the inode, so that
+		 * all extents are persisted and the respective file extent
+		 * items are in the fs/subvol btree.
+		 */
+		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+	} else {
+		/*
+		 * Start any new ordered operations before starting to log the
+		 * inode. We will wait for them to finish in btrfs_sync_log().
+		 *
+		 * Right before acquiring the inode's mutex, we might have new
+		 * writes dirtying pages, which won't immediately start the
+		 * respective ordered operations - that is done through the
+		 * fill_delalloc callbacks invoked from the writepage and
+		 * writepages address space operations. So make sure we start
+		 * all ordered operations before starting to log our inode. Not
+		 * doing this means that while logging the inode, writeback
+		 * could start and invoke writepage/writepages, which would call
+		 * the fill_delalloc callbacks (cow_file_range,
+		 * submit_compressed_extents). These callbacks add first an
+		 * extent map to the modified list of extents and then create
+		 * the respective ordered operation, which means in
+		 * tree-log.c:btrfs_log_inode() we might capture all existing
+		 * ordered operations (with btrfs_get_logged_extents()) before
+		 * the fill_delalloc callback adds its ordered operation, and by
+		 * the time we visit the modified list of extent maps (with
+		 * btrfs_log_changed_extents()), we see and process the extent
+		 * map they created. We then use the extent map to construct a
+		 * file extent item for logging without waiting for the
+		 * respective ordered operation to finish - this file extent
+		 * item points to a disk location that might not have yet been
+		 * written to, containing random data - so after a crash a log
+		 * replay will make our inode have file extent items that point
+		 * to disk locations containing invalid data, as we returned
+		 * success to userspace without waiting for the respective
+		 * ordered operation to finish, because it wasn't captured by
+		 * btrfs_get_logged_extents().
+		 */
+		ret = start_ordered_ops(inode, start, end);
+	}
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
+	atomic_inc(&root->log_batch);
+
+	/*
+	 * If the last transaction that changed this file was before the current
+	 * transaction and we have the full sync flag set in our inode, we can
+	 * bail out now without any syncing.
+	 *
+	 * Note that we can't bail out if the full sync flag isn't set. This is
+	 * because when the full sync flag is set we start all ordered extents
+	 * and wait for them to fully complete - when they complete they update
+	 * the inode's last_trans field through:
+	 *
+	 *     btrfs_finish_ordered_io() ->
+	 *         btrfs_update_inode_fallback() ->
+	 *             btrfs_update_inode() ->
+	 *                 btrfs_set_inode_last_trans()
+	 *
+	 * So we are sure that last_trans is up to date and can do this check to
+	 * bail out safely. For the fast path, when the full sync flag is not
+	 * set in our inode, we can not do it because we start only our ordered
+	 * extents and don't wait for them to complete (that is when
+	 * btrfs_finish_ordered_io runs), so here at this point their last_trans
+	 * value might be less than or equals to fs_info->last_trans_committed,
+	 * and setting a speculative last_trans for an inode when a buffered
+	 * write is made (such as fs_info->generation + 1 for example) would not
+	 * be reliable since after setting the value and before fsync is called
+	 * any number of transactions can start and commit (transaction kthread
+	 * commits the current transaction periodically), and a transaction
+	 * commit does not start nor waits for ordered extents to complete.
+	 */
+	smp_mb();
+	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+	    (full_sync && BTRFS_I(inode)->last_trans <=
+	     root->fs_info->last_trans_committed)) {
+		/*
+		 * We'v had everything committed since the last time we were
+		 * modified so clear this flag in case it was set for whatever
+		 * reason, it's no longer relevant.
+		 */
+		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			  &BTRFS_I(inode)->runtime_flags);
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
+
+	/*
+	 * ok we haven't committed the transaction yet, lets do a commit
+	 */
+	if (file->private_data)
+		btrfs_ioctl_trans_end(file);
+
+	/*
+	 * We use start here because we will need to wait on the IO to complete
+	 * in btrfs_sync_log, which could require joining a transaction (for
+	 * example checking cross references in the nocow path).  If we use join
+	 * here we could get into a situation where we're waiting on IO to
+	 * happen that is blocked on a transaction trying to commit.  With start
+	 * we inc the extwriter counter, so we wait for all extwriters to exit
+	 * before we start blocking join'ers.  This comment is to keep somebody
+	 * from thinking they are super smart and changing this to
+	 * btrfs_join_transaction *cough*Josef*cough*.
+	 */
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
+	trans->sync = true;
+
+	btrfs_init_log_ctx(&ctx);
+
+	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
+	if (ret < 0) {
+		/* Fallthrough and commit/free transaction. */
+		ret = 1;
+	}
+
+	/* we've logged all the items and now have a consistent
+	 * version of the file in the log.  It is possible that
+	 * someone will come in and modify the file, but that's
+	 * fine because the log is consistent on disk, and we
+	 * have references to all of the file's extents
+	 *
+	 * It is possible that someone will come in and log the
+	 * file again, but that will end up using the synchronization
+	 * inside btrfs_sync_log to keep things safe.
+	 */
+	mutex_unlock(&inode->i_mutex);
+
+	/*
+	 * If any of the ordered extents had an error, just return it to user
+	 * space, so that the application knows some writes didn't succeed and
+	 * can take proper action (retry for e.g.). Blindly committing the
+	 * transaction in this case, would fool userspace that everything was
+	 * successful. And we also want to make sure our log doesn't contain
+	 * file extent items pointing to extents that weren't fully written to -
+	 * just like in the non fast fsync path, where we check for the ordered
+	 * operation's error flag before writing to the log tree and return -EIO
+	 * if any of them had this flag set (btrfs_wait_ordered_range) -
+	 * therefore we need to check for errors in the ordered operations,
+	 * which are indicated by ctx.io_err.
+	 */
+	if (ctx.io_err) {
+		btrfs_end_transaction(trans, root);
+		ret = ctx.io_err;
+		goto out;
+	}
+
+	if (ret != BTRFS_NO_LOG_SYNC) {
+		if (!ret) {
+			ret = btrfs_sync_log(trans, root, &ctx);
+			if (!ret) {
+				ret = btrfs_end_transaction(trans, root);
+				goto out;
+			}
+		}
+		if (!full_sync) {
+			ret = btrfs_wait_ordered_range(inode, start,
+						       end - start + 1);
+			if (ret) {
+				btrfs_end_transaction(trans, root);
+				goto out;
+			}
+		}
+		ret = btrfs_commit_transaction(trans, root);
+	} else {
+		ret = btrfs_end_transaction(trans, root);
+	}
+out:
+	return ret > 0 ? -EIO : ret;
+}
+
+static const struct vm_operations_struct btrfs_file_vm_ops = {
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= btrfs_page_mkwrite,
+};
+
+static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
+{
+	struct address_space *mapping = filp->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+
+	file_accessed(filp);
+	vma->vm_ops = &btrfs_file_vm_ops;
+
+	return 0;
+}
+
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+			  int slot, u64 start, u64 end)
+{
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+
+	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+		return 0;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != btrfs_ino(inode) ||
+	    key.type != BTRFS_EXTENT_DATA_KEY)
+		return 0;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_disk_bytenr(leaf, fi))
+		return 0;
+
+	if (key.offset == end)
+		return 1;
+	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+		return 1;
+	return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+		      struct btrfs_path *path, u64 offset, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct extent_map *hole_em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key key;
+	int ret;
+
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+		goto out;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(!ret);
+
+	leaf = path->nodes[0];
+	if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+		u64 num_bytes;
+
+		path->slots[0]--;
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+			end - offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+
+	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
+		u64 num_bytes;
+
+		key.offset = offset;
+		btrfs_set_item_key_safe(root->fs_info, path, &key);
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+			offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, end - offset, 0, end - offset,
+				       0, 0, 0);
+	if (ret)
+		return ret;
+
+out:
+	btrfs_release_path(path);
+
+	hole_em = alloc_extent_map();
+	if (!hole_em) {
+		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+	} else {
+		hole_em->start = offset;
+		hole_em->len = end - offset;
+		hole_em->ram_bytes = hole_em->len;
+		hole_em->orig_start = offset;
+
+		hole_em->block_start = EXTENT_MAP_HOLE;
+		hole_em->block_len = 0;
+		hole_em->orig_block_len = 0;
+		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+		hole_em->compress_type = BTRFS_COMPRESS_NONE;
+		hole_em->generation = trans->transid;
+
+		do {
+			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, hole_em, 1);
+			write_unlock(&em_tree->lock);
+		} while (ret == -EEXIST);
+		free_extent_map(hole_em);
+		if (ret)
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+	}
+
+	return 0;
+}
+
+/*
+ * Find a hole extent on given inode and change start/len to the end of hole
+ * extent.(hole/vacuum extent whose em->start <= start &&
+ *	   em->start + em->len > start)
+ * When a hole extent is found, return 1 and modify start/len.
+ */
+static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+{
+	struct extent_map *em;
+	int ret = 0;
+
+	em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
+	if (IS_ERR_OR_NULL(em)) {
+		if (!em)
+			ret = -ENOMEM;
+		else
+			ret = PTR_ERR(em);
+		return ret;
+	}
+
+	/* Hole or vacuum extent(only exists in no-hole mode) */
+	if (em->block_start == EXTENT_MAP_HOLE) {
+		ret = 1;
+		*len = em->start + em->len > *start + *len ?
+		       0 : *start + *len - em->start - em->len;
+		*start = em->start + em->len;
+	}
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *rsv;
+	struct btrfs_trans_handle *trans;
+	u64 lockstart;
+	u64 lockend;
+	u64 tail_start;
+	u64 tail_len;
+	u64 orig_start = offset;
+	u64 cur_offset;
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+	u64 drop_end;
+	int ret = 0;
+	int err = 0;
+	int rsv_count;
+	bool same_page;
+	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+	u64 ino_size;
+	bool truncated_page = false;
+	bool updated_inode = false;
+
+	ret = btrfs_wait_ordered_range(inode, offset, len);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+	ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+	ret = find_first_non_hole(inode, &offset, &len);
+	if (ret < 0)
+		goto out_only_mutex;
+	if (ret && !len) {
+		/* Already in a large hole */
+		ret = 0;
+		goto out_only_mutex;
+	}
+
+	lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+	lockend = round_down(offset + len,
+			     BTRFS_I(inode)->root->sectorsize) - 1;
+	same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+		    ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
+	/*
+	 * We needn't truncate any page which is beyond the end of the file
+	 * because we are sure there is no data there.
+	 */
+	/*
+	 * Only do this if we are in the same page and we aren't doing the
+	 * entire page.
+	 */
+	if (same_page && len < PAGE_CACHE_SIZE) {
+		if (offset < ino_size) {
+			truncated_page = true;
+			ret = btrfs_truncate_page(inode, offset, len, 0);
+		} else {
+			ret = 0;
+		}
+		goto out_only_mutex;
+	}
+
+	/* zero back part of the first page */
+	if (offset < ino_size) {
+		truncated_page = true;
+		ret = btrfs_truncate_page(inode, offset, 0, 0);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
+	}
+
+	/* Check the aligned pages after the first unaligned page,
+	 * if offset != orig_start, which means the first unaligned page
+	 * including serveral following pages are already in holes,
+	 * the extra check can be skipped */
+	if (offset == orig_start) {
+		/* after truncate page, check hole again */
+		len = offset + len - lockstart;
+		offset = lockstart;
+		ret = find_first_non_hole(inode, &offset, &len);
+		if (ret < 0)
+			goto out_only_mutex;
+		if (ret && !len) {
+			ret = 0;
+			goto out_only_mutex;
+		}
+		lockstart = offset;
+	}
+
+	/* Check the tail unaligned part is in a hole */
+	tail_start = lockend + 1;
+	tail_len = offset + len - tail_start;
+	if (tail_len) {
+		ret = find_first_non_hole(inode, &tail_start, &tail_len);
+		if (unlikely(ret < 0))
+			goto out_only_mutex;
+		if (!ret) {
+			/* zero the front end of the last page */
+			if (tail_start + tail_len < ino_size) {
+				truncated_page = true;
+				ret = btrfs_truncate_page(inode,
+						tail_start + tail_len, 0, 1);
+				if (ret)
+					goto out_only_mutex;
+			}
+		}
+	}
+
+	if (lockend < lockstart) {
+		ret = 0;
+		goto out_only_mutex;
+	}
+
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, &cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_NOFS);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+	if (!rsv) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+	rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+	rsv->failfast = 1;
+
+	/*
+	 * 1 - update the inode
+	 * 1 - removing the extents in the range
+	 * 1 - adding the hole extent if no_holes isn't set
+	 */
+	rsv_count = no_holes ? 2 : 3;
+	trans = btrfs_start_transaction(root, rsv_count);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
+
+	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+				      min_size);
+	BUG_ON(ret);
+	trans->block_rsv = rsv;
+
+	cur_offset = lockstart;
+	len = lockend - cur_offset;
+	while (cur_offset < lockend) {
+		ret = __btrfs_drop_extents(trans, root, inode, path,
+					   cur_offset, lockend + 1,
+					   &drop_end, 1, 0, 0, NULL);
+		if (ret != -ENOSPC)
+			break;
+
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+		if (cur_offset < ino_size) {
+			ret = fill_holes(trans, inode, path, cur_offset,
+					 drop_end);
+			if (ret) {
+				err = ret;
+				break;
+			}
+		}
+
+		cur_offset = drop_end;
+
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root);
+
+		trans = btrfs_start_transaction(root, rsv_count);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+					      rsv, min_size);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
+
+		ret = find_first_non_hole(inode, &cur_offset, &len);
+		if (unlikely(ret < 0))
+			break;
+		if (ret && !len) {
+			ret = 0;
+			break;
+		}
+	}
+
+	if (ret) {
+		err = ret;
+		goto out_trans;
+	}
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	/*
+	 * Don't insert file hole extent item if it's for a range beyond eof
+	 * (because it's useless) or if it represents a 0 bytes range (when
+	 * cur_offset == drop_end).
+	 */
+	if (cur_offset < ino_size && cur_offset < drop_end) {
+		ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+		if (ret) {
+			err = ret;
+			goto out_trans;
+		}
+	}
+
+out_trans:
+	if (!trans)
+		goto out_free;
+
+	inode_inc_iversion(inode);
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	ret = btrfs_update_inode(trans, root, inode);
+	updated_inode = true;
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
+out_free:
+	btrfs_free_path(path);
+	btrfs_free_block_rsv(root, rsv);
+out:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			     &cached_state, GFP_NOFS);
+out_only_mutex:
+	if (!updated_inode && truncated_page && !ret && !err) {
+		/*
+		 * If we only end up zeroing part of a page, we still need to
+		 * update the inode item, so that all the time fields are
+		 * updated as well as the necessary btrfs inode in memory fields
+		 * for detecting, at fsync time, if the inode isn't yet in the
+		 * log tree or it's there but not up to date.
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+		} else {
+			err = btrfs_update_inode(trans, root, inode);
+			ret = btrfs_end_transaction(trans, root);
+		}
+	}
+	mutex_unlock(&inode->i_mutex);
+	if (ret && !err)
+		err = ret;
+	return err;
+}
+
+static long btrfs_fallocate(struct file *file, int mode,
+			    loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct extent_state *cached_state = NULL;
+	u64 cur_offset;
+	u64 last_byte;
+	u64 alloc_start;
+	u64 alloc_end;
+	u64 alloc_hint = 0;
+	u64 locked_end;
+	struct extent_map *em;
+	int blocksize = BTRFS_I(inode)->root->sectorsize;
+	int ret;
+
+	alloc_start = round_down(offset, blocksize);
+	alloc_end = round_up(offset + len, blocksize);
+
+	/* Make sure we aren't being give some crap mode */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return btrfs_punch_hole(inode, offset, len);
+
+	/*
+	 * Make sure we have enough space before we do the
+	 * allocation.
+	 */
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = inode_newsize_ok(inode, alloc_end);
+	if (ret)
+		goto out;
+
+	if (alloc_start > inode->i_size) {
+		ret = btrfs_cont_expand(inode, i_size_read(inode),
+					alloc_start);
+		if (ret)
+			goto out;
+	} else {
+		/*
+		 * If we are fallocating from the end of the file onward we
+		 * need to zero out the end of the page if i_size lands in the
+		 * middle of a page.
+		 */
+		ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	ret = btrfs_wait_ordered_range(inode, alloc_start,
+				       alloc_end - alloc_start);
+	if (ret)
+		goto out;
+
+	locked_end = alloc_end - 1;
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		/* the extent lock is ordered inside the running
+		 * transaction
+		 */
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+				 locked_end, 0, &cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    alloc_end - 1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > alloc_start &&
+		    ordered->file_offset < alloc_end) {
+			btrfs_put_ordered_extent(ordered);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     alloc_start, locked_end,
+					     &cached_state, GFP_NOFS);
+			/*
+			 * we can't wait on the range with the transaction
+			 * running or with the extent lock held
+			 */
+			ret = btrfs_wait_ordered_range(inode, alloc_start,
+						       alloc_end - alloc_start);
+			if (ret)
+				goto out;
+		} else {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+	}
+
+	cur_offset = alloc_start;
+	while (1) {
+		u64 actual_end;
+
+		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+				      alloc_end - cur_offset, 0);
+		if (IS_ERR_OR_NULL(em)) {
+			if (!em)
+				ret = -ENOMEM;
+			else
+				ret = PTR_ERR(em);
+			break;
+		}
+		last_byte = min(extent_map_end(em), alloc_end);
+		actual_end = min_t(u64, extent_map_end(em), offset + len);
+		last_byte = ALIGN(last_byte, blocksize);
+
+		if (em->block_start == EXTENT_MAP_HOLE ||
+		    (cur_offset >= inode->i_size &&
+		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
+		} else if (actual_end > inode->i_size &&
+			   !(mode & FALLOC_FL_KEEP_SIZE)) {
+			struct btrfs_trans_handle *trans;
+			struct btrfs_root *root = BTRFS_I(inode)->root;
+
+			/*
+			 * We didn't need to allocate any more space, but we
+			 * still extended the size of the file so we need to
+			 * update i_size and the inode item.
+			 */
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+			} else {
+				inode->i_ctime = CURRENT_TIME;
+				i_size_write(inode, actual_end);
+				btrfs_ordered_update_i_size(inode, actual_end,
+							    NULL);
+				ret = btrfs_update_inode(trans, root, inode);
+				if (ret)
+					btrfs_end_transaction(trans, root);
+				else
+					ret = btrfs_end_transaction(trans,
+								    root);
+			}
+		}
+		free_extent_map(em);
+		if (ret < 0)
+			break;
+
+		cur_offset = last_byte;
+		if (cur_offset >= alloc_end) {
+			ret = 0;
+			break;
+		}
+	}
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			     &cached_state, GFP_NOFS);
+out:
+	mutex_unlock(&inode->i_mutex);
+	/* Let go of our reservation. */
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+	return ret;
+}
+
+static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
+	u64 lockstart;
+	u64 lockend;
+	u64 start;
+	u64 len;
+	int ret = 0;
+
+	if (inode->i_size == 0)
+		return -ENXIO;
+
+	/*
+	 * *offset can be negative, in this case we start finding DATA/HOLE from
+	 * the very start of the file.
+	 */
+	start = max_t(loff_t, 0, *offset);
+
+	lockstart = round_down(start, root->sectorsize);
+	lockend = round_up(i_size_read(inode), root->sectorsize);
+	if (lockend <= lockstart)
+		lockend = lockstart + root->sectorsize;
+	lockend--;
+	len = lockend - lockstart + 1;
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+			 &cached_state);
+
+	while (start < inode->i_size) {
+		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			em = NULL;
+			break;
+		}
+
+		if (whence == SEEK_HOLE &&
+		    (em->block_start == EXTENT_MAP_HOLE ||
+		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+			break;
+		else if (whence == SEEK_DATA &&
+			   (em->block_start != EXTENT_MAP_HOLE &&
+			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+			break;
+
+		start = em->start + em->len;
+		free_extent_map(em);
+		em = NULL;
+		cond_resched();
+	}
+	free_extent_map(em);
+	if (!ret) {
+		if (whence == SEEK_DATA && start >= inode->i_size)
+			ret = -ENXIO;
+		else
+			*offset = min_t(loff_t, start, inode->i_size);
+	}
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			     &cached_state, GFP_NOFS);
+	return ret;
+}
+
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	mutex_lock(&inode->i_mutex);
+	switch (whence) {
+	case SEEK_END:
+	case SEEK_CUR:
+		offset = generic_file_llseek(file, offset, whence);
+		goto out;
+	case SEEK_DATA:
+	case SEEK_HOLE:
+		if (offset >= i_size_read(inode)) {
+			mutex_unlock(&inode->i_mutex);
+			return -ENXIO;
+		}
+
+		ret = find_desired_extent(inode, &offset, whence);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
+	}
+
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
+const struct file_operations btrfs_file_operations = {
+	.llseek		= btrfs_file_llseek,
+	.read_iter      = generic_file_read_iter,
+	.splice_read	= generic_file_splice_read,
+	.write_iter	= btrfs_file_write_iter,
+	.mmap		= btrfs_file_mmap,
+	.open		= generic_file_open,
+	.release	= btrfs_release_file,
+	.fsync		= btrfs_sync_file,
+	.fallocate	= btrfs_fallocate,
+	.unlocked_ioctl	= btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= btrfs_ioctl,
+#endif
+};
+
+void btrfs_auto_defrag_exit(void)
+{
+	if (btrfs_inode_defrag_cachep)
+		kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+					sizeof(struct inode_defrag), 0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!btrfs_inode_defrag_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+	int ret;
+
+	/*
+	 * So with compression we will find and lock a dirty page and clear the
+	 * first one as dirty, setup an async extent, and immediately return
+	 * with the entire range locked but with nobody actually marked with
+	 * writeback.  So we can't just filemap_write_and_wait_range() and
+	 * expect it to work since it will just kick off a thread to do the
+	 * actual work.  So we need to call filemap_fdatawrite_range _again_
+	 * since it will wait on the page lock, which won't be unlocked until
+	 * after the pages have been marked as writeback and so we're good to go
+	 * from there.  We have to do this otherwise we'll miss the ordered
+	 * extents and that results in badness.  Please Josef, do not think you
+	 * know better and pull this out at some point in the future, it is
+	 * right and you are wrong.
+	 */
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+
+	return ret;
+}
diff --git a/kernel/fs/btrfs/free-space-cache.c b/kernel/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000..9dbe5b548
--- /dev/null
+++ b/kernel/fs/btrfs/free-space-cache.c
@@ -0,0 +1,3653 @@
+/*
+ * Copyright (C) 2008 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/math64.h>
+#include <linux/ratelimit.h>
+#include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "extent_io.h"
+#include "inode-map.h"
+#include "volumes.h"
+
+#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
+#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
+
+struct btrfs_trim_range {
+	u64 start;
+	u64 bytes;
+	struct list_head list;
+};
+
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
+			   struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info);
+
+static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
+					       struct btrfs_path *path,
+					       u64 offset)
+{
+	struct btrfs_key key;
+	struct btrfs_key location;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	struct inode *inode = NULL;
+	int ret;
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = offset;
+	key.type = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0) {
+		btrfs_release_path(path);
+		return ERR_PTR(-ENOENT);
+	}
+
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	btrfs_free_space_key(leaf, header, &disk_key);
+	btrfs_disk_key_to_cpu(&location, &disk_key);
+	btrfs_release_path(path);
+
+	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+	if (IS_ERR(inode))
+		return inode;
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		return ERR_PTR(-ENOENT);
+	}
+
+	mapping_set_gfp_mask(inode->i_mapping,
+			mapping_gfp_mask(inode->i_mapping) &
+			~(__GFP_FS | __GFP_HIGHMEM));
+
+	return inode;
+}
+
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+				      struct btrfs_block_group_cache
+				      *block_group, struct btrfs_path *path)
+{
+	struct inode *inode = NULL;
+	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
+	spin_lock(&block_group->lock);
+	if (block_group->inode)
+		inode = igrab(block_group->inode);
+	spin_unlock(&block_group->lock);
+	if (inode)
+		return inode;
+
+	inode = __lookup_free_space_inode(root, path,
+					  block_group->key.objectid);
+	if (IS_ERR(inode))
+		return inode;
+
+	spin_lock(&block_group->lock);
+	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
+		btrfs_info(root->fs_info,
+			"Old style space inode found, converting.");
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
+			BTRFS_INODE_NODATACOW;
+		block_group->disk_cache_state = BTRFS_DC_CLEAR;
+	}
+
+	if (!block_group->iref) {
+		block_group->inode = igrab(inode);
+		block_group->iref = 1;
+	}
+	spin_unlock(&block_group->lock);
+
+	return inode;
+}
+
+static int __create_free_space_inode(struct btrfs_root *root,
+				     struct btrfs_trans_handle *trans,
+				     struct btrfs_path *path,
+				     u64 ino, u64 offset)
+{
+	struct btrfs_key key;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_free_space_header *header;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
+	int ret;
+
+	ret = btrfs_insert_empty_inode(trans, root, path, ino);
+	if (ret)
+		return ret;
+
+	/* We inline crc's for the free disk space cache */
+	if (ino != BTRFS_FREE_INO_OBJECTID)
+		flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
+	leaf = path->nodes[0];
+	inode_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_inode_item);
+	btrfs_item_key(leaf, &disk_key, path->slots[0]);
+	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
+			     sizeof(*inode_item));
+	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
+	btrfs_set_inode_size(leaf, inode_item, 0);
+	btrfs_set_inode_nbytes(leaf, inode_item, 0);
+	btrfs_set_inode_uid(leaf, inode_item, 0);
+	btrfs_set_inode_gid(leaf, inode_item, 0);
+	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
+	btrfs_set_inode_flags(leaf, inode_item, flags);
+	btrfs_set_inode_nlink(leaf, inode_item, 1);
+	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
+	btrfs_set_inode_block_group(leaf, inode_item, offset);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = offset;
+	key.type = 0;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(struct btrfs_free_space_header));
+	if (ret < 0) {
+		btrfs_release_path(path);
+		return ret;
+	}
+
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
+	btrfs_set_free_space_key(leaf, header, &disk_key);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	return 0;
+}
+
+int create_free_space_inode(struct btrfs_root *root,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_block_group_cache *block_group,
+			    struct btrfs_path *path)
+{
+	int ret;
+	u64 ino;
+
+	ret = btrfs_find_free_objectid(root, &ino);
+	if (ret < 0)
+		return ret;
+
+	return __create_free_space_inode(root, trans, path, ino,
+					 block_group->key.objectid);
+}
+
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+				       struct btrfs_block_rsv *rsv)
+{
+	u64 needed_bytes;
+	int ret;
+
+	/* 1 for slack space, 1 for updating the inode */
+	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
+		btrfs_calc_trans_metadata_size(root, 1);
+
+	spin_lock(&rsv->lock);
+	if (rsv->reserved < needed_bytes)
+		ret = -ENOSPC;
+	else
+		ret = 0;
+	spin_unlock(&rsv->lock);
+	return ret;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_block_group_cache *block_group,
+				    struct inode *inode)
+{
+	int ret = 0;
+	struct btrfs_path *path = btrfs_alloc_path();
+
+	if (!path) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	if (block_group) {
+		mutex_lock(&trans->transaction->cache_write_mutex);
+		if (!list_empty(&block_group->io_list)) {
+			list_del_init(&block_group->io_list);
+
+			btrfs_wait_cache_io(root, trans, block_group,
+					    &block_group->io_ctl, path,
+					    block_group->key.objectid);
+			btrfs_put_block_group(block_group);
+		}
+
+		/*
+		 * now that we've truncated the cache away, its no longer
+		 * setup or written
+		 */
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_CLEAR;
+		spin_unlock(&block_group->lock);
+	}
+	btrfs_free_path(path);
+
+	btrfs_i_size_write(inode, 0);
+	truncate_pagecache(inode, 0);
+
+	/*
+	 * We don't need an orphan item because truncating the free space cache
+	 * will never be split across transactions.
+	 * We don't need to check for -EAGAIN because we're a free space
+	 * cache inode
+	 */
+	ret = btrfs_truncate_inode_items(trans, root, inode,
+					 0, BTRFS_EXTENT_DATA_KEY);
+	if (ret) {
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+
+	ret = btrfs_update_inode(trans, root, inode);
+
+	if (block_group)
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+
+fail:
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+
+	return ret;
+}
+
+static int readahead_cache(struct inode *inode)
+{
+	struct file_ra_state *ra;
+	unsigned long last_index;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
+
+	file_ra_state_init(ra, inode->i_mapping);
+	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
+
+	kfree(ra);
+
+	return 0;
+}
+
+static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
+		       struct btrfs_root *root, int write)
+{
+	int num_pages;
+	int check_crcs = 0;
+
+	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+
+	if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+		check_crcs = 1;
+
+	/* Make sure we can fit our crcs into the first page */
+	if (write && check_crcs &&
+	    (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+		return -ENOSPC;
+
+	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
+
+	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
+	if (!io_ctl->pages)
+		return -ENOMEM;
+
+	io_ctl->num_pages = num_pages;
+	io_ctl->root = root;
+	io_ctl->check_crcs = check_crcs;
+	io_ctl->inode = inode;
+
+	return 0;
+}
+
+static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
+{
+	kfree(io_ctl->pages);
+	io_ctl->pages = NULL;
+}
+
+static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
+{
+	if (io_ctl->cur) {
+		io_ctl->cur = NULL;
+		io_ctl->orig = NULL;
+	}
+}
+
+static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
+{
+	ASSERT(io_ctl->index < io_ctl->num_pages);
+	io_ctl->page = io_ctl->pages[io_ctl->index++];
+	io_ctl->cur = page_address(io_ctl->page);
+	io_ctl->orig = io_ctl->cur;
+	io_ctl->size = PAGE_CACHE_SIZE;
+	if (clear)
+		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+}
+
+static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
+{
+	int i;
+
+	io_ctl_unmap_page(io_ctl);
+
+	for (i = 0; i < io_ctl->num_pages; i++) {
+		if (io_ctl->pages[i]) {
+			ClearPageChecked(io_ctl->pages[i]);
+			unlock_page(io_ctl->pages[i]);
+			page_cache_release(io_ctl->pages[i]);
+		}
+	}
+}
+
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
+				int uptodate)
+{
+	struct page *page;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+	int i;
+
+	for (i = 0; i < io_ctl->num_pages; i++) {
+		page = find_or_create_page(inode->i_mapping, i, mask);
+		if (!page) {
+			io_ctl_drop_pages(io_ctl);
+			return -ENOMEM;
+		}
+		io_ctl->pages[i] = page;
+		if (uptodate && !PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				btrfs_err(BTRFS_I(inode)->root->fs_info,
+					   "error reading free space cache");
+				io_ctl_drop_pages(io_ctl);
+				return -EIO;
+			}
+		}
+	}
+
+	for (i = 0; i < io_ctl->num_pages; i++) {
+		clear_page_dirty_for_io(io_ctl->pages[i]);
+		set_page_extent_mapped(io_ctl->pages[i]);
+	}
+
+	return 0;
+}
+
+static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
+{
+	__le64 *val;
+
+	io_ctl_map_page(io_ctl, 1);
+
+	/*
+	 * Skip the csum areas.  If we don't check crcs then we just have a
+	 * 64bit chunk at the front of the first page.
+	 */
+	if (io_ctl->check_crcs) {
+		io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+		io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
+	} else {
+		io_ctl->cur += sizeof(u64);
+		io_ctl->size -= sizeof(u64) * 2;
+	}
+
+	val = io_ctl->cur;
+	*val = cpu_to_le64(generation);
+	io_ctl->cur += sizeof(u64);
+}
+
+static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
+{
+	__le64 *gen;
+
+	/*
+	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
+	 * chunk at the front of the first page.
+	 */
+	if (io_ctl->check_crcs) {
+		io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+		io_ctl->size -= sizeof(u64) +
+			(sizeof(u32) * io_ctl->num_pages);
+	} else {
+		io_ctl->cur += sizeof(u64);
+		io_ctl->size -= sizeof(u64) * 2;
+	}
+
+	gen = io_ctl->cur;
+	if (le64_to_cpu(*gen) != generation) {
+		printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
+				   "(%Lu) does not match inode (%Lu)\n", *gen,
+				   generation);
+		io_ctl_unmap_page(io_ctl);
+		return -EIO;
+	}
+	io_ctl->cur += sizeof(u64);
+	return 0;
+}
+
+static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
+{
+	u32 *tmp;
+	u32 crc = ~(u32)0;
+	unsigned offset = 0;
+
+	if (!io_ctl->check_crcs) {
+		io_ctl_unmap_page(io_ctl);
+		return;
+	}
+
+	if (index == 0)
+		offset = sizeof(u32) * io_ctl->num_pages;
+
+	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
+			      PAGE_CACHE_SIZE - offset);
+	btrfs_csum_final(crc, (char *)&crc);
+	io_ctl_unmap_page(io_ctl);
+	tmp = page_address(io_ctl->pages[0]);
+	tmp += index;
+	*tmp = crc;
+}
+
+static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
+{
+	u32 *tmp, val;
+	u32 crc = ~(u32)0;
+	unsigned offset = 0;
+
+	if (!io_ctl->check_crcs) {
+		io_ctl_map_page(io_ctl, 0);
+		return 0;
+	}
+
+	if (index == 0)
+		offset = sizeof(u32) * io_ctl->num_pages;
+
+	tmp = page_address(io_ctl->pages[0]);
+	tmp += index;
+	val = *tmp;
+
+	io_ctl_map_page(io_ctl, 0);
+	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
+			      PAGE_CACHE_SIZE - offset);
+	btrfs_csum_final(crc, (char *)&crc);
+	if (val != crc) {
+		printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
+				   "space cache\n");
+		io_ctl_unmap_page(io_ctl);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
+			    void *bitmap)
+{
+	struct btrfs_free_space_entry *entry;
+
+	if (!io_ctl->cur)
+		return -ENOSPC;
+
+	entry = io_ctl->cur;
+	entry->offset = cpu_to_le64(offset);
+	entry->bytes = cpu_to_le64(bytes);
+	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
+		BTRFS_FREE_SPACE_EXTENT;
+	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+		return 0;
+
+	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+
+	/* No more pages to map */
+	if (io_ctl->index >= io_ctl->num_pages)
+		return 0;
+
+	/* map the next page */
+	io_ctl_map_page(io_ctl, 1);
+	return 0;
+}
+
+static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
+{
+	if (!io_ctl->cur)
+		return -ENOSPC;
+
+	/*
+	 * If we aren't at the start of the current page, unmap this one and
+	 * map the next one if there is any left.
+	 */
+	if (io_ctl->cur != io_ctl->orig) {
+		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+		if (io_ctl->index >= io_ctl->num_pages)
+			return -ENOSPC;
+		io_ctl_map_page(io_ctl, 0);
+	}
+
+	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+	if (io_ctl->index < io_ctl->num_pages)
+		io_ctl_map_page(io_ctl, 0);
+	return 0;
+}
+
+static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
+{
+	/*
+	 * If we're not on the boundary we know we've modified the page and we
+	 * need to crc the page.
+	 */
+	if (io_ctl->cur != io_ctl->orig)
+		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+	else
+		io_ctl_unmap_page(io_ctl);
+
+	while (io_ctl->index < io_ctl->num_pages) {
+		io_ctl_map_page(io_ctl, 1);
+		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+	}
+}
+
+static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
+			    struct btrfs_free_space *entry, u8 *type)
+{
+	struct btrfs_free_space_entry *e;
+	int ret;
+
+	if (!io_ctl->cur) {
+		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+		if (ret)
+			return ret;
+	}
+
+	e = io_ctl->cur;
+	entry->offset = le64_to_cpu(e->offset);
+	entry->bytes = le64_to_cpu(e->bytes);
+	*type = e->type;
+	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+		return 0;
+
+	io_ctl_unmap_page(io_ctl);
+
+	return 0;
+}
+
+static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
+			      struct btrfs_free_space *entry)
+{
+	int ret;
+
+	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+	if (ret)
+		return ret;
+
+	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+	io_ctl_unmap_page(io_ctl);
+
+	return 0;
+}
+
+/*
+ * Since we attach pinned extents after the fact we can have contiguous sections
+ * of free space that are split up in entries.  This poses a problem with the
+ * tree logging stuff since it could have allocated across what appears to be 2
+ * entries since we would have merged the entries when adding the pinned extents
+ * back to the free space cache.  So run through the space cache that we just
+ * loaded and merge contiguous entries.  This will make the log replay stuff not
+ * blow up and it will make for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+	struct btrfs_free_space *e, *prev = NULL;
+	struct rb_node *n;
+
+again:
+	spin_lock(&ctl->tree_lock);
+	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+		e = rb_entry(n, struct btrfs_free_space, offset_index);
+		if (!prev)
+			goto next;
+		if (e->bitmap || prev->bitmap)
+			goto next;
+		if (prev->offset + prev->bytes == e->offset) {
+			unlink_free_space(ctl, prev);
+			unlink_free_space(ctl, e);
+			prev->bytes += e->bytes;
+			kmem_cache_free(btrfs_free_space_cachep, e);
+			link_free_space(ctl, prev);
+			prev = NULL;
+			spin_unlock(&ctl->tree_lock);
+			goto again;
+		}
+next:
+		prev = e;
+	}
+	spin_unlock(&ctl->tree_lock);
+}
+
+static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+				   struct btrfs_free_space_ctl *ctl,
+				   struct btrfs_path *path, u64 offset)
+{
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	struct btrfs_io_ctl io_ctl;
+	struct btrfs_key key;
+	struct btrfs_free_space *e, *n;
+	LIST_HEAD(bitmaps);
+	u64 num_entries;
+	u64 num_bitmaps;
+	u64 generation;
+	u8 type;
+	int ret = 0;
+
+	/* Nothing in the space cache, goodbye */
+	if (!i_size_read(inode))
+		return 0;
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = offset;
+	key.type = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return 0;
+	else if (ret > 0) {
+		btrfs_release_path(path);
+		return 0;
+	}
+
+	ret = -1;
+
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	num_entries = btrfs_free_space_entries(leaf, header);
+	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
+	generation = btrfs_free_space_generation(leaf, header);
+	btrfs_release_path(path);
+
+	if (!BTRFS_I(inode)->generation) {
+		btrfs_info(root->fs_info,
+			   "The free space cache file (%llu) is invalid. skip it\n",
+			   offset);
+		return 0;
+	}
+
+	if (BTRFS_I(inode)->generation != generation) {
+		btrfs_err(root->fs_info,
+			"free space inode generation (%llu) "
+			"did not match free space cache generation (%llu)",
+			BTRFS_I(inode)->generation, generation);
+		return 0;
+	}
+
+	if (!num_entries)
+		return 0;
+
+	ret = io_ctl_init(&io_ctl, inode, root, 0);
+	if (ret)
+		return ret;
+
+	ret = readahead_cache(inode);
+	if (ret)
+		goto out;
+
+	ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+	if (ret)
+		goto out;
+
+	ret = io_ctl_check_crc(&io_ctl, 0);
+	if (ret)
+		goto free_cache;
+
+	ret = io_ctl_check_generation(&io_ctl, generation);
+	if (ret)
+		goto free_cache;
+
+	while (num_entries) {
+		e = kmem_cache_zalloc(btrfs_free_space_cachep,
+				      GFP_NOFS);
+		if (!e)
+			goto free_cache;
+
+		ret = io_ctl_read_entry(&io_ctl, e, &type);
+		if (ret) {
+			kmem_cache_free(btrfs_free_space_cachep, e);
+			goto free_cache;
+		}
+
+		if (!e->bytes) {
+			kmem_cache_free(btrfs_free_space_cachep, e);
+			goto free_cache;
+		}
+
+		if (type == BTRFS_FREE_SPACE_EXTENT) {
+			spin_lock(&ctl->tree_lock);
+			ret = link_free_space(ctl, e);
+			spin_unlock(&ctl->tree_lock);
+			if (ret) {
+				btrfs_err(root->fs_info,
+					"Duplicate entries in free space cache, dumping");
+				kmem_cache_free(btrfs_free_space_cachep, e);
+				goto free_cache;
+			}
+		} else {
+			ASSERT(num_bitmaps);
+			num_bitmaps--;
+			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+			if (!e->bitmap) {
+				kmem_cache_free(
+					btrfs_free_space_cachep, e);
+				goto free_cache;
+			}
+			spin_lock(&ctl->tree_lock);
+			ret = link_free_space(ctl, e);
+			ctl->total_bitmaps++;
+			ctl->op->recalc_thresholds(ctl);
+			spin_unlock(&ctl->tree_lock);
+			if (ret) {
+				btrfs_err(root->fs_info,
+					"Duplicate entries in free space cache, dumping");
+				kmem_cache_free(btrfs_free_space_cachep, e);
+				goto free_cache;
+			}
+			list_add_tail(&e->list, &bitmaps);
+		}
+
+		num_entries--;
+	}
+
+	io_ctl_unmap_page(&io_ctl);
+
+	/*
+	 * We add the bitmaps at the end of the entries in order that
+	 * the bitmap entries are added to the cache.
+	 */
+	list_for_each_entry_safe(e, n, &bitmaps, list) {
+		list_del_init(&e->list);
+		ret = io_ctl_read_bitmap(&io_ctl, e);
+		if (ret)
+			goto free_cache;
+	}
+
+	io_ctl_drop_pages(&io_ctl);
+	merge_space_tree(ctl);
+	ret = 1;
+out:
+	io_ctl_free(&io_ctl);
+	return ret;
+free_cache:
+	io_ctl_drop_pages(&io_ctl);
+	__btrfs_remove_free_space_cache(ctl);
+	goto out;
+}
+
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_root *root = fs_info->tree_root;
+	struct inode *inode;
+	struct btrfs_path *path;
+	int ret = 0;
+	bool matched;
+	u64 used = btrfs_block_group_used(&block_group->item);
+
+	/*
+	 * If this block group has been marked to be cleared for one reason or
+	 * another then we can't trust the on disk cache, so just return.
+	 */
+	spin_lock(&block_group->lock);
+	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+	spin_unlock(&block_group->lock);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return 0;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode)) {
+		btrfs_free_path(path);
+		return 0;
+	}
+
+	/* We may have converted the inode and made the cache invalid. */
+	spin_lock(&block_group->lock);
+	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+		spin_unlock(&block_group->lock);
+		btrfs_free_path(path);
+		goto out;
+	}
+	spin_unlock(&block_group->lock);
+
+	ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
+				      path, block_group->key.objectid);
+	btrfs_free_path(path);
+	if (ret <= 0)
+		goto out;
+
+	spin_lock(&ctl->tree_lock);
+	matched = (ctl->free_space == (block_group->key.offset - used -
+				       block_group->bytes_super));
+	spin_unlock(&ctl->tree_lock);
+
+	if (!matched) {
+		__btrfs_remove_free_space_cache(ctl);
+		btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
+			block_group->key.objectid);
+		ret = -1;
+	}
+out:
+	if (ret < 0) {
+		/* This cache is bogus, make sure it gets cleared */
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_CLEAR;
+		spin_unlock(&block_group->lock);
+		ret = 0;
+
+		btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
+			block_group->key.objectid);
+	}
+
+	iput(inode);
+	return ret;
+}
+
+static noinline_for_stack
+int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
+			      struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_block_group_cache *block_group,
+			      int *entries, int *bitmaps,
+			      struct list_head *bitmap_list)
+{
+	int ret;
+	struct btrfs_free_cluster *cluster = NULL;
+	struct btrfs_free_cluster *cluster_locked = NULL;
+	struct rb_node *node = rb_first(&ctl->free_space_offset);
+	struct btrfs_trim_range *trim_entry;
+
+	/* Get the cluster for this block_group if it exists */
+	if (block_group && !list_empty(&block_group->cluster_list)) {
+		cluster = list_entry(block_group->cluster_list.next,
+				     struct btrfs_free_cluster,
+				     block_group_list);
+	}
+
+	if (!node && cluster) {
+		cluster_locked = cluster;
+		spin_lock(&cluster_locked->lock);
+		node = rb_first(&cluster->root);
+		cluster = NULL;
+	}
+
+	/* Write out the extent entries */
+	while (node) {
+		struct btrfs_free_space *e;
+
+		e = rb_entry(node, struct btrfs_free_space, offset_index);
+		*entries += 1;
+
+		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
+				       e->bitmap);
+		if (ret)
+			goto fail;
+
+		if (e->bitmap) {
+			list_add_tail(&e->list, bitmap_list);
+			*bitmaps += 1;
+		}
+		node = rb_next(node);
+		if (!node && cluster) {
+			node = rb_first(&cluster->root);
+			cluster_locked = cluster;
+			spin_lock(&cluster_locked->lock);
+			cluster = NULL;
+		}
+	}
+	if (cluster_locked) {
+		spin_unlock(&cluster_locked->lock);
+		cluster_locked = NULL;
+	}
+
+	/*
+	 * Make sure we don't miss any range that was removed from our rbtree
+	 * because trimming is running. Otherwise after a umount+mount (or crash
+	 * after committing the transaction) we would leak free space and get
+	 * an inconsistent free space cache report from fsck.
+	 */
+	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
+		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
+				       trim_entry->bytes, NULL);
+		if (ret)
+			goto fail;
+		*entries += 1;
+	}
+
+	return 0;
+fail:
+	if (cluster_locked)
+		spin_unlock(&cluster_locked->lock);
+	return -ENOSPC;
+}
+
+static noinline_for_stack int
+update_cache_item(struct btrfs_trans_handle *trans,
+		  struct btrfs_root *root,
+		  struct inode *inode,
+		  struct btrfs_path *path, u64 offset,
+		  int entries, int bitmaps)
+{
+	struct btrfs_key key;
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	int ret;
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = offset;
+	key.type = 0;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0) {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+				 GFP_NOFS);
+		goto fail;
+	}
+	leaf = path->nodes[0];
+	if (ret > 0) {
+		struct btrfs_key found_key;
+		ASSERT(path->slots[0]);
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+		    found_key.offset != offset) {
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+					 inode->i_size - 1,
+					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+					 NULL, GFP_NOFS);
+			btrfs_release_path(path);
+			goto fail;
+		}
+	}
+
+	BTRFS_I(inode)->generation = trans->transid;
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	btrfs_set_free_space_entries(leaf, header, entries);
+	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+	btrfs_set_free_space_generation(leaf, header, trans->transid);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	return 0;
+
+fail:
+	return -1;
+}
+
+static noinline_for_stack int
+write_pinned_extent_entries(struct btrfs_root *root,
+			    struct btrfs_block_group_cache *block_group,
+			    struct btrfs_io_ctl *io_ctl,
+			    int *entries)
+{
+	u64 start, extent_start, extent_end, len;
+	struct extent_io_tree *unpin = NULL;
+	int ret;
+
+	if (!block_group)
+		return 0;
+
+	/*
+	 * We want to add any pinned extents to our free space cache
+	 * so we don't leak the space
+	 *
+	 * We shouldn't have switched the pinned extents yet so this is the
+	 * right one
+	 */
+	unpin = root->fs_info->pinned_extents;
+
+	start = block_group->key.objectid;
+
+	while (start < block_group->key.objectid + block_group->key.offset) {
+		ret = find_first_extent_bit(unpin, start,
+					    &extent_start, &extent_end,
+					    EXTENT_DIRTY, NULL);
+		if (ret)
+			return 0;
+
+		/* This pinned extent is out of our range */
+		if (extent_start >= block_group->key.objectid +
+		    block_group->key.offset)
+			return 0;
+
+		extent_start = max(extent_start, start);
+		extent_end = min(block_group->key.objectid +
+				 block_group->key.offset, extent_end + 1);
+		len = extent_end - extent_start;
+
+		*entries += 1;
+		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
+		if (ret)
+			return -ENOSPC;
+
+		start = extent_end;
+	}
+
+	return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+	struct list_head *pos, *n;
+	int ret;
+
+	/* Write out the bitmaps */
+	list_for_each_safe(pos, n, bitmap_list) {
+		struct btrfs_free_space *entry =
+			list_entry(pos, struct btrfs_free_space, list);
+
+		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
+		if (ret)
+			return -ENOSPC;
+		list_del_init(&entry->list);
+	}
+
+	return 0;
+}
+
+static int flush_dirty_cache(struct inode *inode)
+{
+	int ret;
+
+	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	if (ret)
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+				 GFP_NOFS);
+
+	return ret;
+}
+
+static void noinline_for_stack
+cleanup_bitmap_list(struct list_head *bitmap_list)
+{
+	struct list_head *pos, *n;
+
+	list_for_each_safe(pos, n, bitmap_list) {
+		struct btrfs_free_space *entry =
+			list_entry(pos, struct btrfs_free_space, list);
+		list_del_init(&entry->list);
+	}
+}
+
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+			   struct btrfs_io_ctl *io_ctl,
+			   struct extent_state **cached_state,
+			   struct list_head *bitmap_list)
+{
+	io_ctl_drop_pages(io_ctl);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+			     i_size_read(inode) - 1, cached_state,
+			     GFP_NOFS);
+}
+
+int btrfs_wait_cache_io(struct btrfs_root *root,
+			struct btrfs_trans_handle *trans,
+			struct btrfs_block_group_cache *block_group,
+			struct btrfs_io_ctl *io_ctl,
+			struct btrfs_path *path, u64 offset)
+{
+	int ret;
+	struct inode *inode = io_ctl->inode;
+
+	if (!inode)
+		return 0;
+
+	if (block_group)
+		root = root->fs_info->tree_root;
+
+	/* Flush the dirty pages in the cache file. */
+	ret = flush_dirty_cache(inode);
+	if (ret)
+		goto out;
+
+	/* Update the cache item to tell everyone this cache file is valid. */
+	ret = update_cache_item(trans, root, inode, path, offset,
+				io_ctl->entries, io_ctl->bitmaps);
+out:
+	io_ctl_free(io_ctl);
+	if (ret) {
+		invalidate_inode_pages2(inode->i_mapping);
+		BTRFS_I(inode)->generation = 0;
+		if (block_group) {
+#ifdef DEBUG
+			btrfs_err(root->fs_info,
+				"failed to write free space cache for block group %llu",
+				block_group->key.objectid);
+#endif
+		}
+	}
+	btrfs_update_inode(trans, root, inode);
+
+	if (block_group) {
+		/* the dirty list is protected by the dirty_bgs_lock */
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+
+		/* the disk_cache_state is protected by the block group lock */
+		spin_lock(&block_group->lock);
+
+		/*
+		 * only mark this as written if we didn't get put back on
+		 * the dirty list while waiting for IO.   Otherwise our
+		 * cache state won't be right, and we won't get written again
+		 */
+		if (!ret && list_empty(&block_group->dirty_list))
+			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		else if (ret)
+			block_group->disk_cache_state = BTRFS_DC_ERROR;
+
+		spin_unlock(&block_group->lock);
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+		io_ctl->inode = NULL;
+		iput(inode);
+	}
+
+	return ret;
+
+}
+
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount.  This will return 0 if it was successfull in writing the cache out,
+ * or an errno if it was not.
+ */
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+				   struct btrfs_free_space_ctl *ctl,
+				   struct btrfs_block_group_cache *block_group,
+				   struct btrfs_io_ctl *io_ctl,
+				   struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path, u64 offset)
+{
+	struct extent_state *cached_state = NULL;
+	LIST_HEAD(bitmap_list);
+	int entries = 0;
+	int bitmaps = 0;
+	int ret;
+	int must_iput = 0;
+
+	if (!i_size_read(inode))
+		return -EIO;
+
+	WARN_ON(io_ctl->pages);
+	ret = io_ctl_init(io_ctl, inode, root, 1);
+	if (ret)
+		return ret;
+
+	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+		down_write(&block_group->data_rwsem);
+		spin_lock(&block_group->lock);
+		if (block_group->delalloc_bytes) {
+			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+			spin_unlock(&block_group->lock);
+			up_write(&block_group->data_rwsem);
+			BTRFS_I(inode)->generation = 0;
+			ret = 0;
+			must_iput = 1;
+			goto out;
+		}
+		spin_unlock(&block_group->lock);
+	}
+
+	/* Lock all pages first so we can lock the extent safely. */
+	ret = io_ctl_prepare_pages(io_ctl, inode, 0);
+	if (ret)
+		goto out;
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+			 0, &cached_state);
+
+	io_ctl_set_generation(io_ctl, trans->transid);
+
+	mutex_lock(&ctl->cache_writeout_mutex);
+	/* Write out the extent entries in the free space cache */
+	spin_lock(&ctl->tree_lock);
+	ret = write_cache_extent_entries(io_ctl, ctl,
+					 block_group, &entries, &bitmaps,
+					 &bitmap_list);
+	if (ret)
+		goto out_nospc_locked;
+
+	/*
+	 * Some spaces that are freed in the current transaction are pinned,
+	 * they will be added into free space cache after the transaction is
+	 * committed, we shouldn't lose them.
+	 *
+	 * If this changes while we are working we'll get added back to
+	 * the dirty list and redo it.  No locking needed
+	 */
+	ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
+	if (ret)
+		goto out_nospc_locked;
+
+	/*
+	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
+	 * locked while doing it because a concurrent trim can be manipulating
+	 * or freeing the bitmap.
+	 */
+	ret = write_bitmap_entries(io_ctl, &bitmap_list);
+	spin_unlock(&ctl->tree_lock);
+	mutex_unlock(&ctl->cache_writeout_mutex);
+	if (ret)
+		goto out_nospc;
+
+	/* Zero out the rest of the pages just to make sure */
+	io_ctl_zero_remaining_pages(io_ctl);
+
+	/* Everything is written out, now we dirty the pages in the file. */
+	ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
+				0, i_size_read(inode), &cached_state);
+	if (ret)
+		goto out_nospc;
+
+	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+		up_write(&block_group->data_rwsem);
+	/*
+	 * Release the pages and unlock the extent, we will flush
+	 * them out later
+	 */
+	io_ctl_drop_pages(io_ctl);
+
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+	/*
+	 * at this point the pages are under IO and we're happy,
+	 * The caller is responsible for waiting on them and updating the
+	 * the cache and the inode
+	 */
+	io_ctl->entries = entries;
+	io_ctl->bitmaps = bitmaps;
+
+	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
+	if (ret)
+		goto out;
+
+	return 0;
+
+out:
+	io_ctl->inode = NULL;
+	io_ctl_free(io_ctl);
+	if (ret) {
+		invalidate_inode_pages2(inode->i_mapping);
+		BTRFS_I(inode)->generation = 0;
+	}
+	btrfs_update_inode(trans, root, inode);
+	if (must_iput)
+		iput(inode);
+	return ret;
+
+out_nospc_locked:
+	cleanup_bitmap_list(&bitmap_list);
+	spin_unlock(&ctl->tree_lock);
+	mutex_unlock(&ctl->cache_writeout_mutex);
+
+out_nospc:
+	cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
+
+	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+		up_write(&block_group->data_rwsem);
+
+	goto out;
+}
+
+int btrfs_write_out_cache(struct btrfs_root *root,
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_block_group_cache *block_group,
+			  struct btrfs_path *path)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct inode *inode;
+	int ret = 0;
+
+	root = root->fs_info->tree_root;
+
+	spin_lock(&block_group->lock);
+	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+	spin_unlock(&block_group->lock);
+
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode))
+		return 0;
+
+	ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
+				      &block_group->io_ctl, trans,
+				      path, block_group->key.objectid);
+	if (ret) {
+#ifdef DEBUG
+		btrfs_err(root->fs_info,
+			"failed to write free space cache for block group %llu",
+			block_group->key.objectid);
+#endif
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_ERROR;
+		spin_unlock(&block_group->lock);
+
+		block_group->io_ctl.inode = NULL;
+		iput(inode);
+	}
+
+	/*
+	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
+	 * to wait for IO and put the inode
+	 */
+
+	return ret;
+}
+
+static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
+					  u64 offset)
+{
+	ASSERT(offset >= bitmap_start);
+	offset -= bitmap_start;
+	return (unsigned long)(div_u64(offset, unit));
+}
+
+static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
+{
+	return (unsigned long)(div_u64(bytes, unit));
+}
+
+static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
+				   u64 offset)
+{
+	u64 bitmap_start;
+	u32 bytes_per_bitmap;
+
+	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
+	bitmap_start = offset - ctl->start;
+	bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
+	bitmap_start *= bytes_per_bitmap;
+	bitmap_start += ctl->start;
+
+	return bitmap_start;
+}
+
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+			      struct rb_node *node, int bitmap)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct btrfs_free_space *info;
+
+	while (*p) {
+		parent = *p;
+		info = rb_entry(parent, struct btrfs_free_space, offset_index);
+
+		if (offset < info->offset) {
+			p = &(*p)->rb_left;
+		} else if (offset > info->offset) {
+			p = &(*p)->rb_right;
+		} else {
+			/*
+			 * we could have a bitmap entry and an extent entry
+			 * share the same offset.  If this is the case, we want
+			 * the extent entry to always be found first if we do a
+			 * linear search through the tree, since we want to have
+			 * the quickest allocation time, and allocating from an
+			 * extent is faster than allocating from a bitmap.  So
+			 * if we're inserting a bitmap and we find an entry at
+			 * this offset, we want to go right, or after this entry
+			 * logically.  If we are inserting an extent and we've
+			 * found a bitmap, we want to go left, or before
+			 * logically.
+			 */
+			if (bitmap) {
+				if (info->bitmap) {
+					WARN_ON_ONCE(1);
+					return -EEXIST;
+				}
+				p = &(*p)->rb_right;
+			} else {
+				if (!info->bitmap) {
+					WARN_ON_ONCE(1);
+					return -EEXIST;
+				}
+				p = &(*p)->rb_left;
+			}
+		}
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+
+	return 0;
+}
+
+/*
+ * searches the tree for the given offset.
+ *
+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
+ * want a section that has at least bytes size and comes at or after the given
+ * offset.
+ */
+static struct btrfs_free_space *
+tree_search_offset(struct btrfs_free_space_ctl *ctl,
+		   u64 offset, int bitmap_only, int fuzzy)
+{
+	struct rb_node *n = ctl->free_space_offset.rb_node;
+	struct btrfs_free_space *entry, *prev = NULL;
+
+	/* find entry that is closest to the 'offset' */
+	while (1) {
+		if (!n) {
+			entry = NULL;
+			break;
+		}
+
+		entry = rb_entry(n, struct btrfs_free_space, offset_index);
+		prev = entry;
+
+		if (offset < entry->offset)
+			n = n->rb_left;
+		else if (offset > entry->offset)
+			n = n->rb_right;
+		else
+			break;
+	}
+
+	if (bitmap_only) {
+		if (!entry)
+			return NULL;
+		if (entry->bitmap)
+			return entry;
+
+		/*
+		 * bitmap entry and extent entry may share same offset,
+		 * in that case, bitmap entry comes after extent entry.
+		 */
+		n = rb_next(n);
+		if (!n)
+			return NULL;
+		entry = rb_entry(n, struct btrfs_free_space, offset_index);
+		if (entry->offset != offset)
+			return NULL;
+
+		WARN_ON(!entry->bitmap);
+		return entry;
+	} else if (entry) {
+		if (entry->bitmap) {
+			/*
+			 * if previous extent entry covers the offset,
+			 * we should return it instead of the bitmap entry
+			 */
+			n = rb_prev(&entry->offset_index);
+			if (n) {
+				prev = rb_entry(n, struct btrfs_free_space,
+						offset_index);
+				if (!prev->bitmap &&
+				    prev->offset + prev->bytes > offset)
+					entry = prev;
+			}
+		}
+		return entry;
+	}
+
+	if (!prev)
+		return NULL;
+
+	/* find last entry before the 'offset' */
+	entry = prev;
+	if (entry->offset > offset) {
+		n = rb_prev(&entry->offset_index);
+		if (n) {
+			entry = rb_entry(n, struct btrfs_free_space,
+					offset_index);
+			ASSERT(entry->offset <= offset);
+		} else {
+			if (fuzzy)
+				return entry;
+			else
+				return NULL;
+		}
+	}
+
+	if (entry->bitmap) {
+		n = rb_prev(&entry->offset_index);
+		if (n) {
+			prev = rb_entry(n, struct btrfs_free_space,
+					offset_index);
+			if (!prev->bitmap &&
+			    prev->offset + prev->bytes > offset)
+				return prev;
+		}
+		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
+			return entry;
+	} else if (entry->offset + entry->bytes > offset)
+		return entry;
+
+	if (!fuzzy)
+		return NULL;
+
+	while (1) {
+		if (entry->bitmap) {
+			if (entry->offset + BITS_PER_BITMAP *
+			    ctl->unit > offset)
+				break;
+		} else {
+			if (entry->offset + entry->bytes > offset)
+				break;
+		}
+
+		n = rb_next(&entry->offset_index);
+		if (!n)
+			return NULL;
+		entry = rb_entry(n, struct btrfs_free_space, offset_index);
+	}
+	return entry;
+}
+
+static inline void
+__unlink_free_space(struct btrfs_free_space_ctl *ctl,
+		    struct btrfs_free_space *info)
+{
+	rb_erase(&info->offset_index, &ctl->free_space_offset);
+	ctl->free_extents--;
+}
+
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info)
+{
+	__unlink_free_space(ctl, info);
+	ctl->free_space -= info->bytes;
+}
+
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
+			   struct btrfs_free_space *info)
+{
+	int ret = 0;
+
+	ASSERT(info->bytes || info->bitmap);
+	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
+				 &info->offset_index, (info->bitmap != NULL));
+	if (ret)
+		return ret;
+
+	ctl->free_space += info->bytes;
+	ctl->free_extents++;
+	return ret;
+}
+
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+	struct btrfs_block_group_cache *block_group = ctl->private;
+	u64 max_bytes;
+	u64 bitmap_bytes;
+	u64 extent_bytes;
+	u64 size = block_group->key.offset;
+	u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+	u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
+
+	max_bitmaps = max_t(u32, max_bitmaps, 1);
+
+	ASSERT(ctl->total_bitmaps <= max_bitmaps);
+
+	/*
+	 * The goal is to keep the total amount of memory used per 1gb of space
+	 * at or below 32k, so we need to adjust how much memory we allow to be
+	 * used by extent based free space tracking
+	 */
+	if (size < 1024 * 1024 * 1024)
+		max_bytes = MAX_CACHE_BYTES_PER_GIG;
+	else
+		max_bytes = MAX_CACHE_BYTES_PER_GIG *
+			div_u64(size, 1024 * 1024 * 1024);
+
+	/*
+	 * we want to account for 1 more bitmap than what we have so we can make
+	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
+	 * we add more bitmaps.
+	 */
+	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+
+	if (bitmap_bytes >= max_bytes) {
+		ctl->extents_thresh = 0;
+		return;
+	}
+
+	/*
+	 * we want the extent entry threshold to always be at most 1/2 the max
+	 * bytes we can have, or whatever is less than that.
+	 */
+	extent_bytes = max_bytes - bitmap_bytes;
+	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
+
+	ctl->extents_thresh =
+		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
+}
+
+static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+				       struct btrfs_free_space *info,
+				       u64 offset, u64 bytes)
+{
+	unsigned long start, count;
+
+	start = offset_to_bit(info->offset, ctl->unit, offset);
+	count = bytes_to_bits(bytes, ctl->unit);
+	ASSERT(start + count <= BITS_PER_BITMAP);
+
+	bitmap_clear(info->bitmap, start, count);
+
+	info->bytes -= bytes;
+}
+
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info, u64 offset,
+			      u64 bytes)
+{
+	__bitmap_clear_bits(ctl, info, offset, bytes);
+	ctl->free_space -= bytes;
+}
+
+static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
+			    struct btrfs_free_space *info, u64 offset,
+			    u64 bytes)
+{
+	unsigned long start, count;
+
+	start = offset_to_bit(info->offset, ctl->unit, offset);
+	count = bytes_to_bits(bytes, ctl->unit);
+	ASSERT(start + count <= BITS_PER_BITMAP);
+
+	bitmap_set(info->bitmap, start, count);
+
+	info->bytes += bytes;
+	ctl->free_space += bytes;
+}
+
+/*
+ * If we can not find suitable extent, we will use bytes to record
+ * the size of the max extent.
+ */
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
+			 struct btrfs_free_space *bitmap_info, u64 *offset,
+			 u64 *bytes)
+{
+	unsigned long found_bits = 0;
+	unsigned long max_bits = 0;
+	unsigned long bits, i;
+	unsigned long next_zero;
+	unsigned long extent_bits;
+
+	i = offset_to_bit(bitmap_info->offset, ctl->unit,
+			  max_t(u64, *offset, bitmap_info->offset));
+	bits = bytes_to_bits(*bytes, ctl->unit);
+
+	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+		next_zero = find_next_zero_bit(bitmap_info->bitmap,
+					       BITS_PER_BITMAP, i);
+		extent_bits = next_zero - i;
+		if (extent_bits >= bits) {
+			found_bits = extent_bits;
+			break;
+		} else if (extent_bits > max_bits) {
+			max_bits = extent_bits;
+		}
+		i = next_zero;
+	}
+
+	if (found_bits) {
+		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
+		*bytes = (u64)(found_bits) * ctl->unit;
+		return 0;
+	}
+
+	*bytes = (u64)(max_bits) * ctl->unit;
+	return -1;
+}
+
+/* Cache the size of the max extent in bytes */
+static struct btrfs_free_space *
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+		unsigned long align, u64 *max_extent_size)
+{
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+	u64 tmp;
+	u64 align_off;
+	int ret;
+
+	if (!ctl->free_space_offset.rb_node)
+		goto out;
+
+	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
+	if (!entry)
+		goto out;
+
+	for (node = &entry->offset_index; node; node = rb_next(node)) {
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		if (entry->bytes < *bytes) {
+			if (entry->bytes > *max_extent_size)
+				*max_extent_size = entry->bytes;
+			continue;
+		}
+
+		/* make sure the space returned is big enough
+		 * to match our requested alignment
+		 */
+		if (*bytes >= align) {
+			tmp = entry->offset - ctl->start + align - 1;
+			tmp = div64_u64(tmp, align);
+			tmp = tmp * align + ctl->start;
+			align_off = tmp - entry->offset;
+		} else {
+			align_off = 0;
+			tmp = entry->offset;
+		}
+
+		if (entry->bytes < *bytes + align_off) {
+			if (entry->bytes > *max_extent_size)
+				*max_extent_size = entry->bytes;
+			continue;
+		}
+
+		if (entry->bitmap) {
+			u64 size = *bytes;
+
+			ret = search_bitmap(ctl, entry, &tmp, &size);
+			if (!ret) {
+				*offset = tmp;
+				*bytes = size;
+				return entry;
+			} else if (size > *max_extent_size) {
+				*max_extent_size = size;
+			}
+			continue;
+		}
+
+		*offset = tmp;
+		*bytes = entry->bytes - align_off;
+		return entry;
+	}
+out:
+	return NULL;
+}
+
+static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
+			   struct btrfs_free_space *info, u64 offset)
+{
+	info->offset = offset_to_bitmap(ctl, offset);
+	info->bytes = 0;
+	INIT_LIST_HEAD(&info->list);
+	link_free_space(ctl, info);
+	ctl->total_bitmaps++;
+
+	ctl->op->recalc_thresholds(ctl);
+}
+
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
+			struct btrfs_free_space *bitmap_info)
+{
+	unlink_free_space(ctl, bitmap_info);
+	kfree(bitmap_info->bitmap);
+	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
+	ctl->total_bitmaps--;
+	ctl->op->recalc_thresholds(ctl);
+}
+
+static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *bitmap_info,
+			      u64 *offset, u64 *bytes)
+{
+	u64 end;
+	u64 search_start, search_bytes;
+	int ret;
+
+again:
+	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
+
+	/*
+	 * We need to search for bits in this bitmap.  We could only cover some
+	 * of the extent in this bitmap thanks to how we add space, so we need
+	 * to search for as much as it as we can and clear that amount, and then
+	 * go searching for the next bit.
+	 */
+	search_start = *offset;
+	search_bytes = ctl->unit;
+	search_bytes = min(search_bytes, end - search_start + 1);
+	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+	if (ret < 0 || search_start != *offset)
+		return -EINVAL;
+
+	/* We may have found more bits than what we need */
+	search_bytes = min(search_bytes, *bytes);
+
+	/* Cannot clear past the end of the bitmap */
+	search_bytes = min(search_bytes, end - search_start + 1);
+
+	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+	*offset += search_bytes;
+	*bytes -= search_bytes;
+
+	if (*bytes) {
+		struct rb_node *next = rb_next(&bitmap_info->offset_index);
+		if (!bitmap_info->bytes)
+			free_bitmap(ctl, bitmap_info);
+
+		/*
+		 * no entry after this bitmap, but we still have bytes to
+		 * remove, so something has gone wrong.
+		 */
+		if (!next)
+			return -EINVAL;
+
+		bitmap_info = rb_entry(next, struct btrfs_free_space,
+				       offset_index);
+
+		/*
+		 * if the next entry isn't a bitmap we need to return to let the
+		 * extent stuff do its work.
+		 */
+		if (!bitmap_info->bitmap)
+			return -EAGAIN;
+
+		/*
+		 * Ok the next item is a bitmap, but it may not actually hold
+		 * the information for the rest of this free space stuff, so
+		 * look for it, and if we don't find it return so we can try
+		 * everything over again.
+		 */
+		search_start = *offset;
+		search_bytes = ctl->unit;
+		ret = search_bitmap(ctl, bitmap_info, &search_start,
+				    &search_bytes);
+		if (ret < 0 || search_start != *offset)
+			return -EAGAIN;
+
+		goto again;
+	} else if (!bitmap_info->bytes)
+		free_bitmap(ctl, bitmap_info);
+
+	return 0;
+}
+
+static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
+			       struct btrfs_free_space *info, u64 offset,
+			       u64 bytes)
+{
+	u64 bytes_to_set = 0;
+	u64 end;
+
+	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
+
+	bytes_to_set = min(end - offset, bytes);
+
+	bitmap_set_bits(ctl, info, offset, bytes_to_set);
+
+	return bytes_to_set;
+
+}
+
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+		      struct btrfs_free_space *info)
+{
+	struct btrfs_block_group_cache *block_group = ctl->private;
+
+	/*
+	 * If we are below the extents threshold then we can add this as an
+	 * extent, and don't have to deal with the bitmap
+	 */
+	if (ctl->free_extents < ctl->extents_thresh) {
+		/*
+		 * If this block group has some small extents we don't want to
+		 * use up all of our free slots in the cache with them, we want
+		 * to reserve them to larger extents, however if we have plent
+		 * of cache left then go ahead an dadd them, no sense in adding
+		 * the overhead of a bitmap if we don't have to.
+		 */
+		if (info->bytes <= block_group->sectorsize * 4) {
+			if (ctl->free_extents * 2 <= ctl->extents_thresh)
+				return false;
+		} else {
+			return false;
+		}
+	}
+
+	/*
+	 * The original block groups from mkfs can be really small, like 8
+	 * megabytes, so don't bother with a bitmap for those entries.  However
+	 * some block groups can be smaller than what a bitmap would cover but
+	 * are still large enough that they could overflow the 32k memory limit,
+	 * so allow those block groups to still be allowed to have a bitmap
+	 * entry.
+	 */
+	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
+		return false;
+
+	return true;
+}
+
+static struct btrfs_free_space_op free_space_op = {
+	.recalc_thresholds	= recalculate_thresholds,
+	.use_bitmap		= use_bitmap,
+};
+
+static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info)
+{
+	struct btrfs_free_space *bitmap_info;
+	struct btrfs_block_group_cache *block_group = NULL;
+	int added = 0;
+	u64 bytes, offset, bytes_added;
+	int ret;
+
+	bytes = info->bytes;
+	offset = info->offset;
+
+	if (!ctl->op->use_bitmap(ctl, info))
+		return 0;
+
+	if (ctl->op == &free_space_op)
+		block_group = ctl->private;
+again:
+	/*
+	 * Since we link bitmaps right into the cluster we need to see if we
+	 * have a cluster here, and if so and it has our bitmap we need to add
+	 * the free space to that bitmap.
+	 */
+	if (block_group && !list_empty(&block_group->cluster_list)) {
+		struct btrfs_free_cluster *cluster;
+		struct rb_node *node;
+		struct btrfs_free_space *entry;
+
+		cluster = list_entry(block_group->cluster_list.next,
+				     struct btrfs_free_cluster,
+				     block_group_list);
+		spin_lock(&cluster->lock);
+		node = rb_first(&cluster->root);
+		if (!node) {
+			spin_unlock(&cluster->lock);
+			goto no_cluster_bitmap;
+		}
+
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		if (!entry->bitmap) {
+			spin_unlock(&cluster->lock);
+			goto no_cluster_bitmap;
+		}
+
+		if (entry->offset == offset_to_bitmap(ctl, offset)) {
+			bytes_added = add_bytes_to_bitmap(ctl, entry,
+							  offset, bytes);
+			bytes -= bytes_added;
+			offset += bytes_added;
+		}
+		spin_unlock(&cluster->lock);
+		if (!bytes) {
+			ret = 1;
+			goto out;
+		}
+	}
+
+no_cluster_bitmap:
+	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					 1, 0);
+	if (!bitmap_info) {
+		ASSERT(added == 0);
+		goto new_bitmap;
+	}
+
+	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+	bytes -= bytes_added;
+	offset += bytes_added;
+	added = 0;
+
+	if (!bytes) {
+		ret = 1;
+		goto out;
+	} else
+		goto again;
+
+new_bitmap:
+	if (info && info->bitmap) {
+		add_new_bitmap(ctl, info, offset);
+		added = 1;
+		info = NULL;
+		goto again;
+	} else {
+		spin_unlock(&ctl->tree_lock);
+
+		/* no pre-allocated info, allocate a new one */
+		if (!info) {
+			info = kmem_cache_zalloc(btrfs_free_space_cachep,
+						 GFP_NOFS);
+			if (!info) {
+				spin_lock(&ctl->tree_lock);
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		/* allocate the bitmap */
+		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		spin_lock(&ctl->tree_lock);
+		if (!info->bitmap) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		goto again;
+	}
+
+out:
+	if (info) {
+		if (info->bitmap)
+			kfree(info->bitmap);
+		kmem_cache_free(btrfs_free_space_cachep, info);
+	}
+
+	return ret;
+}
+
+static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
+			  struct btrfs_free_space *info, bool update_stat)
+{
+	struct btrfs_free_space *left_info;
+	struct btrfs_free_space *right_info;
+	bool merged = false;
+	u64 offset = info->offset;
+	u64 bytes = info->bytes;
+
+	/*
+	 * first we want to see if there is free space adjacent to the range we
+	 * are adding, if there is remove that struct and add a new one to
+	 * cover the entire range
+	 */
+	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
+	if (right_info && rb_prev(&right_info->offset_index))
+		left_info = rb_entry(rb_prev(&right_info->offset_index),
+				     struct btrfs_free_space, offset_index);
+	else
+		left_info = tree_search_offset(ctl, offset - 1, 0, 0);
+
+	if (right_info && !right_info->bitmap) {
+		if (update_stat)
+			unlink_free_space(ctl, right_info);
+		else
+			__unlink_free_space(ctl, right_info);
+		info->bytes += right_info->bytes;
+		kmem_cache_free(btrfs_free_space_cachep, right_info);
+		merged = true;
+	}
+
+	if (left_info && !left_info->bitmap &&
+	    left_info->offset + left_info->bytes == offset) {
+		if (update_stat)
+			unlink_free_space(ctl, left_info);
+		else
+			__unlink_free_space(ctl, left_info);
+		info->offset = left_info->offset;
+		info->bytes += left_info->bytes;
+		kmem_cache_free(btrfs_free_space_cachep, left_info);
+		merged = true;
+	}
+
+	return merged;
+}
+
+static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+				     struct btrfs_free_space *info,
+				     bool update_stat)
+{
+	struct btrfs_free_space *bitmap;
+	unsigned long i;
+	unsigned long j;
+	const u64 end = info->offset + info->bytes;
+	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
+	u64 bytes;
+
+	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+	if (!bitmap)
+		return false;
+
+	i = offset_to_bit(bitmap->offset, ctl->unit, end);
+	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
+	if (j == i)
+		return false;
+	bytes = (j - i) * ctl->unit;
+	info->bytes += bytes;
+
+	if (update_stat)
+		bitmap_clear_bits(ctl, bitmap, end, bytes);
+	else
+		__bitmap_clear_bits(ctl, bitmap, end, bytes);
+
+	if (!bitmap->bytes)
+		free_bitmap(ctl, bitmap);
+
+	return true;
+}
+
+static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
+				       struct btrfs_free_space *info,
+				       bool update_stat)
+{
+	struct btrfs_free_space *bitmap;
+	u64 bitmap_offset;
+	unsigned long i;
+	unsigned long j;
+	unsigned long prev_j;
+	u64 bytes;
+
+	bitmap_offset = offset_to_bitmap(ctl, info->offset);
+	/* If we're on a boundary, try the previous logical bitmap. */
+	if (bitmap_offset == info->offset) {
+		if (info->offset == 0)
+			return false;
+		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
+	}
+
+	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+	if (!bitmap)
+		return false;
+
+	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+	j = 0;
+	prev_j = (unsigned long)-1;
+	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
+		if (j > i)
+			break;
+		prev_j = j;
+	}
+	if (prev_j == i)
+		return false;
+
+	if (prev_j == (unsigned long)-1)
+		bytes = (i + 1) * ctl->unit;
+	else
+		bytes = (i - prev_j) * ctl->unit;
+
+	info->offset -= bytes;
+	info->bytes += bytes;
+
+	if (update_stat)
+		bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+	else
+		__bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+
+	if (!bitmap->bytes)
+		free_bitmap(ctl, bitmap);
+
+	return true;
+}
+
+/*
+ * We prefer always to allocate from extent entries, both for clustered and
+ * non-clustered allocation requests. So when attempting to add a new extent
+ * entry, try to see if there's adjacent free space in bitmap entries, and if
+ * there is, migrate that space from the bitmaps to the extent.
+ * Like this we get better chances of satisfying space allocation requests
+ * because we attempt to satisfy them based on a single cache entry, and never
+ * on 2 or more entries - even if the entries represent a contiguous free space
+ * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
+ * ends).
+ */
+static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info,
+			      bool update_stat)
+{
+	/*
+	 * Only work with disconnected entries, as we can change their offset,
+	 * and must be extent entries.
+	 */
+	ASSERT(!info->bitmap);
+	ASSERT(RB_EMPTY_NODE(&info->offset_index));
+
+	if (ctl->total_bitmaps > 0) {
+		bool stole_end;
+		bool stole_front = false;
+
+		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
+		if (ctl->total_bitmaps > 0)
+			stole_front = steal_from_bitmap_to_front(ctl, info,
+								 update_stat);
+
+		if (stole_end || stole_front)
+			try_merge_free_space(ctl, info, update_stat);
+	}
+}
+
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+			   u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *info;
+	int ret = 0;
+
+	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+	if (!info)
+		return -ENOMEM;
+
+	info->offset = offset;
+	info->bytes = bytes;
+	RB_CLEAR_NODE(&info->offset_index);
+
+	spin_lock(&ctl->tree_lock);
+
+	if (try_merge_free_space(ctl, info, true))
+		goto link;
+
+	/*
+	 * There was no extent directly to the left or right of this new
+	 * extent then we know we're going to have to allocate a new extent, so
+	 * before we do that see if we need to drop this into a bitmap
+	 */
+	ret = insert_into_bitmap(ctl, info);
+	if (ret < 0) {
+		goto out;
+	} else if (ret) {
+		ret = 0;
+		goto out;
+	}
+link:
+	/*
+	 * Only steal free space from adjacent bitmaps if we're sure we're not
+	 * going to add the new free space to existing bitmap entries - because
+	 * that would mean unnecessary work that would be reverted. Therefore
+	 * attempt to steal space from bitmaps if we're adding an extent entry.
+	 */
+	steal_from_bitmap(ctl, info, true);
+
+	ret = link_free_space(ctl, info);
+	if (ret)
+		kmem_cache_free(btrfs_free_space_cachep, info);
+out:
+	spin_unlock(&ctl->tree_lock);
+
+	if (ret) {
+		printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret);
+		ASSERT(ret != -EEXIST);
+	}
+
+	return ret;
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 offset, u64 bytes)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *info;
+	int ret;
+	bool re_search = false;
+
+	spin_lock(&ctl->tree_lock);
+
+again:
+	ret = 0;
+	if (!bytes)
+		goto out_lock;
+
+	info = tree_search_offset(ctl, offset, 0, 0);
+	if (!info) {
+		/*
+		 * oops didn't find an extent that matched the space we wanted
+		 * to remove, look for a bitmap instead
+		 */
+		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					  1, 0);
+		if (!info) {
+			/*
+			 * If we found a partial bit of our free space in a
+			 * bitmap but then couldn't find the other part this may
+			 * be a problem, so WARN about it.
+			 */
+			WARN_ON(re_search);
+			goto out_lock;
+		}
+	}
+
+	re_search = false;
+	if (!info->bitmap) {
+		unlink_free_space(ctl, info);
+		if (offset == info->offset) {
+			u64 to_free = min(bytes, info->bytes);
+
+			info->bytes -= to_free;
+			info->offset += to_free;
+			if (info->bytes) {
+				ret = link_free_space(ctl, info);
+				WARN_ON(ret);
+			} else {
+				kmem_cache_free(btrfs_free_space_cachep, info);
+			}
+
+			offset += to_free;
+			bytes -= to_free;
+			goto again;
+		} else {
+			u64 old_end = info->bytes + info->offset;
+
+			info->bytes = offset - info->offset;
+			ret = link_free_space(ctl, info);
+			WARN_ON(ret);
+			if (ret)
+				goto out_lock;
+
+			/* Not enough bytes in this entry to satisfy us */
+			if (old_end < offset + bytes) {
+				bytes -= old_end - offset;
+				offset = old_end;
+				goto again;
+			} else if (old_end == offset + bytes) {
+				/* all done */
+				goto out_lock;
+			}
+			spin_unlock(&ctl->tree_lock);
+
+			ret = btrfs_add_free_space(block_group, offset + bytes,
+						   old_end - (offset + bytes));
+			WARN_ON(ret);
+			goto out;
+		}
+	}
+
+	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
+	if (ret == -EAGAIN) {
+		re_search = true;
+		goto again;
+	}
+out_lock:
+	spin_unlock(&ctl->tree_lock);
+out:
+	return ret;
+}
+
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+			   u64 bytes)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *info;
+	struct rb_node *n;
+	int count = 0;
+
+	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+		info = rb_entry(n, struct btrfs_free_space, offset_index);
+		if (info->bytes >= bytes && !block_group->ro)
+			count++;
+		btrfs_crit(block_group->fs_info,
+			   "entry offset %llu, bytes %llu, bitmap %s",
+			   info->offset, info->bytes,
+		       (info->bitmap) ? "yes" : "no");
+	}
+	btrfs_info(block_group->fs_info, "block group has cluster?: %s",
+	       list_empty(&block_group->cluster_list) ? "no" : "yes");
+	btrfs_info(block_group->fs_info,
+		   "%d blocks of free space at or bigger than bytes is", count);
+}
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+
+	spin_lock_init(&ctl->tree_lock);
+	ctl->unit = block_group->sectorsize;
+	ctl->start = block_group->key.objectid;
+	ctl->private = block_group;
+	ctl->op = &free_space_op;
+	INIT_LIST_HEAD(&ctl->trimming_ranges);
+	mutex_init(&ctl->cache_writeout_mutex);
+
+	/*
+	 * we only want to have 32k of ram per block group for keeping
+	 * track of free space, and if we pass 1/2 of that we want to
+	 * start converting things over to using bitmaps
+	 */
+	ctl->extents_thresh = ((1024 * 32) / 2) /
+				sizeof(struct btrfs_free_space);
+}
+
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache.  If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already.  In that case, we just return without changing anything
+ */
+static int
+__btrfs_return_cluster_to_free_space(
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+
+	spin_lock(&cluster->lock);
+	if (cluster->block_group != block_group)
+		goto out;
+
+	cluster->block_group = NULL;
+	cluster->window_start = 0;
+	list_del_init(&cluster->block_group_list);
+
+	node = rb_first(&cluster->root);
+	while (node) {
+		bool bitmap;
+
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		node = rb_next(&entry->offset_index);
+		rb_erase(&entry->offset_index, &cluster->root);
+		RB_CLEAR_NODE(&entry->offset_index);
+
+		bitmap = (entry->bitmap != NULL);
+		if (!bitmap) {
+			try_merge_free_space(ctl, entry, false);
+			steal_from_bitmap(ctl, entry, false);
+		}
+		tree_insert_offset(&ctl->free_space_offset,
+				   entry->offset, &entry->offset_index, bitmap);
+	}
+	cluster->root = RB_ROOT;
+
+out:
+	spin_unlock(&cluster->lock);
+	btrfs_put_block_group(block_group);
+	return 0;
+}
+
+static void __btrfs_remove_free_space_cache_locked(
+				struct btrfs_free_space_ctl *ctl)
+{
+	struct btrfs_free_space *info;
+	struct rb_node *node;
+
+	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+		info = rb_entry(node, struct btrfs_free_space, offset_index);
+		if (!info->bitmap) {
+			unlink_free_space(ctl, info);
+			kmem_cache_free(btrfs_free_space_cachep, info);
+		} else {
+			free_bitmap(ctl, info);
+		}
+
+		cond_resched_lock(&ctl->tree_lock);
+	}
+}
+
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+	spin_lock(&ctl->tree_lock);
+	__btrfs_remove_free_space_cache_locked(ctl);
+	spin_unlock(&ctl->tree_lock);
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_cluster *cluster;
+	struct list_head *head;
+
+	spin_lock(&ctl->tree_lock);
+	while ((head = block_group->cluster_list.next) !=
+	       &block_group->cluster_list) {
+		cluster = list_entry(head, struct btrfs_free_cluster,
+				     block_group_list);
+
+		WARN_ON(cluster->block_group != block_group);
+		__btrfs_return_cluster_to_free_space(block_group, cluster);
+
+		cond_resched_lock(&ctl->tree_lock);
+	}
+	__btrfs_remove_free_space_cache_locked(ctl);
+	spin_unlock(&ctl->tree_lock);
+
+}
+
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+			       u64 offset, u64 bytes, u64 empty_size,
+			       u64 *max_extent_size)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry = NULL;
+	u64 bytes_search = bytes + empty_size;
+	u64 ret = 0;
+	u64 align_gap = 0;
+	u64 align_gap_len = 0;
+
+	spin_lock(&ctl->tree_lock);
+	entry = find_free_space(ctl, &offset, &bytes_search,
+				block_group->full_stripe_len, max_extent_size);
+	if (!entry)
+		goto out;
+
+	ret = offset;
+	if (entry->bitmap) {
+		bitmap_clear_bits(ctl, entry, offset, bytes);
+		if (!entry->bytes)
+			free_bitmap(ctl, entry);
+	} else {
+		unlink_free_space(ctl, entry);
+		align_gap_len = offset - entry->offset;
+		align_gap = entry->offset;
+
+		entry->offset = offset + bytes;
+		WARN_ON(entry->bytes < bytes + align_gap_len);
+
+		entry->bytes -= bytes + align_gap_len;
+		if (!entry->bytes)
+			kmem_cache_free(btrfs_free_space_cachep, entry);
+		else
+			link_free_space(ctl, entry);
+	}
+out:
+	spin_unlock(&ctl->tree_lock);
+
+	if (align_gap_len)
+		__btrfs_add_free_space(ctl, align_gap, align_gap_len);
+	return ret;
+}
+
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache.  If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+int btrfs_return_cluster_to_free_space(
+			       struct btrfs_block_group_cache *block_group,
+			       struct btrfs_free_cluster *cluster)
+{
+	struct btrfs_free_space_ctl *ctl;
+	int ret;
+
+	/* first, get a safe pointer to the block group */
+	spin_lock(&cluster->lock);
+	if (!block_group) {
+		block_group = cluster->block_group;
+		if (!block_group) {
+			spin_unlock(&cluster->lock);
+			return 0;
+		}
+	} else if (cluster->block_group != block_group) {
+		/* someone else has already freed it don't redo their work */
+		spin_unlock(&cluster->lock);
+		return 0;
+	}
+	atomic_inc(&block_group->count);
+	spin_unlock(&cluster->lock);
+
+	ctl = block_group->free_space_ctl;
+
+	/* now return any extents the cluster had on it */
+	spin_lock(&ctl->tree_lock);
+	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&ctl->tree_lock);
+
+	/* finally drop our ref */
+	btrfs_put_block_group(block_group);
+	return ret;
+}
+
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+				   struct btrfs_free_cluster *cluster,
+				   struct btrfs_free_space *entry,
+				   u64 bytes, u64 min_start,
+				   u64 *max_extent_size)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	int err;
+	u64 search_start = cluster->window_start;
+	u64 search_bytes = bytes;
+	u64 ret = 0;
+
+	search_start = min_start;
+	search_bytes = bytes;
+
+	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+	if (err) {
+		if (search_bytes > *max_extent_size)
+			*max_extent_size = search_bytes;
+		return 0;
+	}
+
+	ret = search_start;
+	__bitmap_clear_bits(ctl, entry, ret, bytes);
+
+	return ret;
+}
+
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster, u64 bytes,
+			     u64 min_start, u64 *max_extent_size)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry = NULL;
+	struct rb_node *node;
+	u64 ret = 0;
+
+	spin_lock(&cluster->lock);
+	if (bytes > cluster->max_size)
+		goto out;
+
+	if (cluster->block_group != block_group)
+		goto out;
+
+	node = rb_first(&cluster->root);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_free_space, offset_index);
+	while (1) {
+		if (entry->bytes < bytes && entry->bytes > *max_extent_size)
+			*max_extent_size = entry->bytes;
+
+		if (entry->bytes < bytes ||
+		    (!entry->bitmap && entry->offset < min_start)) {
+			node = rb_next(&entry->offset_index);
+			if (!node)
+				break;
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
+			continue;
+		}
+
+		if (entry->bitmap) {
+			ret = btrfs_alloc_from_bitmap(block_group,
+						      cluster, entry, bytes,
+						      cluster->window_start,
+						      max_extent_size);
+			if (ret == 0) {
+				node = rb_next(&entry->offset_index);
+				if (!node)
+					break;
+				entry = rb_entry(node, struct btrfs_free_space,
+						 offset_index);
+				continue;
+			}
+			cluster->window_start += bytes;
+		} else {
+			ret = entry->offset;
+
+			entry->offset += bytes;
+			entry->bytes -= bytes;
+		}
+
+		if (entry->bytes == 0)
+			rb_erase(&entry->offset_index, &cluster->root);
+		break;
+	}
+out:
+	spin_unlock(&cluster->lock);
+
+	if (!ret)
+		return 0;
+
+	spin_lock(&ctl->tree_lock);
+
+	ctl->free_space -= bytes;
+	if (entry->bytes == 0) {
+		ctl->free_extents--;
+		if (entry->bitmap) {
+			kfree(entry->bitmap);
+			ctl->total_bitmaps--;
+			ctl->op->recalc_thresholds(ctl);
+		}
+		kmem_cache_free(btrfs_free_space_cachep, entry);
+	}
+
+	spin_unlock(&ctl->tree_lock);
+
+	return ret;
+}
+
+static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+				struct btrfs_free_space *entry,
+				struct btrfs_free_cluster *cluster,
+				u64 offset, u64 bytes,
+				u64 cont1_bytes, u64 min_bytes)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	unsigned long next_zero;
+	unsigned long i;
+	unsigned long want_bits;
+	unsigned long min_bits;
+	unsigned long found_bits;
+	unsigned long start = 0;
+	unsigned long total_found = 0;
+	int ret;
+
+	i = offset_to_bit(entry->offset, ctl->unit,
+			  max_t(u64, offset, entry->offset));
+	want_bits = bytes_to_bits(bytes, ctl->unit);
+	min_bits = bytes_to_bits(min_bytes, ctl->unit);
+
+again:
+	found_bits = 0;
+	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
+		next_zero = find_next_zero_bit(entry->bitmap,
+					       BITS_PER_BITMAP, i);
+		if (next_zero - i >= min_bits) {
+			found_bits = next_zero - i;
+			break;
+		}
+		i = next_zero;
+	}
+
+	if (!found_bits)
+		return -ENOSPC;
+
+	if (!total_found) {
+		start = i;
+		cluster->max_size = 0;
+	}
+
+	total_found += found_bits;
+
+	if (cluster->max_size < found_bits * ctl->unit)
+		cluster->max_size = found_bits * ctl->unit;
+
+	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+		i = next_zero + 1;
+		goto again;
+	}
+
+	cluster->window_start = start * ctl->unit + entry->offset;
+	rb_erase(&entry->offset_index, &ctl->free_space_offset);
+	ret = tree_insert_offset(&cluster->root, entry->offset,
+				 &entry->offset_index, 1);
+	ASSERT(!ret); /* -EEXIST; Logic error */
+
+	trace_btrfs_setup_cluster(block_group, cluster,
+				  total_found * ctl->unit, 1);
+	return 0;
+}
+
+/*
+ * This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
+ */
+static noinline int
+setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
+			struct btrfs_free_cluster *cluster,
+			struct list_head *bitmaps, u64 offset, u64 bytes,
+			u64 cont1_bytes, u64 min_bytes)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *first = NULL;
+	struct btrfs_free_space *entry = NULL;
+	struct btrfs_free_space *last;
+	struct rb_node *node;
+	u64 window_free;
+	u64 max_extent;
+	u64 total_size = 0;
+
+	entry = tree_search_offset(ctl, offset, 0, 1);
+	if (!entry)
+		return -ENOSPC;
+
+	/*
+	 * We don't want bitmaps, so just move along until we find a normal
+	 * extent entry.
+	 */
+	while (entry->bitmap || entry->bytes < min_bytes) {
+		if (entry->bitmap && list_empty(&entry->list))
+			list_add_tail(&entry->list, bitmaps);
+		node = rb_next(&entry->offset_index);
+		if (!node)
+			return -ENOSPC;
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+	}
+
+	window_free = entry->bytes;
+	max_extent = entry->bytes;
+	first = entry;
+	last = entry;
+
+	for (node = rb_next(&entry->offset_index); node;
+	     node = rb_next(&entry->offset_index)) {
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+		if (entry->bitmap) {
+			if (list_empty(&entry->list))
+				list_add_tail(&entry->list, bitmaps);
+			continue;
+		}
+
+		if (entry->bytes < min_bytes)
+			continue;
+
+		last = entry;
+		window_free += entry->bytes;
+		if (entry->bytes > max_extent)
+			max_extent = entry->bytes;
+	}
+
+	if (window_free < bytes || max_extent < cont1_bytes)
+		return -ENOSPC;
+
+	cluster->window_start = first->offset;
+
+	node = &first->offset_index;
+
+	/*
+	 * now we've found our entries, pull them out of the free space
+	 * cache and put them into the cluster rbtree
+	 */
+	do {
+		int ret;
+
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		node = rb_next(&entry->offset_index);
+		if (entry->bitmap || entry->bytes < min_bytes)
+			continue;
+
+		rb_erase(&entry->offset_index, &ctl->free_space_offset);
+		ret = tree_insert_offset(&cluster->root, entry->offset,
+					 &entry->offset_index, 0);
+		total_size += entry->bytes;
+		ASSERT(!ret); /* -EEXIST; Logic error */
+	} while (node && entry != last);
+
+	cluster->max_size = max_extent;
+	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
+	return 0;
+}
+
+/*
+ * This specifically looks for bitmaps that may work in the cluster, we assume
+ * that we have already failed to find extents that will work.
+ */
+static noinline int
+setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
+		     struct btrfs_free_cluster *cluster,
+		     struct list_head *bitmaps, u64 offset, u64 bytes,
+		     u64 cont1_bytes, u64 min_bytes)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	int ret = -ENOSPC;
+	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
+
+	if (ctl->total_bitmaps == 0)
+		return -ENOSPC;
+
+	/*
+	 * The bitmap that covers offset won't be in the list unless offset
+	 * is just its start offset.
+	 */
+	entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+	if (entry->offset != bitmap_offset) {
+		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
+		if (entry && list_empty(&entry->list))
+			list_add(&entry->list, bitmaps);
+	}
+
+	list_for_each_entry(entry, bitmaps, list) {
+		if (entry->bytes < bytes)
+			continue;
+		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
+					   bytes, cont1_bytes, min_bytes);
+		if (!ret)
+			return 0;
+	}
+
+	/*
+	 * The bitmaps list has all the bitmaps that record free space
+	 * starting after offset, so no more search is required.
+	 */
+	return -ENOSPC;
+}
+
+/*
+ * here we try to find a cluster of blocks in a block group.  The goal
+ * is to find at least bytes+empty_size.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster,
+			     u64 offset, u64 bytes, u64 empty_size)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry, *tmp;
+	LIST_HEAD(bitmaps);
+	u64 min_bytes;
+	u64 cont1_bytes;
+	int ret;
+
+	/*
+	 * Choose the minimum extent size we'll require for this
+	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
+	 * For metadata, allow allocates with smaller extents.  For
+	 * data, keep it dense.
+	 */
+	if (btrfs_test_opt(root, SSD_SPREAD)) {
+		cont1_bytes = min_bytes = bytes + empty_size;
+	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+		cont1_bytes = bytes;
+		min_bytes = block_group->sectorsize;
+	} else {
+		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+		min_bytes = block_group->sectorsize;
+	}
+
+	spin_lock(&ctl->tree_lock);
+
+	/*
+	 * If we know we don't have enough space to make a cluster don't even
+	 * bother doing all the work to try and find one.
+	 */
+	if (ctl->free_space < bytes) {
+		spin_unlock(&ctl->tree_lock);
+		return -ENOSPC;
+	}
+
+	spin_lock(&cluster->lock);
+
+	/* someone already found a cluster, hooray */
+	if (cluster->block_group) {
+		ret = 0;
+		goto out;
+	}
+
+	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+				 min_bytes);
+
+	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
+				      bytes + empty_size,
+				      cont1_bytes, min_bytes);
+	if (ret)
+		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
+					   offset, bytes + empty_size,
+					   cont1_bytes, min_bytes);
+
+	/* Clear our temporary list */
+	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
+		list_del_init(&entry->list);
+
+	if (!ret) {
+		atomic_inc(&block_group->count);
+		list_add_tail(&cluster->block_group_list,
+			      &block_group->cluster_list);
+		cluster->block_group = block_group;
+	} else {
+		trace_btrfs_failed_cluster_setup(block_group);
+	}
+out:
+	spin_unlock(&cluster->lock);
+	spin_unlock(&ctl->tree_lock);
+
+	return ret;
+}
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+	spin_lock_init(&cluster->lock);
+	spin_lock_init(&cluster->refill_lock);
+	cluster->root = RB_ROOT;
+	cluster->max_size = 0;
+	INIT_LIST_HEAD(&cluster->block_group_list);
+	cluster->block_group = NULL;
+}
+
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+		       u64 *total_trimmed, u64 start, u64 bytes,
+		       u64 reserved_start, u64 reserved_bytes,
+		       struct btrfs_trim_range *trim_entry)
+{
+	struct btrfs_space_info *space_info = block_group->space_info;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	int ret;
+	int update = 0;
+	u64 trimmed = 0;
+
+	spin_lock(&space_info->lock);
+	spin_lock(&block_group->lock);
+	if (!block_group->ro) {
+		block_group->reserved += reserved_bytes;
+		space_info->bytes_reserved += reserved_bytes;
+		update = 1;
+	}
+	spin_unlock(&block_group->lock);
+	spin_unlock(&space_info->lock);
+
+	ret = btrfs_discard_extent(fs_info->extent_root,
+				   start, bytes, &trimmed);
+	if (!ret)
+		*total_trimmed += trimmed;
+
+	mutex_lock(&ctl->cache_writeout_mutex);
+	btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+	list_del(&trim_entry->list);
+	mutex_unlock(&ctl->cache_writeout_mutex);
+
+	if (update) {
+		spin_lock(&space_info->lock);
+		spin_lock(&block_group->lock);
+		if (block_group->ro)
+			space_info->bytes_readonly += reserved_bytes;
+		block_group->reserved -= reserved_bytes;
+		space_info->bytes_reserved -= reserved_bytes;
+		spin_unlock(&space_info->lock);
+		spin_unlock(&block_group->lock);
+	}
+
+	return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+	int ret = 0;
+	u64 extent_start;
+	u64 extent_bytes;
+	u64 bytes;
+
+	while (start < end) {
+		struct btrfs_trim_range trim_entry;
+
+		mutex_lock(&ctl->cache_writeout_mutex);
+		spin_lock(&ctl->tree_lock);
+
+		if (ctl->free_space < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			mutex_unlock(&ctl->cache_writeout_mutex);
+			break;
+		}
+
+		entry = tree_search_offset(ctl, start, 0, 1);
+		if (!entry) {
+			spin_unlock(&ctl->tree_lock);
+			mutex_unlock(&ctl->cache_writeout_mutex);
+			break;
+		}
+
+		/* skip bitmaps */
+		while (entry->bitmap) {
+			node = rb_next(&entry->offset_index);
+			if (!node) {
+				spin_unlock(&ctl->tree_lock);
+				mutex_unlock(&ctl->cache_writeout_mutex);
+				goto out;
+			}
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
+		}
+
+		if (entry->offset >= end) {
+			spin_unlock(&ctl->tree_lock);
+			mutex_unlock(&ctl->cache_writeout_mutex);
+			break;
+		}
+
+		extent_start = entry->offset;
+		extent_bytes = entry->bytes;
+		start = max(start, extent_start);
+		bytes = min(extent_start + extent_bytes, end) - start;
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			mutex_unlock(&ctl->cache_writeout_mutex);
+			goto next;
+		}
+
+		unlink_free_space(ctl, entry);
+		kmem_cache_free(btrfs_free_space_cachep, entry);
+
+		spin_unlock(&ctl->tree_lock);
+		trim_entry.start = extent_start;
+		trim_entry.bytes = extent_bytes;
+		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+		mutex_unlock(&ctl->cache_writeout_mutex);
+
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  extent_start, extent_bytes, &trim_entry);
+		if (ret)
+			break;
+next:
+		start += bytes;
+
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+	}
+out:
+	return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	int ret = 0;
+	int ret2;
+	u64 bytes;
+	u64 offset = offset_to_bitmap(ctl, start);
+
+	while (offset < end) {
+		bool next_bitmap = false;
+		struct btrfs_trim_range trim_entry;
+
+		mutex_lock(&ctl->cache_writeout_mutex);
+		spin_lock(&ctl->tree_lock);
+
+		if (ctl->free_space < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			mutex_unlock(&ctl->cache_writeout_mutex);
+			break;
+		}
+
+		entry = tree_search_offset(ctl, offset, 1, 0);
+		if (!entry) {
+			spin_unlock(&ctl->tree_lock);
+			mutex_unlock(&ctl->cache_writeout_mutex);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = minlen;
+		ret2 = search_bitmap(ctl, entry, &start, &bytes);
+		if (ret2 || start >= end) {
+			spin_unlock(&ctl->tree_lock);
+			mutex_unlock(&ctl->cache_writeout_mutex);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = min(bytes, end - start);
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			mutex_unlock(&ctl->cache_writeout_mutex);
+			goto next;
+		}
+
+		bitmap_clear_bits(ctl, entry, start, bytes);
+		if (entry->bytes == 0)
+			free_bitmap(ctl, entry);
+
+		spin_unlock(&ctl->tree_lock);
+		trim_entry.start = start;
+		trim_entry.bytes = bytes;
+		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+		mutex_unlock(&ctl->cache_writeout_mutex);
+
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  start, bytes, &trim_entry);
+		if (ret)
+			break;
+next:
+		if (next_bitmap) {
+			offset += BITS_PER_BITMAP * ctl->unit;
+		} else {
+			start += bytes;
+			if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+				offset += BITS_PER_BITMAP * ctl->unit;
+		}
+
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+	int ret;
+
+	*trimmed = 0;
+
+	spin_lock(&block_group->lock);
+	if (block_group->removed) {
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+	atomic_inc(&block_group->trimming);
+	spin_unlock(&block_group->lock);
+
+	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+	if (ret)
+		goto out;
+
+	ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+	spin_lock(&block_group->lock);
+	if (atomic_dec_and_test(&block_group->trimming) &&
+	    block_group->removed) {
+		struct extent_map_tree *em_tree;
+		struct extent_map *em;
+
+		spin_unlock(&block_group->lock);
+
+		lock_chunks(block_group->fs_info->chunk_root);
+		em_tree = &block_group->fs_info->mapping_tree.map_tree;
+		write_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, block_group->key.objectid,
+					   1);
+		BUG_ON(!em); /* logic error, can't happen */
+		/*
+		 * remove_extent_mapping() will delete us from the pinned_chunks
+		 * list, which is protected by the chunk mutex.
+		 */
+		remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		unlock_chunks(block_group->fs_info->chunk_root);
+
+		/* once for us and once for the tree */
+		free_extent_map(em);
+		free_extent_map(em);
+
+		/*
+		 * We've left one free space entry and other tasks trimming
+		 * this block group have left 1 entry each one. Free them.
+		 */
+		__btrfs_remove_free_space_cache(block_group->free_space_ctl);
+	} else {
+		spin_unlock(&block_group->lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Find the left-most item in the cache tree, and then return the
+ * smallest inode number in the item.
+ *
+ * Note: the returned inode number may not be the smallest one in
+ * the tree, if the left-most item is a bitmap.
+ */
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
+{
+	struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
+	struct btrfs_free_space *entry = NULL;
+	u64 ino = 0;
+
+	spin_lock(&ctl->tree_lock);
+
+	if (RB_EMPTY_ROOT(&ctl->free_space_offset))
+		goto out;
+
+	entry = rb_entry(rb_first(&ctl->free_space_offset),
+			 struct btrfs_free_space, offset_index);
+
+	if (!entry->bitmap) {
+		ino = entry->offset;
+
+		unlink_free_space(ctl, entry);
+		entry->offset++;
+		entry->bytes--;
+		if (!entry->bytes)
+			kmem_cache_free(btrfs_free_space_cachep, entry);
+		else
+			link_free_space(ctl, entry);
+	} else {
+		u64 offset = 0;
+		u64 count = 1;
+		int ret;
+
+		ret = search_bitmap(ctl, entry, &offset, &count);
+		/* Logic error; Should be empty if it can't find anything */
+		ASSERT(!ret);
+
+		ino = offset;
+		bitmap_clear_bits(ctl, entry, offset, 1);
+		if (entry->bytes == 0)
+			free_bitmap(ctl, entry);
+	}
+out:
+	spin_unlock(&ctl->tree_lock);
+
+	return ino;
+}
+
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+				    struct btrfs_path *path)
+{
+	struct inode *inode = NULL;
+
+	spin_lock(&root->ino_cache_lock);
+	if (root->ino_cache_inode)
+		inode = igrab(root->ino_cache_inode);
+	spin_unlock(&root->ino_cache_lock);
+	if (inode)
+		return inode;
+
+	inode = __lookup_free_space_inode(root, path, 0);
+	if (IS_ERR(inode))
+		return inode;
+
+	spin_lock(&root->ino_cache_lock);
+	if (!btrfs_fs_closing(root->fs_info))
+		root->ino_cache_inode = igrab(inode);
+	spin_unlock(&root->ino_cache_lock);
+
+	return inode;
+}
+
+int create_free_ino_inode(struct btrfs_root *root,
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_path *path)
+{
+	return __create_free_space_inode(root, trans, path,
+					 BTRFS_FREE_INO_OBJECTID, 0);
+}
+
+int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct btrfs_path *path;
+	struct inode *inode;
+	int ret = 0;
+	u64 root_gen = btrfs_root_generation(&root->root_item);
+
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return 0;
+
+	/*
+	 * If we're unmounting then just return, since this does a search on the
+	 * normal root and not the commit root and we could deadlock.
+	 */
+	if (btrfs_fs_closing(fs_info))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return 0;
+
+	inode = lookup_free_ino_inode(root, path);
+	if (IS_ERR(inode))
+		goto out;
+
+	if (root_gen != BTRFS_I(inode)->generation)
+		goto out_put;
+
+	ret = __load_free_space_cache(root, inode, ctl, path, 0);
+
+	if (ret < 0)
+		btrfs_err(fs_info,
+			"failed to load free ino cache for root %llu",
+			root->root_key.objectid);
+out_put:
+	iput(inode);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+			      struct btrfs_trans_handle *trans,
+			      struct btrfs_path *path,
+			      struct inode *inode)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	int ret;
+	struct btrfs_io_ctl io_ctl;
+	bool release_metadata = true;
+
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return 0;
+
+	memset(&io_ctl, 0, sizeof(io_ctl));
+	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
+				      trans, path, 0);
+	if (!ret) {
+		/*
+		 * At this point writepages() didn't error out, so our metadata
+		 * reservation is released when the writeback finishes, at
+		 * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
+		 * with or without an error.
+		 */
+		release_metadata = false;
+		ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+	}
+
+	if (ret) {
+		if (release_metadata)
+			btrfs_delalloc_release_metadata(inode, inode->i_size);
+#ifdef DEBUG
+		btrfs_err(root->fs_info,
+			"failed to write free ino cache for root %llu",
+			root->root_key.objectid);
+#endif
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/*
+ * Use this if you need to make a bitmap or extent entry specifically, it
+ * doesn't do any of the merging that add_free_space does, this acts a lot like
+ * how the free space cache loading stuff works, so you can get really weird
+ * configurations.
+ */
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+			      u64 offset, u64 bytes, bool bitmap)
+{
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info = NULL, *bitmap_info;
+	void *map = NULL;
+	u64 bytes_added;
+	int ret;
+
+again:
+	if (!info) {
+		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+		if (!info)
+			return -ENOMEM;
+	}
+
+	if (!bitmap) {
+		spin_lock(&ctl->tree_lock);
+		info->offset = offset;
+		info->bytes = bytes;
+		ret = link_free_space(ctl, info);
+		spin_unlock(&ctl->tree_lock);
+		if (ret)
+			kmem_cache_free(btrfs_free_space_cachep, info);
+		return ret;
+	}
+
+	if (!map) {
+		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		if (!map) {
+			kmem_cache_free(btrfs_free_space_cachep, info);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock(&ctl->tree_lock);
+	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					 1, 0);
+	if (!bitmap_info) {
+		info->bitmap = map;
+		map = NULL;
+		add_new_bitmap(ctl, info, offset);
+		bitmap_info = info;
+		info = NULL;
+	}
+
+	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+	bytes -= bytes_added;
+	offset += bytes_added;
+	spin_unlock(&ctl->tree_lock);
+
+	if (bytes)
+		goto again;
+
+	if (info)
+		kmem_cache_free(btrfs_free_space_cachep, info);
+	if (map)
+		kfree(map);
+	return 0;
+}
+
+/*
+ * Checks to see if the given range is in the free space cache.  This is really
+ * just used to check the absence of space, so if there is free space in the
+ * range at all we will return 1.
+ */
+int test_check_exists(struct btrfs_block_group_cache *cache,
+		      u64 offset, u64 bytes)
+{
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info;
+	int ret = 0;
+
+	spin_lock(&ctl->tree_lock);
+	info = tree_search_offset(ctl, offset, 0, 0);
+	if (!info) {
+		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					  1, 0);
+		if (!info)
+			goto out;
+	}
+
+have_info:
+	if (info->bitmap) {
+		u64 bit_off, bit_bytes;
+		struct rb_node *n;
+		struct btrfs_free_space *tmp;
+
+		bit_off = offset;
+		bit_bytes = ctl->unit;
+		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+		if (!ret) {
+			if (bit_off == offset) {
+				ret = 1;
+				goto out;
+			} else if (bit_off > offset &&
+				   offset + bytes > bit_off) {
+				ret = 1;
+				goto out;
+			}
+		}
+
+		n = rb_prev(&info->offset_index);
+		while (n) {
+			tmp = rb_entry(n, struct btrfs_free_space,
+				       offset_index);
+			if (tmp->offset + tmp->bytes < offset)
+				break;
+			if (offset + bytes < tmp->offset) {
+				n = rb_prev(&info->offset_index);
+				continue;
+			}
+			info = tmp;
+			goto have_info;
+		}
+
+		n = rb_next(&info->offset_index);
+		while (n) {
+			tmp = rb_entry(n, struct btrfs_free_space,
+				       offset_index);
+			if (offset + bytes < tmp->offset)
+				break;
+			if (tmp->offset + tmp->bytes < offset) {
+				n = rb_next(&info->offset_index);
+				continue;
+			}
+			info = tmp;
+			goto have_info;
+		}
+
+		ret = 0;
+		goto out;
+	}
+
+	if (info->offset == offset) {
+		ret = 1;
+		goto out;
+	}
+
+	if (offset > info->offset && offset < info->offset + info->bytes)
+		ret = 1;
+out:
+	spin_unlock(&ctl->tree_lock);
+	return ret;
+}
+#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/kernel/fs/btrfs/free-space-cache.h b/kernel/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000..a16a029ad
--- /dev/null
+++ b/kernel/fs/btrfs/free-space-cache.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_CACHE
+#define __BTRFS_FREE_SPACE_CACHE
+
+struct btrfs_free_space {
+	struct rb_node offset_index;
+	u64 offset;
+	u64 bytes;
+	unsigned long *bitmap;
+	struct list_head list;
+};
+
+struct btrfs_free_space_ctl {
+	spinlock_t tree_lock;
+	struct rb_root free_space_offset;
+	u64 free_space;
+	int extents_thresh;
+	int free_extents;
+	int total_bitmaps;
+	int unit;
+	u64 start;
+	struct btrfs_free_space_op *op;
+	void *private;
+	struct mutex cache_writeout_mutex;
+	struct list_head trimming_ranges;
+};
+
+struct btrfs_free_space_op {
+	void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
+	bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
+			   struct btrfs_free_space *info);
+};
+
+struct btrfs_io_ctl;
+
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+				      struct btrfs_block_group_cache
+				      *block_group, struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_root *root,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_block_group_cache *block_group,
+			    struct btrfs_path *path);
+
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+				       struct btrfs_block_rsv *rsv);
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_block_group_cache *block_group,
+				    struct inode *inode);
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_group_cache *block_group);
+int btrfs_wait_cache_io(struct btrfs_root *root,
+			struct btrfs_trans_handle *trans,
+			struct btrfs_block_group_cache *block_group,
+			struct btrfs_io_ctl *io_ctl,
+			struct btrfs_path *path, u64 offset);
+int btrfs_write_out_cache(struct btrfs_root *root,
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_block_group_cache *block_group,
+			  struct btrfs_path *path);
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+				    struct btrfs_path *path);
+int create_free_ino_inode(struct btrfs_root *root,
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_path *path);
+int load_free_ino_cache(struct btrfs_fs_info *fs_info,
+			struct btrfs_root *root);
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+			      struct btrfs_trans_handle *trans,
+			      struct btrfs_path *path,
+			      struct inode *inode);
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+			   u64 bytenr, u64 size);
+static inline int
+btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+		     u64 bytenr, u64 size)
+{
+	return __btrfs_add_free_space(block_group->free_space_ctl,
+				      bytenr, size);
+}
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 bytenr, u64 size);
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+				     *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+			       u64 offset, u64 bytes, u64 empty_size,
+			       u64 *max_extent_size);
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+			   u64 bytes);
+int btrfs_find_space_cluster(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster,
+			     u64 offset, u64 bytes, u64 empty_size);
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster, u64 bytes,
+			     u64 min_start, u64 *max_extent_size);
+int btrfs_return_cluster_to_free_space(
+			       struct btrfs_block_group_cache *block_group,
+			       struct btrfs_free_cluster *cluster);
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+			   u64 *trimmed, u64 start, u64 end, u64 minlen);
+
+/* Support functions for runnint our sanity tests */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+			      u64 offset, u64 bytes, bool bitmap);
+int test_check_exists(struct btrfs_block_group_cache *cache,
+		      u64 offset, u64 bytes);
+#endif
+
+#endif
diff --git a/kernel/fs/btrfs/hash.c b/kernel/fs/btrfs/hash.c
new file mode 100644
index 000000000..aae520b2a
--- /dev/null
+++ b/kernel/fs/btrfs/hash.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include "hash.h"
+
+static struct crypto_shash *tfm;
+
+int __init btrfs_hash_init(void)
+{
+	tfm = crypto_alloc_shash("crc32c", 0, 0);
+
+	return PTR_ERR_OR_ZERO(tfm);
+}
+
+void btrfs_hash_exit(void)
+{
+	crypto_free_shash(tfm);
+}
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
+{
+	SHASH_DESC_ON_STACK(shash, tfm);
+	u32 *ctx = (u32 *)shash_desc_ctx(shash);
+	int err;
+
+	shash->tfm = tfm;
+	shash->flags = 0;
+	*ctx = crc;
+
+	err = crypto_shash_update(shash, address, length);
+	BUG_ON(err);
+
+	return *ctx;
+}
diff --git a/kernel/fs/btrfs/hash.h b/kernel/fs/btrfs/hash.h
new file mode 100644
index 000000000..118a2316e
--- /dev/null
+++ b/kernel/fs/btrfs/hash.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __HASH__
+#define __HASH__
+
+int __init btrfs_hash_init(void);
+
+void btrfs_hash_exit(void);
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
+
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+	return btrfs_crc32c((u32)~1, name, len);
+}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+				    int len)
+{
+	return (u64) btrfs_crc32c(parent_objectid, name, len);
+}
+
+#endif
diff --git a/kernel/fs/btrfs/inode-item.c b/kernel/fs/btrfs/inode-item.c
new file mode 100644
index 000000000..265e03c73
--- /dev/null
+++ b/kernel/fs/btrfs/inode-item.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+#include "print-tree.h"
+
+static int find_name_in_backref(struct btrfs_path *path, const char *name,
+			 int name_len, struct btrfs_inode_ref **ref_ret)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_inode_ref *ref;
+	unsigned long ptr;
+	unsigned long name_ptr;
+	u32 item_size;
+	u32 cur_offset = 0;
+	int len;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	while (cur_offset < item_size) {
+		ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+		len = btrfs_inode_ref_name_len(leaf, ref);
+		name_ptr = (unsigned long)(ref + 1);
+		cur_offset += len + sizeof(*ref);
+		if (len != name_len)
+			continue;
+		if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
+			*ref_ret = ref;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+				   const char *name, int name_len,
+				   struct btrfs_inode_extref **extref_ret)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_inode_extref *extref;
+	unsigned long ptr;
+	unsigned long name_ptr;
+	u32 item_size;
+	u32 cur_offset = 0;
+	int ref_name_len;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+	/*
+	 * Search all extended backrefs in this item. We're only
+	 * looking through any collisions so most of the time this is
+	 * just going to compare against one buffer. If all is well,
+	 * we'll return success and the inode ref object.
+	 */
+	while (cur_offset < item_size) {
+		extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+		name_ptr = (unsigned long)(&extref->name);
+		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+		if (ref_name_len == name_len &&
+		    btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+		    (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+			if (extref_ret)
+				*extref_ret = extref;
+			return 1;
+		}
+
+		cur_offset += ref_name_len + sizeof(*extref);
+	}
+	return 0;
+}
+
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_inode_extref *extref;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+	if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+		return NULL;
+	return extref;
+}
+
+static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  const char *name, int name_len,
+				  u64 inode_objectid, u64 ref_objectid,
+				  u64 *index)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	int ret;
+	int del_len = name_len + sizeof(*extref);
+	unsigned long ptr;
+	unsigned long item_start;
+	u32 item_size;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * Sanity check - did we find the right item for this name?
+	 * This should always succeed so error here will make the FS
+	 * readonly.
+	 */
+	if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+					    name, name_len, &extref)) {
+		btrfs_std_error(root->fs_info, -ENOENT);
+		ret = -EROFS;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (index)
+		*index = btrfs_inode_extref_index(leaf, extref);
+
+	if (del_len == item_size) {
+		/*
+		 * Common case only one ref in the item, remove the
+		 * whole item.
+		 */
+		ret = btrfs_del_item(trans, root, path);
+		goto out;
+	}
+
+	ptr = (unsigned long)extref;
+	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+	memmove_extent_buffer(leaf, ptr, ptr + del_len,
+			      item_size - (ptr + del_len - item_start));
+
+	btrfs_truncate_item(root, path, item_size - del_len, 1);
+
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			const char *name, int name_len,
+			u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_inode_ref *ref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+	unsigned long item_start;
+	u32 item_size;
+	u32 sub_item_len;
+	int ret;
+	int search_ext_refs = 0;
+	int del_len = name_len + sizeof(*ref);
+
+	key.objectid = inode_objectid;
+	key.offset = ref_objectid;
+	key.type = BTRFS_INODE_REF_KEY;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0) {
+		ret = -ENOENT;
+		search_ext_refs = 1;
+		goto out;
+	} else if (ret < 0) {
+		goto out;
+	}
+	if (!find_name_in_backref(path, name, name_len, &ref)) {
+		ret = -ENOENT;
+		search_ext_refs = 1;
+		goto out;
+	}
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+
+	if (index)
+		*index = btrfs_inode_ref_index(leaf, ref);
+
+	if (del_len == item_size) {
+		ret = btrfs_del_item(trans, root, path);
+		goto out;
+	}
+	ptr = (unsigned long)ref;
+	sub_item_len = name_len + sizeof(*ref);
+	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+			      item_size - (ptr + sub_item_len - item_start));
+	btrfs_truncate_item(root, path, item_size - sub_item_len, 1);
+out:
+	btrfs_free_path(path);
+
+	if (search_ext_refs) {
+		/*
+		 * No refs were found, or we could not find the
+		 * name in our ref array. Find and remove the extended
+		 * inode ref then.
+		 */
+		return btrfs_del_inode_extref(trans, root, name, name_len,
+					      inode_objectid, ref_objectid, index);
+	}
+
+	return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     const char *name, int name_len,
+				     u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+	struct btrfs_inode_extref *extref;
+	int ret;
+	int ins_len = name_len + sizeof(*extref);
+	unsigned long ptr;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      ins_len);
+	if (ret == -EEXIST) {
+		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+						   name, name_len, NULL))
+			goto out;
+
+		btrfs_extend_item(root, path, ins_len);
+		ret = 0;
+	}
+	if (ret < 0)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_nr(path->slots[0]);
+	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+	ptr += btrfs_item_size(leaf, item) - ins_len;
+	extref = (struct btrfs_inode_extref *)ptr;
+
+	btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+	btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+	btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+	ptr = (unsigned long)&extref->name;
+	write_extent_buffer(path->nodes[0], name, ptr, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_inode_ref *ref;
+	unsigned long ptr;
+	int ret;
+	int ins_len = name_len + sizeof(*ref);
+
+	key.objectid = inode_objectid;
+	key.offset = ref_objectid;
+	key.type = BTRFS_INODE_REF_KEY;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+	path->skip_release_on_error = 1;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      ins_len);
+	if (ret == -EEXIST) {
+		u32 old_size;
+
+		if (find_name_in_backref(path, name, name_len, &ref))
+			goto out;
+
+		old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+		btrfs_extend_item(root, path, ins_len);
+		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				     struct btrfs_inode_ref);
+		ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
+		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+		ptr = (unsigned long)(ref + 1);
+		ret = 0;
+	} else if (ret < 0) {
+		if (ret == -EOVERFLOW) {
+			if (find_name_in_backref(path, name, name_len, &ref))
+				ret = -EEXIST;
+			else
+				ret = -EMLINK;
+		}
+		goto out;
+	} else {
+		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				     struct btrfs_inode_ref);
+		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+		ptr = (unsigned long)(ref + 1);
+	}
+	write_extent_buffer(path->nodes[0], name, ptr, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+	btrfs_free_path(path);
+
+	if (ret == -EMLINK) {
+		struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+		/* We ran out of space in the ref array. Need to
+		 * add an extended ref. */
+		if (btrfs_super_incompat_flags(disk_super)
+		    & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+			ret = btrfs_insert_inode_extref(trans, root, name,
+							name_len,
+							inode_objectid,
+							ref_objectid, index);
+	}
+
+	return ret;
+}
+
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid)
+{
+	struct btrfs_key key;
+	int ret;
+	key.objectid = objectid;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(struct btrfs_inode_item));
+	return ret;
+}
+
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, struct btrfs_path *path,
+		       struct btrfs_key *location, int mod)
+{
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key found_key;
+
+	ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
+	if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
+	    location->offset == (u64)-1 && path->slots[0] != 0) {
+		slot = path->slots[0] - 1;
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (found_key.objectid == location->objectid &&
+		    found_key.type == location->type) {
+			path->slots[0]--;
+			return 0;
+		}
+	}
+	return ret;
+}
diff --git a/kernel/fs/btrfs/inode-map.c b/kernel/fs/btrfs/inode-map.c
new file mode 100644
index 000000000..f6a596d5a
--- /dev/null
+++ b/kernel/fs/btrfs/inode-map.c
@@ -0,0 +1,568 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+#include "transaction.h"
+
+static int caching_kthread(void *data)
+{
+	struct btrfs_root *root = data;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	u64 last = (u64)-1;
+	int slot;
+	int ret;
+
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* Since the commit root is read-only, we can safely skip locking. */
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+	path->reada = 2;
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.offset = 0;
+	key.type = BTRFS_INODE_ITEM_KEY;
+again:
+	/* need to make sure the commit_root doesn't disappear */
+	down_read(&fs_info->commit_root_sem);
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		if (btrfs_fs_closing(fs_info))
+			goto out;
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0)
+				break;
+
+			if (need_resched() ||
+			    btrfs_transaction_in_commit(fs_info)) {
+				leaf = path->nodes[0];
+
+				if (WARN_ON(btrfs_header_nritems(leaf) == 0))
+					break;
+
+				/*
+				 * Save the key so we can advances forward
+				 * in the next search.
+				 */
+				btrfs_item_key_to_cpu(leaf, &key, 0);
+				btrfs_release_path(path);
+				root->ino_cache_progress = last;
+				up_read(&fs_info->commit_root_sem);
+				schedule_timeout(1);
+				goto again;
+			} else
+				continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.type != BTRFS_INODE_ITEM_KEY)
+			goto next;
+
+		if (key.objectid >= root->highest_objectid)
+			break;
+
+		if (last != (u64)-1 && last + 1 != key.objectid) {
+			__btrfs_add_free_space(ctl, last + 1,
+					       key.objectid - last - 1);
+			wake_up(&root->ino_cache_wait);
+		}
+
+		last = key.objectid;
+next:
+		path->slots[0]++;
+	}
+
+	if (last < root->highest_objectid - 1) {
+		__btrfs_add_free_space(ctl, last + 1,
+				       root->highest_objectid - last - 1);
+	}
+
+	spin_lock(&root->ino_cache_lock);
+	root->ino_cache_state = BTRFS_CACHE_FINISHED;
+	spin_unlock(&root->ino_cache_lock);
+
+	root->ino_cache_progress = (u64)-1;
+	btrfs_unpin_free_ino(root);
+out:
+	wake_up(&root->ino_cache_wait);
+	up_read(&fs_info->commit_root_sem);
+
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+static void start_caching(struct btrfs_root *root)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct task_struct *tsk;
+	int ret;
+	u64 objectid;
+
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return;
+
+	spin_lock(&root->ino_cache_lock);
+	if (root->ino_cache_state != BTRFS_CACHE_NO) {
+		spin_unlock(&root->ino_cache_lock);
+		return;
+	}
+
+	root->ino_cache_state = BTRFS_CACHE_STARTED;
+	spin_unlock(&root->ino_cache_lock);
+
+	ret = load_free_ino_cache(root->fs_info, root);
+	if (ret == 1) {
+		spin_lock(&root->ino_cache_lock);
+		root->ino_cache_state = BTRFS_CACHE_FINISHED;
+		spin_unlock(&root->ino_cache_lock);
+		return;
+	}
+
+	/*
+	 * It can be quite time-consuming to fill the cache by searching
+	 * through the extent tree, and this can keep ino allocation path
+	 * waiting. Therefore at start we quickly find out the highest
+	 * inode number and we know we can use inode numbers which fall in
+	 * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
+	 */
+	ret = btrfs_find_free_objectid(root, &objectid);
+	if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
+		__btrfs_add_free_space(ctl, objectid,
+				       BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+	}
+
+	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
+			  root->root_key.objectid);
+	if (IS_ERR(tsk)) {
+		btrfs_warn(root->fs_info, "failed to start inode caching task");
+		btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
+				"disabling inode map caching");
+	}
+}
+
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
+{
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return btrfs_find_free_objectid(root, objectid);
+
+again:
+	*objectid = btrfs_find_ino_for_alloc(root);
+
+	if (*objectid != 0)
+		return 0;
+
+	start_caching(root);
+
+	wait_event(root->ino_cache_wait,
+		   root->ino_cache_state == BTRFS_CACHE_FINISHED ||
+		   root->free_ino_ctl->free_space > 0);
+
+	if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
+	    root->free_ino_ctl->free_space == 0)
+		return -ENOSPC;
+	else
+		goto again;
+}
+
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
+{
+	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return;
+again:
+	if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
+		__btrfs_add_free_space(pinned, objectid, 1);
+	} else {
+		down_write(&root->fs_info->commit_root_sem);
+		spin_lock(&root->ino_cache_lock);
+		if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
+			spin_unlock(&root->ino_cache_lock);
+			up_write(&root->fs_info->commit_root_sem);
+			goto again;
+		}
+		spin_unlock(&root->ino_cache_lock);
+
+		start_caching(root);
+
+		__btrfs_add_free_space(pinned, objectid, 1);
+
+		up_write(&root->fs_info->commit_root_sem);
+	}
+}
+
+/*
+ * When a transaction is committed, we'll move those inode numbers which are
+ * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
+ * others will just be dropped, because the commit root we were searching has
+ * changed.
+ *
+ * Must be called with root->fs_info->commit_root_sem held
+ */
+void btrfs_unpin_free_ino(struct btrfs_root *root)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+	struct btrfs_free_space *info;
+	struct rb_node *n;
+	u64 count;
+
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return;
+
+	while (1) {
+		n = rb_first(rbroot);
+		if (!n)
+			break;
+
+		info = rb_entry(n, struct btrfs_free_space, offset_index);
+		BUG_ON(info->bitmap); /* Logic error */
+
+		if (info->offset > root->ino_cache_progress)
+			goto free;
+		else if (info->offset + info->bytes > root->ino_cache_progress)
+			count = root->ino_cache_progress - info->offset + 1;
+		else
+			count = info->bytes;
+
+		__btrfs_add_free_space(ctl, info->offset, count);
+free:
+		rb_erase(&info->offset_index, rbroot);
+		kfree(info);
+	}
+}
+
+#define INIT_THRESHOLD	(((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+
+/*
+ * The goal is to keep the memory used by the free_ino tree won't
+ * exceed the memory if we use bitmaps only.
+ */
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+	struct btrfs_free_space *info;
+	struct rb_node *n;
+	int max_ino;
+	int max_bitmaps;
+
+	n = rb_last(&ctl->free_space_offset);
+	if (!n) {
+		ctl->extents_thresh = INIT_THRESHOLD;
+		return;
+	}
+	info = rb_entry(n, struct btrfs_free_space, offset_index);
+
+	/*
+	 * Find the maximum inode number in the filesystem. Note we
+	 * ignore the fact that this can be a bitmap, because we are
+	 * not doing precise calculation.
+	 */
+	max_ino = info->bytes - 1;
+
+	max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
+	if (max_bitmaps <= ctl->total_bitmaps) {
+		ctl->extents_thresh = 0;
+		return;
+	}
+
+	ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
+				PAGE_CACHE_SIZE / sizeof(*info);
+}
+
+/*
+ * We don't fall back to bitmap, if we are below the extents threshold
+ * or this chunk of inode numbers is a big one.
+ */
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+		       struct btrfs_free_space *info)
+{
+	if (ctl->free_extents < ctl->extents_thresh ||
+	    info->bytes > INODES_PER_BITMAP / 10)
+		return false;
+
+	return true;
+}
+
+static struct btrfs_free_space_op free_ino_op = {
+	.recalc_thresholds	= recalculate_thresholds,
+	.use_bitmap		= use_bitmap,
+};
+
+static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+}
+
+static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info)
+{
+	/*
+	 * We always use extents for two reasons:
+	 *
+	 * - The pinned tree is only used during the process of caching
+	 *   work.
+	 * - Make code simpler. See btrfs_unpin_free_ino().
+	 */
+	return false;
+}
+
+static struct btrfs_free_space_op pinned_free_ino_op = {
+	.recalc_thresholds	= pinned_recalc_thresholds,
+	.use_bitmap		= pinned_use_bitmap,
+};
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+	spin_lock_init(&ctl->tree_lock);
+	ctl->unit = 1;
+	ctl->start = 0;
+	ctl->private = NULL;
+	ctl->op = &free_ino_op;
+	INIT_LIST_HEAD(&ctl->trimming_ranges);
+	mutex_init(&ctl->cache_writeout_mutex);
+
+	/*
+	 * Initially we allow to use 16K of ram to cache chunks of
+	 * inode numbers before we resort to bitmaps. This is somewhat
+	 * arbitrary, but it will be adjusted in runtime.
+	 */
+	ctl->extents_thresh = INIT_THRESHOLD;
+
+	spin_lock_init(&pinned->tree_lock);
+	pinned->unit = 1;
+	pinned->start = 0;
+	pinned->private = NULL;
+	pinned->extents_thresh = 0;
+	pinned->op = &pinned_free_ino_op;
+}
+
+int btrfs_save_ino_cache(struct btrfs_root *root,
+			 struct btrfs_trans_handle *trans)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct btrfs_path *path;
+	struct inode *inode;
+	struct btrfs_block_rsv *rsv;
+	u64 num_bytes;
+	u64 alloc_hint = 0;
+	int ret;
+	int prealloc;
+	bool retry = false;
+
+	/* only fs tree and subvol/snap needs ino cache */
+	if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
+	    (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
+	     root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
+		return 0;
+
+	/* Don't save inode cache if we are deleting this root */
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return 0;
+
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	rsv = trans->block_rsv;
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+	num_bytes = trans->bytes_reserved;
+	/*
+	 * 1 item for inode item insertion if need
+	 * 4 items for inode item update (in the worst case)
+	 * 1 items for slack space if we need do truncation
+	 * 1 item for free space object
+	 * 3 items for pre-allocation
+	 */
+	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
+	ret = btrfs_block_rsv_add(root, trans->block_rsv,
+				  trans->bytes_reserved,
+				  BTRFS_RESERVE_NO_FLUSH);
+	if (ret)
+		goto out;
+	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+				      trans->transid, trans->bytes_reserved, 1);
+again:
+	inode = lookup_free_ino_inode(root, path);
+	if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
+		ret = PTR_ERR(inode);
+		goto out_release;
+	}
+
+	if (IS_ERR(inode)) {
+		BUG_ON(retry); /* Logic error */
+		retry = true;
+
+		ret = create_free_ino_inode(root, trans, path);
+		if (ret)
+			goto out_release;
+		goto again;
+	}
+
+	BTRFS_I(inode)->generation = 0;
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_put;
+	}
+
+	if (i_size_read(inode) > 0) {
+		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
+		if (ret) {
+			if (ret != -ENOSPC)
+				btrfs_abort_transaction(trans, root, ret);
+			goto out_put;
+		}
+	}
+
+	spin_lock(&root->ino_cache_lock);
+	if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
+		ret = -1;
+		spin_unlock(&root->ino_cache_lock);
+		goto out_put;
+	}
+	spin_unlock(&root->ino_cache_lock);
+
+	spin_lock(&ctl->tree_lock);
+	prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
+	prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
+	prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
+	spin_unlock(&ctl->tree_lock);
+
+	/* Just to make sure we have enough space */
+	prealloc += 8 * PAGE_CACHE_SIZE;
+
+	ret = btrfs_delalloc_reserve_space(inode, prealloc);
+	if (ret)
+		goto out_put;
+
+	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
+					      prealloc, prealloc, &alloc_hint);
+	if (ret) {
+		btrfs_delalloc_release_space(inode, prealloc);
+		goto out_put;
+	}
+	btrfs_free_reserved_data_space(inode, prealloc);
+
+	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
+out_put:
+	iput(inode);
+out_release:
+	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+				      trans->transid, trans->bytes_reserved, 0);
+	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
+out:
+	trans->block_rsv = rsv;
+	trans->bytes_reserved = num_bytes;
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct extent_buffer *l;
+	struct btrfs_key search_key;
+	struct btrfs_key found_key;
+	int slot;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+	search_key.type = -1;
+	search_key.offset = (u64)-1;
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	BUG_ON(ret == 0); /* Corruption */
+	if (path->slots[0] > 0) {
+		slot = path->slots[0] - 1;
+		l = path->nodes[0];
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+		*objectid = max_t(u64, found_key.objectid,
+				  BTRFS_FIRST_FREE_OBJECTID - 1);
+	} else {
+		*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
+{
+	int ret;
+	mutex_lock(&root->objectid_mutex);
+
+	if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
+		ret = btrfs_find_highest_objectid(root,
+						  &root->highest_objectid);
+		if (ret)
+			goto out;
+	}
+
+	if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	*objectid = ++root->highest_objectid;
+	ret = 0;
+out:
+	mutex_unlock(&root->objectid_mutex);
+	return ret;
+}
diff --git a/kernel/fs/btrfs/inode-map.h b/kernel/fs/btrfs/inode-map.h
new file mode 100644
index 000000000..ddb347bfe
--- /dev/null
+++ b/kernel/fs/btrfs/inode-map.h
@@ -0,0 +1,13 @@
+#ifndef __BTRFS_INODE_MAP
+#define __BTRFS_INODE_MAP
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root);
+void btrfs_unpin_free_ino(struct btrfs_root *root);
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
+int btrfs_save_ino_cache(struct btrfs_root *root,
+			 struct btrfs_trans_handle *trans);
+
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+
+#endif
diff --git a/kernel/fs/btrfs/inode.c b/kernel/fs/btrfs/inode.c
new file mode 100644
index 000000000..8bb013672
--- /dev/null
+++ b/kernel/fs/btrfs/inode.c
@@ -0,0 +1,9907 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/mount.h>
+#include <linux/btrfs.h>
+#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "ordered-data.h"
+#include "xattr.h"
+#include "tree-log.h"
+#include "volumes.h"
+#include "compression.h"
+#include "locking.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+#include "backref.h"
+#include "hash.h"
+#include "props.h"
+#include "qgroup.h"
+
+struct btrfs_iget_args {
+	struct btrfs_key *location;
+	struct btrfs_root *root;
+};
+
+static const struct inode_operations btrfs_dir_inode_operations;
+static const struct inode_operations btrfs_symlink_inode_operations;
+static const struct inode_operations btrfs_dir_ro_inode_operations;
+static const struct inode_operations btrfs_special_inode_operations;
+static const struct inode_operations btrfs_file_inode_operations;
+static const struct address_space_operations btrfs_aops;
+static const struct address_space_operations btrfs_symlink_aops;
+static const struct file_operations btrfs_dir_file_operations;
+static struct extent_io_ops btrfs_extent_io_ops;
+
+static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
+struct kmem_cache *btrfs_trans_handle_cachep;
+struct kmem_cache *btrfs_transaction_cachep;
+struct kmem_cache *btrfs_path_cachep;
+struct kmem_cache *btrfs_free_space_cachep;
+
+#define S_SHIFT 12
+static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
+};
+
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
+static int btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written, int unlock);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   u64 orig_block_len, u64 ram_bytes,
+					   int type);
+
+static int btrfs_dirty_inode(struct inode *inode);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode)
+{
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+}
+#endif
+
+static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
+				     struct inode *inode,  struct inode *dir,
+				     const struct qstr *qstr)
+{
+	int err;
+
+	err = btrfs_init_acl(trans, inode, dir);
+	if (!err)
+		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
+	return err;
+}
+
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree.  The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ */
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_path *path, int extent_inserted,
+				struct btrfs_root *root, struct inode *inode,
+				u64 start, size_t size, size_t compressed_size,
+				int compress_type,
+				struct page **compressed_pages)
+{
+	struct extent_buffer *leaf;
+	struct page *page = NULL;
+	char *kaddr;
+	unsigned long ptr;
+	struct btrfs_file_extent_item *ei;
+	int err = 0;
+	int ret;
+	size_t cur_size = size;
+	unsigned long offset;
+
+	if (compressed_size && compressed_pages)
+		cur_size = compressed_size;
+
+	inode_add_bytes(inode, size);
+
+	if (!extent_inserted) {
+		struct btrfs_key key;
+		size_t datasize;
+
+		key.objectid = btrfs_ino(inode);
+		key.offset = start;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+
+		datasize = btrfs_file_extent_calc_inline_size(cur_size);
+		path->leave_spinning = 1;
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      datasize);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
+	}
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+	ptr = btrfs_file_extent_inline_start(ei);
+
+	if (compress_type != BTRFS_COMPRESS_NONE) {
+		struct page *cpage;
+		int i = 0;
+		while (compressed_size > 0) {
+			cpage = compressed_pages[i];
+			cur_size = min_t(unsigned long, compressed_size,
+				       PAGE_CACHE_SIZE);
+
+			kaddr = kmap_atomic(cpage);
+			write_extent_buffer(leaf, kaddr, ptr, cur_size);
+			kunmap_atomic(kaddr);
+
+			i++;
+			ptr += cur_size;
+			compressed_size -= cur_size;
+		}
+		btrfs_set_file_extent_compression(leaf, ei,
+						  compress_type);
+	} else {
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+		btrfs_set_file_extent_compression(leaf, ei, 0);
+		kaddr = kmap_atomic(page);
+		offset = start & (PAGE_CACHE_SIZE - 1);
+		write_extent_buffer(leaf, kaddr + offset, ptr, size);
+		kunmap_atomic(kaddr);
+		page_cache_release(page);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	/*
+	 * we're an inline extent, so nobody can
+	 * extend the file past i_size without locking
+	 * a page we already have locked.
+	 *
+	 * We must do any isize and inode updates
+	 * before we unlock the pages.  Otherwise we
+	 * could end up racing with unlink.
+	 */
+	BTRFS_I(inode)->disk_i_size = inode->i_size;
+	ret = btrfs_update_inode(trans, root, inode);
+
+	return ret;
+fail:
+	return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file.  This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ */
+static noinline int cow_file_range_inline(struct btrfs_root *root,
+					  struct inode *inode, u64 start,
+					  u64 end, size_t compressed_size,
+					  int compress_type,
+					  struct page **compressed_pages)
+{
+	struct btrfs_trans_handle *trans;
+	u64 isize = i_size_read(inode);
+	u64 actual_end = min(end + 1, isize);
+	u64 inline_len = actual_end - start;
+	u64 aligned_end = ALIGN(end, root->sectorsize);
+	u64 data_len = inline_len;
+	int ret;
+	struct btrfs_path *path;
+	int extent_inserted = 0;
+	u32 extent_item_size;
+
+	if (compressed_size)
+		data_len = compressed_size;
+
+	if (start > 0 ||
+	    actual_end > PAGE_CACHE_SIZE ||
+	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+	    (!compressed_size &&
+	    (actual_end & (root->sectorsize - 1)) == 0) ||
+	    end + 1 < isize ||
+	    data_len > root->fs_info->max_inline) {
+		return 1;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	if (compressed_size && compressed_pages)
+		extent_item_size = btrfs_file_extent_calc_inline_size(
+		   compressed_size);
+	else
+		extent_item_size = btrfs_file_extent_calc_inline_size(
+		    inline_len);
+
+	ret = __btrfs_drop_extents(trans, root, inode, path,
+				   start, aligned_end, NULL,
+				   1, 1, extent_item_size, &extent_inserted);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
+
+	if (isize > actual_end)
+		inline_len = min_t(u64, isize, actual_end);
+	ret = insert_inline_extent(trans, path, extent_inserted,
+				   root, inode, start,
+				   inline_len, compressed_size,
+				   compress_type, compressed_pages);
+	if (ret && ret != -ENOSPC) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	} else if (ret == -ENOSPC) {
+		ret = 1;
+		goto out;
+	}
+
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+	btrfs_delalloc_release_metadata(inode, end + 1 - start);
+	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
+out:
+	btrfs_free_path(path);
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+struct async_extent {
+	u64 start;
+	u64 ram_size;
+	u64 compressed_size;
+	struct page **pages;
+	unsigned long nr_pages;
+	int compress_type;
+	struct list_head list;
+};
+
+struct async_cow {
+	struct inode *inode;
+	struct btrfs_root *root;
+	struct page *locked_page;
+	u64 start;
+	u64 end;
+	struct list_head extents;
+	struct btrfs_work work;
+};
+
+static noinline int add_async_extent(struct async_cow *cow,
+				     u64 start, u64 ram_size,
+				     u64 compressed_size,
+				     struct page **pages,
+				     unsigned long nr_pages,
+				     int compress_type)
+{
+	struct async_extent *async_extent;
+
+	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+	BUG_ON(!async_extent); /* -ENOMEM */
+	async_extent->start = start;
+	async_extent->ram_size = ram_size;
+	async_extent->compressed_size = compressed_size;
+	async_extent->pages = pages;
+	async_extent->nr_pages = nr_pages;
+	async_extent->compress_type = compress_type;
+	list_add_tail(&async_extent->list, &cow->extents);
+	return 0;
+}
+
+static inline int inode_need_compress(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	/* force compress */
+	if (btrfs_test_opt(root, FORCE_COMPRESS))
+		return 1;
+	/* bad compression ratios */
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+		return 0;
+	if (btrfs_test_opt(root, COMPRESS) ||
+	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+	    BTRFS_I(inode)->force_compress)
+		return 1;
+	return 0;
+}
+
+/*
+ * we create compressed extents in two phases.  The first
+ * phase compresses a range of pages that have already been
+ * locked (both pages and state bits are locked).
+ *
+ * This is done inside an ordered work queue, and the compression
+ * is spread across many cpus.  The actual IO submission is step
+ * two, and the ordered work queue takes care of making sure that
+ * happens in the same order things were put onto the queue by
+ * writepages and friends.
+ *
+ * If this code finds it can't get good compression, it puts an
+ * entry onto the work queue to write the uncompressed bytes.  This
+ * makes sure that both compressed inodes and uncompressed inodes
+ * are written in the same order that the flusher thread sent them
+ * down.
+ */
+static noinline void compress_file_range(struct inode *inode,
+					struct page *locked_page,
+					u64 start, u64 end,
+					struct async_cow *async_cow,
+					int *num_added)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 num_bytes;
+	u64 blocksize = root->sectorsize;
+	u64 actual_end;
+	u64 isize = i_size_read(inode);
+	int ret = 0;
+	struct page **pages = NULL;
+	unsigned long nr_pages;
+	unsigned long nr_pages_ret = 0;
+	unsigned long total_compressed = 0;
+	unsigned long total_in = 0;
+	unsigned long max_compressed = 128 * 1024;
+	unsigned long max_uncompressed = 128 * 1024;
+	int i;
+	int will_compress;
+	int compress_type = root->fs_info->compress_type;
+	int redirty = 0;
+
+	/* if this is a small write inside eof, kick off a defrag */
+	if ((end - start + 1) < 16 * 1024 &&
+	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+		btrfs_add_inode_defrag(NULL, inode);
+
+	actual_end = min_t(u64, isize, end + 1);
+again:
+	will_compress = 0;
+	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+
+	/*
+	 * we don't want to send crud past the end of i_size through
+	 * compression, that's just a waste of CPU time.  So, if the
+	 * end of the file is before the start of our current
+	 * requested range of bytes, we bail out to the uncompressed
+	 * cleanup code that can deal with all of this.
+	 *
+	 * It isn't really the fastest way to fix things, but this is a
+	 * very uncommon corner.
+	 */
+	if (actual_end <= start)
+		goto cleanup_and_bail_uncompressed;
+
+	total_compressed = actual_end - start;
+
+	/*
+	 * skip compression for a small file range(<=blocksize) that
+	 * isn't an inline extent, since it dosen't save disk space at all.
+	 */
+	if (total_compressed <= blocksize &&
+	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+		goto cleanup_and_bail_uncompressed;
+
+	/* we want to make sure that amount of ram required to uncompress
+	 * an extent is reasonable, so we limit the total size in ram
+	 * of a compressed extent to 128k.  This is a crucial number
+	 * because it also controls how easily we can spread reads across
+	 * cpus for decompression.
+	 *
+	 * We also want to make sure the amount of IO required to do
+	 * a random read is reasonably small, so we limit the size of
+	 * a compressed extent to 128k.
+	 */
+	total_compressed = min(total_compressed, max_uncompressed);
+	num_bytes = ALIGN(end - start + 1, blocksize);
+	num_bytes = max(blocksize,  num_bytes);
+	total_in = 0;
+	ret = 0;
+
+	/*
+	 * we do compression for mount -o compress and when the
+	 * inode has not been flagged as nocompress.  This flag can
+	 * change at any time if we discover bad compression ratios.
+	 */
+	if (inode_need_compress(inode)) {
+		WARN_ON(pages);
+		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+		if (!pages) {
+			/* just bail out to the uncompressed code */
+			goto cont;
+		}
+
+		if (BTRFS_I(inode)->force_compress)
+			compress_type = BTRFS_I(inode)->force_compress;
+
+		/*
+		 * we need to call clear_page_dirty_for_io on each
+		 * page in the range.  Otherwise applications with the file
+		 * mmap'd can wander in and change the page contents while
+		 * we are compressing them.
+		 *
+		 * If the compression fails for any reason, we set the pages
+		 * dirty again later on.
+		 */
+		extent_range_clear_dirty_for_io(inode, start, end);
+		redirty = 1;
+		ret = btrfs_compress_pages(compress_type,
+					   inode->i_mapping, start,
+					   total_compressed, pages,
+					   nr_pages, &nr_pages_ret,
+					   &total_in,
+					   &total_compressed,
+					   max_compressed);
+
+		if (!ret) {
+			unsigned long offset = total_compressed &
+				(PAGE_CACHE_SIZE - 1);
+			struct page *page = pages[nr_pages_ret - 1];
+			char *kaddr;
+
+			/* zero the tail end of the last page, we might be
+			 * sending it down to disk
+			 */
+			if (offset) {
+				kaddr = kmap_atomic(page);
+				memset(kaddr + offset, 0,
+				       PAGE_CACHE_SIZE - offset);
+				kunmap_atomic(kaddr);
+			}
+			will_compress = 1;
+		}
+	}
+cont:
+	if (start == 0) {
+		/* lets try to make an inline extent */
+		if (ret || total_in < (actual_end - start)) {
+			/* we didn't compress the entire range, try
+			 * to make an uncompressed inline extent.
+			 */
+			ret = cow_file_range_inline(root, inode, start, end,
+						    0, 0, NULL);
+		} else {
+			/* try making a compressed inline extent */
+			ret = cow_file_range_inline(root, inode, start, end,
+						    total_compressed,
+						    compress_type, pages);
+		}
+		if (ret <= 0) {
+			unsigned long clear_flags = EXTENT_DELALLOC |
+				EXTENT_DEFRAG;
+			unsigned long page_error_op;
+
+			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
+
+			/*
+			 * inline extent creation worked or returned error,
+			 * we don't need to create any more async work items.
+			 * Unlock and free up our temp pages.
+			 */
+			extent_clear_unlock_delalloc(inode, start, end, NULL,
+						     clear_flags, PAGE_UNLOCK |
+						     PAGE_CLEAR_DIRTY |
+						     PAGE_SET_WRITEBACK |
+						     page_error_op |
+						     PAGE_END_WRITEBACK);
+			goto free_pages_out;
+		}
+	}
+
+	if (will_compress) {
+		/*
+		 * we aren't doing an inline extent round the compressed size
+		 * up to a block size boundary so the allocator does sane
+		 * things
+		 */
+		total_compressed = ALIGN(total_compressed, blocksize);
+
+		/*
+		 * one last check to make sure the compression is really a
+		 * win, compare the page count read with the blocks on disk
+		 */
+		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
+		if (total_compressed >= total_in) {
+			will_compress = 0;
+		} else {
+			num_bytes = total_in;
+		}
+	}
+	if (!will_compress && pages) {
+		/*
+		 * the compression code ran but failed to make things smaller,
+		 * free any pages it allocated and our page pointer array
+		 */
+		for (i = 0; i < nr_pages_ret; i++) {
+			WARN_ON(pages[i]->mapping);
+			page_cache_release(pages[i]);
+		}
+		kfree(pages);
+		pages = NULL;
+		total_compressed = 0;
+		nr_pages_ret = 0;
+
+		/* flag the file so we don't compress in the future */
+		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+		    !(BTRFS_I(inode)->force_compress)) {
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		}
+	}
+	if (will_compress) {
+		*num_added += 1;
+
+		/* the async work queues will take care of doing actual
+		 * allocation on disk for these compressed pages,
+		 * and will submit them to the elevator.
+		 */
+		add_async_extent(async_cow, start, num_bytes,
+				 total_compressed, pages, nr_pages_ret,
+				 compress_type);
+
+		if (start + num_bytes < end) {
+			start += num_bytes;
+			pages = NULL;
+			cond_resched();
+			goto again;
+		}
+	} else {
+cleanup_and_bail_uncompressed:
+		/*
+		 * No compression, but we still need to write the pages in
+		 * the file we've been given so far.  redirty the locked
+		 * page if it corresponds to our extent and set things up
+		 * for the async work queue to run cow_file_range to do
+		 * the normal delalloc dance
+		 */
+		if (page_offset(locked_page) >= start &&
+		    page_offset(locked_page) <= end) {
+			__set_page_dirty_nobuffers(locked_page);
+			/* unlocked later on in the async handlers */
+		}
+		if (redirty)
+			extent_range_redirty_for_io(inode, start, end);
+		add_async_extent(async_cow, start, end - start + 1,
+				 0, NULL, 0, BTRFS_COMPRESS_NONE);
+		*num_added += 1;
+	}
+
+	return;
+
+free_pages_out:
+	for (i = 0; i < nr_pages_ret; i++) {
+		WARN_ON(pages[i]->mapping);
+		page_cache_release(pages[i]);
+	}
+	kfree(pages);
+}
+
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+	int i;
+
+	if (!async_extent->pages)
+		return;
+
+	for (i = 0; i < async_extent->nr_pages; i++) {
+		WARN_ON(async_extent->pages[i]->mapping);
+		page_cache_release(async_extent->pages[i]);
+	}
+	kfree(async_extent->pages);
+	async_extent->nr_pages = 0;
+	async_extent->pages = NULL;
+}
+
+/*
+ * phase two of compressed writeback.  This is the ordered portion
+ * of the code, which only gets called in the order the work was
+ * queued.  We walk all the async extents created by compress_file_range
+ * and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct inode *inode,
+					      struct async_cow *async_cow)
+{
+	struct async_extent *async_extent;
+	u64 alloc_hint = 0;
+	struct btrfs_key ins;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree;
+	int ret = 0;
+
+again:
+	while (!list_empty(&async_cow->extents)) {
+		async_extent = list_entry(async_cow->extents.next,
+					  struct async_extent, list);
+		list_del(&async_extent->list);
+
+		io_tree = &BTRFS_I(inode)->io_tree;
+
+retry:
+		/* did the compression code fall back to uncompressed IO? */
+		if (!async_extent->pages) {
+			int page_started = 0;
+			unsigned long nr_written = 0;
+
+			lock_extent(io_tree, async_extent->start,
+					 async_extent->start +
+					 async_extent->ram_size - 1);
+
+			/* allocate blocks */
+			ret = cow_file_range(inode, async_cow->locked_page,
+					     async_extent->start,
+					     async_extent->start +
+					     async_extent->ram_size - 1,
+					     &page_started, &nr_written, 0);
+
+			/* JDM XXX */
+
+			/*
+			 * if page_started, cow_file_range inserted an
+			 * inline extent and took care of all the unlocking
+			 * and IO for us.  Otherwise, we need to submit
+			 * all those pages down to the drive.
+			 */
+			if (!page_started && !ret)
+				extent_write_locked_range(io_tree,
+						  inode, async_extent->start,
+						  async_extent->start +
+						  async_extent->ram_size - 1,
+						  btrfs_get_extent,
+						  WB_SYNC_ALL);
+			else if (ret)
+				unlock_page(async_cow->locked_page);
+			kfree(async_extent);
+			cond_resched();
+			continue;
+		}
+
+		lock_extent(io_tree, async_extent->start,
+			    async_extent->start + async_extent->ram_size - 1);
+
+		ret = btrfs_reserve_extent(root,
+					   async_extent->compressed_size,
+					   async_extent->compressed_size,
+					   0, alloc_hint, &ins, 1, 1);
+		if (ret) {
+			free_async_extent_pages(async_extent);
+
+			if (ret == -ENOSPC) {
+				unlock_extent(io_tree, async_extent->start,
+					      async_extent->start +
+					      async_extent->ram_size - 1);
+
+				/*
+				 * we need to redirty the pages if we decide to
+				 * fallback to uncompressed IO, otherwise we
+				 * will not submit these pages down to lower
+				 * layers.
+				 */
+				extent_range_redirty_for_io(inode,
+						async_extent->start,
+						async_extent->start +
+						async_extent->ram_size - 1);
+
+				goto retry;
+			}
+			goto out_free;
+		}
+		/*
+		 * here we're doing allocation and writeback of the
+		 * compressed pages
+		 */
+		btrfs_drop_extent_cache(inode, async_extent->start,
+					async_extent->start +
+					async_extent->ram_size - 1, 0);
+
+		em = alloc_extent_map();
+		if (!em) {
+			ret = -ENOMEM;
+			goto out_free_reserve;
+		}
+		em->start = async_extent->start;
+		em->len = async_extent->ram_size;
+		em->orig_start = em->start;
+		em->mod_start = em->start;
+		em->mod_len = em->len;
+
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
+		em->ram_bytes = async_extent->ram_size;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		em->compress_type = async_extent->compress_type;
+		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+		em->generation = -1;
+
+		while (1) {
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em, 1);
+			write_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, async_extent->start,
+						async_extent->start +
+						async_extent->ram_size - 1, 0);
+		}
+
+		if (ret)
+			goto out_free_reserve;
+
+		ret = btrfs_add_ordered_extent_compress(inode,
+						async_extent->start,
+						ins.objectid,
+						async_extent->ram_size,
+						ins.offset,
+						BTRFS_ORDERED_COMPRESSED,
+						async_extent->compress_type);
+		if (ret) {
+			btrfs_drop_extent_cache(inode, async_extent->start,
+						async_extent->start +
+						async_extent->ram_size - 1, 0);
+			goto out_free_reserve;
+		}
+
+		/*
+		 * clear dirty, set writeback and unlock the pages.
+		 */
+		extent_clear_unlock_delalloc(inode, async_extent->start,
+				async_extent->start +
+				async_extent->ram_size - 1,
+				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				PAGE_SET_WRITEBACK);
+		ret = btrfs_submit_compressed_write(inode,
+				    async_extent->start,
+				    async_extent->ram_size,
+				    ins.objectid,
+				    ins.offset, async_extent->pages,
+				    async_extent->nr_pages);
+		if (ret) {
+			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+			struct page *p = async_extent->pages[0];
+			const u64 start = async_extent->start;
+			const u64 end = start + async_extent->ram_size - 1;
+
+			p->mapping = inode->i_mapping;
+			tree->ops->writepage_end_io_hook(p, start, end,
+							 NULL, 0);
+			p->mapping = NULL;
+			extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+						     PAGE_END_WRITEBACK |
+						     PAGE_SET_ERROR);
+			free_async_extent_pages(async_extent);
+		}
+		alloc_hint = ins.objectid + ins.offset;
+		kfree(async_extent);
+		cond_resched();
+	}
+	return;
+out_free_reserve:
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+out_free:
+	extent_clear_unlock_delalloc(inode, async_extent->start,
+				     async_extent->start +
+				     async_extent->ram_size - 1,
+				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
+				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+				     PAGE_SET_ERROR);
+	free_async_extent_pages(async_extent);
+	kfree(async_extent);
+	goto again;
+}
+
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+				      u64 num_bytes)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 alloc_hint = 0;
+
+	read_lock(&em_tree->lock);
+	em = search_extent_mapping(em_tree, start, num_bytes);
+	if (em) {
+		/*
+		 * if block start isn't an actual block number then find the
+		 * first block in this inode and use that as a hint.  If that
+		 * block is also bogus then just don't worry about it.
+		 */
+		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+			free_extent_map(em);
+			em = search_extent_mapping(em_tree, 0, 0);
+			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+				alloc_hint = em->block_start;
+			if (em)
+				free_extent_map(em);
+		} else {
+			alloc_hint = em->block_start;
+			free_extent_map(em);
+		}
+	}
+	read_unlock(&em_tree->lock);
+
+	return alloc_hint;
+}
+
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code.  The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
+ */
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written,
+				   int unlock)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 alloc_hint = 0;
+	u64 num_bytes;
+	unsigned long ram_size;
+	u64 disk_num_bytes;
+	u64 cur_alloc_size;
+	u64 blocksize = root->sectorsize;
+	struct btrfs_key ins;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 0;
+
+	if (btrfs_is_free_space_inode(inode)) {
+		WARN_ON_ONCE(1);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	num_bytes = ALIGN(end - start + 1, blocksize);
+	num_bytes = max(blocksize,  num_bytes);
+	disk_num_bytes = num_bytes;
+
+	/* if this is a small write inside eof, kick off defrag */
+	if (num_bytes < 64 * 1024 &&
+	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+		btrfs_add_inode_defrag(NULL, inode);
+
+	if (start == 0) {
+		/* lets try to make an inline extent */
+		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
+					    NULL);
+		if (ret == 0) {
+			extent_clear_unlock_delalloc(inode, start, end, NULL,
+				     EXTENT_LOCKED | EXTENT_DELALLOC |
+				     EXTENT_DEFRAG, PAGE_UNLOCK |
+				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+				     PAGE_END_WRITEBACK);
+
+			*nr_written = *nr_written +
+			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+			*page_started = 1;
+			goto out;
+		} else if (ret < 0) {
+			goto out_unlock;
+		}
+	}
+
+	BUG_ON(disk_num_bytes >
+	       btrfs_super_total_bytes(root->fs_info->super_copy));
+
+	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+
+	while (disk_num_bytes > 0) {
+		unsigned long op;
+
+		cur_alloc_size = disk_num_bytes;
+		ret = btrfs_reserve_extent(root, cur_alloc_size,
+					   root->sectorsize, 0, alloc_hint,
+					   &ins, 1, 1);
+		if (ret < 0)
+			goto out_unlock;
+
+		em = alloc_extent_map();
+		if (!em) {
+			ret = -ENOMEM;
+			goto out_reserve;
+		}
+		em->start = start;
+		em->orig_start = em->start;
+		ram_size = ins.offset;
+		em->len = ins.offset;
+		em->mod_start = em->start;
+		em->mod_len = em->len;
+
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
+		em->ram_bytes = ram_size;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		em->generation = -1;
+
+		while (1) {
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em, 1);
+			write_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, start,
+						start + ram_size - 1, 0);
+		}
+		if (ret)
+			goto out_reserve;
+
+		cur_alloc_size = ins.offset;
+		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+					       ram_size, cur_alloc_size, 0);
+		if (ret)
+			goto out_drop_extent_cache;
+
+		if (root->root_key.objectid ==
+		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+			ret = btrfs_reloc_clone_csums(inode, start,
+						      cur_alloc_size);
+			if (ret)
+				goto out_drop_extent_cache;
+		}
+
+		if (disk_num_bytes < cur_alloc_size)
+			break;
+
+		/* we're not doing compressed IO, don't unlock the first
+		 * page (which the caller expects to stay locked), don't
+		 * clear any dirty bits and don't set any writeback bits
+		 *
+		 * Do set the Private2 bit so we know this page was properly
+		 * setup for writepage
+		 */
+		op = unlock ? PAGE_UNLOCK : 0;
+		op |= PAGE_SET_PRIVATE2;
+
+		extent_clear_unlock_delalloc(inode, start,
+					     start + ram_size - 1, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC,
+					     op);
+		disk_num_bytes -= cur_alloc_size;
+		num_bytes -= cur_alloc_size;
+		alloc_hint = ins.objectid + ins.offset;
+		start += cur_alloc_size;
+	}
+out:
+	return ret;
+
+out_drop_extent_cache:
+	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
+out_reserve:
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+out_unlock:
+	extent_clear_unlock_delalloc(inode, start, end, locked_page,
+				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+				     EXTENT_DELALLOC | EXTENT_DEFRAG,
+				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+	goto out;
+}
+
+/*
+ * work queue call back to started compression on a file and pages
+ */
+static noinline void async_cow_start(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	int num_added = 0;
+	async_cow = container_of(work, struct async_cow, work);
+
+	compress_file_range(async_cow->inode, async_cow->locked_page,
+			    async_cow->start, async_cow->end, async_cow,
+			    &num_added);
+	if (num_added == 0) {
+		btrfs_add_delayed_iput(async_cow->inode);
+		async_cow->inode = NULL;
+	}
+}
+
+/*
+ * work queue call back to submit previously compressed pages
+ */
+static noinline void async_cow_submit(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	struct btrfs_root *root;
+	unsigned long nr_pages;
+
+	async_cow = container_of(work, struct async_cow, work);
+
+	root = async_cow->root;
+	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
+		PAGE_CACHE_SHIFT;
+
+	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
+	    5 * 1024 * 1024 &&
+	    waitqueue_active(&root->fs_info->async_submit_wait))
+		wake_up(&root->fs_info->async_submit_wait);
+
+	if (async_cow->inode)
+		submit_compressed_extents(async_cow->inode, async_cow);
+}
+
+static noinline void async_cow_free(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	async_cow = container_of(work, struct async_cow, work);
+	if (async_cow->inode)
+		btrfs_add_delayed_iput(async_cow->inode);
+	kfree(async_cow);
+}
+
+static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+				u64 start, u64 end, int *page_started,
+				unsigned long *nr_written)
+{
+	struct async_cow *async_cow;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long nr_pages;
+	u64 cur_end;
+	int limit = 10 * 1024 * 1024;
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
+			 1, 0, NULL, GFP_NOFS);
+	while (start < end) {
+		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+		BUG_ON(!async_cow); /* -ENOMEM */
+		async_cow->inode = igrab(inode);
+		async_cow->root = root;
+		async_cow->locked_page = locked_page;
+		async_cow->start = start;
+
+		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+		    !btrfs_test_opt(root, FORCE_COMPRESS))
+			cur_end = end;
+		else
+			cur_end = min(end, start + 512 * 1024 - 1);
+
+		async_cow->end = cur_end;
+		INIT_LIST_HEAD(&async_cow->extents);
+
+		btrfs_init_work(&async_cow->work,
+				btrfs_delalloc_helper,
+				async_cow_start, async_cow_submit,
+				async_cow_free);
+
+		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
+			PAGE_CACHE_SHIFT;
+		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
+
+		btrfs_queue_work(root->fs_info->delalloc_workers,
+				 &async_cow->work);
+
+		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
+			wait_event(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->async_delalloc_pages) <
+			    limit));
+		}
+
+		while (atomic_read(&root->fs_info->async_submit_draining) &&
+		      atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
+			   0));
+		}
+
+		*nr_written += nr_pages;
+		start = cur_end + 1;
+	}
+	*page_started = 1;
+	return 0;
+}
+
+static noinline int csum_exist_in_range(struct btrfs_root *root,
+					u64 bytenr, u64 num_bytes)
+{
+	int ret;
+	struct btrfs_ordered_sum *sums;
+	LIST_HEAD(list);
+
+	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+				       bytenr + num_bytes - 1, &list, 0);
+	if (ret == 0 && list_empty(&list))
+		return 0;
+
+	while (!list_empty(&list)) {
+		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+		list_del(&sums->list);
+		kfree(sums);
+	}
+	return 1;
+}
+
+/*
+ * when nowcow writeback call back.  This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ */
+static noinline int run_delalloc_nocow(struct inode *inode,
+				       struct page *locked_page,
+			      u64 start, u64 end, int *page_started, int force,
+			      unsigned long *nr_written)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key found_key;
+	u64 cow_start;
+	u64 cur_offset;
+	u64 extent_end;
+	u64 extent_offset;
+	u64 disk_bytenr;
+	u64 num_bytes;
+	u64 disk_num_bytes;
+	u64 ram_bytes;
+	int extent_type;
+	int ret, err;
+	int type;
+	int nocow;
+	int check_prev = 1;
+	bool nolock;
+	u64 ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		extent_clear_unlock_delalloc(inode, start, end, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC |
+					     EXTENT_DO_ACCOUNTING |
+					     EXTENT_DEFRAG, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
+		return -ENOMEM;
+	}
+
+	nolock = btrfs_is_free_space_inode(inode);
+
+	if (nolock)
+		trans = btrfs_join_transaction_nolock(root);
+	else
+		trans = btrfs_join_transaction(root);
+
+	if (IS_ERR(trans)) {
+		extent_clear_unlock_delalloc(inode, start, end, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC |
+					     EXTENT_DO_ACCOUNTING |
+					     EXTENT_DEFRAG, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	cow_start = (u64)-1;
+	cur_offset = start;
+	while (1) {
+		ret = btrfs_lookup_file_extent(trans, root, path, ino,
+					       cur_offset, 0);
+		if (ret < 0)
+			goto error;
+		if (ret > 0 && path->slots[0] > 0 && check_prev) {
+			leaf = path->nodes[0];
+			btrfs_item_key_to_cpu(leaf, &found_key,
+					      path->slots[0] - 1);
+			if (found_key.objectid == ino &&
+			    found_key.type == BTRFS_EXTENT_DATA_KEY)
+				path->slots[0]--;
+		}
+		check_prev = 0;
+next_slot:
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto error;
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+		}
+
+		nocow = 0;
+		disk_bytenr = 0;
+		num_bytes = 0;
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		if (found_key.objectid > ino ||
+		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
+		    found_key.offset > end)
+			break;
+
+		if (found_key.offset > cur_offset) {
+			extent_end = found_key.offset;
+			extent_type = 0;
+			goto out_check;
+		}
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, fi);
+
+		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+			extent_offset = btrfs_file_extent_offset(leaf, fi);
+			extent_end = found_key.offset +
+				btrfs_file_extent_num_bytes(leaf, fi);
+			disk_num_bytes =
+				btrfs_file_extent_disk_num_bytes(leaf, fi);
+			if (extent_end <= start) {
+				path->slots[0]++;
+				goto next_slot;
+			}
+			if (disk_bytenr == 0)
+				goto out_check;
+			if (btrfs_file_extent_compression(leaf, fi) ||
+			    btrfs_file_extent_encryption(leaf, fi) ||
+			    btrfs_file_extent_other_encoding(leaf, fi))
+				goto out_check;
+			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
+				goto out_check;
+			if (btrfs_extent_readonly(root, disk_bytenr))
+				goto out_check;
+			if (btrfs_cross_ref_exist(trans, root, ino,
+						  found_key.offset -
+						  extent_offset, disk_bytenr))
+				goto out_check;
+			disk_bytenr += extent_offset;
+			disk_bytenr += cur_offset - found_key.offset;
+			num_bytes = min(end + 1, extent_end) - cur_offset;
+			/*
+			 * if there are pending snapshots for this root,
+			 * we fall into common COW way.
+			 */
+			if (!nolock) {
+				err = btrfs_start_write_no_snapshoting(root);
+				if (!err)
+					goto out_check;
+			}
+			/*
+			 * force cow if csum exists in the range.
+			 * this ensure that csum for a given extent are
+			 * either valid or do not exist.
+			 */
+			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+				goto out_check;
+			nocow = 1;
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			extent_end = found_key.offset +
+				btrfs_file_extent_inline_len(leaf,
+						     path->slots[0], fi);
+			extent_end = ALIGN(extent_end, root->sectorsize);
+		} else {
+			BUG_ON(1);
+		}
+out_check:
+		if (extent_end <= start) {
+			path->slots[0]++;
+			if (!nolock && nocow)
+				btrfs_end_write_no_snapshoting(root);
+			goto next_slot;
+		}
+		if (!nocow) {
+			if (cow_start == (u64)-1)
+				cow_start = cur_offset;
+			cur_offset = extent_end;
+			if (cur_offset > end)
+				break;
+			path->slots[0]++;
+			goto next_slot;
+		}
+
+		btrfs_release_path(path);
+		if (cow_start != (u64)-1) {
+			ret = cow_file_range(inode, locked_page,
+					     cow_start, found_key.offset - 1,
+					     page_started, nr_written, 1);
+			if (ret) {
+				if (!nolock && nocow)
+					btrfs_end_write_no_snapshoting(root);
+				goto error;
+			}
+			cow_start = (u64)-1;
+		}
+
+		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			struct extent_map *em;
+			struct extent_map_tree *em_tree;
+			em_tree = &BTRFS_I(inode)->extent_tree;
+			em = alloc_extent_map();
+			BUG_ON(!em); /* -ENOMEM */
+			em->start = cur_offset;
+			em->orig_start = found_key.offset - extent_offset;
+			em->len = num_bytes;
+			em->block_len = num_bytes;
+			em->block_start = disk_bytenr;
+			em->orig_block_len = disk_num_bytes;
+			em->ram_bytes = ram_bytes;
+			em->bdev = root->fs_info->fs_devices->latest_bdev;
+			em->mod_start = em->start;
+			em->mod_len = em->len;
+			set_bit(EXTENT_FLAG_PINNED, &em->flags);
+			set_bit(EXTENT_FLAG_FILLING, &em->flags);
+			em->generation = -1;
+			while (1) {
+				write_lock(&em_tree->lock);
+				ret = add_extent_mapping(em_tree, em, 1);
+				write_unlock(&em_tree->lock);
+				if (ret != -EEXIST) {
+					free_extent_map(em);
+					break;
+				}
+				btrfs_drop_extent_cache(inode, em->start,
+						em->start + em->len - 1, 0);
+			}
+			type = BTRFS_ORDERED_PREALLOC;
+		} else {
+			type = BTRFS_ORDERED_NOCOW;
+		}
+
+		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
+					       num_bytes, num_bytes, type);
+		BUG_ON(ret); /* -ENOMEM */
+
+		if (root->root_key.objectid ==
+		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+			ret = btrfs_reloc_clone_csums(inode, cur_offset,
+						      num_bytes);
+			if (ret) {
+				if (!nolock && nocow)
+					btrfs_end_write_no_snapshoting(root);
+				goto error;
+			}
+		}
+
+		extent_clear_unlock_delalloc(inode, cur_offset,
+					     cur_offset + num_bytes - 1,
+					     locked_page, EXTENT_LOCKED |
+					     EXTENT_DELALLOC, PAGE_UNLOCK |
+					     PAGE_SET_PRIVATE2);
+		if (!nolock && nocow)
+			btrfs_end_write_no_snapshoting(root);
+		cur_offset = extent_end;
+		if (cur_offset > end)
+			break;
+	}
+	btrfs_release_path(path);
+
+	if (cur_offset <= end && cow_start == (u64)-1) {
+		cow_start = cur_offset;
+		cur_offset = end;
+	}
+
+	if (cow_start != (u64)-1) {
+		ret = cow_file_range(inode, locked_page, cow_start, end,
+				     page_started, nr_written, 1);
+		if (ret)
+			goto error;
+	}
+
+error:
+	err = btrfs_end_transaction(trans, root);
+	if (!ret)
+		ret = err;
+
+	if (ret && cur_offset < end)
+		extent_clear_unlock_delalloc(inode, cur_offset, end,
+					     locked_page, EXTENT_LOCKED |
+					     EXTENT_DELALLOC | EXTENT_DEFRAG |
+					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
+	btrfs_free_path(path);
+	return ret;
+}
+
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+		return 0;
+
+	/*
+	 * @defrag_bytes is a hint value, no spinlock held here,
+	 * if is not zero, it means the file is defragging.
+	 * Force cow if given extent needs to be defragged.
+	 */
+	if (BTRFS_I(inode)->defrag_bytes &&
+	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+			   EXTENT_DEFRAG, 0, NULL))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * extent_io.c call back to do delayed allocation processing
+ */
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+			      u64 start, u64 end, int *page_started,
+			      unsigned long *nr_written)
+{
+	int ret;
+	int force_cow = need_force_cow(inode, start, end);
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+		ret = run_delalloc_nocow(inode, locked_page, start, end,
+					 page_started, 1, nr_written);
+	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+		ret = run_delalloc_nocow(inode, locked_page, start, end,
+					 page_started, 0, nr_written);
+	} else if (!inode_need_compress(inode)) {
+		ret = cow_file_range(inode, locked_page, start, end,
+				      page_started, nr_written, 1);
+	} else {
+		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			&BTRFS_I(inode)->runtime_flags);
+		ret = cow_file_range_async(inode, locked_page, start, end,
+					   page_started, nr_written);
+	}
+	return ret;
+}
+
+static void btrfs_split_extent_hook(struct inode *inode,
+				    struct extent_state *orig, u64 split)
+{
+	u64 size;
+
+	/* not delalloc, ignore it */
+	if (!(orig->state & EXTENT_DELALLOC))
+		return;
+
+	size = orig->end - orig->start + 1;
+	if (size > BTRFS_MAX_EXTENT_SIZE) {
+		u64 num_extents;
+		u64 new_size;
+
+		/*
+		 * See the explanation in btrfs_merge_extent_hook, the same
+		 * applies here, just in reverse.
+		 */
+		new_size = orig->end - split + 1;
+		num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+					BTRFS_MAX_EXTENT_SIZE);
+		new_size = split - orig->start;
+		num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+					BTRFS_MAX_EXTENT_SIZE);
+		if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+			      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
+			return;
+	}
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	BTRFS_I(inode)->outstanding_extents++;
+	spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static void btrfs_merge_extent_hook(struct inode *inode,
+				    struct extent_state *new,
+				    struct extent_state *other)
+{
+	u64 new_size, old_size;
+	u64 num_extents;
+
+	/* not delalloc, ignore it */
+	if (!(other->state & EXTENT_DELALLOC))
+		return;
+
+	if (new->start > other->start)
+		new_size = new->end - other->start + 1;
+	else
+		new_size = other->end - new->start + 1;
+
+	/* we're not bigger than the max, unreserve the space and go */
+	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		BTRFS_I(inode)->outstanding_extents--;
+		spin_unlock(&BTRFS_I(inode)->lock);
+		return;
+	}
+
+	/*
+	 * We have to add up either side to figure out how many extents were
+	 * accounted for before we merged into one big extent.  If the number of
+	 * extents we accounted for is <= the amount we need for the new range
+	 * then we can return, otherwise drop.  Think of it like this
+	 *
+	 * [ 4k][MAX_SIZE]
+	 *
+	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
+	 * need 2 outstanding extents, on one side we have 1 and the other side
+	 * we have 1 so they are == and we can return.  But in this case
+	 *
+	 * [MAX_SIZE+4k][MAX_SIZE+4k]
+	 *
+	 * Each range on their own accounts for 2 extents, but merged together
+	 * they are only 3 extents worth of accounting, so we need to drop in
+	 * this case.
+	 */
+	old_size = other->end - other->start + 1;
+	num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+				BTRFS_MAX_EXTENT_SIZE);
+	old_size = new->end - new->start + 1;
+	num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+				 BTRFS_MAX_EXTENT_SIZE);
+
+	if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+		      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
+		return;
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	BTRFS_I(inode)->outstanding_extents--;
+	spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+				      struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+			      &root->delalloc_inodes);
+		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			&BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes++;
+		if (root->nr_delalloc_inodes == 1) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(!list_empty(&root->delalloc_root));
+			list_add_tail(&root->delalloc_root,
+				      &root->fs_info->delalloc_roots);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+				     struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes--;
+		if (!root->nr_delalloc_inodes) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(list_empty(&root->delalloc_root));
+			list_del_init(&root->delalloc_root);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
+/*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
+ * have pending delalloc work to be done.
+ */
+static void btrfs_set_bit_hook(struct inode *inode,
+			       struct extent_state *state, unsigned *bits)
+{
+
+	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+		WARN_ON(1);
+	/*
+	 * set_bit and clear bit hooks normally require _irqsave/restore
+	 * but in this case, we are only testing for the DELALLOC
+	 * bit, which is only set or cleared with irqs on
+	 */
+	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;
+		bool do_list = !btrfs_is_free_space_inode(inode);
+
+		if (*bits & EXTENT_FIRST_DELALLOC) {
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		} else {
+			spin_lock(&BTRFS_I(inode)->lock);
+			BTRFS_I(inode)->outstanding_extents++;
+			spin_unlock(&BTRFS_I(inode)->lock);
+		}
+
+		/* For sanity tests */
+		if (btrfs_test_is_dummy_root(root))
+			return;
+
+		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+				     root->fs_info->delalloc_batch);
+		spin_lock(&BTRFS_I(inode)->lock);
+		BTRFS_I(inode)->delalloc_bytes += len;
+		if (*bits & EXTENT_DEFRAG)
+			BTRFS_I(inode)->defrag_bytes += len;
+		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+					 &BTRFS_I(inode)->runtime_flags))
+			btrfs_add_delalloc_inodes(root, inode);
+		spin_unlock(&BTRFS_I(inode)->lock);
+	}
+}
+
+/*
+ * extent_io.c clear_bit_hook, see set_bit_hook for why
+ */
+static void btrfs_clear_bit_hook(struct inode *inode,
+				 struct extent_state *state,
+				 unsigned *bits)
+{
+	u64 len = state->end + 1 - state->start;
+	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
+				    BTRFS_MAX_EXTENT_SIZE);
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+		BTRFS_I(inode)->defrag_bytes -= len;
+	spin_unlock(&BTRFS_I(inode)->lock);
+
+	/*
+	 * set_bit and clear bit hooks normally require _irqsave/restore
+	 * but in this case, we are only testing for the DELALLOC
+	 * bit, which is only set or cleared with irqs on
+	 */
+	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		bool do_list = !btrfs_is_free_space_inode(inode);
+
+		if (*bits & EXTENT_FIRST_DELALLOC) {
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
+			spin_lock(&BTRFS_I(inode)->lock);
+			BTRFS_I(inode)->outstanding_extents -= num_extents;
+			spin_unlock(&BTRFS_I(inode)->lock);
+		}
+
+		/*
+		 * We don't reserve metadata space for space cache inodes so we
+		 * don't need to call dellalloc_release_metadata if there is an
+		 * error.
+		 */
+		if (*bits & EXTENT_DO_ACCOUNTING &&
+		    root != root->fs_info->tree_root)
+			btrfs_delalloc_release_metadata(inode, len);
+
+		/* For sanity tests. */
+		if (btrfs_test_is_dummy_root(root))
+			return;
+
+		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+		    && do_list && !(state->state & EXTENT_NORESERVE))
+			btrfs_free_reserved_data_space(inode, len);
+
+		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+				     root->fs_info->delalloc_batch);
+		spin_lock(&BTRFS_I(inode)->lock);
+		BTRFS_I(inode)->delalloc_bytes -= len;
+		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
+		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			     &BTRFS_I(inode)->runtime_flags))
+			btrfs_del_delalloc_inode(root, inode);
+		spin_unlock(&BTRFS_I(inode)->lock);
+	}
+}
+
+/*
+ * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
+ * we don't create bios that span stripes or chunks
+ */
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags)
+{
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+	u64 length = 0;
+	u64 map_length;
+	int ret;
+
+	if (bio_flags & EXTENT_BIO_COMPRESSED)
+		return 0;
+
+	length = bio->bi_iter.bi_size;
+	map_length = length;
+	ret = btrfs_map_block(root->fs_info, rw, logical,
+			      &map_length, NULL, 0);
+	/* Will always return 0 with map_multi == NULL */
+	BUG_ON(ret < 0);
+	if (map_length < length + size)
+		return 1;
+	return 0;
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time.   All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_start(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags,
+				    u64 bio_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+	BUG_ON(ret); /* -ENOMEM */
+	return 0;
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time.   All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	if (ret)
+		bio_endio(bio, ret);
+	return ret;
+}
+
+/*
+ * extent_io.c submission hook. This does the right thing for csum calculation
+ * on write, or reading the csums from the tree before a read
+ */
+static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+	int skip_sum;
+	int metadata = 0;
+	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	if (btrfs_is_free_space_inode(inode))
+		metadata = 2;
+
+	if (!(rw & REQ_WRITE)) {
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+		if (ret)
+			goto out;
+
+		if (bio_flags & EXTENT_BIO_COMPRESSED) {
+			ret = btrfs_submit_compressed_read(inode, bio,
+							   mirror_num,
+							   bio_flags);
+			goto out;
+		} else if (!skip_sum) {
+			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
+			if (ret)
+				goto out;
+		}
+		goto mapit;
+	} else if (async && !skip_sum) {
+		/* csum items have already been cloned */
+		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+			goto mapit;
+		/* we're doing a write, do the async checksumming */
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+				   inode, rw, bio, mirror_num,
+				   bio_flags, bio_offset,
+				   __btrfs_submit_bio_start,
+				   __btrfs_submit_bio_done);
+		goto out;
+	} else if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+		if (ret)
+			goto out;
+	}
+
+mapit:
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+	if (ret < 0)
+		bio_endio(bio, ret);
+	return ret;
+}
+
+/*
+ * given a list of ordered sums record them in the inode.  This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ */
+static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
+			     struct inode *inode, u64 file_offset,
+			     struct list_head *list)
+{
+	struct btrfs_ordered_sum *sum;
+
+	list_for_each_entry(sum, list, list) {
+		trans->adding_csums = 1;
+		btrfs_csum_file_blocks(trans,
+		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
+		trans->adding_csums = 0;
+	}
+	return 0;
+}
+
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state)
+{
+	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
+				   cached_state, GFP_NOFS);
+}
+
+/* see btrfs_writepage_start_hook for details on why this is required */
+struct btrfs_writepage_fixup {
+	struct page *page;
+	struct btrfs_work work;
+};
+
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+	struct btrfs_writepage_fixup *fixup;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	struct page *page;
+	struct inode *inode;
+	u64 page_start;
+	u64 page_end;
+	int ret;
+
+	fixup = container_of(work, struct btrfs_writepage_fixup, work);
+	page = fixup->page;
+again:
+	lock_page(page);
+	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+		ClearPageChecked(page);
+		goto out_page;
+	}
+
+	inode = page->mapping->host;
+	page_start = page_offset(page);
+	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+			 &cached_state);
+
+	/* already ordered? We're done */
+	if (PagePrivate2(page))
+		goto out;
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+				     page_end, &cached_state, GFP_NOFS);
+		unlock_page(page);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	if (ret) {
+		mapping_set_error(page->mapping, ret);
+		end_extent_writepage(page, ret, page_start, page_end);
+		ClearPageChecked(page);
+		goto out;
+	 }
+
+	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
+	ClearPageChecked(page);
+	set_page_dirty(page);
+out:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			     &cached_state, GFP_NOFS);
+out_page:
+	unlock_page(page);
+	page_cache_release(page);
+	kfree(fixup);
+}
+
+/*
+ * There are a few paths in the higher layers of the kernel that directly
+ * set the page dirty bit without asking the filesystem if it is a
+ * good idea.  This causes problems because we want to make sure COW
+ * properly happens and the data=ordered rules are followed.
+ *
+ * In our case any range that doesn't have the ORDERED bit set
+ * hasn't been properly setup for IO.  We kick off an async process
+ * to fix it up.  The async helper will wait for ordered extents, set
+ * the delalloc bit and make it safe to write the page.
+ */
+static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_writepage_fixup *fixup;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	/* this page is properly in the ordered list */
+	if (TestClearPagePrivate2(page))
+		return 0;
+
+	if (PageChecked(page))
+		return -EAGAIN;
+
+	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+	if (!fixup)
+		return -EAGAIN;
+
+	SetPageChecked(page);
+	page_cache_get(page);
+	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+			btrfs_writepage_fixup_worker, NULL, NULL);
+	fixup->page = page;
+	btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
+	return -EBUSY;
+}
+
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+				       struct inode *inode, u64 file_pos,
+				       u64 disk_bytenr, u64 disk_num_bytes,
+				       u64 num_bytes, u64 ram_bytes,
+				       u8 compression, u8 encryption,
+				       u16 other_encoding, int extent_type)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key ins;
+	int extent_inserted = 0;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * we may be replacing one extent in the tree with another.
+	 * The new extent is pinned in the extent map, and we don't want
+	 * to drop it from the cache until it is completely in the btree.
+	 *
+	 * So, tell btrfs_drop_extents to leave this extent in the cache.
+	 * the caller is expected to unpin it and allow it to be merged
+	 * with the others.
+	 */
+	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
+				   file_pos + num_bytes, NULL, 0,
+				   1, sizeof(*fi), &extent_inserted);
+	if (ret)
+		goto out;
+
+	if (!extent_inserted) {
+		ins.objectid = btrfs_ino(inode);
+		ins.offset = file_pos;
+		ins.type = BTRFS_EXTENT_DATA_KEY;
+
+		path->leave_spinning = 1;
+		ret = btrfs_insert_empty_item(trans, root, path, &ins,
+					      sizeof(*fi));
+		if (ret)
+			goto out;
+	}
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+	btrfs_set_file_extent_type(leaf, fi, extent_type);
+	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
+	btrfs_set_file_extent_offset(leaf, fi, 0);
+	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+	btrfs_set_file_extent_compression(leaf, fi, compression);
+	btrfs_set_file_extent_encryption(leaf, fi, encryption);
+	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	inode_add_bytes(inode, num_bytes);
+
+	ins.objectid = disk_bytenr;
+	ins.offset = disk_num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_alloc_reserved_file_extent(trans, root,
+					root->root_key.objectid,
+					btrfs_ino(inode), file_pos, &ins);
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/* snapshot-aware defrag */
+struct sa_defrag_extent_backref {
+	struct rb_node node;
+	struct old_sa_defrag_extent *old;
+	u64 root_id;
+	u64 inum;
+	u64 file_pos;
+	u64 extent_offset;
+	u64 num_bytes;
+	u64 generation;
+};
+
+struct old_sa_defrag_extent {
+	struct list_head list;
+	struct new_sa_defrag_extent *new;
+
+	u64 extent_offset;
+	u64 bytenr;
+	u64 offset;
+	u64 len;
+	int count;
+};
+
+struct new_sa_defrag_extent {
+	struct rb_root root;
+	struct list_head head;
+	struct btrfs_path *path;
+	struct inode *inode;
+	u64 file_pos;
+	u64 len;
+	u64 bytenr;
+	u64 disk_len;
+	u8 compress_type;
+};
+
+static int backref_comp(struct sa_defrag_extent_backref *b1,
+			struct sa_defrag_extent_backref *b2)
+{
+	if (b1->root_id < b2->root_id)
+		return -1;
+	else if (b1->root_id > b2->root_id)
+		return 1;
+
+	if (b1->inum < b2->inum)
+		return -1;
+	else if (b1->inum > b2->inum)
+		return 1;
+
+	if (b1->file_pos < b2->file_pos)
+		return -1;
+	else if (b1->file_pos > b2->file_pos)
+		return 1;
+
+	/*
+	 * [------------------------------] ===> (a range of space)
+	 *     |<--->|   |<---->| =============> (fs/file tree A)
+	 * |<---------------------------->| ===> (fs/file tree B)
+	 *
+	 * A range of space can refer to two file extents in one tree while
+	 * refer to only one file extent in another tree.
+	 *
+	 * So we may process a disk offset more than one time(two extents in A)
+	 * and locate at the same extent(one extent in B), then insert two same
+	 * backrefs(both refer to the extent in B).
+	 */
+	return 0;
+}
+
+static void backref_insert(struct rb_root *root,
+			   struct sa_defrag_extent_backref *backref)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct sa_defrag_extent_backref *entry;
+	int ret;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+		ret = backref_comp(backref, entry);
+		if (ret < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&backref->node, parent, p);
+	rb_insert_color(&backref->node, root);
+}
+
+/*
+ * Note the backref might has changed, and in this case we just return 0.
+ */
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+				       void *ctx)
+{
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_fs_info *fs_info;
+	struct old_sa_defrag_extent *old = ctx;
+	struct new_sa_defrag_extent *new = old->new;
+	struct btrfs_path *path = new->path;
+	struct btrfs_key key;
+	struct btrfs_root *root;
+	struct sa_defrag_extent_backref *backref;
+	struct extent_buffer *leaf;
+	struct inode *inode = new->inode;
+	int slot;
+	int ret;
+	u64 extent_offset;
+	u64 num_bytes;
+
+	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+	    inum == btrfs_ino(inode))
+		return 0;
+
+	key.objectid = root_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = BTRFS_I(inode)->root->fs_info;
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		if (PTR_ERR(root) == -ENOENT)
+			return 0;
+		WARN_ON(1);
+		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+			 inum, offset, root_id);
+		return PTR_ERR(root);
+	}
+
+	key.objectid = inum;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (offset > (u64)-1 << 32)
+		key.offset = 0;
+	else
+		key.offset = offset;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (WARN_ON(ret < 0))
+		return ret;
+	ret = 0;
+
+	while (1) {
+		cond_resched();
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = 0;
+				goto out;
+			}
+			continue;
+		}
+
+		path->slots[0]++;
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid > inum)
+			goto out;
+
+		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
+			continue;
+
+		/*
+		 * 'offset' refers to the exact key.offset,
+		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
+		 * (key.offset - extent_offset).
+		 */
+		if (key.offset != offset)
+			continue;
+
+		extent_offset = btrfs_file_extent_offset(leaf, extent);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+
+		if (extent_offset >= old->extent_offset + old->offset +
+		    old->len || extent_offset + num_bytes <=
+		    old->extent_offset + old->offset)
+			continue;
+		break;
+	}
+
+	backref = kmalloc(sizeof(*backref), GFP_NOFS);
+	if (!backref) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	backref->root_id = root_id;
+	backref->inum = inum;
+	backref->file_pos = offset;
+	backref->num_bytes = num_bytes;
+	backref->extent_offset = extent_offset;
+	backref->generation = btrfs_file_extent_generation(leaf, extent);
+	backref->old = old;
+	backref_insert(&new->root, backref);
+	old->count++;
+out:
+	btrfs_release_path(path);
+	WARN_ON(ret);
+	return ret;
+}
+
+static noinline bool record_extent_backrefs(struct btrfs_path *path,
+				   struct new_sa_defrag_extent *new)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+	struct old_sa_defrag_extent *old, *tmp;
+	int ret;
+
+	new->path = path;
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		ret = iterate_inodes_from_logical(old->bytenr +
+						  old->extent_offset, fs_info,
+						  path, record_one_backref,
+						  old);
+		if (ret < 0 && ret != -ENOENT)
+			return false;
+
+		/* no backref to be processed for this extent */
+		if (!old->count) {
+			list_del(&old->list);
+			kfree(old);
+		}
+	}
+
+	if (list_empty(&new->head))
+		return false;
+
+	return true;
+}
+
+static int relink_is_mergable(struct extent_buffer *leaf,
+			      struct btrfs_file_extent_item *fi,
+			      struct new_sa_defrag_extent *new)
+{
+	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
+		return 0;
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
+		return 0;
+
+	if (btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Note the backref might has changed, and in this case we just return 0.
+ */
+static noinline int relink_extent_backref(struct btrfs_path *path,
+				 struct sa_defrag_extent_backref *prev,
+				 struct sa_defrag_extent_backref *backref)
+{
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct old_sa_defrag_extent *old = backref->old;
+	struct new_sa_defrag_extent *new = old->new;
+	struct inode *src_inode = new->inode;
+	struct inode *inode;
+	struct extent_state *cached = NULL;
+	int ret = 0;
+	u64 start;
+	u64 len;
+	u64 lock_start;
+	u64 lock_end;
+	bool merge = false;
+	int index;
+
+	if (prev && prev->root_id == backref->root_id &&
+	    prev->inum == backref->inum &&
+	    prev->file_pos + prev->num_bytes == backref->file_pos)
+		merge = true;
+
+	/* step 1: get root */
+	key.objectid = backref->root_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = BTRFS_I(src_inode)->root->fs_info;
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		if (PTR_ERR(root) == -ENOENT)
+			return 0;
+		return PTR_ERR(root);
+	}
+
+	if (btrfs_root_readonly(root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		return 0;
+	}
+
+	/* step 2: get inode */
+	key.objectid = backref->inum;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (IS_ERR(inode)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		return 0;
+	}
+
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+	/* step 3: relink backref */
+	lock_start = backref->file_pos;
+	lock_end = backref->file_pos + backref->num_bytes - 1;
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+			 0, &cached);
+
+	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		goto out_unlock;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock;
+	}
+
+	key.objectid = backref->inum;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = backref->file_pos;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out_free_path;
+	} else if (ret > 0) {
+		ret = 0;
+		goto out_free_path;
+	}
+
+	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
+	    backref->generation)
+		goto out_free_path;
+
+	btrfs_release_path(path);
+
+	start = backref->file_pos;
+	if (backref->extent_offset < old->extent_offset + old->offset)
+		start += old->extent_offset + old->offset -
+			 backref->extent_offset;
+
+	len = min(backref->extent_offset + backref->num_bytes,
+		  old->extent_offset + old->offset + old->len);
+	len -= max(backref->extent_offset, old->extent_offset + old->offset);
+
+	ret = btrfs_drop_extents(trans, root, inode, start,
+				 start + len, 1);
+	if (ret)
+		goto out_free_path;
+again:
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	path->leave_spinning = 1;
+	if (merge) {
+		struct btrfs_file_extent_item *fi;
+		u64 extent_len;
+		struct btrfs_key found_key;
+
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto out_free_path;
+
+		path->slots[0]--;
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+
+		if (extent_len + found_key.offset == start &&
+		    relink_is_mergable(leaf, fi, new)) {
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_len + len);
+			btrfs_mark_buffer_dirty(leaf);
+			inode_add_bytes(inode, len);
+
+			ret = 1;
+			goto out_free_path;
+		} else {
+			merge = false;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+					sizeof(*extent));
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_free_path;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_file_extent_item);
+	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
+	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
+	btrfs_set_file_extent_num_bytes(leaf, item, len);
+	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
+	btrfs_set_file_extent_generation(leaf, item, trans->transid);
+	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
+	btrfs_set_file_extent_encryption(leaf, item, 0);
+	btrfs_set_file_extent_other_encoding(leaf, item, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+	inode_add_bytes(inode, len);
+	btrfs_release_path(path);
+
+	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+			new->disk_len, 0,
+			backref->root_id, backref->inum,
+			new->file_pos, 0);	/* start - extent_offset */
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_free_path;
+	}
+
+	ret = 1;
+out_free_path:
+	btrfs_release_path(path);
+	path->leave_spinning = 0;
+	btrfs_end_transaction(trans, root);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+			     &cached, GFP_NOFS);
+	iput(inode);
+	return ret;
+}
+
+static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
+{
+	struct old_sa_defrag_extent *old, *tmp;
+
+	if (!new)
+		return;
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+	kfree(new);
+}
+
+static void relink_file_extents(struct new_sa_defrag_extent *new)
+{
+	struct btrfs_path *path;
+	struct sa_defrag_extent_backref *backref;
+	struct sa_defrag_extent_backref *prev = NULL;
+	struct inode *inode;
+	struct btrfs_root *root;
+	struct rb_node *node;
+	int ret;
+
+	inode = new->inode;
+	root = BTRFS_I(inode)->root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return;
+
+	if (!record_extent_backrefs(path, new)) {
+		btrfs_free_path(path);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	while (1) {
+		node = rb_first(&new->root);
+		if (!node)
+			break;
+		rb_erase(node, &new->root);
+
+		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
+
+		ret = relink_extent_backref(path, prev, backref);
+		WARN_ON(ret < 0);
+
+		kfree(prev);
+
+		if (ret == 1)
+			prev = backref;
+		else
+			prev = NULL;
+		cond_resched();
+	}
+	kfree(prev);
+
+	btrfs_free_path(path);
+out:
+	free_sa_defrag_extent(new);
+
+	atomic_dec(&root->fs_info->defrag_running);
+	wake_up(&root->fs_info->transaction_wait);
+}
+
+static struct new_sa_defrag_extent *
+record_old_file_extents(struct inode *inode,
+			struct btrfs_ordered_extent *ordered)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct old_sa_defrag_extent *old;
+	struct new_sa_defrag_extent *new;
+	int ret;
+
+	new = kmalloc(sizeof(*new), GFP_NOFS);
+	if (!new)
+		return NULL;
+
+	new->inode = inode;
+	new->file_pos = ordered->file_offset;
+	new->len = ordered->len;
+	new->bytenr = ordered->start;
+	new->disk_len = ordered->disk_len;
+	new->compress_type = ordered->compress_type;
+	new->root = RB_ROOT;
+	INIT_LIST_HEAD(&new->head);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto out_kfree;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = new->file_pos;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out_free_path;
+	if (ret > 0 && path->slots[0] > 0)
+		path->slots[0]--;
+
+	/* find out all the old extents for the file range */
+	while (1) {
+		struct btrfs_file_extent_item *extent;
+		struct extent_buffer *l;
+		int slot;
+		u64 num_bytes;
+		u64 offset;
+		u64 end;
+		u64 disk_bytenr;
+		u64 extent_offset;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out_free_path;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid != btrfs_ino(inode))
+			break;
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+		if (key.offset >= new->file_pos + new->len)
+			break;
+
+		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
+
+		num_bytes = btrfs_file_extent_num_bytes(l, extent);
+		if (key.offset + num_bytes < new->file_pos)
+			goto next;
+
+		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
+		if (!disk_bytenr)
+			goto next;
+
+		extent_offset = btrfs_file_extent_offset(l, extent);
+
+		old = kmalloc(sizeof(*old), GFP_NOFS);
+		if (!old)
+			goto out_free_path;
+
+		offset = max(new->file_pos, key.offset);
+		end = min(new->file_pos + new->len, key.offset + num_bytes);
+
+		old->bytenr = disk_bytenr;
+		old->extent_offset = extent_offset;
+		old->offset = offset - key.offset;
+		old->len = end - offset;
+		old->new = new;
+		old->count = 0;
+		list_add_tail(&old->list, &new->head);
+next:
+		path->slots[0]++;
+		cond_resched();
+	}
+
+	btrfs_free_path(path);
+	atomic_inc(&root->fs_info->defrag_running);
+
+	return new;
+
+out_free_path:
+	btrfs_free_path(path);
+out_kfree:
+	free_sa_defrag_extent(new);
+	return NULL;
+}
+
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+					 u64 start, u64 len)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_block_group(root->fs_info, start);
+	ASSERT(cache);
+
+	spin_lock(&cache->lock);
+	cache->delalloc_bytes -= len;
+	spin_unlock(&cache->lock);
+
+	btrfs_put_block_group(cache);
+}
+
+/* as ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ */
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+{
+	struct inode *inode = ordered_extent->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans = NULL;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_state *cached_state = NULL;
+	struct new_sa_defrag_extent *new = NULL;
+	int compress_type = 0;
+	int ret = 0;
+	u64 logical_len = ordered_extent->len;
+	bool nolock;
+	bool truncated = false;
+
+	nolock = btrfs_is_free_space_inode(inode);
+
+	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+				     ordered_extent->file_offset +
+				     ordered_extent->len - 1);
+
+	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+		truncated = true;
+		logical_len = ordered_extent->truncated_len;
+		/* Truncated the entire extent, don't bother adding */
+		if (!logical_len)
+			goto out;
+	}
+
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+		if (nolock)
+			trans = btrfs_join_transaction_nolock(root);
+		else
+			trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
+		}
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+		ret = btrfs_update_inode_fallback(trans, root, inode);
+		if (ret) /* -ENOMEM or corruption */
+			btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
+
+	lock_extent_bits(io_tree, ordered_extent->file_offset,
+			 ordered_extent->file_offset + ordered_extent->len - 1,
+			 0, &cached_state);
+
+	ret = test_range_bit(io_tree, ordered_extent->file_offset,
+			ordered_extent->file_offset + ordered_extent->len - 1,
+			EXTENT_DEFRAG, 1, cached_state);
+	if (ret) {
+		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
+			/* the inode is shared */
+			new = record_old_file_extents(inode, ordered_extent);
+
+		clear_extent_bit(io_tree, ordered_extent->file_offset,
+			ordered_extent->file_offset + ordered_extent->len - 1,
+			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+	}
+
+	if (nolock)
+		trans = btrfs_join_transaction_nolock(root);
+	else
+		trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out_unlock;
+	}
+
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+		compress_type = ordered_extent->compress_type;
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+		BUG_ON(compress_type);
+		ret = btrfs_mark_extent_written(trans, inode,
+						ordered_extent->file_offset,
+						ordered_extent->file_offset +
+						logical_len);
+	} else {
+		BUG_ON(root == root->fs_info->tree_root);
+		ret = insert_reserved_file_extent(trans, inode,
+						ordered_extent->file_offset,
+						ordered_extent->start,
+						ordered_extent->disk_len,
+						logical_len, logical_len,
+						compress_type, 0, 0,
+						BTRFS_FILE_EXTENT_REG);
+		if (!ret)
+			btrfs_release_delalloc_bytes(root,
+						     ordered_extent->start,
+						     ordered_extent->disk_len);
+	}
+	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+			   ordered_extent->file_offset, ordered_extent->len,
+			   trans->transid);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_unlock;
+	}
+
+	add_pending_csums(trans, inode, ordered_extent->file_offset,
+			  &ordered_extent->list);
+
+	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	ret = btrfs_update_inode_fallback(trans, root, inode);
+	if (ret) { /* -ENOMEM or corruption */
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_unlock;
+	}
+	ret = 0;
+out_unlock:
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+out:
+	if (root != root->fs_info->tree_root)
+		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+	if (trans)
+		btrfs_end_transaction(trans, root);
+
+	if (ret || truncated) {
+		u64 start, end;
+
+		if (truncated)
+			start = ordered_extent->file_offset + logical_len;
+		else
+			start = ordered_extent->file_offset;
+		end = ordered_extent->file_offset + ordered_extent->len - 1;
+		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
+
+		/* Drop the cache for the part of the extent we didn't write. */
+		btrfs_drop_extent_cache(inode, start, end, 0);
+
+		/*
+		 * If the ordered extent had an IOERR or something else went
+		 * wrong we need to return the space for this ordered extent
+		 * back to the allocator.  We only free the extent in the
+		 * truncated case if we didn't write out the extent at all.
+		 */
+		if ((ret || !logical_len) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+			btrfs_free_reserved_extent(root, ordered_extent->start,
+						   ordered_extent->disk_len, 1);
+	}
+
+
+	/*
+	 * This needs to be done to make sure anybody waiting knows we are done
+	 * updating everything for this ordered extent.
+	 */
+	btrfs_remove_ordered_extent(inode, ordered_extent);
+
+	/* for snapshot-aware defrag */
+	if (new) {
+		if (ret) {
+			free_sa_defrag_extent(new);
+			atomic_dec(&root->fs_info->defrag_running);
+		} else {
+			relink_file_extents(new);
+		}
+	}
+
+	/* once for us */
+	btrfs_put_ordered_extent(ordered_extent);
+	/* once for the tree */
+	btrfs_put_ordered_extent(ordered_extent);
+
+	return ret;
+}
+
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+	struct btrfs_ordered_extent *ordered_extent;
+	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+	btrfs_finish_ordered_io(ordered_extent);
+}
+
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+				struct extent_state *state, int uptodate)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ordered_extent *ordered_extent = NULL;
+	struct btrfs_workqueue *wq;
+	btrfs_work_func_t func;
+
+	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
+
+	ClearPagePrivate2(page);
+	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+					    end - start + 1, uptodate))
+		return 0;
+
+	if (btrfs_is_free_space_inode(inode)) {
+		wq = root->fs_info->endio_freespace_worker;
+		func = btrfs_freespace_write_helper;
+	} else {
+		wq = root->fs_info->endio_write_workers;
+		func = btrfs_endio_write_helper;
+	}
+
+	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+			NULL);
+	btrfs_queue_work(wq, &ordered_extent->work);
+
+	return 0;
+}
+
+static int __readpage_endio_check(struct inode *inode,
+				  struct btrfs_io_bio *io_bio,
+				  int icsum, struct page *page,
+				  int pgoff, u64 start, size_t len)
+{
+	char *kaddr;
+	u32 csum_expected;
+	u32 csum = ~(u32)0;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+	kaddr = kmap_atomic(page);
+	csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
+	btrfs_csum_final(csum, (char *)&csum);
+	if (csum != csum_expected)
+		goto zeroit;
+
+	kunmap_atomic(kaddr);
+	return 0;
+zeroit:
+	if (__ratelimit(&_rs))
+		btrfs_warn(BTRFS_I(inode)->root->fs_info,
+			   "csum failed ino %llu off %llu csum %u expected csum %u",
+			   btrfs_ino(inode), start, csum, csum_expected);
+	memset(kaddr + pgoff, 1, len);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+	if (csum_expected == 0)
+		return 0;
+	return -EIO;
+}
+
+/*
+ * when reads are done, we need to check csums to verify the data is correct
+ * if there's a match, we allow the bio to finish.  If not, the code in
+ * extent_io.c will try to find good copies for us.
+ */
+static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+				      u64 phy_offset, struct page *page,
+				      u64 start, u64 end, int mirror)
+{
+	size_t offset = start - page_offset(page);
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (PageChecked(page)) {
+		ClearPageChecked(page);
+		return 0;
+	}
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+		return 0;
+
+	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
+		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
+				  GFP_NOFS);
+		return 0;
+	}
+
+	phy_offset >>= inode->i_sb->s_blocksize_bits;
+	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+				      start, (size_t)(end - start + 1));
+}
+
+struct delayed_iput {
+	struct list_head list;
+	struct inode *inode;
+};
+
+/* JDM: If this is fs-wide, why can't we add a pointer to
+ * btrfs_inode instead and avoid the allocation? */
+void btrfs_add_delayed_iput(struct inode *inode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct delayed_iput *delayed;
+
+	if (atomic_add_unless(&inode->i_count, -1, 1))
+		return;
+
+	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+	delayed->inode = inode;
+
+	spin_lock(&fs_info->delayed_iput_lock);
+	list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+	spin_unlock(&fs_info->delayed_iput_lock);
+}
+
+void btrfs_run_delayed_iputs(struct btrfs_root *root)
+{
+	LIST_HEAD(list);
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct delayed_iput *delayed;
+	int empty;
+
+	spin_lock(&fs_info->delayed_iput_lock);
+	empty = list_empty(&fs_info->delayed_iputs);
+	spin_unlock(&fs_info->delayed_iput_lock);
+	if (empty)
+		return;
+
+	down_read(&fs_info->delayed_iput_sem);
+
+	spin_lock(&fs_info->delayed_iput_lock);
+	list_splice_init(&fs_info->delayed_iputs, &list);
+	spin_unlock(&fs_info->delayed_iput_lock);
+
+	while (!list_empty(&list)) {
+		delayed = list_entry(list.next, struct delayed_iput, list);
+		list_del(&delayed->list);
+		iput(delayed->inode);
+		kfree(delayed);
+	}
+
+	up_read(&root->fs_info->delayed_iput_sem);
+}
+
+/*
+ * This is called in transaction commit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root)
+{
+	struct btrfs_block_rsv *block_rsv;
+	int ret;
+
+	if (atomic_read(&root->orphan_inodes) ||
+	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+		return;
+
+	spin_lock(&root->orphan_lock);
+	if (atomic_read(&root->orphan_inodes)) {
+		spin_unlock(&root->orphan_lock);
+		return;
+	}
+
+	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+		spin_unlock(&root->orphan_lock);
+		return;
+	}
+
+	block_rsv = root->orphan_block_rsv;
+	root->orphan_block_rsv = NULL;
+	spin_unlock(&root->orphan_lock);
+
+	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
+	    btrfs_root_refs(&root->root_item) > 0) {
+		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+					    root->root_key.objectid);
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
+		else
+			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+				  &root->state);
+	}
+
+	if (block_rsv) {
+		WARN_ON(block_rsv->size > 0);
+		btrfs_free_block_rsv(root, block_rsv);
+	}
+}
+
+/*
+ * This creates an orphan entry for the given inode in case something goes
+ * wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ *	 this function.
+ */
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *block_rsv = NULL;
+	int reserve = 0;
+	int insert = 0;
+	int ret;
+
+	if (!root->orphan_block_rsv) {
+		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+		if (!block_rsv)
+			return -ENOMEM;
+	}
+
+	spin_lock(&root->orphan_lock);
+	if (!root->orphan_block_rsv) {
+		root->orphan_block_rsv = block_rsv;
+	} else if (block_rsv) {
+		btrfs_free_block_rsv(root, block_rsv);
+		block_rsv = NULL;
+	}
+
+	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+			      &BTRFS_I(inode)->runtime_flags)) {
+#if 0
+		/*
+		 * For proper ENOSPC handling, we should do orphan
+		 * cleanup when mounting. But this introduces backward
+		 * compatibility issue.
+		 */
+		if (!xchg(&root->orphan_item_inserted, 1))
+			insert = 2;
+		else
+			insert = 1;
+#endif
+		insert = 1;
+		atomic_inc(&root->orphan_inodes);
+	}
+
+	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+			      &BTRFS_I(inode)->runtime_flags))
+		reserve = 1;
+	spin_unlock(&root->orphan_lock);
+
+	/* grab metadata reservation from transaction handle */
+	if (reserve) {
+		ret = btrfs_orphan_reserve_metadata(trans, inode);
+		BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
+	}
+
+	/* insert an orphan item to track this unlinked/truncated file */
+	if (insert >= 1) {
+		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
+		if (ret) {
+			atomic_dec(&root->orphan_inodes);
+			if (reserve) {
+				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+					  &BTRFS_I(inode)->runtime_flags);
+				btrfs_orphan_release_metadata(inode);
+			}
+			if (ret != -EEXIST) {
+				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+					  &BTRFS_I(inode)->runtime_flags);
+				btrfs_abort_transaction(trans, root, ret);
+				return ret;
+			}
+		}
+		ret = 0;
+	}
+
+	/* insert an orphan item to track subvolume contains orphan files */
+	if (insert >= 2) {
+		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+					       root->root_key.objectid);
+		if (ret && ret != -EEXIST) {
+			btrfs_abort_transaction(trans, root, ret);
+			return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * We have done the truncate/delete so we can go ahead and remove the orphan
+ * item for this particular inode.
+ */
+static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
+			    struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int delete_item = 0;
+	int release_rsv = 0;
+	int ret = 0;
+
+	spin_lock(&root->orphan_lock);
+	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+			       &BTRFS_I(inode)->runtime_flags))
+		delete_item = 1;
+
+	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+			       &BTRFS_I(inode)->runtime_flags))
+		release_rsv = 1;
+	spin_unlock(&root->orphan_lock);
+
+	if (delete_item) {
+		atomic_dec(&root->orphan_inodes);
+		if (trans)
+			ret = btrfs_del_orphan_item(trans, root,
+						    btrfs_ino(inode));
+	}
+
+	if (release_rsv)
+		btrfs_orphan_release_metadata(inode);
+
+	return ret;
+}
+
+/*
+ * this cleans up any orphans that may be left on the list from the last use
+ * of this root.
+ */
+int btrfs_orphan_cleanup(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key, found_key;
+	struct btrfs_trans_handle *trans;
+	struct inode *inode;
+	u64 last_objectid = 0;
+	int ret = 0, nr_unlink = 0, nr_truncate = 0;
+
+	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	path->reada = -1;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		/*
+		 * if ret == 0 means we found what we were searching for, which
+		 * is weird, but possible, so only screw with path if we didn't
+		 * find the key and see if we have stuff that matches
+		 */
+		if (ret > 0) {
+			ret = 0;
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+
+		/* pull out the item */
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		/* make sure the item matches what we want */
+		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
+			break;
+		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
+			break;
+
+		/* release the path since we're done with it */
+		btrfs_release_path(path);
+
+		/*
+		 * this is where we are basically btrfs_lookup, without the
+		 * crossing root thing.  we store the inode number in the
+		 * offset of the orphan item.
+		 */
+
+		if (found_key.offset == last_objectid) {
+			btrfs_err(root->fs_info,
+				"Error removing orphan entry, stopping orphan cleanup");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		last_objectid = found_key.offset;
+
+		found_key.objectid = found_key.offset;
+		found_key.type = BTRFS_INODE_ITEM_KEY;
+		found_key.offset = 0;
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
+		ret = PTR_ERR_OR_ZERO(inode);
+		if (ret && ret != -ESTALE)
+			goto out;
+
+		if (ret == -ESTALE && root == root->fs_info->tree_root) {
+			struct btrfs_root *dead_root;
+			struct btrfs_fs_info *fs_info = root->fs_info;
+			int is_dead_root = 0;
+
+			/*
+			 * this is an orphan in the tree root. Currently these
+			 * could come from 2 sources:
+			 *  a) a snapshot deletion in progress
+			 *  b) a free space cache inode
+			 * We need to distinguish those two, as the snapshot
+			 * orphan must not get deleted.
+			 * find_dead_roots already ran before us, so if this
+			 * is a snapshot deletion, we should find the root
+			 * in the dead_roots list
+			 */
+			spin_lock(&fs_info->trans_lock);
+			list_for_each_entry(dead_root, &fs_info->dead_roots,
+					    root_list) {
+				if (dead_root->root_key.objectid ==
+				    found_key.objectid) {
+					is_dead_root = 1;
+					break;
+				}
+			}
+			spin_unlock(&fs_info->trans_lock);
+			if (is_dead_root) {
+				/* prevent this orphan from being found again */
+				key.offset = found_key.objectid - 1;
+				continue;
+			}
+		}
+		/*
+		 * Inode is already gone but the orphan item is still there,
+		 * kill the orphan item.
+		 */
+		if (ret == -ESTALE) {
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+			btrfs_debug(root->fs_info, "auto deleting %Lu",
+				found_key.objectid);
+			ret = btrfs_del_orphan_item(trans, root,
+						    found_key.objectid);
+			btrfs_end_transaction(trans, root);
+			if (ret)
+				goto out;
+			continue;
+		}
+
+		/*
+		 * add this inode to the orphan list so btrfs_orphan_del does
+		 * the proper thing when we hit it
+		 */
+		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+			&BTRFS_I(inode)->runtime_flags);
+		atomic_inc(&root->orphan_inodes);
+
+		/* if we have links, this was a truncate, lets do that */
+		if (inode->i_nlink) {
+			if (WARN_ON(!S_ISREG(inode->i_mode))) {
+				iput(inode);
+				continue;
+			}
+			nr_truncate++;
+
+			/* 1 for the orphan item deletion. */
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				iput(inode);
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+			ret = btrfs_orphan_add(trans, inode);
+			btrfs_end_transaction(trans, root);
+			if (ret) {
+				iput(inode);
+				goto out;
+			}
+
+			ret = btrfs_truncate(inode);
+			if (ret)
+				btrfs_orphan_del(NULL, inode);
+		} else {
+			nr_unlink++;
+		}
+
+		/* this will do delete_inode and everything for us */
+		iput(inode);
+		if (ret)
+			goto out;
+	}
+	/* release the path since we're done with it */
+	btrfs_release_path(path);
+
+	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+	if (root->orphan_block_rsv)
+		btrfs_block_rsv_release(root, root->orphan_block_rsv,
+					(u64)-1);
+
+	if (root->orphan_block_rsv ||
+	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
+		trans = btrfs_join_transaction(root);
+		if (!IS_ERR(trans))
+			btrfs_end_transaction(trans, root);
+	}
+
+	if (nr_unlink)
+		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
+	if (nr_truncate)
+		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
+
+out:
+	if (ret)
+		btrfs_err(root->fs_info,
+			"could not do orphan cleanup %d", ret);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * very simple check to peek ahead in the leaf looking for xattrs.  If we
+ * don't find any xattrs, we know there can't be any acls.
+ *
+ * slot is the slot the inode is in, objectid is the objectid of the inode
+ */
+static noinline int acls_after_inode_item(struct extent_buffer *leaf,
+					  int slot, u64 objectid,
+					  int *first_xattr_slot)
+{
+	u32 nritems = btrfs_header_nritems(leaf);
+	struct btrfs_key found_key;
+	static u64 xattr_access = 0;
+	static u64 xattr_default = 0;
+	int scanned = 0;
+
+	if (!xattr_access) {
+		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+					strlen(POSIX_ACL_XATTR_ACCESS));
+		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+					strlen(POSIX_ACL_XATTR_DEFAULT));
+	}
+
+	slot++;
+	*first_xattr_slot = -1;
+	while (slot < nritems) {
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/* we found a different objectid, there must not be acls */
+		if (found_key.objectid != objectid)
+			return 0;
+
+		/* we found an xattr, assume we've got an acl */
+		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (*first_xattr_slot == -1)
+				*first_xattr_slot = slot;
+			if (found_key.offset == xattr_access ||
+			    found_key.offset == xattr_default)
+				return 1;
+		}
+
+		/*
+		 * we found a key greater than an xattr key, there can't
+		 * be any acls later on
+		 */
+		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+			return 0;
+
+		slot++;
+		scanned++;
+
+		/*
+		 * it goes inode, inode backrefs, xattrs, extents,
+		 * so if there are a ton of hard links to an inode there can
+		 * be a lot of backrefs.  Don't waste time searching too hard,
+		 * this is just an optimization
+		 */
+		if (scanned >= 8)
+			break;
+	}
+	/* we hit the end of the leaf before we found an xattr or
+	 * something larger than an xattr.  We have to assume the inode
+	 * has acls
+	 */
+	if (*first_xattr_slot == -1)
+		*first_xattr_slot = slot;
+	return 1;
+}
+
+/*
+ * read an inode from the btree into the in-memory inode
+ */
+static void btrfs_read_locked_inode(struct inode *inode)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key location;
+	unsigned long ptr;
+	int maybe_acls;
+	u32 rdev;
+	int ret;
+	bool filled = false;
+	int first_xattr_slot;
+
+	ret = btrfs_fill_inode(inode, &rdev);
+	if (!ret)
+		filled = true;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto make_bad;
+
+	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
+	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
+	if (ret)
+		goto make_bad;
+
+	leaf = path->nodes[0];
+
+	if (filled)
+		goto cache_index;
+
+	inode_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_inode_item);
+	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
+	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
+	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
+	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+
+	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
+	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+
+	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
+	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
+
+	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
+	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
+
+	BTRFS_I(inode)->i_otime.tv_sec =
+		btrfs_timespec_sec(leaf, &inode_item->otime);
+	BTRFS_I(inode)->i_otime.tv_nsec =
+		btrfs_timespec_nsec(leaf, &inode_item->otime);
+
+	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
+	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+	inode->i_generation = BTRFS_I(inode)->generation;
+	inode->i_rdev = 0;
+	rdev = btrfs_inode_rdev(leaf, inode_item);
+
+	BTRFS_I(inode)->index_cnt = (u64)-1;
+	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
+	/*
+	 * If we were modified in the current generation and evicted from memory
+	 * and then re-read we need to do a full sync since we don't have any
+	 * idea about which extents were modified before we were evicted from
+	 * cache.
+	 *
+	 * This is required for both inode re-read from disk and delayed inode
+	 * in delayed_nodes_tree.
+	 */
+	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
+	path->slots[0]++;
+	if (inode->i_nlink != 1 ||
+	    path->slots[0] >= btrfs_header_nritems(leaf))
+		goto cache_acl;
+
+	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
+	if (location.objectid != btrfs_ino(inode))
+		goto cache_acl;
+
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	if (location.type == BTRFS_INODE_REF_KEY) {
+		struct btrfs_inode_ref *ref;
+
+		ref = (struct btrfs_inode_ref *)ptr;
+		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
+		struct btrfs_inode_extref *extref;
+
+		extref = (struct btrfs_inode_extref *)ptr;
+		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
+								     extref);
+	}
+cache_acl:
+	/*
+	 * try to precache a NULL acl entry for files that don't have
+	 * any xattrs or acls
+	 */
+	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
+					   btrfs_ino(inode), &first_xattr_slot);
+	if (first_xattr_slot != -1) {
+		path->slots[0] = first_xattr_slot;
+		ret = btrfs_load_inode_props(inode, path);
+		if (ret)
+			btrfs_err(root->fs_info,
+				  "error loading props for ino %llu (root %llu): %d",
+				  btrfs_ino(inode),
+				  root->root_key.objectid, ret);
+	}
+	btrfs_free_path(path);
+
+	if (!maybe_acls)
+		cache_no_acl(inode);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_mapping->a_ops = &btrfs_aops;
+		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+		inode->i_fop = &btrfs_file_operations;
+		inode->i_op = &btrfs_file_inode_operations;
+		break;
+	case S_IFDIR:
+		inode->i_fop = &btrfs_dir_file_operations;
+		if (root == root->fs_info->tree_root)
+			inode->i_op = &btrfs_dir_ro_inode_operations;
+		else
+			inode->i_op = &btrfs_dir_inode_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &btrfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &btrfs_symlink_aops;
+		break;
+	default:
+		inode->i_op = &btrfs_special_inode_operations;
+		init_special_inode(inode, inode->i_mode, rdev);
+		break;
+	}
+
+	btrfs_update_iflags(inode);
+	return;
+
+make_bad:
+	btrfs_free_path(path);
+	make_bad_inode(inode);
+}
+
+/*
+ * given a leaf and an inode, copy the inode fields into the leaf
+ */
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *leaf,
+			    struct btrfs_inode_item *item,
+			    struct inode *inode)
+{
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+				   &token);
+	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+	btrfs_set_token_timespec_sec(leaf, &item->atime,
+				     inode->i_atime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, &item->atime,
+				      inode->i_atime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, &item->mtime,
+				     inode->i_mtime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
+				      inode->i_mtime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, &item->ctime,
+				     inode->i_ctime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
+				      inode->i_ctime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, &item->otime,
+				     BTRFS_I(inode)->i_otime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, &item->otime,
+				      BTRFS_I(inode)->i_otime.tv_nsec, &token);
+
+	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+				     &token);
+	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+					 &token);
+	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+}
+
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
+{
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
+				 1);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		goto failed;
+	}
+
+	leaf = path->nodes[0];
+	inode_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_inode_item);
+
+	fill_inode_item(trans, leaf, inode_item, inode);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_set_inode_last_trans(trans, inode);
+	ret = 0;
+failed:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
+{
+	int ret;
+
+	/*
+	 * If the inode is a free space inode, we can deadlock during commit
+	 * if we put it into the delayed code.
+	 *
+	 * The data relocation inode should also be directly updated
+	 * without delay
+	 */
+	if (!btrfs_is_free_space_inode(inode)
+	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+	    && !root->fs_info->log_root_recovering) {
+		btrfs_update_root_times(trans, root);
+
+		ret = btrfs_delayed_update_inode(trans, root, inode);
+		if (!ret)
+			btrfs_set_inode_last_trans(trans, inode);
+		return ret;
+	}
+
+	return btrfs_update_inode_item(trans, root, inode);
+}
+
+noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct inode *inode)
+{
+	int ret;
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret == -ENOSPC)
+		return btrfs_update_inode_item(trans, root, inode);
+	return ret;
+}
+
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code.  It remove a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ */
+static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *dir, struct inode *inode,
+				const char *name, int name_len)
+{
+	struct btrfs_path *path;
+	int ret = 0;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u64 index;
+	u64 ino = btrfs_ino(inode);
+	u64 dir_ino = btrfs_ino(dir);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	path->leave_spinning = 1;
+	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
+				    name, name_len, -1);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto err;
+	}
+	if (!di) {
+		ret = -ENOENT;
+		goto err;
+	}
+	leaf = path->nodes[0];
+	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	if (ret)
+		goto err;
+	btrfs_release_path(path);
+
+	/*
+	 * If we don't have dir index, we have to get it by looking up
+	 * the inode ref, since we get the inode ref, remove it directly,
+	 * it is unnecessary to do delayed deletion.
+	 *
+	 * But if we have dir index, needn't search inode ref to get it.
+	 * Since the inode ref is close to the inode item, it is better
+	 * that we delay to delete it, and just do this deletion when
+	 * we update the inode item.
+	 */
+	if (BTRFS_I(inode)->dir_index) {
+		ret = btrfs_delayed_delete_inode_ref(inode);
+		if (!ret) {
+			index = BTRFS_I(inode)->dir_index;
+			goto skip_backref;
+		}
+	}
+
+	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
+				  dir_ino, &index);
+	if (ret) {
+		btrfs_info(root->fs_info,
+			"failed to delete reference to %.*s, inode %llu parent %llu",
+			name_len, name, ino, dir_ino);
+		btrfs_abort_transaction(trans, root, ret);
+		goto err;
+	}
+skip_backref:
+	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto err;
+	}
+
+	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
+					 inode, dir_ino);
+	if (ret != 0 && ret != -ENOENT) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto err;
+	}
+
+	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
+					   dir, index);
+	if (ret == -ENOENT)
+		ret = 0;
+	else if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+err:
+	btrfs_free_path(path);
+	if (ret)
+		goto out;
+
+	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+	inode_inc_iversion(inode);
+	inode_inc_iversion(dir);
+	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, root, dir);
+out:
+	return ret;
+}
+
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct inode *dir, struct inode *inode,
+		       const char *name, int name_len)
+{
+	int ret;
+	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+	if (!ret) {
+		drop_nlink(inode);
+		ret = btrfs_update_inode(trans, root, inode);
+	}
+	return ret;
+}
+
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	int ret;
+
+	/*
+	 * 1 for the possible orphan item
+	 * 1 for the dir item
+	 * 1 for the dir index
+	 * 1 for the inode ref
+	 * 1 for the inode
+	 */
+	trans = btrfs_start_transaction(root, 5);
+	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+		return trans;
+
+	if (PTR_ERR(trans) == -ENOSPC) {
+		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return trans;
+		ret = btrfs_cond_migrate_bytes(root->fs_info,
+					       &root->fs_info->trans_block_rsv,
+					       num_bytes, 5);
+		if (ret) {
+			btrfs_end_transaction(trans, root);
+			return ERR_PTR(ret);
+		}
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		trans->bytes_reserved = num_bytes;
+	}
+	return trans;
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_trans_handle *trans;
+	struct inode *inode = d_inode(dentry);
+	int ret;
+
+	trans = __unlink_start_trans(dir);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0);
+
+	ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
+				 dentry->d_name.name, dentry->d_name.len);
+	if (ret)
+		goto out;
+
+	if (inode->i_nlink == 0) {
+		ret = btrfs_orphan_add(trans, inode);
+		if (ret)
+			goto out;
+	}
+
+out:
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
+	return ret;
+}
+
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct inode *dir, u64 objectid,
+			const char *name, int name_len)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u64 index;
+	int ret;
+	u64 dir_ino = btrfs_ino(dir);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
+				   name, name_len, -1);
+	if (IS_ERR_OR_NULL(di)) {
+		if (!di)
+			ret = -ENOENT;
+		else
+			ret = PTR_ERR(di);
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+				 objectid, root->root_key.objectid,
+				 dir_ino, &index, name, name_len);
+	if (ret < 0) {
+		if (ret != -ENOENT) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+		di = btrfs_search_dir_index_item(root, path, dir_ino,
+						 name, name_len);
+		if (IS_ERR_OR_NULL(di)) {
+			if (!di)
+				ret = -ENOENT;
+			else
+				ret = PTR_ERR(di);
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		btrfs_release_path(path);
+		index = key.offset;
+	}
+	btrfs_release_path(path);
+
+	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
+
+	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+	inode_inc_iversion(dir);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode_fallback(trans, root, dir);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	int err = 0;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_trans_handle *trans;
+
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+		return -ENOTEMPTY;
+	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+		return -EPERM;
+
+	trans = __unlink_start_trans(dir);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+		err = btrfs_unlink_subvol(trans, root, dir,
+					  BTRFS_I(inode)->location.objectid,
+					  dentry->d_name.name,
+					  dentry->d_name.len);
+		goto out;
+	}
+
+	err = btrfs_orphan_add(trans, inode);
+	if (err)
+		goto out;
+
+	/* now the directory is empty */
+	err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
+				 dentry->d_name.name, dentry->d_name.len);
+	if (!err)
+		btrfs_i_size_write(inode, 0);
+out:
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
+
+	return err;
+}
+
+static int truncate_space_check(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytes_deleted)
+{
+	int ret;
+
+	bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
+	ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
+				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
+	if (!ret)
+		trans->bytes_reserved += bytes_deleted;
+	return ret;
+
+}
+
+/*
+ * this can truncate away extent items, csum items and directory items.
+ * It starts at a high offset and removes keys until it can't find
+ * any higher than new_size
+ *
+ * csum items that cross the new i_size are truncated to the new size
+ * as well.
+ *
+ * min_type is the minimum key type to truncate down to.  If set to 0, this
+ * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ */
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct inode *inode,
+			       u64 new_size, u32 min_type)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u64 extent_start = 0;
+	u64 extent_num_bytes = 0;
+	u64 extent_offset = 0;
+	u64 item_end = 0;
+	u64 last_size = (u64)-1;
+	u32 found_type = (u8)-1;
+	int found_extent;
+	int del_item;
+	int pending_del_nr = 0;
+	int pending_del_slot = 0;
+	int extent_type = -1;
+	int ret;
+	int err = 0;
+	u64 ino = btrfs_ino(inode);
+	u64 bytes_deleted = 0;
+	bool be_nice = 0;
+	bool should_throttle = 0;
+	bool should_end = 0;
+
+	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+
+	/*
+	 * for non-free space inodes and ref cows, we want to back off from
+	 * time to time
+	 */
+	if (!btrfs_is_free_space_inode(inode) &&
+	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		be_nice = 1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = -1;
+
+	/*
+	 * We want to drop from the next block forward in case this new size is
+	 * not block aligned since we will be keeping the last block of the
+	 * extent just the way it is.
+	 */
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+	    root == root->fs_info->tree_root)
+		btrfs_drop_extent_cache(inode, ALIGN(new_size,
+					root->sectorsize), (u64)-1, 0);
+
+	/*
+	 * This function is also used to drop the items in the log tree before
+	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
+	 * it is used to drop the loged items. So we shouldn't kill the delayed
+	 * items.
+	 */
+	if (min_type == 0 && root == BTRFS_I(inode)->root)
+		btrfs_kill_delayed_inode_items(inode);
+
+	key.objectid = ino;
+	key.offset = (u64)-1;
+	key.type = (u8)-1;
+
+search_again:
+	/*
+	 * with a 16K leaf size and 128MB extents, you can actually queue
+	 * up a huge file in a single leaf.  Most of the time that
+	 * bytes_deleted is > 0, it will be huge by the time we get here
+	 */
+	if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+		if (btrfs_should_end_transaction(trans, root)) {
+			err = -EAGAIN;
+			goto error;
+		}
+	}
+
+
+	path->leave_spinning = 1;
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	if (ret > 0) {
+		/* there are no items in the tree for us to truncate, we're
+		 * done
+		 */
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+
+	while (1) {
+		fi = NULL;
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		found_type = found_key.type;
+
+		if (found_key.objectid != ino)
+			break;
+
+		if (found_type < min_type)
+			break;
+
+		item_end = found_key.offset;
+		if (found_type == BTRFS_EXTENT_DATA_KEY) {
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			extent_type = btrfs_file_extent_type(leaf, fi);
+			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+				item_end +=
+				    btrfs_file_extent_num_bytes(leaf, fi);
+			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				item_end += btrfs_file_extent_inline_len(leaf,
+							 path->slots[0], fi);
+			}
+			item_end--;
+		}
+		if (found_type > min_type) {
+			del_item = 1;
+		} else {
+			if (item_end < new_size)
+				break;
+			if (found_key.offset >= new_size)
+				del_item = 1;
+			else
+				del_item = 0;
+		}
+		found_extent = 0;
+		/* FIXME, shrink the extent if the ref count is only 1 */
+		if (found_type != BTRFS_EXTENT_DATA_KEY)
+			goto delete;
+
+		if (del_item)
+			last_size = found_key.offset;
+		else
+			last_size = new_size;
+
+		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+			u64 num_dec;
+			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+			if (!del_item) {
+				u64 orig_num_bytes =
+					btrfs_file_extent_num_bytes(leaf, fi);
+				extent_num_bytes = ALIGN(new_size -
+						found_key.offset,
+						root->sectorsize);
+				btrfs_set_file_extent_num_bytes(leaf, fi,
+							 extent_num_bytes);
+				num_dec = (orig_num_bytes -
+					   extent_num_bytes);
+				if (test_bit(BTRFS_ROOT_REF_COWS,
+					     &root->state) &&
+				    extent_start != 0)
+					inode_sub_bytes(inode, num_dec);
+				btrfs_mark_buffer_dirty(leaf);
+			} else {
+				extent_num_bytes =
+					btrfs_file_extent_disk_num_bytes(leaf,
+									 fi);
+				extent_offset = found_key.offset -
+					btrfs_file_extent_offset(leaf, fi);
+
+				/* FIXME blocksize != 4096 */
+				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+				if (extent_start != 0) {
+					found_extent = 1;
+					if (test_bit(BTRFS_ROOT_REF_COWS,
+						     &root->state))
+						inode_sub_bytes(inode, num_dec);
+				}
+			}
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			/*
+			 * we can't truncate inline items that have had
+			 * special encodings
+			 */
+			if (!del_item &&
+			    btrfs_file_extent_compression(leaf, fi) == 0 &&
+			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
+			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
+				u32 size = new_size - found_key.offset;
+
+				if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+					inode_sub_bytes(inode, item_end + 1 -
+							new_size);
+
+				/*
+				 * update the ram bytes to properly reflect
+				 * the new size of our item
+				 */
+				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+				size =
+				    btrfs_file_extent_calc_inline_size(size);
+				btrfs_truncate_item(root, path, size, 1);
+			} else if (test_bit(BTRFS_ROOT_REF_COWS,
+					    &root->state)) {
+				inode_sub_bytes(inode, item_end + 1 -
+						found_key.offset);
+			}
+		}
+delete:
+		if (del_item) {
+			if (!pending_del_nr) {
+				/* no pending yet, add ourselves */
+				pending_del_slot = path->slots[0];
+				pending_del_nr = 1;
+			} else if (pending_del_nr &&
+				   path->slots[0] + 1 == pending_del_slot) {
+				/* hop on the pending chunk */
+				pending_del_nr++;
+				pending_del_slot = path->slots[0];
+			} else {
+				BUG();
+			}
+		} else {
+			break;
+		}
+		should_throttle = 0;
+
+		if (found_extent &&
+		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+		     root == root->fs_info->tree_root)) {
+			btrfs_set_path_blocking(path);
+			bytes_deleted += extent_num_bytes;
+			ret = btrfs_free_extent(trans, root, extent_start,
+						extent_num_bytes, 0,
+						btrfs_header_owner(leaf),
+						ino, extent_offset, 0);
+			BUG_ON(ret);
+			if (btrfs_should_throttle_delayed_refs(trans, root))
+				btrfs_async_run_delayed_refs(root,
+					trans->delayed_ref_updates * 2, 0);
+			if (be_nice) {
+				if (truncate_space_check(trans, root,
+							 extent_num_bytes)) {
+					should_end = 1;
+				}
+				if (btrfs_should_throttle_delayed_refs(trans,
+								       root)) {
+					should_throttle = 1;
+				}
+			}
+		}
+
+		if (found_type == BTRFS_INODE_ITEM_KEY)
+			break;
+
+		if (path->slots[0] == 0 ||
+		    path->slots[0] != pending_del_slot ||
+		    should_throttle || should_end) {
+			if (pending_del_nr) {
+				ret = btrfs_del_items(trans, root, path,
+						pending_del_slot,
+						pending_del_nr);
+				if (ret) {
+					btrfs_abort_transaction(trans,
+								root, ret);
+					goto error;
+				}
+				pending_del_nr = 0;
+			}
+			btrfs_release_path(path);
+			if (should_throttle) {
+				unsigned long updates = trans->delayed_ref_updates;
+				if (updates) {
+					trans->delayed_ref_updates = 0;
+					ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+					if (ret && !err)
+						err = ret;
+				}
+			}
+			/*
+			 * if we failed to refill our space rsv, bail out
+			 * and let the transaction restart
+			 */
+			if (should_end) {
+				err = -EAGAIN;
+				goto error;
+			}
+			goto search_again;
+		} else {
+			path->slots[0]--;
+		}
+	}
+out:
+	if (pending_del_nr) {
+		ret = btrfs_del_items(trans, root, path, pending_del_slot,
+				      pending_del_nr);
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
+	}
+error:
+	if (last_size != (u64)-1 &&
+	    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+		btrfs_ordered_update_i_size(inode, last_size, NULL);
+
+	btrfs_free_path(path);
+
+	if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+		unsigned long updates = trans->delayed_ref_updates;
+		if (updates) {
+			trans->delayed_ref_updates = 0;
+			ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+			if (ret && !err)
+				err = ret;
+		}
+	}
+	return err;
+}
+
+/*
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ *	offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero.  This is used with truncate and hole punching.
+ */
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	char *kaddr;
+	u32 blocksize = root->sectorsize;
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	struct page *page;
+	gfp_t mask = btrfs_alloc_write_mask(mapping);
+	int ret = 0;
+	u64 page_start;
+	u64 page_end;
+
+	if ((offset & (blocksize - 1)) == 0 &&
+	    (!len || ((len & (blocksize - 1)) == 0)))
+		goto out;
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	if (ret)
+		goto out;
+
+again:
+	page = find_or_create_page(mapping, index, mask);
+	if (!page) {
+		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+	if (!PageUptodate(page)) {
+		ret = btrfs_readpage(NULL, page);
+		lock_page(page);
+		if (page->mapping != mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
+		if (!PageUptodate(page)) {
+			ret = -EIO;
+			goto out_unlock;
+		}
+	}
+	wait_on_page_writeback(page);
+
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
+		unlock_page(page);
+		page_cache_release(page);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			  0, 0, &cached_state, GFP_NOFS);
+
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
+	if (ret) {
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
+		goto out_unlock;
+	}
+
+	if (offset != PAGE_CACHE_SIZE) {
+		if (!len)
+			len = PAGE_CACHE_SIZE - offset;
+		kaddr = kmap(page);
+		if (front)
+			memset(kaddr, 0, offset);
+		else
+			memset(kaddr + offset, 0, len);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	ClearPageChecked(page);
+	set_page_dirty(page);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+			     GFP_NOFS);
+
+out_unlock:
+	if (ret)
+		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+			     u64 offset, u64 len)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	/*
+	 * Still need to make sure the inode looks like it's been updated so
+	 * that any holes get logged if we fsync.
+	 */
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+		BTRFS_I(inode)->last_trans = root->fs_info->generation;
+		BTRFS_I(inode)->last_sub_trans = root->log_transid;
+		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+		return 0;
+	}
+
+	/*
+	 * 1 - for the one we're dropping
+	 * 1 - for the one we're adding
+	 * 1 - for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, len, 0, len, 0, 0, 0);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	else
+		btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+/*
+ * This function puts in dummy file extents for the area we're creating a hole
+ * for.  So if we are truncating this file to a larger size we need to insert
+ * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
+ * the range between oldsize and size
+ */
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 hole_start = ALIGN(oldsize, root->sectorsize);
+	u64 block_end = ALIGN(size, root->sectorsize);
+	u64 last_byte;
+	u64 cur_offset;
+	u64 hole_size;
+	int err = 0;
+
+	/*
+	 * If our size started in the middle of a page we need to zero out the
+	 * rest of the page before we expand the i_size, otherwise we could
+	 * expose stale data.
+	 */
+	err = btrfs_truncate_page(inode, oldsize, 0, 0);
+	if (err)
+		return err;
+
+	if (size <= hole_start)
+		return 0;
+
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+				 &cached_state);
+		ordered = btrfs_lookup_ordered_range(inode, hole_start,
+						     block_end - hole_start);
+		if (!ordered)
+			break;
+		unlock_extent_cached(io_tree, hole_start, block_end - 1,
+				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	cur_offset = hole_start;
+	while (1) {
+		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+				block_end - cur_offset, 0);
+		if (IS_ERR(em)) {
+			err = PTR_ERR(em);
+			em = NULL;
+			break;
+		}
+		last_byte = min(extent_map_end(em), block_end);
+		last_byte = ALIGN(last_byte , root->sectorsize);
+		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			struct extent_map *hole_em;
+			hole_size = last_byte - cur_offset;
+
+			err = maybe_insert_hole(root, inode, cur_offset,
+						hole_size);
+			if (err)
+				break;
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + hole_size - 1, 0);
+			hole_em = alloc_extent_map();
+			if (!hole_em) {
+				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+					&BTRFS_I(inode)->runtime_flags);
+				goto next;
+			}
+			hole_em->start = cur_offset;
+			hole_em->len = hole_size;
+			hole_em->orig_start = cur_offset;
+
+			hole_em->block_start = EXTENT_MAP_HOLE;
+			hole_em->block_len = 0;
+			hole_em->orig_block_len = 0;
+			hole_em->ram_bytes = hole_size;
+			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+			hole_em->compress_type = BTRFS_COMPRESS_NONE;
+			hole_em->generation = root->fs_info->generation;
+
+			while (1) {
+				write_lock(&em_tree->lock);
+				err = add_extent_mapping(em_tree, hole_em, 1);
+				write_unlock(&em_tree->lock);
+				if (err != -EEXIST)
+					break;
+				btrfs_drop_extent_cache(inode, cur_offset,
+							cur_offset +
+							hole_size - 1, 0);
+			}
+			free_extent_map(hole_em);
+		}
+next:
+		free_extent_map(em);
+		em = NULL;
+		cur_offset = last_byte;
+		if (cur_offset >= block_end)
+			break;
+	}
+	free_extent_map(em);
+	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+			     GFP_NOFS);
+	return err;
+}
+
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+	schedule();
+	return 0;
+}
+
+static void wait_for_snapshot_creation(struct btrfs_root *root)
+{
+	while (true) {
+		int ret;
+
+		ret = btrfs_start_write_no_snapshoting(root);
+		if (ret)
+			break;
+		wait_on_atomic_t(&root->will_be_snapshoted,
+				 wait_snapshoting_atomic_t,
+				 TASK_UNINTERRUPTIBLE);
+	}
+}
+
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	loff_t oldsize = i_size_read(inode);
+	loff_t newsize = attr->ia_size;
+	int mask = attr->ia_valid;
+	int ret;
+
+	/*
+	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+	 * special case where we need to update the times despite not having
+	 * these flags set.  For all other operations the VFS set these flags
+	 * explicitly if it wants a timestamp update.
+	 */
+	if (newsize != oldsize) {
+		inode_inc_iversion(inode);
+		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+			inode->i_ctime = inode->i_mtime =
+				current_fs_time(inode->i_sb);
+	}
+
+	if (newsize > oldsize) {
+		truncate_pagecache(inode, newsize);
+		/*
+		 * Don't do an expanding truncate while snapshoting is ongoing.
+		 * This is to ensure the snapshot captures a fully consistent
+		 * state of this file - if the snapshot captures this expanding
+		 * truncation, it must capture all writes that happened before
+		 * this truncation.
+		 */
+		wait_for_snapshot_creation(root);
+		ret = btrfs_cont_expand(inode, oldsize, newsize);
+		if (ret) {
+			btrfs_end_write_no_snapshoting(root);
+			return ret;
+		}
+
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			btrfs_end_write_no_snapshoting(root);
+			return PTR_ERR(trans);
+		}
+
+		i_size_write(inode, newsize);
+		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+		ret = btrfs_update_inode(trans, root, inode);
+		btrfs_end_write_no_snapshoting(root);
+		btrfs_end_transaction(trans, root);
+	} else {
+
+		/*
+		 * We're truncating a file that used to have good data down to
+		 * zero. Make sure it gets into the ordered flush list so that
+		 * any new writes get down to disk quickly.
+		 */
+		if (newsize == 0)
+			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+				&BTRFS_I(inode)->runtime_flags);
+
+		/*
+		 * 1 for the orphan item we're going to add
+		 * 1 for the orphan item deletion.
+		 */
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		/*
+		 * We need to do this in case we fail at _any_ point during the
+		 * actual truncate.  Once we do the truncate_setsize we could
+		 * invalidate pages which forces any outstanding ordered io to
+		 * be instantly completed which will give us extents that need
+		 * to be truncated.  If we fail to get an orphan inode down we
+		 * could have left over extents that were never meant to live,
+		 * so we need to garuntee from this point on that everything
+		 * will be consistent.
+		 */
+		ret = btrfs_orphan_add(trans, inode);
+		btrfs_end_transaction(trans, root);
+		if (ret)
+			return ret;
+
+		/* we don't support swapfiles, so vmtruncate shouldn't fail */
+		truncate_setsize(inode, newsize);
+
+		/* Disable nonlocked read DIO to avoid the end less truncate */
+		btrfs_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+		btrfs_inode_resume_unlocked_dio(inode);
+
+		ret = btrfs_truncate(inode);
+		if (ret && inode->i_nlink) {
+			int err;
+
+			/*
+			 * failed to truncate, disk_i_size is only adjusted down
+			 * as we remove extents, so it should represent the true
+			 * size of the inode, so reset the in memory size and
+			 * delete our orphan entry.
+			 */
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans)) {
+				btrfs_orphan_del(NULL, inode);
+				return ret;
+			}
+			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+			err = btrfs_orphan_del(trans, inode);
+			if (err)
+				btrfs_abort_transaction(trans, root, err);
+			btrfs_end_transaction(trans, root);
+		}
+	}
+
+	return ret;
+}
+
+static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int err;
+
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		err = btrfs_setsize(inode, attr);
+		if (err)
+			return err;
+	}
+
+	if (attr->ia_valid) {
+		setattr_copy(inode, attr);
+		inode_inc_iversion(inode);
+		err = btrfs_dirty_inode(inode);
+
+		if (!err && attr->ia_valid & ATTR_MODE)
+			err = posix_acl_chmod(inode, inode->i_mode);
+	}
+
+	return err;
+}
+
+/*
+ * While truncating the inode pages during eviction, we get the VFS calling
+ * btrfs_invalidatepage() against each page of the inode. This is slow because
+ * the calls to btrfs_invalidatepage() result in a huge amount of calls to
+ * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
+ * extent_state structures over and over, wasting lots of time.
+ *
+ * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
+ * those expensive operations on a per page basis and do only the ordered io
+ * finishing, while we release here the extent_map and extent_state structures,
+ * without the excessive merging and splitting.
+ */
+static void evict_inode_truncate_pages(struct inode *inode)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
+	struct rb_node *node;
+
+	ASSERT(inode->i_state & I_FREEING);
+	truncate_inode_pages_final(&inode->i_data);
+
+	write_lock(&map_tree->lock);
+	while (!RB_EMPTY_ROOT(&map_tree->map)) {
+		struct extent_map *em;
+
+		node = rb_first(&map_tree->map);
+		em = rb_entry(node, struct extent_map, rb_node);
+		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+		remove_extent_mapping(map_tree, em);
+		free_extent_map(em);
+		if (need_resched()) {
+			write_unlock(&map_tree->lock);
+			cond_resched();
+			write_lock(&map_tree->lock);
+		}
+	}
+	write_unlock(&map_tree->lock);
+
+	spin_lock(&io_tree->lock);
+	while (!RB_EMPTY_ROOT(&io_tree->state)) {
+		struct extent_state *state;
+		struct extent_state *cached_state = NULL;
+
+		node = rb_first(&io_tree->state);
+		state = rb_entry(node, struct extent_state, rb_node);
+		atomic_inc(&state->refs);
+		spin_unlock(&io_tree->lock);
+
+		lock_extent_bits(io_tree, state->start, state->end,
+				 0, &cached_state);
+		clear_extent_bit(io_tree, state->start, state->end,
+				 EXTENT_LOCKED | EXTENT_DIRTY |
+				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 1,
+				 &cached_state, GFP_NOFS);
+		free_extent_state(state);
+
+		cond_resched();
+		spin_lock(&io_tree->lock);
+	}
+	spin_unlock(&io_tree->lock);
+}
+
+void btrfs_evict_inode(struct inode *inode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *rsv, *global_rsv;
+	int steal_from_global = 0;
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+	int ret;
+
+	trace_btrfs_inode_evict(inode);
+
+	evict_inode_truncate_pages(inode);
+
+	if (inode->i_nlink &&
+	    ((btrfs_root_refs(&root->root_item) != 0 &&
+	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+	     btrfs_is_free_space_inode(inode)))
+		goto no_delete;
+
+	if (is_bad_inode(inode)) {
+		btrfs_orphan_del(NULL, inode);
+		goto no_delete;
+	}
+	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
+
+	btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
+	if (root->fs_info->log_root_recovering) {
+		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+				 &BTRFS_I(inode)->runtime_flags));
+		goto no_delete;
+	}
+
+	if (inode->i_nlink > 0) {
+		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
+		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
+		goto no_delete;
+	}
+
+	ret = btrfs_commit_inode_delayed_inode(inode);
+	if (ret) {
+		btrfs_orphan_del(NULL, inode);
+		goto no_delete;
+	}
+
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+	if (!rsv) {
+		btrfs_orphan_del(NULL, inode);
+		goto no_delete;
+	}
+	rsv->size = min_size;
+	rsv->failfast = 1;
+	global_rsv = &root->fs_info->global_block_rsv;
+
+	btrfs_i_size_write(inode, 0);
+
+	/*
+	 * This is a bit simpler than btrfs_truncate since we've already
+	 * reserved our space for our orphan item in the unlink, so we just
+	 * need to reserve some slack space in case we add bytes and update
+	 * inode item when doing the truncate.
+	 */
+	while (1) {
+		ret = btrfs_block_rsv_refill(root, rsv, min_size,
+					     BTRFS_RESERVE_FLUSH_LIMIT);
+
+		/*
+		 * Try and steal from the global reserve since we will
+		 * likely not use this space anyway, we want to try as
+		 * hard as possible to get this to work.
+		 */
+		if (ret)
+			steal_from_global++;
+		else
+			steal_from_global = 0;
+		ret = 0;
+
+		/*
+		 * steal_from_global == 0: we reserved stuff, hooray!
+		 * steal_from_global == 1: we didn't reserve stuff, boo!
+		 * steal_from_global == 2: we've committed, still not a lot of
+		 * room but maybe we'll have room in the global reserve this
+		 * time.
+		 * steal_from_global == 3: abandon all hope!
+		 */
+		if (steal_from_global > 2) {
+			btrfs_warn(root->fs_info,
+				"Could not get space for a delete, will truncate on mount %d",
+				ret);
+			btrfs_orphan_del(NULL, inode);
+			btrfs_free_block_rsv(root, rsv);
+			goto no_delete;
+		}
+
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			btrfs_orphan_del(NULL, inode);
+			btrfs_free_block_rsv(root, rsv);
+			goto no_delete;
+		}
+
+		/*
+		 * We can't just steal from the global reserve, we need tomake
+		 * sure there is room to do it, if not we need to commit and try
+		 * again.
+		 */
+		if (steal_from_global) {
+			if (!btrfs_check_space_for_delayed_refs(trans, root))
+				ret = btrfs_block_rsv_migrate(global_rsv, rsv,
+							      min_size);
+			else
+				ret = -ENOSPC;
+		}
+
+		/*
+		 * Couldn't steal from the global reserve, we have too much
+		 * pending stuff built up, commit the transaction and try it
+		 * again.
+		 */
+		if (ret) {
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret) {
+				btrfs_orphan_del(NULL, inode);
+				btrfs_free_block_rsv(root, rsv);
+				goto no_delete;
+			}
+			continue;
+		} else {
+			steal_from_global = 0;
+		}
+
+		trans->block_rsv = rsv;
+
+		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+		if (ret != -ENOSPC && ret != -EAGAIN)
+			break;
+
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		btrfs_end_transaction(trans, root);
+		trans = NULL;
+		btrfs_btree_balance_dirty(root);
+	}
+
+	btrfs_free_block_rsv(root, rsv);
+
+	/*
+	 * Errors here aren't a big deal, it just means we leave orphan items
+	 * in the tree.  They will be cleaned up on the next mount.
+	 */
+	if (ret == 0) {
+		trans->block_rsv = root->orphan_block_rsv;
+		btrfs_orphan_del(trans, inode);
+	} else {
+		btrfs_orphan_del(NULL, inode);
+	}
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	if (!(root == root->fs_info->tree_root ||
+	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
+		btrfs_return_ino(root, btrfs_ino(inode));
+
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
+no_delete:
+	btrfs_remove_delayed_node(inode);
+	clear_inode(inode);
+	return;
+}
+
+/*
+ * this returns the key found in the dir entry in the location pointer.
+ * If no dir entries were found, location->objectid is 0.
+ */
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+			       struct btrfs_key *location)
+{
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
+				    namelen, 0);
+	if (IS_ERR(di))
+		ret = PTR_ERR(di);
+
+	if (IS_ERR_OR_NULL(di))
+		goto out_err;
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
+out:
+	btrfs_free_path(path);
+	return ret;
+out_err:
+	location->objectid = 0;
+	goto out;
+}
+
+/*
+ * when we hit a tree root in a directory, the btrfs part of the inode
+ * needs to be changed to reflect the root directory of the tree root.  This
+ * is kind of like crossing a mount point.
+ */
+static int fixup_tree_root_location(struct btrfs_root *root,
+				    struct inode *dir,
+				    struct dentry *dentry,
+				    struct btrfs_key *location,
+				    struct btrfs_root **sub_root)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *new_root;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = -ENOENT;
+	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = location->objectid;
+
+	ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
+				0, 0);
+	if (ret) {
+		if (ret < 0)
+			err = ret;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
+	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+		goto out;
+
+	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+				   (unsigned long)(ref + 1),
+				   dentry->d_name.len);
+	if (ret)
+		goto out;
+
+	btrfs_release_path(path);
+
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+	if (IS_ERR(new_root)) {
+		err = PTR_ERR(new_root);
+		goto out;
+	}
+
+	*sub_root = new_root;
+	location->objectid = btrfs_root_dirid(&new_root->root_item);
+	location->type = BTRFS_INODE_ITEM_KEY;
+	location->offset = 0;
+	err = 0;
+out:
+	btrfs_free_path(path);
+	return err;
+}
+
+static void inode_tree_add(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_inode *entry;
+	struct rb_node **p;
+	struct rb_node *parent;
+	struct rb_node *new = &BTRFS_I(inode)->rb_node;
+	u64 ino = btrfs_ino(inode);
+
+	if (inode_unhashed(inode))
+		return;
+	parent = NULL;
+	spin_lock(&root->inode_lock);
+	p = &root->inode_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_inode, rb_node);
+
+		if (ino < btrfs_ino(&entry->vfs_inode))
+			p = &parent->rb_left;
+		else if (ino > btrfs_ino(&entry->vfs_inode))
+			p = &parent->rb_right;
+		else {
+			WARN_ON(!(entry->vfs_inode.i_state &
+				  (I_WILL_FREE | I_FREEING)));
+			rb_replace_node(parent, new, &root->inode_tree);
+			RB_CLEAR_NODE(parent);
+			spin_unlock(&root->inode_lock);
+			return;
+		}
+	}
+	rb_link_node(new, parent, p);
+	rb_insert_color(new, &root->inode_tree);
+	spin_unlock(&root->inode_lock);
+}
+
+static void inode_tree_del(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int empty = 0;
+
+	spin_lock(&root->inode_lock);
+	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
+		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+		empty = RB_EMPTY_ROOT(&root->inode_tree);
+	}
+	spin_unlock(&root->inode_lock);
+
+	if (empty && btrfs_root_refs(&root->root_item) == 0) {
+		synchronize_srcu(&root->fs_info->subvol_srcu);
+		spin_lock(&root->inode_lock);
+		empty = RB_EMPTY_ROOT(&root->inode_tree);
+		spin_unlock(&root->inode_lock);
+		if (empty)
+			btrfs_add_dead_root(root);
+	}
+}
+
+void btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct rb_node *prev;
+	struct btrfs_inode *entry;
+	struct inode *inode;
+	u64 objectid = 0;
+
+	if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+	spin_lock(&root->inode_lock);
+again:
+	node = root->inode_tree.rb_node;
+	prev = NULL;
+	while (node) {
+		prev = node;
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+		if (objectid < btrfs_ino(&entry->vfs_inode))
+			node = node->rb_left;
+		else if (objectid > btrfs_ino(&entry->vfs_inode))
+			node = node->rb_right;
+		else
+			break;
+	}
+	if (!node) {
+		while (prev) {
+			entry = rb_entry(prev, struct btrfs_inode, rb_node);
+			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
+				node = prev;
+				break;
+			}
+			prev = rb_next(prev);
+		}
+	}
+	while (node) {
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+		objectid = btrfs_ino(&entry->vfs_inode) + 1;
+		inode = igrab(&entry->vfs_inode);
+		if (inode) {
+			spin_unlock(&root->inode_lock);
+			if (atomic_read(&inode->i_count) > 1)
+				d_prune_aliases(inode);
+			/*
+			 * btrfs_drop_inode will have it removed from
+			 * the inode cache when its usage count
+			 * hits zero.
+			 */
+			iput(inode);
+			cond_resched();
+			spin_lock(&root->inode_lock);
+			goto again;
+		}
+
+		if (cond_resched_lock(&root->inode_lock))
+			goto again;
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->inode_lock);
+}
+
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+	struct btrfs_iget_args *args = p;
+	inode->i_ino = args->location->objectid;
+	memcpy(&BTRFS_I(inode)->location, args->location,
+	       sizeof(*args->location));
+	BTRFS_I(inode)->root = args->root;
+	return 0;
+}
+
+static int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+	struct btrfs_iget_args *args = opaque;
+	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+		args->root == BTRFS_I(inode)->root;
+}
+
+static struct inode *btrfs_iget_locked(struct super_block *s,
+				       struct btrfs_key *location,
+				       struct btrfs_root *root)
+{
+	struct inode *inode;
+	struct btrfs_iget_args args;
+	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+
+	args.location = location;
+	args.root = root;
+
+	inode = iget5_locked(s, hashval, btrfs_find_actor,
+			     btrfs_init_locked_inode,
+			     (void *)&args);
+	return inode;
+}
+
+/* Get an inode object given its location and corresponding root.
+ * Returns in *is_new if the inode was read from disk
+ */
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+			 struct btrfs_root *root, int *new)
+{
+	struct inode *inode;
+
+	inode = btrfs_iget_locked(s, location, root);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		btrfs_read_locked_inode(inode);
+		if (!is_bad_inode(inode)) {
+			inode_tree_add(inode);
+			unlock_new_inode(inode);
+			if (new)
+				*new = 1;
+		} else {
+			unlock_new_inode(inode);
+			iput(inode);
+			inode = ERR_PTR(-ESTALE);
+		}
+	}
+
+	return inode;
+}
+
+static struct inode *new_simple_dir(struct super_block *s,
+				    struct btrfs_key *key,
+				    struct btrfs_root *root)
+{
+	struct inode *inode = new_inode(s);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	BTRFS_I(inode)->root = root;
+	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+
+	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+	inode->i_op = &btrfs_dir_ro_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+	inode->i_mtime = CURRENT_TIME;
+	inode->i_atime = inode->i_mtime;
+	inode->i_ctime = inode->i_mtime;
+	BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+	return inode;
+}
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_root *sub_root = root;
+	struct btrfs_key location;
+	int index;
+	int ret = 0;
+
+	if (dentry->d_name.len > BTRFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ret = btrfs_inode_by_name(dir, dentry, &location);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (location.objectid == 0)
+		return ERR_PTR(-ENOENT);
+
+	if (location.type == BTRFS_INODE_ITEM_KEY) {
+		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
+		return inode;
+	}
+
+	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+	index = srcu_read_lock(&root->fs_info->subvol_srcu);
+	ret = fixup_tree_root_location(root, dir, dentry,
+				       &location, &sub_root);
+	if (ret < 0) {
+		if (ret != -ENOENT)
+			inode = ERR_PTR(ret);
+		else
+			inode = new_simple_dir(dir->i_sb, &location, sub_root);
+	} else {
+		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
+	}
+	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
+	if (!IS_ERR(inode) && root != sub_root) {
+		down_read(&root->fs_info->cleanup_work_sem);
+		if (!(inode->i_sb->s_flags & MS_RDONLY))
+			ret = btrfs_orphan_cleanup(sub_root);
+		up_read(&root->fs_info->cleanup_work_sem);
+		if (ret) {
+			iput(inode);
+			inode = ERR_PTR(ret);
+		}
+	}
+
+	return inode;
+}
+
+static int btrfs_dentry_delete(const struct dentry *dentry)
+{
+	struct btrfs_root *root;
+	struct inode *inode = d_inode(dentry);
+
+	if (!inode && !IS_ROOT(dentry))
+		inode = d_inode(dentry->d_parent);
+
+	if (inode) {
+		root = BTRFS_I(inode)->root;
+		if (btrfs_root_refs(&root->root_item) == 0)
+			return 1;
+
+		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+			return 1;
+	}
+	return 0;
+}
+
+static void btrfs_dentry_release(struct dentry *dentry)
+{
+	kfree(dentry->d_fsdata);
+}
+
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct inode *inode;
+
+	inode = btrfs_lookup_dentry(dir, dentry);
+	if (IS_ERR(inode)) {
+		if (PTR_ERR(inode) == -ENOENT)
+			inode = NULL;
+		else
+			return ERR_CAST(inode);
+	}
+
+	return d_splice_alias(inode, dentry);
+}
+
+unsigned char btrfs_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_item *item;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	struct list_head ins_list;
+	struct list_head del_list;
+	int ret;
+	struct extent_buffer *leaf;
+	int slot;
+	unsigned char d_type;
+	int over = 0;
+	u32 di_cur;
+	u32 di_total;
+	u32 di_len;
+	int key_type = BTRFS_DIR_INDEX_KEY;
+	char tmp_name[32];
+	char *name_ptr;
+	int name_len;
+	int is_curr = 0;	/* ctx->pos points to the current index? */
+
+	/* FIXME, use a real flag for deciding about the key type */
+	if (root->fs_info->tree_root == root)
+		key_type = BTRFS_DIR_ITEM_KEY;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 1;
+
+	if (key_type == BTRFS_DIR_INDEX_KEY) {
+		INIT_LIST_HEAD(&ins_list);
+		INIT_LIST_HEAD(&del_list);
+		btrfs_get_delayed_items(inode, &ins_list, &del_list);
+	}
+
+	key.type = key_type;
+	key.offset = ctx->pos;
+	key.objectid = btrfs_ino(inode);
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto err;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto err;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		item = btrfs_item_nr(slot);
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.objectid != key.objectid)
+			break;
+		if (found_key.type != key_type)
+			break;
+		if (found_key.offset < ctx->pos)
+			goto next;
+		if (key_type == BTRFS_DIR_INDEX_KEY &&
+		    btrfs_should_delete_dir_index(&del_list,
+						  found_key.offset))
+			goto next;
+
+		ctx->pos = found_key.offset;
+		is_curr = 1;
+
+		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+		di_cur = 0;
+		di_total = btrfs_item_size(leaf, item);
+
+		while (di_cur < di_total) {
+			struct btrfs_key location;
+
+			if (verify_dir_item(root, leaf, di))
+				break;
+
+			name_len = btrfs_dir_name_len(leaf, di);
+			if (name_len <= sizeof(tmp_name)) {
+				name_ptr = tmp_name;
+			} else {
+				name_ptr = kmalloc(name_len, GFP_NOFS);
+				if (!name_ptr) {
+					ret = -ENOMEM;
+					goto err;
+				}
+			}
+			read_extent_buffer(leaf, name_ptr,
+					   (unsigned long)(di + 1), name_len);
+
+			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+			btrfs_dir_item_key_to_cpu(leaf, di, &location);
+
+
+			/* is this a reference to our own snapshot? If so
+			 * skip it.
+			 *
+			 * In contrast to old kernels, we insert the snapshot's
+			 * dir item and dir index after it has been created, so
+			 * we won't find a reference to our own snapshot. We
+			 * still keep the following code for backward
+			 * compatibility.
+			 */
+			if (location.type == BTRFS_ROOT_ITEM_KEY &&
+			    location.objectid == root->root_key.objectid) {
+				over = 0;
+				goto skip;
+			}
+			over = !dir_emit(ctx, name_ptr, name_len,
+				       location.objectid, d_type);
+
+skip:
+			if (name_ptr != tmp_name)
+				kfree(name_ptr);
+
+			if (over)
+				goto nopos;
+			di_len = btrfs_dir_name_len(leaf, di) +
+				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
+			di_cur += di_len;
+			di = (struct btrfs_dir_item *)((char *)di + di_len);
+		}
+next:
+		path->slots[0]++;
+	}
+
+	if (key_type == BTRFS_DIR_INDEX_KEY) {
+		if (is_curr)
+			ctx->pos++;
+		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+		if (ret)
+			goto nopos;
+	}
+
+	/* Reached end of directory/root. Bump pos past the last item. */
+	ctx->pos++;
+
+	/*
+	 * Stop new entries from being returned after we return the last
+	 * entry.
+	 *
+	 * New directory entries are assigned a strictly increasing
+	 * offset.  This means that new entries created during readdir
+	 * are *guaranteed* to be seen in the future by that readdir.
+	 * This has broken buggy programs which operate on names as
+	 * they're returned by readdir.  Until we re-use freed offsets
+	 * we have this hack to stop new entries from being returned
+	 * under the assumption that they'll never reach this huge
+	 * offset.
+	 *
+	 * This is being careful not to overflow 32bit loff_t unless the
+	 * last entry requires it because doing so has broken 32bit apps
+	 * in the past.
+	 */
+	if (key_type == BTRFS_DIR_INDEX_KEY) {
+		if (ctx->pos >= INT_MAX)
+			ctx->pos = LLONG_MAX;
+		else
+			ctx->pos = INT_MAX;
+	}
+nopos:
+	ret = 0;
+err:
+	if (key_type == BTRFS_DIR_INDEX_KEY)
+		btrfs_put_delayed_items(&ins_list, &del_list);
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+	bool nolock = false;
+
+	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
+		return 0;
+
+	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
+		nolock = true;
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		if (nolock)
+			trans = btrfs_join_transaction_nolock(root);
+		else
+			trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+		ret = btrfs_commit_transaction(trans, root);
+	}
+	return ret;
+}
+
+/*
+ * This is somewhat expensive, updating the tree every time the
+ * inode changes.  But, it is most likely to find the inode in cache.
+ * FIXME, needs more benchmarking...there are no reasons other than performance
+ * to keep or drop this code.
+ */
+static int btrfs_dirty_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
+		return 0;
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret && ret == -ENOSPC) {
+		/* whoops, lets try again with the full transaction */
+		btrfs_end_transaction(trans, root);
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		ret = btrfs_update_inode(trans, root, inode);
+	}
+	btrfs_end_transaction(trans, root);
+	if (BTRFS_I(inode)->delayed_node)
+		btrfs_balance_delayed_items(root);
+
+	return ret;
+}
+
+/*
+ * This is a copy of file_update_time.  We need this so we can return error on
+ * ENOSPC for updating the inode in the case of file write and mmap writes.
+ */
+static int btrfs_update_time(struct inode *inode, struct timespec *now,
+			     int flags)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	if (flags & S_VERSION)
+		inode_inc_iversion(inode);
+	if (flags & S_CTIME)
+		inode->i_ctime = *now;
+	if (flags & S_MTIME)
+		inode->i_mtime = *now;
+	if (flags & S_ATIME)
+		inode->i_atime = *now;
+	return btrfs_dirty_inode(inode);
+}
+
+/*
+ * find the highest existing sequence number in a directory
+ * and then set the in-memory index_cnt variable to reflect
+ * free sequence numbers
+ */
+static int btrfs_set_inode_index_count(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key key, found_key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int ret;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	/* FIXME: we should be able to handle this */
+	if (ret == 0)
+		goto out;
+	ret = 0;
+
+	/*
+	 * MAGIC NUMBER EXPLANATION:
+	 * since we search a directory based on f_pos we have to start at 2
+	 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
+	 * else has to start at 2
+	 */
+	if (path->slots[0] == 0) {
+		BTRFS_I(inode)->index_cnt = 2;
+		goto out;
+	}
+
+	path->slots[0]--;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+	if (found_key.objectid != btrfs_ino(inode) ||
+	    found_key.type != BTRFS_DIR_INDEX_KEY) {
+		BTRFS_I(inode)->index_cnt = 2;
+		goto out;
+	}
+
+	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to find a free sequence number in a given directory.  This current
+ * code is very simple, later versions will do smarter things in the btree
+ */
+int btrfs_set_inode_index(struct inode *dir, u64 *index)
+{
+	int ret = 0;
+
+	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
+		ret = btrfs_inode_delayed_dir_index_count(dir);
+		if (ret) {
+			ret = btrfs_set_inode_index_count(dir);
+			if (ret)
+				return ret;
+		}
+	}
+
+	*index = BTRFS_I(dir)->index_cnt;
+	BTRFS_I(dir)->index_cnt++;
+
+	return ret;
+}
+
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+	struct btrfs_iget_args args;
+	args.location = &BTRFS_I(inode)->location;
+	args.root = BTRFS_I(inode)->root;
+
+	return insert_inode_locked4(inode,
+		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+		   btrfs_find_actor, &args);
+}
+
+static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct inode *dir,
+				     const char *name, int name_len,
+				     u64 ref_objectid, u64 objectid,
+				     umode_t mode, u64 *index)
+{
+	struct inode *inode;
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_key *location;
+	struct btrfs_path *path;
+	struct btrfs_inode_ref *ref;
+	struct btrfs_key key[2];
+	u32 sizes[2];
+	int nitems = name ? 2 : 1;
+	unsigned long ptr;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+
+	inode = new_inode(root->fs_info->sb);
+	if (!inode) {
+		btrfs_free_path(path);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * O_TMPFILE, set link count to 0, so that after this point,
+	 * we fill in an inode item with the correct link count.
+	 */
+	if (!name)
+		set_nlink(inode, 0);
+
+	/*
+	 * we have to initialize this early, so we can reclaim the inode
+	 * number if we fail afterwards in this function.
+	 */
+	inode->i_ino = objectid;
+
+	if (dir && name) {
+		trace_btrfs_inode_request(dir);
+
+		ret = btrfs_set_inode_index(dir, index);
+		if (ret) {
+			btrfs_free_path(path);
+			iput(inode);
+			return ERR_PTR(ret);
+		}
+	} else if (dir) {
+		*index = 0;
+	}
+	/*
+	 * index_cnt is ignored for everything but a dir,
+	 * btrfs_get_inode_index_count has an explanation for the magic
+	 * number
+	 */
+	BTRFS_I(inode)->index_cnt = 2;
+	BTRFS_I(inode)->dir_index = *index;
+	BTRFS_I(inode)->root = root;
+	BTRFS_I(inode)->generation = trans->transid;
+	inode->i_generation = BTRFS_I(inode)->generation;
+
+	/*
+	 * We could have gotten an inode number from somebody who was fsynced
+	 * and then removed in this same transaction, so let's just set full
+	 * sync since it will be a full sync anyway and this will blow away the
+	 * old info in the log.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
+	key[0].objectid = objectid;
+	key[0].type = BTRFS_INODE_ITEM_KEY;
+	key[0].offset = 0;
+
+	sizes[0] = sizeof(struct btrfs_inode_item);
+
+	if (name) {
+		/*
+		 * Start new inodes with an inode_ref. This is slightly more
+		 * efficient for small numbers of hard links since they will
+		 * be packed into one item. Extended refs will kick in if we
+		 * add more hard links than can fit in the ref item.
+		 */
+		key[1].objectid = objectid;
+		key[1].type = BTRFS_INODE_REF_KEY;
+		key[1].offset = ref_objectid;
+
+		sizes[1] = name_len + sizeof(*ref);
+	}
+
+	location = &BTRFS_I(inode)->location;
+	location->objectid = objectid;
+	location->offset = 0;
+	location->type = BTRFS_INODE_ITEM_KEY;
+
+	ret = btrfs_insert_inode_locked(inode);
+	if (ret < 0)
+		goto fail;
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+	if (ret != 0)
+		goto fail_unlock;
+
+	inode_init_owner(inode, dir, mode);
+	inode_set_bytes(inode, 0);
+
+	inode->i_mtime = CURRENT_TIME;
+	inode->i_atime = inode->i_mtime;
+	inode->i_ctime = inode->i_mtime;
+	BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				  struct btrfs_inode_item);
+	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
+			     sizeof(*inode_item));
+	fill_inode_item(trans, path->nodes[0], inode_item, inode);
+
+	if (name) {
+		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+				     struct btrfs_inode_ref);
+		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+		ptr = (unsigned long)(ref + 1);
+		write_extent_buffer(path->nodes[0], name, ptr, name_len);
+	}
+
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_free_path(path);
+
+	btrfs_inherit_iflags(inode, dir);
+
+	if (S_ISREG(mode)) {
+		if (btrfs_test_opt(root, NODATASUM))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+		if (btrfs_test_opt(root, NODATACOW))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+				BTRFS_INODE_NODATASUM;
+	}
+
+	inode_tree_add(inode);
+
+	trace_btrfs_inode_new(inode);
+	btrfs_set_inode_last_trans(trans, inode);
+
+	btrfs_update_root_times(trans, root);
+
+	ret = btrfs_inode_inherit_props(trans, inode, dir);
+	if (ret)
+		btrfs_err(root->fs_info,
+			  "error inheriting props for ino %llu (root %llu): %d",
+			  btrfs_ino(inode), root->root_key.objectid, ret);
+
+	return inode;
+
+fail_unlock:
+	unlock_new_inode(inode);
+fail:
+	if (dir && name)
+		BTRFS_I(dir)->index_cnt--;
+	btrfs_free_path(path);
+	iput(inode);
+	return ERR_PTR(ret);
+}
+
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+}
+
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a give name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ */
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+		   struct inode *parent_inode, struct inode *inode,
+		   const char *name, int name_len, int add_backref, u64 index)
+{
+	int ret = 0;
+	struct btrfs_key key;
+	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+	u64 ino = btrfs_ino(inode);
+	u64 parent_ino = btrfs_ino(parent_inode);
+
+	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+	} else {
+		key.objectid = ino;
+		key.type = BTRFS_INODE_ITEM_KEY;
+		key.offset = 0;
+	}
+
+	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+					 key.objectid, root->root_key.objectid,
+					 parent_ino, index, name, name_len);
+	} else if (add_backref) {
+		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
+					     parent_ino, index);
+	}
+
+	/* Nothing to clean up yet */
+	if (ret)
+		return ret;
+
+	ret = btrfs_insert_dir_item(trans, root, name, name_len,
+				    parent_inode, &key,
+				    btrfs_inode_type(inode), index);
+	if (ret == -EEXIST || ret == -EOVERFLOW)
+		goto fail_dir_item;
+	else if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+
+	btrfs_i_size_write(parent_inode, parent_inode->i_size +
+			   name_len * 2);
+	inode_inc_iversion(parent_inode);
+	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, root, parent_inode);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	return ret;
+
+fail_dir_item:
+	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		u64 local_index;
+		int err;
+		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+				 key.objectid, root->root_key.objectid,
+				 parent_ino, &local_index, name, name_len);
+
+	} else if (add_backref) {
+		u64 local_index;
+		int err;
+
+		err = btrfs_del_inode_ref(trans, root, name, name_len,
+					  ino, parent_ino, &local_index);
+	}
+	return ret;
+}
+
+static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
+			    struct inode *dir, struct dentry *dentry,
+			    struct inode *inode, int backref, u64 index)
+{
+	int err = btrfs_add_link(trans, dir, inode,
+				 dentry->d_name.name, dentry->d_name.len,
+				 backref, index);
+	if (err > 0)
+		err = -EEXIST;
+	return err;
+}
+
+static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
+			umode_t mode, dev_t rdev)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode = NULL;
+	int err;
+	int drop_inode = 0;
+	u64 objectid;
+	u64 index = 0;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	/*
+	 * 2 for inode item and ref
+	 * 2 for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	err = btrfs_find_free_ino(root, &objectid);
+	if (err)
+		goto out_unlock;
+
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+				dentry->d_name.len, btrfs_ino(dir), objectid,
+				mode, &index);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_unlock;
+	}
+
+	/*
+	* If the active LSM wants to access the inode during
+	* d_instantiate it needs these. Smack checks to see
+	* if the filesystem supports xattrs by looking at the
+	* ops vector.
+	*/
+	inode->i_op = &btrfs_special_inode_operations;
+	init_special_inode(inode, inode->i_mode, rdev);
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+	if (err)
+		goto out_unlock_inode;
+
+	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+	if (err) {
+		goto out_unlock_inode;
+	} else {
+		btrfs_update_inode(trans, root, inode);
+		unlock_new_inode(inode);
+		d_instantiate(dentry, inode);
+	}
+
+out_unlock:
+	btrfs_end_transaction(trans, root);
+	btrfs_balance_delayed_items(root);
+	btrfs_btree_balance_dirty(root);
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	return err;
+
+out_unlock_inode:
+	drop_inode = 1;
+	unlock_new_inode(inode);
+	goto out_unlock;
+
+}
+
+static int btrfs_create(struct inode *dir, struct dentry *dentry,
+			umode_t mode, bool excl)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode = NULL;
+	int drop_inode_on_err = 0;
+	int err;
+	u64 objectid;
+	u64 index = 0;
+
+	/*
+	 * 2 for inode item and ref
+	 * 2 for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	err = btrfs_find_free_ino(root, &objectid);
+	if (err)
+		goto out_unlock;
+
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+				dentry->d_name.len, btrfs_ino(dir), objectid,
+				mode, &index);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_unlock;
+	}
+	drop_inode_on_err = 1;
+	/*
+	* If the active LSM wants to access the inode during
+	* d_instantiate it needs these. Smack checks to see
+	* if the filesystem supports xattrs by looking at the
+	* ops vector.
+	*/
+	inode->i_fop = &btrfs_file_operations;
+	inode->i_op = &btrfs_file_inode_operations;
+	inode->i_mapping->a_ops = &btrfs_aops;
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+	if (err)
+		goto out_unlock_inode;
+
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		goto out_unlock_inode;
+
+	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+	if (err)
+		goto out_unlock_inode;
+
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+
+out_unlock:
+	btrfs_end_transaction(trans, root);
+	if (err && drop_inode_on_err) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_balance_delayed_items(root);
+	btrfs_btree_balance_dirty(root);
+	return err;
+
+out_unlock_inode:
+	unlock_new_inode(inode);
+	goto out_unlock;
+
+}
+
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode = d_inode(old_dentry);
+	u64 index;
+	int err;
+	int drop_inode = 0;
+
+	/* do not allow sys_link's with other subvols of the same device */
+	if (root->objectid != BTRFS_I(inode)->root->objectid)
+		return -EXDEV;
+
+	if (inode->i_nlink >= BTRFS_LINK_MAX)
+		return -EMLINK;
+
+	err = btrfs_set_inode_index(dir, &index);
+	if (err)
+		goto fail;
+
+	/*
+	 * 2 items for inode and inode ref
+	 * 2 items for dir items
+	 * 1 item for parent inode
+	 */
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto fail;
+	}
+
+	/* There are several dir indexes for this inode, clear the cache. */
+	BTRFS_I(inode)->dir_index = 0ULL;
+	inc_nlink(inode);
+	inode_inc_iversion(inode);
+	inode->i_ctime = CURRENT_TIME;
+	ihold(inode);
+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
+
+	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
+
+	if (err) {
+		drop_inode = 1;
+	} else {
+		struct dentry *parent = dentry->d_parent;
+		err = btrfs_update_inode(trans, root, inode);
+		if (err)
+			goto fail;
+		if (inode->i_nlink == 1) {
+			/*
+			 * If new hard link count is 1, it's a file created
+			 * with open(2) O_TMPFILE flag.
+			 */
+			err = btrfs_orphan_del(trans, inode);
+			if (err)
+				goto fail;
+		}
+		d_instantiate(dentry, inode);
+		btrfs_log_new_name(trans, inode, NULL, parent);
+	}
+
+	btrfs_end_transaction(trans, root);
+	btrfs_balance_delayed_items(root);
+fail:
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = NULL;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	int err = 0;
+	int drop_on_err = 0;
+	u64 objectid = 0;
+	u64 index = 0;
+
+	/*
+	 * 2 items for inode and ref
+	 * 2 items for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	err = btrfs_find_free_ino(root, &objectid);
+	if (err)
+		goto out_fail;
+
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+				dentry->d_name.len, btrfs_ino(dir), objectid,
+				S_IFDIR | mode, &index);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_fail;
+	}
+
+	drop_on_err = 1;
+	/* these must be set before we unlock the inode */
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+	if (err)
+		goto out_fail_inode;
+
+	btrfs_i_size_write(inode, 0);
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		goto out_fail_inode;
+
+	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
+			     dentry->d_name.len, 0, index);
+	if (err)
+		goto out_fail_inode;
+
+	d_instantiate(dentry, inode);
+	/*
+	 * mkdir is special.  We're unlocking after we call d_instantiate
+	 * to avoid a race with nfsd calling d_instantiate.
+	 */
+	unlock_new_inode(inode);
+	drop_on_err = 0;
+
+out_fail:
+	btrfs_end_transaction(trans, root);
+	if (drop_on_err) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_balance_delayed_items(root);
+	btrfs_btree_balance_dirty(root);
+	return err;
+
+out_fail_inode:
+	unlock_new_inode(inode);
+	goto out_fail;
+}
+
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+	struct rb_node *next;
+
+	next = rb_next(&em->rb_node);
+	if (!next)
+		return NULL;
+	return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+	struct rb_node *prev;
+
+	prev = rb_prev(&em->rb_node);
+	if (!prev)
+		return NULL;
+	return container_of(prev, struct extent_map, rb_node);
+}
+
+/* helper for btfs_get_extent.  Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the best fitted new extent into the tree.
+ */
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+				struct extent_map *existing,
+				struct extent_map *em,
+				u64 map_start)
+{
+	struct extent_map *prev;
+	struct extent_map *next;
+	u64 start;
+	u64 end;
+	u64 start_diff;
+
+	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+
+	if (existing->start > map_start) {
+		next = existing;
+		prev = prev_extent_map(next);
+	} else {
+		prev = existing;
+		next = next_extent_map(prev);
+	}
+
+	start = prev ? extent_map_end(prev) : em->start;
+	start = max_t(u64, start, em->start);
+	end = next ? next->start : extent_map_end(em);
+	end = min_t(u64, end, extent_map_end(em));
+	start_diff = start - em->start;
+	em->start = start;
+	em->len = end - start;
+	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+		em->block_start += start_diff;
+		em->block_len -= start_diff;
+	}
+	return add_extent_mapping(em_tree, em, 0);
+}
+
+static noinline int uncompress_inline(struct btrfs_path *path,
+				      struct inode *inode, struct page *page,
+				      size_t pg_offset, u64 extent_offset,
+				      struct btrfs_file_extent_item *item)
+{
+	int ret;
+	struct extent_buffer *leaf = path->nodes[0];
+	char *tmp;
+	size_t max_size;
+	unsigned long inline_size;
+	unsigned long ptr;
+	int compress_type;
+
+	WARN_ON(pg_offset != 0);
+	compress_type = btrfs_file_extent_compression(leaf, item);
+	max_size = btrfs_file_extent_ram_bytes(leaf, item);
+	inline_size = btrfs_file_extent_inline_item_len(leaf,
+					btrfs_item_nr(path->slots[0]));
+	tmp = kmalloc(inline_size, GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	ptr = btrfs_file_extent_inline_start(item);
+
+	read_extent_buffer(leaf, tmp, ptr, inline_size);
+
+	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+	ret = btrfs_decompress(compress_type, tmp, page,
+			       extent_offset, inline_size, max_size);
+	kfree(tmp);
+	return ret;
+}
+
+/*
+ * a bit scary, this does extent mapping from logical file offset to the disk.
+ * the ugly parts come from merging extents from the disk with the in-ram
+ * representation.  This gets more complex because of the data=ordered code,
+ * where the in-ram extents might be locked pending data=ordered completion.
+ *
+ * This also copies inline extents directly into the page.
+ */
+
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+				    size_t pg_offset, u64 start, u64 len,
+				    int create)
+{
+	int ret;
+	int err = 0;
+	u64 extent_start = 0;
+	u64 extent_end = 0;
+	u64 objectid = btrfs_ino(inode);
+	u32 found_type;
+	struct btrfs_path *path = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *item;
+	struct extent_buffer *leaf;
+	struct btrfs_key found_key;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_trans_handle *trans = NULL;
+	const bool new_inline = !page || create;
+
+again:
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	if (em)
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+	read_unlock(&em_tree->lock);
+
+	if (em) {
+		if (em->start > start || em->start + em->len <= start)
+			free_extent_map(em);
+		else if (em->block_start == EXTENT_MAP_INLINE && page)
+			free_extent_map(em);
+		else
+			goto out;
+	}
+	em = alloc_extent_map();
+	if (!em) {
+		err = -ENOMEM;
+		goto out;
+	}
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	em->start = EXTENT_MAP_HOLE;
+	em->orig_start = EXTENT_MAP_HOLE;
+	em->len = (u64)-1;
+	em->block_len = (u64)-1;
+
+	if (!path) {
+		path = btrfs_alloc_path();
+		if (!path) {
+			err = -ENOMEM;
+			goto out;
+		}
+		/*
+		 * Chances are we'll be called again, so go ahead and do
+		 * readahead
+		 */
+		path->reada = 1;
+	}
+
+	ret = btrfs_lookup_file_extent(trans, root, path,
+				       objectid, start, trans != NULL);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	if (ret != 0) {
+		if (path->slots[0] == 0)
+			goto not_found;
+		path->slots[0]--;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
+			      struct btrfs_file_extent_item);
+	/* are we inside the extent that was found? */
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	found_type = found_key.type;
+	if (found_key.objectid != objectid ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		/*
+		 * If we backup past the first extent we want to move forward
+		 * and see if there is an extent in front of us, otherwise we'll
+		 * say there is a hole for our whole search range which can
+		 * cause problems.
+		 */
+		extent_end = start;
+		goto next;
+	}
+
+	found_type = btrfs_file_extent_type(leaf, item);
+	extent_start = found_key.offset;
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		extent_end = extent_start +
+		       btrfs_file_extent_num_bytes(leaf, item);
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		size_t size;
+		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
+		extent_end = ALIGN(extent_start + size, root->sectorsize);
+	}
+next:
+	if (start >= extent_end) {
+		path->slots[0]++;
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				err = ret;
+				goto out;
+			}
+			if (ret > 0)
+				goto not_found;
+			leaf = path->nodes[0];
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid != objectid ||
+		    found_key.type != BTRFS_EXTENT_DATA_KEY)
+			goto not_found;
+		if (start + len <= found_key.offset)
+			goto not_found;
+		if (start > found_key.offset)
+			goto next;
+		em->start = start;
+		em->orig_start = start;
+		em->len = found_key.offset - start;
+		goto not_found_em;
+	}
+
+	btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
+
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		goto insert;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		unsigned long ptr;
+		char *map;
+		size_t size;
+		size_t extent_offset;
+		size_t copy_size;
+
+		if (new_inline)
+			goto out;
+
+		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
+		extent_offset = page_offset(page) + pg_offset - extent_start;
+		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
+				size - extent_offset);
+		em->start = extent_start + extent_offset;
+		em->len = ALIGN(copy_size, root->sectorsize);
+		em->orig_block_len = em->len;
+		em->orig_start = em->start;
+		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+		if (create == 0 && !PageUptodate(page)) {
+			if (btrfs_file_extent_compression(leaf, item) !=
+			    BTRFS_COMPRESS_NONE) {
+				ret = uncompress_inline(path, inode, page,
+							pg_offset,
+							extent_offset, item);
+				if (ret) {
+					err = ret;
+					goto out;
+				}
+			} else {
+				map = kmap(page);
+				read_extent_buffer(leaf, map + pg_offset, ptr,
+						   copy_size);
+				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+					memset(map + pg_offset + copy_size, 0,
+					       PAGE_CACHE_SIZE - pg_offset -
+					       copy_size);
+				}
+				kunmap(page);
+			}
+			flush_dcache_page(page);
+		} else if (create && PageUptodate(page)) {
+			BUG();
+			if (!trans) {
+				kunmap(page);
+				free_extent_map(em);
+				em = NULL;
+
+				btrfs_release_path(path);
+				trans = btrfs_join_transaction(root);
+
+				if (IS_ERR(trans))
+					return ERR_CAST(trans);
+				goto again;
+			}
+			map = kmap(page);
+			write_extent_buffer(leaf, map + pg_offset, ptr,
+					    copy_size);
+			kunmap(page);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+		set_extent_uptodate(io_tree, em->start,
+				    extent_map_end(em) - 1, NULL, GFP_NOFS);
+		goto insert;
+	}
+not_found:
+	em->start = start;
+	em->orig_start = start;
+	em->len = len;
+not_found_em:
+	em->block_start = EXTENT_MAP_HOLE;
+	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
+insert:
+	btrfs_release_path(path);
+	if (em->start > start || extent_map_end(em) <= start) {
+		btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
+			em->start, em->len, start, len);
+		err = -EIO;
+		goto out;
+	}
+
+	err = 0;
+	write_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em, 0);
+	/* it is possible that someone inserted the extent into the tree
+	 * while we had the lock dropped.  It is also possible that
+	 * an overlapping map exists in the tree
+	 */
+	if (ret == -EEXIST) {
+		struct extent_map *existing;
+
+		ret = 0;
+
+		existing = search_extent_mapping(em_tree, start, len);
+		/*
+		 * existing will always be non-NULL, since there must be
+		 * extent causing the -EEXIST.
+		 */
+		if (start >= extent_map_end(existing) ||
+		    start <= existing->start) {
+			/*
+			 * The existing extent map is the one nearest to
+			 * the [start, start + len) range which overlaps
+			 */
+			err = merge_extent_mapping(em_tree, existing,
+						   em, start);
+			free_extent_map(existing);
+			if (err) {
+				free_extent_map(em);
+				em = NULL;
+			}
+		} else {
+			free_extent_map(em);
+			em = existing;
+			err = 0;
+		}
+	}
+	write_unlock(&em_tree->lock);
+out:
+
+	trace_btrfs_get_extent(root, em);
+
+	if (path)
+		btrfs_free_path(path);
+	if (trans) {
+		ret = btrfs_end_transaction(trans, root);
+		if (!err)
+			err = ret;
+	}
+	if (err) {
+		free_extent_map(em);
+		return ERR_PTR(err);
+	}
+	BUG_ON(!em); /* Error is always set */
+	return em;
+}
+
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+					   size_t pg_offset, u64 start, u64 len,
+					   int create)
+{
+	struct extent_map *em;
+	struct extent_map *hole_em = NULL;
+	u64 range_start = start;
+	u64 end;
+	u64 found;
+	u64 found_end;
+	int err = 0;
+
+	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+	if (IS_ERR(em))
+		return em;
+	if (em) {
+		/*
+		 * if our em maps to
+		 * -  a hole or
+		 * -  a pre-alloc extent,
+		 * there might actually be delalloc bytes behind it.
+		 */
+		if (em->block_start != EXTENT_MAP_HOLE &&
+		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			return em;
+		else
+			hole_em = em;
+	}
+
+	/* check to see if we've wrapped (len == -1 or similar) */
+	end = start + len;
+	if (end < start)
+		end = (u64)-1;
+	else
+		end -= 1;
+
+	em = NULL;
+
+	/* ok, we didn't find anything, lets look for delalloc */
+	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
+				 end, len, EXTENT_DELALLOC, 1);
+	found_end = range_start + found;
+	if (found_end < range_start)
+		found_end = (u64)-1;
+
+	/*
+	 * we didn't find anything useful, return
+	 * the original results from get_extent()
+	 */
+	if (range_start > end || found_end <= start) {
+		em = hole_em;
+		hole_em = NULL;
+		goto out;
+	}
+
+	/* adjust the range_start to make sure it doesn't
+	 * go backwards from the start they passed in
+	 */
+	range_start = max(start, range_start);
+	found = found_end - range_start;
+
+	if (found > 0) {
+		u64 hole_start = start;
+		u64 hole_len = len;
+
+		em = alloc_extent_map();
+		if (!em) {
+			err = -ENOMEM;
+			goto out;
+		}
+		/*
+		 * when btrfs_get_extent can't find anything it
+		 * returns one huge hole
+		 *
+		 * make sure what it found really fits our range, and
+		 * adjust to make sure it is based on the start from
+		 * the caller
+		 */
+		if (hole_em) {
+			u64 calc_end = extent_map_end(hole_em);
+
+			if (calc_end <= start || (hole_em->start > end)) {
+				free_extent_map(hole_em);
+				hole_em = NULL;
+			} else {
+				hole_start = max(hole_em->start, start);
+				hole_len = calc_end - hole_start;
+			}
+		}
+		em->bdev = NULL;
+		if (hole_em && range_start > hole_start) {
+			/* our hole starts before our delalloc, so we
+			 * have to return just the parts of the hole
+			 * that go until  the delalloc starts
+			 */
+			em->len = min(hole_len,
+				      range_start - hole_start);
+			em->start = hole_start;
+			em->orig_start = hole_start;
+			/*
+			 * don't adjust block start at all,
+			 * it is fixed at EXTENT_MAP_HOLE
+			 */
+			em->block_start = hole_em->block_start;
+			em->block_len = hole_len;
+			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		} else {
+			em->start = range_start;
+			em->len = found;
+			em->orig_start = range_start;
+			em->block_start = EXTENT_MAP_DELALLOC;
+			em->block_len = found;
+		}
+	} else if (hole_em) {
+		return hole_em;
+	}
+out:
+
+	free_extent_map(hole_em);
+	if (err) {
+		free_extent_map(em);
+		return ERR_PTR(err);
+	}
+	return em;
+}
+
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+						  u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map *em;
+	struct btrfs_key ins;
+	u64 alloc_hint;
+	int ret;
+
+	alloc_hint = get_extent_allocation_hint(inode, start, len);
+	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+				   alloc_hint, &ins, 1, 1);
+	if (ret)
+		return ERR_PTR(ret);
+
+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+			      ins.offset, ins.offset, ins.offset, 0);
+	if (IS_ERR(em)) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+		return em;
+	}
+
+	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+					   ins.offset, ins.offset, 0);
+	if (ret) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+		free_extent_map(em);
+		return ERR_PTR(ret);
+	}
+
+	return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	int ret;
+	struct extent_buffer *leaf;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 disk_bytenr;
+	u64 backref_offset;
+	u64 extent_end;
+	u64 num_bytes;
+	int slot;
+	int found_type;
+	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+				       offset, 0);
+	if (ret < 0)
+		goto out;
+
+	slot = path->slots[0];
+	if (ret == 1) {
+		if (slot == 0) {
+			/* can't find the item, must cow */
+			ret = 0;
+			goto out;
+		}
+		slot--;
+	}
+	ret = 0;
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != btrfs_ino(inode) ||
+	    key.type != BTRFS_EXTENT_DATA_KEY) {
+		/* not our file or wrong item type, must cow */
+		goto out;
+	}
+
+	if (key.offset > offset) {
+		/* Wrong offset, must cow */
+		goto out;
+	}
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(leaf, fi);
+	if (found_type != BTRFS_FILE_EXTENT_REG &&
+	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+		/* not a regular extent, must cow */
+		goto out;
+	}
+
+	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+		goto out;
+
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if (extent_end <= offset)
+		goto out;
+
+	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	if (disk_bytenr == 0)
+		goto out;
+
+	if (btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		goto out;
+
+	backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+	if (orig_start) {
+		*orig_start = key.offset - backref_offset;
+		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+	}
+
+	if (btrfs_extent_readonly(root, disk_bytenr))
+		goto out;
+
+	num_bytes = min(offset + *len, extent_end) - offset;
+	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		u64 range_end;
+
+		range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+		ret = test_range_bit(io_tree, offset, range_end,
+				     EXTENT_DELALLOC, 0, NULL);
+		if (ret) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
+	btrfs_release_path(path);
+
+	/*
+	 * look for other files referencing this extent, if we
+	 * find any we must cow
+	 */
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
+				    key.offset - backref_offset, disk_bytenr);
+	btrfs_end_transaction(trans, root);
+	if (ret) {
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * adjust disk_bytenr and num_bytes to cover just the bytes
+	 * in this extent we are about to write.  If there
+	 * are any csums in that range we have to cow in order
+	 * to keep the csums correct
+	 */
+	disk_bytenr += backref_offset;
+	disk_bytenr += offset - key.offset;
+	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+				goto out;
+	/*
+	 * all of the above have passed, it is safe to overwrite this extent
+	 * without cow
+	 */
+	*len = num_bytes;
+	ret = 1;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
+{
+	struct radix_tree_root *root = &inode->i_mapping->page_tree;
+	int found = false;
+	void **pagep = NULL;
+	struct page *page = NULL;
+	int start_idx;
+	int end_idx;
+
+	start_idx = start >> PAGE_CACHE_SHIFT;
+
+	/*
+	 * end is the last byte in the last page.  end == start is legal
+	 */
+	end_idx = end >> PAGE_CACHE_SHIFT;
+
+	rcu_read_lock();
+
+	/* Most of the code in this while loop is lifted from
+	 * find_get_page.  It's been modified to begin searching from a
+	 * page and return just the first page found in that range.  If the
+	 * found idx is less than or equal to the end idx then we know that
+	 * a page exists.  If no pages are found or if those pages are
+	 * outside of the range then we're fine (yay!) */
+	while (page == NULL &&
+	       radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page))
+			break;
+
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				page = NULL;
+				continue;
+			}
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so return it without
+			 * attempting to raise page count.
+			 */
+			page = NULL;
+			break; /* TODO: Is this relevant for this use case? */
+		}
+
+		if (!page_cache_get_speculative(page)) {
+			page = NULL;
+			continue;
+		}
+
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			page_cache_release(page);
+			page = NULL;
+		}
+	}
+
+	if (page) {
+		if (page->index <= end_idx)
+			found = true;
+		page_cache_release(page);
+	}
+
+	rcu_read_unlock();
+	return found;
+}
+
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+			      struct extent_state **cached_state, int writing)
+{
+	struct btrfs_ordered_extent *ordered;
+	int ret = 0;
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, cached_state);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure theres no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+
+		/*
+		 * We need to make sure there are no buffered pages in this
+		 * range either, we could have raced between the invalidate in
+		 * generic_file_direct_write and locking the extent.  The
+		 * invalidate needs to happen so that reads after a write do not
+		 * get stale data.
+		 */
+		if (!ordered &&
+		    (!writing ||
+		     !btrfs_page_exists_in_range(inode, lockstart, lockend)))
+			break;
+
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     cached_state, GFP_NOFS);
+
+		if (ordered) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+		} else {
+			/* Screw you mmap */
+			ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
+			if (ret)
+				break;
+			ret = filemap_fdatawait_range(inode->i_mapping,
+						      lockstart,
+						      lockend);
+			if (ret)
+				break;
+
+			/*
+			 * If we found a page that couldn't be invalidated just
+			 * fall back to buffered.
+			 */
+			ret = invalidate_inode_pages2_range(inode->i_mapping,
+					lockstart >> PAGE_CACHE_SHIFT,
+					lockend >> PAGE_CACHE_SHIFT);
+			if (ret)
+				break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   u64 orig_block_len, u64 ram_bytes,
+					   int type)
+{
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	em = alloc_extent_map();
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	em->start = start;
+	em->orig_start = orig_start;
+	em->mod_start = start;
+	em->mod_len = len;
+	em->len = len;
+	em->block_len = block_len;
+	em->block_start = block_start;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	em->orig_block_len = orig_block_len;
+	em->ram_bytes = ram_bytes;
+	em->generation = -1;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	if (type == BTRFS_ORDERED_PREALLOC)
+		set_bit(EXTENT_FLAG_FILLING, &em->flags);
+
+	do {
+		btrfs_drop_extent_cache(inode, em->start,
+				em->start + em->len - 1, 0);
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em, 1);
+		write_unlock(&em_tree->lock);
+	} while (ret == -EEXIST);
+
+	if (ret) {
+		free_extent_map(em);
+		return ERR_PTR(ret);
+	}
+
+	return em;
+}
+
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
+	u64 start = iblock << inode->i_blkbits;
+	u64 lockstart, lockend;
+	u64 len = bh_result->b_size;
+	u64 *outstanding_extents = NULL;
+	int unlock_bits = EXTENT_LOCKED;
+	int ret = 0;
+
+	if (create)
+		unlock_bits |= EXTENT_DIRTY;
+	else
+		len = min_t(u64, len, root->sectorsize);
+
+	lockstart = start;
+	lockend = start + len - 1;
+
+	if (current->journal_info) {
+		/*
+		 * Need to pull our outstanding extents and set journal_info to NULL so
+		 * that anything that needs to check if there's a transction doesn't get
+		 * confused.
+		 */
+		outstanding_extents = current->journal_info;
+		current->journal_info = NULL;
+	}
+
+	/*
+	 * If this errors out it's because we couldn't invalidate pagecache for
+	 * this range and we need to fallback to buffered.
+	 */
+	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+		return -ENOTBLK;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+	 * io.  INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety lets just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fallback to buffered.
+	 *
+	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
+	 * to buffered IO.  Don't blame me, this is the price we pay for using
+	 * the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		ret = -ENOTBLK;
+		goto unlock_err;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+		free_extent_map(em);
+		goto unlock_err;
+	}
+
+	/*
+	 * We don't allocate a new extent in the following cases
+	 *
+	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+	 * existing extent.
+	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
+	 * just use the extent.
+	 *
+	 */
+	if (!create) {
+		len = min(len, em->len - (start - em->start));
+		lockstart = start + len;
+		goto unlock;
+	}
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	     em->block_start != EXTENT_MAP_HOLE)) {
+		int type;
+		u64 block_start, orig_start, orig_block_len, ram_bytes;
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			type = BTRFS_ORDERED_PREALLOC;
+		else
+			type = BTRFS_ORDERED_NOCOW;
+		len = min(len, em->len - (start - em->start));
+		block_start = em->block_start + (start - em->start);
+
+		if (can_nocow_extent(inode, start, &len, &orig_start,
+				     &orig_block_len, &ram_bytes) == 1) {
+			if (type == BTRFS_ORDERED_PREALLOC) {
+				free_extent_map(em);
+				em = create_pinned_em(inode, start, len,
+						       orig_start,
+						       block_start, len,
+						       orig_block_len,
+						       ram_bytes, type);
+				if (IS_ERR(em)) {
+					ret = PTR_ERR(em);
+					goto unlock_err;
+				}
+			}
+
+			ret = btrfs_add_ordered_extent_dio(inode, start,
+					   block_start, len, len, type);
+			if (ret) {
+				free_extent_map(em);
+				goto unlock_err;
+			}
+			goto unlock;
+		}
+	}
+
+	/*
+	 * this will cow the extent, reset the len in case we changed
+	 * it above
+	 */
+	len = bh_result->b_size;
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
+	len = min(len, em->len - (start - em->start));
+unlock:
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = len;
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	if (create) {
+		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			set_buffer_new(bh_result);
+
+		/*
+		 * Need to update the i_size under the extent lock so buffered
+		 * readers will get the updated i_size when we unlock.
+		 */
+		if (start + len > i_size_read(inode))
+			i_size_write(inode, start + len);
+
+		/*
+		 * If we have an outstanding_extents count still set then we're
+		 * within our reservation, otherwise we need to adjust our inode
+		 * counter appropriately.
+		 */
+		if (*outstanding_extents) {
+			(*outstanding_extents)--;
+		} else {
+			spin_lock(&BTRFS_I(inode)->lock);
+			BTRFS_I(inode)->outstanding_extents++;
+			spin_unlock(&BTRFS_I(inode)->lock);
+		}
+
+		current->journal_info = outstanding_extents;
+		btrfs_free_reserved_data_space(inode, len);
+	}
+
+	/*
+	 * In the case of write we need to clear and unlock the entire range,
+	 * in the case of read we need to unlock only the end area that we
+	 * aren't using if there is any left over space.
+	 */
+	if (lockstart < lockend) {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				 lockend, unlock_bits, 1, 0,
+				 &cached_state, GFP_NOFS);
+	} else {
+		free_extent_state(cached_state);
+	}
+
+	free_extent_map(em);
+
+	return 0;
+
+unlock_err:
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+	if (outstanding_extents)
+		current->journal_info = outstanding_extents;
+	return ret;
+}
+
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+					int rw, int mirror_num)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	BUG_ON(rw & REQ_WRITE);
+
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+				  BTRFS_WQ_ENDIO_DIO_REPAIR);
+	if (ret)
+		goto err;
+
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+	bio_put(bio);
+	return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+				      struct bio *failed_bio,
+				      struct io_failure_record *failrec,
+				      int failed_mirror)
+{
+	int num_copies;
+
+	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+				      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
+		return 0;
+	}
+
+	failrec->failed_mirror = failed_mirror;
+	failrec->this_mirror++;
+	if (failrec->this_mirror == failed_mirror)
+		failrec->this_mirror++;
+
+	if (failrec->this_mirror > num_copies) {
+		pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
+		return 0;
+	}
+
+	return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+			  struct page *page, u64 start, u64 end,
+			  int failed_mirror, bio_end_io_t *repair_endio,
+			  void *repair_arg)
+{
+	struct io_failure_record *failrec;
+	struct bio *bio;
+	int isector;
+	int read_mode;
+	int ret;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+	if (ret)
+		return ret;
+
+	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+					 failed_mirror);
+	if (!ret) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	if (failed_bio->bi_vcnt > 1)
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	else
+		read_mode = READ_SYNC;
+
+	isector = start - btrfs_io_bio(failed_bio)->logical;
+	isector >>= inode->i_sb->s_blocksize_bits;
+	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+				      0, isector, repair_endio, repair_arg);
+	if (!bio) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	btrfs_debug(BTRFS_I(inode)->root->fs_info,
+		    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+		    read_mode, failrec->this_mirror, failrec->in_validation);
+
+	ret = submit_dio_repair_bio(inode, bio, read_mode,
+				    failrec->this_mirror);
+	if (ret) {
+		free_io_failure(inode, failrec);
+		bio_put(bio);
+	}
+
+	return ret;
+}
+
+struct btrfs_retry_complete {
+	struct completion done;
+	struct inode *inode;
+	u64 start;
+	int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+	struct btrfs_retry_complete *done = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	if (err)
+		goto end;
+
+	done->uptodate = 1;
+	bio_for_each_segment_all(bvec, bio, i)
+		clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+	complete(&done->done);
+	bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+				       struct btrfs_io_bio *io_bio)
+{
+	struct bio_vec *bvec;
+	struct btrfs_retry_complete done;
+	u64 start;
+	int i;
+	int ret;
+
+	start = io_bio->logical;
+	done.inode = inode;
+
+	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+		done.uptodate = 0;
+		done.start = start;
+		init_completion(&done.done);
+
+		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+				     start + bvec->bv_len - 1,
+				     io_bio->mirror_num,
+				     btrfs_retry_endio_nocsum, &done);
+		if (ret)
+			return ret;
+
+		wait_for_completion(&done.done);
+
+		if (!done.uptodate) {
+			/* We might have another mirror, so try again */
+			goto try_again;
+		}
+
+		start += bvec->bv_len;
+	}
+
+	return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+	struct btrfs_retry_complete *done = bio->bi_private;
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct bio_vec *bvec;
+	int uptodate;
+	int ret;
+	int i;
+
+	if (err)
+		goto end;
+
+	uptodate = 1;
+	bio_for_each_segment_all(bvec, bio, i) {
+		ret = __readpage_endio_check(done->inode, io_bio, i,
+					     bvec->bv_page, 0,
+					     done->start, bvec->bv_len);
+		if (!ret)
+			clean_io_failure(done->inode, done->start,
+					 bvec->bv_page, 0);
+		else
+			uptodate = 0;
+	}
+
+	done->uptodate = uptodate;
+end:
+	complete(&done->done);
+	bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+				    struct btrfs_io_bio *io_bio, int err)
+{
+	struct bio_vec *bvec;
+	struct btrfs_retry_complete done;
+	u64 start;
+	u64 offset = 0;
+	int i;
+	int ret;
+
+	err = 0;
+	start = io_bio->logical;
+	done.inode = inode;
+
+	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+					     0, start, bvec->bv_len);
+		if (likely(!ret))
+			goto next;
+try_again:
+		done.uptodate = 0;
+		done.start = start;
+		init_completion(&done.done);
+
+		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+				     start + bvec->bv_len - 1,
+				     io_bio->mirror_num,
+				     btrfs_retry_endio, &done);
+		if (ret) {
+			err = ret;
+			goto next;
+		}
+
+		wait_for_completion(&done.done);
+
+		if (!done.uptodate) {
+			/* We might have another mirror, so try again */
+			goto try_again;
+		}
+next:
+		offset += bvec->bv_len;
+		start += bvec->bv_len;
+	}
+
+	return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+				  struct btrfs_io_bio *io_bio, int err)
+{
+	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	if (skip_csum) {
+		if (unlikely(err))
+			return __btrfs_correct_data_nocsum(inode, io_bio);
+		else
+			return 0;
+	} else {
+		return __btrfs_subio_endio_read(inode, io_bio, err);
+	}
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct bio *dio_bio;
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+		err = btrfs_subio_endio_read(inode, io_bio, err);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+		      dip->logical_offset + dip->bytes - 1);
+	dio_bio = dip->dio_bio;
+
+	kfree(dip);
+
+	/* If we had a csum failure make sure to clear the uptodate flag */
+	if (err)
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+	dio_end_io(dio_bio, err);
+
+	if (io_bio->end_io)
+		io_bio->end_io(io_bio, err);
+	bio_put(bio);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ordered_extent *ordered = NULL;
+	u64 ordered_offset = dip->logical_offset;
+	u64 ordered_bytes = dip->bytes;
+	struct bio *dio_bio;
+	int ret;
+
+	if (err)
+		goto out_done;
+again:
+	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
+						   &ordered_offset,
+						   ordered_bytes, !err);
+	if (!ret)
+		goto out_test;
+
+	btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+			finish_ordered_fn, NULL, NULL);
+	btrfs_queue_work(root->fs_info->endio_write_workers,
+			 &ordered->work);
+out_test:
+	/*
+	 * our bio might span multiple ordered extents.  If we haven't
+	 * completed the accounting for the whole dio, go back and try again
+	 */
+	if (ordered_offset < dip->logical_offset + dip->bytes) {
+		ordered_bytes = dip->logical_offset + dip->bytes -
+			ordered_offset;
+		ordered = NULL;
+		goto again;
+	}
+out_done:
+	dio_bio = dip->dio_bio;
+
+	kfree(dip);
+
+	/* If we had an error make sure to clear the uptodate flag */
+	if (err)
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+	dio_end_io(dio_bio, err);
+	bio_put(bio);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags, u64 offset)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+	BUG_ON(ret); /* -ENOMEM */
+	return 0;
+}
+
+static void btrfs_end_dio_bio(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+
+	if (err)
+		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+			   "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+			   btrfs_ino(dip->inode), bio->bi_rw,
+			   (unsigned long long)bio->bi_iter.bi_sector,
+			   bio->bi_iter.bi_size, err);
+
+	if (dip->subio_endio)
+		err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
+	if (err) {
+		dip->errors = 1;
+
+		/*
+		 * before atomic variable goto zero, we must make sure
+		 * dip->errors is perceived to be set.
+		 */
+		smp_mb__before_atomic();
+	}
+
+	/* if there are more bios still pending for this dio, just exit */
+	if (!atomic_dec_and_test(&dip->pending_bios))
+		goto out;
+
+	if (dip->errors) {
+		bio_io_error(dip->orig_bio);
+	} else {
+		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
+		bio_endio(dip->orig_bio, 0);
+	}
+out:
+	bio_put(bio);
+}
+
+static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
+				       u64 first_sector, gfp_t gfp_flags)
+{
+	int nr_vecs = bio_get_nr_vecs(bdev);
+	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+}
+
+static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
+						 struct inode *inode,
+						 struct btrfs_dio_private *dip,
+						 struct bio *bio,
+						 u64 file_offset)
+{
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+	int ret;
+
+	/*
+	 * We load all the csum data we need when we submit
+	 * the first bio to reduce the csum tree search and
+	 * contention.
+	 */
+	if (dip->logical_offset == file_offset) {
+		ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+						file_offset);
+		if (ret)
+			return ret;
+	}
+
+	if (bio == dip->orig_bio)
+		return 0;
+
+	file_offset -= dip->logical_offset;
+	file_offset >>= inode->i_sb->s_blocksize_bits;
+	io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+
+	return 0;
+}
+
+static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+					 int rw, u64 file_offset, int skip_sum,
+					 int async_submit)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	int write = rw & REQ_WRITE;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	if (async_submit)
+		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
+	bio_get(bio);
+
+	if (!write) {
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+				BTRFS_WQ_ENDIO_DATA);
+		if (ret)
+			goto err;
+	}
+
+	if (skip_sum)
+		goto map;
+
+	if (write && async_submit) {
+		ret = btrfs_wq_submit_bio(root->fs_info,
+				   inode, rw, bio, 0, 0,
+				   file_offset,
+				   __btrfs_submit_bio_start_direct_io,
+				   __btrfs_submit_bio_done);
+		goto err;
+	} else if (write) {
+		/*
+		 * If we aren't doing async submit, calculate the csum of the
+		 * bio now.
+		 */
+		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
+		if (ret)
+			goto err;
+	} else {
+		ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+						     file_offset);
+		if (ret)
+			goto err;
+	}
+map:
+	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
+err:
+	bio_put(bio);
+	return ret;
+}
+
+static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
+				    int skip_sum)
+{
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct bio *bio;
+	struct bio *orig_bio = dip->orig_bio;
+	struct bio_vec *bvec = orig_bio->bi_io_vec;
+	u64 start_sector = orig_bio->bi_iter.bi_sector;
+	u64 file_offset = dip->logical_offset;
+	u64 submit_len = 0;
+	u64 map_length;
+	int nr_pages = 0;
+	int ret;
+	int async_submit = 0;
+
+	map_length = orig_bio->bi_iter.bi_size;
+	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
+			      &map_length, NULL, 0);
+	if (ret)
+		return -EIO;
+
+	if (map_length >= orig_bio->bi_iter.bi_size) {
+		bio = orig_bio;
+		dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
+		goto submit;
+	}
+
+	/* async crcs make it difficult to collect full stripe writes. */
+	if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
+		async_submit = 0;
+	else
+		async_submit = 1;
+
+	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_private = dip;
+	bio->bi_end_io = btrfs_end_dio_bio;
+	btrfs_io_bio(bio)->logical = file_offset;
+	atomic_inc(&dip->pending_bios);
+
+	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
+		if (map_length < submit_len + bvec->bv_len ||
+		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+				 bvec->bv_offset) < bvec->bv_len) {
+			/*
+			 * inc the count before we submit the bio so
+			 * we know the end IO handler won't happen before
+			 * we inc the count. Otherwise, the dip might get freed
+			 * before we're done setting it up
+			 */
+			atomic_inc(&dip->pending_bios);
+			ret = __btrfs_submit_dio_bio(bio, inode, rw,
+						     file_offset, skip_sum,
+						     async_submit);
+			if (ret) {
+				bio_put(bio);
+				atomic_dec(&dip->pending_bios);
+				goto out_err;
+			}
+
+			start_sector += submit_len >> 9;
+			file_offset += submit_len;
+
+			submit_len = 0;
+			nr_pages = 0;
+
+			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
+						  start_sector, GFP_NOFS);
+			if (!bio)
+				goto out_err;
+			bio->bi_private = dip;
+			bio->bi_end_io = btrfs_end_dio_bio;
+			btrfs_io_bio(bio)->logical = file_offset;
+
+			map_length = orig_bio->bi_iter.bi_size;
+			ret = btrfs_map_block(root->fs_info, rw,
+					      start_sector << 9,
+					      &map_length, NULL, 0);
+			if (ret) {
+				bio_put(bio);
+				goto out_err;
+			}
+		} else {
+			submit_len += bvec->bv_len;
+			nr_pages++;
+			bvec++;
+		}
+	}
+
+submit:
+	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
+				     async_submit);
+	if (!ret)
+		return 0;
+
+	bio_put(bio);
+out_err:
+	dip->errors = 1;
+	/*
+	 * before atomic variable goto zero, we must
+	 * make sure dip->errors is perceived to be set.
+	 */
+	smp_mb__before_atomic();
+	if (atomic_dec_and_test(&dip->pending_bios))
+		bio_io_error(dip->orig_bio);
+
+	/* bio_end_io() will handle error, so we needn't return it */
+	return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+				struct inode *inode, loff_t file_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_dio_private *dip;
+	struct bio *io_bio;
+	struct btrfs_io_bio *btrfs_bio;
+	int skip_sum;
+	int write = rw & REQ_WRITE;
+	int ret = 0;
+
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+	if (!io_bio) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+
+	dip = kzalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		ret = -ENOMEM;
+		goto free_io_bio;
+	}
+
+	dip->private = dio_bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = file_offset;
+	dip->bytes = dio_bio->bi_iter.bi_size;
+	dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
+	io_bio->bi_private = dip;
+	dip->orig_bio = io_bio;
+	dip->dio_bio = dio_bio;
+	atomic_set(&dip->pending_bios, 0);
+	btrfs_bio = btrfs_io_bio(io_bio);
+	btrfs_bio->logical = file_offset;
+
+	if (write) {
+		io_bio->bi_end_io = btrfs_endio_direct_write;
+	} else {
+		io_bio->bi_end_io = btrfs_endio_direct_read;
+		dip->subio_endio = btrfs_subio_endio_read;
+	}
+
+	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
+	if (!ret)
+		return;
+
+	if (btrfs_bio->end_io)
+		btrfs_bio->end_io(btrfs_bio, ret);
+free_io_bio:
+	bio_put(io_bio);
+
+free_ordered:
+	/*
+	 * If this is a write, we need to clean up the reserved space and kill
+	 * the ordered extent.
+	 */
+	if (write) {
+		struct btrfs_ordered_extent *ordered;
+		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+			btrfs_free_reserved_extent(root, ordered->start,
+						   ordered->disk_len, 1);
+		btrfs_put_ordered_extent(ordered);
+		btrfs_put_ordered_extent(ordered);
+	}
+	bio_endio(dio_bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
+			const struct iov_iter *iter, loff_t offset)
+{
+	int seg;
+	int i;
+	unsigned blocksize_mask = root->sectorsize - 1;
+	ssize_t retval = -EINVAL;
+
+	if (offset & blocksize_mask)
+		goto out;
+
+	if (iov_iter_alignment(iter) & blocksize_mask)
+		goto out;
+
+	/* If this is a write we don't need to check anymore */
+	if (iov_iter_rw(iter) == WRITE)
+		return 0;
+	/*
+	 * Check to make sure we don't have duplicate iov_base's in this
+	 * iovec, if so return EINVAL, otherwise we'll get csum errors
+	 * when reading back.
+	 */
+	for (seg = 0; seg < iter->nr_segs; seg++) {
+		for (i = seg + 1; i < iter->nr_segs; i++) {
+			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+				goto out;
+		}
+	}
+	retval = 0;
+out:
+	return retval;
+}
+
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			       loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	u64 outstanding_extents = 0;
+	size_t count = 0;
+	int flags = 0;
+	bool wakeup = true;
+	bool relock = false;
+	ssize_t ret;
+
+	if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
+		return 0;
+
+	inode_dio_begin(inode);
+	smp_mb__after_atomic();
+
+	/*
+	 * The generic stuff only does filemap_write_and_wait_range, which
+	 * isn't enough if we've written compressed pages to this area, so
+	 * we need to flush the dirty pages again to make absolutely sure
+	 * that any outstanding dirty pages are on disk.
+	 */
+	count = iov_iter_count(iter);
+	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+		     &BTRFS_I(inode)->runtime_flags))
+		filemap_fdatawrite_range(inode->i_mapping, offset,
+					 offset + count - 1);
+
+	if (iov_iter_rw(iter) == WRITE) {
+		/*
+		 * If the write DIO is beyond the EOF, we need update
+		 * the isize, but it is protected by i_mutex. So we can
+		 * not unlock the i_mutex at this case.
+		 */
+		if (offset + count <= inode->i_size) {
+			mutex_unlock(&inode->i_mutex);
+			relock = true;
+		}
+		ret = btrfs_delalloc_reserve_space(inode, count);
+		if (ret)
+			goto out;
+		outstanding_extents = div64_u64(count +
+						BTRFS_MAX_EXTENT_SIZE - 1,
+						BTRFS_MAX_EXTENT_SIZE);
+
+		/*
+		 * We need to know how many extents we reserved so that we can
+		 * do the accounting properly if we go over the number we
+		 * originally calculated.  Abuse current->journal_info for this.
+		 */
+		current->journal_info = &outstanding_extents;
+	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+				     &BTRFS_I(inode)->runtime_flags)) {
+		inode_dio_end(inode);
+		flags = DIO_LOCKING | DIO_SKIP_HOLES;
+		wakeup = false;
+	}
+
+	ret = __blockdev_direct_IO(iocb, inode,
+				   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+				   iter, offset, btrfs_get_blocks_direct, NULL,
+				   btrfs_submit_direct, flags);
+	if (iov_iter_rw(iter) == WRITE) {
+		current->journal_info = NULL;
+		if (ret < 0 && ret != -EIOCBQUEUED)
+			btrfs_delalloc_release_space(inode, count);
+		else if (ret >= 0 && (size_t)ret < count)
+			btrfs_delalloc_release_space(inode,
+						     count - (size_t)ret);
+	}
+out:
+	if (wakeup)
+		inode_dio_end(inode);
+	if (relock)
+		mutex_lock(&inode->i_mutex);
+
+	return ret;
+}
+
+#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
+static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	int	ret;
+
+	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
+	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
+}
+
+int btrfs_readpage(struct file *file, struct page *page)
+{
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
+}
+
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct extent_io_tree *tree;
+
+
+	if (current->flags & PF_MEMALLOC) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return 0;
+	}
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+}
+
+static int btrfs_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct extent_io_tree *tree;
+
+	tree = &BTRFS_I(mapping->host)->io_tree;
+	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
+}
+
+static int
+btrfs_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(mapping->host)->io_tree;
+	return extent_readpages(tree, mapping, pages, nr_pages,
+				btrfs_get_extent);
+}
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *map;
+	int ret;
+
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	map = &BTRFS_I(page->mapping->host)->extent_tree;
+	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+	if (ret == 1) {
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
+	}
+	return ret;
+}
+
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	if (PageWriteback(page) || PageDirty(page))
+		return 0;
+	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
+}
+
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
+{
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *tree;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	int inode_evicting = inode->i_state & I_FREEING;
+
+	/*
+	 * we have the page locked, so new writeback can't start,
+	 * and the dirty bit won't be cleared while we are here.
+	 *
+	 * Wait for IO on this page so that we can safely clear
+	 * the PagePrivate2 bit and do ordered accounting
+	 */
+	wait_on_page_writeback(page);
+
+	tree = &BTRFS_I(inode)->io_tree;
+	if (offset) {
+		btrfs_releasepage(page, GFP_NOFS);
+		return;
+	}
+
+	if (!inode_evicting)
+		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		/*
+		 * IO on this page will never be started, so we need
+		 * to account for any ordered extents now
+		 */
+		if (!inode_evicting)
+			clear_extent_bit(tree, page_start, page_end,
+					 EXTENT_DIRTY | EXTENT_DELALLOC |
+					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+					 EXTENT_DEFRAG, 1, 0, &cached_state,
+					 GFP_NOFS);
+		/*
+		 * whoever cleared the private bit is responsible
+		 * for the finish_ordered_io
+		 */
+		if (TestClearPagePrivate2(page)) {
+			struct btrfs_ordered_inode_tree *tree;
+			u64 new_len;
+
+			tree = &BTRFS_I(inode)->ordered_tree;
+
+			spin_lock_irq(&tree->lock);
+			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+			new_len = page_start - ordered->file_offset;
+			if (new_len < ordered->truncated_len)
+				ordered->truncated_len = new_len;
+			spin_unlock_irq(&tree->lock);
+
+			if (btrfs_dec_test_ordered_pending(inode, &ordered,
+							   page_start,
+							   PAGE_CACHE_SIZE, 1))
+				btrfs_finish_ordered_io(ordered);
+		}
+		btrfs_put_ordered_extent(ordered);
+		if (!inode_evicting) {
+			cached_state = NULL;
+			lock_extent_bits(tree, page_start, page_end, 0,
+					 &cached_state);
+		}
+	}
+
+	if (!inode_evicting) {
+		clear_extent_bit(tree, page_start, page_end,
+				 EXTENT_LOCKED | EXTENT_DIRTY |
+				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 1,
+				 &cached_state, GFP_NOFS);
+
+		__btrfs_releasepage(page, GFP_NOFS);
+	}
+
+	ClearPageChecked(page);
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
+	}
+}
+
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF.  Because
+ * vmtruncate() writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	char *kaddr;
+	unsigned long zero_start;
+	loff_t size;
+	int ret;
+	int reserved = 0;
+	u64 page_start;
+	u64 page_end;
+
+	sb_start_pagefault(inode->i_sb);
+	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	if (!ret) {
+		ret = file_update_time(vma->vm_file);
+		reserved = 1;
+	}
+	if (ret) {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else /* -ENOSPC, -EIO, etc */
+			ret = VM_FAULT_SIGBUS;
+		if (reserved)
+			goto out;
+		goto out_noreserve;
+	}
+
+	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+again:
+	lock_page(page);
+	size = i_size_read(inode);
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+	if ((page->mapping != inode->i_mapping) ||
+	    (page_start >= size)) {
+		/* page got truncated out from underneath us */
+		goto out_unlock;
+	}
+	wait_on_page_writeback(page);
+
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+	set_page_extent_mapped(page);
+
+	/*
+	 * we can't set the delalloc bits if there are pending ordered
+	 * extents.  Drop our locks and wait for them to finish
+	 */
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
+		unlock_page(page);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	/*
+	 * XXX - page_mkwrite gets called every time the page is dirtied, even
+	 * if it was already dirty, so for space accounting reasons we need to
+	 * clear any delalloc bits for the range we are fixing to save.  There
+	 * is probably a better way to do this, but for now keep consistent with
+	 * prepare_pages in the normal write path.
+	 */
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			  0, 0, &cached_state, GFP_NOFS);
+
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
+	if (ret) {
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
+		ret = VM_FAULT_SIGBUS;
+		goto out_unlock;
+	}
+	ret = 0;
+
+	/* page is wholly or partially inside EOF */
+	if (page_start + PAGE_CACHE_SIZE > size)
+		zero_start = size & ~PAGE_CACHE_MASK;
+	else
+		zero_start = PAGE_CACHE_SIZE;
+
+	if (zero_start != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	ClearPageChecked(page);
+	set_page_dirty(page);
+	SetPageUptodate(page);
+
+	BTRFS_I(inode)->last_trans = root->fs_info->generation;
+	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
+
+out_unlock:
+	if (!ret) {
+		sb_end_pagefault(inode->i_sb);
+		return VM_FAULT_LOCKED;
+	}
+	unlock_page(page);
+out:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+out_noreserve:
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+}
+
+static int btrfs_truncate(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *rsv;
+	int ret = 0;
+	int err = 0;
+	struct btrfs_trans_handle *trans;
+	u64 mask = root->sectorsize - 1;
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+
+	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+				       (u64)-1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
+	 * 3 things going on here
+	 *
+	 * 1) We need to reserve space for our orphan item and the space to
+	 * delete our orphan item.  Lord knows we don't want to have a dangling
+	 * orphan item because we didn't reserve space to remove it.
+	 *
+	 * 2) We need to reserve space to update our inode.
+	 *
+	 * 3) We need to have something to cache all the space that is going to
+	 * be free'd up by the truncate operation, but also have some slack
+	 * space reserved in case it uses space during the truncate (thank you
+	 * very much snapshotting).
+	 *
+	 * And we need these to all be seperate.  The fact is we can use alot of
+	 * space doing the truncate, and we have no earthly idea how much space
+	 * we will use, so we need the truncate reservation to be seperate so it
+	 * doesn't end up using space reserved for updating the inode or
+	 * removing the orphan item.  We also need to be able to stop the
+	 * transaction and start a new one, which means we need to be able to
+	 * update the inode several times, and we have no idea of knowing how
+	 * many times that will be, so we can't just reserve 1 item for the
+	 * entirety of the opration, so that has to be done seperately as well.
+	 * Then there is the orphan item, which does indeed need to be held on
+	 * to for the whole operation, and we need nobody to touch this reserved
+	 * space except the orphan code.
+	 *
+	 * So that leaves us with
+	 *
+	 * 1) root->orphan_block_rsv - for the orphan deletion.
+	 * 2) rsv - for the truncate reservation, which we will steal from the
+	 * transaction reservation.
+	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
+	 * updating the inode.
+	 */
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+	if (!rsv)
+		return -ENOMEM;
+	rsv->size = min_size;
+	rsv->failfast = 1;
+
+	/*
+	 * 1 for the truncate slack space
+	 * 1 for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 2);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	/* Migrate the slack space for the truncate to our reserve */
+	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+				      min_size);
+	BUG_ON(ret);
+
+	/*
+	 * So if we truncate and then write and fsync we normally would just
+	 * write the extents that changed, which is a problem if we need to
+	 * first truncate that entire inode.  So set this flag so we write out
+	 * all of the extents in the inode to the sync log so we're completely
+	 * safe.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+	trans->block_rsv = rsv;
+
+	while (1) {
+		ret = btrfs_truncate_inode_items(trans, root, inode,
+						 inode->i_size,
+						 BTRFS_EXTENT_DATA_KEY);
+		if (ret != -ENOSPC && ret != -EAGAIN) {
+			err = ret;
+			break;
+		}
+
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root);
+
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans)) {
+			ret = err = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+					      rsv, min_size);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
+	}
+
+	if (ret == 0 && inode->i_nlink > 0) {
+		trans->block_rsv = root->orphan_block_rsv;
+		ret = btrfs_orphan_del(trans, inode);
+		if (ret)
+			err = ret;
+	}
+
+	if (trans) {
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret && !err)
+			err = ret;
+
+		ret = btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root);
+	}
+
+out:
+	btrfs_free_block_rsv(root, rsv);
+
+	if (ret && !err)
+		err = ret;
+
+	return err;
+}
+
+/*
+ * create a new subvolume directory/inode (helper for the ioctl).
+ */
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *new_root,
+			     struct btrfs_root *parent_root,
+			     u64 new_dirid)
+{
+	struct inode *inode;
+	int err;
+	u64 index = 0;
+
+	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
+				new_dirid, new_dirid,
+				S_IFDIR | (~current_umask() & S_IRWXUGO),
+				&index);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
+
+	set_nlink(inode, 1);
+	btrfs_i_size_write(inode, 0);
+	unlock_new_inode(inode);
+
+	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
+	if (err)
+		btrfs_err(new_root->fs_info,
+			  "error inheriting subvolume %llu properties: %d",
+			  new_root->root_key.objectid, err);
+
+	err = btrfs_update_inode(trans, new_root, inode);
+
+	iput(inode);
+	return err;
+}
+
+struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+	struct btrfs_inode *ei;
+	struct inode *inode;
+
+	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+
+	ei->root = NULL;
+	ei->generation = 0;
+	ei->last_trans = 0;
+	ei->last_sub_trans = 0;
+	ei->logged_trans = 0;
+	ei->delalloc_bytes = 0;
+	ei->defrag_bytes = 0;
+	ei->disk_i_size = 0;
+	ei->flags = 0;
+	ei->csum_bytes = 0;
+	ei->index_cnt = (u64)-1;
+	ei->dir_index = 0;
+	ei->last_unlink_trans = 0;
+	ei->last_log_commit = 0;
+
+	spin_lock_init(&ei->lock);
+	ei->outstanding_extents = 0;
+	ei->reserved_extents = 0;
+
+	ei->runtime_flags = 0;
+	ei->force_compress = BTRFS_COMPRESS_NONE;
+
+	ei->delayed_node = NULL;
+
+	ei->i_otime.tv_sec = 0;
+	ei->i_otime.tv_nsec = 0;
+
+	inode = &ei->vfs_inode;
+	extent_map_tree_init(&ei->extent_tree);
+	extent_io_tree_init(&ei->io_tree, &inode->i_data);
+	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+	ei->io_tree.track_uptodate = 1;
+	ei->io_failure_tree.track_uptodate = 1;
+	atomic_set(&ei->sync_writers, 0);
+	mutex_init(&ei->log_mutex);
+	mutex_init(&ei->delalloc_mutex);
+	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+	INIT_LIST_HEAD(&ei->delalloc_inodes);
+	RB_CLEAR_NODE(&ei->rb_node);
+
+	return inode;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode)
+{
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+#endif
+
+static void btrfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
+void btrfs_destroy_inode(struct inode *inode)
+{
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	WARN_ON(!hlist_empty(&inode->i_dentry));
+	WARN_ON(inode->i_data.nrpages);
+	WARN_ON(BTRFS_I(inode)->outstanding_extents);
+	WARN_ON(BTRFS_I(inode)->reserved_extents);
+	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+	WARN_ON(BTRFS_I(inode)->csum_bytes);
+	WARN_ON(BTRFS_I(inode)->defrag_bytes);
+
+	/*
+	 * This can happen where we create an inode, but somebody else also
+	 * created the same inode and we need to destroy the one we already
+	 * created.
+	 */
+	if (!root)
+		goto free;
+
+	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+		     &BTRFS_I(inode)->runtime_flags)) {
+		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
+			btrfs_ino(inode));
+		atomic_dec(&root->orphan_inodes);
+	}
+
+	while (1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+		if (!ordered)
+			break;
+		else {
+			btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
+				ordered->file_offset, ordered->len);
+			btrfs_remove_ordered_extent(inode, ordered);
+			btrfs_put_ordered_extent(ordered);
+			btrfs_put_ordered_extent(ordered);
+		}
+	}
+	inode_tree_del(inode);
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+free:
+	call_rcu(&inode->i_rcu, btrfs_i_callback);
+}
+
+int btrfs_drop_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (root == NULL)
+		return 1;
+
+	/* the snap/subvol tree is on deleting */
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return 1;
+	else
+		return generic_drop_inode(inode);
+}
+
+static void init_once(void *foo)
+{
+	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+void btrfs_destroy_cachep(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	if (btrfs_inode_cachep)
+		kmem_cache_destroy(btrfs_inode_cachep);
+	if (btrfs_trans_handle_cachep)
+		kmem_cache_destroy(btrfs_trans_handle_cachep);
+	if (btrfs_transaction_cachep)
+		kmem_cache_destroy(btrfs_transaction_cachep);
+	if (btrfs_path_cachep)
+		kmem_cache_destroy(btrfs_path_cachep);
+	if (btrfs_free_space_cachep)
+		kmem_cache_destroy(btrfs_free_space_cachep);
+	if (btrfs_delalloc_work_cachep)
+		kmem_cache_destroy(btrfs_delalloc_work_cachep);
+}
+
+int btrfs_init_cachep(void)
+{
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
+			sizeof(struct btrfs_inode), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+	if (!btrfs_inode_cachep)
+		goto fail;
+
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
+			sizeof(struct btrfs_trans_handle), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_trans_handle_cachep)
+		goto fail;
+
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
+			sizeof(struct btrfs_transaction), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_transaction_cachep)
+		goto fail;
+
+	btrfs_path_cachep = kmem_cache_create("btrfs_path",
+			sizeof(struct btrfs_path), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_path_cachep)
+		goto fail;
+
+	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
+			sizeof(struct btrfs_free_space), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_free_space_cachep)
+		goto fail;
+
+	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+			sizeof(struct btrfs_delalloc_work), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+			NULL);
+	if (!btrfs_delalloc_work_cachep)
+		goto fail;
+
+	return 0;
+fail:
+	btrfs_destroy_cachep();
+	return -ENOMEM;
+}
+
+static int btrfs_getattr(struct vfsmount *mnt,
+			 struct dentry *dentry, struct kstat *stat)
+{
+	u64 delalloc_bytes;
+	struct inode *inode = d_inode(dentry);
+	u32 blocksize = inode->i_sb->s_blocksize;
+
+	generic_fillattr(inode, stat);
+	stat->dev = BTRFS_I(inode)->root->anon_dev;
+	stat->blksize = PAGE_CACHE_SIZE;
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+	spin_unlock(&BTRFS_I(inode)->lock);
+	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+			ALIGN(delalloc_bytes, blocksize)) >> 9;
+	return 0;
+}
+
+static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(old_dir)->root;
+	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+	struct inode *new_inode = d_inode(new_dentry);
+	struct inode *old_inode = d_inode(old_dentry);
+	struct timespec ctime = CURRENT_TIME;
+	u64 index = 0;
+	u64 root_objectid;
+	int ret;
+	u64 old_ino = btrfs_ino(old_inode);
+
+	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+		return -EPERM;
+
+	/* we only allow rename subvolume link between subvolumes */
+	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+		return -EXDEV;
+
+	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
+		return -ENOTEMPTY;
+
+	if (S_ISDIR(old_inode->i_mode) && new_inode &&
+	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+		return -ENOTEMPTY;
+
+
+	/* check for collisions, even if the  name isn't there */
+	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
+			     new_dentry->d_name.name,
+			     new_dentry->d_name.len);
+
+	if (ret) {
+		if (ret == -EEXIST) {
+			/* we shouldn't get
+			 * eexist without a new_inode */
+			if (WARN_ON(!new_inode)) {
+				return ret;
+			}
+		} else {
+			/* maybe -EOVERFLOW */
+			return ret;
+		}
+	}
+	ret = 0;
+
+	/*
+	 * we're using rename to replace one file with another.  Start IO on it
+	 * now so  we don't add too much work to the end of the transaction
+	 */
+	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
+		filemap_flush(old_inode->i_mapping);
+
+	/* close the racy window with snapshot create/destroy ioctl */
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+		down_read(&root->fs_info->subvol_sem);
+	/*
+	 * We want to reserve the absolute worst case amount of items.  So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume their normal
+	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 11);
+	if (IS_ERR(trans)) {
+                ret = PTR_ERR(trans);
+                goto out_notrans;
+        }
+
+	if (dest != root)
+		btrfs_record_root_in_trans(trans, dest);
+
+	ret = btrfs_set_inode_index(new_dir, &index);
+	if (ret)
+		goto out_fail;
+
+	BTRFS_I(old_inode)->dir_index = 0ULL;
+	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		/* force full log commit if subvolume involved. */
+		btrfs_set_log_full_commit(root->fs_info, trans);
+	} else {
+		ret = btrfs_insert_inode_ref(trans, dest,
+					     new_dentry->d_name.name,
+					     new_dentry->d_name.len,
+					     old_ino,
+					     btrfs_ino(new_dir), index);
+		if (ret)
+			goto out_fail;
+		/*
+		 * this is an ugly little race, but the rename is required
+		 * to make sure that if we crash, the inode is either at the
+		 * old name or the new one.  pinning the log transaction lets
+		 * us make sure we don't allow a log commit to come in after
+		 * we unlink the name but before we add the new name back in.
+		 */
+		btrfs_pin_log_trans(root);
+	}
+
+	inode_inc_iversion(old_dir);
+	inode_inc_iversion(new_dir);
+	inode_inc_iversion(old_inode);
+	old_dir->i_ctime = old_dir->i_mtime = ctime;
+	new_dir->i_ctime = new_dir->i_mtime = ctime;
+	old_inode->i_ctime = ctime;
+
+	if (old_dentry->d_parent != new_dentry->d_parent)
+		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
+	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+					old_dentry->d_name.name,
+					old_dentry->d_name.len);
+	} else {
+		ret = __btrfs_unlink_inode(trans, root, old_dir,
+					d_inode(old_dentry),
+					old_dentry->d_name.name,
+					old_dentry->d_name.len);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, old_inode);
+	}
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_fail;
+	}
+
+	if (new_inode) {
+		inode_inc_iversion(new_inode);
+		new_inode->i_ctime = CURRENT_TIME;
+		if (unlikely(btrfs_ino(new_inode) ==
+			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+			root_objectid = BTRFS_I(new_inode)->location.objectid;
+			ret = btrfs_unlink_subvol(trans, dest, new_dir,
+						root_objectid,
+						new_dentry->d_name.name,
+						new_dentry->d_name.len);
+			BUG_ON(new_inode->i_nlink == 0);
+		} else {
+			ret = btrfs_unlink_inode(trans, dest, new_dir,
+						 d_inode(new_dentry),
+						 new_dentry->d_name.name,
+						 new_dentry->d_name.len);
+		}
+		if (!ret && new_inode->i_nlink == 0)
+			ret = btrfs_orphan_add(trans, d_inode(new_dentry));
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out_fail;
+		}
+	}
+
+	ret = btrfs_add_link(trans, new_dir, old_inode,
+			     new_dentry->d_name.name,
+			     new_dentry->d_name.len, 0, index);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_fail;
+	}
+
+	if (old_inode->i_nlink == 1)
+		BTRFS_I(old_inode)->dir_index = index;
+
+	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		struct dentry *parent = new_dentry->d_parent;
+		btrfs_log_new_name(trans, old_inode, old_dir, parent);
+		btrfs_end_log_trans(root);
+	}
+out_fail:
+	btrfs_end_transaction(trans, root);
+out_notrans:
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+		up_read(&root->fs_info->subvol_sem);
+
+	return ret;
+}
+
+static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			 struct inode *new_dir, struct dentry *new_dentry,
+			 unsigned int flags)
+{
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
+	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+	struct btrfs_delalloc_work *delalloc_work;
+	struct inode *inode;
+
+	delalloc_work = container_of(work, struct btrfs_delalloc_work,
+				     work);
+	inode = delalloc_work->inode;
+	if (delalloc_work->wait) {
+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	} else {
+		filemap_flush(inode->i_mapping);
+		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+			filemap_flush(inode->i_mapping);
+	}
+
+	if (delalloc_work->delay_iput)
+		btrfs_add_delayed_iput(inode);
+	else
+		iput(inode);
+	complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						    int wait, int delay_iput)
+{
+	struct btrfs_delalloc_work *work;
+
+	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+	if (!work)
+		return NULL;
+
+	init_completion(&work->completion);
+	INIT_LIST_HEAD(&work->list);
+	work->inode = inode;
+	work->wait = wait;
+	work->delay_iput = delay_iput;
+	WARN_ON_ONCE(!inode);
+	btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+			btrfs_run_delalloc_work, NULL, NULL);
+
+	return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+	wait_for_completion(&work->completion);
+	kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
+				   int nr)
+{
+	struct btrfs_inode *binode;
+	struct inode *inode;
+	struct btrfs_delalloc_work *work, *next;
+	struct list_head works;
+	struct list_head splice;
+	int ret = 0;
+
+	INIT_LIST_HEAD(&works);
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->delalloc_mutex);
+	spin_lock(&root->delalloc_lock);
+	list_splice_init(&root->delalloc_inodes, &splice);
+	while (!list_empty(&splice)) {
+		binode = list_entry(splice.next, struct btrfs_inode,
+				    delalloc_inodes);
+
+		list_move_tail(&binode->delalloc_inodes,
+			       &root->delalloc_inodes);
+		inode = igrab(&binode->vfs_inode);
+		if (!inode) {
+			cond_resched_lock(&root->delalloc_lock);
+			continue;
+		}
+		spin_unlock(&root->delalloc_lock);
+
+		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+		if (!work) {
+			if (delay_iput)
+				btrfs_add_delayed_iput(inode);
+			else
+				iput(inode);
+			ret = -ENOMEM;
+			goto out;
+		}
+		list_add_tail(&work->list, &works);
+		btrfs_queue_work(root->fs_info->flush_workers,
+				 &work->work);
+		ret++;
+		if (nr != -1 && ret >= nr)
+			goto out;
+		cond_resched();
+		spin_lock(&root->delalloc_lock);
+	}
+	spin_unlock(&root->delalloc_lock);
+
+out:
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&root->delalloc_lock);
+		list_splice_tail(&splice, &root->delalloc_inodes);
+		spin_unlock(&root->delalloc_lock);
+	}
+	mutex_unlock(&root->delalloc_mutex);
+	return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+	int ret;
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+		return -EROFS;
+
+	ret = __start_delalloc_inodes(root, delay_iput, -1);
+	if (ret > 0)
+		ret = 0;
+	/*
+	 * the filemap_flush will queue IO into the worker threads, but
+	 * we have to make sure the IO is actually started and that
+	 * ordered extents get created before we return
+	 */
+	atomic_inc(&root->fs_info->async_submit_draining);
+	while (atomic_read(&root->fs_info->nr_async_submits) ||
+	      atomic_read(&root->fs_info->async_delalloc_pages)) {
+		wait_event(root->fs_info->async_submit_wait,
+		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+	}
+	atomic_dec(&root->fs_info->async_submit_draining);
+	return ret;
+}
+
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+			       int nr)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+	int ret;
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+		return -EROFS;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&fs_info->delalloc_root_mutex);
+	spin_lock(&fs_info->delalloc_root_lock);
+	list_splice_init(&fs_info->delalloc_roots, &splice);
+	while (!list_empty(&splice) && nr) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					delalloc_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		list_move_tail(&root->delalloc_root,
+			       &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
+
+		ret = __start_delalloc_inodes(root, delay_iput, nr);
+		btrfs_put_fs_root(root);
+		if (ret < 0)
+			goto out;
+
+		if (nr != -1) {
+			nr -= ret;
+			WARN_ON(nr < 0);
+		}
+		spin_lock(&fs_info->delalloc_root_lock);
+	}
+	spin_unlock(&fs_info->delalloc_root_lock);
+
+	ret = 0;
+	atomic_inc(&fs_info->async_submit_draining);
+	while (atomic_read(&fs_info->nr_async_submits) ||
+	      atomic_read(&fs_info->async_delalloc_pages)) {
+		wait_event(fs_info->async_submit_wait,
+		   (atomic_read(&fs_info->nr_async_submits) == 0 &&
+		    atomic_read(&fs_info->async_delalloc_pages) == 0));
+	}
+	atomic_dec(&fs_info->async_submit_draining);
+out:
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&fs_info->delalloc_root_lock);
+		list_splice_tail(&splice, &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
+	}
+	mutex_unlock(&fs_info->delalloc_root_mutex);
+	return ret;
+}
+
+static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct inode *inode = NULL;
+	int err;
+	int drop_inode = 0;
+	u64 objectid;
+	u64 index = 0;
+	int name_len;
+	int datasize;
+	unsigned long ptr;
+	struct btrfs_file_extent_item *ei;
+	struct extent_buffer *leaf;
+
+	name_len = strlen(symname);
+	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+		return -ENAMETOOLONG;
+
+	/*
+	 * 2 items for inode item and ref
+	 * 2 items for dir items
+	 * 1 item for xattr if selinux is on
+	 */
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	err = btrfs_find_free_ino(root, &objectid);
+	if (err)
+		goto out_unlock;
+
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+				dentry->d_name.len, btrfs_ino(dir), objectid,
+				S_IFLNK|S_IRWXUGO, &index);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_unlock;
+	}
+
+	/*
+	* If the active LSM wants to access the inode during
+	* d_instantiate it needs these. Smack checks to see
+	* if the filesystem supports xattrs by looking at the
+	* ops vector.
+	*/
+	inode->i_fop = &btrfs_file_operations;
+	inode->i_op = &btrfs_file_inode_operations;
+	inode->i_mapping->a_ops = &btrfs_aops;
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+	if (err)
+		goto out_unlock_inode;
+
+	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+	if (err)
+		goto out_unlock_inode;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out_unlock_inode;
+	}
+	key.objectid = btrfs_ino(inode);
+	key.offset = 0;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	datasize = btrfs_file_extent_calc_inline_size(name_len);
+	err = btrfs_insert_empty_item(trans, root, path, &key,
+				      datasize);
+	if (err) {
+		btrfs_free_path(path);
+		goto out_unlock_inode;
+	}
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+	btrfs_set_file_extent_type(leaf, ei,
+				   BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_compression(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
+	ptr = btrfs_file_extent_inline_start(ei);
+	write_extent_buffer(leaf, symname, ptr, name_len);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_free_path(path);
+
+	inode->i_op = &btrfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &btrfs_symlink_aops;
+	inode_set_bytes(inode, name_len);
+	btrfs_i_size_write(inode, name_len);
+	err = btrfs_update_inode(trans, root, inode);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock_inode;
+	}
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+
+out_unlock:
+	btrfs_end_transaction(trans, root);
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+
+out_unlock_inode:
+	drop_inode = 1;
+	unlock_new_inode(inode);
+	goto out_unlock;
+}
+
+static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+				       u64 start, u64 num_bytes, u64 min_size,
+				       loff_t actual_len, u64 *alloc_hint,
+				       struct btrfs_trans_handle *trans)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key ins;
+	u64 cur_offset = start;
+	u64 i_size;
+	u64 cur_bytes;
+	int ret = 0;
+	bool own_trans = true;
+
+	if (trans)
+		own_trans = false;
+	while (num_bytes > 0) {
+		if (own_trans) {
+			trans = btrfs_start_transaction(root, 3);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				break;
+			}
+		}
+
+		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+		cur_bytes = max(cur_bytes, min_size);
+		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
+					   *alloc_hint, &ins, 1, 0);
+		if (ret) {
+			if (own_trans)
+				btrfs_end_transaction(trans, root);
+			break;
+		}
+
+		ret = insert_reserved_file_extent(trans, inode,
+						  cur_offset, ins.objectid,
+						  ins.offset, ins.offset,
+						  ins.offset, 0, 0, 0,
+						  BTRFS_FILE_EXTENT_PREALLOC);
+		if (ret) {
+			btrfs_free_reserved_extent(root, ins.objectid,
+						   ins.offset, 0);
+			btrfs_abort_transaction(trans, root, ret);
+			if (own_trans)
+				btrfs_end_transaction(trans, root);
+			break;
+		}
+
+		btrfs_drop_extent_cache(inode, cur_offset,
+					cur_offset + ins.offset -1, 0);
+
+		em = alloc_extent_map();
+		if (!em) {
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+			goto next;
+		}
+
+		em->start = cur_offset;
+		em->orig_start = cur_offset;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
+		em->ram_bytes = ins.offset;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		em->generation = trans->transid;
+
+		while (1) {
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em, 1);
+			write_unlock(&em_tree->lock);
+			if (ret != -EEXIST)
+				break;
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + ins.offset - 1,
+						0);
+		}
+		free_extent_map(em);
+next:
+		num_bytes -= ins.offset;
+		cur_offset += ins.offset;
+		*alloc_hint = ins.objectid + ins.offset;
+
+		inode_inc_iversion(inode);
+		inode->i_ctime = CURRENT_TIME;
+		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
+		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		    (actual_len > inode->i_size) &&
+		    (cur_offset > inode->i_size)) {
+			if (cur_offset > actual_len)
+				i_size = actual_len;
+			else
+				i_size = cur_offset;
+			i_size_write(inode, i_size);
+			btrfs_ordered_update_i_size(inode, i_size, NULL);
+		}
+
+		ret = btrfs_update_inode(trans, root, inode);
+
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			if (own_trans)
+				btrfs_end_transaction(trans, root);
+			break;
+		}
+
+		if (own_trans)
+			btrfs_end_transaction(trans, root);
+	}
+	return ret;
+}
+
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
+{
+	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+					   min_size, actual_len, alloc_hint,
+					   NULL);
+}
+
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+				    struct btrfs_trans_handle *trans, int mode,
+				    u64 start, u64 num_bytes, u64 min_size,
+				    loff_t actual_len, u64 *alloc_hint)
+{
+	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+					   min_size, actual_len, alloc_hint, trans);
+}
+
+static int btrfs_set_page_dirty(struct page *page)
+{
+	return __set_page_dirty_nobuffers(page);
+}
+
+static int btrfs_permission(struct inode *inode, int mask)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	umode_t mode = inode->i_mode;
+
+	if (mask & MAY_WRITE &&
+	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+		if (btrfs_root_readonly(root))
+			return -EROFS;
+		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
+			return -EACCES;
+	}
+	return generic_permission(inode, mask);
+}
+
+static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode = NULL;
+	u64 objectid;
+	u64 index;
+	int ret = 0;
+
+	/*
+	 * 5 units required for adding orphan entry
+	 */
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_find_free_ino(root, &objectid);
+	if (ret)
+		goto out;
+
+	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+				btrfs_ino(dir), objectid, mode, &index);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		inode = NULL;
+		goto out;
+	}
+
+	inode->i_fop = &btrfs_file_operations;
+	inode->i_op = &btrfs_file_inode_operations;
+
+	inode->i_mapping->a_ops = &btrfs_aops;
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+	if (ret)
+		goto out_inode;
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret)
+		goto out_inode;
+	ret = btrfs_orphan_add(trans, inode);
+	if (ret)
+		goto out_inode;
+
+	/*
+	 * We set number of links to 0 in btrfs_new_inode(), and here we set
+	 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
+	 * through:
+	 *
+	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
+	 */
+	set_nlink(inode, 1);
+	unlock_new_inode(inode);
+	d_tmpfile(dentry, inode);
+	mark_inode_dirty(inode);
+
+out:
+	btrfs_end_transaction(trans, root);
+	if (ret)
+		iput(inode);
+	btrfs_balance_delayed_items(root);
+	btrfs_btree_balance_dirty(root);
+	return ret;
+
+out_inode:
+	unlock_new_inode(inode);
+	goto out;
+
+}
+
+/* Inspired by filemap_check_errors() */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+	int ret = 0;
+
+	if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
+	    test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
+		ret = -ENOSPC;
+	if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
+	    test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
+		ret = -EIO;
+
+	return ret;
+}
+
+static const struct inode_operations btrfs_dir_inode_operations = {
+	.getattr	= btrfs_getattr,
+	.lookup		= btrfs_lookup,
+	.create		= btrfs_create,
+	.unlink		= btrfs_unlink,
+	.link		= btrfs_link,
+	.mkdir		= btrfs_mkdir,
+	.rmdir		= btrfs_rmdir,
+	.rename2	= btrfs_rename2,
+	.symlink	= btrfs_symlink,
+	.setattr	= btrfs_setattr,
+	.mknod		= btrfs_mknod,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= btrfs_removexattr,
+	.permission	= btrfs_permission,
+	.get_acl	= btrfs_get_acl,
+	.set_acl	= btrfs_set_acl,
+	.update_time	= btrfs_update_time,
+	.tmpfile        = btrfs_tmpfile,
+};
+static const struct inode_operations btrfs_dir_ro_inode_operations = {
+	.lookup		= btrfs_lookup,
+	.permission	= btrfs_permission,
+	.get_acl	= btrfs_get_acl,
+	.set_acl	= btrfs_set_acl,
+	.update_time	= btrfs_update_time,
+};
+
+static const struct file_operations btrfs_dir_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= btrfs_real_readdir,
+	.unlocked_ioctl	= btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= btrfs_ioctl,
+#endif
+	.release        = btrfs_release_file,
+	.fsync		= btrfs_sync_file,
+};
+
+static struct extent_io_ops btrfs_extent_io_ops = {
+	.fill_delalloc = run_delalloc_range,
+	.submit_bio_hook = btrfs_submit_bio_hook,
+	.merge_bio_hook = btrfs_merge_bio_hook,
+	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
+	.writepage_start_hook = btrfs_writepage_start_hook,
+	.set_bit_hook = btrfs_set_bit_hook,
+	.clear_bit_hook = btrfs_clear_bit_hook,
+	.merge_extent_hook = btrfs_merge_extent_hook,
+	.split_extent_hook = btrfs_split_extent_hook,
+};
+
+/*
+ * btrfs doesn't support the bmap operation because swapfiles
+ * use bmap to make a mapping of extents in the file.  They assume
+ * these extents won't change over the life of the file and they
+ * use the bmap result to do IO directly to the drive.
+ *
+ * the btrfs bmap call would return logical addresses that aren't
+ * suitable for IO and they also will change frequently as COW
+ * operations happen.  So, swapfile + btrfs == corruption.
+ *
+ * For now we're avoiding this by dropping bmap.
+ */
+static const struct address_space_operations btrfs_aops = {
+	.readpage	= btrfs_readpage,
+	.writepage	= btrfs_writepage,
+	.writepages	= btrfs_writepages,
+	.readpages	= btrfs_readpages,
+	.direct_IO	= btrfs_direct_IO,
+	.invalidatepage = btrfs_invalidatepage,
+	.releasepage	= btrfs_releasepage,
+	.set_page_dirty	= btrfs_set_page_dirty,
+	.error_remove_page = generic_error_remove_page,
+};
+
+static const struct address_space_operations btrfs_symlink_aops = {
+	.readpage	= btrfs_readpage,
+	.writepage	= btrfs_writepage,
+	.invalidatepage = btrfs_invalidatepage,
+	.releasepage	= btrfs_releasepage,
+};
+
+static const struct inode_operations btrfs_file_inode_operations = {
+	.getattr	= btrfs_getattr,
+	.setattr	= btrfs_setattr,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
+	.listxattr      = btrfs_listxattr,
+	.removexattr	= btrfs_removexattr,
+	.permission	= btrfs_permission,
+	.fiemap		= btrfs_fiemap,
+	.get_acl	= btrfs_get_acl,
+	.set_acl	= btrfs_set_acl,
+	.update_time	= btrfs_update_time,
+};
+static const struct inode_operations btrfs_special_inode_operations = {
+	.getattr	= btrfs_getattr,
+	.setattr	= btrfs_setattr,
+	.permission	= btrfs_permission,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= btrfs_removexattr,
+	.get_acl	= btrfs_get_acl,
+	.set_acl	= btrfs_set_acl,
+	.update_time	= btrfs_update_time,
+};
+static const struct inode_operations btrfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.getattr	= btrfs_getattr,
+	.setattr	= btrfs_setattr,
+	.permission	= btrfs_permission,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= btrfs_removexattr,
+	.update_time	= btrfs_update_time,
+};
+
+const struct dentry_operations btrfs_dentry_operations = {
+	.d_delete	= btrfs_dentry_delete,
+	.d_release	= btrfs_dentry_release,
+};
diff --git a/kernel/fs/btrfs/ioctl.c b/kernel/fs/btrfs/ioctl.c
new file mode 100644
index 000000000..1c22c6518
--- /dev/null
+++ b/kernel/fs/btrfs/ioctl.c
@@ -0,0 +1,5359 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/security.h>
+#include <linux/xattr.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/uuid.h>
+#include <linux/btrfs.h>
+#include <linux/uaccess.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "locking.h"
+#include "inode-map.h"
+#include "backref.h"
+#include "rcu-string.h"
+#include "send.h"
+#include "dev-replace.h"
+#include "props.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#ifdef CONFIG_64BIT
+/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+ * structures are incorrect, as the timespec structure from userspace
+ * is 4 bytes too small. We define these alternatives here to teach
+ * the kernel about the 32-bit struct packing.
+ */
+struct btrfs_ioctl_timespec_32 {
+	__u64 sec;
+	__u32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_ioctl_received_subvol_args_32 {
+	char	uuid[BTRFS_UUID_SIZE];	/* in */
+	__u64	stransid;		/* in */
+	__u64	rtransid;		/* out */
+	struct btrfs_ioctl_timespec_32 stime; /* in */
+	struct btrfs_ioctl_timespec_32 rtime; /* out */
+	__u64	flags;			/* in */
+	__u64	reserved[16];		/* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+				struct btrfs_ioctl_received_subvol_args_32)
+#endif
+
+
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & ~FS_DIRSYNC_FL;
+	else
+		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
+}
+
+/*
+ * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ */
+static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+{
+	unsigned int iflags = 0;
+
+	if (flags & BTRFS_INODE_SYNC)
+		iflags |= FS_SYNC_FL;
+	if (flags & BTRFS_INODE_IMMUTABLE)
+		iflags |= FS_IMMUTABLE_FL;
+	if (flags & BTRFS_INODE_APPEND)
+		iflags |= FS_APPEND_FL;
+	if (flags & BTRFS_INODE_NODUMP)
+		iflags |= FS_NODUMP_FL;
+	if (flags & BTRFS_INODE_NOATIME)
+		iflags |= FS_NOATIME_FL;
+	if (flags & BTRFS_INODE_DIRSYNC)
+		iflags |= FS_DIRSYNC_FL;
+	if (flags & BTRFS_INODE_NODATACOW)
+		iflags |= FS_NOCOW_FL;
+
+	if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
+		iflags |= FS_COMPR_FL;
+	else if (flags & BTRFS_INODE_NOCOMPRESS)
+		iflags |= FS_NOCOMP_FL;
+
+	return iflags;
+}
+
+/*
+ * Update inode->i_flags based on the btrfs internal flags.
+ */
+void btrfs_update_iflags(struct inode *inode)
+{
+	struct btrfs_inode *ip = BTRFS_I(inode);
+	unsigned int new_fl = 0;
+
+	if (ip->flags & BTRFS_INODE_SYNC)
+		new_fl |= S_SYNC;
+	if (ip->flags & BTRFS_INODE_IMMUTABLE)
+		new_fl |= S_IMMUTABLE;
+	if (ip->flags & BTRFS_INODE_APPEND)
+		new_fl |= S_APPEND;
+	if (ip->flags & BTRFS_INODE_NOATIME)
+		new_fl |= S_NOATIME;
+	if (ip->flags & BTRFS_INODE_DIRSYNC)
+		new_fl |= S_DIRSYNC;
+
+	set_mask_bits(&inode->i_flags,
+		      S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+		      new_fl);
+}
+
+/*
+ * Inherit flags from the parent inode.
+ *
+ * Currently only the compression flags and the cow flags are inherited.
+ */
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+{
+	unsigned int flags;
+
+	if (!dir)
+		return;
+
+	flags = BTRFS_I(dir)->flags;
+
+	if (flags & BTRFS_INODE_NOCOMPRESS) {
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+	} else if (flags & BTRFS_INODE_COMPRESS) {
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+	}
+
+	if (flags & BTRFS_INODE_NODATACOW) {
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+		if (S_ISREG(inode->i_mode))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+	}
+
+	btrfs_update_iflags(inode);
+}
+
+static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
+{
+	struct btrfs_inode *ip = BTRFS_I(file_inode(file));
+	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+
+	if (copy_to_user(arg, &flags, sizeof(flags)))
+		return -EFAULT;
+	return 0;
+}
+
+static int check_flags(unsigned int flags)
+{
+	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+		      FS_NOATIME_FL | FS_NODUMP_FL | \
+		      FS_SYNC_FL | FS_DIRSYNC_FL | \
+		      FS_NOCOMP_FL | FS_COMPR_FL |
+		      FS_NOCOW_FL))
+		return -EOPNOTSUPP;
+
+	if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_inode *ip = BTRFS_I(inode);
+	struct btrfs_root *root = ip->root;
+	struct btrfs_trans_handle *trans;
+	unsigned int flags, oldflags;
+	int ret;
+	u64 ip_oldflags;
+	unsigned int i_oldflags;
+	umode_t mode;
+
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	if (copy_from_user(&flags, arg, sizeof(flags)))
+		return -EFAULT;
+
+	ret = check_flags(flags);
+	if (ret)
+		return ret;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	ip_oldflags = ip->flags;
+	i_oldflags = inode->i_flags;
+	mode = inode->i_mode;
+
+	flags = btrfs_mask_flags(inode->i_mode, flags);
+	oldflags = btrfs_flags_to_ioctl(ip->flags);
+	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+	}
+
+	if (flags & FS_SYNC_FL)
+		ip->flags |= BTRFS_INODE_SYNC;
+	else
+		ip->flags &= ~BTRFS_INODE_SYNC;
+	if (flags & FS_IMMUTABLE_FL)
+		ip->flags |= BTRFS_INODE_IMMUTABLE;
+	else
+		ip->flags &= ~BTRFS_INODE_IMMUTABLE;
+	if (flags & FS_APPEND_FL)
+		ip->flags |= BTRFS_INODE_APPEND;
+	else
+		ip->flags &= ~BTRFS_INODE_APPEND;
+	if (flags & FS_NODUMP_FL)
+		ip->flags |= BTRFS_INODE_NODUMP;
+	else
+		ip->flags &= ~BTRFS_INODE_NODUMP;
+	if (flags & FS_NOATIME_FL)
+		ip->flags |= BTRFS_INODE_NOATIME;
+	else
+		ip->flags &= ~BTRFS_INODE_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		ip->flags |= BTRFS_INODE_DIRSYNC;
+	else
+		ip->flags &= ~BTRFS_INODE_DIRSYNC;
+	if (flags & FS_NOCOW_FL) {
+		if (S_ISREG(mode)) {
+			/*
+			 * It's safe to turn csums off here, no extents exist.
+			 * Otherwise we want the flag to reflect the real COW
+			 * status of the file and will not set it.
+			 */
+			if (inode->i_size == 0)
+				ip->flags |= BTRFS_INODE_NODATACOW
+					   | BTRFS_INODE_NODATASUM;
+		} else {
+			ip->flags |= BTRFS_INODE_NODATACOW;
+		}
+	} else {
+		/*
+		 * Revert back under same assuptions as above
+		 */
+		if (S_ISREG(mode)) {
+			if (inode->i_size == 0)
+				ip->flags &= ~(BTRFS_INODE_NODATACOW
+				             | BTRFS_INODE_NODATASUM);
+		} else {
+			ip->flags &= ~BTRFS_INODE_NODATACOW;
+		}
+	}
+
+	/*
+	 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
+	 * flag may be changed automatically if compression code won't make
+	 * things smaller.
+	 */
+	if (flags & FS_NOCOMP_FL) {
+		ip->flags &= ~BTRFS_INODE_COMPRESS;
+		ip->flags |= BTRFS_INODE_NOCOMPRESS;
+
+		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+		if (ret && ret != -ENODATA)
+			goto out_drop;
+	} else if (flags & FS_COMPR_FL) {
+		const char *comp;
+
+		ip->flags |= BTRFS_INODE_COMPRESS;
+		ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+
+		if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+			comp = "lzo";
+		else
+			comp = "zlib";
+		ret = btrfs_set_prop(inode, "btrfs.compression",
+				     comp, strlen(comp), 0);
+		if (ret)
+			goto out_drop;
+
+	} else {
+		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+		if (ret && ret != -ENODATA)
+			goto out_drop;
+		ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_drop;
+	}
+
+	btrfs_update_iflags(inode);
+	inode_inc_iversion(inode);
+	inode->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, root, inode);
+
+	btrfs_end_transaction(trans, root);
+ out_drop:
+	if (ret) {
+		ip->flags = ip_oldflags;
+		inode->i_flags = i_oldflags;
+	}
+
+ out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+{
+	struct inode *inode = file_inode(file);
+
+	return put_user(inode->i_generation, arg);
+}
+
+static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+	struct btrfs_device *device;
+	struct request_queue *q;
+	struct fstrim_range range;
+	u64 minlen = ULLONG_MAX;
+	u64 num_devices = 0;
+	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
+				dev_list) {
+		if (!device->bdev)
+			continue;
+		q = bdev_get_queue(device->bdev);
+		if (blk_queue_discard(q)) {
+			num_devices++;
+			minlen = min((u64)q->limits.discard_granularity,
+				     minlen);
+		}
+	}
+	rcu_read_unlock();
+
+	if (!num_devices)
+		return -EOPNOTSUPP;
+	if (copy_from_user(&range, arg, sizeof(range)))
+		return -EFAULT;
+	if (range.start > total_bytes ||
+	    range.len < fs_info->sb->s_blocksize)
+		return -EINVAL;
+
+	range.len = min(range.len, total_bytes - range.start);
+	range.minlen = max(range.minlen, minlen);
+	ret = btrfs_trim_fs(fs_info->tree_root, &range);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(arg, &range, sizeof(range)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int btrfs_is_empty_uuid(u8 *uuid)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_UUID_SIZE; i++) {
+		if (uuid[i])
+			return 0;
+	}
+	return 1;
+}
+
+static noinline int create_subvol(struct inode *dir,
+				  struct dentry *dentry,
+				  char *name, int namelen,
+				  u64 *async_transid,
+				  struct btrfs_qgroup_inherit *inherit)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_root *new_root;
+	struct btrfs_block_rsv block_rsv;
+	struct timespec cur_time = CURRENT_TIME;
+	struct inode *inode;
+	int ret;
+	int err;
+	u64 objectid;
+	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+	u64 index = 0;
+	u64 qgroup_reserved;
+	uuid_le new_uuid;
+
+	ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
+	if (ret)
+		return ret;
+
+	/*
+	 * Don't create subvolume whose level is not zero. Or qgroup will be
+	 * screwed up since it assume subvolme qgroup's level to be 0.
+	 */
+	if (btrfs_qgroup_level(objectid))
+		return -ENOSPC;
+
+	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+	/*
+	 * The same as the snapshot creation, please see the comment
+	 * of create_snapshot().
+	 */
+	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+					       8, &qgroup_reserved, false);
+	if (ret)
+		return ret;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_subvolume_release_metadata(root, &block_rsv,
+						 qgroup_reserved);
+		return ret;
+	}
+	trans->block_rsv = &block_rsv;
+	trans->bytes_reserved = block_rsv.size;
+
+	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
+	if (ret)
+		goto fail;
+
+	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		goto fail;
+	}
+
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(leaf, objectid);
+
+	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
+			    BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+			    btrfs_header_chunk_tree_uuid(leaf),
+			    BTRFS_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	memset(&root_item, 0, sizeof(root_item));
+
+	inode_item = &root_item.inode;
+	btrfs_set_stack_inode_generation(inode_item, 1);
+	btrfs_set_stack_inode_size(inode_item, 3);
+	btrfs_set_stack_inode_nlink(inode_item, 1);
+	btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
+	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
+
+	btrfs_set_root_flags(&root_item, 0);
+	btrfs_set_root_limit(&root_item, 0);
+	btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
+
+	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_generation(&root_item, trans->transid);
+	btrfs_set_root_level(&root_item, 0);
+	btrfs_set_root_refs(&root_item, 1);
+	btrfs_set_root_used(&root_item, leaf->len);
+	btrfs_set_root_last_snapshot(&root_item, 0);
+
+	btrfs_set_root_generation_v2(&root_item,
+			btrfs_root_generation(&root_item));
+	uuid_le_gen(&new_uuid);
+	memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
+	btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
+	btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
+	root_item.ctime = root_item.otime;
+	btrfs_set_root_ctransid(&root_item, trans->transid);
+	btrfs_set_root_otransid(&root_item, trans->transid);
+
+	btrfs_tree_unlock(leaf);
+	free_extent_buffer(leaf);
+	leaf = NULL;
+
+	btrfs_set_root_dirid(&root_item, new_dirid);
+
+	key.objectid = objectid;
+	key.offset = 0;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&root_item);
+	if (ret)
+		goto fail;
+
+	key.offset = (u64)-1;
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+	if (IS_ERR(new_root)) {
+		btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
+		ret = PTR_ERR(new_root);
+		goto fail;
+	}
+
+	btrfs_record_root_in_trans(trans, new_root);
+
+	ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+	if (ret) {
+		/* We potentially lose an unused inode item here */
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	/*
+	 * insert the directory item
+	 */
+	ret = btrfs_set_inode_index(dir, &index);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	ret = btrfs_insert_dir_item(trans, root,
+				    name, namelen, dir, &key,
+				    BTRFS_FT_DIR, index);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	btrfs_i_size_write(dir, dir->i_size + namelen * 2);
+	ret = btrfs_update_inode(trans, root, dir);
+	BUG_ON(ret);
+
+	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+				 objectid, root->root_key.objectid,
+				 btrfs_ino(dir), index, name, namelen);
+	BUG_ON(ret);
+
+	ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+				  root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+				  objectid);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+
+fail:
+	trans->block_rsv = NULL;
+	trans->bytes_reserved = 0;
+	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+
+	if (async_transid) {
+		*async_transid = trans->transid;
+		err = btrfs_commit_transaction_async(trans, root, 1);
+		if (err)
+			err = btrfs_commit_transaction(trans, root);
+	} else {
+		err = btrfs_commit_transaction(trans, root);
+	}
+	if (err && !ret)
+		ret = err;
+
+	if (!ret) {
+		inode = btrfs_lookup_dentry(dir, dentry);
+		if (IS_ERR(inode))
+			return PTR_ERR(inode);
+		d_instantiate(dentry, inode);
+	}
+	return ret;
+}
+
+static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
+{
+	s64 writers;
+	DEFINE_WAIT(wait);
+
+	do {
+		prepare_to_wait(&root->subv_writers->wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+
+		writers = percpu_counter_sum(&root->subv_writers->counter);
+		if (writers)
+			schedule();
+
+		finish_wait(&root->subv_writers->wait, &wait);
+	} while (writers);
+}
+
+static int create_snapshot(struct btrfs_root *root, struct inode *dir,
+			   struct dentry *dentry, char *name, int namelen,
+			   u64 *async_transid, bool readonly,
+			   struct btrfs_qgroup_inherit *inherit)
+{
+	struct inode *inode;
+	struct btrfs_pending_snapshot *pending_snapshot;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		return -EINVAL;
+
+	atomic_inc(&root->will_be_snapshoted);
+	smp_mb__after_atomic();
+	btrfs_wait_for_no_snapshoting_writes(root);
+
+	ret = btrfs_start_delalloc_inodes(root, 0);
+	if (ret)
+		goto out;
+
+	btrfs_wait_ordered_extents(root, -1);
+
+	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+	if (!pending_snapshot) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+			     BTRFS_BLOCK_RSV_TEMP);
+	/*
+	 * 1 - parent dir inode
+	 * 2 - dir entries
+	 * 1 - root item
+	 * 2 - root ref/backref
+	 * 1 - root of snapshot
+	 * 1 - UUID item
+	 */
+	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
+					&pending_snapshot->block_rsv, 8,
+					&pending_snapshot->qgroup_reserved,
+					false);
+	if (ret)
+		goto free;
+
+	pending_snapshot->dentry = dentry;
+	pending_snapshot->root = root;
+	pending_snapshot->readonly = readonly;
+	pending_snapshot->dir = dir;
+	pending_snapshot->inherit = inherit;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto fail;
+	}
+
+	spin_lock(&root->fs_info->trans_lock);
+	list_add(&pending_snapshot->list,
+		 &trans->transaction->pending_snapshots);
+	spin_unlock(&root->fs_info->trans_lock);
+	if (async_transid) {
+		*async_transid = trans->transid;
+		ret = btrfs_commit_transaction_async(trans,
+				     root->fs_info->extent_root, 1);
+		if (ret)
+			ret = btrfs_commit_transaction(trans, root);
+	} else {
+		ret = btrfs_commit_transaction(trans,
+					       root->fs_info->extent_root);
+	}
+	if (ret)
+		goto fail;
+
+	ret = pending_snapshot->error;
+	if (ret)
+		goto fail;
+
+	ret = btrfs_orphan_cleanup(pending_snapshot->snap);
+	if (ret)
+		goto fail;
+
+	inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto fail;
+	}
+
+	d_instantiate(dentry, inode);
+	ret = 0;
+fail:
+	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
+					 &pending_snapshot->block_rsv,
+					 pending_snapshot->qgroup_reserved);
+free:
+	kfree(pending_snapshot);
+out:
+	if (atomic_dec_and_test(&root->will_be_snapshoted))
+		wake_up_atomic_t(&root->will_be_snapshoted);
+	return ret;
+}
+
+/*  copy of may_delete in fs/namei.c()
+ *	Check whether we can remove a link victim from directory dir, check
+ *  whether the type of victim is right.
+ *  1. We can't do it if dir is read-only (done in permission())
+ *  2. We should have write and exec permissions on dir
+ *  3. We can't remove anything from append-only dir
+ *  4. We can't do anything with immutable dir (done in permission())
+ *  5. If the sticky bit on dir is set we should either
+ *	a. be owner of dir, or
+ *	b. be owner of victim, or
+ *	c. have CAP_FOWNER capability
+ *  6. If the victim is append-only or immutable we can't do antyhing with
+ *     links pointing to it.
+ *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ *  9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *     nfs_async_unlink().
+ */
+
+static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
+{
+	int error;
+
+	if (d_really_is_negative(victim))
+		return -ENOENT;
+
+	BUG_ON(d_inode(victim->d_parent) != dir);
+	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
+
+	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+	if (IS_APPEND(dir))
+		return -EPERM;
+	if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
+	    IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
+		return -EPERM;
+	if (isdir) {
+		if (!d_is_dir(victim))
+			return -ENOTDIR;
+		if (IS_ROOT(victim))
+			return -EBUSY;
+	} else if (d_is_dir(victim))
+		return -EISDIR;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+		return -EBUSY;
+	return 0;
+}
+
+/* copy of may_create in fs/namei.c() */
+static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+{
+	if (d_really_is_positive(child))
+		return -EEXIST;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * Create a new subvolume below @parent.  This is largely modeled after
+ * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
+ * inside this filesystem so it's quite a bit simpler.
+ */
+static noinline int btrfs_mksubvol(struct path *parent,
+				   char *name, int namelen,
+				   struct btrfs_root *snap_src,
+				   u64 *async_transid, bool readonly,
+				   struct btrfs_qgroup_inherit *inherit)
+{
+	struct inode *dir  = d_inode(parent->dentry);
+	struct dentry *dentry;
+	int error;
+
+	error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	if (error == -EINTR)
+		return error;
+
+	dentry = lookup_one_len(name, parent->dentry, namelen);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+
+	error = -EEXIST;
+	if (d_really_is_positive(dentry))
+		goto out_dput;
+
+	error = btrfs_may_create(dir, dentry);
+	if (error)
+		goto out_dput;
+
+	/*
+	 * even if this name doesn't exist, we may get hash collisions.
+	 * check for them now when we can safely fail
+	 */
+	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+					       dir->i_ino, name,
+					       namelen);
+	if (error)
+		goto out_dput;
+
+	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+
+	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+		goto out_up_read;
+
+	if (snap_src) {
+		error = create_snapshot(snap_src, dir, dentry, name, namelen,
+					async_transid, readonly, inherit);
+	} else {
+		error = create_subvol(dir, dentry, name, namelen,
+				      async_transid, inherit);
+	}
+	if (!error)
+		fsnotify_mkdir(dir, dentry);
+out_up_read:
+	up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+out_dput:
+	dput(dentry);
+out_unlock:
+	mutex_unlock(&dir->i_mutex);
+	return error;
+}
+
+/*
+ * When we're defragging a range, we don't want to kick it off again
+ * if it is really just waiting for delalloc to send it down.
+ * If we find a nice big extent or delalloc range for the bytes in the
+ * file you want to defrag, we return 0 to let you know to skip this
+ * part of the file
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 end;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+	read_unlock(&em_tree->lock);
+
+	if (em) {
+		end = extent_map_end(em);
+		free_extent_map(em);
+		if (end - offset > thresh)
+			return 0;
+	}
+	/* if we already have a nice delalloc here, just stop */
+	thresh /= 2;
+	end = count_range_bits(io_tree, &offset, offset + thresh,
+			       thresh, EXTENT_DELALLOC, 1);
+	if (end >= thresh)
+		return 0;
+	return 1;
+}
+
+/*
+ * helper function to walk through a file and find extents
+ * newer than a specific transid, and smaller than thresh.
+ *
+ * This is used by the defragging code to find new and small
+ * extents
+ */
+static int find_new_extents(struct btrfs_root *root,
+			    struct inode *inode, u64 newer_than,
+			    u64 *off, u32 thresh)
+{
+	struct btrfs_path *path;
+	struct btrfs_key min_key;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *extent;
+	int type;
+	int ret;
+	u64 ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	min_key.objectid = ino;
+	min_key.type = BTRFS_EXTENT_DATA_KEY;
+	min_key.offset = *off;
+
+	while (1) {
+		ret = btrfs_search_forward(root, &min_key, path, newer_than);
+		if (ret != 0)
+			goto none;
+process_slot:
+		if (min_key.objectid != ino)
+			goto none;
+		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
+			goto none;
+
+		leaf = path->nodes[0];
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_file_extent_item);
+
+		type = btrfs_file_extent_type(leaf, extent);
+		if (type == BTRFS_FILE_EXTENT_REG &&
+		    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
+		    check_defrag_in_cache(inode, min_key.offset, thresh)) {
+			*off = min_key.offset;
+			btrfs_free_path(path);
+			return 0;
+		}
+
+		path->slots[0]++;
+		if (path->slots[0] < btrfs_header_nritems(leaf)) {
+			btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+			goto process_slot;
+		}
+
+		if (min_key.offset == (u64)-1)
+			goto none;
+
+		min_key.offset++;
+		btrfs_release_path(path);
+	}
+none:
+	btrfs_free_path(path);
+	return -ENOENT;
+}
+
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em;
+	u64 len = PAGE_CACHE_SIZE;
+
+	/*
+	 * hopefully we have this extent in the tree already, try without
+	 * the full extent lock
+	 */
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		struct extent_state *cached = NULL;
+		u64 end = start + len - 1;
+
+		/* get the big lock and read metadata off disk */
+		lock_extent_bits(io_tree, start, end, 0, &cached);
+		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+		unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
+
+		if (IS_ERR(em))
+			return NULL;
+	}
+
+	return em;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+{
+	struct extent_map *next;
+	bool ret = true;
+
+	/* this is the last extent */
+	if (em->start + em->len >= i_size_read(inode))
+		return false;
+
+	next = defrag_lookup_extent(inode, em->start + em->len);
+	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+		ret = false;
+	else if ((em->block_start + em->block_len == next->block_start) &&
+		 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+		ret = false;
+
+	free_extent_map(next);
+	return ret;
+}
+
+static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
+			       u64 *last_len, u64 *skip, u64 *defrag_end,
+			       int compress)
+{
+	struct extent_map *em;
+	int ret = 1;
+	bool next_mergeable = true;
+
+	/*
+	 * make sure that once we start defragging an extent, we keep on
+	 * defragging it
+	 */
+	if (start < *defrag_end)
+		return 1;
+
+	*skip = 0;
+
+	em = defrag_lookup_extent(inode, start);
+	if (!em)
+		return 0;
+
+	/* this will cover holes, and inline extents */
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		ret = 0;
+		goto out;
+	}
+
+	next_mergeable = defrag_check_next_extent(inode, em);
+	/*
+	 * we hit a real extent, if it is big or the next extent is not a
+	 * real extent, don't bother defragging it
+	 */
+	if (!compress && (*last_len == 0 || *last_len >= thresh) &&
+	    (em->len >= thresh || !next_mergeable))
+		ret = 0;
+out:
+	/*
+	 * last_len ends up being a counter of how many bytes we've defragged.
+	 * every time we choose not to defrag an extent, we reset *last_len
+	 * so that the next tiny extent will force a defrag.
+	 *
+	 * The end result of this is that tiny extents before a single big
+	 * extent will force at least part of that big extent to be defragged.
+	 */
+	if (ret) {
+		*defrag_end = extent_map_end(em);
+	} else {
+		*last_len = 0;
+		*skip = extent_map_end(em);
+		*defrag_end = 0;
+	}
+
+	free_extent_map(em);
+	return ret;
+}
+
+/*
+ * it doesn't do much good to defrag one or two pages
+ * at a time.  This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+				    struct page **pages,
+				    unsigned long start_index,
+				    unsigned long num_pages)
+{
+	unsigned long file_end;
+	u64 isize = i_size_read(inode);
+	u64 page_start;
+	u64 page_end;
+	u64 page_cnt;
+	int ret;
+	int i;
+	int i_done;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	struct extent_io_tree *tree;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+
+	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+	if (!isize || start_index > file_end)
+		return 0;
+
+	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+
+	ret = btrfs_delalloc_reserve_space(inode,
+					   page_cnt << PAGE_CACHE_SHIFT);
+	if (ret)
+		return ret;
+	i_done = 0;
+	tree = &BTRFS_I(inode)->io_tree;
+
+	/* step one, lock all the pages */
+	for (i = 0; i < page_cnt; i++) {
+		struct page *page;
+again:
+		page = find_or_create_page(inode->i_mapping,
+					   start_index + i, mask);
+		if (!page)
+			break;
+
+		page_start = page_offset(page);
+		page_end = page_start + PAGE_CACHE_SIZE - 1;
+		while (1) {
+			lock_extent_bits(tree, page_start, page_end,
+					 0, &cached_state);
+			ordered = btrfs_lookup_ordered_extent(inode,
+							      page_start);
+			unlock_extent_cached(tree, page_start, page_end,
+					     &cached_state, GFP_NOFS);
+			if (!ordered)
+				break;
+
+			unlock_page(page);
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			lock_page(page);
+			/*
+			 * we unlocked the page above, so we need check if
+			 * it was released or not.
+			 */
+			if (page->mapping != inode->i_mapping) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto again;
+			}
+		}
+
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EIO;
+				break;
+			}
+		}
+
+		if (page->mapping != inode->i_mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
+
+		pages[i] = page;
+		i_done++;
+	}
+	if (!i_done || ret)
+		goto out;
+
+	if (!(inode->i_sb->s_flags & MS_ACTIVE))
+		goto out;
+
+	/*
+	 * so now we have a nice long stream of locked
+	 * and up to date pages, lets wait on them
+	 */
+	for (i = 0; i < i_done; i++)
+		wait_on_page_writeback(pages[i]);
+
+	page_start = page_offset(pages[0]);
+	page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree,
+			 page_start, page_end - 1, 0, &cached_state);
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+			  &cached_state, GFP_NOFS);
+
+	if (i_done != page_cnt) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		BTRFS_I(inode)->outstanding_extents++;
+		spin_unlock(&BTRFS_I(inode)->lock);
+		btrfs_delalloc_release_space(inode,
+				     (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+	}
+
+
+	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+			  &cached_state, GFP_NOFS);
+
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+			     page_start, page_end - 1, &cached_state,
+			     GFP_NOFS);
+
+	for (i = 0; i < i_done; i++) {
+		clear_page_dirty_for_io(pages[i]);
+		ClearPageChecked(pages[i]);
+		set_page_extent_mapped(pages[i]);
+		set_page_dirty(pages[i]);
+		unlock_page(pages[i]);
+		page_cache_release(pages[i]);
+	}
+	return i_done;
+out:
+	for (i = 0; i < i_done; i++) {
+		unlock_page(pages[i]);
+		page_cache_release(pages[i]);
+	}
+	btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+	return ret;
+
+}
+
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+		      struct btrfs_ioctl_defrag_range_args *range,
+		      u64 newer_than, unsigned long max_to_defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct file_ra_state *ra = NULL;
+	unsigned long last_index;
+	u64 isize = i_size_read(inode);
+	u64 last_len = 0;
+	u64 skip = 0;
+	u64 defrag_end = 0;
+	u64 newer_off = range->start;
+	unsigned long i;
+	unsigned long ra_index = 0;
+	int ret;
+	int defrag_count = 0;
+	int compress_type = BTRFS_COMPRESS_ZLIB;
+	u32 extent_thresh = range->extent_thresh;
+	unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+	unsigned long cluster = max_cluster;
+	u64 new_align = ~((u64)128 * 1024 - 1);
+	struct page **pages = NULL;
+
+	if (isize == 0)
+		return 0;
+
+	if (range->start >= isize)
+		return -EINVAL;
+
+	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+		if (range->compress_type > BTRFS_COMPRESS_TYPES)
+			return -EINVAL;
+		if (range->compress_type)
+			compress_type = range->compress_type;
+	}
+
+	if (extent_thresh == 0)
+		extent_thresh = 256 * 1024;
+
+	/*
+	 * if we were not given a file, allocate a readahead
+	 * context
+	 */
+	if (!file) {
+		ra = kzalloc(sizeof(*ra), GFP_NOFS);
+		if (!ra)
+			return -ENOMEM;
+		file_ra_state_init(ra, inode->i_mapping);
+	} else {
+		ra = &file->f_ra;
+	}
+
+	pages = kmalloc_array(max_cluster, sizeof(struct page *),
+			GFP_NOFS);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out_ra;
+	}
+
+	/* find the last page to defrag */
+	if (range->start + range->len > range->start) {
+		last_index = min_t(u64, isize - 1,
+			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+	} else {
+		last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	if (newer_than) {
+		ret = find_new_extents(root, inode, newer_than,
+				       &newer_off, 64 * 1024);
+		if (!ret) {
+			range->start = newer_off;
+			/*
+			 * we always align our defrag to help keep
+			 * the extents in the file evenly spaced
+			 */
+			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+		} else
+			goto out_ra;
+	} else {
+		i = range->start >> PAGE_CACHE_SHIFT;
+	}
+	if (!max_to_defrag)
+		max_to_defrag = last_index + 1;
+
+	/*
+	 * make writeback starts from i, so the defrag range can be
+	 * written sequentially.
+	 */
+	if (i < inode->i_mapping->writeback_index)
+		inode->i_mapping->writeback_index = i;
+
+	while (i <= last_index && defrag_count < max_to_defrag &&
+	       (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
+		/*
+		 * make sure we stop running if someone unmounts
+		 * the FS
+		 */
+		if (!(inode->i_sb->s_flags & MS_ACTIVE))
+			break;
+
+		if (btrfs_defrag_cancelled(root->fs_info)) {
+			printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+					 extent_thresh, &last_len, &skip,
+					 &defrag_end, range->flags &
+					 BTRFS_DEFRAG_RANGE_COMPRESS)) {
+			unsigned long next;
+			/*
+			 * the should_defrag function tells us how much to skip
+			 * bump our counter by the suggested amount
+			 */
+			next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
+			i = max(i + 1, next);
+			continue;
+		}
+
+		if (!newer_than) {
+			cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
+				   PAGE_CACHE_SHIFT) - i;
+			cluster = min(cluster, max_cluster);
+		} else {
+			cluster = max_cluster;
+		}
+
+		if (i + cluster > ra_index) {
+			ra_index = max(i, ra_index);
+			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
+				       cluster);
+			ra_index += max_cluster;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+			BTRFS_I(inode)->force_compress = compress_type;
+		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+		if (ret < 0) {
+			mutex_unlock(&inode->i_mutex);
+			goto out_ra;
+		}
+
+		defrag_count += ret;
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		mutex_unlock(&inode->i_mutex);
+
+		if (newer_than) {
+			if (newer_off == (u64)-1)
+				break;
+
+			if (ret > 0)
+				i += ret;
+
+			newer_off = max(newer_off + 1,
+					(u64)i << PAGE_CACHE_SHIFT);
+
+			ret = find_new_extents(root, inode,
+					       newer_than, &newer_off,
+					       64 * 1024);
+			if (!ret) {
+				range->start = newer_off;
+				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+			} else {
+				break;
+			}
+		} else {
+			if (ret > 0) {
+				i += ret;
+				last_len += ret << PAGE_CACHE_SHIFT;
+			} else {
+				i++;
+				last_len = 0;
+			}
+		}
+	}
+
+	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
+		filemap_flush(inode->i_mapping);
+		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+			filemap_flush(inode->i_mapping);
+	}
+
+	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+		/* the filemap_flush will queue IO into the worker threads, but
+		 * we have to make sure the IO is actually started and that
+		 * ordered extents get created before we return
+		 */
+		atomic_inc(&root->fs_info->async_submit_draining);
+		while (atomic_read(&root->fs_info->nr_async_submits) ||
+		      atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+		}
+		atomic_dec(&root->fs_info->async_submit_draining);
+	}
+
+	if (range->compress_type == BTRFS_COMPRESS_LZO) {
+		btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
+	}
+
+	ret = defrag_count;
+
+out_ra:
+	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+		mutex_lock(&inode->i_mutex);
+		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+		mutex_unlock(&inode->i_mutex);
+	}
+	if (!file)
+		kfree(ra);
+	kfree(pages);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_resize(struct file *file,
+					void __user *arg)
+{
+	u64 new_size;
+	u64 old_size;
+	u64 devid = 1;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_device *device = NULL;
+	char *sizestr;
+	char *retptr;
+	char *devstr = NULL;
+	int ret = 0;
+	int mod = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		mnt_drop_write_file(file);
+		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto out;
+	}
+
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+	sizestr = vol_args->name;
+	devstr = strchr(sizestr, ':');
+	if (devstr) {
+		sizestr = devstr + 1;
+		*devstr = '\0';
+		devstr = vol_args->name;
+		ret = kstrtoull(devstr, 10, &devid);
+		if (ret)
+			goto out_free;
+		if (!devid) {
+			ret = -EINVAL;
+			goto out_free;
+		}
+		btrfs_info(root->fs_info, "resizing devid %llu", devid);
+	}
+
+	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
+	if (!device) {
+		btrfs_info(root->fs_info, "resizer unable to find device %llu",
+		       devid);
+		ret = -ENODEV;
+		goto out_free;
+	}
+
+	if (!device->writeable) {
+		btrfs_info(root->fs_info,
+			   "resizer unable to apply on readonly device %llu",
+		       devid);
+		ret = -EPERM;
+		goto out_free;
+	}
+
+	if (!strcmp(sizestr, "max"))
+		new_size = device->bdev->bd_inode->i_size;
+	else {
+		if (sizestr[0] == '-') {
+			mod = -1;
+			sizestr++;
+		} else if (sizestr[0] == '+') {
+			mod = 1;
+			sizestr++;
+		}
+		new_size = memparse(sizestr, &retptr);
+		if (*retptr != '\0' || new_size == 0) {
+			ret = -EINVAL;
+			goto out_free;
+		}
+	}
+
+	if (device->is_tgtdev_for_dev_replace) {
+		ret = -EPERM;
+		goto out_free;
+	}
+
+	old_size = btrfs_device_get_total_bytes(device);
+
+	if (mod < 0) {
+		if (new_size > old_size) {
+			ret = -EINVAL;
+			goto out_free;
+		}
+		new_size = old_size - new_size;
+	} else if (mod > 0) {
+		if (new_size > ULLONG_MAX - old_size) {
+			ret = -ERANGE;
+			goto out_free;
+		}
+		new_size = old_size + new_size;
+	}
+
+	if (new_size < 256 * 1024 * 1024) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+	if (new_size > device->bdev->bd_inode->i_size) {
+		ret = -EFBIG;
+		goto out_free;
+	}
+
+	new_size = div_u64(new_size, root->sectorsize);
+	new_size *= root->sectorsize;
+
+	printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+		      rcu_str_deref(device->name), new_size);
+
+	if (new_size > old_size) {
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			goto out_free;
+		}
+		ret = btrfs_grow_device(trans, device, new_size);
+		btrfs_commit_transaction(trans, root);
+	} else if (new_size < old_size) {
+		ret = btrfs_shrink_device(device, new_size);
+	} /* equal, nothing need to do */
+
+out_free:
+	kfree(vol_args);
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
+				char *name, unsigned long fd, int subvol,
+				u64 *transid, bool readonly,
+				struct btrfs_qgroup_inherit *inherit)
+{
+	int namelen;
+	int ret = 0;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		goto out;
+
+	namelen = strlen(name);
+	if (strchr(name, '/')) {
+		ret = -EINVAL;
+		goto out_drop_write;
+	}
+
+	if (name[0] == '.' &&
+	   (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+		ret = -EEXIST;
+		goto out_drop_write;
+	}
+
+	if (subvol) {
+		ret = btrfs_mksubvol(&file->f_path, name, namelen,
+				     NULL, transid, readonly, inherit);
+	} else {
+		struct fd src = fdget(fd);
+		struct inode *src_inode;
+		if (!src.file) {
+			ret = -EINVAL;
+			goto out_drop_write;
+		}
+
+		src_inode = file_inode(src.file);
+		if (src_inode->i_sb != file_inode(file)->i_sb) {
+			btrfs_info(BTRFS_I(src_inode)->root->fs_info,
+				   "Snapshot src from another FS");
+			ret = -EXDEV;
+		} else if (!inode_owner_or_capable(src_inode)) {
+			/*
+			 * Subvolume creation is not restricted, but snapshots
+			 * are limited to own subvolumes only
+			 */
+			ret = -EPERM;
+		} else {
+			ret = btrfs_mksubvol(&file->f_path, name, namelen,
+					     BTRFS_I(src_inode)->root,
+					     transid, readonly, inherit);
+		}
+		fdput(src);
+	}
+out_drop_write:
+	mnt_drop_write_file(file);
+out:
+	return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+					    void __user *arg, int subvol)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+					      vol_args->fd, subvol,
+					      NULL, false, NULL);
+
+	kfree(vol_args);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
+					       void __user *arg, int subvol)
+{
+	struct btrfs_ioctl_vol_args_v2 *vol_args;
+	int ret;
+	u64 transid = 0;
+	u64 *ptr = NULL;
+	bool readonly = false;
+	struct btrfs_qgroup_inherit *inherit = NULL;
+
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
+	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+
+	if (vol_args->flags &
+	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
+	      BTRFS_SUBVOL_QGROUP_INHERIT)) {
+		ret = -EOPNOTSUPP;
+		goto free_args;
+	}
+
+	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+		ptr = &transid;
+	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
+		readonly = true;
+	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+		if (vol_args->size > PAGE_CACHE_SIZE) {
+			ret = -EINVAL;
+			goto free_args;
+		}
+		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+		if (IS_ERR(inherit)) {
+			ret = PTR_ERR(inherit);
+			goto free_args;
+		}
+	}
+
+	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+					      vol_args->fd, subvol, ptr,
+					      readonly, inherit);
+	if (ret)
+		goto free_inherit;
+
+	if (ptr && copy_to_user(arg +
+				offsetof(struct btrfs_ioctl_vol_args_v2,
+					transid),
+				ptr, sizeof(*ptr)))
+		ret = -EFAULT;
+
+free_inherit:
+	kfree(inherit);
+free_args:
+	kfree(vol_args);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+						void __user *arg)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+	u64 flags = 0;
+
+	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
+		return -EINVAL;
+
+	down_read(&root->fs_info->subvol_sem);
+	if (btrfs_root_readonly(root))
+		flags |= BTRFS_SUBVOL_RDONLY;
+	up_read(&root->fs_info->subvol_sem);
+
+	if (copy_to_user(arg, &flags, sizeof(flags)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
+					      void __user *arg)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	u64 root_flags;
+	u64 flags;
+	int ret = 0;
+
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		goto out;
+
+	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+		ret = -EINVAL;
+		goto out_drop_write;
+	}
+
+	if (copy_from_user(&flags, arg, sizeof(flags))) {
+		ret = -EFAULT;
+		goto out_drop_write;
+	}
+
+	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
+		ret = -EINVAL;
+		goto out_drop_write;
+	}
+
+	if (flags & ~BTRFS_SUBVOL_RDONLY) {
+		ret = -EOPNOTSUPP;
+		goto out_drop_write;
+	}
+
+	down_write(&root->fs_info->subvol_sem);
+
+	/* nothing to do */
+	if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
+		goto out_drop_sem;
+
+	root_flags = btrfs_root_flags(&root->root_item);
+	if (flags & BTRFS_SUBVOL_RDONLY) {
+		btrfs_set_root_flags(&root->root_item,
+				     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
+	} else {
+		/*
+		 * Block RO -> RW transition if this subvolume is involved in
+		 * send
+		 */
+		spin_lock(&root->root_item_lock);
+		if (root->send_in_progress == 0) {
+			btrfs_set_root_flags(&root->root_item,
+				     root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+			spin_unlock(&root->root_item_lock);
+		} else {
+			spin_unlock(&root->root_item_lock);
+			btrfs_warn(root->fs_info,
+			"Attempt to set subvolume %llu read-write during send",
+					root->root_key.objectid);
+			ret = -EPERM;
+			goto out_drop_sem;
+		}
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_reset;
+	}
+
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&root->root_key, &root->root_item);
+
+	btrfs_commit_transaction(trans, root);
+out_reset:
+	if (ret)
+		btrfs_set_root_flags(&root->root_item, root_flags);
+out_drop_sem:
+	up_write(&root->fs_info->subvol_sem);
+out_drop_write:
+	mnt_drop_write_file(file);
+out:
+	return ret;
+}
+
+/*
+ * helper to check if the subvolume references other subvolumes
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u64 dir_id;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* Make sure this root isn't set as the default subvol */
+	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
+				   dir_id, "default", 7, 0);
+	if (di && !IS_ERR(di)) {
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+		if (key.objectid == root->root_key.objectid) {
+			ret = -EPERM;
+			btrfs_err(root->fs_info, "deleting default subvolume "
+				  "%llu is not allowed", key.objectid);
+			goto out;
+		}
+		btrfs_release_path(path);
+	}
+
+	key.objectid = root->root_key.objectid;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+
+	ret = 0;
+	if (path->slots[0] > 0) {
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid == root->root_key.objectid &&
+		    key.type == BTRFS_ROOT_REF_KEY)
+			ret = -ENOTEMPTY;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int key_in_sk(struct btrfs_key *key,
+			      struct btrfs_ioctl_search_key *sk)
+{
+	struct btrfs_key test;
+	int ret;
+
+	test.objectid = sk->min_objectid;
+	test.type = sk->min_type;
+	test.offset = sk->min_offset;
+
+	ret = btrfs_comp_cpu_keys(key, &test);
+	if (ret < 0)
+		return 0;
+
+	test.objectid = sk->max_objectid;
+	test.type = sk->max_type;
+	test.offset = sk->max_offset;
+
+	ret = btrfs_comp_cpu_keys(key, &test);
+	if (ret > 0)
+		return 0;
+	return 1;
+}
+
+static noinline int copy_to_sk(struct btrfs_root *root,
+			       struct btrfs_path *path,
+			       struct btrfs_key *key,
+			       struct btrfs_ioctl_search_key *sk,
+			       size_t *buf_size,
+			       char __user *ubuf,
+			       unsigned long *sk_offset,
+			       int *num_found)
+{
+	u64 found_transid;
+	struct extent_buffer *leaf;
+	struct btrfs_ioctl_search_header sh;
+	unsigned long item_off;
+	unsigned long item_len;
+	int nritems;
+	int i;
+	int slot;
+	int ret = 0;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	if (btrfs_header_generation(leaf) > sk->max_transid) {
+		i = nritems;
+		goto advance_key;
+	}
+	found_transid = btrfs_header_generation(leaf);
+
+	for (i = slot; i < nritems; i++) {
+		item_off = btrfs_item_ptr_offset(leaf, i);
+		item_len = btrfs_item_size_nr(leaf, i);
+
+		btrfs_item_key_to_cpu(leaf, key, i);
+		if (!key_in_sk(key, sk))
+			continue;
+
+		if (sizeof(sh) + item_len > *buf_size) {
+			if (*num_found) {
+				ret = 1;
+				goto out;
+			}
+
+			/*
+			 * return one empty item back for v1, which does not
+			 * handle -EOVERFLOW
+			 */
+
+			*buf_size = sizeof(sh) + item_len;
+			item_len = 0;
+			ret = -EOVERFLOW;
+		}
+
+		if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
+			ret = 1;
+			goto out;
+		}
+
+		sh.objectid = key->objectid;
+		sh.offset = key->offset;
+		sh.type = key->type;
+		sh.len = item_len;
+		sh.transid = found_transid;
+
+		/* copy search result header */
+		if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		*sk_offset += sizeof(sh);
+
+		if (item_len) {
+			char __user *up = ubuf + *sk_offset;
+			/* copy the item */
+			if (read_extent_buffer_to_user(leaf, up,
+						       item_off, item_len)) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			*sk_offset += item_len;
+		}
+		(*num_found)++;
+
+		if (ret) /* -EOVERFLOW from above */
+			goto out;
+
+		if (*num_found >= sk->nr_items) {
+			ret = 1;
+			goto out;
+		}
+	}
+advance_key:
+	ret = 0;
+	if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+		key->offset++;
+	else if (key->type < (u8)-1 && key->type < sk->max_type) {
+		key->offset = 0;
+		key->type++;
+	} else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+		key->offset = 0;
+		key->type = 0;
+		key->objectid++;
+	} else
+		ret = 1;
+out:
+	/*
+	 *  0: all items from this leaf copied, continue with next
+	 *  1: * more items can be copied, but unused buffer is too small
+	 *     * all items were found
+	 *     Either way, it will stops the loop which iterates to the next
+	 *     leaf
+	 *  -EOVERFLOW: item was to large for buffer
+	 *  -EFAULT: could not copy extent buffer back to userspace
+	 */
+	return ret;
+}
+
+static noinline int search_ioctl(struct inode *inode,
+				 struct btrfs_ioctl_search_key *sk,
+				 size_t *buf_size,
+				 char __user *ubuf)
+{
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
+	int ret;
+	int num_found = 0;
+	unsigned long sk_offset = 0;
+
+	if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+		*buf_size = sizeof(struct btrfs_ioctl_search_header);
+		return -EOVERFLOW;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (sk->tree_id == 0) {
+		/* search the root of the inode that was passed */
+		root = BTRFS_I(inode)->root;
+	} else {
+		key.objectid = sk->tree_id;
+		key.type = BTRFS_ROOT_ITEM_KEY;
+		key.offset = (u64)-1;
+		root = btrfs_read_fs_root_no_name(info, &key);
+		if (IS_ERR(root)) {
+			printk(KERN_ERR "BTRFS: could not find root %llu\n",
+			       sk->tree_id);
+			btrfs_free_path(path);
+			return -ENOENT;
+		}
+	}
+
+	key.objectid = sk->min_objectid;
+	key.type = sk->min_type;
+	key.offset = sk->min_offset;
+
+	while (1) {
+		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
+		if (ret != 0) {
+			if (ret > 0)
+				ret = 0;
+			goto err;
+		}
+		ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
+				 &sk_offset, &num_found);
+		btrfs_release_path(path);
+		if (ret)
+			break;
+
+	}
+	if (ret > 0)
+		ret = 0;
+err:
+	sk->nr_items = num_found;
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search(struct file *file,
+					   void __user *argp)
+{
+	struct btrfs_ioctl_search_args __user *uargs;
+	struct btrfs_ioctl_search_key sk;
+	struct inode *inode;
+	int ret;
+	size_t buf_size;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	uargs = (struct btrfs_ioctl_search_args __user *)argp;
+
+	if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+		return -EFAULT;
+
+	buf_size = sizeof(uargs->buf);
+
+	inode = file_inode(file);
+	ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+	/*
+	 * In the origin implementation an overflow is handled by returning a
+	 * search header with a len of zero, so reset ret.
+	 */
+	if (ret == -EOVERFLOW)
+		ret = 0;
+
+	if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
+		ret = -EFAULT;
+	return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+					       void __user *argp)
+{
+	struct btrfs_ioctl_search_args_v2 __user *uarg;
+	struct btrfs_ioctl_search_args_v2 args;
+	struct inode *inode;
+	int ret;
+	size_t buf_size;
+	const size_t buf_limit = 16 * 1024 * 1024;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* copy search header and buffer size */
+	uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
+	if (copy_from_user(&args, uarg, sizeof(args)))
+		return -EFAULT;
+
+	buf_size = args.buf_size;
+
+	if (buf_size < sizeof(struct btrfs_ioctl_search_header))
+		return -EOVERFLOW;
+
+	/* limit result size to 16MB */
+	if (buf_size > buf_limit)
+		buf_size = buf_limit;
+
+	inode = file_inode(file);
+	ret = search_ioctl(inode, &args.key, &buf_size,
+			   (char *)(&uarg->buf[0]));
+	if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+		ret = -EFAULT;
+	else if (ret == -EOVERFLOW &&
+		copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+/*
+ * Search INODE_REFs to identify path name of 'dirid' directory
+ * in a 'tree_id' tree. and sets path name to 'name'.
+ */
+static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
+				u64 tree_id, u64 dirid, char *name)
+{
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	char *ptr;
+	int ret = -1;
+	int slot;
+	int len;
+	int total_len = 0;
+	struct btrfs_inode_ref *iref;
+	struct extent_buffer *l;
+	struct btrfs_path *path;
+
+	if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
+		name[0]='\0';
+		return 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
+
+	key.objectid = tree_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(info, &key);
+	if (IS_ERR(root)) {
+		printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+		ret = -ENOENT;
+		goto out;
+	}
+
+	key.objectid = dirid;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+		else if (ret > 0) {
+			ret = btrfs_previous_item(root, path, dirid,
+						  BTRFS_INODE_REF_KEY);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0) {
+				ret = -ENOENT;
+				goto out;
+			}
+		}
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
+		len = btrfs_inode_ref_name_len(l, iref);
+		ptr -= len + 1;
+		total_len += len + 1;
+		if (ptr < name) {
+			ret = -ENAMETOOLONG;
+			goto out;
+		}
+
+		*(ptr + len) = '/';
+		read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
+
+		if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
+			break;
+
+		btrfs_release_path(path);
+		key.objectid = key.offset;
+		key.offset = (u64)-1;
+		dirid = key.objectid;
+	}
+	memmove(name, ptr, total_len);
+	name[total_len] = '\0';
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+					   void __user *argp)
+{
+	 struct btrfs_ioctl_ino_lookup_args *args;
+	 struct inode *inode;
+	 int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	args = memdup_user(argp, sizeof(*args));
+	if (IS_ERR(args))
+		return PTR_ERR(args);
+
+	inode = file_inode(file);
+
+	if (args->treeid == 0)
+		args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+
+	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+					args->treeid, args->objectid,
+					args->name);
+
+	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+		ret = -EFAULT;
+
+	kfree(args);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+					     void __user *arg)
+{
+	struct dentry *parent = file->f_path.dentry;
+	struct dentry *dentry;
+	struct inode *dir = d_inode(parent);
+	struct inode *inode;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_root *dest = NULL;
+	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_block_rsv block_rsv;
+	u64 root_flags;
+	u64 qgroup_reserved;
+	int namelen;
+	int ret;
+	int err = 0;
+
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
+
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+	namelen = strlen(vol_args->name);
+	if (strchr(vol_args->name, '/') ||
+	    strncmp(vol_args->name, "..", namelen) == 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = mnt_want_write_file(file);
+	if (err)
+		goto out;
+
+
+	err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	if (err == -EINTR)
+		goto out_drop_write;
+	dentry = lookup_one_len(vol_args->name, parent, namelen);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_unlock_dir;
+	}
+
+	if (d_really_is_negative(dentry)) {
+		err = -ENOENT;
+		goto out_dput;
+	}
+
+	inode = d_inode(dentry);
+	dest = BTRFS_I(inode)->root;
+	if (!capable(CAP_SYS_ADMIN)) {
+		/*
+		 * Regular user.  Only allow this with a special mount
+		 * option, when the user has write+exec access to the
+		 * subvol root, and when rmdir(2) would have been
+		 * allowed.
+		 *
+		 * Note that this is _not_ check that the subvol is
+		 * empty or doesn't contain data that we wouldn't
+		 * otherwise be able to delete.
+		 *
+		 * Users who want to delete empty subvols should try
+		 * rmdir(2).
+		 */
+		err = -EPERM;
+		if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+			goto out_dput;
+
+		/*
+		 * Do not allow deletion if the parent dir is the same
+		 * as the dir to be deleted.  That means the ioctl
+		 * must be called on the dentry referencing the root
+		 * of the subvol, not a random directory contained
+		 * within it.
+		 */
+		err = -EINVAL;
+		if (root == dest)
+			goto out_dput;
+
+		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+		if (err)
+			goto out_dput;
+	}
+
+	/* check if subvolume may be deleted by a user */
+	err = btrfs_may_delete(dir, dentry, 1);
+	if (err)
+		goto out_dput;
+
+	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+		err = -EINVAL;
+		goto out_dput;
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * Don't allow to delete a subvolume with send in progress. This is
+	 * inside the i_mutex so the error handling that has to drop the bit
+	 * again is not run concurrently.
+	 */
+	spin_lock(&dest->root_item_lock);
+	root_flags = btrfs_root_flags(&dest->root_item);
+	if (dest->send_in_progress == 0) {
+		btrfs_set_root_flags(&dest->root_item,
+				root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+		spin_unlock(&dest->root_item_lock);
+	} else {
+		spin_unlock(&dest->root_item_lock);
+		btrfs_warn(root->fs_info,
+			"Attempt to delete subvolume %llu during send",
+			dest->root_key.objectid);
+		err = -EPERM;
+		goto out_unlock_inode;
+	}
+
+	d_invalidate(dentry);
+
+	down_write(&root->fs_info->subvol_sem);
+
+	err = may_destroy_subvol(dest);
+	if (err)
+		goto out_up_write;
+
+	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+	/*
+	 * One for dir inode, two for dir entries, two for root
+	 * ref/backref.
+	 */
+	err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+					       5, &qgroup_reserved, true);
+	if (err)
+		goto out_up_write;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_release;
+	}
+	trans->block_rsv = &block_rsv;
+	trans->bytes_reserved = block_rsv.size;
+
+	ret = btrfs_unlink_subvol(trans, root, dir,
+				dest->root_key.objectid,
+				dentry->d_name.name,
+				dentry->d_name.len);
+	if (ret) {
+		err = ret;
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_end_trans;
+	}
+
+	btrfs_record_root_in_trans(trans, dest);
+
+	memset(&dest->root_item.drop_progress, 0,
+		sizeof(dest->root_item.drop_progress));
+	dest->root_item.drop_level = 0;
+	btrfs_set_root_refs(&dest->root_item, 0);
+
+	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
+		ret = btrfs_insert_orphan_item(trans,
+					root->fs_info->tree_root,
+					dest->root_key.objectid);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			err = ret;
+			goto out_end_trans;
+		}
+	}
+
+	ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+				  dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+				  dest->root_key.objectid);
+	if (ret && ret != -ENOENT) {
+		btrfs_abort_transaction(trans, root, ret);
+		err = ret;
+		goto out_end_trans;
+	}
+	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+		ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+					  dest->root_item.received_uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  dest->root_key.objectid);
+		if (ret && ret != -ENOENT) {
+			btrfs_abort_transaction(trans, root, ret);
+			err = ret;
+			goto out_end_trans;
+		}
+	}
+
+out_end_trans:
+	trans->block_rsv = NULL;
+	trans->bytes_reserved = 0;
+	ret = btrfs_end_transaction(trans, root);
+	if (ret && !err)
+		err = ret;
+	inode->i_flags |= S_DEAD;
+out_release:
+	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+out_up_write:
+	up_write(&root->fs_info->subvol_sem);
+	if (err) {
+		spin_lock(&dest->root_item_lock);
+		root_flags = btrfs_root_flags(&dest->root_item);
+		btrfs_set_root_flags(&dest->root_item,
+				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+		spin_unlock(&dest->root_item_lock);
+	}
+out_unlock_inode:
+	mutex_unlock(&inode->i_mutex);
+	if (!err) {
+		shrink_dcache_sb(root->fs_info->sb);
+		btrfs_invalidate_inodes(dest);
+		d_delete(dentry);
+		ASSERT(dest->send_in_progress == 0);
+
+		/* the last ref */
+		if (dest->ino_cache_inode) {
+			iput(dest->ino_cache_inode);
+			dest->ino_cache_inode = NULL;
+		}
+	}
+out_dput:
+	dput(dentry);
+out_unlock_dir:
+	mutex_unlock(&dir->i_mutex);
+out_drop_write:
+	mnt_drop_write_file(file);
+out:
+	kfree(vol_args);
+	return err;
+}
+
+static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ioctl_defrag_range_args *range;
+	int ret;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	if (btrfs_root_readonly(root)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
+		if (!capable(CAP_SYS_ADMIN)) {
+			ret = -EPERM;
+			goto out;
+		}
+		ret = btrfs_defrag_root(root);
+		if (ret)
+			goto out;
+		ret = btrfs_defrag_root(root->fs_info->extent_root);
+		break;
+	case S_IFREG:
+		if (!(file->f_mode & FMODE_WRITE)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		range = kzalloc(sizeof(*range), GFP_KERNEL);
+		if (!range) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		if (argp) {
+			if (copy_from_user(range, argp,
+					   sizeof(*range))) {
+				ret = -EFAULT;
+				kfree(range);
+				goto out;
+			}
+			/* compression requires us to start the IO */
+			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+				range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
+				range->extent_thresh = (u32)-1;
+			}
+		} else {
+			/* the rest are all set to zero by kzalloc */
+			range->len = (u64)-1;
+		}
+		ret = btrfs_defrag_file(file_inode(file), file,
+					range, 0, 0);
+		if (ret > 0)
+			ret = 0;
+		kfree(range);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+out:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto out;
+	}
+
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+	ret = btrfs_init_new_device(root, vol_args->name);
+
+	if (!ret)
+		btrfs_info(root->fs_info, "disk added %s",vol_args->name);
+
+	kfree(vol_args);
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+	return ret;
+}
+
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto err_drop;
+	}
+
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		goto out;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	ret = btrfs_rm_device(root, vol_args->name);
+	mutex_unlock(&root->fs_info->volume_mutex);
+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+	if (!ret)
+		btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+
+out:
+	kfree(vol_args);
+err_drop:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_fs_info_args *fi_args;
+	struct btrfs_device *device;
+	struct btrfs_device *next;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	int ret = 0;
+
+	fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
+	if (!fi_args)
+		return -ENOMEM;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	fi_args->num_devices = fs_devices->num_devices;
+	memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
+
+	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+		if (device->devid > fi_args->max_id)
+			fi_args->max_id = device->devid;
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	fi_args->nodesize = root->fs_info->super_copy->nodesize;
+	fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
+	fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
+
+	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
+		ret = -EFAULT;
+
+	kfree(fi_args);
+	return ret;
+}
+
+static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_dev_info_args *di_args;
+	struct btrfs_device *dev;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	int ret = 0;
+	char *s_uuid = NULL;
+
+	di_args = memdup_user(arg, sizeof(*di_args));
+	if (IS_ERR(di_args))
+		return PTR_ERR(di_args);
+
+	if (!btrfs_is_empty_uuid(di_args->uuid))
+		s_uuid = di_args->uuid;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
+
+	if (!dev) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	di_args->devid = dev->devid;
+	di_args->bytes_used = btrfs_device_get_bytes_used(dev);
+	di_args->total_bytes = btrfs_device_get_total_bytes(dev);
+	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
+	if (dev->name) {
+		struct rcu_string *name;
+
+		rcu_read_lock();
+		name = rcu_dereference(dev->name);
+		strncpy(di_args->path, name->str, sizeof(di_args->path));
+		rcu_read_unlock();
+		di_args->path[sizeof(di_args->path) - 1] = 0;
+	} else {
+		di_args->path[0] = '\0';
+	}
+
+out:
+	mutex_unlock(&fs_devices->device_list_mutex);
+	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
+		ret = -EFAULT;
+
+	kfree(di_args);
+	return ret;
+}
+
+static struct page *extent_same_get_page(struct inode *inode, u64 off)
+{
+	struct page *page;
+	pgoff_t index;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+
+	index = off >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page(inode->i_mapping, index);
+	if (!page)
+		return NULL;
+
+	if (!PageUptodate(page)) {
+		if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
+						 0))
+			return NULL;
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+			return NULL;
+		}
+	}
+	unlock_page(page);
+
+	return page;
+}
+
+static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+{
+	/* do any pending delalloc/csum calc on src, one way or
+	   another, and lock file content */
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    off + len - 1);
+		if ((!ordered ||
+		     ordered->file_offset + ordered->len <= off ||
+		     ordered->file_offset >= off + len) &&
+		    !test_range_bit(&BTRFS_I(inode)->io_tree, off,
+				    off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		btrfs_wait_ordered_range(inode, off, len);
+	}
+}
+
+static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
+				struct inode *inode2, u64 loff2, u64 len)
+{
+	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+	mutex_unlock(&inode1->i_mutex);
+	mutex_unlock(&inode2->i_mutex);
+}
+
+static void btrfs_double_lock(struct inode *inode1, u64 loff1,
+			      struct inode *inode2, u64 loff2, u64 len)
+{
+	if (inode1 < inode2) {
+		swap(inode1, inode2);
+		swap(loff1, loff2);
+	}
+
+	mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+	lock_extent_range(inode1, loff1, len);
+	if (inode1 != inode2) {
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+		lock_extent_range(inode2, loff2, len);
+	}
+}
+
+static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
+			  u64 dst_loff, u64 len)
+{
+	int ret = 0;
+	struct page *src_page, *dst_page;
+	unsigned int cmp_len = PAGE_CACHE_SIZE;
+	void *addr, *dst_addr;
+
+	while (len) {
+		if (len < PAGE_CACHE_SIZE)
+			cmp_len = len;
+
+		src_page = extent_same_get_page(src, loff);
+		if (!src_page)
+			return -EINVAL;
+		dst_page = extent_same_get_page(dst, dst_loff);
+		if (!dst_page) {
+			page_cache_release(src_page);
+			return -EINVAL;
+		}
+		addr = kmap_atomic(src_page);
+		dst_addr = kmap_atomic(dst_page);
+
+		flush_dcache_page(src_page);
+		flush_dcache_page(dst_page);
+
+		if (memcmp(addr, dst_addr, cmp_len))
+			ret = BTRFS_SAME_DATA_DIFFERS;
+
+		kunmap_atomic(addr);
+		kunmap_atomic(dst_addr);
+		page_cache_release(src_page);
+		page_cache_release(dst_page);
+
+		if (ret)
+			break;
+
+		loff += cmp_len;
+		dst_loff += cmp_len;
+		len -= cmp_len;
+	}
+
+	return ret;
+}
+
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+{
+	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
+
+	if (off + len > inode->i_size || off + len < off)
+		return -EINVAL;
+	/* Check that we are block aligned - btrfs_clone() requires this */
+	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+			     struct inode *dst, u64 dst_loff)
+{
+	int ret;
+
+	/*
+	 * btrfs_clone() can't handle extents in the same file
+	 * yet. Once that works, we can drop this check and replace it
+	 * with a check for the same inode, but overlapping extents.
+	 */
+	if (src == dst)
+		return -EINVAL;
+
+	if (len == 0)
+		return 0;
+
+	btrfs_double_lock(src, loff, dst, dst_loff, len);
+
+	ret = extent_same_check_offsets(src, loff, len);
+	if (ret)
+		goto out_unlock;
+
+	ret = extent_same_check_offsets(dst, dst_loff, len);
+	if (ret)
+		goto out_unlock;
+
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+	if (ret == 0)
+		ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+
+out_unlock:
+	btrfs_double_unlock(src, loff, dst, dst_loff, len);
+
+	return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+
+static long btrfs_ioctl_file_extent_same(struct file *file,
+			struct btrfs_ioctl_same_args __user *argp)
+{
+	struct btrfs_ioctl_same_args *same;
+	struct btrfs_ioctl_same_extent_info *info;
+	struct inode *src = file_inode(file);
+	u64 off;
+	u64 len;
+	int i;
+	int ret;
+	unsigned long size;
+	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+	bool is_admin = capable(CAP_SYS_ADMIN);
+	u16 count;
+
+	if (!(file->f_mode & FMODE_READ))
+		return -EINVAL;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	if (get_user(count, &argp->dest_count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
+
+	same = memdup_user(argp, size);
+
+	if (IS_ERR(same)) {
+		ret = PTR_ERR(same);
+		goto out;
+	}
+
+	off = same->logical_offset;
+	len = same->length;
+
+	/*
+	 * Limit the total length we will dedupe for each operation.
+	 * This is intended to bound the total time spent in this
+	 * ioctl to something sane.
+	 */
+	if (len > BTRFS_MAX_DEDUPE_LEN)
+		len = BTRFS_MAX_DEDUPE_LEN;
+
+	if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+		/*
+		 * Btrfs does not support blocksize < page_size. As a
+		 * result, btrfs_cmp_data() won't correctly handle
+		 * this situation without an update.
+		 */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode))
+		goto out;
+
+	ret = -EACCES;
+	if (!S_ISREG(src->i_mode))
+		goto out;
+
+	/* pre-format output fields to sane values */
+	for (i = 0; i < count; i++) {
+		same->info[i].bytes_deduped = 0ULL;
+		same->info[i].status = 0;
+	}
+
+	for (i = 0, info = same->info; i < count; i++, info++) {
+		struct inode *dst;
+		struct fd dst_file = fdget(info->fd);
+		if (!dst_file.file) {
+			info->status = -EBADF;
+			continue;
+		}
+		dst = file_inode(dst_file.file);
+
+		if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
+			info->status = -EINVAL;
+		} else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
+			info->status = -EXDEV;
+		} else if (S_ISDIR(dst->i_mode)) {
+			info->status = -EISDIR;
+		} else if (!S_ISREG(dst->i_mode)) {
+			info->status = -EACCES;
+		} else {
+			info->status = btrfs_extent_same(src, off, len, dst,
+							info->logical_offset);
+			if (info->status == 0)
+				info->bytes_deduped += len;
+		}
+		fdput(dst_file);
+	}
+
+	ret = copy_to_user(argp, same, size);
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+/* Helper to check and see if this root currently has a ref on the given disk
+ * bytenr.  If it does then we need to update the quota for this root.  This
+ * doesn't do anything if quotas aren't enabled.
+ */
+static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     u64 disko)
+{
+	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+	struct ulist *roots;
+	struct ulist_iterator uiter;
+	struct ulist_node *root_node = NULL;
+	int ret;
+
+	if (!root->fs_info->quota_enabled)
+		return 1;
+
+	btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+	ret = btrfs_find_all_roots(trans, root->fs_info, disko,
+				   tree_mod_seq_elem.seq, &roots);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+	ULIST_ITER_INIT(&uiter);
+	while ((root_node = ulist_next(roots, &uiter))) {
+		if (root_node->val == root->objectid) {
+			ret = 1;
+			break;
+		}
+	}
+	ulist_free(roots);
+out:
+	btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+	return ret;
+}
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+				     struct inode *inode,
+				     u64 endoff,
+				     const u64 destoff,
+				     const u64 olen)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	inode_inc_iversion(inode);
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	/*
+	 * We round up to the block size at eof when determining which
+	 * extents to clone above, but shouldn't round up the file size.
+	 */
+	if (endoff > destoff + olen)
+		endoff = destoff + olen;
+	if (endoff > inode->i_size)
+		btrfs_i_size_write(inode, endoff);
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_end_transaction(trans, root);
+		goto out;
+	}
+	ret = btrfs_end_transaction(trans, root);
+out:
+	return ret;
+}
+
+static void clone_update_extent_map(struct inode *inode,
+				    const struct btrfs_trans_handle *trans,
+				    const struct btrfs_path *path,
+				    const u64 hole_offset,
+				    const u64 hole_len)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em) {
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+		return;
+	}
+
+	if (path) {
+		struct btrfs_file_extent_item *fi;
+
+		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_file_extent_item);
+		btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
+		em->generation = -1;
+		if (btrfs_file_extent_type(path->nodes[0], fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+	} else {
+		em->start = hole_offset;
+		em->len = hole_len;
+		em->ram_bytes = em->len;
+		em->orig_start = hole_offset;
+		em->block_start = EXTENT_MAP_HOLE;
+		em->block_len = 0;
+		em->orig_block_len = 0;
+		em->compress_type = BTRFS_COMPRESS_NONE;
+		em->generation = trans->transid;
+	}
+
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em, 1);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST) {
+			free_extent_map(em);
+			break;
+		}
+		btrfs_drop_extent_cache(inode, em->start,
+					em->start + em->len - 1, 0);
+	}
+
+	if (ret)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+}
+
+/**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen, extent_same uses
+ *               identical values here
+ * @destoff: Offset within @inode to start clone
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       const u64 off, const u64 olen, const u64 olen_aligned,
+		       const u64 destoff)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *leaf;
+	struct btrfs_trans_handle *trans;
+	char *buf = NULL;
+	struct btrfs_key key;
+	u32 nritems;
+	int slot;
+	int ret;
+	int no_quota;
+	const u64 len = olen_aligned;
+	u64 last_disko = 0;
+	u64 last_dest_end = destoff;
+
+	ret = -ENOMEM;
+	buf = vmalloc(root->nodesize);
+	if (!buf)
+		return ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		vfree(buf);
+		return ret;
+	}
+
+	path->reada = 2;
+	/* clone data */
+	key.objectid = btrfs_ino(src);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = off;
+
+	while (1) {
+		u64 next_key_min_offset = key.offset + 1;
+
+		/*
+		 * note the key will change type as we walk through the
+		 * tree.
+		 */
+		path->leave_spinning = 1;
+		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+				0, 0);
+		if (ret < 0)
+			goto out;
+		/*
+		 * First search, if no extent item that starts at offset off was
+		 * found but the previous item is an extent item, it's possible
+		 * it might overlap our target range, therefore process it.
+		 */
+		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0] - 1);
+			if (key.type == BTRFS_EXTENT_DATA_KEY)
+				path->slots[0]--;
+		}
+
+		nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+		no_quota = 1;
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+			nritems = btrfs_header_nritems(path->nodes[0]);
+		}
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.type > BTRFS_EXTENT_DATA_KEY ||
+		    key.objectid != btrfs_ino(src))
+			break;
+
+		if (key.type == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int type;
+			u32 size;
+			struct btrfs_key new_key;
+			u64 disko = 0, diskl = 0;
+			u64 datao = 0, datal = 0;
+			u8 comp;
+			u64 drop_start;
+
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			comp = btrfs_file_extent_compression(leaf, extent);
+			type = btrfs_file_extent_type(leaf, extent);
+			if (type == BTRFS_FILE_EXTENT_REG ||
+			    type == BTRFS_FILE_EXTENT_PREALLOC) {
+				disko = btrfs_file_extent_disk_bytenr(leaf,
+								      extent);
+				diskl = btrfs_file_extent_disk_num_bytes(leaf,
+								 extent);
+				datao = btrfs_file_extent_offset(leaf, extent);
+				datal = btrfs_file_extent_num_bytes(leaf,
+								    extent);
+			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+				/* take upper bound, may be compressed */
+				datal = btrfs_file_extent_ram_bytes(leaf,
+								    extent);
+			}
+
+			/*
+			 * The first search might have left us at an extent
+			 * item that ends before our target range's start, can
+			 * happen if we have holes and NO_HOLES feature enabled.
+			 */
+			if (key.offset + datal <= off) {
+				path->slots[0]++;
+				goto process_slot;
+			} else if (key.offset >= off + len) {
+				break;
+			}
+			next_key_min_offset = key.offset + datal;
+			size = btrfs_item_size_nr(leaf, slot);
+			read_extent_buffer(leaf, buf,
+					   btrfs_item_ptr_offset(leaf, slot),
+					   size);
+
+			btrfs_release_path(path);
+			path->leave_spinning = 0;
+
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.objectid = btrfs_ino(inode);
+			if (off <= key.offset)
+				new_key.offset = key.offset + destoff - off;
+			else
+				new_key.offset = destoff;
+
+			/*
+			 * Deal with a hole that doesn't have an extent item
+			 * that represents it (NO_HOLES feature enabled).
+			 * This hole is either in the middle of the cloning
+			 * range or at the beginning (fully overlaps it or
+			 * partially overlaps it).
+			 */
+			if (new_key.offset != last_dest_end)
+				drop_start = last_dest_end;
+			else
+				drop_start = new_key.offset;
+
+			/*
+			 * 1 - adjusting old extent (we may have to split it)
+			 * 1 - add new extent
+			 * 1 - inode update
+			 */
+			trans = btrfs_start_transaction(root, 3);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+
+			if (type == BTRFS_FILE_EXTENT_REG ||
+			    type == BTRFS_FILE_EXTENT_PREALLOC) {
+				/*
+				 *    a  | --- range to clone ---|  b
+				 * | ------------- extent ------------- |
+				 */
+
+				/* subtract range b */
+				if (key.offset + datal > off + len)
+					datal = off + len - key.offset;
+
+				/* subtract range a */
+				if (off > key.offset) {
+					datao += off - key.offset;
+					datal -= off - key.offset;
+				}
+
+				ret = btrfs_drop_extents(trans, root, inode,
+							 drop_start,
+							 new_key.offset + datal,
+							 1);
+				if (ret) {
+					if (ret != -EOPNOTSUPP)
+						btrfs_abort_transaction(trans,
+								root, ret);
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
+
+				ret = btrfs_insert_empty_item(trans, root, path,
+							      &new_key, size);
+				if (ret) {
+					btrfs_abort_transaction(trans, root,
+								ret);
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
+
+				leaf = path->nodes[0];
+				slot = path->slots[0];
+				write_extent_buffer(leaf, buf,
+					    btrfs_item_ptr_offset(leaf, slot),
+					    size);
+
+				extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+
+				/* disko == 0 means it's a hole */
+				if (!disko)
+					datao = 0;
+
+				btrfs_set_file_extent_offset(leaf, extent,
+							     datao);
+				btrfs_set_file_extent_num_bytes(leaf, extent,
+								datal);
+
+				/*
+				 * We need to look up the roots that point at
+				 * this bytenr and see if the new root does.  If
+				 * it does not we need to make sure we update
+				 * quotas appropriately.
+				 */
+				if (disko && root != BTRFS_I(src)->root &&
+				    disko != last_disko) {
+					no_quota = check_ref(trans, root,
+							     disko);
+					if (no_quota < 0) {
+						btrfs_abort_transaction(trans,
+									root,
+									ret);
+						btrfs_end_transaction(trans,
+								      root);
+						ret = no_quota;
+						goto out;
+					}
+				}
+
+				if (disko) {
+					inode_add_bytes(inode, datal);
+					ret = btrfs_inc_extent_ref(trans, root,
+							disko, diskl, 0,
+							root->root_key.objectid,
+							btrfs_ino(inode),
+							new_key.offset - datao,
+							no_quota);
+					if (ret) {
+						btrfs_abort_transaction(trans,
+									root,
+									ret);
+						btrfs_end_transaction(trans,
+								      root);
+						goto out;
+
+					}
+				}
+			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+				u64 skip = 0;
+				u64 trim = 0;
+				u64 aligned_end = 0;
+
+				if (off > key.offset) {
+					skip = off - key.offset;
+					new_key.offset += skip;
+				}
+
+				if (key.offset + datal > off + len)
+					trim = key.offset + datal - (off + len);
+
+				if (comp && (skip || trim)) {
+					ret = -EINVAL;
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
+				size -= skip + trim;
+				datal -= skip + trim;
+
+				aligned_end = ALIGN(new_key.offset + datal,
+						    root->sectorsize);
+				ret = btrfs_drop_extents(trans, root, inode,
+							 drop_start,
+							 aligned_end,
+							 1);
+				if (ret) {
+					if (ret != -EOPNOTSUPP)
+						btrfs_abort_transaction(trans,
+							root, ret);
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
+
+				ret = btrfs_insert_empty_item(trans, root, path,
+							      &new_key, size);
+				if (ret) {
+					btrfs_abort_transaction(trans, root,
+								ret);
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
+
+				if (skip) {
+					u32 start =
+					  btrfs_file_extent_calc_inline_size(0);
+					memmove(buf+start, buf+start+skip,
+						datal);
+				}
+
+				leaf = path->nodes[0];
+				slot = path->slots[0];
+				write_extent_buffer(leaf, buf,
+					    btrfs_item_ptr_offset(leaf, slot),
+					    size);
+				inode_add_bytes(inode, datal);
+			}
+
+			/* If we have an implicit hole (NO_HOLES feature). */
+			if (drop_start < new_key.offset)
+				clone_update_extent_map(inode, trans,
+						NULL, drop_start,
+						new_key.offset - drop_start);
+
+			clone_update_extent_map(inode, trans, path, 0, 0);
+
+			btrfs_mark_buffer_dirty(leaf);
+			btrfs_release_path(path);
+
+			last_dest_end = ALIGN(new_key.offset + datal,
+					      root->sectorsize);
+			ret = clone_finish_inode_update(trans, inode,
+							last_dest_end,
+							destoff, olen);
+			if (ret)
+				goto out;
+			if (new_key.offset + datal >= destoff + len)
+				break;
+		}
+		btrfs_release_path(path);
+		key.offset = next_key_min_offset;
+	}
+	ret = 0;
+
+	if (last_dest_end < destoff + len) {
+		/*
+		 * We have an implicit hole (NO_HOLES feature is enabled) that
+		 * fully or partially overlaps our cloning range at its end.
+		 */
+		btrfs_release_path(path);
+
+		/*
+		 * 1 - remove extent(s)
+		 * 1 - inode update
+		 */
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			goto out;
+		}
+		ret = btrfs_drop_extents(trans, root, inode,
+					 last_dest_end, destoff + len, 1);
+		if (ret) {
+			if (ret != -EOPNOTSUPP)
+				btrfs_abort_transaction(trans, root, ret);
+			btrfs_end_transaction(trans, root);
+			goto out;
+		}
+		clone_update_extent_map(inode, trans, NULL, last_dest_end,
+					destoff + len - last_dest_end);
+		ret = clone_finish_inode_update(trans, inode, destoff + len,
+						destoff, olen);
+	}
+
+out:
+	btrfs_free_path(path);
+	vfree(buf);
+	return ret;
+}
+
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+				       u64 off, u64 olen, u64 destoff)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct fd src_file;
+	struct inode *src;
+	int ret;
+	u64 len = olen;
+	u64 bs = root->fs_info->sb->s_blocksize;
+	int same_inode = 0;
+
+	/*
+	 * TODO:
+	 * - split compressed inline extents.  annoying: we need to
+	 *   decompress into destination's address_space (the file offset
+	 *   may change, so source mapping won't do), then recompress (or
+	 *   otherwise reinsert) a subrange.
+	 *
+	 * - split destination inode's inline extents.  The inline extents can
+	 *   be either compressed or non-compressed.
+	 */
+
+	/* the destination must be opened for writing */
+	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
+		return -EINVAL;
+
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	src_file = fdget(srcfd);
+	if (!src_file.file) {
+		ret = -EBADF;
+		goto out_drop_write;
+	}
+
+	ret = -EXDEV;
+	if (src_file.file->f_path.mnt != file->f_path.mnt)
+		goto out_fput;
+
+	src = file_inode(src_file.file);
+
+	ret = -EINVAL;
+	if (src == inode)
+		same_inode = 1;
+
+	/* the src must be open for reading */
+	if (!(src_file.file->f_mode & FMODE_READ))
+		goto out_fput;
+
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+		goto out_fput;
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+		goto out_fput;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (!same_inode) {
+		if (inode < src) {
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+		} else {
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		}
+	} else {
+		mutex_lock(&src->i_mutex);
+	}
+
+	/* determine range to clone */
+	ret = -EINVAL;
+	if (off + len > src->i_size || off + len < off)
+		goto out_unlock;
+	if (len == 0)
+		olen = len = src->i_size - off;
+	/* if we extend to eof, continue to block boundary */
+	if (off + len == src->i_size)
+		len = ALIGN(src->i_size, bs) - off;
+
+	if (len == 0) {
+		ret = 0;
+		goto out_unlock;
+	}
+
+	/* verify the end result is block aligned */
+	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
+	    !IS_ALIGNED(destoff, bs))
+		goto out_unlock;
+
+	/* verify if ranges are overlapped within the same file */
+	if (same_inode) {
+		if (destoff + len > off && destoff < off + len)
+			goto out_unlock;
+	}
+
+	if (destoff > inode->i_size) {
+		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+		if (ret)
+			goto out_unlock;
+	}
+
+	/*
+	 * Lock the target range too. Right after we replace the file extent
+	 * items in the fs tree (which now point to the cloned data), we might
+	 * have a worker replace them with extent items relative to a write
+	 * operation that was issued before this clone operation (i.e. confront
+	 * with inode.c:btrfs_finish_ordered_io).
+	 */
+	if (same_inode) {
+		u64 lock_start = min_t(u64, off, destoff);
+		u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
+
+		lock_extent_range(src, lock_start, lock_len);
+	} else {
+		lock_extent_range(src, off, len);
+		lock_extent_range(inode, destoff, len);
+	}
+
+	ret = btrfs_clone(src, inode, off, olen, len, destoff);
+
+	if (same_inode) {
+		u64 lock_start = min_t(u64, off, destoff);
+		u64 lock_end = max_t(u64, off, destoff) + len - 1;
+
+		unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
+	} else {
+		unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+		unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
+			      destoff + len - 1);
+	}
+	/*
+	 * Truncate page cache pages so that future reads will see the cloned
+	 * data immediately and not the previous data.
+	 */
+	truncate_inode_pages_range(&inode->i_data, destoff,
+				   PAGE_CACHE_ALIGN(destoff + len) - 1);
+out_unlock:
+	if (!same_inode) {
+		if (inode < src) {
+			mutex_unlock(&src->i_mutex);
+			mutex_unlock(&inode->i_mutex);
+		} else {
+			mutex_unlock(&inode->i_mutex);
+			mutex_unlock(&src->i_mutex);
+		}
+	} else {
+		mutex_unlock(&src->i_mutex);
+	}
+out_fput:
+	fdput(src_file);
+out_drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+{
+	struct btrfs_ioctl_clone_range_args args;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+	return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
+				 args.src_length, args.dest_offset);
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks.  They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+static long btrfs_ioctl_trans_start(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	ret = -EPERM;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = -EINPROGRESS;
+	if (file->private_data)
+		goto out;
+
+	ret = -EROFS;
+	if (btrfs_root_readonly(root))
+		goto out;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		goto out;
+
+	atomic_inc(&root->fs_info->open_ioctl_trans);
+
+	ret = -ENOMEM;
+	trans = btrfs_start_ioctl_transaction(root);
+	if (IS_ERR(trans))
+		goto out_drop;
+
+	file->private_data = trans;
+	return 0;
+
+out_drop:
+	atomic_dec(&root->fs_info->open_ioctl_trans);
+	mnt_drop_write_file(file);
+out:
+	return ret;
+}
+
+static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_root *new_root;
+	struct btrfs_dir_item *di;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct btrfs_key location;
+	struct btrfs_disk_key disk_key;
+	u64 objectid = 0;
+	u64 dir_id;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (!objectid)
+		objectid = BTRFS_FS_TREE_OBJECTID;
+
+	location.objectid = objectid;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = (u64)-1;
+
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	if (IS_ERR(new_root)) {
+		ret = PTR_ERR(new_root);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	path->leave_spinning = 1;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+	di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
+				   dir_id, "default", 7, 1);
+	if (IS_ERR_OR_NULL(di)) {
+		btrfs_free_path(path);
+		btrfs_end_transaction(trans, root);
+		btrfs_err(new_root->fs_info, "Umm, you don't have the default dir"
+			   "item, this isn't going to work");
+		ret = -ENOENT;
+		goto out;
+	}
+
+	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
+	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_free_path(path);
+
+	btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
+	btrfs_end_transaction(trans, root);
+out:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space)
+{
+	struct btrfs_block_group_cache *block_group;
+
+	space->total_bytes = 0;
+	space->used_bytes = 0;
+	space->flags = 0;
+	list_for_each_entry(block_group, groups_list, list) {
+		space->flags = block_group->flags;
+		space->total_bytes += block_group->key.offset;
+		space->used_bytes +=
+			btrfs_block_group_used(&block_group->item);
+	}
+}
+
+static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_space_args space_args;
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_ioctl_space_info *dest;
+	struct btrfs_ioctl_space_info *dest_orig;
+	struct btrfs_ioctl_space_info __user *user_dest;
+	struct btrfs_space_info *info;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
+	int alloc_size;
+	int ret = 0;
+	u64 slot_count = 0;
+	int i, c;
+
+	if (copy_from_user(&space_args,
+			   (struct btrfs_ioctl_space_args __user *)arg,
+			   sizeof(space_args)))
+		return -EFAULT;
+
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		info = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+					list) {
+			if (tmp->flags == types[i]) {
+				info = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!info)
+			continue;
+
+		down_read(&info->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&info->block_groups[c]))
+				slot_count++;
+		}
+		up_read(&info->groups_sem);
+	}
+
+	/*
+	 * Global block reserve, exported as a space_info
+	 */
+	slot_count++;
+
+	/* space_slots == 0 means they are asking for a count */
+	if (space_args.space_slots == 0) {
+		space_args.total_spaces = slot_count;
+		goto out;
+	}
+
+	slot_count = min_t(u64, space_args.space_slots, slot_count);
+
+	alloc_size = sizeof(*dest) * slot_count;
+
+	/* we generally have at most 6 or so space infos, one for each raid
+	 * level.  So, a whole page should be more than enough for everyone
+	 */
+	if (alloc_size > PAGE_CACHE_SIZE)
+		return -ENOMEM;
+
+	space_args.total_spaces = 0;
+	dest = kmalloc(alloc_size, GFP_NOFS);
+	if (!dest)
+		return -ENOMEM;
+	dest_orig = dest;
+
+	/* now we have a buffer to copy into */
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		if (!slot_count)
+			break;
+
+		info = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+					list) {
+			if (tmp->flags == types[i]) {
+				info = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!info)
+			continue;
+		down_read(&info->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&info->block_groups[c])) {
+				btrfs_get_block_group_info(
+					&info->block_groups[c], &space);
+				memcpy(dest, &space, sizeof(space));
+				dest++;
+				space_args.total_spaces++;
+				slot_count--;
+			}
+			if (!slot_count)
+				break;
+		}
+		up_read(&info->groups_sem);
+	}
+
+	/*
+	 * Add global block reserve
+	 */
+	if (slot_count) {
+		struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
+
+		spin_lock(&block_rsv->lock);
+		space.total_bytes = block_rsv->size;
+		space.used_bytes = block_rsv->size - block_rsv->reserved;
+		spin_unlock(&block_rsv->lock);
+		space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
+		memcpy(dest, &space, sizeof(space));
+		space_args.total_spaces++;
+	}
+
+	user_dest = (struct btrfs_ioctl_space_info __user *)
+		(arg + sizeof(struct btrfs_ioctl_space_args));
+
+	if (copy_to_user(user_dest, dest_orig, alloc_size))
+		ret = -EFAULT;
+
+	kfree(dest_orig);
+out:
+	if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks.  They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_end(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+
+	trans = file->private_data;
+	if (!trans)
+		return -EINVAL;
+	file->private_data = NULL;
+
+	btrfs_end_transaction(trans, root);
+
+	atomic_dec(&root->fs_info->open_ioctl_trans);
+
+	mnt_drop_write_file(file);
+	return 0;
+}
+
+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+					    void __user *argp)
+{
+	struct btrfs_trans_handle *trans;
+	u64 transid;
+	int ret;
+
+	trans = btrfs_attach_transaction_barrier(root);
+	if (IS_ERR(trans)) {
+		if (PTR_ERR(trans) != -ENOENT)
+			return PTR_ERR(trans);
+
+		/* No running transaction, don't bother */
+		transid = root->fs_info->last_trans_committed;
+		goto out;
+	}
+	transid = trans->transid;
+	ret = btrfs_commit_transaction_async(trans, root, 0);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+out:
+	if (argp)
+		if (copy_to_user(argp, &transid, sizeof(transid)))
+			return -EFAULT;
+	return 0;
+}
+
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+					   void __user *argp)
+{
+	u64 transid;
+
+	if (argp) {
+		if (copy_from_user(&transid, argp, sizeof(transid)))
+			return -EFAULT;
+	} else {
+		transid = 0;  /* current trans */
+	}
+	return btrfs_wait_for_commit(root, transid);
+}
+
+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_scrub_args *sa;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa))
+		return PTR_ERR(sa);
+
+	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+		ret = mnt_want_write_file(file);
+		if (ret)
+			goto out;
+	}
+
+	ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+			      0);
+
+	if (copy_to_user(arg, sa, sizeof(*sa)))
+		ret = -EFAULT;
+
+	if (!(sa->flags & BTRFS_SCRUB_READONLY))
+		mnt_drop_write_file(file);
+out:
+	kfree(sa);
+	return ret;
+}
+
+static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btrfs_scrub_cancel(root->fs_info);
+}
+
+static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
+				       void __user *arg)
+{
+	struct btrfs_ioctl_scrub_args *sa;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa))
+		return PTR_ERR(sa);
+
+	ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
+
+	if (copy_to_user(arg, sa, sizeof(*sa)))
+		ret = -EFAULT;
+
+	kfree(sa);
+	return ret;
+}
+
+static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+				      void __user *arg)
+{
+	struct btrfs_ioctl_get_dev_stats *sa;
+	int ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa))
+		return PTR_ERR(sa);
+
+	if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
+		kfree(sa);
+		return -EPERM;
+	}
+
+	ret = btrfs_get_dev_stats(root, sa);
+
+	if (copy_to_user(arg, sa, sizeof(*sa)))
+		ret = -EFAULT;
+
+	kfree(sa);
+	return ret;
+}
+
+static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_dev_replace_args *p;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	p = memdup_user(arg, sizeof(*p));
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	switch (p->cmd) {
+	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+		if (root->fs_info->sb->s_flags & MS_RDONLY) {
+			ret = -EROFS;
+			goto out;
+		}
+		if (atomic_xchg(
+			&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+			ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		} else {
+			ret = btrfs_dev_replace_start(root, p);
+			atomic_set(
+			 &root->fs_info->mutually_exclusive_operation_running,
+			 0);
+		}
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+		btrfs_dev_replace_status(root->fs_info, p);
+		ret = 0;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+		ret = btrfs_dev_replace_cancel(root->fs_info, p);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if (copy_to_user(arg, p, sizeof(*p)))
+		ret = -EFAULT;
+out:
+	kfree(p);
+	return ret;
+}
+
+static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
+{
+	int ret = 0;
+	int i;
+	u64 rel_ptr;
+	int size;
+	struct btrfs_ioctl_ino_path_args *ipa = NULL;
+	struct inode_fs_paths *ipath = NULL;
+	struct btrfs_path *path;
+
+	if (!capable(CAP_DAC_READ_SEARCH))
+		return -EPERM;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ipa = memdup_user(arg, sizeof(*ipa));
+	if (IS_ERR(ipa)) {
+		ret = PTR_ERR(ipa);
+		ipa = NULL;
+		goto out;
+	}
+
+	size = min_t(u32, ipa->size, 4096);
+	ipath = init_ipath(size, root, path);
+	if (IS_ERR(ipath)) {
+		ret = PTR_ERR(ipath);
+		ipath = NULL;
+		goto out;
+	}
+
+	ret = paths_from_inode(ipa->inum, ipath);
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
+		rel_ptr = ipath->fspath->val[i] -
+			  (u64)(unsigned long)ipath->fspath->val;
+		ipath->fspath->val[i] = rel_ptr;
+	}
+
+	ret = copy_to_user((void *)(unsigned long)ipa->fspath,
+			   (void *)(unsigned long)ipath->fspath, size);
+	if (ret) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+out:
+	btrfs_free_path(path);
+	free_ipath(ipath);
+	kfree(ipa);
+
+	return ret;
+}
+
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct btrfs_data_container *inodes = ctx;
+	const size_t c = 3 * sizeof(u64);
+
+	if (inodes->bytes_left >= c) {
+		inodes->bytes_left -= c;
+		inodes->val[inodes->elem_cnt] = inum;
+		inodes->val[inodes->elem_cnt + 1] = offset;
+		inodes->val[inodes->elem_cnt + 2] = root;
+		inodes->elem_cnt += 3;
+	} else {
+		inodes->bytes_missing += c - inodes->bytes_left;
+		inodes->bytes_left = 0;
+		inodes->elem_missed += 3;
+	}
+
+	return 0;
+}
+
+static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
+					void __user *arg)
+{
+	int ret = 0;
+	int size;
+	struct btrfs_ioctl_logical_ino_args *loi;
+	struct btrfs_data_container *inodes = NULL;
+	struct btrfs_path *path = NULL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	loi = memdup_user(arg, sizeof(*loi));
+	if (IS_ERR(loi)) {
+		ret = PTR_ERR(loi);
+		loi = NULL;
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	size = min_t(u32, loi->size, 64 * 1024);
+	inodes = init_data_container(size);
+	if (IS_ERR(inodes)) {
+		ret = PTR_ERR(inodes);
+		inodes = NULL;
+		goto out;
+	}
+
+	ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+					  build_ino_list, inodes);
+	if (ret == -EINVAL)
+		ret = -ENOENT;
+	if (ret < 0)
+		goto out;
+
+	ret = copy_to_user((void *)(unsigned long)loi->inodes,
+			   (void *)(unsigned long)inodes, size);
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	btrfs_free_path(path);
+	vfree(inodes);
+	kfree(loi);
+
+	return ret;
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+			       struct btrfs_ioctl_balance_args *bargs)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	bargs->flags = bctl->flags;
+
+	if (atomic_read(&fs_info->balance_running))
+		bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+	if (atomic_read(&fs_info->balance_pause_req))
+		bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+	if (atomic_read(&fs_info->balance_cancel_req))
+		bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+	if (lock) {
+		spin_lock(&fs_info->balance_lock);
+		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+		spin_unlock(&fs_info->balance_lock);
+	} else {
+		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+	}
+}
+
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_ioctl_balance_args *bargs;
+	struct btrfs_balance_control *bctl;
+	bool need_unlock; /* for mut. excl. ops lock */
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+again:
+	if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+		mutex_lock(&fs_info->volume_mutex);
+		mutex_lock(&fs_info->balance_mutex);
+		need_unlock = true;
+		goto locked;
+	}
+
+	/*
+	 * mut. excl. ops lock is locked.  Three possibilites:
+	 *   (1) some other op is running
+	 *   (2) balance is running
+	 *   (3) balance is paused -- special case (think resume)
+	 */
+	mutex_lock(&fs_info->balance_mutex);
+	if (fs_info->balance_ctl) {
+		/* this is either (2) or (3) */
+		if (!atomic_read(&fs_info->balance_running)) {
+			mutex_unlock(&fs_info->balance_mutex);
+			if (!mutex_trylock(&fs_info->volume_mutex))
+				goto again;
+			mutex_lock(&fs_info->balance_mutex);
+
+			if (fs_info->balance_ctl &&
+			    !atomic_read(&fs_info->balance_running)) {
+				/* this is (3) */
+				need_unlock = false;
+				goto locked;
+			}
+
+			mutex_unlock(&fs_info->balance_mutex);
+			mutex_unlock(&fs_info->volume_mutex);
+			goto again;
+		} else {
+			/* this is (2) */
+			mutex_unlock(&fs_info->balance_mutex);
+			ret = -EINPROGRESS;
+			goto out;
+		}
+	} else {
+		/* this is (1) */
+		mutex_unlock(&fs_info->balance_mutex);
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		goto out;
+	}
+
+locked:
+	BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
+
+	if (arg) {
+		bargs = memdup_user(arg, sizeof(*bargs));
+		if (IS_ERR(bargs)) {
+			ret = PTR_ERR(bargs);
+			goto out_unlock;
+		}
+
+		if (bargs->flags & BTRFS_BALANCE_RESUME) {
+			if (!fs_info->balance_ctl) {
+				ret = -ENOTCONN;
+				goto out_bargs;
+			}
+
+			bctl = fs_info->balance_ctl;
+			spin_lock(&fs_info->balance_lock);
+			bctl->flags |= BTRFS_BALANCE_RESUME;
+			spin_unlock(&fs_info->balance_lock);
+
+			goto do_balance;
+		}
+	} else {
+		bargs = NULL;
+	}
+
+	if (fs_info->balance_ctl) {
+		ret = -EINPROGRESS;
+		goto out_bargs;
+	}
+
+	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+	if (!bctl) {
+		ret = -ENOMEM;
+		goto out_bargs;
+	}
+
+	bctl->fs_info = fs_info;
+	if (arg) {
+		memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+		memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+		memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+		bctl->flags = bargs->flags;
+	} else {
+		/* balance everything - no filters */
+		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+	}
+
+do_balance:
+	/*
+	 * Ownership of bctl and mutually_exclusive_operation_running
+	 * goes to to btrfs_balance.  bctl is freed in __cancel_balance,
+	 * or, if restriper was paused all the way until unmount, in
+	 * free_fs_info.  mutually_exclusive_operation_running is
+	 * cleared in __cancel_balance.
+	 */
+	need_unlock = false;
+
+	ret = btrfs_balance(bctl, bargs);
+
+	if (arg) {
+		if (copy_to_user(arg, bargs, sizeof(*bargs)))
+			ret = -EFAULT;
+	}
+
+out_bargs:
+	kfree(bargs);
+out_unlock:
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
+	if (need_unlock)
+		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case BTRFS_BALANCE_CTL_PAUSE:
+		return btrfs_pause_balance(root->fs_info);
+	case BTRFS_BALANCE_CTL_CANCEL:
+		return btrfs_cancel_balance(root->fs_info);
+	}
+
+	return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+					 void __user *arg)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_ioctl_balance_args *bargs;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		ret = -ENOTCONN;
+		goto out;
+	}
+
+	bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+	if (!bargs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	update_ioctl_balance_args(fs_info, 1, bargs);
+
+	if (copy_to_user(arg, bargs, sizeof(*bargs)))
+		ret = -EFAULT;
+
+	kfree(bargs);
+out:
+	mutex_unlock(&fs_info->balance_mutex);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_quota_ctl_args *sa;
+	struct btrfs_trans_handle *trans = NULL;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		goto drop_write;
+	}
+
+	down_write(&root->fs_info->subvol_sem);
+	trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	switch (sa->cmd) {
+	case BTRFS_QUOTA_CTL_ENABLE:
+		ret = btrfs_quota_enable(trans, root->fs_info);
+		break;
+	case BTRFS_QUOTA_CTL_DISABLE:
+		ret = btrfs_quota_disable(trans, root->fs_info);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
+	if (err && !ret)
+		ret = err;
+out:
+	kfree(sa);
+	up_write(&root->fs_info->subvol_sem);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_qgroup_assign_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		goto drop_write;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	if (sa->assign) {
+		ret = btrfs_add_qgroup_relation(trans, root->fs_info,
+						sa->src, sa->dst);
+	} else {
+		ret = btrfs_del_qgroup_relation(trans, root->fs_info,
+						sa->src, sa->dst);
+	}
+
+	/* update qgroup status and info */
+	err = btrfs_run_qgroups(trans, root->fs_info);
+	if (err < 0)
+		btrfs_error(root->fs_info, ret,
+			    "failed to update qgroup status and info\n");
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_qgroup_create_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		goto drop_write;
+	}
+
+	if (!sa->qgroupid) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	if (sa->create) {
+		ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
+	} else {
+		ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
+	}
+
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_qgroup_limit_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+	u64 qgroupid;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		goto drop_write;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	qgroupid = sa->qgroupid;
+	if (!qgroupid) {
+		/* take the current subvol as qgroup */
+		qgroupid = root->root_key.objectid;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
+
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_quota_rescan_args *qsa;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	qsa = memdup_user(arg, sizeof(*qsa));
+	if (IS_ERR(qsa)) {
+		ret = PTR_ERR(qsa);
+		goto drop_write;
+	}
+
+	if (qsa->flags) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = btrfs_qgroup_rescan(root->fs_info);
+
+out:
+	kfree(qsa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_quota_rescan_args *qsa;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+	if (!qsa)
+		return -ENOMEM;
+
+	if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		qsa->flags = 1;
+		qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
+	}
+
+	if (copy_to_user(arg, qsa, sizeof(*qsa)))
+		ret = -EFAULT;
+
+	kfree(qsa);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+					    struct btrfs_ioctl_received_subvol_args *sa)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_root_item *root_item = &root->root_item;
+	struct btrfs_trans_handle *trans;
+	struct timespec ct = CURRENT_TIME;
+	int ret = 0;
+	int received_uuid_changed;
+
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret < 0)
+		return ret;
+
+	down_write(&root->fs_info->subvol_sem);
+
+	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (btrfs_root_readonly(root)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * 1 - root item
+	 * 2 - uuid items (received uuid + subvol uuid)
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
+	}
+
+	sa->rtransid = trans->transid;
+	sa->rtime.sec = ct.tv_sec;
+	sa->rtime.nsec = ct.tv_nsec;
+
+	received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
+				       BTRFS_UUID_SIZE);
+	if (received_uuid_changed &&
+	    !btrfs_is_empty_uuid(root_item->received_uuid))
+		btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+				    root_item->received_uuid,
+				    BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+				    root->root_key.objectid);
+	memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
+	btrfs_set_root_stransid(root_item, sa->stransid);
+	btrfs_set_root_rtransid(root_item, sa->rtransid);
+	btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
+	btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
+	btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
+	btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
+
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&root->root_key, &root->root_item);
+	if (ret < 0) {
+		btrfs_end_transaction(trans, root);
+		goto out;
+	}
+	if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
+		ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+					  sa->uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  root->root_key.objectid);
+		if (ret < 0 && ret != -EEXIST) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+	}
+	ret = btrfs_commit_transaction(trans, root);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
+
+out:
+	up_write(&root->fs_info->subvol_sem);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+						void __user *arg)
+{
+	struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+	struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+	int ret = 0;
+
+	args32 = memdup_user(arg, sizeof(*args32));
+	if (IS_ERR(args32)) {
+		ret = PTR_ERR(args32);
+		args32 = NULL;
+		goto out;
+	}
+
+	args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+	if (!args64) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+	args64->stransid = args32->stransid;
+	args64->rtransid = args32->rtransid;
+	args64->stime.sec = args32->stime.sec;
+	args64->stime.nsec = args32->stime.nsec;
+	args64->rtime.sec = args32->rtime.sec;
+	args64->rtime.nsec = args32->rtime.nsec;
+	args64->flags = args32->flags;
+
+	ret = _btrfs_ioctl_set_received_subvol(file, args64);
+	if (ret)
+		goto out;
+
+	memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+	args32->stransid = args64->stransid;
+	args32->rtransid = args64->rtransid;
+	args32->stime.sec = args64->stime.sec;
+	args32->stime.nsec = args64->stime.nsec;
+	args32->rtime.sec = args64->rtime.sec;
+	args32->rtime.nsec = args64->rtime.nsec;
+	args32->flags = args64->flags;
+
+	ret = copy_to_user(arg, args32, sizeof(*args32));
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	kfree(args32);
+	kfree(args64);
+	return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+					    void __user *arg)
+{
+	struct btrfs_ioctl_received_subvol_args *sa = NULL;
+	int ret = 0;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		sa = NULL;
+		goto out;
+	}
+
+	ret = _btrfs_ioctl_set_received_subvol(file, sa);
+
+	if (ret)
+		goto out;
+
+	ret = copy_to_user(arg, sa, sizeof(*sa));
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	kfree(sa);
+	return ret;
+}
+
+static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	size_t len;
+	int ret;
+	char label[BTRFS_LABEL_SIZE];
+
+	spin_lock(&root->fs_info->super_lock);
+	memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
+	spin_unlock(&root->fs_info->super_lock);
+
+	len = strnlen(label, BTRFS_LABEL_SIZE);
+
+	if (len == BTRFS_LABEL_SIZE) {
+		btrfs_warn(root->fs_info,
+			"label is too long, return the first %zu bytes", --len);
+	}
+
+	ret = copy_to_user(arg, label, len);
+
+	return ret ? -EFAULT : 0;
+}
+
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_super_block *super_block = root->fs_info->super_copy;
+	struct btrfs_trans_handle *trans;
+	char label[BTRFS_LABEL_SIZE];
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(label, arg, sizeof(label)))
+		return -EFAULT;
+
+	if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+		btrfs_err(root->fs_info, "unable to set label with more than %d bytes",
+		       BTRFS_LABEL_SIZE - 1);
+		return -EINVAL;
+	}
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock;
+	}
+
+	spin_lock(&root->fs_info->super_lock);
+	strcpy(super_block->label, label);
+	spin_unlock(&root->fs_info->super_lock);
+	ret = btrfs_commit_transaction(trans, root);
+
+out_unlock:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+#define INIT_FEATURE_FLAGS(suffix) \
+	{ .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
+	  .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
+	  .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
+
+static int btrfs_ioctl_get_supported_features(struct file *file,
+					      void __user *arg)
+{
+	static struct btrfs_ioctl_feature_flags features[3] = {
+		INIT_FEATURE_FLAGS(SUPP),
+		INIT_FEATURE_FLAGS(SAFE_SET),
+		INIT_FEATURE_FLAGS(SAFE_CLEAR)
+	};
+
+	if (copy_to_user(arg, &features, sizeof(features)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_super_block *super_block = root->fs_info->super_copy;
+	struct btrfs_ioctl_feature_flags features;
+
+	features.compat_flags = btrfs_super_compat_flags(super_block);
+	features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
+	features.incompat_flags = btrfs_super_incompat_flags(super_block);
+
+	if (copy_to_user(arg, &features, sizeof(features)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int check_feature_bits(struct btrfs_root *root,
+			      enum btrfs_feature_set set,
+			      u64 change_mask, u64 flags, u64 supported_flags,
+			      u64 safe_set, u64 safe_clear)
+{
+	const char *type = btrfs_feature_set_names[set];
+	char *names;
+	u64 disallowed, unsupported;
+	u64 set_mask = flags & change_mask;
+	u64 clear_mask = ~flags & change_mask;
+
+	unsupported = set_mask & ~supported_flags;
+	if (unsupported) {
+		names = btrfs_printable_features(set, unsupported);
+		if (names) {
+			btrfs_warn(root->fs_info,
+			   "this kernel does not support the %s feature bit%s",
+			   names, strchr(names, ',') ? "s" : "");
+			kfree(names);
+		} else
+			btrfs_warn(root->fs_info,
+			   "this kernel does not support %s bits 0x%llx",
+			   type, unsupported);
+		return -EOPNOTSUPP;
+	}
+
+	disallowed = set_mask & ~safe_set;
+	if (disallowed) {
+		names = btrfs_printable_features(set, disallowed);
+		if (names) {
+			btrfs_warn(root->fs_info,
+			   "can't set the %s feature bit%s while mounted",
+			   names, strchr(names, ',') ? "s" : "");
+			kfree(names);
+		} else
+			btrfs_warn(root->fs_info,
+			   "can't set %s bits 0x%llx while mounted",
+			   type, disallowed);
+		return -EPERM;
+	}
+
+	disallowed = clear_mask & ~safe_clear;
+	if (disallowed) {
+		names = btrfs_printable_features(set, disallowed);
+		if (names) {
+			btrfs_warn(root->fs_info,
+			   "can't clear the %s feature bit%s while mounted",
+			   names, strchr(names, ',') ? "s" : "");
+			kfree(names);
+		} else
+			btrfs_warn(root->fs_info,
+			   "can't clear %s bits 0x%llx while mounted",
+			   type, disallowed);
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+#define check_feature(root, change_mask, flags, mask_base)	\
+check_feature_bits(root, FEAT_##mask_base, change_mask, flags,	\
+		   BTRFS_FEATURE_ ## mask_base ## _SUPP,	\
+		   BTRFS_FEATURE_ ## mask_base ## _SAFE_SET,	\
+		   BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
+
+static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_super_block *super_block = root->fs_info->super_copy;
+	struct btrfs_ioctl_feature_flags flags[2];
+	struct btrfs_trans_handle *trans;
+	u64 newflags;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(flags, arg, sizeof(flags)))
+		return -EFAULT;
+
+	/* Nothing to do */
+	if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
+	    !flags[0].incompat_flags)
+		return 0;
+
+	ret = check_feature(root, flags[0].compat_flags,
+			    flags[1].compat_flags, COMPAT);
+	if (ret)
+		return ret;
+
+	ret = check_feature(root, flags[0].compat_ro_flags,
+			    flags[1].compat_ro_flags, COMPAT_RO);
+	if (ret)
+		return ret;
+
+	ret = check_feature(root, flags[0].incompat_flags,
+			    flags[1].incompat_flags, INCOMPAT);
+	if (ret)
+		return ret;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	spin_lock(&root->fs_info->super_lock);
+	newflags = btrfs_super_compat_flags(super_block);
+	newflags |= flags[0].compat_flags & flags[1].compat_flags;
+	newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
+	btrfs_set_super_compat_flags(super_block, newflags);
+
+	newflags = btrfs_super_compat_ro_flags(super_block);
+	newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
+	newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
+	btrfs_set_super_compat_ro_flags(super_block, newflags);
+
+	newflags = btrfs_super_incompat_flags(super_block);
+	newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
+	newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
+	btrfs_set_super_incompat_flags(super_block, newflags);
+	spin_unlock(&root->fs_info->super_lock);
+
+	return btrfs_commit_transaction(trans, root);
+}
+
+long btrfs_ioctl(struct file *file, unsigned int
+		cmd, unsigned long arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		return btrfs_ioctl_getflags(file, argp);
+	case FS_IOC_SETFLAGS:
+		return btrfs_ioctl_setflags(file, argp);
+	case FS_IOC_GETVERSION:
+		return btrfs_ioctl_getversion(file, argp);
+	case FITRIM:
+		return btrfs_ioctl_fitrim(file, argp);
+	case BTRFS_IOC_SNAP_CREATE:
+		return btrfs_ioctl_snap_create(file, argp, 0);
+	case BTRFS_IOC_SNAP_CREATE_V2:
+		return btrfs_ioctl_snap_create_v2(file, argp, 0);
+	case BTRFS_IOC_SUBVOL_CREATE:
+		return btrfs_ioctl_snap_create(file, argp, 1);
+	case BTRFS_IOC_SUBVOL_CREATE_V2:
+		return btrfs_ioctl_snap_create_v2(file, argp, 1);
+	case BTRFS_IOC_SNAP_DESTROY:
+		return btrfs_ioctl_snap_destroy(file, argp);
+	case BTRFS_IOC_SUBVOL_GETFLAGS:
+		return btrfs_ioctl_subvol_getflags(file, argp);
+	case BTRFS_IOC_SUBVOL_SETFLAGS:
+		return btrfs_ioctl_subvol_setflags(file, argp);
+	case BTRFS_IOC_DEFAULT_SUBVOL:
+		return btrfs_ioctl_default_subvol(file, argp);
+	case BTRFS_IOC_DEFRAG:
+		return btrfs_ioctl_defrag(file, NULL);
+	case BTRFS_IOC_DEFRAG_RANGE:
+		return btrfs_ioctl_defrag(file, argp);
+	case BTRFS_IOC_RESIZE:
+		return btrfs_ioctl_resize(file, argp);
+	case BTRFS_IOC_ADD_DEV:
+		return btrfs_ioctl_add_dev(root, argp);
+	case BTRFS_IOC_RM_DEV:
+		return btrfs_ioctl_rm_dev(file, argp);
+	case BTRFS_IOC_FS_INFO:
+		return btrfs_ioctl_fs_info(root, argp);
+	case BTRFS_IOC_DEV_INFO:
+		return btrfs_ioctl_dev_info(root, argp);
+	case BTRFS_IOC_BALANCE:
+		return btrfs_ioctl_balance(file, NULL);
+	case BTRFS_IOC_CLONE:
+		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
+	case BTRFS_IOC_CLONE_RANGE:
+		return btrfs_ioctl_clone_range(file, argp);
+	case BTRFS_IOC_TRANS_START:
+		return btrfs_ioctl_trans_start(file);
+	case BTRFS_IOC_TRANS_END:
+		return btrfs_ioctl_trans_end(file);
+	case BTRFS_IOC_TREE_SEARCH:
+		return btrfs_ioctl_tree_search(file, argp);
+	case BTRFS_IOC_TREE_SEARCH_V2:
+		return btrfs_ioctl_tree_search_v2(file, argp);
+	case BTRFS_IOC_INO_LOOKUP:
+		return btrfs_ioctl_ino_lookup(file, argp);
+	case BTRFS_IOC_INO_PATHS:
+		return btrfs_ioctl_ino_to_path(root, argp);
+	case BTRFS_IOC_LOGICAL_INO:
+		return btrfs_ioctl_logical_to_ino(root, argp);
+	case BTRFS_IOC_SPACE_INFO:
+		return btrfs_ioctl_space_info(root, argp);
+	case BTRFS_IOC_SYNC: {
+		int ret;
+
+		ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+		if (ret)
+			return ret;
+		ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
+		/*
+		 * The transaction thread may want to do more work,
+		 * namely it pokes the cleaner ktread that will start
+		 * processing uncleaned subvols.
+		 */
+		wake_up_process(root->fs_info->transaction_kthread);
+		return ret;
+	}
+	case BTRFS_IOC_START_SYNC:
+		return btrfs_ioctl_start_sync(root, argp);
+	case BTRFS_IOC_WAIT_SYNC:
+		return btrfs_ioctl_wait_sync(root, argp);
+	case BTRFS_IOC_SCRUB:
+		return btrfs_ioctl_scrub(file, argp);
+	case BTRFS_IOC_SCRUB_CANCEL:
+		return btrfs_ioctl_scrub_cancel(root, argp);
+	case BTRFS_IOC_SCRUB_PROGRESS:
+		return btrfs_ioctl_scrub_progress(root, argp);
+	case BTRFS_IOC_BALANCE_V2:
+		return btrfs_ioctl_balance(file, argp);
+	case BTRFS_IOC_BALANCE_CTL:
+		return btrfs_ioctl_balance_ctl(root, arg);
+	case BTRFS_IOC_BALANCE_PROGRESS:
+		return btrfs_ioctl_balance_progress(root, argp);
+	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
+		return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+	case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+		return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
+	case BTRFS_IOC_SEND:
+		return btrfs_ioctl_send(file, argp);
+	case BTRFS_IOC_GET_DEV_STATS:
+		return btrfs_ioctl_get_dev_stats(root, argp);
+	case BTRFS_IOC_QUOTA_CTL:
+		return btrfs_ioctl_quota_ctl(file, argp);
+	case BTRFS_IOC_QGROUP_ASSIGN:
+		return btrfs_ioctl_qgroup_assign(file, argp);
+	case BTRFS_IOC_QGROUP_CREATE:
+		return btrfs_ioctl_qgroup_create(file, argp);
+	case BTRFS_IOC_QGROUP_LIMIT:
+		return btrfs_ioctl_qgroup_limit(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN:
+		return btrfs_ioctl_quota_rescan(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
+		return btrfs_ioctl_quota_rescan_status(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+		return btrfs_ioctl_quota_rescan_wait(file, argp);
+	case BTRFS_IOC_DEV_REPLACE:
+		return btrfs_ioctl_dev_replace(root, argp);
+	case BTRFS_IOC_GET_FSLABEL:
+		return btrfs_ioctl_get_fslabel(file, argp);
+	case BTRFS_IOC_SET_FSLABEL:
+		return btrfs_ioctl_set_fslabel(file, argp);
+	case BTRFS_IOC_FILE_EXTENT_SAME:
+		return btrfs_ioctl_file_extent_same(file, argp);
+	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
+		return btrfs_ioctl_get_supported_features(file, argp);
+	case BTRFS_IOC_GET_FEATURES:
+		return btrfs_ioctl_get_features(file, argp);
+	case BTRFS_IOC_SET_FEATURES:
+		return btrfs_ioctl_set_features(file, argp);
+	}
+
+	return -ENOTTY;
+}
diff --git a/kernel/fs/btrfs/locking.c b/kernel/fs/btrfs/locking.c
new file mode 100644
index 000000000..f8229ef1b
--- /dev/null
+++ b/kernel/fs/btrfs/locking.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <asm/bug.h>
+#include "ctree.h"
+#include "extent_io.h"
+#include "locking.h"
+
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
+
+/*
+ * if we currently have a spinning reader or writer lock
+ * (indicated by the rw flag) this will bump the count
+ * of blocking holders and drop the spinlock.
+ */
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+{
+	/*
+	 * no lock is required.  The lock owner may change if
+	 * we have a read lock, but it won't change to or away
+	 * from us.  If we have the write lock, we are the owner
+	 * and it'll never change.
+	 */
+	if (eb->lock_nested && current->pid == eb->lock_owner)
+		return;
+	if (rw == BTRFS_WRITE_LOCK) {
+		if (atomic_read(&eb->blocking_writers) == 0) {
+			WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+			atomic_dec(&eb->spinning_writers);
+			btrfs_assert_tree_locked(eb);
+			atomic_inc(&eb->blocking_writers);
+			write_unlock(&eb->lock);
+		}
+	} else if (rw == BTRFS_READ_LOCK) {
+		btrfs_assert_tree_read_locked(eb);
+		atomic_inc(&eb->blocking_readers);
+		WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+		atomic_dec(&eb->spinning_readers);
+		read_unlock(&eb->lock);
+	}
+	return;
+}
+
+/*
+ * if we currently have a blocking lock, take the spinlock
+ * and drop our blocking count
+ */
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+{
+	/*
+	 * no lock is required.  The lock owner may change if
+	 * we have a read lock, but it won't change to or away
+	 * from us.  If we have the write lock, we are the owner
+	 * and it'll never change.
+	 */
+	if (eb->lock_nested && current->pid == eb->lock_owner)
+		return;
+
+	if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
+		BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+		write_lock(&eb->lock);
+		WARN_ON(atomic_read(&eb->spinning_writers));
+		atomic_inc(&eb->spinning_writers);
+		if (atomic_dec_and_test(&eb->blocking_writers) &&
+		    waitqueue_active(&eb->write_lock_wq))
+			wake_up(&eb->write_lock_wq);
+	} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
+		BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+		read_lock(&eb->lock);
+		atomic_inc(&eb->spinning_readers);
+		if (atomic_dec_and_test(&eb->blocking_readers) &&
+		    waitqueue_active(&eb->read_lock_wq))
+			wake_up(&eb->read_lock_wq);
+	}
+	return;
+}
+
+/*
+ * take a spinning read lock.  This will wait for any blocking
+ * writers
+ */
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+again:
+	BUG_ON(!atomic_read(&eb->blocking_writers) &&
+	       current->pid == eb->lock_owner);
+
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers) &&
+	    current->pid == eb->lock_owner) {
+		/*
+		 * This extent is already write-locked by our thread. We allow
+		 * an additional read lock to be added because it's for the same
+		 * thread. btrfs_find_all_roots() depends on this as it may be
+		 * called on a partly (write-)locked tree.
+		 */
+		BUG_ON(eb->lock_nested);
+		eb->lock_nested = 1;
+		read_unlock(&eb->lock);
+		return;
+	}
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		wait_event(eb->write_lock_wq,
+			   atomic_read(&eb->blocking_writers) == 0);
+		goto again;
+	}
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
+}
+
+/*
+ * take a spinning read lock.
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
+ */
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
+{
+	if (atomic_read(&eb->blocking_writers))
+		return 0;
+
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		return 0;
+	}
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
+	return 1;
+}
+
+/*
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
+ */
+int btrfs_try_tree_read_lock(struct extent_buffer *eb)
+{
+	if (atomic_read(&eb->blocking_writers))
+		return 0;
+
+	if (!read_trylock(&eb->lock))
+		return 0;
+
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		return 0;
+	}
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
+	return 1;
+}
+
+/*
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers or readers
+ */
+int btrfs_try_tree_write_lock(struct extent_buffer *eb)
+{
+	if (atomic_read(&eb->blocking_writers) ||
+	    atomic_read(&eb->blocking_readers))
+		return 0;
+
+	write_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers) ||
+	    atomic_read(&eb->blocking_readers)) {
+		write_unlock(&eb->lock);
+		return 0;
+	}
+	atomic_inc(&eb->write_locks);
+	atomic_inc(&eb->spinning_writers);
+	eb->lock_owner = current->pid;
+	return 1;
+}
+
+/*
+ * drop a spinning read lock
+ */
+void btrfs_tree_read_unlock(struct extent_buffer *eb)
+{
+	/*
+	 * if we're nested, we have the write lock.  No new locking
+	 * is needed as long as we are the lock owner.
+	 * The write unlock will do a barrier for us, and the lock_nested
+	 * field only matters to the lock owner.
+	 */
+	if (eb->lock_nested && current->pid == eb->lock_owner) {
+		eb->lock_nested = 0;
+		return;
+	}
+	btrfs_assert_tree_read_locked(eb);
+	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+	atomic_dec(&eb->spinning_readers);
+	atomic_dec(&eb->read_locks);
+	read_unlock(&eb->lock);
+}
+
+/*
+ * drop a blocking read lock
+ */
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
+{
+	/*
+	 * if we're nested, we have the write lock.  No new locking
+	 * is needed as long as we are the lock owner.
+	 * The write unlock will do a barrier for us, and the lock_nested
+	 * field only matters to the lock owner.
+	 */
+	if (eb->lock_nested && current->pid == eb->lock_owner) {
+		eb->lock_nested = 0;
+		return;
+	}
+	btrfs_assert_tree_read_locked(eb);
+	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+	if (atomic_dec_and_test(&eb->blocking_readers) &&
+	    waitqueue_active(&eb->read_lock_wq))
+		wake_up(&eb->read_lock_wq);
+	atomic_dec(&eb->read_locks);
+}
+
+/*
+ * take a spinning write lock.  This will wait for both
+ * blocking readers or writers
+ */
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+again:
+	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
+	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+	write_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_readers)) {
+		write_unlock(&eb->lock);
+		wait_event(eb->read_lock_wq,
+			   atomic_read(&eb->blocking_readers) == 0);
+		goto again;
+	}
+	if (atomic_read(&eb->blocking_writers)) {
+		write_unlock(&eb->lock);
+		wait_event(eb->write_lock_wq,
+			   atomic_read(&eb->blocking_writers) == 0);
+		goto again;
+	}
+	WARN_ON(atomic_read(&eb->spinning_writers));
+	atomic_inc(&eb->spinning_writers);
+	atomic_inc(&eb->write_locks);
+	eb->lock_owner = current->pid;
+}
+
+/*
+ * drop a spinning or a blocking write lock.
+ */
+void btrfs_tree_unlock(struct extent_buffer *eb)
+{
+	int blockers = atomic_read(&eb->blocking_writers);
+
+	BUG_ON(blockers > 1);
+
+	btrfs_assert_tree_locked(eb);
+	eb->lock_owner = 0;
+	atomic_dec(&eb->write_locks);
+
+	if (blockers) {
+		WARN_ON(atomic_read(&eb->spinning_writers));
+		atomic_dec(&eb->blocking_writers);
+		smp_mb();
+		if (waitqueue_active(&eb->write_lock_wq))
+			wake_up(&eb->write_lock_wq);
+	} else {
+		WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+		atomic_dec(&eb->spinning_writers);
+		write_unlock(&eb->lock);
+	}
+}
+
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
+{
+	BUG_ON(!atomic_read(&eb->write_locks));
+}
+
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+	BUG_ON(!atomic_read(&eb->read_locks));
+}
diff --git a/kernel/fs/btrfs/locking.h b/kernel/fs/btrfs/locking.h
new file mode 100644
index 000000000..c44a9d5f5
--- /dev/null
+++ b/kernel/fs/btrfs/locking.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_LOCKING_
+#define __BTRFS_LOCKING_
+
+#define BTRFS_WRITE_LOCK 1
+#define BTRFS_READ_LOCK 2
+#define BTRFS_WRITE_LOCK_BLOCKING 3
+#define BTRFS_READ_LOCK_BLOCKING 4
+
+void btrfs_tree_lock(struct extent_buffer *eb);
+void btrfs_tree_unlock(struct extent_buffer *eb);
+
+void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_assert_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+
+
+static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
+{
+	if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+		btrfs_tree_unlock(eb);
+	else if (rw == BTRFS_READ_LOCK_BLOCKING)
+		btrfs_tree_read_unlock_blocking(eb);
+	else if (rw == BTRFS_READ_LOCK)
+		btrfs_tree_read_unlock(eb);
+	else
+		BUG();
+}
+
+static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
+{
+	btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
+}
+
+static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+	btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
+}
+#endif
diff --git a/kernel/fs/btrfs/lzo.c b/kernel/fs/btrfs/lzo.c
new file mode 100644
index 000000000..a2f051347
--- /dev/null
+++ b/kernel/fs/btrfs/lzo.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/lzo.h>
+#include "compression.h"
+
+#define LZO_LEN	4
+
+struct workspace {
+	void *mem;
+	void *buf;	/* where decompressed data goes */
+	void *cbuf;	/* where compressed data goes */
+	struct list_head list;
+};
+
+static void lzo_free_workspace(struct list_head *ws)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+	vfree(workspace->buf);
+	vfree(workspace->cbuf);
+	vfree(workspace->mem);
+	kfree(workspace);
+}
+
+static struct list_head *lzo_alloc_workspace(void)
+{
+	struct workspace *workspace;
+
+	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+	if (!workspace)
+		return ERR_PTR(-ENOMEM);
+
+	workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
+	workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+	workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+	if (!workspace->mem || !workspace->buf || !workspace->cbuf)
+		goto fail;
+
+	INIT_LIST_HEAD(&workspace->list);
+
+	return &workspace->list;
+fail:
+	lzo_free_workspace(&workspace->list);
+	return ERR_PTR(-ENOMEM);
+}
+
+static inline void write_compress_length(char *buf, size_t len)
+{
+	__le32 dlen;
+
+	dlen = cpu_to_le32(len);
+	memcpy(buf, &dlen, LZO_LEN);
+}
+
+static inline size_t read_compress_length(char *buf)
+{
+	__le32 dlen;
+
+	memcpy(&dlen, buf, LZO_LEN);
+	return le32_to_cpu(dlen);
+}
+
+static int lzo_compress_pages(struct list_head *ws,
+			      struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	int ret = 0;
+	char *data_in;
+	char *cpage_out;
+	int nr_pages = 0;
+	struct page *in_page = NULL;
+	struct page *out_page = NULL;
+	unsigned long bytes_left;
+
+	size_t in_len;
+	size_t out_len;
+	char *buf;
+	unsigned long tot_in = 0;
+	unsigned long tot_out = 0;
+	unsigned long pg_bytes_left;
+	unsigned long out_offset;
+	unsigned long bytes;
+
+	*out_pages = 0;
+	*total_out = 0;
+	*total_in = 0;
+
+	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+	data_in = kmap(in_page);
+
+	/*
+	 * store the size of all chunks of compressed data in
+	 * the first 4 bytes
+	 */
+	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (out_page == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	cpage_out = kmap(out_page);
+	out_offset = LZO_LEN;
+	tot_out = LZO_LEN;
+	pages[0] = out_page;
+	nr_pages = 1;
+	pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+	/* compress at most one page of data each time */
+	in_len = min(len, PAGE_CACHE_SIZE);
+	while (tot_in < len) {
+		ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
+				       &out_len, workspace->mem);
+		if (ret != LZO_E_OK) {
+			printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
+			       ret);
+			ret = -EIO;
+			goto out;
+		}
+
+		/* store the size of this chunk of compressed data */
+		write_compress_length(cpage_out + out_offset, out_len);
+		tot_out += LZO_LEN;
+		out_offset += LZO_LEN;
+		pg_bytes_left -= LZO_LEN;
+
+		tot_in += in_len;
+		tot_out += out_len;
+
+		/* copy bytes from the working buffer into the pages */
+		buf = workspace->cbuf;
+		while (out_len) {
+			bytes = min_t(unsigned long, pg_bytes_left, out_len);
+
+			memcpy(cpage_out + out_offset, buf, bytes);
+
+			out_len -= bytes;
+			pg_bytes_left -= bytes;
+			buf += bytes;
+			out_offset += bytes;
+
+			/*
+			 * we need another page for writing out.
+			 *
+			 * Note if there's less than 4 bytes left, we just
+			 * skip to a new page.
+			 */
+			if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
+			    pg_bytes_left == 0) {
+				if (pg_bytes_left) {
+					memset(cpage_out + out_offset, 0,
+					       pg_bytes_left);
+					tot_out += pg_bytes_left;
+				}
+
+				/* we're done, don't allocate new page */
+				if (out_len == 0 && tot_in >= len)
+					break;
+
+				kunmap(out_page);
+				if (nr_pages == nr_dest_pages) {
+					out_page = NULL;
+					ret = -E2BIG;
+					goto out;
+				}
+
+				out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+				if (out_page == NULL) {
+					ret = -ENOMEM;
+					goto out;
+				}
+				cpage_out = kmap(out_page);
+				pages[nr_pages++] = out_page;
+
+				pg_bytes_left = PAGE_CACHE_SIZE;
+				out_offset = 0;
+			}
+		}
+
+		/* we're making it bigger, give up */
+		if (tot_in > 8192 && tot_in < tot_out) {
+			ret = -E2BIG;
+			goto out;
+		}
+
+		/* we're all done */
+		if (tot_in >= len)
+			break;
+
+		if (tot_out > max_out)
+			break;
+
+		bytes_left = len - tot_in;
+		kunmap(in_page);
+		page_cache_release(in_page);
+
+		start += PAGE_CACHE_SIZE;
+		in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+		data_in = kmap(in_page);
+		in_len = min(bytes_left, PAGE_CACHE_SIZE);
+	}
+
+	if (tot_out > tot_in)
+		goto out;
+
+	/* store the size of all chunks of compressed data */
+	cpage_out = kmap(pages[0]);
+	write_compress_length(cpage_out, tot_out);
+
+	kunmap(pages[0]);
+
+	ret = 0;
+	*total_out = tot_out;
+	*total_in = tot_in;
+out:
+	*out_pages = nr_pages;
+	if (out_page)
+		kunmap(out_page);
+
+	if (in_page) {
+		kunmap(in_page);
+		page_cache_release(in_page);
+	}
+
+	return ret;
+}
+
+static int lzo_decompress_biovec(struct list_head *ws,
+				 struct page **pages_in,
+				 u64 disk_start,
+				 struct bio_vec *bvec,
+				 int vcnt,
+				 size_t srclen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	int ret = 0, ret2;
+	char *data_in;
+	unsigned long page_in_index = 0;
+	unsigned long page_out_index = 0;
+	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+	unsigned long buf_start;
+	unsigned long buf_offset = 0;
+	unsigned long bytes;
+	unsigned long working_bytes;
+	unsigned long pg_offset;
+
+	size_t in_len;
+	size_t out_len;
+	unsigned long in_offset;
+	unsigned long in_page_bytes_left;
+	unsigned long tot_in;
+	unsigned long tot_out;
+	unsigned long tot_len;
+	char *buf;
+	bool may_late_unmap, need_unmap;
+
+	data_in = kmap(pages_in[0]);
+	tot_len = read_compress_length(data_in);
+
+	tot_in = LZO_LEN;
+	in_offset = LZO_LEN;
+	tot_len = min_t(size_t, srclen, tot_len);
+	in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+	tot_out = 0;
+	pg_offset = 0;
+
+	while (tot_in < tot_len) {
+		in_len = read_compress_length(data_in + in_offset);
+		in_page_bytes_left -= LZO_LEN;
+		in_offset += LZO_LEN;
+		tot_in += LZO_LEN;
+
+		tot_in += in_len;
+		working_bytes = in_len;
+		may_late_unmap = need_unmap = false;
+
+		/* fast path: avoid using the working buffer */
+		if (in_page_bytes_left >= in_len) {
+			buf = data_in + in_offset;
+			bytes = in_len;
+			may_late_unmap = true;
+			goto cont;
+		}
+
+		/* copy bytes from the pages into the working buffer */
+		buf = workspace->cbuf;
+		buf_offset = 0;
+		while (working_bytes) {
+			bytes = min(working_bytes, in_page_bytes_left);
+
+			memcpy(buf + buf_offset, data_in + in_offset, bytes);
+			buf_offset += bytes;
+cont:
+			working_bytes -= bytes;
+			in_page_bytes_left -= bytes;
+			in_offset += bytes;
+
+			/* check if we need to pick another page */
+			if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
+			    || in_page_bytes_left == 0) {
+				tot_in += in_page_bytes_left;
+
+				if (working_bytes == 0 && tot_in >= tot_len)
+					break;
+
+				if (page_in_index + 1 >= total_pages_in) {
+					ret = -EIO;
+					goto done;
+				}
+
+				if (may_late_unmap)
+					need_unmap = true;
+				else
+					kunmap(pages_in[page_in_index]);
+
+				data_in = kmap(pages_in[++page_in_index]);
+
+				in_page_bytes_left = PAGE_CACHE_SIZE;
+				in_offset = 0;
+			}
+		}
+
+		out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
+					    &out_len);
+		if (need_unmap)
+			kunmap(pages_in[page_in_index - 1]);
+		if (ret != LZO_E_OK) {
+			printk(KERN_WARNING "BTRFS: decompress failed\n");
+			ret = -EIO;
+			break;
+		}
+
+		buf_start = tot_out;
+		tot_out += out_len;
+
+		ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+						 tot_out, disk_start,
+						 bvec, vcnt,
+						 &page_out_index, &pg_offset);
+		if (ret2 == 0)
+			break;
+	}
+done:
+	kunmap(pages_in[page_in_index]);
+	if (!ret)
+		btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset);
+	return ret;
+}
+
+static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	size_t in_len;
+	size_t out_len;
+	size_t tot_len;
+	int ret = 0;
+	char *kaddr;
+	unsigned long bytes;
+
+	BUG_ON(srclen < LZO_LEN);
+
+	tot_len = read_compress_length(data_in);
+	data_in += LZO_LEN;
+
+	in_len = read_compress_length(data_in);
+	data_in += LZO_LEN;
+
+	out_len = PAGE_CACHE_SIZE;
+	ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
+	if (ret != LZO_E_OK) {
+		printk(KERN_WARNING "BTRFS: decompress failed!\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	if (out_len < start_byte) {
+		ret = -EIO;
+		goto out;
+	}
+
+	/*
+	 * the caller is already checking against PAGE_SIZE, but lets
+	 * move this check closer to the memcpy/memset
+	 */
+	destlen = min_t(unsigned long, destlen, PAGE_SIZE);
+	bytes = min_t(unsigned long, destlen, out_len - start_byte);
+
+	kaddr = kmap_atomic(dest_page);
+	memcpy(kaddr, workspace->buf + start_byte, bytes);
+
+	/*
+	 * btrfs_getblock is doing a zero on the tail of the page too,
+	 * but this will cover anything missing from the decompressed
+	 * data.
+	 */
+	if (bytes < destlen)
+		memset(kaddr+bytes, 0, destlen-bytes);
+	kunmap_atomic(kaddr);
+out:
+	return ret;
+}
+
+const struct btrfs_compress_op btrfs_lzo_compress = {
+	.alloc_workspace	= lzo_alloc_workspace,
+	.free_workspace		= lzo_free_workspace,
+	.compress_pages		= lzo_compress_pages,
+	.decompress_biovec	= lzo_decompress_biovec,
+	.decompress		= lzo_decompress,
+};
diff --git a/kernel/fs/btrfs/math.h b/kernel/fs/btrfs/math.h
new file mode 100644
index 000000000..1b10a3cd1
--- /dev/null
+++ b/kernel/fs/btrfs/math.h
@@ -0,0 +1,42 @@
+
+/*
+ * Copyright (C) 2012 Fujitsu.  All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_MATH_H
+#define __BTRFS_MATH_H
+
+#include <asm/div64.h>
+
+static inline u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	return div_u64(num, 10);
+}
+
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+	if (factor == 100)
+		return num;
+	num *= factor;
+	return div_u64(num, 100);
+}
+
+#endif
diff --git a/kernel/fs/btrfs/ordered-data.c b/kernel/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000..760c4a5e0
--- /dev/null
+++ b/kernel/fs/btrfs/ordered-data.c
@@ -0,0 +1,1051 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "extent_io.h"
+#include "disk-io.h"
+
+static struct kmem_cache *btrfs_ordered_extent_cache;
+
+static u64 entry_end(struct btrfs_ordered_extent *entry)
+{
+	if (entry->file_offset + entry->len < entry->file_offset)
+		return (u64)-1;
+	return entry->file_offset + entry->len;
+}
+
+/* returns NULL if the insertion worked, or it returns the node it did find
+ * in the tree
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+				   struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct btrfs_ordered_extent *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
+
+		if (file_offset < entry->file_offset)
+			p = &(*p)->rb_left;
+		else if (file_offset >= entry_end(entry))
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static void ordered_data_tree_panic(struct inode *inode, int errno,
+					       u64 offset)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
+		    "%llu", offset);
+}
+
+/*
+ * look for a given offset in the tree, and if it can't be found return the
+ * first lesser offset
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+				     struct rb_node **prev_ret)
+{
+	struct rb_node *n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct rb_node *test;
+	struct btrfs_ordered_extent *entry;
+	struct btrfs_ordered_extent *prev_entry = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+		prev = n;
+		prev_entry = entry;
+
+		if (file_offset < entry->file_offset)
+			n = n->rb_left;
+		else if (file_offset >= entry_end(entry))
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	while (prev && file_offset >= entry_end(prev_entry)) {
+		test = rb_next(prev);
+		if (!test)
+			break;
+		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+				      rb_node);
+		if (file_offset < entry_end(prev_entry))
+			break;
+
+		prev = test;
+	}
+	if (prev)
+		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+				      rb_node);
+	while (prev && file_offset < entry_end(prev_entry)) {
+		test = rb_prev(prev);
+		if (!test)
+			break;
+		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+				      rb_node);
+		prev = test;
+	}
+	*prev_ret = prev;
+	return NULL;
+}
+
+/*
+ * helper to check if a given offset is inside a given entry
+ */
+static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
+{
+	if (file_offset < entry->file_offset ||
+	    entry->file_offset + entry->len <= file_offset)
+		return 0;
+	return 1;
+}
+
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+			  u64 len)
+{
+	if (file_offset + len <= entry->file_offset ||
+	    entry->file_offset + entry->len <= file_offset)
+		return 0;
+	return 1;
+}
+
+/*
+ * look find the first ordered struct that has this offset, otherwise
+ * the first one less than this offset
+ */
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+					  u64 file_offset)
+{
+	struct rb_root *root = &tree->tree;
+	struct rb_node *prev = NULL;
+	struct rb_node *ret;
+	struct btrfs_ordered_extent *entry;
+
+	if (tree->last) {
+		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+				 rb_node);
+		if (offset_in_entry(entry, file_offset))
+			return tree->last;
+	}
+	ret = __tree_search(root, file_offset, &prev);
+	if (!ret)
+		ret = prev;
+	if (ret)
+		tree->last = ret;
+	return ret;
+}
+
+/* allocate and add a new ordered_extent into the per-inode tree.
+ * file_offset is the logical offset in the file
+ *
+ * start is the disk block number of an extent already reserved in the
+ * extent allocation tree
+ *
+ * len is the length of the extent
+ *
+ * The tree is given a single reference on the ordered extent that was
+ * inserted.
+ */
+static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+				      u64 start, u64 len, u64 disk_len,
+				      int type, int dio, int compress_type)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->file_offset = file_offset;
+	entry->start = start;
+	entry->len = len;
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
+	    !(type == BTRFS_ORDERED_NOCOW))
+		entry->csum_bytes_left = disk_len;
+	entry->disk_len = disk_len;
+	entry->bytes_left = len;
+	entry->inode = igrab(inode);
+	entry->compress_type = compress_type;
+	entry->truncated_len = (u64)-1;
+	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
+		set_bit(type, &entry->flags);
+
+	if (dio)
+		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+
+	/* one ref for the tree */
+	atomic_set(&entry->refs, 1);
+	init_waitqueue_head(&entry->wait);
+	INIT_LIST_HEAD(&entry->list);
+	INIT_LIST_HEAD(&entry->root_extent_list);
+	INIT_LIST_HEAD(&entry->work_list);
+	init_completion(&entry->completion);
+	INIT_LIST_HEAD(&entry->log_list);
+	INIT_LIST_HEAD(&entry->trans_list);
+
+	trace_btrfs_ordered_extent_add(inode, entry);
+
+	spin_lock_irq(&tree->lock);
+	node = tree_insert(&tree->tree, file_offset,
+			   &entry->rb_node);
+	if (node)
+		ordered_data_tree_panic(inode, -EEXIST, file_offset);
+	spin_unlock_irq(&tree->lock);
+
+	spin_lock(&root->ordered_extent_lock);
+	list_add_tail(&entry->root_extent_list,
+		      &root->ordered_extents);
+	root->nr_ordered_extents++;
+	if (root->nr_ordered_extents == 1) {
+		spin_lock(&root->fs_info->ordered_root_lock);
+		BUG_ON(!list_empty(&root->ordered_root));
+		list_add_tail(&root->ordered_root,
+			      &root->fs_info->ordered_roots);
+		spin_unlock(&root->fs_info->ordered_root_lock);
+	}
+	spin_unlock(&root->ordered_extent_lock);
+
+	return 0;
+}
+
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len, u64 disk_len, int type)
+{
+	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+					  disk_len, type, 0,
+					  BTRFS_COMPRESS_NONE);
+}
+
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+				 u64 start, u64 len, u64 disk_len, int type)
+{
+	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+					  disk_len, type, 1,
+					  BTRFS_COMPRESS_NONE);
+}
+
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+				      u64 start, u64 len, u64 disk_len,
+				      int type, int compress_type)
+{
+	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+					  disk_len, type, 0,
+					  compress_type);
+}
+
+/*
+ * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
+ * when an ordered extent is finished.  If the list covers more than one
+ * ordered extent, it is split across multiples.
+ */
+void btrfs_add_ordered_sum(struct inode *inode,
+			   struct btrfs_ordered_extent *entry,
+			   struct btrfs_ordered_sum *sum)
+{
+	struct btrfs_ordered_inode_tree *tree;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
+	list_add_tail(&sum->list, &entry->list);
+	WARN_ON(entry->csum_bytes_left < sum->len);
+	entry->csum_bytes_left -= sum->len;
+	if (entry->csum_bytes_left == 0)
+		wake_up(&entry->wait);
+	spin_unlock_irq(&tree->lock);
+}
+
+/*
+ * this is used to account for finished IO across a given range
+ * of the file.  The IO may span ordered extents.  If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ *
+ * file_offset is updated to one byte past the range that is recorded as
+ * complete.  This allows you to walk forward in the file.
+ */
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+				   struct btrfs_ordered_extent **cached,
+				   u64 *file_offset, u64 io_size, int uptodate)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+	int ret;
+	unsigned long flags;
+	u64 dec_end;
+	u64 dec_start;
+	u64 to_dec;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irqsave(&tree->lock, flags);
+	node = tree_search(tree, *file_offset);
+	if (!node) {
+		ret = 1;
+		goto out;
+	}
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, *file_offset)) {
+		ret = 1;
+		goto out;
+	}
+
+	dec_start = max(*file_offset, entry->file_offset);
+	dec_end = min(*file_offset + io_size, entry->file_offset +
+		      entry->len);
+	*file_offset = dec_end;
+	if (dec_start > dec_end) {
+		btrfs_crit(BTRFS_I(inode)->root->fs_info,
+			"bad ordering dec_start %llu end %llu", dec_start, dec_end);
+	}
+	to_dec = dec_end - dec_start;
+	if (to_dec > entry->bytes_left) {
+		btrfs_crit(BTRFS_I(inode)->root->fs_info,
+			"bad ordered accounting left %llu size %llu",
+			entry->bytes_left, to_dec);
+	}
+	entry->bytes_left -= to_dec;
+	if (!uptodate)
+		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+	if (entry->bytes_left == 0) {
+		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+		if (waitqueue_active(&entry->wait))
+			wake_up(&entry->wait);
+	} else {
+		ret = 1;
+	}
+out:
+	if (!ret && cached && entry) {
+		*cached = entry;
+		atomic_inc(&entry->refs);
+	}
+	spin_unlock_irqrestore(&tree->lock, flags);
+	return ret == 0;
+}
+
+/*
+ * this is used to account for finished IO across a given range
+ * of the file.  The IO should not span ordered extents.  If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ */
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				   struct btrfs_ordered_extent **cached,
+				   u64 file_offset, u64 io_size, int uptodate)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+	unsigned long flags;
+	int ret;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irqsave(&tree->lock, flags);
+	if (cached && *cached) {
+		entry = *cached;
+		goto have_entry;
+	}
+
+	node = tree_search(tree, file_offset);
+	if (!node) {
+		ret = 1;
+		goto out;
+	}
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
+	if (!offset_in_entry(entry, file_offset)) {
+		ret = 1;
+		goto out;
+	}
+
+	if (io_size > entry->bytes_left) {
+		btrfs_crit(BTRFS_I(inode)->root->fs_info,
+			   "bad ordered accounting left %llu size %llu",
+		       entry->bytes_left, io_size);
+	}
+	entry->bytes_left -= io_size;
+	if (!uptodate)
+		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+	if (entry->bytes_left == 0) {
+		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+		if (waitqueue_active(&entry->wait))
+			wake_up(&entry->wait);
+	} else {
+		ret = 1;
+	}
+out:
+	if (!ret && cached && entry) {
+		*cached = entry;
+		atomic_inc(&entry->refs);
+	}
+	spin_unlock_irqrestore(&tree->lock, flags);
+	return ret == 0;
+}
+
+/* Needs to either be called under a log transaction or the log_mutex */
+void btrfs_get_logged_extents(struct inode *inode,
+			      struct list_head *logged_list,
+			      const loff_t start,
+			      const loff_t end)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct btrfs_ordered_extent *ordered;
+	struct rb_node *n;
+	struct rb_node *prev;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
+	n = __tree_search(&tree->tree, end, &prev);
+	if (!n)
+		n = prev;
+	for (; n; n = rb_prev(n)) {
+		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+		if (ordered->file_offset > end)
+			continue;
+		if (entry_end(ordered) <= start)
+			break;
+		if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+			continue;
+		list_add(&ordered->log_list, logged_list);
+		atomic_inc(&ordered->refs);
+	}
+	spin_unlock_irq(&tree->lock);
+}
+
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	while (!list_empty(logged_list)) {
+		ordered = list_first_entry(logged_list,
+					   struct btrfs_ordered_extent,
+					   log_list);
+		list_del_init(&ordered->log_list);
+		btrfs_put_ordered_extent(ordered);
+	}
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+				 struct btrfs_root *log)
+{
+	int index = log->log_transid % 2;
+
+	spin_lock_irq(&log->log_extents_lock[index]);
+	list_splice_tail(logged_list, &log->logged_list[index]);
+	spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log, u64 transid)
+{
+	struct btrfs_ordered_extent *ordered;
+	int index = transid % 2;
+
+	spin_lock_irq(&log->log_extents_lock[index]);
+	while (!list_empty(&log->logged_list[index])) {
+		ordered = list_first_entry(&log->logged_list[index],
+					   struct btrfs_ordered_extent,
+					   log_list);
+		list_del_init(&ordered->log_list);
+		spin_unlock_irq(&log->log_extents_lock[index]);
+
+		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+			struct inode *inode = ordered->inode;
+			u64 start = ordered->file_offset;
+			u64 end = ordered->file_offset + ordered->len - 1;
+
+			WARN_ON(!inode);
+			filemap_fdatawrite_range(inode->i_mapping, start, end);
+		}
+		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
+						   &ordered->flags));
+
+		list_add_tail(&ordered->trans_list, &trans->ordered);
+		spin_lock_irq(&log->log_extents_lock[index]);
+	}
+	spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
+{
+	struct btrfs_ordered_extent *ordered;
+	int index = transid % 2;
+
+	spin_lock_irq(&log->log_extents_lock[index]);
+	while (!list_empty(&log->logged_list[index])) {
+		ordered = list_first_entry(&log->logged_list[index],
+					   struct btrfs_ordered_extent,
+					   log_list);
+		list_del_init(&ordered->log_list);
+		spin_unlock_irq(&log->log_extents_lock[index]);
+		btrfs_put_ordered_extent(ordered);
+		spin_lock_irq(&log->log_extents_lock[index]);
+	}
+	spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+/*
+ * used to drop a reference on an ordered extent.  This will free
+ * the extent if the last reference is dropped
+ */
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+	struct list_head *cur;
+	struct btrfs_ordered_sum *sum;
+
+	trace_btrfs_ordered_extent_put(entry->inode, entry);
+
+	if (atomic_dec_and_test(&entry->refs)) {
+		if (entry->inode)
+			btrfs_add_delayed_iput(entry->inode);
+		while (!list_empty(&entry->list)) {
+			cur = entry->list.next;
+			sum = list_entry(cur, struct btrfs_ordered_sum, list);
+			list_del(&sum->list);
+			kfree(sum);
+		}
+		kmem_cache_free(btrfs_ordered_extent_cache, entry);
+	}
+}
+
+/*
+ * remove an ordered extent from the tree.  No references are dropped
+ * and waiters are woken up.
+ */
+void btrfs_remove_ordered_extent(struct inode *inode,
+				 struct btrfs_ordered_extent *entry)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct rb_node *node;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
+	node = &entry->rb_node;
+	rb_erase(node, &tree->tree);
+	if (tree->last == node)
+		tree->last = NULL;
+	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+	spin_unlock_irq(&tree->lock);
+
+	spin_lock(&root->ordered_extent_lock);
+	list_del_init(&entry->root_extent_list);
+	root->nr_ordered_extents--;
+
+	trace_btrfs_ordered_extent_remove(inode, entry);
+
+	if (!root->nr_ordered_extents) {
+		spin_lock(&root->fs_info->ordered_root_lock);
+		BUG_ON(list_empty(&root->ordered_root));
+		list_del_init(&root->ordered_root);
+		spin_unlock(&root->fs_info->ordered_root_lock);
+	}
+	spin_unlock(&root->ordered_extent_lock);
+	wake_up(&entry->wait);
+}
+
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+	complete(&ordered->completion);
+}
+
+/*
+ * wait for all the ordered extents in a root.  This is done when balancing
+ * space between drives.
+ */
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
+{
+	struct list_head splice, works;
+	struct btrfs_ordered_extent *ordered, *next;
+	int count = 0;
+
+	INIT_LIST_HEAD(&splice);
+	INIT_LIST_HEAD(&works);
+
+	mutex_lock(&root->ordered_extent_mutex);
+	spin_lock(&root->ordered_extent_lock);
+	list_splice_init(&root->ordered_extents, &splice);
+	while (!list_empty(&splice) && nr) {
+		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+					   root_extent_list);
+		list_move_tail(&ordered->root_extent_list,
+			       &root->ordered_extents);
+		atomic_inc(&ordered->refs);
+		spin_unlock(&root->ordered_extent_lock);
+
+		btrfs_init_work(&ordered->flush_work,
+				btrfs_flush_delalloc_helper,
+				btrfs_run_ordered_extent_work, NULL, NULL);
+		list_add_tail(&ordered->work_list, &works);
+		btrfs_queue_work(root->fs_info->flush_workers,
+				 &ordered->flush_work);
+
+		cond_resched();
+		spin_lock(&root->ordered_extent_lock);
+		if (nr != -1)
+			nr--;
+		count++;
+	}
+	list_splice_tail(&splice, &root->ordered_extents);
+	spin_unlock(&root->ordered_extent_lock);
+
+	list_for_each_entry_safe(ordered, next, &works, work_list) {
+		list_del_init(&ordered->work_list);
+		wait_for_completion(&ordered->completion);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+	mutex_unlock(&root->ordered_extent_mutex);
+
+	return count;
+}
+
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+	int done;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&fs_info->ordered_operations_mutex);
+	spin_lock(&fs_info->ordered_root_lock);
+	list_splice_init(&fs_info->ordered_roots, &splice);
+	while (!list_empty(&splice) && nr) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					ordered_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		list_move_tail(&root->ordered_root,
+			       &fs_info->ordered_roots);
+		spin_unlock(&fs_info->ordered_root_lock);
+
+		done = btrfs_wait_ordered_extents(root, nr);
+		btrfs_put_fs_root(root);
+
+		spin_lock(&fs_info->ordered_root_lock);
+		if (nr != -1) {
+			nr -= done;
+			WARN_ON(nr < 0);
+		}
+	}
+	list_splice_tail(&splice, &fs_info->ordered_roots);
+	spin_unlock(&fs_info->ordered_root_lock);
+	mutex_unlock(&fs_info->ordered_operations_mutex);
+}
+
+/*
+ * Used to start IO or wait for a given ordered extent to finish.
+ *
+ * If wait is one, this effectively waits on page writeback for all the pages
+ * in the extent, and it waits on the io completion code to insert
+ * metadata into the btree corresponding to the extent
+ */
+void btrfs_start_ordered_extent(struct inode *inode,
+				       struct btrfs_ordered_extent *entry,
+				       int wait)
+{
+	u64 start = entry->file_offset;
+	u64 end = start + entry->len - 1;
+
+	trace_btrfs_ordered_extent_start(inode, entry);
+
+	/*
+	 * pages in the range can be dirty, clean or writeback.  We
+	 * start IO on any dirty ones so the wait doesn't stall waiting
+	 * for the flusher thread to find them
+	 */
+	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+		filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (wait) {
+		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+						 &entry->flags));
+	}
+}
+
+/*
+ * Used to wait on ordered extents across a large range of bytes.
+ */
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+	int ret = 0;
+	int ret_wb = 0;
+	u64 end;
+	u64 orig_end;
+	struct btrfs_ordered_extent *ordered;
+
+	if (start + len < start) {
+		orig_end = INT_LIMIT(loff_t);
+	} else {
+		orig_end = start + len - 1;
+		if (orig_end > INT_LIMIT(loff_t))
+			orig_end = INT_LIMIT(loff_t);
+	}
+
+	/* start IO across the range first to instantiate any delalloc
+	 * extents
+	 */
+	ret = btrfs_fdatawrite_range(inode, start, orig_end);
+	if (ret)
+		return ret;
+
+	/*
+	 * If we have a writeback error don't return immediately. Wait first
+	 * for any ordered extents that haven't completed yet. This is to make
+	 * sure no one can dirty the same page ranges and call writepages()
+	 * before the ordered extents complete - to avoid failures (-EEXIST)
+	 * when adding the new ordered extents to the ordered tree.
+	 */
+	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+
+	end = orig_end;
+	while (1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, end);
+		if (!ordered)
+			break;
+		if (ordered->file_offset > orig_end) {
+			btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered->file_offset + ordered->len <= start) {
+			btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		end = ordered->file_offset;
+		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+			ret = -EIO;
+		btrfs_put_ordered_extent(ordered);
+		if (ret || end == 0 || end == start)
+			break;
+		end--;
+	}
+	return ret_wb ? ret_wb : ret;
+}
+
+/*
+ * find an ordered extent corresponding to file_offset.  return NULL if
+ * nothing is found, otherwise take a reference on the extent and return it
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
+	node = tree_search(tree, file_offset);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, file_offset))
+		entry = NULL;
+	if (entry)
+		atomic_inc(&entry->refs);
+out:
+	spin_unlock_irq(&tree->lock);
+	return entry;
+}
+
+/* Since the DIO code tries to lock a wide area we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+							u64 file_offset,
+							u64 len)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
+	node = tree_search(tree, file_offset);
+	if (!node) {
+		node = tree_search(tree, file_offset + len);
+		if (!node)
+			goto out;
+	}
+
+	while (1) {
+		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		if (range_overlaps(entry, file_offset, len))
+			break;
+
+		if (entry->file_offset >= file_offset + len) {
+			entry = NULL;
+			break;
+		}
+		entry = NULL;
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	if (entry)
+		atomic_inc(&entry->refs);
+	spin_unlock_irq(&tree->lock);
+	return entry;
+}
+
+/*
+ * lookup and return any extent before 'file_offset'.  NULL is returned
+ * if none is found
+ */
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
+	node = tree_search(tree, file_offset);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	atomic_inc(&entry->refs);
+out:
+	spin_unlock_irq(&tree->lock);
+	return entry;
+}
+
+/*
+ * After an extent is done, call this to conditionally update the on disk
+ * i_size.  i_size is updated to cover any fully written part of the file.
+ */
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+				struct btrfs_ordered_extent *ordered)
+{
+	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+	u64 disk_i_size;
+	u64 new_i_size;
+	u64 i_size = i_size_read(inode);
+	struct rb_node *node;
+	struct rb_node *prev = NULL;
+	struct btrfs_ordered_extent *test;
+	int ret = 1;
+
+	spin_lock_irq(&tree->lock);
+	if (ordered) {
+		offset = entry_end(ordered);
+		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
+			offset = min(offset,
+				     ordered->file_offset +
+				     ordered->truncated_len);
+	} else {
+		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
+	}
+	disk_i_size = BTRFS_I(inode)->disk_i_size;
+
+	/* truncate file */
+	if (disk_i_size > i_size) {
+		BTRFS_I(inode)->disk_i_size = i_size;
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * if the disk i_size is already at the inode->i_size, or
+	 * this ordered extent is inside the disk i_size, we're done
+	 */
+	if (disk_i_size == i_size)
+		goto out;
+
+	/*
+	 * We still need to update disk_i_size if outstanding_isize is greater
+	 * than disk_i_size.
+	 */
+	if (offset <= disk_i_size &&
+	    (!ordered || ordered->outstanding_isize <= disk_i_size))
+		goto out;
+
+	/*
+	 * walk backward from this ordered extent to disk_i_size.
+	 * if we find an ordered extent then we can't update disk i_size
+	 * yet
+	 */
+	if (ordered) {
+		node = rb_prev(&ordered->rb_node);
+	} else {
+		prev = tree_search(tree, offset);
+		/*
+		 * we insert file extents without involving ordered struct,
+		 * so there should be no ordered struct cover this offset
+		 */
+		if (prev) {
+			test = rb_entry(prev, struct btrfs_ordered_extent,
+					rb_node);
+			BUG_ON(offset_in_entry(test, offset));
+		}
+		node = prev;
+	}
+	for (; node; node = rb_prev(node)) {
+		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+		/* We treat this entry as if it doesnt exist */
+		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+			continue;
+		if (test->file_offset + test->len <= disk_i_size)
+			break;
+		if (test->file_offset >= i_size)
+			break;
+		if (entry_end(test) > disk_i_size) {
+			/*
+			 * we don't update disk_i_size now, so record this
+			 * undealt i_size. Or we will not know the real
+			 * i_size.
+			 */
+			if (test->outstanding_isize < offset)
+				test->outstanding_isize = offset;
+			if (ordered &&
+			    ordered->outstanding_isize >
+			    test->outstanding_isize)
+				test->outstanding_isize =
+						ordered->outstanding_isize;
+			goto out;
+		}
+	}
+	new_i_size = min_t(u64, offset, i_size);
+
+	/*
+	 * Some ordered extents may completed before the current one, and
+	 * we hold the real i_size in ->outstanding_isize.
+	 */
+	if (ordered && ordered->outstanding_isize > new_i_size)
+		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
+	BTRFS_I(inode)->disk_i_size = new_i_size;
+	ret = 0;
+out:
+	/*
+	 * We need to do this because we can't remove ordered extents until
+	 * after the i_disk_size has been updated and then the inode has been
+	 * updated to reflect the change, so we need to tell anybody who finds
+	 * this ordered extent that we've already done all the real work, we
+	 * just haven't completed all the other work.
+	 */
+	if (ordered)
+		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+	spin_unlock_irq(&tree->lock);
+	return ret;
+}
+
+/*
+ * search the ordered extents for one corresponding to 'offset' and
+ * try to find a checksum.  This is used because we allow pages to
+ * be reclaimed before their checksum is actually put into the btree
+ */
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+			   u32 *sum, int len)
+{
+	struct btrfs_ordered_sum *ordered_sum;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+	unsigned long num_sectors;
+	unsigned long i;
+	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
+	int index = 0;
+
+	ordered = btrfs_lookup_ordered_extent(inode, offset);
+	if (!ordered)
+		return 0;
+
+	spin_lock_irq(&tree->lock);
+	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
+		if (disk_bytenr >= ordered_sum->bytenr &&
+		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
+			i = (disk_bytenr - ordered_sum->bytenr) >>
+			    inode->i_sb->s_blocksize_bits;
+			num_sectors = ordered_sum->len >>
+				      inode->i_sb->s_blocksize_bits;
+			num_sectors = min_t(int, len - index, num_sectors - i);
+			memcpy(sum + index, ordered_sum->sums + i,
+			       num_sectors);
+
+			index += (int)num_sectors;
+			if (index == len)
+				goto out;
+			disk_bytenr += num_sectors * sectorsize;
+		}
+	}
+out:
+	spin_unlock_irq(&tree->lock);
+	btrfs_put_ordered_extent(ordered);
+	return index;
+}
+
+int __init ordered_data_init(void)
+{
+	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+				     sizeof(struct btrfs_ordered_extent), 0,
+				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				     NULL);
+	if (!btrfs_ordered_extent_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void ordered_data_exit(void)
+{
+	if (btrfs_ordered_extent_cache)
+		kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
diff --git a/kernel/fs/btrfs/ordered-data.h b/kernel/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000..e96cd4ccd
--- /dev/null
+++ b/kernel/fs/btrfs/ordered-data.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+/* one of these per inode */
+struct btrfs_ordered_inode_tree {
+	spinlock_t lock;
+	struct rb_root tree;
+	struct rb_node *last;
+};
+
+struct btrfs_ordered_sum {
+	/* bytenr is the start of this extent on disk */
+	u64 bytenr;
+
+	/*
+	 * this is the length in bytes covered by the sums array below.
+	 */
+	int len;
+	struct list_head list;
+	/* last field is a variable length array of csums */
+	u32 sums[];
+};
+
+/*
+ * bits for the flags field:
+ *
+ * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
+ * It is used to make sure metadata is inserted into the tree only once
+ * per extent.
+ *
+ * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
+ * rbtree, just before waking any waiters.  It is used to indicate the
+ * IO is done and any metadata is inserted into the tree.
+ */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+
+#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
+
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
+
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+
+#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
+				       * has done its due diligence in updating
+				       * the isize. */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered
+				       ordered extent */
+#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
+
+#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
+				 * in the logging code. */
+struct btrfs_ordered_extent {
+	/* logical offset in the file */
+	u64 file_offset;
+
+	/* disk byte number */
+	u64 start;
+
+	/* ram length of the extent in bytes */
+	u64 len;
+
+	/* extent length on disk */
+	u64 disk_len;
+
+	/* number of bytes that still need writing */
+	u64 bytes_left;
+
+	/* number of bytes that still need csumming */
+	u64 csum_bytes_left;
+
+	/*
+	 * the end of the ordered extent which is behind it but
+	 * didn't update disk_i_size. Please see the comment of
+	 * btrfs_ordered_update_i_size();
+	 */
+	u64 outstanding_isize;
+
+	/*
+	 * If we get truncated we need to adjust the file extent we enter for
+	 * this ordered extent so that we do not expose stale data.
+	 */
+	u64 truncated_len;
+
+	/* flags (described above) */
+	unsigned long flags;
+
+	/* compression algorithm */
+	int compress_type;
+
+	/* reference count */
+	atomic_t refs;
+
+	/* the inode we belong to */
+	struct inode *inode;
+
+	/* list of checksums for insertion when the extent io is done */
+	struct list_head list;
+
+	/* If we need to wait on this to be done */
+	struct list_head log_list;
+
+	/* If the transaction needs to wait on this ordered extent */
+	struct list_head trans_list;
+
+	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
+	wait_queue_head_t wait;
+
+	/* our friendly rbtree entry */
+	struct rb_node rb_node;
+
+	/* a per root list of all the pending ordered extents */
+	struct list_head root_extent_list;
+
+	struct btrfs_work work;
+
+	struct completion completion;
+	struct btrfs_work flush_work;
+	struct list_head work_list;
+};
+
+/*
+ * calculates the total size you need to allocate for an ordered sum
+ * structure spanning 'bytes' in the file
+ */
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
+					 unsigned long bytes)
+{
+	int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+	return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
+}
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+	spin_lock_init(&t->lock);
+	t->tree = RB_ROOT;
+	t->last = NULL;
+}
+
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				   struct btrfs_ordered_extent **cached,
+				   u64 file_offset, u64 io_size, int uptodate);
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+				   struct btrfs_ordered_extent **cached,
+				   u64 *file_offset, u64 io_size,
+				   int uptodate);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+				 u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+				      u64 start, u64 len, u64 disk_len,
+				      int type, int compress_type);
+void btrfs_add_ordered_sum(struct inode *inode,
+			   struct btrfs_ordered_extent *entry,
+			   struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset);
+void btrfs_start_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry, int wait);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+							u64 file_offset,
+							u64 len);
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+				struct btrfs_ordered_extent *ordered);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+			   u32 *sum, int len);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_get_logged_extents(struct inode *inode,
+			      struct list_head *logged_list,
+			      const loff_t start,
+			      const loff_t end);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+				 struct btrfs_root *log);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
+#endif
diff --git a/kernel/fs/btrfs/orphan.c b/kernel/fs/btrfs/orphan.c
new file mode 100644
index 000000000..47767d5b8
--- /dev/null
+++ b/kernel/fs/btrfs/orphan.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2008 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 offset)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret = 0;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = offset;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 offset)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret = 0;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = offset;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret) { /* JDM: Really? */
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, root, path);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/kernel/fs/btrfs/print-tree.c b/kernel/fs/btrfs/print-tree.c
new file mode 100644
index 000000000..647ab12fd
--- /dev/null
+++ b/kernel/fs/btrfs/print-tree.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+{
+	int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
+	int i;
+	printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
+	       "num_stripes %d\n",
+	       btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk),
+	       btrfs_chunk_type(eb, chunk), num_stripes);
+	for (i = 0 ; i < num_stripes ; i++) {
+		printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
+		      btrfs_stripe_devid_nr(eb, chunk, i),
+		      btrfs_stripe_offset_nr(eb, chunk, i));
+	}
+}
+static void print_dev_item(struct extent_buffer *eb,
+			   struct btrfs_dev_item *dev_item)
+{
+	printk(KERN_INFO "\t\tdev item devid %llu "
+	       "total_bytes %llu bytes used %llu\n",
+	       btrfs_device_id(eb, dev_item),
+	       btrfs_device_total_bytes(eb, dev_item),
+	       btrfs_device_bytes_used(eb, dev_item));
+}
+static void print_extent_data_ref(struct extent_buffer *eb,
+				  struct btrfs_extent_data_ref *ref)
+{
+	printk(KERN_INFO "\t\textent data backref root %llu "
+	       "objectid %llu offset %llu count %u\n",
+	       btrfs_extent_data_ref_root(eb, ref),
+	       btrfs_extent_data_ref_objectid(eb, ref),
+	       btrfs_extent_data_ref_offset(eb, ref),
+	       btrfs_extent_data_ref_count(eb, ref));
+}
+
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
+{
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_shared_data_ref *sref;
+	struct btrfs_disk_key key;
+	unsigned long end;
+	unsigned long ptr;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u64 flags;
+	u64 offset;
+
+	if (item_size < sizeof(*ei)) {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		struct btrfs_extent_item_v0 *ei0;
+		BUG_ON(item_size != sizeof(*ei0));
+		ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
+		printk(KERN_INFO "\t\textent refs %u\n",
+		       btrfs_extent_refs_v0(eb, ei0));
+		return;
+#else
+		BUG();
+#endif
+	}
+
+	ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(eb, ei);
+
+	printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
+	       btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
+	       flags);
+
+	if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		struct btrfs_tree_block_info *info;
+		info = (struct btrfs_tree_block_info *)(ei + 1);
+		btrfs_tree_block_key(eb, info, &key);
+		printk(KERN_INFO "\t\ttree block key (%llu %u %llu) "
+		       "level %d\n",
+		       btrfs_disk_key_objectid(&key), key.type,
+		       btrfs_disk_key_offset(&key),
+		       btrfs_tree_block_level(eb, info));
+		iref = (struct btrfs_extent_inline_ref *)(info + 1);
+	} else {
+		iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+	}
+
+	ptr = (unsigned long)iref;
+	end = (unsigned long)ei + item_size;
+	while (ptr < end) {
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(eb, iref);
+		offset = btrfs_extent_inline_ref_offset(eb, iref);
+		switch (type) {
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\ttree block backref "
+				"root %llu\n", offset);
+			break;
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\tshared block backref "
+				"parent %llu\n", offset);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			print_extent_data_ref(eb, dref);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY:
+			sref = (struct btrfs_shared_data_ref *)(iref + 1);
+			printk(KERN_INFO "\t\tshared data backref "
+			       "parent %llu count %u\n",
+			       offset, btrfs_shared_data_ref_count(eb, sref));
+			break;
+		default:
+			BUG();
+		}
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+	WARN_ON(ptr > end);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
+{
+	struct btrfs_extent_ref_v0 *ref0;
+
+	ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
+	printk("\t\textent back ref root %llu gen %llu "
+		"owner %llu num_refs %lu\n",
+		btrfs_ref_root_v0(eb, ref0),
+		btrfs_ref_generation_v0(eb, ref0),
+		btrfs_ref_objectid_v0(eb, ref0),
+		(unsigned long)btrfs_ref_count_v0(eb, ref0));
+}
+#endif
+
+static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
+			    u32 item_size)
+{
+	if (!IS_ALIGNED(item_size, sizeof(u64))) {
+		pr_warn("BTRFS: uuid item with illegal size %lu!\n",
+			(unsigned long)item_size);
+		return;
+	}
+	while (item_size) {
+		__le64 subvol_id;
+
+		read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
+		printk(KERN_INFO "\t\tsubvol_id %llu\n",
+		       (unsigned long long)le64_to_cpu(subvol_id));
+		item_size -= sizeof(u64);
+		offset += sizeof(u64);
+	}
+}
+
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
+{
+	int i;
+	u32 type, nr;
+	struct btrfs_item *item;
+	struct btrfs_root_item *ri;
+	struct btrfs_dir_item *di;
+	struct btrfs_inode_item *ii;
+	struct btrfs_block_group_item *bi;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_shared_data_ref *sref;
+	struct btrfs_dev_extent *dev_extent;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	if (!l)
+		return;
+
+	nr = btrfs_header_nritems(l);
+
+	btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
+		   btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
+	for (i = 0 ; i < nr ; i++) {
+		item = btrfs_item_nr(i);
+		btrfs_item_key_to_cpu(l, &key, i);
+		type = key.type;
+		printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
+		       "itemsize %d\n",
+			i, key.objectid, type, key.offset,
+			btrfs_item_offset(l, item), btrfs_item_size(l, item));
+		switch (type) {
+		case BTRFS_INODE_ITEM_KEY:
+			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
+			printk(KERN_INFO "\t\tinode generation %llu size %llu "
+			       "mode %o\n",
+			       btrfs_inode_generation(l, ii),
+			       btrfs_inode_size(l, ii),
+			       btrfs_inode_mode(l, ii));
+			break;
+		case BTRFS_DIR_ITEM_KEY:
+			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
+			btrfs_dir_item_key_to_cpu(l, di, &found_key);
+			printk(KERN_INFO "\t\tdir oid %llu type %u\n",
+				found_key.objectid,
+				btrfs_dir_type(l, di));
+			break;
+		case BTRFS_ROOT_ITEM_KEY:
+			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
+			printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
+				btrfs_disk_root_bytenr(l, ri),
+				btrfs_disk_root_refs(l, ri));
+			break;
+		case BTRFS_EXTENT_ITEM_KEY:
+		case BTRFS_METADATA_ITEM_KEY:
+			print_extent_item(l, i, type);
+			break;
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\ttree block backref\n");
+			break;
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\tshared block backref\n");
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+			dref = btrfs_item_ptr(l, i,
+					      struct btrfs_extent_data_ref);
+			print_extent_data_ref(l, dref);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY:
+			sref = btrfs_item_ptr(l, i,
+					      struct btrfs_shared_data_ref);
+			printk(KERN_INFO "\t\tshared data backref count %u\n",
+			       btrfs_shared_data_ref_count(l, sref));
+			break;
+		case BTRFS_EXTENT_DATA_KEY:
+			fi = btrfs_item_ptr(l, i,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(l, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE) {
+				printk(KERN_INFO "\t\tinline extent data "
+				       "size %u\n",
+				       btrfs_file_extent_inline_len(l, i, fi));
+				break;
+			}
+			printk(KERN_INFO "\t\textent data disk bytenr %llu "
+			       "nr %llu\n",
+			       btrfs_file_extent_disk_bytenr(l, fi),
+			       btrfs_file_extent_disk_num_bytes(l, fi));
+			printk(KERN_INFO "\t\textent data offset %llu "
+			       "nr %llu ram %llu\n",
+			       btrfs_file_extent_offset(l, fi),
+			       btrfs_file_extent_num_bytes(l, fi),
+			       btrfs_file_extent_ram_bytes(l, fi));
+			break;
+		case BTRFS_EXTENT_REF_V0_KEY:
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			print_extent_ref_v0(l, i);
+#else
+			BUG();
+#endif
+			break;
+		case BTRFS_BLOCK_GROUP_ITEM_KEY:
+			bi = btrfs_item_ptr(l, i,
+					    struct btrfs_block_group_item);
+			printk(KERN_INFO "\t\tblock group used %llu\n",
+			       btrfs_disk_block_group_used(l, bi));
+			break;
+		case BTRFS_CHUNK_ITEM_KEY:
+			print_chunk(l, btrfs_item_ptr(l, i,
+						      struct btrfs_chunk));
+			break;
+		case BTRFS_DEV_ITEM_KEY:
+			print_dev_item(l, btrfs_item_ptr(l, i,
+					struct btrfs_dev_item));
+			break;
+		case BTRFS_DEV_EXTENT_KEY:
+			dev_extent = btrfs_item_ptr(l, i,
+						    struct btrfs_dev_extent);
+			printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
+			       "\t\tchunk objectid %llu chunk offset %llu "
+			       "length %llu\n",
+			       btrfs_dev_extent_chunk_tree(l, dev_extent),
+			       btrfs_dev_extent_chunk_objectid(l, dev_extent),
+			       btrfs_dev_extent_chunk_offset(l, dev_extent),
+			       btrfs_dev_extent_length(l, dev_extent));
+			break;
+		case BTRFS_DEV_STATS_KEY:
+			printk(KERN_INFO "\t\tdevice stats\n");
+			break;
+		case BTRFS_DEV_REPLACE_KEY:
+			printk(KERN_INFO "\t\tdev replace\n");
+			break;
+		case BTRFS_UUID_KEY_SUBVOL:
+		case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+			print_uuid_item(l, btrfs_item_ptr_offset(l, i),
+					btrfs_item_size_nr(l, i));
+			break;
+		};
+	}
+}
+
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
+{
+	int i; u32 nr;
+	struct btrfs_key key;
+	int level;
+
+	if (!c)
+		return;
+	nr = btrfs_header_nritems(c);
+	level = btrfs_header_level(c);
+	if (level == 0) {
+		btrfs_print_leaf(root, c);
+		return;
+	}
+	btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u",
+		btrfs_header_bytenr(c), level, nr,
+		(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+	for (i = 0; i < nr; i++) {
+		btrfs_node_key_to_cpu(c, &key, i);
+		printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
+		       i, key.objectid, key.type, key.offset,
+		       btrfs_node_blockptr(c, i));
+	}
+	for (i = 0; i < nr; i++) {
+		struct extent_buffer *next = read_tree_block(root,
+					btrfs_node_blockptr(c, i),
+					btrfs_node_ptr_generation(c, i));
+		if (btrfs_is_leaf(next) &&
+		   level != 1)
+			BUG();
+		if (btrfs_header_level(next) !=
+		       level - 1)
+			BUG();
+		btrfs_print_tree(root, next);
+		free_extent_buffer(next);
+	}
+}
diff --git a/kernel/fs/btrfs/print-tree.h b/kernel/fs/btrfs/print-tree.h
new file mode 100644
index 000000000..7faddfacc
--- /dev/null
+++ b/kernel/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __PRINT_TREE_
+#define __PRINT_TREE_
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c);
+#endif
diff --git a/kernel/fs/btrfs/props.c b/kernel/fs/btrfs/props.c
new file mode 100644
index 000000000..dca137b04
--- /dev/null
+++ b/kernel/fs/btrfs/props.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/hashtable.h>
+#include "props.h"
+#include "btrfs_inode.h"
+#include "hash.h"
+#include "transaction.h"
+#include "xattr.h"
+
+#define BTRFS_PROP_HANDLERS_HT_BITS 8
+static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
+
+struct prop_handler {
+	struct hlist_node node;
+	const char *xattr_name;
+	int (*validate)(const char *value, size_t len);
+	int (*apply)(struct inode *inode, const char *value, size_t len);
+	const char *(*extract)(struct inode *inode);
+	int inheritable;
+};
+
+static int prop_compression_validate(const char *value, size_t len);
+static int prop_compression_apply(struct inode *inode,
+				  const char *value,
+				  size_t len);
+static const char *prop_compression_extract(struct inode *inode);
+
+static struct prop_handler prop_handlers[] = {
+	{
+		.xattr_name = XATTR_BTRFS_PREFIX "compression",
+		.validate = prop_compression_validate,
+		.apply = prop_compression_apply,
+		.extract = prop_compression_extract,
+		.inheritable = 1
+	},
+	{
+		.xattr_name = NULL
+	}
+};
+
+void __init btrfs_props_init(void)
+{
+	struct prop_handler *p;
+
+	hash_init(prop_handlers_ht);
+
+	for (p = &prop_handlers[0]; p->xattr_name; p++) {
+		u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
+
+		hash_add(prop_handlers_ht, &p->node, h);
+	}
+}
+
+static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
+{
+	struct hlist_head *h;
+
+	h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)];
+	if (hlist_empty(h))
+		return NULL;
+
+	return h;
+}
+
+static const struct prop_handler *
+find_prop_handler(const char *name,
+		  const struct hlist_head *handlers)
+{
+	struct prop_handler *h;
+
+	if (!handlers) {
+		u64 hash = btrfs_name_hash(name, strlen(name));
+
+		handlers = find_prop_handlers_by_hash(hash);
+		if (!handlers)
+			return NULL;
+	}
+
+	hlist_for_each_entry(h, handlers, node)
+		if (!strcmp(h->xattr_name, name))
+			return h;
+
+	return NULL;
+}
+
+static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
+			    struct inode *inode,
+			    const char *name,
+			    const char *value,
+			    size_t value_len,
+			    int flags)
+{
+	const struct prop_handler *handler;
+	int ret;
+
+	if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
+		return -EINVAL;
+
+	handler = find_prop_handler(name, NULL);
+	if (!handler)
+		return -EINVAL;
+
+	if (value_len == 0) {
+		ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+				       NULL, 0, flags);
+		if (ret)
+			return ret;
+
+		ret = handler->apply(inode, NULL, 0);
+		ASSERT(ret == 0);
+
+		return ret;
+	}
+
+	ret = handler->validate(value, value_len);
+	if (ret)
+		return ret;
+	ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+			       value, value_len, flags);
+	if (ret)
+		return ret;
+	ret = handler->apply(inode, value, value_len);
+	if (ret) {
+		__btrfs_setxattr(trans, inode, handler->xattr_name,
+				 NULL, 0, flags);
+		return ret;
+	}
+
+	set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+
+	return 0;
+}
+
+int btrfs_set_prop(struct inode *inode,
+		   const char *name,
+		   const char *value,
+		   size_t value_len,
+		   int flags)
+{
+	return __btrfs_set_prop(NULL, inode, name, value, value_len, flags);
+}
+
+static int iterate_object_props(struct btrfs_root *root,
+				struct btrfs_path *path,
+				u64 objectid,
+				void (*iterator)(void *,
+						 const struct prop_handler *,
+						 const char *,
+						 size_t),
+				void *ctx)
+{
+	int ret;
+	char *name_buf = NULL;
+	char *value_buf = NULL;
+	int name_buf_len = 0;
+	int value_buf_len = 0;
+
+	while (1) {
+		struct btrfs_key key;
+		struct btrfs_dir_item *di;
+		struct extent_buffer *leaf;
+		u32 total_len, cur, this_len;
+		int slot;
+		const struct hlist_head *handlers;
+
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid != objectid)
+			break;
+		if (key.type != BTRFS_XATTR_ITEM_KEY)
+			break;
+
+		handlers = find_prop_handlers_by_hash(key.offset);
+		if (!handlers)
+			goto next_slot;
+
+		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+		cur = 0;
+		total_len = btrfs_item_size_nr(leaf, slot);
+
+		while (cur < total_len) {
+			u32 name_len = btrfs_dir_name_len(leaf, di);
+			u32 data_len = btrfs_dir_data_len(leaf, di);
+			unsigned long name_ptr, data_ptr;
+			const struct prop_handler *handler;
+
+			this_len = sizeof(*di) + name_len + data_len;
+			name_ptr = (unsigned long)(di + 1);
+			data_ptr = name_ptr + name_len;
+
+			if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
+			    memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
+						 name_ptr,
+						 XATTR_BTRFS_PREFIX_LEN))
+				goto next_dir_item;
+
+			if (name_len >= name_buf_len) {
+				kfree(name_buf);
+				name_buf_len = name_len + 1;
+				name_buf = kmalloc(name_buf_len, GFP_NOFS);
+				if (!name_buf) {
+					ret = -ENOMEM;
+					goto out;
+				}
+			}
+			read_extent_buffer(leaf, name_buf, name_ptr, name_len);
+			name_buf[name_len] = '\0';
+
+			handler = find_prop_handler(name_buf, handlers);
+			if (!handler)
+				goto next_dir_item;
+
+			if (data_len > value_buf_len) {
+				kfree(value_buf);
+				value_buf_len = data_len;
+				value_buf = kmalloc(data_len, GFP_NOFS);
+				if (!value_buf) {
+					ret = -ENOMEM;
+					goto out;
+				}
+			}
+			read_extent_buffer(leaf, value_buf, data_ptr, data_len);
+
+			iterator(ctx, handler, value_buf, data_len);
+next_dir_item:
+			cur += this_len;
+			di = (struct btrfs_dir_item *)((char *) di + this_len);
+		}
+
+next_slot:
+		path->slots[0]++;
+	}
+
+	ret = 0;
+out:
+	btrfs_release_path(path);
+	kfree(name_buf);
+	kfree(value_buf);
+
+	return ret;
+}
+
+static void inode_prop_iterator(void *ctx,
+				const struct prop_handler *handler,
+				const char *value,
+				size_t len)
+{
+	struct inode *inode = ctx;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	ret = handler->apply(inode, value, len);
+	if (unlikely(ret))
+		btrfs_warn(root->fs_info,
+			   "error applying prop %s to ino %llu (root %llu): %d",
+			   handler->xattr_name, btrfs_ino(inode),
+			   root->root_key.objectid, ret);
+	else
+		set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+}
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 ino = btrfs_ino(inode);
+	int ret;
+
+	ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+
+	return ret;
+}
+
+static int inherit_props(struct btrfs_trans_handle *trans,
+			 struct inode *inode,
+			 struct inode *parent)
+{
+	const struct prop_handler *h;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	if (!test_bit(BTRFS_INODE_HAS_PROPS,
+		      &BTRFS_I(parent)->runtime_flags))
+		return 0;
+
+	for (h = &prop_handlers[0]; h->xattr_name; h++) {
+		const char *value;
+		u64 num_bytes;
+
+		if (!h->inheritable)
+			continue;
+
+		value = h->extract(parent);
+		if (!value)
+			continue;
+
+		num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+		ret = btrfs_block_rsv_add(root, trans->block_rsv,
+					  num_bytes, BTRFS_RESERVE_NO_FLUSH);
+		if (ret)
+			goto out;
+		ret = __btrfs_set_prop(trans, inode, h->xattr_name,
+				       value, strlen(value), 0);
+		btrfs_block_rsv_release(root, trans->block_rsv, num_bytes);
+		if (ret)
+			goto out;
+	}
+	ret = 0;
+out:
+	return ret;
+}
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+			      struct inode *inode,
+			      struct inode *dir)
+{
+	if (!dir)
+		return 0;
+
+	return inherit_props(trans, inode, dir);
+}
+
+int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_root *parent_root)
+{
+	struct btrfs_key key;
+	struct inode *parent_inode, *child_inode;
+	int ret;
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
+				  parent_root, NULL);
+	if (IS_ERR(parent_inode))
+		return PTR_ERR(parent_inode);
+
+	child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+	if (IS_ERR(child_inode)) {
+		iput(parent_inode);
+		return PTR_ERR(child_inode);
+	}
+
+	ret = inherit_props(trans, child_inode, parent_inode);
+	iput(child_inode);
+	iput(parent_inode);
+
+	return ret;
+}
+
+static int prop_compression_validate(const char *value, size_t len)
+{
+	if (!strncmp("lzo", value, len))
+		return 0;
+	else if (!strncmp("zlib", value, len))
+		return 0;
+
+	return -EINVAL;
+}
+
+static int prop_compression_apply(struct inode *inode,
+				  const char *value,
+				  size_t len)
+{
+	int type;
+
+	if (len == 0) {
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+
+		return 0;
+	}
+
+	if (!strncmp("lzo", value, len))
+		type = BTRFS_COMPRESS_LZO;
+	else if (!strncmp("zlib", value, len))
+		type = BTRFS_COMPRESS_ZLIB;
+	else
+		return -EINVAL;
+
+	BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+	BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+	BTRFS_I(inode)->force_compress = type;
+
+	return 0;
+}
+
+static const char *prop_compression_extract(struct inode *inode)
+{
+	switch (BTRFS_I(inode)->force_compress) {
+	case BTRFS_COMPRESS_ZLIB:
+		return "zlib";
+	case BTRFS_COMPRESS_LZO:
+		return "lzo";
+	}
+
+	return NULL;
+}
+
+
diff --git a/kernel/fs/btrfs/props.h b/kernel/fs/btrfs/props.h
new file mode 100644
index 000000000..100f18829
--- /dev/null
+++ b/kernel/fs/btrfs/props.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_PROPS_H
+#define __BTRFS_PROPS_H
+
+#include "ctree.h"
+
+void __init btrfs_props_init(void);
+
+int btrfs_set_prop(struct inode *inode,
+		   const char *name,
+		   const char *value,
+		   size_t value_len,
+		   int flags);
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+			      struct inode *inode,
+			      struct inode *dir);
+
+int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_root *parent_root);
+
+#endif
diff --git a/kernel/fs/btrfs/qgroup.c b/kernel/fs/btrfs/qgroup.c
new file mode 100644
index 000000000..3d6546581
--- /dev/null
+++ b/kernel/fs/btrfs/qgroup.c
@@ -0,0 +1,2966 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/btrfs.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "ulist.h"
+#include "backref.h"
+#include "extent_io.h"
+#include "qgroup.h"
+
+/* TODO XXX FIXME
+ *  - subvol delete -> delete when ref goes to 0? delete limits also?
+ *  - reorganize keys
+ *  - compressed
+ *  - sync
+ *  - copy also limits on subvol creation
+ *  - limit
+ *  - caches fuer ulists
+ *  - performance benchmarks
+ *  - check all ioctl parameters
+ */
+
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+	u64 qgroupid;
+
+	/*
+	 * state
+	 */
+	u64 rfer;	/* referenced */
+	u64 rfer_cmpr;	/* referenced compressed */
+	u64 excl;	/* exclusive */
+	u64 excl_cmpr;	/* exclusive compressed */
+
+	/*
+	 * limits
+	 */
+	u64 lim_flags;	/* which limits are set */
+	u64 max_rfer;
+	u64 max_excl;
+	u64 rsv_rfer;
+	u64 rsv_excl;
+
+	/*
+	 * reservation tracking
+	 */
+	u64 reserved;
+
+	/*
+	 * lists
+	 */
+	struct list_head groups;  /* groups this group is member of */
+	struct list_head members; /* groups that are members of this group */
+	struct list_head dirty;   /* dirty groups */
+	struct rb_node node;	  /* tree of qgroups */
+
+	/*
+	 * temp variables for accounting operations
+	 */
+	u64 old_refcnt;
+	u64 new_refcnt;
+};
+
+/*
+ * glue structure to represent the relations between qgroups.
+ */
+struct btrfs_qgroup_list {
+	struct list_head next_group;
+	struct list_head next_member;
+	struct btrfs_qgroup *group;
+	struct btrfs_qgroup *member;
+};
+
+#define ptr_to_u64(x) ((u64)(uintptr_t)x)
+#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
+
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+		   int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+
+/* must be called with qgroup_ioctl_lock held */
+static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+					   u64 qgroupid)
+{
+	struct rb_node *n = fs_info->qgroup_tree.rb_node;
+	struct btrfs_qgroup *qgroup;
+
+	while (n) {
+		qgroup = rb_entry(n, struct btrfs_qgroup, node);
+		if (qgroup->qgroupid < qgroupid)
+			n = n->rb_left;
+		else if (qgroup->qgroupid > qgroupid)
+			n = n->rb_right;
+		else
+			return qgroup;
+	}
+	return NULL;
+}
+
+/* must be called with qgroup_lock held */
+static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+					  u64 qgroupid)
+{
+	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct btrfs_qgroup *qgroup;
+
+	while (*p) {
+		parent = *p;
+		qgroup = rb_entry(parent, struct btrfs_qgroup, node);
+
+		if (qgroup->qgroupid < qgroupid)
+			p = &(*p)->rb_left;
+		else if (qgroup->qgroupid > qgroupid)
+			p = &(*p)->rb_right;
+		else
+			return qgroup;
+	}
+
+	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
+	if (!qgroup)
+		return ERR_PTR(-ENOMEM);
+
+	qgroup->qgroupid = qgroupid;
+	INIT_LIST_HEAD(&qgroup->groups);
+	INIT_LIST_HEAD(&qgroup->members);
+	INIT_LIST_HEAD(&qgroup->dirty);
+
+	rb_link_node(&qgroup->node, parent, p);
+	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
+
+	return qgroup;
+}
+
+static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+{
+	struct btrfs_qgroup_list *list;
+
+	list_del(&qgroup->dirty);
+	while (!list_empty(&qgroup->groups)) {
+		list = list_first_entry(&qgroup->groups,
+					struct btrfs_qgroup_list, next_group);
+		list_del(&list->next_group);
+		list_del(&list->next_member);
+		kfree(list);
+	}
+
+	while (!list_empty(&qgroup->members)) {
+		list = list_first_entry(&qgroup->members,
+					struct btrfs_qgroup_list, next_member);
+		list_del(&list->next_group);
+		list_del(&list->next_member);
+		kfree(list);
+	}
+	kfree(qgroup);
+}
+
+/* must be called with qgroup_lock held */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
+
+	if (!qgroup)
+		return -ENOENT;
+
+	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
+	__del_qgroup_rb(qgroup);
+	return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+			   u64 memberid, u64 parentid)
+{
+	struct btrfs_qgroup *member;
+	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup_list *list;
+
+	member = find_qgroup_rb(fs_info, memberid);
+	parent = find_qgroup_rb(fs_info, parentid);
+	if (!member || !parent)
+		return -ENOENT;
+
+	list = kzalloc(sizeof(*list), GFP_ATOMIC);
+	if (!list)
+		return -ENOMEM;
+
+	list->group = parent;
+	list->member = member;
+	list_add_tail(&list->next_group, &member->groups);
+	list_add_tail(&list->next_member, &parent->members);
+
+	return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int del_relation_rb(struct btrfs_fs_info *fs_info,
+			   u64 memberid, u64 parentid)
+{
+	struct btrfs_qgroup *member;
+	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup_list *list;
+
+	member = find_qgroup_rb(fs_info, memberid);
+	parent = find_qgroup_rb(fs_info, parentid);
+	if (!member || !parent)
+		return -ENOENT;
+
+	list_for_each_entry(list, &member->groups, next_group) {
+		if (list->group == parent) {
+			list_del(&list->next_group);
+			list_del(&list->next_member);
+			kfree(list);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+			       u64 rfer, u64 excl)
+{
+	struct btrfs_qgroup *qgroup;
+
+	qgroup = find_qgroup_rb(fs_info, qgroupid);
+	if (!qgroup)
+		return -EINVAL;
+	if (qgroup->rfer != rfer || qgroup->excl != excl)
+		return -EINVAL;
+	return 0;
+}
+#endif
+
+/*
+ * The full config is read in one go, only called from open_ctree()
+ * It doesn't use any locking, as at this point we're still single-threaded
+ */
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_root *quota_root = fs_info->quota_root;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *l;
+	int slot;
+	int ret = 0;
+	u64 flags = 0;
+	u64 rescan_progress = 0;
+
+	if (!fs_info->quota_enabled)
+		return 0;
+
+	fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+	if (!fs_info->qgroup_ulist) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* default this to quota off, in case no status key is found */
+	fs_info->qgroup_flags = 0;
+
+	/*
+	 * pass 1: read status, all qgroup infos and limits
+	 */
+	key.objectid = 0;
+	key.type = 0;
+	key.offset = 0;
+	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
+	if (ret)
+		goto out;
+
+	while (1) {
+		struct btrfs_qgroup *qgroup;
+
+		slot = path->slots[0];
+		l = path->nodes[0];
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+
+		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
+			struct btrfs_qgroup_status_item *ptr;
+
+			ptr = btrfs_item_ptr(l, slot,
+					     struct btrfs_qgroup_status_item);
+
+			if (btrfs_qgroup_status_version(l, ptr) !=
+			    BTRFS_QGROUP_STATUS_VERSION) {
+				btrfs_err(fs_info,
+				 "old qgroup version, quota disabled");
+				goto out;
+			}
+			if (btrfs_qgroup_status_generation(l, ptr) !=
+			    fs_info->generation) {
+				flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+				btrfs_err(fs_info,
+					"qgroup generation mismatch, "
+					"marked as inconsistent");
+			}
+			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
+									  ptr);
+			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
+			goto next1;
+		}
+
+		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
+		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
+			goto next1;
+
+		qgroup = find_qgroup_rb(fs_info, found_key.offset);
+		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
+		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
+			btrfs_err(fs_info, "inconsitent qgroup config");
+			flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		}
+		if (!qgroup) {
+			qgroup = add_qgroup_rb(fs_info, found_key.offset);
+			if (IS_ERR(qgroup)) {
+				ret = PTR_ERR(qgroup);
+				goto out;
+			}
+		}
+		switch (found_key.type) {
+		case BTRFS_QGROUP_INFO_KEY: {
+			struct btrfs_qgroup_info_item *ptr;
+
+			ptr = btrfs_item_ptr(l, slot,
+					     struct btrfs_qgroup_info_item);
+			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
+			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
+			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
+			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
+			/* generation currently unused */
+			break;
+		}
+		case BTRFS_QGROUP_LIMIT_KEY: {
+			struct btrfs_qgroup_limit_item *ptr;
+
+			ptr = btrfs_item_ptr(l, slot,
+					     struct btrfs_qgroup_limit_item);
+			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
+			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
+			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
+			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
+			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
+			break;
+		}
+		}
+next1:
+		ret = btrfs_next_item(quota_root, path);
+		if (ret < 0)
+			goto out;
+		if (ret)
+			break;
+	}
+	btrfs_release_path(path);
+
+	/*
+	 * pass 2: read all qgroup relations
+	 */
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_RELATION_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
+	if (ret)
+		goto out;
+	while (1) {
+		slot = path->slots[0];
+		l = path->nodes[0];
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+
+		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
+			goto next2;
+
+		if (found_key.objectid > found_key.offset) {
+			/* parent <- member, not needed to build config */
+			/* FIXME should we omit the key completely? */
+			goto next2;
+		}
+
+		ret = add_relation_rb(fs_info, found_key.objectid,
+				      found_key.offset);
+		if (ret == -ENOENT) {
+			btrfs_warn(fs_info,
+				"orphan qgroup relation 0x%llx->0x%llx",
+				found_key.objectid, found_key.offset);
+			ret = 0;	/* ignore the error */
+		}
+		if (ret)
+			goto out;
+next2:
+		ret = btrfs_next_item(quota_root, path);
+		if (ret < 0)
+			goto out;
+		if (ret)
+			break;
+	}
+out:
+	fs_info->qgroup_flags |= flags;
+	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
+		fs_info->quota_enabled = 0;
+		fs_info->pending_quota_state = 0;
+	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+		   ret >= 0) {
+		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+	}
+	btrfs_free_path(path);
+
+	if (ret < 0) {
+		ulist_free(fs_info->qgroup_ulist);
+		fs_info->qgroup_ulist = NULL;
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	}
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
+ * first two are in single-threaded paths.And for the third one, we have set
+ * quota_root to be null with qgroup_lock held before, so it is safe to clean
+ * up the in-memory structures without qgroup_lock held.
+ */
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+	struct rb_node *n;
+	struct btrfs_qgroup *qgroup;
+
+	while ((n = rb_first(&fs_info->qgroup_tree))) {
+		qgroup = rb_entry(n, struct btrfs_qgroup, node);
+		rb_erase(n, &fs_info->qgroup_tree);
+		__del_qgroup_rb(qgroup);
+	}
+	/*
+	 * we call btrfs_free_qgroup_config() when umounting
+	 * filesystem and disabling quota, so we set qgroup_ulit
+	 * to be null here to avoid double free.
+	 */
+	ulist_free(fs_info->qgroup_ulist);
+	fs_info->qgroup_ulist = NULL;
+}
+
+static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *quota_root,
+				    u64 src, u64 dst)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = src;
+	key.type = BTRFS_QGROUP_RELATION_KEY;
+	key.offset = dst;
+
+	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
+
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *quota_root,
+				    u64 src, u64 dst)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = src;
+	key.type = BTRFS_QGROUP_RELATION_KEY;
+	key.offset = dst;
+
+	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, quota_root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int add_qgroup_item(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *quota_root, u64 qgroupid)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_qgroup_info_item *qgroup_info;
+	struct btrfs_qgroup_limit_item *qgroup_limit;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	if (btrfs_test_is_dummy_root(quota_root))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_INFO_KEY;
+	key.offset = qgroupid;
+
+	/*
+	 * Avoid a transaction abort by catching -EEXIST here. In that
+	 * case, we proceed by re-initializing the existing structure
+	 * on disk.
+	 */
+
+	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+				      sizeof(*qgroup_info));
+	if (ret && ret != -EEXIST)
+		goto out;
+
+	leaf = path->nodes[0];
+	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
+				 struct btrfs_qgroup_info_item);
+	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
+	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
+	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
+	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
+	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_release_path(path);
+
+	key.type = BTRFS_QGROUP_LIMIT_KEY;
+	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+				      sizeof(*qgroup_limit));
+	if (ret && ret != -EEXIST)
+		goto out;
+
+	leaf = path->nodes[0];
+	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
+				  struct btrfs_qgroup_limit_item);
+	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
+	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
+	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
+	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
+	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int del_qgroup_item(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *quota_root, u64 qgroupid)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_INFO_KEY;
+	key.offset = qgroupid;
+	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, quota_root, path);
+	if (ret)
+		goto out;
+
+	btrfs_release_path(path);
+
+	key.type = BTRFS_QGROUP_LIMIT_KEY;
+	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, quota_root, path);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_qgroup *qgroup)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *l;
+	struct btrfs_qgroup_limit_item *qgroup_limit;
+	int ret;
+	int slot;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_LIMIT_KEY;
+	key.offset = qgroup->qgroupid;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+
+	if (ret)
+		goto out;
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
+	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
+	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
+	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
+	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
+	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
+
+	btrfs_mark_buffer_dirty(l);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_qgroup *qgroup)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *l;
+	struct btrfs_qgroup_info_item *qgroup_info;
+	int ret;
+	int slot;
+
+	if (btrfs_test_is_dummy_root(root))
+		return 0;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_INFO_KEY;
+	key.offset = qgroup->qgroupid;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+
+	if (ret)
+		goto out;
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
+	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
+	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
+	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
+	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
+	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
+
+	btrfs_mark_buffer_dirty(l);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				    struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *l;
+	struct btrfs_qgroup_status_item *ptr;
+	int ret;
+	int slot;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_STATUS_KEY;
+	key.offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+
+	if (ret)
+		goto out;
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
+	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
+	btrfs_set_qgroup_status_rescan(l, ptr,
+				fs_info->qgroup_rescan_progress.objectid);
+
+	btrfs_mark_buffer_dirty(l);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * called with qgroup_lock held
+ */
+static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf = NULL;
+	int ret;
+	int nr = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+
+	key.objectid = 0;
+	key.offset = 0;
+	key.type = 0;
+
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			goto out;
+		leaf = path->nodes[0];
+		nr = btrfs_header_nritems(leaf);
+		if (!nr)
+			break;
+		/*
+		 * delete the leaf one by one
+		 * since the whole tree is going
+		 * to be deleted.
+		 */
+		path->slots[0] = 0;
+		ret = btrfs_del_items(trans, root, path, 0, nr);
+		if (ret)
+			goto out;
+
+		btrfs_release_path(path);
+	}
+	ret = 0;
+out:
+	root->fs_info->pending_quota_state = 0;
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_path *path = NULL;
+	struct btrfs_qgroup_status_item *ptr;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_qgroup *qgroup = NULL;
+	int ret = 0;
+	int slot;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	if (fs_info->quota_root) {
+		fs_info->pending_quota_state = 1;
+		goto out;
+	}
+
+	fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+	if (!fs_info->qgroup_ulist) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * initially create the quota tree
+	 */
+	quota_root = btrfs_create_tree(trans, fs_info,
+				       BTRFS_QUOTA_TREE_OBJECTID);
+	if (IS_ERR(quota_root)) {
+		ret =  PTR_ERR(quota_root);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out_free_root;
+	}
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_STATUS_KEY;
+	key.offset = 0;
+
+	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+				      sizeof(*ptr));
+	if (ret)
+		goto out_free_path;
+
+	leaf = path->nodes[0];
+	ptr = btrfs_item_ptr(leaf, path->slots[0],
+				 struct btrfs_qgroup_status_item);
+	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
+	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
+	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
+				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+
+	key.objectid = 0;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = 0;
+
+	btrfs_release_path(path);
+	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
+	if (ret > 0)
+		goto out_add_root;
+	if (ret < 0)
+		goto out_free_path;
+
+
+	while (1) {
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.type == BTRFS_ROOT_REF_KEY) {
+			ret = add_qgroup_item(trans, quota_root,
+					      found_key.offset);
+			if (ret)
+				goto out_free_path;
+
+			qgroup = add_qgroup_rb(fs_info, found_key.offset);
+			if (IS_ERR(qgroup)) {
+				ret = PTR_ERR(qgroup);
+				goto out_free_path;
+			}
+		}
+		ret = btrfs_next_item(tree_root, path);
+		if (ret < 0)
+			goto out_free_path;
+		if (ret)
+			break;
+	}
+
+out_add_root:
+	btrfs_release_path(path);
+	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
+	if (ret)
+		goto out_free_path;
+
+	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
+	if (IS_ERR(qgroup)) {
+		ret = PTR_ERR(qgroup);
+		goto out_free_path;
+	}
+	spin_lock(&fs_info->qgroup_lock);
+	fs_info->quota_root = quota_root;
+	fs_info->pending_quota_state = 1;
+	spin_unlock(&fs_info->qgroup_lock);
+out_free_path:
+	btrfs_free_path(path);
+out_free_root:
+	if (ret) {
+		free_extent_buffer(quota_root->node);
+		free_extent_buffer(quota_root->commit_root);
+		kfree(quota_root);
+	}
+out:
+	if (ret) {
+		ulist_free(fs_info->qgroup_ulist);
+		fs_info->qgroup_ulist = NULL;
+	}
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	return ret;
+}
+
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *quota_root;
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	if (!fs_info->quota_root)
+		goto out;
+	spin_lock(&fs_info->qgroup_lock);
+	fs_info->quota_enabled = 0;
+	fs_info->pending_quota_state = 0;
+	quota_root = fs_info->quota_root;
+	fs_info->quota_root = NULL;
+	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+	spin_unlock(&fs_info->qgroup_lock);
+
+	btrfs_free_qgroup_config(fs_info);
+
+	ret = btrfs_clean_quota_tree(trans, quota_root);
+	if (ret)
+		goto out;
+
+	ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
+	if (ret)
+		goto out;
+
+	list_del(&quota_root->dirty_list);
+
+	btrfs_tree_lock(quota_root->node);
+	clean_tree_block(trans, tree_root->fs_info, quota_root->node);
+	btrfs_tree_unlock(quota_root->node);
+	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
+
+	free_extent_buffer(quota_root->node);
+	free_extent_buffer(quota_root->commit_root);
+	kfree(quota_root);
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	return ret;
+}
+
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+			 struct btrfs_qgroup *qgroup)
+{
+	if (list_empty(&qgroup->dirty))
+		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
+}
+
+/*
+ * The easy accounting, if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their refrence and
+ * exclusive counts adjusted.
+ *
+ * Caller should hold fs_info->qgroup_lock.
+ */
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+				    struct ulist *tmp, u64 ref_root,
+				    u64 num_bytes, int sign)
+{
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup_list *glist;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	int ret = 0;
+
+	qgroup = find_qgroup_rb(fs_info, ref_root);
+	if (!qgroup)
+		goto out;
+
+	qgroup->rfer += sign * num_bytes;
+	qgroup->rfer_cmpr += sign * num_bytes;
+
+	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+	qgroup->excl += sign * num_bytes;
+	qgroup->excl_cmpr += sign * num_bytes;
+	if (sign > 0)
+		qgroup->reserved -= num_bytes;
+
+	qgroup_dirty(fs_info, qgroup);
+
+	/* Get all of the parent groups that contain this qgroup */
+	list_for_each_entry(glist, &qgroup->groups, next_group) {
+		ret = ulist_add(tmp, glist->group->qgroupid,
+				ptr_to_u64(glist->group), GFP_ATOMIC);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* Iterate all of the parents and adjust their reference counts */
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(tmp, &uiter))) {
+		qgroup = u64_to_ptr(unode->aux);
+		qgroup->rfer += sign * num_bytes;
+		qgroup->rfer_cmpr += sign * num_bytes;
+		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+		qgroup->excl += sign * num_bytes;
+		if (sign > 0)
+			qgroup->reserved -= num_bytes;
+		qgroup->excl_cmpr += sign * num_bytes;
+		qgroup_dirty(fs_info, qgroup);
+
+		/* Add any parents of the parents */
+		list_for_each_entry(glist, &qgroup->groups, next_group) {
+			ret = ulist_add(tmp, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (ret < 0)
+				goto out;
+		}
+	}
+	ret = 0;
+out:
+	return ret;
+}
+
+
+/*
+ * Quick path for updating qgroup with only excl refs.
+ *
+ * In that case, just update all parent will be enough.
+ * Or we needs to do a full rescan.
+ * Caller should also hold fs_info->qgroup_lock.
+ *
+ * Return 0 for quick update, return >0 for need to full rescan
+ * and mark INCONSISTENT flag.
+ * Return < 0 for other error.
+ */
+static int quick_update_accounting(struct btrfs_fs_info *fs_info,
+				   struct ulist *tmp, u64 src, u64 dst,
+				   int sign)
+{
+	struct btrfs_qgroup *qgroup;
+	int ret = 1;
+	int err = 0;
+
+	qgroup = find_qgroup_rb(fs_info, src);
+	if (!qgroup)
+		goto out;
+	if (qgroup->excl == qgroup->rfer) {
+		ret = 0;
+		err = __qgroup_excl_accounting(fs_info, tmp, dst,
+					       qgroup->excl, sign);
+		if (err < 0) {
+			ret = err;
+			goto out;
+		}
+	}
+out:
+	if (ret)
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	return ret;
+}
+
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup *member;
+	struct btrfs_qgroup_list *list;
+	struct ulist *tmp;
+	int ret = 0;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
+	/* Check the level of src and dst first */
+	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+		return -EINVAL;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+	member = find_qgroup_rb(fs_info, src);
+	parent = find_qgroup_rb(fs_info, dst);
+	if (!member || !parent) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* check if such qgroup relation exist firstly */
+	list_for_each_entry(list, &member->groups, next_group) {
+		if (list->group == parent) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+
+	ret = add_qgroup_relation_item(trans, quota_root, src, dst);
+	if (ret)
+		goto out;
+
+	ret = add_qgroup_relation_item(trans, quota_root, dst, src);
+	if (ret) {
+		del_qgroup_relation_item(trans, quota_root, src, dst);
+		goto out;
+	}
+
+	spin_lock(&fs_info->qgroup_lock);
+	ret = add_relation_rb(quota_root->fs_info, src, dst);
+	if (ret < 0) {
+		spin_unlock(&fs_info->qgroup_lock);
+		goto out;
+	}
+	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+	spin_unlock(&fs_info->qgroup_lock);
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	ulist_free(tmp);
+	return ret;
+}
+
+int __del_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup *member;
+	struct btrfs_qgroup_list *list;
+	struct ulist *tmp;
+	int ret = 0;
+	int err;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
+	quota_root = fs_info->quota_root;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	member = find_qgroup_rb(fs_info, src);
+	parent = find_qgroup_rb(fs_info, dst);
+	if (!member || !parent) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* check if such qgroup relation exist firstly */
+	list_for_each_entry(list, &member->groups, next_group) {
+		if (list->group == parent)
+			goto exist;
+	}
+	ret = -ENOENT;
+	goto out;
+exist:
+	ret = del_qgroup_relation_item(trans, quota_root, src, dst);
+	err = del_qgroup_relation_item(trans, quota_root, dst, src);
+	if (err && !ret)
+		ret = err;
+
+	spin_lock(&fs_info->qgroup_lock);
+	del_relation_rb(fs_info, src, dst);
+	ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+	spin_unlock(&fs_info->qgroup_lock);
+out:
+	ulist_free(tmp);
+	return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	ret = __del_qgroup_relation(trans, fs_info, src, dst);
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+	return ret;
+}
+
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+	qgroup = find_qgroup_rb(fs_info, qgroupid);
+	if (qgroup) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	ret = add_qgroup_item(trans, quota_root, qgroupid);
+	if (ret)
+		goto out;
+
+	spin_lock(&fs_info->qgroup_lock);
+	qgroup = add_qgroup_rb(fs_info, qgroupid);
+	spin_unlock(&fs_info->qgroup_lock);
+
+	if (IS_ERR(qgroup))
+		ret = PTR_ERR(qgroup);
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	return ret;
+}
+
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup_list *list;
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	qgroup = find_qgroup_rb(fs_info, qgroupid);
+	if (!qgroup) {
+		ret = -ENOENT;
+		goto out;
+	} else {
+		/* check if there are no children of this qgroup */
+		if (!list_empty(&qgroup->members)) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+	ret = del_qgroup_item(trans, quota_root, qgroupid);
+
+	while (!list_empty(&qgroup->groups)) {
+		list = list_first_entry(&qgroup->groups,
+					struct btrfs_qgroup_list, next_group);
+		ret = __del_qgroup_relation(trans, fs_info,
+					   qgroupid,
+					   list->group->qgroupid);
+		if (ret)
+			goto out;
+	}
+
+	spin_lock(&fs_info->qgroup_lock);
+	del_qgroup_rb(quota_root->fs_info, qgroupid);
+	spin_unlock(&fs_info->qgroup_lock);
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	return ret;
+}
+
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 qgroupid,
+		       struct btrfs_qgroup_limit *limit)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	qgroup = find_qgroup_rb(fs_info, qgroupid);
+	if (!qgroup) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	spin_lock(&fs_info->qgroup_lock);
+	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
+		qgroup->max_rfer = limit->max_rfer;
+	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
+		qgroup->max_excl = limit->max_excl;
+	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
+		qgroup->rsv_rfer = limit->rsv_rfer;
+	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
+		qgroup->rsv_excl = limit->rsv_excl;
+	qgroup->lim_flags |= limit->flags;
+
+	spin_unlock(&fs_info->qgroup_lock);
+
+	ret = update_qgroup_limit_item(trans, quota_root, qgroup);
+	if (ret) {
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		btrfs_info(fs_info, "unable to update quota limit for %llu",
+		       qgroupid);
+	}
+
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	return ret;
+}
+
+static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
+			   struct btrfs_qgroup_operation *oper2)
+{
+	/*
+	 * Ignore seq and type here, we're looking for any operation
+	 * at all related to this extent on that root.
+	 */
+	if (oper1->bytenr < oper2->bytenr)
+		return -1;
+	if (oper1->bytenr > oper2->bytenr)
+		return 1;
+	if (oper1->ref_root < oper2->ref_root)
+		return -1;
+	if (oper1->ref_root > oper2->ref_root)
+		return 1;
+	return 0;
+}
+
+static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
+			      struct btrfs_qgroup_operation *oper)
+{
+	struct rb_node *n;
+	struct btrfs_qgroup_operation *cur;
+	int cmp;
+
+	spin_lock(&fs_info->qgroup_op_lock);
+	n = fs_info->qgroup_op_tree.rb_node;
+	while (n) {
+		cur = rb_entry(n, struct btrfs_qgroup_operation, n);
+		cmp = comp_oper_exist(cur, oper);
+		if (cmp < 0) {
+			n = n->rb_right;
+		} else if (cmp) {
+			n = n->rb_left;
+		} else {
+			spin_unlock(&fs_info->qgroup_op_lock);
+			return -EEXIST;
+		}
+	}
+	spin_unlock(&fs_info->qgroup_op_lock);
+	return 0;
+}
+
+static int comp_oper(struct btrfs_qgroup_operation *oper1,
+		     struct btrfs_qgroup_operation *oper2)
+{
+	if (oper1->bytenr < oper2->bytenr)
+		return -1;
+	if (oper1->bytenr > oper2->bytenr)
+		return 1;
+	if (oper1->ref_root < oper2->ref_root)
+		return -1;
+	if (oper1->ref_root > oper2->ref_root)
+		return 1;
+	if (oper1->seq < oper2->seq)
+		return -1;
+	if (oper1->seq > oper2->seq)
+		return 1;
+	if (oper1->type < oper2->type)
+		return -1;
+	if (oper1->type > oper2->type)
+		return 1;
+	return 0;
+}
+
+static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
+			      struct btrfs_qgroup_operation *oper)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct btrfs_qgroup_operation *cur;
+	int cmp;
+
+	spin_lock(&fs_info->qgroup_op_lock);
+	p = &fs_info->qgroup_op_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
+		cmp = comp_oper(cur, oper);
+		if (cmp < 0) {
+			p = &(*p)->rb_right;
+		} else if (cmp) {
+			p = &(*p)->rb_left;
+		} else {
+			spin_unlock(&fs_info->qgroup_op_lock);
+			return -EEXIST;
+		}
+	}
+	rb_link_node(&oper->n, parent, p);
+	rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
+	spin_unlock(&fs_info->qgroup_op_lock);
+	return 0;
+}
+
+/*
+ * Record a quota operation for processing later on.
+ * @trans: the transaction we are adding the delayed op to.
+ * @fs_info: the fs_info for this fs.
+ * @ref_root: the root of the reference we are acting on,
+ * @bytenr: the bytenr we are acting on.
+ * @num_bytes: the number of bytes in the reference.
+ * @type: the type of operation this is.
+ * @mod_seq: do we need to get a sequence number for looking up roots.
+ *
+ * We just add it to our trans qgroup_ref_list and carry on and process these
+ * operations in order at some later point.  If the reference root isn't a fs
+ * root then we don't bother with doing anything.
+ *
+ * MUST BE HOLDING THE REF LOCK.
+ */
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info, u64 ref_root,
+			    u64 bytenr, u64 num_bytes,
+			    enum btrfs_qgroup_operation_type type, int mod_seq)
+{
+	struct btrfs_qgroup_operation *oper;
+	int ret;
+
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		return 0;
+
+	oper = kmalloc(sizeof(*oper), GFP_NOFS);
+	if (!oper)
+		return -ENOMEM;
+
+	oper->ref_root = ref_root;
+	oper->bytenr = bytenr;
+	oper->num_bytes = num_bytes;
+	oper->type = type;
+	oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
+	INIT_LIST_HEAD(&oper->elem.list);
+	oper->elem.seq = 0;
+
+	trace_btrfs_qgroup_record_ref(oper);
+
+	if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
+		/*
+		 * If any operation for this bytenr/ref_root combo
+		 * exists, then we know it's not exclusively owned and
+		 * shouldn't be queued up.
+		 *
+		 * This also catches the case where we have a cloned
+		 * extent that gets queued up multiple times during
+		 * drop snapshot.
+		 */
+		if (qgroup_oper_exists(fs_info, oper)) {
+			kfree(oper);
+			return 0;
+		}
+	}
+
+	ret = insert_qgroup_oper(fs_info, oper);
+	if (ret) {
+		/* Shouldn't happen so have an assert for developers */
+		ASSERT(0);
+		kfree(oper);
+		return ret;
+	}
+	list_add_tail(&oper->list, &trans->qgroup_ref_list);
+
+	if (mod_seq)
+		btrfs_get_tree_mod_seq(fs_info, &oper->elem);
+
+	return 0;
+}
+
+static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+				  struct btrfs_qgroup_operation *oper)
+{
+	struct ulist *tmp;
+	int sign = 0;
+	int ret = 0;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
+	spin_lock(&fs_info->qgroup_lock);
+	if (!fs_info->quota_root)
+		goto out;
+
+	switch (oper->type) {
+	case BTRFS_QGROUP_OPER_ADD_EXCL:
+		sign = 1;
+		break;
+	case BTRFS_QGROUP_OPER_SUB_EXCL:
+		sign = -1;
+		break;
+	default:
+		ASSERT(0);
+	}
+	ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
+				       oper->num_bytes, sign);
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+	ulist_free(tmp);
+	return ret;
+}
+
+/*
+ * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
+ * properly.
+ */
+static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
+				  u64 root_to_skip, struct ulist *tmp,
+				  struct ulist *roots, struct ulist *qgroups,
+				  u64 seq, int *old_roots, int rescan)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct ulist_node *tmp_unode;
+	struct ulist_iterator tmp_uiter;
+	struct btrfs_qgroup *qg;
+	int ret;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(roots, &uiter))) {
+		/* We don't count our current root here */
+		if (unode->val == root_to_skip)
+			continue;
+		qg = find_qgroup_rb(fs_info, unode->val);
+		if (!qg)
+			continue;
+		/*
+		 * We could have a pending removal of this same ref so we may
+		 * not have actually found our ref root when doing
+		 * btrfs_find_all_roots, so we need to keep track of how many
+		 * old roots we find in case we removed ours and added a
+		 * different one at the same time.  I don't think this could
+		 * happen in practice but that sort of thinking leads to pain
+		 * and suffering and to the dark side.
+		 */
+		(*old_roots)++;
+
+		ulist_reinit(tmp);
+		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+				GFP_ATOMIC);
+		if (ret < 0)
+			return ret;
+		ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
+		if (ret < 0)
+			return ret;
+		ULIST_ITER_INIT(&tmp_uiter);
+		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+			struct btrfs_qgroup_list *glist;
+
+			qg = u64_to_ptr(tmp_unode->aux);
+			/*
+			 * We use this sequence number to keep from having to
+			 * run the whole list and 0 out the refcnt every time.
+			 * We basically use sequnce as the known 0 count and
+			 * then add 1 everytime we see a qgroup.  This is how we
+			 * get how many of the roots actually point up to the
+			 * upper level qgroups in order to determine exclusive
+			 * counts.
+			 *
+			 * For rescan we want to set old_refcnt to seq so our
+			 * exclusive calculations end up correct.
+			 */
+			if (rescan)
+				qg->old_refcnt = seq;
+			else if (qg->old_refcnt < seq)
+				qg->old_refcnt = seq + 1;
+			else
+				qg->old_refcnt++;
+
+			if (qg->new_refcnt < seq)
+				qg->new_refcnt = seq + 1;
+			else
+				qg->new_refcnt++;
+			list_for_each_entry(glist, &qg->groups, next_group) {
+				ret = ulist_add(qgroups, glist->group->qgroupid,
+						ptr_to_u64(glist->group),
+						GFP_ATOMIC);
+				if (ret < 0)
+					return ret;
+				ret = ulist_add(tmp, glist->group->qgroupid,
+						ptr_to_u64(glist->group),
+						GFP_ATOMIC);
+				if (ret < 0)
+					return ret;
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * We need to walk forward in our operation tree and account for any roots that
+ * were deleted after we made this operation.
+ */
+static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
+				       struct btrfs_qgroup_operation *oper,
+				       struct ulist *tmp,
+				       struct ulist *qgroups, u64 seq,
+				       int *old_roots)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct btrfs_qgroup *qg;
+	struct btrfs_qgroup_operation *tmp_oper;
+	struct rb_node *n;
+	int ret;
+
+	ulist_reinit(tmp);
+
+	/*
+	 * We only walk forward in the tree since we're only interested in
+	 * removals that happened _after_  our operation.
+	 */
+	spin_lock(&fs_info->qgroup_op_lock);
+	n = rb_next(&oper->n);
+	spin_unlock(&fs_info->qgroup_op_lock);
+	if (!n)
+		return 0;
+	tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+	while (tmp_oper->bytenr == oper->bytenr) {
+		/*
+		 * If it's not a removal we don't care, additions work out
+		 * properly with our refcnt tracking.
+		 */
+		if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
+		    tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
+			goto next;
+		qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
+		if (!qg)
+			goto next;
+		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+				GFP_ATOMIC);
+		if (ret) {
+			if (ret < 0)
+				return ret;
+			/*
+			 * We only want to increase old_roots if this qgroup is
+			 * not already in the list of qgroups.  If it is already
+			 * there then that means it must have been re-added or
+			 * the delete will be discarded because we had an
+			 * existing ref that we haven't looked up yet.  In this
+			 * case we don't want to increase old_roots.  So if ret
+			 * == 1 then we know that this is the first time we've
+			 * seen this qgroup and we can bump the old_roots.
+			 */
+			(*old_roots)++;
+			ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
+					GFP_ATOMIC);
+			if (ret < 0)
+				return ret;
+		}
+next:
+		spin_lock(&fs_info->qgroup_op_lock);
+		n = rb_next(&tmp_oper->n);
+		spin_unlock(&fs_info->qgroup_op_lock);
+		if (!n)
+			break;
+		tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+	}
+
+	/* Ok now process the qgroups we found */
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(tmp, &uiter))) {
+		struct btrfs_qgroup_list *glist;
+
+		qg = u64_to_ptr(unode->aux);
+		if (qg->old_refcnt < seq)
+			qg->old_refcnt = seq + 1;
+		else
+			qg->old_refcnt++;
+		if (qg->new_refcnt < seq)
+			qg->new_refcnt = seq + 1;
+		else
+			qg->new_refcnt++;
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ret = ulist_add(qgroups, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (ret < 0)
+				return ret;
+			ret = ulist_add(tmp, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/* Add refcnt for the newly added reference. */
+static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
+				  struct btrfs_qgroup_operation *oper,
+				  struct btrfs_qgroup *qgroup,
+				  struct ulist *tmp, struct ulist *qgroups,
+				  u64 seq)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct btrfs_qgroup *qg;
+	int ret;
+
+	ulist_reinit(tmp);
+	ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
+			GFP_ATOMIC);
+	if (ret < 0)
+		return ret;
+	ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
+			GFP_ATOMIC);
+	if (ret < 0)
+		return ret;
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(tmp, &uiter))) {
+		struct btrfs_qgroup_list *glist;
+
+		qg = u64_to_ptr(unode->aux);
+		if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+			if (qg->new_refcnt < seq)
+				qg->new_refcnt = seq + 1;
+			else
+				qg->new_refcnt++;
+		} else {
+			if (qg->old_refcnt < seq)
+				qg->old_refcnt = seq + 1;
+			else
+				qg->old_refcnt++;
+		}
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ret = ulist_add(tmp, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (ret < 0)
+				return ret;
+			ret = ulist_add(qgroups, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * This adjusts the counters for all referenced qgroups if need be.
+ */
+static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
+				  u64 root_to_skip, u64 num_bytes,
+				  struct ulist *qgroups, u64 seq,
+				  int old_roots, int new_roots, int rescan)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct btrfs_qgroup *qg;
+	u64 cur_new_count, cur_old_count;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(qgroups, &uiter))) {
+		bool dirty = false;
+
+		qg = u64_to_ptr(unode->aux);
+		/*
+		 * Wasn't referenced before but is now, add to the reference
+		 * counters.
+		 */
+		if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+			qg->rfer += num_bytes;
+			qg->rfer_cmpr += num_bytes;
+			dirty = true;
+		}
+
+		/*
+		 * Was referenced before but isn't now, subtract from the
+		 * reference counters.
+		 */
+		if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+			qg->rfer -= num_bytes;
+			qg->rfer_cmpr -= num_bytes;
+			dirty = true;
+		}
+
+		if (qg->old_refcnt < seq)
+			cur_old_count = 0;
+		else
+			cur_old_count = qg->old_refcnt - seq;
+		if (qg->new_refcnt < seq)
+			cur_new_count = 0;
+		else
+			cur_new_count = qg->new_refcnt - seq;
+
+		/*
+		 * If our refcount was the same as the roots previously but our
+		 * new count isn't the same as the number of roots now then we
+		 * went from having a exclusive reference on this range to not.
+		 */
+		if (old_roots && cur_old_count == old_roots &&
+		    (cur_new_count != new_roots || new_roots == 0)) {
+			WARN_ON(cur_new_count != new_roots && new_roots == 0);
+			qg->excl -= num_bytes;
+			qg->excl_cmpr -= num_bytes;
+			dirty = true;
+		}
+
+		/*
+		 * If we didn't reference all the roots before but now we do we
+		 * have an exclusive reference to this range.
+		 */
+		if ((!old_roots || (old_roots && cur_old_count != old_roots))
+		    && cur_new_count == new_roots) {
+			qg->excl += num_bytes;
+			qg->excl_cmpr += num_bytes;
+			dirty = true;
+		}
+
+		if (dirty)
+			qgroup_dirty(fs_info, qg);
+	}
+	return 0;
+}
+
+/*
+ * If we removed a data extent and there were other references for that bytenr
+ * then we need to lookup all referenced roots to make sure we still don't
+ * reference this bytenr.  If we do then we can just discard this operation.
+ */
+static int check_existing_refs(struct btrfs_trans_handle *trans,
+			       struct btrfs_fs_info *fs_info,
+			       struct btrfs_qgroup_operation *oper)
+{
+	struct ulist *roots = NULL;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	int ret = 0;
+
+	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+				   oper->elem.seq, &roots);
+	if (ret < 0)
+		return ret;
+	ret = 0;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(roots, &uiter))) {
+		if (unode->val == oper->ref_root) {
+			ret = 1;
+			break;
+		}
+	}
+	ulist_free(roots);
+	btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+
+	return ret;
+}
+
+/*
+ * If we share a reference across multiple roots then we may need to adjust
+ * various qgroups referenced and exclusive counters.  The basic premise is this
+ *
+ * 1) We have seq to represent a 0 count.  Instead of looping through all of the
+ * qgroups and resetting their refcount to 0 we just constantly bump this
+ * sequence number to act as the base reference count.  This means that if
+ * anybody is equal to or below this sequence they were never referenced.  We
+ * jack this sequence up by the number of roots we found each time in order to
+ * make sure we don't have any overlap.
+ *
+ * 2) We first search all the roots that reference the area _except_ the root
+ * we're acting on currently.  This makes up the old_refcnt of all the qgroups
+ * before.
+ *
+ * 3) We walk all of the qgroups referenced by the root we are currently acting
+ * on, and will either adjust old_refcnt in the case of a removal or the
+ * new_refcnt in the case of an addition.
+ *
+ * 4) Finally we walk all the qgroups that are referenced by this range
+ * including the root we are acting on currently.  We will adjust the counters
+ * based on the number of roots we had and will have after this operation.
+ *
+ * Take this example as an illustration
+ *
+ *			[qgroup 1/0]
+ *		     /         |          \
+ *		[qg 0/0]   [qg 0/1]	[qg 0/2]
+ *		   \          |            /
+ *		  [	   extent	    ]
+ *
+ * Say we are adding a reference that is covered by qg 0/0.  The first step
+ * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
+ * old_roots being 2.  Because it is adding new_roots will be 1.  We then go
+ * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
+ * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we
+ * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
+ * reference and thus must add the size to the referenced bytes.  Everything
+ * else is the same so nothing else changes.
+ */
+static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info,
+				    struct btrfs_qgroup_operation *oper)
+{
+	struct ulist *roots = NULL;
+	struct ulist *qgroups, *tmp;
+	struct btrfs_qgroup *qgroup;
+	struct seq_list elem = SEQ_LIST_INIT(elem);
+	u64 seq;
+	int old_roots = 0;
+	int new_roots = 0;
+	int ret = 0;
+
+	if (oper->elem.seq) {
+		ret = check_existing_refs(trans, fs_info, oper);
+		if (ret < 0)
+			return ret;
+		if (ret)
+			return 0;
+	}
+
+	qgroups = ulist_alloc(GFP_NOFS);
+	if (!qgroups)
+		return -ENOMEM;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp) {
+		ulist_free(qgroups);
+		return -ENOMEM;
+	}
+
+	btrfs_get_tree_mod_seq(fs_info, &elem);
+	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
+				   &roots);
+	btrfs_put_tree_mod_seq(fs_info, &elem);
+	if (ret < 0) {
+		ulist_free(qgroups);
+		ulist_free(tmp);
+		return ret;
+	}
+	spin_lock(&fs_info->qgroup_lock);
+	qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+	if (!qgroup)
+		goto out;
+	seq = fs_info->qgroup_seq;
+
+	/*
+	 * So roots is the list of all the roots currently pointing at the
+	 * bytenr, including the ref we are adding if we are adding, or not if
+	 * we are removing a ref.  So we pass in the ref_root to skip that root
+	 * in our calculations.  We set old_refnct and new_refcnt cause who the
+	 * hell knows what everything looked like before, and it doesn't matter
+	 * except...
+	 */
+	ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
+				     seq, &old_roots, 0);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * Now adjust the refcounts of the qgroups that care about this
+	 * reference, either the old_count in the case of removal or new_count
+	 * in the case of an addition.
+	 */
+	ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
+				     seq);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * ...in the case of removals.  If we had a removal before we got around
+	 * to processing this operation then we need to find that guy and count
+	 * his references as if they really existed so we don't end up screwing
+	 * up the exclusive counts.  Then whenever we go to process the delete
+	 * everything will be grand and we can account for whatever exclusive
+	 * changes need to be made there.  We also have to pass in old_roots so
+	 * we have an accurate count of the roots as it pertains to this
+	 * operations view of the world.
+	 */
+	ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
+					  &old_roots);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * We are adding our root, need to adjust up the number of roots,
+	 * otherwise old_roots is the number of roots we want.
+	 */
+	if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+		new_roots = old_roots + 1;
+	} else {
+		new_roots = old_roots;
+		old_roots++;
+	}
+	fs_info->qgroup_seq += old_roots + 1;
+
+
+	/*
+	 * And now the magic happens, bless Arne for having a pretty elegant
+	 * solution for this.
+	 */
+	qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
+			       qgroups, seq, old_roots, new_roots, 0);
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+	ulist_free(qgroups);
+	ulist_free(roots);
+	ulist_free(tmp);
+	return ret;
+}
+
+/*
+ * Process a reference to a shared subtree. This type of operation is
+ * queued during snapshot removal when we encounter extents which are
+ * shared between more than one root.
+ */
+static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     struct btrfs_qgroup_operation *oper)
+{
+	struct ulist *roots = NULL;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct btrfs_qgroup_list *glist;
+	struct ulist *parents;
+	int ret = 0;
+	int err;
+	struct btrfs_qgroup *qg;
+	u64 root_obj = 0;
+	struct seq_list elem = SEQ_LIST_INIT(elem);
+
+	parents = ulist_alloc(GFP_NOFS);
+	if (!parents)
+		return -ENOMEM;
+
+	btrfs_get_tree_mod_seq(fs_info, &elem);
+	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+				   elem.seq, &roots);
+	btrfs_put_tree_mod_seq(fs_info, &elem);
+	if (ret < 0)
+		goto out;
+
+	if (roots->nnodes != 1)
+		goto out;
+
+	ULIST_ITER_INIT(&uiter);
+	unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
+	/*
+	 * If we find our ref root then that means all refs
+	 * this extent has to the root have not yet been
+	 * deleted. In that case, we do nothing and let the
+	 * last ref for this bytenr drive our update.
+	 *
+	 * This can happen for example if an extent is
+	 * referenced multiple times in a snapshot (clone,
+	 * etc). If we are in the middle of snapshot removal,
+	 * queued updates for such an extent will find the
+	 * root if we have not yet finished removing the
+	 * snapshot.
+	 */
+	if (unode->val == oper->ref_root)
+		goto out;
+
+	root_obj = unode->val;
+	BUG_ON(!root_obj);
+
+	spin_lock(&fs_info->qgroup_lock);
+	qg = find_qgroup_rb(fs_info, root_obj);
+	if (!qg)
+		goto out_unlock;
+
+	qg->excl += oper->num_bytes;
+	qg->excl_cmpr += oper->num_bytes;
+	qgroup_dirty(fs_info, qg);
+
+	/*
+	 * Adjust counts for parent groups. First we find all
+	 * parents, then in the 2nd loop we do the adjustment
+	 * while adding parents of the parents to our ulist.
+	 */
+	list_for_each_entry(glist, &qg->groups, next_group) {
+		err = ulist_add(parents, glist->group->qgroupid,
+				ptr_to_u64(glist->group), GFP_ATOMIC);
+		if (err < 0) {
+			ret = err;
+			goto out_unlock;
+		}
+	}
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(parents, &uiter))) {
+		qg = u64_to_ptr(unode->aux);
+		qg->excl += oper->num_bytes;
+		qg->excl_cmpr += oper->num_bytes;
+		qgroup_dirty(fs_info, qg);
+
+		/* Add any parents of the parents */
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			err = ulist_add(parents, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (err < 0) {
+				ret = err;
+				goto out_unlock;
+			}
+		}
+	}
+
+out_unlock:
+	spin_unlock(&fs_info->qgroup_lock);
+
+out:
+	ulist_free(roots);
+	ulist_free(parents);
+	return ret;
+}
+
+/*
+ * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly to the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ */
+static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info,
+				struct btrfs_qgroup_operation *oper)
+{
+	int ret = 0;
+
+	if (!fs_info->quota_enabled)
+		return 0;
+
+	BUG_ON(!fs_info->quota_root);
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
+			mutex_unlock(&fs_info->qgroup_rescan_lock);
+			return 0;
+		}
+	}
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	ASSERT(is_fstree(oper->ref_root));
+
+	trace_btrfs_qgroup_account(oper);
+
+	switch (oper->type) {
+	case BTRFS_QGROUP_OPER_ADD_EXCL:
+	case BTRFS_QGROUP_OPER_SUB_EXCL:
+		ret = qgroup_excl_accounting(fs_info, oper);
+		break;
+	case BTRFS_QGROUP_OPER_ADD_SHARED:
+	case BTRFS_QGROUP_OPER_SUB_SHARED:
+		ret = qgroup_shared_accounting(trans, fs_info, oper);
+		break;
+	case BTRFS_QGROUP_OPER_SUB_SUBTREE:
+		ret = qgroup_subtree_accounting(trans, fs_info, oper);
+		break;
+	default:
+		ASSERT(0);
+	}
+	return ret;
+}
+
+/*
+ * Needs to be called everytime we run delayed refs, even if there is an error
+ * in order to cleanup outstanding operations.
+ */
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_qgroup_operation *oper;
+	int ret = 0;
+
+	while (!list_empty(&trans->qgroup_ref_list)) {
+		oper = list_first_entry(&trans->qgroup_ref_list,
+					struct btrfs_qgroup_operation, list);
+		list_del_init(&oper->list);
+		if (!ret || !trans->aborted)
+			ret = btrfs_qgroup_account(trans, fs_info, oper);
+		spin_lock(&fs_info->qgroup_op_lock);
+		rb_erase(&oper->n, &fs_info->qgroup_op_tree);
+		spin_unlock(&fs_info->qgroup_op_lock);
+		btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+		kfree(oper);
+	}
+	return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed qgroups to disk.
+ */
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *quota_root = fs_info->quota_root;
+	int ret = 0;
+	int start_rescan_worker = 0;
+
+	if (!quota_root)
+		goto out;
+
+	if (!fs_info->quota_enabled && fs_info->pending_quota_state)
+		start_rescan_worker = 1;
+
+	fs_info->quota_enabled = fs_info->pending_quota_state;
+
+	spin_lock(&fs_info->qgroup_lock);
+	while (!list_empty(&fs_info->dirty_qgroups)) {
+		struct btrfs_qgroup *qgroup;
+		qgroup = list_first_entry(&fs_info->dirty_qgroups,
+					  struct btrfs_qgroup, dirty);
+		list_del_init(&qgroup->dirty);
+		spin_unlock(&fs_info->qgroup_lock);
+		ret = update_qgroup_info_item(trans, quota_root, qgroup);
+		if (ret)
+			fs_info->qgroup_flags |=
+					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		ret = update_qgroup_limit_item(trans, quota_root, qgroup);
+		if (ret)
+			fs_info->qgroup_flags |=
+					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		spin_lock(&fs_info->qgroup_lock);
+	}
+	if (fs_info->quota_enabled)
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
+	else
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+	spin_unlock(&fs_info->qgroup_lock);
+
+	ret = update_qgroup_status_item(trans, fs_info, quota_root);
+	if (ret)
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+
+	if (!ret && start_rescan_worker) {
+		ret = qgroup_rescan_init(fs_info, 0, 1);
+		if (!ret) {
+			qgroup_rescan_zero_tracking(fs_info);
+			btrfs_queue_work(fs_info->qgroup_rescan_workers,
+					 &fs_info->qgroup_rescan_work);
+		}
+		ret = 0;
+	}
+
+out:
+
+	return ret;
+}
+
+/*
+ * copy the acounting information between qgroups. This is necessary when a
+ * snapshot or a subvolume is created
+ */
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+			 struct btrfs_qgroup_inherit *inherit)
+{
+	int ret = 0;
+	int i;
+	u64 *i_qgroups;
+	struct btrfs_root *quota_root = fs_info->quota_root;
+	struct btrfs_qgroup *srcgroup;
+	struct btrfs_qgroup *dstgroup;
+	u32 level_size = 0;
+	u64 nums;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	if (!fs_info->quota_enabled)
+		goto out;
+
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (inherit) {
+		i_qgroups = (u64 *)(inherit + 1);
+		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+		       2 * inherit->num_excl_copies;
+		for (i = 0; i < nums; ++i) {
+			srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
+			if (!srcgroup) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			++i_qgroups;
+		}
+	}
+
+	/*
+	 * create a tracking group for the subvol itself
+	 */
+	ret = add_qgroup_item(trans, quota_root, objectid);
+	if (ret)
+		goto out;
+
+	if (srcid) {
+		struct btrfs_root *srcroot;
+		struct btrfs_key srckey;
+
+		srckey.objectid = srcid;
+		srckey.type = BTRFS_ROOT_ITEM_KEY;
+		srckey.offset = (u64)-1;
+		srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
+		if (IS_ERR(srcroot)) {
+			ret = PTR_ERR(srcroot);
+			goto out;
+		}
+
+		rcu_read_lock();
+		level_size = srcroot->nodesize;
+		rcu_read_unlock();
+	}
+
+	/*
+	 * add qgroup to all inherited groups
+	 */
+	if (inherit) {
+		i_qgroups = (u64 *)(inherit + 1);
+		for (i = 0; i < inherit->num_qgroups; ++i) {
+			ret = add_qgroup_relation_item(trans, quota_root,
+						       objectid, *i_qgroups);
+			if (ret)
+				goto out;
+			ret = add_qgroup_relation_item(trans, quota_root,
+						       *i_qgroups, objectid);
+			if (ret)
+				goto out;
+			++i_qgroups;
+		}
+	}
+
+
+	spin_lock(&fs_info->qgroup_lock);
+
+	dstgroup = add_qgroup_rb(fs_info, objectid);
+	if (IS_ERR(dstgroup)) {
+		ret = PTR_ERR(dstgroup);
+		goto unlock;
+	}
+
+	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+		dstgroup->lim_flags = inherit->lim.flags;
+		dstgroup->max_rfer = inherit->lim.max_rfer;
+		dstgroup->max_excl = inherit->lim.max_excl;
+		dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
+		dstgroup->rsv_excl = inherit->lim.rsv_excl;
+
+		ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
+		if (ret) {
+			fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+			btrfs_info(fs_info, "unable to update quota limit for %llu",
+			       dstgroup->qgroupid);
+			goto unlock;
+		}
+	}
+
+	if (srcid) {
+		srcgroup = find_qgroup_rb(fs_info, srcid);
+		if (!srcgroup)
+			goto unlock;
+
+		/*
+		 * We call inherit after we clone the root in order to make sure
+		 * our counts don't go crazy, so at this point the only
+		 * difference between the two roots should be the root node.
+		 */
+		dstgroup->rfer = srcgroup->rfer;
+		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+		dstgroup->excl = level_size;
+		dstgroup->excl_cmpr = level_size;
+		srcgroup->excl = level_size;
+		srcgroup->excl_cmpr = level_size;
+
+		/* inherit the limit info */
+		dstgroup->lim_flags = srcgroup->lim_flags;
+		dstgroup->max_rfer = srcgroup->max_rfer;
+		dstgroup->max_excl = srcgroup->max_excl;
+		dstgroup->rsv_rfer = srcgroup->rsv_rfer;
+		dstgroup->rsv_excl = srcgroup->rsv_excl;
+
+		qgroup_dirty(fs_info, dstgroup);
+		qgroup_dirty(fs_info, srcgroup);
+	}
+
+	if (!inherit)
+		goto unlock;
+
+	i_qgroups = (u64 *)(inherit + 1);
+	for (i = 0; i < inherit->num_qgroups; ++i) {
+		ret = add_relation_rb(quota_root->fs_info, objectid,
+				      *i_qgroups);
+		if (ret)
+			goto unlock;
+		++i_qgroups;
+	}
+
+	for (i = 0; i <  inherit->num_ref_copies; ++i) {
+		struct btrfs_qgroup *src;
+		struct btrfs_qgroup *dst;
+
+		src = find_qgroup_rb(fs_info, i_qgroups[0]);
+		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+		if (!src || !dst) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		dst->rfer = src->rfer - level_size;
+		dst->rfer_cmpr = src->rfer_cmpr - level_size;
+		i_qgroups += 2;
+	}
+	for (i = 0; i <  inherit->num_excl_copies; ++i) {
+		struct btrfs_qgroup *src;
+		struct btrfs_qgroup *dst;
+
+		src = find_qgroup_rb(fs_info, i_qgroups[0]);
+		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+		if (!src || !dst) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		dst->excl = src->excl + level_size;
+		dst->excl_cmpr = src->excl_cmpr + level_size;
+		i_qgroups += 2;
+	}
+
+unlock:
+	spin_unlock(&fs_info->qgroup_lock);
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	return ret;
+}
+
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 ref_root = root->root_key.objectid;
+	int ret = 0;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+
+	if (!is_fstree(ref_root))
+		return 0;
+
+	if (num_bytes == 0)
+		return 0;
+
+	spin_lock(&fs_info->qgroup_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		goto out;
+
+	qgroup = find_qgroup_rb(fs_info, ref_root);
+	if (!qgroup)
+		goto out;
+
+	/*
+	 * in a first step, we check all affected qgroups if any limits would
+	 * be exceeded
+	 */
+	ulist_reinit(fs_info->qgroup_ulist);
+	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+			(uintptr_t)qgroup, GFP_ATOMIC);
+	if (ret < 0)
+		goto out;
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+		struct btrfs_qgroup *qg;
+		struct btrfs_qgroup_list *glist;
+
+		qg = u64_to_ptr(unode->aux);
+
+		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+		    qg->reserved + (s64)qg->rfer + num_bytes >
+		    qg->max_rfer) {
+			ret = -EDQUOT;
+			goto out;
+		}
+
+		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
+		    qg->reserved + (s64)qg->excl + num_bytes >
+		    qg->max_excl) {
+			ret = -EDQUOT;
+			goto out;
+		}
+
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ret = ulist_add(fs_info->qgroup_ulist,
+					glist->group->qgroupid,
+					(uintptr_t)glist->group, GFP_ATOMIC);
+			if (ret < 0)
+				goto out;
+		}
+	}
+	ret = 0;
+	/*
+	 * no limits exceeded, now record the reservation into all qgroups
+	 */
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+		struct btrfs_qgroup *qg;
+
+		qg = u64_to_ptr(unode->aux);
+
+		qg->reserved += num_bytes;
+	}
+
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+	return ret;
+}
+
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	u64 ref_root = root->root_key.objectid;
+	int ret = 0;
+
+	if (!is_fstree(ref_root))
+		return;
+
+	if (num_bytes == 0)
+		return;
+
+	spin_lock(&fs_info->qgroup_lock);
+
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		goto out;
+
+	qgroup = find_qgroup_rb(fs_info, ref_root);
+	if (!qgroup)
+		goto out;
+
+	ulist_reinit(fs_info->qgroup_ulist);
+	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+			(uintptr_t)qgroup, GFP_ATOMIC);
+	if (ret < 0)
+		goto out;
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+		struct btrfs_qgroup *qg;
+		struct btrfs_qgroup_list *glist;
+
+		qg = u64_to_ptr(unode->aux);
+
+		qg->reserved -= num_bytes;
+
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ret = ulist_add(fs_info->qgroup_ulist,
+					glist->group->qgroupid,
+					(uintptr_t)glist->group, GFP_ATOMIC);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+}
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
+{
+	if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
+		return;
+	btrfs_err(trans->root->fs_info,
+		"qgroups not uptodate in trans handle %p:  list is%s empty, "
+		"seq is %#x.%x",
+		trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
+		(u32)(trans->delayed_ref_elem.seq >> 32),
+		(u32)trans->delayed_ref_elem.seq);
+	BUG();
+}
+
+/*
+ * returns < 0 on error, 0 when more leafs are to be scanned.
+ * returns 1 when done.
+ */
+static int
+qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+		   struct btrfs_trans_handle *trans, struct ulist *qgroups,
+		   struct ulist *tmp, struct extent_buffer *scratch_leaf)
+{
+	struct btrfs_key found;
+	struct ulist *roots = NULL;
+	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+	u64 num_bytes;
+	u64 seq;
+	int new_roots;
+	int slot;
+	int ret;
+
+	path->leave_spinning = 1;
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	ret = btrfs_search_slot_for_read(fs_info->extent_root,
+					 &fs_info->qgroup_rescan_progress,
+					 path, 1, 0);
+
+	pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n",
+		 fs_info->qgroup_rescan_progress.objectid,
+		 fs_info->qgroup_rescan_progress.type,
+		 fs_info->qgroup_rescan_progress.offset, ret);
+
+	if (ret) {
+		/*
+		 * The rescan is about to end, we will not be scanning any
+		 * further blocks. We cannot unset the RESCAN flag here, because
+		 * we want to commit the transaction if everything went well.
+		 * To make the live accounting work in this phase, we set our
+		 * scan progress pointer such that every real extent objectid
+		 * will be smaller.
+		 */
+		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
+		btrfs_release_path(path);
+		mutex_unlock(&fs_info->qgroup_rescan_lock);
+		return ret;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &found,
+			      btrfs_header_nritems(path->nodes[0]) - 1);
+	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
+
+	btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+	memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+	slot = path->slots[0];
+	btrfs_release_path(path);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
+		btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
+		if (found.type != BTRFS_EXTENT_ITEM_KEY &&
+		    found.type != BTRFS_METADATA_ITEM_KEY)
+			continue;
+		if (found.type == BTRFS_METADATA_ITEM_KEY)
+			num_bytes = fs_info->extent_root->nodesize;
+		else
+			num_bytes = found.offset;
+
+		ulist_reinit(qgroups);
+		ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+					   &roots);
+		if (ret < 0)
+			goto out;
+		spin_lock(&fs_info->qgroup_lock);
+		seq = fs_info->qgroup_seq;
+		fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+		new_roots = 0;
+		ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
+					     seq, &new_roots, 1);
+		if (ret < 0) {
+			spin_unlock(&fs_info->qgroup_lock);
+			ulist_free(roots);
+			goto out;
+		}
+
+		ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
+					     seq, 0, new_roots, 1);
+		if (ret < 0) {
+			spin_unlock(&fs_info->qgroup_lock);
+			ulist_free(roots);
+			goto out;
+		}
+		spin_unlock(&fs_info->qgroup_lock);
+		ulist_free(roots);
+	}
+out:
+	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+
+	return ret;
+}
+
+static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
+{
+	struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+						     qgroup_rescan_work);
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans = NULL;
+	struct ulist *tmp = NULL, *qgroups = NULL;
+	struct extent_buffer *scratch_leaf = NULL;
+	int err = -ENOMEM;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto out;
+	qgroups = ulist_alloc(GFP_NOFS);
+	if (!qgroups)
+		goto out;
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		goto out;
+	scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
+	if (!scratch_leaf)
+		goto out;
+
+	err = 0;
+	while (!err) {
+		trans = btrfs_start_transaction(fs_info->fs_root, 0);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+			break;
+		}
+		if (!fs_info->quota_enabled) {
+			err = -EINTR;
+		} else {
+			err = qgroup_rescan_leaf(fs_info, path, trans,
+						 qgroups, tmp, scratch_leaf);
+		}
+		if (err > 0)
+			btrfs_commit_transaction(trans, fs_info->fs_root);
+		else
+			btrfs_end_transaction(trans, fs_info->fs_root);
+	}
+
+out:
+	kfree(scratch_leaf);
+	ulist_free(qgroups);
+	ulist_free(tmp);
+	btrfs_free_path(path);
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+
+	if (err > 0 &&
+	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	} else if (err < 0) {
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	}
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	/*
+	 * only update status, since the previous part has alreay updated the
+	 * qgroup info.
+	 */
+	trans = btrfs_start_transaction(fs_info->quota_root, 1);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		btrfs_err(fs_info,
+			  "fail to start transaction for status update: %d\n",
+			  err);
+		goto done;
+	}
+	ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
+	if (ret < 0) {
+		err = ret;
+		btrfs_err(fs_info, "fail to update qgroup status: %d\n", err);
+	}
+	btrfs_end_transaction(trans, fs_info->quota_root);
+
+	if (err >= 0) {
+		btrfs_info(fs_info, "qgroup scan completed%s",
+			err > 0 ? " (inconsistency flag cleared)" : "");
+	} else {
+		btrfs_err(fs_info, "qgroup scan failed with %d", err);
+	}
+
+done:
+	complete_all(&fs_info->qgroup_rescan_completion);
+}
+
+/*
+ * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
+ * memory required for the rescan context.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+		   int init_flags)
+{
+	int ret = 0;
+
+	if (!init_flags &&
+	    (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
+	     !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	spin_lock(&fs_info->qgroup_lock);
+
+	if (init_flags) {
+		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+			ret = -EINPROGRESS;
+		else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+			ret = -EINVAL;
+
+		if (ret) {
+			spin_unlock(&fs_info->qgroup_lock);
+			mutex_unlock(&fs_info->qgroup_rescan_lock);
+			goto err;
+		}
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	}
+
+	memset(&fs_info->qgroup_rescan_progress, 0,
+		sizeof(fs_info->qgroup_rescan_progress));
+	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+
+	spin_unlock(&fs_info->qgroup_lock);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	init_completion(&fs_info->qgroup_rescan_completion);
+
+	memset(&fs_info->qgroup_rescan_work, 0,
+	       sizeof(fs_info->qgroup_rescan_work));
+	btrfs_init_work(&fs_info->qgroup_rescan_work,
+			btrfs_qgroup_rescan_helper,
+			btrfs_qgroup_rescan_worker, NULL, NULL);
+
+	if (ret) {
+err:
+		btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+	struct rb_node *n;
+	struct btrfs_qgroup *qgroup;
+
+	spin_lock(&fs_info->qgroup_lock);
+	/* clear all current qgroup tracking information */
+	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
+		qgroup = rb_entry(n, struct btrfs_qgroup, node);
+		qgroup->rfer = 0;
+		qgroup->rfer_cmpr = 0;
+		qgroup->excl = 0;
+		qgroup->excl_cmpr = 0;
+	}
+	spin_unlock(&fs_info->qgroup_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+	int ret = 0;
+	struct btrfs_trans_handle *trans;
+
+	ret = qgroup_rescan_init(fs_info, 0, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * We have set the rescan_progress to 0, which means no more
+	 * delayed refs will be accounted by btrfs_qgroup_account_ref.
+	 * However, btrfs_qgroup_account_ref may be right after its call
+	 * to btrfs_find_all_roots, in which case it would still do the
+	 * accounting.
+	 * To solve this, we're committing the transaction, which will
+	 * ensure we run all delayed refs and only after that, we are
+	 * going to clear all tracking information for a clean start.
+	 */
+
+	trans = btrfs_join_transaction(fs_info->fs_root);
+	if (IS_ERR(trans)) {
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+		return PTR_ERR(trans);
+	}
+	ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+	if (ret) {
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+		return ret;
+	}
+
+	qgroup_rescan_zero_tracking(fs_info);
+
+	btrfs_queue_work(fs_info->qgroup_rescan_workers,
+			 &fs_info->qgroup_rescan_work);
+
+	return 0;
+}
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+	int running;
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	spin_lock(&fs_info->qgroup_lock);
+	running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	spin_unlock(&fs_info->qgroup_lock);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	if (running)
+		ret = wait_for_completion_interruptible(
+					&fs_info->qgroup_rescan_completion);
+
+	return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+		btrfs_queue_work(fs_info->qgroup_rescan_workers,
+				 &fs_info->qgroup_rescan_work);
+}
diff --git a/kernel/fs/btrfs/qgroup.h b/kernel/fs/btrfs/qgroup.h
new file mode 100644
index 000000000..c5242aa9a
--- /dev/null
+++ b/kernel/fs/btrfs/qgroup.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2014 Facebook.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_QGROUP__
+#define __BTRFS_QGROUP__
+
+/*
+ * A description of the operations, all of these operations only happen when we
+ * are adding the 1st reference for that subvolume in the case of adding space
+ * or on the last reference delete in the case of subtraction.  The only
+ * exception is the last one, which is added for confusion.
+ *
+ * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
+ * one pointing at the bytes we are adding.  This is called on the first
+ * allocation.
+ *
+ * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
+ * shared between subvols.  This is called on the creation of a ref that already
+ * has refs from a different subvolume, so basically reflink.
+ *
+ * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
+ * one referencing the range.
+ *
+ * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
+ * refs with other subvolumes.
+ */
+enum btrfs_qgroup_operation_type {
+	BTRFS_QGROUP_OPER_ADD_EXCL,
+	BTRFS_QGROUP_OPER_ADD_SHARED,
+	BTRFS_QGROUP_OPER_SUB_EXCL,
+	BTRFS_QGROUP_OPER_SUB_SHARED,
+	BTRFS_QGROUP_OPER_SUB_SUBTREE,
+};
+
+struct btrfs_qgroup_operation {
+	u64 ref_root;
+	u64 bytenr;
+	u64 num_bytes;
+	u64 seq;
+	enum btrfs_qgroup_operation_type type;
+	struct seq_list elem;
+	struct rb_node n;
+	struct list_head list;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 qgroupid,
+		       struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info, u64 ref_root,
+			    u64 bytenr, u64 num_bytes,
+			    enum btrfs_qgroup_operation_type type,
+			    int mod_seq);
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info);
+void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
+				   struct btrfs_fs_info *fs_info,
+				   struct btrfs_qgroup_operation *oper);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+			 struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+			       u64 rfer, u64 excl);
+#endif
+
+#endif /* __BTRFS_QGROUP__ */
diff --git a/kernel/fs/btrfs/raid56.c b/kernel/fs/btrfs/raid56.c
new file mode 100644
index 000000000..fa72068bd
--- /dev/null
+++ b/kernel/fs/btrfs/raid56.c
@@ -0,0 +1,2670 @@
+/*
+ * Copyright (C) 2012 Fusion-io  All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <linux/vmalloc.h>
+#include <asm/div64.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT	1
+
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT		2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT	3
+
+#define RBIO_CACHE_SIZE 1024
+
+enum btrfs_rbio_ops {
+	BTRFS_RBIO_WRITE	= 0,
+	BTRFS_RBIO_READ_REBUILD	= 1,
+	BTRFS_RBIO_PARITY_SCRUB	= 2,
+};
+
+struct btrfs_raid_bio {
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_bio *bbio;
+
+	/* while we're doing rmw on a stripe
+	 * we put it into a hash table so we can
+	 * lock the stripe and merge more rbios
+	 * into it.
+	 */
+	struct list_head hash_list;
+
+	/*
+	 * LRU list for the stripe cache
+	 */
+	struct list_head stripe_cache;
+
+	/*
+	 * for scheduling work in the helper threads
+	 */
+	struct btrfs_work work;
+
+	/*
+	 * bio list and bio_list_lock are used
+	 * to add more bios into the stripe
+	 * in hopes of avoiding the full rmw
+	 */
+	struct bio_list bio_list;
+	spinlock_t bio_list_lock;
+
+	/* also protected by the bio_list_lock, the
+	 * plug list is used by the plugging code
+	 * to collect partial bios while plugged.  The
+	 * stripe locking code also uses it to hand off
+	 * the stripe lock to the next pending IO
+	 */
+	struct list_head plug_list;
+
+	/*
+	 * flags that tell us if it is safe to
+	 * merge with this bio
+	 */
+	unsigned long flags;
+
+	/* size of each individual stripe on disk */
+	int stripe_len;
+
+	/* number of data stripes (no p/q) */
+	int nr_data;
+
+	int real_stripes;
+
+	int stripe_npages;
+	/*
+	 * set if we're doing a parity rebuild
+	 * for a read from higher up, which is handled
+	 * differently from a parity rebuild as part of
+	 * rmw
+	 */
+	enum btrfs_rbio_ops operation;
+
+	/* first bad stripe */
+	int faila;
+
+	/* second bad stripe (for raid6 use) */
+	int failb;
+
+	int scrubp;
+	/*
+	 * number of pages needed to represent the full
+	 * stripe
+	 */
+	int nr_pages;
+
+	/*
+	 * size of all the bios in the bio_list.  This
+	 * helps us decide if the rbio maps to a full
+	 * stripe or not
+	 */
+	int bio_list_bytes;
+
+	int generic_bio_cnt;
+
+	atomic_t refs;
+
+	atomic_t stripes_pending;
+
+	atomic_t error;
+	/*
+	 * these are two arrays of pointers.  We allocate the
+	 * rbio big enough to hold them both and setup their
+	 * locations when the rbio is allocated
+	 */
+
+	/* pointers to pages that we allocated for
+	 * reading/writing stripes directly from the disk (including P/Q)
+	 */
+	struct page **stripe_pages;
+
+	/*
+	 * pointers to the pages in the bio_list.  Stored
+	 * here for faster lookup
+	 */
+	struct page **bio_pages;
+
+	/*
+	 * bitmap to record which horizontal stripe has data
+	 */
+	unsigned long *dbitmap;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct btrfs_work *work);
+static void read_rebuild_work(struct btrfs_work *work);
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
+static void async_read_rebuild(struct btrfs_raid_bio *rbio);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+					 int need_check);
+static void async_scrub_parity(struct btrfs_raid_bio *rbio);
+
+/*
+ * the stripe hash table is used for locking, and to collect
+ * bios in hopes of making a full stripe
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash_table *x;
+	struct btrfs_stripe_hash *cur;
+	struct btrfs_stripe_hash *h;
+	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+	int i;
+	int table_size;
+
+	if (info->stripe_hash_table)
+		return 0;
+
+	/*
+	 * The table is large, starting with order 4 and can go as high as
+	 * order 7 in case lock debugging is turned on.
+	 *
+	 * Try harder to allocate and fallback to vmalloc to lower the chance
+	 * of a failing mount.
+	 */
+	table_size = sizeof(*table) + sizeof(*h) * num_entries;
+	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!table) {
+		table = vzalloc(table_size);
+		if (!table)
+			return -ENOMEM;
+	}
+
+	spin_lock_init(&table->cache_lock);
+	INIT_LIST_HEAD(&table->stripe_cache);
+
+	h = table->table;
+
+	for (i = 0; i < num_entries; i++) {
+		cur = h + i;
+		INIT_LIST_HEAD(&cur->hash_list);
+		spin_lock_init(&cur->lock);
+		init_waitqueue_head(&cur->wait);
+	}
+
+	x = cmpxchg(&info->stripe_hash_table, NULL, table);
+	if (x)
+		kvfree(x);
+	return 0;
+}
+
+/*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array.  We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	char *s;
+	char *d;
+	int ret;
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		return;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (!rbio->bio_pages[i])
+			continue;
+
+		s = kmap(rbio->bio_pages[i]);
+		d = kmap(rbio->stripe_pages[i]);
+
+		memcpy(d, s, PAGE_CACHE_SIZE);
+
+		kunmap(rbio->bio_pages[i]);
+		kunmap(rbio->stripe_pages[i]);
+		SetPageUptodate(rbio->stripe_pages[i]);
+	}
+	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+	u64 num = rbio->bbio->raid_map[0];
+
+	/*
+	 * we shift down quite a bit.  We're using byte
+	 * addressing, and most of the lower bits are zeros.
+	 * This tends to upset hash_64, and it consistently
+	 * returns just one or two different values.
+	 *
+	 * shifting off the lower bits fixes things.
+	 */
+	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
+/*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+	int i;
+	struct page *s;
+	struct page *d;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+		return;
+
+	for (i = 0; i < dest->nr_pages; i++) {
+		s = src->stripe_pages[i];
+		if (!s || !PageUptodate(s)) {
+			continue;
+		}
+
+		d = dest->stripe_pages[i];
+		if (d)
+			__free_page(d);
+
+		dest->stripe_pages[i] = s;
+		src->stripe_pages[i] = NULL;
+	}
+}
+
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination.  The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->rbio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+		       struct btrfs_raid_bio *victim)
+{
+	bio_list_merge(&dest->bio_list, &victim->bio_list);
+	dest->bio_list_bytes += victim->bio_list_bytes;
+	dest->generic_bio_cnt += victim->generic_bio_cnt;
+	bio_list_init(&victim->bio_list);
+}
+
+/*
+ * used to prune items that are in the cache.  The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash *h;
+	int freeit = 0;
+
+	/*
+	 * check the bit again under the hash table lock.
+	 */
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+	h = table->table + bucket;
+
+	/* hold the lock for the bucket because we may be
+	 * removing it from the hash table
+	 */
+	spin_lock(&h->lock);
+
+	/*
+	 * hold the lock for the bio list because we need
+	 * to make sure the bio list is empty
+	 */
+	spin_lock(&rbio->bio_list_lock);
+
+	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+		list_del_init(&rbio->stripe_cache);
+		table->cache_size -= 1;
+		freeit = 1;
+
+		/* if the bio list isn't empty, this rbio is
+		 * still involved in an IO.  We take it out
+		 * of the cache list, and drop the ref that
+		 * was held for the list.
+		 *
+		 * If the bio_list was empty, we also remove
+		 * the rbio from the hash_table, and drop
+		 * the corresponding ref
+		 */
+		if (bio_list_empty(&rbio->bio_list)) {
+			if (!list_empty(&rbio->hash_list)) {
+				list_del_init(&rbio->hash_list);
+				atomic_dec(&rbio->refs);
+				BUG_ON(!list_empty(&rbio->plug_list));
+			}
+		}
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock(&h->lock);
+
+	if (freeit)
+		__free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	__remove_rbio_from_cache(rbio);
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+	struct btrfs_raid_bio *rbio;
+
+	table = info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	while (!list_empty(&table->stripe_cache)) {
+		rbio = list_entry(table->stripe_cache.next,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+		__remove_rbio_from_cache(rbio);
+	}
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	if (!info->stripe_hash_table)
+		return;
+	btrfs_clear_rbio_cache(info);
+	kvfree(info->stripe_hash_table);
+	info->stripe_hash_table = NULL;
+}
+
+/*
+ * insert an rbio into the stripe cache.  It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	/* bump our ref if we were not in the list before */
+	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+		atomic_inc(&rbio->refs);
+
+	if (!list_empty(&rbio->stripe_cache)){
+		list_move(&rbio->stripe_cache, &table->stripe_cache);
+	} else {
+		list_add(&rbio->stripe_cache, &table->stripe_cache);
+		table->cache_size += 1;
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+
+	if (table->cache_size > RBIO_CACHE_SIZE) {
+		struct btrfs_raid_bio *found;
+
+		found = list_entry(table->stripe_cache.prev,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+
+		if (found != rbio)
+			__remove_rbio_from_cache(found);
+	}
+
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+	return;
+}
+
+/*
+ * helper function to run the xor_blocks api.  It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+	int src_off = 0;
+	int xor_src_cnt = 0;
+	void *dest = pages[src_cnt];
+
+	while(src_cnt > 0) {
+		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+		src_cnt -= xor_src_cnt;
+		src_off += xor_src_cnt;
+	}
+}
+
+/*
+ * returns true if the bio list inside this rbio
+ * covers an entire stripe (no rmw required).
+ * Must be called with the bio list lock held, or
+ * at a time when you know it is impossible to add
+ * new bios into the list
+ */
+static int __rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+	unsigned long size = rbio->bio_list_bytes;
+	int ret = 1;
+
+	if (size != rbio->nr_data * rbio->stripe_len)
+		ret = 0;
+
+	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+	return ret;
+}
+
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&rbio->bio_list_lock, flags);
+	ret = __rbio_is_full(rbio);
+	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+	return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+			  struct btrfs_raid_bio *cur)
+{
+	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+		return 0;
+
+	/*
+	 * we can't merge with cached rbios, since the
+	 * idea is that when we merge the destination
+	 * rbio is going to run our IO for us.  We can
+	 * steal from cached rbio's though, other functions
+	 * handle that.
+	 */
+	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+	    test_bit(RBIO_CACHE_BIT, &cur->flags))
+		return 0;
+
+	if (last->bbio->raid_map[0] !=
+	    cur->bbio->raid_map[0])
+		return 0;
+
+	/* we can't merge with different operations */
+	if (last->operation != cur->operation)
+		return 0;
+	/*
+	 * We've need read the full stripe from the drive.
+	 * check and repair the parity and write the new results.
+	 *
+	 * We're not allowed to add any new bios to the
+	 * bio list here, anyone else that wants to
+	 * change this stripe needs to do their own rmw.
+	 */
+	if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
+	    cur->operation == BTRFS_RBIO_PARITY_SCRUB)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * helper to index into the pstripe
+ */
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * helper to index into the qstripe, returns null
+ * if there is no qstripe
+ */
+static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+	if (rbio->nr_data + 1 == rbio->real_stripes)
+		return NULL;
+
+	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
+		PAGE_CACHE_SHIFT;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock.  rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet.  The rbio is given
+ * the lock and 0 is returned.  The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner.  The rbio is freed and the IO will
+ * start automatically along with the existing rbio.  1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list.  When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission.  If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *pending;
+	unsigned long flags;
+	DEFINE_WAIT(wait);
+	struct btrfs_raid_bio *freeit = NULL;
+	struct btrfs_raid_bio *cache_drop = NULL;
+	int ret = 0;
+	int walk = 0;
+
+	spin_lock_irqsave(&h->lock, flags);
+	list_for_each_entry(cur, &h->hash_list, hash_list) {
+		walk++;
+		if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
+			spin_lock(&cur->bio_list_lock);
+
+			/* can we steal this cached rbio's pages? */
+			if (bio_list_empty(&cur->bio_list) &&
+			    list_empty(&cur->plug_list) &&
+			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+				list_del_init(&cur->hash_list);
+				atomic_dec(&cur->refs);
+
+				steal_rbio(cur, rbio);
+				cache_drop = cur;
+				spin_unlock(&cur->bio_list_lock);
+
+				goto lockit;
+			}
+
+			/* can we merge into the lock owner? */
+			if (rbio_can_merge(cur, rbio)) {
+				merge_rbio(cur, rbio);
+				spin_unlock(&cur->bio_list_lock);
+				freeit = rbio;
+				ret = 1;
+				goto out;
+			}
+
+
+			/*
+			 * we couldn't merge with the running
+			 * rbio, see if we can merge with the
+			 * pending ones.  We don't have to
+			 * check for rmw_locked because there
+			 * is no way they are inside finish_rmw
+			 * right now
+			 */
+			list_for_each_entry(pending, &cur->plug_list,
+					    plug_list) {
+				if (rbio_can_merge(pending, rbio)) {
+					merge_rbio(pending, rbio);
+					spin_unlock(&cur->bio_list_lock);
+					freeit = rbio;
+					ret = 1;
+					goto out;
+				}
+			}
+
+			/* no merging, put us on the tail of the plug list,
+			 * our rbio will be started with the currently
+			 * running rbio unlocks
+			 */
+			list_add_tail(&rbio->plug_list, &cur->plug_list);
+			spin_unlock(&cur->bio_list_lock);
+			ret = 1;
+			goto out;
+		}
+	}
+lockit:
+	atomic_inc(&rbio->refs);
+	list_add(&rbio->hash_list, &h->hash_list);
+out:
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (cache_drop)
+		remove_rbio_from_cache(cache_drop);
+	if (freeit)
+		__free_raid_bio(freeit);
+	return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed.  If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bucket;
+	struct btrfs_stripe_hash *h;
+	unsigned long flags;
+	int keep_cache = 0;
+
+	bucket = rbio_bucket(rbio);
+	h = rbio->fs_info->stripe_hash_table->table + bucket;
+
+	if (list_empty(&rbio->plug_list))
+		cache_rbio(rbio);
+
+	spin_lock_irqsave(&h->lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	if (!list_empty(&rbio->hash_list)) {
+		/*
+		 * if we're still cached and there is no other IO
+		 * to perform, just leave this rbio here for others
+		 * to steal from later
+		 */
+		if (list_empty(&rbio->plug_list) &&
+		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+			keep_cache = 1;
+			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+			BUG_ON(!bio_list_empty(&rbio->bio_list));
+			goto done;
+		}
+
+		list_del_init(&rbio->hash_list);
+		atomic_dec(&rbio->refs);
+
+		/*
+		 * we use the plug list to hold all the rbios
+		 * waiting for the chance to lock this stripe.
+		 * hand the lock over to one of them.
+		 */
+		if (!list_empty(&rbio->plug_list)) {
+			struct btrfs_raid_bio *next;
+			struct list_head *head = rbio->plug_list.next;
+
+			next = list_entry(head, struct btrfs_raid_bio,
+					  plug_list);
+
+			list_del_init(&rbio->plug_list);
+
+			list_add(&next->hash_list, &h->hash_list);
+			atomic_inc(&next->refs);
+			spin_unlock(&rbio->bio_list_lock);
+			spin_unlock_irqrestore(&h->lock, flags);
+
+			if (next->operation == BTRFS_RBIO_READ_REBUILD)
+				async_read_rebuild(next);
+			else if (next->operation == BTRFS_RBIO_WRITE) {
+				steal_rbio(rbio, next);
+				async_rmw_stripe(next);
+			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
+				steal_rbio(rbio, next);
+				async_scrub_parity(next);
+			}
+
+			goto done_nolock;
+		} else  if (waitqueue_active(&h->wait)) {
+			spin_unlock(&rbio->bio_list_lock);
+			spin_unlock_irqrestore(&h->lock, flags);
+			wake_up(&h->wait);
+			goto done_nolock;
+		}
+	}
+done:
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+	if (!keep_cache)
+		remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+	int i;
+
+	WARN_ON(atomic_read(&rbio->refs) < 0);
+	if (!atomic_dec_and_test(&rbio->refs))
+		return;
+
+	WARN_ON(!list_empty(&rbio->stripe_cache));
+	WARN_ON(!list_empty(&rbio->hash_list));
+	WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i]) {
+			__free_page(rbio->stripe_pages[i]);
+			rbio->stripe_pages[i] = NULL;
+		}
+	}
+
+	btrfs_put_bbio(rbio->bbio);
+	kfree(rbio);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+	unlock_stripe(rbio);
+	__free_raid_bio(rbio);
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+{
+	struct bio *cur = bio_list_get(&rbio->bio_list);
+	struct bio *next;
+
+	if (rbio->generic_bio_cnt)
+		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+
+	free_raid_bio(rbio);
+
+	while (cur) {
+		next = cur->bi_next;
+		cur->bi_next = NULL;
+		if (uptodate)
+			set_bit(BIO_UPTODATE, &cur->bi_flags);
+		bio_endio(cur, err);
+		cur = next;
+	}
+}
+
+/*
+ * end io function used by finish_rmw.  When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
+		return;
+
+	err = 0;
+
+	/* OK, we have read all the stripes we need to. */
+	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+		err = -EIO;
+
+	rbio_orig_end_io(rbio, err, 0);
+	return;
+}
+
+/*
+ * the read/modify/write code wants to use the original bio for
+ * any pages it included, and then use the rbio for everything
+ * else.  This function decides if a given index (stripe number)
+ * and page number in that stripe fall inside the original bio
+ * or the rbio.
+ *
+ * if you set bio_list_only, you'll get a NULL back for any ranges
+ * that are outside the bio_list
+ *
+ * This doesn't take any refs on anything, you get a bare page pointer
+ * and the caller must bump refs as required.
+ *
+ * You must call index_rbio_pages once before you can trust
+ * the answers from this function.
+ */
+static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
+				 int index, int pagenr, int bio_list_only)
+{
+	int chunk_page;
+	struct page *p = NULL;
+
+	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+
+	spin_lock_irq(&rbio->bio_list_lock);
+	p = rbio->bio_pages[chunk_page];
+	spin_unlock_irq(&rbio->bio_list_lock);
+
+	if (p || bio_list_only)
+		return p;
+
+	return rbio->stripe_pages[chunk_page];
+}
+
+/*
+ * number of pages we need for the entire stripe across all the
+ * drives
+ */
+static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
+{
+	unsigned long nr = stripe_len * nr_stripes;
+	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio.  Not
+ * this does not allocate any pages for rbio->pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
+			  struct btrfs_bio *bbio, u64 stripe_len)
+{
+	struct btrfs_raid_bio *rbio;
+	int nr_data = 0;
+	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
+	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
+	void *p;
+
+	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
+		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
+			GFP_NOFS);
+	if (!rbio)
+		return ERR_PTR(-ENOMEM);
+
+	bio_list_init(&rbio->bio_list);
+	INIT_LIST_HEAD(&rbio->plug_list);
+	spin_lock_init(&rbio->bio_list_lock);
+	INIT_LIST_HEAD(&rbio->stripe_cache);
+	INIT_LIST_HEAD(&rbio->hash_list);
+	rbio->bbio = bbio;
+	rbio->fs_info = root->fs_info;
+	rbio->stripe_len = stripe_len;
+	rbio->nr_pages = num_pages;
+	rbio->real_stripes = real_stripes;
+	rbio->stripe_npages = stripe_npages;
+	rbio->faila = -1;
+	rbio->failb = -1;
+	atomic_set(&rbio->refs, 1);
+	atomic_set(&rbio->error, 0);
+	atomic_set(&rbio->stripes_pending, 0);
+
+	/*
+	 * the stripe_pages and bio_pages array point to the extra
+	 * memory we allocated past the end of the rbio
+	 */
+	p = rbio + 1;
+	rbio->stripe_pages = p;
+	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+	rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
+
+	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+		nr_data = real_stripes - 1;
+	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+		nr_data = real_stripes - 2;
+	else
+		BUG();
+
+	rbio->nr_data = nr_data;
+	return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i])
+			continue;
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[i] = page;
+		ClearPageUptodate(page);
+	}
+	return 0;
+}
+
+/* allocate pages for just the p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	struct page *page;
+
+	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+
+	for (; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i])
+			continue;
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[i] = page;
+	}
+	return 0;
+}
+
+/*
+ * add a single page from a specific stripe into our list of bios for IO
+ * this will try to merge into existing bios if possible, and returns
+ * zero if all went well.
+ */
+static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+			    struct bio_list *bio_list,
+			    struct page *page,
+			    int stripe_nr,
+			    unsigned long page_index,
+			    unsigned long bio_max_len)
+{
+	struct bio *last = bio_list->tail;
+	u64 last_end = 0;
+	int ret;
+	struct bio *bio;
+	struct btrfs_bio_stripe *stripe;
+	u64 disk_start;
+
+	stripe = &rbio->bbio->stripes[stripe_nr];
+	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+
+	/* if the device is missing, just fail this stripe */
+	if (!stripe->dev->bdev)
+		return fail_rbio_index(rbio, stripe_nr);
+
+	/* see if we can add this page onto our existing bio */
+	if (last) {
+		last_end = (u64)last->bi_iter.bi_sector << 9;
+		last_end += last->bi_iter.bi_size;
+
+		/*
+		 * we can't merge these if they are from different
+		 * devices or if they are not contiguous
+		 */
+		if (last_end == disk_start && stripe->dev->bdev &&
+		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
+		    last->bi_bdev == stripe->dev->bdev) {
+			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
+			if (ret == PAGE_CACHE_SIZE)
+				return 0;
+		}
+	}
+
+	/* put a new bio on the list */
+	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_iter.bi_size = 0;
+	bio->bi_bdev = stripe->dev->bdev;
+	bio->bi_iter.bi_sector = disk_start >> 9;
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	bio_list_add(bio_list, bio);
+	return 0;
+}
+
+/*
+ * while we're doing the read/modify/write cycle, we could
+ * have errors in reading pages off the disk.  This checks
+ * for errors and if we're not able to read the page it'll
+ * trigger parity reconstruction.  The rmw will be finished
+ * after we've reconstructed the failed stripes
+ */
+static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
+{
+	if (rbio->faila >= 0 || rbio->failb >= 0) {
+		BUG_ON(rbio->faila == rbio->real_stripes - 1);
+		__raid56_parity_recover(rbio);
+	} else {
+		finish_rmw(rbio);
+	}
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
+{
+	int index;
+	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
+	index += page;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * helper function to walk our bio list and populate the bio_pages array with
+ * the result.  This seems expensive, but it is faster than constantly
+ * searching through the bio list as we setup the IO in finish_rmw or stripe
+ * reconstruction.
+ *
+ * This must be called before you trust the answers from page_in_rbio
+ */
+static void index_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	struct bio *bio;
+	u64 start;
+	unsigned long stripe_offset;
+	unsigned long page_index;
+	struct page *p;
+	int i;
+
+	spin_lock_irq(&rbio->bio_list_lock);
+	bio_list_for_each(bio, &rbio->bio_list) {
+		start = (u64)bio->bi_iter.bi_sector << 9;
+		stripe_offset = start - rbio->bbio->raid_map[0];
+		page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			p = bio->bi_io_vec[i].bv_page;
+			rbio->bio_pages[page_index + i] = p;
+		}
+	}
+	spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+/*
+ * this is called from one of two situations.  We either
+ * have a full stripe from the higher layers, or we've read all
+ * the missing bits off disk.
+ *
+ * This will calculate the parity and then send down any
+ * changed blocks.
+ */
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_bio *bbio = rbio->bbio;
+	void *pointers[rbio->real_stripes];
+	int stripe_len = rbio->stripe_len;
+	int nr_data = rbio->nr_data;
+	int stripe;
+	int pagenr;
+	int p_stripe = -1;
+	int q_stripe = -1;
+	struct bio_list bio_list;
+	struct bio *bio;
+	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
+	int ret;
+
+	bio_list_init(&bio_list);
+
+	if (rbio->real_stripes - rbio->nr_data == 1) {
+		p_stripe = rbio->real_stripes - 1;
+	} else if (rbio->real_stripes - rbio->nr_data == 2) {
+		p_stripe = rbio->real_stripes - 2;
+		q_stripe = rbio->real_stripes - 1;
+	} else {
+		BUG();
+	}
+
+	/* at this point we either have a full stripe,
+	 * or we've read the full stripe from the drive.
+	 * recalculate the parity and write the new results.
+	 *
+	 * We're not allowed to add any new bios to the
+	 * bio list here, anyone else that wants to
+	 * change this stripe needs to do their own rmw.
+	 */
+	spin_lock_irq(&rbio->bio_list_lock);
+	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+	spin_unlock_irq(&rbio->bio_list_lock);
+
+	atomic_set(&rbio->error, 0);
+
+	/*
+	 * now that we've set rmw_locked, run through the
+	 * bio list one last time and map the page pointers
+	 *
+	 * We don't cache full rbios because we're assuming
+	 * the higher layers are unlikely to use this area of
+	 * the disk again soon.  If they do use it again,
+	 * hopefully they will send another full bio.
+	 */
+	index_rbio_pages(rbio);
+	if (!rbio_is_full(rbio))
+		cache_rbio_pages(rbio);
+	else
+		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+		struct page *p;
+		/* first collect one page from each data stripe */
+		for (stripe = 0; stripe < nr_data; stripe++) {
+			p = page_in_rbio(rbio, stripe, pagenr, 0);
+			pointers[stripe] = kmap(p);
+		}
+
+		/* then add the parity stripe */
+		p = rbio_pstripe_page(rbio, pagenr);
+		SetPageUptodate(p);
+		pointers[stripe++] = kmap(p);
+
+		if (q_stripe != -1) {
+
+			/*
+			 * raid6, add the qstripe and call the
+			 * library function to fill in our p/q
+			 */
+			p = rbio_qstripe_page(rbio, pagenr);
+			SetPageUptodate(p);
+			pointers[stripe++] = kmap(p);
+
+			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+						pointers);
+		} else {
+			/* raid5 */
+			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+		}
+
+
+		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+	}
+
+	/*
+	 * time to start writing.  Make bios for everything from the
+	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
+	 * everything else.
+	 */
+	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+			struct page *page;
+			if (stripe < rbio->nr_data) {
+				page = page_in_rbio(rbio, stripe, pagenr, 1);
+				if (!page)
+					continue;
+			} else {
+			       page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+
+			ret = rbio_add_io_page(rbio, &bio_list,
+				       page, stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	if (likely(!bbio->num_tgtdevs))
+		goto write_data;
+
+	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+		if (!bbio->tgtdev_map[stripe])
+			continue;
+
+		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+			struct page *page;
+			if (stripe < rbio->nr_data) {
+				page = page_in_rbio(rbio, stripe, pagenr, 1);
+				if (!page)
+					continue;
+			} else {
+			       page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+
+			ret = rbio_add_io_page(rbio, &bio_list, page,
+					       rbio->bbio->tgtdev_map[stripe],
+					       pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+write_data:
+	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
+
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_write_end_io;
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(WRITE, bio);
+	}
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * helper to find the stripe number for a given bio.  Used to figure out which
+ * stripe has failed.  This expects the bio to correspond to a physical disk,
+ * so it looks up based on physical sector numbers.
+ */
+static int find_bio_stripe(struct btrfs_raid_bio *rbio,
+			   struct bio *bio)
+{
+	u64 physical = bio->bi_iter.bi_sector;
+	u64 stripe_start;
+	int i;
+	struct btrfs_bio_stripe *stripe;
+
+	physical <<= 9;
+
+	for (i = 0; i < rbio->bbio->num_stripes; i++) {
+		stripe = &rbio->bbio->stripes[i];
+		stripe_start = stripe->physical;
+		if (physical >= stripe_start &&
+		    physical < stripe_start + rbio->stripe_len &&
+		    bio->bi_bdev == stripe->dev->bdev) {
+			return i;
+		}
+	}
+	return -1;
+}
+
+/*
+ * helper to find the stripe number for a given
+ * bio (before mapping).  Used to figure out which stripe has
+ * failed.  This looks up based on logical block numbers.
+ */
+static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
+				   struct bio *bio)
+{
+	u64 logical = bio->bi_iter.bi_sector;
+	u64 stripe_start;
+	int i;
+
+	logical <<= 9;
+
+	for (i = 0; i < rbio->nr_data; i++) {
+		stripe_start = rbio->bbio->raid_map[i];
+		if (logical >= stripe_start &&
+		    logical < stripe_start + rbio->stripe_len) {
+			return i;
+		}
+	}
+	return -1;
+}
+
+/*
+ * returns -EIO if we had too many failures
+ */
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&rbio->bio_list_lock, flags);
+
+	/* we already know this stripe is bad, move on */
+	if (rbio->faila == failed || rbio->failb == failed)
+		goto out;
+
+	if (rbio->faila == -1) {
+		/* first failure on this rbio */
+		rbio->faila = failed;
+		atomic_inc(&rbio->error);
+	} else if (rbio->failb == -1) {
+		/* second failure on this rbio */
+		rbio->failb = failed;
+		atomic_inc(&rbio->error);
+	} else {
+		ret = -EIO;
+	}
+out:
+	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+
+	return ret;
+}
+
+/*
+ * helper to fail a stripe based on a physical disk
+ * bio.
+ */
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
+			   struct bio *bio)
+{
+	int failed = find_bio_stripe(rbio, bio);
+
+	if (failed < 0)
+		return -EIO;
+
+	return fail_rbio_index(rbio, failed);
+}
+
+/*
+ * this sets each page in the bio uptodate.  It should only be used on private
+ * rbio pages, nothing that comes in from the higher layers
+ */
+static void set_bio_pages_uptodate(struct bio *bio)
+{
+	int i;
+	struct page *p;
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		p = bio->bi_io_vec[i].bv_page;
+		SetPageUptodate(p);
+	}
+}
+
+/*
+ * end io for the read phase of the rmw cycle.  All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid_rmw_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+	else
+		set_bio_pages_uptodate(bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
+		return;
+
+	err = 0;
+	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+		goto cleanup;
+
+	/*
+	 * this will normally call finish_rmw to start our write
+	 * but if there are any failed stripes we'll reconstruct
+	 * from parity first
+	 */
+	validate_rbio_for_rmw(rbio);
+	return;
+
+cleanup:
+
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+			rmw_work, NULL, NULL);
+
+	btrfs_queue_work(rbio->fs_info->rmw_workers,
+			 &rbio->work);
+}
+
+static void async_read_rebuild(struct btrfs_raid_bio *rbio)
+{
+	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+			read_rebuild_work, NULL, NULL);
+
+	btrfs_queue_work(rbio->fs_info->rmw_workers,
+			 &rbio->work);
+}
+
+/*
+ * the stripe must be locked by the caller.  It will
+ * unlock after all the writes are done
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct bio_list bio_list;
+	int ret;
+	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	index_rbio_pages(rbio);
+
+	atomic_set(&rbio->error, 0);
+	/*
+	 * build a list of bios to read all the missing parts of this
+	 * stripe
+	 */
+	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+			struct page *page;
+			/*
+			 * we want to find all the pages missing from
+			 * the rbio and read them from the disk.  If
+			 * page_in_rbio finds a page in the bio list
+			 * we don't need to read it off the stripe.
+			 */
+			page = page_in_rbio(rbio, stripe, pagenr, 1);
+			if (page)
+				continue;
+
+			page = rbio_stripe_page(rbio, stripe, pagenr);
+			/*
+			 * the bio cache may have handed us an uptodate
+			 * page.  If so, be happy and use it
+			 */
+			if (PageUptodate(page))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list, page,
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * this can happen if others have merged with
+		 * us, it means there is nothing left to read.
+		 * But if there are missing devices it may not be
+		 * safe to do the full stripe write yet.
+		 */
+		goto finish;
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&rbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_rmw_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+	/* the actual write will happen once the reads are done */
+	return 0;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+
+finish:
+	validate_rbio_for_rmw(rbio);
+	return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = alloc_rbio_parity_pages(rbio);
+	if (ret) {
+		__free_raid_bio(rbio);
+		return ret;
+	}
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		finish_rmw(rbio);
+	return 0;
+}
+
+/*
+ * partial stripe writes get handed over to async helpers.
+ * We're really hoping to merge a few more writes into this
+ * rbio before calculating new parity
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		async_rmw_stripe(rbio);
+	return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come into create
+ * a full stripe.  So we do a check here to see if we can
+ * go directly to finish_rmw
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+	/* head off into rmw land if we don't have a full stripe */
+	if (!rbio_is_full(rbio))
+		return partial_stripe_write(rbio);
+	return full_stripe_write(rbio);
+}
+
+/*
+ * We use plugging call backs to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list.  When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+	struct blk_plug_cb cb;
+	struct btrfs_fs_info *info;
+	struct list_head rbio_list;
+	struct btrfs_work work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+						 plug_list);
+	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+						 plug_list);
+	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
+	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
+
+	if (a_sector < b_sector)
+		return -1;
+	if (a_sector > b_sector)
+		return 1;
+	return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *last = NULL;
+
+	/*
+	 * sort our plug list then try to merge
+	 * everything we can in hopes of creating full
+	 * stripes.
+	 */
+	list_sort(NULL, &plug->rbio_list, plug_cmp);
+	while (!list_empty(&plug->rbio_list)) {
+		cur = list_entry(plug->rbio_list.next,
+				 struct btrfs_raid_bio, plug_list);
+		list_del_init(&cur->plug_list);
+
+		if (rbio_is_full(cur)) {
+			/* we have a full stripe, send it down */
+			full_stripe_write(cur);
+			continue;
+		}
+		if (last) {
+			if (rbio_can_merge(last, cur)) {
+				merge_rbio(last, cur);
+				__free_raid_bio(cur);
+				continue;
+
+			}
+			__raid56_parity_write(last);
+		}
+		last = cur;
+	}
+	if (last) {
+		__raid56_parity_write(last);
+	}
+	kfree(plug);
+}
+
+/*
+ * if the unplug comes from schedule, we have to push the
+ * work off to a helper thread
+ */
+static void unplug_work(struct btrfs_work *work)
+{
+	struct btrfs_plug_cb *plug;
+	plug = container_of(work, struct btrfs_plug_cb, work);
+	run_plug(plug);
+}
+
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct btrfs_plug_cb *plug;
+	plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+	if (from_schedule) {
+		btrfs_init_work(&plug->work, btrfs_rmw_helper,
+				unplug_work, NULL, NULL);
+		btrfs_queue_work(plug->info->rmw_workers,
+				 &plug->work);
+		return;
+	}
+	run_plug(plug);
+}
+
+/*
+ * our main entry point for writes from the rest of the FS.
+ */
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+			struct btrfs_bio *bbio, u64 stripe_len)
+{
+	struct btrfs_raid_bio *rbio;
+	struct btrfs_plug_cb *plug = NULL;
+	struct blk_plug_cb *cb;
+	int ret;
+
+	rbio = alloc_rbio(root, bbio, stripe_len);
+	if (IS_ERR(rbio)) {
+		btrfs_put_bbio(bbio);
+		return PTR_ERR(rbio);
+	}
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_iter.bi_size;
+	rbio->operation = BTRFS_RBIO_WRITE;
+
+	btrfs_bio_counter_inc_noblocked(root->fs_info);
+	rbio->generic_bio_cnt = 1;
+
+	/*
+	 * don't plug on full rbios, just get them out the door
+	 * as quickly as we can
+	 */
+	if (rbio_is_full(rbio)) {
+		ret = full_stripe_write(rbio);
+		if (ret)
+			btrfs_bio_counter_dec(root->fs_info);
+		return ret;
+	}
+
+	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
+			       sizeof(*plug));
+	if (cb) {
+		plug = container_of(cb, struct btrfs_plug_cb, cb);
+		if (!plug->info) {
+			plug->info = root->fs_info;
+			INIT_LIST_HEAD(&plug->rbio_list);
+		}
+		list_add_tail(&rbio->plug_list, &plug->rbio_list);
+		ret = 0;
+	} else {
+		ret = __raid56_parity_write(rbio);
+		if (ret)
+			btrfs_bio_counter_dec(root->fs_info);
+	}
+	return ret;
+}
+
+/*
+ * all parity reconstruction happens here.  We've read in everything
+ * we can find from the drives and this does the heavy lifting of
+ * sorting the good from the bad.
+ */
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+{
+	int pagenr, stripe;
+	void **pointers;
+	int faila = -1, failb = -1;
+	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
+	struct page *page;
+	int err;
+	int i;
+
+	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+	if (!pointers) {
+		err = -ENOMEM;
+		goto cleanup_io;
+	}
+
+	faila = rbio->faila;
+	failb = rbio->failb;
+
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+		spin_lock_irq(&rbio->bio_list_lock);
+		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+		spin_unlock_irq(&rbio->bio_list_lock);
+	}
+
+	index_rbio_pages(rbio);
+
+	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+		/*
+		 * Now we just use bitmap to mark the horizontal stripes in
+		 * which we have data when doing parity scrub.
+		 */
+		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+		    !test_bit(pagenr, rbio->dbitmap))
+			continue;
+
+		/* setup our array of pointers with pages
+		 * from each stripe
+		 */
+		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+			/*
+			 * if we're rebuilding a read, we have to use
+			 * pages from the bio list
+			 */
+			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+			    (stripe == faila || stripe == failb)) {
+				page = page_in_rbio(rbio, stripe, pagenr, 0);
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+			pointers[stripe] = kmap(page);
+		}
+
+		/* all raid6 handling here */
+		if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+			/*
+			 * single failure, rebuild from parity raid5
+			 * style
+			 */
+			if (failb < 0) {
+				if (faila == rbio->nr_data) {
+					/*
+					 * Just the P stripe has failed, without
+					 * a bad data or Q stripe.
+					 * TODO, we should redo the xor here.
+					 */
+					err = -EIO;
+					goto cleanup;
+				}
+				/*
+				 * a single failure in raid6 is rebuilt
+				 * in the pstripe code below
+				 */
+				goto pstripe;
+			}
+
+			/* make sure our ps and qs are in order */
+			if (faila > failb) {
+				int tmp = failb;
+				failb = faila;
+				faila = tmp;
+			}
+
+			/* if the q stripe is failed, do a pstripe reconstruction
+			 * from the xors.
+			 * If both the q stripe and the P stripe are failed, we're
+			 * here due to a crc mismatch and we can't give them the
+			 * data they want
+			 */
+			if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
+				if (rbio->bbio->raid_map[faila] ==
+				    RAID5_P_STRIPE) {
+					err = -EIO;
+					goto cleanup;
+				}
+				/*
+				 * otherwise we have one bad data stripe and
+				 * a good P stripe.  raid5!
+				 */
+				goto pstripe;
+			}
+
+			if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+				raid6_datap_recov(rbio->real_stripes,
+						  PAGE_SIZE, faila, pointers);
+			} else {
+				raid6_2data_recov(rbio->real_stripes,
+						  PAGE_SIZE, faila, failb,
+						  pointers);
+			}
+		} else {
+			void *p;
+
+			/* rebuild from P stripe here (raid5 or raid6) */
+			BUG_ON(failb != -1);
+pstripe:
+			/* Copy parity block into failed block to start with */
+			memcpy(pointers[faila],
+			       pointers[rbio->nr_data],
+			       PAGE_CACHE_SIZE);
+
+			/* rearrange the pointer array */
+			p = pointers[faila];
+			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
+				pointers[stripe] = pointers[stripe + 1];
+			pointers[rbio->nr_data - 1] = p;
+
+			/* xor in the rest */
+			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+		}
+		/* if we're doing this rebuild as part of an rmw, go through
+		 * and set all of our private rbio pages in the
+		 * failed stripes as uptodate.  This way finish_rmw will
+		 * know they can be trusted.  If this was a read reconstruction,
+		 * other endio functions will fiddle the uptodate bits
+		 */
+		if (rbio->operation == BTRFS_RBIO_WRITE) {
+			for (i = 0;  i < nr_pages; i++) {
+				if (faila != -1) {
+					page = rbio_stripe_page(rbio, faila, i);
+					SetPageUptodate(page);
+				}
+				if (failb != -1) {
+					page = rbio_stripe_page(rbio, failb, i);
+					SetPageUptodate(page);
+				}
+			}
+		}
+		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+			/*
+			 * if we're rebuilding a read, we have to use
+			 * pages from the bio list
+			 */
+			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+			    (stripe == faila || stripe == failb)) {
+				page = page_in_rbio(rbio, stripe, pagenr, 0);
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+			kunmap(page);
+		}
+	}
+
+	err = 0;
+cleanup:
+	kfree(pointers);
+
+cleanup_io:
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+		if (err == 0)
+			cache_rbio_pages(rbio);
+		else
+			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+		rbio_orig_end_io(rbio, err, err == 0);
+	} else if (err == 0) {
+		rbio->faila = -1;
+		rbio->failb = -1;
+
+		if (rbio->operation == BTRFS_RBIO_WRITE)
+			finish_rmw(rbio);
+		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
+			finish_parity_scrub(rbio, 0);
+		else
+			BUG();
+	} else {
+		rbio_orig_end_io(rbio, err, 0);
+	}
+}
+
+/*
+ * This is called only for stripes we've read from disk to
+ * reconstruct the parity.
+ */
+static void raid_recover_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	/*
+	 * we only read stripe pages off the disk, set them
+	 * up to date if there were no errors
+	 */
+	if (err)
+		fail_bio_stripe(rbio, bio);
+	else
+		set_bio_pages_uptodate(bio);
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
+		return;
+
+	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+		rbio_orig_end_io(rbio, -EIO, 0);
+	else
+		__raid_recover_end_io(rbio);
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct bio_list bio_list;
+	int ret;
+	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	atomic_set(&rbio->error, 0);
+
+	/*
+	 * read everything that hasn't failed.  Thanks to the
+	 * stripe cache, it is possible that some or all of these
+	 * pages are going to be uptodate.
+	 */
+	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+		if (rbio->faila == stripe || rbio->failb == stripe) {
+			atomic_inc(&rbio->error);
+			continue;
+		}
+
+		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+			struct page *p;
+
+			/*
+			 * the rmw code may have already read this
+			 * page in
+			 */
+			p = rbio_stripe_page(rbio, stripe, pagenr);
+			if (PageUptodate(p))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list,
+				       rbio_stripe_page(rbio, stripe, pagenr),
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret < 0)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * we might have no bios to read just because the pages
+		 * were up to date, or we might have no bios to read because
+		 * the devices were gone.
+		 */
+		if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+			__raid_recover_end_io(rbio);
+			goto out;
+		} else {
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&rbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_recover_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+out:
+	return 0;
+
+cleanup:
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
+		rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers.  This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ */
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+			  struct btrfs_bio *bbio, u64 stripe_len,
+			  int mirror_num, int generic_io)
+{
+	struct btrfs_raid_bio *rbio;
+	int ret;
+
+	rbio = alloc_rbio(root, bbio, stripe_len);
+	if (IS_ERR(rbio)) {
+		if (generic_io)
+			btrfs_put_bbio(bbio);
+		return PTR_ERR(rbio);
+	}
+
+	rbio->operation = BTRFS_RBIO_READ_REBUILD;
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_iter.bi_size;
+
+	rbio->faila = find_logical_bio_stripe(rbio, bio);
+	if (rbio->faila == -1) {
+		BUG();
+		if (generic_io)
+			btrfs_put_bbio(bbio);
+		kfree(rbio);
+		return -EIO;
+	}
+
+	if (generic_io) {
+		btrfs_bio_counter_inc_noblocked(root->fs_info);
+		rbio->generic_bio_cnt = 1;
+	} else {
+		btrfs_get_bbio(bbio);
+	}
+
+	/*
+	 * reconstruct from the q stripe if they are
+	 * asking for mirror 3
+	 */
+	if (mirror_num == 3)
+		rbio->failb = rbio->real_stripes - 2;
+
+	ret = lock_stripe_add(rbio);
+
+	/*
+	 * __raid56_parity_recover will end the bio with
+	 * any errors it hits.  We don't want to return
+	 * its error value up the stack because our caller
+	 * will end up calling bio_endio with any nonzero
+	 * return
+	 */
+	if (ret == 0)
+		__raid56_parity_recover(rbio);
+	/*
+	 * our rbio has been added to the list of
+	 * rbios that will be handled after the
+	 * currently lock owner is done
+	 */
+	return 0;
+
+}
+
+static void rmw_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	raid56_rmw_stripe(rbio);
+}
+
+static void read_rebuild_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	__raid56_parity_recover(rbio);
+}
+
+/*
+ * The following code is used to scrub/replace the parity stripe
+ *
+ * Note: We need make sure all the pages that add into the scrub/replace
+ * raid bio are correct and not be changed during the scrub/replace. That
+ * is those pages just hold metadata or file data with checksum.
+ */
+
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+			       struct btrfs_bio *bbio, u64 stripe_len,
+			       struct btrfs_device *scrub_dev,
+			       unsigned long *dbitmap, int stripe_nsectors)
+{
+	struct btrfs_raid_bio *rbio;
+	int i;
+
+	rbio = alloc_rbio(root, bbio, stripe_len);
+	if (IS_ERR(rbio))
+		return NULL;
+	bio_list_add(&rbio->bio_list, bio);
+	/*
+	 * This is a special bio which is used to hold the completion handler
+	 * and make the scrub rbio is similar to the other types
+	 */
+	ASSERT(!bio->bi_iter.bi_size);
+	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
+
+	for (i = 0; i < rbio->real_stripes; i++) {
+		if (bbio->stripes[i].dev == scrub_dev) {
+			rbio->scrubp = i;
+			break;
+		}
+	}
+
+	/* Now we just support the sectorsize equals to page size */
+	ASSERT(root->sectorsize == PAGE_SIZE);
+	ASSERT(rbio->stripe_npages == stripe_nsectors);
+	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+
+	return rbio;
+}
+
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+				   struct page *page, u64 logical)
+{
+	int stripe_offset;
+	int index;
+
+	ASSERT(logical >= rbio->bbio->raid_map[0]);
+	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
+				rbio->stripe_len * rbio->nr_data);
+	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
+	index = stripe_offset >> PAGE_CACHE_SHIFT;
+	rbio->bio_pages[index] = page;
+}
+
+/*
+ * We just scrub the parity that we have correct data on the same horizontal,
+ * so we needn't allocate all pages for all the stripes.
+ */
+static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	int bit;
+	int index;
+	struct page *page;
+
+	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
+		for (i = 0; i < rbio->real_stripes; i++) {
+			index = i * rbio->stripe_npages + bit;
+			if (rbio->stripe_pages[index])
+				continue;
+
+			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			if (!page)
+				return -ENOMEM;
+			rbio->stripe_pages[index] = page;
+			ClearPageUptodate(page);
+		}
+	}
+	return 0;
+}
+
+/*
+ * end io function used by finish_rmw.  When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_parity_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
+		return;
+
+	err = 0;
+
+	if (atomic_read(&rbio->error))
+		err = -EIO;
+
+	rbio_orig_end_io(rbio, err, 0);
+}
+
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+					 int need_check)
+{
+	struct btrfs_bio *bbio = rbio->bbio;
+	void *pointers[rbio->real_stripes];
+	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+	int nr_data = rbio->nr_data;
+	int stripe;
+	int pagenr;
+	int p_stripe = -1;
+	int q_stripe = -1;
+	struct page *p_page = NULL;
+	struct page *q_page = NULL;
+	struct bio_list bio_list;
+	struct bio *bio;
+	int is_replace = 0;
+	int ret;
+
+	bio_list_init(&bio_list);
+
+	if (rbio->real_stripes - rbio->nr_data == 1) {
+		p_stripe = rbio->real_stripes - 1;
+	} else if (rbio->real_stripes - rbio->nr_data == 2) {
+		p_stripe = rbio->real_stripes - 2;
+		q_stripe = rbio->real_stripes - 1;
+	} else {
+		BUG();
+	}
+
+	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+		is_replace = 1;
+		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+	}
+
+	/*
+	 * Because the higher layers(scrubber) are unlikely to
+	 * use this area of the disk again soon, so don't cache
+	 * it.
+	 */
+	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+	if (!need_check)
+		goto writeback;
+
+	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (!p_page)
+		goto cleanup;
+	SetPageUptodate(p_page);
+
+	if (q_stripe != -1) {
+		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!q_page) {
+			__free_page(p_page);
+			goto cleanup;
+		}
+		SetPageUptodate(q_page);
+	}
+
+	atomic_set(&rbio->error, 0);
+
+	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+		struct page *p;
+		void *parity;
+		/* first collect one page from each data stripe */
+		for (stripe = 0; stripe < nr_data; stripe++) {
+			p = page_in_rbio(rbio, stripe, pagenr, 0);
+			pointers[stripe] = kmap(p);
+		}
+
+		/* then add the parity stripe */
+		pointers[stripe++] = kmap(p_page);
+
+		if (q_stripe != -1) {
+
+			/*
+			 * raid6, add the qstripe and call the
+			 * library function to fill in our p/q
+			 */
+			pointers[stripe++] = kmap(q_page);
+
+			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+						pointers);
+		} else {
+			/* raid5 */
+			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+		}
+
+		/* Check scrubbing pairty and repair it */
+		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+		parity = kmap(p);
+		if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
+			memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+		else
+			/* Parity is right, needn't writeback */
+			bitmap_clear(rbio->dbitmap, pagenr, 1);
+		kunmap(p);
+
+		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+	}
+
+	__free_page(p_page);
+	if (q_page)
+		__free_page(q_page);
+
+writeback:
+	/*
+	 * time to start writing.  Make bios for everything from the
+	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
+	 * everything else.
+	 */
+	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+		struct page *page;
+
+		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+		ret = rbio_add_io_page(rbio, &bio_list,
+			       page, rbio->scrubp, pagenr, rbio->stripe_len);
+		if (ret)
+			goto cleanup;
+	}
+
+	if (!is_replace)
+		goto submit_write;
+
+	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
+		struct page *page;
+
+		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+		ret = rbio_add_io_page(rbio, &bio_list, page,
+				       bbio->tgtdev_map[rbio->scrubp],
+				       pagenr, rbio->stripe_len);
+		if (ret)
+			goto cleanup;
+	}
+
+submit_write:
+	nr_data = bio_list_size(&bio_list);
+	if (!nr_data) {
+		/* Every parity is right */
+		rbio_orig_end_io(rbio, 0, 0);
+		return;
+	}
+
+	atomic_set(&rbio->stripes_pending, nr_data);
+
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_write_parity_end_io;
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(WRITE, bio);
+	}
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
+{
+	if (stripe >= 0 && stripe < rbio->nr_data)
+		return 1;
+	return 0;
+}
+
+/*
+ * While we're doing the parity check and repair, we could have errors
+ * in reading pages off the disk.  This checks for errors and if we're
+ * not able to read the page it'll trigger parity reconstruction.  The
+ * parity scrub will be finished after we've reconstructed the failed
+ * stripes
+ */
+static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+{
+	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+		goto cleanup;
+
+	if (rbio->faila >= 0 || rbio->failb >= 0) {
+		int dfail = 0, failp = -1;
+
+		if (is_data_stripe(rbio, rbio->faila))
+			dfail++;
+		else if (is_parity_stripe(rbio->faila))
+			failp = rbio->faila;
+
+		if (is_data_stripe(rbio, rbio->failb))
+			dfail++;
+		else if (is_parity_stripe(rbio->failb))
+			failp = rbio->failb;
+
+		/*
+		 * Because we can not use a scrubbing parity to repair
+		 * the data, so the capability of the repair is declined.
+		 * (In the case of RAID5, we can not repair anything)
+		 */
+		if (dfail > rbio->bbio->max_errors - 1)
+			goto cleanup;
+
+		/*
+		 * If all data is good, only parity is correctly, just
+		 * repair the parity.
+		 */
+		if (dfail == 0) {
+			finish_parity_scrub(rbio, 0);
+			return;
+		}
+
+		/*
+		 * Here means we got one corrupted data stripe and one
+		 * corrupted parity on RAID6, if the corrupted parity
+		 * is scrubbing parity, luckly, use the other one to repair
+		 * the data, or we can not repair the data stripe.
+		 */
+		if (failp != rbio->scrubp)
+			goto cleanup;
+
+		__raid_recover_end_io(rbio);
+	} else {
+		finish_parity_scrub(rbio, 1);
+	}
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * end io for the read phase of the rmw cycle.  All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+	else
+		set_bio_pages_uptodate(bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
+		return;
+
+	/*
+	 * this will normally call finish_rmw to start our write
+	 * but if there are any failed stripes we'll reconstruct
+	 * from parity first
+	 */
+	validate_rbio_for_parity_scrub(rbio);
+}
+
+static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct bio_list bio_list;
+	int ret;
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	ret = alloc_rbio_essential_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	bio_list_init(&bio_list);
+
+	atomic_set(&rbio->error, 0);
+	/*
+	 * build a list of bios to read all the missing parts of this
+	 * stripe
+	 */
+	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+			struct page *page;
+			/*
+			 * we want to find all the pages missing from
+			 * the rbio and read them from the disk.  If
+			 * page_in_rbio finds a page in the bio list
+			 * we don't need to read it off the stripe.
+			 */
+			page = page_in_rbio(rbio, stripe, pagenr, 1);
+			if (page)
+				continue;
+
+			page = rbio_stripe_page(rbio, stripe, pagenr);
+			/*
+			 * the bio cache may have handed us an uptodate
+			 * page.  If so, be happy and use it
+			 */
+			if (PageUptodate(page))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list, page,
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * this can happen if others have merged with
+		 * us, it means there is nothing left to read.
+		 * But if there are missing devices it may not be
+		 * safe to do the full stripe write yet.
+		 */
+		goto finish;
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&rbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid56_parity_scrub_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+	/* the actual write will happen once the reads are done */
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+	return;
+
+finish:
+	validate_rbio_for_parity_scrub(rbio);
+}
+
+static void scrub_parity_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	raid56_parity_scrub_stripe(rbio);
+}
+
+static void async_scrub_parity(struct btrfs_raid_bio *rbio)
+{
+	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+			scrub_parity_work, NULL, NULL);
+
+	btrfs_queue_work(rbio->fs_info->rmw_workers,
+			 &rbio->work);
+}
+
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+	if (!lock_stripe_add(rbio))
+		async_scrub_parity(rbio);
+}
diff --git a/kernel/fs/btrfs/raid56.h b/kernel/fs/btrfs/raid56.h
new file mode 100644
index 000000000..2b5d7977d
--- /dev/null
+++ b/kernel/fs/btrfs/raid56.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2012 Fusion-io  All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_RAID56__
+#define __BTRFS_RAID56__
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		return 1;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		return 2;
+	else
+		return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+	return map->num_stripes - nr_parity_stripes(map);
+}
+#define RAID5_P_STRIPE ((u64)-2)
+#define RAID6_Q_STRIPE ((u64)-1)
+
+#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\
+			     ((x) == RAID6_Q_STRIPE))
+
+struct btrfs_raid_bio;
+struct btrfs_device;
+
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+			  struct btrfs_bio *bbio, u64 stripe_len,
+			  int mirror_num, int generic_io);
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+			       struct btrfs_bio *bbio, u64 stripe_len);
+
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+			       struct btrfs_bio *bbio, u64 stripe_len,
+			       struct btrfs_device *scrub_dev,
+			       unsigned long *dbitmap, int stripe_nsectors);
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+				   struct page *page, u64 logical);
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
+#endif
diff --git a/kernel/fs/btrfs/rcu-string.h b/kernel/fs/btrfs/rcu-string.h
new file mode 100644
index 000000000..9e111e457
--- /dev/null
+++ b/kernel/fs/btrfs/rcu-string.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2012 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+struct rcu_string {
+	struct rcu_head rcu;
+	char str[0];
+};
+
+static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
+{
+	size_t len = strlen(src) + 1;
+	struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
+					 (len * sizeof(char)), mask);
+	if (!ret)
+		return ret;
+	strncpy(ret->str, src, len);
+	return ret;
+}
+
+static inline void rcu_string_free(struct rcu_string *str)
+{
+	if (str)
+		kfree_rcu(str, rcu);
+}
+
+#define printk_in_rcu(fmt, ...) do {	\
+	rcu_read_lock();		\
+	printk(fmt, __VA_ARGS__);	\
+	rcu_read_unlock();		\
+} while (0)
+
+#define printk_ratelimited_in_rcu(fmt, ...) do {	\
+	rcu_read_lock();				\
+	printk_ratelimited(fmt, __VA_ARGS__);		\
+	rcu_read_unlock();				\
+} while (0)
+
+#define rcu_str_deref(rcu_str) ({				\
+	struct rcu_string *__str = rcu_dereference(rcu_str);	\
+	__str->str;						\
+})
diff --git a/kernel/fs/btrfs/reada.c b/kernel/fs/btrfs/reada.c
new file mode 100644
index 000000000..0e7beea92
--- /dev/null
+++ b/kernel/fs/btrfs/reada.c
@@ -0,0 +1,992 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "dev-replace.h"
+
+#undef DEBUG
+
+/*
+ * This is the implementation for the generic read ahead framework.
+ *
+ * To trigger a readahead, btrfs_reada_add must be called. It will start
+ * a read ahead for the given range [start, end) on tree root. The returned
+ * handle can either be used to wait on the readahead to finish
+ * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
+ *
+ * The read ahead works as follows:
+ * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
+ * reada_start_machine will then search for extents to prefetch and trigger
+ * some reads. When a read finishes for a node, all contained node/leaf
+ * pointers that lie in the given range will also be enqueued. The reads will
+ * be triggered in sequential order, thus giving a big win over a naive
+ * enumeration. It will also make use of multi-device layouts. Each disk
+ * will have its on read pointer and all disks will by utilized in parallel.
+ * Also will no two disks read both sides of a mirror simultaneously, as this
+ * would waste seeking capacity. Instead both disks will read different parts
+ * of the filesystem.
+ * Any number of readaheads can be started in parallel. The read order will be
+ * determined globally, i.e. 2 parallel readaheads will normally finish faster
+ * than the 2 started one after another.
+ */
+
+#define MAX_IN_FLIGHT 6
+
+struct reada_extctl {
+	struct list_head	list;
+	struct reada_control	*rc;
+	u64			generation;
+};
+
+struct reada_extent {
+	u64			logical;
+	struct btrfs_key	top;
+	int			err;
+	struct list_head	extctl;
+	int 			refcnt;
+	spinlock_t		lock;
+	struct reada_zone	*zones[BTRFS_MAX_MIRRORS];
+	int			nzones;
+	struct btrfs_device	*scheduled_for;
+};
+
+struct reada_zone {
+	u64			start;
+	u64			end;
+	u64			elems;
+	struct list_head	list;
+	spinlock_t		lock;
+	int			locked;
+	struct btrfs_device	*device;
+	struct btrfs_device	*devs[BTRFS_MAX_MIRRORS]; /* full list, incl
+							   * self */
+	int			ndevs;
+	struct kref		refcnt;
+};
+
+struct reada_machine_work {
+	struct btrfs_work	work;
+	struct btrfs_fs_info	*fs_info;
+};
+
+static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
+static void reada_control_release(struct kref *kref);
+static void reada_zone_release(struct kref *kref);
+static void reada_start_machine(struct btrfs_fs_info *fs_info);
+static void __reada_start_machine(struct btrfs_fs_info *fs_info);
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+			   struct btrfs_key *top, int level, u64 generation);
+
+/* recurses */
+/* in case of err, eb might be NULL */
+static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+			    u64 start, int err)
+{
+	int level = 0;
+	int nritems;
+	int i;
+	u64 bytenr;
+	u64 generation;
+	struct reada_extent *re;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct list_head list;
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	struct btrfs_device *for_dev;
+
+	if (eb)
+		level = btrfs_header_level(eb);
+
+	/* find extent */
+	spin_lock(&fs_info->reada_lock);
+	re = radix_tree_lookup(&fs_info->reada_tree, index);
+	if (re)
+		re->refcnt++;
+	spin_unlock(&fs_info->reada_lock);
+
+	if (!re)
+		return -1;
+
+	spin_lock(&re->lock);
+	/*
+	 * just take the full list from the extent. afterwards we
+	 * don't need the lock anymore
+	 */
+	list_replace_init(&re->extctl, &list);
+	for_dev = re->scheduled_for;
+	re->scheduled_for = NULL;
+	spin_unlock(&re->lock);
+
+	if (err == 0) {
+		nritems = level ? btrfs_header_nritems(eb) : 0;
+		generation = btrfs_header_generation(eb);
+		/*
+		 * FIXME: currently we just set nritems to 0 if this is a leaf,
+		 * effectively ignoring the content. In a next step we could
+		 * trigger more readahead depending from the content, e.g.
+		 * fetch the checksums for the extents in the leaf.
+		 */
+	} else {
+		/*
+		 * this is the error case, the extent buffer has not been
+		 * read correctly. We won't access anything from it and
+		 * just cleanup our data structures. Effectively this will
+		 * cut the branch below this node from read ahead.
+		 */
+		nritems = 0;
+		generation = 0;
+	}
+
+	for (i = 0; i < nritems; i++) {
+		struct reada_extctl *rec;
+		u64 n_gen;
+		struct btrfs_key key;
+		struct btrfs_key next_key;
+
+		btrfs_node_key_to_cpu(eb, &key, i);
+		if (i + 1 < nritems)
+			btrfs_node_key_to_cpu(eb, &next_key, i + 1);
+		else
+			next_key = re->top;
+		bytenr = btrfs_node_blockptr(eb, i);
+		n_gen = btrfs_node_ptr_generation(eb, i);
+
+		list_for_each_entry(rec, &list, list) {
+			struct reada_control *rc = rec->rc;
+
+			/*
+			 * if the generation doesn't match, just ignore this
+			 * extctl. This will probably cut off a branch from
+			 * prefetch. Alternatively one could start a new (sub-)
+			 * prefetch for this branch, starting again from root.
+			 * FIXME: move the generation check out of this loop
+			 */
+#ifdef DEBUG
+			if (rec->generation != generation) {
+				btrfs_debug(root->fs_info,
+					   "generation mismatch for (%llu,%d,%llu) %llu != %llu",
+				       key.objectid, key.type, key.offset,
+				       rec->generation, generation);
+			}
+#endif
+			if (rec->generation == generation &&
+			    btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
+			    btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
+				reada_add_block(rc, bytenr, &next_key,
+						level - 1, n_gen);
+		}
+	}
+	/*
+	 * free extctl records
+	 */
+	while (!list_empty(&list)) {
+		struct reada_control *rc;
+		struct reada_extctl *rec;
+
+		rec = list_first_entry(&list, struct reada_extctl, list);
+		list_del(&rec->list);
+		rc = rec->rc;
+		kfree(rec);
+
+		kref_get(&rc->refcnt);
+		if (atomic_dec_and_test(&rc->elems)) {
+			kref_put(&rc->refcnt, reada_control_release);
+			wake_up(&rc->wait);
+		}
+		kref_put(&rc->refcnt, reada_control_release);
+
+		reada_extent_put(fs_info, re);	/* one ref for each entry */
+	}
+	reada_extent_put(fs_info, re);	/* our ref */
+	if (for_dev)
+		atomic_dec(&for_dev->reada_in_flight);
+
+	return 0;
+}
+
+/*
+ * start is passed separately in case eb in NULL, which may be the case with
+ * failed I/O
+ */
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+			 u64 start, int err)
+{
+	int ret;
+
+	ret = __readahead_hook(root, eb, start, err);
+
+	reada_start_machine(root->fs_info);
+
+	return ret;
+}
+
+static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
+					  struct btrfs_device *dev, u64 logical,
+					  struct btrfs_bio *bbio)
+{
+	int ret;
+	struct reada_zone *zone;
+	struct btrfs_block_group_cache *cache = NULL;
+	u64 start;
+	u64 end;
+	int i;
+
+	zone = NULL;
+	spin_lock(&fs_info->reada_lock);
+	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+				     logical >> PAGE_CACHE_SHIFT, 1);
+	if (ret == 1)
+		kref_get(&zone->refcnt);
+	spin_unlock(&fs_info->reada_lock);
+
+	if (ret == 1) {
+		if (logical >= zone->start && logical < zone->end)
+			return zone;
+		spin_lock(&fs_info->reada_lock);
+		kref_put(&zone->refcnt, reada_zone_release);
+		spin_unlock(&fs_info->reada_lock);
+	}
+
+	cache = btrfs_lookup_block_group(fs_info, logical);
+	if (!cache)
+		return NULL;
+
+	start = cache->key.objectid;
+	end = start + cache->key.offset - 1;
+	btrfs_put_block_group(cache);
+
+	zone = kzalloc(sizeof(*zone), GFP_NOFS);
+	if (!zone)
+		return NULL;
+
+	zone->start = start;
+	zone->end = end;
+	INIT_LIST_HEAD(&zone->list);
+	spin_lock_init(&zone->lock);
+	zone->locked = 0;
+	kref_init(&zone->refcnt);
+	zone->elems = 0;
+	zone->device = dev; /* our device always sits at index 0 */
+	for (i = 0; i < bbio->num_stripes; ++i) {
+		/* bounds have already been checked */
+		zone->devs[i] = bbio->stripes[i].dev;
+	}
+	zone->ndevs = bbio->num_stripes;
+
+	spin_lock(&fs_info->reada_lock);
+	ret = radix_tree_insert(&dev->reada_zones,
+				(unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
+				zone);
+
+	if (ret == -EEXIST) {
+		kfree(zone);
+		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+					     logical >> PAGE_CACHE_SHIFT, 1);
+		if (ret == 1)
+			kref_get(&zone->refcnt);
+	}
+	spin_unlock(&fs_info->reada_lock);
+
+	return zone;
+}
+
+static struct reada_extent *reada_find_extent(struct btrfs_root *root,
+					      u64 logical,
+					      struct btrfs_key *top, int level)
+{
+	int ret;
+	struct reada_extent *re = NULL;
+	struct reada_extent *re_exist = NULL;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_bio *bbio = NULL;
+	struct btrfs_device *dev;
+	struct btrfs_device *prev_dev;
+	u32 blocksize;
+	u64 length;
+	int nzones = 0;
+	int i;
+	unsigned long index = logical >> PAGE_CACHE_SHIFT;
+	int dev_replace_is_ongoing;
+
+	spin_lock(&fs_info->reada_lock);
+	re = radix_tree_lookup(&fs_info->reada_tree, index);
+	if (re)
+		re->refcnt++;
+	spin_unlock(&fs_info->reada_lock);
+
+	if (re)
+		return re;
+
+	re = kzalloc(sizeof(*re), GFP_NOFS);
+	if (!re)
+		return NULL;
+
+	blocksize = root->nodesize;
+	re->logical = logical;
+	re->top = *top;
+	INIT_LIST_HEAD(&re->extctl);
+	spin_lock_init(&re->lock);
+	re->refcnt = 1;
+
+	/*
+	 * map block
+	 */
+	length = blocksize;
+	ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+			      &bbio, 0);
+	if (ret || !bbio || length < blocksize)
+		goto error;
+
+	if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+		btrfs_err(root->fs_info,
+			   "readahead: more than %d copies not supported",
+			   BTRFS_MAX_MIRRORS);
+		goto error;
+	}
+
+	for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
+		struct reada_zone *zone;
+
+		dev = bbio->stripes[nzones].dev;
+		zone = reada_find_zone(fs_info, dev, logical, bbio);
+		if (!zone)
+			break;
+
+		re->zones[nzones] = zone;
+		spin_lock(&zone->lock);
+		if (!zone->elems)
+			kref_get(&zone->refcnt);
+		++zone->elems;
+		spin_unlock(&zone->lock);
+		spin_lock(&fs_info->reada_lock);
+		kref_put(&zone->refcnt, reada_zone_release);
+		spin_unlock(&fs_info->reada_lock);
+	}
+	re->nzones = nzones;
+	if (nzones == 0) {
+		/* not a single zone found, error and out */
+		goto error;
+	}
+
+	/* insert extent in reada_tree + all per-device trees, all or nothing */
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	spin_lock(&fs_info->reada_lock);
+	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+	if (ret == -EEXIST) {
+		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
+		BUG_ON(!re_exist);
+		re_exist->refcnt++;
+		spin_unlock(&fs_info->reada_lock);
+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
+		goto error;
+	}
+	if (ret) {
+		spin_unlock(&fs_info->reada_lock);
+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
+		goto error;
+	}
+	prev_dev = NULL;
+	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+			&fs_info->dev_replace);
+	for (i = 0; i < nzones; ++i) {
+		dev = bbio->stripes[i].dev;
+		if (dev == prev_dev) {
+			/*
+			 * in case of DUP, just add the first zone. As both
+			 * are on the same device, there's nothing to gain
+			 * from adding both.
+			 * Also, it wouldn't work, as the tree is per device
+			 * and adding would fail with EEXIST
+			 */
+			continue;
+		}
+		if (!dev->bdev) {
+			/*
+			 * cannot read ahead on missing device, but for RAID5/6,
+			 * REQ_GET_READ_MIRRORS return 1. So don't skip missing
+			 * device for such case.
+			 */
+			if (nzones > 1)
+				continue;
+		}
+		if (dev_replace_is_ongoing &&
+		    dev == fs_info->dev_replace.tgtdev) {
+			/*
+			 * as this device is selected for reading only as
+			 * a last resort, skip it for read ahead.
+			 */
+			continue;
+		}
+		prev_dev = dev;
+		ret = radix_tree_insert(&dev->reada_extents, index, re);
+		if (ret) {
+			while (--i >= 0) {
+				dev = bbio->stripes[i].dev;
+				BUG_ON(dev == NULL);
+				/* ignore whether the entry was inserted */
+				radix_tree_delete(&dev->reada_extents, index);
+			}
+			BUG_ON(fs_info == NULL);
+			radix_tree_delete(&fs_info->reada_tree, index);
+			spin_unlock(&fs_info->reada_lock);
+			btrfs_dev_replace_unlock(&fs_info->dev_replace);
+			goto error;
+		}
+	}
+	spin_unlock(&fs_info->reada_lock);
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+	btrfs_put_bbio(bbio);
+	return re;
+
+error:
+	while (nzones) {
+		struct reada_zone *zone;
+
+		--nzones;
+		zone = re->zones[nzones];
+		kref_get(&zone->refcnt);
+		spin_lock(&zone->lock);
+		--zone->elems;
+		if (zone->elems == 0) {
+			/*
+			 * no fs_info->reada_lock needed, as this can't be
+			 * the last ref
+			 */
+			kref_put(&zone->refcnt, reada_zone_release);
+		}
+		spin_unlock(&zone->lock);
+
+		spin_lock(&fs_info->reada_lock);
+		kref_put(&zone->refcnt, reada_zone_release);
+		spin_unlock(&fs_info->reada_lock);
+	}
+	btrfs_put_bbio(bbio);
+	kfree(re);
+	return re_exist;
+}
+
+static void reada_extent_put(struct btrfs_fs_info *fs_info,
+			     struct reada_extent *re)
+{
+	int i;
+	unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
+
+	spin_lock(&fs_info->reada_lock);
+	if (--re->refcnt) {
+		spin_unlock(&fs_info->reada_lock);
+		return;
+	}
+
+	radix_tree_delete(&fs_info->reada_tree, index);
+	for (i = 0; i < re->nzones; ++i) {
+		struct reada_zone *zone = re->zones[i];
+
+		radix_tree_delete(&zone->device->reada_extents, index);
+	}
+
+	spin_unlock(&fs_info->reada_lock);
+
+	for (i = 0; i < re->nzones; ++i) {
+		struct reada_zone *zone = re->zones[i];
+
+		kref_get(&zone->refcnt);
+		spin_lock(&zone->lock);
+		--zone->elems;
+		if (zone->elems == 0) {
+			/* no fs_info->reada_lock needed, as this can't be
+			 * the last ref */
+			kref_put(&zone->refcnt, reada_zone_release);
+		}
+		spin_unlock(&zone->lock);
+
+		spin_lock(&fs_info->reada_lock);
+		kref_put(&zone->refcnt, reada_zone_release);
+		spin_unlock(&fs_info->reada_lock);
+	}
+	if (re->scheduled_for)
+		atomic_dec(&re->scheduled_for->reada_in_flight);
+
+	kfree(re);
+}
+
+static void reada_zone_release(struct kref *kref)
+{
+	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+
+	radix_tree_delete(&zone->device->reada_zones,
+			  zone->end >> PAGE_CACHE_SHIFT);
+
+	kfree(zone);
+}
+
+static void reada_control_release(struct kref *kref)
+{
+	struct reada_control *rc = container_of(kref, struct reada_control,
+						refcnt);
+
+	kfree(rc);
+}
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+			   struct btrfs_key *top, int level, u64 generation)
+{
+	struct btrfs_root *root = rc->root;
+	struct reada_extent *re;
+	struct reada_extctl *rec;
+
+	re = reada_find_extent(root, logical, top, level); /* takes one ref */
+	if (!re)
+		return -1;
+
+	rec = kzalloc(sizeof(*rec), GFP_NOFS);
+	if (!rec) {
+		reada_extent_put(root->fs_info, re);
+		return -1;
+	}
+
+	rec->rc = rc;
+	rec->generation = generation;
+	atomic_inc(&rc->elems);
+
+	spin_lock(&re->lock);
+	list_add_tail(&rec->list, &re->extctl);
+	spin_unlock(&re->lock);
+
+	/* leave the ref on the extent */
+
+	return 0;
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
+{
+	int i;
+	unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
+
+	for (i = 0; i < zone->ndevs; ++i) {
+		struct reada_zone *peer;
+		peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
+		if (peer && peer->device != zone->device)
+			peer->locked = lock;
+	}
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static int reada_pick_zone(struct btrfs_device *dev)
+{
+	struct reada_zone *top_zone = NULL;
+	struct reada_zone *top_locked_zone = NULL;
+	u64 top_elems = 0;
+	u64 top_locked_elems = 0;
+	unsigned long index = 0;
+	int ret;
+
+	if (dev->reada_curr_zone) {
+		reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
+		kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
+		dev->reada_curr_zone = NULL;
+	}
+	/* pick the zone with the most elements */
+	while (1) {
+		struct reada_zone *zone;
+
+		ret = radix_tree_gang_lookup(&dev->reada_zones,
+					     (void **)&zone, index, 1);
+		if (ret == 0)
+			break;
+		index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+		if (zone->locked) {
+			if (zone->elems > top_locked_elems) {
+				top_locked_elems = zone->elems;
+				top_locked_zone = zone;
+			}
+		} else {
+			if (zone->elems > top_elems) {
+				top_elems = zone->elems;
+				top_zone = zone;
+			}
+		}
+	}
+	if (top_zone)
+		dev->reada_curr_zone = top_zone;
+	else if (top_locked_zone)
+		dev->reada_curr_zone = top_locked_zone;
+	else
+		return 0;
+
+	dev->reada_next = dev->reada_curr_zone->start;
+	kref_get(&dev->reada_curr_zone->refcnt);
+	reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
+
+	return 1;
+}
+
+static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
+				   struct btrfs_device *dev)
+{
+	struct reada_extent *re = NULL;
+	int mirror_num = 0;
+	struct extent_buffer *eb = NULL;
+	u64 logical;
+	int ret;
+	int i;
+	int need_kick = 0;
+
+	spin_lock(&fs_info->reada_lock);
+	if (dev->reada_curr_zone == NULL) {
+		ret = reada_pick_zone(dev);
+		if (!ret) {
+			spin_unlock(&fs_info->reada_lock);
+			return 0;
+		}
+	}
+	/*
+	 * FIXME currently we issue the reads one extent at a time. If we have
+	 * a contiguous block of extents, we could also coagulate them or use
+	 * plugging to speed things up
+	 */
+	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+				     dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+	if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
+		ret = reada_pick_zone(dev);
+		if (!ret) {
+			spin_unlock(&fs_info->reada_lock);
+			return 0;
+		}
+		re = NULL;
+		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+					dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+	}
+	if (ret == 0) {
+		spin_unlock(&fs_info->reada_lock);
+		return 0;
+	}
+	dev->reada_next = re->logical + fs_info->tree_root->nodesize;
+	re->refcnt++;
+
+	spin_unlock(&fs_info->reada_lock);
+
+	/*
+	 * find mirror num
+	 */
+	for (i = 0; i < re->nzones; ++i) {
+		if (re->zones[i]->device == dev) {
+			mirror_num = i + 1;
+			break;
+		}
+	}
+	logical = re->logical;
+
+	spin_lock(&re->lock);
+	if (re->scheduled_for == NULL) {
+		re->scheduled_for = dev;
+		need_kick = 1;
+	}
+	spin_unlock(&re->lock);
+
+	reada_extent_put(fs_info, re);
+
+	if (!need_kick)
+		return 0;
+
+	atomic_inc(&dev->reada_in_flight);
+	ret = reada_tree_block_flagged(fs_info->extent_root, logical,
+			mirror_num, &eb);
+	if (ret)
+		__readahead_hook(fs_info->extent_root, NULL, logical, ret);
+	else if (eb)
+		__readahead_hook(fs_info->extent_root, eb, eb->start, ret);
+
+	if (eb)
+		free_extent_buffer(eb);
+
+	return 1;
+
+}
+
+static void reada_start_machine_worker(struct btrfs_work *work)
+{
+	struct reada_machine_work *rmw;
+	struct btrfs_fs_info *fs_info;
+	int old_ioprio;
+
+	rmw = container_of(work, struct reada_machine_work, work);
+	fs_info = rmw->fs_info;
+
+	kfree(rmw);
+
+	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+				       task_nice_ioprio(current));
+	set_task_ioprio(current, BTRFS_IOPRIO_READA);
+	__reada_start_machine(fs_info);
+	set_task_ioprio(current, old_ioprio);
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	u64 enqueued;
+	u64 total = 0;
+	int i;
+
+	do {
+		enqueued = 0;
+		list_for_each_entry(device, &fs_devices->devices, dev_list) {
+			if (atomic_read(&device->reada_in_flight) <
+			    MAX_IN_FLIGHT)
+				enqueued += reada_start_machine_dev(fs_info,
+								    device);
+		}
+		total += enqueued;
+	} while (enqueued && total < 10000);
+
+	if (enqueued == 0)
+		return;
+
+	/*
+	 * If everything is already in the cache, this is effectively single
+	 * threaded. To a) not hold the caller for too long and b) to utilize
+	 * more cores, we broke the loop above after 10000 iterations and now
+	 * enqueue to workers to finish it. This will distribute the load to
+	 * the cores.
+	 */
+	for (i = 0; i < 2; ++i)
+		reada_start_machine(fs_info);
+}
+
+static void reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+	struct reada_machine_work *rmw;
+
+	rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
+	if (!rmw) {
+		/* FIXME we cannot handle this properly right now */
+		BUG();
+	}
+	btrfs_init_work(&rmw->work, btrfs_readahead_helper,
+			reada_start_machine_worker, NULL, NULL);
+	rmw->fs_info = fs_info;
+
+	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
+}
+
+#ifdef DEBUG
+static void dump_devs(struct btrfs_fs_info *fs_info, int all)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	unsigned long index;
+	int ret;
+	int i;
+	int j;
+	int cnt;
+
+	spin_lock(&fs_info->reada_lock);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
+			atomic_read(&device->reada_in_flight));
+		index = 0;
+		while (1) {
+			struct reada_zone *zone;
+			ret = radix_tree_gang_lookup(&device->reada_zones,
+						     (void **)&zone, index, 1);
+			if (ret == 0)
+				break;
+			printk(KERN_DEBUG "  zone %llu-%llu elems %llu locked "
+				"%d devs", zone->start, zone->end, zone->elems,
+				zone->locked);
+			for (j = 0; j < zone->ndevs; ++j) {
+				printk(KERN_CONT " %lld",
+					zone->devs[j]->devid);
+			}
+			if (device->reada_curr_zone == zone)
+				printk(KERN_CONT " curr off %llu",
+					device->reada_next - zone->start);
+			printk(KERN_CONT "\n");
+			index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+		}
+		cnt = 0;
+		index = 0;
+		while (all) {
+			struct reada_extent *re = NULL;
+
+			ret = radix_tree_gang_lookup(&device->reada_extents,
+						     (void **)&re, index, 1);
+			if (ret == 0)
+				break;
+			printk(KERN_DEBUG
+				"  re: logical %llu size %u empty %d for %lld",
+				re->logical, fs_info->tree_root->nodesize,
+				list_empty(&re->extctl), re->scheduled_for ?
+				re->scheduled_for->devid : -1);
+
+			for (i = 0; i < re->nzones; ++i) {
+				printk(KERN_CONT " zone %llu-%llu devs",
+					re->zones[i]->start,
+					re->zones[i]->end);
+				for (j = 0; j < re->zones[i]->ndevs; ++j) {
+					printk(KERN_CONT " %lld",
+						re->zones[i]->devs[j]->devid);
+				}
+			}
+			printk(KERN_CONT "\n");
+			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+			if (++cnt > 15)
+				break;
+		}
+	}
+
+	index = 0;
+	cnt = 0;
+	while (all) {
+		struct reada_extent *re = NULL;
+
+		ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
+					     index, 1);
+		if (ret == 0)
+			break;
+		if (!re->scheduled_for) {
+			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+			continue;
+		}
+		printk(KERN_DEBUG
+			"re: logical %llu size %u list empty %d for %lld",
+			re->logical, fs_info->tree_root->nodesize,
+			list_empty(&re->extctl),
+			re->scheduled_for ? re->scheduled_for->devid : -1);
+		for (i = 0; i < re->nzones; ++i) {
+			printk(KERN_CONT " zone %llu-%llu devs",
+				re->zones[i]->start,
+				re->zones[i]->end);
+			for (i = 0; i < re->nzones; ++i) {
+				printk(KERN_CONT " zone %llu-%llu devs",
+					re->zones[i]->start,
+					re->zones[i]->end);
+				for (j = 0; j < re->zones[i]->ndevs; ++j) {
+					printk(KERN_CONT " %lld",
+						re->zones[i]->devs[j]->devid);
+				}
+			}
+		}
+		printk(KERN_CONT "\n");
+		index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+	}
+	spin_unlock(&fs_info->reada_lock);
+}
+#endif
+
+/*
+ * interface
+ */
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+			struct btrfs_key *key_start, struct btrfs_key *key_end)
+{
+	struct reada_control *rc;
+	u64 start;
+	u64 generation;
+	int level;
+	struct extent_buffer *node;
+	static struct btrfs_key max_key = {
+		.objectid = (u64)-1,
+		.type = (u8)-1,
+		.offset = (u64)-1
+	};
+
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (!rc)
+		return ERR_PTR(-ENOMEM);
+
+	rc->root = root;
+	rc->key_start = *key_start;
+	rc->key_end = *key_end;
+	atomic_set(&rc->elems, 0);
+	init_waitqueue_head(&rc->wait);
+	kref_init(&rc->refcnt);
+	kref_get(&rc->refcnt); /* one ref for having elements */
+
+	node = btrfs_root_node(root);
+	start = node->start;
+	level = btrfs_header_level(node);
+	generation = btrfs_header_generation(node);
+	free_extent_buffer(node);
+
+	if (reada_add_block(rc, start, &max_key, level, generation)) {
+		kfree(rc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	reada_start_machine(root->fs_info);
+
+	return rc;
+}
+
+#ifdef DEBUG
+int btrfs_reada_wait(void *handle)
+{
+	struct reada_control *rc = handle;
+
+	while (atomic_read(&rc->elems)) {
+		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
+				   5 * HZ);
+		dump_devs(rc->root->fs_info,
+			  atomic_read(&rc->elems) < 10 ? 1 : 0);
+	}
+
+	dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
+
+	kref_put(&rc->refcnt, reada_control_release);
+
+	return 0;
+}
+#else
+int btrfs_reada_wait(void *handle)
+{
+	struct reada_control *rc = handle;
+
+	while (atomic_read(&rc->elems)) {
+		wait_event(rc->wait, atomic_read(&rc->elems) == 0);
+	}
+
+	kref_put(&rc->refcnt, reada_control_release);
+
+	return 0;
+}
+#endif
+
+void btrfs_reada_detach(void *handle)
+{
+	struct reada_control *rc = handle;
+
+	kref_put(&rc->refcnt, reada_control_release);
+}
diff --git a/kernel/fs/btrfs/relocation.c b/kernel/fs/btrfs/relocation.c
new file mode 100644
index 000000000..74b24b01d
--- /dev/null
+++ b/kernel/fs/btrfs/relocation.c
@@ -0,0 +1,4658 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "btrfs_inode.h"
+#include "async-thread.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+
+/*
+ * backref_node, mapping_node and tree_block start with this
+ */
+struct tree_entry {
+	struct rb_node rb_node;
+	u64 bytenr;
+};
+
+/*
+ * present a tree block in the backref cache
+ */
+struct backref_node {
+	struct rb_node rb_node;
+	u64 bytenr;
+
+	u64 new_bytenr;
+	/* objectid of tree block owner, can be not uptodate */
+	u64 owner;
+	/* link to pending, changed or detached list */
+	struct list_head list;
+	/* list of upper level blocks reference this block */
+	struct list_head upper;
+	/* list of child blocks in the cache */
+	struct list_head lower;
+	/* NULL if this node is not tree root */
+	struct btrfs_root *root;
+	/* extent buffer got by COW the block */
+	struct extent_buffer *eb;
+	/* level of tree block */
+	unsigned int level:8;
+	/* is the block in non-reference counted tree */
+	unsigned int cowonly:1;
+	/* 1 if no child node in the cache */
+	unsigned int lowest:1;
+	/* is the extent buffer locked */
+	unsigned int locked:1;
+	/* has the block been processed */
+	unsigned int processed:1;
+	/* have backrefs of this block been checked */
+	unsigned int checked:1;
+	/*
+	 * 1 if corresponding block has been cowed but some upper
+	 * level block pointers may not point to the new location
+	 */
+	unsigned int pending:1;
+	/*
+	 * 1 if the backref node isn't connected to any other
+	 * backref node.
+	 */
+	unsigned int detached:1;
+};
+
+/*
+ * present a block pointer in the backref cache
+ */
+struct backref_edge {
+	struct list_head list[2];
+	struct backref_node *node[2];
+};
+
+#define LOWER	0
+#define UPPER	1
+#define RELOCATION_RESERVED_NODES	256
+
+struct backref_cache {
+	/* red black tree of all backref nodes in the cache */
+	struct rb_root rb_root;
+	/* for passing backref nodes to btrfs_reloc_cow_block */
+	struct backref_node *path[BTRFS_MAX_LEVEL];
+	/*
+	 * list of blocks that have been cowed but some block
+	 * pointers in upper level blocks may not reflect the
+	 * new location
+	 */
+	struct list_head pending[BTRFS_MAX_LEVEL];
+	/* list of backref nodes with no child node */
+	struct list_head leaves;
+	/* list of blocks that have been cowed in current transaction */
+	struct list_head changed;
+	/* list of detached backref node. */
+	struct list_head detached;
+
+	u64 last_trans;
+
+	int nr_nodes;
+	int nr_edges;
+};
+
+/*
+ * map address of tree root to tree
+ */
+struct mapping_node {
+	struct rb_node rb_node;
+	u64 bytenr;
+	void *data;
+};
+
+struct mapping_tree {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+/*
+ * present a tree block to process
+ */
+struct tree_block {
+	struct rb_node rb_node;
+	u64 bytenr;
+	struct btrfs_key key;
+	unsigned int level:8;
+	unsigned int key_ready:1;
+};
+
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+	u64 start;
+	u64 end;
+	u64 boundary[MAX_EXTENTS];
+	unsigned int nr;
+};
+
+struct reloc_control {
+	/* block group to relocate */
+	struct btrfs_block_group_cache *block_group;
+	/* extent tree */
+	struct btrfs_root *extent_root;
+	/* inode for moving data */
+	struct inode *data_inode;
+
+	struct btrfs_block_rsv *block_rsv;
+
+	struct backref_cache backref_cache;
+
+	struct file_extent_cluster cluster;
+	/* tree blocks have been processed */
+	struct extent_io_tree processed_blocks;
+	/* map start of tree root to corresponding reloc tree */
+	struct mapping_tree reloc_root_tree;
+	/* list of reloc trees */
+	struct list_head reloc_roots;
+	/* size of metadata reservation for merging reloc trees */
+	u64 merging_rsv_size;
+	/* size of relocated tree nodes */
+	u64 nodes_relocated;
+	/* reserved size for block group relocation*/
+	u64 reserved_bytes;
+
+	u64 search_start;
+	u64 extents_found;
+
+	unsigned int stage:8;
+	unsigned int create_reloc_tree:1;
+	unsigned int merge_reloc_tree:1;
+	unsigned int found_file_extent:1;
+};
+
+/* stages of data relocation */
+#define MOVE_DATA_EXTENTS	0
+#define UPDATE_DATA_PTRS	1
+
+static void remove_backref_node(struct backref_cache *cache,
+				struct backref_node *node);
+static void __mark_block_processed(struct reloc_control *rc,
+				   struct backref_node *node);
+
+static void mapping_tree_init(struct mapping_tree *tree)
+{
+	tree->rb_root = RB_ROOT;
+	spin_lock_init(&tree->lock);
+}
+
+static void backref_cache_init(struct backref_cache *cache)
+{
+	int i;
+	cache->rb_root = RB_ROOT;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+		INIT_LIST_HEAD(&cache->pending[i]);
+	INIT_LIST_HEAD(&cache->changed);
+	INIT_LIST_HEAD(&cache->detached);
+	INIT_LIST_HEAD(&cache->leaves);
+}
+
+static void backref_cache_cleanup(struct backref_cache *cache)
+{
+	struct backref_node *node;
+	int i;
+
+	while (!list_empty(&cache->detached)) {
+		node = list_entry(cache->detached.next,
+				  struct backref_node, list);
+		remove_backref_node(cache, node);
+	}
+
+	while (!list_empty(&cache->leaves)) {
+		node = list_entry(cache->leaves.next,
+				  struct backref_node, lower);
+		remove_backref_node(cache, node);
+	}
+
+	cache->last_trans = 0;
+
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+		BUG_ON(!list_empty(&cache->pending[i]));
+	BUG_ON(!list_empty(&cache->changed));
+	BUG_ON(!list_empty(&cache->detached));
+	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+	BUG_ON(cache->nr_nodes);
+	BUG_ON(cache->nr_edges);
+}
+
+static struct backref_node *alloc_backref_node(struct backref_cache *cache)
+{
+	struct backref_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_NOFS);
+	if (node) {
+		INIT_LIST_HEAD(&node->list);
+		INIT_LIST_HEAD(&node->upper);
+		INIT_LIST_HEAD(&node->lower);
+		RB_CLEAR_NODE(&node->rb_node);
+		cache->nr_nodes++;
+	}
+	return node;
+}
+
+static void free_backref_node(struct backref_cache *cache,
+			      struct backref_node *node)
+{
+	if (node) {
+		cache->nr_nodes--;
+		kfree(node);
+	}
+}
+
+static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
+{
+	struct backref_edge *edge;
+
+	edge = kzalloc(sizeof(*edge), GFP_NOFS);
+	if (edge)
+		cache->nr_edges++;
+	return edge;
+}
+
+static void free_backref_edge(struct backref_cache *cache,
+			      struct backref_edge *edge)
+{
+	if (edge) {
+		cache->nr_edges--;
+		kfree(edge);
+	}
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+				   struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct tree_entry *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (bytenr < entry->bytenr)
+			p = &(*p)->rb_left;
+		else if (bytenr > entry->bytenr)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+	struct rb_node *n = root->rb_node;
+	struct tree_entry *entry;
+
+	while (n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+
+		if (bytenr < entry->bytenr)
+			n = n->rb_left;
+		else if (bytenr > entry->bytenr)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	return NULL;
+}
+
+static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
+{
+
+	struct btrfs_fs_info *fs_info = NULL;
+	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
+					      rb_node);
+	if (bnode->root)
+		fs_info = bnode->root->fs_info;
+	btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
+		    "found at offset %llu", bytenr);
+}
+
+/*
+ * walk up backref nodes until reach node presents tree root
+ */
+static struct backref_node *walk_up_backref(struct backref_node *node,
+					    struct backref_edge *edges[],
+					    int *index)
+{
+	struct backref_edge *edge;
+	int idx = *index;
+
+	while (!list_empty(&node->upper)) {
+		edge = list_entry(node->upper.next,
+				  struct backref_edge, list[LOWER]);
+		edges[idx++] = edge;
+		node = edge->node[UPPER];
+	}
+	BUG_ON(node->detached);
+	*index = idx;
+	return node;
+}
+
+/*
+ * walk down backref nodes to find start of next reference path
+ */
+static struct backref_node *walk_down_backref(struct backref_edge *edges[],
+					      int *index)
+{
+	struct backref_edge *edge;
+	struct backref_node *lower;
+	int idx = *index;
+
+	while (idx > 0) {
+		edge = edges[idx - 1];
+		lower = edge->node[LOWER];
+		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
+			idx--;
+			continue;
+		}
+		edge = list_entry(edge->list[LOWER].next,
+				  struct backref_edge, list[LOWER]);
+		edges[idx - 1] = edge;
+		*index = idx;
+		return edge->node[UPPER];
+	}
+	*index = 0;
+	return NULL;
+}
+
+static void unlock_node_buffer(struct backref_node *node)
+{
+	if (node->locked) {
+		btrfs_tree_unlock(node->eb);
+		node->locked = 0;
+	}
+}
+
+static void drop_node_buffer(struct backref_node *node)
+{
+	if (node->eb) {
+		unlock_node_buffer(node);
+		free_extent_buffer(node->eb);
+		node->eb = NULL;
+	}
+}
+
+static void drop_backref_node(struct backref_cache *tree,
+			      struct backref_node *node)
+{
+	BUG_ON(!list_empty(&node->upper));
+
+	drop_node_buffer(node);
+	list_del(&node->list);
+	list_del(&node->lower);
+	if (!RB_EMPTY_NODE(&node->rb_node))
+		rb_erase(&node->rb_node, &tree->rb_root);
+	free_backref_node(tree, node);
+}
+
+/*
+ * remove a backref node from the backref cache
+ */
+static void remove_backref_node(struct backref_cache *cache,
+				struct backref_node *node)
+{
+	struct backref_node *upper;
+	struct backref_edge *edge;
+
+	if (!node)
+		return;
+
+	BUG_ON(!node->lowest && !node->detached);
+	while (!list_empty(&node->upper)) {
+		edge = list_entry(node->upper.next, struct backref_edge,
+				  list[LOWER]);
+		upper = edge->node[UPPER];
+		list_del(&edge->list[LOWER]);
+		list_del(&edge->list[UPPER]);
+		free_backref_edge(cache, edge);
+
+		if (RB_EMPTY_NODE(&upper->rb_node)) {
+			BUG_ON(!list_empty(&node->upper));
+			drop_backref_node(cache, node);
+			node = upper;
+			node->lowest = 1;
+			continue;
+		}
+		/*
+		 * add the node to leaf node list if no other
+		 * child block cached.
+		 */
+		if (list_empty(&upper->lower)) {
+			list_add_tail(&upper->lower, &cache->leaves);
+			upper->lowest = 1;
+		}
+	}
+
+	drop_backref_node(cache, node);
+}
+
+static void update_backref_node(struct backref_cache *cache,
+				struct backref_node *node, u64 bytenr)
+{
+	struct rb_node *rb_node;
+	rb_erase(&node->rb_node, &cache->rb_root);
+	node->bytenr = bytenr;
+	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+	if (rb_node)
+		backref_tree_panic(rb_node, -EEXIST, bytenr);
+}
+
+/*
+ * update backref cache after a transaction commit
+ */
+static int update_backref_cache(struct btrfs_trans_handle *trans,
+				struct backref_cache *cache)
+{
+	struct backref_node *node;
+	int level = 0;
+
+	if (cache->last_trans == 0) {
+		cache->last_trans = trans->transid;
+		return 0;
+	}
+
+	if (cache->last_trans == trans->transid)
+		return 0;
+
+	/*
+	 * detached nodes are used to avoid unnecessary backref
+	 * lookup. transaction commit changes the extent tree.
+	 * so the detached nodes are no longer useful.
+	 */
+	while (!list_empty(&cache->detached)) {
+		node = list_entry(cache->detached.next,
+				  struct backref_node, list);
+		remove_backref_node(cache, node);
+	}
+
+	while (!list_empty(&cache->changed)) {
+		node = list_entry(cache->changed.next,
+				  struct backref_node, list);
+		list_del_init(&node->list);
+		BUG_ON(node->pending);
+		update_backref_node(cache, node, node->new_bytenr);
+	}
+
+	/*
+	 * some nodes can be left in the pending list if there were
+	 * errors during processing the pending nodes.
+	 */
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		list_for_each_entry(node, &cache->pending[level], list) {
+			BUG_ON(!node->pending);
+			if (node->bytenr == node->new_bytenr)
+				continue;
+			update_backref_node(cache, node, node->new_bytenr);
+		}
+	}
+
+	cache->last_trans = 0;
+	return 1;
+}
+
+
+static int should_ignore_root(struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+
+	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		return 0;
+
+	reloc_root = root->reloc_root;
+	if (!reloc_root)
+		return 0;
+
+	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
+	    root->fs_info->running_transaction->transid - 1)
+		return 0;
+	/*
+	 * if there is reloc tree and it was created in previous
+	 * transaction backref lookup can find the reloc tree,
+	 * so backref node for the fs tree root is useless for
+	 * relocation.
+	 */
+	return 1;
+}
+/*
+ * find reloc tree by address of tree root
+ */
+static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
+					  u64 bytenr)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node;
+	struct btrfs_root *root = NULL;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct mapping_node, rb_node);
+		root = (struct btrfs_root *)node->data;
+	}
+	spin_unlock(&rc->reloc_root_tree.lock);
+	return root;
+}
+
+static int is_cowonly_root(u64 root_objectid)
+{
+	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
+	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
+	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
+		return 1;
+	return 0;
+}
+
+static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid)
+{
+	struct btrfs_key key;
+
+	key.objectid = root_objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	if (is_cowonly_root(root_objectid))
+		key.offset = 0;
+	else
+		key.offset = (u64)-1;
+
+	return btrfs_get_fs_root(fs_info, &key, false);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static noinline_for_stack
+struct btrfs_root *find_tree_root(struct reloc_control *rc,
+				  struct extent_buffer *leaf,
+				  struct btrfs_extent_ref_v0 *ref0)
+{
+	struct btrfs_root *root;
+	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
+	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
+
+	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
+
+	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
+	BUG_ON(IS_ERR(root));
+
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	    generation != btrfs_root_generation(&root->root_item))
+		return NULL;
+
+	return root;
+}
+#endif
+
+static noinline_for_stack
+int find_inline_backref(struct extent_buffer *leaf, int slot,
+			unsigned long *ptr, unsigned long *end)
+{
+	struct btrfs_key key;
+	struct btrfs_extent_item *ei;
+	struct btrfs_tree_block_info *bi;
+	u32 item_size;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+
+	item_size = btrfs_item_size_nr(leaf, slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		return 1;
+	}
+#endif
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
+		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
+
+	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    item_size <= sizeof(*ei) + sizeof(*bi)) {
+		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
+		return 1;
+	}
+	if (key.type == BTRFS_METADATA_ITEM_KEY &&
+	    item_size <= sizeof(*ei)) {
+		WARN_ON(item_size < sizeof(*ei));
+		return 1;
+	}
+
+	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		*ptr = (unsigned long)(bi + 1);
+	} else {
+		*ptr = (unsigned long)(ei + 1);
+	}
+	*end = (unsigned long)ei + item_size;
+	return 0;
+}
+
+/*
+ * build backref tree for a given tree block. root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond
+ * roots of b-trees that reference the tree block.
+ *
+ * the basic idea of this function is check backrefs of a given block
+ * to find upper level blocks that refernece the block, and then check
+ * bakcrefs of these upper level blocks recursively. the recursion stop
+ * when tree root is reached or backrefs for the block is cached.
+ *
+ * NOTE: if we find backrefs for a block are cached, we know backrefs
+ * for all upper level blocks that directly/indirectly reference the
+ * block are also cached.
+ */
+static noinline_for_stack
+struct backref_node *build_backref_tree(struct reloc_control *rc,
+					struct btrfs_key *node_key,
+					int level, u64 bytenr)
+{
+	struct backref_cache *cache = &rc->backref_cache;
+	struct btrfs_path *path1;
+	struct btrfs_path *path2;
+	struct extent_buffer *eb;
+	struct btrfs_root *root;
+	struct backref_node *cur;
+	struct backref_node *upper;
+	struct backref_node *lower;
+	struct backref_node *node = NULL;
+	struct backref_node *exist = NULL;
+	struct backref_edge *edge;
+	struct rb_node *rb_node;
+	struct btrfs_key key;
+	unsigned long end;
+	unsigned long ptr;
+	LIST_HEAD(list);
+	LIST_HEAD(useless);
+	int cowonly;
+	int ret;
+	int err = 0;
+	bool need_check = true;
+
+	path1 = btrfs_alloc_path();
+	path2 = btrfs_alloc_path();
+	if (!path1 || !path2) {
+		err = -ENOMEM;
+		goto out;
+	}
+	path1->reada = 1;
+	path2->reada = 2;
+
+	node = alloc_backref_node(cache);
+	if (!node) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	node->bytenr = bytenr;
+	node->level = level;
+	node->lowest = 1;
+	cur = node;
+again:
+	end = 0;
+	ptr = 0;
+	key.objectid = cur->bytenr;
+	key.type = BTRFS_METADATA_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	path1->search_commit_root = 1;
+	path1->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
+				0, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	ASSERT(ret);
+	ASSERT(path1->slots[0]);
+
+	path1->slots[0]--;
+
+	WARN_ON(cur->checked);
+	if (!list_empty(&cur->upper)) {
+		/*
+		 * the backref was added previously when processing
+		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
+		 */
+		ASSERT(list_is_singular(&cur->upper));
+		edge = list_entry(cur->upper.next, struct backref_edge,
+				  list[LOWER]);
+		ASSERT(list_empty(&edge->list[UPPER]));
+		exist = edge->node[UPPER];
+		/*
+		 * add the upper level block to pending list if we need
+		 * check its backrefs
+		 */
+		if (!exist->checked)
+			list_add_tail(&edge->list[UPPER], &list);
+	} else {
+		exist = NULL;
+	}
+
+	while (1) {
+		cond_resched();
+		eb = path1->nodes[0];
+
+		if (ptr >= end) {
+			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
+				ret = btrfs_next_leaf(rc->extent_root, path1);
+				if (ret < 0) {
+					err = ret;
+					goto out;
+				}
+				if (ret > 0)
+					break;
+				eb = path1->nodes[0];
+			}
+
+			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
+			if (key.objectid != cur->bytenr) {
+				WARN_ON(exist);
+				break;
+			}
+
+			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+			    key.type == BTRFS_METADATA_ITEM_KEY) {
+				ret = find_inline_backref(eb, path1->slots[0],
+							  &ptr, &end);
+				if (ret)
+					goto next;
+			}
+		}
+
+		if (ptr < end) {
+			/* update key for inline back ref */
+			struct btrfs_extent_inline_ref *iref;
+			iref = (struct btrfs_extent_inline_ref *)ptr;
+			key.type = btrfs_extent_inline_ref_type(eb, iref);
+			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
+		}
+
+		if (exist &&
+		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+		      exist->owner == key.offset) ||
+		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+		      exist->bytenr == key.offset))) {
+			exist = NULL;
+			goto next;
+		}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
+		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+			if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+				struct btrfs_extent_ref_v0 *ref0;
+				ref0 = btrfs_item_ptr(eb, path1->slots[0],
+						struct btrfs_extent_ref_v0);
+				if (key.objectid == key.offset) {
+					root = find_tree_root(rc, eb, ref0);
+					if (root && !should_ignore_root(root))
+						cur->root = root;
+					else
+						list_add(&cur->list, &useless);
+					break;
+				}
+				if (is_cowonly_root(btrfs_ref_root_v0(eb,
+								      ref0)))
+					cur->cowonly = 1;
+			}
+#else
+		ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
+		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+#endif
+			if (key.objectid == key.offset) {
+				/*
+				 * only root blocks of reloc trees use
+				 * backref of this type.
+				 */
+				root = find_reloc_root(rc, cur->bytenr);
+				ASSERT(root);
+				cur->root = root;
+				break;
+			}
+
+			edge = alloc_backref_edge(cache);
+			if (!edge) {
+				err = -ENOMEM;
+				goto out;
+			}
+			rb_node = tree_search(&cache->rb_root, key.offset);
+			if (!rb_node) {
+				upper = alloc_backref_node(cache);
+				if (!upper) {
+					free_backref_edge(cache, edge);
+					err = -ENOMEM;
+					goto out;
+				}
+				upper->bytenr = key.offset;
+				upper->level = cur->level + 1;
+				/*
+				 *  backrefs for the upper level block isn't
+				 *  cached, add the block to pending list
+				 */
+				list_add_tail(&edge->list[UPPER], &list);
+			} else {
+				upper = rb_entry(rb_node, struct backref_node,
+						 rb_node);
+				ASSERT(upper->checked);
+				INIT_LIST_HEAD(&edge->list[UPPER]);
+			}
+			list_add_tail(&edge->list[LOWER], &cur->upper);
+			edge->node[LOWER] = cur;
+			edge->node[UPPER] = upper;
+
+			goto next;
+		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+			goto next;
+		}
+
+		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+		root = read_fs_root(rc->extent_root->fs_info, key.offset);
+		if (IS_ERR(root)) {
+			err = PTR_ERR(root);
+			goto out;
+		}
+
+		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+			cur->cowonly = 1;
+
+		if (btrfs_root_level(&root->root_item) == cur->level) {
+			/* tree root */
+			ASSERT(btrfs_root_bytenr(&root->root_item) ==
+			       cur->bytenr);
+			if (should_ignore_root(root))
+				list_add(&cur->list, &useless);
+			else
+				cur->root = root;
+			break;
+		}
+
+		level = cur->level + 1;
+
+		/*
+		 * searching the tree to find upper level blocks
+		 * reference the block.
+		 */
+		path2->search_commit_root = 1;
+		path2->skip_locking = 1;
+		path2->lowest_level = level;
+		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
+		path2->lowest_level = 0;
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret > 0 && path2->slots[level] > 0)
+			path2->slots[level]--;
+
+		eb = path2->nodes[level];
+		WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
+			cur->bytenr);
+
+		lower = cur;
+		need_check = true;
+		for (; level < BTRFS_MAX_LEVEL; level++) {
+			if (!path2->nodes[level]) {
+				ASSERT(btrfs_root_bytenr(&root->root_item) ==
+				       lower->bytenr);
+				if (should_ignore_root(root))
+					list_add(&lower->list, &useless);
+				else
+					lower->root = root;
+				break;
+			}
+
+			edge = alloc_backref_edge(cache);
+			if (!edge) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			eb = path2->nodes[level];
+			rb_node = tree_search(&cache->rb_root, eb->start);
+			if (!rb_node) {
+				upper = alloc_backref_node(cache);
+				if (!upper) {
+					free_backref_edge(cache, edge);
+					err = -ENOMEM;
+					goto out;
+				}
+				upper->bytenr = eb->start;
+				upper->owner = btrfs_header_owner(eb);
+				upper->level = lower->level + 1;
+				if (!test_bit(BTRFS_ROOT_REF_COWS,
+					      &root->state))
+					upper->cowonly = 1;
+
+				/*
+				 * if we know the block isn't shared
+				 * we can void checking its backrefs.
+				 */
+				if (btrfs_block_can_be_shared(root, eb))
+					upper->checked = 0;
+				else
+					upper->checked = 1;
+
+				/*
+				 * add the block to pending list if we
+				 * need check its backrefs, we only do this once
+				 * while walking up a tree as we will catch
+				 * anything else later on.
+				 */
+				if (!upper->checked && need_check) {
+					need_check = false;
+					list_add_tail(&edge->list[UPPER],
+						      &list);
+				} else {
+					if (upper->checked)
+						need_check = true;
+					INIT_LIST_HEAD(&edge->list[UPPER]);
+				}
+			} else {
+				upper = rb_entry(rb_node, struct backref_node,
+						 rb_node);
+				ASSERT(upper->checked);
+				INIT_LIST_HEAD(&edge->list[UPPER]);
+				if (!upper->owner)
+					upper->owner = btrfs_header_owner(eb);
+			}
+			list_add_tail(&edge->list[LOWER], &lower->upper);
+			edge->node[LOWER] = lower;
+			edge->node[UPPER] = upper;
+
+			if (rb_node)
+				break;
+			lower = upper;
+			upper = NULL;
+		}
+		btrfs_release_path(path2);
+next:
+		if (ptr < end) {
+			ptr += btrfs_extent_inline_ref_size(key.type);
+			if (ptr >= end) {
+				WARN_ON(ptr > end);
+				ptr = 0;
+				end = 0;
+			}
+		}
+		if (ptr >= end)
+			path1->slots[0]++;
+	}
+	btrfs_release_path(path1);
+
+	cur->checked = 1;
+	WARN_ON(exist);
+
+	/* the pending list isn't empty, take the first block to process */
+	if (!list_empty(&list)) {
+		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+		list_del_init(&edge->list[UPPER]);
+		cur = edge->node[UPPER];
+		goto again;
+	}
+
+	/*
+	 * everything goes well, connect backref nodes and insert backref nodes
+	 * into the cache.
+	 */
+	ASSERT(node->checked);
+	cowonly = node->cowonly;
+	if (!cowonly) {
+		rb_node = tree_insert(&cache->rb_root, node->bytenr,
+				      &node->rb_node);
+		if (rb_node)
+			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
+		list_add_tail(&node->lower, &cache->leaves);
+	}
+
+	list_for_each_entry(edge, &node->upper, list[LOWER])
+		list_add_tail(&edge->list[UPPER], &list);
+
+	while (!list_empty(&list)) {
+		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+		list_del_init(&edge->list[UPPER]);
+		upper = edge->node[UPPER];
+		if (upper->detached) {
+			list_del(&edge->list[LOWER]);
+			lower = edge->node[LOWER];
+			free_backref_edge(cache, edge);
+			if (list_empty(&lower->upper))
+				list_add(&lower->list, &useless);
+			continue;
+		}
+
+		if (!RB_EMPTY_NODE(&upper->rb_node)) {
+			if (upper->lowest) {
+				list_del_init(&upper->lower);
+				upper->lowest = 0;
+			}
+
+			list_add_tail(&edge->list[UPPER], &upper->lower);
+			continue;
+		}
+
+		if (!upper->checked) {
+			/*
+			 * Still want to blow up for developers since this is a
+			 * logic bug.
+			 */
+			ASSERT(0);
+			err = -EINVAL;
+			goto out;
+		}
+		if (cowonly != upper->cowonly) {
+			ASSERT(0);
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (!cowonly) {
+			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+					      &upper->rb_node);
+			if (rb_node)
+				backref_tree_panic(rb_node, -EEXIST,
+						   upper->bytenr);
+		}
+
+		list_add_tail(&edge->list[UPPER], &upper->lower);
+
+		list_for_each_entry(edge, &upper->upper, list[LOWER])
+			list_add_tail(&edge->list[UPPER], &list);
+	}
+	/*
+	 * process useless backref nodes. backref nodes for tree leaves
+	 * are deleted from the cache. backref nodes for upper level
+	 * tree blocks are left in the cache to avoid unnecessary backref
+	 * lookup.
+	 */
+	while (!list_empty(&useless)) {
+		upper = list_entry(useless.next, struct backref_node, list);
+		list_del_init(&upper->list);
+		ASSERT(list_empty(&upper->upper));
+		if (upper == node)
+			node = NULL;
+		if (upper->lowest) {
+			list_del_init(&upper->lower);
+			upper->lowest = 0;
+		}
+		while (!list_empty(&upper->lower)) {
+			edge = list_entry(upper->lower.next,
+					  struct backref_edge, list[UPPER]);
+			list_del(&edge->list[UPPER]);
+			list_del(&edge->list[LOWER]);
+			lower = edge->node[LOWER];
+			free_backref_edge(cache, edge);
+
+			if (list_empty(&lower->upper))
+				list_add(&lower->list, &useless);
+		}
+		__mark_block_processed(rc, upper);
+		if (upper->level > 0) {
+			list_add(&upper->list, &cache->detached);
+			upper->detached = 1;
+		} else {
+			rb_erase(&upper->rb_node, &cache->rb_root);
+			free_backref_node(cache, upper);
+		}
+	}
+out:
+	btrfs_free_path(path1);
+	btrfs_free_path(path2);
+	if (err) {
+		while (!list_empty(&useless)) {
+			lower = list_entry(useless.next,
+					   struct backref_node, list);
+			list_del_init(&lower->list);
+		}
+		while (!list_empty(&list)) {
+			edge = list_first_entry(&list, struct backref_edge,
+						list[UPPER]);
+			list_del(&edge->list[UPPER]);
+			list_del(&edge->list[LOWER]);
+			lower = edge->node[LOWER];
+			upper = edge->node[UPPER];
+			free_backref_edge(cache, edge);
+
+			/*
+			 * Lower is no longer linked to any upper backref nodes
+			 * and isn't in the cache, we can free it ourselves.
+			 */
+			if (list_empty(&lower->upper) &&
+			    RB_EMPTY_NODE(&lower->rb_node))
+				list_add(&lower->list, &useless);
+
+			if (!RB_EMPTY_NODE(&upper->rb_node))
+				continue;
+
+			/* Add this guy's upper edges to the list to proces */
+			list_for_each_entry(edge, &upper->upper, list[LOWER])
+				list_add_tail(&edge->list[UPPER], &list);
+			if (list_empty(&upper->upper))
+				list_add(&upper->list, &useless);
+		}
+
+		while (!list_empty(&useless)) {
+			lower = list_entry(useless.next,
+					   struct backref_node, list);
+			list_del_init(&lower->list);
+			free_backref_node(cache, lower);
+		}
+		return ERR_PTR(err);
+	}
+	ASSERT(!node || !node->detached);
+	return node;
+}
+
+/*
+ * helper to add backref node for the newly created snapshot.
+ * the backref node is created by cloning backref node that
+ * corresponds to root of source tree
+ */
+static int clone_backref_node(struct btrfs_trans_handle *trans,
+			      struct reloc_control *rc,
+			      struct btrfs_root *src,
+			      struct btrfs_root *dest)
+{
+	struct btrfs_root *reloc_root = src->reloc_root;
+	struct backref_cache *cache = &rc->backref_cache;
+	struct backref_node *node = NULL;
+	struct backref_node *new_node;
+	struct backref_edge *edge;
+	struct backref_edge *new_edge;
+	struct rb_node *rb_node;
+
+	if (cache->last_trans > 0)
+		update_backref_cache(trans, cache);
+
+	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct backref_node, rb_node);
+		if (node->detached)
+			node = NULL;
+		else
+			BUG_ON(node->new_bytenr != reloc_root->node->start);
+	}
+
+	if (!node) {
+		rb_node = tree_search(&cache->rb_root,
+				      reloc_root->commit_root->start);
+		if (rb_node) {
+			node = rb_entry(rb_node, struct backref_node,
+					rb_node);
+			BUG_ON(node->detached);
+		}
+	}
+
+	if (!node)
+		return 0;
+
+	new_node = alloc_backref_node(cache);
+	if (!new_node)
+		return -ENOMEM;
+
+	new_node->bytenr = dest->node->start;
+	new_node->level = node->level;
+	new_node->lowest = node->lowest;
+	new_node->checked = 1;
+	new_node->root = dest;
+
+	if (!node->lowest) {
+		list_for_each_entry(edge, &node->lower, list[UPPER]) {
+			new_edge = alloc_backref_edge(cache);
+			if (!new_edge)
+				goto fail;
+
+			new_edge->node[UPPER] = new_node;
+			new_edge->node[LOWER] = edge->node[LOWER];
+			list_add_tail(&new_edge->list[UPPER],
+				      &new_node->lower);
+		}
+	} else {
+		list_add_tail(&new_node->lower, &cache->leaves);
+	}
+
+	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
+			      &new_node->rb_node);
+	if (rb_node)
+		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
+
+	if (!new_node->lowest) {
+		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
+			list_add_tail(&new_edge->list[LOWER],
+				      &new_edge->node[LOWER]->upper);
+		}
+	}
+	return 0;
+fail:
+	while (!list_empty(&new_node->lower)) {
+		new_edge = list_entry(new_node->lower.next,
+				      struct backref_edge, list[UPPER]);
+		list_del(&new_edge->list[UPPER]);
+		free_backref_edge(cache, new_edge);
+	}
+	free_backref_node(cache, new_node);
+	return -ENOMEM;
+}
+
+/*
+ * helper to add 'address of tree root -> reloc tree' mapping
+ */
+static int __must_check __add_reloc_root(struct btrfs_root *root)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+	node = kmalloc(sizeof(*node), GFP_NOFS);
+	if (!node)
+		return -ENOMEM;
+
+	node->bytenr = root->node->start;
+	node->data = root;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+			      node->bytenr, &node->rb_node);
+	spin_unlock(&rc->reloc_root_tree.lock);
+	if (rb_node) {
+		btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
+			    "for start=%llu while inserting into relocation "
+			    "tree", node->bytenr);
+		kfree(node);
+		return -EEXIST;
+	}
+
+	list_add_tail(&root->root_list, &rc->reloc_roots);
+	return 0;
+}
+
+/*
+ * helper to delete the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static void __del_reloc_root(struct btrfs_root *root)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node = NULL;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+			      root->node->start);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct mapping_node, rb_node);
+		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+	}
+	spin_unlock(&rc->reloc_root_tree.lock);
+
+	if (!node)
+		return;
+	BUG_ON((struct btrfs_root *)node->data != root);
+
+	spin_lock(&root->fs_info->trans_lock);
+	list_del_init(&root->root_list);
+	spin_unlock(&root->fs_info->trans_lock);
+	kfree(node);
+}
+
+/*
+ * helper to update the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node = NULL;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+			      root->node->start);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct mapping_node, rb_node);
+		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+	}
+	spin_unlock(&rc->reloc_root_tree.lock);
+
+	if (!node)
+		return 0;
+	BUG_ON((struct btrfs_root *)node->data != root);
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	node->bytenr = new_bytenr;
+	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+			      node->bytenr, &node->rb_node);
+	spin_unlock(&rc->reloc_root_tree.lock);
+	if (rb_node)
+		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
+	return 0;
+}
+
+static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root, u64 objectid)
+{
+	struct btrfs_root *reloc_root;
+	struct extent_buffer *eb;
+	struct btrfs_root_item *root_item;
+	struct btrfs_key root_key;
+	u64 last_snap = 0;
+	int ret;
+
+	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+	BUG_ON(!root_item);
+
+	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = objectid;
+
+	if (root->root_key.objectid == objectid) {
+		/* called by btrfs_init_reloc_root */
+		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+				      BTRFS_TREE_RELOC_OBJECTID);
+		BUG_ON(ret);
+
+		last_snap = btrfs_root_last_snapshot(&root->root_item);
+		btrfs_set_root_last_snapshot(&root->root_item,
+					     trans->transid - 1);
+	} else {
+		/*
+		 * called by btrfs_reloc_post_snapshot_hook.
+		 * the source tree is a reloc tree, all tree blocks
+		 * modified after it was created have RELOC flag
+		 * set in their headers. so it's OK to not update
+		 * the 'last_snapshot'.
+		 */
+		ret = btrfs_copy_root(trans, root, root->node, &eb,
+				      BTRFS_TREE_RELOC_OBJECTID);
+		BUG_ON(ret);
+	}
+
+	memcpy(root_item, &root->root_item, sizeof(*root_item));
+	btrfs_set_root_bytenr(root_item, eb->start);
+	btrfs_set_root_level(root_item, btrfs_header_level(eb));
+	btrfs_set_root_generation(root_item, trans->transid);
+
+	if (root->root_key.objectid == objectid) {
+		btrfs_set_root_refs(root_item, 0);
+		memset(&root_item->drop_progress, 0,
+		       sizeof(struct btrfs_disk_key));
+		root_item->drop_level = 0;
+		/*
+		 * abuse rtransid, it is safe because it is impossible to
+		 * receive data into a relocation tree.
+		 */
+		btrfs_set_root_rtransid(root_item, last_snap);
+		btrfs_set_root_otransid(root_item, trans->transid);
+	}
+
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+				&root_key, root_item);
+	BUG_ON(ret);
+	kfree(root_item);
+
+	reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
+	BUG_ON(IS_ERR(reloc_root));
+	reloc_root->last_trans = trans->transid;
+	return reloc_root;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+	struct btrfs_block_rsv *rsv;
+	int clear_rsv = 0;
+	int ret;
+
+	if (root->reloc_root) {
+		reloc_root = root->reloc_root;
+		reloc_root->last_trans = trans->transid;
+		return 0;
+	}
+
+	if (!rc || !rc->create_reloc_tree ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		return 0;
+
+	if (!trans->reloc_reserved) {
+		rsv = trans->block_rsv;
+		trans->block_rsv = rc->block_rsv;
+		clear_rsv = 1;
+	}
+	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+	if (clear_rsv)
+		trans->block_rsv = rsv;
+
+	ret = __add_reloc_root(reloc_root);
+	BUG_ON(ret < 0);
+	root->reloc_root = reloc_root;
+	return 0;
+}
+
+/*
+ * update root item of reloc tree
+ */
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct btrfs_root_item *root_item;
+	int ret;
+
+	if (!root->reloc_root)
+		goto out;
+
+	reloc_root = root->reloc_root;
+	root_item = &reloc_root->root_item;
+
+	if (root->fs_info->reloc_ctl->merge_reloc_tree &&
+	    btrfs_root_refs(root_item) == 0) {
+		root->reloc_root = NULL;
+		__del_reloc_root(reloc_root);
+	}
+
+	if (reloc_root->commit_root != reloc_root->node) {
+		btrfs_set_root_node(root_item, reloc_root->node);
+		free_extent_buffer(reloc_root->commit_root);
+		reloc_root->commit_root = btrfs_root_node(reloc_root);
+	}
+
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&reloc_root->root_key, root_item);
+	BUG_ON(ret);
+
+out:
+	return 0;
+}
+
+/*
+ * helper to find first cached inode with inode number >= objectid
+ * in a subvolume
+ */
+static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
+{
+	struct rb_node *node;
+	struct rb_node *prev;
+	struct btrfs_inode *entry;
+	struct inode *inode;
+
+	spin_lock(&root->inode_lock);
+again:
+	node = root->inode_tree.rb_node;
+	prev = NULL;
+	while (node) {
+		prev = node;
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+		if (objectid < btrfs_ino(&entry->vfs_inode))
+			node = node->rb_left;
+		else if (objectid > btrfs_ino(&entry->vfs_inode))
+			node = node->rb_right;
+		else
+			break;
+	}
+	if (!node) {
+		while (prev) {
+			entry = rb_entry(prev, struct btrfs_inode, rb_node);
+			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
+				node = prev;
+				break;
+			}
+			prev = rb_next(prev);
+		}
+	}
+	while (node) {
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+		inode = igrab(&entry->vfs_inode);
+		if (inode) {
+			spin_unlock(&root->inode_lock);
+			return inode;
+		}
+
+		objectid = btrfs_ino(&entry->vfs_inode) + 1;
+		if (cond_resched_lock(&root->inode_lock))
+			goto again;
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->inode_lock);
+	return NULL;
+}
+
+static int in_block_group(u64 bytenr,
+			  struct btrfs_block_group_cache *block_group)
+{
+	if (bytenr >= block_group->key.objectid &&
+	    bytenr < block_group->key.objectid + block_group->key.offset)
+		return 1;
+	return 0;
+}
+
+/*
+ * get new location of data
+ */
+static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
+			    u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
+				       bytenr, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+
+	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
+	       btrfs_file_extent_compression(leaf, fi) ||
+	       btrfs_file_extent_encryption(leaf, fi) ||
+	       btrfs_file_extent_other_encoding(leaf, fi));
+
+	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * update file extent items in the tree leaf to point to
+ * the new locations.
+ */
+static noinline_for_stack
+int replace_file_extents(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
+			 struct btrfs_root *root,
+			 struct extent_buffer *leaf)
+{
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	struct inode *inode = NULL;
+	u64 parent;
+	u64 bytenr;
+	u64 new_bytenr = 0;
+	u64 num_bytes;
+	u64 end;
+	u32 nritems;
+	u32 i;
+	int ret = 0;
+	int first = 1;
+	int dirty = 0;
+
+	if (rc->stage != UPDATE_DATA_PTRS)
+		return 0;
+
+	/* reloc trees always use full backref */
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		parent = leaf->start;
+	else
+		parent = 0;
+
+	nritems = btrfs_header_nritems(leaf);
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		btrfs_item_key_to_cpu(leaf, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		if (bytenr == 0)
+			continue;
+		if (!in_block_group(bytenr, rc->block_group))
+			continue;
+
+		/*
+		 * if we are modifying block in fs tree, wait for readpage
+		 * to complete and drop the extent cache
+		 */
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+			if (first) {
+				inode = find_next_inode(root, key.objectid);
+				first = 0;
+			} else if (inode && btrfs_ino(inode) < key.objectid) {
+				btrfs_add_delayed_iput(inode);
+				inode = find_next_inode(root, key.objectid);
+			}
+			if (inode && btrfs_ino(inode) == key.objectid) {
+				end = key.offset +
+				      btrfs_file_extent_num_bytes(leaf, fi);
+				WARN_ON(!IS_ALIGNED(key.offset,
+						    root->sectorsize));
+				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+				end--;
+				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+						      key.offset, end);
+				if (!ret)
+					continue;
+
+				btrfs_drop_extent_cache(inode, key.offset, end,
+							1);
+				unlock_extent(&BTRFS_I(inode)->io_tree,
+					      key.offset, end);
+			}
+		}
+
+		ret = get_new_location(rc->data_inode, &new_bytenr,
+				       bytenr, num_bytes);
+		if (ret) {
+			/*
+			 * Don't have to abort since we've not changed anything
+			 * in the file extent yet.
+			 */
+			break;
+		}
+
+		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
+		dirty = 1;
+
+		key.offset -= btrfs_file_extent_offset(leaf, fi);
+		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
+					   num_bytes, parent,
+					   btrfs_header_owner(leaf),
+					   key.objectid, key.offset, 1);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			break;
+		}
+
+		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+					parent, btrfs_header_owner(leaf),
+					key.objectid, key.offset, 1);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			break;
+		}
+	}
+	if (dirty)
+		btrfs_mark_buffer_dirty(leaf);
+	if (inode)
+		btrfs_add_delayed_iput(inode);
+	return ret;
+}
+
+static noinline_for_stack
+int memcmp_node_keys(struct extent_buffer *eb, int slot,
+		     struct btrfs_path *path, int level)
+{
+	struct btrfs_disk_key key1;
+	struct btrfs_disk_key key2;
+	btrfs_node_key(eb, &key1, slot);
+	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
+	return memcmp(&key1, &key2, sizeof(key1));
+}
+
+/*
+ * try to replace tree blocks in fs tree with the new blocks
+ * in reloc tree. tree blocks haven't been modified since the
+ * reloc tree was create can be replaced.
+ *
+ * if a block was replaced, level of the block + 1 is returned.
+ * if no block got replaced, 0 is returned. if there are other
+ * errors, a negative error number is returned.
+ */
+static noinline_for_stack
+int replace_path(struct btrfs_trans_handle *trans,
+		 struct btrfs_root *dest, struct btrfs_root *src,
+		 struct btrfs_path *path, struct btrfs_key *next_key,
+		 int lowest_level, int max_level)
+{
+	struct extent_buffer *eb;
+	struct extent_buffer *parent;
+	struct btrfs_key key;
+	u64 old_bytenr;
+	u64 new_bytenr;
+	u64 old_ptr_gen;
+	u64 new_ptr_gen;
+	u64 last_snapshot;
+	u32 blocksize;
+	int cow = 0;
+	int level;
+	int ret;
+	int slot;
+
+	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+
+	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
+again:
+	slot = path->slots[lowest_level];
+	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
+
+	eb = btrfs_lock_root_node(dest);
+	btrfs_set_lock_blocking(eb);
+	level = btrfs_header_level(eb);
+
+	if (level < lowest_level) {
+		btrfs_tree_unlock(eb);
+		free_extent_buffer(eb);
+		return 0;
+	}
+
+	if (cow) {
+		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+		BUG_ON(ret);
+	}
+	btrfs_set_lock_blocking(eb);
+
+	if (next_key) {
+		next_key->objectid = (u64)-1;
+		next_key->type = (u8)-1;
+		next_key->offset = (u64)-1;
+	}
+
+	parent = eb;
+	while (1) {
+		level = btrfs_header_level(parent);
+		BUG_ON(level < lowest_level);
+
+		ret = btrfs_bin_search(parent, &key, level, &slot);
+		if (ret && slot > 0)
+			slot--;
+
+		if (next_key && slot + 1 < btrfs_header_nritems(parent))
+			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
+
+		old_bytenr = btrfs_node_blockptr(parent, slot);
+		blocksize = dest->nodesize;
+		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
+
+		if (level <= max_level) {
+			eb = path->nodes[level];
+			new_bytenr = btrfs_node_blockptr(eb,
+							path->slots[level]);
+			new_ptr_gen = btrfs_node_ptr_generation(eb,
+							path->slots[level]);
+		} else {
+			new_bytenr = 0;
+			new_ptr_gen = 0;
+		}
+
+		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
+			ret = level;
+			break;
+		}
+
+		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
+		    memcmp_node_keys(parent, slot, path, level)) {
+			if (level <= lowest_level) {
+				ret = 0;
+				break;
+			}
+
+			eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
+			if (!eb || !extent_buffer_uptodate(eb)) {
+				ret = (!eb) ? -ENOMEM : -EIO;
+				free_extent_buffer(eb);
+				break;
+			}
+			btrfs_tree_lock(eb);
+			if (cow) {
+				ret = btrfs_cow_block(trans, dest, eb, parent,
+						      slot, &eb);
+				BUG_ON(ret);
+			}
+			btrfs_set_lock_blocking(eb);
+
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+
+			parent = eb;
+			continue;
+		}
+
+		if (!cow) {
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+			cow = 1;
+			goto again;
+		}
+
+		btrfs_node_key_to_cpu(path->nodes[level], &key,
+				      path->slots[level]);
+		btrfs_release_path(path);
+
+		path->lowest_level = level;
+		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+		path->lowest_level = 0;
+		BUG_ON(ret);
+
+		/*
+		 * swap blocks in fs tree and reloc tree.
+		 */
+		btrfs_set_node_blockptr(parent, slot, new_bytenr);
+		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
+		btrfs_mark_buffer_dirty(parent);
+
+		btrfs_set_node_blockptr(path->nodes[level],
+					path->slots[level], old_bytenr);
+		btrfs_set_node_ptr_generation(path->nodes[level],
+					      path->slots[level], old_ptr_gen);
+		btrfs_mark_buffer_dirty(path->nodes[level]);
+
+		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
+					path->nodes[level]->start,
+					src->root_key.objectid, level - 1, 0,
+					1);
+		BUG_ON(ret);
+		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
+					0, dest->root_key.objectid, level - 1,
+					0, 1);
+		BUG_ON(ret);
+
+		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
+					path->nodes[level]->start,
+					src->root_key.objectid, level - 1, 0,
+					1);
+		BUG_ON(ret);
+
+		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
+					0, dest->root_key.objectid, level - 1,
+					0, 1);
+		BUG_ON(ret);
+
+		btrfs_unlock_up_safe(path, 0);
+
+		ret = level;
+		break;
+	}
+	btrfs_tree_unlock(parent);
+	free_extent_buffer(parent);
+	return ret;
+}
+
+/*
+ * helper to find next relocated block in reloc tree
+ */
+static noinline_for_stack
+int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+		       int *level)
+{
+	struct extent_buffer *eb;
+	int i;
+	u64 last_snapshot;
+	u32 nritems;
+
+	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+	for (i = 0; i < *level; i++) {
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+
+	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+		eb = path->nodes[i];
+		nritems = btrfs_header_nritems(eb);
+		while (path->slots[i] + 1 < nritems) {
+			path->slots[i]++;
+			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
+			    last_snapshot)
+				continue;
+
+			*level = i;
+			return 0;
+		}
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+	return 1;
+}
+
+/*
+ * walk down reloc tree to find relocated block of lowest level
+ */
+static noinline_for_stack
+int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+			 int *level)
+{
+	struct extent_buffer *eb = NULL;
+	int i;
+	u64 bytenr;
+	u64 ptr_gen = 0;
+	u64 last_snapshot;
+	u32 nritems;
+
+	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+	for (i = *level; i > 0; i--) {
+		eb = path->nodes[i];
+		nritems = btrfs_header_nritems(eb);
+		while (path->slots[i] < nritems) {
+			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
+			if (ptr_gen > last_snapshot)
+				break;
+			path->slots[i]++;
+		}
+		if (path->slots[i] >= nritems) {
+			if (i == *level)
+				break;
+			*level = i + 1;
+			return 0;
+		}
+		if (i == 1) {
+			*level = i;
+			return 0;
+		}
+
+		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
+		eb = read_tree_block(root, bytenr, ptr_gen);
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			return -EIO;
+		}
+		BUG_ON(btrfs_header_level(eb) != i - 1);
+		path->nodes[i - 1] = eb;
+		path->slots[i - 1] = 0;
+	}
+	return 1;
+}
+
+/*
+ * invalidate extent cache for file extents whose key in range of
+ * [min_key, max_key)
+ */
+static int invalidate_extent_cache(struct btrfs_root *root,
+				   struct btrfs_key *min_key,
+				   struct btrfs_key *max_key)
+{
+	struct inode *inode = NULL;
+	u64 objectid;
+	u64 start, end;
+	u64 ino;
+
+	objectid = min_key->objectid;
+	while (1) {
+		cond_resched();
+		iput(inode);
+
+		if (objectid > max_key->objectid)
+			break;
+
+		inode = find_next_inode(root, objectid);
+		if (!inode)
+			break;
+		ino = btrfs_ino(inode);
+
+		if (ino > max_key->objectid) {
+			iput(inode);
+			break;
+		}
+
+		objectid = ino + 1;
+		if (!S_ISREG(inode->i_mode))
+			continue;
+
+		if (unlikely(min_key->objectid == ino)) {
+			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
+				continue;
+			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
+				start = 0;
+			else {
+				start = min_key->offset;
+				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
+			}
+		} else {
+			start = 0;
+		}
+
+		if (unlikely(max_key->objectid == ino)) {
+			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
+				continue;
+			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
+				end = (u64)-1;
+			} else {
+				if (max_key->offset == 0)
+					continue;
+				end = max_key->offset;
+				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+				end--;
+			}
+		} else {
+			end = (u64)-1;
+		}
+
+		/* the lock_extent waits for readpage to complete */
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+		btrfs_drop_extent_cache(inode, start, end, 1);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+	}
+	return 0;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+			 struct btrfs_key *key)
+
+{
+	while (level < BTRFS_MAX_LEVEL) {
+		if (!path->nodes[level])
+			break;
+		if (path->slots[level] + 1 <
+		    btrfs_header_nritems(path->nodes[level])) {
+			btrfs_node_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+			return 0;
+		}
+		level++;
+	}
+	return 1;
+}
+
+/*
+ * merge the relocated tree blocks in reloc tree with corresponding
+ * fs tree.
+ */
+static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
+					       struct btrfs_root *root)
+{
+	LIST_HEAD(inode_list);
+	struct btrfs_key key;
+	struct btrfs_key next_key;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_root *reloc_root;
+	struct btrfs_root_item *root_item;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int level;
+	int max_level;
+	int replaced = 0;
+	int ret;
+	int err = 0;
+	u32 min_reserved;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 1;
+
+	reloc_root = root->reloc_root;
+	root_item = &reloc_root->root_item;
+
+	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+		level = btrfs_root_level(root_item);
+		extent_buffer_get(reloc_root->node);
+		path->nodes[level] = reloc_root->node;
+		path->slots[level] = 0;
+	} else {
+		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+
+		level = root_item->drop_level;
+		BUG_ON(level == 0);
+		path->lowest_level = level;
+		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+		path->lowest_level = 0;
+		if (ret < 0) {
+			btrfs_free_path(path);
+			return ret;
+		}
+
+		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
+				      path->slots[level]);
+		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
+
+		btrfs_unlock_up_safe(path, 0);
+	}
+
+	min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+	memset(&next_key, 0, sizeof(next_key));
+
+	while (1) {
+		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+					     BTRFS_RESERVE_FLUSH_ALL);
+		if (ret) {
+			err = ret;
+			goto out;
+		}
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
+		}
+		trans->block_rsv = rc->block_rsv;
+
+		replaced = 0;
+		max_level = level;
+
+		ret = walk_down_reloc_tree(reloc_root, path, &level);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret > 0)
+			break;
+
+		if (!find_next_key(path, level, &key) &&
+		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
+			ret = 0;
+		} else {
+			ret = replace_path(trans, root, reloc_root, path,
+					   &next_key, level, max_level);
+		}
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+
+		if (ret > 0) {
+			level = ret;
+			btrfs_node_key_to_cpu(path->nodes[level], &key,
+					      path->slots[level]);
+			replaced = 1;
+		}
+
+		ret = walk_up_reloc_tree(reloc_root, path, &level);
+		if (ret > 0)
+			break;
+
+		BUG_ON(level == 0);
+		/*
+		 * save the merging progress in the drop_progress.
+		 * this is OK since root refs == 1 in this case.
+		 */
+		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
+			       path->slots[level]);
+		root_item->drop_level = level;
+
+		btrfs_end_transaction_throttle(trans, root);
+		trans = NULL;
+
+		btrfs_btree_balance_dirty(root);
+
+		if (replaced && rc->stage == UPDATE_DATA_PTRS)
+			invalidate_extent_cache(root, &key, &next_key);
+	}
+
+	/*
+	 * handle the case only one block in the fs tree need to be
+	 * relocated and the block is tree root.
+	 */
+	leaf = btrfs_lock_root_node(root);
+	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+	btrfs_tree_unlock(leaf);
+	free_extent_buffer(leaf);
+	if (ret < 0)
+		err = ret;
+out:
+	btrfs_free_path(path);
+
+	if (err == 0) {
+		memset(&root_item->drop_progress, 0,
+		       sizeof(root_item->drop_progress));
+		root_item->drop_level = 0;
+		btrfs_set_root_refs(root_item, 0);
+		btrfs_update_reloc_root(trans, root);
+	}
+
+	if (trans)
+		btrfs_end_transaction_throttle(trans, root);
+
+	btrfs_btree_balance_dirty(root);
+
+	if (replaced && rc->stage == UPDATE_DATA_PTRS)
+		invalidate_extent_cache(root, &key, &next_key);
+
+	return err;
+}
+
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
+{
+	struct btrfs_root *root = rc->extent_root;
+	struct btrfs_root *reloc_root;
+	struct btrfs_trans_handle *trans;
+	LIST_HEAD(reloc_roots);
+	u64 num_bytes = 0;
+	int ret;
+
+	mutex_lock(&root->fs_info->reloc_mutex);
+	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+	rc->merging_rsv_size += rc->nodes_relocated * 2;
+	mutex_unlock(&root->fs_info->reloc_mutex);
+
+again:
+	if (!err) {
+		num_bytes = rc->merging_rsv_size;
+		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+					  BTRFS_RESERVE_FLUSH_ALL);
+		if (ret)
+			err = ret;
+	}
+
+	trans = btrfs_join_transaction(rc->extent_root);
+	if (IS_ERR(trans)) {
+		if (!err)
+			btrfs_block_rsv_release(rc->extent_root,
+						rc->block_rsv, num_bytes);
+		return PTR_ERR(trans);
+	}
+
+	if (!err) {
+		if (num_bytes != rc->merging_rsv_size) {
+			btrfs_end_transaction(trans, rc->extent_root);
+			btrfs_block_rsv_release(rc->extent_root,
+						rc->block_rsv, num_bytes);
+			goto again;
+		}
+	}
+
+	rc->merge_reloc_tree = 1;
+
+	while (!list_empty(&rc->reloc_roots)) {
+		reloc_root = list_entry(rc->reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del_init(&reloc_root->root_list);
+
+		root = read_fs_root(reloc_root->fs_info,
+				    reloc_root->root_key.offset);
+		BUG_ON(IS_ERR(root));
+		BUG_ON(root->reloc_root != reloc_root);
+
+		/*
+		 * set reference count to 1, so btrfs_recover_relocation
+		 * knows it should resumes merging
+		 */
+		if (!err)
+			btrfs_set_root_refs(&reloc_root->root_item, 1);
+		btrfs_update_reloc_root(trans, root);
+
+		list_add(&reloc_root->root_list, &reloc_roots);
+	}
+
+	list_splice(&reloc_roots, &rc->reloc_roots);
+
+	if (!err)
+		btrfs_commit_transaction(trans, rc->extent_root);
+	else
+		btrfs_end_transaction(trans, rc->extent_root);
+	return err;
+}
+
+static noinline_for_stack
+void free_reloc_roots(struct list_head *list)
+{
+	struct btrfs_root *reloc_root;
+
+	while (!list_empty(list)) {
+		reloc_root = list_entry(list->next, struct btrfs_root,
+					root_list);
+		__del_reloc_root(reloc_root);
+	}
+}
+
+static noinline_for_stack
+void merge_reloc_roots(struct reloc_control *rc)
+{
+	struct btrfs_root *root;
+	struct btrfs_root *reloc_root;
+	u64 last_snap;
+	u64 otransid;
+	u64 objectid;
+	LIST_HEAD(reloc_roots);
+	int found = 0;
+	int ret = 0;
+again:
+	root = rc->extent_root;
+
+	/*
+	 * this serializes us with btrfs_record_root_in_transaction,
+	 * we have to make sure nobody is in the middle of
+	 * adding their roots to the list while we are
+	 * doing this splice
+	 */
+	mutex_lock(&root->fs_info->reloc_mutex);
+	list_splice_init(&rc->reloc_roots, &reloc_roots);
+	mutex_unlock(&root->fs_info->reloc_mutex);
+
+	while (!list_empty(&reloc_roots)) {
+		found = 1;
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
+
+		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+			root = read_fs_root(reloc_root->fs_info,
+					    reloc_root->root_key.offset);
+			BUG_ON(IS_ERR(root));
+			BUG_ON(root->reloc_root != reloc_root);
+
+			ret = merge_reloc_root(rc, root);
+			if (ret) {
+				if (list_empty(&reloc_root->root_list))
+					list_add_tail(&reloc_root->root_list,
+						      &reloc_roots);
+				goto out;
+			}
+		} else {
+			list_del_init(&reloc_root->root_list);
+		}
+
+		/*
+		 * we keep the old last snapshod transid in rtranid when we
+		 * created the relocation tree.
+		 */
+		last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+		otransid = btrfs_root_otransid(&reloc_root->root_item);
+		objectid = reloc_root->root_key.offset;
+
+		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
+		if (ret < 0) {
+			if (list_empty(&reloc_root->root_list))
+				list_add_tail(&reloc_root->root_list,
+					      &reloc_roots);
+			goto out;
+		}
+	}
+
+	if (found) {
+		found = 0;
+		goto again;
+	}
+out:
+	if (ret) {
+		btrfs_std_error(root->fs_info, ret);
+		if (!list_empty(&reloc_roots))
+			free_reloc_roots(&reloc_roots);
+
+		/* new reloc root may be added */
+		mutex_lock(&root->fs_info->reloc_mutex);
+		list_splice_init(&rc->reloc_roots, &reloc_roots);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		if (!list_empty(&reloc_roots))
+			free_reloc_roots(&reloc_roots);
+	}
+
+	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+}
+
+static void free_block_list(struct rb_root *blocks)
+{
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	while ((rb_node = rb_first(blocks))) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		rb_erase(rb_node, blocks);
+		kfree(block);
+	}
+}
+
+static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *reloc_root)
+{
+	struct btrfs_root *root;
+
+	if (reloc_root->last_trans == trans->transid)
+		return 0;
+
+	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
+	BUG_ON(IS_ERR(root));
+	BUG_ON(root->reloc_root != reloc_root);
+
+	return btrfs_record_root_in_trans(trans, root);
+}
+
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+				     struct reloc_control *rc,
+				     struct backref_node *node,
+				     struct backref_edge *edges[])
+{
+	struct backref_node *next;
+	struct btrfs_root *root;
+	int index = 0;
+
+	next = node;
+	while (1) {
+		cond_resched();
+		next = walk_up_backref(next, edges, &index);
+		root = next->root;
+		BUG_ON(!root);
+		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
+
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+			record_reloc_root_in_trans(trans, root);
+			break;
+		}
+
+		btrfs_record_root_in_trans(trans, root);
+		root = root->reloc_root;
+
+		if (next->new_bytenr != root->node->start) {
+			BUG_ON(next->new_bytenr);
+			BUG_ON(!list_empty(&next->list));
+			next->new_bytenr = root->node->start;
+			next->root = root;
+			list_add_tail(&next->list,
+				      &rc->backref_cache.changed);
+			__mark_block_processed(rc, next);
+			break;
+		}
+
+		WARN_ON(1);
+		root = NULL;
+		next = walk_down_backref(edges, &index);
+		if (!next || next->level <= node->level)
+			break;
+	}
+	if (!root)
+		return NULL;
+
+	next = node;
+	/* setup backref node path for btrfs_reloc_cow_block */
+	while (1) {
+		rc->backref_cache.path[next->level] = next;
+		if (--index < 0)
+			break;
+		next = edges[index]->node[UPPER];
+	}
+	return root;
+}
+
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted. we should use do_relocation() in this
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return -ENOENT if the block is root of reloc tree.
+ */
+static noinline_for_stack
+struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
+				   struct backref_node *node)
+{
+	struct backref_node *next;
+	struct btrfs_root *root;
+	struct btrfs_root *fs_root = NULL;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	int index = 0;
+
+	next = node;
+	while (1) {
+		cond_resched();
+		next = walk_up_backref(next, edges, &index);
+		root = next->root;
+		BUG_ON(!root);
+
+		/* no other choice for non-references counted tree */
+		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+			return root;
+
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+			fs_root = root;
+
+		if (next != node)
+			return NULL;
+
+		next = walk_down_backref(edges, &index);
+		if (!next || next->level <= node->level)
+			break;
+	}
+
+	if (!fs_root)
+		return ERR_PTR(-ENOENT);
+	return fs_root;
+}
+
+static noinline_for_stack
+u64 calcu_metadata_size(struct reloc_control *rc,
+			struct backref_node *node, int reserve)
+{
+	struct backref_node *next = node;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	u64 num_bytes = 0;
+	int index = 0;
+
+	BUG_ON(reserve && node->processed);
+
+	while (next) {
+		cond_resched();
+		while (1) {
+			if (next->processed && (reserve || next != node))
+				break;
+
+			num_bytes += rc->extent_root->nodesize;
+
+			if (list_empty(&next->upper))
+				break;
+
+			edge = list_entry(next->upper.next,
+					  struct backref_edge, list[LOWER]);
+			edges[index++] = edge;
+			next = edge->node[UPPER];
+		}
+		next = walk_down_backref(edges, &index);
+	}
+	return num_bytes;
+}
+
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+				  struct reloc_control *rc,
+				  struct backref_node *node)
+{
+	struct btrfs_root *root = rc->extent_root;
+	u64 num_bytes;
+	int ret;
+	u64 tmp;
+
+	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
+
+	trans->block_rsv = rc->block_rsv;
+	rc->reserved_bytes += num_bytes;
+	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
+				BTRFS_RESERVE_FLUSH_ALL);
+	if (ret) {
+		if (ret == -EAGAIN) {
+			tmp = rc->extent_root->nodesize *
+				RELOCATION_RESERVED_NODES;
+			while (tmp <= rc->reserved_bytes)
+				tmp <<= 1;
+			/*
+			 * only one thread can access block_rsv at this point,
+			 * so we don't need hold lock to protect block_rsv.
+			 * we expand more reservation size here to allow enough
+			 * space for relocation and we will return eailer in
+			 * enospc case.
+			 */
+			rc->block_rsv->size = tmp + rc->extent_root->nodesize *
+					      RELOCATION_RESERVED_NODES;
+		}
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * relocate a block tree, and then update pointers in upper level
+ * blocks that reference the block to point to the new location.
+ *
+ * if called by link_to_upper, the block has already been relocated.
+ * in that case this function just updates pointers.
+ */
+static int do_relocation(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
+			 struct backref_node *node,
+			 struct btrfs_key *key,
+			 struct btrfs_path *path, int lowest)
+{
+	struct backref_node *upper;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	struct btrfs_root *root;
+	struct extent_buffer *eb;
+	u32 blocksize;
+	u64 bytenr;
+	u64 generation;
+	int slot;
+	int ret;
+	int err = 0;
+
+	BUG_ON(lowest && node->eb);
+
+	path->lowest_level = node->level + 1;
+	rc->backref_cache.path[node->level] = node;
+	list_for_each_entry(edge, &node->upper, list[LOWER]) {
+		cond_resched();
+
+		upper = edge->node[UPPER];
+		root = select_reloc_root(trans, rc, upper, edges);
+		BUG_ON(!root);
+
+		if (upper->eb && !upper->locked) {
+			if (!lowest) {
+				ret = btrfs_bin_search(upper->eb, key,
+						       upper->level, &slot);
+				BUG_ON(ret);
+				bytenr = btrfs_node_blockptr(upper->eb, slot);
+				if (node->eb->start == bytenr)
+					goto next;
+			}
+			drop_node_buffer(upper);
+		}
+
+		if (!upper->eb) {
+			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			BUG_ON(ret > 0);
+
+			if (!upper->eb) {
+				upper->eb = path->nodes[upper->level];
+				path->nodes[upper->level] = NULL;
+			} else {
+				BUG_ON(upper->eb != path->nodes[upper->level]);
+			}
+
+			upper->locked = 1;
+			path->locks[upper->level] = 0;
+
+			slot = path->slots[upper->level];
+			btrfs_release_path(path);
+		} else {
+			ret = btrfs_bin_search(upper->eb, key, upper->level,
+					       &slot);
+			BUG_ON(ret);
+		}
+
+		bytenr = btrfs_node_blockptr(upper->eb, slot);
+		if (lowest) {
+			BUG_ON(bytenr != node->bytenr);
+		} else {
+			if (node->eb->start == bytenr)
+				goto next;
+		}
+
+		blocksize = root->nodesize;
+		generation = btrfs_node_ptr_generation(upper->eb, slot);
+		eb = read_tree_block(root, bytenr, generation);
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			err = -EIO;
+			goto next;
+		}
+		btrfs_tree_lock(eb);
+		btrfs_set_lock_blocking(eb);
+
+		if (!node->eb) {
+			ret = btrfs_cow_block(trans, root, eb, upper->eb,
+					      slot, &eb);
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+			if (ret < 0) {
+				err = ret;
+				goto next;
+			}
+			BUG_ON(node->eb != eb);
+		} else {
+			btrfs_set_node_blockptr(upper->eb, slot,
+						node->eb->start);
+			btrfs_set_node_ptr_generation(upper->eb, slot,
+						      trans->transid);
+			btrfs_mark_buffer_dirty(upper->eb);
+
+			ret = btrfs_inc_extent_ref(trans, root,
+						node->eb->start, blocksize,
+						upper->eb->start,
+						btrfs_header_owner(upper->eb),
+						node->level, 0, 1);
+			BUG_ON(ret);
+
+			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
+			BUG_ON(ret);
+		}
+next:
+		if (!upper->pending)
+			drop_node_buffer(upper);
+		else
+			unlock_node_buffer(upper);
+		if (err)
+			break;
+	}
+
+	if (!err && node->pending) {
+		drop_node_buffer(node);
+		list_move_tail(&node->list, &rc->backref_cache.changed);
+		node->pending = 0;
+	}
+
+	path->lowest_level = 0;
+	BUG_ON(err == -ENOSPC);
+	return err;
+}
+
+static int link_to_upper(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
+			 struct backref_node *node,
+			 struct btrfs_path *path)
+{
+	struct btrfs_key key;
+
+	btrfs_node_key_to_cpu(node->eb, &key, 0);
+	return do_relocation(trans, rc, node, &key, path, 0);
+}
+
+static int finish_pending_nodes(struct btrfs_trans_handle *trans,
+				struct reloc_control *rc,
+				struct btrfs_path *path, int err)
+{
+	LIST_HEAD(list);
+	struct backref_cache *cache = &rc->backref_cache;
+	struct backref_node *node;
+	int level;
+	int ret;
+
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		while (!list_empty(&cache->pending[level])) {
+			node = list_entry(cache->pending[level].next,
+					  struct backref_node, list);
+			list_move_tail(&node->list, &list);
+			BUG_ON(!node->pending);
+
+			if (!err) {
+				ret = link_to_upper(trans, rc, node, path);
+				if (ret < 0)
+					err = ret;
+			}
+		}
+		list_splice_init(&list, &cache->pending[level]);
+	}
+	return err;
+}
+
+static void mark_block_processed(struct reloc_control *rc,
+				 u64 bytenr, u32 blocksize)
+{
+	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+			EXTENT_DIRTY, GFP_NOFS);
+}
+
+static void __mark_block_processed(struct reloc_control *rc,
+				   struct backref_node *node)
+{
+	u32 blocksize;
+	if (node->level == 0 ||
+	    in_block_group(node->bytenr, rc->block_group)) {
+		blocksize = rc->extent_root->nodesize;
+		mark_block_processed(rc, node->bytenr, blocksize);
+	}
+	node->processed = 1;
+}
+
+/*
+ * mark a block and all blocks directly/indirectly reference the block
+ * as processed.
+ */
+static void update_processed_blocks(struct reloc_control *rc,
+				    struct backref_node *node)
+{
+	struct backref_node *next = node;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	int index = 0;
+
+	while (next) {
+		cond_resched();
+		while (1) {
+			if (next->processed)
+				break;
+
+			__mark_block_processed(rc, next);
+
+			if (list_empty(&next->upper))
+				break;
+
+			edge = list_entry(next->upper.next,
+					  struct backref_edge, list[LOWER]);
+			edges[index++] = edge;
+			next = edge->node[UPPER];
+		}
+		next = walk_down_backref(edges, &index);
+	}
+}
+
+static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
+{
+	u32 blocksize = rc->extent_root->nodesize;
+
+	if (test_range_bit(&rc->processed_blocks, bytenr,
+			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+		return 1;
+	return 0;
+}
+
+static int get_tree_block_key(struct reloc_control *rc,
+			      struct tree_block *block)
+{
+	struct extent_buffer *eb;
+
+	BUG_ON(block->key_ready);
+	eb = read_tree_block(rc->extent_root, block->bytenr,
+			     block->key.offset);
+	if (!eb || !extent_buffer_uptodate(eb)) {
+		free_extent_buffer(eb);
+		return -EIO;
+	}
+	WARN_ON(btrfs_header_level(eb) != block->level);
+	if (block->level == 0)
+		btrfs_item_key_to_cpu(eb, &block->key, 0);
+	else
+		btrfs_node_key_to_cpu(eb, &block->key, 0);
+	free_extent_buffer(eb);
+	block->key_ready = 1;
+	return 0;
+}
+
+/*
+ * helper function to relocate a tree block
+ */
+static int relocate_tree_block(struct btrfs_trans_handle *trans,
+				struct reloc_control *rc,
+				struct backref_node *node,
+				struct btrfs_key *key,
+				struct btrfs_path *path)
+{
+	struct btrfs_root *root;
+	int ret = 0;
+
+	if (!node)
+		return 0;
+
+	BUG_ON(node->processed);
+	root = select_one_root(trans, node);
+	if (root == ERR_PTR(-ENOENT)) {
+		update_processed_blocks(rc, node);
+		goto out;
+	}
+
+	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+		ret = reserve_metadata_space(trans, rc, node);
+		if (ret)
+			goto out;
+	}
+
+	if (root) {
+		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+			BUG_ON(node->new_bytenr);
+			BUG_ON(!list_empty(&node->list));
+			btrfs_record_root_in_trans(trans, root);
+			root = root->reloc_root;
+			node->new_bytenr = root->node->start;
+			node->root = root;
+			list_add_tail(&node->list, &rc->backref_cache.changed);
+		} else {
+			path->lowest_level = node->level;
+			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+			btrfs_release_path(path);
+			if (ret > 0)
+				ret = 0;
+		}
+		if (!ret)
+			update_processed_blocks(rc, node);
+	} else {
+		ret = do_relocation(trans, rc, node, key, path, 1);
+	}
+out:
+	if (ret || node->level == 0 || node->cowonly)
+		remove_backref_node(&rc->backref_cache, node);
+	return ret;
+}
+
+/*
+ * relocate a list of blocks
+ */
+static noinline_for_stack
+int relocate_tree_blocks(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc, struct rb_root *blocks)
+{
+	struct backref_node *node;
+	struct btrfs_path *path;
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out_free_blocks;
+	}
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		if (!block->key_ready)
+			readahead_tree_block(rc->extent_root, block->bytenr);
+		rb_node = rb_next(rb_node);
+	}
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		if (!block->key_ready) {
+			err = get_tree_block_key(rc, block);
+			if (err)
+				goto out_free_path;
+		}
+		rb_node = rb_next(rb_node);
+	}
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+
+		node = build_backref_tree(rc, &block->key,
+					  block->level, block->bytenr);
+		if (IS_ERR(node)) {
+			err = PTR_ERR(node);
+			goto out;
+		}
+
+		ret = relocate_tree_block(trans, rc, node, &block->key,
+					  path);
+		if (ret < 0) {
+			if (ret != -EAGAIN || rb_node == rb_first(blocks))
+				err = ret;
+			goto out;
+		}
+		rb_node = rb_next(rb_node);
+	}
+out:
+	err = finish_pending_nodes(trans, rc, path, err);
+
+out_free_path:
+	btrfs_free_path(path);
+out_free_blocks:
+	free_block_list(blocks);
+	return err;
+}
+
+static noinline_for_stack
+int prealloc_file_extent_cluster(struct inode *inode,
+				 struct file_extent_cluster *cluster)
+{
+	u64 alloc_hint = 0;
+	u64 start;
+	u64 end;
+	u64 offset = BTRFS_I(inode)->index_cnt;
+	u64 num_bytes;
+	int nr = 0;
+	int ret = 0;
+
+	BUG_ON(cluster->start != cluster->boundary[0]);
+	mutex_lock(&inode->i_mutex);
+
+	ret = btrfs_check_data_free_space(inode, cluster->end +
+					  1 - cluster->start, 0);
+	if (ret)
+		goto out;
+
+	while (nr < cluster->nr) {
+		start = cluster->boundary[nr] - offset;
+		if (nr + 1 < cluster->nr)
+			end = cluster->boundary[nr + 1] - 1 - offset;
+		else
+			end = cluster->end - offset;
+
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+		num_bytes = end + 1 - start;
+		ret = btrfs_prealloc_file_range(inode, 0, start,
+						num_bytes, num_bytes,
+						end + 1, &alloc_hint);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+		if (ret)
+			break;
+		nr++;
+	}
+	btrfs_free_reserved_data_space(inode, cluster->end +
+				       1 - cluster->start);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+static noinline_for_stack
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+			 u64 block_start)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	int ret = 0;
+
+	em = alloc_extent_map();
+	if (!em)
+		return -ENOMEM;
+
+	em->start = start;
+	em->len = end + 1 - start;
+	em->block_len = em->len;
+	em->block_start = block_start;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em, 0);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST) {
+			free_extent_map(em);
+			break;
+		}
+		btrfs_drop_extent_cache(inode, start, end, 0);
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+	return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+					struct file_extent_cluster *cluster)
+{
+	u64 page_start;
+	u64 page_end;
+	u64 offset = BTRFS_I(inode)->index_cnt;
+	unsigned long index;
+	unsigned long last_index;
+	struct page *page;
+	struct file_ra_state *ra;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+	int nr = 0;
+	int ret = 0;
+
+	if (!cluster->nr)
+		return 0;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
+
+	ret = prealloc_file_extent_cluster(inode, cluster);
+	if (ret)
+		goto out;
+
+	file_ra_state_init(ra, inode->i_mapping);
+
+	ret = setup_extent_mapping(inode, cluster->start - offset,
+				   cluster->end - offset, cluster->start);
+	if (ret)
+		goto out;
+
+	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+	while (index <= last_index) {
+		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto out;
+
+		page = find_lock_page(inode->i_mapping, index);
+		if (!page) {
+			page_cache_sync_readahead(inode->i_mapping,
+						  ra, NULL, index,
+						  last_index + 1 - index);
+			page = find_or_create_page(inode->i_mapping, index,
+						   mask);
+			if (!page) {
+				btrfs_delalloc_release_metadata(inode,
+							PAGE_CACHE_SIZE);
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		if (PageReadahead(page)) {
+			page_cache_async_readahead(inode->i_mapping,
+						   ra, NULL, page, index,
+						   last_index + 1 - index);
+		}
+
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				btrfs_delalloc_release_metadata(inode,
+							PAGE_CACHE_SIZE);
+				ret = -EIO;
+				goto out;
+			}
+		}
+
+		page_start = page_offset(page);
+		page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
+
+		set_page_extent_mapped(page);
+
+		if (nr < cluster->nr &&
+		    page_start + offset == cluster->boundary[nr]) {
+			set_extent_bits(&BTRFS_I(inode)->io_tree,
+					page_start, page_end,
+					EXTENT_BOUNDARY, GFP_NOFS);
+			nr++;
+		}
+
+		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+		set_page_dirty(page);
+
+		unlock_extent(&BTRFS_I(inode)->io_tree,
+			      page_start, page_end);
+		unlock_page(page);
+		page_cache_release(page);
+
+		index++;
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		btrfs_throttle(BTRFS_I(inode)->root);
+	}
+	WARN_ON(nr != cluster->nr);
+out:
+	kfree(ra);
+	return ret;
+}
+
+static noinline_for_stack
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+			 struct file_extent_cluster *cluster)
+{
+	int ret;
+
+	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+		ret = relocate_file_extent_cluster(inode, cluster);
+		if (ret)
+			return ret;
+		cluster->nr = 0;
+	}
+
+	if (!cluster->nr)
+		cluster->start = extent_key->objectid;
+	else
+		BUG_ON(cluster->nr >= MAX_EXTENTS);
+	cluster->end = extent_key->objectid + extent_key->offset - 1;
+	cluster->boundary[cluster->nr] = extent_key->objectid;
+	cluster->nr++;
+
+	if (cluster->nr >= MAX_EXTENTS) {
+		ret = relocate_file_extent_cluster(inode, cluster);
+		if (ret)
+			return ret;
+		cluster->nr = 0;
+	}
+	return 0;
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int get_ref_objectid_v0(struct reloc_control *rc,
+			       struct btrfs_path *path,
+			       struct btrfs_key *extent_key,
+			       u64 *ref_objectid, int *path_change)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref_v0 *ref0;
+	int ret;
+	int slot;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	while (1) {
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret < 0)
+				return ret;
+			BUG_ON(ret > 0);
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+			if (path_change)
+				*path_change = 1;
+		}
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid != extent_key->objectid)
+			return -ENOENT;
+
+		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
+			slot++;
+			continue;
+		}
+		ref0 = btrfs_item_ptr(leaf, slot,
+				struct btrfs_extent_ref_v0);
+		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
+		break;
+	}
+	return 0;
+}
+#endif
+
+/*
+ * helper to add a tree block to the list.
+ * the major work is getting the generation and level of the block
+ */
+static int add_tree_block(struct reloc_control *rc,
+			  struct btrfs_key *extent_key,
+			  struct btrfs_path *path,
+			  struct rb_root *blocks)
+{
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct btrfs_tree_block_info *bi;
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	u32 item_size;
+	int level = -1;
+	u64 generation;
+
+	eb =  path->nodes[0];
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
+	    item_size >= sizeof(*ei) + sizeof(*bi)) {
+		ei = btrfs_item_ptr(eb, path->slots[0],
+				struct btrfs_extent_item);
+		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
+			bi = (struct btrfs_tree_block_info *)(ei + 1);
+			level = btrfs_tree_block_level(eb, bi);
+		} else {
+			level = (int)extent_key->offset;
+		}
+		generation = btrfs_extent_generation(eb, ei);
+	} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		u64 ref_owner;
+		int ret;
+
+		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		ret = get_ref_objectid_v0(rc, path, extent_key,
+					  &ref_owner, NULL);
+		if (ret < 0)
+			return ret;
+		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
+		level = (int)ref_owner;
+		/* FIXME: get real generation */
+		generation = 0;
+#else
+		BUG();
+#endif
+	}
+
+	btrfs_release_path(path);
+
+	BUG_ON(level == -1);
+
+	block = kmalloc(sizeof(*block), GFP_NOFS);
+	if (!block)
+		return -ENOMEM;
+
+	block->bytenr = extent_key->objectid;
+	block->key.objectid = rc->extent_root->nodesize;
+	block->key.offset = generation;
+	block->level = level;
+	block->key_ready = 0;
+
+	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+	if (rb_node)
+		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
+
+	return 0;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
+ */
+static int __add_tree_block(struct reloc_control *rc,
+			    u64 bytenr, u32 blocksize,
+			    struct rb_root *blocks)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+	bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+					SKINNY_METADATA);
+
+	if (tree_block_processed(bytenr, rc))
+		return 0;
+
+	if (tree_search(blocks, bytenr))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+again:
+	key.objectid = bytenr;
+	if (skinny) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = (u64)-1;
+	} else {
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = blocksize;
+	}
+
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0 && skinny) {
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    (key.type == BTRFS_METADATA_ITEM_KEY ||
+			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
+			      key.offset == blocksize)))
+				ret = 0;
+		}
+
+		if (ret) {
+			skinny = false;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
+	BUG_ON(ret);
+
+	ret = add_tree_block(rc, &key, path, blocks);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to check if the block use full backrefs for pointers in it
+ */
+static int block_use_full_backref(struct reloc_control *rc,
+				  struct extent_buffer *eb)
+{
+	u64 flags;
+	int ret;
+
+	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
+	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
+		return 1;
+
+	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+				       eb->start, btrfs_header_level(eb), 1,
+				       NULL, &flags);
+	BUG_ON(ret);
+
+	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+		ret = 1;
+	else
+		ret = 0;
+	return ret;
+}
+
+static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_group_cache *block_group,
+				    struct inode *inode,
+				    u64 ino)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (inode)
+		goto truncate;
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (IS_ERR(inode) || is_bad_inode(inode)) {
+		if (!IS_ERR(inode))
+			iput(inode);
+		return -ENOENT;
+	}
+
+truncate:
+	ret = btrfs_check_trunc_cache_free_space(root,
+						 &fs_info->global_block_rsv);
+	if (ret)
+		goto out;
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
+
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
+out:
+	iput(inode);
+	return ret;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
+ * this function scans fs tree to find blocks reference the data extent
+ */
+static int find_data_references(struct reloc_control *rc,
+				struct btrfs_key *extent_key,
+				struct extent_buffer *leaf,
+				struct btrfs_extent_data_ref *ref,
+				struct rb_root *blocks)
+{
+	struct btrfs_path *path;
+	struct tree_block *block;
+	struct btrfs_root *root;
+	struct btrfs_file_extent_item *fi;
+	struct rb_node *rb_node;
+	struct btrfs_key key;
+	u64 ref_root;
+	u64 ref_objectid;
+	u64 ref_offset;
+	u32 ref_count;
+	u32 nritems;
+	int err = 0;
+	int added = 0;
+	int counted;
+	int ret;
+
+	ref_root = btrfs_extent_data_ref_root(leaf, ref);
+	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
+	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
+	ref_count = btrfs_extent_data_ref_count(leaf, ref);
+
+	/*
+	 * This is an extent belonging to the free space cache, lets just delete
+	 * it and redo the search.
+	 */
+	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
+		ret = delete_block_group_cache(rc->extent_root->fs_info,
+					       rc->block_group,
+					       NULL, ref_objectid);
+		if (ret != -ENOENT)
+			return ret;
+		ret = 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 1;
+
+	root = read_fs_root(rc->extent_root->fs_info, ref_root);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto out;
+	}
+
+	key.objectid = ref_objectid;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (ref_offset > ((u64)-1 << 32))
+		key.offset = 0;
+	else
+		key.offset = ref_offset;
+
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+	/*
+	 * the references in tree blocks that use full backrefs
+	 * are not counted in
+	 */
+	if (block_use_full_backref(rc, leaf))
+		counted = 0;
+	else
+		counted = 1;
+	rb_node = tree_search(blocks, leaf->start);
+	if (rb_node) {
+		if (counted)
+			added = 1;
+		else
+			path->slots[0] = nritems;
+	}
+
+	while (ref_count > 0) {
+		while (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				err = ret;
+				goto out;
+			}
+			if (WARN_ON(ret > 0))
+				goto out;
+
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			added = 0;
+
+			if (block_use_full_backref(rc, leaf))
+				counted = 0;
+			else
+				counted = 1;
+			rb_node = tree_search(blocks, leaf->start);
+			if (rb_node) {
+				if (counted)
+					added = 1;
+				else
+					path->slots[0] = nritems;
+			}
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (WARN_ON(key.objectid != ref_objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY))
+			break;
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			goto next;
+
+		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+		    extent_key->objectid)
+			goto next;
+
+		key.offset -= btrfs_file_extent_offset(leaf, fi);
+		if (key.offset != ref_offset)
+			goto next;
+
+		if (counted)
+			ref_count--;
+		if (added)
+			goto next;
+
+		if (!tree_block_processed(leaf->start, rc)) {
+			block = kmalloc(sizeof(*block), GFP_NOFS);
+			if (!block) {
+				err = -ENOMEM;
+				break;
+			}
+			block->bytenr = leaf->start;
+			btrfs_item_key_to_cpu(leaf, &block->key, 0);
+			block->level = 0;
+			block->key_ready = 1;
+			rb_node = tree_insert(blocks, block->bytenr,
+					      &block->rb_node);
+			if (rb_node)
+				backref_tree_panic(rb_node, -EEXIST,
+						   block->bytenr);
+		}
+		if (counted)
+			added = 1;
+		else
+			path->slots[0] = nritems;
+next:
+		path->slots[0]++;
+
+	}
+out:
+	btrfs_free_path(path);
+	return err;
+}
+
+/*
+ * helper to find all tree blocks that reference a given data extent
+ */
+static noinline_for_stack
+int add_data_references(struct reloc_control *rc,
+			struct btrfs_key *extent_key,
+			struct btrfs_path *path,
+			struct rb_root *blocks)
+{
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_extent_inline_ref *iref;
+	unsigned long ptr;
+	unsigned long end;
+	u32 blocksize = rc->extent_root->nodesize;
+	int ret = 0;
+	int err = 0;
+
+	eb = path->nodes[0];
+	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
+		ptr = end;
+	else
+#endif
+		ptr += sizeof(struct btrfs_extent_item);
+
+	while (ptr < end) {
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		key.type = btrfs_extent_inline_ref_type(eb, iref);
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+			ret = __add_tree_block(rc, key.offset, blocksize,
+					       blocks);
+		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			ret = find_data_references(rc, extent_key,
+						   eb, dref, blocks);
+		} else {
+			BUG();
+		}
+		if (ret) {
+			err = ret;
+			goto out;
+		}
+		ptr += btrfs_extent_inline_ref_size(key.type);
+	}
+	WARN_ON(ptr > end);
+
+	while (1) {
+		cond_resched();
+		eb = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			if (ret > 0)
+				break;
+			eb = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+		if (key.objectid != extent_key->objectid)
+			break;
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
+		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+#else
+		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+#endif
+			ret = __add_tree_block(rc, key.offset, blocksize,
+					       blocks);
+		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			dref = btrfs_item_ptr(eb, path->slots[0],
+					      struct btrfs_extent_data_ref);
+			ret = find_data_references(rc, extent_key,
+						   eb, dref, blocks);
+		} else {
+			ret = 0;
+		}
+		if (ret) {
+			err = ret;
+			break;
+		}
+		path->slots[0]++;
+	}
+out:
+	btrfs_release_path(path);
+	if (err)
+		free_block_list(blocks);
+	return err;
+}
+
+/*
+ * helper to find next unprocessed extent
+ */
+static noinline_for_stack
+int find_next_extent(struct btrfs_trans_handle *trans,
+		     struct reloc_control *rc, struct btrfs_path *path,
+		     struct btrfs_key *extent_key)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u64 start, end, last;
+	int ret;
+
+	last = rc->block_group->key.objectid + rc->block_group->key.offset;
+	while (1) {
+		cond_resched();
+		if (rc->search_start >= last) {
+			ret = 1;
+			break;
+		}
+
+		key.objectid = rc->search_start;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = 0;
+
+		path->search_commit_root = 1;
+		path->skip_locking = 1;
+		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
+					0, 0);
+		if (ret < 0)
+			break;
+next:
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret != 0)
+				break;
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid >= last) {
+			ret = 1;
+			break;
+		}
+
+		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+		    key.type != BTRFS_METADATA_ITEM_KEY) {
+			path->slots[0]++;
+			goto next;
+		}
+
+		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+		    key.objectid + key.offset <= rc->search_start) {
+			path->slots[0]++;
+			goto next;
+		}
+
+		if (key.type == BTRFS_METADATA_ITEM_KEY &&
+		    key.objectid + rc->extent_root->nodesize <=
+		    rc->search_start) {
+			path->slots[0]++;
+			goto next;
+		}
+
+		ret = find_first_extent_bit(&rc->processed_blocks,
+					    key.objectid, &start, &end,
+					    EXTENT_DIRTY, NULL);
+
+		if (ret == 0 && start <= key.objectid) {
+			btrfs_release_path(path);
+			rc->search_start = end + 1;
+		} else {
+			if (key.type == BTRFS_EXTENT_ITEM_KEY)
+				rc->search_start = key.objectid + key.offset;
+			else
+				rc->search_start = key.objectid +
+					rc->extent_root->nodesize;
+			memcpy(extent_key, &key, sizeof(key));
+			return 0;
+		}
+	}
+	btrfs_release_path(path);
+	return ret;
+}
+
+static void set_reloc_control(struct reloc_control *rc)
+{
+	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+
+	mutex_lock(&fs_info->reloc_mutex);
+	fs_info->reloc_ctl = rc;
+	mutex_unlock(&fs_info->reloc_mutex);
+}
+
+static void unset_reloc_control(struct reloc_control *rc)
+{
+	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+
+	mutex_lock(&fs_info->reloc_mutex);
+	fs_info->reloc_ctl = NULL;
+	mutex_unlock(&fs_info->reloc_mutex);
+}
+
+static int check_extent_flags(u64 flags)
+{
+	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+		return 1;
+	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+		return 1;
+	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+		return 1;
+	return 0;
+}
+
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+	struct btrfs_trans_handle *trans;
+
+	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+					      BTRFS_BLOCK_RSV_TEMP);
+	if (!rc->block_rsv)
+		return -ENOMEM;
+
+	memset(&rc->cluster, 0, sizeof(rc->cluster));
+	rc->search_start = rc->block_group->key.objectid;
+	rc->extents_found = 0;
+	rc->nodes_relocated = 0;
+	rc->merging_rsv_size = 0;
+	rc->reserved_bytes = 0;
+	rc->block_rsv->size = rc->extent_root->nodesize *
+			      RELOCATION_RESERVED_NODES;
+
+	rc->create_reloc_tree = 1;
+	set_reloc_control(rc);
+
+	trans = btrfs_join_transaction(rc->extent_root);
+	if (IS_ERR(trans)) {
+		unset_reloc_control(rc);
+		/*
+		 * extent tree is not a ref_cow tree and has no reloc_root to
+		 * cleanup.  And callers are responsible to free the above
+		 * block rsv.
+		 */
+		return PTR_ERR(trans);
+	}
+	btrfs_commit_transaction(trans, rc->extent_root);
+	return 0;
+}
+
+static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+{
+	struct rb_root blocks = RB_ROOT;
+	struct btrfs_key key;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	u64 flags;
+	u32 item_size;
+	int ret;
+	int err = 0;
+	int progress = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 1;
+
+	ret = prepare_to_relocate(rc);
+	if (ret) {
+		err = ret;
+		goto out_free;
+	}
+
+	while (1) {
+		rc->reserved_bytes = 0;
+		ret = btrfs_block_rsv_refill(rc->extent_root,
+					rc->block_rsv, rc->block_rsv->size,
+					BTRFS_RESERVE_FLUSH_ALL);
+		if (ret) {
+			err = ret;
+			break;
+		}
+		progress++;
+		trans = btrfs_start_transaction(rc->extent_root, 0);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+restart:
+		if (update_backref_cache(trans, &rc->backref_cache)) {
+			btrfs_end_transaction(trans, rc->extent_root);
+			continue;
+		}
+
+		ret = find_next_extent(trans, rc, path, &key);
+		if (ret < 0)
+			err = ret;
+		if (ret != 0)
+			break;
+
+		rc->extents_found++;
+
+		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_extent_item);
+		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			flags = btrfs_extent_flags(path->nodes[0], ei);
+			ret = check_extent_flags(flags);
+			BUG_ON(ret);
+
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			u64 ref_owner;
+			int path_change = 0;
+
+			BUG_ON(item_size !=
+			       sizeof(struct btrfs_extent_item_v0));
+			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
+						  &path_change);
+			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
+				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+			else
+				flags = BTRFS_EXTENT_FLAG_DATA;
+
+			if (path_change) {
+				btrfs_release_path(path);
+
+				path->search_commit_root = 1;
+				path->skip_locking = 1;
+				ret = btrfs_search_slot(NULL, rc->extent_root,
+							&key, path, 0, 0);
+				if (ret < 0) {
+					err = ret;
+					break;
+				}
+				BUG_ON(ret > 0);
+			}
+#else
+			BUG();
+#endif
+		}
+
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+			ret = add_tree_block(rc, &key, path, &blocks);
+		} else if (rc->stage == UPDATE_DATA_PTRS &&
+			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
+			ret = add_data_references(rc, &key, path, &blocks);
+		} else {
+			btrfs_release_path(path);
+			ret = 0;
+		}
+		if (ret < 0) {
+			err = ret;
+			break;
+		}
+
+		if (!RB_EMPTY_ROOT(&blocks)) {
+			ret = relocate_tree_blocks(trans, rc, &blocks);
+			if (ret < 0) {
+				/*
+				 * if we fail to relocate tree blocks, force to update
+				 * backref cache when committing transaction.
+				 */
+				rc->backref_cache.last_trans = trans->transid - 1;
+
+				if (ret != -EAGAIN) {
+					err = ret;
+					break;
+				}
+				rc->extents_found--;
+				rc->search_start = key.objectid;
+			}
+		}
+
+		btrfs_end_transaction_throttle(trans, rc->extent_root);
+		btrfs_btree_balance_dirty(rc->extent_root);
+		trans = NULL;
+
+		if (rc->stage == MOVE_DATA_EXTENTS &&
+		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
+			rc->found_file_extent = 1;
+			ret = relocate_data_extent(rc->data_inode,
+						   &key, &rc->cluster);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+		}
+	}
+	if (trans && progress && err == -ENOSPC) {
+		ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
+					      rc->block_group->flags);
+		if (ret == 0) {
+			err = 0;
+			progress = 0;
+			goto restart;
+		}
+	}
+
+	btrfs_release_path(path);
+	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+			  GFP_NOFS);
+
+	if (trans) {
+		btrfs_end_transaction_throttle(trans, rc->extent_root);
+		btrfs_btree_balance_dirty(rc->extent_root);
+	}
+
+	if (!err) {
+		ret = relocate_file_extent_cluster(rc->data_inode,
+						   &rc->cluster);
+		if (ret < 0)
+			err = ret;
+	}
+
+	rc->create_reloc_tree = 0;
+	set_reloc_control(rc);
+
+	backref_cache_cleanup(&rc->backref_cache);
+	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
+
+	err = prepare_to_merge(rc, err);
+
+	merge_reloc_roots(rc);
+
+	rc->merge_reloc_tree = 0;
+	unset_reloc_control(rc);
+	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
+
+	/* get rid of pinned extents */
+	trans = btrfs_join_transaction(rc->extent_root);
+	if (IS_ERR(trans))
+		err = PTR_ERR(trans);
+	else
+		btrfs_commit_transaction(trans, rc->extent_root);
+out_free:
+	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+	btrfs_free_path(path);
+	return err;
+}
+
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root, u64 objectid)
+{
+	struct btrfs_path *path;
+	struct btrfs_inode_item *item;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+	btrfs_set_inode_generation(leaf, item, 1);
+	btrfs_set_inode_size(leaf, item, 0);
+	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+					  BTRFS_INODE_PREALLOC);
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to create inode for data relocation.
+ * the inode is in data relocation tree and its link count is 0
+ */
+static noinline_for_stack
+struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_group_cache *group)
+{
+	struct inode *inode = NULL;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+	int err = 0;
+
+	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
+	if (IS_ERR(root))
+		return ERR_CAST(root);
+
+	trans = btrfs_start_transaction(root, 6);
+	if (IS_ERR(trans))
+		return ERR_CAST(trans);
+
+	err = btrfs_find_free_objectid(root, &objectid);
+	if (err)
+		goto out;
+
+	err = __insert_orphan_inode(trans, root, objectid);
+	BUG_ON(err);
+
+	key.objectid = objectid;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
+	BTRFS_I(inode)->index_cnt = group->key.objectid;
+
+	err = btrfs_orphan_add(trans, inode);
+out:
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
+	if (err) {
+		if (inode)
+			iput(inode);
+		inode = ERR_PTR(err);
+	}
+	return inode;
+}
+
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
+{
+	struct reloc_control *rc;
+
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (!rc)
+		return NULL;
+
+	INIT_LIST_HEAD(&rc->reloc_roots);
+	backref_cache_init(&rc->backref_cache);
+	mapping_tree_init(&rc->reloc_root_tree);
+	extent_io_tree_init(&rc->processed_blocks,
+			    fs_info->btree_inode->i_mapping);
+	return rc;
+}
+
+/*
+ * function to relocate all extents in a block group.
+ */
+int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+{
+	struct btrfs_fs_info *fs_info = extent_root->fs_info;
+	struct reloc_control *rc;
+	struct inode *inode;
+	struct btrfs_path *path;
+	int ret;
+	int rw = 0;
+	int err = 0;
+
+	rc = alloc_reloc_control(fs_info);
+	if (!rc)
+		return -ENOMEM;
+
+	rc->extent_root = extent_root;
+
+	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
+	BUG_ON(!rc->block_group);
+
+	if (!rc->block_group->ro) {
+		ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
+		if (ret) {
+			err = ret;
+			goto out;
+		}
+		rw = 1;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
+					path);
+	btrfs_free_path(path);
+
+	if (!IS_ERR(inode))
+		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
+	else
+		ret = PTR_ERR(inode);
+
+	if (ret && ret != -ENOENT) {
+		err = ret;
+		goto out;
+	}
+
+	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+	if (IS_ERR(rc->data_inode)) {
+		err = PTR_ERR(rc->data_inode);
+		rc->data_inode = NULL;
+		goto out;
+	}
+
+	btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
+	       rc->block_group->key.objectid, rc->block_group->flags);
+
+	ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	btrfs_wait_ordered_roots(fs_info, -1);
+
+	while (1) {
+		mutex_lock(&fs_info->cleaner_mutex);
+		ret = relocate_block_group(rc);
+		mutex_unlock(&fs_info->cleaner_mutex);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+
+		if (rc->extents_found == 0)
+			break;
+
+		btrfs_info(extent_root->fs_info, "found %llu extents",
+			rc->extents_found);
+
+		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
+			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+						       (u64)-1);
+			if (ret) {
+				err = ret;
+				goto out;
+			}
+			invalidate_mapping_pages(rc->data_inode->i_mapping,
+						 0, -1);
+			rc->stage = UPDATE_DATA_PTRS;
+		}
+	}
+
+	WARN_ON(rc->block_group->pinned > 0);
+	WARN_ON(rc->block_group->reserved > 0);
+	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
+out:
+	if (err && rw)
+		btrfs_set_block_group_rw(extent_root, rc->block_group);
+	iput(rc->data_inode);
+	btrfs_put_block_group(rc->block_group);
+	kfree(rc);
+	return err;
+}
+
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	int ret, err;
+
+	trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	memset(&root->root_item.drop_progress, 0,
+		sizeof(root->root_item.drop_progress));
+	root->root_item.drop_level = 0;
+	btrfs_set_root_refs(&root->root_item, 0);
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&root->root_key, &root->root_item);
+
+	err = btrfs_end_transaction(trans, root->fs_info->tree_root);
+	if (err)
+		return err;
+	return ret;
+}
+
+/*
+ * recover relocation interrupted by system crash.
+ *
+ * this function resumes merging reloc trees with corresponding fs trees.
+ * this is important for keeping the sharing of tree blocks
+ */
+int btrfs_recover_relocation(struct btrfs_root *root)
+{
+	LIST_HEAD(reloc_roots);
+	struct btrfs_key key;
+	struct btrfs_root *fs_root;
+	struct btrfs_root *reloc_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct reloc_control *rc = NULL;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = -1;
+
+	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
+					path, 0, 0);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		btrfs_release_path(path);
+
+		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
+		    key.type != BTRFS_ROOT_ITEM_KEY)
+			break;
+
+		reloc_root = btrfs_read_fs_root(root, &key);
+		if (IS_ERR(reloc_root)) {
+			err = PTR_ERR(reloc_root);
+			goto out;
+		}
+
+		list_add(&reloc_root->root_list, &reloc_roots);
+
+		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+			fs_root = read_fs_root(root->fs_info,
+					       reloc_root->root_key.offset);
+			if (IS_ERR(fs_root)) {
+				ret = PTR_ERR(fs_root);
+				if (ret != -ENOENT) {
+					err = ret;
+					goto out;
+				}
+				ret = mark_garbage_root(reloc_root);
+				if (ret < 0) {
+					err = ret;
+					goto out;
+				}
+			}
+		}
+
+		if (key.offset == 0)
+			break;
+
+		key.offset--;
+	}
+	btrfs_release_path(path);
+
+	if (list_empty(&reloc_roots))
+		goto out;
+
+	rc = alloc_reloc_control(root->fs_info);
+	if (!rc) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	rc->extent_root = root->fs_info->extent_root;
+
+	set_reloc_control(rc);
+
+	trans = btrfs_join_transaction(rc->extent_root);
+	if (IS_ERR(trans)) {
+		unset_reloc_control(rc);
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
+
+	rc->merge_reloc_tree = 1;
+
+	while (!list_empty(&reloc_roots)) {
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del(&reloc_root->root_list);
+
+		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
+			list_add_tail(&reloc_root->root_list,
+				      &rc->reloc_roots);
+			continue;
+		}
+
+		fs_root = read_fs_root(root->fs_info,
+				       reloc_root->root_key.offset);
+		if (IS_ERR(fs_root)) {
+			err = PTR_ERR(fs_root);
+			goto out_free;
+		}
+
+		err = __add_reloc_root(reloc_root);
+		BUG_ON(err < 0); /* -ENOMEM or logic error */
+		fs_root->reloc_root = reloc_root;
+	}
+
+	err = btrfs_commit_transaction(trans, rc->extent_root);
+	if (err)
+		goto out_free;
+
+	merge_reloc_roots(rc);
+
+	unset_reloc_control(rc);
+
+	trans = btrfs_join_transaction(rc->extent_root);
+	if (IS_ERR(trans))
+		err = PTR_ERR(trans);
+	else
+		err = btrfs_commit_transaction(trans, rc->extent_root);
+out_free:
+	kfree(rc);
+out:
+	if (!list_empty(&reloc_roots))
+		free_reloc_roots(&reloc_roots);
+
+	btrfs_free_path(path);
+
+	if (err == 0) {
+		/* cleanup orphan inode in data relocation tree */
+		fs_root = read_fs_root(root->fs_info,
+				       BTRFS_DATA_RELOC_TREE_OBJECTID);
+		if (IS_ERR(fs_root))
+			err = PTR_ERR(fs_root);
+		else
+			err = btrfs_orphan_cleanup(fs_root);
+	}
+	return err;
+}
+
+/*
+ * helper to add ordered checksum for data relocation.
+ *
+ * cloning checksum properly handles the nodatasum extents.
+ * it also saves CPU time to re-calculate the checksum.
+ */
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+	u64 disk_bytenr;
+	u64 new_bytenr;
+	LIST_HEAD(list);
+
+	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+
+	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+				       disk_bytenr + len - 1, &list, 0);
+	if (ret)
+		goto out;
+
+	while (!list_empty(&list)) {
+		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+		list_del_init(&sums->list);
+
+		/*
+		 * We need to offset the new_bytenr based on where the csum is.
+		 * We need to do this because we will read in entire prealloc
+		 * extents but we may have written to say the middle of the
+		 * prealloc extent, so we need to make sure the csum goes with
+		 * the right disk offset.
+		 *
+		 * We can do this because the data reloc inode refers strictly
+		 * to the on disk bytes, so we don't have to worry about
+		 * disk_len vs real len like with real inodes since it's all
+		 * disk length.
+		 */
+		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+		sums->bytenr = new_bytenr;
+
+		btrfs_add_ordered_sum(inode, ordered, sums);
+	}
+out:
+	btrfs_put_ordered_extent(ordered);
+	return ret;
+}
+
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *buf,
+			  struct extent_buffer *cow)
+{
+	struct reloc_control *rc;
+	struct backref_node *node;
+	int first_cow = 0;
+	int level;
+	int ret = 0;
+
+	rc = root->fs_info->reloc_ctl;
+	if (!rc)
+		return 0;
+
+	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
+	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (buf == root->node)
+			__update_reloc_root(root, cow->start);
+	}
+
+	level = btrfs_header_level(buf);
+	if (btrfs_header_generation(buf) <=
+	    btrfs_root_last_snapshot(&root->root_item))
+		first_cow = 1;
+
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+	    rc->create_reloc_tree) {
+		WARN_ON(!first_cow && level == 0);
+
+		node = rc->backref_cache.path[level];
+		BUG_ON(node->bytenr != buf->start &&
+		       node->new_bytenr != buf->start);
+
+		drop_node_buffer(node);
+		extent_buffer_get(cow);
+		node->eb = cow;
+		node->new_bytenr = cow->start;
+
+		if (!node->pending) {
+			list_move_tail(&node->list,
+				       &rc->backref_cache.pending[level]);
+			node->pending = 1;
+		}
+
+		if (first_cow)
+			__mark_block_processed(rc, node);
+
+		if (first_cow && level > 0)
+			rc->nodes_relocated += buf->len;
+	}
+
+	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
+		ret = replace_file_extents(trans, rc, root, cow);
+	return ret;
+}
+
+/*
+ * called before creating snapshot. it calculates metadata reservation
+ * requried for relocating tree blocks in the snapshot
+ */
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+			      struct btrfs_pending_snapshot *pending,
+			      u64 *bytes_to_reserve)
+{
+	struct btrfs_root *root;
+	struct reloc_control *rc;
+
+	root = pending->root;
+	if (!root->reloc_root)
+		return;
+
+	rc = root->fs_info->reloc_ctl;
+	if (!rc->merge_reloc_tree)
+		return;
+
+	root = root->reloc_root;
+	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
+	/*
+	 * relocation is in the stage of merging trees. the space
+	 * used by merging a reloc tree is twice the size of
+	 * relocated tree nodes in the worst case. half for cowing
+	 * the reloc tree, half for cowing the fs tree. the space
+	 * used by cowing the reloc tree will be freed after the
+	 * tree is dropped. if we create snapshot, cowing the fs
+	 * tree may use more space than it frees. so we need
+	 * reserve extra space.
+	 */
+	*bytes_to_reserve += rc->nodes_relocated;
+}
+
+/*
+ * called after snapshot is created. migrate block reservation
+ * and create reloc root for the newly created snapshot
+ */
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+			       struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_root *root = pending->root;
+	struct btrfs_root *reloc_root;
+	struct btrfs_root *new_root;
+	struct reloc_control *rc;
+	int ret;
+
+	if (!root->reloc_root)
+		return 0;
+
+	rc = root->fs_info->reloc_ctl;
+	rc->merging_rsv_size += rc->nodes_relocated;
+
+	if (rc->merge_reloc_tree) {
+		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+					      rc->block_rsv,
+					      rc->nodes_relocated);
+		if (ret)
+			return ret;
+	}
+
+	new_root = pending->snap;
+	reloc_root = create_reloc_root(trans, root->reloc_root,
+				       new_root->root_key.objectid);
+	if (IS_ERR(reloc_root))
+		return PTR_ERR(reloc_root);
+
+	ret = __add_reloc_root(reloc_root);
+	BUG_ON(ret < 0);
+	new_root->reloc_root = reloc_root;
+
+	if (rc->create_reloc_tree)
+		ret = clone_backref_node(trans, rc, root, reloc_root);
+	return ret;
+}
diff --git a/kernel/fs/btrfs/root-tree.c b/kernel/fs/btrfs/root-tree.c
new file mode 100644
index 000000000..360a728a6
--- /dev/null
+++ b/kernel/fs/btrfs/root-tree.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/err.h>
+#include <linux/uuid.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/*
+ * Read a root item from the tree. In case we detect a root item smaller then
+ * sizeof(root_item), we know it's an old version of the root structure and
+ * initialize all new fields to zero. The same happens if we detect mismatching
+ * generation numbers as then we know the root was once mounted with an older
+ * kernel that was not aware of the root item structure change.
+ */
+static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+				struct btrfs_root_item *item)
+{
+	uuid_le uuid;
+	int len;
+	int need_reset = 0;
+
+	len = btrfs_item_size_nr(eb, slot);
+	read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
+			min_t(int, len, (int)sizeof(*item)));
+	if (len < sizeof(*item))
+		need_reset = 1;
+	if (!need_reset && btrfs_root_generation(item)
+		!= btrfs_root_generation_v2(item)) {
+		if (btrfs_root_generation_v2(item) != 0) {
+			printk(KERN_WARNING "BTRFS: mismatching "
+					"generation and generation_v2 "
+					"found in root item. This root "
+					"was probably mounted with an "
+					"older kernel. Resetting all "
+					"new fields.\n");
+		}
+		need_reset = 1;
+	}
+	if (need_reset) {
+		memset(&item->generation_v2, 0,
+			sizeof(*item) - offsetof(struct btrfs_root_item,
+					generation_v2));
+
+		uuid_le_gen(&uuid);
+		memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+	}
+}
+
+/*
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the reak key of the tree we look for
+ *
+ * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
+ */
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key)
+{
+	struct btrfs_key found_key;
+	struct extent_buffer *l;
+	int ret;
+	int slot;
+
+	ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	if (search_key->offset != -1ULL) {	/* the search key is exact */
+		if (ret > 0)
+			goto out;
+	} else {
+		BUG_ON(ret == 0);		/* Logical error */
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+		ret = 0;
+	}
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+
+	btrfs_item_key_to_cpu(l, &found_key, slot);
+	if (found_key.objectid != search_key->objectid ||
+	    found_key.type != BTRFS_ROOT_ITEM_KEY) {
+		ret = 1;
+		goto out;
+	}
+
+	if (root_item)
+		btrfs_read_root_item(l, slot, root_item);
+	if (root_key)
+		memcpy(root_key, &found_key, sizeof(found_key));
+out:
+	btrfs_release_path(path);
+	return ret;
+}
+
+void btrfs_set_root_node(struct btrfs_root_item *item,
+			 struct extent_buffer *node)
+{
+	btrfs_set_root_bytenr(item, node->start);
+	btrfs_set_root_level(item, btrfs_header_level(node));
+	btrfs_set_root_generation(item, btrfs_header_generation(node));
+}
+
+/*
+ * copy the data in 'item' into the btree
+ */
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_root_item
+		      *item)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *l;
+	int ret;
+	int slot;
+	unsigned long ptr;
+	int old_len;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
+
+	if (ret != 0) {
+		btrfs_print_leaf(root, path->nodes[0]);
+		btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu",
+		       key->objectid, key->type, key->offset);
+		BUG_ON(1);
+	}
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+	ptr = btrfs_item_ptr_offset(l, slot);
+	old_len = btrfs_item_size_nr(l, slot);
+
+	/*
+	 * If this is the first time we update the root item which originated
+	 * from an older kernel, we need to enlarge the item size to make room
+	 * for the added fields.
+	 */
+	if (old_len < sizeof(*item)) {
+		btrfs_release_path(path);
+		ret = btrfs_search_slot(trans, root, key, path,
+				-1, 1);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+
+		ret = btrfs_del_item(trans, root, path);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, root, path,
+				key, sizeof(*item));
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+		l = path->nodes[0];
+		slot = path->slots[0];
+		ptr = btrfs_item_ptr_offset(l, slot);
+	}
+
+	/*
+	 * Update generation_v2 so at the next mount we know the new root
+	 * fields are valid.
+	 */
+	btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+
+	write_extent_buffer(l, item, ptr, sizeof(*item));
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      struct btrfs_key *key, struct btrfs_root_item *item)
+{
+	/*
+	 * Make sure generation v1 and v2 match. See update_root for details.
+	 */
+	btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+	return btrfs_insert_item(trans, root, key, item, sizeof(*item));
+}
+
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key root_key;
+	struct btrfs_root *root;
+	int err = 0;
+	int ret;
+	bool can_recover = true;
+
+	if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+		can_recover = false;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = 0;
+
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+		if (ret < 0) {
+			err = ret;
+			break;
+		}
+
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(tree_root, path);
+			if (ret < 0)
+				err = ret;
+			if (ret != 0)
+				break;
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		btrfs_release_path(path);
+
+		if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+		    key.type != BTRFS_ORPHAN_ITEM_KEY)
+			break;
+
+		root_key.objectid = key.offset;
+		key.offset++;
+
+		root = btrfs_read_fs_root(tree_root, &root_key);
+		err = PTR_ERR_OR_ZERO(root);
+		if (err && err != -ENOENT) {
+			break;
+		} else if (err == -ENOENT) {
+			struct btrfs_trans_handle *trans;
+
+			btrfs_release_path(path);
+
+			trans = btrfs_join_transaction(tree_root);
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
+				btrfs_error(tree_root->fs_info, err,
+					    "Failed to start trans to delete "
+					    "orphan item");
+				break;
+			}
+			err = btrfs_del_orphan_item(trans, tree_root,
+						    root_key.objectid);
+			btrfs_end_transaction(trans, tree_root);
+			if (err) {
+				btrfs_error(tree_root->fs_info, err,
+					    "Failed to delete root orphan "
+					    "item");
+				break;
+			}
+			continue;
+		}
+
+		err = btrfs_init_fs_root(root);
+		if (err) {
+			btrfs_free_fs_root(root);
+			break;
+		}
+
+		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
+
+		err = btrfs_insert_fs_root(root->fs_info, root);
+		if (err) {
+			BUG_ON(err == -EEXIST);
+			btrfs_free_fs_root(root);
+			break;
+		}
+
+		if (btrfs_root_refs(&root->root_item) == 0)
+			btrfs_add_dead_root(root);
+	}
+
+	btrfs_free_path(path);
+	return err;
+}
+
+/* drop the root item for 'key' from 'root' */
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_key *key)
+{
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	BUG_ON(ret != 0);
+
+	ret = btrfs_del_item(trans, root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *tree_root,
+		       u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+		       const char *name, int name_len)
+
+{
+	struct btrfs_path *path;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long ptr;
+	int err = 0;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = root_id;
+	key.type = BTRFS_ROOT_BACKREF_KEY;
+	key.offset = ref_id;
+again:
+	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+	BUG_ON(ret < 0);
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_root_ref);
+
+		WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+		WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+		ptr = (unsigned long)(ref + 1);
+		WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+		*sequence = btrfs_root_ref_sequence(leaf, ref);
+
+		ret = btrfs_del_item(trans, tree_root, path);
+		if (ret) {
+			err = ret;
+			goto out;
+		}
+	} else
+		err = -ENOENT;
+
+	if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+		btrfs_release_path(path);
+		key.objectid = ref_id;
+		key.type = BTRFS_ROOT_REF_KEY;
+		key.offset = root_id;
+		goto again;
+	}
+
+out:
+	btrfs_free_path(path);
+	return err;
+}
+
+/*
+ * add a btrfs_root_ref item.  type is either BTRFS_ROOT_REF_KEY
+ * or BTRFS_ROOT_BACKREF_KEY.
+ *
+ * The dirid, sequence, name and name_len refer to the directory entry
+ * that is referencing the root.
+ *
+ * For a forward ref, the root_id is the id of the tree referencing
+ * the root and ref_id is the id of the subvol  or snapshot.
+ *
+ * For a back ref the root_id is the id of the subvol or snapshot and
+ * ref_id is the id of the tree referencing it.
+ *
+ * Will return 0, -ENOMEM, or anything from the CoW path
+ */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *tree_root,
+		       u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+		       const char *name, int name_len)
+{
+	struct btrfs_key key;
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = root_id;
+	key.type = BTRFS_ROOT_BACKREF_KEY;
+	key.offset = ref_id;
+again:
+	ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
+				      sizeof(*ref) + name_len);
+	if (ret) {
+		btrfs_abort_transaction(trans, tree_root, ret);
+		btrfs_free_path(path);
+		return ret;
+	}
+
+	leaf = path->nodes[0];
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+	btrfs_set_root_ref_dirid(leaf, ref, dirid);
+	btrfs_set_root_ref_sequence(leaf, ref, sequence);
+	btrfs_set_root_ref_name_len(leaf, ref, name_len);
+	ptr = (unsigned long)(ref + 1);
+	write_extent_buffer(leaf, name, ptr, name_len);
+	btrfs_mark_buffer_dirty(leaf);
+
+	if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+		btrfs_release_path(path);
+		key.objectid = ref_id;
+		key.type = BTRFS_ROOT_REF_KEY;
+		key.offset = root_id;
+		goto again;
+	}
+
+	btrfs_free_path(path);
+	return 0;
+}
+
+/*
+ * Old btrfs forgets to init root_item->flags and root_item->byte_limit
+ * for subvolumes. To work around this problem, we steal a bit from
+ * root_item->inode_item->flags, and use it to indicate if those fields
+ * have been properly initialized.
+ */
+void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
+{
+	u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode);
+
+	if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
+		inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
+		btrfs_set_stack_inode_flags(&root_item->inode, inode_flags);
+		btrfs_set_root_flags(root_item, 0);
+		btrfs_set_root_limit(root_item, 0);
+	}
+}
+
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root)
+{
+	struct btrfs_root_item *item = &root->root_item;
+	struct timespec ct = CURRENT_TIME;
+
+	spin_lock(&root->root_item_lock);
+	btrfs_set_root_ctransid(item, trans->transid);
+	btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
+	btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
+	spin_unlock(&root->root_item_lock);
+}
diff --git a/kernel/fs/btrfs/scrub.c b/kernel/fs/btrfs/scrub.c
new file mode 100644
index 000000000..ab5811545
--- /dev/null
+++ b/kernel/fs/btrfs/scrub.c
@@ -0,0 +1,4228 @@
+/*
+ * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/ratelimit.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "ordered-data.h"
+#include "transaction.h"
+#include "backref.h"
+#include "extent_io.h"
+#include "dev-replace.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "raid56.h"
+
+/*
+ * This is only the first step towards a full-features scrub. It reads all
+ * extent and super block and verifies the checksums. In case a bad checksum
+ * is found or the extent cannot be read, good data will be written back if
+ * any can be found.
+ *
+ * Future enhancements:
+ *  - In case an unrepairable extent is encountered, track which files are
+ *    affected and report them
+ *  - track and record media errors, throw out bad devices
+ *  - add a mode to also read unallocated space
+ */
+
+struct scrub_block;
+struct scrub_ctx;
+
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
+#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
+
+struct scrub_recover {
+	atomic_t		refs;
+	struct btrfs_bio	*bbio;
+	u64			map_length;
+};
+
+struct scrub_page {
+	struct scrub_block	*sblock;
+	struct page		*page;
+	struct btrfs_device	*dev;
+	struct list_head	list;
+	u64			flags;  /* extent flags */
+	u64			generation;
+	u64			logical;
+	u64			physical;
+	u64			physical_for_dev_replace;
+	atomic_t		refs;
+	struct {
+		unsigned int	mirror_num:8;
+		unsigned int	have_csum:1;
+		unsigned int	io_error:1;
+	};
+	u8			csum[BTRFS_CSUM_SIZE];
+
+	struct scrub_recover	*recover;
+};
+
+struct scrub_bio {
+	int			index;
+	struct scrub_ctx	*sctx;
+	struct btrfs_device	*dev;
+	struct bio		*bio;
+	int			err;
+	u64			logical;
+	u64			physical;
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
+	int			page_count;
+	int			next_free;
+	struct btrfs_work	work;
+};
+
+struct scrub_block {
+	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+	int			page_count;
+	atomic_t		outstanding_pages;
+	atomic_t		refs; /* free mem on transition to zero */
+	struct scrub_ctx	*sctx;
+	struct scrub_parity	*sparity;
+	struct {
+		unsigned int	header_error:1;
+		unsigned int	checksum_error:1;
+		unsigned int	no_io_error_seen:1;
+		unsigned int	generation_error:1; /* also sets header_error */
+
+		/* The following is for the data used to check parity */
+		/* It is for the data with checksum */
+		unsigned int	data_corrected:1;
+	};
+};
+
+/* Used for the chunks with parity stripe such RAID5/6 */
+struct scrub_parity {
+	struct scrub_ctx	*sctx;
+
+	struct btrfs_device	*scrub_dev;
+
+	u64			logic_start;
+
+	u64			logic_end;
+
+	int			nsectors;
+
+	int			stripe_len;
+
+	atomic_t		refs;
+
+	struct list_head	spages;
+
+	/* Work of parity check and repair */
+	struct btrfs_work	work;
+
+	/* Mark the parity blocks which have data */
+	unsigned long		*dbitmap;
+
+	/*
+	 * Mark the parity blocks which have data, but errors happen when
+	 * read data or check data
+	 */
+	unsigned long		*ebitmap;
+
+	unsigned long		bitmap[0];
+};
+
+struct scrub_wr_ctx {
+	struct scrub_bio *wr_curr_bio;
+	struct btrfs_device *tgtdev;
+	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+	atomic_t flush_all_writes;
+	struct mutex wr_lock;
+};
+
+struct scrub_ctx {
+	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
+	struct btrfs_root	*dev_root;
+	int			first_free;
+	int			curr;
+	atomic_t		bios_in_flight;
+	atomic_t		workers_pending;
+	spinlock_t		list_lock;
+	wait_queue_head_t	list_wait;
+	u16			csum_size;
+	struct list_head	csum_list;
+	atomic_t		cancel_req;
+	int			readonly;
+	int			pages_per_rd_bio;
+	u32			sectorsize;
+	u32			nodesize;
+
+	int			is_dev_replace;
+	struct scrub_wr_ctx	wr_ctx;
+
+	/*
+	 * statistics
+	 */
+	struct btrfs_scrub_progress stat;
+	spinlock_t		stat_lock;
+
+	/*
+	 * Use a ref counter to avoid use-after-free issues. Scrub workers
+	 * decrement bios_in_flight and workers_pending and then do a wakeup
+	 * on the list_wait wait queue. We must ensure the main scrub task
+	 * doesn't free the scrub context before or while the workers are
+	 * doing the wakeup() call.
+	 */
+	atomic_t                refs;
+};
+
+struct scrub_fixup_nodatasum {
+	struct scrub_ctx	*sctx;
+	struct btrfs_device	*dev;
+	u64			logical;
+	struct btrfs_root	*root;
+	struct btrfs_work	work;
+	int			mirror_num;
+};
+
+struct scrub_nocow_inode {
+	u64			inum;
+	u64			offset;
+	u64			root;
+	struct list_head	list;
+};
+
+struct scrub_copy_nocow_ctx {
+	struct scrub_ctx	*sctx;
+	u64			logical;
+	u64			len;
+	int			mirror_num;
+	u64			physical_for_dev_replace;
+	struct list_head	inodes;
+	struct btrfs_work	work;
+};
+
+struct scrub_warning {
+	struct btrfs_path	*path;
+	u64			extent_item_size;
+	const char		*errstr;
+	sector_t		sector;
+	u64			logical;
+	struct btrfs_device	*dev;
+};
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
+				     struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+				struct scrub_block *sblock, int is_metadata,
+				int have_csum, u8 *csum, u64 generation,
+				u16 csum_size, int retry_failed_mirror);
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+					 struct scrub_block *sblock,
+					 int is_metadata, int have_csum,
+					 const u8 *csum, u64 generation,
+					 u16 csum_size);
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+					     struct scrub_block *sblock_good);
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+					    struct scrub_block *sblock_good,
+					    int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num);
+static int scrub_checksum_data(struct scrub_block *sblock);
+static int scrub_checksum_tree_block(struct scrub_block *sblock);
+static int scrub_checksum_super(struct scrub_block *sblock);
+static void scrub_block_get(struct scrub_block *sblock);
+static void scrub_block_put(struct scrub_block *sblock);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static void scrub_parity_get(struct scrub_parity *sparity);
+static void scrub_parity_put(struct scrub_parity *sparity);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+		       u64 physical, struct btrfs_device *dev, u64 flags,
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace);
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      struct scrub_copy_nocow_ctx *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+	atomic_inc(&sctx->refs);
+	atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+	atomic_dec(&sctx->bios_in_flight);
+	wake_up(&sctx->list_wait);
+	scrub_put_ctx(sctx);
+}
+
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+	while (atomic_read(&fs_info->scrub_pause_req)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+		   atomic_read(&fs_info->scrub_pause_req) == 0);
+		mutex_lock(&fs_info->scrub_lock);
+	}
+}
+
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+	atomic_inc(&fs_info->scrubs_paused);
+	wake_up(&fs_info->scrub_pause_wait);
+
+	mutex_lock(&fs_info->scrub_lock);
+	__scrub_blocked_if_needed(fs_info);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+
+	wake_up(&fs_info->scrub_pause_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	atomic_inc(&sctx->refs);
+	/*
+	 * increment scrubs_running to prevent cancel requests from
+	 * completing as long as a worker is running. we must also
+	 * increment scrubs_paused to prevent deadlocking on pause
+	 * requests used for transactions commits (as the worker uses a
+	 * transaction context). it is safe to regard the worker
+	 * as paused for all matters practical. effectively, we only
+	 * avoid cancellation requests from completing.
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_inc(&fs_info->scrubs_running);
+	atomic_inc(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+
+	/*
+	 * check if @scrubs_running=@scrubs_paused condition
+	 * inside wait_event() is not an atomic operation.
+	 * which means we may inc/dec @scrub_running/paused
+	 * at any time. Let's wake up @scrub_pause_wait as
+	 * much as we can to let commit transaction blocked less.
+	 */
+	wake_up(&fs_info->scrub_pause_wait);
+
+	atomic_inc(&sctx->workers_pending);
+}
+
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	/*
+	 * see scrub_pending_trans_workers_inc() why we're pretending
+	 * to be paused in the scrub counters
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_dec(&sctx->workers_pending);
+	wake_up(&fs_info->scrub_pause_wait);
+	wake_up(&sctx->list_wait);
+	scrub_put_ctx(sctx);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
+{
+	while (!list_empty(&sctx->csum_list)) {
+		struct btrfs_ordered_sum *sum;
+		sum = list_first_entry(&sctx->csum_list,
+				       struct btrfs_ordered_sum, list);
+		list_del(&sum->list);
+		kfree(sum);
+	}
+}
+
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
+{
+	int i;
+
+	if (!sctx)
+		return;
+
+	scrub_free_wr_ctx(&sctx->wr_ctx);
+
+	/* this can happen when scrub is cancelled */
+	if (sctx->curr != -1) {
+		struct scrub_bio *sbio = sctx->bios[sctx->curr];
+
+		for (i = 0; i < sbio->page_count; i++) {
+			WARN_ON(!sbio->pagev[i]->page);
+			scrub_block_put(sbio->pagev[i]->sblock);
+		}
+		bio_put(sbio->bio);
+	}
+
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+		struct scrub_bio *sbio = sctx->bios[i];
+
+		if (!sbio)
+			break;
+		kfree(sbio);
+	}
+
+	scrub_free_csums(sctx);
+	kfree(sctx);
+}
+
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+	if (atomic_dec_and_test(&sctx->refs))
+		scrub_free_ctx(sctx);
+}
+
+static noinline_for_stack
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
+{
+	struct scrub_ctx *sctx;
+	int		i;
+	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+	int pages_per_rd_bio;
+	int ret;
+
+	/*
+	 * the setting of pages_per_rd_bio is correct for scrub but might
+	 * be wrong for the dev_replace code where we might read from
+	 * different devices in the initial huge bios. However, that
+	 * code is able to correctly handle the case when adding a page
+	 * to a bio fails.
+	 */
+	if (dev->bdev)
+		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	else
+		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+	if (!sctx)
+		goto nomem;
+	atomic_set(&sctx->refs, 1);
+	sctx->is_dev_replace = is_dev_replace;
+	sctx->pages_per_rd_bio = pages_per_rd_bio;
+	sctx->curr = -1;
+	sctx->dev_root = dev->dev_root;
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+		struct scrub_bio *sbio;
+
+		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
+		if (!sbio)
+			goto nomem;
+		sctx->bios[i] = sbio;
+
+		sbio->index = i;
+		sbio->sctx = sctx;
+		sbio->page_count = 0;
+		btrfs_init_work(&sbio->work, btrfs_scrub_helper,
+				scrub_bio_end_io_worker, NULL, NULL);
+
+		if (i != SCRUB_BIOS_PER_SCTX - 1)
+			sctx->bios[i]->next_free = i + 1;
+		else
+			sctx->bios[i]->next_free = -1;
+	}
+	sctx->first_free = 0;
+	sctx->nodesize = dev->dev_root->nodesize;
+	sctx->sectorsize = dev->dev_root->sectorsize;
+	atomic_set(&sctx->bios_in_flight, 0);
+	atomic_set(&sctx->workers_pending, 0);
+	atomic_set(&sctx->cancel_req, 0);
+	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	INIT_LIST_HEAD(&sctx->csum_list);
+
+	spin_lock_init(&sctx->list_lock);
+	spin_lock_init(&sctx->stat_lock);
+	init_waitqueue_head(&sctx->list_wait);
+
+	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+				 fs_info->dev_replace.tgtdev, is_dev_replace);
+	if (ret) {
+		scrub_free_ctx(sctx);
+		return ERR_PTR(ret);
+	}
+	return sctx;
+
+nomem:
+	scrub_free_ctx(sctx);
+	return ERR_PTR(-ENOMEM);
+}
+
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+				     void *warn_ctx)
+{
+	u64 isize;
+	u32 nlink;
+	int ret;
+	int i;
+	struct extent_buffer *eb;
+	struct btrfs_inode_item *inode_item;
+	struct scrub_warning *swarn = warn_ctx;
+	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+	struct inode_fs_paths *ipath = NULL;
+	struct btrfs_root *local_root;
+	struct btrfs_key root_key;
+	struct btrfs_key key;
+
+	root_key.objectid = root;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(local_root)) {
+		ret = PTR_ERR(local_root);
+		goto err;
+	}
+
+	/*
+	 * this makes the path point to (inum INODE_ITEM ioff)
+	 */
+	key.objectid = inum;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
+	if (ret) {
+		btrfs_release_path(swarn->path);
+		goto err;
+	}
+
+	eb = swarn->path->nodes[0];
+	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+					struct btrfs_inode_item);
+	isize = btrfs_inode_size(eb, inode_item);
+	nlink = btrfs_inode_nlink(eb, inode_item);
+	btrfs_release_path(swarn->path);
+
+	ipath = init_ipath(4096, local_root, swarn->path);
+	if (IS_ERR(ipath)) {
+		ret = PTR_ERR(ipath);
+		ipath = NULL;
+		goto err;
+	}
+	ret = paths_from_inode(inum, ipath);
+
+	if (ret < 0)
+		goto err;
+
+	/*
+	 * we deliberately ignore the bit ipath might have been too small to
+	 * hold all of the paths here
+	 */
+	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
+			"length %llu, links %u (path: %s)\n", swarn->errstr,
+			swarn->logical, rcu_str_deref(swarn->dev->name),
+			(unsigned long long)swarn->sector, root, inum, offset,
+			min(isize - offset, (u64)PAGE_SIZE), nlink,
+			(char *)(unsigned long)ipath->fspath->val[i]);
+
+	free_ipath(ipath);
+	return 0;
+
+err:
+	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
+		"resolving failed with ret=%d\n", swarn->errstr,
+		swarn->logical, rcu_str_deref(swarn->dev->name),
+		(unsigned long long)swarn->sector, root, inum, offset, ret);
+
+	free_ipath(ipath);
+	return 0;
+}
+
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+{
+	struct btrfs_device *dev;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_path *path;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct scrub_warning swarn;
+	unsigned long ptr = 0;
+	u64 extent_item_pos;
+	u64 flags = 0;
+	u64 ref_root;
+	u32 item_size;
+	u8 ref_level;
+	int ret;
+
+	WARN_ON(sblock->page_count < 1);
+	dev = sblock->pagev[0]->dev;
+	fs_info = sblock->sctx->dev_root->fs_info;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return;
+
+	swarn.sector = (sblock->pagev[0]->physical) >> 9;
+	swarn.logical = sblock->pagev[0]->logical;
+	swarn.errstr = errstr;
+	swarn.dev = NULL;
+
+	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+				  &flags);
+	if (ret < 0)
+		goto out;
+
+	extent_item_pos = swarn.logical - found_key.objectid;
+	swarn.extent_item_size = found_key.offset;
+
+	eb = path->nodes[0];
+	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		do {
+			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+						      item_size, &ref_root,
+						      &ref_level);
+			printk_in_rcu(KERN_WARNING
+				"BTRFS: %s at logical %llu on dev %s, "
+				"sector %llu: metadata %s (level %d) in tree "
+				"%llu\n", errstr, swarn.logical,
+				rcu_str_deref(dev->name),
+				(unsigned long long)swarn.sector,
+				ref_level ? "node" : "leaf",
+				ret < 0 ? -1 : ref_level,
+				ret < 0 ? -1 : ref_root);
+		} while (ret != 1);
+		btrfs_release_path(path);
+	} else {
+		btrfs_release_path(path);
+		swarn.path = path;
+		swarn.dev = dev;
+		iterate_extent_inodes(fs_info, found_key.objectid,
+					extent_item_pos, 1,
+					scrub_print_warning_inode, &swarn);
+	}
+
+out:
+	btrfs_free_path(path);
+}
+
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
+{
+	struct page *page = NULL;
+	unsigned long index;
+	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
+	int ret;
+	int corrected = 0;
+	struct btrfs_key key;
+	struct inode *inode = NULL;
+	struct btrfs_fs_info *fs_info;
+	u64 end = offset + PAGE_SIZE - 1;
+	struct btrfs_root *local_root;
+	int srcu_index;
+
+	key.objectid = root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = fixup->root->fs_info;
+	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(local_root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+		return PTR_ERR(local_root);
+	}
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.objectid = inum;
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	index = offset >> PAGE_CACHE_SHIFT;
+
+	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (PageUptodate(page)) {
+		if (PageDirty(page)) {
+			/*
+			 * we need to write the data to the defect sector. the
+			 * data that was in that sector is not in memory,
+			 * because the page was modified. we must not write the
+			 * modified page to that sector.
+			 *
+			 * TODO: what could be done here: wait for the delalloc
+			 *       runner to write out that page (might involve
+			 *       COW) and see whether the sector is still
+			 *       referenced afterwards.
+			 *
+			 * For the meantime, we'll treat this error
+			 * incorrectable, although there is a chance that a
+			 * later scrub will find the bad sector again and that
+			 * there's no dirty page in memory, then.
+			 */
+			ret = -EIO;
+			goto out;
+		}
+		ret = repair_io_failure(inode, offset, PAGE_SIZE,
+					fixup->logical, page,
+					offset - page_offset(page),
+					fixup->mirror_num);
+		unlock_page(page);
+		corrected = !ret;
+	} else {
+		/*
+		 * we need to get good data first. the general readpage path
+		 * will call repair_io_failure for us, we just have to make
+		 * sure we read the bad mirror.
+		 */
+		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret) {
+			/* set_extent_bits should give proper error */
+			WARN_ON(ret > 0);
+			if (ret > 0)
+				ret = -EFAULT;
+			goto out;
+		}
+
+		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+						btrfs_get_extent,
+						fixup->mirror_num);
+		wait_on_page_locked(page);
+
+		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
+						end, EXTENT_DAMAGED, 0, NULL);
+		if (!corrected)
+			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+						EXTENT_DAMAGED, GFP_NOFS);
+	}
+
+out:
+	if (page)
+		put_page(page);
+
+	iput(inode);
+
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0 && corrected) {
+		/*
+		 * we only need to call readpage for one of the inodes belonging
+		 * to this extent. so make iterate_extent_inodes stop
+		 */
+		return 1;
+	}
+
+	return -EIO;
+}
+
+static void scrub_fixup_nodatasum(struct btrfs_work *work)
+{
+	int ret;
+	struct scrub_fixup_nodatasum *fixup;
+	struct scrub_ctx *sctx;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_path *path;
+	int uncorrectable = 0;
+
+	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+	sctx = fixup->sctx;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.malloc_errors;
+		spin_unlock(&sctx->stat_lock);
+		uncorrectable = 1;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(fixup->root);
+	if (IS_ERR(trans)) {
+		uncorrectable = 1;
+		goto out;
+	}
+
+	/*
+	 * the idea is to trigger a regular read through the standard path. we
+	 * read a page from the (failed) logical address by specifying the
+	 * corresponding copynum of the failed sector. thus, that readpage is
+	 * expected to fail.
+	 * that is the point where on-the-fly error correction will kick in
+	 * (once it's finished) and rewrite the failed sector if a good copy
+	 * can be found.
+	 */
+	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+						path, scrub_fixup_readpage,
+						fixup);
+	if (ret < 0) {
+		uncorrectable = 1;
+		goto out;
+	}
+	WARN_ON(ret != 1);
+
+	spin_lock(&sctx->stat_lock);
+	++sctx->stat.corrected_errors;
+	spin_unlock(&sctx->stat_lock);
+
+out:
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, fixup->root);
+	if (uncorrectable) {
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.uncorrectable_errors;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_replace_stats_inc(
+			&sctx->dev_root->fs_info->dev_replace.
+			num_uncorrectable_read_errors);
+		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+			fixup->logical, rcu_str_deref(fixup->dev->name));
+	}
+
+	btrfs_free_path(path);
+	kfree(fixup);
+
+	scrub_pending_trans_workers_dec(sctx);
+}
+
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+	atomic_inc(&recover->refs);
+}
+
+static inline void scrub_put_recover(struct scrub_recover *recover)
+{
+	if (atomic_dec_and_test(&recover->refs)) {
+		btrfs_put_bbio(recover->bbio);
+		kfree(recover);
+	}
+}
+
+/*
+ * scrub_handle_errored_block gets called when either verification of the
+ * pages failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all pages in the bio, even though only one
+ * may be bad.
+ * The goal of this function is to repair the errored block by using the
+ * contents of one of the mirrors.
+ */
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
+{
+	struct scrub_ctx *sctx = sblock_to_check->sctx;
+	struct btrfs_device *dev;
+	struct btrfs_fs_info *fs_info;
+	u64 length;
+	u64 logical;
+	u64 generation;
+	unsigned int failed_mirror_index;
+	unsigned int is_metadata;
+	unsigned int have_csum;
+	u8 *csum;
+	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+	struct scrub_block *sblock_bad;
+	int ret;
+	int mirror_index;
+	int page_num;
+	int success;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	BUG_ON(sblock_to_check->page_count < 1);
+	fs_info = sctx->dev_root->fs_info;
+	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+		/*
+		 * if we find an error in a super block, we just report it.
+		 * They will get written with the next transaction commit
+		 * anyway
+		 */
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+		return 0;
+	}
+	length = sblock_to_check->page_count * PAGE_SIZE;
+	logical = sblock_to_check->pagev[0]->logical;
+	generation = sblock_to_check->pagev[0]->generation;
+	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+	is_metadata = !(sblock_to_check->pagev[0]->flags &
+			BTRFS_EXTENT_FLAG_DATA);
+	have_csum = sblock_to_check->pagev[0]->have_csum;
+	csum = sblock_to_check->pagev[0]->csum;
+	dev = sblock_to_check->pagev[0]->dev;
+
+	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+		sblocks_for_recheck = NULL;
+		goto nodatasum_case;
+	}
+
+	/*
+	 * read all mirrors one after the other. This includes to
+	 * re-read the extent or metadata block that failed (that was
+	 * the cause that this fixup code is called) another time,
+	 * page by page this time in order to know which pages
+	 * caused I/O errors and which ones are good (for all mirrors).
+	 * It is the goal to handle the situation when more than one
+	 * mirror contains I/O errors, but the errors do not
+	 * overlap, i.e. the data can be repaired by selecting the
+	 * pages from those mirrors without I/O error on the
+	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
+	 * would be that mirror #1 has an I/O error on the first page,
+	 * the second page is good, and mirror #2 has an I/O error on
+	 * the second page, but the first page is good.
+	 * Then the first page of the first mirror can be repaired by
+	 * taking the first page of the second mirror, and the
+	 * second page of the second mirror can be repaired by
+	 * copying the contents of the 2nd page of the 1st mirror.
+	 * One more note: if the pages of one mirror contain I/O
+	 * errors, the checksum cannot be verified. In order to get
+	 * the best data for repairing, the first attempt is to find
+	 * a mirror without I/O errors and with a validated checksum.
+	 * Only if this is not possible, the pages are picked from
+	 * mirrors with I/O errors without considering the checksum.
+	 * If the latter is the case, at the end, the checksum of the
+	 * repaired area is verified in order to correctly maintain
+	 * the statistics.
+	 */
+
+	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
+				      sizeof(*sblocks_for_recheck), GFP_NOFS);
+	if (!sblocks_for_recheck) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+		goto out;
+	}
+
+	/* setup the context, map the logical blocks and alloc the pages */
+	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
+	if (ret) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+		goto out;
+	}
+	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+	sblock_bad = sblocks_for_recheck + failed_mirror_index;
+
+	/* build and submit the bios for the failed mirror, check checksums */
+	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+			    csum, generation, sctx->csum_size, 1);
+
+	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+	    sblock_bad->no_io_error_seen) {
+		/*
+		 * the error disappeared after reading page by page, or
+		 * the area was part of a huge bio and other parts of the
+		 * bio caused I/O errors, or the block layer merged several
+		 * read requests into one and the error is caused by a
+		 * different bio (usually one of the two latter cases is
+		 * the cause)
+		 */
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.unverified_errors++;
+		sblock_to_check->data_corrected = 1;
+		spin_unlock(&sctx->stat_lock);
+
+		if (sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock_bad);
+		goto out;
+	}
+
+	if (!sblock_bad->no_io_error_seen) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		spin_unlock(&sctx->stat_lock);
+		if (__ratelimit(&_rs))
+			scrub_print_warning("i/o error", sblock_to_check);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+	} else if (sblock_bad->checksum_error) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.csum_errors++;
+		spin_unlock(&sctx->stat_lock);
+		if (__ratelimit(&_rs))
+			scrub_print_warning("checksum error", sblock_to_check);
+		btrfs_dev_stat_inc_and_print(dev,
+					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
+	} else if (sblock_bad->header_error) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.verify_errors++;
+		spin_unlock(&sctx->stat_lock);
+		if (__ratelimit(&_rs))
+			scrub_print_warning("checksum/header error",
+					    sblock_to_check);
+		if (sblock_bad->generation_error)
+			btrfs_dev_stat_inc_and_print(dev,
+				BTRFS_DEV_STAT_GENERATION_ERRS);
+		else
+			btrfs_dev_stat_inc_and_print(dev,
+				BTRFS_DEV_STAT_CORRUPTION_ERRS);
+	}
+
+	if (sctx->readonly) {
+		ASSERT(!sctx->is_dev_replace);
+		goto out;
+	}
+
+	if (!is_metadata && !have_csum) {
+		struct scrub_fixup_nodatasum *fixup_nodatasum;
+
+		WARN_ON(sctx->is_dev_replace);
+
+nodatasum_case:
+
+		/*
+		 * !is_metadata and !have_csum, this means that the data
+		 * might not be COW'ed, that it might be modified
+		 * concurrently. The general strategy to work on the
+		 * commit root does not help in the case when COW is not
+		 * used.
+		 */
+		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
+		if (!fixup_nodatasum)
+			goto did_not_correct_error;
+		fixup_nodatasum->sctx = sctx;
+		fixup_nodatasum->dev = dev;
+		fixup_nodatasum->logical = logical;
+		fixup_nodatasum->root = fs_info->extent_root;
+		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
+		scrub_pending_trans_workers_inc(sctx);
+		btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
+				scrub_fixup_nodatasum, NULL, NULL);
+		btrfs_queue_work(fs_info->scrub_workers,
+				 &fixup_nodatasum->work);
+		goto out;
+	}
+
+	/*
+	 * now build and submit the bios for the other mirrors, check
+	 * checksums.
+	 * First try to pick the mirror which is completely without I/O
+	 * errors and also does not have a checksum error.
+	 * If one is found, and if a checksum is present, the full block
+	 * that is known to contain an error is rewritten. Afterwards
+	 * the block is known to be corrected.
+	 * If a mirror is found which is completely correct, and no
+	 * checksum is present, only those pages are rewritten that had
+	 * an I/O error in the block to be repaired, since it cannot be
+	 * determined, which copy of the other pages is better (and it
+	 * could happen otherwise that a correct page would be
+	 * overwritten by a bad one).
+	 */
+	for (mirror_index = 0;
+	     mirror_index < BTRFS_MAX_MIRRORS &&
+	     sblocks_for_recheck[mirror_index].page_count > 0;
+	     mirror_index++) {
+		struct scrub_block *sblock_other;
+
+		if (mirror_index == failed_mirror_index)
+			continue;
+		sblock_other = sblocks_for_recheck + mirror_index;
+
+		/* build and submit the bios, check checksums */
+		scrub_recheck_block(fs_info, sblock_other, is_metadata,
+				    have_csum, csum, generation,
+				    sctx->csum_size, 0);
+
+		if (!sblock_other->header_error &&
+		    !sblock_other->checksum_error &&
+		    sblock_other->no_io_error_seen) {
+			if (sctx->is_dev_replace) {
+				scrub_write_block_to_dev_replace(sblock_other);
+				goto corrected_error;
+			} else {
+				ret = scrub_repair_block_from_good_copy(
+						sblock_bad, sblock_other);
+				if (!ret)
+					goto corrected_error;
+			}
+		}
+	}
+
+	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
+		goto did_not_correct_error;
+
+	/*
+	 * In case of I/O errors in the area that is supposed to be
+	 * repaired, continue by picking good copies of those pages.
+	 * Select the good pages from mirrors to rewrite bad pages from
+	 * the area to fix. Afterwards verify the checksum of the block
+	 * that is supposed to be repaired. This verification step is
+	 * only done for the purpose of statistic counting and for the
+	 * final scrub report, whether errors remain.
+	 * A perfect algorithm could make use of the checksum and try
+	 * all possible combinations of pages from the different mirrors
+	 * until the checksum verification succeeds. For example, when
+	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+	 * of mirror #2 is readable but the final checksum test fails,
+	 * then the 2nd page of mirror #3 could be tried, whether now
+	 * the final checksum succeedes. But this would be a rare
+	 * exception and is therefore not implemented. At least it is
+	 * avoided that the good copy is overwritten.
+	 * A more useful improvement would be to pick the sectors
+	 * without I/O error based on sector sizes (512 bytes on legacy
+	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+	 * mirror could be repaired by taking 512 byte of a different
+	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+	 * area are unreadable.
+	 */
+	success = 1;
+	for (page_num = 0; page_num < sblock_bad->page_count;
+	     page_num++) {
+		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+		struct scrub_block *sblock_other = NULL;
+
+		/* skip no-io-error page in scrub */
+		if (!page_bad->io_error && !sctx->is_dev_replace)
+			continue;
+
+		/* try to find no-io-error page in mirrors */
+		if (page_bad->io_error) {
+			for (mirror_index = 0;
+			     mirror_index < BTRFS_MAX_MIRRORS &&
+			     sblocks_for_recheck[mirror_index].page_count > 0;
+			     mirror_index++) {
+				if (!sblocks_for_recheck[mirror_index].
+				    pagev[page_num]->io_error) {
+					sblock_other = sblocks_for_recheck +
+						       mirror_index;
+					break;
+				}
+			}
+			if (!sblock_other)
+				success = 0;
+		}
+
+		if (sctx->is_dev_replace) {
+			/*
+			 * did not find a mirror to fetch the page
+			 * from. scrub_write_page_to_dev_replace()
+			 * handles this case (page->io_error), by
+			 * filling the block with zeros before
+			 * submitting the write request
+			 */
+			if (!sblock_other)
+				sblock_other = sblock_bad;
+
+			if (scrub_write_page_to_dev_replace(sblock_other,
+							    page_num) != 0) {
+				btrfs_dev_replace_stats_inc(
+					&sctx->dev_root->
+					fs_info->dev_replace.
+					num_write_errors);
+				success = 0;
+			}
+		} else if (sblock_other) {
+			ret = scrub_repair_page_from_good_copy(sblock_bad,
+							       sblock_other,
+							       page_num, 0);
+			if (0 == ret)
+				page_bad->io_error = 0;
+			else
+				success = 0;
+		}
+	}
+
+	if (success && !sctx->is_dev_replace) {
+		if (is_metadata || have_csum) {
+			/*
+			 * need to verify the checksum now that all
+			 * sectors on disk are repaired (the write
+			 * request for data to be repaired is on its way).
+			 * Just be lazy and use scrub_recheck_block()
+			 * which re-reads the data before the checksum
+			 * is verified, but most likely the data comes out
+			 * of the page cache.
+			 */
+			scrub_recheck_block(fs_info, sblock_bad,
+					    is_metadata, have_csum, csum,
+					    generation, sctx->csum_size, 1);
+			if (!sblock_bad->header_error &&
+			    !sblock_bad->checksum_error &&
+			    sblock_bad->no_io_error_seen)
+				goto corrected_error;
+			else
+				goto did_not_correct_error;
+		} else {
+corrected_error:
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.corrected_errors++;
+			sblock_to_check->data_corrected = 1;
+			spin_unlock(&sctx->stat_lock);
+			printk_ratelimited_in_rcu(KERN_ERR
+				"BTRFS: fixed up error at logical %llu on dev %s\n",
+				logical, rcu_str_deref(dev->name));
+		}
+	} else {
+did_not_correct_error:
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		printk_ratelimited_in_rcu(KERN_ERR
+			"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+			logical, rcu_str_deref(dev->name));
+	}
+
+out:
+	if (sblocks_for_recheck) {
+		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
+		     mirror_index++) {
+			struct scrub_block *sblock = sblocks_for_recheck +
+						     mirror_index;
+			struct scrub_recover *recover;
+			int page_index;
+
+			for (page_index = 0; page_index < sblock->page_count;
+			     page_index++) {
+				sblock->pagev[page_index]->sblock = NULL;
+				recover = sblock->pagev[page_index]->recover;
+				if (recover) {
+					scrub_put_recover(recover);
+					sblock->pagev[page_index]->recover =
+									NULL;
+				}
+				scrub_page_put(sblock->pagev[page_index]);
+			}
+		}
+		kfree(sblocks_for_recheck);
+	}
+
+	return 0;
+}
+
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+{
+	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+		return 2;
+	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+		return 3;
+	else
+		return (int)bbio->num_stripes;
+}
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+						 u64 *raid_map,
+						 u64 mapped_length,
+						 int nstripes, int mirror,
+						 int *stripe_index,
+						 u64 *stripe_offset)
+{
+	int i;
+
+	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		/* RAID5/6 */
+		for (i = 0; i < nstripes; i++) {
+			if (raid_map[i] == RAID6_Q_STRIPE ||
+			    raid_map[i] == RAID5_P_STRIPE)
+				continue;
+
+			if (logical >= raid_map[i] &&
+			    logical < raid_map[i] + mapped_length)
+				break;
+		}
+
+		*stripe_index = i;
+		*stripe_offset = logical - raid_map[i];
+	} else {
+		/* The other RAID type */
+		*stripe_index = mirror;
+		*stripe_offset = 0;
+	}
+}
+
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
+				     struct scrub_block *sblocks_for_recheck)
+{
+	struct scrub_ctx *sctx = original_sblock->sctx;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+	u64 length = original_sblock->page_count * PAGE_SIZE;
+	u64 logical = original_sblock->pagev[0]->logical;
+	struct scrub_recover *recover;
+	struct btrfs_bio *bbio;
+	u64 sublen;
+	u64 mapped_length;
+	u64 stripe_offset;
+	int stripe_index;
+	int page_index = 0;
+	int mirror_index;
+	int nmirrors;
+	int ret;
+
+	/*
+	 * note: the two members refs and outstanding_pages
+	 * are not used (and not set) in the blocks that are used for
+	 * the recheck procedure
+	 */
+
+	while (length > 0) {
+		sublen = min_t(u64, length, PAGE_SIZE);
+		mapped_length = sublen;
+		bbio = NULL;
+
+		/*
+		 * with a length of PAGE_SIZE, each returned stripe
+		 * represents one mirror
+		 */
+		ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
+				       &mapped_length, &bbio, 0, 1);
+		if (ret || !bbio || mapped_length < sublen) {
+			btrfs_put_bbio(bbio);
+			return -EIO;
+		}
+
+		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+		if (!recover) {
+			btrfs_put_bbio(bbio);
+			return -ENOMEM;
+		}
+
+		atomic_set(&recover->refs, 1);
+		recover->bbio = bbio;
+		recover->map_length = mapped_length;
+
+		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
+
+		nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+
+		for (mirror_index = 0; mirror_index < nmirrors;
+		     mirror_index++) {
+			struct scrub_block *sblock;
+			struct scrub_page *page;
+
+			sblock = sblocks_for_recheck + mirror_index;
+			sblock->sctx = sctx;
+			page = kzalloc(sizeof(*page), GFP_NOFS);
+			if (!page) {
+leave_nomem:
+				spin_lock(&sctx->stat_lock);
+				sctx->stat.malloc_errors++;
+				spin_unlock(&sctx->stat_lock);
+				scrub_put_recover(recover);
+				return -ENOMEM;
+			}
+			scrub_page_get(page);
+			sblock->pagev[page_index] = page;
+			page->logical = logical;
+
+			scrub_stripe_index_and_offset(logical,
+						      bbio->map_type,
+						      bbio->raid_map,
+						      mapped_length,
+						      bbio->num_stripes -
+						      bbio->num_tgtdevs,
+						      mirror_index,
+						      &stripe_index,
+						      &stripe_offset);
+			page->physical = bbio->stripes[stripe_index].physical +
+					 stripe_offset;
+			page->dev = bbio->stripes[stripe_index].dev;
+
+			BUG_ON(page_index >= original_sblock->page_count);
+			page->physical_for_dev_replace =
+				original_sblock->pagev[page_index]->
+				physical_for_dev_replace;
+			/* for missing devices, dev->bdev is NULL */
+			page->mirror_num = mirror_index + 1;
+			sblock->page_count++;
+			page->page = alloc_page(GFP_NOFS);
+			if (!page->page)
+				goto leave_nomem;
+
+			scrub_get_recover(recover);
+			page->recover = recover;
+		}
+		scrub_put_recover(recover);
+		length -= sublen;
+		logical += sublen;
+		page_index++;
+	}
+
+	return 0;
+}
+
+struct scrub_bio_ret {
+	struct completion event;
+	int error;
+};
+
+static void scrub_bio_wait_endio(struct bio *bio, int error)
+{
+	struct scrub_bio_ret *ret = bio->bi_private;
+
+	ret->error = error;
+	complete(&ret->event);
+}
+
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+	return page->recover &&
+	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+					struct bio *bio,
+					struct scrub_page *page)
+{
+	struct scrub_bio_ret done;
+	int ret;
+
+	init_completion(&done.event);
+	done.error = 0;
+	bio->bi_iter.bi_sector = page->logical >> 9;
+	bio->bi_private = &done;
+	bio->bi_end_io = scrub_bio_wait_endio;
+
+	ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
+				    page->recover->map_length,
+				    page->mirror_num, 0);
+	if (ret)
+		return ret;
+
+	wait_for_completion(&done.event);
+	if (done.error)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * this function will check the on disk data for checksum errors, header
+ * errors and read I/O errors. If any I/O errors happen, the exact pages
+ * which are errored are marked as being bad. The goal is to enable scrub
+ * to take those pages that are not errored from all the mirrors so that
+ * the pages that are errored in the just handled mirror can be repaired.
+ */
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+				struct scrub_block *sblock, int is_metadata,
+				int have_csum, u8 *csum, u64 generation,
+				u16 csum_size, int retry_failed_mirror)
+{
+	int page_num;
+
+	sblock->no_io_error_seen = 1;
+	sblock->header_error = 0;
+	sblock->checksum_error = 0;
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		struct bio *bio;
+		struct scrub_page *page = sblock->pagev[page_num];
+
+		if (page->dev->bdev == NULL) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
+			continue;
+		}
+
+		WARN_ON(!page->page);
+		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+		if (!bio) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
+			continue;
+		}
+		bio->bi_bdev = page->dev->bdev;
+
+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
+		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
+			if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+				sblock->no_io_error_seen = 0;
+		} else {
+			bio->bi_iter.bi_sector = page->physical >> 9;
+
+			if (btrfsic_submit_bio_wait(READ, bio))
+				sblock->no_io_error_seen = 0;
+		}
+
+		bio_put(bio);
+	}
+
+	if (sblock->no_io_error_seen)
+		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+					     have_csum, csum, generation,
+					     csum_size);
+
+	return;
+}
+
+static inline int scrub_check_fsid(u8 fsid[],
+				   struct scrub_page *spage)
+{
+	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+	int ret;
+
+	ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
+	return !ret;
+}
+
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+					 struct scrub_block *sblock,
+					 int is_metadata, int have_csum,
+					 const u8 *csum, u64 generation,
+					 u16 csum_size)
+{
+	int page_num;
+	u8 calculated_csum[BTRFS_CSUM_SIZE];
+	u32 crc = ~(u32)0;
+	void *mapped_buffer;
+
+	WARN_ON(!sblock->pagev[0]->page);
+	if (is_metadata) {
+		struct btrfs_header *h;
+
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
+		h = (struct btrfs_header *)mapped_buffer;
+
+		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
+		    !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
+		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+			   BTRFS_UUID_SIZE)) {
+			sblock->header_error = 1;
+		} else if (generation != btrfs_stack_header_generation(h)) {
+			sblock->header_error = 1;
+			sblock->generation_error = 1;
+		}
+		csum = h->csum;
+	} else {
+		if (!have_csum)
+			return;
+
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
+	}
+
+	for (page_num = 0;;) {
+		if (page_num == 0 && is_metadata)
+			crc = btrfs_csum_data(
+				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
+				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
+		else
+			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
+
+		kunmap_atomic(mapped_buffer);
+		page_num++;
+		if (page_num >= sblock->page_count)
+			break;
+		WARN_ON(!sblock->pagev[page_num]->page);
+
+		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
+	}
+
+	btrfs_csum_final(crc, calculated_csum);
+	if (memcmp(calculated_csum, csum, csum_size))
+		sblock->checksum_error = 1;
+}
+
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+					     struct scrub_block *sblock_good)
+{
+	int page_num;
+	int ret = 0;
+
+	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+		int ret_sub;
+
+		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
+							   sblock_good,
+							   page_num, 1);
+		if (ret_sub)
+			ret = ret_sub;
+	}
+
+	return ret;
+}
+
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+					    struct scrub_block *sblock_good,
+					    int page_num, int force_write)
+{
+	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+	struct scrub_page *page_good = sblock_good->pagev[page_num];
+
+	BUG_ON(page_bad->page == NULL);
+	BUG_ON(page_good->page == NULL);
+	if (force_write || sblock_bad->header_error ||
+	    sblock_bad->checksum_error || page_bad->io_error) {
+		struct bio *bio;
+		int ret;
+
+		if (!page_bad->dev->bdev) {
+			printk_ratelimited(KERN_WARNING "BTRFS: "
+				"scrub_repair_page_from_good_copy(bdev == NULL) "
+				"is unexpected!\n");
+			return -EIO;
+		}
+
+		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+		if (!bio)
+			return -EIO;
+		bio->bi_bdev = page_bad->dev->bdev;
+		bio->bi_iter.bi_sector = page_bad->physical >> 9;
+
+		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+		if (PAGE_SIZE != ret) {
+			bio_put(bio);
+			return -EIO;
+		}
+
+		if (btrfsic_submit_bio_wait(WRITE, bio)) {
+			btrfs_dev_stat_inc_and_print(page_bad->dev,
+				BTRFS_DEV_STAT_WRITE_ERRS);
+			btrfs_dev_replace_stats_inc(
+				&sblock_bad->sctx->dev_root->fs_info->
+				dev_replace.num_write_errors);
+			bio_put(bio);
+			return -EIO;
+		}
+		bio_put(bio);
+	}
+
+	return 0;
+}
+
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+	int page_num;
+
+	/*
+	 * This block is used for the check of the parity on the source device,
+	 * so the data needn't be written into the destination device.
+	 */
+	if (sblock->sparity)
+		return;
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		int ret;
+
+		ret = scrub_write_page_to_dev_replace(sblock, page_num);
+		if (ret)
+			btrfs_dev_replace_stats_inc(
+				&sblock->sctx->dev_root->fs_info->dev_replace.
+				num_write_errors);
+	}
+}
+
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num)
+{
+	struct scrub_page *spage = sblock->pagev[page_num];
+
+	BUG_ON(spage->page == NULL);
+	if (spage->io_error) {
+		void *mapped_buffer = kmap_atomic(spage->page);
+
+		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+		flush_dcache_page(spage->page);
+		kunmap_atomic(mapped_buffer);
+	}
+	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+	int ret;
+
+	mutex_lock(&wr_ctx->wr_lock);
+again:
+	if (!wr_ctx->wr_curr_bio) {
+		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+					      GFP_NOFS);
+		if (!wr_ctx->wr_curr_bio) {
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -ENOMEM;
+		}
+		wr_ctx->wr_curr_bio->sctx = sctx;
+		wr_ctx->wr_curr_bio->page_count = 0;
+	}
+	sbio = wr_ctx->wr_curr_bio;
+	if (sbio->page_count == 0) {
+		struct bio *bio;
+
+		sbio->physical = spage->physical_for_dev_replace;
+		sbio->logical = spage->logical;
+		sbio->dev = wr_ctx->tgtdev;
+		bio = sbio->bio;
+		if (!bio) {
+			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+			if (!bio) {
+				mutex_unlock(&wr_ctx->wr_lock);
+				return -ENOMEM;
+			}
+			sbio->bio = bio;
+		}
+
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_wr_bio_end_io;
+		bio->bi_bdev = sbio->dev->bdev;
+		bio->bi_iter.bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+		   spage->physical_for_dev_replace ||
+		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   spage->logical) {
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+	if (ret != PAGE_SIZE) {
+		if (sbio->page_count < 1) {
+			bio_put(sbio->bio);
+			sbio->bio = NULL;
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -EIO;
+		}
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	sbio->pagev[sbio->page_count] = spage;
+	scrub_page_get(spage);
+	sbio->page_count++;
+	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+		scrub_wr_submit(sctx);
+	mutex_unlock(&wr_ctx->wr_lock);
+
+	return 0;
+}
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+
+	if (!wr_ctx->wr_curr_bio)
+		return;
+
+	sbio = wr_ctx->wr_curr_bio;
+	wr_ctx->wr_curr_bio = NULL;
+	WARN_ON(!sbio->bio->bi_bdev);
+	scrub_pending_bio_inc(sctx);
+	/* process all writes in a single worker thread. Then the block layer
+	 * orders the requests before sending them to the driver which
+	 * doubled the write performance on spinning disks when measured
+	 * with Linux 3.5 */
+	btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
+{
+	struct scrub_bio *sbio = bio->bi_private;
+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+	sbio->err = err;
+	sbio->bio = bio;
+
+	btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
+			 scrub_wr_bio_end_io_worker, NULL, NULL);
+	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+{
+	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+	struct scrub_ctx *sctx = sbio->sctx;
+	int i;
+
+	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+	if (sbio->err) {
+		struct btrfs_dev_replace *dev_replace =
+			&sbio->sctx->dev_root->fs_info->dev_replace;
+
+		for (i = 0; i < sbio->page_count; i++) {
+			struct scrub_page *spage = sbio->pagev[i];
+
+			spage->io_error = 1;
+			btrfs_dev_replace_stats_inc(&dev_replace->
+						    num_write_errors);
+		}
+	}
+
+	for (i = 0; i < sbio->page_count; i++)
+		scrub_page_put(sbio->pagev[i]);
+
+	bio_put(sbio->bio);
+	kfree(sbio);
+	scrub_pending_bio_dec(sctx);
+}
+
+static int scrub_checksum(struct scrub_block *sblock)
+{
+	u64 flags;
+	int ret;
+
+	WARN_ON(sblock->page_count < 1);
+	flags = sblock->pagev[0]->flags;
+	ret = 0;
+	if (flags & BTRFS_EXTENT_FLAG_DATA)
+		ret = scrub_checksum_data(sblock);
+	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		ret = scrub_checksum_tree_block(sblock);
+	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+		(void)scrub_checksum_super(sblock);
+	else
+		WARN_ON(1);
+	if (ret)
+		scrub_handle_errored_block(sblock);
+
+	return ret;
+}
+
+static int scrub_checksum_data(struct scrub_block *sblock)
+{
+	struct scrub_ctx *sctx = sblock->sctx;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u8 *on_disk_csum;
+	struct page *page;
+	void *buffer;
+	u32 crc = ~(u32)0;
+	int fail = 0;
+	u64 len;
+	int index;
+
+	BUG_ON(sblock->page_count < 1);
+	if (!sblock->pagev[0]->have_csum)
+		return 0;
+
+	on_disk_csum = sblock->pagev[0]->csum;
+	page = sblock->pagev[0]->page;
+	buffer = kmap_atomic(page);
+
+	len = sctx->sectorsize;
+	index = 0;
+	for (;;) {
+		u64 l = min_t(u64, len, PAGE_SIZE);
+
+		crc = btrfs_csum_data(buffer, crc, l);
+		kunmap_atomic(buffer);
+		len -= l;
+		if (len == 0)
+			break;
+		index++;
+		BUG_ON(index >= sblock->page_count);
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
+		buffer = kmap_atomic(page);
+	}
+
+	btrfs_csum_final(crc, csum);
+	if (memcmp(csum, on_disk_csum, sctx->csum_size))
+		fail = 1;
+
+	return fail;
+}
+
+static int scrub_checksum_tree_block(struct scrub_block *sblock)
+{
+	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_header *h;
+	struct btrfs_root *root = sctx->dev_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u8 calculated_csum[BTRFS_CSUM_SIZE];
+	u8 on_disk_csum[BTRFS_CSUM_SIZE];
+	struct page *page;
+	void *mapped_buffer;
+	u64 mapped_size;
+	void *p;
+	u32 crc = ~(u32)0;
+	int fail = 0;
+	int crc_fail = 0;
+	u64 len;
+	int index;
+
+	BUG_ON(sblock->page_count < 1);
+	page = sblock->pagev[0]->page;
+	mapped_buffer = kmap_atomic(page);
+	h = (struct btrfs_header *)mapped_buffer;
+	memcpy(on_disk_csum, h->csum, sctx->csum_size);
+
+	/*
+	 * we don't use the getter functions here, as we
+	 * a) don't have an extent buffer and
+	 * b) the page is already kmapped
+	 */
+
+	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+		++fail;
+
+	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
+		++fail;
+
+	if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
+		++fail;
+
+	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+		   BTRFS_UUID_SIZE))
+		++fail;
+
+	len = sctx->nodesize - BTRFS_CSUM_SIZE;
+	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+	index = 0;
+	for (;;) {
+		u64 l = min_t(u64, len, mapped_size);
+
+		crc = btrfs_csum_data(p, crc, l);
+		kunmap_atomic(mapped_buffer);
+		len -= l;
+		if (len == 0)
+			break;
+		index++;
+		BUG_ON(index >= sblock->page_count);
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
+		mapped_buffer = kmap_atomic(page);
+		mapped_size = PAGE_SIZE;
+		p = mapped_buffer;
+	}
+
+	btrfs_csum_final(crc, calculated_csum);
+	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+		++crc_fail;
+
+	return fail || crc_fail;
+}
+
+static int scrub_checksum_super(struct scrub_block *sblock)
+{
+	struct btrfs_super_block *s;
+	struct scrub_ctx *sctx = sblock->sctx;
+	u8 calculated_csum[BTRFS_CSUM_SIZE];
+	u8 on_disk_csum[BTRFS_CSUM_SIZE];
+	struct page *page;
+	void *mapped_buffer;
+	u64 mapped_size;
+	void *p;
+	u32 crc = ~(u32)0;
+	int fail_gen = 0;
+	int fail_cor = 0;
+	u64 len;
+	int index;
+
+	BUG_ON(sblock->page_count < 1);
+	page = sblock->pagev[0]->page;
+	mapped_buffer = kmap_atomic(page);
+	s = (struct btrfs_super_block *)mapped_buffer;
+	memcpy(on_disk_csum, s->csum, sctx->csum_size);
+
+	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+		++fail_cor;
+
+	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+		++fail_gen;
+
+	if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
+		++fail_cor;
+
+	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
+	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+	index = 0;
+	for (;;) {
+		u64 l = min_t(u64, len, mapped_size);
+
+		crc = btrfs_csum_data(p, crc, l);
+		kunmap_atomic(mapped_buffer);
+		len -= l;
+		if (len == 0)
+			break;
+		index++;
+		BUG_ON(index >= sblock->page_count);
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
+		mapped_buffer = kmap_atomic(page);
+		mapped_size = PAGE_SIZE;
+		p = mapped_buffer;
+	}
+
+	btrfs_csum_final(crc, calculated_csum);
+	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+		++fail_cor;
+
+	if (fail_cor + fail_gen) {
+		/*
+		 * if we find an error in a super block, we just report it.
+		 * They will get written with the next transaction commit
+		 * anyway
+		 */
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+		if (fail_cor)
+			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+				BTRFS_DEV_STAT_CORRUPTION_ERRS);
+		else
+			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+				BTRFS_DEV_STAT_GENERATION_ERRS);
+	}
+
+	return fail_cor + fail_gen;
+}
+
+static void scrub_block_get(struct scrub_block *sblock)
+{
+	atomic_inc(&sblock->refs);
+}
+
+static void scrub_block_put(struct scrub_block *sblock)
+{
+	if (atomic_dec_and_test(&sblock->refs)) {
+		int i;
+
+		if (sblock->sparity)
+			scrub_parity_put(sblock->sparity);
+
+		for (i = 0; i < sblock->page_count; i++)
+			scrub_page_put(sblock->pagev[i]);
+		kfree(sblock);
+	}
+}
+
+static void scrub_page_get(struct scrub_page *spage)
+{
+	atomic_inc(&spage->refs);
+}
+
+static void scrub_page_put(struct scrub_page *spage)
+{
+	if (atomic_dec_and_test(&spage->refs)) {
+		if (spage->page)
+			__free_page(spage->page);
+		kfree(spage);
+	}
+}
+
+static void scrub_submit(struct scrub_ctx *sctx)
+{
+	struct scrub_bio *sbio;
+
+	if (sctx->curr == -1)
+		return;
+
+	sbio = sctx->bios[sctx->curr];
+	sctx->curr = -1;
+	scrub_pending_bio_inc(sctx);
+
+	if (!sbio->bio->bi_bdev) {
+		/*
+		 * this case should not happen. If btrfs_map_block() is
+		 * wrong, it could happen for dev-replace operations on
+		 * missing devices when no mirrors are available, but in
+		 * this case it should already fail the mount.
+		 * This case is handled correctly (but _very_ slowly).
+		 */
+		printk_ratelimited(KERN_WARNING
+			"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
+		bio_endio(sbio->bio, -EIO);
+	} else {
+		btrfsic_submit_bio(READ, sbio->bio);
+	}
+}
+
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
+{
+	struct scrub_block *sblock = spage->sblock;
+	struct scrub_bio *sbio;
+	int ret;
+
+again:
+	/*
+	 * grab a fresh bio or wait for one to become available
+	 */
+	while (sctx->curr == -1) {
+		spin_lock(&sctx->list_lock);
+		sctx->curr = sctx->first_free;
+		if (sctx->curr != -1) {
+			sctx->first_free = sctx->bios[sctx->curr]->next_free;
+			sctx->bios[sctx->curr]->next_free = -1;
+			sctx->bios[sctx->curr]->page_count = 0;
+			spin_unlock(&sctx->list_lock);
+		} else {
+			spin_unlock(&sctx->list_lock);
+			wait_event(sctx->list_wait, sctx->first_free != -1);
+		}
+	}
+	sbio = sctx->bios[sctx->curr];
+	if (sbio->page_count == 0) {
+		struct bio *bio;
+
+		sbio->physical = spage->physical;
+		sbio->logical = spage->logical;
+		sbio->dev = spage->dev;
+		bio = sbio->bio;
+		if (!bio) {
+			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+			if (!bio)
+				return -ENOMEM;
+			sbio->bio = bio;
+		}
+
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_bio_end_io;
+		bio->bi_bdev = sbio->dev->bdev;
+		bio->bi_iter.bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+		   spage->physical ||
+		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   spage->logical ||
+		   sbio->dev != spage->dev) {
+		scrub_submit(sctx);
+		goto again;
+	}
+
+	sbio->pagev[sbio->page_count] = spage;
+	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+	if (ret != PAGE_SIZE) {
+		if (sbio->page_count < 1) {
+			bio_put(sbio->bio);
+			sbio->bio = NULL;
+			return -EIO;
+		}
+		scrub_submit(sctx);
+		goto again;
+	}
+
+	scrub_block_get(sblock); /* one for the page added to the bio */
+	atomic_inc(&sblock->outstanding_pages);
+	sbio->page_count++;
+	if (sbio->page_count == sctx->pages_per_rd_bio)
+		scrub_submit(sctx);
+
+	return 0;
+}
+
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+		       u64 physical, struct btrfs_device *dev, u64 flags,
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace)
+{
+	struct scrub_block *sblock;
+	int index;
+
+	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+	if (!sblock) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
+	/* one ref inside this function, plus one for each page added to
+	 * a bio later on */
+	atomic_set(&sblock->refs, 1);
+	sblock->sctx = sctx;
+	sblock->no_io_error_seen = 1;
+
+	for (index = 0; len > 0; index++) {
+		struct scrub_page *spage;
+		u64 l = min_t(u64, len, PAGE_SIZE);
+
+		spage = kzalloc(sizeof(*spage), GFP_NOFS);
+		if (!spage) {
+leave_nomem:
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.malloc_errors++;
+			spin_unlock(&sctx->stat_lock);
+			scrub_block_put(sblock);
+			return -ENOMEM;
+		}
+		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+		scrub_page_get(spage);
+		sblock->pagev[index] = spage;
+		spage->sblock = sblock;
+		spage->dev = dev;
+		spage->flags = flags;
+		spage->generation = gen;
+		spage->logical = logical;
+		spage->physical = physical;
+		spage->physical_for_dev_replace = physical_for_dev_replace;
+		spage->mirror_num = mirror_num;
+		if (csum) {
+			spage->have_csum = 1;
+			memcpy(spage->csum, csum, sctx->csum_size);
+		} else {
+			spage->have_csum = 0;
+		}
+		sblock->page_count++;
+		spage->page = alloc_page(GFP_NOFS);
+		if (!spage->page)
+			goto leave_nomem;
+		len -= l;
+		logical += l;
+		physical += l;
+		physical_for_dev_replace += l;
+	}
+
+	WARN_ON(sblock->page_count == 0);
+	for (index = 0; index < sblock->page_count; index++) {
+		struct scrub_page *spage = sblock->pagev[index];
+		int ret;
+
+		ret = scrub_add_page_to_rd_bio(sctx, spage);
+		if (ret) {
+			scrub_block_put(sblock);
+			return ret;
+		}
+	}
+
+	if (force)
+		scrub_submit(sctx);
+
+	/* last one frees, either here or in bio completion for last page */
+	scrub_block_put(sblock);
+	return 0;
+}
+
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+	struct scrub_bio *sbio = bio->bi_private;
+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+	sbio->err = err;
+	sbio->bio = bio;
+
+	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_bio_end_io_worker(struct btrfs_work *work)
+{
+	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+	struct scrub_ctx *sctx = sbio->sctx;
+	int i;
+
+	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
+	if (sbio->err) {
+		for (i = 0; i < sbio->page_count; i++) {
+			struct scrub_page *spage = sbio->pagev[i];
+
+			spage->io_error = 1;
+			spage->sblock->no_io_error_seen = 0;
+		}
+	}
+
+	/* now complete the scrub_block items that have all pages completed */
+	for (i = 0; i < sbio->page_count; i++) {
+		struct scrub_page *spage = sbio->pagev[i];
+		struct scrub_block *sblock = spage->sblock;
+
+		if (atomic_dec_and_test(&sblock->outstanding_pages))
+			scrub_block_complete(sblock);
+		scrub_block_put(sblock);
+	}
+
+	bio_put(sbio->bio);
+	sbio->bio = NULL;
+	spin_lock(&sctx->list_lock);
+	sbio->next_free = sctx->first_free;
+	sctx->first_free = sbio->index;
+	spin_unlock(&sctx->list_lock);
+
+	if (sctx->is_dev_replace &&
+	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+	}
+
+	scrub_pending_bio_dec(sctx);
+}
+
+static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
+				       unsigned long *bitmap,
+				       u64 start, u64 len)
+{
+	u32 offset;
+	int nsectors;
+	int sectorsize = sparity->sctx->dev_root->sectorsize;
+
+	if (len >= sparity->stripe_len) {
+		bitmap_set(bitmap, 0, sparity->nsectors);
+		return;
+	}
+
+	start -= sparity->logic_start;
+	start = div_u64_rem(start, sparity->stripe_len, &offset);
+	offset /= sectorsize;
+	nsectors = (int)len / sectorsize;
+
+	if (offset + nsectors <= sparity->nsectors) {
+		bitmap_set(bitmap, offset, nsectors);
+		return;
+	}
+
+	bitmap_set(bitmap, offset, sparity->nsectors - offset);
+	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
+}
+
+static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
+						   u64 start, u64 len)
+{
+	__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+}
+
+static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
+						  u64 start, u64 len)
+{
+	__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+}
+
+static void scrub_block_complete(struct scrub_block *sblock)
+{
+	int corrupted = 0;
+
+	if (!sblock->no_io_error_seen) {
+		corrupted = 1;
+		scrub_handle_errored_block(sblock);
+	} else {
+		/*
+		 * if has checksum error, write via repair mechanism in
+		 * dev replace case, otherwise write here in dev replace
+		 * case.
+		 */
+		corrupted = scrub_checksum(sblock);
+		if (!corrupted && sblock->sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock);
+	}
+
+	if (sblock->sparity && corrupted && !sblock->data_corrected) {
+		u64 start = sblock->pagev[0]->logical;
+		u64 end = sblock->pagev[sblock->page_count - 1]->logical +
+			  PAGE_SIZE;
+
+		scrub_parity_mark_sectors_error(sblock->sparity,
+						start, end - start);
+	}
+}
+
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
+			   u8 *csum)
+{
+	struct btrfs_ordered_sum *sum = NULL;
+	unsigned long index;
+	unsigned long num_sectors;
+
+	while (!list_empty(&sctx->csum_list)) {
+		sum = list_first_entry(&sctx->csum_list,
+				       struct btrfs_ordered_sum, list);
+		if (sum->bytenr > logical)
+			return 0;
+		if (sum->bytenr + sum->len > logical)
+			break;
+
+		++sctx->stat.csum_discards;
+		list_del(&sum->list);
+		kfree(sum);
+		sum = NULL;
+	}
+	if (!sum)
+		return 0;
+
+	index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
+	num_sectors = sum->len / sctx->sectorsize;
+	memcpy(csum, sum->sums + index, sctx->csum_size);
+	if (index == num_sectors - 1) {
+		list_del(&sum->list);
+		kfree(sum);
+	}
+	return 1;
+}
+
+/* scrub extent tries to collect up to 64 kB for each bio */
+static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+			u64 physical, struct btrfs_device *dev, u64 flags,
+			u64 gen, int mirror_num, u64 physical_for_dev_replace)
+{
+	int ret;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u32 blocksize;
+
+	if (flags & BTRFS_EXTENT_FLAG_DATA) {
+		blocksize = sctx->sectorsize;
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.data_extents_scrubbed++;
+		sctx->stat.data_bytes_scrubbed += len;
+		spin_unlock(&sctx->stat_lock);
+	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		blocksize = sctx->nodesize;
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.tree_extents_scrubbed++;
+		sctx->stat.tree_bytes_scrubbed += len;
+		spin_unlock(&sctx->stat_lock);
+	} else {
+		blocksize = sctx->sectorsize;
+		WARN_ON(1);
+	}
+
+	while (len) {
+		u64 l = min_t(u64, len, blocksize);
+		int have_csum = 0;
+
+		if (flags & BTRFS_EXTENT_FLAG_DATA) {
+			/* push csums to sbio */
+			have_csum = scrub_find_csum(sctx, logical, l, csum);
+			if (have_csum == 0)
+				++sctx->stat.no_csum;
+			if (sctx->is_dev_replace && !have_csum) {
+				ret = copy_nocow_pages(sctx, logical, l,
+						       mirror_num,
+						      physical_for_dev_replace);
+				goto behind_scrub_pages;
+			}
+		}
+		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
+				  mirror_num, have_csum ? csum : NULL, 0,
+				  physical_for_dev_replace);
+behind_scrub_pages:
+		if (ret)
+			return ret;
+		len -= l;
+		logical += l;
+		physical += l;
+		physical_for_dev_replace += l;
+	}
+	return 0;
+}
+
+static int scrub_pages_for_parity(struct scrub_parity *sparity,
+				  u64 logical, u64 len,
+				  u64 physical, struct btrfs_device *dev,
+				  u64 flags, u64 gen, int mirror_num, u8 *csum)
+{
+	struct scrub_ctx *sctx = sparity->sctx;
+	struct scrub_block *sblock;
+	int index;
+
+	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+	if (!sblock) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
+	/* one ref inside this function, plus one for each page added to
+	 * a bio later on */
+	atomic_set(&sblock->refs, 1);
+	sblock->sctx = sctx;
+	sblock->no_io_error_seen = 1;
+	sblock->sparity = sparity;
+	scrub_parity_get(sparity);
+
+	for (index = 0; len > 0; index++) {
+		struct scrub_page *spage;
+		u64 l = min_t(u64, len, PAGE_SIZE);
+
+		spage = kzalloc(sizeof(*spage), GFP_NOFS);
+		if (!spage) {
+leave_nomem:
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.malloc_errors++;
+			spin_unlock(&sctx->stat_lock);
+			scrub_block_put(sblock);
+			return -ENOMEM;
+		}
+		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+		/* For scrub block */
+		scrub_page_get(spage);
+		sblock->pagev[index] = spage;
+		/* For scrub parity */
+		scrub_page_get(spage);
+		list_add_tail(&spage->list, &sparity->spages);
+		spage->sblock = sblock;
+		spage->dev = dev;
+		spage->flags = flags;
+		spage->generation = gen;
+		spage->logical = logical;
+		spage->physical = physical;
+		spage->mirror_num = mirror_num;
+		if (csum) {
+			spage->have_csum = 1;
+			memcpy(spage->csum, csum, sctx->csum_size);
+		} else {
+			spage->have_csum = 0;
+		}
+		sblock->page_count++;
+		spage->page = alloc_page(GFP_NOFS);
+		if (!spage->page)
+			goto leave_nomem;
+		len -= l;
+		logical += l;
+		physical += l;
+	}
+
+	WARN_ON(sblock->page_count == 0);
+	for (index = 0; index < sblock->page_count; index++) {
+		struct scrub_page *spage = sblock->pagev[index];
+		int ret;
+
+		ret = scrub_add_page_to_rd_bio(sctx, spage);
+		if (ret) {
+			scrub_block_put(sblock);
+			return ret;
+		}
+	}
+
+	/* last one frees, either here or in bio completion for last page */
+	scrub_block_put(sblock);
+	return 0;
+}
+
+static int scrub_extent_for_parity(struct scrub_parity *sparity,
+				   u64 logical, u64 len,
+				   u64 physical, struct btrfs_device *dev,
+				   u64 flags, u64 gen, int mirror_num)
+{
+	struct scrub_ctx *sctx = sparity->sctx;
+	int ret;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u32 blocksize;
+
+	if (flags & BTRFS_EXTENT_FLAG_DATA) {
+		blocksize = sctx->sectorsize;
+	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		blocksize = sctx->nodesize;
+	} else {
+		blocksize = sctx->sectorsize;
+		WARN_ON(1);
+	}
+
+	while (len) {
+		u64 l = min_t(u64, len, blocksize);
+		int have_csum = 0;
+
+		if (flags & BTRFS_EXTENT_FLAG_DATA) {
+			/* push csums to sbio */
+			have_csum = scrub_find_csum(sctx, logical, l, csum);
+			if (have_csum == 0)
+				goto skip;
+		}
+		ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+					     flags, gen, mirror_num,
+					     have_csum ? csum : NULL);
+		if (ret)
+			return ret;
+skip:
+		len -= l;
+		logical += l;
+		physical += l;
+	}
+	return 0;
+}
+
+/*
+ * Given a physical address, this will calculate it's
+ * logical offset. if this is a parity stripe, it will return
+ * the most left data stripe's logical offset.
+ *
+ * return 0 if it is a data stripe, 1 means parity stripe.
+ */
+static int get_raid56_logic_offset(u64 physical, int num,
+				   struct map_lookup *map, u64 *offset,
+				   u64 *stripe_start)
+{
+	int i;
+	int j = 0;
+	u64 stripe_nr;
+	u64 last_offset;
+	u32 stripe_index;
+	u32 rot;
+
+	last_offset = (physical - map->stripes[num].physical) *
+		      nr_data_stripes(map);
+	if (stripe_start)
+		*stripe_start = last_offset;
+
+	*offset = last_offset;
+	for (i = 0; i < nr_data_stripes(map); i++) {
+		*offset = last_offset + i * map->stripe_len;
+
+		stripe_nr = div_u64(*offset, map->stripe_len);
+		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
+
+		/* Work out the disk rotation on this stripe-set */
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
+		/* calculate which stripe this data locates */
+		rot += i;
+		stripe_index = rot % map->num_stripes;
+		if (stripe_index == num)
+			return 0;
+		if (stripe_index < num)
+			j++;
+	}
+	*offset = last_offset + j * map->stripe_len;
+	return 1;
+}
+
+static void scrub_free_parity(struct scrub_parity *sparity)
+{
+	struct scrub_ctx *sctx = sparity->sctx;
+	struct scrub_page *curr, *next;
+	int nbits;
+
+	nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+	if (nbits) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors += nbits;
+		sctx->stat.uncorrectable_errors += nbits;
+		spin_unlock(&sctx->stat_lock);
+	}
+
+	list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+		list_del_init(&curr->list);
+		scrub_page_put(curr);
+	}
+
+	kfree(sparity);
+}
+
+static void scrub_parity_bio_endio(struct bio *bio, int error)
+{
+	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+	struct scrub_ctx *sctx = sparity->sctx;
+
+	if (error)
+		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+			  sparity->nsectors);
+
+	scrub_free_parity(sparity);
+	scrub_pending_bio_dec(sctx);
+	bio_put(bio);
+}
+
+static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
+{
+	struct scrub_ctx *sctx = sparity->sctx;
+	struct bio *bio;
+	struct btrfs_raid_bio *rbio;
+	struct scrub_page *spage;
+	struct btrfs_bio *bbio = NULL;
+	u64 length;
+	int ret;
+
+	if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
+			   sparity->nsectors))
+		goto out;
+
+	length = sparity->logic_end - sparity->logic_start + 1;
+	ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
+			       sparity->logic_start,
+			       &length, &bbio, 0, 1);
+	if (ret || !bbio || !bbio->raid_map)
+		goto bbio_out;
+
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+	if (!bio)
+		goto bbio_out;
+
+	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
+	bio->bi_private = sparity;
+	bio->bi_end_io = scrub_parity_bio_endio;
+
+	rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
+					      length, sparity->scrub_dev,
+					      sparity->dbitmap,
+					      sparity->nsectors);
+	if (!rbio)
+		goto rbio_out;
+
+	list_for_each_entry(spage, &sparity->spages, list)
+		raid56_parity_add_scrub_pages(rbio, spage->page,
+					      spage->logical);
+
+	scrub_pending_bio_inc(sctx);
+	raid56_parity_submit_scrub_rbio(rbio);
+	return;
+
+rbio_out:
+	bio_put(bio);
+bbio_out:
+	btrfs_put_bbio(bbio);
+	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+		  sparity->nsectors);
+	spin_lock(&sctx->stat_lock);
+	sctx->stat.malloc_errors++;
+	spin_unlock(&sctx->stat_lock);
+out:
+	scrub_free_parity(sparity);
+}
+
+static inline int scrub_calc_parity_bitmap_len(int nsectors)
+{
+	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+}
+
+static void scrub_parity_get(struct scrub_parity *sparity)
+{
+	atomic_inc(&sparity->refs);
+}
+
+static void scrub_parity_put(struct scrub_parity *sparity)
+{
+	if (!atomic_dec_and_test(&sparity->refs))
+		return;
+
+	scrub_parity_check_and_repair(sparity);
+}
+
+static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
+						  struct map_lookup *map,
+						  struct btrfs_device *sdev,
+						  struct btrfs_path *path,
+						  u64 logic_start,
+						  u64 logic_end)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *csum_root = fs_info->csum_root;
+	struct btrfs_extent_item *extent;
+	u64 flags;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	u64 generation;
+	u64 extent_logical;
+	u64 extent_physical;
+	u64 extent_len;
+	struct btrfs_device *extent_dev;
+	struct scrub_parity *sparity;
+	int nsectors;
+	int bitmap_len;
+	int extent_mirror_num;
+	int stop_loop = 0;
+
+	nsectors = map->stripe_len / root->sectorsize;
+	bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
+	sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
+			  GFP_NOFS);
+	if (!sparity) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
+	sparity->stripe_len = map->stripe_len;
+	sparity->nsectors = nsectors;
+	sparity->sctx = sctx;
+	sparity->scrub_dev = sdev;
+	sparity->logic_start = logic_start;
+	sparity->logic_end = logic_end;
+	atomic_set(&sparity->refs, 1);
+	INIT_LIST_HEAD(&sparity->spages);
+	sparity->dbitmap = sparity->bitmap;
+	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
+
+	ret = 0;
+	while (logic_start < logic_end) {
+		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+			key.type = BTRFS_METADATA_ITEM_KEY;
+		else
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.objectid = logic_start;
+		key.offset = (u64)-1;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		if (ret > 0) {
+			ret = btrfs_previous_extent_item(root, path, 0);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				btrfs_release_path(path);
+				ret = btrfs_search_slot(NULL, root, &key,
+							path, 0, 0);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
+		stop_loop = 0;
+		while (1) {
+			u64 bytes;
+
+			l = path->nodes[0];
+			slot = path->slots[0];
+			if (slot >= btrfs_header_nritems(l)) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret == 0)
+					continue;
+				if (ret < 0)
+					goto out;
+
+				stop_loop = 1;
+				break;
+			}
+			btrfs_item_key_to_cpu(l, &key, slot);
+
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				bytes = root->nodesize;
+			else
+				bytes = key.offset;
+
+			if (key.objectid + bytes <= logic_start)
+				goto next;
+
+			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+			    key.type != BTRFS_METADATA_ITEM_KEY)
+				goto next;
+
+			if (key.objectid > logic_end) {
+				stop_loop = 1;
+				break;
+			}
+
+			while (key.objectid >= logic_start + map->stripe_len)
+				logic_start += map->stripe_len;
+
+			extent = btrfs_item_ptr(l, slot,
+						struct btrfs_extent_item);
+			flags = btrfs_extent_flags(l, extent);
+			generation = btrfs_extent_generation(l, extent);
+
+			if (key.objectid < logic_start &&
+			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+				btrfs_err(fs_info,
+					  "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+					   key.objectid, logic_start);
+				goto next;
+			}
+again:
+			extent_logical = key.objectid;
+			extent_len = bytes;
+
+			if (extent_logical < logic_start) {
+				extent_len -= logic_start - extent_logical;
+				extent_logical = logic_start;
+			}
+
+			if (extent_logical + extent_len >
+			    logic_start + map->stripe_len)
+				extent_len = logic_start + map->stripe_len -
+					     extent_logical;
+
+			scrub_parity_mark_sectors_data(sparity, extent_logical,
+						       extent_len);
+
+			scrub_remap_extent(fs_info, extent_logical,
+					   extent_len, &extent_physical,
+					   &extent_dev,
+					   &extent_mirror_num);
+
+			ret = btrfs_lookup_csums_range(csum_root,
+						extent_logical,
+						extent_logical + extent_len - 1,
+						&sctx->csum_list, 1);
+			if (ret)
+				goto out;
+
+			ret = scrub_extent_for_parity(sparity, extent_logical,
+						      extent_len,
+						      extent_physical,
+						      extent_dev, flags,
+						      generation,
+						      extent_mirror_num);
+			if (ret)
+				goto out;
+
+			scrub_free_csums(sctx);
+			if (extent_logical + extent_len <
+			    key.objectid + bytes) {
+				logic_start += map->stripe_len;
+
+				if (logic_start >= logic_end) {
+					stop_loop = 1;
+					break;
+				}
+
+				if (logic_start < key.objectid + bytes) {
+					cond_resched();
+					goto again;
+				}
+			}
+next:
+			path->slots[0]++;
+		}
+
+		btrfs_release_path(path);
+
+		if (stop_loop)
+			break;
+
+		logic_start += map->stripe_len;
+	}
+out:
+	if (ret < 0)
+		scrub_parity_mark_sectors_error(sparity, logic_start,
+						logic_end - logic_start + 1);
+	scrub_parity_put(sparity);
+	scrub_submit(sctx);
+	mutex_lock(&sctx->wr_ctx.wr_lock);
+	scrub_wr_submit(sctx);
+	mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+	btrfs_release_path(path);
+	return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+					   struct map_lookup *map,
+					   struct btrfs_device *scrub_dev,
+					   int num, u64 base, u64 length,
+					   int is_dev_replace)
+{
+	struct btrfs_path *path, *ppath;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *csum_root = fs_info->csum_root;
+	struct btrfs_extent_item *extent;
+	struct blk_plug plug;
+	u64 flags;
+	int ret;
+	int slot;
+	u64 nstripes;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	u64 physical;
+	u64 logical;
+	u64 logic_end;
+	u64 physical_end;
+	u64 generation;
+	int mirror_num;
+	struct reada_control *reada1;
+	struct reada_control *reada2;
+	struct btrfs_key key_start;
+	struct btrfs_key key_end;
+	u64 increment = map->stripe_len;
+	u64 offset;
+	u64 extent_logical;
+	u64 extent_physical;
+	u64 extent_len;
+	u64 stripe_logical;
+	u64 stripe_end;
+	struct btrfs_device *extent_dev;
+	int extent_mirror_num;
+	int stop_loop = 0;
+
+	physical = map->stripes[num].physical;
+	offset = 0;
+	nstripes = div_u64(length, map->stripe_len);
+	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+		offset = map->stripe_len * num;
+		increment = map->stripe_len * map->num_stripes;
+		mirror_num = 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+		int factor = map->num_stripes / map->sub_stripes;
+		offset = map->stripe_len * (num / map->sub_stripes);
+		increment = map->stripe_len * factor;
+		mirror_num = num % map->sub_stripes + 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+		increment = map->stripe_len;
+		mirror_num = num % map->num_stripes + 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+		increment = map->stripe_len;
+		mirror_num = num % map->num_stripes + 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		get_raid56_logic_offset(physical, num, map, &offset, NULL);
+		increment = map->stripe_len * nr_data_stripes(map);
+		mirror_num = 1;
+	} else {
+		increment = map->stripe_len;
+		mirror_num = 1;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ppath = btrfs_alloc_path();
+	if (!ppath) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	/*
+	 * work on commit root. The related disk blocks are static as
+	 * long as COW is applied. This means, it is save to rewrite
+	 * them to repair disk errors without any race conditions
+	 */
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
+	ppath->search_commit_root = 1;
+	ppath->skip_locking = 1;
+	/*
+	 * trigger the readahead for extent tree csum tree and wait for
+	 * completion. During readahead, the scrub is officially paused
+	 * to not hold off transaction commits
+	 */
+	logical = base + offset;
+	physical_end = physical + nstripes * map->stripe_len;
+	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		get_raid56_logic_offset(physical_end, num,
+					map, &logic_end, NULL);
+		logic_end += base;
+	} else {
+		logic_end = logical + increment * nstripes;
+	}
+	wait_event(sctx->list_wait,
+		   atomic_read(&sctx->bios_in_flight) == 0);
+	scrub_blocked_if_needed(fs_info);
+
+	/* FIXME it might be better to start readahead at commit root */
+	key_start.objectid = logical;
+	key_start.type = BTRFS_EXTENT_ITEM_KEY;
+	key_start.offset = (u64)0;
+	key_end.objectid = logic_end;
+	key_end.type = BTRFS_METADATA_ITEM_KEY;
+	key_end.offset = (u64)-1;
+	reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key_start.type = BTRFS_EXTENT_CSUM_KEY;
+	key_start.offset = logical;
+	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key_end.type = BTRFS_EXTENT_CSUM_KEY;
+	key_end.offset = logic_end;
+	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+	if (!IS_ERR(reada1))
+		btrfs_reada_wait(reada1);
+	if (!IS_ERR(reada2))
+		btrfs_reada_wait(reada2);
+
+
+	/*
+	 * collect all data csums for the stripe to avoid seeking during
+	 * the scrub. This might currently (crc32) end up to be about 1MB
+	 */
+	blk_start_plug(&plug);
+
+	/*
+	 * now find all extents for each stripe and scrub them
+	 */
+	ret = 0;
+	while (physical < physical_end) {
+		/* for raid56, we skip parity stripe */
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+			ret = get_raid56_logic_offset(physical, num,
+					map, &logical, &stripe_logical);
+			logical += base;
+			if (ret) {
+				stripe_logical += base;
+				stripe_end = stripe_logical + increment - 1;
+				ret = scrub_raid56_parity(sctx, map, scrub_dev,
+						ppath, stripe_logical,
+						stripe_end);
+				if (ret)
+					goto out;
+				goto skip;
+			}
+		}
+		/*
+		 * canceled?
+		 */
+		if (atomic_read(&fs_info->scrub_cancel_req) ||
+		    atomic_read(&sctx->cancel_req)) {
+			ret = -ECANCELED;
+			goto out;
+		}
+		/*
+		 * check to see if we have to pause
+		 */
+		if (atomic_read(&fs_info->scrub_pause_req)) {
+			/* push queued extents */
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+			scrub_submit(sctx);
+			mutex_lock(&sctx->wr_ctx.wr_lock);
+			scrub_wr_submit(sctx);
+			mutex_unlock(&sctx->wr_ctx.wr_lock);
+			wait_event(sctx->list_wait,
+				   atomic_read(&sctx->bios_in_flight) == 0);
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+			scrub_blocked_if_needed(fs_info);
+		}
+
+		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+			key.type = BTRFS_METADATA_ITEM_KEY;
+		else
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.objectid = logical;
+		key.offset = (u64)-1;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		if (ret > 0) {
+			ret = btrfs_previous_extent_item(root, path, 0);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				/* there's no smaller item, so stick with the
+				 * larger one */
+				btrfs_release_path(path);
+				ret = btrfs_search_slot(NULL, root, &key,
+							path, 0, 0);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
+		stop_loop = 0;
+		while (1) {
+			u64 bytes;
+
+			l = path->nodes[0];
+			slot = path->slots[0];
+			if (slot >= btrfs_header_nritems(l)) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret == 0)
+					continue;
+				if (ret < 0)
+					goto out;
+
+				stop_loop = 1;
+				break;
+			}
+			btrfs_item_key_to_cpu(l, &key, slot);
+
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				bytes = root->nodesize;
+			else
+				bytes = key.offset;
+
+			if (key.objectid + bytes <= logical)
+				goto next;
+
+			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+			    key.type != BTRFS_METADATA_ITEM_KEY)
+				goto next;
+
+			if (key.objectid >= logical + map->stripe_len) {
+				/* out of this device extent */
+				if (key.objectid >= logic_end)
+					stop_loop = 1;
+				break;
+			}
+
+			extent = btrfs_item_ptr(l, slot,
+						struct btrfs_extent_item);
+			flags = btrfs_extent_flags(l, extent);
+			generation = btrfs_extent_generation(l, extent);
+
+			if (key.objectid < logical &&
+			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+				btrfs_err(fs_info,
+					   "scrub: tree block %llu spanning "
+					   "stripes, ignored. logical=%llu",
+				       key.objectid, logical);
+				goto next;
+			}
+
+again:
+			extent_logical = key.objectid;
+			extent_len = bytes;
+
+			/*
+			 * trim extent to this stripe
+			 */
+			if (extent_logical < logical) {
+				extent_len -= logical - extent_logical;
+				extent_logical = logical;
+			}
+			if (extent_logical + extent_len >
+			    logical + map->stripe_len) {
+				extent_len = logical + map->stripe_len -
+					     extent_logical;
+			}
+
+			extent_physical = extent_logical - logical + physical;
+			extent_dev = scrub_dev;
+			extent_mirror_num = mirror_num;
+			if (is_dev_replace)
+				scrub_remap_extent(fs_info, extent_logical,
+						   extent_len, &extent_physical,
+						   &extent_dev,
+						   &extent_mirror_num);
+
+			ret = btrfs_lookup_csums_range(csum_root, logical,
+						logical + map->stripe_len - 1,
+						&sctx->csum_list, 1);
+			if (ret)
+				goto out;
+
+			ret = scrub_extent(sctx, extent_logical, extent_len,
+					   extent_physical, extent_dev, flags,
+					   generation, extent_mirror_num,
+					   extent_logical - logical + physical);
+			if (ret)
+				goto out;
+
+			scrub_free_csums(sctx);
+			if (extent_logical + extent_len <
+			    key.objectid + bytes) {
+				if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+					/*
+					 * loop until we find next data stripe
+					 * or we have finished all stripes.
+					 */
+loop:
+					physical += map->stripe_len;
+					ret = get_raid56_logic_offset(physical,
+							num, map, &logical,
+							&stripe_logical);
+					logical += base;
+
+					if (ret && physical < physical_end) {
+						stripe_logical += base;
+						stripe_end = stripe_logical +
+								increment - 1;
+						ret = scrub_raid56_parity(sctx,
+							map, scrub_dev, ppath,
+							stripe_logical,
+							stripe_end);
+						if (ret)
+							goto out;
+						goto loop;
+					}
+				} else {
+					physical += map->stripe_len;
+					logical += increment;
+				}
+				if (logical < key.objectid + bytes) {
+					cond_resched();
+					goto again;
+				}
+
+				if (physical >= physical_end) {
+					stop_loop = 1;
+					break;
+				}
+			}
+next:
+			path->slots[0]++;
+		}
+		btrfs_release_path(path);
+skip:
+		logical += increment;
+		physical += map->stripe_len;
+		spin_lock(&sctx->stat_lock);
+		if (stop_loop)
+			sctx->stat.last_physical = map->stripes[num].physical +
+						   length;
+		else
+			sctx->stat.last_physical = physical;
+		spin_unlock(&sctx->stat_lock);
+		if (stop_loop)
+			break;
+	}
+out:
+	/* push queued extents */
+	scrub_submit(sctx);
+	mutex_lock(&sctx->wr_ctx.wr_lock);
+	scrub_wr_submit(sctx);
+	mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+	blk_finish_plug(&plug);
+	btrfs_free_path(path);
+	btrfs_free_path(ppath);
+	return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+					  struct btrfs_device *scrub_dev,
+					  u64 chunk_tree, u64 chunk_objectid,
+					  u64 chunk_offset, u64 length,
+					  u64 dev_offset, int is_dev_replace)
+{
+	struct btrfs_mapping_tree *map_tree =
+		&sctx->dev_root->fs_info->mapping_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	int i;
+	int ret = 0;
+
+	read_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+	read_unlock(&map_tree->map_tree.lock);
+
+	if (!em)
+		return -EINVAL;
+
+	map = (struct map_lookup *)em->bdev;
+	if (em->start != chunk_offset)
+		goto out;
+
+	if (em->len < length)
+		goto out;
+
+	for (i = 0; i < map->num_stripes; ++i) {
+		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
+		    map->stripes[i].physical == dev_offset) {
+			ret = scrub_stripe(sctx, map, scrub_dev, i,
+					   chunk_offset, length,
+					   is_dev_replace);
+			if (ret)
+				goto out;
+		}
+	}
+out:
+	free_extent_map(em);
+
+	return ret;
+}
+
+static noinline_for_stack
+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
+			   struct btrfs_device *scrub_dev, u64 start, u64 end,
+			   int is_dev_replace)
+{
+	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
+	struct btrfs_root *root = sctx->dev_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 length;
+	u64 chunk_tree;
+	u64 chunk_objectid;
+	u64 chunk_offset;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 2;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
+	key.objectid = scrub_dev->devid;
+	key.offset = 0ull;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			if (path->slots[0] >=
+			    btrfs_header_nritems(path->nodes[0])) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+			}
+		}
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+
+		if (found_key.objectid != scrub_dev->devid)
+			break;
+
+		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
+			break;
+
+		if (found_key.offset >= end)
+			break;
+
+		if (found_key.offset < key.offset)
+			break;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		length = btrfs_dev_extent_length(l, dev_extent);
+
+		if (found_key.offset + length <= start)
+			goto skip;
+
+		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+
+		/*
+		 * get a reference on the corresponding block group to prevent
+		 * the chunk from going away while we scrub it
+		 */
+		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+
+		/* some chunks are removed but not committed to disk yet,
+		 * continue scrubbing */
+		if (!cache)
+			goto skip;
+
+		dev_replace->cursor_right = found_key.offset + length;
+		dev_replace->cursor_left = found_key.offset;
+		dev_replace->item_needs_writeback = 1;
+		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
+				  chunk_offset, length, found_key.offset,
+				  is_dev_replace);
+
+		/*
+		 * flush, submit all pending read and write bios, afterwards
+		 * wait for them.
+		 * Note that in the dev replace case, a read request causes
+		 * write requests that are submitted in the read completion
+		 * worker. Therefore in the current situation, it is required
+		 * that all write requests are flushed, so that all read and
+		 * write requests are really completed when bios_in_flight
+		 * changes to 0.
+		 */
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+		scrub_submit(sctx);
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+		wait_event(sctx->list_wait,
+			   atomic_read(&sctx->bios_in_flight) == 0);
+		atomic_inc(&fs_info->scrubs_paused);
+		wake_up(&fs_info->scrub_pause_wait);
+
+		/*
+		 * must be called before we decrease @scrub_paused.
+		 * make sure we don't block transaction commit while
+		 * we are waiting pending workers finished.
+		 */
+		wait_event(sctx->list_wait,
+			   atomic_read(&sctx->workers_pending) == 0);
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+
+		mutex_lock(&fs_info->scrub_lock);
+		__scrub_blocked_if_needed(fs_info);
+		atomic_dec(&fs_info->scrubs_paused);
+		mutex_unlock(&fs_info->scrub_lock);
+		wake_up(&fs_info->scrub_pause_wait);
+
+		btrfs_put_block_group(cache);
+		if (ret)
+			break;
+		if (is_dev_replace &&
+		    atomic64_read(&dev_replace->num_write_errors) > 0) {
+			ret = -EIO;
+			break;
+		}
+		if (sctx->stat.malloc_errors > 0) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		dev_replace->cursor_left = dev_replace->cursor_right;
+		dev_replace->item_needs_writeback = 1;
+skip:
+		key.offset = found_key.offset + length;
+		btrfs_release_path(path);
+	}
+
+	btrfs_free_path(path);
+
+	/*
+	 * ret can still be 1 from search_slot or next_leaf,
+	 * that's not an error
+	 */
+	return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
+					   struct btrfs_device *scrub_dev)
+{
+	int	i;
+	u64	bytenr;
+	u64	gen;
+	int	ret;
+	struct btrfs_root *root = sctx->dev_root;
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+		return -EIO;
+
+	/* Seed devices of a new filesystem has their own generation. */
+	if (scrub_dev->fs_devices != root->fs_info->fs_devices)
+		gen = scrub_dev->generation;
+	else
+		gen = root->fs_info->last_trans_committed;
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >
+		    scrub_dev->commit_total_bytes)
+			break;
+
+		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+				  NULL, 1, bytenr);
+		if (ret)
+			return ret;
+	}
+	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+
+	return 0;
+}
+
+/*
+ * get a reference count on fs_info->scrub_workers. start worker if necessary
+ */
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+						int is_dev_replace)
+{
+	int ret = 0;
+	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
+	int max_active = fs_info->thread_pool_size;
+
+	if (fs_info->scrub_workers_refcnt == 0) {
+		if (is_dev_replace)
+			fs_info->scrub_workers =
+				btrfs_alloc_workqueue("btrfs-scrub", flags,
+						      1, 4);
+		else
+			fs_info->scrub_workers =
+				btrfs_alloc_workqueue("btrfs-scrub", flags,
+						      max_active, 4);
+		if (!fs_info->scrub_workers) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		fs_info->scrub_wr_completion_workers =
+			btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+					      max_active, 2);
+		if (!fs_info->scrub_wr_completion_workers) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		fs_info->scrub_nocow_workers =
+			btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+		if (!fs_info->scrub_nocow_workers) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+	++fs_info->scrub_workers_refcnt;
+out:
+	return ret;
+}
+
+static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+	if (--fs_info->scrub_workers_refcnt == 0) {
+		btrfs_destroy_workqueue(fs_info->scrub_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+	}
+	WARN_ON(fs_info->scrub_workers_refcnt < 0);
+}
+
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+		    u64 end, struct btrfs_scrub_progress *progress,
+		    int readonly, int is_dev_replace)
+{
+	struct scrub_ctx *sctx;
+	int ret;
+	struct btrfs_device *dev;
+	struct rcu_string *name;
+
+	if (btrfs_fs_closing(fs_info))
+		return -EINVAL;
+
+	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
+		/*
+		 * in this case scrub is unable to calculate the checksum
+		 * the way scrub is implemented. Do not handle this
+		 * situation at all because it won't ever happen.
+		 */
+		btrfs_err(fs_info,
+			   "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
+		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
+		return -EINVAL;
+	}
+
+	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
+		/* not supported for data w/o checksums */
+		btrfs_err(fs_info,
+			   "scrub: size assumption sectorsize != PAGE_SIZE "
+			   "(%d != %lu) fails",
+		       fs_info->chunk_root->sectorsize, PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	if (fs_info->chunk_root->nodesize >
+	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
+	    fs_info->chunk_root->sectorsize >
+	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+		/*
+		 * would exhaust the array bounds of pagev member in
+		 * struct scrub_block
+		 */
+		btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
+			   "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
+		       fs_info->chunk_root->nodesize,
+		       SCRUB_MAX_PAGES_PER_BLOCK,
+		       fs_info->chunk_root->sectorsize,
+		       SCRUB_MAX_PAGES_PER_BLOCK);
+		return -EINVAL;
+	}
+
+
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+	if (!dev || (dev->missing && !is_dev_replace)) {
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		return -ENODEV;
+	}
+
+	if (!is_dev_replace && !readonly && !dev->writeable) {
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		rcu_read_lock();
+		name = rcu_dereference(dev->name);
+		btrfs_err(fs_info, "scrub: device %s is not writable",
+			  name->str);
+		rcu_read_unlock();
+		return -EROFS;
+	}
+
+	mutex_lock(&fs_info->scrub_lock);
+	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		return -EIO;
+	}
+
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	if (dev->scrub_device ||
+	    (!is_dev_replace &&
+	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		return -EINPROGRESS;
+	}
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+	ret = scrub_workers_get(fs_info, is_dev_replace);
+	if (ret) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		return ret;
+	}
+
+	sctx = scrub_setup_ctx(dev, is_dev_replace);
+	if (IS_ERR(sctx)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(fs_info);
+		return PTR_ERR(sctx);
+	}
+	sctx->readonly = readonly;
+	dev->scrub_device = sctx;
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+	/*
+	 * checking @scrub_pause_req here, we can avoid
+	 * race between committing transaction and scrubbing.
+	 */
+	__scrub_blocked_if_needed(fs_info);
+	atomic_inc(&fs_info->scrubs_running);
+	mutex_unlock(&fs_info->scrub_lock);
+
+	if (!is_dev_replace) {
+		/*
+		 * by holding device list mutex, we can
+		 * kick off writing super in log tree sync.
+		 */
+		mutex_lock(&fs_info->fs_devices->device_list_mutex);
+		ret = scrub_supers(sctx, dev);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	}
+
+	if (!ret)
+		ret = scrub_enumerate_chunks(sctx, dev, start, end,
+					     is_dev_replace);
+
+	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+	atomic_dec(&fs_info->scrubs_running);
+	wake_up(&fs_info->scrub_pause_wait);
+
+	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
+
+	if (progress)
+		memcpy(progress, &sctx->stat, sizeof(*progress));
+
+	mutex_lock(&fs_info->scrub_lock);
+	dev->scrub_device = NULL;
+	scrub_workers_put(fs_info);
+	mutex_unlock(&fs_info->scrub_lock);
+
+	scrub_put_ctx(sctx);
+
+	return ret;
+}
+
+void btrfs_scrub_pause(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_inc(&fs_info->scrub_pause_req);
+	while (atomic_read(&fs_info->scrubs_paused) !=
+	       atomic_read(&fs_info->scrubs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+			   atomic_read(&fs_info->scrubs_paused) ==
+			   atomic_read(&fs_info->scrubs_running));
+		mutex_lock(&fs_info->scrub_lock);
+	}
+	mutex_unlock(&fs_info->scrub_lock);
+}
+
+void btrfs_scrub_continue(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	atomic_dec(&fs_info->scrub_pause_req);
+	wake_up(&fs_info->scrub_pause_wait);
+}
+
+int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
+{
+	mutex_lock(&fs_info->scrub_lock);
+	if (!atomic_read(&fs_info->scrubs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		return -ENOTCONN;
+	}
+
+	atomic_inc(&fs_info->scrub_cancel_req);
+	while (atomic_read(&fs_info->scrubs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+			   atomic_read(&fs_info->scrubs_running) == 0);
+		mutex_lock(&fs_info->scrub_lock);
+	}
+	atomic_dec(&fs_info->scrub_cancel_req);
+	mutex_unlock(&fs_info->scrub_lock);
+
+	return 0;
+}
+
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
+			   struct btrfs_device *dev)
+{
+	struct scrub_ctx *sctx;
+
+	mutex_lock(&fs_info->scrub_lock);
+	sctx = dev->scrub_device;
+	if (!sctx) {
+		mutex_unlock(&fs_info->scrub_lock);
+		return -ENOTCONN;
+	}
+	atomic_inc(&sctx->cancel_req);
+	while (dev->scrub_device) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+			   dev->scrub_device == NULL);
+		mutex_lock(&fs_info->scrub_lock);
+	}
+	mutex_unlock(&fs_info->scrub_lock);
+
+	return 0;
+}
+
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+			 struct btrfs_scrub_progress *progress)
+{
+	struct btrfs_device *dev;
+	struct scrub_ctx *sctx = NULL;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
+	if (dev)
+		sctx = dev->scrub_device;
+	if (sctx)
+		memcpy(progress, &sctx->stat, sizeof(*progress));
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
+}
+
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num)
+{
+	u64 mapped_length;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	mapped_length = extent_len;
+	ret = btrfs_map_block(fs_info, READ, extent_logical,
+			      &mapped_length, &bbio, 0);
+	if (ret || !bbio || mapped_length < extent_len ||
+	    !bbio->stripes[0].dev->bdev) {
+		btrfs_put_bbio(bbio);
+		return;
+	}
+
+	*extent_physical = bbio->stripes[0].physical;
+	*extent_mirror_num = bbio->mirror_num;
+	*extent_dev = bbio->stripes[0].dev;
+	btrfs_put_bbio(bbio);
+}
+
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace)
+{
+	WARN_ON(wr_ctx->wr_curr_bio != NULL);
+
+	mutex_init(&wr_ctx->wr_lock);
+	wr_ctx->wr_curr_bio = NULL;
+	if (!is_dev_replace)
+		return 0;
+
+	WARN_ON(!dev->bdev);
+	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	wr_ctx->tgtdev = dev;
+	atomic_set(&wr_ctx->flush_all_writes, 0);
+	return 0;
+}
+
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
+{
+	mutex_lock(&wr_ctx->wr_lock);
+	kfree(wr_ctx->wr_curr_bio);
+	wr_ctx->wr_curr_bio = NULL;
+	mutex_unlock(&wr_ctx->wr_lock);
+}
+
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
+	if (!nocow_ctx) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
+	scrub_pending_trans_workers_inc(sctx);
+
+	nocow_ctx->sctx = sctx;
+	nocow_ctx->logical = logical;
+	nocow_ctx->len = len;
+	nocow_ctx->mirror_num = mirror_num;
+	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
+	btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
+			copy_nocow_pages_worker, NULL, NULL);
+	INIT_LIST_HEAD(&nocow_ctx->inodes);
+	btrfs_queue_work(fs_info->scrub_nocow_workers,
+			 &nocow_ctx->work);
+
+	return 0;
+}
+
+static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+	struct scrub_nocow_inode *nocow_inode;
+
+	nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
+	if (!nocow_inode)
+		return -ENOMEM;
+	nocow_inode->inum = inum;
+	nocow_inode->offset = offset;
+	nocow_inode->root = root;
+	list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
+	return 0;
+}
+
+#define COPY_COMPLETE 1
+
+static void copy_nocow_pages_worker(struct btrfs_work *work)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx =
+		container_of(work, struct scrub_copy_nocow_ctx, work);
+	struct scrub_ctx *sctx = nocow_ctx->sctx;
+	u64 logical = nocow_ctx->logical;
+	u64 len = nocow_ctx->len;
+	int mirror_num = nocow_ctx->mirror_num;
+	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+	int ret;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	int not_written = 0;
+
+	fs_info = sctx->dev_root->fs_info;
+	root = fs_info->extent_root;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		not_written = 1;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		not_written = 1;
+		goto out;
+	}
+
+	ret = iterate_inodes_from_logical(logical, fs_info, path,
+					  record_inode_for_nocow, nocow_ctx);
+	if (ret != 0 && ret != -ENOENT) {
+		btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
+			"phys %llu, len %llu, mir %u, ret %d",
+			logical, physical_for_dev_replace, len, mirror_num,
+			ret);
+		not_written = 1;
+		goto out;
+	}
+
+	btrfs_end_transaction(trans, root);
+	trans = NULL;
+	while (!list_empty(&nocow_ctx->inodes)) {
+		struct scrub_nocow_inode *entry;
+		entry = list_first_entry(&nocow_ctx->inodes,
+					 struct scrub_nocow_inode,
+					 list);
+		list_del_init(&entry->list);
+		ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
+						 entry->root, nocow_ctx);
+		kfree(entry);
+		if (ret == COPY_COMPLETE) {
+			ret = 0;
+			break;
+		} else if (ret) {
+			break;
+		}
+	}
+out:
+	while (!list_empty(&nocow_ctx->inodes)) {
+		struct scrub_nocow_inode *entry;
+		entry = list_first_entry(&nocow_ctx->inodes,
+					 struct scrub_nocow_inode,
+					 list);
+		list_del_init(&entry->list);
+		kfree(entry);
+	}
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, root);
+	if (not_written)
+		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
+					    num_uncorrectable_read_errors);
+
+	btrfs_free_path(path);
+	kfree(nocow_ctx);
+
+	scrub_pending_trans_workers_dec(sctx);
+}
+
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+				 u64 logical)
+{
+	struct extent_state *cached_state = NULL;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_io_tree *io_tree;
+	struct extent_map *em;
+	u64 lockstart = start, lockend = start + len - 1;
+	int ret = 0;
+
+	io_tree = &BTRFS_I(inode)->io_tree;
+
+	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		ret = 1;
+		goto out_unlock;
+	}
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_unlock;
+	}
+
+	/*
+	 * This extent does not actually cover the logical extent anymore,
+	 * move on to the next inode.
+	 */
+	if (em->block_start > logical ||
+	    em->block_start + em->block_len < logical + len) {
+		free_extent_map(em);
+		ret = 1;
+		goto out_unlock;
+	}
+	free_extent_map(em);
+
+out_unlock:
+	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+			     GFP_NOFS);
+	return ret;
+}
+
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      struct scrub_copy_nocow_ctx *nocow_ctx)
+{
+	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+	struct btrfs_key key;
+	struct inode *inode;
+	struct page *page;
+	struct btrfs_root *local_root;
+	struct extent_io_tree *io_tree;
+	u64 physical_for_dev_replace;
+	u64 nocow_ctx_logical;
+	u64 len = nocow_ctx->len;
+	unsigned long index;
+	int srcu_index;
+	int ret = 0;
+	int err = 0;
+
+	key.objectid = root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(local_root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+		return PTR_ERR(local_root);
+	}
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.objectid = inum;
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	/* Avoid truncate/dio/punch hole.. */
+	mutex_lock(&inode->i_mutex);
+	inode_dio_wait(inode);
+
+	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+	io_tree = &BTRFS_I(inode)->io_tree;
+	nocow_ctx_logical = nocow_ctx->logical;
+
+	ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+	if (ret) {
+		ret = ret > 0 ? 0 : ret;
+		goto out;
+	}
+
+	while (len >= PAGE_CACHE_SIZE) {
+		index = offset >> PAGE_CACHE_SHIFT;
+again:
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			btrfs_err(fs_info, "find_or_create_page() failed");
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		if (PageUptodate(page)) {
+			if (PageDirty(page))
+				goto next_page;
+		} else {
+			ClearPageError(page);
+			err = extent_read_full_page(io_tree, page,
+							   btrfs_get_extent,
+							   nocow_ctx->mirror_num);
+			if (err) {
+				ret = err;
+				goto next_page;
+			}
+
+			lock_page(page);
+			/*
+			 * If the page has been remove from the page cache,
+			 * the data on it is meaningless, because it may be
+			 * old one, the new data may be written into the new
+			 * page in the page cache.
+			 */
+			if (page->mapping != inode->i_mapping) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto again;
+			}
+			if (!PageUptodate(page)) {
+				ret = -EIO;
+				goto next_page;
+			}
+		}
+
+		ret = check_extent_to_block(inode, offset, len,
+					    nocow_ctx_logical);
+		if (ret) {
+			ret = ret > 0 ? 0 : ret;
+			goto next_page;
+		}
+
+		err = write_page_nocow(nocow_ctx->sctx,
+				       physical_for_dev_replace, page);
+		if (err)
+			ret = err;
+next_page:
+		unlock_page(page);
+		page_cache_release(page);
+
+		if (ret)
+			break;
+
+		offset += PAGE_CACHE_SIZE;
+		physical_for_dev_replace += PAGE_CACHE_SIZE;
+		nocow_ctx_logical += PAGE_CACHE_SIZE;
+		len -= PAGE_CACHE_SIZE;
+	}
+	ret = COPY_COMPLETE;
+out:
+	mutex_unlock(&inode->i_mutex);
+	iput(inode);
+	return ret;
+}
+
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	int ret;
+
+	dev = sctx->wr_ctx.tgtdev;
+	if (!dev)
+		return -EIO;
+	if (!dev->bdev) {
+		printk_ratelimited(KERN_WARNING
+			"BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+		return -EIO;
+	}
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+	if (!bio) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+	bio->bi_iter.bi_size = 0;
+	bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
+	bio->bi_bdev = dev->bdev;
+	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+		bio_put(bio);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		return -EIO;
+	}
+
+	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
+		goto leave_with_eio;
+
+	bio_put(bio);
+	return 0;
+}
diff --git a/kernel/fs/btrfs/send.c b/kernel/fs/btrfs/send.c
new file mode 100644
index 000000000..a1216f9b4
--- /dev/null
+++ b/kernel/fs/btrfs/send.c
@@ -0,0 +1,5962 @@
+/*
+ * Copyright (C) 2012 Alexander Block.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/bsearch.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sort.h>
+#include <linux/mount.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/radix-tree.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+
+#include "send.h"
+#include "backref.h"
+#include "hash.h"
+#include "locking.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+
+static int g_verbose = 0;
+
+#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__)
+
+/*
+ * A fs_path is a helper to dynamically build path names with unknown size.
+ * It reallocates the internal buffer on demand.
+ * It allows fast adding of path elements on the right side (normal path) and
+ * fast adding to the left side (reversed path). A reversed path can also be
+ * unreversed if needed.
+ */
+struct fs_path {
+	union {
+		struct {
+			char *start;
+			char *end;
+
+			char *buf;
+			unsigned short buf_len:15;
+			unsigned short reversed:1;
+			char inline_buf[];
+		};
+		/*
+		 * Average path length does not exceed 200 bytes, we'll have
+		 * better packing in the slab and higher chance to satisfy
+		 * a allocation later during send.
+		 */
+		char pad[256];
+	};
+};
+#define FS_PATH_INLINE_SIZE \
+	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
+
+
+/* reused for each extent */
+struct clone_root {
+	struct btrfs_root *root;
+	u64 ino;
+	u64 offset;
+
+	u64 found_refs;
+};
+
+#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
+#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+
+struct send_ctx {
+	struct file *send_filp;
+	loff_t send_off;
+	char *send_buf;
+	u32 send_size;
+	u32 send_max_size;
+	u64 total_send_size;
+	u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
+
+	struct btrfs_root *send_root;
+	struct btrfs_root *parent_root;
+	struct clone_root *clone_roots;
+	int clone_roots_cnt;
+
+	/* current state of the compare_tree call */
+	struct btrfs_path *left_path;
+	struct btrfs_path *right_path;
+	struct btrfs_key *cmp_key;
+
+	/*
+	 * infos of the currently processed inode. In case of deleted inodes,
+	 * these are the values from the deleted inode.
+	 */
+	u64 cur_ino;
+	u64 cur_inode_gen;
+	int cur_inode_new;
+	int cur_inode_new_gen;
+	int cur_inode_deleted;
+	u64 cur_inode_size;
+	u64 cur_inode_mode;
+	u64 cur_inode_rdev;
+	u64 cur_inode_last_extent;
+
+	u64 send_progress;
+
+	struct list_head new_refs;
+	struct list_head deleted_refs;
+
+	struct radix_tree_root name_cache;
+	struct list_head name_cache_list;
+	int name_cache_size;
+
+	struct file_ra_state ra;
+
+	char *read_buf;
+
+	/*
+	 * We process inodes by their increasing order, so if before an
+	 * incremental send we reverse the parent/child relationship of
+	 * directories such that a directory with a lower inode number was
+	 * the parent of a directory with a higher inode number, and the one
+	 * becoming the new parent got renamed too, we can't rename/move the
+	 * directory with lower inode number when we finish processing it - we
+	 * must process the directory with higher inode number first, then
+	 * rename/move it and then rename/move the directory with lower inode
+	 * number. Example follows.
+	 *
+	 * Tree state when the first send was performed:
+	 *
+	 * .
+	 * |-- a                   (ino 257)
+	 *     |-- b               (ino 258)
+	 *         |
+	 *         |
+	 *         |-- c           (ino 259)
+	 *         |   |-- d       (ino 260)
+	 *         |
+	 *         |-- c2          (ino 261)
+	 *
+	 * Tree state when the second (incremental) send is performed:
+	 *
+	 * .
+	 * |-- a                   (ino 257)
+	 *     |-- b               (ino 258)
+	 *         |-- c2          (ino 261)
+	 *             |-- d2      (ino 260)
+	 *                 |-- cc  (ino 259)
+	 *
+	 * The sequence of steps that lead to the second state was:
+	 *
+	 * mv /a/b/c/d /a/b/c2/d2
+	 * mv /a/b/c /a/b/c2/d2/cc
+	 *
+	 * "c" has lower inode number, but we can't move it (2nd mv operation)
+	 * before we move "d", which has higher inode number.
+	 *
+	 * So we just memorize which move/rename operations must be performed
+	 * later when their respective parent is processed and moved/renamed.
+	 */
+
+	/* Indexed by parent directory inode number. */
+	struct rb_root pending_dir_moves;
+
+	/*
+	 * Reverse index, indexed by the inode number of a directory that
+	 * is waiting for the move/rename of its immediate parent before its
+	 * own move/rename can be performed.
+	 */
+	struct rb_root waiting_dir_moves;
+
+	/*
+	 * A directory that is going to be rm'ed might have a child directory
+	 * which is in the pending directory moves index above. In this case,
+	 * the directory can only be removed after the move/rename of its child
+	 * is performed. Example:
+	 *
+	 * Parent snapshot:
+	 *
+	 * .                        (ino 256)
+	 * |-- a/                   (ino 257)
+	 *     |-- b/               (ino 258)
+	 *         |-- c/           (ino 259)
+	 *         |   |-- x/       (ino 260)
+	 *         |
+	 *         |-- y/           (ino 261)
+	 *
+	 * Send snapshot:
+	 *
+	 * .                        (ino 256)
+	 * |-- a/                   (ino 257)
+	 *     |-- b/               (ino 258)
+	 *         |-- YY/          (ino 261)
+	 *              |-- x/      (ino 260)
+	 *
+	 * Sequence of steps that lead to the send snapshot:
+	 * rm -f /a/b/c/foo.txt
+	 * mv /a/b/y /a/b/YY
+	 * mv /a/b/c/x /a/b/YY
+	 * rmdir /a/b/c
+	 *
+	 * When the child is processed, its move/rename is delayed until its
+	 * parent is processed (as explained above), but all other operations
+	 * like update utimes, chown, chgrp, etc, are performed and the paths
+	 * that it uses for those operations must use the orphanized name of
+	 * its parent (the directory we're going to rm later), so we need to
+	 * memorize that name.
+	 *
+	 * Indexed by the inode number of the directory to be deleted.
+	 */
+	struct rb_root orphan_dirs;
+};
+
+struct pending_dir_move {
+	struct rb_node node;
+	struct list_head list;
+	u64 parent_ino;
+	u64 ino;
+	u64 gen;
+	bool is_orphan;
+	struct list_head update_refs;
+};
+
+struct waiting_dir_move {
+	struct rb_node node;
+	u64 ino;
+	/*
+	 * There might be some directory that could not be removed because it
+	 * was waiting for this directory inode to be moved first. Therefore
+	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+	 */
+	u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+	struct rb_node node;
+	u64 ino;
+	u64 gen;
+};
+
+struct name_cache_entry {
+	struct list_head list;
+	/*
+	 * radix_tree has only 32bit entries but we need to handle 64bit inums.
+	 * We use the lower 32bit of the 64bit inum to store it in the tree. If
+	 * more then one inum would fall into the same entry, we use radix_list
+	 * to store the additional entries. radix_list is also used to store
+	 * entries where two entries have the same inum but different
+	 * generations.
+	 */
+	struct list_head radix_list;
+	u64 ino;
+	u64 gen;
+	u64 parent_ino;
+	u64 parent_gen;
+	int ret;
+	int need_later_update;
+	int name_len;
+	char name[];
+};
+
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
+static int need_send_hole(struct send_ctx *sctx)
+{
+	return (sctx->parent_root && !sctx->cur_inode_new &&
+		!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
+		S_ISREG(sctx->cur_inode_mode));
+}
+
+static void fs_path_reset(struct fs_path *p)
+{
+	if (p->reversed) {
+		p->start = p->buf + p->buf_len - 1;
+		p->end = p->start;
+		*p->start = 0;
+	} else {
+		p->start = p->buf;
+		p->end = p->start;
+		*p->start = 0;
+	}
+}
+
+static struct fs_path *fs_path_alloc(void)
+{
+	struct fs_path *p;
+
+	p = kmalloc(sizeof(*p), GFP_NOFS);
+	if (!p)
+		return NULL;
+	p->reversed = 0;
+	p->buf = p->inline_buf;
+	p->buf_len = FS_PATH_INLINE_SIZE;
+	fs_path_reset(p);
+	return p;
+}
+
+static struct fs_path *fs_path_alloc_reversed(void)
+{
+	struct fs_path *p;
+
+	p = fs_path_alloc();
+	if (!p)
+		return NULL;
+	p->reversed = 1;
+	fs_path_reset(p);
+	return p;
+}
+
+static void fs_path_free(struct fs_path *p)
+{
+	if (!p)
+		return;
+	if (p->buf != p->inline_buf)
+		kfree(p->buf);
+	kfree(p);
+}
+
+static int fs_path_len(struct fs_path *p)
+{
+	return p->end - p->start;
+}
+
+static int fs_path_ensure_buf(struct fs_path *p, int len)
+{
+	char *tmp_buf;
+	int path_len;
+	int old_buf_len;
+
+	len++;
+
+	if (p->buf_len >= len)
+		return 0;
+
+	if (len > PATH_MAX) {
+		WARN_ON(1);
+		return -ENOMEM;
+	}
+
+	path_len = p->end - p->start;
+	old_buf_len = p->buf_len;
+
+	/*
+	 * First time the inline_buf does not suffice
+	 */
+	if (p->buf == p->inline_buf) {
+		tmp_buf = kmalloc(len, GFP_NOFS);
+		if (tmp_buf)
+			memcpy(tmp_buf, p->buf, old_buf_len);
+	} else {
+		tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+	}
+	if (!tmp_buf)
+		return -ENOMEM;
+	p->buf = tmp_buf;
+	/*
+	 * The real size of the buffer is bigger, this will let the fast path
+	 * happen most of the time
+	 */
+	p->buf_len = ksize(p->buf);
+
+	if (p->reversed) {
+		tmp_buf = p->buf + old_buf_len - path_len - 1;
+		p->end = p->buf + p->buf_len - 1;
+		p->start = p->end - path_len;
+		memmove(p->start, tmp_buf, path_len + 1);
+	} else {
+		p->start = p->buf;
+		p->end = p->start + path_len;
+	}
+	return 0;
+}
+
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+				   char **prepared)
+{
+	int ret;
+	int new_len;
+
+	new_len = p->end - p->start + name_len;
+	if (p->start != p->end)
+		new_len++;
+	ret = fs_path_ensure_buf(p, new_len);
+	if (ret < 0)
+		goto out;
+
+	if (p->reversed) {
+		if (p->start != p->end)
+			*--p->start = '/';
+		p->start -= name_len;
+		*prepared = p->start;
+	} else {
+		if (p->start != p->end)
+			*p->end++ = '/';
+		*prepared = p->end;
+		p->end += name_len;
+		*p->end = 0;
+	}
+
+out:
+	return ret;
+}
+
+static int fs_path_add(struct fs_path *p, const char *name, int name_len)
+{
+	int ret;
+	char *prepared;
+
+	ret = fs_path_prepare_for_add(p, name_len, &prepared);
+	if (ret < 0)
+		goto out;
+	memcpy(prepared, name, name_len);
+
+out:
+	return ret;
+}
+
+static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
+{
+	int ret;
+	char *prepared;
+
+	ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
+	if (ret < 0)
+		goto out;
+	memcpy(prepared, p2->start, p2->end - p2->start);
+
+out:
+	return ret;
+}
+
+static int fs_path_add_from_extent_buffer(struct fs_path *p,
+					  struct extent_buffer *eb,
+					  unsigned long off, int len)
+{
+	int ret;
+	char *prepared;
+
+	ret = fs_path_prepare_for_add(p, len, &prepared);
+	if (ret < 0)
+		goto out;
+
+	read_extent_buffer(eb, prepared, off, len);
+
+out:
+	return ret;
+}
+
+static int fs_path_copy(struct fs_path *p, struct fs_path *from)
+{
+	int ret;
+
+	p->reversed = from->reversed;
+	fs_path_reset(p);
+
+	ret = fs_path_add_path(p, from);
+
+	return ret;
+}
+
+
+static void fs_path_unreverse(struct fs_path *p)
+{
+	char *tmp;
+	int len;
+
+	if (!p->reversed)
+		return;
+
+	tmp = p->start;
+	len = p->end - p->start;
+	p->start = p->buf;
+	p->end = p->start + len;
+	memmove(p->start, tmp, len + 1);
+	p->reversed = 0;
+}
+
+static struct btrfs_path *alloc_path_for_send(void)
+{
+	struct btrfs_path *path;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return NULL;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	path->need_commit_sem = 1;
+	return path;
+}
+
+static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
+{
+	int ret;
+	mm_segment_t old_fs;
+	u32 pos = 0;
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	while (pos < len) {
+		ret = vfs_write(filp, (__force const char __user *)buf + pos,
+				len - pos, off);
+		/* TODO handle that correctly */
+		/*if (ret == -ERESTARTSYS) {
+			continue;
+		}*/
+		if (ret < 0)
+			goto out;
+		if (ret == 0) {
+			ret = -EIO;
+			goto out;
+		}
+		pos += ret;
+	}
+
+	ret = 0;
+
+out:
+	set_fs(old_fs);
+	return ret;
+}
+
+static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
+{
+	struct btrfs_tlv_header *hdr;
+	int total_len = sizeof(*hdr) + len;
+	int left = sctx->send_max_size - sctx->send_size;
+
+	if (unlikely(left < total_len))
+		return -EOVERFLOW;
+
+	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
+	hdr->tlv_type = cpu_to_le16(attr);
+	hdr->tlv_len = cpu_to_le16(len);
+	memcpy(hdr + 1, data, len);
+	sctx->send_size += total_len;
+
+	return 0;
+}
+
+#define TLV_PUT_DEFINE_INT(bits) \
+	static int tlv_put_u##bits(struct send_ctx *sctx,	 	\
+			u##bits attr, u##bits value)			\
+	{								\
+		__le##bits __tmp = cpu_to_le##bits(value);		\
+		return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));	\
+	}
+
+TLV_PUT_DEFINE_INT(64)
+
+static int tlv_put_string(struct send_ctx *sctx, u16 attr,
+			  const char *str, int len)
+{
+	if (len == -1)
+		len = strlen(str);
+	return tlv_put(sctx, attr, str, len);
+}
+
+static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
+			const u8 *uuid)
+{
+	return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
+}
+
+static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
+				  struct extent_buffer *eb,
+				  struct btrfs_timespec *ts)
+{
+	struct btrfs_timespec bts;
+	read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
+	return tlv_put(sctx, attr, &bts, sizeof(bts));
+}
+
+
+#define TLV_PUT(sctx, attrtype, attrlen, data) \
+	do { \
+		ret = tlv_put(sctx, attrtype, attrlen, data); \
+		if (ret < 0) \
+			goto tlv_put_failure; \
+	} while (0)
+
+#define TLV_PUT_INT(sctx, attrtype, bits, value) \
+	do { \
+		ret = tlv_put_u##bits(sctx, attrtype, value); \
+		if (ret < 0) \
+			goto tlv_put_failure; \
+	} while (0)
+
+#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
+#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
+#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
+#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
+#define TLV_PUT_STRING(sctx, attrtype, str, len) \
+	do { \
+		ret = tlv_put_string(sctx, attrtype, str, len); \
+		if (ret < 0) \
+			goto tlv_put_failure; \
+	} while (0)
+#define TLV_PUT_PATH(sctx, attrtype, p) \
+	do { \
+		ret = tlv_put_string(sctx, attrtype, p->start, \
+			p->end - p->start); \
+		if (ret < 0) \
+			goto tlv_put_failure; \
+	} while(0)
+#define TLV_PUT_UUID(sctx, attrtype, uuid) \
+	do { \
+		ret = tlv_put_uuid(sctx, attrtype, uuid); \
+		if (ret < 0) \
+			goto tlv_put_failure; \
+	} while (0)
+#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
+	do { \
+		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
+		if (ret < 0) \
+			goto tlv_put_failure; \
+	} while (0)
+
+static int send_header(struct send_ctx *sctx)
+{
+	struct btrfs_stream_header hdr;
+
+	strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
+	hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
+
+	return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
+					&sctx->send_off);
+}
+
+/*
+ * For each command/item we want to send to userspace, we call this function.
+ */
+static int begin_cmd(struct send_ctx *sctx, int cmd)
+{
+	struct btrfs_cmd_header *hdr;
+
+	if (WARN_ON(!sctx->send_buf))
+		return -EINVAL;
+
+	BUG_ON(sctx->send_size);
+
+	sctx->send_size += sizeof(*hdr);
+	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+	hdr->cmd = cpu_to_le16(cmd);
+
+	return 0;
+}
+
+static int send_cmd(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_cmd_header *hdr;
+	u32 crc;
+
+	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+	hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
+	hdr->crc = 0;
+
+	crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+	hdr->crc = cpu_to_le32(crc);
+
+	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+					&sctx->send_off);
+
+	sctx->total_send_size += sctx->send_size;
+	sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+	sctx->send_size = 0;
+
+	return ret;
+}
+
+/*
+ * Sends a move instruction to user space
+ */
+static int send_rename(struct send_ctx *sctx,
+		     struct fs_path *from, struct fs_path *to)
+{
+	int ret;
+
+verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start);
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	return ret;
+}
+
+/*
+ * Sends a link instruction to user space
+ */
+static int send_link(struct send_ctx *sctx,
+		     struct fs_path *path, struct fs_path *lnk)
+{
+	int ret;
+
+verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start);
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	return ret;
+}
+
+/*
+ * Sends an unlink instruction to user space
+ */
+static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
+{
+	int ret;
+
+verbose_printk("btrfs: send_unlink %s\n", path->start);
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	return ret;
+}
+
+/*
+ * Sends a rmdir instruction to user space
+ */
+static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
+{
+	int ret;
+
+verbose_printk("btrfs: send_rmdir %s\n", path->start);
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	return ret;
+}
+
+/*
+ * Helper function to retrieve some fields from an inode item.
+ */
+static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
+			  u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
+			  u64 *gid, u64 *rdev)
+{
+	int ret;
+	struct btrfs_inode_item *ii;
+	struct btrfs_key key;
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		return ret;
+	}
+
+	ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			struct btrfs_inode_item);
+	if (size)
+		*size = btrfs_inode_size(path->nodes[0], ii);
+	if (gen)
+		*gen = btrfs_inode_generation(path->nodes[0], ii);
+	if (mode)
+		*mode = btrfs_inode_mode(path->nodes[0], ii);
+	if (uid)
+		*uid = btrfs_inode_uid(path->nodes[0], ii);
+	if (gid)
+		*gid = btrfs_inode_gid(path->nodes[0], ii);
+	if (rdev)
+		*rdev = btrfs_inode_rdev(path->nodes[0], ii);
+
+	return ret;
+}
+
+static int get_inode_info(struct btrfs_root *root,
+			  u64 ino, u64 *size, u64 *gen,
+			  u64 *mode, u64 *uid, u64 *gid,
+			  u64 *rdev)
+{
+	struct btrfs_path *path;
+	int ret;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+	ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
+			       rdev);
+	btrfs_free_path(path);
+	return ret;
+}
+
+typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
+				   struct fs_path *p,
+				   void *ctx);
+
+/*
+ * Helper function to iterate the entries in ONE btrfs_inode_ref or
+ * btrfs_inode_extref.
+ * The iterate callback may return a non zero value to stop iteration. This can
+ * be a negative value for error codes or 1 to simply stop it.
+ *
+ * path must point to the INODE_REF or INODE_EXTREF when called.
+ */
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
+			     struct btrfs_key *found_key, int resolve,
+			     iterate_inode_ref_t iterate, void *ctx)
+{
+	struct extent_buffer *eb = path->nodes[0];
+	struct btrfs_item *item;
+	struct btrfs_inode_ref *iref;
+	struct btrfs_inode_extref *extref;
+	struct btrfs_path *tmp_path;
+	struct fs_path *p;
+	u32 cur = 0;
+	u32 total;
+	int slot = path->slots[0];
+	u32 name_len;
+	char *start;
+	int ret = 0;
+	int num = 0;
+	int index;
+	u64 dir;
+	unsigned long name_off;
+	unsigned long elem_size;
+	unsigned long ptr;
+
+	p = fs_path_alloc_reversed();
+	if (!p)
+		return -ENOMEM;
+
+	tmp_path = alloc_path_for_send();
+	if (!tmp_path) {
+		fs_path_free(p);
+		return -ENOMEM;
+	}
+
+
+	if (found_key->type == BTRFS_INODE_REF_KEY) {
+		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
+						    struct btrfs_inode_ref);
+		item = btrfs_item_nr(slot);
+		total = btrfs_item_size(eb, item);
+		elem_size = sizeof(*iref);
+	} else {
+		ptr = btrfs_item_ptr_offset(eb, slot);
+		total = btrfs_item_size_nr(eb, slot);
+		elem_size = sizeof(*extref);
+	}
+
+	while (cur < total) {
+		fs_path_reset(p);
+
+		if (found_key->type == BTRFS_INODE_REF_KEY) {
+			iref = (struct btrfs_inode_ref *)(ptr + cur);
+			name_len = btrfs_inode_ref_name_len(eb, iref);
+			name_off = (unsigned long)(iref + 1);
+			index = btrfs_inode_ref_index(eb, iref);
+			dir = found_key->offset;
+		} else {
+			extref = (struct btrfs_inode_extref *)(ptr + cur);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			name_off = (unsigned long)&extref->name;
+			index = btrfs_inode_extref_index(eb, extref);
+			dir = btrfs_inode_extref_parent(eb, extref);
+		}
+
+		if (resolve) {
+			start = btrfs_ref_to_path(root, tmp_path, name_len,
+						  name_off, eb, dir,
+						  p->buf, p->buf_len);
+			if (IS_ERR(start)) {
+				ret = PTR_ERR(start);
+				goto out;
+			}
+			if (start < p->buf) {
+				/* overflow , try again with larger buffer */
+				ret = fs_path_ensure_buf(p,
+						p->buf_len + p->buf - start);
+				if (ret < 0)
+					goto out;
+				start = btrfs_ref_to_path(root, tmp_path,
+							  name_len, name_off,
+							  eb, dir,
+							  p->buf, p->buf_len);
+				if (IS_ERR(start)) {
+					ret = PTR_ERR(start);
+					goto out;
+				}
+				BUG_ON(start < p->buf);
+			}
+			p->start = start;
+		} else {
+			ret = fs_path_add_from_extent_buffer(p, eb, name_off,
+							     name_len);
+			if (ret < 0)
+				goto out;
+		}
+
+		cur += elem_size + name_len;
+		ret = iterate(num, dir, index, p, ctx);
+		if (ret)
+			goto out;
+		num++;
+	}
+
+out:
+	btrfs_free_path(tmp_path);
+	fs_path_free(p);
+	return ret;
+}
+
+typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
+				  const char *name, int name_len,
+				  const char *data, int data_len,
+				  u8 type, void *ctx);
+
+/*
+ * Helper function to iterate the entries in ONE btrfs_dir_item.
+ * The iterate callback may return a non zero value to stop iteration. This can
+ * be a negative value for error codes or 1 to simply stop it.
+ *
+ * path must point to the dir item when called.
+ */
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *found_key,
+			    iterate_dir_item_t iterate, void *ctx)
+{
+	int ret = 0;
+	struct extent_buffer *eb;
+	struct btrfs_item *item;
+	struct btrfs_dir_item *di;
+	struct btrfs_key di_key;
+	char *buf = NULL;
+	int buf_len;
+	u32 name_len;
+	u32 data_len;
+	u32 cur;
+	u32 len;
+	u32 total;
+	int slot;
+	int num;
+	u8 type;
+
+	/*
+	 * Start with a small buffer (1 page). If later we end up needing more
+	 * space, which can happen for xattrs on a fs with a leaf size greater
+	 * then the page size, attempt to increase the buffer. Typically xattr
+	 * values are small.
+	 */
+	buf_len = PATH_MAX;
+	buf = kmalloc(buf_len, GFP_NOFS);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	item = btrfs_item_nr(slot);
+	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+	cur = 0;
+	len = 0;
+	total = btrfs_item_size(eb, item);
+
+	num = 0;
+	while (cur < total) {
+		name_len = btrfs_dir_name_len(eb, di);
+		data_len = btrfs_dir_data_len(eb, di);
+		type = btrfs_dir_type(eb, di);
+		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+		if (type == BTRFS_FT_XATTR) {
+			if (name_len > XATTR_NAME_MAX) {
+				ret = -ENAMETOOLONG;
+				goto out;
+			}
+			if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
+				ret = -E2BIG;
+				goto out;
+			}
+		} else {
+			/*
+			 * Path too long
+			 */
+			if (name_len + data_len > PATH_MAX) {
+				ret = -ENAMETOOLONG;
+				goto out;
+			}
+		}
+
+		if (name_len + data_len > buf_len) {
+			buf_len = name_len + data_len;
+			if (is_vmalloc_addr(buf)) {
+				vfree(buf);
+				buf = NULL;
+			} else {
+				char *tmp = krealloc(buf, buf_len,
+						     GFP_NOFS | __GFP_NOWARN);
+
+				if (!tmp)
+					kfree(buf);
+				buf = tmp;
+			}
+			if (!buf) {
+				buf = vmalloc(buf_len);
+				if (!buf) {
+					ret = -ENOMEM;
+					goto out;
+				}
+			}
+		}
+
+		read_extent_buffer(eb, buf, (unsigned long)(di + 1),
+				name_len + data_len);
+
+		len = sizeof(*di) + name_len + data_len;
+		di = (struct btrfs_dir_item *)((char *)di + len);
+		cur += len;
+
+		ret = iterate(num, &di_key, buf, name_len, buf + name_len,
+				data_len, type, ctx);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+
+		num++;
+	}
+
+out:
+	kvfree(buf);
+	return ret;
+}
+
+static int __copy_first_ref(int num, u64 dir, int index,
+			    struct fs_path *p, void *ctx)
+{
+	int ret;
+	struct fs_path *pt = ctx;
+
+	ret = fs_path_copy(pt, p);
+	if (ret < 0)
+		return ret;
+
+	/* we want the first only */
+	return 1;
+}
+
+/*
+ * Retrieve the first path of an inode. If an inode has more then one
+ * ref/hardlink, this is ignored.
+ */
+static int get_inode_path(struct btrfs_root *root,
+			  u64 ino, struct fs_path *path)
+{
+	int ret;
+	struct btrfs_key key, found_key;
+	struct btrfs_path *p;
+
+	p = alloc_path_for_send();
+	if (!p)
+		return -ENOMEM;
+
+	fs_path_reset(path);
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
+	if (ret < 0)
+		goto out;
+	if (ret) {
+		ret = 1;
+		goto out;
+	}
+	btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
+	if (found_key.objectid != ino ||
+	    (found_key.type != BTRFS_INODE_REF_KEY &&
+	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = iterate_inode_ref(root, p, &found_key, 1,
+				__copy_first_ref, path);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+
+out:
+	btrfs_free_path(p);
+	return ret;
+}
+
+struct backref_ctx {
+	struct send_ctx *sctx;
+
+	struct btrfs_path *path;
+	/* number of total found references */
+	u64 found;
+
+	/*
+	 * used for clones found in send_root. clones found behind cur_objectid
+	 * and cur_offset are not considered as allowed clones.
+	 */
+	u64 cur_objectid;
+	u64 cur_offset;
+
+	/* may be truncated in case it's the last extent in a file */
+	u64 extent_len;
+
+	/* Just to check for bugs in backref resolving */
+	int found_itself;
+};
+
+static int __clone_root_cmp_bsearch(const void *key, const void *elt)
+{
+	u64 root = (u64)(uintptr_t)key;
+	struct clone_root *cr = (struct clone_root *)elt;
+
+	if (root < cr->root->objectid)
+		return -1;
+	if (root > cr->root->objectid)
+		return 1;
+	return 0;
+}
+
+static int __clone_root_cmp_sort(const void *e1, const void *e2)
+{
+	struct clone_root *cr1 = (struct clone_root *)e1;
+	struct clone_root *cr2 = (struct clone_root *)e2;
+
+	if (cr1->root->objectid < cr2->root->objectid)
+		return -1;
+	if (cr1->root->objectid > cr2->root->objectid)
+		return 1;
+	return 0;
+}
+
+/*
+ * Called for every backref that is found for the current extent.
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs
+ */
+static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+{
+	struct backref_ctx *bctx = ctx_;
+	struct clone_root *found;
+	int ret;
+	u64 i_size;
+
+	/* First check if the root is in the list of accepted clone sources */
+	found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
+			bctx->sctx->clone_roots_cnt,
+			sizeof(struct clone_root),
+			__clone_root_cmp_bsearch);
+	if (!found)
+		return 0;
+
+	if (found->root == bctx->sctx->send_root &&
+	    ino == bctx->cur_objectid &&
+	    offset == bctx->cur_offset) {
+		bctx->found_itself = 1;
+	}
+
+	/*
+	 * There are inodes that have extents that lie behind its i_size. Don't
+	 * accept clones from these extents.
+	 */
+	ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
+			       NULL, NULL, NULL);
+	btrfs_release_path(bctx->path);
+	if (ret < 0)
+		return ret;
+
+	if (offset + bctx->extent_len > i_size)
+		return 0;
+
+	/*
+	 * Make sure we don't consider clones from send_root that are
+	 * behind the current inode/offset.
+	 */
+	if (found->root == bctx->sctx->send_root) {
+		/*
+		 * TODO for the moment we don't accept clones from the inode
+		 * that is currently send. We may change this when
+		 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
+		 * file.
+		 */
+		if (ino >= bctx->cur_objectid)
+			return 0;
+#if 0
+		if (ino > bctx->cur_objectid)
+			return 0;
+		if (offset + bctx->extent_len > bctx->cur_offset)
+			return 0;
+#endif
+	}
+
+	bctx->found++;
+	found->found_refs++;
+	if (ino < found->ino) {
+		found->ino = ino;
+		found->offset = offset;
+	} else if (found->ino == ino) {
+		/*
+		 * same extent found more then once in the same file.
+		 */
+		if (found->offset > offset + bctx->extent_len)
+			found->offset = offset;
+	}
+
+	return 0;
+}
+
+/*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending is at the
+ * moment. This means, that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
+ * path must point to the extent item when called.
+ */
+static int find_extent_clone(struct send_ctx *sctx,
+			     struct btrfs_path *path,
+			     u64 ino, u64 data_offset,
+			     u64 ino_size,
+			     struct clone_root **found)
+{
+	int ret;
+	int extent_type;
+	u64 logical;
+	u64 disk_byte;
+	u64 num_bytes;
+	u64 extent_item_pos;
+	u64 flags = 0;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *eb = path->nodes[0];
+	struct backref_ctx *backref_ctx = NULL;
+	struct clone_root *cur_clone_root;
+	struct btrfs_key found_key;
+	struct btrfs_path *tmp_path;
+	int compressed;
+	u32 i;
+
+	tmp_path = alloc_path_for_send();
+	if (!tmp_path)
+		return -ENOMEM;
+
+	/* We only use this path under the commit sem */
+	tmp_path->need_commit_sem = 0;
+
+	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+	if (!backref_ctx) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	backref_ctx->path = tmp_path;
+
+	if (data_offset >= ino_size) {
+		/*
+		 * There may be extents that lie behind the file's size.
+		 * I at least had this in combination with snapshotting while
+		 * writing large files.
+		 */
+		ret = 0;
+		goto out;
+	}
+
+	fi = btrfs_item_ptr(eb, path->slots[0],
+			struct btrfs_file_extent_item);
+	extent_type = btrfs_file_extent_type(eb, fi);
+	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+		ret = -ENOENT;
+		goto out;
+	}
+	compressed = btrfs_file_extent_compression(eb, fi);
+
+	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
+	disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+	if (disk_byte == 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+	logical = disk_byte + btrfs_file_extent_offset(eb, fi);
+
+	down_read(&sctx->send_root->fs_info->commit_root_sem);
+	ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+				  &found_key, &flags);
+	up_read(&sctx->send_root->fs_info->commit_root_sem);
+	btrfs_release_path(tmp_path);
+
+	if (ret < 0)
+		goto out;
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		ret = -EIO;
+		goto out;
+	}
+
+	/*
+	 * Setup the clone roots.
+	 */
+	for (i = 0; i < sctx->clone_roots_cnt; i++) {
+		cur_clone_root = sctx->clone_roots + i;
+		cur_clone_root->ino = (u64)-1;
+		cur_clone_root->offset = 0;
+		cur_clone_root->found_refs = 0;
+	}
+
+	backref_ctx->sctx = sctx;
+	backref_ctx->found = 0;
+	backref_ctx->cur_objectid = ino;
+	backref_ctx->cur_offset = data_offset;
+	backref_ctx->found_itself = 0;
+	backref_ctx->extent_len = num_bytes;
+
+	/*
+	 * The last extent of a file may be too large due to page alignment.
+	 * We need to adjust extent_len in this case so that the checks in
+	 * __iterate_backrefs work.
+	 */
+	if (data_offset + num_bytes >= ino_size)
+		backref_ctx->extent_len = ino_size - data_offset;
+
+	/*
+	 * Now collect all backrefs.
+	 */
+	if (compressed == BTRFS_COMPRESS_NONE)
+		extent_item_pos = logical - found_key.objectid;
+	else
+		extent_item_pos = 0;
+	ret = iterate_extent_inodes(sctx->send_root->fs_info,
+					found_key.objectid, extent_item_pos, 1,
+					__iterate_backrefs, backref_ctx);
+
+	if (ret < 0)
+		goto out;
+
+	if (!backref_ctx->found_itself) {
+		/* found a bug in backref code? */
+		ret = -EIO;
+		btrfs_err(sctx->send_root->fs_info, "did not find backref in "
+				"send_root. inode=%llu, offset=%llu, "
+				"disk_byte=%llu found extent=%llu",
+				ino, data_offset, disk_byte, found_key.objectid);
+		goto out;
+	}
+
+verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
+		"ino=%llu, "
+		"num_bytes=%llu, logical=%llu\n",
+		data_offset, ino, num_bytes, logical);
+
+	if (!backref_ctx->found)
+		verbose_printk("btrfs:    no clones found\n");
+
+	cur_clone_root = NULL;
+	for (i = 0; i < sctx->clone_roots_cnt; i++) {
+		if (sctx->clone_roots[i].found_refs) {
+			if (!cur_clone_root)
+				cur_clone_root = sctx->clone_roots + i;
+			else if (sctx->clone_roots[i].root == sctx->send_root)
+				/* prefer clones from send_root over others */
+				cur_clone_root = sctx->clone_roots + i;
+		}
+
+	}
+
+	if (cur_clone_root) {
+		if (compressed != BTRFS_COMPRESS_NONE) {
+			/*
+			 * Offsets given by iterate_extent_inodes() are relative
+			 * to the start of the extent, we need to add logical
+			 * offset from the file extent item.
+			 * (See why at backref.c:check_extent_in_eb())
+			 */
+			cur_clone_root->offset += btrfs_file_extent_offset(eb,
+									   fi);
+		}
+		*found = cur_clone_root;
+		ret = 0;
+	} else {
+		ret = -ENOENT;
+	}
+
+out:
+	btrfs_free_path(tmp_path);
+	kfree(backref_ctx);
+	return ret;
+}
+
+static int read_symlink(struct btrfs_root *root,
+			u64 ino,
+			struct fs_path *dest)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *ei;
+	u8 type;
+	u8 compression;
+	unsigned long off;
+	int len;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret);
+
+	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			struct btrfs_file_extent_item);
+	type = btrfs_file_extent_type(path->nodes[0], ei);
+	compression = btrfs_file_extent_compression(path->nodes[0], ei);
+	BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
+	BUG_ON(compression);
+
+	off = btrfs_file_extent_inline_start(ei);
+	len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
+
+	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Helper function to generate a file name that is unique in the root of
+ * send_root and parent_root. This is used to generate names for orphan inodes.
+ */
+static int gen_unique_name(struct send_ctx *sctx,
+			   u64 ino, u64 gen,
+			   struct fs_path *dest)
+{
+	int ret = 0;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
+	char tmp[64];
+	int len;
+	u64 idx = 0;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	while (1) {
+		len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
+				ino, gen, idx);
+		ASSERT(len < sizeof(tmp));
+
+		di = btrfs_lookup_dir_item(NULL, sctx->send_root,
+				path, BTRFS_FIRST_FREE_OBJECTID,
+				tmp, strlen(tmp), 0);
+		btrfs_release_path(path);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		}
+		if (di) {
+			/* not unique, try again */
+			idx++;
+			continue;
+		}
+
+		if (!sctx->parent_root) {
+			/* unique */
+			ret = 0;
+			break;
+		}
+
+		di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
+				path, BTRFS_FIRST_FREE_OBJECTID,
+				tmp, strlen(tmp), 0);
+		btrfs_release_path(path);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		}
+		if (di) {
+			/* not unique, try again */
+			idx++;
+			continue;
+		}
+		/* unique */
+		break;
+	}
+
+	ret = fs_path_add(dest, tmp, strlen(tmp));
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+enum inode_state {
+	inode_state_no_change,
+	inode_state_will_create,
+	inode_state_did_create,
+	inode_state_will_delete,
+	inode_state_did_delete,
+};
+
+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+	int ret;
+	int left_ret;
+	int right_ret;
+	u64 left_gen;
+	u64 right_gen;
+
+	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
+			NULL, NULL);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+	left_ret = ret;
+
+	if (!sctx->parent_root) {
+		right_ret = -ENOENT;
+	} else {
+		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
+				NULL, NULL, NULL, NULL);
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+		right_ret = ret;
+	}
+
+	if (!left_ret && !right_ret) {
+		if (left_gen == gen && right_gen == gen) {
+			ret = inode_state_no_change;
+		} else if (left_gen == gen) {
+			if (ino < sctx->send_progress)
+				ret = inode_state_did_create;
+			else
+				ret = inode_state_will_create;
+		} else if (right_gen == gen) {
+			if (ino < sctx->send_progress)
+				ret = inode_state_did_delete;
+			else
+				ret = inode_state_will_delete;
+		} else  {
+			ret = -ENOENT;
+		}
+	} else if (!left_ret) {
+		if (left_gen == gen) {
+			if (ino < sctx->send_progress)
+				ret = inode_state_did_create;
+			else
+				ret = inode_state_will_create;
+		} else {
+			ret = -ENOENT;
+		}
+	} else if (!right_ret) {
+		if (right_gen == gen) {
+			if (ino < sctx->send_progress)
+				ret = inode_state_did_delete;
+			else
+				ret = inode_state_will_delete;
+		} else {
+			ret = -ENOENT;
+		}
+	} else {
+		ret = -ENOENT;
+	}
+
+out:
+	return ret;
+}
+
+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+	int ret;
+
+	ret = get_cur_inode_state(sctx, ino, gen);
+	if (ret < 0)
+		goto out;
+
+	if (ret == inode_state_no_change ||
+	    ret == inode_state_did_create ||
+	    ret == inode_state_will_delete)
+		ret = 1;
+	else
+		ret = 0;
+
+out:
+	return ret;
+}
+
+/*
+ * Helper function to lookup a dir item in a dir.
+ */
+static int lookup_dir_item_inode(struct btrfs_root *root,
+				 u64 dir, const char *name, int name_len,
+				 u64 *found_inode,
+				 u8 *found_type)
+{
+	int ret = 0;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	di = btrfs_lookup_dir_item(NULL, root, path,
+			dir, name, name_len, 0);
+	if (!di) {
+		ret = -ENOENT;
+		goto out;
+	}
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
+	}
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+	if (key.type == BTRFS_ROOT_ITEM_KEY) {
+		ret = -ENOENT;
+		goto out;
+	}
+	*found_inode = key.objectid;
+	*found_type = btrfs_dir_type(path->nodes[0], di);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
+ * generation of the parent dir and the name of the dir entry.
+ */
+static int get_first_ref(struct btrfs_root *root, u64 ino,
+			 u64 *dir, u64 *dir_gen, struct fs_path *name)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	int len;
+	u64 parent_dir;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+	if (ret < 0)
+		goto out;
+	if (!ret)
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				path->slots[0]);
+	if (ret || found_key.objectid != ino ||
+	    (found_key.type != BTRFS_INODE_REF_KEY &&
+	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (found_key.type == BTRFS_INODE_REF_KEY) {
+		struct btrfs_inode_ref *iref;
+		iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				      struct btrfs_inode_ref);
+		len = btrfs_inode_ref_name_len(path->nodes[0], iref);
+		ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+						     (unsigned long)(iref + 1),
+						     len);
+		parent_dir = found_key.offset;
+	} else {
+		struct btrfs_inode_extref *extref;
+		extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					struct btrfs_inode_extref);
+		len = btrfs_inode_extref_name_len(path->nodes[0], extref);
+		ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+					(unsigned long)&extref->name, len);
+		parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
+	}
+	if (ret < 0)
+		goto out;
+	btrfs_release_path(path);
+
+	if (dir_gen) {
+		ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
+				     NULL, NULL, NULL);
+		if (ret < 0)
+			goto out;
+	}
+
+	*dir = parent_dir;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int is_first_ref(struct btrfs_root *root,
+			u64 ino, u64 dir,
+			const char *name, int name_len)
+{
+	int ret;
+	struct fs_path *tmp_name;
+	u64 tmp_dir;
+
+	tmp_name = fs_path_alloc();
+	if (!tmp_name)
+		return -ENOMEM;
+
+	ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
+	if (ret < 0)
+		goto out;
+
+	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = !memcmp(tmp_name->start, name, name_len);
+
+out:
+	fs_path_free(tmp_name);
+	return ret;
+}
+
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
+static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+			      const char *name, int name_len,
+			      u64 *who_ino, u64 *who_gen)
+{
+	int ret = 0;
+	u64 gen;
+	u64 other_inode = 0;
+	u8 other_type = 0;
+
+	if (!sctx->parent_root)
+		goto out;
+
+	ret = is_inode_existent(sctx, dir, dir_gen);
+	if (ret <= 0)
+		goto out;
+
+	/*
+	 * If we have a parent root we need to verify that the parent dir was
+	 * not delted and then re-created, if it was then we have no overwrite
+	 * and we can just unlink this entry.
+	 */
+	if (sctx->parent_root) {
+		ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
+				     NULL, NULL, NULL);
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+		if (gen != dir_gen)
+			goto out;
+	}
+
+	ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
+			&other_inode, &other_type);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+	if (ret) {
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * Check if the overwritten ref was already processed. If yes, the ref
+	 * was already unlinked/moved, so we can safely assume that we will not
+	 * overwrite anything at this point in time.
+	 */
+	if (other_inode > sctx->send_progress) {
+		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
+				who_gen, NULL, NULL, NULL, NULL);
+		if (ret < 0)
+			goto out;
+
+		ret = 1;
+		*who_ino = other_inode;
+	} else {
+		ret = 0;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
+static int did_overwrite_ref(struct send_ctx *sctx,
+			    u64 dir, u64 dir_gen,
+			    u64 ino, u64 ino_gen,
+			    const char *name, int name_len)
+{
+	int ret = 0;
+	u64 gen;
+	u64 ow_inode;
+	u8 other_type;
+
+	if (!sctx->parent_root)
+		goto out;
+
+	ret = is_inode_existent(sctx, dir, dir_gen);
+	if (ret <= 0)
+		goto out;
+
+	/* check if the ref was overwritten by another ref */
+	ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
+			&ow_inode, &other_type);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+	if (ret) {
+		/* was never and will never be overwritten */
+		ret = 0;
+		goto out;
+	}
+
+	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
+			NULL, NULL);
+	if (ret < 0)
+		goto out;
+
+	if (ow_inode == ino && gen == ino_gen) {
+		ret = 0;
+		goto out;
+	}
+
+	/* we know that it is or will be overwritten. check this now */
+	if (ow_inode < sctx->send_progress)
+		ret = 1;
+	else
+		ret = 0;
+
+out:
+	return ret;
+}
+
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
+static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+	int ret = 0;
+	struct fs_path *name = NULL;
+	u64 dir;
+	u64 dir_gen;
+
+	if (!sctx->parent_root)
+		goto out;
+
+	name = fs_path_alloc();
+	if (!name)
+		return -ENOMEM;
+
+	ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
+	if (ret < 0)
+		goto out;
+
+	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
+			name->start, fs_path_len(name));
+
+out:
+	fs_path_free(name);
+	return ret;
+}
+
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
+ */
+static int name_cache_insert(struct send_ctx *sctx,
+			     struct name_cache_entry *nce)
+{
+	int ret = 0;
+	struct list_head *nce_head;
+
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	if (!nce_head) {
+		nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+		if (!nce_head) {
+			kfree(nce);
+			return -ENOMEM;
+		}
+		INIT_LIST_HEAD(nce_head);
+
+		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+		if (ret < 0) {
+			kfree(nce_head);
+			kfree(nce);
+			return ret;
+		}
+	}
+	list_add_tail(&nce->radix_list, nce_head);
+	list_add_tail(&nce->list, &sctx->name_cache_list);
+	sctx->name_cache_size++;
+
+	return ret;
+}
+
+static void name_cache_delete(struct send_ctx *sctx,
+			      struct name_cache_entry *nce)
+{
+	struct list_head *nce_head;
+
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	if (!nce_head) {
+		btrfs_err(sctx->send_root->fs_info,
+	      "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+			nce->ino, sctx->name_cache_size);
+	}
+
+	list_del(&nce->radix_list);
+	list_del(&nce->list);
+	sctx->name_cache_size--;
+
+	/*
+	 * We may not get to the final release of nce_head if the lookup fails
+	 */
+	if (nce_head && list_empty(nce_head)) {
+		radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+		kfree(nce_head);
+	}
+}
+
+static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
+						    u64 ino, u64 gen)
+{
+	struct list_head *nce_head;
+	struct name_cache_entry *cur;
+
+	nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+	if (!nce_head)
+		return NULL;
+
+	list_for_each_entry(cur, nce_head, radix_list) {
+		if (cur->ino == ino && cur->gen == gen)
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
+static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
+{
+	list_del(&nce->list);
+	list_add_tail(&nce->list, &sctx->name_cache_list);
+}
+
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
+static void name_cache_clean_unused(struct send_ctx *sctx)
+{
+	struct name_cache_entry *nce;
+
+	if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
+		return;
+
+	while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
+		nce = list_entry(sctx->name_cache_list.next,
+				struct name_cache_entry, list);
+		name_cache_delete(sctx, nce);
+		kfree(nce);
+	}
+}
+
+static void name_cache_free(struct send_ctx *sctx)
+{
+	struct name_cache_entry *nce;
+
+	while (!list_empty(&sctx->name_cache_list)) {
+		nce = list_entry(sctx->name_cache_list.next,
+				struct name_cache_entry, list);
+		name_cache_delete(sctx, nce);
+		kfree(nce);
+	}
+}
+
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode is not existent or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
+static int __get_cur_name_and_parent(struct send_ctx *sctx,
+				     u64 ino, u64 gen,
+				     u64 *parent_ino,
+				     u64 *parent_gen,
+				     struct fs_path *dest)
+{
+	int ret;
+	int nce_ret;
+	struct name_cache_entry *nce = NULL;
+
+	/*
+	 * First check if we already did a call to this function with the same
+	 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
+	 * return the cached result.
+	 */
+	nce = name_cache_search(sctx, ino, gen);
+	if (nce) {
+		if (ino < sctx->send_progress && nce->need_later_update) {
+			name_cache_delete(sctx, nce);
+			kfree(nce);
+			nce = NULL;
+		} else {
+			name_cache_used(sctx, nce);
+			*parent_ino = nce->parent_ino;
+			*parent_gen = nce->parent_gen;
+			ret = fs_path_add(dest, nce->name, nce->name_len);
+			if (ret < 0)
+				goto out;
+			ret = nce->ret;
+			goto out;
+		}
+	}
+
+	/*
+	 * If the inode is not existent yet, add the orphan name and return 1.
+	 * This should only happen for the parent dir that we determine in
+	 * __record_new_ref
+	 */
+	ret = is_inode_existent(sctx, ino, gen);
+	if (ret < 0)
+		goto out;
+
+	if (!ret) {
+		ret = gen_unique_name(sctx, ino, gen, dest);
+		if (ret < 0)
+			goto out;
+		ret = 1;
+		goto out_cache;
+	}
+
+	/*
+	 * Depending on whether the inode was already processed or not, use
+	 * send_root or parent_root for ref lookup.
+	 */
+	if (ino < sctx->send_progress)
+		ret = get_first_ref(sctx->send_root, ino,
+				    parent_ino, parent_gen, dest);
+	else
+		ret = get_first_ref(sctx->parent_root, ino,
+				    parent_ino, parent_gen, dest);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * Check if the ref was overwritten by an inode's ref that was processed
+	 * earlier. If yes, treat as orphan and return 1.
+	 */
+	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
+			dest->start, dest->end - dest->start);
+	if (ret < 0)
+		goto out;
+	if (ret) {
+		fs_path_reset(dest);
+		ret = gen_unique_name(sctx, ino, gen, dest);
+		if (ret < 0)
+			goto out;
+		ret = 1;
+	}
+
+out_cache:
+	/*
+	 * Store the result of the lookup in the name cache.
+	 */
+	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
+	if (!nce) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	nce->ino = ino;
+	nce->gen = gen;
+	nce->parent_ino = *parent_ino;
+	nce->parent_gen = *parent_gen;
+	nce->name_len = fs_path_len(dest);
+	nce->ret = ret;
+	strcpy(nce->name, dest->start);
+
+	if (ino < sctx->send_progress)
+		nce->need_later_update = 0;
+	else
+		nce->need_later_update = 1;
+
+	nce_ret = name_cache_insert(sctx, nce);
+	if (nce_ret < 0)
+		ret = nce_ret;
+	name_cache_clean_unused(sctx);
+
+out:
+	return ret;
+}
+
+/*
+ * Magic happens here. This function returns the first ref to an inode as it
+ * would look like while receiving the stream at this point in time.
+ * We walk the path up to the root. For every inode in between, we check if it
+ * was already processed/sent. If yes, we continue with the parent as found
+ * in send_root. If not, we continue with the parent as found in parent_root.
+ * If we encounter an inode that was deleted at this point in time, we use the
+ * inodes "orphan" name instead of the real name and stop. Same with new inodes
+ * that were not created yet and overwritten inodes/refs.
+ *
+ * When do we have have orphan inodes:
+ * 1. When an inode is freshly created and thus no valid refs are available yet
+ * 2. When a directory lost all it's refs (deleted) but still has dir items
+ *    inside which were not processed yet (pending for move/delete). If anyone
+ *    tried to get the path to the dir items, it would get a path inside that
+ *    orphan directory.
+ * 3. When an inode is moved around or gets new links, it may overwrite the ref
+ *    of an unprocessed inode. If in that case the first ref would be
+ *    overwritten, the overwritten inode gets "orphanized". Later when we
+ *    process this overwritten inode, it is restored at a new place by moving
+ *    the orphan inode.
+ *
+ * sctx->send_progress tells this function at which point in time receiving
+ * would be.
+ */
+static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
+			struct fs_path *dest)
+{
+	int ret = 0;
+	struct fs_path *name = NULL;
+	u64 parent_inode = 0;
+	u64 parent_gen = 0;
+	int stop = 0;
+
+	name = fs_path_alloc();
+	if (!name) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	dest->reversed = 1;
+	fs_path_reset(dest);
+
+	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+		fs_path_reset(name);
+
+		if (is_waiting_for_rm(sctx, ino)) {
+			ret = gen_unique_name(sctx, ino, gen, name);
+			if (ret < 0)
+				goto out;
+			ret = fs_path_add_path(dest, name);
+			break;
+		}
+
+		if (is_waiting_for_move(sctx, ino)) {
+			ret = get_first_ref(sctx->parent_root, ino,
+					    &parent_inode, &parent_gen, name);
+		} else {
+			ret = __get_cur_name_and_parent(sctx, ino, gen,
+							&parent_inode,
+							&parent_gen, name);
+			if (ret)
+				stop = 1;
+		}
+
+		if (ret < 0)
+			goto out;
+
+		ret = fs_path_add_path(dest, name);
+		if (ret < 0)
+			goto out;
+
+		ino = parent_inode;
+		gen = parent_gen;
+	}
+
+out:
+	fs_path_free(name);
+	if (!ret)
+		fs_path_unreverse(dest);
+	return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
+ */
+static int send_subvol_begin(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_root *send_root = sctx->send_root;
+	struct btrfs_root *parent_root = sctx->parent_root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
+	char *name = NULL;
+	int namelen;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
+	if (!name) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	key.objectid = send_root->objectid;
+	key.type = BTRFS_ROOT_BACKREF_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
+				&key, path, 1, 0);
+	if (ret < 0)
+		goto out;
+	if (ret) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	if (key.type != BTRFS_ROOT_BACKREF_KEY ||
+	    key.objectid != send_root->objectid) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+	namelen = btrfs_root_ref_name_len(leaf, ref);
+	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
+	btrfs_release_path(path);
+
+	if (parent_root) {
+		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
+		if (ret < 0)
+			goto out;
+	} else {
+		ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
+		if (ret < 0)
+			goto out;
+	}
+
+	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
+	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+			sctx->send_root->root_item.uuid);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
+		    le64_to_cpu(sctx->send_root->root_item.ctransid));
+	if (parent_root) {
+		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+				sctx->parent_root->root_item.uuid);
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+			    le64_to_cpu(sctx->parent_root->root_item.ctransid));
+	}
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	btrfs_free_path(path);
+	kfree(name);
+	return ret;
+}
+
+static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
+{
+	int ret = 0;
+	struct fs_path *p;
+
+verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	return ret;
+}
+
+static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
+{
+	int ret = 0;
+	struct fs_path *p;
+
+verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	return ret;
+}
+
+static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
+{
+	int ret = 0;
+	struct fs_path *p;
+
+verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	return ret;
+}
+
+static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+	int ret = 0;
+	struct fs_path *p = NULL;
+	struct btrfs_inode_item *ii;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *eb;
+	struct btrfs_key key;
+	int slot;
+
+verbose_printk("btrfs: send_utimes %llu\n", ino);
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	path = alloc_path_for_send();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
+	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
+	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
+	/* TODO Add otime support when the otime patches get into upstream */
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
+ * a valid path yet because we did not process the refs yet. So, the inode
+ * is created as orphan.
+ */
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
+{
+	int ret = 0;
+	struct fs_path *p;
+	int cmd;
+	u64 gen;
+	u64 mode;
+	u64 rdev;
+
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	if (ino != sctx->cur_ino) {
+		ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+				     NULL, NULL, &rdev);
+		if (ret < 0)
+			goto out;
+	} else {
+		gen = sctx->cur_inode_gen;
+		mode = sctx->cur_inode_mode;
+		rdev = sctx->cur_inode_rdev;
+	}
+
+	if (S_ISREG(mode)) {
+		cmd = BTRFS_SEND_C_MKFILE;
+	} else if (S_ISDIR(mode)) {
+		cmd = BTRFS_SEND_C_MKDIR;
+	} else if (S_ISLNK(mode)) {
+		cmd = BTRFS_SEND_C_SYMLINK;
+	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
+		cmd = BTRFS_SEND_C_MKNOD;
+	} else if (S_ISFIFO(mode)) {
+		cmd = BTRFS_SEND_C_MKFIFO;
+	} else if (S_ISSOCK(mode)) {
+		cmd = BTRFS_SEND_C_MKSOCK;
+	} else {
+		printk(KERN_WARNING "btrfs: unexpected inode type %o",
+				(int)(mode & S_IFMT));
+		ret = -ENOTSUPP;
+		goto out;
+	}
+
+	ret = begin_cmd(sctx, cmd);
+	if (ret < 0)
+		goto out;
+
+	ret = gen_unique_name(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
+
+	if (S_ISLNK(mode)) {
+		fs_path_reset(p);
+		ret = read_symlink(sctx->send_root, ino, p);
+		if (ret < 0)
+			goto out;
+		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
+	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
+		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
+	}
+
+	ret = send_cmd(sctx);
+	if (ret < 0)
+		goto out;
+
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	return ret;
+}
+
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function does the check if we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key di_key;
+	struct extent_buffer *eb;
+	struct btrfs_dir_item *di;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(sctx->send_root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = 0;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+		if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
+		    di_key.objectid < sctx->send_progress) {
+			ret = 1;
+			goto out;
+		}
+
+		path->slots[0]++;
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory which was not created already due to out of order
+ *    directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+	int ret;
+
+	if (S_ISDIR(sctx->cur_inode_mode)) {
+		ret = did_create_dir(sctx, sctx->cur_ino);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	ret = send_create_inode(sctx, sctx->cur_ino);
+	if (ret < 0)
+		goto out;
+
+out:
+	return ret;
+}
+
+struct recorded_ref {
+	struct list_head list;
+	char *dir_path;
+	char *name;
+	struct fs_path *full_path;
+	u64 dir;
+	u64 dir_gen;
+	int dir_path_len;
+	int name_len;
+};
+
+/*
+ * We need to process new refs before deleted refs, but compare_tree gives us
+ * everything mixed. So we first record all refs and later process them.
+ * This function is a helper to record one ref.
+ */
+static int __record_ref(struct list_head *head, u64 dir,
+		      u64 dir_gen, struct fs_path *path)
+{
+	struct recorded_ref *ref;
+
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->dir = dir;
+	ref->dir_gen = dir_gen;
+	ref->full_path = path;
+
+	ref->name = (char *)kbasename(ref->full_path->start);
+	ref->name_len = ref->full_path->end - ref->name;
+	ref->dir_path = ref->full_path->start;
+	if (ref->name == ref->full_path->start)
+		ref->dir_path_len = 0;
+	else
+		ref->dir_path_len = ref->full_path->end -
+				ref->full_path->start - 1 - ref->name_len;
+
+	list_add_tail(&ref->list, head);
+	return 0;
+}
+
+static int dup_ref(struct recorded_ref *ref, struct list_head *list)
+{
+	struct recorded_ref *new;
+
+	new = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!new)
+		return -ENOMEM;
+
+	new->dir = ref->dir;
+	new->dir_gen = ref->dir_gen;
+	new->full_path = NULL;
+	INIT_LIST_HEAD(&new->list);
+	list_add_tail(&new->list, list);
+	return 0;
+}
+
+static void __free_recorded_refs(struct list_head *head)
+{
+	struct recorded_ref *cur;
+
+	while (!list_empty(head)) {
+		cur = list_entry(head->next, struct recorded_ref, list);
+		fs_path_free(cur->full_path);
+		list_del(&cur->list);
+		kfree(cur);
+	}
+}
+
+static void free_recorded_refs(struct send_ctx *sctx)
+{
+	__free_recorded_refs(&sctx->new_refs);
+	__free_recorded_refs(&sctx->deleted_refs);
+}
+
+/*
+ * Renames/moves a file/dir to its orphan name. Used when the first
+ * ref of an unprocessed inode gets overwritten and for all non empty
+ * directories.
+ */
+static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
+			  struct fs_path *path)
+{
+	int ret;
+	struct fs_path *orphan;
+
+	orphan = fs_path_alloc();
+	if (!orphan)
+		return -ENOMEM;
+
+	ret = gen_unique_name(sctx, ino, gen, orphan);
+	if (ret < 0)
+		goto out;
+
+	ret = send_rename(sctx, path, orphan);
+
+out:
+	fs_path_free(orphan);
+	return ret;
+}
+
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+	struct rb_node **p = &sctx->orphan_dirs.rb_node;
+	struct rb_node *parent = NULL;
+	struct orphan_dir_info *entry, *odi;
+
+	odi = kmalloc(sizeof(*odi), GFP_NOFS);
+	if (!odi)
+		return ERR_PTR(-ENOMEM);
+	odi->ino = dir_ino;
+	odi->gen = 0;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct orphan_dir_info, node);
+		if (dir_ino < entry->ino) {
+			p = &(*p)->rb_left;
+		} else if (dir_ino > entry->ino) {
+			p = &(*p)->rb_right;
+		} else {
+			kfree(odi);
+			return entry;
+		}
+	}
+
+	rb_link_node(&odi->node, parent, p);
+	rb_insert_color(&odi->node, &sctx->orphan_dirs);
+	return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+	struct rb_node *n = sctx->orphan_dirs.rb_node;
+	struct orphan_dir_info *entry;
+
+	while (n) {
+		entry = rb_entry(n, struct orphan_dir_info, node);
+		if (dir_ino < entry->ino)
+			n = n->rb_left;
+		else if (dir_ino > entry->ino)
+			n = n->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+	return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+				 struct orphan_dir_info *odi)
+{
+	if (!odi)
+		return;
+	rb_erase(&odi->node, &sctx->orphan_dirs);
+	kfree(odi);
+}
+
+/*
+ * Returns 1 if a directory can be removed at this point in time.
+ * We check this by iterating all dir items and checking if the inode behind
+ * the dir item was already processed.
+ */
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+		     u64 send_progress)
+{
+	int ret = 0;
+	struct btrfs_root *root = sctx->parent_root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key loc;
+	struct btrfs_dir_item *di;
+
+	/*
+	 * Don't try to rmdir the top/root subvolume dir.
+	 */
+	if (dir == BTRFS_FIRST_FREE_OBJECTID)
+		return 0;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		struct waiting_dir_move *dm;
+
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != key.type)
+			break;
+
+		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+
+		dm = get_waiting_dir_move(sctx, loc.objectid);
+		if (dm) {
+			struct orphan_dir_info *odi;
+
+			odi = add_orphan_dir_info(sctx, dir);
+			if (IS_ERR(odi)) {
+				ret = PTR_ERR(odi);
+				goto out;
+			}
+			odi->gen = dir_gen;
+			dm->rmdir_ino = dir;
+			ret = 0;
+			goto out;
+		}
+
+		if (loc.objectid > send_progress) {
+			ret = 0;
+			goto out;
+		}
+
+		path->slots[0]++;
+	}
+
+	ret = 1;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
+{
+	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
+
+	return entry != NULL;
+}
+
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
+	struct rb_node *parent = NULL;
+	struct waiting_dir_move *entry, *dm;
+
+	dm = kmalloc(sizeof(*dm), GFP_NOFS);
+	if (!dm)
+		return -ENOMEM;
+	dm->ino = ino;
+	dm->rmdir_ino = 0;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct waiting_dir_move, node);
+		if (ino < entry->ino) {
+			p = &(*p)->rb_left;
+		} else if (ino > entry->ino) {
+			p = &(*p)->rb_right;
+		} else {
+			kfree(dm);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&dm->node, parent, p);
+	rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
+	return 0;
+}
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+	struct rb_node *n = sctx->waiting_dir_moves.rb_node;
+	struct waiting_dir_move *entry;
+
+	while (n) {
+		entry = rb_entry(n, struct waiting_dir_move, node);
+		if (ino < entry->ino)
+			n = n->rb_left;
+		else if (ino > entry->ino)
+			n = n->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+static void free_waiting_dir_move(struct send_ctx *sctx,
+				  struct waiting_dir_move *dm)
+{
+	if (!dm)
+		return;
+	rb_erase(&dm->node, &sctx->waiting_dir_moves);
+	kfree(dm);
+}
+
+static int add_pending_dir_move(struct send_ctx *sctx,
+				u64 ino,
+				u64 ino_gen,
+				u64 parent_ino,
+				struct list_head *new_refs,
+				struct list_head *deleted_refs,
+				const bool is_orphan)
+{
+	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
+	struct rb_node *parent = NULL;
+	struct pending_dir_move *entry = NULL, *pm;
+	struct recorded_ref *cur;
+	int exists = 0;
+	int ret;
+
+	pm = kmalloc(sizeof(*pm), GFP_NOFS);
+	if (!pm)
+		return -ENOMEM;
+	pm->parent_ino = parent_ino;
+	pm->ino = ino;
+	pm->gen = ino_gen;
+	pm->is_orphan = is_orphan;
+	INIT_LIST_HEAD(&pm->list);
+	INIT_LIST_HEAD(&pm->update_refs);
+	RB_CLEAR_NODE(&pm->node);
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct pending_dir_move, node);
+		if (parent_ino < entry->parent_ino) {
+			p = &(*p)->rb_left;
+		} else if (parent_ino > entry->parent_ino) {
+			p = &(*p)->rb_right;
+		} else {
+			exists = 1;
+			break;
+		}
+	}
+
+	list_for_each_entry(cur, deleted_refs, list) {
+		ret = dup_ref(cur, &pm->update_refs);
+		if (ret < 0)
+			goto out;
+	}
+	list_for_each_entry(cur, new_refs, list) {
+		ret = dup_ref(cur, &pm->update_refs);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = add_waiting_dir_move(sctx, pm->ino);
+	if (ret)
+		goto out;
+
+	if (exists) {
+		list_add_tail(&pm->list, &entry->list);
+	} else {
+		rb_link_node(&pm->node, parent, p);
+		rb_insert_color(&pm->node, &sctx->pending_dir_moves);
+	}
+	ret = 0;
+out:
+	if (ret) {
+		__free_recorded_refs(&pm->update_refs);
+		kfree(pm);
+	}
+	return ret;
+}
+
+static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
+						      u64 parent_ino)
+{
+	struct rb_node *n = sctx->pending_dir_moves.rb_node;
+	struct pending_dir_move *entry;
+
+	while (n) {
+		entry = rb_entry(n, struct pending_dir_move, node);
+		if (parent_ino < entry->parent_ino)
+			n = n->rb_left;
+		else if (parent_ino > entry->parent_ino)
+			n = n->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
+{
+	struct fs_path *from_path = NULL;
+	struct fs_path *to_path = NULL;
+	struct fs_path *name = NULL;
+	u64 orig_progress = sctx->send_progress;
+	struct recorded_ref *cur;
+	u64 parent_ino, parent_gen;
+	struct waiting_dir_move *dm = NULL;
+	u64 rmdir_ino = 0;
+	int ret;
+
+	name = fs_path_alloc();
+	from_path = fs_path_alloc();
+	if (!name || !from_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	dm = get_waiting_dir_move(sctx, pm->ino);
+	ASSERT(dm);
+	rmdir_ino = dm->rmdir_ino;
+	free_waiting_dir_move(sctx, dm);
+
+	if (pm->is_orphan) {
+		ret = gen_unique_name(sctx, pm->ino,
+				      pm->gen, from_path);
+	} else {
+		ret = get_first_ref(sctx->parent_root, pm->ino,
+				    &parent_ino, &parent_gen, name);
+		if (ret < 0)
+			goto out;
+		ret = get_cur_path(sctx, parent_ino, parent_gen,
+				   from_path);
+		if (ret < 0)
+			goto out;
+		ret = fs_path_add_path(from_path, name);
+	}
+	if (ret < 0)
+		goto out;
+
+	sctx->send_progress = sctx->cur_ino + 1;
+	fs_path_reset(name);
+	to_path = name;
+	name = NULL;
+	ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
+	if (ret < 0)
+		goto out;
+
+	ret = send_rename(sctx, from_path, to_path);
+	if (ret < 0)
+		goto out;
+
+	if (rmdir_ino) {
+		struct orphan_dir_info *odi;
+
+		odi = get_orphan_dir_info(sctx, rmdir_ino);
+		if (!odi) {
+			/* already deleted */
+			goto finish;
+		}
+		ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+		if (ret < 0)
+			goto out;
+		if (!ret)
+			goto finish;
+
+		name = fs_path_alloc();
+		if (!name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+		if (ret < 0)
+			goto out;
+		ret = send_rmdir(sctx, name);
+		if (ret < 0)
+			goto out;
+		free_orphan_dir_info(sctx, odi);
+	}
+
+finish:
+	ret = send_utimes(sctx, pm->ino, pm->gen);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * After rename/move, need to update the utimes of both new parent(s)
+	 * and old parent(s).
+	 */
+	list_for_each_entry(cur, &pm->update_refs, list) {
+		if (cur->dir == rmdir_ino)
+			continue;
+		ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	fs_path_free(name);
+	fs_path_free(from_path);
+	fs_path_free(to_path);
+	sctx->send_progress = orig_progress;
+
+	return ret;
+}
+
+static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
+{
+	if (!list_empty(&m->list))
+		list_del(&m->list);
+	if (!RB_EMPTY_NODE(&m->node))
+		rb_erase(&m->node, &sctx->pending_dir_moves);
+	__free_recorded_refs(&m->update_refs);
+	kfree(m);
+}
+
+static void tail_append_pending_moves(struct pending_dir_move *moves,
+				      struct list_head *stack)
+{
+	if (list_empty(&moves->list)) {
+		list_add_tail(&moves->list, stack);
+	} else {
+		LIST_HEAD(list);
+		list_splice_init(&moves->list, &list);
+		list_add_tail(&moves->list, stack);
+		list_splice_tail(&list, stack);
+	}
+}
+
+static int apply_children_dir_moves(struct send_ctx *sctx)
+{
+	struct pending_dir_move *pm;
+	struct list_head stack;
+	u64 parent_ino = sctx->cur_ino;
+	int ret = 0;
+
+	pm = get_pending_dir_moves(sctx, parent_ino);
+	if (!pm)
+		return 0;
+
+	INIT_LIST_HEAD(&stack);
+	tail_append_pending_moves(pm, &stack);
+
+	while (!list_empty(&stack)) {
+		pm = list_first_entry(&stack, struct pending_dir_move, list);
+		parent_ino = pm->ino;
+		ret = apply_dir_move(sctx, pm);
+		free_pending_move(sctx, pm);
+		if (ret)
+			goto out;
+		pm = get_pending_dir_moves(sctx, parent_ino);
+		if (pm)
+			tail_append_pending_moves(pm, &stack);
+	}
+	return 0;
+
+out:
+	while (!list_empty(&stack)) {
+		pm = list_first_entry(&stack, struct pending_dir_move, list);
+		free_pending_move(sctx, pm);
+	}
+	return ret;
+}
+
+/*
+ * We might need to delay a directory rename even when no ancestor directory
+ * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
+ * renamed. This happens when we rename a directory to the old name (the name
+ * in the parent root) of some other unrelated directory that got its rename
+ * delayed due to some ancestor with higher number that got renamed.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ * .                                       (ino 256)
+ * |---- a/                                (ino 257)
+ * |     |---- file                        (ino 260)
+ * |
+ * |---- b/                                (ino 258)
+ * |---- c/                                (ino 259)
+ *
+ * Send snapshot:
+ * .                                       (ino 256)
+ * |---- a/                                (ino 258)
+ * |---- x/                                (ino 259)
+ *       |---- y/                          (ino 257)
+ *             |----- file                 (ino 260)
+ *
+ * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
+ * from 'a' to 'x/y' happening first, which in turn depends on the rename of
+ * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
+ * must issue is:
+ *
+ * 1 - rename 259 from 'c' to 'x'
+ * 2 - rename 257 from 'a' to 'x/y'
+ * 3 - rename 258 from 'b' to 'a'
+ *
+ * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
+ * be done right away and < 0 on error.
+ */
+static int wait_for_dest_dir_move(struct send_ctx *sctx,
+				  struct recorded_ref *parent_ref,
+				  const bool is_orphan)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key di_key;
+	struct btrfs_dir_item *di;
+	u64 left_gen;
+	u64 right_gen;
+	int ret = 0;
+
+	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
+		return 0;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = parent_ref->dir;
+	key.type = BTRFS_DIR_ITEM_KEY;
+	key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
+
+	ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+
+	di = btrfs_match_dir_item_name(sctx->parent_root, path,
+				       parent_ref->name, parent_ref->name_len);
+	if (!di) {
+		ret = 0;
+		goto out;
+	}
+	/*
+	 * di_key.objectid has the number of the inode that has a dentry in the
+	 * parent directory with the same name that sctx->cur_ino is being
+	 * renamed to. We need to check if that inode is in the send root as
+	 * well and if it is currently marked as an inode with a pending rename,
+	 * if it is, we need to delay the rename of sctx->cur_ino as well, so
+	 * that it happens after that other inode is renamed.
+	 */
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
+	if (di_key.type != BTRFS_INODE_ITEM_KEY) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
+			     &left_gen, NULL, NULL, NULL, NULL);
+	if (ret < 0)
+		goto out;
+	ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
+			     &right_gen, NULL, NULL, NULL, NULL);
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = 0;
+		goto out;
+	}
+
+	/* Different inode, no need to delay the rename of sctx->cur_ino */
+	if (right_gen != left_gen) {
+		ret = 0;
+		goto out;
+	}
+
+	if (is_waiting_for_move(sctx, di_key.objectid)) {
+		ret = add_pending_dir_move(sctx,
+					   sctx->cur_ino,
+					   sctx->cur_inode_gen,
+					   di_key.objectid,
+					   &sctx->new_refs,
+					   &sctx->deleted_refs,
+					   is_orphan);
+		if (!ret)
+			ret = 1;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int wait_for_parent_move(struct send_ctx *sctx,
+				struct recorded_ref *parent_ref)
+{
+	int ret = 0;
+	u64 ino = parent_ref->dir;
+	u64 parent_ino_before, parent_ino_after;
+	struct fs_path *path_before = NULL;
+	struct fs_path *path_after = NULL;
+	int len1, len2;
+
+	path_after = fs_path_alloc();
+	path_before = fs_path_alloc();
+	if (!path_after || !path_before) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * Our current directory inode may not yet be renamed/moved because some
+	 * ancestor (immediate or not) has to be renamed/moved first. So find if
+	 * such ancestor exists and make sure our own rename/move happens after
+	 * that ancestor is processed.
+	 */
+	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+		if (is_waiting_for_move(sctx, ino)) {
+			ret = 1;
+			break;
+		}
+
+		fs_path_reset(path_before);
+		fs_path_reset(path_after);
+
+		ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+				    NULL, path_after);
+		if (ret < 0)
+			goto out;
+		ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+				    NULL, path_before);
+		if (ret < 0 && ret != -ENOENT) {
+			goto out;
+		} else if (ret == -ENOENT) {
+			ret = 0;
+			break;
+		}
+
+		len1 = fs_path_len(path_before);
+		len2 = fs_path_len(path_after);
+		if (ino > sctx->cur_ino &&
+		    (parent_ino_before != parent_ino_after || len1 != len2 ||
+		     memcmp(path_before->start, path_after->start, len1))) {
+			ret = 1;
+			break;
+		}
+		ino = parent_ino_after;
+	}
+
+out:
+	fs_path_free(path_before);
+	fs_path_free(path_after);
+
+	if (ret == 1) {
+		ret = add_pending_dir_move(sctx,
+					   sctx->cur_ino,
+					   sctx->cur_inode_gen,
+					   ino,
+					   &sctx->new_refs,
+					   &sctx->deleted_refs,
+					   false);
+		if (!ret)
+			ret = 1;
+	}
+
+	return ret;
+}
+
+/*
+ * This does all the move/link/unlink/rmdir magic.
+ */
+static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
+{
+	int ret = 0;
+	struct recorded_ref *cur;
+	struct recorded_ref *cur2;
+	struct list_head check_dirs;
+	struct fs_path *valid_path = NULL;
+	u64 ow_inode = 0;
+	u64 ow_gen;
+	int did_overwrite = 0;
+	int is_orphan = 0;
+	u64 last_dir_ino_rm = 0;
+	bool can_rename = true;
+
+verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
+
+	/*
+	 * This should never happen as the root dir always has the same ref
+	 * which is always '..'
+	 */
+	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+	INIT_LIST_HEAD(&check_dirs);
+
+	valid_path = fs_path_alloc();
+	if (!valid_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * First, check if the first ref of the current inode was overwritten
+	 * before. If yes, we know that the current inode was already orphanized
+	 * and thus use the orphan name. If not, we can use get_cur_path to
+	 * get the path of the first ref as it would like while receiving at
+	 * this point in time.
+	 * New inodes are always orphan at the beginning, so force to use the
+	 * orphan name in this case.
+	 * The first ref is stored in valid_path and will be updated if it
+	 * gets moved around.
+	 */
+	if (!sctx->cur_inode_new) {
+		ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
+				sctx->cur_inode_gen);
+		if (ret < 0)
+			goto out;
+		if (ret)
+			did_overwrite = 1;
+	}
+	if (sctx->cur_inode_new || did_overwrite) {
+		ret = gen_unique_name(sctx, sctx->cur_ino,
+				sctx->cur_inode_gen, valid_path);
+		if (ret < 0)
+			goto out;
+		is_orphan = 1;
+	} else {
+		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				valid_path);
+		if (ret < 0)
+			goto out;
+	}
+
+	list_for_each_entry(cur, &sctx->new_refs, list) {
+		/*
+		 * We may have refs where the parent directory does not exist
+		 * yet. This happens if the parent directories inum is higher
+		 * the the current inum. To handle this case, we create the
+		 * parent directory out of order. But we need to check if this
+		 * did already happen before due to other refs in the same dir.
+		 */
+		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+		if (ret < 0)
+			goto out;
+		if (ret == inode_state_will_create) {
+			ret = 0;
+			/*
+			 * First check if any of the current inodes refs did
+			 * already create the dir.
+			 */
+			list_for_each_entry(cur2, &sctx->new_refs, list) {
+				if (cur == cur2)
+					break;
+				if (cur2->dir == cur->dir) {
+					ret = 1;
+					break;
+				}
+			}
+
+			/*
+			 * If that did not happen, check if a previous inode
+			 * did already create the dir.
+			 */
+			if (!ret)
+				ret = did_create_dir(sctx, cur->dir);
+			if (ret < 0)
+				goto out;
+			if (!ret) {
+				ret = send_create_inode(sctx, cur->dir);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
+		/*
+		 * Check if this new ref would overwrite the first ref of
+		 * another unprocessed inode. If yes, orphanize the
+		 * overwritten inode. If we find an overwritten ref that is
+		 * not the first ref, simply unlink it.
+		 */
+		ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+				cur->name, cur->name_len,
+				&ow_inode, &ow_gen);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = is_first_ref(sctx->parent_root,
+					   ow_inode, cur->dir, cur->name,
+					   cur->name_len);
+			if (ret < 0)
+				goto out;
+			if (ret) {
+				struct name_cache_entry *nce;
+
+				ret = orphanize_inode(sctx, ow_inode, ow_gen,
+						cur->full_path);
+				if (ret < 0)
+					goto out;
+				/*
+				 * Make sure we clear our orphanized inode's
+				 * name from the name cache. This is because the
+				 * inode ow_inode might be an ancestor of some
+				 * other inode that will be orphanized as well
+				 * later and has an inode number greater than
+				 * sctx->send_progress. We need to prevent
+				 * future name lookups from using the old name
+				 * and get instead the orphan name.
+				 */
+				nce = name_cache_search(sctx, ow_inode, ow_gen);
+				if (nce) {
+					name_cache_delete(sctx, nce);
+					kfree(nce);
+				}
+			} else {
+				ret = send_unlink(sctx, cur->full_path);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
+		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
+			ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
+			if (ret < 0)
+				goto out;
+			if (ret == 1) {
+				can_rename = false;
+				*pending_move = 1;
+			}
+		}
+
+		/*
+		 * link/move the ref to the new place. If we have an orphan
+		 * inode, move it and update valid_path. If not, link or move
+		 * it depending on the inode mode.
+		 */
+		if (is_orphan && can_rename) {
+			ret = send_rename(sctx, valid_path, cur->full_path);
+			if (ret < 0)
+				goto out;
+			is_orphan = 0;
+			ret = fs_path_copy(valid_path, cur->full_path);
+			if (ret < 0)
+				goto out;
+		} else if (can_rename) {
+			if (S_ISDIR(sctx->cur_inode_mode)) {
+				/*
+				 * Dirs can't be linked, so move it. For moved
+				 * dirs, we always have one new and one deleted
+				 * ref. The deleted ref is ignored later.
+				 */
+				ret = wait_for_parent_move(sctx, cur);
+				if (ret < 0)
+					goto out;
+				if (ret) {
+					*pending_move = 1;
+				} else {
+					ret = send_rename(sctx, valid_path,
+							  cur->full_path);
+					if (!ret)
+						ret = fs_path_copy(valid_path,
+							       cur->full_path);
+				}
+				if (ret < 0)
+					goto out;
+			} else {
+				ret = send_link(sctx, cur->full_path,
+						valid_path);
+				if (ret < 0)
+					goto out;
+			}
+		}
+		ret = dup_ref(cur, &check_dirs);
+		if (ret < 0)
+			goto out;
+	}
+
+	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
+		/*
+		 * Check if we can already rmdir the directory. If not,
+		 * orphanize it. For every dir item inside that gets deleted
+		 * later, we do this check again and rmdir it then if possible.
+		 * See the use of check_dirs for more details.
+		 */
+		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				sctx->cur_ino);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = send_rmdir(sctx, valid_path);
+			if (ret < 0)
+				goto out;
+		} else if (!is_orphan) {
+			ret = orphanize_inode(sctx, sctx->cur_ino,
+					sctx->cur_inode_gen, valid_path);
+			if (ret < 0)
+				goto out;
+			is_orphan = 1;
+		}
+
+		list_for_each_entry(cur, &sctx->deleted_refs, list) {
+			ret = dup_ref(cur, &check_dirs);
+			if (ret < 0)
+				goto out;
+		}
+	} else if (S_ISDIR(sctx->cur_inode_mode) &&
+		   !list_empty(&sctx->deleted_refs)) {
+		/*
+		 * We have a moved dir. Add the old parent to check_dirs
+		 */
+		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+				list);
+		ret = dup_ref(cur, &check_dirs);
+		if (ret < 0)
+			goto out;
+	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
+		/*
+		 * We have a non dir inode. Go through all deleted refs and
+		 * unlink them if they were not already overwritten by other
+		 * inodes.
+		 */
+		list_for_each_entry(cur, &sctx->deleted_refs, list) {
+			ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+					sctx->cur_ino, sctx->cur_inode_gen,
+					cur->name, cur->name_len);
+			if (ret < 0)
+				goto out;
+			if (!ret) {
+				ret = send_unlink(sctx, cur->full_path);
+				if (ret < 0)
+					goto out;
+			}
+			ret = dup_ref(cur, &check_dirs);
+			if (ret < 0)
+				goto out;
+		}
+		/*
+		 * If the inode is still orphan, unlink the orphan. This may
+		 * happen when a previous inode did overwrite the first ref
+		 * of this inode and no new refs were added for the current
+		 * inode. Unlinking does not mean that the inode is deleted in
+		 * all cases. There may still be links to this inode in other
+		 * places.
+		 */
+		if (is_orphan) {
+			ret = send_unlink(sctx, valid_path);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+	/*
+	 * We did collect all parent dirs where cur_inode was once located. We
+	 * now go through all these dirs and check if they are pending for
+	 * deletion and if it's finally possible to perform the rmdir now.
+	 * We also update the inode stats of the parent dirs here.
+	 */
+	list_for_each_entry(cur, &check_dirs, list) {
+		/*
+		 * In case we had refs into dirs that were not processed yet,
+		 * we don't need to do the utime and rmdir logic for these dirs.
+		 * The dir will be processed later.
+		 */
+		if (cur->dir > sctx->cur_ino)
+			continue;
+
+		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+		if (ret < 0)
+			goto out;
+
+		if (ret == inode_state_did_create ||
+		    ret == inode_state_no_change) {
+			/* TODO delayed utimes */
+			ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+			if (ret < 0)
+				goto out;
+		} else if (ret == inode_state_did_delete &&
+			   cur->dir != last_dir_ino_rm) {
+			ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+					sctx->cur_ino);
+			if (ret < 0)
+				goto out;
+			if (ret) {
+				ret = get_cur_path(sctx, cur->dir,
+						   cur->dir_gen, valid_path);
+				if (ret < 0)
+					goto out;
+				ret = send_rmdir(sctx, valid_path);
+				if (ret < 0)
+					goto out;
+				last_dir_ino_rm = cur->dir;
+			}
+		}
+	}
+
+	ret = 0;
+
+out:
+	__free_recorded_refs(&check_dirs);
+	free_recorded_refs(sctx);
+	fs_path_free(valid_path);
+	return ret;
+}
+
+static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
+		      struct fs_path *name, void *ctx, struct list_head *refs)
+{
+	int ret = 0;
+	struct send_ctx *sctx = ctx;
+	struct fs_path *p;
+	u64 gen;
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
+			NULL, NULL);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, dir, gen, p);
+	if (ret < 0)
+		goto out;
+	ret = fs_path_add_path(p, name);
+	if (ret < 0)
+		goto out;
+
+	ret = __record_ref(refs, dir, gen, p);
+
+out:
+	if (ret)
+		fs_path_free(p);
+	return ret;
+}
+
+static int __record_new_ref(int num, u64 dir, int index,
+			    struct fs_path *name,
+			    void *ctx)
+{
+	struct send_ctx *sctx = ctx;
+	return record_ref(sctx->send_root, num, dir, index, name,
+			  ctx, &sctx->new_refs);
+}
+
+
+static int __record_deleted_ref(int num, u64 dir, int index,
+				struct fs_path *name,
+				void *ctx)
+{
+	struct send_ctx *sctx = ctx;
+	return record_ref(sctx->parent_root, num, dir, index, name,
+			  ctx, &sctx->deleted_refs);
+}
+
+static int record_new_ref(struct send_ctx *sctx)
+{
+	int ret;
+
+	ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+				sctx->cmp_key, 0, __record_new_ref, sctx);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+
+out:
+	return ret;
+}
+
+static int record_deleted_ref(struct send_ctx *sctx)
+{
+	int ret;
+
+	ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+				sctx->cmp_key, 0, __record_deleted_ref, sctx);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+
+out:
+	return ret;
+}
+
+struct find_ref_ctx {
+	u64 dir;
+	u64 dir_gen;
+	struct btrfs_root *root;
+	struct fs_path *name;
+	int found_idx;
+};
+
+static int __find_iref(int num, u64 dir, int index,
+		       struct fs_path *name,
+		       void *ctx_)
+{
+	struct find_ref_ctx *ctx = ctx_;
+	u64 dir_gen;
+	int ret;
+
+	if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
+	    strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
+		/*
+		 * To avoid doing extra lookups we'll only do this if everything
+		 * else matches.
+		 */
+		ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
+				     NULL, NULL, NULL);
+		if (ret)
+			return ret;
+		if (dir_gen != ctx->dir_gen)
+			return 0;
+		ctx->found_idx = num;
+		return 1;
+	}
+	return 0;
+}
+
+static int find_iref(struct btrfs_root *root,
+		     struct btrfs_path *path,
+		     struct btrfs_key *key,
+		     u64 dir, u64 dir_gen, struct fs_path *name)
+{
+	int ret;
+	struct find_ref_ctx ctx;
+
+	ctx.dir = dir;
+	ctx.name = name;
+	ctx.dir_gen = dir_gen;
+	ctx.found_idx = -1;
+	ctx.root = root;
+
+	ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
+	if (ret < 0)
+		return ret;
+
+	if (ctx.found_idx == -1)
+		return -ENOENT;
+
+	return ctx.found_idx;
+}
+
+static int __record_changed_new_ref(int num, u64 dir, int index,
+				    struct fs_path *name,
+				    void *ctx)
+{
+	u64 dir_gen;
+	int ret;
+	struct send_ctx *sctx = ctx;
+
+	ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
+			     NULL, NULL, NULL);
+	if (ret)
+		return ret;
+
+	ret = find_iref(sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, dir, dir_gen, name);
+	if (ret == -ENOENT)
+		ret = __record_new_ref(num, dir, index, name, sctx);
+	else if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+static int __record_changed_deleted_ref(int num, u64 dir, int index,
+					struct fs_path *name,
+					void *ctx)
+{
+	u64 dir_gen;
+	int ret;
+	struct send_ctx *sctx = ctx;
+
+	ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
+			     NULL, NULL, NULL);
+	if (ret)
+		return ret;
+
+	ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+			dir, dir_gen, name);
+	if (ret == -ENOENT)
+		ret = __record_deleted_ref(num, dir, index, name, sctx);
+	else if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+static int record_changed_ref(struct send_ctx *sctx)
+{
+	int ret = 0;
+
+	ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+			sctx->cmp_key, 0, __record_changed_new_ref, sctx);
+	if (ret < 0)
+		goto out;
+	ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+
+out:
+	return ret;
+}
+
+/*
+ * Record and process all refs at once. Needed when an inode changes the
+ * generation number, which means that it was deleted and recreated.
+ */
+static int process_all_refs(struct send_ctx *sctx,
+			    enum btrfs_compare_tree_result cmd)
+{
+	int ret;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	int slot;
+	iterate_inode_ref_t cb;
+	int pending_move = 0;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	if (cmd == BTRFS_COMPARE_TREE_NEW) {
+		root = sctx->send_root;
+		cb = __record_new_ref;
+	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
+		root = sctx->parent_root;
+		cb = __record_deleted_ref;
+	} else {
+		btrfs_err(sctx->send_root->fs_info,
+				"Wrong command %d in process_all_refs", cmd);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	key.objectid = sctx->cmp_key->objectid;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+		if (found_key.objectid != key.objectid ||
+		    (found_key.type != BTRFS_INODE_REF_KEY &&
+		     found_key.type != BTRFS_INODE_EXTREF_KEY))
+			break;
+
+		ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
+		if (ret < 0)
+			goto out;
+
+		path->slots[0]++;
+	}
+	btrfs_release_path(path);
+
+	ret = process_recorded_refs(sctx, &pending_move);
+	/* Only applicable to an incremental send. */
+	ASSERT(pending_move == 0);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int send_set_xattr(struct send_ctx *sctx,
+			  struct fs_path *path,
+			  const char *name, int name_len,
+			  const char *data, int data_len)
+{
+	int ret = 0;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	return ret;
+}
+
+static int send_remove_xattr(struct send_ctx *sctx,
+			  struct fs_path *path,
+			  const char *name, int name_len)
+{
+	int ret = 0;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	return ret;
+}
+
+static int __process_new_xattr(int num, struct btrfs_key *di_key,
+			       const char *name, int name_len,
+			       const char *data, int data_len,
+			       u8 type, void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+	struct fs_path *p;
+	posix_acl_xattr_header dummy_acl;
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	/*
+	 * This hack is needed because empty acl's are stored as zero byte
+	 * data in xattrs. Problem with that is, that receiving these zero byte
+	 * acl's will fail later. To fix this, we send a dummy acl list that
+	 * only contains the version number and no entries.
+	 */
+	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
+	    !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
+		if (data_len == 0) {
+			dummy_acl.a_version =
+					cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+			data = (char *)&dummy_acl;
+			data_len = sizeof(dummy_acl);
+		}
+	}
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
+
+out:
+	fs_path_free(p);
+	return ret;
+}
+
+static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
+				   const char *name, int name_len,
+				   const char *data, int data_len,
+				   u8 type, void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+	struct fs_path *p;
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	ret = send_remove_xattr(sctx, p, name, name_len);
+
+out:
+	fs_path_free(p);
+	return ret;
+}
+
+static int process_new_xattr(struct send_ctx *sctx)
+{
+	int ret = 0;
+
+	ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+			       sctx->cmp_key, __process_new_xattr, sctx);
+
+	return ret;
+}
+
+static int process_deleted_xattr(struct send_ctx *sctx)
+{
+	int ret;
+
+	ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+			       sctx->cmp_key, __process_deleted_xattr, sctx);
+
+	return ret;
+}
+
+struct find_xattr_ctx {
+	const char *name;
+	int name_len;
+	int found_idx;
+	char *found_data;
+	int found_data_len;
+};
+
+static int __find_xattr(int num, struct btrfs_key *di_key,
+			const char *name, int name_len,
+			const char *data, int data_len,
+			u8 type, void *vctx)
+{
+	struct find_xattr_ctx *ctx = vctx;
+
+	if (name_len == ctx->name_len &&
+	    strncmp(name, ctx->name, name_len) == 0) {
+		ctx->found_idx = num;
+		ctx->found_data_len = data_len;
+		ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
+		if (!ctx->found_data)
+			return -ENOMEM;
+		return 1;
+	}
+	return 0;
+}
+
+static int find_xattr(struct btrfs_root *root,
+		      struct btrfs_path *path,
+		      struct btrfs_key *key,
+		      const char *name, int name_len,
+		      char **data, int *data_len)
+{
+	int ret;
+	struct find_xattr_ctx ctx;
+
+	ctx.name = name;
+	ctx.name_len = name_len;
+	ctx.found_idx = -1;
+	ctx.found_data = NULL;
+	ctx.found_data_len = 0;
+
+	ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
+	if (ret < 0)
+		return ret;
+
+	if (ctx.found_idx == -1)
+		return -ENOENT;
+	if (data) {
+		*data = ctx.found_data;
+		*data_len = ctx.found_data_len;
+	} else {
+		kfree(ctx.found_data);
+	}
+	return ctx.found_idx;
+}
+
+
+static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
+				       const char *name, int name_len,
+				       const char *data, int data_len,
+				       u8 type, void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+	char *found_data = NULL;
+	int found_data_len  = 0;
+
+	ret = find_xattr(sctx->parent_root, sctx->right_path,
+			 sctx->cmp_key, name, name_len, &found_data,
+			 &found_data_len);
+	if (ret == -ENOENT) {
+		ret = __process_new_xattr(num, di_key, name, name_len, data,
+				data_len, type, ctx);
+	} else if (ret >= 0) {
+		if (data_len != found_data_len ||
+		    memcmp(data, found_data, data_len)) {
+			ret = __process_new_xattr(num, di_key, name, name_len,
+					data, data_len, type, ctx);
+		} else {
+			ret = 0;
+		}
+	}
+
+	kfree(found_data);
+	return ret;
+}
+
+static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
+					   const char *name, int name_len,
+					   const char *data, int data_len,
+					   u8 type, void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+
+	ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+			 name, name_len, NULL, NULL);
+	if (ret == -ENOENT)
+		ret = __process_deleted_xattr(num, di_key, name, name_len, data,
+				data_len, type, ctx);
+	else if (ret >= 0)
+		ret = 0;
+
+	return ret;
+}
+
+static int process_changed_xattr(struct send_ctx *sctx)
+{
+	int ret = 0;
+
+	ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+			sctx->cmp_key, __process_changed_new_xattr, sctx);
+	if (ret < 0)
+		goto out;
+	ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, __process_changed_deleted_xattr, sctx);
+
+out:
+	return ret;
+}
+
+static int process_all_new_xattrs(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	root = sctx->send_root;
+
+	key.objectid = sctx->cmp_key->objectid;
+	key.type = BTRFS_XATTR_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = 0;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		ret = iterate_dir_item(root, path, &found_key,
+				       __process_new_xattr, sctx);
+		if (ret < 0)
+			goto out;
+
+		path->slots[0]++;
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+{
+	struct btrfs_root *root = sctx->send_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct inode *inode;
+	struct page *page;
+	char *addr;
+	struct btrfs_key key;
+	pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+	pgoff_t last_index;
+	unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+	ssize_t ret = 0;
+
+	key.objectid = sctx->cur_ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (offset + len > i_size_read(inode)) {
+		if (offset > i_size_read(inode))
+			len = 0;
+		else
+			len = offset - i_size_read(inode);
+	}
+	if (len == 0)
+		goto out;
+
+	last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+
+	/* initial readahead */
+	memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+	file_ra_state_init(&sctx->ra, inode->i_mapping);
+	btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
+		       last_index - index + 1);
+
+	while (index <= last_index) {
+		unsigned cur_len = min_t(unsigned, len,
+					 PAGE_CACHE_SIZE - pg_offset);
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EIO;
+				break;
+			}
+		}
+
+		addr = kmap(page);
+		memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+		kunmap(page);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+		pg_offset = 0;
+		len -= cur_len;
+		ret += cur_len;
+	}
+out:
+	iput(inode);
+	return ret;
+}
+
+/*
+ * Read some bytes from the current inode/file and send a write command to
+ * user space.
+ */
+static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
+{
+	int ret = 0;
+	struct fs_path *p;
+	ssize_t num_read = 0;
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
+
+	num_read = fill_read_buf(sctx, offset, len);
+	if (num_read <= 0) {
+		if (num_read < 0)
+			ret = num_read;
+		goto out;
+	}
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	if (ret < 0)
+		return ret;
+	return num_read;
+}
+
+/*
+ * Send a clone command to user space.
+ */
+static int send_clone(struct send_ctx *sctx,
+		      u64 offset, u32 len,
+		      struct clone_root *clone_root)
+{
+	int ret = 0;
+	struct fs_path *p;
+	u64 gen;
+
+verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
+	       "clone_inode=%llu, clone_offset=%llu\n", offset, len,
+		clone_root->root->objectid, clone_root->ino,
+		clone_root->offset);
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+
+	if (clone_root->root == sctx->send_root) {
+		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
+				&gen, NULL, NULL, NULL, NULL);
+		if (ret < 0)
+			goto out;
+		ret = get_cur_path(sctx, clone_root->ino, gen, p);
+	} else {
+		ret = get_inode_path(clone_root->root, clone_root->ino, p);
+	}
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+			clone_root->root->root_item.uuid);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+		    le64_to_cpu(clone_root->root->root_item.ctransid));
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
+			clone_root->offset);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	return ret;
+}
+
+/*
+ * Send an update extent command to user space.
+ */
+static int send_update_extent(struct send_ctx *sctx,
+			      u64 offset, u32 len)
+{
+	int ret = 0;
+	struct fs_path *p;
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	return ret;
+}
+
+static int send_hole(struct send_ctx *sctx, u64 end)
+{
+	struct fs_path *p = NULL;
+	u64 offset = sctx->cur_inode_last_extent;
+	u64 len;
+	int ret = 0;
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto tlv_put_failure;
+	memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
+	while (offset < end) {
+		len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+
+		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+		if (ret < 0)
+			break;
+		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+		TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+		ret = send_cmd(sctx);
+		if (ret < 0)
+			break;
+		offset += len;
+	}
+tlv_put_failure:
+	fs_path_free(p);
+	return ret;
+}
+
+static int send_write_or_clone(struct send_ctx *sctx,
+			       struct btrfs_path *path,
+			       struct btrfs_key *key,
+			       struct clone_root *clone_root)
+{
+	int ret = 0;
+	struct btrfs_file_extent_item *ei;
+	u64 offset = key->offset;
+	u64 pos = 0;
+	u64 len;
+	u32 l;
+	u8 type;
+	u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
+
+	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			struct btrfs_file_extent_item);
+	type = btrfs_file_extent_type(path->nodes[0], ei);
+	if (type == BTRFS_FILE_EXTENT_INLINE) {
+		len = btrfs_file_extent_inline_len(path->nodes[0],
+						   path->slots[0], ei);
+		/*
+		 * it is possible the inline item won't cover the whole page,
+		 * but there may be items after this page.  Make
+		 * sure to send the whole thing
+		 */
+		len = PAGE_CACHE_ALIGN(len);
+	} else {
+		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+	}
+
+	if (offset + len > sctx->cur_inode_size)
+		len = sctx->cur_inode_size - offset;
+	if (len == 0) {
+		ret = 0;
+		goto out;
+	}
+
+	if (clone_root && IS_ALIGNED(offset + len, bs)) {
+		ret = send_clone(sctx, offset, len, clone_root);
+	} else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
+		ret = send_update_extent(sctx, offset, len);
+	} else {
+		while (pos < len) {
+			l = len - pos;
+			if (l > BTRFS_SEND_READ_SIZE)
+				l = BTRFS_SEND_READ_SIZE;
+			ret = send_write(sctx, pos + offset, l);
+			if (ret < 0)
+				goto out;
+			if (!ret)
+				break;
+			pos += ret;
+		}
+		ret = 0;
+	}
+out:
+	return ret;
+}
+
+static int is_extent_unchanged(struct send_ctx *sctx,
+			       struct btrfs_path *left_path,
+			       struct btrfs_key *ekey)
+{
+	int ret = 0;
+	struct btrfs_key key;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *eb;
+	int slot;
+	struct btrfs_key found_key;
+	struct btrfs_file_extent_item *ei;
+	u64 left_disknr;
+	u64 right_disknr;
+	u64 left_offset;
+	u64 right_offset;
+	u64 left_offset_fixed;
+	u64 left_len;
+	u64 right_len;
+	u64 left_gen;
+	u64 right_gen;
+	u8 left_type;
+	u8 right_type;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	eb = left_path->nodes[0];
+	slot = left_path->slots[0];
+	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+	left_type = btrfs_file_extent_type(eb, ei);
+
+	if (left_type != BTRFS_FILE_EXTENT_REG) {
+		ret = 0;
+		goto out;
+	}
+	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+	left_len = btrfs_file_extent_num_bytes(eb, ei);
+	left_offset = btrfs_file_extent_offset(eb, ei);
+	left_gen = btrfs_file_extent_generation(eb, ei);
+
+	/*
+	 * Following comments will refer to these graphics. L is the left
+	 * extents which we are checking at the moment. 1-8 are the right
+	 * extents that we iterate.
+	 *
+	 *       |-----L-----|
+	 * |-1-|-2a-|-3-|-4-|-5-|-6-|
+	 *
+	 *       |-----L-----|
+	 * |--1--|-2b-|...(same as above)
+	 *
+	 * Alternative situation. Happens on files where extents got split.
+	 *       |-----L-----|
+	 * |-----------7-----------|-6-|
+	 *
+	 * Alternative situation. Happens on files which got larger.
+	 *       |-----L-----|
+	 * |-8-|
+	 * Nothing follows after 8.
+	 */
+
+	key.objectid = ekey->objectid;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = ekey->offset;
+	ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret) {
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * Handle special case where the right side has no extents at all.
+	 */
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	btrfs_item_key_to_cpu(eb, &found_key, slot);
+	if (found_key.objectid != key.objectid ||
+	    found_key.type != key.type) {
+		/* If we're a hole then just pretend nothing changed */
+		ret = (left_disknr) ? 0 : 1;
+		goto out;
+	}
+
+	/*
+	 * We're now on 2a, 2b or 7.
+	 */
+	key = found_key;
+	while (key.offset < ekey->offset + left_len) {
+		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		right_type = btrfs_file_extent_type(eb, ei);
+		if (right_type != BTRFS_FILE_EXTENT_REG) {
+			ret = 0;
+			goto out;
+		}
+
+		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+		right_len = btrfs_file_extent_num_bytes(eb, ei);
+		right_offset = btrfs_file_extent_offset(eb, ei);
+		right_gen = btrfs_file_extent_generation(eb, ei);
+
+		/*
+		 * Are we at extent 8? If yes, we know the extent is changed.
+		 * This may only happen on the first iteration.
+		 */
+		if (found_key.offset + right_len <= ekey->offset) {
+			/* If we're a hole just pretend nothing changed */
+			ret = (left_disknr) ? 0 : 1;
+			goto out;
+		}
+
+		left_offset_fixed = left_offset;
+		if (key.offset < ekey->offset) {
+			/* Fix the right offset for 2a and 7. */
+			right_offset += ekey->offset - key.offset;
+		} else {
+			/* Fix the left offset for all behind 2a and 2b */
+			left_offset_fixed += key.offset - ekey->offset;
+		}
+
+		/*
+		 * Check if we have the same extent.
+		 */
+		if (left_disknr != right_disknr ||
+		    left_offset_fixed != right_offset ||
+		    left_gen != right_gen) {
+			ret = 0;
+			goto out;
+		}
+
+		/*
+		 * Go to the next extent.
+		 */
+		ret = btrfs_next_item(sctx->parent_root, path);
+		if (ret < 0)
+			goto out;
+		if (!ret) {
+			eb = path->nodes[0];
+			slot = path->slots[0];
+			btrfs_item_key_to_cpu(eb, &found_key, slot);
+		}
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			key.offset += right_len;
+			break;
+		}
+		if (found_key.offset != key.offset + right_len) {
+			ret = 0;
+			goto out;
+		}
+		key = found_key;
+	}
+
+	/*
+	 * We're now behind the left extent (treat as unchanged) or at the end
+	 * of the right side (treat as changed).
+	 */
+	if (key.offset >= ekey->offset + left_len)
+		ret = 1;
+	else
+		ret = 0;
+
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int get_last_extent(struct send_ctx *sctx, u64 offset)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *root = sctx->send_root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 extent_end;
+	u8 type;
+	int ret;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	sctx->cur_inode_last_extent = 0;
+
+	key.objectid = sctx->cur_ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+	ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+	if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
+		goto out;
+
+	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			    struct btrfs_file_extent_item);
+	type = btrfs_file_extent_type(path->nodes[0], fi);
+	if (type == BTRFS_FILE_EXTENT_INLINE) {
+		u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+							path->slots[0], fi);
+		extent_end = ALIGN(key.offset + size,
+				   sctx->send_root->sectorsize);
+	} else {
+		extent_end = key.offset +
+			btrfs_file_extent_num_bytes(path->nodes[0], fi);
+	}
+	sctx->cur_inode_last_extent = extent_end;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
+			   struct btrfs_key *key)
+{
+	struct btrfs_file_extent_item *fi;
+	u64 extent_end;
+	u8 type;
+	int ret = 0;
+
+	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
+		return 0;
+
+	if (sctx->cur_inode_last_extent == (u64)-1) {
+		ret = get_last_extent(sctx, key->offset - 1);
+		if (ret)
+			return ret;
+	}
+
+	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			    struct btrfs_file_extent_item);
+	type = btrfs_file_extent_type(path->nodes[0], fi);
+	if (type == BTRFS_FILE_EXTENT_INLINE) {
+		u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+							path->slots[0], fi);
+		extent_end = ALIGN(key->offset + size,
+				   sctx->send_root->sectorsize);
+	} else {
+		extent_end = key->offset +
+			btrfs_file_extent_num_bytes(path->nodes[0], fi);
+	}
+
+	if (path->slots[0] == 0 &&
+	    sctx->cur_inode_last_extent < key->offset) {
+		/*
+		 * We might have skipped entire leafs that contained only
+		 * file extent items for our current inode. These leafs have
+		 * a generation number smaller (older) than the one in the
+		 * current leaf and the leaf our last extent came from, and
+		 * are located between these 2 leafs.
+		 */
+		ret = get_last_extent(sctx, key->offset - 1);
+		if (ret)
+			return ret;
+	}
+
+	if (sctx->cur_inode_last_extent < key->offset)
+		ret = send_hole(sctx, key->offset);
+	sctx->cur_inode_last_extent = extent_end;
+	return ret;
+}
+
+static int process_extent(struct send_ctx *sctx,
+			  struct btrfs_path *path,
+			  struct btrfs_key *key)
+{
+	struct clone_root *found_clone = NULL;
+	int ret = 0;
+
+	if (S_ISLNK(sctx->cur_inode_mode))
+		return 0;
+
+	if (sctx->parent_root && !sctx->cur_inode_new) {
+		ret = is_extent_unchanged(sctx, path, key);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out_hole;
+		}
+	} else {
+		struct btrfs_file_extent_item *ei;
+		u8 type;
+
+		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_file_extent_item);
+		type = btrfs_file_extent_type(path->nodes[0], ei);
+		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
+		    type == BTRFS_FILE_EXTENT_REG) {
+			/*
+			 * The send spec does not have a prealloc command yet,
+			 * so just leave a hole for prealloc'ed extents until
+			 * we have enough commands queued up to justify rev'ing
+			 * the send spec.
+			 */
+			if (type == BTRFS_FILE_EXTENT_PREALLOC) {
+				ret = 0;
+				goto out;
+			}
+
+			/* Have a hole, just skip it. */
+			if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
+				ret = 0;
+				goto out;
+			}
+		}
+	}
+
+	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
+			sctx->cur_inode_size, &found_clone);
+	if (ret != -ENOENT && ret < 0)
+		goto out;
+
+	ret = send_write_or_clone(sctx, path, key, found_clone);
+	if (ret)
+		goto out;
+out_hole:
+	ret = maybe_send_hole(sctx, path, key);
+out:
+	return ret;
+}
+
+static int process_all_extents(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	int slot;
+
+	root = sctx->send_root;
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = sctx->cmp_key->objectid;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		eb = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = 0;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		ret = process_extent(sctx, path, &found_key);
+		if (ret < 0)
+			goto out;
+
+		path->slots[0]++;
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+					   int *pending_move,
+					   int *refs_processed)
+{
+	int ret = 0;
+
+	if (sctx->cur_ino == 0)
+		goto out;
+	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
+	    sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
+		goto out;
+	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
+		goto out;
+
+	ret = process_recorded_refs(sctx, pending_move);
+	if (ret < 0)
+		goto out;
+
+	*refs_processed = 1;
+out:
+	return ret;
+}
+
+static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+{
+	int ret = 0;
+	u64 left_mode;
+	u64 left_uid;
+	u64 left_gid;
+	u64 right_mode;
+	u64 right_uid;
+	u64 right_gid;
+	int need_chmod = 0;
+	int need_chown = 0;
+	int pending_move = 0;
+	int refs_processed = 0;
+
+	ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
+					      &refs_processed);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * We have processed the refs and thus need to advance send_progress.
+	 * Now, calls to get_cur_xxx will take the updated refs of the current
+	 * inode into account.
+	 *
+	 * On the other hand, if our current inode is a directory and couldn't
+	 * be moved/renamed because its parent was renamed/moved too and it has
+	 * a higher inode number, we can only move/rename our current inode
+	 * after we moved/renamed its parent. Therefore in this case operate on
+	 * the old path (pre move/rename) of our current inode, and the
+	 * move/rename will be performed later.
+	 */
+	if (refs_processed && !pending_move)
+		sctx->send_progress = sctx->cur_ino + 1;
+
+	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
+		goto out;
+	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
+		goto out;
+
+	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
+			&left_mode, &left_uid, &left_gid, NULL);
+	if (ret < 0)
+		goto out;
+
+	if (!sctx->parent_root || sctx->cur_inode_new) {
+		need_chown = 1;
+		if (!S_ISLNK(sctx->cur_inode_mode))
+			need_chmod = 1;
+	} else {
+		ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
+				NULL, NULL, &right_mode, &right_uid,
+				&right_gid, NULL);
+		if (ret < 0)
+			goto out;
+
+		if (left_uid != right_uid || left_gid != right_gid)
+			need_chown = 1;
+		if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
+			need_chmod = 1;
+	}
+
+	if (S_ISREG(sctx->cur_inode_mode)) {
+		if (need_send_hole(sctx)) {
+			if (sctx->cur_inode_last_extent == (u64)-1 ||
+			    sctx->cur_inode_last_extent <
+			    sctx->cur_inode_size) {
+				ret = get_last_extent(sctx, (u64)-1);
+				if (ret)
+					goto out;
+			}
+			if (sctx->cur_inode_last_extent <
+			    sctx->cur_inode_size) {
+				ret = send_hole(sctx, sctx->cur_inode_size);
+				if (ret)
+					goto out;
+			}
+		}
+		ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				sctx->cur_inode_size);
+		if (ret < 0)
+			goto out;
+	}
+
+	if (need_chown) {
+		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				left_uid, left_gid);
+		if (ret < 0)
+			goto out;
+	}
+	if (need_chmod) {
+		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				left_mode);
+		if (ret < 0)
+			goto out;
+	}
+
+	/*
+	 * If other directory inodes depended on our current directory
+	 * inode's move/rename, now do their move/rename operations.
+	 */
+	if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
+		ret = apply_children_dir_moves(sctx);
+		if (ret)
+			goto out;
+		/*
+		 * Need to send that every time, no matter if it actually
+		 * changed between the two trees as we have done changes to
+		 * the inode before. If our inode is a directory and it's
+		 * waiting to be moved/renamed, we will send its utimes when
+		 * it's moved/renamed, therefore we don't need to do it here.
+		 */
+		sctx->send_progress = sctx->cur_ino + 1;
+		ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	return ret;
+}
+
+static int changed_inode(struct send_ctx *sctx,
+			 enum btrfs_compare_tree_result result)
+{
+	int ret = 0;
+	struct btrfs_key *key = sctx->cmp_key;
+	struct btrfs_inode_item *left_ii = NULL;
+	struct btrfs_inode_item *right_ii = NULL;
+	u64 left_gen = 0;
+	u64 right_gen = 0;
+
+	sctx->cur_ino = key->objectid;
+	sctx->cur_inode_new_gen = 0;
+	sctx->cur_inode_last_extent = (u64)-1;
+
+	/*
+	 * Set send_progress to current inode. This will tell all get_cur_xxx
+	 * functions that the current inode's refs are not updated yet. Later,
+	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
+	 */
+	sctx->send_progress = sctx->cur_ino;
+
+	if (result == BTRFS_COMPARE_TREE_NEW ||
+	    result == BTRFS_COMPARE_TREE_CHANGED) {
+		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
+				sctx->left_path->slots[0],
+				struct btrfs_inode_item);
+		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
+				left_ii);
+	} else {
+		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+				sctx->right_path->slots[0],
+				struct btrfs_inode_item);
+		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+				right_ii);
+	}
+	if (result == BTRFS_COMPARE_TREE_CHANGED) {
+		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+				sctx->right_path->slots[0],
+				struct btrfs_inode_item);
+
+		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+				right_ii);
+
+		/*
+		 * The cur_ino = root dir case is special here. We can't treat
+		 * the inode as deleted+reused because it would generate a
+		 * stream that tries to delete/mkdir the root dir.
+		 */
+		if (left_gen != right_gen &&
+		    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+			sctx->cur_inode_new_gen = 1;
+	}
+
+	if (result == BTRFS_COMPARE_TREE_NEW) {
+		sctx->cur_inode_gen = left_gen;
+		sctx->cur_inode_new = 1;
+		sctx->cur_inode_deleted = 0;
+		sctx->cur_inode_size = btrfs_inode_size(
+				sctx->left_path->nodes[0], left_ii);
+		sctx->cur_inode_mode = btrfs_inode_mode(
+				sctx->left_path->nodes[0], left_ii);
+		sctx->cur_inode_rdev = btrfs_inode_rdev(
+				sctx->left_path->nodes[0], left_ii);
+		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+			ret = send_create_inode_if_needed(sctx);
+	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
+		sctx->cur_inode_gen = right_gen;
+		sctx->cur_inode_new = 0;
+		sctx->cur_inode_deleted = 1;
+		sctx->cur_inode_size = btrfs_inode_size(
+				sctx->right_path->nodes[0], right_ii);
+		sctx->cur_inode_mode = btrfs_inode_mode(
+				sctx->right_path->nodes[0], right_ii);
+	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+		/*
+		 * We need to do some special handling in case the inode was
+		 * reported as changed with a changed generation number. This
+		 * means that the original inode was deleted and new inode
+		 * reused the same inum. So we have to treat the old inode as
+		 * deleted and the new one as new.
+		 */
+		if (sctx->cur_inode_new_gen) {
+			/*
+			 * First, process the inode as if it was deleted.
+			 */
+			sctx->cur_inode_gen = right_gen;
+			sctx->cur_inode_new = 0;
+			sctx->cur_inode_deleted = 1;
+			sctx->cur_inode_size = btrfs_inode_size(
+					sctx->right_path->nodes[0], right_ii);
+			sctx->cur_inode_mode = btrfs_inode_mode(
+					sctx->right_path->nodes[0], right_ii);
+			ret = process_all_refs(sctx,
+					BTRFS_COMPARE_TREE_DELETED);
+			if (ret < 0)
+				goto out;
+
+			/*
+			 * Now process the inode as if it was new.
+			 */
+			sctx->cur_inode_gen = left_gen;
+			sctx->cur_inode_new = 1;
+			sctx->cur_inode_deleted = 0;
+			sctx->cur_inode_size = btrfs_inode_size(
+					sctx->left_path->nodes[0], left_ii);
+			sctx->cur_inode_mode = btrfs_inode_mode(
+					sctx->left_path->nodes[0], left_ii);
+			sctx->cur_inode_rdev = btrfs_inode_rdev(
+					sctx->left_path->nodes[0], left_ii);
+			ret = send_create_inode_if_needed(sctx);
+			if (ret < 0)
+				goto out;
+
+			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+			if (ret < 0)
+				goto out;
+			/*
+			 * Advance send_progress now as we did not get into
+			 * process_recorded_refs_if_needed in the new_gen case.
+			 */
+			sctx->send_progress = sctx->cur_ino + 1;
+
+			/*
+			 * Now process all extents and xattrs of the inode as if
+			 * they were all new.
+			 */
+			ret = process_all_extents(sctx);
+			if (ret < 0)
+				goto out;
+			ret = process_all_new_xattrs(sctx);
+			if (ret < 0)
+				goto out;
+		} else {
+			sctx->cur_inode_gen = left_gen;
+			sctx->cur_inode_new = 0;
+			sctx->cur_inode_new_gen = 0;
+			sctx->cur_inode_deleted = 0;
+			sctx->cur_inode_size = btrfs_inode_size(
+					sctx->left_path->nodes[0], left_ii);
+			sctx->cur_inode_mode = btrfs_inode_mode(
+					sctx->left_path->nodes[0], left_ii);
+		}
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * We have to process new refs before deleted refs, but compare_trees gives us
+ * the new and deleted refs mixed. To fix this, we record the new/deleted refs
+ * first and later process them in process_recorded_refs.
+ * For the cur_inode_new_gen case, we skip recording completely because
+ * changed_inode did already initiate processing of refs. The reason for this is
+ * that in this case, compare_tree actually compares the refs of 2 different
+ * inodes. To fix this, process_all_refs is used in changed_inode to handle all
+ * refs of the right tree as deleted and all refs of the left tree as new.
+ */
+static int changed_ref(struct send_ctx *sctx,
+		       enum btrfs_compare_tree_result result)
+{
+	int ret = 0;
+
+	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+	if (!sctx->cur_inode_new_gen &&
+	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		if (result == BTRFS_COMPARE_TREE_NEW)
+			ret = record_new_ref(sctx);
+		else if (result == BTRFS_COMPARE_TREE_DELETED)
+			ret = record_deleted_ref(sctx);
+		else if (result == BTRFS_COMPARE_TREE_CHANGED)
+			ret = record_changed_ref(sctx);
+	}
+
+	return ret;
+}
+
+/*
+ * Process new/deleted/changed xattrs. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of xattrs. The reason is the same as in changed_ref
+ */
+static int changed_xattr(struct send_ctx *sctx,
+			 enum btrfs_compare_tree_result result)
+{
+	int ret = 0;
+
+	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+		if (result == BTRFS_COMPARE_TREE_NEW)
+			ret = process_new_xattr(sctx);
+		else if (result == BTRFS_COMPARE_TREE_DELETED)
+			ret = process_deleted_xattr(sctx);
+		else if (result == BTRFS_COMPARE_TREE_CHANGED)
+			ret = process_changed_xattr(sctx);
+	}
+
+	return ret;
+}
+
+/*
+ * Process new/deleted/changed extents. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of extents. The reason is the same as in changed_ref
+ */
+static int changed_extent(struct send_ctx *sctx,
+			  enum btrfs_compare_tree_result result)
+{
+	int ret = 0;
+
+	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+		if (result != BTRFS_COMPARE_TREE_DELETED)
+			ret = process_extent(sctx, sctx->left_path,
+					sctx->cmp_key);
+	}
+
+	return ret;
+}
+
+static int dir_changed(struct send_ctx *sctx, u64 dir)
+{
+	u64 orig_gen, new_gen;
+	int ret;
+
+	ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
+			     NULL, NULL);
+	if (ret)
+		return ret;
+
+	ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
+			     NULL, NULL, NULL);
+	if (ret)
+		return ret;
+
+	return (orig_gen != new_gen) ? 1 : 0;
+}
+
+static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
+			struct btrfs_key *key)
+{
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u64 dirid = 0, last_dirid = 0;
+	unsigned long ptr;
+	u32 item_size;
+	u32 cur_offset = 0;
+	int ref_name_len;
+	int ret = 0;
+
+	/* Easy case, just check this one dirid */
+	if (key->type == BTRFS_INODE_REF_KEY) {
+		dirid = key->offset;
+
+		ret = dir_changed(sctx, dirid);
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	while (cur_offset < item_size) {
+		extref = (struct btrfs_inode_extref *)(ptr +
+						       cur_offset);
+		dirid = btrfs_inode_extref_parent(leaf, extref);
+		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+		cur_offset += ref_name_len + sizeof(*extref);
+		if (dirid == last_dirid)
+			continue;
+		ret = dir_changed(sctx, dirid);
+		if (ret)
+			break;
+		last_dirid = dirid;
+	}
+out:
+	return ret;
+}
+
+/*
+ * Updates compare related fields in sctx and simply forwards to the actual
+ * changed_xxx functions.
+ */
+static int changed_cb(struct btrfs_root *left_root,
+		      struct btrfs_root *right_root,
+		      struct btrfs_path *left_path,
+		      struct btrfs_path *right_path,
+		      struct btrfs_key *key,
+		      enum btrfs_compare_tree_result result,
+		      void *ctx)
+{
+	int ret = 0;
+	struct send_ctx *sctx = ctx;
+
+	if (result == BTRFS_COMPARE_TREE_SAME) {
+		if (key->type == BTRFS_INODE_REF_KEY ||
+		    key->type == BTRFS_INODE_EXTREF_KEY) {
+			ret = compare_refs(sctx, left_path, key);
+			if (!ret)
+				return 0;
+			if (ret < 0)
+				return ret;
+		} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
+			return maybe_send_hole(sctx, left_path, key);
+		} else {
+			return 0;
+		}
+		result = BTRFS_COMPARE_TREE_CHANGED;
+		ret = 0;
+	}
+
+	sctx->left_path = left_path;
+	sctx->right_path = right_path;
+	sctx->cmp_key = key;
+
+	ret = finish_inode_if_needed(sctx, 0);
+	if (ret < 0)
+		goto out;
+
+	/* Ignore non-FS objects */
+	if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+	    key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+		goto out;
+
+	if (key->type == BTRFS_INODE_ITEM_KEY)
+		ret = changed_inode(sctx, result);
+	else if (key->type == BTRFS_INODE_REF_KEY ||
+		 key->type == BTRFS_INODE_EXTREF_KEY)
+		ret = changed_ref(sctx, result);
+	else if (key->type == BTRFS_XATTR_ITEM_KEY)
+		ret = changed_xattr(sctx, result);
+	else if (key->type == BTRFS_EXTENT_DATA_KEY)
+		ret = changed_extent(sctx, result);
+
+out:
+	return ret;
+}
+
+static int full_send_tree(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_root *send_root = sctx->send_root;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	struct extent_buffer *eb;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
+	if (ret < 0)
+		goto out;
+	if (ret)
+		goto out_finish;
+
+	while (1) {
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+		ret = changed_cb(send_root, NULL, path, NULL,
+				&found_key, BTRFS_COMPARE_TREE_NEW, sctx);
+		if (ret < 0)
+			goto out;
+
+		key.objectid = found_key.objectid;
+		key.type = found_key.type;
+		key.offset = found_key.offset + 1;
+
+		ret = btrfs_next_item(send_root, path);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret  = 0;
+			break;
+		}
+	}
+
+out_finish:
+	ret = finish_inode_if_needed(sctx, 1);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int send_subvol(struct send_ctx *sctx)
+{
+	int ret;
+
+	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
+		ret = send_header(sctx);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = send_subvol_begin(sctx);
+	if (ret < 0)
+		goto out;
+
+	if (sctx->parent_root) {
+		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
+				changed_cb, sctx);
+		if (ret < 0)
+			goto out;
+		ret = finish_inode_if_needed(sctx, 1);
+		if (ret < 0)
+			goto out;
+	} else {
+		ret = full_send_tree(sctx);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	free_recorded_refs(sctx);
+	return ret;
+}
+
+/*
+ * If orphan cleanup did remove any orphans from a root, it means the tree
+ * was modified and therefore the commit root is not the same as the current
+ * root anymore. This is a problem, because send uses the commit root and
+ * therefore can see inode items that don't exist in the current root anymore,
+ * and for example make calls to btrfs_iget, which will do tree lookups based
+ * on the current root and not on the commit root. Those lookups will fail,
+ * returning a -ESTALE error, and making send fail with that error. So make
+ * sure a send does not see any orphans we have just removed, and that it will
+ * see the same inodes regardless of whether a transaction commit happened
+ * before it started (meaning that the commit root will be the same as the
+ * current root) or not.
+ */
+static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
+{
+	int i;
+	struct btrfs_trans_handle *trans = NULL;
+
+again:
+	if (sctx->parent_root &&
+	    sctx->parent_root->node != sctx->parent_root->commit_root)
+		goto commit_trans;
+
+	for (i = 0; i < sctx->clone_roots_cnt; i++)
+		if (sctx->clone_roots[i].root->node !=
+		    sctx->clone_roots[i].root->commit_root)
+			goto commit_trans;
+
+	if (trans)
+		return btrfs_end_transaction(trans, sctx->send_root);
+
+	return 0;
+
+commit_trans:
+	/* Use any root, all fs roots will get their commit roots updated. */
+	if (!trans) {
+		trans = btrfs_join_transaction(sctx->send_root);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+		goto again;
+	}
+
+	return btrfs_commit_transaction(trans, sctx->send_root);
+}
+
+static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
+{
+	spin_lock(&root->root_item_lock);
+	root->send_in_progress--;
+	/*
+	 * Not much left to do, we don't know why it's unbalanced and
+	 * can't blindly reset it to 0.
+	 */
+	if (root->send_in_progress < 0)
+		btrfs_err(root->fs_info,
+			"send_in_progres unbalanced %d root %llu",
+			root->send_in_progress, root->root_key.objectid);
+	spin_unlock(&root->root_item_lock);
+}
+
+long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
+{
+	int ret = 0;
+	struct btrfs_root *send_root;
+	struct btrfs_root *clone_root;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_ioctl_send_args *arg = NULL;
+	struct btrfs_key key;
+	struct send_ctx *sctx = NULL;
+	u32 i;
+	u64 *clone_sources_tmp = NULL;
+	int clone_sources_to_rollback = 0;
+	int sort_clone_roots = 0;
+	int index;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	send_root = BTRFS_I(file_inode(mnt_file))->root;
+	fs_info = send_root->fs_info;
+
+	/*
+	 * The subvolume must remain read-only during send, protect against
+	 * making it RW. This also protects against deletion.
+	 */
+	spin_lock(&send_root->root_item_lock);
+	send_root->send_in_progress++;
+	spin_unlock(&send_root->root_item_lock);
+
+	/*
+	 * This is done when we lookup the root, it should already be complete
+	 * by the time we get here.
+	 */
+	WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
+
+	/*
+	 * Userspace tools do the checks and warn the user if it's
+	 * not RO.
+	 */
+	if (!btrfs_root_readonly(send_root)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	arg = memdup_user(arg_, sizeof(*arg));
+	if (IS_ERR(arg)) {
+		ret = PTR_ERR(arg);
+		arg = NULL;
+		goto out;
+	}
+
+	if (!access_ok(VERIFY_READ, arg->clone_sources,
+			sizeof(*arg->clone_sources) *
+			arg->clone_sources_count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
+	if (!sctx) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&sctx->new_refs);
+	INIT_LIST_HEAD(&sctx->deleted_refs);
+	INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
+	INIT_LIST_HEAD(&sctx->name_cache_list);
+
+	sctx->flags = arg->flags;
+
+	sctx->send_filp = fget(arg->send_fd);
+	if (!sctx->send_filp) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	sctx->send_root = send_root;
+	/*
+	 * Unlikely but possible, if the subvolume is marked for deletion but
+	 * is slow to remove the directory entry, send can still be started
+	 */
+	if (btrfs_root_dead(sctx->send_root)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	sctx->clone_roots_cnt = arg->clone_sources_count;
+
+	sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
+	sctx->send_buf = vmalloc(sctx->send_max_size);
+	if (!sctx->send_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+	if (!sctx->read_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	sctx->pending_dir_moves = RB_ROOT;
+	sctx->waiting_dir_moves = RB_ROOT;
+	sctx->orphan_dirs = RB_ROOT;
+
+	sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
+			(arg->clone_sources_count + 1));
+	if (!sctx->clone_roots) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (arg->clone_sources_count) {
+		clone_sources_tmp = vmalloc(arg->clone_sources_count *
+				sizeof(*arg->clone_sources));
+		if (!clone_sources_tmp) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
+				arg->clone_sources_count *
+				sizeof(*arg->clone_sources));
+		if (ret) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		for (i = 0; i < arg->clone_sources_count; i++) {
+			key.objectid = clone_sources_tmp[i];
+			key.type = BTRFS_ROOT_ITEM_KEY;
+			key.offset = (u64)-1;
+
+			index = srcu_read_lock(&fs_info->subvol_srcu);
+
+			clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+			if (IS_ERR(clone_root)) {
+				srcu_read_unlock(&fs_info->subvol_srcu, index);
+				ret = PTR_ERR(clone_root);
+				goto out;
+			}
+			spin_lock(&clone_root->root_item_lock);
+			if (!btrfs_root_readonly(clone_root) ||
+			    btrfs_root_dead(clone_root)) {
+				spin_unlock(&clone_root->root_item_lock);
+				srcu_read_unlock(&fs_info->subvol_srcu, index);
+				ret = -EPERM;
+				goto out;
+			}
+			clone_root->send_in_progress++;
+			spin_unlock(&clone_root->root_item_lock);
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+			sctx->clone_roots[i].root = clone_root;
+			clone_sources_to_rollback = i + 1;
+		}
+		vfree(clone_sources_tmp);
+		clone_sources_tmp = NULL;
+	}
+
+	if (arg->parent_root) {
+		key.objectid = arg->parent_root;
+		key.type = BTRFS_ROOT_ITEM_KEY;
+		key.offset = (u64)-1;
+
+		index = srcu_read_lock(&fs_info->subvol_srcu);
+
+		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		if (IS_ERR(sctx->parent_root)) {
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
+			ret = PTR_ERR(sctx->parent_root);
+			goto out;
+		}
+
+		spin_lock(&sctx->parent_root->root_item_lock);
+		sctx->parent_root->send_in_progress++;
+		if (!btrfs_root_readonly(sctx->parent_root) ||
+				btrfs_root_dead(sctx->parent_root)) {
+			spin_unlock(&sctx->parent_root->root_item_lock);
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
+			ret = -EPERM;
+			goto out;
+		}
+		spin_unlock(&sctx->parent_root->root_item_lock);
+
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+	}
+
+	/*
+	 * Clones from send_root are allowed, but only if the clone source
+	 * is behind the current send position. This is checked while searching
+	 * for possible clone sources.
+	 */
+	sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
+
+	/* We do a bsearch later */
+	sort(sctx->clone_roots, sctx->clone_roots_cnt,
+			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
+			NULL);
+	sort_clone_roots = 1;
+
+	ret = ensure_commit_roots_uptodate(sctx);
+	if (ret)
+		goto out;
+
+	current->journal_info = BTRFS_SEND_TRANS_STUB;
+	ret = send_subvol(sctx);
+	current->journal_info = NULL;
+	if (ret < 0)
+		goto out;
+
+	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
+		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
+		if (ret < 0)
+			goto out;
+		ret = send_cmd(sctx);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
+	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
+		struct rb_node *n;
+		struct pending_dir_move *pm;
+
+		n = rb_first(&sctx->pending_dir_moves);
+		pm = rb_entry(n, struct pending_dir_move, node);
+		while (!list_empty(&pm->list)) {
+			struct pending_dir_move *pm2;
+
+			pm2 = list_first_entry(&pm->list,
+					       struct pending_dir_move, list);
+			free_pending_move(sctx, pm2);
+		}
+		free_pending_move(sctx, pm);
+	}
+
+	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
+	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
+		struct rb_node *n;
+		struct waiting_dir_move *dm;
+
+		n = rb_first(&sctx->waiting_dir_moves);
+		dm = rb_entry(n, struct waiting_dir_move, node);
+		rb_erase(&dm->node, &sctx->waiting_dir_moves);
+		kfree(dm);
+	}
+
+	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+		struct rb_node *n;
+		struct orphan_dir_info *odi;
+
+		n = rb_first(&sctx->orphan_dirs);
+		odi = rb_entry(n, struct orphan_dir_info, node);
+		free_orphan_dir_info(sctx, odi);
+	}
+
+	if (sort_clone_roots) {
+		for (i = 0; i < sctx->clone_roots_cnt; i++)
+			btrfs_root_dec_send_in_progress(
+					sctx->clone_roots[i].root);
+	} else {
+		for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+			btrfs_root_dec_send_in_progress(
+					sctx->clone_roots[i].root);
+
+		btrfs_root_dec_send_in_progress(send_root);
+	}
+	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+		btrfs_root_dec_send_in_progress(sctx->parent_root);
+
+	kfree(arg);
+	vfree(clone_sources_tmp);
+
+	if (sctx) {
+		if (sctx->send_filp)
+			fput(sctx->send_filp);
+
+		vfree(sctx->clone_roots);
+		vfree(sctx->send_buf);
+		vfree(sctx->read_buf);
+
+		name_cache_free(sctx);
+
+		kfree(sctx);
+	}
+
+	return ret;
+}
diff --git a/kernel/fs/btrfs/send.h b/kernel/fs/btrfs/send.h
new file mode 100644
index 000000000..48d425aef
--- /dev/null
+++ b/kernel/fs/btrfs/send.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2012 Alexander Block.  All rights reserved.
+ * Copyright (C) 2012 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+
+#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
+#define BTRFS_SEND_STREAM_VERSION 1
+
+#define BTRFS_SEND_BUF_SIZE (1024 * 64)
+#define BTRFS_SEND_READ_SIZE (1024 * 48)
+
+enum btrfs_tlv_type {
+	BTRFS_TLV_U8,
+	BTRFS_TLV_U16,
+	BTRFS_TLV_U32,
+	BTRFS_TLV_U64,
+	BTRFS_TLV_BINARY,
+	BTRFS_TLV_STRING,
+	BTRFS_TLV_UUID,
+	BTRFS_TLV_TIMESPEC,
+};
+
+struct btrfs_stream_header {
+	char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)];
+	__le32 version;
+} __attribute__ ((__packed__));
+
+struct btrfs_cmd_header {
+	/* len excluding the header */
+	__le32 len;
+	__le16 cmd;
+	/* crc including the header with zero crc field */
+	__le32 crc;
+} __attribute__ ((__packed__));
+
+struct btrfs_tlv_header {
+	__le16 tlv_type;
+	/* len excluding the header */
+	__le16 tlv_len;
+} __attribute__ ((__packed__));
+
+/* commands */
+enum btrfs_send_cmd {
+	BTRFS_SEND_C_UNSPEC,
+
+	BTRFS_SEND_C_SUBVOL,
+	BTRFS_SEND_C_SNAPSHOT,
+
+	BTRFS_SEND_C_MKFILE,
+	BTRFS_SEND_C_MKDIR,
+	BTRFS_SEND_C_MKNOD,
+	BTRFS_SEND_C_MKFIFO,
+	BTRFS_SEND_C_MKSOCK,
+	BTRFS_SEND_C_SYMLINK,
+
+	BTRFS_SEND_C_RENAME,
+	BTRFS_SEND_C_LINK,
+	BTRFS_SEND_C_UNLINK,
+	BTRFS_SEND_C_RMDIR,
+
+	BTRFS_SEND_C_SET_XATTR,
+	BTRFS_SEND_C_REMOVE_XATTR,
+
+	BTRFS_SEND_C_WRITE,
+	BTRFS_SEND_C_CLONE,
+
+	BTRFS_SEND_C_TRUNCATE,
+	BTRFS_SEND_C_CHMOD,
+	BTRFS_SEND_C_CHOWN,
+	BTRFS_SEND_C_UTIMES,
+
+	BTRFS_SEND_C_END,
+	BTRFS_SEND_C_UPDATE_EXTENT,
+	__BTRFS_SEND_C_MAX,
+};
+#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
+
+/* attributes in send stream */
+enum {
+	BTRFS_SEND_A_UNSPEC,
+
+	BTRFS_SEND_A_UUID,
+	BTRFS_SEND_A_CTRANSID,
+
+	BTRFS_SEND_A_INO,
+	BTRFS_SEND_A_SIZE,
+	BTRFS_SEND_A_MODE,
+	BTRFS_SEND_A_UID,
+	BTRFS_SEND_A_GID,
+	BTRFS_SEND_A_RDEV,
+	BTRFS_SEND_A_CTIME,
+	BTRFS_SEND_A_MTIME,
+	BTRFS_SEND_A_ATIME,
+	BTRFS_SEND_A_OTIME,
+
+	BTRFS_SEND_A_XATTR_NAME,
+	BTRFS_SEND_A_XATTR_DATA,
+
+	BTRFS_SEND_A_PATH,
+	BTRFS_SEND_A_PATH_TO,
+	BTRFS_SEND_A_PATH_LINK,
+
+	BTRFS_SEND_A_FILE_OFFSET,
+	BTRFS_SEND_A_DATA,
+
+	BTRFS_SEND_A_CLONE_UUID,
+	BTRFS_SEND_A_CLONE_CTRANSID,
+	BTRFS_SEND_A_CLONE_PATH,
+	BTRFS_SEND_A_CLONE_OFFSET,
+	BTRFS_SEND_A_CLONE_LEN,
+
+	__BTRFS_SEND_A_MAX,
+};
+#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
+
+#ifdef __KERNEL__
+long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+#endif
diff --git a/kernel/fs/btrfs/struct-funcs.c b/kernel/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000..b976597b0
--- /dev/null
+++ b/kernel/fs/btrfs/struct-funcs.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/highmem.h>
+#include <asm/unaligned.h>
+
+#include "ctree.h"
+
+static inline u8 get_unaligned_le8(const void *p)
+{
+       return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+       *(u8 *)p = val;
+}
+
+/*
+ * this is some deeply nasty code.
+ *
+ * The end result is that anyone who #includes ctree.h gets a
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions,
+ * which are wappers of btrfs_set_token_#bits functions and
+ * btrfs_get_token_#bits functions, which are defined in this file.
+ *
+ * These setget functions do all the extent_buffer related mapping
+ * required to efficiently read and write specific fields in the extent
+ * buffers.  Every pointer to metadata items in btrfs is really just
+ * an unsigned long offset into the extent buffer which has been
+ * cast to a specific type.  This gives us all the gcc type checking.
+ *
+ * The extent buffer api is used to do the page spanning work required to
+ * have a metadata blocksize different from the page size.
+ */
+
+#define DEFINE_BTRFS_SETGET_BITS(bits)					\
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr,	\
+			       unsigned long off,			\
+			       struct btrfs_map_token *token)		\
+{									\
+	unsigned long part_offset = (unsigned long)ptr;			\
+	unsigned long offset = part_offset + off;			\
+	void *p;							\
+	int err;							\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	int size = sizeof(u##bits);					\
+	u##bits res;							\
+									\
+	if (token && token->kaddr && token->offset <= offset &&		\
+	    token->eb == eb &&						\
+	   (token->offset + PAGE_CACHE_SIZE >= offset + size)) {	\
+		kaddr = token->kaddr;					\
+		p = kaddr + part_offset - token->offset;		\
+		res = get_unaligned_le##bits(p + off);			\
+		return res;						\
+	}								\
+	err = map_private_extent_buffer(eb, offset, size,		\
+					&kaddr, &map_start, &map_len);	\
+	if (err) {							\
+		__le##bits leres;					\
+									\
+		read_extent_buffer(eb, &leres, offset, size);		\
+		return le##bits##_to_cpu(leres);			\
+	}								\
+	p = kaddr + part_offset - map_start;				\
+	res = get_unaligned_le##bits(p + off);				\
+	if (token) {							\
+		token->kaddr = kaddr;					\
+		token->offset = map_start;				\
+		token->eb = eb;						\
+	}								\
+	return res;							\
+}									\
+void btrfs_set_token_##bits(struct extent_buffer *eb,			\
+			    void *ptr, unsigned long off, u##bits val,	\
+			    struct btrfs_map_token *token)		\
+{									\
+	unsigned long part_offset = (unsigned long)ptr;			\
+	unsigned long offset = part_offset + off;			\
+	void *p;							\
+	int err;							\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	int size = sizeof(u##bits);					\
+									\
+	if (token && token->kaddr && token->offset <= offset &&		\
+	    token->eb == eb &&						\
+	   (token->offset + PAGE_CACHE_SIZE >= offset + size)) {	\
+		kaddr = token->kaddr;					\
+		p = kaddr + part_offset - token->offset;		\
+		put_unaligned_le##bits(val, p + off);			\
+		return;							\
+	}								\
+	err = map_private_extent_buffer(eb, offset, size,		\
+			&kaddr, &map_start, &map_len);			\
+	if (err) {							\
+		__le##bits val2;					\
+									\
+		val2 = cpu_to_le##bits(val);				\
+		write_extent_buffer(eb, &val2, offset, size);		\
+		return;							\
+	}								\
+	p = kaddr + part_offset - map_start;				\
+	put_unaligned_le##bits(val, p + off);				\
+	if (token) {							\
+		token->kaddr = kaddr;					\
+		token->offset = map_start;				\
+		token->eb = eb;						\
+	}								\
+}
+
+DEFINE_BTRFS_SETGET_BITS(8)
+DEFINE_BTRFS_SETGET_BITS(16)
+DEFINE_BTRFS_SETGET_BITS(32)
+DEFINE_BTRFS_SETGET_BITS(64)
+
+void btrfs_node_key(struct extent_buffer *eb,
+		    struct btrfs_disk_key *disk_key, int nr)
+{
+	unsigned long ptr = btrfs_node_key_ptr_offset(nr);
+	read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+		       struct btrfs_key_ptr, key, disk_key);
+}
diff --git a/kernel/fs/btrfs/super.c b/kernel/fs/btrfs/super.c
new file mode 100644
index 000000000..9e66f5e72
--- /dev/null
+++ b/kernel/fs/btrfs/super.c
@@ -0,0 +1,2220 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/parser.h>
+#include <linux/ctype.h>
+#include <linux/namei.h>
+#include <linux/miscdevice.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/cleancache.h>
+#include <linux/ratelimit.h>
+#include <linux/btrfs.h>
+#include "delayed-inode.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "hash.h"
+#include "props.h"
+#include "xattr.h"
+#include "volumes.h"
+#include "export.h"
+#include "compression.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "free-space-cache.h"
+#include "backref.h"
+#include "tests/btrfs-tests.h"
+
+#include "qgroup.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/btrfs.h>
+
+static const struct super_operations btrfs_super_ops;
+static struct file_system_type btrfs_fs_type;
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+
+static const char *btrfs_decode_error(int errno)
+{
+	char *errstr = "unknown";
+
+	switch (errno) {
+	case -EIO:
+		errstr = "IO failure";
+		break;
+	case -ENOMEM:
+		errstr = "Out of memory";
+		break;
+	case -EROFS:
+		errstr = "Readonly filesystem";
+		break;
+	case -EEXIST:
+		errstr = "Object already exists";
+		break;
+	case -ENOSPC:
+		errstr = "No space left";
+		break;
+	case -ENOENT:
+		errstr = "No such entry";
+		break;
+	}
+
+	return errstr;
+}
+
+static void save_error_info(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * today we only save the error info into ram.  Long term we'll
+	 * also send it down to the disk
+	 */
+	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+}
+
+/* btrfs handle error by forcing the filesystem readonly */
+static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+{
+	struct super_block *sb = fs_info->sb;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		sb->s_flags |= MS_RDONLY;
+		btrfs_info(fs_info, "forced readonly");
+		/*
+		 * Note that a running device replace operation is not
+		 * canceled here although there is no way to update
+		 * the progress. It would add the risk of a deadlock,
+		 * therefore the canceling is ommited. The only penalty
+		 * is that some I/O remains active until the procedure
+		 * completes. The next time when the filesystem is
+		 * mounted writeable again, the device replace
+		 * operation continues.
+		 */
+	}
+}
+
+#ifdef CONFIG_PRINTK
+/*
+ * __btrfs_std_error decodes expected errors from the caller and
+ * invokes the approciate error response.
+ */
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		       unsigned int line, int errno, const char *fmt, ...)
+{
+	struct super_block *sb = fs_info->sb;
+	const char *errstr;
+
+	/*
+	 * Special case: if the error is EROFS, and we're already
+	 * under MS_RDONLY, then it is safe here.
+	 */
+	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+  		return;
+
+	errstr = btrfs_decode_error(errno);
+	if (fmt) {
+		struct va_format vaf;
+		va_list args;
+
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+
+		printk(KERN_CRIT
+			"BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
+			sb->s_id, function, line, errno, errstr, &vaf);
+		va_end(args);
+	} else {
+		printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
+			sb->s_id, function, line, errno, errstr);
+	}
+
+	/* Don't go through full error handling during mount */
+	save_error_info(fs_info);
+	if (sb->s_flags & MS_BORN)
+		btrfs_handle_error(fs_info);
+}
+
+static const char * const logtypes[] = {
+	"emergency",
+	"alert",
+	"critical",
+	"error",
+	"warning",
+	"notice",
+	"info",
+	"debug",
+};
+
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+	struct super_block *sb = fs_info->sb;
+	char lvl[4];
+	struct va_format vaf;
+	va_list args;
+	const char *type = logtypes[4];
+	int kern_level;
+
+	va_start(args, fmt);
+
+	kern_level = printk_get_level(fmt);
+	if (kern_level) {
+		size_t size = printk_skip_level(fmt) - fmt;
+		memcpy(lvl, fmt,  size);
+		lvl[size] = '\0';
+		fmt += size;
+		type = logtypes[kern_level - '0'];
+	} else
+		*lvl = '\0';
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
+
+	va_end(args);
+}
+
+#else
+
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		       unsigned int line, int errno, const char *fmt, ...)
+{
+	struct super_block *sb = fs_info->sb;
+
+	/*
+	 * Special case: if the error is EROFS, and we're already
+	 * under MS_RDONLY, then it is safe here.
+	 */
+	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+		return;
+
+	/* Don't go through full error handling during mount */
+	if (sb->s_flags & MS_BORN) {
+		save_error_info(fs_info);
+		btrfs_handle_error(fs_info);
+	}
+}
+#endif
+
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, const char *function,
+			       unsigned int line, int errno)
+{
+	/*
+	 * Report first abort since mount
+	 */
+	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
+				&root->fs_info->fs_state)) {
+		WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
+				errno);
+	}
+	trans->aborted = errno;
+	/* Nothing used. The other threads that have joined this
+	 * transaction may be able to continue. */
+	if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
+		const char *errstr;
+
+		errstr = btrfs_decode_error(errno);
+		btrfs_warn(root->fs_info,
+		           "%s:%d: Aborting unused transaction(%s).",
+		           function, line, errstr);
+		return;
+	}
+	ACCESS_ONCE(trans->transaction->aborted) = errno;
+	/* Wake up anybody who may be waiting on this transaction */
+	wake_up(&root->fs_info->transaction_wait);
+	wake_up(&root->fs_info->transaction_blocked_wait);
+	__btrfs_std_error(root->fs_info, function, line, errno, NULL);
+}
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller,
+ * issues an alert, and either panics or BUGs, depending on mount options.
+ */
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+		   unsigned int line, int errno, const char *fmt, ...)
+{
+	char *s_id = "<unknown>";
+	const char *errstr;
+	struct va_format vaf = { .fmt = fmt };
+	va_list args;
+
+	if (fs_info)
+		s_id = fs_info->sb->s_id;
+
+	va_start(args, fmt);
+	vaf.va = &args;
+
+	errstr = btrfs_decode_error(errno);
+	if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
+		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+			s_id, function, line, &vaf, errno, errstr);
+
+	btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
+		   function, line, &vaf, errno, errstr);
+	va_end(args);
+	/* Caller calls BUG() */
+}
+
+static void btrfs_put_super(struct super_block *sb)
+{
+	close_ctree(btrfs_sb(sb)->tree_root);
+}
+
+enum {
+	Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
+	Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
+	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
+	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
+	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
+	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
+	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+	Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+	Opt_check_integrity, Opt_check_integrity_including_extent_data,
+	Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
+	Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
+	Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
+	Opt_datasum, Opt_treelog, Opt_noinode_cache,
+	Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_degraded, "degraded"},
+	{Opt_subvol, "subvol=%s"},
+	{Opt_subvolid, "subvolid=%s"},
+	{Opt_device, "device=%s"},
+	{Opt_nodatasum, "nodatasum"},
+	{Opt_datasum, "datasum"},
+	{Opt_nodatacow, "nodatacow"},
+	{Opt_datacow, "datacow"},
+	{Opt_nobarrier, "nobarrier"},
+	{Opt_barrier, "barrier"},
+	{Opt_max_inline, "max_inline=%s"},
+	{Opt_alloc_start, "alloc_start=%s"},
+	{Opt_thread_pool, "thread_pool=%d"},
+	{Opt_compress, "compress"},
+	{Opt_compress_type, "compress=%s"},
+	{Opt_compress_force, "compress-force"},
+	{Opt_compress_force_type, "compress-force=%s"},
+	{Opt_ssd, "ssd"},
+	{Opt_ssd_spread, "ssd_spread"},
+	{Opt_nossd, "nossd"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_notreelog, "notreelog"},
+	{Opt_treelog, "treelog"},
+	{Opt_flushoncommit, "flushoncommit"},
+	{Opt_noflushoncommit, "noflushoncommit"},
+	{Opt_ratio, "metadata_ratio=%d"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
+	{Opt_space_cache, "space_cache"},
+	{Opt_clear_cache, "clear_cache"},
+	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+	{Opt_enospc_debug, "enospc_debug"},
+	{Opt_noenospc_debug, "noenospc_debug"},
+	{Opt_subvolrootid, "subvolrootid=%d"},
+	{Opt_defrag, "autodefrag"},
+	{Opt_nodefrag, "noautodefrag"},
+	{Opt_inode_cache, "inode_cache"},
+	{Opt_noinode_cache, "noinode_cache"},
+	{Opt_no_space_cache, "nospace_cache"},
+	{Opt_recovery, "recovery"},
+	{Opt_skip_balance, "skip_balance"},
+	{Opt_check_integrity, "check_int"},
+	{Opt_check_integrity_including_extent_data, "check_int_data"},
+	{Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
+	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
+	{Opt_fatal_errors, "fatal_errors=%s"},
+	{Opt_commit_interval, "commit=%d"},
+	{Opt_err, NULL},
+};
+
+/*
+ * Regular mount options parser.  Everything that is needed only when
+ * reading in a new superblock is parsed here.
+ * XXX JDM: This needs to be cleaned up for remount.
+ */
+int btrfs_parse_options(struct btrfs_root *root, char *options)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	substring_t args[MAX_OPT_ARGS];
+	char *p, *num, *orig = NULL;
+	u64 cache_gen;
+	int intarg;
+	int ret = 0;
+	char *compress_type;
+	bool compress_force = false;
+
+	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+	if (cache_gen)
+		btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+
+	if (!options)
+		goto out;
+
+	/*
+	 * strsep changes the string, duplicate it because parse_options
+	 * gets called twice
+	 */
+	options = kstrdup(options, GFP_NOFS);
+	if (!options)
+		return -ENOMEM;
+
+	orig = options;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_degraded:
+			btrfs_info(root->fs_info, "allowing degraded mounts");
+			btrfs_set_opt(info->mount_opt, DEGRADED);
+			break;
+		case Opt_subvol:
+		case Opt_subvolid:
+		case Opt_subvolrootid:
+		case Opt_device:
+			/*
+			 * These are parsed by btrfs_parse_early_options
+			 * and can be happily ignored here.
+			 */
+			break;
+		case Opt_nodatasum:
+			btrfs_set_and_info(root, NODATASUM,
+					   "setting nodatasum");
+			break;
+		case Opt_datasum:
+			if (btrfs_test_opt(root, NODATASUM)) {
+				if (btrfs_test_opt(root, NODATACOW))
+					btrfs_info(root->fs_info, "setting datasum, datacow enabled");
+				else
+					btrfs_info(root->fs_info, "setting datasum");
+			}
+			btrfs_clear_opt(info->mount_opt, NODATACOW);
+			btrfs_clear_opt(info->mount_opt, NODATASUM);
+			break;
+		case Opt_nodatacow:
+			if (!btrfs_test_opt(root, NODATACOW)) {
+				if (!btrfs_test_opt(root, COMPRESS) ||
+				    !btrfs_test_opt(root, FORCE_COMPRESS)) {
+					btrfs_info(root->fs_info,
+						   "setting nodatacow, compression disabled");
+				} else {
+					btrfs_info(root->fs_info, "setting nodatacow");
+				}
+			}
+			btrfs_clear_opt(info->mount_opt, COMPRESS);
+			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+			btrfs_set_opt(info->mount_opt, NODATACOW);
+			btrfs_set_opt(info->mount_opt, NODATASUM);
+			break;
+		case Opt_datacow:
+			btrfs_clear_and_info(root, NODATACOW,
+					     "setting datacow");
+			break;
+		case Opt_compress_force:
+		case Opt_compress_force_type:
+			compress_force = true;
+			/* Fallthrough */
+		case Opt_compress:
+		case Opt_compress_type:
+			if (token == Opt_compress ||
+			    token == Opt_compress_force ||
+			    strcmp(args[0].from, "zlib") == 0) {
+				compress_type = "zlib";
+				info->compress_type = BTRFS_COMPRESS_ZLIB;
+				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
+			} else if (strcmp(args[0].from, "lzo") == 0) {
+				compress_type = "lzo";
+				info->compress_type = BTRFS_COMPRESS_LZO;
+				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
+				btrfs_set_fs_incompat(info, COMPRESS_LZO);
+			} else if (strncmp(args[0].from, "no", 2) == 0) {
+				compress_type = "no";
+				btrfs_clear_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+				compress_force = false;
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			if (compress_force) {
+				btrfs_set_and_info(root, FORCE_COMPRESS,
+						   "force %s compression",
+						   compress_type);
+			} else {
+				if (!btrfs_test_opt(root, COMPRESS))
+					btrfs_info(root->fs_info,
+						   "btrfs: use %s compression",
+						   compress_type);
+				/*
+				 * If we remount from compress-force=xxx to
+				 * compress=xxx, we need clear FORCE_COMPRESS
+				 * flag, otherwise, there is no way for users
+				 * to disable forcible compression separately.
+				 */
+				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+			}
+			break;
+		case Opt_ssd:
+			btrfs_set_and_info(root, SSD,
+					   "use ssd allocation scheme");
+			break;
+		case Opt_ssd_spread:
+			btrfs_set_and_info(root, SSD_SPREAD,
+					   "use spread ssd allocation scheme");
+			btrfs_set_opt(info->mount_opt, SSD);
+			break;
+		case Opt_nossd:
+			btrfs_set_and_info(root, NOSSD,
+					     "not using ssd allocation scheme");
+			btrfs_clear_opt(info->mount_opt, SSD);
+			break;
+		case Opt_barrier:
+			btrfs_clear_and_info(root, NOBARRIER,
+					     "turning on barriers");
+			break;
+		case Opt_nobarrier:
+			btrfs_set_and_info(root, NOBARRIER,
+					   "turning off barriers");
+			break;
+		case Opt_thread_pool:
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg > 0) {
+				info->thread_pool_size = intarg;
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+			break;
+		case Opt_max_inline:
+			num = match_strdup(&args[0]);
+			if (num) {
+				info->max_inline = memparse(num, NULL);
+				kfree(num);
+
+				if (info->max_inline) {
+					info->max_inline = min_t(u64,
+						info->max_inline,
+						root->sectorsize);
+				}
+				btrfs_info(root->fs_info, "max_inline at %llu",
+					info->max_inline);
+			} else {
+				ret = -ENOMEM;
+				goto out;
+			}
+			break;
+		case Opt_alloc_start:
+			num = match_strdup(&args[0]);
+			if (num) {
+				mutex_lock(&info->chunk_mutex);
+				info->alloc_start = memparse(num, NULL);
+				mutex_unlock(&info->chunk_mutex);
+				kfree(num);
+				btrfs_info(root->fs_info, "allocations start at %llu",
+					info->alloc_start);
+			} else {
+				ret = -ENOMEM;
+				goto out;
+			}
+			break;
+		case Opt_acl:
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+			root->fs_info->sb->s_flags |= MS_POSIXACL;
+			break;
+#else
+			btrfs_err(root->fs_info,
+				"support for ACL not compiled in!");
+			ret = -EINVAL;
+			goto out;
+#endif
+		case Opt_noacl:
+			root->fs_info->sb->s_flags &= ~MS_POSIXACL;
+			break;
+		case Opt_notreelog:
+			btrfs_set_and_info(root, NOTREELOG,
+					   "disabling tree log");
+			break;
+		case Opt_treelog:
+			btrfs_clear_and_info(root, NOTREELOG,
+					     "enabling tree log");
+			break;
+		case Opt_flushoncommit:
+			btrfs_set_and_info(root, FLUSHONCOMMIT,
+					   "turning on flush-on-commit");
+			break;
+		case Opt_noflushoncommit:
+			btrfs_clear_and_info(root, FLUSHONCOMMIT,
+					     "turning off flush-on-commit");
+			break;
+		case Opt_ratio:
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg >= 0) {
+				info->metadata_ratio = intarg;
+				btrfs_info(root->fs_info, "metadata ratio %d",
+				       info->metadata_ratio);
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+			break;
+		case Opt_discard:
+			btrfs_set_and_info(root, DISCARD,
+					   "turning on discard");
+			break;
+		case Opt_nodiscard:
+			btrfs_clear_and_info(root, DISCARD,
+					     "turning off discard");
+			break;
+		case Opt_space_cache:
+			btrfs_set_and_info(root, SPACE_CACHE,
+					   "enabling disk space caching");
+			break;
+		case Opt_rescan_uuid_tree:
+			btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
+			break;
+		case Opt_no_space_cache:
+			btrfs_clear_and_info(root, SPACE_CACHE,
+					     "disabling disk space caching");
+			break;
+		case Opt_inode_cache:
+			btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
+					   "enabling inode map caching");
+			break;
+		case Opt_noinode_cache:
+			btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
+					     "disabling inode map caching");
+			break;
+		case Opt_clear_cache:
+			btrfs_set_and_info(root, CLEAR_CACHE,
+					   "force clearing of disk cache");
+			break;
+		case Opt_user_subvol_rm_allowed:
+			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+			break;
+		case Opt_enospc_debug:
+			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+			break;
+		case Opt_noenospc_debug:
+			btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+			break;
+		case Opt_defrag:
+			btrfs_set_and_info(root, AUTO_DEFRAG,
+					   "enabling auto defrag");
+			break;
+		case Opt_nodefrag:
+			btrfs_clear_and_info(root, AUTO_DEFRAG,
+					     "disabling auto defrag");
+			break;
+		case Opt_recovery:
+			btrfs_info(root->fs_info, "enabling auto recovery");
+			btrfs_set_opt(info->mount_opt, RECOVERY);
+			break;
+		case Opt_skip_balance:
+			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+			break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+		case Opt_check_integrity_including_extent_data:
+			btrfs_info(root->fs_info,
+				   "enabling check integrity including extent data");
+			btrfs_set_opt(info->mount_opt,
+				      CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+			break;
+		case Opt_check_integrity:
+			btrfs_info(root->fs_info, "enabling check integrity");
+			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+			break;
+		case Opt_check_integrity_print_mask:
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg >= 0) {
+				info->check_integrity_print_mask = intarg;
+				btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x",
+				       info->check_integrity_print_mask);
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+			break;
+#else
+		case Opt_check_integrity_including_extent_data:
+		case Opt_check_integrity:
+		case Opt_check_integrity_print_mask:
+			btrfs_err(root->fs_info,
+				"support for check_integrity* not compiled in!");
+			ret = -EINVAL;
+			goto out;
+#endif
+		case Opt_fatal_errors:
+			if (strcmp(args[0].from, "panic") == 0)
+				btrfs_set_opt(info->mount_opt,
+					      PANIC_ON_FATAL_ERROR);
+			else if (strcmp(args[0].from, "bug") == 0)
+				btrfs_clear_opt(info->mount_opt,
+					      PANIC_ON_FATAL_ERROR);
+			else {
+				ret = -EINVAL;
+				goto out;
+			}
+			break;
+		case Opt_commit_interval:
+			intarg = 0;
+			ret = match_int(&args[0], &intarg);
+			if (ret < 0) {
+				btrfs_err(root->fs_info, "invalid commit interval");
+				ret = -EINVAL;
+				goto out;
+			}
+			if (intarg > 0) {
+				if (intarg > 300) {
+					btrfs_warn(root->fs_info, "excessive commit interval %d",
+							intarg);
+				}
+				info->commit_interval = intarg;
+			} else {
+				btrfs_info(root->fs_info, "using default commit interval %ds",
+				    BTRFS_DEFAULT_COMMIT_INTERVAL);
+				info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+			}
+			break;
+		case Opt_err:
+			btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
+			ret = -EINVAL;
+			goto out;
+		default:
+			break;
+		}
+	}
+out:
+	if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+		btrfs_info(root->fs_info, "disk space caching is enabled");
+	kfree(orig);
+	return ret;
+}
+
+/*
+ * Parse mount options that are required early in the mount process.
+ *
+ * All other options will be parsed on much later in the mount process and
+ * only when we need to allocate a new super block.
+ */
+static int btrfs_parse_early_options(const char *options, fmode_t flags,
+		void *holder, char **subvol_name, u64 *subvol_objectid,
+		struct btrfs_fs_devices **fs_devices)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *device_name, *opts, *orig, *p;
+	char *num = NULL;
+	int error = 0;
+
+	if (!options)
+		return 0;
+
+	/*
+	 * strsep changes the string, duplicate it because parse_options
+	 * gets called twice
+	 */
+	opts = kstrdup(options, GFP_KERNEL);
+	if (!opts)
+		return -ENOMEM;
+	orig = opts;
+
+	while ((p = strsep(&opts, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_subvol:
+			kfree(*subvol_name);
+			*subvol_name = match_strdup(&args[0]);
+			if (!*subvol_name) {
+				error = -ENOMEM;
+				goto out;
+			}
+			break;
+		case Opt_subvolid:
+			num = match_strdup(&args[0]);
+			if (num) {
+				*subvol_objectid = memparse(num, NULL);
+				kfree(num);
+				/* we want the original fs_tree */
+				if (!*subvol_objectid)
+					*subvol_objectid =
+						BTRFS_FS_TREE_OBJECTID;
+			} else {
+				error = -EINVAL;
+				goto out;
+			}
+			break;
+		case Opt_subvolrootid:
+			printk(KERN_WARNING
+				"BTRFS: 'subvolrootid' mount option is deprecated and has "
+				"no effect\n");
+			break;
+		case Opt_device:
+			device_name = match_strdup(&args[0]);
+			if (!device_name) {
+				error = -ENOMEM;
+				goto out;
+			}
+			error = btrfs_scan_one_device(device_name,
+					flags, holder, fs_devices);
+			kfree(device_name);
+			if (error)
+				goto out;
+			break;
+		default:
+			break;
+		}
+	}
+
+out:
+	kfree(orig);
+	return error;
+}
+
+static struct dentry *get_default_root(struct super_block *sb,
+				       u64 subvol_objectid)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_root *new_root;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	struct btrfs_key location;
+	struct inode *inode;
+	u64 dir_id;
+	int new = 0;
+
+	/*
+	 * We have a specific subvol we want to mount, just setup location and
+	 * go look up the root.
+	 */
+	if (subvol_objectid) {
+		location.objectid = subvol_objectid;
+		location.type = BTRFS_ROOT_ITEM_KEY;
+		location.offset = (u64)-1;
+		goto find_root;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+	path->leave_spinning = 1;
+
+	/*
+	 * Find the "default" dir item which points to the root item that we
+	 * will mount by default if we haven't been given a specific subvolume
+	 * to mount.
+	 */
+	dir_id = btrfs_super_root_dir(fs_info->super_copy);
+	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+	if (IS_ERR(di)) {
+		btrfs_free_path(path);
+		return ERR_CAST(di);
+	}
+	if (!di) {
+		/*
+		 * Ok the default dir item isn't there.  This is weird since
+		 * it's always been there, but don't freak out, just try and
+		 * mount to root most subvolume.
+		 */
+		btrfs_free_path(path);
+		dir_id = BTRFS_FIRST_FREE_OBJECTID;
+		new_root = fs_info->fs_root;
+		goto setup_root;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+	btrfs_free_path(path);
+
+find_root:
+	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
+	if (IS_ERR(new_root))
+		return ERR_CAST(new_root);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		int ret;
+		down_read(&fs_info->cleanup_work_sem);
+		ret = btrfs_orphan_cleanup(new_root);
+		up_read(&fs_info->cleanup_work_sem);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	dir_id = btrfs_root_dirid(&new_root->root_item);
+setup_root:
+	location.objectid = dir_id;
+	location.type = BTRFS_INODE_ITEM_KEY;
+	location.offset = 0;
+
+	inode = btrfs_iget(sb, &location, new_root, &new);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	/*
+	 * If we're just mounting the root most subvol put the inode and return
+	 * a reference to the dentry.  We will have already gotten a reference
+	 * to the inode in btrfs_fill_super so we're good to go.
+	 */
+	if (!new && d_inode(sb->s_root) == inode) {
+		iput(inode);
+		return dget(sb->s_root);
+	}
+
+	return d_obtain_root(inode);
+}
+
+static int btrfs_fill_super(struct super_block *sb,
+			    struct btrfs_fs_devices *fs_devices,
+			    void *data, int silent)
+{
+	struct inode *inode;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_key key;
+	int err;
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_magic = BTRFS_SUPER_MAGIC;
+	sb->s_op = &btrfs_super_ops;
+	sb->s_d_op = &btrfs_dentry_operations;
+	sb->s_export_op = &btrfs_export_ops;
+	sb->s_xattr = btrfs_xattr_handlers;
+	sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+	sb->s_flags |= MS_POSIXACL;
+#endif
+	sb->s_flags |= MS_I_VERSION;
+	err = open_ctree(sb, fs_devices, (char *)data);
+	if (err) {
+		printk(KERN_ERR "BTRFS: open_ctree failed\n");
+		return err;
+	}
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto fail_close;
+	}
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto fail_close;
+	}
+
+	save_mount_options(sb, data);
+	cleancache_init_fs(sb);
+	sb->s_flags |= MS_ACTIVE;
+	return 0;
+
+fail_close:
+	close_ctree(fs_info->tree_root);
+	return err;
+}
+
+int btrfs_sync_fs(struct super_block *sb, int wait)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
+
+	trace_btrfs_sync_fs(wait);
+
+	if (!wait) {
+		filemap_flush(fs_info->btree_inode->i_mapping);
+		return 0;
+	}
+
+	btrfs_wait_ordered_roots(fs_info, -1);
+
+	trans = btrfs_attach_transaction_barrier(root);
+	if (IS_ERR(trans)) {
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT) {
+			/*
+			 * Exit unless we have some pending changes
+			 * that need to go through commit
+			 */
+			if (fs_info->pending_changes == 0)
+				return 0;
+			/*
+			 * A non-blocking test if the fs is frozen. We must not
+			 * start a new transaction here otherwise a deadlock
+			 * happens. The pending operations are delayed to the
+			 * next commit after thawing.
+			 */
+			if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
+				__sb_end_write(sb, SB_FREEZE_WRITE);
+			else
+				return 0;
+			trans = btrfs_start_transaction(root, 0);
+		}
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+	}
+	return btrfs_commit_transaction(trans, root);
+}
+
+static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+	struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+	struct btrfs_root *root = info->tree_root;
+	char *compress_type;
+
+	if (btrfs_test_opt(root, DEGRADED))
+		seq_puts(seq, ",degraded");
+	if (btrfs_test_opt(root, NODATASUM))
+		seq_puts(seq, ",nodatasum");
+	if (btrfs_test_opt(root, NODATACOW))
+		seq_puts(seq, ",nodatacow");
+	if (btrfs_test_opt(root, NOBARRIER))
+		seq_puts(seq, ",nobarrier");
+	if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+		seq_printf(seq, ",max_inline=%llu", info->max_inline);
+	if (info->alloc_start != 0)
+		seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
+	if (info->thread_pool_size !=  min_t(unsigned long,
+					     num_online_cpus() + 2, 8))
+		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
+	if (btrfs_test_opt(root, COMPRESS)) {
+		if (info->compress_type == BTRFS_COMPRESS_ZLIB)
+			compress_type = "zlib";
+		else
+			compress_type = "lzo";
+		if (btrfs_test_opt(root, FORCE_COMPRESS))
+			seq_printf(seq, ",compress-force=%s", compress_type);
+		else
+			seq_printf(seq, ",compress=%s", compress_type);
+	}
+	if (btrfs_test_opt(root, NOSSD))
+		seq_puts(seq, ",nossd");
+	if (btrfs_test_opt(root, SSD_SPREAD))
+		seq_puts(seq, ",ssd_spread");
+	else if (btrfs_test_opt(root, SSD))
+		seq_puts(seq, ",ssd");
+	if (btrfs_test_opt(root, NOTREELOG))
+		seq_puts(seq, ",notreelog");
+	if (btrfs_test_opt(root, FLUSHONCOMMIT))
+		seq_puts(seq, ",flushoncommit");
+	if (btrfs_test_opt(root, DISCARD))
+		seq_puts(seq, ",discard");
+	if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
+		seq_puts(seq, ",noacl");
+	if (btrfs_test_opt(root, SPACE_CACHE))
+		seq_puts(seq, ",space_cache");
+	else
+		seq_puts(seq, ",nospace_cache");
+	if (btrfs_test_opt(root, RESCAN_UUID_TREE))
+		seq_puts(seq, ",rescan_uuid_tree");
+	if (btrfs_test_opt(root, CLEAR_CACHE))
+		seq_puts(seq, ",clear_cache");
+	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+		seq_puts(seq, ",user_subvol_rm_allowed");
+	if (btrfs_test_opt(root, ENOSPC_DEBUG))
+		seq_puts(seq, ",enospc_debug");
+	if (btrfs_test_opt(root, AUTO_DEFRAG))
+		seq_puts(seq, ",autodefrag");
+	if (btrfs_test_opt(root, INODE_MAP_CACHE))
+		seq_puts(seq, ",inode_cache");
+	if (btrfs_test_opt(root, SKIP_BALANCE))
+		seq_puts(seq, ",skip_balance");
+	if (btrfs_test_opt(root, RECOVERY))
+		seq_puts(seq, ",recovery");
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+		seq_puts(seq, ",check_int_data");
+	else if (btrfs_test_opt(root, CHECK_INTEGRITY))
+		seq_puts(seq, ",check_int");
+	if (info->check_integrity_print_mask)
+		seq_printf(seq, ",check_int_print_mask=%d",
+				info->check_integrity_print_mask);
+#endif
+	if (info->metadata_ratio)
+		seq_printf(seq, ",metadata_ratio=%d",
+				info->metadata_ratio);
+	if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+		seq_puts(seq, ",fatal_errors=panic");
+	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
+		seq_printf(seq, ",commit=%d", info->commit_interval);
+	return 0;
+}
+
+static int btrfs_test_super(struct super_block *s, void *data)
+{
+	struct btrfs_fs_info *p = data;
+	struct btrfs_fs_info *fs_info = btrfs_sb(s);
+
+	return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_set_super(struct super_block *s, void *data)
+{
+	int err = set_anon_super(s, data);
+	if (!err)
+		s->s_fs_info = data;
+	return err;
+}
+
+/*
+ * subvolumes are identified by ino 256
+ */
+static inline int is_subvolume_inode(struct inode *inode)
+{
+	if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+		return 1;
+	return 0;
+}
+
+/*
+ * This will strip out the subvol=%s argument for an argument string and add
+ * subvolid=0 to make sure we get the actual tree root for path walking to the
+ * subvol we want.
+ */
+static char *setup_root_args(char *args)
+{
+	unsigned len = strlen(args) + 2 + 1;
+	char *src, *dst, *buf;
+
+	/*
+	 * We need the same args as before, but with this substitution:
+	 * s!subvol=[^,]+!subvolid=0!
+	 *
+	 * Since the replacement string is up to 2 bytes longer than the
+	 * original, allocate strlen(args) + 2 + 1 bytes.
+	 */
+
+	src = strstr(args, "subvol=");
+	/* This shouldn't happen, but just in case.. */
+	if (!src)
+		return NULL;
+
+	buf = dst = kmalloc(len, GFP_NOFS);
+	if (!buf)
+		return NULL;
+
+	/*
+	 * If the subvol= arg is not at the start of the string,
+	 * copy whatever precedes it into buf.
+	 */
+	if (src != args) {
+		*src++ = '\0';
+		strcpy(buf, args);
+		dst += strlen(args);
+	}
+
+	strcpy(dst, "subvolid=0");
+	dst += strlen("subvolid=0");
+
+	/*
+	 * If there is a "," after the original subvol=... string,
+	 * copy that suffix into our buffer.  Otherwise, we're done.
+	 */
+	src = strchr(src, ',');
+	if (src)
+		strcpy(dst, src);
+
+	return buf;
+}
+
+static struct dentry *mount_subvol(const char *subvol_name, int flags,
+				   const char *device_name, char *data)
+{
+	struct dentry *root;
+	struct vfsmount *mnt;
+	char *newargs;
+
+	newargs = setup_root_args(data);
+	if (!newargs)
+		return ERR_PTR(-ENOMEM);
+	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
+			     newargs);
+
+	if (PTR_RET(mnt) == -EBUSY) {
+		if (flags & MS_RDONLY) {
+			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
+					     newargs);
+		} else {
+			int r;
+			mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
+					     newargs);
+			if (IS_ERR(mnt)) {
+				kfree(newargs);
+				return ERR_CAST(mnt);
+			}
+
+			r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+			if (r < 0) {
+				/* FIXME: release vfsmount mnt ??*/
+				kfree(newargs);
+				return ERR_PTR(r);
+			}
+		}
+	}
+
+	kfree(newargs);
+
+	if (IS_ERR(mnt))
+		return ERR_CAST(mnt);
+
+	root = mount_subtree(mnt, subvol_name);
+
+	if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
+		struct super_block *s = root->d_sb;
+		dput(root);
+		root = ERR_PTR(-EINVAL);
+		deactivate_locked_super(s);
+		printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
+				subvol_name);
+	}
+
+	return root;
+}
+
+static int parse_security_options(char *orig_opts,
+				  struct security_mnt_opts *sec_opts)
+{
+	char *secdata = NULL;
+	int ret = 0;
+
+	secdata = alloc_secdata();
+	if (!secdata)
+		return -ENOMEM;
+	ret = security_sb_copy_data(orig_opts, secdata);
+	if (ret) {
+		free_secdata(secdata);
+		return ret;
+	}
+	ret = security_sb_parse_opts_str(secdata, sec_opts);
+	free_secdata(secdata);
+	return ret;
+}
+
+static int setup_security_options(struct btrfs_fs_info *fs_info,
+				  struct super_block *sb,
+				  struct security_mnt_opts *sec_opts)
+{
+	int ret = 0;
+
+	/*
+	 * Call security_sb_set_mnt_opts() to check whether new sec_opts
+	 * is valid.
+	 */
+	ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
+	if (ret)
+		return ret;
+
+#ifdef CONFIG_SECURITY
+	if (!fs_info->security_opts.num_mnt_opts) {
+		/* first time security setup, copy sec_opts to fs_info */
+		memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
+	} else {
+		/*
+		 * Since SELinux(the only one supports security_mnt_opts) does
+		 * NOT support changing context during remount/mount same sb,
+		 * This must be the same or part of the same security options,
+		 * just free it.
+		 */
+		security_free_mnt_opts(sec_opts);
+	}
+#endif
+	return ret;
+}
+
+/*
+ * Find a superblock for the given device / mount point.
+ *
+ * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
+ *	  for multiple device setup.  Make sure to keep it in sync.
+ */
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+		const char *device_name, void *data)
+{
+	struct block_device *bdev = NULL;
+	struct super_block *s;
+	struct dentry *root;
+	struct btrfs_fs_devices *fs_devices = NULL;
+	struct btrfs_fs_info *fs_info = NULL;
+	struct security_mnt_opts new_sec_opts;
+	fmode_t mode = FMODE_READ;
+	char *subvol_name = NULL;
+	u64 subvol_objectid = 0;
+	int error = 0;
+
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	error = btrfs_parse_early_options(data, mode, fs_type,
+					  &subvol_name, &subvol_objectid,
+					  &fs_devices);
+	if (error) {
+		kfree(subvol_name);
+		return ERR_PTR(error);
+	}
+
+	if (subvol_name) {
+		root = mount_subvol(subvol_name, flags, device_name, data);
+		kfree(subvol_name);
+		return root;
+	}
+
+	security_init_mnt_opts(&new_sec_opts);
+	if (data) {
+		error = parse_security_options(data, &new_sec_opts);
+		if (error)
+			return ERR_PTR(error);
+	}
+
+	error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
+	if (error)
+		goto error_sec_opts;
+
+	/*
+	 * Setup a dummy root and fs_info for test/set super.  This is because
+	 * we don't actually fill this stuff out until open_ctree, but we need
+	 * it for searching for existing supers, so this lets us do that and
+	 * then open_ctree will properly initialize everything later.
+	 */
+	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
+	if (!fs_info) {
+		error = -ENOMEM;
+		goto error_sec_opts;
+	}
+
+	fs_info->fs_devices = fs_devices;
+
+	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+	security_init_mnt_opts(&fs_info->security_opts);
+	if (!fs_info->super_copy || !fs_info->super_for_commit) {
+		error = -ENOMEM;
+		goto error_fs_info;
+	}
+
+	error = btrfs_open_devices(fs_devices, mode, fs_type);
+	if (error)
+		goto error_fs_info;
+
+	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+		error = -EACCES;
+		goto error_close_devices;
+	}
+
+	bdev = fs_devices->latest_bdev;
+	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC,
+		 fs_info);
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto error_close_devices;
+	}
+
+	if (s->s_root) {
+		btrfs_close_devices(fs_devices);
+		free_fs_info(fs_info);
+		if ((flags ^ s->s_flags) & MS_RDONLY)
+			error = -EBUSY;
+	} else {
+		char b[BDEVNAME_SIZE];
+
+		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		btrfs_sb(s)->bdev_holder = fs_type;
+		error = btrfs_fill_super(s, fs_devices, data,
+					 flags & MS_SILENT ? 1 : 0);
+	}
+
+	root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+	if (IS_ERR(root)) {
+		deactivate_locked_super(s);
+		error = PTR_ERR(root);
+		goto error_sec_opts;
+	}
+
+	fs_info = btrfs_sb(s);
+	error = setup_security_options(fs_info, s, &new_sec_opts);
+	if (error) {
+		dput(root);
+		deactivate_locked_super(s);
+		goto error_sec_opts;
+	}
+
+	return root;
+
+error_close_devices:
+	btrfs_close_devices(fs_devices);
+error_fs_info:
+	free_fs_info(fs_info);
+error_sec_opts:
+	security_free_mnt_opts(&new_sec_opts);
+	return ERR_PTR(error);
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+				     int new_pool_size, int old_pool_size)
+{
+	if (new_pool_size == old_pool_size)
+		return;
+
+	fs_info->thread_pool_size = new_pool_size;
+
+	btrfs_info(fs_info, "resize thread pool %d -> %d",
+	       old_pool_size, new_pool_size);
+
+	btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
+				new_pool_size);
+	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
+	btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
+				new_pool_size);
+}
+
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
+{
+	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
+static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
+				       unsigned long old_opts, int flags)
+{
+	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+	     (flags & MS_RDONLY))) {
+		/* wait for any defraggers to finish */
+		wait_event(fs_info->transaction_wait,
+			   (atomic_read(&fs_info->defrag_running) == 0));
+		if (flags & MS_RDONLY)
+			sync_filesystem(fs_info->sb);
+	}
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+					 unsigned long old_opts)
+{
+	/*
+	 * We need cleanup all defragable inodes if the autodefragment is
+	 * close or the fs is R/O.
+	 */
+	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+	     (fs_info->sb->s_flags & MS_RDONLY))) {
+		btrfs_cleanup_defrag_inodes(fs_info);
+	}
+
+	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
+	unsigned old_flags = sb->s_flags;
+	unsigned long old_opts = fs_info->mount_opt;
+	unsigned long old_compress_type = fs_info->compress_type;
+	u64 old_max_inline = fs_info->max_inline;
+	u64 old_alloc_start = fs_info->alloc_start;
+	int old_thread_pool_size = fs_info->thread_pool_size;
+	unsigned int old_metadata_ratio = fs_info->metadata_ratio;
+	int ret;
+
+	sync_filesystem(sb);
+	btrfs_remount_prepare(fs_info);
+
+	if (data) {
+		struct security_mnt_opts new_sec_opts;
+
+		security_init_mnt_opts(&new_sec_opts);
+		ret = parse_security_options(data, &new_sec_opts);
+		if (ret)
+			goto restore;
+		ret = setup_security_options(fs_info, sb,
+					     &new_sec_opts);
+		if (ret) {
+			security_free_mnt_opts(&new_sec_opts);
+			goto restore;
+		}
+	}
+
+	ret = btrfs_parse_options(root, data);
+	if (ret) {
+		ret = -EINVAL;
+		goto restore;
+	}
+
+	btrfs_remount_begin(fs_info, old_opts, *flags);
+	btrfs_resize_thread_pool(fs_info,
+		fs_info->thread_pool_size, old_thread_pool_size);
+
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out;
+
+	if (*flags & MS_RDONLY) {
+		/*
+		 * this also happens on 'umount -rf' or on shutdown, when
+		 * the filesystem is busy.
+		 */
+		cancel_work_sync(&fs_info->async_reclaim_work);
+
+		/* wait for the uuid_scan task to finish */
+		down(&fs_info->uuid_tree_rescan_sem);
+		/* avoid complains from lockdep et al. */
+		up(&fs_info->uuid_tree_rescan_sem);
+
+		sb->s_flags |= MS_RDONLY;
+
+		btrfs_dev_replace_suspend_for_unmount(fs_info);
+		btrfs_scrub_cancel(fs_info);
+		btrfs_pause_balance(fs_info);
+
+		ret = btrfs_commit_super(root);
+		if (ret)
+			goto restore;
+	} else {
+		if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+			btrfs_err(fs_info,
+				"Remounting read-write after error is not allowed");
+			ret = -EINVAL;
+			goto restore;
+		}
+		if (fs_info->fs_devices->rw_devices == 0) {
+			ret = -EACCES;
+			goto restore;
+		}
+
+		if (fs_info->fs_devices->missing_devices >
+		     fs_info->num_tolerated_disk_barrier_failures &&
+		    !(*flags & MS_RDONLY)) {
+			btrfs_warn(fs_info,
+				"too many missing devices, writeable remount is not allowed");
+			ret = -EACCES;
+			goto restore;
+		}
+
+		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+			ret = -EINVAL;
+			goto restore;
+		}
+
+		ret = btrfs_cleanup_fs_roots(fs_info);
+		if (ret)
+			goto restore;
+
+		/* recover relocation */
+		mutex_lock(&fs_info->cleaner_mutex);
+		ret = btrfs_recover_relocation(root);
+		mutex_unlock(&fs_info->cleaner_mutex);
+		if (ret)
+			goto restore;
+
+		ret = btrfs_resume_balance_async(fs_info);
+		if (ret)
+			goto restore;
+
+		ret = btrfs_resume_dev_replace_async(fs_info);
+		if (ret) {
+			btrfs_warn(fs_info, "failed to resume dev_replace");
+			goto restore;
+		}
+
+		if (!fs_info->uuid_root) {
+			btrfs_info(fs_info, "creating UUID tree");
+			ret = btrfs_create_uuid_tree(fs_info);
+			if (ret) {
+				btrfs_warn(fs_info, "failed to create the UUID tree %d", ret);
+				goto restore;
+			}
+		}
+		sb->s_flags &= ~MS_RDONLY;
+	}
+out:
+	wake_up_process(fs_info->transaction_kthread);
+	btrfs_remount_cleanup(fs_info, old_opts);
+	return 0;
+
+restore:
+	/* We've hit an error - don't reset MS_RDONLY */
+	if (sb->s_flags & MS_RDONLY)
+		old_flags |= MS_RDONLY;
+	sb->s_flags = old_flags;
+	fs_info->mount_opt = old_opts;
+	fs_info->compress_type = old_compress_type;
+	fs_info->max_inline = old_max_inline;
+	mutex_lock(&fs_info->chunk_mutex);
+	fs_info->alloc_start = old_alloc_start;
+	mutex_unlock(&fs_info->chunk_mutex);
+	btrfs_resize_thread_pool(fs_info,
+		old_thread_pool_size, fs_info->thread_pool_size);
+	fs_info->metadata_ratio = old_metadata_ratio;
+	btrfs_remount_cleanup(fs_info, old_opts);
+	return ret;
+}
+
+/* Used to sort the devices by max_avail(descending sort) */
+static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+				       const void *dev_info2)
+{
+	if (((struct btrfs_device_info *)dev_info1)->max_avail >
+	    ((struct btrfs_device_info *)dev_info2)->max_avail)
+		return -1;
+	else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+		 ((struct btrfs_device_info *)dev_info2)->max_avail)
+		return 1;
+	else
+	return 0;
+}
+
+/*
+ * sort the devices by max_avail, in which max free extent size of each device
+ * is stored.(Descending Sort)
+ */
+static inline void btrfs_descending_sort_devices(
+					struct btrfs_device_info *devices,
+					size_t nr_devices)
+{
+	sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+	     btrfs_cmp_device_free_bytes, NULL);
+}
+
+/*
+ * The helper to calc the free space on the devices that can be used to store
+ * file data.
+ */
+static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_device_info *devices_info;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	u64 skip_space;
+	u64 type;
+	u64 avail_space;
+	u64 used_space;
+	u64 min_stripe_size;
+	int min_stripes = 1, num_stripes = 1;
+	int i = 0, nr_devices;
+	int ret;
+
+	/*
+	 * We aren't under the device list lock, so this is racey-ish, but good
+	 * enough for our purposes.
+	 */
+	nr_devices = fs_info->fs_devices->open_devices;
+	if (!nr_devices) {
+		smp_mb();
+		nr_devices = fs_info->fs_devices->open_devices;
+		ASSERT(nr_devices);
+		if (!nr_devices) {
+			*free_bytes = 0;
+			return 0;
+		}
+	}
+
+	devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
+			       GFP_NOFS);
+	if (!devices_info)
+		return -ENOMEM;
+
+	/* calc min stripe number for data space alloction */
+	type = btrfs_get_alloc_profile(root, 1);
+	if (type & BTRFS_BLOCK_GROUP_RAID0) {
+		min_stripes = 2;
+		num_stripes = nr_devices;
+	} else if (type & BTRFS_BLOCK_GROUP_RAID1) {
+		min_stripes = 2;
+		num_stripes = 2;
+	} else if (type & BTRFS_BLOCK_GROUP_RAID10) {
+		min_stripes = 4;
+		num_stripes = 4;
+	}
+
+	if (type & BTRFS_BLOCK_GROUP_DUP)
+		min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+	else
+		min_stripe_size = BTRFS_STRIPE_LEN;
+
+	if (fs_info->alloc_start)
+		mutex_lock(&fs_devices->device_list_mutex);
+	rcu_read_lock();
+	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
+		if (!device->in_fs_metadata || !device->bdev ||
+		    device->is_tgtdev_for_dev_replace)
+			continue;
+
+		if (i >= nr_devices)
+			break;
+
+		avail_space = device->total_bytes - device->bytes_used;
+
+		/* align with stripe_len */
+		avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
+		avail_space *= BTRFS_STRIPE_LEN;
+
+		/*
+		 * In order to avoid overwritting the superblock on the drive,
+		 * btrfs starts at an offset of at least 1MB when doing chunk
+		 * allocation.
+		 */
+		skip_space = 1024 * 1024;
+
+		/* user can set the offset in fs_info->alloc_start. */
+		if (fs_info->alloc_start &&
+		    fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+		    device->total_bytes) {
+			rcu_read_unlock();
+			skip_space = max(fs_info->alloc_start, skip_space);
+
+			/*
+			 * btrfs can not use the free space in
+			 * [0, skip_space - 1], we must subtract it from the
+			 * total. In order to implement it, we account the used
+			 * space in this range first.
+			 */
+			ret = btrfs_account_dev_extents_size(device, 0,
+							     skip_space - 1,
+							     &used_space);
+			if (ret) {
+				kfree(devices_info);
+				mutex_unlock(&fs_devices->device_list_mutex);
+				return ret;
+			}
+
+			rcu_read_lock();
+
+			/* calc the free space in [0, skip_space - 1] */
+			skip_space -= used_space;
+		}
+
+		/*
+		 * we can use the free space in [0, skip_space - 1], subtract
+		 * it from the total.
+		 */
+		if (avail_space && avail_space >= skip_space)
+			avail_space -= skip_space;
+		else
+			avail_space = 0;
+
+		if (avail_space < min_stripe_size)
+			continue;
+
+		devices_info[i].dev = device;
+		devices_info[i].max_avail = avail_space;
+
+		i++;
+	}
+	rcu_read_unlock();
+	if (fs_info->alloc_start)
+		mutex_unlock(&fs_devices->device_list_mutex);
+
+	nr_devices = i;
+
+	btrfs_descending_sort_devices(devices_info, nr_devices);
+
+	i = nr_devices - 1;
+	avail_space = 0;
+	while (nr_devices >= min_stripes) {
+		if (num_stripes > nr_devices)
+			num_stripes = nr_devices;
+
+		if (devices_info[i].max_avail >= min_stripe_size) {
+			int j;
+			u64 alloc_size;
+
+			avail_space += devices_info[i].max_avail * num_stripes;
+			alloc_size = devices_info[i].max_avail;
+			for (j = i + 1 - num_stripes; j <= i; j++)
+				devices_info[j].max_avail -= alloc_size;
+		}
+		i--;
+		nr_devices--;
+	}
+
+	kfree(devices_info);
+	*free_bytes = avail_space;
+	return 0;
+}
+
+/*
+ * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
+ *
+ * If there's a redundant raid level at DATA block groups, use the respective
+ * multiplier to scale the sizes.
+ *
+ * Unused device space usage is based on simulating the chunk allocator
+ * algorithm that respects the device sizes, order of allocations and the
+ * 'alloc_start' value, this is a close approximation of the actual use but
+ * there are other factors that may change the result (like a new metadata
+ * chunk).
+ *
+ * FIXME: not accurate for mixed block groups, total and free/used are ok,
+ * available appears slightly larger.
+ */
+static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	struct list_head *head = &fs_info->space_info;
+	struct btrfs_space_info *found;
+	u64 total_used = 0;
+	u64 total_free_data = 0;
+	int bits = dentry->d_sb->s_blocksize_bits;
+	__be32 *fsid = (__be32 *)fs_info->fsid;
+	unsigned factor = 1;
+	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+	int ret;
+
+	/*
+	 * holding chunk_muext to avoid allocating new chunks, holding
+	 * device_list_mutex to avoid the device being removed
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+			int i;
+
+			total_free_data += found->disk_total - found->disk_used;
+			total_free_data -=
+				btrfs_account_ro_block_groups_free_space(found);
+
+			for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+				if (!list_empty(&found->block_groups[i])) {
+					switch (i) {
+					case BTRFS_RAID_DUP:
+					case BTRFS_RAID_RAID1:
+					case BTRFS_RAID_RAID10:
+						factor = 2;
+					}
+				}
+			}
+		}
+
+		total_used += found->disk_used;
+	}
+
+	rcu_read_unlock();
+
+	buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
+	buf->f_blocks >>= bits;
+	buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
+
+	/* Account global block reserve as used, it's in logical size already */
+	spin_lock(&block_rsv->lock);
+	buf->f_bfree -= block_rsv->size >> bits;
+	spin_unlock(&block_rsv->lock);
+
+	buf->f_bavail = div_u64(total_free_data, factor);
+	ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
+	if (ret)
+		return ret;
+	buf->f_bavail += div_u64(total_free_data, factor);
+	buf->f_bavail = buf->f_bavail >> bits;
+
+	buf->f_type = BTRFS_SUPER_MAGIC;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
+	buf->f_namelen = BTRFS_NAME_LEN;
+
+	/* We treat it as constant endianness (it doesn't matter _which_)
+	   because we want the fsid to come out the same whether mounted
+	   on a big-endian or little-endian host */
+	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+	/* Mask in the root object ID too, to disambiguate subvols */
+	buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
+	buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
+
+	return 0;
+}
+
+static void btrfs_kill_super(struct super_block *sb)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	kill_anon_super(sb);
+	free_fs_info(fs_info);
+}
+
+static struct file_system_type btrfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "btrfs",
+	.mount		= btrfs_mount,
+	.kill_sb	= btrfs_kill_super,
+	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("btrfs");
+
+static int btrfs_control_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * The control file's private_data is used to hold the
+	 * transaction when it is started and is used to keep
+	 * track of whether a transaction is already in progress.
+	 */
+	file->private_data = NULL;
+	return 0;
+}
+
+/*
+ * used by btrfsctl to scan devices when no FS is mounted
+ */
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	struct btrfs_ioctl_vol_args *vol;
+	struct btrfs_fs_devices *fs_devices;
+	int ret = -ENOTTY;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	vol = memdup_user((void __user *)arg, sizeof(*vol));
+	if (IS_ERR(vol))
+		return PTR_ERR(vol);
+
+	switch (cmd) {
+	case BTRFS_IOC_SCAN_DEV:
+		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+					    &btrfs_fs_type, &fs_devices);
+		break;
+	case BTRFS_IOC_DEVICES_READY:
+		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+					    &btrfs_fs_type, &fs_devices);
+		if (ret)
+			break;
+		ret = !(fs_devices->num_devices == fs_devices->total_devices);
+		break;
+	}
+
+	kfree(vol);
+	return ret;
+}
+
+static int btrfs_freeze(struct super_block *sb)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+	trans = btrfs_attach_transaction_barrier(root);
+	if (IS_ERR(trans)) {
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
+			return 0;
+		return PTR_ERR(trans);
+	}
+	return btrfs_commit_transaction(trans, root);
+}
+
+static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+	struct btrfs_fs_devices *cur_devices;
+	struct btrfs_device *dev, *first_dev = NULL;
+	struct list_head *head;
+	struct rcu_string *name;
+
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	cur_devices = fs_info->fs_devices;
+	while (cur_devices) {
+		head = &cur_devices->devices;
+		list_for_each_entry(dev, head, dev_list) {
+			if (dev->missing)
+				continue;
+			if (!dev->name)
+				continue;
+			if (!first_dev || dev->devid < first_dev->devid)
+				first_dev = dev;
+		}
+		cur_devices = cur_devices->seed;
+	}
+
+	if (first_dev) {
+		rcu_read_lock();
+		name = rcu_dereference(first_dev->name);
+		seq_escape(m, name->str, " \t\n\\");
+		rcu_read_unlock();
+	} else {
+		WARN_ON(1);
+	}
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	return 0;
+}
+
+static const struct super_operations btrfs_super_ops = {
+	.drop_inode	= btrfs_drop_inode,
+	.evict_inode	= btrfs_evict_inode,
+	.put_super	= btrfs_put_super,
+	.sync_fs	= btrfs_sync_fs,
+	.show_options	= btrfs_show_options,
+	.show_devname	= btrfs_show_devname,
+	.write_inode	= btrfs_write_inode,
+	.alloc_inode	= btrfs_alloc_inode,
+	.destroy_inode	= btrfs_destroy_inode,
+	.statfs		= btrfs_statfs,
+	.remount_fs	= btrfs_remount,
+	.freeze_fs	= btrfs_freeze,
+};
+
+static const struct file_operations btrfs_ctl_fops = {
+	.open = btrfs_control_open,
+	.unlocked_ioctl	 = btrfs_control_ioctl,
+	.compat_ioctl = btrfs_control_ioctl,
+	.owner	 = THIS_MODULE,
+	.llseek = noop_llseek,
+};
+
+static struct miscdevice btrfs_misc = {
+	.minor		= BTRFS_MINOR,
+	.name		= "btrfs-control",
+	.fops		= &btrfs_ctl_fops
+};
+
+MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
+MODULE_ALIAS("devname:btrfs-control");
+
+static int btrfs_interface_init(void)
+{
+	return misc_register(&btrfs_misc);
+}
+
+static void btrfs_interface_exit(void)
+{
+	if (misc_deregister(&btrfs_misc) < 0)
+		printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
+}
+
+static void btrfs_print_info(void)
+{
+	printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+			", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_ASSERT
+			", assert=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+			", integrity-checker=on"
+#endif
+			"\n");
+}
+
+static int btrfs_run_sanity_tests(void)
+{
+	int ret;
+
+	ret = btrfs_init_test_fs();
+	if (ret)
+		return ret;
+
+	ret = btrfs_test_free_space_cache();
+	if (ret)
+		goto out;
+	ret = btrfs_test_extent_buffer_operations();
+	if (ret)
+		goto out;
+	ret = btrfs_test_extent_io();
+	if (ret)
+		goto out;
+	ret = btrfs_test_inodes();
+	if (ret)
+		goto out;
+	ret = btrfs_test_qgroups();
+out:
+	btrfs_destroy_test_fs();
+	return ret;
+}
+
+static int __init init_btrfs_fs(void)
+{
+	int err;
+
+	err = btrfs_hash_init();
+	if (err)
+		return err;
+
+	btrfs_props_init();
+
+	err = btrfs_init_sysfs();
+	if (err)
+		goto free_hash;
+
+	btrfs_init_compress();
+
+	err = btrfs_init_cachep();
+	if (err)
+		goto free_compress;
+
+	err = extent_io_init();
+	if (err)
+		goto free_cachep;
+
+	err = extent_map_init();
+	if (err)
+		goto free_extent_io;
+
+	err = ordered_data_init();
+	if (err)
+		goto free_extent_map;
+
+	err = btrfs_delayed_inode_init();
+	if (err)
+		goto free_ordered_data;
+
+	err = btrfs_auto_defrag_init();
+	if (err)
+		goto free_delayed_inode;
+
+	err = btrfs_delayed_ref_init();
+	if (err)
+		goto free_auto_defrag;
+
+	err = btrfs_prelim_ref_init();
+	if (err)
+		goto free_delayed_ref;
+
+	err = btrfs_end_io_wq_init();
+	if (err)
+		goto free_prelim_ref;
+
+	err = btrfs_interface_init();
+	if (err)
+		goto free_end_io_wq;
+
+	btrfs_init_lockdep();
+
+	btrfs_print_info();
+
+	err = btrfs_run_sanity_tests();
+	if (err)
+		goto unregister_ioctl;
+
+	err = register_filesystem(&btrfs_fs_type);
+	if (err)
+		goto unregister_ioctl;
+
+	return 0;
+
+unregister_ioctl:
+	btrfs_interface_exit();
+free_end_io_wq:
+	btrfs_end_io_wq_exit();
+free_prelim_ref:
+	btrfs_prelim_ref_exit();
+free_delayed_ref:
+	btrfs_delayed_ref_exit();
+free_auto_defrag:
+	btrfs_auto_defrag_exit();
+free_delayed_inode:
+	btrfs_delayed_inode_exit();
+free_ordered_data:
+	ordered_data_exit();
+free_extent_map:
+	extent_map_exit();
+free_extent_io:
+	extent_io_exit();
+free_cachep:
+	btrfs_destroy_cachep();
+free_compress:
+	btrfs_exit_compress();
+	btrfs_exit_sysfs();
+free_hash:
+	btrfs_hash_exit();
+	return err;
+}
+
+static void __exit exit_btrfs_fs(void)
+{
+	btrfs_destroy_cachep();
+	btrfs_delayed_ref_exit();
+	btrfs_auto_defrag_exit();
+	btrfs_delayed_inode_exit();
+	btrfs_prelim_ref_exit();
+	ordered_data_exit();
+	extent_map_exit();
+	extent_io_exit();
+	btrfs_interface_exit();
+	btrfs_end_io_wq_exit();
+	unregister_filesystem(&btrfs_fs_type);
+	btrfs_exit_sysfs();
+	btrfs_cleanup_fs_uuids();
+	btrfs_exit_compress();
+	btrfs_hash_exit();
+}
+
+late_initcall(init_btrfs_fs);
+module_exit(exit_btrfs_fs)
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/btrfs/sysfs.c b/kernel/fs/btrfs/sysfs.c
new file mode 100644
index 000000000..e8a4c86d2
--- /dev/null
+++ b/kernel/fs/btrfs/sysfs.c
@@ -0,0 +1,758 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/kobject.h>
+#include <linux/bug.h>
+#include <linux/genhd.h>
+#include <linux/debugfs.h>
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "sysfs.h"
+#include "volumes.h"
+
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+
+static u64 get_features(struct btrfs_fs_info *fs_info,
+			enum btrfs_feature_set set)
+{
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	if (set == FEAT_COMPAT)
+		return btrfs_super_compat_flags(disk_super);
+	else if (set == FEAT_COMPAT_RO)
+		return btrfs_super_compat_ro_flags(disk_super);
+	else
+		return btrfs_super_incompat_flags(disk_super);
+}
+
+static void set_features(struct btrfs_fs_info *fs_info,
+			 enum btrfs_feature_set set, u64 features)
+{
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	if (set == FEAT_COMPAT)
+		btrfs_set_super_compat_flags(disk_super, features);
+	else if (set == FEAT_COMPAT_RO)
+		btrfs_set_super_compat_ro_flags(disk_super, features);
+	else
+		btrfs_set_super_incompat_flags(disk_super, features);
+}
+
+static int can_modify_feature(struct btrfs_feature_attr *fa)
+{
+	int val = 0;
+	u64 set, clear;
+	switch (fa->feature_set) {
+	case FEAT_COMPAT:
+		set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+		clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+		break;
+	case FEAT_COMPAT_RO:
+		set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+		clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+		break;
+	case FEAT_INCOMPAT:
+		set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+		clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+		break;
+	default:
+		printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n",
+				fa->feature_set);
+		return 0;
+	}
+
+	if (set & fa->feature_bit)
+		val |= 1;
+	if (clear & fa->feature_bit)
+		val |= 2;
+
+	return val;
+}
+
+static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
+				       struct kobj_attribute *a, char *buf)
+{
+	int val = 0;
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+	if (fs_info) {
+		u64 features = get_features(fs_info, fa->feature_set);
+		if (features & fa->feature_bit)
+			val = 1;
+	} else
+		val = can_modify_feature(fa);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
+					struct kobj_attribute *a,
+					const char *buf, size_t count)
+{
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+	u64 features, set, clear;
+	unsigned long val;
+	int ret;
+
+	fs_info = to_fs_info(kobj);
+	if (!fs_info)
+		return -EPERM;
+
+	ret = kstrtoul(skip_spaces(buf), 0, &val);
+	if (ret)
+		return ret;
+
+	if (fa->feature_set == FEAT_COMPAT) {
+		set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+		clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+	} else if (fa->feature_set == FEAT_COMPAT_RO) {
+		set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+		clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+	} else {
+		set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+		clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+	}
+
+	features = get_features(fs_info, fa->feature_set);
+
+	/* Nothing to do */
+	if ((val && (features & fa->feature_bit)) ||
+	    (!val && !(features & fa->feature_bit)))
+		return count;
+
+	if ((val && !(set & fa->feature_bit)) ||
+	    (!val && !(clear & fa->feature_bit))) {
+		btrfs_info(fs_info,
+			"%sabling feature %s on mounted fs is not supported.",
+			val ? "En" : "Dis", fa->kobj_attr.attr.name);
+		return -EPERM;
+	}
+
+	btrfs_info(fs_info, "%s %s feature flag",
+		   val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
+
+	spin_lock(&fs_info->super_lock);
+	features = get_features(fs_info, fa->feature_set);
+	if (val)
+		features |= fa->feature_bit;
+	else
+		features &= ~fa->feature_bit;
+	set_features(fs_info, fa->feature_set, features);
+	spin_unlock(&fs_info->super_lock);
+
+	/*
+	 * We don't want to do full transaction commit from inside sysfs
+	 */
+	btrfs_set_pending(fs_info, COMMIT);
+	wake_up_process(fs_info->transaction_kthread);
+
+	return count;
+}
+
+static umode_t btrfs_feature_visible(struct kobject *kobj,
+				     struct attribute *attr, int unused)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	umode_t mode = attr->mode;
+
+	if (fs_info) {
+		struct btrfs_feature_attr *fa;
+		u64 features;
+
+		fa = attr_to_btrfs_feature_attr(attr);
+		features = get_features(fs_info, fa->feature_set);
+
+		if (can_modify_feature(fa))
+			mode |= S_IWUSR;
+		else if (!(features & fa->feature_bit))
+			mode = 0;
+	}
+
+	return mode;
+}
+
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
+BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
+BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
+BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
+BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
+BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+
+static struct attribute *btrfs_supported_feature_attrs[] = {
+	BTRFS_FEAT_ATTR_PTR(mixed_backref),
+	BTRFS_FEAT_ATTR_PTR(default_subvol),
+	BTRFS_FEAT_ATTR_PTR(mixed_groups),
+	BTRFS_FEAT_ATTR_PTR(compress_lzo),
+	BTRFS_FEAT_ATTR_PTR(big_metadata),
+	BTRFS_FEAT_ATTR_PTR(extended_iref),
+	BTRFS_FEAT_ATTR_PTR(raid56),
+	BTRFS_FEAT_ATTR_PTR(skinny_metadata),
+	BTRFS_FEAT_ATTR_PTR(no_holes),
+	NULL
+};
+
+static const struct attribute_group btrfs_feature_attr_group = {
+	.name = "features",
+	.is_visible = btrfs_feature_visible,
+	.attrs = btrfs_supported_feature_attrs,
+};
+
+static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
+{
+	u64 val;
+	if (lock)
+		spin_lock(lock);
+	val = *value_ptr;
+	if (lock)
+		spin_unlock(lock);
+	return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static ssize_t global_rsv_size_show(struct kobject *kobj,
+				    struct kobj_attribute *ka, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+	return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
+
+static ssize_t global_rsv_reserved_show(struct kobject *kobj,
+					struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+	return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
+
+#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
+#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf);
+BTRFS_RAID_ATTR(total_bytes, raid_bytes_show);
+BTRFS_RAID_ATTR(used_bytes, raid_bytes_show);
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+
+{
+	struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
+	struct btrfs_block_group_cache *block_group;
+	int index = to_raid_kobj(kobj)->raid_type;
+	u64 val = 0;
+
+	down_read(&sinfo->groups_sem);
+	list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
+		if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes))
+			val += block_group->key.offset;
+		else
+			val += btrfs_block_group_used(&block_group->item);
+	}
+	up_read(&sinfo->groups_sem);
+	return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static struct attribute *raid_attributes[] = {
+	BTRFS_RAID_ATTR_PTR(total_bytes),
+	BTRFS_RAID_ATTR_PTR(used_bytes),
+	NULL
+};
+
+static void release_raid_kobj(struct kobject *kobj)
+{
+	kfree(to_raid_kobj(kobj));
+}
+
+struct kobj_type btrfs_raid_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = release_raid_kobj,
+	.default_attrs = raid_attributes,
+};
+
+#define SPACE_INFO_ATTR(field)						\
+static ssize_t btrfs_space_info_show_##field(struct kobject *kobj,	\
+					     struct kobj_attribute *a,	\
+					     char *buf)			\
+{									\
+	struct btrfs_space_info *sinfo = to_space_info(kobj);		\
+	return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf);	\
+}									\
+BTRFS_ATTR(field, btrfs_space_info_show_##field)
+
+static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
+						       struct kobj_attribute *a,
+						       char *buf)
+{
+	struct btrfs_space_info *sinfo = to_space_info(kobj);
+	s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
+	return snprintf(buf, PAGE_SIZE, "%lld\n", val);
+}
+
+SPACE_INFO_ATTR(flags);
+SPACE_INFO_ATTR(total_bytes);
+SPACE_INFO_ATTR(bytes_used);
+SPACE_INFO_ATTR(bytes_pinned);
+SPACE_INFO_ATTR(bytes_reserved);
+SPACE_INFO_ATTR(bytes_may_use);
+SPACE_INFO_ATTR(disk_used);
+SPACE_INFO_ATTR(disk_total);
+BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
+
+static struct attribute *space_info_attrs[] = {
+	BTRFS_ATTR_PTR(flags),
+	BTRFS_ATTR_PTR(total_bytes),
+	BTRFS_ATTR_PTR(bytes_used),
+	BTRFS_ATTR_PTR(bytes_pinned),
+	BTRFS_ATTR_PTR(bytes_reserved),
+	BTRFS_ATTR_PTR(bytes_may_use),
+	BTRFS_ATTR_PTR(disk_used),
+	BTRFS_ATTR_PTR(disk_total),
+	BTRFS_ATTR_PTR(total_bytes_pinned),
+	NULL,
+};
+
+static void space_info_release(struct kobject *kobj)
+{
+	struct btrfs_space_info *sinfo = to_space_info(kobj);
+	percpu_counter_destroy(&sinfo->total_bytes_pinned);
+	kfree(sinfo);
+}
+
+struct kobj_type space_info_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = space_info_release,
+	.default_attrs = space_info_attrs,
+};
+
+static const struct attribute *allocation_attrs[] = {
+	BTRFS_ATTR_PTR(global_rsv_reserved),
+	BTRFS_ATTR_PTR(global_rsv_size),
+	NULL,
+};
+
+static ssize_t btrfs_label_show(struct kobject *kobj,
+				struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	char *label = fs_info->super_copy->label;
+	return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+}
+
+static ssize_t btrfs_label_store(struct kobject *kobj,
+				 struct kobj_attribute *a,
+				 const char *buf, size_t len)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	size_t p_len;
+
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	/*
+	 * p_len is the len until the first occurrence of either
+	 * '\n' or '\0'
+	 */
+	p_len = strcspn(buf, "\n");
+
+	if (p_len >= BTRFS_LABEL_SIZE)
+		return -EINVAL;
+
+	spin_lock(&fs_info->super_lock);
+	memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
+	memcpy(fs_info->super_copy->label, buf, p_len);
+	spin_unlock(&fs_info->super_lock);
+
+	/*
+	 * We don't want to do full transaction commit from inside sysfs
+	 */
+	btrfs_set_pending(fs_info, COMMIT);
+	wake_up_process(fs_info->transaction_kthread);
+
+	return len;
+}
+BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
+
+static ssize_t btrfs_nodesize_show(struct kobject *kobj,
+				struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+}
+
+BTRFS_ATTR(nodesize, btrfs_nodesize_show);
+
+static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
+				struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
+
+static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
+				struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
+
+static struct attribute *btrfs_attrs[] = {
+	BTRFS_ATTR_PTR(label),
+	BTRFS_ATTR_PTR(nodesize),
+	BTRFS_ATTR_PTR(sectorsize),
+	BTRFS_ATTR_PTR(clone_alignment),
+	NULL,
+};
+
+static void btrfs_release_super_kobj(struct kobject *kobj)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	complete(&fs_info->kobj_unregister);
+}
+
+static struct kobj_type btrfs_ktype = {
+	.sysfs_ops	= &kobj_sysfs_ops,
+	.release	= btrfs_release_super_kobj,
+	.default_attrs	= btrfs_attrs,
+};
+
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
+{
+	if (kobj->ktype != &btrfs_ktype)
+		return NULL;
+	return container_of(kobj, struct btrfs_fs_info, super_kobj);
+}
+
+#define NUM_FEATURE_BITS 64
+static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
+static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
+
+static const u64 supported_feature_masks[3] = {
+	[FEAT_COMPAT]    = BTRFS_FEATURE_COMPAT_SUPP,
+	[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
+	[FEAT_INCOMPAT]  = BTRFS_FEATURE_INCOMPAT_SUPP,
+};
+
+static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
+{
+	int set;
+
+	for (set = 0; set < FEAT_MAX; set++) {
+		int i;
+		struct attribute *attrs[2];
+		struct attribute_group agroup = {
+			.name = "features",
+			.attrs = attrs,
+		};
+		u64 features = get_features(fs_info, set);
+		features &= ~supported_feature_masks[set];
+
+		if (!features)
+			continue;
+
+		attrs[1] = NULL;
+		for (i = 0; i < NUM_FEATURE_BITS; i++) {
+			struct btrfs_feature_attr *fa;
+
+			if (!(features & (1ULL << i)))
+				continue;
+
+			fa = &btrfs_feature_attrs[set][i];
+			attrs[0] = &fa->kobj_attr.attr;
+			if (add) {
+				int ret;
+				ret = sysfs_merge_group(&fs_info->super_kobj,
+							&agroup);
+				if (ret)
+					return ret;
+			} else
+				sysfs_unmerge_group(&fs_info->super_kobj,
+						    &agroup);
+		}
+
+	}
+	return 0;
+}
+
+static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+{
+	kobject_del(&fs_info->super_kobj);
+	kobject_put(&fs_info->super_kobj);
+	wait_for_completion(&fs_info->kobj_unregister);
+}
+
+void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->space_info_kobj) {
+		sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
+		kobject_del(fs_info->space_info_kobj);
+		kobject_put(fs_info->space_info_kobj);
+	}
+	kobject_del(fs_info->device_dir_kobj);
+	kobject_put(fs_info->device_dir_kobj);
+	addrm_unknown_feature_attrs(fs_info, false);
+	sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
+	__btrfs_sysfs_remove_one(fs_info);
+}
+
+const char * const btrfs_feature_set_names[3] = {
+	[FEAT_COMPAT]	 = "compat",
+	[FEAT_COMPAT_RO] = "compat_ro",
+	[FEAT_INCOMPAT]	 = "incompat",
+};
+
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
+{
+	size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
+	int len = 0;
+	int i;
+	char *str;
+
+	str = kmalloc(bufsize, GFP_KERNEL);
+	if (!str)
+		return str;
+
+	for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+		const char *name;
+
+		if (!(flags & (1ULL << i)))
+			continue;
+
+		name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
+		len += snprintf(str + len, bufsize - len, "%s%s",
+				len ? "," : "", name);
+	}
+
+	return str;
+}
+
+static void init_feature_attrs(void)
+{
+	struct btrfs_feature_attr *fa;
+	int set, i;
+
+	BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
+		     ARRAY_SIZE(btrfs_feature_attrs));
+	BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
+		     ARRAY_SIZE(btrfs_feature_attrs[0]));
+
+	memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
+	memset(btrfs_unknown_feature_names, 0,
+	       sizeof(btrfs_unknown_feature_names));
+
+	for (i = 0; btrfs_supported_feature_attrs[i]; i++) {
+		struct btrfs_feature_attr *sfa;
+		struct attribute *a = btrfs_supported_feature_attrs[i];
+		int bit;
+		sfa = attr_to_btrfs_feature_attr(a);
+		bit = ilog2(sfa->feature_bit);
+		fa = &btrfs_feature_attrs[sfa->feature_set][bit];
+
+		fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name;
+	}
+
+	for (set = 0; set < FEAT_MAX; set++) {
+		for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+			char *name = btrfs_unknown_feature_names[set][i];
+			fa = &btrfs_feature_attrs[set][i];
+
+			if (fa->kobj_attr.attr.name)
+				continue;
+
+			snprintf(name, 13, "%s:%u",
+				 btrfs_feature_set_names[set], i);
+
+			fa->kobj_attr.attr.name = name;
+			fa->kobj_attr.attr.mode = S_IRUGO;
+			fa->feature_set = set;
+			fa->feature_bit = 1ULL << i;
+		}
+	}
+}
+
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+		struct btrfs_device *one_device)
+{
+	struct hd_struct *disk;
+	struct kobject *disk_kobj;
+
+	if (!fs_info->device_dir_kobj)
+		return -EINVAL;
+
+	if (one_device && one_device->bdev) {
+		disk = one_device->bdev->bd_part;
+		disk_kobj = &part_to_dev(disk)->kobj;
+
+		sysfs_remove_link(fs_info->device_dir_kobj,
+						disk_kobj->name);
+	}
+
+	return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+		struct btrfs_device *one_device)
+{
+	int error = 0;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *dev;
+
+	if (!fs_info->device_dir_kobj)
+		fs_info->device_dir_kobj = kobject_create_and_add("devices",
+						&fs_info->super_kobj);
+
+	if (!fs_info->device_dir_kobj)
+		return -ENOMEM;
+
+	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+		struct hd_struct *disk;
+		struct kobject *disk_kobj;
+
+		if (!dev->bdev)
+			continue;
+
+		if (one_device && one_device != dev)
+			continue;
+
+		disk = dev->bdev->bd_part;
+		disk_kobj = &part_to_dev(disk)->kobj;
+
+		error = sysfs_create_link(fs_info->device_dir_kobj,
+					  disk_kobj, disk_kobj->name);
+		if (error)
+			break;
+	}
+
+	return error;
+}
+
+/* /sys/fs/btrfs/ entry */
+static struct kset *btrfs_kset;
+
+/* /sys/kernel/debug/btrfs */
+static struct dentry *btrfs_debugfs_root_dentry;
+
+/* Debugging tunables and exported data */
+u64 btrfs_debugfs_test;
+
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+{
+	int error;
+
+	init_completion(&fs_info->kobj_unregister);
+	fs_info->super_kobj.kset = btrfs_kset;
+	error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
+				     "%pU", fs_info->fsid);
+	if (error)
+		return error;
+
+	error = sysfs_create_group(&fs_info->super_kobj,
+				   &btrfs_feature_attr_group);
+	if (error) {
+		__btrfs_sysfs_remove_one(fs_info);
+		return error;
+	}
+
+	error = addrm_unknown_feature_attrs(fs_info, true);
+	if (error)
+		goto failure;
+
+	error = btrfs_kobj_add_device(fs_info, NULL);
+	if (error)
+		goto failure;
+
+	fs_info->space_info_kobj = kobject_create_and_add("allocation",
+						  &fs_info->super_kobj);
+	if (!fs_info->space_info_kobj) {
+		error = -ENOMEM;
+		goto failure;
+	}
+
+	error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+	if (error)
+		goto failure;
+
+	return 0;
+failure:
+	btrfs_sysfs_remove_one(fs_info);
+	return error;
+}
+
+static int btrfs_init_debugfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
+	if (!btrfs_debugfs_root_dentry)
+		return -ENOMEM;
+
+	debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
+			&btrfs_debugfs_test);
+#endif
+	return 0;
+}
+
+int btrfs_init_sysfs(void)
+{
+	int ret;
+
+	btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
+	if (!btrfs_kset)
+		return -ENOMEM;
+
+	ret = btrfs_init_debugfs();
+	if (ret)
+		goto out1;
+
+	init_feature_attrs();
+	ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+	if (ret)
+		goto out2;
+
+	return 0;
+out2:
+	debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+out1:
+	kset_unregister(btrfs_kset);
+
+	return ret;
+}
+
+void btrfs_exit_sysfs(void)
+{
+	sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+	kset_unregister(btrfs_kset);
+	debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+}
+
diff --git a/kernel/fs/btrfs/sysfs.h b/kernel/fs/btrfs/sysfs.h
new file mode 100644
index 000000000..3a4bbed72
--- /dev/null
+++ b/kernel/fs/btrfs/sysfs.h
@@ -0,0 +1,89 @@
+#ifndef _BTRFS_SYSFS_H_
+#define _BTRFS_SYSFS_H_
+
+/*
+ * Data exported through sysfs
+ */
+extern u64 btrfs_debugfs_test;
+
+enum btrfs_feature_set {
+	FEAT_COMPAT,
+	FEAT_COMPAT_RO,
+	FEAT_INCOMPAT,
+	FEAT_MAX
+};
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)			\
+{									\
+	.attr	= { .name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,						\
+	.store	= _store,						\
+}
+
+#define BTRFS_ATTR_RW(_name, _show, _store)			\
+	static struct kobj_attribute btrfs_attr_##_name =		\
+			__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define BTRFS_ATTR(_name, _show)					\
+	static struct kobj_attribute btrfs_attr_##_name =		\
+			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#define BTRFS_ATTR_PTR(_name)    (&btrfs_attr_##_name.attr)
+
+#define BTRFS_RAID_ATTR(_name, _show)					\
+	static struct kobj_attribute btrfs_raid_attr_##_name =		\
+			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#define BTRFS_RAID_ATTR_PTR(_name)    (&btrfs_raid_attr_##_name.attr)
+
+
+struct btrfs_feature_attr {
+	struct kobj_attribute kobj_attr;
+	enum btrfs_feature_set feature_set;
+	u64 feature_bit;
+};
+
+#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit)	     \
+static struct btrfs_feature_attr btrfs_attr_##_name = {			     \
+	.kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO,			     \
+				      btrfs_feature_attr_show,		     \
+				      btrfs_feature_attr_store),	     \
+	.feature_set	= _feature_set,					     \
+	.feature_bit	= _prefix ##_## _feature_bit,			     \
+}
+#define BTRFS_FEAT_ATTR_PTR(_name)    (&btrfs_attr_##_name.kobj_attr.attr)
+
+#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
+	BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
+	BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
+	BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
+
+/* convert from attribute */
+static inline struct btrfs_feature_attr *
+to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+	return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+	return container_of(attr, struct kobj_attribute, attr);
+}
+
+static inline struct btrfs_feature_attr *
+attr_to_btrfs_feature_attr(struct attribute *attr)
+{
+	return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
+extern const char * const btrfs_feature_set_names[3];
+extern struct kobj_type space_info_ktype;
+extern struct kobj_type btrfs_raid_ktype;
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+		struct btrfs_device *one_device);
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+                struct btrfs_device *one_device);
+#endif /* _BTRFS_SYSFS_H_ */
diff --git a/kernel/fs/btrfs/tests/btrfs-tests.c b/kernel/fs/btrfs/tests/btrfs-tests.c
new file mode 100644
index 000000000..9626252ee
--- /dev/null
+++ b/kernel/fs/btrfs/tests/btrfs-tests.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static struct vfsmount *test_mnt = NULL;
+
+static const struct super_operations btrfs_test_super_ops = {
+	.alloc_inode	= btrfs_alloc_inode,
+	.destroy_inode	= btrfs_test_destroy_inode,
+};
+
+static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
+				       int flags, const char *dev_name,
+				       void *data)
+{
+	return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+			    NULL, BTRFS_TEST_MAGIC);
+}
+
+static struct file_system_type test_type = {
+	.name		= "btrfs_test_fs",
+	.mount		= btrfs_test_mount,
+	.kill_sb	= kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+	return new_inode(test_mnt->mnt_sb);
+}
+
+int btrfs_init_test_fs(void)
+{
+	int ret;
+
+	ret = register_filesystem(&test_type);
+	if (ret) {
+		printk(KERN_ERR "btrfs: cannot register test file system\n");
+		return ret;
+	}
+
+	test_mnt = kern_mount(&test_type);
+	if (IS_ERR(test_mnt)) {
+		printk(KERN_ERR "btrfs: cannot mount test file system\n");
+		unregister_filesystem(&test_type);
+		return ret;
+	}
+	return 0;
+}
+
+void btrfs_destroy_test_fs(void)
+{
+	kern_unmount(test_mnt);
+	unregister_filesystem(&test_type);
+}
+
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
+{
+	struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+						GFP_NOFS);
+
+	if (!fs_info)
+		return fs_info;
+	fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+				      GFP_NOFS);
+	if (!fs_info->fs_devices) {
+		kfree(fs_info);
+		return NULL;
+	}
+	fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
+				      GFP_NOFS);
+	if (!fs_info->super_copy) {
+		kfree(fs_info->fs_devices);
+		kfree(fs_info);
+		return NULL;
+	}
+
+	if (init_srcu_struct(&fs_info->subvol_srcu)) {
+		kfree(fs_info->fs_devices);
+		kfree(fs_info->super_copy);
+		kfree(fs_info);
+		return NULL;
+	}
+
+	spin_lock_init(&fs_info->buffer_lock);
+	spin_lock_init(&fs_info->qgroup_lock);
+	spin_lock_init(&fs_info->qgroup_op_lock);
+	spin_lock_init(&fs_info->super_lock);
+	spin_lock_init(&fs_info->fs_roots_radix_lock);
+	spin_lock_init(&fs_info->tree_mod_seq_lock);
+	mutex_init(&fs_info->qgroup_ioctl_lock);
+	mutex_init(&fs_info->qgroup_rescan_lock);
+	rwlock_init(&fs_info->tree_mod_log_lock);
+	fs_info->running_transaction = NULL;
+	fs_info->qgroup_tree = RB_ROOT;
+	fs_info->qgroup_ulist = NULL;
+	atomic64_set(&fs_info->tree_mod_seq, 0);
+	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+	INIT_LIST_HEAD(&fs_info->dead_roots);
+	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+	return fs_info;
+}
+
+static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+
+	spin_lock(&fs_info->buffer_lock);
+restart:
+	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+		struct extent_buffer *eb;
+
+		eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+		if (!eb)
+			continue;
+		/* Shouldn't happen but that kind of thinking creates CVE's */
+		if (radix_tree_exception(eb)) {
+			if (radix_tree_deref_retry(eb))
+				goto restart;
+			continue;
+		}
+		spin_unlock(&fs_info->buffer_lock);
+		free_extent_buffer_stale(eb);
+		spin_lock(&fs_info->buffer_lock);
+	}
+	spin_unlock(&fs_info->buffer_lock);
+
+	btrfs_free_qgroup_config(fs_info);
+	btrfs_free_fs_roots(fs_info);
+	cleanup_srcu_struct(&fs_info->subvol_srcu);
+	kfree(fs_info->super_copy);
+	kfree(fs_info->fs_devices);
+	kfree(fs_info);
+}
+
+void btrfs_free_dummy_root(struct btrfs_root *root)
+{
+	if (!root)
+		return;
+	if (root->node)
+		free_extent_buffer(root->node);
+	if (root->fs_info)
+		btrfs_free_dummy_fs_info(root->fs_info);
+	kfree(root);
+}
+
diff --git a/kernel/fs/btrfs/tests/btrfs-tests.h b/kernel/fs/btrfs/tests/btrfs-tests.h
new file mode 100644
index 000000000..fd3954224
--- /dev/null
+++ b/kernel/fs/btrfs/tests/btrfs-tests.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_TESTS
+#define __BTRFS_TESTS
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+
+#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+
+struct btrfs_root;
+
+int btrfs_test_free_space_cache(void);
+int btrfs_test_extent_buffer_operations(void);
+int btrfs_test_extent_io(void);
+int btrfs_test_inodes(void);
+int btrfs_test_qgroups(void);
+int btrfs_init_test_fs(void);
+void btrfs_destroy_test_fs(void);
+struct inode *btrfs_new_test_inode(void);
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_root(struct btrfs_root *root);
+#else
+static inline int btrfs_test_free_space_cache(void)
+{
+	return 0;
+}
+static inline int btrfs_test_extent_buffer_operations(void)
+{
+	return 0;
+}
+static inline int btrfs_init_test_fs(void)
+{
+	return 0;
+}
+static inline void btrfs_destroy_test_fs(void)
+{
+}
+static inline int btrfs_test_extent_io(void)
+{
+	return 0;
+}
+static inline int btrfs_test_inodes(void)
+{
+	return 0;
+}
+static inline int btrfs_test_qgroups(void)
+{
+	return 0;
+}
+#endif
+
+#endif
diff --git a/kernel/fs/btrfs/tests/extent-buffer-tests.c b/kernel/fs/btrfs/tests/extent-buffer-tests.c
new file mode 100644
index 000000000..f51963a8f
--- /dev/null
+++ b/kernel/fs/btrfs/tests/extent-buffer-tests.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../disk-io.h"
+
+static int test_btrfs_split_item(void)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct extent_buffer *eb;
+	struct btrfs_item *item;
+	char *value = "mary had a little lamb";
+	char *split1 = "mary had a little";
+	char *split2 = " lamb";
+	char *split3 = "mary";
+	char *split4 = " had a little";
+	char buf[32];
+	struct btrfs_key key;
+	u32 value_len = strlen(value);
+	int ret = 0;
+
+	test_msg("Running btrfs_split_item tests\n");
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Could not allocate root\n");
+		return PTR_ERR(root);
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Could not allocate path\n");
+		kfree(root);
+		return -ENOMEM;
+	}
+
+	path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
+	if (!eb) {
+		test_msg("Could not allocate dummy buffer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	path->slots[0] = 0;
+
+	key.objectid = 0;
+	key.type = BTRFS_EXTENT_CSUM_KEY;
+	key.offset = 0;
+
+	setup_items_for_insert(root, path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+	item = btrfs_item_nr(0);
+	write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+			    value_len);
+
+	key.offset = 3;
+
+	/*
+	 * Passing NULL trans here should be safe because we have plenty of
+	 * space in this leaf to split the item without having to split the
+	 * leaf.
+	 */
+	ret = btrfs_split_item(NULL, root, path, &key, 17);
+	if (ret) {
+		test_msg("Split item failed %d\n", ret);
+		goto out;
+	}
+
+	/*
+	 * Read the first slot, it should have the original key and contain only
+	 * 'mary had a little'
+	 */
+	btrfs_item_key_to_cpu(eb, &key, 0);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 0) {
+		test_msg("Invalid key at slot 0\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(0);
+	if (btrfs_item_size(eb, item) != strlen(split1)) {
+		test_msg("Invalid len in the first split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+			   strlen(split1));
+	if (memcmp(buf, split1, strlen(split1))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the first split have='%.*s' want '%s'\n",
+			 (int)strlen(split1), buf, split1);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 1);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 3) {
+		test_msg("Invalid key at slot 1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(1);
+	if (btrfs_item_size(eb, item) != strlen(split2)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+			   strlen(split2));
+	if (memcmp(buf, split2, strlen(split2))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	key.offset = 1;
+	/* Do it again so we test memmoving the other items in the leaf */
+	ret = btrfs_split_item(NULL, root, path, &key, 4);
+	if (ret) {
+		test_msg("Second split item failed %d\n", ret);
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 0);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 0) {
+		test_msg("Invalid key at slot 0\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(0);
+	if (btrfs_item_size(eb, item) != strlen(split3)) {
+		test_msg("Invalid len in the first split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+			   strlen(split3));
+	if (memcmp(buf, split3, strlen(split3))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the third split");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 1);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 1) {
+		test_msg("Invalid key at slot 1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(1);
+	if (btrfs_item_size(eb, item) != strlen(split4)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+			   strlen(split4));
+	if (memcmp(buf, split4, strlen(split4))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the fourth split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 2);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 3) {
+		test_msg("Invalid key at slot 2\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(2);
+	if (btrfs_item_size(eb, item) != strlen(split2)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
+			   strlen(split2));
+	if (memcmp(buf, split2, strlen(split2))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the last chunk\n");
+		ret = -EINVAL;
+		goto out;
+	}
+out:
+	btrfs_free_path(path);
+	kfree(root);
+	return ret;
+}
+
+int btrfs_test_extent_buffer_operations(void)
+{
+	test_msg("Running extent buffer operation tests");
+	return test_btrfs_split_item();
+}
diff --git a/kernel/fs/btrfs/tests/extent-io-tests.c b/kernel/fs/btrfs/tests/extent-io-tests.c
new file mode 100644
index 000000000..9e9f23681
--- /dev/null
+++ b/kernel/fs/btrfs/tests/extent-io-tests.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "btrfs-tests.h"
+#include "../extent_io.h"
+
+#define PROCESS_UNLOCK		(1 << 0)
+#define PROCESS_RELEASE		(1 << 1)
+#define PROCESS_TEST_LOCKED	(1 << 2)
+
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+				       unsigned long flags)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int count = 0;
+	int loops = 0;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (flags & PROCESS_TEST_LOCKED &&
+			    !PageLocked(pages[i]))
+				count++;
+			if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+			if (flags & PROCESS_RELEASE)
+				page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+		loops++;
+		if (loops > 100000) {
+			printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+			break;
+		}
+	}
+	return count;
+}
+
+static int test_find_delalloc(void)
+{
+	struct inode *inode;
+	struct extent_io_tree tmp;
+	struct page *page;
+	struct page *locked_page = NULL;
+	unsigned long index = 0;
+	u64 total_dirty = 256 * 1024 * 1024;
+	u64 max_bytes = 128 * 1024 * 1024;
+	u64 start, end, test_start;
+	u64 found;
+	int ret = -EINVAL;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Failed to allocate test inode\n");
+		return -ENOMEM;
+	}
+
+	extent_io_tree_init(&tmp, &inode->i_data);
+
+	/*
+	 * First go through and create and mark all of our pages dirty, we pin
+	 * everything to make sure our pages don't get evicted and screw up our
+	 * test.
+	 */
+	for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			test_msg("Failed to allocate test page\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+		SetPageDirty(page);
+		if (index) {
+			unlock_page(page);
+		} else {
+			page_cache_get(page);
+			locked_page = page;
+		}
+	}
+
+	/* Test this scenario
+	 * |--- delalloc ---|
+	 * |---  search  ---|
+	 */
+	set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+	start = 0;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Should have found at least one delalloc\n");
+		goto out_bits;
+	}
+	if (start != 0 || end != 4095) {
+		test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
+			 start, end);
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+	unlock_page(locked_page);
+	page_cache_release(locked_page);
+
+	/*
+	 * Test this scenario
+	 *
+	 * |--- delalloc ---|
+	 *           |--- search ---|
+	 */
+	test_start = 64 * 1024 * 1024;
+	locked_page = find_lock_page(inode->i_mapping,
+				     test_start >> PAGE_CACHE_SHIFT);
+	if (!locked_page) {
+		test_msg("Couldn't find the locked page\n");
+		goto out_bits;
+	}
+	set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Couldn't find delalloc in our range\n");
+		goto out_bits;
+	}
+	if (start != test_start || end != max_bytes - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu, end "
+			 "%Lu\n", test_start, max_bytes - 1, start, end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end,
+			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+		test_msg("There were unlocked pages in the range\n");
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+	/* locked_page was unlocked above */
+	page_cache_release(locked_page);
+
+	/*
+	 * Test this scenario
+	 * |--- delalloc ---|
+	 *                    |--- search ---|
+	 */
+	test_start = max_bytes + 4096;
+	locked_page = find_lock_page(inode->i_mapping, test_start >>
+				     PAGE_CACHE_SHIFT);
+	if (!locked_page) {
+		test_msg("Could'nt find the locked page\n");
+		goto out_bits;
+	}
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (found) {
+		test_msg("Found range when we shouldn't have\n");
+		goto out_bits;
+	}
+	if (end != (u64)-1) {
+		test_msg("Did not return the proper end offset\n");
+		goto out_bits;
+	}
+
+	/*
+	 * Test this scenario
+	 * [------- delalloc -------|
+	 * [max_bytes]|-- search--|
+	 *
+	 * We are re-using our test_start from above since it works out well.
+	 */
+	set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Didn't find our range\n");
+		goto out_bits;
+	}
+	if (start != test_start || end != total_dirty - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+			 test_start, total_dirty - 1, start, end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end,
+			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+		test_msg("Pages in range were not all locked\n");
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+
+	/*
+	 * Now to test where we run into a page that is no longer dirty in the
+	 * range we want to find.
+	 */
+	page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
+			     >> PAGE_CACHE_SHIFT);
+	if (!page) {
+		test_msg("Couldn't find our page\n");
+		goto out_bits;
+	}
+	ClearPageDirty(page);
+	page_cache_release(page);
+
+	/* We unlocked it in the previous test */
+	lock_page(locked_page);
+	start = test_start;
+	end = 0;
+	/*
+	 * Currently if we fail to find dirty pages in the delalloc range we
+	 * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search.  If
+	 * this changes at any point in the future we will need to fix this
+	 * tests expected behavior.
+	 */
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Didn't find our range\n");
+		goto out_bits;
+	}
+	if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+			 test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+			 end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+			       PROCESS_UNLOCK)) {
+		test_msg("Pages in range were not all locked\n");
+		goto out_bits;
+	}
+	ret = 0;
+out_bits:
+	clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
+out:
+	if (locked_page)
+		page_cache_release(locked_page);
+	process_page_range(inode, 0, total_dirty - 1,
+			   PROCESS_UNLOCK | PROCESS_RELEASE);
+	iput(inode);
+	return ret;
+}
+
+int btrfs_test_extent_io(void)
+{
+	test_msg("Running find delalloc tests\n");
+	return test_find_delalloc();
+}
diff --git a/kernel/fs/btrfs/tests/free-space-tests.c b/kernel/fs/btrfs/tests/free-space-tests.c
new file mode 100644
index 000000000..2299bfde3
--- /dev/null
+++ b/kernel/fs/btrfs/tests/free-space-tests.c
@@ -0,0 +1,909 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../free-space-cache.h"
+
+#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
+static struct btrfs_block_group_cache *init_test_block_group(void)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = kzalloc(sizeof(*cache), GFP_NOFS);
+	if (!cache)
+		return NULL;
+	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+					GFP_NOFS);
+	if (!cache->free_space_ctl) {
+		kfree(cache);
+		return NULL;
+	}
+
+	cache->key.objectid = 0;
+	cache->key.offset = 1024 * 1024 * 1024;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	cache->sectorsize = 4096;
+	cache->full_stripe_len = 4096;
+
+	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->bg_list);
+
+	btrfs_init_free_space_ctl(cache);
+
+	return cache;
+}
+
+/*
+ * This test just does basic sanity checking, making sure we can add an exten
+ * entry and remove space from either end and the middle, and make sure we can
+ * remove space that covers adjacent extent entries.
+ */
+static int test_extents(struct btrfs_block_group_cache *cache)
+{
+	int ret = 0;
+
+	test_msg("Running extent only tests\n");
+
+	/* First just make sure we can remove an entire entry */
+	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error adding initial extents %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing extent %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+		test_msg("Full remove left some lingering space\n");
+		return -1;
+	}
+
+	/* Ok edge and middle cases now */
+	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error adding half extent %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing tail end %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing front end %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+	if (ret) {
+		test_msg("Error removing middle piece %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+		test_msg("Still have space at the front\n");
+		return -1;
+	}
+
+	if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+		test_msg("Still have space in the middle\n");
+		return -1;
+	}
+
+	if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+		test_msg("Still have space at the end\n");
+		return -1;
+	}
+
+	/* Cleanup */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	return 0;
+}
+
+static int test_bitmaps(struct btrfs_block_group_cache *cache)
+{
+	u64 next_bitmap_offset;
+	int ret;
+
+	test_msg("Running bitmap only tests\n");
+
+	ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't create a bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing bitmap full range %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+		test_msg("Left some space in bitmap\n");
+		return -1;
+	}
+
+	ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add to our bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove middle chunk %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * The first bitmap we have starts at offset 0 so the next one is just
+	 * at the end of the first bitmap.
+	 */
+	next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+
+	/* Test a bit straddling two bitmaps */
+	ret = test_add_free_space_entry(cache, next_bitmap_offset -
+				   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add space that straddles two bitmaps %d\n",
+				ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, next_bitmap_offset -
+				      (1 * 1024 * 1024), 2 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
+			 2 * 1024 * 1024)) {
+		test_msg("Left some space when removing overlapping\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	return 0;
+}
+
+/* This is the high grade jackassery */
+static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
+{
+	u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+	int ret;
+
+	test_msg("Running bitmap and extent tests\n");
+
+	/*
+	 * First let's do something simple, an extent at the same offset as the
+	 * bitmap, but the free space completely in the extent and then
+	 * completely in the bitmap.
+	 */
+	ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't create bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove extent entry %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+		test_msg("Left remnants after our remove\n");
+		return -1;
+	}
+
+	/* Now to add back the extent entry and remove from the bitmap */
+	ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't re-add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove from bitmap %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+		test_msg("Left remnants in the bitmap\n");
+		return -1;
+	}
+
+	/*
+	 * Ok so a little more evil, extent entry and bitmap at the same offset,
+	 * removing an overlapping chunk.
+	 */
+	ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add to a bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+		test_msg("Left over pieces after removing overlapping\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/* Now with the extent entry offset into the bitmap */
+	ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add space to the bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent to the cache %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Problem removing overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+		test_msg("Left something behind when removing space");
+		return -1;
+	}
+
+	/*
+	 * This has blown up in the past, the extent entry starts before the
+	 * bitmap entry, but we're trying to remove an offset that falls
+	 * completely within the bitmap range and is in both the extent entry
+	 * and the bitmap entry, looks like this
+	 *
+	 *   [ extent ]
+	 *      [ bitmap ]
+	 *        [ del ]
+	 */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
+				   4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
+				   5 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
+				      5 * 1024 * 1024);
+	if (ret) {
+		test_msg("Failed to free our space %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
+			 5 * 1024 * 1024)) {
+		test_msg("Left stuff over\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/*
+	 * This blew up before, we have part of the free space in a bitmap and
+	 * then the entirety of the rest of the space in an extent.  This used
+	 * to return -EAGAIN back from btrfs_remove_extent, make sure this
+	 * doesn't happen.
+	 */
+	ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+		return ret;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
+			    struct btrfs_free_space *info)
+{
+	return ctl->free_extents > 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int
+check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
+			      const int num_extents,
+			      const int num_bitmaps)
+{
+	if (cache->free_space_ctl->free_extents != num_extents) {
+		test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+			 cache->free_space_ctl->free_extents, num_extents);
+		return -EINVAL;
+	}
+	if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
+		test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+			 cache->free_space_ctl->total_bitmaps, num_bitmaps);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int check_cache_empty(struct btrfs_block_group_cache *cache)
+{
+	u64 offset;
+	u64 max_extent_size;
+
+	/*
+	 * Now lets confirm that there's absolutely no free space left to
+	 * allocate.
+	 */
+	if (cache->free_space_ctl->free_space != 0) {
+		test_msg("Cache free space is not 0\n");
+		return -EINVAL;
+	}
+
+	/* And any allocation request, no matter how small, should fail now. */
+	offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
+					    &max_extent_size);
+	if (offset != 0) {
+		test_msg("Space allocation did not fail, returned offset: %llu",
+			 offset);
+		return -EINVAL;
+	}
+
+	/* And no extent nor bitmap entries in the cache anymore. */
+	return check_num_extents_and_bitmaps(cache, 0, 0);
+}
+
+/*
+ * Before we were able to steal free space from a bitmap entry to an extent
+ * entry, we could end up with 2 entries representing a contiguous free space.
+ * One would be an extent entry and the other a bitmap entry. Since in order
+ * to allocate space to a caller we use only 1 entry, we couldn't return that
+ * whole range to the caller if it was requested. This forced the caller to
+ * either assume ENOSPC or perform several smaller space allocations, which
+ * wasn't optimal as they could be spread all over the block group while under
+ * concurrency (extra overhead and fragmentation).
+ *
+ * This stealing approach is benefical, since we always prefer to allocate from
+ * extent entries, both for clustered and non-clustered allocation requests.
+ */
+static int
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
+{
+	int ret;
+	u64 offset;
+	u64 max_extent_size;
+
+	bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
+			      struct btrfs_free_space *);
+
+	test_msg("Running space stealing from bitmap to extent\n");
+
+	/*
+	 * For this test, we want to ensure we end up with an extent entry
+	 * immediately adjacent to a bitmap entry, where the bitmap starts
+	 * at an offset where the extent entry ends. We keep adding and
+	 * removing free space to reach into this state, but to get there
+	 * we need to reach a point where marking new free space doesn't
+	 * result in adding new extent entries or merging the new space
+	 * with existing extent entries - the space ends up being marked
+	 * in an existing bitmap that covers the new free space range.
+	 *
+	 * To get there, we need to reach the threshold defined set at
+	 * cache->free_space_ctl->extents_thresh, which currently is
+	 * 256 extents on a x86_64 system at least, and a few other
+	 * conditions (check free_space_cache.c). Instead of making the
+	 * test much longer and complicated, use a "use_bitmap" operation
+	 * that forces use of bitmaps as soon as we have at least 1
+	 * extent entry.
+	 */
+	use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
+	cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+
+	/*
+	 * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
+	 */
+	ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
+					128 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	/* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
+	ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
+					128 * 1024 * 1024 - 512 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = check_num_extents_and_bitmaps(cache, 2, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Now make only the first 256Kb of the bitmap marked as free, so that
+	 * we end up with only the following ranges marked as free space:
+	 *
+	 * [128Mb - 256Kb, 128Mb - 128Kb[
+	 * [128Mb + 512Kb, 128Mb + 768Kb[
+	 */
+	ret = btrfs_remove_free_space(cache,
+				      128 * 1024 * 1024 + 768 * 1024,
+				      128 * 1024 * 1024 - 768 * 1024);
+	if (ret) {
+		test_msg("Failed to free part of bitmap space %d\n", ret);
+		return ret;
+	}
+
+	/* Confirm that only those 2 ranges are marked as free. */
+	if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+			       128 * 1024)) {
+		test_msg("Free space range missing\n");
+		return -ENOENT;
+	}
+	if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
+			       256 * 1024)) {
+		test_msg("Free space range missing\n");
+		return -ENOENT;
+	}
+
+	/*
+	 * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
+	 * as free anymore.
+	 */
+	if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
+			      128 * 1024 * 1024 - 768 * 1024)) {
+		test_msg("Bitmap region not removed from space cache\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
+	 * covered by the bitmap, isn't marked as free.
+	 */
+	if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
+			      256 * 1024)) {
+		test_msg("Invalid bitmap region marked as free\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
+	 * by the bitmap too, isn't marked as free either.
+	 */
+	if (test_check_exists(cache, 128 * 1024 * 1024,
+			      256 * 1024)) {
+		test_msg("Invalid bitmap region marked as free\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
+	 * lets make sure the free space cache marks it as free in the bitmap,
+	 * and doesn't insert a new extent entry to represent this region.
+	 */
+	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+	if (ret) {
+		test_msg("Error adding free space: %d\n", ret);
+		return ret;
+	}
+	/* Confirm the region is marked as free. */
+	if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+		test_msg("Bitmap region not marked as free\n");
+		return -ENOENT;
+	}
+
+	/*
+	 * Confirm that no new extent entries or bitmap entries were added to
+	 * the cache after adding that free space region.
+	 */
+	ret = check_num_extents_and_bitmaps(cache, 2, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Now lets add a small free space region to the right of the previous
+	 * one, which is not contiguous with it and is part of the bitmap too.
+	 * The goal is to test that the bitmap entry space stealing doesn't
+	 * steal this space region.
+	 */
+	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
+				   4096);
+	if (ret) {
+		test_msg("Error adding free space: %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * Confirm that no new extent entries or bitmap entries were added to
+	 * the cache after adding that free space region.
+	 */
+	ret = check_num_extents_and_bitmaps(cache, 2, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
+	 * expand the range covered by the existing extent entry that represents
+	 * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
+	 */
+	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
+				   128 * 1024);
+	if (ret) {
+		test_msg("Error adding free space: %d\n", ret);
+		return ret;
+	}
+	/* Confirm the region is marked as free. */
+	if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
+			       128 * 1024)) {
+		test_msg("Extent region not marked as free\n");
+		return -ENOENT;
+	}
+
+	/*
+	 * Confirm that our extent entry didn't stole all free space from the
+	 * bitmap, because of the small 4Kb free space region.
+	 */
+	ret = check_num_extents_and_bitmaps(cache, 2, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
+	 * space. Without stealing bitmap free space into extent entry space,
+	 * we would have all this free space represented by 2 entries in the
+	 * cache:
+	 *
+	 * extent entry covering range: [128Mb - 256Kb, 128Mb[
+	 * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
+	 *
+	 * Attempting to allocate the whole free space (1Mb) would fail, because
+	 * we can't allocate from multiple entries.
+	 * With the bitmap free space stealing, we get a single extent entry
+	 * that represents the 1Mb free space, and therefore we're able to
+	 * allocate the whole free space at once.
+	 */
+	if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+			       1 * 1024 * 1024)) {
+		test_msg("Expected region not marked as free\n");
+		return -ENOENT;
+	}
+
+	if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+		test_msg("Cache free space is not 1Mb + 4Kb\n");
+		return -EINVAL;
+	}
+
+	offset = btrfs_find_space_for_alloc(cache,
+					    0, 1 * 1024 * 1024, 0,
+					    &max_extent_size);
+	if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+		test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+			 offset);
+		return -EINVAL;
+	}
+
+	/* All that remains is a 4Kb free space region in a bitmap. Confirm. */
+	ret = check_num_extents_and_bitmaps(cache, 1, 1);
+	if (ret)
+		return ret;
+
+	if (cache->free_space_ctl->free_space != 4096) {
+		test_msg("Cache free space is not 4Kb\n");
+		return -EINVAL;
+	}
+
+	offset = btrfs_find_space_for_alloc(cache,
+					    0, 4096, 0,
+					    &max_extent_size);
+	if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+		test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
+			 offset);
+		return -EINVAL;
+	}
+
+	ret = check_cache_empty(cache);
+	if (ret)
+		return ret;
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/*
+	 * Now test a similar scenario, but where our extent entry is located
+	 * to the right of the bitmap entry, so that we can check that stealing
+	 * space from a bitmap to the front of an extent entry works.
+	 */
+
+	/*
+	 * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
+	 */
+	ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
+					128 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
+	ret = test_add_free_space_entry(cache, 0,
+					128 * 1024 * 1024 - 512 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = check_num_extents_and_bitmaps(cache, 2, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Now make only the last 256Kb of the bitmap marked as free, so that
+	 * we end up with only the following ranges marked as free space:
+	 *
+	 * [128Mb + 128b, 128Mb + 256Kb[
+	 * [128Mb - 768Kb, 128Mb - 512Kb[
+	 */
+	ret = btrfs_remove_free_space(cache,
+				      0,
+				      128 * 1024 * 1024 - 768 * 1024);
+	if (ret) {
+		test_msg("Failed to free part of bitmap space %d\n", ret);
+		return ret;
+	}
+
+	/* Confirm that only those 2 ranges are marked as free. */
+	if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
+			       128 * 1024)) {
+		test_msg("Free space range missing\n");
+		return -ENOENT;
+	}
+	if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+			       256 * 1024)) {
+		test_msg("Free space range missing\n");
+		return -ENOENT;
+	}
+
+	/*
+	 * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
+	 * as free anymore.
+	 */
+	if (test_check_exists(cache, 0,
+			      128 * 1024 * 1024 - 768 * 1024)) {
+		test_msg("Bitmap region not removed from space cache\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
+	 * covered by the bitmap, isn't marked as free.
+	 */
+	if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+			      512 * 1024)) {
+		test_msg("Invalid bitmap region marked as free\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
+	 * lets make sure the free space cache marks it as free in the bitmap,
+	 * and doesn't insert a new extent entry to represent this region.
+	 */
+	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
+				   512 * 1024);
+	if (ret) {
+		test_msg("Error adding free space: %d\n", ret);
+		return ret;
+	}
+	/* Confirm the region is marked as free. */
+	if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+			       512 * 1024)) {
+		test_msg("Bitmap region not marked as free\n");
+		return -ENOENT;
+	}
+
+	/*
+	 * Confirm that no new extent entries or bitmap entries were added to
+	 * the cache after adding that free space region.
+	 */
+	ret = check_num_extents_and_bitmaps(cache, 2, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Now lets add a small free space region to the left of the previous
+	 * one, which is not contiguous with it and is part of the bitmap too.
+	 * The goal is to test that the bitmap entry space stealing doesn't
+	 * steal this space region.
+	 */
+	ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+	if (ret) {
+		test_msg("Error adding free space: %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
+	 * expand the range covered by the existing extent entry that represents
+	 * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
+	 */
+	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+	if (ret) {
+		test_msg("Error adding free space: %d\n", ret);
+		return ret;
+	}
+	/* Confirm the region is marked as free. */
+	if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+		test_msg("Extent region not marked as free\n");
+		return -ENOENT;
+	}
+
+	/*
+	 * Confirm that our extent entry didn't stole all free space from the
+	 * bitmap, because of the small 8Kb free space region.
+	 */
+	ret = check_num_extents_and_bitmaps(cache, 2, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
+	 * space. Without stealing bitmap free space into extent entry space,
+	 * we would have all this free space represented by 2 entries in the
+	 * cache:
+	 *
+	 * extent entry covering range: [128Mb, 128Mb + 256Kb[
+	 * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
+	 *
+	 * Attempting to allocate the whole free space (1Mb) would fail, because
+	 * we can't allocate from multiple entries.
+	 * With the bitmap free space stealing, we get a single extent entry
+	 * that represents the 1Mb free space, and therefore we're able to
+	 * allocate the whole free space at once.
+	 */
+	if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+			       1 * 1024 * 1024)) {
+		test_msg("Expected region not marked as free\n");
+		return -ENOENT;
+	}
+
+	if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+		test_msg("Cache free space is not 1Mb + 8Kb\n");
+		return -EINVAL;
+	}
+
+	offset = btrfs_find_space_for_alloc(cache,
+					    0, 1 * 1024 * 1024, 0,
+					    &max_extent_size);
+	if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+		test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+			 offset);
+		return -EINVAL;
+	}
+
+	/* All that remains is a 8Kb free space region in a bitmap. Confirm. */
+	ret = check_num_extents_and_bitmaps(cache, 1, 1);
+	if (ret)
+		return ret;
+
+	if (cache->free_space_ctl->free_space != 8192) {
+		test_msg("Cache free space is not 8Kb\n");
+		return -EINVAL;
+	}
+
+	offset = btrfs_find_space_for_alloc(cache,
+					    0, 8192, 0,
+					    &max_extent_size);
+	if (offset != (32 * 1024 * 1024)) {
+		test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
+			 offset);
+		return -EINVAL;
+	}
+
+	ret = check_cache_empty(cache);
+	if (ret)
+		return ret;
+
+	cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	return 0;
+}
+
+int btrfs_test_free_space_cache(void)
+{
+	struct btrfs_block_group_cache *cache;
+	int ret;
+
+	test_msg("Running btrfs free space cache tests\n");
+
+	cache = init_test_block_group();
+	if (!cache) {
+		test_msg("Couldn't run the tests\n");
+		return 0;
+	}
+
+	ret = test_extents(cache);
+	if (ret)
+		goto out;
+	ret = test_bitmaps(cache);
+	if (ret)
+		goto out;
+	ret = test_bitmaps_and_extents(cache);
+	if (ret)
+		goto out;
+
+	ret = test_steal_space_from_bitmap_to_extent(cache);
+out:
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	kfree(cache->free_space_ctl);
+	kfree(cache);
+	test_msg("Free space cache tests finished\n");
+	return ret;
+}
diff --git a/kernel/fs/btrfs/tests/inode-tests.c b/kernel/fs/btrfs/tests/inode-tests.c
new file mode 100644
index 000000000..054fc0d97
--- /dev/null
+++ b/kernel/fs/btrfs/tests/inode-tests.c
@@ -0,0 +1,1123 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../btrfs_inode.h"
+#include "../disk-io.h"
+#include "../extent_io.h"
+#include "../volumes.h"
+
+static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
+			  u64 ram_bytes, u64 offset, u64 disk_bytenr,
+			  u64 disk_len, u32 type, u8 compression, int slot)
+{
+	struct btrfs_path path;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf = root->node;
+	struct btrfs_key key;
+	u32 value_len = sizeof(struct btrfs_file_extent_item);
+
+	if (type == BTRFS_FILE_EXTENT_INLINE)
+		value_len += len;
+	memset(&path, 0, sizeof(path));
+
+	path.nodes[0] = leaf;
+	path.slots[0] = slot;
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	setup_items_for_insert(root, &path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, fi, 1);
+	btrfs_set_file_extent_type(leaf, fi, type);
+	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len);
+	btrfs_set_file_extent_offset(leaf, fi, offset);
+	btrfs_set_file_extent_num_bytes(leaf, fi, len);
+	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+	btrfs_set_file_extent_compression(leaf, fi, compression);
+	btrfs_set_file_extent_encryption(leaf, fi, 0);
+	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+}
+
+static void insert_inode_item_key(struct btrfs_root *root)
+{
+	struct btrfs_path path;
+	struct extent_buffer *leaf = root->node;
+	struct btrfs_key key;
+	u32 value_len = 0;
+
+	memset(&path, 0, sizeof(path));
+
+	path.nodes[0] = leaf;
+	path.slots[0] = 0;
+
+	key.objectid = BTRFS_INODE_ITEM_KEY;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	setup_items_for_insert(root, &path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+}
+
+/*
+ * Build the most complicated map of extents the earth has ever seen.  We want
+ * this so we can test all of the corner cases of btrfs_get_extent.  Here is a
+ * diagram of how the extents will look though this may not be possible we still
+ * want to make sure everything acts normally (the last number is not inclusive)
+ *
+ * [0 - 5][5 -  6][6 - 10][10 - 4096][  4096 - 8192 ][8192 - 12288]
+ * [hole ][inline][ hole ][ regular ][regular1 split][    hole    ]
+ *
+ * [ 12288 - 20480][20480 - 24576][  24576 - 28672  ][28672 - 36864][36864 - 45056]
+ * [regular1 split][   prealloc1 ][prealloc1 written][   prealloc1 ][ compressed  ]
+ *
+ * [45056 - 49152][49152-53248][53248-61440][61440-65536][     65536+81920   ]
+ * [ compressed1 ][  regular  ][compressed1][  regular  ][ hole but no extent]
+ *
+ * [81920-86016]
+ * [  regular  ]
+ */
+static void setup_file_extents(struct btrfs_root *root)
+{
+	int slot = 0;
+	u64 disk_bytenr = 1 * 1024 * 1024;
+	u64 offset = 0;
+
+	/* First we want a hole */
+	insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+		      slot);
+	slot++;
+	offset += 5;
+
+	/*
+	 * Now we want an inline extent, I don't think this is possible but hey
+	 * why not?  Also keep in mind if we have an inline extent it counts as
+	 * the whole first page.  If we were to expand it we would have to cow
+	 * and we wouldn't have an inline extent anymore.
+	 */
+	insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+		      slot);
+	slot++;
+	offset = 4096;
+
+	/* Now another hole */
+	insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+		      slot);
+	slot++;
+	offset += 4;
+
+	/* Now for a regular extent */
+	insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	disk_bytenr += 4096;
+	offset += 4095;
+
+	/*
+	 * Now for 3 extents that were split from a hole punch so we test
+	 * offsets properly.
+	 */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
+		      0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 16384;
+
+	/* Now for a unwritten prealloc extent */
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 4096;
+
+	/*
+	 * We want to jack up disk_bytenr a little more so the em stuff doesn't
+	 * merge our records.
+	 */
+	disk_bytenr += 8192;
+
+	/*
+	 * Now for a partially written prealloc extent, basically the same as
+	 * the hole punch example above.  Ram_bytes never changes when you mark
+	 * extents written btw.
+	 */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 16384;
+
+	/* Now a normal compressed extent */
+	insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 8192;
+	/* No merges */
+	disk_bytenr += 8192;
+
+	/* Now a split compressed extent */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 8192;
+
+	/* Now extents that have a hole but no hole extent */
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 16384;
+	disk_bytenr += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+}
+
+static unsigned long prealloc_only = 0;
+static unsigned long compressed_only = 0;
+static unsigned long vacancy_only = 0;
+
+static noinline int test_btrfs_get_extent(void)
+{
+	struct inode *inode = NULL;
+	struct btrfs_root *root = NULL;
+	struct extent_map *em = NULL;
+	u64 orig_start;
+	u64 disk_bytenr;
+	u64 offset;
+	int ret = -ENOMEM;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Couldn't allocate inode\n");
+		return ret;
+	}
+
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	BTRFS_I(inode)->location.offset = 0;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		goto out;
+	}
+
+	/*
+	 * We do this since btrfs_get_extent wants to assign em->bdev to
+	 * root->fs_info->fs_devices->latest_bdev.
+	 */
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		goto out;
+	}
+
+	root->node = alloc_dummy_extent_buffer(NULL, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		goto out;
+	}
+
+	/*
+	 * We will just free a dummy node if it's ref count is 2 so we need an
+	 * extra ref so our searches don't accidently release our page.
+	 */
+	extent_buffer_get(root->node);
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	ret = -EINVAL;
+
+	/* First with no extents */
+	BTRFS_I(inode)->root = root;
+	em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+	if (IS_ERR(em)) {
+		em = NULL;
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+		test_msg("Vacancy flag wasn't set properly\n");
+		goto out;
+	}
+	free_extent_map(em);
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+
+	/*
+	 * All of the magic numbers are based on the mapping setup in
+	 * setup_file_extents, so if you change anything there you need to
+	 * update the comment and update the expected values below.
+	 */
+	setup_file_extents(root);
+
+	em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 0 || em->len != 5) {
+		test_msg("Unexpected extent wanted start 0 len 5, got start "
+			 "%llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_INLINE) {
+		test_msg("Expected an inline, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4091) {
+		test_msg("Unexpected extent wanted start %llu len 1, got start "
+			 "%llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	/*
+	 * We don't test anything else for inline since it doesn't get set
+	 * unless we have a page for it to write into.  Maybe we should change
+	 * this?
+	 */
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4) {
+		test_msg("Unexpected extent wanted start %llu len 4, got start "
+			 "%llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Regular extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4095) {
+		test_msg("Unexpected extent wanted start %llu len 4095, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* The next 3 are split extents */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 orig_start, em->orig_start);
+		goto out;
+	}
+	disk_bytenr += (em->start - orig_start);
+	if (em->block_start != disk_bytenr) {
+		test_msg("Wrong block start, want %llu, have %llu\n",
+			 disk_bytenr, em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Prealloc extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* The next 3 are a half written prealloc extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_HOLE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+			 orig_start, em->orig_start);
+		goto out;
+	}
+	if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+		test_msg("Unexpected block start, wanted %llu, have %llu\n",
+			 disk_bytenr + (em->start - em->orig_start),
+			 em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+			 em->orig_start);
+		goto out;
+	}
+	if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+		test_msg("Unexpected block start, wanted %llu, have %llu\n",
+			 disk_bytenr + (em->start - em->orig_start),
+			 em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Now for the compressed extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, em->orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Split compressed extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, em->orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != disk_bytenr) {
+		test_msg("Block start does not match, want %llu got %llu\n",
+			 disk_bytenr, em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* A hole between regular extents but no hole extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	/*
+	 * Currently we just return a length that we requested rather than the
+	 * length of the actual hole, if this changes we'll have to change this
+	 * test.
+	 */
+	if (em->start != offset || em->len != 12288) {
+		test_msg("Unexpected extent wanted start %llu len 12288, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != vacancy_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 vacancy_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	ret = 0;
+out:
+	if (!IS_ERR(em))
+		free_extent_map(em);
+	iput(inode);
+	btrfs_free_dummy_root(root);
+	return ret;
+}
+
+static int test_hole_first(void)
+{
+	struct inode *inode = NULL;
+	struct btrfs_root *root = NULL;
+	struct extent_map *em = NULL;
+	int ret = -ENOMEM;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Couldn't allocate inode\n");
+		return ret;
+	}
+
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	BTRFS_I(inode)->location.offset = 0;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		goto out;
+	}
+
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		goto out;
+	}
+
+	root->node = alloc_dummy_extent_buffer(NULL, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		goto out;
+	}
+
+	extent_buffer_get(root->node);
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	BTRFS_I(inode)->root = root;
+	ret = -EINVAL;
+
+	/*
+	 * Need a blank inode item here just so we don't confuse
+	 * btrfs_get_extent.
+	 */
+	insert_inode_item_key(root);
+	insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, 1);
+	em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 0 || em->len != 4096) {
+		test_msg("Unexpected extent wanted start 0 len 4096, got start "
+			 "%llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != vacancy_only) {
+		test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+			 em->flags);
+		goto out;
+	}
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != 4096) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 4096 || em->len != 4096) {
+		test_msg("Unexpected extent wanted start 4096 len 4096, got "
+			 "start %llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, wanted 0 got %lu\n",
+			 em->flags);
+		goto out;
+	}
+	ret = 0;
+out:
+	if (!IS_ERR(em))
+		free_extent_map(em);
+	iput(inode);
+	btrfs_free_dummy_root(root);
+	return ret;
+}
+
+static int test_extent_accounting(void)
+{
+	struct inode *inode = NULL;
+	struct btrfs_root *root = NULL;
+	int ret = -ENOMEM;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Couldn't allocate inode\n");
+		return ret;
+	}
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		goto out;
+	}
+
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		goto out;
+	}
+
+	BTRFS_I(inode)->root = root;
+	btrfs_test_inode_set_ops(inode);
+
+	/* [BTRFS_MAX_EXTENT_SIZE] */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
+					NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 1) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 1, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE][4k] */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+					BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 2) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 2, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
+	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+			       BTRFS_MAX_EXTENT_SIZE >> 1,
+			       (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+			       EXTENT_DELALLOC | EXTENT_DIRTY |
+			       EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
+			       NULL, GFP_NOFS);
+	if (ret) {
+		test_msg("clear_extent_bit returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 2) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 2, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE][4K] */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+					(BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+					NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 2) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 2, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/*
+	 * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
+	 *
+	 * I'm artificially adding 2 to outstanding_extents because in the
+	 * buffered IO case we'd add things up as we go, but I don't feel like
+	 * doing that here, this isn't the interesting case we want to test.
+	 */
+	BTRFS_I(inode)->outstanding_extents += 2;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
+					(BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
+					NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 4) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 4, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+					BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 3) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 3, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
+	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+			       BTRFS_MAX_EXTENT_SIZE+4096,
+			       BTRFS_MAX_EXTENT_SIZE+8191,
+			       EXTENT_DIRTY | EXTENT_DELALLOC |
+			       EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+			       NULL, GFP_NOFS);
+	if (ret) {
+		test_msg("clear_extent_bit returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 4) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 4, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/*
+	 * Refill the hole again just for good measure, because I thought it
+	 * might fail and I'd rather satisfy my paranoia at this point.
+	 */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+					BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 3) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 3, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* Empty */
+	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+			       EXTENT_DIRTY | EXTENT_DELALLOC |
+			       EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+			       NULL, GFP_NOFS);
+	if (ret) {
+		test_msg("clear_extent_bit returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 0, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+	ret = 0;
+out:
+	if (ret)
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+				 NULL, GFP_NOFS);
+	iput(inode);
+	btrfs_free_dummy_root(root);
+	return ret;
+}
+
+int btrfs_test_inodes(void)
+{
+	int ret;
+
+	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+	set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
+	test_msg("Running btrfs_get_extent tests\n");
+	ret = test_btrfs_get_extent();
+	if (ret)
+		return ret;
+	test_msg("Running hole first btrfs_get_extent test\n");
+	ret = test_hole_first();
+	if (ret)
+		return ret;
+	test_msg("Running outstanding_extents tests\n");
+	return test_extent_accounting();
+}
diff --git a/kernel/fs/btrfs/tests/qgroup-tests.c b/kernel/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 000000000..c32a7ba76
--- /dev/null
+++ b/kernel/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright (C) 2013 Facebook.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../transaction.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static void init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+	memset(trans, 0, sizeof(*trans));
+	trans->transid = 1;
+	INIT_LIST_HEAD(&trans->qgroup_ref_list);
+	trans->type = __TRANS_DUMMY;
+}
+
+static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
+				  u64 num_bytes, u64 parent, u64 root_objectid)
+{
+	struct btrfs_trans_handle trans;
+	struct btrfs_extent_item *item;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_tree_block_info *block_info;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key ins;
+	u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
+	int ret;
+
+	init_dummy_trans(&trans);
+
+	ins.objectid = bytenr;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ins.offset = num_bytes;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Couldn't allocate path\n");
+		return -ENOMEM;
+	}
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
+	if (ret) {
+		test_msg("Couldn't insert ref %d\n", ret);
+		btrfs_free_path(path);
+		return ret;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	btrfs_set_extent_refs(leaf, item, 1);
+	btrfs_set_extent_generation(leaf, item, 1);
+	btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
+	block_info = (struct btrfs_tree_block_info *)(item + 1);
+	btrfs_set_tree_block_level(leaf, block_info, 1);
+	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+	if (parent > 0) {
+		btrfs_set_extent_inline_ref_type(leaf, iref,
+						 BTRFS_SHARED_BLOCK_REF_KEY);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else {
+		btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
+static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+			u64 parent, u64 root_objectid)
+{
+	struct btrfs_trans_handle trans;
+	struct btrfs_extent_item *item;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	u64 refs;
+	int ret;
+
+	init_dummy_trans(&trans);
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Couldn't allocate path\n");
+		return -ENOMEM;
+	}
+
+	path->leave_spinning = 1;
+	ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+	if (ret) {
+		test_msg("Couldn't find extent ref\n");
+		btrfs_free_path(path);
+		return ret;
+	}
+
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_extent_item);
+	refs = btrfs_extent_refs(path->nodes[0], item);
+	btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
+	btrfs_release_path(path);
+
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_TREE_BLOCK_REF_KEY;
+		key.offset = root_objectid;
+	}
+
+	ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
+	if (ret)
+		test_msg("Failed to insert backref\n");
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
+			      u64 num_bytes)
+{
+	struct btrfs_trans_handle trans;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	int ret;
+
+	init_dummy_trans(&trans);
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Couldn't allocate path\n");
+		return -ENOMEM;
+	}
+	path->leave_spinning = 1;
+
+	ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+	if (ret) {
+		test_msg("Didn't find our key %d\n", ret);
+		btrfs_free_path(path);
+		return ret;
+	}
+	btrfs_del_item(&trans, root, path);
+	btrfs_free_path(path);
+	return 0;
+}
+
+static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 parent, u64 root_objectid)
+{
+	struct btrfs_trans_handle trans;
+	struct btrfs_extent_item *item;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	u64 refs;
+	int ret;
+
+	init_dummy_trans(&trans);
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Couldn't allocate path\n");
+		return -ENOMEM;
+	}
+
+	path->leave_spinning = 1;
+	ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+	if (ret) {
+		test_msg("Couldn't find extent ref\n");
+		btrfs_free_path(path);
+		return ret;
+	}
+
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_extent_item);
+	refs = btrfs_extent_refs(path->nodes[0], item);
+	btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
+	btrfs_release_path(path);
+
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_TREE_BLOCK_REF_KEY;
+		key.offset = root_objectid;
+	}
+
+	ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+	if (ret) {
+		test_msg("Couldn't find backref %d\n", ret);
+		btrfs_free_path(path);
+		return ret;
+	}
+	btrfs_del_item(&trans, root, path);
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int test_no_shared_qgroup(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret;
+
+	init_dummy_trans(&trans);
+
+	test_msg("Qgroup basic add\n");
+	ret = btrfs_create_qgroup(NULL, fs_info, 5);
+	if (ret) {
+		test_msg("Couldn't create a qgroup %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+	if (ret) {
+		test_msg("Couldn't add space to a qgroup %d\n", ret);
+		return ret;
+	}
+
+	ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	if (ret) {
+		test_msg("Delayed qgroup accounting failed %d\n", ret);
+		return ret;
+	}
+
+	if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+		test_msg("Qgroup counts didn't match expected values\n");
+		return -EINVAL;
+	}
+
+	ret = remove_extent_item(root, 4096, 4096);
+	if (ret)
+		return -EINVAL;
+
+	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+				      BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+	if (ret) {
+		test_msg("Couldn't remove space from the qgroup %d\n", ret);
+		return -EINVAL;
+	}
+
+	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	if (ret) {
+		test_msg("Qgroup accounting failed %d\n", ret);
+		return -EINVAL;
+	}
+
+	if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
+		test_msg("Qgroup counts didn't match expected values\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Add a ref for two different roots to make sure the shared value comes out
+ * right, also remove one of the roots and make sure the exclusive count is
+ * adjusted properly.
+ */
+static int test_multiple_refs(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret;
+
+	init_dummy_trans(&trans);
+
+	test_msg("Qgroup multiple refs test\n");
+
+	/* We have 5 created already from the previous test */
+	ret = btrfs_create_qgroup(NULL, fs_info, 256);
+	if (ret) {
+		test_msg("Couldn't create a qgroup %d\n", ret);
+		return ret;
+	}
+
+	ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+	if (ret)
+		return ret;
+
+	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+	if (ret) {
+		test_msg("Couldn't add space to a qgroup %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	if (ret) {
+		test_msg("Delayed qgroup accounting failed %d\n", ret);
+		return ret;
+	}
+
+	if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+		test_msg("Qgroup counts didn't match expected values\n");
+		return -EINVAL;
+	}
+
+	ret = add_tree_ref(root, 4096, 4096, 0, 256);
+	if (ret)
+		return ret;
+
+	ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+				      BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+	if (ret) {
+		test_msg("Qgroup record ref failed %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	if (ret) {
+		test_msg("Qgroup accounting failed %d\n", ret);
+		return ret;
+	}
+
+	if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
+		test_msg("Qgroup counts didn't match expected values\n");
+		return -EINVAL;
+	}
+
+	if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
+		test_msg("Qgroup counts didn't match expected values\n");
+		return -EINVAL;
+	}
+
+	ret = remove_extent_ref(root, 4096, 4096, 0, 256);
+	if (ret)
+		return ret;
+
+	ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+				      BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+	if (ret) {
+		test_msg("Qgroup record ref failed %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	if (ret) {
+		test_msg("Qgroup accounting failed %d\n", ret);
+		return ret;
+	}
+
+	if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
+		test_msg("Qgroup counts didn't match expected values\n");
+		return -EINVAL;
+	}
+
+	if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+		test_msg("Qgroup counts didn't match expected values\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int btrfs_test_qgroups(void)
+{
+	struct btrfs_root *root;
+	struct btrfs_root *tmp_root;
+	int ret = 0;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		return PTR_ERR(root);
+	}
+
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	/* We are using this root as our extent root */
+	root->fs_info->extent_root = root;
+
+	/*
+	 * Some of the paths we test assume we have a filled out fs_info, so we
+	 * just need to add the root in there so we don't panic.
+	 */
+	root->fs_info->tree_root = root;
+	root->fs_info->quota_root = root;
+	root->fs_info->quota_enabled = 1;
+
+	/*
+	 * Can't use bytenr 0, some things freak out
+	 * *cough*backref walking code*cough*
+	 */
+	root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_nritems(root->node, 0);
+	root->alloc_bytenr += 8192;
+
+	tmp_root = btrfs_alloc_dummy_root();
+	if (IS_ERR(tmp_root)) {
+		test_msg("Couldn't allocate a fs root\n");
+		ret = PTR_ERR(tmp_root);
+		goto out;
+	}
+
+	tmp_root->root_key.objectid = 5;
+	root->fs_info->fs_root = tmp_root;
+	ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+	if (ret) {
+		test_msg("Couldn't insert fs root %d\n", ret);
+		goto out;
+	}
+
+	tmp_root = btrfs_alloc_dummy_root();
+	if (IS_ERR(tmp_root)) {
+		test_msg("Couldn't allocate a fs root\n");
+		ret = PTR_ERR(tmp_root);
+		goto out;
+	}
+
+	tmp_root->root_key.objectid = 256;
+	ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+	if (ret) {
+		test_msg("Couldn't insert fs root %d\n", ret);
+		goto out;
+	}
+
+	test_msg("Running qgroup tests\n");
+	ret = test_no_shared_qgroup(root);
+	if (ret)
+		goto out;
+	ret = test_multiple_refs(root);
+out:
+	btrfs_free_dummy_root(root);
+	return ret;
+}
diff --git a/kernel/fs/btrfs/transaction.c b/kernel/fs/btrfs/transaction.c
new file mode 100644
index 000000000..5628e2525
--- /dev/null
+++ b/kernel/fs/btrfs/transaction.c
@@ -0,0 +1,2204 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/uuid.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "locking.h"
+#include "tree-log.h"
+#include "inode-map.h"
+#include "volumes.h"
+#include "dev-replace.h"
+#include "qgroup.h"
+
+#define BTRFS_ROOT_TRANS_TAG 0
+
+static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+	[TRANS_STATE_RUNNING]		= 0U,
+	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
+					   __TRANS_START),
+	[TRANS_STATE_COMMIT_START]	= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH),
+	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN),
+	[TRANS_STATE_UNBLOCKED]		= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN |
+					   __TRANS_JOIN_NOLOCK),
+	[TRANS_STATE_COMPLETED]		= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN |
+					   __TRANS_JOIN_NOLOCK),
+};
+
+void btrfs_put_transaction(struct btrfs_transaction *transaction)
+{
+	WARN_ON(atomic_read(&transaction->use_count) == 0);
+	if (atomic_dec_and_test(&transaction->use_count)) {
+		BUG_ON(!list_empty(&transaction->list));
+		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+		if (transaction->delayed_refs.pending_csums)
+			printk(KERN_ERR "pending csums is %llu\n",
+			       transaction->delayed_refs.pending_csums);
+		while (!list_empty(&transaction->pending_chunks)) {
+			struct extent_map *em;
+
+			em = list_first_entry(&transaction->pending_chunks,
+					      struct extent_map, list);
+			list_del_init(&em->list);
+			free_extent_map(em);
+		}
+		kmem_cache_free(btrfs_transaction_cachep, transaction);
+	}
+}
+
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+	spin_lock(&tree->lock);
+	while (!RB_EMPTY_ROOT(&tree->state)) {
+		struct rb_node *node;
+		struct extent_state *state;
+
+		node = rb_first(&tree->state);
+		state = rb_entry(node, struct extent_state, rb_node);
+		rb_erase(&state->rb_node, &tree->state);
+		RB_CLEAR_NODE(&state->rb_node);
+		/*
+		 * btree io trees aren't supposed to have tasks waiting for
+		 * changes in the flags of extent states ever.
+		 */
+		ASSERT(!waitqueue_active(&state->wq));
+		free_extent_state(state);
+
+		cond_resched_lock(&tree->lock);
+	}
+	spin_unlock(&tree->lock);
+}
+
+static noinline void switch_commit_roots(struct btrfs_transaction *trans,
+					 struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root, *tmp;
+
+	down_write(&fs_info->commit_root_sem);
+	list_for_each_entry_safe(root, tmp, &trans->switch_commits,
+				 dirty_list) {
+		list_del_init(&root->dirty_list);
+		free_extent_buffer(root->commit_root);
+		root->commit_root = btrfs_root_node(root);
+		if (is_fstree(root->objectid))
+			btrfs_unpin_free_ino(root);
+		clear_btree_io_tree(&root->dirty_log_pages);
+	}
+	up_write(&fs_info->commit_root_sem);
+}
+
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+					 unsigned int type)
+{
+	if (type & TRANS_EXTWRITERS)
+		atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+					 unsigned int type)
+{
+	if (type & TRANS_EXTWRITERS)
+		atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+					  unsigned int type)
+{
+	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
+{
+	return atomic_read(&trans->num_extwriters);
+}
+
+/*
+ * either allocate a new transaction or hop into the existing one
+ */
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
+{
+	struct btrfs_transaction *cur_trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	spin_lock(&fs_info->trans_lock);
+loop:
+	/* The file system has been taken offline. No new transactions. */
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		spin_unlock(&fs_info->trans_lock);
+		return -EROFS;
+	}
+
+	cur_trans = fs_info->running_transaction;
+	if (cur_trans) {
+		if (cur_trans->aborted) {
+			spin_unlock(&fs_info->trans_lock);
+			return cur_trans->aborted;
+		}
+		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
+			spin_unlock(&fs_info->trans_lock);
+			return -EBUSY;
+		}
+		atomic_inc(&cur_trans->use_count);
+		atomic_inc(&cur_trans->num_writers);
+		extwriter_counter_inc(cur_trans, type);
+		spin_unlock(&fs_info->trans_lock);
+		return 0;
+	}
+	spin_unlock(&fs_info->trans_lock);
+
+	/*
+	 * If we are ATTACH, we just want to catch the current transaction,
+	 * and commit it. If there is no transaction, just return ENOENT.
+	 */
+	if (type == TRANS_ATTACH)
+		return -ENOENT;
+
+	/*
+	 * JOIN_NOLOCK only happens during the transaction commit, so
+	 * it is impossible that ->running_transaction is NULL
+	 */
+	BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+	if (!cur_trans)
+		return -ENOMEM;
+
+	spin_lock(&fs_info->trans_lock);
+	if (fs_info->running_transaction) {
+		/*
+		 * someone started a transaction after we unlocked.  Make sure
+		 * to redo the checks above
+		 */
+		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+		goto loop;
+	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		spin_unlock(&fs_info->trans_lock);
+		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+		return -EROFS;
+	}
+
+	atomic_set(&cur_trans->num_writers, 1);
+	extwriter_counter_init(cur_trans, type);
+	init_waitqueue_head(&cur_trans->writer_wait);
+	init_waitqueue_head(&cur_trans->commit_wait);
+	cur_trans->state = TRANS_STATE_RUNNING;
+	/*
+	 * One for this trans handle, one so it will live on until we
+	 * commit the transaction.
+	 */
+	atomic_set(&cur_trans->use_count, 2);
+	cur_trans->have_free_bgs = 0;
+	cur_trans->start_time = get_seconds();
+	cur_trans->dirty_bg_run = 0;
+
+	cur_trans->delayed_refs.href_root = RB_ROOT;
+	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
+	cur_trans->delayed_refs.num_heads_ready = 0;
+	cur_trans->delayed_refs.pending_csums = 0;
+	cur_trans->delayed_refs.num_heads = 0;
+	cur_trans->delayed_refs.flushing = 0;
+	cur_trans->delayed_refs.run_delayed_start = 0;
+
+	/*
+	 * although the tree mod log is per file system and not per transaction,
+	 * the log must never go across transaction boundaries.
+	 */
+	smp_mb();
+	if (!list_empty(&fs_info->tree_mod_seq_list))
+		WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when "
+			"creating a fresh transaction\n");
+	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
+		WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when "
+			"creating a fresh transaction\n");
+	atomic64_set(&fs_info->tree_mod_seq, 0);
+
+	spin_lock_init(&cur_trans->delayed_refs.lock);
+
+	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+	INIT_LIST_HEAD(&cur_trans->pending_chunks);
+	INIT_LIST_HEAD(&cur_trans->switch_commits);
+	INIT_LIST_HEAD(&cur_trans->pending_ordered);
+	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+	INIT_LIST_HEAD(&cur_trans->io_bgs);
+	mutex_init(&cur_trans->cache_write_mutex);
+	cur_trans->num_dirty_bgs = 0;
+	spin_lock_init(&cur_trans->dirty_bgs_lock);
+	list_add_tail(&cur_trans->list, &fs_info->trans_list);
+	extent_io_tree_init(&cur_trans->dirty_pages,
+			     fs_info->btree_inode->i_mapping);
+	fs_info->generation++;
+	cur_trans->transid = fs_info->generation;
+	fs_info->running_transaction = cur_trans;
+	cur_trans->aborted = 0;
+	spin_unlock(&fs_info->trans_lock);
+
+	return 0;
+}
+
+/*
+ * this does all the record keeping required to make sure that a reference
+ * counted root is properly recorded in a given transaction.  This is required
+ * to make sure the old root from before we joined the transaction is deleted
+ * when the transaction commits
+ */
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	    root->last_trans < trans->transid) {
+		WARN_ON(root == root->fs_info->extent_root);
+		WARN_ON(root->commit_root != root->node);
+
+		/*
+		 * see below for IN_TRANS_SETUP usage rules
+		 * we have the reloc mutex held now, so there
+		 * is only one writer in this function
+		 */
+		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
+
+		/* make sure readers find IN_TRANS_SETUP before
+		 * they find our root->last_trans update
+		 */
+		smp_wmb();
+
+		spin_lock(&root->fs_info->fs_roots_radix_lock);
+		if (root->last_trans == trans->transid) {
+			spin_unlock(&root->fs_info->fs_roots_radix_lock);
+			return 0;
+		}
+		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+			   (unsigned long)root->root_key.objectid,
+			   BTRFS_ROOT_TRANS_TAG);
+		spin_unlock(&root->fs_info->fs_roots_radix_lock);
+		root->last_trans = trans->transid;
+
+		/* this is pretty tricky.  We don't want to
+		 * take the relocation lock in btrfs_record_root_in_trans
+		 * unless we're really doing the first setup for this root in
+		 * this transaction.
+		 *
+		 * Normally we'd use root->last_trans as a flag to decide
+		 * if we want to take the expensive mutex.
+		 *
+		 * But, we have to set root->last_trans before we
+		 * init the relocation root, otherwise, we trip over warnings
+		 * in ctree.c.  The solution used here is to flag ourselves
+		 * with root IN_TRANS_SETUP.  When this is 1, we're still
+		 * fixing up the reloc trees and everyone must wait.
+		 *
+		 * When this is zero, they can trust root->last_trans and fly
+		 * through btrfs_record_root_in_trans without having to take the
+		 * lock.  smp_wmb() makes sure that all the writes above are
+		 * done before we pop in the zero below
+		 */
+		btrfs_init_reloc_root(trans, root);
+		smp_mb__before_atomic();
+		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
+	}
+	return 0;
+}
+
+
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		return 0;
+
+	/*
+	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
+	 * and barriers
+	 */
+	smp_rmb();
+	if (root->last_trans == trans->transid &&
+	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
+		return 0;
+
+	mutex_lock(&root->fs_info->reloc_mutex);
+	record_root_in_trans(trans, root);
+	mutex_unlock(&root->fs_info->reloc_mutex);
+
+	return 0;
+}
+
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+	return (trans->state >= TRANS_STATE_BLOCKED &&
+		trans->state < TRANS_STATE_UNBLOCKED &&
+		!trans->aborted);
+}
+
+/* wait for commit against the current transaction to become unblocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ */
+static void wait_current_trans(struct btrfs_root *root)
+{
+	struct btrfs_transaction *cur_trans;
+
+	spin_lock(&root->fs_info->trans_lock);
+	cur_trans = root->fs_info->running_transaction;
+	if (cur_trans && is_transaction_blocked(cur_trans)) {
+		atomic_inc(&cur_trans->use_count);
+		spin_unlock(&root->fs_info->trans_lock);
+
+		wait_event(root->fs_info->transaction_wait,
+			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+			   cur_trans->aborted);
+		btrfs_put_transaction(cur_trans);
+	} else {
+		spin_unlock(&root->fs_info->trans_lock);
+	}
+}
+
+static int may_wait_transaction(struct btrfs_root *root, int type)
+{
+	if (root->fs_info->log_root_recovering)
+		return 0;
+
+	if (type == TRANS_USERSPACE)
+		return 1;
+
+	if (type == TRANS_START &&
+	    !atomic_read(&root->fs_info->open_ioctl_trans))
+		return 1;
+
+	return 0;
+}
+
+static inline bool need_reserve_reloc_root(struct btrfs_root *root)
+{
+	if (!root->fs_info->reloc_ctl ||
+	    !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+	    root->reloc_root)
+		return false;
+
+	return true;
+}
+
+static struct btrfs_trans_handle *
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
+		  enum btrfs_reserve_flush_enum flush)
+{
+	struct btrfs_trans_handle *h;
+	struct btrfs_transaction *cur_trans;
+	u64 num_bytes = 0;
+	u64 qgroup_reserved = 0;
+	bool reloc_reserved = false;
+	int ret;
+
+	/* Send isn't supposed to start transactions. */
+	ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+		return ERR_PTR(-EROFS);
+
+	if (current->journal_info) {
+		WARN_ON(type & TRANS_EXTWRITERS);
+		h = current->journal_info;
+		h->use_count++;
+		WARN_ON(h->use_count > 2);
+		h->orig_rsv = h->block_rsv;
+		h->block_rsv = NULL;
+		goto got_it;
+	}
+
+	/*
+	 * Do the reservation before we join the transaction so we can do all
+	 * the appropriate flushing if need be.
+	 */
+	if (num_items > 0 && root != root->fs_info->chunk_root) {
+		if (root->fs_info->quota_enabled &&
+		    is_fstree(root->root_key.objectid)) {
+			qgroup_reserved = num_items * root->nodesize;
+			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
+			if (ret)
+				return ERR_PTR(ret);
+		}
+
+		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+		/*
+		 * Do the reservation for the relocation root creation
+		 */
+		if (need_reserve_reloc_root(root)) {
+			num_bytes += root->nodesize;
+			reloc_reserved = true;
+		}
+
+		ret = btrfs_block_rsv_add(root,
+					  &root->fs_info->trans_block_rsv,
+					  num_bytes, flush);
+		if (ret)
+			goto reserve_fail;
+	}
+again:
+	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+	if (!h) {
+		ret = -ENOMEM;
+		goto alloc_fail;
+	}
+
+	/*
+	 * If we are JOIN_NOLOCK we're already committing a transaction and
+	 * waiting on this guy, so we don't need to do the sb_start_intwrite
+	 * because we're already holding a ref.  We need this because we could
+	 * have raced in and did an fsync() on a file which can kick a commit
+	 * and then we deadlock with somebody doing a freeze.
+	 *
+	 * If we are ATTACH, it means we just want to catch the current
+	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
+	 */
+	if (type & __TRANS_FREEZABLE)
+		sb_start_intwrite(root->fs_info->sb);
+
+	if (may_wait_transaction(root, type))
+		wait_current_trans(root);
+
+	do {
+		ret = join_transaction(root, type);
+		if (ret == -EBUSY) {
+			wait_current_trans(root);
+			if (unlikely(type == TRANS_ATTACH))
+				ret = -ENOENT;
+		}
+	} while (ret == -EBUSY);
+
+	if (ret < 0) {
+		/* We must get the transaction if we are JOIN_NOLOCK. */
+		BUG_ON(type == TRANS_JOIN_NOLOCK);
+		goto join_fail;
+	}
+
+	cur_trans = root->fs_info->running_transaction;
+
+	h->transid = cur_trans->transid;
+	h->transaction = cur_trans;
+	h->blocks_used = 0;
+	h->bytes_reserved = 0;
+	h->root = root;
+	h->delayed_ref_updates = 0;
+	h->use_count = 1;
+	h->adding_csums = 0;
+	h->block_rsv = NULL;
+	h->orig_rsv = NULL;
+	h->aborted = 0;
+	h->qgroup_reserved = 0;
+	h->delayed_ref_elem.seq = 0;
+	h->type = type;
+	h->allocating_chunk = false;
+	h->reloc_reserved = false;
+	h->sync = false;
+	INIT_LIST_HEAD(&h->qgroup_ref_list);
+	INIT_LIST_HEAD(&h->new_bgs);
+	INIT_LIST_HEAD(&h->ordered);
+
+	smp_mb();
+	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+	    may_wait_transaction(root, type)) {
+		current->journal_info = h;
+		btrfs_commit_transaction(h, root);
+		goto again;
+	}
+
+	if (num_bytes) {
+		trace_btrfs_space_reservation(root->fs_info, "transaction",
+					      h->transid, num_bytes, 1);
+		h->block_rsv = &root->fs_info->trans_block_rsv;
+		h->bytes_reserved = num_bytes;
+		h->reloc_reserved = reloc_reserved;
+	}
+	h->qgroup_reserved = qgroup_reserved;
+
+got_it:
+	btrfs_record_root_in_trans(h, root);
+
+	if (!current->journal_info && type != TRANS_USERSPACE)
+		current->journal_info = h;
+	return h;
+
+join_fail:
+	if (type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
+	kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+	if (num_bytes)
+		btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+					num_bytes);
+reserve_fail:
+	if (qgroup_reserved)
+		btrfs_qgroup_free(root, qgroup_reserved);
+	return ERR_PTR(ret);
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+						   int num_items)
+{
+	return start_transaction(root, num_items, TRANS_START,
+				 BTRFS_RESERVE_FLUSH_ALL);
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
+					struct btrfs_root *root, int num_items)
+{
+	return start_transaction(root, num_items, TRANS_START,
+				 BTRFS_RESERVE_FLUSH_LIMIT);
+}
+
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_JOIN, 0);
+}
+
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
+}
+
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_USERSPACE, 0);
+}
+
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current the transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function return -ENOENT, it just means there is no
+ * running transaction. But it is possible that the inactive transaction
+ * is still in the memory, not fully on disk. If you hope there is no
+ * inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ *     btrfs_attach_transaction_barrier()
+ */
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_ATTACH, 0);
+}
+
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to the above function, the differentia is this one
+ * will wait for all the inactive transactions until they fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+
+	trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+	if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
+		btrfs_wait_for_commit(root, 0);
+
+	return trans;
+}
+
+/* wait for a transaction commit to be fully complete */
+static noinline void wait_for_commit(struct btrfs_root *root,
+				    struct btrfs_transaction *commit)
+{
+	wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
+}
+
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
+{
+	struct btrfs_transaction *cur_trans = NULL, *t;
+	int ret = 0;
+
+	if (transid) {
+		if (transid <= root->fs_info->last_trans_committed)
+			goto out;
+
+		/* find specified transaction */
+		spin_lock(&root->fs_info->trans_lock);
+		list_for_each_entry(t, &root->fs_info->trans_list, list) {
+			if (t->transid == transid) {
+				cur_trans = t;
+				atomic_inc(&cur_trans->use_count);
+				ret = 0;
+				break;
+			}
+			if (t->transid > transid) {
+				ret = 0;
+				break;
+			}
+		}
+		spin_unlock(&root->fs_info->trans_lock);
+
+		/*
+		 * The specified transaction doesn't exist, or we
+		 * raced with btrfs_commit_transaction
+		 */
+		if (!cur_trans) {
+			if (transid > root->fs_info->last_trans_committed)
+				ret = -EINVAL;
+			goto out;
+		}
+	} else {
+		/* find newest transaction that is committing | committed */
+		spin_lock(&root->fs_info->trans_lock);
+		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
+					    list) {
+			if (t->state >= TRANS_STATE_COMMIT_START) {
+				if (t->state == TRANS_STATE_COMPLETED)
+					break;
+				cur_trans = t;
+				atomic_inc(&cur_trans->use_count);
+				break;
+			}
+		}
+		spin_unlock(&root->fs_info->trans_lock);
+		if (!cur_trans)
+			goto out;  /* nothing committing|committed */
+	}
+
+	wait_for_commit(root, cur_trans);
+	btrfs_put_transaction(cur_trans);
+out:
+	return ret;
+}
+
+void btrfs_throttle(struct btrfs_root *root)
+{
+	if (!atomic_read(&root->fs_info->open_ioctl_trans))
+		wait_current_trans(root);
+}
+
+static int should_end_transaction(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root)
+{
+	if (root->fs_info->global_block_rsv.space_info->full &&
+	    btrfs_check_space_for_delayed_refs(trans, root))
+		return 1;
+
+	return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
+}
+
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int updates;
+	int err;
+
+	smp_mb();
+	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+	    cur_trans->delayed_refs.flushing)
+		return 1;
+
+	updates = trans->delayed_ref_updates;
+	trans->delayed_ref_updates = 0;
+	if (updates) {
+		err = btrfs_run_delayed_refs(trans, root, updates * 2);
+		if (err) /* Error code will also eval true */
+			return err;
+	}
+
+	return should_end_transaction(trans, root);
+}
+
+static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, int throttle)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct btrfs_fs_info *info = root->fs_info;
+	unsigned long cur = trans->delayed_ref_updates;
+	int lock = (trans->type != TRANS_JOIN_NOLOCK);
+	int err = 0;
+	int must_run_delayed_refs = 0;
+
+	if (trans->use_count > 1) {
+		trans->use_count--;
+		trans->block_rsv = trans->orig_rsv;
+		return 0;
+	}
+
+	btrfs_trans_release_metadata(trans, root);
+	trans->block_rsv = NULL;
+
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
+	if (!list_empty(&trans->ordered)) {
+		spin_lock(&info->trans_lock);
+		list_splice(&trans->ordered, &cur_trans->pending_ordered);
+		spin_unlock(&info->trans_lock);
+	}
+
+	trans->delayed_ref_updates = 0;
+	if (!trans->sync) {
+		must_run_delayed_refs =
+			btrfs_should_throttle_delayed_refs(trans, root);
+		cur = max_t(unsigned long, cur, 32);
+
+		/*
+		 * don't make the caller wait if they are from a NOLOCK
+		 * or ATTACH transaction, it will deadlock with commit
+		 */
+		if (must_run_delayed_refs == 1 &&
+		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
+			must_run_delayed_refs = 2;
+	}
+
+	if (trans->qgroup_reserved) {
+		/*
+		 * the same root has to be passed here between start_transaction
+		 * and end_transaction. Subvolume quota depends on this.
+		 */
+		btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
+	}
+
+	btrfs_trans_release_metadata(trans, root);
+	trans->block_rsv = NULL;
+
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
+	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
+	    should_end_transaction(trans, root) &&
+	    ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+		spin_lock(&info->trans_lock);
+		if (cur_trans->state == TRANS_STATE_RUNNING)
+			cur_trans->state = TRANS_STATE_BLOCKED;
+		spin_unlock(&info->trans_lock);
+	}
+
+	if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
+		if (throttle)
+			return btrfs_commit_transaction(trans, root);
+		else
+			wake_up_process(info->transaction_kthread);
+	}
+
+	if (trans->type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
+
+	WARN_ON(cur_trans != info->running_transaction);
+	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
+	atomic_dec(&cur_trans->num_writers);
+	extwriter_counter_dec(cur_trans, trans->type);
+
+	smp_mb();
+	if (waitqueue_active(&cur_trans->writer_wait))
+		wake_up(&cur_trans->writer_wait);
+	btrfs_put_transaction(cur_trans);
+
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+
+	if (throttle)
+		btrfs_run_delayed_iputs(root);
+
+	if (trans->aborted ||
+	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+		wake_up_process(info->transaction_kthread);
+		err = -EIO;
+	}
+	assert_qgroups_uptodate(trans);
+
+	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+	if (must_run_delayed_refs) {
+		btrfs_async_run_delayed_refs(root, cur,
+					     must_run_delayed_refs == 1);
+	}
+	return err;
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
+{
+	return __btrfs_end_transaction(trans, root, 0);
+}
+
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
+{
+	return __btrfs_end_transaction(trans, root, 1);
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are sent to disk but does not wait on them
+ */
+int btrfs_write_marked_extents(struct btrfs_root *root,
+			       struct extent_io_tree *dirty_pages, int mark)
+{
+	int err = 0;
+	int werr = 0;
+	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
+	u64 start = 0;
+	u64 end;
+
+	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+				      mark, &cached_state)) {
+		bool wait_writeback = false;
+
+		err = convert_extent_bit(dirty_pages, start, end,
+					 EXTENT_NEED_WAIT,
+					 mark, &cached_state, GFP_NOFS);
+		/*
+		 * convert_extent_bit can return -ENOMEM, which is most of the
+		 * time a temporary error. So when it happens, ignore the error
+		 * and wait for writeback of this range to finish - because we
+		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+		 * to btrfs_wait_marked_extents() would not know that writeback
+		 * for this range started and therefore wouldn't wait for it to
+		 * finish - we don't want to commit a superblock that points to
+		 * btree nodes/leafs for which writeback hasn't finished yet
+		 * (and without errors).
+		 * We cleanup any entries left in the io tree when committing
+		 * the transaction (through clear_btree_io_tree()).
+		 */
+		if (err == -ENOMEM) {
+			err = 0;
+			wait_writeback = true;
+		}
+		if (!err)
+			err = filemap_fdatawrite_range(mapping, start, end);
+		if (err)
+			werr = err;
+		else if (wait_writeback)
+			werr = filemap_fdatawait_range(mapping, start, end);
+		free_extent_state(cached_state);
+		cached_state = NULL;
+		cond_resched();
+		start = end + 1;
+	}
+	return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit.  We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+			      struct extent_io_tree *dirty_pages, int mark)
+{
+	int err = 0;
+	int werr = 0;
+	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
+	u64 start = 0;
+	u64 end;
+	struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+	bool errors = false;
+
+	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+				      EXTENT_NEED_WAIT, &cached_state)) {
+		/*
+		 * Ignore -ENOMEM errors returned by clear_extent_bit().
+		 * When committing the transaction, we'll remove any entries
+		 * left in the io tree. For a log commit, we don't remove them
+		 * after committing the log because the tree can be accessed
+		 * concurrently - we do it only at transaction commit time when
+		 * it's safe to do it (through clear_btree_io_tree()).
+		 */
+		err = clear_extent_bit(dirty_pages, start, end,
+				       EXTENT_NEED_WAIT,
+				       0, 0, &cached_state, GFP_NOFS);
+		if (err == -ENOMEM)
+			err = 0;
+		if (!err)
+			err = filemap_fdatawait_range(mapping, start, end);
+		if (err)
+			werr = err;
+		free_extent_state(cached_state);
+		cached_state = NULL;
+		cond_resched();
+		start = end + 1;
+	}
+	if (err)
+		werr = err;
+
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		if ((mark & EXTENT_DIRTY) &&
+		    test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+
+		if ((mark & EXTENT_NEW) &&
+		    test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+	} else {
+		if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+	}
+
+	if (errors && !werr)
+		werr = -EIO;
+
+	return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+				struct extent_io_tree *dirty_pages, int mark)
+{
+	int ret;
+	int ret2;
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
+	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+	blk_finish_plug(&plug);
+	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
+
+	if (ret)
+		return ret;
+	if (ret2)
+		return ret2;
+	return 0;
+}
+
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root)
+{
+	int ret;
+
+	ret = btrfs_write_and_wait_marked_extents(root,
+					   &trans->transaction->dirty_pages,
+					   EXTENT_DIRTY);
+	clear_btree_io_tree(&trans->transaction->dirty_pages);
+
+	return ret;
+}
+
+/*
+ * this is used to update the root pointer in the tree of tree roots.
+ *
+ * But, in the case of the extent allocation tree, updating the root
+ * pointer may allocate blocks which may change the root of the extent
+ * allocation tree.
+ *
+ * So, this loops and repeats and makes sure the cowonly root didn't
+ * change while the root pointer was being updated in the metadata.
+ */
+static int update_cowonly_root(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	int ret;
+	u64 old_root_bytenr;
+	u64 old_root_used;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
+
+	old_root_used = btrfs_root_used(&root->root_item);
+
+	while (1) {
+		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
+		if (old_root_bytenr == root->node->start &&
+		    old_root_used == btrfs_root_used(&root->root_item))
+			break;
+
+		btrfs_set_root_node(&root->root_item, root->node);
+		ret = btrfs_update_root(trans, tree_root,
+					&root->root_key,
+					&root->root_item);
+		if (ret)
+			return ret;
+
+		old_root_used = btrfs_root_used(&root->root_item);
+	}
+
+	return 0;
+}
+
+/*
+ * update all the cowonly tree roots on disk
+ *
+ * The error handling in this function may not be obvious. Any of the
+ * failures will cause the file system to go offline. We still need
+ * to clean up the delayed refs.
+ */
+static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+	struct list_head *io_bgs = &trans->transaction->io_bgs;
+	struct list_head *next;
+	struct extent_buffer *eb;
+	int ret;
+
+	eb = btrfs_lock_root_node(fs_info->tree_root);
+	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
+			      0, &eb);
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+
+	if (ret)
+		return ret;
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret)
+		return ret;
+
+	ret = btrfs_run_dev_stats(trans, root->fs_info);
+	if (ret)
+		return ret;
+	ret = btrfs_run_dev_replace(trans, root->fs_info);
+	if (ret)
+		return ret;
+	ret = btrfs_run_qgroups(trans, root->fs_info);
+	if (ret)
+		return ret;
+
+	ret = btrfs_setup_space_cache(trans, root);
+	if (ret)
+		return ret;
+
+	/* run_qgroups might have added some more refs */
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret)
+		return ret;
+again:
+	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
+		next = fs_info->dirty_cowonly_roots.next;
+		list_del_init(next);
+		root = list_entry(next, struct btrfs_root, dirty_list);
+		clear_bit(BTRFS_ROOT_DIRTY, &root->state);
+
+		if (root != fs_info->extent_root)
+			list_add_tail(&root->dirty_list,
+				      &trans->transaction->switch_commits);
+		ret = update_cowonly_root(trans, root);
+		if (ret)
+			return ret;
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		if (ret)
+			return ret;
+	}
+
+	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
+		ret = btrfs_write_dirty_block_groups(trans, root);
+		if (ret)
+			return ret;
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		if (ret)
+			return ret;
+	}
+
+	if (!list_empty(&fs_info->dirty_cowonly_roots))
+		goto again;
+
+	list_add_tail(&fs_info->extent_root->dirty_list,
+		      &trans->transaction->switch_commits);
+	btrfs_after_dev_replace_commit(fs_info);
+
+	return 0;
+}
+
+/*
+ * dead roots are old snapshots that need to be deleted.  This allocates
+ * a dirty root struct and adds it into the list of dead roots that need to
+ * be deleted
+ */
+void btrfs_add_dead_root(struct btrfs_root *root)
+{
+	spin_lock(&root->fs_info->trans_lock);
+	if (list_empty(&root->root_list))
+		list_add_tail(&root->root_list, &root->fs_info->dead_roots);
+	spin_unlock(&root->fs_info->trans_lock);
+}
+
+/*
+ * update all the cowonly tree roots on disk
+ */
+static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root)
+{
+	struct btrfs_root *gang[8];
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int i;
+	int ret;
+	int err = 0;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	while (1) {
+		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+						 (void **)gang, 0,
+						 ARRAY_SIZE(gang),
+						 BTRFS_ROOT_TRANS_TAG);
+		if (ret == 0)
+			break;
+		for (i = 0; i < ret; i++) {
+			root = gang[i];
+			radix_tree_tag_clear(&fs_info->fs_roots_radix,
+					(unsigned long)root->root_key.objectid,
+					BTRFS_ROOT_TRANS_TAG);
+			spin_unlock(&fs_info->fs_roots_radix_lock);
+
+			btrfs_free_log(trans, root);
+			btrfs_update_reloc_root(trans, root);
+			btrfs_orphan_commit_root(trans, root);
+
+			btrfs_save_ino_cache(root, trans);
+
+			/* see comments in should_cow_block() */
+			clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+			smp_mb__after_atomic();
+
+			if (root->commit_root != root->node) {
+				list_add_tail(&root->dirty_list,
+					&trans->transaction->switch_commits);
+				btrfs_set_root_node(&root->root_item,
+						    root->node);
+			}
+
+			err = btrfs_update_root(trans, fs_info->tree_root,
+						&root->root_key,
+						&root->root_item);
+			spin_lock(&fs_info->fs_roots_radix_lock);
+			if (err)
+				break;
+		}
+	}
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	return err;
+}
+
+/*
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
+ */
+int btrfs_defrag_root(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
+		return 0;
+
+	while (1) {
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		ret = btrfs_defrag_leaves(trans, root);
+
+		btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(info->tree_root);
+		cond_resched();
+
+		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
+			break;
+
+		if (btrfs_defrag_cancelled(root->fs_info)) {
+			pr_debug("BTRFS: defrag_root cancelled\n");
+			ret = -EAGAIN;
+			break;
+		}
+	}
+	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
+	return ret;
+}
+
+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit.  This does the actual creation.
+ *
+ * Note:
+ * If the error which may affect the commitment of the current transaction
+ * happens, we should return the error number. If the error which just affect
+ * the creation of the pending snapshots, just return 0.
+ */
+static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+				   struct btrfs_fs_info *fs_info,
+				   struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_key key;
+	struct btrfs_root_item *new_root_item;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *root = pending->root;
+	struct btrfs_root *parent_root;
+	struct btrfs_block_rsv *rsv;
+	struct inode *parent_inode;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *dir_item;
+	struct dentry *dentry;
+	struct extent_buffer *tmp;
+	struct extent_buffer *old;
+	struct timespec cur_time = CURRENT_TIME;
+	int ret = 0;
+	u64 to_reserve = 0;
+	u64 index = 0;
+	u64 objectid;
+	u64 root_flags;
+	uuid_le new_uuid;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		pending->error = -ENOMEM;
+		return 0;
+	}
+
+	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
+	if (!new_root_item) {
+		pending->error = -ENOMEM;
+		goto root_item_alloc_fail;
+	}
+
+	pending->error = btrfs_find_free_objectid(tree_root, &objectid);
+	if (pending->error)
+		goto no_free_objectid;
+
+	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+
+	if (to_reserve > 0) {
+		pending->error = btrfs_block_rsv_add(root,
+						     &pending->block_rsv,
+						     to_reserve,
+						     BTRFS_RESERVE_NO_FLUSH);
+		if (pending->error)
+			goto no_free_objectid;
+	}
+
+	key.objectid = objectid;
+	key.offset = (u64)-1;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+
+	rsv = trans->block_rsv;
+	trans->block_rsv = &pending->block_rsv;
+	trans->bytes_reserved = trans->block_rsv->reserved;
+
+	dentry = pending->dentry;
+	parent_inode = pending->dir;
+	parent_root = BTRFS_I(parent_inode)->root;
+	record_root_in_trans(trans, parent_root);
+
+	/*
+	 * insert the directory item
+	 */
+	ret = btrfs_set_inode_index(parent_inode, &index);
+	BUG_ON(ret); /* -ENOMEM */
+
+	/* check if there is a file/dir which has the same name. */
+	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+					 btrfs_ino(parent_inode),
+					 dentry->d_name.name,
+					 dentry->d_name.len, 0);
+	if (dir_item != NULL && !IS_ERR(dir_item)) {
+		pending->error = -EEXIST;
+		goto dir_item_existed;
+	} else if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+	btrfs_release_path(path);
+
+	/*
+	 * pull in the delayed directory update
+	 * and the delayed inode item
+	 * otherwise we corrupt the FS during
+	 * snapshot
+	 */
+	ret = btrfs_run_delayed_items(trans, root);
+	if (ret) {	/* Transaction aborted */
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	record_root_in_trans(trans, root);
+	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
+	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+	btrfs_check_and_init_root_item(new_root_item);
+
+	root_flags = btrfs_root_flags(new_root_item);
+	if (pending->readonly)
+		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
+	else
+		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
+	btrfs_set_root_flags(new_root_item, root_flags);
+
+	btrfs_set_root_generation_v2(new_root_item,
+			trans->transid);
+	uuid_le_gen(&new_uuid);
+	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
+			BTRFS_UUID_SIZE);
+	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
+		memset(new_root_item->received_uuid, 0,
+		       sizeof(new_root_item->received_uuid));
+		memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
+		memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
+		btrfs_set_root_stransid(new_root_item, 0);
+		btrfs_set_root_rtransid(new_root_item, 0);
+	}
+	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
+	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
+	btrfs_set_root_otransid(new_root_item, trans->transid);
+
+	old = btrfs_lock_root_node(root);
+	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+	if (ret) {
+		btrfs_tree_unlock(old);
+		free_extent_buffer(old);
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	btrfs_set_lock_blocking(old);
+
+	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
+	/* clean up in any case */
+	btrfs_tree_unlock(old);
+	free_extent_buffer(old);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	/*
+	 * We need to flush delayed refs in order to make sure all of our quota
+	 * operations have been done before we call btrfs_qgroup_inherit.
+	 */
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	ret = btrfs_qgroup_inherit(trans, fs_info,
+				   root->root_key.objectid,
+				   objectid, pending->inherit);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	/* see comments in should_cow_block() */
+	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+	smp_wmb();
+
+	btrfs_set_root_node(new_root_item, tmp);
+	/* record when the snapshot was created in key.offset */
+	key.offset = trans->transid;
+	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
+	btrfs_tree_unlock(tmp);
+	free_extent_buffer(tmp);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	/*
+	 * insert root back/forward references
+	 */
+	ret = btrfs_add_root_ref(trans, tree_root, objectid,
+				 parent_root->root_key.objectid,
+				 btrfs_ino(parent_inode), index,
+				 dentry->d_name.name, dentry->d_name.len);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	key.offset = (u64)-1;
+	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
+	if (IS_ERR(pending->snap)) {
+		ret = PTR_ERR(pending->snap);
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	ret = btrfs_reloc_post_snapshot(trans, pending);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	ret = btrfs_insert_dir_item(trans, parent_root,
+				    dentry->d_name.name, dentry->d_name.len,
+				    parent_inode, &key,
+				    BTRFS_FT_DIR, index);
+	/* We have check then name at the beginning, so it is impossible. */
+	BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	btrfs_i_size_write(parent_inode, parent_inode->i_size +
+					 dentry->d_name.len * 2);
+	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+	ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
+				  BTRFS_UUID_KEY_SUBVOL, objectid);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
+		ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+					  new_root_item->received_uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  objectid);
+		if (ret && ret != -EEXIST) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto fail;
+		}
+	}
+fail:
+	pending->error = ret;
+dir_item_existed:
+	trans->block_rsv = rsv;
+	trans->bytes_reserved = 0;
+no_free_objectid:
+	kfree(new_root_item);
+root_item_alloc_fail:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * create all the snapshots we've scheduled for creation
+ */
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
+					     struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_pending_snapshot *pending, *next;
+	struct list_head *head = &trans->transaction->pending_snapshots;
+	int ret = 0;
+
+	list_for_each_entry_safe(pending, next, head, list) {
+		list_del(&pending->list);
+		ret = create_pending_snapshot(trans, fs_info, pending);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+static void update_super_roots(struct btrfs_root *root)
+{
+	struct btrfs_root_item *root_item;
+	struct btrfs_super_block *super;
+
+	super = root->fs_info->super_copy;
+
+	root_item = &root->fs_info->chunk_root->root_item;
+	super->chunk_root = root_item->bytenr;
+	super->chunk_root_generation = root_item->generation;
+	super->chunk_root_level = root_item->level;
+
+	root_item = &root->fs_info->tree_root->root_item;
+	super->root = root_item->bytenr;
+	super->generation = root_item->generation;
+	super->root_level = root_item->level;
+	if (btrfs_test_opt(root, SPACE_CACHE))
+		super->cache_generation = root_item->generation;
+	if (root->fs_info->update_uuid_tree_gen)
+		super->uuid_tree_generation = root_item->generation;
+}
+
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+	struct btrfs_transaction *trans;
+	int ret = 0;
+
+	spin_lock(&info->trans_lock);
+	trans = info->running_transaction;
+	if (trans)
+		ret = (trans->state >= TRANS_STATE_COMMIT_START);
+	spin_unlock(&info->trans_lock);
+	return ret;
+}
+
+int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+{
+	struct btrfs_transaction *trans;
+	int ret = 0;
+
+	spin_lock(&info->trans_lock);
+	trans = info->running_transaction;
+	if (trans)
+		ret = is_transaction_blocked(trans);
+	spin_unlock(&info->trans_lock);
+	return ret;
+}
+
+/*
+ * wait for the current transaction commit to start and block subsequent
+ * transaction joins
+ */
+static void wait_current_trans_commit_start(struct btrfs_root *root,
+					    struct btrfs_transaction *trans)
+{
+	wait_event(root->fs_info->transaction_blocked_wait,
+		   trans->state >= TRANS_STATE_COMMIT_START ||
+		   trans->aborted);
+}
+
+/*
+ * wait for the current transaction to start and then become unblocked.
+ * caller holds ref.
+ */
+static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
+					 struct btrfs_transaction *trans)
+{
+	wait_event(root->fs_info->transaction_wait,
+		   trans->state >= TRANS_STATE_UNBLOCKED ||
+		   trans->aborted);
+}
+
+/*
+ * commit transactions asynchronously. once btrfs_commit_transaction_async
+ * returns, any subsequent transaction will not be allowed to join.
+ */
+struct btrfs_async_commit {
+	struct btrfs_trans_handle *newtrans;
+	struct btrfs_root *root;
+	struct work_struct work;
+};
+
+static void do_async_commit(struct work_struct *work)
+{
+	struct btrfs_async_commit *ac =
+		container_of(work, struct btrfs_async_commit, work);
+
+	/*
+	 * We've got freeze protection passed with the transaction.
+	 * Tell lockdep about it.
+	 */
+	if (ac->newtrans->type & __TRANS_FREEZABLE)
+		rwsem_acquire_read(
+		     &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		     0, 1, _THIS_IP_);
+
+	current->journal_info = ac->newtrans;
+
+	btrfs_commit_transaction(ac->newtrans, ac->root);
+	kfree(ac);
+}
+
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   int wait_for_unblock)
+{
+	struct btrfs_async_commit *ac;
+	struct btrfs_transaction *cur_trans;
+
+	ac = kmalloc(sizeof(*ac), GFP_NOFS);
+	if (!ac)
+		return -ENOMEM;
+
+	INIT_WORK(&ac->work, do_async_commit);
+	ac->root = root;
+	ac->newtrans = btrfs_join_transaction(root);
+	if (IS_ERR(ac->newtrans)) {
+		int err = PTR_ERR(ac->newtrans);
+		kfree(ac);
+		return err;
+	}
+
+	/* take transaction reference */
+	cur_trans = trans->transaction;
+	atomic_inc(&cur_trans->use_count);
+
+	btrfs_end_transaction(trans, root);
+
+	/*
+	 * Tell lockdep we've released the freeze rwsem, since the
+	 * async commit thread will be the one to unlock it.
+	 */
+	if (ac->newtrans->type & __TRANS_FREEZABLE)
+		rwsem_release(
+			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			1, _THIS_IP_);
+
+	schedule_work(&ac->work);
+
+	/* wait for transaction to start and unblock */
+	if (wait_for_unblock)
+		wait_current_trans_commit_start_and_unblock(root, cur_trans);
+	else
+		wait_current_trans_commit_start(root, cur_trans);
+
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+
+	btrfs_put_transaction(cur_trans);
+	return 0;
+}
+
+
+static void cleanup_transaction(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, int err)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	DEFINE_WAIT(wait);
+
+	WARN_ON(trans->use_count > 1);
+
+	btrfs_abort_transaction(trans, root, err);
+
+	spin_lock(&root->fs_info->trans_lock);
+
+	/*
+	 * If the transaction is removed from the list, it means this
+	 * transaction has been committed successfully, so it is impossible
+	 * to call the cleanup function.
+	 */
+	BUG_ON(list_empty(&cur_trans->list));
+
+	list_del_init(&cur_trans->list);
+	if (cur_trans == root->fs_info->running_transaction) {
+		cur_trans->state = TRANS_STATE_COMMIT_DOING;
+		spin_unlock(&root->fs_info->trans_lock);
+		wait_event(cur_trans->writer_wait,
+			   atomic_read(&cur_trans->num_writers) == 1);
+
+		spin_lock(&root->fs_info->trans_lock);
+	}
+	spin_unlock(&root->fs_info->trans_lock);
+
+	btrfs_cleanup_one_transaction(trans->transaction, root);
+
+	spin_lock(&root->fs_info->trans_lock);
+	if (cur_trans == root->fs_info->running_transaction)
+		root->fs_info->running_transaction = NULL;
+	spin_unlock(&root->fs_info->trans_lock);
+
+	if (trans->type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
+	btrfs_put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
+
+	trace_btrfs_transaction_commit(root);
+
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+	btrfs_scrub_cancel(root->fs_info);
+
+	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+}
+
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+		return btrfs_start_delalloc_roots(fs_info, 1, -1);
+	return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+		btrfs_wait_ordered_roots(fs_info, -1);
+}
+
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+			   struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+						   &ordered->flags));
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct btrfs_transaction *prev_trans = NULL;
+	struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+	int ret;
+
+	/* Stop the commit early if ->aborted is set */
+	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+		ret = cur_trans->aborted;
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	/* make a pass through all the delayed refs we have so far
+	 * any runnings procs may add more while we are here
+	 */
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	btrfs_trans_release_metadata(trans, root);
+	trans->block_rsv = NULL;
+	if (trans->qgroup_reserved) {
+		btrfs_qgroup_free(root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
+	}
+
+	cur_trans = trans->transaction;
+
+	/*
+	 * set the flushing flag so procs in this transaction have to
+	 * start sending their work down.
+	 */
+	cur_trans->delayed_refs.flushing = 1;
+	smp_wmb();
+
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	if (!cur_trans->dirty_bg_run) {
+		int run_it = 0;
+
+		/* this mutex is also taken before trying to set
+		 * block groups readonly.  We need to make sure
+		 * that nobody has set a block group readonly
+		 * after a extents from that block group have been
+		 * allocated for cache files.  btrfs_set_block_group_ro
+		 * will wait for the transaction to commit if it
+		 * finds dirty_bg_run = 1
+		 *
+		 * The dirty_bg_run flag is also used to make sure only
+		 * one process starts all the block group IO.  It wouldn't
+		 * hurt to have more than one go through, but there's no
+		 * real advantage to it either.
+		 */
+		mutex_lock(&root->fs_info->ro_block_group_mutex);
+		if (!cur_trans->dirty_bg_run) {
+			run_it = 1;
+			cur_trans->dirty_bg_run = 1;
+		}
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+		if (run_it)
+			ret = btrfs_start_dirty_block_groups(trans, root);
+	}
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	spin_lock(&root->fs_info->trans_lock);
+	list_splice(&trans->ordered, &cur_trans->pending_ordered);
+	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+		spin_unlock(&root->fs_info->trans_lock);
+		atomic_inc(&cur_trans->use_count);
+		ret = btrfs_end_transaction(trans, root);
+
+		wait_for_commit(root, cur_trans);
+
+		if (unlikely(cur_trans->aborted))
+			ret = cur_trans->aborted;
+
+		btrfs_put_transaction(cur_trans);
+
+		return ret;
+	}
+
+	cur_trans->state = TRANS_STATE_COMMIT_START;
+	wake_up(&root->fs_info->transaction_blocked_wait);
+
+	if (cur_trans->list.prev != &root->fs_info->trans_list) {
+		prev_trans = list_entry(cur_trans->list.prev,
+					struct btrfs_transaction, list);
+		if (prev_trans->state != TRANS_STATE_COMPLETED) {
+			atomic_inc(&prev_trans->use_count);
+			spin_unlock(&root->fs_info->trans_lock);
+
+			wait_for_commit(root, prev_trans);
+
+			btrfs_put_transaction(prev_trans);
+		} else {
+			spin_unlock(&root->fs_info->trans_lock);
+		}
+	} else {
+		spin_unlock(&root->fs_info->trans_lock);
+	}
+
+	extwriter_counter_dec(cur_trans, trans->type);
+
+	ret = btrfs_start_delalloc_flush(root->fs_info);
+	if (ret)
+		goto cleanup_transaction;
+
+	ret = btrfs_run_delayed_items(trans, root);
+	if (ret)
+		goto cleanup_transaction;
+
+	wait_event(cur_trans->writer_wait,
+		   extwriter_counter_read(cur_trans) == 0);
+
+	/* some pending stuffs might be added after the previous flush. */
+	ret = btrfs_run_delayed_items(trans, root);
+	if (ret)
+		goto cleanup_transaction;
+
+	btrfs_wait_delalloc_flush(root->fs_info);
+
+	btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
+	btrfs_scrub_pause(root);
+	/*
+	 * Ok now we need to make sure to block out any other joins while we
+	 * commit the transaction.  We could have started a join before setting
+	 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
+	 */
+	spin_lock(&root->fs_info->trans_lock);
+	cur_trans->state = TRANS_STATE_COMMIT_DOING;
+	spin_unlock(&root->fs_info->trans_lock);
+	wait_event(cur_trans->writer_wait,
+		   atomic_read(&cur_trans->num_writers) == 1);
+
+	/* ->aborted might be set after the previous check, so check it */
+	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+		ret = cur_trans->aborted;
+		goto scrub_continue;
+	}
+	/*
+	 * the reloc mutex makes sure that we stop
+	 * the balancing code from coming in and moving
+	 * extents around in the middle of the commit
+	 */
+	mutex_lock(&root->fs_info->reloc_mutex);
+
+	/*
+	 * We needn't worry about the delayed items because we will
+	 * deal with them in create_pending_snapshot(), which is the
+	 * core function of the snapshot creation.
+	 */
+	ret = create_pending_snapshots(trans, root->fs_info);
+	if (ret) {
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
+	/*
+	 * We insert the dir indexes of the snapshots and update the inode
+	 * of the snapshots' parents after the snapshot creation, so there
+	 * are some delayed items which are not dealt with. Now deal with
+	 * them.
+	 *
+	 * We needn't worry that this operation will corrupt the snapshots,
+	 * because all the tree which are snapshoted will be forced to COW
+	 * the nodes and leaves.
+	 */
+	ret = btrfs_run_delayed_items(trans, root);
+	if (ret) {
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret) {
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
+	/*
+	 * make sure none of the code above managed to slip in a
+	 * delayed item
+	 */
+	btrfs_assert_delayed_root_empty(root);
+
+	WARN_ON(cur_trans != trans->transaction);
+
+	/* btrfs_commit_tree_roots is responsible for getting the
+	 * various roots consistent with each other.  Every pointer
+	 * in the tree of tree roots has to point to the most up to date
+	 * root for every subvolume and other tree.  So, we have to keep
+	 * the tree logging code from jumping in and changing any
+	 * of the trees.
+	 *
+	 * At this point in the commit, there can't be any tree-log
+	 * writers, but a little lower down we drop the trans mutex
+	 * and let new people in.  By holding the tree_log_mutex
+	 * from now until after the super is written, we avoid races
+	 * with the tree-log code.
+	 */
+	mutex_lock(&root->fs_info->tree_log_mutex);
+
+	ret = commit_fs_roots(trans, root);
+	if (ret) {
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
+	/*
+	 * Since the transaction is done, we can apply the pending changes
+	 * before the next transaction.
+	 */
+	btrfs_apply_pending_changes(root->fs_info);
+
+	/* commit_fs_roots gets rid of all the tree log roots, it is now
+	 * safe to free the root of tree log roots
+	 */
+	btrfs_free_log_root_tree(trans, root->fs_info);
+
+	ret = commit_cowonly_roots(trans, root);
+	if (ret) {
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
+	/*
+	 * The tasks which save the space cache and inode cache may also
+	 * update ->aborted, check it.
+	 */
+	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+		ret = cur_trans->aborted;
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
+	btrfs_prepare_extent_commit(trans, root);
+
+	cur_trans = root->fs_info->running_transaction;
+
+	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
+			    root->fs_info->tree_root->node);
+	list_add_tail(&root->fs_info->tree_root->dirty_list,
+		      &cur_trans->switch_commits);
+
+	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
+			    root->fs_info->chunk_root->node);
+	list_add_tail(&root->fs_info->chunk_root->dirty_list,
+		      &cur_trans->switch_commits);
+
+	switch_commit_roots(cur_trans, root->fs_info);
+
+	assert_qgroups_uptodate(trans);
+	ASSERT(list_empty(&cur_trans->dirty_bgs));
+	ASSERT(list_empty(&cur_trans->io_bgs));
+	update_super_roots(root);
+
+	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+	btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
+	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
+	       sizeof(*root->fs_info->super_copy));
+
+	btrfs_update_commit_device_size(root->fs_info);
+	btrfs_update_commit_device_bytes_used(root, cur_trans);
+
+	clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+	clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+
+	spin_lock(&root->fs_info->trans_lock);
+	cur_trans->state = TRANS_STATE_UNBLOCKED;
+	root->fs_info->running_transaction = NULL;
+	spin_unlock(&root->fs_info->trans_lock);
+	mutex_unlock(&root->fs_info->reloc_mutex);
+
+	wake_up(&root->fs_info->transaction_wait);
+
+	ret = btrfs_write_and_wait_transaction(trans, root);
+	if (ret) {
+		btrfs_error(root->fs_info, ret,
+			    "Error while writing out transaction");
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		goto scrub_continue;
+	}
+
+	ret = write_ctree_super(trans, root, 0);
+	if (ret) {
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		goto scrub_continue;
+	}
+
+	/*
+	 * the super is written, we can safely allow the tree-loggers
+	 * to go about their business
+	 */
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+
+	btrfs_finish_extent_commit(trans, root);
+
+	if (cur_trans->have_free_bgs)
+		btrfs_clear_space_info_full(root->fs_info);
+
+	root->fs_info->last_trans_committed = cur_trans->transid;
+	/*
+	 * We needn't acquire the lock here because there is no other task
+	 * which can change it.
+	 */
+	cur_trans->state = TRANS_STATE_COMPLETED;
+	wake_up(&cur_trans->commit_wait);
+
+	spin_lock(&root->fs_info->trans_lock);
+	list_del_init(&cur_trans->list);
+	spin_unlock(&root->fs_info->trans_lock);
+
+	btrfs_put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
+
+	if (trans->type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
+
+	trace_btrfs_transaction_commit(root);
+
+	btrfs_scrub_continue(root);
+
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+
+	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+	if (current != root->fs_info->transaction_kthread)
+		btrfs_run_delayed_iputs(root);
+
+	return ret;
+
+scrub_continue:
+	btrfs_scrub_continue(root);
+cleanup_transaction:
+	btrfs_trans_release_metadata(trans, root);
+	trans->block_rsv = NULL;
+	if (trans->qgroup_reserved) {
+		btrfs_qgroup_free(root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
+	}
+	btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+	cleanup_transaction(trans, root, ret);
+
+	return ret;
+}
+
+/*
+ * return < 0 if error
+ * 0 if there are no more dead_roots at the time of call
+ * 1 there are more to be processed, call me again
+ *
+ * The return value indicates there are certainly more snapshots to delete, but
+ * if there comes a new one during processing, it may return 0. We don't mind,
+ * because btrfs_commit_super will poke cleaner thread and it will process it a
+ * few seconds later.
+ */
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
+{
+	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	spin_lock(&fs_info->trans_lock);
+	if (list_empty(&fs_info->dead_roots)) {
+		spin_unlock(&fs_info->trans_lock);
+		return 0;
+	}
+	root = list_first_entry(&fs_info->dead_roots,
+			struct btrfs_root, root_list);
+	list_del_init(&root->root_list);
+	spin_unlock(&fs_info->trans_lock);
+
+	pr_debug("BTRFS: cleaner removing %llu\n", root->objectid);
+
+	btrfs_kill_all_delayed_nodes(root);
+
+	if (btrfs_header_backref_rev(root->node) <
+			BTRFS_MIXED_BACKREF_REV)
+		ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+	else
+		ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+
+	return (ret < 0) ? 0 : 1;
+}
+
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+{
+	unsigned long prev;
+	unsigned long bit;
+
+	prev = xchg(&fs_info->pending_changes, 0);
+	if (!prev)
+		return;
+
+	bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
+	if (prev & bit)
+		btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+	prev &= ~bit;
+
+	bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
+	if (prev & bit)
+		btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+	prev &= ~bit;
+
+	bit = 1 << BTRFS_PENDING_COMMIT;
+	if (prev & bit)
+		btrfs_debug(fs_info, "pending commit done");
+	prev &= ~bit;
+
+	if (prev)
+		btrfs_warn(fs_info,
+			"unknown pending changes left 0x%lx, ignoring", prev);
+}
diff --git a/kernel/fs/btrfs/transaction.h b/kernel/fs/btrfs/transaction.h
new file mode 100644
index 000000000..0b2475559
--- /dev/null
+++ b/kernel/fs/btrfs/transaction.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
+#include "btrfs_inode.h"
+#include "delayed-ref.h"
+#include "ctree.h"
+
+enum btrfs_trans_state {
+	TRANS_STATE_RUNNING		= 0,
+	TRANS_STATE_BLOCKED		= 1,
+	TRANS_STATE_COMMIT_START	= 2,
+	TRANS_STATE_COMMIT_DOING	= 3,
+	TRANS_STATE_UNBLOCKED		= 4,
+	TRANS_STATE_COMPLETED		= 5,
+	TRANS_STATE_MAX			= 6,
+};
+
+struct btrfs_transaction {
+	u64 transid;
+	/*
+	 * total external writers(USERSPACE/START/ATTACH) in this
+	 * transaction, it must be zero before the transaction is
+	 * being committed
+	 */
+	atomic_t num_extwriters;
+	/*
+	 * total writers in this transaction, it must be zero before the
+	 * transaction can end
+	 */
+	atomic_t num_writers;
+	atomic_t use_count;
+
+	/*
+	 * true if there is free bgs operations in this transaction
+	 */
+	int have_free_bgs;
+
+	/* Be protected by fs_info->trans_lock when we want to change it. */
+	enum btrfs_trans_state state;
+	struct list_head list;
+	struct extent_io_tree dirty_pages;
+	unsigned long start_time;
+	wait_queue_head_t writer_wait;
+	wait_queue_head_t commit_wait;
+	struct list_head pending_snapshots;
+	struct list_head pending_chunks;
+	struct list_head pending_ordered;
+	struct list_head switch_commits;
+	struct list_head dirty_bgs;
+	struct list_head io_bgs;
+	u64 num_dirty_bgs;
+
+	/*
+	 * we need to make sure block group deletion doesn't race with
+	 * free space cache writeout.  This mutex keeps them from stomping
+	 * on each other
+	 */
+	struct mutex cache_write_mutex;
+	spinlock_t dirty_bgs_lock;
+	struct btrfs_delayed_ref_root delayed_refs;
+	int aborted;
+	int dirty_bg_run;
+};
+
+#define __TRANS_FREEZABLE	(1U << 0)
+
+#define __TRANS_USERSPACE	(1U << 8)
+#define __TRANS_START		(1U << 9)
+#define __TRANS_ATTACH		(1U << 10)
+#define __TRANS_JOIN		(1U << 11)
+#define __TRANS_JOIN_NOLOCK	(1U << 12)
+#define __TRANS_DUMMY		(1U << 13)
+
+#define TRANS_USERSPACE		(__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START		(__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH		(__TRANS_ATTACH)
+#define TRANS_JOIN		(__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK	(__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS	(__TRANS_USERSPACE | __TRANS_START |	\
+				 __TRANS_ATTACH)
+
+#define BTRFS_SEND_TRANS_STUB	((void *)1)
+
+struct btrfs_trans_handle {
+	u64 transid;
+	u64 bytes_reserved;
+	u64 qgroup_reserved;
+	unsigned long use_count;
+	unsigned long blocks_reserved;
+	unsigned long blocks_used;
+	unsigned long delayed_ref_updates;
+	struct btrfs_transaction *transaction;
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *orig_rsv;
+	short aborted;
+	short adding_csums;
+	bool allocating_chunk;
+	bool reloc_reserved;
+	bool sync;
+	unsigned int type;
+	/*
+	 * this root is only needed to validate that the root passed to
+	 * start_transaction is the same as the one passed to end_transaction.
+	 * Subvolume quota depends on this
+	 */
+	struct btrfs_root *root;
+	struct seq_list delayed_ref_elem;
+	struct list_head ordered;
+	struct list_head qgroup_ref_list;
+	struct list_head new_bgs;
+};
+
+struct btrfs_pending_snapshot {
+	struct dentry *dentry;
+	struct inode *dir;
+	struct btrfs_root *root;
+	struct btrfs_root *snap;
+	struct btrfs_qgroup_inherit *inherit;
+	/* block reservation for the operation */
+	struct btrfs_block_rsv block_rsv;
+	u64 qgroup_reserved;
+	/* extra metadata reseration for relocation */
+	int error;
+	bool readonly;
+	struct list_head list;
+};
+
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+					      struct inode *inode)
+{
+	spin_lock(&BTRFS_I(inode)->lock);
+	BTRFS_I(inode)->last_trans = trans->transaction->transid;
+	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+	spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+						   int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
+					struct btrfs_root *root, int num_items);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+					struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
+
+void btrfs_add_dead_root(struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root);
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   int wait_for_unblock);
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root);
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root);
+void btrfs_throttle(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
+int btrfs_write_marked_extents(struct btrfs_root *root,
+				struct extent_io_tree *dirty_pages, int mark);
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+				struct extent_io_tree *dirty_pages, int mark);
+int btrfs_transaction_blocked(struct btrfs_fs_info *info);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void btrfs_put_transaction(struct btrfs_transaction *transaction);
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/kernel/fs/btrfs/tree-defrag.c b/kernel/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000..a63719cc9
--- /dev/null
+++ b/kernel/fs/btrfs/tree-defrag.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
+ * better reflect disk order
+ */
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root)
+{
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	int ret = 0;
+	int wret;
+	int level;
+	int next_key_ret = 0;
+	u64 last_ret = 0;
+	u64 min_trans = 0;
+
+	if (root->fs_info->extent_root == root) {
+		/*
+		 * there's recursion here right now in the tree locking,
+		 * we can't defrag the extent root without deadlock
+		 */
+		goto out;
+	}
+
+	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		goto out;
+
+	if (btrfs_test_opt(root, SSD))
+		goto out;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	level = btrfs_header_level(root->node);
+
+	if (level == 0)
+		goto out;
+
+	if (root->defrag_progress.objectid == 0) {
+		struct extent_buffer *root_node;
+		u32 nritems;
+
+		root_node = btrfs_lock_root_node(root);
+		btrfs_set_lock_blocking(root_node);
+		nritems = btrfs_header_nritems(root_node);
+		root->defrag_max.objectid = 0;
+		/* from above we know this is not a leaf */
+		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+				      nritems - 1);
+		btrfs_tree_unlock(root_node);
+		free_extent_buffer(root_node);
+		memset(&key, 0, sizeof(key));
+	} else {
+		memcpy(&key, &root->defrag_progress, sizeof(key));
+	}
+
+	path->keep_locks = 1;
+
+	ret = btrfs_search_forward(root, &key, path, min_trans);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+	btrfs_release_path(path);
+	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+
+	if (wret < 0) {
+		ret = wret;
+		goto out;
+	}
+	if (!path->nodes[1]) {
+		ret = 0;
+		goto out;
+	}
+	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+					   min_trans);
+	ret = btrfs_realloc_node(trans, root,
+				 path->nodes[1], 0,
+				 &last_ret,
+				 &root->defrag_progress);
+	if (ret) {
+		WARN_ON(ret == -EAGAIN);
+		goto out;
+	}
+	if (next_key_ret == 0) {
+		memcpy(&root->defrag_progress, &key, sizeof(key));
+		ret = -EAGAIN;
+	}
+out:
+	if (path)
+		btrfs_free_path(path);
+	if (ret == -EAGAIN) {
+		if (root->defrag_max.objectid > root->defrag_progress.objectid)
+			goto done;
+		if (root->defrag_max.type > root->defrag_progress.type)
+			goto done;
+		if (root->defrag_max.offset > root->defrag_progress.offset)
+			goto done;
+		ret = 0;
+	}
+done:
+	if (ret != -EAGAIN) {
+		memset(&root->defrag_progress, 0,
+		       sizeof(root->defrag_progress));
+		root->defrag_trans_start = trans->transid;
+	}
+	return ret;
+}
diff --git a/kernel/fs/btrfs/tree-log.c b/kernel/fs/btrfs/tree-log.c
new file mode 100644
index 000000000..d04968374
--- /dev/null
+++ b/kernel/fs/btrfs/tree-log.c
@@ -0,0 +1,5089 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/list_sort.h>
+#include "tree-log.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "backref.h"
+#include "hash.h"
+
+/* magic values for the inode_only field in btrfs_log_inode:
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+#define LOG_INODE_ALL 0
+#define LOG_INODE_EXISTS 1
+
+/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2).  After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ *  2a is actually the more important variant.  With the extra logging
+ *  a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir.  After a crash the rm -rf must
+ * be replayed.  This must be able to recurse down the entire
+ * directory tree.  The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
+/*
+ * stages for the tree walking.  The first
+ * stage (0) is to only pin down the blocks we find
+ * the second stage (1) is to make sure that all the inodes
+ * we find in the log are created in the subvolume.
+ *
+ * The last stage is to deal with directories and links and extents
+ * and all the other fun semantics
+ */
+#define LOG_WALK_PIN_ONLY 0
+#define LOG_WALK_REPLAY_INODES 1
+#define LOG_WALK_REPLAY_DIR_INDEX 2
+#define LOG_WALK_REPLAY_ALL 3
+
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct inode *inode,
+			   int inode_only,
+			   const loff_t start,
+			   const loff_t end,
+			   struct btrfs_log_ctx *ctx);
+static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       u64 dirid, int del_all);
+
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree an 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction.  Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree.  Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times, once to pin down all the extents it is
+ * using in ram and once, once to create all the inodes logged in the tree
+ * and once to do all the other items.
+ */
+
+/*
+ * start a sub transaction and setup the log tree
+ * this increments the log tree writer count to make the people
+ * syncing the tree wait for us to finish
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_log_ctx *ctx)
+{
+	int index;
+	int ret;
+
+	mutex_lock(&root->log_mutex);
+	if (root->log_root) {
+		if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+			ret = -EAGAIN;
+			goto out;
+		}
+		if (!root->log_start_pid) {
+			root->log_start_pid = current->pid;
+			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+		} else if (root->log_start_pid != current->pid) {
+			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+		}
+
+		atomic_inc(&root->log_batch);
+		atomic_inc(&root->log_writers);
+		if (ctx) {
+			index = root->log_transid % 2;
+			list_add_tail(&ctx->list, &root->log_ctxs[index]);
+			ctx->log_transid = root->log_transid;
+		}
+		mutex_unlock(&root->log_mutex);
+		return 0;
+	}
+
+	ret = 0;
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	if (!root->fs_info->log_root_tree)
+		ret = btrfs_init_log_root_tree(trans, root->fs_info);
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+	if (ret)
+		goto out;
+
+	if (!root->log_root) {
+		ret = btrfs_add_log_tree(trans, root);
+		if (ret)
+			goto out;
+	}
+	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+	root->log_start_pid = current->pid;
+	atomic_inc(&root->log_batch);
+	atomic_inc(&root->log_writers);
+	if (ctx) {
+		index = root->log_transid % 2;
+		list_add_tail(&ctx->list, &root->log_ctxs[index]);
+		ctx->log_transid = root->log_transid;
+	}
+out:
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
+/*
+ * returns 0 if there was a log transaction running and we were able
+ * to join, or returns -ENOENT if there were not transactions
+ * in progress
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+	int ret = -ENOENT;
+
+	smp_mb();
+	if (!root->log_root)
+		return -ENOENT;
+
+	mutex_lock(&root->log_mutex);
+	if (root->log_root) {
+		ret = 0;
+		atomic_inc(&root->log_writers);
+	}
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
+/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+	int ret = -ENOENT;
+
+	mutex_lock(&root->log_mutex);
+	atomic_inc(&root->log_writers);
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
+/*
+ * indicate we're done making changes to the log tree
+ * and wake up anyone waiting to do a sync
+ */
+void btrfs_end_log_trans(struct btrfs_root *root)
+{
+	if (atomic_dec_and_test(&root->log_writers)) {
+		smp_mb();
+		if (waitqueue_active(&root->log_writer_wait))
+			wake_up(&root->log_writer_wait);
+	}
+}
+
+
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree.  The stage field tells us which part
+ * of the log tree processing we are currently doing.  The others
+ * are state fields used for that specific part
+ */
+struct walk_control {
+	/* should we free the extent on disk when done?  This is used
+	 * at transaction commit time while freeing a log tree
+	 */
+	int free;
+
+	/* should we write out the extent buffer?  This is used
+	 * while flushing the log tree to disk during a sync
+	 */
+	int write;
+
+	/* should we wait for the extent buffer io to finish?  Also used
+	 * while flushing the log tree to disk for a sync
+	 */
+	int wait;
+
+	/* pin only walk, we record which extents on disk belong to the
+	 * log trees
+	 */
+	int pin;
+
+	/* what stage of the replay code we're currently in */
+	int stage;
+
+	/* the root we are currently replaying */
+	struct btrfs_root *replay_dest;
+
+	/* the trans handle for the current replay */
+	struct btrfs_trans_handle *trans;
+
+	/* the function that gets used to process blocks we find in the
+	 * tree.  Note the extent_buffer might not be up to date when it is
+	 * passed in, and it must be checked or read if you need the data
+	 * inside it
+	 */
+	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+			    struct walk_control *wc, u64 gen);
+};
+
+/*
+ * process_func used to pin down extents, write them or wait on them
+ */
+static int process_one_buffer(struct btrfs_root *log,
+			      struct extent_buffer *eb,
+			      struct walk_control *wc, u64 gen)
+{
+	int ret = 0;
+
+	/*
+	 * If this fs is mixed then we need to be able to process the leaves to
+	 * pin down any logged extents, so we have to read the block.
+	 */
+	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+		ret = btrfs_read_buffer(eb, gen);
+		if (ret)
+			return ret;
+	}
+
+	if (wc->pin)
+		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
+						      eb->start, eb->len);
+
+	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+		if (wc->pin && btrfs_header_level(eb) == 0)
+			ret = btrfs_exclude_logged_extents(log, eb);
+		if (wc->write)
+			btrfs_write_tree_block(eb);
+		if (wc->wait)
+			btrfs_wait_tree_block_writeback(eb);
+	}
+	return ret;
+}
+
+/*
+ * Item overwrite used by replay and tree logging.  eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten.  If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static noinline int overwrite_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct extent_buffer *eb, int slot,
+				   struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size;
+	u64 saved_i_size = 0;
+	int save_old_i_size = 0;
+	unsigned long src_ptr;
+	unsigned long dst_ptr;
+	int overwrite_root = 0;
+	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+		overwrite_root = 1;
+
+	item_size = btrfs_item_size_nr(eb, slot);
+	src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+	/* look for the key in the destination tree */
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0) {
+		char *src_copy;
+		char *dst_copy;
+		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+						  path->slots[0]);
+		if (dst_size != item_size)
+			goto insert;
+
+		if (item_size == 0) {
+			btrfs_release_path(path);
+			return 0;
+		}
+		dst_copy = kmalloc(item_size, GFP_NOFS);
+		src_copy = kmalloc(item_size, GFP_NOFS);
+		if (!dst_copy || !src_copy) {
+			btrfs_release_path(path);
+			kfree(dst_copy);
+			kfree(src_copy);
+			return -ENOMEM;
+		}
+
+		read_extent_buffer(eb, src_copy, src_ptr, item_size);
+
+		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+				   item_size);
+		ret = memcmp(dst_copy, src_copy, item_size);
+
+		kfree(dst_copy);
+		kfree(src_copy);
+		/*
+		 * they have the same contents, just return, this saves
+		 * us from cowing blocks in the destination tree and doing
+		 * extra writes that may not have been done by a previous
+		 * sync
+		 */
+		if (ret == 0) {
+			btrfs_release_path(path);
+			return 0;
+		}
+
+		/*
+		 * We need to load the old nbytes into the inode so when we
+		 * replay the extents we've logged we get the right nbytes.
+		 */
+		if (inode_item) {
+			struct btrfs_inode_item *item;
+			u64 nbytes;
+			u32 mode;
+
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_inode_item);
+			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+			item = btrfs_item_ptr(eb, slot,
+					      struct btrfs_inode_item);
+			btrfs_set_inode_nbytes(eb, item, nbytes);
+
+			/*
+			 * If this is a directory we need to reset the i_size to
+			 * 0 so that we can set it up properly when replaying
+			 * the rest of the items in this log.
+			 */
+			mode = btrfs_inode_mode(eb, item);
+			if (S_ISDIR(mode))
+				btrfs_set_inode_size(eb, item, 0);
+		}
+	} else if (inode_item) {
+		struct btrfs_inode_item *item;
+		u32 mode;
+
+		/*
+		 * New inode, set nbytes to 0 so that the nbytes comes out
+		 * properly when we replay the extents.
+		 */
+		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+		btrfs_set_inode_nbytes(eb, item, 0);
+
+		/*
+		 * If this is a directory we need to reset the i_size to 0 so
+		 * that we can set it up properly when replaying the rest of
+		 * the items in this log.
+		 */
+		mode = btrfs_inode_mode(eb, item);
+		if (S_ISDIR(mode))
+			btrfs_set_inode_size(eb, item, 0);
+	}
+insert:
+	btrfs_release_path(path);
+	/* try to insert the key into the destination tree */
+	path->skip_release_on_error = 1;
+	ret = btrfs_insert_empty_item(trans, root, path,
+				      key, item_size);
+	path->skip_release_on_error = 0;
+
+	/* make sure any existing item is the correct size */
+	if (ret == -EEXIST || ret == -EOVERFLOW) {
+		u32 found_size;
+		found_size = btrfs_item_size_nr(path->nodes[0],
+						path->slots[0]);
+		if (found_size > item_size)
+			btrfs_truncate_item(root, path, item_size, 1);
+		else if (found_size < item_size)
+			btrfs_extend_item(root, path,
+					  item_size - found_size);
+	} else if (ret) {
+		return ret;
+	}
+	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+					path->slots[0]);
+
+	/* don't overwrite an existing inode if the generation number
+	 * was logged as zero.  This is done when the tree logging code
+	 * is just logging an inode to make sure it exists after recovery.
+	 *
+	 * Also, don't overwrite i_size on directories during replay.
+	 * log replay inserts and removes directory items based on the
+	 * state of the tree found in the subvolume, and i_size is modified
+	 * as it goes
+	 */
+	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+		struct btrfs_inode_item *src_item;
+		struct btrfs_inode_item *dst_item;
+
+		src_item = (struct btrfs_inode_item *)src_ptr;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+
+		if (btrfs_inode_generation(eb, src_item) == 0) {
+			struct extent_buffer *dst_eb = path->nodes[0];
+			const u64 ino_size = btrfs_inode_size(eb, src_item);
+
+			/*
+			 * For regular files an ino_size == 0 is used only when
+			 * logging that an inode exists, as part of a directory
+			 * fsync, and the inode wasn't fsynced before. In this
+			 * case don't set the size of the inode in the fs/subvol
+			 * tree, otherwise we would be throwing valid data away.
+			 */
+			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+			    ino_size != 0) {
+				struct btrfs_map_token token;
+
+				btrfs_init_map_token(&token);
+				btrfs_set_token_inode_size(dst_eb, dst_item,
+							   ino_size, &token);
+			}
+			goto no_copy;
+		}
+
+		if (overwrite_root &&
+		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+			save_old_i_size = 1;
+			saved_i_size = btrfs_inode_size(path->nodes[0],
+							dst_item);
+		}
+	}
+
+	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+			   src_ptr, item_size);
+
+	if (save_old_i_size) {
+		struct btrfs_inode_item *dst_item;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+	}
+
+	/* make sure the generation is filled in */
+	if (key->type == BTRFS_INODE_ITEM_KEY) {
+		struct btrfs_inode_item *dst_item;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+			btrfs_set_inode_generation(path->nodes[0], dst_item,
+						   trans->transid);
+		}
+	}
+no_copy:
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(path);
+	return 0;
+}
+
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+					     u64 objectid)
+{
+	struct btrfs_key key;
+	struct inode *inode;
+
+	key.objectid = objectid;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+	if (IS_ERR(inode)) {
+		inode = NULL;
+	} else if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+/* replays a single extent in 'eb' at 'slot' with 'key' into the
+ * subvolume 'root'.  path is released on entry and should be released
+ * on exit.
+ *
+ * extents in the log tree have not been allocated out of the extent
+ * tree yet.  So, this completes the allocation, taking a reference
+ * as required if the extent already exists or creating a new extent
+ * if it isn't in the extent allocation tree yet.
+ *
+ * The extent is inserted into the file, dropping any existing extents
+ * from the file that overlap the new one.
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct extent_buffer *eb, int slot,
+				      struct btrfs_key *key)
+{
+	int found_type;
+	u64 extent_end;
+	u64 start = key->offset;
+	u64 nbytes = 0;
+	struct btrfs_file_extent_item *item;
+	struct inode *inode = NULL;
+	unsigned long size;
+	int ret = 0;
+
+	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(eb, item);
+
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		nbytes = btrfs_file_extent_num_bytes(eb, item);
+		extent_end = start + nbytes;
+
+		/*
+		 * We don't add to the inodes nbytes if we are prealloc or a
+		 * hole.
+		 */
+		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+			nbytes = 0;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		size = btrfs_file_extent_inline_len(eb, slot, item);
+		nbytes = btrfs_file_extent_ram_bytes(eb, item);
+		extent_end = ALIGN(start + size, root->sectorsize);
+	} else {
+		ret = 0;
+		goto out;
+	}
+
+	inode = read_one_inode(root, key->objectid);
+	if (!inode) {
+		ret = -EIO;
+		goto out;
+	}
+
+	/*
+	 * first check to see if we already have this extent in the
+	 * file.  This must be done before the btrfs_drop_extents run
+	 * so we don't try to drop this extent.
+	 */
+	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
+				       start, 0);
+
+	if (ret == 0 &&
+	    (found_type == BTRFS_FILE_EXTENT_REG ||
+	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+		struct btrfs_file_extent_item cmp1;
+		struct btrfs_file_extent_item cmp2;
+		struct btrfs_file_extent_item *existing;
+		struct extent_buffer *leaf;
+
+		leaf = path->nodes[0];
+		existing = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_file_extent_item);
+
+		read_extent_buffer(eb, &cmp1, (unsigned long)item,
+				   sizeof(cmp1));
+		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
+				   sizeof(cmp2));
+
+		/*
+		 * we already have a pointer to this exact extent,
+		 * we don't have to do anything
+		 */
+		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+			btrfs_release_path(path);
+			goto out;
+		}
+	}
+	btrfs_release_path(path);
+
+	/* drop any overlapping extents */
+	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
+	if (ret)
+		goto out;
+
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		u64 offset;
+		unsigned long dest_offset;
+		struct btrfs_key ins;
+
+		ret = btrfs_insert_empty_item(trans, root, path, key,
+					      sizeof(*item));
+		if (ret)
+			goto out;
+		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
+						    path->slots[0]);
+		copy_extent_buffer(path->nodes[0], eb, dest_offset,
+				(unsigned long)item,  sizeof(*item));
+
+		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+		offset = key->offset - btrfs_file_extent_offset(eb, item);
+
+		if (ins.objectid > 0) {
+			u64 csum_start;
+			u64 csum_end;
+			LIST_HEAD(ordered_sums);
+			/*
+			 * is this extent already allocated in the extent
+			 * allocation tree?  If so, just add a reference
+			 */
+			ret = btrfs_lookup_data_extent(root, ins.objectid,
+						ins.offset);
+			if (ret == 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+						ins.objectid, ins.offset,
+						0, root->root_key.objectid,
+						key->objectid, offset, 0);
+				if (ret)
+					goto out;
+			} else {
+				/*
+				 * insert the extent pointer in the extent
+				 * allocation tree
+				 */
+				ret = btrfs_alloc_logged_file_extent(trans,
+						root, root->root_key.objectid,
+						key->objectid, offset, &ins);
+				if (ret)
+					goto out;
+			}
+			btrfs_release_path(path);
+
+			if (btrfs_file_extent_compression(eb, item)) {
+				csum_start = ins.objectid;
+				csum_end = csum_start + ins.offset;
+			} else {
+				csum_start = ins.objectid +
+					btrfs_file_extent_offset(eb, item);
+				csum_end = csum_start +
+					btrfs_file_extent_num_bytes(eb, item);
+			}
+
+			ret = btrfs_lookup_csums_range(root->log_root,
+						csum_start, csum_end - 1,
+						&ordered_sums, 0);
+			if (ret)
+				goto out;
+			while (!list_empty(&ordered_sums)) {
+				struct btrfs_ordered_sum *sums;
+				sums = list_entry(ordered_sums.next,
+						struct btrfs_ordered_sum,
+						list);
+				if (!ret)
+					ret = btrfs_csum_file_blocks(trans,
+						root->fs_info->csum_root,
+						sums);
+				list_del(&sums->list);
+				kfree(sums);
+			}
+			if (ret)
+				goto out;
+		} else {
+			btrfs_release_path(path);
+		}
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		/* inline extents are easy, we just overwrite them */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		if (ret)
+			goto out;
+	}
+
+	inode_add_bytes(inode, nbytes);
+	ret = btrfs_update_inode(trans, root, inode);
+out:
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct inode *dir,
+				      struct btrfs_dir_item *di)
+{
+	struct inode *inode;
+	char *name;
+	int name_len;
+	struct extent_buffer *leaf;
+	struct btrfs_key location;
+	int ret;
+
+	leaf = path->nodes[0];
+
+	btrfs_dir_item_key_to_cpu(leaf, di, &location);
+	name_len = btrfs_dir_name_len(leaf, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+
+	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
+	btrfs_release_path(path);
+
+	inode = read_one_inode(root, location.objectid);
+	if (!inode) {
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = link_to_fixup_dir(trans, root, path, location.objectid);
+	if (ret)
+		goto out;
+
+	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+	if (ret)
+		goto out;
+	else
+		ret = btrfs_run_delayed_items(trans, root);
+out:
+	kfree(name);
+	iput(inode);
+	return ret;
+}
+
+/*
+ * helper function to see if a given name and sequence number found
+ * in an inode back reference are already in a directory and correctly
+ * point to this inode
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 dirid, u64 objectid, u64 index,
+				 const char *name, int name_len)
+{
+	struct btrfs_dir_item *di;
+	struct btrfs_key location;
+	int match = 0;
+
+	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+					 index, name, name_len, 0);
+	if (di && !IS_ERR(di)) {
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+		if (location.objectid != objectid)
+			goto out;
+	} else
+		goto out;
+	btrfs_release_path(path);
+
+	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
+	if (di && !IS_ERR(di)) {
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+		if (location.objectid != objectid)
+			goto out;
+	} else
+		goto out;
+	match = 1;
+out:
+	btrfs_release_path(path);
+	return match;
+}
+
+/*
+ * helper function to check a log tree for a named back reference in
+ * an inode.  This is used to decide if a back reference that is
+ * found in the subvolume conflicts with what we find in the log.
+ *
+ * inode backreferences may have multiple refs in a single item,
+ * during replay we process one reference at a time, and we don't
+ * want to delete valid links to a file from the subvolume if that
+ * link is also in the log.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+				   struct btrfs_key *key,
+				   u64 ref_objectid,
+				   const char *name, int namelen)
+{
+	struct btrfs_path *path;
+	struct btrfs_inode_ref *ref;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	unsigned long name_ptr;
+	int found_name_len;
+	int item_size;
+	int ret;
+	int match = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
+	if (ret != 0)
+		goto out;
+
+	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+
+	if (key->type == BTRFS_INODE_EXTREF_KEY) {
+		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+						   name, namelen, NULL))
+			match = 1;
+
+		goto out;
+	}
+
+	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+	ptr_end = ptr + item_size;
+	while (ptr < ptr_end) {
+		ref = (struct btrfs_inode_ref *)ptr;
+		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
+		if (found_name_len == namelen) {
+			name_ptr = (unsigned long)(ref + 1);
+			ret = memcmp_extent_buffer(path->nodes[0], name,
+						   name_ptr, namelen);
+			if (ret == 0) {
+				match = 1;
+				goto out;
+			}
+		}
+		ptr = (unsigned long)(ref + 1) + found_name_len;
+	}
+out:
+	btrfs_free_path(path);
+	return match;
+}
+
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_path *path,
+				  struct btrfs_root *log_root,
+				  struct inode *dir, struct inode *inode,
+				  struct extent_buffer *eb,
+				  u64 inode_objectid, u64 parent_objectid,
+				  u64 ref_index, char *name, int namelen,
+				  int *search_done)
+{
+	int ret;
+	char *victim_name;
+	int victim_name_len;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key search_key;
+	struct btrfs_inode_extref *extref;
+
+again:
+	/* Search old style refs */
+	search_key.objectid = inode_objectid;
+	search_key.type = BTRFS_INODE_REF_KEY;
+	search_key.offset = parent_objectid;
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret == 0) {
+		struct btrfs_inode_ref *victim_ref;
+		unsigned long ptr;
+		unsigned long ptr_end;
+
+		leaf = path->nodes[0];
+
+		/* are we trying to overwrite a back ref for the root directory
+		 * if so, just jump out, we're done
+		 */
+		if (search_key.objectid == search_key.offset)
+			return 1;
+
+		/* check all the names in this back reference to see
+		 * if they are in the log.  if so, we allow them to stay
+		 * otherwise they must be unlinked as a conflict
+		 */
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+		while (ptr < ptr_end) {
+			victim_ref = (struct btrfs_inode_ref *)ptr;
+			victim_name_len = btrfs_inode_ref_name_len(leaf,
+								   victim_ref);
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			if (!victim_name)
+				return -ENOMEM;
+
+			read_extent_buffer(leaf, victim_name,
+					   (unsigned long)(victim_ref + 1),
+					   victim_name_len);
+
+			if (!backref_in_log(log_root, &search_key,
+					    parent_objectid,
+					    victim_name,
+					    victim_name_len)) {
+				inc_nlink(inode);
+				btrfs_release_path(path);
+
+				ret = btrfs_unlink_inode(trans, root, dir,
+							 inode, victim_name,
+							 victim_name_len);
+				kfree(victim_name);
+				if (ret)
+					return ret;
+				ret = btrfs_run_delayed_items(trans, root);
+				if (ret)
+					return ret;
+				*search_done = 1;
+				goto again;
+			}
+			kfree(victim_name);
+
+			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+		}
+
+		/*
+		 * NOTE: we have searched root tree and checked the
+		 * coresponding ref, it does not need to check again.
+		 */
+		*search_done = 1;
+	}
+	btrfs_release_path(path);
+
+	/* Same search but for extended refs */
+	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+					   inode_objectid, parent_objectid, 0,
+					   0);
+	if (!IS_ERR_OR_NULL(extref)) {
+		u32 item_size;
+		u32 cur_offset = 0;
+		unsigned long base;
+		struct inode *victim_parent;
+
+		leaf = path->nodes[0];
+
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *)(base + cur_offset);
+
+			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+				goto next;
+
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			if (!victim_name)
+				return -ENOMEM;
+			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
+					   victim_name_len);
+
+			search_key.objectid = inode_objectid;
+			search_key.type = BTRFS_INODE_EXTREF_KEY;
+			search_key.offset = btrfs_extref_hash(parent_objectid,
+							      victim_name,
+							      victim_name_len);
+			ret = 0;
+			if (!backref_in_log(log_root, &search_key,
+					    parent_objectid, victim_name,
+					    victim_name_len)) {
+				ret = -ENOENT;
+				victim_parent = read_one_inode(root,
+							       parent_objectid);
+				if (victim_parent) {
+					inc_nlink(inode);
+					btrfs_release_path(path);
+
+					ret = btrfs_unlink_inode(trans, root,
+								 victim_parent,
+								 inode,
+								 victim_name,
+								 victim_name_len);
+					if (!ret)
+						ret = btrfs_run_delayed_items(
+								  trans, root);
+				}
+				iput(victim_parent);
+				kfree(victim_name);
+				if (ret)
+					return ret;
+				*search_done = 1;
+				goto again;
+			}
+			kfree(victim_name);
+			if (ret)
+				return ret;
+next:
+			cur_offset += victim_name_len + sizeof(*extref);
+		}
+		*search_done = 1;
+	}
+	btrfs_release_path(path);
+
+	/* look for a conflicting sequence number */
+	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+					 ref_index, name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		if (ret)
+			return ret;
+	}
+	btrfs_release_path(path);
+
+	/* look for a conflicing name */
+	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
+				   name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		if (ret)
+			return ret;
+	}
+	btrfs_release_path(path);
+
+	return 0;
+}
+
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			     u32 *namelen, char **name, u64 *index,
+			     u64 *parent_objectid)
+{
+	struct btrfs_inode_extref *extref;
+
+	extref = (struct btrfs_inode_extref *)ref_ptr;
+
+	*namelen = btrfs_inode_extref_name_len(eb, extref);
+	*name = kmalloc(*namelen, GFP_NOFS);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
+			   *namelen);
+
+	*index = btrfs_inode_extref_index(eb, extref);
+	if (parent_objectid)
+		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+	return 0;
+}
+
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			  u32 *namelen, char **name, u64 *index)
+{
+	struct btrfs_inode_ref *ref;
+
+	ref = (struct btrfs_inode_ref *)ref_ptr;
+
+	*namelen = btrfs_inode_ref_name_len(eb, ref);
+	*name = kmalloc(*namelen, GFP_NOFS);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+
+	*index = btrfs_inode_ref_index(eb, ref);
+
+	return 0;
+}
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function.  (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  struct extent_buffer *eb, int slot,
+				  struct btrfs_key *key)
+{
+	struct inode *dir = NULL;
+	struct inode *inode = NULL;
+	unsigned long ref_ptr;
+	unsigned long ref_end;
+	char *name = NULL;
+	int namelen;
+	int ret;
+	int search_done = 0;
+	int log_ref_ver = 0;
+	u64 parent_objectid;
+	u64 inode_objectid;
+	u64 ref_index = 0;
+	int ref_struct_size;
+
+	ref_ptr = btrfs_item_ptr_offset(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+	if (key->type == BTRFS_INODE_EXTREF_KEY) {
+		struct btrfs_inode_extref *r;
+
+		ref_struct_size = sizeof(struct btrfs_inode_extref);
+		log_ref_ver = 1;
+		r = (struct btrfs_inode_extref *)ref_ptr;
+		parent_objectid = btrfs_inode_extref_parent(eb, r);
+	} else {
+		ref_struct_size = sizeof(struct btrfs_inode_ref);
+		parent_objectid = key->offset;
+	}
+	inode_objectid = key->objectid;
+
+	/*
+	 * it is possible that we didn't log all the parent directories
+	 * for a given inode.  If we don't find the dir, just don't
+	 * copy the back ref in.  The link count fixup code will take
+	 * care of the rest
+	 */
+	dir = read_one_inode(root, parent_objectid);
+	if (!dir) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	inode = read_one_inode(root, inode_objectid);
+	if (!inode) {
+		ret = -EIO;
+		goto out;
+	}
+
+	while (ref_ptr < ref_end) {
+		if (log_ref_ver) {
+			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+						&ref_index, &parent_objectid);
+			/*
+			 * parent object can change from one array
+			 * item to another.
+			 */
+			if (!dir)
+				dir = read_one_inode(root, parent_objectid);
+			if (!dir) {
+				ret = -ENOENT;
+				goto out;
+			}
+		} else {
+			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+					     &ref_index);
+		}
+		if (ret)
+			goto out;
+
+		/* if we already have a perfect match, we're done */
+		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+				  ref_index, name, namelen)) {
+			/*
+			 * look for a conflicting back reference in the
+			 * metadata. if we find one we have to unlink that name
+			 * of the file before we add our new link.  Later on, we
+			 * overwrite any existing back reference, and we don't
+			 * want to create dangling pointers in the directory.
+			 */
+
+			if (!search_done) {
+				ret = __add_inode_ref(trans, root, path, log,
+						      dir, inode, eb,
+						      inode_objectid,
+						      parent_objectid,
+						      ref_index, name, namelen,
+						      &search_done);
+				if (ret) {
+					if (ret == 1)
+						ret = 0;
+					goto out;
+				}
+			}
+
+			/* insert our name */
+			ret = btrfs_add_link(trans, dir, inode, name, namelen,
+					     0, ref_index);
+			if (ret)
+				goto out;
+
+			btrfs_update_inode(trans, root, inode);
+		}
+
+		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
+		kfree(name);
+		name = NULL;
+		if (log_ref_ver) {
+			iput(dir);
+			dir = NULL;
+		}
+	}
+
+	/* finally write the back reference in the inode */
+	ret = overwrite_item(trans, root, path, eb, slot, key);
+out:
+	btrfs_release_path(path);
+	kfree(name);
+	iput(dir);
+	iput(inode);
+	return ret;
+}
+
+static int insert_orphan_item(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, u64 ino)
+{
+	int ret;
+
+	ret = btrfs_insert_orphan_item(trans, root, ino);
+	if (ret == -EEXIST)
+		ret = 0;
+
+	return ret;
+}
+
+static int count_inode_extrefs(struct btrfs_root *root,
+			       struct inode *inode, struct btrfs_path *path)
+{
+	int ret = 0;
+	int name_len;
+	unsigned int nlink = 0;
+	u32 item_size;
+	u32 cur_offset = 0;
+	u64 inode_objectid = btrfs_ino(inode);
+	u64 offset = 0;
+	unsigned long ptr;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+
+	while (1) {
+		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+					    &extref, &offset);
+		if (ret)
+			break;
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		cur_offset = 0;
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+			name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			nlink++;
+
+			cur_offset += name_len + sizeof(*extref);
+		}
+
+		offset++;
+		btrfs_release_path(path);
+	}
+	btrfs_release_path(path);
+
+	if (ret < 0 && ret != -ENOENT)
+		return ret;
+	return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+			       struct inode *inode, struct btrfs_path *path)
+{
+	int ret;
+	struct btrfs_key key;
+	unsigned int nlink = 0;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	int name_len;
+	u64 ino = btrfs_ino(inode);
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+process_slot:
+		btrfs_item_key_to_cpu(path->nodes[0], &key,
+				      path->slots[0]);
+		if (key.objectid != ino ||
+		    key.type != BTRFS_INODE_REF_KEY)
+			break;
+		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+						   path->slots[0]);
+		while (ptr < ptr_end) {
+			struct btrfs_inode_ref *ref;
+
+			ref = (struct btrfs_inode_ref *)ptr;
+			name_len = btrfs_inode_ref_name_len(path->nodes[0],
+							    ref);
+			ptr = (unsigned long)(ref + 1) + name_len;
+			nlink++;
+		}
+
+		if (key.offset == 0)
+			break;
+		if (path->slots[0] > 0) {
+			path->slots[0]--;
+			goto process_slot;
+		}
+		key.offset--;
+		btrfs_release_path(path);
+	}
+	btrfs_release_path(path);
+
+	return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct inode *inode)
+{
+	struct btrfs_path *path;
+	int ret;
+	u64 nlink = 0;
+	u64 ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = count_inode_refs(root, inode, path);
+	if (ret < 0)
+		goto out;
+
+	nlink = ret;
+
+	ret = count_inode_extrefs(root, inode, path);
+	if (ret < 0)
+		goto out;
+
+	nlink += ret;
+
+	ret = 0;
+
+	if (nlink != inode->i_nlink) {
+		set_nlink(inode, nlink);
+		btrfs_update_inode(trans, root, inode);
+	}
+	BTRFS_I(inode)->index_cnt = (u64)-1;
+
+	if (inode->i_nlink == 0) {
+		if (S_ISDIR(inode->i_mode)) {
+			ret = replay_dir_deletes(trans, root, NULL, path,
+						 ino, 1);
+			if (ret)
+				goto out;
+		}
+		ret = insert_orphan_item(trans, root, ino);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    struct btrfs_path *path)
+{
+	int ret;
+	struct btrfs_key key;
+	struct inode *inode;
+
+	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = (u64)-1;
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			break;
+
+		if (ret == 1) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
+		    key.type != BTRFS_ORPHAN_ITEM_KEY)
+			break;
+
+		ret = btrfs_del_item(trans, root, path);
+		if (ret)
+			goto out;
+
+		btrfs_release_path(path);
+		inode = read_one_inode(root, key.offset);
+		if (!inode)
+			return -EIO;
+
+		ret = fixup_inode_link_count(trans, root, inode);
+		iput(inode);
+		if (ret)
+			goto out;
+
+		/*
+		 * fixup on a directory may create new entries,
+		 * make sure we always look for the highset possible
+		 * offset
+		 */
+		key.offset = (u64)-1;
+	}
+	ret = 0;
+out:
+	btrfs_release_path(path);
+	return ret;
+}
+
+
+/*
+ * record a given inode in the fixup dir so we can check its link
+ * count when replay is done.  The link count is incremented here
+ * so the inode won't go away until we check it
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      u64 objectid)
+{
+	struct btrfs_key key;
+	int ret = 0;
+	struct inode *inode;
+
+	inode = read_one_inode(root, objectid);
+	if (!inode)
+		return -EIO;
+
+	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = objectid;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+	btrfs_release_path(path);
+	if (ret == 0) {
+		if (!inode->i_nlink)
+			set_nlink(inode, 1);
+		else
+			inc_nlink(inode);
+		ret = btrfs_update_inode(trans, root, inode);
+	} else if (ret == -EEXIST) {
+		ret = 0;
+	} else {
+		BUG(); /* Logic Error */
+	}
+	iput(inode);
+
+	return ret;
+}
+
+/*
+ * when replaying the log for a directory, we only insert names
+ * for inodes that actually exist.  This means an fsync on a directory
+ * does not implicitly fsync all the new files in it
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    u64 dirid, u64 index,
+				    char *name, int name_len, u8 type,
+				    struct btrfs_key *location)
+{
+	struct inode *inode;
+	struct inode *dir;
+	int ret;
+
+	inode = read_one_inode(root, location->objectid);
+	if (!inode)
+		return -ENOENT;
+
+	dir = read_one_inode(root, dirid);
+	if (!dir) {
+		iput(inode);
+		return -EIO;
+	}
+
+	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
+
+	/* FIXME, put inode into FIXUP list */
+
+	iput(inode);
+	iput(dir);
+	return ret;
+}
+
+/*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+			    const char *name, const int name_len,
+			    const u64 dirid, const u64 ino)
+{
+	struct btrfs_key search_key;
+
+	search_key.objectid = ino;
+	search_key.type = BTRFS_INODE_REF_KEY;
+	search_key.offset = dirid;
+	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+		return true;
+
+	search_key.type = BTRFS_INODE_EXTREF_KEY;
+	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+		return true;
+
+	return false;
+}
+
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped.  fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *eb,
+				    struct btrfs_dir_item *di,
+				    struct btrfs_key *key)
+{
+	char *name;
+	int name_len;
+	struct btrfs_dir_item *dst_di;
+	struct btrfs_key found_key;
+	struct btrfs_key log_key;
+	struct inode *dir;
+	u8 log_type;
+	int exists;
+	int ret = 0;
+	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+
+	dir = read_one_inode(root, key->objectid);
+	if (!dir)
+		return -EIO;
+
+	name_len = btrfs_dir_name_len(eb, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (!name) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	log_type = btrfs_dir_type(eb, di);
+	read_extent_buffer(eb, name, (unsigned long)(di + 1),
+		   name_len);
+
+	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+	if (exists == 0)
+		exists = 1;
+	else
+		exists = 0;
+	btrfs_release_path(path);
+
+	if (key->type == BTRFS_DIR_ITEM_KEY) {
+		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+				       name, name_len, 1);
+	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
+		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+						     key->objectid,
+						     key->offset, name,
+						     name_len, 1);
+	} else {
+		/* Corruption */
+		ret = -EINVAL;
+		goto out;
+	}
+	if (IS_ERR_OR_NULL(dst_di)) {
+		/* we need a sequence number to insert, so we only
+		 * do inserts for the BTRFS_DIR_INDEX_KEY types
+		 */
+		if (key->type != BTRFS_DIR_INDEX_KEY)
+			goto out;
+		goto insert;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+	/* the existing item matches the logged item */
+	if (found_key.objectid == log_key.objectid &&
+	    found_key.type == log_key.type &&
+	    found_key.offset == log_key.offset &&
+	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+		update_size = false;
+		goto out;
+	}
+
+	/*
+	 * don't drop the conflicting directory entry if the inode
+	 * for the new entry doesn't exist
+	 */
+	if (!exists)
+		goto out;
+
+	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+	if (ret)
+		goto out;
+
+	if (key->type == BTRFS_DIR_INDEX_KEY)
+		goto insert;
+out:
+	btrfs_release_path(path);
+	if (!ret && update_size) {
+		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
+		ret = btrfs_update_inode(trans, root, dir);
+	}
+	kfree(name);
+	iput(dir);
+	return ret;
+
+insert:
+	if (name_in_log_ref(root->log_root, name, name_len,
+			    key->objectid, log_key.objectid)) {
+		/* The dentry will be added later. */
+		ret = 0;
+		update_size = false;
+		goto out;
+	}
+	btrfs_release_path(path);
+	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+			      name, name_len, log_type, &log_key);
+	if (ret && ret != -ENOENT && ret != -EEXIST)
+		goto out;
+	update_size = false;
+	ret = 0;
+	goto out;
+}
+
+/*
+ * find all the names in a directory item and reconcile them into
+ * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
+ * one name in a directory item, but the same code gets used for
+ * both directory index types
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct extent_buffer *eb, int slot,
+					struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	struct btrfs_dir_item *di;
+	int name_len;
+	unsigned long ptr;
+	unsigned long ptr_end;
+
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	ptr_end = ptr + item_size;
+	while (ptr < ptr_end) {
+		di = (struct btrfs_dir_item *)ptr;
+		if (verify_dir_item(root, eb, di))
+			return -EIO;
+		name_len = btrfs_dir_name_len(eb, di);
+		ret = replay_one_name(trans, root, path, eb, di, key);
+		if (ret)
+			return ret;
+		ptr = (unsigned long)(di + 1);
+		ptr += name_len;
+	}
+	return 0;
+}
+
+/*
+ * directory replay has two parts.  There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for.  During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   u64 dirid, int key_type,
+				   u64 *start_ret, u64 *end_ret)
+{
+	struct btrfs_key key;
+	u64 found_end;
+	struct btrfs_dir_log_item *item;
+	int ret;
+	int nritems;
+
+	if (*start_ret == (u64)-1)
+		return 1;
+
+	key.objectid = dirid;
+	key.type = key_type;
+	key.offset = *start_ret;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+	if (ret != 0)
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto next;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+	if (*start_ret >= key.offset && *start_ret <= found_end) {
+		ret = 0;
+		*start_ret = key.offset;
+		*end_ret = found_end;
+		goto out;
+	}
+	ret = 1;
+next:
+	/* check the next slot in the tree to see if it is a valid item */
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	if (path->slots[0] >= nritems) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret)
+			goto out;
+	} else {
+		path->slots[0]++;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto out;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+	*start_ret = key.offset;
+	*end_ret = found_end;
+	ret = 0;
+out:
+	btrfs_release_path(path);
+	return ret;
+}
+
+/*
+ * this looks for a given directory item in the log.  If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_root *log,
+				      struct btrfs_path *path,
+				      struct btrfs_path *log_path,
+				      struct inode *dir,
+				      struct btrfs_key *dir_key)
+{
+	int ret;
+	struct extent_buffer *eb;
+	int slot;
+	u32 item_size;
+	struct btrfs_dir_item *di;
+	struct btrfs_dir_item *log_di;
+	int name_len;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	char *name;
+	struct inode *inode;
+	struct btrfs_key location;
+
+again:
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	ptr_end = ptr + item_size;
+	while (ptr < ptr_end) {
+		di = (struct btrfs_dir_item *)ptr;
+		if (verify_dir_item(root, eb, di)) {
+			ret = -EIO;
+			goto out;
+		}
+
+		name_len = btrfs_dir_name_len(eb, di);
+		name = kmalloc(name_len, GFP_NOFS);
+		if (!name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		read_extent_buffer(eb, name, (unsigned long)(di + 1),
+				  name_len);
+		log_di = NULL;
+		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
+			log_di = btrfs_lookup_dir_item(trans, log, log_path,
+						       dir_key->objectid,
+						       name, name_len, 0);
+		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
+			log_di = btrfs_lookup_dir_index_item(trans, log,
+						     log_path,
+						     dir_key->objectid,
+						     dir_key->offset,
+						     name, name_len, 0);
+		}
+		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
+			btrfs_dir_item_key_to_cpu(eb, di, &location);
+			btrfs_release_path(path);
+			btrfs_release_path(log_path);
+			inode = read_one_inode(root, location.objectid);
+			if (!inode) {
+				kfree(name);
+				return -EIO;
+			}
+
+			ret = link_to_fixup_dir(trans, root,
+						path, location.objectid);
+			if (ret) {
+				kfree(name);
+				iput(inode);
+				goto out;
+			}
+
+			inc_nlink(inode);
+			ret = btrfs_unlink_inode(trans, root, dir, inode,
+						 name, name_len);
+			if (!ret)
+				ret = btrfs_run_delayed_items(trans, root);
+			kfree(name);
+			iput(inode);
+			if (ret)
+				goto out;
+
+			/* there might still be more names under this key
+			 * check and repeat if required
+			 */
+			ret = btrfs_search_slot(NULL, root, dir_key, path,
+						0, 0);
+			if (ret == 0)
+				goto again;
+			ret = 0;
+			goto out;
+		} else if (IS_ERR(log_di)) {
+			kfree(name);
+			return PTR_ERR(log_di);
+		}
+		btrfs_release_path(log_path);
+		kfree(name);
+
+		ptr = (unsigned long)(di + 1);
+		ptr += name_len;
+	}
+	ret = 0;
+out:
+	btrfs_release_path(path);
+	btrfs_release_path(log_path);
+	return ret;
+}
+
+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_root *log,
+			      struct btrfs_path *path,
+			      const u64 ino)
+{
+	struct btrfs_key search_key;
+	struct btrfs_path *log_path;
+	int i;
+	int nritems;
+	int ret;
+
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	search_key.objectid = ino;
+	search_key.type = BTRFS_XATTR_ITEM_KEY;
+	search_key.offset = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+process_leaf:
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	for (i = path->slots[0]; i < nritems; i++) {
+		struct btrfs_key key;
+		struct btrfs_dir_item *di;
+		struct btrfs_dir_item *log_di;
+		u32 total_size;
+		u32 cur;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+		total_size = btrfs_item_size_nr(path->nodes[0], i);
+		cur = 0;
+		while (cur < total_size) {
+			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+			u32 this_len = sizeof(*di) + name_len + data_len;
+			char *name;
+
+			name = kmalloc(name_len, GFP_NOFS);
+			if (!name) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			read_extent_buffer(path->nodes[0], name,
+					   (unsigned long)(di + 1), name_len);
+
+			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+						    name, name_len, 0);
+			btrfs_release_path(log_path);
+			if (!log_di) {
+				/* Doesn't exist in log tree, so delete it. */
+				btrfs_release_path(path);
+				di = btrfs_lookup_xattr(trans, root, path, ino,
+							name, name_len, -1);
+				kfree(name);
+				if (IS_ERR(di)) {
+					ret = PTR_ERR(di);
+					goto out;
+				}
+				ASSERT(di);
+				ret = btrfs_delete_one_dir_name(trans, root,
+								path, di);
+				if (ret)
+					goto out;
+				btrfs_release_path(path);
+				search_key = key;
+				goto again;
+			}
+			kfree(name);
+			if (IS_ERR(log_di)) {
+				ret = PTR_ERR(log_di);
+				goto out;
+			}
+			cur += this_len;
+			di = (struct btrfs_dir_item *)((char *)di + this_len);
+		}
+	}
+	ret = btrfs_next_leaf(root, path);
+	if (ret > 0)
+		ret = 0;
+	else if (ret == 0)
+		goto process_leaf;
+out:
+	btrfs_free_path(log_path);
+	btrfs_release_path(path);
+	return ret;
+}
+
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes.  It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       u64 dirid, int del_all)
+{
+	u64 range_start;
+	u64 range_end;
+	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+	int ret = 0;
+	struct btrfs_key dir_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *log_path;
+	struct inode *dir;
+
+	dir_key.objectid = dirid;
+	dir_key.type = BTRFS_DIR_ITEM_KEY;
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	dir = read_one_inode(root, dirid);
+	/* it isn't an error if the inode isn't there, that can happen
+	 * because we replay the deletes before we copy in the inode item
+	 * from the log
+	 */
+	if (!dir) {
+		btrfs_free_path(log_path);
+		return 0;
+	}
+again:
+	range_start = 0;
+	range_end = 0;
+	while (1) {
+		if (del_all)
+			range_end = (u64)-1;
+		else {
+			ret = find_dir_range(log, path, dirid, key_type,
+					     &range_start, &range_end);
+			if (ret != 0)
+				break;
+		}
+
+		dir_key.offset = range_start;
+		while (1) {
+			int nritems;
+			ret = btrfs_search_slot(NULL, root, &dir_key, path,
+						0, 0);
+			if (ret < 0)
+				goto out;
+
+			nritems = btrfs_header_nritems(path->nodes[0]);
+			if (path->slots[0] >= nritems) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+			if (found_key.objectid != dirid ||
+			    found_key.type != dir_key.type)
+				goto next_type;
+
+			if (found_key.offset > range_end)
+				break;
+
+			ret = check_item_in_log(trans, root, log, path,
+						log_path, dir,
+						&found_key);
+			if (ret)
+				goto out;
+			if (found_key.offset == (u64)-1)
+				break;
+			dir_key.offset = found_key.offset + 1;
+		}
+		btrfs_release_path(path);
+		if (range_end == (u64)-1)
+			break;
+		range_start = range_end + 1;
+	}
+
+next_type:
+	ret = 0;
+	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+		key_type = BTRFS_DIR_LOG_INDEX_KEY;
+		dir_key.type = BTRFS_DIR_INDEX_KEY;
+		btrfs_release_path(path);
+		goto again;
+	}
+out:
+	btrfs_release_path(path);
+	btrfs_free_path(log_path);
+	iput(dir);
+	return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree.  This
+ * gets called in two different stages.  The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume.  The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+			     struct walk_control *wc, u64 gen)
+{
+	int nritems;
+	struct btrfs_path *path;
+	struct btrfs_root *root = wc->replay_dest;
+	struct btrfs_key key;
+	int level;
+	int i;
+	int ret;
+
+	ret = btrfs_read_buffer(eb, gen);
+	if (ret)
+		return ret;
+
+	level = btrfs_header_level(eb);
+
+	if (level != 0)
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = 0; i < nritems; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+
+		/* inode keys are done during the first stage */
+		if (key.type == BTRFS_INODE_ITEM_KEY &&
+		    wc->stage == LOG_WALK_REPLAY_INODES) {
+			struct btrfs_inode_item *inode_item;
+			u32 mode;
+
+			inode_item = btrfs_item_ptr(eb, i,
+					    struct btrfs_inode_item);
+			ret = replay_xattr_deletes(wc->trans, root, log,
+						   path, key.objectid);
+			if (ret)
+				break;
+			mode = btrfs_inode_mode(eb, inode_item);
+			if (S_ISDIR(mode)) {
+				ret = replay_dir_deletes(wc->trans,
+					 root, log, path, key.objectid, 0);
+				if (ret)
+					break;
+			}
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			if (ret)
+				break;
+
+			/* for regular files, make sure corresponding
+			 * orhpan item exist. extents past the new EOF
+			 * will be truncated later by orphan cleanup.
+			 */
+			if (S_ISREG(mode)) {
+				ret = insert_orphan_item(wc->trans, root,
+							 key.objectid);
+				if (ret)
+					break;
+			}
+
+			ret = link_to_fixup_dir(wc->trans, root,
+						path, key.objectid);
+			if (ret)
+				break;
+		}
+
+		if (key.type == BTRFS_DIR_INDEX_KEY &&
+		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
+			ret = replay_one_dir_item(wc->trans, root, path,
+						  eb, i, &key);
+			if (ret)
+				break;
+		}
+
+		if (wc->stage < LOG_WALK_REPLAY_ALL)
+			continue;
+
+		/* these keys are simply copied */
+		if (key.type == BTRFS_XATTR_ITEM_KEY) {
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			if (ret)
+				break;
+		} else if (key.type == BTRFS_INODE_REF_KEY ||
+			   key.type == BTRFS_INODE_EXTREF_KEY) {
+			ret = add_inode_ref(wc->trans, root, log, path,
+					    eb, i, &key);
+			if (ret && ret != -ENOENT)
+				break;
+			ret = 0;
+		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+			ret = replay_one_extent(wc->trans, root, path,
+						eb, i, &key);
+			if (ret)
+				break;
+		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
+			ret = replay_one_dir_item(wc->trans, root, path,
+						  eb, i, &key);
+			if (ret)
+				break;
+		}
+	}
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path, int *level,
+				   struct walk_control *wc)
+{
+	u64 root_owner;
+	u64 bytenr;
+	u64 ptr_gen;
+	struct extent_buffer *next;
+	struct extent_buffer *cur;
+	struct extent_buffer *parent;
+	u32 blocksize;
+	int ret = 0;
+
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	while (*level > 0) {
+		WARN_ON(*level < 0);
+		WARN_ON(*level >= BTRFS_MAX_LEVEL);
+		cur = path->nodes[*level];
+
+		WARN_ON(btrfs_header_level(cur) != *level);
+
+		if (path->slots[*level] >=
+		    btrfs_header_nritems(cur))
+			break;
+
+		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+		blocksize = root->nodesize;
+
+		parent = path->nodes[*level];
+		root_owner = btrfs_header_owner(parent);
+
+		next = btrfs_find_create_tree_block(root, bytenr);
+		if (!next)
+			return -ENOMEM;
+
+		if (*level == 1) {
+			ret = wc->process_func(root, next, wc, ptr_gen);
+			if (ret) {
+				free_extent_buffer(next);
+				return ret;
+			}
+
+			path->slots[*level]++;
+			if (wc->free) {
+				ret = btrfs_read_buffer(next, ptr_gen);
+				if (ret) {
+					free_extent_buffer(next);
+					return ret;
+				}
+
+				if (trans) {
+					btrfs_tree_lock(next);
+					btrfs_set_lock_blocking(next);
+					clean_tree_block(trans, root->fs_info,
+							next);
+					btrfs_wait_tree_block_writeback(next);
+					btrfs_tree_unlock(next);
+				}
+
+				WARN_ON(root_owner !=
+					BTRFS_TREE_LOG_OBJECTID);
+				ret = btrfs_free_and_pin_reserved_extent(root,
+							 bytenr, blocksize);
+				if (ret) {
+					free_extent_buffer(next);
+					return ret;
+				}
+			}
+			free_extent_buffer(next);
+			continue;
+		}
+		ret = btrfs_read_buffer(next, ptr_gen);
+		if (ret) {
+			free_extent_buffer(next);
+			return ret;
+		}
+
+		WARN_ON(*level <= 0);
+		if (path->nodes[*level-1])
+			free_extent_buffer(path->nodes[*level-1]);
+		path->nodes[*level-1] = next;
+		*level = btrfs_header_level(next);
+		path->slots[*level] = 0;
+		cond_resched();
+	}
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
+
+	cond_resched();
+	return 0;
+}
+
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, int *level,
+				 struct walk_control *wc)
+{
+	u64 root_owner;
+	int i;
+	int slot;
+	int ret;
+
+	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+		slot = path->slots[i];
+		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
+			path->slots[i]++;
+			*level = i;
+			WARN_ON(*level == 0);
+			return 0;
+		} else {
+			struct extent_buffer *parent;
+			if (path->nodes[*level] == root->node)
+				parent = path->nodes[*level];
+			else
+				parent = path->nodes[*level + 1];
+
+			root_owner = btrfs_header_owner(parent);
+			ret = wc->process_func(root, path->nodes[*level], wc,
+				 btrfs_header_generation(path->nodes[*level]));
+			if (ret)
+				return ret;
+
+			if (wc->free) {
+				struct extent_buffer *next;
+
+				next = path->nodes[*level];
+
+				if (trans) {
+					btrfs_tree_lock(next);
+					btrfs_set_lock_blocking(next);
+					clean_tree_block(trans, root->fs_info,
+							next);
+					btrfs_wait_tree_block_writeback(next);
+					btrfs_tree_unlock(next);
+				}
+
+				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+				ret = btrfs_free_and_pin_reserved_extent(root,
+						path->nodes[*level]->start,
+						path->nodes[*level]->len);
+				if (ret)
+					return ret;
+			}
+			free_extent_buffer(path->nodes[*level]);
+			path->nodes[*level] = NULL;
+			*level = i + 1;
+		}
+	}
+	return 1;
+}
+
+/*
+ * drop the reference count on the tree rooted at 'snap'.  This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *log, struct walk_control *wc)
+{
+	int ret = 0;
+	int wret;
+	int level;
+	struct btrfs_path *path;
+	int orig_level;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	level = btrfs_header_level(log->node);
+	orig_level = level;
+	path->nodes[level] = log->node;
+	extent_buffer_get(log->node);
+	path->slots[level] = 0;
+
+	while (1) {
+		wret = walk_down_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0) {
+			ret = wret;
+			goto out;
+		}
+
+		wret = walk_up_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0) {
+			ret = wret;
+			goto out;
+		}
+	}
+
+	/* was the root node processed? if not, catch it here */
+	if (path->nodes[orig_level]) {
+		ret = wc->process_func(log, path->nodes[orig_level], wc,
+			 btrfs_header_generation(path->nodes[orig_level]));
+		if (ret)
+			goto out;
+		if (wc->free) {
+			struct extent_buffer *next;
+
+			next = path->nodes[orig_level];
+
+			if (trans) {
+				btrfs_tree_lock(next);
+				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, log->fs_info, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+			}
+
+			WARN_ON(log->root_key.objectid !=
+				BTRFS_TREE_LOG_OBJECTID);
+			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
+							 next->len);
+			if (ret)
+				goto out;
+		}
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	int ret;
+
+	if (log->log_transid == 1) {
+		/* insert root item on the first sync */
+		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	} else {
+		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	}
+	return ret;
+}
+
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, int transid)
+{
+	DEFINE_WAIT(wait);
+	int index = transid % 2;
+
+	/*
+	 * we only allow two pending log transactions at a time,
+	 * so we know that if ours is more than 2 older than the
+	 * current transaction, we're done
+	 */
+	do {
+		prepare_to_wait(&root->log_commit_wait[index],
+				&wait, TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&root->log_mutex);
+
+		if (root->log_transid_committed < transid &&
+		    atomic_read(&root->log_commit[index]))
+			schedule();
+
+		finish_wait(&root->log_commit_wait[index], &wait);
+		mutex_lock(&root->log_mutex);
+	} while (root->log_transid_committed < transid &&
+		 atomic_read(&root->log_commit[index]));
+}
+
+static void wait_for_writer(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	DEFINE_WAIT(wait);
+
+	while (atomic_read(&root->log_writers)) {
+		prepare_to_wait(&root->log_writer_wait,
+				&wait, TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&root->log_mutex);
+		if (atomic_read(&root->log_writers))
+			schedule();
+		finish_wait(&root->log_writer_wait, &wait);
+		mutex_lock(&root->log_mutex);
+	}
+}
+
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+					struct btrfs_log_ctx *ctx)
+{
+	if (!ctx)
+		return;
+
+	mutex_lock(&root->log_mutex);
+	list_del_init(&ctx->list);
+	mutex_unlock(&root->log_mutex);
+}
+
+/* 
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+					     int index, int error)
+{
+	struct btrfs_log_ctx *ctx;
+
+	if (!error) {
+		INIT_LIST_HEAD(&root->log_ctxs[index]);
+		return;
+	}
+
+	list_for_each_entry(ctx, &root->log_ctxs[index], list)
+		ctx->log_ret = error;
+
+	INIT_LIST_HEAD(&root->log_ctxs[index]);
+}
+
+/*
+ * btrfs_sync_log does sends a given tree log down to the disk and
+ * updates the super blocks to record it.  When this call is done,
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
+{
+	int index1;
+	int index2;
+	int mark;
+	int ret;
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+	int log_transid = 0;
+	struct btrfs_log_ctx root_log_ctx;
+	struct blk_plug plug;
+
+	mutex_lock(&root->log_mutex);
+	log_transid = ctx->log_transid;
+	if (root->log_transid_committed >= log_transid) {
+		mutex_unlock(&root->log_mutex);
+		return ctx->log_ret;
+	}
+
+	index1 = log_transid % 2;
+	if (atomic_read(&root->log_commit[index1])) {
+		wait_log_commit(trans, root, log_transid);
+		mutex_unlock(&root->log_mutex);
+		return ctx->log_ret;
+	}
+	ASSERT(log_transid == root->log_transid);
+	atomic_set(&root->log_commit[index1], 1);
+
+	/* wait for previous tree log sync to complete */
+	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
+		wait_log_commit(trans, root, log_transid - 1);
+
+	while (1) {
+		int batch = atomic_read(&root->log_batch);
+		/* when we're on an ssd, just kick the log commit out */
+		if (!btrfs_test_opt(root, SSD) &&
+		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
+			mutex_unlock(&root->log_mutex);
+			schedule_timeout_uninterruptible(1);
+			mutex_lock(&root->log_mutex);
+		}
+		wait_for_writer(trans, root);
+		if (batch == atomic_read(&root->log_batch))
+			break;
+	}
+
+	/* bail out if we need to do a full commit */
+	if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+		ret = -EAGAIN;
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&root->log_mutex);
+		goto out;
+	}
+
+	if (log_transid % 2 == 0)
+		mark = EXTENT_DIRTY;
+	else
+		mark = EXTENT_NEW;
+
+	/* we start IO on  all the marked extents here, but we don't actually
+	 * wait for them until later.
+	 */
+	blk_start_plug(&plug);
+	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
+	if (ret) {
+		blk_finish_plug(&plug);
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_free_logged_extents(log, log_transid);
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		mutex_unlock(&root->log_mutex);
+		goto out;
+	}
+
+	btrfs_set_root_node(&log->root_item, log->node);
+
+	root->log_transid++;
+	log->log_transid = root->log_transid;
+	root->log_start_pid = 0;
+	/*
+	 * IO has been started, blocks of the log tree have WRITTEN flag set
+	 * in their headers. new modifications of the log will be written to
+	 * new positions. so it's safe to allow log writers to go in.
+	 */
+	mutex_unlock(&root->log_mutex);
+
+	btrfs_init_log_ctx(&root_log_ctx);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	atomic_inc(&log_root_tree->log_batch);
+	atomic_inc(&log_root_tree->log_writers);
+
+	index2 = log_root_tree->log_transid % 2;
+	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+	root_log_ctx.log_transid = log_root_tree->log_transid;
+
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	ret = update_log_root(trans, log);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
+		smp_mb();
+		if (waitqueue_active(&log_root_tree->log_writer_wait))
+			wake_up(&log_root_tree->log_writer_wait);
+	}
+
+	if (ret) {
+		if (!list_empty(&root_log_ctx.list))
+			list_del_init(&root_log_ctx.list);
+
+		blk_finish_plug(&plug);
+		btrfs_set_log_full_commit(root->fs_info, trans);
+
+		if (ret != -ENOSPC) {
+			btrfs_abort_transaction(trans, root, ret);
+			mutex_unlock(&log_root_tree->log_mutex);
+			goto out;
+		}
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+		blk_finish_plug(&plug);
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = root_log_ctx.log_ret;
+		goto out;
+	}
+
+	index2 = root_log_ctx.log_transid % 2;
+	if (atomic_read(&log_root_tree->log_commit[index2])) {
+		blk_finish_plug(&plug);
+		ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
+						mark);
+		btrfs_wait_logged_extents(trans, log, log_transid);
+		wait_log_commit(trans, log_root_tree,
+				root_log_ctx.log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		if (!ret)
+			ret = root_log_ctx.log_ret;
+		goto out;
+	}
+	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
+	atomic_set(&log_root_tree->log_commit[index2], 1);
+
+	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+		wait_log_commit(trans, log_root_tree,
+				root_log_ctx.log_transid - 1);
+	}
+
+	wait_for_writer(trans, log_root_tree);
+
+	/*
+	 * now that we've moved on to the tree of log tree roots,
+	 * check the full commit flag again
+	 */
+	if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+		blk_finish_plug(&plug);
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = -EAGAIN;
+		goto out_wake_log_root;
+	}
+
+	ret = btrfs_write_marked_extents(log_root_tree,
+					 &log_root_tree->dirty_log_pages,
+					 EXTENT_DIRTY | EXTENT_NEW);
+	blk_finish_plug(&plug);
+	if (ret) {
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		goto out_wake_log_root;
+	}
+	ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+	if (!ret)
+		ret = btrfs_wait_marked_extents(log_root_tree,
+						&log_root_tree->dirty_log_pages,
+						EXTENT_NEW | EXTENT_DIRTY);
+	if (ret) {
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		goto out_wake_log_root;
+	}
+	btrfs_wait_logged_extents(trans, log, log_transid);
+
+	btrfs_set_super_log_root(root->fs_info->super_for_commit,
+				log_root_tree->node->start);
+	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
+				btrfs_header_level(log_root_tree->node));
+
+	log_root_tree->log_transid++;
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	/*
+	 * nobody else is going to jump in and write the the ctree
+	 * super here because the log_commit atomic below is protecting
+	 * us.  We must be called with a transaction handle pinning
+	 * the running transaction open, so a full commit can't hop
+	 * in and cause problems either.
+	 */
+	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
+	if (ret) {
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_wake_log_root;
+	}
+
+	mutex_lock(&root->log_mutex);
+	if (root->last_log_commit < log_transid)
+		root->last_log_commit = log_transid;
+	mutex_unlock(&root->log_mutex);
+
+out_wake_log_root:
+	/*
+	 * We needn't get log_mutex here because we are sure all
+	 * the other tasks are blocked.
+	 */
+	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	log_root_tree->log_transid_committed++;
+	atomic_set(&log_root_tree->log_commit[index2], 0);
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
+		wake_up(&log_root_tree->log_commit_wait[index2]);
+out:
+	/* See above. */
+	btrfs_remove_all_log_ctxs(root, index1, ret);
+
+	mutex_lock(&root->log_mutex);
+	root->log_transid_committed++;
+	atomic_set(&root->log_commit[index1], 0);
+	mutex_unlock(&root->log_mutex);
+
+	if (waitqueue_active(&root->log_commit_wait[index1]))
+		wake_up(&root->log_commit_wait[index1]);
+	return ret;
+}
+
+static void free_log_tree(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *log)
+{
+	int ret;
+	u64 start;
+	u64 end;
+	struct walk_control wc = {
+		.free = 1,
+		.process_func = process_one_buffer
+	};
+
+	ret = walk_log_tree(trans, log, &wc);
+	/* I don't think this can happen but just in case */
+	if (ret)
+		btrfs_abort_transaction(trans, log, ret);
+
+	while (1) {
+		ret = find_first_extent_bit(&log->dirty_log_pages,
+				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+				NULL);
+		if (ret)
+			break;
+
+		clear_extent_bits(&log->dirty_log_pages, start, end,
+				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+	}
+
+	/*
+	 * We may have short-circuited the log tree with the full commit logic
+	 * and left ordered extents on our list, so clear these out to keep us
+	 * from leaking inodes and memory.
+	 */
+	btrfs_free_logged_extents(log, 0);
+	btrfs_free_logged_extents(log, 1);
+
+	free_extent_buffer(log->node);
+	kfree(log);
+}
+
+/*
+ * free all the extents used by the tree log.  This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+	if (root->log_root) {
+		free_log_tree(trans, root->log_root);
+		root->log_root = NULL;
+	}
+	return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->log_root_tree) {
+		free_log_tree(trans, fs_info->log_root_tree);
+		fs_info->log_root_tree = NULL;
+	}
+	return 0;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist.  But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked.  Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimizations allows us to avoid relogging the entire inode
+ * or the entire directory.
+ */
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 const char *name, int name_len,
+				 struct inode *dir, u64 index)
+{
+	struct btrfs_root *log;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	int ret;
+	int err = 0;
+	int bytes_del = 0;
+	u64 dir_ino = btrfs_ino(dir);
+
+	if (BTRFS_I(dir)->logged_trans < trans->transid)
+		return 0;
+
+	ret = join_running_log_trans(root);
+	if (ret)
+		return 0;
+
+	mutex_lock(&BTRFS_I(dir)->log_mutex);
+
+	log = root->log_root;
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
+				   name, name_len, -1);
+	if (IS_ERR(di)) {
+		err = PTR_ERR(di);
+		goto fail;
+	}
+	if (di) {
+		ret = btrfs_delete_one_dir_name(trans, log, path, di);
+		bytes_del += name_len;
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
+	}
+	btrfs_release_path(path);
+	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+					 index, name, name_len, -1);
+	if (IS_ERR(di)) {
+		err = PTR_ERR(di);
+		goto fail;
+	}
+	if (di) {
+		ret = btrfs_delete_one_dir_name(trans, log, path, di);
+		bytes_del += name_len;
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
+	}
+
+	/* update the directory size in the log to reflect the names
+	 * we have removed
+	 */
+	if (bytes_del) {
+		struct btrfs_key key;
+
+		key.objectid = dir_ino;
+		key.offset = 0;
+		key.type = BTRFS_INODE_ITEM_KEY;
+		btrfs_release_path(path);
+
+		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+		if (ret < 0) {
+			err = ret;
+			goto fail;
+		}
+		if (ret == 0) {
+			struct btrfs_inode_item *item;
+			u64 i_size;
+
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_inode_item);
+			i_size = btrfs_inode_size(path->nodes[0], item);
+			if (i_size > bytes_del)
+				i_size -= bytes_del;
+			else
+				i_size = 0;
+			btrfs_set_inode_size(path->nodes[0], item, i_size);
+			btrfs_mark_buffer_dirty(path->nodes[0]);
+		} else
+			ret = 0;
+		btrfs_release_path(path);
+	}
+fail:
+	btrfs_free_path(path);
+out_unlock:
+	mutex_unlock(&BTRFS_I(dir)->log_mutex);
+	if (ret == -ENOSPC) {
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		ret = 0;
+	} else if (ret < 0)
+		btrfs_abort_transaction(trans, root, ret);
+
+	btrfs_end_log_trans(root);
+
+	return err;
+}
+
+/* see comments for btrfs_del_dir_entries_in_log */
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       const char *name, int name_len,
+			       struct inode *inode, u64 dirid)
+{
+	struct btrfs_root *log;
+	u64 index;
+	int ret;
+
+	if (BTRFS_I(inode)->logged_trans < trans->transid)
+		return 0;
+
+	ret = join_running_log_trans(root);
+	if (ret)
+		return 0;
+	log = root->log_root;
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
+				  dirid, &index);
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+	if (ret == -ENOSPC) {
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		ret = 0;
+	} else if (ret < 0 && ret != -ENOENT)
+		btrfs_abort_transaction(trans, root, ret);
+	btrfs_end_log_trans(root);
+
+	return ret;
+}
+
+/*
+ * creates a range item in the log for 'dirid'.  first_offset and
+ * last_offset tell us which parts of the key space the log should
+ * be considered authoritative for.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       int key_type, u64 dirid,
+				       u64 first_offset, u64 last_offset)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_dir_log_item *item;
+
+	key.objectid = dirid;
+	key.offset = first_offset;
+	if (key_type == BTRFS_DIR_ITEM_KEY)
+		key.type = BTRFS_DIR_LOG_ITEM_KEY;
+	else
+		key.type = BTRFS_DIR_LOG_INDEX_KEY;
+	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
+	if (ret)
+		return ret;
+
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(path);
+	return 0;
+}
+
+/*
+ * log all the items included in the current transaction for a given
+ * directory.  This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path, int key_type,
+			  struct btrfs_log_ctx *ctx,
+			  u64 min_offset, u64 *last_offset_ret)
+{
+	struct btrfs_key min_key;
+	struct btrfs_root *log = root->log_root;
+	struct extent_buffer *src;
+	int err = 0;
+	int ret;
+	int i;
+	int nritems;
+	u64 first_offset = min_offset;
+	u64 last_offset = (u64)-1;
+	u64 ino = btrfs_ino(inode);
+
+	log = root->log_root;
+
+	min_key.objectid = ino;
+	min_key.type = key_type;
+	min_key.offset = min_offset;
+
+	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+
+	/*
+	 * we didn't find anything from this transaction, see if there
+	 * is anything at all
+	 */
+	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+		min_key.objectid = ino;
+		min_key.type = key_type;
+		min_key.offset = (u64)-1;
+		btrfs_release_path(path);
+		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+		if (ret < 0) {
+			btrfs_release_path(path);
+			return ret;
+		}
+		ret = btrfs_previous_item(root, path, ino, key_type);
+
+		/* if ret == 0 there are items for this type,
+		 * create a range to tell us the last key of this type.
+		 * otherwise, there are no items in this directory after
+		 * *min_offset, and we create a range to indicate that.
+		 */
+		if (ret == 0) {
+			struct btrfs_key tmp;
+			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+					      path->slots[0]);
+			if (key_type == tmp.type)
+				first_offset = max(min_offset, tmp.offset) + 1;
+		}
+		goto done;
+	}
+
+	/* go backward to find any previous key */
+	ret = btrfs_previous_item(root, path, ino, key_type);
+	if (ret == 0) {
+		struct btrfs_key tmp;
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (key_type == tmp.type) {
+			first_offset = tmp.offset;
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+			if (ret) {
+				err = ret;
+				goto done;
+			}
+		}
+	}
+	btrfs_release_path(path);
+
+	/* find the first key from this transaction again */
+	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+	if (WARN_ON(ret != 0))
+		goto done;
+
+	/*
+	 * we have a block from this transaction, log every item in it
+	 * from our directory
+	 */
+	while (1) {
+		struct btrfs_key tmp;
+		src = path->nodes[0];
+		nritems = btrfs_header_nritems(src);
+		for (i = path->slots[0]; i < nritems; i++) {
+			struct btrfs_dir_item *di;
+
+			btrfs_item_key_to_cpu(src, &min_key, i);
+
+			if (min_key.objectid != ino || min_key.type != key_type)
+				goto done;
+			ret = overwrite_item(trans, log, dst_path, src, i,
+					     &min_key);
+			if (ret) {
+				err = ret;
+				goto done;
+			}
+
+			/*
+			 * We must make sure that when we log a directory entry,
+			 * the corresponding inode, after log replay, has a
+			 * matching link count. For example:
+			 *
+			 * touch foo
+			 * mkdir mydir
+			 * sync
+			 * ln foo mydir/bar
+			 * xfs_io -c "fsync" mydir
+			 * <crash>
+			 * <mount fs and log replay>
+			 *
+			 * Would result in a fsync log that when replayed, our
+			 * file inode would have a link count of 1, but we get
+			 * two directory entries pointing to the same inode.
+			 * After removing one of the names, it would not be
+			 * possible to remove the other name, which resulted
+			 * always in stale file handle errors, and would not
+			 * be possible to rmdir the parent directory, since
+			 * its i_size could never decrement to the value
+			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+			 */
+			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+			btrfs_dir_item_key_to_cpu(src, di, &tmp);
+			if (ctx &&
+			    (btrfs_dir_transid(src, di) == trans->transid ||
+			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+			    tmp.type != BTRFS_ROOT_ITEM_KEY)
+				ctx->log_new_dentries = true;
+		}
+		path->slots[0] = nritems;
+
+		/*
+		 * look ahead to the next item and see if it is also
+		 * from this directory and from this transaction
+		 */
+		ret = btrfs_next_leaf(root, path);
+		if (ret == 1) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (tmp.objectid != ino || tmp.type != key_type) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+			if (ret)
+				err = ret;
+			else
+				last_offset = tmp.offset;
+			goto done;
+		}
+	}
+done:
+	btrfs_release_path(path);
+	btrfs_release_path(dst_path);
+
+	if (err == 0) {
+		*last_offset_ret = last_offset;
+		/*
+		 * insert the log range keys to indicate where the log
+		 * is valid
+		 */
+		ret = insert_dir_log_key(trans, log, path, key_type,
+					 ino, first_offset, last_offset);
+		if (ret)
+			err = ret;
+	}
+	return err;
+}
+
+/*
+ * logging directories is very similar to logging inodes, We find all the items
+ * from the current transaction and write them to the log.
+ *
+ * The recovery code scans the directory in the subvolume, and if it finds a
+ * key in the range logged that is not present in the log tree, then it means
+ * that dir entry was unlinked during the transaction.
+ *
+ * In order for that scan to work, we must include one key smaller than
+ * the smallest logged by this transaction and one key larger than the largest
+ * key logged by this transaction.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path,
+			  struct btrfs_log_ctx *ctx)
+{
+	u64 min_key;
+	u64 max_key;
+	int ret;
+	int key_type = BTRFS_DIR_ITEM_KEY;
+
+again:
+	min_key = 0;
+	max_key = 0;
+	while (1) {
+		ret = log_dir_items(trans, root, inode, path,
+				    dst_path, key_type, ctx, min_key,
+				    &max_key);
+		if (ret)
+			return ret;
+		if (max_key == (u64)-1)
+			break;
+		min_key = max_key + 1;
+	}
+
+	if (key_type == BTRFS_DIR_ITEM_KEY) {
+		key_type = BTRFS_DIR_INDEX_KEY;
+		goto again;
+	}
+	return 0;
+}
+
+/*
+ * a helper function to drop items from the log before we relog an
+ * inode.  max_key_type indicates the highest item type to remove.
+ * This cannot be run for file data extents because it does not
+ * free the extents they point to.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  u64 objectid, int max_key_type)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	int start_slot;
+
+	key.objectid = objectid;
+	key.type = max_key_type;
+	key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+		BUG_ON(ret == 0); /* Logic error */
+		if (ret < 0)
+			break;
+
+		if (path->slots[0] == 0)
+			break;
+
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+
+		if (found_key.objectid != objectid)
+			break;
+
+		found_key.offset = 0;
+		found_key.type = 0;
+		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
+				       &start_slot);
+
+		ret = btrfs_del_items(trans, log, path, start_slot,
+				      path->slots[0] - start_slot + 1);
+		/*
+		 * If start slot isn't 0 then we don't need to re-search, we've
+		 * found the last guy with the objectid in this tree.
+		 */
+		if (ret || start_slot != 0)
+			break;
+		btrfs_release_path(path);
+	}
+	btrfs_release_path(path);
+	if (ret > 0)
+		ret = 0;
+	return ret;
+}
+
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *leaf,
+			    struct btrfs_inode_item *item,
+			    struct inode *inode, int log_inode_only,
+			    u64 logged_isize)
+{
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	if (log_inode_only) {
+		/* set the generation to zero so the recover code
+		 * can tell the difference between an logging
+		 * just to say 'this inode exists' and a logging
+		 * to say 'update this inode with these values'
+		 */
+		btrfs_set_token_inode_generation(leaf, item, 0, &token);
+		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
+	} else {
+		btrfs_set_token_inode_generation(leaf, item,
+						 BTRFS_I(inode)->generation,
+						 &token);
+		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
+	}
+
+	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+	btrfs_set_token_timespec_sec(leaf, &item->atime,
+				     inode->i_atime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, &item->atime,
+				      inode->i_atime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, &item->mtime,
+				     inode->i_mtime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
+				      inode->i_mtime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, &item->ctime,
+				     inode->i_ctime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
+				      inode->i_ctime.tv_nsec, &token);
+
+	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+				     &token);
+
+	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+}
+
+static int log_inode_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *log, struct btrfs_path *path,
+			  struct inode *inode)
+{
+	struct btrfs_inode_item *inode_item;
+	int ret;
+
+	ret = btrfs_insert_empty_item(trans, log, path,
+				      &BTRFS_I(inode)->location,
+				      sizeof(*inode_item));
+	if (ret && ret != -EEXIST)
+		return ret;
+	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_inode_item);
+	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
+	btrfs_release_path(path);
+	return 0;
+}
+
+static noinline int copy_items(struct btrfs_trans_handle *trans,
+			       struct inode *inode,
+			       struct btrfs_path *dst_path,
+			       struct btrfs_path *src_path, u64 *last_extent,
+			       int start_slot, int nr, int inode_only,
+			       u64 logged_isize)
+{
+	unsigned long src_offset;
+	unsigned long dst_offset;
+	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *src = src_path->nodes[0];
+	struct btrfs_key first_key, last_key, key;
+	int ret;
+	struct btrfs_key *ins_keys;
+	u32 *ins_sizes;
+	char *ins_data;
+	int i;
+	struct list_head ordered_sums;
+	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+	bool has_extents = false;
+	bool need_find_last_extent = true;
+	bool done = false;
+
+	INIT_LIST_HEAD(&ordered_sums);
+
+	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
+			   nr * sizeof(u32), GFP_NOFS);
+	if (!ins_data)
+		return -ENOMEM;
+
+	first_key.objectid = (u64)-1;
+
+	ins_sizes = (u32 *)ins_data;
+	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+
+	for (i = 0; i < nr; i++) {
+		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+	}
+	ret = btrfs_insert_empty_items(trans, log, dst_path,
+				       ins_keys, ins_sizes, nr);
+	if (ret) {
+		kfree(ins_data);
+		return ret;
+	}
+
+	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
+		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+						   dst_path->slots[0]);
+
+		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+
+		if ((i == (nr - 1)))
+			last_key = ins_keys[i];
+
+		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+			inode_item = btrfs_item_ptr(dst_path->nodes[0],
+						    dst_path->slots[0],
+						    struct btrfs_inode_item);
+			fill_inode_item(trans, dst_path->nodes[0], inode_item,
+					inode, inode_only == LOG_INODE_EXISTS,
+					logged_isize);
+		} else {
+			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+					   src_offset, ins_sizes[i]);
+		}
+
+		/*
+		 * We set need_find_last_extent here in case we know we were
+		 * processing other items and then walk into the first extent in
+		 * the inode.  If we don't hit an extent then nothing changes,
+		 * we'll do the last search the next time around.
+		 */
+		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
+			has_extents = true;
+			if (first_key.objectid == (u64)-1)
+				first_key = ins_keys[i];
+		} else {
+			need_find_last_extent = false;
+		}
+
+		/* take a reference on file data extents so that truncates
+		 * or deletes of this inode don't have to relog the inode
+		 * again
+		 */
+		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
+		    !skip_csum) {
+			int found_type;
+			extent = btrfs_item_ptr(src, start_slot + i,
+						struct btrfs_file_extent_item);
+
+			if (btrfs_file_extent_generation(src, extent) < trans->transid)
+				continue;
+
+			found_type = btrfs_file_extent_type(src, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds, dl, cs, cl;
+				ds = btrfs_file_extent_disk_bytenr(src,
+								extent);
+				/* ds == 0 is a hole */
+				if (ds == 0)
+					continue;
+
+				dl = btrfs_file_extent_disk_num_bytes(src,
+								extent);
+				cs = btrfs_file_extent_offset(src, extent);
+				cl = btrfs_file_extent_num_bytes(src,
+								extent);
+				if (btrfs_file_extent_compression(src,
+								  extent)) {
+					cs = 0;
+					cl = dl;
+				}
+
+				ret = btrfs_lookup_csums_range(
+						log->fs_info->csum_root,
+						ds + cs, ds + cs + cl - 1,
+						&ordered_sums, 0);
+				if (ret) {
+					btrfs_release_path(dst_path);
+					kfree(ins_data);
+					return ret;
+				}
+			}
+		}
+	}
+
+	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+	btrfs_release_path(dst_path);
+	kfree(ins_data);
+
+	/*
+	 * we have to do this after the loop above to avoid changing the
+	 * log tree while trying to change the log tree.
+	 */
+	ret = 0;
+	while (!list_empty(&ordered_sums)) {
+		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+						   struct btrfs_ordered_sum,
+						   list);
+		if (!ret)
+			ret = btrfs_csum_file_blocks(trans, log, sums);
+		list_del(&sums->list);
+		kfree(sums);
+	}
+
+	if (!has_extents)
+		return ret;
+
+	if (need_find_last_extent && *last_extent == first_key.offset) {
+		/*
+		 * We don't have any leafs between our current one and the one
+		 * we processed before that can have file extent items for our
+		 * inode (and have a generation number smaller than our current
+		 * transaction id).
+		 */
+		need_find_last_extent = false;
+	}
+
+	/*
+	 * Because we use btrfs_search_forward we could skip leaves that were
+	 * not modified and then assume *last_extent is valid when it really
+	 * isn't.  So back up to the previous leaf and read the end of the last
+	 * extent before we go and fill in holes.
+	 */
+	if (need_find_last_extent) {
+		u64 len;
+
+		ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
+		if (ret < 0)
+			return ret;
+		if (ret)
+			goto fill_holes;
+		if (src_path->slots[0])
+			src_path->slots[0]--;
+		src = src_path->nodes[0];
+		btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
+		if (key.objectid != btrfs_ino(inode) ||
+		    key.type != BTRFS_EXTENT_DATA_KEY)
+			goto fill_holes;
+		extent = btrfs_item_ptr(src, src_path->slots[0],
+					struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(src, extent) ==
+		    BTRFS_FILE_EXTENT_INLINE) {
+			len = btrfs_file_extent_inline_len(src,
+							   src_path->slots[0],
+							   extent);
+			*last_extent = ALIGN(key.offset + len,
+					     log->sectorsize);
+		} else {
+			len = btrfs_file_extent_num_bytes(src, extent);
+			*last_extent = key.offset + len;
+		}
+	}
+fill_holes:
+	/* So we did prev_leaf, now we need to move to the next leaf, but a few
+	 * things could have happened
+	 *
+	 * 1) A merge could have happened, so we could currently be on a leaf
+	 * that holds what we were copying in the first place.
+	 * 2) A split could have happened, and now not all of the items we want
+	 * are on the same leaf.
+	 *
+	 * So we need to adjust how we search for holes, we need to drop the
+	 * path and re-search for the first extent key we found, and then walk
+	 * forward until we hit the last one we copied.
+	 */
+	if (need_find_last_extent) {
+		/* btrfs_prev_leaf could return 1 without releasing the path */
+		btrfs_release_path(src_path);
+		ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
+					src_path, 0, 0);
+		if (ret < 0)
+			return ret;
+		ASSERT(ret == 0);
+		src = src_path->nodes[0];
+		i = src_path->slots[0];
+	} else {
+		i = start_slot;
+	}
+
+	/*
+	 * Ok so here we need to go through and fill in any holes we may have
+	 * to make sure that holes are punched for those areas in case they had
+	 * extents previously.
+	 */
+	while (!done) {
+		u64 offset, len;
+		u64 extent_end;
+
+		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
+			ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
+			if (ret < 0)
+				return ret;
+			ASSERT(ret == 0);
+			src = src_path->nodes[0];
+			i = 0;
+		}
+
+		btrfs_item_key_to_cpu(src, &key, i);
+		if (!btrfs_comp_cpu_keys(&key, &last_key))
+			done = true;
+		if (key.objectid != btrfs_ino(inode) ||
+		    key.type != BTRFS_EXTENT_DATA_KEY) {
+			i++;
+			continue;
+		}
+		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(src, extent) ==
+		    BTRFS_FILE_EXTENT_INLINE) {
+			len = btrfs_file_extent_inline_len(src, i, extent);
+			extent_end = ALIGN(key.offset + len, log->sectorsize);
+		} else {
+			len = btrfs_file_extent_num_bytes(src, extent);
+			extent_end = key.offset + len;
+		}
+		i++;
+
+		if (*last_extent == key.offset) {
+			*last_extent = extent_end;
+			continue;
+		}
+		offset = *last_extent;
+		len = key.offset - *last_extent;
+		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
+					       offset, 0, 0, len, 0, len, 0,
+					       0, 0);
+		if (ret)
+			break;
+		*last_extent = extent_end;
+	}
+	/*
+	 * Need to let the callers know we dropped the path so they should
+	 * re-search.
+	 */
+	if (!ret && need_find_last_extent)
+		ret = 1;
+	return ret;
+}
+
+static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct extent_map *em1, *em2;
+
+	em1 = list_entry(a, struct extent_map, list);
+	em2 = list_entry(b, struct extent_map, list);
+
+	if (em1->start < em2->start)
+		return -1;
+	else if (em1->start > em2->start)
+		return 1;
+	return 0;
+}
+
+static int wait_ordered_extents(struct btrfs_trans_handle *trans,
+				struct inode *inode,
+				struct btrfs_root *root,
+				const struct extent_map *em,
+				const struct list_head *logged_list,
+				bool *ordered_io_error)
+{
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *log = root->log_root;
+	u64 mod_start = em->mod_start;
+	u64 mod_len = em->mod_len;
+	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+	u64 csum_offset;
+	u64 csum_len;
+	LIST_HEAD(ordered_sums);
+	int ret = 0;
+
+	*ordered_io_error = false;
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    em->block_start == EXTENT_MAP_HOLE)
+		return 0;
+
+	/*
+	 * Wait far any ordered extent that covers our extent map. If it
+	 * finishes without an error, first check and see if our csums are on
+	 * our outstanding ordered extents.
+	 */
+	list_for_each_entry(ordered, logged_list, log_list) {
+		struct btrfs_ordered_sum *sum;
+
+		if (!mod_len)
+			break;
+
+		if (ordered->file_offset + ordered->len <= mod_start ||
+		    mod_start + mod_len <= ordered->file_offset)
+			continue;
+
+		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+			const u64 start = ordered->file_offset;
+			const u64 end = ordered->file_offset + ordered->len - 1;
+
+			WARN_ON(ordered->inode != inode);
+			filemap_fdatawrite_range(inode->i_mapping, start, end);
+		}
+
+		wait_event(ordered->wait,
+			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
+			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
+
+		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+			/*
+			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
+			 * i_mapping flags, so that the next fsync won't get
+			 * an outdated io error too.
+			 */
+			btrfs_inode_check_errors(inode);
+			*ordered_io_error = true;
+			break;
+		}
+		/*
+		 * We are going to copy all the csums on this ordered extent, so
+		 * go ahead and adjust mod_start and mod_len in case this
+		 * ordered extent has already been logged.
+		 */
+		if (ordered->file_offset > mod_start) {
+			if (ordered->file_offset + ordered->len >=
+			    mod_start + mod_len)
+				mod_len = ordered->file_offset - mod_start;
+			/*
+			 * If we have this case
+			 *
+			 * |--------- logged extent ---------|
+			 *       |----- ordered extent ----|
+			 *
+			 * Just don't mess with mod_start and mod_len, we'll
+			 * just end up logging more csums than we need and it
+			 * will be ok.
+			 */
+		} else {
+			if (ordered->file_offset + ordered->len <
+			    mod_start + mod_len) {
+				mod_len = (mod_start + mod_len) -
+					(ordered->file_offset + ordered->len);
+				mod_start = ordered->file_offset +
+					ordered->len;
+			} else {
+				mod_len = 0;
+			}
+		}
+
+		if (skip_csum)
+			continue;
+
+		/*
+		 * To keep us from looping for the above case of an ordered
+		 * extent that falls inside of the logged extent.
+		 */
+		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
+				     &ordered->flags))
+			continue;
+
+		if (ordered->csum_bytes_left) {
+			btrfs_start_ordered_extent(inode, ordered, 0);
+			wait_event(ordered->wait,
+				   ordered->csum_bytes_left == 0);
+		}
+
+		list_for_each_entry(sum, &ordered->list, list) {
+			ret = btrfs_csum_file_blocks(trans, log, sum);
+			if (ret)
+				break;
+		}
+	}
+
+	if (*ordered_io_error || !mod_len || ret || skip_csum)
+		return ret;
+
+	if (em->compress_type) {
+		csum_offset = 0;
+		csum_len = max(em->block_len, em->orig_block_len);
+	} else {
+		csum_offset = mod_start - em->start;
+		csum_len = mod_len;
+	}
+
+	/* block start is already adjusted for the file extent offset. */
+	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
+				       em->block_start + csum_offset,
+				       em->block_start + csum_offset +
+				       csum_len - 1, &ordered_sums, 0);
+	if (ret)
+		return ret;
+
+	while (!list_empty(&ordered_sums)) {
+		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+						   struct btrfs_ordered_sum,
+						   list);
+		if (!ret)
+			ret = btrfs_csum_file_blocks(trans, log, sums);
+		list_del(&sums->list);
+		kfree(sums);
+	}
+
+	return ret;
+}
+
+static int log_one_extent(struct btrfs_trans_handle *trans,
+			  struct inode *inode, struct btrfs_root *root,
+			  const struct extent_map *em,
+			  struct btrfs_path *path,
+			  const struct list_head *logged_list,
+			  struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	struct btrfs_map_token token;
+	struct btrfs_key key;
+	u64 extent_offset = em->start - em->orig_start;
+	u64 block_len;
+	int ret;
+	int extent_inserted = 0;
+	bool ordered_io_err = false;
+
+	ret = wait_ordered_extents(trans, inode, root, em, logged_list,
+				   &ordered_io_err);
+	if (ret)
+		return ret;
+
+	if (ordered_io_err) {
+		ctx->io_err = -EIO;
+		return 0;
+	}
+
+	btrfs_init_map_token(&token);
+
+	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+				   em->start + em->len, NULL, 0, 1,
+				   sizeof(*fi), &extent_inserted);
+	if (ret)
+		return ret;
+
+	if (!extent_inserted) {
+		key.objectid = btrfs_ino(inode);
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = em->start;
+
+		ret = btrfs_insert_empty_item(trans, log, path, &key,
+					      sizeof(*fi));
+		if (ret)
+			return ret;
+	}
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+
+	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
+					       &token);
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		btrfs_set_token_file_extent_type(leaf, fi,
+						 BTRFS_FILE_EXTENT_PREALLOC,
+						 &token);
+	else
+		btrfs_set_token_file_extent_type(leaf, fi,
+						 BTRFS_FILE_EXTENT_REG,
+						 &token);
+
+	block_len = max(em->block_len, em->orig_block_len);
+	if (em->compress_type != BTRFS_COMPRESS_NONE) {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+							em->block_start,
+							&token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+							   &token);
+	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+							em->block_start -
+							extent_offset, &token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+							   &token);
+	} else {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+							   &token);
+	}
+
+	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
+	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+						&token);
+	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct inode *inode,
+				     struct btrfs_path *path,
+				     struct list_head *logged_list,
+				     struct btrfs_log_ctx *ctx)
+{
+	struct extent_map *em, *n;
+	struct list_head extents;
+	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+	u64 test_gen;
+	int ret = 0;
+	int num = 0;
+
+	INIT_LIST_HEAD(&extents);
+
+	write_lock(&tree->lock);
+	test_gen = root->fs_info->last_trans_committed;
+
+	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+		list_del_init(&em->list);
+
+		/*
+		 * Just an arbitrary number, this can be really CPU intensive
+		 * once we start getting a lot of extents, and really once we
+		 * have a bunch of extents we just want to commit since it will
+		 * be faster.
+		 */
+		if (++num > 32768) {
+			list_del_init(&tree->modified_extents);
+			ret = -EFBIG;
+			goto process;
+		}
+
+		if (em->generation <= test_gen)
+			continue;
+		/* Need a ref to keep it from getting evicted from cache */
+		atomic_inc(&em->refs);
+		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+		list_add_tail(&em->list, &extents);
+		num++;
+	}
+
+	list_sort(NULL, &extents, extent_cmp);
+
+process:
+	while (!list_empty(&extents)) {
+		em = list_entry(extents.next, struct extent_map, list);
+
+		list_del_init(&em->list);
+
+		/*
+		 * If we had an error we just need to delete everybody from our
+		 * private list.
+		 */
+		if (ret) {
+			clear_em_logging(tree, em);
+			free_extent_map(em);
+			continue;
+		}
+
+		write_unlock(&tree->lock);
+
+		ret = log_one_extent(trans, inode, root, em, path, logged_list,
+				     ctx);
+		write_lock(&tree->lock);
+		clear_em_logging(tree, em);
+		free_extent_map(em);
+	}
+	WARN_ON(!list_empty(&extents));
+	write_unlock(&tree->lock);
+
+	btrfs_release_path(path);
+	return ret;
+}
+
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+			     struct btrfs_path *path, u64 *size_ret)
+{
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+	if (ret < 0) {
+		return ret;
+	} else if (ret > 0) {
+		*size_ret = 0;
+	} else {
+		struct btrfs_inode_item *item;
+
+		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				      struct btrfs_inode_item);
+		*size_ret = btrfs_inode_size(path->nodes[0], item);
+	}
+
+	btrfs_release_path(path);
+	return 0;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree.  An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ */
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct inode *inode,
+			   int inode_only,
+			   const loff_t start,
+			   const loff_t end,
+			   struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_path *path;
+	struct btrfs_path *dst_path;
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	struct extent_buffer *src = NULL;
+	LIST_HEAD(logged_list);
+	u64 last_extent = 0;
+	int err = 0;
+	int ret;
+	int nritems;
+	int ins_start_slot = 0;
+	int ins_nr;
+	bool fast_search = false;
+	u64 ino = btrfs_ino(inode);
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 logged_isize = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	dst_path = btrfs_alloc_path();
+	if (!dst_path) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	min_key.objectid = ino;
+	min_key.type = BTRFS_INODE_ITEM_KEY;
+	min_key.offset = 0;
+
+	max_key.objectid = ino;
+
+
+	/* today the code can only do partial logging of directories */
+	if (S_ISDIR(inode->i_mode) ||
+	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+		       &BTRFS_I(inode)->runtime_flags) &&
+	     inode_only == LOG_INODE_EXISTS))
+		max_key.type = BTRFS_XATTR_ITEM_KEY;
+	else
+		max_key.type = (u8)-1;
+	max_key.offset = (u64)-1;
+
+	/*
+	 * Only run delayed items if we are a dir or a new file.
+	 * Otherwise commit the delayed inode only, which is needed in
+	 * order for the log replay code to mark inodes for link count
+	 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+	 */
+	if (S_ISDIR(inode->i_mode) ||
+	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
+		ret = btrfs_commit_inode_delayed_items(trans, inode);
+	else
+		ret = btrfs_commit_inode_delayed_inode(inode);
+
+	if (ret) {
+		btrfs_free_path(path);
+		btrfs_free_path(dst_path);
+		return ret;
+	}
+
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+	btrfs_get_logged_extents(inode, &logged_list, start, end);
+
+	/*
+	 * a brute force approach to making sure we get the most uptodate
+	 * copies of everything.
+	 */
+	if (S_ISDIR(inode->i_mode)) {
+		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+
+		if (inode_only == LOG_INODE_EXISTS)
+			max_key_type = BTRFS_XATTR_ITEM_KEY;
+		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+	} else {
+		if (inode_only == LOG_INODE_EXISTS) {
+			/*
+			 * Make sure the new inode item we write to the log has
+			 * the same isize as the current one (if it exists).
+			 * This is necessary to prevent data loss after log
+			 * replay, and also to prevent doing a wrong expanding
+			 * truncate - for e.g. create file, write 4K into offset
+			 * 0, fsync, write 4K into offset 4096, add hard link,
+			 * fsync some other file (to sync log), power fail - if
+			 * we use the inode's current i_size, after log replay
+			 * we get a 8Kb file, with the last 4Kb extent as a hole
+			 * (zeroes), as if an expanding truncate happened,
+			 * instead of getting a file of 4Kb only.
+			 */
+			err = logged_inode_size(log, inode, path,
+						&logged_isize);
+			if (err)
+				goto out_unlock;
+		}
+		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			     &BTRFS_I(inode)->runtime_flags)) {
+			if (inode_only == LOG_INODE_EXISTS) {
+				max_key.type = BTRFS_XATTR_ITEM_KEY;
+				ret = drop_objectid_items(trans, log, path, ino,
+							  max_key.type);
+			} else {
+				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+					  &BTRFS_I(inode)->runtime_flags);
+				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+					  &BTRFS_I(inode)->runtime_flags);
+				while(1) {
+					ret = btrfs_truncate_inode_items(trans,
+							 log, inode, 0, 0);
+					if (ret != -EAGAIN)
+						break;
+				}
+			}
+		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+					      &BTRFS_I(inode)->runtime_flags) ||
+			   inode_only == LOG_INODE_EXISTS) {
+			if (inode_only == LOG_INODE_ALL)
+				fast_search = true;
+			max_key.type = BTRFS_XATTR_ITEM_KEY;
+			ret = drop_objectid_items(trans, log, path, ino,
+						  max_key.type);
+		} else {
+			if (inode_only == LOG_INODE_ALL)
+				fast_search = true;
+			ret = log_inode_item(trans, log, dst_path, inode);
+			if (ret) {
+				err = ret;
+				goto out_unlock;
+			}
+			goto log_extents;
+		}
+
+	}
+	if (ret) {
+		err = ret;
+		goto out_unlock;
+	}
+
+	while (1) {
+		ins_nr = 0;
+		ret = btrfs_search_forward(root, &min_key,
+					   path, trans->transid);
+		if (ret != 0)
+			break;
+again:
+		/* note, ins_nr might be > 0 here, cleanup outside the loop */
+		if (min_key.objectid != ino)
+			break;
+		if (min_key.type > max_key.type)
+			break;
+
+		src = path->nodes[0];
+		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+			ins_nr++;
+			goto next_slot;
+		} else if (!ins_nr) {
+			ins_start_slot = path->slots[0];
+			ins_nr = 1;
+			goto next_slot;
+		}
+
+		ret = copy_items(trans, inode, dst_path, path, &last_extent,
+				 ins_start_slot, ins_nr, inode_only,
+				 logged_isize);
+		if (ret < 0) {
+			err = ret;
+			goto out_unlock;
+		}
+		if (ret) {
+			ins_nr = 0;
+			btrfs_release_path(path);
+			continue;
+		}
+		ins_nr = 1;
+		ins_start_slot = path->slots[0];
+next_slot:
+
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		path->slots[0]++;
+		if (path->slots[0] < nritems) {
+			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
+					      path->slots[0]);
+			goto again;
+		}
+		if (ins_nr) {
+			ret = copy_items(trans, inode, dst_path, path,
+					 &last_extent, ins_start_slot,
+					 ins_nr, inode_only, logged_isize);
+			if (ret < 0) {
+				err = ret;
+				goto out_unlock;
+			}
+			ret = 0;
+			ins_nr = 0;
+		}
+		btrfs_release_path(path);
+
+		if (min_key.offset < (u64)-1) {
+			min_key.offset++;
+		} else if (min_key.type < max_key.type) {
+			min_key.type++;
+			min_key.offset = 0;
+		} else {
+			break;
+		}
+	}
+	if (ins_nr) {
+		ret = copy_items(trans, inode, dst_path, path, &last_extent,
+				 ins_start_slot, ins_nr, inode_only,
+				 logged_isize);
+		if (ret < 0) {
+			err = ret;
+			goto out_unlock;
+		}
+		ret = 0;
+		ins_nr = 0;
+	}
+
+log_extents:
+	btrfs_release_path(path);
+	btrfs_release_path(dst_path);
+	if (fast_search) {
+		/*
+		 * Some ordered extents started by fsync might have completed
+		 * before we collected the ordered extents in logged_list, which
+		 * means they're gone, not in our logged_list nor in the inode's
+		 * ordered tree. We want the application/user space to know an
+		 * error happened while attempting to persist file data so that
+		 * it can take proper action. If such error happened, we leave
+		 * without writing to the log tree and the fsync must report the
+		 * file data write error and not commit the current transaction.
+		 */
+		err = btrfs_inode_check_errors(inode);
+		if (err) {
+			ctx->io_err = err;
+			goto out_unlock;
+		}
+		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+						&logged_list, ctx);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	} else if (inode_only == LOG_INODE_ALL) {
+		struct extent_map *em, *n;
+
+		write_lock(&em_tree->lock);
+		/*
+		 * We can't just remove every em if we're called for a ranged
+		 * fsync - that is, one that doesn't cover the whole possible
+		 * file range (0 to LLONG_MAX). This is because we can have
+		 * em's that fall outside the range we're logging and therefore
+		 * their ordered operations haven't completed yet
+		 * (btrfs_finish_ordered_io() not invoked yet). This means we
+		 * didn't get their respective file extent item in the fs/subvol
+		 * tree yet, and need to let the next fast fsync (one which
+		 * consults the list of modified extent maps) find the em so
+		 * that it logs a matching file extent item and waits for the
+		 * respective ordered operation to complete (if it's still
+		 * running).
+		 *
+		 * Removing every em outside the range we're logging would make
+		 * the next fast fsync not log their matching file extent items,
+		 * therefore making us lose data after a log replay.
+		 */
+		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+					 list) {
+			const u64 mod_end = em->mod_start + em->mod_len - 1;
+
+			if (em->mod_start >= start && mod_end <= end)
+				list_del_init(&em->list);
+		}
+		write_unlock(&em_tree->lock);
+	}
+
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+		ret = log_directory_changes(trans, root, inode, path, dst_path,
+					    ctx);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	}
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	BTRFS_I(inode)->logged_trans = trans->transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+	spin_unlock(&BTRFS_I(inode)->lock);
+out_unlock:
+	if (unlikely(err))
+		btrfs_put_logged_extents(&logged_list);
+	else
+		btrfs_submit_logged_extents(&logged_list, log);
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+	btrfs_free_path(path);
+	btrfs_free_path(dst_path);
+	return err;
+}
+
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged.  Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+					       struct inode *inode,
+					       struct dentry *parent,
+					       struct super_block *sb,
+					       u64 last_committed)
+{
+	int ret = 0;
+	struct btrfs_root *root;
+	struct dentry *old_parent = NULL;
+	struct inode *orig_inode = inode;
+
+	/*
+	 * for regular files, if its inode is already on disk, we don't
+	 * have to worry about the parents at all.  This is because
+	 * we can use the last_unlink_trans field to record renames
+	 * and other fun in this file.
+	 */
+	if (S_ISREG(inode->i_mode) &&
+	    BTRFS_I(inode)->generation <= last_committed &&
+	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
+			goto out;
+
+	if (!S_ISDIR(inode->i_mode)) {
+		if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+			goto out;
+		inode = d_inode(parent);
+	}
+
+	while (1) {
+		/*
+		 * If we are logging a directory then we start with our inode,
+		 * not our parents inode, so we need to skipp setting the
+		 * logged_trans so that further down in the log code we don't
+		 * think this inode has already been logged.
+		 */
+		if (inode != orig_inode)
+			BTRFS_I(inode)->logged_trans = trans->transid;
+		smp_mb();
+
+		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+			root = BTRFS_I(inode)->root;
+
+			/*
+			 * make sure any commits to the log are forced
+			 * to be full commits
+			 */
+			btrfs_set_log_full_commit(root->fs_info, trans);
+			ret = 1;
+			break;
+		}
+
+		if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+			break;
+
+		if (IS_ROOT(parent))
+			break;
+
+		parent = dget_parent(parent);
+		dput(old_parent);
+		old_parent = parent;
+		inode = d_inode(parent);
+
+	}
+	dput(old_parent);
+out:
+	return ret;
+}
+
+struct btrfs_dir_list {
+	u64 ino;
+	struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about the why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ *        CPU0                                        CPU1
+ *        ----                                        ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ *                                            lock(sb_internal#2);
+ *                                            lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ *    that while logging the inode new references (names) are added or removed
+ *    from the inode, leaving the logged inode item with a link count that does
+ *    not match the number of logged inode reference items. This is fine because
+ *    at log replay time we compute the real number of links and correct the
+ *    link count in the inode item (see replay_one_buffer() and
+ *    link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ *    has a size that doesn't match the sum of the lengths of all the logged
+ *    names. This does not result in a problem because if a dir_item key is
+ *    logged but its matching dir_index key is not logged, at log replay time we
+ *    don't use it to replay the respective name (see replay_one_name()). On the
+ *    other hand if only the dir_index key ends up being logged, the respective
+ *    name is added to the fs/subvol tree with both the dir_item and dir_index
+ *    keys created (see replay_one_name()).
+ *    The directory's inode item with a wrong i_size is not a problem as well,
+ *    since we don't use it at log replay time to set the i_size in the inode
+ *    item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *start_inode,
+				struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_path *path;
+	LIST_HEAD(dir_list);
+	struct btrfs_dir_list *dir_elem;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+	if (!dir_elem) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+	dir_elem->ino = btrfs_ino(start_inode);
+	list_add_tail(&dir_elem->list, &dir_list);
+
+	while (!list_empty(&dir_list)) {
+		struct extent_buffer *leaf;
+		struct btrfs_key min_key;
+		int nritems;
+		int i;
+
+		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+					    list);
+		if (ret)
+			goto next_dir_inode;
+
+		min_key.objectid = dir_elem->ino;
+		min_key.type = BTRFS_DIR_ITEM_KEY;
+		min_key.offset = 0;
+again:
+		btrfs_release_path(path);
+		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+		if (ret < 0) {
+			goto next_dir_inode;
+		} else if (ret > 0) {
+			ret = 0;
+			goto next_dir_inode;
+		}
+
+process_leaf:
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		for (i = path->slots[0]; i < nritems; i++) {
+			struct btrfs_dir_item *di;
+			struct btrfs_key di_key;
+			struct inode *di_inode;
+			struct btrfs_dir_list *new_dir_elem;
+			int log_mode = LOG_INODE_EXISTS;
+			int type;
+
+			btrfs_item_key_to_cpu(leaf, &min_key, i);
+			if (min_key.objectid != dir_elem->ino ||
+			    min_key.type != BTRFS_DIR_ITEM_KEY)
+				goto next_dir_inode;
+
+			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+			type = btrfs_dir_type(leaf, di);
+			if (btrfs_dir_transid(leaf, di) < trans->transid &&
+			    type != BTRFS_FT_DIR)
+				continue;
+			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+				continue;
+
+			di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+					      root, NULL);
+			if (IS_ERR(di_inode)) {
+				ret = PTR_ERR(di_inode);
+				goto next_dir_inode;
+			}
+
+			if (btrfs_inode_in_log(di_inode, trans->transid)) {
+				iput(di_inode);
+				continue;
+			}
+
+			ctx->log_new_dentries = false;
+			if (type == BTRFS_FT_DIR)
+				log_mode = LOG_INODE_ALL;
+			btrfs_release_path(path);
+			ret = btrfs_log_inode(trans, root, di_inode,
+					      log_mode, 0, LLONG_MAX, ctx);
+			iput(di_inode);
+			if (ret)
+				goto next_dir_inode;
+			if (ctx->log_new_dentries) {
+				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+						       GFP_NOFS);
+				if (!new_dir_elem) {
+					ret = -ENOMEM;
+					goto next_dir_inode;
+				}
+				new_dir_elem->ino = di_key.objectid;
+				list_add_tail(&new_dir_elem->list, &dir_list);
+			}
+			break;
+		}
+		if (i == nritems) {
+			ret = btrfs_next_leaf(log, path);
+			if (ret < 0) {
+				goto next_dir_inode;
+			} else if (ret > 0) {
+				ret = 0;
+				goto next_dir_inode;
+			}
+			goto process_leaf;
+		}
+		if (min_key.offset < (u64)-1) {
+			min_key.offset++;
+			goto again;
+		}
+next_dir_inode:
+		list_del(&dir_elem->list);
+		kfree(dir_elem);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log.  A minimal inode and backref
+ * only logging is done of any parent directories that are older than
+ * the last committed transaction
+ */
+static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+			    	  struct btrfs_root *root, struct inode *inode,
+				  struct dentry *parent,
+				  const loff_t start,
+				  const loff_t end,
+				  int exists_only,
+				  struct btrfs_log_ctx *ctx)
+{
+	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
+	struct super_block *sb;
+	struct dentry *old_parent = NULL;
+	int ret = 0;
+	u64 last_committed = root->fs_info->last_trans_committed;
+	const struct dentry * const first_parent = parent;
+	const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
+				 last_committed);
+	bool log_dentries = false;
+	struct inode *orig_inode = inode;
+
+	sb = inode->i_sb;
+
+	if (btrfs_test_opt(root, NOTREELOG)) {
+		ret = 1;
+		goto end_no_trans;
+	}
+
+	/*
+	 * The prev transaction commit doesn't complete, we need do
+	 * full commit by ourselves.
+	 */
+	if (root->fs_info->last_trans_log_full_commit >
+	    root->fs_info->last_trans_committed) {
+		ret = 1;
+		goto end_no_trans;
+	}
+
+	if (root != BTRFS_I(inode)->root ||
+	    btrfs_root_refs(&root->root_item) == 0) {
+		ret = 1;
+		goto end_no_trans;
+	}
+
+	ret = check_parent_dirs_for_sync(trans, inode, parent,
+					 sb, last_committed);
+	if (ret)
+		goto end_no_trans;
+
+	if (btrfs_inode_in_log(inode, trans->transid)) {
+		ret = BTRFS_NO_LOG_SYNC;
+		goto end_no_trans;
+	}
+
+	ret = start_log_trans(trans, root, ctx);
+	if (ret)
+		goto end_no_trans;
+
+	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+	if (ret)
+		goto end_trans;
+
+	/*
+	 * for regular files, if its inode is already on disk, we don't
+	 * have to worry about the parents at all.  This is because
+	 * we can use the last_unlink_trans field to record renames
+	 * and other fun in this file.
+	 */
+	if (S_ISREG(inode->i_mode) &&
+	    BTRFS_I(inode)->generation <= last_committed &&
+	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
+		ret = 0;
+		goto end_trans;
+	}
+
+	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+		log_dentries = true;
+
+	while (1) {
+		if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+			break;
+
+		inode = d_inode(parent);
+		if (root != BTRFS_I(inode)->root)
+			break;
+
+		/*
+		 * On unlink we must make sure our immediate parent directory
+		 * inode is fully logged. This is to prevent leaving dangling
+		 * directory index entries and a wrong directory inode's i_size.
+		 * Not doing so can result in a directory being impossible to
+		 * delete after log replay (rmdir will always fail with error
+		 * -ENOTEMPTY).
+		 */
+		if (did_unlink && parent == first_parent)
+			inode_only = LOG_INODE_ALL;
+		else
+			inode_only = LOG_INODE_EXISTS;
+
+		if (BTRFS_I(inode)->generation >
+		    root->fs_info->last_trans_committed ||
+		    inode_only == LOG_INODE_ALL) {
+			ret = btrfs_log_inode(trans, root, inode, inode_only,
+					      0, LLONG_MAX, ctx);
+			if (ret)
+				goto end_trans;
+		}
+		if (IS_ROOT(parent))
+			break;
+
+		parent = dget_parent(parent);
+		dput(old_parent);
+		old_parent = parent;
+	}
+	if (log_dentries)
+		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+	else
+		ret = 0;
+end_trans:
+	dput(old_parent);
+	if (ret < 0) {
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		ret = 1;
+	}
+
+	if (ret)
+		btrfs_remove_log_ctx(root, ctx);
+	btrfs_end_log_trans(root);
+end_no_trans:
+	return ret;
+}
+
+/*
+ * it is not safe to log dentry if the chunk root has added new
+ * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
+ * If this returns 1, you must commit the transaction to safely get your
+ * data on disk.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
+			  struct btrfs_log_ctx *ctx)
+{
+	struct dentry *parent = dget_parent(dentry);
+	int ret;
+
+	ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
+				     start, end, 0, ctx);
+	dput(parent);
+
+	return ret;
+}
+
+/*
+ * should be called during mount to recover any replay any log trees
+ * from the FS
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key tmp_key;
+	struct btrfs_root *log;
+	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+	struct walk_control wc = {
+		.process_func = process_one_buffer,
+		.stage = 0,
+	};
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	fs_info->log_root_recovering = 1;
+
+	trans = btrfs_start_transaction(fs_info->tree_root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto error;
+	}
+
+	wc.trans = trans;
+	wc.pin = 1;
+
+	ret = walk_log_tree(trans, log_root_tree, &wc);
+	if (ret) {
+		btrfs_error(fs_info, ret, "Failed to pin buffers while "
+			    "recovering log root tree.");
+		goto error;
+	}
+
+again:
+	key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+
+		if (ret < 0) {
+			btrfs_error(fs_info, ret,
+				    "Couldn't find tree log root.");
+			goto error;
+		}
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		btrfs_release_path(path);
+		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+			break;
+
+		log = btrfs_read_fs_root(log_root_tree, &found_key);
+		if (IS_ERR(log)) {
+			ret = PTR_ERR(log);
+			btrfs_error(fs_info, ret,
+				    "Couldn't read tree log root.");
+			goto error;
+		}
+
+		tmp_key.objectid = found_key.offset;
+		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
+		tmp_key.offset = (u64)-1;
+
+		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+		if (IS_ERR(wc.replay_dest)) {
+			ret = PTR_ERR(wc.replay_dest);
+			free_extent_buffer(log->node);
+			free_extent_buffer(log->commit_root);
+			kfree(log);
+			btrfs_error(fs_info, ret, "Couldn't read target root "
+				    "for tree log recovery.");
+			goto error;
+		}
+
+		wc.replay_dest->log_root = log;
+		btrfs_record_root_in_trans(trans, wc.replay_dest);
+		ret = walk_log_tree(trans, log, &wc);
+
+		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+			ret = fixup_inode_link_counts(trans, wc.replay_dest,
+						      path);
+		}
+
+		key.offset = found_key.offset - 1;
+		wc.replay_dest->log_root = NULL;
+		free_extent_buffer(log->node);
+		free_extent_buffer(log->commit_root);
+		kfree(log);
+
+		if (ret)
+			goto error;
+
+		if (found_key.offset == 0)
+			break;
+	}
+	btrfs_release_path(path);
+
+	/* step one is to pin it all, step two is to replay just inodes */
+	if (wc.pin) {
+		wc.pin = 0;
+		wc.process_func = replay_one_buffer;
+		wc.stage = LOG_WALK_REPLAY_INODES;
+		goto again;
+	}
+	/* step three is to replay everything */
+	if (wc.stage < LOG_WALK_REPLAY_ALL) {
+		wc.stage++;
+		goto again;
+	}
+
+	btrfs_free_path(path);
+
+	/* step 4: commit the transaction, which also unpins the blocks */
+	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
+	if (ret)
+		return ret;
+
+	free_extent_buffer(log_root_tree->node);
+	log_root_tree->log_root = NULL;
+	fs_info->log_root_recovering = 0;
+	kfree(log_root_tree);
+
+	return 0;
+error:
+	if (wc.trans)
+		btrfs_end_transaction(wc.trans, fs_info->tree_root);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+			     struct inode *dir, struct inode *inode,
+			     int for_rename)
+{
+	/*
+	 * when we're logging a file, if it hasn't been renamed
+	 * or unlinked, and its inode is fully committed on disk,
+	 * we don't have to worry about walking up the directory chain
+	 * to log its parents.
+	 *
+	 * So, we use the last_unlink_trans field to put this transid
+	 * into the file.  When the file is logged we check it and
+	 * don't log the parents if the file is fully on disk.
+	 */
+	if (S_ISREG(inode->i_mode))
+		BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+	/*
+	 * if this directory was already logged any new
+	 * names for this file/dir will get recorded
+	 */
+	smp_mb();
+	if (BTRFS_I(dir)->logged_trans == trans->transid)
+		return;
+
+	/*
+	 * if the inode we're about to unlink was logged,
+	 * the log will be properly updated for any new names
+	 */
+	if (BTRFS_I(inode)->logged_trans == trans->transid)
+		return;
+
+	/*
+	 * when renaming files across directories, if the directory
+	 * there we're unlinking from gets fsync'd later on, there's
+	 * no way to find the destination directory later and fsync it
+	 * properly.  So, we have to be conservative and force commits
+	 * so the new name gets discovered.
+	 */
+	if (for_rename)
+		goto record;
+
+	/* we can safely do the unlink without any special recording */
+	return;
+
+record:
+	BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+			struct inode *inode, struct inode *old_dir,
+			struct dentry *parent)
+{
+	struct btrfs_root * root = BTRFS_I(inode)->root;
+
+	/*
+	 * this will force the logging code to walk the dentry chain
+	 * up for the file
+	 */
+	if (S_ISREG(inode->i_mode))
+		BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+	/*
+	 * if this inode hasn't been logged and directory we're renaming it
+	 * from hasn't been logged, we don't need to log it
+	 */
+	if (BTRFS_I(inode)->logged_trans <=
+	    root->fs_info->last_trans_committed &&
+	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+		    root->fs_info->last_trans_committed))
+		return 0;
+
+	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+				      LLONG_MAX, 1, NULL);
+}
+
diff --git a/kernel/fs/btrfs/tree-log.h b/kernel/fs/btrfs/tree-log.h
new file mode 100644
index 000000000..6916a781e
--- /dev/null
+++ b/kernel/fs/btrfs/tree-log.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __TREE_LOG_
+#define __TREE_LOG_
+
+#include "ctree.h"
+#include "transaction.h"
+
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
+struct btrfs_log_ctx {
+	int log_ret;
+	int log_transid;
+	int io_err;
+	bool log_new_dentries;
+	struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+	ctx->log_ret = 0;
+	ctx->log_transid = 0;
+	ctx->io_err = 0;
+	ctx->log_new_dentries = false;
+	INIT_LIST_HEAD(&ctx->list);
+}
+
+static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
+					     struct btrfs_trans_handle *trans)
+{
+	ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid;
+}
+
+static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
+					     struct btrfs_trans_handle *trans)
+{
+	return ACCESS_ONCE(fs_info->last_trans_log_full_commit) ==
+		trans->transid;
+}
+
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root, struct btrfs_log_ctx *ctx);
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info);
+int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
+			  struct btrfs_log_ctx *ctx);
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 const char *name, int name_len,
+				 struct inode *dir, u64 index);
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       const char *name, int name_len,
+			       struct inode *inode, u64 dirid);
+void btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+			     struct inode *dir, struct inode *inode,
+			     int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+			struct inode *inode, struct inode *old_dir,
+			struct dentry *parent);
+#endif
diff --git a/kernel/fs/btrfs/ulist.c b/kernel/fs/btrfs/ulist.c
new file mode 100644
index 000000000..840a38b27
--- /dev/null
+++ b/kernel/fs/btrfs/ulist.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ */
+
+#include <linux/slab.h>
+#include "ulist.h"
+#include "ctree.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * ULIST_ITER_INIT(&uiter);
+ *
+ * while ((elem = ulist_next(ulist, &uiter)) {
+ * 	for (all child nodes n in elem)
+ *		ulist_add(ulist, n);
+ *	do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are adressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration which could be done elegantly
+ * recursively, but is not possible due to kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist:	the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist, use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+	INIT_LIST_HEAD(&ulist->nodes);
+	ulist->root = RB_ROOT;
+	ulist->nnodes = 0;
+}
+
+/**
+ * ulist_fini - free up additionally allocated memory for the ulist
+ * @ulist:	the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+static void ulist_fini(struct ulist *ulist)
+{
+	struct ulist_node *node;
+	struct ulist_node *next;
+
+	list_for_each_entry_safe(node, next, &ulist->nodes, list) {
+		kfree(node);
+	}
+	ulist->root = RB_ROOT;
+	INIT_LIST_HEAD(&ulist->nodes);
+}
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist:	ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+	ulist_fini(ulist);
+	ulist_init(ulist);
+}
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask:	allocation flags to for base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(gfp_t gfp_mask)
+{
+	struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+	if (!ulist)
+		return NULL;
+
+	ulist_init(ulist);
+
+	return ulist;
+}
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist:	ulist to free
+ *
+ * It is not necessary to call ulist_fini before.
+ */
+void ulist_free(struct ulist *ulist)
+{
+	if (!ulist)
+		return;
+	ulist_fini(ulist);
+	kfree(ulist);
+}
+
+static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
+{
+	struct rb_node *n = ulist->root.rb_node;
+	struct ulist_node *u = NULL;
+
+	while (n) {
+		u = rb_entry(n, struct ulist_node, rb_node);
+		if (u->val < val)
+			n = n->rb_right;
+		else if (u->val > val)
+			n = n->rb_left;
+		else
+			return u;
+	}
+	return NULL;
+}
+
+static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
+{
+	struct rb_node **p = &ulist->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct ulist_node *cur = NULL;
+
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(parent, struct ulist_node, rb_node);
+
+		if (cur->val < ins->val)
+			p = &(*p)->rb_right;
+		else if (cur->val > ins->val)
+			p = &(*p)->rb_left;
+		else
+			return -EEXIST;
+	}
+	rb_link_node(&ins->rb_node, parent, p);
+	rb_insert_color(&ins->rb_node, &ulist->root);
+	return 0;
+}
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist:	ulist to add the element to
+ * @val:	value to add to ulist
+ * @aux:	auxiliary value to store along with val
+ * @gfp_mask:	flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks write
+ *       locking is needed
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
+{
+	return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+		    u64 *old_aux, gfp_t gfp_mask)
+{
+	int ret;
+	struct ulist_node *node;
+
+	node = ulist_rbtree_search(ulist, val);
+	if (node) {
+		if (old_aux)
+			*old_aux = node->aux;
+		return 0;
+	}
+	node = kmalloc(sizeof(*node), gfp_mask);
+	if (!node)
+		return -ENOMEM;
+
+	node->val = val;
+	node->aux = aux;
+#ifdef CONFIG_BTRFS_DEBUG
+	node->seqnum = ulist->nnodes;
+#endif
+
+	ret = ulist_rbtree_insert(ulist, node);
+	ASSERT(!ret);
+	list_add_tail(&node->list, &ulist->nodes);
+	ulist->nnodes++;
+
+	return 1;
+}
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist:	ulist to iterate
+ * @uiter:	iterator variable, initialized with ULIST_ITER_INIT(&iterator)
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks only read
+ *       locking is needed
+ *
+ * This function is used to iterate an ulist.
+ * It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned. They might neither be returned in order of
+ * addition nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
+{
+	struct ulist_node *node;
+
+	if (list_empty(&ulist->nodes))
+		return NULL;
+	if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes)
+		return NULL;
+	if (uiter->cur_list) {
+		uiter->cur_list = uiter->cur_list->next;
+	} else {
+		uiter->cur_list = ulist->nodes.next;
+#ifdef CONFIG_BTRFS_DEBUG
+		uiter->i = 0;
+#endif
+	}
+	node = list_entry(uiter->cur_list, struct ulist_node, list);
+#ifdef CONFIG_BTRFS_DEBUG
+	ASSERT(node->seqnum == uiter->i);
+	ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
+	uiter->i++;
+#endif
+	return node;
+}
diff --git a/kernel/fs/btrfs/ulist.h b/kernel/fs/btrfs/ulist.h
new file mode 100644
index 000000000..4c29db604
--- /dev/null
+++ b/kernel/fs/btrfs/ulist.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ *
+ */
+
+#ifndef __ULIST__
+#define __ULIST__
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ */
+struct ulist_iterator {
+#ifdef CONFIG_BTRFS_DEBUG
+	int i;
+#endif
+	struct list_head *cur_list;  /* hint to start search */
+};
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+	u64 val;		/* value to store */
+	u64 aux;		/* auxiliary value saved along with the val */
+
+#ifdef CONFIG_BTRFS_DEBUG
+	int seqnum;		/* sequence number this node is added */
+#endif
+
+	struct list_head list;  /* used to link node */
+	struct rb_node rb_node;	/* used to speed up search */
+};
+
+struct ulist {
+	/*
+	 * number of elements stored in list
+	 */
+	unsigned long nnodes;
+
+	struct list_head nodes;
+	struct rb_root root;
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+		    u64 *old_aux, gfp_t gfp_mask);
+
+/* just like ulist_add_merge() but take a pointer for the aux data */
+static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
+				      void **old_aux, gfp_t gfp_mask)
+{
+#if BITS_PER_LONG == 32
+	u64 old64 = (uintptr_t)*old_aux;
+	int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
+	*old_aux = (void *)((uintptr_t)old64);
+	return ret;
+#else
+	return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
+#endif
+}
+
+struct ulist_node *ulist_next(struct ulist *ulist,
+			      struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
+
+#endif
diff --git a/kernel/fs/btrfs/uuid-tree.c b/kernel/fs/btrfs/uuid-tree.c
new file mode 100644
index 000000000..778282944
--- /dev/null
+++ b/kernel/fs/btrfs/uuid-tree.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) STRATO AG 2013.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/uuid.h>
+#include <asm/unaligned.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+
+static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
+{
+	key->type = type;
+	key->objectid = get_unaligned_le64(uuid);
+	key->offset = get_unaligned_le64(uuid + sizeof(u64));
+}
+
+/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */
+static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+				  u8 type, u64 subid)
+{
+	int ret;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *eb;
+	int slot;
+	u32 item_size;
+	unsigned long offset;
+	struct btrfs_key key;
+
+	if (WARN_ON_ONCE(!uuid_root)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	btrfs_uuid_to_key(uuid, type, &key);
+	ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	offset = btrfs_item_ptr_offset(eb, slot);
+	ret = -ENOENT;
+
+	if (!IS_ALIGNED(item_size, sizeof(u64))) {
+		btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
+			(unsigned long)item_size);
+		goto out;
+	}
+	while (item_size) {
+		__le64 data;
+
+		read_extent_buffer(eb, &data, offset, sizeof(data));
+		if (le64_to_cpu(data) == subid) {
+			ret = 0;
+			break;
+		}
+		offset += sizeof(data);
+		item_size -= sizeof(data);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid_cpu)
+{
+	int ret;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	int slot;
+	unsigned long offset;
+	__le64 subid_le;
+
+	ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu);
+	if (ret != -ENOENT)
+		return ret;
+
+	if (WARN_ON_ONCE(!uuid_root)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_uuid_to_key(uuid, type, &key);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
+				      sizeof(subid_le));
+	if (ret >= 0) {
+		/* Add an item for the type for the first time */
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		offset = btrfs_item_ptr_offset(eb, slot);
+	} else if (ret == -EEXIST) {
+		/*
+		 * An item with that type already exists.
+		 * Extend the item and store the new subid at the end.
+		 */
+		btrfs_extend_item(uuid_root, path, sizeof(subid_le));
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		offset = btrfs_item_ptr_offset(eb, slot);
+		offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
+	} else if (ret < 0) {
+		btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d "
+			"(0x%016llx, 0x%016llx) type %u!",
+			ret, (unsigned long long)key.objectid,
+			(unsigned long long)key.offset, type);
+		goto out;
+	}
+
+	ret = 0;
+	subid_le = cpu_to_le64(subid_cpu);
+	write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
+	btrfs_mark_buffer_dirty(eb);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid)
+{
+	int ret;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	int slot;
+	unsigned long offset;
+	u32 item_size;
+	unsigned long move_dst;
+	unsigned long move_src;
+	unsigned long move_len;
+
+	if (WARN_ON_ONCE(!uuid_root)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_uuid_to_key(uuid, type, &key);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
+	if (ret < 0) {
+		btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!",
+			ret);
+		goto out;
+	}
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	offset = btrfs_item_ptr_offset(eb, slot);
+	item_size = btrfs_item_size_nr(eb, slot);
+	if (!IS_ALIGNED(item_size, sizeof(u64))) {
+		btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
+			(unsigned long)item_size);
+		ret = -ENOENT;
+		goto out;
+	}
+	while (item_size) {
+		__le64 read_subid;
+
+		read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid));
+		if (le64_to_cpu(read_subid) == subid)
+			break;
+		offset += sizeof(read_subid);
+		item_size -= sizeof(read_subid);
+	}
+
+	if (!item_size) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	item_size = btrfs_item_size_nr(eb, slot);
+	if (item_size == sizeof(subid)) {
+		ret = btrfs_del_item(trans, uuid_root, path);
+		goto out;
+	}
+
+	move_dst = offset;
+	move_src = offset + sizeof(subid);
+	move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
+	memmove_extent_buffer(eb, move_dst, move_src, move_len);
+	btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			       u64 subid)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	/* 1 - for the uuid item */
+	trans = btrfs_start_transaction(uuid_root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid);
+	btrfs_end_transaction(trans, uuid_root);
+
+out:
+	return ret;
+}
+
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+			    int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+					      u64))
+{
+	struct btrfs_root *root = fs_info->uuid_root;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	int ret = 0;
+	struct extent_buffer *leaf;
+	int slot;
+	u32 item_size;
+	unsigned long offset;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = 0;
+	key.type = 0;
+	key.offset = 0;
+
+again_search_slot:
+	ret = btrfs_search_forward(root, &key, path, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = 0;
+		goto out;
+	}
+
+	while (1) {
+		cond_resched();
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.type != BTRFS_UUID_KEY_SUBVOL &&
+		    key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+			goto skip;
+
+		offset = btrfs_item_ptr_offset(leaf, slot);
+		item_size = btrfs_item_size_nr(leaf, slot);
+		if (!IS_ALIGNED(item_size, sizeof(u64))) {
+			btrfs_warn(fs_info, "uuid item with illegal size %lu!",
+				(unsigned long)item_size);
+			goto skip;
+		}
+		while (item_size) {
+			u8 uuid[BTRFS_UUID_SIZE];
+			__le64 subid_le;
+			u64 subid_cpu;
+
+			put_unaligned_le64(key.objectid, uuid);
+			put_unaligned_le64(key.offset, uuid + sizeof(u64));
+			read_extent_buffer(leaf, &subid_le, offset,
+					   sizeof(subid_le));
+			subid_cpu = le64_to_cpu(subid_le);
+			ret = check_func(fs_info, uuid, key.type, subid_cpu);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				btrfs_release_path(path);
+				ret = btrfs_uuid_iter_rem(root, uuid, key.type,
+							  subid_cpu);
+				if (ret == 0) {
+					/*
+					 * this might look inefficient, but the
+					 * justification is that it is an
+					 * exception that check_func returns 1,
+					 * and that in the regular case only one
+					 * entry per UUID exists.
+					 */
+					goto again_search_slot;
+				}
+				if (ret < 0 && ret != -ENOENT)
+					goto out;
+			}
+			item_size -= sizeof(subid_le);
+			offset += sizeof(subid_le);
+		}
+
+skip:
+		ret = btrfs_next_item(root, path);
+		if (ret == 0)
+			continue;
+		else if (ret > 0)
+			ret = 0;
+		break;
+	}
+
+out:
+	btrfs_free_path(path);
+	if (ret)
+		btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
+	return 0;
+}
diff --git a/kernel/fs/btrfs/volumes.c b/kernel/fs/btrfs/volumes.c
new file mode 100644
index 000000000..174f5e1e0
--- /dev/null
+++ b/kernel/fs/btrfs/volumes.c
@@ -0,0 +1,6730 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/semaphore.h>
+#include <asm/div64.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "math.h"
+#include "dev-replace.h"
+#include "sysfs.h"
+
+static int init_first_rw_device(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_device *device);
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+
+DEFINE_MUTEX(uuid_mutex);
+static LIST_HEAD(fs_uuids);
+
+static struct btrfs_fs_devices *__alloc_fs_devices(void)
+{
+	struct btrfs_fs_devices *fs_devs;
+
+	fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
+	if (!fs_devs)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&fs_devs->device_list_mutex);
+
+	INIT_LIST_HEAD(&fs_devs->devices);
+	INIT_LIST_HEAD(&fs_devs->resized_devices);
+	INIT_LIST_HEAD(&fs_devs->alloc_list);
+	INIT_LIST_HEAD(&fs_devs->list);
+
+	return fs_devs;
+}
+
+/**
+ * alloc_fs_devices - allocate struct btrfs_fs_devices
+ * @fsid:	a pointer to UUID for this FS.  If NULL a new UUID is
+ *		generated.
+ *
+ * Return: a pointer to a new &struct btrfs_fs_devices on success;
+ * ERR_PTR() on error.  Returned struct is not linked onto any lists and
+ * can be destroyed with kfree() right away.
+ */
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+{
+	struct btrfs_fs_devices *fs_devs;
+
+	fs_devs = __alloc_fs_devices();
+	if (IS_ERR(fs_devs))
+		return fs_devs;
+
+	if (fsid)
+		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+	else
+		generate_random_uuid(fs_devs->fsid);
+
+	return fs_devs;
+}
+
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *device;
+	WARN_ON(fs_devices->opened);
+	while (!list_empty(&fs_devices->devices)) {
+		device = list_entry(fs_devices->devices.next,
+				    struct btrfs_device, dev_list);
+		list_del(&device->dev_list);
+		rcu_string_free(device->name);
+		kfree(device);
+	}
+	kfree(fs_devices);
+}
+
+static void btrfs_kobject_uevent(struct block_device *bdev,
+				 enum kobject_action action)
+{
+	int ret;
+
+	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+	if (ret)
+		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+			action,
+			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+			&disk_to_dev(bdev->bd_disk)->kobj);
+}
+
+void btrfs_cleanup_fs_uuids(void)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	while (!list_empty(&fs_uuids)) {
+		fs_devices = list_entry(fs_uuids.next,
+					struct btrfs_fs_devices, list);
+		list_del(&fs_devices->list);
+		free_fs_devices(fs_devices);
+	}
+}
+
+static struct btrfs_device *__alloc_device(void)
+{
+	struct btrfs_device *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_NOFS);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&dev->dev_list);
+	INIT_LIST_HEAD(&dev->dev_alloc_list);
+	INIT_LIST_HEAD(&dev->resized_list);
+
+	spin_lock_init(&dev->io_lock);
+
+	spin_lock_init(&dev->reada_lock);
+	atomic_set(&dev->reada_in_flight, 0);
+	atomic_set(&dev->dev_stats_ccnt, 0);
+	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
+	return dev;
+}
+
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+						   u64 devid, u8 *uuid)
+{
+	struct btrfs_device *dev;
+
+	list_for_each_entry(dev, head, dev_list) {
+		if (dev->devid == devid &&
+		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
+			return dev;
+		}
+	}
+	return NULL;
+}
+
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	list_for_each_entry(fs_devices, &fs_uuids, list) {
+		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+			return fs_devices;
+	}
+	return NULL;
+}
+
+static int
+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
+		      int flush, struct block_device **bdev,
+		      struct buffer_head **bh)
+{
+	int ret;
+
+	*bdev = blkdev_get_by_path(device_path, flags, holder);
+
+	if (IS_ERR(*bdev)) {
+		ret = PTR_ERR(*bdev);
+		printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
+		goto error;
+	}
+
+	if (flush)
+		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+	ret = set_blocksize(*bdev, 4096);
+	if (ret) {
+		blkdev_put(*bdev, flags);
+		goto error;
+	}
+	invalidate_bdev(*bdev);
+	*bh = btrfs_read_dev_super(*bdev);
+	if (!*bh) {
+		ret = -EINVAL;
+		blkdev_put(*bdev, flags);
+		goto error;
+	}
+
+	return 0;
+
+error:
+	*bdev = NULL;
+	*bh = NULL;
+	return ret;
+}
+
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+			struct bio *head, struct bio *tail)
+{
+
+	struct bio *old_head;
+
+	old_head = pending_bios->head;
+	pending_bios->head = head;
+	if (pending_bios->tail)
+		tail->bi_next = old_head;
+	else
+		pending_bios->tail = tail;
+}
+
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device.  This greatly
+ * improves the schedulers ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block.  The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested.  This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+static noinline void run_scheduled_bios(struct btrfs_device *device)
+{
+	struct bio *pending;
+	struct backing_dev_info *bdi;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_pending_bios *pending_bios;
+	struct bio *tail;
+	struct bio *cur;
+	int again = 0;
+	unsigned long num_run;
+	unsigned long batch_run = 0;
+	unsigned long limit;
+	unsigned long last_waited = 0;
+	int force_reg = 0;
+	int sync_pending = 0;
+	struct blk_plug plug;
+
+	/*
+	 * this function runs all the bios we've collected for
+	 * a particular device.  We don't want to wander off to
+	 * another device without first sending all of these down.
+	 * So, setup a plug here and finish it off before we return
+	 */
+	blk_start_plug(&plug);
+
+	bdi = blk_get_backing_dev_info(device->bdev);
+	fs_info = device->dev_root->fs_info;
+	limit = btrfs_async_submit_limit(fs_info);
+	limit = limit * 2 / 3;
+
+loop:
+	spin_lock(&device->io_lock);
+
+loop_lock:
+	num_run = 0;
+
+	/* take all the bios off the list at once and process them
+	 * later on (without the lock held).  But, remember the
+	 * tail and other pointers so the bios can be properly reinserted
+	 * into the list if we hit congestion
+	 */
+	if (!force_reg && device->pending_sync_bios.head) {
+		pending_bios = &device->pending_sync_bios;
+		force_reg = 1;
+	} else {
+		pending_bios = &device->pending_bios;
+		force_reg = 0;
+	}
+
+	pending = pending_bios->head;
+	tail = pending_bios->tail;
+	WARN_ON(pending && !tail);
+
+	/*
+	 * if pending was null this time around, no bios need processing
+	 * at all and we can stop.  Otherwise it'll loop back up again
+	 * and do an additional check so no bios are missed.
+	 *
+	 * device->running_pending is used to synchronize with the
+	 * schedule_bio code.
+	 */
+	if (device->pending_sync_bios.head == NULL &&
+	    device->pending_bios.head == NULL) {
+		again = 0;
+		device->running_pending = 0;
+	} else {
+		again = 1;
+		device->running_pending = 1;
+	}
+
+	pending_bios->head = NULL;
+	pending_bios->tail = NULL;
+
+	spin_unlock(&device->io_lock);
+
+	while (pending) {
+
+		rmb();
+		/* we want to work on both lists, but do more bios on the
+		 * sync list than the regular list
+		 */
+		if ((num_run > 32 &&
+		    pending_bios != &device->pending_sync_bios &&
+		    device->pending_sync_bios.head) ||
+		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
+		    device->pending_bios.head)) {
+			spin_lock(&device->io_lock);
+			requeue_list(pending_bios, pending, tail);
+			goto loop_lock;
+		}
+
+		cur = pending;
+		pending = pending->bi_next;
+		cur->bi_next = NULL;
+
+		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
+		    waitqueue_active(&fs_info->async_submit_wait))
+			wake_up(&fs_info->async_submit_wait);
+
+		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+
+		/*
+		 * if we're doing the sync list, record that our
+		 * plug has some sync requests on it
+		 *
+		 * If we're doing the regular list and there are
+		 * sync requests sitting around, unplug before
+		 * we add more
+		 */
+		if (pending_bios == &device->pending_sync_bios) {
+			sync_pending = 1;
+		} else if (sync_pending) {
+			blk_finish_plug(&plug);
+			blk_start_plug(&plug);
+			sync_pending = 0;
+		}
+
+		btrfsic_submit_bio(cur->bi_rw, cur);
+		num_run++;
+		batch_run++;
+
+		cond_resched();
+
+		/*
+		 * we made progress, there is more work to do and the bdi
+		 * is now congested.  Back off and let other work structs
+		 * run instead
+		 */
+		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
+		    fs_info->fs_devices->open_devices > 1) {
+			struct io_context *ioc;
+
+			ioc = current->io_context;
+
+			/*
+			 * the main goal here is that we don't want to
+			 * block if we're going to be able to submit
+			 * more requests without blocking.
+			 *
+			 * This code does two great things, it pokes into
+			 * the elevator code from a filesystem _and_
+			 * it makes assumptions about how batching works.
+			 */
+			if (ioc && ioc->nr_batch_requests > 0 &&
+			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
+			    (last_waited == 0 ||
+			     ioc->last_waited == last_waited)) {
+				/*
+				 * we want to go through our batch of
+				 * requests and stop.  So, we copy out
+				 * the ioc->last_waited time and test
+				 * against it before looping
+				 */
+				last_waited = ioc->last_waited;
+				cond_resched();
+				continue;
+			}
+			spin_lock(&device->io_lock);
+			requeue_list(pending_bios, pending, tail);
+			device->running_pending = 1;
+
+			spin_unlock(&device->io_lock);
+			btrfs_queue_work(fs_info->submit_workers,
+					 &device->work);
+			goto done;
+		}
+		/* unplug every 64 requests just for good measure */
+		if (batch_run % 64 == 0) {
+			blk_finish_plug(&plug);
+			blk_start_plug(&plug);
+			sync_pending = 0;
+		}
+	}
+
+	cond_resched();
+	if (again)
+		goto loop;
+
+	spin_lock(&device->io_lock);
+	if (device->pending_bios.head || device->pending_sync_bios.head)
+		goto loop_lock;
+	spin_unlock(&device->io_lock);
+
+done:
+	blk_finish_plug(&plug);
+}
+
+static void pending_bios_fn(struct btrfs_work *work)
+{
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, work);
+	run_scheduled_bios(device);
+}
+
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * 1   - first time device is seen
+ * 0   - device already known
+ * < 0 - error
+ */
+static noinline int device_list_add(const char *path,
+			   struct btrfs_super_block *disk_super,
+			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices;
+	struct rcu_string *name;
+	int ret = 0;
+	u64 found_transid = btrfs_super_generation(disk_super);
+
+	fs_devices = find_fsid(disk_super->fsid);
+	if (!fs_devices) {
+		fs_devices = alloc_fs_devices(disk_super->fsid);
+		if (IS_ERR(fs_devices))
+			return PTR_ERR(fs_devices);
+
+		list_add(&fs_devices->list, &fs_uuids);
+
+		device = NULL;
+	} else {
+		device = __find_device(&fs_devices->devices, devid,
+				       disk_super->dev_item.uuid);
+	}
+
+	if (!device) {
+		if (fs_devices->opened)
+			return -EBUSY;
+
+		device = btrfs_alloc_device(NULL, &devid,
+					    disk_super->dev_item.uuid);
+		if (IS_ERR(device)) {
+			/* we can safely leave the fs_devices entry around */
+			return PTR_ERR(device);
+		}
+
+		name = rcu_string_strdup(path, GFP_NOFS);
+		if (!name) {
+			kfree(device);
+			return -ENOMEM;
+		}
+		rcu_assign_pointer(device->name, name);
+
+		mutex_lock(&fs_devices->device_list_mutex);
+		list_add_rcu(&device->dev_list, &fs_devices->devices);
+		fs_devices->num_devices++;
+		mutex_unlock(&fs_devices->device_list_mutex);
+
+		ret = 1;
+		device->fs_devices = fs_devices;
+	} else if (!device->name || strcmp(device->name->str, path)) {
+		/*
+		 * When FS is already mounted.
+		 * 1. If you are here and if the device->name is NULL that
+		 *    means this device was missing at time of FS mount.
+		 * 2. If you are here and if the device->name is different
+		 *    from 'path' that means either
+		 *      a. The same device disappeared and reappeared with
+		 *         different name. or
+		 *      b. The missing-disk-which-was-replaced, has
+		 *         reappeared now.
+		 *
+		 * We must allow 1 and 2a above. But 2b would be a spurious
+		 * and unintentional.
+		 *
+		 * Further in case of 1 and 2a above, the disk at 'path'
+		 * would have missed some transaction when it was away and
+		 * in case of 2a the stale bdev has to be updated as well.
+		 * 2b must not be allowed at all time.
+		 */
+
+		/*
+		 * For now, we do allow update to btrfs_fs_device through the
+		 * btrfs dev scan cli after FS has been mounted.  We're still
+		 * tracking a problem where systems fail mount by subvolume id
+		 * when we reject replacement on a mounted FS.
+		 */
+		if (!fs_devices->opened && found_transid < device->generation) {
+			/*
+			 * That is if the FS is _not_ mounted and if you
+			 * are here, that means there is more than one
+			 * disk with same uuid and devid.We keep the one
+			 * with larger generation number or the last-in if
+			 * generation are equal.
+			 */
+			return -EEXIST;
+		}
+
+		name = rcu_string_strdup(path, GFP_NOFS);
+		if (!name)
+			return -ENOMEM;
+		rcu_string_free(device->name);
+		rcu_assign_pointer(device->name, name);
+		if (device->missing) {
+			fs_devices->missing_devices--;
+			device->missing = 0;
+		}
+	}
+
+	/*
+	 * Unmount does not free the btrfs_device struct but would zero
+	 * generation along with most of the other members. So just update
+	 * it back. We need it to pick the disk with largest generation
+	 * (as above).
+	 */
+	if (!fs_devices->opened)
+		device->generation = found_transid;
+
+	*fs_devices_ret = fs_devices;
+
+	return ret;
+}
+
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_device *device;
+	struct btrfs_device *orig_dev;
+
+	fs_devices = alloc_fs_devices(orig->fsid);
+	if (IS_ERR(fs_devices))
+		return fs_devices;
+
+	mutex_lock(&orig->device_list_mutex);
+	fs_devices->total_devices = orig->total_devices;
+
+	/* We have held the volume lock, it is safe to get the devices. */
+	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+		struct rcu_string *name;
+
+		device = btrfs_alloc_device(NULL, &orig_dev->devid,
+					    orig_dev->uuid);
+		if (IS_ERR(device))
+			goto error;
+
+		/*
+		 * This is ok to do without rcu read locked because we hold the
+		 * uuid mutex so nothing we touch in here is going to disappear.
+		 */
+		if (orig_dev->name) {
+			name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+			if (!name) {
+				kfree(device);
+				goto error;
+			}
+			rcu_assign_pointer(device->name, name);
+		}
+
+		list_add(&device->dev_list, &fs_devices->devices);
+		device->fs_devices = fs_devices;
+		fs_devices->num_devices++;
+	}
+	mutex_unlock(&orig->device_list_mutex);
+	return fs_devices;
+error:
+	mutex_unlock(&orig->device_list_mutex);
+	free_fs_devices(fs_devices);
+	return ERR_PTR(-ENOMEM);
+}
+
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
+{
+	struct btrfs_device *device, *next;
+	struct btrfs_device *latest_dev = NULL;
+
+	mutex_lock(&uuid_mutex);
+again:
+	/* This is the initialized path, it is safe to release the devices. */
+	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+		if (device->in_fs_metadata) {
+			if (!device->is_tgtdev_for_dev_replace &&
+			    (!latest_dev ||
+			     device->generation > latest_dev->generation)) {
+				latest_dev = device;
+			}
+			continue;
+		}
+
+		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+			/*
+			 * In the first step, keep the device which has
+			 * the correct fsid and the devid that is used
+			 * for the dev_replace procedure.
+			 * In the second step, the dev_replace state is
+			 * read from the device tree and it is known
+			 * whether the procedure is really active or
+			 * not, which means whether this device is
+			 * used or whether it should be removed.
+			 */
+			if (step == 0 || device->is_tgtdev_for_dev_replace) {
+				continue;
+			}
+		}
+		if (device->bdev) {
+			blkdev_put(device->bdev, device->mode);
+			device->bdev = NULL;
+			fs_devices->open_devices--;
+		}
+		if (device->writeable) {
+			list_del_init(&device->dev_alloc_list);
+			device->writeable = 0;
+			if (!device->is_tgtdev_for_dev_replace)
+				fs_devices->rw_devices--;
+		}
+		list_del_init(&device->dev_list);
+		fs_devices->num_devices--;
+		rcu_string_free(device->name);
+		kfree(device);
+	}
+
+	if (fs_devices->seed) {
+		fs_devices = fs_devices->seed;
+		goto again;
+	}
+
+	fs_devices->latest_bdev = latest_dev->bdev;
+
+	mutex_unlock(&uuid_mutex);
+}
+
+static void __free_device(struct work_struct *work)
+{
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, rcu_work);
+
+	if (device->bdev)
+		blkdev_put(device->bdev, device->mode);
+
+	rcu_string_free(device->name);
+	kfree(device);
+}
+
+static void free_device(struct rcu_head *head)
+{
+	struct btrfs_device *device;
+
+	device = container_of(head, struct btrfs_device, rcu);
+
+	INIT_WORK(&device->rcu_work, __free_device);
+	schedule_work(&device->rcu_work);
+}
+
+static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *device;
+
+	if (--fs_devices->opened > 0)
+		return 0;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		struct btrfs_device *new_device;
+		struct rcu_string *name;
+
+		if (device->bdev)
+			fs_devices->open_devices--;
+
+		if (device->writeable &&
+		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
+			list_del_init(&device->dev_alloc_list);
+			fs_devices->rw_devices--;
+		}
+
+		if (device->missing)
+			fs_devices->missing_devices--;
+
+		new_device = btrfs_alloc_device(NULL, &device->devid,
+						device->uuid);
+		BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+		/* Safe because we are under uuid_mutex */
+		if (device->name) {
+			name = rcu_string_strdup(device->name->str, GFP_NOFS);
+			BUG_ON(!name); /* -ENOMEM */
+			rcu_assign_pointer(new_device->name, name);
+		}
+
+		list_replace_rcu(&device->dev_list, &new_device->dev_list);
+		new_device->fs_devices = device->fs_devices;
+
+		call_rcu(&device->rcu, free_device);
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	WARN_ON(fs_devices->open_devices);
+	WARN_ON(fs_devices->rw_devices);
+	fs_devices->opened = 0;
+	fs_devices->seeding = 0;
+
+	return 0;
+}
+
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_fs_devices *seed_devices = NULL;
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	ret = __btrfs_close_devices(fs_devices);
+	if (!fs_devices->opened) {
+		seed_devices = fs_devices->seed;
+		fs_devices->seed = NULL;
+	}
+	mutex_unlock(&uuid_mutex);
+
+	while (seed_devices) {
+		fs_devices = seed_devices;
+		seed_devices = fs_devices->seed;
+		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
+	}
+	/*
+	 * Wait for rcu kworkers under __btrfs_close_devices
+	 * to finish all blkdev_puts so device is really
+	 * free when umount is done.
+	 */
+	rcu_barrier();
+	return ret;
+}
+
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+				fmode_t flags, void *holder)
+{
+	struct request_queue *q;
+	struct block_device *bdev;
+	struct list_head *head = &fs_devices->devices;
+	struct btrfs_device *device;
+	struct btrfs_device *latest_dev = NULL;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 devid;
+	int seeding = 1;
+	int ret = 0;
+
+	flags |= FMODE_EXCL;
+
+	list_for_each_entry(device, head, dev_list) {
+		if (device->bdev)
+			continue;
+		if (!device->name)
+			continue;
+
+		/* Just open everything we can; ignore failures here */
+		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+					    &bdev, &bh))
+			continue;
+
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+		devid = btrfs_stack_device_id(&disk_super->dev_item);
+		if (devid != device->devid)
+			goto error_brelse;
+
+		if (memcmp(device->uuid, disk_super->dev_item.uuid,
+			   BTRFS_UUID_SIZE))
+			goto error_brelse;
+
+		device->generation = btrfs_super_generation(disk_super);
+		if (!latest_dev ||
+		    device->generation > latest_dev->generation)
+			latest_dev = device;
+
+		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+			device->writeable = 0;
+		} else {
+			device->writeable = !bdev_read_only(bdev);
+			seeding = 0;
+		}
+
+		q = bdev_get_queue(bdev);
+		if (blk_queue_discard(q))
+			device->can_discard = 1;
+
+		device->bdev = bdev;
+		device->in_fs_metadata = 0;
+		device->mode = flags;
+
+		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+			fs_devices->rotating = 1;
+
+		fs_devices->open_devices++;
+		if (device->writeable &&
+		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
+			fs_devices->rw_devices++;
+			list_add(&device->dev_alloc_list,
+				 &fs_devices->alloc_list);
+		}
+		brelse(bh);
+		continue;
+
+error_brelse:
+		brelse(bh);
+		blkdev_put(bdev, flags);
+		continue;
+	}
+	if (fs_devices->open_devices == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+	fs_devices->seeding = seeding;
+	fs_devices->opened = 1;
+	fs_devices->latest_bdev = latest_dev->bdev;
+	fs_devices->total_rw_bytes = 0;
+out:
+	return ret;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       fmode_t flags, void *holder)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	if (fs_devices->opened) {
+		fs_devices->opened++;
+		ret = 0;
+	} else {
+		ret = __btrfs_open_devices(fs_devices, flags, holder);
+	}
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
+
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+			  struct btrfs_fs_devices **fs_devices_ret)
+{
+	struct btrfs_super_block *disk_super;
+	struct block_device *bdev;
+	struct page *page;
+	void *p;
+	int ret = -EINVAL;
+	u64 devid;
+	u64 transid;
+	u64 total_devices;
+	u64 bytenr;
+	pgoff_t index;
+
+	/*
+	 * we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	bytenr = btrfs_sb_offset(0);
+	flags |= FMODE_EXCL;
+	mutex_lock(&uuid_mutex);
+
+	bdev = blkdev_get_by_path(path, flags, holder);
+
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		goto error;
+	}
+
+	/* make sure our super fits in the device */
+	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+		goto error_bdev_put;
+
+	/* make sure our super fits in the page */
+	if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+		goto error_bdev_put;
+
+	/* make sure our super doesn't straddle pages on disk */
+	index = bytenr >> PAGE_CACHE_SHIFT;
+	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+		goto error_bdev_put;
+
+	/* pull in the page with our super */
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+				   index, GFP_NOFS);
+
+	if (IS_ERR_OR_NULL(page))
+		goto error_bdev_put;
+
+	p = kmap(page);
+
+	/* align our pointer to the offset of the super block */
+	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+	if (btrfs_super_bytenr(disk_super) != bytenr ||
+	    btrfs_super_magic(disk_super) != BTRFS_MAGIC)
+		goto error_unmap;
+
+	devid = btrfs_stack_device_id(&disk_super->dev_item);
+	transid = btrfs_super_generation(disk_super);
+	total_devices = btrfs_super_num_devices(disk_super);
+
+	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+	if (ret > 0) {
+		if (disk_super->label[0]) {
+			if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+				disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+			printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
+		} else {
+			printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
+		}
+
+		printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+		ret = 0;
+	}
+	if (!ret && fs_devices_ret)
+		(*fs_devices_ret)->total_devices = total_devices;
+
+error_unmap:
+	kunmap(page);
+	page_cache_release(page);
+
+error_bdev_put:
+	blkdev_put(bdev, flags);
+error:
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
+
+/* helper to account the used device space in the range */
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+				   u64 end, u64 *length)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent;
+	struct btrfs_path *path;
+	u64 extent_end;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+
+	*length = 0;
+
+	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 2;
+
+	key.objectid = device->devid;
+	key.offset = start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid, key.type);
+		if (ret < 0)
+			goto out;
+	}
+
+	while (1) {
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto out;
+
+			break;
+		}
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid < device->devid)
+			goto next;
+
+		if (key.objectid > device->devid)
+			break;
+
+		if (key.type != BTRFS_DEV_EXTENT_KEY)
+			goto next;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		extent_end = key.offset + btrfs_dev_extent_length(l,
+								  dev_extent);
+		if (key.offset <= start && extent_end > end) {
+			*length = end - start + 1;
+			break;
+		} else if (key.offset <= start && extent_end > start)
+			*length += extent_end - start;
+		else if (key.offset > start && extent_end <= end)
+			*length += extent_end - key.offset;
+		else if (key.offset > start && key.offset <= end) {
+			*length += end - key.offset + 1;
+			break;
+		} else if (key.offset > end)
+			break;
+
+next:
+		path->slots[0]++;
+	}
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_device *device,
+				   u64 *start, u64 len)
+{
+	struct extent_map *em;
+	struct list_head *search_list = &trans->transaction->pending_chunks;
+	int ret = 0;
+	u64 physical_start = *start;
+
+again:
+	list_for_each_entry(em, search_list, list) {
+		struct map_lookup *map;
+		int i;
+
+		map = (struct map_lookup *)em->bdev;
+		for (i = 0; i < map->num_stripes; i++) {
+			if (map->stripes[i].dev != device)
+				continue;
+			if (map->stripes[i].physical >= physical_start + len ||
+			    map->stripes[i].physical + em->orig_block_len <=
+			    physical_start)
+				continue;
+			*start = map->stripes[i].physical +
+				em->orig_block_len;
+			ret = 1;
+		}
+	}
+	if (search_list == &trans->transaction->pending_chunks) {
+		search_list = &trans->root->fs_info->pinned_chunks;
+		goto again;
+	}
+
+	return ret;
+}
+
+
+/*
+ * find_free_dev_extent - find free space in the specified device
+ * @device:	the device which we search the free space in
+ * @num_bytes:	the size of the free space that we need
+ * @start:	store the start of the free space.
+ * @len:	the size of the free space. that we find, or the size of the max
+ * 		free space if we don't find suitable free space
+ *
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ *
+ * @start is used to store the start of the free space if we find. But if we
+ * don't find suitable free space, it will be used to store the start position
+ * of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
+ */
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_device *device, u64 num_bytes,
+			 u64 *start, u64 *len)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent;
+	struct btrfs_path *path;
+	u64 hole_size;
+	u64 max_hole_start;
+	u64 max_hole_size;
+	u64 extent_end;
+	u64 search_start;
+	u64 search_end = device->total_bytes;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+
+	/* FIXME use last free of some kind */
+
+	/* we don't want to overwrite the superblock on the drive,
+	 * so we make sure to start at an offset of at least 1MB
+	 */
+	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	max_hole_start = search_start;
+	max_hole_size = 0;
+
+again:
+	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	path->reada = 2;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
+	key.objectid = device->devid;
+	key.offset = search_start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid, key.type);
+		if (ret < 0)
+			goto out;
+	}
+
+	while (1) {
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto out;
+
+			break;
+		}
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid < device->devid)
+			goto next;
+
+		if (key.objectid > device->devid)
+			break;
+
+		if (key.type != BTRFS_DEV_EXTENT_KEY)
+			goto next;
+
+		if (key.offset > search_start) {
+			hole_size = key.offset - search_start;
+
+			/*
+			 * Have to check before we set max_hole_start, otherwise
+			 * we could end up sending back this offset anyway.
+			 */
+			if (contains_pending_extent(trans, device,
+						    &search_start,
+						    hole_size)) {
+				if (key.offset >= search_start) {
+					hole_size = key.offset - search_start;
+				} else {
+					WARN_ON_ONCE(1);
+					hole_size = 0;
+				}
+			}
+
+			if (hole_size > max_hole_size) {
+				max_hole_start = search_start;
+				max_hole_size = hole_size;
+			}
+
+			/*
+			 * If this free space is greater than which we need,
+			 * it must be the max free space that we have found
+			 * until now, so max_hole_start must point to the start
+			 * of this free space and the length of this free space
+			 * is stored in max_hole_size. Thus, we return
+			 * max_hole_start and max_hole_size and go back to the
+			 * caller.
+			 */
+			if (hole_size >= num_bytes) {
+				ret = 0;
+				goto out;
+			}
+		}
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		extent_end = key.offset + btrfs_dev_extent_length(l,
+								  dev_extent);
+		if (extent_end > search_start)
+			search_start = extent_end;
+next:
+		path->slots[0]++;
+		cond_resched();
+	}
+
+	/*
+	 * At this point, search_start should be the end of
+	 * allocated dev extents, and when shrinking the device,
+	 * search_end may be smaller than search_start.
+	 */
+	if (search_end > search_start) {
+		hole_size = search_end - search_start;
+
+		if (contains_pending_extent(trans, device, &search_start,
+					    hole_size)) {
+			btrfs_release_path(path);
+			goto again;
+		}
+
+		if (hole_size > max_hole_size) {
+			max_hole_start = search_start;
+			max_hole_size = hole_size;
+		}
+	}
+
+	/* See above. */
+	if (max_hole_size < num_bytes)
+		ret = -ENOSPC;
+	else
+		ret = 0;
+
+out:
+	btrfs_free_path(path);
+	*start = max_hole_start;
+	if (len)
+		*len = max_hole_size;
+	return ret;
+}
+
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+			  struct btrfs_device *device,
+			  u64 start, u64 *dev_extent_len)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf = NULL;
+	struct btrfs_dev_extent *extent = NULL;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = device->devid;
+	key.offset = start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+again:
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid,
+					  BTRFS_DEV_EXTENT_KEY);
+		if (ret)
+			goto out;
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+		BUG_ON(found_key.offset > start || found_key.offset +
+		       btrfs_dev_extent_length(leaf, extent) < start);
+		key = found_key;
+		btrfs_release_path(path);
+		goto again;
+	} else if (ret == 0) {
+		leaf = path->nodes[0];
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+	} else {
+		btrfs_error(root->fs_info, ret, "Slot search failed");
+		goto out;
+	}
+
+	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret) {
+		btrfs_error(root->fs_info, ret,
+			    "Failed to remove dev extent item");
+	} else {
+		trans->transaction->have_free_bgs = 1;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_device *device,
+				  u64 chunk_tree, u64 chunk_objectid,
+				  u64 chunk_offset, u64 start, u64 num_bytes)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *extent;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	WARN_ON(!device->in_fs_metadata);
+	WARN_ON(device->is_tgtdev_for_dev_replace);
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = device->devid;
+	key.offset = start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*extent));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	extent = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_dev_extent);
+	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
+	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
+	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+		    btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
+
+	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
+{
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct rb_node *n;
+	u64 ret = 0;
+
+	em_tree = &fs_info->mapping_tree.map_tree;
+	read_lock(&em_tree->lock);
+	n = rb_last(&em_tree->map);
+	if (n) {
+		em = rb_entry(n, struct extent_map, rb_node);
+		ret = em->start + em->len;
+	}
+	read_unlock(&em_tree->lock);
+
+	return ret;
+}
+
+static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
+				    u64 *devid_ret)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+
+	BUG_ON(ret == 0); /* Corruption */
+
+	ret = btrfs_previous_item(fs_info->chunk_root, path,
+				  BTRFS_DEV_ITEMS_OBJECTID,
+				  BTRFS_DEV_ITEM_KEY);
+	if (ret) {
+		*devid_ret = 1;
+	} else {
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		*devid_ret = found_key.offset + 1;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * the device information is stored in the chunk root
+ * the btrfs_device struct should be fully filled in
+ */
+static int btrfs_add_device(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_dev_item *dev_item;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long ptr;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*dev_item));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_generation(leaf, dev_item, 0);
+	btrfs_set_device_type(leaf, dev_item, device->type);
+	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+	btrfs_set_device_total_bytes(leaf, dev_item,
+				     btrfs_device_get_disk_total_bytes(device));
+	btrfs_set_device_bytes_used(leaf, dev_item,
+				    btrfs_device_get_bytes_used(device));
+	btrfs_set_device_group(leaf, dev_item, 0);
+	btrfs_set_device_seek_speed(leaf, dev_item, 0);
+	btrfs_set_device_bandwidth(leaf, dev_item, 0);
+	btrfs_set_device_start_offset(leaf, dev_item, 0);
+
+	ptr = btrfs_device_uuid(dev_item);
+	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+	ptr = btrfs_device_fsid(dev_item);
+	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Function to update ctime/mtime for a given device path.
+ * Mainly used for ctime/mtime based probe like libblkid.
+ */
+static void update_dev_time(char *path_name)
+{
+	struct file *filp;
+
+	filp = filp_open(path_name, O_RDWR, 0);
+	if (IS_ERR(filp))
+		return;
+	file_update_time(filp);
+	filp_close(filp, NULL);
+	return;
+}
+
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+			     struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_trans_handle *trans;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+out:
+	btrfs_free_path(path);
+	btrfs_commit_transaction(trans, root);
+	return ret;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+	struct btrfs_device *device;
+	struct btrfs_device *next_device;
+	struct block_device *bdev;
+	struct buffer_head *bh = NULL;
+	struct btrfs_super_block *disk_super;
+	struct btrfs_fs_devices *cur_devices;
+	u64 all_avail;
+	u64 devid;
+	u64 num_devices;
+	u8 *dev_uuid;
+	unsigned seq;
+	int ret = 0;
+	bool clear_super = false;
+
+	mutex_lock(&uuid_mutex);
+
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		all_avail = root->fs_info->avail_data_alloc_bits |
+			    root->fs_info->avail_system_alloc_bits |
+			    root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
+
+	num_devices = root->fs_info->fs_devices->num_devices;
+	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+		WARN_ON(num_devices < 1);
+		num_devices--;
+	}
+	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
+		ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
+		goto out;
+	}
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
+		ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+		goto out;
+	}
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+	    root->fs_info->fs_devices->rw_devices <= 2) {
+		ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
+		goto out;
+	}
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+	    root->fs_info->fs_devices->rw_devices <= 3) {
+		ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
+		goto out;
+	}
+
+	if (strcmp(device_path, "missing") == 0) {
+		struct list_head *devices;
+		struct btrfs_device *tmp;
+
+		device = NULL;
+		devices = &root->fs_info->fs_devices->devices;
+		/*
+		 * It is safe to read the devices since the volume_mutex
+		 * is held.
+		 */
+		list_for_each_entry(tmp, devices, dev_list) {
+			if (tmp->in_fs_metadata &&
+			    !tmp->is_tgtdev_for_dev_replace &&
+			    !tmp->bdev) {
+				device = tmp;
+				break;
+			}
+		}
+		bdev = NULL;
+		bh = NULL;
+		disk_super = NULL;
+		if (!device) {
+			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+			goto out;
+		}
+	} else {
+		ret = btrfs_get_bdev_and_sb(device_path,
+					    FMODE_WRITE | FMODE_EXCL,
+					    root->fs_info->bdev_holder, 0,
+					    &bdev, &bh);
+		if (ret)
+			goto out;
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+		devid = btrfs_stack_device_id(&disk_super->dev_item);
+		dev_uuid = disk_super->dev_item.uuid;
+		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+					   disk_super->fsid);
+		if (!device) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
+	}
+
+	if (device->is_tgtdev_for_dev_replace) {
+		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
+		goto error_brelse;
+	}
+
+	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
+		goto error_brelse;
+	}
+
+	if (device->writeable) {
+		lock_chunks(root);
+		list_del_init(&device->dev_alloc_list);
+		device->fs_devices->rw_devices--;
+		unlock_chunks(root);
+		clear_super = true;
+	}
+
+	mutex_unlock(&uuid_mutex);
+	ret = btrfs_shrink_device(device, 0);
+	mutex_lock(&uuid_mutex);
+	if (ret)
+		goto error_undo;
+
+	/*
+	 * TODO: the superblock still includes this device in its num_devices
+	 * counter although write_all_supers() is not locked out. This
+	 * could give a filesystem state which requires a degraded mount.
+	 */
+	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+	if (ret)
+		goto error_undo;
+
+	device->in_fs_metadata = 0;
+	btrfs_scrub_cancel_dev(root->fs_info, device);
+
+	/*
+	 * the device list mutex makes sure that we don't change
+	 * the device list while someone else is writing out all
+	 * the device supers. Whoever is writing all supers, should
+	 * lock the device list mutex before getting the number of
+	 * devices in the super block (super_copy). Conversely,
+	 * whoever updates the number of devices in the super block
+	 * (super_copy) should hold the device list mutex.
+	 */
+
+	cur_devices = device->fs_devices;
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	list_del_rcu(&device->dev_list);
+
+	device->fs_devices->num_devices--;
+	device->fs_devices->total_devices--;
+
+	if (device->missing)
+		device->fs_devices->missing_devices--;
+
+	next_device = list_entry(root->fs_info->fs_devices->devices.next,
+				 struct btrfs_device, dev_list);
+	if (device->bdev == root->fs_info->sb->s_bdev)
+		root->fs_info->sb->s_bdev = next_device->bdev;
+	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
+		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+
+	if (device->bdev) {
+		device->fs_devices->open_devices--;
+		/* remove sysfs entry */
+		btrfs_kobj_rm_device(root->fs_info, device);
+	}
+
+	call_rcu(&device->rcu, free_device);
+
+	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	if (cur_devices->open_devices == 0) {
+		struct btrfs_fs_devices *fs_devices;
+		fs_devices = root->fs_info->fs_devices;
+		while (fs_devices) {
+			if (fs_devices->seed == cur_devices) {
+				fs_devices->seed = cur_devices->seed;
+				break;
+			}
+			fs_devices = fs_devices->seed;
+		}
+		cur_devices->seed = NULL;
+		__btrfs_close_devices(cur_devices);
+		free_fs_devices(cur_devices);
+	}
+
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
+	/*
+	 * at this point, the device is zero sized.  We want to
+	 * remove it from the devices list and zero out the old super
+	 */
+	if (clear_super && disk_super) {
+		u64 bytenr;
+		int i;
+
+		/* make sure this device isn't detected as part of
+		 * the FS anymore
+		 */
+		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+		set_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
+
+		/* clear the mirror copies of super block on the disk
+		 * being removed, 0th copy is been taken care above and
+		 * the below would take of the rest
+		 */
+		for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+			bytenr = btrfs_sb_offset(i);
+			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+					i_size_read(bdev->bd_inode))
+				break;
+
+			brelse(bh);
+			bh = __bread(bdev, bytenr / 4096,
+					BTRFS_SUPER_INFO_SIZE);
+			if (!bh)
+				continue;
+
+			disk_super = (struct btrfs_super_block *)bh->b_data;
+
+			if (btrfs_super_bytenr(disk_super) != bytenr ||
+				btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+				continue;
+			}
+			memset(&disk_super->magic, 0,
+						sizeof(disk_super->magic));
+			set_buffer_dirty(bh);
+			sync_dirty_buffer(bh);
+		}
+	}
+
+	ret = 0;
+
+	if (bdev) {
+		/* Notify udev that device has changed */
+		btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+		/* Update ctime/mtime for device path for libblkid */
+		update_dev_time(device_path);
+	}
+
+error_brelse:
+	brelse(bh);
+	if (bdev)
+		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
+out:
+	mutex_unlock(&uuid_mutex);
+	return ret;
+error_undo:
+	if (device->writeable) {
+		lock_chunks(root);
+		list_add(&device->dev_alloc_list,
+			 &root->fs_info->fs_devices->alloc_list);
+		device->fs_devices->rw_devices++;
+		unlock_chunks(root);
+	}
+	goto error_brelse;
+}
+
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+					struct btrfs_device *srcdev)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+
+	/*
+	 * in case of fs with no seed, srcdev->fs_devices will point
+	 * to fs_devices of fs_info. However when the dev being replaced is
+	 * a seed dev it will point to the seed's local fs_devices. In short
+	 * srcdev will have its correct fs_devices in both the cases.
+	 */
+	fs_devices = srcdev->fs_devices;
+
+	list_del_rcu(&srcdev->dev_list);
+	list_del_rcu(&srcdev->dev_alloc_list);
+	fs_devices->num_devices--;
+	if (srcdev->missing)
+		fs_devices->missing_devices--;
+
+	if (srcdev->writeable) {
+		fs_devices->rw_devices--;
+		/* zero out the old super if it is writable */
+		btrfs_scratch_superblock(srcdev);
+	}
+
+	if (srcdev->bdev)
+		fs_devices->open_devices--;
+}
+
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *srcdev)
+{
+	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
+
+	call_rcu(&srcdev->rcu, free_device);
+
+	/*
+	 * unless fs_devices is seed fs, num_devices shouldn't go
+	 * zero
+	 */
+	BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
+
+	/* if this is no devs we rather delete the fs_devices */
+	if (!fs_devices->num_devices) {
+		struct btrfs_fs_devices *tmp_fs_devices;
+
+		tmp_fs_devices = fs_info->fs_devices;
+		while (tmp_fs_devices) {
+			if (tmp_fs_devices->seed == fs_devices) {
+				tmp_fs_devices->seed = fs_devices->seed;
+				break;
+			}
+			tmp_fs_devices = tmp_fs_devices->seed;
+		}
+		fs_devices->seed = NULL;
+		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
+	}
+}
+
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *tgtdev)
+{
+	struct btrfs_device *next_device;
+
+	mutex_lock(&uuid_mutex);
+	WARN_ON(!tgtdev);
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	if (tgtdev->bdev) {
+		btrfs_scratch_superblock(tgtdev);
+		fs_info->fs_devices->open_devices--;
+	}
+	fs_info->fs_devices->num_devices--;
+
+	next_device = list_entry(fs_info->fs_devices->devices.next,
+				 struct btrfs_device, dev_list);
+	if (tgtdev->bdev == fs_info->sb->s_bdev)
+		fs_info->sb->s_bdev = next_device->bdev;
+	if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
+		fs_info->fs_devices->latest_bdev = next_device->bdev;
+	list_del_rcu(&tgtdev->dev_list);
+
+	call_rcu(&tgtdev->rcu, free_device);
+
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&uuid_mutex);
+}
+
+static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+				     struct btrfs_device **device)
+{
+	int ret = 0;
+	struct btrfs_super_block *disk_super;
+	u64 devid;
+	u8 *dev_uuid;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+
+	*device = NULL;
+	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
+				    root->fs_info->bdev_holder, 0, &bdev, &bh);
+	if (ret)
+		return ret;
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	devid = btrfs_stack_device_id(&disk_super->dev_item);
+	dev_uuid = disk_super->dev_item.uuid;
+	*device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+				    disk_super->fsid);
+	brelse(bh);
+	if (!*device)
+		ret = -ENOENT;
+	blkdev_put(bdev, FMODE_READ);
+	return ret;
+}
+
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+					 char *device_path,
+					 struct btrfs_device **device)
+{
+	*device = NULL;
+	if (strcmp(device_path, "missing") == 0) {
+		struct list_head *devices;
+		struct btrfs_device *tmp;
+
+		devices = &root->fs_info->fs_devices->devices;
+		/*
+		 * It is safe to read the devices since the volume_mutex
+		 * is held by the caller.
+		 */
+		list_for_each_entry(tmp, devices, dev_list) {
+			if (tmp->in_fs_metadata && !tmp->bdev) {
+				*device = tmp;
+				break;
+			}
+		}
+
+		if (!*device) {
+			btrfs_err(root->fs_info, "no missing device found");
+			return -ENOENT;
+		}
+
+		return 0;
+	} else {
+		return btrfs_find_device_by_path(root, device_path, device);
+	}
+}
+
+/*
+ * does all the dirty work required for changing file system's UUID.
+ */
+static int btrfs_prepare_sprout(struct btrfs_root *root)
+{
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *old_devices;
+	struct btrfs_fs_devices *seed_devices;
+	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+	struct btrfs_device *device;
+	u64 super_flags;
+
+	BUG_ON(!mutex_is_locked(&uuid_mutex));
+	if (!fs_devices->seeding)
+		return -EINVAL;
+
+	seed_devices = __alloc_fs_devices();
+	if (IS_ERR(seed_devices))
+		return PTR_ERR(seed_devices);
+
+	old_devices = clone_fs_devices(fs_devices);
+	if (IS_ERR(old_devices)) {
+		kfree(seed_devices);
+		return PTR_ERR(old_devices);
+	}
+
+	list_add(&old_devices->list, &fs_uuids);
+
+	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+	seed_devices->opened = 1;
+	INIT_LIST_HEAD(&seed_devices->devices);
+	INIT_LIST_HEAD(&seed_devices->alloc_list);
+	mutex_init(&seed_devices->device_list_mutex);
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
+			      synchronize_rcu);
+	list_for_each_entry(device, &seed_devices->devices, dev_list)
+		device->fs_devices = seed_devices;
+
+	lock_chunks(root);
+	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+	unlock_chunks(root);
+
+	fs_devices->seeding = 0;
+	fs_devices->num_devices = 0;
+	fs_devices->open_devices = 0;
+	fs_devices->missing_devices = 0;
+	fs_devices->rotating = 0;
+	fs_devices->seed = seed_devices;
+
+	generate_random_uuid(fs_devices->fsid);
+	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	super_flags = btrfs_super_flags(disk_super) &
+		      ~BTRFS_SUPER_FLAG_SEEDING;
+	btrfs_set_super_flags(disk_super, super_flags);
+
+	return 0;
+}
+
+/*
+ * strore the expected generation for seed devices in device items.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dev_item *dev_item;
+	struct btrfs_device *device;
+	struct btrfs_key key;
+	u8 fs_uuid[BTRFS_UUID_SIZE];
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+	u64 devid;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	root = root->fs_info->chunk_root;
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = BTRFS_DEV_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto error;
+
+		leaf = path->nodes[0];
+next_slot:
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret > 0)
+				break;
+			if (ret < 0)
+				goto error;
+			leaf = path->nodes[0];
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+			btrfs_release_path(path);
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+		    key.type != BTRFS_DEV_ITEM_KEY)
+			break;
+
+		dev_item = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_dev_item);
+		devid = btrfs_device_id(leaf, dev_item);
+		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
+				   BTRFS_UUID_SIZE);
+		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
+				   BTRFS_UUID_SIZE);
+		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+					   fs_uuid);
+		BUG_ON(!device); /* Logic error */
+
+		if (device->fs_devices->seeding) {
+			btrfs_set_device_generation(leaf, dev_item,
+						    device->generation);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+
+		path->slots[0]++;
+		goto next_slot;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+{
+	struct request_queue *q;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct list_head *devices;
+	struct super_block *sb = root->fs_info->sb;
+	struct rcu_string *name;
+	u64 tmp;
+	int seeding_dev = 0;
+	int ret = 0;
+
+	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+		return -EROFS;
+
+	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+				  root->fs_info->bdev_holder);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	if (root->fs_info->fs_devices->seeding) {
+		seeding_dev = 1;
+		down_write(&sb->s_umount);
+		mutex_lock(&uuid_mutex);
+	}
+
+	filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+	devices = &root->fs_info->fs_devices->devices;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	list_for_each_entry(device, devices, dev_list) {
+		if (device->bdev == bdev) {
+			ret = -EEXIST;
+			mutex_unlock(
+				&root->fs_info->fs_devices->device_list_mutex);
+			goto error;
+		}
+	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	device = btrfs_alloc_device(root->fs_info, NULL, NULL);
+	if (IS_ERR(device)) {
+		/* we can safely leave the fs_devices entry around */
+		ret = PTR_ERR(device);
+		goto error;
+	}
+
+	name = rcu_string_strdup(device_path, GFP_NOFS);
+	if (!name) {
+		kfree(device);
+		ret = -ENOMEM;
+		goto error;
+	}
+	rcu_assign_pointer(device->name, name);
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		rcu_string_free(device->name);
+		kfree(device);
+		ret = PTR_ERR(trans);
+		goto error;
+	}
+
+	q = bdev_get_queue(bdev);
+	if (blk_queue_discard(q))
+		device->can_discard = 1;
+	device->writeable = 1;
+	device->generation = trans->transid;
+	device->io_width = root->sectorsize;
+	device->io_align = root->sectorsize;
+	device->sector_size = root->sectorsize;
+	device->total_bytes = i_size_read(bdev->bd_inode);
+	device->disk_total_bytes = device->total_bytes;
+	device->commit_total_bytes = device->total_bytes;
+	device->dev_root = root->fs_info->dev_root;
+	device->bdev = bdev;
+	device->in_fs_metadata = 1;
+	device->is_tgtdev_for_dev_replace = 0;
+	device->mode = FMODE_EXCL;
+	device->dev_stats_valid = 1;
+	set_blocksize(device->bdev, 4096);
+
+	if (seeding_dev) {
+		sb->s_flags &= ~MS_RDONLY;
+		ret = btrfs_prepare_sprout(root);
+		BUG_ON(ret); /* -ENOMEM */
+	}
+
+	device->fs_devices = root->fs_info->fs_devices;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	lock_chunks(root);
+	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
+	list_add(&device->dev_alloc_list,
+		 &root->fs_info->fs_devices->alloc_list);
+	root->fs_info->fs_devices->num_devices++;
+	root->fs_info->fs_devices->open_devices++;
+	root->fs_info->fs_devices->rw_devices++;
+	root->fs_info->fs_devices->total_devices++;
+	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+
+	spin_lock(&root->fs_info->free_chunk_lock);
+	root->fs_info->free_chunk_space += device->total_bytes;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
+	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+		root->fs_info->fs_devices->rotating = 1;
+
+	tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
+	btrfs_set_super_total_bytes(root->fs_info->super_copy,
+				    tmp + device->total_bytes);
+
+	tmp = btrfs_super_num_devices(root->fs_info->super_copy);
+	btrfs_set_super_num_devices(root->fs_info->super_copy,
+				    tmp + 1);
+
+	/* add sysfs device entry */
+	btrfs_kobj_add_device(root->fs_info, device);
+
+	/*
+	 * we've got more storage, clear any full flags on the space
+	 * infos
+	 */
+	btrfs_clear_space_info_full(root->fs_info);
+
+	unlock_chunks(root);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	if (seeding_dev) {
+		lock_chunks(root);
+		ret = init_first_rw_device(trans, root, device);
+		unlock_chunks(root);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto error_trans;
+		}
+	}
+
+	ret = btrfs_add_device(trans, root, device);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto error_trans;
+	}
+
+	if (seeding_dev) {
+		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+
+		ret = btrfs_finish_sprout(trans, root);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto error_trans;
+		}
+
+		/* Sprouting would change fsid of the mounted root,
+		 * so rename the fsid on the sysfs
+		 */
+		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
+						root->fs_info->fsid);
+		if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+			goto error_trans;
+	}
+
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+	ret = btrfs_commit_transaction(trans, root);
+
+	if (seeding_dev) {
+		mutex_unlock(&uuid_mutex);
+		up_write(&sb->s_umount);
+
+		if (ret) /* transaction commit */
+			return ret;
+
+		ret = btrfs_relocate_sys_chunks(root);
+		if (ret < 0)
+			btrfs_error(root->fs_info, ret,
+				    "Failed to relocate sys chunks after "
+				    "device initialization. This can be fixed "
+				    "using the \"btrfs balance\" command.");
+		trans = btrfs_attach_transaction(root);
+		if (IS_ERR(trans)) {
+			if (PTR_ERR(trans) == -ENOENT)
+				return 0;
+			return PTR_ERR(trans);
+		}
+		ret = btrfs_commit_transaction(trans, root);
+	}
+
+	/* Update ctime/mtime for libblkid */
+	update_dev_time(device_path);
+	return ret;
+
+error_trans:
+	btrfs_end_transaction(trans, root);
+	rcu_string_free(device->name);
+	btrfs_kobj_rm_device(root->fs_info, device);
+	kfree(device);
+error:
+	blkdev_put(bdev, FMODE_EXCL);
+	if (seeding_dev) {
+		mutex_unlock(&uuid_mutex);
+		up_write(&sb->s_umount);
+	}
+	return ret;
+}
+
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+				  struct btrfs_device *srcdev,
+				  struct btrfs_device **device_out)
+{
+	struct request_queue *q;
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct list_head *devices;
+	struct rcu_string *name;
+	u64 devid = BTRFS_DEV_REPLACE_DEVID;
+	int ret = 0;
+
+	*device_out = NULL;
+	if (fs_info->fs_devices->seeding) {
+		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
+		return -EINVAL;
+	}
+
+	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+				  fs_info->bdev_holder);
+	if (IS_ERR(bdev)) {
+		btrfs_err(fs_info, "target device %s is invalid!", device_path);
+		return PTR_ERR(bdev);
+	}
+
+	filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+	devices = &fs_info->fs_devices->devices;
+	list_for_each_entry(device, devices, dev_list) {
+		if (device->bdev == bdev) {
+			btrfs_err(fs_info, "target device is in the filesystem!");
+			ret = -EEXIST;
+			goto error;
+		}
+	}
+
+
+	if (i_size_read(bdev->bd_inode) <
+	    btrfs_device_get_total_bytes(srcdev)) {
+		btrfs_err(fs_info, "target device is smaller than source device!");
+		ret = -EINVAL;
+		goto error;
+	}
+
+
+	device = btrfs_alloc_device(NULL, &devid, NULL);
+	if (IS_ERR(device)) {
+		ret = PTR_ERR(device);
+		goto error;
+	}
+
+	name = rcu_string_strdup(device_path, GFP_NOFS);
+	if (!name) {
+		kfree(device);
+		ret = -ENOMEM;
+		goto error;
+	}
+	rcu_assign_pointer(device->name, name);
+
+	q = bdev_get_queue(bdev);
+	if (blk_queue_discard(q))
+		device->can_discard = 1;
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	device->writeable = 1;
+	device->generation = 0;
+	device->io_width = root->sectorsize;
+	device->io_align = root->sectorsize;
+	device->sector_size = root->sectorsize;
+	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+	ASSERT(list_empty(&srcdev->resized_list));
+	device->commit_total_bytes = srcdev->commit_total_bytes;
+	device->commit_bytes_used = device->bytes_used;
+	device->dev_root = fs_info->dev_root;
+	device->bdev = bdev;
+	device->in_fs_metadata = 1;
+	device->is_tgtdev_for_dev_replace = 1;
+	device->mode = FMODE_EXCL;
+	device->dev_stats_valid = 1;
+	set_blocksize(device->bdev, 4096);
+	device->fs_devices = fs_info->fs_devices;
+	list_add(&device->dev_list, &fs_info->fs_devices->devices);
+	fs_info->fs_devices->num_devices++;
+	fs_info->fs_devices->open_devices++;
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	*device_out = device;
+	return ret;
+
+error:
+	blkdev_put(bdev, FMODE_EXCL);
+	return ret;
+}
+
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+					      struct btrfs_device *tgtdev)
+{
+	WARN_ON(fs_info->fs_devices->rw_devices == 0);
+	tgtdev->io_width = fs_info->dev_root->sectorsize;
+	tgtdev->io_align = fs_info->dev_root->sectorsize;
+	tgtdev->sector_size = fs_info->dev_root->sectorsize;
+	tgtdev->dev_root = fs_info->dev_root;
+	tgtdev->in_fs_metadata = 1;
+}
+
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+					struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct btrfs_dev_item *dev_item;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	root = device->dev_root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_type(leaf, dev_item, device->type);
+	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+	btrfs_set_device_total_bytes(leaf, dev_item,
+				     btrfs_device_get_disk_total_bytes(device));
+	btrfs_set_device_bytes_used(leaf, dev_item,
+				    btrfs_device_get_bytes_used(device));
+	btrfs_mark_buffer_dirty(leaf);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size)
+{
+	struct btrfs_super_block *super_copy =
+		device->dev_root->fs_info->super_copy;
+	struct btrfs_fs_devices *fs_devices;
+	u64 old_total;
+	u64 diff;
+
+	if (!device->writeable)
+		return -EACCES;
+
+	lock_chunks(device->dev_root);
+	old_total = btrfs_super_total_bytes(super_copy);
+	diff = new_size - device->total_bytes;
+
+	if (new_size <= device->total_bytes ||
+	    device->is_tgtdev_for_dev_replace) {
+		unlock_chunks(device->dev_root);
+		return -EINVAL;
+	}
+
+	fs_devices = device->dev_root->fs_info->fs_devices;
+
+	btrfs_set_super_total_bytes(super_copy, old_total + diff);
+	device->fs_devices->total_rw_bytes += diff;
+
+	btrfs_device_set_total_bytes(device, new_size);
+	btrfs_device_set_disk_total_bytes(device, new_size);
+	btrfs_clear_space_info_full(device->dev_root->fs_info);
+	if (list_empty(&device->resized_list))
+		list_add_tail(&device->resized_list,
+			      &fs_devices->resized_devices);
+	unlock_chunks(device->dev_root);
+
+	return btrfs_update_device(trans, device);
+}
+
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 chunk_objectid,
+			    u64 chunk_offset)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+
+	root = root->fs_info->chunk_root;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = chunk_objectid;
+	key.offset = chunk_offset;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	else if (ret > 0) { /* Logic error or corruption */
+		btrfs_error(root->fs_info, -ENOENT,
+			    "Failed lookup while freeing chunk.");
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret < 0)
+		btrfs_error(root->fs_info, ret,
+			    "Failed to delete chunk item.");
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+			chunk_offset)
+{
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+	struct btrfs_disk_key *disk_key;
+	struct btrfs_chunk *chunk;
+	u8 *ptr;
+	int ret = 0;
+	u32 num_stripes;
+	u32 array_size;
+	u32 len = 0;
+	u32 cur;
+	struct btrfs_key key;
+
+	lock_chunks(root);
+	array_size = btrfs_super_sys_array_size(super_copy);
+
+	ptr = super_copy->sys_chunk_array;
+	cur = 0;
+
+	while (cur < array_size) {
+		disk_key = (struct btrfs_disk_key *)ptr;
+		btrfs_disk_key_to_cpu(&key, disk_key);
+
+		len = sizeof(*disk_key);
+
+		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+			chunk = (struct btrfs_chunk *)(ptr + len);
+			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+			len += btrfs_chunk_item_size(num_stripes);
+		} else {
+			ret = -EIO;
+			break;
+		}
+		if (key.objectid == chunk_objectid &&
+		    key.offset == chunk_offset) {
+			memmove(ptr, ptr + len, array_size - (cur + len));
+			array_size -= len;
+			btrfs_set_super_sys_array_size(super_copy, array_size);
+		} else {
+			ptr += len;
+			cur += len;
+		}
+	}
+	unlock_chunks(root);
+	return ret;
+}
+
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, u64 chunk_offset)
+{
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct map_lookup *map;
+	u64 dev_extent_len = 0;
+	u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	int i, ret = 0;
+
+	/* Just in case */
+	root = root->fs_info->chunk_root;
+	em_tree = &root->fs_info->mapping_tree.map_tree;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+	read_unlock(&em_tree->lock);
+
+	if (!em || em->start > chunk_offset ||
+	    em->start + em->len < chunk_offset) {
+		/*
+		 * This is a logic error, but we don't want to just rely on the
+		 * user having built with ASSERT enabled, so if ASSERT doens't
+		 * do anything we still error out.
+		 */
+		ASSERT(0);
+		if (em)
+			free_extent_map(em);
+		return -EINVAL;
+	}
+	map = (struct map_lookup *)em->bdev;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		struct btrfs_device *device = map->stripes[i].dev;
+		ret = btrfs_free_dev_extent(trans, device,
+					    map->stripes[i].physical,
+					    &dev_extent_len);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+
+		if (device->bytes_used > 0) {
+			lock_chunks(root);
+			btrfs_device_set_bytes_used(device,
+					device->bytes_used - dev_extent_len);
+			spin_lock(&root->fs_info->free_chunk_lock);
+			root->fs_info->free_chunk_space += dev_extent_len;
+			spin_unlock(&root->fs_info->free_chunk_lock);
+			btrfs_clear_space_info_full(root->fs_info);
+			unlock_chunks(root);
+		}
+
+		if (map->stripes[i].dev) {
+			ret = btrfs_update_device(trans, map->stripes[i].dev);
+			if (ret) {
+				btrfs_abort_transaction(trans, root, ret);
+				goto out;
+			}
+		}
+	}
+	ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
+
+	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
+
+	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+	}
+
+	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
+	if (ret) {
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
+	}
+
+out:
+	/* once for us */
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+				u64 chunk_objectid,
+				u64 chunk_offset)
+{
+	struct btrfs_root *extent_root;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	root = root->fs_info->chunk_root;
+	extent_root = root->fs_info->extent_root;
+
+	ret = btrfs_can_relocate(extent_root, chunk_offset);
+	if (ret)
+		return -ENOSPC;
+
+	/* step one, relocate all the extents inside this chunk */
+	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+	if (ret)
+		return ret;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_std_error(root->fs_info, ret);
+		return ret;
+	}
+
+	/*
+	 * step two, delete the device extents and the
+	 * chunk tree entries
+	 */
+	ret = btrfs_remove_chunk(trans, root, chunk_offset);
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+{
+	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u64 chunk_type;
+	bool retried = false;
+	int failed = 0;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+again:
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+		BUG_ON(ret == 0); /* Corruption */
+
+		ret = btrfs_previous_item(chunk_root, path, key.objectid,
+					  key.type);
+		if (ret < 0)
+			goto error;
+		if (ret > 0)
+			break;
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		chunk = btrfs_item_ptr(leaf, path->slots[0],
+				       struct btrfs_chunk);
+		chunk_type = btrfs_chunk_type(leaf, chunk);
+		btrfs_release_path(path);
+
+		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
+			ret = btrfs_relocate_chunk(chunk_root,
+						   found_key.objectid,
+						   found_key.offset);
+			if (ret == -ENOSPC)
+				failed++;
+			else
+				BUG_ON(ret);
+		}
+
+		if (found_key.offset == 0)
+			break;
+		key.offset = found_key.offset - 1;
+	}
+	ret = 0;
+	if (failed && !retried) {
+		failed = 0;
+		retried = true;
+		goto again;
+	} else if (WARN_ON(failed && retried)) {
+		ret = -ENOSPC;
+	}
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int insert_balance_item(struct btrfs_root *root,
+			       struct btrfs_balance_control *bctl)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_balance_item *item;
+	struct btrfs_disk_balance_args disk_bargs;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret, err;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*item));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+	btrfs_set_balance_data(leaf, item, &disk_bargs);
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+	btrfs_set_balance_meta(leaf, item, &disk_bargs);
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+	btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+	btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	err = btrfs_commit_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+	return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret, err;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, root, path);
+out:
+	btrfs_free_path(path);
+	err = btrfs_commit_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+	return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+	/*
+	 * Turn on soft mode for chunk types that were being converted.
+	 */
+	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+	/*
+	 * Turn on usage filter if is not already used.  The idea is
+	 * that chunks that we have already balanced should be
+	 * reasonably full.  Don't do it for chunks that are being
+	 * converted - that will keep us from relocating unconverted
+	 * (albeit full) chunks.
+	 */
+	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->data.usage = 90;
+	}
+	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->sys.usage = 90;
+	}
+	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->meta.usage = 90;
+	}
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper.  Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+	struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+	BUG_ON(fs_info->balance_ctl);
+
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl = bctl;
+	spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	BUG_ON(!fs_info->balance_ctl);
+
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl = NULL;
+	spin_unlock(&fs_info->balance_lock);
+
+	kfree(bctl);
+}
+
+/*
+ * Balance filters.  Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_type,
+				 struct btrfs_balance_args *bargs)
+{
+	chunk_type = chunk_to_extended(chunk_type) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	if (bargs->profiles & chunk_type)
+		return 0;
+
+	return 1;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+			      struct btrfs_balance_args *bargs)
+{
+	struct btrfs_block_group_cache *cache;
+	u64 chunk_used, user_thresh;
+	int ret = 1;
+
+	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+	chunk_used = btrfs_block_group_used(&cache->item);
+
+	if (bargs->usage == 0)
+		user_thresh = 1;
+	else if (bargs->usage > 100)
+		user_thresh = cache->key.offset;
+	else
+		user_thresh = div_factor_fine(cache->key.offset,
+					      bargs->usage);
+
+	if (chunk_used < user_thresh)
+		ret = 0;
+
+	btrfs_put_block_group(cache);
+	return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+			      struct btrfs_chunk *chunk,
+			      struct btrfs_balance_args *bargs)
+{
+	struct btrfs_stripe *stripe;
+	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	int i;
+
+	for (i = 0; i < num_stripes; i++) {
+		stripe = btrfs_stripe_nr(chunk, i);
+		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+			return 0;
+	}
+
+	return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+			       struct btrfs_chunk *chunk,
+			       u64 chunk_offset,
+			       struct btrfs_balance_args *bargs)
+{
+	struct btrfs_stripe *stripe;
+	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	u64 stripe_offset;
+	u64 stripe_length;
+	int factor;
+	int i;
+
+	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+		return 0;
+
+	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+		factor = num_stripes / 2;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+		factor = num_stripes - 1;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+		factor = num_stripes - 2;
+	} else {
+		factor = num_stripes;
+	}
+
+	for (i = 0; i < num_stripes; i++) {
+		stripe = btrfs_stripe_nr(chunk, i);
+		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+			continue;
+
+		stripe_offset = btrfs_stripe_offset(leaf, stripe);
+		stripe_length = btrfs_chunk_length(leaf, chunk);
+		stripe_length = div_u64(stripe_length, factor);
+
+		if (stripe_offset < bargs->pend &&
+		    stripe_offset + stripe_length > bargs->pstart)
+			return 0;
+	}
+
+	return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+			       struct btrfs_chunk *chunk,
+			       u64 chunk_offset,
+			       struct btrfs_balance_args *bargs)
+{
+	if (chunk_offset < bargs->vend &&
+	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+		/* at least part of the chunk is inside this vrange */
+		return 0;
+
+	return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_type,
+				     struct btrfs_balance_args *bargs)
+{
+	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+		return 0;
+
+	chunk_type = chunk_to_extended(chunk_type) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	if (bargs->target == chunk_type)
+		return 1;
+
+	return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+				struct extent_buffer *leaf,
+				struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+	struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+	struct btrfs_balance_args *bargs = NULL;
+	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+	/* type filter */
+	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+		return 0;
+	}
+
+	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+		bargs = &bctl->data;
+	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+		bargs = &bctl->sys;
+	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+		bargs = &bctl->meta;
+
+	/* profiles filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+	    chunk_profiles_filter(chunk_type, bargs)) {
+		return 0;
+	}
+
+	/* usage filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+		return 0;
+	}
+
+	/* devid filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+	    chunk_devid_filter(leaf, chunk, bargs)) {
+		return 0;
+	}
+
+	/* drange filter, makes sense only with devid filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+	    chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+		return 0;
+	}
+
+	/* vrange filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+		return 0;
+	}
+
+	/* soft profile changing mode */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+	    chunk_soft_convert_filter(chunk_type, bargs)) {
+		return 0;
+	}
+
+	/*
+	 * limited by count, must be the last filter
+	 */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
+		if (bargs->limit == 0)
+			return 0;
+		else
+			bargs->limit--;
+	}
+
+	return 1;
+}
+
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+	struct btrfs_root *chunk_root = fs_info->chunk_root;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct list_head *devices;
+	struct btrfs_device *device;
+	u64 old_size;
+	u64 size_to_free;
+	struct btrfs_chunk *chunk;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_trans_handle *trans;
+	struct extent_buffer *leaf;
+	int slot;
+	int ret;
+	int enospc_errors = 0;
+	bool counting = true;
+	u64 limit_data = bctl->data.limit;
+	u64 limit_meta = bctl->meta.limit;
+	u64 limit_sys = bctl->sys.limit;
+
+	/* step one make some room on all the devices */
+	devices = &fs_info->fs_devices->devices;
+	list_for_each_entry(device, devices, dev_list) {
+		old_size = btrfs_device_get_total_bytes(device);
+		size_to_free = div_factor(old_size, 1);
+		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+		if (!device->writeable ||
+		    btrfs_device_get_total_bytes(device) -
+		    btrfs_device_get_bytes_used(device) > size_to_free ||
+		    device->is_tgtdev_for_dev_replace)
+			continue;
+
+		ret = btrfs_shrink_device(device, old_size - size_to_free);
+		if (ret == -ENOSPC)
+			break;
+		BUG_ON(ret);
+
+		trans = btrfs_start_transaction(dev_root, 0);
+		BUG_ON(IS_ERR(trans));
+
+		ret = btrfs_grow_device(trans, device, old_size);
+		BUG_ON(ret);
+
+		btrfs_end_transaction(trans, dev_root);
+	}
+
+	/* step two, relocate all the chunks */
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/* zero out stat counters */
+	spin_lock(&fs_info->balance_lock);
+	memset(&bctl->stat, 0, sizeof(bctl->stat));
+	spin_unlock(&fs_info->balance_lock);
+again:
+	if (!counting) {
+		bctl->data.limit = limit_data;
+		bctl->meta.limit = limit_meta;
+		bctl->sys.limit = limit_sys;
+	}
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	while (1) {
+		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+		    atomic_read(&fs_info->balance_cancel_req)) {
+			ret = -ECANCELED;
+			goto error;
+		}
+
+		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+
+		/*
+		 * this shouldn't happen, it means the last relocate
+		 * failed
+		 */
+		if (ret == 0)
+			BUG(); /* FIXME break ? */
+
+		ret = btrfs_previous_item(chunk_root, path, 0,
+					  BTRFS_CHUNK_ITEM_KEY);
+		if (ret) {
+			ret = 0;
+			break;
+		}
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.objectid != key.objectid)
+			break;
+
+		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+		if (!counting) {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.considered++;
+			spin_unlock(&fs_info->balance_lock);
+		}
+
+		ret = should_balance_chunk(chunk_root, leaf, chunk,
+					   found_key.offset);
+		btrfs_release_path(path);
+		if (!ret)
+			goto loop;
+
+		if (counting) {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.expected++;
+			spin_unlock(&fs_info->balance_lock);
+			goto loop;
+		}
+
+		ret = btrfs_relocate_chunk(chunk_root,
+					   found_key.objectid,
+					   found_key.offset);
+		if (ret && ret != -ENOSPC)
+			goto error;
+		if (ret == -ENOSPC) {
+			enospc_errors++;
+		} else {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.completed++;
+			spin_unlock(&fs_info->balance_lock);
+		}
+loop:
+		if (found_key.offset == 0)
+			break;
+		key.offset = found_key.offset - 1;
+	}
+
+	if (counting) {
+		btrfs_release_path(path);
+		counting = false;
+		goto again;
+	}
+error:
+	btrfs_free_path(path);
+	if (enospc_errors) {
+		btrfs_info(fs_info, "%d enospc errors during balance",
+		       enospc_errors);
+		if (!ret)
+			ret = -ENOSPC;
+	}
+
+	return ret;
+}
+
+/**
+ * alloc_profile_is_valid - see if a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static int alloc_profile_is_valid(u64 flags, int extended)
+{
+	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
+			       BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+	/* 1) check that all other bits are zeroed */
+	if (flags & ~mask)
+		return 0;
+
+	/* 2) see if profile is reduced */
+	if (flags == 0)
+		return !extended; /* "0" is valid for usual profiles */
+
+	/* true if exactly one bit set */
+	return (flags & (flags - 1)) == 0;
+}
+
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+	/* cancel requested || normal exit path */
+	return atomic_read(&fs_info->balance_cancel_req) ||
+		(atomic_read(&fs_info->balance_pause_req) == 0 &&
+		 atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+	int ret;
+
+	unset_balance_control(fs_info);
+	ret = del_balance_item(fs_info->tree_root);
+	if (ret)
+		btrfs_std_error(fs_info, ret);
+
+	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+}
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+		  struct btrfs_ioctl_balance_args *bargs)
+{
+	struct btrfs_fs_info *fs_info = bctl->fs_info;
+	u64 allowed;
+	int mixed = 0;
+	int ret;
+	u64 num_devices;
+	unsigned seq;
+
+	if (btrfs_fs_closing(fs_info) ||
+	    atomic_read(&fs_info->balance_pause_req) ||
+	    atomic_read(&fs_info->balance_cancel_req)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+		mixed = 1;
+
+	/*
+	 * In case of mixed groups both data and meta should be picked,
+	 * and identical options should be given for both of them.
+	 */
+	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
+	if (mixed && (bctl->flags & allowed)) {
+		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+			btrfs_err(fs_info, "with mixed groups data and "
+				   "metadata balance options must be the same");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	num_devices = fs_info->fs_devices->num_devices;
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+		BUG_ON(num_devices < 1);
+		num_devices--;
+	}
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+	if (num_devices == 1)
+		allowed |= BTRFS_BLOCK_GROUP_DUP;
+	else if (num_devices > 1)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+	if (num_devices > 2)
+		allowed |= BTRFS_BLOCK_GROUP_RAID5;
+	if (num_devices > 3)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
+			    BTRFS_BLOCK_GROUP_RAID6);
+	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
+	     (bctl->data.target & ~allowed))) {
+		btrfs_err(fs_info, "unable to start balance with target "
+			   "data profile %llu",
+		       bctl->data.target);
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	    (!alloc_profile_is_valid(bctl->meta.target, 1) ||
+	     (bctl->meta.target & ~allowed))) {
+		btrfs_err(fs_info,
+			   "unable to start balance with target metadata profile %llu",
+		       bctl->meta.target);
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	    (!alloc_profile_is_valid(bctl->sys.target, 1) ||
+	     (bctl->sys.target & ~allowed))) {
+		btrfs_err(fs_info,
+			   "unable to start balance with target system profile %llu",
+		       bctl->sys.target);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* allow dup'ed data chunks only in mixed mode */
+	if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	    (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
+		btrfs_err(fs_info, "dup for data is not allowed");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* allow to reduce meta or sys integrity only if force set */
+	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+			BTRFS_BLOCK_GROUP_RAID10 |
+			BTRFS_BLOCK_GROUP_RAID5 |
+			BTRFS_BLOCK_GROUP_RAID6;
+	do {
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_system_alloc_bits & allowed) &&
+		     !(bctl->sys.target & allowed)) ||
+		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_metadata_alloc_bits & allowed) &&
+		     !(bctl->meta.target & allowed))) {
+			if (bctl->flags & BTRFS_BALANCE_FORCE) {
+				btrfs_info(fs_info, "force reducing metadata integrity");
+			} else {
+				btrfs_err(fs_info, "balance will reduce metadata "
+					   "integrity, use force if you want this");
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
+
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		int num_tolerated_disk_barrier_failures;
+		u64 target = bctl->sys.target;
+
+		num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+		if (num_tolerated_disk_barrier_failures > 0 &&
+		    (target &
+		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		      BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+			num_tolerated_disk_barrier_failures = 0;
+		else if (num_tolerated_disk_barrier_failures > 1 &&
+			 (target &
+			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+			num_tolerated_disk_barrier_failures = 1;
+
+		fs_info->num_tolerated_disk_barrier_failures =
+			num_tolerated_disk_barrier_failures;
+	}
+
+	ret = insert_balance_item(fs_info->tree_root, bctl);
+	if (ret && ret != -EEXIST)
+		goto out;
+
+	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+		BUG_ON(ret == -EEXIST);
+		set_balance_control(bctl);
+	} else {
+		BUG_ON(ret != -EEXIST);
+		spin_lock(&fs_info->balance_lock);
+		update_balance_args(bctl);
+		spin_unlock(&fs_info->balance_lock);
+	}
+
+	atomic_inc(&fs_info->balance_running);
+	mutex_unlock(&fs_info->balance_mutex);
+
+	ret = __btrfs_balance(fs_info);
+
+	mutex_lock(&fs_info->balance_mutex);
+	atomic_dec(&fs_info->balance_running);
+
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		fs_info->num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	}
+
+	if (bargs) {
+		memset(bargs, 0, sizeof(*bargs));
+		update_ioctl_balance_args(fs_info, 0, bargs);
+	}
+
+	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+	    balance_need_close(fs_info)) {
+		__cancel_balance(fs_info);
+	}
+
+	wake_up(&fs_info->balance_wait_q);
+
+	return ret;
+out:
+	if (bctl->flags & BTRFS_BALANCE_RESUME)
+		__cancel_balance(fs_info);
+	else {
+		kfree(bctl);
+		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+	}
+	return ret;
+}
+
+static int balance_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	int ret = 0;
+
+	mutex_lock(&fs_info->volume_mutex);
+	mutex_lock(&fs_info->balance_mutex);
+
+	if (fs_info->balance_ctl) {
+		btrfs_info(fs_info, "continuing balance");
+		ret = btrfs_balance(fs_info->balance_ctl, NULL);
+	}
+
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
+
+	return ret;
+}
+
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *tsk;
+
+	spin_lock(&fs_info->balance_lock);
+	if (!fs_info->balance_ctl) {
+		spin_unlock(&fs_info->balance_lock);
+		return 0;
+	}
+	spin_unlock(&fs_info->balance_lock);
+
+	if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+		btrfs_info(fs_info, "force skipping balance");
+		return 0;
+	}
+
+	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
+	return PTR_ERR_OR_ZERO(tsk);
+}
+
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_balance_control *bctl;
+	struct btrfs_balance_item *item;
+	struct btrfs_disk_balance_args disk_bargs;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) { /* ret = -ENOENT; */
+		ret = 0;
+		goto out;
+	}
+
+	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+	if (!bctl) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+	bctl->fs_info = fs_info;
+	bctl->flags = btrfs_balance_flags(leaf, item);
+	bctl->flags |= BTRFS_BALANCE_RESUME;
+
+	btrfs_balance_data(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+	btrfs_balance_meta(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+	btrfs_balance_sys(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
+	mutex_lock(&fs_info->volume_mutex);
+	mutex_lock(&fs_info->balance_mutex);
+
+	set_balance_control(bctl);
+
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+	int ret = 0;
+
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		mutex_unlock(&fs_info->balance_mutex);
+		return -ENOTCONN;
+	}
+
+	if (atomic_read(&fs_info->balance_running)) {
+		atomic_inc(&fs_info->balance_pause_req);
+		mutex_unlock(&fs_info->balance_mutex);
+
+		wait_event(fs_info->balance_wait_q,
+			   atomic_read(&fs_info->balance_running) == 0);
+
+		mutex_lock(&fs_info->balance_mutex);
+		/* we are good with balance_ctl ripped off from under us */
+		BUG_ON(atomic_read(&fs_info->balance_running));
+		atomic_dec(&fs_info->balance_pause_req);
+	} else {
+		ret = -ENOTCONN;
+	}
+
+	mutex_unlock(&fs_info->balance_mutex);
+	return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		mutex_unlock(&fs_info->balance_mutex);
+		return -ENOTCONN;
+	}
+
+	atomic_inc(&fs_info->balance_cancel_req);
+	/*
+	 * if we are running just wait and return, balance item is
+	 * deleted in btrfs_balance in this case
+	 */
+	if (atomic_read(&fs_info->balance_running)) {
+		mutex_unlock(&fs_info->balance_mutex);
+		wait_event(fs_info->balance_wait_q,
+			   atomic_read(&fs_info->balance_running) == 0);
+		mutex_lock(&fs_info->balance_mutex);
+	} else {
+		/* __cancel_balance needs volume_mutex */
+		mutex_unlock(&fs_info->balance_mutex);
+		mutex_lock(&fs_info->volume_mutex);
+		mutex_lock(&fs_info->balance_mutex);
+
+		if (fs_info->balance_ctl)
+			__cancel_balance(fs_info);
+
+		mutex_unlock(&fs_info->volume_mutex);
+	}
+
+	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+	atomic_dec(&fs_info->balance_cancel_req);
+	mutex_unlock(&fs_info->balance_mutex);
+	return 0;
+}
+
+static int btrfs_uuid_scan_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_key key;
+	struct btrfs_key max_key;
+	struct btrfs_path *path = NULL;
+	int ret = 0;
+	struct extent_buffer *eb;
+	int slot;
+	struct btrfs_root_item root_item;
+	u32 item_size;
+	struct btrfs_trans_handle *trans = NULL;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = 0;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+
+	max_key.objectid = (u64)-1;
+	max_key.type = BTRFS_ROOT_ITEM_KEY;
+	max_key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_forward(root, &key, path, 0);
+		if (ret) {
+			if (ret > 0)
+				ret = 0;
+			break;
+		}
+
+		if (key.type != BTRFS_ROOT_ITEM_KEY ||
+		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
+		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
+		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
+			goto skip;
+
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		item_size = btrfs_item_size_nr(eb, slot);
+		if (item_size < sizeof(root_item))
+			goto skip;
+
+		read_extent_buffer(eb, &root_item,
+				   btrfs_item_ptr_offset(eb, slot),
+				   (int)sizeof(root_item));
+		if (btrfs_root_refs(&root_item) == 0)
+			goto skip;
+
+		if (!btrfs_is_empty_uuid(root_item.uuid) ||
+		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
+			if (trans)
+				goto update_tree;
+
+			btrfs_release_path(path);
+			/*
+			 * 1 - subvol uuid item
+			 * 1 - received_subvol uuid item
+			 */
+			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				break;
+			}
+			continue;
+		} else {
+			goto skip;
+		}
+update_tree:
+		if (!btrfs_is_empty_uuid(root_item.uuid)) {
+			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+						  root_item.uuid,
+						  BTRFS_UUID_KEY_SUBVOL,
+						  key.objectid);
+			if (ret < 0) {
+				btrfs_warn(fs_info, "uuid_tree_add failed %d",
+					ret);
+				break;
+			}
+		}
+
+		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
+			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+						  root_item.received_uuid,
+						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+						  key.objectid);
+			if (ret < 0) {
+				btrfs_warn(fs_info, "uuid_tree_add failed %d",
+					ret);
+				break;
+			}
+		}
+
+skip:
+		if (trans) {
+			ret = btrfs_end_transaction(trans, fs_info->uuid_root);
+			trans = NULL;
+			if (ret)
+				break;
+		}
+
+		btrfs_release_path(path);
+		if (key.offset < (u64)-1) {
+			key.offset++;
+		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
+			key.offset = 0;
+			key.type = BTRFS_ROOT_ITEM_KEY;
+		} else if (key.objectid < (u64)-1) {
+			key.offset = 0;
+			key.type = BTRFS_ROOT_ITEM_KEY;
+			key.objectid++;
+		} else {
+			break;
+		}
+		cond_resched();
+	}
+
+out:
+	btrfs_free_path(path);
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, fs_info->uuid_root);
+	if (ret)
+		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
+	else
+		fs_info->update_uuid_tree_gen = 1;
+	up(&fs_info->uuid_tree_rescan_sem);
+	return 0;
+}
+
+/*
+ * Callback for btrfs_uuid_tree_iterate().
+ * returns:
+ * 0	check succeeded, the entry is not outdated.
+ * < 0	if an error occured.
+ * > 0	if the check failed, which means the caller shall remove the entry.
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+				       u8 *uuid, u8 type, u64 subid)
+{
+	struct btrfs_key key;
+	int ret = 0;
+	struct btrfs_root *subvol_root;
+
+	if (type != BTRFS_UUID_KEY_SUBVOL &&
+	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+		goto out;
+
+	key.objectid = subid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(subvol_root)) {
+		ret = PTR_ERR(subvol_root);
+		if (ret == -ENOENT)
+			ret = 1;
+		goto out;
+	}
+
+	switch (type) {
+	case BTRFS_UUID_KEY_SUBVOL:
+		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+			ret = 1;
+		break;
+	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+		if (memcmp(uuid, subvol_root->root_item.received_uuid,
+			   BTRFS_UUID_SIZE))
+			ret = 1;
+		break;
+	}
+
+out:
+	return ret;
+}
+
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+	int ret;
+
+	/*
+	 * 1st step is to iterate through the existing UUID tree and
+	 * to delete all entries that contain outdated data.
+	 * 2nd step is to add all missing entries to the UUID tree.
+	 */
+	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
+	if (ret < 0) {
+		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
+		up(&fs_info->uuid_tree_rescan_sem);
+		return ret;
+	}
+	return btrfs_uuid_scan_kthread(data);
+}
+
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *uuid_root;
+	struct task_struct *task;
+	int ret;
+
+	/*
+	 * 1 - root node
+	 * 1 - root item
+	 */
+	trans = btrfs_start_transaction(tree_root, 2);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	uuid_root = btrfs_create_tree(trans, fs_info,
+				      BTRFS_UUID_TREE_OBJECTID);
+	if (IS_ERR(uuid_root)) {
+		btrfs_abort_transaction(trans, tree_root,
+					PTR_ERR(uuid_root));
+		return PTR_ERR(uuid_root);
+	}
+
+	fs_info->uuid_root = uuid_root;
+
+	ret = btrfs_commit_transaction(trans, tree_root);
+	if (ret)
+		return ret;
+
+	down(&fs_info->uuid_tree_rescan_sem);
+	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
+	if (IS_ERR(task)) {
+		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
+		btrfs_warn(fs_info, "failed to start uuid_scan task");
+		up(&fs_info->uuid_tree_rescan_sem);
+		return PTR_ERR(task);
+	}
+
+	return 0;
+}
+
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *task;
+
+	down(&fs_info->uuid_tree_rescan_sem);
+	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+	if (IS_ERR(task)) {
+		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
+		btrfs_warn(fs_info, "failed to start uuid_rescan task");
+		up(&fs_info->uuid_tree_rescan_sem);
+		return PTR_ERR(task);
+	}
+
+	return 0;
+}
+
+/*
+ * shrinking a device means finding all of the device extents past
+ * the new size, and then following the back refs to the chunks.
+ * The chunk relocation code actually frees the device extent
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
+	u64 length;
+	u64 chunk_objectid;
+	u64 chunk_offset;
+	int ret;
+	int slot;
+	int failed = 0;
+	bool retried = false;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+	u64 old_total = btrfs_super_total_bytes(super_copy);
+	u64 old_size = btrfs_device_get_total_bytes(device);
+	u64 diff = old_size - new_size;
+
+	if (device->is_tgtdev_for_dev_replace)
+		return -EINVAL;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 2;
+
+	lock_chunks(root);
+
+	btrfs_device_set_total_bytes(device, new_size);
+	if (device->writeable) {
+		device->fs_devices->total_rw_bytes -= diff;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space -= diff;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+	}
+	unlock_chunks(root);
+
+again:
+	key.objectid = device->devid;
+	key.offset = (u64)-1;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	do {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto done;
+
+		ret = btrfs_previous_item(root, path, 0, key.type);
+		if (ret < 0)
+			goto done;
+		if (ret) {
+			ret = 0;
+			btrfs_release_path(path);
+			break;
+		}
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
+		if (key.objectid != device->devid) {
+			btrfs_release_path(path);
+			break;
+		}
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		length = btrfs_dev_extent_length(l, dev_extent);
+
+		if (key.offset + length <= new_size) {
+			btrfs_release_path(path);
+			break;
+		}
+
+		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+		btrfs_release_path(path);
+
+		ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+		if (ret && ret != -ENOSPC)
+			goto done;
+		if (ret == -ENOSPC)
+			failed++;
+	} while (key.offset-- > 0);
+
+	if (failed && !retried) {
+		failed = 0;
+		retried = true;
+		goto again;
+	} else if (failed && retried) {
+		ret = -ENOSPC;
+		lock_chunks(root);
+
+		btrfs_device_set_total_bytes(device, old_size);
+		if (device->writeable)
+			device->fs_devices->total_rw_bytes += diff;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += diff;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+		unlock_chunks(root);
+		goto done;
+	}
+
+	/* Shrinking succeeded, else we would be at "done". */
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto done;
+	}
+
+	lock_chunks(root);
+	btrfs_device_set_disk_total_bytes(device, new_size);
+	if (list_empty(&device->resized_list))
+		list_add_tail(&device->resized_list,
+			      &root->fs_info->fs_devices->resized_devices);
+
+	WARN_ON(diff > old_total);
+	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	unlock_chunks(root);
+
+	/* Now btrfs_update_device() will change the on-disk size. */
+	ret = btrfs_update_device(trans, device);
+	btrfs_end_transaction(trans, root);
+done:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_add_system_chunk(struct btrfs_root *root,
+			   struct btrfs_key *key,
+			   struct btrfs_chunk *chunk, int item_size)
+{
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+	struct btrfs_disk_key disk_key;
+	u32 array_size;
+	u8 *ptr;
+
+	lock_chunks(root);
+	array_size = btrfs_super_sys_array_size(super_copy);
+	if (array_size + item_size + sizeof(disk_key)
+			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+		unlock_chunks(root);
+		return -EFBIG;
+	}
+
+	ptr = super_copy->sys_chunk_array + array_size;
+	btrfs_cpu_key_to_disk(&disk_key, key);
+	memcpy(ptr, &disk_key, sizeof(disk_key));
+	ptr += sizeof(disk_key);
+	memcpy(ptr, chunk, item_size);
+	item_size += sizeof(disk_key);
+	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+	unlock_chunks(root);
+
+	return 0;
+}
+
+/*
+ * sort the devices in descending order by max_avail, total_avail
+ */
+static int btrfs_cmp_device_info(const void *a, const void *b)
+{
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	if (di_a->max_avail > di_b->max_avail)
+		return -1;
+	if (di_a->max_avail < di_b->max_avail)
+		return 1;
+	if (di_a->total_avail > di_b->total_avail)
+		return -1;
+	if (di_a->total_avail < di_b->total_avail)
+		return 1;
+	return 0;
+}
+
+static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+	[BTRFS_RAID_RAID10] = {
+		.sub_stripes	= 2,
+		.dev_stripes	= 1,
+		.devs_max	= 0,	/* 0 == as many as possible */
+		.devs_min	= 4,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID1] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 2,
+		.devs_min	= 2,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_DUP] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 2,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID0] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_SINGLE] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_RAID5] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID6] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 3,
+		.devs_increment	= 1,
+		.ncopies	= 3,
+	},
+};
+
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+	/* TODO allow them to set a preferred stripe size */
+	return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		return;
+
+	btrfs_set_fs_incompat(info, RAID56);
+}
+
+#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r)		\
+			- sizeof(struct btrfs_item)		\
+			- sizeof(struct btrfs_chunk))		\
+			/ sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
+				- 2 * sizeof(struct btrfs_disk_key)	\
+				- 2 * sizeof(struct btrfs_chunk))	\
+				/ sizeof(struct btrfs_stripe) + 1)
+
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root, u64 start,
+			       u64 type)
+{
+	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct btrfs_fs_devices *fs_devices = info->fs_devices;
+	struct list_head *cur;
+	struct map_lookup *map = NULL;
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct btrfs_device_info *devices_info = NULL;
+	u64 total_avail;
+	int num_stripes;	/* total number of stripes to allocate */
+	int data_stripes;	/* number of stripes that count for
+				   block group size */
+	int sub_stripes;	/* sub_stripes info for map */
+	int dev_stripes;	/* stripes per dev */
+	int devs_max;		/* max devs to use */
+	int devs_min;		/* min devs needed */
+	int devs_increment;	/* ndevs has to be a multiple of this */
+	int ncopies;		/* how many copies to data has */
+	int ret;
+	u64 max_stripe_size;
+	u64 max_chunk_size;
+	u64 stripe_size;
+	u64 num_bytes;
+	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
+	int ndevs;
+	int i;
+	int j;
+	int index;
+
+	BUG_ON(!alloc_profile_is_valid(type, 0));
+
+	if (list_empty(&fs_devices->alloc_list))
+		return -ENOSPC;
+
+	index = __get_raid_index(type);
+
+	sub_stripes = btrfs_raid_array[index].sub_stripes;
+	dev_stripes = btrfs_raid_array[index].dev_stripes;
+	devs_max = btrfs_raid_array[index].devs_max;
+	devs_min = btrfs_raid_array[index].devs_min;
+	devs_increment = btrfs_raid_array[index].devs_increment;
+	ncopies = btrfs_raid_array[index].ncopies;
+
+	if (type & BTRFS_BLOCK_GROUP_DATA) {
+		max_stripe_size = 1024 * 1024 * 1024;
+		max_chunk_size = 10 * max_stripe_size;
+		if (!devs_max)
+			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+		/* for larger filesystems, use larger metadata chunks */
+		if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+			max_stripe_size = 1024 * 1024 * 1024;
+		else
+			max_stripe_size = 256 * 1024 * 1024;
+		max_chunk_size = max_stripe_size;
+		if (!devs_max)
+			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		max_stripe_size = 32 * 1024 * 1024;
+		max_chunk_size = 2 * max_stripe_size;
+		if (!devs_max)
+			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
+	} else {
+		btrfs_err(info, "invalid chunk type 0x%llx requested",
+		       type);
+		BUG_ON(1);
+	}
+
+	/* we don't want a chunk larger than 10% of writeable space */
+	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+			     max_chunk_size);
+
+	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+			       GFP_NOFS);
+	if (!devices_info)
+		return -ENOMEM;
+
+	cur = fs_devices->alloc_list.next;
+
+	/*
+	 * in the first pass through the devices list, we gather information
+	 * about the available holes on each device.
+	 */
+	ndevs = 0;
+	while (cur != &fs_devices->alloc_list) {
+		struct btrfs_device *device;
+		u64 max_avail;
+		u64 dev_offset;
+
+		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
+
+		cur = cur->next;
+
+		if (!device->writeable) {
+			WARN(1, KERN_ERR
+			       "BTRFS: read-only device in alloc_list\n");
+			continue;
+		}
+
+		if (!device->in_fs_metadata ||
+		    device->is_tgtdev_for_dev_replace)
+			continue;
+
+		if (device->total_bytes > device->bytes_used)
+			total_avail = device->total_bytes - device->bytes_used;
+		else
+			total_avail = 0;
+
+		/* If there is no space on this device, skip it. */
+		if (total_avail == 0)
+			continue;
+
+		ret = find_free_dev_extent(trans, device,
+					   max_stripe_size * dev_stripes,
+					   &dev_offset, &max_avail);
+		if (ret && ret != -ENOSPC)
+			goto error;
+
+		if (ret == 0)
+			max_avail = max_stripe_size * dev_stripes;
+
+		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
+			continue;
+
+		if (ndevs == fs_devices->rw_devices) {
+			WARN(1, "%s: found more than %llu devices\n",
+			     __func__, fs_devices->rw_devices);
+			break;
+		}
+		devices_info[ndevs].dev_offset = dev_offset;
+		devices_info[ndevs].max_avail = max_avail;
+		devices_info[ndevs].total_avail = total_avail;
+		devices_info[ndevs].dev = device;
+		++ndevs;
+	}
+
+	/*
+	 * now sort the devices by hole size / available space
+	 */
+	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+	     btrfs_cmp_device_info, NULL);
+
+	/* round down to number of usable stripes */
+	ndevs -= ndevs % devs_increment;
+
+	if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
+		ret = -ENOSPC;
+		goto error;
+	}
+
+	if (devs_max && ndevs > devs_max)
+		ndevs = devs_max;
+	/*
+	 * the primary goal is to maximize the number of stripes, so use as many
+	 * devices as possible, even if the stripes are not maximum sized.
+	 */
+	stripe_size = devices_info[ndevs-1].max_avail;
+	num_stripes = ndevs * dev_stripes;
+
+	/*
+	 * this will have to be fixed for RAID1 and RAID10 over
+	 * more drives
+	 */
+	data_stripes = num_stripes / ncopies;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID5) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 1;
+	}
+	if (type & BTRFS_BLOCK_GROUP_RAID6) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 2;
+	}
+
+	/*
+	 * Use the number of data stripes to figure out how big this chunk
+	 * is really going to be in terms of logical address space,
+	 * and compare that answer with the max chunk size
+	 */
+	if (stripe_size * data_stripes > max_chunk_size) {
+		u64 mask = (1ULL << 24) - 1;
+
+		stripe_size = div_u64(max_chunk_size, data_stripes);
+
+		/* bump the answer up to a 16MB boundary */
+		stripe_size = (stripe_size + mask) & ~mask;
+
+		/* but don't go higher than the limits we found
+		 * while searching for free extents
+		 */
+		if (stripe_size > devices_info[ndevs-1].max_avail)
+			stripe_size = devices_info[ndevs-1].max_avail;
+	}
+
+	stripe_size = div_u64(stripe_size, dev_stripes);
+
+	/* align to BTRFS_STRIPE_LEN */
+	stripe_size = div_u64(stripe_size, raid_stripe_len);
+	stripe_size *= raid_stripe_len;
+
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+	if (!map) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	map->num_stripes = num_stripes;
+
+	for (i = 0; i < ndevs; ++i) {
+		for (j = 0; j < dev_stripes; ++j) {
+			int s = i * dev_stripes + j;
+			map->stripes[s].dev = devices_info[i].dev;
+			map->stripes[s].physical = devices_info[i].dev_offset +
+						   j * stripe_size;
+		}
+	}
+	map->sector_size = extent_root->sectorsize;
+	map->stripe_len = raid_stripe_len;
+	map->io_align = raid_stripe_len;
+	map->io_width = raid_stripe_len;
+	map->type = type;
+	map->sub_stripes = sub_stripes;
+
+	num_bytes = stripe_size * data_stripes;
+
+	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
+
+	em = alloc_extent_map();
+	if (!em) {
+		kfree(map);
+		ret = -ENOMEM;
+		goto error;
+	}
+	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+	em->bdev = (struct block_device *)map;
+	em->start = start;
+	em->len = num_bytes;
+	em->block_start = 0;
+	em->block_len = em->len;
+	em->orig_block_len = stripe_size;
+
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	write_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em, 0);
+	if (!ret) {
+		list_add_tail(&em->list, &trans->transaction->pending_chunks);
+		atomic_inc(&em->refs);
+	}
+	write_unlock(&em_tree->lock);
+	if (ret) {
+		free_extent_map(em);
+		goto error;
+	}
+
+	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				     start, num_bytes);
+	if (ret)
+		goto error_del_extent;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+	}
+
+	spin_lock(&extent_root->fs_info->free_chunk_lock);
+	extent_root->fs_info->free_chunk_space -= (stripe_size *
+						   map->num_stripes);
+	spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
+	free_extent_map(em);
+	check_raid56_incompat_flag(extent_root->fs_info, type);
+
+	kfree(devices_info);
+	return 0;
+
+error_del_extent:
+	write_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	write_unlock(&em_tree->lock);
+
+	/* One for our allocation */
+	free_extent_map(em);
+	/* One for the tree reference */
+	free_extent_map(em);
+	/* One for the pending_chunks list reference */
+	free_extent_map(em);
+error:
+	kfree(devices_info);
+	return ret;
+}
+
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				u64 chunk_offset, u64 chunk_size)
+{
+	struct btrfs_key key;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_device *device;
+	struct btrfs_chunk *chunk;
+	struct btrfs_stripe *stripe;
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	size_t item_size;
+	u64 dev_offset;
+	u64 stripe_size;
+	int i = 0;
+	int ret;
+
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		btrfs_crit(extent_root->fs_info, "unable to find logical "
+			   "%Lu len %Lu", chunk_offset, chunk_size);
+		return -EINVAL;
+	}
+
+	if (em->start != chunk_offset || em->len != chunk_size) {
+		btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+			  " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
+			  chunk_size, em->start, em->len);
+		free_extent_map(em);
+		return -EINVAL;
+	}
+
+	map = (struct map_lookup *)em->bdev;
+	item_size = btrfs_chunk_item_size(map->num_stripes);
+	stripe_size = em->orig_block_len;
+
+	chunk = kzalloc(item_size, GFP_NOFS);
+	if (!chunk) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < map->num_stripes; i++) {
+		device = map->stripes[i].dev;
+		dev_offset = map->stripes[i].physical;
+
+		ret = btrfs_update_device(trans, device);
+		if (ret)
+			goto out;
+		ret = btrfs_alloc_dev_extent(trans, device,
+					     chunk_root->root_key.objectid,
+					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+					     chunk_offset, dev_offset,
+					     stripe_size);
+		if (ret)
+			goto out;
+	}
+
+	stripe = &chunk->stripe;
+	for (i = 0; i < map->num_stripes; i++) {
+		device = map->stripes[i].dev;
+		dev_offset = map->stripes[i].physical;
+
+		btrfs_set_stack_stripe_devid(stripe, device->devid);
+		btrfs_set_stack_stripe_offset(stripe, dev_offset);
+		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
+		stripe++;
+	}
+
+	btrfs_set_stack_chunk_length(chunk, chunk_size);
+	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_type(chunk, map->type);
+	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
+
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	key.offset = chunk_offset;
+
+	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		/*
+		 * TODO: Cleanup of inserted chunk root in case of
+		 * failure.
+		 */
+		ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
+					     item_size);
+	}
+
+out:
+	kfree(chunk);
+	free_extent_map(em);
+	return ret;
+}
+
+/*
+ * Chunk allocation falls into two parts. The first part does works
+ * that make the new allocated chunk useable, but not do any operation
+ * that modifies the chunk tree. The second part does the works that
+ * require modifying the chunk tree. This division is important for the
+ * bootstrap process of adding storage to a seed btrfs.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 type)
+{
+	u64 chunk_offset;
+
+	ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
+	chunk_offset = find_next_chunk(extent_root->fs_info);
+	return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
+}
+
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct btrfs_device *device)
+{
+	u64 chunk_offset;
+	u64 sys_chunk_offset;
+	u64 alloc_profile;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+
+	chunk_offset = find_next_chunk(fs_info);
+	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
+	ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+				  alloc_profile);
+	if (ret)
+		return ret;
+
+	sys_chunk_offset = find_next_chunk(root->fs_info);
+	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
+	ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+				  alloc_profile);
+	return ret;
+}
+
+static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+{
+	int max_errors;
+
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_RAID10 |
+			 BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_DUP)) {
+		max_errors = 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+		max_errors = 2;
+	} else {
+		max_errors = 0;
+	}
+
+	return max_errors;
+}
+
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	int readonly = 0;
+	int miss_ndevs = 0;
+	int i;
+
+	read_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+	read_unlock(&map_tree->map_tree.lock);
+	if (!em)
+		return 1;
+
+	map = (struct map_lookup *)em->bdev;
+	for (i = 0; i < map->num_stripes; i++) {
+		if (map->stripes[i].dev->missing) {
+			miss_ndevs++;
+			continue;
+		}
+
+		if (!map->stripes[i].dev->writeable) {
+			readonly = 1;
+			goto end;
+		}
+	}
+
+	/*
+	 * If the number of missing devices is larger than max errors,
+	 * we can not write the data into that chunk successfully, so
+	 * set it readonly.
+	 */
+	if (miss_ndevs > btrfs_chunk_max_errors(map))
+		readonly = 1;
+end:
+	free_extent_map(em);
+	return readonly;
+}
+
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+	extent_map_tree_init(&tree->map_tree);
+}
+
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+	struct extent_map *em;
+
+	while (1) {
+		write_lock(&tree->map_tree.lock);
+		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+		if (em)
+			remove_extent_mapping(&tree->map_tree, em);
+		write_unlock(&tree->map_tree.lock);
+		if (!em)
+			break;
+		/* once for us */
+		free_extent_map(em);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+}
+
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
+{
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+
+	/*
+	 * We could return errors for these cases, but that could get ugly and
+	 * we'd probably do the same thing which is just not do anything else
+	 * and exit, so return 1 so the callers don't try to use other copies.
+	 */
+	if (!em) {
+		btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
+			    logical+len);
+		return 1;
+	}
+
+	if (em->start > logical || em->start + em->len < logical) {
+		btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
+			    "%Lu-%Lu", logical, logical+len, em->start,
+			    em->start + em->len);
+		free_extent_map(em);
+		return 1;
+	}
+
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+		ret = map->num_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
+	else
+		ret = 1;
+	free_extent_map(em);
+
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+		ret++;
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+	return ret;
+}
+
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	unsigned long len = root->sectorsize;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+		len = map->stripe_len * nr_data_stripes(map);
+	free_extent_map(em);
+	return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+		ret = 1;
+	free_extent_map(em);
+	return ret;
+}
+
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+			    struct map_lookup *map, int first, int num,
+			    int optimal, int dev_replace_is_ongoing)
+{
+	int i;
+	int tolerance;
+	struct btrfs_device *srcdev;
+
+	if (dev_replace_is_ongoing &&
+	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+		srcdev = fs_info->dev_replace.srcdev;
+	else
+		srcdev = NULL;
+
+	/*
+	 * try to avoid the drive that is the source drive for a
+	 * dev-replace procedure, only choose it if no other non-missing
+	 * mirror is available
+	 */
+	for (tolerance = 0; tolerance < 2; tolerance++) {
+		if (map->stripes[optimal].dev->bdev &&
+		    (tolerance || map->stripes[optimal].dev != srcdev))
+			return optimal;
+		for (i = first; i < first + num; i++) {
+			if (map->stripes[i].dev->bdev &&
+			    (tolerance || map->stripes[i].dev != srcdev))
+				return i;
+		}
+	}
+
+	/* we couldn't find one that doesn't fail.  Just return something
+	 * and the io error handling code will clean up eventually
+	 */
+	return optimal;
+}
+
+static inline int parity_smaller(u64 a, u64 b)
+{
+	return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+{
+	struct btrfs_bio_stripe s;
+	int i;
+	u64 l;
+	int again = 1;
+
+	while (again) {
+		again = 0;
+		for (i = 0; i < num_stripes - 1; i++) {
+			if (parity_smaller(bbio->raid_map[i],
+					   bbio->raid_map[i+1])) {
+				s = bbio->stripes[i];
+				l = bbio->raid_map[i];
+				bbio->stripes[i] = bbio->stripes[i+1];
+				bbio->raid_map[i] = bbio->raid_map[i+1];
+				bbio->stripes[i+1] = s;
+				bbio->raid_map[i+1] = l;
+
+				again = 1;
+			}
+		}
+	}
+}
+
+static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+{
+	struct btrfs_bio *bbio = kzalloc(
+		 /* the size of the btrfs_bio */
+		sizeof(struct btrfs_bio) +
+		/* plus the variable array for the stripes */
+		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+		/* plus the variable array for the tgt dev */
+		sizeof(int) * (real_stripes) +
+		/*
+		 * plus the raid_map, which includes both the tgt dev
+		 * and the stripes
+		 */
+		sizeof(u64) * (total_stripes),
+		GFP_NOFS);
+	if (!bbio)
+		return NULL;
+
+	atomic_set(&bbio->error, 0);
+	atomic_set(&bbio->refs, 1);
+
+	return bbio;
+}
+
+void btrfs_get_bbio(struct btrfs_bio *bbio)
+{
+	WARN_ON(!atomic_read(&bbio->refs));
+	atomic_inc(&bbio->refs);
+}
+
+void btrfs_put_bbio(struct btrfs_bio *bbio)
+{
+	if (!bbio)
+		return;
+	if (atomic_dec_and_test(&bbio->refs))
+		kfree(bbio);
+}
+
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+			     u64 logical, u64 *length,
+			     struct btrfs_bio **bbio_ret,
+			     int mirror_num, int need_raid_map)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	u64 offset;
+	u64 stripe_offset;
+	u64 stripe_end_offset;
+	u64 stripe_nr;
+	u64 stripe_nr_orig;
+	u64 stripe_nr_end;
+	u64 stripe_len;
+	u32 stripe_index;
+	int i;
+	int ret = 0;
+	int num_stripes;
+	int max_errors = 0;
+	int tgtdev_indexes = 0;
+	struct btrfs_bio *bbio = NULL;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	int dev_replace_is_ongoing = 0;
+	int num_alloc_stripes;
+	int patch_the_first_stripe_for_dev_replace = 0;
+	u64 physical_to_patch_in_first_stripe = 0;
+	u64 raid56_full_stripe_start = (u64)-1;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, *length);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		btrfs_crit(fs_info, "unable to find logical %llu len %llu",
+			logical, *length);
+		return -EINVAL;
+	}
+
+	if (em->start > logical || em->start + em->len < logical) {
+		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
+			   "found %Lu-%Lu", logical, em->start,
+			   em->start + em->len);
+		free_extent_map(em);
+		return -EINVAL;
+	}
+
+	map = (struct map_lookup *)em->bdev;
+	offset = logical - em->start;
+
+	stripe_len = map->stripe_len;
+	stripe_nr = offset;
+	/*
+	 * stripe_nr counts the total number of stripes we have to stride
+	 * to get to this block
+	 */
+	stripe_nr = div64_u64(stripe_nr, stripe_len);
+
+	stripe_offset = stripe_nr * stripe_len;
+	BUG_ON(offset < stripe_offset);
+
+	/* stripe_offset is the offset of this block in its stripe*/
+	stripe_offset = offset - stripe_offset;
+
+	/* if we're here for raid56, we need to know the stripe aligned start */
+	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		raid56_full_stripe_start = offset;
+
+		/* allow a write of a full stripe, but make sure we don't
+		 * allow straddling of stripes
+		 */
+		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+				full_stripe_len);
+		raid56_full_stripe_start *= full_stripe_len;
+	}
+
+	if (rw & REQ_DISCARD) {
+		/* we don't discard raid56 yet */
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+		*length = min_t(u64, em->len - offset, *length);
+	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len;
+		/* For writes to RAID[56], allow a full stripeset across all disks.
+		   For other RAID types and for RAID[56] reads, just allow a single
+		   stripe (on a single disk). */
+		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+		    (rw & REQ_WRITE)) {
+			max_len = stripe_len * nr_data_stripes(map) -
+				(offset - raid56_full_stripe_start);
+		} else {
+			/* we limit the length of each bio to what fits in a stripe */
+			max_len = stripe_len - stripe_offset;
+		}
+		*length = min_t(u64, em->len - offset, max_len);
+	} else {
+		*length = em->len - offset;
+	}
+
+	/* This is for when we're called from btrfs_merge_bio_hook() and all
+	   it cares about is the length */
+	if (!bbio_ret)
+		goto out;
+
+	btrfs_dev_replace_lock(dev_replace);
+	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+	if (!dev_replace_is_ongoing)
+		btrfs_dev_replace_unlock(dev_replace);
+
+	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+	    dev_replace->tgtdev != NULL) {
+		/*
+		 * in dev-replace case, for repair case (that's the only
+		 * case where the mirror is selected explicitly when
+		 * calling btrfs_map_block), blocks left of the left cursor
+		 * can also be read from the target drive.
+		 * For REQ_GET_READ_MIRRORS, the target drive is added as
+		 * the last one to the array of stripes. For READ, it also
+		 * needs to be supported using the same mirror number.
+		 * If the requested block is not left of the left cursor,
+		 * EIO is returned. This can happen because btrfs_num_copies()
+		 * returns one more in the dev-replace case.
+		 */
+		u64 tmp_length = *length;
+		struct btrfs_bio *tmp_bbio = NULL;
+		int tmp_num_stripes;
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+		int index_srcdev = 0;
+		int found = 0;
+		u64 physical_of_found = 0;
+
+		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+			     logical, &tmp_length, &tmp_bbio, 0, 0);
+		if (ret) {
+			WARN_ON(tmp_bbio != NULL);
+			goto out;
+		}
+
+		tmp_num_stripes = tmp_bbio->num_stripes;
+		if (mirror_num > tmp_num_stripes) {
+			/*
+			 * REQ_GET_READ_MIRRORS does not contain this
+			 * mirror, that means that the requested area
+			 * is not left of the left cursor
+			 */
+			ret = -EIO;
+			btrfs_put_bbio(tmp_bbio);
+			goto out;
+		}
+
+		/*
+		 * process the rest of the function using the mirror_num
+		 * of the source drive. Therefore look it up first.
+		 * At the end, patch the device pointer to the one of the
+		 * target drive.
+		 */
+		for (i = 0; i < tmp_num_stripes; i++) {
+			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+				/*
+				 * In case of DUP, in order to keep it
+				 * simple, only add the mirror with the
+				 * lowest physical address
+				 */
+				if (found &&
+				    physical_of_found <=
+				     tmp_bbio->stripes[i].physical)
+					continue;
+				index_srcdev = i;
+				found = 1;
+				physical_of_found =
+					tmp_bbio->stripes[i].physical;
+			}
+		}
+
+		if (found) {
+			mirror_num = index_srcdev + 1;
+			patch_the_first_stripe_for_dev_replace = 1;
+			physical_to_patch_in_first_stripe = physical_of_found;
+		} else {
+			WARN_ON(1);
+			ret = -EIO;
+			btrfs_put_bbio(tmp_bbio);
+			goto out;
+		}
+
+		btrfs_put_bbio(tmp_bbio);
+	} else if (mirror_num > map->num_stripes) {
+		mirror_num = 0;
+	}
+
+	num_stripes = 1;
+	stripe_index = 0;
+	stripe_nr_orig = stripe_nr;
+	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
+	stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
+	stripe_end_offset = stripe_nr_end * map->stripe_len -
+			    (offset + *length);
+
+	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+		if (rw & REQ_DISCARD)
+			num_stripes = min_t(u64, map->num_stripes,
+					    stripe_nr_end - stripe_nr_orig);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
+		if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
+			mirror_num = 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
+			num_stripes = map->num_stripes;
+		else if (mirror_num)
+			stripe_index = mirror_num - 1;
+		else {
+			stripe_index = find_live_mirror(fs_info, map, 0,
+					    map->num_stripes,
+					    current->pid % map->num_stripes,
+					    dev_replace_is_ongoing);
+			mirror_num = stripe_index + 1;
+		}
+
+	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
+			num_stripes = map->num_stripes;
+		} else if (mirror_num) {
+			stripe_index = mirror_num - 1;
+		} else {
+			mirror_num = 1;
+		}
+
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+		u32 factor = map->num_stripes / map->sub_stripes;
+
+		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+		stripe_index *= map->sub_stripes;
+
+		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+			num_stripes = map->sub_stripes;
+		else if (rw & REQ_DISCARD)
+			num_stripes = min_t(u64, map->sub_stripes *
+					    (stripe_nr_end - stripe_nr_orig),
+					    map->num_stripes);
+		else if (mirror_num)
+			stripe_index += mirror_num - 1;
+		else {
+			int old_stripe_index = stripe_index;
+			stripe_index = find_live_mirror(fs_info, map,
+					      stripe_index,
+					      map->sub_stripes, stripe_index +
+					      current->pid % map->sub_stripes,
+					      dev_replace_is_ongoing);
+			mirror_num = stripe_index - old_stripe_index + 1;
+		}
+
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		if (need_raid_map &&
+		    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+		     mirror_num > 1)) {
+			/* push stripe_nr back to the start of the full stripe */
+			stripe_nr = div_u64(raid56_full_stripe_start,
+					stripe_len * nr_data_stripes(map));
+
+			/* RAID[56] write or recovery. Return all stripes */
+			num_stripes = map->num_stripes;
+			max_errors = nr_parity_stripes(map);
+
+			*length = map->stripe_len;
+			stripe_index = 0;
+			stripe_offset = 0;
+		} else {
+			/*
+			 * Mirror #0 or #1 means the original data block.
+			 * Mirror #2 is RAID5 parity block.
+			 * Mirror #3 is RAID6 Q block.
+			 */
+			stripe_nr = div_u64_rem(stripe_nr,
+					nr_data_stripes(map), &stripe_index);
+			if (mirror_num > 1)
+				stripe_index = nr_data_stripes(map) +
+						mirror_num - 2;
+
+			/* We distribute the parity blocks across stripes */
+			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+					&stripe_index);
+			if (!(rw & (REQ_WRITE | REQ_DISCARD |
+				    REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
+				mirror_num = 1;
+		}
+	} else {
+		/*
+		 * after this, stripe_nr is the number of stripes on this
+		 * device we have to walk to find the data, and stripe_index is
+		 * the number of our device in the stripe array
+		 */
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
+		mirror_num = stripe_index + 1;
+	}
+	BUG_ON(stripe_index >= map->num_stripes);
+
+	num_alloc_stripes = num_stripes;
+	if (dev_replace_is_ongoing) {
+		if (rw & (REQ_WRITE | REQ_DISCARD))
+			num_alloc_stripes <<= 1;
+		if (rw & REQ_GET_READ_MIRRORS)
+			num_alloc_stripes++;
+		tgtdev_indexes = num_stripes;
+	}
+
+	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
+	if (!bbio) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	if (dev_replace_is_ongoing)
+		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
+
+	/* build raid_map */
+	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+	    need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+	    mirror_num > 1)) {
+		u64 tmp;
+		unsigned rot;
+
+		bbio->raid_map = (u64 *)((void *)bbio->stripes +
+				 sizeof(struct btrfs_bio_stripe) *
+				 num_alloc_stripes +
+				 sizeof(int) * tgtdev_indexes);
+
+		/* Work out the disk rotation on this stripe-set */
+		div_u64_rem(stripe_nr, num_stripes, &rot);
+
+		/* Fill in the logical address of each stripe */
+		tmp = stripe_nr * nr_data_stripes(map);
+		for (i = 0; i < nr_data_stripes(map); i++)
+			bbio->raid_map[(i+rot) % num_stripes] =
+				em->start + (tmp + i) * map->stripe_len;
+
+		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+			bbio->raid_map[(i+rot+1) % num_stripes] =
+				RAID6_Q_STRIPE;
+	}
+
+	if (rw & REQ_DISCARD) {
+		u32 factor = 0;
+		u32 sub_stripes = 0;
+		u64 stripes_per_dev = 0;
+		u32 remaining_stripes = 0;
+		u32 last_stripe = 0;
+
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+			if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+				sub_stripes = 1;
+			else
+				sub_stripes = map->sub_stripes;
+
+			factor = map->num_stripes / sub_stripes;
+			stripes_per_dev = div_u64_rem(stripe_nr_end -
+						      stripe_nr_orig,
+						      factor,
+						      &remaining_stripes);
+			div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+			last_stripe *= sub_stripes;
+		}
+
+		for (i = 0; i < num_stripes; i++) {
+			bbio->stripes[i].physical =
+				map->stripes[stripe_index].physical +
+				stripe_offset + stripe_nr * map->stripe_len;
+			bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+
+			if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+					 BTRFS_BLOCK_GROUP_RAID10)) {
+				bbio->stripes[i].length = stripes_per_dev *
+							  map->stripe_len;
+
+				if (i / sub_stripes < remaining_stripes)
+					bbio->stripes[i].length +=
+						map->stripe_len;
+
+				/*
+				 * Special for the first stripe and
+				 * the last stripe:
+				 *
+				 * |-------|...|-------|
+				 *     |----------|
+				 *    off     end_off
+				 */
+				if (i < sub_stripes)
+					bbio->stripes[i].length -=
+						stripe_offset;
+
+				if (stripe_index >= last_stripe &&
+				    stripe_index <= (last_stripe +
+						     sub_stripes - 1))
+					bbio->stripes[i].length -=
+						stripe_end_offset;
+
+				if (i == sub_stripes - 1)
+					stripe_offset = 0;
+			} else
+				bbio->stripes[i].length = *length;
+
+			stripe_index++;
+			if (stripe_index == map->num_stripes) {
+				/* This could only happen for RAID0/10 */
+				stripe_index = 0;
+				stripe_nr++;
+			}
+		}
+	} else {
+		for (i = 0; i < num_stripes; i++) {
+			bbio->stripes[i].physical =
+				map->stripes[stripe_index].physical +
+				stripe_offset +
+				stripe_nr * map->stripe_len;
+			bbio->stripes[i].dev =
+				map->stripes[stripe_index].dev;
+			stripe_index++;
+		}
+	}
+
+	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+		max_errors = btrfs_chunk_max_errors(map);
+
+	if (bbio->raid_map)
+		sort_parity_stripes(bbio, num_stripes);
+
+	tgtdev_indexes = 0;
+	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+	    dev_replace->tgtdev != NULL) {
+		int index_where_to_add;
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+
+		/*
+		 * duplicate the write operations while the dev replace
+		 * procedure is running. Since the copying of the old disk
+		 * to the new disk takes place at run time while the
+		 * filesystem is mounted writable, the regular write
+		 * operations to the old disk have to be duplicated to go
+		 * to the new disk as well.
+		 * Note that device->missing is handled by the caller, and
+		 * that the write to the old disk is already set up in the
+		 * stripes array.
+		 */
+		index_where_to_add = num_stripes;
+		for (i = 0; i < num_stripes; i++) {
+			if (bbio->stripes[i].dev->devid == srcdev_devid) {
+				/* write to new disk, too */
+				struct btrfs_bio_stripe *new =
+					bbio->stripes + index_where_to_add;
+				struct btrfs_bio_stripe *old =
+					bbio->stripes + i;
+
+				new->physical = old->physical;
+				new->length = old->length;
+				new->dev = dev_replace->tgtdev;
+				bbio->tgtdev_map[i] = index_where_to_add;
+				index_where_to_add++;
+				max_errors++;
+				tgtdev_indexes++;
+			}
+		}
+		num_stripes = index_where_to_add;
+	} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+		   dev_replace->tgtdev != NULL) {
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+		int index_srcdev = 0;
+		int found = 0;
+		u64 physical_of_found = 0;
+
+		/*
+		 * During the dev-replace procedure, the target drive can
+		 * also be used to read data in case it is needed to repair
+		 * a corrupt block elsewhere. This is possible if the
+		 * requested area is left of the left cursor. In this area,
+		 * the target drive is a full copy of the source drive.
+		 */
+		for (i = 0; i < num_stripes; i++) {
+			if (bbio->stripes[i].dev->devid == srcdev_devid) {
+				/*
+				 * In case of DUP, in order to keep it
+				 * simple, only add the mirror with the
+				 * lowest physical address
+				 */
+				if (found &&
+				    physical_of_found <=
+				     bbio->stripes[i].physical)
+					continue;
+				index_srcdev = i;
+				found = 1;
+				physical_of_found = bbio->stripes[i].physical;
+			}
+		}
+		if (found) {
+			if (physical_of_found + map->stripe_len <=
+			    dev_replace->cursor_left) {
+				struct btrfs_bio_stripe *tgtdev_stripe =
+					bbio->stripes + num_stripes;
+
+				tgtdev_stripe->physical = physical_of_found;
+				tgtdev_stripe->length =
+					bbio->stripes[index_srcdev].length;
+				tgtdev_stripe->dev = dev_replace->tgtdev;
+				bbio->tgtdev_map[index_srcdev] = num_stripes;
+
+				tgtdev_indexes++;
+				num_stripes++;
+			}
+		}
+	}
+
+	*bbio_ret = bbio;
+	bbio->map_type = map->type;
+	bbio->num_stripes = num_stripes;
+	bbio->max_errors = max_errors;
+	bbio->mirror_num = mirror_num;
+	bbio->num_tgtdevs = tgtdev_indexes;
+
+	/*
+	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
+	 * mirror_num == num_stripes + 1 && dev_replace target drive is
+	 * available as a mirror
+	 */
+	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+		WARN_ON(num_stripes > 1);
+		bbio->stripes[0].dev = dev_replace->tgtdev;
+		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+		bbio->mirror_num = map->num_stripes + 1;
+	}
+out:
+	if (dev_replace_is_ongoing)
+		btrfs_dev_replace_unlock(dev_replace);
+	free_extent_map(em);
+	return ret;
+}
+
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+		      u64 logical, u64 *length,
+		      struct btrfs_bio **bbio_ret, int mirror_num)
+{
+	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+				 mirror_num, 0);
+}
+
+/* For Scrub/replace */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+		     u64 logical, u64 *length,
+		     struct btrfs_bio **bbio_ret, int mirror_num,
+		     int need_raid_map)
+{
+	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+				 mirror_num, need_raid_map);
+}
+
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len)
+{
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 *buf;
+	u64 bytenr;
+	u64 length;
+	u64 stripe_nr;
+	u64 rmap_len;
+	int i, j, nr = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_start, 1);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
+		       chunk_start);
+		return -EIO;
+	}
+
+	if (em->start != chunk_start) {
+		printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
+		       em->start, chunk_start);
+		free_extent_map(em);
+		return -EIO;
+	}
+	map = (struct map_lookup *)em->bdev;
+
+	length = em->len;
+	rmap_len = map->stripe_len;
+
+	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		length = div_u64(length, map->num_stripes / map->sub_stripes);
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+		length = div_u64(length, map->num_stripes);
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		length = div_u64(length, nr_data_stripes(map));
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}
+
+	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+	BUG_ON(!buf); /* -ENOMEM */
+
+	for (i = 0; i < map->num_stripes; i++) {
+		if (devid && map->stripes[i].dev->devid != devid)
+			continue;
+		if (map->stripes[i].physical > physical ||
+		    map->stripes[i].physical + length <= physical)
+			continue;
+
+		stripe_nr = physical - map->stripes[i].physical;
+		stripe_nr = div_u64(stripe_nr, map->stripe_len);
+
+		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = chunk_start + stripe_nr * rmap_len;
+		WARN_ON(nr >= map->num_stripes);
+		for (j = 0; j < nr; j++) {
+			if (buf[j] == bytenr)
+				break;
+		}
+		if (j == nr) {
+			WARN_ON(nr >= map->num_stripes);
+			buf[nr++] = bytenr;
+		}
+	}
+
+	*logical = buf;
+	*naddrs = nr;
+	*stripe_len = rmap_len;
+
+	free_extent_map(em);
+	return 0;
+}
+
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+{
+	if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
+		bio_endio_nodec(bio, err);
+	else
+		bio_endio(bio, err);
+	btrfs_put_bbio(bbio);
+}
+
+static void btrfs_end_bio(struct bio *bio, int err)
+{
+	struct btrfs_bio *bbio = bio->bi_private;
+	struct btrfs_device *dev = bbio->stripes[0].dev;
+	int is_orig_bio = 0;
+
+	if (err) {
+		atomic_inc(&bbio->error);
+		if (err == -EIO || err == -EREMOTEIO) {
+			unsigned int stripe_index =
+				btrfs_io_bio(bio)->stripe_index;
+
+			BUG_ON(stripe_index >= bbio->num_stripes);
+			dev = bbio->stripes[stripe_index].dev;
+			if (dev->bdev) {
+				if (bio->bi_rw & WRITE)
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_WRITE_ERRS);
+				else
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_READ_ERRS);
+				if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_FLUSH_ERRS);
+				btrfs_dev_stat_print_on_error(dev);
+			}
+		}
+	}
+
+	if (bio == bbio->orig_bio)
+		is_orig_bio = 1;
+
+	btrfs_bio_counter_dec(bbio->fs_info);
+
+	if (atomic_dec_and_test(&bbio->stripes_pending)) {
+		if (!is_orig_bio) {
+			bio_put(bio);
+			bio = bbio->orig_bio;
+		}
+
+		bio->bi_private = bbio->private;
+		bio->bi_end_io = bbio->end_io;
+		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+		/* only send an error to the higher layers if it is
+		 * beyond the tolerance of the btrfs bio
+		 */
+		if (atomic_read(&bbio->error) > bbio->max_errors) {
+			err = -EIO;
+		} else {
+			/*
+			 * this bio is actually up to date, we didn't
+			 * go over the max number of errors
+			 */
+			set_bit(BIO_UPTODATE, &bio->bi_flags);
+			err = 0;
+		}
+
+		btrfs_end_bbio(bbio, bio, err);
+	} else if (!is_orig_bio) {
+		bio_put(bio);
+	}
+}
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+static noinline void btrfs_schedule_bio(struct btrfs_root *root,
+					struct btrfs_device *device,
+					int rw, struct bio *bio)
+{
+	int should_queue = 1;
+	struct btrfs_pending_bios *pending_bios;
+
+	if (device->missing || !device->bdev) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
+	/* don't bother with additional async steps for reads, right now */
+	if (!(rw & REQ_WRITE)) {
+		bio_get(bio);
+		btrfsic_submit_bio(rw, bio);
+		bio_put(bio);
+		return;
+	}
+
+	/*
+	 * nr_async_bios allows us to reliably return congestion to the
+	 * higher layers.  Otherwise, the async bio makes it appear we have
+	 * made progress against dirty pages when we've really just put it
+	 * on a queue for later
+	 */
+	atomic_inc(&root->fs_info->nr_async_bios);
+	WARN_ON(bio->bi_next);
+	bio->bi_next = NULL;
+	bio->bi_rw |= rw;
+
+	spin_lock(&device->io_lock);
+	if (bio->bi_rw & REQ_SYNC)
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
+
+	if (pending_bios->tail)
+		pending_bios->tail->bi_next = bio;
+
+	pending_bios->tail = bio;
+	if (!pending_bios->head)
+		pending_bios->head = bio;
+	if (device->running_pending)
+		should_queue = 0;
+
+	spin_unlock(&device->io_lock);
+
+	if (should_queue)
+		btrfs_queue_work(root->fs_info->submit_workers,
+				 &device->work);
+}
+
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+		       sector_t sector)
+{
+	struct bio_vec *prev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned int max_sectors = queue_max_sectors(q);
+	struct bvec_merge_data bvm = {
+		.bi_bdev = bdev,
+		.bi_sector = sector,
+		.bi_rw = bio->bi_rw,
+	};
+
+	if (WARN_ON(bio->bi_vcnt == 0))
+		return 1;
+
+	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+	if (bio_sectors(bio) > max_sectors)
+		return 0;
+
+	if (!q->merge_bvec_fn)
+		return 1;
+
+	bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
+	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+		return 0;
+	return 1;
+}
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+			      struct bio *bio, u64 physical, int dev_nr,
+			      int rw, int async)
+{
+	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+	bio->bi_private = bbio;
+	btrfs_io_bio(bio)->stripe_index = dev_nr;
+	bio->bi_end_io = btrfs_end_bio;
+	bio->bi_iter.bi_sector = physical >> 9;
+#ifdef DEBUG
+	{
+		struct rcu_string *name;
+
+		rcu_read_lock();
+		name = rcu_dereference(dev->name);
+		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+			 "(%s id %llu), size=%u\n", rw,
+			 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
+			 name->str, dev->devid, bio->bi_iter.bi_size);
+		rcu_read_unlock();
+	}
+#endif
+	bio->bi_bdev = dev->bdev;
+
+	btrfs_bio_counter_inc_noblocked(root->fs_info);
+
+	if (async)
+		btrfs_schedule_bio(root, dev, rw, bio);
+	else
+		btrfsic_submit_bio(rw, bio);
+}
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+			      struct bio *first_bio, struct btrfs_device *dev,
+			      int dev_nr, int rw, int async)
+{
+	struct bio_vec *bvec = first_bio->bi_io_vec;
+	struct bio *bio;
+	int nr_vecs = bio_get_nr_vecs(dev->bdev);
+	u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+	if (!bio)
+		return -ENOMEM;
+
+	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+				 bvec->bv_offset) < bvec->bv_len) {
+			u64 len = bio->bi_iter.bi_size;
+
+			atomic_inc(&bbio->stripes_pending);
+			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+					  rw, async);
+			physical += len;
+			goto again;
+		}
+		bvec++;
+	}
+
+	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
+	return 0;
+}
+
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+	atomic_inc(&bbio->error);
+	if (atomic_dec_and_test(&bbio->stripes_pending)) {
+		/* Shoud be the original bio. */
+		WARN_ON(bio != bbio->orig_bio);
+
+		bio->bi_private = bbio->private;
+		bio->bi_end_io = bbio->end_io;
+		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+		bio->bi_iter.bi_sector = logical >> 9;
+
+		btrfs_end_bbio(bbio, bio, -EIO);
+	}
+}
+
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+		  int mirror_num, int async_submit)
+{
+	struct btrfs_device *dev;
+	struct bio *first_bio = bio;
+	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+	u64 length = 0;
+	u64 map_length;
+	int ret;
+	int dev_nr;
+	int total_devs;
+	struct btrfs_bio *bbio = NULL;
+
+	length = bio->bi_iter.bi_size;
+	map_length = length;
+
+	btrfs_bio_counter_inc_blocked(root->fs_info);
+	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+			      mirror_num, 1);
+	if (ret) {
+		btrfs_bio_counter_dec(root->fs_info);
+		return ret;
+	}
+
+	total_devs = bbio->num_stripes;
+	bbio->orig_bio = first_bio;
+	bbio->private = first_bio->bi_private;
+	bbio->end_io = first_bio->bi_end_io;
+	bbio->fs_info = root->fs_info;
+	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+	if (bbio->raid_map) {
+		/* In this case, map_length has been set to the length of
+		   a single stripe; not the whole write */
+		if (rw & WRITE) {
+			ret = raid56_parity_write(root, bio, bbio, map_length);
+		} else {
+			ret = raid56_parity_recover(root, bio, bbio, map_length,
+						    mirror_num, 1);
+		}
+
+		btrfs_bio_counter_dec(root->fs_info);
+		return ret;
+	}
+
+	if (map_length < length) {
+		btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
+			logical, length, map_length);
+		BUG();
+	}
+
+	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
+		dev = bbio->stripes[dev_nr].dev;
+		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+			bbio_error(bbio, first_bio, logical);
+			continue;
+		}
+
+		/*
+		 * Check and see if we're ok with this bio based on it's size
+		 * and offset with the given device.
+		 */
+		if (!bio_size_ok(dev->bdev, first_bio,
+				 bbio->stripes[dev_nr].physical >> 9)) {
+			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+						 dev_nr, rw, async_submit);
+			BUG_ON(ret);
+			continue;
+		}
+
+		if (dev_nr < total_devs - 1) {
+			bio = btrfs_bio_clone(first_bio, GFP_NOFS);
+			BUG_ON(!bio); /* -ENOMEM */
+		} else {
+			bio = first_bio;
+			bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
+		}
+
+		submit_stripe_bio(root, bbio, bio,
+				  bbio->stripes[dev_nr].physical, dev_nr, rw,
+				  async_submit);
+	}
+	btrfs_bio_counter_dec(root->fs_info);
+	return 0;
+}
+
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
+				       u8 *uuid, u8 *fsid)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *cur_devices;
+
+	cur_devices = fs_info->fs_devices;
+	while (cur_devices) {
+		if (!fsid ||
+		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+			device = __find_device(&cur_devices->devices,
+					       devid, uuid);
+			if (device)
+				return device;
+		}
+		cur_devices = cur_devices->seed;
+	}
+	return NULL;
+}
+
+static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+					    struct btrfs_fs_devices *fs_devices,
+					    u64 devid, u8 *dev_uuid)
+{
+	struct btrfs_device *device;
+
+	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+	if (IS_ERR(device))
+		return NULL;
+
+	list_add(&device->dev_list, &fs_devices->devices);
+	device->fs_devices = fs_devices;
+	fs_devices->num_devices++;
+
+	device->missing = 1;
+	fs_devices->missing_devices++;
+
+	return device;
+}
+
+/**
+ * btrfs_alloc_device - allocate struct btrfs_device
+ * @fs_info:	used only for generating a new devid, can be NULL if
+ *		devid is provided (i.e. @devid != NULL).
+ * @devid:	a pointer to devid for this device.  If NULL a new devid
+ *		is generated.
+ * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
+ *		is generated.
+ *
+ * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
+ * on error.  Returned struct is not linked onto any lists and can be
+ * destroyed with kfree() right away.
+ */
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+					const u64 *devid,
+					const u8 *uuid)
+{
+	struct btrfs_device *dev;
+	u64 tmp;
+
+	if (WARN_ON(!devid && !fs_info))
+		return ERR_PTR(-EINVAL);
+
+	dev = __alloc_device();
+	if (IS_ERR(dev))
+		return dev;
+
+	if (devid)
+		tmp = *devid;
+	else {
+		int ret;
+
+		ret = find_next_devid(fs_info, &tmp);
+		if (ret) {
+			kfree(dev);
+			return ERR_PTR(ret);
+		}
+	}
+	dev->devid = tmp;
+
+	if (uuid)
+		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
+	else
+		generate_random_uuid(dev->uuid);
+
+	btrfs_init_work(&dev->work, btrfs_submit_helper,
+			pending_bios_fn, NULL, NULL);
+
+	return dev;
+}
+
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+			  struct extent_buffer *leaf,
+			  struct btrfs_chunk *chunk)
+{
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 logical;
+	u64 length;
+	u64 devid;
+	u8 uuid[BTRFS_UUID_SIZE];
+	int num_stripes;
+	int ret;
+	int i;
+
+	logical = key->offset;
+	length = btrfs_chunk_length(leaf, chunk);
+
+	read_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+	read_unlock(&map_tree->map_tree.lock);
+
+	/* already mapped? */
+	if (em && em->start <= logical && em->start + em->len > logical) {
+		free_extent_map(em);
+		return 0;
+	} else if (em) {
+		free_extent_map(em);
+	}
+
+	em = alloc_extent_map();
+	if (!em)
+		return -ENOMEM;
+	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+	if (!map) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+	em->bdev = (struct block_device *)map;
+	em->start = logical;
+	em->len = length;
+	em->orig_start = 0;
+	em->block_start = 0;
+	em->block_len = em->len;
+
+	map->num_stripes = num_stripes;
+	map->io_width = btrfs_chunk_io_width(leaf, chunk);
+	map->io_align = btrfs_chunk_io_align(leaf, chunk);
+	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
+	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+	map->type = btrfs_chunk_type(leaf, chunk);
+	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+	for (i = 0; i < num_stripes; i++) {
+		map->stripes[i].physical =
+			btrfs_stripe_offset_nr(leaf, chunk, i);
+		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+		read_extent_buffer(leaf, uuid, (unsigned long)
+				   btrfs_stripe_dev_uuid_nr(chunk, i),
+				   BTRFS_UUID_SIZE);
+		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+							uuid, NULL);
+		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
+			free_extent_map(em);
+			return -EIO;
+		}
+		if (!map->stripes[i].dev) {
+			map->stripes[i].dev =
+				add_missing_dev(root, root->fs_info->fs_devices,
+						devid, uuid);
+			if (!map->stripes[i].dev) {
+				free_extent_map(em);
+				return -EIO;
+			}
+		}
+		map->stripes[i].dev->in_fs_metadata = 1;
+	}
+
+	write_lock(&map_tree->map_tree.lock);
+	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
+	write_unlock(&map_tree->map_tree.lock);
+	BUG_ON(ret); /* Tree corruption */
+	free_extent_map(em);
+
+	return 0;
+}
+
+static void fill_device_from_item(struct extent_buffer *leaf,
+				 struct btrfs_dev_item *dev_item,
+				 struct btrfs_device *device)
+{
+	unsigned long ptr;
+
+	device->devid = btrfs_device_id(leaf, dev_item);
+	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->total_bytes = device->disk_total_bytes;
+	device->commit_total_bytes = device->disk_total_bytes;
+	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+	device->commit_bytes_used = device->bytes_used;
+	device->type = btrfs_device_type(leaf, dev_item);
+	device->io_align = btrfs_device_io_align(leaf, dev_item);
+	device->io_width = btrfs_device_io_width(leaf, dev_item);
+	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+	device->is_tgtdev_for_dev_replace = 0;
+
+	ptr = btrfs_device_uuid(dev_item);
+	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+}
+
+static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
+						  u8 *fsid)
+{
+	struct btrfs_fs_devices *fs_devices;
+	int ret;
+
+	BUG_ON(!mutex_is_locked(&uuid_mutex));
+
+	fs_devices = root->fs_info->fs_devices->seed;
+	while (fs_devices) {
+		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
+			return fs_devices;
+
+		fs_devices = fs_devices->seed;
+	}
+
+	fs_devices = find_fsid(fsid);
+	if (!fs_devices) {
+		if (!btrfs_test_opt(root, DEGRADED))
+			return ERR_PTR(-ENOENT);
+
+		fs_devices = alloc_fs_devices(fsid);
+		if (IS_ERR(fs_devices))
+			return fs_devices;
+
+		fs_devices->seeding = 1;
+		fs_devices->opened = 1;
+		return fs_devices;
+	}
+
+	fs_devices = clone_fs_devices(fs_devices);
+	if (IS_ERR(fs_devices))
+		return fs_devices;
+
+	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
+				   root->fs_info->bdev_holder);
+	if (ret) {
+		free_fs_devices(fs_devices);
+		fs_devices = ERR_PTR(ret);
+		goto out;
+	}
+
+	if (!fs_devices->seeding) {
+		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
+		fs_devices = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	fs_devices->seed = root->fs_info->fs_devices->seed;
+	root->fs_info->fs_devices->seed = fs_devices;
+out:
+	return fs_devices;
+}
+
+static int read_one_dev(struct btrfs_root *root,
+			struct extent_buffer *leaf,
+			struct btrfs_dev_item *dev_item)
+{
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_device *device;
+	u64 devid;
+	int ret;
+	u8 fs_uuid[BTRFS_UUID_SIZE];
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+
+	devid = btrfs_device_id(leaf, dev_item);
+	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
+			   BTRFS_UUID_SIZE);
+	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
+			   BTRFS_UUID_SIZE);
+
+	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
+		fs_devices = open_seed_devices(root, fs_uuid);
+		if (IS_ERR(fs_devices))
+			return PTR_ERR(fs_devices);
+	}
+
+	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
+	if (!device) {
+		if (!btrfs_test_opt(root, DEGRADED))
+			return -EIO;
+
+		btrfs_warn(root->fs_info, "devid %llu missing", devid);
+		device = add_missing_dev(root, fs_devices, devid, dev_uuid);
+		if (!device)
+			return -ENOMEM;
+	} else {
+		if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
+			return -EIO;
+
+		if(!device->bdev && !device->missing) {
+			/*
+			 * this happens when a device that was properly setup
+			 * in the device info lists suddenly goes bad.
+			 * device->bdev is NULL, and so we have to set
+			 * device->missing to one here
+			 */
+			device->fs_devices->missing_devices++;
+			device->missing = 1;
+		}
+
+		/* Move the device to its own fs_devices */
+		if (device->fs_devices != fs_devices) {
+			ASSERT(device->missing);
+
+			list_move(&device->dev_list, &fs_devices->devices);
+			device->fs_devices->num_devices--;
+			fs_devices->num_devices++;
+
+			device->fs_devices->missing_devices--;
+			fs_devices->missing_devices++;
+
+			device->fs_devices = fs_devices;
+		}
+	}
+
+	if (device->fs_devices != root->fs_info->fs_devices) {
+		BUG_ON(device->writeable);
+		if (device->generation !=
+		    btrfs_device_generation(leaf, dev_item))
+			return -EINVAL;
+	}
+
+	fill_device_from_item(leaf, dev_item, device);
+	device->in_fs_metadata = 1;
+	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+		device->fs_devices->total_rw_bytes += device->total_bytes;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += device->total_bytes -
+			device->bytes_used;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+	}
+	ret = 0;
+	return ret;
+}
+
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+	struct extent_buffer *sb;
+	struct btrfs_disk_key *disk_key;
+	struct btrfs_chunk *chunk;
+	u8 *array_ptr;
+	unsigned long sb_array_offset;
+	int ret = 0;
+	u32 num_stripes;
+	u32 array_size;
+	u32 len = 0;
+	u32 cur_offset;
+	struct btrfs_key key;
+
+	ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+	/*
+	 * This will create extent buffer of nodesize, superblock size is
+	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
+	 * overallocate but we can keep it as-is, only the first page is used.
+	 */
+	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
+	if (!sb)
+		return -ENOMEM;
+	btrfs_set_buffer_uptodate(sb);
+	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
+	/*
+	 * The sb extent buffer is artifical and just used to read the system array.
+	 * btrfs_set_buffer_uptodate() call does not properly mark all it's
+	 * pages up-to-date when the page is larger: extent does not cover the
+	 * whole page and consequently check_page_uptodate does not find all
+	 * the page's extents up-to-date (the hole beyond sb),
+	 * write_extent_buffer then triggers a WARN_ON.
+	 *
+	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
+	 * but sb spans only this function. Add an explicit SetPageUptodate call
+	 * to silence the warning eg. on PowerPC 64.
+	 */
+	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
+		SetPageUptodate(sb->pages[0]);
+
+	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
+	array_size = btrfs_super_sys_array_size(super_copy);
+
+	array_ptr = super_copy->sys_chunk_array;
+	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
+	cur_offset = 0;
+
+	while (cur_offset < array_size) {
+		disk_key = (struct btrfs_disk_key *)array_ptr;
+		len = sizeof(*disk_key);
+		if (cur_offset + len > array_size)
+			goto out_short_read;
+
+		btrfs_disk_key_to_cpu(&key, disk_key);
+
+		array_ptr += len;
+		sb_array_offset += len;
+		cur_offset += len;
+
+		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+			chunk = (struct btrfs_chunk *)sb_array_offset;
+			/*
+			 * At least one btrfs_chunk with one stripe must be
+			 * present, exact stripe count check comes afterwards
+			 */
+			len = btrfs_chunk_item_size(1);
+			if (cur_offset + len > array_size)
+				goto out_short_read;
+
+			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+			len = btrfs_chunk_item_size(num_stripes);
+			if (cur_offset + len > array_size)
+				goto out_short_read;
+
+			ret = read_one_chunk(root, &key, sb, chunk);
+			if (ret)
+				break;
+		} else {
+			ret = -EIO;
+			break;
+		}
+		array_ptr += len;
+		sb_array_offset += len;
+		cur_offset += len;
+	}
+	free_extent_buffer(sb);
+	return ret;
+
+out_short_read:
+	printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
+			len, cur_offset);
+	free_extent_buffer(sb);
+	return -EIO;
+}
+
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	int ret;
+	int slot;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	mutex_lock(&uuid_mutex);
+	lock_chunks(root);
+
+	/*
+	 * Read all device items, and then all the chunk items. All
+	 * device items are found before any chunk item (their object id
+	 * is smaller than the lowest possible object id for a chunk
+	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
+	 */
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+			struct btrfs_dev_item *dev_item;
+			dev_item = btrfs_item_ptr(leaf, slot,
+						  struct btrfs_dev_item);
+			ret = read_one_dev(root, leaf, dev_item);
+			if (ret)
+				goto error;
+		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+			struct btrfs_chunk *chunk;
+			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+			ret = read_one_chunk(root, &found_key, leaf, chunk);
+			if (ret)
+				goto error;
+		}
+		path->slots[0]++;
+	}
+	ret = 0;
+error:
+	unlock_chunks(root);
+	mutex_unlock(&uuid_mutex);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+
+	while (fs_devices) {
+		mutex_lock(&fs_devices->device_list_mutex);
+		list_for_each_entry(device, &fs_devices->devices, dev_list)
+			device->dev_root = fs_info->dev_root;
+		mutex_unlock(&fs_devices->device_list_mutex);
+
+		fs_devices = fs_devices->seed;
+	}
+}
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+		btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct extent_buffer *eb;
+	int slot;
+	int ret = 0;
+	struct btrfs_device *device;
+	struct btrfs_path *path = NULL;
+	int i;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		int item_size;
+		struct btrfs_dev_stats_item *ptr;
+
+		key.objectid = 0;
+		key.type = BTRFS_DEV_STATS_KEY;
+		key.offset = device->devid;
+		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+		if (ret) {
+			__btrfs_reset_dev_stats(device);
+			device->dev_stats_valid = 1;
+			btrfs_release_path(path);
+			continue;
+		}
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+		item_size = btrfs_item_size_nr(eb, slot);
+
+		ptr = btrfs_item_ptr(eb, slot,
+				     struct btrfs_dev_stats_item);
+
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+			if (item_size >= (1 + i) * sizeof(__le64))
+				btrfs_dev_stat_set(device, i,
+					btrfs_dev_stats_value(eb, ptr, i));
+			else
+				btrfs_dev_stat_reset(device, i);
+		}
+
+		device->dev_stats_valid = 1;
+		btrfs_dev_stat_print_on_load(device);
+		btrfs_release_path(path);
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+	btrfs_free_path(path);
+	return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+				struct btrfs_root *dev_root,
+				struct btrfs_device *device)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_dev_stats_item *ptr;
+	int ret;
+	int i;
+
+	key.objectid = 0;
+	key.type = BTRFS_DEV_STATS_KEY;
+	key.offset = device->devid;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+	if (ret < 0) {
+		printk_in_rcu(KERN_WARNING "BTRFS: "
+			"error %d while searching for dev_stats item for device %s!\n",
+			      ret, rcu_str_deref(device->name));
+		goto out;
+	}
+
+	if (ret == 0 &&
+	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+		/* need to delete old one and insert a new one */
+		ret = btrfs_del_item(trans, dev_root, path);
+		if (ret != 0) {
+			printk_in_rcu(KERN_WARNING "BTRFS: "
+				"delete too small dev_stats item for device %s failed %d!\n",
+				      rcu_str_deref(device->name), ret);
+			goto out;
+		}
+		ret = 1;
+	}
+
+	if (ret == 1) {
+		/* need to insert a new item */
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, dev_root, path,
+					      &key, sizeof(*ptr));
+		if (ret < 0) {
+			printk_in_rcu(KERN_WARNING "BTRFS: "
+					  "insert dev_stats item for device %s failed %d!\n",
+				      rcu_str_deref(device->name), ret);
+			goto out;
+		}
+	}
+
+	eb = path->nodes[0];
+	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+		btrfs_set_dev_stats_value(eb, ptr, i,
+					  btrfs_dev_stat_read(device, i));
+	btrfs_mark_buffer_dirty(eb);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	int stats_cnt;
+	int ret = 0;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
+			continue;
+
+		stats_cnt = atomic_read(&device->dev_stats_ccnt);
+		ret = update_dev_stat_item(trans, dev_root, device);
+		if (!ret)
+			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+	btrfs_dev_stat_inc(dev, index);
+	btrfs_dev_stat_print_on_error(dev);
+}
+
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+	if (!dev->dev_stats_valid)
+		return;
+	printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+			   "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+			   rcu_str_deref(dev->name),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+		if (btrfs_dev_stat_read(dev, i) != 0)
+			break;
+	if (i == BTRFS_DEV_STAT_VALUES_MAX)
+		return; /* all values == 0, suppress message */
+
+	printk_in_rcu(KERN_INFO "BTRFS: "
+		   "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+	       rcu_str_deref(dev->name),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+			struct btrfs_ioctl_get_dev_stats *stats)
+{
+	struct btrfs_device *dev;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	int i;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	if (!dev) {
+		btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
+		return -ENODEV;
+	} else if (!dev->dev_stats_valid) {
+		btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
+		return -ENODEV;
+	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+			if (stats->nr_items > i)
+				stats->values[i] =
+					btrfs_dev_stat_read_and_reset(dev, i);
+			else
+				btrfs_dev_stat_reset(dev, i);
+		}
+	} else {
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+			if (stats->nr_items > i)
+				stats->values[i] = btrfs_dev_stat_read(dev, i);
+	}
+	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+	return 0;
+}
+
+int btrfs_scratch_superblock(struct btrfs_device *device)
+{
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+
+	bh = btrfs_read_dev_super(device->bdev);
+	if (!bh)
+		return -EINVAL;
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+
+	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+	set_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+
+	return 0;
+}
+
+/*
+ * Update the size of all devices, which is used for writing out the
+ * super blocks.
+ */
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *curr, *next;
+
+	if (list_empty(&fs_devices->resized_devices))
+		return;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	lock_chunks(fs_info->dev_root);
+	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
+				 resized_list) {
+		list_del_init(&curr->resized_list);
+		curr->commit_total_bytes = curr->disk_total_bytes;
+	}
+	unlock_chunks(fs_info->dev_root);
+	mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+/* Must be invoked during the transaction commit */
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+					struct btrfs_transaction *transaction)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_device *dev;
+	int i;
+
+	if (list_empty(&transaction->pending_chunks))
+		return;
+
+	/* In order to kick the device replace finish process */
+	lock_chunks(root);
+	list_for_each_entry(em, &transaction->pending_chunks, list) {
+		map = (struct map_lookup *)em->bdev;
+
+		for (i = 0; i < map->num_stripes; i++) {
+			dev = map->stripes[i].dev;
+			dev->commit_bytes_used = dev->bytes_used;
+		}
+	}
+	unlock_chunks(root);
+}
diff --git a/kernel/fs/btrfs/volumes.h b/kernel/fs/btrfs/volumes.h
new file mode 100644
index 000000000..ebc31331a
--- /dev/null
+++ b/kernel/fs/btrfs/volumes.h
@@ -0,0 +1,541 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_VOLUMES_
+#define __BTRFS_VOLUMES_
+
+#include <linux/bio.h>
+#include <linux/sort.h>
+#include <linux/btrfs.h>
+#include "async-thread.h"
+
+extern struct mutex uuid_mutex;
+
+#define BTRFS_STRIPE_LEN	(64 * 1024)
+
+struct buffer_head;
+struct btrfs_pending_bios {
+	struct bio *head;
+	struct bio *tail;
+};
+
+/*
+ * Use sequence counter to get consistent device stat data on
+ * 32-bit processors.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#include <linux/seqlock.h>
+#define __BTRFS_NEED_DEVICE_DATA_ORDERED
+#define btrfs_device_data_ordered_init(device)	\
+	seqcount_init(&device->data_seqcount)
+#else
+#define btrfs_device_data_ordered_init(device) do { } while (0)
+#endif
+
+struct btrfs_device {
+	struct list_head dev_list;
+	struct list_head dev_alloc_list;
+	struct btrfs_fs_devices *fs_devices;
+
+	struct btrfs_root *dev_root;
+
+	struct rcu_string *name;
+
+	u64 generation;
+
+	spinlock_t io_lock ____cacheline_aligned;
+	int running_pending;
+	/* regular prio bios */
+	struct btrfs_pending_bios pending_bios;
+	/* WRITE_SYNC bios */
+	struct btrfs_pending_bios pending_sync_bios;
+
+	struct block_device *bdev;
+
+	/* the mode sent to blkdev_get */
+	fmode_t mode;
+
+	int writeable;
+	int in_fs_metadata;
+	int missing;
+	int can_discard;
+	int is_tgtdev_for_dev_replace;
+
+#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
+	seqcount_t data_seqcount;
+#endif
+
+	/* the internal btrfs device id */
+	u64 devid;
+
+	/* size of the device in memory */
+	u64 total_bytes;
+
+	/* size of the device on disk */
+	u64 disk_total_bytes;
+
+	/* bytes used */
+	u64 bytes_used;
+
+	/* optimal io alignment for this device */
+	u32 io_align;
+
+	/* optimal io width for this device */
+	u32 io_width;
+	/* type and info about this device */
+	u64 type;
+
+	/* minimal io size for this device */
+	u32 sector_size;
+
+	/* physical drive uuid (or lvm uuid) */
+	u8 uuid[BTRFS_UUID_SIZE];
+
+	/*
+	 * size of the device on the current transaction
+	 *
+	 * This variant is update when committing the transaction,
+	 * and protected by device_list_mutex
+	 */
+	u64 commit_total_bytes;
+
+	/* bytes used on the current transaction */
+	u64 commit_bytes_used;
+	/*
+	 * used to manage the device which is resized
+	 *
+	 * It is protected by chunk_lock.
+	 */
+	struct list_head resized_list;
+
+	/* for sending down flush barriers */
+	int nobarriers;
+	struct bio *flush_bio;
+	struct completion flush_wait;
+
+	/* per-device scrub information */
+	struct scrub_ctx *scrub_device;
+
+	struct btrfs_work work;
+	struct rcu_head rcu;
+	struct work_struct rcu_work;
+
+	/* readahead state */
+	spinlock_t reada_lock;
+	atomic_t reada_in_flight;
+	u64 reada_next;
+	struct reada_zone *reada_curr_zone;
+	struct radix_tree_root reada_zones;
+	struct radix_tree_root reada_extents;
+
+	/* disk I/O failure stats. For detailed description refer to
+	 * enum btrfs_dev_stat_values in ioctl.h */
+	int dev_stats_valid;
+
+	/* Counter to record the change of device stats */
+	atomic_t dev_stats_ccnt;
+	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+};
+
+/*
+ * If we read those variants at the context of their own lock, we needn't
+ * use the following helpers, reading them directly is safe.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#define BTRFS_DEVICE_GETSET_FUNCS(name)					\
+static inline u64							\
+btrfs_device_get_##name(const struct btrfs_device *dev)			\
+{									\
+	u64 size;							\
+	unsigned int seq;						\
+									\
+	do {								\
+		seq = read_seqcount_begin(&dev->data_seqcount);		\
+		size = dev->name;					\
+	} while (read_seqcount_retry(&dev->data_seqcount, seq));	\
+	return size;							\
+}									\
+									\
+static inline void							\
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size)		\
+{									\
+	preempt_disable();						\
+	write_seqcount_begin(&dev->data_seqcount);			\
+	dev->name = size;						\
+	write_seqcount_end(&dev->data_seqcount);			\
+	preempt_enable();						\
+}
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
+#define BTRFS_DEVICE_GETSET_FUNCS(name)					\
+static inline u64							\
+btrfs_device_get_##name(const struct btrfs_device *dev)			\
+{									\
+	u64 size;							\
+									\
+	preempt_disable();						\
+	size = dev->name;						\
+	preempt_enable();						\
+	return size;							\
+}									\
+									\
+static inline void							\
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size)		\
+{									\
+	preempt_disable();						\
+	dev->name = size;						\
+	preempt_enable();						\
+}
+#else
+#define BTRFS_DEVICE_GETSET_FUNCS(name)					\
+static inline u64							\
+btrfs_device_get_##name(const struct btrfs_device *dev)			\
+{									\
+	return dev->name;						\
+}									\
+									\
+static inline void							\
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size)		\
+{									\
+	dev->name = size;						\
+}
+#endif
+
+BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+
+struct btrfs_fs_devices {
+	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+
+	u64 num_devices;
+	u64 open_devices;
+	u64 rw_devices;
+	u64 missing_devices;
+	u64 total_rw_bytes;
+	u64 total_devices;
+	struct block_device *latest_bdev;
+
+	/* all of the devices in the FS, protected by a mutex
+	 * so we can safely walk it to write out the supers without
+	 * worrying about add/remove by the multi-device code.
+	 * Scrubbing super can kick off supers writing by holding
+	 * this mutex lock.
+	 */
+	struct mutex device_list_mutex;
+	struct list_head devices;
+
+	struct list_head resized_devices;
+	/* devices not currently being allocated */
+	struct list_head alloc_list;
+	struct list_head list;
+
+	struct btrfs_fs_devices *seed;
+	int seeding;
+
+	int opened;
+
+	/* set when we find or add a device that doesn't have the
+	 * nonrot flag set
+	 */
+	int rotating;
+};
+
+#define BTRFS_BIO_INLINE_CSUM_SIZE	64
+
+/*
+ * we need the mirror number and stripe index to be passed around
+ * the call chain while we are processing end_io (especially errors).
+ * Really, what we need is a btrfs_bio structure that has this info
+ * and is properly sized with its stripe array, but we're not there
+ * quite yet.  We have our own btrfs bioset, and all of the bios
+ * we allocate are actually btrfs_io_bios.  We'll cram as much of
+ * struct btrfs_bio as we can into this over time.
+ */
+typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
+struct btrfs_io_bio {
+	unsigned int mirror_num;
+	unsigned int stripe_index;
+	u64 logical;
+	u8 *csum;
+	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+	u8 *csum_allocated;
+	btrfs_io_bio_end_io_t *end_io;
+	struct bio bio;
+};
+
+static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+{
+	return container_of(bio, struct btrfs_io_bio, bio);
+}
+
+struct btrfs_bio_stripe {
+	struct btrfs_device *dev;
+	u64 physical;
+	u64 length; /* only used for discard mappings */
+};
+
+struct btrfs_bio;
+typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED	(1 << 0)
+
+struct btrfs_bio {
+	atomic_t refs;
+	atomic_t stripes_pending;
+	struct btrfs_fs_info *fs_info;
+	u64 map_type; /* get from map_lookup->type */
+	bio_end_io_t *end_io;
+	struct bio *orig_bio;
+	unsigned long flags;
+	void *private;
+	atomic_t error;
+	int max_errors;
+	int num_stripes;
+	int mirror_num;
+	int num_tgtdevs;
+	int *tgtdev_map;
+	/*
+	 * logical block numbers for the start of each stripe
+	 * The last one or two are p/q.  These are sorted,
+	 * so raid_map[0] is the start of our full stripe
+	 */
+	u64 *raid_map;
+	struct btrfs_bio_stripe stripes[];
+};
+
+struct btrfs_device_info {
+	struct btrfs_device *dev;
+	u64 dev_offset;
+	u64 max_avail;
+	u64 total_avail;
+};
+
+struct btrfs_raid_attr {
+	int sub_stripes;	/* sub_stripes info for map */
+	int dev_stripes;	/* stripes per dev */
+	int devs_max;		/* max devs to use */
+	int devs_min;		/* min devs needed */
+	int devs_increment;	/* ndevs has to be a multiple of this */
+	int ncopies;		/* how many copies to data has */
+};
+
+struct map_lookup {
+	u64 type;
+	int io_align;
+	int io_width;
+	int stripe_len;
+	int sector_size;
+	int num_stripes;
+	int sub_stripes;
+	struct btrfs_bio_stripe stripes[];
+};
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+			    (sizeof(struct btrfs_bio_stripe) * (n)))
+
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA		(1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM		(1ULL << 1)
+#define BTRFS_BALANCE_METADATA		(1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK		(BTRFS_BALANCE_DATA |	    \
+					 BTRFS_BALANCE_SYSTEM |	    \
+					 BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE		(1ULL << 3)
+#define BTRFS_BALANCE_RESUME		(1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES	(1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE	(1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID	(1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
+#define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)
+
+/*
+ * Profile changing flags.  When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT	(1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT		(1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+	struct btrfs_fs_info *fs_info;
+
+	struct btrfs_balance_args data;
+	struct btrfs_balance_args meta;
+	struct btrfs_balance_args sys;
+
+	u64 flags;
+
+	struct btrfs_balance_progress stat;
+};
+
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+				   u64 end, u64 *length);
+void btrfs_get_bbio(struct btrfs_bio *bbio);
+void btrfs_put_bbio(struct btrfs_bio *bbio);
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+		    u64 logical, u64 *length,
+		    struct btrfs_bio **bbio_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+		     u64 logical, u64 *length,
+		     struct btrfs_bio **bbio_ret, int mirror_num,
+		     int need_raid_map);
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_read_chunk_tree(struct btrfs_root *root);
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 type);
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+		  int mirror_num, int async_submit);
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       fmode_t flags, void *holder);
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+			  struct btrfs_fs_devices **fs_devices_ret);
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+					 char *device_path,
+					 struct btrfs_device **device);
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+					const u64 *devid,
+					const u8 *uuid);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path);
+void btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size);
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
+				       u8 *uuid, u8 *fsid);
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
+int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+				  struct btrfs_device *srcdev,
+				  struct btrfs_device **device_out);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+		  struct btrfs_ioctl_balance_args *bargs);
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_device *device, u64 num_bytes,
+			 u64 *start, u64 *max_avail);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+			struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info);
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+					struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *srcdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *tgtdev);
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+					      struct btrfs_device *tgtdev);
+int btrfs_scratch_superblock(struct btrfs_device *device);
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num);
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				u64 chunk_offset, u64 chunk_size);
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, u64 chunk_offset);
+
+static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
+{
+	return atomic_read(&dev->dev_stats_ccnt);
+}
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+				      int index)
+{
+	atomic_inc(dev->dev_stat_values + index);
+	smp_mb__before_atomic();
+	atomic_inc(&dev->dev_stats_ccnt);
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+				      int index)
+{
+	return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+						int index)
+{
+	int ret;
+
+	ret = atomic_xchg(dev->dev_stat_values + index, 0);
+	smp_mb__before_atomic();
+	atomic_inc(&dev->dev_stats_ccnt);
+	return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+				      int index, unsigned long val)
+{
+	atomic_set(dev->dev_stat_values + index, val);
+	smp_mb__before_atomic();
+	atomic_inc(&dev->dev_stats_ccnt);
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+					int index)
+{
+	btrfs_dev_stat_set(dev, index, 0);
+}
+
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+					struct btrfs_transaction *transaction);
+
+static inline void lock_chunks(struct btrfs_root *root)
+{
+	mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static inline void unlock_chunks(struct btrfs_root *root)
+{
+	mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+
+#endif
diff --git a/kernel/fs/btrfs/xattr.c b/kernel/fs/btrfs/xattr.c
new file mode 100644
index 000000000..6f518c90e
--- /dev/null
+++ b/kernel/fs/btrfs/xattr.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "xattr.h"
+#include "disk-io.h"
+#include "props.h"
+#include "locking.h"
+
+
+ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+				void *buffer, size_t size)
+{
+	struct btrfs_dir_item *di;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int ret = 0;
+	unsigned long data_ptr;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* lookup the xattr by name */
+	di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name,
+				strlen(name), 0);
+	if (!di) {
+		ret = -ENODATA;
+		goto out;
+	} else if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	/* if size is 0, that means we want the size of the attr */
+	if (!size) {
+		ret = btrfs_dir_data_len(leaf, di);
+		goto out;
+	}
+
+	/* now get the data out of our dir_item */
+	if (btrfs_dir_data_len(leaf, di) > size) {
+		ret = -ERANGE;
+		goto out;
+	}
+
+	/*
+	 * The way things are packed into the leaf is like this
+	 * |struct btrfs_dir_item|name|data|
+	 * where name is the xattr name, so security.foo, and data is the
+	 * content of the xattr.  data_ptr points to the location in memory
+	 * where the data starts in the in memory leaf
+	 */
+	data_ptr = (unsigned long)((char *)(di + 1) +
+				   btrfs_dir_name_len(leaf, di));
+	read_extent_buffer(leaf, buffer, data_ptr,
+			   btrfs_dir_data_len(leaf, di));
+	ret = btrfs_dir_data_len(leaf, di);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int do_setxattr(struct btrfs_trans_handle *trans,
+		       struct inode *inode, const char *name,
+		       const void *value, size_t size, int flags)
+{
+	struct btrfs_dir_item *di = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	size_t name_len = strlen(name);
+	int ret = 0;
+
+	if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
+		return -ENOSPC;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->skip_release_on_error = 1;
+
+	if (!value) {
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+					name, name_len, -1);
+		if (!di && (flags & XATTR_REPLACE))
+			ret = -ENODATA;
+		else if (IS_ERR(di))
+			ret = PTR_ERR(di);
+		else if (di)
+			ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		goto out;
+	}
+
+	/*
+	 * For a replace we can't just do the insert blindly.
+	 * Do a lookup first (read-only btrfs_search_slot), and return if xattr
+	 * doesn't exist. If it exists, fall down below to the insert/replace
+	 * path - we can't race with a concurrent xattr delete, because the VFS
+	 * locks the inode's i_mutex before calling setxattr or removexattr.
+	 */
+	if (flags & XATTR_REPLACE) {
+		ASSERT(mutex_is_locked(&inode->i_mutex));
+		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+					name, name_len, 0);
+		if (!di)
+			ret = -ENODATA;
+		else if (IS_ERR(di))
+			ret = PTR_ERR(di);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+		di = NULL;
+	}
+
+	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+				      name, name_len, value, size);
+	if (ret == -EOVERFLOW) {
+		/*
+		 * We have an existing item in a leaf, split_leaf couldn't
+		 * expand it. That item might have or not a dir_item that
+		 * matches our target xattr, so lets check.
+		 */
+		ret = 0;
+		btrfs_assert_tree_locked(path->nodes[0]);
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		if (!di && !(flags & XATTR_REPLACE)) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	} else if (ret == -EEXIST) {
+		ret = 0;
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		ASSERT(di); /* logic error */
+	} else if (ret) {
+		goto out;
+	}
+
+	if (di && (flags & XATTR_CREATE)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	if (di) {
+		/*
+		 * We're doing a replace, and it must be atomic, that is, at
+		 * any point in time we have either the old or the new xattr
+		 * value in the tree. We don't want readers (getxattr and
+		 * listxattrs) to miss a value, this is specially important
+		 * for ACLs.
+		 */
+		const int slot = path->slots[0];
+		struct extent_buffer *leaf = path->nodes[0];
+		const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+		const u32 item_size = btrfs_item_size_nr(leaf, slot);
+		const u32 data_size = sizeof(*di) + name_len + size;
+		struct btrfs_item *item;
+		unsigned long data_ptr;
+		char *ptr;
+
+		if (size > old_data_len) {
+			if (btrfs_leaf_free_space(root, leaf) <
+			    (size - old_data_len)) {
+				ret = -ENOSPC;
+				goto out;
+			}
+		}
+
+		if (old_data_len + name_len + sizeof(*di) == item_size) {
+			/* No other xattrs packed in the same leaf item. */
+			if (size > old_data_len)
+				btrfs_extend_item(root, path,
+						  size - old_data_len);
+			else if (size < old_data_len)
+				btrfs_truncate_item(root, path, data_size, 1);
+		} else {
+			/* There are other xattrs packed in the same item. */
+			ret = btrfs_delete_one_dir_name(trans, root, path, di);
+			if (ret)
+				goto out;
+			btrfs_extend_item(root, path, data_size);
+		}
+
+		item = btrfs_item_nr(slot);
+		ptr = btrfs_item_ptr(leaf, slot, char);
+		ptr += btrfs_item_size(leaf, item) - data_size;
+		di = (struct btrfs_dir_item *)ptr;
+		btrfs_set_dir_data_len(leaf, di, size);
+		data_ptr = ((unsigned long)(di + 1)) + name_len;
+		write_extent_buffer(leaf, value, data_ptr, size);
+		btrfs_mark_buffer_dirty(leaf);
+	} else {
+		/*
+		 * Insert, and we had space for the xattr, so path->slots[0] is
+		 * where our xattr dir_item is and btrfs_insert_xattr_item()
+		 * filled it.
+		 */
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * @value: "" makes the attribute to empty, NULL removes it
+ */
+int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+		     struct inode *inode, const char *name,
+		     const void *value, size_t size, int flags)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	if (trans)
+		return do_setxattr(trans, inode, name, value, size, flags);
+
+	trans = btrfs_start_transaction(root, 2);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = do_setxattr(trans, inode, name, value, size, flags);
+	if (ret)
+		goto out;
+
+	inode_inc_iversion(inode);
+	inode->i_ctime = CURRENT_TIME;
+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
+	ret = btrfs_update_inode(trans, root, inode);
+	BUG_ON(ret);
+out:
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct btrfs_key key, found_key;
+	struct inode *inode = d_inode(dentry);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	int ret = 0, slot;
+	size_t total_size = 0, size_left = size;
+	unsigned long name_ptr;
+	size_t name_len;
+
+	/*
+	 * ok we want all objects associated with this id.
+	 * NOTE: we set key.offset = 0; because we want to start with the
+	 * first xattr that we find and walk forward
+	 */
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_XATTR_ITEM_KEY;
+	key.offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 2;
+
+	/* search for our xattrs */
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto err;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		/* this is where we start walking through the path */
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * if we've reached the last slot in this leaf we need
+			 * to go to the next leaf and reset everything
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto err;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/* check to make sure this item is what we want */
+		if (found_key.objectid != key.objectid)
+			break;
+		if (found_key.type != BTRFS_XATTR_ITEM_KEY)
+			break;
+
+		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+		if (verify_dir_item(root, leaf, di))
+			goto next;
+
+		name_len = btrfs_dir_name_len(leaf, di);
+		total_size += name_len + 1;
+
+		/* we are just looking for how big our buffer needs to be */
+		if (!size)
+			goto next;
+
+		if (!buffer || (name_len + 1) > size_left) {
+			ret = -ERANGE;
+			goto err;
+		}
+
+		name_ptr = (unsigned long)(di + 1);
+		read_extent_buffer(leaf, buffer, name_ptr, name_len);
+		buffer[name_len] = '\0';
+
+		size_left -= name_len + 1;
+		buffer += name_len + 1;
+next:
+		path->slots[0]++;
+	}
+	ret = total_size;
+
+err:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * List of handlers for synthetic system.* attributes.  All real ondisk
+ * attributes are handled directly.
+ */
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	NULL,
+};
+
+/*
+ * Check if the attribute is in a supported namespace.
+ *
+ * This is applied after the check for the synthetic attributes in the system
+ * namespace.
+ */
+static int btrfs_is_valid_xattr(const char *name)
+{
+	int len = strlen(name);
+	int prefixlen = 0;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX,
+			XATTR_SECURITY_PREFIX_LEN))
+		prefixlen = XATTR_SECURITY_PREFIX_LEN;
+	else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		prefixlen = XATTR_SYSTEM_PREFIX_LEN;
+	else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		prefixlen = XATTR_TRUSTED_PREFIX_LEN;
+	else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+		prefixlen = XATTR_USER_PREFIX_LEN;
+	else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+		prefixlen = XATTR_BTRFS_PREFIX_LEN;
+	else
+		return -EOPNOTSUPP;
+
+	/*
+	 * The name cannot consist of just prefix
+	 */
+	if (len <= prefixlen)
+		return -EINVAL;
+
+	return 0;
+}
+
+ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+		       void *buffer, size_t size)
+{
+	int ret;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_getxattr(dentry, name, buffer, size);
+
+	ret = btrfs_is_valid_xattr(name);
+	if (ret)
+		return ret;
+	return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
+}
+
+int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		   size_t size, int flags)
+{
+	struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
+	int ret;
+
+	/*
+	 * The permission on security.* and system.* is not checked
+	 * in permission().
+	 */
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_setxattr(dentry, name, value, size, flags);
+
+	ret = btrfs_is_valid_xattr(name);
+	if (ret)
+		return ret;
+
+	if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+		return btrfs_set_prop(d_inode(dentry), name,
+				      value, size, flags);
+
+	if (size == 0)
+		value = "";  /* empty EA, do not remove */
+
+	return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
+				flags);
+}
+
+int btrfs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
+	int ret;
+
+	/*
+	 * The permission on security.* and system.* is not checked
+	 * in permission().
+	 */
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_removexattr(dentry, name);
+
+	ret = btrfs_is_valid_xattr(name);
+	if (ret)
+		return ret;
+
+	if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+		return btrfs_set_prop(d_inode(dentry), name,
+				      NULL, 0, XATTR_REPLACE);
+
+	return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
+				XATTR_REPLACE);
+}
+
+static int btrfs_initxattrs(struct inode *inode,
+			    const struct xattr *xattr_array, void *fs_info)
+{
+	const struct xattr *xattr;
+	struct btrfs_trans_handle *trans = fs_info;
+	char *name;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+			       strlen(xattr->name) + 1, GFP_NOFS);
+		if (!name) {
+			err = -ENOMEM;
+			break;
+		}
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+		err = __btrfs_setxattr(trans, inode, name,
+				       xattr->value, xattr->value_len, 0);
+		kfree(name);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+			      struct inode *inode, struct inode *dir,
+			      const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &btrfs_initxattrs, trans);
+}
diff --git a/kernel/fs/btrfs/xattr.h b/kernel/fs/btrfs/xattr.h
new file mode 100644
index 000000000..5049608d1
--- /dev/null
+++ b/kernel/fs/btrfs/xattr.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __XATTR__
+#define __XATTR__
+
+#include <linux/xattr.h>
+
+extern const struct xattr_handler *btrfs_xattr_handlers[];
+
+extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+		void *buffer, size_t size);
+extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+			    struct inode *inode, const char *name,
+			    const void *value, size_t size, int flags);
+extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+		void *buffer, size_t size);
+extern int btrfs_setxattr(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags);
+extern int btrfs_removexattr(struct dentry *dentry, const char *name);
+
+extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+				     struct inode *inode, struct inode *dir,
+				     const struct qstr *qstr);
+
+#endif /* __XATTR__ */
diff --git a/kernel/fs/btrfs/zlib.c b/kernel/fs/btrfs/zlib.c
new file mode 100644
index 000000000..82990b8f8
--- /dev/null
+++ b/kernel/fs/btrfs/zlib.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include "compression.h"
+
+struct workspace {
+	z_stream strm;
+	char *buf;
+	struct list_head list;
+};
+
+static void zlib_free_workspace(struct list_head *ws)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+	vfree(workspace->strm.workspace);
+	kfree(workspace->buf);
+	kfree(workspace);
+}
+
+static struct list_head *zlib_alloc_workspace(void)
+{
+	struct workspace *workspace;
+	int workspacesize;
+
+	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+	if (!workspace)
+		return ERR_PTR(-ENOMEM);
+
+	workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+			zlib_inflate_workspacesize());
+	workspace->strm.workspace = vmalloc(workspacesize);
+	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+	if (!workspace->strm.workspace || !workspace->buf)
+		goto fail;
+
+	INIT_LIST_HEAD(&workspace->list);
+
+	return &workspace->list;
+fail:
+	zlib_free_workspace(&workspace->list);
+	return ERR_PTR(-ENOMEM);
+}
+
+static int zlib_compress_pages(struct list_head *ws,
+			       struct address_space *mapping,
+			       u64 start, unsigned long len,
+			       struct page **pages,
+			       unsigned long nr_dest_pages,
+			       unsigned long *out_pages,
+			       unsigned long *total_in,
+			       unsigned long *total_out,
+			       unsigned long max_out)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	int ret;
+	char *data_in;
+	char *cpage_out;
+	int nr_pages = 0;
+	struct page *in_page = NULL;
+	struct page *out_page = NULL;
+	unsigned long bytes_left;
+
+	*out_pages = 0;
+	*total_out = 0;
+	*total_in = 0;
+
+	if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
+		printk(KERN_WARNING "BTRFS: deflateInit failed\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	workspace->strm.total_in = 0;
+	workspace->strm.total_out = 0;
+
+	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+	data_in = kmap(in_page);
+
+	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (out_page == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	cpage_out = kmap(out_page);
+	pages[0] = out_page;
+	nr_pages = 1;
+
+	workspace->strm.next_in = data_in;
+	workspace->strm.next_out = cpage_out;
+	workspace->strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+	while (workspace->strm.total_in < len) {
+		ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
+		if (ret != Z_OK) {
+			printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
+			       ret);
+			zlib_deflateEnd(&workspace->strm);
+			ret = -EIO;
+			goto out;
+		}
+
+		/* we're making it bigger, give up */
+		if (workspace->strm.total_in > 8192 &&
+		    workspace->strm.total_in <
+		    workspace->strm.total_out) {
+			ret = -E2BIG;
+			goto out;
+		}
+		/* we need another page for writing out.  Test this
+		 * before the total_in so we will pull in a new page for
+		 * the stream end if required
+		 */
+		if (workspace->strm.avail_out == 0) {
+			kunmap(out_page);
+			if (nr_pages == nr_dest_pages) {
+				out_page = NULL;
+				ret = -E2BIG;
+				goto out;
+			}
+			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			if (out_page == NULL) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			cpage_out = kmap(out_page);
+			pages[nr_pages] = out_page;
+			nr_pages++;
+			workspace->strm.avail_out = PAGE_CACHE_SIZE;
+			workspace->strm.next_out = cpage_out;
+		}
+		/* we're all done */
+		if (workspace->strm.total_in >= len)
+			break;
+
+		/* we've read in a full page, get a new one */
+		if (workspace->strm.avail_in == 0) {
+			if (workspace->strm.total_out > max_out)
+				break;
+
+			bytes_left = len - workspace->strm.total_in;
+			kunmap(in_page);
+			page_cache_release(in_page);
+
+			start += PAGE_CACHE_SIZE;
+			in_page = find_get_page(mapping,
+						start >> PAGE_CACHE_SHIFT);
+			data_in = kmap(in_page);
+			workspace->strm.avail_in = min(bytes_left,
+							   PAGE_CACHE_SIZE);
+			workspace->strm.next_in = data_in;
+		}
+	}
+	workspace->strm.avail_in = 0;
+	ret = zlib_deflate(&workspace->strm, Z_FINISH);
+	zlib_deflateEnd(&workspace->strm);
+
+	if (ret != Z_STREAM_END) {
+		ret = -EIO;
+		goto out;
+	}
+
+	if (workspace->strm.total_out >= workspace->strm.total_in) {
+		ret = -E2BIG;
+		goto out;
+	}
+
+	ret = 0;
+	*total_out = workspace->strm.total_out;
+	*total_in = workspace->strm.total_in;
+out:
+	*out_pages = nr_pages;
+	if (out_page)
+		kunmap(out_page);
+
+	if (in_page) {
+		kunmap(in_page);
+		page_cache_release(in_page);
+	}
+	return ret;
+}
+
+static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
+				  u64 disk_start,
+				  struct bio_vec *bvec,
+				  int vcnt,
+				  size_t srclen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	int ret = 0, ret2;
+	int wbits = MAX_WBITS;
+	char *data_in;
+	size_t total_out = 0;
+	unsigned long page_in_index = 0;
+	unsigned long page_out_index = 0;
+	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+	unsigned long buf_start;
+	unsigned long pg_offset;
+
+	data_in = kmap(pages_in[page_in_index]);
+	workspace->strm.next_in = data_in;
+	workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+	workspace->strm.total_in = 0;
+
+	workspace->strm.total_out = 0;
+	workspace->strm.next_out = workspace->buf;
+	workspace->strm.avail_out = PAGE_CACHE_SIZE;
+	pg_offset = 0;
+
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		wbits = -((data_in[0] >> 4) + 8);
+		workspace->strm.next_in += 2;
+		workspace->strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
+		printk(KERN_WARNING "BTRFS: inflateInit failed\n");
+		return -EIO;
+	}
+	while (workspace->strm.total_in < srclen) {
+		ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END)
+			break;
+
+		buf_start = total_out;
+		total_out = workspace->strm.total_out;
+
+		/* we didn't make progress in this inflate call, we're done */
+		if (buf_start == total_out)
+			break;
+
+		ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+						 total_out, disk_start,
+						 bvec, vcnt,
+						 &page_out_index, &pg_offset);
+		if (ret2 == 0) {
+			ret = 0;
+			goto done;
+		}
+
+		workspace->strm.next_out = workspace->buf;
+		workspace->strm.avail_out = PAGE_CACHE_SIZE;
+
+		if (workspace->strm.avail_in == 0) {
+			unsigned long tmp;
+			kunmap(pages_in[page_in_index]);
+			page_in_index++;
+			if (page_in_index >= total_pages_in) {
+				data_in = NULL;
+				break;
+			}
+			data_in = kmap(pages_in[page_in_index]);
+			workspace->strm.next_in = data_in;
+			tmp = srclen - workspace->strm.total_in;
+			workspace->strm.avail_in = min(tmp,
+							   PAGE_CACHE_SIZE);
+		}
+	}
+	if (ret != Z_STREAM_END)
+		ret = -EIO;
+	else
+		ret = 0;
+done:
+	zlib_inflateEnd(&workspace->strm);
+	if (data_in)
+		kunmap(pages_in[page_in_index]);
+	if (!ret)
+		btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset);
+	return ret;
+}
+
+static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+			   struct page *dest_page,
+			   unsigned long start_byte,
+			   size_t srclen, size_t destlen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	int ret = 0;
+	int wbits = MAX_WBITS;
+	unsigned long bytes_left;
+	unsigned long total_out = 0;
+	unsigned long pg_offset = 0;
+	char *kaddr;
+
+	destlen = min_t(unsigned long, destlen, PAGE_SIZE);
+	bytes_left = destlen;
+
+	workspace->strm.next_in = data_in;
+	workspace->strm.avail_in = srclen;
+	workspace->strm.total_in = 0;
+
+	workspace->strm.next_out = workspace->buf;
+	workspace->strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->strm.total_out = 0;
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		wbits = -((data_in[0] >> 4) + 8);
+		workspace->strm.next_in += 2;
+		workspace->strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
+		printk(KERN_WARNING "BTRFS: inflateInit failed\n");
+		return -EIO;
+	}
+
+	while (bytes_left > 0) {
+		unsigned long buf_start;
+		unsigned long buf_offset;
+		unsigned long bytes;
+
+		ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END)
+			break;
+
+		buf_start = total_out;
+		total_out = workspace->strm.total_out;
+
+		if (total_out == buf_start) {
+			ret = -EIO;
+			break;
+		}
+
+		if (total_out <= start_byte)
+			goto next;
+
+		if (total_out > start_byte && buf_start < start_byte)
+			buf_offset = start_byte - buf_start;
+		else
+			buf_offset = 0;
+
+		bytes = min(PAGE_CACHE_SIZE - pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, bytes_left);
+
+		kaddr = kmap_atomic(dest_page);
+		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+		kunmap_atomic(kaddr);
+
+		pg_offset += bytes;
+		bytes_left -= bytes;
+next:
+		workspace->strm.next_out = workspace->buf;
+		workspace->strm.avail_out = PAGE_CACHE_SIZE;
+	}
+
+	if (ret != Z_STREAM_END && bytes_left != 0)
+		ret = -EIO;
+	else
+		ret = 0;
+
+	zlib_inflateEnd(&workspace->strm);
+
+	/*
+	 * this should only happen if zlib returned fewer bytes than we
+	 * expected.  btrfs_get_block is responsible for zeroing from the
+	 * end of the inline extent (destlen) to the end of the page
+	 */
+	if (pg_offset < destlen) {
+		kaddr = kmap_atomic(dest_page);
+		memset(kaddr + pg_offset, 0, destlen - pg_offset);
+		kunmap_atomic(kaddr);
+	}
+	return ret;
+}
+
+const struct btrfs_compress_op btrfs_zlib_compress = {
+	.alloc_workspace	= zlib_alloc_workspace,
+	.free_workspace		= zlib_free_workspace,
+	.compress_pages		= zlib_compress_pages,
+	.decompress_biovec	= zlib_decompress_biovec,
+	.decompress		= zlib_decompress,
+};
diff --git a/kernel/fs/buffer.c b/kernel/fs/buffer.c
new file mode 100644
index 000000000..2907544c3
--- /dev/null
+++ b/kernel/fs/buffer.c
@@ -0,0 +1,3422 @@
+/*
+ *  linux/fs/buffer.c
+ *
+ *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
+ */
+
+/*
+ * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
+ *
+ * Removed a lot of unnecessary code and simplified things now that
+ * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
+ *
+ * Speed up hash, lru, and free list operations.  Use gfp() for allocating
+ * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
+ *
+ * Added 32k buffer block sizes - these are required older ARM systems. - RMK
+ *
+ * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
+ */
+
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/capability.h>
+#include <linux/blkdev.h>
+#include <linux/file.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/export.h>
+#include <linux/writeback.h>
+#include <linux/hash.h>
+#include <linux/suspend.h>
+#include <linux/buffer_head.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/bio.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+#include <linux/mpage.h>
+#include <linux/bit_spinlock.h>
+#include <trace/events/block.h>
+
+static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
+
+#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
+
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+{
+	bh->b_end_io = handler;
+	bh->b_private = private;
+}
+EXPORT_SYMBOL(init_buffer);
+
+inline void touch_buffer(struct buffer_head *bh)
+{
+	trace_block_touch_buffer(bh);
+	mark_page_accessed(bh->b_page);
+}
+EXPORT_SYMBOL(touch_buffer);
+
+void __lock_buffer(struct buffer_head *bh)
+{
+	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(__lock_buffer);
+
+void unlock_buffer(struct buffer_head *bh)
+{
+	clear_bit_unlock(BH_Lock, &bh->b_state);
+	smp_mb__after_atomic();
+	wake_up_bit(&bh->b_state, BH_Lock);
+}
+EXPORT_SYMBOL(unlock_buffer);
+
+/*
+ * Returns if the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the pages are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback)
+{
+	struct buffer_head *head, *bh;
+	*dirty = false;
+	*writeback = false;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		return;
+
+	if (PageWriteback(page))
+		*writeback = true;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_locked(bh))
+			*writeback = true;
+
+		if (buffer_dirty(bh))
+			*dirty = true;
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
+/*
+ * Block until a buffer comes unlocked.  This doesn't stop it
+ * from becoming locked again - you have to lock it yourself
+ * if you want to preserve its state.
+ */
+void __wait_on_buffer(struct buffer_head * bh)
+{
+	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(__wait_on_buffer);
+
+static void
+__clear_page_buffers(struct page *page)
+{
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page_cache_release(page);
+}
+
+static void buffer_io_error(struct buffer_head *bh, char *msg)
+{
+	char b[BDEVNAME_SIZE];
+
+	if (!test_bit(BH_Quiet, &bh->b_state))
+		printk_ratelimited(KERN_ERR
+			"Buffer I/O error on dev %s, logical block %llu%s\n",
+			bdevname(bh->b_bdev, b),
+			(unsigned long long)bh->b_blocknr, msg);
+}
+
+/*
+ * End-of-IO handler helper function which does not touch the bh after
+ * unlocking it.
+ * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
+ * a race there is benign: unlock_buffer() only use the bh's address for
+ * hashing after unlocking the buffer, so it doesn't actually touch the bh
+ * itself.
+ */
+static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		/* This happens, due to failed READA attempts. */
+		clear_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+}
+
+/*
+ * Default synchronous end-of-IO handler..  Just mark it up-to-date and
+ * unlock the buffer. This is what ll_rw_block uses too.
+ */
+void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
+{
+	__end_buffer_read_notouch(bh, uptodate);
+	put_bh(bh);
+}
+EXPORT_SYMBOL(end_buffer_read_sync);
+
+void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		buffer_io_error(bh, ", lost sync page write");
+		set_buffer_write_io_error(bh);
+		clear_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+EXPORT_SYMBOL(end_buffer_write_sync);
+
+/*
+ * Various filesystems appear to want __find_get_block to be non-blocking.
+ * But it's the page lock which protects the buffers.  To get around this,
+ * we get exclusion from try_to_free_buffers with the blockdev mapping's
+ * private_lock.
+ *
+ * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
+ * may be quite high.  This code could TryLock the page, and if that
+ * succeeds, there is no need to take private_lock. (But if
+ * private_lock is contended then so is mapping->tree_lock).
+ */
+static struct buffer_head *
+__find_get_block_slow(struct block_device *bdev, sector_t block)
+{
+	struct inode *bd_inode = bdev->bd_inode;
+	struct address_space *bd_mapping = bd_inode->i_mapping;
+	struct buffer_head *ret = NULL;
+	pgoff_t index;
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	struct page *page;
+	int all_mapped = 1;
+
+	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
+	if (!page)
+		goto out;
+
+	spin_lock(&bd_mapping->private_lock);
+	if (!page_has_buffers(page))
+		goto out_unlock;
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (!buffer_mapped(bh))
+			all_mapped = 0;
+		else if (bh->b_blocknr == block) {
+			ret = bh;
+			get_bh(bh);
+			goto out_unlock;
+		}
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	/* we might be here because some of the buffers on this page are
+	 * not mapped.  This is due to various races between
+	 * file io on the block device and getblk.  It gets dealt with
+	 * elsewhere, don't buffer_error if we had some unmapped buffers
+	 */
+	if (all_mapped) {
+		char b[BDEVNAME_SIZE];
+
+		printk("__find_get_block_slow() failed. "
+			"block=%llu, b_blocknr=%llu\n",
+			(unsigned long long)block,
+			(unsigned long long)bh->b_blocknr);
+		printk("b_state=0x%08lx, b_size=%zu\n",
+			bh->b_state, bh->b_size);
+		printk("device %s blocksize: %d\n", bdevname(bdev, b),
+			1 << bd_inode->i_blkbits);
+	}
+out_unlock:
+	spin_unlock(&bd_mapping->private_lock);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+/*
+ * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
+ */
+static void free_more_memory(void)
+{
+	struct zone *zone;
+	int nid;
+
+	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
+	yield();
+
+	for_each_online_node(nid) {
+		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+						gfp_zone(GFP_NOFS), NULL,
+						&zone);
+		if (zone)
+			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
+						GFP_NOFS, NULL);
+	}
+}
+
+/*
+ * I/O completion handler for block_read_full_page() - pages
+ * which come unlocked at the end of I/O.
+ */
+static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
+{
+	unsigned long flags;
+	struct buffer_head *first;
+	struct buffer_head *tmp;
+	struct page *page;
+	int page_uptodate = 1;
+
+	BUG_ON(!buffer_async_read(bh));
+
+	page = bh->b_page;
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		clear_buffer_uptodate(bh);
+		buffer_io_error(bh, ", async page read");
+		SetPageError(page);
+	}
+
+	/*
+	 * Be _very_ careful from here on. Bad things can happen if
+	 * two buffer heads end IO at almost the same time and both
+	 * decide that the page is now completely done.
+	 */
+	first = page_buffers(page);
+	flags = bh_uptodate_lock_irqsave(first);
+	clear_buffer_async_read(bh);
+	unlock_buffer(bh);
+	tmp = bh;
+	do {
+		if (!buffer_uptodate(tmp))
+			page_uptodate = 0;
+		if (buffer_async_read(tmp)) {
+			BUG_ON(!buffer_locked(tmp));
+			goto still_busy;
+		}
+		tmp = tmp->b_this_page;
+	} while (tmp != bh);
+	bh_uptodate_unlock_irqrestore(first, flags);
+
+	/*
+	 * If none of the buffers had errors and they are all
+	 * uptodate then we can set the page uptodate.
+	 */
+	if (page_uptodate && !PageError(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+	return;
+
+still_busy:
+	bh_uptodate_unlock_irqrestore(first, flags);
+}
+
+/*
+ * Completion handler for block_write_full_page() - pages which are unlocked
+ * during I/O, and which have PageWriteback cleared upon I/O completion.
+ */
+void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+{
+	unsigned long flags;
+	struct buffer_head *first;
+	struct buffer_head *tmp;
+	struct page *page;
+
+	BUG_ON(!buffer_async_write(bh));
+
+	page = bh->b_page;
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		buffer_io_error(bh, ", lost async page write");
+		set_bit(AS_EIO, &page->mapping->flags);
+		set_buffer_write_io_error(bh);
+		clear_buffer_uptodate(bh);
+		SetPageError(page);
+	}
+
+	first = page_buffers(page);
+	flags = bh_uptodate_lock_irqsave(first);
+
+	clear_buffer_async_write(bh);
+	unlock_buffer(bh);
+	tmp = bh->b_this_page;
+	while (tmp != bh) {
+		if (buffer_async_write(tmp)) {
+			BUG_ON(!buffer_locked(tmp));
+			goto still_busy;
+		}
+		tmp = tmp->b_this_page;
+	}
+	bh_uptodate_unlock_irqrestore(first, flags);
+	end_page_writeback(page);
+	return;
+
+still_busy:
+	bh_uptodate_unlock_irqrestore(first, flags);
+}
+EXPORT_SYMBOL(end_buffer_async_write);
+
+/*
+ * If a page's buffers are under async readin (end_buffer_async_read
+ * completion) then there is a possibility that another thread of
+ * control could lock one of the buffers after it has completed
+ * but while some of the other buffers have not completed.  This
+ * locked buffer would confuse end_buffer_async_read() into not unlocking
+ * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
+ * that this buffer is not under async I/O.
+ *
+ * The page comes unlocked when it has no locked buffer_async buffers
+ * left.
+ *
+ * PageLocked prevents anyone starting new async I/O reads any of
+ * the buffers.
+ *
+ * PageWriteback is used to prevent simultaneous writeout of the same
+ * page.
+ *
+ * PageLocked prevents anyone from starting writeback of a page which is
+ * under read I/O (PageWriteback is only ever set against a locked page).
+ */
+static void mark_buffer_async_read(struct buffer_head *bh)
+{
+	bh->b_end_io = end_buffer_async_read;
+	set_buffer_async_read(bh);
+}
+
+static void mark_buffer_async_write_endio(struct buffer_head *bh,
+					  bh_end_io_t *handler)
+{
+	bh->b_end_io = handler;
+	set_buffer_async_write(bh);
+}
+
+void mark_buffer_async_write(struct buffer_head *bh)
+{
+	mark_buffer_async_write_endio(bh, end_buffer_async_write);
+}
+EXPORT_SYMBOL(mark_buffer_async_write);
+
+
+/*
+ * fs/buffer.c contains helper functions for buffer-backed address space's
+ * fsync functions.  A common requirement for buffer-based filesystems is
+ * that certain data from the backing blockdev needs to be written out for
+ * a successful fsync().  For example, ext2 indirect blocks need to be
+ * written back and waited upon before fsync() returns.
+ *
+ * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
+ * inode_has_buffers() and invalidate_inode_buffers() are provided for the
+ * management of a list of dependent buffers at ->i_mapping->private_list.
+ *
+ * Locking is a little subtle: try_to_free_buffers() will remove buffers
+ * from their controlling inode's queue when they are being freed.  But
+ * try_to_free_buffers() will be operating against the *blockdev* mapping
+ * at the time, not against the S_ISREG file which depends on those buffers.
+ * So the locking for private_list is via the private_lock in the address_space
+ * which backs the buffers.  Which is different from the address_space 
+ * against which the buffers are listed.  So for a particular address_space,
+ * mapping->private_lock does *not* protect mapping->private_list!  In fact,
+ * mapping->private_list will always be protected by the backing blockdev's
+ * ->private_lock.
+ *
+ * Which introduces a requirement: all buffers on an address_space's
+ * ->private_list must be from the same address_space: the blockdev's.
+ *
+ * address_spaces which do not place buffers at ->private_list via these
+ * utility functions are free to use private_lock and private_list for
+ * whatever they want.  The only requirement is that list_empty(private_list)
+ * be true at clear_inode() time.
+ *
+ * FIXME: clear_inode should not call invalidate_inode_buffers().  The
+ * filesystems should do that.  invalidate_inode_buffers() should just go
+ * BUG_ON(!list_empty).
+ *
+ * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
+ * take an address_space, not an inode.  And it should be called
+ * mark_buffer_dirty_fsync() to clearly define why those buffers are being
+ * queued up.
+ *
+ * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
+ * list if it is already on a list.  Because if the buffer is on a list,
+ * it *must* already be on the right one.  If not, the filesystem is being
+ * silly.  This will save a ton of locking.  But first we have to ensure
+ * that buffers are taken *off* the old inode's list when they are freed
+ * (presumably in truncate).  That requires careful auditing of all
+ * filesystems (do it inside bforget()).  It could also be done by bringing
+ * b_inode back.
+ */
+
+/*
+ * The buffer's backing address_space's private_lock must be held
+ */
+static void __remove_assoc_queue(struct buffer_head *bh)
+{
+	list_del_init(&bh->b_assoc_buffers);
+	WARN_ON(!bh->b_assoc_map);
+	if (buffer_write_io_error(bh))
+		set_bit(AS_EIO, &bh->b_assoc_map->flags);
+	bh->b_assoc_map = NULL;
+}
+
+int inode_has_buffers(struct inode *inode)
+{
+	return !list_empty(&inode->i_data.private_list);
+}
+
+/*
+ * osync is designed to support O_SYNC io.  It waits synchronously for
+ * all already-submitted IO to complete, but does not queue any new
+ * writes to the disk.
+ *
+ * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
+ * you dirty the buffers, and then use osync_inode_buffers to wait for
+ * completion.  Any other dirty buffers which are not yet queued for
+ * write will not be flushed to disk by the osync.
+ */
+static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
+{
+	struct buffer_head *bh;
+	struct list_head *p;
+	int err = 0;
+
+	spin_lock(lock);
+repeat:
+	list_for_each_prev(p, list) {
+		bh = BH_ENTRY(p);
+		if (buffer_locked(bh)) {
+			get_bh(bh);
+			spin_unlock(lock);
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				err = -EIO;
+			brelse(bh);
+			spin_lock(lock);
+			goto repeat;
+		}
+	}
+	spin_unlock(lock);
+	return err;
+}
+
+static void do_thaw_one(struct super_block *sb, void *unused)
+{
+	char b[BDEVNAME_SIZE];
+	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+		printk(KERN_WARNING "Emergency Thaw on %s\n",
+		       bdevname(sb->s_bdev, b));
+}
+
+static void do_thaw_all(struct work_struct *work)
+{
+	iterate_supers(do_thaw_one, NULL);
+	kfree(work);
+	printk(KERN_WARNING "Emergency Thaw complete\n");
+}
+
+/**
+ * emergency_thaw_all -- forcibly thaw every frozen filesystem
+ *
+ * Used for emergency unfreeze of all filesystems via SysRq
+ */
+void emergency_thaw_all(void)
+{
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_thaw_all);
+		schedule_work(work);
+	}
+}
+
+/**
+ * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
+ * @mapping: the mapping which wants those buffers written
+ *
+ * Starts I/O against the buffers at mapping->private_list, and waits upon
+ * that I/O.
+ *
+ * Basically, this is a convenience function for fsync().
+ * @mapping is a file or directory which needs those buffers to be written for
+ * a successful fsync().
+ */
+int sync_mapping_buffers(struct address_space *mapping)
+{
+	struct address_space *buffer_mapping = mapping->private_data;
+
+	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
+		return 0;
+
+	return fsync_buffers_list(&buffer_mapping->private_lock,
+					&mapping->private_list);
+}
+EXPORT_SYMBOL(sync_mapping_buffers);
+
+/*
+ * Called when we've recently written block `bblock', and it is known that
+ * `bblock' was for a buffer_boundary() buffer.  This means that the block at
+ * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
+ * dirty, schedule it for IO.  So that indirects merge nicely with their data.
+ */
+void write_boundary_block(struct block_device *bdev,
+			sector_t bblock, unsigned blocksize)
+{
+	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+	if (bh) {
+		if (buffer_dirty(bh))
+			ll_rw_block(WRITE, 1, &bh);
+		put_bh(bh);
+	}
+}
+
+void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct address_space *buffer_mapping = bh->b_page->mapping;
+
+	mark_buffer_dirty(bh);
+	if (!mapping->private_data) {
+		mapping->private_data = buffer_mapping;
+	} else {
+		BUG_ON(mapping->private_data != buffer_mapping);
+	}
+	if (!bh->b_assoc_map) {
+		spin_lock(&buffer_mapping->private_lock);
+		list_move_tail(&bh->b_assoc_buffers,
+				&mapping->private_list);
+		bh->b_assoc_map = mapping;
+		spin_unlock(&buffer_mapping->private_lock);
+	}
+}
+EXPORT_SYMBOL(mark_buffer_dirty_inode);
+
+/*
+ * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * dirty.
+ *
+ * If warn is true, then emit a warning if the page is not uptodate and has
+ * not been truncated.
+ */
+static void __set_page_dirty(struct page *page,
+		struct address_space *mapping, int warn)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	if (page->mapping) {	/* Race with truncate? */
+		WARN_ON_ONCE(warn && !PageUptodate(page));
+		account_page_dirtied(page, mapping);
+		radix_tree_tag_set(&mapping->page_tree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+	}
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+}
+
+/*
+ * Add a page to the dirty page list.
+ *
+ * It is a sad fact of life that this function is called from several places
+ * deeply under spinlocking.  It may not sleep.
+ *
+ * If the page has buffers, the uptodate buffers are set dirty, to preserve
+ * dirty-state coherency between the page and the buffers.  It the page does
+ * not have buffers then when they are later attached they will all be set
+ * dirty.
+ *
+ * The buffers are dirtied before the page is dirtied.  There's a small race
+ * window in which a writepage caller may see the page cleanness but not the
+ * buffer dirtiness.  That's fine.  If this code were to set the page dirty
+ * before the buffers, a concurrent writepage caller could clear the page dirty
+ * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
+ * page on the dirty page list.
+ *
+ * We use private_lock to lock against try_to_free_buffers while using the
+ * page's buffer list.  Also use this to protect against clean buffers being
+ * added to the page after it was set dirty.
+ *
+ * FIXME: may need to call ->reservepage here as well.  That's rather up to the
+ * address_space though.
+ */
+int __set_page_dirty_buffers(struct page *page)
+{
+	int newly_dirty;
+	struct address_space *mapping = page_mapping(page);
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			set_buffer_dirty(bh);
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+	newly_dirty = !TestSetPageDirty(page);
+	spin_unlock(&mapping->private_lock);
+
+	if (newly_dirty)
+		__set_page_dirty(page, mapping, 1);
+	return newly_dirty;
+}
+EXPORT_SYMBOL(__set_page_dirty_buffers);
+
+/*
+ * Write out and wait upon a list of buffers.
+ *
+ * We have conflicting pressures: we want to make sure that all
+ * initially dirty buffers get waited on, but that any subsequently
+ * dirtied buffers don't.  After all, we don't want fsync to last
+ * forever if somebody is actively writing to the file.
+ *
+ * Do this in two main stages: first we copy dirty buffers to a
+ * temporary inode list, queueing the writes as we go.  Then we clean
+ * up, waiting for those writes to complete.
+ * 
+ * During this second stage, any subsequent updates to the file may end
+ * up refiling the buffer on the original inode's dirty list again, so
+ * there is a chance we will end up with a buffer queued for write but
+ * not yet completed on that list.  So, as a final cleanup we go through
+ * the osync code to catch these locked, dirty buffers without requeuing
+ * any newly dirty buffers for write.
+ */
+static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
+{
+	struct buffer_head *bh;
+	struct list_head tmp;
+	struct address_space *mapping;
+	int err = 0, err2;
+	struct blk_plug plug;
+
+	INIT_LIST_HEAD(&tmp);
+	blk_start_plug(&plug);
+
+	spin_lock(lock);
+	while (!list_empty(list)) {
+		bh = BH_ENTRY(list->next);
+		mapping = bh->b_assoc_map;
+		__remove_assoc_queue(bh);
+		/* Avoid race with mark_buffer_dirty_inode() which does
+		 * a lockless check and we rely on seeing the dirty bit */
+		smp_mb();
+		if (buffer_dirty(bh) || buffer_locked(bh)) {
+			list_add(&bh->b_assoc_buffers, &tmp);
+			bh->b_assoc_map = mapping;
+			if (buffer_dirty(bh)) {
+				get_bh(bh);
+				spin_unlock(lock);
+				/*
+				 * Ensure any pending I/O completes so that
+				 * write_dirty_buffer() actually writes the
+				 * current contents - it is a noop if I/O is
+				 * still in flight on potentially older
+				 * contents.
+				 */
+				write_dirty_buffer(bh, WRITE_SYNC);
+
+				/*
+				 * Kick off IO for the previous mapping. Note
+				 * that we will not run the very last mapping,
+				 * wait_on_buffer() will do that for us
+				 * through sync_buffer().
+				 */
+				brelse(bh);
+				spin_lock(lock);
+			}
+		}
+	}
+
+	spin_unlock(lock);
+	blk_finish_plug(&plug);
+	spin_lock(lock);
+
+	while (!list_empty(&tmp)) {
+		bh = BH_ENTRY(tmp.prev);
+		get_bh(bh);
+		mapping = bh->b_assoc_map;
+		__remove_assoc_queue(bh);
+		/* Avoid race with mark_buffer_dirty_inode() which does
+		 * a lockless check and we rely on seeing the dirty bit */
+		smp_mb();
+		if (buffer_dirty(bh)) {
+			list_add(&bh->b_assoc_buffers,
+				 &mapping->private_list);
+			bh->b_assoc_map = mapping;
+		}
+		spin_unlock(lock);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			err = -EIO;
+		brelse(bh);
+		spin_lock(lock);
+	}
+	
+	spin_unlock(lock);
+	err2 = osync_buffers_list(lock, list);
+	if (err)
+		return err;
+	else
+		return err2;
+}
+
+/*
+ * Invalidate any and all dirty buffers on a given inode.  We are
+ * probably unmounting the fs, but that doesn't mean we have already
+ * done a sync().  Just drop the buffers from the inode list.
+ *
+ * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
+ * assumes that all the buffers are against the blockdev.  Not true
+ * for reiserfs.
+ */
+void invalidate_inode_buffers(struct inode *inode)
+{
+	if (inode_has_buffers(inode)) {
+		struct address_space *mapping = &inode->i_data;
+		struct list_head *list = &mapping->private_list;
+		struct address_space *buffer_mapping = mapping->private_data;
+
+		spin_lock(&buffer_mapping->private_lock);
+		while (!list_empty(list))
+			__remove_assoc_queue(BH_ENTRY(list->next));
+		spin_unlock(&buffer_mapping->private_lock);
+	}
+}
+EXPORT_SYMBOL(invalidate_inode_buffers);
+
+/*
+ * Remove any clean buffers from the inode's buffer list.  This is called
+ * when we're trying to free the inode itself.  Those buffers can pin it.
+ *
+ * Returns true if all buffers were removed.
+ */
+int remove_inode_buffers(struct inode *inode)
+{
+	int ret = 1;
+
+	if (inode_has_buffers(inode)) {
+		struct address_space *mapping = &inode->i_data;
+		struct list_head *list = &mapping->private_list;
+		struct address_space *buffer_mapping = mapping->private_data;
+
+		spin_lock(&buffer_mapping->private_lock);
+		while (!list_empty(list)) {
+			struct buffer_head *bh = BH_ENTRY(list->next);
+			if (buffer_dirty(bh)) {
+				ret = 0;
+				break;
+			}
+			__remove_assoc_queue(bh);
+		}
+		spin_unlock(&buffer_mapping->private_lock);
+	}
+	return ret;
+}
+
+/*
+ * Create the appropriate buffers when given a page for data area and
+ * the size of each buffer.. Use the bh->b_this_page linked list to
+ * follow the buffers created.  Return NULL if unable to create more
+ * buffers.
+ *
+ * The retry flag is used to differentiate async IO (paging, swapping)
+ * which may not fail from ordinary buffer allocations.
+ */
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
+		int retry)
+{
+	struct buffer_head *bh, *head;
+	long offset;
+
+try_again:
+	head = NULL;
+	offset = PAGE_SIZE;
+	while ((offset -= size) >= 0) {
+		bh = alloc_buffer_head(GFP_NOFS);
+		if (!bh)
+			goto no_grow;
+
+		bh->b_this_page = head;
+		bh->b_blocknr = -1;
+		head = bh;
+
+		bh->b_size = size;
+
+		/* Link the buffer to its page */
+		set_bh_page(bh, page, offset);
+	}
+	return head;
+/*
+ * In case anything failed, we just free everything we got.
+ */
+no_grow:
+	if (head) {
+		do {
+			bh = head;
+			head = head->b_this_page;
+			free_buffer_head(bh);
+		} while (head);
+	}
+
+	/*
+	 * Return failure for non-async IO requests.  Async IO requests
+	 * are not allowed to fail, so we have to wait until buffer heads
+	 * become available.  But we don't want tasks sleeping with 
+	 * partially complete buffers, so all were released above.
+	 */
+	if (!retry)
+		return NULL;
+
+	/* We're _really_ low on memory. Now we just
+	 * wait for old buffer heads to become free due to
+	 * finishing IO.  Since this is an async request and
+	 * the reserve list is empty, we're sure there are 
+	 * async buffer heads in use.
+	 */
+	free_more_memory();
+	goto try_again;
+}
+EXPORT_SYMBOL_GPL(alloc_page_buffers);
+
+static inline void
+link_dev_buffers(struct page *page, struct buffer_head *head)
+{
+	struct buffer_head *bh, *tail;
+
+	bh = head;
+	do {
+		tail = bh;
+		bh = bh->b_this_page;
+	} while (bh);
+	tail->b_this_page = head;
+	attach_page_buffers(page, head);
+}
+
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+	sector_t retval = ~((sector_t)0);
+	loff_t sz = i_size_read(bdev->bd_inode);
+
+	if (sz) {
+		unsigned int sizebits = blksize_bits(size);
+		retval = (sz >> sizebits);
+	}
+	return retval;
+}
+
+/*
+ * Initialise the state of a blockdev page's buffers.
+ */ 
+static sector_t
+init_page_buffers(struct page *page, struct block_device *bdev,
+			sector_t block, int size)
+{
+	struct buffer_head *head = page_buffers(page);
+	struct buffer_head *bh = head;
+	int uptodate = PageUptodate(page);
+	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
+
+	do {
+		if (!buffer_mapped(bh)) {
+			init_buffer(bh, NULL, NULL);
+			bh->b_bdev = bdev;
+			bh->b_blocknr = block;
+			if (uptodate)
+				set_buffer_uptodate(bh);
+			if (block < end_block)
+				set_buffer_mapped(bh);
+		}
+		block++;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	/*
+	 * Caller needs to validate requested block against end of device.
+	 */
+	return end_block;
+}
+
+/*
+ * Create the page-cache page that contains the requested block.
+ *
+ * This is used purely for blockdev mappings.
+ */
+static int
+grow_dev_page(struct block_device *bdev, sector_t block,
+	      pgoff_t index, int size, int sizebits, gfp_t gfp)
+{
+	struct inode *inode = bdev->bd_inode;
+	struct page *page;
+	struct buffer_head *bh;
+	sector_t end_block;
+	int ret = 0;		/* Will call free_more_memory() */
+	gfp_t gfp_mask;
+
+	gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+
+	/*
+	 * XXX: __getblk_slow() can not really deal with failure and
+	 * will endlessly loop on improvised global reclaim.  Prefer
+	 * looping in the allocator rather than here, at least that
+	 * code knows what it's doing.
+	 */
+	gfp_mask |= __GFP_NOFAIL;
+
+	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
+	if (!page)
+		return ret;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_has_buffers(page)) {
+		bh = page_buffers(page);
+		if (bh->b_size == size) {
+			end_block = init_page_buffers(page, bdev,
+						(sector_t)index << sizebits,
+						size);
+			goto done;
+		}
+		if (!try_to_free_buffers(page))
+			goto failed;
+	}
+
+	/*
+	 * Allocate some buffers for this page
+	 */
+	bh = alloc_page_buffers(page, size, 0);
+	if (!bh)
+		goto failed;
+
+	/*
+	 * Link the page to the buffers and initialise them.  Take the
+	 * lock to be atomic wrt __find_get_block(), which does not
+	 * run under the page lock.
+	 */
+	spin_lock(&inode->i_mapping->private_lock);
+	link_dev_buffers(page, bh);
+	end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
+			size);
+	spin_unlock(&inode->i_mapping->private_lock);
+done:
+	ret = (block < end_block) ? 1 : -ENXIO;
+failed:
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+/*
+ * Create buffers for the specified block device block's page.  If
+ * that page was dirty, the buffers are set dirty also.
+ */
+static int
+grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
+{
+	pgoff_t index;
+	int sizebits;
+
+	sizebits = -1;
+	do {
+		sizebits++;
+	} while ((size << sizebits) < PAGE_SIZE);
+
+	index = block >> sizebits;
+
+	/*
+	 * Check for a block which wants to lie outside our maximum possible
+	 * pagecache index.  (this comparison is done using sector_t types).
+	 */
+	if (unlikely(index != block >> sizebits)) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_ERR "%s: requested out-of-range block %llu for "
+			"device %s\n",
+			__func__, (unsigned long long)block,
+			bdevname(bdev, b));
+		return -EIO;
+	}
+
+	/* Create a page with the proper size buffers.. */
+	return grow_dev_page(bdev, block, index, size, sizebits, gfp);
+}
+
+struct buffer_head *
+__getblk_slow(struct block_device *bdev, sector_t block,
+	     unsigned size, gfp_t gfp)
+{
+	/* Size must be multiple of hard sectorsize */
+	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
+			(size < 512 || size > PAGE_SIZE))) {
+		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
+					size);
+		printk(KERN_ERR "logical block size: %d\n",
+					bdev_logical_block_size(bdev));
+
+		dump_stack();
+		return NULL;
+	}
+
+	for (;;) {
+		struct buffer_head *bh;
+		int ret;
+
+		bh = __find_get_block(bdev, block, size);
+		if (bh)
+			return bh;
+
+		ret = grow_buffers(bdev, block, size, gfp);
+		if (ret < 0)
+			return NULL;
+		if (ret == 0)
+			free_more_memory();
+	}
+}
+EXPORT_SYMBOL(__getblk_slow);
+
+/*
+ * The relationship between dirty buffers and dirty pages:
+ *
+ * Whenever a page has any dirty buffers, the page's dirty bit is set, and
+ * the page is tagged dirty in its radix tree.
+ *
+ * At all times, the dirtiness of the buffers represents the dirtiness of
+ * subsections of the page.  If the page has buffers, the page dirty bit is
+ * merely a hint about the true dirty state.
+ *
+ * When a page is set dirty in its entirety, all its buffers are marked dirty
+ * (if the page has buffers).
+ *
+ * When a buffer is marked dirty, its page is dirtied, but the page's other
+ * buffers are not.
+ *
+ * Also.  When blockdev buffers are explicitly read with bread(), they
+ * individually become uptodate.  But their backing page remains not
+ * uptodate - even if all of its buffers are uptodate.  A subsequent
+ * block_read_full_page() against that page will discover all the uptodate
+ * buffers, will set the page uptodate and will perform no I/O.
+ */
+
+/**
+ * mark_buffer_dirty - mark a buffer_head as needing writeout
+ * @bh: the buffer_head to mark dirty
+ *
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
+ * backing page dirty, then tag the page as dirty in its address_space's radix
+ * tree and then attach the address_space's inode to its superblock's dirty
+ * inode list.
+ *
+ * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
+ * mapping->tree_lock and mapping->host->i_lock.
+ */
+void mark_buffer_dirty(struct buffer_head *bh)
+{
+	WARN_ON_ONCE(!buffer_uptodate(bh));
+
+	trace_block_dirty_buffer(bh);
+
+	/*
+	 * Very *carefully* optimize the it-is-already-dirty case.
+	 *
+	 * Don't let the final "is it dirty" escape to before we
+	 * perhaps modified the buffer.
+	 */
+	if (buffer_dirty(bh)) {
+		smp_mb();
+		if (buffer_dirty(bh))
+			return;
+	}
+
+	if (!test_set_buffer_dirty(bh)) {
+		struct page *page = bh->b_page;
+		if (!TestSetPageDirty(page)) {
+			struct address_space *mapping = page_mapping(page);
+			if (mapping)
+				__set_page_dirty(page, mapping, 0);
+		}
+	}
+}
+EXPORT_SYMBOL(mark_buffer_dirty);
+
+/*
+ * Decrement a buffer_head's reference count.  If all buffers against a page
+ * have zero reference count, are clean and unlocked, and if the page is clean
+ * and unlocked then try_to_free_buffers() may strip the buffers from the page
+ * in preparation for freeing it (sometimes, rarely, buffers are removed from
+ * a page but it ends up not being freed, and buffers may later be reattached).
+ */
+void __brelse(struct buffer_head * buf)
+{
+	if (atomic_read(&buf->b_count)) {
+		put_bh(buf);
+		return;
+	}
+	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
+}
+EXPORT_SYMBOL(__brelse);
+
+/*
+ * bforget() is like brelse(), except it discards any
+ * potentially dirty data.
+ */
+void __bforget(struct buffer_head *bh)
+{
+	clear_buffer_dirty(bh);
+	if (bh->b_assoc_map) {
+		struct address_space *buffer_mapping = bh->b_page->mapping;
+
+		spin_lock(&buffer_mapping->private_lock);
+		list_del_init(&bh->b_assoc_buffers);
+		bh->b_assoc_map = NULL;
+		spin_unlock(&buffer_mapping->private_lock);
+	}
+	__brelse(bh);
+}
+EXPORT_SYMBOL(__bforget);
+
+static struct buffer_head *__bread_slow(struct buffer_head *bh)
+{
+	lock_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		return bh;
+	} else {
+		get_bh(bh);
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, bh);
+		wait_on_buffer(bh);
+		if (buffer_uptodate(bh))
+			return bh;
+	}
+	brelse(bh);
+	return NULL;
+}
+
+/*
+ * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
+ * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
+ * refcount elevated by one when they're in an LRU.  A buffer can only appear
+ * once in a particular CPU's LRU.  A single buffer can be present in multiple
+ * CPU's LRUs at the same time.
+ *
+ * This is a transparent caching front-end to sb_bread(), sb_getblk() and
+ * sb_find_get_block().
+ *
+ * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
+ * a local interrupt disable for that.
+ */
+
+#define BH_LRU_SIZE	16
+
+struct bh_lru {
+	struct buffer_head *bhs[BH_LRU_SIZE];
+};
+
+static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
+
+#ifdef CONFIG_SMP
+#define bh_lru_lock()	local_irq_disable()
+#define bh_lru_unlock()	local_irq_enable()
+#else
+#define bh_lru_lock()	preempt_disable()
+#define bh_lru_unlock()	preempt_enable()
+#endif
+
+static inline void check_irqs_on(void)
+{
+#ifdef irqs_disabled
+	BUG_ON(irqs_disabled());
+#endif
+}
+
+/*
+ * The LRU management algorithm is dopey-but-simple.  Sorry.
+ */
+static void bh_lru_install(struct buffer_head *bh)
+{
+	struct buffer_head *evictee = NULL;
+
+	check_irqs_on();
+	bh_lru_lock();
+	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
+		struct buffer_head *bhs[BH_LRU_SIZE];
+		int in;
+		int out = 0;
+
+		get_bh(bh);
+		bhs[out++] = bh;
+		for (in = 0; in < BH_LRU_SIZE; in++) {
+			struct buffer_head *bh2 =
+				__this_cpu_read(bh_lrus.bhs[in]);
+
+			if (bh2 == bh) {
+				__brelse(bh2);
+			} else {
+				if (out >= BH_LRU_SIZE) {
+					BUG_ON(evictee != NULL);
+					evictee = bh2;
+				} else {
+					bhs[out++] = bh2;
+				}
+			}
+		}
+		while (out < BH_LRU_SIZE)
+			bhs[out++] = NULL;
+		memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
+	}
+	bh_lru_unlock();
+
+	if (evictee)
+		__brelse(evictee);
+}
+
+/*
+ * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
+ */
+static struct buffer_head *
+lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
+{
+	struct buffer_head *ret = NULL;
+	unsigned int i;
+
+	check_irqs_on();
+	bh_lru_lock();
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
+
+		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
+		    bh->b_size == size) {
+			if (i) {
+				while (i) {
+					__this_cpu_write(bh_lrus.bhs[i],
+						__this_cpu_read(bh_lrus.bhs[i - 1]));
+					i--;
+				}
+				__this_cpu_write(bh_lrus.bhs[0], bh);
+			}
+			get_bh(bh);
+			ret = bh;
+			break;
+		}
+	}
+	bh_lru_unlock();
+	return ret;
+}
+
+/*
+ * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
+ * it in the LRU and mark it as accessed.  If it is not present then return
+ * NULL
+ */
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+{
+	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
+
+	if (bh == NULL) {
+		/* __find_get_block_slow will mark the page accessed */
+		bh = __find_get_block_slow(bdev, block);
+		if (bh)
+			bh_lru_install(bh);
+	} else
+		touch_buffer(bh);
+
+	return bh;
+}
+EXPORT_SYMBOL(__find_get_block);
+
+/*
+ * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
+ * which corresponds to the passed block_device, block and size. The
+ * returned buffer has its reference count incremented.
+ *
+ * __getblk_gfp() will lock up the machine if grow_dev_page's
+ * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
+ */
+struct buffer_head *
+__getblk_gfp(struct block_device *bdev, sector_t block,
+	     unsigned size, gfp_t gfp)
+{
+	struct buffer_head *bh = __find_get_block(bdev, block, size);
+
+	might_sleep();
+	if (bh == NULL)
+		bh = __getblk_slow(bdev, block, size, gfp);
+	return bh;
+}
+EXPORT_SYMBOL(__getblk_gfp);
+
+/*
+ * Do async read-ahead on a buffer..
+ */
+void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
+{
+	struct buffer_head *bh = __getblk(bdev, block, size);
+	if (likely(bh)) {
+		ll_rw_block(READA, 1, &bh);
+		brelse(bh);
+	}
+}
+EXPORT_SYMBOL(__breadahead);
+
+/**
+ *  __bread_gfp() - reads a specified block and returns the bh
+ *  @bdev: the block_device to read from
+ *  @block: number of block
+ *  @size: size (in bytes) to read
+ *  @gfp: page allocation flag
+ *
+ *  Reads a specified block, and returns buffer head that contains it.
+ *  The page cache can be allocated from non-movable area
+ *  not to prevent page migration if you set gfp to zero.
+ *  It returns NULL if the block was unreadable.
+ */
+struct buffer_head *
+__bread_gfp(struct block_device *bdev, sector_t block,
+		   unsigned size, gfp_t gfp)
+{
+	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+
+	if (likely(bh) && !buffer_uptodate(bh))
+		bh = __bread_slow(bh);
+	return bh;
+}
+EXPORT_SYMBOL(__bread_gfp);
+
+/*
+ * invalidate_bh_lrus() is called rarely - but not only at unmount.
+ * This doesn't race because it runs in each cpu either in irq
+ * or with preempt disabled.
+ */
+static void invalidate_bh_lru(void *arg)
+{
+	struct bh_lru *b = &get_cpu_var(bh_lrus);
+	int i;
+
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		brelse(b->bhs[i]);
+		b->bhs[i] = NULL;
+	}
+	put_cpu_var(bh_lrus);
+}
+
+static bool has_bh_in_lru(int cpu, void *dummy)
+{
+	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
+	int i;
+	
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		if (b->bhs[i])
+			return 1;
+	}
+
+	return 0;
+}
+
+void invalidate_bh_lrus(void)
+{
+	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
+
+void set_bh_page(struct buffer_head *bh,
+		struct page *page, unsigned long offset)
+{
+	bh->b_page = page;
+	BUG_ON(offset >= PAGE_SIZE);
+	if (PageHighMem(page))
+		/*
+		 * This catches illegal uses and preserves the offset:
+		 */
+		bh->b_data = (char *)(0 + offset);
+	else
+		bh->b_data = page_address(page) + offset;
+}
+EXPORT_SYMBOL(set_bh_page);
+
+/*
+ * Called when truncating a buffer on a page completely.
+ */
+
+/* Bits that are cleared during an invalidate */
+#define BUFFER_FLAGS_DISCARD \
+	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
+	 1 << BH_Delay | 1 << BH_Unwritten)
+
+static void discard_buffer(struct buffer_head * bh)
+{
+	unsigned long b_state, b_state_old;
+
+	lock_buffer(bh);
+	clear_buffer_dirty(bh);
+	bh->b_bdev = NULL;
+	b_state = bh->b_state;
+	for (;;) {
+		b_state_old = cmpxchg(&bh->b_state, b_state,
+				      (b_state & ~BUFFER_FLAGS_DISCARD));
+		if (b_state_old == b_state)
+			break;
+		b_state = b_state_old;
+	}
+	unlock_buffer(bh);
+}
+
+/**
+ * block_invalidatepage - invalidate part or all of a buffer-backed page
+ *
+ * @page: the page which is affected
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
+ *
+ * block_invalidatepage() is called when all or part of the page has become
+ * invalidated by a truncate operation.
+ *
+ * block_invalidatepage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point.  Because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
+ */
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length)
+{
+	struct buffer_head *head, *bh, *next;
+	unsigned int curr_off = 0;
+	unsigned int stop = length + offset;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		goto out;
+
+	/*
+	 * Check for overflow
+	 */
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
+
+		/*
+		 * Are we still fully in range ?
+		 */
+		if (next_off > stop)
+			goto out;
+
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= curr_off)
+			discard_buffer(bh);
+		curr_off = next_off;
+		bh = next;
+	} while (bh != head);
+
+	/*
+	 * We release buffers only if the entire page is being invalidated.
+	 * The get_block cached value has been unconditionally invalidated,
+	 * so real IO is not possible anymore.
+	 */
+	if (offset == 0)
+		try_to_release_page(page, 0);
+out:
+	return;
+}
+EXPORT_SYMBOL(block_invalidatepage);
+
+
+/*
+ * We attach and possibly dirty the buffers atomically wrt
+ * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
+ * is already excluded via the page lock.
+ */
+void create_empty_buffers(struct page *page,
+			unsigned long blocksize, unsigned long b_state)
+{
+	struct buffer_head *bh, *head, *tail;
+
+	head = alloc_page_buffers(page, blocksize, 1);
+	bh = head;
+	do {
+		bh->b_state |= b_state;
+		tail = bh;
+		bh = bh->b_this_page;
+	} while (bh);
+	tail->b_this_page = head;
+
+	spin_lock(&page->mapping->private_lock);
+	if (PageUptodate(page) || PageDirty(page)) {
+		bh = head;
+		do {
+			if (PageDirty(page))
+				set_buffer_dirty(bh);
+			if (PageUptodate(page))
+				set_buffer_uptodate(bh);
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+	attach_page_buffers(page, head);
+	spin_unlock(&page->mapping->private_lock);
+}
+EXPORT_SYMBOL(create_empty_buffers);
+
+/*
+ * We are taking a block for data and we don't want any output from any
+ * buffer-cache aliases starting from return from that function and
+ * until the moment when something will explicitly mark the buffer
+ * dirty (hopefully that will not happen until we will free that block ;-)
+ * We don't even need to mark it not-uptodate - nobody can expect
+ * anything from a newly allocated buffer anyway. We used to used
+ * unmap_buffer() for such invalidation, but that was wrong. We definitely
+ * don't want to mark the alias unmapped, for example - it would confuse
+ * anyone who might pick it with bread() afterwards...
+ *
+ * Also..  Note that bforget() doesn't lock the buffer.  So there can
+ * be writeout I/O going on against recently-freed buffers.  We don't
+ * wait on that I/O in bforget() - it's more efficient to wait on the I/O
+ * only if we really need to.  That happens here.
+ */
+void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
+{
+	struct buffer_head *old_bh;
+
+	might_sleep();
+
+	old_bh = __find_get_block_slow(bdev, block);
+	if (old_bh) {
+		clear_buffer_dirty(old_bh);
+		wait_on_buffer(old_bh);
+		clear_buffer_req(old_bh);
+		__brelse(old_bh);
+	}
+}
+EXPORT_SYMBOL(unmap_underlying_metadata);
+
+/*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+	return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+	return page_buffers(page);
+}
+
+/*
+ * NOTE! All mapped/uptodate combinations are valid:
+ *
+ *	Mapped	Uptodate	Meaning
+ *
+ *	No	No		"unknown" - must do get_block()
+ *	No	Yes		"hole" - zero-filled
+ *	Yes	No		"allocated" - allocated on disk, not read in
+ *	Yes	Yes		"valid" - allocated and up-to-date in memory.
+ *
+ * "Dirty" is valid only with the last case (mapped+uptodate).
+ */
+
+/*
+ * While block_write_full_page is writing back the dirty buffers under
+ * the page lock, whoever dirtied the buffers may decide to clean them
+ * again at any time.  We handle that by only looking at the buffer
+ * state inside lock_buffer().
+ *
+ * If block_write_full_page() is called for regular writeback
+ * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
+ * locked buffer.   This only can happen if someone has written the buffer
+ * directly, with submit_bh().  At the address_space level PageWriteback
+ * prevents this contention from occurring.
+ *
+ * If block_write_full_page() is called with wbc->sync_mode ==
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * causes the writes to be flagged as synchronous writes.
+ */
+static int __block_write_full_page(struct inode *inode, struct page *page,
+			get_block_t *get_block, struct writeback_control *wbc,
+			bh_end_io_t *handler)
+{
+	int err;
+	sector_t block;
+	sector_t last_block;
+	struct buffer_head *bh, *head;
+	unsigned int blocksize, bbits;
+	int nr_underway = 0;
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
+			WRITE_SYNC : WRITE);
+
+	head = create_page_buffers(page, inode,
+					(1 << BH_Dirty)|(1 << BH_Uptodate));
+
+	/*
+	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
+	 * here, and the (potentially unmapped) buffers may become dirty at
+	 * any time.  If a buffer becomes dirty here after we've inspected it
+	 * then we just miss that fact, and the page stays dirty.
+	 *
+	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+	 * handle that here by just cleaning them.
+	 */
+
+	bh = head;
+	blocksize = bh->b_size;
+	bbits = block_size_bits(blocksize);
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	last_block = (i_size_read(inode) - 1) >> bbits;
+
+	/*
+	 * Get all the dirty buffers mapped to disk addresses and
+	 * handle any aliases from the underlying blockdev's mapping.
+	 */
+	do {
+		if (block > last_block) {
+			/*
+			 * mapped buffers outside i_size will occur, because
+			 * this page can be outside i_size when there is a
+			 * truncate in progress.
+			 */
+			/*
+			 * The buffer was zeroed by block_write_full_page()
+			 */
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+			   buffer_dirty(bh)) {
+			WARN_ON(bh->b_size != blocksize);
+			err = get_block(inode, block, bh, 1);
+			if (err)
+				goto recover;
+			clear_buffer_delay(bh);
+			if (buffer_new(bh)) {
+				/* blockdev mappings never come here */
+				clear_buffer_new(bh);
+				unmap_underlying_metadata(bh->b_bdev,
+							bh->b_blocknr);
+			}
+		}
+		bh = bh->b_this_page;
+		block++;
+	} while (bh != head);
+
+	do {
+		if (!buffer_mapped(bh))
+			continue;
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from writeback threads
+		 * and kswapd activity, but those code paths have their own
+		 * higher-level throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE) {
+			lock_buffer(bh);
+		} else if (!trylock_buffer(bh)) {
+			redirty_page_for_writepage(wbc, page);
+			continue;
+		}
+		if (test_clear_buffer_dirty(bh)) {
+			mark_buffer_async_write_endio(bh, handler);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	/*
+	 * The page and its buffers are protected by PageWriteback(), so we can
+	 * drop the bh refcounts early.
+	 */
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(write_op, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+
+	err = 0;
+done:
+	if (nr_underway == 0) {
+		/*
+		 * The page was marked dirty, but the buffers were
+		 * clean.  Someone wrote them back by hand with
+		 * ll_rw_block/submit_bh.  A rare case.
+		 */
+		end_page_writeback(page);
+
+		/*
+		 * The page and buffer_heads can be released at any time from
+		 * here on.
+		 */
+	}
+	return err;
+
+recover:
+	/*
+	 * ENOSPC, or some other error.  We may already have added some
+	 * blocks to the file, so we need to write these out to avoid
+	 * exposing stale data.
+	 * The page is currently locked and not marked for writeback
+	 */
+	bh = head;
+	/* Recovery: lock and submit the mapped buffers */
+	do {
+		if (buffer_mapped(bh) && buffer_dirty(bh) &&
+		    !buffer_delay(bh)) {
+			lock_buffer(bh);
+			mark_buffer_async_write_endio(bh, handler);
+		} else {
+			/*
+			 * The buffer may have been set dirty during
+			 * attachment to a dirty page.
+			 */
+			clear_buffer_dirty(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+	SetPageError(page);
+	BUG_ON(PageWriteback(page));
+	mapping_set_error(page->mapping, err);
+	set_page_writeback(page);
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			clear_buffer_dirty(bh);
+			submit_bh(write_op, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+	goto done;
+}
+
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, size;
+
+					start = max(from, block_start);
+					size = min(to, block_end) - start;
+
+					zero_user(page, start, size);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(page_zero_new_buffers);
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	struct inode *inode = page->mapping->host;
+	unsigned block_start, block_end;
+	sector_t block;
+	int err = 0;
+	unsigned blocksize, bbits;
+	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(from > PAGE_CACHE_SIZE);
+	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > to);
+
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+
+	for(bh = head, block_start = 0; bh != head || !block_start;
+	    block++, block_start=block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (PageUptodate(page)) {
+				if (!buffer_uptodate(bh))
+					set_buffer_uptodate(bh);
+			}
+			continue;
+		}
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+		if (!buffer_mapped(bh)) {
+			WARN_ON(bh->b_size != blocksize);
+			err = get_block(inode, block, bh, 1);
+			if (err)
+				break;
+			if (buffer_new(bh)) {
+				unmap_underlying_metadata(bh->b_bdev,
+							bh->b_blocknr);
+				if (PageUptodate(page)) {
+					clear_buffer_new(bh);
+					set_buffer_uptodate(bh);
+					mark_buffer_dirty(bh);
+					continue;
+				}
+				if (block_end > to || block_start < from)
+					zero_user_segments(page,
+						to, block_end,
+						block_start, from);
+				continue;
+			}
+		}
+		if (PageUptodate(page)) {
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+			continue; 
+		}
+		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+		    !buffer_unwritten(bh) &&
+		     (block_start < from || block_end > to)) {
+			ll_rw_block(READ, 1, &bh);
+			*wait_bh++=bh;
+		}
+	}
+	/*
+	 * If we issued read requests - let them complete.
+	 */
+	while(wait_bh > wait) {
+		wait_on_buffer(*--wait_bh);
+		if (!buffer_uptodate(*wait_bh))
+			err = -EIO;
+	}
+	if (unlikely(err))
+		page_zero_new_buffers(page, from, to);
+	return err;
+}
+EXPORT_SYMBOL(__block_write_begin);
+
+static int __block_commit_write(struct inode *inode, struct page *page,
+		unsigned from, unsigned to)
+{
+	unsigned block_start, block_end;
+	int partial = 0;
+	unsigned blocksize;
+	struct buffer_head *bh, *head;
+
+	bh = head = page_buffers(page);
+	blocksize = bh->b_size;
+
+	block_start = 0;
+	do {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+		} else {
+			set_buffer_uptodate(bh);
+			mark_buffer_dirty(bh);
+		}
+		clear_buffer_new(bh);
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	/*
+	 * If this is a partial write which happened to make all buffers
+	 * uptodate then we can optimize away a bogus readpage() for
+	 * the next read(). Here we 'discover' whether the page went
+	 * uptodate as a result of this (potentially partial) write.
+	 */
+	if (!partial)
+		SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * The filesystem needs to handle block truncation upon failure.
+ */
+int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+		unsigned flags, struct page **pagep, get_block_t *get_block)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct page *page;
+	int status;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	status = __block_write_begin(page, pos, len, get_block);
+	if (unlikely(status)) {
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+	}
+
+	*pagep = page;
+	return status;
+}
+EXPORT_SYMBOL(block_write_begin);
+
+int block_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (unlikely(copied < len)) {
+		/*
+		 * The buffers that were written will now be uptodate, so we
+		 * don't have to worry about a readpage reading them and
+		 * overwriting a partial write. However if we have encountered
+		 * a short write and only partially written into a buffer, it
+		 * will not be marked uptodate, so a readpage might come in and
+		 * destroy our partial write.
+		 *
+		 * Do the simplest thing, and just treat any short write to a
+		 * non uptodate page as a zero-length write, and force the
+		 * caller to redo the whole thing.
+		 */
+		if (!PageUptodate(page))
+			copied = 0;
+
+		page_zero_new_buffers(page, start+copied, start+len);
+	}
+	flush_dcache_page(page);
+
+	/* This could be a short (even 0-length) commit */
+	__block_commit_write(inode, page, start, start+copied);
+
+	return copied;
+}
+EXPORT_SYMBOL(block_write_end);
+
+int generic_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
+	int i_size_changed = 0;
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		i_size_changed = 1;
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
+
+	return copied;
+}
+EXPORT_SYMBOL(generic_write_end);
+
+/*
+ * block_is_partially_uptodate checks whether buffers within a page are
+ * uptodate or not.
+ *
+ * Returns true if all buffers which correspond to a file portion
+ * we want to read are uptodate.
+ */
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+					unsigned long count)
+{
+	unsigned block_start, block_end, blocksize;
+	unsigned to;
+	struct buffer_head *bh, *head;
+	int ret = 1;
+
+	if (!page_has_buffers(page))
+		return 0;
+
+	head = page_buffers(page);
+	blocksize = head->b_size;
+	to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
+	to = from + to;
+	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
+		return 0;
+
+	bh = head;
+	block_start = 0;
+	do {
+		block_end = block_start + blocksize;
+		if (block_end > from && block_start < to) {
+			if (!buffer_uptodate(bh)) {
+				ret = 0;
+				break;
+			}
+			if (block_end >= to)
+				break;
+		}
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	return ret;
+}
+EXPORT_SYMBOL(block_is_partially_uptodate);
+
+/*
+ * Generic "read page" function for block devices that have the normal
+ * get_block functionality. This is most of the block device filesystems.
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * set/clear_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ */
+int block_read_full_page(struct page *page, get_block_t *get_block)
+{
+	struct inode *inode = page->mapping->host;
+	sector_t iblock, lblock;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocksize, bbits;
+	int nr, i;
+	int fully_mapped = 1;
+
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
+
+	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
+	bh = head;
+	nr = 0;
+	i = 0;
+
+	do {
+		if (buffer_uptodate(bh))
+			continue;
+
+		if (!buffer_mapped(bh)) {
+			int err = 0;
+
+			fully_mapped = 0;
+			if (iblock < lblock) {
+				WARN_ON(bh->b_size != blocksize);
+				err = get_block(inode, iblock, bh, 0);
+				if (err)
+					SetPageError(page);
+			}
+			if (!buffer_mapped(bh)) {
+				zero_user(page, i * blocksize, blocksize);
+				if (!err)
+					set_buffer_uptodate(bh);
+				continue;
+			}
+			/*
+			 * get_block() might have updated the buffer
+			 * synchronously
+			 */
+			if (buffer_uptodate(bh))
+				continue;
+		}
+		arr[nr++] = bh;
+	} while (i++, iblock++, (bh = bh->b_this_page) != head);
+
+	if (fully_mapped)
+		SetPageMappedToDisk(page);
+
+	if (!nr) {
+		/*
+		 * All buffers are uptodate - we can set the page uptodate
+		 * as well. But not if get_block() returned an error.
+		 */
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
+
+	/* Stage two: lock the buffers */
+	for (i = 0; i < nr; i++) {
+		bh = arr[i];
+		lock_buffer(bh);
+		mark_buffer_async_read(bh);
+	}
+
+	/*
+	 * Stage 3: start the IO.  Check for uptodateness
+	 * inside the buffer lock in case another process reading
+	 * the underlying blockdev brought it uptodate (the sct fix).
+	 */
+	for (i = 0; i < nr; i++) {
+		bh = arr[i];
+		if (buffer_uptodate(bh))
+			end_buffer_async_read(bh, 1);
+		else
+			submit_bh(READ, bh);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(block_read_full_page);
+
+/* utility function for filesystems that need to do work on expanding
+ * truncates.  Uses filesystem pagecache writes to allow the filesystem to
+ * deal with the hole.  
+ */
+int generic_cont_expand_simple(struct inode *inode, loff_t size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	void *fsdata;
+	int err;
+
+	err = inode_newsize_ok(inode, size);
+	if (err)
+		goto out;
+
+	err = pagecache_write_begin(NULL, mapping, size, 0,
+				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
+				&page, &fsdata);
+	if (err)
+		goto out;
+
+	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
+	BUG_ON(err > 0);
+
+out:
+	return err;
+}
+EXPORT_SYMBOL(generic_cont_expand_simple);
+
+static int cont_expand_zero(struct file *file, struct address_space *mapping,
+			    loff_t pos, loff_t *bytes)
+{
+	struct inode *inode = mapping->host;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	struct page *page;
+	void *fsdata;
+	pgoff_t index, curidx;
+	loff_t curpos;
+	unsigned zerofrom, offset, len;
+	int err = 0;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	offset = pos & ~PAGE_CACHE_MASK;
+
+	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
+		zerofrom = curpos & ~PAGE_CACHE_MASK;
+		if (zerofrom & (blocksize-1)) {
+			*bytes |= (blocksize-1);
+			(*bytes)++;
+		}
+		len = PAGE_CACHE_SIZE - zerofrom;
+
+		err = pagecache_write_begin(file, mapping, curpos, len,
+						AOP_FLAG_UNINTERRUPTIBLE,
+						&page, &fsdata);
+		if (err)
+			goto out;
+		zero_user(page, zerofrom, len);
+		err = pagecache_write_end(file, mapping, curpos, len, len,
+						page, fsdata);
+		if (err < 0)
+			goto out;
+		BUG_ON(err != len);
+		err = 0;
+
+		balance_dirty_pages_ratelimited(mapping);
+
+		if (unlikely(fatal_signal_pending(current))) {
+			err = -EINTR;
+			goto out;
+		}
+	}
+
+	/* page covers the boundary, find the boundary offset */
+	if (index == curidx) {
+		zerofrom = curpos & ~PAGE_CACHE_MASK;
+		/* if we will expand the thing last block will be filled */
+		if (offset <= zerofrom) {
+			goto out;
+		}
+		if (zerofrom & (blocksize-1)) {
+			*bytes |= (blocksize-1);
+			(*bytes)++;
+		}
+		len = offset - zerofrom;
+
+		err = pagecache_write_begin(file, mapping, curpos, len,
+						AOP_FLAG_UNINTERRUPTIBLE,
+						&page, &fsdata);
+		if (err)
+			goto out;
+		zero_user(page, zerofrom, len);
+		err = pagecache_write_end(file, mapping, curpos, len, len,
+						page, fsdata);
+		if (err < 0)
+			goto out;
+		BUG_ON(err != len);
+		err = 0;
+	}
+out:
+	return err;
+}
+
+/*
+ * For moronic filesystems that do not allow holes in file.
+ * We may have to extend the file.
+ */
+int cont_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block, loff_t *bytes)
+{
+	struct inode *inode = mapping->host;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	unsigned zerofrom;
+	int err;
+
+	err = cont_expand_zero(file, mapping, pos, bytes);
+	if (err)
+		return err;
+
+	zerofrom = *bytes & ~PAGE_CACHE_MASK;
+	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
+		*bytes |= (blocksize-1);
+		(*bytes)++;
+	}
+
+	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
+}
+EXPORT_SYMBOL(cont_write_begin);
+
+int block_commit_write(struct page *page, unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	__block_commit_write(inode,page,from,to);
+	return 0;
+}
+EXPORT_SYMBOL(block_commit_write);
+
+/*
+ * block_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF.  Because
+ * truncate writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ *
+ * Direct callers of this function should protect against filesystem freezing
+ * using sb_start_write() - sb_end_write() functions.
+ */
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+			 get_block_t get_block)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	unsigned long end;
+	loff_t size;
+	int ret;
+
+	lock_page(page);
+	size = i_size_read(inode);
+	if ((page->mapping != inode->i_mapping) ||
+	    (page_offset(page) > size)) {
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/* page is wholly or partially inside EOF */
+	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
+		end = size & ~PAGE_CACHE_MASK;
+	else
+		end = PAGE_CACHE_SIZE;
+
+	ret = __block_write_begin(page, 0, end, get_block);
+	if (!ret)
+		ret = block_commit_write(page, 0, end);
+
+	if (unlikely(ret < 0))
+		goto out_unlock;
+	set_page_dirty(page);
+	wait_for_stable_page(page);
+	return 0;
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		   get_block_t get_block)
+{
+	int ret;
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+	sb_start_pagefault(sb);
+
+	/*
+	 * Update file times before taking page lock. We may end up failing the
+	 * fault so this update may be superfluous but who really cares...
+	 */
+	file_update_time(vma->vm_file);
+
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	sb_end_pagefault(sb);
+	return block_page_mkwrite_return(ret);
+}
+EXPORT_SYMBOL(block_page_mkwrite);
+
+/*
+ * nobh_write_begin()'s prereads are special: the buffer_heads are freed
+ * immediately, while under the page lock.  So it needs a special end_io
+ * handler which does not touch the bh after unlocking it.
+ */
+static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
+{
+	__end_buffer_read_notouch(bh, uptodate);
+}
+
+/*
+ * Attach the singly-linked list of buffers created by nobh_write_begin, to
+ * the page (converting it to circular linked list and taking care of page
+ * dirty races).
+ */
+static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
+{
+	struct buffer_head *bh;
+
+	BUG_ON(!PageLocked(page));
+
+	spin_lock(&page->mapping->private_lock);
+	bh = head;
+	do {
+		if (PageDirty(page))
+			set_buffer_dirty(bh);
+		if (!bh->b_this_page)
+			bh->b_this_page = head;
+		bh = bh->b_this_page;
+	} while (bh != head);
+	attach_page_buffers(page, head);
+	spin_unlock(&page->mapping->private_lock);
+}
+
+/*
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
+ * The filesystem needs to handle block truncation upon failure.
+ */
+int nobh_write_begin(struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block)
+{
+	struct inode *inode = mapping->host;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	struct buffer_head *head, *bh;
+	struct page *page;
+	pgoff_t index;
+	unsigned from, to;
+	unsigned block_in_page;
+	unsigned block_start, block_end;
+	sector_t block_in_file;
+	int nr_reads = 0;
+	int ret = 0;
+	int is_mapped_to_disk = 1;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+	*fsdata = NULL;
+
+	if (page_has_buffers(page)) {
+		ret = __block_write_begin(page, pos, len, get_block);
+		if (unlikely(ret))
+			goto out_release;
+		return ret;
+	}
+
+	if (PageMappedToDisk(page))
+		return 0;
+
+	/*
+	 * Allocate buffers so that we can keep track of state, and potentially
+	 * attach them to the page if an error occurs. In the common case of
+	 * no error, they will just be freed again without ever being attached
+	 * to the page (which is all OK, because we're under the page lock).
+	 *
+	 * Be careful: the buffer linked list is a NULL terminated one, rather
+	 * than the circular one we're used to.
+	 */
+	head = alloc_page_buffers(page, blocksize, 0);
+	if (!head) {
+		ret = -ENOMEM;
+		goto out_release;
+	}
+
+	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+
+	/*
+	 * We loop across all blocks in the page, whether or not they are
+	 * part of the affected region.  This is so we can discover if the
+	 * page is fully mapped-to-disk.
+	 */
+	for (block_start = 0, block_in_page = 0, bh = head;
+		  block_start < PAGE_CACHE_SIZE;
+		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
+		int create;
+
+		block_end = block_start + blocksize;
+		bh->b_state = 0;
+		create = 1;
+		if (block_start >= to)
+			create = 0;
+		ret = get_block(inode, block_in_file + block_in_page,
+					bh, create);
+		if (ret)
+			goto failed;
+		if (!buffer_mapped(bh))
+			is_mapped_to_disk = 0;
+		if (buffer_new(bh))
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		if (PageUptodate(page)) {
+			set_buffer_uptodate(bh);
+			continue;
+		}
+		if (buffer_new(bh) || !buffer_mapped(bh)) {
+			zero_user_segments(page, block_start, from,
+							to, block_end);
+			continue;
+		}
+		if (buffer_uptodate(bh))
+			continue;	/* reiserfs does this */
+		if (block_start < from || block_end > to) {
+			lock_buffer(bh);
+			bh->b_end_io = end_buffer_read_nobh;
+			submit_bh(READ, bh);
+			nr_reads++;
+		}
+	}
+
+	if (nr_reads) {
+		/*
+		 * The page is locked, so these buffers are protected from
+		 * any VM or truncate activity.  Hence we don't need to care
+		 * for the buffer_head refcounts.
+		 */
+		for (bh = head; bh; bh = bh->b_this_page) {
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				ret = -EIO;
+		}
+		if (ret)
+			goto failed;
+	}
+
+	if (is_mapped_to_disk)
+		SetPageMappedToDisk(page);
+
+	*fsdata = head; /* to be released by nobh_write_end */
+
+	return 0;
+
+failed:
+	BUG_ON(!ret);
+	/*
+	 * Error recovery is a bit difficult. We need to zero out blocks that
+	 * were newly allocated, and dirty them to ensure they get written out.
+	 * Buffers need to be attached to the page at this point, otherwise
+	 * the handling of potential IO errors during writeout would be hard
+	 * (could try doing synchronous writeout, but what if that fails too?)
+	 */
+	attach_nobh_buffers(page, head);
+	page_zero_new_buffers(page, from, to);
+
+out_release:
+	unlock_page(page);
+	page_cache_release(page);
+	*pagep = NULL;
+
+	return ret;
+}
+EXPORT_SYMBOL(nobh_write_begin);
+
+int nobh_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *head = fsdata;
+	struct buffer_head *bh;
+	BUG_ON(fsdata != NULL && page_has_buffers(page));
+
+	if (unlikely(copied < len) && head)
+		attach_nobh_buffers(page, head);
+	if (page_has_buffers(page))
+		return generic_write_end(file, mapping, pos, len,
+					copied, page, fsdata);
+
+	SetPageUptodate(page);
+	set_page_dirty(page);
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		mark_inode_dirty(inode);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	while (head) {
+		bh = head;
+		head = head->b_this_page;
+		free_buffer_head(bh);
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL(nobh_write_end);
+
+/*
+ * nobh_writepage() - based on block_full_write_page() except
+ * that it tries to operate without attaching bufferheads to
+ * the page.
+ */
+int nobh_writepage(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc)
+{
+	struct inode * const inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	int ret;
+
+	/* Is the page fully inside i_size? */
+	if (page->index < end_index)
+		goto out;
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (page->index >= end_index+1 || !offset) {
+		/*
+		 * The page may have dirty, unmapped buffers.  For example,
+		 * they may have been added in ext3_writepage().  Make them
+		 * freeable here, so the page does not leak.
+		 */
+#if 0
+		/* Not really sure about this  - do we need this ? */
+		if (page->mapping->a_ops->invalidatepage)
+			page->mapping->a_ops->invalidatepage(page, offset);
+#endif
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the  page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+out:
+	ret = mpage_writepage(page, get_block, wbc);
+	if (ret == -EAGAIN)
+		ret = __block_write_full_page(inode, page, get_block, wbc,
+					      end_buffer_async_write);
+	return ret;
+}
+EXPORT_SYMBOL(nobh_writepage);
+
+int nobh_truncate_page(struct address_space *mapping,
+			loff_t from, get_block_t *get_block)
+{
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize;
+	sector_t iblock;
+	unsigned length, pos;
+	struct inode *inode = mapping->host;
+	struct page *page;
+	struct buffer_head map_bh;
+	int err;
+
+	blocksize = 1 << inode->i_blkbits;
+	length = offset & (blocksize - 1);
+
+	/* Block boundary? Nothing to do */
+	if (!length)
+		return 0;
+
+	length = blocksize - length;
+	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	page = grab_cache_page(mapping, index);
+	err = -ENOMEM;
+	if (!page)
+		goto out;
+
+	if (page_has_buffers(page)) {
+has_buffers:
+		unlock_page(page);
+		page_cache_release(page);
+		return block_truncate_page(mapping, from, get_block);
+	}
+
+	/* Find the buffer that contains "offset" */
+	pos = blocksize;
+	while (offset >= pos) {
+		iblock++;
+		pos += blocksize;
+	}
+
+	map_bh.b_size = blocksize;
+	map_bh.b_state = 0;
+	err = get_block(inode, iblock, &map_bh, 0);
+	if (err)
+		goto unlock;
+	/* unmapped? It's a hole - nothing to do */
+	if (!buffer_mapped(&map_bh))
+		goto unlock;
+
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (!PageUptodate(page)) {
+		err = mapping->a_ops->readpage(NULL, page);
+		if (err) {
+			page_cache_release(page);
+			goto out;
+		}
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			err = -EIO;
+			goto unlock;
+		}
+		if (page_has_buffers(page))
+			goto has_buffers;
+	}
+	zero_user(page, offset, length);
+	set_page_dirty(page);
+	err = 0;
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return err;
+}
+EXPORT_SYMBOL(nobh_truncate_page);
+
+int block_truncate_page(struct address_space *mapping,
+			loff_t from, get_block_t *get_block)
+{
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize;
+	sector_t iblock;
+	unsigned length, pos;
+	struct inode *inode = mapping->host;
+	struct page *page;
+	struct buffer_head *bh;
+	int err;
+
+	blocksize = 1 << inode->i_blkbits;
+	length = offset & (blocksize - 1);
+
+	/* Block boundary? Nothing to do */
+	if (!length)
+		return 0;
+
+	length = blocksize - length;
+	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	
+	page = grab_cache_page(mapping, index);
+	err = -ENOMEM;
+	if (!page)
+		goto out;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	err = 0;
+	if (!buffer_mapped(bh)) {
+		WARN_ON(bh->b_size != blocksize);
+		err = get_block(inode, iblock, bh, 0);
+		if (err)
+			goto unlock;
+		/* unmapped? It's a hole - nothing to do */
+		if (!buffer_mapped(bh))
+			goto unlock;
+	}
+
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
+		err = -EIO;
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (!buffer_uptodate(bh))
+			goto unlock;
+	}
+
+	zero_user(page, offset, length);
+	mark_buffer_dirty(bh);
+	err = 0;
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return err;
+}
+EXPORT_SYMBOL(block_truncate_page);
+
+/*
+ * The generic ->writepage function for buffer-backed address_spaces
+ */
+int block_write_full_page(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc)
+{
+	struct inode * const inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+
+	/* Is the page fully inside i_size? */
+	if (page->index < end_index)
+		return __block_write_full_page(inode, page, get_block, wbc,
+					       end_buffer_async_write);
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (page->index >= end_index+1 || !offset) {
+		/*
+		 * The page may have dirty, unmapped buffers.  For example,
+		 * they may have been added in ext3_writepage().  Make them
+		 * freeable here, so the page does not leak.
+		 */
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the  page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	return __block_write_full_page(inode, page, get_block, wbc,
+							end_buffer_async_write);
+}
+EXPORT_SYMBOL(block_write_full_page);
+
+sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
+			    get_block_t *get_block)
+{
+	struct buffer_head tmp;
+	struct inode *inode = mapping->host;
+	tmp.b_state = 0;
+	tmp.b_blocknr = 0;
+	tmp.b_size = 1 << inode->i_blkbits;
+	get_block(inode, block, &tmp, 0);
+	return tmp.b_blocknr;
+}
+EXPORT_SYMBOL(generic_block_bmap);
+
+static void end_bio_bh_io_sync(struct bio *bio, int err)
+{
+	struct buffer_head *bh = bio->bi_private;
+
+	if (err == -EOPNOTSUPP) {
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+	}
+
+	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+		set_bit(BH_Quiet, &bh->b_state);
+
+	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
+	bio_put(bio);
+}
+
+/*
+ * This allows us to do IO even on the odd last sectors
+ * of a device, even if the block size is some multiple
+ * of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device,
+ * and clear the end of the buffer head manually.
+ *
+ * Truly out-of-range accesses will turn into actual IO
+ * errors, this only handles the "we need to be able to
+ * do IO at the final sector" case.
+ */
+void guard_bio_eod(int rw, struct bio *bio)
+{
+	sector_t maxsector;
+	struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
+	unsigned truncated_bytes;
+
+	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+	if (!maxsector)
+		return;
+
+	/*
+	 * If the *whole* IO is past the end of the device,
+	 * let it through, and the IO layer will turn it into
+	 * an EIO.
+	 */
+	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
+		return;
+
+	maxsector -= bio->bi_iter.bi_sector;
+	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
+		return;
+
+	/* Uhhuh. We've got a bio that straddles the device size! */
+	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
+
+	/* Truncate the bio.. */
+	bio->bi_iter.bi_size -= truncated_bytes;
+	bvec->bv_len -= truncated_bytes;
+
+	/* ..and clear the end of the buffer for reads */
+	if ((rw & RW_MASK) == READ) {
+		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+				truncated_bytes);
+	}
+}
+
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	BUG_ON(!buffer_locked(bh));
+	BUG_ON(!buffer_mapped(bh));
+	BUG_ON(!bh->b_end_io);
+	BUG_ON(buffer_delay(bh));
+	BUG_ON(buffer_unwritten(bh));
+
+	/*
+	 * Only clear out a write error when rewriting
+	 */
+	if (test_set_buffer_req(bh) && (rw & WRITE))
+		clear_buffer_write_io_error(bh);
+
+	/*
+	 * from here on down, it's all bio -- do the initial mapping,
+	 * submit_bio -> generic_make_request may further map this bio around
+	 */
+	bio = bio_alloc(GFP_NOIO, 1);
+
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	bio->bi_io_vec[0].bv_page = bh->b_page;
+	bio->bi_io_vec[0].bv_len = bh->b_size;
+	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
+
+	bio->bi_vcnt = 1;
+	bio->bi_iter.bi_size = bh->b_size;
+
+	bio->bi_end_io = end_bio_bh_io_sync;
+	bio->bi_private = bh;
+	bio->bi_flags |= bio_flags;
+
+	/* Take care of bh's that straddle the end of the device */
+	guard_bio_eod(rw, bio);
+
+	if (buffer_meta(bh))
+		rw |= REQ_META;
+	if (buffer_prio(bh))
+		rw |= REQ_PRIO;
+
+	bio_get(bio);
+	submit_bio(rw, bio);
+
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+
+	bio_put(bio);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(_submit_bh);
+
+int submit_bh(int rw, struct buffer_head *bh)
+{
+	return _submit_bh(rw, bh, 0);
+}
+EXPORT_SYMBOL(submit_bh);
+
+/**
+ * ll_rw_block: low-level access to block devices (DEPRECATED)
+ * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
+ * @nr: number of &struct buffer_heads in the array
+ * @bhs: array of pointers to &struct buffer_head
+ *
+ * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
+ * requests an I/O operation on them, either a %READ or a %WRITE.  The third
+ * %READA option is described in the documentation for generic_make_request()
+ * which ll_rw_block() calls.
+ *
+ * This function drops any buffer that it cannot get a lock on (with the
+ * BH_Lock state bit), any buffer that appears to be clean when doing a write
+ * request, and any buffer that appears to be up-to-date when doing read
+ * request.  Further it marks as clean buffers that are processed for
+ * writing (the buffer cache won't assume that they are actually clean
+ * until the buffer gets unlocked).
+ *
+ * ll_rw_block sets b_end_io to simple completion handler that marks
+ * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
+ * any waiters. 
+ *
+ * All of the buffers must be for the same device, and must also be a
+ * multiple of the current approved size for the device.
+ */
+void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
+{
+	int i;
+
+	for (i = 0; i < nr; i++) {
+		struct buffer_head *bh = bhs[i];
+
+		if (!trylock_buffer(bh))
+			continue;
+		if (rw == WRITE) {
+			if (test_clear_buffer_dirty(bh)) {
+				bh->b_end_io = end_buffer_write_sync;
+				get_bh(bh);
+				submit_bh(WRITE, bh);
+				continue;
+			}
+		} else {
+			if (!buffer_uptodate(bh)) {
+				bh->b_end_io = end_buffer_read_sync;
+				get_bh(bh);
+				submit_bh(rw, bh);
+				continue;
+			}
+		}
+		unlock_buffer(bh);
+	}
+}
+EXPORT_SYMBOL(ll_rw_block);
+
+void write_dirty_buffer(struct buffer_head *bh, int rw)
+{
+	lock_buffer(bh);
+	if (!test_clear_buffer_dirty(bh)) {
+		unlock_buffer(bh);
+		return;
+	}
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(rw, bh);
+}
+EXPORT_SYMBOL(write_dirty_buffer);
+
+/*
+ * For a data-integrity writeout, we need to wait upon any in-progress I/O
+ * and then start new I/O and then wait upon it.  The caller must have a ref on
+ * the buffer_head.
+ */
+int __sync_dirty_buffer(struct buffer_head *bh, int rw)
+{
+	int ret = 0;
+
+	WARN_ON(atomic_read(&bh->b_count) < 1);
+	lock_buffer(bh);
+	if (test_clear_buffer_dirty(bh)) {
+		get_bh(bh);
+		bh->b_end_io = end_buffer_write_sync;
+		ret = submit_bh(rw, bh);
+		wait_on_buffer(bh);
+		if (!ret && !buffer_uptodate(bh))
+			ret = -EIO;
+	} else {
+		unlock_buffer(bh);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__sync_dirty_buffer);
+
+int sync_dirty_buffer(struct buffer_head *bh)
+{
+	return __sync_dirty_buffer(bh, WRITE_SYNC);
+}
+EXPORT_SYMBOL(sync_dirty_buffer);
+
+/*
+ * try_to_free_buffers() checks if all the buffers on this particular page
+ * are unused, and releases them if so.
+ *
+ * Exclusion against try_to_free_buffers may be obtained by either
+ * locking the page or by holding its mapping's private_lock.
+ *
+ * If the page is dirty but all the buffers are clean then we need to
+ * be sure to mark the page clean as well.  This is because the page
+ * may be against a block device, and a later reattachment of buffers
+ * to a dirty page will set *all* buffers dirty.  Which would corrupt
+ * filesystem data on the same device.
+ *
+ * The same applies to regular filesystem pages: if all the buffers are
+ * clean then we set the page clean and proceed.  To do that, we require
+ * total exclusion from __set_page_dirty_buffers().  That is obtained with
+ * private_lock.
+ *
+ * try_to_free_buffers() is non-blocking.
+ */
+static inline int buffer_busy(struct buffer_head *bh)
+{
+	return atomic_read(&bh->b_count) |
+		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
+}
+
+static int
+drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
+{
+	struct buffer_head *head = page_buffers(page);
+	struct buffer_head *bh;
+
+	bh = head;
+	do {
+		if (buffer_write_io_error(bh) && page->mapping)
+			set_bit(AS_EIO, &page->mapping->flags);
+		if (buffer_busy(bh))
+			goto failed;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	do {
+		struct buffer_head *next = bh->b_this_page;
+
+		if (bh->b_assoc_map)
+			__remove_assoc_queue(bh);
+		bh = next;
+	} while (bh != head);
+	*buffers_to_free = head;
+	__clear_page_buffers(page);
+	return 1;
+failed:
+	return 0;
+}
+
+int try_to_free_buffers(struct page *page)
+{
+	struct address_space * const mapping = page->mapping;
+	struct buffer_head *buffers_to_free = NULL;
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+	if (PageWriteback(page))
+		return 0;
+
+	if (mapping == NULL) {		/* can this still happen? */
+		ret = drop_buffers(page, &buffers_to_free);
+		goto out;
+	}
+
+	spin_lock(&mapping->private_lock);
+	ret = drop_buffers(page, &buffers_to_free);
+
+	/*
+	 * If the filesystem writes its buffers by hand (eg ext3)
+	 * then we can have clean buffers against a dirty page.  We
+	 * clean the page here; otherwise the VM will never notice
+	 * that the filesystem did any IO at all.
+	 *
+	 * Also, during truncate, discard_buffer will have marked all
+	 * the page's buffers clean.  We discover that here and clean
+	 * the page also.
+	 *
+	 * private_lock must be held over this entire operation in order
+	 * to synchronise against __set_page_dirty_buffers and prevent the
+	 * dirty bit from being lost.
+	 */
+	if (ret && TestClearPageDirty(page))
+		account_page_cleaned(page, mapping);
+	spin_unlock(&mapping->private_lock);
+out:
+	if (buffers_to_free) {
+		struct buffer_head *bh = buffers_to_free;
+
+		do {
+			struct buffer_head *next = bh->b_this_page;
+			free_buffer_head(bh);
+			bh = next;
+		} while (bh != buffers_to_free);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(try_to_free_buffers);
+
+/*
+ * There are no bdflush tunables left.  But distributions are
+ * still running obsolete flush daemons, so we terminate them here.
+ *
+ * Use of bdflush() is deprecated and will be removed in a future kernel.
+ * The `flush-X' kernel threads fully replace bdflush daemons and this call.
+ */
+SYSCALL_DEFINE2(bdflush, int, func, long, data)
+{
+	static int msg_count;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (msg_count < 5) {
+		msg_count++;
+		printk(KERN_INFO
+			"warning: process `%s' used the obsolete bdflush"
+			" system call\n", current->comm);
+		printk(KERN_INFO "Fix your initscripts?\n");
+	}
+
+	if (func == 1)
+		do_exit(0);
+	return 0;
+}
+
+/*
+ * Buffer-head allocation
+ */
+static struct kmem_cache *bh_cachep __read_mostly;
+
+/*
+ * Once the number of bh's in the machine exceeds this level, we start
+ * stripping them in writeback.
+ */
+static unsigned long max_buffer_heads;
+
+int buffer_heads_over_limit;
+
+struct bh_accounting {
+	int nr;			/* Number of live bh's */
+	int ratelimit;		/* Limit cacheline bouncing */
+};
+
+static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
+
+static void recalc_bh_state(void)
+{
+	int i;
+	int tot = 0;
+
+	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
+		return;
+	__this_cpu_write(bh_accounting.ratelimit, 0);
+	for_each_online_cpu(i)
+		tot += per_cpu(bh_accounting, i).nr;
+	buffer_heads_over_limit = (tot > max_buffer_heads);
+}
+
+struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
+{
+	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
+	if (ret) {
+		INIT_LIST_HEAD(&ret->b_assoc_buffers);
+		buffer_head_init_locks(ret);
+		preempt_disable();
+		__this_cpu_inc(bh_accounting.nr);
+		recalc_bh_state();
+		preempt_enable();
+	}
+	return ret;
+}
+EXPORT_SYMBOL(alloc_buffer_head);
+
+void free_buffer_head(struct buffer_head *bh)
+{
+	BUG_ON(!list_empty(&bh->b_assoc_buffers));
+	kmem_cache_free(bh_cachep, bh);
+	preempt_disable();
+	__this_cpu_dec(bh_accounting.nr);
+	recalc_bh_state();
+	preempt_enable();
+}
+EXPORT_SYMBOL(free_buffer_head);
+
+static void buffer_exit_cpu(int cpu)
+{
+	int i;
+	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
+
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		brelse(b->bhs[i]);
+		b->bhs[i] = NULL;
+	}
+	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
+	per_cpu(bh_accounting, cpu).nr = 0;
+}
+
+static int buffer_cpu_notify(struct notifier_block *self,
+			      unsigned long action, void *hcpu)
+{
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+		buffer_exit_cpu((unsigned long)hcpu);
+	return NOTIFY_OK;
+}
+
+/**
+ * bh_uptodate_or_lock - Test whether the buffer is uptodate
+ * @bh: struct buffer_head
+ *
+ * Return true if the buffer is up-to-date and false,
+ * with the buffer locked, if not.
+ */
+int bh_uptodate_or_lock(struct buffer_head *bh)
+{
+	if (!buffer_uptodate(bh)) {
+		lock_buffer(bh);
+		if (!buffer_uptodate(bh))
+			return 0;
+		unlock_buffer(bh);
+	}
+	return 1;
+}
+EXPORT_SYMBOL(bh_uptodate_or_lock);
+
+/**
+ * bh_submit_read - Submit a locked buffer for reading
+ * @bh: struct buffer_head
+ *
+ * Returns zero on success and -EIO on error.
+ */
+int bh_submit_read(struct buffer_head *bh)
+{
+	BUG_ON(!buffer_locked(bh));
+
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		return 0;
+	}
+
+	get_bh(bh);
+	bh->b_end_io = end_buffer_read_sync;
+	submit_bh(READ, bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return 0;
+	return -EIO;
+}
+EXPORT_SYMBOL(bh_submit_read);
+
+void __init buffer_init(void)
+{
+	unsigned long nrpages;
+
+	bh_cachep = kmem_cache_create("buffer_head",
+			sizeof(struct buffer_head), 0,
+				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
+				SLAB_MEM_SPREAD),
+				NULL);
+
+	/*
+	 * Limit the bh occupancy to 10% of ZONE_NORMAL
+	 */
+	nrpages = (nr_free_buffer_pages() * 10) / 100;
+	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
+	hotcpu_notifier(buffer_cpu_notify, 0);
+}
diff --git a/kernel/fs/cachefiles/Kconfig b/kernel/fs/cachefiles/Kconfig
new file mode 100644
index 000000000..80e9c6167
--- /dev/null
+++ b/kernel/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
+
+config CACHEFILES
+	tristate "Filesystem caching on files"
+	depends on FSCACHE && BLOCK
+	help
+	  This permits use of a mounted filesystem as a cache for other
+	  filesystems - primarily networking filesystems - thus allowing fast
+	  local disk to enhance the speed of slower devices.
+
+	  See Documentation/filesystems/caching/cachefiles.txt for more
+	  information.
+
+config CACHEFILES_DEBUG
+	bool "Debug CacheFiles"
+	depends on CACHEFILES
+	help
+	  This permits debugging to be dynamically enabled in the filesystem
+	  caching on files module.  If this is set, the debugging output may be
+	  enabled by setting bits in /sys/modules/cachefiles/parameter/debug or
+	  by including a debugging specifier in /etc/cachefilesd.conf.
+
+config CACHEFILES_HISTOGRAM
+	bool "Gather latency information on CacheFiles"
+	depends on CACHEFILES && PROC_FS
+	help
+
+	  This option causes latency information to be gathered on CacheFiles
+	  operation and exported through file:
+
+		/proc/fs/cachefiles/histogram
+
+	  The generation of this histogram adds a certain amount of overhead to
+	  execution as there are a number of points at which data is gathered,
+	  and on a multi-CPU system these may be on cachelines that keep
+	  bouncing between CPUs.  On the other hand, the histogram may be
+	  useful for debugging purposes.  Saying 'N' here is recommended.
+
+	  See Documentation/filesystems/caching/cachefiles.txt for more
+	  information.
diff --git a/kernel/fs/cachefiles/Makefile b/kernel/fs/cachefiles/Makefile
new file mode 100644
index 000000000..32cbab0ff
--- /dev/null
+++ b/kernel/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
+#
+# Makefile for caching in a mounted filesystem
+#
+
+cachefiles-y := \
+	bind.o \
+	daemon.o \
+	interface.o \
+	key.o \
+	main.o \
+	namei.o \
+	rdwr.o \
+	security.o \
+	xattr.o
+
+cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
+
+obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/kernel/fs/cachefiles/bind.c b/kernel/fs/cachefiles/bind.c
new file mode 100644
index 000000000..6af790fc3
--- /dev/null
+++ b/kernel/fs/cachefiles/bind.c
@@ -0,0 +1,277 @@
+/* Bind and unbind a cache from the filesystem backing it
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/ctype.h>
+#include "internal.h"
+
+static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);
+
+/*
+ * bind a directory as a cache
+ */
+int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
+{
+	_enter("{%u,%u,%u,%u,%u,%u},%s",
+	       cache->frun_percent,
+	       cache->fcull_percent,
+	       cache->fstop_percent,
+	       cache->brun_percent,
+	       cache->bcull_percent,
+	       cache->bstop_percent,
+	       args);
+
+	/* start by checking things over */
+	ASSERT(cache->fstop_percent >= 0 &&
+	       cache->fstop_percent < cache->fcull_percent &&
+	       cache->fcull_percent < cache->frun_percent &&
+	       cache->frun_percent  < 100);
+
+	ASSERT(cache->bstop_percent >= 0 &&
+	       cache->bstop_percent < cache->bcull_percent &&
+	       cache->bcull_percent < cache->brun_percent &&
+	       cache->brun_percent  < 100);
+
+	if (*args) {
+		pr_err("'bind' command doesn't take an argument\n");
+		return -EINVAL;
+	}
+
+	if (!cache->rootdirname) {
+		pr_err("No cache directory specified\n");
+		return -EINVAL;
+	}
+
+	/* don't permit already bound caches to be re-bound */
+	if (test_bit(CACHEFILES_READY, &cache->flags)) {
+		pr_err("Cache already bound\n");
+		return -EBUSY;
+	}
+
+	/* make sure we have copies of the tag and dirname strings */
+	if (!cache->tag) {
+		/* the tag string is released by the fops->release()
+		 * function, so we don't release it on error here */
+		cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
+		if (!cache->tag)
+			return -ENOMEM;
+	}
+
+	/* add the cache */
+	return cachefiles_daemon_add_cache(cache);
+}
+
+/*
+ * add a cache
+ */
+static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
+{
+	struct cachefiles_object *fsdef;
+	struct path path;
+	struct kstatfs stats;
+	struct dentry *graveyard, *cachedir, *root;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter("");
+
+	/* we want to work under the module's security ID */
+	ret = cachefiles_get_security_ID(cache);
+	if (ret < 0)
+		return ret;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+
+	/* allocate the root index object */
+	ret = -ENOMEM;
+
+	fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+	if (!fsdef)
+		goto error_root_object;
+
+	ASSERTCMP(fsdef->backer, ==, NULL);
+
+	atomic_set(&fsdef->usage, 1);
+	fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
+
+	_debug("- fsdef %p", fsdef);
+
+	/* look up the directory at the root of the cache */
+	ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
+	if (ret < 0)
+		goto error_open_root;
+
+	cache->mnt = path.mnt;
+	root = path.dentry;
+
+	/* check parameters */
+	ret = -EOPNOTSUPP;
+	if (d_is_negative(root) ||
+	    !d_backing_inode(root)->i_op->lookup ||
+	    !d_backing_inode(root)->i_op->mkdir ||
+	    !d_backing_inode(root)->i_op->setxattr ||
+	    !d_backing_inode(root)->i_op->getxattr ||
+	    !root->d_sb->s_op->statfs ||
+	    !root->d_sb->s_op->sync_fs)
+		goto error_unsupported;
+
+	ret = -EROFS;
+	if (root->d_sb->s_flags & MS_RDONLY)
+		goto error_unsupported;
+
+	/* determine the security of the on-disk cache as this governs
+	 * security ID of files we create */
+	ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
+	if (ret < 0)
+		goto error_unsupported;
+
+	/* get the cache size and blocksize */
+	ret = vfs_statfs(&path, &stats);
+	if (ret < 0)
+		goto error_unsupported;
+
+	ret = -ERANGE;
+	if (stats.f_bsize <= 0)
+		goto error_unsupported;
+
+	ret = -EOPNOTSUPP;
+	if (stats.f_bsize > PAGE_SIZE)
+		goto error_unsupported;
+
+	cache->bsize = stats.f_bsize;
+	cache->bshift = 0;
+	if (stats.f_bsize < PAGE_SIZE)
+		cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
+
+	_debug("blksize %u (shift %u)",
+	       cache->bsize, cache->bshift);
+
+	_debug("size %llu, avail %llu",
+	       (unsigned long long) stats.f_blocks,
+	       (unsigned long long) stats.f_bavail);
+
+	/* set up caching limits */
+	do_div(stats.f_files, 100);
+	cache->fstop = stats.f_files * cache->fstop_percent;
+	cache->fcull = stats.f_files * cache->fcull_percent;
+	cache->frun  = stats.f_files * cache->frun_percent;
+
+	_debug("limits {%llu,%llu,%llu} files",
+	       (unsigned long long) cache->frun,
+	       (unsigned long long) cache->fcull,
+	       (unsigned long long) cache->fstop);
+
+	stats.f_blocks >>= cache->bshift;
+	do_div(stats.f_blocks, 100);
+	cache->bstop = stats.f_blocks * cache->bstop_percent;
+	cache->bcull = stats.f_blocks * cache->bcull_percent;
+	cache->brun  = stats.f_blocks * cache->brun_percent;
+
+	_debug("limits {%llu,%llu,%llu} blocks",
+	       (unsigned long long) cache->brun,
+	       (unsigned long long) cache->bcull,
+	       (unsigned long long) cache->bstop);
+
+	/* get the cache directory and check its type */
+	cachedir = cachefiles_get_directory(cache, root, "cache");
+	if (IS_ERR(cachedir)) {
+		ret = PTR_ERR(cachedir);
+		goto error_unsupported;
+	}
+
+	fsdef->dentry = cachedir;
+	fsdef->fscache.cookie = NULL;
+
+	ret = cachefiles_check_object_type(fsdef);
+	if (ret < 0)
+		goto error_unsupported;
+
+	/* get the graveyard directory */
+	graveyard = cachefiles_get_directory(cache, root, "graveyard");
+	if (IS_ERR(graveyard)) {
+		ret = PTR_ERR(graveyard);
+		goto error_unsupported;
+	}
+
+	cache->graveyard = graveyard;
+
+	/* publish the cache */
+	fscache_init_cache(&cache->cache,
+			   &cachefiles_cache_ops,
+			   "%s",
+			   fsdef->dentry->d_sb->s_id);
+
+	fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
+
+	ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
+	if (ret < 0)
+		goto error_add_cache;
+
+	/* done */
+	set_bit(CACHEFILES_READY, &cache->flags);
+	dput(root);
+
+	pr_info("File cache on %s registered\n", cache->cache.identifier);
+
+	/* check how much space the cache has */
+	cachefiles_has_space(cache, 0, 0);
+	cachefiles_end_secure(cache, saved_cred);
+	return 0;
+
+error_add_cache:
+	dput(cache->graveyard);
+	cache->graveyard = NULL;
+error_unsupported:
+	mntput(cache->mnt);
+	cache->mnt = NULL;
+	dput(fsdef->dentry);
+	fsdef->dentry = NULL;
+	dput(root);
+error_open_root:
+	kmem_cache_free(cachefiles_object_jar, fsdef);
+error_root_object:
+	cachefiles_end_secure(cache, saved_cred);
+	pr_err("Failed to register: %d\n", ret);
+	return ret;
+}
+
+/*
+ * unbind a cache on fd release
+ */
+void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
+{
+	_enter("");
+
+	if (test_bit(CACHEFILES_READY, &cache->flags)) {
+		pr_info("File cache on %s unregistering\n",
+			cache->cache.identifier);
+
+		fscache_withdraw_cache(&cache->cache);
+	}
+
+	dput(cache->graveyard);
+	mntput(cache->mnt);
+
+	kfree(cache->rootdirname);
+	kfree(cache->secctx);
+	kfree(cache->tag);
+
+	_leave("");
+}
diff --git a/kernel/fs/cachefiles/daemon.c b/kernel/fs/cachefiles/daemon.c
new file mode 100644
index 000000000..f601def05
--- /dev/null
+++ b/kernel/fs/cachefiles/daemon.c
@@ -0,0 +1,751 @@
+/* Daemon interface
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/fs_struct.h>
+#include "internal.h"
+
+static int cachefiles_daemon_open(struct inode *, struct file *);
+static int cachefiles_daemon_release(struct inode *, struct file *);
+static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
+				      loff_t *);
+static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
+				       size_t, loff_t *);
+static unsigned int cachefiles_daemon_poll(struct file *,
+					   struct poll_table_struct *);
+static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
+
+static unsigned long cachefiles_open;
+
+const struct file_operations cachefiles_daemon_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cachefiles_daemon_open,
+	.release	= cachefiles_daemon_release,
+	.read		= cachefiles_daemon_read,
+	.write		= cachefiles_daemon_write,
+	.poll		= cachefiles_daemon_poll,
+	.llseek		= noop_llseek,
+};
+
+struct cachefiles_daemon_cmd {
+	char name[8];
+	int (*handler)(struct cachefiles_cache *cache, char *args);
+};
+
+static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
+	{ "bind",	cachefiles_daemon_bind		},
+	{ "brun",	cachefiles_daemon_brun		},
+	{ "bcull",	cachefiles_daemon_bcull		},
+	{ "bstop",	cachefiles_daemon_bstop		},
+	{ "cull",	cachefiles_daemon_cull		},
+	{ "debug",	cachefiles_daemon_debug		},
+	{ "dir",	cachefiles_daemon_dir		},
+	{ "frun",	cachefiles_daemon_frun		},
+	{ "fcull",	cachefiles_daemon_fcull		},
+	{ "fstop",	cachefiles_daemon_fstop		},
+	{ "inuse",	cachefiles_daemon_inuse		},
+	{ "secctx",	cachefiles_daemon_secctx	},
+	{ "tag",	cachefiles_daemon_tag		},
+	{ "",		NULL				}
+};
+
+
+/*
+ * do various checks
+ */
+static int cachefiles_daemon_open(struct inode *inode, struct file *file)
+{
+	struct cachefiles_cache *cache;
+
+	_enter("");
+
+	/* only the superuser may do this */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* the cachefiles device may only be open once at a time */
+	if (xchg(&cachefiles_open, 1) == 1)
+		return -EBUSY;
+
+	/* allocate a cache record */
+	cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
+	if (!cache) {
+		cachefiles_open = 0;
+		return -ENOMEM;
+	}
+
+	mutex_init(&cache->daemon_mutex);
+	cache->active_nodes = RB_ROOT;
+	rwlock_init(&cache->active_lock);
+	init_waitqueue_head(&cache->daemon_pollwq);
+
+	/* set default caching limits
+	 * - limit at 1% free space and/or free files
+	 * - cull below 5% free space and/or free files
+	 * - cease culling above 7% free space and/or free files
+	 */
+	cache->frun_percent = 7;
+	cache->fcull_percent = 5;
+	cache->fstop_percent = 1;
+	cache->brun_percent = 7;
+	cache->bcull_percent = 5;
+	cache->bstop_percent = 1;
+
+	file->private_data = cache;
+	cache->cachefilesd = file;
+	return 0;
+}
+
+/*
+ * release a cache
+ */
+static int cachefiles_daemon_release(struct inode *inode, struct file *file)
+{
+	struct cachefiles_cache *cache = file->private_data;
+
+	_enter("");
+
+	ASSERT(cache);
+
+	set_bit(CACHEFILES_DEAD, &cache->flags);
+
+	cachefiles_daemon_unbind(cache);
+
+	ASSERT(!cache->active_nodes.rb_node);
+
+	/* clean up the control file interface */
+	cache->cachefilesd = NULL;
+	file->private_data = NULL;
+	cachefiles_open = 0;
+
+	kfree(cache);
+
+	_leave("");
+	return 0;
+}
+
+/*
+ * read the cache state
+ */
+static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
+				      size_t buflen, loff_t *pos)
+{
+	struct cachefiles_cache *cache = file->private_data;
+	char buffer[256];
+	int n;
+
+	//_enter(",,%zu,", buflen);
+
+	if (!test_bit(CACHEFILES_READY, &cache->flags))
+		return 0;
+
+	/* check how much space the cache has */
+	cachefiles_has_space(cache, 0, 0);
+
+	/* summarise */
+	clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
+
+	n = snprintf(buffer, sizeof(buffer),
+		     "cull=%c"
+		     " frun=%llx"
+		     " fcull=%llx"
+		     " fstop=%llx"
+		     " brun=%llx"
+		     " bcull=%llx"
+		     " bstop=%llx",
+		     test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
+		     (unsigned long long) cache->frun,
+		     (unsigned long long) cache->fcull,
+		     (unsigned long long) cache->fstop,
+		     (unsigned long long) cache->brun,
+		     (unsigned long long) cache->bcull,
+		     (unsigned long long) cache->bstop
+		     );
+
+	if (n > buflen)
+		return -EMSGSIZE;
+
+	if (copy_to_user(_buffer, buffer, n) != 0)
+		return -EFAULT;
+
+	return n;
+}
+
+/*
+ * command the cache
+ */
+static ssize_t cachefiles_daemon_write(struct file *file,
+				       const char __user *_data,
+				       size_t datalen,
+				       loff_t *pos)
+{
+	const struct cachefiles_daemon_cmd *cmd;
+	struct cachefiles_cache *cache = file->private_data;
+	ssize_t ret;
+	char *data, *args, *cp;
+
+	//_enter(",,%zu,", datalen);
+
+	ASSERT(cache);
+
+	if (test_bit(CACHEFILES_DEAD, &cache->flags))
+		return -EIO;
+
+	if (datalen < 0 || datalen > PAGE_SIZE - 1)
+		return -EOPNOTSUPP;
+
+	/* drag the command string into the kernel so we can parse it */
+	data = kmalloc(datalen + 1, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ret = -EFAULT;
+	if (copy_from_user(data, _data, datalen) != 0)
+		goto error;
+
+	data[datalen] = '\0';
+
+	ret = -EINVAL;
+	if (memchr(data, '\0', datalen))
+		goto error;
+
+	/* strip any newline */
+	cp = memchr(data, '\n', datalen);
+	if (cp) {
+		if (cp == data)
+			goto error;
+
+		*cp = '\0';
+	}
+
+	/* parse the command */
+	ret = -EOPNOTSUPP;
+
+	for (args = data; *args; args++)
+		if (isspace(*args))
+			break;
+	if (*args) {
+		if (args == data)
+			goto error;
+		*args = '\0';
+		args = skip_spaces(++args);
+	}
+
+	/* run the appropriate command handler */
+	for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
+		if (strcmp(cmd->name, data) == 0)
+			goto found_command;
+
+error:
+	kfree(data);
+	//_leave(" = %zd", ret);
+	return ret;
+
+found_command:
+	mutex_lock(&cache->daemon_mutex);
+
+	ret = -EIO;
+	if (!test_bit(CACHEFILES_DEAD, &cache->flags))
+		ret = cmd->handler(cache, args);
+
+	mutex_unlock(&cache->daemon_mutex);
+
+	if (ret == 0)
+		ret = datalen;
+	goto error;
+}
+
+/*
+ * poll for culling state
+ * - use POLLOUT to indicate culling state
+ */
+static unsigned int cachefiles_daemon_poll(struct file *file,
+					   struct poll_table_struct *poll)
+{
+	struct cachefiles_cache *cache = file->private_data;
+	unsigned int mask;
+
+	poll_wait(file, &cache->daemon_pollwq, poll);
+	mask = 0;
+
+	if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
+		mask |= POLLIN;
+
+	if (test_bit(CACHEFILES_CULLING, &cache->flags))
+		mask |= POLLOUT;
+
+	return mask;
+}
+
+/*
+ * give a range error for cache space constraints
+ * - can be tail-called
+ */
+static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
+					 char *args)
+{
+	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");
+
+	return -EINVAL;
+}
+
+/*
+ * set the percentage of files at which to stop culling
+ * - command: "frun <N>%"
+ */
+static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long frun;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	frun = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (frun <= cache->fcull_percent || frun >= 100)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->frun_percent = frun;
+	return 0;
+}
+
+/*
+ * set the percentage of files at which to start culling
+ * - command: "fcull <N>%"
+ */
+static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long fcull;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	fcull = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->fcull_percent = fcull;
+	return 0;
+}
+
+/*
+ * set the percentage of files at which to stop allocating
+ * - command: "fstop <N>%"
+ */
+static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long fstop;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	fstop = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (fstop < 0 || fstop >= cache->fcull_percent)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->fstop_percent = fstop;
+	return 0;
+}
+
+/*
+ * set the percentage of blocks at which to stop culling
+ * - command: "brun <N>%"
+ */
+static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long brun;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	brun = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (brun <= cache->bcull_percent || brun >= 100)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->brun_percent = brun;
+	return 0;
+}
+
+/*
+ * set the percentage of blocks at which to start culling
+ * - command: "bcull <N>%"
+ */
+static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long bcull;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	bcull = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->bcull_percent = bcull;
+	return 0;
+}
+
+/*
+ * set the percentage of blocks at which to stop allocating
+ * - command: "bstop <N>%"
+ */
+static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long bstop;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	bstop = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (bstop < 0 || bstop >= cache->bcull_percent)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->bstop_percent = bstop;
+	return 0;
+}
+
+/*
+ * set the cache directory
+ * - command: "dir <name>"
+ */
+static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
+{
+	char *dir;
+
+	_enter(",%s", args);
+
+	if (!*args) {
+		pr_err("Empty directory specified\n");
+		return -EINVAL;
+	}
+
+	if (cache->rootdirname) {
+		pr_err("Second cache directory specified\n");
+		return -EEXIST;
+	}
+
+	dir = kstrdup(args, GFP_KERNEL);
+	if (!dir)
+		return -ENOMEM;
+
+	cache->rootdirname = dir;
+	return 0;
+}
+
+/*
+ * set the cache security context
+ * - command: "secctx <ctx>"
+ */
+static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
+{
+	char *secctx;
+
+	_enter(",%s", args);
+
+	if (!*args) {
+		pr_err("Empty security context specified\n");
+		return -EINVAL;
+	}
+
+	if (cache->secctx) {
+		pr_err("Second security context specified\n");
+		return -EINVAL;
+	}
+
+	secctx = kstrdup(args, GFP_KERNEL);
+	if (!secctx)
+		return -ENOMEM;
+
+	cache->secctx = secctx;
+	return 0;
+}
+
+/*
+ * set the cache tag
+ * - command: "tag <name>"
+ */
+static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
+{
+	char *tag;
+
+	_enter(",%s", args);
+
+	if (!*args) {
+		pr_err("Empty tag specified\n");
+		return -EINVAL;
+	}
+
+	if (cache->tag)
+		return -EEXIST;
+
+	tag = kstrdup(args, GFP_KERNEL);
+	if (!tag)
+		return -ENOMEM;
+
+	cache->tag = tag;
+	return 0;
+}
+
+/*
+ * request a node in the cache be culled from the current working directory
+ * - command: "cull <name>"
+ */
+static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
+{
+	struct path path;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter(",%s", args);
+
+	if (strchr(args, '/'))
+		goto inval;
+
+	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
+		pr_err("cull applied to unready cache\n");
+		return -EIO;
+	}
+
+	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+		pr_err("cull applied to dead cache\n");
+		return -EIO;
+	}
+
+	/* extract the directory dentry from the cwd */
+	get_fs_pwd(current->fs, &path);
+
+	if (!d_can_lookup(path.dentry))
+		goto notdir;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	ret = cachefiles_cull(cache, path.dentry, args);
+	cachefiles_end_secure(cache, saved_cred);
+
+	path_put(&path);
+	_leave(" = %d", ret);
+	return ret;
+
+notdir:
+	path_put(&path);
+	pr_err("cull command requires dirfd to be a directory\n");
+	return -ENOTDIR;
+
+inval:
+	pr_err("cull command requires dirfd and filename\n");
+	return -EINVAL;
+}
+
+/*
+ * set debugging mode
+ * - command: "debug <mask>"
+ */
+static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long mask;
+
+	_enter(",%s", args);
+
+	mask = simple_strtoul(args, &args, 0);
+	if (args[0] != '\0')
+		goto inval;
+
+	cachefiles_debug = mask;
+	_leave(" = 0");
+	return 0;
+
+inval:
+	pr_err("debug command requires mask\n");
+	return -EINVAL;
+}
+
+/*
+ * find out whether an object in the current working directory is in use or not
+ * - command: "inuse <name>"
+ */
+static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
+{
+	struct path path;
+	const struct cred *saved_cred;
+	int ret;
+
+	//_enter(",%s", args);
+
+	if (strchr(args, '/'))
+		goto inval;
+
+	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
+		pr_err("inuse applied to unready cache\n");
+		return -EIO;
+	}
+
+	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+		pr_err("inuse applied to dead cache\n");
+		return -EIO;
+	}
+
+	/* extract the directory dentry from the cwd */
+	get_fs_pwd(current->fs, &path);
+
+	if (!d_can_lookup(path.dentry))
+		goto notdir;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	ret = cachefiles_check_in_use(cache, path.dentry, args);
+	cachefiles_end_secure(cache, saved_cred);
+
+	path_put(&path);
+	//_leave(" = %d", ret);
+	return ret;
+
+notdir:
+	path_put(&path);
+	pr_err("inuse command requires dirfd to be a directory\n");
+	return -ENOTDIR;
+
+inval:
+	pr_err("inuse command requires dirfd and filename\n");
+	return -EINVAL;
+}
+
+/*
+ * see if we have space for a number of pages and/or a number of files in the
+ * cache
+ */
+int cachefiles_has_space(struct cachefiles_cache *cache,
+			 unsigned fnr, unsigned bnr)
+{
+	struct kstatfs stats;
+	struct path path = {
+		.mnt	= cache->mnt,
+		.dentry	= cache->mnt->mnt_root,
+	};
+	int ret;
+
+	//_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
+	//       (unsigned long long) cache->frun,
+	//       (unsigned long long) cache->fcull,
+	//       (unsigned long long) cache->fstop,
+	//       (unsigned long long) cache->brun,
+	//       (unsigned long long) cache->bcull,
+	//       (unsigned long long) cache->bstop,
+	//       fnr, bnr);
+
+	/* find out how many pages of blockdev are available */
+	memset(&stats, 0, sizeof(stats));
+
+	ret = vfs_statfs(&path, &stats);
+	if (ret < 0) {
+		if (ret == -EIO)
+			cachefiles_io_error(cache, "statfs failed");
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	stats.f_bavail >>= cache->bshift;
+
+	//_debug("avail %llu,%llu",
+	//       (unsigned long long) stats.f_ffree,
+	//       (unsigned long long) stats.f_bavail);
+
+	/* see if there is sufficient space */
+	if (stats.f_ffree > fnr)
+		stats.f_ffree -= fnr;
+	else
+		stats.f_ffree = 0;
+
+	if (stats.f_bavail > bnr)
+		stats.f_bavail -= bnr;
+	else
+		stats.f_bavail = 0;
+
+	ret = -ENOBUFS;
+	if (stats.f_ffree < cache->fstop ||
+	    stats.f_bavail < cache->bstop)
+		goto begin_cull;
+
+	ret = 0;
+	if (stats.f_ffree < cache->fcull ||
+	    stats.f_bavail < cache->bcull)
+		goto begin_cull;
+
+	if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
+	    stats.f_ffree >= cache->frun &&
+	    stats.f_bavail >= cache->brun &&
+	    test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
+	    ) {
+		_debug("cease culling");
+		cachefiles_state_changed(cache);
+	}
+
+	//_leave(" = 0");
+	return 0;
+
+begin_cull:
+	if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
+		_debug("### CULL CACHE ###");
+		cachefiles_state_changed(cache);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/kernel/fs/cachefiles/interface.c b/kernel/fs/cachefiles/interface.c
new file mode 100644
index 000000000..afa023dde
--- /dev/null
+++ b/kernel/fs/cachefiles/interface.c
@@ -0,0 +1,557 @@
+/* FS-Cache interface to CacheFiles
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include "internal.h"
+
+struct cachefiles_lookup_data {
+	struct cachefiles_xattr	*auxdata;	/* auxiliary data */
+	char			*key;		/* key path */
+};
+
+static int cachefiles_attr_changed(struct fscache_object *_object);
+
+/*
+ * allocate an object record for a cookie lookup and prepare the lookup data
+ */
+static struct fscache_object *cachefiles_alloc_object(
+	struct fscache_cache *_cache,
+	struct fscache_cookie *cookie)
+{
+	struct cachefiles_lookup_data *lookup_data;
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct cachefiles_xattr *auxdata;
+	unsigned keylen, auxlen;
+	void *buffer;
+	char *key;
+
+	cache = container_of(_cache, struct cachefiles_cache, cache);
+
+	_enter("{%s},%p,", cache->cache.identifier, cookie);
+
+	lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
+	if (!lookup_data)
+		goto nomem_lookup_data;
+
+	/* create a new object record and a temporary leaf image */
+	object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);
+	if (!object)
+		goto nomem_object;
+
+	ASSERTCMP(object->backer, ==, NULL);
+
+	BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+	atomic_set(&object->usage, 1);
+
+	fscache_object_init(&object->fscache, cookie, &cache->cache);
+
+	object->type = cookie->def->type;
+
+	/* get hold of the raw key
+	 * - stick the length on the front and leave space on the back for the
+	 *   encoder
+	 */
+	buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);
+	if (!buffer)
+		goto nomem_buffer;
+
+	keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
+	ASSERTCMP(keylen, <, 512);
+
+	*(uint16_t *)buffer = keylen;
+	((char *)buffer)[keylen + 2] = 0;
+	((char *)buffer)[keylen + 3] = 0;
+	((char *)buffer)[keylen + 4] = 0;
+
+	/* turn the raw key into something that can work with as a filename */
+	key = cachefiles_cook_key(buffer, keylen + 2, object->type);
+	if (!key)
+		goto nomem_key;
+
+	/* get hold of the auxiliary data and prepend the object type */
+	auxdata = buffer;
+	auxlen = 0;
+	if (cookie->def->get_aux) {
+		auxlen = cookie->def->get_aux(cookie->netfs_data,
+					      auxdata->data, 511);
+		ASSERTCMP(auxlen, <, 511);
+	}
+
+	auxdata->len = auxlen + 1;
+	auxdata->type = cookie->def->type;
+
+	lookup_data->auxdata = auxdata;
+	lookup_data->key = key;
+	object->lookup_data = lookup_data;
+
+	_leave(" = %p [%p]", &object->fscache, lookup_data);
+	return &object->fscache;
+
+nomem_key:
+	kfree(buffer);
+nomem_buffer:
+	BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+	kmem_cache_free(cachefiles_object_jar, object);
+	fscache_object_destroyed(&cache->cache);
+nomem_object:
+	kfree(lookup_data);
+nomem_lookup_data:
+	_leave(" = -ENOMEM");
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * attempt to look up the nominated node in this cache
+ * - return -ETIMEDOUT to be scheduled again
+ */
+static int cachefiles_lookup_object(struct fscache_object *_object)
+{
+	struct cachefiles_lookup_data *lookup_data;
+	struct cachefiles_object *parent, *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter("{OBJ%x}", _object->debug_id);
+
+	cache = container_of(_object->cache, struct cachefiles_cache, cache);
+	parent = container_of(_object->parent,
+			      struct cachefiles_object, fscache);
+	object = container_of(_object, struct cachefiles_object, fscache);
+	lookup_data = object->lookup_data;
+
+	ASSERTCMP(lookup_data, !=, NULL);
+
+	/* look up the key, creating any missing bits */
+	cachefiles_begin_secure(cache, &saved_cred);
+	ret = cachefiles_walk_to_object(parent, object,
+					lookup_data->key,
+					lookup_data->auxdata);
+	cachefiles_end_secure(cache, saved_cred);
+
+	/* polish off by setting the attributes of non-index files */
+	if (ret == 0 &&
+	    object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
+		cachefiles_attr_changed(&object->fscache);
+
+	if (ret < 0 && ret != -ETIMEDOUT) {
+		if (ret != -ENOBUFS)
+			pr_warn("Lookup failed error %d\n", ret);
+		fscache_object_lookup_error(&object->fscache);
+	}
+
+	_leave(" [%d]", ret);
+	return ret;
+}
+
+/*
+ * indication of lookup completion
+ */
+static void cachefiles_lookup_complete(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+
+	_enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
+
+	if (object->lookup_data) {
+		kfree(object->lookup_data->key);
+		kfree(object->lookup_data->auxdata);
+		kfree(object->lookup_data);
+		object->lookup_data = NULL;
+	}
+}
+
+/*
+ * increment the usage count on an inode object (may fail if unmounting)
+ */
+static
+struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
+{
+	struct cachefiles_object *object =
+		container_of(_object, struct cachefiles_object, fscache);
+
+	_enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
+
+#ifdef CACHEFILES_DEBUG_SLAB
+	ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	atomic_inc(&object->usage);
+	return &object->fscache;
+}
+
+/*
+ * update the auxiliary data for an object object on disk
+ */
+static void cachefiles_update_object(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_xattr *auxdata;
+	struct cachefiles_cache *cache;
+	struct fscache_cookie *cookie;
+	const struct cred *saved_cred;
+	unsigned auxlen;
+
+	_enter("{OBJ%x}", _object->debug_id);
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache, struct cachefiles_cache,
+			     cache);
+
+	if (!fscache_use_cookie(_object)) {
+		_leave(" [relinq]");
+		return;
+	}
+
+	cookie = object->fscache.cookie;
+
+	if (!cookie->def->get_aux) {
+		fscache_unuse_cookie(_object);
+		_leave(" [no aux]");
+		return;
+	}
+
+	auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
+	if (!auxdata) {
+		fscache_unuse_cookie(_object);
+		_leave(" [nomem]");
+		return;
+	}
+
+	auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+	fscache_unuse_cookie(_object);
+	ASSERTCMP(auxlen, <, 511);
+
+	auxdata->len = auxlen + 1;
+	auxdata->type = cookie->def->type;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	cachefiles_update_object_xattr(object, auxdata);
+	cachefiles_end_secure(cache, saved_cred);
+	kfree(auxdata);
+	_leave("");
+}
+
+/*
+ * discard the resources pinned by an object and effect retirement if
+ * requested
+ */
+static void cachefiles_drop_object(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+
+	ASSERT(_object);
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+
+	_enter("{OBJ%x,%d}",
+	       object->fscache.debug_id, atomic_read(&object->usage));
+
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+#ifdef CACHEFILES_DEBUG_SLAB
+	ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	/* We need to tidy the object up if we did in fact manage to open it.
+	 * It's possible for us to get here before the object is fully
+	 * initialised if the parent goes away or the object gets retired
+	 * before we set it up.
+	 */
+	if (object->dentry) {
+		/* delete retired objects */
+		if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
+		    _object != cache->cache.fsdef
+		    ) {
+			_debug("- retire object OBJ%x", object->fscache.debug_id);
+			cachefiles_begin_secure(cache, &saved_cred);
+			cachefiles_delete_object(cache, object);
+			cachefiles_end_secure(cache, saved_cred);
+		}
+
+		/* close the filesystem stuff attached to the object */
+		if (object->backer != object->dentry)
+			dput(object->backer);
+		object->backer = NULL;
+	}
+
+	/* note that the object is now inactive */
+	if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
+		write_lock(&cache->active_lock);
+		if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
+					&object->flags))
+			BUG();
+		rb_erase(&object->active_node, &cache->active_nodes);
+		wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+		write_unlock(&cache->active_lock);
+	}
+
+	dput(object->dentry);
+	object->dentry = NULL;
+
+	_leave("");
+}
+
+/*
+ * dispose of a reference to an object
+ */
+static void cachefiles_put_object(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+	struct fscache_cache *cache;
+
+	ASSERT(_object);
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+
+	_enter("{OBJ%x,%d}",
+	       object->fscache.debug_id, atomic_read(&object->usage));
+
+#ifdef CACHEFILES_DEBUG_SLAB
+	ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	ASSERTIFCMP(object->fscache.parent,
+		    object->fscache.parent->n_children, >, 0);
+
+	if (atomic_dec_and_test(&object->usage)) {
+		_debug("- kill object OBJ%x", object->fscache.debug_id);
+
+		ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+		ASSERTCMP(object->fscache.parent, ==, NULL);
+		ASSERTCMP(object->backer, ==, NULL);
+		ASSERTCMP(object->dentry, ==, NULL);
+		ASSERTCMP(object->fscache.n_ops, ==, 0);
+		ASSERTCMP(object->fscache.n_children, ==, 0);
+
+		if (object->lookup_data) {
+			kfree(object->lookup_data->key);
+			kfree(object->lookup_data->auxdata);
+			kfree(object->lookup_data);
+			object->lookup_data = NULL;
+		}
+
+		cache = object->fscache.cache;
+		fscache_object_destroy(&object->fscache);
+		kmem_cache_free(cachefiles_object_jar, object);
+		fscache_object_destroyed(cache);
+	}
+
+	_leave("");
+}
+
+/*
+ * sync a cache
+ */
+static void cachefiles_sync_cache(struct fscache_cache *_cache)
+{
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter("%p", _cache);
+
+	cache = container_of(_cache, struct cachefiles_cache, cache);
+
+	/* make sure all pages pinned by operations on behalf of the netfs are
+	 * written to disc */
+	cachefiles_begin_secure(cache, &saved_cred);
+	down_read(&cache->mnt->mnt_sb->s_umount);
+	ret = sync_filesystem(cache->mnt->mnt_sb);
+	up_read(&cache->mnt->mnt_sb->s_umount);
+	cachefiles_end_secure(cache, saved_cred);
+
+	if (ret == -EIO)
+		cachefiles_io_error(cache,
+				    "Attempt to sync backing fs superblock"
+				    " returned error %d",
+				    ret);
+}
+
+/*
+ * check if the backing cache is updated to FS-Cache
+ * - called by FS-Cache when evaluates if need to invalidate the cache
+ */
+static bool cachefiles_check_consistency(struct fscache_operation *op)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter("{OBJ%x}", op->object->debug_id);
+
+	object = container_of(op->object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	ret = cachefiles_check_auxdata(object);
+	cachefiles_end_secure(cache, saved_cred);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * notification the attributes on an object have changed
+ * - called with reads/writes excluded by FS-Cache
+ */
+static int cachefiles_attr_changed(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	struct iattr newattrs;
+	uint64_t ni_size;
+	loff_t oi_size;
+	int ret;
+
+	_object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
+
+	_enter("{OBJ%x},[%llu]",
+	       _object->debug_id, (unsigned long long) ni_size);
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	if (ni_size == object->i_size)
+		return 0;
+
+	if (!object->backer)
+		return -ENOBUFS;
+
+	ASSERT(d_is_reg(object->backer));
+
+	fscache_set_store_limit(&object->fscache, ni_size);
+
+	oi_size = i_size_read(d_backing_inode(object->backer));
+	if (oi_size == ni_size)
+		return 0;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	mutex_lock(&d_inode(object->backer)->i_mutex);
+
+	/* if there's an extension to a partial page at the end of the backing
+	 * file, we need to discard the partial page so that we pick up new
+	 * data after it */
+	if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
+		_debug("discard tail %llx", oi_size);
+		newattrs.ia_valid = ATTR_SIZE;
+		newattrs.ia_size = oi_size & PAGE_MASK;
+		ret = notify_change(object->backer, &newattrs, NULL);
+		if (ret < 0)
+			goto truncate_failed;
+	}
+
+	newattrs.ia_valid = ATTR_SIZE;
+	newattrs.ia_size = ni_size;
+	ret = notify_change(object->backer, &newattrs, NULL);
+
+truncate_failed:
+	mutex_unlock(&d_inode(object->backer)->i_mutex);
+	cachefiles_end_secure(cache, saved_cred);
+
+	if (ret == -EIO) {
+		fscache_set_store_limit(&object->fscache, 0);
+		cachefiles_io_error_obj(object, "Size set failed");
+		ret = -ENOBUFS;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Invalidate an object
+ */
+static void cachefiles_invalidate_object(struct fscache_operation *op)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	struct path path;
+	uint64_t ni_size;
+	int ret;
+
+	object = container_of(op->object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
+					  &ni_size);
+
+	_enter("{OBJ%x},[%llu]",
+	       op->object->debug_id, (unsigned long long)ni_size);
+
+	if (object->backer) {
+		ASSERT(d_is_reg(object->backer));
+
+		fscache_set_store_limit(&object->fscache, ni_size);
+
+		path.dentry = object->backer;
+		path.mnt = cache->mnt;
+
+		cachefiles_begin_secure(cache, &saved_cred);
+		ret = vfs_truncate(&path, 0);
+		if (ret == 0)
+			ret = vfs_truncate(&path, ni_size);
+		cachefiles_end_secure(cache, saved_cred);
+
+		if (ret != 0) {
+			fscache_set_store_limit(&object->fscache, 0);
+			if (ret == -EIO)
+				cachefiles_io_error_obj(object,
+							"Invalidate failed");
+		}
+	}
+
+	fscache_op_complete(op, true);
+	_leave("");
+}
+
+/*
+ * dissociate a cache from all the pages it was backing
+ */
+static void cachefiles_dissociate_pages(struct fscache_cache *cache)
+{
+	_enter("");
+}
+
+const struct fscache_cache_ops cachefiles_cache_ops = {
+	.name			= "cachefiles",
+	.alloc_object		= cachefiles_alloc_object,
+	.lookup_object		= cachefiles_lookup_object,
+	.lookup_complete	= cachefiles_lookup_complete,
+	.grab_object		= cachefiles_grab_object,
+	.update_object		= cachefiles_update_object,
+	.invalidate_object	= cachefiles_invalidate_object,
+	.drop_object		= cachefiles_drop_object,
+	.put_object		= cachefiles_put_object,
+	.sync_cache		= cachefiles_sync_cache,
+	.attr_changed		= cachefiles_attr_changed,
+	.read_or_alloc_page	= cachefiles_read_or_alloc_page,
+	.read_or_alloc_pages	= cachefiles_read_or_alloc_pages,
+	.allocate_page		= cachefiles_allocate_page,
+	.allocate_pages		= cachefiles_allocate_pages,
+	.write_page		= cachefiles_write_page,
+	.uncache_page		= cachefiles_uncache_page,
+	.dissociate_pages	= cachefiles_dissociate_pages,
+	.check_consistency	= cachefiles_check_consistency,
+};
diff --git a/kernel/fs/cachefiles/internal.h b/kernel/fs/cachefiles/internal.h
new file mode 100644
index 000000000..8c52472d2
--- /dev/null
+++ b/kernel/fs/cachefiles/internal.h
@@ -0,0 +1,363 @@
+/* General netfs cache on cache files internal defs
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "CacheFiles: " fmt
+
+
+#include <linux/fscache-cache.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+
+struct cachefiles_cache;
+struct cachefiles_object;
+
+extern unsigned cachefiles_debug;
+#define CACHEFILES_DEBUG_KENTER	1
+#define CACHEFILES_DEBUG_KLEAVE	2
+#define CACHEFILES_DEBUG_KDEBUG	4
+
+#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+
+/*
+ * node records
+ */
+struct cachefiles_object {
+	struct fscache_object		fscache;	/* fscache handle */
+	struct cachefiles_lookup_data	*lookup_data;	/* cached lookup data */
+	struct dentry			*dentry;	/* the file/dir representing this object */
+	struct dentry			*backer;	/* backing file */
+	loff_t				i_size;		/* object size */
+	unsigned long			flags;
+#define CACHEFILES_OBJECT_ACTIVE	0		/* T if marked active */
+#define CACHEFILES_OBJECT_BURIED	1		/* T if preemptively buried */
+	atomic_t			usage;		/* object usage count */
+	uint8_t				type;		/* object type */
+	uint8_t				new;		/* T if object new */
+	spinlock_t			work_lock;
+	struct rb_node			active_node;	/* link in active tree (dentry is key) */
+};
+
+extern struct kmem_cache *cachefiles_object_jar;
+
+/*
+ * Cache files cache definition
+ */
+struct cachefiles_cache {
+	struct fscache_cache		cache;		/* FS-Cache record */
+	struct vfsmount			*mnt;		/* mountpoint holding the cache */
+	struct dentry			*graveyard;	/* directory into which dead objects go */
+	struct file			*cachefilesd;	/* manager daemon handle */
+	const struct cred		*cache_cred;	/* security override for accessing cache */
+	struct mutex			daemon_mutex;	/* command serialisation mutex */
+	wait_queue_head_t		daemon_pollwq;	/* poll waitqueue for daemon */
+	struct rb_root			active_nodes;	/* active nodes (can't be culled) */
+	rwlock_t			active_lock;	/* lock for active_nodes */
+	atomic_t			gravecounter;	/* graveyard uniquifier */
+	unsigned			frun_percent;	/* when to stop culling (% files) */
+	unsigned			fcull_percent;	/* when to start culling (% files) */
+	unsigned			fstop_percent;	/* when to stop allocating (% files) */
+	unsigned			brun_percent;	/* when to stop culling (% blocks) */
+	unsigned			bcull_percent;	/* when to start culling (% blocks) */
+	unsigned			bstop_percent;	/* when to stop allocating (% blocks) */
+	unsigned			bsize;		/* cache's block size */
+	unsigned			bshift;		/* min(ilog2(PAGE_SIZE / bsize), 0) */
+	uint64_t			frun;		/* when to stop culling */
+	uint64_t			fcull;		/* when to start culling */
+	uint64_t			fstop;		/* when to stop allocating */
+	sector_t			brun;		/* when to stop culling */
+	sector_t			bcull;		/* when to start culling */
+	sector_t			bstop;		/* when to stop allocating */
+	unsigned long			flags;
+#define CACHEFILES_READY		0	/* T if cache prepared */
+#define CACHEFILES_DEAD			1	/* T if cache dead */
+#define CACHEFILES_CULLING		2	/* T if cull engaged */
+#define CACHEFILES_STATE_CHANGED	3	/* T if state changed (poll trigger) */
+	char				*rootdirname;	/* name of cache root directory */
+	char				*secctx;	/* LSM security context */
+	char				*tag;		/* cache binding tag */
+};
+
+/*
+ * backing file read tracking
+ */
+struct cachefiles_one_read {
+	wait_queue_t			monitor;	/* link into monitored waitqueue */
+	struct page			*back_page;	/* backing file page we're waiting for */
+	struct page			*netfs_page;	/* netfs page we're going to fill */
+	struct fscache_retrieval	*op;		/* retrieval op covering this */
+	struct list_head		op_link;	/* link in op's todo list */
+};
+
+/*
+ * backing file write tracking
+ */
+struct cachefiles_one_write {
+	struct page			*netfs_page;	/* netfs page to copy */
+	struct cachefiles_object	*object;
+	struct list_head		obj_link;	/* link in object's lists */
+	fscache_rw_complete_t		end_io_func;
+	void				*context;
+};
+
+/*
+ * auxiliary data xattr buffer
+ */
+struct cachefiles_xattr {
+	uint16_t			len;
+	uint8_t				type;
+	uint8_t				data[];
+};
+
+/*
+ * note change of state for daemon
+ */
+static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
+{
+	set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
+	wake_up_all(&cache->daemon_pollwq);
+}
+
+/*
+ * bind.c
+ */
+extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
+extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
+
+/*
+ * daemon.c
+ */
+extern const struct file_operations cachefiles_daemon_fops;
+
+extern int cachefiles_has_space(struct cachefiles_cache *cache,
+				unsigned fnr, unsigned bnr);
+
+/*
+ * interface.c
+ */
+extern const struct fscache_cache_ops cachefiles_cache_ops;
+
+/*
+ * key.c
+ */
+extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
+
+/*
+ * namei.c
+ */
+extern int cachefiles_delete_object(struct cachefiles_cache *cache,
+				    struct cachefiles_object *object);
+extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
+				     struct cachefiles_object *object,
+				     const char *key,
+				     struct cachefiles_xattr *auxdata);
+extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+					       struct dentry *dir,
+					       const char *name);
+
+extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
+			   char *filename);
+
+extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
+				   struct dentry *dir, char *filename);
+
+/*
+ * proc.c
+ */
+#ifdef CONFIG_CACHEFILES_HISTOGRAM
+extern atomic_t cachefiles_lookup_histogram[HZ];
+extern atomic_t cachefiles_mkdir_histogram[HZ];
+extern atomic_t cachefiles_create_histogram[HZ];
+
+extern int __init cachefiles_proc_init(void);
+extern void cachefiles_proc_cleanup(void);
+static inline
+void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
+{
+	unsigned long jif = jiffies - start_jif;
+	if (jif >= HZ)
+		jif = HZ - 1;
+	atomic_inc(&histogram[jif]);
+}
+
+#else
+#define cachefiles_proc_init()		(0)
+#define cachefiles_proc_cleanup()	do {} while (0)
+#define cachefiles_hist(hist, start_jif) do {} while (0)
+#endif
+
+/*
+ * rdwr.c
+ */
+extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
+					 struct page *, gfp_t);
+extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
+					  struct list_head *, unsigned *,
+					  gfp_t);
+extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
+				    gfp_t);
+extern int cachefiles_allocate_pages(struct fscache_retrieval *,
+				     struct list_head *, unsigned *, gfp_t);
+extern int cachefiles_write_page(struct fscache_storage *, struct page *);
+extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
+
+/*
+ * security.c
+ */
+extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
+extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
+					       struct dentry *root,
+					       const struct cred **_saved_cred);
+
+static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
+					   const struct cred **_saved_cred)
+{
+	*_saved_cred = override_creds(cache->cache_cred);
+}
+
+static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
+					 const struct cred *saved_cred)
+{
+	revert_creds(saved_cred);
+}
+
+/*
+ * xattr.c
+ */
+extern int cachefiles_check_object_type(struct cachefiles_object *object);
+extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
+				       struct cachefiles_xattr *auxdata);
+extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
+					  struct cachefiles_xattr *auxdata);
+extern int cachefiles_check_auxdata(struct cachefiles_object *object);
+extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
+					 struct cachefiles_xattr *auxdata);
+extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+					  struct dentry *dentry);
+
+
+/*
+ * error handling
+ */
+
+#define cachefiles_io_error(___cache, FMT, ...)		\
+do {							\
+	pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__);	\
+	fscache_io_error(&(___cache)->cache);		\
+	set_bit(CACHEFILES_DEAD, &(___cache)->flags);	\
+} while (0)
+
+#define cachefiles_io_error_obj(object, FMT, ...)			\
+do {									\
+	struct cachefiles_cache *___cache;				\
+									\
+	___cache = container_of((object)->fscache.cache,		\
+				struct cachefiles_cache, cache);	\
+	cachefiles_io_error(___cache, FMT, ##__VA_ARGS__);		\
+} while (0)
+
+
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+
+#if defined(__KDEBUG)
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_CACHEFILES_DEBUG)
+#define _enter(FMT, ...)				\
+do {							\
+	if (cachefiles_debug & CACHEFILES_DEBUG_KENTER)	\
+		kenter(FMT, ##__VA_ARGS__);		\
+} while (0)
+
+#define _leave(FMT, ...)				\
+do {							\
+	if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE)	\
+		kleave(FMT, ##__VA_ARGS__);		\
+} while (0)
+
+#define _debug(FMT, ...)				\
+do {							\
+	if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG)	\
+		kdebug(FMT, ##__VA_ARGS__);		\
+} while (0)
+
+#else
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
+#endif
+
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X)							\
+do {									\
+	if (unlikely(!(X))) {						\
+		pr_err("\n");						\
+		pr_err("Assertion failed\n");		\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTCMP(X, OP, Y)						\
+do {									\
+	if (unlikely(!((X) OP (Y)))) {					\
+		pr_err("\n");						\
+		pr_err("Assertion failed\n");		\
+		pr_err("%lx " #OP " %lx is false\n",			\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTIF(C, X)							\
+do {									\
+	if (unlikely((C) && !(X))) {					\
+		pr_err("\n");						\
+		pr_err("Assertion failed\n");		\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y)					\
+do {									\
+	if (unlikely((C) && !((X) OP (Y)))) {				\
+		pr_err("\n");						\
+		pr_err("Assertion failed\n");		\
+		pr_err("%lx " #OP " %lx is false\n",			\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while (0)
+
+#else
+
+#define ASSERT(X)			do {} while (0)
+#define ASSERTCMP(X, OP, Y)		do {} while (0)
+#define ASSERTIF(C, X)			do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y)	do {} while (0)
+
+#endif
diff --git a/kernel/fs/cachefiles/key.c b/kernel/fs/cachefiles/key.c
new file mode 100644
index 000000000..33b58c60f
--- /dev/null
+++ b/kernel/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
+/* Key to pathname encoder
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include "internal.h"
+
+static const char cachefiles_charmap[64] =
+	"0123456789"			/* 0 - 9 */
+	"abcdefghijklmnopqrstuvwxyz"	/* 10 - 35 */
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"	/* 36 - 61 */
+	"_-"				/* 62 - 63 */
+	;
+
+static const char cachefiles_filecharmap[256] = {
+	/* we skip space and tab and control chars */
+	[33 ... 46] = 1,		/* '!' -> '.' */
+	/* we skip '/' as it's significant to pathwalk */
+	[48 ... 127] = 1,		/* '0' -> '~' */
+};
+
+/*
+ * turn the raw key into something cooked
+ * - the raw key should include the length in the two bytes at the front
+ * - the key may be up to 514 bytes in length (including the length word)
+ *   - "base64" encode the strange keys, mapping 3 bytes of raw to four of
+ *     cooked
+ *   - need to cut the cooked key into 252 char lengths (189 raw bytes)
+ */
+char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
+{
+	unsigned char csum, ch;
+	unsigned int acc;
+	char *key;
+	int loop, len, max, seg, mark, print;
+
+	_enter(",%d", keylen);
+
+	BUG_ON(keylen < 2 || keylen > 514);
+
+	csum = raw[0] + raw[1];
+	print = 1;
+	for (loop = 2; loop < keylen; loop++) {
+		ch = raw[loop];
+		csum += ch;
+		print &= cachefiles_filecharmap[ch];
+	}
+
+	if (print) {
+		/* if the path is usable ASCII, then we render it directly */
+		max = keylen - 2;
+		max += 2;	/* two base64'd length chars on the front */
+		max += 5;	/* @checksum/M */
+		max += 3 * 2;	/* maximum number of segment dividers (".../M")
+				 * is ((514 + 251) / 252) = 3
+				 */
+		max += 1;	/* NUL on end */
+	} else {
+		/* calculate the maximum length of the cooked key */
+		keylen = (keylen + 2) / 3;
+
+		max = keylen * 4;
+		max += 5;	/* @checksum/M */
+		max += 3 * 2;	/* maximum number of segment dividers (".../M")
+				 * is ((514 + 188) / 189) = 3
+				 */
+		max += 1;	/* NUL on end */
+	}
+
+	max += 1;	/* 2nd NUL on end */
+
+	_debug("max: %d", max);
+
+	key = kmalloc(max, cachefiles_gfp);
+	if (!key)
+		return NULL;
+
+	len = 0;
+
+	/* build the cooked key */
+	sprintf(key, "@%02x%c+", (unsigned) csum, 0);
+	len = 5;
+	mark = len - 1;
+
+	if (print) {
+		acc = *(uint16_t *) raw;
+		raw += 2;
+
+		key[len + 1] = cachefiles_charmap[acc & 63];
+		acc >>= 6;
+		key[len] = cachefiles_charmap[acc & 63];
+		len += 2;
+
+		seg = 250;
+		for (loop = keylen; loop > 0; loop--) {
+			if (seg <= 0) {
+				key[len++] = '\0';
+				mark = len;
+				key[len++] = '+';
+				seg = 252;
+			}
+
+			key[len++] = *raw++;
+			ASSERT(len < max);
+		}
+
+		switch (type) {
+		case FSCACHE_COOKIE_TYPE_INDEX:		type = 'I';	break;
+		case FSCACHE_COOKIE_TYPE_DATAFILE:	type = 'D';	break;
+		default:				type = 'S';	break;
+		}
+	} else {
+		seg = 252;
+		for (loop = keylen; loop > 0; loop--) {
+			if (seg <= 0) {
+				key[len++] = '\0';
+				mark = len;
+				key[len++] = '+';
+				seg = 252;
+			}
+
+			acc = *raw++;
+			acc |= *raw++ << 8;
+			acc |= *raw++ << 16;
+
+			_debug("acc: %06x", acc);
+
+			key[len++] = cachefiles_charmap[acc & 63];
+			acc >>= 6;
+			key[len++] = cachefiles_charmap[acc & 63];
+			acc >>= 6;
+			key[len++] = cachefiles_charmap[acc & 63];
+			acc >>= 6;
+			key[len++] = cachefiles_charmap[acc & 63];
+
+			ASSERT(len < max);
+		}
+
+		switch (type) {
+		case FSCACHE_COOKIE_TYPE_INDEX:		type = 'J';	break;
+		case FSCACHE_COOKIE_TYPE_DATAFILE:	type = 'E';	break;
+		default:				type = 'T';	break;
+		}
+	}
+
+	key[mark] = type;
+	key[len++] = 0;
+	key[len] = 0;
+
+	_leave(" = %p %d", key, len);
+	return key;
+}
diff --git a/kernel/fs/cachefiles/main.c b/kernel/fs/cachefiles/main.c
new file mode 100644
index 000000000..711f13d8c
--- /dev/null
+++ b/kernel/fs/cachefiles/main.c
@@ -0,0 +1,105 @@
+/* Network filesystem caching backend to use cache files on a premounted
+ * filesystem
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/sysctl.h>
+#include <linux/miscdevice.h>
+#include "internal.h"
+
+unsigned cachefiles_debug;
+module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask");
+
+MODULE_DESCRIPTION("Mounted-filesystem based cache");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+struct kmem_cache *cachefiles_object_jar;
+
+static struct miscdevice cachefiles_dev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "cachefiles",
+	.fops	= &cachefiles_daemon_fops,
+};
+
+static void cachefiles_object_init_once(void *_object)
+{
+	struct cachefiles_object *object = _object;
+
+	memset(object, 0, sizeof(*object));
+	spin_lock_init(&object->work_lock);
+}
+
+/*
+ * initialise the fs caching module
+ */
+static int __init cachefiles_init(void)
+{
+	int ret;
+
+	ret = misc_register(&cachefiles_dev);
+	if (ret < 0)
+		goto error_dev;
+
+	/* create an object jar */
+	ret = -ENOMEM;
+	cachefiles_object_jar =
+		kmem_cache_create("cachefiles_object_jar",
+				  sizeof(struct cachefiles_object),
+				  0,
+				  SLAB_HWCACHE_ALIGN,
+				  cachefiles_object_init_once);
+	if (!cachefiles_object_jar) {
+		pr_notice("Failed to allocate an object jar\n");
+		goto error_object_jar;
+	}
+
+	ret = cachefiles_proc_init();
+	if (ret < 0)
+		goto error_proc;
+
+	pr_info("Loaded\n");
+	return 0;
+
+error_proc:
+	kmem_cache_destroy(cachefiles_object_jar);
+error_object_jar:
+	misc_deregister(&cachefiles_dev);
+error_dev:
+	pr_err("failed to register: %d\n", ret);
+	return ret;
+}
+
+fs_initcall(cachefiles_init);
+
+/*
+ * clean up on module removal
+ */
+static void __exit cachefiles_exit(void)
+{
+	pr_info("Unloading\n");
+
+	cachefiles_proc_cleanup();
+	kmem_cache_destroy(cachefiles_object_jar);
+	misc_deregister(&cachefiles_dev);
+}
+
+module_exit(cachefiles_exit);
diff --git a/kernel/fs/cachefiles/namei.c b/kernel/fs/cachefiles/namei.c
new file mode 100644
index 000000000..ab857ab9f
--- /dev/null
+++ b/kernel/fs/cachefiles/namei.c
@@ -0,0 +1,978 @@
+/* CacheFiles path walking and related routines
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/xattr.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+#define CACHEFILES_KEYBUF_SIZE 512
+
+/*
+ * dump debugging info about an object
+ */
+static noinline
+void __cachefiles_printk_object(struct cachefiles_object *object,
+				const char *prefix,
+				u8 *keybuf)
+{
+	struct fscache_cookie *cookie;
+	unsigned keylen, loop;
+
+	pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id);
+	pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
+	       prefix, object->fscache.state->name,
+	       object->fscache.flags, work_busy(&object->fscache.work),
+	       object->fscache.events, object->fscache.event_mask);
+	pr_err("%sops=%u inp=%u exc=%u\n",
+	       prefix, object->fscache.n_ops, object->fscache.n_in_progress,
+	       object->fscache.n_exclusive);
+	pr_err("%sparent=%p\n",
+	       prefix, object->fscache.parent);
+
+	spin_lock(&object->fscache.lock);
+	cookie = object->fscache.cookie;
+	if (cookie) {
+		pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n",
+		       prefix,
+		       object->fscache.cookie,
+		       object->fscache.cookie->parent,
+		       object->fscache.cookie->netfs_data,
+		       object->fscache.cookie->flags);
+		if (keybuf && cookie->def)
+			keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
+						      CACHEFILES_KEYBUF_SIZE);
+		else
+			keylen = 0;
+	} else {
+		pr_err("%scookie=NULL\n", prefix);
+		keylen = 0;
+	}
+	spin_unlock(&object->fscache.lock);
+
+	if (keylen) {
+		pr_err("%skey=[%u] '", prefix, keylen);
+		for (loop = 0; loop < keylen; loop++)
+			pr_cont("%02x", keybuf[loop]);
+		pr_cont("'\n");
+	}
+}
+
+/*
+ * dump debugging info about a pair of objects
+ */
+static noinline void cachefiles_printk_object(struct cachefiles_object *object,
+					      struct cachefiles_object *xobject)
+{
+	u8 *keybuf;
+
+	keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO);
+	if (object)
+		__cachefiles_printk_object(object, "", keybuf);
+	if (xobject)
+		__cachefiles_printk_object(xobject, "x", keybuf);
+	kfree(keybuf);
+}
+
+/*
+ * mark the owner of a dentry, if there is one, to indicate that that dentry
+ * has been preemptively deleted
+ * - the caller must hold the i_mutex on the dentry's parent as required to
+ *   call vfs_unlink(), vfs_rmdir() or vfs_rename()
+ */
+static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
+					  struct dentry *dentry)
+{
+	struct cachefiles_object *object;
+	struct rb_node *p;
+
+	_enter(",'%pd'", dentry);
+
+	write_lock(&cache->active_lock);
+
+	p = cache->active_nodes.rb_node;
+	while (p) {
+		object = rb_entry(p, struct cachefiles_object, active_node);
+		if (object->dentry > dentry)
+			p = p->rb_left;
+		else if (object->dentry < dentry)
+			p = p->rb_right;
+		else
+			goto found_dentry;
+	}
+
+	write_unlock(&cache->active_lock);
+	_leave(" [no owner]");
+	return;
+
+	/* found the dentry for  */
+found_dentry:
+	kdebug("preemptive burial: OBJ%x [%s] %p",
+	       object->fscache.debug_id,
+	       object->fscache.state->name,
+	       dentry);
+
+	if (fscache_object_is_live(&object->fscache)) {
+		pr_err("\n");
+		pr_err("Error: Can't preemptively bury live object\n");
+		cachefiles_printk_object(object, NULL);
+	} else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+		pr_err("Error: Object already preemptively buried\n");
+	}
+
+	write_unlock(&cache->active_lock);
+	_leave(" [owner marked]");
+}
+
+/*
+ * record the fact that an object is now active
+ */
+static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
+					 struct cachefiles_object *object)
+{
+	struct cachefiles_object *xobject;
+	struct rb_node **_p, *_parent = NULL;
+	struct dentry *dentry;
+
+	_enter(",%p", object);
+
+try_again:
+	write_lock(&cache->active_lock);
+
+	if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
+		pr_err("Error: Object already active\n");
+		cachefiles_printk_object(object, NULL);
+		BUG();
+	}
+
+	dentry = object->dentry;
+	_p = &cache->active_nodes.rb_node;
+	while (*_p) {
+		_parent = *_p;
+		xobject = rb_entry(_parent,
+				   struct cachefiles_object, active_node);
+
+		ASSERT(xobject != object);
+
+		if (xobject->dentry > dentry)
+			_p = &(*_p)->rb_left;
+		else if (xobject->dentry < dentry)
+			_p = &(*_p)->rb_right;
+		else
+			goto wait_for_old_object;
+	}
+
+	rb_link_node(&object->active_node, _parent, _p);
+	rb_insert_color(&object->active_node, &cache->active_nodes);
+
+	write_unlock(&cache->active_lock);
+	_leave(" = 0");
+	return 0;
+
+	/* an old object from a previous incarnation is hogging the slot - we
+	 * need to wait for it to be destroyed */
+wait_for_old_object:
+	if (fscache_object_is_live(&xobject->fscache)) {
+		pr_err("\n");
+		pr_err("Error: Unexpected object collision\n");
+		cachefiles_printk_object(object, xobject);
+		BUG();
+	}
+	atomic_inc(&xobject->usage);
+	write_unlock(&cache->active_lock);
+
+	if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+		wait_queue_head_t *wq;
+
+		signed long timeout = 60 * HZ;
+		wait_queue_t wait;
+		bool requeue;
+
+		/* if the object we're waiting for is queued for processing,
+		 * then just put ourselves on the queue behind it */
+		if (work_pending(&xobject->fscache.work)) {
+			_debug("queue OBJ%x behind OBJ%x immediately",
+			       object->fscache.debug_id,
+			       xobject->fscache.debug_id);
+			goto requeue;
+		}
+
+		/* otherwise we sleep until either the object we're waiting for
+		 * is done, or the fscache_object is congested */
+		wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
+		init_wait(&wait);
+		requeue = false;
+		do {
+			prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+			if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
+				break;
+
+			requeue = fscache_object_sleep_till_congested(&timeout);
+		} while (timeout > 0 && !requeue);
+		finish_wait(wq, &wait);
+
+		if (requeue &&
+		    test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+			_debug("queue OBJ%x behind OBJ%x after wait",
+			       object->fscache.debug_id,
+			       xobject->fscache.debug_id);
+			goto requeue;
+		}
+
+		if (timeout <= 0) {
+			pr_err("\n");
+			pr_err("Error: Overlong wait for old active object to go away\n");
+			cachefiles_printk_object(object, xobject);
+			goto requeue;
+		}
+	}
+
+	ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
+
+	cache->cache.ops->put_object(&xobject->fscache);
+	goto try_again;
+
+requeue:
+	clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+	cache->cache.ops->put_object(&xobject->fscache);
+	_leave(" = -ETIMEDOUT");
+	return -ETIMEDOUT;
+}
+
+/*
+ * delete an object representation from the cache
+ * - file backed objects are unlinked
+ * - directory backed objects are stuffed into the graveyard for userspace to
+ *   delete
+ * - unlocks the directory mutex
+ */
+static int cachefiles_bury_object(struct cachefiles_cache *cache,
+				  struct dentry *dir,
+				  struct dentry *rep,
+				  bool preemptive)
+{
+	struct dentry *grave, *trap;
+	struct path path, path_to_graveyard;
+	char nbuffer[8 + 8 + 1];
+	int ret;
+
+	_enter(",'%pd','%pd'", dir, rep);
+
+	_debug("remove %p from %p", rep, dir);
+
+	/* non-directories can just be unlinked */
+	if (!d_is_dir(rep)) {
+		_debug("unlink stale object");
+
+		path.mnt = cache->mnt;
+		path.dentry = dir;
+		ret = security_path_unlink(&path, rep);
+		if (ret < 0) {
+			cachefiles_io_error(cache, "Unlink security error");
+		} else {
+			ret = vfs_unlink(d_inode(dir), rep, NULL);
+
+			if (preemptive)
+				cachefiles_mark_object_buried(cache, rep);
+		}
+
+		mutex_unlock(&d_inode(dir)->i_mutex);
+
+		if (ret == -EIO)
+			cachefiles_io_error(cache, "Unlink failed");
+
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* directories have to be moved to the graveyard */
+	_debug("move stale object to graveyard");
+	mutex_unlock(&d_inode(dir)->i_mutex);
+
+try_again:
+	/* first step is to make up a grave dentry in the graveyard */
+	sprintf(nbuffer, "%08x%08x",
+		(uint32_t) get_seconds(),
+		(uint32_t) atomic_inc_return(&cache->gravecounter));
+
+	/* do the multiway lock magic */
+	trap = lock_rename(cache->graveyard, dir);
+
+	/* do some checks before getting the grave dentry */
+	if (rep->d_parent != dir) {
+		/* the entry was probably culled when we dropped the parent dir
+		 * lock */
+		unlock_rename(cache->graveyard, dir);
+		_leave(" = 0 [culled?]");
+		return 0;
+	}
+
+	if (!d_can_lookup(cache->graveyard)) {
+		unlock_rename(cache->graveyard, dir);
+		cachefiles_io_error(cache, "Graveyard no longer a directory");
+		return -EIO;
+	}
+
+	if (trap == rep) {
+		unlock_rename(cache->graveyard, dir);
+		cachefiles_io_error(cache, "May not make directory loop");
+		return -EIO;
+	}
+
+	if (d_mountpoint(rep)) {
+		unlock_rename(cache->graveyard, dir);
+		cachefiles_io_error(cache, "Mountpoint in cache");
+		return -EIO;
+	}
+
+	grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
+	if (IS_ERR(grave)) {
+		unlock_rename(cache->graveyard, dir);
+
+		if (PTR_ERR(grave) == -ENOMEM) {
+			_leave(" = -ENOMEM");
+			return -ENOMEM;
+		}
+
+		cachefiles_io_error(cache, "Lookup error %ld",
+				    PTR_ERR(grave));
+		return -EIO;
+	}
+
+	if (d_is_positive(grave)) {
+		unlock_rename(cache->graveyard, dir);
+		dput(grave);
+		grave = NULL;
+		cond_resched();
+		goto try_again;
+	}
+
+	if (d_mountpoint(grave)) {
+		unlock_rename(cache->graveyard, dir);
+		dput(grave);
+		cachefiles_io_error(cache, "Mountpoint in graveyard");
+		return -EIO;
+	}
+
+	/* target should not be an ancestor of source */
+	if (trap == grave) {
+		unlock_rename(cache->graveyard, dir);
+		dput(grave);
+		cachefiles_io_error(cache, "May not make directory loop");
+		return -EIO;
+	}
+
+	/* attempt the rename */
+	path.mnt = cache->mnt;
+	path.dentry = dir;
+	path_to_graveyard.mnt = cache->mnt;
+	path_to_graveyard.dentry = cache->graveyard;
+	ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
+	if (ret < 0) {
+		cachefiles_io_error(cache, "Rename security error %d", ret);
+	} else {
+		ret = vfs_rename(d_inode(dir), rep,
+				 d_inode(cache->graveyard), grave, NULL, 0);
+		if (ret != 0 && ret != -ENOMEM)
+			cachefiles_io_error(cache,
+					    "Rename failed with error %d", ret);
+
+		if (preemptive)
+			cachefiles_mark_object_buried(cache, rep);
+	}
+
+	unlock_rename(cache->graveyard, dir);
+	dput(grave);
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * delete an object representation from the cache
+ */
+int cachefiles_delete_object(struct cachefiles_cache *cache,
+			     struct cachefiles_object *object)
+{
+	struct dentry *dir;
+	int ret;
+
+	_enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
+
+	ASSERT(object->dentry);
+	ASSERT(d_backing_inode(object->dentry));
+	ASSERT(object->dentry->d_parent);
+
+	dir = dget_parent(object->dentry);
+
+	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+
+	if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+		/* object allocation for the same key preemptively deleted this
+		 * object's file so that it could create its own file */
+		_debug("object preemptively buried");
+		mutex_unlock(&d_inode(dir)->i_mutex);
+		ret = 0;
+	} else {
+		/* we need to check that our parent is _still_ our parent - it
+		 * may have been renamed */
+		if (dir == object->dentry->d_parent) {
+			ret = cachefiles_bury_object(cache, dir,
+						     object->dentry, false);
+		} else {
+			/* it got moved, presumably by cachefilesd culling it,
+			 * so it's no longer in the key path and we can ignore
+			 * it */
+			mutex_unlock(&d_inode(dir)->i_mutex);
+			ret = 0;
+		}
+	}
+
+	dput(dir);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * walk from the parent object to the child object through the backing
+ * filesystem, creating directories as we go
+ */
+int cachefiles_walk_to_object(struct cachefiles_object *parent,
+			      struct cachefiles_object *object,
+			      const char *key,
+			      struct cachefiles_xattr *auxdata)
+{
+	struct cachefiles_cache *cache;
+	struct dentry *dir, *next = NULL;
+	struct path path;
+	unsigned long start;
+	const char *name;
+	int ret, nlen;
+
+	_enter("OBJ%x{%p},OBJ%x,%s,",
+	       parent->fscache.debug_id, parent->dentry,
+	       object->fscache.debug_id, key);
+
+	cache = container_of(parent->fscache.cache,
+			     struct cachefiles_cache, cache);
+	path.mnt = cache->mnt;
+
+	ASSERT(parent->dentry);
+	ASSERT(d_backing_inode(parent->dentry));
+
+	if (!(d_is_dir(parent->dentry))) {
+		// TODO: convert file to dir
+		_leave("looking up in none directory");
+		return -ENOBUFS;
+	}
+
+	dir = dget(parent->dentry);
+
+advance:
+	/* attempt to transit the first directory component */
+	name = key;
+	nlen = strlen(key);
+
+	/* key ends in a double NUL */
+	key = key + nlen + 1;
+	if (!*key)
+		key = NULL;
+
+lookup_again:
+	/* search the current directory for the element name */
+	_debug("lookup '%s'", name);
+
+	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+
+	start = jiffies;
+	next = lookup_one_len(name, dir, nlen);
+	cachefiles_hist(cachefiles_lookup_histogram, start);
+	if (IS_ERR(next))
+		goto lookup_error;
+
+	_debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative");
+
+	if (!key)
+		object->new = !d_backing_inode(next);
+
+	/* if this element of the path doesn't exist, then the lookup phase
+	 * failed, and we can release any readers in the certain knowledge that
+	 * there's nothing for them to actually read */
+	if (d_is_negative(next))
+		fscache_object_lookup_negative(&object->fscache);
+
+	/* we need to create the object if it's negative */
+	if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
+		/* index objects and intervening tree levels must be subdirs */
+		if (d_is_negative(next)) {
+			ret = cachefiles_has_space(cache, 1, 0);
+			if (ret < 0)
+				goto create_error;
+
+			path.dentry = dir;
+			ret = security_path_mkdir(&path, next, 0);
+			if (ret < 0)
+				goto create_error;
+			start = jiffies;
+			ret = vfs_mkdir(d_inode(dir), next, 0);
+			cachefiles_hist(cachefiles_mkdir_histogram, start);
+			if (ret < 0)
+				goto create_error;
+
+			ASSERT(d_backing_inode(next));
+
+			_debug("mkdir -> %p{%p{ino=%lu}}",
+			       next, d_backing_inode(next), d_backing_inode(next)->i_ino);
+
+		} else if (!d_can_lookup(next)) {
+			pr_err("inode %lu is not a directory\n",
+			       d_backing_inode(next)->i_ino);
+			ret = -ENOBUFS;
+			goto error;
+		}
+
+	} else {
+		/* non-index objects start out life as files */
+		if (d_is_negative(next)) {
+			ret = cachefiles_has_space(cache, 1, 0);
+			if (ret < 0)
+				goto create_error;
+
+			path.dentry = dir;
+			ret = security_path_mknod(&path, next, S_IFREG, 0);
+			if (ret < 0)
+				goto create_error;
+			start = jiffies;
+			ret = vfs_create(d_inode(dir), next, S_IFREG, true);
+			cachefiles_hist(cachefiles_create_histogram, start);
+			if (ret < 0)
+				goto create_error;
+
+			ASSERT(d_backing_inode(next));
+
+			_debug("create -> %p{%p{ino=%lu}}",
+			       next, d_backing_inode(next), d_backing_inode(next)->i_ino);
+
+		} else if (!d_can_lookup(next) &&
+			   !d_is_reg(next)
+			   ) {
+			pr_err("inode %lu is not a file or directory\n",
+			       d_backing_inode(next)->i_ino);
+			ret = -ENOBUFS;
+			goto error;
+		}
+	}
+
+	/* process the next component */
+	if (key) {
+		_debug("advance");
+		mutex_unlock(&d_inode(dir)->i_mutex);
+		dput(dir);
+		dir = next;
+		next = NULL;
+		goto advance;
+	}
+
+	/* we've found the object we were looking for */
+	object->dentry = next;
+
+	/* if we've found that the terminal object exists, then we need to
+	 * check its attributes and delete it if it's out of date */
+	if (!object->new) {
+		_debug("validate '%pd'", next);
+
+		ret = cachefiles_check_object_xattr(object, auxdata);
+		if (ret == -ESTALE) {
+			/* delete the object (the deleter drops the directory
+			 * mutex) */
+			object->dentry = NULL;
+
+			ret = cachefiles_bury_object(cache, dir, next, true);
+			dput(next);
+			next = NULL;
+
+			if (ret < 0)
+				goto delete_error;
+
+			_debug("redo lookup");
+			goto lookup_again;
+		}
+	}
+
+	/* note that we're now using this object */
+	ret = cachefiles_mark_object_active(cache, object);
+
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	dput(dir);
+	dir = NULL;
+
+	if (ret == -ETIMEDOUT)
+		goto mark_active_timed_out;
+
+	_debug("=== OBTAINED_OBJECT ===");
+
+	if (object->new) {
+		/* attach data to a newly constructed terminal object */
+		ret = cachefiles_set_object_xattr(object, auxdata);
+		if (ret < 0)
+			goto check_error;
+	} else {
+		/* always update the atime on an object we've just looked up
+		 * (this is used to keep track of culling, and atimes are only
+		 * updated by read, write and readdir but not lookup or
+		 * open) */
+		path.dentry = next;
+		touch_atime(&path);
+	}
+
+	/* open a file interface onto a data file */
+	if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		if (d_is_reg(object->dentry)) {
+			const struct address_space_operations *aops;
+
+			ret = -EPERM;
+			aops = d_backing_inode(object->dentry)->i_mapping->a_ops;
+			if (!aops->bmap)
+				goto check_error;
+
+			object->backer = object->dentry;
+		} else {
+			BUG(); // TODO: open file in data-class subdir
+		}
+	}
+
+	object->new = 0;
+	fscache_obtained_object(&object->fscache);
+
+	_leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino);
+	return 0;
+
+create_error:
+	_debug("create error %d", ret);
+	if (ret == -EIO)
+		cachefiles_io_error(cache, "Create/mkdir failed");
+	goto error;
+
+mark_active_timed_out:
+	_debug("mark active timed out");
+	goto release_dentry;
+
+check_error:
+	_debug("check error %d", ret);
+	write_lock(&cache->active_lock);
+	rb_erase(&object->active_node, &cache->active_nodes);
+	clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+	wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+	write_unlock(&cache->active_lock);
+release_dentry:
+	dput(object->dentry);
+	object->dentry = NULL;
+	goto error_out;
+
+delete_error:
+	_debug("delete error %d", ret);
+	goto error_out2;
+
+lookup_error:
+	_debug("lookup error %ld", PTR_ERR(next));
+	ret = PTR_ERR(next);
+	if (ret == -EIO)
+		cachefiles_io_error(cache, "Lookup failed");
+	next = NULL;
+error:
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	dput(next);
+error_out2:
+	dput(dir);
+error_out:
+	_leave(" = error %d", -ret);
+	return ret;
+}
+
+/*
+ * get a subdirectory
+ */
+struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+					struct dentry *dir,
+					const char *dirname)
+{
+	struct dentry *subdir;
+	unsigned long start;
+	struct path path;
+	int ret;
+
+	_enter(",,%s", dirname);
+
+	/* search the current directory for the element name */
+	mutex_lock(&d_inode(dir)->i_mutex);
+
+	start = jiffies;
+	subdir = lookup_one_len(dirname, dir, strlen(dirname));
+	cachefiles_hist(cachefiles_lookup_histogram, start);
+	if (IS_ERR(subdir)) {
+		if (PTR_ERR(subdir) == -ENOMEM)
+			goto nomem_d_alloc;
+		goto lookup_error;
+	}
+
+	_debug("subdir -> %p %s",
+	       subdir, d_backing_inode(subdir) ? "positive" : "negative");
+
+	/* we need to create the subdir if it doesn't exist yet */
+	if (d_is_negative(subdir)) {
+		ret = cachefiles_has_space(cache, 1, 0);
+		if (ret < 0)
+			goto mkdir_error;
+
+		_debug("attempt mkdir");
+
+		path.mnt = cache->mnt;
+		path.dentry = dir;
+		ret = security_path_mkdir(&path, subdir, 0700);
+		if (ret < 0)
+			goto mkdir_error;
+		ret = vfs_mkdir(d_inode(dir), subdir, 0700);
+		if (ret < 0)
+			goto mkdir_error;
+
+		ASSERT(d_backing_inode(subdir));
+
+		_debug("mkdir -> %p{%p{ino=%lu}}",
+		       subdir,
+		       d_backing_inode(subdir),
+		       d_backing_inode(subdir)->i_ino);
+	}
+
+	mutex_unlock(&d_inode(dir)->i_mutex);
+
+	/* we need to make sure the subdir is a directory */
+	ASSERT(d_backing_inode(subdir));
+
+	if (!d_can_lookup(subdir)) {
+		pr_err("%s is not a directory\n", dirname);
+		ret = -EIO;
+		goto check_error;
+	}
+
+	ret = -EPERM;
+	if (!d_backing_inode(subdir)->i_op->setxattr ||
+	    !d_backing_inode(subdir)->i_op->getxattr ||
+	    !d_backing_inode(subdir)->i_op->lookup ||
+	    !d_backing_inode(subdir)->i_op->mkdir ||
+	    !d_backing_inode(subdir)->i_op->create ||
+	    (!d_backing_inode(subdir)->i_op->rename &&
+	     !d_backing_inode(subdir)->i_op->rename2) ||
+	    !d_backing_inode(subdir)->i_op->rmdir ||
+	    !d_backing_inode(subdir)->i_op->unlink)
+		goto check_error;
+
+	_leave(" = [%lu]", d_backing_inode(subdir)->i_ino);
+	return subdir;
+
+check_error:
+	dput(subdir);
+	_leave(" = %d [check]", ret);
+	return ERR_PTR(ret);
+
+mkdir_error:
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	dput(subdir);
+	pr_err("mkdir %s failed with error %d\n", dirname, ret);
+	return ERR_PTR(ret);
+
+lookup_error:
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	ret = PTR_ERR(subdir);
+	pr_err("Lookup %s failed with error %d\n", dirname, ret);
+	return ERR_PTR(ret);
+
+nomem_d_alloc:
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	_leave(" = -ENOMEM");
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * find out if an object is in use or not
+ * - if finds object and it's not in use:
+ *   - returns a pointer to the object and a reference on it
+ *   - returns with the directory locked
+ */
+static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
+					      struct dentry *dir,
+					      char *filename)
+{
+	struct cachefiles_object *object;
+	struct rb_node *_n;
+	struct dentry *victim;
+	unsigned long start;
+	int ret;
+
+	//_enter(",%pd/,%s",
+	//       dir, filename);
+
+	/* look up the victim */
+	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+
+	start = jiffies;
+	victim = lookup_one_len(filename, dir, strlen(filename));
+	cachefiles_hist(cachefiles_lookup_histogram, start);
+	if (IS_ERR(victim))
+		goto lookup_error;
+
+	//_debug("victim -> %p %s",
+	//       victim, d_backing_inode(victim) ? "positive" : "negative");
+
+	/* if the object is no longer there then we probably retired the object
+	 * at the netfs's request whilst the cull was in progress
+	 */
+	if (d_is_negative(victim)) {
+		mutex_unlock(&d_inode(dir)->i_mutex);
+		dput(victim);
+		_leave(" = -ENOENT [absent]");
+		return ERR_PTR(-ENOENT);
+	}
+
+	/* check to see if we're using this object */
+	read_lock(&cache->active_lock);
+
+	_n = cache->active_nodes.rb_node;
+
+	while (_n) {
+		object = rb_entry(_n, struct cachefiles_object, active_node);
+
+		if (object->dentry > victim)
+			_n = _n->rb_left;
+		else if (object->dentry < victim)
+			_n = _n->rb_right;
+		else
+			goto object_in_use;
+	}
+
+	read_unlock(&cache->active_lock);
+
+	//_leave(" = %p", victim);
+	return victim;
+
+object_in_use:
+	read_unlock(&cache->active_lock);
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	dput(victim);
+	//_leave(" = -EBUSY [in use]");
+	return ERR_PTR(-EBUSY);
+
+lookup_error:
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	ret = PTR_ERR(victim);
+	if (ret == -ENOENT) {
+		/* file or dir now absent - probably retired by netfs */
+		_leave(" = -ESTALE [absent]");
+		return ERR_PTR(-ESTALE);
+	}
+
+	if (ret == -EIO) {
+		cachefiles_io_error(cache, "Lookup failed");
+	} else if (ret != -ENOMEM) {
+		pr_err("Internal error: %d\n", ret);
+		ret = -EIO;
+	}
+
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * cull an object if it's not in use
+ * - called only by cache manager daemon
+ */
+int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
+		    char *filename)
+{
+	struct dentry *victim;
+	int ret;
+
+	_enter(",%pd/,%s", dir, filename);
+
+	victim = cachefiles_check_active(cache, dir, filename);
+	if (IS_ERR(victim))
+		return PTR_ERR(victim);
+
+	_debug("victim -> %p %s",
+	       victim, d_backing_inode(victim) ? "positive" : "negative");
+
+	/* okay... the victim is not being used so we can cull it
+	 * - start by marking it as stale
+	 */
+	_debug("victim is cullable");
+
+	ret = cachefiles_remove_object_xattr(cache, victim);
+	if (ret < 0)
+		goto error_unlock;
+
+	/*  actually remove the victim (drops the dir mutex) */
+	_debug("bury");
+
+	ret = cachefiles_bury_object(cache, dir, victim, false);
+	if (ret < 0)
+		goto error;
+
+	dput(victim);
+	_leave(" = 0");
+	return 0;
+
+error_unlock:
+	mutex_unlock(&d_inode(dir)->i_mutex);
+error:
+	dput(victim);
+	if (ret == -ENOENT) {
+		/* file or dir now absent - probably retired by netfs */
+		_leave(" = -ESTALE [absent]");
+		return -ESTALE;
+	}
+
+	if (ret != -ENOMEM) {
+		pr_err("Internal error: %d\n", ret);
+		ret = -EIO;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * find out if an object is in use or not
+ * - called only by cache manager daemon
+ * - returns -EBUSY or 0 to indicate whether an object is in use or not
+ */
+int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
+			    char *filename)
+{
+	struct dentry *victim;
+
+	//_enter(",%pd/,%s",
+	//       dir, filename);
+
+	victim = cachefiles_check_active(cache, dir, filename);
+	if (IS_ERR(victim))
+		return PTR_ERR(victim);
+
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	dput(victim);
+	//_leave(" = 0");
+	return 0;
+}
diff --git a/kernel/fs/cachefiles/proc.c b/kernel/fs/cachefiles/proc.c
new file mode 100644
index 000000000..eccd33941
--- /dev/null
+++ b/kernel/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
+/* CacheFiles statistics
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+atomic_t cachefiles_lookup_histogram[HZ];
+atomic_t cachefiles_mkdir_histogram[HZ];
+atomic_t cachefiles_create_histogram[HZ];
+
+/*
+ * display the latency histogram
+ */
+static int cachefiles_histogram_show(struct seq_file *m, void *v)
+{
+	unsigned long index;
+	unsigned x, y, z, t;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "JIFS  SECS  LOOKUPS   MKDIRS    CREATES\n");
+		return 0;
+	case 2:
+		seq_puts(m, "===== ===== ========= ========= =========\n");
+		return 0;
+	default:
+		index = (unsigned long) v - 3;
+		x = atomic_read(&cachefiles_lookup_histogram[index]);
+		y = atomic_read(&cachefiles_mkdir_histogram[index]);
+		z = atomic_read(&cachefiles_create_histogram[index]);
+		if (x == 0 && y == 0 && z == 0)
+			return 0;
+
+		t = (index * 1000) / HZ;
+
+		seq_printf(m, "%4lu  0.%03u %9u %9u %9u\n", index, t, x, y, z);
+		return 0;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
+{
+	if ((unsigned long long)*_pos >= HZ + 2)
+		return NULL;
+	if (*_pos == 0)
+		*_pos = 1;
+	return (void *)(unsigned long) *_pos;
+}
+
+/*
+ * move to the next line
+ */
+static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return (unsigned long long)*pos > HZ + 2 ?
+		NULL : (void *)(unsigned long) *pos;
+}
+
+/*
+ * clean up after reading
+ */
+static void cachefiles_histogram_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations cachefiles_histogram_ops = {
+	.start		= cachefiles_histogram_start,
+	.stop		= cachefiles_histogram_stop,
+	.next		= cachefiles_histogram_next,
+	.show		= cachefiles_histogram_show,
+};
+
+/*
+ * open "/proc/fs/cachefiles/XXX" which provide statistics summaries
+ */
+static int cachefiles_histogram_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cachefiles_histogram_ops);
+}
+
+static const struct file_operations cachefiles_histogram_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cachefiles_histogram_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * initialise the /proc/fs/cachefiles/ directory
+ */
+int __init cachefiles_proc_init(void)
+{
+	_enter("");
+
+	if (!proc_mkdir("fs/cachefiles", NULL))
+		goto error_dir;
+
+	if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
+			 &cachefiles_histogram_fops))
+		goto error_histogram;
+
+	_leave(" = 0");
+	return 0;
+
+error_histogram:
+	remove_proc_entry("fs/cachefiles", NULL);
+error_dir:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/cachefiles/ directory
+ */
+void cachefiles_proc_cleanup(void)
+{
+	remove_proc_entry("fs/cachefiles/histogram", NULL);
+	remove_proc_entry("fs/cachefiles", NULL);
+}
diff --git a/kernel/fs/cachefiles/rdwr.c b/kernel/fs/cachefiles/rdwr.c
new file mode 100644
index 000000000..3cbb0e834
--- /dev/null
+++ b/kernel/fs/cachefiles/rdwr.c
@@ -0,0 +1,967 @@
+/* Storage object read/write
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include "internal.h"
+
+/*
+ * detect wake up events generated by the unlocking of pages in which we're
+ * interested
+ * - we use this to detect read completion of backing pages
+ * - the caller holds the waitqueue lock
+ */
+static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
+				  int sync, void *_key)
+{
+	struct cachefiles_one_read *monitor =
+		container_of(wait, struct cachefiles_one_read, monitor);
+	struct cachefiles_object *object;
+	struct wait_bit_key *key = _key;
+	struct page *page = wait->private;
+
+	ASSERT(key);
+
+	_enter("{%lu},%u,%d,{%p,%u}",
+	       monitor->netfs_page->index, mode, sync,
+	       key->flags, key->bit_nr);
+
+	if (key->flags != &page->flags ||
+	    key->bit_nr != PG_locked)
+		return 0;
+
+	_debug("--- monitor %p %lx ---", page, page->flags);
+
+	if (!PageUptodate(page) && !PageError(page)) {
+		/* unlocked, not uptodate and not erronous? */
+		_debug("page probably truncated");
+	}
+
+	/* remove from the waitqueue */
+	list_del(&wait->task_list);
+
+	/* move onto the action list and queue for FS-Cache thread pool */
+	ASSERT(monitor->op);
+
+	object = container_of(monitor->op->op.object,
+			      struct cachefiles_object, fscache);
+
+	spin_lock(&object->work_lock);
+	list_add_tail(&monitor->op_link, &monitor->op->to_do);
+	spin_unlock(&object->work_lock);
+
+	fscache_enqueue_retrieval(monitor->op);
+	return 0;
+}
+
+/*
+ * handle a probably truncated page
+ * - check to see if the page is still relevant and reissue the read if
+ *   possible
+ * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we
+ *   must wait again and 0 if successful
+ */
+static int cachefiles_read_reissue(struct cachefiles_object *object,
+				   struct cachefiles_one_read *monitor)
+{
+	struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping;
+	struct page *backpage = monitor->back_page, *backpage2;
+	int ret;
+
+	_enter("{ino=%lx},{%lx,%lx}",
+	       d_backing_inode(object->backer)->i_ino,
+	       backpage->index, backpage->flags);
+
+	/* skip if the page was truncated away completely */
+	if (backpage->mapping != bmapping) {
+		_leave(" = -ENODATA [mapping]");
+		return -ENODATA;
+	}
+
+	backpage2 = find_get_page(bmapping, backpage->index);
+	if (!backpage2) {
+		_leave(" = -ENODATA [gone]");
+		return -ENODATA;
+	}
+
+	if (backpage != backpage2) {
+		put_page(backpage2);
+		_leave(" = -ENODATA [different]");
+		return -ENODATA;
+	}
+
+	/* the page is still there and we already have a ref on it, so we don't
+	 * need a second */
+	put_page(backpage2);
+
+	INIT_LIST_HEAD(&monitor->op_link);
+	add_page_wait_queue(backpage, &monitor->monitor);
+
+	if (trylock_page(backpage)) {
+		ret = -EIO;
+		if (PageError(backpage))
+			goto unlock_discard;
+		ret = 0;
+		if (PageUptodate(backpage))
+			goto unlock_discard;
+
+		_debug("reissue read");
+		ret = bmapping->a_ops->readpage(NULL, backpage);
+		if (ret < 0)
+			goto unlock_discard;
+	}
+
+	/* but the page may have been read before the monitor was installed, so
+	 * the monitor may miss the event - so we have to ensure that we do get
+	 * one in such a case */
+	if (trylock_page(backpage)) {
+		_debug("jumpstart %p {%lx}", backpage, backpage->flags);
+		unlock_page(backpage);
+	}
+
+	/* it'll reappear on the todo list */
+	_leave(" = -EINPROGRESS");
+	return -EINPROGRESS;
+
+unlock_discard:
+	unlock_page(backpage);
+	spin_lock_irq(&object->work_lock);
+	list_del(&monitor->op_link);
+	spin_unlock_irq(&object->work_lock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * copy data from backing pages to netfs pages to complete a read operation
+ * - driven by FS-Cache's thread pool
+ */
+static void cachefiles_read_copier(struct fscache_operation *_op)
+{
+	struct cachefiles_one_read *monitor;
+	struct cachefiles_object *object;
+	struct fscache_retrieval *op;
+	int error, max;
+
+	op = container_of(_op, struct fscache_retrieval, op);
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+
+	_enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino);
+
+	max = 8;
+	spin_lock_irq(&object->work_lock);
+
+	while (!list_empty(&op->to_do)) {
+		monitor = list_entry(op->to_do.next,
+				     struct cachefiles_one_read, op_link);
+		list_del(&monitor->op_link);
+
+		spin_unlock_irq(&object->work_lock);
+
+		_debug("- copy {%lu}", monitor->back_page->index);
+
+	recheck:
+		if (test_bit(FSCACHE_COOKIE_INVALIDATING,
+			     &object->fscache.cookie->flags)) {
+			error = -ESTALE;
+		} else if (PageUptodate(monitor->back_page)) {
+			copy_highpage(monitor->netfs_page, monitor->back_page);
+			fscache_mark_page_cached(monitor->op,
+						 monitor->netfs_page);
+			error = 0;
+		} else if (!PageError(monitor->back_page)) {
+			/* the page has probably been truncated */
+			error = cachefiles_read_reissue(object, monitor);
+			if (error == -EINPROGRESS)
+				goto next;
+			goto recheck;
+		} else {
+			cachefiles_io_error_obj(
+				object,
+				"Readpage failed on backing file %lx",
+				(unsigned long) monitor->back_page->flags);
+			error = -EIO;
+		}
+
+		page_cache_release(monitor->back_page);
+
+		fscache_end_io(op, monitor->netfs_page, error);
+		page_cache_release(monitor->netfs_page);
+		fscache_retrieval_complete(op, 1);
+		fscache_put_retrieval(op);
+		kfree(monitor);
+
+	next:
+		/* let the thread pool have some air occasionally */
+		max--;
+		if (max < 0 || need_resched()) {
+			if (!list_empty(&op->to_do))
+				fscache_enqueue_retrieval(op);
+			_leave(" [maxed out]");
+			return;
+		}
+
+		spin_lock_irq(&object->work_lock);
+	}
+
+	spin_unlock_irq(&object->work_lock);
+	_leave("");
+}
+
+/*
+ * read the corresponding page to the given set from the backing file
+ * - an uncertain page is simply discarded, to be tried again another time
+ */
+static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
+					    struct fscache_retrieval *op,
+					    struct page *netpage)
+{
+	struct cachefiles_one_read *monitor;
+	struct address_space *bmapping;
+	struct page *newpage, *backpage;
+	int ret;
+
+	_enter("");
+
+	_debug("read back %p{%lu,%d}",
+	       netpage, netpage->index, page_count(netpage));
+
+	monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
+	if (!monitor)
+		goto nomem;
+
+	monitor->netfs_page = netpage;
+	monitor->op = fscache_get_retrieval(op);
+
+	init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
+
+	/* attempt to get hold of the backing page */
+	bmapping = d_backing_inode(object->backer)->i_mapping;
+	newpage = NULL;
+
+	for (;;) {
+		backpage = find_get_page(bmapping, netpage->index);
+		if (backpage)
+			goto backing_page_already_present;
+
+		if (!newpage) {
+			newpage = __page_cache_alloc(cachefiles_gfp |
+						     __GFP_COLD);
+			if (!newpage)
+				goto nomem_monitor;
+		}
+
+		ret = add_to_page_cache_lru(newpage, bmapping,
+					    netpage->index, cachefiles_gfp);
+		if (ret == 0)
+			goto installed_new_backing_page;
+		if (ret != -EEXIST)
+			goto nomem_page;
+	}
+
+	/* we've installed a new backing page, so now we need to start
+	 * it reading */
+installed_new_backing_page:
+	_debug("- new %p", newpage);
+
+	backpage = newpage;
+	newpage = NULL;
+
+read_backing_page:
+	ret = bmapping->a_ops->readpage(NULL, backpage);
+	if (ret < 0)
+		goto read_error;
+
+	/* set the monitor to transfer the data across */
+monitor_backing_page:
+	_debug("- monitor add");
+
+	/* install the monitor */
+	page_cache_get(monitor->netfs_page);
+	page_cache_get(backpage);
+	monitor->back_page = backpage;
+	monitor->monitor.private = backpage;
+	add_page_wait_queue(backpage, &monitor->monitor);
+	monitor = NULL;
+
+	/* but the page may have been read before the monitor was installed, so
+	 * the monitor may miss the event - so we have to ensure that we do get
+	 * one in such a case */
+	if (trylock_page(backpage)) {
+		_debug("jumpstart %p {%lx}", backpage, backpage->flags);
+		unlock_page(backpage);
+	}
+	goto success;
+
+	/* if the backing page is already present, it can be in one of
+	 * three states: read in progress, read failed or read okay */
+backing_page_already_present:
+	_debug("- present");
+
+	if (newpage) {
+		page_cache_release(newpage);
+		newpage = NULL;
+	}
+
+	if (PageError(backpage))
+		goto io_error;
+
+	if (PageUptodate(backpage))
+		goto backing_page_already_uptodate;
+
+	if (!trylock_page(backpage))
+		goto monitor_backing_page;
+	_debug("read %p {%lx}", backpage, backpage->flags);
+	goto read_backing_page;
+
+	/* the backing page is already up to date, attach the netfs
+	 * page to the pagecache and LRU and copy the data across */
+backing_page_already_uptodate:
+	_debug("- uptodate");
+
+	fscache_mark_page_cached(op, netpage);
+
+	copy_highpage(netpage, backpage);
+	fscache_end_io(op, netpage, 0);
+	fscache_retrieval_complete(op, 1);
+
+success:
+	_debug("success");
+	ret = 0;
+
+out:
+	if (backpage)
+		page_cache_release(backpage);
+	if (monitor) {
+		fscache_put_retrieval(monitor->op);
+		kfree(monitor);
+	}
+	_leave(" = %d", ret);
+	return ret;
+
+read_error:
+	_debug("read error %d", ret);
+	if (ret == -ENOMEM) {
+		fscache_retrieval_complete(op, 1);
+		goto out;
+	}
+io_error:
+	cachefiles_io_error_obj(object, "Page read error on backing file");
+	fscache_retrieval_complete(op, 1);
+	ret = -ENOBUFS;
+	goto out;
+
+nomem_page:
+	page_cache_release(newpage);
+nomem_monitor:
+	fscache_put_retrieval(monitor->op);
+	kfree(monitor);
+nomem:
+	fscache_retrieval_complete(op, 1);
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - if the page is backed by a block in the cache:
+ *   - a read will be started which will call the callback on completion
+ *   - 0 will be returned
+ * - else if the page is unbacked:
+ *   - the metadata will be retained
+ *   - -ENODATA will be returned
+ */
+int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
+				  struct page *page,
+				  gfp_t gfp)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct inode *inode;
+	sector_t block0, block;
+	unsigned shift;
+	int ret;
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("{%p},{%lx},,,", object, page->index);
+
+	if (!object->backer)
+		goto enobufs;
+
+	inode = d_backing_inode(object->backer);
+	ASSERT(S_ISREG(inode->i_mode));
+	ASSERT(inode->i_mapping->a_ops->bmap);
+	ASSERT(inode->i_mapping->a_ops->readpages);
+
+	/* calculate the shift required to use bmap */
+	if (inode->i_sb->s_blocksize > PAGE_SIZE)
+		goto enobufs;
+
+	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+
+	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+	op->op.flags |= FSCACHE_OP_ASYNC;
+	op->op.processor = cachefiles_read_copier;
+
+	/* we assume the absence or presence of the first block is a good
+	 * enough indication for the page as a whole
+	 * - TODO: don't use bmap() for this as it is _not_ actually good
+	 *   enough for this as it doesn't indicate errors, but it's all we've
+	 *   got for the moment
+	 */
+	block0 = page->index;
+	block0 <<= shift;
+
+	block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
+	_debug("%llx -> %llx",
+	       (unsigned long long) block0,
+	       (unsigned long long) block);
+
+	if (block) {
+		/* submit the apparently valid page to the backing fs to be
+		 * read from disk */
+		ret = cachefiles_read_backing_file_one(object, op, page);
+	} else if (cachefiles_has_space(cache, 0, 1) == 0) {
+		/* there's space in the cache we can use */
+		fscache_mark_page_cached(op, page);
+		fscache_retrieval_complete(op, 1);
+		ret = -ENODATA;
+	} else {
+		goto enobufs;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+enobufs:
+	fscache_retrieval_complete(op, 1);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+
+/*
+ * read the corresponding pages to the given set from the backing file
+ * - any uncertain pages are simply discarded, to be tried again another time
+ */
+static int cachefiles_read_backing_file(struct cachefiles_object *object,
+					struct fscache_retrieval *op,
+					struct list_head *list)
+{
+	struct cachefiles_one_read *monitor = NULL;
+	struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping;
+	struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
+	int ret = 0;
+
+	_enter("");
+
+	list_for_each_entry_safe(netpage, _n, list, lru) {
+		list_del(&netpage->lru);
+
+		_debug("read back %p{%lu,%d}",
+		       netpage, netpage->index, page_count(netpage));
+
+		if (!monitor) {
+			monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
+			if (!monitor)
+				goto nomem;
+
+			monitor->op = fscache_get_retrieval(op);
+			init_waitqueue_func_entry(&monitor->monitor,
+						  cachefiles_read_waiter);
+		}
+
+		for (;;) {
+			backpage = find_get_page(bmapping, netpage->index);
+			if (backpage)
+				goto backing_page_already_present;
+
+			if (!newpage) {
+				newpage = __page_cache_alloc(cachefiles_gfp |
+							     __GFP_COLD);
+				if (!newpage)
+					goto nomem;
+			}
+
+			ret = add_to_page_cache_lru(newpage, bmapping,
+						    netpage->index,
+						    cachefiles_gfp);
+			if (ret == 0)
+				goto installed_new_backing_page;
+			if (ret != -EEXIST)
+				goto nomem;
+		}
+
+		/* we've installed a new backing page, so now we need
+		 * to start it reading */
+	installed_new_backing_page:
+		_debug("- new %p", newpage);
+
+		backpage = newpage;
+		newpage = NULL;
+
+	reread_backing_page:
+		ret = bmapping->a_ops->readpage(NULL, backpage);
+		if (ret < 0)
+			goto read_error;
+
+		/* add the netfs page to the pagecache and LRU, and set the
+		 * monitor to transfer the data across */
+	monitor_backing_page:
+		_debug("- monitor add");
+
+		ret = add_to_page_cache_lru(netpage, op->mapping,
+					    netpage->index, cachefiles_gfp);
+		if (ret < 0) {
+			if (ret == -EEXIST) {
+				page_cache_release(netpage);
+				fscache_retrieval_complete(op, 1);
+				continue;
+			}
+			goto nomem;
+		}
+
+		/* install a monitor */
+		page_cache_get(netpage);
+		monitor->netfs_page = netpage;
+
+		page_cache_get(backpage);
+		monitor->back_page = backpage;
+		monitor->monitor.private = backpage;
+		add_page_wait_queue(backpage, &monitor->monitor);
+		monitor = NULL;
+
+		/* but the page may have been read before the monitor was
+		 * installed, so the monitor may miss the event - so we have to
+		 * ensure that we do get one in such a case */
+		if (trylock_page(backpage)) {
+			_debug("2unlock %p {%lx}", backpage, backpage->flags);
+			unlock_page(backpage);
+		}
+
+		page_cache_release(backpage);
+		backpage = NULL;
+
+		page_cache_release(netpage);
+		netpage = NULL;
+		continue;
+
+		/* if the backing page is already present, it can be in one of
+		 * three states: read in progress, read failed or read okay */
+	backing_page_already_present:
+		_debug("- present %p", backpage);
+
+		if (PageError(backpage))
+			goto io_error;
+
+		if (PageUptodate(backpage))
+			goto backing_page_already_uptodate;
+
+		_debug("- not ready %p{%lx}", backpage, backpage->flags);
+
+		if (!trylock_page(backpage))
+			goto monitor_backing_page;
+
+		if (PageError(backpage)) {
+			_debug("error %lx", backpage->flags);
+			unlock_page(backpage);
+			goto io_error;
+		}
+
+		if (PageUptodate(backpage))
+			goto backing_page_already_uptodate_unlock;
+
+		/* we've locked a page that's neither up to date nor erroneous,
+		 * so we need to attempt to read it again */
+		goto reread_backing_page;
+
+		/* the backing page is already up to date, attach the netfs
+		 * page to the pagecache and LRU and copy the data across */
+	backing_page_already_uptodate_unlock:
+		_debug("uptodate %lx", backpage->flags);
+		unlock_page(backpage);
+	backing_page_already_uptodate:
+		_debug("- uptodate");
+
+		ret = add_to_page_cache_lru(netpage, op->mapping,
+					    netpage->index, cachefiles_gfp);
+		if (ret < 0) {
+			if (ret == -EEXIST) {
+				page_cache_release(netpage);
+				fscache_retrieval_complete(op, 1);
+				continue;
+			}
+			goto nomem;
+		}
+
+		copy_highpage(netpage, backpage);
+
+		page_cache_release(backpage);
+		backpage = NULL;
+
+		fscache_mark_page_cached(op, netpage);
+
+		/* the netpage is unlocked and marked up to date here */
+		fscache_end_io(op, netpage, 0);
+		page_cache_release(netpage);
+		netpage = NULL;
+		fscache_retrieval_complete(op, 1);
+		continue;
+	}
+
+	netpage = NULL;
+
+	_debug("out");
+
+out:
+	/* tidy up */
+	if (newpage)
+		page_cache_release(newpage);
+	if (netpage)
+		page_cache_release(netpage);
+	if (backpage)
+		page_cache_release(backpage);
+	if (monitor) {
+		fscache_put_retrieval(op);
+		kfree(monitor);
+	}
+
+	list_for_each_entry_safe(netpage, _n, list, lru) {
+		list_del(&netpage->lru);
+		page_cache_release(netpage);
+		fscache_retrieval_complete(op, 1);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+nomem:
+	_debug("nomem");
+	ret = -ENOMEM;
+	goto record_page_complete;
+
+read_error:
+	_debug("read error %d", ret);
+	if (ret == -ENOMEM)
+		goto record_page_complete;
+io_error:
+	cachefiles_io_error_obj(object, "Page read error on backing file");
+	ret = -ENOBUFS;
+record_page_complete:
+	fscache_retrieval_complete(op, 1);
+	goto out;
+}
+
+/*
+ * read a list of pages from the cache or allocate blocks in which to store
+ * them
+ */
+int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
+				   struct list_head *pages,
+				   unsigned *nr_pages,
+				   gfp_t gfp)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct list_head backpages;
+	struct pagevec pagevec;
+	struct inode *inode;
+	struct page *page, *_n;
+	unsigned shift, nrbackpages;
+	int ret, ret2, space;
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("{OBJ%x,%d},,%d,,",
+	       object->fscache.debug_id, atomic_read(&op->op.usage),
+	       *nr_pages);
+
+	if (!object->backer)
+		goto all_enobufs;
+
+	space = 1;
+	if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
+		space = 0;
+
+	inode = d_backing_inode(object->backer);
+	ASSERT(S_ISREG(inode->i_mode));
+	ASSERT(inode->i_mapping->a_ops->bmap);
+	ASSERT(inode->i_mapping->a_ops->readpages);
+
+	/* calculate the shift required to use bmap */
+	if (inode->i_sb->s_blocksize > PAGE_SIZE)
+		goto all_enobufs;
+
+	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+
+	pagevec_init(&pagevec, 0);
+
+	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+	op->op.flags |= FSCACHE_OP_ASYNC;
+	op->op.processor = cachefiles_read_copier;
+
+	INIT_LIST_HEAD(&backpages);
+	nrbackpages = 0;
+
+	ret = space ? -ENODATA : -ENOBUFS;
+	list_for_each_entry_safe(page, _n, pages, lru) {
+		sector_t block0, block;
+
+		/* we assume the absence or presence of the first block is a
+		 * good enough indication for the page as a whole
+		 * - TODO: don't use bmap() for this as it is _not_ actually
+		 *   good enough for this as it doesn't indicate errors, but
+		 *   it's all we've got for the moment
+		 */
+		block0 = page->index;
+		block0 <<= shift;
+
+		block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
+						      block0);
+		_debug("%llx -> %llx",
+		       (unsigned long long) block0,
+		       (unsigned long long) block);
+
+		if (block) {
+			/* we have data - add it to the list to give to the
+			 * backing fs */
+			list_move(&page->lru, &backpages);
+			(*nr_pages)--;
+			nrbackpages++;
+		} else if (space && pagevec_add(&pagevec, page) == 0) {
+			fscache_mark_pages_cached(op, &pagevec);
+			fscache_retrieval_complete(op, 1);
+			ret = -ENODATA;
+		} else {
+			fscache_retrieval_complete(op, 1);
+		}
+	}
+
+	if (pagevec_count(&pagevec) > 0)
+		fscache_mark_pages_cached(op, &pagevec);
+
+	if (list_empty(pages))
+		ret = 0;
+
+	/* submit the apparently valid pages to the backing fs to be read from
+	 * disk */
+	if (nrbackpages > 0) {
+		ret2 = cachefiles_read_backing_file(object, op, &backpages);
+		if (ret2 == -ENOMEM || ret2 == -EINTR)
+			ret = ret2;
+	}
+
+	_leave(" = %d [nr=%u%s]",
+	       ret, *nr_pages, list_empty(pages) ? " empty" : "");
+	return ret;
+
+all_enobufs:
+	fscache_retrieval_complete(op, *nr_pages);
+	return -ENOBUFS;
+}
+
+/*
+ * allocate a block in the cache in which to store a page
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - otherwise:
+ *   - the metadata will be retained
+ *   - 0 will be returned
+ */
+int cachefiles_allocate_page(struct fscache_retrieval *op,
+			     struct page *page,
+			     gfp_t gfp)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	int ret;
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("%p,{%lx},", object, page->index);
+
+	ret = cachefiles_has_space(cache, 0, 1);
+	if (ret == 0)
+		fscache_mark_page_cached(op, page);
+	else
+		ret = -ENOBUFS;
+
+	fscache_retrieval_complete(op, 1);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * allocate blocks in the cache in which to store a set of pages
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if some buffers couldn't be made available
+ * - returns -ENOBUFS if some pages are beyond EOF
+ * - otherwise:
+ *   - -ENODATA will be returned
+ * - metadata will be retained for any page marked
+ */
+int cachefiles_allocate_pages(struct fscache_retrieval *op,
+			      struct list_head *pages,
+			      unsigned *nr_pages,
+			      gfp_t gfp)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct pagevec pagevec;
+	struct page *page;
+	int ret;
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("%p,,,%d,", object, *nr_pages);
+
+	ret = cachefiles_has_space(cache, 0, *nr_pages);
+	if (ret == 0) {
+		pagevec_init(&pagevec, 0);
+
+		list_for_each_entry(page, pages, lru) {
+			if (pagevec_add(&pagevec, page) == 0)
+				fscache_mark_pages_cached(op, &pagevec);
+		}
+
+		if (pagevec_count(&pagevec) > 0)
+			fscache_mark_pages_cached(op, &pagevec);
+		ret = -ENODATA;
+	} else {
+		ret = -ENOBUFS;
+	}
+
+	fscache_retrieval_complete(op, *nr_pages);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * request a page be stored in the cache
+ * - cache withdrawal is prevented by the caller
+ * - this request may be ignored if there's no cache block available, in which
+ *   case -ENOBUFS will be returned
+ * - if the op is in progress, 0 will be returned
+ */
+int cachefiles_write_page(struct fscache_storage *op, struct page *page)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct file *file;
+	struct path path;
+	loff_t pos, eof;
+	size_t len;
+	void *data;
+	int ret;
+
+	ASSERT(op != NULL);
+	ASSERT(page != NULL);
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+
+	_enter("%p,%p{%lx},,,", object, page, page->index);
+
+	if (!object->backer) {
+		_leave(" = -ENOBUFS");
+		return -ENOBUFS;
+	}
+
+	ASSERT(d_is_reg(object->backer));
+
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	/* write the page to the backing filesystem and let it store it in its
+	 * own time */
+	path.mnt = cache->mnt;
+	path.dentry = object->backer;
+	file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+	} else {
+		pos = (loff_t) page->index << PAGE_SHIFT;
+
+		/* we mustn't write more data than we have, so we have
+		 * to beware of a partial page at EOF */
+		eof = object->fscache.store_limit_l;
+		len = PAGE_SIZE;
+		if (eof & ~PAGE_MASK) {
+			ASSERTCMP(pos, <, eof);
+			if (eof - pos < PAGE_SIZE) {
+				_debug("cut short %llx to %llx",
+				       pos, eof);
+				len = eof - pos;
+				ASSERTCMP(pos + len, ==, eof);
+			}
+		}
+
+		data = kmap(page);
+		ret = __kernel_write(file, data, len, &pos);
+		kunmap(page);
+		if (ret != len)
+			ret = -EIO;
+		fput(file);
+	}
+
+	if (ret < 0) {
+		if (ret == -EIO)
+			cachefiles_io_error_obj(
+				object, "Write page to backing file failed");
+		ret = -ENOBUFS;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * detach a backing block from a page
+ * - cache withdrawal is prevented by the caller
+ */
+void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("%p,{%lu}", object, page->index);
+
+	spin_unlock(&object->fscache.cookie->lock);
+}
diff --git a/kernel/fs/cachefiles/security.c b/kernel/fs/cachefiles/security.c
new file mode 100644
index 000000000..31bbc0528
--- /dev/null
+++ b/kernel/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
+/* CacheFiles security management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/cred.h>
+#include "internal.h"
+
+/*
+ * determine the security context within which we access the cache from within
+ * the kernel
+ */
+int cachefiles_get_security_ID(struct cachefiles_cache *cache)
+{
+	struct cred *new;
+	int ret;
+
+	_enter("{%s}", cache->secctx);
+
+	new = prepare_kernel_cred(current);
+	if (!new) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	if (cache->secctx) {
+		ret = set_security_override_from_ctx(new, cache->secctx);
+		if (ret < 0) {
+			put_cred(new);
+			pr_err("Security denies permission to nominate security context: error %d\n",
+			       ret);
+			goto error;
+		}
+	}
+
+	cache->cache_cred = new;
+	ret = 0;
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * see if mkdir and create can be performed in the root directory
+ */
+static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
+				      struct dentry *root)
+{
+	int ret;
+
+	ret = security_inode_mkdir(d_backing_inode(root), root, 0);
+	if (ret < 0) {
+		pr_err("Security denies permission to make dirs: error %d",
+		       ret);
+		return ret;
+	}
+
+	ret = security_inode_create(d_backing_inode(root), root, 0);
+	if (ret < 0)
+		pr_err("Security denies permission to create files: error %d",
+		       ret);
+
+	return ret;
+}
+
+/*
+ * check the security details of the on-disk cache
+ * - must be called with security override in force
+ * - must return with a security override in force - even in the case of an
+ *   error
+ */
+int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
+					struct dentry *root,
+					const struct cred **_saved_cred)
+{
+	struct cred *new;
+	int ret;
+
+	_enter("");
+
+	/* duplicate the cache creds for COW (the override is currently in
+	 * force, so we can use prepare_creds() to do this) */
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	cachefiles_end_secure(cache, *_saved_cred);
+
+	/* use the cache root dir's security context as the basis with
+	 * which create files */
+	ret = set_create_files_as(new, d_backing_inode(root));
+	if (ret < 0) {
+		abort_creds(new);
+		cachefiles_begin_secure(cache, _saved_cred);
+		_leave(" = %d [cfa]", ret);
+		return ret;
+	}
+
+	put_cred(cache->cache_cred);
+	cache->cache_cred = new;
+
+	cachefiles_begin_secure(cache, _saved_cred);
+	ret = cachefiles_check_cache_dir(cache, root);
+
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/kernel/fs/cachefiles/xattr.c b/kernel/fs/cachefiles/xattr.c
new file mode 100644
index 000000000..d31c1a72d
--- /dev/null
+++ b/kernel/fs/cachefiles/xattr.c
@@ -0,0 +1,324 @@
+/* CacheFiles extended attribute management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static const char cachefiles_xattr_cache[] =
+	XATTR_USER_PREFIX "CacheFiles.cache";
+
+/*
+ * check the type label on an object
+ * - done using xattrs
+ */
+int cachefiles_check_object_type(struct cachefiles_object *object)
+{
+	struct dentry *dentry = object->dentry;
+	char type[3], xtype[3];
+	int ret;
+
+	ASSERT(dentry);
+	ASSERT(d_backing_inode(dentry));
+
+	if (!object->fscache.cookie)
+		strcpy(type, "C3");
+	else
+		snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
+
+	_enter("%p{%s}", object, type);
+
+	/* attempt to install a type label directly */
+	ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
+			   XATTR_CREATE);
+	if (ret == 0) {
+		_debug("SET"); /* we succeeded */
+		goto error;
+	}
+
+	if (ret != -EEXIST) {
+		pr_err("Can't set xattr on %pd [%lu] (err %d)\n",
+		       dentry, d_backing_inode(dentry)->i_ino,
+		       -ret);
+		goto error;
+	}
+
+	/* read the current type label */
+	ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
+	if (ret < 0) {
+		if (ret == -ERANGE)
+			goto bad_type_length;
+
+		pr_err("Can't read xattr on %pd [%lu] (err %d)\n",
+		       dentry, d_backing_inode(dentry)->i_ino,
+		       -ret);
+		goto error;
+	}
+
+	/* check the type is what we're expecting */
+	if (ret != 2)
+		goto bad_type_length;
+
+	if (xtype[0] != type[0] || xtype[1] != type[1])
+		goto bad_type;
+
+	ret = 0;
+
+error:
+	_leave(" = %d", ret);
+	return ret;
+
+bad_type_length:
+	pr_err("Cache object %lu type xattr length incorrect\n",
+	       d_backing_inode(dentry)->i_ino);
+	ret = -EIO;
+	goto error;
+
+bad_type:
+	xtype[2] = 0;
+	pr_err("Cache object %pd [%lu] type %s not %s\n",
+	       dentry, d_backing_inode(dentry)->i_ino,
+	       xtype, type);
+	ret = -EIO;
+	goto error;
+}
+
+/*
+ * set the state xattr on a cache file
+ */
+int cachefiles_set_object_xattr(struct cachefiles_object *object,
+				struct cachefiles_xattr *auxdata)
+{
+	struct dentry *dentry = object->dentry;
+	int ret;
+
+	ASSERT(dentry);
+
+	_enter("%p,#%d", object, auxdata->len);
+
+	/* attempt to install the cache metadata directly */
+	_debug("SET #%u", auxdata->len);
+
+	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+			   &auxdata->type, auxdata->len,
+			   XATTR_CREATE);
+	if (ret < 0 && ret != -ENOMEM)
+		cachefiles_io_error_obj(
+			object,
+			"Failed to set xattr with error %d", ret);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * update the state xattr on a cache file
+ */
+int cachefiles_update_object_xattr(struct cachefiles_object *object,
+				   struct cachefiles_xattr *auxdata)
+{
+	struct dentry *dentry = object->dentry;
+	int ret;
+
+	ASSERT(dentry);
+
+	_enter("%p,#%d", object, auxdata->len);
+
+	/* attempt to install the cache metadata directly */
+	_debug("SET #%u", auxdata->len);
+
+	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+			   &auxdata->type, auxdata->len,
+			   XATTR_REPLACE);
+	if (ret < 0 && ret != -ENOMEM)
+		cachefiles_io_error_obj(
+			object,
+			"Failed to update xattr with error %d", ret);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * check the consistency between the backing cache and the FS-Cache cookie
+ */
+int cachefiles_check_auxdata(struct cachefiles_object *object)
+{
+	struct cachefiles_xattr *auxbuf;
+	enum fscache_checkaux validity;
+	struct dentry *dentry = object->dentry;
+	ssize_t xlen;
+	int ret;
+
+	ASSERT(dentry);
+	ASSERT(d_backing_inode(dentry));
+	ASSERT(object->fscache.cookie->def->check_aux);
+
+	auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+	if (!auxbuf)
+		return -ENOMEM;
+
+	xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
+			    &auxbuf->type, 512 + 1);
+	ret = -ESTALE;
+	if (xlen < 1 ||
+	    auxbuf->type != object->fscache.cookie->def->type)
+		goto error;
+
+	xlen--;
+	validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen);
+	if (validity != FSCACHE_CHECKAUX_OKAY)
+		goto error;
+
+	ret = 0;
+error:
+	kfree(auxbuf);
+	return ret;
+}
+
+/*
+ * check the state xattr on a cache file
+ * - return -ESTALE if the object should be deleted
+ */
+int cachefiles_check_object_xattr(struct cachefiles_object *object,
+				  struct cachefiles_xattr *auxdata)
+{
+	struct cachefiles_xattr *auxbuf;
+	struct dentry *dentry = object->dentry;
+	int ret;
+
+	_enter("%p,#%d", object, auxdata->len);
+
+	ASSERT(dentry);
+	ASSERT(d_backing_inode(dentry));
+
+	auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
+	if (!auxbuf) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	/* read the current type label */
+	ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
+			   &auxbuf->type, 512 + 1);
+	if (ret < 0) {
+		if (ret == -ENODATA)
+			goto stale; /* no attribute - power went off
+				     * mid-cull? */
+
+		if (ret == -ERANGE)
+			goto bad_type_length;
+
+		cachefiles_io_error_obj(object,
+					"Can't read xattr on %lu (err %d)",
+					d_backing_inode(dentry)->i_ino, -ret);
+		goto error;
+	}
+
+	/* check the on-disk object */
+	if (ret < 1)
+		goto bad_type_length;
+
+	if (auxbuf->type != auxdata->type)
+		goto stale;
+
+	auxbuf->len = ret;
+
+	/* consult the netfs */
+	if (object->fscache.cookie->def->check_aux) {
+		enum fscache_checkaux result;
+		unsigned int dlen;
+
+		dlen = auxbuf->len - 1;
+
+		_debug("checkaux %s #%u",
+		       object->fscache.cookie->def->name, dlen);
+
+		result = fscache_check_aux(&object->fscache,
+					   &auxbuf->data, dlen);
+
+		switch (result) {
+			/* entry okay as is */
+		case FSCACHE_CHECKAUX_OKAY:
+			goto okay;
+
+			/* entry requires update */
+		case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+			break;
+
+			/* entry requires deletion */
+		case FSCACHE_CHECKAUX_OBSOLETE:
+			goto stale;
+
+		default:
+			BUG();
+		}
+
+		/* update the current label */
+		ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+				   &auxdata->type, auxdata->len,
+				   XATTR_REPLACE);
+		if (ret < 0) {
+			cachefiles_io_error_obj(object,
+						"Can't update xattr on %lu"
+						" (error %d)",
+						d_backing_inode(dentry)->i_ino, -ret);
+			goto error;
+		}
+	}
+
+okay:
+	ret = 0;
+
+error:
+	kfree(auxbuf);
+	_leave(" = %d", ret);
+	return ret;
+
+bad_type_length:
+	pr_err("Cache object %lu xattr length incorrect\n",
+	       d_backing_inode(dentry)->i_ino);
+	ret = -EIO;
+	goto error;
+
+stale:
+	ret = -ESTALE;
+	goto error;
+}
+
+/*
+ * remove the object's xattr to mark it stale
+ */
+int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+				   struct dentry *dentry)
+{
+	int ret;
+
+	ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
+	if (ret < 0) {
+		if (ret == -ENOENT || ret == -ENODATA)
+			ret = 0;
+		else if (ret != -ENOMEM)
+			cachefiles_io_error(cache,
+					    "Can't remove xattr from %lu"
+					    " (error %d)",
+					    d_backing_inode(dentry)->i_ino, -ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/kernel/fs/ceph/Kconfig b/kernel/fs/ceph/Kconfig
new file mode 100644
index 000000000..264e9bf83
--- /dev/null
+++ b/kernel/fs/ceph/Kconfig
@@ -0,0 +1,40 @@
+config CEPH_FS
+	tristate "Ceph distributed file system"
+	depends on INET
+	select CEPH_LIB
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	default n
+	help
+	  Choose Y or M here to include support for mounting the
+	  experimental Ceph distributed file system.  Ceph is an extremely
+	  scalable file system designed to provide high performance,
+	  reliable access to petabytes of storage.
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+if CEPH_FS
+config CEPH_FSCACHE
+	bool "Enable Ceph client caching support"
+	depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
+	help
+	  Choose Y here to enable persistent, read-only local
+	  caching support for Ceph clients using FS-Cache
+
+endif
+
+config CEPH_FS_POSIX_ACL
+	bool "Ceph POSIX Access Control Lists"
+	depends on CEPH_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
diff --git a/kernel/fs/ceph/Makefile b/kernel/fs/ceph/Makefile
new file mode 100644
index 000000000..85a4230b9
--- /dev/null
+++ b/kernel/fs/ceph/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
+	export.o caps.o snap.o xattr.o \
+	mds_client.o mdsmap.o strings.o ceph_frag.o \
+	debugfs.o
+
+ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
+ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/kernel/fs/ceph/acl.c b/kernel/fs/ceph/acl.c
new file mode 100644
index 000000000..64fa24834
--- /dev/null
+++ b/kernel/fs/ceph/acl.c
@@ -0,0 +1,263 @@
+/*
+ * linux/fs/ceph/acl.c
+ *
+ * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "super.h"
+
+static inline void ceph_set_cached_acl(struct inode *inode,
+					int type, struct posix_acl *acl)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	spin_lock(&ci->i_ceph_lock);
+	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+		set_cached_acl(inode, type, acl);
+	spin_unlock(&ci->i_ceph_lock);
+}
+
+struct posix_acl *ceph_get_acl(struct inode *inode, int type)
+{
+	int size;
+	const char *name;
+	char *value = NULL;
+	struct posix_acl *acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+
+	size = __ceph_getxattr(inode, name, "", 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = __ceph_getxattr(inode, name, value, size);
+	}
+
+	if (size > 0)
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+	else if (size == -ERANGE || size == -ENODATA || size == 0)
+		acl = NULL;
+	else
+		acl = ERR_PTR(-EIO);
+
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		ceph_set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int ret = 0, size = 0;
+	const char *name = NULL;
+	char *value = NULL;
+	struct iattr newattrs;
+	umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+	struct dentry *dentry;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			ret = posix_acl_equiv_mode(acl, &new_mode);
+			if (ret < 0)
+				goto out;
+			if (ret == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode)) {
+			ret = acl ? -EINVAL : 0;
+			goto out;
+		}
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_NOFS);
+		if (!value) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (ret < 0)
+			goto out_free;
+	}
+
+	dentry = d_find_alias(inode);
+	if (new_mode != old_mode) {
+		newattrs.ia_mode = new_mode;
+		newattrs.ia_valid = ATTR_MODE;
+		ret = ceph_setattr(dentry, &newattrs);
+		if (ret)
+			goto out_dput;
+	}
+
+	ret = __ceph_setxattr(dentry, name, value, size, 0);
+	if (ret) {
+		if (new_mode != old_mode) {
+			newattrs.ia_mode = old_mode;
+			newattrs.ia_valid = ATTR_MODE;
+			ceph_setattr(dentry, &newattrs);
+		}
+		goto out_dput;
+	}
+
+	ceph_set_cached_acl(inode, type, acl);
+
+out_dput:
+	dput(dentry);
+out_free:
+	kfree(value);
+out:
+	return ret;
+}
+
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+		       struct ceph_acls_info *info)
+{
+	struct posix_acl *acl, *default_acl;
+	size_t val_size1 = 0, val_size2 = 0;
+	struct ceph_pagelist *pagelist = NULL;
+	void *tmp_buf = NULL;
+	int err;
+
+	err = posix_acl_create(dir, mode, &default_acl, &acl);
+	if (err)
+		return err;
+
+	if (acl) {
+		int ret = posix_acl_equiv_mode(acl, mode);
+		if (ret < 0)
+			goto out_err;
+		if (ret == 0) {
+			posix_acl_release(acl);
+			acl = NULL;
+		}
+	}
+
+	if (!default_acl && !acl)
+		return 0;
+
+	if (acl)
+		val_size1 = posix_acl_xattr_size(acl->a_count);
+	if (default_acl)
+		val_size2 = posix_acl_xattr_size(default_acl->a_count);
+
+	err = -ENOMEM;
+	tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
+	if (!tmp_buf)
+		goto out_err;
+	pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
+	if (!pagelist)
+		goto out_err;
+	ceph_pagelist_init(pagelist);
+
+	err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
+	if (err)
+		goto out_err;
+
+	ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
+
+	if (acl) {
+		size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
+		err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
+		if (err)
+			goto out_err;
+		ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
+					    len);
+		err = posix_acl_to_xattr(&init_user_ns, acl,
+					 tmp_buf, val_size1);
+		if (err < 0)
+			goto out_err;
+		ceph_pagelist_encode_32(pagelist, val_size1);
+		ceph_pagelist_append(pagelist, tmp_buf, val_size1);
+	}
+	if (default_acl) {
+		size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
+		err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
+		if (err)
+			goto out_err;
+		err = ceph_pagelist_encode_string(pagelist,
+						  POSIX_ACL_XATTR_DEFAULT, len);
+		err = posix_acl_to_xattr(&init_user_ns, default_acl,
+					 tmp_buf, val_size2);
+		if (err < 0)
+			goto out_err;
+		ceph_pagelist_encode_32(pagelist, val_size2);
+		ceph_pagelist_append(pagelist, tmp_buf, val_size2);
+	}
+
+	kfree(tmp_buf);
+
+	info->acl = acl;
+	info->default_acl = default_acl;
+	info->pagelist = pagelist;
+	return 0;
+
+out_err:
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
+	kfree(tmp_buf);
+	if (pagelist)
+		ceph_pagelist_release(pagelist);
+	return err;
+}
+
+void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info)
+{
+	if (!inode)
+		return;
+	ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl);
+	ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl);
+}
+
+void ceph_release_acls_info(struct ceph_acls_info *info)
+{
+	posix_acl_release(info->acl);
+	posix_acl_release(info->default_acl);
+	if (info->pagelist)
+		ceph_pagelist_release(info->pagelist);
+}
diff --git a/kernel/fs/ceph/addr.c b/kernel/fs/ceph/addr.c
new file mode 100644
index 000000000..e162bcd10
--- /dev/null
+++ b/kernel/fs/ceph/addr.c
@@ -0,0 +1,1599 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>	/* generic_writepages */
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+#include <linux/ceph/osd_client.h>
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page.  This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode.  In the absence of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress.  In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_.  Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb)				\
+	(CONGESTION_ON_THRESH(congestion_kb) -				\
+	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+static inline struct ceph_snap_context *page_snap_context(struct page *page)
+{
+	if (PagePrivate(page))
+		return (void *)page->private;
+	return NULL;
+}
+
+/*
+ * Dirty a page.  Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate.  If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_snap_context *snapc;
+	int ret;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	if (PageDirty(page)) {
+		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+		     mapping->host, page, page->index);
+		BUG_ON(!PagePrivate(page));
+		return 0;
+	}
+
+	inode = mapping->host;
+	ci = ceph_inode(inode);
+
+	/*
+	 * Note that we're grabbing a snapc ref here without holding
+	 * any locks!
+	 */
+	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+
+	/* dirty the head */
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->i_head_snapc == NULL)
+		ci->i_head_snapc = ceph_get_snap_context(snapc);
+	++ci->i_wrbuffer_ref_head;
+	if (ci->i_wrbuffer_ref == 0)
+		ihold(inode);
+	++ci->i_wrbuffer_ref;
+	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+	     "snapc %p seq %lld (%d snaps)\n",
+	     mapping->host, page, page->index,
+	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+	     snapc, snapc->seq, snapc->num_snaps);
+	spin_unlock(&ci->i_ceph_lock);
+
+	/*
+	 * Reference snap context in page->private.  Also set
+	 * PagePrivate so that we get invalidatepage callback.
+	 */
+	BUG_ON(PagePrivate(page));
+	page->private = (unsigned long)snapc;
+	SetPagePrivate(page);
+
+	ret = __set_page_dirty_nobuffers(page);
+	WARN_ON(!PageLocked(page));
+	WARN_ON(!page->mapping);
+
+	return ret;
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately.  Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
+{
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_snap_context *snapc = page_snap_context(page);
+
+	inode = page->mapping->host;
+	ci = ceph_inode(inode);
+
+	if (offset != 0 || length != PAGE_CACHE_SIZE) {
+		dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
+		     inode, page, page->index, offset, length);
+		return;
+	}
+
+	ceph_invalidate_fscache_page(inode, page);
+
+	if (!PagePrivate(page))
+		return;
+
+	/*
+	 * We can get non-dirty pages here due to races between
+	 * set_page_dirty and truncate_complete_page; just spit out a
+	 * warning, in case we end up with accounting problems later.
+	 */
+	if (!PageDirty(page))
+		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
+
+	ClearPageChecked(page);
+
+	dout("%p invalidatepage %p idx %lu full dirty page\n",
+	     inode, page, page->index);
+
+	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+	ceph_put_snap_context(snapc);
+	page->private = 0;
+	ClearPagePrivate(page);
+}
+
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+	struct inode *inode = page->mapping ? page->mapping->host : NULL;
+	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+	WARN_ON(PageDirty(page));
+
+	/* Can we release the page from the cache? */
+	if (!ceph_release_fscache_page(page, g))
+		return 0;
+
+	return !PagePrivate(page);
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int readpage_nounlock(struct file *filp, struct page *page)
+{
+	struct inode *inode = file_inode(filp);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc =
+		&ceph_inode_to_client(inode)->client->osdc;
+	int err = 0;
+	u64 off = page_offset(page);
+	u64 len = PAGE_CACHE_SIZE;
+
+	if (off >= i_size_read(inode)) {
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+		return 0;
+	}
+
+	if (ci->i_inline_version != CEPH_INLINE_NONE) {
+		/*
+		 * Uptodate inline data should have been added
+		 * into page cache while getting Fcr caps.
+		 */
+		if (off == 0)
+			return -EINVAL;
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+		return 0;
+	}
+
+	err = ceph_readpage_from_fscache(inode, page);
+	if (err == 0)
+		goto out;
+
+	dout("readpage inode %p file %p page %p index %lu\n",
+	     inode, filp, page, page->index);
+	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+				  off, &len,
+				  ci->i_truncate_seq, ci->i_truncate_size,
+				  &page, 1, 0);
+	if (err == -ENOENT)
+		err = 0;
+	if (err < 0) {
+		SetPageError(page);
+		ceph_fscache_readpage_cancel(inode, page);
+		goto out;
+	}
+	if (err < PAGE_CACHE_SIZE)
+		/* zero fill remainder of page */
+		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+	else
+		flush_dcache_page(page);
+
+	SetPageUptodate(page);
+	ceph_readpage_to_fscache(inode, page);
+
+out:
+	return err < 0 ? err : 0;
+}
+
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+	int r = readpage_nounlock(filp, page);
+	unlock_page(page);
+	return r;
+}
+
+/*
+ * Finish an async read(ahead) op.
+ */
+static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+	struct inode *inode = req->r_inode;
+	struct ceph_osd_data *osd_data;
+	int rc = req->r_result;
+	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int num_pages;
+	int i;
+
+	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+
+	/* unlock all pages, zeroing any data we didn't read */
+	osd_data = osd_req_op_extent_osd_data(req, 0);
+	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+	num_pages = calc_pages_for((u64)osd_data->alignment,
+					(u64)osd_data->length);
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = osd_data->pages[i];
+
+		if (rc < 0)
+			goto unlock;
+		if (bytes < (int)PAGE_CACHE_SIZE) {
+			/* zero (remainder of) page */
+			int s = bytes < 0 ? 0 : bytes;
+			zero_user_segment(page, s, PAGE_CACHE_SIZE);
+		}
+ 		dout("finish_read %p uptodate %p idx %lu\n", inode, page,
+		     page->index);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		ceph_readpage_to_fscache(inode, page);
+unlock:
+		unlock_page(page);
+		page_cache_release(page);
+		bytes -= PAGE_CACHE_SIZE;
+	}
+	kfree(osd_data->pages);
+}
+
+static void ceph_unlock_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		unlock_page(pages[i]);
+}
+
+/*
+ * start an async read(ahead) operation.  return nr_pages we submitted
+ * a read for on success, or negative error code.
+ */
+static int start_read(struct inode *inode, struct list_head *page_list, int max)
+{
+	struct ceph_osd_client *osdc =
+		&ceph_inode_to_client(inode)->client->osdc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct page *page = list_entry(page_list->prev, struct page, lru);
+	struct ceph_vino vino;
+	struct ceph_osd_request *req;
+	u64 off;
+	u64 len;
+	int i;
+	struct page **pages;
+	pgoff_t next_index;
+	int nr_pages = 0;
+	int ret;
+
+	off = (u64) page_offset(page);
+
+	/* count pages */
+	next_index = page->index;
+	list_for_each_entry_reverse(page, page_list, lru) {
+		if (page->index != next_index)
+			break;
+		nr_pages++;
+		next_index++;
+		if (max && nr_pages == max)
+			break;
+	}
+	len = nr_pages << PAGE_CACHE_SHIFT;
+	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
+	     off, len);
+	vino = ceph_vino(inode);
+	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
+				    0, 1, CEPH_OSD_OP_READ,
+				    CEPH_OSD_FLAG_READ, NULL,
+				    ci->i_truncate_seq, ci->i_truncate_size,
+				    false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	/* build page vector */
+	nr_pages = calc_pages_for(0, len);
+	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+	ret = -ENOMEM;
+	if (!pages)
+		goto out;
+	for (i = 0; i < nr_pages; ++i) {
+		page = list_entry(page_list->prev, struct page, lru);
+		BUG_ON(PageLocked(page));
+		list_del(&page->lru);
+
+ 		dout("start_read %p adding %p idx %lu\n", inode, page,
+		     page->index);
+		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
+					  GFP_NOFS)) {
+			ceph_fscache_uncache_page(inode, page);
+			page_cache_release(page);
+			dout("start_read %p add_to_page_cache failed %p\n",
+			     inode, page);
+			nr_pages = i;
+			goto out_pages;
+		}
+		pages[i] = page;
+	}
+	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
+	req->r_callback = finish_read;
+	req->r_inode = inode;
+
+	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+
+	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
+	ret = ceph_osdc_start_request(osdc, req, false);
+	if (ret < 0)
+		goto out_pages;
+	ceph_osdc_put_request(req);
+	return nr_pages;
+
+out_pages:
+	ceph_unlock_page_vector(pages, nr_pages);
+	ceph_release_page_vector(pages, nr_pages);
+out:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+
+
+/*
+ * Read multiple pages.  Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *page_list, unsigned nr_pages)
+{
+	struct inode *inode = file_inode(file);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	int rc = 0;
+	int max = 0;
+
+	if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
+		return -EINVAL;
+
+	rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
+					 &nr_pages);
+
+	if (rc == 0)
+		goto out;
+
+	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+			>> PAGE_SHIFT;
+
+	dout("readpages %p file %p nr_pages %d max %d\n", inode,
+		file, nr_pages,
+	     max);
+	while (!list_empty(page_list)) {
+		rc = start_read(inode, page_list, max);
+		if (rc < 0)
+			goto out;
+		BUG_ON(rc == 0);
+	}
+out:
+	ceph_fscache_readpages_cancel(inode, page_list);
+
+	dout("readpages %p file %p ret %d\n", inode, file, rc);
+	return rc;
+}
+
+/*
+ * Get ref for the oldest snapc for an inode with dirty data... that is, the
+ * only snap context we are allowed to write back.
+ */
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+						    u64 *snap_size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_snap_context *snapc = NULL;
+	struct ceph_cap_snap *capsnap = NULL;
+
+	spin_lock(&ci->i_ceph_lock);
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
+		     capsnap->context, capsnap->dirty_pages);
+		if (capsnap->dirty_pages) {
+			snapc = ceph_get_snap_context(capsnap->context);
+			if (snap_size)
+				*snap_size = capsnap->size;
+			break;
+		}
+	}
+	if (!snapc && ci->i_wrbuffer_ref_head) {
+		snapc = ceph_get_snap_context(ci->i_head_snapc);
+		dout(" head snapc %p has %d dirty pages\n",
+		     snapc, ci->i_wrbuffer_ref_head);
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	return snapc;
+}
+
+/*
+ * Write a single page, but leave the page locked.
+ *
+ * If we get a write error, set the page error bit, but still adjust the
+ * dirty page accounting (i.e., page is no longer dirty).
+ */
+static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_fs_client *fsc;
+	struct ceph_osd_client *osdc;
+	struct ceph_snap_context *snapc, *oldest;
+	loff_t page_off = page_offset(page);
+	long writeback_stat;
+	u64 truncate_size, snap_size = 0;
+	u32 truncate_seq;
+	int err = 0, len = PAGE_CACHE_SIZE;
+
+	dout("writepage %p idx %lu\n", page, page->index);
+
+	if (!page->mapping || !page->mapping->host) {
+		dout("writepage %p - no mapping\n", page);
+		return -EFAULT;
+	}
+	inode = page->mapping->host;
+	ci = ceph_inode(inode);
+	fsc = ceph_inode_to_client(inode);
+	osdc = &fsc->client->osdc;
+
+	/* verify this is a writeable snap context */
+	snapc = page_snap_context(page);
+	if (snapc == NULL) {
+		dout("writepage %p page %p not dirty?\n", inode, page);
+		goto out;
+	}
+	oldest = get_oldest_context(inode, &snap_size);
+	if (snapc->seq > oldest->seq) {
+		dout("writepage %p page %p snapc %p not writeable - noop\n",
+		     inode, page, snapc);
+		/* we should only noop if called by kswapd */
+		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		ceph_put_snap_context(oldest);
+		goto out;
+	}
+	ceph_put_snap_context(oldest);
+
+	spin_lock(&ci->i_ceph_lock);
+	truncate_seq = ci->i_truncate_seq;
+	truncate_size = ci->i_truncate_size;
+	if (!snap_size)
+		snap_size = i_size_read(inode);
+	spin_unlock(&ci->i_ceph_lock);
+
+	/* is this a partial page at end of file? */
+	if (page_off >= snap_size) {
+		dout("%p page eof %llu\n", page, snap_size);
+		goto out;
+	}
+	if (snap_size < page_off + len)
+		len = snap_size - page_off;
+
+	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
+	     inode, page, page->index, page_off, len, snapc);
+
+	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
+	if (writeback_stat >
+	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+
+	ceph_readpage_to_fscache(inode, page);
+
+	set_page_writeback(page);
+	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
+				   &ci->i_layout, snapc,
+				   page_off, len,
+				   truncate_seq, truncate_size,
+				   &inode->i_mtime, &page, 1);
+	if (err < 0) {
+		dout("writepage setting page/mapping error %d %p\n", err, page);
+		SetPageError(page);
+		mapping_set_error(&inode->i_data, err);
+		if (wbc)
+			wbc->pages_skipped++;
+	} else {
+		dout("writepage cleaned page %p\n", page);
+		err = 0;  /* vfs expects us to return 0 */
+	}
+	page->private = 0;
+	ClearPagePrivate(page);
+	end_page_writeback(page);
+	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+	ceph_put_snap_context(snapc);  /* page's reference */
+out:
+	return err;
+}
+
+static int ceph_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int err;
+	struct inode *inode = page->mapping->host;
+	BUG_ON(!inode);
+	ihold(inode);
+	err = writepage_nounlock(page, wbc);
+	unlock_page(page);
+	iput(inode);
+	return err;
+}
+
+
+/*
+ * lame release_pages helper.  release_pages() isn't exported to
+ * modules.
+ */
+static void ceph_release_pages(struct page **pages, int num)
+{
+	struct pagevec pvec;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	for (i = 0; i < num; i++) {
+		if (pagevec_add(&pvec, pages[i]) == 0)
+			pagevec_release(&pvec);
+	}
+	pagevec_release(&pvec);
+}
+
+/*
+ * async writeback completion handler.
+ *
+ * If we get an error, set the mapping error bit, but not the individual
+ * page error bits.
+ */
+static void writepages_finish(struct ceph_osd_request *req,
+			      struct ceph_msg *msg)
+{
+	struct inode *inode = req->r_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_data *osd_data;
+	unsigned wrote;
+	struct page *page;
+	int num_pages;
+	int i;
+	struct ceph_snap_context *snapc = req->r_snapc;
+	struct address_space *mapping = inode->i_mapping;
+	int rc = req->r_result;
+	u64 bytes = req->r_ops[0].extent.length;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	long writeback_stat;
+	unsigned issued = ceph_caps_issued(ci);
+
+	osd_data = osd_req_op_extent_osd_data(req, 0);
+	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+	num_pages = calc_pages_for((u64)osd_data->alignment,
+					(u64)osd_data->length);
+	if (rc >= 0) {
+		/*
+		 * Assume we wrote the pages we originally sent.  The
+		 * osd might reply with fewer pages if our writeback
+		 * raced with a truncation and was adjusted at the osd,
+		 * so don't believe the reply.
+		 */
+		wrote = num_pages;
+	} else {
+		wrote = 0;
+		mapping_set_error(mapping, rc);
+	}
+	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
+	     inode, rc, bytes, wrote);
+
+	/* clean all pages */
+	for (i = 0; i < num_pages; i++) {
+		page = osd_data->pages[i];
+		BUG_ON(!page);
+		WARN_ON(!PageUptodate(page));
+
+		writeback_stat =
+			atomic_long_dec_return(&fsc->writeback_count);
+		if (writeback_stat <
+		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+			clear_bdi_congested(&fsc->backing_dev_info,
+					    BLK_RW_ASYNC);
+
+		ceph_put_snap_context(page_snap_context(page));
+		page->private = 0;
+		ClearPagePrivate(page);
+		dout("unlocking %d %p\n", i, page);
+		end_page_writeback(page);
+
+		/*
+		 * We lost the cache cap, need to truncate the page before
+		 * it is unlocked, otherwise we'd truncate it later in the
+		 * page truncation thread, possibly losing some data that
+		 * raced its way in
+		 */
+		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
+			generic_error_remove_page(inode->i_mapping, page);
+
+		unlock_page(page);
+	}
+	dout("%p wrote+cleaned %d pages\n", inode, wrote);
+	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
+
+	ceph_release_pages(osd_data->pages, num_pages);
+	if (osd_data->pages_from_pool)
+		mempool_free(osd_data->pages,
+			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
+	else
+		kfree(osd_data->pages);
+	ceph_osdc_put_request(req);
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+				 struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_vino vino = ceph_vino(inode);
+	pgoff_t index, start, end;
+	int range_whole = 0;
+	int should_loop = 1;
+	pgoff_t max_pages = 0, max_pages_ever = 0;
+	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
+	struct pagevec pvec;
+	int done = 0;
+	int rc = 0;
+	unsigned wsize = 1 << inode->i_blkbits;
+	struct ceph_osd_request *req = NULL;
+	int do_sync = 0;
+	u64 truncate_size, snap_size;
+	u32 truncate_seq;
+
+	/*
+	 * Include a 'sync' in the OSD request if this is a data
+	 * integrity write (e.g., O_SYNC write or fsync()), or if our
+	 * cap is being revoked.
+	 */
+	if ((wbc->sync_mode == WB_SYNC_ALL) ||
+		ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+		do_sync = 1;
+	dout("writepages_start %p dosync=%d (mode=%s)\n",
+	     inode, do_sync,
+	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+		pr_warn("writepage_start %p on forced umount\n", inode);
+		return -EIO; /* we're in a forced umount, don't write! */
+	}
+	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+		wsize = fsc->mount_options->wsize;
+	if (wsize < PAGE_CACHE_SIZE)
+		wsize = PAGE_CACHE_SIZE;
+	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+
+	pagevec_init(&pvec, 0);
+
+	/* where to start/end? */
+	if (wbc->range_cyclic) {
+		start = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+		dout(" cyclic, start at %lu\n", start);
+	} else {
+		start = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		should_loop = 0;
+		dout(" not cyclic, %lu to %lu\n", start, end);
+	}
+	index = start;
+
+retry:
+	/* find oldest snap context with dirty data */
+	ceph_put_snap_context(snapc);
+	snap_size = 0;
+	snapc = get_oldest_context(inode, &snap_size);
+	if (!snapc) {
+		/* hmm, why does writepages get called when there
+		   is no dirty data? */
+		dout(" no snap context with dirty data?\n");
+		goto out;
+	}
+	if (snap_size == 0)
+		snap_size = i_size_read(inode);
+	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
+	     snapc, snapc->seq, snapc->num_snaps);
+
+	spin_lock(&ci->i_ceph_lock);
+	truncate_seq = ci->i_truncate_seq;
+	truncate_size = ci->i_truncate_size;
+	if (!snap_size)
+		snap_size = i_size_read(inode);
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (last_snapc && snapc != last_snapc) {
+		/* if we switched to a newer snapc, restart our scan at the
+		 * start of the original file range. */
+		dout("  snapc differs from last pass, restarting at %lu\n",
+		     index);
+		index = start;
+	}
+	last_snapc = snapc;
+
+	while (!done && index <= end) {
+		unsigned i;
+		int first;
+		pgoff_t next;
+		int pvec_pages, locked_pages;
+		struct page **pages = NULL;
+		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */
+		struct page *page;
+		int want;
+		u64 offset, len;
+		long writeback_stat;
+
+		next = 0;
+		locked_pages = 0;
+		max_pages = max_pages_ever;
+
+get_more_pages:
+		first = -1;
+		want = min(end - index,
+			   min((pgoff_t)PAGEVEC_SIZE,
+			       max_pages - (pgoff_t)locked_pages) - 1)
+			+ 1;
+		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+						PAGECACHE_TAG_DIRTY,
+						want);
+		dout("pagevec_lookup_tag got %d\n", pvec_pages);
+		if (!pvec_pages && !locked_pages)
+			break;
+		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
+			page = pvec.pages[i];
+			dout("? %p idx %lu\n", page, page->index);
+			if (locked_pages == 0)
+				lock_page(page);  /* first page */
+			else if (!trylock_page(page))
+				break;
+
+			/* only dirty pages, or our accounting breaks */
+			if (unlikely(!PageDirty(page)) ||
+			    unlikely(page->mapping != mapping)) {
+				dout("!dirty or !mapping %p\n", page);
+				unlock_page(page);
+				break;
+			}
+			if (!wbc->range_cyclic && page->index > end) {
+				dout("end of range %p\n", page);
+				done = 1;
+				unlock_page(page);
+				break;
+			}
+			if (next && (page->index != next)) {
+				dout("not consecutive %p\n", page);
+				unlock_page(page);
+				break;
+			}
+			if (wbc->sync_mode != WB_SYNC_NONE) {
+				dout("waiting on writeback %p\n", page);
+				wait_on_page_writeback(page);
+			}
+			if (page_offset(page) >= snap_size) {
+				dout("%p page eof %llu\n", page, snap_size);
+				done = 1;
+				unlock_page(page);
+				break;
+			}
+			if (PageWriteback(page)) {
+				dout("%p under writeback\n", page);
+				unlock_page(page);
+				break;
+			}
+
+			/* only if matching snap context */
+			pgsnapc = page_snap_context(page);
+			if (pgsnapc->seq > snapc->seq) {
+				dout("page snapc %p %lld > oldest %p %lld\n",
+				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
+				unlock_page(page);
+				if (!locked_pages)
+					continue; /* keep looking for snap */
+				break;
+			}
+
+			if (!clear_page_dirty_for_io(page)) {
+				dout("%p !clear_page_dirty_for_io\n", page);
+				unlock_page(page);
+				break;
+			}
+
+			/*
+			 * We have something to write.  If this is
+			 * the first locked page this time through,
+			 * allocate an osd request and a page array
+			 * that it will use.
+			 */
+			if (locked_pages == 0) {
+				BUG_ON(pages);
+				/* prepare async write request */
+				offset = (u64)page_offset(page);
+				len = wsize;
+				req = ceph_osdc_new_request(&fsc->client->osdc,
+							&ci->i_layout, vino,
+							offset, &len, 0,
+							do_sync ? 2 : 1,
+							CEPH_OSD_OP_WRITE,
+							CEPH_OSD_FLAG_WRITE |
+							CEPH_OSD_FLAG_ONDISK,
+							snapc, truncate_seq,
+							truncate_size, true);
+				if (IS_ERR(req)) {
+					rc = PTR_ERR(req);
+					unlock_page(page);
+					break;
+				}
+
+				if (do_sync)
+					osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+
+				req->r_callback = writepages_finish;
+				req->r_inode = inode;
+
+				max_pages = calc_pages_for(0, (u64)len);
+				pages = kmalloc(max_pages * sizeof (*pages),
+						GFP_NOFS);
+				if (!pages) {
+					pool = fsc->wb_pagevec_pool;
+					pages = mempool_alloc(pool, GFP_NOFS);
+					BUG_ON(!pages);
+				}
+			}
+
+			/* note position of first page in pvec */
+			if (first < 0)
+				first = i;
+			dout("%p will write page %p idx %lu\n",
+			     inode, page, page->index);
+
+			writeback_stat =
+			       atomic_long_inc_return(&fsc->writeback_count);
+			if (writeback_stat > CONGESTION_ON_THRESH(
+				    fsc->mount_options->congestion_kb)) {
+				set_bdi_congested(&fsc->backing_dev_info,
+						  BLK_RW_ASYNC);
+			}
+
+			set_page_writeback(page);
+			pages[locked_pages] = page;
+			locked_pages++;
+			next = page->index + 1;
+		}
+
+		/* did we get anything? */
+		if (!locked_pages)
+			goto release_pvec_pages;
+		if (i) {
+			int j;
+			BUG_ON(!locked_pages || first < 0);
+
+			if (pvec_pages && i == pvec_pages &&
+			    locked_pages < max_pages) {
+				dout("reached end pvec, trying for more\n");
+				pagevec_reinit(&pvec);
+				goto get_more_pages;
+			}
+
+			/* shift unused pages over in the pvec...  we
+			 * will need to release them below. */
+			for (j = i; j < pvec_pages; j++) {
+				dout(" pvec leftover page %p\n",
+				     pvec.pages[j]);
+				pvec.pages[j-i+first] = pvec.pages[j];
+			}
+			pvec.nr -= i-first;
+		}
+
+		/* Format the osd request message and submit the write */
+
+		offset = page_offset(pages[0]);
+		len = min(snap_size - offset,
+			  (u64)locked_pages << PAGE_CACHE_SHIFT);
+		dout("writepages got %d pages at %llu~%llu\n",
+		     locked_pages, offset, len);
+
+		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+							!!pool, false);
+
+		pages = NULL;	/* request message now owns the pages array */
+		pool = NULL;
+
+		/* Update the write op length in case we changed it */
+
+		osd_req_op_extent_update(req, 0, len);
+
+		vino = ceph_vino(inode);
+		ceph_osdc_build_request(req, offset, snapc, vino.snap,
+					&inode->i_mtime);
+
+		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
+		BUG_ON(rc);
+		req = NULL;
+
+		/* continue? */
+		index = next;
+		wbc->nr_to_write -= locked_pages;
+		if (wbc->nr_to_write <= 0)
+			done = 1;
+
+release_pvec_pages:
+		dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+		     pvec.nr ? pvec.pages[0] : NULL);
+		pagevec_release(&pvec);
+
+		if (locked_pages && !done)
+			goto retry;
+	}
+
+	if (should_loop && !done) {
+		/* more to do; loop back to beginning of file */
+		dout("writepages looping back to beginning of file\n");
+		should_loop = 0;
+		index = 0;
+		goto retry;
+	}
+
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+
+out:
+	if (req)
+		ceph_osdc_put_request(req);
+	ceph_put_snap_context(snapc);
+	dout("writepages done, rc = %d\n", rc);
+	return rc;
+}
+
+
+
+/*
+ * See if a given @snapc is either writeable, or already written.
+ */
+static int context_is_writeable_or_written(struct inode *inode,
+					   struct ceph_snap_context *snapc)
+{
+	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+	int ret = !oldest || snapc->seq <= oldest->seq;
+
+	ceph_put_snap_context(oldest);
+	return ret;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ *
+ * called with page locked.
+ * return success with page locked,
+ * or any failure (incl -EAGAIN) with page unlocked.
+ */
+static int ceph_update_writeable_page(struct file *file,
+			    loff_t pos, unsigned len,
+			    struct page *page)
+{
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	loff_t page_off = pos & PAGE_CACHE_MASK;
+	int pos_in_page = pos & ~PAGE_CACHE_MASK;
+	int end_in_page = pos_in_page + len;
+	loff_t i_size;
+	int r;
+	struct ceph_snap_context *snapc, *oldest;
+
+retry_locked:
+	/* writepages currently holds page lock, but if we change that later, */
+	wait_on_page_writeback(page);
+
+	/* check snap context */
+	BUG_ON(!ci->i_snap_realm);
+	down_read(&mdsc->snap_rwsem);
+	BUG_ON(!ci->i_snap_realm->cached_context);
+	snapc = page_snap_context(page);
+	if (snapc && snapc != ci->i_head_snapc) {
+		/*
+		 * this page is already dirty in another (older) snap
+		 * context!  is it writeable now?
+		 */
+		oldest = get_oldest_context(inode, NULL);
+		up_read(&mdsc->snap_rwsem);
+
+		if (snapc->seq > oldest->seq) {
+			ceph_put_snap_context(oldest);
+			dout(" page %p snapc %p not current or oldest\n",
+			     page, snapc);
+			/*
+			 * queue for writeback, and wait for snapc to
+			 * be writeable or written
+			 */
+			snapc = ceph_get_snap_context(snapc);
+			unlock_page(page);
+			ceph_queue_writeback(inode);
+			r = wait_event_interruptible(ci->i_cap_wq,
+			       context_is_writeable_or_written(inode, snapc));
+			ceph_put_snap_context(snapc);
+			if (r == -ERESTARTSYS)
+				return r;
+			return -EAGAIN;
+		}
+		ceph_put_snap_context(oldest);
+
+		/* yay, writeable, do it now (without dropping page lock) */
+		dout(" page %p snapc %p not current, but oldest\n",
+		     page, snapc);
+		if (!clear_page_dirty_for_io(page))
+			goto retry_locked;
+		r = writepage_nounlock(page, NULL);
+		if (r < 0)
+			goto fail_nosnap;
+		goto retry_locked;
+	}
+
+	if (PageUptodate(page)) {
+		dout(" page %p already uptodate\n", page);
+		return 0;
+	}
+
+	/* full page? */
+	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+		return 0;
+
+	/* past end of file? */
+	i_size = inode->i_size;   /* caller holds i_mutex */
+
+	if (page_off >= i_size ||
+	    (pos_in_page == 0 && (pos+len) >= i_size &&
+	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+		dout(" zeroing %p 0 - %d and %d - %d\n",
+		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+		zero_user_segments(page,
+				   0, pos_in_page,
+				   end_in_page, PAGE_CACHE_SIZE);
+		return 0;
+	}
+
+	/* we need to read it. */
+	up_read(&mdsc->snap_rwsem);
+	r = readpage_nounlock(file, page);
+	if (r < 0)
+		goto fail_nosnap;
+	goto retry_locked;
+fail_nosnap:
+	unlock_page(page);
+	return r;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	struct inode *inode = file_inode(file);
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	int r;
+
+	do {
+		/* get a page */
+		page = grab_cache_page_write_begin(mapping, index, 0);
+		if (!page)
+			return -ENOMEM;
+		*pagep = page;
+
+		dout("write_begin file %p inode %p page %p %d~%d\n", file,
+		     inode, page, (int)pos, (int)len);
+
+		r = ceph_update_writeable_page(file, pos, len, page);
+		if (r < 0)
+			page_cache_release(page);
+		else
+			*pagep = page;
+	} while (r == -EAGAIN);
+
+	return r;
+}
+
+/*
+ * we don't do anything in here that simple_write_end doesn't do
+ * except adjust dirty page accounting and drop read lock on
+ * mdsc->snap_rwsem.
+ */
+static int ceph_write_end(struct file *file, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{
+	struct inode *inode = file_inode(file);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	int check_cap = 0;
+
+	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
+	     inode, page, (int)pos, (int)copied, (int)len);
+
+	/* zero the stale part of the page if we did a short copy */
+	if (copied < len)
+		zero_user_segment(page, from+copied, len);
+
+	/* did file size increase? */
+	/* (no need for i_size_read(); we caller holds i_mutex */
+	if (pos+copied > inode->i_size)
+		check_cap = ceph_inode_set_size(inode, pos+copied);
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	set_page_dirty(page);
+
+	unlock_page(page);
+	up_read(&mdsc->snap_rwsem);
+	page_cache_release(page);
+
+	if (check_cap)
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+
+	return copied;
+}
+
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ */
+static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter,
+			      loff_t pos)
+{
+	WARN_ON(1);
+	return -EINVAL;
+}
+
+const struct address_space_operations ceph_aops = {
+	.readpage = ceph_readpage,
+	.readpages = ceph_readpages,
+	.writepage = ceph_writepage,
+	.writepages = ceph_writepages_start,
+	.write_begin = ceph_write_begin,
+	.write_end = ceph_write_end,
+	.set_page_dirty = ceph_set_page_dirty,
+	.invalidatepage = ceph_invalidatepage,
+	.releasepage = ceph_releasepage,
+	.direct_IO = ceph_direct_io,
+};
+
+
+/*
+ * vm ops
+ */
+static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_file_info *fi = vma->vm_file->private_data;
+	struct page *pinned_page = NULL;
+	loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
+	int want, got, ret;
+
+	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
+	     inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_CACHE;
+	while (1) {
+		got = 0;
+		ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
+				    -1, &got, &pinned_page);
+		if (ret == 0)
+			break;
+		if (ret != -ERESTARTSYS) {
+			WARN_ON(1);
+			return VM_FAULT_SIGBUS;
+		}
+	}
+	dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
+	     inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
+
+	if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
+	    ci->i_inline_version == CEPH_INLINE_NONE)
+		ret = filemap_fault(vma, vmf);
+	else
+		ret = -EAGAIN;
+
+	dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
+	     inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
+	if (pinned_page)
+		page_cache_release(pinned_page);
+	ceph_put_cap_refs(ci, got);
+
+	if (ret != -EAGAIN)
+		return ret;
+
+	/* read inline data */
+	if (off >= PAGE_CACHE_SIZE) {
+		/* does not support inline data > PAGE_SIZE */
+		ret = VM_FAULT_SIGBUS;
+	} else {
+		int ret1;
+		struct address_space *mapping = inode->i_mapping;
+		struct page *page = find_or_create_page(mapping, 0,
+						mapping_gfp_mask(mapping) &
+						~__GFP_FS);
+		if (!page) {
+			ret = VM_FAULT_OOM;
+			goto out;
+		}
+		ret1 = __ceph_do_getattr(inode, page,
+					 CEPH_STAT_CAP_INLINE_DATA, true);
+		if (ret1 < 0 || off >= i_size_read(inode)) {
+			unlock_page(page);
+			page_cache_release(page);
+			ret = VM_FAULT_SIGBUS;
+			goto out;
+		}
+		if (ret1 < PAGE_CACHE_SIZE)
+			zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
+		else
+			flush_dcache_page(page);
+		SetPageUptodate(page);
+		vmf->page = page;
+		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+	}
+out:
+	dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+	     inode, off, (size_t)PAGE_CACHE_SIZE, ret);
+	return ret;
+}
+
+/*
+ * Reuse write_begin here for simplicity.
+ */
+static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_file_info *fi = vma->vm_file->private_data;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct page *page = vmf->page;
+	loff_t off = page_offset(page);
+	loff_t size = i_size_read(inode);
+	size_t len;
+	int want, got, ret;
+
+	if (ci->i_inline_version != CEPH_INLINE_NONE) {
+		struct page *locked_page = NULL;
+		if (off == 0) {
+			lock_page(page);
+			locked_page = page;
+		}
+		ret = ceph_uninline_data(vma->vm_file, locked_page);
+		if (locked_page)
+			unlock_page(locked_page);
+		if (ret < 0)
+			return VM_FAULT_SIGBUS;
+	}
+
+	if (off + PAGE_CACHE_SIZE <= size)
+		len = PAGE_CACHE_SIZE;
+	else
+		len = size & ~PAGE_CACHE_MASK;
+
+	dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
+	     inode, ceph_vinop(inode), off, len, size);
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_BUFFER;
+	while (1) {
+		got = 0;
+		ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+				    &got, NULL);
+		if (ret == 0)
+			break;
+		if (ret != -ERESTARTSYS) {
+			WARN_ON(1);
+			return VM_FAULT_SIGBUS;
+		}
+	}
+	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
+	     inode, off, len, ceph_cap_string(got));
+
+	/* Update time before taking page lock */
+	file_update_time(vma->vm_file);
+
+	lock_page(page);
+
+	ret = VM_FAULT_NOPAGE;
+	if ((off > size) ||
+	    (page->mapping != inode->i_mapping))
+		goto out;
+
+	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+	if (ret == 0) {
+		/* success.  we'll keep the page locked. */
+		set_page_dirty(page);
+		up_read(&mdsc->snap_rwsem);
+		ret = VM_FAULT_LOCKED;
+	} else {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else
+			ret = VM_FAULT_SIGBUS;
+	}
+out:
+	if (ret != VM_FAULT_LOCKED)
+		unlock_page(page);
+	if (ret == VM_FAULT_LOCKED ||
+	    ci->i_inline_version != CEPH_INLINE_NONE) {
+		int dirty;
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_inline_version = CEPH_INLINE_NONE;
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&ci->i_ceph_lock);
+		if (dirty)
+			__mark_inode_dirty(inode, dirty);
+	}
+
+	dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
+	     inode, off, len, ceph_cap_string(got), ret);
+	ceph_put_cap_refs(ci, got);
+
+	return ret;
+}
+
+void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
+			   char	*data, size_t len)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+
+	if (locked_page) {
+		page = locked_page;
+	} else {
+		if (i_size_read(inode) == 0)
+			return;
+		page = find_or_create_page(mapping, 0,
+					   mapping_gfp_mask(mapping) & ~__GFP_FS);
+		if (!page)
+			return;
+		if (PageUptodate(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+			return;
+		}
+	}
+
+	dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
+	     inode, ceph_vinop(inode), len, locked_page);
+
+	if (len > 0) {
+		void *kaddr = kmap_atomic(page);
+		memcpy(kaddr, data, len);
+		kunmap_atomic(kaddr);
+	}
+
+	if (page != locked_page) {
+		if (len < PAGE_CACHE_SIZE)
+			zero_user_segment(page, len, PAGE_CACHE_SIZE);
+		else
+			flush_dcache_page(page);
+
+		SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+}
+
+int ceph_uninline_data(struct file *filp, struct page *locked_page)
+{
+	struct inode *inode = file_inode(filp);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_osd_request *req;
+	struct page *page = NULL;
+	u64 len, inline_version;
+	int err = 0;
+	bool from_pagecache = false;
+
+	spin_lock(&ci->i_ceph_lock);
+	inline_version = ci->i_inline_version;
+	spin_unlock(&ci->i_ceph_lock);
+
+	dout("uninline_data %p %llx.%llx inline_version %llu\n",
+	     inode, ceph_vinop(inode), inline_version);
+
+	if (inline_version == 1 || /* initial version, no data */
+	    inline_version == CEPH_INLINE_NONE)
+		goto out;
+
+	if (locked_page) {
+		page = locked_page;
+		WARN_ON(!PageUptodate(page));
+	} else if (ceph_caps_issued(ci) &
+		   (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
+		page = find_get_page(inode->i_mapping, 0);
+		if (page) {
+			if (PageUptodate(page)) {
+				from_pagecache = true;
+				lock_page(page);
+			} else {
+				page_cache_release(page);
+				page = NULL;
+			}
+		}
+	}
+
+	if (page) {
+		len = i_size_read(inode);
+		if (len > PAGE_CACHE_SIZE)
+			len = PAGE_CACHE_SIZE;
+	} else {
+		page = __page_cache_alloc(GFP_NOFS);
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = __ceph_do_getattr(inode, page,
+					CEPH_STAT_CAP_INLINE_DATA, true);
+		if (err < 0) {
+			/* no inline data */
+			if (err == -ENODATA)
+				err = 0;
+			goto out;
+		}
+		len = err;
+	}
+
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+				    ceph_vino(inode), 0, &len, 0, 1,
+				    CEPH_OSD_OP_CREATE,
+				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+				    ci->i_snap_realm->cached_context,
+				    0, 0, false);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+
+	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+	if (!err)
+		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	ceph_osdc_put_request(req);
+	if (err < 0)
+		goto out;
+
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+				    ceph_vino(inode), 0, &len, 1, 3,
+				    CEPH_OSD_OP_WRITE,
+				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+				    ci->i_snap_realm->cached_context,
+				    ci->i_truncate_seq, ci->i_truncate_size,
+				    false);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+
+	osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+
+	{
+		__le64 xattr_buf = cpu_to_le64(inline_version);
+		err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
+					    "inline_version", &xattr_buf,
+					    sizeof(xattr_buf),
+					    CEPH_OSD_CMPXATTR_OP_GT,
+					    CEPH_OSD_CMPXATTR_MODE_U64);
+		if (err)
+			goto out_put;
+	}
+
+	{
+		char xattr_buf[32];
+		int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
+					 "%llu", inline_version);
+		err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
+					    "inline_version",
+					    xattr_buf, xattr_len, 0, 0);
+		if (err)
+			goto out_put;
+	}
+
+	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+	if (!err)
+		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+out_put:
+	ceph_osdc_put_request(req);
+	if (err == -ECANCELED)
+		err = 0;
+out:
+	if (page && page != locked_page) {
+		if (from_pagecache) {
+			unlock_page(page);
+			page_cache_release(page);
+		} else
+			__free_pages(page, 0);
+	}
+
+	dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
+	     inode, ceph_vinop(inode), inline_version, err);
+	return err;
+}
+
+static struct vm_operations_struct ceph_vmops = {
+	.fault		= ceph_filemap_fault,
+	.page_mkwrite	= ceph_page_mkwrite,
+};
+
+int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	file_accessed(file);
+	vma->vm_ops = &ceph_vmops;
+	return 0;
+}
diff --git a/kernel/fs/ceph/cache.c b/kernel/fs/ceph/cache.c
new file mode 100644
index 000000000..834f9f372
--- /dev/null
+++ b/kernel/fs/ceph/cache.c
@@ -0,0 +1,402 @@
+/*
+ * Ceph cache definitions.
+ *
+ *  Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ *  Written by Milosz Tanski (milosz@adfin.com)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include "super.h"
+#include "cache.h"
+
+struct ceph_aux_inode {
+	struct timespec	mtime;
+	loff_t          size;
+};
+
+struct fscache_netfs ceph_cache_netfs = {
+	.name		= "ceph",
+	.version	= 0,
+};
+
+static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
+					     void *buffer, uint16_t maxbuf)
+{
+	const struct ceph_fs_client* fsc = cookie_netfs_data;
+	uint16_t klen;
+
+	klen = sizeof(fsc->client->fsid);
+	if (klen > maxbuf)
+		return 0;
+
+	memcpy(buffer, &fsc->client->fsid, klen);
+	return klen;
+}
+
+static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
+	.name		= "CEPH.fsid",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= ceph_fscache_session_get_key,
+};
+
+int ceph_fscache_register(void)
+{
+	return fscache_register_netfs(&ceph_cache_netfs);
+}
+
+void ceph_fscache_unregister(void)
+{
+	fscache_unregister_netfs(&ceph_cache_netfs);
+}
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+	fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
+					      &ceph_fscache_fsid_object_def,
+					      fsc, true);
+
+	if (fsc->fscache == NULL) {
+		pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
+		return 0;
+	}
+
+	fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
+	if (fsc->revalidate_wq == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
+					   void *buffer, uint16_t maxbuf)
+{
+	const struct ceph_inode_info* ci = cookie_netfs_data;
+	uint16_t klen;
+
+	/* use ceph virtual inode (id + snaphot) */
+	klen = sizeof(ci->i_vino);
+	if (klen > maxbuf)
+		return 0;
+
+	memcpy(buffer, &ci->i_vino, klen);
+	return klen;
+}
+
+static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
+					   void *buffer, uint16_t bufmax)
+{
+	struct ceph_aux_inode aux;
+	const struct ceph_inode_info* ci = cookie_netfs_data;
+	const struct inode* inode = &ci->vfs_inode;
+
+	memset(&aux, 0, sizeof(aux));
+	aux.mtime = inode->i_mtime;
+	aux.size = inode->i_size;
+
+	memcpy(buffer, &aux, sizeof(aux));
+
+	return sizeof(aux);
+}
+
+static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
+					uint64_t *size)
+{
+	const struct ceph_inode_info* ci = cookie_netfs_data;
+	const struct inode* inode = &ci->vfs_inode;
+
+	*size = inode->i_size;
+}
+
+static enum fscache_checkaux ceph_fscache_inode_check_aux(
+	void *cookie_netfs_data, const void *data, uint16_t dlen)
+{
+	struct ceph_aux_inode aux;
+	struct ceph_inode_info* ci = cookie_netfs_data;
+	struct inode* inode = &ci->vfs_inode;
+
+	if (dlen != sizeof(aux))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	memset(&aux, 0, sizeof(aux));
+	aux.mtime = inode->i_mtime;
+	aux.size = inode->i_size;
+
+	if (memcmp(data, &aux, sizeof(aux)) != 0)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	dout("ceph inode 0x%p cached okay", ci);
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
+{
+	struct ceph_inode_info* ci = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	dout("ceph inode 0x%p now uncached", ci);
+
+	while (1) {
+		nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
+	.name		= "CEPH.inode",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= ceph_fscache_inode_get_key,
+	.get_attr	= ceph_fscache_inode_get_attr,
+	.get_aux	= ceph_fscache_inode_get_aux,
+	.check_aux	= ceph_fscache_inode_check_aux,
+	.now_uncached	= ceph_fscache_inode_now_uncached,
+};
+
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
+					struct ceph_inode_info* ci)
+{
+	struct inode* inode = &ci->vfs_inode;
+
+	/* No caching for filesystem */
+	if (fsc->fscache == NULL)
+		return;
+
+	/* Only cache for regular files that are read only */
+	if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
+		return;
+
+	/* Avoid multiple racing open requests */
+	mutex_lock(&inode->i_mutex);
+
+	if (ci->fscache)
+		goto done;
+
+	ci->fscache = fscache_acquire_cookie(fsc->fscache,
+					     &ceph_fscache_inode_object_def,
+					     ci, true);
+	fscache_check_consistency(ci->fscache);
+done:
+	mutex_unlock(&inode->i_mutex);
+
+}
+
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+	struct fscache_cookie* cookie;
+
+	if ((cookie = ci->fscache) == NULL)
+		return;
+
+	ci->fscache = NULL;
+
+	fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
+	fscache_relinquish_cookie(cookie, 0);
+}
+
+static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
+{
+	if (!error)
+		SetPageUptodate(page);
+}
+
+static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
+{
+	if (!error)
+		SetPageUptodate(page);
+
+	unlock_page(page);
+}
+
+static inline int cache_valid(struct ceph_inode_info *ci)
+{
+	return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
+		(ci->i_fscache_gen == ci->i_rdcache_gen));
+}
+
+
+/* Atempt to read from the fscache,
+ *
+ * This function is called from the readpage_nounlock context. DO NOT attempt to
+ * unlock the page here (or in the callback).
+ */
+int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret;
+
+	if (!cache_valid(ci))
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_page(ci->fscache, page,
+					 ceph_vfs_readpage_complete, NULL,
+					 GFP_KERNEL);
+
+	switch (ret) {
+		case 0: /* Page found */
+			dout("page read submitted\n");
+			return 0;
+		case -ENOBUFS: /* Pages were not found, and can't be */
+		case -ENODATA: /* Pages were not found */
+			dout("page/inode not in cache\n");
+			return ret;
+		default:
+			dout("%s: unknown error ret = %i\n", __func__, ret);
+			return ret;
+	}
+}
+
+int ceph_readpages_from_fscache(struct inode *inode,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned *nr_pages)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret;
+
+	if (!cache_valid(ci))
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
+					  ceph_vfs_readpage_complete_unlock,
+					  NULL, mapping_gfp_mask(mapping));
+
+	switch (ret) {
+		case 0: /* All pages found */
+			dout("all-page read submitted\n");
+			return 0;
+		case -ENOBUFS: /* Some pages were not found, and can't be */
+		case -ENODATA: /* some pages were not found */
+			dout("page/inode not in cache\n");
+			return ret;
+		default:
+			dout("%s: unknown error ret = %i\n", __func__, ret);
+			return ret;
+	}
+}
+
+void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret;
+
+	if (!PageFsCache(page))
+		return;
+
+	if (!cache_valid(ci))
+		return;
+
+	ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
+	if (ret)
+		 fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	if (!PageFsCache(page))
+		return;
+
+	fscache_wait_on_page_write(ci->fscache, page);
+	fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+	if (fsc->revalidate_wq)
+		destroy_workqueue(fsc->revalidate_wq);
+
+	fscache_relinquish_cookie(fsc->fscache, 0);
+	fsc->fscache = NULL;
+}
+
+static void ceph_revalidate_work(struct work_struct *work)
+{
+	int issued;
+	u32 orig_gen;
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_revalidate_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	spin_lock(&ci->i_ceph_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	orig_gen = ci->i_rdcache_gen;
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (!(issued & CEPH_CAP_FILE_CACHE)) {
+		dout("revalidate_work lost cache before validation %p\n",
+		     inode);
+		goto out;
+	}
+
+	if (!fscache_check_consistency(ci->fscache))
+		fscache_invalidate(ci->fscache);
+
+	spin_lock(&ci->i_ceph_lock);
+	/* Update the new valid generation (backwards sanity check too) */
+	if (orig_gen > ci->i_fscache_gen) {
+		ci->i_fscache_gen = orig_gen;
+	}
+	spin_unlock(&ci->i_ceph_lock);
+
+out:
+	iput(&ci->vfs_inode);
+}
+
+void ceph_queue_revalidate(struct inode *inode)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
+		return;
+
+	ihold(inode);
+
+	if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
+		       &ci->i_revalidate_work)) {
+		dout("ceph_queue_revalidate %p\n", inode);
+	} else {
+		dout("ceph_queue_revalidate %p failed\n)", inode);
+		iput(inode);
+	}
+}
+
+void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+	ci->fscache = NULL;
+	/* The first load is verifed cookie open time */
+	ci->i_fscache_gen = 1;
+	INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
+}
diff --git a/kernel/fs/ceph/cache.h b/kernel/fs/ceph/cache.h
new file mode 100644
index 000000000..5ac591bd0
--- /dev/null
+++ b/kernel/fs/ceph/cache.h
@@ -0,0 +1,182 @@
+/*
+ * Ceph cache definitions.
+ *
+ *  Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ *  Written by Milosz Tanski (milosz@adfin.com)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#ifndef _CEPH_CACHE_H
+#define _CEPH_CACHE_H
+
+#ifdef CONFIG_CEPH_FSCACHE
+
+extern struct fscache_netfs ceph_cache_netfs;
+
+int ceph_fscache_register(void);
+void ceph_fscache_unregister(void);
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
+
+void ceph_fscache_inode_init(struct ceph_inode_info *ci);
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
+					struct ceph_inode_info* ci);
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+
+int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
+int ceph_readpages_from_fscache(struct inode *inode,
+				struct address_space *mapping,
+				struct list_head *pages,
+				unsigned *nr_pages);
+void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
+void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
+void ceph_queue_revalidate(struct inode *inode);
+
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	fscache_attr_changed(ci->fscache);
+}
+
+static inline void ceph_fscache_invalidate(struct inode *inode)
+{
+	fscache_invalidate(ceph_inode(inode)->fscache);
+}
+
+static inline void ceph_fscache_uncache_page(struct inode *inode,
+					     struct page *page)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return fscache_uncache_page(ci->fscache, page);
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+	struct inode* inode = page->mapping->host;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return fscache_maybe_release_page(ci->fscache, page, gfp);
+}
+
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+						struct page *page)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
+		__fscache_uncache_page(ci->fscache, page);
+}
+
+static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+						 struct list_head *pages)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return fscache_readpages_cancel(ci->fscache, pages);
+}
+
+#else
+
+static inline int ceph_fscache_register(void)
+{
+	return 0;
+}
+
+static inline void ceph_fscache_unregister(void)
+{
+}
+
+static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+	return 0;
+}
+
+static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+}
+
+static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+}
+
+static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
+						      struct ceph_inode_info* ci)
+{
+}
+
+static inline void ceph_fscache_uncache_page(struct inode *inode,
+					     struct page *pages)
+{
+}
+
+static inline int ceph_readpage_from_fscache(struct inode* inode,
+					     struct page *page)
+{
+	return -ENOBUFS;
+}
+
+static inline int ceph_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+
+static inline void ceph_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{
+}
+
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
+static inline void ceph_fscache_invalidate(struct inode *inode)
+{
+}
+
+static inline void ceph_invalidate_fscache_page(struct inode *inode,
+						struct page *page)
+{
+}
+
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+	return 1;
+}
+
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+						struct page *page)
+{
+}
+
+static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+						 struct list_head *pages)
+{
+}
+
+static inline void ceph_queue_revalidate(struct inode *inode)
+{
+}
+
+#endif
+
+#endif
diff --git a/kernel/fs/ceph/caps.c b/kernel/fs/ceph/caps.c
new file mode 100644
index 000000000..be5ea6af8
--- /dev/null
+++ b/kernel/fs/ceph/caps.c
@@ -0,0 +1,3456 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode field and file data to OSDs
+ * (storage nodes).  Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server.  When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
+static char *gcap_string(char *s, int c)
+{
+	if (c & CEPH_CAP_GSHARED)
+		*s++ = 's';
+	if (c & CEPH_CAP_GEXCL)
+		*s++ = 'x';
+	if (c & CEPH_CAP_GCACHE)
+		*s++ = 'c';
+	if (c & CEPH_CAP_GRD)
+		*s++ = 'r';
+	if (c & CEPH_CAP_GWR)
+		*s++ = 'w';
+	if (c & CEPH_CAP_GBUFFER)
+		*s++ = 'b';
+	if (c & CEPH_CAP_GLAZYIO)
+		*s++ = 'l';
+	return s;
+}
+
+const char *ceph_cap_string(int caps)
+{
+	int i;
+	char *s;
+	int c;
+
+	spin_lock(&cap_str_lock);
+	i = last_cap_str++;
+	if (last_cap_str == MAX_CAP_STR)
+		last_cap_str = 0;
+	spin_unlock(&cap_str_lock);
+
+	s = cap_str[i];
+
+	if (caps & CEPH_CAP_PIN)
+		*s++ = 'p';
+
+	c = (caps >> CEPH_CAP_SAUTH) & 3;
+	if (c) {
+		*s++ = 'A';
+		s = gcap_string(s, c);
+	}
+
+	c = (caps >> CEPH_CAP_SLINK) & 3;
+	if (c) {
+		*s++ = 'L';
+		s = gcap_string(s, c);
+	}
+
+	c = (caps >> CEPH_CAP_SXATTR) & 3;
+	if (c) {
+		*s++ = 'X';
+		s = gcap_string(s, c);
+	}
+
+	c = caps >> CEPH_CAP_SFILE;
+	if (c) {
+		*s++ = 'F';
+		s = gcap_string(s, c);
+	}
+
+	if (s == cap_str[i])
+		*s++ = '-';
+	*s = 0;
+	return cap_str[i];
+}
+
+void ceph_caps_init(struct ceph_mds_client *mdsc)
+{
+	INIT_LIST_HEAD(&mdsc->caps_list);
+	spin_lock_init(&mdsc->caps_list_lock);
+}
+
+void ceph_caps_finalize(struct ceph_mds_client *mdsc)
+{
+	struct ceph_cap *cap;
+
+	spin_lock(&mdsc->caps_list_lock);
+	while (!list_empty(&mdsc->caps_list)) {
+		cap = list_first_entry(&mdsc->caps_list,
+				       struct ceph_cap, caps_item);
+		list_del(&cap->caps_item);
+		kmem_cache_free(ceph_cap_cachep, cap);
+	}
+	mdsc->caps_total_count = 0;
+	mdsc->caps_avail_count = 0;
+	mdsc->caps_use_count = 0;
+	mdsc->caps_reserve_count = 0;
+	mdsc->caps_min_count = 0;
+	spin_unlock(&mdsc->caps_list_lock);
+}
+
+void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+{
+	spin_lock(&mdsc->caps_list_lock);
+	mdsc->caps_min_count += delta;
+	BUG_ON(mdsc->caps_min_count < 0);
+	spin_unlock(&mdsc->caps_list_lock);
+}
+
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
+		      struct ceph_cap_reservation *ctx, int need)
+{
+	int i;
+	struct ceph_cap *cap;
+	int have;
+	int alloc = 0;
+	LIST_HEAD(newcaps);
+
+	dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+	/* first reserve any caps that are already allocated */
+	spin_lock(&mdsc->caps_list_lock);
+	if (mdsc->caps_avail_count >= need)
+		have = need;
+	else
+		have = mdsc->caps_avail_count;
+	mdsc->caps_avail_count -= have;
+	mdsc->caps_reserve_count += have;
+	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+					 mdsc->caps_reserve_count +
+					 mdsc->caps_avail_count);
+	spin_unlock(&mdsc->caps_list_lock);
+
+	for (i = have; i < need; i++) {
+		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+		if (!cap)
+			break;
+		list_add(&cap->caps_item, &newcaps);
+		alloc++;
+	}
+	/* we didn't manage to reserve as much as we needed */
+	if (have + alloc != need)
+		pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+			ctx, need, have + alloc);
+
+	spin_lock(&mdsc->caps_list_lock);
+	mdsc->caps_total_count += alloc;
+	mdsc->caps_reserve_count += alloc;
+	list_splice(&newcaps, &mdsc->caps_list);
+
+	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+					 mdsc->caps_reserve_count +
+					 mdsc->caps_avail_count);
+	spin_unlock(&mdsc->caps_list_lock);
+
+	ctx->count = need;
+	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+	     ctx, mdsc->caps_total_count, mdsc->caps_use_count,
+	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
+}
+
+int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+			struct ceph_cap_reservation *ctx)
+{
+	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+	if (ctx->count) {
+		spin_lock(&mdsc->caps_list_lock);
+		BUG_ON(mdsc->caps_reserve_count < ctx->count);
+		mdsc->caps_reserve_count -= ctx->count;
+		mdsc->caps_avail_count += ctx->count;
+		ctx->count = 0;
+		dout("unreserve caps %d = %d used + %d resv + %d avail\n",
+		     mdsc->caps_total_count, mdsc->caps_use_count,
+		     mdsc->caps_reserve_count, mdsc->caps_avail_count);
+		BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+						 mdsc->caps_reserve_count +
+						 mdsc->caps_avail_count);
+		spin_unlock(&mdsc->caps_list_lock);
+	}
+	return 0;
+}
+
+struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+			      struct ceph_cap_reservation *ctx)
+{
+	struct ceph_cap *cap = NULL;
+
+	/* temporary, until we do something about cap import/export */
+	if (!ctx) {
+		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+		if (cap) {
+			spin_lock(&mdsc->caps_list_lock);
+			mdsc->caps_use_count++;
+			mdsc->caps_total_count++;
+			spin_unlock(&mdsc->caps_list_lock);
+		}
+		return cap;
+	}
+
+	spin_lock(&mdsc->caps_list_lock);
+	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+	     ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
+	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
+	BUG_ON(!ctx->count);
+	BUG_ON(ctx->count > mdsc->caps_reserve_count);
+	BUG_ON(list_empty(&mdsc->caps_list));
+
+	ctx->count--;
+	mdsc->caps_reserve_count--;
+	mdsc->caps_use_count++;
+
+	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
+	list_del(&cap->caps_item);
+
+	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
+	spin_unlock(&mdsc->caps_list_lock);
+	return cap;
+}
+
+void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
+{
+	spin_lock(&mdsc->caps_list_lock);
+	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
+	     cap, mdsc->caps_total_count, mdsc->caps_use_count,
+	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
+	mdsc->caps_use_count--;
+	/*
+	 * Keep some preallocated caps around (ceph_min_count), to
+	 * avoid lots of free/alloc churn.
+	 */
+	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
+				      mdsc->caps_min_count) {
+		mdsc->caps_total_count--;
+		kmem_cache_free(ceph_cap_cachep, cap);
+	} else {
+		mdsc->caps_avail_count++;
+		list_add(&cap->caps_item, &mdsc->caps_list);
+	}
+
+	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
+	spin_unlock(&mdsc->caps_list_lock);
+}
+
+void ceph_reservation_status(struct ceph_fs_client *fsc,
+			     int *total, int *avail, int *used, int *reserved,
+			     int *min)
+{
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	if (total)
+		*total = mdsc->caps_total_count;
+	if (avail)
+		*avail = mdsc->caps_avail_count;
+	if (used)
+		*used = mdsc->caps_use_count;
+	if (reserved)
+		*reserved = mdsc->caps_reserve_count;
+	if (min)
+		*min = mdsc->caps_min_count;
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_ceph_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+	struct ceph_cap *cap;
+	struct rb_node *n = ci->i_caps.rb_node;
+
+	while (n) {
+		cap = rb_entry(n, struct ceph_cap, ci_node);
+		if (mds < cap->mds)
+			n = n->rb_left;
+		else if (mds > cap->mds)
+			n = n->rb_right;
+		else
+			return cap;
+	}
+	return NULL;
+}
+
+struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+	struct ceph_cap *cap;
+
+	spin_lock(&ci->i_ceph_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	spin_unlock(&ci->i_ceph_lock);
+	return cap;
+}
+
+/*
+ * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
+{
+	struct ceph_cap *cap;
+	int mds = -1;
+	struct rb_node *p;
+
+	/* prefer mds with WR|BUFFER|EXCL caps */
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		mds = cap->mds;
+		if (cap->issued & (CEPH_CAP_FILE_WR |
+				   CEPH_CAP_FILE_BUFFER |
+				   CEPH_CAP_FILE_EXCL))
+			break;
+	}
+	return mds;
+}
+
+int ceph_get_cap_mds(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds;
+	spin_lock(&ci->i_ceph_lock);
+	mds = __ceph_get_cap_mds(ceph_inode(inode));
+	spin_unlock(&ci->i_ceph_lock);
+	return mds;
+}
+
+/*
+ * Called under i_ceph_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+			      struct ceph_cap *new)
+{
+	struct rb_node **p = &ci->i_caps.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_cap *cap = NULL;
+
+	while (*p) {
+		parent = *p;
+		cap = rb_entry(parent, struct ceph_cap, ci_node);
+		if (new->mds < cap->mds)
+			p = &(*p)->rb_left;
+		else if (new->mds > cap->mds)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->ci_node, parent, p);
+	rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS.  Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+			       struct ceph_inode_info *ci)
+{
+	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+
+	ci->i_hold_caps_min = round_jiffies(jiffies +
+					    ma->caps_wanted_delay_min * HZ);
+	ci->i_hold_caps_max = round_jiffies(jiffies +
+					    ma->caps_wanted_delay_max * HZ);
+	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_ceph_lock
+ *    -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+				struct ceph_inode_info *ci)
+{
+	__cap_set_timeouts(mdsc, ci);
+	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+	     ci->i_ceph_flags, ci->i_hold_caps_max);
+	if (!mdsc->stopping) {
+		spin_lock(&mdsc->cap_delay_lock);
+		if (!list_empty(&ci->i_cap_delay_list)) {
+			if (ci->i_ceph_flags & CEPH_I_FLUSH)
+				goto no_change;
+			list_del_init(&ci->i_cap_delay_list);
+		}
+		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+		spin_unlock(&mdsc->cap_delay_lock);
+	}
+}
+
+/*
+ * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+				      struct ceph_inode_info *ci)
+{
+	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+	spin_lock(&mdsc->cap_delay_lock);
+	ci->i_ceph_flags |= CEPH_I_FLUSH;
+	if (!list_empty(&ci->i_cap_delay_list))
+		list_del_init(&ci->i_cap_delay_list);
+	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_ceph_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+			       struct ceph_inode_info *ci)
+{
+	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+	if (list_empty(&ci->i_cap_delay_list))
+		return;
+	spin_lock(&mdsc->cap_delay_lock);
+	list_del_init(&ci->i_cap_delay_list);
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+			      unsigned issued)
+{
+	unsigned had = __ceph_caps_issued(ci, NULL);
+
+	/*
+	 * Each time we receive FILE_CACHE anew, we increment
+	 * i_rdcache_gen.
+	 */
+	if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
+		ci->i_rdcache_gen++;
+	}
+
+	/*
+	 * if we are newly issued FILE_SHARED, mark dir not complete; we
+	 * don't know what happened to this directory while we didn't
+	 * have the cap.
+	 */
+	if ((issued & CEPH_CAP_FILE_SHARED) &&
+	    (had & CEPH_CAP_FILE_SHARED) == 0) {
+		ci->i_shared_gen++;
+		if (S_ISDIR(ci->vfs_inode.i_mode)) {
+			dout(" marking %p NOT complete\n", &ci->vfs_inode);
+			__ceph_dir_clear_complete(ci);
+		}
+	}
+}
+
+/*
+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0.  (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+void ceph_add_cap(struct inode *inode,
+		  struct ceph_mds_session *session, u64 cap_id,
+		  int fmode, unsigned issued, unsigned wanted,
+		  unsigned seq, unsigned mseq, u64 realmino, int flags,
+		  struct ceph_cap **new_cap)
+{
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap;
+	int mds = session->s_mds;
+	int actual_wanted;
+
+	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+	/*
+	 * If we are opening the file, include file mode wanted bits
+	 * in wanted.
+	 */
+	if (fmode >= 0)
+		wanted |= ceph_caps_for_mode(fmode);
+
+	cap = __get_cap_for_mds(ci, mds);
+	if (!cap) {
+		cap = *new_cap;
+		*new_cap = NULL;
+
+		cap->issued = 0;
+		cap->implemented = 0;
+		cap->mds = mds;
+		cap->mds_wanted = 0;
+		cap->mseq = 0;
+
+		cap->ci = ci;
+		__insert_cap_node(ci, cap);
+
+		/* add to session cap list */
+		cap->session = session;
+		spin_lock(&session->s_cap_lock);
+		list_add_tail(&cap->session_caps, &session->s_caps);
+		session->s_nr_caps++;
+		spin_unlock(&session->s_cap_lock);
+	} else {
+		/*
+		 * auth mds of the inode changed. we received the cap export
+		 * message, but still haven't received the cap import message.
+		 * handle_cap_export() updated the new auth MDS' cap.
+		 *
+		 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
+		 * a message that was send before the cap import message. So
+		 * don't remove caps.
+		 */
+		if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+			WARN_ON(cap != ci->i_auth_cap);
+			WARN_ON(cap->cap_id != cap_id);
+			seq = cap->seq;
+			mseq = cap->mseq;
+			issued |= cap->issued;
+			flags |= CEPH_CAP_FLAG_AUTH;
+		}
+	}
+
+	if (!ci->i_snap_realm) {
+		/*
+		 * add this inode to the appropriate snap realm
+		 */
+		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+							       realmino);
+		if (realm) {
+			spin_lock(&realm->inodes_with_caps_lock);
+			ci->i_snap_realm = realm;
+			list_add(&ci->i_snap_realm_item,
+				 &realm->inodes_with_caps);
+			spin_unlock(&realm->inodes_with_caps_lock);
+		} else {
+			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+			       realmino);
+			WARN_ON(!realm);
+		}
+	}
+
+	__check_cap_issue(ci, cap, issued);
+
+	/*
+	 * If we are issued caps we don't want, or the mds' wanted
+	 * value appears to be off, queue a check so we'll release
+	 * later and/or update the mds wanted value.
+	 */
+	actual_wanted = __ceph_caps_wanted(ci);
+	if ((wanted & ~actual_wanted) ||
+	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+		     ceph_cap_string(issued), ceph_cap_string(wanted),
+		     ceph_cap_string(actual_wanted));
+		__cap_delay_requeue(mdsc, ci);
+	}
+
+	if (flags & CEPH_CAP_FLAG_AUTH) {
+		if (ci->i_auth_cap == NULL ||
+		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
+			ci->i_auth_cap = cap;
+			cap->mds_wanted = wanted;
+		}
+	} else {
+		WARN_ON(ci->i_auth_cap == cap);
+	}
+
+	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+	     ceph_cap_string(issued|cap->issued), seq, mds);
+	cap->cap_id = cap_id;
+	cap->issued = issued;
+	cap->implemented |= issued;
+	if (ceph_seq_cmp(mseq, cap->mseq) > 0)
+		cap->mds_wanted = wanted;
+	else
+		cap->mds_wanted |= wanted;
+	cap->seq = seq;
+	cap->issue_seq = seq;
+	cap->mseq = mseq;
+	cap->cap_gen = session->s_cap_gen;
+
+	if (fmode >= 0)
+		__ceph_get_fmode(ci, fmode);
+}
+
+/*
+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+	unsigned long ttl;
+	u32 gen;
+
+	spin_lock(&cap->session->s_gen_ttl_lock);
+	gen = cap->session->s_cap_gen;
+	ttl = cap->session->s_cap_ttl;
+	spin_unlock(&cap->session->s_gen_ttl_lock);
+
+	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
+		dout("__cap_is_valid %p cap %p issued %s "
+		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us.  Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+	int have = ci->i_snap_caps;
+	struct ceph_cap *cap;
+	struct rb_node *p;
+
+	if (implemented)
+		*implemented = 0;
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		dout("__ceph_caps_issued %p cap %p issued %s\n",
+		     &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+		have |= cap->issued;
+		if (implemented)
+			*implemented |= cap->implemented;
+	}
+	/*
+	 * exclude caps issued by non-auth MDS, but are been revoking
+	 * by the auth MDS. The non-auth MDS should be revoking/exporting
+	 * these caps, but the message is delayed.
+	 */
+	if (ci->i_auth_cap) {
+		cap = ci->i_auth_cap;
+		have &= ~cap->implemented | cap->issued;
+	}
+	return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+	int have = ci->i_snap_caps;
+	struct ceph_cap *cap;
+	struct rb_node *p;
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (cap == ocap)
+			continue;
+		if (!__cap_is_valid(cap))
+			continue;
+		have |= cap->issued;
+	}
+	return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+	struct ceph_mds_session *s = cap->session;
+
+	spin_lock(&s->s_cap_lock);
+	if (s->s_cap_iterator == NULL) {
+		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+		     s->s_mds);
+		list_move_tail(&cap->session_caps, &s->s_caps);
+	} else {
+		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
+		     &cap->ci->vfs_inode, cap, s->s_mds);
+	}
+	spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask.  If so, move the cap(s) to the
+ * front of their respective LRUs.  (This is the preferred way for
+ * callers to check for caps they want.)
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+	struct ceph_cap *cap;
+	struct rb_node *p;
+	int have = ci->i_snap_caps;
+
+	if ((have & mask) == mask) {
+		dout("__ceph_caps_issued_mask %p snap issued %s"
+		     " (mask %s)\n", &ci->vfs_inode,
+		     ceph_cap_string(have),
+		     ceph_cap_string(mask));
+		return 1;
+	}
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		if ((cap->issued & mask) == mask) {
+			dout("__ceph_caps_issued_mask %p cap %p issued %s"
+			     " (mask %s)\n", &ci->vfs_inode, cap,
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(mask));
+			if (touch)
+				__touch_cap(cap);
+			return 1;
+		}
+
+		/* does a combination of caps satisfy mask? */
+		have |= cap->issued;
+		if ((have & mask) == mask) {
+			dout("__ceph_caps_issued_mask %p combo issued %s"
+			     " (mask %s)\n", &ci->vfs_inode,
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(mask));
+			if (touch) {
+				struct rb_node *q;
+
+				/* touch this + preceding caps */
+				__touch_cap(cap);
+				for (q = rb_first(&ci->i_caps); q != p;
+				     q = rb_next(q)) {
+					cap = rb_entry(q, struct ceph_cap,
+						       ci_node);
+					if (!__cap_is_valid(cap))
+						continue;
+					__touch_cap(cap);
+				}
+			}
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Return true if mask caps are currently being revoked by an MDS.
+ */
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+			       struct ceph_cap *ocap, int mask)
+{
+	struct ceph_cap *cap;
+	struct rb_node *p;
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (cap != ocap &&
+		    (cap->implemented & ~cap->issued & mask))
+			return 1;
+	}
+	return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int ret;
+
+	spin_lock(&ci->i_ceph_lock);
+	ret = __ceph_caps_revoking_other(ci, NULL, mask);
+	spin_unlock(&ci->i_ceph_lock);
+	dout("ceph_caps_revoking %p %s = %d\n", inode,
+	     ceph_cap_string(mask), ret);
+	return ret;
+}
+
+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+	int used = 0;
+	if (ci->i_pin_ref)
+		used |= CEPH_CAP_PIN;
+	if (ci->i_rd_ref)
+		used |= CEPH_CAP_FILE_RD;
+	if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
+		used |= CEPH_CAP_FILE_CACHE;
+	if (ci->i_wr_ref)
+		used |= CEPH_CAP_FILE_WR;
+	if (ci->i_wb_ref || ci->i_wrbuffer_ref)
+		used |= CEPH_CAP_FILE_BUFFER;
+	return used;
+}
+
+/*
+ * wanted, by virtue of open file modes
+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+	int want = 0;
+	int mode;
+	for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
+		if (ci->i_nr_by_mode[mode])
+			want |= ceph_caps_for_mode(mode);
+	return want;
+}
+
+/*
+ * Return caps we have registered with the MDS(s) as 'wanted'.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+{
+	struct ceph_cap *cap;
+	struct rb_node *p;
+	int mds_wanted = 0;
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		if (cap == ci->i_auth_cap)
+			mds_wanted |= cap->mds_wanted;
+		else
+			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
+	}
+	return mds_wanted;
+}
+
+/*
+ * called under i_ceph_lock
+ */
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+	return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+int ceph_is_any_caps(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret;
+
+	spin_lock(&ci->i_ceph_lock);
+	ret = __ceph_is_any_caps(ci);
+	spin_unlock(&ci->i_ceph_lock);
+
+	return ret;
+}
+
+static void drop_inode_snap_realm(struct ceph_inode_info *ci)
+{
+	struct ceph_snap_realm *realm = ci->i_snap_realm;
+	spin_lock(&realm->inodes_with_caps_lock);
+	list_del_init(&ci->i_snap_realm_item);
+	ci->i_snap_realm_counter++;
+	ci->i_snap_realm = NULL;
+	spin_unlock(&realm->inodes_with_caps_lock);
+	ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
+			    realm);
+}
+
+/*
+ * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
+ *
+ * caller should hold i_ceph_lock.
+ * caller will not hold session s_mutex if called from destroy_inode.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+{
+	struct ceph_mds_session *session = cap->session;
+	struct ceph_inode_info *ci = cap->ci;
+	struct ceph_mds_client *mdsc =
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+	int removed = 0;
+
+	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+	/* remove from session list */
+	spin_lock(&session->s_cap_lock);
+	/*
+	 * s_cap_reconnect is protected by s_cap_lock. no one changes
+	 * s_cap_gen while session is in the reconnect state.
+	 */
+	if (queue_release &&
+	    (!session->s_cap_reconnect ||
+	     cap->cap_gen == session->s_cap_gen))
+		__queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
+				    cap->mseq, cap->issue_seq);
+
+	if (session->s_cap_iterator == cap) {
+		/* not yet, we are iterating over this very cap */
+		dout("__ceph_remove_cap  delaying %p removal from session %p\n",
+		     cap, cap->session);
+	} else {
+		list_del_init(&cap->session_caps);
+		session->s_nr_caps--;
+		cap->session = NULL;
+		removed = 1;
+	}
+	/* protect backpointer with s_cap_lock: see iterate_session_caps */
+	cap->ci = NULL;
+	spin_unlock(&session->s_cap_lock);
+
+	/* remove from inode list */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	if (removed)
+		ceph_put_cap(mdsc, cap);
+
+	/* when reconnect denied, we remove session caps forcibly,
+	 * i_wr_ref can be non-zero. If there are ongoing write,
+	 * keep i_snap_realm.
+	 */
+	if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
+		drop_inode_snap_realm(ci);
+
+	if (!__ceph_is_any_real_caps(ci))
+		__cap_delay_cancel(mdsc, ci);
+}
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct ceph_mds_session *session,
+			u64 ino, u64 cid, int op,
+			int caps, int wanted, int dirty,
+			u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
+			u64 size, u64 max_size,
+			struct timespec *mtime, struct timespec *atime,
+			u64 time_warp_seq,
+			kuid_t uid, kgid_t gid, umode_t mode,
+			u64 xattr_version,
+			struct ceph_buffer *xattrs_buf,
+			u64 follows, bool inline_data)
+{
+	struct ceph_mds_caps *fc;
+	struct ceph_msg *msg;
+	void *p;
+	size_t extra_len;
+
+	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+	     " seq %u/%u mseq %u follows %lld size %llu/%llu"
+	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
+	     cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
+	     ceph_cap_string(dirty),
+	     seq, issue_seq, mseq, follows, size, max_size,
+	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+
+	/* flock buffer size + inline version + inline data size */
+	extra_len = 4 + 8 + 4;
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
+			   GFP_NOFS, false);
+	if (!msg)
+		return -ENOMEM;
+
+	msg->hdr.tid = cpu_to_le64(flush_tid);
+
+	fc = msg->front.iov_base;
+	memset(fc, 0, sizeof(*fc));
+
+	fc->cap_id = cpu_to_le64(cid);
+	fc->op = cpu_to_le32(op);
+	fc->seq = cpu_to_le32(seq);
+	fc->issue_seq = cpu_to_le32(issue_seq);
+	fc->migrate_seq = cpu_to_le32(mseq);
+	fc->caps = cpu_to_le32(caps);
+	fc->wanted = cpu_to_le32(wanted);
+	fc->dirty = cpu_to_le32(dirty);
+	fc->ino = cpu_to_le64(ino);
+	fc->snap_follows = cpu_to_le64(follows);
+
+	fc->size = cpu_to_le64(size);
+	fc->max_size = cpu_to_le64(max_size);
+	if (mtime)
+		ceph_encode_timespec(&fc->mtime, mtime);
+	if (atime)
+		ceph_encode_timespec(&fc->atime, atime);
+	fc->time_warp_seq = cpu_to_le32(time_warp_seq);
+
+	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
+	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
+	fc->mode = cpu_to_le32(mode);
+
+	p = fc + 1;
+	/* flock buffer size */
+	ceph_encode_32(&p, 0);
+	/* inline version */
+	ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
+	/* inline data size */
+	ceph_encode_32(&p, 0);
+
+	fc->xattr_version = cpu_to_le64(xattr_version);
+	if (xattrs_buf) {
+		msg->middle = ceph_buffer_get(xattrs_buf);
+		fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+		msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+	}
+
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+void __queue_cap_release(struct ceph_mds_session *session,
+			 u64 ino, u64 cap_id, u32 migrate_seq,
+			 u32 issue_seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	struct ceph_mds_cap_item *item;
+
+	BUG_ON(!session->s_num_cap_releases);
+	msg = list_first_entry(&session->s_cap_releases,
+			       struct ceph_msg, list_head);
+
+	dout(" adding %llx release to mds%d msg %p (%d left)\n",
+	     ino, session->s_mds, msg, session->s_num_cap_releases);
+
+	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+	head = msg->front.iov_base;
+	le32_add_cpu(&head->num, 1);
+	item = msg->front.iov_base + msg->front.iov_len;
+	item->ino = cpu_to_le64(ino);
+	item->cap_id = cpu_to_le64(cap_id);
+	item->migrate_seq = cpu_to_le32(migrate_seq);
+	item->seq = cpu_to_le32(issue_seq);
+
+	session->s_num_cap_releases--;
+
+	msg->front.iov_len += sizeof(*item);
+	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+		dout(" release msg %p full\n", msg);
+		list_move_tail(&msg->list_head, &session->s_cap_releases_done);
+	} else {
+		dout(" release msg %p at %d/%d (%d)\n", msg,
+		     (int)le32_to_cpu(head->num),
+		     (int)CEPH_CAPS_PER_RELEASE,
+		     (int)msg->front.iov_len);
+	}
+}
+
+/*
+ * Queue cap releases when an inode is dropped from our cache.  Since
+ * inode is about to be destroyed, there is no need for i_ceph_lock.
+ */
+void ceph_queue_caps_release(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct rb_node *p;
+
+	p = rb_first(&ci->i_caps);
+	while (p) {
+		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+		p = rb_next(p);
+		__ceph_remove_cap(cap, true);
+	}
+}
+
+/*
+ * Send a cap msg on the given inode.  Update our caps state, then
+ * drop i_ceph_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make half-hearted attempt ot to invalidate page cache if we are
+ * dropping RDCACHE.  Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if delayed release, or we experienced an error
+ * such that the caller should requeue + retry later.
+ *
+ * called with i_ceph_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+		      int op, int used, int want, int retain, int flushing,
+		      unsigned *pflush_tid)
+	__releases(cap->ci->i_ceph_lock)
+{
+	struct ceph_inode_info *ci = cap->ci;
+	struct inode *inode = &ci->vfs_inode;
+	u64 cap_id = cap->cap_id;
+	int held, revoking, dropping, keep;
+	u64 seq, issue_seq, mseq, time_warp_seq, follows;
+	u64 size, max_size;
+	struct timespec mtime, atime;
+	int wake = 0;
+	umode_t mode;
+	kuid_t uid;
+	kgid_t gid;
+	struct ceph_mds_session *session;
+	u64 xattr_version = 0;
+	struct ceph_buffer *xattr_blob = NULL;
+	int delayed = 0;
+	u64 flush_tid = 0;
+	int i;
+	int ret;
+	bool inline_data;
+
+	held = cap->issued | cap->implemented;
+	revoking = cap->implemented & ~cap->issued;
+	retain &= ~revoking;
+	dropping = cap->issued & ~retain;
+
+	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+	     inode, cap, cap->session,
+	     ceph_cap_string(held), ceph_cap_string(held & retain),
+	     ceph_cap_string(revoking));
+	BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+	session = cap->session;
+
+	/* don't release wanted unless we've waited a bit. */
+	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+	    time_before(jiffies, ci->i_hold_caps_min)) {
+		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+		     ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap->issued & retain),
+		     ceph_cap_string(cap->mds_wanted),
+		     ceph_cap_string(want));
+		want |= cap->mds_wanted;
+		retain |= cap->issued;
+		delayed = 1;
+	}
+	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+
+	cap->issued &= retain;  /* drop bits we don't want */
+	if (cap->implemented & ~cap->issued) {
+		/*
+		 * Wake up any waiters on wanted -> needed transition.
+		 * This is due to the weird transition from buffered
+		 * to sync IO... we need to flush dirty pages _before_
+		 * allowing sync writes to avoid reordering.
+		 */
+		wake = 1;
+	}
+	cap->implemented &= cap->issued | used;
+	cap->mds_wanted = want;
+
+	if (flushing) {
+		/*
+		 * assign a tid for flush operations so we can avoid
+		 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+		 * clean type races.  track latest tid for every bit
+		 * so we can handle flush AxFw, flush Fw, and have the
+		 * first ack clean Ax.
+		 */
+		flush_tid = ++ci->i_cap_flush_last_tid;
+		if (pflush_tid)
+			*pflush_tid = flush_tid;
+		dout(" cap_flush_tid %d\n", (int)flush_tid);
+		for (i = 0; i < CEPH_CAP_BITS; i++)
+			if (flushing & (1 << i))
+				ci->i_cap_flush_tid[i] = flush_tid;
+
+		follows = ci->i_head_snapc->seq;
+	} else {
+		follows = 0;
+	}
+
+	keep = cap->implemented;
+	seq = cap->seq;
+	issue_seq = cap->issue_seq;
+	mseq = cap->mseq;
+	size = inode->i_size;
+	ci->i_reported_size = size;
+	max_size = ci->i_wanted_max_size;
+	ci->i_requested_max_size = max_size;
+	mtime = inode->i_mtime;
+	atime = inode->i_atime;
+	time_warp_seq = ci->i_time_warp_seq;
+	uid = inode->i_uid;
+	gid = inode->i_gid;
+	mode = inode->i_mode;
+
+	if (flushing & CEPH_CAP_XATTR_EXCL) {
+		__ceph_build_xattrs_blob(ci);
+		xattr_blob = ci->i_xattrs.blob;
+		xattr_version = ci->i_xattrs.version;
+	}
+
+	inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
+	spin_unlock(&ci->i_ceph_lock);
+
+	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
+		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+		size, max_size, &mtime, &atime, time_warp_seq,
+		uid, gid, mode, xattr_version, xattr_blob,
+		follows, inline_data);
+	if (ret < 0) {
+		dout("error sending cap msg, must requeue %p\n", inode);
+		delayed = 1;
+	}
+
+	if (wake)
+		wake_up_all(&ci->i_cap_wq);
+
+	return delayed;
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken.  This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Unless @again is true, skip cap_snaps that were already sent to
+ * the MDS (i.e., during this session).
+ *
+ * Called under i_ceph_lock.  Takes s_mutex as needed.
+ */
+void __ceph_flush_snaps(struct ceph_inode_info *ci,
+			struct ceph_mds_session **psession,
+			int again)
+		__releases(ci->i_ceph_lock)
+		__acquires(ci->i_ceph_lock)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int mds;
+	struct ceph_cap_snap *capsnap;
+	u32 mseq;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
+						    session->s_mutex */
+	u64 next_follows = 0;  /* keep track of how far we've gotten through the
+			     i_cap_snaps list, and skip these entries next time
+			     around to avoid an infinite loop */
+
+	if (psession)
+		session = *psession;
+
+	dout("__flush_snaps %p\n", inode);
+retry:
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		/* avoid an infiniute loop after retry */
+		if (capsnap->follows < next_follows)
+			continue;
+		/*
+		 * we need to wait for sync writes to complete and for dirty
+		 * pages to be written out.
+		 */
+		if (capsnap->dirty_pages || capsnap->writing)
+			break;
+
+		/*
+		 * if cap writeback already occurred, we should have dropped
+		 * the capsnap in ceph_put_wrbuffer_cap_refs.
+		 */
+		BUG_ON(capsnap->dirty == 0);
+
+		/* pick mds, take s_mutex */
+		if (ci->i_auth_cap == NULL) {
+			dout("no auth cap (migrating?), doing nothing\n");
+			goto out;
+		}
+
+		/* only flush each capsnap once */
+		if (!again && !list_empty(&capsnap->flushing_item)) {
+			dout("already flushed %p, skipping\n", capsnap);
+			continue;
+		}
+
+		mds = ci->i_auth_cap->session->s_mds;
+		mseq = ci->i_auth_cap->mseq;
+
+		if (session && session->s_mds != mds) {
+			dout("oops, wrong session %p mutex\n", session);
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			session = NULL;
+		}
+		if (!session) {
+			spin_unlock(&ci->i_ceph_lock);
+			mutex_lock(&mdsc->mutex);
+			session = __ceph_lookup_mds_session(mdsc, mds);
+			mutex_unlock(&mdsc->mutex);
+			if (session) {
+				dout("inverting session/ino locks on %p\n",
+				     session);
+				mutex_lock(&session->s_mutex);
+			}
+			/*
+			 * if session == NULL, we raced against a cap
+			 * deletion or migration.  retry, and we'll
+			 * get a better @mds value next time.
+			 */
+			spin_lock(&ci->i_ceph_lock);
+			goto retry;
+		}
+
+		capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+		atomic_inc(&capsnap->nref);
+		if (!list_empty(&capsnap->flushing_item))
+			list_del_init(&capsnap->flushing_item);
+		list_add_tail(&capsnap->flushing_item,
+			      &session->s_cap_snaps_flushing);
+		spin_unlock(&ci->i_ceph_lock);
+
+		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
+		     inode, capsnap, capsnap->follows, capsnap->flush_tid);
+		send_cap_msg(session, ceph_vino(inode).ino, 0,
+			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
+			     capsnap->size, 0,
+			     &capsnap->mtime, &capsnap->atime,
+			     capsnap->time_warp_seq,
+			     capsnap->uid, capsnap->gid, capsnap->mode,
+			     capsnap->xattr_version, capsnap->xattr_blob,
+			     capsnap->follows, capsnap->inline_data);
+
+		next_follows = capsnap->follows + 1;
+		ceph_put_cap_snap(capsnap);
+
+		spin_lock(&ci->i_ceph_lock);
+		goto retry;
+	}
+
+	/* we flushed them all; remove this inode from the queue */
+	spin_lock(&mdsc->snap_flush_lock);
+	list_del_init(&ci->i_snap_flush_item);
+	spin_unlock(&mdsc->snap_flush_lock);
+
+out:
+	if (psession)
+		*psession = session;
+	else if (session) {
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+	}
+}
+
+static void ceph_flush_snaps(struct ceph_inode_info *ci)
+{
+	spin_lock(&ci->i_ceph_lock);
+	__ceph_flush_snaps(ci, NULL, 0);
+	spin_unlock(&ci->i_ceph_lock);
+}
+
+/*
+ * Mark caps dirty.  If inode is newly dirty, return the dirty flags.
+ * Caller is then responsible for calling __mark_inode_dirty with the
+ * returned flags value.
+ */
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+	struct ceph_mds_client *mdsc =
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+	struct inode *inode = &ci->vfs_inode;
+	int was = ci->i_dirty_caps;
+	int dirty = 0;
+
+	if (!ci->i_auth_cap) {
+		pr_warn("__mark_dirty_caps %p %llx mask %s, "
+			"but no auth cap (session was closed?)\n",
+			inode, ceph_ino(inode), ceph_cap_string(mask));
+		return 0;
+	}
+
+	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+	     ceph_cap_string(mask), ceph_cap_string(was),
+	     ceph_cap_string(was | mask));
+	ci->i_dirty_caps |= mask;
+	if (was == 0) {
+		if (!ci->i_head_snapc)
+			ci->i_head_snapc = ceph_get_snap_context(
+				ci->i_snap_realm->cached_context);
+		dout(" inode %p now dirty snapc %p auth cap %p\n",
+		     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+		BUG_ON(!list_empty(&ci->i_dirty_item));
+		spin_lock(&mdsc->cap_dirty_lock);
+		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+		spin_unlock(&mdsc->cap_dirty_lock);
+		if (ci->i_flushing_caps == 0) {
+			ihold(inode);
+			dirty |= I_DIRTY_SYNC;
+		}
+	}
+	BUG_ON(list_empty(&ci->i_dirty_item));
+	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+	    (mask & CEPH_CAP_FILE_BUFFER))
+		dirty |= I_DIRTY_DATASYNC;
+	__cap_delay_requeue(mdsc, ci);
+	return dirty;
+}
+
+/*
+ * Add dirty inode to the flushing list.  Assigned a seq number so we
+ * can wait for caps to flush without starving.
+ *
+ * Called under i_ceph_lock.
+ */
+static int __mark_caps_flushing(struct inode *inode,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int flushing;
+
+	BUG_ON(ci->i_dirty_caps == 0);
+	BUG_ON(list_empty(&ci->i_dirty_item));
+
+	flushing = ci->i_dirty_caps;
+	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+	     ceph_cap_string(flushing),
+	     ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(ci->i_flushing_caps | flushing));
+	ci->i_flushing_caps |= flushing;
+	ci->i_dirty_caps = 0;
+	dout(" inode %p now !dirty\n", inode);
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	list_del_init(&ci->i_dirty_item);
+
+	if (list_empty(&ci->i_flushing_item)) {
+		ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+		mdsc->num_cap_flushing++;
+		dout(" inode %p now flushing seq %lld\n", inode,
+		     ci->i_cap_flush_seq);
+	} else {
+		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+		dout(" inode %p now flushing (more) seq %lld\n", inode,
+		     ci->i_cap_flush_seq);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+
+	return flushing;
+}
+
+/*
+ * try to invalidate mapping pages without blocking.
+ */
+static int try_nonblocking_invalidate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u32 invalidating_gen = ci->i_rdcache_gen;
+
+	spin_unlock(&ci->i_ceph_lock);
+	invalidate_mapping_pages(&inode->i_data, 0, -1);
+	spin_lock(&ci->i_ceph_lock);
+
+	if (inode->i_data.nrpages == 0 &&
+	    invalidating_gen == ci->i_rdcache_gen) {
+		/* success. */
+		dout("try_nonblocking_invalidate %p success\n", inode);
+		/* save any racing async invalidate some trouble */
+		ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
+		return 0;
+	}
+	dout("try_nonblocking_invalidate %p failed\n", inode);
+	return -1;
+}
+
+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps.  Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ *    cap release further.
+ *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ *    further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+		     struct ceph_mds_session *session)
+{
+	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap *cap;
+	int file_wanted, used, cap_used;
+	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
+	int issued, implemented, want, retain, revoking, flushing = 0;
+	int mds = -1;   /* keep track of how far we've gone through i_caps list
+			   to avoid an infinite loop on retry */
+	struct rb_node *p;
+	int tried_invalidate = 0;
+	int delayed = 0, sent = 0, force_requeue = 0, num;
+	int queue_invalidate = 0;
+	int is_delayed = flags & CHECK_CAPS_NODELAY;
+
+	/* if we are unmounting, flush any unused caps immediately. */
+	if (mdsc->stopping)
+		is_delayed = 1;
+
+	spin_lock(&ci->i_ceph_lock);
+
+	if (ci->i_ceph_flags & CEPH_I_FLUSH)
+		flags |= CHECK_CAPS_FLUSH;
+
+	/* flush snaps first time around only */
+	if (!list_empty(&ci->i_cap_snaps))
+		__ceph_flush_snaps(ci, &session, 0);
+	goto retry_locked;
+retry:
+	spin_lock(&ci->i_ceph_lock);
+retry_locked:
+	file_wanted = __ceph_caps_file_wanted(ci);
+	used = __ceph_caps_used(ci);
+	want = file_wanted | used;
+	issued = __ceph_caps_issued(ci, &implemented);
+	revoking = implemented & ~issued;
+
+	retain = want | CEPH_CAP_PIN;
+	if (!mdsc->stopping && inode->i_nlink > 0) {
+		if (want) {
+			retain |= CEPH_CAP_ANY;       /* be greedy */
+		} else if (S_ISDIR(inode->i_mode) &&
+			   (issued & CEPH_CAP_FILE_SHARED) &&
+			    __ceph_dir_is_complete(ci)) {
+			/*
+			 * If a directory is complete, we want to keep
+			 * the exclusive cap. So that MDS does not end up
+			 * revoking the shared cap on every create/unlink
+			 * operation.
+			 */
+			want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+			retain |= want;
+		} else {
+
+			retain |= CEPH_CAP_ANY_SHARED;
+			/*
+			 * keep RD only if we didn't have the file open RW,
+			 * because then the mds would revoke it anyway to
+			 * journal max_size=0.
+			 */
+			if (ci->i_max_size == 0)
+				retain |= CEPH_CAP_ANY_RD;
+		}
+	}
+
+	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+	     " issued %s revoking %s retain %s %s%s%s\n", inode,
+	     ceph_cap_string(file_wanted),
+	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+	     ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(issued), ceph_cap_string(revoking),
+	     ceph_cap_string(retain),
+	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+	/*
+	 * If we no longer need to hold onto old our caps, and we may
+	 * have cached pages, but don't want them, then try to invalidate.
+	 * If we fail, it's because pages are locked.... try again later.
+	 */
+	if ((!is_delayed || mdsc->stopping) &&
+	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
+	    inode->i_data.nrpages &&                 /* have cached pages */
+	    (file_wanted == 0 ||                     /* no open files */
+	     (revoking & (CEPH_CAP_FILE_CACHE|
+			  CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
+	    !tried_invalidate) {
+		dout("check_caps trying to invalidate on %p\n", inode);
+		if (try_nonblocking_invalidate(inode) < 0) {
+			if (revoking & (CEPH_CAP_FILE_CACHE|
+					CEPH_CAP_FILE_LAZYIO)) {
+				dout("check_caps queuing invalidate\n");
+				queue_invalidate = 1;
+				ci->i_rdcache_revoking = ci->i_rdcache_gen;
+			} else {
+				dout("check_caps failed to invalidate pages\n");
+				/* we failed to invalidate pages.  check these
+				   caps again later. */
+				force_requeue = 1;
+				__cap_set_timeouts(mdsc, ci);
+			}
+		}
+		tried_invalidate = 1;
+		goto retry_locked;
+	}
+
+	num = 0;
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		num++;
+
+		/* avoid looping forever */
+		if (mds >= cap->mds ||
+		    ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
+			continue;
+
+		/* NOTE: no side-effects allowed, until we take s_mutex */
+
+		cap_used = used;
+		if (ci->i_auth_cap && cap != ci->i_auth_cap)
+			cap_used &= ~ci->i_auth_cap->issued;
+
+		revoking = cap->implemented & ~cap->issued;
+		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
+		     cap->mds, cap, ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap_used),
+		     ceph_cap_string(cap->implemented),
+		     ceph_cap_string(revoking));
+
+		if (cap == ci->i_auth_cap &&
+		    (cap->issued & CEPH_CAP_FILE_WR)) {
+			/* request larger max_size from MDS? */
+			if (ci->i_wanted_max_size > ci->i_max_size &&
+			    ci->i_wanted_max_size > ci->i_requested_max_size) {
+				dout("requesting new max_size\n");
+				goto ack;
+			}
+
+			/* approaching file_max? */
+			if ((inode->i_size << 1) >= ci->i_max_size &&
+			    (ci->i_reported_size << 1) < ci->i_max_size) {
+				dout("i_size approaching max_size\n");
+				goto ack;
+			}
+		}
+		/* flush anything dirty? */
+		if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
+		    ci->i_dirty_caps) {
+			dout("flushing dirty caps\n");
+			goto ack;
+		}
+
+		/* completed revocation? going down and there are no caps? */
+		if (revoking && (revoking & cap_used) == 0) {
+			dout("completed revocation of %s\n",
+			     ceph_cap_string(cap->implemented & ~cap->issued));
+			goto ack;
+		}
+
+		/* want more caps from mds? */
+		if (want & ~(cap->mds_wanted | cap->issued))
+			goto ack;
+
+		/* things we might delay */
+		if ((cap->issued & ~retain) == 0 &&
+		    cap->mds_wanted == want)
+			continue;     /* nope, all good */
+
+		if (is_delayed)
+			goto ack;
+
+		/* delay? */
+		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+		    time_before(jiffies, ci->i_hold_caps_max)) {
+			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(cap->issued & retain),
+			     ceph_cap_string(cap->mds_wanted),
+			     ceph_cap_string(want));
+			delayed++;
+			continue;
+		}
+
+ack:
+		if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+			dout(" skipping %p I_NOFLUSH set\n", inode);
+			continue;
+		}
+
+		if (session && session != cap->session) {
+			dout("oops, wrong session %p mutex\n", session);
+			mutex_unlock(&session->s_mutex);
+			session = NULL;
+		}
+		if (!session) {
+			session = cap->session;
+			if (mutex_trylock(&session->s_mutex) == 0) {
+				dout("inverting session/ino locks on %p\n",
+				     session);
+				spin_unlock(&ci->i_ceph_lock);
+				if (took_snap_rwsem) {
+					up_read(&mdsc->snap_rwsem);
+					took_snap_rwsem = 0;
+				}
+				mutex_lock(&session->s_mutex);
+				goto retry;
+			}
+		}
+		/* take snap_rwsem after session mutex */
+		if (!took_snap_rwsem) {
+			if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
+				dout("inverting snap/in locks on %p\n",
+				     inode);
+				spin_unlock(&ci->i_ceph_lock);
+				down_read(&mdsc->snap_rwsem);
+				took_snap_rwsem = 1;
+				goto retry;
+			}
+			took_snap_rwsem = 1;
+		}
+
+		if (cap == ci->i_auth_cap && ci->i_dirty_caps)
+			flushing = __mark_caps_flushing(inode, session);
+		else
+			flushing = 0;
+
+		mds = cap->mds;  /* remember mds, so we don't repeat */
+		sent++;
+
+		/* __send_cap drops i_ceph_lock */
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+				      want, retain, flushing, NULL);
+		goto retry; /* retake i_ceph_lock and restart our cap scan. */
+	}
+
+	/*
+	 * Reschedule delayed caps release if we delayed anything,
+	 * otherwise cancel.
+	 */
+	if (delayed && is_delayed)
+		force_requeue = 1;   /* __send_cap delayed release; requeue */
+	if (!delayed && !is_delayed)
+		__cap_delay_cancel(mdsc, ci);
+	else if (!is_delayed || force_requeue)
+		__cap_delay_requeue(mdsc, ci);
+
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (queue_invalidate)
+		ceph_queue_invalidate(inode);
+
+	if (session)
+		mutex_unlock(&session->s_mutex);
+	if (took_snap_rwsem)
+		up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Try to flush dirty caps back to the auth mds.
+ */
+static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
+{
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int flushing = 0;
+	struct ceph_mds_session *session = NULL;
+
+retry:
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
+		goto out;
+	}
+	if (ci->i_dirty_caps && ci->i_auth_cap) {
+		struct ceph_cap *cap = ci->i_auth_cap;
+		int used = __ceph_caps_used(ci);
+		int want = __ceph_caps_wanted(ci);
+		int delayed;
+
+		if (!session || session != cap->session) {
+			spin_unlock(&ci->i_ceph_lock);
+			if (session)
+				mutex_unlock(&session->s_mutex);
+			session = cap->session;
+			mutex_lock(&session->s_mutex);
+			goto retry;
+		}
+		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+			goto out;
+
+		flushing = __mark_caps_flushing(inode, session);
+
+		/* __send_cap drops i_ceph_lock */
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
+				     cap->issued | cap->implemented, flushing,
+				     flush_tid);
+		if (!delayed)
+			goto out_unlocked;
+
+		spin_lock(&ci->i_ceph_lock);
+		__cap_delay_requeue(mdsc, ci);
+	}
+out:
+	spin_unlock(&ci->i_ceph_lock);
+out_unlocked:
+	if (session)
+		mutex_unlock(&session->s_mutex);
+	return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int i, ret = 1;
+
+	spin_lock(&ci->i_ceph_lock);
+	for (i = 0; i < CEPH_CAP_BITS; i++)
+		if ((ci->i_flushing_caps & (1 << i)) &&
+		    ci->i_cap_flush_tid[i] <= tid) {
+			/* still flushing this bit */
+			ret = 0;
+			break;
+		}
+	spin_unlock(&ci->i_ceph_lock);
+	return ret;
+}
+
+/*
+ * Wait on any unsafe replies for the given inode.  First wait on the
+ * newest request, and make that the upper bound.  Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+static void sync_write_wait(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct list_head *head = &ci->i_unsafe_writes;
+	struct ceph_osd_request *req;
+	u64 last_tid;
+
+	spin_lock(&ci->i_unsafe_lock);
+	if (list_empty(head))
+		goto out;
+
+	/* set upper bound as _last_ entry in chain */
+	req = list_entry(head->prev, struct ceph_osd_request,
+			 r_unsafe_item);
+	last_tid = req->r_tid;
+
+	do {
+		ceph_osdc_get_request(req);
+		spin_unlock(&ci->i_unsafe_lock);
+		dout("sync_write_wait on tid %llu (until %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		spin_lock(&ci->i_unsafe_lock);
+		ceph_osdc_put_request(req);
+
+		/*
+		 * from here on look at first entry in chain, since we
+		 * only want to wait for anything older than last_tid
+		 */
+		if (list_empty(head))
+			break;
+		req = list_entry(head->next, struct ceph_osd_request,
+				 r_unsafe_item);
+	} while (req->r_tid < last_tid);
+out:
+	spin_unlock(&ci->i_unsafe_lock);
+}
+
+int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned flush_tid;
+	int ret;
+	int dirty;
+
+	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+	sync_write_wait(inode);
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret < 0)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
+	dirty = try_flush_caps(inode, &flush_tid);
+	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+	/*
+	 * only wait on non-file metadata writeback (the mds
+	 * can recover size and mtime, so we don't need to
+	 * wait for that)
+	 */
+	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+		dout("fsync waiting for flush_tid %u\n", flush_tid);
+		ret = wait_event_interruptible(ci->i_cap_wq,
+				       caps_are_flushed(inode, flush_tid));
+	}
+
+	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/*
+ * Flush any dirty caps back to the mds.  If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned flush_tid;
+	int err = 0;
+	int dirty;
+	int wait = wbc->sync_mode == WB_SYNC_ALL;
+
+	dout("write_inode %p wait=%d\n", inode, wait);
+	if (wait) {
+		dirty = try_flush_caps(inode, &flush_tid);
+		if (dirty)
+			err = wait_event_interruptible(ci->i_cap_wq,
+				       caps_are_flushed(inode, flush_tid));
+	} else {
+		struct ceph_mds_client *mdsc =
+			ceph_sb_to_client(inode->i_sb)->mdsc;
+
+		spin_lock(&ci->i_ceph_lock);
+		if (__ceph_caps_dirty(ci))
+			__cap_delay_requeue_front(mdsc, ci);
+		spin_unlock(&ci->i_ceph_lock);
+	}
+	return err;
+}
+
+/*
+ * After a recovering MDS goes active, we need to resend any caps
+ * we were flushing.
+ *
+ * Caller holds session->s_mutex.
+ */
+static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
+{
+	struct ceph_cap_snap *capsnap;
+
+	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
+	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
+			    flushing_item) {
+		struct ceph_inode_info *ci = capsnap->ci;
+		struct inode *inode = &ci->vfs_inode;
+		struct ceph_cap *cap;
+
+		spin_lock(&ci->i_ceph_lock);
+		cap = ci->i_auth_cap;
+		if (cap && cap->session == session) {
+			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
+			     cap, capsnap);
+			__ceph_flush_snaps(ci, &session, 1);
+		} else {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+			       cap, session->s_mds);
+		}
+		spin_unlock(&ci->i_ceph_lock);
+	}
+}
+
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci;
+
+	kick_flushing_capsnaps(mdsc, session);
+
+	dout("kick_flushing_caps mds%d\n", session->s_mds);
+	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+		struct inode *inode = &ci->vfs_inode;
+		struct ceph_cap *cap;
+		int delayed = 0;
+
+		spin_lock(&ci->i_ceph_lock);
+		cap = ci->i_auth_cap;
+		if (cap && cap->session == session) {
+			dout("kick_flushing_caps %p cap %p %s\n", inode,
+			     cap, ceph_cap_string(ci->i_flushing_caps));
+			delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+					     __ceph_caps_used(ci),
+					     __ceph_caps_wanted(ci),
+					     cap->issued | cap->implemented,
+					     ci->i_flushing_caps, NULL);
+			if (delayed) {
+				spin_lock(&ci->i_ceph_lock);
+				__cap_delay_requeue(mdsc, ci);
+				spin_unlock(&ci->i_ceph_lock);
+			}
+		} else {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+			       cap, session->s_mds);
+			spin_unlock(&ci->i_ceph_lock);
+		}
+	}
+}
+
+static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
+				     struct ceph_mds_session *session,
+				     struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap;
+	int delayed = 0;
+
+	spin_lock(&ci->i_ceph_lock);
+	cap = ci->i_auth_cap;
+	dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
+	     ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
+	__ceph_flush_snaps(ci, &session, 1);
+
+	if (ci->i_flushing_caps) {
+		spin_lock(&mdsc->cap_dirty_lock);
+		list_move_tail(&ci->i_flushing_item,
+			       &cap->session->s_cap_flushing);
+		spin_unlock(&mdsc->cap_dirty_lock);
+
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+				     __ceph_caps_used(ci),
+				     __ceph_caps_wanted(ci),
+				     cap->issued | cap->implemented,
+				     ci->i_flushing_caps, NULL);
+		if (delayed) {
+			spin_lock(&ci->i_ceph_lock);
+			__cap_delay_requeue(mdsc, ci);
+			spin_unlock(&ci->i_ceph_lock);
+		}
+	} else {
+		spin_unlock(&ci->i_ceph_lock);
+	}
+}
+
+
+/*
+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_ceph_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+{
+	if (got & CEPH_CAP_PIN)
+		ci->i_pin_ref++;
+	if (got & CEPH_CAP_FILE_RD)
+		ci->i_rd_ref++;
+	if (got & CEPH_CAP_FILE_CACHE)
+		ci->i_rdcache_ref++;
+	if (got & CEPH_CAP_FILE_WR)
+		ci->i_wr_ref++;
+	if (got & CEPH_CAP_FILE_BUFFER) {
+		if (ci->i_wb_ref == 0)
+			ihold(&ci->vfs_inode);
+		ci->i_wb_ref++;
+		dout("__take_cap_refs %p wb %d -> %d (?)\n",
+		     &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+	}
+}
+
+/*
+ * Try to grab cap references.  Specify those refs we @want, and the
+ * minimal set we @need.  Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ */
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+			    loff_t endoff, int *got, int *check_max, int *err)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int ret = 0;
+	int have, implemented;
+	int file_wanted;
+
+	dout("get_cap_refs %p need %s want %s\n", inode,
+	     ceph_cap_string(need), ceph_cap_string(want));
+
+	spin_lock(&ci->i_ceph_lock);
+
+	/* make sure file is actually open */
+	file_wanted = __ceph_caps_file_wanted(ci);
+	if ((file_wanted & need) == 0) {
+		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
+		     ceph_cap_string(need), ceph_cap_string(file_wanted));
+		*err = -EBADF;
+		ret = 1;
+		goto out_unlock;
+	}
+
+	/* finish pending truncate */
+	while (ci->i_truncate_pending) {
+		spin_unlock(&ci->i_ceph_lock);
+		__ceph_do_pending_vmtruncate(inode);
+		spin_lock(&ci->i_ceph_lock);
+	}
+
+	have = __ceph_caps_issued(ci, &implemented);
+
+	if (have & need & CEPH_CAP_FILE_WR) {
+		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+			     inode, endoff, ci->i_max_size);
+			if (endoff > ci->i_requested_max_size) {
+				*check_max = 1;
+				ret = 1;
+			}
+			goto out_unlock;
+		}
+		/*
+		 * If a sync write is in progress, we must wait, so that we
+		 * can get a final snapshot value for size+mtime.
+		 */
+		if (__ceph_have_pending_cap_snap(ci)) {
+			dout("get_cap_refs %p cap_snap_pending\n", inode);
+			goto out_unlock;
+		}
+	}
+
+	if ((have & need) == need) {
+		/*
+		 * Look at (implemented & ~have & not) so that we keep waiting
+		 * on transition from wanted -> needed caps.  This is needed
+		 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+		 * going before a prior buffered writeback happens.
+		 */
+		int not = want & ~(have & need);
+		int revoking = implemented & ~have;
+		dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+		     inode, ceph_cap_string(have), ceph_cap_string(not),
+		     ceph_cap_string(revoking));
+		if ((revoking & not) == 0) {
+			*got = need | (have & want);
+			__take_cap_refs(ci, *got);
+			ret = 1;
+		}
+	} else {
+		int session_readonly = false;
+		if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+			struct ceph_mds_session *s = ci->i_auth_cap->session;
+			spin_lock(&s->s_cap_lock);
+			session_readonly = s->s_readonly;
+			spin_unlock(&s->s_cap_lock);
+		}
+		if (session_readonly) {
+			dout("get_cap_refs %p needed %s but mds%d readonly\n",
+			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
+			*err = -EROFS;
+			ret = 1;
+			goto out_unlock;
+		}
+
+		dout("get_cap_refs %p have %s needed %s\n", inode,
+		     ceph_cap_string(have), ceph_cap_string(need));
+	}
+out_unlock:
+	spin_unlock(&ci->i_ceph_lock);
+
+	dout("get_cap_refs %p ret %d got %s\n", inode,
+	     ret, ceph_cap_string(*got));
+	return ret;
+}
+
+/*
+ * Check the offset we are writing up to against our current
+ * max_size.  If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int check = 0;
+
+	/* do we need to explicitly request a larger max_size? */
+	spin_lock(&ci->i_ceph_lock);
+	if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
+		dout("write %p at large endoff %llu, req max_size\n",
+		     inode, endoff);
+		ci->i_wanted_max_size = endoff;
+	}
+	/* duplicate ceph_check_caps()'s logic */
+	if (ci->i_auth_cap &&
+	    (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
+	    ci->i_wanted_max_size > ci->i_max_size &&
+	    ci->i_wanted_max_size > ci->i_requested_max_size)
+		check = 1;
+	spin_unlock(&ci->i_ceph_lock);
+	if (check)
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+/*
+ * Wait for caps, and take cap references.  If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+		  loff_t endoff, int *got, struct page **pinned_page)
+{
+	int _got, check_max, ret, err = 0;
+
+retry:
+	if (endoff > 0)
+		check_max_size(&ci->vfs_inode, endoff);
+	_got = 0;
+	check_max = 0;
+	ret = wait_event_interruptible(ci->i_cap_wq,
+				try_get_cap_refs(ci, need, want, endoff,
+						 &_got, &check_max, &err));
+	if (err)
+		ret = err;
+	if (ret < 0)
+		return ret;
+
+	if (check_max)
+		goto retry;
+
+	if (ci->i_inline_version != CEPH_INLINE_NONE &&
+	    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+	    i_size_read(&ci->vfs_inode) > 0) {
+		struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
+		if (page) {
+			if (PageUptodate(page)) {
+				*pinned_page = page;
+				goto out;
+			}
+			page_cache_release(page);
+		}
+		/*
+		 * drop cap refs first because getattr while holding
+		 * caps refs can cause deadlock.
+		 */
+		ceph_put_cap_refs(ci, _got);
+		_got = 0;
+
+		/* getattr request will bring inline data into page cache */
+		ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+					CEPH_STAT_CAP_INLINE_DATA, true);
+		if (ret < 0)
+			return ret;
+		goto retry;
+	}
+out:
+	*got = _got;
+	return 0;
+}
+
+/*
+ * Take cap refs.  Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+	spin_lock(&ci->i_ceph_lock);
+	__take_cap_refs(ci, caps);
+	spin_unlock(&ci->i_ceph_lock);
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0, put = 0, flushsnaps = 0, wake = 0;
+	struct ceph_cap_snap *capsnap;
+
+	spin_lock(&ci->i_ceph_lock);
+	if (had & CEPH_CAP_PIN)
+		--ci->i_pin_ref;
+	if (had & CEPH_CAP_FILE_RD)
+		if (--ci->i_rd_ref == 0)
+			last++;
+	if (had & CEPH_CAP_FILE_CACHE)
+		if (--ci->i_rdcache_ref == 0)
+			last++;
+	if (had & CEPH_CAP_FILE_BUFFER) {
+		if (--ci->i_wb_ref == 0) {
+			last++;
+			put++;
+		}
+		dout("put_cap_refs %p wb %d -> %d (?)\n",
+		     inode, ci->i_wb_ref+1, ci->i_wb_ref);
+	}
+	if (had & CEPH_CAP_FILE_WR)
+		if (--ci->i_wr_ref == 0) {
+			last++;
+			if (!list_empty(&ci->i_cap_snaps)) {
+				capsnap = list_first_entry(&ci->i_cap_snaps,
+						     struct ceph_cap_snap,
+						     ci_item);
+				if (capsnap->writing) {
+					capsnap->writing = 0;
+					flushsnaps =
+						__ceph_finish_cap_snap(ci,
+								       capsnap);
+					wake = 1;
+				}
+			}
+			/* see comment in __ceph_remove_cap() */
+			if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
+				drop_inode_snap_realm(ci);
+		}
+	spin_unlock(&ci->i_ceph_lock);
+
+	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
+	     last ? " last" : "", put ? " put" : "");
+
+	if (last && !flushsnaps)
+		ceph_check_caps(ci, 0, NULL);
+	else if (flushsnaps)
+		ceph_flush_snaps(ci);
+	if (wake)
+		wake_up_all(&ci->i_cap_wq);
+	if (put)
+		iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context.  Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS.  If we dropped the last ref, call
+ * ceph_check_caps.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+				struct ceph_snap_context *snapc)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0;
+	int complete_capsnap = 0;
+	int drop_capsnap = 0;
+	int found = 0;
+	struct ceph_cap_snap *capsnap = NULL;
+
+	spin_lock(&ci->i_ceph_lock);
+	ci->i_wrbuffer_ref -= nr;
+	last = !ci->i_wrbuffer_ref;
+
+	if (ci->i_head_snapc == snapc) {
+		ci->i_wrbuffer_ref_head -= nr;
+		if (ci->i_wrbuffer_ref_head == 0 &&
+		    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+			BUG_ON(!ci->i_head_snapc);
+			ceph_put_snap_context(ci->i_head_snapc);
+			ci->i_head_snapc = NULL;
+		}
+		dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+		     inode,
+		     ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
+		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+		     last ? " LAST" : "");
+	} else {
+		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+			if (capsnap->context == snapc) {
+				found = 1;
+				break;
+			}
+		}
+		BUG_ON(!found);
+		capsnap->dirty_pages -= nr;
+		if (capsnap->dirty_pages == 0) {
+			complete_capsnap = 1;
+			if (capsnap->dirty == 0)
+				/* cap writeback completed before we created
+				 * the cap_snap; no FLUSHSNAP is needed */
+				drop_capsnap = 1;
+		}
+		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
+		     inode, capsnap, capsnap->context->seq,
+		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
+		     last ? " (wrbuffer last)" : "",
+		     complete_capsnap ? " (complete capsnap)" : "",
+		     drop_capsnap ? " (drop capsnap)" : "");
+		if (drop_capsnap) {
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+		}
+	}
+
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (last) {
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+		iput(inode);
+	} else if (complete_capsnap) {
+		ceph_flush_snaps(ci);
+		wake_up_all(&ci->i_cap_wq);
+	}
+	if (drop_capsnap)
+		iput(inode);
+}
+
+/*
+ * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
+ */
+static void invalidate_aliases(struct inode *inode)
+{
+	struct dentry *dn, *prev = NULL;
+
+	dout("invalidate_aliases inode %p\n", inode);
+	d_prune_aliases(inode);
+	/*
+	 * For non-directory inode, d_find_alias() only returns
+	 * hashed dentry. After calling d_invalidate(), the
+	 * dentry becomes unhashed.
+	 *
+	 * For directory inode, d_find_alias() can return
+	 * unhashed dentry. But directory inode should have
+	 * one alias at most.
+	 */
+	while ((dn = d_find_alias(inode))) {
+		if (dn == prev) {
+			dput(dn);
+			break;
+		}
+		d_invalidate(dn);
+		if (prev)
+			dput(prev);
+		prev = dn;
+	}
+	if (prev)
+		dput(prev);
+}
+
+/*
+ * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex and i_ceph_lock, we drop both.
+ */
+static void handle_cap_grant(struct ceph_mds_client *mdsc,
+			     struct inode *inode, struct ceph_mds_caps *grant,
+			     u64 inline_version,
+			     void *inline_data, int inline_len,
+			     struct ceph_buffer *xattr_buf,
+			     struct ceph_mds_session *session,
+			     struct ceph_cap *cap, int issued)
+	__releases(ci->i_ceph_lock)
+	__releases(mdsc->snap_rwsem)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	int seq = le32_to_cpu(grant->seq);
+	int newcaps = le32_to_cpu(grant->caps);
+	int used, wanted, dirty;
+	u64 size = le64_to_cpu(grant->size);
+	u64 max_size = le64_to_cpu(grant->max_size);
+	struct timespec mtime, atime, ctime;
+	int check_caps = 0;
+	bool wake = false;
+	bool writeback = false;
+	bool queue_trunc = false;
+	bool queue_invalidate = false;
+	bool queue_revalidate = false;
+	bool deleted_inode = false;
+	bool fill_inline = false;
+
+	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+		inode->i_size);
+
+
+	/*
+	 * auth mds of the inode changed. we received the cap export message,
+	 * but still haven't received the cap import message. handle_cap_export
+	 * updated the new auth MDS' cap.
+	 *
+	 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+	 * that was sent before the cap import message. So don't remove caps.
+	 */
+	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+		WARN_ON(cap != ci->i_auth_cap);
+		WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+		seq = cap->seq;
+		newcaps |= cap->issued;
+	}
+
+	/*
+	 * If CACHE is being revoked, and we have no dirty buffers,
+	 * try to invalidate (once).  (If there are dirty buffers, we
+	 * will invalidate _after_ writeback.)
+	 */
+	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
+	    !ci->i_wrbuffer_ref) {
+		if (try_nonblocking_invalidate(inode)) {
+			/* there were locked pages.. invalidate later
+			   in a separate thread. */
+			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+				queue_invalidate = true;
+				ci->i_rdcache_revoking = ci->i_rdcache_gen;
+			}
+		}
+
+		ceph_fscache_invalidate(inode);
+	}
+
+	/* side effects now are allowed */
+	cap->cap_gen = session->s_cap_gen;
+	cap->seq = seq;
+
+	__check_cap_issue(ci, cap, newcaps);
+
+	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
+		inode->i_mode = le32_to_cpu(grant->mode);
+		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
+		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
+		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+		     from_kuid(&init_user_ns, inode->i_uid),
+		     from_kgid(&init_user_ns, inode->i_gid));
+	}
+
+	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+	    (issued & CEPH_CAP_LINK_EXCL) == 0) {
+		set_nlink(inode, le32_to_cpu(grant->nlink));
+		if (inode->i_nlink == 0 &&
+		    (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+			deleted_inode = true;
+	}
+
+	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+		int len = le32_to_cpu(grant->xattr_len);
+		u64 version = le64_to_cpu(grant->xattr_version);
+
+		if (version > ci->i_xattrs.version) {
+			dout(" got new xattrs v%llu on %p len %d\n",
+			     version, inode, len);
+			if (ci->i_xattrs.blob)
+				ceph_buffer_put(ci->i_xattrs.blob);
+			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+			ci->i_xattrs.version = version;
+			ceph_forget_all_cached_acls(inode);
+		}
+	}
+
+	/* Do we need to revalidate our fscache cookie. Don't bother on the
+	 * first cache cap as we already validate at cookie creation time. */
+	if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
+		queue_revalidate = true;
+
+	if (newcaps & CEPH_CAP_ANY_RD) {
+		/* ctime/mtime/atime? */
+		ceph_decode_timespec(&mtime, &grant->mtime);
+		ceph_decode_timespec(&atime, &grant->atime);
+		ceph_decode_timespec(&ctime, &grant->ctime);
+		ceph_fill_file_time(inode, issued,
+				    le32_to_cpu(grant->time_warp_seq),
+				    &ctime, &mtime, &atime);
+	}
+
+	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+		/* file layout may have changed */
+		ci->i_layout = grant->layout;
+		/* size/truncate_seq? */
+		queue_trunc = ceph_fill_file_size(inode, issued,
+					le32_to_cpu(grant->truncate_seq),
+					le64_to_cpu(grant->truncate_size),
+					size);
+		/* max size increase? */
+		if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+			dout("max_size %lld -> %llu\n",
+			     ci->i_max_size, max_size);
+			ci->i_max_size = max_size;
+			if (max_size >= ci->i_wanted_max_size) {
+				ci->i_wanted_max_size = 0;  /* reset */
+				ci->i_requested_max_size = 0;
+			}
+			wake = true;
+		}
+	}
+
+	/* check cap bits */
+	wanted = __ceph_caps_wanted(ci);
+	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
+	dout(" my wanted = %s, used = %s, dirty %s\n",
+	     ceph_cap_string(wanted),
+	     ceph_cap_string(used),
+	     ceph_cap_string(dirty));
+	if (wanted != le32_to_cpu(grant->wanted)) {
+		dout("mds wanted %s -> %s\n",
+		     ceph_cap_string(le32_to_cpu(grant->wanted)),
+		     ceph_cap_string(wanted));
+		/* imported cap may not have correct mds_wanted */
+		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+			check_caps = 1;
+	}
+
+	/* revocation, grant, or no-op? */
+	if (cap->issued & ~newcaps) {
+		int revoking = cap->issued & ~newcaps;
+
+		dout("revocation: %s -> %s (revoking %s)\n",
+		     ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps),
+		     ceph_cap_string(revoking));
+		if (revoking & used & CEPH_CAP_FILE_BUFFER)
+			writeback = true;  /* initiate writeback; will delay ack */
+		else if (revoking == CEPH_CAP_FILE_CACHE &&
+			 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
+			 queue_invalidate)
+			; /* do nothing yet, invalidation will be queued */
+		else if (cap == ci->i_auth_cap)
+			check_caps = 1; /* check auth cap only */
+		else
+			check_caps = 2; /* check all caps */
+		cap->issued = newcaps;
+		cap->implemented |= newcaps;
+	} else if (cap->issued == newcaps) {
+		dout("caps unchanged: %s -> %s\n",
+		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+	} else {
+		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps));
+		/* non-auth MDS is revoking the newly grant caps ? */
+		if (cap == ci->i_auth_cap &&
+		    __ceph_caps_revoking_other(ci, cap, newcaps))
+		    check_caps = 2;
+
+		cap->issued = newcaps;
+		cap->implemented |= newcaps; /* add bits only, to
+					      * avoid stepping on a
+					      * pending revocation */
+		wake = true;
+	}
+	BUG_ON(cap->issued & ~cap->implemented);
+
+	if (inline_version > 0 && inline_version >= ci->i_inline_version) {
+		ci->i_inline_version = inline_version;
+		if (ci->i_inline_version != CEPH_INLINE_NONE &&
+		    (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
+			fill_inline = true;
+	}
+
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+		kick_flushing_inode_caps(mdsc, session, inode);
+		up_read(&mdsc->snap_rwsem);
+		if (newcaps & ~issued)
+			wake = true;
+	}
+
+	if (fill_inline)
+		ceph_fill_inline_data(inode, NULL, inline_data, inline_len);
+
+	if (queue_trunc) {
+		ceph_queue_vmtruncate(inode);
+		ceph_queue_revalidate(inode);
+	} else if (queue_revalidate)
+		ceph_queue_revalidate(inode);
+
+	if (writeback)
+		/*
+		 * queue inode for writeback: we can't actually call
+		 * filemap_write_and_wait, etc. from message handler
+		 * context.
+		 */
+		ceph_queue_writeback(inode);
+	if (queue_invalidate)
+		ceph_queue_invalidate(inode);
+	if (deleted_inode)
+		invalidate_aliases(inode);
+	if (wake)
+		wake_up_all(&ci->i_cap_wq);
+
+	if (check_caps == 1)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+				session);
+	else if (check_caps == 2)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+	else
+		mutex_unlock(&session->s_mutex);
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
+				 struct ceph_mds_caps *m,
+				 struct ceph_mds_session *session,
+				 struct ceph_cap *cap)
+	__releases(ci->i_ceph_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	unsigned seq = le32_to_cpu(m->seq);
+	int dirty = le32_to_cpu(m->dirty);
+	int cleaned = 0;
+	int drop = 0;
+	int i;
+
+	for (i = 0; i < CEPH_CAP_BITS; i++)
+		if ((dirty & (1 << i)) &&
+		    (u16)flush_tid == ci->i_cap_flush_tid[i])
+			cleaned |= 1 << i;
+
+	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+	     " flushing %s -> %s\n",
+	     inode, session->s_mds, seq, ceph_cap_string(dirty),
+	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+		goto out;
+
+	ci->i_flushing_caps &= ~cleaned;
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	if (ci->i_flushing_caps == 0) {
+		list_del_init(&ci->i_flushing_item);
+		if (!list_empty(&session->s_cap_flushing))
+			dout(" mds%d still flushing cap on %p\n",
+			     session->s_mds,
+			     &list_entry(session->s_cap_flushing.next,
+					 struct ceph_inode_info,
+					 i_flushing_item)->vfs_inode);
+		mdsc->num_cap_flushing--;
+		wake_up_all(&mdsc->cap_flushing_wq);
+		dout(" inode %p now !flushing\n", inode);
+
+		if (ci->i_dirty_caps == 0) {
+			dout(" inode %p now clean\n", inode);
+			BUG_ON(!list_empty(&ci->i_dirty_item));
+			drop = 1;
+			if (ci->i_wrbuffer_ref_head == 0) {
+				BUG_ON(!ci->i_head_snapc);
+				ceph_put_snap_context(ci->i_head_snapc);
+				ci->i_head_snapc = NULL;
+			}
+		} else {
+			BUG_ON(list_empty(&ci->i_dirty_item));
+		}
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+	wake_up_all(&ci->i_cap_wq);
+
+out:
+	spin_unlock(&ci->i_ceph_lock);
+	if (drop)
+		iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller hold s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
+				     struct ceph_mds_caps *m,
+				     struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 follows = le64_to_cpu(m->snap_follows);
+	struct ceph_cap_snap *capsnap;
+	int drop = 0;
+
+	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+	     inode, ci, session->s_mds, follows);
+
+	spin_lock(&ci->i_ceph_lock);
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		if (capsnap->follows == follows) {
+			if (capsnap->flush_tid != flush_tid) {
+				dout(" cap_snap %p follows %lld tid %lld !="
+				     " %lld\n", capsnap, follows,
+				     flush_tid, capsnap->flush_tid);
+				break;
+			}
+			WARN_ON(capsnap->dirty_pages || capsnap->writing);
+			dout(" removing %p cap_snap %p follows %lld\n",
+			     inode, capsnap, follows);
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+			drop = 1;
+			break;
+		} else {
+			dout(" skipping cap_snap %p follows %lld\n",
+			     capsnap, capsnap->follows);
+		}
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	if (drop)
+		iput(inode);
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller hold s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+			     struct ceph_mds_caps *trunc,
+			     struct ceph_mds_session *session)
+	__releases(ci->i_ceph_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	int seq = le32_to_cpu(trunc->seq);
+	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+	u64 size = le64_to_cpu(trunc->size);
+	int implemented = 0;
+	int dirty = __ceph_caps_dirty(ci);
+	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
+	int queue_trunc = 0;
+
+	issued |= implemented | dirty;
+
+	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+	     inode, mds, seq, truncate_size, truncate_seq);
+	queue_trunc = ceph_fill_file_size(inode, issued,
+					  truncate_seq, truncate_size, size);
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (queue_trunc) {
+		ceph_queue_vmtruncate(inode);
+		ceph_fscache_invalidate(inode);
+	}
+}
+
+/*
+ * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
+ * different one.  If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+			      struct ceph_mds_cap_peer *ph,
+			      struct ceph_mds_session *session)
+{
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_session *tsession = NULL;
+	struct ceph_cap *cap, *tcap, *new_cap = NULL;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 t_cap_id;
+	unsigned mseq = le32_to_cpu(ex->migrate_seq);
+	unsigned t_seq, t_mseq;
+	int target, issued;
+	int mds = session->s_mds;
+
+	if (ph) {
+		t_cap_id = le64_to_cpu(ph->cap_id);
+		t_seq = le32_to_cpu(ph->seq);
+		t_mseq = le32_to_cpu(ph->mseq);
+		target = le32_to_cpu(ph->mds);
+	} else {
+		t_cap_id = t_seq = t_mseq = 0;
+		target = -1;
+	}
+
+	dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
+	     inode, ci, mds, mseq, target);
+retry:
+	spin_lock(&ci->i_ceph_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
+		goto out_unlock;
+
+	if (target < 0) {
+		__ceph_remove_cap(cap, false);
+		goto out_unlock;
+	}
+
+	/*
+	 * now we know we haven't received the cap import message yet
+	 * because the exported cap still exist.
+	 */
+
+	issued = cap->issued;
+	WARN_ON(issued != cap->implemented);
+
+	tcap = __get_cap_for_mds(ci, target);
+	if (tcap) {
+		/* already have caps from the target */
+		if (tcap->cap_id != t_cap_id ||
+		    ceph_seq_cmp(tcap->seq, t_seq) < 0) {
+			dout(" updating import cap %p mds%d\n", tcap, target);
+			tcap->cap_id = t_cap_id;
+			tcap->seq = t_seq - 1;
+			tcap->issue_seq = t_seq - 1;
+			tcap->mseq = t_mseq;
+			tcap->issued |= issued;
+			tcap->implemented |= issued;
+			if (cap == ci->i_auth_cap)
+				ci->i_auth_cap = tcap;
+			if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+				spin_lock(&mdsc->cap_dirty_lock);
+				list_move_tail(&ci->i_flushing_item,
+					       &tcap->session->s_cap_flushing);
+				spin_unlock(&mdsc->cap_dirty_lock);
+			}
+		}
+		__ceph_remove_cap(cap, false);
+		goto out_unlock;
+	} else if (tsession) {
+		/* add placeholder for the export tagert */
+		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+
+		__ceph_remove_cap(cap, false);
+		goto out_unlock;
+	}
+
+	spin_unlock(&ci->i_ceph_lock);
+	mutex_unlock(&session->s_mutex);
+
+	/* open target session */
+	tsession = ceph_mdsc_open_export_target_session(mdsc, target);
+	if (!IS_ERR(tsession)) {
+		if (mds > target) {
+			mutex_lock(&session->s_mutex);
+			mutex_lock_nested(&tsession->s_mutex,
+					  SINGLE_DEPTH_NESTING);
+		} else {
+			mutex_lock(&tsession->s_mutex);
+			mutex_lock_nested(&session->s_mutex,
+					  SINGLE_DEPTH_NESTING);
+		}
+		ceph_add_cap_releases(mdsc, tsession);
+		new_cap = ceph_get_cap(mdsc, NULL);
+	} else {
+		WARN_ON(1);
+		tsession = NULL;
+		target = -1;
+	}
+	goto retry;
+
+out_unlock:
+	spin_unlock(&ci->i_ceph_lock);
+	mutex_unlock(&session->s_mutex);
+	if (tsession) {
+		mutex_unlock(&tsession->s_mutex);
+		ceph_put_mds_session(tsession);
+	}
+	if (new_cap)
+		ceph_put_cap(mdsc, new_cap);
+}
+
+/*
+ * Handle cap IMPORT.
+ *
+ * caller holds s_mutex. acquires i_ceph_lock
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+			      struct inode *inode, struct ceph_mds_caps *im,
+			      struct ceph_mds_cap_peer *ph,
+			      struct ceph_mds_session *session,
+			      struct ceph_cap **target_cap, int *old_issued)
+	__acquires(ci->i_ceph_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap, *ocap, *new_cap = NULL;
+	int mds = session->s_mds;
+	int issued;
+	unsigned caps = le32_to_cpu(im->caps);
+	unsigned wanted = le32_to_cpu(im->wanted);
+	unsigned seq = le32_to_cpu(im->seq);
+	unsigned mseq = le32_to_cpu(im->migrate_seq);
+	u64 realmino = le64_to_cpu(im->realm);
+	u64 cap_id = le64_to_cpu(im->cap_id);
+	u64 p_cap_id;
+	int peer;
+
+	if (ph) {
+		p_cap_id = le64_to_cpu(ph->cap_id);
+		peer = le32_to_cpu(ph->mds);
+	} else {
+		p_cap_id = 0;
+		peer = -1;
+	}
+
+	dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
+	     inode, ci, mds, mseq, peer);
+
+retry:
+	spin_lock(&ci->i_ceph_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (!cap) {
+		if (!new_cap) {
+			spin_unlock(&ci->i_ceph_lock);
+			new_cap = ceph_get_cap(mdsc, NULL);
+			goto retry;
+		}
+		cap = new_cap;
+	} else {
+		if (new_cap) {
+			ceph_put_cap(mdsc, new_cap);
+			new_cap = NULL;
+		}
+	}
+
+	__ceph_caps_issued(ci, &issued);
+	issued |= __ceph_caps_dirty(ci);
+
+	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
+
+	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
+	if (ocap && ocap->cap_id == p_cap_id) {
+		dout(" remove export cap %p mds%d flags %d\n",
+		     ocap, peer, ph->flags);
+		if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
+		    (ocap->seq != le32_to_cpu(ph->seq) ||
+		     ocap->mseq != le32_to_cpu(ph->mseq))) {
+			pr_err("handle_cap_import: mismatched seq/mseq: "
+			       "ino (%llx.%llx) mds%d seq %d mseq %d "
+			       "importer mds%d has peer seq %d mseq %d\n",
+			       ceph_vinop(inode), peer, ocap->seq,
+			       ocap->mseq, mds, le32_to_cpu(ph->seq),
+			       le32_to_cpu(ph->mseq));
+		}
+		__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+	}
+
+	/* make sure we re-request max_size, if necessary */
+	ci->i_wanted_max_size = 0;
+	ci->i_requested_max_size = 0;
+
+	*old_issued = issued;
+	*target_cap = cap;
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+		      struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	struct super_block *sb = mdsc->fsc->sb;
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_cap *cap;
+	struct ceph_mds_caps *h;
+	struct ceph_mds_cap_peer *peer = NULL;
+	struct ceph_snap_realm *realm;
+	int mds = session->s_mds;
+	int op, issued;
+	u32 seq, mseq;
+	struct ceph_vino vino;
+	u64 cap_id;
+	u64 size, max_size;
+	u64 tid;
+	u64 inline_version = 0;
+	void *inline_data = NULL;
+	u32  inline_len = 0;
+	void *snaptrace;
+	size_t snaptrace_len;
+	void *p, *end;
+
+	dout("handle_caps from mds%d\n", mds);
+
+	/* decode */
+	end = msg->front.iov_base + msg->front.iov_len;
+	tid = le64_to_cpu(msg->hdr.tid);
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	h = msg->front.iov_base;
+	op = le32_to_cpu(h->op);
+	vino.ino = le64_to_cpu(h->ino);
+	vino.snap = CEPH_NOSNAP;
+	cap_id = le64_to_cpu(h->cap_id);
+	seq = le32_to_cpu(h->seq);
+	mseq = le32_to_cpu(h->migrate_seq);
+	size = le64_to_cpu(h->size);
+	max_size = le64_to_cpu(h->max_size);
+
+	snaptrace = h + 1;
+	snaptrace_len = le32_to_cpu(h->snap_trace_len);
+	p = snaptrace + snaptrace_len;
+
+	if (le16_to_cpu(msg->hdr.version) >= 2) {
+		u32 flock_len;
+		ceph_decode_32_safe(&p, end, flock_len, bad);
+		if (p + flock_len > end)
+			goto bad;
+		p += flock_len;
+	}
+
+	if (le16_to_cpu(msg->hdr.version) >= 3) {
+		if (op == CEPH_CAP_OP_IMPORT) {
+			if (p + sizeof(*peer) > end)
+				goto bad;
+			peer = p;
+			p += sizeof(*peer);
+		} else if (op == CEPH_CAP_OP_EXPORT) {
+			/* recorded in unused fields */
+			peer = (void *)&h->size;
+		}
+	}
+
+	if (le16_to_cpu(msg->hdr.version) >= 4) {
+		ceph_decode_64_safe(&p, end, inline_version, bad);
+		ceph_decode_32_safe(&p, end, inline_len, bad);
+		if (p + inline_len > end)
+			goto bad;
+		inline_data = p;
+		p += inline_len;
+	}
+
+	/* lookup ino */
+	inode = ceph_find_inode(sb, vino);
+	ci = ceph_inode(inode);
+	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+	     vino.snap, inode);
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+	     (unsigned)seq);
+
+	if (op == CEPH_CAP_OP_IMPORT)
+		ceph_add_cap_releases(mdsc, session);
+
+	if (!inode) {
+		dout(" i don't have ino %llx\n", vino.ino);
+
+		if (op == CEPH_CAP_OP_IMPORT) {
+			spin_lock(&session->s_cap_lock);
+			__queue_cap_release(session, vino.ino, cap_id,
+					    mseq, seq);
+			spin_unlock(&session->s_cap_lock);
+		}
+		goto flush_cap_releases;
+	}
+
+	/* these will work even if we don't have a cap yet */
+	switch (op) {
+	case CEPH_CAP_OP_FLUSHSNAP_ACK:
+		handle_cap_flushsnap_ack(inode, tid, h, session);
+		goto done;
+
+	case CEPH_CAP_OP_EXPORT:
+		handle_cap_export(inode, h, peer, session);
+		goto done_unlocked;
+
+	case CEPH_CAP_OP_IMPORT:
+		realm = NULL;
+		if (snaptrace_len) {
+			down_write(&mdsc->snap_rwsem);
+			ceph_update_snap_trace(mdsc, snaptrace,
+					       snaptrace + snaptrace_len,
+					       false, &realm);
+			downgrade_write(&mdsc->snap_rwsem);
+		} else {
+			down_read(&mdsc->snap_rwsem);
+		}
+		handle_cap_import(mdsc, inode, h, peer, session,
+				  &cap, &issued);
+		handle_cap_grant(mdsc, inode, h,
+				 inline_version, inline_data, inline_len,
+				 msg->middle, session, cap, issued);
+		if (realm)
+			ceph_put_snap_realm(mdsc, realm);
+		goto done_unlocked;
+	}
+
+	/* the rest require a cap */
+	spin_lock(&ci->i_ceph_lock);
+	cap = __get_cap_for_mds(ceph_inode(inode), mds);
+	if (!cap) {
+		dout(" no cap on %p ino %llx.%llx from mds%d\n",
+		     inode, ceph_ino(inode), ceph_snap(inode), mds);
+		spin_unlock(&ci->i_ceph_lock);
+		goto flush_cap_releases;
+	}
+
+	/* note that each of these drops i_ceph_lock for us */
+	switch (op) {
+	case CEPH_CAP_OP_REVOKE:
+	case CEPH_CAP_OP_GRANT:
+		__ceph_caps_issued(ci, &issued);
+		issued |= __ceph_caps_dirty(ci);
+		handle_cap_grant(mdsc, inode, h,
+				 inline_version, inline_data, inline_len,
+				 msg->middle, session, cap, issued);
+		goto done_unlocked;
+
+	case CEPH_CAP_OP_FLUSH_ACK:
+		handle_cap_flush_ack(inode, tid, h, session, cap);
+		break;
+
+	case CEPH_CAP_OP_TRUNC:
+		handle_cap_trunc(inode, h, session);
+		break;
+
+	default:
+		spin_unlock(&ci->i_ceph_lock);
+		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+		       ceph_cap_op_name(op));
+	}
+
+	goto done;
+
+flush_cap_releases:
+	/*
+	 * send any full release message to try to move things
+	 * along for the mds (who clearly thinks we still have this
+	 * cap).
+	 */
+	ceph_add_cap_releases(mdsc, session);
+	ceph_send_cap_releases(mdsc, session);
+
+done:
+	mutex_unlock(&session->s_mutex);
+done_unlocked:
+	iput(inode);
+	return;
+
+bad:
+	pr_err("ceph_handle_caps: corrupt message\n");
+	ceph_msg_dump(msg);
+	return;
+}
+
+/*
+ * Delayed work handler to process end of delayed cap release LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	int flags = CHECK_CAPS_NODELAY;
+
+	dout("check_delayed_caps\n");
+	while (1) {
+		spin_lock(&mdsc->cap_delay_lock);
+		if (list_empty(&mdsc->cap_delay_list))
+			break;
+		ci = list_first_entry(&mdsc->cap_delay_list,
+				      struct ceph_inode_info,
+				      i_cap_delay_list);
+		if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+		    time_before(jiffies, ci->i_hold_caps_max))
+			break;
+		list_del_init(&ci->i_cap_delay_list);
+		spin_unlock(&mdsc->cap_delay_lock);
+		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
+		ceph_check_caps(ci, flags, NULL);
+	}
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	struct inode *inode;
+
+	dout("flush_dirty_caps\n");
+	spin_lock(&mdsc->cap_dirty_lock);
+	while (!list_empty(&mdsc->cap_dirty)) {
+		ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+				      i_dirty_item);
+		inode = &ci->vfs_inode;
+		ihold(inode);
+		dout("flush_dirty_caps %p\n", inode);
+		spin_unlock(&mdsc->cap_dirty_lock);
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+		iput(inode);
+		spin_lock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+	dout("flush_dirty_caps done\n");
+}
+
+/*
+ * Drop open file reference.  If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0;
+
+	spin_lock(&ci->i_ceph_lock);
+	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
+	     ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
+	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
+	if (--ci->i_nr_by_mode[fmode] == 0)
+		last++;
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (last && ci->i_vino.snap == CEPH_NOSNAP)
+		ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
+int ceph_encode_inode_release(void **p, struct inode *inode,
+			      int mds, int drop, int unless, int force)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap;
+	struct ceph_mds_request_release *rel = *p;
+	int used, dirty;
+	int ret = 0;
+
+	spin_lock(&ci->i_ceph_lock);
+	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
+
+	dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
+	     inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
+	     ceph_cap_string(unless));
+
+	/* only drop unused, clean caps */
+	drop &= ~(used | dirty);
+
+	cap = __get_cap_for_mds(ci, mds);
+	if (cap && __cap_is_valid(cap)) {
+		if (force ||
+		    ((cap->issued & drop) &&
+		     (cap->issued & unless) == 0)) {
+			if ((cap->issued & drop) &&
+			    (cap->issued & unless) == 0) {
+				int wanted = __ceph_caps_wanted(ci);
+				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+					wanted |= cap->mds_wanted;
+				dout("encode_inode_release %p cap %p "
+				     "%s -> %s, wanted %s -> %s\n", inode, cap,
+				     ceph_cap_string(cap->issued),
+				     ceph_cap_string(cap->issued & ~drop),
+				     ceph_cap_string(cap->mds_wanted),
+				     ceph_cap_string(wanted));
+
+				cap->issued &= ~drop;
+				cap->implemented &= ~drop;
+				cap->mds_wanted = wanted;
+			} else {
+				dout("encode_inode_release %p cap %p %s"
+				     " (force)\n", inode, cap,
+				     ceph_cap_string(cap->issued));
+			}
+
+			rel->ino = cpu_to_le64(ceph_ino(inode));
+			rel->cap_id = cpu_to_le64(cap->cap_id);
+			rel->seq = cpu_to_le32(cap->seq);
+			rel->issue_seq = cpu_to_le32(cap->issue_seq);
+			rel->mseq = cpu_to_le32(cap->mseq);
+			rel->caps = cpu_to_le32(cap->implemented);
+			rel->wanted = cpu_to_le32(cap->mds_wanted);
+			rel->dname_len = 0;
+			rel->dname_seq = 0;
+			*p += sizeof(*rel);
+			ret = 1;
+		} else {
+			dout("encode_inode_release %p cap %p %s\n",
+			     inode, cap, ceph_cap_string(cap->issued));
+		}
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	return ret;
+}
+
+int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+			       int mds, int drop, int unless)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct ceph_mds_request_release *rel = *p;
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	int force = 0;
+	int ret;
+
+	/*
+	 * force an record for the directory caps if we have a dentry lease.
+	 * this is racy (can't take i_ceph_lock and d_lock together), but it
+	 * doesn't have to be perfect; the mds will revoke anything we don't
+	 * release.
+	 */
+	spin_lock(&dentry->d_lock);
+	if (di->lease_session && di->lease_session->s_mds == mds)
+		force = 1;
+	spin_unlock(&dentry->d_lock);
+
+	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+
+	spin_lock(&dentry->d_lock);
+	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+		dout("encode_dentry_release %p mds%d seq %d\n",
+		     dentry, mds, (int)di->lease_seq);
+		rel->dname_len = cpu_to_le32(dentry->d_name.len);
+		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+		*p += dentry->d_name.len;
+		rel->dname_seq = cpu_to_le32(di->lease_seq);
+		__ceph_mdsc_drop_dentry_lease(dentry);
+	}
+	spin_unlock(&dentry->d_lock);
+	return ret;
+}
diff --git a/kernel/fs/ceph/ceph_frag.c b/kernel/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000..bdce8b1fb
--- /dev/null
+++ b/kernel/fs/ceph/ceph_frag.c
@@ -0,0 +1,22 @@
+/*
+ * Ceph 'frag' type
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+	unsigned va = ceph_frag_value(a);
+	unsigned vb = ceph_frag_value(b);
+	if (va < vb)
+		return -1;
+	if (va > vb)
+		return 1;
+	va = ceph_frag_bits(a);
+	vb = ceph_frag_bits(b);
+	if (va < vb)
+		return -1;
+	if (va > vb)
+		return 1;
+	return 0;
+}
diff --git a/kernel/fs/ceph/debugfs.c b/kernel/fs/ceph/debugfs.c
new file mode 100644
index 000000000..31f831471
--- /dev/null
+++ b/kernel/fs/ceph/debugfs.c
@@ -0,0 +1,321 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#include "super.h"
+
+#ifdef CONFIG_DEBUG_FS
+
+#include "mds_client.h"
+
+static int mdsmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_fs_client *fsc = s->private;
+
+	if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+	seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
+	seq_printf(s, "session_timeout %d\n",
+		       fsc->mdsc->mdsmap->m_session_timeout);
+	seq_printf(s, "session_autoclose %d\n",
+		       fsc->mdsc->mdsmap->m_session_autoclose);
+	for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
+		struct ceph_entity_addr *addr =
+			&fsc->mdsc->mdsmap->m_info[i].addr;
+		int state = fsc->mdsc->mdsmap->m_info[i].state;
+
+		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+			       ceph_pr_addr(&addr->in_addr),
+			       ceph_mds_state_name(state));
+	}
+	return 0;
+}
+
+/*
+ * mdsc debugfs
+ */
+static int mdsc_show(struct seq_file *s, void *p)
+{
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	struct rb_node *rp;
+	int pathlen;
+	u64 pathbase;
+	char *path;
+
+	mutex_lock(&mdsc->mutex);
+	for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
+		req = rb_entry(rp, struct ceph_mds_request, r_node);
+
+		if (req->r_request && req->r_session)
+			seq_printf(s, "%lld\tmds%d\t", req->r_tid,
+				   req->r_session->s_mds);
+		else if (!req->r_request)
+			seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+		else
+			seq_printf(s, "%lld\t(no session)\t", req->r_tid);
+
+		seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
+
+		if (req->r_got_unsafe)
+			seq_puts(s, "\t(unsafe)");
+		else
+			seq_puts(s, "\t");
+
+		if (req->r_inode) {
+			seq_printf(s, " #%llx", ceph_ino(req->r_inode));
+		} else if (req->r_dentry) {
+			path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+						    &pathbase, 0);
+			if (IS_ERR(path))
+				path = NULL;
+			spin_lock(&req->r_dentry->d_lock);
+			seq_printf(s, " #%llx/%pd (%s)",
+				   ceph_ino(d_inode(req->r_dentry->d_parent)),
+				   req->r_dentry,
+				   path ? path : "");
+			spin_unlock(&req->r_dentry->d_lock);
+			kfree(path);
+		} else if (req->r_path1) {
+			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
+				   req->r_path1);
+		} else {
+			seq_printf(s, " #%llx", req->r_ino1.ino);
+		}
+
+		if (req->r_old_dentry) {
+			path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+						    &pathbase, 0);
+			if (IS_ERR(path))
+				path = NULL;
+			spin_lock(&req->r_old_dentry->d_lock);
+			seq_printf(s, " #%llx/%pd (%s)",
+				   req->r_old_dentry_dir ?
+				   ceph_ino(req->r_old_dentry_dir) : 0,
+				   req->r_old_dentry,
+				   path ? path : "");
+			spin_unlock(&req->r_old_dentry->d_lock);
+			kfree(path);
+		} else if (req->r_path2) {
+			if (req->r_ino2.ino)
+				seq_printf(s, " #%llx/%s", req->r_ino2.ino,
+					   req->r_path2);
+			else
+				seq_printf(s, " %s", req->r_path2);
+		}
+
+		seq_puts(s, "\n");
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	return 0;
+}
+
+static int caps_show(struct seq_file *s, void *p)
+{
+	struct ceph_fs_client *fsc = s->private;
+	int total, avail, used, reserved, min;
+
+	ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
+	seq_printf(s, "total\t\t%d\n"
+		   "avail\t\t%d\n"
+		   "used\t\t%d\n"
+		   "reserved\t%d\n"
+		   "min\t%d\n",
+		   total, avail, used, reserved, min);
+	return 0;
+}
+
+static int dentry_lru_show(struct seq_file *s, void *ptr)
+{
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_dentry_info *di;
+
+	spin_lock(&mdsc->dentry_lru_lock);
+	list_for_each_entry(di, &mdsc->dentry_lru, lru) {
+		struct dentry *dentry = di->dentry;
+		seq_printf(s, "%p %p\t%pd\n",
+			   di, dentry, dentry);
+	}
+	spin_unlock(&mdsc->dentry_lru_lock);
+
+	return 0;
+}
+
+static int mds_sessions_show(struct seq_file *s, void *ptr)
+{
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_auth_client *ac = fsc->client->monc.auth;
+	struct ceph_options *opt = fsc->client->options;
+	int mds = -1;
+
+	mutex_lock(&mdsc->mutex);
+
+	/* The 'num' portion of an 'entity name' */
+	seq_printf(s, "global_id %llu\n", ac->global_id);
+
+	/* The -o name mount argument */
+	seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : "");
+
+	/* The list of MDS session rank+state */
+	for (mds = 0; mds < mdsc->max_sessions; mds++) {
+		struct ceph_mds_session *session =
+			__ceph_lookup_mds_session(mdsc, mds);
+		if (!session) {
+			continue;
+		}
+		mutex_unlock(&mdsc->mutex);
+		seq_printf(s, "mds.%d %s\n",
+				session->s_mds,
+				ceph_session_state_name(session->s_state));
+
+		ceph_put_mds_session(session);
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
+
+
+/*
+ * debugfs
+ */
+static int congestion_kb_set(void *data, u64 val)
+{
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+
+	fsc->mount_options->congestion_kb = (int)val;
+	return 0;
+}
+
+static int congestion_kb_get(void *data, u64 *val)
+{
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+
+	*val = (u64)fsc->mount_options->congestion_kb;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
+			congestion_kb_set, "%llu\n");
+
+
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
+{
+	dout("ceph_fs_debugfs_cleanup\n");
+	debugfs_remove(fsc->debugfs_bdi);
+	debugfs_remove(fsc->debugfs_congestion_kb);
+	debugfs_remove(fsc->debugfs_mdsmap);
+	debugfs_remove(fsc->debugfs_mds_sessions);
+	debugfs_remove(fsc->debugfs_caps);
+	debugfs_remove(fsc->debugfs_mdsc);
+	debugfs_remove(fsc->debugfs_dentry_lru);
+}
+
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+{
+	char name[100];
+	int err = -ENOMEM;
+
+	dout("ceph_fs_debugfs_init\n");
+	BUG_ON(!fsc->client->debugfs_dir);
+	fsc->debugfs_congestion_kb =
+		debugfs_create_file("writeback_congestion_kb",
+				    0600,
+				    fsc->client->debugfs_dir,
+				    fsc,
+				    &congestion_kb_fops);
+	if (!fsc->debugfs_congestion_kb)
+		goto out;
+
+	snprintf(name, sizeof(name), "../../bdi/%s",
+		 dev_name(fsc->backing_dev_info.dev));
+	fsc->debugfs_bdi =
+		debugfs_create_symlink("bdi",
+				       fsc->client->debugfs_dir,
+				       name);
+	if (!fsc->debugfs_bdi)
+		goto out;
+
+	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
+					0600,
+					fsc->client->debugfs_dir,
+					fsc,
+					&mdsmap_show_fops);
+	if (!fsc->debugfs_mdsmap)
+		goto out;
+
+	fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
+					0600,
+					fsc->client->debugfs_dir,
+					fsc,
+					&mds_sessions_show_fops);
+	if (!fsc->debugfs_mds_sessions)
+		goto out;
+
+	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+						0600,
+						fsc->client->debugfs_dir,
+						fsc,
+						&mdsc_show_fops);
+	if (!fsc->debugfs_mdsc)
+		goto out;
+
+	fsc->debugfs_caps = debugfs_create_file("caps",
+						   0400,
+						   fsc->client->debugfs_dir,
+						   fsc,
+						   &caps_show_fops);
+	if (!fsc->debugfs_caps)
+		goto out;
+
+	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+					0600,
+					fsc->client->debugfs_dir,
+					fsc,
+					&dentry_lru_show_fops);
+	if (!fsc->debugfs_dentry_lru)
+		goto out;
+
+	return 0;
+
+out:
+	ceph_fs_debugfs_cleanup(fsc);
+	return err;
+}
+
+
+#else  /* CONFIG_DEBUG_FS */
+
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+{
+	return 0;
+}
+
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
diff --git a/kernel/fs/ceph/dir.c b/kernel/fs/ceph/dir.c
new file mode 100644
index 000000000..4248307fe
--- /dev/null
+++ b/kernel/fs/ceph/dir.c
@@ -0,0 +1,1411 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/spinlock.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Directory operations: readdir, lookup, create, link, unlink,
+ * rename, etc.
+ */
+
+/*
+ * Ceph MDS operations are specified in terms of a base ino and
+ * relative path.  Thus, the client can specify an operation on a
+ * specific inode (e.g., a getattr due to fstat(2)), or as a path
+ * relative to, say, the root directory.
+ *
+ * Normally, we limit ourselves to strict inode ops (no path component)
+ * or dentry operations (a single path component relative to an ino).  The
+ * exception to this is open_root_dentry(), which will open the mount
+ * point by name.
+ */
+
+const struct dentry_operations ceph_dentry_ops;
+
+/*
+ * Initialize ceph dentry state.
+ */
+int ceph_init_dentry(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+
+	if (dentry->d_fsdata)
+		return 0;
+
+	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
+	if (!di)
+		return -ENOMEM;          /* oh well */
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_fsdata) {
+		/* lost a race */
+		kmem_cache_free(ceph_dentry_cachep, di);
+		goto out_unlock;
+	}
+
+	if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP)
+		d_set_d_op(dentry, &ceph_dentry_ops);
+	else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
+		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
+	else
+		d_set_d_op(dentry, &ceph_snap_dentry_ops);
+
+	di->dentry = dentry;
+	di->lease_session = NULL;
+	dentry->d_time = jiffies;
+	/* avoid reordering d_fsdata setup so that the check above is safe */
+	smp_mb();
+	dentry->d_fsdata = di;
+	ceph_dentry_lru_add(dentry);
+out_unlock:
+	spin_unlock(&dentry->d_lock);
+	return 0;
+}
+
+struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
+{
+	struct inode *inode = NULL;
+
+	if (!dentry)
+		return NULL;
+
+	spin_lock(&dentry->d_lock);
+	if (!IS_ROOT(dentry)) {
+		inode = d_inode(dentry->d_parent);
+		ihold(inode);
+	}
+	spin_unlock(&dentry->d_lock);
+	return inode;
+}
+
+
+/*
+ * for readdir, we encode the directory frag and offset within that
+ * frag into f_pos.
+ */
+static unsigned fpos_frag(loff_t p)
+{
+	return p >> 32;
+}
+static unsigned fpos_off(loff_t p)
+{
+	return p & 0xffffffff;
+}
+
+static int fpos_cmp(loff_t l, loff_t r)
+{
+	int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
+	if (v)
+		return v;
+	return (int)(fpos_off(l) - fpos_off(r));
+}
+
+/*
+ * When possible, we try to satisfy a readdir by peeking at the
+ * dcache.  We make this work by carefully ordering dentries on
+ * d_child when we initially get results back from the MDS, and
+ * falling back to a "normal" sync readdir if any dentries in the dir
+ * are dropped.
+ *
+ * Complete dir indicates that we have all dentries in the dir.  It is
+ * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
+ * the MDS if/when the directory is modified).
+ */
+static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
+			    u32 shared_gen)
+{
+	struct ceph_file_info *fi = file->private_data;
+	struct dentry *parent = file->f_path.dentry;
+	struct inode *dir = d_inode(parent);
+	struct list_head *p;
+	struct dentry *dentry, *last;
+	struct ceph_dentry_info *di;
+	int err = 0;
+
+	/* claim ref on last dentry we returned */
+	last = fi->dentry;
+	fi->dentry = NULL;
+
+	dout("__dcache_readdir %p v%u at %llu (last %p)\n",
+	     dir, shared_gen, ctx->pos, last);
+
+	spin_lock(&parent->d_lock);
+
+	/* start at beginning? */
+	if (ctx->pos == 2 || last == NULL ||
+	    fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
+		if (list_empty(&parent->d_subdirs))
+			goto out_unlock;
+		p = parent->d_subdirs.prev;
+		dout(" initial p %p/%p\n", p->prev, p->next);
+	} else {
+		p = last->d_child.prev;
+	}
+
+more:
+	dentry = list_entry(p, struct dentry, d_child);
+	di = ceph_dentry(dentry);
+	while (1) {
+		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+		     d_unhashed(dentry) ? "!hashed" : "hashed",
+		     parent->d_subdirs.prev, parent->d_subdirs.next);
+		if (p == &parent->d_subdirs) {
+			fi->flags |= CEPH_F_ATEND;
+			goto out_unlock;
+		}
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+		if (di->lease_shared_gen == shared_gen &&
+		    !d_unhashed(dentry) && d_really_is_positive(dentry) &&
+		    ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
+		    ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
+		    fpos_cmp(ctx->pos, di->offset) <= 0)
+			break;
+		dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry,
+		     dentry, di->offset,
+		     ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
+		     !d_inode(dentry) ? " null" : "");
+		spin_unlock(&dentry->d_lock);
+		p = p->prev;
+		dentry = list_entry(p, struct dentry, d_child);
+		di = ceph_dentry(dentry);
+	}
+
+	dget_dlock(dentry);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
+
+	/* make sure a dentry wasn't dropped while we didn't have parent lock */
+	if (!ceph_dir_is_complete_ordered(dir)) {
+		dout(" lost dir complete on %p; falling back to mds\n", dir);
+		dput(dentry);
+		err = -EAGAIN;
+		goto out;
+	}
+
+	dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+	     dentry, dentry, d_inode(dentry));
+	if (!dir_emit(ctx, dentry->d_name.name,
+		      dentry->d_name.len,
+		      ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino),
+		      d_inode(dentry)->i_mode >> 12)) {
+		if (last) {
+			/* remember our position */
+			fi->dentry = last;
+			fi->next_offset = fpos_off(di->offset);
+		}
+		dput(dentry);
+		return 0;
+	}
+
+	ctx->pos = di->offset + 1;
+
+	if (last)
+		dput(last);
+	last = dentry;
+
+	spin_lock(&parent->d_lock);
+	p = p->prev;	/* advance to next dentry */
+	goto more;
+
+out_unlock:
+	spin_unlock(&parent->d_lock);
+out:
+	if (last)
+		dput(last);
+	return err;
+}
+
+/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+			    int len)
+{
+	kfree(fi->last_name);
+	fi->last_name = kmalloc(len+1, GFP_NOFS);
+	if (!fi->last_name)
+		return -ENOMEM;
+	memcpy(fi->last_name, name, len);
+	fi->last_name[len] = 0;
+	dout("note_last_dentry '%s'\n", fi->last_name);
+	return 0;
+}
+
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	unsigned frag = fpos_frag(ctx->pos);
+	int off = fpos_off(ctx->pos);
+	int err;
+	u32 ftype;
+	struct ceph_mds_reply_info_parsed *rinfo;
+
+	dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
+	if (fi->flags & CEPH_F_ATEND)
+		return 0;
+
+	/* always start with . and .. */
+	if (ctx->pos == 0) {
+		dout("readdir off 0 -> '.'\n");
+		if (!dir_emit(ctx, ".", 1, 
+			    ceph_translate_ino(inode->i_sb, inode->i_ino),
+			    inode->i_mode >> 12))
+			return 0;
+		ctx->pos = 1;
+		off = 1;
+	}
+	if (ctx->pos == 1) {
+		ino_t ino = parent_ino(file->f_path.dentry);
+		dout("readdir off 1 -> '..'\n");
+		if (!dir_emit(ctx, "..", 2,
+			    ceph_translate_ino(inode->i_sb, ino),
+			    inode->i_mode >> 12))
+			return 0;
+		ctx->pos = 2;
+		off = 2;
+	}
+
+	/* can we use the dcache? */
+	spin_lock(&ci->i_ceph_lock);
+	if ((ctx->pos == 2 || fi->dentry) &&
+	    ceph_test_mount_opt(fsc, DCACHE) &&
+	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
+	    ceph_snap(inode) != CEPH_SNAPDIR &&
+	    __ceph_dir_is_complete_ordered(ci) &&
+	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		u32 shared_gen = ci->i_shared_gen;
+		spin_unlock(&ci->i_ceph_lock);
+		err = __dcache_readdir(file, ctx, shared_gen);
+		if (err != -EAGAIN)
+			return err;
+		frag = fpos_frag(ctx->pos);
+		off = fpos_off(ctx->pos);
+	} else {
+		spin_unlock(&ci->i_ceph_lock);
+	}
+	if (fi->dentry) {
+		err = note_last_dentry(fi, fi->dentry->d_name.name,
+				       fi->dentry->d_name.len);
+		if (err)
+			return err;
+		dput(fi->dentry);
+		fi->dentry = NULL;
+	}
+
+	/* proceed with a normal readdir */
+
+	if (ctx->pos == 2) {
+		/* note dir version at start of readdir so we can tell
+		 * if any dentries get dropped */
+		fi->dir_release_count = atomic_read(&ci->i_release_count);
+		fi->dir_ordered_count = ci->i_ordered_count;
+	}
+
+more:
+	/* do we have the correct frag content buffered? */
+	if (fi->frag != frag || fi->last_readdir == NULL) {
+		struct ceph_mds_request *req;
+		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
+			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
+
+		/* discard old result, if any */
+		if (fi->last_readdir) {
+			ceph_mdsc_put_request(fi->last_readdir);
+			fi->last_readdir = NULL;
+		}
+
+		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
+		     ceph_vinop(inode), frag, fi->last_name);
+		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+		err = ceph_alloc_readdir_reply_buffer(req, inode);
+		if (err) {
+			ceph_mdsc_put_request(req);
+			return err;
+		}
+		/* hints to request -> mds selection code */
+		req->r_direct_mode = USE_AUTH_MDS;
+		req->r_direct_hash = ceph_frag_value(frag);
+		req->r_direct_is_hash = true;
+		if (fi->last_name) {
+			req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+			if (!req->r_path2) {
+				ceph_mdsc_put_request(req);
+				return -ENOMEM;
+			}
+		}
+		req->r_readdir_offset = fi->next_offset;
+		req->r_args.readdir.frag = cpu_to_le32(frag);
+
+		req->r_inode = inode;
+		ihold(inode);
+		req->r_dentry = dget(file->f_path.dentry);
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		if (err < 0) {
+			ceph_mdsc_put_request(req);
+			return err;
+		}
+		dout("readdir got and parsed readdir result=%d"
+		     " on frag %x, end=%d, complete=%d\n", err, frag,
+		     (int)req->r_reply_info.dir_end,
+		     (int)req->r_reply_info.dir_complete);
+
+		if (!req->r_did_prepopulate) {
+			dout("readdir !did_prepopulate");
+			/* preclude from marking dir complete */
+			fi->dir_release_count--;
+		}
+
+		/* note next offset and last dentry name */
+		rinfo = &req->r_reply_info;
+		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+			frag = le32_to_cpu(rinfo->dir_dir->frag);
+			if (ceph_frag_is_leftmost(frag))
+				fi->next_offset = 2;
+			else
+				fi->next_offset = 0;
+			off = fi->next_offset;
+		}
+		fi->frag = frag;
+		fi->offset = fi->next_offset;
+		fi->last_readdir = req;
+
+		if (req->r_reply_info.dir_end) {
+			kfree(fi->last_name);
+			fi->last_name = NULL;
+			if (ceph_frag_is_rightmost(frag))
+				fi->next_offset = 2;
+			else
+				fi->next_offset = 0;
+		} else {
+			err = note_last_dentry(fi,
+				       rinfo->dir_dname[rinfo->dir_nr-1],
+				       rinfo->dir_dname_len[rinfo->dir_nr-1]);
+			if (err)
+				return err;
+			fi->next_offset += rinfo->dir_nr;
+		}
+	}
+
+	rinfo = &fi->last_readdir->r_reply_info;
+	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
+	     rinfo->dir_nr, off, fi->offset);
+
+	ctx->pos = ceph_make_fpos(frag, off);
+	while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
+		struct ceph_mds_reply_inode *in =
+			rinfo->dir_in[off - fi->offset].in;
+		struct ceph_vino vino;
+		ino_t ino;
+
+		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
+		     off, off - fi->offset, rinfo->dir_nr, ctx->pos,
+		     rinfo->dir_dname_len[off - fi->offset],
+		     rinfo->dir_dname[off - fi->offset], in);
+		BUG_ON(!in);
+		ftype = le32_to_cpu(in->mode) >> 12;
+		vino.ino = le64_to_cpu(in->ino);
+		vino.snap = le64_to_cpu(in->snapid);
+		ino = ceph_vino_to_ino(vino);
+		if (!dir_emit(ctx,
+			    rinfo->dir_dname[off - fi->offset],
+			    rinfo->dir_dname_len[off - fi->offset],
+			    ceph_translate_ino(inode->i_sb, ino), ftype)) {
+			dout("filldir stopping us...\n");
+			return 0;
+		}
+		off++;
+		ctx->pos++;
+	}
+
+	if (fi->last_name) {
+		ceph_mdsc_put_request(fi->last_readdir);
+		fi->last_readdir = NULL;
+		goto more;
+	}
+
+	/* more frags? */
+	if (!ceph_frag_is_rightmost(frag)) {
+		frag = ceph_frag_next(frag);
+		off = 0;
+		ctx->pos = ceph_make_fpos(frag, off);
+		dout("readdir next frag is %x\n", frag);
+		goto more;
+	}
+	fi->flags |= CEPH_F_ATEND;
+
+	/*
+	 * if dir_release_count still matches the dir, no dentries
+	 * were released during the whole readdir, and we should have
+	 * the complete dir contents in our cache.
+	 */
+	spin_lock(&ci->i_ceph_lock);
+	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
+		if (ci->i_ordered_count == fi->dir_ordered_count)
+			dout(" marking %p complete and ordered\n", inode);
+		else
+			dout(" marking %p complete\n", inode);
+		__ceph_dir_set_complete(ci, fi->dir_release_count,
+					fi->dir_ordered_count);
+	}
+	spin_unlock(&ci->i_ceph_lock);
+
+	dout("readdir %p file %p done.\n", inode, file);
+	return 0;
+}
+
+static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
+{
+	if (fi->last_readdir) {
+		ceph_mdsc_put_request(fi->last_readdir);
+		fi->last_readdir = NULL;
+	}
+	kfree(fi->last_name);
+	fi->last_name = NULL;
+	if (ceph_frag_is_leftmost(frag))
+		fi->next_offset = 2;  /* compensate for . and .. */
+	else
+		fi->next_offset = 0;
+	if (fi->dentry) {
+		dput(fi->dentry);
+		fi->dentry = NULL;
+	}
+	fi->flags &= ~CEPH_F_ATEND;
+}
+
+static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file->f_mapping->host;
+	loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
+	loff_t retval;
+
+	mutex_lock(&inode->i_mutex);
+	retval = -EINVAL;
+	switch (whence) {
+	case SEEK_END:
+		offset += inode->i_size + 2;   /* FIXME */
+		break;
+	case SEEK_CUR:
+		offset += file->f_pos;
+	case SEEK_SET:
+		break;
+	default:
+		goto out;
+	}
+
+	if (offset >= 0) {
+		if (offset != file->f_pos) {
+			file->f_pos = offset;
+			file->f_version = 0;
+			fi->flags &= ~CEPH_F_ATEND;
+		}
+		retval = offset;
+
+		/*
+		 * discard buffered readdir content on seekdir(0), or
+		 * seek to new frag, or seek prior to current chunk.
+		 */
+		if (offset == 0 ||
+		    fpos_frag(offset) != fi->frag ||
+		    fpos_off(offset) < fi->offset) {
+			dout("dir_llseek dropping %p content\n", file);
+			reset_readdir(fi, fpos_frag(offset));
+		}
+
+		/* bump dir_release_count if we did a forward seek */
+		if (fpos_cmp(offset, old_offset) > 0)
+			fi->dir_release_count--;
+	}
+out:
+	mutex_unlock(&inode->i_mutex);
+	return retval;
+}
+
+/*
+ * Handle lookups for the hidden .snap directory.
+ */
+int ceph_handle_snapdir(struct ceph_mds_request *req,
+			struct dentry *dentry, int err)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+	struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
+
+	/* .snap dir? */
+	if (err == -ENOENT &&
+	    ceph_snap(parent) == CEPH_NOSNAP &&
+	    strcmp(dentry->d_name.name,
+		   fsc->mount_options->snapdir_name) == 0) {
+		struct inode *inode = ceph_get_snapdir(parent);
+		dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
+		     dentry, dentry, inode);
+		BUG_ON(!d_unhashed(dentry));
+		d_add(dentry, inode);
+		err = 0;
+	}
+	return err;
+}
+
+/*
+ * Figure out final result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+				  struct dentry *dentry, int err)
+{
+	if (err == -ENOENT) {
+		/* no trace? */
+		err = 0;
+		if (!req->r_reply_info.head->is_dentry) {
+			dout("ENOENT and no trace, dentry %p inode %p\n",
+			     dentry, d_inode(dentry));
+			if (d_really_is_positive(dentry)) {
+				d_drop(dentry);
+				err = -ENOENT;
+			} else {
+				d_add(dentry, NULL);
+			}
+		}
+	}
+	if (err)
+		dentry = ERR_PTR(err);
+	else if (dentry != req->r_dentry)
+		dentry = dget(req->r_dentry);   /* we got spliced */
+	else
+		dentry = NULL;
+	return dentry;
+}
+
+static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+{
+	return ceph_ino(inode) == CEPH_INO_ROOT &&
+		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
+}
+
+/*
+ * Look up a single dir entry.  If there is a lookup intent, inform
+ * the MDS so that it gets our 'caps wanted' value in a single op.
+ */
+static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned int flags)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	int op;
+	int err;
+
+	dout("lookup %p dentry %p '%pd'\n",
+	     dir, dentry, dentry);
+
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	err = ceph_init_dentry(dentry);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	/* can we conclude ENOENT locally? */
+	if (d_really_is_negative(dentry)) {
+		struct ceph_inode_info *ci = ceph_inode(dir);
+		struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+		spin_lock(&ci->i_ceph_lock);
+		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+		if (strncmp(dentry->d_name.name,
+			    fsc->mount_options->snapdir_name,
+			    dentry->d_name.len) &&
+		    !is_root_ceph_dentry(dir, dentry) &&
+		    ceph_test_mount_opt(fsc, DCACHE) &&
+		    __ceph_dir_is_complete(ci) &&
+		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+			spin_unlock(&ci->i_ceph_lock);
+			dout(" dir %p complete, -ENOENT\n", dir);
+			d_add(dentry, NULL);
+			di->lease_shared_gen = ci->i_shared_gen;
+			return NULL;
+		}
+		spin_unlock(&ci->i_ceph_lock);
+	}
+
+	op = ceph_snap(dir) == CEPH_SNAPDIR ?
+		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_CAST(req);
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	/* we only need inode linkage */
+	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+	req->r_locked_dir = dir;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	err = ceph_handle_snapdir(req, dentry, err);
+	dentry = ceph_finish_lookup(req, dentry, err);
+	ceph_mdsc_put_request(req);  /* will dput(dentry) */
+	dout("lookup result=%p\n", dentry);
+	return dentry;
+}
+
+/*
+ * If we do a create but get no trace back from the MDS, follow up with
+ * a lookup (the VFS expects us to link up the provided dentry).
+ */
+int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
+{
+	struct dentry *result = ceph_lookup(dir, dentry, 0);
+
+	if (result && !IS_ERR(result)) {
+		/*
+		 * We created the item, then did a lookup, and found
+		 * it was already linked to another inode we already
+		 * had in our cache (and thus got spliced). To not
+		 * confuse VFS (especially when inode is a directory),
+		 * we don't link our dentry to that inode, return an
+		 * error instead.
+		 *
+		 * This event should be rare and it happens only when
+		 * we talk to old MDS. Recent MDS does not send traceless
+		 * reply for request that creates new inode.
+		 */
+		d_drop(result);
+		return -ESTALE;
+	}
+	return PTR_ERR(result);
+}
+
+static int ceph_mknod(struct inode *dir, struct dentry *dentry,
+		      umode_t mode, dev_t rdev)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_acls_info acls = {};
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	err = ceph_pre_init_acls(dir, &mode, &acls);
+	if (err < 0)
+		return err;
+
+	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
+	     dir, dentry, mode, rdev);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_args.mknod.mode = cpu_to_le32(mode);
+	req->r_args.mknod.rdev = cpu_to_le32(rdev);
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	if (acls.pagelist) {
+		req->r_pagelist = acls.pagelist;
+		acls.pagelist = NULL;
+	}
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+out:
+	if (!err)
+		ceph_init_inode_acls(d_inode(dentry), &acls);
+	else
+		d_drop(dentry);
+	ceph_release_acls_info(&acls);
+	return err;
+}
+
+static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		       bool excl)
+{
+	return ceph_mknod(dir, dentry, mode, 0);
+}
+
+static int ceph_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *dest)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_path2 = kstrdup(dest, GFP_NOFS);
+	if (!req->r_path2) {
+		err = -ENOMEM;
+		ceph_mdsc_put_request(req);
+		goto out;
+	}
+	req->r_locked_dir = dir;
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+out:
+	if (err)
+		d_drop(dentry);
+	return err;
+}
+
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_acls_info acls = {};
+	int err = -EROFS;
+	int op;
+
+	if (ceph_snap(dir) == CEPH_SNAPDIR) {
+		/* mkdir .snap/foo is a MKSNAP */
+		op = CEPH_MDS_OP_MKSNAP;
+		dout("mksnap dir %p snap '%pd' dn %p\n", dir,
+		     dentry, dentry);
+	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
+		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
+		op = CEPH_MDS_OP_MKDIR;
+	} else {
+		goto out;
+	}
+
+	mode |= S_IFDIR;
+	err = ceph_pre_init_acls(dir, &mode, &acls);
+	if (err < 0)
+		goto out;
+
+	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_args.mkdir.mode = cpu_to_le32(mode);
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	if (acls.pagelist) {
+		req->r_pagelist = acls.pagelist;
+		acls.pagelist = NULL;
+	}
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err &&
+	    !req->r_reply_info.head->is_target &&
+	    !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+out:
+	if (!err)
+		ceph_init_inode_acls(d_inode(dentry), &acls);
+	else
+		d_drop(dentry);
+	ceph_release_acls_info(&acls);
+	return err;
+}
+
+static int ceph_link(struct dentry *old_dentry, struct inode *dir,
+		     struct dentry *dentry)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("link in dir %p old_dentry %p dentry %p\n", dir,
+	     old_dentry, dentry);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		d_drop(dentry);
+		return PTR_ERR(req);
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_old_dentry = dget(old_dentry);
+	req->r_locked_dir = dir;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	/* release LINK_SHARED on source inode (mds will lock it) */
+	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (err) {
+		d_drop(dentry);
+	} else if (!req->r_reply_info.head->is_dentry) {
+		ihold(d_inode(old_dentry));
+		d_instantiate(dentry, d_inode(old_dentry));
+	}
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps.  If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+static int drop_caps_for_unlink(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+	spin_lock(&ci->i_ceph_lock);
+	if (inode->i_nlink == 1) {
+		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+		ci->i_ceph_flags |= CEPH_I_NODELAY;
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	return drop;
+}
+
+/*
+ * rmdir and unlink are differ only by the metadata op code
+ */
+static int ceph_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct inode *inode = d_inode(dentry);
+	struct ceph_mds_request *req;
+	int err = -EROFS;
+	int op;
+
+	if (ceph_snap(dir) == CEPH_SNAPDIR) {
+		/* rmdir .snap/foo is RMSNAP */
+		dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
+		op = CEPH_MDS_OP_RMSNAP;
+	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
+		dout("unlink/rmdir dir %p dn %p inode %p\n",
+		     dir, dentry, inode);
+		op = d_is_dir(dentry) ?
+			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
+	} else
+		goto out;
+	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	req->r_inode_drop = drop_caps_for_unlink(inode);
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		d_delete(dentry);
+	ceph_mdsc_put_request(req);
+out:
+	return err;
+}
+
+static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	int op = CEPH_MDS_OP_RENAME;
+	int err;
+
+	if (ceph_snap(old_dir) != ceph_snap(new_dir))
+		return -EXDEV;
+	if (ceph_snap(old_dir) != CEPH_NOSNAP) {
+		if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
+			op = CEPH_MDS_OP_RENAMESNAP;
+		else
+			return -EROFS;
+	}
+	dout("rename dir %p dentry %p to dir %p dentry %p\n",
+	     old_dir, old_dentry, new_dir, new_dentry);
+	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	ihold(old_dir);
+	req->r_dentry = dget(new_dentry);
+	req->r_num_caps = 2;
+	req->r_old_dentry = dget(old_dentry);
+	req->r_old_dentry_dir = old_dir;
+	req->r_locked_dir = new_dir;
+	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	/* release LINK_RDCACHE on source inode (mds will lock it) */
+	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+	if (d_really_is_positive(new_dentry))
+		req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry));
+	err = ceph_mdsc_do_request(mdsc, old_dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry) {
+		/*
+		 * Normally d_move() is done by fill_trace (called by
+		 * do_request, above).  If there is no trace, we need
+		 * to do it here.
+		 */
+
+		d_move(old_dentry, new_dentry);
+
+		/* ensure target dentry is invalidated, despite
+		   rehashing bug in vfs_rename_dir */
+		ceph_invalidate_dentry_lease(new_dentry);
+
+		/* d_move screws up sibling dentries' offsets */
+		ceph_dir_clear_complete(old_dir);
+		ceph_dir_clear_complete(new_dir);
+
+	}
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Ensure a dentry lease will no longer revalidate.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_time = jiffies;
+	ceph_dentry(dentry)->lease_shared_gen = 0;
+	spin_unlock(&dentry->d_lock);
+}
+
+/*
+ * Check if dentry lease is valid.  If not, delete the lease.  Try to
+ * renew if the least is more than half up.
+ */
+static int dentry_lease_is_valid(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+	struct ceph_mds_session *s;
+	int valid = 0;
+	u32 gen;
+	unsigned long ttl;
+	struct ceph_mds_session *session = NULL;
+	struct inode *dir = NULL;
+	u32 seq = 0;
+
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	if (di->lease_session) {
+		s = di->lease_session;
+		spin_lock(&s->s_gen_ttl_lock);
+		gen = s->s_cap_gen;
+		ttl = s->s_cap_ttl;
+		spin_unlock(&s->s_gen_ttl_lock);
+
+		if (di->lease_gen == gen &&
+		    time_before(jiffies, dentry->d_time) &&
+		    time_before(jiffies, ttl)) {
+			valid = 1;
+			if (di->lease_renew_after &&
+			    time_after(jiffies, di->lease_renew_after)) {
+				/* we should renew */
+				dir = d_inode(dentry->d_parent);
+				session = ceph_get_mds_session(s);
+				seq = di->lease_seq;
+				di->lease_renew_after = 0;
+				di->lease_renew_from = jiffies;
+			}
+		}
+	}
+	spin_unlock(&dentry->d_lock);
+
+	if (session) {
+		ceph_mdsc_lease_send_msg(session, dir, dentry,
+					 CEPH_MDS_LEASE_RENEW, seq);
+		ceph_put_mds_session(session);
+	}
+	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+	return valid;
+}
+
+/*
+ * Check if directory-wide content lease/cap is valid.
+ */
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+{
+	struct ceph_inode_info *ci = ceph_inode(dir);
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	int valid = 0;
+
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->i_shared_gen == di->lease_shared_gen)
+		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+	spin_unlock(&ci->i_ceph_lock);
+	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
+	     dir, (unsigned)ci->i_shared_gen, dentry,
+	     (unsigned)di->lease_shared_gen, valid);
+	return valid;
+}
+
+/*
+ * Check if cached dentry can be trusted.
+ */
+static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	int valid = 0;
+	struct inode *dir;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
+	     dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
+
+	dir = ceph_get_dentry_parent_inode(dentry);
+
+	/* always trust cached snapped dentries, snapdir dentry */
+	if (ceph_snap(dir) != CEPH_NOSNAP) {
+		dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
+		     dentry, d_inode(dentry));
+		valid = 1;
+	} else if (d_really_is_positive(dentry) &&
+		   ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
+		valid = 1;
+	} else if (dentry_lease_is_valid(dentry) ||
+		   dir_lease_is_valid(dir, dentry)) {
+		if (d_really_is_positive(dentry))
+			valid = ceph_is_any_caps(d_inode(dentry));
+		else
+			valid = 1;
+	}
+
+	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+	if (valid) {
+		ceph_dentry_lru_touch(dentry);
+	} else {
+		ceph_dir_clear_complete(dir);
+	}
+	iput(dir);
+	return valid;
+}
+
+/*
+ * Release our ceph_dentry_info.
+ */
+static void ceph_d_release(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+	dout("d_release %p\n", dentry);
+	ceph_dentry_lru_del(dentry);
+	if (di->lease_session)
+		ceph_put_mds_session(di->lease_session);
+	kmem_cache_free(ceph_dentry_cachep, di);
+	dentry->d_fsdata = NULL;
+}
+
+static int ceph_snapdir_d_revalidate(struct dentry *dentry,
+					  unsigned int flags)
+{
+	/*
+	 * Eventually, we'll want to revalidate snapped metadata
+	 * too... probably...
+	 */
+	return 1;
+}
+
+/*
+ * When the VFS prunes a dentry from the cache, we need to clear the
+ * complete flag on the parent directory.
+ *
+ * Called under dentry->d_lock.
+ */
+static void ceph_d_prune(struct dentry *dentry)
+{
+	dout("ceph_d_prune %p\n", dentry);
+
+	/* do we have a valid parent? */
+	if (IS_ROOT(dentry))
+		return;
+
+	/* if we are not hashed, we don't affect dir's completeness */
+	if (d_unhashed(dentry))
+		return;
+
+	/*
+	 * we hold d_lock, so d_parent is stable, and d_fsdata is never
+	 * cleared until d_release
+	 */
+	ceph_dir_clear_complete(d_inode(dentry->d_parent));
+}
+
+/*
+ * read() on a dir.  This weird interface hack only works if mounted
+ * with '-o dirstat'.
+ */
+static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
+			     loff_t *ppos)
+{
+	struct ceph_file_info *cf = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int left;
+	const int bufsize = 1024;
+
+	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+		return -EISDIR;
+
+	if (!cf->dir_info) {
+		cf->dir_info = kmalloc(bufsize, GFP_NOFS);
+		if (!cf->dir_info)
+			return -ENOMEM;
+		cf->dir_info_len =
+			snprintf(cf->dir_info, bufsize,
+				"entries:   %20lld\n"
+				" files:    %20lld\n"
+				" subdirs:  %20lld\n"
+				"rentries:  %20lld\n"
+				" rfiles:   %20lld\n"
+				" rsubdirs: %20lld\n"
+				"rbytes:    %20lld\n"
+				"rctime:    %10ld.%09ld\n",
+				ci->i_files + ci->i_subdirs,
+				ci->i_files,
+				ci->i_subdirs,
+				ci->i_rfiles + ci->i_rsubdirs,
+				ci->i_rfiles,
+				ci->i_rsubdirs,
+				ci->i_rbytes,
+				(long)ci->i_rctime.tv_sec,
+				(long)ci->i_rctime.tv_nsec);
+	}
+
+	if (*ppos >= cf->dir_info_len)
+		return 0;
+	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
+	left = copy_to_user(buf, cf->dir_info + *ppos, size);
+	if (left == size)
+		return -EFAULT;
+	*ppos += (size - left);
+	return size - left;
+}
+
+/*
+ * an fsync() on a dir will wait for any uncommitted directory
+ * operations to commit.
+ */
+static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync)
+{
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct list_head *head = &ci->i_unsafe_dirops;
+	struct ceph_mds_request *req;
+	u64 last_tid;
+	int ret = 0;
+
+	dout("dir_fsync %p\n", inode);
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
+	spin_lock(&ci->i_unsafe_lock);
+	if (list_empty(head))
+		goto out;
+
+	req = list_entry(head->prev,
+			 struct ceph_mds_request, r_unsafe_dir_item);
+	last_tid = req->r_tid;
+
+	do {
+		ceph_mdsc_get_request(req);
+		spin_unlock(&ci->i_unsafe_lock);
+
+		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
+		     inode, req->r_tid, last_tid);
+		if (req->r_timeout) {
+			unsigned long time_left = wait_for_completion_timeout(
+							&req->r_safe_completion,
+							req->r_timeout);
+			if (time_left > 0)
+				ret = 0;
+			else
+				ret = -EIO;  /* timed out */
+		} else {
+			wait_for_completion(&req->r_safe_completion);
+		}
+		ceph_mdsc_put_request(req);
+
+		spin_lock(&ci->i_unsafe_lock);
+		if (ret || list_empty(head))
+			break;
+		req = list_entry(head->next,
+				 struct ceph_mds_request, r_unsafe_dir_item);
+	} while (req->r_tid < last_tid);
+out:
+	spin_unlock(&ci->i_unsafe_lock);
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
+
+/*
+ * We maintain a private dentry LRU.
+ *
+ * FIXME: this needs to be changed to a per-mds lru to be useful.
+ */
+void ceph_dentry_lru_add(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
+	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_lru_lock);
+	list_add_tail(&di->lru, &mdsc->dentry_lru);
+	mdsc->num_dentry++;
+	spin_unlock(&mdsc->dentry_lru_lock);
+}
+
+void ceph_dentry_lru_touch(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
+	     di->offset);
+	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_lru_lock);
+	list_move_tail(&di->lru, &mdsc->dentry_lru);
+	spin_unlock(&mdsc->dentry_lru_lock);
+}
+
+void ceph_dentry_lru_del(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
+	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_lru_lock);
+	list_del_init(&di->lru);
+	mdsc->num_dentry--;
+	spin_unlock(&mdsc->dentry_lru_lock);
+}
+
+/*
+ * Return name hash for a given dentry.  This is dependent on
+ * the parent directory's hash function.
+ */
+unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
+{
+	struct ceph_inode_info *dci = ceph_inode(dir);
+
+	switch (dci->i_dir_layout.dl_dir_hash) {
+	case 0:	/* for backward compat */
+	case CEPH_STR_HASH_LINUX:
+		return dn->d_name.hash;
+
+	default:
+		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+				     dn->d_name.name, dn->d_name.len);
+	}
+}
+
+const struct file_operations ceph_dir_fops = {
+	.read = ceph_read_dir,
+	.iterate = ceph_readdir,
+	.llseek = ceph_dir_llseek,
+	.open = ceph_open,
+	.release = ceph_release,
+	.unlocked_ioctl = ceph_ioctl,
+	.fsync = ceph_dir_fsync,
+};
+
+const struct file_operations ceph_snapdir_fops = {
+	.iterate = ceph_readdir,
+	.llseek = ceph_dir_llseek,
+	.open = ceph_open,
+	.release = ceph_release,
+};
+
+const struct inode_operations ceph_dir_iops = {
+	.lookup = ceph_lookup,
+	.permission = ceph_permission,
+	.getattr = ceph_getattr,
+	.setattr = ceph_setattr,
+	.setxattr = ceph_setxattr,
+	.getxattr = ceph_getxattr,
+	.listxattr = ceph_listxattr,
+	.removexattr = ceph_removexattr,
+	.get_acl = ceph_get_acl,
+	.set_acl = ceph_set_acl,
+	.mknod = ceph_mknod,
+	.symlink = ceph_symlink,
+	.mkdir = ceph_mkdir,
+	.link = ceph_link,
+	.unlink = ceph_unlink,
+	.rmdir = ceph_unlink,
+	.rename = ceph_rename,
+	.create = ceph_create,
+	.atomic_open = ceph_atomic_open,
+};
+
+const struct inode_operations ceph_snapdir_iops = {
+	.lookup = ceph_lookup,
+	.permission = ceph_permission,
+	.getattr = ceph_getattr,
+	.mkdir = ceph_mkdir,
+	.rmdir = ceph_unlink,
+	.rename = ceph_rename,
+};
+
+const struct dentry_operations ceph_dentry_ops = {
+	.d_revalidate = ceph_d_revalidate,
+	.d_release = ceph_d_release,
+	.d_prune = ceph_d_prune,
+};
+
+const struct dentry_operations ceph_snapdir_dentry_ops = {
+	.d_revalidate = ceph_snapdir_d_revalidate,
+	.d_release = ceph_d_release,
+};
+
+const struct dentry_operations ceph_snap_dentry_ops = {
+	.d_release = ceph_d_release,
+	.d_prune = ceph_d_prune,
+};
diff --git a/kernel/fs/ceph/export.c b/kernel/fs/ceph/export.c
new file mode 100644
index 000000000..fe02ae7f0
--- /dev/null
+++ b/kernel/fs/ceph/export.c
@@ -0,0 +1,250 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/exportfs.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Basic fh
+ */
+struct ceph_nfs_fh {
+	u64 ino;
+} __attribute__ ((packed));
+
+/*
+ * Larger fh that includes parent ino.
+ */
+struct ceph_nfs_confh {
+	u64 ino, parent_ino;
+} __attribute__ ((packed));
+
+static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
+			  struct inode *parent_inode)
+{
+	int type;
+	struct ceph_nfs_fh *fh = (void *)rawfh;
+	struct ceph_nfs_confh *cfh = (void *)rawfh;
+	int connected_handle_length = sizeof(*cfh)/4;
+	int handle_length = sizeof(*fh)/4;
+
+	/* don't re-export snaps */
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EINVAL;
+
+	if (parent_inode && (*max_len < connected_handle_length)) {
+		*max_len = connected_handle_length;
+		return FILEID_INVALID;
+	} else if (*max_len < handle_length) {
+		*max_len = handle_length;
+		return FILEID_INVALID;
+	}
+
+	if (parent_inode) {
+		dout("encode_fh %llx with parent %llx\n",
+		     ceph_ino(inode), ceph_ino(parent_inode));
+		cfh->ino = ceph_ino(inode);
+		cfh->parent_ino = ceph_ino(parent_inode);
+		*max_len = connected_handle_length;
+		type = FILEID_INO32_GEN_PARENT;
+	} else {
+		dout("encode_fh %llx\n", ceph_ino(inode));
+		fh->ino = ceph_ino(inode);
+		*max_len = handle_length;
+		type = FILEID_INO32_GEN;
+	}
+	return type;
+}
+
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+{
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+	struct inode *inode;
+	struct dentry *dentry;
+	struct ceph_vino vino;
+	int err;
+
+	vino.ino = ino;
+	vino.snap = CEPH_NOSNAP;
+	inode = ceph_find_inode(sb, vino);
+	if (!inode) {
+		struct ceph_mds_request *req;
+
+		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+					       USE_ANY_MDS);
+		if (IS_ERR(req))
+			return ERR_CAST(req);
+
+		req->r_ino1 = vino;
+		req->r_num_caps = 1;
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		inode = req->r_target_inode;
+		if (inode)
+			ihold(inode);
+		ceph_mdsc_put_request(req);
+		if (!inode)
+			return ERR_PTR(-ESTALE);
+	}
+
+	dentry = d_obtain_alias(inode);
+	if (IS_ERR(dentry)) {
+		iput(inode);
+		return dentry;
+	}
+	err = ceph_init_dentry(dentry);
+	if (err < 0) {
+		dput(dentry);
+		return ERR_PTR(err);
+	}
+	dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
+	return dentry;
+}
+
+/*
+ * convert regular fh to dentry
+ */
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
+					struct fid *fid,
+					int fh_len, int fh_type)
+{
+	struct ceph_nfs_fh *fh = (void *)fid->raw;
+
+	if (fh_type != FILEID_INO32_GEN  &&
+	    fh_type != FILEID_INO32_GEN_PARENT)
+		return NULL;
+	if (fh_len < sizeof(*fh) / 4)
+		return NULL;
+
+	dout("fh_to_dentry %llx\n", fh->ino);
+	return __fh_to_dentry(sb, fh->ino);
+}
+
+static struct dentry *__get_parent(struct super_block *sb,
+				   struct dentry *child, u64 ino)
+{
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+	struct ceph_mds_request *req;
+	struct inode *inode;
+	struct dentry *dentry;
+	int err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
+				       USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_CAST(req);
+
+	if (child) {
+		req->r_inode = d_inode(child);
+		ihold(d_inode(child));
+	} else {
+		req->r_ino1 = (struct ceph_vino) {
+			.ino = ino,
+			.snap = CEPH_NOSNAP,
+		};
+	}
+	req->r_num_caps = 1;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	inode = req->r_target_inode;
+	if (inode)
+		ihold(inode);
+	ceph_mdsc_put_request(req);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	dentry = d_obtain_alias(inode);
+	if (IS_ERR(dentry)) {
+		iput(inode);
+		return dentry;
+	}
+	err = ceph_init_dentry(dentry);
+	if (err < 0) {
+		dput(dentry);
+		return ERR_PTR(err);
+	}
+	dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
+	     child ? ceph_ino(d_inode(child)) : ino,
+	     dentry, ceph_vinop(inode));
+	return dentry;
+}
+
+static struct dentry *ceph_get_parent(struct dentry *child)
+{
+	/* don't re-export snaps */
+	if (ceph_snap(d_inode(child)) != CEPH_NOSNAP)
+		return ERR_PTR(-EINVAL);
+
+	dout("get_parent %p ino %llx.%llx\n",
+	     child, ceph_vinop(d_inode(child)));
+	return __get_parent(child->d_sb, child, 0);
+}
+
+/*
+ * convert regular fh to parent
+ */
+static struct dentry *ceph_fh_to_parent(struct super_block *sb,
+					struct fid *fid,
+					int fh_len, int fh_type)
+{
+	struct ceph_nfs_confh *cfh = (void *)fid->raw;
+	struct dentry *dentry;
+
+	if (fh_type != FILEID_INO32_GEN_PARENT)
+		return NULL;
+	if (fh_len < sizeof(*cfh) / 4)
+		return NULL;
+
+	dout("fh_to_parent %llx\n", cfh->parent_ino);
+	dentry = __get_parent(sb, NULL, cfh->ino);
+	if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
+		dentry = __fh_to_dentry(sb, cfh->parent_ino);
+	return dentry;
+}
+
+static int ceph_get_name(struct dentry *parent, char *name,
+			 struct dentry *child)
+{
+	struct ceph_mds_client *mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	mdsc = ceph_inode_to_client(d_inode(child))->mdsc;
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
+				       USE_ANY_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	mutex_lock(&d_inode(parent)->i_mutex);
+
+	req->r_inode = d_inode(child);
+	ihold(d_inode(child));
+	req->r_ino2 = ceph_vino(d_inode(parent));
+	req->r_locked_dir = d_inode(parent);
+	req->r_num_caps = 2;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+
+	mutex_unlock(&d_inode(parent)->i_mutex);
+
+	if (!err) {
+		struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+		memcpy(name, rinfo->dname, rinfo->dname_len);
+		name[rinfo->dname_len] = 0;
+		dout("get_name %p ino %llx.%llx name %s\n",
+		     child, ceph_vinop(d_inode(child)), name);
+	} else {
+		dout("get_name %p ino %llx.%llx err %d\n",
+		     child, ceph_vinop(d_inode(child)), err);
+	}
+
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+const struct export_operations ceph_export_ops = {
+	.encode_fh = ceph_encode_fh,
+	.fh_to_dentry = ceph_fh_to_dentry,
+	.fh_to_parent = ceph_fh_to_parent,
+	.get_parent = ceph_get_parent,
+	.get_name = ceph_get_name,
+};
diff --git a/kernel/fs/ceph/file.c b/kernel/fs/ceph/file.c
new file mode 100644
index 000000000..3b6b522b4
--- /dev/null
+++ b/kernel/fs/ceph/file.c
@@ -0,0 +1,1344 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+
+/*
+ * Ceph file operations
+ *
+ * Implement basic open/close functionality, and implement
+ * read/write.
+ *
+ * We implement three modes of file I/O:
+ *  - buffered uses the generic_file_aio_{read,write} helpers
+ *
+ *  - synchronous is used when there is multi-client read/write
+ *    sharing, avoids the page cache, and synchronously waits for an
+ *    ack from the OSD.
+ *
+ *  - direct io takes the variant of the sync path that references
+ *    user pages directly.
+ *
+ * fsync() flushes and waits on dirty pages, but just queues metadata
+ * for writeback: since the MDS can recover size and mtime there is no
+ * need to wait for MDS acknowledgement.
+ */
+
+
+/*
+ * Prepare an open request.  Preallocate ceph_cap to avoid an
+ * inopportune ENOMEM later.
+ */
+static struct ceph_mds_request *
+prepare_open_request(struct super_block *sb, int flags, int create_mode)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	int want_auth = USE_ANY_MDS;
+	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
+
+	if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
+		want_auth = USE_AUTH_MDS;
+
+	req = ceph_mdsc_create_request(mdsc, op, want_auth);
+	if (IS_ERR(req))
+		goto out;
+	req->r_fmode = ceph_flags_to_mode(flags);
+	req->r_args.open.flags = cpu_to_le32(flags);
+	req->r_args.open.mode = cpu_to_le32(create_mode);
+out:
+	return req;
+}
+
+/*
+ * initialize private struct file data.
+ * if we fail, clean up by dropping fmode reference on the ceph_inode
+ */
+static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
+{
+	struct ceph_file_info *cf;
+	int ret = 0;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		/* First file open request creates the cookie, we want to keep
+		 * this cookie around for the filetime of the inode as not to
+		 * have to worry about fscache register / revoke / operation
+		 * races.
+		 *
+		 * Also, if we know the operation is going to invalidate data
+		 * (non readonly) just nuke the cache right away.
+		 */
+		ceph_fscache_register_inode_cookie(mdsc->fsc, ci);
+		if ((fmode & CEPH_FILE_MODE_WR))
+			ceph_fscache_invalidate(inode);
+	case S_IFDIR:
+		dout("init_file %p %p 0%o (regular)\n", inode, file,
+		     inode->i_mode);
+		cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+		if (cf == NULL) {
+			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+			return -ENOMEM;
+		}
+		cf->fmode = fmode;
+		cf->next_offset = 2;
+		file->private_data = cf;
+		BUG_ON(inode->i_fop->release != ceph_release);
+		break;
+
+	case S_IFLNK:
+		dout("init_file %p %p 0%o (symlink)\n", inode, file,
+		     inode->i_mode);
+		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		break;
+
+	default:
+		dout("init_file %p %p 0%o (special)\n", inode, file,
+		     inode->i_mode);
+		/*
+		 * we need to drop the open ref now, since we don't
+		 * have .release set to ceph_release.
+		 */
+		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		BUG_ON(inode->i_fop->release == ceph_release);
+
+		/* call the proper open fop */
+		ret = inode->i_fop->open(inode, file);
+	}
+	return ret;
+}
+
+/*
+ * If we already have the requisite capabilities, we can satisfy
+ * the open request locally (no need to request new caps from the
+ * MDS).  We do, however, need to inform the MDS (asynchronously)
+ * if our wanted caps set expands.
+ */
+int ceph_open(struct inode *inode, struct file *file)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_file_info *cf = file->private_data;
+	struct inode *parent_inode = NULL;
+	int err;
+	int flags, fmode, wanted;
+
+	if (cf) {
+		dout("open file %p is already opened\n", file);
+		return 0;
+	}
+
+	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
+	flags = file->f_flags & ~(O_CREAT|O_EXCL);
+	if (S_ISDIR(inode->i_mode))
+		flags = O_DIRECTORY;  /* mds likes to know */
+
+	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
+	     ceph_vinop(inode), file, flags, file->f_flags);
+	fmode = ceph_flags_to_mode(flags);
+	wanted = ceph_caps_for_mode(fmode);
+
+	/* snapped files are read-only */
+	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
+		return -EROFS;
+
+	/* trivially open snapdir */
+	if (ceph_snap(inode) == CEPH_SNAPDIR) {
+		spin_lock(&ci->i_ceph_lock);
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&ci->i_ceph_lock);
+		return ceph_init_file(inode, file, fmode);
+	}
+
+	/*
+	 * No need to block if we have caps on the auth MDS (for
+	 * write) or any MDS (for read).  Update wanted set
+	 * asynchronously.
+	 */
+	spin_lock(&ci->i_ceph_lock);
+	if (__ceph_is_any_real_caps(ci) &&
+	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
+		int mds_wanted = __ceph_caps_mds_wanted(ci);
+		int issued = __ceph_caps_issued(ci, NULL);
+
+		dout("open %p fmode %d want %s issued %s using existing\n",
+		     inode, fmode, ceph_cap_string(wanted),
+		     ceph_cap_string(issued));
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&ci->i_ceph_lock);
+
+		/* adjust wanted? */
+		if ((issued & wanted) != wanted &&
+		    (mds_wanted & wanted) != wanted &&
+		    ceph_snap(inode) != CEPH_SNAPDIR)
+			ceph_check_caps(ci, 0, NULL);
+
+		return ceph_init_file(inode, file, fmode);
+	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
+		   (ci->i_snap_caps & wanted) == wanted) {
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&ci->i_ceph_lock);
+		return ceph_init_file(inode, file, fmode);
+	}
+
+	spin_unlock(&ci->i_ceph_lock);
+
+	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+	req = prepare_open_request(inode->i_sb, flags, 0);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_inode = inode;
+	ihold(inode);
+
+	req->r_num_caps = 1;
+	if (flags & O_CREAT)
+		parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	iput(parent_inode);
+	if (!err)
+		err = ceph_init_file(inode, file, req->r_fmode);
+	ceph_mdsc_put_request(req);
+	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+out:
+	return err;
+}
+
+
+/*
+ * Do a lookup + open with a single request.  If we get a non-existent
+ * file or symlink, return 1 so the VFS can retry.
+ */
+int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
+		     struct file *file, unsigned flags, umode_t mode,
+		     int *opened)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	struct dentry *dn;
+	struct ceph_acls_info acls = {};
+	int err;
+
+	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
+	     dir, dentry, dentry,
+	     d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
+
+	if (dentry->d_name.len > NAME_MAX)
+		return -ENAMETOOLONG;
+
+	err = ceph_init_dentry(dentry);
+	if (err < 0)
+		return err;
+
+	if (flags & O_CREAT) {
+		err = ceph_pre_init_acls(dir, &mode, &acls);
+		if (err < 0)
+			return err;
+	}
+
+	/* do the open */
+	req = prepare_open_request(dir->i_sb, flags, mode);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out_acl;
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	if (flags & O_CREAT) {
+		req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+		if (acls.pagelist) {
+			req->r_pagelist = acls.pagelist;
+			acls.pagelist = NULL;
+		}
+	}
+	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
+	err = ceph_mdsc_do_request(mdsc,
+				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
+				   req);
+	err = ceph_handle_snapdir(req, dentry, err);
+	if (err)
+		goto out_req;
+
+	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+
+	if (d_unhashed(dentry)) {
+		dn = ceph_finish_lookup(req, dentry, err);
+		if (IS_ERR(dn))
+			err = PTR_ERR(dn);
+	} else {
+		/* we were given a hashed negative dentry */
+		dn = NULL;
+	}
+	if (err)
+		goto out_req;
+	if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
+		/* make vfs retry on splice, ENOENT, or symlink */
+		dout("atomic_open finish_no_open on dn %p\n", dn);
+		err = finish_no_open(file, dn);
+	} else {
+		dout("atomic_open finish_open on dn %p\n", dn);
+		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+			ceph_init_inode_acls(d_inode(dentry), &acls);
+			*opened |= FILE_CREATED;
+		}
+		err = finish_open(file, dentry, ceph_open, opened);
+	}
+out_req:
+	if (!req->r_err && req->r_target_inode)
+		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
+	ceph_mdsc_put_request(req);
+out_acl:
+	ceph_release_acls_info(&acls);
+	dout("atomic_open result=%d\n", err);
+	return err;
+}
+
+int ceph_release(struct inode *inode, struct file *file)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_file_info *cf = file->private_data;
+
+	dout("release inode %p file %p\n", inode, file);
+	ceph_put_fmode(ci, cf->fmode);
+	if (cf->last_readdir)
+		ceph_mdsc_put_request(cf->last_readdir);
+	kfree(cf->last_name);
+	kfree(cf->dir_info);
+	dput(cf->dentry);
+	kmem_cache_free(ceph_file_cachep, cf);
+
+	/* wake up anyone waiting for caps on this inode */
+	wake_up_all(&ci->i_cap_wq);
+	return 0;
+}
+
+enum {
+	CHECK_EOF = 1,
+	READ_INLINE = 2,
+};
+
+/*
+ * Read a range of bytes striped over one or more objects.  Iterate over
+ * objects we stripe over.  (That's not atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
+ */
+static int striped_read(struct inode *inode,
+			u64 off, u64 len,
+			struct page **pages, int num_pages,
+			int *checkeof, bool o_direct,
+			unsigned long buf_align)
+{
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 pos, this_len, left;
+	int io_align, page_align;
+	int pages_left;
+	int read;
+	struct page **page_pos;
+	int ret;
+	bool hit_stripe, was_short;
+
+	/*
+	 * we may need to do multiple reads.  not atomic, unfortunately.
+	 */
+	pos = off;
+	left = len;
+	page_pos = pages;
+	pages_left = num_pages;
+	read = 0;
+	io_align = off & ~PAGE_MASK;
+
+more:
+	if (o_direct)
+		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+	else
+		page_align = pos & ~PAGE_MASK;
+	this_len = left;
+	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+				  &ci->i_layout, pos, &this_len,
+				  ci->i_truncate_seq,
+				  ci->i_truncate_size,
+				  page_pos, pages_left, page_align);
+	if (ret == -ENOENT)
+		ret = 0;
+	hit_stripe = this_len < left;
+	was_short = ret >= 0 && ret < this_len;
+	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
+	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
+
+	if (ret >= 0) {
+		int didpages;
+		if (was_short && (pos + ret < inode->i_size)) {
+			int zlen = min(this_len - ret,
+				       inode->i_size - pos - ret);
+			int zoff = (o_direct ? buf_align : io_align) +
+				    read + ret;
+			dout(" zero gap %llu to %llu\n",
+				pos + ret, pos + ret + zlen);
+			ceph_zero_page_vector_range(zoff, zlen, pages);
+			ret += zlen;
+		}
+
+		didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
+		pos += ret;
+		read = pos - off;
+		left -= ret;
+		page_pos += didpages;
+		pages_left -= didpages;
+
+		/* hit stripe and need continue*/
+		if (left && hit_stripe && pos < inode->i_size)
+			goto more;
+	}
+
+	if (read > 0) {
+		ret = read;
+		/* did we bounce off eof? */
+		if (pos + left > inode->i_size)
+			*checkeof = CHECK_EOF;
+	}
+
+	dout("striped_read returns %d\n", ret);
+	return ret;
+}
+
+/*
+ * Completely synchronous read and write methods.  Direct from __user
+ * buffer to osd, or directly to user pages (if O_DIRECT).
+ *
+ * If the read spans object boundary, just do multiple reads.
+ */
+static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
+				int *checkeof)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct page **pages;
+	u64 off = iocb->ki_pos;
+	int num_pages, ret;
+	size_t len = iov_iter_count(i);
+
+	dout("sync_read on file %p %llu~%u %s\n", file, off,
+	     (unsigned)len,
+	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+	if (!len)
+		return 0;
+	/*
+	 * flush any page cache pages in this range.  this
+	 * will make concurrent normal and sync io slow,
+	 * but it will at least behave sensibly when they are
+	 * in sequence.
+	 */
+	ret = filemap_write_and_wait_range(inode->i_mapping, off,
+						off + len);
+	if (ret < 0)
+		return ret;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		while (iov_iter_count(i)) {
+			size_t start;
+			ssize_t n;
+
+			n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
+			if (n < 0)
+				return n;
+
+			num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+			ret = striped_read(inode, off, n,
+					   pages, num_pages, checkeof,
+					   1, start);
+
+			ceph_put_page_vector(pages, num_pages, true);
+
+			if (ret <= 0)
+				break;
+			off += ret;
+			iov_iter_advance(i, ret);
+			if (ret < n)
+				break;
+		}
+	} else {
+		num_pages = calc_pages_for(off, len);
+		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+		if (IS_ERR(pages))
+			return PTR_ERR(pages);
+		ret = striped_read(inode, off, len, pages,
+					num_pages, checkeof, 0, 0);
+		if (ret > 0) {
+			int l, k = 0;
+			size_t left = ret;
+
+			while (left) {
+				size_t page_off = off & ~PAGE_MASK;
+				size_t copy = min_t(size_t,
+						    PAGE_SIZE - page_off, left);
+				l = copy_page_to_iter(pages[k++], page_off,
+						      copy, i);
+				off += l;
+				left -= l;
+				if (l < copy)
+					break;
+			}
+		}
+		ceph_release_page_vector(pages, num_pages);
+	}
+
+	if (off > iocb->ki_pos) {
+		ret = off - iocb->ki_pos;
+		iocb->ki_pos = off;
+	}
+
+	dout("sync_read result %d\n", ret);
+	return ret;
+}
+
+/*
+ * Write commit request unsafe callback, called to tell us when a
+ * request is unsafe (that is, in flight--has been handed to the
+ * messenger to send to its target osd).  It is called again when
+ * we've received a response message indicating the request is
+ * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
+ * is completed early (and unsuccessfully) due to a timeout or
+ * interrupt.
+ *
+ * This is used if we requested both an ACK and ONDISK commit reply
+ * from the OSD.
+ */
+static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
+{
+	struct ceph_inode_info *ci = ceph_inode(req->r_inode);
+
+	dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
+		unsafe ? "un" : "");
+	if (unsafe) {
+		ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
+		spin_lock(&ci->i_unsafe_lock);
+		list_add_tail(&req->r_unsafe_item,
+			      &ci->i_unsafe_writes);
+		spin_unlock(&ci->i_unsafe_lock);
+	} else {
+		spin_lock(&ci->i_unsafe_lock);
+		list_del_init(&req->r_unsafe_item);
+		spin_unlock(&ci->i_unsafe_lock);
+		ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+	}
+}
+
+
+/*
+ * Synchronous write, straight from __user pointer or user pages.
+ *
+ * If write spans object boundary, just do multiple writes.  (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_snap_context *snapc;
+	struct ceph_vino vino;
+	struct ceph_osd_request *req;
+	struct page **pages;
+	int num_pages;
+	int written = 0;
+	int flags;
+	int check_caps = 0;
+	int ret;
+	struct timespec mtime = CURRENT_TIME;
+	size_t count = iov_iter_count(from);
+
+	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("sync_direct_write on file %p %lld~%u\n", file, pos,
+	     (unsigned)count);
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+	if (ret < 0)
+		return ret;
+
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    pos >> PAGE_CACHE_SHIFT,
+					    (pos + count) >> PAGE_CACHE_SHIFT);
+	if (ret < 0)
+		dout("invalidate_inode_pages2_range returned %d\n", ret);
+
+	flags = CEPH_OSD_FLAG_ORDERSNAP |
+		CEPH_OSD_FLAG_ONDISK |
+		CEPH_OSD_FLAG_WRITE;
+
+	while (iov_iter_count(from) > 0) {
+		u64 len = iov_iter_single_seg_count(from);
+		size_t start;
+		ssize_t n;
+
+		snapc = ci->i_snap_realm->cached_context;
+		vino = ceph_vino(inode);
+		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+					    vino, pos, &len, 0,
+					    2,/*include a 'startsync' command*/
+					    CEPH_OSD_OP_WRITE, flags, snapc,
+					    ci->i_truncate_seq,
+					    ci->i_truncate_size,
+					    false);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			break;
+		}
+
+		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+
+		n = iov_iter_get_pages_alloc(from, &pages, len, &start);
+		if (unlikely(n < 0)) {
+			ret = n;
+			ceph_osdc_put_request(req);
+			break;
+		}
+
+		num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+		/*
+		 * throw out any page cache pages in this range. this
+		 * may block.
+		 */
+		truncate_inode_pages_range(inode->i_mapping, pos,
+				   (pos+n) | (PAGE_CACHE_SIZE-1));
+		osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
+						false, false);
+
+		/* BUG_ON(vino.snap != CEPH_NOSNAP); */
+		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+
+		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+		if (!ret)
+			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+		ceph_put_page_vector(pages, num_pages, false);
+
+		ceph_osdc_put_request(req);
+		if (ret)
+			break;
+		pos += n;
+		written += n;
+		iov_iter_advance(from, n);
+
+		if (pos > i_size_read(inode)) {
+			check_caps = ceph_inode_set_size(inode, pos);
+			if (check_caps)
+				ceph_check_caps(ceph_inode(inode),
+						CHECK_CAPS_AUTHONLY,
+						NULL);
+		}
+	}
+
+	if (ret != -EOLDSNAPC && written > 0) {
+		iocb->ki_pos = pos;
+		ret = written;
+	}
+	return ret;
+}
+
+
+/*
+ * Synchronous write, straight from __user pointer or user pages.
+ *
+ * If write spans object boundary, just do multiple writes.  (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_snap_context *snapc;
+	struct ceph_vino vino;
+	struct ceph_osd_request *req;
+	struct page **pages;
+	u64 len;
+	int num_pages;
+	int written = 0;
+	int flags;
+	int check_caps = 0;
+	int ret;
+	struct timespec mtime = CURRENT_TIME;
+	size_t count = iov_iter_count(from);
+
+	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+	if (ret < 0)
+		return ret;
+
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    pos >> PAGE_CACHE_SHIFT,
+					    (pos + count) >> PAGE_CACHE_SHIFT);
+	if (ret < 0)
+		dout("invalidate_inode_pages2_range returned %d\n", ret);
+
+	flags = CEPH_OSD_FLAG_ORDERSNAP |
+		CEPH_OSD_FLAG_ONDISK |
+		CEPH_OSD_FLAG_WRITE |
+		CEPH_OSD_FLAG_ACK;
+
+	while ((len = iov_iter_count(from)) > 0) {
+		size_t left;
+		int n;
+
+		snapc = ci->i_snap_realm->cached_context;
+		vino = ceph_vino(inode);
+		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+					    vino, pos, &len, 0, 1,
+					    CEPH_OSD_OP_WRITE, flags, snapc,
+					    ci->i_truncate_seq,
+					    ci->i_truncate_size,
+					    false);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			break;
+		}
+
+		/*
+		 * write from beginning of first page,
+		 * regardless of io alignment
+		 */
+		num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+		if (IS_ERR(pages)) {
+			ret = PTR_ERR(pages);
+			goto out;
+		}
+
+		left = len;
+		for (n = 0; n < num_pages; n++) {
+			size_t plen = min_t(size_t, left, PAGE_SIZE);
+			ret = copy_page_from_iter(pages[n], 0, plen, from);
+			if (ret != plen) {
+				ret = -EFAULT;
+				break;
+			}
+			left -= ret;
+		}
+
+		if (ret < 0) {
+			ceph_release_page_vector(pages, num_pages);
+			goto out;
+		}
+
+		/* get a second commit callback */
+		req->r_unsafe_callback = ceph_sync_write_unsafe;
+		req->r_inode = inode;
+
+		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+						false, true);
+
+		/* BUG_ON(vino.snap != CEPH_NOSNAP); */
+		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+
+		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+		if (!ret)
+			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+out:
+		ceph_osdc_put_request(req);
+		if (ret == 0) {
+			pos += len;
+			written += len;
+
+			if (pos > i_size_read(inode)) {
+				check_caps = ceph_inode_set_size(inode, pos);
+				if (check_caps)
+					ceph_check_caps(ceph_inode(inode),
+							CHECK_CAPS_AUTHONLY,
+							NULL);
+			}
+		} else
+			break;
+	}
+
+	if (ret != -EOLDSNAPC && written > 0) {
+		ret = written;
+		iocb->ki_pos = pos;
+	}
+	return ret;
+}
+
+/*
+ * Wrap generic_file_aio_read with checks for cap bits on the inode.
+ * Atomically grab references, so that those bits are not released
+ * back to the MDS mid-read.
+ *
+ * Hmm, the sync read case isn't actually async... should it be?
+ */
+static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *filp = iocb->ki_filp;
+	struct ceph_file_info *fi = filp->private_data;
+	size_t len = iov_iter_count(to);
+	struct inode *inode = file_inode(filp);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct page *pinned_page = NULL;
+	ssize_t ret;
+	int want, got = 0;
+	int retry_op = 0, read = 0;
+
+again:
+	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+	     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_CACHE;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+	if (ret < 0)
+		return ret;
+
+	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
+	    (iocb->ki_flags & IOCB_DIRECT) ||
+	    (fi->flags & CEPH_F_SYNC)) {
+
+		dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+		     ceph_cap_string(got));
+
+		if (ci->i_inline_version == CEPH_INLINE_NONE) {
+			/* hmm, this isn't really async... */
+			ret = ceph_sync_read(iocb, to, &retry_op);
+		} else {
+			retry_op = READ_INLINE;
+		}
+	} else {
+		dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+		     ceph_cap_string(got));
+
+		ret = generic_file_read_iter(iocb, to);
+	}
+	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
+	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+	if (pinned_page) {
+		page_cache_release(pinned_page);
+		pinned_page = NULL;
+	}
+	ceph_put_cap_refs(ci, got);
+	if (retry_op && ret >= 0) {
+		int statret;
+		struct page *page = NULL;
+		loff_t i_size;
+		if (retry_op == READ_INLINE) {
+			page = __page_cache_alloc(GFP_NOFS);
+			if (!page)
+				return -ENOMEM;
+		}
+
+		statret = __ceph_do_getattr(inode, page,
+					    CEPH_STAT_CAP_INLINE_DATA, !!page);
+		if (statret < 0) {
+			 __free_page(page);
+			if (statret == -ENODATA) {
+				BUG_ON(retry_op != READ_INLINE);
+				goto again;
+			}
+			return statret;
+		}
+
+		i_size = i_size_read(inode);
+		if (retry_op == READ_INLINE) {
+			BUG_ON(ret > 0 || read > 0);
+			if (iocb->ki_pos < i_size &&
+			    iocb->ki_pos < PAGE_CACHE_SIZE) {
+				loff_t end = min_t(loff_t, i_size,
+						   iocb->ki_pos + len);
+				end = min_t(loff_t, end, PAGE_CACHE_SIZE);
+				if (statret < end)
+					zero_user_segment(page, statret, end);
+				ret = copy_page_to_iter(page,
+						iocb->ki_pos & ~PAGE_MASK,
+						end - iocb->ki_pos, to);
+				iocb->ki_pos += ret;
+				read += ret;
+			}
+			if (iocb->ki_pos < i_size && read < len) {
+				size_t zlen = min_t(size_t, len - read,
+						    i_size - iocb->ki_pos);
+				ret = iov_iter_zero(zlen, to);
+				iocb->ki_pos += ret;
+				read += ret;
+			}
+			__free_pages(page, 0);
+			return read;
+		}
+
+		/* hit EOF or hole? */
+		if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
+		    ret < len) {
+			dout("sync_read hit hole, ppos %lld < size %lld"
+			     ", reading more\n", iocb->ki_pos,
+			     inode->i_size);
+
+			read += ret;
+			len -= ret;
+			retry_op = 0;
+			goto again;
+		}
+	}
+
+	if (ret >= 0)
+		ret += read;
+
+	return ret;
+}
+
+/*
+ * Take cap references to avoid releasing caps to MDS mid-write.
+ *
+ * If we are synchronous, and write with an old snap context, the OSD
+ * may return EOLDSNAPC.  In that case, retry the write.. _after_
+ * dropping our cap refs and allowing the pending snap to logically
+ * complete _before_ this write occurs.
+ *
+ * If we are near ENOSPC, write synchronously.
+ */
+static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
+	ssize_t count, written = 0;
+	int err, want, got;
+	loff_t pos;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	mutex_lock(&inode->i_mutex);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = inode_to_bdi(inode);
+
+	err = generic_write_checks(iocb, from);
+	if (err <= 0)
+		goto out;
+
+	pos = iocb->ki_pos;
+	count = iov_iter_count(from);
+	err = file_remove_suid(file);
+	if (err)
+		goto out;
+
+	err = file_update_time(file);
+	if (err)
+		goto out;
+
+	if (ci->i_inline_version != CEPH_INLINE_NONE) {
+		err = ceph_uninline_data(file, NULL);
+		if (err < 0)
+			goto out;
+	}
+
+retry_snap:
+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+	     inode, ceph_vinop(inode), pos, count, inode->i_size);
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_BUFFER;
+	got = 0;
+	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
+			    &got, NULL);
+	if (err < 0)
+		goto out;
+
+	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
+
+	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
+	    (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+		struct iov_iter data;
+		mutex_unlock(&inode->i_mutex);
+		/* we might need to revert back to that point */
+		data = *from;
+		if (iocb->ki_flags & IOCB_DIRECT)
+			written = ceph_sync_direct_write(iocb, &data, pos);
+		else
+			written = ceph_sync_write(iocb, &data, pos);
+		if (written == -EOLDSNAPC) {
+			dout("aio_write %p %llx.%llx %llu~%u"
+				"got EOLDSNAPC, retrying\n",
+				inode, ceph_vinop(inode),
+				pos, (unsigned)count);
+			mutex_lock(&inode->i_mutex);
+			goto retry_snap;
+		}
+		if (written > 0)
+			iov_iter_advance(from, written);
+	} else {
+		loff_t old_size = inode->i_size;
+		/*
+		 * No need to acquire the i_truncate_mutex. Because
+		 * the MDS revokes Fwb caps before sending truncate
+		 * message to us. We can't get Fwb cap while there
+		 * are pending vmtruncate. So write and vmtruncate
+		 * can not run at the same time
+		 */
+		written = generic_perform_write(file, from, pos);
+		if (likely(written >= 0))
+			iocb->ki_pos = pos + written;
+		if (inode->i_size > old_size)
+			ceph_fscache_update_objectsize(inode);
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	if (written >= 0) {
+		int dirty;
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_inline_version = CEPH_INLINE_NONE;
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&ci->i_ceph_lock);
+		if (dirty)
+			__mark_inode_dirty(inode, dirty);
+	}
+
+	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)count,
+	     ceph_cap_string(got));
+	ceph_put_cap_refs(ci, got);
+
+	if (written >= 0 &&
+	    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) ||
+	     ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
+		err = vfs_fsync_range(file, pos, pos + written - 1, 1);
+		if (err < 0)
+			written = err;
+	}
+
+	goto out_unlocked;
+
+out:
+	mutex_unlock(&inode->i_mutex);
+out_unlocked:
+	current->backing_dev_info = NULL;
+	return written ? written : err;
+}
+
+/*
+ * llseek.  be sure to verify file size on SEEK_END.
+ */
+static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
+		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+		if (ret < 0) {
+			offset = ret;
+			goto out;
+		}
+	}
+
+	switch (whence) {
+	case SEEK_END:
+		offset += inode->i_size;
+		break;
+	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation.  Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0) {
+			offset = file->f_pos;
+			goto out;
+		}
+		offset += file->f_pos;
+		break;
+	case SEEK_DATA:
+		if (offset >= inode->i_size) {
+			ret = -ENXIO;
+			goto out;
+		}
+		break;
+	case SEEK_HOLE:
+		if (offset >= inode->i_size) {
+			ret = -ENXIO;
+			goto out;
+		}
+		offset = inode->i_size;
+		break;
+	}
+
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
+static inline void ceph_zero_partial_page(
+	struct inode *inode, loff_t offset, unsigned size)
+{
+	struct page *page;
+	pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+
+	page = find_lock_page(inode->i_mapping, index);
+	if (page) {
+		wait_on_page_writeback(page);
+		zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+				      loff_t length)
+{
+	loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+	if (offset < nearly) {
+		loff_t size = nearly - offset;
+		if (length < size)
+			size = length;
+		ceph_zero_partial_page(inode, offset, size);
+		offset += size;
+		length -= size;
+	}
+	if (length >= PAGE_CACHE_SIZE) {
+		loff_t size = round_down(length, PAGE_CACHE_SIZE);
+		truncate_pagecache_range(inode, offset, offset + size - 1);
+		offset += size;
+		length -= size;
+	}
+	if (length)
+		ceph_zero_partial_page(inode, offset, length);
+}
+
+static int ceph_zero_partial_object(struct inode *inode,
+				    loff_t offset, loff_t *length)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_osd_request *req;
+	int ret = 0;
+	loff_t zero = 0;
+	int op;
+
+	if (!length) {
+		op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
+		length = &zero;
+	} else {
+		op = CEPH_OSD_OP_ZERO;
+	}
+
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+					ceph_vino(inode),
+					offset, length,
+					0, 1, op,
+					CEPH_OSD_FLAG_WRITE |
+					CEPH_OSD_FLAG_ONDISK,
+					NULL, 0, 0, false);
+	if (IS_ERR(req)) {
+		ret = PTR_ERR(req);
+		goto out;
+	}
+
+	ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
+				&inode->i_mtime);
+
+	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+	if (!ret) {
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+		if (ret == -ENOENT)
+			ret = 0;
+	}
+	ceph_osdc_put_request(req);
+
+out:
+	return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+	int ret = 0;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
+	s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+	s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+	u64 object_set_size = object_size * stripe_count;
+	u64 nearly, t;
+
+	/* round offset up to next period boundary */
+	nearly = offset + object_set_size - 1;
+	t = nearly;
+	nearly -= do_div(t, object_set_size);
+
+	while (length && offset < nearly) {
+		loff_t size = length;
+		ret = ceph_zero_partial_object(inode, offset, &size);
+		if (ret < 0)
+			return ret;
+		offset += size;
+		length -= size;
+	}
+	while (length >= object_set_size) {
+		int i;
+		loff_t pos = offset;
+		for (i = 0; i < stripe_count; ++i) {
+			ret = ceph_zero_partial_object(inode, pos, NULL);
+			if (ret < 0)
+				return ret;
+			pos += stripe_unit;
+		}
+		offset += object_set_size;
+		length -= object_set_size;
+	}
+	while (length) {
+		loff_t size = length;
+		ret = ceph_zero_partial_object(inode, offset, &size);
+		if (ret < 0)
+			return ret;
+		offset += size;
+		length -= size;
+	}
+	return ret;
+}
+
+static long ceph_fallocate(struct file *file, int mode,
+				loff_t offset, loff_t length)
+{
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc =
+		&ceph_inode_to_client(inode)->client->osdc;
+	int want, got = 0;
+	int dirty;
+	int ret = 0;
+	loff_t endoff = 0;
+	loff_t size;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (ceph_snap(inode) != CEPH_NOSNAP) {
+		ret = -EROFS;
+		goto unlock;
+	}
+
+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) &&
+		!(mode & FALLOC_FL_PUNCH_HOLE)) {
+		ret = -ENOSPC;
+		goto unlock;
+	}
+
+	if (ci->i_inline_version != CEPH_INLINE_NONE) {
+		ret = ceph_uninline_data(file, NULL);
+		if (ret < 0)
+			goto unlock;
+	}
+
+	size = i_size_read(inode);
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		endoff = offset + length;
+
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_BUFFER;
+
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
+	if (ret < 0)
+		goto unlock;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		if (offset < size)
+			ceph_zero_pagecache_range(inode, offset, length);
+		ret = ceph_zero_objects(inode, offset, length);
+	} else if (endoff > size) {
+		truncate_pagecache_range(inode, size, -1);
+		if (ceph_inode_set_size(inode, endoff))
+			ceph_check_caps(ceph_inode(inode),
+				CHECK_CAPS_AUTHONLY, NULL);
+	}
+
+	if (!ret) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_inline_version = CEPH_INLINE_NONE;
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&ci->i_ceph_lock);
+		if (dirty)
+			__mark_inode_dirty(inode, dirty);
+	}
+
+	ceph_put_cap_refs(ci, got);
+unlock:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+const struct file_operations ceph_file_fops = {
+	.open = ceph_open,
+	.release = ceph_release,
+	.llseek = ceph_llseek,
+	.read_iter = ceph_read_iter,
+	.write_iter = ceph_write_iter,
+	.mmap = ceph_mmap,
+	.fsync = ceph_fsync,
+	.lock = ceph_lock,
+	.flock = ceph_flock,
+	.splice_read = generic_file_splice_read,
+	.splice_write = iter_file_splice_write,
+	.unlocked_ioctl = ceph_ioctl,
+	.compat_ioctl	= ceph_ioctl,
+	.fallocate	= ceph_fallocate,
+};
+
diff --git a/kernel/fs/ceph/inode.c b/kernel/fs/ceph/inode.c
new file mode 100644
index 000000000..e876e1944
--- /dev/null
+++ b/kernel/fs/ceph/inode.c
@@ -0,0 +1,2016 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/vmalloc.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+#include <linux/ceph/decode.h>
+
+/*
+ * Ceph inode operations
+ *
+ * Implement basic inode helpers (get, alloc) and inode ops (getattr,
+ * setattr, etc.), xattr helpers, and helpers for assimilating
+ * metadata returned by the MDS into our cache.
+ *
+ * Also define helpers for doing asynchronous writeback, invalidation,
+ * and truncation for the benefit of those who can't afford to block
+ * (typically because they are in the message handler path).
+ */
+
+static const struct inode_operations ceph_symlink_iops;
+
+static void ceph_invalidate_work(struct work_struct *work);
+static void ceph_writeback_work(struct work_struct *work);
+static void ceph_vmtruncate_work(struct work_struct *work);
+
+/*
+ * find or create an inode, given the ceph ino number
+ */
+static int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	return 0;
+}
+
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+{
+	struct inode *inode;
+	ino_t t = ceph_vino_to_ino(vino);
+
+	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (inode->i_state & I_NEW) {
+		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
+		     inode, ceph_vinop(inode), (u64)inode->i_ino);
+		unlock_new_inode(inode);
+	}
+
+	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
+	     vino.snap, inode);
+	return inode;
+}
+
+/*
+ * get/constuct snapdir inode for a given directory
+ */
+struct inode *ceph_get_snapdir(struct inode *parent)
+{
+	struct ceph_vino vino = {
+		.ino = ceph_ino(parent),
+		.snap = CEPH_SNAPDIR,
+	};
+	struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	BUG_ON(!S_ISDIR(parent->i_mode));
+	if (IS_ERR(inode))
+		return inode;
+	inode->i_mode = parent->i_mode;
+	inode->i_uid = parent->i_uid;
+	inode->i_gid = parent->i_gid;
+	inode->i_op = &ceph_snapdir_iops;
+	inode->i_fop = &ceph_snapdir_fops;
+	ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+	ci->i_rbytes = 0;
+	return inode;
+}
+
+const struct inode_operations ceph_file_iops = {
+	.permission = ceph_permission,
+	.setattr = ceph_setattr,
+	.getattr = ceph_getattr,
+	.setxattr = ceph_setxattr,
+	.getxattr = ceph_getxattr,
+	.listxattr = ceph_listxattr,
+	.removexattr = ceph_removexattr,
+	.get_acl = ceph_get_acl,
+	.set_acl = ceph_set_acl,
+};
+
+
+/*
+ * We use a 'frag tree' to keep track of the MDS's directory fragments
+ * for a given inode (usually there is just a single fragment).  We
+ * need to know when a child frag is delegated to a new MDS, or when
+ * it is flagged as replicated, so we can direct our requests
+ * accordingly.
+ */
+
+/*
+ * find/create a frag in the tree
+ */
+static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
+						    u32 f)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_frag *frag;
+	int c;
+
+	p = &ci->i_fragtree.rb_node;
+	while (*p) {
+		parent = *p;
+		frag = rb_entry(parent, struct ceph_inode_frag, node);
+		c = ceph_frag_compare(f, frag->frag);
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else
+			return frag;
+	}
+
+	frag = kmalloc(sizeof(*frag), GFP_NOFS);
+	if (!frag) {
+		pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
+		       "frag %x\n", &ci->vfs_inode,
+		       ceph_vinop(&ci->vfs_inode), f);
+		return ERR_PTR(-ENOMEM);
+	}
+	frag->frag = f;
+	frag->split_by = 0;
+	frag->mds = -1;
+	frag->ndist = 0;
+
+	rb_link_node(&frag->node, parent, p);
+	rb_insert_color(&frag->node, &ci->i_fragtree);
+
+	dout("get_or_create_frag added %llx.%llx frag %x\n",
+	     ceph_vinop(&ci->vfs_inode), f);
+	return frag;
+}
+
+/*
+ * find a specific frag @f
+ */
+struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
+{
+	struct rb_node *n = ci->i_fragtree.rb_node;
+
+	while (n) {
+		struct ceph_inode_frag *frag =
+			rb_entry(n, struct ceph_inode_frag, node);
+		int c = ceph_frag_compare(f, frag->frag);
+		if (c < 0)
+			n = n->rb_left;
+		else if (c > 0)
+			n = n->rb_right;
+		else
+			return frag;
+	}
+	return NULL;
+}
+
+/*
+ * Choose frag containing the given value @v.  If @pfrag is
+ * specified, copy the frag delegation info to the caller if
+ * it is present.
+ */
+static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+			      struct ceph_inode_frag *pfrag, int *found)
+{
+	u32 t = ceph_frag_make(0, 0);
+	struct ceph_inode_frag *frag;
+	unsigned nway, i;
+	u32 n;
+
+	if (found)
+		*found = 0;
+
+	while (1) {
+		WARN_ON(!ceph_frag_contains_value(t, v));
+		frag = __ceph_find_frag(ci, t);
+		if (!frag)
+			break; /* t is a leaf */
+		if (frag->split_by == 0) {
+			if (pfrag)
+				memcpy(pfrag, frag, sizeof(*pfrag));
+			if (found)
+				*found = 1;
+			break;
+		}
+
+		/* choose child */
+		nway = 1 << frag->split_by;
+		dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
+		     frag->split_by, nway);
+		for (i = 0; i < nway; i++) {
+			n = ceph_frag_make_child(t, frag->split_by, i);
+			if (ceph_frag_contains_value(n, v)) {
+				t = n;
+				break;
+			}
+		}
+		BUG_ON(i == nway);
+	}
+	dout("choose_frag(%x) = %x\n", v, t);
+
+	return t;
+}
+
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+		     struct ceph_inode_frag *pfrag, int *found)
+{
+	u32 ret;
+	mutex_lock(&ci->i_fragtree_mutex);
+	ret = __ceph_choose_frag(ci, v, pfrag, found);
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return ret;
+}
+
+/*
+ * Process dirfrag (delegation) info from the mds.  Include leaf
+ * fragment in tree ONLY if ndist > 0.  Otherwise, only
+ * branches/splits are included in i_fragtree)
+ */
+static int ceph_fill_dirfrag(struct inode *inode,
+			     struct ceph_mds_reply_dirfrag *dirinfo)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_inode_frag *frag;
+	u32 id = le32_to_cpu(dirinfo->frag);
+	int mds = le32_to_cpu(dirinfo->auth);
+	int ndist = le32_to_cpu(dirinfo->ndist);
+	int diri_auth = -1;
+	int i;
+	int err = 0;
+
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->i_auth_cap)
+		diri_auth = ci->i_auth_cap->mds;
+	spin_unlock(&ci->i_ceph_lock);
+
+	mutex_lock(&ci->i_fragtree_mutex);
+	if (ndist == 0 && mds == diri_auth) {
+		/* no delegation info needed. */
+		frag = __ceph_find_frag(ci, id);
+		if (!frag)
+			goto out;
+		if (frag->split_by == 0) {
+			/* tree leaf, remove */
+			dout("fill_dirfrag removed %llx.%llx frag %x"
+			     " (no ref)\n", ceph_vinop(inode), id);
+			rb_erase(&frag->node, &ci->i_fragtree);
+			kfree(frag);
+		} else {
+			/* tree branch, keep and clear */
+			dout("fill_dirfrag cleared %llx.%llx frag %x"
+			     " referral\n", ceph_vinop(inode), id);
+			frag->mds = -1;
+			frag->ndist = 0;
+		}
+		goto out;
+	}
+
+
+	/* find/add this frag to store mds delegation info */
+	frag = __get_or_create_frag(ci, id);
+	if (IS_ERR(frag)) {
+		/* this is not the end of the world; we can continue
+		   with bad/inaccurate delegation info */
+		pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
+		       ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+		err = -ENOMEM;
+		goto out;
+	}
+
+	frag->mds = mds;
+	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
+	for (i = 0; i < frag->ndist; i++)
+		frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
+	dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
+	     ceph_vinop(inode), frag->frag, frag->ndist);
+
+out:
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return err;
+}
+
+static int ceph_fill_fragtree(struct inode *inode,
+			      struct ceph_frag_tree_head *fragtree,
+			      struct ceph_mds_reply_dirfrag *dirinfo)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_inode_frag *frag;
+	struct rb_node *rb_node;
+	int i;
+	u32 id, nsplits;
+	bool update = false;
+
+	mutex_lock(&ci->i_fragtree_mutex);
+	nsplits = le32_to_cpu(fragtree->nsplits);
+	if (nsplits) {
+		i = prandom_u32() % nsplits;
+		id = le32_to_cpu(fragtree->splits[i].frag);
+		if (!__ceph_find_frag(ci, id))
+			update = true;
+	} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
+		rb_node = rb_first(&ci->i_fragtree);
+		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+		if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
+			update = true;
+	}
+	if (!update && dirinfo) {
+		id = le32_to_cpu(dirinfo->frag);
+		if (id != __ceph_choose_frag(ci, id, NULL, NULL))
+			update = true;
+	}
+	if (!update)
+		goto out_unlock;
+
+	dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+	rb_node = rb_first(&ci->i_fragtree);
+	for (i = 0; i < nsplits; i++) {
+		id = le32_to_cpu(fragtree->splits[i].frag);
+		frag = NULL;
+		while (rb_node) {
+			frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+			if (ceph_frag_compare(frag->frag, id) >= 0) {
+				if (frag->frag != id)
+					frag = NULL;
+				else
+					rb_node = rb_next(rb_node);
+				break;
+			}
+			rb_node = rb_next(rb_node);
+			rb_erase(&frag->node, &ci->i_fragtree);
+			kfree(frag);
+			frag = NULL;
+		}
+		if (!frag) {
+			frag = __get_or_create_frag(ci, id);
+			if (IS_ERR(frag))
+				continue;
+		}
+		frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+	}
+	while (rb_node) {
+		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+		rb_node = rb_next(rb_node);
+		rb_erase(&frag->node, &ci->i_fragtree);
+		kfree(frag);
+	}
+out_unlock:
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return 0;
+}
+
+/*
+ * initialize a newly allocated inode.
+ */
+struct inode *ceph_alloc_inode(struct super_block *sb)
+{
+	struct ceph_inode_info *ci;
+	int i;
+
+	ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+	if (!ci)
+		return NULL;
+
+	dout("alloc_inode %p\n", &ci->vfs_inode);
+
+	spin_lock_init(&ci->i_ceph_lock);
+
+	ci->i_version = 0;
+	ci->i_inline_version = 0;
+	ci->i_time_warp_seq = 0;
+	ci->i_ceph_flags = 0;
+	ci->i_ordered_count = 0;
+	atomic_set(&ci->i_release_count, 1);
+	atomic_set(&ci->i_complete_count, 0);
+	ci->i_symlink = NULL;
+
+	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+
+	ci->i_fragtree = RB_ROOT;
+	mutex_init(&ci->i_fragtree_mutex);
+
+	ci->i_xattrs.blob = NULL;
+	ci->i_xattrs.prealloc_blob = NULL;
+	ci->i_xattrs.dirty = false;
+	ci->i_xattrs.index = RB_ROOT;
+	ci->i_xattrs.count = 0;
+	ci->i_xattrs.names_size = 0;
+	ci->i_xattrs.vals_size = 0;
+	ci->i_xattrs.version = 0;
+	ci->i_xattrs.index_version = 0;
+
+	ci->i_caps = RB_ROOT;
+	ci->i_auth_cap = NULL;
+	ci->i_dirty_caps = 0;
+	ci->i_flushing_caps = 0;
+	INIT_LIST_HEAD(&ci->i_dirty_item);
+	INIT_LIST_HEAD(&ci->i_flushing_item);
+	ci->i_cap_flush_seq = 0;
+	ci->i_cap_flush_last_tid = 0;
+	memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+	init_waitqueue_head(&ci->i_cap_wq);
+	ci->i_hold_caps_min = 0;
+	ci->i_hold_caps_max = 0;
+	INIT_LIST_HEAD(&ci->i_cap_delay_list);
+	INIT_LIST_HEAD(&ci->i_cap_snaps);
+	ci->i_head_snapc = NULL;
+	ci->i_snap_caps = 0;
+
+	for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+		ci->i_nr_by_mode[i] = 0;
+
+	mutex_init(&ci->i_truncate_mutex);
+	ci->i_truncate_seq = 0;
+	ci->i_truncate_size = 0;
+	ci->i_truncate_pending = 0;
+
+	ci->i_max_size = 0;
+	ci->i_reported_size = 0;
+	ci->i_wanted_max_size = 0;
+	ci->i_requested_max_size = 0;
+
+	ci->i_pin_ref = 0;
+	ci->i_rd_ref = 0;
+	ci->i_rdcache_ref = 0;
+	ci->i_wr_ref = 0;
+	ci->i_wb_ref = 0;
+	ci->i_wrbuffer_ref = 0;
+	ci->i_wrbuffer_ref_head = 0;
+	ci->i_shared_gen = 0;
+	ci->i_rdcache_gen = 0;
+	ci->i_rdcache_revoking = 0;
+
+	INIT_LIST_HEAD(&ci->i_unsafe_writes);
+	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+	spin_lock_init(&ci->i_unsafe_lock);
+
+	ci->i_snap_realm = NULL;
+	INIT_LIST_HEAD(&ci->i_snap_realm_item);
+	INIT_LIST_HEAD(&ci->i_snap_flush_item);
+
+	INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
+	INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
+
+	INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+
+	ceph_fscache_inode_init(ci);
+
+	return &ci->vfs_inode;
+}
+
+static void ceph_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	kmem_cache_free(ceph_inode_cachep, ci);
+}
+
+void ceph_destroy_inode(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_inode_frag *frag;
+	struct rb_node *n;
+
+	dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+	ceph_fscache_unregister_inode_cookie(ci);
+
+	ceph_queue_caps_release(inode);
+
+	/*
+	 * we may still have a snap_realm reference if there are stray
+	 * caps in i_snap_caps.
+	 */
+	if (ci->i_snap_realm) {
+		struct ceph_mds_client *mdsc =
+			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		struct ceph_snap_realm *realm = ci->i_snap_realm;
+
+		dout(" dropping residual ref to snap realm %p\n", realm);
+		spin_lock(&realm->inodes_with_caps_lock);
+		list_del_init(&ci->i_snap_realm_item);
+		spin_unlock(&realm->inodes_with_caps_lock);
+		ceph_put_snap_realm(mdsc, realm);
+	}
+
+	kfree(ci->i_symlink);
+	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
+		frag = rb_entry(n, struct ceph_inode_frag, node);
+		rb_erase(n, &ci->i_fragtree);
+		kfree(frag);
+	}
+
+	__ceph_destroy_xattrs(ci);
+	if (ci->i_xattrs.blob)
+		ceph_buffer_put(ci->i_xattrs.blob);
+	if (ci->i_xattrs.prealloc_blob)
+		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+
+	call_rcu(&inode->i_rcu, ceph_i_callback);
+}
+
+int ceph_drop_inode(struct inode *inode)
+{
+	/*
+	 * Positve dentry and corresponding inode are always accompanied
+	 * in MDS reply. So no need to keep inode in the cache after
+	 * dropping all its aliases.
+	 */
+	return 1;
+}
+
+/*
+ * Helpers to fill in size, ctime, mtime, and atime.  We have to be
+ * careful because either the client or MDS may have more up to date
+ * info, depending on which capabilities are held, and whether
+ * time_warp_seq or truncate_seq have increased.  (Ordinarily, mtime
+ * and size are monotonically increasing, except when utimes() or
+ * truncate() increments the corresponding _seq values.)
+ */
+int ceph_fill_file_size(struct inode *inode, int issued,
+			u32 truncate_seq, u64 truncate_size, u64 size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int queue_trunc = 0;
+
+	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
+	    (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
+		dout("size %lld -> %llu\n", inode->i_size, size);
+		inode->i_size = size;
+		inode->i_blocks = (size + (1<<9) - 1) >> 9;
+		ci->i_reported_size = size;
+		if (truncate_seq != ci->i_truncate_seq) {
+			dout("truncate_seq %u -> %u\n",
+			     ci->i_truncate_seq, truncate_seq);
+			ci->i_truncate_seq = truncate_seq;
+
+			/* the MDS should have revoked these caps */
+			WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
+					       CEPH_CAP_FILE_RD |
+					       CEPH_CAP_FILE_WR |
+					       CEPH_CAP_FILE_LAZYIO));
+			/*
+			 * If we hold relevant caps, or in the case where we're
+			 * not the only client referencing this file and we
+			 * don't hold those caps, then we need to check whether
+			 * the file is either opened or mmaped
+			 */
+			if ((issued & (CEPH_CAP_FILE_CACHE|
+				       CEPH_CAP_FILE_BUFFER)) ||
+			    mapping_mapped(inode->i_mapping) ||
+			    __ceph_caps_file_wanted(ci)) {
+				ci->i_truncate_pending++;
+				queue_trunc = 1;
+			}
+		}
+	}
+	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
+	    ci->i_truncate_size != truncate_size) {
+		dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
+		     truncate_size);
+		ci->i_truncate_size = truncate_size;
+	}
+
+	if (queue_trunc)
+		ceph_fscache_invalidate(inode);
+
+	return queue_trunc;
+}
+
+void ceph_fill_file_time(struct inode *inode, int issued,
+			 u64 time_warp_seq, struct timespec *ctime,
+			 struct timespec *mtime, struct timespec *atime)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int warn = 0;
+
+	if (issued & (CEPH_CAP_FILE_EXCL|
+		      CEPH_CAP_FILE_WR|
+		      CEPH_CAP_FILE_BUFFER|
+		      CEPH_CAP_AUTH_EXCL|
+		      CEPH_CAP_XATTR_EXCL)) {
+		if (timespec_compare(ctime, &inode->i_ctime) > 0) {
+			dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
+			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+			     ctime->tv_sec, ctime->tv_nsec);
+			inode->i_ctime = *ctime;
+		}
+		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+			/* the MDS did a utimes() */
+			dout("mtime %ld.%09ld -> %ld.%09ld "
+			     "tw %d -> %d\n",
+			     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+			     mtime->tv_sec, mtime->tv_nsec,
+			     ci->i_time_warp_seq, (int)time_warp_seq);
+
+			inode->i_mtime = *mtime;
+			inode->i_atime = *atime;
+			ci->i_time_warp_seq = time_warp_seq;
+		} else if (time_warp_seq == ci->i_time_warp_seq) {
+			/* nobody did utimes(); take the max */
+			if (timespec_compare(mtime, &inode->i_mtime) > 0) {
+				dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
+				     inode->i_mtime.tv_sec,
+				     inode->i_mtime.tv_nsec,
+				     mtime->tv_sec, mtime->tv_nsec);
+				inode->i_mtime = *mtime;
+			}
+			if (timespec_compare(atime, &inode->i_atime) > 0) {
+				dout("atime %ld.%09ld -> %ld.%09ld inc\n",
+				     inode->i_atime.tv_sec,
+				     inode->i_atime.tv_nsec,
+				     atime->tv_sec, atime->tv_nsec);
+				inode->i_atime = *atime;
+			}
+		} else if (issued & CEPH_CAP_FILE_EXCL) {
+			/* we did a utimes(); ignore mds values */
+		} else {
+			warn = 1;
+		}
+	} else {
+		/* we have no write|excl caps; whatever the MDS says is true */
+		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
+			inode->i_ctime = *ctime;
+			inode->i_mtime = *mtime;
+			inode->i_atime = *atime;
+			ci->i_time_warp_seq = time_warp_seq;
+		} else {
+			warn = 1;
+		}
+	}
+	if (warn) /* time_warp_seq shouldn't go backwards */
+		dout("%p mds time_warp_seq %llu < %u\n",
+		     inode, time_warp_seq, ci->i_time_warp_seq);
+}
+
+/*
+ * Populate an inode based on info from mds.  May be called on new or
+ * existing inodes.
+ */
+static int fill_inode(struct inode *inode, struct page *locked_page,
+		      struct ceph_mds_reply_info_in *iinfo,
+		      struct ceph_mds_reply_dirfrag *dirinfo,
+		      struct ceph_mds_session *session,
+		      unsigned long ttl_from, int cap_fmode,
+		      struct ceph_cap_reservation *caps_reservation)
+{
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_reply_inode *info = iinfo->in;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int issued = 0, implemented, new_issued;
+	struct timespec mtime, atime, ctime;
+	struct ceph_buffer *xattr_blob = NULL;
+	struct ceph_cap *new_cap = NULL;
+	int err = 0;
+	bool wake = false;
+	bool queue_trunc = false;
+	bool new_version = false;
+	bool fill_inline = false;
+
+	dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
+	     ci->i_version);
+
+	/* prealloc new cap struct */
+	if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
+		new_cap = ceph_get_cap(mdsc, caps_reservation);
+
+	/*
+	 * prealloc xattr data, if it looks like we'll need it.  only
+	 * if len > 4 (meaning there are actually xattrs; the first 4
+	 * bytes are the xattr count).
+	 */
+	if (iinfo->xattr_len > 4) {
+		xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
+		if (!xattr_blob)
+			pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+			       iinfo->xattr_len);
+	}
+
+	spin_lock(&ci->i_ceph_lock);
+
+	/*
+	 * provided version will be odd if inode value is projected,
+	 * even if stable.  skip the update if we have newer stable
+	 * info (ours>=theirs, e.g. due to racing mds replies), unless
+	 * we are getting projected (unstable) info (in which case the
+	 * version is odd, and we want ours>theirs).
+	 *   us   them
+	 *   2    2     skip
+	 *   3    2     skip
+	 *   3    3     update
+	 */
+	if (ci->i_version == 0 ||
+	    ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+	     le64_to_cpu(info->version) > (ci->i_version & ~1)))
+		new_version = true;
+
+	issued = __ceph_caps_issued(ci, &implemented);
+	issued |= implemented | __ceph_caps_dirty(ci);
+	new_issued = ~issued & le32_to_cpu(info->cap.caps);
+
+	/* update inode */
+	ci->i_version = le64_to_cpu(info->version);
+	inode->i_version++;
+	inode->i_rdev = le32_to_cpu(info->rdev);
+	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+
+	if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
+		inode->i_mode = le32_to_cpu(info->mode);
+		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
+		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
+		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+		     from_kuid(&init_user_ns, inode->i_uid),
+		     from_kgid(&init_user_ns, inode->i_gid));
+	}
+
+	if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+	    (issued & CEPH_CAP_LINK_EXCL) == 0)
+		set_nlink(inode, le32_to_cpu(info->nlink));
+
+	if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+		/* be careful with mtime, atime, size */
+		ceph_decode_timespec(&atime, &info->atime);
+		ceph_decode_timespec(&mtime, &info->mtime);
+		ceph_decode_timespec(&ctime, &info->ctime);
+		ceph_fill_file_time(inode, issued,
+				le32_to_cpu(info->time_warp_seq),
+				&ctime, &mtime, &atime);
+	}
+
+	if (new_version ||
+	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+		ci->i_layout = info->layout;
+		queue_trunc = ceph_fill_file_size(inode, issued,
+					le32_to_cpu(info->truncate_seq),
+					le64_to_cpu(info->truncate_size),
+					le64_to_cpu(info->size));
+		/* only update max_size on auth cap */
+		if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+		    ci->i_max_size != le64_to_cpu(info->max_size)) {
+			dout("max_size %lld -> %llu\n", ci->i_max_size,
+					le64_to_cpu(info->max_size));
+			ci->i_max_size = le64_to_cpu(info->max_size);
+		}
+	}
+
+	/* xattrs */
+	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
+	if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))  &&
+	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
+		if (ci->i_xattrs.blob)
+			ceph_buffer_put(ci->i_xattrs.blob);
+		ci->i_xattrs.blob = xattr_blob;
+		if (xattr_blob)
+			memcpy(ci->i_xattrs.blob->vec.iov_base,
+			       iinfo->xattr_data, iinfo->xattr_len);
+		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+		ceph_forget_all_cached_acls(inode);
+		xattr_blob = NULL;
+	}
+
+	inode->i_mapping->a_ops = &ceph_aops;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		inode->i_op = &ceph_file_iops;
+		break;
+	case S_IFREG:
+		inode->i_op = &ceph_file_iops;
+		inode->i_fop = &ceph_file_fops;
+		break;
+	case S_IFLNK:
+		inode->i_op = &ceph_symlink_iops;
+		if (!ci->i_symlink) {
+			u32 symlen = iinfo->symlink_len;
+			char *sym;
+
+			spin_unlock(&ci->i_ceph_lock);
+
+			err = -EINVAL;
+			if (WARN_ON(symlen != inode->i_size))
+				goto out;
+
+			err = -ENOMEM;
+			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
+			if (!sym)
+				goto out;
+
+			spin_lock(&ci->i_ceph_lock);
+			if (!ci->i_symlink)
+				ci->i_symlink = sym;
+			else
+				kfree(sym); /* lost a race */
+		}
+		break;
+	case S_IFDIR:
+		inode->i_op = &ceph_dir_iops;
+		inode->i_fop = &ceph_dir_fops;
+
+		ci->i_dir_layout = iinfo->dir_layout;
+
+		ci->i_files = le64_to_cpu(info->files);
+		ci->i_subdirs = le64_to_cpu(info->subdirs);
+		ci->i_rbytes = le64_to_cpu(info->rbytes);
+		ci->i_rfiles = le64_to_cpu(info->rfiles);
+		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+		ceph_decode_timespec(&ci->i_rctime, &info->rctime);
+		break;
+	default:
+		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+		       ceph_vinop(inode), inode->i_mode);
+	}
+
+	/* were we issued a capability? */
+	if (info->cap.caps) {
+		if (ceph_snap(inode) == CEPH_NOSNAP) {
+			unsigned caps = le32_to_cpu(info->cap.caps);
+			ceph_add_cap(inode, session,
+				     le64_to_cpu(info->cap.cap_id),
+				     cap_fmode, caps,
+				     le32_to_cpu(info->cap.wanted),
+				     le32_to_cpu(info->cap.seq),
+				     le32_to_cpu(info->cap.mseq),
+				     le64_to_cpu(info->cap.realm),
+				     info->cap.flags, &new_cap);
+
+			/* set dir completion flag? */
+			if (S_ISDIR(inode->i_mode) &&
+			    ci->i_files == 0 && ci->i_subdirs == 0 &&
+			    (caps & CEPH_CAP_FILE_SHARED) &&
+			    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+			    !__ceph_dir_is_complete(ci)) {
+				dout(" marking %p complete (empty)\n", inode);
+				__ceph_dir_set_complete(ci,
+					atomic_read(&ci->i_release_count),
+					ci->i_ordered_count);
+			}
+
+			wake = true;
+		} else {
+			dout(" %p got snap_caps %s\n", inode,
+			     ceph_cap_string(le32_to_cpu(info->cap.caps)));
+			ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
+			if (cap_fmode >= 0)
+				__ceph_get_fmode(ci, cap_fmode);
+		}
+	} else if (cap_fmode >= 0) {
+		pr_warn("mds issued no caps on %llx.%llx\n",
+			   ceph_vinop(inode));
+		__ceph_get_fmode(ci, cap_fmode);
+	}
+
+	if (iinfo->inline_version > 0 &&
+	    iinfo->inline_version >= ci->i_inline_version) {
+		int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+		ci->i_inline_version = iinfo->inline_version;
+		if (ci->i_inline_version != CEPH_INLINE_NONE &&
+		    (locked_page ||
+		     (le32_to_cpu(info->cap.caps) & cache_caps)))
+			fill_inline = true;
+	}
+
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (fill_inline)
+		ceph_fill_inline_data(inode, locked_page,
+				      iinfo->inline_data, iinfo->inline_len);
+
+	if (wake)
+		wake_up_all(&ci->i_cap_wq);
+
+	/* queue truncate if we saw i_size decrease */
+	if (queue_trunc)
+		ceph_queue_vmtruncate(inode);
+
+	/* populate frag tree */
+	if (S_ISDIR(inode->i_mode))
+		ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
+
+	/* update delegation info? */
+	if (dirinfo)
+		ceph_fill_dirfrag(inode, dirinfo);
+
+	err = 0;
+out:
+	if (new_cap)
+		ceph_put_cap(mdsc, new_cap);
+	if (xattr_blob)
+		ceph_buffer_put(xattr_blob);
+	return err;
+}
+
+/*
+ * caller should hold session s_mutex.
+ */
+static void update_dentry_lease(struct dentry *dentry,
+				struct ceph_mds_reply_lease *lease,
+				struct ceph_mds_session *session,
+				unsigned long from_time)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	long unsigned duration = le32_to_cpu(lease->duration_ms);
+	long unsigned ttl = from_time + (duration * HZ) / 1000;
+	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
+	struct inode *dir;
+
+	/* only track leases on regular dentries */
+	if (dentry->d_op != &ceph_dentry_ops)
+		return;
+
+	spin_lock(&dentry->d_lock);
+	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
+	     dentry, duration, ttl);
+
+	/* make lease_rdcache_gen match directory */
+	dir = d_inode(dentry->d_parent);
+	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
+
+	if (duration == 0)
+		goto out_unlock;
+
+	if (di->lease_gen == session->s_cap_gen &&
+	    time_before(ttl, dentry->d_time))
+		goto out_unlock;  /* we already have a newer lease. */
+
+	if (di->lease_session && di->lease_session != session)
+		goto out_unlock;
+
+	ceph_dentry_lru_touch(dentry);
+
+	if (!di->lease_session)
+		di->lease_session = ceph_get_mds_session(session);
+	di->lease_gen = session->s_cap_gen;
+	di->lease_seq = le32_to_cpu(lease->seq);
+	di->lease_renew_after = half_ttl;
+	di->lease_renew_from = 0;
+	dentry->d_time = ttl;
+out_unlock:
+	spin_unlock(&dentry->d_lock);
+	return;
+}
+
+/*
+ * splice a dentry to an inode.
+ * caller must hold directory i_mutex for this to be safe.
+ *
+ * we will only rehash the resulting dentry if @prehash is
+ * true; @prehash will be set to false (for the benefit of
+ * the caller) if we fail.
+ */
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
+				    bool *prehash)
+{
+	struct dentry *realdn;
+
+	BUG_ON(d_inode(dn));
+
+	/* dn must be unhashed */
+	if (!d_unhashed(dn))
+		d_drop(dn);
+	realdn = d_splice_alias(in, dn);
+	if (IS_ERR(realdn)) {
+		pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
+		       PTR_ERR(realdn), dn, in, ceph_vinop(in));
+		if (prehash)
+			*prehash = false; /* don't rehash on error */
+		dn = realdn; /* note realdn contains the error */
+		goto out;
+	} else if (realdn) {
+		dout("dn %p (%d) spliced with %p (%d) "
+		     "inode %p ino %llx.%llx\n",
+		     dn, d_count(dn),
+		     realdn, d_count(realdn),
+		     d_inode(realdn), ceph_vinop(d_inode(realdn)));
+		dput(dn);
+		dn = realdn;
+	} else {
+		BUG_ON(!ceph_dentry(dn));
+		dout("dn %p attached to %p ino %llx.%llx\n",
+		     dn, d_inode(dn), ceph_vinop(d_inode(dn)));
+	}
+	if ((!prehash || *prehash) && d_unhashed(dn))
+		d_rehash(dn);
+out:
+	return dn;
+}
+
+/*
+ * Incorporate results into the local cache.  This is either just
+ * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
+ * after a lookup).
+ *
+ * A reply may contain
+ *         a directory inode along with a dentry.
+ *  and/or a target inode
+ *
+ * Called with snap_rwsem (read).
+ */
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
+		    struct ceph_mds_session *session)
+{
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	struct inode *in = NULL;
+	struct ceph_vino vino;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	int err = 0;
+
+	dout("fill_trace %p is_dentry %d is_target %d\n", req,
+	     rinfo->head->is_dentry, rinfo->head->is_target);
+
+#if 0
+	/*
+	 * Debugging hook:
+	 *
+	 * If we resend completed ops to a recovering mds, we get no
+	 * trace.  Since that is very rare, pretend this is the case
+	 * to ensure the 'no trace' handlers in the callers behave.
+	 *
+	 * Fill in inodes unconditionally to avoid breaking cap
+	 * invariants.
+	 */
+	if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
+		pr_info("fill_trace faking empty trace on %lld %s\n",
+			req->r_tid, ceph_mds_op_name(rinfo->head->op));
+		if (rinfo->head->is_dentry) {
+			rinfo->head->is_dentry = 0;
+			err = fill_inode(req->r_locked_dir,
+					 &rinfo->diri, rinfo->dirfrag,
+					 session, req->r_request_started, -1);
+		}
+		if (rinfo->head->is_target) {
+			rinfo->head->is_target = 0;
+			ininfo = rinfo->targeti.in;
+			vino.ino = le64_to_cpu(ininfo->ino);
+			vino.snap = le64_to_cpu(ininfo->snapid);
+			in = ceph_get_inode(sb, vino);
+			err = fill_inode(in, &rinfo->targeti, NULL,
+					 session, req->r_request_started,
+					 req->r_fmode);
+			iput(in);
+		}
+	}
+#endif
+
+	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
+		dout("fill_trace reply is empty!\n");
+		if (rinfo->head->result == 0 && req->r_locked_dir)
+			ceph_invalidate_dir_request(req);
+		return 0;
+	}
+
+	if (rinfo->head->is_dentry) {
+		struct inode *dir = req->r_locked_dir;
+
+		if (dir) {
+			err = fill_inode(dir, NULL,
+					 &rinfo->diri, rinfo->dirfrag,
+					 session, req->r_request_started, -1,
+					 &req->r_caps_reservation);
+			if (err < 0)
+				goto done;
+		} else {
+			WARN_ON_ONCE(1);
+		}
+
+		if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+			struct qstr dname;
+			struct dentry *dn, *parent;
+
+			BUG_ON(!rinfo->head->is_target);
+			BUG_ON(req->r_dentry);
+
+			parent = d_find_any_alias(dir);
+			BUG_ON(!parent);
+
+			dname.name = rinfo->dname;
+			dname.len = rinfo->dname_len;
+			dname.hash = full_name_hash(dname.name, dname.len);
+			vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+			vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+retry_lookup:
+			dn = d_lookup(parent, &dname);
+			dout("d_lookup on parent=%p name=%.*s got %p\n",
+			     parent, dname.len, dname.name, dn);
+
+			if (!dn) {
+				dn = d_alloc(parent, &dname);
+				dout("d_alloc %p '%.*s' = %p\n", parent,
+				     dname.len, dname.name, dn);
+				if (dn == NULL) {
+					dput(parent);
+					err = -ENOMEM;
+					goto done;
+				}
+				err = ceph_init_dentry(dn);
+				if (err < 0) {
+					dput(dn);
+					dput(parent);
+					goto done;
+				}
+			} else if (d_really_is_positive(dn) &&
+				   (ceph_ino(d_inode(dn)) != vino.ino ||
+				    ceph_snap(d_inode(dn)) != vino.snap)) {
+				dout(" dn %p points to wrong inode %p\n",
+				     dn, d_inode(dn));
+				d_delete(dn);
+				dput(dn);
+				goto retry_lookup;
+			}
+
+			req->r_dentry = dn;
+			dput(parent);
+		}
+	}
+
+	if (rinfo->head->is_target) {
+		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+		in = ceph_get_inode(sb, vino);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			goto done;
+		}
+		req->r_target_inode = in;
+
+		err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
+				session, req->r_request_started,
+				(!req->r_aborted && rinfo->head->result == 0) ?
+				req->r_fmode : -1,
+				&req->r_caps_reservation);
+		if (err < 0) {
+			pr_err("fill_inode badness %p %llx.%llx\n",
+				in, ceph_vinop(in));
+			goto done;
+		}
+	}
+
+	/*
+	 * ignore null lease/binding on snapdir ENOENT, or else we
+	 * will have trouble splicing in the virtual snapdir later
+	 */
+	if (rinfo->head->is_dentry && !req->r_aborted &&
+	    req->r_locked_dir &&
+	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
+					       fsc->mount_options->snapdir_name,
+					       req->r_dentry->d_name.len))) {
+		/*
+		 * lookup link rename   : null -> possibly existing inode
+		 * mknod symlink mkdir  : null -> new inode
+		 * unlink               : linked -> null
+		 */
+		struct inode *dir = req->r_locked_dir;
+		struct dentry *dn = req->r_dentry;
+		bool have_dir_cap, have_lease;
+
+		BUG_ON(!dn);
+		BUG_ON(!dir);
+		BUG_ON(d_inode(dn->d_parent) != dir);
+		BUG_ON(ceph_ino(dir) !=
+		       le64_to_cpu(rinfo->diri.in->ino));
+		BUG_ON(ceph_snap(dir) !=
+		       le64_to_cpu(rinfo->diri.in->snapid));
+
+		/* do we have a lease on the whole dir? */
+		have_dir_cap =
+			(le32_to_cpu(rinfo->diri.in->cap.caps) &
+			 CEPH_CAP_FILE_SHARED);
+
+		/* do we have a dn lease? */
+		have_lease = have_dir_cap ||
+			le32_to_cpu(rinfo->dlease->duration_ms);
+		if (!have_lease)
+			dout("fill_trace  no dentry lease or dir cap\n");
+
+		/* rename? */
+		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+			struct inode *olddir = req->r_old_dentry_dir;
+			BUG_ON(!olddir);
+
+			dout(" src %p '%pd' dst %p '%pd'\n",
+			     req->r_old_dentry,
+			     req->r_old_dentry,
+			     dn, dn);
+			dout("fill_trace doing d_move %p -> %p\n",
+			     req->r_old_dentry, dn);
+
+			d_move(req->r_old_dentry, dn);
+			dout(" src %p '%pd' dst %p '%pd'\n",
+			     req->r_old_dentry,
+			     req->r_old_dentry,
+			     dn, dn);
+
+			/* ensure target dentry is invalidated, despite
+			   rehashing bug in vfs_rename_dir */
+			ceph_invalidate_dentry_lease(dn);
+
+			/* d_move screws up sibling dentries' offsets */
+			ceph_dir_clear_ordered(dir);
+			ceph_dir_clear_ordered(olddir);
+
+			dout("dn %p gets new offset %lld\n", req->r_old_dentry,
+			     ceph_dentry(req->r_old_dentry)->offset);
+
+			dn = req->r_old_dentry;  /* use old_dentry */
+		}
+
+		/* null dentry? */
+		if (!rinfo->head->is_target) {
+			dout("fill_trace null dentry\n");
+			if (d_really_is_positive(dn)) {
+				ceph_dir_clear_ordered(dir);
+				dout("d_delete %p\n", dn);
+				d_delete(dn);
+			} else {
+				dout("d_instantiate %p NULL\n", dn);
+				d_instantiate(dn, NULL);
+				if (have_lease && d_unhashed(dn))
+					d_rehash(dn);
+				update_dentry_lease(dn, rinfo->dlease,
+						    session,
+						    req->r_request_started);
+			}
+			goto done;
+		}
+
+		/* attach proper inode */
+		if (d_really_is_negative(dn)) {
+			ceph_dir_clear_ordered(dir);
+			ihold(in);
+			dn = splice_dentry(dn, in, &have_lease);
+			if (IS_ERR(dn)) {
+				err = PTR_ERR(dn);
+				goto done;
+			}
+			req->r_dentry = dn;  /* may have spliced */
+		} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
+			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
+			     dn, d_inode(dn), ceph_vinop(d_inode(dn)),
+			     ceph_vinop(in));
+			have_lease = false;
+		}
+
+		if (have_lease)
+			update_dentry_lease(dn, rinfo->dlease, session,
+					    req->r_request_started);
+		dout(" final dn %p\n", dn);
+	} else if (!req->r_aborted &&
+		   (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+		    req->r_op == CEPH_MDS_OP_MKSNAP)) {
+		struct dentry *dn = req->r_dentry;
+		struct inode *dir = req->r_locked_dir;
+
+		/* fill out a snapdir LOOKUPSNAP dentry */
+		BUG_ON(!dn);
+		BUG_ON(!dir);
+		BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
+		dout(" linking snapped dir %p to dn %p\n", in, dn);
+		ceph_dir_clear_ordered(dir);
+		ihold(in);
+		dn = splice_dentry(dn, in, NULL);
+		if (IS_ERR(dn)) {
+			err = PTR_ERR(dn);
+			goto done;
+		}
+		req->r_dentry = dn;  /* may have spliced */
+	}
+done:
+	dout("fill_trace done err=%d\n", err);
+	return err;
+}
+
+/*
+ * Prepopulate our cache with readdir results, leases, etc.
+ */
+static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
+					   struct ceph_mds_session *session)
+{
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	int i, err = 0;
+
+	for (i = 0; i < rinfo->dir_nr; i++) {
+		struct ceph_vino vino;
+		struct inode *in;
+		int rc;
+
+		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+		in = ceph_get_inode(req->r_dentry->d_sb, vino);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			dout("new_inode badness got %d\n", err);
+			continue;
+		}
+		rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+				req->r_request_started, -1,
+				&req->r_caps_reservation);
+		if (rc < 0) {
+			pr_err("fill_inode badness on %p got %d\n", in, rc);
+			err = rc;
+			continue;
+		}
+	}
+
+	return err;
+}
+
+int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+			     struct ceph_mds_session *session)
+{
+	struct dentry *parent = req->r_dentry;
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	struct qstr dname;
+	struct dentry *dn;
+	struct inode *in;
+	int err = 0, ret, i;
+	struct inode *snapdir = NULL;
+	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
+	struct ceph_dentry_info *di;
+	u64 r_readdir_offset = req->r_readdir_offset;
+	u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+
+	if (rinfo->dir_dir &&
+	    le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+		dout("readdir_prepopulate got new frag %x -> %x\n",
+		     frag, le32_to_cpu(rinfo->dir_dir->frag));
+		frag = le32_to_cpu(rinfo->dir_dir->frag);
+		if (ceph_frag_is_leftmost(frag))
+			r_readdir_offset = 2;
+		else
+			r_readdir_offset = 0;
+	}
+
+	if (req->r_aborted)
+		return readdir_prepopulate_inodes_only(req, session);
+
+	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
+		snapdir = ceph_get_snapdir(d_inode(parent));
+		parent = d_find_alias(snapdir);
+		dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
+		     rinfo->dir_nr, parent);
+	} else {
+		dout("readdir_prepopulate %d items under dn %p\n",
+		     rinfo->dir_nr, parent);
+		if (rinfo->dir_dir)
+			ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
+	}
+
+	/* FIXME: release caps/leases if error occurs */
+	for (i = 0; i < rinfo->dir_nr; i++) {
+		struct ceph_vino vino;
+
+		dname.name = rinfo->dir_dname[i];
+		dname.len = rinfo->dir_dname_len[i];
+		dname.hash = full_name_hash(dname.name, dname.len);
+
+		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+retry_lookup:
+		dn = d_lookup(parent, &dname);
+		dout("d_lookup on parent=%p name=%.*s got %p\n",
+		     parent, dname.len, dname.name, dn);
+
+		if (!dn) {
+			dn = d_alloc(parent, &dname);
+			dout("d_alloc %p '%.*s' = %p\n", parent,
+			     dname.len, dname.name, dn);
+			if (dn == NULL) {
+				dout("d_alloc badness\n");
+				err = -ENOMEM;
+				goto out;
+			}
+			ret = ceph_init_dentry(dn);
+			if (ret < 0) {
+				dput(dn);
+				err = ret;
+				goto out;
+			}
+		} else if (d_really_is_positive(dn) &&
+			   (ceph_ino(d_inode(dn)) != vino.ino ||
+			    ceph_snap(d_inode(dn)) != vino.snap)) {
+			dout(" dn %p points to wrong inode %p\n",
+			     dn, d_inode(dn));
+			d_delete(dn);
+			dput(dn);
+			goto retry_lookup;
+		} else {
+			/* reorder parent's d_subdirs */
+			spin_lock(&parent->d_lock);
+			spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
+			list_move(&dn->d_child, &parent->d_subdirs);
+			spin_unlock(&dn->d_lock);
+			spin_unlock(&parent->d_lock);
+		}
+
+		/* inode */
+		if (d_really_is_positive(dn)) {
+			in = d_inode(dn);
+		} else {
+			in = ceph_get_inode(parent->d_sb, vino);
+			if (IS_ERR(in)) {
+				dout("new_inode badness\n");
+				d_drop(dn);
+				dput(dn);
+				err = PTR_ERR(in);
+				goto out;
+			}
+		}
+
+		if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+			       req->r_request_started, -1,
+			       &req->r_caps_reservation) < 0) {
+			pr_err("fill_inode badness on %p\n", in);
+			if (d_really_is_negative(dn))
+				iput(in);
+			d_drop(dn);
+			goto next_item;
+		}
+
+		if (d_really_is_negative(dn)) {
+			struct dentry *realdn = splice_dentry(dn, in, NULL);
+			if (IS_ERR(realdn)) {
+				err = PTR_ERR(realdn);
+				d_drop(dn);
+				dn = NULL;
+				goto next_item;
+			}
+			dn = realdn;
+		}
+
+		di = dn->d_fsdata;
+		di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+
+		update_dentry_lease(dn, rinfo->dir_dlease[i],
+				    req->r_session,
+				    req->r_request_started);
+next_item:
+		if (dn)
+			dput(dn);
+	}
+	if (err == 0)
+		req->r_did_prepopulate = true;
+
+out:
+	if (snapdir) {
+		iput(snapdir);
+		dput(parent);
+	}
+	dout("readdir_prepopulate done\n");
+	return err;
+}
+
+int ceph_inode_set_size(struct inode *inode, loff_t size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret = 0;
+
+	spin_lock(&ci->i_ceph_lock);
+	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+	inode->i_size = size;
+	inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+
+	/* tell the MDS if we are approaching max_size */
+	if ((size << 1) >= ci->i_max_size &&
+	    (ci->i_reported_size << 1) < ci->i_max_size)
+		ret = 1;
+
+	spin_unlock(&ci->i_ceph_lock);
+	return ret;
+}
+
+/*
+ * Write back inode data in a worker thread.  (This can't be done
+ * in the message handler context.)
+ */
+void ceph_queue_writeback(struct inode *inode)
+{
+	ihold(inode);
+	if (queue_work(ceph_inode_to_client(inode)->wb_wq,
+		       &ceph_inode(inode)->i_wb_work)) {
+		dout("ceph_queue_writeback %p\n", inode);
+	} else {
+		dout("ceph_queue_writeback %p failed\n", inode);
+		iput(inode);
+	}
+}
+
+static void ceph_writeback_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_wb_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	dout("writeback %p\n", inode);
+	filemap_fdatawrite(&inode->i_data);
+	iput(inode);
+}
+
+/*
+ * queue an async invalidation
+ */
+void ceph_queue_invalidate(struct inode *inode)
+{
+	ihold(inode);
+	if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
+		       &ceph_inode(inode)->i_pg_inv_work)) {
+		dout("ceph_queue_invalidate %p\n", inode);
+	} else {
+		dout("ceph_queue_invalidate %p failed\n", inode);
+		iput(inode);
+	}
+}
+
+/*
+ * Invalidate inode pages in a worker thread.  (This can't be done
+ * in the message handler context.)
+ */
+static void ceph_invalidate_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_pg_inv_work);
+	struct inode *inode = &ci->vfs_inode;
+	u32 orig_gen;
+	int check = 0;
+
+	mutex_lock(&ci->i_truncate_mutex);
+	spin_lock(&ci->i_ceph_lock);
+	dout("invalidate_pages %p gen %d revoking %d\n", inode,
+	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
+	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+			check = 1;
+		spin_unlock(&ci->i_ceph_lock);
+		mutex_unlock(&ci->i_truncate_mutex);
+		goto out;
+	}
+	orig_gen = ci->i_rdcache_gen;
+	spin_unlock(&ci->i_ceph_lock);
+
+	truncate_pagecache(inode, 0);
+
+	spin_lock(&ci->i_ceph_lock);
+	if (orig_gen == ci->i_rdcache_gen &&
+	    orig_gen == ci->i_rdcache_revoking) {
+		dout("invalidate_pages %p gen %d successful\n", inode,
+		     ci->i_rdcache_gen);
+		ci->i_rdcache_revoking--;
+		check = 1;
+	} else {
+		dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
+		     inode, orig_gen, ci->i_rdcache_gen,
+		     ci->i_rdcache_revoking);
+		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+			check = 1;
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	mutex_unlock(&ci->i_truncate_mutex);
+out:
+	if (check)
+		ceph_check_caps(ci, 0, NULL);
+	iput(inode);
+}
+
+
+/*
+ * called by trunc_wq;
+ *
+ * We also truncate in a separate thread as well.
+ */
+static void ceph_vmtruncate_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_vmtruncate_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	dout("vmtruncate_work %p\n", inode);
+	__ceph_do_pending_vmtruncate(inode);
+	iput(inode);
+}
+
+/*
+ * Queue an async vmtruncate.  If we fail to queue work, we will handle
+ * the truncation the next time we call __ceph_do_pending_vmtruncate.
+ */
+void ceph_queue_vmtruncate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	ihold(inode);
+
+	if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
+		       &ci->i_vmtruncate_work)) {
+		dout("ceph_queue_vmtruncate %p\n", inode);
+	} else {
+		dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
+		     inode, ci->i_truncate_pending);
+		iput(inode);
+	}
+}
+
+/*
+ * Make sure any pending truncation is applied before doing anything
+ * that may depend on it.
+ */
+void __ceph_do_pending_vmtruncate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 to;
+	int wrbuffer_refs, finish = 0;
+
+	mutex_lock(&ci->i_truncate_mutex);
+retry:
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->i_truncate_pending == 0) {
+		dout("__do_pending_vmtruncate %p none pending\n", inode);
+		spin_unlock(&ci->i_ceph_lock);
+		mutex_unlock(&ci->i_truncate_mutex);
+		return;
+	}
+
+	/*
+	 * make sure any dirty snapped pages are flushed before we
+	 * possibly truncate them.. so write AND block!
+	 */
+	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
+		dout("__do_pending_vmtruncate %p flushing snaps first\n",
+		     inode);
+		spin_unlock(&ci->i_ceph_lock);
+		filemap_write_and_wait_range(&inode->i_data, 0,
+					     inode->i_sb->s_maxbytes);
+		goto retry;
+	}
+
+	/* there should be no reader or writer */
+	WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
+
+	to = ci->i_truncate_size;
+	wrbuffer_refs = ci->i_wrbuffer_ref;
+	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+	     ci->i_truncate_pending, to);
+	spin_unlock(&ci->i_ceph_lock);
+
+	truncate_pagecache(inode, to);
+
+	spin_lock(&ci->i_ceph_lock);
+	if (to == ci->i_truncate_size) {
+		ci->i_truncate_pending = 0;
+		finish = 1;
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	if (!finish)
+		goto retry;
+
+	mutex_unlock(&ci->i_truncate_mutex);
+
+	if (wrbuffer_refs == 0)
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+
+	wake_up_all(&ci->i_cap_wq);
+}
+
+/*
+ * symlinks
+ */
+static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
+	nd_set_link(nd, ci->i_symlink);
+	return NULL;
+}
+
+static const struct inode_operations ceph_symlink_iops = {
+	.readlink = generic_readlink,
+	.follow_link = ceph_sym_follow_link,
+	.setattr = ceph_setattr,
+	.getattr = ceph_getattr,
+	.setxattr = ceph_setxattr,
+	.getxattr = ceph_getxattr,
+	.listxattr = ceph_listxattr,
+	.removexattr = ceph_removexattr,
+};
+
+/*
+ * setattr
+ */
+int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	const unsigned int ia_valid = attr->ia_valid;
+	struct ceph_mds_request *req;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	int issued;
+	int release = 0, dirtied = 0;
+	int mask = 0;
+	int err = 0;
+	int inode_dirty_flags = 0;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	err = inode_change_ok(inode, attr);
+	if (err != 0)
+		return err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	spin_lock(&ci->i_ceph_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+	if (ia_valid & ATTR_UID) {
+		dout("setattr %p uid %d -> %d\n", inode,
+		     from_kuid(&init_user_ns, inode->i_uid),
+		     from_kuid(&init_user_ns, attr->ia_uid));
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_uid = attr->ia_uid;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   !uid_eq(attr->ia_uid, inode->i_uid)) {
+			req->r_args.setattr.uid = cpu_to_le32(
+				from_kuid(&init_user_ns, attr->ia_uid));
+			mask |= CEPH_SETATTR_UID;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+	if (ia_valid & ATTR_GID) {
+		dout("setattr %p gid %d -> %d\n", inode,
+		     from_kgid(&init_user_ns, inode->i_gid),
+		     from_kgid(&init_user_ns, attr->ia_gid));
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_gid = attr->ia_gid;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   !gid_eq(attr->ia_gid, inode->i_gid)) {
+			req->r_args.setattr.gid = cpu_to_le32(
+				from_kgid(&init_user_ns, attr->ia_gid));
+			mask |= CEPH_SETATTR_GID;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+	if (ia_valid & ATTR_MODE) {
+		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
+		     attr->ia_mode);
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_mode = attr->ia_mode;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   attr->ia_mode != inode->i_mode) {
+			inode->i_mode = attr->ia_mode;
+			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
+			mask |= CEPH_SETATTR_MODE;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+
+	if (ia_valid & ATTR_ATIME) {
+		dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
+		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
+		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+		if (issued & CEPH_CAP_FILE_EXCL) {
+			ci->i_time_warp_seq++;
+			inode->i_atime = attr->ia_atime;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_WR) &&
+			   timespec_compare(&inode->i_atime,
+					    &attr->ia_atime) < 0) {
+			inode->i_atime = attr->ia_atime;
+			dirtied |= CEPH_CAP_FILE_WR;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
+			ceph_encode_timespec(&req->r_args.setattr.atime,
+					     &attr->ia_atime);
+			mask |= CEPH_SETATTR_ATIME;
+			release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+	if (ia_valid & ATTR_MTIME) {
+		dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
+		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+		if (issued & CEPH_CAP_FILE_EXCL) {
+			ci->i_time_warp_seq++;
+			inode->i_mtime = attr->ia_mtime;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_WR) &&
+			   timespec_compare(&inode->i_mtime,
+					    &attr->ia_mtime) < 0) {
+			inode->i_mtime = attr->ia_mtime;
+			dirtied |= CEPH_CAP_FILE_WR;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
+			ceph_encode_timespec(&req->r_args.setattr.mtime,
+					     &attr->ia_mtime);
+			mask |= CEPH_SETATTR_MTIME;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+	if (ia_valid & ATTR_SIZE) {
+		dout("setattr %p size %lld -> %lld\n", inode,
+		     inode->i_size, attr->ia_size);
+		if ((issued & CEPH_CAP_FILE_EXCL) &&
+		    attr->ia_size > inode->i_size) {
+			inode->i_size = attr->ia_size;
+			inode->i_blocks =
+				(attr->ia_size + (1 << 9) - 1) >> 9;
+			inode->i_ctime = attr->ia_ctime;
+			ci->i_reported_size = attr->ia_size;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   attr->ia_size != inode->i_size) {
+			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+			req->r_args.setattr.old_size =
+				cpu_to_le64(inode->i_size);
+			mask |= CEPH_SETATTR_SIZE;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+
+	/* these do nothing */
+	if (ia_valid & ATTR_CTIME) {
+		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
+					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
+		dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
+		     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+		     only ? "ctime only" : "ignored");
+		inode->i_ctime = attr->ia_ctime;
+		if (only) {
+			/*
+			 * if kernel wants to dirty ctime but nothing else,
+			 * we need to choose a cap to dirty under, or do
+			 * a almost-no-op setattr
+			 */
+			if (issued & CEPH_CAP_AUTH_EXCL)
+				dirtied |= CEPH_CAP_AUTH_EXCL;
+			else if (issued & CEPH_CAP_FILE_EXCL)
+				dirtied |= CEPH_CAP_FILE_EXCL;
+			else if (issued & CEPH_CAP_XATTR_EXCL)
+				dirtied |= CEPH_CAP_XATTR_EXCL;
+			else
+				mask |= CEPH_SETATTR_CTIME;
+		}
+	}
+	if (ia_valid & ATTR_FILE)
+		dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+
+	if (dirtied) {
+		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
+		inode->i_ctime = CURRENT_TIME;
+	}
+
+	release &= issued;
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (inode_dirty_flags)
+		__mark_inode_dirty(inode, inode_dirty_flags);
+
+	if (ia_valid & ATTR_MODE) {
+		err = posix_acl_chmod(inode, attr->ia_mode);
+		if (err)
+			goto out_put;
+	}
+
+	if (mask) {
+		req->r_inode = inode;
+		ihold(inode);
+		req->r_inode_drop = release;
+		req->r_args.setattr.mask = cpu_to_le32(mask);
+		req->r_num_caps = 1;
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+	}
+	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
+	     ceph_cap_string(dirtied), mask);
+
+	ceph_mdsc_put_request(req);
+	if (mask & CEPH_SETATTR_SIZE)
+		__ceph_do_pending_vmtruncate(inode);
+	return err;
+out_put:
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Verify that we have a lease on the given mask.  If not,
+ * do a getattr against an mds.
+ */
+int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
+		      int mask, bool force)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(inode) == CEPH_SNAPDIR) {
+		dout("do_getattr inode %p SNAPDIR\n", inode);
+		return 0;
+	}
+
+	dout("do_getattr inode %p mask %s mode 0%o\n",
+	     inode, ceph_cap_string(mask), inode->i_mode);
+	if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
+		return 0;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = inode;
+	ihold(inode);
+	req->r_num_caps = 1;
+	req->r_args.getattr.mask = cpu_to_le32(mask);
+	req->r_locked_page = locked_page;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	if (locked_page && err == 0) {
+		u64 inline_version = req->r_reply_info.targeti.inline_version;
+		if (inline_version == 0) {
+			/* the reply is supposed to contain inline data */
+			err = -EINVAL;
+		} else if (inline_version == CEPH_INLINE_NONE) {
+			err = -ENODATA;
+		} else {
+			err = req->r_reply_info.targeti.inline_len;
+		}
+	}
+	ceph_mdsc_put_request(req);
+	dout("do_getattr result=%d\n", err);
+	return err;
+}
+
+
+/*
+ * Check inode permissions.  We verify we have a valid value for
+ * the AUTH cap, then call the generic handler.
+ */
+int ceph_permission(struct inode *inode, int mask)
+{
+	int err;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
+
+	if (!err)
+		err = generic_permission(inode, mask);
+	return err;
+}
+
+/*
+ * Get all attributes.  Hopefully somedata we'll have a statlite()
+ * and can limit the fields we require to be accurate.
+ */
+int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int err;
+
+	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
+	if (!err) {
+		generic_fillattr(inode, stat);
+		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+		if (ceph_snap(inode) != CEPH_NOSNAP)
+			stat->dev = ceph_snap(inode);
+		else
+			stat->dev = 0;
+		if (S_ISDIR(inode->i_mode)) {
+			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+						RBYTES))
+				stat->size = ci->i_rbytes;
+			else
+				stat->size = ci->i_files + ci->i_subdirs;
+			stat->blocks = 0;
+			stat->blksize = 65536;
+		}
+	}
+	return err;
+}
diff --git a/kernel/fs/ceph/ioctl.c b/kernel/fs/ceph/ioctl.c
new file mode 100644
index 000000000..f851d8d70
--- /dev/null
+++ b/kernel/fs/ceph/ioctl.c
@@ -0,0 +1,295 @@
+#include <linux/ceph/ceph_debug.h>
+#include <linux/in.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "ioctl.h"
+
+
+/*
+ * ioctls
+ */
+
+/*
+ * get and set the file layout
+ */
+static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
+{
+	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
+	struct ceph_ioctl_layout l;
+	int err;
+
+	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
+	if (!err) {
+		l.stripe_unit = ceph_file_layout_su(ci->i_layout);
+		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+		l.object_size = ceph_file_layout_object_size(ci->i_layout);
+		l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+		l.preferred_osd = (s32)-1;
+		if (copy_to_user(arg, &l, sizeof(l)))
+			return -EFAULT;
+	}
+
+	return err;
+}
+
+static long __validate_layout(struct ceph_mds_client *mdsc,
+			      struct ceph_ioctl_layout *l)
+{
+	int i, err;
+
+	/* validate striping parameters */
+	if ((l->object_size & ~PAGE_MASK) ||
+	    (l->stripe_unit & ~PAGE_MASK) ||
+	    ((unsigned)l->stripe_unit != 0 &&
+	     ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
+		return -EINVAL;
+
+	/* make sure it's a valid data pool */
+	mutex_lock(&mdsc->mutex);
+	err = -EINVAL;
+	for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+		if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) {
+			err = 0;
+			break;
+		}
+	mutex_unlock(&mdsc->mutex);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
+{
+	struct inode *inode = file_inode(file);
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_ioctl_layout l;
+	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
+	struct ceph_ioctl_layout nl;
+	int err;
+
+	if (copy_from_user(&l, arg, sizeof(l)))
+		return -EFAULT;
+
+	/* validate changed params against current layout */
+	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
+	if (err)
+		return err;
+
+	memset(&nl, 0, sizeof(nl));
+	if (l.stripe_count)
+		nl.stripe_count = l.stripe_count;
+	else
+		nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+	if (l.stripe_unit)
+		nl.stripe_unit = l.stripe_unit;
+	else
+		nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+	if (l.object_size)
+		nl.object_size = l.object_size;
+	else
+		nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+	if (l.data_pool)
+		nl.data_pool = l.data_pool;
+	else
+		nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+
+	/* this is obsolete, and always -1 */
+	nl.preferred_osd = le64_to_cpu(-1);
+
+	err = __validate_layout(mdsc, &nl);
+	if (err)
+		return err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = inode;
+	ihold(inode);
+	req->r_num_caps = 1;
+
+	req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
+
+	req->r_args.setlayout.layout.fl_stripe_unit =
+		cpu_to_le32(l.stripe_unit);
+	req->r_args.setlayout.layout.fl_stripe_count =
+		cpu_to_le32(l.stripe_count);
+	req->r_args.setlayout.layout.fl_object_size =
+		cpu_to_le32(l.object_size);
+	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
+
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Set a layout policy on a directory inode. All items in the tree
+ * rooted at this inode will inherit this layout on creation,
+ * (It doesn't apply retroactively )
+ * unless a subdirectory has its own layout policy.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+	struct inode *inode = file_inode(file);
+	struct ceph_mds_request *req;
+	struct ceph_ioctl_layout l;
+	int err;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+	/* copy and validate */
+	if (copy_from_user(&l, arg, sizeof(l)))
+		return -EFAULT;
+
+	err = __validate_layout(mdsc, &l);
+	if (err)
+		return err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+				       USE_AUTH_MDS);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = inode;
+	ihold(inode);
+	req->r_num_caps = 1;
+
+	req->r_args.setlayout.layout.fl_stripe_unit =
+			cpu_to_le32(l.stripe_unit);
+	req->r_args.setlayout.layout.fl_stripe_count =
+			cpu_to_le32(l.stripe_count);
+	req->r_args.setlayout.layout.fl_object_size =
+			cpu_to_le32(l.object_size);
+	req->r_args.setlayout.layout.fl_pg_pool =
+			cpu_to_le32(l.data_pool);
+
+	err = ceph_mdsc_do_request(mdsc, inode, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Return object name, size/offset information, and location (OSD
+ * number, network address) for a given file offset.
+ */
+static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
+{
+	struct ceph_ioctl_dataloc dl;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
+	struct ceph_object_locator oloc;
+	struct ceph_object_id oid;
+	u64 len = 1, olen;
+	u64 tmp;
+	struct ceph_pg pgid;
+	int r;
+
+	/* copy and validate */
+	if (copy_from_user(&dl, arg, sizeof(dl)))
+		return -EFAULT;
+
+	down_read(&osdc->map_sem);
+	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
+					  &dl.object_no, &dl.object_offset,
+					  &olen);
+	if (r < 0) {
+		up_read(&osdc->map_sem);
+		return -EIO;
+	}
+	dl.file_offset -= dl.object_offset;
+	dl.object_size = ceph_file_layout_object_size(ci->i_layout);
+	dl.block_size = ceph_file_layout_su(ci->i_layout);
+
+	/* block_offset = object_offset % block_size */
+	tmp = dl.object_offset;
+	dl.block_offset = do_div(tmp, dl.block_size);
+
+	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
+		 ceph_ino(inode), dl.object_no);
+
+	oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+	ceph_oid_set_name(&oid, dl.object_name);
+
+	r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
+	if (r < 0) {
+		up_read(&osdc->map_sem);
+		return r;
+	}
+
+	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+	if (dl.osd >= 0) {
+		struct ceph_entity_addr *a =
+			ceph_osd_addr(osdc->osdmap, dl.osd);
+		if (a)
+			memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
+	} else {
+		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
+	}
+	up_read(&osdc->map_sem);
+
+	/* send result back to user */
+	if (copy_to_user(arg, &dl, sizeof(dl)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static long ceph_ioctl_lazyio(struct file *file)
+{
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_nr_by_mode[fi->fmode]--;
+		fi->fmode |= CEPH_FILE_MODE_LAZY;
+		ci->i_nr_by_mode[fi->fmode]++;
+		spin_unlock(&ci->i_ceph_lock);
+		dout("ioctl_layzio: file %p marked lazy\n", file);
+
+		ceph_check_caps(ci, 0, NULL);
+	} else {
+		dout("ioctl_layzio: file %p already lazy\n", file);
+	}
+	return 0;
+}
+
+static long ceph_ioctl_syncio(struct file *file)
+{
+	struct ceph_file_info *fi = file->private_data;
+
+	fi->flags |= CEPH_F_SYNC;
+	return 0;
+}
+
+long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+	switch (cmd) {
+	case CEPH_IOC_GET_LAYOUT:
+		return ceph_ioctl_get_layout(file, (void __user *)arg);
+
+	case CEPH_IOC_SET_LAYOUT:
+		return ceph_ioctl_set_layout(file, (void __user *)arg);
+
+	case CEPH_IOC_SET_LAYOUT_POLICY:
+		return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
+	case CEPH_IOC_GET_DATALOC:
+		return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+
+	case CEPH_IOC_LAZYIO:
+		return ceph_ioctl_lazyio(file);
+
+	case CEPH_IOC_SYNCIO:
+		return ceph_ioctl_syncio(file);
+	}
+
+	return -ENOTTY;
+}
diff --git a/kernel/fs/ceph/ioctl.h b/kernel/fs/ceph/ioctl.h
new file mode 100644
index 000000000..c77028afb
--- /dev/null
+++ b/kernel/fs/ceph/ioctl.h
@@ -0,0 +1,100 @@
+#ifndef FS_CEPH_IOCTL_H
+#define FS_CEPH_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define CEPH_IOCTL_MAGIC 0x97
+
+/*
+ * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
+ * CEPH_IOC_SET_LAYOUT - set file layout
+ * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
+ *
+ * The file layout specifies how file data is striped over objects in
+ * the distributed object store, which object pool they belong to (if
+ * it differs from the default), and an optional 'preferred osd' to
+ * store them on.
+ *
+ * Files get a new layout based on the policy set on the containing
+ * directory or one of its ancestors.  The GET_LAYOUT ioctl will let
+ * you examine the layout for a file or the policy on a directory.
+ *
+ * SET_LAYOUT will let you set a layout on a newly created file.  This
+ * only works immediately after the file is created and before any
+ * data is written to it.
+ *
+ * SET_LAYOUT_POLICY will let you set a layout policy (default layout)
+ * on a directory that will apply to any new files created in that
+ * directory (or any child directory that doesn't specify a layout of
+ * its own).
+ */
+
+/* use u64 to align sanely on all archs */
+struct ceph_ioctl_layout {
+	__u64 stripe_unit, stripe_count, object_size;
+	__u64 data_pool;
+
+	/* obsolete.  new values ignored, always return -1 */
+	__s64 preferred_osd;
+};
+
+#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1,		\
+				   struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,		\
+				   struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5,	\
+				   struct ceph_ioctl_layout)
+
+/*
+ * CEPH_IOC_GET_DATALOC - get location of file data in the cluster
+ *
+ * Extract identity, address of the OSD and object storing a given
+ * file offset.
+ */
+struct ceph_ioctl_dataloc {
+	__u64 file_offset;           /* in+out: file offset */
+	__u64 object_offset;         /* out: offset in object */
+	__u64 object_no;             /* out: object # */
+	__u64 object_size;           /* out: object size */
+	char object_name[64];        /* out: object name */
+	__u64 block_offset;          /* out: offset in block */
+	__u64 block_size;            /* out: block length */
+	__s64 osd;                   /* out: osd # */
+	struct sockaddr_storage osd_addr; /* out: osd address */
+};
+
+#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3,	\
+				   struct ceph_ioctl_dataloc)
+
+/*
+ * CEPH_IOC_LAZYIO - relax consistency
+ *
+ * Normally Ceph switches to synchronous IO when multiple clients have
+ * the file open (and or more for write).  Reads and writes bypass the
+ * page cache and go directly to the OSD.  Setting this flag on a file
+ * descriptor will allow buffered IO for this file in cases where the
+ * application knows it won't interfere with other nodes (or doesn't
+ * care).
+ */
+#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
+
+/*
+ * CEPH_IOC_SYNCIO - force synchronous IO
+ *
+ * This ioctl sets a file flag that forces the synchronous IO that
+ * bypasses the page cache, even if it is not necessary.  This is
+ * essentially the opposite behavior of IOC_LAZYIO.  This forces the
+ * same read/write path as a file opened by multiple clients when one
+ * or more of those clients is opened for write.
+ *
+ * Note that this type of sync IO takes a different path than a file
+ * opened with O_SYNC/D_SYNC (writes hit the page cache and are
+ * immediately flushed on page boundaries).  It is very similar to
+ * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes
+ * are not copied (user page must remain stable) and O_DIRECT writes
+ * have alignment restrictions (on the buffer and file offset).
+ */
+#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
+
+#endif
diff --git a/kernel/fs/ceph/locks.c b/kernel/fs/ceph/locks.c
new file mode 100644
index 000000000..4347039ec
--- /dev/null
+++ b/kernel/fs/ceph/locks.c
@@ -0,0 +1,381 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/random.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include <linux/ceph/pagelist.h>
+
+static u64 lock_secret;
+static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
+                                         struct ceph_mds_request *req);
+
+static inline u64 secure_addr(void *addr)
+{
+	u64 v = lock_secret ^ (u64)(unsigned long)addr;
+	/*
+	 * Set the most significant bit, so that MDS knows the 'owner'
+	 * is sufficient to identify the owner of lock. (old code uses
+	 * both 'owner' and 'pid')
+	 */
+	v |= (1ULL << 63);
+	return v;
+}
+
+void __init ceph_flock_init(void)
+{
+	get_random_bytes(&lock_secret, sizeof(lock_secret));
+}
+
+/**
+ * Implement fcntl and flock locking functions.
+ */
+static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
+			     int cmd, u8 wait, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(file);
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+	u64 length = 0;
+	u64 owner;
+
+	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
+		wait = 0;
+
+	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = inode;
+	ihold(inode);
+	req->r_num_caps = 1;
+
+	/* mds requires start and length rather than start and end */
+	if (LLONG_MAX == fl->fl_end)
+		length = 0;
+	else
+		length = fl->fl_end - fl->fl_start + 1;
+
+	owner = secure_addr(fl->fl_owner);
+
+	dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
+	     "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
+	     (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
+	     wait, fl->fl_type);
+
+	req->r_args.filelock_change.rule = lock_type;
+	req->r_args.filelock_change.type = cmd;
+	req->r_args.filelock_change.owner = cpu_to_le64(owner);
+	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
+	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
+	req->r_args.filelock_change.length = cpu_to_le64(length);
+	req->r_args.filelock_change.wait = wait;
+
+	if (wait)
+		req->r_wait_for_completion = ceph_lock_wait_for_completion;
+
+	err = ceph_mdsc_do_request(mdsc, inode, req);
+
+	if (operation == CEPH_MDS_OP_GETFILELOCK) {
+		fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+			fl->fl_type = F_RDLCK;
+		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
+			fl->fl_type = F_WRLCK;
+		else
+			fl->fl_type = F_UNLCK;
+
+		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
+		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
+						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
+		if (length >= 1)
+			fl->fl_end = length -1;
+		else
+			fl->fl_end = 0;
+
+	}
+	ceph_mdsc_put_request(req);
+	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
+	     "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
+	     (int)operation, (u64)fl->fl_pid, fl->fl_start,
+	     length, wait, fl->fl_type, err);
+	return err;
+}
+
+static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
+                                         struct ceph_mds_request *req)
+{
+	struct ceph_mds_request *intr_req;
+	struct inode *inode = req->r_inode;
+	int err, lock_type;
+
+	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
+	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
+		lock_type = CEPH_LOCK_FCNTL_INTR;
+	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
+		lock_type = CEPH_LOCK_FLOCK_INTR;
+	else
+		BUG_ON(1);
+	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
+
+	err = wait_for_completion_interruptible(&req->r_completion);
+	if (!err)
+		return 0;
+
+	dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
+	     req->r_tid);
+
+	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
+					    USE_AUTH_MDS);
+	if (IS_ERR(intr_req))
+		return PTR_ERR(intr_req);
+
+	intr_req->r_inode = inode;
+	ihold(inode);
+	intr_req->r_num_caps = 1;
+
+	intr_req->r_args.filelock_change = req->r_args.filelock_change;
+	intr_req->r_args.filelock_change.rule = lock_type;
+	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
+
+	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
+	ceph_mdsc_put_request(intr_req);
+
+	if (err && err != -ERESTARTSYS)
+		return err;
+
+	wait_for_completion(&req->r_completion);
+	return 0;
+}
+
+/**
+ * Attempt to set an fcntl lock.
+ * For now, this just goes away to the server. Later it may be more awesome.
+ */
+int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	u8 lock_cmd;
+	int err;
+	u8 wait = 0;
+	u16 op = CEPH_MDS_OP_SETFILELOCK;
+
+	if (!(fl->fl_flags & FL_POSIX))
+		return -ENOLCK;
+	/* No mandatory locks */
+	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	dout("ceph_lock, fl_owner: %p", fl->fl_owner);
+
+	/* set wait bit as appropriate, then make command as Ceph expects it*/
+	if (IS_GETLK(cmd))
+		op = CEPH_MDS_OP_GETFILELOCK;
+	else if (IS_SETLKW(cmd))
+		wait = 1;
+
+	if (F_RDLCK == fl->fl_type)
+		lock_cmd = CEPH_LOCK_SHARED;
+	else if (F_WRLCK == fl->fl_type)
+		lock_cmd = CEPH_LOCK_EXCL;
+	else
+		lock_cmd = CEPH_LOCK_UNLOCK;
+
+	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
+	if (!err) {
+		if (op != CEPH_MDS_OP_GETFILELOCK) {
+			dout("mds locked, locking locally");
+			err = posix_lock_file(file, fl, NULL);
+			if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+				/* undo! This should only happen if
+				 * the kernel detects local
+				 * deadlock. */
+				ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+						  CEPH_LOCK_UNLOCK, 0, fl);
+				dout("got %d on posix_lock_file, undid lock",
+				     err);
+			}
+		}
+	}
+	return err;
+}
+
+int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	u8 lock_cmd;
+	int err;
+	u8 wait = 0;
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+	/* No mandatory locks */
+	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	dout("ceph_flock, fl_file: %p", fl->fl_file);
+
+	if (IS_SETLKW(cmd))
+		wait = 1;
+
+	if (F_RDLCK == fl->fl_type)
+		lock_cmd = CEPH_LOCK_SHARED;
+	else if (F_WRLCK == fl->fl_type)
+		lock_cmd = CEPH_LOCK_EXCL;
+	else
+		lock_cmd = CEPH_LOCK_UNLOCK;
+
+	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
+				file, lock_cmd, wait, fl);
+	if (!err) {
+		err = flock_lock_file_wait(file, fl);
+		if (err) {
+			ceph_lock_message(CEPH_LOCK_FLOCK,
+					  CEPH_MDS_OP_SETFILELOCK,
+					  file, CEPH_LOCK_UNLOCK, 0, fl);
+			dout("got %d on flock_lock_file_wait, undid lock", err);
+		}
+	}
+	return err;
+}
+
+/*
+ * Fills in the passed counter variables, so you can prepare pagelist metadata
+ * before calling ceph_encode_locks.
+ */
+void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
+{
+	struct file_lock *lock;
+	struct file_lock_context *ctx;
+
+	*fcntl_count = 0;
+	*flock_count = 0;
+
+	ctx = inode->i_flctx;
+	if (ctx) {
+		spin_lock(&ctx->flc_lock);
+		list_for_each_entry(lock, &ctx->flc_posix, fl_list)
+			++(*fcntl_count);
+		list_for_each_entry(lock, &ctx->flc_flock, fl_list)
+			++(*flock_count);
+		spin_unlock(&ctx->flc_lock);
+	}
+	dout("counted %d flock locks and %d fcntl locks",
+	     *flock_count, *fcntl_count);
+}
+
+/**
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with inode->i_lock already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
+ */
+int ceph_encode_locks_to_buffer(struct inode *inode,
+				struct ceph_filelock *flocks,
+				int num_fcntl_locks, int num_flock_locks)
+{
+	struct file_lock *lock;
+	struct file_lock_context *ctx = inode->i_flctx;
+	int err = 0;
+	int seen_fcntl = 0;
+	int seen_flock = 0;
+	int l = 0;
+
+	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
+	     num_fcntl_locks);
+
+	if (!ctx)
+		return 0;
+
+	spin_lock(&ctx->flc_lock);
+	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+		++seen_fcntl;
+		if (seen_fcntl > num_fcntl_locks) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		err = lock_to_ceph_filelock(lock, &flocks[l]);
+		if (err)
+			goto fail;
+		++l;
+	}
+	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+		++seen_flock;
+		if (seen_flock > num_flock_locks) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		err = lock_to_ceph_filelock(lock, &flocks[l]);
+		if (err)
+			goto fail;
+		++l;
+	}
+fail:
+	spin_unlock(&ctx->flc_lock);
+	return err;
+}
+
+/**
+ * Copy the encoded flock and fcntl locks into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Returns zero on success.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+			   struct ceph_pagelist *pagelist,
+			   int num_fcntl_locks, int num_flock_locks)
+{
+	int err = 0;
+	__le32 nlocks;
+
+	nlocks = cpu_to_le32(num_fcntl_locks);
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		goto out_fail;
+
+	err = ceph_pagelist_append(pagelist, flocks,
+				   num_fcntl_locks * sizeof(*flocks));
+	if (err)
+		goto out_fail;
+
+	nlocks = cpu_to_le32(num_flock_locks);
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		goto out_fail;
+
+	err = ceph_pagelist_append(pagelist,
+				   &flocks[num_fcntl_locks],
+				   num_flock_locks * sizeof(*flocks));
+out_fail:
+	return err;
+}
+
+/*
+ * Given a pointer to a lock, convert it to a ceph filelock
+ */
+int lock_to_ceph_filelock(struct file_lock *lock,
+			  struct ceph_filelock *cephlock)
+{
+	int err = 0;
+	cephlock->start = cpu_to_le64(lock->fl_start);
+	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
+	cephlock->client = cpu_to_le64(0);
+	cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
+	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
+
+	switch (lock->fl_type) {
+	case F_RDLCK:
+		cephlock->type = CEPH_LOCK_SHARED;
+		break;
+	case F_WRLCK:
+		cephlock->type = CEPH_LOCK_EXCL;
+		break;
+	case F_UNLCK:
+		cephlock->type = CEPH_LOCK_UNLOCK;
+		break;
+	default:
+		dout("Have unknown lock type %d", lock->fl_type);
+		err = -EINVAL;
+	}
+
+	return err;
+}
diff --git a/kernel/fs/ceph/mds_client.c b/kernel/fs/ceph/mds_client.c
new file mode 100644
index 000000000..84f37f34f
--- /dev/null
+++ b/kernel/fs/ceph/mds_client.c
@@ -0,0 +1,3871 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/utsname.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+/*
+ * A cluster of MDS (metadata server) daemons is responsible for
+ * managing the file system namespace (the directory hierarchy and
+ * inodes) and for coordinating shared access to storage.  Metadata is
+ * partitioning hierarchically across a number of servers, and that
+ * partition varies over time as the cluster adjusts the distribution
+ * in order to balance load.
+ *
+ * The MDS client is primarily responsible to managing synchronous
+ * metadata requests for operations like open, unlink, and so forth.
+ * If there is a MDS failure, we find out about it when we (possibly
+ * request and) receive a new MDS map, and can resubmit affected
+ * requests.
+ *
+ * For the most part, though, we take advantage of a lossless
+ * communications channel to the MDS, and do not need to worry about
+ * timing out or resubmitting requests.
+ *
+ * We maintain a stateful "session" with each MDS we interact with.
+ * Within each session, we sent periodic heartbeat messages to ensure
+ * any capabilities or leases we have been issues remain valid.  If
+ * the session times out and goes stale, our leases and capabilities
+ * are no longer valid.
+ */
+
+struct ceph_reconnect_state {
+	int nr_caps;
+	struct ceph_pagelist *pagelist;
+	bool flock;
+};
+
+static void __wake_requests(struct ceph_mds_client *mdsc,
+			    struct list_head *head);
+
+static const struct ceph_connection_operations mds_con_ops;
+
+
+/*
+ * mds reply parsing
+ */
+
+/*
+ * parse individual inode info
+ */
+static int parse_reply_info_in(void **p, void *end,
+			       struct ceph_mds_reply_info_in *info,
+			       u64 features)
+{
+	int err = -EIO;
+
+	info->in = *p;
+	*p += sizeof(struct ceph_mds_reply_inode) +
+		sizeof(*info->in->fragtree.splits) *
+		le32_to_cpu(info->in->fragtree.nsplits);
+
+	ceph_decode_32_safe(p, end, info->symlink_len, bad);
+	ceph_decode_need(p, end, info->symlink_len, bad);
+	info->symlink = *p;
+	*p += info->symlink_len;
+
+	if (features & CEPH_FEATURE_DIRLAYOUTHASH)
+		ceph_decode_copy_safe(p, end, &info->dir_layout,
+				      sizeof(info->dir_layout), bad);
+	else
+		memset(&info->dir_layout, 0, sizeof(info->dir_layout));
+
+	ceph_decode_32_safe(p, end, info->xattr_len, bad);
+	ceph_decode_need(p, end, info->xattr_len, bad);
+	info->xattr_data = *p;
+	*p += info->xattr_len;
+
+	if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+		ceph_decode_64_safe(p, end, info->inline_version, bad);
+		ceph_decode_32_safe(p, end, info->inline_len, bad);
+		ceph_decode_need(p, end, info->inline_len, bad);
+		info->inline_data = *p;
+		*p += info->inline_len;
+	} else
+		info->inline_version = CEPH_INLINE_NONE;
+
+	return 0;
+bad:
+	return err;
+}
+
+/*
+ * parse a normal reply, which may contain a (dir+)dentry and/or a
+ * target inode.
+ */
+static int parse_reply_info_trace(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info,
+				  u64 features)
+{
+	int err;
+
+	if (info->head->is_dentry) {
+		err = parse_reply_info_in(p, end, &info->diri, features);
+		if (err < 0)
+			goto out_bad;
+
+		if (unlikely(*p + sizeof(*info->dirfrag) > end))
+			goto bad;
+		info->dirfrag = *p;
+		*p += sizeof(*info->dirfrag) +
+			sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
+		if (unlikely(*p > end))
+			goto bad;
+
+		ceph_decode_32_safe(p, end, info->dname_len, bad);
+		ceph_decode_need(p, end, info->dname_len, bad);
+		info->dname = *p;
+		*p += info->dname_len;
+		info->dlease = *p;
+		*p += sizeof(*info->dlease);
+	}
+
+	if (info->head->is_target) {
+		err = parse_reply_info_in(p, end, &info->targeti, features);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("problem parsing mds trace %d\n", err);
+	return err;
+}
+
+/*
+ * parse readdir results
+ */
+static int parse_reply_info_dir(void **p, void *end,
+				struct ceph_mds_reply_info_parsed *info,
+				u64 features)
+{
+	u32 num, i = 0;
+	int err;
+
+	info->dir_dir = *p;
+	if (*p + sizeof(*info->dir_dir) > end)
+		goto bad;
+	*p += sizeof(*info->dir_dir) +
+		sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
+	if (*p > end)
+		goto bad;
+
+	ceph_decode_need(p, end, sizeof(num) + 2, bad);
+	num = ceph_decode_32(p);
+	info->dir_end = ceph_decode_8(p);
+	info->dir_complete = ceph_decode_8(p);
+	if (num == 0)
+		goto done;
+
+	BUG_ON(!info->dir_in);
+	info->dir_dname = (void *)(info->dir_in + num);
+	info->dir_dname_len = (void *)(info->dir_dname + num);
+	info->dir_dlease = (void *)(info->dir_dname_len + num);
+	if ((unsigned long)(info->dir_dlease + num) >
+	    (unsigned long)info->dir_in + info->dir_buf_size) {
+		pr_err("dir contents are larger than expected\n");
+		WARN_ON(1);
+		goto bad;
+	}
+
+	info->dir_nr = num;
+	while (num) {
+		/* dentry */
+		ceph_decode_need(p, end, sizeof(u32)*2, bad);
+		info->dir_dname_len[i] = ceph_decode_32(p);
+		ceph_decode_need(p, end, info->dir_dname_len[i], bad);
+		info->dir_dname[i] = *p;
+		*p += info->dir_dname_len[i];
+		dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
+		     info->dir_dname[i]);
+		info->dir_dlease[i] = *p;
+		*p += sizeof(struct ceph_mds_reply_lease);
+
+		/* inode */
+		err = parse_reply_info_in(p, end, &info->dir_in[i], features);
+		if (err < 0)
+			goto out_bad;
+		i++;
+		num--;
+	}
+
+done:
+	if (*p != end)
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("problem parsing dir contents %d\n", err);
+	return err;
+}
+
+/*
+ * parse fcntl F_GETLK results
+ */
+static int parse_reply_info_filelock(void **p, void *end,
+				     struct ceph_mds_reply_info_parsed *info,
+				     u64 features)
+{
+	if (*p + sizeof(*info->filelock_reply) > end)
+		goto bad;
+
+	info->filelock_reply = *p;
+	*p += sizeof(*info->filelock_reply);
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	return -EIO;
+}
+
+/*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info,
+				  u64 features)
+{
+	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+		if (*p == end) {
+			info->has_create_ino = false;
+		} else {
+			info->has_create_ino = true;
+			info->ino = ceph_decode_64(p);
+		}
+	}
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	return -EIO;
+}
+
+/*
+ * parse extra results
+ */
+static int parse_reply_info_extra(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info,
+				  u64 features)
+{
+	if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+		return parse_reply_info_filelock(p, end, info, features);
+	else if (info->head->op == CEPH_MDS_OP_READDIR ||
+		 info->head->op == CEPH_MDS_OP_LSSNAP)
+		return parse_reply_info_dir(p, end, info, features);
+	else if (info->head->op == CEPH_MDS_OP_CREATE)
+		return parse_reply_info_create(p, end, info, features);
+	else
+		return -EIO;
+}
+
+/*
+ * parse entire mds reply
+ */
+static int parse_reply_info(struct ceph_msg *msg,
+			    struct ceph_mds_reply_info_parsed *info,
+			    u64 features)
+{
+	void *p, *end;
+	u32 len;
+	int err;
+
+	info->head = msg->front.iov_base;
+	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
+	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
+
+	/* trace */
+	ceph_decode_32_safe(&p, end, len, bad);
+	if (len > 0) {
+		ceph_decode_need(&p, end, len, bad);
+		err = parse_reply_info_trace(&p, p+len, info, features);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	/* extra */
+	ceph_decode_32_safe(&p, end, len, bad);
+	if (len > 0) {
+		ceph_decode_need(&p, end, len, bad);
+		err = parse_reply_info_extra(&p, p+len, info, features);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	/* snap blob */
+	ceph_decode_32_safe(&p, end, len, bad);
+	info->snapblob_len = len;
+	info->snapblob = p;
+	p += len;
+
+	if (p != end)
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("mds parse_reply err %d\n", err);
+	return err;
+}
+
+static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
+{
+	if (!info->dir_in)
+		return;
+	free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
+}
+
+
+/*
+ * sessions
+ */
+const char *ceph_session_state_name(int s)
+{
+	switch (s) {
+	case CEPH_MDS_SESSION_NEW: return "new";
+	case CEPH_MDS_SESSION_OPENING: return "opening";
+	case CEPH_MDS_SESSION_OPEN: return "open";
+	case CEPH_MDS_SESSION_HUNG: return "hung";
+	case CEPH_MDS_SESSION_CLOSING: return "closing";
+	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
+	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+	default: return "???";
+	}
+}
+
+static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+{
+	if (atomic_inc_not_zero(&s->s_ref)) {
+		dout("mdsc get_session %p %d -> %d\n", s,
+		     atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+		return s;
+	} else {
+		dout("mdsc get_session %p 0 -- FAIL", s);
+		return NULL;
+	}
+}
+
+void ceph_put_mds_session(struct ceph_mds_session *s)
+{
+	dout("mdsc put_session %p %d -> %d\n", s,
+	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
+	if (atomic_dec_and_test(&s->s_ref)) {
+		if (s->s_auth.authorizer)
+			ceph_auth_destroy_authorizer(
+				s->s_mdsc->fsc->client->monc.auth,
+				s->s_auth.authorizer);
+		kfree(s);
+	}
+}
+
+/*
+ * called under mdsc->mutex
+ */
+struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
+						   int mds)
+{
+	struct ceph_mds_session *session;
+
+	if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
+		return NULL;
+	session = mdsc->sessions[mds];
+	dout("lookup_mds_session %p %d\n", session,
+	     atomic_read(&session->s_ref));
+	get_session(session);
+	return session;
+}
+
+static bool __have_session(struct ceph_mds_client *mdsc, int mds)
+{
+	if (mds >= mdsc->max_sessions)
+		return false;
+	return mdsc->sessions[mds];
+}
+
+static int __verify_registered_session(struct ceph_mds_client *mdsc,
+				       struct ceph_mds_session *s)
+{
+	if (s->s_mds >= mdsc->max_sessions ||
+	    mdsc->sessions[s->s_mds] != s)
+		return -ENOENT;
+	return 0;
+}
+
+/*
+ * create+register a new session for given mds.
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
+						 int mds)
+{
+	struct ceph_mds_session *s;
+
+	if (mds >= mdsc->mdsmap->m_max_mds)
+		return ERR_PTR(-EINVAL);
+
+	s = kzalloc(sizeof(*s), GFP_NOFS);
+	if (!s)
+		return ERR_PTR(-ENOMEM);
+	s->s_mdsc = mdsc;
+	s->s_mds = mds;
+	s->s_state = CEPH_MDS_SESSION_NEW;
+	s->s_ttl = 0;
+	s->s_seq = 0;
+	mutex_init(&s->s_mutex);
+
+	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
+
+	spin_lock_init(&s->s_gen_ttl_lock);
+	s->s_cap_gen = 0;
+	s->s_cap_ttl = jiffies - 1;
+
+	spin_lock_init(&s->s_cap_lock);
+	s->s_renew_requested = 0;
+	s->s_renew_seq = 0;
+	INIT_LIST_HEAD(&s->s_caps);
+	s->s_nr_caps = 0;
+	s->s_trim_caps = 0;
+	atomic_set(&s->s_ref, 1);
+	INIT_LIST_HEAD(&s->s_waiting);
+	INIT_LIST_HEAD(&s->s_unsafe);
+	s->s_num_cap_releases = 0;
+	s->s_cap_reconnect = 0;
+	s->s_cap_iterator = NULL;
+	INIT_LIST_HEAD(&s->s_cap_releases);
+	INIT_LIST_HEAD(&s->s_cap_releases_done);
+	INIT_LIST_HEAD(&s->s_cap_flushing);
+	INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
+
+	dout("register_session mds%d\n", mds);
+	if (mds >= mdsc->max_sessions) {
+		int newmax = 1 << get_count_order(mds+1);
+		struct ceph_mds_session **sa;
+
+		dout("register_session realloc to %d\n", newmax);
+		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+		if (sa == NULL)
+			goto fail_realloc;
+		if (mdsc->sessions) {
+			memcpy(sa, mdsc->sessions,
+			       mdsc->max_sessions * sizeof(void *));
+			kfree(mdsc->sessions);
+		}
+		mdsc->sessions = sa;
+		mdsc->max_sessions = newmax;
+	}
+	mdsc->sessions[mds] = s;
+	atomic_inc(&mdsc->num_sessions);
+	atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
+
+	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
+		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+	return s;
+
+fail_realloc:
+	kfree(s);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __unregister_session(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_session *s)
+{
+	dout("__unregister_session mds%d %p\n", s->s_mds, s);
+	BUG_ON(mdsc->sessions[s->s_mds] != s);
+	mdsc->sessions[s->s_mds] = NULL;
+	ceph_con_close(&s->s_con);
+	ceph_put_mds_session(s);
+	atomic_dec(&mdsc->num_sessions);
+}
+
+/*
+ * drop session refs in request.
+ *
+ * should be last request ref, or hold mdsc->mutex
+ */
+static void put_request_session(struct ceph_mds_request *req)
+{
+	if (req->r_session) {
+		ceph_put_mds_session(req->r_session);
+		req->r_session = NULL;
+	}
+}
+
+void ceph_mdsc_release_request(struct kref *kref)
+{
+	struct ceph_mds_request *req = container_of(kref,
+						    struct ceph_mds_request,
+						    r_kref);
+	destroy_reply_info(&req->r_reply_info);
+	if (req->r_request)
+		ceph_msg_put(req->r_request);
+	if (req->r_reply)
+		ceph_msg_put(req->r_reply);
+	if (req->r_inode) {
+		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+		iput(req->r_inode);
+	}
+	if (req->r_locked_dir)
+		ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+	iput(req->r_target_inode);
+	if (req->r_dentry)
+		dput(req->r_dentry);
+	if (req->r_old_dentry)
+		dput(req->r_old_dentry);
+	if (req->r_old_dentry_dir) {
+		/*
+		 * track (and drop pins for) r_old_dentry_dir
+		 * separately, since r_old_dentry's d_parent may have
+		 * changed between the dir mutex being dropped and
+		 * this request being freed.
+		 */
+		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
+				  CEPH_CAP_PIN);
+		iput(req->r_old_dentry_dir);
+	}
+	kfree(req->r_path1);
+	kfree(req->r_path2);
+	if (req->r_pagelist)
+		ceph_pagelist_release(req->r_pagelist);
+	put_request_session(req);
+	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
+	kfree(req);
+}
+
+/*
+ * lookup session, bump ref if found.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
+					     u64 tid)
+{
+	struct ceph_mds_request *req;
+	struct rb_node *n = mdsc->request_tree.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_mds_request, r_node);
+		if (tid < req->r_tid)
+			n = n->rb_left;
+		else if (tid > req->r_tid)
+			n = n->rb_right;
+		else {
+			ceph_mdsc_get_request(req);
+			return req;
+		}
+	}
+	return NULL;
+}
+
+static void __insert_request(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_request *new)
+{
+	struct rb_node **p = &mdsc->request_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_mds_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_mds_request, r_node);
+		if (new->r_tid < req->r_tid)
+			p = &(*p)->rb_left;
+		else if (new->r_tid > req->r_tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, &mdsc->request_tree);
+}
+
+/*
+ * Register an in-flight request, and assign a tid.  Link to directory
+ * are modifying (if any).
+ *
+ * Called under mdsc->mutex.
+ */
+static void __register_request(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_request *req,
+			       struct inode *dir)
+{
+	req->r_tid = ++mdsc->last_tid;
+	if (req->r_num_caps)
+		ceph_reserve_caps(mdsc, &req->r_caps_reservation,
+				  req->r_num_caps);
+	dout("__register_request %p tid %lld\n", req, req->r_tid);
+	ceph_mdsc_get_request(req);
+	__insert_request(mdsc, req);
+
+	req->r_uid = current_fsuid();
+	req->r_gid = current_fsgid();
+
+	if (dir) {
+		struct ceph_inode_info *ci = ceph_inode(dir);
+
+		ihold(dir);
+		spin_lock(&ci->i_unsafe_lock);
+		req->r_unsafe_dir = dir;
+		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
+}
+
+static void __unregister_request(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_request *req)
+{
+	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+	rb_erase(&req->r_node, &mdsc->request_tree);
+	RB_CLEAR_NODE(&req->r_node);
+
+	if (req->r_unsafe_dir) {
+		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
+
+		spin_lock(&ci->i_unsafe_lock);
+		list_del_init(&req->r_unsafe_dir_item);
+		spin_unlock(&ci->i_unsafe_lock);
+
+		iput(req->r_unsafe_dir);
+		req->r_unsafe_dir = NULL;
+	}
+
+	complete_all(&req->r_safe_completion);
+
+	ceph_mdsc_put_request(req);
+}
+
+/*
+ * Choose mds to send request to next.  If there is a hint set in the
+ * request (e.g., due to a prior forward hint from the mds), use that.
+ * Otherwise, consult frag tree and/or caps to identify the
+ * appropriate mds.  If all else fails, choose randomly.
+ *
+ * Called under mdsc->mutex.
+ */
+static struct dentry *get_nonsnap_parent(struct dentry *dentry)
+{
+	/*
+	 * we don't need to worry about protecting the d_parent access
+	 * here because we never renaming inside the snapped namespace
+	 * except to resplice to another snapdir, and either the old or new
+	 * result is a valid result.
+	 */
+	while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+		dentry = dentry->d_parent;
+	return dentry;
+}
+
+static int __choose_mds(struct ceph_mds_client *mdsc,
+			struct ceph_mds_request *req)
+{
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_cap *cap;
+	int mode = req->r_direct_mode;
+	int mds = -1;
+	u32 hash = req->r_direct_hash;
+	bool is_hash = req->r_direct_is_hash;
+
+	/*
+	 * is there a specific mds we should try?  ignore hint if we have
+	 * no session and the mds is not up (active or recovering).
+	 */
+	if (req->r_resend_mds >= 0 &&
+	    (__have_session(mdsc, req->r_resend_mds) ||
+	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
+		dout("choose_mds using resend_mds mds%d\n",
+		     req->r_resend_mds);
+		return req->r_resend_mds;
+	}
+
+	if (mode == USE_RANDOM_MDS)
+		goto random;
+
+	inode = NULL;
+	if (req->r_inode) {
+		inode = req->r_inode;
+	} else if (req->r_dentry) {
+		/* ignore race with rename; old or new d_parent is okay */
+		struct dentry *parent = req->r_dentry->d_parent;
+		struct inode *dir = d_inode(parent);
+
+		if (dir->i_sb != mdsc->fsc->sb) {
+			/* not this fs! */
+			inode = d_inode(req->r_dentry);
+		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
+			/* direct snapped/virtual snapdir requests
+			 * based on parent dir inode */
+			struct dentry *dn = get_nonsnap_parent(parent);
+			inode = d_inode(dn);
+			dout("__choose_mds using nonsnap parent %p\n", inode);
+		} else {
+			/* dentry target */
+			inode = d_inode(req->r_dentry);
+			if (!inode || mode == USE_AUTH_MDS) {
+				/* dir + name */
+				inode = dir;
+				hash = ceph_dentry_hash(dir, req->r_dentry);
+				is_hash = true;
+			}
+		}
+	}
+
+	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
+	     (int)hash, mode);
+	if (!inode)
+		goto random;
+	ci = ceph_inode(inode);
+
+	if (is_hash && S_ISDIR(inode->i_mode)) {
+		struct ceph_inode_frag frag;
+		int found;
+
+		ceph_choose_frag(ci, hash, &frag, &found);
+		if (found) {
+			if (mode == USE_ANY_MDS && frag.ndist > 0) {
+				u8 r;
+
+				/* choose a random replica */
+				get_random_bytes(&r, 1);
+				r %= frag.ndist;
+				mds = frag.dist[r];
+				dout("choose_mds %p %llx.%llx "
+				     "frag %u mds%d (%d/%d)\n",
+				     inode, ceph_vinop(inode),
+				     frag.frag, mds,
+				     (int)r, frag.ndist);
+				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+				    CEPH_MDS_STATE_ACTIVE)
+					return mds;
+			}
+
+			/* since this file/dir wasn't known to be
+			 * replicated, then we want to look for the
+			 * authoritative mds. */
+			mode = USE_AUTH_MDS;
+			if (frag.mds >= 0) {
+				/* choose auth mds */
+				mds = frag.mds;
+				dout("choose_mds %p %llx.%llx "
+				     "frag %u mds%d (auth)\n",
+				     inode, ceph_vinop(inode), frag.frag, mds);
+				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+				    CEPH_MDS_STATE_ACTIVE)
+					return mds;
+			}
+		}
+	}
+
+	spin_lock(&ci->i_ceph_lock);
+	cap = NULL;
+	if (mode == USE_AUTH_MDS)
+		cap = ci->i_auth_cap;
+	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
+		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
+	if (!cap) {
+		spin_unlock(&ci->i_ceph_lock);
+		goto random;
+	}
+	mds = cap->session->s_mds;
+	dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+	     inode, ceph_vinop(inode), mds,
+	     cap == ci->i_auth_cap ? "auth " : "", cap);
+	spin_unlock(&ci->i_ceph_lock);
+	return mds;
+
+random:
+	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
+	dout("choose_mds chose random mds%d\n", mds);
+	return mds;
+}
+
+
+/*
+ * session messages
+ */
+static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_session_head *h;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
+			   false);
+	if (!msg) {
+		pr_err("create_session_msg ENOMEM creating msg\n");
+		return NULL;
+	}
+	h = msg->front.iov_base;
+	h->op = cpu_to_le32(op);
+	h->seq = cpu_to_le64(seq);
+
+	return msg;
+}
+
+/*
+ * session message, specialization for CEPH_SESSION_REQUEST_OPEN
+ * to include additional client metadata fields.
+ */
+static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_session_head *h;
+	int i = -1;
+	int metadata_bytes = 0;
+	int metadata_key_count = 0;
+	struct ceph_options *opt = mdsc->fsc->client->options;
+	void *p;
+
+	const char* metadata[][2] = {
+		{"hostname", utsname()->nodename},
+		{"kernel_version", utsname()->release},
+		{"entity_id", opt->name ? opt->name : ""},
+		{NULL, NULL}
+	};
+
+	/* Calculate serialized length of metadata */
+	metadata_bytes = 4;  /* map length */
+	for (i = 0; metadata[i][0] != NULL; ++i) {
+		metadata_bytes += 8 + strlen(metadata[i][0]) +
+			strlen(metadata[i][1]);
+		metadata_key_count++;
+	}
+
+	/* Allocate the message */
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes,
+			   GFP_NOFS, false);
+	if (!msg) {
+		pr_err("create_session_msg ENOMEM creating msg\n");
+		return NULL;
+	}
+	h = msg->front.iov_base;
+	h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
+	h->seq = cpu_to_le64(seq);
+
+	/*
+	 * Serialize client metadata into waiting buffer space, using
+	 * the format that userspace expects for map<string, string>
+	 *
+	 * ClientSession messages with metadata are v2
+	 */
+	msg->hdr.version = cpu_to_le16(2);
+	msg->hdr.compat_version = cpu_to_le16(1);
+
+	/* The write pointer, following the session_head structure */
+	p = msg->front.iov_base + sizeof(*h);
+
+	/* Number of entries in the map */
+	ceph_encode_32(&p, metadata_key_count);
+
+	/* Two length-prefixed strings for each entry in the map */
+	for (i = 0; metadata[i][0] != NULL; ++i) {
+		size_t const key_len = strlen(metadata[i][0]);
+		size_t const val_len = strlen(metadata[i][1]);
+
+		ceph_encode_32(&p, key_len);
+		memcpy(p, metadata[i][0], key_len);
+		p += key_len;
+		ceph_encode_32(&p, val_len);
+		memcpy(p, metadata[i][1], val_len);
+		p += val_len;
+	}
+
+	return msg;
+}
+
+/*
+ * send session open request.
+ *
+ * called under mdsc->mutex
+ */
+static int __open_session(struct ceph_mds_client *mdsc,
+			  struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	int mstate;
+	int mds = session->s_mds;
+
+	/* wait for mds to go active? */
+	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
+	dout("open_session to mds%d (%s)\n", mds,
+	     ceph_mds_state_name(mstate));
+	session->s_state = CEPH_MDS_SESSION_OPENING;
+	session->s_renew_requested = jiffies;
+
+	/* send connect message */
+	msg = create_session_open_msg(mdsc, session->s_seq);
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+/*
+ * open sessions for any export targets for the given mds
+ *
+ * called under mdsc->mutex
+ */
+static struct ceph_mds_session *
+__open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+	struct ceph_mds_session *session;
+
+	session = __ceph_lookup_mds_session(mdsc, target);
+	if (!session) {
+		session = register_session(mdsc, target);
+		if (IS_ERR(session))
+			return session;
+	}
+	if (session->s_state == CEPH_MDS_SESSION_NEW ||
+	    session->s_state == CEPH_MDS_SESSION_CLOSING)
+		__open_session(mdsc, session);
+
+	return session;
+}
+
+struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+	struct ceph_mds_session *session;
+
+	dout("open_export_target_session to mds%d\n", target);
+
+	mutex_lock(&mdsc->mutex);
+	session = __open_export_target_session(mdsc, target);
+	mutex_unlock(&mdsc->mutex);
+
+	return session;
+}
+
+static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
+					  struct ceph_mds_session *session)
+{
+	struct ceph_mds_info *mi;
+	struct ceph_mds_session *ts;
+	int i, mds = session->s_mds;
+
+	if (mds >= mdsc->mdsmap->m_max_mds)
+		return;
+
+	mi = &mdsc->mdsmap->m_info[mds];
+	dout("open_export_target_sessions for mds%d (%d targets)\n",
+	     session->s_mds, mi->num_export_targets);
+
+	for (i = 0; i < mi->num_export_targets; i++) {
+		ts = __open_export_target_session(mdsc, mi->export_targets[i]);
+		if (!IS_ERR(ts))
+			ceph_put_mds_session(ts);
+	}
+}
+
+void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+					   struct ceph_mds_session *session)
+{
+	mutex_lock(&mdsc->mutex);
+	__open_export_target_sessions(mdsc, session);
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * session caps
+ */
+
+/*
+ * Free preallocated cap messages assigned to this session
+ */
+static void cleanup_cap_releases(struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+
+	spin_lock(&session->s_cap_lock);
+	while (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+	}
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				       struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+	}
+	spin_unlock(&session->s_cap_lock);
+}
+
+static void cleanup_session_requests(struct ceph_mds_client *mdsc,
+				     struct ceph_mds_session *session)
+{
+	struct ceph_mds_request *req;
+	struct rb_node *p;
+
+	dout("cleanup_session_requests mds%d\n", session->s_mds);
+	mutex_lock(&mdsc->mutex);
+	while (!list_empty(&session->s_unsafe)) {
+		req = list_first_entry(&session->s_unsafe,
+				       struct ceph_mds_request, r_unsafe_item);
+		list_del_init(&req->r_unsafe_item);
+		pr_info(" dropping unsafe request %llu\n", req->r_tid);
+		__unregister_request(mdsc, req);
+	}
+	/* zero r_attempts, so kick_requests() will re-send requests */
+	p = rb_first(&mdsc->request_tree);
+	while (p) {
+		req = rb_entry(p, struct ceph_mds_request, r_node);
+		p = rb_next(p);
+		if (req->r_session &&
+		    req->r_session->s_mds == session->s_mds)
+			req->r_attempts = 0;
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Helper to safely iterate over all caps associated with a session, with
+ * special care taken to handle a racing __ceph_remove_cap().
+ *
+ * Caller must hold session s_mutex.
+ */
+static int iterate_session_caps(struct ceph_mds_session *session,
+				 int (*cb)(struct inode *, struct ceph_cap *,
+					    void *), void *arg)
+{
+	struct list_head *p;
+	struct ceph_cap *cap;
+	struct inode *inode, *last_inode = NULL;
+	struct ceph_cap *old_cap = NULL;
+	int ret;
+
+	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+	spin_lock(&session->s_cap_lock);
+	p = session->s_caps.next;
+	while (p != &session->s_caps) {
+		cap = list_entry(p, struct ceph_cap, session_caps);
+		inode = igrab(&cap->ci->vfs_inode);
+		if (!inode) {
+			p = p->next;
+			continue;
+		}
+		session->s_cap_iterator = cap;
+		spin_unlock(&session->s_cap_lock);
+
+		if (last_inode) {
+			iput(last_inode);
+			last_inode = NULL;
+		}
+		if (old_cap) {
+			ceph_put_cap(session->s_mdsc, old_cap);
+			old_cap = NULL;
+		}
+
+		ret = cb(inode, cap, arg);
+		last_inode = inode;
+
+		spin_lock(&session->s_cap_lock);
+		p = p->next;
+		if (cap->ci == NULL) {
+			dout("iterate_session_caps  finishing cap %p removal\n",
+			     cap);
+			BUG_ON(cap->session != session);
+			list_del_init(&cap->session_caps);
+			session->s_nr_caps--;
+			cap->session = NULL;
+			old_cap = cap;  /* put_cap it w/o locks held */
+		}
+		if (ret < 0)
+			goto out;
+	}
+	ret = 0;
+out:
+	session->s_cap_iterator = NULL;
+	spin_unlock(&session->s_cap_lock);
+
+	iput(last_inode);
+	if (old_cap)
+		ceph_put_cap(session->s_mdsc, old_cap);
+
+	return ret;
+}
+
+static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
+				  void *arg)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int drop = 0;
+
+	dout("removing cap %p, ci is %p, inode is %p\n",
+	     cap, ci, &ci->vfs_inode);
+	spin_lock(&ci->i_ceph_lock);
+	__ceph_remove_cap(cap, false);
+	if (!ci->i_auth_cap) {
+		struct ceph_mds_client *mdsc =
+			ceph_sb_to_client(inode->i_sb)->mdsc;
+
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			pr_info(" dropping dirty %s state for %p %lld\n",
+				ceph_cap_string(ci->i_dirty_caps),
+				inode, ceph_ino(inode));
+			ci->i_dirty_caps = 0;
+			list_del_init(&ci->i_dirty_item);
+			drop = 1;
+		}
+		if (!list_empty(&ci->i_flushing_item)) {
+			pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+				ceph_cap_string(ci->i_flushing_caps),
+				inode, ceph_ino(inode));
+			ci->i_flushing_caps = 0;
+			list_del_init(&ci->i_flushing_item);
+			mdsc->num_cap_flushing--;
+			drop = 1;
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	while (drop--)
+		iput(inode);
+	return 0;
+}
+
+/*
+ * caller must hold session s_mutex
+ */
+static void remove_session_caps(struct ceph_mds_session *session)
+{
+	dout("remove_session_caps on %p\n", session);
+	iterate_session_caps(session, remove_session_caps_cb, NULL);
+
+	spin_lock(&session->s_cap_lock);
+	if (session->s_nr_caps > 0) {
+		struct super_block *sb = session->s_mdsc->fsc->sb;
+		struct inode *inode;
+		struct ceph_cap *cap, *prev = NULL;
+		struct ceph_vino vino;
+		/*
+		 * iterate_session_caps() skips inodes that are being
+		 * deleted, we need to wait until deletions are complete.
+		 * __wait_on_freeing_inode() is designed for the job,
+		 * but it is not exported, so use lookup inode function
+		 * to access it.
+		 */
+		while (!list_empty(&session->s_caps)) {
+			cap = list_entry(session->s_caps.next,
+					 struct ceph_cap, session_caps);
+			if (cap == prev)
+				break;
+			prev = cap;
+			vino = cap->ci->i_vino;
+			spin_unlock(&session->s_cap_lock);
+
+			inode = ceph_find_inode(sb, vino);
+			iput(inode);
+
+			spin_lock(&session->s_cap_lock);
+		}
+	}
+	spin_unlock(&session->s_cap_lock);
+
+	BUG_ON(session->s_nr_caps > 0);
+	BUG_ON(!list_empty(&session->s_cap_flushing));
+	cleanup_cap_releases(session);
+}
+
+/*
+ * wake up any threads waiting on this session's caps.  if the cap is
+ * old (didn't get renewed on the client reconnect), remove it now.
+ *
+ * caller must hold s_mutex.
+ */
+static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
+			      void *arg)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	wake_up_all(&ci->i_cap_wq);
+	if (arg) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_wanted_max_size = 0;
+		ci->i_requested_max_size = 0;
+		spin_unlock(&ci->i_ceph_lock);
+	}
+	return 0;
+}
+
+static void wake_up_session_caps(struct ceph_mds_session *session,
+				 int reconnect)
+{
+	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+	iterate_session_caps(session, wake_up_session_cb,
+			     (void *)(unsigned long)reconnect);
+}
+
+/*
+ * Send periodic message to MDS renewing all currently held caps.  The
+ * ack will reset the expiration for all caps from this session.
+ *
+ * caller holds s_mutex
+ */
+static int send_renew_caps(struct ceph_mds_client *mdsc,
+			   struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	int state;
+
+	if (time_after_eq(jiffies, session->s_cap_ttl) &&
+	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
+		pr_info("mds%d caps stale\n", session->s_mds);
+	session->s_renew_requested = jiffies;
+
+	/* do not try to renew caps until a recovering mds has reconnected
+	 * with its clients. */
+	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
+	if (state < CEPH_MDS_STATE_RECONNECT) {
+		dout("send_renew_caps ignoring mds%d (%s)\n",
+		     session->s_mds, ceph_mds_state_name(state));
+		return 0;
+	}
+
+	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
+		ceph_mds_state_name(state));
+	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+				 ++session->s_renew_seq);
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session, u64 seq)
+{
+	struct ceph_msg *msg;
+
+	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
+	     session->s_mds, ceph_session_state_name(session->s_state), seq);
+	msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+
+/*
+ * Note new cap ttl, and any transition from stale -> not stale (fresh?).
+ *
+ * Called under session->s_mutex
+ */
+static void renewed_caps(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session, int is_renew)
+{
+	int was_stale;
+	int wake = 0;
+
+	spin_lock(&session->s_cap_lock);
+	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
+
+	session->s_cap_ttl = session->s_renew_requested +
+		mdsc->mdsmap->m_session_timeout*HZ;
+
+	if (was_stale) {
+		if (time_before(jiffies, session->s_cap_ttl)) {
+			pr_info("mds%d caps renewed\n", session->s_mds);
+			wake = 1;
+		} else {
+			pr_info("mds%d caps still stale\n", session->s_mds);
+		}
+	}
+	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
+	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
+	     time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+	spin_unlock(&session->s_cap_lock);
+
+	if (wake)
+		wake_up_session_caps(session, 0);
+}
+
+/*
+ * send a session close request
+ */
+static int request_close_session(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+
+	dout("request_close_session mds%d state %s seq %lld\n",
+	     session->s_mds, ceph_session_state_name(session->s_state),
+	     session->s_seq);
+	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+/*
+ * Called with s_mutex held.
+ */
+static int __close_session(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session)
+{
+	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
+		return 0;
+	session->s_state = CEPH_MDS_SESSION_CLOSING;
+	return request_close_session(mdsc, session);
+}
+
+/*
+ * Trim old(er) caps.
+ *
+ * Because we can't cache an inode without one or more caps, we do
+ * this indirectly: if a cap is unused, we prune its aliases, at which
+ * point the inode will hopefully get dropped to.
+ *
+ * Yes, this is a bit sloppy.  Our only real goal here is to respond to
+ * memory pressure from the MDS, though, so it needn't be perfect.
+ */
+static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+{
+	struct ceph_mds_session *session = arg;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int used, wanted, oissued, mine;
+
+	if (session->s_trim_caps <= 0)
+		return -1;
+
+	spin_lock(&ci->i_ceph_lock);
+	mine = cap->issued | cap->implemented;
+	used = __ceph_caps_used(ci);
+	wanted = __ceph_caps_file_wanted(ci);
+	oissued = __ceph_caps_issued_other(ci, cap);
+
+	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
+	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
+	     ceph_cap_string(used), ceph_cap_string(wanted));
+	if (cap == ci->i_auth_cap) {
+		if (ci->i_dirty_caps | ci->i_flushing_caps)
+			goto out;
+		if ((used | wanted) & CEPH_CAP_ANY_WR)
+			goto out;
+	}
+	if ((used | wanted) & ~oissued & mine)
+		goto out;   /* we need these caps */
+
+	session->s_trim_caps--;
+	if (oissued) {
+		/* we aren't the only cap.. just remove us */
+		__ceph_remove_cap(cap, true);
+	} else {
+		/* try to drop referring dentries */
+		spin_unlock(&ci->i_ceph_lock);
+		d_prune_aliases(inode);
+		dout("trim_caps_cb %p cap %p  pruned, count now %d\n",
+		     inode, cap, atomic_read(&inode->i_count));
+		return 0;
+	}
+
+out:
+	spin_unlock(&ci->i_ceph_lock);
+	return 0;
+}
+
+/*
+ * Trim session cap count down to some max number.
+ */
+static int trim_caps(struct ceph_mds_client *mdsc,
+		     struct ceph_mds_session *session,
+		     int max_caps)
+{
+	int trim_caps = session->s_nr_caps - max_caps;
+
+	dout("trim_caps mds%d start: %d / %d, trim %d\n",
+	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+	if (trim_caps > 0) {
+		session->s_trim_caps = trim_caps;
+		iterate_session_caps(session, trim_caps_cb, session);
+		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
+		     session->s_mds, session->s_nr_caps, max_caps,
+			trim_caps - session->s_trim_caps);
+		session->s_trim_caps = 0;
+	}
+
+	ceph_add_cap_releases(mdsc, session);
+	ceph_send_cap_releases(mdsc, session);
+	return 0;
+}
+
+/*
+ * Allocate cap_release messages.  If there is a partially full message
+ * in the queue, try to allocate enough to cover it's remainder, so that
+ * we can send it immediately.
+ *
+ * Called under s_mutex.
+ */
+int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+			  struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg, *partial = NULL;
+	struct ceph_mds_cap_release *head;
+	int err = -ENOMEM;
+	int extra = mdsc->fsc->mount_options->cap_release_safety;
+	int num;
+
+	dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
+	     extra);
+
+	spin_lock(&session->s_cap_lock);
+
+	if (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg,
+				 list_head);
+		head = msg->front.iov_base;
+		num = le32_to_cpu(head->num);
+		if (num) {
+			dout(" partial %p with (%d/%d)\n", msg, num,
+			     (int)CEPH_CAPS_PER_RELEASE);
+			extra += CEPH_CAPS_PER_RELEASE - num;
+			partial = msg;
+		}
+	}
+	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
+		spin_unlock(&session->s_cap_lock);
+		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
+				   GFP_NOFS, false);
+		if (!msg)
+			goto out_unlocked;
+		dout("add_cap_releases %p msg %p now %d\n", session, msg,
+		     (int)msg->front.iov_len);
+		head = msg->front.iov_base;
+		head->num = cpu_to_le32(0);
+		msg->front.iov_len = sizeof(*head);
+		spin_lock(&session->s_cap_lock);
+		list_add(&msg->list_head, &session->s_cap_releases);
+		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
+	}
+
+	if (partial) {
+		head = partial->front.iov_base;
+		num = le32_to_cpu(head->num);
+		dout(" queueing partial %p with %d/%d\n", partial, num,
+		     (int)CEPH_CAPS_PER_RELEASE);
+		list_move_tail(&partial->list_head,
+			       &session->s_cap_releases_done);
+		session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
+	}
+	err = 0;
+	spin_unlock(&session->s_cap_lock);
+out_unlocked:
+	return err;
+}
+
+static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret;
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->i_flushing_caps)
+		ret = ci->i_cap_flush_seq >= want_flush_seq;
+	else
+		ret = 1;
+	spin_unlock(&ci->i_ceph_lock);
+	return ret;
+}
+
+/*
+ * flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
+ */
+static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+{
+	int mds;
+
+	dout("check_cap_flush want %lld\n", want_flush_seq);
+	mutex_lock(&mdsc->mutex);
+	for (mds = 0; mds < mdsc->max_sessions; mds++) {
+		struct ceph_mds_session *session = mdsc->sessions[mds];
+		struct inode *inode = NULL;
+
+		if (!session)
+			continue;
+		get_session(session);
+		mutex_unlock(&mdsc->mutex);
+
+		mutex_lock(&session->s_mutex);
+		if (!list_empty(&session->s_cap_flushing)) {
+			struct ceph_inode_info *ci =
+				list_entry(session->s_cap_flushing.next,
+					   struct ceph_inode_info,
+					   i_flushing_item);
+
+			if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
+				dout("check_cap_flush still flushing %p "
+				     "seq %lld <= %lld to mds%d\n",
+				     &ci->vfs_inode, ci->i_cap_flush_seq,
+				     want_flush_seq, session->s_mds);
+				inode = igrab(&ci->vfs_inode);
+			}
+		}
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+
+		if (inode) {
+			wait_event(mdsc->cap_flushing_wq,
+				   check_cap_flush(inode, want_flush_seq));
+			iput(inode);
+		}
+
+		mutex_lock(&mdsc->mutex);
+	}
+
+	mutex_unlock(&mdsc->mutex);
+	dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+}
+
+/*
+ * called under s_mutex
+ */
+void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+			    struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+
+	dout("send_cap_releases mds%d\n", session->s_mds);
+	spin_lock(&session->s_cap_lock);
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				 struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		spin_unlock(&session->s_cap_lock);
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+		ceph_con_send(&session->s_con, msg);
+		spin_lock(&session->s_cap_lock);
+	}
+	spin_unlock(&session->s_cap_lock);
+}
+
+static void discard_cap_releases(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	unsigned num;
+
+	dout("discard_cap_releases mds%d\n", session->s_mds);
+
+	if (!list_empty(&session->s_cap_releases)) {
+		/* zero out the in-progress message */
+		msg = list_first_entry(&session->s_cap_releases,
+					struct ceph_msg, list_head);
+		head = msg->front.iov_base;
+		num = le32_to_cpu(head->num);
+		dout("discard_cap_releases mds%d %p %u\n",
+		     session->s_mds, msg, num);
+		head->num = cpu_to_le32(0);
+		msg->front.iov_len = sizeof(*head);
+		session->s_num_cap_releases += num;
+	}
+
+	/* requeue completed messages */
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				 struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+
+		head = msg->front.iov_base;
+		num = le32_to_cpu(head->num);
+		dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
+		     num);
+		session->s_num_cap_releases += num;
+		head->num = cpu_to_le32(0);
+		msg->front.iov_len = sizeof(*head);
+		list_add(&msg->list_head, &session->s_cap_releases);
+	}
+}
+
+/*
+ * requests
+ */
+
+int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+				    struct inode *dir)
+{
+	struct ceph_inode_info *ci = ceph_inode(dir);
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
+	size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
+		      sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+	int order, num_entries;
+
+	spin_lock(&ci->i_ceph_lock);
+	num_entries = ci->i_files + ci->i_subdirs;
+	spin_unlock(&ci->i_ceph_lock);
+	num_entries = max(num_entries, 1);
+	num_entries = min(num_entries, opt->max_readdir);
+
+	order = get_order(size * num_entries);
+	while (order >= 0) {
+		rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
+							order);
+		if (rinfo->dir_in)
+			break;
+		order--;
+	}
+	if (!rinfo->dir_in)
+		return -ENOMEM;
+
+	num_entries = (PAGE_SIZE << order) / size;
+	num_entries = min(num_entries, opt->max_readdir);
+
+	rinfo->dir_buf_size = PAGE_SIZE << order;
+	req->r_num_caps = num_entries + 1;
+	req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
+	req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
+	return 0;
+}
+
+/*
+ * Create an mds request.
+ */
+struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
+{
+	struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+
+	if (!req)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&req->r_fill_mutex);
+	req->r_mdsc = mdsc;
+	req->r_started = jiffies;
+	req->r_resend_mds = -1;
+	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+	req->r_fmode = -1;
+	kref_init(&req->r_kref);
+	INIT_LIST_HEAD(&req->r_wait);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+
+	req->r_stamp = CURRENT_TIME;
+
+	req->r_op = op;
+	req->r_direct_mode = mode;
+	return req;
+}
+
+/*
+ * return oldest (lowest) request, tid in request tree, 0 if none.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
+{
+	if (RB_EMPTY_ROOT(&mdsc->request_tree))
+		return NULL;
+	return rb_entry(rb_first(&mdsc->request_tree),
+			struct ceph_mds_request, r_node);
+}
+
+static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_request *req = __get_oldest_req(mdsc);
+
+	if (req)
+		return req->r_tid;
+	return 0;
+}
+
+/*
+ * Build a dentry's path.  Allocate on heap; caller must kfree.  Based
+ * on build_path_from_dentry in fs/cifs/dir.c.
+ *
+ * If @stop_on_nosnap, generate path relative to the first non-snapped
+ * inode.
+ *
+ * Encode hidden .snap dirs as a double /, i.e.
+ *   foo/.snap/bar -> foo//bar
+ */
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+			   int stop_on_nosnap)
+{
+	struct dentry *temp;
+	char *path;
+	int len, pos;
+	unsigned seq;
+
+	if (dentry == NULL)
+		return ERR_PTR(-EINVAL);
+
+retry:
+	len = 0;
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
+	for (temp = dentry; !IS_ROOT(temp);) {
+		struct inode *inode = d_inode(temp);
+		if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
+			len++;  /* slash only */
+		else if (stop_on_nosnap && inode &&
+			 ceph_snap(inode) == CEPH_NOSNAP)
+			break;
+		else
+			len += 1 + temp->d_name.len;
+		temp = temp->d_parent;
+	}
+	rcu_read_unlock();
+	if (len)
+		len--;  /* no leading '/' */
+
+	path = kmalloc(len+1, GFP_NOFS);
+	if (path == NULL)
+		return ERR_PTR(-ENOMEM);
+	pos = len;
+	path[pos] = 0;	/* trailing null */
+	rcu_read_lock();
+	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+		struct inode *inode;
+
+		spin_lock(&temp->d_lock);
+		inode = d_inode(temp);
+		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
+			dout("build_path path+%d: %p SNAPDIR\n",
+			     pos, temp);
+		} else if (stop_on_nosnap && inode &&
+			   ceph_snap(inode) == CEPH_NOSNAP) {
+			spin_unlock(&temp->d_lock);
+			break;
+		} else {
+			pos -= temp->d_name.len;
+			if (pos < 0) {
+				spin_unlock(&temp->d_lock);
+				break;
+			}
+			strncpy(path + pos, temp->d_name.name,
+				temp->d_name.len);
+		}
+		spin_unlock(&temp->d_lock);
+		if (pos)
+			path[--pos] = '/';
+		temp = temp->d_parent;
+	}
+	rcu_read_unlock();
+	if (pos != 0 || read_seqretry(&rename_lock, seq)) {
+		pr_err("build_path did not end path lookup where "
+		       "expected, namelen is %d, pos is %d\n", len, pos);
+		/* presumably this is only possible if racing with a
+		   rename of one of the parent directories (we can not
+		   lock the dentries above us to prevent this, but
+		   retrying should be harmless) */
+		kfree(path);
+		goto retry;
+	}
+
+	*base = ceph_ino(d_inode(temp));
+	*plen = len;
+	dout("build_path on %p %d built %llx '%.*s'\n",
+	     dentry, d_count(dentry), *base, len, path);
+	return path;
+}
+
+static int build_dentry_path(struct dentry *dentry,
+			     const char **ppath, int *ppathlen, u64 *pino,
+			     int *pfreepath)
+{
+	char *path;
+
+	if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) {
+		*pino = ceph_ino(d_inode(dentry->d_parent));
+		*ppath = dentry->d_name.name;
+		*ppathlen = dentry->d_name.len;
+		return 0;
+	}
+	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	*ppath = path;
+	*pfreepath = 1;
+	return 0;
+}
+
+static int build_inode_path(struct inode *inode,
+			    const char **ppath, int *ppathlen, u64 *pino,
+			    int *pfreepath)
+{
+	struct dentry *dentry;
+	char *path;
+
+	if (ceph_snap(inode) == CEPH_NOSNAP) {
+		*pino = ceph_ino(inode);
+		*ppathlen = 0;
+		return 0;
+	}
+	dentry = d_find_alias(inode);
+	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+	dput(dentry);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	*ppath = path;
+	*pfreepath = 1;
+	return 0;
+}
+
+/*
+ * request arguments may be specified via an inode *, a dentry *, or
+ * an explicit ino+path.
+ */
+static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
+				  const char *rpath, u64 rino,
+				  const char **ppath, int *pathlen,
+				  u64 *ino, int *freepath)
+{
+	int r = 0;
+
+	if (rinode) {
+		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+		     ceph_snap(rinode));
+	} else if (rdentry) {
+		r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
+		     *ppath);
+	} else if (rpath || rino) {
+		*ino = rino;
+		*ppath = rpath;
+		*pathlen = rpath ? strlen(rpath) : 0;
+		dout(" path %.*s\n", *pathlen, rpath);
+	}
+
+	return r;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+					       struct ceph_mds_request *req,
+					       int mds, bool drop_cap_releases)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_request_head *head;
+	const char *path1 = NULL;
+	const char *path2 = NULL;
+	u64 ino1 = 0, ino2 = 0;
+	int pathlen1 = 0, pathlen2 = 0;
+	int freepath1 = 0, freepath2 = 0;
+	int len;
+	u16 releases;
+	void *p, *end;
+	int ret;
+
+	ret = set_request_path_attr(req->r_inode, req->r_dentry,
+			      req->r_path1, req->r_ino1.ino,
+			      &path1, &pathlen1, &ino1, &freepath1);
+	if (ret < 0) {
+		msg = ERR_PTR(ret);
+		goto out;
+	}
+
+	ret = set_request_path_attr(NULL, req->r_old_dentry,
+			      req->r_path2, req->r_ino2.ino,
+			      &path2, &pathlen2, &ino2, &freepath2);
+	if (ret < 0) {
+		msg = ERR_PTR(ret);
+		goto out_free1;
+	}
+
+	len = sizeof(*head) +
+		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+		sizeof(struct timespec);
+
+	/* calculate (max) length for cap releases */
+	len += sizeof(struct ceph_mds_request_release) *
+		(!!req->r_inode_drop + !!req->r_dentry_drop +
+		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+	if (req->r_dentry_drop)
+		len += req->r_dentry->d_name.len;
+	if (req->r_old_dentry_drop)
+		len += req->r_old_dentry->d_name.len;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
+	if (!msg) {
+		msg = ERR_PTR(-ENOMEM);
+		goto out_free2;
+	}
+
+	msg->hdr.version = cpu_to_le16(2);
+	msg->hdr.tid = cpu_to_le64(req->r_tid);
+
+	head = msg->front.iov_base;
+	p = msg->front.iov_base + sizeof(*head);
+	end = msg->front.iov_base + msg->front.iov_len;
+
+	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+	head->op = cpu_to_le32(req->r_op);
+	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
+	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
+	head->args = req->r_args;
+
+	ceph_encode_filepath(&p, end, ino1, path1);
+	ceph_encode_filepath(&p, end, ino2, path2);
+
+	/* make note of release offset, in case we need to replay */
+	req->r_request_release_offset = p - msg->front.iov_base;
+
+	/* cap releases */
+	releases = 0;
+	if (req->r_inode_drop)
+		releases += ceph_encode_inode_release(&p,
+		      req->r_inode ? req->r_inode : d_inode(req->r_dentry),
+		      mds, req->r_inode_drop, req->r_inode_unless, 0);
+	if (req->r_dentry_drop)
+		releases += ceph_encode_dentry_release(&p, req->r_dentry,
+		       mds, req->r_dentry_drop, req->r_dentry_unless);
+	if (req->r_old_dentry_drop)
+		releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
+		       mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+	if (req->r_old_inode_drop)
+		releases += ceph_encode_inode_release(&p,
+		      d_inode(req->r_old_dentry),
+		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
+
+	if (drop_cap_releases) {
+		releases = 0;
+		p = msg->front.iov_base + req->r_request_release_offset;
+	}
+
+	head->num_releases = cpu_to_le16(releases);
+
+	/* time stamp */
+	{
+		struct ceph_timespec ts;
+		ceph_encode_timespec(&ts, &req->r_stamp);
+		ceph_encode_copy(&p, &ts, sizeof(ts));
+	}
+
+	BUG_ON(p > end);
+	msg->front.iov_len = p - msg->front.iov_base;
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+	if (req->r_pagelist) {
+		struct ceph_pagelist *pagelist = req->r_pagelist;
+		atomic_inc(&pagelist->refcnt);
+		ceph_msg_data_add_pagelist(msg, pagelist);
+		msg->hdr.data_len = cpu_to_le32(pagelist->length);
+	} else {
+		msg->hdr.data_len = 0;
+	}
+
+	msg->hdr.data_off = cpu_to_le16(0);
+
+out_free2:
+	if (freepath2)
+		kfree((char *)path2);
+out_free1:
+	if (freepath1)
+		kfree((char *)path1);
+out:
+	return msg;
+}
+
+/*
+ * called under mdsc->mutex if error, under no mutex if
+ * success.
+ */
+static void complete_request(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_request *req)
+{
+	if (req->r_callback)
+		req->r_callback(mdsc, req);
+	else
+		complete_all(&req->r_completion);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static int __prepare_send_request(struct ceph_mds_client *mdsc,
+				  struct ceph_mds_request *req,
+				  int mds, bool drop_cap_releases)
+{
+	struct ceph_mds_request_head *rhead;
+	struct ceph_msg *msg;
+	int flags = 0;
+
+	req->r_attempts++;
+	if (req->r_inode) {
+		struct ceph_cap *cap =
+			ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
+
+		if (cap)
+			req->r_sent_on_mseq = cap->mseq;
+		else
+			req->r_sent_on_mseq = -1;
+	}
+	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+
+	if (req->r_got_unsafe) {
+		void *p;
+		/*
+		 * Replay.  Do not regenerate message (and rebuild
+		 * paths, etc.); just use the original message.
+		 * Rebuilding paths will break for renames because
+		 * d_move mangles the src name.
+		 */
+		msg = req->r_request;
+		rhead = msg->front.iov_base;
+
+		flags = le32_to_cpu(rhead->flags);
+		flags |= CEPH_MDS_FLAG_REPLAY;
+		rhead->flags = cpu_to_le32(flags);
+
+		if (req->r_target_inode)
+			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+
+		rhead->num_retry = req->r_attempts - 1;
+
+		/* remove cap/dentry releases from message */
+		rhead->num_releases = 0;
+
+		/* time stamp */
+		p = msg->front.iov_base + req->r_request_release_offset;
+		{
+			struct ceph_timespec ts;
+			ceph_encode_timespec(&ts, &req->r_stamp);
+			ceph_encode_copy(&p, &ts, sizeof(ts));
+		}
+
+		msg->front.iov_len = p - msg->front.iov_base;
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		return 0;
+	}
+
+	if (req->r_request) {
+		ceph_msg_put(req->r_request);
+		req->r_request = NULL;
+	}
+	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
+	if (IS_ERR(msg)) {
+		req->r_err = PTR_ERR(msg);
+		complete_request(mdsc, req);
+		return PTR_ERR(msg);
+	}
+	req->r_request = msg;
+
+	rhead = msg->front.iov_base;
+	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+	if (req->r_got_unsafe)
+		flags |= CEPH_MDS_FLAG_REPLAY;
+	if (req->r_locked_dir)
+		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
+	rhead->flags = cpu_to_le32(flags);
+	rhead->num_fwd = req->r_num_fwd;
+	rhead->num_retry = req->r_attempts - 1;
+	rhead->ino = 0;
+
+	dout(" r_locked_dir = %p\n", req->r_locked_dir);
+	return 0;
+}
+
+/*
+ * send request, or put it on the appropriate wait list.
+ */
+static int __do_request(struct ceph_mds_client *mdsc,
+			struct ceph_mds_request *req)
+{
+	struct ceph_mds_session *session = NULL;
+	int mds = -1;
+	int err = -EAGAIN;
+
+	if (req->r_err || req->r_got_result) {
+		if (req->r_aborted)
+			__unregister_request(mdsc, req);
+		goto out;
+	}
+
+	if (req->r_timeout &&
+	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
+		dout("do_request timed out\n");
+		err = -EIO;
+		goto finish;
+	}
+
+	put_request_session(req);
+
+	mds = __choose_mds(mdsc, req);
+	if (mds < 0 ||
+	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+		dout("do_request no mds or not active, waiting for map\n");
+		list_add(&req->r_wait, &mdsc->waiting_for_map);
+		goto out;
+	}
+
+	/* get, open session */
+	session = __ceph_lookup_mds_session(mdsc, mds);
+	if (!session) {
+		session = register_session(mdsc, mds);
+		if (IS_ERR(session)) {
+			err = PTR_ERR(session);
+			goto finish;
+		}
+	}
+	req->r_session = get_session(session);
+
+	dout("do_request mds%d session %p state %s\n", mds, session,
+	     ceph_session_state_name(session->s_state));
+	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
+	    session->s_state != CEPH_MDS_SESSION_HUNG) {
+		if (session->s_state == CEPH_MDS_SESSION_NEW ||
+		    session->s_state == CEPH_MDS_SESSION_CLOSING)
+			__open_session(mdsc, session);
+		list_add(&req->r_wait, &session->s_waiting);
+		goto out_session;
+	}
+
+	/* send request */
+	req->r_resend_mds = -1;   /* forget any previous mds hint */
+
+	if (req->r_request_started == 0)   /* note request start time */
+		req->r_request_started = jiffies;
+
+	err = __prepare_send_request(mdsc, req, mds, false);
+	if (!err) {
+		ceph_msg_get(req->r_request);
+		ceph_con_send(&session->s_con, req->r_request);
+	}
+
+out_session:
+	ceph_put_mds_session(session);
+out:
+	return err;
+
+finish:
+	req->r_err = err;
+	complete_request(mdsc, req);
+	goto out;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __wake_requests(struct ceph_mds_client *mdsc,
+			    struct list_head *head)
+{
+	struct ceph_mds_request *req;
+	LIST_HEAD(tmp_list);
+
+	list_splice_init(head, &tmp_list);
+
+	while (!list_empty(&tmp_list)) {
+		req = list_entry(tmp_list.next,
+				 struct ceph_mds_request, r_wait);
+		list_del_init(&req->r_wait);
+		dout(" wake request %p tid %llu\n", req, req->r_tid);
+		__do_request(mdsc, req);
+	}
+}
+
+/*
+ * Wake up threads with requests pending for @mds, so that they can
+ * resubmit their requests to a possibly different mds.
+ */
+static void kick_requests(struct ceph_mds_client *mdsc, int mds)
+{
+	struct ceph_mds_request *req;
+	struct rb_node *p = rb_first(&mdsc->request_tree);
+
+	dout("kick_requests mds%d\n", mds);
+	while (p) {
+		req = rb_entry(p, struct ceph_mds_request, r_node);
+		p = rb_next(p);
+		if (req->r_got_unsafe)
+			continue;
+		if (req->r_attempts > 0)
+			continue; /* only new requests */
+		if (req->r_session &&
+		    req->r_session->s_mds == mds) {
+			dout(" kicking tid %llu\n", req->r_tid);
+			list_del_init(&req->r_wait);
+			__do_request(mdsc, req);
+		}
+	}
+}
+
+void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+			      struct ceph_mds_request *req)
+{
+	dout("submit_request on %p\n", req);
+	mutex_lock(&mdsc->mutex);
+	__register_request(mdsc, req, NULL);
+	__do_request(mdsc, req);
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Synchrously perform an mds request.  Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+			 struct inode *dir,
+			 struct ceph_mds_request *req)
+{
+	int err;
+
+	dout("do_request on %p\n", req);
+
+	/* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+	if (req->r_inode)
+		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+	if (req->r_locked_dir)
+		ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+	if (req->r_old_dentry_dir)
+		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
+				  CEPH_CAP_PIN);
+
+	/* issue */
+	mutex_lock(&mdsc->mutex);
+	__register_request(mdsc, req, dir);
+	__do_request(mdsc, req);
+
+	if (req->r_err) {
+		err = req->r_err;
+		__unregister_request(mdsc, req);
+		dout("do_request early error %d\n", err);
+		goto out;
+	}
+
+	/* wait */
+	mutex_unlock(&mdsc->mutex);
+	dout("do_request waiting\n");
+	if (req->r_timeout) {
+		err = (long)wait_for_completion_killable_timeout(
+			&req->r_completion, req->r_timeout);
+		if (err == 0)
+			err = -EIO;
+	} else if (req->r_wait_for_completion) {
+		err = req->r_wait_for_completion(mdsc, req);
+	} else {
+		err = wait_for_completion_killable(&req->r_completion);
+	}
+	dout("do_request waited, got %d\n", err);
+	mutex_lock(&mdsc->mutex);
+
+	/* only abort if we didn't race with a real reply */
+	if (req->r_got_result) {
+		err = le32_to_cpu(req->r_reply_info.head->result);
+	} else if (err < 0) {
+		dout("aborted request %lld with %d\n", req->r_tid, err);
+
+		/*
+		 * ensure we aren't running concurrently with
+		 * ceph_fill_trace or ceph_readdir_prepopulate, which
+		 * rely on locks (dir mutex) held by our caller.
+		 */
+		mutex_lock(&req->r_fill_mutex);
+		req->r_err = err;
+		req->r_aborted = true;
+		mutex_unlock(&req->r_fill_mutex);
+
+		if (req->r_locked_dir &&
+		    (req->r_op & CEPH_MDS_OP_WRITE))
+			ceph_invalidate_dir_request(req);
+	} else {
+		err = req->r_err;
+	}
+
+out:
+	mutex_unlock(&mdsc->mutex);
+	dout("do_request %p done, result %d\n", req, err);
+	return err;
+}
+
+/*
+ * Invalidate dir's completeness, dentry lease state on an aborted MDS
+ * namespace request.
+ */
+void ceph_invalidate_dir_request(struct ceph_mds_request *req)
+{
+	struct inode *inode = req->r_locked_dir;
+
+	dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
+
+	ceph_dir_clear_complete(inode);
+	if (req->r_dentry)
+		ceph_invalidate_dentry_lease(req->r_dentry);
+	if (req->r_old_dentry)
+		ceph_invalidate_dentry_lease(req->r_old_dentry);
+}
+
+/*
+ * Handle mds reply.
+ *
+ * We take the session mutex and parse and process the reply immediately.
+ * This preserves the logical ordering of replies, capabilities, etc., sent
+ * by the MDS as they are applied to our local cache.
+ */
+static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_mds_reply_head *head = msg->front.iov_base;
+	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
+	struct ceph_snap_realm *realm;
+	u64 tid;
+	int err, result;
+	int mds = session->s_mds;
+
+	if (msg->front.iov_len < sizeof(*head)) {
+		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+		ceph_msg_dump(msg);
+		return;
+	}
+
+	/* get request, session */
+	tid = le64_to_cpu(msg->hdr.tid);
+	mutex_lock(&mdsc->mutex);
+	req = __lookup_request(mdsc, tid);
+	if (!req) {
+		dout("handle_reply on unknown tid %llu\n", tid);
+		mutex_unlock(&mdsc->mutex);
+		return;
+	}
+	dout("handle_reply %p\n", req);
+
+	/* correct session? */
+	if (req->r_session != session) {
+		pr_err("mdsc_handle_reply got %llu on session mds%d"
+		       " not mds%d\n", tid, session->s_mds,
+		       req->r_session ? req->r_session->s_mds : -1);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+
+	/* dup? */
+	if ((req->r_got_unsafe && !head->safe) ||
+	    (req->r_got_safe && head->safe)) {
+		pr_warn("got a dup %s reply on %llu from mds%d\n",
+			   head->safe ? "safe" : "unsafe", tid, mds);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+	if (req->r_got_safe && !head->safe) {
+		pr_warn("got unsafe after safe on %llu from mds%d\n",
+			   tid, mds);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+
+	result = le32_to_cpu(head->result);
+
+	/*
+	 * Handle an ESTALE
+	 * if we're not talking to the authority, send to them
+	 * if the authority has changed while we weren't looking,
+	 * send to new authority
+	 * Otherwise we just have to return an ESTALE
+	 */
+	if (result == -ESTALE) {
+		dout("got ESTALE on request %llu", req->r_tid);
+		req->r_resend_mds = -1;
+		if (req->r_direct_mode != USE_AUTH_MDS) {
+			dout("not using auth, setting for that now");
+			req->r_direct_mode = USE_AUTH_MDS;
+			__do_request(mdsc, req);
+			mutex_unlock(&mdsc->mutex);
+			goto out;
+		} else  {
+			int mds = __choose_mds(mdsc, req);
+			if (mds >= 0 && mds != req->r_session->s_mds) {
+				dout("but auth changed, so resending");
+				__do_request(mdsc, req);
+				mutex_unlock(&mdsc->mutex);
+				goto out;
+			}
+		}
+		dout("have to return ESTALE on request %llu", req->r_tid);
+	}
+
+
+	if (head->safe) {
+		req->r_got_safe = true;
+		__unregister_request(mdsc, req);
+
+		if (req->r_got_unsafe) {
+			/*
+			 * We already handled the unsafe response, now do the
+			 * cleanup.  No need to examine the response; the MDS
+			 * doesn't include any result info in the safe
+			 * response.  And even if it did, there is nothing
+			 * useful we could do with a revised return value.
+			 */
+			dout("got safe reply %llu, mds%d\n", tid, mds);
+			list_del_init(&req->r_unsafe_item);
+
+			/* last unsafe request during umount? */
+			if (mdsc->stopping && !__get_oldest_req(mdsc))
+				complete_all(&mdsc->safe_umount_waiters);
+			mutex_unlock(&mdsc->mutex);
+			goto out;
+		}
+	} else {
+		req->r_got_unsafe = true;
+		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+	}
+
+	dout("handle_reply tid %lld result %d\n", tid, result);
+	rinfo = &req->r_reply_info;
+	err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+	mutex_unlock(&mdsc->mutex);
+
+	mutex_lock(&session->s_mutex);
+	if (err < 0) {
+		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
+		ceph_msg_dump(msg);
+		goto out_err;
+	}
+
+	/* snap trace */
+	realm = NULL;
+	if (rinfo->snapblob_len) {
+		down_write(&mdsc->snap_rwsem);
+		ceph_update_snap_trace(mdsc, rinfo->snapblob,
+				rinfo->snapblob + rinfo->snapblob_len,
+				le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
+				&realm);
+		downgrade_write(&mdsc->snap_rwsem);
+	} else {
+		down_read(&mdsc->snap_rwsem);
+	}
+
+	/* insert trace into our cache */
+	mutex_lock(&req->r_fill_mutex);
+	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
+	if (err == 0) {
+		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+				    req->r_op == CEPH_MDS_OP_LSSNAP))
+			ceph_readdir_prepopulate(req, req->r_session);
+		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
+	}
+	mutex_unlock(&req->r_fill_mutex);
+
+	up_read(&mdsc->snap_rwsem);
+	if (realm)
+		ceph_put_snap_realm(mdsc, realm);
+out_err:
+	mutex_lock(&mdsc->mutex);
+	if (!req->r_aborted) {
+		if (err) {
+			req->r_err = err;
+		} else {
+			req->r_reply = msg;
+			ceph_msg_get(msg);
+			req->r_got_result = true;
+		}
+	} else {
+		dout("reply arrived after request %lld was aborted\n", tid);
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	ceph_add_cap_releases(mdsc, req->r_session);
+	mutex_unlock(&session->s_mutex);
+
+	/* kick calling process */
+	complete_request(mdsc, req);
+out:
+	ceph_mdsc_put_request(req);
+	return;
+}
+
+
+
+/*
+ * handle mds notification that our request has been forwarded.
+ */
+static void handle_forward(struct ceph_mds_client *mdsc,
+			   struct ceph_mds_session *session,
+			   struct ceph_msg *msg)
+{
+	struct ceph_mds_request *req;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+	u32 next_mds;
+	u32 fwd_seq;
+	int err = -EINVAL;
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+
+	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+	next_mds = ceph_decode_32(&p);
+	fwd_seq = ceph_decode_32(&p);
+
+	mutex_lock(&mdsc->mutex);
+	req = __lookup_request(mdsc, tid);
+	if (!req) {
+		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
+		goto out;  /* dup reply? */
+	}
+
+	if (req->r_aborted) {
+		dout("forward tid %llu aborted, unregistering\n", tid);
+		__unregister_request(mdsc, req);
+	} else if (fwd_seq <= req->r_num_fwd) {
+		dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+		     tid, next_mds, req->r_num_fwd, fwd_seq);
+	} else {
+		/* resend. forward race not possible; mds would drop */
+		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+		BUG_ON(req->r_err);
+		BUG_ON(req->r_got_result);
+		req->r_attempts = 0;
+		req->r_num_fwd = fwd_seq;
+		req->r_resend_mds = next_mds;
+		put_request_session(req);
+		__do_request(mdsc, req);
+	}
+	ceph_mdsc_put_request(req);
+out:
+	mutex_unlock(&mdsc->mutex);
+	return;
+
+bad:
+	pr_err("mdsc_handle_forward decode error err=%d\n", err);
+}
+
+/*
+ * handle a mds session control message
+ */
+static void handle_session(struct ceph_mds_session *session,
+			   struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	u32 op;
+	u64 seq;
+	int mds = session->s_mds;
+	struct ceph_mds_session_head *h = msg->front.iov_base;
+	int wake = 0;
+
+	/* decode */
+	if (msg->front.iov_len != sizeof(*h))
+		goto bad;
+	op = le32_to_cpu(h->op);
+	seq = le64_to_cpu(h->seq);
+
+	mutex_lock(&mdsc->mutex);
+	if (op == CEPH_SESSION_CLOSE)
+		__unregister_session(mdsc, session);
+	/* FIXME: this ttl calculation is generous */
+	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
+	mutex_unlock(&mdsc->mutex);
+
+	mutex_lock(&session->s_mutex);
+
+	dout("handle_session mds%d %s %p state %s seq %llu\n",
+	     mds, ceph_session_op_name(op), session,
+	     ceph_session_state_name(session->s_state), seq);
+
+	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
+		session->s_state = CEPH_MDS_SESSION_OPEN;
+		pr_info("mds%d came back\n", session->s_mds);
+	}
+
+	switch (op) {
+	case CEPH_SESSION_OPEN:
+		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+			pr_info("mds%d reconnect success\n", session->s_mds);
+		session->s_state = CEPH_MDS_SESSION_OPEN;
+		renewed_caps(mdsc, session, 0);
+		wake = 1;
+		if (mdsc->stopping)
+			__close_session(mdsc, session);
+		break;
+
+	case CEPH_SESSION_RENEWCAPS:
+		if (session->s_renew_seq == seq)
+			renewed_caps(mdsc, session, 1);
+		break;
+
+	case CEPH_SESSION_CLOSE:
+		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+			pr_info("mds%d reconnect denied\n", session->s_mds);
+		cleanup_session_requests(mdsc, session);
+		remove_session_caps(session);
+		wake = 2; /* for good measure */
+		wake_up_all(&mdsc->session_close_wq);
+		break;
+
+	case CEPH_SESSION_STALE:
+		pr_info("mds%d caps went stale, renewing\n",
+			session->s_mds);
+		spin_lock(&session->s_gen_ttl_lock);
+		session->s_cap_gen++;
+		session->s_cap_ttl = jiffies - 1;
+		spin_unlock(&session->s_gen_ttl_lock);
+		send_renew_caps(mdsc, session);
+		break;
+
+	case CEPH_SESSION_RECALL_STATE:
+		trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+		break;
+
+	case CEPH_SESSION_FLUSHMSG:
+		send_flushmsg_ack(mdsc, session, seq);
+		break;
+
+	case CEPH_SESSION_FORCE_RO:
+		dout("force_session_readonly %p\n", session);
+		spin_lock(&session->s_cap_lock);
+		session->s_readonly = true;
+		spin_unlock(&session->s_cap_lock);
+		wake_up_session_caps(session, 0);
+		break;
+
+	default:
+		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+		WARN_ON(1);
+	}
+
+	mutex_unlock(&session->s_mutex);
+	if (wake) {
+		mutex_lock(&mdsc->mutex);
+		__wake_requests(mdsc, &session->s_waiting);
+		if (wake == 2)
+			kick_requests(mdsc, mds);
+		mutex_unlock(&mdsc->mutex);
+	}
+	return;
+
+bad:
+	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
+	       (int)msg->front.iov_len);
+	ceph_msg_dump(msg);
+	return;
+}
+
+
+/*
+ * called under session->mutex.
+ */
+static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
+{
+	struct ceph_mds_request *req, *nreq;
+	struct rb_node *p;
+	int err;
+
+	dout("replay_unsafe_requests mds%d\n", session->s_mds);
+
+	mutex_lock(&mdsc->mutex);
+	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
+		err = __prepare_send_request(mdsc, req, session->s_mds, true);
+		if (!err) {
+			ceph_msg_get(req->r_request);
+			ceph_con_send(&session->s_con, req->r_request);
+		}
+	}
+
+	/*
+	 * also re-send old requests when MDS enters reconnect stage. So that MDS
+	 * can process completed request in clientreplay stage.
+	 */
+	p = rb_first(&mdsc->request_tree);
+	while (p) {
+		req = rb_entry(p, struct ceph_mds_request, r_node);
+		p = rb_next(p);
+		if (req->r_got_unsafe)
+			continue;
+		if (req->r_attempts == 0)
+			continue; /* only old requests */
+		if (req->r_session &&
+		    req->r_session->s_mds == session->s_mds) {
+			err = __prepare_send_request(mdsc, req,
+						     session->s_mds, true);
+			if (!err) {
+				ceph_msg_get(req->r_request);
+				ceph_con_send(&session->s_con, req->r_request);
+			}
+		}
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Encode information about a cap for a reconnect with the MDS.
+ */
+static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+			  void *arg)
+{
+	union {
+		struct ceph_mds_cap_reconnect v2;
+		struct ceph_mds_cap_reconnect_v1 v1;
+	} rec;
+	size_t reclen;
+	struct ceph_inode_info *ci;
+	struct ceph_reconnect_state *recon_state = arg;
+	struct ceph_pagelist *pagelist = recon_state->pagelist;
+	char *path;
+	int pathlen, err;
+	u64 pathbase;
+	struct dentry *dentry;
+
+	ci = cap->ci;
+
+	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+	     inode, ceph_vinop(inode), cap, cap->cap_id,
+	     ceph_cap_string(cap->issued));
+	err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+	if (err)
+		return err;
+
+	dentry = d_find_alias(inode);
+	if (dentry) {
+		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out_dput;
+		}
+	} else {
+		path = NULL;
+		pathlen = 0;
+	}
+	err = ceph_pagelist_encode_string(pagelist, path, pathlen);
+	if (err)
+		goto out_free;
+
+	spin_lock(&ci->i_ceph_lock);
+	cap->seq = 0;        /* reset cap seq */
+	cap->issue_seq = 0;  /* and issue_seq */
+	cap->mseq = 0;       /* and migrate_seq */
+	cap->cap_gen = cap->session->s_cap_gen;
+
+	if (recon_state->flock) {
+		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
+		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+		rec.v2.issued = cpu_to_le32(cap->issued);
+		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+		rec.v2.pathbase = cpu_to_le64(pathbase);
+		rec.v2.flock_len = 0;
+		reclen = sizeof(rec.v2);
+	} else {
+		rec.v1.cap_id = cpu_to_le64(cap->cap_id);
+		rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+		rec.v1.issued = cpu_to_le32(cap->issued);
+		rec.v1.size = cpu_to_le64(inode->i_size);
+		ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
+		ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
+		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+		rec.v1.pathbase = cpu_to_le64(pathbase);
+		reclen = sizeof(rec.v1);
+	}
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (recon_state->flock) {
+		int num_fcntl_locks, num_flock_locks;
+		struct ceph_filelock *flocks;
+
+encode_again:
+		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+		flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
+				 sizeof(struct ceph_filelock), GFP_NOFS);
+		if (!flocks) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		err = ceph_encode_locks_to_buffer(inode, flocks,
+						  num_fcntl_locks,
+						  num_flock_locks);
+		if (err) {
+			kfree(flocks);
+			if (err == -ENOSPC)
+				goto encode_again;
+			goto out_free;
+		}
+		/*
+		 * number of encoded locks is stable, so copy to pagelist
+		 */
+		rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
+				    (num_fcntl_locks+num_flock_locks) *
+				    sizeof(struct ceph_filelock));
+		err = ceph_pagelist_append(pagelist, &rec, reclen);
+		if (!err)
+			err = ceph_locks_to_pagelist(flocks, pagelist,
+						     num_fcntl_locks,
+						     num_flock_locks);
+		kfree(flocks);
+	} else {
+		err = ceph_pagelist_append(pagelist, &rec, reclen);
+	}
+
+	recon_state->nr_caps++;
+out_free:
+	kfree(path);
+out_dput:
+	dput(dentry);
+	return err;
+}
+
+
+/*
+ * If an MDS fails and recovers, clients need to reconnect in order to
+ * reestablish shared state.  This includes all caps issued through
+ * this session _and_ the snap_realm hierarchy.  Because it's not
+ * clear which snap realms the mds cares about, we send everything we
+ * know about.. that ensures we'll then get any new info the
+ * recovering MDS might have.
+ *
+ * This is a relatively heavyweight operation, but it's rare.
+ *
+ * called with mdsc->mutex held.
+ */
+static void send_mds_reconnect(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_session *session)
+{
+	struct ceph_msg *reply;
+	struct rb_node *p;
+	int mds = session->s_mds;
+	int err = -ENOMEM;
+	int s_nr_caps;
+	struct ceph_pagelist *pagelist;
+	struct ceph_reconnect_state recon_state;
+
+	pr_info("mds%d reconnect start\n", mds);
+
+	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+	if (!pagelist)
+		goto fail_nopagelist;
+	ceph_pagelist_init(pagelist);
+
+	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
+	if (!reply)
+		goto fail_nomsg;
+
+	mutex_lock(&session->s_mutex);
+	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+	session->s_seq = 0;
+
+	dout("session %p state %s\n", session,
+	     ceph_session_state_name(session->s_state));
+
+	spin_lock(&session->s_gen_ttl_lock);
+	session->s_cap_gen++;
+	spin_unlock(&session->s_gen_ttl_lock);
+
+	spin_lock(&session->s_cap_lock);
+	/* don't know if session is readonly */
+	session->s_readonly = 0;
+	/*
+	 * notify __ceph_remove_cap() that we are composing cap reconnect.
+	 * If a cap get released before being added to the cap reconnect,
+	 * __ceph_remove_cap() should skip queuing cap release.
+	 */
+	session->s_cap_reconnect = 1;
+	/* drop old cap expires; we're about to reestablish that state */
+	discard_cap_releases(mdsc, session);
+	spin_unlock(&session->s_cap_lock);
+
+	/* trim unused caps to reduce MDS's cache rejoin time */
+	if (mdsc->fsc->sb->s_root)
+		shrink_dcache_parent(mdsc->fsc->sb->s_root);
+
+	ceph_con_close(&session->s_con);
+	ceph_con_open(&session->s_con,
+		      CEPH_ENTITY_TYPE_MDS, mds,
+		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+	/* replay unsafe requests */
+	replay_unsafe_requests(mdsc, session);
+
+	down_read(&mdsc->snap_rwsem);
+
+	/* traverse this session's caps */
+	s_nr_caps = session->s_nr_caps;
+	err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
+	if (err)
+		goto fail;
+
+	recon_state.nr_caps = 0;
+	recon_state.pagelist = pagelist;
+	recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
+	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
+	if (err < 0)
+		goto fail;
+
+	spin_lock(&session->s_cap_lock);
+	session->s_cap_reconnect = 0;
+	spin_unlock(&session->s_cap_lock);
+
+	/*
+	 * snaprealms.  we provide mds with the ino, seq (version), and
+	 * parent for all of our realms.  If the mds has any newer info,
+	 * it will tell us.
+	 */
+	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+		struct ceph_snap_realm *realm =
+			rb_entry(p, struct ceph_snap_realm, node);
+		struct ceph_mds_snaprealm_reconnect sr_rec;
+
+		dout(" adding snap realm %llx seq %lld parent %llx\n",
+		     realm->ino, realm->seq, realm->parent_ino);
+		sr_rec.ino = cpu_to_le64(realm->ino);
+		sr_rec.seq = cpu_to_le64(realm->seq);
+		sr_rec.parent = cpu_to_le64(realm->parent_ino);
+		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+		if (err)
+			goto fail;
+	}
+
+	if (recon_state.flock)
+		reply->hdr.version = cpu_to_le16(2);
+
+	/* raced with cap release? */
+	if (s_nr_caps != recon_state.nr_caps) {
+		struct page *page = list_first_entry(&pagelist->head,
+						     struct page, lru);
+		__le32 *addr = kmap_atomic(page);
+		*addr = cpu_to_le32(recon_state.nr_caps);
+		kunmap_atomic(addr);
+	}
+
+	reply->hdr.data_len = cpu_to_le32(pagelist->length);
+	ceph_msg_data_add_pagelist(reply, pagelist);
+	ceph_con_send(&session->s_con, reply);
+
+	mutex_unlock(&session->s_mutex);
+
+	mutex_lock(&mdsc->mutex);
+	__wake_requests(mdsc, &session->s_waiting);
+	mutex_unlock(&mdsc->mutex);
+
+	up_read(&mdsc->snap_rwsem);
+	return;
+
+fail:
+	ceph_msg_put(reply);
+	up_read(&mdsc->snap_rwsem);
+	mutex_unlock(&session->s_mutex);
+fail_nomsg:
+	ceph_pagelist_release(pagelist);
+fail_nopagelist:
+	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
+	return;
+}
+
+
+/*
+ * compare old and new mdsmaps, kicking requests
+ * and closing out old connections as necessary
+ *
+ * called under mdsc->mutex.
+ */
+static void check_new_map(struct ceph_mds_client *mdsc,
+			  struct ceph_mdsmap *newmap,
+			  struct ceph_mdsmap *oldmap)
+{
+	int i;
+	int oldstate, newstate;
+	struct ceph_mds_session *s;
+
+	dout("check_new_map new %u old %u\n",
+	     newmap->m_epoch, oldmap->m_epoch);
+
+	for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
+		if (mdsc->sessions[i] == NULL)
+			continue;
+		s = mdsc->sessions[i];
+		oldstate = ceph_mdsmap_get_state(oldmap, i);
+		newstate = ceph_mdsmap_get_state(newmap, i);
+
+		dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
+		     i, ceph_mds_state_name(oldstate),
+		     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
+		     ceph_mds_state_name(newstate),
+		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
+		     ceph_session_state_name(s->s_state));
+
+		if (i >= newmap->m_max_mds ||
+		    memcmp(ceph_mdsmap_get_addr(oldmap, i),
+			   ceph_mdsmap_get_addr(newmap, i),
+			   sizeof(struct ceph_entity_addr))) {
+			if (s->s_state == CEPH_MDS_SESSION_OPENING) {
+				/* the session never opened, just close it
+				 * out now */
+				__wake_requests(mdsc, &s->s_waiting);
+				__unregister_session(mdsc, s);
+			} else {
+				/* just close it */
+				mutex_unlock(&mdsc->mutex);
+				mutex_lock(&s->s_mutex);
+				mutex_lock(&mdsc->mutex);
+				ceph_con_close(&s->s_con);
+				mutex_unlock(&s->s_mutex);
+				s->s_state = CEPH_MDS_SESSION_RESTARTING;
+			}
+		} else if (oldstate == newstate) {
+			continue;  /* nothing new with this mds */
+		}
+
+		/*
+		 * send reconnect?
+		 */
+		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
+		    newstate >= CEPH_MDS_STATE_RECONNECT) {
+			mutex_unlock(&mdsc->mutex);
+			send_mds_reconnect(mdsc, s);
+			mutex_lock(&mdsc->mutex);
+		}
+
+		/*
+		 * kick request on any mds that has gone active.
+		 */
+		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
+		    newstate >= CEPH_MDS_STATE_ACTIVE) {
+			if (oldstate != CEPH_MDS_STATE_CREATING &&
+			    oldstate != CEPH_MDS_STATE_STARTING)
+				pr_info("mds%d recovery completed\n", s->s_mds);
+			kick_requests(mdsc, i);
+			ceph_kick_flushing_caps(mdsc, s);
+			wake_up_session_caps(s, 1);
+		}
+	}
+
+	for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
+		s = mdsc->sessions[i];
+		if (!s)
+			continue;
+		if (!ceph_mdsmap_is_laggy(newmap, i))
+			continue;
+		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+		    s->s_state == CEPH_MDS_SESSION_HUNG ||
+		    s->s_state == CEPH_MDS_SESSION_CLOSING) {
+			dout(" connecting to export targets of laggy mds%d\n",
+			     i);
+			__open_export_target_sessions(mdsc, s);
+		}
+	}
+}
+
+
+
+/*
+ * leases
+ */
+
+/*
+ * caller must hold session s_mutex, dentry->d_lock
+ */
+void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+	ceph_put_mds_session(di->lease_session);
+	di->lease_session = NULL;
+}
+
+static void handle_lease(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session,
+			 struct ceph_msg *msg)
+{
+	struct super_block *sb = mdsc->fsc->sb;
+	struct inode *inode;
+	struct dentry *parent, *dentry;
+	struct ceph_dentry_info *di;
+	int mds = session->s_mds;
+	struct ceph_mds_lease *h = msg->front.iov_base;
+	u32 seq;
+	struct ceph_vino vino;
+	struct qstr dname;
+	int release = 0;
+
+	dout("handle_lease from mds%d\n", mds);
+
+	/* decode */
+	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
+		goto bad;
+	vino.ino = le64_to_cpu(h->ino);
+	vino.snap = CEPH_NOSNAP;
+	seq = le32_to_cpu(h->seq);
+	dname.name = (void *)h + sizeof(*h) + sizeof(u32);
+	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
+	if (dname.len != get_unaligned_le32(h+1))
+		goto bad;
+
+	/* lookup inode */
+	inode = ceph_find_inode(sb, vino);
+	dout("handle_lease %s, ino %llx %p %.*s\n",
+	     ceph_lease_op_name(h->action), vino.ino, inode,
+	     dname.len, dname.name);
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+
+	if (inode == NULL) {
+		dout("handle_lease no inode %llx\n", vino.ino);
+		goto release;
+	}
+
+	/* dentry */
+	parent = d_find_alias(inode);
+	if (!parent) {
+		dout("no parent dentry on inode %p\n", inode);
+		WARN_ON(1);
+		goto release;  /* hrm... */
+	}
+	dname.hash = full_name_hash(dname.name, dname.len);
+	dentry = d_lookup(parent, &dname);
+	dput(parent);
+	if (!dentry)
+		goto release;
+
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	switch (h->action) {
+	case CEPH_MDS_LEASE_REVOKE:
+		if (di->lease_session == session) {
+			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
+				h->seq = cpu_to_le32(di->lease_seq);
+			__ceph_mdsc_drop_dentry_lease(dentry);
+		}
+		release = 1;
+		break;
+
+	case CEPH_MDS_LEASE_RENEW:
+		if (di->lease_session == session &&
+		    di->lease_gen == session->s_cap_gen &&
+		    di->lease_renew_from &&
+		    di->lease_renew_after == 0) {
+			unsigned long duration =
+				msecs_to_jiffies(le32_to_cpu(h->duration_ms));
+
+			di->lease_seq = seq;
+			dentry->d_time = di->lease_renew_from + duration;
+			di->lease_renew_after = di->lease_renew_from +
+				(duration >> 1);
+			di->lease_renew_from = 0;
+		}
+		break;
+	}
+	spin_unlock(&dentry->d_lock);
+	dput(dentry);
+
+	if (!release)
+		goto out;
+
+release:
+	/* let's just reuse the same message */
+	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
+	ceph_msg_get(msg);
+	ceph_con_send(&session->s_con, msg);
+
+out:
+	iput(inode);
+	mutex_unlock(&session->s_mutex);
+	return;
+
+bad:
+	pr_err("corrupt lease message\n");
+	ceph_msg_dump(msg);
+}
+
+void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+			      struct inode *inode,
+			      struct dentry *dentry, char action,
+			      u32 seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_lease *lease;
+	int len = sizeof(*lease) + sizeof(u32);
+	int dnamelen = 0;
+
+	dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
+	     inode, dentry, ceph_lease_op_name(action), session->s_mds);
+	dnamelen = dentry->d_name.len;
+	len += dnamelen;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
+	if (!msg)
+		return;
+	lease = msg->front.iov_base;
+	lease->action = action;
+	lease->ino = cpu_to_le64(ceph_vino(inode).ino);
+	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
+	lease->seq = cpu_to_le32(seq);
+	put_unaligned_le32(dnamelen, lease + 1);
+	memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
+
+	/*
+	 * if this is a preemptive lease RELEASE, no need to
+	 * flush request stream, since the actual request will
+	 * soon follow.
+	 */
+	msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
+
+	ceph_con_send(&session->s_con, msg);
+}
+
+/*
+ * Preemptively release a lease we expect to invalidate anyway.
+ * Pass @inode always, @dentry is optional.
+ */
+void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
+			     struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+	struct ceph_mds_session *session;
+	u32 seq;
+
+	BUG_ON(inode == NULL);
+	BUG_ON(dentry == NULL);
+
+	/* is dentry lease valid? */
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	if (!di || !di->lease_session ||
+	    di->lease_session->s_mds < 0 ||
+	    di->lease_gen != di->lease_session->s_cap_gen ||
+	    !time_before(jiffies, dentry->d_time)) {
+		dout("lease_release inode %p dentry %p -- "
+		     "no lease\n",
+		     inode, dentry);
+		spin_unlock(&dentry->d_lock);
+		return;
+	}
+
+	/* we do have a lease on this dentry; note mds and seq */
+	session = ceph_get_mds_session(di->lease_session);
+	seq = di->lease_seq;
+	__ceph_mdsc_drop_dentry_lease(dentry);
+	spin_unlock(&dentry->d_lock);
+
+	dout("lease_release inode %p dentry %p to mds%d\n",
+	     inode, dentry, session->s_mds);
+	ceph_mdsc_lease_send_msg(session, inode, dentry,
+				 CEPH_MDS_LEASE_RELEASE, seq);
+	ceph_put_mds_session(session);
+}
+
+/*
+ * drop all leases (and dentry refs) in preparation for umount
+ */
+static void drop_leases(struct ceph_mds_client *mdsc)
+{
+	int i;
+
+	dout("drop_leases\n");
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		mutex_unlock(&mdsc->mutex);
+		mutex_lock(&s->s_mutex);
+		mutex_unlock(&s->s_mutex);
+		ceph_put_mds_session(s);
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
+
+
+/*
+ * delayed work -- periodically trim expired leases, renew caps with mds
+ */
+static void schedule_delayed(struct ceph_mds_client *mdsc)
+{
+	int delay = 5;
+	unsigned hz = round_jiffies_relative(HZ * delay);
+	schedule_delayed_work(&mdsc->delayed_work, hz);
+}
+
+static void delayed_work(struct work_struct *work)
+{
+	int i;
+	struct ceph_mds_client *mdsc =
+		container_of(work, struct ceph_mds_client, delayed_work.work);
+	int renew_interval;
+	int renew_caps;
+
+	dout("mdsc delayed_work\n");
+	ceph_check_delayed_caps(mdsc);
+
+	mutex_lock(&mdsc->mutex);
+	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
+	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
+				   mdsc->last_renew_caps);
+	if (renew_caps)
+		mdsc->last_renew_caps = jiffies;
+
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+		if (s == NULL)
+			continue;
+		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+			dout("resending session close request for mds%d\n",
+			     s->s_mds);
+			request_close_session(mdsc, s);
+			ceph_put_mds_session(s);
+			continue;
+		}
+		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+			if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+				s->s_state = CEPH_MDS_SESSION_HUNG;
+				pr_info("mds%d hung\n", s->s_mds);
+			}
+		}
+		if (s->s_state < CEPH_MDS_SESSION_OPEN) {
+			/* this mds is failed or recovering, just wait */
+			ceph_put_mds_session(s);
+			continue;
+		}
+		mutex_unlock(&mdsc->mutex);
+
+		mutex_lock(&s->s_mutex);
+		if (renew_caps)
+			send_renew_caps(mdsc, s);
+		else
+			ceph_con_keepalive(&s->s_con);
+		ceph_add_cap_releases(mdsc, s);
+		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+		    s->s_state == CEPH_MDS_SESSION_HUNG)
+			ceph_send_cap_releases(mdsc, s);
+		mutex_unlock(&s->s_mutex);
+		ceph_put_mds_session(s);
+
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	schedule_delayed(mdsc);
+}
+
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
+
+{
+	struct ceph_mds_client *mdsc;
+
+	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+	if (!mdsc)
+		return -ENOMEM;
+	mdsc->fsc = fsc;
+	fsc->mdsc = mdsc;
+	mutex_init(&mdsc->mutex);
+	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+	if (mdsc->mdsmap == NULL) {
+		kfree(mdsc);
+		return -ENOMEM;
+	}
+
+	init_completion(&mdsc->safe_umount_waiters);
+	init_waitqueue_head(&mdsc->session_close_wq);
+	INIT_LIST_HEAD(&mdsc->waiting_for_map);
+	mdsc->sessions = NULL;
+	atomic_set(&mdsc->num_sessions, 0);
+	mdsc->max_sessions = 0;
+	mdsc->stopping = 0;
+	init_rwsem(&mdsc->snap_rwsem);
+	mdsc->snap_realms = RB_ROOT;
+	INIT_LIST_HEAD(&mdsc->snap_empty);
+	spin_lock_init(&mdsc->snap_empty_lock);
+	mdsc->last_tid = 0;
+	mdsc->request_tree = RB_ROOT;
+	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
+	mdsc->last_renew_caps = jiffies;
+	INIT_LIST_HEAD(&mdsc->cap_delay_list);
+	spin_lock_init(&mdsc->cap_delay_lock);
+	INIT_LIST_HEAD(&mdsc->snap_flush_list);
+	spin_lock_init(&mdsc->snap_flush_lock);
+	mdsc->cap_flush_seq = 0;
+	INIT_LIST_HEAD(&mdsc->cap_dirty);
+	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
+	mdsc->num_cap_flushing = 0;
+	spin_lock_init(&mdsc->cap_dirty_lock);
+	init_waitqueue_head(&mdsc->cap_flushing_wq);
+	spin_lock_init(&mdsc->dentry_lru_lock);
+	INIT_LIST_HEAD(&mdsc->dentry_lru);
+
+	ceph_caps_init(mdsc);
+	ceph_adjust_min_caps(mdsc, fsc->min_caps);
+
+	return 0;
+}
+
+/*
+ * Wait for safe replies on open mds requests.  If we time out, drop
+ * all requests from the tree to avoid dangling dentry refs.
+ */
+static void wait_requests(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_request *req;
+	struct ceph_fs_client *fsc = mdsc->fsc;
+
+	mutex_lock(&mdsc->mutex);
+	if (__get_oldest_req(mdsc)) {
+		mutex_unlock(&mdsc->mutex);
+
+		dout("wait_requests waiting for requests\n");
+		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
+				    fsc->client->options->mount_timeout * HZ);
+
+		/* tear down remaining requests */
+		mutex_lock(&mdsc->mutex);
+		while ((req = __get_oldest_req(mdsc))) {
+			dout("wait_requests timed out on tid %llu\n",
+			     req->r_tid);
+			__unregister_request(mdsc, req);
+		}
+	}
+	mutex_unlock(&mdsc->mutex);
+	dout("wait_requests done\n");
+}
+
+/*
+ * called before mount is ro, and before dentries are torn down.
+ * (hmm, does this still race with new lookups?)
+ */
+void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
+{
+	dout("pre_umount\n");
+	mdsc->stopping = 1;
+
+	drop_leases(mdsc);
+	ceph_flush_dirty_caps(mdsc);
+	wait_requests(mdsc);
+
+	/*
+	 * wait for reply handlers to drop their request refs and
+	 * their inode/dcache refs
+	 */
+	ceph_msgr_flush();
+}
+
+/*
+ * wait for all write mds requests to flush.
+ */
+static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+	struct ceph_mds_request *req = NULL, *nextreq;
+	struct rb_node *n;
+
+	mutex_lock(&mdsc->mutex);
+	dout("wait_unsafe_requests want %lld\n", want_tid);
+restart:
+	req = __get_oldest_req(mdsc);
+	while (req && req->r_tid <= want_tid) {
+		/* find next request */
+		n = rb_next(&req->r_node);
+		if (n)
+			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
+		else
+			nextreq = NULL;
+		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+			/* write op */
+			ceph_mdsc_get_request(req);
+			if (nextreq)
+				ceph_mdsc_get_request(nextreq);
+			mutex_unlock(&mdsc->mutex);
+			dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
+			     req->r_tid, want_tid);
+			wait_for_completion(&req->r_safe_completion);
+			mutex_lock(&mdsc->mutex);
+			ceph_mdsc_put_request(req);
+			if (!nextreq)
+				break;  /* next dne before, so we're done! */
+			if (RB_EMPTY_NODE(&nextreq->r_node)) {
+				/* next request was removed from tree */
+				ceph_mdsc_put_request(nextreq);
+				goto restart;
+			}
+			ceph_mdsc_put_request(nextreq);  /* won't go away */
+		}
+		req = nextreq;
+	}
+	mutex_unlock(&mdsc->mutex);
+	dout("wait_unsafe_requests done\n");
+}
+
+void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+{
+	u64 want_tid, want_flush;
+
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+		return;
+
+	dout("sync\n");
+	mutex_lock(&mdsc->mutex);
+	want_tid = mdsc->last_tid;
+	mutex_unlock(&mdsc->mutex);
+
+	ceph_flush_dirty_caps(mdsc);
+	spin_lock(&mdsc->cap_dirty_lock);
+	want_flush = mdsc->cap_flush_seq;
+	spin_unlock(&mdsc->cap_dirty_lock);
+
+	dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+
+	wait_unsafe_requests(mdsc, want_tid);
+	wait_caps_flush(mdsc, want_flush);
+}
+
+/*
+ * true if all sessions are closed, or we force unmount
+ */
+static bool done_closing_sessions(struct ceph_mds_client *mdsc)
+{
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+		return true;
+	return atomic_read(&mdsc->num_sessions) == 0;
+}
+
+/*
+ * called after sb is ro.
+ */
+void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_session *session;
+	int i;
+	struct ceph_fs_client *fsc = mdsc->fsc;
+	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
+
+	dout("close_sessions\n");
+
+	/* close sessions */
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		session = __ceph_lookup_mds_session(mdsc, i);
+		if (!session)
+			continue;
+		mutex_unlock(&mdsc->mutex);
+		mutex_lock(&session->s_mutex);
+		__close_session(mdsc, session);
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	dout("waiting for sessions to close\n");
+	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+			   timeout);
+
+	/* tear down remaining sessions */
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		if (mdsc->sessions[i]) {
+			session = get_session(mdsc->sessions[i]);
+			__unregister_session(mdsc, session);
+			mutex_unlock(&mdsc->mutex);
+			mutex_lock(&session->s_mutex);
+			remove_session_caps(session);
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			mutex_lock(&mdsc->mutex);
+		}
+	}
+	WARN_ON(!list_empty(&mdsc->cap_delay_list));
+	mutex_unlock(&mdsc->mutex);
+
+	ceph_cleanup_empty_realms(mdsc);
+
+	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+
+	dout("stopped\n");
+}
+
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+{
+	dout("stop\n");
+	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	if (mdsc->mdsmap)
+		ceph_mdsmap_destroy(mdsc->mdsmap);
+	kfree(mdsc->sessions);
+	ceph_caps_finalize(mdsc);
+}
+
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	dout("mdsc_destroy %p\n", mdsc);
+	ceph_mdsc_stop(mdsc);
+
+	/* flush out any connection work with references to us */
+	ceph_msgr_flush();
+
+	fsc->mdsc = NULL;
+	kfree(mdsc);
+	dout("mdsc_destroy %p done\n", mdsc);
+}
+
+
+/*
+ * handle mds map update.
+ */
+void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+	u32 epoch;
+	u32 maplen;
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+	struct ceph_mdsmap *newmap, *oldmap;
+	struct ceph_fsid fsid;
+	int err = -EINVAL;
+
+	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
+		return;
+	epoch = ceph_decode_32(&p);
+	maplen = ceph_decode_32(&p);
+	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+
+	/* do we need it? */
+	ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
+	mutex_lock(&mdsc->mutex);
+	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
+		dout("handle_map epoch %u <= our %u\n",
+		     epoch, mdsc->mdsmap->m_epoch);
+		mutex_unlock(&mdsc->mutex);
+		return;
+	}
+
+	newmap = ceph_mdsmap_decode(&p, end);
+	if (IS_ERR(newmap)) {
+		err = PTR_ERR(newmap);
+		goto bad_unlock;
+	}
+
+	/* swap into place */
+	if (mdsc->mdsmap) {
+		oldmap = mdsc->mdsmap;
+		mdsc->mdsmap = newmap;
+		check_new_map(mdsc, newmap, oldmap);
+		ceph_mdsmap_destroy(oldmap);
+	} else {
+		mdsc->mdsmap = newmap;  /* first mds map */
+	}
+	mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+
+	__wake_requests(mdsc, &mdsc->waiting_for_map);
+
+	mutex_unlock(&mdsc->mutex);
+	schedule_delayed(mdsc);
+	return;
+
+bad_unlock:
+	mutex_unlock(&mdsc->mutex);
+bad:
+	pr_err("error decoding mdsmap %d\n", err);
+	return;
+}
+
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+
+	if (get_session(s)) {
+		dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
+		return con;
+	}
+	dout("mdsc con_get %p FAIL\n", s);
+	return NULL;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+
+	dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
+	ceph_put_mds_session(s);
+}
+
+/*
+ * if the client is unresponsive for long enough, the mds will kill
+ * the session entirely.
+ */
+static void peer_reset(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+
+	pr_warn("mds%d closed our session\n", s->s_mds);
+	send_mds_reconnect(mdsc, s);
+}
+
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	mutex_lock(&mdsc->mutex);
+	if (__verify_registered_session(mdsc, s) < 0) {
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	switch (type) {
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(mdsc, msg);
+		break;
+	case CEPH_MSG_CLIENT_SESSION:
+		handle_session(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_REPLY:
+		handle_reply(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+		handle_forward(mdsc, s, msg);
+		break;
+	case CEPH_MSG_CLIENT_CAPS:
+		ceph_handle_caps(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_SNAP:
+		ceph_handle_snap(mdsc, s, msg);
+		break;
+	case CEPH_MSG_CLIENT_LEASE:
+		handle_lease(mdsc, s, msg);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+out:
+	ceph_msg_put(msg);
+}
+
+/*
+ * authentication
+ */
+
+/*
+ * Note: returned pointer is the address of a structure that's
+ * managed separately.  Caller must *not* attempt to free it.
+ */
+static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
+					int *proto, int force_new)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &s->s_auth;
+
+	if (force_new && auth->authorizer) {
+		ceph_auth_destroy_authorizer(ac, auth->authorizer);
+		auth->authorizer = NULL;
+	}
+	if (!auth->authorizer) {
+		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+						      auth);
+		if (ret)
+			return ERR_PTR(ret);
+	} else {
+		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+						      auth);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+	*proto = ac->protocol;
+
+	return auth;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+
+	return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+
+	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
+
+	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
+}
+
+static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr, int *skip)
+{
+	struct ceph_msg *msg;
+	int type = (int) le16_to_cpu(hdr->type);
+	int front_len = (int) le32_to_cpu(hdr->front_len);
+
+	if (con->in_msg)
+		return con->in_msg;
+
+	*skip = 0;
+	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
+	if (!msg) {
+		pr_err("unable to allocate msg type %d len %d\n",
+		       type, front_len);
+		return NULL;
+	}
+
+	return msg;
+}
+
+static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+       struct ceph_mds_session *s = con->private;
+       struct ceph_auth_handshake *auth = &s->s_auth;
+       return ceph_auth_sign_message(auth, msg);
+}
+
+static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+{
+       struct ceph_mds_session *s = con->private;
+       struct ceph_auth_handshake *auth = &s->s_auth;
+       return ceph_auth_check_message_signature(auth, msg);
+}
+
+static const struct ceph_connection_operations mds_con_ops = {
+	.get = con_get,
+	.put = con_put,
+	.dispatch = dispatch,
+	.get_authorizer = get_authorizer,
+	.verify_authorizer_reply = verify_authorizer_reply,
+	.invalidate_authorizer = invalidate_authorizer,
+	.peer_reset = peer_reset,
+	.alloc_msg = mds_alloc_msg,
+	.sign_message = sign_message,
+	.check_message_signature = check_message_signature,
+};
+
+/* eof */
diff --git a/kernel/fs/ceph/mds_client.h b/kernel/fs/ceph/mds_client.h
new file mode 100644
index 000000000..1875b5d98
--- /dev/null
+++ b/kernel/fs/ceph/mds_client.h
@@ -0,0 +1,406 @@
+#ifndef _FS_CEPH_MDS_CLIENT_H
+#define _FS_CEPH_MDS_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * Some lock dependencies:
+ *
+ * session->s_mutex
+ *         mdsc->mutex
+ *
+ *         mdsc->snap_rwsem
+ *
+ *         ci->i_ceph_lock
+ *                 mdsc->snap_flush_lock
+ *                 mdsc->cap_delay_lock
+ *
+ */
+
+struct ceph_fs_client;
+struct ceph_cap;
+
+/*
+ * parsed info about a single inode.  pointers are into the encoded
+ * on-wire structures within the mds reply message payload.
+ */
+struct ceph_mds_reply_info_in {
+	struct ceph_mds_reply_inode *in;
+	struct ceph_dir_layout dir_layout;
+	u32 symlink_len;
+	char *symlink;
+	u32 xattr_len;
+	char *xattr_data;
+	u64 inline_version;
+	u32 inline_len;
+	char *inline_data;
+};
+
+/*
+ * parsed info about an mds reply, including information about
+ * either: 1) the target inode and/or its parent directory and dentry,
+ * and directory contents (for readdir results), or
+ * 2) the file range lock info (for fcntl F_GETLK results).
+ */
+struct ceph_mds_reply_info_parsed {
+	struct ceph_mds_reply_head    *head;
+
+	/* trace */
+	struct ceph_mds_reply_info_in diri, targeti;
+	struct ceph_mds_reply_dirfrag *dirfrag;
+	char                          *dname;
+	u32                           dname_len;
+	struct ceph_mds_reply_lease   *dlease;
+
+	/* extra */
+	union {
+		/* for fcntl F_GETLK results */
+		struct ceph_filelock *filelock_reply;
+
+		/* for readdir results */
+		struct {
+			struct ceph_mds_reply_dirfrag *dir_dir;
+			size_t			      dir_buf_size;
+			int                           dir_nr;
+			char                          **dir_dname;
+			u32                           *dir_dname_len;
+			struct ceph_mds_reply_lease   **dir_dlease;
+			struct ceph_mds_reply_info_in *dir_in;
+			u8                            dir_complete, dir_end;
+		};
+
+		/* for create results */
+		struct {
+			bool has_create_ino;
+			u64 ino;
+		};
+	};
+
+	/* encoded blob describing snapshot contexts for certain
+	   operations (e.g., open) */
+	void *snapblob;
+	int snapblob_len;
+};
+
+
+/*
+ * cap releases are batched and sent to the MDS en masse.
+ */
+#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE -			\
+				sizeof(struct ceph_mds_cap_release)) /	\
+			       sizeof(struct ceph_mds_cap_item))
+
+
+/*
+ * state associated with each MDS<->client session
+ */
+enum {
+	CEPH_MDS_SESSION_NEW = 1,
+	CEPH_MDS_SESSION_OPENING = 2,
+	CEPH_MDS_SESSION_OPEN = 3,
+	CEPH_MDS_SESSION_HUNG = 4,
+	CEPH_MDS_SESSION_CLOSING = 5,
+	CEPH_MDS_SESSION_RESTARTING = 6,
+	CEPH_MDS_SESSION_RECONNECTING = 7,
+};
+
+struct ceph_mds_session {
+	struct ceph_mds_client *s_mdsc;
+	int               s_mds;
+	int               s_state;
+	unsigned long     s_ttl;      /* time until mds kills us */
+	u64               s_seq;      /* incoming msg seq # */
+	struct mutex      s_mutex;    /* serialize session messages */
+
+	struct ceph_connection s_con;
+
+	struct ceph_auth_handshake s_auth;
+
+	/* protected by s_gen_ttl_lock */
+	spinlock_t        s_gen_ttl_lock;
+	u32               s_cap_gen;  /* inc each time we get mds stale msg */
+	unsigned long     s_cap_ttl;  /* when session caps expire */
+
+	/* protected by s_cap_lock */
+	spinlock_t        s_cap_lock;
+	struct list_head  s_caps;     /* all caps issued by this session */
+	int               s_nr_caps, s_trim_caps;
+	int               s_num_cap_releases;
+	int		  s_cap_reconnect;
+	int		  s_readonly;
+	struct list_head  s_cap_releases; /* waiting cap_release messages */
+	struct list_head  s_cap_releases_done; /* ready to send */
+	struct ceph_cap  *s_cap_iterator;
+
+	/* protected by mutex */
+	struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
+	struct list_head  s_cap_snaps_flushing;
+	unsigned long     s_renew_requested; /* last time we sent a renew req */
+	u64               s_renew_seq;
+
+	atomic_t          s_ref;
+	struct list_head  s_waiting;  /* waiting requests */
+	struct list_head  s_unsafe;   /* unsafe requests */
+};
+
+/*
+ * modes of choosing which MDS to send a request to
+ */
+enum {
+	USE_ANY_MDS,
+	USE_RANDOM_MDS,
+	USE_AUTH_MDS,   /* prefer authoritative mds for this metadata item */
+};
+
+struct ceph_mds_request;
+struct ceph_mds_client;
+
+/*
+ * request completion callback
+ */
+typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
+					     struct ceph_mds_request *req);
+/*
+ * wait for request completion callback
+ */
+typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
+						 struct ceph_mds_request *req);
+
+/*
+ * an in-flight mds request
+ */
+struct ceph_mds_request {
+	u64 r_tid;                   /* transaction id */
+	struct rb_node r_node;
+	struct ceph_mds_client *r_mdsc;
+
+	int r_op;                    /* mds op code */
+
+	/* operation on what? */
+	struct inode *r_inode;              /* arg1 */
+	struct dentry *r_dentry;            /* arg1 */
+	struct dentry *r_old_dentry;        /* arg2: rename from or link from */
+	struct inode *r_old_dentry_dir;     /* arg2: old dentry's parent dir */
+	char *r_path1, *r_path2;
+	struct ceph_vino r_ino1, r_ino2;
+
+	struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+	struct inode *r_target_inode;       /* resulting inode */
+
+	struct mutex r_fill_mutex;
+
+	union ceph_mds_request_args r_args;
+	int r_fmode;        /* file mode, if expecting cap */
+	kuid_t r_uid;
+	kgid_t r_gid;
+	struct timespec r_stamp;
+
+	/* for choosing which mds to send this request to */
+	int r_direct_mode;
+	u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
+	bool r_direct_is_hash;  /* true if r_direct_hash is valid */
+
+	/* data payload is used for xattr ops */
+	struct ceph_pagelist *r_pagelist;
+
+	/* what caps shall we drop? */
+	int r_inode_drop, r_inode_unless;
+	int r_dentry_drop, r_dentry_unless;
+	int r_old_dentry_drop, r_old_dentry_unless;
+	struct inode *r_old_inode;
+	int r_old_inode_drop, r_old_inode_unless;
+
+	struct ceph_msg  *r_request;  /* original request */
+	int r_request_release_offset;
+	struct ceph_msg  *r_reply;
+	struct ceph_mds_reply_info_parsed r_reply_info;
+	struct page *r_locked_page;
+	int r_err;
+	bool r_aborted;
+
+	unsigned long r_timeout;  /* optional.  jiffies */
+	unsigned long r_started;  /* start time to measure timeout against */
+	unsigned long r_request_started; /* start time for mds request only,
+					    used to measure lease durations */
+
+	/* link unsafe requests to parent directory, for fsync */
+	struct inode	*r_unsafe_dir;
+	struct list_head r_unsafe_dir_item;
+
+	struct ceph_mds_session *r_session;
+
+	int               r_attempts;   /* resend attempts */
+	int               r_num_fwd;    /* number of forward attempts */
+	int               r_resend_mds; /* mds to resend to next, if any*/
+	u32               r_sent_on_mseq; /* cap mseq request was sent at*/
+
+	struct kref       r_kref;
+	struct list_head  r_wait;
+	struct completion r_completion;
+	struct completion r_safe_completion;
+	ceph_mds_request_callback_t r_callback;
+	ceph_mds_request_wait_callback_t r_wait_for_completion;
+	struct list_head  r_unsafe_item;  /* per-session unsafe list item */
+	bool		  r_got_unsafe, r_got_safe, r_got_result;
+
+	bool              r_did_prepopulate;
+	u32               r_readdir_offset;
+
+	struct ceph_cap_reservation r_caps_reservation;
+	int r_num_caps;
+};
+
+/*
+ * mds client state
+ */
+struct ceph_mds_client {
+	struct ceph_fs_client  *fsc;
+	struct mutex            mutex;         /* all nested structures */
+
+	struct ceph_mdsmap      *mdsmap;
+	struct completion       safe_umount_waiters;
+	wait_queue_head_t       session_close_wq;
+	struct list_head        waiting_for_map;
+
+	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
+	atomic_t		num_sessions;
+	int                     max_sessions;  /* len of s_mds_sessions */
+	int                     stopping;      /* true if shutting down */
+
+	/*
+	 * snap_rwsem will cover cap linkage into snaprealms, and
+	 * realm snap contexts.  (later, we can do per-realm snap
+	 * contexts locks..)  the empty list contains realms with no
+	 * references (implying they contain no inodes with caps) that
+	 * should be destroyed.
+	 */
+	struct rw_semaphore     snap_rwsem;
+	struct rb_root          snap_realms;
+	struct list_head        snap_empty;
+	spinlock_t              snap_empty_lock;  /* protect snap_empty */
+
+	u64                    last_tid;      /* most recent mds request */
+	struct rb_root         request_tree;  /* pending mds requests */
+	struct delayed_work    delayed_work;  /* delayed work */
+	unsigned long    last_renew_caps;  /* last time we renewed our caps */
+	struct list_head cap_delay_list;   /* caps with delayed release */
+	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
+	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
+	spinlock_t       snap_flush_lock;
+
+	u64               cap_flush_seq;
+	struct list_head  cap_dirty;        /* inodes with dirty caps */
+	struct list_head  cap_dirty_migrating; /* ...that are migration... */
+	int               num_cap_flushing; /* # caps we are flushing */
+	spinlock_t        cap_dirty_lock;   /* protects above items */
+	wait_queue_head_t cap_flushing_wq;
+
+	/*
+	 * Cap reservations
+	 *
+	 * Maintain a global pool of preallocated struct ceph_caps, referenced
+	 * by struct ceph_caps_reservations.  This ensures that we preallocate
+	 * memory needed to successfully process an MDS response.  (If an MDS
+	 * sends us cap information and we fail to process it, we will have
+	 * problems due to the client and MDS being out of sync.)
+	 *
+	 * Reservations are 'owned' by a ceph_cap_reservation context.
+	 */
+	spinlock_t	caps_list_lock;
+	struct		list_head caps_list; /* unused (reserved or
+						unreserved) */
+	int		caps_total_count;    /* total caps allocated */
+	int		caps_use_count;      /* in use */
+	int		caps_reserve_count;  /* unused, reserved */
+	int		caps_avail_count;    /* unused, unreserved */
+	int		caps_min_count;      /* keep at least this many
+						(unreserved) */
+	spinlock_t	  dentry_lru_lock;
+	struct list_head  dentry_lru;
+	int		  num_dentry;
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+extern struct ceph_mds_session *
+__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
+
+static inline struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s)
+{
+	atomic_inc(&s->s_ref);
+	return s;
+}
+
+extern const char *ceph_session_state_name(int s);
+
+extern void ceph_put_mds_session(struct ceph_mds_session *s);
+
+extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
+			     struct ceph_msg *msg, int mds);
+
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
+extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
+
+extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
+				    struct inode *inode,
+				    struct dentry *dn);
+
+extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
+extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+					   struct inode *dir);
+extern struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
+extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+				     struct ceph_mds_request *req);
+extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+				struct inode *dir,
+				struct ceph_mds_request *req);
+static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
+{
+	kref_get(&req->r_kref);
+}
+extern void ceph_mdsc_release_request(struct kref *kref);
+static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
+{
+	kref_put(&req->r_kref, ceph_mdsc_release_request);
+}
+
+extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session);
+extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session);
+
+extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
+
+extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+				  int stop_on_nosnap);
+
+extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
+extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+				     struct inode *inode,
+				     struct dentry *dentry, char action,
+				     u32 seq);
+
+extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
+				 struct ceph_msg *msg);
+
+extern struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
+extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+					  struct ceph_mds_session *session);
+
+#endif
diff --git a/kernel/fs/ceph/mdsmap.c b/kernel/fs/ceph/mdsmap.c
new file mode 100644
index 000000000..261531e55
--- /dev/null
+++ b/kernel/fs/ceph/mdsmap.c
@@ -0,0 +1,189 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+
+#include "super.h"
+
+
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+	int n = 0;
+	int i;
+
+	/* special case for one mds */
+	if (1 == m->m_max_mds && m->m_info[0].state > 0)
+		return 0;
+
+	/* count */
+	for (i = 0; i < m->m_max_mds; i++)
+		if (m->m_info[i].state > 0)
+			n++;
+	if (n == 0)
+		return -1;
+
+	/* pick */
+	n = prandom_u32() % n;
+	i = 0;
+	for (i = 0; n > 0; i++, n--)
+		while (m->m_info[i].state <= 0)
+			i++;
+
+	return i;
+}
+
+/*
+ * Decode an MDS map
+ *
+ * Ignore any fields we don't care about (there are quite a few of
+ * them).
+ */
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+{
+	struct ceph_mdsmap *m;
+	const void *start = *p;
+	int i, j, n;
+	int err = -EINVAL;
+	u16 version;
+
+	m = kzalloc(sizeof(*m), GFP_NOFS);
+	if (m == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > 3) {
+		pr_warn("got mdsmap version %d > 3, failing", version);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
+	m->m_epoch = ceph_decode_32(p);
+	m->m_client_epoch = ceph_decode_32(p);
+	m->m_last_failure = ceph_decode_32(p);
+	m->m_root = ceph_decode_32(p);
+	m->m_session_timeout = ceph_decode_32(p);
+	m->m_session_autoclose = ceph_decode_32(p);
+	m->m_max_file_size = ceph_decode_64(p);
+	m->m_max_mds = ceph_decode_32(p);
+
+	m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
+	if (m->m_info == NULL)
+		goto badmem;
+
+	/* pick out active nodes from mds_info (state > 0) */
+	n = ceph_decode_32(p);
+	for (i = 0; i < n; i++) {
+		u64 global_id;
+		u32 namelen;
+		s32 mds, inc, state;
+		u64 state_seq;
+		u8 infoversion;
+		struct ceph_entity_addr addr;
+		u32 num_export_targets;
+		void *pexport_targets = NULL;
+		struct ceph_timespec laggy_since;
+		struct ceph_mds_info *info;
+
+		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+		global_id = ceph_decode_64(p);
+		infoversion = ceph_decode_8(p);
+		*p += sizeof(u64);
+		namelen = ceph_decode_32(p);  /* skip mds name */
+		*p += namelen;
+
+		ceph_decode_need(p, end,
+				 4*sizeof(u32) + sizeof(u64) +
+				 sizeof(addr) + sizeof(struct ceph_timespec),
+				 bad);
+		mds = ceph_decode_32(p);
+		inc = ceph_decode_32(p);
+		state = ceph_decode_32(p);
+		state_seq = ceph_decode_64(p);
+		ceph_decode_copy(p, &addr, sizeof(addr));
+		ceph_decode_addr(&addr);
+		ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+		*p += sizeof(u32);
+		ceph_decode_32_safe(p, end, namelen, bad);
+		*p += namelen;
+		if (infoversion >= 2) {
+			ceph_decode_32_safe(p, end, num_export_targets, bad);
+			pexport_targets = *p;
+			*p += num_export_targets * sizeof(u32);
+		} else {
+			num_export_targets = 0;
+		}
+
+		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+		     i+1, n, global_id, mds, inc,
+		     ceph_pr_addr(&addr.in_addr),
+		     ceph_mds_state_name(state));
+
+		if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+			continue;
+
+		info = &m->m_info[mds];
+		info->global_id = global_id;
+		info->state = state;
+		info->addr = addr;
+		info->laggy = (laggy_since.tv_sec != 0 ||
+			       laggy_since.tv_nsec != 0);
+		info->num_export_targets = num_export_targets;
+		if (num_export_targets) {
+			info->export_targets = kcalloc(num_export_targets,
+						       sizeof(u32), GFP_NOFS);
+			if (info->export_targets == NULL)
+				goto badmem;
+			for (j = 0; j < num_export_targets; j++)
+				info->export_targets[j] =
+				       ceph_decode_32(&pexport_targets);
+		} else {
+			info->export_targets = NULL;
+		}
+	}
+
+	/* pg_pools */
+	ceph_decode_32_safe(p, end, n, bad);
+	m->m_num_data_pg_pools = n;
+	m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
+	if (!m->m_data_pg_pools)
+		goto badmem;
+	ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
+	for (i = 0; i < n; i++)
+		m->m_data_pg_pools[i] = ceph_decode_64(p);
+	m->m_cas_pg_pool = ceph_decode_64(p);
+
+	/* ok, we don't care about the rest. */
+	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+	return m;
+
+badmem:
+	err = -ENOMEM;
+bad:
+	pr_err("corrupt mdsmap\n");
+	print_hex_dump(KERN_DEBUG, "mdsmap: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       start, end - start, true);
+	ceph_mdsmap_destroy(m);
+	return ERR_PTR(err);
+}
+
+void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
+{
+	int i;
+
+	for (i = 0; i < m->m_max_mds; i++)
+		kfree(m->m_info[i].export_targets);
+	kfree(m->m_info);
+	kfree(m->m_data_pg_pools);
+	kfree(m);
+}
diff --git a/kernel/fs/ceph/snap.c b/kernel/fs/ceph/snap.c
new file mode 100644
index 000000000..a97e39f09
--- /dev/null
+++ b/kernel/fs/ceph/snap.c
@@ -0,0 +1,977 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/sort.h>
+#include <linux/slab.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+
+/*
+ * Snapshots in ceph are driven in large part by cooperation from the
+ * client.  In contrast to local file systems or file servers that
+ * implement snapshots at a single point in the system, ceph's
+ * distributed access to storage requires clients to help decide
+ * whether a write logically occurs before or after a recently created
+ * snapshot.
+ *
+ * This provides a perfect instantanous client-wide snapshot.  Between
+ * clients, however, snapshots may appear to be applied at slightly
+ * different points in time, depending on delays in delivering the
+ * snapshot notification.
+ *
+ * Snapshots are _not_ file system-wide.  Instead, each snapshot
+ * applies to the subdirectory nested beneath some directory.  This
+ * effectively divides the hierarchy into multiple "realms," where all
+ * of the files contained by each realm share the same set of
+ * snapshots.  An individual realm's snap set contains snapshots
+ * explicitly created on that realm, as well as any snaps in its
+ * parent's snap set _after_ the point at which the parent became it's
+ * parent (due to, say, a rename).  Similarly, snaps from prior parents
+ * during the time intervals during which they were the parent are included.
+ *
+ * The client is spared most of this detail, fortunately... it must only
+ * maintains a hierarchy of realms reflecting the current parent/child
+ * realm relationship, and for each realm has an explicit list of snaps
+ * inherited from prior parents.
+ *
+ * A snap_realm struct is maintained for realms containing every inode
+ * with an open cap in the system.  (The needed snap realm information is
+ * provided by the MDS whenever a cap is issued, i.e., on open.)  A 'seq'
+ * version number is used to ensure that as realm parameters change (new
+ * snapshot, new parent, etc.) the client's realm hierarchy is updated.
+ *
+ * The realm hierarchy drives the generation of a 'snap context' for each
+ * realm, which simply lists the resulting set of snaps for the realm.  This
+ * is attached to any writes sent to OSDs.
+ */
+/*
+ * Unfortunately error handling is a bit mixed here.  If we get a snap
+ * update, but don't have enough memory to update our realm hierarchy,
+ * it's not clear what we can do about it (besides complaining to the
+ * console).
+ */
+
+
+/*
+ * increase ref count for the realm
+ *
+ * caller must hold snap_rwsem for write.
+ */
+void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+			 struct ceph_snap_realm *realm)
+{
+	dout("get_realm %p %d -> %d\n", realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+	/*
+	 * since we _only_ increment realm refs or empty the empty
+	 * list with snap_rwsem held, adjusting the empty list here is
+	 * safe.  we do need to protect against concurrent empty list
+	 * additions, however.
+	 */
+	if (atomic_inc_return(&realm->nref) == 1) {
+		spin_lock(&mdsc->snap_empty_lock);
+		list_del_init(&realm->empty_item);
+		spin_unlock(&mdsc->snap_empty_lock);
+	}
+}
+
+static void __insert_snap_realm(struct rb_root *root,
+				struct ceph_snap_realm *new)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_snap_realm *r = NULL;
+
+	while (*p) {
+		parent = *p;
+		r = rb_entry(parent, struct ceph_snap_realm, node);
+		if (new->ino < r->ino)
+			p = &(*p)->rb_left;
+		else if (new->ino > r->ino)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+}
+
+/*
+ * create and get the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *ceph_create_snap_realm(
+	struct ceph_mds_client *mdsc,
+	u64 ino)
+{
+	struct ceph_snap_realm *realm;
+
+	realm = kzalloc(sizeof(*realm), GFP_NOFS);
+	if (!realm)
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&realm->nref, 1);    /* for caller */
+	realm->ino = ino;
+	INIT_LIST_HEAD(&realm->children);
+	INIT_LIST_HEAD(&realm->child_item);
+	INIT_LIST_HEAD(&realm->empty_item);
+	INIT_LIST_HEAD(&realm->dirty_item);
+	INIT_LIST_HEAD(&realm->inodes_with_caps);
+	spin_lock_init(&realm->inodes_with_caps_lock);
+	__insert_snap_realm(&mdsc->snap_realms, realm);
+	dout("create_snap_realm %llx %p\n", realm->ino, realm);
+	return realm;
+}
+
+/*
+ * lookup the realm rooted at @ino.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
+						   u64 ino)
+{
+	struct rb_node *n = mdsc->snap_realms.rb_node;
+	struct ceph_snap_realm *r;
+
+	while (n) {
+		r = rb_entry(n, struct ceph_snap_realm, node);
+		if (ino < r->ino)
+			n = n->rb_left;
+		else if (ino > r->ino)
+			n = n->rb_right;
+		else {
+			dout("lookup_snap_realm %llx %p\n", r->ino, r);
+			return r;
+		}
+	}
+	return NULL;
+}
+
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+					       u64 ino)
+{
+	struct ceph_snap_realm *r;
+	r = __lookup_snap_realm(mdsc, ino);
+	if (r)
+		ceph_get_snap_realm(mdsc, r);
+	return r;
+}
+
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+			     struct ceph_snap_realm *realm);
+
+/*
+ * called with snap_rwsem (write)
+ */
+static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
+				 struct ceph_snap_realm *realm)
+{
+	dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+
+	rb_erase(&realm->node, &mdsc->snap_realms);
+
+	if (realm->parent) {
+		list_del_init(&realm->child_item);
+		__put_snap_realm(mdsc, realm->parent);
+	}
+
+	kfree(realm->prior_parent_snaps);
+	kfree(realm->snaps);
+	ceph_put_snap_context(realm->cached_context);
+	kfree(realm);
+}
+
+/*
+ * caller holds snap_rwsem (write)
+ */
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+			     struct ceph_snap_realm *realm)
+{
+	dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+	if (atomic_dec_and_test(&realm->nref))
+		__destroy_snap_realm(mdsc, realm);
+}
+
+/*
+ * caller needn't hold any locks
+ */
+void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+			 struct ceph_snap_realm *realm)
+{
+	dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+	if (!atomic_dec_and_test(&realm->nref))
+		return;
+
+	if (down_write_trylock(&mdsc->snap_rwsem)) {
+		__destroy_snap_realm(mdsc, realm);
+		up_write(&mdsc->snap_rwsem);
+	} else {
+		spin_lock(&mdsc->snap_empty_lock);
+		list_add(&realm->empty_item, &mdsc->snap_empty);
+		spin_unlock(&mdsc->snap_empty_lock);
+	}
+}
+
+/*
+ * Clean up any realms whose ref counts have dropped to zero.  Note
+ * that this does not include realms who were created but not yet
+ * used.
+ *
+ * Called under snap_rwsem (write)
+ */
+static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snap_realm *realm;
+
+	spin_lock(&mdsc->snap_empty_lock);
+	while (!list_empty(&mdsc->snap_empty)) {
+		realm = list_first_entry(&mdsc->snap_empty,
+				   struct ceph_snap_realm, empty_item);
+		list_del(&realm->empty_item);
+		spin_unlock(&mdsc->snap_empty_lock);
+		__destroy_snap_realm(mdsc, realm);
+		spin_lock(&mdsc->snap_empty_lock);
+	}
+	spin_unlock(&mdsc->snap_empty_lock);
+}
+
+void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+	down_write(&mdsc->snap_rwsem);
+	__cleanup_empty_realms(mdsc);
+	up_write(&mdsc->snap_rwsem);
+}
+
+/*
+ * adjust the parent realm of a given @realm.  adjust child list, and parent
+ * pointers, and ref counts appropriately.
+ *
+ * return true if parent was changed, 0 if unchanged, <0 on error.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
+				    struct ceph_snap_realm *realm,
+				    u64 parentino)
+{
+	struct ceph_snap_realm *parent;
+
+	if (realm->parent_ino == parentino)
+		return 0;
+
+	parent = ceph_lookup_snap_realm(mdsc, parentino);
+	if (!parent) {
+		parent = ceph_create_snap_realm(mdsc, parentino);
+		if (IS_ERR(parent))
+			return PTR_ERR(parent);
+	}
+	dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
+	     realm->ino, realm, realm->parent_ino, realm->parent,
+	     parentino, parent);
+	if (realm->parent) {
+		list_del_init(&realm->child_item);
+		ceph_put_snap_realm(mdsc, realm->parent);
+	}
+	realm->parent_ino = parentino;
+	realm->parent = parent;
+	list_add(&realm->child_item, &parent->children);
+	return 1;
+}
+
+
+static int cmpu64_rev(const void *a, const void *b)
+{
+	if (*(u64 *)a < *(u64 *)b)
+		return 1;
+	if (*(u64 *)a > *(u64 *)b)
+		return -1;
+	return 0;
+}
+
+
+static struct ceph_snap_context *empty_snapc;
+
+/*
+ * build the snap context for a given realm.
+ */
+static int build_snap_context(struct ceph_snap_realm *realm)
+{
+	struct ceph_snap_realm *parent = realm->parent;
+	struct ceph_snap_context *snapc;
+	int err = 0;
+	u32 num = realm->num_prior_parent_snaps + realm->num_snaps;
+
+	/*
+	 * build parent context, if it hasn't been built.
+	 * conservatively estimate that all parent snaps might be
+	 * included by us.
+	 */
+	if (parent) {
+		if (!parent->cached_context) {
+			err = build_snap_context(parent);
+			if (err)
+				goto fail;
+		}
+		num += parent->cached_context->num_snaps;
+	}
+
+	/* do i actually need to update?  not if my context seq
+	   matches realm seq, and my parents' does to.  (this works
+	   because we rebuild_snap_realms() works _downward_ in
+	   hierarchy after each update.) */
+	if (realm->cached_context &&
+	    realm->cached_context->seq == realm->seq &&
+	    (!parent ||
+	     realm->cached_context->seq >= parent->cached_context->seq)) {
+		dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
+		     " (unchanged)\n",
+		     realm->ino, realm, realm->cached_context,
+		     realm->cached_context->seq,
+		     (unsigned int) realm->cached_context->num_snaps);
+		return 0;
+	}
+
+	if (num == 0 && realm->seq == empty_snapc->seq) {
+		ceph_get_snap_context(empty_snapc);
+		snapc = empty_snapc;
+		goto done;
+	}
+
+	/* alloc new snap context */
+	err = -ENOMEM;
+	if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
+		goto fail;
+	snapc = ceph_create_snap_context(num, GFP_NOFS);
+	if (!snapc)
+		goto fail;
+
+	/* build (reverse sorted) snap vector */
+	num = 0;
+	snapc->seq = realm->seq;
+	if (parent) {
+		u32 i;
+
+		/* include any of parent's snaps occurring _after_ my
+		   parent became my parent */
+		for (i = 0; i < parent->cached_context->num_snaps; i++)
+			if (parent->cached_context->snaps[i] >=
+			    realm->parent_since)
+				snapc->snaps[num++] =
+					parent->cached_context->snaps[i];
+		if (parent->cached_context->seq > snapc->seq)
+			snapc->seq = parent->cached_context->seq;
+	}
+	memcpy(snapc->snaps + num, realm->snaps,
+	       sizeof(u64)*realm->num_snaps);
+	num += realm->num_snaps;
+	memcpy(snapc->snaps + num, realm->prior_parent_snaps,
+	       sizeof(u64)*realm->num_prior_parent_snaps);
+	num += realm->num_prior_parent_snaps;
+
+	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
+	snapc->num_snaps = num;
+	dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
+	     realm->ino, realm, snapc, snapc->seq,
+	     (unsigned int) snapc->num_snaps);
+
+done:
+	ceph_put_snap_context(realm->cached_context);
+	realm->cached_context = snapc;
+	return 0;
+
+fail:
+	/*
+	 * if we fail, clear old (incorrect) cached_context... hopefully
+	 * we'll have better luck building it later
+	 */
+	if (realm->cached_context) {
+		ceph_put_snap_context(realm->cached_context);
+		realm->cached_context = NULL;
+	}
+	pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
+	       realm, err);
+	return err;
+}
+
+/*
+ * rebuild snap context for the given realm and all of its children.
+ */
+static void rebuild_snap_realms(struct ceph_snap_realm *realm)
+{
+	struct ceph_snap_realm *child;
+
+	dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
+	build_snap_context(realm);
+
+	list_for_each_entry(child, &realm->children, child_item)
+		rebuild_snap_realms(child);
+}
+
+
+/*
+ * helper to allocate and decode an array of snapids.  free prior
+ * instance, if any.
+ */
+static int dup_array(u64 **dst, __le64 *src, u32 num)
+{
+	u32 i;
+
+	kfree(*dst);
+	if (num) {
+		*dst = kcalloc(num, sizeof(u64), GFP_NOFS);
+		if (!*dst)
+			return -ENOMEM;
+		for (i = 0; i < num; i++)
+			(*dst)[i] = get_unaligned_le64(src + i);
+	} else {
+		*dst = NULL;
+	}
+	return 0;
+}
+
+
+/*
+ * When a snapshot is applied, the size/mtime inode metadata is queued
+ * in a ceph_cap_snap (one for each snapshot) until writeback
+ * completes and the metadata can be flushed back to the MDS.
+ *
+ * However, if a (sync) write is currently in-progress when we apply
+ * the snapshot, we have to wait until the write succeeds or fails
+ * (and a final size/mtime is known).  In this case the
+ * cap_snap->writing = 1, and is said to be "pending."  When the write
+ * finishes, we __ceph_finish_cap_snap().
+ *
+ * Caller must hold snap_rwsem for read (i.e., the realm topology won't
+ * change).
+ */
+void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap_snap *capsnap;
+	int used, dirty;
+
+	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
+	if (!capsnap) {
+		pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
+		return;
+	}
+
+	spin_lock(&ci->i_ceph_lock);
+	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
+
+	/*
+	 * If there is a write in progress, treat that as a dirty Fw,
+	 * even though it hasn't completed yet; by the time we finish
+	 * up this capsnap it will be.
+	 */
+	if (used & CEPH_CAP_FILE_WR)
+		dirty |= CEPH_CAP_FILE_WR;
+
+	if (__ceph_have_pending_cap_snap(ci)) {
+		/* there is no point in queuing multiple "pending" cap_snaps,
+		   as no new writes are allowed to start when pending, so any
+		   writes in progress now were started before the previous
+		   cap_snap.  lucky us. */
+		dout("queue_cap_snap %p already pending\n", inode);
+		kfree(capsnap);
+	} else if (ci->i_snap_realm->cached_context == empty_snapc) {
+		dout("queue_cap_snap %p empty snapc\n", inode);
+		kfree(capsnap);
+	} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+			    CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
+		struct ceph_snap_context *snapc = ci->i_head_snapc;
+
+		/*
+		 * if we are a sync write, we may need to go to the snaprealm
+		 * to get the current snapc.
+		 */
+		if (!snapc)
+			snapc = ci->i_snap_realm->cached_context;
+
+		dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
+		     inode, capsnap, snapc, ceph_cap_string(dirty));
+		ihold(inode);
+
+		atomic_set(&capsnap->nref, 1);
+		capsnap->ci = ci;
+		INIT_LIST_HEAD(&capsnap->ci_item);
+		INIT_LIST_HEAD(&capsnap->flushing_item);
+
+		capsnap->follows = snapc->seq;
+		capsnap->issued = __ceph_caps_issued(ci, NULL);
+		capsnap->dirty = dirty;
+
+		capsnap->mode = inode->i_mode;
+		capsnap->uid = inode->i_uid;
+		capsnap->gid = inode->i_gid;
+
+		if (dirty & CEPH_CAP_XATTR_EXCL) {
+			__ceph_build_xattrs_blob(ci);
+			capsnap->xattr_blob =
+				ceph_buffer_get(ci->i_xattrs.blob);
+			capsnap->xattr_version = ci->i_xattrs.version;
+		} else {
+			capsnap->xattr_blob = NULL;
+			capsnap->xattr_version = 0;
+		}
+
+		capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
+		/* dirty page count moved from _head to this cap_snap;
+		   all subsequent writes page dirties occur _after_ this
+		   snapshot. */
+		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+		ci->i_wrbuffer_ref_head = 0;
+		capsnap->context = snapc;
+		ci->i_head_snapc =
+			ceph_get_snap_context(ci->i_snap_realm->cached_context);
+		dout(" new snapc is %p\n", ci->i_head_snapc);
+		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+		if (used & CEPH_CAP_FILE_WR) {
+			dout("queue_cap_snap %p cap_snap %p snapc %p"
+			     " seq %llu used WR, now pending\n", inode,
+			     capsnap, snapc, snapc->seq);
+			capsnap->writing = 1;
+		} else {
+			/* note mtime, size NOW. */
+			__ceph_finish_cap_snap(ci, capsnap);
+		}
+	} else {
+		dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+		kfree(capsnap);
+	}
+
+	spin_unlock(&ci->i_ceph_lock);
+}
+
+/*
+ * Finalize the size, mtime for a cap_snap.. that is, settle on final values
+ * to be used for the snapshot, to be flushed back to the mds.
+ *
+ * If capsnap can now be flushed, add to snap_flush list, and return 1.
+ *
+ * Caller must hold i_ceph_lock.
+ */
+int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+			    struct ceph_cap_snap *capsnap)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+	BUG_ON(capsnap->writing);
+	capsnap->size = inode->i_size;
+	capsnap->mtime = inode->i_mtime;
+	capsnap->atime = inode->i_atime;
+	capsnap->ctime = inode->i_ctime;
+	capsnap->time_warp_seq = ci->i_time_warp_seq;
+	if (capsnap->dirty_pages) {
+		dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
+		     "still has %d dirty pages\n", inode, capsnap,
+		     capsnap->context, capsnap->context->seq,
+		     ceph_cap_string(capsnap->dirty), capsnap->size,
+		     capsnap->dirty_pages);
+		return 0;
+	}
+	dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
+	     inode, capsnap, capsnap->context,
+	     capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+	     capsnap->size);
+
+	spin_lock(&mdsc->snap_flush_lock);
+	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+	spin_unlock(&mdsc->snap_flush_lock);
+	return 1;  /* caller may want to ceph_flush_snaps */
+}
+
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+	struct ceph_inode_info *ci;
+	struct inode *lastinode = NULL;
+	struct ceph_snap_realm *child;
+
+	dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+
+	spin_lock(&realm->inodes_with_caps_lock);
+	list_for_each_entry(ci, &realm->inodes_with_caps,
+			    i_snap_realm_item) {
+		struct inode *inode = igrab(&ci->vfs_inode);
+		if (!inode)
+			continue;
+		spin_unlock(&realm->inodes_with_caps_lock);
+		iput(lastinode);
+		lastinode = inode;
+		ceph_queue_cap_snap(ci);
+		spin_lock(&realm->inodes_with_caps_lock);
+	}
+	spin_unlock(&realm->inodes_with_caps_lock);
+	iput(lastinode);
+
+	list_for_each_entry(child, &realm->children, child_item) {
+		dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
+		     realm, realm->ino, child, child->ino);
+		list_del_init(&child->dirty_item);
+		list_add(&child->dirty_item, &realm->dirty_item);
+	}
+
+	list_del_init(&realm->dirty_item);
+	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
+
+/*
+ * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
+ * the snap realm parameters from a given realm and all of its ancestors,
+ * up to the root.
+ *
+ * Caller must hold snap_rwsem for write.
+ */
+int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
+			   void *p, void *e, bool deletion,
+			   struct ceph_snap_realm **realm_ret)
+{
+	struct ceph_mds_snap_realm *ri;    /* encoded */
+	__le64 *snaps;                     /* encoded */
+	__le64 *prior_parent_snaps;        /* encoded */
+	struct ceph_snap_realm *realm = NULL;
+	struct ceph_snap_realm *first_realm = NULL;
+	int invalidate = 0;
+	int err = -ENOMEM;
+	LIST_HEAD(dirty_realms);
+
+	dout("update_snap_trace deletion=%d\n", deletion);
+more:
+	ceph_decode_need(&p, e, sizeof(*ri), bad);
+	ri = p;
+	p += sizeof(*ri);
+	ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
+			    le32_to_cpu(ri->num_prior_parent_snaps)), bad);
+	snaps = p;
+	p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
+	prior_parent_snaps = p;
+	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
+
+	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
+	if (!realm) {
+		realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
+		if (IS_ERR(realm)) {
+			err = PTR_ERR(realm);
+			goto fail;
+		}
+	}
+
+	/* ensure the parent is correct */
+	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
+	if (err < 0)
+		goto fail;
+	invalidate += err;
+
+	if (le64_to_cpu(ri->seq) > realm->seq) {
+		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+		/* update realm parameters, snap lists */
+		realm->seq = le64_to_cpu(ri->seq);
+		realm->created = le64_to_cpu(ri->created);
+		realm->parent_since = le64_to_cpu(ri->parent_since);
+
+		realm->num_snaps = le32_to_cpu(ri->num_snaps);
+		err = dup_array(&realm->snaps, snaps, realm->num_snaps);
+		if (err < 0)
+			goto fail;
+
+		realm->num_prior_parent_snaps =
+			le32_to_cpu(ri->num_prior_parent_snaps);
+		err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
+				realm->num_prior_parent_snaps);
+		if (err < 0)
+			goto fail;
+
+		/* queue realm for cap_snap creation */
+		list_add(&realm->dirty_item, &dirty_realms);
+
+		invalidate = 1;
+	} else if (!realm->cached_context) {
+		dout("update_snap_trace %llx %p seq %lld new\n",
+		     realm->ino, realm, realm->seq);
+		invalidate = 1;
+	} else {
+		dout("update_snap_trace %llx %p seq %lld unchanged\n",
+		     realm->ino, realm, realm->seq);
+	}
+
+	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
+	     realm, invalidate, p, e);
+
+	/* invalidate when we reach the _end_ (root) of the trace */
+	if (invalidate && p >= e)
+		rebuild_snap_realms(realm);
+
+	if (!first_realm)
+		first_realm = realm;
+	else
+		ceph_put_snap_realm(mdsc, realm);
+
+	if (p < e)
+		goto more;
+
+	/*
+	 * queue cap snaps _after_ we've built the new snap contexts,
+	 * so that i_head_snapc can be set appropriately.
+	 */
+	while (!list_empty(&dirty_realms)) {
+		realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
+					 dirty_item);
+		queue_realm_cap_snaps(realm);
+	}
+
+	if (realm_ret)
+		*realm_ret = first_realm;
+	else
+		ceph_put_snap_realm(mdsc, first_realm);
+
+	__cleanup_empty_realms(mdsc);
+	return 0;
+
+bad:
+	err = -EINVAL;
+fail:
+	if (realm && !IS_ERR(realm))
+		ceph_put_snap_realm(mdsc, realm);
+	if (first_realm)
+		ceph_put_snap_realm(mdsc, first_realm);
+	pr_err("update_snap_trace error %d\n", err);
+	return err;
+}
+
+
+/*
+ * Send any cap_snaps that are queued for flush.  Try to carry
+ * s_mutex across multiple snap flushes to avoid locking overhead.
+ *
+ * Caller holds no locks.
+ */
+static void flush_snaps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	struct inode *inode;
+	struct ceph_mds_session *session = NULL;
+
+	dout("flush_snaps\n");
+	spin_lock(&mdsc->snap_flush_lock);
+	while (!list_empty(&mdsc->snap_flush_list)) {
+		ci = list_first_entry(&mdsc->snap_flush_list,
+				struct ceph_inode_info, i_snap_flush_item);
+		inode = &ci->vfs_inode;
+		ihold(inode);
+		spin_unlock(&mdsc->snap_flush_lock);
+		spin_lock(&ci->i_ceph_lock);
+		__ceph_flush_snaps(ci, &session, 0);
+		spin_unlock(&ci->i_ceph_lock);
+		iput(inode);
+		spin_lock(&mdsc->snap_flush_lock);
+	}
+	spin_unlock(&mdsc->snap_flush_lock);
+
+	if (session) {
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+	}
+	dout("flush_snaps done\n");
+}
+
+
+/*
+ * Handle a snap notification from the MDS.
+ *
+ * This can take two basic forms: the simplest is just a snap creation
+ * or deletion notification on an existing realm.  This should update the
+ * realm and its children.
+ *
+ * The more difficult case is realm creation, due to snap creation at a
+ * new point in the file hierarchy, or due to a rename that moves a file or
+ * directory into another realm.
+ */
+void ceph_handle_snap(struct ceph_mds_client *mdsc,
+		      struct ceph_mds_session *session,
+		      struct ceph_msg *msg)
+{
+	struct super_block *sb = mdsc->fsc->sb;
+	int mds = session->s_mds;
+	u64 split;
+	int op;
+	int trace_len;
+	struct ceph_snap_realm *realm = NULL;
+	void *p = msg->front.iov_base;
+	void *e = p + msg->front.iov_len;
+	struct ceph_mds_snap_head *h;
+	int num_split_inos, num_split_realms;
+	__le64 *split_inos = NULL, *split_realms = NULL;
+	int i;
+	int locked_rwsem = 0;
+
+	/* decode */
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	h = p;
+	op = le32_to_cpu(h->op);
+	split = le64_to_cpu(h->split);   /* non-zero if we are splitting an
+					  * existing realm */
+	num_split_inos = le32_to_cpu(h->num_split_inos);
+	num_split_realms = le32_to_cpu(h->num_split_realms);
+	trace_len = le32_to_cpu(h->trace_len);
+	p += sizeof(*h);
+
+	dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
+	     ceph_snap_op_name(op), split, trace_len);
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+	mutex_unlock(&session->s_mutex);
+
+	down_write(&mdsc->snap_rwsem);
+	locked_rwsem = 1;
+
+	if (op == CEPH_SNAP_OP_SPLIT) {
+		struct ceph_mds_snap_realm *ri;
+
+		/*
+		 * A "split" breaks part of an existing realm off into
+		 * a new realm.  The MDS provides a list of inodes
+		 * (with caps) and child realms that belong to the new
+		 * child.
+		 */
+		split_inos = p;
+		p += sizeof(u64) * num_split_inos;
+		split_realms = p;
+		p += sizeof(u64) * num_split_realms;
+		ceph_decode_need(&p, e, sizeof(*ri), bad);
+		/* we will peek at realm info here, but will _not_
+		 * advance p, as the realm update will occur below in
+		 * ceph_update_snap_trace. */
+		ri = p;
+
+		realm = ceph_lookup_snap_realm(mdsc, split);
+		if (!realm) {
+			realm = ceph_create_snap_realm(mdsc, split);
+			if (IS_ERR(realm))
+				goto out;
+		}
+
+		dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+		for (i = 0; i < num_split_inos; i++) {
+			struct ceph_vino vino = {
+				.ino = le64_to_cpu(split_inos[i]),
+				.snap = CEPH_NOSNAP,
+			};
+			struct inode *inode = ceph_find_inode(sb, vino);
+			struct ceph_inode_info *ci;
+			struct ceph_snap_realm *oldrealm;
+
+			if (!inode)
+				continue;
+			ci = ceph_inode(inode);
+
+			spin_lock(&ci->i_ceph_lock);
+			if (!ci->i_snap_realm)
+				goto skip_inode;
+			/*
+			 * If this inode belongs to a realm that was
+			 * created after our new realm, we experienced
+			 * a race (due to another split notifications
+			 * arriving from a different MDS).  So skip
+			 * this inode.
+			 */
+			if (ci->i_snap_realm->created >
+			    le64_to_cpu(ri->created)) {
+				dout(" leaving %p in newer realm %llx %p\n",
+				     inode, ci->i_snap_realm->ino,
+				     ci->i_snap_realm);
+				goto skip_inode;
+			}
+			dout(" will move %p to split realm %llx %p\n",
+			     inode, realm->ino, realm);
+			/*
+			 * Move the inode to the new realm
+			 */
+			spin_lock(&realm->inodes_with_caps_lock);
+			list_del_init(&ci->i_snap_realm_item);
+			list_add(&ci->i_snap_realm_item,
+				 &realm->inodes_with_caps);
+			oldrealm = ci->i_snap_realm;
+			ci->i_snap_realm = realm;
+			spin_unlock(&realm->inodes_with_caps_lock);
+			spin_unlock(&ci->i_ceph_lock);
+
+			ceph_get_snap_realm(mdsc, realm);
+			ceph_put_snap_realm(mdsc, oldrealm);
+
+			iput(inode);
+			continue;
+
+skip_inode:
+			spin_unlock(&ci->i_ceph_lock);
+			iput(inode);
+		}
+
+		/* we may have taken some of the old realm's children. */
+		for (i = 0; i < num_split_realms; i++) {
+			struct ceph_snap_realm *child =
+				__lookup_snap_realm(mdsc,
+					   le64_to_cpu(split_realms[i]));
+			if (!child)
+				continue;
+			adjust_snap_realm_parent(mdsc, child, realm->ino);
+		}
+	}
+
+	/*
+	 * update using the provided snap trace. if we are deleting a
+	 * snap, we can avoid queueing cap_snaps.
+	 */
+	ceph_update_snap_trace(mdsc, p, e,
+			       op == CEPH_SNAP_OP_DESTROY, NULL);
+
+	if (op == CEPH_SNAP_OP_SPLIT)
+		/* we took a reference when we created the realm, above */
+		ceph_put_snap_realm(mdsc, realm);
+
+	__cleanup_empty_realms(mdsc);
+
+	up_write(&mdsc->snap_rwsem);
+
+	flush_snaps(mdsc);
+	return;
+
+bad:
+	pr_err("corrupt snap message from mds%d\n", mds);
+	ceph_msg_dump(msg);
+out:
+	if (locked_rwsem)
+		up_write(&mdsc->snap_rwsem);
+	return;
+}
+
+int __init ceph_snap_init(void)
+{
+	empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
+	if (!empty_snapc)
+		return -ENOMEM;
+	empty_snapc->seq = 1;
+	return 0;
+}
+
+void ceph_snap_exit(void)
+{
+	ceph_put_snap_context(empty_snapc);
+}
diff --git a/kernel/fs/ceph/strings.c b/kernel/fs/ceph/strings.c
new file mode 100644
index 000000000..89e6bc321
--- /dev/null
+++ b/kernel/fs/ceph/strings.c
@@ -0,0 +1,125 @@
+/*
+ * Ceph fs string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+
+const char *ceph_mds_state_name(int s)
+{
+	switch (s) {
+		/* down and out */
+	case CEPH_MDS_STATE_DNE:        return "down:dne";
+	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+		/* up and out */
+	case CEPH_MDS_STATE_BOOT:       return "up:boot";
+	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
+	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
+	case CEPH_MDS_STATE_CREATING:   return "up:creating";
+	case CEPH_MDS_STATE_STARTING:   return "up:starting";
+		/* up and in */
+	case CEPH_MDS_STATE_REPLAY:     return "up:replay";
+	case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
+	case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
+	case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
+	case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+	case CEPH_MDS_STATE_ACTIVE:     return "up:active";
+	case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
+	}
+	return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+	switch (op) {
+	case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+	case CEPH_SESSION_OPEN: return "open";
+	case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+	case CEPH_SESSION_CLOSE: return "close";
+	case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+	case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+	case CEPH_SESSION_STALE: return "stale";
+	case CEPH_SESSION_RECALL_STATE: return "recall_state";
+	case CEPH_SESSION_FLUSHMSG: return "flushmsg";
+	case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
+	}
+	return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+	switch (op) {
+	case CEPH_MDS_OP_LOOKUP:  return "lookup";
+	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
+	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
+	case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
+	case CEPH_MDS_OP_GETATTR:  return "getattr";
+	case CEPH_MDS_OP_SETXATTR: return "setxattr";
+	case CEPH_MDS_OP_SETATTR: return "setattr";
+	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
+	case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
+	case CEPH_MDS_OP_READDIR: return "readdir";
+	case CEPH_MDS_OP_MKNOD: return "mknod";
+	case CEPH_MDS_OP_LINK: return "link";
+	case CEPH_MDS_OP_UNLINK: return "unlink";
+	case CEPH_MDS_OP_RENAME: return "rename";
+	case CEPH_MDS_OP_MKDIR: return "mkdir";
+	case CEPH_MDS_OP_RMDIR: return "rmdir";
+	case CEPH_MDS_OP_SYMLINK: return "symlink";
+	case CEPH_MDS_OP_CREATE: return "create";
+	case CEPH_MDS_OP_OPEN: return "open";
+	case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+	case CEPH_MDS_OP_LSSNAP: return "lssnap";
+	case CEPH_MDS_OP_MKSNAP: return "mksnap";
+	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+	case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
+	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
+	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
+	}
+	return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+	switch (op) {
+	case CEPH_CAP_OP_GRANT: return "grant";
+	case CEPH_CAP_OP_REVOKE: return "revoke";
+	case CEPH_CAP_OP_TRUNC: return "trunc";
+	case CEPH_CAP_OP_EXPORT: return "export";
+	case CEPH_CAP_OP_IMPORT: return "import";
+	case CEPH_CAP_OP_UPDATE: return "update";
+	case CEPH_CAP_OP_DROP: return "drop";
+	case CEPH_CAP_OP_FLUSH: return "flush";
+	case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+	case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+	case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+	case CEPH_CAP_OP_RELEASE: return "release";
+	case CEPH_CAP_OP_RENEW: return "renew";
+	}
+	return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+	switch (o) {
+	case CEPH_MDS_LEASE_REVOKE: return "revoke";
+	case CEPH_MDS_LEASE_RELEASE: return "release";
+	case CEPH_MDS_LEASE_RENEW: return "renew";
+	case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+	}
+	return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+	switch (o) {
+	case CEPH_SNAP_OP_UPDATE: return "update";
+	case CEPH_SNAP_OP_CREATE: return "create";
+	case CEPH_SNAP_OP_DESTROY: return "destroy";
+	case CEPH_SNAP_OP_SPLIT: return "split";
+	}
+	return "???";
+}
diff --git a/kernel/fs/ceph/super.c b/kernel/fs/ceph/super.c
new file mode 100644
index 000000000..4e9905374
--- /dev/null
+++ b/kernel/fs/ceph/super.c
@@ -0,0 +1,1059 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+/*
+ * Ceph superblock operations
+ *
+ * Handle the basics of mounting, unmounting.
+ */
+
+/*
+ * super ops
+ */
+static void ceph_put_super(struct super_block *s)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+
+	dout("put_super\n");
+	ceph_mdsc_close_sessions(fsc->mdsc);
+}
+
+static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
+	struct ceph_monmap *monmap = fsc->client->monc.monmap;
+	struct ceph_statfs st;
+	u64 fsid;
+	int err;
+
+	dout("statfs\n");
+	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
+	if (err < 0)
+		return err;
+
+	/* fill in kstatfs */
+	buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
+
+	/*
+	 * express utilization in terms of large blocks to avoid
+	 * overflow on 32-bit machines.
+	 *
+	 * NOTE: for the time being, we make bsize == frsize to humor
+	 * not-yet-ancient versions of glibc that are broken.
+	 * Someday, we will probably want to report a real block
+	 * size...  whatever that may mean for a network file system!
+	 */
+	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+	buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
+	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+
+	buf->f_files = le64_to_cpu(st.num_objects);
+	buf->f_ffree = -1;
+	buf->f_namelen = NAME_MAX;
+
+	/* leave fsid little-endian, regardless of host endianness */
+	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
+	buf->f_fsid.val[0] = fsid & 0xffffffff;
+	buf->f_fsid.val[1] = fsid >> 32;
+
+	return 0;
+}
+
+
+static int ceph_sync_fs(struct super_block *sb, int wait)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+	if (!wait) {
+		dout("sync_fs (non-blocking)\n");
+		ceph_flush_dirty_caps(fsc->mdsc);
+		dout("sync_fs (non-blocking) done\n");
+		return 0;
+	}
+
+	dout("sync_fs (blocking)\n");
+	ceph_osdc_sync(&fsc->client->osdc);
+	ceph_mdsc_sync(fsc->mdsc);
+	dout("sync_fs (blocking) done\n");
+	return 0;
+}
+
+/*
+ * mount options
+ */
+enum {
+	Opt_wsize,
+	Opt_rsize,
+	Opt_rasize,
+	Opt_caps_wanted_delay_min,
+	Opt_caps_wanted_delay_max,
+	Opt_cap_release_safety,
+	Opt_readdir_max_entries,
+	Opt_readdir_max_bytes,
+	Opt_congestion_kb,
+	Opt_last_int,
+	/* int args above */
+	Opt_snapdirname,
+	Opt_last_string,
+	/* string args above */
+	Opt_dirstat,
+	Opt_nodirstat,
+	Opt_rbytes,
+	Opt_norbytes,
+	Opt_asyncreaddir,
+	Opt_noasyncreaddir,
+	Opt_dcache,
+	Opt_nodcache,
+	Opt_ino32,
+	Opt_noino32,
+	Opt_fscache,
+	Opt_nofscache,
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	Opt_acl,
+#endif
+	Opt_noacl
+};
+
+static match_table_t fsopt_tokens = {
+	{Opt_wsize, "wsize=%d"},
+	{Opt_rsize, "rsize=%d"},
+	{Opt_rasize, "rasize=%d"},
+	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
+	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+	{Opt_cap_release_safety, "cap_release_safety=%d"},
+	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
+	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
+	{Opt_congestion_kb, "write_congestion_kb=%d"},
+	/* int args above */
+	{Opt_snapdirname, "snapdirname=%s"},
+	/* string args above */
+	{Opt_dirstat, "dirstat"},
+	{Opt_nodirstat, "nodirstat"},
+	{Opt_rbytes, "rbytes"},
+	{Opt_norbytes, "norbytes"},
+	{Opt_asyncreaddir, "asyncreaddir"},
+	{Opt_noasyncreaddir, "noasyncreaddir"},
+	{Opt_dcache, "dcache"},
+	{Opt_nodcache, "nodcache"},
+	{Opt_ino32, "ino32"},
+	{Opt_noino32, "noino32"},
+	{Opt_fscache, "fsc"},
+	{Opt_nofscache, "nofsc"},
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	{Opt_acl, "acl"},
+#endif
+	{Opt_noacl, "noacl"},
+	{-1, NULL}
+};
+
+static int parse_fsopt_token(char *c, void *private)
+{
+	struct ceph_mount_options *fsopt = private;
+	substring_t argstr[MAX_OPT_ARGS];
+	int token, intval, ret;
+
+	token = match_token((char *)c, fsopt_tokens, argstr);
+	if (token < 0)
+		return -EINVAL;
+
+	if (token < Opt_last_int) {
+		ret = match_int(&argstr[0], &intval);
+		if (ret < 0) {
+			pr_err("bad mount option arg (not int) "
+			       "at '%s'\n", c);
+			return ret;
+		}
+		dout("got int token %d val %d\n", token, intval);
+	} else if (token > Opt_last_int && token < Opt_last_string) {
+		dout("got string token %d val %s\n", token,
+		     argstr[0].from);
+	} else {
+		dout("got token %d\n", token);
+	}
+
+	switch (token) {
+	case Opt_snapdirname:
+		kfree(fsopt->snapdir_name);
+		fsopt->snapdir_name = kstrndup(argstr[0].from,
+					       argstr[0].to-argstr[0].from,
+					       GFP_KERNEL);
+		if (!fsopt->snapdir_name)
+			return -ENOMEM;
+		break;
+
+		/* misc */
+	case Opt_wsize:
+		fsopt->wsize = intval;
+		break;
+	case Opt_rsize:
+		fsopt->rsize = intval;
+		break;
+	case Opt_rasize:
+		fsopt->rasize = intval;
+		break;
+	case Opt_caps_wanted_delay_min:
+		fsopt->caps_wanted_delay_min = intval;
+		break;
+	case Opt_caps_wanted_delay_max:
+		fsopt->caps_wanted_delay_max = intval;
+		break;
+	case Opt_readdir_max_entries:
+		fsopt->max_readdir = intval;
+		break;
+	case Opt_readdir_max_bytes:
+		fsopt->max_readdir_bytes = intval;
+		break;
+	case Opt_congestion_kb:
+		fsopt->congestion_kb = intval;
+		break;
+	case Opt_dirstat:
+		fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+		break;
+	case Opt_nodirstat:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+		break;
+	case Opt_rbytes:
+		fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+		break;
+	case Opt_norbytes:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+		break;
+	case Opt_asyncreaddir:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
+		break;
+	case Opt_noasyncreaddir:
+		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+		break;
+	case Opt_dcache:
+		fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+		break;
+	case Opt_nodcache:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+		break;
+	case Opt_ino32:
+		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
+		break;
+	case Opt_noino32:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+		break;
+	case Opt_fscache:
+		fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+		break;
+	case Opt_nofscache:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
+		break;
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	case Opt_acl:
+		fsopt->sb_flags |= MS_POSIXACL;
+		break;
+#endif
+	case Opt_noacl:
+		fsopt->sb_flags &= ~MS_POSIXACL;
+		break;
+	default:
+		BUG_ON(token);
+	}
+	return 0;
+}
+
+static void destroy_mount_options(struct ceph_mount_options *args)
+{
+	dout("destroy_mount_options %p\n", args);
+	kfree(args->snapdir_name);
+	kfree(args);
+}
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (!s1 && !s2)
+		return 0;
+	if (s1 && !s2)
+		return -1;
+	if (!s1 && s2)
+		return 1;
+	return strcmp(s1, s2);
+}
+
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+				 struct ceph_options *new_opt,
+				 struct ceph_fs_client *fsc)
+{
+	struct ceph_mount_options *fsopt1 = new_fsopt;
+	struct ceph_mount_options *fsopt2 = fsc->mount_options;
+	int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+	int ret;
+
+	ret = memcmp(fsopt1, fsopt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+	if (ret)
+		return ret;
+
+	return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+			       struct ceph_options **popt,
+			       int flags, char *options,
+			       const char *dev_name,
+			       const char **path)
+{
+	struct ceph_mount_options *fsopt;
+	const char *dev_name_end;
+	int err;
+
+	if (!dev_name || !*dev_name)
+		return -EINVAL;
+
+	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+	if (!fsopt)
+		return -ENOMEM;
+
+	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+	fsopt->sb_flags = flags;
+	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+	fsopt->rsize = CEPH_RSIZE_DEFAULT;
+	fsopt->rasize = CEPH_RASIZE_DEFAULT;
+	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+	if (!fsopt->snapdir_name) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+	fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+	fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+	fsopt->congestion_kb = default_congestion_kb();
+
+	/*
+	 * Distinguish the server list from the path in "dev_name".
+	 * Internally we do not include the leading '/' in the path.
+	 *
+	 * "dev_name" will look like:
+	 *     <server_spec>[,<server_spec>...]:[<path>]
+	 * where
+	 *     <server_spec> is <ip>[:<port>]
+	 *     <path> is optional, but if present must begin with '/'
+	 */
+	dev_name_end = strchr(dev_name, '/');
+	if (dev_name_end) {
+		/* skip over leading '/' for path */
+		*path = dev_name_end + 1;
+	} else {
+		/* path is empty */
+		dev_name_end = dev_name + strlen(dev_name);
+		*path = dev_name_end;
+	}
+	err = -EINVAL;
+	dev_name_end--;		/* back up to ':' separator */
+	if (dev_name_end < dev_name || *dev_name_end != ':') {
+		pr_err("device name is missing path (no : separator in %s)\n",
+				dev_name);
+		goto out;
+	}
+	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
+	dout("server path '%s'\n", *path);
+
+	*popt = ceph_parse_options(options, dev_name, dev_name_end,
+				 parse_fsopt_token, (void *)fsopt);
+	if (IS_ERR(*popt)) {
+		err = PTR_ERR(*popt);
+		goto out;
+	}
+
+	/* success */
+	*pfsopt = fsopt;
+	return 0;
+
+out:
+	destroy_mount_options(fsopt);
+	return err;
+}
+
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @root: root of that (sub)tree
+ */
+static int ceph_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
+	struct ceph_mount_options *fsopt = fsc->mount_options;
+	size_t pos;
+	int ret;
+
+	/* a comma between MNT/MS and client options */
+	seq_putc(m, ',');
+	pos = m->count;
+
+	ret = ceph_print_client_options(m, fsc->client);
+	if (ret)
+		return ret;
+
+	/* retract our comma if no client options */
+	if (m->count == pos)
+		m->count--;
+
+	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+		seq_puts(m, ",dirstat");
+	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+		seq_puts(m, ",norbytes");
+	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+		seq_puts(m, ",noasyncreaddir");
+	if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
+		seq_puts(m, ",nodcache");
+	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
+		seq_puts(m, ",fsc");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	if (fsopt->sb_flags & MS_POSIXACL)
+		seq_puts(m, ",acl");
+	else
+		seq_puts(m, ",noacl");
+#endif
+
+	if (fsopt->wsize)
+		seq_printf(m, ",wsize=%d", fsopt->wsize);
+	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
+		seq_printf(m, ",rsize=%d", fsopt->rsize);
+	if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
+		seq_printf(m, ",rasize=%d", fsopt->rasize);
+	if (fsopt->congestion_kb != default_congestion_kb())
+		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_min=%d",
+			 fsopt->caps_wanted_delay_min);
+	if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_max=%d",
+			   fsopt->caps_wanted_delay_max);
+	if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+		seq_printf(m, ",cap_release_safety=%d",
+			   fsopt->cap_release_safety);
+	if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+		seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+
+	return 0;
+}
+
+/*
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
+ */
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
+{
+	struct ceph_fs_client *fsc = client->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	switch (type) {
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(fsc->mdsc, msg);
+		return 0;
+
+	default:
+		return -1;
+	}
+}
+
+/*
+ * create a new fs client
+ */
+static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+					struct ceph_options *opt)
+{
+	struct ceph_fs_client *fsc;
+	const u64 supported_features =
+		CEPH_FEATURE_FLOCK |
+		CEPH_FEATURE_DIRLAYOUTHASH |
+		CEPH_FEATURE_MDS_INLINE_DATA;
+	const u64 required_features = 0;
+	int page_count;
+	size_t size;
+	int err = -ENOMEM;
+
+	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+	if (!fsc)
+		return ERR_PTR(-ENOMEM);
+
+	fsc->client = ceph_create_client(opt, fsc, supported_features,
+					 required_features);
+	if (IS_ERR(fsc->client)) {
+		err = PTR_ERR(fsc->client);
+		goto fail;
+	}
+	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+	fsc->client->monc.want_mdsmap = 1;
+
+	fsc->mount_options = fsopt;
+
+	fsc->sb = NULL;
+	fsc->mount_state = CEPH_MOUNT_MOUNTING;
+
+	atomic_long_set(&fsc->writeback_count, 0);
+
+	err = bdi_init(&fsc->backing_dev_info);
+	if (err < 0)
+		goto fail_client;
+
+	err = -ENOMEM;
+	/*
+	 * The number of concurrent works can be high but they don't need
+	 * to be processed in parallel, limit concurrency.
+	 */
+	fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
+	if (fsc->wb_wq == NULL)
+		goto fail_bdi;
+	fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
+	if (fsc->pg_inv_wq == NULL)
+		goto fail_wb_wq;
+	fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
+	if (fsc->trunc_wq == NULL)
+		goto fail_pg_inv_wq;
+
+	/* set up mempools */
+	err = -ENOMEM;
+	page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
+	size = sizeof (struct page *) * (page_count ? page_count : 1);
+	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
+	if (!fsc->wb_pagevec_pool)
+		goto fail_trunc_wq;
+
+	/* setup fscache */
+	if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
+	    (ceph_fscache_register_fs(fsc) != 0))
+		goto fail_fscache;
+
+	/* caps */
+	fsc->min_caps = fsopt->max_readdir;
+
+	return fsc;
+
+fail_fscache:
+	ceph_fscache_unregister_fs(fsc);
+fail_trunc_wq:
+	destroy_workqueue(fsc->trunc_wq);
+fail_pg_inv_wq:
+	destroy_workqueue(fsc->pg_inv_wq);
+fail_wb_wq:
+	destroy_workqueue(fsc->wb_wq);
+fail_bdi:
+	bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+	ceph_destroy_client(fsc->client);
+fail:
+	kfree(fsc);
+	return ERR_PTR(err);
+}
+
+static void destroy_fs_client(struct ceph_fs_client *fsc)
+{
+	dout("destroy_fs_client %p\n", fsc);
+
+	ceph_fscache_unregister_fs(fsc);
+
+	destroy_workqueue(fsc->wb_wq);
+	destroy_workqueue(fsc->pg_inv_wq);
+	destroy_workqueue(fsc->trunc_wq);
+
+	bdi_destroy(&fsc->backing_dev_info);
+
+	mempool_destroy(fsc->wb_pagevec_pool);
+
+	destroy_mount_options(fsc->mount_options);
+
+	ceph_fs_debugfs_cleanup(fsc);
+
+	ceph_destroy_client(fsc->client);
+
+	kfree(fsc);
+	dout("destroy_fs_client %p done\n", fsc);
+}
+
+/*
+ * caches
+ */
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
+{
+	struct ceph_inode_info *ci = foo;
+	inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+	int error = -ENOMEM;
+
+	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+				      sizeof(struct ceph_inode_info),
+				      __alignof__(struct ceph_inode_info),
+				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+				      ceph_inode_init_once);
+	if (ceph_inode_cachep == NULL)
+		return -ENOMEM;
+
+	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_cachep == NULL)
+		goto bad_cap;
+
+	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_dentry_cachep == NULL)
+		goto bad_dentry;
+
+	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_file_cachep == NULL)
+		goto bad_file;
+
+	if ((error = ceph_fscache_register()))
+		goto bad_file;
+
+	return 0;
+bad_file:
+	kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+	kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+	kmem_cache_destroy(ceph_inode_cachep);
+	return error;
+}
+
+static void destroy_caches(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+
+	kmem_cache_destroy(ceph_inode_cachep);
+	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_dentry_cachep);
+	kmem_cache_destroy(ceph_file_cachep);
+
+	ceph_fscache_unregister();
+}
+
+
+/*
+ * ceph_umount_begin - initiate forced umount.  Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
+ */
+static void ceph_umount_begin(struct super_block *sb)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+	dout("ceph_umount_begin - starting forced umount\n");
+	if (!fsc)
+		return;
+	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+	return;
+}
+
+static const struct super_operations ceph_super_ops = {
+	.alloc_inode	= ceph_alloc_inode,
+	.destroy_inode	= ceph_destroy_inode,
+	.write_inode    = ceph_write_inode,
+	.drop_inode	= ceph_drop_inode,
+	.sync_fs        = ceph_sync_fs,
+	.put_super	= ceph_put_super,
+	.show_options   = ceph_show_options,
+	.statfs		= ceph_statfs,
+	.umount_begin   = ceph_umount_begin,
+};
+
+/*
+ * Bootstrap mount by opening the root directory.  Note the mount
+ * @started time from caller, and time out if this takes too long.
+ */
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
+				       const char *path,
+				       unsigned long started)
+{
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req = NULL;
+	int err;
+	struct dentry *root;
+
+	/* open dir */
+	dout("open_root_inode opening '%s'\n", path);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_CAST(req);
+	req->r_path1 = kstrdup(path, GFP_NOFS);
+	if (!req->r_path1) {
+		root = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	req->r_ino1.ino = CEPH_INO_ROOT;
+	req->r_ino1.snap = CEPH_NOSNAP;
+	req->r_started = started;
+	req->r_timeout = fsc->client->options->mount_timeout * HZ;
+	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+	req->r_num_caps = 2;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	if (err == 0) {
+		struct inode *inode = req->r_target_inode;
+		req->r_target_inode = NULL;
+		dout("open_root_inode success\n");
+		if (ceph_ino(inode) == CEPH_INO_ROOT &&
+		    fsc->sb->s_root == NULL) {
+			root = d_make_root(inode);
+			if (!root) {
+				root = ERR_PTR(-ENOMEM);
+				goto out;
+			}
+		} else {
+			root = d_obtain_root(inode);
+		}
+		ceph_init_dentry(root);
+		dout("open_root_inode success, root dentry is %p\n", root);
+	} else {
+		root = ERR_PTR(err);
+	}
+out:
+	ceph_mdsc_put_request(req);
+	return root;
+}
+
+
+
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
+		      const char *path)
+{
+	int err;
+	unsigned long started = jiffies;  /* note the start time */
+	struct dentry *root;
+	int first = 0;   /* first vfsmount for this super_block */
+
+	dout("mount start\n");
+	mutex_lock(&fsc->client->mount_mutex);
+
+	err = __ceph_open_session(fsc->client, started);
+	if (err < 0)
+		goto out;
+
+	dout("mount opening root\n");
+	root = open_root_dentry(fsc, "", started);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto out;
+	}
+	if (fsc->sb->s_root) {
+		dput(root);
+	} else {
+		fsc->sb->s_root = root;
+		first = 1;
+
+		err = ceph_fs_debugfs_init(fsc);
+		if (err < 0)
+			goto fail;
+	}
+
+	if (path[0] == 0) {
+		dget(root);
+	} else {
+		dout("mount opening base mountpoint\n");
+		root = open_root_dentry(fsc, path, started);
+		if (IS_ERR(root)) {
+			err = PTR_ERR(root);
+			goto fail;
+		}
+	}
+
+	fsc->mount_state = CEPH_MOUNT_MOUNTED;
+	dout("mount success\n");
+	mutex_unlock(&fsc->client->mount_mutex);
+	return root;
+
+out:
+	mutex_unlock(&fsc->client->mount_mutex);
+	return ERR_PTR(err);
+
+fail:
+	if (first) {
+		dput(fsc->sb->s_root);
+		fsc->sb->s_root = NULL;
+	}
+	goto out;
+}
+
+static int ceph_set_super(struct super_block *s, void *data)
+{
+	struct ceph_fs_client *fsc = data;
+	int ret;
+
+	dout("set_super %p data %p\n", s, data);
+
+	s->s_flags = fsc->mount_options->sb_flags;
+	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
+
+	s->s_xattr = ceph_xattr_handlers;
+	s->s_fs_info = fsc;
+	fsc->sb = s;
+
+	s->s_op = &ceph_super_ops;
+	s->s_export_op = &ceph_export_ops;
+
+	s->s_time_gran = 1000;  /* 1000 ns == 1 us */
+
+	ret = set_anon_super(s, NULL);  /* what is that second arg for? */
+	if (ret != 0)
+		goto fail;
+
+	return ret;
+
+fail:
+	s->s_fs_info = NULL;
+	fsc->sb = NULL;
+	return ret;
+}
+
+/*
+ * share superblock if same fs AND options
+ */
+static int ceph_compare_super(struct super_block *sb, void *data)
+{
+	struct ceph_fs_client *new = data;
+	struct ceph_mount_options *fsopt = new->mount_options;
+	struct ceph_options *opt = new->client->options;
+	struct ceph_fs_client *other = ceph_sb_to_client(sb);
+
+	dout("ceph_compare_super %p\n", sb);
+
+	if (compare_mount_options(fsopt, opt, other)) {
+		dout("monitor(s)/mount options don't match\n");
+		return 0;
+	}
+	if ((opt->flags & CEPH_OPT_FSID) &&
+	    ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+		dout("fsid doesn't match\n");
+		return 0;
+	}
+	if (fsopt->sb_flags != other->mount_options->sb_flags) {
+		dout("flags differ\n");
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * construct our own bdi so we can control readahead, etc.
+ */
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+static int ceph_register_bdi(struct super_block *sb,
+			     struct ceph_fs_client *fsc)
+{
+	int err;
+
+	/* set ra_pages based on rasize mount option? */
+	if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
+		fsc->backing_dev_info.ra_pages =
+			(fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
+			>> PAGE_SHIFT;
+	else
+		fsc->backing_dev_info.ra_pages =
+			VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+
+	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
+			   atomic_long_inc_return(&bdi_seq));
+	if (!err)
+		sb->s_bdi = &fsc->backing_dev_info;
+	return err;
+}
+
+static struct dentry *ceph_mount(struct file_system_type *fs_type,
+		       int flags, const char *dev_name, void *data)
+{
+	struct super_block *sb;
+	struct ceph_fs_client *fsc;
+	struct dentry *res;
+	int err;
+	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
+	const char *path = NULL;
+	struct ceph_mount_options *fsopt = NULL;
+	struct ceph_options *opt = NULL;
+
+	dout("ceph_mount\n");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	flags |= MS_POSIXACL;
+#endif
+	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+	if (err < 0) {
+		res = ERR_PTR(err);
+		goto out_final;
+	}
+
+	/* create client (which we may/may not use) */
+	fsc = create_fs_client(fsopt, opt);
+	if (IS_ERR(fsc)) {
+		res = ERR_CAST(fsc);
+		destroy_mount_options(fsopt);
+		ceph_destroy_options(opt);
+		goto out_final;
+	}
+
+	err = ceph_mdsc_init(fsc);
+	if (err < 0) {
+		res = ERR_PTR(err);
+		goto out;
+	}
+
+	if (ceph_test_opt(fsc->client, NOSHARE))
+		compare_super = NULL;
+	sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
+	if (IS_ERR(sb)) {
+		res = ERR_CAST(sb);
+		goto out;
+	}
+
+	if (ceph_sb_to_client(sb) != fsc) {
+		ceph_mdsc_destroy(fsc);
+		destroy_fs_client(fsc);
+		fsc = ceph_sb_to_client(sb);
+		dout("get_sb got existing client %p\n", fsc);
+	} else {
+		dout("get_sb using new client %p\n", fsc);
+		err = ceph_register_bdi(sb, fsc);
+		if (err < 0) {
+			res = ERR_PTR(err);
+			goto out_splat;
+		}
+	}
+
+	res = ceph_real_mount(fsc, path);
+	if (IS_ERR(res))
+		goto out_splat;
+	dout("root %p inode %p ino %llx.%llx\n", res,
+	     d_inode(res), ceph_vinop(d_inode(res)));
+	return res;
+
+out_splat:
+	ceph_mdsc_close_sessions(fsc->mdsc);
+	deactivate_locked_super(sb);
+	goto out_final;
+
+out:
+	ceph_mdsc_destroy(fsc);
+	destroy_fs_client(fsc);
+out_final:
+	dout("ceph_mount fail %ld\n", PTR_ERR(res));
+	return res;
+}
+
+static void ceph_kill_sb(struct super_block *s)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+	dev_t dev = s->s_dev;
+
+	dout("kill_sb %p\n", s);
+
+	ceph_mdsc_pre_umount(fsc->mdsc);
+	generic_shutdown_super(s);
+	ceph_mdsc_destroy(fsc);
+
+	destroy_fs_client(fsc);
+	free_anon_bdev(dev);
+}
+
+static struct file_system_type ceph_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ceph",
+	.mount		= ceph_mount,
+	.kill_sb	= ceph_kill_sb,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE,
+};
+MODULE_ALIAS_FS("ceph");
+
+static int __init init_ceph(void)
+{
+	int ret = init_caches();
+	if (ret)
+		goto out;
+
+	ceph_flock_init();
+	ceph_xattr_init();
+	ret = ceph_snap_init();
+	if (ret)
+		goto out_xattr;
+	ret = register_filesystem(&ceph_fs_type);
+	if (ret)
+		goto out_snap;
+
+	pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
+	return 0;
+
+out_snap:
+	ceph_snap_exit();
+out_xattr:
+	ceph_xattr_exit();
+	destroy_caches();
+out:
+	return ret;
+}
+
+static void __exit exit_ceph(void)
+{
+	dout("exit_ceph\n");
+	unregister_filesystem(&ceph_fs_type);
+	ceph_snap_exit();
+	ceph_xattr_exit();
+	destroy_caches();
+}
+
+module_init(init_ceph);
+module_exit(exit_ceph);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/ceph/super.h b/kernel/fs/ceph/super.h
new file mode 100644
index 000000000..fa20e1318
--- /dev/null
+++ b/kernel/fs/ceph/super.h
@@ -0,0 +1,946 @@
+#ifndef _FS_CEPH_SUPER_H
+#define _FS_CEPH_SUPER_H
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+#include <linux/posix_acl.h>
+
+#include <linux/ceph/libceph.h>
+
+#ifdef CONFIG_CEPH_FSCACHE
+#include <linux/fscache.h>
+#endif
+
+/* f_type in struct statfs */
+#define CEPH_SUPER_MAGIC 0x00c36400
+
+/* large granularity for statfs utilization stats to facilitate
+ * large volume sizes on 32-bit machines. */
+#define CEPH_BLOCK_SHIFT   22  /* 4 MB */
+#define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
+
+#define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_INO32           (1<<8) /* 32 bit inos */
+#define CEPH_MOUNT_OPT_DCACHE          (1<<9) /* use dcache for readdir etc */
+#define CEPH_MOUNT_OPT_FSCACHE         (1<<10) /* use fscache */
+
+#define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES | \
+				   CEPH_MOUNT_OPT_DCACHE)
+
+#define ceph_set_mount_opt(fsc, opt) \
+	(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+	(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
+
+#define CEPH_RSIZE_DEFAULT             0           /* max read size */
+#define CEPH_RASIZE_DEFAULT            (8192*1024) /* readahead */
+#define CEPH_MAX_READDIR_DEFAULT        1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
+
+struct ceph_mount_options {
+	int flags;
+	int sb_flags;
+
+	int wsize;            /* max write size */
+	int rsize;            /* max read size */
+	int rasize;           /* max readahead */
+	int congestion_kb;    /* max writeback in flight */
+	int caps_wanted_delay_min, caps_wanted_delay_max;
+	int cap_release_safety;
+	int max_readdir;       /* max readdir result (entires) */
+	int max_readdir_bytes; /* max readdir result (bytes) */
+
+	/*
+	 * everything above this point can be memcmp'd; everything below
+	 * is handled in compare_mount_options()
+	 */
+
+	char *snapdir_name;   /* default ".snap" */
+};
+
+struct ceph_fs_client {
+	struct super_block *sb;
+
+	struct ceph_mount_options *mount_options;
+	struct ceph_client *client;
+
+	unsigned long mount_state;
+	int min_caps;                  /* min caps i added */
+
+	struct ceph_mds_client *mdsc;
+
+	/* writeback */
+	mempool_t *wb_pagevec_pool;
+	struct workqueue_struct *wb_wq;
+	struct workqueue_struct *pg_inv_wq;
+	struct workqueue_struct *trunc_wq;
+	atomic_long_t writeback_count;
+
+	struct backing_dev_info backing_dev_info;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_dentry_lru, *debugfs_caps;
+	struct dentry *debugfs_congestion_kb;
+	struct dentry *debugfs_bdi;
+	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
+	struct dentry *debugfs_mds_sessions;
+#endif
+
+#ifdef CONFIG_CEPH_FSCACHE
+	struct fscache_cookie *fscache;
+	struct workqueue_struct *revalidate_wq;
+#endif
+};
+
+
+/*
+ * File i/o capability.  This tracks shared state with the metadata
+ * server that allows us to cache or writeback attributes or to read
+ * and write data.  For any given inode, we should have one or more
+ * capabilities, one issued by each metadata server, and our
+ * cumulative access is the OR of all issued capabilities.
+ *
+ * Each cap is referenced by the inode's i_caps rbtree and by per-mds
+ * session capability lists.
+ */
+struct ceph_cap {
+	struct ceph_inode_info *ci;
+	struct rb_node ci_node;          /* per-ci cap tree */
+	struct ceph_mds_session *session;
+	struct list_head session_caps;   /* per-session caplist */
+	int mds;
+	u64 cap_id;       /* unique cap id (mds provided) */
+	int issued;       /* latest, from the mds */
+	int implemented;  /* implemented superset of issued (for revocation) */
+	int mds_wanted;
+	u32 seq, issue_seq, mseq;
+	u32 cap_gen;      /* active/stale cycle */
+	unsigned long last_used;
+	struct list_head caps_item;
+};
+
+#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
+#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
+#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
+
+/*
+ * Snapped cap state that is pending flush to mds.  When a snapshot occurs,
+ * we first complete any in-process sync writes and writeback any dirty
+ * data before flushing the snapped state (tracked here) back to the MDS.
+ */
+struct ceph_cap_snap {
+	atomic_t nref;
+	struct ceph_inode_info *ci;
+	struct list_head ci_item, flushing_item;
+
+	u64 follows, flush_tid;
+	int issued, dirty;
+	struct ceph_snap_context *context;
+
+	umode_t mode;
+	kuid_t uid;
+	kgid_t gid;
+
+	struct ceph_buffer *xattr_blob;
+	u64 xattr_version;
+
+	u64 size;
+	struct timespec mtime, atime, ctime;
+	u64 time_warp_seq;
+	int writing;   /* a sync write is still in progress */
+	int dirty_pages;     /* dirty pages awaiting writeback */
+	bool inline_data;
+};
+
+static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
+{
+	if (atomic_dec_and_test(&capsnap->nref)) {
+		if (capsnap->xattr_blob)
+			ceph_buffer_put(capsnap->xattr_blob);
+		kfree(capsnap);
+	}
+}
+
+/*
+ * The frag tree describes how a directory is fragmented, potentially across
+ * multiple metadata servers.  It is also used to indicate points where
+ * metadata authority is delegated, and whether/where metadata is replicated.
+ *
+ * A _leaf_ frag will be present in the i_fragtree IFF there is
+ * delegation info.  That is, if mds >= 0 || ndist > 0.
+ */
+#define CEPH_MAX_DIRFRAG_REP 4
+
+struct ceph_inode_frag {
+	struct rb_node node;
+
+	/* fragtree state */
+	u32 frag;
+	int split_by;         /* i.e. 2^(split_by) children */
+
+	/* delegation and replication info */
+	int mds;              /* -1 if same authority as parent */
+	int ndist;            /* >0 if replicated */
+	int dist[CEPH_MAX_DIRFRAG_REP];
+};
+
+/*
+ * We cache inode xattrs as an encoded blob until they are first used,
+ * at which point we parse them into an rbtree.
+ */
+struct ceph_inode_xattr {
+	struct rb_node node;
+
+	const char *name;
+	int name_len;
+	const char *val;
+	int val_len;
+	int dirty;
+
+	int should_free_name;
+	int should_free_val;
+};
+
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+	struct ceph_mds_session *lease_session;
+	u32 lease_gen, lease_shared_gen;
+	u32 lease_seq;
+	unsigned long lease_renew_after, lease_renew_from;
+	struct list_head lru;
+	struct dentry *dentry;
+	u64 time;
+	u64 offset;
+};
+
+struct ceph_inode_xattrs_info {
+	/*
+	 * (still encoded) xattr blob. we avoid the overhead of parsing
+	 * this until someone actually calls getxattr, etc.
+	 *
+	 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
+	 * NULL means we don't know.
+	*/
+	struct ceph_buffer *blob, *prealloc_blob;
+
+	struct rb_root index;
+	bool dirty;
+	int count;
+	int names_size;
+	int vals_size;
+	u64 version, index_version;
+};
+
+/*
+ * Ceph inode.
+ */
+struct ceph_inode_info {
+	struct ceph_vino i_vino;   /* ceph ino + snap */
+
+	spinlock_t i_ceph_lock;
+
+	u64 i_version;
+	u64 i_inline_version;
+	u32 i_time_warp_seq;
+
+	unsigned i_ceph_flags;
+	int i_ordered_count;
+	atomic_t i_release_count;
+	atomic_t i_complete_count;
+
+	struct ceph_dir_layout i_dir_layout;
+	struct ceph_file_layout i_layout;
+	char *i_symlink;
+
+	/* for dirs */
+	struct timespec i_rctime;
+	u64 i_rbytes, i_rfiles, i_rsubdirs;
+	u64 i_files, i_subdirs;
+
+	struct rb_root i_fragtree;
+	struct mutex i_fragtree_mutex;
+
+	struct ceph_inode_xattrs_info i_xattrs;
+
+	/* capabilities.  protected _both_ by i_ceph_lock and cap->session's
+	 * s_mutex. */
+	struct rb_root i_caps;           /* cap list */
+	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
+	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
+	struct list_head i_dirty_item, i_flushing_item;
+	u64 i_cap_flush_seq;
+	/* we need to track cap writeback on a per-cap-bit basis, to allow
+	 * overlapping, pipelined cap flushes to the mds.  we can probably
+	 * reduce the tid to 8 bits if we're concerned about inode size. */
+	u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
+	unsigned long i_hold_caps_min; /* jiffies */
+	unsigned long i_hold_caps_max; /* jiffies */
+	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
+	struct ceph_cap_reservation i_cap_migration_resv;
+	struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
+	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or
+						    dirty|flushing caps */
+	unsigned i_snap_caps;           /* cap bits for snapped files */
+
+	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
+
+	struct mutex i_truncate_mutex;
+	u32 i_truncate_seq;        /* last truncate to smaller size */
+	u64 i_truncate_size;       /*  and the size we last truncated down to */
+	int i_truncate_pending;    /*  still need to call vmtruncate */
+
+	u64 i_max_size;            /* max file size authorized by mds */
+	u64 i_reported_size; /* (max_)size reported to or requested of mds */
+	u64 i_wanted_max_size;     /* offset we'd like to write too */
+	u64 i_requested_max_size;  /* max_size we've requested */
+
+	/* held references to caps */
+	int i_pin_ref;
+	int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
+	int i_wrbuffer_ref, i_wrbuffer_ref_head;
+	u32 i_shared_gen;       /* increment each time we get FILE_SHARED */
+	u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. */
+	u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
+
+	struct list_head i_unsafe_writes; /* uncommitted sync writes */
+	struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+	spinlock_t i_unsafe_lock;
+
+	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+	int i_snap_realm_counter; /* snap realm (if caps) */
+	struct list_head i_snap_realm_item;
+	struct list_head i_snap_flush_item;
+
+	struct work_struct i_wb_work;  /* writeback work */
+	struct work_struct i_pg_inv_work;  /* page invalidation work */
+
+	struct work_struct i_vmtruncate_work;
+
+#ifdef CONFIG_CEPH_FSCACHE
+	struct fscache_cookie *fscache;
+	u32 i_fscache_gen; /* sequence, for delayed fscache validate */
+	struct work_struct i_revalidate_work;
+#endif
+	struct inode vfs_inode; /* at end */
+};
+
+static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
+{
+	return container_of(inode, struct ceph_inode_info, vfs_inode);
+}
+
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
+{
+	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
+{
+	return (struct ceph_fs_client *)sb->s_fs_info;
+}
+
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ *               i_ino (kernel inode)   st_ino (userspace)
+ * i386          32                     32
+ * x86_64+ino32  64                     32
+ * x86_64        64                     64
+ */
+static inline u32 ceph_ino_to_ino32(__u64 vino)
+{
+	u32 ino = vino & 0xffffffff;
+	ino ^= vino >> 32;
+	if (!ino)
+		ino = 2;
+	return ino;
+}
+
+/*
+ * kernel i_ino value
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+#if BITS_PER_LONG == 32
+	return ceph_ino_to_ino32(vino.ino);
+#else
+	return (ino_t)vino.ino;
+#endif
+}
+
+/*
+ * user-visible ino (stat, filldir)
+ */
+#if BITS_PER_LONG == 32
+static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
+{
+	return ino;
+}
+#else
+static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
+{
+	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
+		ino = ceph_ino_to_ino32(ino);
+	return ino;
+}
+#endif
+
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+	struct ceph_vino *pvino = (struct ceph_vino *)data;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return ci->i_vino.ino == pvino->ino &&
+		ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+					    struct ceph_vino vino)
+{
+	ino_t t = ceph_vino_to_ino(vino);
+	return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_DIR_ORDERED	1  /* dentries in dir are ordered */
+#define CEPH_I_NODELAY		4  /* do not delay cap release */
+#define CEPH_I_FLUSH		8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH		16 /* do not flush dirty caps */
+
+static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
+					   int release_count, int ordered_count)
+{
+	atomic_set(&ci->i_complete_count, release_count);
+	if (ci->i_ordered_count == ordered_count)
+		ci->i_ceph_flags |= CEPH_I_DIR_ORDERED;
+	else
+		ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
+}
+
+static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
+{
+	atomic_inc(&ci->i_release_count);
+}
+
+static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
+{
+	return atomic_read(&ci->i_complete_count) ==
+		atomic_read(&ci->i_release_count);
+}
+
+static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
+{
+	return __ceph_dir_is_complete(ci) &&
+		(ci->i_ceph_flags & CEPH_I_DIR_ORDERED);
+}
+
+static inline void ceph_dir_clear_complete(struct inode *inode)
+{
+	__ceph_dir_clear_complete(ceph_inode(inode));
+}
+
+static inline void ceph_dir_clear_ordered(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	spin_lock(&ci->i_ceph_lock);
+	ci->i_ordered_count++;
+	ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
+	spin_unlock(&ci->i_ceph_lock);
+}
+
+static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	bool ret;
+	spin_lock(&ci->i_ceph_lock);
+	ret = __ceph_dir_is_complete_ordered(ci);
+	spin_unlock(&ci->i_ceph_lock);
+	return ret;
+}
+
+/* find a specific frag @f */
+extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
+						u32 f);
+
+/*
+ * choose fragment for value @v.  copy frag content to pfrag, if leaf
+ * exists
+ */
+extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+			    struct ceph_inode_frag *pfrag,
+			    int *found);
+
+static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+{
+	return (struct ceph_dentry_info *)dentry->d_fsdata;
+}
+
+static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
+{
+	return ((loff_t)frag << 32) | (loff_t)off;
+}
+
+/*
+ * caps helpers
+ */
+static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
+{
+	return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
+extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
+				    struct ceph_cap *cap);
+
+static inline int ceph_caps_issued(struct ceph_inode_info *ci)
+{
+	int issued;
+	spin_lock(&ci->i_ceph_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	spin_unlock(&ci->i_ceph_lock);
+	return issued;
+}
+
+static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
+					int touch)
+{
+	int r;
+	spin_lock(&ci->i_ceph_lock);
+	r = __ceph_caps_issued_mask(ci, mask, touch);
+	spin_unlock(&ci->i_ceph_lock);
+	return r;
+}
+
+static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
+{
+	return ci->i_dirty_caps | ci->i_flushing_caps;
+}
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+
+extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+				      struct ceph_cap *ocap, int mask);
+extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_used(struct ceph_inode_info *ci);
+
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+	if (w & CEPH_CAP_FILE_BUFFER)
+		w |= CEPH_CAP_FILE_EXCL;  /* we want EXCL if dirty data */
+	return w;
+}
+
+/* what the mds thinks we want */
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+
+extern void ceph_caps_init(struct ceph_mds_client *mdsc);
+extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
+extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
+			     struct ceph_cap_reservation *ctx, int need);
+extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+			       struct ceph_cap_reservation *ctx);
+extern void ceph_reservation_status(struct ceph_fs_client *client,
+				    int *total, int *avail, int *used,
+				    int *reserved, int *min);
+
+
+
+/*
+ * we keep buffered readdir results attached to file->private_data
+ */
+#define CEPH_F_SYNC     1
+#define CEPH_F_ATEND    2
+
+struct ceph_file_info {
+	short fmode;     /* initialized on open */
+	short flags;     /* CEPH_F_* */
+
+	/* readdir: position within the dir */
+	u32 frag;
+	struct ceph_mds_request *last_readdir;
+
+	/* readdir: position within a frag */
+	unsigned offset;       /* offset of last chunk, adjusted for . and .. */
+	unsigned next_offset;  /* offset of next chunk (last_name's + 1) */
+	char *last_name;       /* last entry in previous chunk */
+	struct dentry *dentry; /* next dentry (for dcache readdir) */
+	int dir_release_count;
+	int dir_ordered_count;
+
+	/* used for -o dirstat read() on directory thing */
+	char *dir_info;
+	int dir_info_len;
+};
+
+
+
+/*
+ * A "snap realm" describes a subset of the file hierarchy sharing
+ * the same set of snapshots that apply to it.  The realms themselves
+ * are organized into a hierarchy, such that children inherit (some of)
+ * the snapshots of their parents.
+ *
+ * All inodes within the realm that have capabilities are linked into a
+ * per-realm list.
+ */
+struct ceph_snap_realm {
+	u64 ino;
+	atomic_t nref;
+	struct rb_node node;
+
+	u64 created, seq;
+	u64 parent_ino;
+	u64 parent_since;   /* snapid when our current parent became so */
+
+	u64 *prior_parent_snaps;      /* snaps inherited from any parents we */
+	u32 num_prior_parent_snaps;   /*  had prior to parent_since */
+	u64 *snaps;                   /* snaps specific to this realm */
+	u32 num_snaps;
+
+	struct ceph_snap_realm *parent;
+	struct list_head children;       /* list of child realms */
+	struct list_head child_item;
+
+	struct list_head empty_item;     /* if i have ref==0 */
+
+	struct list_head dirty_item;     /* if realm needs new context */
+
+	/* the current set of snaps for this realm */
+	struct ceph_snap_context *cached_context;
+
+	struct list_head inodes_with_caps;
+	spinlock_t inodes_with_caps_lock;
+};
+
+static inline int default_congestion_kb(void)
+{
+	int congestion_kb;
+
+	/*
+	 * Copied from NFS
+	 *
+	 * congestion size, scale with available memory.
+	 *
+	 *  64MB:    8192k
+	 * 128MB:   11585k
+	 * 256MB:   16384k
+	 * 512MB:   23170k
+	 *   1GB:   32768k
+	 *   2GB:   46340k
+	 *   4GB:   65536k
+	 *   8GB:   92681k
+	 *  16GB:  131072k
+	 *
+	 * This allows larger machines to have larger/more transfers.
+	 * Limit the default to 256M
+	 */
+	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	if (congestion_kb > 256*1024)
+		congestion_kb = 256*1024;
+
+	return congestion_kb;
+}
+
+
+
+/* snap.c */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+					       u64 ino);
+extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+				struct ceph_snap_realm *realm);
+extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+				struct ceph_snap_realm *realm);
+extern int ceph_update_snap_trace(struct ceph_mds_client *m,
+				  void *p, void *e, bool deletion,
+				  struct ceph_snap_realm **realm_ret);
+extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session,
+			     struct ceph_msg *msg);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
+extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+				  struct ceph_cap_snap *capsnap);
+extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern int ceph_snap_init(void);
+extern void ceph_snap_exit(void);
+
+/*
+ * a cap_snap is "pending" if it is still awaiting an in-progress
+ * sync write (that may/may not still update size, mtime, etc.).
+ */
+static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
+{
+	return !list_empty(&ci->i_cap_snaps) &&
+		list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
+			   ci_item)->writing;
+}
+
+/* inode.c */
+extern const struct inode_operations ceph_file_iops;
+
+extern struct inode *ceph_alloc_inode(struct super_block *sb);
+extern void ceph_destroy_inode(struct inode *inode);
+extern int ceph_drop_inode(struct inode *inode);
+
+extern struct inode *ceph_get_inode(struct super_block *sb,
+				    struct ceph_vino vino);
+extern struct inode *ceph_get_snapdir(struct inode *parent);
+extern int ceph_fill_file_size(struct inode *inode, int issued,
+			       u32 truncate_seq, u64 truncate_size, u64 size);
+extern void ceph_fill_file_time(struct inode *inode, int issued,
+				u64 time_warp_seq, struct timespec *ctime,
+				struct timespec *mtime, struct timespec *atime);
+extern int ceph_fill_trace(struct super_block *sb,
+			   struct ceph_mds_request *req,
+			   struct ceph_mds_session *session);
+extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+				    struct ceph_mds_session *session);
+
+extern int ceph_inode_holds_cap(struct inode *inode, int mask);
+
+extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
+extern void ceph_queue_vmtruncate(struct inode *inode);
+
+extern void ceph_queue_invalidate(struct inode *inode);
+extern void ceph_queue_writeback(struct inode *inode);
+
+extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
+			     int mask, bool force);
+static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
+{
+	return __ceph_do_getattr(inode, NULL, mask, force);
+}
+extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat);
+
+/* xattr.c */
+extern int ceph_setxattr(struct dentry *, const char *, const void *,
+			 size_t, int);
+int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
+ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
+int __ceph_removexattr(struct dentry *, const char *);
+extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
+extern int ceph_removexattr(struct dentry *, const char *);
+extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+extern void __init ceph_xattr_init(void);
+extern void ceph_xattr_exit(void);
+extern const struct xattr_handler *ceph_xattr_handlers[];
+
+/* acl.c */
+struct ceph_acls_info {
+	void *default_acl;
+	void *acl;
+	struct ceph_pagelist *pagelist;
+};
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+
+struct posix_acl *ceph_get_acl(struct inode *, int);
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+		       struct ceph_acls_info *info);
+void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info);
+void ceph_release_acls_info(struct ceph_acls_info *info);
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+       forget_all_cached_acls(inode);
+}
+
+#else
+
+#define ceph_get_acl NULL
+#define ceph_set_acl NULL
+
+static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+				     struct ceph_acls_info *info)
+{
+	return 0;
+}
+static inline void ceph_init_inode_acls(struct inode *inode,
+					struct ceph_acls_info *info)
+{
+}
+static inline void ceph_release_acls_info(struct ceph_acls_info *info)
+{
+}
+static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
+{
+	return 0;
+}
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+}
+
+#endif
+
+/* caps.c */
+extern const char *ceph_cap_string(int c);
+extern void ceph_handle_caps(struct ceph_mds_session *session,
+			     struct ceph_msg *msg);
+extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+				     struct ceph_cap_reservation *ctx);
+extern void ceph_add_cap(struct inode *inode,
+			 struct ceph_mds_session *session, u64 cap_id,
+			 int fmode, unsigned issued, unsigned wanted,
+			 unsigned cap, unsigned seq, u64 realmino, int flags,
+			 struct ceph_cap **new_cap);
+extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_put_cap(struct ceph_mds_client *mdsc,
+			 struct ceph_cap *cap);
+extern int ceph_is_any_caps(struct inode *inode);
+
+extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
+				u64 cap_id, u32 migrate_seq, u32 issue_seq);
+extern void ceph_queue_caps_release(struct inode *inode);
+extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
+extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync);
+extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+				    struct ceph_mds_session *session);
+extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
+					     int mds);
+extern int ceph_get_cap_mds(struct inode *inode);
+extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
+extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+				       struct ceph_snap_context *snapc);
+extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
+			       struct ceph_mds_session **psession,
+			       int again);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+			    struct ceph_mds_session *session);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+
+extern int ceph_encode_inode_release(void **p, struct inode *inode,
+				     int mds, int drop, int unless, int force);
+extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+				      int mds, int drop, int unless);
+
+extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+			 loff_t endoff, int *got, struct page **pinned_page);
+
+/* for counting open files by mode */
+static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
+{
+	ci->i_nr_by_mode[mode]++;
+}
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+
+/* addr.c */
+extern const struct address_space_operations ceph_aops;
+extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* file.c */
+extern const struct file_operations ceph_file_fops;
+
+extern int ceph_open(struct inode *inode, struct file *file);
+extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
+			    struct file *file, unsigned flags, umode_t mode,
+			    int *opened);
+extern int ceph_release(struct inode *inode, struct file *filp);
+extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
+				  char *data, size_t len);
+int ceph_uninline_data(struct file *filp, struct page *locked_page);
+/* dir.c */
+extern const struct file_operations ceph_dir_fops;
+extern const struct file_operations ceph_snapdir_fops;
+extern const struct inode_operations ceph_dir_iops;
+extern const struct inode_operations ceph_snapdir_iops;
+extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
+	ceph_snapdir_dentry_ops;
+
+extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern int ceph_handle_snapdir(struct ceph_mds_request *req,
+			       struct dentry *dentry, int err);
+extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+					 struct dentry *dentry, int err);
+
+extern void ceph_dentry_lru_add(struct dentry *dn);
+extern void ceph_dentry_lru_touch(struct dentry *dn);
+extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
+extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
+
+/*
+ * our d_ops vary depending on whether the inode is live,
+ * snapshotted (read-only), or a virtual ".snap" directory.
+ */
+int ceph_init_dentry(struct dentry *dentry);
+
+
+/* ioctl.c */
+extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* export.c */
+extern const struct export_operations ceph_export_ops;
+
+/* locks.c */
+extern __init void ceph_flock_init(void);
+extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
+extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
+extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+				       struct ceph_filelock *flocks,
+				       int num_fcntl_locks,
+				       int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+				  struct ceph_pagelist *pagelist,
+				  int num_fcntl_locks, int num_flock_locks);
+extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
+
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/kernel/fs/ceph/xattr.c b/kernel/fs/ceph/xattr.c
new file mode 100644
index 000000000..cd7ffad40
--- /dev/null
+++ b/kernel/fs/ceph/xattr.c
@@ -0,0 +1,1117 @@
+#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/pagelist.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
+
+#define XATTR_CEPH_PREFIX "ceph."
+#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+			  struct ceph_inode_xattr *xattr);
+
+/*
+ * List of handlers for synthetic system.* attributes. Other
+ * attributes are handled directly.
+ */
+const struct xattr_handler *ceph_xattr_handlers[] = {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	NULL,
+};
+
+static bool ceph_is_valid_xattr(const char *name)
+{
+	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_SECURITY_PREFIX,
+			XATTR_SECURITY_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+/*
+ * These define virtual xattrs exposing the recursive directory
+ * statistics and layout metadata.
+ */
+struct ceph_vxattr {
+	char *name;
+	size_t name_size;	/* strlen(name) + 1 (for '\0') */
+	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+			      size_t size);
+	bool readonly, hidden;
+	bool (*exists_cb)(struct ceph_inode_info *ci);
+};
+
+/* layouts */
+
+static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
+{
+	size_t s;
+	char *p = (char *)&ci->i_layout;
+
+	for (s = 0; s < sizeof(ci->i_layout); s++, p++)
+		if (*p)
+			return true;
+	return false;
+}
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+				   size_t size)
+{
+	int ret;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	const char *pool_name;
+	char buf[128];
+
+	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+	down_read(&osdc->map_sem);
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+	if (pool_name) {
+		size_t len = strlen(pool_name);
+		ret = snprintf(buf, sizeof(buf),
+		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+		if (!size) {
+			ret += len;
+		} else if (ret + len > size) {
+			ret = -ERANGE;
+		} else {
+			memcpy(val, buf, ret);
+			memcpy(val + ret, pool_name, len);
+			ret += len;
+		}
+	} else {
+		ret = snprintf(buf, sizeof(buf),
+		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+		(unsigned long long)pool);
+		if (size) {
+			if (ret <= size)
+				memcpy(val, buf, ret);
+			else
+				ret = -ERANGE;
+		}
+	}
+	up_read(&osdc->map_sem);
+	return ret;
+}
+
+static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+					       char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+			(unsigned long long)ceph_file_layout_su(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+						char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+	       (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+					       char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+	       (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+					char *val, size_t size)
+{
+	int ret;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	const char *pool_name;
+
+	down_read(&osdc->map_sem);
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+	if (pool_name)
+		ret = snprintf(val, size, "%s", pool_name);
+	else
+		ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+	up_read(&osdc->map_sem);
+	return ret;
+}
+
+/* directories */
+
+static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
+				      size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_files);
+}
+
+static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
+					 size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rfiles);
+}
+
+static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
+					 size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rbytes);
+}
+
+static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,
+			(long)ci->i_rctime.tv_nsec);
+}
+
+
+#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2)	\
+	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
+
+#define XATTR_NAME_CEPH(_type, _name)					\
+	{								\
+		.name = CEPH_XATTR_NAME(_type, _name),			\
+		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+		.readonly = true,				\
+		.hidden = false,				\
+		.exists_cb = NULL,			\
+	}
+#define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
+	{								\
+		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\
+		.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
+		.readonly = false,				\
+		.hidden = true,			\
+		.exists_cb = ceph_vxattrcb_layout_exists,	\
+	}
+
+static struct ceph_vxattr ceph_dir_vxattrs[] = {
+	{
+		.name = "ceph.dir.layout",
+		.name_size = sizeof("ceph.dir.layout"),
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = false,
+		.hidden = true,
+		.exists_cb = ceph_vxattrcb_layout_exists,
+	},
+	XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+	XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+	XATTR_LAYOUT_FIELD(dir, layout, object_size),
+	XATTR_LAYOUT_FIELD(dir, layout, pool),
+	XATTR_NAME_CEPH(dir, entries),
+	XATTR_NAME_CEPH(dir, files),
+	XATTR_NAME_CEPH(dir, subdirs),
+	XATTR_NAME_CEPH(dir, rentries),
+	XATTR_NAME_CEPH(dir, rfiles),
+	XATTR_NAME_CEPH(dir, rsubdirs),
+	XATTR_NAME_CEPH(dir, rbytes),
+	XATTR_NAME_CEPH(dir, rctime),
+	{ .name = NULL, 0 }	/* Required table terminator */
+};
+static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
+
+/* files */
+
+static struct ceph_vxattr ceph_file_vxattrs[] = {
+	{
+		.name = "ceph.file.layout",
+		.name_size = sizeof("ceph.file.layout"),
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = false,
+		.hidden = true,
+		.exists_cb = ceph_vxattrcb_layout_exists,
+	},
+	XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+	XATTR_LAYOUT_FIELD(file, layout, object_size),
+	XATTR_LAYOUT_FIELD(file, layout, pool),
+	{ .name = NULL, 0 }	/* Required table terminator */
+};
+static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
+
+static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode))
+		return ceph_dir_vxattrs;
+	else if (S_ISREG(inode->i_mode))
+		return ceph_file_vxattrs;
+	return NULL;
+}
+
+static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	if (vxattrs == ceph_dir_vxattrs)
+		return ceph_dir_vxattrs_name_size;
+	if (vxattrs == ceph_file_vxattrs)
+		return ceph_file_vxattrs_name_size;
+	BUG_ON(vxattrs);
+	return 0;
+}
+
+/*
+ * Compute the aggregate size (including terminating '\0') of all
+ * virtual extended attribute names in the given vxattr table.
+ */
+static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	struct ceph_vxattr *vxattr;
+	size_t size = 0;
+
+	for (vxattr = vxattrs; vxattr->name; vxattr++)
+		if (!vxattr->hidden)
+			size += vxattr->name_size;
+
+	return size;
+}
+
+/* Routines called at initialization and exit time */
+
+void __init ceph_xattr_init(void)
+{
+	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
+	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
+}
+
+void ceph_xattr_exit(void)
+{
+	ceph_dir_vxattrs_name_size = 0;
+	ceph_file_vxattrs_name_size = 0;
+}
+
+static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
+						const char *name)
+{
+	struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
+
+	if (vxattr) {
+		while (vxattr->name) {
+			if (!strcmp(vxattr->name, name))
+				return vxattr;
+			vxattr++;
+		}
+	}
+
+	return NULL;
+}
+
+static int __set_xattr(struct ceph_inode_info *ci,
+			   const char *name, int name_len,
+			   const char *val, int val_len,
+			   int flags, int update_xattr,
+			   struct ceph_inode_xattr **newxattr)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int c;
+	int new = 0;
+
+	p = &ci->i_xattrs.index.rb_node;
+	while (*p) {
+		parent = *p;
+		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+		c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else {
+			if (name_len == xattr->name_len)
+				break;
+			else if (name_len < xattr->name_len)
+				p = &(*p)->rb_left;
+			else
+				p = &(*p)->rb_right;
+		}
+		xattr = NULL;
+	}
+
+	if (update_xattr) {
+		int err = 0;
+		if (xattr && (flags & XATTR_CREATE))
+			err = -EEXIST;
+		else if (!xattr && (flags & XATTR_REPLACE))
+			err = -ENODATA;
+		if (err) {
+			kfree(name);
+			kfree(val);
+			return err;
+		}
+		if (update_xattr < 0) {
+			if (xattr)
+				__remove_xattr(ci, xattr);
+			kfree(name);
+			return 0;
+		}
+	}
+
+	if (!xattr) {
+		new = 1;
+		xattr = *newxattr;
+		xattr->name = name;
+		xattr->name_len = name_len;
+		xattr->should_free_name = update_xattr;
+
+		ci->i_xattrs.count++;
+		dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+	} else {
+		kfree(*newxattr);
+		*newxattr = NULL;
+		if (xattr->should_free_val)
+			kfree((void *)xattr->val);
+
+		if (update_xattr) {
+			kfree((void *)name);
+			name = xattr->name;
+		}
+		ci->i_xattrs.names_size -= xattr->name_len;
+		ci->i_xattrs.vals_size -= xattr->val_len;
+	}
+	ci->i_xattrs.names_size += name_len;
+	ci->i_xattrs.vals_size += val_len;
+	if (val)
+		xattr->val = val;
+	else
+		xattr->val = "";
+
+	xattr->val_len = val_len;
+	xattr->dirty = update_xattr;
+	xattr->should_free_val = (val && update_xattr);
+
+	if (new) {
+		rb_link_node(&xattr->node, parent, p);
+		rb_insert_color(&xattr->node, &ci->i_xattrs.index);
+		dout("__set_xattr_val p=%p\n", p);
+	}
+
+	dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
+	     ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+
+	return 0;
+}
+
+static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
+			   const char *name)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int name_len = strlen(name);
+	int c;
+
+	p = &ci->i_xattrs.index.rb_node;
+	while (*p) {
+		parent = *p;
+		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+		c = strncmp(name, xattr->name, xattr->name_len);
+		if (c == 0 && name_len > xattr->name_len)
+			c = 1;
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else {
+			dout("__get_xattr %s: found %.*s\n", name,
+			     xattr->val_len, xattr->val);
+			return xattr;
+		}
+	}
+
+	dout("__get_xattr %s: not found\n", name);
+
+	return NULL;
+}
+
+static void __free_xattr(struct ceph_inode_xattr *xattr)
+{
+	BUG_ON(!xattr);
+
+	if (xattr->should_free_name)
+		kfree((void *)xattr->name);
+	if (xattr->should_free_val)
+		kfree((void *)xattr->val);
+
+	kfree(xattr);
+}
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+			  struct ceph_inode_xattr *xattr)
+{
+	if (!xattr)
+		return -ENODATA;
+
+	rb_erase(&xattr->node, &ci->i_xattrs.index);
+
+	if (xattr->should_free_name)
+		kfree((void *)xattr->name);
+	if (xattr->should_free_val)
+		kfree((void *)xattr->val);
+
+	ci->i_xattrs.names_size -= xattr->name_len;
+	ci->i_xattrs.vals_size -= xattr->val_len;
+	ci->i_xattrs.count--;
+	kfree(xattr);
+
+	return 0;
+}
+
+static int __remove_xattr_by_name(struct ceph_inode_info *ci,
+			   const char *name)
+{
+	struct rb_node **p;
+	struct ceph_inode_xattr *xattr;
+	int err;
+
+	p = &ci->i_xattrs.index.rb_node;
+	xattr = __get_xattr(ci, name);
+	err = __remove_xattr(ci, xattr);
+	return err;
+}
+
+static char *__copy_xattr_names(struct ceph_inode_info *ci,
+				char *dest)
+{
+	struct rb_node *p;
+	struct ceph_inode_xattr *xattr = NULL;
+
+	p = rb_first(&ci->i_xattrs.index);
+	dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+
+	while (p) {
+		xattr = rb_entry(p, struct ceph_inode_xattr, node);
+		memcpy(dest, xattr->name, xattr->name_len);
+		dest[xattr->name_len] = '\0';
+
+		dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+		     xattr->name_len, ci->i_xattrs.names_size);
+
+		dest += xattr->name_len + 1;
+		p = rb_next(p);
+	}
+
+	return dest;
+}
+
+void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
+{
+	struct rb_node *p, *tmp;
+	struct ceph_inode_xattr *xattr = NULL;
+
+	p = rb_first(&ci->i_xattrs.index);
+
+	dout("__ceph_destroy_xattrs p=%p\n", p);
+
+	while (p) {
+		xattr = rb_entry(p, struct ceph_inode_xattr, node);
+		tmp = p;
+		p = rb_next(tmp);
+		dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
+		     xattr->name_len, xattr->name);
+		rb_erase(tmp, &ci->i_xattrs.index);
+
+		__free_xattr(xattr);
+	}
+
+	ci->i_xattrs.names_size = 0;
+	ci->i_xattrs.vals_size = 0;
+	ci->i_xattrs.index_version = 0;
+	ci->i_xattrs.count = 0;
+	ci->i_xattrs.index = RB_ROOT;
+}
+
+static int __build_xattrs(struct inode *inode)
+	__releases(ci->i_ceph_lock)
+	__acquires(ci->i_ceph_lock)
+{
+	u32 namelen;
+	u32 numattr = 0;
+	void *p, *end;
+	u32 len;
+	const char *name, *val;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int xattr_version;
+	struct ceph_inode_xattr **xattrs = NULL;
+	int err = 0;
+	int i;
+
+	dout("__build_xattrs() len=%d\n",
+	     ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+
+	if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
+		return 0; /* already built */
+
+	__ceph_destroy_xattrs(ci);
+
+start:
+	/* updated internal xattr rb tree */
+	if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
+		p = ci->i_xattrs.blob->vec.iov_base;
+		end = p + ci->i_xattrs.blob->vec.iov_len;
+		ceph_decode_32_safe(&p, end, numattr, bad);
+		xattr_version = ci->i_xattrs.version;
+		spin_unlock(&ci->i_ceph_lock);
+
+		xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
+				 GFP_NOFS);
+		err = -ENOMEM;
+		if (!xattrs)
+			goto bad_lock;
+
+		for (i = 0; i < numattr; i++) {
+			xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
+					    GFP_NOFS);
+			if (!xattrs[i])
+				goto bad_lock;
+		}
+
+		spin_lock(&ci->i_ceph_lock);
+		if (ci->i_xattrs.version != xattr_version) {
+			/* lost a race, retry */
+			for (i = 0; i < numattr; i++)
+				kfree(xattrs[i]);
+			kfree(xattrs);
+			xattrs = NULL;
+			goto start;
+		}
+		err = -EIO;
+		while (numattr--) {
+			ceph_decode_32_safe(&p, end, len, bad);
+			namelen = len;
+			name = p;
+			p += len;
+			ceph_decode_32_safe(&p, end, len, bad);
+			val = p;
+			p += len;
+
+			err = __set_xattr(ci, name, namelen, val, len,
+					  0, 0, &xattrs[numattr]);
+
+			if (err < 0)
+				goto bad;
+		}
+		kfree(xattrs);
+	}
+	ci->i_xattrs.index_version = ci->i_xattrs.version;
+	ci->i_xattrs.dirty = false;
+
+	return err;
+bad_lock:
+	spin_lock(&ci->i_ceph_lock);
+bad:
+	if (xattrs) {
+		for (i = 0; i < numattr; i++)
+			kfree(xattrs[i]);
+		kfree(xattrs);
+	}
+	ci->i_xattrs.names_size = 0;
+	return err;
+}
+
+static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
+				    int val_size)
+{
+	/*
+	 * 4 bytes for the length, and additional 4 bytes per each xattr name,
+	 * 4 bytes per each value
+	 */
+	int size = 4 + ci->i_xattrs.count*(4 + 4) +
+			     ci->i_xattrs.names_size +
+			     ci->i_xattrs.vals_size;
+	dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
+	     ci->i_xattrs.count, ci->i_xattrs.names_size,
+	     ci->i_xattrs.vals_size);
+
+	if (name_size)
+		size += 4 + 4 + name_size + val_size;
+
+	return size;
+}
+
+/*
+ * If there are dirty xattrs, reencode xattrs into the prealloc_blob
+ * and swap into place.
+ */
+void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+{
+	struct rb_node *p;
+	struct ceph_inode_xattr *xattr = NULL;
+	void *dest;
+
+	dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+	if (ci->i_xattrs.dirty) {
+		int need = __get_required_blob_size(ci, 0, 0);
+
+		BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
+
+		p = rb_first(&ci->i_xattrs.index);
+		dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+		ceph_encode_32(&dest, ci->i_xattrs.count);
+		while (p) {
+			xattr = rb_entry(p, struct ceph_inode_xattr, node);
+
+			ceph_encode_32(&dest, xattr->name_len);
+			memcpy(dest, xattr->name, xattr->name_len);
+			dest += xattr->name_len;
+			ceph_encode_32(&dest, xattr->val_len);
+			memcpy(dest, xattr->val, xattr->val_len);
+			dest += xattr->val_len;
+
+			p = rb_next(p);
+		}
+
+		/* adjust buffer len; it may be larger than we need */
+		ci->i_xattrs.prealloc_blob->vec.iov_len =
+			dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+		if (ci->i_xattrs.blob)
+			ceph_buffer_put(ci->i_xattrs.blob);
+		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
+		ci->i_xattrs.prealloc_blob = NULL;
+		ci->i_xattrs.dirty = false;
+		ci->i_xattrs.version++;
+	}
+}
+
+ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
+		      size_t size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int err;
+	struct ceph_inode_xattr *xattr;
+	struct ceph_vxattr *vxattr = NULL;
+
+	if (!ceph_is_valid_xattr(name))
+		return -ENODATA;
+
+	/* let's see if a virtual xattr was requested */
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+		err = vxattr->getxattr_cb(ci, value, size);
+		return err;
+	}
+
+	spin_lock(&ci->i_ceph_lock);
+	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+	     ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+	if (ci->i_xattrs.version == 0 ||
+	    !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
+		spin_unlock(&ci->i_ceph_lock);
+		/* get xattrs from mds (if we don't already have them) */
+		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
+		if (err)
+			return err;
+		spin_lock(&ci->i_ceph_lock);
+	}
+
+	err = __build_xattrs(inode);
+	if (err < 0)
+		goto out;
+
+	err = -ENODATA;  /* == ENOATTR */
+	xattr = __get_xattr(ci, name);
+	if (!xattr)
+		goto out;
+
+	err = -ERANGE;
+	if (size && size < xattr->val_len)
+		goto out;
+
+	err = xattr->val_len;
+	if (size == 0)
+		goto out;
+
+	memcpy(value, xattr->val, xattr->val_len);
+
+out:
+	spin_unlock(&ci->i_ceph_lock);
+	return err;
+}
+
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+		      size_t size)
+{
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_getxattr(dentry, name, value, size);
+
+	return __ceph_getxattr(d_inode(dentry), name, value, size);
+}
+
+ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
+	u32 vir_namelen = 0;
+	u32 namelen;
+	int err;
+	u32 len;
+	int i;
+
+	spin_lock(&ci->i_ceph_lock);
+	dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
+	     ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+	if (ci->i_xattrs.version == 0 ||
+	    !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
+		spin_unlock(&ci->i_ceph_lock);
+		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
+		if (err)
+			return err;
+		spin_lock(&ci->i_ceph_lock);
+	}
+
+	err = __build_xattrs(inode);
+	if (err < 0)
+		goto out;
+	/*
+	 * Start with virtual dir xattr names (if any) (including
+	 * terminating '\0' characters for each).
+	 */
+	vir_namelen = ceph_vxattrs_name_size(vxattrs);
+
+	/* adding 1 byte per each variable due to the null termination */
+	namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
+	err = -ERANGE;
+	if (size && vir_namelen + namelen > size)
+		goto out;
+
+	err = namelen + vir_namelen;
+	if (size == 0)
+		goto out;
+
+	names = __copy_xattr_names(ci, names);
+
+	/* virtual xattr names, too */
+	err = namelen;
+	if (vxattrs) {
+		for (i = 0; vxattrs[i].name; i++) {
+			if (!vxattrs[i].hidden &&
+			    !(vxattrs[i].exists_cb &&
+			      !vxattrs[i].exists_cb(ci))) {
+				len = sprintf(names, "%s", vxattrs[i].name);
+				names += len + 1;
+				err += len + 1;
+			}
+		}
+	}
+
+out:
+	spin_unlock(&ci->i_ceph_lock);
+	return err;
+}
+
+static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
+			      const char *value, size_t size, int flags)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+	struct inode *inode = d_inode(dentry);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_request *req;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_pagelist *pagelist = NULL;
+	int err;
+
+	if (size > 0) {
+		/* copy value into pagelist */
+		pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+		if (!pagelist)
+			return -ENOMEM;
+
+		ceph_pagelist_init(pagelist);
+		err = ceph_pagelist_append(pagelist, value, size);
+		if (err)
+			goto out;
+	} else if (!value) {
+		flags |= CEPH_XATTR_REMOVE;
+	}
+
+	dout("setxattr value=%.*s\n", (int)size, value);
+
+	/* do request */
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+
+	req->r_args.setxattr.flags = cpu_to_le32(flags);
+	req->r_path2 = kstrdup(name, GFP_NOFS);
+	if (!req->r_path2) {
+		ceph_mdsc_put_request(req);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	req->r_pagelist = pagelist;
+	pagelist = NULL;
+
+	req->r_inode = inode;
+	ihold(inode);
+	req->r_num_caps = 1;
+	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+
+	dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	ceph_mdsc_put_request(req);
+	dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+
+out:
+	if (pagelist)
+		ceph_pagelist_release(pagelist);
+	return err;
+}
+
+int __ceph_setxattr(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ceph_vxattr *vxattr;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int issued;
+	int err;
+	int dirty = 0;
+	int name_len = strlen(name);
+	int val_len = size;
+	char *newname = NULL;
+	char *newval = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int required_blob_size;
+
+	if (!ceph_is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;
+
+	/* pass any unhandled ceph.* xattrs through to the MDS */
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+		goto do_sync_unlocked;
+
+	/* preallocate memory for xattr name, value, index node */
+	err = -ENOMEM;
+	newname = kmemdup(name, name_len + 1, GFP_NOFS);
+	if (!newname)
+		goto out;
+
+	if (val_len) {
+		newval = kmemdup(value, val_len, GFP_NOFS);
+		if (!newval)
+			goto out;
+	}
+
+	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
+	if (!xattr)
+		goto out;
+
+	spin_lock(&ci->i_ceph_lock);
+retry:
+	issued = __ceph_caps_issued(ci, NULL);
+	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
+	if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+		goto do_sync;
+	__build_xattrs(inode);
+
+	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+
+	if (!ci->i_xattrs.prealloc_blob ||
+	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+		struct ceph_buffer *blob;
+
+		spin_unlock(&ci->i_ceph_lock);
+		dout(" preaallocating new blob size=%d\n", required_blob_size);
+		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+		if (!blob)
+			goto out;
+		spin_lock(&ci->i_ceph_lock);
+		if (ci->i_xattrs.prealloc_blob)
+			ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+		ci->i_xattrs.prealloc_blob = blob;
+		goto retry;
+	}
+
+	err = __set_xattr(ci, newname, name_len, newval, val_len,
+			  flags, value ? 1 : -1, &xattr);
+
+	if (!err) {
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+		ci->i_xattrs.dirty = true;
+		inode->i_ctime = CURRENT_TIME;
+	}
+
+	spin_unlock(&ci->i_ceph_lock);
+	if (dirty)
+		__mark_inode_dirty(inode, dirty);
+	return err;
+
+do_sync:
+	spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
+	err = ceph_sync_setxattr(dentry, name, value, size, flags);
+out:
+	kfree(newname);
+	kfree(newval);
+	kfree(xattr);
+	return err;
+}
+
+int ceph_setxattr(struct dentry *dentry, const char *name,
+		  const void *value, size_t size, int flags)
+{
+	if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_setxattr(dentry, name, value, size, flags);
+
+	if (size == 0)
+		value = "";  /* empty EA, do not remove */
+
+	return __ceph_setxattr(dentry, name, value, size, flags);
+}
+
+static int ceph_send_removexattr(struct dentry *dentry, const char *name)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct inode *inode = d_inode(dentry);
+	struct ceph_mds_request *req;
+	int err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_path2 = kstrdup(name, GFP_NOFS);
+	if (!req->r_path2)
+		return -ENOMEM;
+
+	req->r_inode = inode;
+	ihold(inode);
+	req->r_num_caps = 1;
+	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+int __ceph_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ceph_vxattr *vxattr;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int issued;
+	int err;
+	int required_blob_size;
+	int dirty;
+
+	if (!ceph_is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;
+
+	/* pass any unhandled ceph.* xattrs through to the MDS */
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+		goto do_sync_unlocked;
+
+	err = -ENOMEM;
+	spin_lock(&ci->i_ceph_lock);
+retry:
+	issued = __ceph_caps_issued(ci, NULL);
+	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+	if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+		goto do_sync;
+	__build_xattrs(inode);
+
+	required_blob_size = __get_required_blob_size(ci, 0, 0);
+
+	if (!ci->i_xattrs.prealloc_blob ||
+	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+		struct ceph_buffer *blob;
+
+		spin_unlock(&ci->i_ceph_lock);
+		dout(" preaallocating new blob size=%d\n", required_blob_size);
+		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+		if (!blob)
+			goto out;
+		spin_lock(&ci->i_ceph_lock);
+		if (ci->i_xattrs.prealloc_blob)
+			ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+		ci->i_xattrs.prealloc_blob = blob;
+		goto retry;
+	}
+
+	err = __remove_xattr_by_name(ceph_inode(inode), name);
+
+	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+	ci->i_xattrs.dirty = true;
+	inode->i_ctime = CURRENT_TIME;
+	spin_unlock(&ci->i_ceph_lock);
+	if (dirty)
+		__mark_inode_dirty(inode, dirty);
+	return err;
+do_sync:
+	spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
+	err = ceph_send_removexattr(dentry, name);
+out:
+	return err;
+}
+
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+	if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_removexattr(dentry, name);
+
+	return __ceph_removexattr(dentry, name);
+}
diff --git a/kernel/fs/char_dev.c b/kernel/fs/char_dev.c
new file mode 100644
index 000000000..ea06a3d03
--- /dev/null
+++ b/kernel/fs/char_dev.c
@@ -0,0 +1,569 @@
+/*
+ *  linux/fs/char_dev.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <linux/major.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <linux/kobject.h>
+#include <linux/kobj_map.h>
+#include <linux/cdev.h>
+#include <linux/mutex.h>
+#include <linux/backing-dev.h>
+#include <linux/tty.h>
+
+#include "internal.h"
+
+static struct kobj_map *cdev_map;
+
+static DEFINE_MUTEX(chrdevs_lock);
+
+static struct char_device_struct {
+	struct char_device_struct *next;
+	unsigned int major;
+	unsigned int baseminor;
+	int minorct;
+	char name[64];
+	struct cdev *cdev;		/* will die */
+} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
+
+/* index in the above */
+static inline int major_to_index(unsigned major)
+{
+	return major % CHRDEV_MAJOR_HASH_SIZE;
+}
+
+#ifdef CONFIG_PROC_FS
+
+void chrdev_show(struct seq_file *f, off_t offset)
+{
+	struct char_device_struct *cd;
+
+	if (offset < CHRDEV_MAJOR_HASH_SIZE) {
+		mutex_lock(&chrdevs_lock);
+		for (cd = chrdevs[offset]; cd; cd = cd->next)
+			seq_printf(f, "%3d %s\n", cd->major, cd->name);
+		mutex_unlock(&chrdevs_lock);
+	}
+}
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Register a single major with a specified minor range.
+ *
+ * If major == 0 this functions will dynamically allocate a major and return
+ * its number.
+ *
+ * If major > 0 this function will attempt to reserve the passed range of
+ * minors and will return zero on success.
+ *
+ * Returns a -ve errno on failure.
+ */
+static struct char_device_struct *
+__register_chrdev_region(unsigned int major, unsigned int baseminor,
+			   int minorct, const char *name)
+{
+	struct char_device_struct *cd, **cp;
+	int ret = 0;
+	int i;
+
+	cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL);
+	if (cd == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&chrdevs_lock);
+
+	/* temporary */
+	if (major == 0) {
+		for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) {
+			if (chrdevs[i] == NULL)
+				break;
+		}
+
+		if (i == 0) {
+			ret = -EBUSY;
+			goto out;
+		}
+		major = i;
+	}
+
+	cd->major = major;
+	cd->baseminor = baseminor;
+	cd->minorct = minorct;
+	strlcpy(cd->name, name, sizeof(cd->name));
+
+	i = major_to_index(major);
+
+	for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next)
+		if ((*cp)->major > major ||
+		    ((*cp)->major == major &&
+		     (((*cp)->baseminor >= baseminor) ||
+		      ((*cp)->baseminor + (*cp)->minorct > baseminor))))
+			break;
+
+	/* Check for overlapping minor ranges.  */
+	if (*cp && (*cp)->major == major) {
+		int old_min = (*cp)->baseminor;
+		int old_max = (*cp)->baseminor + (*cp)->minorct - 1;
+		int new_min = baseminor;
+		int new_max = baseminor + minorct - 1;
+
+		/* New driver overlaps from the left.  */
+		if (new_max >= old_min && new_max <= old_max) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		/* New driver overlaps from the right.  */
+		if (new_min <= old_max && new_min >= old_min) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	cd->next = *cp;
+	*cp = cd;
+	mutex_unlock(&chrdevs_lock);
+	return cd;
+out:
+	mutex_unlock(&chrdevs_lock);
+	kfree(cd);
+	return ERR_PTR(ret);
+}
+
+static struct char_device_struct *
+__unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct)
+{
+	struct char_device_struct *cd = NULL, **cp;
+	int i = major_to_index(major);
+
+	mutex_lock(&chrdevs_lock);
+	for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next)
+		if ((*cp)->major == major &&
+		    (*cp)->baseminor == baseminor &&
+		    (*cp)->minorct == minorct)
+			break;
+	if (*cp) {
+		cd = *cp;
+		*cp = cd->next;
+	}
+	mutex_unlock(&chrdevs_lock);
+	return cd;
+}
+
+/**
+ * register_chrdev_region() - register a range of device numbers
+ * @from: the first in the desired range of device numbers; must include
+ *        the major number.
+ * @count: the number of consecutive device numbers required
+ * @name: the name of the device or driver.
+ *
+ * Return value is zero on success, a negative error code on failure.
+ */
+int register_chrdev_region(dev_t from, unsigned count, const char *name)
+{
+	struct char_device_struct *cd;
+	dev_t to = from + count;
+	dev_t n, next;
+
+	for (n = from; n < to; n = next) {
+		next = MKDEV(MAJOR(n)+1, 0);
+		if (next > to)
+			next = to;
+		cd = __register_chrdev_region(MAJOR(n), MINOR(n),
+			       next - n, name);
+		if (IS_ERR(cd))
+			goto fail;
+	}
+	return 0;
+fail:
+	to = n;
+	for (n = from; n < to; n = next) {
+		next = MKDEV(MAJOR(n)+1, 0);
+		kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n));
+	}
+	return PTR_ERR(cd);
+}
+
+/**
+ * alloc_chrdev_region() - register a range of char device numbers
+ * @dev: output parameter for first assigned number
+ * @baseminor: first of the requested range of minor numbers
+ * @count: the number of minor numbers required
+ * @name: the name of the associated device or driver
+ *
+ * Allocates a range of char device numbers.  The major number will be
+ * chosen dynamically, and returned (along with the first minor number)
+ * in @dev.  Returns zero or a negative error code.
+ */
+int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
+			const char *name)
+{
+	struct char_device_struct *cd;
+	cd = __register_chrdev_region(0, baseminor, count, name);
+	if (IS_ERR(cd))
+		return PTR_ERR(cd);
+	*dev = MKDEV(cd->major, cd->baseminor);
+	return 0;
+}
+
+/**
+ * __register_chrdev() - create and register a cdev occupying a range of minors
+ * @major: major device number or 0 for dynamic allocation
+ * @baseminor: first of the requested range of minor numbers
+ * @count: the number of minor numbers required
+ * @name: name of this range of devices
+ * @fops: file operations associated with this devices
+ *
+ * If @major == 0 this functions will dynamically allocate a major and return
+ * its number.
+ *
+ * If @major > 0 this function will attempt to reserve a device with the given
+ * major number and will return zero on success.
+ *
+ * Returns a -ve errno on failure.
+ *
+ * The name of this device has nothing to do with the name of the device in
+ * /dev. It only helps to keep track of the different owners of devices. If
+ * your module name has only one type of devices it's ok to use e.g. the name
+ * of the module here.
+ */
+int __register_chrdev(unsigned int major, unsigned int baseminor,
+		      unsigned int count, const char *name,
+		      const struct file_operations *fops)
+{
+	struct char_device_struct *cd;
+	struct cdev *cdev;
+	int err = -ENOMEM;
+
+	cd = __register_chrdev_region(major, baseminor, count, name);
+	if (IS_ERR(cd))
+		return PTR_ERR(cd);
+
+	cdev = cdev_alloc();
+	if (!cdev)
+		goto out2;
+
+	cdev->owner = fops->owner;
+	cdev->ops = fops;
+	kobject_set_name(&cdev->kobj, "%s", name);
+
+	err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
+	if (err)
+		goto out;
+
+	cd->cdev = cdev;
+
+	return major ? 0 : cd->major;
+out:
+	kobject_put(&cdev->kobj);
+out2:
+	kfree(__unregister_chrdev_region(cd->major, baseminor, count));
+	return err;
+}
+
+/**
+ * unregister_chrdev_region() - return a range of device numbers
+ * @from: the first in the range of numbers to unregister
+ * @count: the number of device numbers to unregister
+ *
+ * This function will unregister a range of @count device numbers,
+ * starting with @from.  The caller should normally be the one who
+ * allocated those numbers in the first place...
+ */
+void unregister_chrdev_region(dev_t from, unsigned count)
+{
+	dev_t to = from + count;
+	dev_t n, next;
+
+	for (n = from; n < to; n = next) {
+		next = MKDEV(MAJOR(n)+1, 0);
+		if (next > to)
+			next = to;
+		kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n));
+	}
+}
+
+/**
+ * __unregister_chrdev - unregister and destroy a cdev
+ * @major: major device number
+ * @baseminor: first of the range of minor numbers
+ * @count: the number of minor numbers this cdev is occupying
+ * @name: name of this range of devices
+ *
+ * Unregister and destroy the cdev occupying the region described by
+ * @major, @baseminor and @count.  This function undoes what
+ * __register_chrdev() did.
+ */
+void __unregister_chrdev(unsigned int major, unsigned int baseminor,
+			 unsigned int count, const char *name)
+{
+	struct char_device_struct *cd;
+
+	cd = __unregister_chrdev_region(major, baseminor, count);
+	if (cd && cd->cdev)
+		cdev_del(cd->cdev);
+	kfree(cd);
+}
+
+static DEFINE_SPINLOCK(cdev_lock);
+
+static struct kobject *cdev_get(struct cdev *p)
+{
+	struct module *owner = p->owner;
+	struct kobject *kobj;
+
+	if (owner && !try_module_get(owner))
+		return NULL;
+	kobj = kobject_get(&p->kobj);
+	if (!kobj)
+		module_put(owner);
+	return kobj;
+}
+
+void cdev_put(struct cdev *p)
+{
+	if (p) {
+		struct module *owner = p->owner;
+		kobject_put(&p->kobj);
+		module_put(owner);
+	}
+}
+
+/*
+ * Called every time a character special file is opened
+ */
+static int chrdev_open(struct inode *inode, struct file *filp)
+{
+	const struct file_operations *fops;
+	struct cdev *p;
+	struct cdev *new = NULL;
+	int ret = 0;
+
+	spin_lock(&cdev_lock);
+	p = inode->i_cdev;
+	if (!p) {
+		struct kobject *kobj;
+		int idx;
+		spin_unlock(&cdev_lock);
+		kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
+		if (!kobj)
+			return -ENXIO;
+		new = container_of(kobj, struct cdev, kobj);
+		spin_lock(&cdev_lock);
+		/* Check i_cdev again in case somebody beat us to it while
+		   we dropped the lock. */
+		p = inode->i_cdev;
+		if (!p) {
+			inode->i_cdev = p = new;
+			list_add(&inode->i_devices, &p->list);
+			new = NULL;
+		} else if (!cdev_get(p))
+			ret = -ENXIO;
+	} else if (!cdev_get(p))
+		ret = -ENXIO;
+	spin_unlock(&cdev_lock);
+	cdev_put(new);
+	if (ret)
+		return ret;
+
+	ret = -ENXIO;
+	fops = fops_get(p->ops);
+	if (!fops)
+		goto out_cdev_put;
+
+	replace_fops(filp, fops);
+	if (filp->f_op->open) {
+		ret = filp->f_op->open(inode, filp);
+		if (ret)
+			goto out_cdev_put;
+	}
+
+	return 0;
+
+ out_cdev_put:
+	cdev_put(p);
+	return ret;
+}
+
+void cd_forget(struct inode *inode)
+{
+	spin_lock(&cdev_lock);
+	list_del_init(&inode->i_devices);
+	inode->i_cdev = NULL;
+	spin_unlock(&cdev_lock);
+}
+
+static void cdev_purge(struct cdev *cdev)
+{
+	spin_lock(&cdev_lock);
+	while (!list_empty(&cdev->list)) {
+		struct inode *inode;
+		inode = container_of(cdev->list.next, struct inode, i_devices);
+		list_del_init(&inode->i_devices);
+		inode->i_cdev = NULL;
+	}
+	spin_unlock(&cdev_lock);
+}
+
+/*
+ * Dummy default file-operations: the only thing this does
+ * is contain the open that then fills in the correct operations
+ * depending on the special file...
+ */
+const struct file_operations def_chr_fops = {
+	.open = chrdev_open,
+	.llseek = noop_llseek,
+};
+
+static struct kobject *exact_match(dev_t dev, int *part, void *data)
+{
+	struct cdev *p = data;
+	return &p->kobj;
+}
+
+static int exact_lock(dev_t dev, void *data)
+{
+	struct cdev *p = data;
+	return cdev_get(p) ? 0 : -1;
+}
+
+/**
+ * cdev_add() - add a char device to the system
+ * @p: the cdev structure for the device
+ * @dev: the first device number for which this device is responsible
+ * @count: the number of consecutive minor numbers corresponding to this
+ *         device
+ *
+ * cdev_add() adds the device represented by @p to the system, making it
+ * live immediately.  A negative error code is returned on failure.
+ */
+int cdev_add(struct cdev *p, dev_t dev, unsigned count)
+{
+	int error;
+
+	p->dev = dev;
+	p->count = count;
+
+	error = kobj_map(cdev_map, dev, count, NULL,
+			 exact_match, exact_lock, p);
+	if (error)
+		return error;
+
+	kobject_get(p->kobj.parent);
+
+	return 0;
+}
+
+static void cdev_unmap(dev_t dev, unsigned count)
+{
+	kobj_unmap(cdev_map, dev, count);
+}
+
+/**
+ * cdev_del() - remove a cdev from the system
+ * @p: the cdev structure to be removed
+ *
+ * cdev_del() removes @p from the system, possibly freeing the structure
+ * itself.
+ */
+void cdev_del(struct cdev *p)
+{
+	cdev_unmap(p->dev, p->count);
+	kobject_put(&p->kobj);
+}
+
+
+static void cdev_default_release(struct kobject *kobj)
+{
+	struct cdev *p = container_of(kobj, struct cdev, kobj);
+	struct kobject *parent = kobj->parent;
+
+	cdev_purge(p);
+	kobject_put(parent);
+}
+
+static void cdev_dynamic_release(struct kobject *kobj)
+{
+	struct cdev *p = container_of(kobj, struct cdev, kobj);
+	struct kobject *parent = kobj->parent;
+
+	cdev_purge(p);
+	kfree(p);
+	kobject_put(parent);
+}
+
+static struct kobj_type ktype_cdev_default = {
+	.release	= cdev_default_release,
+};
+
+static struct kobj_type ktype_cdev_dynamic = {
+	.release	= cdev_dynamic_release,
+};
+
+/**
+ * cdev_alloc() - allocate a cdev structure
+ *
+ * Allocates and returns a cdev structure, or NULL on failure.
+ */
+struct cdev *cdev_alloc(void)
+{
+	struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
+	if (p) {
+		INIT_LIST_HEAD(&p->list);
+		kobject_init(&p->kobj, &ktype_cdev_dynamic);
+	}
+	return p;
+}
+
+/**
+ * cdev_init() - initialize a cdev structure
+ * @cdev: the structure to initialize
+ * @fops: the file_operations for this device
+ *
+ * Initializes @cdev, remembering @fops, making it ready to add to the
+ * system with cdev_add().
+ */
+void cdev_init(struct cdev *cdev, const struct file_operations *fops)
+{
+	memset(cdev, 0, sizeof *cdev);
+	INIT_LIST_HEAD(&cdev->list);
+	kobject_init(&cdev->kobj, &ktype_cdev_default);
+	cdev->ops = fops;
+}
+
+static struct kobject *base_probe(dev_t dev, int *part, void *data)
+{
+	if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0)
+		/* Make old-style 2.4 aliases work */
+		request_module("char-major-%d", MAJOR(dev));
+	return NULL;
+}
+
+void __init chrdev_init(void)
+{
+	cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
+}
+
+
+/* Let modules do char dev stuff */
+EXPORT_SYMBOL(register_chrdev_region);
+EXPORT_SYMBOL(unregister_chrdev_region);
+EXPORT_SYMBOL(alloc_chrdev_region);
+EXPORT_SYMBOL(cdev_init);
+EXPORT_SYMBOL(cdev_alloc);
+EXPORT_SYMBOL(cdev_del);
+EXPORT_SYMBOL(cdev_add);
+EXPORT_SYMBOL(__register_chrdev);
+EXPORT_SYMBOL(__unregister_chrdev);
diff --git a/kernel/fs/cifs/Kconfig b/kernel/fs/cifs/Kconfig
new file mode 100644
index 000000000..a2172f3f6
--- /dev/null
+++ b/kernel/fs/cifs/Kconfig
@@ -0,0 +1,202 @@
+config CIFS
+	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
+	depends on INET
+	select NLS
+	select CRYPTO
+	select CRYPTO_MD4
+	select CRYPTO_MD5
+	select CRYPTO_HMAC
+	select CRYPTO_ARC4
+	select CRYPTO_ECB
+	select CRYPTO_DES
+	select CRYPTO_SHA256
+	select CRYPTO_CMAC
+	help
+	  This is the client VFS module for the Common Internet File System
+	  (CIFS) protocol which is the successor to the Server Message Block
+	  (SMB) protocol, the native file sharing mechanism for most early
+	  PC operating systems.  The CIFS protocol is fully supported by
+	  file servers such as Windows 2000 (including Windows 2003, Windows 2008,
+	  NT 4 and Windows XP) as well by Samba (which provides excellent CIFS
+	  server support for Linux and many other operating systems). Limited
+	  support for OS/2 and Windows ME and similar servers is provided as
+	  well.
+
+	  The module also provides optional support for the followon
+	  protocols for CIFS including SMB3, which enables
+	  useful performance and security features (see the description
+	  of CONFIG_CIFS_SMB2).
+
+	  The cifs module provides an advanced network file system
+	  client for mounting to CIFS compliant servers.  It includes
+	  support for DFS (hierarchical name space), secure per-user
+	  session establishment via Kerberos or NTLM or NTLMv2,
+	  safe distributed caching (oplock), optional packet
+	  signing, Unicode and other internationalization improvements.
+	  If you need to mount to Samba or Windows from this machine, say Y.
+
+config CIFS_STATS
+        bool "CIFS statistics"
+        depends on CIFS
+        help
+          Enabling this option will cause statistics for each server share
+	  mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
+
+config CIFS_STATS2
+	bool "Extended statistics"
+	depends on CIFS_STATS
+	help
+	  Enabling this option will allow more detailed statistics on SMB
+	  request timing to be displayed in /proc/fs/cifs/DebugData and also
+	  allow optional logging of slow responses to dmesg (depending on the
+	  value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
+	  These additional statistics may have a minor effect on performance
+	  and memory utilization.
+
+	  Unless you are a developer or are doing network performance analysis
+	  or tuning, say N.
+
+config CIFS_WEAK_PW_HASH
+	bool "Support legacy servers which use weaker LANMAN security"
+	depends on CIFS
+	help
+	  Modern CIFS servers including Samba and most Windows versions
+	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
+	  security mechanisms. These hash the password more securely
+	  than the mechanisms used in the older LANMAN version of the
+	  SMB protocol but LANMAN based authentication is needed to
+	  establish sessions with some old SMB servers.
+
+	  Enabling this option allows the cifs module to mount to older
+	  LANMAN based servers such as OS/2 and Windows 95, but such
+	  mounts may be less secure than mounts using NTLM or more recent
+	  security mechanisms if you are on a public network.  Unless you
+	  have a need to access old SMB servers (and are on a private
+	  network) you probably want to say N.  Even if this support
+	  is enabled in the kernel build, LANMAN authentication will not be
+	  used automatically. At runtime LANMAN mounts are disabled but
+	  can be set to required (or optional) either in
+	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
+	  option on the mount command. This support is disabled by
+	  default in order to reduce the possibility of a downgrade
+	  attack.
+
+	  If unsure, say N.
+
+config CIFS_UPCALL
+	bool "Kerberos/SPNEGO advanced session setup"
+	depends on CIFS && KEYS
+	select DNS_RESOLVER
+	help
+	  Enables an upcall mechanism for CIFS which accesses userspace helper
+	  utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
+	  which are needed to mount to certain secure servers (for which more
+	  secure Kerberos authentication is required). If unsure, say N.
+
+config CIFS_XATTR
+        bool "CIFS extended attributes"
+        depends on CIFS
+        help
+          Extended attributes are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page, or visit
+          <http://acl.bestbits.at/> for details).  CIFS maps the name of
+          extended attributes beginning with the user namespace prefix
+          to SMB/CIFS EAs. EAs are stored on Windows servers without the
+          user namespace prefix, but their names are seen by Linux cifs clients
+          prefaced by the user namespace prefix. The system namespace
+          (used by some filesystems to store ACLs) is not supported at
+          this time.
+
+          If unsure, say N.
+
+config CIFS_POSIX
+        bool "CIFS POSIX Extensions"
+        depends on CIFS_XATTR
+        help
+          Enabling this option will cause the cifs client to attempt to
+	  negotiate a newer dialect with servers, such as Samba 3.0.5
+	  or later, that optionally can handle more POSIX like (rather
+	  than Windows like) file behavior.  It also enables
+	  support for POSIX ACLs (getfacl and setfacl) to servers
+	  (such as Samba 3.10 and later) which can negotiate
+	  CIFS POSIX ACL support.  If unsure, say N.
+
+config CIFS_ACL
+	  bool "Provide CIFS ACL support"
+	  depends on CIFS_XATTR && KEYS
+	  help
+	    Allows fetching CIFS/NTFS ACL from the server.  The DACL blob
+	    is handed over to the application/caller.  See the man
+	    page for getcifsacl for more information.
+
+config CIFS_DEBUG
+	bool "Enable CIFS debugging routines"
+	default y
+	depends on CIFS
+	help
+	   Enabling this option adds helpful debugging messages to
+	   the cifs code which increases the size of the cifs module.
+	   If unsure, say Y.
+config CIFS_DEBUG2
+	bool "Enable additional CIFS debugging routines"
+	depends on CIFS_DEBUG
+	help
+	   Enabling this option adds a few more debugging routines
+	   to the cifs code which slightly increases the size of
+	   the cifs module and can cause additional logging of debug
+	   messages in some error paths, slowing performance. This
+	   option can be turned off unless you are debugging
+	   cifs problems.  If unsure, say N.
+
+config CIFS_DFS_UPCALL
+	  bool "DFS feature support"
+	  depends on CIFS && KEYS
+	  select DNS_RESOLVER
+	  help
+	    Distributed File System (DFS) support is used to access shares
+	    transparently in an enterprise name space, even if the share
+	    moves to a different server.  This feature also enables
+	    an upcall mechanism for CIFS which contacts userspace helper
+	    utilities to provide server name resolution (host names to
+	    IP addresses) which is needed for implicit mounts of DFS junction
+	    points. If unsure, say N.
+
+config CIFS_NFSD_EXPORT
+	  bool "Allow nfsd to export CIFS file system"
+	  depends on CIFS && BROKEN
+	  help
+	   Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+
+config CIFS_SMB2
+	bool "SMB2 and SMB3 network file system support"
+	depends on CIFS && INET
+	select NLS
+	select KEYS
+	select FSCACHE
+	select DNS_RESOLVER
+
+	help
+	  This enables support for the Server Message Block version 2
+	  family of protocols, including SMB3.  SMB3 support is
+	  enabled on mount by specifying "vers=3.0" in the mount
+	  options. These protocols are the successors to the popular
+	  CIFS and SMB network file sharing protocols. SMB3 is the
+	  native file sharing mechanism for the more recent
+	  versions of Windows (Windows 8 and Windows 2012 and
+	  later) and Samba server and many others support SMB3 well.
+	  In general SMB3 enables better performance, security
+	  and features, than would be possible with CIFS (Note that
+	  when mounting to Samba, due to the CIFS POSIX extensions,
+	  CIFS mounts can provide slightly better POSIX compatibility
+	  than SMB3 mounts do though). Note that SMB2/SMB3 mount
+	  options are also slightly simpler (compared to CIFS) due
+	  to protocol improvements.
+
+config CIFS_FSCACHE
+	  bool "Provide CIFS client caching support"
+	  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+	  help
+	    Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+	    to be cached locally on disk through the general filesystem cache
+	    manager. If unsure, say N.
+
diff --git a/kernel/fs/cifs/Makefile b/kernel/fs/cifs/Makefile
new file mode 100644
index 000000000..1964d212a
--- /dev/null
+++ b/kernel/fs/cifs/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for Linux CIFS VFS client 
+#
+obj-$(CONFIG_CIFS) += cifs.o
+
+cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
+	  link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
+	  cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
+	  readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o
+
+cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
+
+cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
+
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
+
+cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
+
+cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o smb2maperror.o smb2transport.o \
+			    smb2misc.o smb2pdu.o smb2inode.o smb2file.o
diff --git a/kernel/fs/cifs/asn1.c b/kernel/fs/cifs/asn1.c
new file mode 100644
index 000000000..a3b56544c
--- /dev/null
+++ b/kernel/fs/cifs/asn1.c
@@ -0,0 +1,623 @@
+/*
+ * The ASB.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in
+ * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifsproto.h"
+
+/*****************************************************************************
+ *
+ * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* Class */
+#define ASN1_UNI	0	/* Universal */
+#define ASN1_APL	1	/* Application */
+#define ASN1_CTX	2	/* Context */
+#define ASN1_PRV	3	/* Private */
+
+/* Tag */
+#define ASN1_EOC	0	/* End Of Contents or N/A */
+#define ASN1_BOL	1	/* Boolean */
+#define ASN1_INT	2	/* Integer */
+#define ASN1_BTS	3	/* Bit String */
+#define ASN1_OTS	4	/* Octet String */
+#define ASN1_NUL	5	/* Null */
+#define ASN1_OJI	6	/* Object Identifier  */
+#define ASN1_OJD	7	/* Object Description */
+#define ASN1_EXT	8	/* External */
+#define ASN1_ENUM	10	/* Enumerated */
+#define ASN1_SEQ	16	/* Sequence */
+#define ASN1_SET	17	/* Set */
+#define ASN1_NUMSTR	18	/* Numerical String */
+#define ASN1_PRNSTR	19	/* Printable String */
+#define ASN1_TEXSTR	20	/* Teletext String */
+#define ASN1_VIDSTR	21	/* Video String */
+#define ASN1_IA5STR	22	/* IA5 String */
+#define ASN1_UNITIM	23	/* Universal Time */
+#define ASN1_GENTIM	24	/* General Time */
+#define ASN1_GRASTR	25	/* Graphical String */
+#define ASN1_VISSTR	26	/* Visible String */
+#define ASN1_GENSTR	27	/* General String */
+
+/* Primitive / Constructed methods*/
+#define ASN1_PRI	0	/* Primitive */
+#define ASN1_CON	1	/* Constructed */
+
+/*
+ * Error codes.
+ */
+#define ASN1_ERR_NOERROR		0
+#define ASN1_ERR_DEC_EMPTY		2
+#define ASN1_ERR_DEC_EOC_MISMATCH	3
+#define ASN1_ERR_DEC_LENGTH_MISMATCH	4
+#define ASN1_ERR_DEC_BADVALUE		5
+
+#define SPNEGO_OID_LEN 7
+#define NTLMSSP_OID_LEN  10
+#define KRB5_OID_LEN  7
+#define KRB5U2U_OID_LEN  8
+#define MSKRB5_OID_LEN  7
+static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
+static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
+static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
+static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 };
+static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
+
+/*
+ * ASN.1 context.
+ */
+struct asn1_ctx {
+	int error;		/* Error condition */
+	unsigned char *pointer;	/* Octet just to be decoded */
+	unsigned char *begin;	/* First octet */
+	unsigned char *end;	/* Octet after last octet */
+};
+
+/*
+ * Octet string (not null terminated)
+ */
+struct asn1_octstr {
+	unsigned char *data;
+	unsigned int len;
+};
+
+static void
+asn1_open(struct asn1_ctx *ctx, unsigned char *buf, unsigned int len)
+{
+	ctx->begin = buf;
+	ctx->end = buf + len;
+	ctx->pointer = buf;
+	ctx->error = ASN1_ERR_NOERROR;
+}
+
+static unsigned char
+asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
+{
+	if (ctx->pointer >= ctx->end) {
+		ctx->error = ASN1_ERR_DEC_EMPTY;
+		return 0;
+	}
+	*ch = *(ctx->pointer)++;
+	return 1;
+}
+
+#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */
+static unsigned char
+asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
+{
+	unsigned char ch;
+
+	if (ctx->pointer >= ctx->end) {
+		ctx->error = ASN1_ERR_DEC_EMPTY;
+		return 0;
+	}
+
+	ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
+	if ((ch) == ASN1_ENUM)  /* if ch value is ENUM, 0xa */
+		*val = *(++(ctx->pointer)); /* value has enum value */
+	else
+		return 0;
+
+	ctx->pointer++;
+	return 1;
+}
+#endif
+
+static unsigned char
+asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
+{
+	unsigned char ch;
+
+	*tag = 0;
+
+	do {
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+		*tag <<= 7;
+		*tag |= ch & 0x7F;
+	} while ((ch & 0x80) == 0x80);
+	return 1;
+}
+
+static unsigned char
+asn1_id_decode(struct asn1_ctx *ctx,
+	       unsigned int *cls, unsigned int *con, unsigned int *tag)
+{
+	unsigned char ch;
+
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+
+	*cls = (ch & 0xC0) >> 6;
+	*con = (ch & 0x20) >> 5;
+	*tag = (ch & 0x1F);
+
+	if (*tag == 0x1F) {
+		if (!asn1_tag_decode(ctx, tag))
+			return 0;
+	}
+	return 1;
+}
+
+static unsigned char
+asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len)
+{
+	unsigned char ch, cnt;
+
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+
+	if (ch == 0x80)
+		*def = 0;
+	else {
+		*def = 1;
+
+		if (ch < 0x80)
+			*len = ch;
+		else {
+			cnt = (unsigned char) (ch & 0x7F);
+			*len = 0;
+
+			while (cnt > 0) {
+				if (!asn1_octet_decode(ctx, &ch))
+					return 0;
+				*len <<= 8;
+				*len |= ch;
+				cnt--;
+			}
+		}
+	}
+
+	/* don't trust len bigger than ctx buffer */
+	if (*len > ctx->end - ctx->pointer)
+		return 0;
+
+	return 1;
+}
+
+static unsigned char
+asn1_header_decode(struct asn1_ctx *ctx,
+		   unsigned char **eoc,
+		   unsigned int *cls, unsigned int *con, unsigned int *tag)
+{
+	unsigned int def = 0;
+	unsigned int len = 0;
+
+	if (!asn1_id_decode(ctx, cls, con, tag))
+		return 0;
+
+	if (!asn1_length_decode(ctx, &def, &len))
+		return 0;
+
+	/* primitive shall be definite, indefinite shall be constructed */
+	if (*con == ASN1_PRI && !def)
+		return 0;
+
+	if (def)
+		*eoc = ctx->pointer + len;
+	else
+		*eoc = NULL;
+	return 1;
+}
+
+static unsigned char
+asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+	unsigned char ch;
+
+	if (eoc == NULL) {
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+
+		if (ch != 0x00) {
+			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+			return 0;
+		}
+
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+
+		if (ch != 0x00) {
+			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+			return 0;
+		}
+		return 1;
+	} else {
+		if (ctx->pointer != eoc) {
+			ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
+			return 0;
+		}
+		return 1;
+	}
+}
+
+/* static unsigned char asn1_null_decode(struct asn1_ctx *ctx,
+				      unsigned char *eoc)
+{
+	ctx->pointer = eoc;
+	return 1;
+}
+
+static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
+				      unsigned char *eoc, long *integer)
+{
+	unsigned char ch;
+	unsigned int len;
+
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+
+	*integer = (signed char) ch;
+	len = 1;
+
+	while (ctx->pointer < eoc) {
+		if (++len > sizeof(long)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+
+		*integer <<= 8;
+		*integer |= ch;
+	}
+	return 1;
+}
+
+static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
+				      unsigned char *eoc,
+				      unsigned int *integer)
+{
+	unsigned char ch;
+	unsigned int len;
+
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+
+	*integer = ch;
+	if (ch == 0)
+		len = 0;
+	else
+		len = 1;
+
+	while (ctx->pointer < eoc) {
+		if (++len > sizeof(unsigned int)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+
+		*integer <<= 8;
+		*integer |= ch;
+	}
+	return 1;
+}
+
+static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
+				       unsigned char *eoc,
+				       unsigned long *integer)
+{
+	unsigned char ch;
+	unsigned int len;
+
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+
+	*integer = ch;
+	if (ch == 0)
+		len = 0;
+	else
+		len = 1;
+
+	while (ctx->pointer < eoc) {
+		if (++len > sizeof(unsigned long)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+
+		*integer <<= 8;
+		*integer |= ch;
+	}
+	return 1;
+}
+
+static unsigned char
+asn1_octets_decode(struct asn1_ctx *ctx,
+		   unsigned char *eoc,
+		   unsigned char **octets, unsigned int *len)
+{
+	unsigned char *ptr;
+
+	*len = 0;
+
+	*octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
+	if (*octets == NULL) {
+		return 0;
+	}
+
+	ptr = *octets;
+	while (ctx->pointer < eoc) {
+		if (!asn1_octet_decode(ctx, (unsigned char *) ptr++)) {
+			kfree(*octets);
+			*octets = NULL;
+			return 0;
+		}
+		(*len)++;
+	}
+	return 1;
+} */
+
+static unsigned char
+asn1_subid_decode(struct asn1_ctx *ctx, unsigned long *subid)
+{
+	unsigned char ch;
+
+	*subid = 0;
+
+	do {
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+
+		*subid <<= 7;
+		*subid |= ch & 0x7F;
+	} while ((ch & 0x80) == 0x80);
+	return 1;
+}
+
+static int
+asn1_oid_decode(struct asn1_ctx *ctx,
+		unsigned char *eoc, unsigned long **oid, unsigned int *len)
+{
+	unsigned long subid;
+	unsigned int size;
+	unsigned long *optr;
+
+	size = eoc - ctx->pointer + 1;
+
+	/* first subid actually encodes first two subids */
+	if (size < 2 || size > UINT_MAX/sizeof(unsigned long))
+		return 0;
+
+	*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
+	if (*oid == NULL)
+		return 0;
+
+	optr = *oid;
+
+	if (!asn1_subid_decode(ctx, &subid)) {
+		kfree(*oid);
+		*oid = NULL;
+		return 0;
+	}
+
+	if (subid < 40) {
+		optr[0] = 0;
+		optr[1] = subid;
+	} else if (subid < 80) {
+		optr[0] = 1;
+		optr[1] = subid - 40;
+	} else {
+		optr[0] = 2;
+		optr[1] = subid - 80;
+	}
+
+	*len = 2;
+	optr += 2;
+
+	while (ctx->pointer < eoc) {
+		if (++(*len) > size) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			kfree(*oid);
+			*oid = NULL;
+			return 0;
+		}
+
+		if (!asn1_subid_decode(ctx, optr++)) {
+			kfree(*oid);
+			*oid = NULL;
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int
+compare_oid(unsigned long *oid1, unsigned int oid1len,
+	    unsigned long *oid2, unsigned int oid2len)
+{
+	unsigned int i;
+
+	if (oid1len != oid2len)
+		return 0;
+	else {
+		for (i = 0; i < oid1len; i++) {
+			if (oid1[i] != oid2[i])
+				return 0;
+		}
+		return 1;
+	}
+}
+
+	/* BB check for endian conversion issues here */
+
+int
+decode_negTokenInit(unsigned char *security_blob, int length,
+		    struct TCP_Server_Info *server)
+{
+	struct asn1_ctx ctx;
+	unsigned char *end;
+	unsigned char *sequence_end;
+	unsigned long *oid = NULL;
+	unsigned int cls, con, tag, oidlen, rc;
+
+	/* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
+
+	asn1_open(&ctx, security_blob, length);
+
+	/* GSSAPI header */
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cifs_dbg(FYI, "Error decoding negTokenInit header\n");
+		return 0;
+	} else if ((cls != ASN1_APL) || (con != ASN1_CON)
+		   || (tag != ASN1_EOC)) {
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag);
+		return 0;
+	}
+
+	/* Check for SPNEGO OID -- remember to free obj->oid */
+	rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
+	if (rc) {
+		if ((tag == ASN1_OJI) && (con == ASN1_PRI) &&
+		    (cls == ASN1_UNI)) {
+			rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
+			if (rc) {
+				rc = compare_oid(oid, oidlen, SPNEGO_OID,
+						 SPNEGO_OID_LEN);
+				kfree(oid);
+			}
+		} else
+			rc = 0;
+	}
+
+	/* SPNEGO OID not present or garbled -- bail out */
+	if (!rc) {
+		cifs_dbg(FYI, "Error decoding negTokenInit header\n");
+		return 0;
+	}
+
+	/* SPNEGO */
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cifs_dbg(FYI, "Error decoding negTokenInit\n");
+		return 0;
+	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
+		   || (tag != ASN1_EOC)) {
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
+			 cls, con, tag, end, *end);
+		return 0;
+	}
+
+	/* negTokenInit */
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cifs_dbg(FYI, "Error decoding negTokenInit\n");
+		return 0;
+	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
+		   || (tag != ASN1_SEQ)) {
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
+			 cls, con, tag, end, *end);
+		return 0;
+	}
+
+	/* sequence */
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
+		return 0;
+	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
+		   || (tag != ASN1_EOC)) {
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
+			 cls, con, tag, end, *end);
+		return 0;
+	}
+
+	/* sequence of */
+	if (asn1_header_decode
+	    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
+		cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
+		return 0;
+	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
+		   || (tag != ASN1_SEQ)) {
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
+			 cls, con, tag, end, *end);
+		return 0;
+	}
+
+	/* list of security mechanisms */
+	while (!asn1_eoc_decode(&ctx, sequence_end)) {
+		rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
+		if (!rc) {
+			cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n");
+			return 0;
+		}
+		if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
+			if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
+
+				cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n",
+					 oidlen, *oid, *(oid + 1), *(oid + 2),
+					 *(oid + 3));
+
+				if (compare_oid(oid, oidlen, MSKRB5_OID,
+						MSKRB5_OID_LEN))
+					server->sec_mskerberos = true;
+				else if (compare_oid(oid, oidlen, KRB5U2U_OID,
+						     KRB5U2U_OID_LEN))
+					server->sec_kerberosu2u = true;
+				else if (compare_oid(oid, oidlen, KRB5_OID,
+						     KRB5_OID_LEN))
+					server->sec_kerberos = true;
+				else if (compare_oid(oid, oidlen, NTLMSSP_OID,
+						     NTLMSSP_OID_LEN))
+					server->sec_ntlmssp = true;
+
+				kfree(oid);
+			}
+		} else {
+			cifs_dbg(FYI, "Should be an oid what is going on?\n");
+		}
+	}
+
+	/*
+	 * We currently ignore anything at the end of the SPNEGO blob after
+	 * the mechTypes have been parsed, since none of that info is
+	 * used at the moment.
+	 */
+	return 1;
+}
diff --git a/kernel/fs/cifs/cache.c b/kernel/fs/cifs/cache.c
new file mode 100644
index 000000000..6c665bf4a
--- /dev/null
+++ b/kernel/fs/cifs/cache.c
@@ -0,0 +1,333 @@
+/*
+ *   fs/cifs/cache.c - CIFS filesystem cache index structure definitions
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifs_debug.h"
+
+/*
+ * CIFS filesystem definition for FS-Cache
+ */
+struct fscache_netfs cifs_fscache_netfs = {
+	.name = "cifs",
+	.version = 0,
+};
+
+/*
+ * Register CIFS for caching with FS-Cache
+ */
+int cifs_fscache_register(void)
+{
+	return fscache_register_netfs(&cifs_fscache_netfs);
+}
+
+/*
+ * Unregister CIFS for caching
+ */
+void cifs_fscache_unregister(void)
+{
+	fscache_unregister_netfs(&cifs_fscache_netfs);
+}
+
+/*
+ * Key layout of CIFS server cache index object
+ */
+struct cifs_server_key {
+	uint16_t	family;		/* address family */
+	__be16		port;		/* IP port */
+	union {
+		struct in_addr	ipv4_addr;
+		struct in6_addr	ipv6_addr;
+	} addr[0];
+};
+
+/*
+ * Server object keyed by {IPaddress,port,family} tuple
+ */
+static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
+				   void *buffer, uint16_t maxbuf)
+{
+	const struct TCP_Server_Info *server = cookie_netfs_data;
+	const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
+	const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
+	const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
+	struct cifs_server_key *key = buffer;
+	uint16_t key_len = sizeof(struct cifs_server_key);
+
+	memset(key, 0, key_len);
+
+	/*
+	 * Should not be a problem as sin_family/sin6_family overlays
+	 * sa_family field
+	 */
+	switch (sa->sa_family) {
+	case AF_INET:
+		key->family = sa->sa_family;
+		key->port = addr->sin_port;
+		key->addr[0].ipv4_addr = addr->sin_addr;
+		key_len += sizeof(key->addr[0].ipv4_addr);
+		break;
+
+	case AF_INET6:
+		key->family = sa->sa_family;
+		key->port = addr6->sin6_port;
+		key->addr[0].ipv6_addr = addr6->sin6_addr;
+		key_len += sizeof(key->addr[0].ipv6_addr);
+		break;
+
+	default:
+		cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
+		key_len = 0;
+		break;
+	}
+
+	return key_len;
+}
+
+/*
+ * Server object for FS-Cache
+ */
+const struct fscache_cookie_def cifs_fscache_server_index_def = {
+	.name = "CIFS.server",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = cifs_server_get_key,
+};
+
+/*
+ * Auxiliary data attached to CIFS superblock within the cache
+ */
+struct cifs_fscache_super_auxdata {
+	u64	resource_id;		/* unique server resource id */
+};
+
+static char *extract_sharename(const char *treename)
+{
+	const char *src;
+	char *delim, *dst;
+	int len;
+
+	/* skip double chars at the beginning */
+	src = treename + 2;
+
+	/* share name is always preceded by '\\' now */
+	delim = strchr(src, '\\');
+	if (!delim)
+		return ERR_PTR(-EINVAL);
+	delim++;
+	len = strlen(delim);
+
+	/* caller has to free the memory */
+	dst = kstrndup(delim, len, GFP_KERNEL);
+	if (!dst)
+		return ERR_PTR(-ENOMEM);
+
+	return dst;
+}
+
+/*
+ * Superblock object currently keyed by share name
+ */
+static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
+				   uint16_t maxbuf)
+{
+	const struct cifs_tcon *tcon = cookie_netfs_data;
+	char *sharename;
+	uint16_t len;
+
+	sharename = extract_sharename(tcon->treeName);
+	if (IS_ERR(sharename)) {
+		cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
+		sharename = NULL;
+		return 0;
+	}
+
+	len = strlen(sharename);
+	if (len > maxbuf)
+		return 0;
+
+	memcpy(buffer, sharename, len);
+
+	kfree(sharename);
+
+	return len;
+}
+
+static uint16_t
+cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
+			   uint16_t maxbuf)
+{
+	struct cifs_fscache_super_auxdata auxdata;
+	const struct cifs_tcon *tcon = cookie_netfs_data;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.resource_id = tcon->resource_id;
+
+	if (maxbuf > sizeof(auxdata))
+		maxbuf = sizeof(auxdata);
+
+	memcpy(buffer, &auxdata, maxbuf);
+
+	return maxbuf;
+}
+
+static enum
+fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
+					      const void *data,
+					      uint16_t datalen)
+{
+	struct cifs_fscache_super_auxdata auxdata;
+	const struct cifs_tcon *tcon = cookie_netfs_data;
+
+	if (datalen != sizeof(auxdata))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.resource_id = tcon->resource_id;
+
+	if (memcmp(data, &auxdata, datalen) != 0)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * Superblock object for FS-Cache
+ */
+const struct fscache_cookie_def cifs_fscache_super_index_def = {
+	.name = "CIFS.super",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = cifs_super_get_key,
+	.get_aux = cifs_fscache_super_get_aux,
+	.check_aux = cifs_fscache_super_check_aux,
+};
+
+/*
+ * Auxiliary data attached to CIFS inode within the cache
+ */
+struct cifs_fscache_inode_auxdata {
+	struct timespec	last_write_time;
+	struct timespec	last_change_time;
+	u64		eof;
+};
+
+static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
+					   void *buffer, uint16_t maxbuf)
+{
+	const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+	uint16_t keylen;
+
+	/* use the UniqueId as the key */
+	keylen = sizeof(cifsi->uniqueid);
+	if (keylen > maxbuf)
+		keylen = 0;
+	else
+		memcpy(buffer, &cifsi->uniqueid, keylen);
+
+	return keylen;
+}
+
+static void
+cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
+{
+	const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+	*size = cifsi->vfs_inode.i_size;
+}
+
+static uint16_t
+cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
+			   uint16_t maxbuf)
+{
+	struct cifs_fscache_inode_auxdata auxdata;
+	const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.eof = cifsi->server_eof;
+	auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+	auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
+	if (maxbuf > sizeof(auxdata))
+		maxbuf = sizeof(auxdata);
+
+	memcpy(buffer, &auxdata, maxbuf);
+
+	return maxbuf;
+}
+
+static enum
+fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
+					      const void *data,
+					      uint16_t datalen)
+{
+	struct cifs_fscache_inode_auxdata auxdata;
+	struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+	if (datalen != sizeof(auxdata))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.eof = cifsi->server_eof;
+	auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+	auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
+	if (memcmp(data, &auxdata, datalen) != 0)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+	struct cifsInodeInfo *cifsi = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
+
+	for (;;) {
+		nr_pages = pagevec_lookup(&pvec,
+					  cifsi->vfs_inode.i_mapping, first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+const struct fscache_cookie_def cifs_fscache_inode_object_def = {
+	.name		= "CIFS.uniqueid",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= cifs_fscache_inode_get_key,
+	.get_attr	= cifs_fscache_inode_get_attr,
+	.get_aux	= cifs_fscache_inode_get_aux,
+	.check_aux	= cifs_fscache_inode_check_aux,
+	.now_uncached	= cifs_fscache_inode_now_uncached,
+};
diff --git a/kernel/fs/cifs/cifs_debug.c b/kernel/fs/cifs/cifs_debug.c
new file mode 100644
index 000000000..7febcf247
--- /dev/null
+++ b/kernel/fs/cifs/cifs_debug.c
@@ -0,0 +1,701 @@
+/*
+ *   fs/cifs_debug.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2000,2005
+ *
+ *   Modified by Steve French (sfrench@us.ibm.com)
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifsfs.h"
+
+void
+cifs_dump_mem(char *label, void *data, int length)
+{
+	pr_debug("%s: dump of %d bytes of data at 0x%p\n", label, length, data);
+	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16, 4,
+		       data, length, true);
+}
+
+#ifdef CONFIG_CIFS_DEBUG
+void cifs_vfs_err(const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_err("CIFS VFS: %pV", &vaf);
+
+	va_end(args);
+}
+#endif
+
+void cifs_dump_detail(void *buf)
+{
+#ifdef CONFIG_CIFS_DEBUG2
+	struct smb_hdr *smb = (struct smb_hdr *)buf;
+
+	cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
+		 smb->Command, smb->Status.CifsError,
+		 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
+	cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb));
+#endif /* CONFIG_CIFS_DEBUG2 */
+}
+
+void cifs_dump_mids(struct TCP_Server_Info *server)
+{
+#ifdef CONFIG_CIFS_DEBUG2
+	struct list_head *tmp;
+	struct mid_q_entry *mid_entry;
+
+	if (server == NULL)
+		return;
+
+	cifs_dbg(VFS, "Dump pending requests:\n");
+	spin_lock(&GlobalMid_Lock);
+	list_for_each(tmp, &server->pending_mid_q) {
+		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+		cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n",
+			 mid_entry->mid_state,
+			 le16_to_cpu(mid_entry->command),
+			 mid_entry->pid,
+			 mid_entry->callback_data,
+			 mid_entry->mid);
+#ifdef CONFIG_CIFS_STATS2
+		cifs_dbg(VFS, "IsLarge: %d buf: %p time rcv: %ld now: %ld\n",
+			 mid_entry->large_buf,
+			 mid_entry->resp_buf,
+			 mid_entry->when_received,
+			 jiffies);
+#endif /* STATS2 */
+		cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n",
+			 mid_entry->multiRsp, mid_entry->multiEnd);
+		if (mid_entry->resp_buf) {
+			cifs_dump_detail(mid_entry->resp_buf);
+			cifs_dump_mem("existing buf: ",
+				mid_entry->resp_buf, 62);
+		}
+	}
+	spin_unlock(&GlobalMid_Lock);
+#endif /* CONFIG_CIFS_DEBUG2 */
+}
+
+#ifdef CONFIG_PROC_FS
+static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
+{
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct mid_q_entry *mid_entry;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	int i, j;
+	__u32 dev_type;
+
+	seq_puts(m,
+		    "Display Internal CIFS Data Structures for Debugging\n"
+		    "---------------------------------------------------\n");
+	seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
+	seq_printf(m, "Features:");
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	seq_printf(m, " dfs");
+#endif
+#ifdef CONFIG_CIFS_FSCACHE
+	seq_printf(m, " fscache");
+#endif
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+	seq_printf(m, " lanman");
+#endif
+#ifdef CONFIG_CIFS_POSIX
+	seq_printf(m, " posix");
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+	seq_printf(m, " spnego");
+#endif
+#ifdef CONFIG_CIFS_XATTR
+	seq_printf(m, " xattr");
+#endif
+#ifdef CONFIG_CIFS_ACL
+	seq_printf(m, " acl");
+#endif
+	seq_putc(m, '\n');
+	seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
+	seq_printf(m, "Servers:");
+
+	i = 0;
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp1, &cifs_tcp_ses_list) {
+		server = list_entry(tmp1, struct TCP_Server_Info,
+				    tcp_ses_list);
+		i++;
+		list_for_each(tmp2, &server->smb_ses_list) {
+			ses = list_entry(tmp2, struct cifs_ses,
+					 smb_ses_list);
+			if ((ses->serverDomain == NULL) ||
+				(ses->serverOS == NULL) ||
+				(ses->serverNOS == NULL)) {
+				seq_printf(m, "\n%d) entry for %s not fully "
+					   "displayed\n\t", i, ses->serverName);
+			} else {
+				seq_printf(m,
+				    "\n%d) Name: %s  Domain: %s Uses: %d OS:"
+				    " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
+				    " session status: %d\t",
+				i, ses->serverName, ses->serverDomain,
+				ses->ses_count, ses->serverOS, ses->serverNOS,
+				ses->capabilities, ses->status);
+			}
+			seq_printf(m, "TCP status: %d\n\tLocal Users To "
+				   "Server: %d SecMode: 0x%x Req On Wire: %d",
+				   server->tcpStatus, server->srv_count,
+				   server->sec_mode, in_flight(server));
+
+#ifdef CONFIG_CIFS_STATS2
+			seq_printf(m, " In Send: %d In MaxReq Wait: %d",
+				atomic_read(&server->in_send),
+				atomic_read(&server->num_waiters));
+#endif
+
+			seq_puts(m, "\n\tShares:");
+			j = 0;
+			list_for_each(tmp3, &ses->tcon_list) {
+				tcon = list_entry(tmp3, struct cifs_tcon,
+						  tcon_list);
+				++j;
+				dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+				seq_printf(m, "\n\t%d) %s Mounts: %d ", j,
+					   tcon->treeName, tcon->tc_count);
+				if (tcon->nativeFileSystem) {
+					seq_printf(m, "Type: %s ",
+						   tcon->nativeFileSystem);
+				}
+				seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
+					"\n\tPathComponentMax: %d Status: %d",
+					le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
+					le32_to_cpu(tcon->fsAttrInfo.Attributes),
+					le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
+					tcon->tidStatus);
+				if (dev_type == FILE_DEVICE_DISK)
+					seq_puts(m, " type: DISK ");
+				else if (dev_type == FILE_DEVICE_CD_ROM)
+					seq_puts(m, " type: CDROM ");
+				else
+					seq_printf(m, " type: %d ", dev_type);
+				if (server->ops->dump_share_caps)
+					server->ops->dump_share_caps(m, tcon);
+
+				if (tcon->need_reconnect)
+					seq_puts(m, "\tDISCONNECTED ");
+				seq_putc(m, '\n');
+			}
+
+			seq_puts(m, "\n\tMIDs:\n");
+
+			spin_lock(&GlobalMid_Lock);
+			list_for_each(tmp3, &server->pending_mid_q) {
+				mid_entry = list_entry(tmp3, struct mid_q_entry,
+					qhead);
+				seq_printf(m, "\tState: %d com: %d pid:"
+					      " %d cbdata: %p mid %llu\n",
+					      mid_entry->mid_state,
+					      le16_to_cpu(mid_entry->command),
+					      mid_entry->pid,
+					      mid_entry->callback_data,
+					      mid_entry->mid);
+			}
+			spin_unlock(&GlobalMid_Lock);
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+	seq_putc(m, '\n');
+
+	/* BB add code to dump additional info such as TCP session info now */
+	return 0;
+}
+
+static int cifs_debug_data_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_debug_data_proc_show, NULL);
+}
+
+static const struct file_operations cifs_debug_data_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_debug_data_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_CIFS_STATS
+static ssize_t cifs_stats_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
+{
+	char c;
+	bool bv;
+	int rc;
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+
+	rc = get_user(c, buffer);
+	if (rc)
+		return rc;
+
+	if (strtobool(&c, &bv) == 0) {
+#ifdef CONFIG_CIFS_STATS2
+		atomic_set(&totBufAllocCount, 0);
+		atomic_set(&totSmBufAllocCount, 0);
+#endif /* CONFIG_CIFS_STATS2 */
+		spin_lock(&cifs_tcp_ses_lock);
+		list_for_each(tmp1, &cifs_tcp_ses_list) {
+			server = list_entry(tmp1, struct TCP_Server_Info,
+					    tcp_ses_list);
+			list_for_each(tmp2, &server->smb_ses_list) {
+				ses = list_entry(tmp2, struct cifs_ses,
+						 smb_ses_list);
+				list_for_each(tmp3, &ses->tcon_list) {
+					tcon = list_entry(tmp3,
+							  struct cifs_tcon,
+							  tcon_list);
+					atomic_set(&tcon->num_smbs_sent, 0);
+					if (server->ops->clear_stats)
+						server->ops->clear_stats(tcon);
+				}
+			}
+		}
+		spin_unlock(&cifs_tcp_ses_lock);
+	}
+
+	return count;
+}
+
+static int cifs_stats_proc_show(struct seq_file *m, void *v)
+{
+	int i;
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+
+	seq_printf(m,
+			"Resources in use\nCIFS Session: %d\n",
+			sesInfoAllocCount.counter);
+	seq_printf(m, "Share (unique mount targets): %d\n",
+			tconInfoAllocCount.counter);
+	seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n",
+			bufAllocCount.counter,
+			cifs_min_rcv + tcpSesAllocCount.counter);
+	seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n",
+			smBufAllocCount.counter, cifs_min_small);
+#ifdef CONFIG_CIFS_STATS2
+	seq_printf(m, "Total Large %d Small %d Allocations\n",
+				atomic_read(&totBufAllocCount),
+				atomic_read(&totSmBufAllocCount));
+#endif /* CONFIG_CIFS_STATS2 */
+
+	seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
+	seq_printf(m,
+		"\n%d session %d share reconnects\n",
+		tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
+
+	seq_printf(m,
+		"Total vfs operations: %d maximum at one time: %d\n",
+		GlobalCurrentXid, GlobalMaxActiveXid);
+
+	i = 0;
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp1, &cifs_tcp_ses_list) {
+		server = list_entry(tmp1, struct TCP_Server_Info,
+				    tcp_ses_list);
+		list_for_each(tmp2, &server->smb_ses_list) {
+			ses = list_entry(tmp2, struct cifs_ses,
+					 smb_ses_list);
+			list_for_each(tmp3, &ses->tcon_list) {
+				tcon = list_entry(tmp3,
+						  struct cifs_tcon,
+						  tcon_list);
+				i++;
+				seq_printf(m, "\n%d) %s", i, tcon->treeName);
+				if (tcon->need_reconnect)
+					seq_puts(m, "\tDISCONNECTED ");
+				seq_printf(m, "\nSMBs: %d",
+					   atomic_read(&tcon->num_smbs_sent));
+				if (server->ops->print_stats)
+					server->ops->print_stats(m, tcon);
+			}
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static int cifs_stats_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_stats_proc_show, NULL);
+}
+
+static const struct file_operations cifs_stats_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_stats_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_stats_proc_write,
+};
+#endif /* STATS */
+
+static struct proc_dir_entry *proc_fs_cifs;
+static const struct file_operations cifsFYI_proc_fops;
+static const struct file_operations cifs_lookup_cache_proc_fops;
+static const struct file_operations traceSMB_proc_fops;
+static const struct file_operations cifs_security_flags_proc_fops;
+static const struct file_operations cifs_linux_ext_proc_fops;
+
+void
+cifs_proc_init(void)
+{
+	proc_fs_cifs = proc_mkdir("fs/cifs", NULL);
+	if (proc_fs_cifs == NULL)
+		return;
+
+	proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops);
+
+#ifdef CONFIG_CIFS_STATS
+	proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops);
+#endif /* STATS */
+	proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
+	proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
+	proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
+		    &cifs_linux_ext_proc_fops);
+	proc_create("SecurityFlags", 0, proc_fs_cifs,
+		    &cifs_security_flags_proc_fops);
+	proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
+		    &cifs_lookup_cache_proc_fops);
+}
+
+void
+cifs_proc_clean(void)
+{
+	if (proc_fs_cifs == NULL)
+		return;
+
+	remove_proc_entry("DebugData", proc_fs_cifs);
+	remove_proc_entry("cifsFYI", proc_fs_cifs);
+	remove_proc_entry("traceSMB", proc_fs_cifs);
+#ifdef CONFIG_CIFS_STATS
+	remove_proc_entry("Stats", proc_fs_cifs);
+#endif
+	remove_proc_entry("SecurityFlags", proc_fs_cifs);
+	remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
+	remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
+	remove_proc_entry("fs/cifs", NULL);
+}
+
+static int cifsFYI_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", cifsFYI);
+	return 0;
+}
+
+static int cifsFYI_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifsFYI_proc_show, NULL);
+}
+
+static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
+		size_t count, loff_t *ppos)
+{
+	char c;
+	bool bv;
+	int rc;
+
+	rc = get_user(c, buffer);
+	if (rc)
+		return rc;
+	if (strtobool(&c, &bv) == 0)
+		cifsFYI = bv;
+	else if ((c > '1') && (c <= '9'))
+		cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */
+
+	return count;
+}
+
+static const struct file_operations cifsFYI_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifsFYI_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifsFYI_proc_write,
+};
+
+static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", linuxExtEnabled);
+	return 0;
+}
+
+static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_linux_ext_proc_show, NULL);
+}
+
+static ssize_t cifs_linux_ext_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
+{
+	char c;
+	bool bv;
+	int rc;
+
+	rc = get_user(c, buffer);
+	if (rc)
+		return rc;
+
+	rc = strtobool(&c, &bv);
+	if (rc)
+		return rc;
+
+	linuxExtEnabled = bv;
+
+	return count;
+}
+
+static const struct file_operations cifs_linux_ext_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_linux_ext_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_linux_ext_proc_write,
+};
+
+static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", lookupCacheEnabled);
+	return 0;
+}
+
+static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_lookup_cache_proc_show, NULL);
+}
+
+static ssize_t cifs_lookup_cache_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
+{
+	char c;
+	bool bv;
+	int rc;
+
+	rc = get_user(c, buffer);
+	if (rc)
+		return rc;
+
+	rc = strtobool(&c, &bv);
+	if (rc)
+		return rc;
+
+	lookupCacheEnabled = bv;
+
+	return count;
+}
+
+static const struct file_operations cifs_lookup_cache_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_lookup_cache_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_lookup_cache_proc_write,
+};
+
+static int traceSMB_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", traceSMB);
+	return 0;
+}
+
+static int traceSMB_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, traceSMB_proc_show, NULL);
+}
+
+static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
+		size_t count, loff_t *ppos)
+{
+	char c;
+	bool bv;
+	int rc;
+
+	rc = get_user(c, buffer);
+	if (rc)
+		return rc;
+
+	rc = strtobool(&c, &bv);
+	if (rc)
+		return rc;
+
+	traceSMB = bv;
+
+	return count;
+}
+
+static const struct file_operations traceSMB_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= traceSMB_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= traceSMB_proc_write,
+};
+
+static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "0x%x\n", global_secflags);
+	return 0;
+}
+
+static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_security_flags_proc_show, NULL);
+}
+
+/*
+ * Ensure that if someone sets a MUST flag, that we disable all other MAY
+ * flags except for the ones corresponding to the given MUST flag. If there are
+ * multiple MUST flags, then try to prefer more secure ones.
+ */
+static void
+cifs_security_flags_handle_must_flags(unsigned int *flags)
+{
+	unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
+
+	if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
+		*flags = CIFSSEC_MUST_KRB5;
+	else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
+		*flags = CIFSSEC_MUST_NTLMSSP;
+	else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
+		*flags = CIFSSEC_MUST_NTLMV2;
+	else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
+		*flags = CIFSSEC_MUST_NTLM;
+	else if (CIFSSEC_MUST_LANMAN &&
+		 (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
+		*flags = CIFSSEC_MUST_LANMAN;
+	else if (CIFSSEC_MUST_PLNTXT &&
+		 (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
+		*flags = CIFSSEC_MUST_PLNTXT;
+
+	*flags |= signflags;
+}
+
+static ssize_t cifs_security_flags_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
+{
+	int rc;
+	unsigned int flags;
+	char flags_string[12];
+	char c;
+	bool bv;
+
+	if ((count < 1) || (count > 11))
+		return -EINVAL;
+
+	memset(flags_string, 0, 12);
+
+	if (copy_from_user(flags_string, buffer, count))
+		return -EFAULT;
+
+	if (count < 3) {
+		/* single char or single char followed by null */
+		c = flags_string[0];
+		if (strtobool(&c, &bv) == 0) {
+			global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
+			return count;
+		} else if (!isdigit(c)) {
+			cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+					flags_string);
+			return -EINVAL;
+		}
+	}
+
+	/* else we have a number */
+	rc = kstrtouint(flags_string, 0, &flags);
+	if (rc) {
+		cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+				flags_string);
+		return rc;
+	}
+
+	cifs_dbg(FYI, "sec flags 0x%x\n", flags);
+
+	if (flags == 0)  {
+		cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string);
+		return -EINVAL;
+	}
+
+	if (flags & ~CIFSSEC_MASK) {
+		cifs_dbg(VFS, "Unsupported security flags: 0x%x\n",
+			 flags & ~CIFSSEC_MASK);
+		return -EINVAL;
+	}
+
+	cifs_security_flags_handle_must_flags(&flags);
+
+	/* flags look ok - update the global security flags for cifs module */
+	global_secflags = flags;
+	if (global_secflags & CIFSSEC_MUST_SIGN) {
+		/* requiring signing implies signing is allowed */
+		global_secflags |= CIFSSEC_MAY_SIGN;
+		cifs_dbg(FYI, "packet signing now required\n");
+	} else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
+		cifs_dbg(FYI, "packet signing disabled\n");
+	}
+	/* BB should we turn on MAY flags for other MUST options? */
+	return count;
+}
+
+static const struct file_operations cifs_security_flags_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_security_flags_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_security_flags_proc_write,
+};
+#else
+inline void cifs_proc_init(void)
+{
+}
+
+inline void cifs_proc_clean(void)
+{
+}
+#endif /* PROC_FS */
diff --git a/kernel/fs/cifs/cifs_debug.h b/kernel/fs/cifs/cifs_debug.h
new file mode 100644
index 000000000..f40fbaca1
--- /dev/null
+++ b/kernel/fs/cifs/cifs_debug.h
@@ -0,0 +1,77 @@
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000,2002
+ *   Modified by Steve French (sfrench@us.ibm.com)
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+*/
+
+#ifndef _H_CIFS_DEBUG
+#define _H_CIFS_DEBUG
+
+void cifs_dump_mem(char *label, void *data, int length);
+void cifs_dump_detail(void *);
+void cifs_dump_mids(struct TCP_Server_Info *);
+extern int traceSMB;		/* flag which enables the function below */
+void dump_smb(void *, int);
+#define CIFS_INFO	0x01
+#define CIFS_RC		0x02
+#define CIFS_TIMER	0x04
+
+#define VFS 1
+#define FYI 2
+extern int cifsFYI;
+#ifdef CONFIG_CIFS_DEBUG2
+#define NOISY 4
+#else
+#define NOISY 0
+#endif
+
+/*
+ *	debug ON
+ *	--------
+ */
+#ifdef CONFIG_CIFS_DEBUG
+
+__printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
+
+/* information message: e.g., configuration, major event */
+#define cifs_dbg(type, fmt, ...)					\
+do {									\
+	if (type == FYI) {						\
+		if (cifsFYI & CIFS_INFO) {				\
+			pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__);	\
+		}							\
+	} else if (type == VFS) {					\
+		cifs_vfs_err(fmt, ##__VA_ARGS__);			\
+	} else if (type == NOISY && type != 0) {			\
+		pr_debug(fmt, ##__VA_ARGS__);				\
+	}								\
+} while (0)
+
+/*
+ *	debug OFF
+ *	---------
+ */
+#else		/* _CIFS_DEBUG */
+#define cifs_dbg(type, fmt, ...)					\
+do {									\
+	if (0)								\
+		pr_debug(fmt, ##__VA_ARGS__);				\
+} while (0)
+#endif
+
+#endif				/* _H_CIFS_DEBUG */
diff --git a/kernel/fs/cifs/cifs_dfs_ref.c b/kernel/fs/cifs/cifs_dfs_ref.c
new file mode 100644
index 000000000..7dc886c9a
--- /dev/null
+++ b/kernel/fs/cifs/cifs_dfs_ref.c
@@ -0,0 +1,379 @@
+/*
+ *   Contains the CIFS DFS referral mounting routines used for handling
+ *   traversal via DFS junction point
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Copyright (C) International Business Machines  Corp., 2008
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *		Steve French (sfrench@us.ibm.com)
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version
+ *   2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/vfs.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifsfs.h"
+#include "dns_resolve.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+
+static LIST_HEAD(cifs_dfs_automount_list);
+
+static void cifs_dfs_expire_automounts(struct work_struct *work);
+static DECLARE_DELAYED_WORK(cifs_dfs_automount_task,
+			    cifs_dfs_expire_automounts);
+static int cifs_dfs_mountpoint_expiry_timeout = 500 * HZ;
+
+static void cifs_dfs_expire_automounts(struct work_struct *work)
+{
+	struct list_head *list = &cifs_dfs_automount_list;
+
+	mark_mounts_for_expiry(list);
+	if (!list_empty(list))
+		schedule_delayed_work(&cifs_dfs_automount_task,
+				      cifs_dfs_mountpoint_expiry_timeout);
+}
+
+void cifs_dfs_release_automount_timer(void)
+{
+	BUG_ON(!list_empty(&cifs_dfs_automount_list));
+	cancel_delayed_work_sync(&cifs_dfs_automount_task);
+}
+
+/**
+ * cifs_build_devname - build a devicename from a UNC and optional prepath
+ * @nodename:	pointer to UNC string
+ * @prepath:	pointer to prefixpath (or NULL if there isn't one)
+ *
+ * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer
+ * big enough to hold the final thing. Copy the UNC from the nodename, and
+ * concatenate the prepath onto the end of it if there is one.
+ *
+ * Returns pointer to the built string, or a ERR_PTR. Caller is responsible
+ * for freeing the returned string.
+ */
+static char *
+cifs_build_devname(char *nodename, const char *prepath)
+{
+	size_t pplen;
+	size_t unclen;
+	char *dev;
+	char *pos;
+
+	/* skip over any preceding delimiters */
+	nodename += strspn(nodename, "\\");
+	if (!*nodename)
+		return ERR_PTR(-EINVAL);
+
+	/* get length of UNC and set pos to last char */
+	unclen = strlen(nodename);
+	pos = nodename + unclen - 1;
+
+	/* trim off any trailing delimiters */
+	while (*pos == '\\') {
+		--pos;
+		--unclen;
+	}
+
+	/* allocate a buffer:
+	 * +2 for preceding "//"
+	 * +1 for delimiter between UNC and prepath
+	 * +1 for trailing NULL
+	 */
+	pplen = prepath ? strlen(prepath) : 0;
+	dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	pos = dev;
+	/* add the initial "//" */
+	*pos = '/';
+	++pos;
+	*pos = '/';
+	++pos;
+
+	/* copy in the UNC portion from referral */
+	memcpy(pos, nodename, unclen);
+	pos += unclen;
+
+	/* copy the prefixpath remainder (if there is one) */
+	if (pplen) {
+		*pos = '/';
+		++pos;
+		memcpy(pos, prepath, pplen);
+		pos += pplen;
+	}
+
+	/* NULL terminator */
+	*pos = '\0';
+
+	convert_delimiter(dev, '/');
+	return dev;
+}
+
+
+/**
+ * cifs_compose_mount_options	-	creates mount options for refferral
+ * @sb_mountdata:	parent/root DFS mount options (template)
+ * @fullpath:		full path in UNC format
+ * @ref:		server's referral
+ * @devname:		pointer for saving device name
+ *
+ * creates mount options for submount based on template options sb_mountdata
+ * and replacing unc,ip,prefixpath options with ones we've got form ref_unc.
+ *
+ * Returns: pointer to new mount options or ERR_PTR.
+ * Caller is responcible for freeing retunrned value if it is not error.
+ */
+char *cifs_compose_mount_options(const char *sb_mountdata,
+				   const char *fullpath,
+				   const struct dfs_info3_param *ref,
+				   char **devname)
+{
+	int rc;
+	char *mountdata = NULL;
+	const char *prepath = NULL;
+	int md_len;
+	char *tkn_e;
+	char *srvIP = NULL;
+	char sep = ',';
+	int off, noff;
+
+	if (sb_mountdata == NULL)
+		return ERR_PTR(-EINVAL);
+
+	if (strlen(fullpath) - ref->path_consumed)
+		prepath = fullpath + ref->path_consumed;
+
+	*devname = cifs_build_devname(ref->node_name, prepath);
+	if (IS_ERR(*devname)) {
+		rc = PTR_ERR(*devname);
+		*devname = NULL;
+		goto compose_mount_options_err;
+	}
+
+	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
+	if (rc < 0) {
+		cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
+			 __func__, *devname, rc);
+		goto compose_mount_options_err;
+	}
+
+	/*
+	 * In most cases, we'll be building a shorter string than the original,
+	 * but we do have to assume that the address in the ip= option may be
+	 * much longer than the original. Add the max length of an address
+	 * string to the length of the original string to allow for worst case.
+	 */
+	md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
+	mountdata = kzalloc(md_len + 1, GFP_KERNEL);
+	if (mountdata == NULL) {
+		rc = -ENOMEM;
+		goto compose_mount_options_err;
+	}
+
+	/* copy all options except of unc,ip,prefixpath */
+	off = 0;
+	if (strncmp(sb_mountdata, "sep=", 4) == 0) {
+			sep = sb_mountdata[4];
+			strncpy(mountdata, sb_mountdata, 5);
+			off += 5;
+	}
+
+	do {
+		tkn_e = strchr(sb_mountdata + off, sep);
+		if (tkn_e == NULL)
+			noff = strlen(sb_mountdata + off);
+		else
+			noff = tkn_e - (sb_mountdata + off) + 1;
+
+		if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) {
+			off += noff;
+			continue;
+		}
+		if (strncasecmp(sb_mountdata + off, "ip=", 3) == 0) {
+			off += noff;
+			continue;
+		}
+		if (strncasecmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
+			off += noff;
+			continue;
+		}
+		strncat(mountdata, sb_mountdata + off, noff);
+		off += noff;
+	} while (tkn_e);
+	strcat(mountdata, sb_mountdata + off);
+	mountdata[md_len] = '\0';
+
+	/* copy new IP and ref share name */
+	if (mountdata[strlen(mountdata) - 1] != sep)
+		strncat(mountdata, &sep, 1);
+	strcat(mountdata, "ip=");
+	strcat(mountdata, srvIP);
+
+	/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
+	/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
+
+compose_mount_options_out:
+	kfree(srvIP);
+	return mountdata;
+
+compose_mount_options_err:
+	kfree(mountdata);
+	mountdata = ERR_PTR(rc);
+	kfree(*devname);
+	*devname = NULL;
+	goto compose_mount_options_out;
+}
+
+/**
+ * cifs_dfs_do_refmount - mounts specified path using provided refferal
+ * @cifs_sb:		parent/root superblock
+ * @fullpath:		full path in UNC format
+ * @ref:		server's referral
+ */
+static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
+		const char *fullpath, const struct dfs_info3_param *ref)
+{
+	struct vfsmount *mnt;
+	char *mountdata;
+	char *devname = NULL;
+
+	/* strip first '\' from fullpath */
+	mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
+			fullpath + 1, ref, &devname);
+
+	if (IS_ERR(mountdata))
+		return (struct vfsmount *)mountdata;
+
+	mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata);
+	kfree(mountdata);
+	kfree(devname);
+	return mnt;
+
+}
+
+static void dump_referral(const struct dfs_info3_param *ref)
+{
+	cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
+	cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
+	cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+		 ref->flags, ref->server_type);
+	cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+		 ref->ref_flag, ref->path_consumed);
+}
+
+/*
+ * Create a vfsmount that we can automount
+ */
+static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
+{
+	struct dfs_info3_param *referrals = NULL;
+	unsigned int num_referrals = 0;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_ses *ses;
+	char *full_path;
+	unsigned int xid;
+	int i;
+	int rc;
+	struct vfsmount *mnt;
+	struct tcon_link *tlink;
+
+	cifs_dbg(FYI, "in %s\n", __func__);
+	BUG_ON(IS_ROOT(mntpt));
+
+	/*
+	 * The MSDFS spec states that paths in DFS referral requests and
+	 * responses must be prefixed by a single '\' character instead of
+	 * the double backslashes usually used in the UNC. This function
+	 * gives us the latter, so we must adjust the result.
+	 */
+	mnt = ERR_PTR(-ENOMEM);
+	full_path = build_path_from_dentry(mntpt);
+	if (full_path == NULL)
+		goto cdda_exit;
+
+	cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		mnt = ERR_CAST(tlink);
+		goto free_full_path;
+	}
+	ses = tlink_tcon(tlink)->ses;
+
+	xid = get_xid();
+	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
+		&num_referrals, &referrals,
+		cifs_remap(cifs_sb));
+	free_xid(xid);
+
+	cifs_put_tlink(tlink);
+
+	mnt = ERR_PTR(-ENOENT);
+	for (i = 0; i < num_referrals; i++) {
+		int len;
+		dump_referral(referrals + i);
+		/* connect to a node */
+		len = strlen(referrals[i].node_name);
+		if (len < 2) {
+			cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
+				 __func__, referrals[i].node_name);
+			mnt = ERR_PTR(-EINVAL);
+			break;
+		}
+		mnt = cifs_dfs_do_refmount(cifs_sb,
+				full_path, referrals + i);
+		cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
+			 __func__, referrals[i].node_name, mnt);
+		if (!IS_ERR(mnt))
+			goto success;
+	}
+
+	/* no valid submounts were found; return error from get_dfs_path() by
+	 * preference */
+	if (rc != 0)
+		mnt = ERR_PTR(rc);
+
+success:
+	free_dfs_info_array(referrals, num_referrals);
+free_full_path:
+	kfree(full_path);
+cdda_exit:
+	cifs_dbg(FYI, "leaving %s\n" , __func__);
+	return mnt;
+}
+
+/*
+ * Attempt to automount the referral
+ */
+struct vfsmount *cifs_dfs_d_automount(struct path *path)
+{
+	struct vfsmount *newmnt;
+
+	cifs_dbg(FYI, "in %s\n", __func__);
+
+	newmnt = cifs_dfs_do_automount(path->dentry);
+	if (IS_ERR(newmnt)) {
+		cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__);
+		return newmnt;
+	}
+
+	mntget(newmnt); /* prevent immediate expiration */
+	mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
+	schedule_delayed_work(&cifs_dfs_automount_task,
+			      cifs_dfs_mountpoint_expiry_timeout);
+	cifs_dbg(FYI, "leaving %s [ok]\n" , __func__);
+	return newmnt;
+}
+
+const struct inode_operations cifs_dfs_referral_inode_operations = {
+};
diff --git a/kernel/fs/cifs/cifs_fs_sb.h b/kernel/fs/cifs/cifs_fs_sb.h
new file mode 100644
index 000000000..3182273a3
--- /dev/null
+++ b/kernel/fs/cifs/cifs_fs_sb.h
@@ -0,0 +1,71 @@
+/*
+ *   fs/cifs/cifs_fs_sb.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2004
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ */
+#include <linux/rbtree.h>
+
+#ifndef _CIFS_FS_SB_H
+#define _CIFS_FS_SB_H
+
+#include <linux/backing-dev.h>
+
+#define CIFS_MOUNT_NO_PERM      1 /* do not do client vfs_perm check */
+#define CIFS_MOUNT_SET_UID      2 /* set current's euid in create etc. */
+#define CIFS_MOUNT_SERVER_INUM  4 /* inode numbers from uniqueid from server  */
+#define CIFS_MOUNT_DIRECT_IO    8 /* do not write nor read through page cache */
+#define CIFS_MOUNT_NO_XATTR     0x10  /* if set - disable xattr support       */
+#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames   */
+#define CIFS_MOUNT_POSIX_PATHS  0x40  /* Negotiate posix pathnames if possible*/
+#define CIFS_MOUNT_UNX_EMUL     0x80  /* Network compat with SFUnix emulation */
+#define CIFS_MOUNT_NO_BRL       0x100 /* No sending byte range locks to srv   */
+#define CIFS_MOUNT_CIFS_ACL     0x200 /* send ACL requests to non-POSIX srv   */
+#define CIFS_MOUNT_OVERR_UID    0x400 /* override uid returned from server    */
+#define CIFS_MOUNT_OVERR_GID    0x800 /* override gid returned from server    */
+#define CIFS_MOUNT_DYNPERM      0x1000 /* allow in-memory only mode setting   */
+#define CIFS_MOUNT_NOPOSIXBRL   0x2000 /* mandatory not posix byte range lock */
+#define CIFS_MOUNT_NOSSYNC      0x4000 /* don't do slow SMBflush on every sync*/
+#define CIFS_MOUNT_FSCACHE	0x8000 /* local caching enabled */
+#define CIFS_MOUNT_MF_SYMLINKS	0x10000 /* Minshall+French Symlinks enabled */
+#define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
+#define CIFS_MOUNT_STRICT_IO	0x40000 /* strict cache mode */
+#define CIFS_MOUNT_RWPIDFORWARD	0x80000 /* use pid forwarding for rw */
+#define CIFS_MOUNT_POSIXACL	0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
+#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
+#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
+#define CIFS_MOUNT_MAP_SFM_CHR	0x800000 /* SFM/MAC mapping for illegal chars */
+
+struct cifs_sb_info {
+	struct rb_root tlink_tree;
+	spinlock_t tlink_tree_lock;
+	struct tcon_link *master_tlink;
+	struct nls_table *local_nls;
+	unsigned int rsize;
+	unsigned int wsize;
+	unsigned long actimeo; /* attribute cache timeout (jiffies) */
+	atomic_t active;
+	kuid_t	mnt_uid;
+	kgid_t	mnt_gid;
+	kuid_t	mnt_backupuid;
+	kgid_t	mnt_backupgid;
+	umode_t	mnt_file_mode;
+	umode_t	mnt_dir_mode;
+	unsigned int mnt_cifs_flags;
+	char   *mountdata; /* options received at mount time or via DFS refs */
+	struct backing_dev_info bdi;
+	struct delayed_work prune_tlinks;
+	struct rcu_head rcu;
+};
+#endif				/* _CIFS_FS_SB_H */
diff --git a/kernel/fs/cifs/cifs_spnego.c b/kernel/fs/cifs/cifs_spnego.c
new file mode 100644
index 000000000..f4cf200b3
--- /dev/null
+++ b/kernel/fs/cifs/cifs_spnego.c
@@ -0,0 +1,179 @@
+/*
+ *   fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS
+ *
+ *   Copyright (c) 2007 Red Hat, Inc.
+ *   Author(s): Jeff Layton (jlayton@redhat.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <keys/user-type.h>
+#include <linux/key-type.h>
+#include <linux/inet.h>
+#include "cifsglob.h"
+#include "cifs_spnego.h"
+#include "cifs_debug.h"
+
+/* create a new cifs key */
+static int
+cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
+{
+	char *payload;
+	int ret;
+
+	ret = -ENOMEM;
+	payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+	if (!payload)
+		goto error;
+
+	/* attach the data */
+	key->payload.data = payload;
+	ret = 0;
+
+error:
+	return ret;
+}
+
+static void
+cifs_spnego_key_destroy(struct key *key)
+{
+	kfree(key->payload.data);
+}
+
+
+/*
+ * keytype for CIFS spnego keys
+ */
+struct key_type cifs_spnego_key_type = {
+	.name		= "cifs.spnego",
+	.instantiate	= cifs_spnego_key_instantiate,
+	.destroy	= cifs_spnego_key_destroy,
+	.describe	= user_describe,
+};
+
+/* length of longest version string e.g.  strlen("ver=0xFF") */
+#define MAX_VER_STR_LEN		8
+
+/* length of longest security mechanism name, eg in future could have
+ * strlen(";sec=ntlmsspi") */
+#define MAX_MECH_STR_LEN	13
+
+/* strlen of "host=" */
+#define HOST_KEY_LEN		5
+
+/* strlen of ";ip4=" or ";ip6=" */
+#define IP_KEY_LEN		5
+
+/* strlen of ";uid=0x" */
+#define UID_KEY_LEN		7
+
+/* strlen of ";creduid=0x" */
+#define CREDUID_KEY_LEN		11
+
+/* strlen of ";user=" */
+#define USER_KEY_LEN		6
+
+/* strlen of ";pid=0x" */
+#define PID_KEY_LEN		7
+
+/* get a key struct with a SPNEGO security blob, suitable for session setup */
+struct key *
+cifs_get_spnego_key(struct cifs_ses *sesInfo)
+{
+	struct TCP_Server_Info *server = sesInfo->server;
+	struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
+	struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
+	char *description, *dp;
+	size_t desc_len;
+	struct key *spnego_key;
+	const char *hostname = server->hostname;
+
+	/* length of fields (with semicolons): ver=0xyz ip4=ipaddress
+	   host=hostname sec=mechanism uid=0xFF user=username */
+	desc_len = MAX_VER_STR_LEN +
+		   HOST_KEY_LEN + strlen(hostname) +
+		   IP_KEY_LEN + INET6_ADDRSTRLEN +
+		   MAX_MECH_STR_LEN +
+		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
+		   CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
+		   PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
+
+	if (sesInfo->user_name)
+		desc_len += USER_KEY_LEN + strlen(sesInfo->user_name);
+
+	spnego_key = ERR_PTR(-ENOMEM);
+	description = kzalloc(desc_len, GFP_KERNEL);
+	if (description == NULL)
+		goto out;
+
+	dp = description;
+	/* start with version and hostname portion of UNC string */
+	spnego_key = ERR_PTR(-EINVAL);
+	sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
+		hostname);
+	dp = description + strlen(description);
+
+	/* add the server address */
+	if (server->dstaddr.ss_family == AF_INET)
+		sprintf(dp, "ip4=%pI4", &sa->sin_addr);
+	else if (server->dstaddr.ss_family == AF_INET6)
+		sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
+	else
+		goto out;
+
+	dp = description + strlen(description);
+
+	/* for now, only sec=krb5 and sec=mskrb5 are valid */
+	if (server->sec_kerberos)
+		sprintf(dp, ";sec=krb5");
+	else if (server->sec_mskerberos)
+		sprintf(dp, ";sec=mskrb5");
+	else
+		goto out;
+
+	dp = description + strlen(description);
+	sprintf(dp, ";uid=0x%x",
+		from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
+
+	dp = description + strlen(description);
+	sprintf(dp, ";creduid=0x%x",
+		from_kuid_munged(&init_user_ns, sesInfo->cred_uid));
+
+	if (sesInfo->user_name) {
+		dp = description + strlen(description);
+		sprintf(dp, ";user=%s", sesInfo->user_name);
+	}
+
+	dp = description + strlen(description);
+	sprintf(dp, ";pid=0x%x", current->pid);
+
+	cifs_dbg(FYI, "key description = %s\n", description);
+	spnego_key = request_key(&cifs_spnego_key_type, description, "");
+
+#ifdef CONFIG_CIFS_DEBUG2
+	if (cifsFYI && !IS_ERR(spnego_key)) {
+		struct cifs_spnego_msg *msg = spnego_key->payload.data;
+		cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U,
+				msg->secblob_len + msg->sesskey_len));
+	}
+#endif /* CONFIG_CIFS_DEBUG2 */
+
+out:
+	kfree(description);
+	return spnego_key;
+}
diff --git a/kernel/fs/cifs/cifs_spnego.h b/kernel/fs/cifs/cifs_spnego.h
new file mode 100644
index 000000000..31bef9ee0
--- /dev/null
+++ b/kernel/fs/cifs/cifs_spnego.h
@@ -0,0 +1,47 @@
+/*
+ *   fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS
+ *
+ *   Copyright (c) 2007 Red Hat, Inc.
+ *   Author(s): Jeff Layton (jlayton@redhat.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _CIFS_SPNEGO_H
+#define _CIFS_SPNEGO_H
+
+#define CIFS_SPNEGO_UPCALL_VERSION 2
+
+/*
+ * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION.
+ * The flags field is for future use. The request-key callout should set
+ * sesskey_len and secblob_len, and then concatenate the SessKey+SecBlob
+ * and stuff it in the data field.
+ */
+struct cifs_spnego_msg {
+	uint32_t	version;
+	uint32_t	flags;
+	uint32_t	sesskey_len;
+	uint32_t	secblob_len;
+	uint8_t		data[1];
+};
+
+#ifdef __KERNEL__
+extern struct key_type cifs_spnego_key_type;
+extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo);
+#endif /* KERNEL */
+
+#endif /* _CIFS_SPNEGO_H */
diff --git a/kernel/fs/cifs/cifs_unicode.c b/kernel/fs/cifs/cifs_unicode.c
new file mode 100644
index 000000000..5a53ac6b1
--- /dev/null
+++ b/kernel/fs/cifs/cifs_unicode.c
@@ -0,0 +1,611 @@
+/*
+ *   fs/cifs/cifs_unicode.c
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000,2009
+ *   Modified by Steve French (sfrench@us.ibm.com)
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#include "cifs_uniupr.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+
+int cifs_remap(struct cifs_sb_info *cifs_sb)
+{
+	int map_type;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
+		map_type = SFM_MAP_UNI_RSVD;
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
+		map_type = SFU_MAP_UNI_RSVD;
+	else
+		map_type = NO_MAP_UNI_RSVD;
+
+	return map_type;
+}
+
+/* Convert character using the SFU - "Services for Unix" remapping range */
+static bool
+convert_sfu_char(const __u16 src_char, char *target)
+{
+	/*
+	 * BB: Cannot handle remapping UNI_SLASH until all the calls to
+	 *     build_path_from_dentry are modified, as they use slash as
+	 *     separator.
+	 */
+	switch (src_char) {
+	case UNI_COLON:
+		*target = ':';
+		break;
+	case UNI_ASTERISK:
+		*target = '*';
+		break;
+	case UNI_QUESTION:
+		*target = '?';
+		break;
+	case UNI_PIPE:
+		*target = '|';
+		break;
+	case UNI_GRTRTHAN:
+		*target = '>';
+		break;
+	case UNI_LESSTHAN:
+		*target = '<';
+		break;
+	default:
+		return false;
+	}
+	return true;
+}
+
+/* Convert character using the SFM - "Services for Mac" remapping range */
+static bool
+convert_sfm_char(const __u16 src_char, char *target)
+{
+	switch (src_char) {
+	case SFM_COLON:
+		*target = ':';
+		break;
+	case SFM_ASTERISK:
+		*target = '*';
+		break;
+	case SFM_QUESTION:
+		*target = '?';
+		break;
+	case SFM_PIPE:
+		*target = '|';
+		break;
+	case SFM_GRTRTHAN:
+		*target = '>';
+		break;
+	case SFM_LESSTHAN:
+		*target = '<';
+		break;
+	case SFM_SLASH:
+		*target = '\\';
+		break;
+	default:
+		return false;
+	}
+	return true;
+}
+
+
+/*
+ * cifs_mapchar - convert a host-endian char to proper char in codepage
+ * @target - where converted character should be copied
+ * @src_char - 2 byte host-endian source character
+ * @cp - codepage to which character should be converted
+ * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2?
+ *
+ * This function handles the conversion of a single character. It is the
+ * responsibility of the caller to ensure that the target buffer is large
+ * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
+ */
+static int
+cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
+	     int maptype)
+{
+	int len = 1;
+	__u16 src_char;
+
+	src_char = *from;
+
+	if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target))
+		return len;
+	else if ((maptype == SFU_MAP_UNI_RSVD) &&
+		  convert_sfu_char(src_char, target))
+		return len;
+
+	/* if character not one of seven in special remap set */
+	len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
+	if (len <= 0)
+		goto surrogate_pair;
+
+	return len;
+
+surrogate_pair:
+	/* convert SURROGATE_PAIR and IVS */
+	if (strcmp(cp->charset, "utf8"))
+		goto unknown;
+	len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
+	if (len <= 0)
+		goto unknown;
+	return len;
+
+unknown:
+	*target = '?';
+	len = 1;
+	return len;
+}
+
+/*
+ * cifs_from_utf16 - convert utf16le string to local charset
+ * @to - destination buffer
+ * @from - source buffer
+ * @tolen - destination buffer size (in bytes)
+ * @fromlen - source buffer size (in bytes)
+ * @codepage - codepage to which characters should be converted
+ * @mapchar - should characters be remapped according to the mapchars option?
+ *
+ * Convert a little-endian utf16le string (as sent by the server) to a string
+ * in the provided codepage. The tolen and fromlen parameters are to ensure
+ * that the code doesn't walk off of the end of the buffer (which is always
+ * a danger if the alignment of the source buffer is off). The destination
+ * string is always properly null terminated and fits in the destination
+ * buffer. Returns the length of the destination string in bytes (including
+ * null terminator).
+ *
+ * Note that some windows versions actually send multiword UTF-16 characters
+ * instead of straight UTF16-2. The linux nls routines however aren't able to
+ * deal with those characters properly. In the event that we get some of
+ * those characters, they won't be translated properly.
+ */
+int
+cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+		const struct nls_table *codepage, int map_type)
+{
+	int i, charlen, safelen;
+	int outlen = 0;
+	int nullsize = nls_nullsize(codepage);
+	int fromwords = fromlen / 2;
+	char tmp[NLS_MAX_CHARSET_SIZE];
+	__u16 ftmp[3];		/* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
+
+	/*
+	 * because the chars can be of varying widths, we need to take care
+	 * not to overflow the destination buffer when we get close to the
+	 * end of it. Until we get to this offset, we don't need to check
+	 * for overflow however.
+	 */
+	safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
+
+	for (i = 0; i < fromwords; i++) {
+		ftmp[0] = get_unaligned_le16(&from[i]);
+		if (ftmp[0] == 0)
+			break;
+		if (i + 1 < fromwords)
+			ftmp[1] = get_unaligned_le16(&from[i + 1]);
+		else
+			ftmp[1] = 0;
+		if (i + 2 < fromwords)
+			ftmp[2] = get_unaligned_le16(&from[i + 2]);
+		else
+			ftmp[2] = 0;
+
+		/*
+		 * check to see if converting this character might make the
+		 * conversion bleed into the null terminator
+		 */
+		if (outlen >= safelen) {
+			charlen = cifs_mapchar(tmp, ftmp, codepage, map_type);
+			if ((outlen + charlen) > (tolen - nullsize))
+				break;
+		}
+
+		/* put converted char into 'to' buffer */
+		charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type);
+		outlen += charlen;
+
+		/* charlen (=bytes of UTF-8 for 1 character)
+		 * 4bytes UTF-8(surrogate pair) is charlen=4
+		 *   (4bytes UTF-16 code)
+		 * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
+		 *   (2 UTF-8 pairs divided to 2 UTF-16 pairs) */
+		if (charlen == 4)
+			i++;
+		else if (charlen >= 5)
+			/* 5-6bytes UTF-8 */
+			i += 2;
+	}
+
+	/* properly null-terminate string */
+	for (i = 0; i < nullsize; i++)
+		to[outlen++] = 0;
+
+	return outlen;
+}
+
+/*
+ * NAME:	cifs_strtoUTF16()
+ *
+ * FUNCTION:	Convert character string to unicode string
+ *
+ */
+int
+cifs_strtoUTF16(__le16 *to, const char *from, int len,
+	      const struct nls_table *codepage)
+{
+	int charlen;
+	int i;
+	wchar_t wchar_to; /* needed to quiet sparse */
+
+	/* special case for utf8 to handle no plane0 chars */
+	if (!strcmp(codepage->charset, "utf8")) {
+		/*
+		 * convert utf8 -> utf16, we assume we have enough space
+		 * as caller should have assumed conversion does not overflow
+		 * in destination len is length in wchar_t units (16bits)
+		 */
+		i  = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
+				       (wchar_t *) to, len);
+
+		/* if success terminate and exit */
+		if (i >= 0)
+			goto success;
+		/*
+		 * if fails fall back to UCS encoding as this
+		 * function should not return negative values
+		 * currently can fail only if source contains
+		 * invalid encoded characters
+		 */
+	}
+
+	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
+		charlen = codepage->char2uni(from, len, &wchar_to);
+		if (charlen < 1) {
+			cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n",
+				 *from, charlen);
+			/* A question mark */
+			wchar_to = 0x003f;
+			charlen = 1;
+		}
+		put_unaligned_le16(wchar_to, &to[i]);
+	}
+
+success:
+	put_unaligned_le16(0, &to[i]);
+	return i;
+}
+
+/*
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
+ * @maxbytes - don't go past this many bytes of input string
+ * @codepage - destination codepage
+ *
+ * Walk a utf16le string and return the number of bytes that the string will
+ * be after being converted to the given charset, not including any null
+ * termination required. Don't walk past maxbytes in the source buffer.
+ */
+int
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
+		const struct nls_table *codepage)
+{
+	int i;
+	int charlen, outlen = 0;
+	int maxwords = maxbytes / 2;
+	char tmp[NLS_MAX_CHARSET_SIZE];
+	__u16 ftmp[3];
+
+	for (i = 0; i < maxwords; i++) {
+		ftmp[0] = get_unaligned_le16(&from[i]);
+		if (ftmp[0] == 0)
+			break;
+		if (i + 1 < maxwords)
+			ftmp[1] = get_unaligned_le16(&from[i + 1]);
+		else
+			ftmp[1] = 0;
+		if (i + 2 < maxwords)
+			ftmp[2] = get_unaligned_le16(&from[i + 2]);
+		else
+			ftmp[2] = 0;
+
+		charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD);
+		outlen += charlen;
+	}
+
+	return outlen;
+}
+
+/*
+ * cifs_strndup_from_utf16 - copy a string from wire format to the local
+ * codepage
+ * @src - source string
+ * @maxlen - don't walk past this many bytes in the source string
+ * @is_unicode - is this a unicode string?
+ * @codepage - destination codepage
+ *
+ * Take a string given by the server, convert it to the local codepage and
+ * put it in a new buffer. Returns a pointer to the new string or NULL on
+ * error.
+ */
+char *
+cifs_strndup_from_utf16(const char *src, const int maxlen,
+			const bool is_unicode, const struct nls_table *codepage)
+{
+	int len;
+	char *dst;
+
+	if (is_unicode) {
+		len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
+		len += nls_nullsize(codepage);
+		dst = kmalloc(len, GFP_KERNEL);
+		if (!dst)
+			return NULL;
+		cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
+			       NO_MAP_UNI_RSVD);
+	} else {
+		len = strnlen(src, maxlen);
+		len++;
+		dst = kmalloc(len, GFP_KERNEL);
+		if (!dst)
+			return NULL;
+		strlcpy(dst, src, len);
+	}
+
+	return dst;
+}
+
+static __le16 convert_to_sfu_char(char src_char)
+{
+	__le16 dest_char;
+
+	switch (src_char) {
+	case ':':
+		dest_char = cpu_to_le16(UNI_COLON);
+		break;
+	case '*':
+		dest_char = cpu_to_le16(UNI_ASTERISK);
+		break;
+	case '?':
+		dest_char = cpu_to_le16(UNI_QUESTION);
+		break;
+	case '<':
+		dest_char = cpu_to_le16(UNI_LESSTHAN);
+		break;
+	case '>':
+		dest_char = cpu_to_le16(UNI_GRTRTHAN);
+		break;
+	case '|':
+		dest_char = cpu_to_le16(UNI_PIPE);
+		break;
+	default:
+		dest_char = 0;
+	}
+
+	return dest_char;
+}
+
+static __le16 convert_to_sfm_char(char src_char)
+{
+	__le16 dest_char;
+
+	switch (src_char) {
+	case ':':
+		dest_char = cpu_to_le16(SFM_COLON);
+		break;
+	case '*':
+		dest_char = cpu_to_le16(SFM_ASTERISK);
+		break;
+	case '?':
+		dest_char = cpu_to_le16(SFM_QUESTION);
+		break;
+	case '<':
+		dest_char = cpu_to_le16(SFM_LESSTHAN);
+		break;
+	case '>':
+		dest_char = cpu_to_le16(SFM_GRTRTHAN);
+		break;
+	case '|':
+		dest_char = cpu_to_le16(SFM_PIPE);
+		break;
+	default:
+		dest_char = 0;
+	}
+
+	return dest_char;
+}
+
+/*
+ * Convert 16 bit Unicode pathname to wire format from string in current code
+ * page. Conversion may involve remapping up the six characters that are
+ * only legal in POSIX-like OS (if they are present in the string). Path
+ * names are little endian 16 bit Unicode on the wire
+ */
+int
+cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
+		 const struct nls_table *cp, int map_chars)
+{
+	int i, charlen;
+	int j = 0;
+	char src_char;
+	__le16 dst_char;
+	wchar_t tmp;
+	wchar_t *wchar_to;	/* UTF-16 */
+	int ret;
+	unicode_t u;
+
+	if (map_chars == NO_MAP_UNI_RSVD)
+		return cifs_strtoUTF16(target, source, PATH_MAX, cp);
+
+	wchar_to = kzalloc(6, GFP_KERNEL);
+
+	for (i = 0; i < srclen; j++) {
+		src_char = source[i];
+		charlen = 1;
+
+		/* check if end of string */
+		if (src_char == 0)
+			goto ctoUTF16_out;
+
+		/* see if we must remap this char */
+		if (map_chars == SFU_MAP_UNI_RSVD)
+			dst_char = convert_to_sfu_char(src_char);
+		else if (map_chars == SFM_MAP_UNI_RSVD)
+			dst_char = convert_to_sfm_char(src_char);
+		else
+			dst_char = 0;
+		/*
+		 * FIXME: We can not handle remapping backslash (UNI_SLASH)
+		 * until all the calls to build_path_from_dentry are modified,
+		 * as they use backslash as separator.
+		 */
+		if (dst_char == 0) {
+			charlen = cp->char2uni(source + i, srclen - i, &tmp);
+			dst_char = cpu_to_le16(tmp);
+
+			/*
+			 * if no match, use question mark, which at least in
+			 * some cases serves as wild card
+			 */
+			if (charlen > 0)
+				goto ctoUTF16;
+
+			/* convert SURROGATE_PAIR */
+			if (strcmp(cp->charset, "utf8") || !wchar_to)
+				goto unknown;
+			if (*(source + i) & 0x80) {
+				charlen = utf8_to_utf32(source + i, 6, &u);
+				if (charlen < 0)
+					goto unknown;
+			} else
+				goto unknown;
+			ret  = utf8s_to_utf16s(source + i, charlen,
+					       UTF16_LITTLE_ENDIAN,
+					       wchar_to, 6);
+			if (ret < 0)
+				goto unknown;
+
+			i += charlen;
+			dst_char = cpu_to_le16(*wchar_to);
+			if (charlen <= 3)
+				/* 1-3bytes UTF-8 to 2bytes UTF-16 */
+				put_unaligned(dst_char, &target[j]);
+			else if (charlen == 4) {
+				/* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
+				 * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
+				 *   (charlen=3+4 or 4+4) */
+				put_unaligned(dst_char, &target[j]);
+				dst_char = cpu_to_le16(*(wchar_to + 1));
+				j++;
+				put_unaligned(dst_char, &target[j]);
+			} else if (charlen >= 5) {
+				/* 5-6bytes UTF-8 to 6bytes UTF-16 */
+				put_unaligned(dst_char, &target[j]);
+				dst_char = cpu_to_le16(*(wchar_to + 1));
+				j++;
+				put_unaligned(dst_char, &target[j]);
+				dst_char = cpu_to_le16(*(wchar_to + 2));
+				j++;
+				put_unaligned(dst_char, &target[j]);
+			}
+			continue;
+
+unknown:
+			dst_char = cpu_to_le16(0x003f);
+			charlen = 1;
+		}
+
+ctoUTF16:
+		/*
+		 * character may take more than one byte in the source string,
+		 * but will take exactly two bytes in the target string
+		 */
+		i += charlen;
+		put_unaligned(dst_char, &target[j]);
+	}
+
+ctoUTF16_out:
+	put_unaligned(0, &target[j]); /* Null terminate target unicode string */
+	kfree(wchar_to);
+	return j;
+}
+
+#ifdef CONFIG_CIFS_SMB2
+/*
+ * cifs_local_to_utf16_bytes - how long will a string be after conversion?
+ * @from - pointer to input string
+ * @maxbytes - don't go past this many bytes of input string
+ * @codepage - source codepage
+ *
+ * Walk a string and return the number of bytes that the string will
+ * be after being converted to the given charset, not including any null
+ * termination required. Don't walk past maxbytes in the source buffer.
+ */
+
+static int
+cifs_local_to_utf16_bytes(const char *from, int len,
+			  const struct nls_table *codepage)
+{
+	int charlen;
+	int i;
+	wchar_t wchar_to;
+
+	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
+		charlen = codepage->char2uni(from, len, &wchar_to);
+		/* Failed conversion defaults to a question mark */
+		if (charlen < 1)
+			charlen = 1;
+	}
+	return 2 * i; /* UTF16 characters are two bytes */
+}
+
+/*
+ * cifs_strndup_to_utf16 - copy a string to wire format from the local codepage
+ * @src - source string
+ * @maxlen - don't walk past this many bytes in the source string
+ * @utf16_len - the length of the allocated string in bytes (including null)
+ * @cp - source codepage
+ * @remap - map special chars
+ *
+ * Take a string convert it from the local codepage to UTF16 and
+ * put it in a new buffer. Returns a pointer to the new string or NULL on
+ * error.
+ */
+__le16 *
+cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
+		      const struct nls_table *cp, int remap)
+{
+	int len;
+	__le16 *dst;
+
+	len = cifs_local_to_utf16_bytes(src, maxlen, cp);
+	len += 2; /* NULL */
+	dst = kmalloc(len, GFP_KERNEL);
+	if (!dst) {
+		*utf16_len = 0;
+		return NULL;
+	}
+	cifsConvertToUTF16(dst, src, strlen(src), cp, remap);
+	*utf16_len = len;
+	return dst;
+}
+#endif /* CONFIG_CIFS_SMB2 */
diff --git a/kernel/fs/cifs/cifs_unicode.h b/kernel/fs/cifs/cifs_unicode.h
new file mode 100644
index 000000000..bdc52cb9a
--- /dev/null
+++ b/kernel/fs/cifs/cifs_unicode.h
@@ -0,0 +1,418 @@
+/*
+ * cifs_unicode:  Unicode kernel case support
+ *
+ * Function:
+ *     Convert a unicode character to upper or lower case using
+ *     compressed tables.
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000,2009
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * Notes:
+ *     These APIs are based on the C library functions.  The semantics
+ *     should match the C functions but with expanded size operands.
+ *
+ *     The upper/lower functions are based on a table created by mkupr.
+ *     This is a compressed table of upper and lower case conversion.
+ *
+ */
+#ifndef _CIFS_UNICODE_H
+#define _CIFS_UNICODE_H
+
+#include <asm/byteorder.h>
+#include <linux/types.h>
+#include <linux/nls.h>
+
+#define  UNIUPR_NOLOWER		/* Example to not expand lower case tables */
+
+/*
+ * Windows maps these to the user defined 16 bit Unicode range since they are
+ * reserved symbols (along with \ and /), otherwise illegal to store
+ * in filenames in NTFS
+ */
+#define UNI_ASTERISK    (__u16) ('*' + 0xF000)
+#define UNI_QUESTION    (__u16) ('?' + 0xF000)
+#define UNI_COLON       (__u16) (':' + 0xF000)
+#define UNI_GRTRTHAN    (__u16) ('>' + 0xF000)
+#define UNI_LESSTHAN    (__u16) ('<' + 0xF000)
+#define UNI_PIPE        (__u16) ('|' + 0xF000)
+#define UNI_SLASH       (__u16) ('\\' + 0xF000)
+
+/*
+ * Macs use an older "SFM" mapping of the symbols above. Fortunately it does
+ * not conflict (although almost does) with the mapping above.
+ */
+
+#define SFM_ASTERISK    ((__u16) 0xF021)
+#define SFM_QUESTION    ((__u16) 0xF025)
+#define SFM_COLON       ((__u16) 0xF022)
+#define SFM_GRTRTHAN    ((__u16) 0xF024)
+#define SFM_LESSTHAN    ((__u16) 0xF023)
+#define SFM_PIPE        ((__u16) 0xF027)
+#define SFM_SLASH       ((__u16) 0xF026)
+
+/*
+ * Mapping mechanism to use when one of the seven reserved characters is
+ * encountered.  We can only map using one of the mechanisms at a time
+ * since otherwise readdir could return directory entries which we would
+ * not be able to open
+ *
+ * NO_MAP_UNI_RSVD  = do not perform any remapping of the character
+ * SFM_MAP_UNI_RSVD = map reserved characters using SFM scheme (MAC compatible)
+ * SFU_MAP_UNI_RSVD = map reserved characters ala SFU ("mapchars" option)
+ *
+ */
+#define NO_MAP_UNI_RSVD		0
+#define SFM_MAP_UNI_RSVD	1
+#define SFU_MAP_UNI_RSVD	2
+
+/* Just define what we want from uniupr.h.  We don't want to define the tables
+ * in each source file.
+ */
+#ifndef	UNICASERANGE_DEFINED
+struct UniCaseRange {
+	wchar_t start;
+	wchar_t end;
+	signed char *table;
+};
+#endif				/* UNICASERANGE_DEFINED */
+
+#ifndef UNIUPR_NOUPPER
+extern signed char CifsUniUpperTable[512];
+extern const struct UniCaseRange CifsUniUpperRange[];
+#endif				/* UNIUPR_NOUPPER */
+
+#ifndef UNIUPR_NOLOWER
+extern signed char CifsUniLowerTable[512];
+extern const struct UniCaseRange CifsUniLowerRange[];
+#endif				/* UNIUPR_NOLOWER */
+
+#ifdef __KERNEL__
+int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+		    const struct nls_table *cp, int map_type);
+int cifs_utf16_bytes(const __le16 *from, int maxbytes,
+		     const struct nls_table *codepage);
+int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
+char *cifs_strndup_from_utf16(const char *src, const int maxlen,
+			      const bool is_unicode,
+			      const struct nls_table *codepage);
+extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
+			      const struct nls_table *cp, int mapChars);
+extern int cifs_remap(struct cifs_sb_info *cifs_sb);
+#ifdef CONFIG_CIFS_SMB2
+extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
+				     int *utf16_len, const struct nls_table *cp,
+				     int remap);
+#endif /* CONFIG_CIFS_SMB2 */
+#endif
+
+wchar_t cifs_toupper(wchar_t in);
+
+/*
+ * UniStrcat:  Concatenate the second string to the first
+ *
+ * Returns:
+ *     Address of the first string
+ */
+static inline wchar_t *
+UniStrcat(wchar_t *ucs1, const wchar_t *ucs2)
+{
+	wchar_t *anchor = ucs1;	/* save a pointer to start of ucs1 */
+
+	while (*ucs1++) ;	/* To end of first string */
+	ucs1--;			/* Return to the null */
+	while ((*ucs1++ = *ucs2++)) ;	/* copy string 2 over */
+	return anchor;
+}
+
+/*
+ * UniStrchr:  Find a character in a string
+ *
+ * Returns:
+ *     Address of first occurrence of character in string
+ *     or NULL if the character is not in the string
+ */
+static inline wchar_t *
+UniStrchr(const wchar_t *ucs, wchar_t uc)
+{
+	while ((*ucs != uc) && *ucs)
+		ucs++;
+
+	if (*ucs == uc)
+		return (wchar_t *) ucs;
+	return NULL;
+}
+
+/*
+ * UniStrcmp:  Compare two strings
+ *
+ * Returns:
+ *     < 0:  First string is less than second
+ *     = 0:  Strings are equal
+ *     > 0:  First string is greater than second
+ */
+static inline int
+UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2)
+{
+	while ((*ucs1 == *ucs2) && *ucs1) {
+		ucs1++;
+		ucs2++;
+	}
+	return (int) *ucs1 - (int) *ucs2;
+}
+
+/*
+ * UniStrcpy:  Copy a string
+ */
+static inline wchar_t *
+UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2)
+{
+	wchar_t *anchor = ucs1;	/* save the start of result string */
+
+	while ((*ucs1++ = *ucs2++)) ;
+	return anchor;
+}
+
+/*
+ * UniStrlen:  Return the length of a string (in 16 bit Unicode chars not bytes)
+ */
+static inline size_t
+UniStrlen(const wchar_t *ucs1)
+{
+	int i = 0;
+
+	while (*ucs1++)
+		i++;
+	return i;
+}
+
+/*
+ * UniStrnlen:  Return the length (in 16 bit Unicode chars not bytes) of a
+ *		string (length limited)
+ */
+static inline size_t
+UniStrnlen(const wchar_t *ucs1, int maxlen)
+{
+	int i = 0;
+
+	while (*ucs1++) {
+		i++;
+		if (i >= maxlen)
+			break;
+	}
+	return i;
+}
+
+/*
+ * UniStrncat:  Concatenate length limited string
+ */
+static inline wchar_t *
+UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+	wchar_t *anchor = ucs1;	/* save pointer to string 1 */
+
+	while (*ucs1++) ;
+	ucs1--;			/* point to null terminator of s1 */
+	while (n-- && (*ucs1 = *ucs2)) {	/* copy s2 after s1 */
+		ucs1++;
+		ucs2++;
+	}
+	*ucs1 = 0;		/* Null terminate the result */
+	return (anchor);
+}
+
+/*
+ * UniStrncmp:  Compare length limited string
+ */
+static inline int
+UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+	if (!n)
+		return 0;	/* Null strings are equal */
+	while ((*ucs1 == *ucs2) && *ucs1 && --n) {
+		ucs1++;
+		ucs2++;
+	}
+	return (int) *ucs1 - (int) *ucs2;
+}
+
+/*
+ * UniStrncmp_le:  Compare length limited string - native to little-endian
+ */
+static inline int
+UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+	if (!n)
+		return 0;	/* Null strings are equal */
+	while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) {
+		ucs1++;
+		ucs2++;
+	}
+	return (int) *ucs1 - (int) __le16_to_cpu(*ucs2);
+}
+
+/*
+ * UniStrncpy:  Copy length limited string with pad
+ */
+static inline wchar_t *
+UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+	wchar_t *anchor = ucs1;
+
+	while (n-- && *ucs2)	/* Copy the strings */
+		*ucs1++ = *ucs2++;
+
+	n++;
+	while (n--)		/* Pad with nulls */
+		*ucs1++ = 0;
+	return anchor;
+}
+
+/*
+ * UniStrncpy_le:  Copy length limited string with pad to little-endian
+ */
+static inline wchar_t *
+UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+	wchar_t *anchor = ucs1;
+
+	while (n-- && *ucs2)	/* Copy the strings */
+		*ucs1++ = __le16_to_cpu(*ucs2++);
+
+	n++;
+	while (n--)		/* Pad with nulls */
+		*ucs1++ = 0;
+	return anchor;
+}
+
+/*
+ * UniStrstr:  Find a string in a string
+ *
+ * Returns:
+ *     Address of first match found
+ *     NULL if no matching string is found
+ */
+static inline wchar_t *
+UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2)
+{
+	const wchar_t *anchor1 = ucs1;
+	const wchar_t *anchor2 = ucs2;
+
+	while (*ucs1) {
+		if (*ucs1 == *ucs2) {
+			/* Partial match found */
+			ucs1++;
+			ucs2++;
+		} else {
+			if (!*ucs2)	/* Match found */
+				return (wchar_t *) anchor1;
+			ucs1 = ++anchor1;	/* No match */
+			ucs2 = anchor2;
+		}
+	}
+
+	if (!*ucs2)		/* Both end together */
+		return (wchar_t *) anchor1;	/* Match found */
+	return NULL;		/* No match */
+}
+
+#ifndef UNIUPR_NOUPPER
+/*
+ * UniToupper:  Convert a unicode character to upper case
+ */
+static inline wchar_t
+UniToupper(register wchar_t uc)
+{
+	register const struct UniCaseRange *rp;
+
+	if (uc < sizeof(CifsUniUpperTable)) {
+		/* Latin characters */
+		return uc + CifsUniUpperTable[uc];	/* Use base tables */
+	} else {
+		rp = CifsUniUpperRange;	/* Use range tables */
+		while (rp->start) {
+			if (uc < rp->start)	/* Before start of range */
+				return uc;	/* Uppercase = input */
+			if (uc <= rp->end)	/* In range */
+				return uc + rp->table[uc - rp->start];
+			rp++;	/* Try next range */
+		}
+	}
+	return uc;		/* Past last range */
+}
+
+/*
+ * UniStrupr:  Upper case a unicode string
+ */
+static inline __le16 *
+UniStrupr(register __le16 *upin)
+{
+	register __le16 *up;
+
+	up = upin;
+	while (*up) {		/* For all characters */
+		*up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
+		up++;
+	}
+	return upin;		/* Return input pointer */
+}
+#endif				/* UNIUPR_NOUPPER */
+
+#ifndef UNIUPR_NOLOWER
+/*
+ * UniTolower:  Convert a unicode character to lower case
+ */
+static inline wchar_t
+UniTolower(register wchar_t uc)
+{
+	register const struct UniCaseRange *rp;
+
+	if (uc < sizeof(CifsUniLowerTable)) {
+		/* Latin characters */
+		return uc + CifsUniLowerTable[uc];	/* Use base tables */
+	} else {
+		rp = CifsUniLowerRange;	/* Use range tables */
+		while (rp->start) {
+			if (uc < rp->start)	/* Before start of range */
+				return uc;	/* Uppercase = input */
+			if (uc <= rp->end)	/* In range */
+				return uc + rp->table[uc - rp->start];
+			rp++;	/* Try next range */
+		}
+	}
+	return uc;		/* Past last range */
+}
+
+/*
+ * UniStrlwr:  Lower case a unicode string
+ */
+static inline wchar_t *
+UniStrlwr(register wchar_t *upin)
+{
+	register wchar_t *up;
+
+	up = upin;
+	while (*up) {		/* For all characters */
+		*up = UniTolower(*up);
+		up++;
+	}
+	return upin;		/* Return input pointer */
+}
+
+#endif
+
+#endif /* _CIFS_UNICODE_H */
diff --git a/kernel/fs/cifs/cifs_uniupr.h b/kernel/fs/cifs/cifs_uniupr.h
new file mode 100644
index 000000000..0ac7c5a86
--- /dev/null
+++ b/kernel/fs/cifs/cifs_uniupr.h
@@ -0,0 +1,253 @@
+/*
+ *   Copyright (c) International Business Machines  Corp., 2000,2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * uniupr.h - Unicode compressed case ranges
+ *
+*/
+
+#ifndef UNIUPR_NOUPPER
+/*
+ * Latin upper case
+ */
+signed char CifsUniUpperTable[512] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 000-00f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 010-01f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 020-02f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 030-03f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 040-04f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 050-05f */
+	0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,	/* 060-06f */
+	-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,	/* 070-07f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 080-08f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 090-09f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0a0-0af */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0b0-0bf */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0c0-0cf */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0d0-0df */
+	-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,	/* 0e0-0ef */
+	-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,	/* 0f0-0ff */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 100-10f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 110-11f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 120-12f */
+	0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,	/* 130-13f */
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1,	/* 140-14f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 150-15f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 160-16f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,	/* 170-17f */
+	0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0,	/* 180-18f */
+	0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0,	/* 190-19f */
+	0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0,	/* 1a0-1af */
+	-1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0,	/* 1b0-1bf */
+	0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0,	/* 1c0-1cf */
+	-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e0-1ef */
+	0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1f0-1ff */
+};
+
+/* Upper case range - Greek */
+static signed char UniCaseRangeU03a0[47] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37,	/* 3a0-3af */
+	0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,	/* 3b0-3bf */
+	-32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64,
+	-63, -63,
+};
+
+/* Upper case range - Cyrillic */
+static signed char UniCaseRangeU0430[48] = {
+	-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,	/* 430-43f */
+	-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,	/* 440-44f */
+	0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, 0, -80, -80,	/* 450-45f */
+};
+
+/* Upper case range - Extended cyrillic */
+static signed char UniCaseRangeU0490[61] = {
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 490-49f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 4a0-4af */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 4b0-4bf */
+	0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1,
+};
+
+/* Upper case range - Extended latin and greek */
+static signed char UniCaseRangeU1e00[509] = {
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e00-1e0f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e10-1e1f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e20-1e2f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e30-1e3f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e40-1e4f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e50-1e5f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e60-1e6f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e70-1e7f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1e80-1e8f */
+	0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1,	/* 1e90-1e9f */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1ea0-1eaf */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1eb0-1ebf */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1ec0-1ecf */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1ed0-1edf */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,	/* 1ee0-1eef */
+	0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0,	/* 1ef0-1eff */
+	8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f00-1f0f */
+	8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f10-1f1f */
+	8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f20-1f2f */
+	8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f30-1f3f */
+	8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f40-1f4f */
+	0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f50-1f5f */
+	8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f60-1f6f */
+	74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112, 126, 126, 0, 0,	/* 1f70-1f7f */
+	8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f80-1f8f */
+	8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f90-1f9f */
+	8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1fa0-1faf */
+	8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1fb0-1fbf */
+	0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1fc0-1fcf */
+	8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1fd0-1fdf */
+	8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1fe0-1fef */
+	0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* Upper case range - Wide latin */
+static signed char UniCaseRangeUff40[27] = {
+	0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,	/* ff40-ff4f */
+	-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+};
+
+/*
+ * Upper Case Range
+ */
+const struct UniCaseRange CifsUniUpperRange[] = {
+	{0x03a0, 0x03ce, UniCaseRangeU03a0},
+	{0x0430, 0x045f, UniCaseRangeU0430},
+	{0x0490, 0x04cc, UniCaseRangeU0490},
+	{0x1e00, 0x1ffc, UniCaseRangeU1e00},
+	{0xff40, 0xff5a, UniCaseRangeUff40},
+	{0}
+};
+#endif
+
+#ifndef UNIUPR_NOLOWER
+/*
+ * Latin lower case
+ */
+signed char CifsUniLowerTable[512] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 000-00f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 010-01f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 020-02f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 030-03f */
+	0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,	/* 040-04f */
+	32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,	/* 050-05f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 060-06f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 070-07f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 080-08f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 090-09f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0a0-0af */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0b0-0bf */
+	32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,	/* 0c0-0cf */
+	32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 0,	/* 0d0-0df */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0e0-0ef */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0f0-0ff */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 100-10f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 110-11f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 120-12f */
+	0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,	/* 130-13f */
+	0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,	/* 140-14f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 150-15f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 160-16f */
+	1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, 0,	/* 170-17f */
+	0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, 0,	/* 180-18f */
+	0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,	/* 190-19f */
+	1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,	/* 1a0-1af */
+	0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,	/* 1b0-1bf */
+	0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1,	/* 1c0-1cf */
+	0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,	/* 1d0-1df */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e0-1ef */
+	0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1f0-1ff */
+};
+
+/* Lower case range - Greek */
+static signed char UniCaseRangeL0380[44] = {
+	0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63,	/* 380-38f */
+	0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,	/* 390-39f */
+	32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+};
+
+/* Lower case range - Cyrillic */
+static signed char UniCaseRangeL0400[48] = {
+	0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 0, 80, 80,	/* 400-40f */
+	32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,	/* 410-41f */
+	32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,	/* 420-42f */
+};
+
+/* Lower case range - Extended cyrillic */
+static signed char UniCaseRangeL0490[60] = {
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 490-49f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 4a0-4af */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 4b0-4bf */
+	0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
+};
+
+/* Lower case range - Extended latin and greek */
+static signed char UniCaseRangeL1e00[504] = {
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e00-1e0f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e10-1e1f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e20-1e2f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e30-1e3f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e40-1e4f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e50-1e5f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e60-1e6f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e70-1e7f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1e80-1e8f */
+	1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,	/* 1e90-1e9f */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1ea0-1eaf */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1eb0-1ebf */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1ec0-1ecf */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1ed0-1edf */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,	/* 1ee0-1eef */
+	1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,	/* 1ef0-1eff */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8,	/* 1f00-1f0f */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0,	/* 1f10-1f1f */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8,	/* 1f20-1f2f */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8,	/* 1f30-1f3f */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0,	/* 1f40-1f4f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8,	/* 1f50-1f5f */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8,	/* 1f60-1f6f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1f70-1f7f */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8,	/* 1f80-1f8f */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8,	/* 1f90-1f9f */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8,	/* 1fa0-1faf */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0,	/* 1fb0-1fbf */
+	0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, 0, 0,	/* 1fc0-1fcf */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0,	/* 1fd0-1fdf */
+	0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, 0, 0,	/* 1fe0-1fef */
+	0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* Lower case range - Wide latin */
+static signed char UniCaseRangeLff20[27] = {
+	0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,	/* ff20-ff2f */
+	32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+};
+
+/*
+ * Lower Case Range
+ */
+const struct UniCaseRange CifsUniLowerRange[] = {
+	{0x0380, 0x03ab, UniCaseRangeL0380},
+	{0x0400, 0x042f, UniCaseRangeL0400},
+	{0x0490, 0x04cb, UniCaseRangeL0490},
+	{0x1e00, 0x1ff7, UniCaseRangeL1e00},
+	{0xff20, 0xff3a, UniCaseRangeLff20},
+	{0}
+};
+#endif
diff --git a/kernel/fs/cifs/cifsacl.c b/kernel/fs/cifs/cifsacl.c
new file mode 100644
index 000000000..1ea780bc6
--- /dev/null
+++ b/kernel/fs/cifs/cifsacl.c
@@ -0,0 +1,1119 @@
+/*
+ *   fs/cifs/cifsacl.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2007,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   Contains the routines for mapping CIFS/NTFS ACLs
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsacl.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+
+/* security id for everyone/world system group */
+static const struct cifs_sid sid_everyone = {
+	1, 1, {0, 0, 0, 0, 0, 1}, {0} };
+/* security id for Authenticated Users system group */
+static const struct cifs_sid sid_authusers = {
+	1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
+/* group users */
+static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
+
+static const struct cred *root_cred;
+
+static int
+cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
+{
+	char *payload;
+
+	/*
+	 * If the payload is less than or equal to the size of a pointer, then
+	 * an allocation here is wasteful. Just copy the data directly to the
+	 * payload.value union member instead.
+	 *
+	 * With this however, you must check the datalen before trying to
+	 * dereference payload.data!
+	 */
+	if (prep->datalen <= sizeof(key->payload)) {
+		key->payload.value = 0;
+		memcpy(&key->payload.value, prep->data, prep->datalen);
+		key->datalen = prep->datalen;
+		return 0;
+	}
+	payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+	if (!payload)
+		return -ENOMEM;
+
+	key->payload.data = payload;
+	key->datalen = prep->datalen;
+	return 0;
+}
+
+static inline void
+cifs_idmap_key_destroy(struct key *key)
+{
+	if (key->datalen > sizeof(key->payload))
+		kfree(key->payload.data);
+}
+
+static struct key_type cifs_idmap_key_type = {
+	.name        = "cifs.idmap",
+	.instantiate = cifs_idmap_key_instantiate,
+	.destroy     = cifs_idmap_key_destroy,
+	.describe    = user_describe,
+};
+
+static char *
+sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
+{
+	int i, len;
+	unsigned int saval;
+	char *sidstr, *strptr;
+	unsigned long long id_auth_val;
+
+	/* 3 bytes for prefix */
+	sidstr = kmalloc(3 + SID_STRING_BASE_SIZE +
+			 (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth),
+			 GFP_KERNEL);
+	if (!sidstr)
+		return sidstr;
+
+	strptr = sidstr;
+	len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g',
+			sidptr->revision);
+	strptr += len;
+
+	/* The authority field is a single 48-bit number */
+	id_auth_val = (unsigned long long)sidptr->authority[5];
+	id_auth_val |= (unsigned long long)sidptr->authority[4] << 8;
+	id_auth_val |= (unsigned long long)sidptr->authority[3] << 16;
+	id_auth_val |= (unsigned long long)sidptr->authority[2] << 24;
+	id_auth_val |= (unsigned long long)sidptr->authority[1] << 32;
+	id_auth_val |= (unsigned long long)sidptr->authority[0] << 48;
+
+	/*
+	 * MS-DTYP states that if the authority is >= 2^32, then it should be
+	 * expressed as a hex value.
+	 */
+	if (id_auth_val <= UINT_MAX)
+		len = sprintf(strptr, "-%llu", id_auth_val);
+	else
+		len = sprintf(strptr, "-0x%llx", id_auth_val);
+
+	strptr += len;
+
+	for (i = 0; i < sidptr->num_subauth; ++i) {
+		saval = le32_to_cpu(sidptr->sub_auth[i]);
+		len = sprintf(strptr, "-%u", saval);
+		strptr += len;
+	}
+
+	return sidstr;
+}
+
+/*
+ * if the two SIDs (roughly equivalent to a UUID for a user or group) are
+ * the same returns zero, if they do not match returns non-zero.
+ */
+static int
+compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
+{
+	int i;
+	int num_subauth, num_sat, num_saw;
+
+	if ((!ctsid) || (!cwsid))
+		return 1;
+
+	/* compare the revision */
+	if (ctsid->revision != cwsid->revision) {
+		if (ctsid->revision > cwsid->revision)
+			return 1;
+		else
+			return -1;
+	}
+
+	/* compare all of the six auth values */
+	for (i = 0; i < NUM_AUTHS; ++i) {
+		if (ctsid->authority[i] != cwsid->authority[i]) {
+			if (ctsid->authority[i] > cwsid->authority[i])
+				return 1;
+			else
+				return -1;
+		}
+	}
+
+	/* compare all of the subauth values if any */
+	num_sat = ctsid->num_subauth;
+	num_saw = cwsid->num_subauth;
+	num_subauth = num_sat < num_saw ? num_sat : num_saw;
+	if (num_subauth) {
+		for (i = 0; i < num_subauth; ++i) {
+			if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
+				if (le32_to_cpu(ctsid->sub_auth[i]) >
+					le32_to_cpu(cwsid->sub_auth[i]))
+					return 1;
+				else
+					return -1;
+			}
+		}
+	}
+
+	return 0; /* sids compare/match */
+}
+
+static void
+cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
+{
+	int i;
+
+	dst->revision = src->revision;
+	dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
+	for (i = 0; i < NUM_AUTHS; ++i)
+		dst->authority[i] = src->authority[i];
+	for (i = 0; i < dst->num_subauth; ++i)
+		dst->sub_auth[i] = src->sub_auth[i];
+}
+
+static int
+id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
+{
+	int rc;
+	struct key *sidkey;
+	struct cifs_sid *ksid;
+	unsigned int ksid_size;
+	char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */
+	const struct cred *saved_cred;
+
+	rc = snprintf(desc, sizeof(desc), "%ci:%u",
+			sidtype == SIDOWNER ? 'o' : 'g', cid);
+	if (rc >= sizeof(desc))
+		return -EINVAL;
+
+	rc = 0;
+	saved_cred = override_creds(root_cred);
+	sidkey = request_key(&cifs_idmap_key_type, desc, "");
+	if (IS_ERR(sidkey)) {
+		rc = -EINVAL;
+		cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n",
+			 __func__, sidtype == SIDOWNER ? 'u' : 'g', cid);
+		goto out_revert_creds;
+	} else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
+		rc = -EIO;
+		cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
+			 __func__, sidkey->datalen);
+		goto invalidate_key;
+	}
+
+	/*
+	 * A sid is usually too large to be embedded in payload.value, but if
+	 * there are no subauthorities and the host has 8-byte pointers, then
+	 * it could be.
+	 */
+	ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
+		(struct cifs_sid *)&sidkey->payload.value :
+		(struct cifs_sid *)sidkey->payload.data;
+
+	ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
+	if (ksid_size > sidkey->datalen) {
+		rc = -EIO;
+		cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu, ksid_size=%u)\n",
+			 __func__, sidkey->datalen, ksid_size);
+		goto invalidate_key;
+	}
+
+	cifs_copy_sid(ssid, ksid);
+out_key_put:
+	key_put(sidkey);
+out_revert_creds:
+	revert_creds(saved_cred);
+	return rc;
+
+invalidate_key:
+	key_invalidate(sidkey);
+	goto out_key_put;
+}
+
+static int
+sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
+		struct cifs_fattr *fattr, uint sidtype)
+{
+	int rc;
+	struct key *sidkey;
+	char *sidstr;
+	const struct cred *saved_cred;
+	kuid_t fuid = cifs_sb->mnt_uid;
+	kgid_t fgid = cifs_sb->mnt_gid;
+
+	/*
+	 * If we have too many subauthorities, then something is really wrong.
+	 * Just return an error.
+	 */
+	if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
+		cifs_dbg(FYI, "%s: %u subauthorities is too many!\n",
+			 __func__, psid->num_subauth);
+		return -EIO;
+	}
+
+	sidstr = sid_to_key_str(psid, sidtype);
+	if (!sidstr)
+		return -ENOMEM;
+
+	saved_cred = override_creds(root_cred);
+	sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
+	if (IS_ERR(sidkey)) {
+		rc = -EINVAL;
+		cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n",
+			 __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g');
+		goto out_revert_creds;
+	}
+
+	/*
+	 * FIXME: Here we assume that uid_t and gid_t are same size. It's
+	 * probably a safe assumption but might be better to check based on
+	 * sidtype.
+	 */
+	BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
+	if (sidkey->datalen != sizeof(uid_t)) {
+		rc = -EIO;
+		cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
+			 __func__, sidkey->datalen);
+		key_invalidate(sidkey);
+		goto out_key_put;
+	}
+
+	if (sidtype == SIDOWNER) {
+		kuid_t uid;
+		uid_t id;
+		memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+		uid = make_kuid(&init_user_ns, id);
+		if (uid_valid(uid))
+			fuid = uid;
+	} else {
+		kgid_t gid;
+		gid_t id;
+		memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+		gid = make_kgid(&init_user_ns, id);
+		if (gid_valid(gid))
+			fgid = gid;
+	}
+
+out_key_put:
+	key_put(sidkey);
+out_revert_creds:
+	revert_creds(saved_cred);
+	kfree(sidstr);
+
+	/*
+	 * Note that we return 0 here unconditionally. If the mapping
+	 * fails then we just fall back to using the mnt_uid/mnt_gid.
+	 */
+	if (sidtype == SIDOWNER)
+		fattr->cf_uid = fuid;
+	else
+		fattr->cf_gid = fgid;
+	return 0;
+}
+
+int
+init_cifs_idmap(void)
+{
+	struct cred *cred;
+	struct key *keyring;
+	int ret;
+
+	cifs_dbg(FYI, "Registering the %s key type\n",
+		 cifs_idmap_key_type.name);
+
+	/* create an override credential set with a special thread keyring in
+	 * which requests are cached
+	 *
+	 * this is used to prevent malicious redirections from being installed
+	 * with add_key().
+	 */
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	keyring = keyring_alloc(".cifs_idmap",
+				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				KEY_USR_VIEW | KEY_USR_READ,
+				KEY_ALLOC_NOT_IN_QUOTA, NULL);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto failed_put_cred;
+	}
+
+	ret = register_key_type(&cifs_idmap_key_type);
+	if (ret < 0)
+		goto failed_put_key;
+
+	/* instruct request_key() to use this special keyring as a cache for
+	 * the results it looks up */
+	set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+	cred->thread_keyring = keyring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	root_cred = cred;
+
+	cifs_dbg(FYI, "cifs idmap keyring: %d\n", key_serial(keyring));
+	return 0;
+
+failed_put_key:
+	key_put(keyring);
+failed_put_cred:
+	put_cred(cred);
+	return ret;
+}
+
+void
+exit_cifs_idmap(void)
+{
+	key_revoke(root_cred->thread_keyring);
+	unregister_key_type(&cifs_idmap_key_type);
+	put_cred(root_cred);
+	cifs_dbg(FYI, "Unregistered %s key type\n", cifs_idmap_key_type.name);
+}
+
+/* copy ntsd, owner sid, and group sid from a security descriptor to another */
+static void copy_sec_desc(const struct cifs_ntsd *pntsd,
+				struct cifs_ntsd *pnntsd, __u32 sidsoffset)
+{
+	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+	struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
+
+	/* copy security descriptor control portion */
+	pnntsd->revision = pntsd->revision;
+	pnntsd->type = pntsd->type;
+	pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd));
+	pnntsd->sacloffset = 0;
+	pnntsd->osidoffset = cpu_to_le32(sidsoffset);
+	pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid));
+
+	/* copy owner sid */
+	owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->osidoffset));
+	nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
+	cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
+
+	/* copy group sid */
+	group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->gsidoffset));
+	ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
+					sizeof(struct cifs_sid));
+	cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
+
+	return;
+}
+
+
+/*
+   change posix mode to reflect permissions
+   pmode is the existing mode (we only want to overwrite part of this
+   bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007
+*/
+static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
+				 umode_t *pbits_to_set)
+{
+	__u32 flags = le32_to_cpu(ace_flags);
+	/* the order of ACEs is important.  The canonical order is to begin with
+	   DENY entries followed by ALLOW, otherwise an allow entry could be
+	   encountered first, making the subsequent deny entry like "dead code"
+	   which would be superflous since Windows stops when a match is made
+	   for the operation you are trying to perform for your user */
+
+	/* For deny ACEs we change the mask so that subsequent allow access
+	   control entries do not turn on the bits we are denying */
+	if (type == ACCESS_DENIED) {
+		if (flags & GENERIC_ALL)
+			*pbits_to_set &= ~S_IRWXUGO;
+
+		if ((flags & GENERIC_WRITE) ||
+			((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
+			*pbits_to_set &= ~S_IWUGO;
+		if ((flags & GENERIC_READ) ||
+			((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
+			*pbits_to_set &= ~S_IRUGO;
+		if ((flags & GENERIC_EXECUTE) ||
+			((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
+			*pbits_to_set &= ~S_IXUGO;
+		return;
+	} else if (type != ACCESS_ALLOWED) {
+		cifs_dbg(VFS, "unknown access control type %d\n", type);
+		return;
+	}
+	/* else ACCESS_ALLOWED type */
+
+	if (flags & GENERIC_ALL) {
+		*pmode |= (S_IRWXUGO & (*pbits_to_set));
+		cifs_dbg(NOISY, "all perms\n");
+		return;
+	}
+	if ((flags & GENERIC_WRITE) ||
+			((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
+		*pmode |= (S_IWUGO & (*pbits_to_set));
+	if ((flags & GENERIC_READ) ||
+			((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
+		*pmode |= (S_IRUGO & (*pbits_to_set));
+	if ((flags & GENERIC_EXECUTE) ||
+			((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
+		*pmode |= (S_IXUGO & (*pbits_to_set));
+
+	cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode);
+	return;
+}
+
+/*
+   Generate access flags to reflect permissions mode is the existing mode.
+   This function is called for every ACE in the DACL whose SID matches
+   with either owner or group or everyone.
+*/
+
+static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
+				__u32 *pace_flags)
+{
+	/* reset access mask */
+	*pace_flags = 0x0;
+
+	/* bits to use are either S_IRWXU or S_IRWXG or S_IRWXO */
+	mode &= bits_to_use;
+
+	/* check for R/W/X UGO since we do not know whose flags
+	   is this but we have cleared all the bits sans RWX for
+	   either user or group or other as per bits_to_use */
+	if (mode & S_IRUGO)
+		*pace_flags |= SET_FILE_READ_RIGHTS;
+	if (mode & S_IWUGO)
+		*pace_flags |= SET_FILE_WRITE_RIGHTS;
+	if (mode & S_IXUGO)
+		*pace_flags |= SET_FILE_EXEC_RIGHTS;
+
+	cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n",
+		 mode, *pace_flags);
+	return;
+}
+
+static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
+			const struct cifs_sid *psid, __u64 nmode, umode_t bits)
+{
+	int i;
+	__u16 size = 0;
+	__u32 access_req = 0;
+
+	pntace->type = ACCESS_ALLOWED;
+	pntace->flags = 0x0;
+	mode_to_access_flags(nmode, bits, &access_req);
+	if (!access_req)
+		access_req = SET_MINIMUM_RIGHTS;
+	pntace->access_req = cpu_to_le32(access_req);
+
+	pntace->sid.revision = psid->revision;
+	pntace->sid.num_subauth = psid->num_subauth;
+	for (i = 0; i < NUM_AUTHS; i++)
+		pntace->sid.authority[i] = psid->authority[i];
+	for (i = 0; i < psid->num_subauth; i++)
+		pntace->sid.sub_auth[i] = psid->sub_auth[i];
+
+	size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4);
+	pntace->size = cpu_to_le16(size);
+
+	return size;
+}
+
+
+#ifdef CONFIG_CIFS_DEBUG2
+static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
+{
+	int num_subauth;
+
+	/* validate that we do not go past end of acl */
+
+	if (le16_to_cpu(pace->size) < 16) {
+		cifs_dbg(VFS, "ACE too small %d\n", le16_to_cpu(pace->size));
+		return;
+	}
+
+	if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
+		cifs_dbg(VFS, "ACL too small to parse ACE\n");
+		return;
+	}
+
+	num_subauth = pace->sid.num_subauth;
+	if (num_subauth) {
+		int i;
+		cifs_dbg(FYI, "ACE revision %d num_auth %d type %d flags %d size %d\n",
+			 pace->sid.revision, pace->sid.num_subauth, pace->type,
+			 pace->flags, le16_to_cpu(pace->size));
+		for (i = 0; i < num_subauth; ++i) {
+			cifs_dbg(FYI, "ACE sub_auth[%d]: 0x%x\n",
+				 i, le32_to_cpu(pace->sid.sub_auth[i]));
+		}
+
+		/* BB add length check to make sure that we do not have huge
+			num auths and therefore go off the end */
+	}
+
+	return;
+}
+#endif
+
+
+static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
+		       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
+		       struct cifs_fattr *fattr)
+{
+	int i;
+	int num_aces = 0;
+	int acl_size;
+	char *acl_base;
+	struct cifs_ace **ppace;
+
+	/* BB need to add parm so we can store the SID BB */
+
+	if (!pdacl) {
+		/* no DACL in the security descriptor, set
+		   all the permissions for user/group/other */
+		fattr->cf_mode |= S_IRWXUGO;
+		return;
+	}
+
+	/* validate that we do not go past end of acl */
+	if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
+		cifs_dbg(VFS, "ACL too small to parse DACL\n");
+		return;
+	}
+
+	cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n",
+		 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
+		 le32_to_cpu(pdacl->num_aces));
+
+	/* reset rwx permissions for user/group/other.
+	   Also, if num_aces is 0 i.e. DACL has no ACEs,
+	   user/group/other have no permissions */
+	fattr->cf_mode &= ~(S_IRWXUGO);
+
+	acl_base = (char *)pdacl;
+	acl_size = sizeof(struct cifs_acl);
+
+	num_aces = le32_to_cpu(pdacl->num_aces);
+	if (num_aces > 0) {
+		umode_t user_mask = S_IRWXU;
+		umode_t group_mask = S_IRWXG;
+		umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
+
+		if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+			return;
+		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
+				GFP_KERNEL);
+		if (!ppace)
+			return;
+
+		for (i = 0; i < num_aces; ++i) {
+			ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
+#ifdef CONFIG_CIFS_DEBUG2
+			dump_ace(ppace[i], end_of_acl);
+#endif
+			if (compare_sids(&(ppace[i]->sid), pownersid) == 0)
+				access_flags_to_mode(ppace[i]->access_req,
+						     ppace[i]->type,
+						     &fattr->cf_mode,
+						     &user_mask);
+			if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0)
+				access_flags_to_mode(ppace[i]->access_req,
+						     ppace[i]->type,
+						     &fattr->cf_mode,
+						     &group_mask);
+			if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0)
+				access_flags_to_mode(ppace[i]->access_req,
+						     ppace[i]->type,
+						     &fattr->cf_mode,
+						     &other_mask);
+			if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0)
+				access_flags_to_mode(ppace[i]->access_req,
+						     ppace[i]->type,
+						     &fattr->cf_mode,
+						     &other_mask);
+
+
+/*			memcpy((void *)(&(cifscred->aces[i])),
+				(void *)ppace[i],
+				sizeof(struct cifs_ace)); */
+
+			acl_base = (char *)ppace[i];
+			acl_size = le16_to_cpu(ppace[i]->size);
+		}
+
+		kfree(ppace);
+	}
+
+	return;
+}
+
+
+static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
+			struct cifs_sid *pgrpsid, __u64 nmode)
+{
+	u16 size = 0;
+	struct cifs_acl *pnndacl;
+
+	pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl));
+
+	size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size),
+					pownersid, nmode, S_IRWXU);
+	size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
+					pgrpsid, nmode, S_IRWXG);
+	size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
+					 &sid_everyone, nmode, S_IRWXO);
+
+	pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
+	pndacl->num_aces = cpu_to_le32(3);
+
+	return 0;
+}
+
+
+static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
+{
+	/* BB need to add parm so we can store the SID BB */
+
+	/* validate that we do not go past end of ACL - sid must be at least 8
+	   bytes long (assuming no sub-auths - e.g. the null SID */
+	if (end_of_acl < (char *)psid + 8) {
+		cifs_dbg(VFS, "ACL too small to parse SID %p\n", psid);
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_CIFS_DEBUG2
+	if (psid->num_subauth) {
+		int i;
+		cifs_dbg(FYI, "SID revision %d num_auth %d\n",
+			 psid->revision, psid->num_subauth);
+
+		for (i = 0; i < psid->num_subauth; i++) {
+			cifs_dbg(FYI, "SID sub_auth[%d]: 0x%x\n",
+				 i, le32_to_cpu(psid->sub_auth[i]));
+		}
+
+		/* BB add length check to make sure that we do not have huge
+			num auths and therefore go off the end */
+		cifs_dbg(FYI, "RID 0x%x\n",
+			 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
+	}
+#endif
+
+	return 0;
+}
+
+
+/* Convert CIFS ACL to POSIX form */
+static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
+		struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr)
+{
+	int rc = 0;
+	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+	struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
+	char *end_of_acl = ((char *)pntsd) + acl_len;
+	__u32 dacloffset;
+
+	if (pntsd == NULL)
+		return -EIO;
+
+	owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->osidoffset));
+	group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->gsidoffset));
+	dacloffset = le32_to_cpu(pntsd->dacloffset);
+	dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+	cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n",
+		 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
+		 le32_to_cpu(pntsd->gsidoffset),
+		 le32_to_cpu(pntsd->sacloffset), dacloffset);
+/*	cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
+	rc = parse_sid(owner_sid_ptr, end_of_acl);
+	if (rc) {
+		cifs_dbg(FYI, "%s: Error %d parsing Owner SID\n", __func__, rc);
+		return rc;
+	}
+	rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER);
+	if (rc) {
+		cifs_dbg(FYI, "%s: Error %d mapping Owner SID to uid\n",
+			 __func__, rc);
+		return rc;
+	}
+
+	rc = parse_sid(group_sid_ptr, end_of_acl);
+	if (rc) {
+		cifs_dbg(FYI, "%s: Error %d mapping Owner SID to gid\n",
+			 __func__, rc);
+		return rc;
+	}
+	rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP);
+	if (rc) {
+		cifs_dbg(FYI, "%s: Error %d mapping Group SID to gid\n",
+			 __func__, rc);
+		return rc;
+	}
+
+	if (dacloffset)
+		parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
+			   group_sid_ptr, fattr);
+	else
+		cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */
+
+	return rc;
+}
+
+/* Convert permission bits from mode to equivalent CIFS ACL */
+static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
+	__u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag)
+{
+	int rc = 0;
+	__u32 dacloffset;
+	__u32 ndacloffset;
+	__u32 sidsoffset;
+	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+	struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
+	struct cifs_acl *dacl_ptr = NULL;  /* no need for SACL ptr */
+	struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
+
+	if (nmode != NO_CHANGE_64) { /* chmod */
+		owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->osidoffset));
+		group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->gsidoffset));
+		dacloffset = le32_to_cpu(pntsd->dacloffset);
+		dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+		ndacloffset = sizeof(struct cifs_ntsd);
+		ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+		ndacl_ptr->revision = dacl_ptr->revision;
+		ndacl_ptr->size = 0;
+		ndacl_ptr->num_aces = 0;
+
+		rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr,
+					nmode);
+		sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
+		/* copy sec desc control portion & owner and group sids */
+		copy_sec_desc(pntsd, pnntsd, sidsoffset);
+		*aclflag = CIFS_ACL_DACL;
+	} else {
+		memcpy(pnntsd, pntsd, secdesclen);
+		if (uid_valid(uid)) { /* chown */
+			uid_t id;
+			owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+					le32_to_cpu(pnntsd->osidoffset));
+			nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+								GFP_KERNEL);
+			if (!nowner_sid_ptr)
+				return -ENOMEM;
+			id = from_kuid(&init_user_ns, uid);
+			rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
+			if (rc) {
+				cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n",
+					 __func__, rc, id);
+				kfree(nowner_sid_ptr);
+				return rc;
+			}
+			cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
+			kfree(nowner_sid_ptr);
+			*aclflag = CIFS_ACL_OWNER;
+		}
+		if (gid_valid(gid)) { /* chgrp */
+			gid_t id;
+			group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+					le32_to_cpu(pnntsd->gsidoffset));
+			ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+								GFP_KERNEL);
+			if (!ngroup_sid_ptr)
+				return -ENOMEM;
+			id = from_kgid(&init_user_ns, gid);
+			rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
+			if (rc) {
+				cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n",
+					 __func__, rc, id);
+				kfree(ngroup_sid_ptr);
+				return rc;
+			}
+			cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
+			kfree(ngroup_sid_ptr);
+			*aclflag = CIFS_ACL_GROUP;
+		}
+	}
+
+	return rc;
+}
+
+struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+		const struct cifs_fid *cifsfid, u32 *pacllen)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	unsigned int xid;
+	int rc;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return ERR_CAST(tlink);
+
+	xid = get_xid();
+	rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd,
+				pacllen);
+	free_xid(xid);
+
+	cifs_put_tlink(tlink);
+
+	cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
+	if (rc)
+		return ERR_PTR(rc);
+	return pntsd;
+}
+
+static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
+		const char *path, u32 *pacllen)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	int oplock = 0;
+	unsigned int xid;
+	int rc, create_options = 0;
+	struct cifs_tcon *tcon;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+
+	if (IS_ERR(tlink))
+		return ERR_CAST(tlink);
+
+	tcon = tlink_tcon(tlink);
+	xid = get_xid();
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = READ_CONTROL;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (!rc) {
+		rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen);
+		CIFSSMBClose(xid, tcon, fid.netfid);
+	}
+
+	cifs_put_tlink(tlink);
+	free_xid(xid);
+
+	cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
+	if (rc)
+		return ERR_PTR(rc);
+	return pntsd;
+}
+
+/* Retrieve an ACL from the server */
+struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
+				      struct inode *inode, const char *path,
+				      u32 *pacllen)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	struct cifsFileInfo *open_file = NULL;
+
+	if (inode)
+		open_file = find_readable_file(CIFS_I(inode), true);
+	if (!open_file)
+		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
+
+	pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen);
+	cifsFileInfo_put(open_file);
+	return pntsd;
+}
+
+ /* Set an ACL on the server */
+int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+			struct inode *inode, const char *path, int aclflag)
+{
+	int oplock = 0;
+	unsigned int xid;
+	int rc, access_flags, create_options = 0;
+	struct cifs_tcon *tcon;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+
+	tcon = tlink_tcon(tlink);
+	xid = get_xid();
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
+		access_flags = WRITE_OWNER;
+	else
+		access_flags = WRITE_DAC;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = access_flags;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (rc) {
+		cifs_dbg(VFS, "Unable to open file to set ACL\n");
+		goto out;
+	}
+
+	rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag);
+	cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc);
+
+	CIFSSMBClose(xid, tcon, fid.netfid);
+out:
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
+int
+cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
+		  struct inode *inode, const char *path,
+		  const struct cifs_fid *pfid)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	u32 acllen = 0;
+	int rc = 0;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_tcon *tcon;
+
+	cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	if (pfid && (tcon->ses->server->ops->get_acl_by_fid))
+		pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid,
+							  &acllen);
+	else if (tcon->ses->server->ops->get_acl)
+		pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+							&acllen);
+	else {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
+	/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
+	if (IS_ERR(pntsd)) {
+		rc = PTR_ERR(pntsd);
+		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
+	} else {
+		rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr);
+		kfree(pntsd);
+		if (rc)
+			cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc);
+	}
+
+	cifs_put_tlink(tlink);
+
+	return rc;
+}
+
+/* Convert mode bits to an ACL so we can update the ACL on the server */
+int
+id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
+			kuid_t uid, kgid_t gid)
+{
+	int rc = 0;
+	int aclflag = CIFS_ACL_DACL; /* default flag to set */
+	__u32 secdesclen = 0;
+	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
+	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_tcon *tcon;
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
+
+	/* Get the security descriptor */
+
+	if (tcon->ses->server->ops->get_acl == NULL) {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
+
+	pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+						&secdesclen);
+	if (IS_ERR(pntsd)) {
+		rc = PTR_ERR(pntsd);
+		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
+		cifs_put_tlink(tlink);
+		return rc;
+	}
+
+	/*
+	 * Add three ACEs for owner, group, everyone getting rid of other ACEs
+	 * as chmod disables ACEs and set the security descriptor. Allocate
+	 * memory for the smb header, set security descriptor request security
+	 * descriptor parameters, and secuirty descriptor itself
+	 */
+	secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
+	pnntsd = kmalloc(secdesclen, GFP_KERNEL);
+	if (!pnntsd) {
+		kfree(pntsd);
+		cifs_put_tlink(tlink);
+		return -ENOMEM;
+	}
+
+	rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
+				&aclflag);
+
+	cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
+
+	if (tcon->ses->server->ops->set_acl == NULL)
+		rc = -EOPNOTSUPP;
+
+	if (!rc) {
+		/* Set the security descriptor */
+		rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode,
+						     path, aclflag);
+		cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
+	}
+	cifs_put_tlink(tlink);
+
+	kfree(pnntsd);
+	kfree(pntsd);
+	return rc;
+}
diff --git a/kernel/fs/cifs/cifsacl.h b/kernel/fs/cifs/cifsacl.h
new file mode 100644
index 000000000..4f3884835
--- /dev/null
+++ b/kernel/fs/cifs/cifsacl.h
@@ -0,0 +1,101 @@
+/*
+ *   fs/cifs/cifsacl.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2007
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _CIFSACL_H
+#define _CIFSACL_H
+
+
+#define NUM_AUTHS (6)	/* number of authority fields */
+#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
+
+#define READ_BIT        0x4
+#define WRITE_BIT       0x2
+#define EXEC_BIT        0x1
+
+#define UBITSHIFT	6
+#define GBITSHIFT	3
+
+#define ACCESS_ALLOWED	0
+#define ACCESS_DENIED	1
+
+#define SIDOWNER 1
+#define SIDGROUP 2
+
+/*
+ * Security Descriptor length containing DACL with 3 ACEs (one each for
+ * owner, group and world).
+ */
+#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
+			      sizeof(struct cifs_acl) + \
+			      (sizeof(struct cifs_ace) * 3))
+
+/*
+ * Maximum size of a string representation of a SID:
+ *
+ * The fields are unsigned values in decimal. So:
+ *
+ * u8:  max 3 bytes in decimal
+ * u32: max 10 bytes in decimal
+ *
+ * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
+ *
+ * For authority field, max is when all 6 values are non-zero and it must be
+ * represented in hex. So "-0x" + 12 hex digits.
+ *
+ * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
+ */
+#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
+#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
+
+struct cifs_ntsd {
+	__le16 revision; /* revision level */
+	__le16 type;
+	__le32 osidoffset;
+	__le32 gsidoffset;
+	__le32 sacloffset;
+	__le32 dacloffset;
+} __attribute__((packed));
+
+struct cifs_sid {
+	__u8 revision; /* revision level */
+	__u8 num_subauth;
+	__u8 authority[NUM_AUTHS];
+	__le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
+} __attribute__((packed));
+
+/* size of a struct cifs_sid, sans sub_auth array */
+#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
+
+struct cifs_acl {
+	__le16 revision; /* revision level */
+	__le16 size;
+	__le32 num_aces;
+} __attribute__((packed));
+
+struct cifs_ace {
+	__u8 type;
+	__u8 flags;
+	__le16 size;
+	__le32 access_req;
+	struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
+} __attribute__((packed));
+
+#endif /* _CIFSACL_H */
diff --git a/kernel/fs/cifs/cifsencrypt.c b/kernel/fs/cifs/cifsencrypt.c
new file mode 100644
index 000000000..aa0dc2573
--- /dev/null
+++ b/kernel/fs/cifs/cifsencrypt.c
@@ -0,0 +1,818 @@
+/*
+ *   fs/cifs/cifsencrypt.c
+ *
+ *   Encryption and hashing operations relating to NTLM, NTLMv2.  See MS-NLMP
+ *   for more detailed information
+ *
+ *   Copyright (C) International Business Machines  Corp., 2005,2013
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+#include "cifsproto.h"
+#include "ntlmssp.h"
+#include <linux/ctype.h>
+#include <linux/random.h>
+#include <linux/highmem.h>
+
+static int
+cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
+{
+	int rc;
+	unsigned int size;
+
+	if (server->secmech.sdescmd5 != NULL)
+		return 0; /* already allocated */
+
+	server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+	if (IS_ERR(server->secmech.md5)) {
+		cifs_dbg(VFS, "could not allocate crypto md5\n");
+		rc = PTR_ERR(server->secmech.md5);
+		server->secmech.md5 = NULL;
+		return rc;
+	}
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.md5);
+	server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdescmd5) {
+		crypto_free_shash(server->secmech.md5);
+		server->secmech.md5 = NULL;
+		return -ENOMEM;
+	}
+	server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+	server->secmech.sdescmd5->shash.flags = 0x0;
+
+	return 0;
+}
+
+/*
+ * Calculate and return the CIFS signature based on the mac key and SMB PDU.
+ * The 16 byte signature must be allocated by the caller. Note we only use the
+ * 1st eight bytes and that the smb header signature field on input contains
+ * the sequence number before this function is called. Also, this function
+ * should be called with the server->srv_mutex held.
+ */
+static int cifs_calc_signature(struct smb_rqst *rqst,
+			struct TCP_Server_Info *server, char *signature)
+{
+	int i;
+	int rc;
+	struct kvec *iov = rqst->rq_iov;
+	int n_vec = rqst->rq_nvec;
+
+	if (iov == NULL || signature == NULL || server == NULL)
+		return -EINVAL;
+
+	if (!server->secmech.sdescmd5) {
+		rc = cifs_crypto_shash_md5_allocate(server);
+		if (rc) {
+			cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
+			return -1;
+		}
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
+		return rc;
+	}
+
+	rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
+		server->session_key.response, server->session_key.len);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+		return rc;
+	}
+
+	for (i = 0; i < n_vec; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
+		if (iov[i].iov_base == NULL) {
+			cifs_dbg(VFS, "null iovec entry\n");
+			return -EIO;
+		}
+		/* The first entry includes a length field (which does not get
+		   signed that occupies the first 4 bytes before the header */
+		if (i == 0) {
+			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
+				break; /* nothing to sign or corrupt header */
+			rc =
+			crypto_shash_update(&server->secmech.sdescmd5->shash,
+				iov[i].iov_base + 4, iov[i].iov_len - 4);
+		} else {
+			rc =
+			crypto_shash_update(&server->secmech.sdescmd5->shash,
+				iov[i].iov_base, iov[i].iov_len);
+		}
+		if (rc) {
+			cifs_dbg(VFS, "%s: Could not update with payload\n",
+				 __func__);
+			return rc;
+		}
+	}
+
+	/* now hash over the rq_pages array */
+	for (i = 0; i < rqst->rq_npages; i++) {
+		struct kvec p_iov;
+
+		cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+		crypto_shash_update(&server->secmech.sdescmd5->shash,
+					p_iov.iov_base, p_iov.iov_len);
+		kunmap(rqst->rq_pages[i]);
+	}
+
+	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+
+	return rc;
+}
+
+/* must be called with server->srv_mutex held */
+int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+		   __u32 *pexpected_response_sequence_number)
+{
+	int rc = 0;
+	char smb_signature[20];
+	struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
+
+	if ((cifs_pdu == NULL) || (server == NULL))
+		return -EINVAL;
+
+	if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
+	    server->tcpStatus == CifsNeedNegotiate)
+		return rc;
+
+	if (!server->session_estab) {
+		memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
+		return rc;
+	}
+
+	cifs_pdu->Signature.Sequence.SequenceNumber =
+				cpu_to_le32(server->sequence_number);
+	cifs_pdu->Signature.Sequence.Reserved = 0;
+
+	*pexpected_response_sequence_number = ++server->sequence_number;
+	++server->sequence_number;
+
+	rc = cifs_calc_signature(rqst, server, smb_signature);
+	if (rc)
+		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
+	else
+		memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8);
+
+	return rc;
+}
+
+int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
+		   __u32 *pexpected_response_sequence)
+{
+	struct smb_rqst rqst = { .rq_iov = iov,
+				 .rq_nvec = n_vec };
+
+	return cifs_sign_rqst(&rqst, server, pexpected_response_sequence);
+}
+
+/* must be called with server->srv_mutex held */
+int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
+		  __u32 *pexpected_response_sequence_number)
+{
+	struct kvec iov;
+
+	iov.iov_base = cifs_pdu;
+	iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4;
+
+	return cifs_sign_smbv(&iov, 1, server,
+			      pexpected_response_sequence_number);
+}
+
+int cifs_verify_signature(struct smb_rqst *rqst,
+			  struct TCP_Server_Info *server,
+			  __u32 expected_sequence_number)
+{
+	unsigned int rc;
+	char server_response_sig[8];
+	char what_we_think_sig_should_be[20];
+	struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
+
+	if (cifs_pdu == NULL || server == NULL)
+		return -EINVAL;
+
+	if (!server->session_estab)
+		return 0;
+
+	if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) {
+		struct smb_com_lock_req *pSMB =
+			(struct smb_com_lock_req *)cifs_pdu;
+	    if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)
+			return 0;
+	}
+
+	/* BB what if signatures are supposed to be on for session but
+	   server does not send one? BB */
+
+	/* Do not need to verify session setups with signature "BSRSPYL "  */
+	if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
+		cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
+			 cifs_pdu->Command);
+
+	/* save off the origiginal signature so we can modify the smb and check
+		its signature against what the server sent */
+	memcpy(server_response_sig, cifs_pdu->Signature.SecuritySignature, 8);
+
+	cifs_pdu->Signature.Sequence.SequenceNumber =
+					cpu_to_le32(expected_sequence_number);
+	cifs_pdu->Signature.Sequence.Reserved = 0;
+
+	mutex_lock(&server->srv_mutex);
+	rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be);
+	mutex_unlock(&server->srv_mutex);
+
+	if (rc)
+		return rc;
+
+/*	cifs_dump_mem("what we think it should be: ",
+		      what_we_think_sig_should_be, 16); */
+
+	if (memcmp(server_response_sig, what_we_think_sig_should_be, 8))
+		return -EACCES;
+	else
+		return 0;
+
+}
+
+/* first calculate 24 bytes ntlm response and then 16 byte session key */
+int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
+{
+	int rc = 0;
+	unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
+	char temp_key[CIFS_SESS_KEY_SIZE];
+
+	if (!ses)
+		return -EINVAL;
+
+	ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
+	if (!ses->auth_key.response)
+		return -ENOMEM;
+
+	ses->auth_key.len = temp_len;
+
+	rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
+			ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
+	if (rc) {
+		cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n",
+			 __func__, rc);
+		return rc;
+	}
+
+	rc = E_md4hash(ses->password, temp_key, nls_cp);
+	if (rc) {
+		cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
+			 __func__, rc);
+		return rc;
+	}
+
+	rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
+	if (rc)
+		cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n",
+			 __func__, rc);
+
+	return rc;
+}
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
+			char *lnm_session_key)
+{
+	int i;
+	int rc;
+	char password_with_pad[CIFS_ENCPWD_SIZE];
+
+	memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+	if (password)
+		strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
+
+	if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
+		memcpy(lnm_session_key, password_with_pad,
+			CIFS_ENCPWD_SIZE);
+		return 0;
+	}
+
+	/* calculate old style session key */
+	/* calling toupper is less broken than repeatedly
+	calling nls_toupper would be since that will never
+	work for UTF8, but neither handles multibyte code pages
+	but the only alternative would be converting to UCS-16 (Unicode)
+	(using a routine something like UniStrupr) then
+	uppercasing and then converting back from Unicode - which
+	would only worth doing it if we knew it were utf8. Basically
+	utf8 and other multibyte codepages each need their own strupper
+	function since a byte at a time will ont work. */
+
+	for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
+		password_with_pad[i] = toupper(password_with_pad[i]);
+
+	rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
+
+	return rc;
+}
+#endif /* CIFS_WEAK_PW_HASH */
+
+/* Build a proper attribute value/target info pairs blob.
+ * Fill in netbios and dns domain name and workstation name
+ * and client time (total five av pairs and + one end of fields indicator.
+ * Allocate domain name which gets freed when session struct is deallocated.
+ */
+static int
+build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
+{
+	unsigned int dlen;
+	unsigned int size = 2 * sizeof(struct ntlmssp2_name);
+	char *defdmname = "WORKGROUP";
+	unsigned char *blobptr;
+	struct ntlmssp2_name *attrptr;
+
+	if (!ses->domainName) {
+		ses->domainName = kstrdup(defdmname, GFP_KERNEL);
+		if (!ses->domainName)
+			return -ENOMEM;
+	}
+
+	dlen = strlen(ses->domainName);
+
+	/*
+	 * The length of this blob is two times the size of a
+	 * structure (av pair) which holds name/size
+	 * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) +
+	 * unicode length of a netbios domain name
+	 */
+	ses->auth_key.len = size + 2 * dlen;
+	ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		ses->auth_key.len = 0;
+		return -ENOMEM;
+	}
+
+	blobptr = ses->auth_key.response;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	/*
+	 * As defined in MS-NTLM 3.3.2, just this av pair field
+	 * is sufficient as part of the temp
+	 */
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
+	attrptr->length = cpu_to_le16(2 * dlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+
+	return 0;
+}
+
+/* Server has provided av pairs/target info in the type 2 challenge
+ * packet and we have plucked it and stored within smb session.
+ * We parse that blob here to find netbios domain name to be used
+ * as part of ntlmv2 authentication (in Target String), if not already
+ * specified on the command line.
+ * If this function returns without any error but without fetching
+ * domain name, authentication may fail against some server but
+ * may not fail against other (those who are not very particular
+ * about target string i.e. for some, just user name might suffice.
+ */
+static int
+find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
+{
+	unsigned int attrsize;
+	unsigned int type;
+	unsigned int onesize = sizeof(struct ntlmssp2_name);
+	unsigned char *blobptr;
+	unsigned char *blobend;
+	struct ntlmssp2_name *attrptr;
+
+	if (!ses->auth_key.len || !ses->auth_key.response)
+		return 0;
+
+	blobptr = ses->auth_key.response;
+	blobend = blobptr + ses->auth_key.len;
+
+	while (blobptr + onesize < blobend) {
+		attrptr = (struct ntlmssp2_name *) blobptr;
+		type = le16_to_cpu(attrptr->type);
+		if (type == NTLMSSP_AV_EOL)
+			break;
+		blobptr += 2; /* advance attr type */
+		attrsize = le16_to_cpu(attrptr->length);
+		blobptr += 2; /* advance attr size */
+		if (blobptr + attrsize > blobend)
+			break;
+		if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
+			if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN)
+				break;
+			if (!ses->domainName) {
+				ses->domainName =
+					kmalloc(attrsize + 1, GFP_KERNEL);
+				if (!ses->domainName)
+						return -ENOMEM;
+				cifs_from_utf16(ses->domainName,
+					(__le16 *)blobptr, attrsize, attrsize,
+					nls_cp, NO_MAP_UNI_RSVD);
+				break;
+			}
+		}
+		blobptr += attrsize; /* advance attr  value */
+	}
+
+	return 0;
+}
+
+static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
+			    const struct nls_table *nls_cp)
+{
+	int rc = 0;
+	int len;
+	char nt_hash[CIFS_NTHASH_SIZE];
+	__le16 *user;
+	wchar_t *domain;
+	wchar_t *server;
+
+	if (!ses->server->secmech.sdeschmacmd5) {
+		cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
+		return -1;
+	}
+
+	/* calculate md4 hash of password */
+	E_md4hash(ses->password, nt_hash, nls_cp);
+
+	rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
+				CIFS_NTHASH_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__);
+		return rc;
+	}
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__);
+		return rc;
+	}
+
+	/* convert ses->user_name to unicode */
+	len = ses->user_name ? strlen(ses->user_name) : 0;
+	user = kmalloc(2 + (len * 2), GFP_KERNEL);
+	if (user == NULL) {
+		rc = -ENOMEM;
+		return rc;
+	}
+
+	if (len) {
+		len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
+		UniStrupr(user);
+	} else {
+		memset(user, '\0', 2);
+	}
+
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+				(char *)user, 2 * len);
+	kfree(user);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with user\n", __func__);
+		return rc;
+	}
+
+	/* convert ses->domainName to unicode and uppercase */
+	if (ses->domainName) {
+		len = strlen(ses->domainName);
+
+		domain = kmalloc(2 + (len * 2), GFP_KERNEL);
+		if (domain == NULL) {
+			rc = -ENOMEM;
+			return rc;
+		}
+		len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
+				      nls_cp);
+		rc =
+		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+					(char *)domain, 2 * len);
+		kfree(domain);
+		if (rc) {
+			cifs_dbg(VFS, "%s: Could not update with domain\n",
+				 __func__);
+			return rc;
+		}
+	} else {
+		/* We use ses->serverName if no domain name available */
+		len = strlen(ses->serverName);
+
+		server = kmalloc(2 + (len * 2), GFP_KERNEL);
+		if (server == NULL) {
+			rc = -ENOMEM;
+			return rc;
+		}
+		len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
+					nls_cp);
+		rc =
+		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+					(char *)server, 2 * len);
+		kfree(server);
+		if (rc) {
+			cifs_dbg(VFS, "%s: Could not update with server\n",
+				 __func__);
+			return rc;
+		}
+	}
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+					ntlmv2_hash);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+
+	return rc;
+}
+
+static int
+CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
+{
+	int rc;
+	struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *)
+	    (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+	unsigned int hash_len;
+
+	/* The MD5 hash starts at challenge_key.key */
+	hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
+		offsetof(struct ntlmv2_resp, challenge.key[0]));
+
+	if (!ses->server->secmech.sdeschmacmd5) {
+		cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
+		return -1;
+	}
+
+	rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
+				 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
+			 __func__);
+		return rc;
+	}
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__);
+		return rc;
+	}
+
+	if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
+		memcpy(ntlmv2->challenge.key,
+		       ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+	else
+		memcpy(ntlmv2->challenge.key,
+		       ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+				 ntlmv2->challenge.key, hash_len);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+		return rc;
+	}
+
+	/* Note that the MD5 digest over writes anon.challenge_key.key */
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+				ntlmv2->ntlmv2_hash);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+
+	return rc;
+}
+
+static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
+{
+	int rc;
+	unsigned int size;
+
+	/* check if already allocated */
+	if (server->secmech.sdeschmacmd5)
+		return 0;
+
+	server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+	if (IS_ERR(server->secmech.hmacmd5)) {
+		cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
+		rc = PTR_ERR(server->secmech.hmacmd5);
+		server->secmech.hmacmd5 = NULL;
+		return rc;
+	}
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.hmacmd5);
+	server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdeschmacmd5) {
+		crypto_free_shash(server->secmech.hmacmd5);
+		server->secmech.hmacmd5 = NULL;
+		return -ENOMEM;
+	}
+	server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+	server->secmech.sdeschmacmd5->shash.flags = 0x0;
+
+	return 0;
+}
+
+int
+setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
+{
+	int rc;
+	int baselen;
+	unsigned int tilen;
+	struct ntlmv2_resp *ntlmv2;
+	char ntlmv2_hash[16];
+	unsigned char *tiblob = NULL; /* target info blob */
+
+	if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
+		if (!ses->domainName) {
+			rc = find_domain_name(ses, nls_cp);
+			if (rc) {
+				cifs_dbg(VFS, "error %d finding domain name\n",
+					 rc);
+				goto setup_ntlmv2_rsp_ret;
+			}
+		}
+	} else {
+		rc = build_avpair_blob(ses, nls_cp);
+		if (rc) {
+			cifs_dbg(VFS, "error %d building av pair blob\n", rc);
+			goto setup_ntlmv2_rsp_ret;
+		}
+	}
+
+	baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
+	tilen = ses->auth_key.len;
+	tiblob = ses->auth_key.response;
+
+	ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		rc = ENOMEM;
+		ses->auth_key.len = 0;
+		goto setup_ntlmv2_rsp_ret;
+	}
+	ses->auth_key.len += baselen;
+
+	ntlmv2 = (struct ntlmv2_resp *)
+			(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+	ntlmv2->blob_signature = cpu_to_le32(0x00000101);
+	ntlmv2->reserved = 0;
+	/* Must be within 5 minutes of the server */
+	ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+	get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal));
+	ntlmv2->reserved2 = 0;
+
+	memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+
+	rc = crypto_hmacmd5_alloc(ses->server);
+	if (rc) {
+		cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	/* calculate ntlmv2_hash */
+	rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
+	if (rc) {
+		cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	/* calculate first part of the client response (CR1) */
+	rc = CalcNTLMv2_response(ses, ntlmv2_hash);
+	if (rc) {
+		cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	/* now calculate the session key for NTLMv2 */
+	rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
+		ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
+			 __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+		ntlmv2->ntlmv2_hash,
+		CIFS_HMAC_MD5_HASH_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+
+setup_ntlmv2_rsp_ret:
+	kfree(tiblob);
+
+	return rc;
+}
+
+int
+calc_seckey(struct cifs_ses *ses)
+{
+	int rc;
+	struct crypto_blkcipher *tfm_arc4;
+	struct scatterlist sgin, sgout;
+	struct blkcipher_desc desc;
+	unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
+
+	get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
+
+	tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm_arc4)) {
+		rc = PTR_ERR(tfm_arc4);
+		cifs_dbg(VFS, "could not allocate crypto API arc4\n");
+		return rc;
+	}
+
+	desc.tfm = tfm_arc4;
+
+	rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
+					CIFS_SESS_KEY_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not set response as a key\n",
+			 __func__);
+		return rc;
+	}
+
+	sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
+	sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
+
+	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
+		crypto_free_blkcipher(tfm_arc4);
+		return rc;
+	}
+
+	/* make secondary_key/nonce as session key */
+	memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
+	/* and make len as that of session key only */
+	ses->auth_key.len = CIFS_SESS_KEY_SIZE;
+
+	crypto_free_blkcipher(tfm_arc4);
+
+	return rc;
+}
+
+void
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
+{
+	if (server->secmech.cmacaes) {
+		crypto_free_shash(server->secmech.cmacaes);
+		server->secmech.cmacaes = NULL;
+	}
+
+	if (server->secmech.hmacsha256) {
+		crypto_free_shash(server->secmech.hmacsha256);
+		server->secmech.hmacsha256 = NULL;
+	}
+
+	if (server->secmech.md5) {
+		crypto_free_shash(server->secmech.md5);
+		server->secmech.md5 = NULL;
+	}
+
+	if (server->secmech.hmacmd5) {
+		crypto_free_shash(server->secmech.hmacmd5);
+		server->secmech.hmacmd5 = NULL;
+	}
+
+	kfree(server->secmech.sdesccmacaes);
+	server->secmech.sdesccmacaes = NULL;
+	kfree(server->secmech.sdeschmacsha256);
+	server->secmech.sdeschmacsha256 = NULL;
+	kfree(server->secmech.sdeschmacmd5);
+	server->secmech.sdeschmacmd5 = NULL;
+	kfree(server->secmech.sdescmd5);
+	server->secmech.sdescmd5 = NULL;
+}
diff --git a/kernel/fs/cifs/cifsfs.c b/kernel/fs/cifs/cifsfs.c
new file mode 100644
index 000000000..0a9fb6b53
--- /dev/null
+++ b/kernel/fs/cifs/cifsfs.c
@@ -0,0 +1,1308 @@
+/*
+ *   fs/cifs/cifsfs.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   Common Internet FileSystem (CIFS) client
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* Note that BB means BUGBUG (ie something to fix eventually) */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/vfs.h>
+#include <linux/mempool.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/namei.h>
+#include <linux/random.h>
+#include <net/ipv6.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#define DECLARE_GLOBALS_HERE
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include <linux/mm.h>
+#include <linux/key-type.h>
+#include "cifs_spnego.h"
+#include "fscache.h"
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2pdu.h"
+#endif
+
+int cifsFYI = 0;
+int traceSMB = 0;
+bool enable_oplocks = true;
+unsigned int linuxExtEnabled = 1;
+unsigned int lookupCacheEnabled = 1;
+unsigned int global_secflags = CIFSSEC_DEF;
+/* unsigned int ntlmv2_support = 0; */
+unsigned int sign_CIFS_PDUs = 1;
+static const struct super_operations cifs_super_ops;
+unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
+module_param(CIFSMaxBufSize, uint, 0);
+MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
+				 "Default: 16384 Range: 8192 to 130048");
+unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
+module_param(cifs_min_rcv, uint, 0);
+MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
+				"1 to 64");
+unsigned int cifs_min_small = 30;
+module_param(cifs_min_small, uint, 0);
+MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
+				 "Range: 2 to 256");
+unsigned int cifs_max_pending = CIFS_MAX_REQ;
+module_param(cifs_max_pending, uint, 0444);
+MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
+				   "Default: 32767 Range: 2 to 32767.");
+module_param(enable_oplocks, bool, 0644);
+MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
+
+extern mempool_t *cifs_sm_req_poolp;
+extern mempool_t *cifs_req_poolp;
+extern mempool_t *cifs_mid_poolp;
+
+struct workqueue_struct	*cifsiod_wq;
+
+/*
+ * Bumps refcount for cifs super block.
+ * Note that it should be only called if a referece to VFS super block is
+ * already held, e.g. in open-type syscalls context. Otherwise it can race with
+ * atomic_dec_and_test in deactivate_locked_super.
+ */
+void
+cifs_sb_active(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_inc_return(&server->active) == 1)
+		atomic_inc(&sb->s_active);
+}
+
+void
+cifs_sb_deactive(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_dec_and_test(&server->active))
+		deactivate_super(sb);
+}
+
+static int
+cifs_read_super(struct super_block *sb)
+{
+	struct inode *inode;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_tcon *tcon;
+	int rc = 0;
+
+	cifs_sb = CIFS_SB(sb);
+	tcon = cifs_sb_master_tcon(cifs_sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
+		sb->s_flags |= MS_POSIXACL;
+
+	if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+	else
+		sb->s_maxbytes = MAX_NON_LFS;
+
+	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
+	sb->s_time_gran = 100;
+
+	sb->s_magic = CIFS_MAGIC_NUMBER;
+	sb->s_op = &cifs_super_ops;
+	sb->s_bdi = &cifs_sb->bdi;
+	sb->s_blocksize = CIFS_MAX_MSGSIZE;
+	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
+	inode = cifs_root_iget(sb);
+
+	if (IS_ERR(inode)) {
+		rc = PTR_ERR(inode);
+		goto out_no_root;
+	}
+
+	if (tcon->nocase)
+		sb->s_d_op = &cifs_ci_dentry_ops;
+	else
+		sb->s_d_op = &cifs_dentry_ops;
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root) {
+		rc = -ENOMEM;
+		goto out_no_root;
+	}
+
+#ifdef CONFIG_CIFS_NFSD_EXPORT
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+		cifs_dbg(FYI, "export ops supported\n");
+		sb->s_export_op = &cifs_export_ops;
+	}
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
+
+	return 0;
+
+out_no_root:
+	cifs_dbg(VFS, "%s: get root inode failed\n", __func__);
+	return rc;
+}
+
+static void cifs_kill_sb(struct super_block *sb)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	kill_anon_super(sb);
+	cifs_umount(cifs_sb);
+}
+
+static int
+cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	unsigned int xid;
+	int rc = 0;
+
+	xid = get_xid();
+
+	/*
+	 * PATH_MAX may be too long - it would presumably be total path,
+	 * but note that some servers (includinng Samba 3) have a shorter
+	 * maximum path.
+	 *
+	 * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO.
+	 */
+	buf->f_namelen = PATH_MAX;
+	buf->f_files = 0;	/* undefined */
+	buf->f_ffree = 0;	/* unlimited */
+
+	if (server->ops->queryfs)
+		rc = server->ops->queryfs(xid, tcon, buf);
+
+	free_xid(xid);
+	return 0;
+}
+
+static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct TCP_Server_Info *server = tcon->ses->server;
+
+	if (server->ops->fallocate)
+		return server->ops->fallocate(file, tcon, mode, off, len);
+
+	return -EOPNOTSUPP;
+}
+
+static int cifs_permission(struct inode *inode, int mask)
+{
+	struct cifs_sb_info *cifs_sb;
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
+		if ((mask & MAY_EXEC) && !execute_ok(inode))
+			return -EACCES;
+		else
+			return 0;
+	} else /* file mode might have been restricted at mount time
+		on the client (above and beyond ACL on servers) for
+		servers which do not support setting and viewing mode bits,
+		so allowing client to check permissions is useful */
+		return generic_permission(inode, mask);
+}
+
+static struct kmem_cache *cifs_inode_cachep;
+static struct kmem_cache *cifs_req_cachep;
+static struct kmem_cache *cifs_mid_cachep;
+static struct kmem_cache *cifs_sm_req_cachep;
+mempool_t *cifs_sm_req_poolp;
+mempool_t *cifs_req_poolp;
+mempool_t *cifs_mid_poolp;
+
+static struct inode *
+cifs_alloc_inode(struct super_block *sb)
+{
+	struct cifsInodeInfo *cifs_inode;
+	cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL);
+	if (!cifs_inode)
+		return NULL;
+	cifs_inode->cifsAttrs = 0x20;	/* default */
+	cifs_inode->time = 0;
+	/*
+	 * Until the file is open and we have gotten oplock info back from the
+	 * server, can not assume caching of file data or metadata.
+	 */
+	cifs_set_oplock_level(cifs_inode, 0);
+	cifs_inode->flags = 0;
+	spin_lock_init(&cifs_inode->writers_lock);
+	cifs_inode->writers = 0;
+	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
+	cifs_inode->server_eof = 0;
+	cifs_inode->uniqueid = 0;
+	cifs_inode->createtime = 0;
+	cifs_inode->epoch = 0;
+#ifdef CONFIG_CIFS_SMB2
+	get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE);
+#endif
+	/*
+	 * Can not set i_flags here - they get immediately overwritten to zero
+	 * by the VFS.
+	 */
+	/* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
+	INIT_LIST_HEAD(&cifs_inode->openFileList);
+	INIT_LIST_HEAD(&cifs_inode->llist);
+	return &cifs_inode->vfs_inode;
+}
+
+static void cifs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+}
+
+static void
+cifs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, cifs_i_callback);
+}
+
+static void
+cifs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	cifs_fscache_release_inode_cookie(inode);
+}
+
+static void
+cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
+{
+	struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
+	struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
+
+	seq_puts(s, ",addr=");
+
+	switch (server->dstaddr.ss_family) {
+	case AF_INET:
+		seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
+		break;
+	case AF_INET6:
+		seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
+		if (sa6->sin6_scope_id)
+			seq_printf(s, "%%%u", sa6->sin6_scope_id);
+		break;
+	default:
+		seq_puts(s, "(unknown)");
+	}
+}
+
+static void
+cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
+{
+	if (ses->sectype == Unspecified)
+		return;
+
+	seq_puts(s, ",sec=");
+
+	switch (ses->sectype) {
+	case LANMAN:
+		seq_puts(s, "lanman");
+		break;
+	case NTLMv2:
+		seq_puts(s, "ntlmv2");
+		break;
+	case NTLM:
+		seq_puts(s, "ntlm");
+		break;
+	case Kerberos:
+		seq_puts(s, "krb5");
+		break;
+	case RawNTLMSSP:
+		seq_puts(s, "ntlmssp");
+		break;
+	default:
+		/* shouldn't ever happen */
+		seq_puts(s, "unknown");
+		break;
+	}
+
+	if (ses->sign)
+		seq_puts(s, "i");
+}
+
+static void
+cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
+{
+	seq_puts(s, ",cache=");
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+		seq_puts(s, "strict");
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
+		seq_puts(s, "none");
+	else
+		seq_puts(s, "loose");
+}
+
+static void
+cifs_show_nls(struct seq_file *s, struct nls_table *cur)
+{
+	struct nls_table *def;
+
+	/* Display iocharset= option if it's not default charset */
+	def = load_nls_default();
+	if (def != cur)
+		seq_printf(s, ",iocharset=%s", cur->charset);
+	unload_nls(def);
+}
+
+/*
+ * cifs_show_options() is for displaying mount options in /proc/mounts.
+ * Not all settable options are displayed but most of the important
+ * ones are.
+ */
+static int
+cifs_show_options(struct seq_file *s, struct dentry *root)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct sockaddr *srcaddr;
+	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
+
+	seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
+	cifs_show_security(s, tcon->ses);
+	cifs_show_cache_flavor(s, cifs_sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
+		seq_puts(s, ",multiuser");
+	else if (tcon->ses->user_name)
+		seq_printf(s, ",username=%s", tcon->ses->user_name);
+
+	if (tcon->ses->domainName)
+		seq_printf(s, ",domain=%s", tcon->ses->domainName);
+
+	if (srcaddr->sa_family != AF_UNSPEC) {
+		struct sockaddr_in *saddr4;
+		struct sockaddr_in6 *saddr6;
+		saddr4 = (struct sockaddr_in *)srcaddr;
+		saddr6 = (struct sockaddr_in6 *)srcaddr;
+		if (srcaddr->sa_family == AF_INET6)
+			seq_printf(s, ",srcaddr=%pI6c",
+				   &saddr6->sin6_addr);
+		else if (srcaddr->sa_family == AF_INET)
+			seq_printf(s, ",srcaddr=%pI4",
+				   &saddr4->sin_addr.s_addr);
+		else
+			seq_printf(s, ",srcaddr=BAD-AF:%i",
+				   (int)(srcaddr->sa_family));
+	}
+
+	seq_printf(s, ",uid=%u",
+		   from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		seq_puts(s, ",forceuid");
+	else
+		seq_puts(s, ",noforceuid");
+
+	seq_printf(s, ",gid=%u",
+		   from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		seq_puts(s, ",forcegid");
+	else
+		seq_puts(s, ",noforcegid");
+
+	cifs_show_address(s, tcon->ses->server);
+
+	if (!tcon->unix_ext)
+		seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho",
+					   cifs_sb->mnt_file_mode,
+					   cifs_sb->mnt_dir_mode);
+
+	cifs_show_nls(s, cifs_sb->local_nls);
+
+	if (tcon->seal)
+		seq_puts(s, ",seal");
+	if (tcon->nocase)
+		seq_puts(s, ",nocase");
+	if (tcon->retry)
+		seq_puts(s, ",hard");
+	if (tcon->unix_ext)
+		seq_puts(s, ",unix");
+	else
+		seq_puts(s, ",nounix");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
+		seq_puts(s, ",posixpaths");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
+		seq_puts(s, ",setuids");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+		seq_puts(s, ",serverino");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		seq_puts(s, ",rwpidforward");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
+		seq_puts(s, ",forcemand");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+		seq_puts(s, ",nouser_xattr");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
+		seq_puts(s, ",mapchars");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
+		seq_puts(s, ",mapposix");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
+		seq_puts(s, ",sfu");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		seq_puts(s, ",nobrl");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+		seq_puts(s, ",cifsacl");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+		seq_puts(s, ",dynperm");
+	if (root->d_sb->s_flags & MS_POSIXACL)
+		seq_puts(s, ",acl");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		seq_puts(s, ",mfsymlinks");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
+		seq_puts(s, ",fsc");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
+		seq_puts(s, ",nostrictsync");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+		seq_puts(s, ",noperm");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
+		seq_printf(s, ",backupuid=%u",
+			   from_kuid_munged(&init_user_ns,
+					    cifs_sb->mnt_backupuid));
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
+		seq_printf(s, ",backupgid=%u",
+			   from_kgid_munged(&init_user_ns,
+					    cifs_sb->mnt_backupgid));
+
+	seq_printf(s, ",rsize=%u", cifs_sb->rsize);
+	seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+	/* convert actimeo and display it in seconds */
+	seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
+
+	return 0;
+}
+
+static void cifs_umount_begin(struct super_block *sb)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_tcon *tcon;
+
+	if (cifs_sb == NULL)
+		return;
+
+	tcon = cifs_sb_master_tcon(cifs_sb);
+
+	spin_lock(&cifs_tcp_ses_lock);
+	if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
+		/* we have other mounts to same share or we have
+		   already tried to force umount this and woken up
+		   all waiting network requests, nothing to do */
+		spin_unlock(&cifs_tcp_ses_lock);
+		return;
+	} else if (tcon->tc_count == 1)
+		tcon->tidStatus = CifsExiting;
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
+	/* cancel_notify_requests(tcon); */
+	if (tcon->ses && tcon->ses->server) {
+		cifs_dbg(FYI, "wake up tasks now - umount begin not complete\n");
+		wake_up_all(&tcon->ses->server->request_q);
+		wake_up_all(&tcon->ses->server->response_q);
+		msleep(1); /* yield */
+		/* we have to kick the requests once more */
+		wake_up_all(&tcon->ses->server->response_q);
+		msleep(1);
+	}
+
+	return;
+}
+
+#ifdef CONFIG_CIFS_STATS2
+static int cifs_show_stats(struct seq_file *s, struct dentry *root)
+{
+	/* BB FIXME */
+	return 0;
+}
+#endif
+
+static int cifs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_NODIRATIME;
+	return 0;
+}
+
+static int cifs_drop_inode(struct inode *inode)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+	/* no serverino => unconditional eviction */
+	return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
+		generic_drop_inode(inode);
+}
+
+static const struct super_operations cifs_super_ops = {
+	.statfs = cifs_statfs,
+	.alloc_inode = cifs_alloc_inode,
+	.destroy_inode = cifs_destroy_inode,
+	.drop_inode	= cifs_drop_inode,
+	.evict_inode	= cifs_evict_inode,
+/*	.delete_inode	= cifs_delete_inode,  */  /* Do not need above
+	function unless later we add lazy close of inodes or unless the
+	kernel forgets to call us with the same number of releases (closes)
+	as opens */
+	.show_options = cifs_show_options,
+	.umount_begin   = cifs_umount_begin,
+	.remount_fs = cifs_remount,
+#ifdef CONFIG_CIFS_STATS2
+	.show_stats = cifs_show_stats,
+#endif
+};
+
+/*
+ * Get root dentry from superblock according to prefix path mount option.
+ * Return dentry with refcount + 1 on success and NULL otherwise.
+ */
+static struct dentry *
+cifs_get_root(struct smb_vol *vol, struct super_block *sb)
+{
+	struct dentry *dentry;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	char *full_path = NULL;
+	char *s, *p;
+	char sep;
+
+	full_path = cifs_build_path_to_root(vol, cifs_sb,
+					    cifs_sb_master_tcon(cifs_sb));
+	if (full_path == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	cifs_dbg(FYI, "Get root dentry for %s\n", full_path);
+
+	sep = CIFS_DIR_SEP(cifs_sb);
+	dentry = dget(sb->s_root);
+	p = s = full_path;
+
+	do {
+		struct inode *dir = d_inode(dentry);
+		struct dentry *child;
+
+		if (!dir) {
+			dput(dentry);
+			dentry = ERR_PTR(-ENOENT);
+			break;
+		}
+		if (!S_ISDIR(dir->i_mode)) {
+			dput(dentry);
+			dentry = ERR_PTR(-ENOTDIR);
+			break;
+		}
+
+		/* skip separators */
+		while (*s == sep)
+			s++;
+		if (!*s)
+			break;
+		p = s++;
+		/* next separator */
+		while (*s && *s != sep)
+			s++;
+
+		mutex_lock(&dir->i_mutex);
+		child = lookup_one_len(p, dentry, s - p);
+		mutex_unlock(&dir->i_mutex);
+		dput(dentry);
+		dentry = child;
+	} while (!IS_ERR(dentry));
+	kfree(full_path);
+	return dentry;
+}
+
+static int cifs_set_super(struct super_block *sb, void *data)
+{
+	struct cifs_mnt_data *mnt_data = data;
+	sb->s_fs_info = mnt_data->cifs_sb;
+	return set_anon_super(sb, NULL);
+}
+
+static struct dentry *
+cifs_do_mount(struct file_system_type *fs_type,
+	      int flags, const char *dev_name, void *data)
+{
+	int rc;
+	struct super_block *sb;
+	struct cifs_sb_info *cifs_sb;
+	struct smb_vol *volume_info;
+	struct cifs_mnt_data mnt_data;
+	struct dentry *root;
+
+	cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
+
+	volume_info = cifs_get_volume_info((char *)data, dev_name);
+	if (IS_ERR(volume_info))
+		return ERR_CAST(volume_info);
+
+	cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
+	if (cifs_sb == NULL) {
+		root = ERR_PTR(-ENOMEM);
+		goto out_nls;
+	}
+
+	cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
+	if (cifs_sb->mountdata == NULL) {
+		root = ERR_PTR(-ENOMEM);
+		goto out_cifs_sb;
+	}
+
+	cifs_setup_cifs_sb(volume_info, cifs_sb);
+
+	rc = cifs_mount(cifs_sb, volume_info);
+	if (rc) {
+		if (!(flags & MS_SILENT))
+			cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
+				 rc);
+		root = ERR_PTR(rc);
+		goto out_mountdata;
+	}
+
+	mnt_data.vol = volume_info;
+	mnt_data.cifs_sb = cifs_sb;
+	mnt_data.flags = flags;
+
+	/* BB should we make this contingent on mount parm? */
+	flags |= MS_NODIRATIME | MS_NOATIME;
+
+	sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
+	if (IS_ERR(sb)) {
+		root = ERR_CAST(sb);
+		cifs_umount(cifs_sb);
+		goto out;
+	}
+
+	if (sb->s_root) {
+		cifs_dbg(FYI, "Use existing superblock\n");
+		cifs_umount(cifs_sb);
+	} else {
+		rc = cifs_read_super(sb);
+		if (rc) {
+			root = ERR_PTR(rc);
+			goto out_super;
+		}
+
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	root = cifs_get_root(volume_info, sb);
+	if (IS_ERR(root))
+		goto out_super;
+
+	cifs_dbg(FYI, "dentry root is: %p\n", root);
+	goto out;
+
+out_super:
+	deactivate_locked_super(sb);
+out:
+	cifs_cleanup_volume_info(volume_info);
+	return root;
+
+out_mountdata:
+	kfree(cifs_sb->mountdata);
+out_cifs_sb:
+	kfree(cifs_sb);
+out_nls:
+	unload_nls(volume_info->local_nls);
+	goto out;
+}
+
+static ssize_t
+cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	ssize_t rc;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	rc = cifs_revalidate_mapping(inode);
+	if (rc)
+		return rc;
+
+	return generic_file_read_iter(iocb, iter);
+}
+
+static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	ssize_t written;
+	int rc;
+
+	written = cifs_get_writer(cinode);
+	if (written)
+		return written;
+
+	written = generic_file_write_iter(iocb, from);
+
+	if (CIFS_CACHE_WRITE(CIFS_I(inode)))
+		goto out;
+
+	rc = filemap_fdatawrite(inode->i_mapping);
+	if (rc)
+		cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
+			 rc, inode);
+
+out:
+	cifs_put_writer(cinode);
+	return written;
+}
+
+static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
+{
+	/*
+	 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
+	 * the cached file length
+	 */
+	if (whence != SEEK_SET && whence != SEEK_CUR) {
+		int rc;
+		struct inode *inode = file_inode(file);
+
+		/*
+		 * We need to be sure that all dirty pages are written and the
+		 * server has the newest file length.
+		 */
+		if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
+		    inode->i_mapping->nrpages != 0) {
+			rc = filemap_fdatawait(inode->i_mapping);
+			if (rc) {
+				mapping_set_error(inode->i_mapping, rc);
+				return rc;
+			}
+		}
+		/*
+		 * Some applications poll for the file length in this strange
+		 * way so we must seek to end on non-oplocked files by
+		 * setting the revalidate time to zero.
+		 */
+		CIFS_I(inode)->time = 0;
+
+		rc = cifs_revalidate_file_attr(file);
+		if (rc < 0)
+			return (loff_t)rc;
+	}
+	return generic_file_llseek(file, offset, whence);
+}
+
+static int
+cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv)
+{
+	/*
+	 * Note that this is called by vfs setlease with i_lock held to
+	 * protect *lease from going away.
+	 */
+	struct inode *inode = file_inode(file);
+	struct cifsFileInfo *cfile = file->private_data;
+
+	if (!(S_ISREG(inode->i_mode)))
+		return -EINVAL;
+
+	/* Check if file is oplocked if this is request for new lease */
+	if (arg == F_UNLCK ||
+	    ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
+	    ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
+		return generic_setlease(file, arg, lease, priv);
+	else if (tlink_tcon(cfile->tlink)->local_lease &&
+		 !CIFS_CACHE_READ(CIFS_I(inode)))
+		/*
+		 * If the server claims to support oplock on this file, then we
+		 * still need to check oplock even if the local_lease mount
+		 * option is set, but there are servers which do not support
+		 * oplock for which this mount option may be useful if the user
+		 * knows that the file won't be changed on the server by anyone
+		 * else.
+		 */
+		return generic_setlease(file, arg, lease, priv);
+	else
+		return -EAGAIN;
+}
+
+struct file_system_type cifs_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "cifs",
+	.mount = cifs_do_mount,
+	.kill_sb = cifs_kill_sb,
+	/*  .fs_flags */
+};
+MODULE_ALIAS_FS("cifs");
+const struct inode_operations cifs_dir_inode_ops = {
+	.create = cifs_create,
+	.atomic_open = cifs_atomic_open,
+	.lookup = cifs_lookup,
+	.getattr = cifs_getattr,
+	.unlink = cifs_unlink,
+	.link = cifs_hardlink,
+	.mkdir = cifs_mkdir,
+	.rmdir = cifs_rmdir,
+	.rename2 = cifs_rename2,
+	.permission = cifs_permission,
+/*	revalidate:cifs_revalidate,   */
+	.setattr = cifs_setattr,
+	.symlink = cifs_symlink,
+	.mknod   = cifs_mknod,
+#ifdef CONFIG_CIFS_XATTR
+	.setxattr = cifs_setxattr,
+	.getxattr = cifs_getxattr,
+	.listxattr = cifs_listxattr,
+	.removexattr = cifs_removexattr,
+#endif
+};
+
+const struct inode_operations cifs_file_inode_ops = {
+/*	revalidate:cifs_revalidate, */
+	.setattr = cifs_setattr,
+	.getattr = cifs_getattr, /* do we need this anymore? */
+	.permission = cifs_permission,
+#ifdef CONFIG_CIFS_XATTR
+	.setxattr = cifs_setxattr,
+	.getxattr = cifs_getxattr,
+	.listxattr = cifs_listxattr,
+	.removexattr = cifs_removexattr,
+#endif
+};
+
+const struct inode_operations cifs_symlink_inode_ops = {
+	.readlink = generic_readlink,
+	.follow_link = cifs_follow_link,
+	.put_link = kfree_put_link,
+	.permission = cifs_permission,
+	/* BB add the following two eventually */
+	/* revalidate: cifs_revalidate,
+	   setattr:    cifs_notify_change, *//* BB do we need notify change */
+#ifdef CONFIG_CIFS_XATTR
+	.setxattr = cifs_setxattr,
+	.getxattr = cifs_getxattr,
+	.listxattr = cifs_listxattr,
+	.removexattr = cifs_removexattr,
+#endif
+};
+
+const struct file_operations cifs_file_ops = {
+	.read_iter = cifs_loose_read_iter,
+	.write_iter = cifs_file_write_iter,
+	.open = cifs_open,
+	.release = cifs_close,
+	.lock = cifs_lock,
+	.fsync = cifs_fsync,
+	.flush = cifs_flush,
+	.mmap  = cifs_file_mmap,
+	.splice_read = generic_file_splice_read,
+	.llseek = cifs_llseek,
+#ifdef CONFIG_CIFS_POSIX
+	.unlocked_ioctl	= cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+	.setlease = cifs_setlease,
+	.fallocate = cifs_fallocate,
+};
+
+const struct file_operations cifs_file_strict_ops = {
+	.read_iter = cifs_strict_readv,
+	.write_iter = cifs_strict_writev,
+	.open = cifs_open,
+	.release = cifs_close,
+	.lock = cifs_lock,
+	.fsync = cifs_strict_fsync,
+	.flush = cifs_flush,
+	.mmap = cifs_file_strict_mmap,
+	.splice_read = generic_file_splice_read,
+	.llseek = cifs_llseek,
+#ifdef CONFIG_CIFS_POSIX
+	.unlocked_ioctl	= cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+	.setlease = cifs_setlease,
+	.fallocate = cifs_fallocate,
+};
+
+const struct file_operations cifs_file_direct_ops = {
+	/* BB reevaluate whether they can be done with directio, no cache */
+	.read_iter = cifs_user_readv,
+	.write_iter = cifs_user_writev,
+	.open = cifs_open,
+	.release = cifs_close,
+	.lock = cifs_lock,
+	.fsync = cifs_fsync,
+	.flush = cifs_flush,
+	.mmap = cifs_file_mmap,
+	.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CIFS_POSIX
+	.unlocked_ioctl  = cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+	.llseek = cifs_llseek,
+	.setlease = cifs_setlease,
+	.fallocate = cifs_fallocate,
+};
+
+const struct file_operations cifs_file_nobrl_ops = {
+	.read_iter = cifs_loose_read_iter,
+	.write_iter = cifs_file_write_iter,
+	.open = cifs_open,
+	.release = cifs_close,
+	.fsync = cifs_fsync,
+	.flush = cifs_flush,
+	.mmap  = cifs_file_mmap,
+	.splice_read = generic_file_splice_read,
+	.llseek = cifs_llseek,
+#ifdef CONFIG_CIFS_POSIX
+	.unlocked_ioctl	= cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+	.setlease = cifs_setlease,
+	.fallocate = cifs_fallocate,
+};
+
+const struct file_operations cifs_file_strict_nobrl_ops = {
+	.read_iter = cifs_strict_readv,
+	.write_iter = cifs_strict_writev,
+	.open = cifs_open,
+	.release = cifs_close,
+	.fsync = cifs_strict_fsync,
+	.flush = cifs_flush,
+	.mmap = cifs_file_strict_mmap,
+	.splice_read = generic_file_splice_read,
+	.llseek = cifs_llseek,
+#ifdef CONFIG_CIFS_POSIX
+	.unlocked_ioctl	= cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+	.setlease = cifs_setlease,
+	.fallocate = cifs_fallocate,
+};
+
+const struct file_operations cifs_file_direct_nobrl_ops = {
+	/* BB reevaluate whether they can be done with directio, no cache */
+	.read_iter = cifs_user_readv,
+	.write_iter = cifs_user_writev,
+	.open = cifs_open,
+	.release = cifs_close,
+	.fsync = cifs_fsync,
+	.flush = cifs_flush,
+	.mmap = cifs_file_mmap,
+	.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CIFS_POSIX
+	.unlocked_ioctl  = cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+	.llseek = cifs_llseek,
+	.setlease = cifs_setlease,
+	.fallocate = cifs_fallocate,
+};
+
+const struct file_operations cifs_dir_ops = {
+	.iterate = cifs_readdir,
+	.release = cifs_closedir,
+	.read    = generic_read_dir,
+	.unlocked_ioctl  = cifs_ioctl,
+	.llseek = generic_file_llseek,
+};
+
+static void
+cifs_init_once(void *inode)
+{
+	struct cifsInodeInfo *cifsi = inode;
+
+	inode_init_once(&cifsi->vfs_inode);
+	init_rwsem(&cifsi->lock_sem);
+}
+
+static int __init
+cifs_init_inodecache(void)
+{
+	cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
+					      sizeof(struct cifsInodeInfo),
+					      0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					      cifs_init_once);
+	if (cifs_inode_cachep == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void
+cifs_destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(cifs_inode_cachep);
+}
+
+static int
+cifs_init_request_bufs(void)
+{
+	size_t max_hdr_size = MAX_CIFS_HDR_SIZE;
+#ifdef CONFIG_CIFS_SMB2
+	/*
+	 * SMB2 maximum header size is bigger than CIFS one - no problems to
+	 * allocate some more bytes for CIFS.
+	 */
+	max_hdr_size = MAX_SMB2_HDR_SIZE;
+#endif
+	if (CIFSMaxBufSize < 8192) {
+	/* Buffer size can not be smaller than 2 * PATH_MAX since maximum
+	Unicode path name has to fit in any SMB/CIFS path based frames */
+		CIFSMaxBufSize = 8192;
+	} else if (CIFSMaxBufSize > 1024*127) {
+		CIFSMaxBufSize = 1024 * 127;
+	} else {
+		CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
+	}
+/*
+	cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n",
+		 CIFSMaxBufSize, CIFSMaxBufSize);
+*/
+	cifs_req_cachep = kmem_cache_create("cifs_request",
+					    CIFSMaxBufSize + max_hdr_size, 0,
+					    SLAB_HWCACHE_ALIGN, NULL);
+	if (cifs_req_cachep == NULL)
+		return -ENOMEM;
+
+	if (cifs_min_rcv < 1)
+		cifs_min_rcv = 1;
+	else if (cifs_min_rcv > 64) {
+		cifs_min_rcv = 64;
+		cifs_dbg(VFS, "cifs_min_rcv set to maximum (64)\n");
+	}
+
+	cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
+						  cifs_req_cachep);
+
+	if (cifs_req_poolp == NULL) {
+		kmem_cache_destroy(cifs_req_cachep);
+		return -ENOMEM;
+	}
+	/* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and
+	almost all handle based requests (but not write response, nor is it
+	sufficient for path based requests).  A smaller size would have
+	been more efficient (compacting multiple slab items on one 4k page)
+	for the case in which debug was on, but this larger size allows
+	more SMBs to use small buffer alloc and is still much more
+	efficient to alloc 1 per page off the slab compared to 17K (5page)
+	alloc of large cifs buffers even when page debugging is on */
+	cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq",
+			MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN,
+			NULL);
+	if (cifs_sm_req_cachep == NULL) {
+		mempool_destroy(cifs_req_poolp);
+		kmem_cache_destroy(cifs_req_cachep);
+		return -ENOMEM;
+	}
+
+	if (cifs_min_small < 2)
+		cifs_min_small = 2;
+	else if (cifs_min_small > 256) {
+		cifs_min_small = 256;
+		cifs_dbg(FYI, "cifs_min_small set to maximum (256)\n");
+	}
+
+	cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
+						     cifs_sm_req_cachep);
+
+	if (cifs_sm_req_poolp == NULL) {
+		mempool_destroy(cifs_req_poolp);
+		kmem_cache_destroy(cifs_req_cachep);
+		kmem_cache_destroy(cifs_sm_req_cachep);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void
+cifs_destroy_request_bufs(void)
+{
+	mempool_destroy(cifs_req_poolp);
+	kmem_cache_destroy(cifs_req_cachep);
+	mempool_destroy(cifs_sm_req_poolp);
+	kmem_cache_destroy(cifs_sm_req_cachep);
+}
+
+static int
+cifs_init_mids(void)
+{
+	cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids",
+					    sizeof(struct mid_q_entry), 0,
+					    SLAB_HWCACHE_ALIGN, NULL);
+	if (cifs_mid_cachep == NULL)
+		return -ENOMEM;
+
+	/* 3 is a reasonable minimum number of simultaneous operations */
+	cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
+	if (cifs_mid_poolp == NULL) {
+		kmem_cache_destroy(cifs_mid_cachep);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void
+cifs_destroy_mids(void)
+{
+	mempool_destroy(cifs_mid_poolp);
+	kmem_cache_destroy(cifs_mid_cachep);
+}
+
+static int __init
+init_cifs(void)
+{
+	int rc = 0;
+	cifs_proc_init();
+	INIT_LIST_HEAD(&cifs_tcp_ses_list);
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
+	INIT_LIST_HEAD(&GlobalDnotifyReqList);
+	INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
+/*
+ *  Initialize Global counters
+ */
+	atomic_set(&sesInfoAllocCount, 0);
+	atomic_set(&tconInfoAllocCount, 0);
+	atomic_set(&tcpSesAllocCount, 0);
+	atomic_set(&tcpSesReconnectCount, 0);
+	atomic_set(&tconInfoReconnectCount, 0);
+
+	atomic_set(&bufAllocCount, 0);
+	atomic_set(&smBufAllocCount, 0);
+#ifdef CONFIG_CIFS_STATS2
+	atomic_set(&totBufAllocCount, 0);
+	atomic_set(&totSmBufAllocCount, 0);
+#endif /* CONFIG_CIFS_STATS2 */
+
+	atomic_set(&midCount, 0);
+	GlobalCurrentXid = 0;
+	GlobalTotalActiveXid = 0;
+	GlobalMaxActiveXid = 0;
+	spin_lock_init(&cifs_tcp_ses_lock);
+	spin_lock_init(&cifs_file_list_lock);
+	spin_lock_init(&GlobalMid_Lock);
+
+	if (cifs_max_pending < 2) {
+		cifs_max_pending = 2;
+		cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
+	} else if (cifs_max_pending > CIFS_MAX_REQ) {
+		cifs_max_pending = CIFS_MAX_REQ;
+		cifs_dbg(FYI, "cifs_max_pending set to max of %u\n",
+			 CIFS_MAX_REQ);
+	}
+
+	cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+	if (!cifsiod_wq) {
+		rc = -ENOMEM;
+		goto out_clean_proc;
+	}
+
+	rc = cifs_fscache_register();
+	if (rc)
+		goto out_destroy_wq;
+
+	rc = cifs_init_inodecache();
+	if (rc)
+		goto out_unreg_fscache;
+
+	rc = cifs_init_mids();
+	if (rc)
+		goto out_destroy_inodecache;
+
+	rc = cifs_init_request_bufs();
+	if (rc)
+		goto out_destroy_mids;
+
+#ifdef CONFIG_CIFS_UPCALL
+	rc = register_key_type(&cifs_spnego_key_type);
+	if (rc)
+		goto out_destroy_request_bufs;
+#endif /* CONFIG_CIFS_UPCALL */
+
+#ifdef CONFIG_CIFS_ACL
+	rc = init_cifs_idmap();
+	if (rc)
+		goto out_register_key_type;
+#endif /* CONFIG_CIFS_ACL */
+
+	rc = register_filesystem(&cifs_fs_type);
+	if (rc)
+		goto out_init_cifs_idmap;
+
+	return 0;
+
+out_init_cifs_idmap:
+#ifdef CONFIG_CIFS_ACL
+	exit_cifs_idmap();
+out_register_key_type:
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+	unregister_key_type(&cifs_spnego_key_type);
+out_destroy_request_bufs:
+#endif
+	cifs_destroy_request_bufs();
+out_destroy_mids:
+	cifs_destroy_mids();
+out_destroy_inodecache:
+	cifs_destroy_inodecache();
+out_unreg_fscache:
+	cifs_fscache_unregister();
+out_destroy_wq:
+	destroy_workqueue(cifsiod_wq);
+out_clean_proc:
+	cifs_proc_clean();
+	return rc;
+}
+
+static void __exit
+exit_cifs(void)
+{
+	cifs_dbg(NOISY, "exit_cifs\n");
+	unregister_filesystem(&cifs_fs_type);
+	cifs_dfs_release_automount_timer();
+#ifdef CONFIG_CIFS_ACL
+	exit_cifs_idmap();
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+	unregister_key_type(&cifs_spnego_key_type);
+#endif
+	cifs_destroy_request_bufs();
+	cifs_destroy_mids();
+	cifs_destroy_inodecache();
+	cifs_fscache_unregister();
+	destroy_workqueue(cifsiod_wq);
+	cifs_proc_clean();
+}
+
+MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
+MODULE_LICENSE("GPL");	/* combination of LGPL + GPL source behaves as GPL */
+MODULE_DESCRIPTION
+    ("VFS to access servers complying with the SNIA CIFS Specification "
+     "e.g. Samba and Windows");
+MODULE_VERSION(CIFS_VERSION);
+module_init(init_cifs)
+module_exit(exit_cifs)
diff --git a/kernel/fs/cifs/cifsfs.h b/kernel/fs/cifs/cifsfs.h
new file mode 100644
index 000000000..252f5c158
--- /dev/null
+++ b/kernel/fs/cifs/cifsfs.h
@@ -0,0 +1,140 @@
+/*
+ *   fs/cifs/cifsfs.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002, 2007
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _CIFSFS_H
+#define _CIFSFS_H
+
+#include <linux/hash.h>
+
+#define ROOT_I 2
+
+/*
+ * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
+ * so that it will fit. We use hash_64 to convert the value to 31 bits, and
+ * then add 1, to ensure that we don't end up with a 0 as the value.
+ */
+#if BITS_PER_LONG == 64
+static inline ino_t
+cifs_uniqueid_to_ino_t(u64 fileid)
+{
+	return (ino_t)fileid;
+}
+#else
+static inline ino_t
+cifs_uniqueid_to_ino_t(u64 fileid)
+{
+	return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+}
+#endif
+
+extern struct file_system_type cifs_fs_type;
+extern const struct address_space_operations cifs_addr_ops;
+extern const struct address_space_operations cifs_addr_ops_smallbuf;
+
+/* Functions related to super block operations */
+extern void cifs_sb_active(struct super_block *sb);
+extern void cifs_sb_deactive(struct super_block *sb);
+
+/* Functions related to inodes */
+extern const struct inode_operations cifs_dir_inode_ops;
+extern struct inode *cifs_root_iget(struct super_block *);
+extern int cifs_create(struct inode *, struct dentry *, umode_t,
+		       bool excl);
+extern int cifs_atomic_open(struct inode *, struct dentry *,
+			    struct file *, unsigned, umode_t,
+			    int *);
+extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
+				  unsigned int);
+extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
+extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
+extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
+extern int cifs_rmdir(struct inode *, struct dentry *);
+extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
+			struct dentry *, unsigned int);
+extern int cifs_revalidate_file_attr(struct file *filp);
+extern int cifs_revalidate_dentry_attr(struct dentry *);
+extern int cifs_revalidate_file(struct file *filp);
+extern int cifs_revalidate_dentry(struct dentry *);
+extern int cifs_invalidate_mapping(struct inode *inode);
+extern int cifs_revalidate_mapping(struct inode *inode);
+extern int cifs_zap_mapping(struct inode *inode);
+extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int cifs_setattr(struct dentry *, struct iattr *);
+
+extern const struct inode_operations cifs_file_inode_ops;
+extern const struct inode_operations cifs_symlink_inode_ops;
+extern const struct inode_operations cifs_dfs_referral_inode_operations;
+
+
+/* Functions related to files and directories */
+extern const struct file_operations cifs_file_ops;
+extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
+extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
+extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
+extern const struct file_operations cifs_file_direct_nobrl_ops;
+extern const struct file_operations cifs_file_strict_nobrl_ops;
+extern int cifs_open(struct inode *inode, struct file *file);
+extern int cifs_close(struct inode *inode, struct file *file);
+extern int cifs_closedir(struct inode *inode, struct file *file);
+extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
+extern int cifs_lock(struct file *, int, struct file_lock *);
+extern int cifs_fsync(struct file *, loff_t, loff_t, int);
+extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
+extern int cifs_flush(struct file *, fl_owner_t id);
+extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
+extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
+extern const struct file_operations cifs_dir_ops;
+extern int cifs_dir_open(struct inode *inode, struct file *file);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
+
+/* Functions related to dir entries */
+extern const struct dentry_operations cifs_dentry_ops;
+extern const struct dentry_operations cifs_ci_dentry_ops;
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
+#else
+#define cifs_dfs_d_automount NULL
+#endif
+
+/* Functions related to symlinks */
+extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
+extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
+			 int buflen);
+extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
+			const char *symname);
+extern int	cifs_removexattr(struct dentry *, const char *);
+extern int	cifs_setxattr(struct dentry *, const char *, const void *,
+			size_t, int);
+extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
+extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+
+#ifdef CONFIG_CIFS_NFSD_EXPORT
+extern const struct export_operations cifs_export_ops;
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
+
+#define CIFS_VERSION   "2.06"
+#endif				/* _CIFSFS_H */
diff --git a/kernel/fs/cifs/cifsglob.h b/kernel/fs/cifs/cifsglob.h
new file mode 100644
index 000000000..22b289a3b
--- /dev/null
+++ b/kernel/fs/cifs/cifsglob.h
@@ -0,0 +1,1620 @@
+/*
+ *   fs/cifs/cifsglob.h
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Jeremy Allison (jra@samba.org)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ */
+#ifndef _CIFS_GLOB_H
+#define _CIFS_GLOB_H
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/workqueue.h>
+#include "cifs_fs_sb.h"
+#include "cifsacl.h"
+#include <crypto/internal/hash.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/cifs/cifs_mount.h>
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2pdu.h"
+#endif
+
+#define CIFS_MAGIC_NUMBER 0xFF534D42      /* the first four bytes of SMB PDUs */
+
+/*
+ * The sizes of various internal tables and strings
+ */
+#define MAX_UID_INFO 16
+#define MAX_SES_INFO 2
+#define MAX_TCON_INFO 4
+
+#define MAX_TREE_SIZE (2 + CIFS_NI_MAXHOST + 1 + CIFS_MAX_SHARE_LEN + 1)
+
+#define CIFS_MIN_RCV_POOL 4
+
+#define MAX_REOPEN_ATT	5 /* these many maximum attempts to reopen a file */
+/*
+ * default attribute cache timeout (jiffies)
+ */
+#define CIFS_DEF_ACTIMEO (1 * HZ)
+
+/*
+ * max attribute cache timeout (jiffies) - 2^30
+ */
+#define CIFS_MAX_ACTIMEO (1 << 30)
+
+/*
+ * MAX_REQ is the maximum number of requests that WE will send
+ * on one socket concurrently.
+ */
+#define CIFS_MAX_REQ 32767
+
+#define RFC1001_NAME_LEN 15
+#define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1)
+
+/* currently length of NIP6_FMT */
+#define SERVER_NAME_LENGTH 40
+#define SERVER_NAME_LEN_WITH_NULL     (SERVER_NAME_LENGTH + 1)
+
+/* SMB echo "timeout" -- FIXME: tunable? */
+#define SMB_ECHO_INTERVAL (60 * HZ)
+
+#include "cifspdu.h"
+
+#ifndef XATTR_DOS_ATTRIB
+#define XATTR_DOS_ATTRIB "user.DOSATTRIB"
+#endif
+
+/*
+ * CIFS vfs client Status information (based on what we know.)
+ */
+
+/* associated with each tcp and smb session */
+enum statusEnum {
+	CifsNew = 0,
+	CifsGood,
+	CifsExiting,
+	CifsNeedReconnect,
+	CifsNeedNegotiate
+};
+
+enum securityEnum {
+	Unspecified = 0,	/* not specified */
+	LANMAN,			/* Legacy LANMAN auth */
+	NTLM,			/* Legacy NTLM012 auth with NTLM hash */
+	NTLMv2,			/* Legacy NTLM auth with NTLMv2 hash */
+	RawNTLMSSP,		/* NTLMSSP without SPNEGO, NTLMv2 hash */
+	Kerberos,		/* Kerberos via SPNEGO */
+};
+
+struct session_key {
+	unsigned int len;
+	char *response;
+};
+
+/* crypto security descriptor definition */
+struct sdesc {
+	struct shash_desc shash;
+	char ctx[];
+};
+
+/* crypto hashing related structure/fields, not specific to a sec mech */
+struct cifs_secmech {
+	struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
+	struct crypto_shash *md5; /* md5 hash function */
+	struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
+	struct crypto_shash *cmacaes; /* block-cipher based MAC function */
+	struct sdesc *sdeschmacmd5;  /* ctxt to generate ntlmv2 hash, CR1 */
+	struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
+	struct sdesc *sdeschmacsha256;  /* ctxt to generate smb2 signature */
+	struct sdesc *sdesccmacaes;  /* ctxt to generate smb3 signature */
+};
+
+/* per smb session structure/fields */
+struct ntlmssp_auth {
+	bool sesskey_per_smbsess; /* whether session key is per smb session */
+	__u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */
+	__u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
+	unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
+	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
+};
+
+struct cifs_cred {
+	int uid;
+	int gid;
+	int mode;
+	int cecount;
+	struct cifs_sid osid;
+	struct cifs_sid gsid;
+	struct cifs_ntace *ntaces;
+	struct cifs_ace *aces;
+};
+
+/*
+ *****************************************************************
+ * Except the CIFS PDUs themselves all the
+ * globally interesting structs should go here
+ *****************************************************************
+ */
+
+/*
+ * A smb_rqst represents a complete request to be issued to a server. It's
+ * formed by a kvec array, followed by an array of pages. Page data is assumed
+ * to start at the beginning of the first page.
+ */
+struct smb_rqst {
+	struct kvec	*rq_iov;	/* array of kvecs */
+	unsigned int	rq_nvec;	/* number of kvecs in array */
+	struct page	**rq_pages;	/* pointer to array of page ptrs */
+	unsigned int	rq_npages;	/* number pages in array */
+	unsigned int	rq_pagesz;	/* page size to use */
+	unsigned int	rq_tailsz;	/* length of last page */
+};
+
+enum smb_version {
+	Smb_1 = 1,
+	Smb_20,
+	Smb_21,
+	Smb_30,
+	Smb_302,
+};
+
+struct mid_q_entry;
+struct TCP_Server_Info;
+struct cifsFileInfo;
+struct cifs_ses;
+struct cifs_tcon;
+struct dfs_info3_param;
+struct cifs_fattr;
+struct smb_vol;
+struct cifs_fid;
+struct cifs_readdata;
+struct cifs_writedata;
+struct cifs_io_parms;
+struct cifs_search_info;
+struct cifsInodeInfo;
+struct cifs_open_parms;
+
+struct smb_version_operations {
+	int (*send_cancel)(struct TCP_Server_Info *, void *,
+			   struct mid_q_entry *);
+	bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *);
+	/* setup request: allocate mid, sign message */
+	struct mid_q_entry *(*setup_request)(struct cifs_ses *,
+						struct smb_rqst *);
+	/* setup async request: allocate mid, sign message */
+	struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *,
+						struct smb_rqst *);
+	/* check response: verify signature, map error */
+	int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
+			     bool);
+	void (*add_credits)(struct TCP_Server_Info *, const unsigned int,
+			    const int);
+	void (*set_credits)(struct TCP_Server_Info *, const int);
+	int * (*get_credits_field)(struct TCP_Server_Info *, const int);
+	unsigned int (*get_credits)(struct mid_q_entry *);
+	__u64 (*get_next_mid)(struct TCP_Server_Info *);
+	/* data offset from read response message */
+	unsigned int (*read_data_offset)(char *);
+	/* data length from read response message */
+	unsigned int (*read_data_length)(char *);
+	/* map smb to linux error */
+	int (*map_error)(char *, bool);
+	/* find mid corresponding to the response message */
+	struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *);
+	void (*dump_detail)(void *);
+	void (*clear_stats)(struct cifs_tcon *);
+	void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
+	void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
+	/* verify the message */
+	int (*check_message)(char *, unsigned int);
+	bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+	void (*downgrade_oplock)(struct TCP_Server_Info *,
+					struct cifsInodeInfo *, bool);
+	/* process transaction2 response */
+	bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
+			     char *, int);
+	/* check if we need to negotiate */
+	bool (*need_neg)(struct TCP_Server_Info *);
+	/* negotiate to the server */
+	int (*negotiate)(const unsigned int, struct cifs_ses *);
+	/* set negotiated write size */
+	unsigned int (*negotiate_wsize)(struct cifs_tcon *, struct smb_vol *);
+	/* set negotiated read size */
+	unsigned int (*negotiate_rsize)(struct cifs_tcon *, struct smb_vol *);
+	/* setup smb sessionn */
+	int (*sess_setup)(const unsigned int, struct cifs_ses *,
+			  const struct nls_table *);
+	/* close smb session */
+	int (*logoff)(const unsigned int, struct cifs_ses *);
+	/* connect to a server share */
+	int (*tree_connect)(const unsigned int, struct cifs_ses *, const char *,
+			    struct cifs_tcon *, const struct nls_table *);
+	/* close tree connecion */
+	int (*tree_disconnect)(const unsigned int, struct cifs_tcon *);
+	/* get DFS referrals */
+	int (*get_dfs_refer)(const unsigned int, struct cifs_ses *,
+			     const char *, struct dfs_info3_param **,
+			     unsigned int *, const struct nls_table *, int);
+	/* informational QFS call */
+	void (*qfs_tcon)(const unsigned int, struct cifs_tcon *);
+	/* check if a path is accessible or not */
+	int (*is_path_accessible)(const unsigned int, struct cifs_tcon *,
+				  struct cifs_sb_info *, const char *);
+	/* query path data from the server */
+	int (*query_path_info)(const unsigned int, struct cifs_tcon *,
+			       struct cifs_sb_info *, const char *,
+			       FILE_ALL_INFO *, bool *, bool *);
+	/* query file data from the server */
+	int (*query_file_info)(const unsigned int, struct cifs_tcon *,
+			       struct cifs_fid *, FILE_ALL_INFO *);
+	/* get server index number */
+	int (*get_srv_inum)(const unsigned int, struct cifs_tcon *,
+			    struct cifs_sb_info *, const char *,
+			    u64 *uniqueid, FILE_ALL_INFO *);
+	/* set size by path */
+	int (*set_path_size)(const unsigned int, struct cifs_tcon *,
+			     const char *, __u64, struct cifs_sb_info *, bool);
+	/* set size by file handle */
+	int (*set_file_size)(const unsigned int, struct cifs_tcon *,
+			     struct cifsFileInfo *, __u64, bool);
+	/* set attributes */
+	int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
+			     const unsigned int);
+	int (*set_compression)(const unsigned int, struct cifs_tcon *,
+			       struct cifsFileInfo *);
+	/* check if we can send an echo or nor */
+	bool (*can_echo)(struct TCP_Server_Info *);
+	/* send echo request */
+	int (*echo)(struct TCP_Server_Info *);
+	/* create directory */
+	int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *,
+		     struct cifs_sb_info *);
+	/* set info on created directory */
+	void (*mkdir_setinfo)(struct inode *, const char *,
+			      struct cifs_sb_info *, struct cifs_tcon *,
+			      const unsigned int);
+	/* remove directory */
+	int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *,
+		     struct cifs_sb_info *);
+	/* unlink file */
+	int (*unlink)(const unsigned int, struct cifs_tcon *, const char *,
+		      struct cifs_sb_info *);
+	/* open, rename and delete file */
+	int (*rename_pending_delete)(const char *, struct dentry *,
+				     const unsigned int);
+	/* send rename request */
+	int (*rename)(const unsigned int, struct cifs_tcon *, const char *,
+		      const char *, struct cifs_sb_info *);
+	/* send create hardlink request */
+	int (*create_hardlink)(const unsigned int, struct cifs_tcon *,
+			       const char *, const char *,
+			       struct cifs_sb_info *);
+	/* query symlink target */
+	int (*query_symlink)(const unsigned int, struct cifs_tcon *,
+			     const char *, char **, struct cifs_sb_info *);
+	/* open a file for non-posix mounts */
+	int (*open)(const unsigned int, struct cifs_open_parms *,
+		    __u32 *, FILE_ALL_INFO *);
+	/* set fid protocol-specific info */
+	void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
+	/* close a file */
+	void (*close)(const unsigned int, struct cifs_tcon *,
+		      struct cifs_fid *);
+	/* send a flush request to the server */
+	int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *);
+	/* async read from the server */
+	int (*async_readv)(struct cifs_readdata *);
+	/* async write to the server */
+	int (*async_writev)(struct cifs_writedata *,
+			    void (*release)(struct kref *));
+	/* sync read from the server */
+	int (*sync_read)(const unsigned int, struct cifs_fid *,
+			 struct cifs_io_parms *, unsigned int *, char **,
+			 int *);
+	/* sync write to the server */
+	int (*sync_write)(const unsigned int, struct cifs_fid *,
+			  struct cifs_io_parms *, unsigned int *, struct kvec *,
+			  unsigned long);
+	/* open dir, start readdir */
+	int (*query_dir_first)(const unsigned int, struct cifs_tcon *,
+			       const char *, struct cifs_sb_info *,
+			       struct cifs_fid *, __u16,
+			       struct cifs_search_info *);
+	/* continue readdir */
+	int (*query_dir_next)(const unsigned int, struct cifs_tcon *,
+			      struct cifs_fid *,
+			      __u16, struct cifs_search_info *srch_inf);
+	/* close dir */
+	int (*close_dir)(const unsigned int, struct cifs_tcon *,
+			 struct cifs_fid *);
+	/* calculate a size of SMB message */
+	unsigned int (*calc_smb_size)(void *);
+	/* check for STATUS_PENDING and process it in a positive case */
+	bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
+	/* send oplock break response */
+	int (*oplock_response)(struct cifs_tcon *, struct cifs_fid *,
+			       struct cifsInodeInfo *);
+	/* query remote filesystem */
+	int (*queryfs)(const unsigned int, struct cifs_tcon *,
+		       struct kstatfs *);
+	/* send mandatory brlock to the server */
+	int (*mand_lock)(const unsigned int, struct cifsFileInfo *, __u64,
+			 __u64, __u32, int, int, bool);
+	/* unlock range of mandatory locks */
+	int (*mand_unlock_range)(struct cifsFileInfo *, struct file_lock *,
+				 const unsigned int);
+	/* push brlocks from the cache to the server */
+	int (*push_mand_locks)(struct cifsFileInfo *);
+	/* get lease key of the inode */
+	void (*get_lease_key)(struct inode *, struct cifs_fid *);
+	/* set lease key of the inode */
+	void (*set_lease_key)(struct inode *, struct cifs_fid *);
+	/* generate new lease key */
+	void (*new_lease_key)(struct cifs_fid *);
+	int (*generate_signingkey)(struct cifs_ses *);
+	int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
+	int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
+				struct cifs_sb_info *, const unsigned char *,
+				char *, unsigned int *);
+	int (*create_mf_symlink)(unsigned int, struct cifs_tcon *,
+				 struct cifs_sb_info *, const unsigned char *,
+				 char *, unsigned int *);
+	/* if we can do cache read operations */
+	bool (*is_read_op)(__u32);
+	/* set oplock level for the inode */
+	void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int,
+				 bool *);
+	/* create lease context buffer for CREATE request */
+	char * (*create_lease_buf)(u8 *, u8);
+	/* parse lease context buffer and return oplock/epoch info */
+	__u8 (*parse_lease_buf)(void *, unsigned int *);
+	int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
+			struct cifsFileInfo *target_file, u64 src_off, u64 len,
+			u64 dest_off);
+	int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
+	ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
+			const unsigned char *, const unsigned char *, char *,
+			size_t, const struct nls_table *, int);
+	int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
+			const char *, const void *, const __u16,
+			const struct nls_table *, int);
+	struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *,
+			const char *, u32 *);
+	struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *,
+			const struct cifs_fid *, u32 *);
+	int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
+			int);
+	/* writepages retry size */
+	unsigned int (*wp_retry_size)(struct inode *);
+	/* get mtu credits */
+	int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
+				unsigned int *, unsigned int *);
+	/* check if we need to issue closedir */
+	bool (*dir_needs_close)(struct cifsFileInfo *);
+	long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
+			  loff_t);
+};
+
+struct smb_version_values {
+	char		*version_string;
+	__u16		protocol_id;
+	__u32		req_capabilities;
+	__u32		large_lock_type;
+	__u32		exclusive_lock_type;
+	__u32		shared_lock_type;
+	__u32		unlock_lock_type;
+	size_t		header_size;
+	size_t		max_header_size;
+	size_t		read_rsp_size;
+	__le16		lock_cmd;
+	unsigned int	cap_unix;
+	unsigned int	cap_nt_find;
+	unsigned int	cap_large_files;
+	__u16		signing_enabled;
+	__u16		signing_required;
+	size_t		create_lease_size;
+};
+
+#define HEADER_SIZE(server) (server->vals->header_size)
+#define MAX_HEADER_SIZE(server) (server->vals->max_header_size)
+
+struct smb_vol {
+	char *username;
+	char *password;
+	char *domainname;
+	char *UNC;
+	char *iocharset;  /* local code page for mapping to and from Unicode */
+	char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
+	char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
+	kuid_t cred_uid;
+	kuid_t linux_uid;
+	kgid_t linux_gid;
+	kuid_t backupuid;
+	kgid_t backupgid;
+	umode_t file_mode;
+	umode_t dir_mode;
+	enum securityEnum sectype; /* sectype requested via mnt opts */
+	bool sign; /* was signing requested via mnt opts? */
+	bool retry:1;
+	bool intr:1;
+	bool setuids:1;
+	bool override_uid:1;
+	bool override_gid:1;
+	bool dynperm:1;
+	bool noperm:1;
+	bool no_psx_acl:1; /* set if posix acl support should be disabled */
+	bool cifs_acl:1;
+	bool backupuid_specified; /* mount option  backupuid  is specified */
+	bool backupgid_specified; /* mount option  backupgid  is specified */
+	bool no_xattr:1;   /* set if xattr (EA) support should be disabled*/
+	bool server_ino:1; /* use inode numbers from server ie UniqueId */
+	bool direct_io:1;
+	bool strict_io:1; /* strict cache behavior */
+	bool remap:1;      /* set to remap seven reserved chars in filenames */
+	bool sfu_remap:1;  /* remap seven reserved chars ala SFU */
+	bool posix_paths:1; /* unset to not ask for posix pathnames. */
+	bool no_linux_ext:1;
+	bool sfu_emul:1;
+	bool nullauth:1;   /* attempt to authenticate with null user */
+	bool nocase:1;     /* request case insensitive filenames */
+	bool nobrl:1;      /* disable sending byte range locks to srv */
+	bool mand_lock:1;  /* send mandatory not posix byte range lock reqs */
+	bool seal:1;       /* request transport encryption on share */
+	bool nodfs:1;      /* Do not request DFS, even if available */
+	bool local_lease:1; /* check leases only on local system, not remote */
+	bool noblocksnd:1;
+	bool noautotune:1;
+	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
+	bool fsc:1;	/* enable fscache */
+	bool mfsymlinks:1; /* use Minshall+French Symlinks */
+	bool multiuser:1;
+	bool rwpidforward:1; /* pid forward for read/write operations */
+	bool nosharesock;
+	unsigned int rsize;
+	unsigned int wsize;
+	bool sockopt_tcp_nodelay:1;
+	unsigned long actimeo; /* attribute cache timeout (jiffies) */
+	struct smb_version_operations *ops;
+	struct smb_version_values *vals;
+	char *prepath;
+	struct sockaddr_storage dstaddr; /* destination address */
+	struct sockaddr_storage srcaddr; /* allow binding to a local IP */
+	struct nls_table *local_nls;
+};
+
+#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
+			 CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \
+			 CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \
+			 CIFS_MOUNT_MAP_SFM_CHR | \
+			 CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \
+			 CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \
+			 CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
+			 CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
+			 CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
+			 CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
+			 CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
+
+#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
+		      MS_NODEV | MS_SYNCHRONOUS)
+
+struct cifs_mnt_data {
+	struct cifs_sb_info *cifs_sb;
+	struct smb_vol *vol;
+	int flags;
+};
+
+static inline unsigned int
+get_rfc1002_length(void *buf)
+{
+	return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
+}
+
+static inline void
+inc_rfc1001_len(void *buf, int count)
+{
+	be32_add_cpu((__be32 *)buf, count);
+}
+
+struct TCP_Server_Info {
+	struct list_head tcp_ses_list;
+	struct list_head smb_ses_list;
+	int srv_count; /* reference counter */
+	/* 15 character server name + 0x20 16th byte indicating type = srv */
+	char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
+	struct smb_version_operations	*ops;
+	struct smb_version_values	*vals;
+	enum statusEnum tcpStatus; /* what we think the status is */
+	char *hostname; /* hostname portion of UNC string */
+	struct socket *ssocket;
+	struct sockaddr_storage dstaddr;
+	struct sockaddr_storage srcaddr; /* locally bind to this IP */
+#ifdef CONFIG_NET_NS
+	struct net *net;
+#endif
+	wait_queue_head_t response_q;
+	wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
+	struct list_head pending_mid_q;
+	bool noblocksnd;		/* use blocking sendmsg */
+	bool noautotune;		/* do not autotune send buf sizes */
+	bool tcp_nodelay;
+	int credits;  /* send no more requests at once */
+	unsigned int in_flight;  /* number of requests on the wire to server */
+	spinlock_t req_lock;  /* protect the two values above */
+	struct mutex srv_mutex;
+	struct task_struct *tsk;
+	char server_GUID[16];
+	__u16 sec_mode;
+	bool sign; /* is signing enabled on this connection? */
+	bool session_estab; /* mark when very first sess is established */
+#ifdef CONFIG_CIFS_SMB2
+	int echo_credits;  /* echo reserved slots */
+	int oplock_credits;  /* oplock break reserved slots */
+	bool echoes:1; /* enable echoes */
+	__u8 client_guid[SMB2_CLIENT_GUID_SIZE]; /* Client GUID */
+#endif
+	u16 dialect; /* dialect index that server chose */
+	bool oplocks:1; /* enable oplocks */
+	unsigned int maxReq;	/* Clients should submit no more */
+	/* than maxReq distinct unanswered SMBs to the server when using  */
+	/* multiplexed reads or writes */
+	unsigned int maxBuf;	/* maxBuf specifies the maximum */
+	/* message size the server can send or receive for non-raw SMBs */
+	/* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
+	/* when socket is setup (and during reconnect) before NegProt sent */
+	unsigned int max_rw;	/* maxRw specifies the maximum */
+	/* message size the server can send or receive for */
+	/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
+	unsigned int capabilities; /* selective disabling of caps by smb sess */
+	int timeAdj;  /* Adjust for difference in server time zone in sec */
+	__u64 CurrentMid;         /* multiplex id - rotating counter */
+	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
+	/* 16th byte of RFC1001 workstation name is always null */
+	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
+	__u32 sequence_number; /* for signing, protected by srv_mutex */
+	struct session_key session_key;
+	unsigned long lstrp; /* when we got last response from this server */
+	struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
+#define	CIFS_NEGFLAVOR_LANMAN	0	/* wct == 13, LANMAN */
+#define	CIFS_NEGFLAVOR_UNENCAP	1	/* wct == 17, but no ext_sec */
+#define	CIFS_NEGFLAVOR_EXTENDED	2	/* wct == 17, ext_sec bit set */
+	char	negflavor;	/* NEGOTIATE response flavor */
+	/* extended security flavors that server supports */
+	bool	sec_ntlmssp;		/* supports NTLMSSP */
+	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
+	bool	sec_kerberos;		/* supports plain Kerberos */
+	bool	sec_mskerberos;		/* supports legacy MS Kerberos */
+	bool	large_buf;		/* is current buffer large? */
+	struct delayed_work	echo; /* echo ping workqueue job */
+	struct kvec *iov;	/* reusable kvec array for receives */
+	unsigned int nr_iov;	/* number of kvecs in array */
+	char	*smallbuf;	/* pointer to current "small" buffer */
+	char	*bigbuf;	/* pointer to current "big" buffer */
+	unsigned int total_read; /* total amount of data read in this pass */
+#ifdef CONFIG_CIFS_FSCACHE
+	struct fscache_cookie   *fscache; /* client index cache cookie */
+#endif
+#ifdef CONFIG_CIFS_STATS2
+	atomic_t in_send; /* requests trying to send */
+	atomic_t num_waiters;   /* blocked waiting to get in sendrecv */
+#endif
+#ifdef CONFIG_CIFS_SMB2
+	unsigned int	max_read;
+	unsigned int	max_write;
+#endif /* CONFIG_CIFS_SMB2 */
+};
+
+static inline unsigned int
+in_flight(struct TCP_Server_Info *server)
+{
+	unsigned int num;
+	spin_lock(&server->req_lock);
+	num = server->in_flight;
+	spin_unlock(&server->req_lock);
+	return num;
+}
+
+static inline bool
+has_credits(struct TCP_Server_Info *server, int *credits)
+{
+	int num;
+	spin_lock(&server->req_lock);
+	num = *credits;
+	spin_unlock(&server->req_lock);
+	return num > 0;
+}
+
+static inline void
+add_credits(struct TCP_Server_Info *server, const unsigned int add,
+	    const int optype)
+{
+	server->ops->add_credits(server, add, optype);
+}
+
+static inline void
+add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
+			const int optype)
+{
+	if (add) {
+		server->ops->add_credits(server, add, optype);
+		wake_up(&server->request_q);
+	}
+}
+
+static inline void
+set_credits(struct TCP_Server_Info *server, const int val)
+{
+	server->ops->set_credits(server, val);
+}
+
+static inline __le64
+get_next_mid64(struct TCP_Server_Info *server)
+{
+	return cpu_to_le64(server->ops->get_next_mid(server));
+}
+
+static inline __le16
+get_next_mid(struct TCP_Server_Info *server)
+{
+	__u16 mid = server->ops->get_next_mid(server);
+	/*
+	 * The value in the SMB header should be little endian for easy
+	 * on-the-wire decoding.
+	 */
+	return cpu_to_le16(mid);
+}
+
+static inline __u16
+get_mid(const struct smb_hdr *smb)
+{
+	return le16_to_cpu(smb->Mid);
+}
+
+static inline bool
+compare_mid(__u16 mid, const struct smb_hdr *smb)
+{
+	return mid == le16_to_cpu(smb->Mid);
+}
+
+/*
+ * When the server supports very large reads and writes via POSIX extensions,
+ * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
+ * including the RFC1001 length.
+ *
+ * Note that this might make for "interesting" allocation problems during
+ * writeback however as we have to allocate an array of pointers for the
+ * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ *
+ * For reads, there is a similar problem as we need to allocate an array
+ * of kvecs to handle the receive, though that should only need to be done
+ * once.
+ */
+#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
+
+/*
+ * When the server doesn't allow large posix writes, only allow a rsize/wsize
+ * of 2^17-1 minus the size of the call header. That allows for a read or
+ * write up to the maximum size described by RFC1002.
+ */
+#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
+
+/*
+ * The default wsize is 1M. find_get_pages seems to return a maximum of 256
+ * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
+ * a single wsize request with a single call.
+ */
+#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+
+/*
+ * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
+ * those values when posix extensions aren't in force. In actuality here, we
+ * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
+ * to be ok with the extra byte even though Windows doesn't send writes that
+ * are that large.
+ *
+ * Citation:
+ *
+ * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
+ */
+#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
+#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
+
+/*
+ * Macros to allow the TCP_Server_Info->net field and related code to drop out
+ * when CONFIG_NET_NS isn't set.
+ */
+
+#ifdef CONFIG_NET_NS
+
+static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
+{
+	return srv->net;
+}
+
+static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
+{
+	srv->net = net;
+}
+
+#else
+
+static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
+{
+	return &init_net;
+}
+
+static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
+{
+}
+
+#endif
+
+/*
+ * Session structure.  One of these for each uid session with a particular host
+ */
+struct cifs_ses {
+	struct list_head smb_ses_list;
+	struct list_head tcon_list;
+	struct mutex session_mutex;
+	struct TCP_Server_Info *server;	/* pointer to server info */
+	int ses_count;		/* reference counter */
+	enum statusEnum status;
+	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
+	__u16 ipc_tid;		/* special tid for connection to IPC share */
+	char *serverOS;		/* name of operating system underlying server */
+	char *serverNOS;	/* name of network operating system of server */
+	char *serverDomain;	/* security realm of server */
+	__u64 Suid;		/* remote smb uid  */
+	kuid_t linux_uid;	/* overriding owner of files on the mount */
+	kuid_t cred_uid;	/* owner of credentials */
+	unsigned int capabilities;
+	char serverName[SERVER_NAME_LEN_WITH_NULL * 2];	/* BB make bigger for
+				TCP names - will ipv6 and sctp addresses fit? */
+	char *user_name;	/* must not be null except during init of sess
+				   and after mount option parsing we fill it */
+	char *domainName;
+	char *password;
+	struct session_key auth_key;
+	struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
+	enum securityEnum sectype; /* what security flavor was specified? */
+	bool sign;		/* is signing required? */
+	bool need_reconnect:1; /* connection reset, uid now invalid */
+#ifdef CONFIG_CIFS_SMB2
+	__u16 session_flags;
+	char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
+#endif /* CONFIG_CIFS_SMB2 */
+};
+
+static inline bool
+cap_unix(struct cifs_ses *ses)
+{
+	return ses->server->vals->cap_unix & ses->capabilities;
+}
+
+/*
+ * there is one of these for each connection to a resource on a particular
+ * session
+ */
+struct cifs_tcon {
+	struct list_head tcon_list;
+	int tc_count;
+	struct list_head openFileList;
+	struct cifs_ses *ses;	/* pointer to session associated with */
+	char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
+	char *nativeFileSystem;
+	char *password;		/* for share-level security */
+	__u32 tid;		/* The 4 byte tree id */
+	__u16 Flags;		/* optional support bits */
+	enum statusEnum tidStatus;
+#ifdef CONFIG_CIFS_STATS
+	atomic_t num_smbs_sent;
+	union {
+		struct {
+			atomic_t num_writes;
+			atomic_t num_reads;
+			atomic_t num_flushes;
+			atomic_t num_oplock_brks;
+			atomic_t num_opens;
+			atomic_t num_closes;
+			atomic_t num_deletes;
+			atomic_t num_mkdirs;
+			atomic_t num_posixopens;
+			atomic_t num_posixmkdirs;
+			atomic_t num_rmdirs;
+			atomic_t num_renames;
+			atomic_t num_t2renames;
+			atomic_t num_ffirst;
+			atomic_t num_fnext;
+			atomic_t num_fclose;
+			atomic_t num_hardlinks;
+			atomic_t num_symlinks;
+			atomic_t num_locks;
+			atomic_t num_acl_get;
+			atomic_t num_acl_set;
+		} cifs_stats;
+#ifdef CONFIG_CIFS_SMB2
+		struct {
+			atomic_t smb2_com_sent[NUMBER_OF_SMB2_COMMANDS];
+			atomic_t smb2_com_failed[NUMBER_OF_SMB2_COMMANDS];
+		} smb2_stats;
+#endif /* CONFIG_CIFS_SMB2 */
+	} stats;
+#ifdef CONFIG_CIFS_STATS2
+	unsigned long long time_writes;
+	unsigned long long time_reads;
+	unsigned long long time_opens;
+	unsigned long long time_deletes;
+	unsigned long long time_closes;
+	unsigned long long time_mkdirs;
+	unsigned long long time_rmdirs;
+	unsigned long long time_renames;
+	unsigned long long time_t2renames;
+	unsigned long long time_ffirst;
+	unsigned long long time_fnext;
+	unsigned long long time_fclose;
+#endif /* CONFIG_CIFS_STATS2 */
+	__u64    bytes_read;
+	__u64    bytes_written;
+	spinlock_t stat_lock;
+#endif /* CONFIG_CIFS_STATS */
+	FILE_SYSTEM_DEVICE_INFO fsDevInfo;
+	FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
+	FILE_SYSTEM_UNIX_INFO fsUnixInfo;
+	bool ipc:1;		/* set if connection to IPC$ eg for RPC/PIPES */
+	bool retry:1;
+	bool nocase:1;
+	bool seal:1;      /* transport encryption for this mounted share */
+	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
+				for this mount even if server would support */
+	bool local_lease:1; /* check leases (only) on local system not remote */
+	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
+	bool broken_sparse_sup; /* if server or share does not support sparse */
+	bool need_reconnect:1; /* connection reset, tid now invalid */
+#ifdef CONFIG_CIFS_SMB2
+	bool print:1;		/* set if connection to printer share */
+	bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
+	__le32 capabilities;
+	__u32 share_flags;
+	__u32 maximal_access;
+	__u32 vol_serial_number;
+	__le64 vol_create_time;
+	__u32 ss_flags;		/* sector size flags */
+	__u32 perf_sector_size; /* best sector size for perf */
+	__u32 max_chunks;
+	__u32 max_bytes_chunk;
+	__u32 max_bytes_copy;
+#endif /* CONFIG_CIFS_SMB2 */
+#ifdef CONFIG_CIFS_FSCACHE
+	u64 resource_id;		/* server resource id */
+	struct fscache_cookie *fscache;	/* cookie for share */
+#endif
+	struct list_head pending_opens;	/* list of incomplete opens */
+	/* BB add field for back pointer to sb struct(s)? */
+};
+
+/*
+ * This is a refcounted and timestamped container for a tcon pointer. The
+ * container holds a tcon reference. It is considered safe to free one of
+ * these when the tl_count goes to 0. The tl_time is the time of the last
+ * "get" on the container.
+ */
+struct tcon_link {
+	struct rb_node		tl_rbnode;
+	kuid_t			tl_uid;
+	unsigned long		tl_flags;
+#define TCON_LINK_MASTER	0
+#define TCON_LINK_PENDING	1
+#define TCON_LINK_IN_TREE	2
+	unsigned long		tl_time;
+	atomic_t		tl_count;
+	struct cifs_tcon	*tl_tcon;
+};
+
+extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
+
+static inline struct cifs_tcon *
+tlink_tcon(struct tcon_link *tlink)
+{
+	return tlink->tl_tcon;
+}
+
+extern void cifs_put_tlink(struct tcon_link *tlink);
+
+static inline struct tcon_link *
+cifs_get_tlink(struct tcon_link *tlink)
+{
+	if (tlink && !IS_ERR(tlink))
+		atomic_inc(&tlink->tl_count);
+	return tlink;
+}
+
+/* This function is always expected to succeed */
+extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
+
+#define CIFS_OPLOCK_NO_CHANGE 0xfe
+
+struct cifs_pending_open {
+	struct list_head olist;
+	struct tcon_link *tlink;
+	__u8 lease_key[16];
+	__u32 oplock;
+};
+
+/*
+ * This info hangs off the cifsFileInfo structure, pointed to by llist.
+ * This is used to track byte stream locks on the file
+ */
+struct cifsLockInfo {
+	struct list_head llist;	/* pointer to next cifsLockInfo */
+	struct list_head blist; /* pointer to locks blocked on this */
+	wait_queue_head_t block_q;
+	__u64 offset;
+	__u64 length;
+	__u32 pid;
+	__u32 type;
+};
+
+/*
+ * One of these for each open instance of a file
+ */
+struct cifs_search_info {
+	loff_t index_of_last_entry;
+	__u16 entries_in_buffer;
+	__u16 info_level;
+	__u32 resume_key;
+	char *ntwrk_buf_start;
+	char *srch_entries_start;
+	char *last_entry;
+	const char *presume_name;
+	unsigned int resume_name_len;
+	bool endOfSearch:1;
+	bool emptyDir:1;
+	bool unicode:1;
+	bool smallBuf:1; /* so we know which buf_release function to call */
+};
+
+struct cifs_open_parms {
+	struct cifs_tcon *tcon;
+	struct cifs_sb_info *cifs_sb;
+	int disposition;
+	int desired_access;
+	int create_options;
+	const char *path;
+	struct cifs_fid *fid;
+	bool reconnect:1;
+};
+
+struct cifs_fid {
+	__u16 netfid;
+#ifdef CONFIG_CIFS_SMB2
+	__u64 persistent_fid;	/* persist file id for smb2 */
+	__u64 volatile_fid;	/* volatile file id for smb2 */
+	__u8 lease_key[SMB2_LEASE_KEY_SIZE];	/* lease key for smb2 */
+#endif
+	struct cifs_pending_open *pending_open;
+	unsigned int epoch;
+	bool purge_cache;
+};
+
+struct cifs_fid_locks {
+	struct list_head llist;
+	struct cifsFileInfo *cfile;	/* fid that owns locks */
+	struct list_head locks;		/* locks held by fid above */
+};
+
+struct cifsFileInfo {
+	struct list_head tlist;	/* pointer to next fid owned by tcon */
+	struct list_head flist;	/* next fid (file instance) for this inode */
+	struct cifs_fid_locks *llist;	/* brlocks held by this fid */
+	kuid_t uid;		/* allows finding which FileInfo structure */
+	__u32 pid;		/* process id who opened file */
+	struct cifs_fid fid;	/* file id from remote */
+	/* BB add lock scope info here if needed */ ;
+	/* lock scope id (0 if none) */
+	struct dentry *dentry;
+	unsigned int f_flags;
+	struct tcon_link *tlink;
+	bool invalidHandle:1;	/* file closed via session abend */
+	bool oplock_break_cancelled:1;
+	int count;		/* refcount protected by cifs_file_list_lock */
+	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
+	struct cifs_search_info srch_inf;
+	struct work_struct oplock_break; /* work for oplock breaks */
+};
+
+struct cifs_io_parms {
+	__u16 netfid;
+#ifdef CONFIG_CIFS_SMB2
+	__u64 persistent_fid;	/* persist file id for smb2 */
+	__u64 volatile_fid;	/* volatile file id for smb2 */
+#endif
+	__u32 pid;
+	__u64 offset;
+	unsigned int length;
+	struct cifs_tcon *tcon;
+};
+
+struct cifs_readdata;
+
+/* asynchronous read support */
+struct cifs_readdata {
+	struct kref			refcount;
+	struct list_head		list;
+	struct completion		done;
+	struct cifsFileInfo		*cfile;
+	struct address_space		*mapping;
+	__u64				offset;
+	unsigned int			bytes;
+	unsigned int			got_bytes;
+	pid_t				pid;
+	int				result;
+	struct work_struct		work;
+	int (*read_into_pages)(struct TCP_Server_Info *server,
+				struct cifs_readdata *rdata,
+				unsigned int len);
+	struct kvec			iov;
+	unsigned int			pagesz;
+	unsigned int			tailsz;
+	unsigned int			credits;
+	unsigned int			nr_pages;
+	struct page			*pages[];
+};
+
+struct cifs_writedata;
+
+/* asynchronous write support */
+struct cifs_writedata {
+	struct kref			refcount;
+	struct list_head		list;
+	struct completion		done;
+	enum writeback_sync_modes	sync_mode;
+	struct work_struct		work;
+	struct cifsFileInfo		*cfile;
+	__u64				offset;
+	pid_t				pid;
+	unsigned int			bytes;
+	int				result;
+	unsigned int			pagesz;
+	unsigned int			tailsz;
+	unsigned int			credits;
+	unsigned int			nr_pages;
+	struct page			*pages[];
+};
+
+/*
+ * Take a reference on the file private data. Must be called with
+ * cifs_file_list_lock held.
+ */
+static inline void
+cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
+{
+	++cifs_file->count;
+}
+
+struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
+void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
+
+#define CIFS_CACHE_READ_FLG	1
+#define CIFS_CACHE_HANDLE_FLG	2
+#define CIFS_CACHE_RH_FLG	(CIFS_CACHE_READ_FLG | CIFS_CACHE_HANDLE_FLG)
+#define CIFS_CACHE_WRITE_FLG	4
+#define CIFS_CACHE_RW_FLG	(CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
+#define CIFS_CACHE_RHW_FLG	(CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)
+
+#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG)
+#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG)
+#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG)
+
+/*
+ * One of these for each file inode
+ */
+
+struct cifsInodeInfo {
+	bool can_cache_brlcks;
+	struct list_head llist;	/* locks helb by this inode */
+	struct rw_semaphore lock_sem;	/* protect the fields above */
+	/* BB add in lists for dirty pages i.e. write caching info for oplock */
+	struct list_head openFileList;
+	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
+	unsigned int oplock;		/* oplock/lease level we have */
+	unsigned int epoch;		/* used to track lease state changes */
+#define CIFS_INODE_PENDING_OPLOCK_BREAK   (0) /* oplock break in progress */
+#define CIFS_INODE_PENDING_WRITERS	  (1) /* Writes in progress */
+#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+#define CIFS_INO_DELETE_PENDING		  (3) /* delete pending on server */
+#define CIFS_INO_INVALID_MAPPING	  (4) /* pagecache is invalid */
+#define CIFS_INO_LOCK			  (5) /* lock bit for synchronization */
+	unsigned long flags;
+	spinlock_t writers_lock;
+	unsigned int writers;		/* Number of writers on this inode */
+	unsigned long time;		/* jiffies of last update of inode */
+	u64  server_eof;		/* current file size on server -- protected by i_lock */
+	u64  uniqueid;			/* server inode number */
+	u64  createtime;		/* creation time on server */
+#ifdef CONFIG_CIFS_SMB2
+	__u8 lease_key[SMB2_LEASE_KEY_SIZE];	/* lease key for this inode */
+#endif
+#ifdef CONFIG_CIFS_FSCACHE
+	struct fscache_cookie *fscache;
+#endif
+	struct inode vfs_inode;
+};
+
+static inline struct cifsInodeInfo *
+CIFS_I(struct inode *inode)
+{
+	return container_of(inode, struct cifsInodeInfo, vfs_inode);
+}
+
+static inline struct cifs_sb_info *
+CIFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct cifs_sb_info *
+CIFS_FILE_SB(struct file *file)
+{
+	return CIFS_SB(file_inode(file)->i_sb);
+}
+
+static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
+{
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
+		return '/';
+	else
+		return '\\';
+}
+
+static inline void
+convert_delimiter(char *path, char delim)
+{
+	char old_delim, *pos;
+
+	if (delim == '/')
+		old_delim = '\\';
+	else
+		old_delim = '/';
+
+	pos = path;
+	while ((pos = strchr(pos, old_delim)))
+		*pos = delim;
+}
+
+#ifdef CONFIG_CIFS_STATS
+#define cifs_stats_inc atomic_inc
+
+static inline void cifs_stats_bytes_written(struct cifs_tcon *tcon,
+					    unsigned int bytes)
+{
+	if (bytes) {
+		spin_lock(&tcon->stat_lock);
+		tcon->bytes_written += bytes;
+		spin_unlock(&tcon->stat_lock);
+	}
+}
+
+static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
+					 unsigned int bytes)
+{
+	spin_lock(&tcon->stat_lock);
+	tcon->bytes_read += bytes;
+	spin_unlock(&tcon->stat_lock);
+}
+#else
+
+#define  cifs_stats_inc(field) do {} while (0)
+#define  cifs_stats_bytes_written(tcon, bytes) do {} while (0)
+#define  cifs_stats_bytes_read(tcon, bytes) do {} while (0)
+
+#endif
+
+
+/*
+ * This is the prototype for the mid receive function. This function is for
+ * receiving the rest of the SMB frame, starting with the WordCount (which is
+ * just after the MID in struct smb_hdr). Note:
+ *
+ * - This will be called by cifsd, with no locks held.
+ * - The mid will still be on the pending_mid_q.
+ * - mid->resp_buf will point to the current buffer.
+ *
+ * Returns zero on a successful receive, or an error. The receive state in
+ * the TCP_Server_Info will also be updated.
+ */
+typedef int (mid_receive_t)(struct TCP_Server_Info *server,
+			    struct mid_q_entry *mid);
+
+/*
+ * This is the prototype for the mid callback function. This is called once the
+ * mid has been received off of the socket. When creating one, take special
+ * care to avoid deadlocks. Things to bear in mind:
+ *
+ * - it will be called by cifsd, with no locks held
+ * - the mid will be removed from any lists
+ */
+typedef void (mid_callback_t)(struct mid_q_entry *mid);
+
+/* one of these for every pending CIFS request to the server */
+struct mid_q_entry {
+	struct list_head qhead;	/* mids waiting on reply from this server */
+	struct TCP_Server_Info *server;	/* server corresponding to this mid */
+	__u64 mid;		/* multiplex id */
+	__u32 pid;		/* process id */
+	__u32 sequence_number;  /* for CIFS signing */
+	unsigned long when_alloc;  /* when mid was created */
+#ifdef CONFIG_CIFS_STATS2
+	unsigned long when_sent; /* time when smb send finished */
+	unsigned long when_received; /* when demux complete (taken off wire) */
+#endif
+	mid_receive_t *receive; /* call receive callback */
+	mid_callback_t *callback; /* call completion callback */
+	void *callback_data;	  /* general purpose pointer for callback */
+	void *resp_buf;		/* pointer to received SMB header */
+	int mid_state;	/* wish this were enum but can not pass to wait_event */
+	__le16 command;		/* smb command code */
+	bool large_buf:1;	/* if valid response, is pointer to large buf */
+	bool multiRsp:1;	/* multiple trans2 responses for one request  */
+	bool multiEnd:1;	/* both received */
+};
+
+/*	Make code in transport.c a little cleaner by moving
+	update of optional stats into function below */
+#ifdef CONFIG_CIFS_STATS2
+
+static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
+{
+	atomic_inc(&server->in_send);
+}
+
+static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
+{
+	atomic_dec(&server->in_send);
+}
+
+static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
+{
+	atomic_inc(&server->num_waiters);
+}
+
+static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
+{
+	atomic_dec(&server->num_waiters);
+}
+
+static inline void cifs_save_when_sent(struct mid_q_entry *mid)
+{
+	mid->when_sent = jiffies;
+}
+#else
+static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
+{
+}
+static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_save_when_sent(struct mid_q_entry *mid)
+{
+}
+#endif
+
+/* for pending dnotify requests */
+struct dir_notify_req {
+	struct list_head lhead;
+	__le16 Pid;
+	__le16 PidHigh;
+	__u16 Mid;
+	__u16 Tid;
+	__u16 Uid;
+	__u16 netfid;
+	__u32 filter; /* CompletionFilter (for multishot) */
+	int multishot;
+	struct file *pfile;
+};
+
+struct dfs_info3_param {
+	int flags; /* DFSREF_REFERRAL_SERVER, DFSREF_STORAGE_SERVER*/
+	int path_consumed;
+	int server_type;
+	int ref_flag;
+	char *path_name;
+	char *node_name;
+};
+
+/*
+ * common struct for holding inode info when searching for or updating an
+ * inode with new info
+ */
+
+#define CIFS_FATTR_DFS_REFERRAL		0x1
+#define CIFS_FATTR_DELETE_PENDING	0x2
+#define CIFS_FATTR_NEED_REVAL		0x4
+#define CIFS_FATTR_INO_COLLISION	0x8
+#define CIFS_FATTR_UNKNOWN_NLINK	0x10
+
+struct cifs_fattr {
+	u32		cf_flags;
+	u32		cf_cifsattrs;
+	u64		cf_uniqueid;
+	u64		cf_eof;
+	u64		cf_bytes;
+	u64		cf_createtime;
+	kuid_t		cf_uid;
+	kgid_t		cf_gid;
+	umode_t		cf_mode;
+	dev_t		cf_rdev;
+	unsigned int	cf_nlink;
+	unsigned int	cf_dtype;
+	struct timespec	cf_atime;
+	struct timespec	cf_mtime;
+	struct timespec	cf_ctime;
+};
+
+static inline void free_dfs_info_param(struct dfs_info3_param *param)
+{
+	if (param) {
+		kfree(param->path_name);
+		kfree(param->node_name);
+		kfree(param);
+	}
+}
+
+static inline void free_dfs_info_array(struct dfs_info3_param *param,
+				       int number_of_items)
+{
+	int i;
+	if ((number_of_items == 0) || (param == NULL))
+		return;
+	for (i = 0; i < number_of_items; i++) {
+		kfree(param[i].path_name);
+		kfree(param[i].node_name);
+	}
+	kfree(param);
+}
+
+#define   MID_FREE 0
+#define   MID_REQUEST_ALLOCATED 1
+#define   MID_REQUEST_SUBMITTED 2
+#define   MID_RESPONSE_RECEIVED 4
+#define   MID_RETRY_NEEDED      8 /* session closed while this request out */
+#define   MID_RESPONSE_MALFORMED 0x10
+#define   MID_SHUTDOWN		 0x20
+
+/* Types of response buffer returned from SendReceive2 */
+#define   CIFS_NO_BUFFER        0    /* Response buffer not returned */
+#define   CIFS_SMALL_BUFFER     1
+#define   CIFS_LARGE_BUFFER     2
+#define   CIFS_IOVEC            4    /* array of response buffers */
+
+/* Type of Request to SendReceive2 */
+#define   CIFS_BLOCKING_OP      1    /* operation can block */
+#define   CIFS_ASYNC_OP         2    /* do not wait for response */
+#define   CIFS_TIMEOUT_MASK 0x003    /* only one of above set in req */
+#define   CIFS_LOG_ERROR    0x010    /* log NT STATUS if non-zero */
+#define   CIFS_LARGE_BUF_OP 0x020    /* large request buffer */
+#define   CIFS_NO_RESP      0x040    /* no response buffer required */
+
+/* Type of request operation */
+#define   CIFS_ECHO_OP      0x080    /* echo request */
+#define   CIFS_OBREAK_OP   0x0100    /* oplock break request */
+#define   CIFS_NEG_OP      0x0200    /* negotiate request */
+#define   CIFS_OP_MASK     0x0380    /* mask request type */
+#define   CIFS_HAS_CREDITS 0x0400    /* already has credits */
+
+/* Security Flags: indicate type of session setup needed */
+#define   CIFSSEC_MAY_SIGN	0x00001
+#define   CIFSSEC_MAY_NTLM	0x00002
+#define   CIFSSEC_MAY_NTLMV2	0x00004
+#define   CIFSSEC_MAY_KRB5	0x00008
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define   CIFSSEC_MAY_LANMAN	0x00010
+#define   CIFSSEC_MAY_PLNTXT	0x00020
+#else
+#define   CIFSSEC_MAY_LANMAN    0
+#define   CIFSSEC_MAY_PLNTXT    0
+#endif /* weak passwords */
+#define   CIFSSEC_MAY_SEAL	0x00040 /* not supported yet */
+#define   CIFSSEC_MAY_NTLMSSP	0x00080 /* raw ntlmssp with ntlmv2 */
+
+#define   CIFSSEC_MUST_SIGN	0x01001
+/* note that only one of the following can be set so the
+result of setting MUST flags more than once will be to
+require use of the stronger protocol */
+#define   CIFSSEC_MUST_NTLM	0x02002
+#define   CIFSSEC_MUST_NTLMV2	0x04004
+#define   CIFSSEC_MUST_KRB5	0x08008
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define   CIFSSEC_MUST_LANMAN	0x10010
+#define   CIFSSEC_MUST_PLNTXT	0x20020
+#ifdef CONFIG_CIFS_UPCALL
+#define   CIFSSEC_MASK          0xBF0BF /* allows weak security but also krb5 */
+#else
+#define   CIFSSEC_MASK          0xB70B7 /* current flags supported if weak */
+#endif /* UPCALL */
+#else /* do not allow weak pw hash */
+#define   CIFSSEC_MUST_LANMAN	0
+#define   CIFSSEC_MUST_PLNTXT	0
+#ifdef CONFIG_CIFS_UPCALL
+#define   CIFSSEC_MASK          0x8F08F /* flags supported if no weak allowed */
+#else
+#define	  CIFSSEC_MASK          0x87087 /* flags supported if no weak allowed */
+#endif /* UPCALL */
+#endif /* WEAK_PW_HASH */
+#define   CIFSSEC_MUST_SEAL	0x40040 /* not supported yet */
+#define   CIFSSEC_MUST_NTLMSSP	0x80080 /* raw ntlmssp with ntlmv2 */
+
+#define   CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
+#define   CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
+#define   CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
+/*
+ *****************************************************************
+ * All constants go here
+ *****************************************************************
+ */
+
+#define UID_HASH (16)
+
+/*
+ * Note that ONE module should define _DECLARE_GLOBALS_HERE to cause the
+ * following to be declared.
+ */
+
+/****************************************************************************
+ *  Locking notes.  All updates to global variables and lists should be
+ *                  protected by spinlocks or semaphores.
+ *
+ *  Spinlocks
+ *  ---------
+ *  GlobalMid_Lock protects:
+ *	list operations on pending_mid_q and oplockQ
+ *      updates to XID counters, multiplex id  and SMB sequence numbers
+ *  cifs_file_list_lock protects:
+ *	list operations on tcp and SMB session lists and tCon lists
+ *  f_owner.lock protects certain per file struct operations
+ *  mapping->page_lock protects certain per page operations
+ *
+ *  Semaphores
+ *  ----------
+ *  sesSem     operations on smb session
+ *  tconSem    operations on tree connection
+ *  fh_sem      file handle reconnection operations
+ *
+ ****************************************************************************/
+
+#ifdef DECLARE_GLOBALS_HERE
+#define GLOBAL_EXTERN
+#else
+#define GLOBAL_EXTERN extern
+#endif
+
+/*
+ * the list of TCP_Server_Info structures, ie each of the sockets
+ * connecting our client to a distinct server (ip address), is
+ * chained together by cifs_tcp_ses_list. The list of all our SMB
+ * sessions (and from that the tree connections) can be found
+ * by iterating over cifs_tcp_ses_list
+ */
+GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
+
+/*
+ * This lock protects the cifs_tcp_ses_list, the list of smb sessions per
+ * tcp session, and the list of tcon's per smb session. It also protects
+ * the reference counters for the server, smb session, and tcon. Finally,
+ * changes to the tcon->tidStatus should be done while holding this lock.
+ */
+GLOBAL_EXTERN spinlock_t		cifs_tcp_ses_lock;
+
+/*
+ * This lock protects the cifs_file->llist and cifs_file->flist
+ * list operations, and updates to some flags (cifs_file->invalidHandle)
+ * It will be moved to either use the tcon->stat_lock or equivalent later.
+ * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
+ * the cifs_tcp_ses_lock must be grabbed first and released last.
+ */
+GLOBAL_EXTERN spinlock_t	cifs_file_list_lock;
+
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
+/* Outstanding dir notify requests */
+GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
+/* DirNotify response queue */
+GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
+
+/*
+ * Global transaction id (XID) information
+ */
+GLOBAL_EXTERN unsigned int GlobalCurrentXid;	/* protected by GlobalMid_Sem */
+GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
+GLOBAL_EXTERN unsigned int GlobalMaxActiveXid;	/* prot by GlobalMid_Sem */
+GLOBAL_EXTERN spinlock_t GlobalMid_Lock;  /* protects above & list operations */
+					  /* on midQ entries */
+/*
+ *  Global counters, updated atomically
+ */
+GLOBAL_EXTERN atomic_t sesInfoAllocCount;
+GLOBAL_EXTERN atomic_t tconInfoAllocCount;
+GLOBAL_EXTERN atomic_t tcpSesAllocCount;
+GLOBAL_EXTERN atomic_t tcpSesReconnectCount;
+GLOBAL_EXTERN atomic_t tconInfoReconnectCount;
+
+/* Various Debug counters */
+GLOBAL_EXTERN atomic_t bufAllocCount;    /* current number allocated  */
+#ifdef CONFIG_CIFS_STATS2
+GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
+GLOBAL_EXTERN atomic_t totSmBufAllocCount;
+#endif
+GLOBAL_EXTERN atomic_t smBufAllocCount;
+GLOBAL_EXTERN atomic_t midCount;
+
+/* Misc globals */
+GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
+GLOBAL_EXTERN unsigned int lookupCacheEnabled;
+GLOBAL_EXTERN unsigned int global_secflags;	/* if on, session setup sent
+				with more secure ntlmssp2 challenge/resp */
+GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;  /* enable smb packet signing */
+GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
+GLOBAL_EXTERN unsigned int CIFSMaxBufSize;  /* max size not including hdr */
+GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
+GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
+GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+
+#ifdef CONFIG_CIFS_ACL
+GLOBAL_EXTERN struct rb_root uidtree;
+GLOBAL_EXTERN struct rb_root gidtree;
+GLOBAL_EXTERN spinlock_t siduidlock;
+GLOBAL_EXTERN spinlock_t sidgidlock;
+GLOBAL_EXTERN struct rb_root siduidtree;
+GLOBAL_EXTERN struct rb_root sidgidtree;
+GLOBAL_EXTERN spinlock_t uidsidlock;
+GLOBAL_EXTERN spinlock_t gidsidlock;
+#endif /* CONFIG_CIFS_ACL */
+
+void cifs_oplock_break(struct work_struct *work);
+
+extern const struct slow_work_ops cifs_oplock_break_ops;
+extern struct workqueue_struct *cifsiod_wq;
+
+extern mempool_t *cifs_mid_poolp;
+
+/* Operations for different SMB versions */
+#define SMB1_VERSION_STRING	"1.0"
+extern struct smb_version_operations smb1_operations;
+extern struct smb_version_values smb1_values;
+#define SMB20_VERSION_STRING	"2.0"
+extern struct smb_version_operations smb20_operations;
+extern struct smb_version_values smb20_values;
+#define SMB21_VERSION_STRING	"2.1"
+extern struct smb_version_operations smb21_operations;
+extern struct smb_version_values smb21_values;
+#define SMB30_VERSION_STRING	"3.0"
+extern struct smb_version_operations smb30_operations;
+extern struct smb_version_values smb30_values;
+#define SMB302_VERSION_STRING	"3.02"
+/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
+extern struct smb_version_values smb302_values;
+#endif	/* _CIFS_GLOB_H */
diff --git a/kernel/fs/cifs/cifspdu.h b/kernel/fs/cifs/cifspdu.h
new file mode 100644
index 000000000..5f9822ac0
--- /dev/null
+++ b/kernel/fs/cifs/cifspdu.h
@@ -0,0 +1,2739 @@
+/*
+ *   fs/cifs/cifspdu.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2009
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _CIFSPDU_H
+#define _CIFSPDU_H
+
+#include <net/sock.h>
+#include <asm/unaligned.h>
+#include "smbfsctl.h"
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define LANMAN_PROT 0
+#define LANMAN2_PROT 1
+#define CIFS_PROT   2
+#else
+#define CIFS_PROT   0
+#endif
+#define POSIX_PROT  (CIFS_PROT+1)
+#define BAD_PROT 0xFFFF
+
+/* SMB command codes:
+ * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+ * (ie which include no useful data other than the SMB error code itself).
+ * This can allow us to avoid response buffer allocations and copy in some cases
+ */
+#define SMB_COM_CREATE_DIRECTORY      0x00 /* trivial response */
+#define SMB_COM_DELETE_DIRECTORY      0x01 /* trivial response */
+#define SMB_COM_CLOSE                 0x04 /* triv req/rsp, timestamp ignored */
+#define SMB_COM_FLUSH                 0x05 /* triv req/rsp */
+#define SMB_COM_DELETE                0x06 /* trivial response */
+#define SMB_COM_RENAME                0x07 /* trivial response */
+#define SMB_COM_QUERY_INFORMATION     0x08 /* aka getattr */
+#define SMB_COM_SETATTR               0x09 /* trivial response */
+#define SMB_COM_LOCKING_ANDX          0x24 /* trivial response */
+#define SMB_COM_COPY                  0x29 /* trivial rsp, fail filename ignrd*/
+#define SMB_COM_ECHO                  0x2B /* echo request */
+#define SMB_COM_OPEN_ANDX             0x2D /* Legacy open for old servers */
+#define SMB_COM_READ_ANDX             0x2E
+#define SMB_COM_WRITE_ANDX            0x2F
+#define SMB_COM_TRANSACTION2          0x32
+#define SMB_COM_TRANSACTION2_SECONDARY 0x33
+#define SMB_COM_FIND_CLOSE2           0x34 /* trivial response */
+#define SMB_COM_TREE_DISCONNECT       0x71 /* trivial response */
+#define SMB_COM_NEGOTIATE             0x72
+#define SMB_COM_SESSION_SETUP_ANDX    0x73
+#define SMB_COM_LOGOFF_ANDX           0x74 /* trivial response */
+#define SMB_COM_TREE_CONNECT_ANDX     0x75
+#define SMB_COM_NT_TRANSACT           0xA0
+#define SMB_COM_NT_TRANSACT_SECONDARY 0xA1
+#define SMB_COM_NT_CREATE_ANDX        0xA2
+#define SMB_COM_NT_CANCEL             0xA4 /* no response */
+#define SMB_COM_NT_RENAME             0xA5 /* trivial response */
+
+/* Transact2 subcommand codes */
+#define TRANS2_OPEN                   0x00
+#define TRANS2_FIND_FIRST             0x01
+#define TRANS2_FIND_NEXT              0x02
+#define TRANS2_QUERY_FS_INFORMATION   0x03
+#define TRANS2_SET_FS_INFORMATION     0x04
+#define TRANS2_QUERY_PATH_INFORMATION 0x05
+#define TRANS2_SET_PATH_INFORMATION   0x06
+#define TRANS2_QUERY_FILE_INFORMATION 0x07
+#define TRANS2_SET_FILE_INFORMATION   0x08
+#define TRANS2_GET_DFS_REFERRAL       0x10
+#define TRANS2_REPORT_DFS_INCOSISTENCY 0x11
+
+/* SMB Transact (Named Pipe) subcommand codes */
+#define TRANS_SET_NMPIPE_STATE      0x0001
+#define TRANS_RAW_READ_NMPIPE       0x0011
+#define TRANS_QUERY_NMPIPE_STATE    0x0021
+#define TRANS_QUERY_NMPIPE_INFO     0x0022
+#define TRANS_PEEK_NMPIPE           0x0023
+#define TRANS_TRANSACT_NMPIPE       0x0026
+#define TRANS_RAW_WRITE_NMPIPE      0x0031
+#define TRANS_READ_NMPIPE           0x0036
+#define TRANS_WRITE_NMPIPE          0x0037
+#define TRANS_WAIT_NMPIPE           0x0053
+#define TRANS_CALL_NMPIPE           0x0054
+
+/* NT Transact subcommand codes */
+#define NT_TRANSACT_CREATE            0x01
+#define NT_TRANSACT_IOCTL             0x02
+#define NT_TRANSACT_SET_SECURITY_DESC 0x03
+#define NT_TRANSACT_NOTIFY_CHANGE     0x04
+#define NT_TRANSACT_RENAME            0x05
+#define NT_TRANSACT_QUERY_SECURITY_DESC 0x06
+#define NT_TRANSACT_GET_USER_QUOTA    0x07
+#define NT_TRANSACT_SET_USER_QUOTA    0x08
+
+#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */
+/* future chained NTCreateXReadX bigger, but for time being NTCreateX biggest */
+/* among the requests (NTCreateX response is bigger with wct of 34) */
+#define MAX_CIFS_HDR_SIZE 0x58 /* 4 len + 32 hdr + (2*24 wct) + 2 bct + 2 pad */
+#define CIFS_SMALL_PATH 120 /* allows for (448-88)/3 */
+
+/* internal cifs vfs structures */
+/*****************************************************************
+ * All constants go here
+ *****************************************************************
+ */
+
+/*
+ * Starting value for maximum SMB size negotiation
+ */
+#define CIFS_MAX_MSGSIZE (4*4096)
+
+/*
+ * Size of encrypted user password in bytes
+ */
+#define CIFS_ENCPWD_SIZE (16)
+
+/*
+ * Size of the crypto key returned on the negotiate SMB in bytes
+ */
+#define CIFS_CRYPTO_KEY_SIZE (8)
+
+/*
+ * Size of the ntlm client response
+ */
+#define CIFS_AUTH_RESP_SIZE (24)
+
+/*
+ * Size of the session key (crypto key encrypted with the password
+ */
+#define CIFS_SESS_KEY_SIZE (16)
+
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE (16)
+
+#define CIFS_CLIENT_CHALLENGE_SIZE (8)
+#define CIFS_SERVER_CHALLENGE_SIZE (8)
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
+
+/*
+ * Maximum user name length
+ */
+#define CIFS_UNLEN (20)
+
+/*
+ * Flags on SMB open
+ */
+#define SMBOPEN_WRITE_THROUGH 0x4000
+#define SMBOPEN_DENY_ALL      0x0010
+#define SMBOPEN_DENY_WRITE    0x0020
+#define SMBOPEN_DENY_READ     0x0030
+#define SMBOPEN_DENY_NONE     0x0040
+#define SMBOPEN_READ          0x0000
+#define SMBOPEN_WRITE         0x0001
+#define SMBOPEN_READWRITE     0x0002
+#define SMBOPEN_EXECUTE       0x0003
+
+#define SMBOPEN_OCREATE       0x0010
+#define SMBOPEN_OTRUNC        0x0002
+#define SMBOPEN_OAPPEND       0x0001
+
+/*
+ * SMB flag definitions
+ */
+#define SMBFLG_EXTD_LOCK 0x01	/* server supports lock-read write-unlock smb */
+#define SMBFLG_RCV_POSTED 0x02	/* obsolete */
+#define SMBFLG_RSVD 0x04
+#define SMBFLG_CASELESS 0x08	/* all pathnames treated as caseless (off
+				implies case sensitive file handling request) */
+#define SMBFLG_CANONICAL_PATH_FORMAT 0x10	/* obsolete */
+#define SMBFLG_OLD_OPLOCK 0x20	/* obsolete */
+#define SMBFLG_OLD_OPLOCK_NOTIFY 0x40	/* obsolete */
+#define SMBFLG_RESPONSE 0x80	/* this PDU is a response from server */
+
+/*
+ * SMB flag2 definitions
+ */
+#define SMBFLG2_KNOWS_LONG_NAMES cpu_to_le16(1)	/* can send long (non-8.3)
+						   path names in response */
+#define SMBFLG2_KNOWS_EAS cpu_to_le16(2)
+#define SMBFLG2_SECURITY_SIGNATURE cpu_to_le16(4)
+#define SMBFLG2_COMPRESSED (8)
+#define SMBFLG2_SECURITY_SIGNATURE_REQUIRED (0x10)
+#define SMBFLG2_IS_LONG_NAME cpu_to_le16(0x40)
+#define SMBFLG2_REPARSE_PATH (0x400)
+#define SMBFLG2_EXT_SEC cpu_to_le16(0x800)
+#define SMBFLG2_DFS cpu_to_le16(0x1000)
+#define SMBFLG2_PAGING_IO cpu_to_le16(0x2000)
+#define SMBFLG2_ERR_STATUS cpu_to_le16(0x4000)
+#define SMBFLG2_UNICODE cpu_to_le16(0x8000)
+
+/*
+ * These are the file access permission bits defined in CIFS for the
+ * NTCreateAndX as well as the level 0x107
+ * TRANS2_QUERY_PATH_INFORMATION API.  The level 0x107, SMB_QUERY_FILE_ALL_INFO
+ * responds with the AccessFlags.
+ * The AccessFlags specifies the access permissions a caller has to the
+ * file and can have any suitable combination of the following values:
+ */
+
+#define FILE_READ_DATA        0x00000001  /* Data can be read from the file   */
+#define FILE_WRITE_DATA       0x00000002  /* Data can be written to the file  */
+#define FILE_APPEND_DATA      0x00000004  /* Data can be appended to the file */
+#define FILE_READ_EA          0x00000008  /* Extended attributes associated   */
+					  /* with the file can be read        */
+#define FILE_WRITE_EA         0x00000010  /* Extended attributes associated   */
+					  /* with the file can be written     */
+#define FILE_EXECUTE          0x00000020  /*Data can be read into memory from */
+					  /* the file using system paging I/O */
+#define FILE_DELETE_CHILD     0x00000040
+#define FILE_READ_ATTRIBUTES  0x00000080  /* Attributes associated with the   */
+					  /* file can be read                 */
+#define FILE_WRITE_ATTRIBUTES 0x00000100  /* Attributes associated with the   */
+					  /* file can be written              */
+#define DELETE                0x00010000  /* The file can be deleted          */
+#define READ_CONTROL          0x00020000  /* The access control list and      */
+					  /* ownership associated with the    */
+					  /* file can be read                 */
+#define WRITE_DAC             0x00040000  /* The access control list and      */
+					  /* ownership associated with the    */
+					  /* file can be written.             */
+#define WRITE_OWNER           0x00080000  /* Ownership information associated */
+					  /* with the file can be written     */
+#define SYNCHRONIZE           0x00100000  /* The file handle can waited on to */
+					  /* synchronize with the completion  */
+					  /* of an input/output request       */
+#define GENERIC_ALL           0x10000000
+#define GENERIC_EXECUTE       0x20000000
+#define GENERIC_WRITE         0x40000000
+#define GENERIC_READ          0x80000000
+					 /* In summary - Relevant file       */
+					 /* access flags from CIFS are       */
+					 /* file_read_data, file_write_data  */
+					 /* file_execute, file_read_attributes*/
+					 /* write_dac, and delete.           */
+
+#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES)
+#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+				| FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
+#define FILE_EXEC_RIGHTS (FILE_EXECUTE)
+
+#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_WRITE_EA \
+				| FILE_READ_ATTRIBUTES \
+				| FILE_WRITE_ATTRIBUTES \
+				| DELETE | READ_CONTROL | WRITE_DAC \
+				| WRITE_OWNER | SYNCHRONIZE)
+#define SET_FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+				| FILE_READ_EA | FILE_WRITE_EA \
+				| FILE_DELETE_CHILD | FILE_READ_ATTRIBUTES \
+				| FILE_WRITE_ATTRIBUTES \
+				| DELETE | READ_CONTROL | WRITE_DAC \
+				| WRITE_OWNER | SYNCHRONIZE)
+#define SET_FILE_EXEC_RIGHTS (FILE_READ_EA | FILE_WRITE_EA | FILE_EXECUTE \
+				| FILE_READ_ATTRIBUTES \
+				| FILE_WRITE_ATTRIBUTES \
+				| DELETE | READ_CONTROL | WRITE_DAC \
+				| WRITE_OWNER | SYNCHRONIZE)
+
+#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \
+				| READ_CONTROL | SYNCHRONIZE)
+
+
+/*
+ * Invalid readdir handle
+ */
+#define CIFS_NO_HANDLE        0xFFFF
+
+#define NO_CHANGE_64          0xFFFFFFFFFFFFFFFFULL
+
+/* IPC$ in ASCII */
+#define CIFS_IPC_RESOURCE "\x49\x50\x43\x24"
+
+/* IPC$ in Unicode */
+#define CIFS_IPC_UNICODE_RESOURCE "\x00\x49\x00\x50\x00\x43\x00\x24\x00\x00"
+
+/* Unicode Null terminate 2 bytes of 0 */
+#define UNICODE_NULL "\x00\x00"
+#define ASCII_NULL 0x00
+
+/*
+ * Server type values (returned on EnumServer API
+ */
+#define CIFS_SV_TYPE_DC     0x00000008
+#define CIFS_SV_TYPE_BACKDC 0x00000010
+
+/*
+ * Alias type flags (From EnumAlias API call
+ */
+#define CIFS_ALIAS_TYPE_FILE 0x0001
+#define CIFS_SHARE_TYPE_FILE 0x0000
+
+/*
+ * File Attribute flags
+ */
+#define ATTR_READONLY  0x0001
+#define ATTR_HIDDEN    0x0002
+#define ATTR_SYSTEM    0x0004
+#define ATTR_VOLUME    0x0008
+#define ATTR_DIRECTORY 0x0010
+#define ATTR_ARCHIVE   0x0020
+#define ATTR_DEVICE    0x0040
+#define ATTR_NORMAL    0x0080
+#define ATTR_TEMPORARY 0x0100
+#define ATTR_SPARSE    0x0200
+#define ATTR_REPARSE   0x0400
+#define ATTR_COMPRESSED 0x0800
+#define ATTR_OFFLINE    0x1000	/* ie file not immediately available -
+					on offline storage */
+#define ATTR_NOT_CONTENT_INDEXED 0x2000
+#define ATTR_ENCRYPTED  0x4000
+#define ATTR_POSIX_SEMANTICS 0x01000000
+#define ATTR_BACKUP_SEMANTICS 0x02000000
+#define ATTR_DELETE_ON_CLOSE 0x04000000
+#define ATTR_SEQUENTIAL_SCAN 0x08000000
+#define ATTR_RANDOM_ACCESS   0x10000000
+#define ATTR_NO_BUFFERING    0x20000000
+#define ATTR_WRITE_THROUGH   0x80000000
+
+/* ShareAccess flags */
+#define FILE_NO_SHARE     0x00000000
+#define FILE_SHARE_READ   0x00000001
+#define FILE_SHARE_WRITE  0x00000002
+#define FILE_SHARE_DELETE 0x00000004
+#define FILE_SHARE_ALL    0x00000007
+
+/* CreateDisposition flags, similar to CreateAction as well */
+#define FILE_SUPERSEDE    0x00000000
+#define FILE_OPEN         0x00000001
+#define FILE_CREATE       0x00000002
+#define FILE_OPEN_IF      0x00000003
+#define FILE_OVERWRITE    0x00000004
+#define FILE_OVERWRITE_IF 0x00000005
+
+/* CreateOptions */
+#define CREATE_NOT_FILE		0x00000001	/* if set must not be file */
+#define CREATE_WRITE_THROUGH	0x00000002
+#define CREATE_SEQUENTIAL       0x00000004
+#define CREATE_NO_BUFFER        0x00000008      /* should not buffer on srv */
+#define CREATE_SYNC_ALERT       0x00000010	/* MBZ */
+#define CREATE_ASYNC_ALERT      0x00000020	/* MBZ */
+#define CREATE_NOT_DIR		0x00000040    /* if set must not be directory */
+#define CREATE_TREE_CONNECTION  0x00000080	/* should be zero */
+#define CREATE_COMPLETE_IF_OPLK 0x00000100	/* should be zero */
+#define CREATE_NO_EA_KNOWLEDGE  0x00000200
+#define CREATE_EIGHT_DOT_THREE  0x00000400	/* doc says this is obsolete
+						 "open for recovery" flag should
+						 be zero in any case */
+#define CREATE_OPEN_FOR_RECOVERY 0x00000400
+#define CREATE_RANDOM_ACCESS	0x00000800
+#define CREATE_DELETE_ON_CLOSE	0x00001000
+#define CREATE_OPEN_BY_ID       0x00002000
+#define CREATE_OPEN_BACKUP_INTENT 0x00004000
+#define CREATE_NO_COMPRESSION   0x00008000
+#define CREATE_RESERVE_OPFILTER 0x00100000	/* should be zero */
+#define OPEN_REPARSE_POINT	0x00200000
+#define OPEN_NO_RECALL          0x00400000
+#define OPEN_FREE_SPACE_QUERY   0x00800000	/* should be zero */
+#define CREATE_OPTIONS_MASK     0x007FFFFF
+#define CREATE_OPTION_READONLY	0x10000000
+#define CREATE_OPTION_SPECIAL   0x20000000   /* system. NB not sent over wire */
+
+/* ImpersonationLevel flags */
+#define SECURITY_ANONYMOUS      0
+#define SECURITY_IDENTIFICATION 1
+#define SECURITY_IMPERSONATION  2
+#define SECURITY_DELEGATION     3
+
+/* SecurityFlags */
+#define SECURITY_CONTEXT_TRACKING 0x01
+#define SECURITY_EFFECTIVE_ONLY   0x02
+
+/*
+ * Default PID value, used in all SMBs where the PID is not important
+ */
+#define CIFS_DFT_PID  0x1234
+
+/*
+ * We use the same routine for Copy and Move SMBs.  This flag is used to
+ * distinguish
+ */
+#define CIFS_COPY_OP 1
+#define CIFS_RENAME_OP 2
+
+#define GETU16(var)  (*((__u16 *)var))	/* BB check for endian issues */
+#define GETU32(var)  (*((__u32 *)var))	/* BB check for endian issues */
+
+struct smb_hdr {
+	__be32 smb_buf_length;	/* BB length is only two (rarely three) bytes,
+		with one or two byte "type" preceding it that will be
+		zero - we could mask the type byte off */
+	__u8 Protocol[4];
+	__u8 Command;
+	union {
+		struct {
+			__u8 ErrorClass;
+			__u8 Reserved;
+			__le16 Error;
+		} __attribute__((packed)) DosError;
+		__le32 CifsError;
+	} __attribute__((packed)) Status;
+	__u8 Flags;
+	__le16 Flags2;		/* note: le */
+	__le16 PidHigh;
+	union {
+		struct {
+			__le32 SequenceNumber;  /* le */
+			__u32 Reserved; /* zero */
+		} __attribute__((packed)) Sequence;
+		__u8 SecuritySignature[8];	/* le */
+	} __attribute__((packed)) Signature;
+	__u8 pad[2];
+	__u16 Tid;
+	__le16 Pid;
+	__u16 Uid;
+	__le16 Mid;
+	__u8 WordCount;
+} __attribute__((packed));
+
+/* given a pointer to an smb_hdr, retrieve a void pointer to the ByteCount */
+static inline void *
+BCC(struct smb_hdr *smb)
+{
+	return (void *)smb + sizeof(*smb) + 2 * smb->WordCount;
+}
+
+/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
+#define pByteArea(smb_var) (BCC(smb_var) + 2)
+
+/* get the unconverted ByteCount for a SMB packet and return it */
+static inline __u16
+get_bcc(struct smb_hdr *hdr)
+{
+	__le16 *bc_ptr = (__le16 *)BCC(hdr);
+
+	return get_unaligned_le16(bc_ptr);
+}
+
+/* set the ByteCount for a SMB packet in little-endian */
+static inline void
+put_bcc(__u16 count, struct smb_hdr *hdr)
+{
+	__le16 *bc_ptr = (__le16 *)BCC(hdr);
+
+	put_unaligned_le16(count, bc_ptr);
+}
+
+/*
+ * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
+ * No longer as important, now that TCP names are more commonly used to
+ * resolve hosts.
+ */
+#define CNLEN 15
+
+/*
+ * Share Name Length (SNLEN)
+ * Note:  This length was limited by the SMB used to get
+ *        the Share info.   NetShareEnum only returned 13
+ *        chars, including the null termination.
+ * This was removed because it no longer is limiting.
+ */
+
+/*
+ * Comment Length
+ */
+#define MAXCOMMENTLEN 40
+
+/*
+ * The OS/2 maximum path name
+ */
+#define MAX_PATHCONF 256
+
+/*
+ *  SMB frame definitions  (following must be packed structs)
+ *  See the SNIA CIFS Specification for details.
+ *
+ *  The Naming convention is the lower case version of the
+ *  smb command code name for the struct and this is typedef to the
+ *  uppercase version of the same name with the prefix SMB_ removed
+ *  for brevity.  Although typedefs are not commonly used for
+ *  structure definitions in the Linux kernel, their use in the
+ *  CIFS standards document, which this code is based on, may
+ *  make this one of the cases where typedefs for structures make
+ *  sense to improve readability for readers of the standards doc.
+ *  Typedefs can always be removed later if they are too distracting
+ *  and they are only used for the CIFSs PDUs themselves, not
+ *  internal cifs vfs structures
+ *
+ */
+
+typedef struct negotiate_req {
+	struct smb_hdr hdr;	/* wct = 0 */
+	__le16 ByteCount;
+	unsigned char DialectsArray[1];
+} __attribute__((packed)) NEGOTIATE_REQ;
+
+/* Dialect index is 13 for LANMAN */
+
+#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */
+
+typedef struct lanman_neg_rsp {
+	struct smb_hdr hdr;	/* wct = 13 */
+	__le16 DialectIndex;
+	__le16 SecurityMode;
+	__le16 MaxBufSize;
+	__le16 MaxMpxCount;
+	__le16 MaxNumberVcs;
+	__le16 RawMode;
+	__le32 SessionKey;
+	struct {
+		__le16 Time;
+		__le16 Date;
+	} __attribute__((packed)) SrvTime;
+	__le16 ServerTimeZone;
+	__le16 EncryptionKeyLength;
+	__le16 Reserved;
+	__u16  ByteCount;
+	unsigned char EncryptionKey[1];
+} __attribute__((packed)) LANMAN_NEG_RSP;
+
+#define READ_RAW_ENABLE 1
+#define WRITE_RAW_ENABLE 2
+#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
+#define SMB1_CLIENT_GUID_SIZE (16)
+typedef struct negotiate_rsp {
+	struct smb_hdr hdr;	/* wct = 17 */
+	__le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
+	__u8 SecurityMode;
+	__le16 MaxMpxCount;
+	__le16 MaxNumberVcs;
+	__le32 MaxBufferSize;
+	__le32 MaxRawSize;
+	__le32 SessionKey;
+	__le32 Capabilities;	/* see below */
+	__le32 SystemTimeLow;
+	__le32 SystemTimeHigh;
+	__le16 ServerTimeZone;
+	__u8 EncryptionKeyLength;
+	__u16 ByteCount;
+	union {
+		unsigned char EncryptionKey[1];	/* cap extended security off */
+		/* followed by Domain name - if extended security is off */
+		/* followed by 16 bytes of server GUID */
+		/* then security blob if cap_extended_security negotiated */
+		struct {
+			unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
+			unsigned char SecurityBlob[1];
+		} __attribute__((packed)) extended_response;
+	} __attribute__((packed)) u;
+} __attribute__((packed)) NEGOTIATE_RSP;
+
+/* SecurityMode bits */
+#define SECMODE_USER          0x01	/* off indicates share level security */
+#define SECMODE_PW_ENCRYPT    0x02
+#define SECMODE_SIGN_ENABLED  0x04	/* SMB security signatures enabled */
+#define SECMODE_SIGN_REQUIRED 0x08	/* SMB security signatures required */
+
+/* Negotiate response Capabilities */
+#define CAP_RAW_MODE           0x00000001
+#define CAP_MPX_MODE           0x00000002
+#define CAP_UNICODE            0x00000004
+#define CAP_LARGE_FILES        0x00000008
+#define CAP_NT_SMBS            0x00000010	/* implies CAP_NT_FIND */
+#define CAP_RPC_REMOTE_APIS    0x00000020
+#define CAP_STATUS32           0x00000040
+#define CAP_LEVEL_II_OPLOCKS   0x00000080
+#define CAP_LOCK_AND_READ      0x00000100
+#define CAP_NT_FIND            0x00000200
+#define CAP_DFS                0x00001000
+#define CAP_INFOLEVEL_PASSTHRU 0x00002000
+#define CAP_LARGE_READ_X       0x00004000
+#define CAP_LARGE_WRITE_X      0x00008000
+#define CAP_LWIO               0x00010000 /* support fctl_srv_req_resume_key */
+#define CAP_UNIX               0x00800000
+#define CAP_COMPRESSED_DATA    0x02000000
+#define CAP_DYNAMIC_REAUTH     0x20000000
+#define CAP_PERSISTENT_HANDLES 0x40000000
+#define CAP_EXTENDED_SECURITY  0x80000000
+
+typedef union smb_com_session_setup_andx {
+	struct {		/* request format */
+		struct smb_hdr hdr;	/* wct = 12 */
+		__u8 AndXCommand;
+		__u8 AndXReserved;
+		__le16 AndXOffset;
+		__le16 MaxBufferSize;
+		__le16 MaxMpxCount;
+		__le16 VcNumber;
+		__u32 SessionKey;
+		__le16 SecurityBlobLength;
+		__u32 Reserved;
+		__le32 Capabilities;	/* see below */
+		__le16 ByteCount;
+		unsigned char SecurityBlob[1];	/* followed by */
+		/* STRING NativeOS */
+		/* STRING NativeLanMan */
+	} __attribute__((packed)) req;	/* NTLM request format (with
+					extended security */
+
+	struct {		/* request format */
+		struct smb_hdr hdr;	/* wct = 13 */
+		__u8 AndXCommand;
+		__u8 AndXReserved;
+		__le16 AndXOffset;
+		__le16 MaxBufferSize;
+		__le16 MaxMpxCount;
+		__le16 VcNumber;
+		__u32 SessionKey;
+		__le16 CaseInsensitivePasswordLength; /* ASCII password len */
+		__le16 CaseSensitivePasswordLength; /* Unicode password length*/
+		__u32 Reserved;	/* see below */
+		__le32 Capabilities;
+		__le16 ByteCount;
+		unsigned char CaseInsensitivePassword[1];     /* followed by: */
+		/* unsigned char * CaseSensitivePassword; */
+		/* STRING AccountName */
+		/* STRING PrimaryDomain */
+		/* STRING NativeOS */
+		/* STRING NativeLanMan */
+	} __attribute__((packed)) req_no_secext; /* NTLM request format (without
+							extended security */
+
+	struct {		/* default (NTLM) response format */
+		struct smb_hdr hdr;	/* wct = 4 */
+		__u8 AndXCommand;
+		__u8 AndXReserved;
+		__le16 AndXOffset;
+		__le16 Action;	/* see below */
+		__le16 SecurityBlobLength;
+		__u16 ByteCount;
+		unsigned char SecurityBlob[1];	/* followed by */
+/*      unsigned char  * NativeOS;      */
+/*	unsigned char  * NativeLanMan;  */
+/*      unsigned char  * PrimaryDomain; */
+	} __attribute__((packed)) resp;	/* NTLM response
+					   (with or without extended sec) */
+
+	struct {		/* request format */
+		struct smb_hdr hdr;	/* wct = 10 */
+		__u8 AndXCommand;
+		__u8 AndXReserved;
+		__le16 AndXOffset;
+		__le16 MaxBufferSize;
+		__le16 MaxMpxCount;
+		__le16 VcNumber;
+		__u32 SessionKey;
+		__le16 PasswordLength;
+		__u32 Reserved; /* encrypt key len and offset */
+		__le16 ByteCount;
+		unsigned char AccountPassword[1];	/* followed by */
+		/* STRING AccountName */
+		/* STRING PrimaryDomain */
+		/* STRING NativeOS */
+		/* STRING NativeLanMan */
+	} __attribute__((packed)) old_req; /* pre-NTLM (LANMAN2.1) req format */
+
+	struct {		/* default (NTLM) response format */
+		struct smb_hdr hdr;	/* wct = 3 */
+		__u8 AndXCommand;
+		__u8 AndXReserved;
+		__le16 AndXOffset;
+		__le16 Action;	/* see below */
+		__u16 ByteCount;
+		unsigned char NativeOS[1];	/* followed by */
+/*	unsigned char * NativeLanMan; */
+/*      unsigned char * PrimaryDomain; */
+	} __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */
+} __attribute__((packed)) SESSION_SETUP_ANDX;
+
+/* format of NLTMv2 Response ie "case sensitive password" hash when NTLMv2 */
+
+#define NTLMSSP_SERVER_TYPE	1
+#define NTLMSSP_DOMAIN_TYPE	2
+#define NTLMSSP_FQ_DOMAIN_TYPE	3
+#define NTLMSSP_DNS_DOMAIN_TYPE	4
+#define NTLMSSP_DNS_PARENT_TYPE	5
+
+struct ntlmssp2_name {
+	__le16 type;
+	__le16 length;
+/*	char   name[length]; */
+} __attribute__((packed));
+
+struct ntlmv2_resp {
+	union {
+	    char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+	    struct {
+		__u8 reserved[8];
+		__u8 key[CIFS_SERVER_CHALLENGE_SIZE];
+	    } __attribute__((packed)) challenge;
+	} __attribute__((packed));
+	__le32 blob_signature;
+	__u32  reserved;
+	__le64  time;
+	__u64  client_chal; /* random */
+	__u32  reserved2;
+	/* array of name entries could follow ending in minimum 4 byte struct */
+} __attribute__((packed));
+
+
+#define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux"
+
+/* Capabilities bits (for NTLM SessSetup request) */
+#define CAP_UNICODE            0x00000004
+#define CAP_LARGE_FILES        0x00000008
+#define CAP_NT_SMBS            0x00000010
+#define CAP_STATUS32           0x00000040
+#define CAP_LEVEL_II_OPLOCKS   0x00000080
+#define CAP_NT_FIND            0x00000200	/* reserved should be zero
+				(because NT_SMBs implies the same thing?) */
+#define CAP_BULK_TRANSFER      0x20000000
+#define CAP_EXTENDED_SECURITY  0x80000000
+
+/* Action bits */
+#define GUEST_LOGIN 1
+
+typedef struct smb_com_tconx_req {
+	struct smb_hdr hdr;	/* wct = 4 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__le16 Flags;		/* see below */
+	__le16 PasswordLength;
+	__le16 ByteCount;
+	unsigned char Password[1];	/* followed by */
+/* STRING Path    *//* \\server\share name */
+	/* STRING Service */
+} __attribute__((packed)) TCONX_REQ;
+
+typedef struct smb_com_tconx_rsp {
+	struct smb_hdr hdr;	/* wct = 3 , not extended response */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__le16 OptionalSupport;	/* see below */
+	__u16 ByteCount;
+	unsigned char Service[1];	/* always ASCII, not Unicode */
+	/* STRING NativeFileSystem */
+} __attribute__((packed)) TCONX_RSP;
+
+typedef struct smb_com_tconx_rsp_ext {
+	struct smb_hdr hdr;	/* wct = 7, extended response */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__le16 OptionalSupport;	/* see below */
+	__le32 MaximalShareAccessRights;
+	__le32 GuestMaximalShareAccessRights;
+	__u16 ByteCount;
+	unsigned char Service[1];	/* always ASCII, not Unicode */
+	/* STRING NativeFileSystem */
+} __attribute__((packed)) TCONX_RSP_EXT;
+
+
+/* tree connect Flags */
+#define DISCONNECT_TID          0x0001
+#define TCON_EXTENDED_SIGNATURES 0x0004
+#define TCON_EXTENDED_SECINFO   0x0008
+
+/* OptionalSupport bits */
+#define SMB_SUPPORT_SEARCH_BITS 0x0001	/* "must have" directory search bits
+					 (exclusive searches supported) */
+#define SMB_SHARE_IS_IN_DFS     0x0002
+#define SMB_CSC_MASK               0x000C
+/* CSC flags defined as follows */
+#define SMB_CSC_CACHE_MANUAL_REINT 0x0000
+#define SMB_CSC_CACHE_AUTO_REINT   0x0004
+#define SMB_CSC_CACHE_VDO          0x0008
+#define SMB_CSC_NO_CACHING         0x000C
+#define SMB_UNIQUE_FILE_NAME    0x0010
+#define SMB_EXTENDED_SIGNATURES 0x0020
+
+/* services
+ *
+ * A:       ie disk
+ * LPT1:    ie printer
+ * IPC      ie named pipe
+ * COMM
+ * ?????    ie any type
+ *
+ */
+
+typedef struct smb_com_echo_req {
+	struct	smb_hdr hdr;
+	__le16	EchoCount;
+	__le16	ByteCount;
+	char	Data[1];
+} __attribute__((packed)) ECHO_REQ;
+
+typedef struct smb_com_echo_rsp {
+	struct	smb_hdr hdr;
+	__le16	SequenceNumber;
+	__le16	ByteCount;
+	char	Data[1];
+} __attribute__((packed)) ECHO_RSP;
+
+typedef struct smb_com_logoff_andx_req {
+	struct smb_hdr hdr;	/* wct = 2 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__u16 AndXOffset;
+	__u16 ByteCount;
+} __attribute__((packed)) LOGOFF_ANDX_REQ;
+
+typedef struct smb_com_logoff_andx_rsp {
+	struct smb_hdr hdr;	/* wct = 2 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__u16 AndXOffset;
+	__u16 ByteCount;
+} __attribute__((packed)) LOGOFF_ANDX_RSP;
+
+typedef union smb_com_tree_disconnect {	/* as an altetnative can use flag on
+					tree_connect PDU to effect disconnect */
+					/* tdis is probably simplest SMB PDU */
+	struct {
+		struct smb_hdr hdr;	/* wct = 0 */
+		__u16 ByteCount;	/* bcc = 0 */
+	} __attribute__((packed)) req;
+	struct {
+		struct smb_hdr hdr;	/* wct = 0 */
+		__u16 ByteCount;	/* bcc = 0 */
+	} __attribute__((packed)) resp;
+} __attribute__((packed)) TREE_DISCONNECT;
+
+typedef struct smb_com_close_req {
+	struct smb_hdr hdr;	/* wct = 3 */
+	__u16 FileID;
+	__u32 LastWriteTime;	/* should be zero or -1 */
+	__u16 ByteCount;	/* 0 */
+} __attribute__((packed)) CLOSE_REQ;
+
+typedef struct smb_com_close_rsp {
+	struct smb_hdr hdr;	/* wct = 0 */
+	__u16 ByteCount;	/* bct = 0 */
+} __attribute__((packed)) CLOSE_RSP;
+
+typedef struct smb_com_flush_req {
+	struct smb_hdr hdr;	/* wct = 1 */
+	__u16 FileID;
+	__u16 ByteCount;	/* 0 */
+} __attribute__((packed)) FLUSH_REQ;
+
+typedef struct smb_com_findclose_req {
+	struct smb_hdr hdr; /* wct = 1 */
+	__u16 FileID;
+	__u16 ByteCount;    /* 0 */
+} __attribute__((packed)) FINDCLOSE_REQ;
+
+/* OpenFlags */
+#define REQ_MORE_INFO      0x00000001  /* legacy (OPEN_AND_X) only */
+#define REQ_OPLOCK         0x00000002
+#define REQ_BATCHOPLOCK    0x00000004
+#define REQ_OPENDIRONLY    0x00000008
+#define REQ_EXTENDED_INFO  0x00000010
+
+/* File type */
+#define DISK_TYPE		0x0000
+#define BYTE_PIPE_TYPE		0x0001
+#define MESSAGE_PIPE_TYPE	0x0002
+#define PRINTER_TYPE		0x0003
+#define COMM_DEV_TYPE		0x0004
+#define UNKNOWN_TYPE		0xFFFF
+
+/* Device Type or File Status Flags */
+#define NO_EAS			0x0001
+#define NO_SUBSTREAMS		0x0002
+#define NO_REPARSETAG		0x0004
+/* following flags can apply if pipe */
+#define ICOUNT_MASK		0x00FF
+#define PIPE_READ_MODE		0x0100
+#define NAMED_PIPE_TYPE		0x0400
+#define PIPE_END_POINT		0x4000
+#define BLOCKING_NAMED_PIPE	0x8000
+
+typedef struct smb_com_open_req {	/* also handles create */
+	struct smb_hdr hdr;	/* wct = 24 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u8 Reserved;		/* Must Be Zero */
+	__le16 NameLength;
+	__le32 OpenFlags;
+	__u32  RootDirectoryFid;
+	__le32 DesiredAccess;
+	__le64 AllocationSize;
+	__le32 FileAttributes;
+	__le32 ShareAccess;
+	__le32 CreateDisposition;
+	__le32 CreateOptions;
+	__le32 ImpersonationLevel;
+	__u8 SecurityFlags;
+	__le16 ByteCount;
+	char fileName[1];
+} __attribute__((packed)) OPEN_REQ;
+
+/* open response: oplock levels */
+#define OPLOCK_NONE  	 0
+#define OPLOCK_EXCLUSIVE 1
+#define OPLOCK_BATCH	 2
+#define OPLOCK_READ	 3  /* level 2 oplock */
+
+/* open response for CreateAction shifted left */
+#define CIFS_CREATE_ACTION 0x20000 /* file created */
+
+typedef struct smb_com_open_rsp {
+	struct smb_hdr hdr;	/* wct = 34 BB */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u8 OplockLevel;
+	__u16 Fid;
+	__le32 CreateAction;
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le32 FileAttributes;
+	__le64 AllocationSize;
+	__le64 EndOfFile;
+	__le16 FileType;
+	__le16 DeviceState;
+	__u8 DirectoryFlag;
+	__u16 ByteCount;	/* bct = 0 */
+} __attribute__((packed)) OPEN_RSP;
+
+typedef struct smb_com_open_rsp_ext {
+	struct smb_hdr hdr;     /* wct = 42 but meaningless due to MS bug? */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u8 OplockLevel;
+	__u16 Fid;
+	__le32 CreateAction;
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le32 FileAttributes;
+	__le64 AllocationSize;
+	__le64 EndOfFile;
+	__le16 FileType;
+	__le16 DeviceState;
+	__u8 DirectoryFlag;
+	__u8 VolumeGUID[16];
+	__u64 FileId; /* note no endian conversion - is opaque UniqueID */
+	__le32 MaximalAccessRights;
+	__le32 GuestMaximalAccessRights;
+	__u16 ByteCount;        /* bct = 0 */
+} __attribute__((packed)) OPEN_RSP_EXT;
+
+
+/* format of legacy open request */
+typedef struct smb_com_openx_req {
+	struct smb_hdr	hdr;	/* wct = 15 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__le16 OpenFlags;
+	__le16 Mode;
+	__le16 Sattr; /* search attributes */
+	__le16 FileAttributes;  /* dos attrs */
+	__le32 CreateTime; /* os2 format */
+	__le16 OpenFunction;
+	__le32 EndOfFile;
+	__le32 Timeout;
+	__le32 Reserved;
+	__le16  ByteCount;  /* file name follows */
+	char   fileName[1];
+} __attribute__((packed)) OPENX_REQ;
+
+typedef struct smb_com_openx_rsp {
+	struct smb_hdr	hdr;	/* wct = 15 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u16  Fid;
+	__le16 FileAttributes;
+	__le32 LastWriteTime; /* os2 format */
+	__le32 EndOfFile;
+	__le16 Access;
+	__le16 FileType;
+	__le16 IPCState;
+	__le16 Action;
+	__u32  FileId;
+	__u16  Reserved;
+	__u16  ByteCount;
+} __attribute__((packed)) OPENX_RSP;
+
+/* For encoding of POSIX Open Request - see trans2 function 0x209 data struct */
+
+/* Legacy write request for older servers */
+typedef struct smb_com_writex_req {
+	struct smb_hdr hdr;     /* wct = 12 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u16 Fid;
+	__le32 OffsetLow;
+	__u32 Reserved; /* Timeout */
+	__le16 WriteMode; /* 1 = write through */
+	__le16 Remaining;
+	__le16 Reserved2;
+	__le16 DataLengthLow;
+	__le16 DataOffset;
+	__le16 ByteCount;
+	__u8 Pad;		/* BB check for whether padded to DWORD
+				   boundary and optimum performance here */
+	char Data[0];
+} __attribute__((packed)) WRITEX_REQ;
+
+typedef struct smb_com_write_req {
+	struct smb_hdr hdr;	/* wct = 14 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u16 Fid;
+	__le32 OffsetLow;
+	__u32 Reserved;
+	__le16 WriteMode;
+	__le16 Remaining;
+	__le16 DataLengthHigh;
+	__le16 DataLengthLow;
+	__le16 DataOffset;
+	__le32 OffsetHigh;
+	__le16 ByteCount;
+	__u8 Pad;		/* BB check for whether padded to DWORD
+				   boundary and optimum performance here */
+	char Data[0];
+} __attribute__((packed)) WRITE_REQ;
+
+typedef struct smb_com_write_rsp {
+	struct smb_hdr hdr;	/* wct = 6 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__le16 Count;
+	__le16 Remaining;
+	__le16 CountHigh;
+	__u16  Reserved;
+	__u16 ByteCount;
+} __attribute__((packed)) WRITE_RSP;
+
+/* legacy read request for older servers */
+typedef struct smb_com_readx_req {
+	struct smb_hdr hdr;	/* wct = 10 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u16 Fid;
+	__le32 OffsetLow;
+	__le16 MaxCount;
+	__le16 MinCount;	/* obsolete */
+	__le32 Reserved;
+	__le16 Remaining;
+	__le16 ByteCount;
+} __attribute__((packed)) READX_REQ;
+
+typedef struct smb_com_read_req {
+	struct smb_hdr hdr;	/* wct = 12 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u16 Fid;
+	__le32 OffsetLow;
+	__le16 MaxCount;
+	__le16 MinCount;		/* obsolete */
+	__le32 MaxCountHigh;
+	__le16 Remaining;
+	__le32 OffsetHigh;
+	__le16 ByteCount;
+} __attribute__((packed)) READ_REQ;
+
+typedef struct smb_com_read_rsp {
+	struct smb_hdr hdr;	/* wct = 12 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__le16 Remaining;
+	__le16 DataCompactionMode;
+	__le16 Reserved;
+	__le16 DataLength;
+	__le16 DataOffset;
+	__le16 DataLengthHigh;
+	__u64 Reserved2;
+	__u16 ByteCount;
+	/* read response data immediately follows */
+} __attribute__((packed)) READ_RSP;
+
+typedef struct locking_andx_range {
+	__le16 Pid;
+	__le16 Pad;
+	__le32 OffsetHigh;
+	__le32 OffsetLow;
+	__le32 LengthHigh;
+	__le32 LengthLow;
+} __attribute__((packed)) LOCKING_ANDX_RANGE;
+
+#define LOCKING_ANDX_SHARED_LOCK     0x01
+#define LOCKING_ANDX_OPLOCK_RELEASE  0x02
+#define LOCKING_ANDX_CHANGE_LOCKTYPE 0x04
+#define LOCKING_ANDX_CANCEL_LOCK     0x08
+#define LOCKING_ANDX_LARGE_FILES     0x10	/* always on for us */
+
+typedef struct smb_com_lock_req {
+	struct smb_hdr hdr;	/* wct = 8 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u16 Fid;
+	__u8 LockType;
+	__u8 OplockLevel;
+	__le32 Timeout;
+	__le16 NumberOfUnlocks;
+	__le16 NumberOfLocks;
+	__le16 ByteCount;
+	LOCKING_ANDX_RANGE Locks[1];
+} __attribute__((packed)) LOCK_REQ;
+
+/* lock type */
+#define CIFS_RDLCK	0
+#define CIFS_WRLCK	1
+#define CIFS_UNLCK      2
+typedef struct cifs_posix_lock {
+	__le16  lock_type;  /* 0 = Read, 1 = Write, 2 = Unlock */
+	__le16  lock_flags; /* 1 = Wait (only valid for setlock) */
+	__le32  pid;
+	__le64	start;
+	__le64	length;
+	/* BB what about additional owner info to identify network client */
+} __attribute__((packed)) CIFS_POSIX_LOCK;
+
+typedef struct smb_com_lock_rsp {
+	struct smb_hdr hdr;	/* wct = 2 */
+	__u8 AndXCommand;
+	__u8 AndXReserved;
+	__le16 AndXOffset;
+	__u16 ByteCount;
+} __attribute__((packed)) LOCK_RSP;
+
+typedef struct smb_com_rename_req {
+	struct smb_hdr hdr;	/* wct = 1 */
+	__le16 SearchAttributes;	/* target file attributes */
+	__le16 ByteCount;
+	__u8 BufferFormat;	/* 4 = ASCII or Unicode */
+	unsigned char OldFileName[1];
+	/* followed by __u8 BufferFormat2 */
+	/* followed by NewFileName */
+} __attribute__((packed)) RENAME_REQ;
+
+	/* copy request flags */
+#define COPY_MUST_BE_FILE      0x0001
+#define COPY_MUST_BE_DIR       0x0002
+#define COPY_TARGET_MODE_ASCII 0x0004 /* if not set, binary */
+#define COPY_SOURCE_MODE_ASCII 0x0008 /* if not set, binary */
+#define COPY_VERIFY_WRITES     0x0010
+#define COPY_TREE              0x0020
+
+typedef struct smb_com_copy_req {
+	struct smb_hdr hdr;	/* wct = 3 */
+	__u16 Tid2;
+	__le16 OpenFunction;
+	__le16 Flags;
+	__le16 ByteCount;
+	__u8 BufferFormat;	/* 4 = ASCII or Unicode */
+	unsigned char OldFileName[1];
+	/* followed by __u8 BufferFormat2 */
+	/* followed by NewFileName string */
+} __attribute__((packed)) COPY_REQ;
+
+typedef struct smb_com_copy_rsp {
+	struct smb_hdr hdr;     /* wct = 1 */
+	__le16 CopyCount;    /* number of files copied */
+	__u16 ByteCount;    /* may be zero */
+	__u8 BufferFormat;  /* 0x04 - only present if errored file follows */
+	unsigned char ErrorFileName[1]; /* only present if error in copy */
+} __attribute__((packed)) COPY_RSP;
+
+#define CREATE_HARD_LINK		0x103
+#define MOVEFILE_COPY_ALLOWED		0x0002
+#define MOVEFILE_REPLACE_EXISTING	0x0001
+
+typedef struct smb_com_nt_rename_req {	/* A5 - also used for create hardlink */
+	struct smb_hdr hdr;	/* wct = 4 */
+	__le16 SearchAttributes;	/* target file attributes */
+	__le16 Flags;		/* spec says Information Level */
+	__le32 ClusterCount;
+	__le16 ByteCount;
+	__u8 BufferFormat;	/* 4 = ASCII or Unicode */
+	unsigned char OldFileName[1];
+	/* followed by __u8 BufferFormat2 */
+	/* followed by NewFileName */
+} __attribute__((packed)) NT_RENAME_REQ;
+
+typedef struct smb_com_rename_rsp {
+	struct smb_hdr hdr;	/* wct = 0 */
+	__u16 ByteCount;	/* bct = 0 */
+} __attribute__((packed)) RENAME_RSP;
+
+typedef struct smb_com_delete_file_req {
+	struct smb_hdr hdr;	/* wct = 1 */
+	__le16 SearchAttributes;
+	__le16 ByteCount;
+	__u8 BufferFormat;	/* 4 = ASCII */
+	unsigned char fileName[1];
+} __attribute__((packed)) DELETE_FILE_REQ;
+
+typedef struct smb_com_delete_file_rsp {
+	struct smb_hdr hdr;	/* wct = 0 */
+	__u16 ByteCount;	/* bct = 0 */
+} __attribute__((packed)) DELETE_FILE_RSP;
+
+typedef struct smb_com_delete_directory_req {
+	struct smb_hdr hdr;	/* wct = 0 */
+	__le16 ByteCount;
+	__u8 BufferFormat;	/* 4 = ASCII */
+	unsigned char DirName[1];
+} __attribute__((packed)) DELETE_DIRECTORY_REQ;
+
+typedef struct smb_com_delete_directory_rsp {
+	struct smb_hdr hdr;	/* wct = 0 */
+	__u16 ByteCount;	/* bct = 0 */
+} __attribute__((packed)) DELETE_DIRECTORY_RSP;
+
+typedef struct smb_com_create_directory_req {
+	struct smb_hdr hdr;	/* wct = 0 */
+	__le16 ByteCount;
+	__u8 BufferFormat;	/* 4 = ASCII */
+	unsigned char DirName[1];
+} __attribute__((packed)) CREATE_DIRECTORY_REQ;
+
+typedef struct smb_com_create_directory_rsp {
+	struct smb_hdr hdr;	/* wct = 0 */
+	__u16 ByteCount;	/* bct = 0 */
+} __attribute__((packed)) CREATE_DIRECTORY_RSP;
+
+typedef struct smb_com_query_information_req {
+	struct smb_hdr hdr;     /* wct = 0 */
+	__le16 ByteCount;	/* 1 + namelen + 1 */
+	__u8 BufferFormat;      /* 4 = ASCII */
+	unsigned char FileName[1];
+} __attribute__((packed)) QUERY_INFORMATION_REQ;
+
+typedef struct smb_com_query_information_rsp {
+	struct smb_hdr hdr;     /* wct = 10 */
+	__le16 attr;
+	__le32  last_write_time;
+	__le32 size;
+	__u16  reserved[5];
+	__le16 ByteCount;	/* bcc = 0 */
+} __attribute__((packed)) QUERY_INFORMATION_RSP;
+
+typedef struct smb_com_setattr_req {
+	struct smb_hdr hdr; /* wct = 8 */
+	__le16 attr;
+	__le16 time_low;
+	__le16 time_high;
+	__le16 reserved[5]; /* must be zero */
+	__u16  ByteCount;
+	__u8   BufferFormat; /* 4 = ASCII */
+	unsigned char fileName[1];
+} __attribute__((packed)) SETATTR_REQ;
+
+typedef struct smb_com_setattr_rsp {
+	struct smb_hdr hdr;     /* wct = 0 */
+	__u16 ByteCount;        /* bct = 0 */
+} __attribute__((packed)) SETATTR_RSP;
+
+/* empty wct response to setattr */
+
+/*******************************************************/
+/* NT Transact structure definitions follow            */
+/* Currently only ioctl, acl (get security descriptor) */
+/* and notify are implemented                          */
+/*******************************************************/
+typedef struct smb_com_ntransact_req {
+	struct smb_hdr hdr; /* wct >= 19 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* four setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand; /* 2 = IOCTL/FSCTL */
+	/* SetupCount words follow then */
+	__le16 ByteCount;
+	__u8 Pad[3];
+	__u8 Parms[0];
+} __attribute__((packed)) NTRANSACT_REQ;
+
+typedef struct smb_com_ntransact_rsp {
+	struct smb_hdr hdr;     /* wct = 18 */
+	__u8 Reserved[3];
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 ParameterDisplacement;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__le32 DataDisplacement;
+	__u8 SetupCount;   /* 0 */
+	__u16 ByteCount;
+	/* __u8 Pad[3]; */
+	/* parms and data follow */
+} __attribute__((packed)) NTRANSACT_RSP;
+
+/* See MS-SMB 2.2.7.2.1.1 */
+struct srv_copychunk {
+	__le64 SourceOffset;
+	__le64 DestinationOffset;
+	__le32 CopyLength;
+	__u32  Reserved;
+} __packed;
+
+typedef struct smb_com_transaction_ioctl_req {
+	struct smb_hdr hdr;	/* wct = 23 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* four setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand; /* 2 = IOCTL/FSCTL */
+	__le32 FunctionCode;
+	__u16 Fid;
+	__u8 IsFsctl;  /* 1 = File System Control 0 = device control (IOCTL) */
+	__u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */
+	__le16 ByteCount;
+	__u8 Pad[3];
+	__u8 Data[1];
+} __attribute__((packed)) TRANSACT_IOCTL_REQ;
+
+typedef struct smb_com_transaction_compr_ioctl_req {
+	struct smb_hdr hdr;	/* wct = 23 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* four setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand; /* 2 = IOCTL/FSCTL */
+	__le32 FunctionCode;
+	__u16 Fid;
+	__u8 IsFsctl;  /* 1 = File System Control 0 = device control (IOCTL) */
+	__u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */
+	__le16 ByteCount;
+	__u8 Pad[3];
+	__le16 compression_state;  /* See below for valid flags */
+} __attribute__((packed)) TRANSACT_COMPR_IOCTL_REQ;
+
+/* compression state flags */
+#define COMPRESSION_FORMAT_NONE		0x0000
+#define COMPRESSION_FORMAT_DEFAULT	0x0001
+#define COMPRESSION_FORMAT_LZNT1	0x0002
+
+typedef struct smb_com_transaction_ioctl_rsp {
+	struct smb_hdr hdr;	/* wct = 19 */
+	__u8 Reserved[3];
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 ParameterDisplacement;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__le32 DataDisplacement;
+	__u8 SetupCount;	/* 1 */
+	__le16 ReturnedDataLen;
+	__u16 ByteCount;
+} __attribute__((packed)) TRANSACT_IOCTL_RSP;
+
+#define CIFS_ACL_OWNER 1
+#define CIFS_ACL_GROUP 2
+#define CIFS_ACL_DACL  4
+#define CIFS_ACL_SACL  8
+
+typedef struct smb_com_transaction_qsec_req {
+	struct smb_hdr hdr;     /* wct = 19 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* no setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand; /* 6 = QUERY_SECURITY_DESC */
+	__le16 ByteCount; /* bcc = 3 + 8 */
+	__u8 Pad[3];
+	__u16 Fid;
+	__u16 Reserved2;
+	__le32 AclFlags;
+} __attribute__((packed)) QUERY_SEC_DESC_REQ;
+
+
+typedef struct smb_com_transaction_ssec_req {
+	struct smb_hdr hdr;     /* wct = 19 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* no setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand; /* 3 = SET_SECURITY_DESC */
+	__le16 ByteCount; /* bcc = 3 + 8 */
+	__u8 Pad[3];
+	__u16 Fid;
+	__u16 Reserved2;
+	__le32 AclFlags;
+} __attribute__((packed)) SET_SEC_DESC_REQ;
+
+typedef struct smb_com_transaction_change_notify_req {
+	struct smb_hdr hdr;     /* wct = 23 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* four setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand;/* 4 = Change Notify */
+	__le32 CompletionFilter;  /* operation to monitor */
+	__u16 Fid;
+	__u8 WatchTree;  /* 1 = Monitor subdirectories */
+	__u8 Reserved2;
+	__le16 ByteCount;
+/* 	__u8 Pad[3];*/
+/*	__u8 Data[1];*/
+} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ;
+
+/* BB eventually change to use generic ntransact rsp struct
+      and validation routine */
+typedef struct smb_com_transaction_change_notify_rsp {
+	struct smb_hdr hdr;	/* wct = 18 */
+	__u8 Reserved[3];
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 ParameterDisplacement;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__le32 DataDisplacement;
+	__u8 SetupCount;   /* 0 */
+	__u16 ByteCount;
+	/* __u8 Pad[3]; */
+} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_RSP;
+/* Completion Filter flags for Notify */
+#define FILE_NOTIFY_CHANGE_FILE_NAME    0x00000001
+#define FILE_NOTIFY_CHANGE_DIR_NAME     0x00000002
+#define FILE_NOTIFY_CHANGE_NAME         0x00000003
+#define FILE_NOTIFY_CHANGE_ATTRIBUTES   0x00000004
+#define FILE_NOTIFY_CHANGE_SIZE         0x00000008
+#define FILE_NOTIFY_CHANGE_LAST_WRITE   0x00000010
+#define FILE_NOTIFY_CHANGE_LAST_ACCESS  0x00000020
+#define FILE_NOTIFY_CHANGE_CREATION     0x00000040
+#define FILE_NOTIFY_CHANGE_EA           0x00000080
+#define FILE_NOTIFY_CHANGE_SECURITY     0x00000100
+#define FILE_NOTIFY_CHANGE_STREAM_NAME  0x00000200
+#define FILE_NOTIFY_CHANGE_STREAM_SIZE  0x00000400
+#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
+
+#define FILE_ACTION_ADDED		0x00000001
+#define FILE_ACTION_REMOVED		0x00000002
+#define FILE_ACTION_MODIFIED		0x00000003
+#define FILE_ACTION_RENAMED_OLD_NAME	0x00000004
+#define FILE_ACTION_RENAMED_NEW_NAME	0x00000005
+#define FILE_ACTION_ADDED_STREAM	0x00000006
+#define FILE_ACTION_REMOVED_STREAM	0x00000007
+#define FILE_ACTION_MODIFIED_STREAM	0x00000008
+
+/* response contains array of the following structures */
+struct file_notify_information {
+	__le32 NextEntryOffset;
+	__le32 Action;
+	__le32 FileNameLength;
+	__u8  FileName[0];
+} __attribute__((packed));
+
+/* For IO_REPARSE_TAG_SYMLINK */
+struct reparse_symlink_data {
+	__le32	ReparseTag;
+	__le16	ReparseDataLength;
+	__u16	Reserved;
+	__le16	SubstituteNameOffset;
+	__le16	SubstituteNameLength;
+	__le16	PrintNameOffset;
+	__le16	PrintNameLength;
+	__le32	Flags;
+	char	PathBuffer[0];
+} __attribute__((packed));
+
+/* For IO_REPARSE_TAG_NFS */
+#define NFS_SPECFILE_LNK	0x00000000014B4E4C
+#define NFS_SPECFILE_CHR	0x0000000000524843
+#define NFS_SPECFILE_BLK	0x00000000004B4C42
+#define NFS_SPECFILE_FIFO	0x000000004F464946
+#define NFS_SPECFILE_SOCK	0x000000004B434F53
+struct reparse_posix_data {
+	__le32	ReparseTag;
+	__le16	ReparseDataLength;
+	__u16	Reserved;
+	__le64	InodeType; /* LNK, FIFO, CHR etc. */
+	char	PathBuffer[0];
+} __attribute__((packed));
+
+struct cifs_quota_data {
+	__u32	rsrvd1;  /* 0 */
+	__u32	sid_size;
+	__u64	rsrvd2;  /* 0 */
+	__u64	space_used;
+	__u64	soft_limit;
+	__u64	hard_limit;
+	char	sid[1];  /* variable size? */
+} __attribute__((packed));
+
+/* quota sub commands */
+#define QUOTA_LIST_CONTINUE	    0
+#define QUOTA_LIST_START	0x100
+#define QUOTA_FOR_SID		0x101
+
+struct trans2_req {
+	/* struct smb_hdr hdr precedes. Set wct = 14+ */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__u8 SetupCount;
+	__u8 Reserved3;
+	__le16 SubCommand; /* 1st setup word - SetupCount words follow */
+	__le16 ByteCount;
+} __attribute__((packed));
+
+struct smb_t2_req {
+	struct smb_hdr hdr;
+	struct trans2_req t2_req;
+} __attribute__((packed));
+
+struct trans2_resp {
+	/* struct smb_hdr hdr precedes. Note wct = 10 + setup count */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__u16 Reserved;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 ParameterDisplacement;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__le16 DataDisplacement;
+	__u8 SetupCount;
+	__u8 Reserved1;
+	/* SetupWords[SetupCount];
+	__u16 ByteCount;
+	__u16 Reserved2;*/
+	/* data area follows */
+} __attribute__((packed));
+
+struct smb_t2_rsp {
+	struct smb_hdr hdr;
+	struct trans2_resp t2_rsp;
+} __attribute__((packed));
+
+/* PathInfo/FileInfo infolevels */
+#define SMB_INFO_STANDARD                   1
+#define SMB_SET_FILE_EA                     2
+#define SMB_QUERY_FILE_EA_SIZE              2
+#define SMB_INFO_QUERY_EAS_FROM_LIST        3
+#define SMB_INFO_QUERY_ALL_EAS              4
+#define SMB_INFO_IS_NAME_VALID              6
+#define SMB_QUERY_FILE_BASIC_INFO       0x101
+#define SMB_QUERY_FILE_STANDARD_INFO    0x102
+#define SMB_QUERY_FILE_EA_INFO          0x103
+#define SMB_QUERY_FILE_NAME_INFO        0x104
+#define SMB_QUERY_FILE_ALLOCATION_INFO  0x105
+#define SMB_QUERY_FILE_END_OF_FILEINFO  0x106
+#define SMB_QUERY_FILE_ALL_INFO         0x107
+#define SMB_QUERY_ALT_NAME_INFO         0x108
+#define SMB_QUERY_FILE_STREAM_INFO      0x109
+#define SMB_QUERY_FILE_COMPRESSION_INFO 0x10B
+#define SMB_QUERY_FILE_UNIX_BASIC       0x200
+#define SMB_QUERY_FILE_UNIX_LINK        0x201
+#define SMB_QUERY_POSIX_ACL             0x204
+#define SMB_QUERY_XATTR                 0x205  /* e.g. system EA name space */
+#define SMB_QUERY_ATTR_FLAGS            0x206  /* append,immutable etc. */
+#define SMB_QUERY_POSIX_PERMISSION      0x207
+#define SMB_QUERY_POSIX_LOCK            0x208
+/* #define SMB_POSIX_OPEN               0x209 */
+/* #define SMB_POSIX_UNLINK             0x20a */
+#define SMB_QUERY_FILE__UNIX_INFO2      0x20b
+#define SMB_QUERY_FILE_INTERNAL_INFO    0x3ee
+#define SMB_QUERY_FILE_ACCESS_INFO      0x3f0
+#define SMB_QUERY_FILE_NAME_INFO2       0x3f1 /* 0x30 bytes */
+#define SMB_QUERY_FILE_POSITION_INFO    0x3f6
+#define SMB_QUERY_FILE_MODE_INFO        0x3f8
+#define SMB_QUERY_FILE_ALGN_INFO        0x3f9
+
+
+#define SMB_SET_FILE_BASIC_INFO	        0x101
+#define SMB_SET_FILE_DISPOSITION_INFO   0x102
+#define SMB_SET_FILE_ALLOCATION_INFO    0x103
+#define SMB_SET_FILE_END_OF_FILE_INFO   0x104
+#define SMB_SET_FILE_UNIX_BASIC         0x200
+#define SMB_SET_FILE_UNIX_LINK          0x201
+#define SMB_SET_FILE_UNIX_HLINK         0x203
+#define SMB_SET_POSIX_ACL               0x204
+#define SMB_SET_XATTR                   0x205
+#define SMB_SET_ATTR_FLAGS              0x206  /* append, immutable etc. */
+#define SMB_SET_POSIX_LOCK              0x208
+#define SMB_POSIX_OPEN                  0x209
+#define SMB_POSIX_UNLINK                0x20a
+#define SMB_SET_FILE_UNIX_INFO2         0x20b
+#define SMB_SET_FILE_BASIC_INFO2        0x3ec
+#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */
+#define SMB_FILE_ALL_INFO2              0x3fa
+#define SMB_SET_FILE_ALLOCATION_INFO2   0x3fb
+#define SMB_SET_FILE_END_OF_FILE_INFO2  0x3fc
+#define SMB_FILE_MOVE_CLUSTER_INFO      0x407
+#define SMB_FILE_QUOTA_INFO             0x408
+#define SMB_FILE_REPARSEPOINT_INFO      0x409
+#define SMB_FILE_MAXIMUM_INFO           0x40d
+
+/* Find File infolevels */
+#define SMB_FIND_FILE_INFO_STANDARD       0x001
+#define SMB_FIND_FILE_QUERY_EA_SIZE       0x002
+#define SMB_FIND_FILE_QUERY_EAS_FROM_LIST 0x003
+#define SMB_FIND_FILE_DIRECTORY_INFO      0x101
+#define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102
+#define SMB_FIND_FILE_NAMES_INFO          0x103
+#define SMB_FIND_FILE_BOTH_DIRECTORY_INFO 0x104
+#define SMB_FIND_FILE_ID_FULL_DIR_INFO    0x105
+#define SMB_FIND_FILE_ID_BOTH_DIR_INFO    0x106
+#define SMB_FIND_FILE_UNIX                0x202
+
+typedef struct smb_com_transaction2_qpi_req {
+	struct smb_hdr hdr;	/* wct = 14+ */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__u8 SetupCount;
+	__u8 Reserved3;
+	__le16 SubCommand;	/* one setup word */
+	__le16 ByteCount;
+	__u8 Pad;
+	__le16 InformationLevel;
+	__u32 Reserved4;
+	char FileName[1];
+} __attribute__((packed)) TRANSACTION2_QPI_REQ;
+
+typedef struct smb_com_transaction2_qpi_rsp {
+	struct smb_hdr hdr;	/* wct = 10 + SetupCount */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+	__u16 Reserved2; /* parameter word is present for infolevels > 100 */
+} __attribute__((packed)) TRANSACTION2_QPI_RSP;
+
+typedef struct smb_com_transaction2_spi_req {
+	struct smb_hdr hdr;	/* wct = 15 */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__u8 SetupCount;
+	__u8 Reserved3;
+	__le16 SubCommand;	/* one setup word */
+	__le16 ByteCount;
+	__u8 Pad;
+	__u16 Pad1;
+	__le16 InformationLevel;
+	__u32 Reserved4;
+	char FileName[1];
+} __attribute__((packed)) TRANSACTION2_SPI_REQ;
+
+typedef struct smb_com_transaction2_spi_rsp {
+	struct smb_hdr hdr;	/* wct = 10 + SetupCount */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+	__u16 Reserved2; /* parameter word is present for infolevels > 100 */
+} __attribute__((packed)) TRANSACTION2_SPI_RSP;
+
+struct set_file_rename {
+	__le32 overwrite;   /* 1 = overwrite dest */
+	__u32 root_fid;   /* zero */
+	__le32 target_name_len;
+	char  target_name[0];  /* Must be unicode */
+} __attribute__((packed));
+
+struct smb_com_transaction2_sfi_req {
+	struct smb_hdr hdr;	/* wct = 15 */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__u8 SetupCount;
+	__u8 Reserved3;
+	__le16 SubCommand;	/* one setup word */
+	__le16 ByteCount;
+	__u8 Pad;
+	__u16 Pad1;
+	__u16 Fid;
+	__le16 InformationLevel;
+	__u16 Reserved4;
+} __attribute__((packed));
+
+struct smb_com_transaction2_sfi_rsp {
+	struct smb_hdr hdr;	/* wct = 10 + SetupCount */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+	__u16 Reserved2;	/* parameter word reserved -
+					present for infolevels > 100 */
+} __attribute__((packed));
+
+struct smb_t2_qfi_req {
+	struct	smb_hdr hdr;
+	struct	trans2_req t2;
+	__u8	Pad;
+	__u16	Fid;
+	__le16	InformationLevel;
+} __attribute__((packed));
+
+struct smb_t2_qfi_rsp {
+	struct smb_hdr hdr;     /* wct = 10 + SetupCount */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+	__u16 Reserved2;        /* parameter word reserved -
+				   present for infolevels > 100 */
+} __attribute__((packed));
+
+/*
+ * Flags on T2 FINDFIRST and FINDNEXT
+ */
+#define CIFS_SEARCH_CLOSE_ALWAYS  0x0001
+#define CIFS_SEARCH_CLOSE_AT_END  0x0002
+#define CIFS_SEARCH_RETURN_RESUME 0x0004
+#define CIFS_SEARCH_CONTINUE_FROM_LAST 0x0008
+#define CIFS_SEARCH_BACKUP_SEARCH 0x0010
+
+/*
+ * Size of the resume key on FINDFIRST and FINDNEXT calls
+ */
+#define CIFS_SMB_RESUME_KEY_SIZE 4
+
+typedef struct smb_com_transaction2_ffirst_req {
+	struct smb_hdr hdr;	/* wct = 15 */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__u8 SetupCount;	/* one */
+	__u8 Reserved3;
+	__le16 SubCommand;	/* TRANS2_FIND_FIRST */
+	__le16 ByteCount;
+	__u8 Pad;
+	__le16 SearchAttributes;
+	__le16 SearchCount;
+	__le16 SearchFlags;
+	__le16 InformationLevel;
+	__le32 SearchStorageType;
+	char FileName[1];
+} __attribute__((packed)) TRANSACTION2_FFIRST_REQ;
+
+typedef struct smb_com_transaction2_ffirst_rsp {
+	struct smb_hdr hdr;	/* wct = 10 */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+} __attribute__((packed)) TRANSACTION2_FFIRST_RSP;
+
+typedef struct smb_com_transaction2_ffirst_rsp_parms {
+	__u16 SearchHandle;
+	__le16 SearchCount;
+	__le16 EndofSearch;
+	__le16 EAErrorOffset;
+	__le16 LastNameOffset;
+} __attribute__((packed)) T2_FFIRST_RSP_PARMS;
+
+typedef struct smb_com_transaction2_fnext_req {
+	struct smb_hdr hdr;	/* wct = 15 */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__u8 SetupCount;	/* one */
+	__u8 Reserved3;
+	__le16 SubCommand;	/* TRANS2_FIND_NEXT */
+	__le16 ByteCount;
+	__u8 Pad;
+	__u16 SearchHandle;
+	__le16 SearchCount;
+	__le16 InformationLevel;
+	__u32 ResumeKey;
+	__le16 SearchFlags;
+	char ResumeFileName[1];
+} __attribute__((packed)) TRANSACTION2_FNEXT_REQ;
+
+typedef struct smb_com_transaction2_fnext_rsp {
+	struct smb_hdr hdr;	/* wct = 10 */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+} __attribute__((packed)) TRANSACTION2_FNEXT_RSP;
+
+typedef struct smb_com_transaction2_fnext_rsp_parms {
+	__le16 SearchCount;
+	__le16 EndofSearch;
+	__le16 EAErrorOffset;
+	__le16 LastNameOffset;
+} __attribute__((packed)) T2_FNEXT_RSP_PARMS;
+
+/* QFSInfo Levels */
+#define SMB_INFO_ALLOCATION         1
+#define SMB_INFO_VOLUME             2
+#define SMB_QUERY_FS_VOLUME_INFO    0x102
+#define SMB_QUERY_FS_SIZE_INFO      0x103
+#define SMB_QUERY_FS_DEVICE_INFO    0x104
+#define SMB_QUERY_FS_ATTRIBUTE_INFO 0x105
+#define SMB_QUERY_CIFS_UNIX_INFO    0x200
+#define SMB_QUERY_POSIX_FS_INFO     0x201
+#define SMB_QUERY_POSIX_WHO_AM_I    0x202
+#define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203
+#define SMB_QUERY_FS_PROXY          0x204 /* WAFS enabled. Returns structure
+					    FILE_SYSTEM__UNIX_INFO to tell
+					    whether new NTIOCTL available
+					    (0xACE) for WAN friendly SMB
+					    operations to be carried */
+#define SMB_QUERY_LABEL_INFO        0x3ea
+#define SMB_QUERY_FS_QUOTA_INFO     0x3ee
+#define SMB_QUERY_FS_FULL_SIZE_INFO 0x3ef
+#define SMB_QUERY_OBJECTID_INFO     0x3f0
+
+typedef struct smb_com_transaction2_qfsi_req {
+	struct smb_hdr hdr;	/* wct = 14+ */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__u8 SetupCount;
+	__u8 Reserved3;
+	__le16 SubCommand;	/* one setup word */
+	__le16 ByteCount;
+	__u8 Pad;
+	__le16 InformationLevel;
+} __attribute__((packed)) TRANSACTION2_QFSI_REQ;
+
+typedef struct smb_com_transaction_qfsi_rsp {
+	struct smb_hdr hdr;	/* wct = 10 + SetupCount */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+	__u8 Pad;	/* may be three bytes? *//* followed by data area */
+} __attribute__((packed)) TRANSACTION2_QFSI_RSP;
+
+typedef struct whoami_rsp_data { /* Query level 0x202 */
+	__u32 flags; /* 0 = Authenticated user 1 = GUEST */
+	__u32 mask; /* which flags bits server understands ie 0x0001 */
+	__u64 unix_user_id;
+	__u64 unix_user_gid;
+	__u32 number_of_supplementary_gids; /* may be zero */
+	__u32 number_of_sids; /* may be zero */
+	__u32 length_of_sid_array; /* in bytes - may be zero */
+	__u32 pad; /* reserved - MBZ */
+	/* __u64 gid_array[0]; */  /* may be empty */
+	/* __u8 * psid_list */  /* may be empty */
+} __attribute__((packed)) WHOAMI_RSP_DATA;
+
+/* SETFSInfo Levels */
+#define SMB_SET_CIFS_UNIX_INFO    0x200
+/* level 0x203 is defined above in list of QFS info levels */
+/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */
+
+/* Level 0x200 request structure follows */
+typedef struct smb_com_transaction2_setfsi_req {
+	struct smb_hdr hdr;	/* wct = 15 */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;	/* 4 */
+	__le16 ParameterOffset;
+	__le16 DataCount;	/* 12 */
+	__le16 DataOffset;
+	__u8 SetupCount;	/* one */
+	__u8 Reserved3;
+	__le16 SubCommand;	/* TRANS2_SET_FS_INFORMATION */
+	__le16 ByteCount;
+	__u8 Pad;
+	__u16 FileNum;		/* Parameters start. */
+	__le16 InformationLevel;/* Parameters end. */
+	__le16 ClientUnixMajor; /* Data start. */
+	__le16 ClientUnixMinor;
+	__le64 ClientUnixCap;   /* Data end */
+} __attribute__((packed)) TRANSACTION2_SETFSI_REQ;
+
+/* level 0x203 request structure follows */
+typedef struct smb_com_transaction2_setfs_enc_req {
+	struct smb_hdr hdr;	/* wct = 15 */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;	/* 4 */
+	__le16 ParameterOffset;
+	__le16 DataCount;	/* 12 */
+	__le16 DataOffset;
+	__u8 SetupCount;	/* one */
+	__u8 Reserved3;
+	__le16 SubCommand;	/* TRANS2_SET_FS_INFORMATION */
+	__le16 ByteCount;
+	__u8 Pad;
+	__u16  Reserved4;	/* Parameters start. */
+	__le16 InformationLevel;/* Parameters end. */
+	/* NTLMSSP Blob, Data start. */
+} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ;
+
+/* response for setfsinfo levels 0x200 and 0x203 */
+typedef struct smb_com_transaction2_setfsi_rsp {
+	struct smb_hdr hdr;	/* wct = 10 */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+} __attribute__((packed)) TRANSACTION2_SETFSI_RSP;
+
+typedef struct smb_com_transaction2_get_dfs_refer_req {
+	struct smb_hdr hdr;	/* wct = 15 */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;
+	__le16 ParameterOffset;
+	__le16 DataCount;
+	__le16 DataOffset;
+	__u8 SetupCount;
+	__u8 Reserved3;
+	__le16 SubCommand;	/* one setup word */
+	__le16 ByteCount;
+	__u8 Pad[3];		/* Win2K has sent 0x0F01 (max response length
+				   perhaps?) followed by one byte pad - doesn't
+				   seem to matter though */
+	__le16 MaxReferralLevel;
+	char RequestFileName[1];
+} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ;
+
+#define DFS_VERSION cpu_to_le16(0x0003)
+
+/* DFS server target type */
+#define DFS_TYPE_LINK 0x0000  /* also for sysvol targets */
+#define DFS_TYPE_ROOT 0x0001
+
+/* Referral Entry Flags */
+#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */
+#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */
+
+typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */
+	__le16 VersionNumber;  /* must be 3 or 4 */
+	__le16 Size;
+	__le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */
+	__le16 ReferralEntryFlags;
+	__le32 TimeToLive;
+	__le16 DfsPathOffset;
+	__le16 DfsAlternatePathOffset;
+	__le16 NetworkAddressOffset; /* offset of the link target */
+	__u8   ServiceSiteGuid[16];  /* MBZ, ignored */
+} __attribute__((packed)) REFERRAL3;
+
+typedef struct smb_com_transaction_get_dfs_refer_rsp {
+	struct smb_hdr hdr;	/* wct = 10 */
+	struct trans2_resp t2;
+	__u16 ByteCount;
+	__u8 Pad;
+	__le16 PathConsumed;
+	__le16 NumberOfReferrals;
+	__le32 DFSFlags;
+	REFERRAL3 referrals[1];	/* array of level 3 dfs_referral structures */
+	/* followed by the strings pointed to by the referral structures */
+} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP;
+
+/* DFS Flags */
+#define DFSREF_REFERRAL_SERVER  0x00000001 /* all targets are DFS roots */
+#define DFSREF_STORAGE_SERVER   0x00000002 /* no further ref requests needed */
+#define DFSREF_TARGET_FAILBACK  0x00000004 /* only for DFS referral version 4 */
+
+/*
+ ************************************************************************
+ * All structs for everything above the SMB PDUs themselves
+ * (such as the T2 level specific data) go here
+ ************************************************************************
+ */
+
+/*
+ * Information on a server
+ */
+
+struct serverInfo {
+	char name[16];
+	unsigned char versionMajor;
+	unsigned char versionMinor;
+	unsigned long type;
+	unsigned int commentOffset;
+} __attribute__((packed));
+
+/*
+ * The following structure is the format of the data returned on a NetShareEnum
+ * with level "90" (x5A)
+ */
+
+struct shareInfo {
+	char shareName[13];
+	char pad;
+	unsigned short type;
+	unsigned int commentOffset;
+} __attribute__((packed));
+
+struct aliasInfo {
+	char aliasName[9];
+	char pad;
+	unsigned int commentOffset;
+	unsigned char type[2];
+} __attribute__((packed));
+
+struct aliasInfo92 {
+	int aliasNameOffset;
+	int serverNameOffset;
+	int shareNameOffset;
+} __attribute__((packed));
+
+typedef struct {
+	__le64 TotalAllocationUnits;
+	__le64 FreeAllocationUnits;
+	__le32 SectorsPerAllocationUnit;
+	__le32 BytesPerSector;
+} __attribute__((packed)) FILE_SYSTEM_INFO;	/* size info, level 0x103 */
+
+typedef struct {
+	__le32 fsid;
+	__le32 SectorsPerAllocationUnit;
+	__le32 TotalAllocationUnits;
+	__le32 FreeAllocationUnits;
+	__le16  BytesPerSector;
+} __attribute__((packed)) FILE_SYSTEM_ALLOC_INFO;
+
+typedef struct {
+	__le16 MajorVersionNumber;
+	__le16 MinorVersionNumber;
+	__le64 Capability;
+} __attribute__((packed)) FILE_SYSTEM_UNIX_INFO; /* Unix extension level 0x200*/
+
+/* Version numbers for CIFS UNIX major and minor. */
+#define CIFS_UNIX_MAJOR_VERSION 1
+#define CIFS_UNIX_MINOR_VERSION 0
+
+/* Linux/Unix extensions capability flags */
+#define CIFS_UNIX_FCNTL_CAP             0x00000001 /* support for fcntl locks */
+#define CIFS_UNIX_POSIX_ACL_CAP         0x00000002 /* support getfacl/setfacl */
+#define CIFS_UNIX_XATTR_CAP             0x00000004 /* support new namespace   */
+#define CIFS_UNIX_EXTATTR_CAP           0x00000008 /* support chattr/chflag   */
+#define CIFS_UNIX_POSIX_PATHNAMES_CAP   0x00000010 /* Allow POSIX path chars  */
+#define CIFS_UNIX_POSIX_PATH_OPS_CAP    0x00000020 /* Allow new POSIX path based
+						      calls including posix open
+						      and posix unlink */
+#define CIFS_UNIX_LARGE_READ_CAP        0x00000040 /* support reads >128K (up
+						      to 0xFFFF00 */
+#define CIFS_UNIX_LARGE_WRITE_CAP       0x00000080
+#define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */
+#define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP  0x00000200 /* must do  */
+#define CIFS_UNIX_PROXY_CAP             0x00000400 /* Proxy cap: 0xACE ioctl and
+						      QFS PROXY call */
+#ifdef CONFIG_CIFS_POSIX
+/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
+   LockingX instead of posix locking call on unix sess (and we do not expect
+   LockingX to use different (ie Windows) semantics than posix locking on
+   the same session (if WINE needs to do this later, we can add this cap
+   back in later */
+/* #define CIFS_UNIX_CAP_MASK              0x000000fb */
+#define CIFS_UNIX_CAP_MASK              0x000003db
+#else
+#define CIFS_UNIX_CAP_MASK              0x00000013
+#endif /* CONFIG_CIFS_POSIX */
+
+
+#define CIFS_POSIX_EXTENSIONS           0x00000010 /* support for new QFSInfo */
+
+typedef struct {
+	/* For undefined recommended transfer size return -1 in that field */
+	__le32 OptimalTransferSize;  /* bsize on some os, iosize on other os */
+	__le32 BlockSize;
+    /* The next three fields are in terms of the block size.
+	(above). If block size is unknown, 4096 would be a
+	reasonable block size for a server to report.
+	Note that returning the blocks/blocksavail removes need
+	to make a second call (to QFSInfo level 0x103 to get this info.
+	UserBlockAvail is typically less than or equal to BlocksAvail,
+	if no distinction is made return the same value in each */
+	__le64 TotalBlocks;
+	__le64 BlocksAvail;       /* bfree */
+	__le64 UserBlocksAvail;   /* bavail */
+    /* For undefined Node fields or FSID return -1 */
+	__le64 TotalFileNodes;
+	__le64 FreeFileNodes;
+	__le64 FileSysIdentifier;   /* fsid */
+	/* NB Namelen comes from FILE_SYSTEM_ATTRIBUTE_INFO call */
+	/* NB flags can come from FILE_SYSTEM_DEVICE_INFO call   */
+} __attribute__((packed)) FILE_SYSTEM_POSIX_INFO;
+
+/* DeviceType Flags */
+#define FILE_DEVICE_CD_ROM              0x00000002
+#define FILE_DEVICE_CD_ROM_FILE_SYSTEM  0x00000003
+#define FILE_DEVICE_DFS                 0x00000006
+#define FILE_DEVICE_DISK                0x00000007
+#define FILE_DEVICE_DISK_FILE_SYSTEM    0x00000008
+#define FILE_DEVICE_FILE_SYSTEM         0x00000009
+#define FILE_DEVICE_NAMED_PIPE          0x00000011
+#define FILE_DEVICE_NETWORK             0x00000012
+#define FILE_DEVICE_NETWORK_FILE_SYSTEM 0x00000014
+#define FILE_DEVICE_NULL                0x00000015
+#define FILE_DEVICE_PARALLEL_PORT       0x00000016
+#define FILE_DEVICE_PRINTER             0x00000018
+#define FILE_DEVICE_SERIAL_PORT         0x0000001b
+#define FILE_DEVICE_STREAMS             0x0000001e
+#define FILE_DEVICE_TAPE                0x0000001f
+#define FILE_DEVICE_TAPE_FILE_SYSTEM    0x00000020
+#define FILE_DEVICE_VIRTUAL_DISK        0x00000024
+#define FILE_DEVICE_NETWORK_REDIRECTOR  0x00000028
+
+typedef struct {
+	__le32 DeviceType;
+	__le32 DeviceCharacteristics;
+} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */
+
+/* minimum includes first three fields, and empty FS Name */
+#define MIN_FS_ATTR_INFO_SIZE 12
+
+
+/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
+#define FILE_SUPPORT_INTEGRITY_STREAMS	0x04000000
+#define FILE_SUPPORTS_USN_JOURNAL	0x02000000
+#define FILE_SUPPORTS_OPEN_BY_FILE_ID	0x01000000
+#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000
+#define FILE_SUPPORTS_HARD_LINKS	0x00400000
+#define FILE_SUPPORTS_TRANSACTIONS	0x00200000
+#define FILE_SEQUENTIAL_WRITE_ONCE	0x00100000
+#define FILE_READ_ONLY_VOLUME		0x00080000
+#define FILE_NAMED_STREAMS		0x00040000
+#define FILE_SUPPORTS_ENCRYPTION	0x00020000
+#define FILE_SUPPORTS_OBJECT_IDS	0x00010000
+#define FILE_VOLUME_IS_COMPRESSED	0x00008000
+#define FILE_SUPPORTS_REMOTE_STORAGE	0x00000100
+#define FILE_SUPPORTS_REPARSE_POINTS	0x00000080
+#define FILE_SUPPORTS_SPARSE_FILES	0x00000040
+#define FILE_VOLUME_QUOTAS		0x00000020
+#define FILE_FILE_COMPRESSION		0x00000010
+#define FILE_PERSISTENT_ACLS		0x00000008
+#define FILE_UNICODE_ON_DISK		0x00000004
+#define FILE_CASE_PRESERVED_NAMES	0x00000002
+#define FILE_CASE_SENSITIVE_SEARCH	0x00000001
+typedef struct {
+	__le32 Attributes;
+	__le32 MaxPathNameComponentLength;
+	__le32 FileSystemNameLen;
+	char FileSystemName[52]; /* do not have to save this - get subset? */
+} __attribute__((packed)) FILE_SYSTEM_ATTRIBUTE_INFO;
+
+/******************************************************************************/
+/* QueryFileInfo/QueryPathinfo (also for SetPath/SetFile) data buffer formats */
+/******************************************************************************/
+typedef struct { /* data block encoding of response to level 263 QPathInfo */
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le32 Attributes;
+	__u32 Pad1;
+	__le64 AllocationSize;
+	__le64 EndOfFile;	/* size ie offset to first free byte in file */
+	__le32 NumberOfLinks;	/* hard links */
+	__u8 DeletePending;
+	__u8 Directory;
+	__u16 Pad2;
+	__le64 IndexNumber;
+	__le32 EASize;
+	__le32 AccessFlags;
+	__u64 IndexNumber1;
+	__le64 CurrentByteOffset;
+	__le32 Mode;
+	__le32 AlignmentRequirement;
+	__le32 FileNameLength;
+	char FileName[1];
+} __attribute__((packed)) FILE_ALL_INFO;	/* level 0x107 QPathInfo */
+
+/* defines for enumerating possible values of the Unix type field below */
+#define UNIX_FILE      0
+#define UNIX_DIR       1
+#define UNIX_SYMLINK   2
+#define UNIX_CHARDEV   3
+#define UNIX_BLOCKDEV  4
+#define UNIX_FIFO      5
+#define UNIX_SOCKET    6
+typedef struct {
+	__le64 EndOfFile;
+	__le64 NumOfBytes;
+	__le64 LastStatusChange; /*SNIA specs DCE time for the 3 time fields */
+	__le64 LastAccessTime;
+	__le64 LastModificationTime;
+	__le64 Uid;
+	__le64 Gid;
+	__le32 Type;
+	__le64 DevMajor;
+	__le64 DevMinor;
+	__le64 UniqueId;
+	__le64 Permissions;
+	__le64 Nlinks;
+} __attribute__((packed)) FILE_UNIX_BASIC_INFO;	/* level 0x200 QPathInfo */
+
+typedef struct {
+	char LinkDest[1];
+} __attribute__((packed)) FILE_UNIX_LINK_INFO;	/* level 0x201 QPathInfo */
+
+/* The following three structures are needed only for
+	setting time to NT4 and some older servers via
+	the primitive DOS time format */
+typedef struct {
+	__u16 Day:5;
+	__u16 Month:4;
+	__u16 Year:7;
+} __attribute__((packed)) SMB_DATE;
+
+typedef struct {
+	__u16 TwoSeconds:5;
+	__u16 Minutes:6;
+	__u16 Hours:5;
+} __attribute__((packed)) SMB_TIME;
+
+typedef struct {
+	__le16 CreationDate; /* SMB Date see above */
+	__le16 CreationTime; /* SMB Time */
+	__le16 LastAccessDate;
+	__le16 LastAccessTime;
+	__le16 LastWriteDate;
+	__le16 LastWriteTime;
+	__le32 DataSize; /* File Size (EOF) */
+	__le32 AllocationSize;
+	__le16 Attributes; /* verify not u32 */
+	__le32 EASize;
+} __attribute__((packed)) FILE_INFO_STANDARD;  /* level 1 SetPath/FileInfo */
+
+typedef struct {
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le32 Attributes;
+	__u32 Pad;
+} __attribute__((packed)) FILE_BASIC_INFO;	/* size info, level 0x101 */
+
+struct file_allocation_info {
+	__le64 AllocationSize; /* Note old Samba srvr rounds this up too much */
+} __attribute__((packed));	/* size used on disk, for level 0x103 for set,
+				   0x105 for query */
+
+struct file_end_of_file_info {
+	__le64 FileSize;		/* offset to end of file */
+} __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */
+
+struct file_alt_name_info {
+	__u8   alt_name[1];
+} __attribute__((packed));      /* level 0x0108 */
+
+struct file_stream_info {
+	__le32 number_of_streams;  /* BB check sizes and verify location */
+	/* followed by info on streams themselves
+		u64 size;
+		u64 allocation_size
+		stream info */
+};      /* level 0x109 */
+
+struct file_compression_info {
+	__le64 compressed_size;
+	__le16 format;
+	__u8   unit_shift;
+	__u8   ch_shift;
+	__u8   cl_shift;
+	__u8   pad[3];
+} __attribute__((packed));      /* level 0x10b */
+
+/* POSIX ACL set/query path info structures */
+#define CIFS_ACL_VERSION 1
+struct cifs_posix_ace { /* access control entry (ACE) */
+	__u8  cifs_e_tag;
+	__u8  cifs_e_perm;
+	__le64 cifs_uid; /* or gid */
+} __attribute__((packed));
+
+struct cifs_posix_acl { /* access conrol list  (ACL) */
+	__le16	version;
+	__le16	access_entry_count;  /* access ACL - count of entries */
+	__le16	default_entry_count; /* default ACL - count of entries */
+	struct cifs_posix_ace ace_array[0];
+	/* followed by
+	struct cifs_posix_ace default_ace_arraay[] */
+} __attribute__((packed));  /* level 0x204 */
+
+/* types of access control entries already defined in posix_acl.h */
+/* #define CIFS_POSIX_ACL_USER_OBJ	 0x01
+#define CIFS_POSIX_ACL_USER      0x02
+#define CIFS_POSIX_ACL_GROUP_OBJ 0x04
+#define CIFS_POSIX_ACL_GROUP     0x08
+#define CIFS_POSIX_ACL_MASK      0x10
+#define CIFS_POSIX_ACL_OTHER     0x20 */
+
+/* types of perms */
+/* #define CIFS_POSIX_ACL_EXECUTE   0x01
+#define CIFS_POSIX_ACL_WRITE     0x02
+#define CIFS_POSIX_ACL_READ	     0x04 */
+
+/* end of POSIX ACL definitions */
+
+/* POSIX Open Flags */
+#define SMB_O_RDONLY 	 0x1
+#define SMB_O_WRONLY 	0x2
+#define SMB_O_RDWR 	0x4
+#define SMB_O_CREAT 	0x10
+#define SMB_O_EXCL 	0x20
+#define SMB_O_TRUNC 	0x40
+#define SMB_O_APPEND 	0x80
+#define SMB_O_SYNC 	0x100
+#define SMB_O_DIRECTORY 0x200
+#define SMB_O_NOFOLLOW 	0x400
+#define SMB_O_DIRECT 	0x800
+
+typedef struct {
+	__le32 OpenFlags; /* same as NT CreateX */
+	__le32 PosixOpenFlags;
+	__le64 Permissions;
+	__le16 Level; /* reply level requested (see QPathInfo levels) */
+} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */
+
+typedef struct {
+	__le16 OplockFlags;
+	__u16 Fid;
+	__le32 CreateAction;
+	__le16 ReturnedLevel;
+	__le16 Pad;
+	/* struct following varies based on requested level */
+} __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */
+
+#define SMB_POSIX_UNLINK_FILE_TARGET		0
+#define SMB_POSIX_UNLINK_DIRECTORY_TARGET	1
+
+struct unlink_psx_rq { /* level 0x20a SetPathInfo */
+	__le16 type;
+} __attribute__((packed));
+
+struct file_internal_info {
+	__le64  UniqueId; /* inode number */
+} __attribute__((packed));      /* level 0x3ee */
+
+struct file_mode_info {
+	__le32	Mode;
+} __attribute__((packed));      /* level 0x3f8 */
+
+struct file_attrib_tag {
+	__le32 Attribute;
+	__le32 ReparseTag;
+} __attribute__((packed));      /* level 0x40b */
+
+
+/********************************************************/
+/*  FindFirst/FindNext transact2 data buffer formats    */
+/********************************************************/
+
+typedef struct {
+	__le32 NextEntryOffset;
+	__u32 ResumeKey; /* as with FileIndex - no need to convert */
+	FILE_UNIX_BASIC_INFO basic;
+	char FileName[1];
+} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */
+
+typedef struct {
+	__le32 NextEntryOffset;
+	__u32 FileIndex;
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le64 EndOfFile;
+	__le64 AllocationSize;
+	__le32 ExtFileAttributes;
+	__le32 FileNameLength;
+	char FileName[1];
+} __attribute__((packed)) FILE_DIRECTORY_INFO;   /* level 0x101 FF resp data */
+
+typedef struct {
+	__le32 NextEntryOffset;
+	__u32 FileIndex;
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le64 EndOfFile;
+	__le64 AllocationSize;
+	__le32 ExtFileAttributes;
+	__le32 FileNameLength;
+	__le32 EaSize; /* length of the xattrs */
+	char FileName[1];
+} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */
+
+typedef struct {
+	__le32 NextEntryOffset;
+	__u32 FileIndex;
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le64 EndOfFile;
+	__le64 AllocationSize;
+	__le32 ExtFileAttributes;
+	__le32 FileNameLength;
+	__le32 EaSize; /* EA size */
+	__le32 Reserved;
+	__le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
+	char FileName[1];
+} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */
+
+typedef struct {
+	__le32 NextEntryOffset;
+	__u32 FileIndex;
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le64 EndOfFile;
+	__le64 AllocationSize;
+	__le32 ExtFileAttributes;
+	__le32 FileNameLength;
+	__le32 EaSize; /* length of the xattrs */
+	__u8   ShortNameLength;
+	__u8   Reserved;
+	__u8   ShortName[12];
+	char FileName[1];
+} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */
+
+typedef struct {
+	__u32  ResumeKey;
+	__le16 CreationDate; /* SMB Date */
+	__le16 CreationTime; /* SMB Time */
+	__le16 LastAccessDate;
+	__le16 LastAccessTime;
+	__le16 LastWriteDate;
+	__le16 LastWriteTime;
+	__le32 DataSize; /* File Size (EOF) */
+	__le32 AllocationSize;
+	__le16 Attributes; /* verify not u32 */
+	__u8   FileNameLength;
+	char FileName[1];
+} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */
+
+
+struct win_dev {
+	unsigned char type[8]; /* IntxCHR or IntxBLK */
+	__le64 major;
+	__le64 minor;
+} __attribute__((packed));
+
+struct gea {
+	unsigned char name_len;
+	char name[1];
+} __attribute__((packed));
+
+struct gealist {
+	unsigned long list_len;
+	struct gea list[1];
+} __attribute__((packed));
+
+struct fea {
+	unsigned char EA_flags;
+	__u8 name_len;
+	__le16 value_len;
+	char name[1];
+	/* optionally followed by value */
+} __attribute__((packed));
+/* flags for _FEA.fEA */
+#define FEA_NEEDEA         0x80	/* need EA bit */
+
+struct fealist {
+	__le32 list_len;
+	struct fea list[1];
+} __attribute__((packed));
+
+/* used to hold an arbitrary blob of data */
+struct data_blob {
+	__u8 *data;
+	size_t length;
+	void (*free) (struct data_blob *data_blob);
+} __attribute__((packed));
+
+
+#ifdef CONFIG_CIFS_POSIX
+/*
+	For better POSIX semantics from Linux client, (even better
+	than the existing CIFS Unix Extensions) we need updated PDUs for:
+
+	1) PosixCreateX - to set and return the mode, inode#, device info and
+	perhaps add a CreateDevice - to create Pipes and other special .inodes
+	Also note POSIX open flags
+	2) Close - to return the last write time to do cache across close
+		more safely
+	3) FindFirst return unique inode number - what about resume key, two
+	forms short (matches readdir) and full (enough info to cache inodes)
+	4) Mkdir - set mode
+
+	And under consideration:
+	5) FindClose2 (return nanosecond timestamp ??)
+	6) Use nanosecond timestamps throughout all time fields if
+	   corresponding attribute flag is set
+	7) sendfile - handle based copy
+
+	what about fixing 64 bit alignment
+
+	There are also various legacy SMB/CIFS requests used as is
+
+	From existing Lanman and NTLM dialects:
+	--------------------------------------
+	NEGOTIATE
+	SESSION_SETUP_ANDX (BB which?)
+	TREE_CONNECT_ANDX (BB which wct?)
+	TREE_DISCONNECT (BB add volume timestamp on response)
+	LOGOFF_ANDX
+	DELETE (note delete open file behavior)
+	DELETE_DIRECTORY
+	READ_AND_X
+	WRITE_AND_X
+	LOCKING_AND_X (note posix lock semantics)
+	RENAME (note rename across dirs and open file rename posix behaviors)
+	NT_RENAME (for hardlinks) Is this good enough for all features?
+	FIND_CLOSE2
+	TRANSACTION2 (18 cases)
+		SMB_SET_FILE_END_OF_FILE_INFO2 SMB_SET_PATH_END_OF_FILE_INFO2
+		(BB verify that never need to set allocation size)
+		SMB_SET_FILE_BASIC_INFO2 (setting times - BB can it be done via
+			 Unix ext?)
+
+	COPY (note support for copy across directories) - FUTURE, OPTIONAL
+	setting/getting OS/2 EAs - FUTURE (BB can this handle
+	setting Linux xattrs perfectly)         - OPTIONAL
+	dnotify                                 - FUTURE, OPTIONAL
+	quota                                   - FUTURE, OPTIONAL
+
+	Note that various requests implemented for NT interop such as
+		NT_TRANSACT (IOCTL) QueryReparseInfo
+	are unneeded to servers compliant with the CIFS POSIX extensions
+
+	From CIFS Unix Extensions:
+	-------------------------
+	T2 SET_PATH_INFO (SMB_SET_FILE_UNIX_LINK) for symlinks
+	T2 SET_PATH_INFO (SMB_SET_FILE_BASIC_INFO2)
+	T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_LINK)
+	T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_BASIC)	BB check for missing
+							inode fields
+				Actually a need QUERY_FILE_UNIX_INFO
+				since has inode num
+				BB what about a) blksize/blkbits/blocks
+							  b) i_version
+							  c) i_rdev
+							  d) notify mask?
+							  e) generation
+							  f) size_seqcount
+	T2 FIND_FIRST/FIND_NEXT FIND_FILE_UNIX
+	TRANS2_GET_DFS_REFERRAL		      - OPTIONAL but recommended
+	T2_QFS_INFO QueryDevice/AttributeInfo - OPTIONAL
+ */
+
+/* xsymlink is a symlink format (used by MacOS) that can be used
+   to save symlink info in a regular file when
+   mounted to operating systems that do not
+   support the cifs Unix extensions or EAs (for xattr
+   based symlinks).  For such a file to be recognized
+   as containing symlink data:
+
+   1) file size must be 1067,
+   2) signature must begin file data,
+   3) length field must be set to ASCII representation
+	of a number which is less than or equal to 1024,
+   4) md5 must match that of the path data */
+
+struct xsymlink {
+	/* 1067 bytes */
+	char signature[4]; /* XSym */ /* not null terminated */
+	char cr0;         /* \n */
+/* ASCII representation of length (4 bytes decimal) terminated by \n not null */
+	char length[4];
+	char cr1;         /* \n */
+/* md5 of valid subset of path ie path[0] through path[length-1] */
+	__u8 md5[32];
+	char cr2;        /* \n */
+/* if room left, then end with \n then 0x20s by convention but not required */
+	char path[1024];
+} __attribute__((packed));
+
+typedef struct file_xattr_info {
+	/* BB do we need another field for flags? BB */
+	__u32 xattr_name_len;
+	__u32 xattr_value_len;
+	char  xattr_name[0];
+	/* followed by xattr_value[xattr_value_len], no pad */
+} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info
+					      level 0x205 */
+
+/* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */
+
+typedef struct file_chattr_info {
+	__le64	mask; /* list of all possible attribute bits */
+	__le64	mode; /* list of actual attribute bits on this inode */
+} __attribute__((packed)) FILE_CHATTR_INFO;  /* ext attributes
+						(chattr, chflags) level 0x206 */
+#endif 				/* POSIX */
+#endif				/* _CIFSPDU_H */
diff --git a/kernel/fs/cifs/cifsproto.h b/kernel/fs/cifs/cifsproto.h
new file mode 100644
index 000000000..c63fd1dde
--- /dev/null
+++ b/kernel/fs/cifs/cifsproto.h
@@ -0,0 +1,514 @@
+/*
+ *   fs/cifs/cifsproto.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _CIFSPROTO_H
+#define _CIFSPROTO_H
+#include <linux/nls.h>
+
+struct statfs;
+struct smb_vol;
+struct smb_rqst;
+
+/*
+ *****************************************************************
+ * All Prototypes
+ *****************************************************************
+ */
+
+extern struct smb_hdr *cifs_buf_get(void);
+extern void cifs_buf_release(void *);
+extern struct smb_hdr *cifs_small_buf_get(void);
+extern void cifs_small_buf_release(void *);
+extern void free_rsp_buf(int, void *);
+extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
+					struct kvec *iov);
+extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
+			unsigned int /* length */);
+extern unsigned int _get_xid(void);
+extern void _free_xid(unsigned int);
+#define get_xid()						\
+({								\
+	unsigned int __xid = _get_xid();				\
+	cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n",	\
+		 __func__, __xid,					\
+		 from_kuid(&init_user_ns, current_fsuid()));		\
+	__xid;							\
+})
+
+#define free_xid(curr_xid)					\
+do {								\
+	_free_xid(curr_xid);					\
+	cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n",	\
+		 __func__, curr_xid, (int)rc);				\
+} while (0)
+extern int init_cifs_idmap(void);
+extern void exit_cifs_idmap(void);
+extern char *build_path_from_dentry(struct dentry *);
+extern char *cifs_build_path_to_root(struct smb_vol *vol,
+				     struct cifs_sb_info *cifs_sb,
+				     struct cifs_tcon *tcon);
+extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
+extern char *cifs_compose_mount_options(const char *sb_mountdata,
+		const char *fullpath, const struct dfs_info3_param *ref,
+		char **devname);
+/* extern void renew_parental_timestamps(struct dentry *direntry);*/
+extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
+					struct TCP_Server_Info *server);
+extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
+extern void cifs_delete_mid(struct mid_q_entry *mid);
+extern void cifs_wake_up_task(struct mid_q_entry *mid);
+extern int cifs_call_async(struct TCP_Server_Info *server,
+			struct smb_rqst *rqst,
+			mid_receive_t *receive, mid_callback_t *callback,
+			void *cbdata, const int flags);
+extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
+			struct smb_hdr * /* input */ ,
+			struct smb_hdr * /* out */ ,
+			int * /* bytes returned */ , const int);
+extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
+			    char *in_buf, int flags);
+extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *,
+				struct smb_rqst *);
+extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
+						struct smb_rqst *);
+extern int cifs_check_receive(struct mid_q_entry *mid,
+			struct TCP_Server_Info *server, bool log_error);
+extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
+				 unsigned int size, unsigned int *num,
+				 unsigned int *credits);
+extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
+			struct kvec *, int /* nvec to send */,
+			int * /* type of buf returned */ , const int flags);
+extern int SendReceiveBlockingLock(const unsigned int xid,
+			struct cifs_tcon *ptcon,
+			struct smb_hdr *in_buf ,
+			struct smb_hdr *out_buf,
+			int *bytes_returned);
+extern int cifs_reconnect(struct TCP_Server_Info *server);
+extern int checkSMB(char *buf, unsigned int length);
+extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
+extern bool backup_cred(struct cifs_sb_info *);
+extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
+extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
+			    unsigned int bytes_written);
+extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
+extern unsigned int smbCalcSize(void *buf);
+extern int decode_negTokenInit(unsigned char *security_blob, int length,
+			struct TCP_Server_Info *server);
+extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
+extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port);
+extern int map_smb_to_linux_error(char *buf, bool logErr);
+extern void header_assemble(struct smb_hdr *, char /* command */ ,
+			    const struct cifs_tcon *, int /* length of
+			    fixed section (word count) in two byte units */);
+extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
+				struct cifs_ses *ses,
+				void **request_buf);
+extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
+				enum securityEnum requested);
+extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
+			  const struct nls_table *nls_cp);
+extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
+extern u64 cifs_UnixTimeToNT(struct timespec);
+extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
+				      int offset);
+extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
+extern int cifs_get_writer(struct cifsInodeInfo *cinode);
+extern void cifs_put_writer(struct cifsInodeInfo *cinode);
+extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode);
+extern int cifs_unlock_range(struct cifsFileInfo *cfile,
+			     struct file_lock *flock, const unsigned int xid);
+extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
+
+extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
+					      struct file *file,
+					      struct tcon_link *tlink,
+					      __u32 oplock);
+extern int cifs_posix_open(char *full_path, struct inode **inode,
+			   struct super_block *sb, int mode,
+			   unsigned int f_flags, __u32 *oplock, __u16 *netfid,
+			   unsigned int xid);
+void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
+extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
+				     FILE_UNIX_BASIC_INFO *info,
+				     struct cifs_sb_info *cifs_sb);
+extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *,
+					struct cifs_sb_info *);
+extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
+extern struct inode *cifs_iget(struct super_block *sb,
+			       struct cifs_fattr *fattr);
+
+extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
+			       FILE_ALL_INFO *data, struct super_block *sb,
+			       int xid, const struct cifs_fid *fid);
+extern int cifs_get_inode_info_unix(struct inode **pinode,
+			const unsigned char *search_path,
+			struct super_block *sb, unsigned int xid);
+extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs,
+			      unsigned int xid, char *full_path, __u32 dosattr);
+extern int cifs_rename_pending_delete(const char *full_path,
+				      struct dentry *dentry,
+				      const unsigned int xid);
+extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
+			      struct cifs_fattr *fattr, struct inode *inode,
+			      const char *path, const struct cifs_fid *pfid);
+extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
+					kuid_t, kgid_t);
+extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
+					const char *, u32 *);
+extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
+						const struct cifs_fid *, u32 *);
+extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
+				const char *, int);
+
+extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
+extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+		     unsigned int to_read);
+extern int cifs_readv_from_socket(struct TCP_Server_Info *server,
+		struct kvec *iov_orig, unsigned int nr_segs,
+		unsigned int to_read);
+extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+			       struct cifs_sb_info *cifs_sb);
+extern int cifs_match_super(struct super_block *, void *);
+extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
+extern struct smb_vol *cifs_get_volume_info(char *mount_data,
+					    const char *devname);
+extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
+extern void cifs_umount(struct cifs_sb_info *);
+extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
+extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
+				    __u64 length, __u8 type,
+				    struct cifsLockInfo **conf_lock,
+				    int rw_check);
+extern void cifs_add_pending_open(struct cifs_fid *fid,
+				  struct tcon_link *tlink,
+				  struct cifs_pending_open *open);
+extern void cifs_add_pending_open_locked(struct cifs_fid *fid,
+					 struct tcon_link *tlink,
+					 struct cifs_pending_open *open);
+extern void cifs_del_pending_open(struct cifs_pending_open *open);
+
+#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)
+extern void cifs_dfs_release_automount_timer(void);
+#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+#define cifs_dfs_release_automount_timer()	do { } while (0)
+#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+
+void cifs_proc_init(void);
+void cifs_proc_clean(void);
+
+extern void cifs_move_llist(struct list_head *source, struct list_head *dest);
+extern void cifs_free_llist(struct list_head *llist);
+extern void cifs_del_lock_waiters(struct cifsLockInfo *lock);
+
+extern int cifs_negotiate_protocol(const unsigned int xid,
+				   struct cifs_ses *ses);
+extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+			      struct nls_table *nls_info);
+extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
+extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
+
+extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
+		    const char *tree, struct cifs_tcon *tcon,
+		    const struct nls_table *);
+
+extern int CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
+		const char *searchName, struct cifs_sb_info *cifs_sb,
+		__u16 *searchHandle, __u16 search_flags,
+		struct cifs_search_info *psrch_inf,
+		bool msearch);
+
+extern int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
+		__u16 searchHandle, __u16 search_flags,
+		struct cifs_search_info *psrch_inf);
+
+extern int CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
+			const __u16 search_handle);
+
+extern int CIFSSMBQFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			u16 netfid, FILE_ALL_INFO *pFindData);
+extern int CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			    const char *search_Name, FILE_ALL_INFO *data,
+			    int legacy /* whether to use old info level */,
+			    const struct nls_table *nls_codepage, int remap);
+extern int SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon,
+			       const char *search_name, FILE_ALL_INFO *data,
+			       const struct nls_table *nls_codepage, int remap);
+
+extern int CIFSSMBUnixQFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
+extern int CIFSSMBUnixQPathInfo(const unsigned int xid,
+			struct cifs_tcon *tcon,
+			const unsigned char *searchName,
+			FILE_UNIX_BASIC_INFO *pFindData,
+			const struct nls_table *nls_codepage, int remap);
+
+extern int CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
+			   const char *search_name,
+			   struct dfs_info3_param **target_nodes,
+			   unsigned int *num_of_nodes,
+			   const struct nls_table *nls_codepage, int remap);
+
+extern int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
+			const char *old_path,
+			const struct nls_table *nls_codepage,
+			unsigned int *num_referrals,
+			struct dfs_info3_param **referrals, int remap);
+extern void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
+				 struct cifs_sb_info *cifs_sb,
+				 struct smb_vol *vol);
+extern int CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			struct kstatfs *FSData);
+extern int SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			struct kstatfs *FSData);
+extern int CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			__u64 cap);
+
+extern int CIFSSMBQFSAttributeInfo(const unsigned int xid,
+			struct cifs_tcon *tcon);
+extern int CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon);
+extern int CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon);
+extern int CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			struct kstatfs *FSData);
+
+extern int CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			const char *fileName, const FILE_BASIC_INFO *data,
+			const struct nls_table *nls_codepage,
+			int remap_special_chars);
+extern int CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
+			const FILE_BASIC_INFO *data, __u16 fid,
+			__u32 pid_of_opener);
+extern int CIFSSMBSetFileDisposition(const unsigned int xid,
+				     struct cifs_tcon *tcon,
+				     bool delete_file, __u16 fid,
+				     __u32 pid_of_opener);
+#if 0
+extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon,
+			char *fileName, __u16 dos_attributes,
+			const struct nls_table *nls_codepage);
+#endif /* possibly unneeded function */
+extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
+			 const char *file_name, __u64 size,
+			 struct cifs_sb_info *cifs_sb, bool set_allocation);
+extern int CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
+			      struct cifsFileInfo *cfile, __u64 size,
+			      bool set_allocation);
+
+struct cifs_unix_set_info_args {
+	__u64	ctime;
+	__u64	atime;
+	__u64	mtime;
+	__u64	mode;
+	kuid_t	uid;
+	kgid_t	gid;
+	dev_t	device;
+};
+
+extern int CIFSSMBUnixSetFileInfo(const unsigned int xid,
+				  struct cifs_tcon *tcon,
+				  const struct cifs_unix_set_info_args *args,
+				  u16 fid, u32 pid_of_opener);
+
+extern int CIFSSMBUnixSetPathInfo(const unsigned int xid,
+				  struct cifs_tcon *tcon, const char *file_name,
+				  const struct cifs_unix_set_info_args *args,
+				  const struct nls_table *nls_codepage,
+				  int remap);
+
+extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon,
+			const char *name, struct cifs_sb_info *cifs_sb);
+extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon,
+			const char *name, struct cifs_sb_info *cifs_sb);
+extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
+			const char *name, __u16 type,
+			const struct nls_table *nls_codepage,
+			int remap_special_chars);
+extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon,
+			  const char *name, struct cifs_sb_info *cifs_sb);
+extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
+			 const char *from_name, const char *to_name,
+			 struct cifs_sb_info *cifs_sb);
+extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon,
+				 int netfid, const char *target_name,
+				 const struct nls_table *nls_codepage,
+				 int remap_special_chars);
+extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
+			      const char *from_name, const char *to_name,
+			      struct cifs_sb_info *cifs_sb);
+extern int CIFSUnixCreateHardLink(const unsigned int xid,
+			struct cifs_tcon *tcon,
+			const char *fromName, const char *toName,
+			const struct nls_table *nls_codepage,
+			int remap_special_chars);
+extern int CIFSUnixCreateSymLink(const unsigned int xid,
+			struct cifs_tcon *tcon,
+			const char *fromName, const char *toName,
+			const struct nls_table *nls_codepage, int remap);
+extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
+			struct cifs_tcon *tcon,
+			const unsigned char *searchName, char **syminfo,
+			const struct nls_table *nls_codepage, int remap);
+extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+			       __u16 fid, char **symlinkinfo,
+			       const struct nls_table *nls_codepage);
+extern int CIFSSMB_set_compression(const unsigned int xid,
+				   struct cifs_tcon *tcon, __u16 fid);
+extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms,
+		     int *oplock, FILE_ALL_INFO *buf);
+extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon,
+			const char *fileName, const int disposition,
+			const int access_flags, const int omode,
+			__u16 *netfid, int *pOplock, FILE_ALL_INFO *,
+			const struct nls_table *nls_codepage, int remap);
+extern int CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon,
+			u32 posix_flags, __u64 mode, __u16 *netfid,
+			FILE_UNIX_BASIC_INFO *pRetData,
+			__u32 *pOplock, const char *name,
+			const struct nls_table *nls_codepage, int remap);
+extern int CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon,
+			const int smb_file_id);
+
+extern int CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon,
+			const int smb_file_id);
+
+extern int CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
+			unsigned int *nbytes, char **buf,
+			int *return_buf_type);
+extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
+			unsigned int *nbytes, const char *buf,
+			const char __user *ubuf, const int long_op);
+extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
+			unsigned int *nbytes, struct kvec *iov, const int nvec);
+extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
+				 const char *search_name, __u64 *inode_number,
+				 const struct nls_table *nls_codepage,
+				 int remap);
+
+extern int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
+		      const __u16 netfid, const __u8 lock_type,
+		      const __u32 num_unlock, const __u32 num_lock,
+		      LOCKING_ANDX_RANGE *buf);
+extern int CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
+			const __u16 netfid, const __u32 netpid, const __u64 len,
+			const __u64 offset, const __u32 numUnlock,
+			const __u32 numLock, const __u8 lockType,
+			const bool waitFlag, const __u8 oplock_level);
+extern int CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
+			const __u16 smb_file_id, const __u32 netpid,
+			const loff_t start_offset, const __u64 len,
+			struct file_lock *, const __u16 lock_type,
+			const bool waitFlag);
+extern int CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon);
+extern int CIFSSMBEcho(struct TCP_Server_Info *server);
+extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses);
+
+extern struct cifs_ses *sesInfoAlloc(void);
+extern void sesInfoFree(struct cifs_ses *);
+extern struct cifs_tcon *tconInfoAlloc(void);
+extern void tconInfoFree(struct cifs_tcon *);
+
+extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+		   __u32 *pexpected_response_sequence_number);
+extern int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
+			  __u32 *);
+extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
+extern int cifs_verify_signature(struct smb_rqst *rqst,
+				 struct TCP_Server_Info *server,
+				__u32 expected_sequence_number);
+extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
+			const struct nls_table *);
+extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
+extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
+extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern int calc_seckey(struct cifs_ses *);
+extern int generate_smb3signingkey(struct cifs_ses *);
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+extern int calc_lanman_hash(const char *password, const char *cryptkey,
+				bool encrypt, char *lnm_session_key);
+#endif /* CIFS_WEAK_PW_HASH */
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
+extern int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
+			const int notify_subdirs, const __u16 netfid,
+			__u32 filter, struct file *file, int multishot,
+			const struct nls_table *nls_codepage);
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
+extern int CIFSSMBCopy(unsigned int xid,
+			struct cifs_tcon *source_tcon,
+			const char *fromName,
+			const __u16 target_tid,
+			const char *toName, const int flags,
+			const struct nls_table *nls_codepage,
+			int remap_special_chars);
+extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
+			const unsigned char *searchName,
+			const unsigned char *ea_name, char *EAData,
+			size_t bufsize, const struct nls_table *nls_codepage,
+			int remap_special_chars);
+extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
+		const char *fileName, const char *ea_name,
+		const void *ea_value, const __u16 ea_value_len,
+		const struct nls_table *nls_codepage, int remap_special_chars);
+extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
+			__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
+extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
+			struct cifs_ntsd *, __u32, int);
+extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
+		const unsigned char *searchName,
+		char *acl_inf, const int buflen, const int acl_type,
+		const struct nls_table *nls_codepage, int remap_special_chars);
+extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
+		const unsigned char *fileName,
+		const char *local_acl, const int buflen, const int acl_type,
+		const struct nls_table *nls_codepage, int remap_special_chars);
+extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
+			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
+extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
+extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr);
+extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			      struct cifs_sb_info *cifs_sb,
+			      struct cifs_fattr *fattr,
+			      const unsigned char *path);
+extern int mdfour(unsigned char *, unsigned char *, int);
+extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
+			const struct nls_table *codepage);
+extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
+			unsigned char *p24);
+
+void cifs_readdata_release(struct kref *refcount);
+int cifs_async_readv(struct cifs_readdata *rdata);
+int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
+
+int cifs_async_writev(struct cifs_writedata *wdata,
+		      void (*release)(struct kref *kref));
+void cifs_writev_complete(struct work_struct *work);
+struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
+						work_func_t complete);
+void cifs_writedata_release(struct kref *refcount);
+int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			  struct cifs_sb_info *cifs_sb,
+			  const unsigned char *path, char *pbuf,
+			  unsigned int *pbytes_read);
+int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			   struct cifs_sb_info *cifs_sb,
+			   const unsigned char *path, char *pbuf,
+			   unsigned int *pbytes_written);
+#endif			/* _CIFSPROTO_H */
diff --git a/kernel/fs/cifs/cifssmb.c b/kernel/fs/cifs/cifssmb.c
new file mode 100644
index 000000000..f26ffbfc6
--- /dev/null
+++ b/kernel/fs/cifs/cifssmb.c
@@ -0,0 +1,6526 @@
+/*
+ *   fs/cifs/cifssmb.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   Contains the routines for constructing the SMB PDUs themselves
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+ /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c   */
+ /* These are mostly routines that operate on a pathname, or on a tree id     */
+ /* (mounted volume), but there are eight handle based routines which must be */
+ /* treated slightly differently for reconnection purposes since we never     */
+ /* want to reuse a stale file handle and only the caller knows the file info */
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/task_io_accounting_ops.h>
+#include <asm/uaccess.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsacl.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "fscache.h"
+
+#ifdef CONFIG_CIFS_POSIX
+static struct {
+	int index;
+	char *name;
+} protocols[] = {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+	{LANMAN_PROT, "\2LM1.2X002"},
+	{LANMAN2_PROT, "\2LANMAN2.1"},
+#endif /* weak password hashing for legacy clients */
+	{CIFS_PROT, "\2NT LM 0.12"},
+	{POSIX_PROT, "\2POSIX 2"},
+	{BAD_PROT, "\2"}
+};
+#else
+static struct {
+	int index;
+	char *name;
+} protocols[] = {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+	{LANMAN_PROT, "\2LM1.2X002"},
+	{LANMAN2_PROT, "\2LANMAN2.1"},
+#endif /* weak password hashing for legacy clients */
+	{CIFS_PROT, "\2NT LM 0.12"},
+	{BAD_PROT, "\2"}
+};
+#endif
+
+/* define the number of elements in the cifs dialect array */
+#ifdef CONFIG_CIFS_POSIX
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define CIFS_NUM_PROT 4
+#else
+#define CIFS_NUM_PROT 2
+#endif /* CIFS_WEAK_PW_HASH */
+#else /* not posix */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define CIFS_NUM_PROT 3
+#else
+#define CIFS_NUM_PROT 1
+#endif /* CONFIG_CIFS_WEAK_PW_HASH */
+#endif /* CIFS_POSIX */
+
+/*
+ * Mark as invalid, all open files on tree connections since they
+ * were closed when session to server was lost.
+ */
+void
+cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
+{
+	struct cifsFileInfo *open_file = NULL;
+	struct list_head *tmp;
+	struct list_head *tmp1;
+
+	/* list all files open on tree connection and mark them invalid */
+	spin_lock(&cifs_file_list_lock);
+	list_for_each_safe(tmp, tmp1, &tcon->openFileList) {
+		open_file = list_entry(tmp, struct cifsFileInfo, tlist);
+		open_file->invalidHandle = true;
+		open_file->oplock_break_cancelled = true;
+	}
+	spin_unlock(&cifs_file_list_lock);
+	/*
+	 * BB Add call to invalidate_inodes(sb) for all superblocks mounted
+	 * to this tcon.
+	 */
+}
+
+/* reconnect the socket, tcon, and smb session if needed */
+static int
+cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
+{
+	int rc;
+	struct cifs_ses *ses;
+	struct TCP_Server_Info *server;
+	struct nls_table *nls_codepage;
+
+	/*
+	 * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
+	 * tcp and smb session status done differently for those three - in the
+	 * calling routine
+	 */
+	if (!tcon)
+		return 0;
+
+	ses = tcon->ses;
+	server = ses->server;
+
+	/*
+	 * only tree disconnect, open, and write, (and ulogoff which does not
+	 * have tcon) are allowed as we start force umount
+	 */
+	if (tcon->tidStatus == CifsExiting) {
+		if (smb_command != SMB_COM_WRITE_ANDX &&
+		    smb_command != SMB_COM_OPEN_ANDX &&
+		    smb_command != SMB_COM_TREE_DISCONNECT) {
+			cifs_dbg(FYI, "can not send cmd %d while umounting\n",
+				 smb_command);
+			return -ENODEV;
+		}
+	}
+
+	/*
+	 * Give demultiplex thread up to 10 seconds to reconnect, should be
+	 * greater than cifs socket timeout which is 7 seconds
+	 */
+	while (server->tcpStatus == CifsNeedReconnect) {
+		wait_event_interruptible_timeout(server->response_q,
+			(server->tcpStatus != CifsNeedReconnect), 10 * HZ);
+
+		/* are we still trying to reconnect? */
+		if (server->tcpStatus != CifsNeedReconnect)
+			break;
+
+		/*
+		 * on "soft" mounts we wait once. Hard mounts keep
+		 * retrying until process is killed or server comes
+		 * back on-line
+		 */
+		if (!tcon->retry) {
+			cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
+			return -EHOSTDOWN;
+		}
+	}
+
+	if (!ses->need_reconnect && !tcon->need_reconnect)
+		return 0;
+
+	nls_codepage = load_nls_default();
+
+	/*
+	 * need to prevent multiple threads trying to simultaneously
+	 * reconnect the same SMB session
+	 */
+	mutex_lock(&ses->session_mutex);
+	rc = cifs_negotiate_protocol(0, ses);
+	if (rc == 0 && ses->need_reconnect)
+		rc = cifs_setup_session(0, ses, nls_codepage);
+
+	/* do we need to reconnect tcon? */
+	if (rc || !tcon->need_reconnect) {
+		mutex_unlock(&ses->session_mutex);
+		goto out;
+	}
+
+	cifs_mark_open_files_invalid(tcon);
+	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+	mutex_unlock(&ses->session_mutex);
+	cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
+
+	if (rc)
+		goto out;
+
+	atomic_inc(&tconInfoReconnectCount);
+
+	/* tell server Unix caps we support */
+	if (ses->capabilities & CAP_UNIX)
+		reset_cifs_unix_caps(0, tcon, NULL, NULL);
+
+	/*
+	 * Removed call to reopen open files here. It is safer (and faster) to
+	 * reopen files one at a time as needed in read and write.
+	 *
+	 * FIXME: what about file locks? don't we need to reclaim them ASAP?
+	 */
+
+out:
+	/*
+	 * Check if handle based operation so we know whether we can continue
+	 * or not without returning to caller to reset file handle
+	 */
+	switch (smb_command) {
+	case SMB_COM_READ_ANDX:
+	case SMB_COM_WRITE_ANDX:
+	case SMB_COM_CLOSE:
+	case SMB_COM_FIND_CLOSE2:
+	case SMB_COM_LOCKING_ANDX:
+		rc = -EAGAIN;
+	}
+
+	unload_nls(nls_codepage);
+	return rc;
+}
+
+/* Allocate and return pointer to an SMB request buffer, and set basic
+   SMB information in the SMB header.  If the return code is zero, this
+   function must have filled in request_buf pointer */
+static int
+small_smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
+		void **request_buf)
+{
+	int rc;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
+	if (rc)
+		return rc;
+
+	*request_buf = cifs_small_buf_get();
+	if (*request_buf == NULL) {
+		/* BB should we add a retry in here if not a writepage? */
+		return -ENOMEM;
+	}
+
+	header_assemble((struct smb_hdr *) *request_buf, smb_command,
+			tcon, wct);
+
+	if (tcon != NULL)
+		cifs_stats_inc(&tcon->num_smbs_sent);
+
+	return 0;
+}
+
+int
+small_smb_init_no_tc(const int smb_command, const int wct,
+		     struct cifs_ses *ses, void **request_buf)
+{
+	int rc;
+	struct smb_hdr *buffer;
+
+	rc = small_smb_init(smb_command, wct, NULL, request_buf);
+	if (rc)
+		return rc;
+
+	buffer = (struct smb_hdr *)*request_buf;
+	buffer->Mid = get_next_mid(ses->server);
+	if (ses->capabilities & CAP_UNICODE)
+		buffer->Flags2 |= SMBFLG2_UNICODE;
+	if (ses->capabilities & CAP_STATUS32)
+		buffer->Flags2 |= SMBFLG2_ERR_STATUS;
+
+	/* uid, tid can stay at zero as set in header assemble */
+
+	/* BB add support for turning on the signing when
+	this function is used after 1st of session setup requests */
+
+	return rc;
+}
+
+/* If the return code is zero, this function must fill in request_buf pointer */
+static int
+__smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
+			void **request_buf, void **response_buf)
+{
+	*request_buf = cifs_buf_get();
+	if (*request_buf == NULL) {
+		/* BB should we add a retry in here if not a writepage? */
+		return -ENOMEM;
+	}
+    /* Although the original thought was we needed the response buf for  */
+    /* potential retries of smb operations it turns out we can determine */
+    /* from the mid flags when the request buffer can be resent without  */
+    /* having to use a second distinct buffer for the response */
+	if (response_buf)
+		*response_buf = *request_buf;
+
+	header_assemble((struct smb_hdr *) *request_buf, smb_command, tcon,
+			wct);
+
+	if (tcon != NULL)
+		cifs_stats_inc(&tcon->num_smbs_sent);
+
+	return 0;
+}
+
+/* If the return code is zero, this function must fill in request_buf pointer */
+static int
+smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
+	 void **request_buf, void **response_buf)
+{
+	int rc;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
+	if (rc)
+		return rc;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
+}
+
+static int
+smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon,
+			void **request_buf, void **response_buf)
+{
+	if (tcon->ses->need_reconnect || tcon->need_reconnect)
+		return -EHOSTDOWN;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
+}
+
+static int validate_t2(struct smb_t2_rsp *pSMB)
+{
+	unsigned int total_size;
+
+	/* check for plausible wct */
+	if (pSMB->hdr.WordCount < 10)
+		goto vt2_err;
+
+	/* check for parm and data offset going beyond end of smb */
+	if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
+	    get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
+		goto vt2_err;
+
+	total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
+	if (total_size >= 512)
+		goto vt2_err;
+
+	/* check that bcc is at least as big as parms + data, and that it is
+	 * less than negotiated smb buffer
+	 */
+	total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
+	if (total_size > get_bcc(&pSMB->hdr) ||
+	    total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
+		goto vt2_err;
+
+	return 0;
+vt2_err:
+	cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
+		sizeof(struct smb_t2_rsp) + 16);
+	return -EINVAL;
+}
+
+static int
+decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
+{
+	int	rc = 0;
+	u16	count;
+	char	*guid = pSMBr->u.extended_response.GUID;
+	struct TCP_Server_Info *server = ses->server;
+
+	count = get_bcc(&pSMBr->hdr);
+	if (count < SMB1_CLIENT_GUID_SIZE)
+		return -EIO;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	if (server->srv_count > 1) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) {
+			cifs_dbg(FYI, "server UID changed\n");
+			memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+		}
+	} else {
+		spin_unlock(&cifs_tcp_ses_lock);
+		memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+	}
+
+	if (count == SMB1_CLIENT_GUID_SIZE) {
+		server->sec_ntlmssp = true;
+	} else {
+		count -= SMB1_CLIENT_GUID_SIZE;
+		rc = decode_negTokenInit(
+			pSMBr->u.extended_response.SecurityBlob, count, server);
+		if (rc != 1)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+	bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+	bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+	bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
+
+	/*
+	 * Is signing required by mnt options? If not then check
+	 * global_secflags to see if it is there.
+	 */
+	if (!mnt_sign_required)
+		mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+						CIFSSEC_MUST_SIGN);
+
+	/*
+	 * If signing is required then it's automatically enabled too,
+	 * otherwise, check to see if the secflags allow it.
+	 */
+	mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
+				(global_secflags & CIFSSEC_MAY_SIGN);
+
+	/* If server requires signing, does client allow it? */
+	if (srv_sign_required) {
+		if (!mnt_sign_enabled) {
+			cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!");
+			return -ENOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	/* If client requires signing, does server allow it? */
+	if (mnt_sign_required) {
+		if (!srv_sign_enabled) {
+			cifs_dbg(VFS, "Server does not support signing!");
+			return -ENOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+static int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+	__s16 tmp;
+	struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
+
+	if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
+		return -EOPNOTSUPP;
+
+	server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+	server->maxReq = min_t(unsigned int,
+			       le16_to_cpu(rsp->MaxMpxCount),
+			       cifs_max_pending);
+	set_credits(server, server->maxReq);
+	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+	/* even though we do not use raw we might as well set this
+	accurately, in case we ever find a need for it */
+	if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+		server->max_rw = 0xFF00;
+		server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+	} else {
+		server->max_rw = 0;/* do not need to use raw anyway */
+		server->capabilities = CAP_MPX_MODE;
+	}
+	tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+	if (tmp == -1) {
+		/* OS/2 often does not set timezone therefore
+		 * we must use server time to calc time zone.
+		 * Could deviate slightly from the right zone.
+		 * Smallest defined timezone difference is 15 minutes
+		 * (i.e. Nepal).  Rounding up/down is done to match
+		 * this requirement.
+		 */
+		int val, seconds, remain, result;
+		struct timespec ts, utc;
+		utc = CURRENT_TIME;
+		ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+				    rsp->SrvTime.Time, 0);
+		cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
+			 (int)ts.tv_sec, (int)utc.tv_sec,
+			 (int)(utc.tv_sec - ts.tv_sec));
+		val = (int)(utc.tv_sec - ts.tv_sec);
+		seconds = abs(val);
+		result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+		remain = seconds % MIN_TZ_ADJ;
+		if (remain >= (MIN_TZ_ADJ / 2))
+			result += MIN_TZ_ADJ;
+		if (val < 0)
+			result = -result;
+		server->timeAdj = result;
+	} else {
+		server->timeAdj = (int)tmp;
+		server->timeAdj *= 60; /* also in seconds */
+	}
+	cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
+
+
+	/* BB get server time for time conversions and add
+	code to use it and timezone since this is not UTC */
+
+	if (rsp->EncryptionKeyLength ==
+			cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+		memcpy(server->cryptkey, rsp->EncryptionKey,
+			CIFS_CRYPTO_KEY_SIZE);
+	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+		return -EIO; /* need cryptkey unless plain text */
+	}
+
+	cifs_dbg(FYI, "LANMAN negotiated\n");
+	return 0;
+}
+#else
+static inline int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+	cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
+	return -EOPNOTSUPP;
+}
+#endif
+
+static bool
+should_set_ext_sec_flag(enum securityEnum sectype)
+{
+	switch (sectype) {
+	case RawNTLMSSP:
+	case Kerberos:
+		return true;
+	case Unspecified:
+		if (global_secflags &
+		    (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
+			return true;
+		/* Fallthrough */
+	default:
+		return false;
+	}
+}
+
+int
+CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
+{
+	NEGOTIATE_REQ *pSMB;
+	NEGOTIATE_RSP *pSMBr;
+	int rc = 0;
+	int bytes_returned;
+	int i;
+	struct TCP_Server_Info *server = ses->server;
+	u16 count;
+
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
+	}
+
+	rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ ,
+		      (void **) &pSMB, (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Mid = get_next_mid(server);
+	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
+
+	if (should_set_ext_sec_flag(ses->sectype)) {
+		cifs_dbg(FYI, "Requesting extended security.");
+		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
+	}
+
+	count = 0;
+	for (i = 0; i < CIFS_NUM_PROT; i++) {
+		strncpy(pSMB->DialectsArray+count, protocols[i].name, 16);
+		count += strlen(protocols[i].name) + 1;
+		/* null at end of source and target buffers anyway */
+	}
+	inc_rfc1001_len(pSMB, count);
+	pSMB->ByteCount = cpu_to_le16(count);
+
+	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc != 0)
+		goto neg_err_exit;
+
+	server->dialect = le16_to_cpu(pSMBr->DialectIndex);
+	cifs_dbg(FYI, "Dialect: %d\n", server->dialect);
+	/* Check wct = 1 error case */
+	if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
+		/* core returns wct = 1, but we do not ask for core - otherwise
+		small wct just comes when dialect index is -1 indicating we
+		could not negotiate a common dialect */
+		rc = -EOPNOTSUPP;
+		goto neg_err_exit;
+	} else if (pSMBr->hdr.WordCount == 13) {
+		server->negflavor = CIFS_NEGFLAVOR_LANMAN;
+		rc = decode_lanman_negprot_rsp(server, pSMBr);
+		goto signing_check;
+	} else if (pSMBr->hdr.WordCount != 17) {
+		/* unknown wct */
+		rc = -EOPNOTSUPP;
+		goto neg_err_exit;
+	}
+	/* else wct == 17, NTLM or better */
+
+	server->sec_mode = pSMBr->SecurityMode;
+	if ((server->sec_mode & SECMODE_USER) == 0)
+		cifs_dbg(FYI, "share mode security\n");
+
+	/* one byte, so no need to convert this or EncryptionKeyLen from
+	   little endian */
+	server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
+			       cifs_max_pending);
+	set_credits(server, server->maxReq);
+	/* probably no need to store and check maxvcs */
+	server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
+	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
+	cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
+	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
+	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
+	server->timeAdj *= 60;
+
+	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
+		memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
+		       CIFS_CRYPTO_KEY_SIZE);
+	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
+			server->capabilities & CAP_EXTENDED_SECURITY) &&
+				(pSMBr->EncryptionKeyLength == 0)) {
+		server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+		rc = decode_ext_sec_blob(ses, pSMBr);
+	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+		rc = -EIO; /* no crypt key only if plain text pwd */
+	} else {
+		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
+		server->capabilities &= ~CAP_EXTENDED_SECURITY;
+	}
+
+signing_check:
+	if (!rc)
+		rc = cifs_enable_signing(server, ses->sign);
+neg_err_exit:
+	cifs_buf_release(pSMB);
+
+	cifs_dbg(FYI, "negprot rc %d\n", rc);
+	return rc;
+}
+
+int
+CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	struct smb_hdr *smb_buffer;
+	int rc = 0;
+
+	cifs_dbg(FYI, "In tree disconnect\n");
+
+	/* BB: do we need to check this? These should never be NULL. */
+	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
+		return -EIO;
+
+	/*
+	 * No need to return error on this operation if tid invalidated and
+	 * closed on server already e.g. due to tcp session crashing. Also,
+	 * the tcon is no longer on the list, so no need to take lock before
+	 * checking this.
+	 */
+	if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
+		return 0;
+
+	rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
+			    (void **)&smb_buffer);
+	if (rc)
+		return rc;
+
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0);
+	if (rc)
+		cifs_dbg(FYI, "Tree disconnect failed %d\n", rc);
+
+	/* No need to return error on this operation if tid invalidated and
+	   closed on server already e.g. due to tcp session crashing */
+	if (rc == -EAGAIN)
+		rc = 0;
+
+	return rc;
+}
+
+/*
+ * This is a no-op for now. We're not really interested in the reply, but
+ * rather in the fact that the server sent one and that server->lstrp
+ * gets updated.
+ *
+ * FIXME: maybe we should consider checking that the reply matches request?
+ */
+static void
+cifs_echo_callback(struct mid_q_entry *mid)
+{
+	struct TCP_Server_Info *server = mid->callback_data;
+
+	DeleteMidQEntry(mid);
+	add_credits(server, 1, CIFS_ECHO_OP);
+}
+
+int
+CIFSSMBEcho(struct TCP_Server_Info *server)
+{
+	ECHO_REQ *smb;
+	int rc = 0;
+	struct kvec iov;
+	struct smb_rqst rqst = { .rq_iov = &iov,
+				 .rq_nvec = 1 };
+
+	cifs_dbg(FYI, "In echo request\n");
+
+	rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
+	if (rc)
+		return rc;
+
+	/* set up echo request */
+	smb->hdr.Tid = 0xffff;
+	smb->hdr.WordCount = 1;
+	put_unaligned_le16(1, &smb->EchoCount);
+	put_bcc(1, &smb->hdr);
+	smb->Data[0] = 'a';
+	inc_rfc1001_len(smb, 3);
+	iov.iov_base = smb;
+	iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
+
+	rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback,
+			     server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
+	if (rc)
+		cifs_dbg(FYI, "Echo request failed: %d\n", rc);
+
+	cifs_small_buf_release(smb);
+
+	return rc;
+}
+
+int
+CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
+{
+	LOGOFF_ANDX_REQ *pSMB;
+	int rc = 0;
+
+	cifs_dbg(FYI, "In SMBLogoff for session disconnect\n");
+
+	/*
+	 * BB: do we need to check validity of ses and server? They should
+	 * always be valid since we have an active reference. If not, that
+	 * should probably be a BUG()
+	 */
+	if (!ses || !ses->server)
+		return -EIO;
+
+	mutex_lock(&ses->session_mutex);
+	if (ses->need_reconnect)
+		goto session_already_dead; /* no need to send SMBlogoff if uid
+					      already closed due to reconnect */
+	rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
+	if (rc) {
+		mutex_unlock(&ses->session_mutex);
+		return rc;
+	}
+
+	pSMB->hdr.Mid = get_next_mid(ses->server);
+
+	if (ses->server->sign)
+		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+	pSMB->hdr.Uid = ses->Suid;
+
+	pSMB->AndXCommand = 0xFF;
+	rc = SendReceiveNoRsp(xid, ses, (char *) pSMB, 0);
+session_already_dead:
+	mutex_unlock(&ses->session_mutex);
+
+	/* if session dead then we do not need to do ulogoff,
+		since server closed smb session, no sense reporting
+		error */
+	if (rc == -EAGAIN)
+		rc = 0;
+	return rc;
+}
+
+int
+CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
+		 const char *fileName, __u16 type,
+		 const struct nls_table *nls_codepage, int remap)
+{
+	TRANSACTION2_SPI_REQ *pSMB = NULL;
+	TRANSACTION2_SPI_RSP *pSMBr = NULL;
+	struct unlink_psx_rq *pRqD;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, param_offset, offset, byte_count;
+
+	cifs_dbg(FYI, "In POSIX delete\n");
+PsxDelete:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else { /* BB add path length overrun check */
+		name_len = strnlen(fileName, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, fileName, name_len);
+	}
+
+	params = 6 + name_len;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = 0; /* BB double check this with jra */
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+
+	/* Setup pointer to Request Data (inode type) */
+	pRqD = (struct unlink_psx_rq *)(((char *)&pSMB->hdr.Protocol) + offset);
+	pRqD->type = cpu_to_le16(type);
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + sizeof(struct unlink_psx_rq);
+
+	pSMB->DataCount = cpu_to_le16(sizeof(struct unlink_psx_rq));
+	pSMB->TotalDataCount = cpu_to_le16(sizeof(struct unlink_psx_rq));
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "Posix delete returned %d\n", rc);
+	cifs_buf_release(pSMB);
+
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes);
+
+	if (rc == -EAGAIN)
+		goto PsxDelete;
+
+	return rc;
+}
+
+int
+CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+	       struct cifs_sb_info *cifs_sb)
+{
+	DELETE_FILE_REQ *pSMB = NULL;
+	DELETE_FILE_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+	int remap = cifs_remap(cifs_sb);
+
+DelFileRetry:
+	rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->fileName, name,
+					      PATH_MAX, cifs_sb->local_nls,
+					      remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {		/* BB improve check for buffer overruns BB */
+		name_len = strnlen(name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->fileName, name, name_len);
+	}
+	pSMB->SearchAttributes =
+	    cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM);
+	pSMB->BufferFormat = 0x04;
+	inc_rfc1001_len(pSMB, name_len + 1);
+	pSMB->ByteCount = cpu_to_le16(name_len + 1);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes);
+	if (rc)
+		cifs_dbg(FYI, "Error in RMFile = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto DelFileRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+	     struct cifs_sb_info *cifs_sb)
+{
+	DELETE_DIRECTORY_REQ *pSMB = NULL;
+	DELETE_DIRECTORY_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+	int remap = cifs_remap(cifs_sb);
+
+	cifs_dbg(FYI, "In CIFSSMBRmDir\n");
+RmDirRetry:
+	rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+					      PATH_MAX, cifs_sb->local_nls,
+					      remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {		/* BB improve check for buffer overruns BB */
+		name_len = strnlen(name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->DirName, name, name_len);
+	}
+
+	pSMB->BufferFormat = 0x04;
+	inc_rfc1001_len(pSMB, name_len + 1);
+	pSMB->ByteCount = cpu_to_le16(name_len + 1);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_rmdirs);
+	if (rc)
+		cifs_dbg(FYI, "Error in RMDir = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto RmDirRetry;
+	return rc;
+}
+
+int
+CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+	     struct cifs_sb_info *cifs_sb)
+{
+	int rc = 0;
+	CREATE_DIRECTORY_REQ *pSMB = NULL;
+	CREATE_DIRECTORY_RSP *pSMBr = NULL;
+	int bytes_returned;
+	int name_len;
+	int remap = cifs_remap(cifs_sb);
+
+	cifs_dbg(FYI, "In CIFSSMBMkDir\n");
+MkDirRetry:
+	rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+					      PATH_MAX, cifs_sb->local_nls,
+					      remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {		/* BB improve check for buffer overruns BB */
+		name_len = strnlen(name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->DirName, name, name_len);
+	}
+
+	pSMB->BufferFormat = 0x04;
+	inc_rfc1001_len(pSMB, name_len + 1);
+	pSMB->ByteCount = cpu_to_le16(name_len + 1);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_mkdirs);
+	if (rc)
+		cifs_dbg(FYI, "Error in Mkdir = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto MkDirRetry;
+	return rc;
+}
+
+int
+CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon,
+		__u32 posix_flags, __u64 mode, __u16 *netfid,
+		FILE_UNIX_BASIC_INFO *pRetData, __u32 *pOplock,
+		const char *name, const struct nls_table *nls_codepage,
+		int remap)
+{
+	TRANSACTION2_SPI_REQ *pSMB = NULL;
+	TRANSACTION2_SPI_RSP *pSMBr = NULL;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, param_offset, offset, byte_count, count;
+	OPEN_PSX_REQ *pdata;
+	OPEN_PSX_RSP *psx_rsp;
+
+	cifs_dbg(FYI, "In POSIX Create\n");
+PsxCreat:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, name,
+				       PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, name, name_len);
+	}
+
+	params = 6 + name_len;
+	count = sizeof(OPEN_PSX_REQ);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(1000);	/* large enough */
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+	pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset);
+	pdata->Level = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
+	pdata->Permissions = cpu_to_le64(mode);
+	pdata->PosixOpenFlags = cpu_to_le32(posix_flags);
+	pdata->OpenFlags =  cpu_to_le32(*pOplock);
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Posix create returned %d\n", rc);
+		goto psx_create_err;
+	}
+
+	cifs_dbg(FYI, "copying inode info\n");
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+	if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) {
+		rc = -EIO;	/* bad smb */
+		goto psx_create_err;
+	}
+
+	/* copy return information to pRetData */
+	psx_rsp = (OPEN_PSX_RSP *)((char *) &pSMBr->hdr.Protocol
+			+ le16_to_cpu(pSMBr->t2.DataOffset));
+
+	*pOplock = le16_to_cpu(psx_rsp->OplockFlags);
+	if (netfid)
+		*netfid = psx_rsp->Fid;   /* cifs fid stays in le */
+	/* Let caller know file was created so we can set the mode. */
+	/* Do we care about the CreateAction in any other cases? */
+	if (cpu_to_le32(FILE_CREATE) == psx_rsp->CreateAction)
+		*pOplock |= CIFS_CREATE_ACTION;
+	/* check to make sure response data is there */
+	if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
+		pRetData->Type = cpu_to_le32(-1); /* unknown */
+		cifs_dbg(NOISY, "unknown type\n");
+	} else {
+		if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)
+					+ sizeof(FILE_UNIX_BASIC_INFO)) {
+			cifs_dbg(VFS, "Open response data too small\n");
+			pRetData->Type = cpu_to_le32(-1);
+			goto psx_create_err;
+		}
+		memcpy((char *) pRetData,
+			(char *)psx_rsp + sizeof(OPEN_PSX_RSP),
+			sizeof(FILE_UNIX_BASIC_INFO));
+	}
+
+psx_create_err:
+	cifs_buf_release(pSMB);
+
+	if (posix_flags & SMB_O_DIRECTORY)
+		cifs_stats_inc(&tcon->stats.cifs_stats.num_posixmkdirs);
+	else
+		cifs_stats_inc(&tcon->stats.cifs_stats.num_posixopens);
+
+	if (rc == -EAGAIN)
+		goto PsxCreat;
+
+	return rc;
+}
+
+static __u16 convert_disposition(int disposition)
+{
+	__u16 ofun = 0;
+
+	switch (disposition) {
+		case FILE_SUPERSEDE:
+			ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
+			break;
+		case FILE_OPEN:
+			ofun = SMBOPEN_OAPPEND;
+			break;
+		case FILE_CREATE:
+			ofun = SMBOPEN_OCREATE;
+			break;
+		case FILE_OPEN_IF:
+			ofun = SMBOPEN_OCREATE | SMBOPEN_OAPPEND;
+			break;
+		case FILE_OVERWRITE:
+			ofun = SMBOPEN_OTRUNC;
+			break;
+		case FILE_OVERWRITE_IF:
+			ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
+			break;
+		default:
+			cifs_dbg(FYI, "unknown disposition %d\n", disposition);
+			ofun =  SMBOPEN_OAPPEND; /* regular open */
+	}
+	return ofun;
+}
+
+static int
+access_flags_to_smbopen_mode(const int access_flags)
+{
+	int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE);
+
+	if (masked_flags == GENERIC_READ)
+		return SMBOPEN_READ;
+	else if (masked_flags == GENERIC_WRITE)
+		return SMBOPEN_WRITE;
+
+	/* just go for read/write */
+	return SMBOPEN_READWRITE;
+}
+
+int
+SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon,
+	    const char *fileName, const int openDisposition,
+	    const int access_flags, const int create_options, __u16 *netfid,
+	    int *pOplock, FILE_ALL_INFO *pfile_info,
+	    const struct nls_table *nls_codepage, int remap)
+{
+	int rc = -EACCES;
+	OPENX_REQ *pSMB = NULL;
+	OPENX_RSP *pSMBr = NULL;
+	int bytes_returned;
+	int name_len;
+	__u16 count;
+
+OldOpenRetry:
+	rc = smb_init(SMB_COM_OPEN_ANDX, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->AndXCommand = 0xFF;       /* none */
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		count = 1;      /* account for one byte pad to word boundary */
+		name_len =
+		   cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+				      fileName, PATH_MAX, nls_codepage, remap);
+		name_len++;     /* trailing null */
+		name_len *= 2;
+	} else {                /* BB improve check for buffer overruns BB */
+		count = 0;      /* no pad */
+		name_len = strnlen(fileName, PATH_MAX);
+		name_len++;     /* trailing null */
+		strncpy(pSMB->fileName, fileName, name_len);
+	}
+	if (*pOplock & REQ_OPLOCK)
+		pSMB->OpenFlags = cpu_to_le16(REQ_OPLOCK);
+	else if (*pOplock & REQ_BATCHOPLOCK)
+		pSMB->OpenFlags = cpu_to_le16(REQ_BATCHOPLOCK);
+
+	pSMB->OpenFlags |= cpu_to_le16(REQ_MORE_INFO);
+	pSMB->Mode = cpu_to_le16(access_flags_to_smbopen_mode(access_flags));
+	pSMB->Mode |= cpu_to_le16(0x40); /* deny none */
+	/* set file as system file if special file such
+	   as fifo and server expecting SFU style and
+	   no Unix extensions */
+
+	if (create_options & CREATE_OPTION_SPECIAL)
+		pSMB->FileAttributes = cpu_to_le16(ATTR_SYSTEM);
+	else /* BB FIXME BB */
+		pSMB->FileAttributes = cpu_to_le16(0/*ATTR_NORMAL*/);
+
+	if (create_options & CREATE_OPTION_READONLY)
+		pSMB->FileAttributes |= cpu_to_le16(ATTR_READONLY);
+
+	/* BB FIXME BB */
+/*	pSMB->CreateOptions = cpu_to_le32(create_options &
+						 CREATE_OPTIONS_MASK); */
+	/* BB FIXME END BB */
+
+	pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY);
+	pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition));
+	count += name_len;
+	inc_rfc1001_len(pSMB, count);
+
+	pSMB->ByteCount = cpu_to_le16(count);
+	/* long_op set to 1 to allow for oplock break timeouts */
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			(struct smb_hdr *)pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
+	if (rc) {
+		cifs_dbg(FYI, "Error in Open = %d\n", rc);
+	} else {
+	/* BB verify if wct == 15 */
+
+/*		*pOplock = pSMBr->OplockLevel; */ /* BB take from action field*/
+
+		*netfid = pSMBr->Fid;   /* cifs fid stays in le */
+		/* Let caller know file was created so we can set the mode. */
+		/* Do we care about the CreateAction in any other cases? */
+	/* BB FIXME BB */
+/*		if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction)
+			*pOplock |= CIFS_CREATE_ACTION; */
+	/* BB FIXME END */
+
+		if (pfile_info) {
+			pfile_info->CreationTime = 0; /* BB convert CreateTime*/
+			pfile_info->LastAccessTime = 0; /* BB fixme */
+			pfile_info->LastWriteTime = 0; /* BB fixme */
+			pfile_info->ChangeTime = 0;  /* BB fixme */
+			pfile_info->Attributes =
+				cpu_to_le32(le16_to_cpu(pSMBr->FileAttributes));
+			/* the file_info buf is endian converted by caller */
+			pfile_info->AllocationSize =
+				cpu_to_le64(le32_to_cpu(pSMBr->EndOfFile));
+			pfile_info->EndOfFile = pfile_info->AllocationSize;
+			pfile_info->NumberOfLinks = cpu_to_le32(1);
+			pfile_info->DeletePending = 0;
+		}
+	}
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto OldOpenRetry;
+	return rc;
+}
+
+int
+CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
+	  FILE_ALL_INFO *buf)
+{
+	int rc = -EACCES;
+	OPEN_REQ *req = NULL;
+	OPEN_RSP *rsp = NULL;
+	int bytes_returned;
+	int name_len;
+	__u16 count;
+	struct cifs_sb_info *cifs_sb = oparms->cifs_sb;
+	struct cifs_tcon *tcon = oparms->tcon;
+	int remap = cifs_remap(cifs_sb);
+	const struct nls_table *nls = cifs_sb->local_nls;
+	int create_options = oparms->create_options;
+	int desired_access = oparms->desired_access;
+	int disposition = oparms->disposition;
+	const char *path = oparms->path;
+
+openRetry:
+	rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req,
+		      (void **)&rsp);
+	if (rc)
+		return rc;
+
+	/* no commands go after this */
+	req->AndXCommand = 0xFF;
+
+	if (req->hdr.Flags2 & SMBFLG2_UNICODE) {
+		/* account for one byte pad to word boundary */
+		count = 1;
+		name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1),
+					      path, PATH_MAX, nls, remap);
+		/* trailing null */
+		name_len++;
+		name_len *= 2;
+		req->NameLength = cpu_to_le16(name_len);
+	} else {
+		/* BB improve check for buffer overruns BB */
+		/* no pad */
+		count = 0;
+		name_len = strnlen(path, PATH_MAX);
+		/* trailing null */
+		name_len++;
+		req->NameLength = cpu_to_le16(name_len);
+		strncpy(req->fileName, path, name_len);
+	}
+
+	if (*oplock & REQ_OPLOCK)
+		req->OpenFlags = cpu_to_le32(REQ_OPLOCK);
+	else if (*oplock & REQ_BATCHOPLOCK)
+		req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
+
+	req->DesiredAccess = cpu_to_le32(desired_access);
+	req->AllocationSize = 0;
+
+	/*
+	 * Set file as system file if special file such as fifo and server
+	 * expecting SFU style and no Unix extensions.
+	 */
+	if (create_options & CREATE_OPTION_SPECIAL)
+		req->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
+	else
+		req->FileAttributes = cpu_to_le32(ATTR_NORMAL);
+
+	/*
+	 * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case
+	 * sensitive checks for other servers such as Samba.
+	 */
+	if (tcon->ses->capabilities & CAP_UNIX)
+		req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
+
+	if (create_options & CREATE_OPTION_READONLY)
+		req->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+
+	req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
+	req->CreateDisposition = cpu_to_le32(disposition);
+	req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
+
+	/* BB Expirement with various impersonation levels and verify */
+	req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
+	req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY;
+
+	count += name_len;
+	inc_rfc1001_len(req, count);
+
+	req->ByteCount = cpu_to_le16(count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req,
+			 (struct smb_hdr *)rsp, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
+	if (rc) {
+		cifs_dbg(FYI, "Error in Open = %d\n", rc);
+		cifs_buf_release(req);
+		if (rc == -EAGAIN)
+			goto openRetry;
+		return rc;
+	}
+
+	/* 1 byte no need to le_to_cpu */
+	*oplock = rsp->OplockLevel;
+	/* cifs fid stays in le */
+	oparms->fid->netfid = rsp->Fid;
+
+	/* Let caller know file was created so we can set the mode. */
+	/* Do we care about the CreateAction in any other cases? */
+	if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction)
+		*oplock |= CIFS_CREATE_ACTION;
+
+	if (buf) {
+		/* copy from CreationTime to Attributes */
+		memcpy((char *)buf, (char *)&rsp->CreationTime, 36);
+		/* the file_info buf is endian converted by caller */
+		buf->AllocationSize = rsp->AllocationSize;
+		buf->EndOfFile = rsp->EndOfFile;
+		buf->NumberOfLinks = cpu_to_le32(1);
+		buf->DeletePending = 0;
+	}
+
+	cifs_buf_release(req);
+	return rc;
+}
+
+/*
+ * Discard any remaining data in the current SMB. To do this, we borrow the
+ * current bigbuf.
+ */
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+	unsigned int rfclen = get_rfc1002_length(server->smallbuf);
+	int remaining = rfclen + 4 - server->total_read;
+	struct cifs_readdata *rdata = mid->callback_data;
+
+	while (remaining > 0) {
+		int length;
+
+		length = cifs_read_from_socket(server, server->bigbuf,
+				min_t(unsigned int, remaining,
+				    CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
+		if (length < 0)
+			return length;
+		server->total_read += length;
+		remaining -= length;
+	}
+
+	dequeue_mid(mid, rdata->result);
+	return 0;
+}
+
+int
+cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+	int length, len;
+	unsigned int data_offset, data_len;
+	struct cifs_readdata *rdata = mid->callback_data;
+	char *buf = server->smallbuf;
+	unsigned int buflen = get_rfc1002_length(buf) + 4;
+
+	cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
+		 __func__, mid->mid, rdata->offset, rdata->bytes);
+
+	/*
+	 * read the rest of READ_RSP header (sans Data array), or whatever we
+	 * can if there's not enough data. At this point, we've read down to
+	 * the Mid.
+	 */
+	len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
+							HEADER_SIZE(server) + 1;
+
+	rdata->iov.iov_base = buf + HEADER_SIZE(server) - 1;
+	rdata->iov.iov_len = len;
+
+	length = cifs_readv_from_socket(server, &rdata->iov, 1, len);
+	if (length < 0)
+		return length;
+	server->total_read += length;
+
+	/* Was the SMB read successful? */
+	rdata->result = server->ops->map_error(buf, false);
+	if (rdata->result != 0) {
+		cifs_dbg(FYI, "%s: server returned error %d\n",
+			 __func__, rdata->result);
+		return cifs_readv_discard(server, mid);
+	}
+
+	/* Is there enough to get to the rest of the READ_RSP header? */
+	if (server->total_read < server->vals->read_rsp_size) {
+		cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
+			 __func__, server->total_read,
+			 server->vals->read_rsp_size);
+		rdata->result = -EIO;
+		return cifs_readv_discard(server, mid);
+	}
+
+	data_offset = server->ops->read_data_offset(buf) + 4;
+	if (data_offset < server->total_read) {
+		/*
+		 * win2k8 sometimes sends an offset of 0 when the read
+		 * is beyond the EOF. Treat it as if the data starts just after
+		 * the header.
+		 */
+		cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
+			 __func__, data_offset);
+		data_offset = server->total_read;
+	} else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
+		/* data_offset is beyond the end of smallbuf */
+		cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
+			 __func__, data_offset);
+		rdata->result = -EIO;
+		return cifs_readv_discard(server, mid);
+	}
+
+	cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
+		 __func__, server->total_read, data_offset);
+
+	len = data_offset - server->total_read;
+	if (len > 0) {
+		/* read any junk before data into the rest of smallbuf */
+		rdata->iov.iov_base = buf + server->total_read;
+		rdata->iov.iov_len = len;
+		length = cifs_readv_from_socket(server, &rdata->iov, 1, len);
+		if (length < 0)
+			return length;
+		server->total_read += length;
+	}
+
+	/* set up first iov for signature check */
+	rdata->iov.iov_base = buf;
+	rdata->iov.iov_len = server->total_read;
+	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+		 rdata->iov.iov_base, rdata->iov.iov_len);
+
+	/* how much data is in the response? */
+	data_len = server->ops->read_data_length(buf);
+	if (data_offset + data_len > buflen) {
+		/* data_len is corrupt -- discard frame */
+		rdata->result = -EIO;
+		return cifs_readv_discard(server, mid);
+	}
+
+	length = rdata->read_into_pages(server, rdata, data_len);
+	if (length < 0)
+		return length;
+
+	server->total_read += length;
+
+	cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
+		 server->total_read, buflen, data_len);
+
+	/* discard anything left over */
+	if (server->total_read < buflen)
+		return cifs_readv_discard(server, mid);
+
+	dequeue_mid(mid, false);
+	return length;
+}
+
+static void
+cifs_readv_callback(struct mid_q_entry *mid)
+{
+	struct cifs_readdata *rdata = mid->callback_data;
+	struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
+				 .rq_nvec = 1,
+				 .rq_pages = rdata->pages,
+				 .rq_npages = rdata->nr_pages,
+				 .rq_pagesz = rdata->pagesz,
+				 .rq_tailsz = rdata->tailsz };
+
+	cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+		 __func__, mid->mid, mid->mid_state, rdata->result,
+		 rdata->bytes);
+
+	switch (mid->mid_state) {
+	case MID_RESPONSE_RECEIVED:
+		/* result already set, check signature */
+		if (server->sign) {
+			int rc = 0;
+
+			rc = cifs_verify_signature(&rqst, server,
+						  mid->sequence_number);
+			if (rc)
+				cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+					 rc);
+		}
+		/* FIXME: should this be counted toward the initiating task? */
+		task_io_account_read(rdata->got_bytes);
+		cifs_stats_bytes_read(tcon, rdata->got_bytes);
+		break;
+	case MID_REQUEST_SUBMITTED:
+	case MID_RETRY_NEEDED:
+		rdata->result = -EAGAIN;
+		if (server->sign && rdata->got_bytes)
+			/* reset bytes number since we can not check a sign */
+			rdata->got_bytes = 0;
+		/* FIXME: should this be counted toward the initiating task? */
+		task_io_account_read(rdata->got_bytes);
+		cifs_stats_bytes_read(tcon, rdata->got_bytes);
+		break;
+	default:
+		rdata->result = -EIO;
+	}
+
+	queue_work(cifsiod_wq, &rdata->work);
+	DeleteMidQEntry(mid);
+	add_credits(server, 1, 0);
+}
+
+/* cifs_async_readv - send an async write, and set up mid to handle result */
+int
+cifs_async_readv(struct cifs_readdata *rdata)
+{
+	int rc;
+	READ_REQ *smb = NULL;
+	int wct;
+	struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
+				 .rq_nvec = 1 };
+
+	cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
+		 __func__, rdata->offset, rdata->bytes);
+
+	if (tcon->ses->capabilities & CAP_LARGE_FILES)
+		wct = 12;
+	else {
+		wct = 10; /* old style read */
+		if ((rdata->offset >> 32) > 0)  {
+			/* can not handle this big offset for old */
+			return -EIO;
+		}
+	}
+
+	rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb);
+	if (rc)
+		return rc;
+
+	smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
+	smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
+
+	smb->AndXCommand = 0xFF;	/* none */
+	smb->Fid = rdata->cfile->fid.netfid;
+	smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
+	if (wct == 12)
+		smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
+	smb->Remaining = 0;
+	smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
+	smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
+	if (wct == 12)
+		smb->ByteCount = 0;
+	else {
+		/* old style read */
+		struct smb_com_readx_req *smbr =
+			(struct smb_com_readx_req *)smb;
+		smbr->ByteCount = 0;
+	}
+
+	/* 4 for RFC1001 length + 1 for BCC */
+	rdata->iov.iov_base = smb;
+	rdata->iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
+
+	kref_get(&rdata->refcount);
+	rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
+			     cifs_readv_callback, rdata, 0);
+
+	if (rc == 0)
+		cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
+	else
+		kref_put(&rdata->refcount, cifs_readdata_release);
+
+	cifs_small_buf_release(smb);
+	return rc;
+}
+
+int
+CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
+	    unsigned int *nbytes, char **buf, int *pbuf_type)
+{
+	int rc = -EACCES;
+	READ_REQ *pSMB = NULL;
+	READ_RSP *pSMBr = NULL;
+	char *pReadData = NULL;
+	int wct;
+	int resp_buf_type = 0;
+	struct kvec iov[1];
+	__u32 pid = io_parms->pid;
+	__u16 netfid = io_parms->netfid;
+	__u64 offset = io_parms->offset;
+	struct cifs_tcon *tcon = io_parms->tcon;
+	unsigned int count = io_parms->length;
+
+	cifs_dbg(FYI, "Reading %d bytes on fid %d\n", count, netfid);
+	if (tcon->ses->capabilities & CAP_LARGE_FILES)
+		wct = 12;
+	else {
+		wct = 10; /* old style read */
+		if ((offset >> 32) > 0)  {
+			/* can not handle this big offset for old */
+			return -EIO;
+		}
+	}
+
+	*nbytes = 0;
+	rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **) &pSMB);
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
+	/* tcon and ses pointer are checked in smb_init */
+	if (tcon->ses->server == NULL)
+		return -ECONNABORTED;
+
+	pSMB->AndXCommand = 0xFF;       /* none */
+	pSMB->Fid = netfid;
+	pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF);
+	if (wct == 12)
+		pSMB->OffsetHigh = cpu_to_le32(offset >> 32);
+
+	pSMB->Remaining = 0;
+	pSMB->MaxCount = cpu_to_le16(count & 0xFFFF);
+	pSMB->MaxCountHigh = cpu_to_le32(count >> 16);
+	if (wct == 12)
+		pSMB->ByteCount = 0;  /* no need to do le conversion since 0 */
+	else {
+		/* old style read */
+		struct smb_com_readx_req *pSMBW =
+			(struct smb_com_readx_req *)pSMB;
+		pSMBW->ByteCount = 0;
+	}
+
+	iov[0].iov_base = (char *)pSMB;
+	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
+	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
+			 &resp_buf_type, CIFS_LOG_ERROR);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
+	pSMBr = (READ_RSP *)iov[0].iov_base;
+	if (rc) {
+		cifs_dbg(VFS, "Send error in read = %d\n", rc);
+	} else {
+		int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
+		data_length = data_length << 16;
+		data_length += le16_to_cpu(pSMBr->DataLength);
+		*nbytes = data_length;
+
+		/*check that DataLength would not go beyond end of SMB */
+		if ((data_length > CIFSMaxBufSize)
+				|| (data_length > count)) {
+			cifs_dbg(FYI, "bad length %d for count %d\n",
+				 data_length, count);
+			rc = -EIO;
+			*nbytes = 0;
+		} else {
+			pReadData = (char *) (&pSMBr->hdr.Protocol) +
+					le16_to_cpu(pSMBr->DataOffset);
+/*			if (rc = copy_to_user(buf, pReadData, data_length)) {
+				cifs_dbg(VFS, "Faulting on read rc = %d\n",rc);
+				rc = -EFAULT;
+			}*/ /* can not use copy_to_user when using page cache*/
+			if (*buf)
+				memcpy(*buf, pReadData, data_length);
+		}
+	}
+
+/*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
+	if (*buf) {
+		free_rsp_buf(resp_buf_type, iov[0].iov_base);
+	} else if (resp_buf_type != CIFS_NO_BUFFER) {
+		/* return buffer to caller to free */
+		*buf = iov[0].iov_base;
+		if (resp_buf_type == CIFS_SMALL_BUFFER)
+			*pbuf_type = CIFS_SMALL_BUFFER;
+		else if (resp_buf_type == CIFS_LARGE_BUFFER)
+			*pbuf_type = CIFS_LARGE_BUFFER;
+	} /* else no valid buffer on return - leave as null */
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+	return rc;
+}
+
+
+int
+CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
+	     unsigned int *nbytes, const char *buf,
+	     const char __user *ubuf, const int long_op)
+{
+	int rc = -EACCES;
+	WRITE_REQ *pSMB = NULL;
+	WRITE_RSP *pSMBr = NULL;
+	int bytes_returned, wct;
+	__u32 bytes_sent;
+	__u16 byte_count;
+	__u32 pid = io_parms->pid;
+	__u16 netfid = io_parms->netfid;
+	__u64 offset = io_parms->offset;
+	struct cifs_tcon *tcon = io_parms->tcon;
+	unsigned int count = io_parms->length;
+
+	*nbytes = 0;
+
+	/* cifs_dbg(FYI, "write at %lld %d bytes\n", offset, count);*/
+	if (tcon->ses == NULL)
+		return -ECONNABORTED;
+
+	if (tcon->ses->capabilities & CAP_LARGE_FILES)
+		wct = 14;
+	else {
+		wct = 12;
+		if ((offset >> 32) > 0) {
+			/* can not handle big offset for old srv */
+			return -EIO;
+		}
+	}
+
+	rc = smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
+	/* tcon and ses pointer are checked in smb_init */
+	if (tcon->ses->server == NULL)
+		return -ECONNABORTED;
+
+	pSMB->AndXCommand = 0xFF;	/* none */
+	pSMB->Fid = netfid;
+	pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF);
+	if (wct == 14)
+		pSMB->OffsetHigh = cpu_to_le32(offset >> 32);
+
+	pSMB->Reserved = 0xFFFFFFFF;
+	pSMB->WriteMode = 0;
+	pSMB->Remaining = 0;
+
+	/* Can increase buffer size if buffer is big enough in some cases ie we
+	can send more if LARGE_WRITE_X capability returned by the server and if
+	our buffer is big enough or if we convert to iovecs on socket writes
+	and eliminate the copy to the CIFS buffer */
+	if (tcon->ses->capabilities & CAP_LARGE_WRITE_X) {
+		bytes_sent = min_t(const unsigned int, CIFSMaxBufSize, count);
+	} else {
+		bytes_sent = (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)
+			 & ~0xFF;
+	}
+
+	if (bytes_sent > count)
+		bytes_sent = count;
+	pSMB->DataOffset =
+		cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
+	if (buf)
+		memcpy(pSMB->Data, buf, bytes_sent);
+	else if (ubuf) {
+		if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) {
+			cifs_buf_release(pSMB);
+			return -EFAULT;
+		}
+	} else if (count != 0) {
+		/* No buffer */
+		cifs_buf_release(pSMB);
+		return -EINVAL;
+	} /* else setting file size with write of zero bytes */
+	if (wct == 14)
+		byte_count = bytes_sent + 1; /* pad */
+	else /* wct == 12 */
+		byte_count = bytes_sent + 5; /* bigger pad, smaller smb hdr */
+
+	pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF);
+	pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16);
+	inc_rfc1001_len(pSMB, byte_count);
+
+	if (wct == 14)
+		pSMB->ByteCount = cpu_to_le16(byte_count);
+	else { /* old style write has byte count 4 bytes earlier
+		  so 4 bytes pad  */
+		struct smb_com_writex_req *pSMBW =
+			(struct smb_com_writex_req *)pSMB;
+		pSMBW->ByteCount = cpu_to_le16(byte_count);
+	}
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in write = %d\n", rc);
+	} else {
+		*nbytes = le16_to_cpu(pSMBr->CountHigh);
+		*nbytes = (*nbytes) << 16;
+		*nbytes += le16_to_cpu(pSMBr->Count);
+
+		/*
+		 * Mask off high 16 bits when bytes written as returned by the
+		 * server is greater than bytes requested by the client. Some
+		 * OS/2 servers are known to set incorrect CountHigh values.
+		 */
+		if (*nbytes > count)
+			*nbytes &= 0xFFFF;
+	}
+
+	cifs_buf_release(pSMB);
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+
+	return rc;
+}
+
+void
+cifs_writedata_release(struct kref *refcount)
+{
+	struct cifs_writedata *wdata = container_of(refcount,
+					struct cifs_writedata, refcount);
+
+	if (wdata->cfile)
+		cifsFileInfo_put(wdata->cfile);
+
+	kfree(wdata);
+}
+
+/*
+ * Write failed with a retryable error. Resend the write request. It's also
+ * possible that the page was redirtied so re-clean the page.
+ */
+static void
+cifs_writev_requeue(struct cifs_writedata *wdata)
+{
+	int i, rc = 0;
+	struct inode *inode = d_inode(wdata->cfile->dentry);
+	struct TCP_Server_Info *server;
+	unsigned int rest_len;
+
+	server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+	i = 0;
+	rest_len = wdata->bytes;
+	do {
+		struct cifs_writedata *wdata2;
+		unsigned int j, nr_pages, wsize, tailsz, cur_len;
+
+		wsize = server->ops->wp_retry_size(inode);
+		if (wsize < rest_len) {
+			nr_pages = wsize / PAGE_CACHE_SIZE;
+			if (!nr_pages) {
+				rc = -ENOTSUPP;
+				break;
+			}
+			cur_len = nr_pages * PAGE_CACHE_SIZE;
+			tailsz = PAGE_CACHE_SIZE;
+		} else {
+			nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
+			cur_len = rest_len;
+			tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
+		}
+
+		wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
+		if (!wdata2) {
+			rc = -ENOMEM;
+			break;
+		}
+
+		for (j = 0; j < nr_pages; j++) {
+			wdata2->pages[j] = wdata->pages[i + j];
+			lock_page(wdata2->pages[j]);
+			clear_page_dirty_for_io(wdata2->pages[j]);
+		}
+
+		wdata2->sync_mode = wdata->sync_mode;
+		wdata2->nr_pages = nr_pages;
+		wdata2->offset = page_offset(wdata2->pages[0]);
+		wdata2->pagesz = PAGE_CACHE_SIZE;
+		wdata2->tailsz = tailsz;
+		wdata2->bytes = cur_len;
+
+		wdata2->cfile = find_writable_file(CIFS_I(inode), false);
+		if (!wdata2->cfile) {
+			cifs_dbg(VFS, "No writable handles for inode\n");
+			rc = -EBADF;
+			break;
+		}
+		wdata2->pid = wdata2->cfile->pid;
+		rc = server->ops->async_writev(wdata2, cifs_writedata_release);
+
+		for (j = 0; j < nr_pages; j++) {
+			unlock_page(wdata2->pages[j]);
+			if (rc != 0 && rc != -EAGAIN) {
+				SetPageError(wdata2->pages[j]);
+				end_page_writeback(wdata2->pages[j]);
+				page_cache_release(wdata2->pages[j]);
+			}
+		}
+
+		if (rc) {
+			kref_put(&wdata2->refcount, cifs_writedata_release);
+			if (rc == -EAGAIN)
+				continue;
+			break;
+		}
+
+		rest_len -= cur_len;
+		i += nr_pages;
+	} while (i < wdata->nr_pages);
+
+	mapping_set_error(inode->i_mapping, rc);
+	kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+void
+cifs_writev_complete(struct work_struct *work)
+{
+	struct cifs_writedata *wdata = container_of(work,
+						struct cifs_writedata, work);
+	struct inode *inode = d_inode(wdata->cfile->dentry);
+	int i = 0;
+
+	if (wdata->result == 0) {
+		spin_lock(&inode->i_lock);
+		cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
+		spin_unlock(&inode->i_lock);
+		cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
+					 wdata->bytes);
+	} else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
+		return cifs_writev_requeue(wdata);
+
+	for (i = 0; i < wdata->nr_pages; i++) {
+		struct page *page = wdata->pages[i];
+		if (wdata->result == -EAGAIN)
+			__set_page_dirty_nobuffers(page);
+		else if (wdata->result < 0)
+			SetPageError(page);
+		end_page_writeback(page);
+		page_cache_release(page);
+	}
+	if (wdata->result != -EAGAIN)
+		mapping_set_error(inode->i_mapping, wdata->result);
+	kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+struct cifs_writedata *
+cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+	struct cifs_writedata *wdata;
+
+	/* writedata + number of page pointers */
+	wdata = kzalloc(sizeof(*wdata) +
+			sizeof(struct page *) * nr_pages, GFP_NOFS);
+	if (wdata != NULL) {
+		kref_init(&wdata->refcount);
+		INIT_LIST_HEAD(&wdata->list);
+		init_completion(&wdata->done);
+		INIT_WORK(&wdata->work, complete);
+	}
+	return wdata;
+}
+
+/*
+ * Check the mid_state and signature on received buffer (if any), and queue the
+ * workqueue completion task.
+ */
+static void
+cifs_writev_callback(struct mid_q_entry *mid)
+{
+	struct cifs_writedata *wdata = mid->callback_data;
+	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+	unsigned int written;
+	WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+
+	switch (mid->mid_state) {
+	case MID_RESPONSE_RECEIVED:
+		wdata->result = cifs_check_receive(mid, tcon->ses->server, 0);
+		if (wdata->result != 0)
+			break;
+
+		written = le16_to_cpu(smb->CountHigh);
+		written <<= 16;
+		written += le16_to_cpu(smb->Count);
+		/*
+		 * Mask off high 16 bits when bytes written as returned
+		 * by the server is greater than bytes requested by the
+		 * client. OS/2 servers are known to set incorrect
+		 * CountHigh values.
+		 */
+		if (written > wdata->bytes)
+			written &= 0xFFFF;
+
+		if (written < wdata->bytes)
+			wdata->result = -ENOSPC;
+		else
+			wdata->bytes = written;
+		break;
+	case MID_REQUEST_SUBMITTED:
+	case MID_RETRY_NEEDED:
+		wdata->result = -EAGAIN;
+		break;
+	default:
+		wdata->result = -EIO;
+		break;
+	}
+
+	queue_work(cifsiod_wq, &wdata->work);
+	DeleteMidQEntry(mid);
+	add_credits(tcon->ses->server, 1, 0);
+}
+
+/* cifs_async_writev - send an async write, and set up mid to handle result */
+int
+cifs_async_writev(struct cifs_writedata *wdata,
+		  void (*release)(struct kref *kref))
+{
+	int rc = -EACCES;
+	WRITE_REQ *smb = NULL;
+	int wct;
+	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+	struct kvec iov;
+	struct smb_rqst rqst = { };
+
+	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
+		wct = 14;
+	} else {
+		wct = 12;
+		if (wdata->offset >> 32 > 0) {
+			/* can not handle big offset for old srv */
+			return -EIO;
+		}
+	}
+
+	rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **)&smb);
+	if (rc)
+		goto async_writev_out;
+
+	smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid);
+	smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16));
+
+	smb->AndXCommand = 0xFF;	/* none */
+	smb->Fid = wdata->cfile->fid.netfid;
+	smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF);
+	if (wct == 14)
+		smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32);
+	smb->Reserved = 0xFFFFFFFF;
+	smb->WriteMode = 0;
+	smb->Remaining = 0;
+
+	smb->DataOffset =
+	    cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
+
+	/* 4 for RFC1001 length + 1 for BCC */
+	iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1;
+	iov.iov_base = smb;
+
+	rqst.rq_iov = &iov;
+	rqst.rq_nvec = 1;
+	rqst.rq_pages = wdata->pages;
+	rqst.rq_npages = wdata->nr_pages;
+	rqst.rq_pagesz = wdata->pagesz;
+	rqst.rq_tailsz = wdata->tailsz;
+
+	cifs_dbg(FYI, "async write at %llu %u bytes\n",
+		 wdata->offset, wdata->bytes);
+
+	smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF);
+	smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16);
+
+	if (wct == 14) {
+		inc_rfc1001_len(&smb->hdr, wdata->bytes + 1);
+		put_bcc(wdata->bytes + 1, &smb->hdr);
+	} else {
+		/* wct == 12 */
+		struct smb_com_writex_req *smbw =
+				(struct smb_com_writex_req *)smb;
+		inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5);
+		put_bcc(wdata->bytes + 5, &smbw->hdr);
+		iov.iov_len += 4; /* pad bigger by four bytes */
+	}
+
+	kref_get(&wdata->refcount);
+	rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
+				cifs_writev_callback, wdata, 0);
+
+	if (rc == 0)
+		cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
+	else
+		kref_put(&wdata->refcount, release);
+
+async_writev_out:
+	cifs_small_buf_release(smb);
+	return rc;
+}
+
+int
+CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
+	      unsigned int *nbytes, struct kvec *iov, int n_vec)
+{
+	int rc = -EACCES;
+	WRITE_REQ *pSMB = NULL;
+	int wct;
+	int smb_hdr_len;
+	int resp_buf_type = 0;
+	__u32 pid = io_parms->pid;
+	__u16 netfid = io_parms->netfid;
+	__u64 offset = io_parms->offset;
+	struct cifs_tcon *tcon = io_parms->tcon;
+	unsigned int count = io_parms->length;
+
+	*nbytes = 0;
+
+	cifs_dbg(FYI, "write2 at %lld %d bytes\n", (long long)offset, count);
+
+	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
+		wct = 14;
+	} else {
+		wct = 12;
+		if ((offset >> 32) > 0) {
+			/* can not handle big offset for old srv */
+			return -EIO;
+		}
+	}
+	rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB);
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
+	/* tcon and ses pointer are checked in smb_init */
+	if (tcon->ses->server == NULL)
+		return -ECONNABORTED;
+
+	pSMB->AndXCommand = 0xFF;	/* none */
+	pSMB->Fid = netfid;
+	pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF);
+	if (wct == 14)
+		pSMB->OffsetHigh = cpu_to_le32(offset >> 32);
+	pSMB->Reserved = 0xFFFFFFFF;
+	pSMB->WriteMode = 0;
+	pSMB->Remaining = 0;
+
+	pSMB->DataOffset =
+	    cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
+
+	pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF);
+	pSMB->DataLengthHigh = cpu_to_le16(count >> 16);
+	/* header + 1 byte pad */
+	smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1;
+	if (wct == 14)
+		inc_rfc1001_len(pSMB, count + 1);
+	else /* wct == 12 */
+		inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */
+	if (wct == 14)
+		pSMB->ByteCount = cpu_to_le16(count + 1);
+	else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ {
+		struct smb_com_writex_req *pSMBW =
+				(struct smb_com_writex_req *)pSMB;
+		pSMBW->ByteCount = cpu_to_le16(count + 5);
+	}
+	iov[0].iov_base = pSMB;
+	if (wct == 14)
+		iov[0].iov_len = smb_hdr_len + 4;
+	else /* wct == 12 pad bigger by four bytes */
+		iov[0].iov_len = smb_hdr_len + 8;
+
+
+	rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
+	if (rc) {
+		cifs_dbg(FYI, "Send error Write2 = %d\n", rc);
+	} else if (resp_buf_type == 0) {
+		/* presumably this can not happen, but best to be safe */
+		rc = -EIO;
+	} else {
+		WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base;
+		*nbytes = le16_to_cpu(pSMBr->CountHigh);
+		*nbytes = (*nbytes) << 16;
+		*nbytes += le16_to_cpu(pSMBr->Count);
+
+		/*
+		 * Mask off high 16 bits when bytes written as returned by the
+		 * server is greater than bytes requested by the client. OS/2
+		 * servers are known to set incorrect CountHigh values.
+		 */
+		if (*nbytes > count)
+			*nbytes &= 0xFFFF;
+	}
+
+/*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
+	free_rsp_buf(resp_buf_type, iov[0].iov_base);
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+
+	return rc;
+}
+
+int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
+	       const __u16 netfid, const __u8 lock_type, const __u32 num_unlock,
+	       const __u32 num_lock, LOCKING_ANDX_RANGE *buf)
+{
+	int rc = 0;
+	LOCK_REQ *pSMB = NULL;
+	struct kvec iov[2];
+	int resp_buf_type;
+	__u16 count;
+
+	cifs_dbg(FYI, "cifs_lockv num lock %d num unlock %d\n",
+		 num_lock, num_unlock);
+
+	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
+	if (rc)
+		return rc;
+
+	pSMB->Timeout = 0;
+	pSMB->NumberOfLocks = cpu_to_le16(num_lock);
+	pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock);
+	pSMB->LockType = lock_type;
+	pSMB->AndXCommand = 0xFF; /* none */
+	pSMB->Fid = netfid; /* netfid stays le */
+
+	count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+	inc_rfc1001_len(pSMB, count);
+	pSMB->ByteCount = cpu_to_le16(count);
+
+	iov[0].iov_base = (char *)pSMB;
+	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 -
+			 (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+	iov[1].iov_base = (char *)buf;
+	iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
+	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
+	if (rc)
+		cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc);
+
+	return rc;
+}
+
+int
+CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
+	    const __u16 smb_file_id, const __u32 netpid, const __u64 len,
+	    const __u64 offset, const __u32 numUnlock,
+	    const __u32 numLock, const __u8 lockType,
+	    const bool waitFlag, const __u8 oplock_level)
+{
+	int rc = 0;
+	LOCK_REQ *pSMB = NULL;
+/*	LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */
+	int bytes_returned;
+	int flags = 0;
+	__u16 count;
+
+	cifs_dbg(FYI, "CIFSSMBLock timeout %d numLock %d\n",
+		 (int)waitFlag, numLock);
+	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) {
+		/* no response expected */
+		flags = CIFS_ASYNC_OP | CIFS_OBREAK_OP;
+		pSMB->Timeout = 0;
+	} else if (waitFlag) {
+		flags = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
+		pSMB->Timeout = cpu_to_le32(-1);/* blocking - do not time out */
+	} else {
+		pSMB->Timeout = 0;
+	}
+
+	pSMB->NumberOfLocks = cpu_to_le16(numLock);
+	pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
+	pSMB->LockType = lockType;
+	pSMB->OplockLevel = oplock_level;
+	pSMB->AndXCommand = 0xFF;	/* none */
+	pSMB->Fid = smb_file_id; /* netfid stays le */
+
+	if ((numLock != 0) || (numUnlock != 0)) {
+		pSMB->Locks[0].Pid = cpu_to_le16(netpid);
+		/* BB where to store pid high? */
+		pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len);
+		pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32));
+		pSMB->Locks[0].OffsetLow = cpu_to_le32((u32)offset);
+		pSMB->Locks[0].OffsetHigh = cpu_to_le32((u32)(offset>>32));
+		count = sizeof(LOCKING_ANDX_RANGE);
+	} else {
+		/* oplock break */
+		count = 0;
+	}
+	inc_rfc1001_len(pSMB, count);
+	pSMB->ByteCount = cpu_to_le16(count);
+
+	if (waitFlag) {
+		rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
+			(struct smb_hdr *) pSMB, &bytes_returned);
+		cifs_small_buf_release(pSMB);
+	} else {
+		rc = SendReceiveNoRsp(xid, tcon->ses, (char *)pSMB, flags);
+		/* SMB buffer freed by function above */
+	}
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
+	if (rc)
+		cifs_dbg(FYI, "Send error in Lock = %d\n", rc);
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+	since file handle passed in no longer valid */
+	return rc;
+}
+
+int
+CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
+		const __u16 smb_file_id, const __u32 netpid,
+		const loff_t start_offset, const __u64 len,
+		struct file_lock *pLockData, const __u16 lock_type,
+		const bool waitFlag)
+{
+	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+	struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
+	struct cifs_posix_lock *parm_data;
+	int rc = 0;
+	int timeout = 0;
+	int bytes_returned = 0;
+	int resp_buf_type = 0;
+	__u16 params, param_offset, offset, byte_count, count;
+	struct kvec iov[1];
+
+	cifs_dbg(FYI, "Posix Lock\n");
+
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB;
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	count = sizeof(struct cifs_posix_lock);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	if (pLockData)
+		pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	else
+		pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	parm_data = (struct cifs_posix_lock *)
+			(((char *) &pSMB->hdr.Protocol) + offset);
+
+	parm_data->lock_type = cpu_to_le16(lock_type);
+	if (waitFlag) {
+		timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
+		parm_data->lock_flags = cpu_to_le16(1);
+		pSMB->Timeout = cpu_to_le32(-1);
+	} else
+		pSMB->Timeout = 0;
+
+	parm_data->pid = cpu_to_le32(netpid);
+	parm_data->start = cpu_to_le64(start_offset);
+	parm_data->length = cpu_to_le64(len);  /* normalize negative numbers */
+
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->Fid = smb_file_id;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	if (waitFlag) {
+		rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
+			(struct smb_hdr *) pSMBr, &bytes_returned);
+	} else {
+		iov[0].iov_base = (char *)pSMB;
+		iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
+		rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
+				&resp_buf_type, timeout);
+		pSMB = NULL; /* request buf already freed by SendReceive2. Do
+				not try to free it twice below on exit */
+		pSMBr = (struct smb_com_transaction2_sfi_rsp *)iov[0].iov_base;
+	}
+
+	if (rc) {
+		cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc);
+	} else if (pLockData) {
+		/* lock structure can be returned on get */
+		__u16 data_offset;
+		__u16 data_count;
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) {
+			rc = -EIO;      /* bad smb */
+			goto plk_err_exit;
+		}
+		data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+		data_count  = le16_to_cpu(pSMBr->t2.DataCount);
+		if (data_count < sizeof(struct cifs_posix_lock)) {
+			rc = -EIO;
+			goto plk_err_exit;
+		}
+		parm_data = (struct cifs_posix_lock *)
+			((char *)&pSMBr->hdr.Protocol + data_offset);
+		if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
+			pLockData->fl_type = F_UNLCK;
+		else {
+			if (parm_data->lock_type ==
+					cpu_to_le16(CIFS_RDLCK))
+				pLockData->fl_type = F_RDLCK;
+			else if (parm_data->lock_type ==
+					cpu_to_le16(CIFS_WRLCK))
+				pLockData->fl_type = F_WRLCK;
+
+			pLockData->fl_start = le64_to_cpu(parm_data->start);
+			pLockData->fl_end = pLockData->fl_start +
+					le64_to_cpu(parm_data->length) - 1;
+			pLockData->fl_pid = le32_to_cpu(parm_data->pid);
+		}
+	}
+
+plk_err_exit:
+	if (pSMB)
+		cifs_small_buf_release(pSMB);
+
+	free_rsp_buf(resp_buf_type, iov[0].iov_base);
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+	   since file handle passed in no longer valid */
+
+	return rc;
+}
+
+
+int
+CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
+{
+	int rc = 0;
+	CLOSE_REQ *pSMB = NULL;
+	cifs_dbg(FYI, "In CIFSSMBClose\n");
+
+/* do not retry on dead session on close */
+	rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
+	if (rc == -EAGAIN)
+		return 0;
+	if (rc)
+		return rc;
+
+	pSMB->FileID = (__u16) smb_file_id;
+	pSMB->LastWriteTime = 0xFFFFFFFF;
+	pSMB->ByteCount = 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_closes);
+	if (rc) {
+		if (rc != -EINTR) {
+			/* EINTR is expected when user ctl-c to kill app */
+			cifs_dbg(VFS, "Send error in Close = %d\n", rc);
+		}
+	}
+
+	/* Since session is dead, file will be closed on server already */
+	if (rc == -EAGAIN)
+		rc = 0;
+
+	return rc;
+}
+
+int
+CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
+{
+	int rc = 0;
+	FLUSH_REQ *pSMB = NULL;
+	cifs_dbg(FYI, "In CIFSSMBFlush\n");
+
+	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
+	if (rc)
+		return rc;
+
+	pSMB->FileID = (__u16) smb_file_id;
+	pSMB->ByteCount = 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes);
+	if (rc)
+		cifs_dbg(VFS, "Send error in Flush = %d\n", rc);
+
+	return rc;
+}
+
+int
+CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
+	      const char *from_name, const char *to_name,
+	      struct cifs_sb_info *cifs_sb)
+{
+	int rc = 0;
+	RENAME_REQ *pSMB = NULL;
+	RENAME_RSP *pSMBr = NULL;
+	int bytes_returned;
+	int name_len, name_len2;
+	__u16 count;
+	int remap = cifs_remap(cifs_sb);
+
+	cifs_dbg(FYI, "In CIFSSMBRename\n");
+renameRetry:
+	rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->BufferFormat = 0x04;
+	pSMB->SearchAttributes =
+	    cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM |
+			ATTR_DIRECTORY);
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+					      from_name, PATH_MAX,
+					      cifs_sb->local_nls, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+		pSMB->OldFileName[name_len] = 0x04;	/* pad */
+	/* protocol requires ASCII signature byte on Unicode string */
+		pSMB->OldFileName[name_len + 1] = 0x00;
+		name_len2 =
+		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+				       to_name, PATH_MAX, cifs_sb->local_nls,
+				       remap);
+		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
+		name_len2 *= 2;	/* convert to bytes */
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(from_name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->OldFileName, from_name, name_len);
+		name_len2 = strnlen(to_name, PATH_MAX);
+		name_len2++;	/* trailing null */
+		pSMB->OldFileName[name_len] = 0x04;  /* 2nd buffer format */
+		strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2);
+		name_len2++;	/* trailing null */
+		name_len2++;	/* signature byte */
+	}
+
+	count = 1 /* 1st signature byte */  + name_len + name_len2;
+	inc_rfc1001_len(pSMB, count);
+	pSMB->ByteCount = cpu_to_le16(count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_renames);
+	if (rc)
+		cifs_dbg(FYI, "Send error in rename = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto renameRetry;
+
+	return rc;
+}
+
+int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
+		int netfid, const char *target_name,
+		const struct nls_table *nls_codepage, int remap)
+{
+	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+	struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
+	struct set_file_rename *rename_info;
+	char *data_offset;
+	char dummy_string[30];
+	int rc = 0;
+	int bytes_returned = 0;
+	int len_of_str;
+	__u16 params, param_offset, offset, count, byte_count;
+
+	cifs_dbg(FYI, "Rename to File by handle\n");
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
+			(void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+	rename_info = (struct set_file_rename *) data_offset;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */  + params;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	/* construct random name ".cifs_tmp<inodenum><mid>" */
+	rename_info->overwrite = cpu_to_le32(1);
+	rename_info->root_fid  = 0;
+	/* unicode only call */
+	if (target_name == NULL) {
+		sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid);
+		len_of_str =
+			cifsConvertToUTF16((__le16 *)rename_info->target_name,
+					dummy_string, 24, nls_codepage, remap);
+	} else {
+		len_of_str =
+			cifsConvertToUTF16((__le16 *)rename_info->target_name,
+					target_name, PATH_MAX, nls_codepage,
+					remap);
+	}
+	rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
+	count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
+	byte_count += count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->Fid = netfid;
+	pSMB->InformationLevel =
+		cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&pTcon->stats.cifs_stats.num_t2renames);
+	if (rc)
+		cifs_dbg(FYI, "Send error in Rename (by file handle) = %d\n",
+			 rc);
+
+	cifs_buf_release(pSMB);
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+
+	return rc;
+}
+
+int
+CIFSSMBCopy(const unsigned int xid, struct cifs_tcon *tcon,
+	    const char *fromName, const __u16 target_tid, const char *toName,
+	    const int flags, const struct nls_table *nls_codepage, int remap)
+{
+	int rc = 0;
+	COPY_REQ *pSMB = NULL;
+	COPY_RSP *pSMBr = NULL;
+	int bytes_returned;
+	int name_len, name_len2;
+	__u16 count;
+
+	cifs_dbg(FYI, "In CIFSSMBCopy\n");
+copyRetry:
+	rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
+			(void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->BufferFormat = 0x04;
+	pSMB->Tid2 = target_tid;
+
+	pSMB->Flags = cpu_to_le16(flags & COPY_TREE);
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+					      fromName, PATH_MAX, nls_codepage,
+					      remap);
+		name_len++;     /* trailing null */
+		name_len *= 2;
+		pSMB->OldFileName[name_len] = 0x04;     /* pad */
+		/* protocol requires ASCII signature byte on Unicode string */
+		pSMB->OldFileName[name_len + 1] = 0x00;
+		name_len2 =
+		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+				       toName, PATH_MAX, nls_codepage, remap);
+		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
+		name_len2 *= 2; /* convert to bytes */
+	} else { 	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(fromName, PATH_MAX);
+		name_len++;     /* trailing null */
+		strncpy(pSMB->OldFileName, fromName, name_len);
+		name_len2 = strnlen(toName, PATH_MAX);
+		name_len2++;    /* trailing null */
+		pSMB->OldFileName[name_len] = 0x04;  /* 2nd buffer format */
+		strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2);
+		name_len2++;    /* trailing null */
+		name_len2++;    /* signature byte */
+	}
+
+	count = 1 /* 1st signature byte */  + name_len + name_len2;
+	inc_rfc1001_len(pSMB, count);
+	pSMB->ByteCount = cpu_to_le16(count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in copy = %d with %d files copied\n",
+			 rc, le16_to_cpu(pSMBr->CopyCount));
+	}
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto copyRetry;
+
+	return rc;
+}
+
+int
+CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon,
+		      const char *fromName, const char *toName,
+		      const struct nls_table *nls_codepage, int remap)
+{
+	TRANSACTION2_SPI_REQ *pSMB = NULL;
+	TRANSACTION2_SPI_RSP *pSMBr = NULL;
+	char *data_offset;
+	int name_len;
+	int name_len_target;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, param_offset, offset, byte_count;
+
+	cifs_dbg(FYI, "In Symlink Unix style\n");
+createSymLinkRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fromName,
+				/* find define for this maxpathcomponent */
+					PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(fromName, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, fromName, name_len);
+	}
+	params = 6 + name_len;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len_target =
+		    cifsConvertToUTF16((__le16 *) data_offset, toName,
+				/* find define for this maxpathcomponent */
+					PATH_MAX, nls_codepage, remap);
+		name_len_target++;	/* trailing null */
+		name_len_target *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len_target = strnlen(toName, PATH_MAX);
+		name_len_target++;	/* trailing null */
+		strncpy(data_offset, toName, name_len_target);
+	}
+
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max on data count below from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + name_len_target;
+	pSMB->DataCount = cpu_to_le16(name_len_target);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_symlinks);
+	if (rc)
+		cifs_dbg(FYI, "Send error in SetPathInfo create symlink = %d\n",
+			 rc);
+
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto createSymLinkRetry;
+
+	return rc;
+}
+
+int
+CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
+		       const char *fromName, const char *toName,
+		       const struct nls_table *nls_codepage, int remap)
+{
+	TRANSACTION2_SPI_REQ *pSMB = NULL;
+	TRANSACTION2_SPI_RSP *pSMBr = NULL;
+	char *data_offset;
+	int name_len;
+	int name_len_target;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, param_offset, offset, byte_count;
+
+	cifs_dbg(FYI, "In Create Hard link Unix style\n");
+createHardLinkRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName,
+					      PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(toName, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, toName, name_len);
+	}
+	params = 6 + name_len;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len_target =
+		    cifsConvertToUTF16((__le16 *) data_offset, fromName,
+				       PATH_MAX, nls_codepage, remap);
+		name_len_target++;	/* trailing null */
+		name_len_target *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len_target = strnlen(fromName, PATH_MAX);
+		name_len_target++;	/* trailing null */
+		strncpy(data_offset, fromName, name_len_target);
+	}
+
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max on data count below from sess*/
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + name_len_target;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->DataCount = cpu_to_le16(name_len_target);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks);
+	if (rc)
+		cifs_dbg(FYI, "Send error in SetPathInfo (hard link) = %d\n",
+			 rc);
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto createHardLinkRetry;
+
+	return rc;
+}
+
+int
+CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
+		   const char *from_name, const char *to_name,
+		   struct cifs_sb_info *cifs_sb)
+{
+	int rc = 0;
+	NT_RENAME_REQ *pSMB = NULL;
+	RENAME_RSP *pSMBr = NULL;
+	int bytes_returned;
+	int name_len, name_len2;
+	__u16 count;
+	int remap = cifs_remap(cifs_sb);
+
+	cifs_dbg(FYI, "In CIFSCreateHardLink\n");
+winCreateHardLinkRetry:
+
+	rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->SearchAttributes =
+	    cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM |
+			ATTR_DIRECTORY);
+	pSMB->Flags = cpu_to_le16(CREATE_HARD_LINK);
+	pSMB->ClusterCount = 0;
+
+	pSMB->BufferFormat = 0x04;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->OldFileName, from_name,
+				       PATH_MAX, cifs_sb->local_nls, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+
+		/* protocol specifies ASCII buffer format (0x04) for unicode */
+		pSMB->OldFileName[name_len] = 0x04;
+		pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
+		name_len2 =
+		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+				       to_name, PATH_MAX, cifs_sb->local_nls,
+				       remap);
+		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
+		name_len2 *= 2;	/* convert to bytes */
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(from_name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->OldFileName, from_name, name_len);
+		name_len2 = strnlen(to_name, PATH_MAX);
+		name_len2++;	/* trailing null */
+		pSMB->OldFileName[name_len] = 0x04;	/* 2nd buffer format */
+		strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2);
+		name_len2++;	/* trailing null */
+		name_len2++;	/* signature byte */
+	}
+
+	count = 1 /* string type byte */  + name_len + name_len2;
+	inc_rfc1001_len(pSMB, count);
+	pSMB->ByteCount = cpu_to_le16(count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks);
+	if (rc)
+		cifs_dbg(FYI, "Send error in hard link (NT rename) = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto winCreateHardLinkRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+			const unsigned char *searchName, char **symlinkinfo,
+			const struct nls_table *nls_codepage, int remap)
+{
+/* SMB_QUERY_FILE_UNIX_LINK */
+	TRANSACTION2_QPI_REQ *pSMB = NULL;
+	TRANSACTION2_QPI_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+	__u16 params, byte_count;
+	char *data_start;
+
+	cifs_dbg(FYI, "In QPathSymLinkInfo (Unix) for path %s\n", searchName);
+
+querySymLinkRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+			cifsConvertToUTF16((__le16 *) pSMB->FileName,
+					   searchName, PATH_MAX, nls_codepage,
+					   remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(searchName, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, searchName, name_len);
+	}
+
+	params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+	struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QuerySymLinkInfo = %d\n", rc);
+	} else {
+		/* decode response */
+
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
+			rc = -EIO;
+		else {
+			bool is_unicode;
+			u16 count = le16_to_cpu(pSMBr->t2.DataCount);
+
+			data_start = ((char *) &pSMBr->hdr.Protocol) +
+					   le16_to_cpu(pSMBr->t2.DataOffset);
+
+			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+				is_unicode = true;
+			else
+				is_unicode = false;
+
+			/* BB FIXME investigate remapping reserved chars here */
+			*symlinkinfo = cifs_strndup_from_utf16(data_start,
+					count, is_unicode, nls_codepage);
+			if (!*symlinkinfo)
+				rc = -ENOMEM;
+		}
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto querySymLinkRetry;
+	return rc;
+}
+
+/*
+ *	Recent Windows versions now create symlinks more frequently
+ *	and they use the "reparse point" mechanism below.  We can of course
+ *	do symlinks nicely to Samba and other servers which support the
+ *	CIFS Unix Extensions and we can also do SFU symlinks and "client only"
+ *	"MF" symlinks optionally, but for recent Windows we really need to
+ *	reenable the code below and fix the cifs_symlink callers to handle this.
+ *	In the interim this code has been moved to its own config option so
+ *	it is not compiled in by default until callers fixed up and more tested.
+ */
+int
+CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+		    __u16 fid, char **symlinkinfo,
+		    const struct nls_table *nls_codepage)
+{
+	int rc = 0;
+	int bytes_returned;
+	struct smb_com_transaction_ioctl_req *pSMB;
+	struct smb_com_transaction_ioctl_rsp *pSMBr;
+	bool is_unicode;
+	unsigned int sub_len;
+	char *sub_start;
+	struct reparse_symlink_data *reparse_buf;
+	struct reparse_posix_data *posix_buf;
+	__u32 data_offset, data_count;
+	char *end_of_smb;
+
+	cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid);
+	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->TotalParameterCount = 0 ;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le32(2);
+	/* BB find exact data count max from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
+	pSMB->MaxSetupCount = 4;
+	pSMB->Reserved = 0;
+	pSMB->ParameterOffset = 0;
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 4;
+	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT);
+	pSMB->IsFsctl = 1; /* FSCTL */
+	pSMB->IsRootFlag = 0;
+	pSMB->Fid = fid; /* file handle always le */
+	pSMB->ByteCount = 0;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc);
+		goto qreparse_out;
+	}
+
+	data_offset = le32_to_cpu(pSMBr->DataOffset);
+	data_count = le32_to_cpu(pSMBr->DataCount);
+	if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
+		/* BB also check enough total bytes returned */
+		rc = -EIO;	/* bad smb */
+		goto qreparse_out;
+	}
+	if (!data_count || (data_count > 2048)) {
+		rc = -EIO;
+		cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
+		goto qreparse_out;
+	}
+	end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
+	reparse_buf = (struct reparse_symlink_data *)
+				((char *)&pSMBr->hdr.Protocol + data_offset);
+	if ((char *)reparse_buf >= end_of_smb) {
+		rc = -EIO;
+		goto qreparse_out;
+	}
+	if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) {
+		cifs_dbg(FYI, "NFS style reparse tag\n");
+		posix_buf =  (struct reparse_posix_data *)reparse_buf;
+
+		if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) {
+			cifs_dbg(FYI, "unsupported file type 0x%llx\n",
+				 le64_to_cpu(posix_buf->InodeType));
+			rc = -EOPNOTSUPP;
+			goto qreparse_out;
+		}
+		is_unicode = true;
+		sub_len = le16_to_cpu(reparse_buf->ReparseDataLength);
+		if (posix_buf->PathBuffer + sub_len > end_of_smb) {
+			cifs_dbg(FYI, "reparse buf beyond SMB\n");
+			rc = -EIO;
+			goto qreparse_out;
+		}
+		*symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer,
+				sub_len, is_unicode, nls_codepage);
+		goto qreparse_out;
+	} else if (reparse_buf->ReparseTag !=
+			cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) {
+		rc = -EOPNOTSUPP;
+		goto qreparse_out;
+	}
+
+	/* Reparse tag is NTFS symlink */
+	sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) +
+				reparse_buf->PathBuffer;
+	sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength);
+	if (sub_start + sub_len > end_of_smb) {
+		cifs_dbg(FYI, "reparse buf beyond SMB\n");
+		rc = -EIO;
+		goto qreparse_out;
+	}
+	if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+		is_unicode = true;
+	else
+		is_unicode = false;
+
+	/* BB FIXME investigate remapping reserved chars here */
+	*symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode,
+					       nls_codepage);
+	if (!*symlinkinfo)
+		rc = -ENOMEM;
+qreparse_out:
+	cifs_buf_release(pSMB);
+
+	/*
+	 * Note: On -EAGAIN error only caller can retry on handle based calls
+	 * since file handle passed in no longer valid.
+	 */
+	return rc;
+}
+
+int
+CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		    __u16 fid)
+{
+	int rc = 0;
+	int bytes_returned;
+	struct smb_com_transaction_compr_ioctl_req *pSMB;
+	struct smb_com_transaction_ioctl_rsp *pSMBr;
+
+	cifs_dbg(FYI, "Set compression for %u\n", fid);
+	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+	pSMB->TotalParameterCount = 0;
+	pSMB->TotalDataCount = cpu_to_le32(2);
+	pSMB->MaxParameterCount = 0;
+	pSMB->MaxDataCount = 0;
+	pSMB->MaxSetupCount = 4;
+	pSMB->Reserved = 0;
+	pSMB->ParameterOffset = 0;
+	pSMB->DataCount = cpu_to_le32(2);
+	pSMB->DataOffset =
+		cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
+				compression_state) - 4);  /* 84 */
+	pSMB->SetupCount = 4;
+	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
+	pSMB->ParameterCount = 0;
+	pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION);
+	pSMB->IsFsctl = 1; /* FSCTL */
+	pSMB->IsRootFlag = 0;
+	pSMB->Fid = fid; /* file handle always le */
+	/* 3 byte pad, followed by 2 byte compress state */
+	pSMB->ByteCount = cpu_to_le16(5);
+	inc_rfc1001_len(pSMB, 5);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "Send error in SetCompression = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+
+	/*
+	 * Note: On -EAGAIN error only caller can retry on handle based calls
+	 * since file handle passed in no longer valid.
+	 */
+	return rc;
+}
+
+
+#ifdef CONFIG_CIFS_POSIX
+
+/*Convert an Access Control Entry from wire format to local POSIX xattr format*/
+static void cifs_convert_ace(posix_acl_xattr_entry *ace,
+			     struct cifs_posix_ace *cifs_ace)
+{
+	/* u8 cifs fields do not need le conversion */
+	ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
+	ace->e_tag  = cpu_to_le16(cifs_ace->cifs_e_tag);
+	ace->e_id   = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
+/*
+	cifs_dbg(FYI, "perm %d tag %d id %d\n",
+		 ace->e_perm, ace->e_tag, ace->e_id);
+*/
+
+	return;
+}
+
+/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */
+static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
+			       const int acl_type, const int size_of_data_area)
+{
+	int size =  0;
+	int i;
+	__u16 count;
+	struct cifs_posix_ace *pACE;
+	struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src;
+	posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt;
+
+	if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
+		return -EOPNOTSUPP;
+
+	if (acl_type & ACL_TYPE_ACCESS) {
+		count = le16_to_cpu(cifs_acl->access_entry_count);
+		pACE = &cifs_acl->ace_array[0];
+		size = sizeof(struct cifs_posix_acl);
+		size += sizeof(struct cifs_posix_ace) * count;
+		/* check if we would go beyond end of SMB */
+		if (size_of_data_area < size) {
+			cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n",
+				 size_of_data_area, size);
+			return -EINVAL;
+		}
+	} else if (acl_type & ACL_TYPE_DEFAULT) {
+		count = le16_to_cpu(cifs_acl->access_entry_count);
+		size = sizeof(struct cifs_posix_acl);
+		size += sizeof(struct cifs_posix_ace) * count;
+/* skip past access ACEs to get to default ACEs */
+		pACE = &cifs_acl->ace_array[count];
+		count = le16_to_cpu(cifs_acl->default_entry_count);
+		size += sizeof(struct cifs_posix_ace) * count;
+		/* check if we would go beyond end of SMB */
+		if (size_of_data_area < size)
+			return -EINVAL;
+	} else {
+		/* illegal type */
+		return -EINVAL;
+	}
+
+	size = posix_acl_xattr_size(count);
+	if ((buflen == 0) || (local_acl == NULL)) {
+		/* used to query ACL EA size */
+	} else if (size > buflen) {
+		return -ERANGE;
+	} else /* buffer big enough */ {
+		local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+		for (i = 0; i < count ; i++) {
+			cifs_convert_ace(&local_acl->a_entries[i], pACE);
+			pACE++;
+		}
+	}
+	return size;
+}
+
+static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
+				     const posix_acl_xattr_entry *local_ace)
+{
+	__u16 rc = 0; /* 0 = ACL converted ok */
+
+	cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
+	cifs_ace->cifs_e_tag =  le16_to_cpu(local_ace->e_tag);
+	/* BB is there a better way to handle the large uid? */
+	if (local_ace->e_id == cpu_to_le32(-1)) {
+	/* Probably no need to le convert -1 on any arch but can not hurt */
+		cifs_ace->cifs_uid = cpu_to_le64(-1);
+	} else
+		cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
+/*
+	cifs_dbg(FYI, "perm %d tag %d id %d\n",
+		 ace->e_perm, ace->e_tag, ace->e_id);
+*/
+	return rc;
+}
+
+/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */
+static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
+			       const int buflen, const int acl_type)
+{
+	__u16 rc = 0;
+	struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
+	posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL;
+	int count;
+	int i;
+
+	if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL))
+		return 0;
+
+	count = posix_acl_xattr_count((size_t)buflen);
+	cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n",
+		 count, buflen, le32_to_cpu(local_acl->a_version));
+	if (le32_to_cpu(local_acl->a_version) != 2) {
+		cifs_dbg(FYI, "unknown POSIX ACL version %d\n",
+			 le32_to_cpu(local_acl->a_version));
+		return 0;
+	}
+	cifs_acl->version = cpu_to_le16(1);
+	if (acl_type == ACL_TYPE_ACCESS) {
+		cifs_acl->access_entry_count = cpu_to_le16(count);
+		cifs_acl->default_entry_count = cpu_to_le16(0xFFFF);
+	} else if (acl_type == ACL_TYPE_DEFAULT) {
+		cifs_acl->default_entry_count = cpu_to_le16(count);
+		cifs_acl->access_entry_count = cpu_to_le16(0xFFFF);
+	} else {
+		cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
+		return 0;
+	}
+	for (i = 0; i < count; i++) {
+		rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i],
+					&local_acl->a_entries[i]);
+		if (rc != 0) {
+			/* ACE not converted */
+			break;
+		}
+	}
+	if (rc == 0) {
+		rc = (__u16)(count * sizeof(struct cifs_posix_ace));
+		rc += sizeof(struct cifs_posix_acl);
+		/* BB add check to make sure ACL does not overflow SMB */
+	}
+	return rc;
+}
+
+int
+CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
+		   const unsigned char *searchName,
+		   char *acl_inf, const int buflen, const int acl_type,
+		   const struct nls_table *nls_codepage, int remap)
+{
+/* SMB_QUERY_POSIX_ACL */
+	TRANSACTION2_QPI_REQ *pSMB = NULL;
+	TRANSACTION2_QPI_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName);
+
+queryAclRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		(void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+			cifsConvertToUTF16((__le16 *) pSMB->FileName,
+					   searchName, PATH_MAX, nls_codepage,
+					   remap);
+		name_len++;     /* trailing null */
+		name_len *= 2;
+		pSMB->FileName[name_len] = 0;
+		pSMB->FileName[name_len+1] = 0;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(searchName, PATH_MAX);
+		name_len++;     /* trailing null */
+		strncpy(pSMB->FileName, searchName, name_len);
+	}
+
+	params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(
+		offsetof(struct smb_com_transaction2_qpi_req,
+			 InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc);
+	} else {
+		/* decode response */
+
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
+			rc = -EIO;      /* bad smb */
+		else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			__u16 count = le16_to_cpu(pSMBr->t2.DataCount);
+			rc = cifs_copy_posix_acl(acl_inf,
+				(char *)&pSMBr->hdr.Protocol+data_offset,
+				buflen, acl_type, count);
+		}
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto queryAclRetry;
+	return rc;
+}
+
+int
+CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
+		   const unsigned char *fileName,
+		   const char *local_acl, const int buflen,
+		   const int acl_type,
+		   const struct nls_table *nls_codepage, int remap)
+{
+	struct smb_com_transaction2_spi_req *pSMB = NULL;
+	struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
+	char *parm_data;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, byte_count, data_count, param_offset, offset;
+
+	cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName);
+setAclRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+			cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+					   PATH_MAX, nls_codepage, remap);
+		name_len++;     /* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(fileName, PATH_MAX);
+		name_len++;     /* trailing null */
+		strncpy(pSMB->FileName, fileName, name_len);
+	}
+	params = 6 + name_len;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB size from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+	parm_data = ((char *) &pSMB->hdr.Protocol) + offset;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+
+	/* convert to on the wire format for POSIX ACL */
+	data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type);
+
+	if (data_count == 0) {
+		rc = -EOPNOTSUPP;
+		goto setACLerrorExit;
+	}
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_ACL);
+	byte_count = 3 /* pad */  + params + data_count;
+	pSMB->DataCount = cpu_to_le16(data_count);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc);
+
+setACLerrorExit:
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto setAclRetry;
+	return rc;
+}
+
+/* BB fix tabs in this function FIXME BB */
+int
+CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
+	       const int netfid, __u64 *pExtAttrBits, __u64 *pMask)
+{
+	int rc = 0;
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In GetExtAttr\n");
+	if (tcon == NULL)
+		return -ENODEV;
+
+GetExtAttrRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+			(void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(4000);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "error %d in GetExtAttr\n", rc);
+	} else {
+		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
+			/* If rc should we check for EOPNOSUPP and
+			   disable the srvino flag? or in caller? */
+			rc = -EIO;      /* bad smb */
+		else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			__u16 count = le16_to_cpu(pSMBr->t2.DataCount);
+			struct file_chattr_info *pfinfo;
+			/* BB Do we need a cast or hash here ? */
+			if (count != 16) {
+				cifs_dbg(FYI, "Illegal size ret in GetExtAttr\n");
+				rc = -EIO;
+				goto GetExtAttrOut;
+			}
+			pfinfo = (struct file_chattr_info *)
+				 (data_offset + (char *) &pSMBr->hdr.Protocol);
+			*pExtAttrBits = le64_to_cpu(pfinfo->mode);
+			*pMask = le64_to_cpu(pfinfo->mask);
+		}
+	}
+GetExtAttrOut:
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto GetExtAttrRetry;
+	return rc;
+}
+
+#endif /* CONFIG_POSIX */
+
+#ifdef CONFIG_CIFS_ACL
+/*
+ * Initialize NT TRANSACT SMB into small smb request buffer.  This assumes that
+ * all NT TRANSACTS that we init here have total parm and data under about 400
+ * bytes (to fit in small cifs buffer size), which is the case so far, it
+ * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of
+ * returned setup area) and MaxParameterCount (returned parms size) must be set
+ * by caller
+ */
+static int
+smb_init_nttransact(const __u16 sub_command, const int setup_count,
+		   const int parm_len, struct cifs_tcon *tcon,
+		   void **ret_buf)
+{
+	int rc;
+	__u32 temp_offset;
+	struct smb_com_ntransact_req *pSMB;
+
+	rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
+				(void **)&pSMB);
+	if (rc)
+		return rc;
+	*ret_buf = (void *)pSMB;
+	pSMB->Reserved = 0;
+	pSMB->TotalParameterCount = cpu_to_le32(parm_len);
+	pSMB->TotalDataCount  = 0;
+	pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->DataCount  = pSMB->TotalDataCount;
+	temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
+			(setup_count * 2) - 4 /* for rfc1001 length itself */;
+	pSMB->ParameterOffset = cpu_to_le32(temp_offset);
+	pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
+	pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
+	pSMB->SubCommand = cpu_to_le16(sub_command);
+	return 0;
+}
+
+static int
+validate_ntransact(char *buf, char **ppparm, char **ppdata,
+		   __u32 *pparmlen, __u32 *pdatalen)
+{
+	char *end_of_smb;
+	__u32 data_count, data_offset, parm_count, parm_offset;
+	struct smb_com_ntransact_rsp *pSMBr;
+	u16 bcc;
+
+	*pdatalen = 0;
+	*pparmlen = 0;
+
+	if (buf == NULL)
+		return -EINVAL;
+
+	pSMBr = (struct smb_com_ntransact_rsp *)buf;
+
+	bcc = get_bcc(&pSMBr->hdr);
+	end_of_smb = 2 /* sizeof byte count */ + bcc +
+			(char *)&pSMBr->ByteCount;
+
+	data_offset = le32_to_cpu(pSMBr->DataOffset);
+	data_count = le32_to_cpu(pSMBr->DataCount);
+	parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
+	parm_count = le32_to_cpu(pSMBr->ParameterCount);
+
+	*ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
+	*ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
+
+	/* should we also check that parm and data areas do not overlap? */
+	if (*ppparm > end_of_smb) {
+		cifs_dbg(FYI, "parms start after end of smb\n");
+		return -EINVAL;
+	} else if (parm_count + *ppparm > end_of_smb) {
+		cifs_dbg(FYI, "parm end after end of smb\n");
+		return -EINVAL;
+	} else if (*ppdata > end_of_smb) {
+		cifs_dbg(FYI, "data starts after end of smb\n");
+		return -EINVAL;
+	} else if (data_count + *ppdata > end_of_smb) {
+		cifs_dbg(FYI, "data %p + count %d (%p) past smb end %p start %p\n",
+			 *ppdata, data_count, (data_count + *ppdata),
+			 end_of_smb, pSMBr);
+		return -EINVAL;
+	} else if (parm_count + data_count > bcc) {
+		cifs_dbg(FYI, "parm count and data count larger than SMB\n");
+		return -EINVAL;
+	}
+	*pdatalen = data_count;
+	*pparmlen = parm_count;
+	return 0;
+}
+
+/* Get Security Descriptor (by handle) from remote server for a file or dir */
+int
+CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
+		  struct cifs_ntsd **acl_inf, __u32 *pbuflen)
+{
+	int rc = 0;
+	int buf_type = 0;
+	QUERY_SEC_DESC_REQ *pSMB;
+	struct kvec iov[1];
+
+	cifs_dbg(FYI, "GetCifsACL\n");
+
+	*pbuflen = 0;
+	*acl_inf = NULL;
+
+	rc = smb_init_nttransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0,
+			8 /* parm len */, tcon, (void **) &pSMB);
+	if (rc)
+		return rc;
+
+	pSMB->MaxParameterCount = cpu_to_le32(4);
+	/* BB TEST with big acls that might need to be e.g. larger than 16K */
+	pSMB->MaxSetupCount = 0;
+	pSMB->Fid = fid; /* file handle always le */
+	pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP |
+				     CIFS_ACL_DACL);
+	pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */
+	inc_rfc1001_len(pSMB, 11);
+	iov[0].iov_base = (char *)pSMB;
+	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
+
+	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
+			 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc);
+	} else {                /* decode response */
+		__le32 *parm;
+		__u32 parm_len;
+		__u32 acl_len;
+		struct smb_com_ntransact_rsp *pSMBr;
+		char *pdata;
+
+/* validate_nttransact */
+		rc = validate_ntransact(iov[0].iov_base, (char **)&parm,
+					&pdata, &parm_len, pbuflen);
+		if (rc)
+			goto qsec_out;
+		pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
+
+		cifs_dbg(FYI, "smb %p parm %p data %p\n",
+			 pSMBr, parm, *acl_inf);
+
+		if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
+			rc = -EIO;      /* bad smb */
+			*pbuflen = 0;
+			goto qsec_out;
+		}
+
+/* BB check that data area is minimum length and as big as acl_len */
+
+		acl_len = le32_to_cpu(*parm);
+		if (acl_len != *pbuflen) {
+			cifs_dbg(VFS, "acl length %d does not match %d\n",
+				 acl_len, *pbuflen);
+			if (*pbuflen > acl_len)
+				*pbuflen = acl_len;
+		}
+
+		/* check if buffer is big enough for the acl
+		   header followed by the smallest SID */
+		if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
+		    (*pbuflen >= 64 * 1024)) {
+			cifs_dbg(VFS, "bad acl length %d\n", *pbuflen);
+			rc = -EINVAL;
+			*pbuflen = 0;
+		} else {
+			*acl_inf = kmemdup(pdata, *pbuflen, GFP_KERNEL);
+			if (*acl_inf == NULL) {
+				*pbuflen = 0;
+				rc = -ENOMEM;
+			}
+		}
+	}
+qsec_out:
+	free_rsp_buf(buf_type, iov[0].iov_base);
+/*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
+	return rc;
+}
+
+int
+CIFSSMBSetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
+			struct cifs_ntsd *pntsd, __u32 acllen, int aclflag)
+{
+	__u16 byte_count, param_count, data_count, param_offset, data_offset;
+	int rc = 0;
+	int bytes_returned = 0;
+	SET_SEC_DESC_REQ *pSMB = NULL;
+	void *pSMBr;
+
+setCifsAclRetry:
+	rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB, &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+
+	param_count = 8;
+	param_offset = offsetof(struct smb_com_transaction_ssec_req, Fid) - 4;
+	data_count = acllen;
+	data_offset = param_offset + param_count;
+	byte_count = 3 /* pad */  + param_count;
+
+	pSMB->DataCount = cpu_to_le32(data_count);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->MaxParameterCount = cpu_to_le32(4);
+	pSMB->MaxDataCount = cpu_to_le32(16384);
+	pSMB->ParameterCount = cpu_to_le32(param_count);
+	pSMB->ParameterOffset = cpu_to_le32(param_offset);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->DataOffset = cpu_to_le32(data_offset);
+	pSMB->SetupCount = 0;
+	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_SET_SECURITY_DESC);
+	pSMB->ByteCount = cpu_to_le16(byte_count+data_count);
+
+	pSMB->Fid = fid; /* file handle always le */
+	pSMB->Reserved2 = 0;
+	pSMB->AclFlags = cpu_to_le32(aclflag);
+
+	if (pntsd && acllen) {
+		memcpy((char *)pSMBr + offsetof(struct smb_hdr, Protocol) +
+				data_offset, pntsd, acllen);
+		inc_rfc1001_len(pSMB, byte_count + data_count);
+	} else
+		inc_rfc1001_len(pSMB, byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
+
+	cifs_dbg(FYI, "SetCIFSACL bytes_returned: %d, rc: %d\n",
+		 bytes_returned, rc);
+	if (rc)
+		cifs_dbg(FYI, "Set CIFS ACL returned %d\n", rc);
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto setCifsAclRetry;
+
+	return (rc);
+}
+
+#endif /* CONFIG_CIFS_ACL */
+
+/* Legacy Query Path Information call for lookup to old servers such
+   as Win9x/WinME */
+int
+SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon,
+		    const char *search_name, FILE_ALL_INFO *data,
+		    const struct nls_table *nls_codepage, int remap)
+{
+	QUERY_INFORMATION_REQ *pSMB;
+	QUERY_INFORMATION_RSP *pSMBr;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+
+	cifs_dbg(FYI, "In SMBQPath path %s\n", search_name);
+QInfRetry:
+	rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+			cifsConvertToUTF16((__le16 *) pSMB->FileName,
+					   search_name, PATH_MAX, nls_codepage,
+					   remap);
+		name_len++;     /* trailing null */
+		name_len *= 2;
+	} else {
+		name_len = strnlen(search_name, PATH_MAX);
+		name_len++;     /* trailing null */
+		strncpy(pSMB->FileName, search_name, name_len);
+	}
+	pSMB->BufferFormat = 0x04;
+	name_len++; /* account for buffer type byte */
+	inc_rfc1001_len(pSMB, (__u16)name_len);
+	pSMB->ByteCount = cpu_to_le16(name_len);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QueryInfo = %d\n", rc);
+	} else if (data) {
+		struct timespec ts;
+		__u32 time = le32_to_cpu(pSMBr->last_write_time);
+
+		/* decode response */
+		/* BB FIXME - add time zone adjustment BB */
+		memset(data, 0, sizeof(FILE_ALL_INFO));
+		ts.tv_nsec = 0;
+		ts.tv_sec = time;
+		/* decode time fields */
+		data->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts));
+		data->LastWriteTime = data->ChangeTime;
+		data->LastAccessTime = 0;
+		data->AllocationSize =
+			cpu_to_le64(le32_to_cpu(pSMBr->size));
+		data->EndOfFile = data->AllocationSize;
+		data->Attributes =
+			cpu_to_le32(le16_to_cpu(pSMBr->attr));
+	} else
+		rc = -EIO; /* bad buffer passed in */
+
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto QInfRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBQFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		 u16 netfid, FILE_ALL_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+QFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QFileInfo = %d", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc) /* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (get_bcc(&pSMBr->hdr) < 40)
+			rc = -EIO;	/* bad smb */
+		else if (pFindData) {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset, sizeof(FILE_ALL_INFO));
+		} else
+		    rc = -ENOMEM;
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto QFileInfoRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		 const char *search_name, FILE_ALL_INFO *data,
+		 int legacy /* old style infolevel */,
+		 const struct nls_table *nls_codepage, int remap)
+{
+	/* level 263 SMB_QUERY_FILE_ALL_INFO */
+	TRANSACTION2_QPI_REQ *pSMB = NULL;
+	TRANSACTION2_QPI_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+	__u16 params, byte_count;
+
+	/* cifs_dbg(FYI, "In QPathInfo path %s\n", search_name); */
+QPathInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, search_name,
+				       PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(search_name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, search_name, name_len);
+	}
+
+	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+	struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	if (legacy)
+		pSMB->InformationLevel = cpu_to_le16(SMB_INFO_STANDARD);
+	else
+		pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc) /* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (!legacy && get_bcc(&pSMBr->hdr) < 40)
+			rc = -EIO;	/* bad smb */
+		else if (legacy && get_bcc(&pSMBr->hdr) < 24)
+			rc = -EIO;  /* 24 or 26 expected but we do not read
+					last field */
+		else if (data) {
+			int size;
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+
+			/*
+			 * On legacy responses we do not read the last field,
+			 * EAsize, fortunately since it varies by subdialect and
+			 * also note it differs on Set vs Get, ie two bytes or 4
+			 * bytes depending but we don't care here.
+			 */
+			if (legacy)
+				size = sizeof(FILE_INFO_STANDARD);
+			else
+				size = sizeof(FILE_ALL_INFO);
+			memcpy((char *) data, (char *) &pSMBr->hdr.Protocol +
+			       data_offset, size);
+		} else
+		    rc = -ENOMEM;
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto QPathInfoRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBUnixQFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+UnixQFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
+			cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n");
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset,
+			       sizeof(FILE_UNIX_BASIC_INFO));
+		}
+	}
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto UnixQFileInfoRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBUnixQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		     const unsigned char *searchName,
+		     FILE_UNIX_BASIC_INFO *pFindData,
+		     const struct nls_table *nls_codepage, int remap)
+{
+/* SMB_QUERY_FILE_UNIX_BASIC */
+	TRANSACTION2_QPI_REQ *pSMB = NULL;
+	TRANSACTION2_QPI_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned = 0;
+	int name_len;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In QPathInfo (Unix) the path %s\n", searchName);
+UnixQPathInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+				       PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(searchName, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, searchName, name_len);
+	}
+
+	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+	struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
+			cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n");
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset,
+			       sizeof(FILE_UNIX_BASIC_INFO));
+		}
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto UnixQPathInfoRetry;
+
+	return rc;
+}
+
+/* xid, tcon, searchName and codepage are input parms, rest are returned */
+int
+CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
+	      const char *searchName, struct cifs_sb_info *cifs_sb,
+	      __u16 *pnetfid, __u16 search_flags,
+	      struct cifs_search_info *psrch_inf, bool msearch)
+{
+/* level 257 SMB_ */
+	TRANSACTION2_FFIRST_REQ *pSMB = NULL;
+	TRANSACTION2_FFIRST_RSP *pSMBr = NULL;
+	T2_FFIRST_RSP_PARMS *parms;
+	int rc = 0;
+	int bytes_returned = 0;
+	int name_len, remap;
+	__u16 params, byte_count;
+	struct nls_table *nls_codepage;
+
+	cifs_dbg(FYI, "In FindFirst for %s\n", searchName);
+
+findFirstRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	nls_codepage = cifs_sb->local_nls;
+	remap = cifs_remap(cifs_sb);
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+				       PATH_MAX, nls_codepage, remap);
+		/* We can not add the asterik earlier in case
+		it got remapped to 0xF03A as if it were part of the
+		directory name instead of a wildcard */
+		name_len *= 2;
+		if (msearch) {
+			pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb);
+			pSMB->FileName[name_len+1] = 0;
+			pSMB->FileName[name_len+2] = '*';
+			pSMB->FileName[name_len+3] = 0;
+			name_len += 4; /* now the trailing null */
+			/* null terminate just in case */
+			pSMB->FileName[name_len] = 0;
+			pSMB->FileName[name_len+1] = 0;
+			name_len += 2;
+		}
+	} else {	/* BB add check for overrun of SMB buf BB */
+		name_len = strnlen(searchName, PATH_MAX);
+/* BB fix here and in unicode clause above ie
+		if (name_len > buffersize-header)
+			free buffer exit; BB */
+		strncpy(pSMB->FileName, searchName, name_len);
+		if (msearch) {
+			pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb);
+			pSMB->FileName[name_len+1] = '*';
+			pSMB->FileName[name_len+2] = 0;
+			name_len += 3;
+		}
+	}
+
+	params = 12 + name_len /* includes null */ ;
+	pSMB->TotalDataCount = 0;	/* no EAs */
+	pSMB->MaxParameterCount = cpu_to_le16(10);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(
+	      offsetof(struct smb_com_transaction2_ffirst_req, SearchAttributes)
+		- 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;	/* one byte, no need to make endian neutral */
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_FIRST);
+	pSMB->SearchAttributes =
+	    cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM |
+			ATTR_DIRECTORY);
+	pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO));
+	pSMB->SearchFlags = cpu_to_le16(search_flags);
+	pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
+
+	/* BB what should we set StorageType to? Does it matter? BB */
+	pSMB->SearchStorageType = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_ffirst);
+
+	if (rc) {/* BB add logic to retry regular search if Unix search
+			rejected unexpectedly by server */
+		/* BB Add code to handle unsupported level rc */
+		cifs_dbg(FYI, "Error in FindFirst = %d\n", rc);
+
+		cifs_buf_release(pSMB);
+
+		/* BB eventually could optimize out free and realloc of buf */
+		/*    for this case */
+		if (rc == -EAGAIN)
+			goto findFirstRetry;
+	} else { /* decode response */
+		/* BB remember to free buffer if error BB */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		if (rc == 0) {
+			unsigned int lnoff;
+
+			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+				psrch_inf->unicode = true;
+			else
+				psrch_inf->unicode = false;
+
+			psrch_inf->ntwrk_buf_start = (char *)pSMBr;
+			psrch_inf->smallBuf = 0;
+			psrch_inf->srch_entries_start =
+				(char *) &pSMBr->hdr.Protocol +
+					le16_to_cpu(pSMBr->t2.DataOffset);
+			parms = (T2_FFIRST_RSP_PARMS *)((char *) &pSMBr->hdr.Protocol +
+			       le16_to_cpu(pSMBr->t2.ParameterOffset));
+
+			if (parms->EndofSearch)
+				psrch_inf->endOfSearch = true;
+			else
+				psrch_inf->endOfSearch = false;
+
+			psrch_inf->entries_in_buffer =
+					le16_to_cpu(parms->SearchCount);
+			psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
+				psrch_inf->entries_in_buffer;
+			lnoff = le16_to_cpu(parms->LastNameOffset);
+			if (CIFSMaxBufSize < lnoff) {
+				cifs_dbg(VFS, "ignoring corrupt resume name\n");
+				psrch_inf->last_entry = NULL;
+				return rc;
+			}
+
+			psrch_inf->last_entry = psrch_inf->srch_entries_start +
+							lnoff;
+
+			if (pnetfid)
+				*pnetfid = parms->SearchHandle;
+		} else {
+			cifs_buf_release(pSMB);
+		}
+	}
+
+	return rc;
+}
+
+int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
+		 __u16 searchHandle, __u16 search_flags,
+		 struct cifs_search_info *psrch_inf)
+{
+	TRANSACTION2_FNEXT_REQ *pSMB = NULL;
+	TRANSACTION2_FNEXT_RSP *pSMBr = NULL;
+	T2_FNEXT_RSP_PARMS *parms;
+	char *response_data;
+	int rc = 0;
+	int bytes_returned;
+	unsigned int name_len;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In FindNext\n");
+
+	if (psrch_inf->endOfSearch)
+		return -ENOENT;
+
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		(void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 14; /* includes 2 bytes of null string, converted to LE below*/
+	byte_count = 0;
+	pSMB->TotalDataCount = 0;       /* no EAs */
+	pSMB->MaxParameterCount = cpu_to_le16(8);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset =  cpu_to_le16(
+	      offsetof(struct smb_com_transaction2_fnext_req,SearchHandle) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_NEXT);
+	pSMB->SearchHandle = searchHandle;      /* always kept as le */
+	pSMB->SearchCount =
+		cpu_to_le16(CIFSMaxBufSize / sizeof(FILE_UNIX_INFO));
+	pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
+	pSMB->ResumeKey = psrch_inf->resume_key;
+	pSMB->SearchFlags = cpu_to_le16(search_flags);
+
+	name_len = psrch_inf->resume_name_len;
+	params += name_len;
+	if (name_len < PATH_MAX) {
+		memcpy(pSMB->ResumeFileName, psrch_inf->presume_name, name_len);
+		byte_count += name_len;
+		/* 14 byte parm len above enough for 2 byte null terminator */
+		pSMB->ResumeFileName[name_len] = 0;
+		pSMB->ResumeFileName[name_len+1] = 0;
+	} else {
+		rc = -EINVAL;
+		goto FNext2_err_exit;
+	}
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			(struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_fnext);
+	if (rc) {
+		if (rc == -EBADF) {
+			psrch_inf->endOfSearch = true;
+			cifs_buf_release(pSMB);
+			rc = 0; /* search probably was closed at end of search*/
+		} else
+			cifs_dbg(FYI, "FindNext returned = %d\n", rc);
+	} else {                /* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc == 0) {
+			unsigned int lnoff;
+
+			/* BB fixme add lock for file (srch_info) struct here */
+			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+				psrch_inf->unicode = true;
+			else
+				psrch_inf->unicode = false;
+			response_data = (char *) &pSMBr->hdr.Protocol +
+			       le16_to_cpu(pSMBr->t2.ParameterOffset);
+			parms = (T2_FNEXT_RSP_PARMS *)response_data;
+			response_data = (char *)&pSMBr->hdr.Protocol +
+				le16_to_cpu(pSMBr->t2.DataOffset);
+			if (psrch_inf->smallBuf)
+				cifs_small_buf_release(
+					psrch_inf->ntwrk_buf_start);
+			else
+				cifs_buf_release(psrch_inf->ntwrk_buf_start);
+			psrch_inf->srch_entries_start = response_data;
+			psrch_inf->ntwrk_buf_start = (char *)pSMB;
+			psrch_inf->smallBuf = 0;
+			if (parms->EndofSearch)
+				psrch_inf->endOfSearch = true;
+			else
+				psrch_inf->endOfSearch = false;
+			psrch_inf->entries_in_buffer =
+						le16_to_cpu(parms->SearchCount);
+			psrch_inf->index_of_last_entry +=
+				psrch_inf->entries_in_buffer;
+			lnoff = le16_to_cpu(parms->LastNameOffset);
+			if (CIFSMaxBufSize < lnoff) {
+				cifs_dbg(VFS, "ignoring corrupt resume name\n");
+				psrch_inf->last_entry = NULL;
+				return rc;
+			} else
+				psrch_inf->last_entry =
+					psrch_inf->srch_entries_start + lnoff;
+
+/*  cifs_dbg(FYI, "fnxt2 entries in buf %d index_of_last %d\n",
+    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
+
+			/* BB fixme add unlock here */
+		}
+
+	}
+
+	/* BB On error, should we leave previous search buf (and count and
+	last entry fields) intact or free the previous one? */
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+	since file handle passed in no longer valid */
+FNext2_err_exit:
+	if (rc != 0)
+		cifs_buf_release(pSMB);
+	return rc;
+}
+
+int
+CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
+	      const __u16 searchHandle)
+{
+	int rc = 0;
+	FINDCLOSE_REQ *pSMB = NULL;
+
+	cifs_dbg(FYI, "In CIFSSMBFindClose\n");
+	rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
+
+	/* no sense returning error if session restarted
+		as file handle has been closed */
+	if (rc == -EAGAIN)
+		return 0;
+	if (rc)
+		return rc;
+
+	pSMB->FileID = searchHandle;
+	pSMB->ByteCount = 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	if (rc)
+		cifs_dbg(VFS, "Send error in FindClose = %d\n", rc);
+
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_fclose);
+
+	/* Since session is dead, search handle closed on server already */
+	if (rc == -EAGAIN)
+		rc = 0;
+
+	return rc;
+}
+
+int
+CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
+		      const char *search_name, __u64 *inode_number,
+		      const struct nls_table *nls_codepage, int remap)
+{
+	int rc = 0;
+	TRANSACTION2_QPI_REQ *pSMB = NULL;
+	TRANSACTION2_QPI_RSP *pSMBr = NULL;
+	int name_len, bytes_returned;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In GetSrvInodeNum for %s\n", search_name);
+	if (tcon == NULL)
+		return -ENODEV;
+
+GetInodeNumberRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+			cifsConvertToUTF16((__le16 *) pSMB->FileName,
+					   search_name, PATH_MAX, nls_codepage,
+					   remap);
+		name_len++;     /* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(search_name, PATH_MAX);
+		name_len++;     /* trailing null */
+		strncpy(pSMB->FileName, search_name, name_len);
+	}
+
+	params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+		struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "error %d in QueryInternalInfo\n", rc);
+	} else {
+		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
+			/* If rc should we check for EOPNOSUPP and
+			disable the srvino flag? or in caller? */
+			rc = -EIO;      /* bad smb */
+		else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			__u16 count = le16_to_cpu(pSMBr->t2.DataCount);
+			struct file_internal_info *pfinfo;
+			/* BB Do we need a cast or hash here ? */
+			if (count < 8) {
+				cifs_dbg(FYI, "Illegal size ret in QryIntrnlInf\n");
+				rc = -EIO;
+				goto GetInodeNumOut;
+			}
+			pfinfo = (struct file_internal_info *)
+				(data_offset + (char *) &pSMBr->hdr.Protocol);
+			*inode_number = le64_to_cpu(pfinfo->UniqueId);
+		}
+	}
+GetInodeNumOut:
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto GetInodeNumberRetry;
+	return rc;
+}
+
+/* parses DFS refferal V3 structure
+ * caller is responsible for freeing target_nodes
+ * returns:
+ * 	on success - 0
+ *	on failure - errno
+ */
+static int
+parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
+		unsigned int *num_of_nodes,
+		struct dfs_info3_param **target_nodes,
+		const struct nls_table *nls_codepage, int remap,
+		const char *searchName)
+{
+	int i, rc = 0;
+	char *data_end;
+	bool is_unicode;
+	struct dfs_referral_level_3 *ref;
+
+	if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+		is_unicode = true;
+	else
+		is_unicode = false;
+	*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
+
+	if (*num_of_nodes < 1) {
+		cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n",
+			 *num_of_nodes);
+		rc = -EINVAL;
+		goto parse_DFS_referrals_exit;
+	}
+
+	ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
+	if (ref->VersionNumber != cpu_to_le16(3)) {
+		cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n",
+			 le16_to_cpu(ref->VersionNumber));
+		rc = -EINVAL;
+		goto parse_DFS_referrals_exit;
+	}
+
+	/* get the upper boundary of the resp buffer */
+	data_end = (char *)(&(pSMBr->PathConsumed)) +
+				le16_to_cpu(pSMBr->t2.DataCount);
+
+	cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n",
+		 *num_of_nodes, le32_to_cpu(pSMBr->DFSFlags));
+
+	*target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param),
+				GFP_KERNEL);
+	if (*target_nodes == NULL) {
+		rc = -ENOMEM;
+		goto parse_DFS_referrals_exit;
+	}
+
+	/* collect necessary data from referrals */
+	for (i = 0; i < *num_of_nodes; i++) {
+		char *temp;
+		int max_len;
+		struct dfs_info3_param *node = (*target_nodes)+i;
+
+		node->flags = le32_to_cpu(pSMBr->DFSFlags);
+		if (is_unicode) {
+			__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
+						GFP_KERNEL);
+			if (tmp == NULL) {
+				rc = -ENOMEM;
+				goto parse_DFS_referrals_exit;
+			}
+			cifsConvertToUTF16((__le16 *) tmp, searchName,
+					   PATH_MAX, nls_codepage, remap);
+			node->path_consumed = cifs_utf16_bytes(tmp,
+					le16_to_cpu(pSMBr->PathConsumed),
+					nls_codepage);
+			kfree(tmp);
+		} else
+			node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+
+		node->server_type = le16_to_cpu(ref->ServerType);
+		node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags);
+
+		/* copy DfsPath */
+		temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
+		max_len = data_end - temp;
+		node->path_name = cifs_strndup_from_utf16(temp, max_len,
+						is_unicode, nls_codepage);
+		if (!node->path_name) {
+			rc = -ENOMEM;
+			goto parse_DFS_referrals_exit;
+		}
+
+		/* copy link target UNC */
+		temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
+		max_len = data_end - temp;
+		node->node_name = cifs_strndup_from_utf16(temp, max_len,
+						is_unicode, nls_codepage);
+		if (!node->node_name) {
+			rc = -ENOMEM;
+			goto parse_DFS_referrals_exit;
+		}
+
+		ref++;
+	}
+
+parse_DFS_referrals_exit:
+	if (rc) {
+		free_dfs_info_array(*target_nodes, *num_of_nodes);
+		*target_nodes = NULL;
+		*num_of_nodes = 0;
+	}
+	return rc;
+}
+
+int
+CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
+		const char *search_name, struct dfs_info3_param **target_nodes,
+		unsigned int *num_of_nodes,
+		const struct nls_table *nls_codepage, int remap)
+{
+/* TRANS2_GET_DFS_REFERRAL */
+	TRANSACTION2_GET_DFS_REFER_REQ *pSMB = NULL;
+	TRANSACTION2_GET_DFS_REFER_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+	__u16 params, byte_count;
+	*num_of_nodes = 0;
+	*target_nodes = NULL;
+
+	cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name);
+	if (ses == NULL)
+		return -ENODEV;
+getDFSRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, NULL, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	/* server pointer checked in called function,
+	but should never be null here anyway */
+	pSMB->hdr.Mid = get_next_mid(ses->server);
+	pSMB->hdr.Tid = ses->ipc_tid;
+	pSMB->hdr.Uid = ses->Suid;
+	if (ses->capabilities & CAP_STATUS32)
+		pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS;
+	if (ses->capabilities & CAP_DFS)
+		pSMB->hdr.Flags2 |= SMBFLG2_DFS;
+
+	if (ses->capabilities & CAP_UNICODE) {
+		pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->RequestFileName,
+				       search_name, PATH_MAX, nls_codepage,
+				       remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(search_name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->RequestFileName, search_name, name_len);
+	}
+
+	if (ses->server->sign)
+		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+	pSMB->hdr.Uid = ses->Suid;
+
+	params = 2 /* level */  + name_len /*includes null */ ;
+	pSMB->TotalDataCount = 0;
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->MaxParameterCount = 0;
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+	  struct smb_com_transaction2_get_dfs_refer_req, MaxReferralLevel) - 4);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_GET_DFS_REFERRAL);
+	byte_count = params + 3 /* pad */ ;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->MaxReferralLevel = cpu_to_le16(3);
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in GetDFSRefer = %d\n", rc);
+		goto GetDFSRefExit;
+	}
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+	/* BB Also check if enough total bytes returned? */
+	if (rc || get_bcc(&pSMBr->hdr) < 17) {
+		rc = -EIO;      /* bad smb */
+		goto GetDFSRefExit;
+	}
+
+	cifs_dbg(FYI, "Decoding GetDFSRefer response BCC: %d  Offset %d\n",
+		 get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset));
+
+	/* parse returned result into more usable form */
+	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
+				 target_nodes, nls_codepage, remap,
+				 search_name);
+
+GetDFSRefExit:
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto getDFSRetry;
+
+	return rc;
+}
+
+/* Query File System Info such as free space to old servers such as Win 9x */
+int
+SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
+	      struct kstatfs *FSData)
+{
+/* level 0x01 SMB_QUERY_FILE_SYSTEM_INFO */
+	TRANSACTION2_QFSI_REQ *pSMB = NULL;
+	TRANSACTION2_QFSI_RSP *pSMBr = NULL;
+	FILE_SYSTEM_ALLOC_INFO *response_data;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "OldQFSInfo\n");
+oldQFSInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		(void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2;     /* level */
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+	struct smb_com_transaction2_qfsi_req, InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
+	pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION);
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc);
+	} else {                /* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) < 18)
+			rc = -EIO;      /* bad smb */
+		else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			cifs_dbg(FYI, "qfsinf resp BCC: %d  Offset %d\n",
+				 get_bcc(&pSMBr->hdr), data_offset);
+
+			response_data = (FILE_SYSTEM_ALLOC_INFO *)
+				(((char *) &pSMBr->hdr.Protocol) + data_offset);
+			FSData->f_bsize =
+				le16_to_cpu(response_data->BytesPerSector) *
+				le32_to_cpu(response_data->
+					SectorsPerAllocationUnit);
+			FSData->f_blocks =
+			       le32_to_cpu(response_data->TotalAllocationUnits);
+			FSData->f_bfree = FSData->f_bavail =
+				le32_to_cpu(response_data->FreeAllocationUnits);
+			cifs_dbg(FYI, "Blocks: %lld  Free: %lld Block size %ld\n",
+				 (unsigned long long)FSData->f_blocks,
+				 (unsigned long long)FSData->f_bfree,
+				 FSData->f_bsize);
+		}
+	}
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto oldQFSInfoRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
+	       struct kstatfs *FSData)
+{
+/* level 0x103 SMB_QUERY_FILE_SYSTEM_INFO */
+	TRANSACTION2_QFSI_REQ *pSMB = NULL;
+	TRANSACTION2_QFSI_RSP *pSMBr = NULL;
+	FILE_SYSTEM_INFO *response_data;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In QFSInfo\n");
+QFSInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2;	/* level */
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+		struct smb_com_transaction2_qfsi_req, InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO);
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) < 24)
+			rc = -EIO;	/* bad smb */
+		else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+
+			response_data =
+			    (FILE_SYSTEM_INFO
+			     *) (((char *) &pSMBr->hdr.Protocol) +
+				 data_offset);
+			FSData->f_bsize =
+			    le32_to_cpu(response_data->BytesPerSector) *
+			    le32_to_cpu(response_data->
+					SectorsPerAllocationUnit);
+			FSData->f_blocks =
+			    le64_to_cpu(response_data->TotalAllocationUnits);
+			FSData->f_bfree = FSData->f_bavail =
+			    le64_to_cpu(response_data->FreeAllocationUnits);
+			cifs_dbg(FYI, "Blocks: %lld  Free: %lld Block size %ld\n",
+				 (unsigned long long)FSData->f_blocks,
+				 (unsigned long long)FSData->f_bfree,
+				 FSData->f_bsize);
+		}
+	}
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto QFSInfoRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBQFSAttributeInfo(const unsigned int xid, struct cifs_tcon *tcon)
+{
+/* level 0x105  SMB_QUERY_FILE_SYSTEM_INFO */
+	TRANSACTION2_QFSI_REQ *pSMB = NULL;
+	TRANSACTION2_QFSI_RSP *pSMBr = NULL;
+	FILE_SYSTEM_ATTRIBUTE_INFO *response_data;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In QFSAttributeInfo\n");
+QFSAttributeRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2;	/* level */
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+		struct smb_com_transaction2_qfsi_req, InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO);
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(VFS, "Send error in QFSAttributeInfo = %d\n", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) < 13) {
+			/* BB also check if enough bytes returned */
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			response_data =
+			    (FILE_SYSTEM_ATTRIBUTE_INFO
+			     *) (((char *) &pSMBr->hdr.Protocol) +
+				 data_offset);
+			memcpy(&tcon->fsAttrInfo, response_data,
+			       sizeof(FILE_SYSTEM_ATTRIBUTE_INFO));
+		}
+	}
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto QFSAttributeRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon)
+{
+/* level 0x104 SMB_QUERY_FILE_SYSTEM_INFO */
+	TRANSACTION2_QFSI_REQ *pSMB = NULL;
+	TRANSACTION2_QFSI_RSP *pSMBr = NULL;
+	FILE_SYSTEM_DEVICE_INFO *response_data;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In QFSDeviceInfo\n");
+QFSDeviceRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2;	/* level */
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+		struct smb_com_transaction2_qfsi_req, InformationLevel) - 4);
+
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO);
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QFSDeviceInfo = %d\n", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) <
+			  sizeof(FILE_SYSTEM_DEVICE_INFO))
+			rc = -EIO;	/* bad smb */
+		else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			response_data =
+			    (FILE_SYSTEM_DEVICE_INFO *)
+				(((char *) &pSMBr->hdr.Protocol) +
+				 data_offset);
+			memcpy(&tcon->fsDevInfo, response_data,
+			       sizeof(FILE_SYSTEM_DEVICE_INFO));
+		}
+	}
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto QFSDeviceRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon)
+{
+/* level 0x200  SMB_QUERY_CIFS_UNIX_INFO */
+	TRANSACTION2_QFSI_REQ *pSMB = NULL;
+	TRANSACTION2_QFSI_RSP *pSMBr = NULL;
+	FILE_SYSTEM_UNIX_INFO *response_data;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In QFSUnixInfo\n");
+QFSUnixRetry:
+	rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
+				   (void **) &pSMB, (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2;	/* level */
+	pSMB->TotalDataCount = 0;
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(100);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	byte_count = params + 1 /* pad */ ;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(struct
+			smb_com_transaction2_qfsi_req, InformationLevel) - 4);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO);
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(VFS, "Send error in QFSUnixInfo = %d\n", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) < 13) {
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			response_data =
+			    (FILE_SYSTEM_UNIX_INFO
+			     *) (((char *) &pSMBr->hdr.Protocol) +
+				 data_offset);
+			memcpy(&tcon->fsUnixInfo, response_data,
+			       sizeof(FILE_SYSTEM_UNIX_INFO));
+		}
+	}
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto QFSUnixRetry;
+
+
+	return rc;
+}
+
+int
+CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon, __u64 cap)
+{
+/* level 0x200  SMB_SET_CIFS_UNIX_INFO */
+	TRANSACTION2_SETFSI_REQ *pSMB = NULL;
+	TRANSACTION2_SETFSI_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, param_offset, offset, byte_count;
+
+	cifs_dbg(FYI, "In SETFSUnixInfo\n");
+SETFSUnixRetry:
+	/* BB switch to small buf init to save memory */
+	rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
+					(void **) &pSMB, (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 4;	/* 2 bytes zero followed by info level. */
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_setfsi_req, FileNum)
+				- 4;
+	offset = param_offset + params;
+
+	pSMB->MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(100);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FS_INFORMATION);
+	byte_count = 1 /* pad */ + params + 12;
+
+	pSMB->DataCount = cpu_to_le16(12);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+
+	/* Params. */
+	pSMB->FileNum = 0;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_CIFS_UNIX_INFO);
+
+	/* Data. */
+	pSMB->ClientUnixMajor = cpu_to_le16(CIFS_UNIX_MAJOR_VERSION);
+	pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION);
+	pSMB->ClientUnixCap = cpu_to_le64(cap);
+
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(VFS, "Send error in SETFSUnixInfo = %d\n", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		if (rc)
+			rc = -EIO;	/* bad smb */
+	}
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto SETFSUnixRetry;
+
+	return rc;
+}
+
+
+
+int
+CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct kstatfs *FSData)
+{
+/* level 0x201  SMB_QUERY_CIFS_POSIX_INFO */
+	TRANSACTION2_QFSI_REQ *pSMB = NULL;
+	TRANSACTION2_QFSI_RSP *pSMBr = NULL;
+	FILE_SYSTEM_POSIX_INFO *response_data;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, byte_count;
+
+	cifs_dbg(FYI, "In QFSPosixInfo\n");
+QFSPosixRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2;	/* level */
+	pSMB->TotalDataCount = 0;
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(100);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	byte_count = params + 1 /* pad */ ;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(struct
+			smb_com_transaction2_qfsi_req, InformationLevel) - 4);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO);
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QFSUnixInfo = %d\n", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || get_bcc(&pSMBr->hdr) < 13) {
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			response_data =
+			    (FILE_SYSTEM_POSIX_INFO
+			     *) (((char *) &pSMBr->hdr.Protocol) +
+				 data_offset);
+			FSData->f_bsize =
+					le32_to_cpu(response_data->BlockSize);
+			FSData->f_blocks =
+					le64_to_cpu(response_data->TotalBlocks);
+			FSData->f_bfree =
+			    le64_to_cpu(response_data->BlocksAvail);
+			if (response_data->UserBlocksAvail == cpu_to_le64(-1)) {
+				FSData->f_bavail = FSData->f_bfree;
+			} else {
+				FSData->f_bavail =
+				    le64_to_cpu(response_data->UserBlocksAvail);
+			}
+			if (response_data->TotalFileNodes != cpu_to_le64(-1))
+				FSData->f_files =
+				     le64_to_cpu(response_data->TotalFileNodes);
+			if (response_data->FreeFileNodes != cpu_to_le64(-1))
+				FSData->f_ffree =
+				      le64_to_cpu(response_data->FreeFileNodes);
+		}
+	}
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto QFSPosixRetry;
+
+	return rc;
+}
+
+
+/*
+ * We can not use write of zero bytes trick to set file size due to need for
+ * large file support. Also note that this SetPathInfo is preferred to
+ * SetFileInfo based method in next routine which is only needed to work around
+ * a sharing violation bugin Samba which this routine can run into.
+ */
+int
+CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
+	      const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb,
+	      bool set_allocation)
+{
+	struct smb_com_transaction2_spi_req *pSMB = NULL;
+	struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
+	struct file_end_of_file_info *parm_data;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	int remap = cifs_remap(cifs_sb);
+
+	__u16 params, byte_count, data_count, param_offset, offset;
+
+	cifs_dbg(FYI, "In SetEOF\n");
+SetEOFRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name,
+				       PATH_MAX, cifs_sb->local_nls, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(file_name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, file_name, name_len);
+	}
+	params = 6 + name_len;
+	data_count = sizeof(struct file_end_of_file_info);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(4100);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+	if (set_allocation) {
+		if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
+			pSMB->InformationLevel =
+				cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2);
+		else
+			pSMB->InformationLevel =
+				cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO);
+	} else /* Set File Size */  {
+	    if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
+		    pSMB->InformationLevel =
+				cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO2);
+	    else
+		    pSMB->InformationLevel =
+				cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO);
+	}
+
+	parm_data =
+	    (struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol) +
+				       offset);
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + data_count;
+	pSMB->DataCount = cpu_to_le16(data_count);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	parm_data->FileSize = cpu_to_le64(size);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "SetPathInfo (file size) returned %d\n", rc);
+
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto SetEOFRetry;
+
+	return rc;
+}
+
+int
+CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile, __u64 size, bool set_allocation)
+{
+	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+	struct file_end_of_file_info *parm_data;
+	int rc = 0;
+	__u16 params, param_offset, offset, byte_count, count;
+
+	cifs_dbg(FYI, "SetFileSize (via SetFileInfo) %lld\n",
+		 (long long)size);
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)cfile->pid);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(cfile->pid >> 16));
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	count = sizeof(struct file_end_of_file_info);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	parm_data =
+		(struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol)
+				+ offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	parm_data->FileSize = cpu_to_le64(size);
+	pSMB->Fid = cfile->fid.netfid;
+	if (set_allocation) {
+		if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
+			pSMB->InformationLevel =
+				cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2);
+		else
+			pSMB->InformationLevel =
+				cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO);
+	} else /* Set File Size */  {
+	    if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
+		    pSMB->InformationLevel =
+				cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO2);
+	    else
+		    pSMB->InformationLevel =
+				cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO);
+	}
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n",
+			 rc);
+	}
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+
+	return rc;
+}
+
+/* Some legacy servers such as NT4 require that the file times be set on
+   an open handle, rather than by pathname - this is awkward due to
+   potential access conflicts on the open, but it is unavoidable for these
+   old servers since the only other choice is to go from 100 nanosecond DCE
+   time and resort to the original setpathinfo level which takes the ancient
+   DOS time format with 2 second granularity */
+int
+CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		    const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener)
+{
+	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+	char *data_offset;
+	int rc = 0;
+	__u16 params, param_offset, offset, byte_count, count;
+
+	cifs_dbg(FYI, "Set Times (via SetFileInfo)\n");
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	data_offset = (char *)pSMB +
+			offsetof(struct smb_hdr, Protocol) + offset;
+
+	count = sizeof(FILE_BASIC_INFO);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->Fid = fid;
+	if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
+		pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO2);
+	else
+		pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	if (rc)
+		cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
+			 rc);
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+
+	return rc;
+}
+
+int
+CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon,
+			  bool delete_file, __u16 fid, __u32 pid_of_opener)
+{
+	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+	char *data_offset;
+	int rc = 0;
+	__u16 params, param_offset, offset, byte_count, count;
+
+	cifs_dbg(FYI, "Set File Disposition (via SetFileInfo)\n");
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+
+	count = 1;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->Fid = fid;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	*data_offset = delete_file ? 1 : 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	if (rc)
+		cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc);
+
+	return rc;
+}
+
+int
+CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		   const char *fileName, const FILE_BASIC_INFO *data,
+		   const struct nls_table *nls_codepage, int remap)
+{
+	TRANSACTION2_SPI_REQ *pSMB = NULL;
+	TRANSACTION2_SPI_RSP *pSMBr = NULL;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	char *data_offset;
+	__u16 params, param_offset, offset, byte_count, count;
+
+	cifs_dbg(FYI, "In SetTimes\n");
+
+SetTimesRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(fileName, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, fileName, name_len);
+	}
+
+	params = 6 + name_len;
+	count = sizeof(FILE_BASIC_INFO);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
+		pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO2);
+	else
+		pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "SetPathInfo (times) returned %d\n", rc);
+
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto SetTimesRetry;
+
+	return rc;
+}
+
+/* Can not be used to set time stamps yet (due to old DOS time format) */
+/* Can be used to set attributes */
+#if 0  /* Possibly not needed - since it turns out that strangely NT4 has a bug
+	  handling it anyway and NT4 was what we thought it would be needed for
+	  Do not delete it until we prove whether needed for Win9x though */
+int
+CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName,
+		__u16 dos_attrs, const struct nls_table *nls_codepage)
+{
+	SETATTR_REQ *pSMB = NULL;
+	SETATTR_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+
+	cifs_dbg(FYI, "In SetAttrLegacy\n");
+
+SetAttrLgcyRetry:
+	rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+			ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+				       PATH_MAX, nls_codepage);
+		name_len++;     /* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(fileName, PATH_MAX);
+		name_len++;     /* trailing null */
+		strncpy(pSMB->fileName, fileName, name_len);
+	}
+	pSMB->attr = cpu_to_le16(dos_attrs);
+	pSMB->BufferFormat = 0x04;
+	inc_rfc1001_len(pSMB, name_len + 1);
+	pSMB->ByteCount = cpu_to_le16(name_len + 1);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto SetAttrLgcyRetry;
+
+	return rc;
+}
+#endif /* temporarily unneeded SetAttr legacy function */
+
+static void
+cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
+			const struct cifs_unix_set_info_args *args)
+{
+	u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64;
+	u64 mode = args->mode;
+
+	if (uid_valid(args->uid))
+		uid = from_kuid(&init_user_ns, args->uid);
+	if (gid_valid(args->gid))
+		gid = from_kgid(&init_user_ns, args->gid);
+
+	/*
+	 * Samba server ignores set of file size to zero due to bugs in some
+	 * older clients, but we should be precise - we use SetFileSize to
+	 * set file size and do not want to truncate file size to zero
+	 * accidentally as happened on one Samba server beta by putting
+	 * zero instead of -1 here
+	 */
+	data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
+	data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
+	data_offset->LastStatusChange = cpu_to_le64(args->ctime);
+	data_offset->LastAccessTime = cpu_to_le64(args->atime);
+	data_offset->LastModificationTime = cpu_to_le64(args->mtime);
+	data_offset->Uid = cpu_to_le64(uid);
+	data_offset->Gid = cpu_to_le64(gid);
+	/* better to leave device as zero when it is  */
+	data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
+	data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
+	data_offset->Permissions = cpu_to_le64(mode);
+
+	if (S_ISREG(mode))
+		data_offset->Type = cpu_to_le32(UNIX_FILE);
+	else if (S_ISDIR(mode))
+		data_offset->Type = cpu_to_le32(UNIX_DIR);
+	else if (S_ISLNK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_SYMLINK);
+	else if (S_ISCHR(mode))
+		data_offset->Type = cpu_to_le32(UNIX_CHARDEV);
+	else if (S_ISBLK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV);
+	else if (S_ISFIFO(mode))
+		data_offset->Type = cpu_to_le32(UNIX_FIFO);
+	else if (S_ISSOCK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_SOCKET);
+}
+
+int
+CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		       const struct cifs_unix_set_info_args *args,
+		       u16 fid, u32 pid_of_opener)
+{
+	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+	char *data_offset;
+	int rc = 0;
+	u16 params, param_offset, offset, byte_count, count;
+
+	cifs_dbg(FYI, "Set Unix Info (via SetFileInfo)\n");
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	data_offset = (char *)pSMB +
+			offsetof(struct smb_hdr, Protocol) + offset;
+
+	count = sizeof(FILE_UNIX_BASIC_INFO);
+
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->Fid = fid;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	cifs_fill_unix_set_info((FILE_UNIX_BASIC_INFO *)data_offset, args);
+
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	if (rc)
+		cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
+			 rc);
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+
+	return rc;
+}
+
+int
+CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+		       const char *file_name,
+		       const struct cifs_unix_set_info_args *args,
+		       const struct nls_table *nls_codepage, int remap)
+{
+	TRANSACTION2_SPI_REQ *pSMB = NULL;
+	TRANSACTION2_SPI_RSP *pSMBr = NULL;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	FILE_UNIX_BASIC_INFO *data_offset;
+	__u16 params, param_offset, offset, count, byte_count;
+
+	cifs_dbg(FYI, "In SetUID/GID/Mode\n");
+setPermsRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name,
+				       PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(file_name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, file_name, name_len);
+	}
+
+	params = 6 + name_len;
+	count = sizeof(FILE_UNIX_BASIC_INFO);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+	data_offset =
+	    (FILE_UNIX_BASIC_INFO *) ((char *) &pSMB->hdr.Protocol +
+				      offset);
+	memset(data_offset, 0, count);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+
+	cifs_fill_unix_set_info(data_offset, args);
+
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "SetPathInfo (perms) returned %d\n", rc);
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto setPermsRetry;
+	return rc;
+}
+
+#ifdef CONFIG_CIFS_XATTR
+/*
+ * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
+ * function used by listxattr and getxattr type calls. When ea_name is set,
+ * it looks for that attribute name and stuffs that value into the EAData
+ * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
+ * buffer. In both cases, the return value is either the length of the
+ * resulting data or a negative error code. If EAData is a NULL pointer then
+ * the data isn't copied to it, but the length is returned.
+ */
+ssize_t
+CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
+		const unsigned char *searchName, const unsigned char *ea_name,
+		char *EAData, size_t buf_size,
+		const struct nls_table *nls_codepage, int remap)
+{
+		/* BB assumes one setup word */
+	TRANSACTION2_QPI_REQ *pSMB = NULL;
+	TRANSACTION2_QPI_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int list_len;
+	struct fealist *ea_response_data;
+	struct fea *temp_fea;
+	char *temp_ptr;
+	char *end_of_smb;
+	__u16 params, byte_count, data_offset;
+	unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
+
+	cifs_dbg(FYI, "In Query All EAs path %s\n", searchName);
+QAllEAsRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		list_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+				       PATH_MAX, nls_codepage, remap);
+		list_len++;	/* trailing null */
+		list_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		list_len = strnlen(searchName, PATH_MAX);
+		list_len++;	/* trailing null */
+		strncpy(pSMB->FileName, searchName, list_len);
+	}
+
+	params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max SMB PDU from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(offsetof(
+	struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in QueryAllEAs = %d\n", rc);
+		goto QAllEAsOut;
+	}
+
+
+	/* BB also check enough total bytes returned */
+	/* BB we need to improve the validity checking
+	of these trans2 responses */
+
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+	if (rc || get_bcc(&pSMBr->hdr) < 4) {
+		rc = -EIO;	/* bad smb */
+		goto QAllEAsOut;
+	}
+
+	/* check that length of list is not more than bcc */
+	/* check that each entry does not go beyond length
+	   of list */
+	/* check that each element of each entry does not
+	   go beyond end of list */
+	/* validate_trans2_offsets() */
+	/* BB check if start of smb + data_offset > &bcc+ bcc */
+
+	data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+	ea_response_data = (struct fealist *)
+				(((char *) &pSMBr->hdr.Protocol) + data_offset);
+
+	list_len = le32_to_cpu(ea_response_data->list_len);
+	cifs_dbg(FYI, "ea length %d\n", list_len);
+	if (list_len <= 8) {
+		cifs_dbg(FYI, "empty EA list returned from server\n");
+		/* didn't find the named attribute */
+		if (ea_name)
+			rc = -ENODATA;
+		goto QAllEAsOut;
+	}
+
+	/* make sure list_len doesn't go past end of SMB */
+	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
+	if ((char *)ea_response_data + list_len > end_of_smb) {
+		cifs_dbg(FYI, "EA list appears to go beyond SMB\n");
+		rc = -EIO;
+		goto QAllEAsOut;
+	}
+
+	/* account for ea list len */
+	list_len -= 4;
+	temp_fea = ea_response_data->list;
+	temp_ptr = (char *)temp_fea;
+	while (list_len > 0) {
+		unsigned int name_len;
+		__u16 value_len;
+
+		list_len -= 4;
+		temp_ptr += 4;
+		/* make sure we can read name_len and value_len */
+		if (list_len < 0) {
+			cifs_dbg(FYI, "EA entry goes beyond length of list\n");
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
+
+		name_len = temp_fea->name_len;
+		value_len = le16_to_cpu(temp_fea->value_len);
+		list_len -= name_len + 1 + value_len;
+		if (list_len < 0) {
+			cifs_dbg(FYI, "EA entry goes beyond length of list\n");
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
+
+		if (ea_name) {
+			if (ea_name_len == name_len &&
+			    memcmp(ea_name, temp_ptr, name_len) == 0) {
+				temp_ptr += name_len + 1;
+				rc = value_len;
+				if (buf_size == 0)
+					goto QAllEAsOut;
+				if ((size_t)value_len > buf_size) {
+					rc = -ERANGE;
+					goto QAllEAsOut;
+				}
+				memcpy(EAData, temp_ptr, value_len);
+				goto QAllEAsOut;
+			}
+		} else {
+			/* account for prefix user. and trailing null */
+			rc += (5 + 1 + name_len);
+			if (rc < (int) buf_size) {
+				memcpy(EAData, "user.", 5);
+				EAData += 5;
+				memcpy(EAData, temp_ptr, name_len);
+				EAData += name_len;
+				/* null terminate name */
+				*EAData = 0;
+				++EAData;
+			} else if (buf_size == 0) {
+				/* skip copy - calc size only */
+			} else {
+				/* stop before overrun buffer */
+				rc = -ERANGE;
+				break;
+			}
+		}
+		temp_ptr += name_len + 1 + value_len;
+		temp_fea = (struct fea *)temp_ptr;
+	}
+
+	/* didn't find the named attribute */
+	if (ea_name)
+		rc = -ENODATA;
+
+QAllEAsOut:
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto QAllEAsRetry;
+
+	return (ssize_t)rc;
+}
+
+int
+CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
+	     const char *fileName, const char *ea_name, const void *ea_value,
+	     const __u16 ea_value_len, const struct nls_table *nls_codepage,
+	     int remap)
+{
+	struct smb_com_transaction2_spi_req *pSMB = NULL;
+	struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
+	struct fealist *parm_data;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	__u16 params, param_offset, byte_count, offset, count;
+
+	cifs_dbg(FYI, "In SetEA\n");
+SetEARetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(fileName, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, fileName, name_len);
+	}
+
+	params = 6 + name_len;
+
+	/* done calculating parms using name_len of file name,
+	now use name_len to calculate length of ea name
+	we are going to create in the inode xattrs */
+	if (ea_name == NULL)
+		name_len = 0;
+	else
+		name_len = strnlen(ea_name, 255);
+
+	count = sizeof(*parm_data) + ea_value_len + name_len;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+				InformationLevel) - 4;
+	offset = param_offset + params;
+	pSMB->InformationLevel =
+		cpu_to_le16(SMB_SET_FILE_EA);
+
+	parm_data =
+		(struct fealist *) (((char *) &pSMB->hdr.Protocol) +
+				       offset);
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	parm_data->list_len = cpu_to_le32(count);
+	parm_data->list[0].EA_flags = 0;
+	/* we checked above that name len is less than 255 */
+	parm_data->list[0].name_len = (__u8)name_len;
+	/* EA names are always ASCII */
+	if (ea_name)
+		strncpy(parm_data->list[0].name, ea_name, name_len);
+	parm_data->list[0].name[name_len] = 0;
+	parm_data->list[0].value_len = cpu_to_le16(ea_value_len);
+	/* caller ensures that ea_value_len is less than 64K but
+	we need to ensure that it fits within the smb */
+
+	/*BB add length check to see if it would fit in
+	     negotiated SMB buffer size BB */
+	/* if (ea_value_len > buffer_size - 512 (enough for header)) */
+	if (ea_value_len)
+		memcpy(parm_data->list[0].name+name_len+1,
+		       ea_value, ea_value_len);
+
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->Reserved4 = 0;
+	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "SetPathInfo (EA) returned %d\n", rc);
+
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto SetEARetry;
+
+	return rc;
+}
+#endif
+
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */
+/*
+ *	Years ago the kernel added a "dnotify" function for Samba server,
+ *	to allow network clients (such as Windows) to display updated
+ *	lists of files in directory listings automatically when
+ *	files are added by one user when another user has the
+ *	same directory open on their desktop.  The Linux cifs kernel
+ *	client hooked into the kernel side of this interface for
+ *	the same reason, but ironically when the VFS moved from
+ *	"dnotify" to "inotify" it became harder to plug in Linux
+ *	network file system clients (the most obvious use case
+ *	for notify interfaces is when multiple users can update
+ *	the contents of the same directory - exactly what network
+ *	file systems can do) although the server (Samba) could
+ *	still use it.  For the short term we leave the worker
+ *	function ifdeffed out (below) until inotify is fixed
+ *	in the VFS to make it easier to plug in network file
+ *	system clients.  If inotify turns out to be permanently
+ *	incompatible for network fs clients, we could instead simply
+ *	expose this config flag by adding a future cifs (and smb2) notify ioctl.
+ */
+int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
+		  const int notify_subdirs, const __u16 netfid,
+		  __u32 filter, struct file *pfile, int multishot,
+		  const struct nls_table *nls_codepage)
+{
+	int rc = 0;
+	struct smb_com_transaction_change_notify_req *pSMB = NULL;
+	struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
+	struct dir_notify_req *dnotify_req;
+	int bytes_returned;
+
+	cifs_dbg(FYI, "In CIFSSMBNotify for file handle %d\n", (int)netfid);
+	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->TotalParameterCount = 0 ;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le32(2);
+	pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
+	pSMB->MaxSetupCount = 4;
+	pSMB->Reserved = 0;
+	pSMB->ParameterOffset = 0;
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 4; /* single byte does not need le conversion */
+	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	if (notify_subdirs)
+		pSMB->WatchTree = 1; /* one byte - no le conversion needed */
+	pSMB->Reserved2 = 0;
+	pSMB->CompletionFilter = cpu_to_le32(filter);
+	pSMB->Fid = netfid; /* file handle always le */
+	pSMB->ByteCount = 0;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *)pSMBr, &bytes_returned,
+			 CIFS_ASYNC_OP);
+	if (rc) {
+		cifs_dbg(FYI, "Error in Notify = %d\n", rc);
+	} else {
+		/* Add file to outstanding requests */
+		/* BB change to kmem cache alloc */
+		dnotify_req = kmalloc(
+						sizeof(struct dir_notify_req),
+						 GFP_KERNEL);
+		if (dnotify_req) {
+			dnotify_req->Pid = pSMB->hdr.Pid;
+			dnotify_req->PidHigh = pSMB->hdr.PidHigh;
+			dnotify_req->Mid = pSMB->hdr.Mid;
+			dnotify_req->Tid = pSMB->hdr.Tid;
+			dnotify_req->Uid = pSMB->hdr.Uid;
+			dnotify_req->netfid = netfid;
+			dnotify_req->pfile = pfile;
+			dnotify_req->filter = filter;
+			dnotify_req->multishot = multishot;
+			spin_lock(&GlobalMid_Lock);
+			list_add_tail(&dnotify_req->lhead,
+					&GlobalDnotifyReqList);
+			spin_unlock(&GlobalMid_Lock);
+		} else
+			rc = -ENOMEM;
+	}
+	cifs_buf_release(pSMB);
+	return rc;
+}
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
diff --git a/kernel/fs/cifs/connect.c b/kernel/fs/cifs/connect.c
new file mode 100644
index 000000000..8383d5ea4
--- /dev/null
+++ b/kernel/fs/cifs/connect.c
@@ -0,0 +1,4137 @@
+/*
+ *   fs/cifs/connect.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2011
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/net.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/ctype.h>
+#include <linux/utsname.h>
+#include <linux/mempool.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/pagevec.h>
+#include <linux/freezer.h>
+#include <linux/namei.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <linux/inet.h>
+#include <linux/module.h>
+#include <keys/user-type.h>
+#include <net/ipv6.h>
+#include <linux/parser.h>
+
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "ntlmssp.h"
+#include "nterr.h"
+#include "rfc1002pdu.h"
+#include "fscache.h"
+
+#define CIFS_PORT 445
+#define RFC1001_PORT 139
+
+extern mempool_t *cifs_req_poolp;
+
+/* FIXME: should these be tunable? */
+#define TLINK_ERROR_EXPIRE	(1 * HZ)
+#define TLINK_IDLE_EXPIRE	(600 * HZ)
+
+enum {
+
+	/* Mount options that take no arguments */
+	Opt_user_xattr, Opt_nouser_xattr,
+	Opt_forceuid, Opt_noforceuid,
+	Opt_forcegid, Opt_noforcegid,
+	Opt_noblocksend, Opt_noautotune,
+	Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
+	Opt_mapposix, Opt_nomapposix,
+	Opt_mapchars, Opt_nomapchars, Opt_sfu,
+	Opt_nosfu, Opt_nodfs, Opt_posixpaths,
+	Opt_noposixpaths, Opt_nounix,
+	Opt_nocase,
+	Opt_brl, Opt_nobrl,
+	Opt_forcemandatorylock, Opt_setuids,
+	Opt_nosetuids, Opt_dynperm, Opt_nodynperm,
+	Opt_nohard, Opt_nosoft,
+	Opt_nointr, Opt_intr,
+	Opt_nostrictsync, Opt_strictsync,
+	Opt_serverino, Opt_noserverino,
+	Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl,
+	Opt_acl, Opt_noacl, Opt_locallease,
+	Opt_sign, Opt_seal, Opt_noac,
+	Opt_fsc, Opt_mfsymlinks,
+	Opt_multiuser, Opt_sloppy, Opt_nosharesock,
+
+	/* Mount options which take numeric value */
+	Opt_backupuid, Opt_backupgid, Opt_uid,
+	Opt_cruid, Opt_gid, Opt_file_mode,
+	Opt_dirmode, Opt_port,
+	Opt_rsize, Opt_wsize, Opt_actimeo,
+
+	/* Mount options which take string value */
+	Opt_user, Opt_pass, Opt_ip,
+	Opt_domain, Opt_srcaddr, Opt_iocharset,
+	Opt_netbiosname, Opt_servern,
+	Opt_ver, Opt_vers, Opt_sec, Opt_cache,
+
+	/* Mount options to be ignored */
+	Opt_ignore,
+
+	/* Options which could be blank */
+	Opt_blank_pass,
+	Opt_blank_user,
+	Opt_blank_ip,
+
+	Opt_err
+};
+
+static const match_table_t cifs_mount_option_tokens = {
+
+	{ Opt_user_xattr, "user_xattr" },
+	{ Opt_nouser_xattr, "nouser_xattr" },
+	{ Opt_forceuid, "forceuid" },
+	{ Opt_noforceuid, "noforceuid" },
+	{ Opt_forcegid, "forcegid" },
+	{ Opt_noforcegid, "noforcegid" },
+	{ Opt_noblocksend, "noblocksend" },
+	{ Opt_noautotune, "noautotune" },
+	{ Opt_hard, "hard" },
+	{ Opt_soft, "soft" },
+	{ Opt_perm, "perm" },
+	{ Opt_noperm, "noperm" },
+	{ Opt_mapchars, "mapchars" }, /* SFU style */
+	{ Opt_nomapchars, "nomapchars" },
+	{ Opt_mapposix, "mapposix" }, /* SFM style */
+	{ Opt_nomapposix, "nomapposix" },
+	{ Opt_sfu, "sfu" },
+	{ Opt_nosfu, "nosfu" },
+	{ Opt_nodfs, "nodfs" },
+	{ Opt_posixpaths, "posixpaths" },
+	{ Opt_noposixpaths, "noposixpaths" },
+	{ Opt_nounix, "nounix" },
+	{ Opt_nounix, "nolinux" },
+	{ Opt_nocase, "nocase" },
+	{ Opt_nocase, "ignorecase" },
+	{ Opt_brl, "brl" },
+	{ Opt_nobrl, "nobrl" },
+	{ Opt_nobrl, "nolock" },
+	{ Opt_forcemandatorylock, "forcemandatorylock" },
+	{ Opt_forcemandatorylock, "forcemand" },
+	{ Opt_setuids, "setuids" },
+	{ Opt_nosetuids, "nosetuids" },
+	{ Opt_dynperm, "dynperm" },
+	{ Opt_nodynperm, "nodynperm" },
+	{ Opt_nohard, "nohard" },
+	{ Opt_nosoft, "nosoft" },
+	{ Opt_nointr, "nointr" },
+	{ Opt_intr, "intr" },
+	{ Opt_nostrictsync, "nostrictsync" },
+	{ Opt_strictsync, "strictsync" },
+	{ Opt_serverino, "serverino" },
+	{ Opt_noserverino, "noserverino" },
+	{ Opt_rwpidforward, "rwpidforward" },
+	{ Opt_cifsacl, "cifsacl" },
+	{ Opt_nocifsacl, "nocifsacl" },
+	{ Opt_acl, "acl" },
+	{ Opt_noacl, "noacl" },
+	{ Opt_locallease, "locallease" },
+	{ Opt_sign, "sign" },
+	{ Opt_seal, "seal" },
+	{ Opt_noac, "noac" },
+	{ Opt_fsc, "fsc" },
+	{ Opt_mfsymlinks, "mfsymlinks" },
+	{ Opt_multiuser, "multiuser" },
+	{ Opt_sloppy, "sloppy" },
+	{ Opt_nosharesock, "nosharesock" },
+
+	{ Opt_backupuid, "backupuid=%s" },
+	{ Opt_backupgid, "backupgid=%s" },
+	{ Opt_uid, "uid=%s" },
+	{ Opt_cruid, "cruid=%s" },
+	{ Opt_gid, "gid=%s" },
+	{ Opt_file_mode, "file_mode=%s" },
+	{ Opt_dirmode, "dirmode=%s" },
+	{ Opt_dirmode, "dir_mode=%s" },
+	{ Opt_port, "port=%s" },
+	{ Opt_rsize, "rsize=%s" },
+	{ Opt_wsize, "wsize=%s" },
+	{ Opt_actimeo, "actimeo=%s" },
+
+	{ Opt_blank_user, "user=" },
+	{ Opt_blank_user, "username=" },
+	{ Opt_user, "user=%s" },
+	{ Opt_user, "username=%s" },
+	{ Opt_blank_pass, "pass=" },
+	{ Opt_blank_pass, "password=" },
+	{ Opt_pass, "pass=%s" },
+	{ Opt_pass, "password=%s" },
+	{ Opt_blank_ip, "ip=" },
+	{ Opt_blank_ip, "addr=" },
+	{ Opt_ip, "ip=%s" },
+	{ Opt_ip, "addr=%s" },
+	{ Opt_ignore, "unc=%s" },
+	{ Opt_ignore, "target=%s" },
+	{ Opt_ignore, "path=%s" },
+	{ Opt_domain, "dom=%s" },
+	{ Opt_domain, "domain=%s" },
+	{ Opt_domain, "workgroup=%s" },
+	{ Opt_srcaddr, "srcaddr=%s" },
+	{ Opt_ignore, "prefixpath=%s" },
+	{ Opt_iocharset, "iocharset=%s" },
+	{ Opt_netbiosname, "netbiosname=%s" },
+	{ Opt_servern, "servern=%s" },
+	{ Opt_ver, "ver=%s" },
+	{ Opt_vers, "vers=%s" },
+	{ Opt_sec, "sec=%s" },
+	{ Opt_cache, "cache=%s" },
+
+	{ Opt_ignore, "cred" },
+	{ Opt_ignore, "credentials" },
+	{ Opt_ignore, "cred=%s" },
+	{ Opt_ignore, "credentials=%s" },
+	{ Opt_ignore, "guest" },
+	{ Opt_ignore, "rw" },
+	{ Opt_ignore, "ro" },
+	{ Opt_ignore, "suid" },
+	{ Opt_ignore, "nosuid" },
+	{ Opt_ignore, "exec" },
+	{ Opt_ignore, "noexec" },
+	{ Opt_ignore, "nodev" },
+	{ Opt_ignore, "noauto" },
+	{ Opt_ignore, "dev" },
+	{ Opt_ignore, "mand" },
+	{ Opt_ignore, "nomand" },
+	{ Opt_ignore, "_netdev" },
+
+	{ Opt_err, NULL }
+};
+
+enum {
+	Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p,
+	Opt_sec_ntlmsspi, Opt_sec_ntlmssp,
+	Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2,
+	Opt_sec_ntlmv2i, Opt_sec_lanman,
+	Opt_sec_none,
+
+	Opt_sec_err
+};
+
+static const match_table_t cifs_secflavor_tokens = {
+	{ Opt_sec_krb5, "krb5" },
+	{ Opt_sec_krb5i, "krb5i" },
+	{ Opt_sec_krb5p, "krb5p" },
+	{ Opt_sec_ntlmsspi, "ntlmsspi" },
+	{ Opt_sec_ntlmssp, "ntlmssp" },
+	{ Opt_ntlm, "ntlm" },
+	{ Opt_sec_ntlmi, "ntlmi" },
+	{ Opt_sec_ntlmv2, "nontlm" },
+	{ Opt_sec_ntlmv2, "ntlmv2" },
+	{ Opt_sec_ntlmv2i, "ntlmv2i" },
+	{ Opt_sec_lanman, "lanman" },
+	{ Opt_sec_none, "none" },
+
+	{ Opt_sec_err, NULL }
+};
+
+/* cache flavors */
+enum {
+	Opt_cache_loose,
+	Opt_cache_strict,
+	Opt_cache_none,
+	Opt_cache_err
+};
+
+static const match_table_t cifs_cacheflavor_tokens = {
+	{ Opt_cache_loose, "loose" },
+	{ Opt_cache_strict, "strict" },
+	{ Opt_cache_none, "none" },
+	{ Opt_cache_err, NULL }
+};
+
+static const match_table_t cifs_smb_version_tokens = {
+	{ Smb_1, SMB1_VERSION_STRING },
+	{ Smb_20, SMB20_VERSION_STRING},
+	{ Smb_21, SMB21_VERSION_STRING },
+	{ Smb_30, SMB30_VERSION_STRING },
+	{ Smb_302, SMB302_VERSION_STRING },
+};
+
+static int ip_connect(struct TCP_Server_Info *server);
+static int generic_ip_connect(struct TCP_Server_Info *server);
+static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
+static void cifs_prune_tlinks(struct work_struct *work);
+static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
+					const char *devname);
+
+/*
+ * cifs tcp session reconnection
+ *
+ * mark tcp session as reconnecting so temporarily locked
+ * mark all smb sessions as reconnecting for tcp session
+ * reconnect tcp session
+ * wake up waiters on reconnection? - (not needed currently)
+ */
+int
+cifs_reconnect(struct TCP_Server_Info *server)
+{
+	int rc = 0;
+	struct list_head *tmp, *tmp2;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct mid_q_entry *mid_entry;
+	struct list_head retry_list;
+
+	spin_lock(&GlobalMid_Lock);
+	if (server->tcpStatus == CifsExiting) {
+		/* the demux thread will exit normally
+		next time through the loop */
+		spin_unlock(&GlobalMid_Lock);
+		return rc;
+	} else
+		server->tcpStatus = CifsNeedReconnect;
+	spin_unlock(&GlobalMid_Lock);
+	server->maxBuf = 0;
+#ifdef CONFIG_CIFS_SMB2
+	server->max_read = 0;
+#endif
+
+	cifs_dbg(FYI, "Reconnecting tcp session\n");
+
+	/* before reconnecting the tcp session, mark the smb session (uid)
+		and the tid bad so they are not used until reconnected */
+	cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n",
+		 __func__);
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+		ses->need_reconnect = true;
+		ses->ipc_tid = 0;
+		list_for_each(tmp2, &ses->tcon_list) {
+			tcon = list_entry(tmp2, struct cifs_tcon, tcon_list);
+			tcon->need_reconnect = true;
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	/* do not want to be sending data on a socket we are freeing */
+	cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
+	mutex_lock(&server->srv_mutex);
+	if (server->ssocket) {
+		cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
+			 server->ssocket->state, server->ssocket->flags);
+		kernel_sock_shutdown(server->ssocket, SHUT_WR);
+		cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
+			 server->ssocket->state, server->ssocket->flags);
+		sock_release(server->ssocket);
+		server->ssocket = NULL;
+	}
+	server->sequence_number = 0;
+	server->session_estab = false;
+	kfree(server->session_key.response);
+	server->session_key.response = NULL;
+	server->session_key.len = 0;
+	server->lstrp = jiffies;
+	mutex_unlock(&server->srv_mutex);
+
+	/* mark submitted MIDs for retry and issue callback */
+	INIT_LIST_HEAD(&retry_list);
+	cifs_dbg(FYI, "%s: moving mids to private list\n", __func__);
+	spin_lock(&GlobalMid_Lock);
+	list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
+		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+		if (mid_entry->mid_state == MID_REQUEST_SUBMITTED)
+			mid_entry->mid_state = MID_RETRY_NEEDED;
+		list_move(&mid_entry->qhead, &retry_list);
+	}
+	spin_unlock(&GlobalMid_Lock);
+
+	cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
+	list_for_each_safe(tmp, tmp2, &retry_list) {
+		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+		list_del_init(&mid_entry->qhead);
+		mid_entry->callback(mid_entry);
+	}
+
+	do {
+		try_to_freeze();
+
+		/* we should try only the port we connected to before */
+		mutex_lock(&server->srv_mutex);
+		rc = generic_ip_connect(server);
+		if (rc) {
+			cifs_dbg(FYI, "reconnect error %d\n", rc);
+			mutex_unlock(&server->srv_mutex);
+			msleep(3000);
+		} else {
+			atomic_inc(&tcpSesReconnectCount);
+			spin_lock(&GlobalMid_Lock);
+			if (server->tcpStatus != CifsExiting)
+				server->tcpStatus = CifsNeedNegotiate;
+			spin_unlock(&GlobalMid_Lock);
+			mutex_unlock(&server->srv_mutex);
+		}
+	} while (server->tcpStatus == CifsNeedReconnect);
+
+	return rc;
+}
+
+static void
+cifs_echo_request(struct work_struct *work)
+{
+	int rc;
+	struct TCP_Server_Info *server = container_of(work,
+					struct TCP_Server_Info, echo.work);
+
+	/*
+	 * We cannot send an echo if it is disabled or until the
+	 * NEGOTIATE_PROTOCOL request is done, which is indicated by
+	 * server->ops->need_neg() == true. Also, no need to ping if
+	 * we got a response recently.
+	 */
+	if (!server->ops->need_neg || server->ops->need_neg(server) ||
+	    (server->ops->can_echo && !server->ops->can_echo(server)) ||
+	    time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+		goto requeue_echo;
+
+	rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
+	if (rc)
+		cifs_dbg(FYI, "Unable to send echo request to server: %s\n",
+			 server->hostname);
+
+requeue_echo:
+	queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
+}
+
+static bool
+allocate_buffers(struct TCP_Server_Info *server)
+{
+	if (!server->bigbuf) {
+		server->bigbuf = (char *)cifs_buf_get();
+		if (!server->bigbuf) {
+			cifs_dbg(VFS, "No memory for large SMB response\n");
+			msleep(3000);
+			/* retry will check if exiting */
+			return false;
+		}
+	} else if (server->large_buf) {
+		/* we are reusing a dirty large buf, clear its start */
+		memset(server->bigbuf, 0, HEADER_SIZE(server));
+	}
+
+	if (!server->smallbuf) {
+		server->smallbuf = (char *)cifs_small_buf_get();
+		if (!server->smallbuf) {
+			cifs_dbg(VFS, "No memory for SMB response\n");
+			msleep(1000);
+			/* retry will check if exiting */
+			return false;
+		}
+		/* beginning of smb buffer is cleared in our buf_get */
+	} else {
+		/* if existing small buf clear beginning */
+		memset(server->smallbuf, 0, HEADER_SIZE(server));
+	}
+
+	return true;
+}
+
+static bool
+server_unresponsive(struct TCP_Server_Info *server)
+{
+	/*
+	 * We need to wait 2 echo intervals to make sure we handle such
+	 * situations right:
+	 * 1s  client sends a normal SMB request
+	 * 2s  client gets a response
+	 * 30s echo workqueue job pops, and decides we got a response recently
+	 *     and don't need to send another
+	 * ...
+	 * 65s kernel_recvmsg times out, and we see that we haven't gotten
+	 *     a response in >60s.
+	 */
+	if (server->tcpStatus == CifsGood &&
+	    time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
+		cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n",
+			 server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ);
+		cifs_reconnect(server);
+		wake_up(&server->response_q);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * kvec_array_init - clone a kvec array, and advance into it
+ * @new:	pointer to memory for cloned array
+ * @iov:	pointer to original array
+ * @nr_segs:	number of members in original array
+ * @bytes:	number of bytes to advance into the cloned array
+ *
+ * This function will copy the array provided in iov to a section of memory
+ * and advance the specified number of bytes into the new array. It returns
+ * the number of segments in the new array. "new" must be at least as big as
+ * the original iov array.
+ */
+static unsigned int
+kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs,
+		size_t bytes)
+{
+	size_t base = 0;
+
+	while (bytes || !iov->iov_len) {
+		int copy = min(bytes, iov->iov_len);
+
+		bytes -= copy;
+		base += copy;
+		if (iov->iov_len == base) {
+			iov++;
+			nr_segs--;
+			base = 0;
+		}
+	}
+	memcpy(new, iov, sizeof(*iov) * nr_segs);
+	new->iov_base += base;
+	new->iov_len -= base;
+	return nr_segs;
+}
+
+static struct kvec *
+get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs)
+{
+	struct kvec *new_iov;
+
+	if (server->iov && nr_segs <= server->nr_iov)
+		return server->iov;
+
+	/* not big enough -- allocate a new one and release the old */
+	new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS);
+	if (new_iov) {
+		kfree(server->iov);
+		server->iov = new_iov;
+		server->nr_iov = nr_segs;
+	}
+	return new_iov;
+}
+
+int
+cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
+		       unsigned int nr_segs, unsigned int to_read)
+{
+	int length = 0;
+	int total_read;
+	unsigned int segs;
+	struct msghdr smb_msg;
+	struct kvec *iov;
+
+	iov = get_server_iovec(server, nr_segs);
+	if (!iov)
+		return -ENOMEM;
+
+	smb_msg.msg_control = NULL;
+	smb_msg.msg_controllen = 0;
+
+	for (total_read = 0; to_read; total_read += length, to_read -= length) {
+		try_to_freeze();
+
+		if (server_unresponsive(server)) {
+			total_read = -ECONNABORTED;
+			break;
+		}
+
+		segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
+
+		length = kernel_recvmsg(server->ssocket, &smb_msg,
+					iov, segs, to_read, 0);
+
+		if (server->tcpStatus == CifsExiting) {
+			total_read = -ESHUTDOWN;
+			break;
+		} else if (server->tcpStatus == CifsNeedReconnect) {
+			cifs_reconnect(server);
+			total_read = -ECONNABORTED;
+			break;
+		} else if (length == -ERESTARTSYS ||
+			   length == -EAGAIN ||
+			   length == -EINTR) {
+			/*
+			 * Minimum sleep to prevent looping, allowing socket
+			 * to clear and app threads to set tcpStatus
+			 * CifsNeedReconnect if server hung.
+			 */
+			usleep_range(1000, 2000);
+			length = 0;
+			continue;
+		} else if (length <= 0) {
+			cifs_dbg(FYI, "Received no data or error: expecting %d\n"
+				 "got %d", to_read, length);
+			cifs_reconnect(server);
+			total_read = -ECONNABORTED;
+			break;
+		}
+	}
+	return total_read;
+}
+
+int
+cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+		      unsigned int to_read)
+{
+	struct kvec iov;
+
+	iov.iov_base = buf;
+	iov.iov_len = to_read;
+
+	return cifs_readv_from_socket(server, &iov, 1, to_read);
+}
+
+static bool
+is_smb_response(struct TCP_Server_Info *server, unsigned char type)
+{
+	/*
+	 * The first byte big endian of the length field,
+	 * is actually not part of the length but the type
+	 * with the most common, zero, as regular data.
+	 */
+	switch (type) {
+	case RFC1002_SESSION_MESSAGE:
+		/* Regular SMB response */
+		return true;
+	case RFC1002_SESSION_KEEP_ALIVE:
+		cifs_dbg(FYI, "RFC 1002 session keep alive\n");
+		break;
+	case RFC1002_POSITIVE_SESSION_RESPONSE:
+		cifs_dbg(FYI, "RFC 1002 positive session response\n");
+		break;
+	case RFC1002_NEGATIVE_SESSION_RESPONSE:
+		/*
+		 * We get this from Windows 98 instead of an error on
+		 * SMB negprot response.
+		 */
+		cifs_dbg(FYI, "RFC 1002 negative session response\n");
+		/* give server a second to clean up */
+		msleep(1000);
+		/*
+		 * Always try 445 first on reconnect since we get NACK
+		 * on some if we ever connected to port 139 (the NACK
+		 * is since we do not begin with RFC1001 session
+		 * initialize frame).
+		 */
+		cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
+		cifs_reconnect(server);
+		wake_up(&server->response_q);
+		break;
+	default:
+		cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
+		cifs_reconnect(server);
+	}
+
+	return false;
+}
+
+void
+dequeue_mid(struct mid_q_entry *mid, bool malformed)
+{
+#ifdef CONFIG_CIFS_STATS2
+	mid->when_received = jiffies;
+#endif
+	spin_lock(&GlobalMid_Lock);
+	if (!malformed)
+		mid->mid_state = MID_RESPONSE_RECEIVED;
+	else
+		mid->mid_state = MID_RESPONSE_MALFORMED;
+	list_del_init(&mid->qhead);
+	spin_unlock(&GlobalMid_Lock);
+}
+
+static void
+handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+	   char *buf, int malformed)
+{
+	if (server->ops->check_trans2 &&
+	    server->ops->check_trans2(mid, server, buf, malformed))
+		return;
+	mid->resp_buf = buf;
+	mid->large_buf = server->large_buf;
+	/* Was previous buf put in mpx struct for multi-rsp? */
+	if (!mid->multiRsp) {
+		/* smb buffer will be freed by user thread */
+		if (server->large_buf)
+			server->bigbuf = NULL;
+		else
+			server->smallbuf = NULL;
+	}
+	dequeue_mid(mid, malformed);
+}
+
+static void clean_demultiplex_info(struct TCP_Server_Info *server)
+{
+	int length;
+
+	/* take it off the list, if it's not already */
+	spin_lock(&cifs_tcp_ses_lock);
+	list_del_init(&server->tcp_ses_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	spin_lock(&GlobalMid_Lock);
+	server->tcpStatus = CifsExiting;
+	spin_unlock(&GlobalMid_Lock);
+	wake_up_all(&server->response_q);
+
+	/* check if we have blocked requests that need to free */
+	spin_lock(&server->req_lock);
+	if (server->credits <= 0)
+		server->credits = 1;
+	spin_unlock(&server->req_lock);
+	/*
+	 * Although there should not be any requests blocked on this queue it
+	 * can not hurt to be paranoid and try to wake up requests that may
+	 * haven been blocked when more than 50 at time were on the wire to the
+	 * same server - they now will see the session is in exit state and get
+	 * out of SendReceive.
+	 */
+	wake_up_all(&server->request_q);
+	/* give those requests time to exit */
+	msleep(125);
+
+	if (server->ssocket) {
+		sock_release(server->ssocket);
+		server->ssocket = NULL;
+	}
+
+	if (!list_empty(&server->pending_mid_q)) {
+		struct list_head dispose_list;
+		struct mid_q_entry *mid_entry;
+		struct list_head *tmp, *tmp2;
+
+		INIT_LIST_HEAD(&dispose_list);
+		spin_lock(&GlobalMid_Lock);
+		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
+			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+			cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
+			mid_entry->mid_state = MID_SHUTDOWN;
+			list_move(&mid_entry->qhead, &dispose_list);
+		}
+		spin_unlock(&GlobalMid_Lock);
+
+		/* now walk dispose list and issue callbacks */
+		list_for_each_safe(tmp, tmp2, &dispose_list) {
+			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+			cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
+			list_del_init(&mid_entry->qhead);
+			mid_entry->callback(mid_entry);
+		}
+		/* 1/8th of sec is more than enough time for them to exit */
+		msleep(125);
+	}
+
+	if (!list_empty(&server->pending_mid_q)) {
+		/*
+		 * mpx threads have not exited yet give them at least the smb
+		 * send timeout time for long ops.
+		 *
+		 * Due to delays on oplock break requests, we need to wait at
+		 * least 45 seconds before giving up on a request getting a
+		 * response and going ahead and killing cifsd.
+		 */
+		cifs_dbg(FYI, "Wait for exit from demultiplex thread\n");
+		msleep(46000);
+		/*
+		 * If threads still have not exited they are probably never
+		 * coming home not much else we can do but free the memory.
+		 */
+	}
+
+	kfree(server->hostname);
+	kfree(server->iov);
+	kfree(server);
+
+	length = atomic_dec_return(&tcpSesAllocCount);
+	if (length > 0)
+		mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
+}
+
+static int
+standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+	int length;
+	char *buf = server->smallbuf;
+	unsigned int pdu_length = get_rfc1002_length(buf);
+
+	/* make sure this will fit in a large buffer */
+	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) {
+		cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
+		cifs_reconnect(server);
+		wake_up(&server->response_q);
+		return -ECONNABORTED;
+	}
+
+	/* switch to large buffer if too big for a small one */
+	if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
+		server->large_buf = true;
+		memcpy(server->bigbuf, buf, server->total_read);
+		buf = server->bigbuf;
+	}
+
+	/* now read the rest */
+	length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
+				pdu_length - HEADER_SIZE(server) + 1 + 4);
+	if (length < 0)
+		return length;
+	server->total_read += length;
+
+	dump_smb(buf, server->total_read);
+
+	/*
+	 * We know that we received enough to get to the MID as we
+	 * checked the pdu_length earlier. Now check to see
+	 * if the rest of the header is OK. We borrow the length
+	 * var for the rest of the loop to avoid a new stack var.
+	 *
+	 * 48 bytes is enough to display the header and a little bit
+	 * into the payload for debugging purposes.
+	 */
+	length = server->ops->check_message(buf, server->total_read);
+	if (length != 0)
+		cifs_dump_mem("Bad SMB: ", buf,
+			min_t(unsigned int, server->total_read, 48));
+
+	if (server->ops->is_status_pending &&
+	    server->ops->is_status_pending(buf, server, length))
+		return -1;
+
+	if (!mid)
+		return length;
+
+	handle_mid(mid, server, buf, length);
+	return 0;
+}
+
+static int
+cifs_demultiplex_thread(void *p)
+{
+	int length;
+	struct TCP_Server_Info *server = p;
+	unsigned int pdu_length;
+	char *buf = NULL;
+	struct task_struct *task_to_wake = NULL;
+	struct mid_q_entry *mid_entry;
+
+	current->flags |= PF_MEMALLOC;
+	cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
+
+	length = atomic_inc_return(&tcpSesAllocCount);
+	if (length > 1)
+		mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
+
+	set_freezable();
+	while (server->tcpStatus != CifsExiting) {
+		if (try_to_freeze())
+			continue;
+
+		if (!allocate_buffers(server))
+			continue;
+
+		server->large_buf = false;
+		buf = server->smallbuf;
+		pdu_length = 4; /* enough to get RFC1001 header */
+
+		length = cifs_read_from_socket(server, buf, pdu_length);
+		if (length < 0)
+			continue;
+		server->total_read = length;
+
+		/*
+		 * The right amount was read from socket - 4 bytes,
+		 * so we can now interpret the length field.
+		 */
+		pdu_length = get_rfc1002_length(buf);
+
+		cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
+		if (!is_smb_response(server, buf[0]))
+			continue;
+
+		/* make sure we have enough to get to the MID */
+		if (pdu_length < HEADER_SIZE(server) - 1 - 4) {
+			cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
+				 pdu_length);
+			cifs_reconnect(server);
+			wake_up(&server->response_q);
+			continue;
+		}
+
+		/* read down to the MID */
+		length = cifs_read_from_socket(server, buf + 4,
+					       HEADER_SIZE(server) - 1 - 4);
+		if (length < 0)
+			continue;
+		server->total_read += length;
+
+		mid_entry = server->ops->find_mid(server, buf);
+
+		if (!mid_entry || !mid_entry->receive)
+			length = standard_receive3(server, mid_entry);
+		else
+			length = mid_entry->receive(server, mid_entry);
+
+		if (length < 0)
+			continue;
+
+		if (server->large_buf)
+			buf = server->bigbuf;
+
+		server->lstrp = jiffies;
+		if (mid_entry != NULL) {
+			if (!mid_entry->multiRsp || mid_entry->multiEnd)
+				mid_entry->callback(mid_entry);
+		} else if (!server->ops->is_oplock_break ||
+			   !server->ops->is_oplock_break(buf, server)) {
+			cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n",
+				 atomic_read(&midCount));
+			cifs_dump_mem("Received Data is: ", buf,
+				      HEADER_SIZE(server));
+#ifdef CONFIG_CIFS_DEBUG2
+			if (server->ops->dump_detail)
+				server->ops->dump_detail(buf);
+			cifs_dump_mids(server);
+#endif /* CIFS_DEBUG2 */
+
+		}
+	} /* end while !EXITING */
+
+	/* buffer usually freed in free_mid - need to free it here on exit */
+	cifs_buf_release(server->bigbuf);
+	if (server->smallbuf) /* no sense logging a debug message if NULL */
+		cifs_small_buf_release(server->smallbuf);
+
+	task_to_wake = xchg(&server->tsk, NULL);
+	clean_demultiplex_info(server);
+
+	/* if server->tsk was NULL then wait for a signal before exiting */
+	if (!task_to_wake) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		while (!signal_pending(current)) {
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+		set_current_state(TASK_RUNNING);
+	}
+
+	module_put_and_exit(0);
+}
+
+/* extract the host portion of the UNC string */
+static char *
+extract_hostname(const char *unc)
+{
+	const char *src;
+	char *dst, *delim;
+	unsigned int len;
+
+	/* skip double chars at beginning of string */
+	/* BB: check validity of these bytes? */
+	src = unc + 2;
+
+	/* delimiter between hostname and sharename is always '\\' now */
+	delim = strchr(src, '\\');
+	if (!delim)
+		return ERR_PTR(-EINVAL);
+
+	len = delim - src;
+	dst = kmalloc((len + 1), GFP_KERNEL);
+	if (dst == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(dst, src, len);
+	dst[len] = '\0';
+
+	return dst;
+}
+
+static int get_option_ul(substring_t args[], unsigned long *option)
+{
+	int rc;
+	char *string;
+
+	string = match_strdup(args);
+	if (string == NULL)
+		return -ENOMEM;
+	rc = kstrtoul(string, 0, option);
+	kfree(string);
+
+	return rc;
+}
+
+static int get_option_uid(substring_t args[], kuid_t *result)
+{
+	unsigned long value;
+	kuid_t uid;
+	int rc;
+
+	rc = get_option_ul(args, &value);
+	if (rc)
+		return rc;
+
+	uid = make_kuid(current_user_ns(), value);
+	if (!uid_valid(uid))
+		return -EINVAL;
+
+	*result = uid;
+	return 0;
+}
+
+static int get_option_gid(substring_t args[], kgid_t *result)
+{
+	unsigned long value;
+	kgid_t gid;
+	int rc;
+
+	rc = get_option_ul(args, &value);
+	if (rc)
+		return rc;
+
+	gid = make_kgid(current_user_ns(), value);
+	if (!gid_valid(gid))
+		return -EINVAL;
+
+	*result = gid;
+	return 0;
+}
+
+static int cifs_parse_security_flavors(char *value,
+				       struct smb_vol *vol)
+{
+
+	substring_t args[MAX_OPT_ARGS];
+
+	/*
+	 * With mount options, the last one should win. Reset any existing
+	 * settings back to default.
+	 */
+	vol->sectype = Unspecified;
+	vol->sign = false;
+
+	switch (match_token(value, cifs_secflavor_tokens, args)) {
+	case Opt_sec_krb5p:
+		cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+		return 1;
+	case Opt_sec_krb5i:
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_krb5:
+		vol->sectype = Kerberos;
+		break;
+	case Opt_sec_ntlmsspi:
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_ntlmssp:
+		vol->sectype = RawNTLMSSP;
+		break;
+	case Opt_sec_ntlmi:
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_ntlm:
+		vol->sectype = NTLM;
+		break;
+	case Opt_sec_ntlmv2i:
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_ntlmv2:
+		vol->sectype = NTLMv2;
+		break;
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+	case Opt_sec_lanman:
+		vol->sectype = LANMAN;
+		break;
+#endif
+	case Opt_sec_none:
+		vol->nullauth = 1;
+		break;
+	default:
+		cifs_dbg(VFS, "bad security option: %s\n", value);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int
+cifs_parse_cache_flavor(char *value, struct smb_vol *vol)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	switch (match_token(value, cifs_cacheflavor_tokens, args)) {
+	case Opt_cache_loose:
+		vol->direct_io = false;
+		vol->strict_io = false;
+		break;
+	case Opt_cache_strict:
+		vol->direct_io = false;
+		vol->strict_io = true;
+		break;
+	case Opt_cache_none:
+		vol->direct_io = true;
+		vol->strict_io = false;
+		break;
+	default:
+		cifs_dbg(VFS, "bad cache= option: %s\n", value);
+		return 1;
+	}
+	return 0;
+}
+
+static int
+cifs_parse_smb_version(char *value, struct smb_vol *vol)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	switch (match_token(value, cifs_smb_version_tokens, args)) {
+	case Smb_1:
+		vol->ops = &smb1_operations;
+		vol->vals = &smb1_values;
+		break;
+#ifdef CONFIG_CIFS_SMB2
+	case Smb_20:
+		vol->ops = &smb20_operations;
+		vol->vals = &smb20_values;
+		break;
+	case Smb_21:
+		vol->ops = &smb21_operations;
+		vol->vals = &smb21_values;
+		break;
+	case Smb_30:
+		vol->ops = &smb30_operations;
+		vol->vals = &smb30_values;
+		break;
+	case Smb_302:
+		vol->ops = &smb30_operations; /* currently identical with 3.0 */
+		vol->vals = &smb302_values;
+		break;
+#endif
+	default:
+		cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Parse a devname into substrings and populate the vol->UNC and vol->prepath
+ * fields with the result. Returns 0 on success and an error otherwise.
+ */
+static int
+cifs_parse_devname(const char *devname, struct smb_vol *vol)
+{
+	char *pos;
+	const char *delims = "/\\";
+	size_t len;
+
+	/* make sure we have a valid UNC double delimiter prefix */
+	len = strspn(devname, delims);
+	if (len != 2)
+		return -EINVAL;
+
+	/* find delimiter between host and sharename */
+	pos = strpbrk(devname + 2, delims);
+	if (!pos)
+		return -EINVAL;
+
+	/* skip past delimiter */
+	++pos;
+
+	/* now go until next delimiter or end of string */
+	len = strcspn(pos, delims);
+
+	/* move "pos" up to delimiter or NULL */
+	pos += len;
+	vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
+	if (!vol->UNC)
+		return -ENOMEM;
+
+	convert_delimiter(vol->UNC, '\\');
+
+	/* If pos is NULL, or is a bogus trailing delimiter then no prepath */
+	if (!*pos++ || !*pos)
+		return 0;
+
+	vol->prepath = kstrdup(pos, GFP_KERNEL);
+	if (!vol->prepath)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int
+cifs_parse_mount_options(const char *mountdata, const char *devname,
+			 struct smb_vol *vol)
+{
+	char *data, *end;
+	char *mountdata_copy = NULL, *options;
+	unsigned int  temp_len, i, j;
+	char separator[2];
+	short int override_uid = -1;
+	short int override_gid = -1;
+	bool uid_specified = false;
+	bool gid_specified = false;
+	bool sloppy = false;
+	char *invalid = NULL;
+	char *nodename = utsname()->nodename;
+	char *string = NULL;
+	char *tmp_end, *value;
+	char delim;
+	bool got_ip = false;
+	unsigned short port = 0;
+	struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;
+
+	separator[0] = ',';
+	separator[1] = 0;
+	delim = separator[0];
+
+	/* ensure we always start with zeroed-out smb_vol */
+	memset(vol, 0, sizeof(*vol));
+
+	/*
+	 * does not have to be perfect mapping since field is
+	 * informational, only used for servers that do not support
+	 * port 445 and it can be overridden at mount time
+	 */
+	memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
+	for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
+		vol->source_rfc1001_name[i] = toupper(nodename[i]);
+
+	vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
+	/* null target name indicates to use *SMBSERVR default called name
+	   if we end up sending RFC1001 session initialize */
+	vol->target_rfc1001_name[0] = 0;
+	vol->cred_uid = current_uid();
+	vol->linux_uid = current_uid();
+	vol->linux_gid = current_gid();
+
+	/*
+	 * default to SFM style remapping of seven reserved characters
+	 * unless user overrides it or we negotiate CIFS POSIX where
+	 * it is unnecessary.  Can not simultaneously use more than one mapping
+	 * since then readdir could list files that open could not open
+	 */
+	vol->remap = true;
+
+	/* default to only allowing write access to owner of the mount */
+	vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
+
+	/* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
+	/* default is always to request posix paths. */
+	vol->posix_paths = 1;
+	/* default to using server inode numbers where available */
+	vol->server_ino = 1;
+
+	/* default is to use strict cifs caching semantics */
+	vol->strict_io = true;
+
+	vol->actimeo = CIFS_DEF_ACTIMEO;
+
+	/* FIXME: add autonegotiation -- for now, SMB1 is default */
+	vol->ops = &smb1_operations;
+	vol->vals = &smb1_values;
+
+	if (!mountdata)
+		goto cifs_parse_mount_err;
+
+	mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL);
+	if (!mountdata_copy)
+		goto cifs_parse_mount_err;
+
+	options = mountdata_copy;
+	end = options + strlen(options);
+
+	if (strncmp(options, "sep=", 4) == 0) {
+		if (options[4] != 0) {
+			separator[0] = options[4];
+			options += 5;
+		} else {
+			cifs_dbg(FYI, "Null separator not allowed\n");
+		}
+	}
+	vol->backupuid_specified = false; /* no backup intent for a user */
+	vol->backupgid_specified = false; /* no backup intent for a group */
+
+	switch (cifs_parse_devname(devname, vol)) {
+	case 0:
+		break;
+	case -ENOMEM:
+		cifs_dbg(VFS, "Unable to allocate memory for devname.\n");
+		goto cifs_parse_mount_err;
+	case -EINVAL:
+		cifs_dbg(VFS, "Malformed UNC in devname.\n");
+		goto cifs_parse_mount_err;
+	default:
+		cifs_dbg(VFS, "Unknown error parsing devname.\n");
+		goto cifs_parse_mount_err;
+	}
+
+	while ((data = strsep(&options, separator)) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		unsigned long option;
+		int token;
+
+		if (!*data)
+			continue;
+
+		token = match_token(data, cifs_mount_option_tokens, args);
+
+		switch (token) {
+
+		/* Ingnore the following */
+		case Opt_ignore:
+			break;
+
+		/* Boolean values */
+		case Opt_user_xattr:
+			vol->no_xattr = 0;
+			break;
+		case Opt_nouser_xattr:
+			vol->no_xattr = 1;
+			break;
+		case Opt_forceuid:
+			override_uid = 1;
+			break;
+		case Opt_noforceuid:
+			override_uid = 0;
+			break;
+		case Opt_forcegid:
+			override_gid = 1;
+			break;
+		case Opt_noforcegid:
+			override_gid = 0;
+			break;
+		case Opt_noblocksend:
+			vol->noblocksnd = 1;
+			break;
+		case Opt_noautotune:
+			vol->noautotune = 1;
+			break;
+		case Opt_hard:
+			vol->retry = 1;
+			break;
+		case Opt_soft:
+			vol->retry = 0;
+			break;
+		case Opt_perm:
+			vol->noperm = 0;
+			break;
+		case Opt_noperm:
+			vol->noperm = 1;
+			break;
+		case Opt_mapchars:
+			vol->sfu_remap = true;
+			vol->remap = false; /* disable SFM mapping */
+			break;
+		case Opt_nomapchars:
+			vol->sfu_remap = false;
+			break;
+		case Opt_mapposix:
+			vol->remap = true;
+			vol->sfu_remap = false; /* disable SFU mapping */
+			break;
+		case Opt_nomapposix:
+			vol->remap = false;
+			break;
+		case Opt_sfu:
+			vol->sfu_emul = 1;
+			break;
+		case Opt_nosfu:
+			vol->sfu_emul = 0;
+			break;
+		case Opt_nodfs:
+			vol->nodfs = 1;
+			break;
+		case Opt_posixpaths:
+			vol->posix_paths = 1;
+			break;
+		case Opt_noposixpaths:
+			vol->posix_paths = 0;
+			break;
+		case Opt_nounix:
+			vol->no_linux_ext = 1;
+			break;
+		case Opt_nocase:
+			vol->nocase = 1;
+			break;
+		case Opt_brl:
+			vol->nobrl =  0;
+			break;
+		case Opt_nobrl:
+			vol->nobrl =  1;
+			/*
+			 * turn off mandatory locking in mode
+			 * if remote locking is turned off since the
+			 * local vfs will do advisory
+			 */
+			if (vol->file_mode ==
+				(S_IALLUGO & ~(S_ISUID | S_IXGRP)))
+				vol->file_mode = S_IALLUGO;
+			break;
+		case Opt_forcemandatorylock:
+			vol->mand_lock = 1;
+			break;
+		case Opt_setuids:
+			vol->setuids = 1;
+			break;
+		case Opt_nosetuids:
+			vol->setuids = 0;
+			break;
+		case Opt_dynperm:
+			vol->dynperm = true;
+			break;
+		case Opt_nodynperm:
+			vol->dynperm = false;
+			break;
+		case Opt_nohard:
+			vol->retry = 0;
+			break;
+		case Opt_nosoft:
+			vol->retry = 1;
+			break;
+		case Opt_nointr:
+			vol->intr = 0;
+			break;
+		case Opt_intr:
+			vol->intr = 1;
+			break;
+		case Opt_nostrictsync:
+			vol->nostrictsync = 1;
+			break;
+		case Opt_strictsync:
+			vol->nostrictsync = 0;
+			break;
+		case Opt_serverino:
+			vol->server_ino = 1;
+			break;
+		case Opt_noserverino:
+			vol->server_ino = 0;
+			break;
+		case Opt_rwpidforward:
+			vol->rwpidforward = 1;
+			break;
+		case Opt_cifsacl:
+			vol->cifs_acl = 1;
+			break;
+		case Opt_nocifsacl:
+			vol->cifs_acl = 0;
+			break;
+		case Opt_acl:
+			vol->no_psx_acl = 0;
+			break;
+		case Opt_noacl:
+			vol->no_psx_acl = 1;
+			break;
+		case Opt_locallease:
+			vol->local_lease = 1;
+			break;
+		case Opt_sign:
+			vol->sign = true;
+			break;
+		case Opt_seal:
+			/* we do not do the following in secFlags because seal
+			 * is a per tree connection (mount) not a per socket
+			 * or per-smb connection option in the protocol
+			 * vol->secFlg |= CIFSSEC_MUST_SEAL;
+			 */
+			vol->seal = 1;
+			break;
+		case Opt_noac:
+			pr_warn("CIFS: Mount option noac not supported. Instead set /proc/fs/cifs/LookupCacheEnabled to 0\n");
+			break;
+		case Opt_fsc:
+#ifndef CONFIG_CIFS_FSCACHE
+			cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n");
+			goto cifs_parse_mount_err;
+#endif
+			vol->fsc = true;
+			break;
+		case Opt_mfsymlinks:
+			vol->mfsymlinks = true;
+			break;
+		case Opt_multiuser:
+			vol->multiuser = true;
+			break;
+		case Opt_sloppy:
+			sloppy = true;
+			break;
+		case Opt_nosharesock:
+			vol->nosharesock = true;
+			break;
+
+		/* Numeric Values */
+		case Opt_backupuid:
+			if (get_option_uid(args, &vol->backupuid)) {
+				cifs_dbg(VFS, "%s: Invalid backupuid value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->backupuid_specified = true;
+			break;
+		case Opt_backupgid:
+			if (get_option_gid(args, &vol->backupgid)) {
+				cifs_dbg(VFS, "%s: Invalid backupgid value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->backupgid_specified = true;
+			break;
+		case Opt_uid:
+			if (get_option_uid(args, &vol->linux_uid)) {
+				cifs_dbg(VFS, "%s: Invalid uid value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			uid_specified = true;
+			break;
+		case Opt_cruid:
+			if (get_option_uid(args, &vol->cred_uid)) {
+				cifs_dbg(VFS, "%s: Invalid cruid value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			break;
+		case Opt_gid:
+			if (get_option_gid(args, &vol->linux_gid)) {
+				cifs_dbg(VFS, "%s: Invalid gid value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			gid_specified = true;
+			break;
+		case Opt_file_mode:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "%s: Invalid file_mode value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->file_mode = option;
+			break;
+		case Opt_dirmode:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "%s: Invalid dir_mode value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->dir_mode = option;
+			break;
+		case Opt_port:
+			if (get_option_ul(args, &option) ||
+			    option > USHRT_MAX) {
+				cifs_dbg(VFS, "%s: Invalid port value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			port = (unsigned short)option;
+			break;
+		case Opt_rsize:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "%s: Invalid rsize value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->rsize = option;
+			break;
+		case Opt_wsize:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "%s: Invalid wsize value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->wsize = option;
+			break;
+		case Opt_actimeo:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "%s: Invalid actimeo value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->actimeo = HZ * option;
+			if (vol->actimeo > CIFS_MAX_ACTIMEO) {
+				cifs_dbg(VFS, "attribute cache timeout too large\n");
+				goto cifs_parse_mount_err;
+			}
+			break;
+
+		/* String Arguments */
+
+		case Opt_blank_user:
+			/* null user, ie. anonymous authentication */
+			vol->nullauth = 1;
+			vol->username = NULL;
+			break;
+		case Opt_user:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (strnlen(string, CIFS_MAX_USERNAME_LEN) >
+							CIFS_MAX_USERNAME_LEN) {
+				pr_warn("CIFS: username too long\n");
+				goto cifs_parse_mount_err;
+			}
+
+			kfree(vol->username);
+			vol->username = kstrdup(string, GFP_KERNEL);
+			if (!vol->username)
+				goto cifs_parse_mount_err;
+			break;
+		case Opt_blank_pass:
+			/* passwords have to be handled differently
+			 * to allow the character used for deliminator
+			 * to be passed within them
+			 */
+
+			/*
+			 * Check if this is a case where the  password
+			 * starts with a delimiter
+			 */
+			tmp_end = strchr(data, '=');
+			tmp_end++;
+			if (!(tmp_end < end && tmp_end[1] == delim)) {
+				/* No it is not. Set the password to NULL */
+				kfree(vol->password);
+				vol->password = NULL;
+				break;
+			}
+			/* Yes it is. Drop down to Opt_pass below.*/
+		case Opt_pass:
+			/* Obtain the value string */
+			value = strchr(data, '=');
+			value++;
+
+			/* Set tmp_end to end of the string */
+			tmp_end = (char *) value + strlen(value);
+
+			/* Check if following character is the deliminator
+			 * If yes, we have encountered a double deliminator
+			 * reset the NULL character to the deliminator
+			 */
+			if (tmp_end < end && tmp_end[1] == delim) {
+				tmp_end[0] = delim;
+
+				/* Keep iterating until we get to a single
+				 * deliminator OR the end
+				 */
+				while ((tmp_end = strchr(tmp_end, delim))
+					!= NULL && (tmp_end[1] == delim)) {
+						tmp_end = (char *) &tmp_end[2];
+				}
+
+				/* Reset var options to point to next element */
+				if (tmp_end) {
+					tmp_end[0] = '\0';
+					options = (char *) &tmp_end[1];
+				} else
+					/* Reached the end of the mount option
+					 * string */
+					options = end;
+			}
+
+			kfree(vol->password);
+			/* Now build new password string */
+			temp_len = strlen(value);
+			vol->password = kzalloc(temp_len+1, GFP_KERNEL);
+			if (vol->password == NULL) {
+				pr_warn("CIFS: no memory for password\n");
+				goto cifs_parse_mount_err;
+			}
+
+			for (i = 0, j = 0; i < temp_len; i++, j++) {
+				vol->password[j] = value[i];
+				if ((value[i] == delim) &&
+				     value[i+1] == delim)
+					/* skip the second deliminator */
+					i++;
+			}
+			vol->password[j] = '\0';
+			break;
+		case Opt_blank_ip:
+			/* FIXME: should this be an error instead? */
+			got_ip = false;
+			break;
+		case Opt_ip:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (!cifs_convert_address(dstaddr, string,
+					strlen(string))) {
+				pr_err("CIFS: bad ip= option (%s).\n", string);
+				goto cifs_parse_mount_err;
+			}
+			got_ip = true;
+			break;
+		case Opt_domain:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN)
+					== CIFS_MAX_DOMAINNAME_LEN) {
+				pr_warn("CIFS: domain name too long\n");
+				goto cifs_parse_mount_err;
+			}
+
+			kfree(vol->domainname);
+			vol->domainname = kstrdup(string, GFP_KERNEL);
+			if (!vol->domainname) {
+				pr_warn("CIFS: no memory for domainname\n");
+				goto cifs_parse_mount_err;
+			}
+			cifs_dbg(FYI, "Domain name set\n");
+			break;
+		case Opt_srcaddr:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (!cifs_convert_address(
+					(struct sockaddr *)&vol->srcaddr,
+					string, strlen(string))) {
+				pr_warn("CIFS: Could not parse srcaddr: %s\n",
+					string);
+				goto cifs_parse_mount_err;
+			}
+			break;
+		case Opt_iocharset:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (strnlen(string, 1024) >= 65) {
+				pr_warn("CIFS: iocharset name too long.\n");
+				goto cifs_parse_mount_err;
+			}
+
+			 if (strncasecmp(string, "default", 7) != 0) {
+				kfree(vol->iocharset);
+				vol->iocharset = kstrdup(string,
+							 GFP_KERNEL);
+				if (!vol->iocharset) {
+					pr_warn("CIFS: no memory for charset\n");
+					goto cifs_parse_mount_err;
+				}
+			}
+			/* if iocharset not set then load_nls_default
+			 * is used by caller
+			 */
+			 cifs_dbg(FYI, "iocharset set to %s\n", string);
+			break;
+		case Opt_netbiosname:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			memset(vol->source_rfc1001_name, 0x20,
+				RFC1001_NAME_LEN);
+			/*
+			 * FIXME: are there cases in which a comma can
+			 * be valid in workstation netbios name (and
+			 * need special handling)?
+			 */
+			for (i = 0; i < RFC1001_NAME_LEN; i++) {
+				/* don't ucase netbiosname for user */
+				if (string[i] == 0)
+					break;
+				vol->source_rfc1001_name[i] = string[i];
+			}
+			/* The string has 16th byte zero still from
+			 * set at top of the function
+			 */
+			if (i == RFC1001_NAME_LEN && string[i] != 0)
+				pr_warn("CIFS: netbiosname longer than 15 truncated.\n");
+			break;
+		case Opt_servern:
+			/* servernetbiosname specified override *SMBSERVER */
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			/* last byte, type, is 0x20 for servr type */
+			memset(vol->target_rfc1001_name, 0x20,
+				RFC1001_NAME_LEN_WITH_NULL);
+
+			/* BB are there cases in which a comma can be
+			   valid in this workstation netbios name
+			   (and need special handling)? */
+
+			/* user or mount helper must uppercase the
+			   netbios name */
+			for (i = 0; i < 15; i++) {
+				if (string[i] == 0)
+					break;
+				vol->target_rfc1001_name[i] = string[i];
+			}
+			/* The string has 16th byte zero still from
+			   set at top of the function  */
+			if (i == RFC1001_NAME_LEN && string[i] != 0)
+				pr_warn("CIFS: server netbiosname longer than 15 truncated.\n");
+			break;
+		case Opt_ver:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (strncasecmp(string, "1", 1) == 0) {
+				/* This is the default */
+				break;
+			}
+			/* For all other value, error */
+			pr_warn("CIFS: Invalid version specified\n");
+			goto cifs_parse_mount_err;
+		case Opt_vers:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (cifs_parse_smb_version(string, vol) != 0)
+				goto cifs_parse_mount_err;
+			break;
+		case Opt_sec:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (cifs_parse_security_flavors(string, vol) != 0)
+				goto cifs_parse_mount_err;
+			break;
+		case Opt_cache:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (cifs_parse_cache_flavor(string, vol) != 0)
+				goto cifs_parse_mount_err;
+			break;
+		default:
+			/*
+			 * An option we don't recognize. Save it off for later
+			 * if we haven't already found one
+			 */
+			if (!invalid)
+				invalid = data;
+			break;
+		}
+		/* Free up any allocated string */
+		kfree(string);
+		string = NULL;
+	}
+
+	if (!sloppy && invalid) {
+		pr_err("CIFS: Unknown mount option \"%s\"\n", invalid);
+		goto cifs_parse_mount_err;
+	}
+
+#ifndef CONFIG_KEYS
+	/* Muliuser mounts require CONFIG_KEYS support */
+	if (vol->multiuser) {
+		cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n");
+		goto cifs_parse_mount_err;
+	}
+#endif
+	if (!vol->UNC) {
+		cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n");
+		goto cifs_parse_mount_err;
+	}
+
+	/* make sure UNC has a share name */
+	if (!strchr(vol->UNC + 3, '\\')) {
+		cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n");
+		goto cifs_parse_mount_err;
+	}
+
+	if (!got_ip) {
+		/* No ip= option specified? Try to get it from UNC */
+		if (!cifs_convert_address(dstaddr, &vol->UNC[2],
+						strlen(&vol->UNC[2]))) {
+			pr_err("Unable to determine destination address.\n");
+			goto cifs_parse_mount_err;
+		}
+	}
+
+	/* set the port that we got earlier */
+	cifs_set_port(dstaddr, port);
+
+	if (uid_specified)
+		vol->override_uid = override_uid;
+	else if (override_uid == 1)
+		pr_notice("CIFS: ignoring forceuid mount option specified with no uid= option.\n");
+
+	if (gid_specified)
+		vol->override_gid = override_gid;
+	else if (override_gid == 1)
+		pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n");
+
+	kfree(mountdata_copy);
+	return 0;
+
+out_nomem:
+	pr_warn("Could not allocate temporary buffer\n");
+cifs_parse_mount_err:
+	kfree(string);
+	kfree(mountdata_copy);
+	return 1;
+}
+
+/** Returns true if srcaddr isn't specified and rhs isn't
+ * specified, or if srcaddr is specified and
+ * matches the IP address of the rhs argument.
+ */
+static bool
+srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
+{
+	switch (srcaddr->sa_family) {
+	case AF_UNSPEC:
+		return (rhs->sa_family == AF_UNSPEC);
+	case AF_INET: {
+		struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
+		struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
+		return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
+		struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
+		return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
+	}
+	default:
+		WARN_ON(1);
+		return false; /* don't expect to be here */
+	}
+}
+
+/*
+ * If no port is specified in addr structure, we try to match with 445 port
+ * and if it fails - with 139 ports. It should be called only if address
+ * families of server and addr are equal.
+ */
+static bool
+match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
+{
+	__be16 port, *sport;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
+		port = ((struct sockaddr_in *) addr)->sin_port;
+		break;
+	case AF_INET6:
+		sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
+		port = ((struct sockaddr_in6 *) addr)->sin6_port;
+		break;
+	default:
+		WARN_ON(1);
+		return false;
+	}
+
+	if (!port) {
+		port = htons(CIFS_PORT);
+		if (port == *sport)
+			return true;
+
+		port = htons(RFC1001_PORT);
+	}
+
+	return port == *sport;
+}
+
+static bool
+match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
+	      struct sockaddr *srcaddr)
+{
+	switch (addr->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+		struct sockaddr_in *srv_addr4 =
+					(struct sockaddr_in *)&server->dstaddr;
+
+		if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
+			return false;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+		struct sockaddr_in6 *srv_addr6 =
+					(struct sockaddr_in6 *)&server->dstaddr;
+
+		if (!ipv6_addr_equal(&addr6->sin6_addr,
+				     &srv_addr6->sin6_addr))
+			return false;
+		if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
+			return false;
+		break;
+	}
+	default:
+		WARN_ON(1);
+		return false; /* don't expect to be here */
+	}
+
+	if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
+		return false;
+
+	return true;
+}
+
+static bool
+match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+	/*
+	 * The select_sectype function should either return the vol->sectype
+	 * that was specified, or "Unspecified" if that sectype was not
+	 * compatible with the given NEGOTIATE request.
+	 */
+	if (select_sectype(server, vol->sectype) == Unspecified)
+		return false;
+
+	/*
+	 * Now check if signing mode is acceptable. No need to check
+	 * global_secflags at this point since if MUST_SIGN is set then
+	 * the server->sign had better be too.
+	 */
+	if (vol->sign && !server->sign)
+		return false;
+
+	return true;
+}
+
+static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+	struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
+
+	if (vol->nosharesock)
+		return 0;
+
+	if ((server->vals != vol->vals) || (server->ops != vol->ops))
+		return 0;
+
+	if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
+		return 0;
+
+	if (!match_address(server, addr,
+			   (struct sockaddr *)&vol->srcaddr))
+		return 0;
+
+	if (!match_port(server, addr))
+		return 0;
+
+	if (!match_security(server, vol))
+		return 0;
+
+	return 1;
+}
+
+static struct TCP_Server_Info *
+cifs_find_tcp_session(struct smb_vol *vol)
+{
+	struct TCP_Server_Info *server;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+		if (!match_server(server, vol))
+			continue;
+
+		++server->srv_count;
+		spin_unlock(&cifs_tcp_ses_lock);
+		cifs_dbg(FYI, "Existing tcp session with server found\n");
+		return server;
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
+
+static void
+cifs_put_tcp_session(struct TCP_Server_Info *server)
+{
+	struct task_struct *task;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	if (--server->srv_count > 0) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+
+	put_net(cifs_net_ns(server));
+
+	list_del_init(&server->tcp_ses_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	cancel_delayed_work_sync(&server->echo);
+
+	spin_lock(&GlobalMid_Lock);
+	server->tcpStatus = CifsExiting;
+	spin_unlock(&GlobalMid_Lock);
+
+	cifs_crypto_shash_release(server);
+	cifs_fscache_release_client_cookie(server);
+
+	kfree(server->session_key.response);
+	server->session_key.response = NULL;
+	server->session_key.len = 0;
+
+	task = xchg(&server->tsk, NULL);
+	if (task)
+		force_sig(SIGKILL, task);
+}
+
+static struct TCP_Server_Info *
+cifs_get_tcp_session(struct smb_vol *volume_info)
+{
+	struct TCP_Server_Info *tcp_ses = NULL;
+	int rc;
+
+	cifs_dbg(FYI, "UNC: %s\n", volume_info->UNC);
+
+	/* see if we already have a matching tcp_ses */
+	tcp_ses = cifs_find_tcp_session(volume_info);
+	if (tcp_ses)
+		return tcp_ses;
+
+	tcp_ses = kzalloc(sizeof(struct TCP_Server_Info), GFP_KERNEL);
+	if (!tcp_ses) {
+		rc = -ENOMEM;
+		goto out_err;
+	}
+
+	tcp_ses->ops = volume_info->ops;
+	tcp_ses->vals = volume_info->vals;
+	cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
+	tcp_ses->hostname = extract_hostname(volume_info->UNC);
+	if (IS_ERR(tcp_ses->hostname)) {
+		rc = PTR_ERR(tcp_ses->hostname);
+		goto out_err_crypto_release;
+	}
+
+	tcp_ses->noblocksnd = volume_info->noblocksnd;
+	tcp_ses->noautotune = volume_info->noautotune;
+	tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
+	tcp_ses->in_flight = 0;
+	tcp_ses->credits = 1;
+	init_waitqueue_head(&tcp_ses->response_q);
+	init_waitqueue_head(&tcp_ses->request_q);
+	INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
+	mutex_init(&tcp_ses->srv_mutex);
+	memcpy(tcp_ses->workstation_RFC1001_name,
+		volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+	memcpy(tcp_ses->server_RFC1001_name,
+		volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+	tcp_ses->session_estab = false;
+	tcp_ses->sequence_number = 0;
+	tcp_ses->lstrp = jiffies;
+	spin_lock_init(&tcp_ses->req_lock);
+	INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
+	INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
+	INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
+	memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
+	       sizeof(tcp_ses->srcaddr));
+	memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
+		sizeof(tcp_ses->dstaddr));
+#ifdef CONFIG_CIFS_SMB2
+	get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE);
+#endif
+	/*
+	 * at this point we are the only ones with the pointer
+	 * to the struct since the kernel thread not created yet
+	 * no need to spinlock this init of tcpStatus or srv_count
+	 */
+	tcp_ses->tcpStatus = CifsNew;
+	++tcp_ses->srv_count;
+
+	rc = ip_connect(tcp_ses);
+	if (rc < 0) {
+		cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n");
+		goto out_err_crypto_release;
+	}
+
+	/*
+	 * since we're in a cifs function already, we know that
+	 * this will succeed. No need for try_module_get().
+	 */
+	__module_get(THIS_MODULE);
+	tcp_ses->tsk = kthread_run(cifs_demultiplex_thread,
+				  tcp_ses, "cifsd");
+	if (IS_ERR(tcp_ses->tsk)) {
+		rc = PTR_ERR(tcp_ses->tsk);
+		cifs_dbg(VFS, "error %d create cifsd thread\n", rc);
+		module_put(THIS_MODULE);
+		goto out_err_crypto_release;
+	}
+	tcp_ses->tcpStatus = CifsNeedNegotiate;
+
+	/* thread spawned, put it on the list */
+	spin_lock(&cifs_tcp_ses_lock);
+	list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	cifs_fscache_get_client_cookie(tcp_ses);
+
+	/* queue echo request delayed work */
+	queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
+
+	return tcp_ses;
+
+out_err_crypto_release:
+	cifs_crypto_shash_release(tcp_ses);
+
+	put_net(cifs_net_ns(tcp_ses));
+
+out_err:
+	if (tcp_ses) {
+		if (!IS_ERR(tcp_ses->hostname))
+			kfree(tcp_ses->hostname);
+		if (tcp_ses->ssocket)
+			sock_release(tcp_ses->ssocket);
+		kfree(tcp_ses);
+	}
+	return ERR_PTR(rc);
+}
+
+static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
+{
+	if (vol->sectype != Unspecified &&
+	    vol->sectype != ses->sectype)
+		return 0;
+
+	switch (ses->sectype) {
+	case Kerberos:
+		if (!uid_eq(vol->cred_uid, ses->cred_uid))
+			return 0;
+		break;
+	default:
+		/* NULL username means anonymous session */
+		if (ses->user_name == NULL) {
+			if (!vol->nullauth)
+				return 0;
+			break;
+		}
+
+		/* anything else takes username/password */
+		if (strncmp(ses->user_name,
+			    vol->username ? vol->username : "",
+			    CIFS_MAX_USERNAME_LEN))
+			return 0;
+		if ((vol->username && strlen(vol->username) != 0) &&
+		    ses->password != NULL &&
+		    strncmp(ses->password,
+			    vol->password ? vol->password : "",
+			    CIFS_MAX_PASSWORD_LEN))
+			return 0;
+	}
+	return 1;
+}
+
+static struct cifs_ses *
+cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+	struct cifs_ses *ses;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+		if (ses->status == CifsExiting)
+			continue;
+		if (!match_session(ses, vol))
+			continue;
+		++ses->ses_count;
+		spin_unlock(&cifs_tcp_ses_lock);
+		return ses;
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
+
+static void
+cifs_put_smb_ses(struct cifs_ses *ses)
+{
+	unsigned int rc, xid;
+	struct TCP_Server_Info *server = ses->server;
+
+	cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
+
+	spin_lock(&cifs_tcp_ses_lock);
+	if (ses->status == CifsExiting) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+	if (--ses->ses_count > 0) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+	if (ses->status == CifsGood)
+		ses->status = CifsExiting;
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	if (ses->status == CifsExiting && server->ops->logoff) {
+		xid = get_xid();
+		rc = server->ops->logoff(xid, ses);
+		if (rc)
+			cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
+				__func__, rc);
+		_free_xid(xid);
+	}
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_del_init(&ses->smb_ses_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	sesInfoFree(ses);
+	cifs_put_tcp_session(server);
+}
+
+#ifdef CONFIG_KEYS
+
+/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1)
+
+/* Populate username and pw fields from keyring if possible */
+static int
+cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
+{
+	int rc = 0;
+	char *desc, *delim, *payload;
+	ssize_t len;
+	struct key *key;
+	struct TCP_Server_Info *server = ses->server;
+	struct sockaddr_in *sa;
+	struct sockaddr_in6 *sa6;
+	struct user_key_payload *upayload;
+
+	desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	/* try to find an address key first */
+	switch (server->dstaddr.ss_family) {
+	case AF_INET:
+		sa = (struct sockaddr_in *)&server->dstaddr;
+		sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr);
+		break;
+	case AF_INET6:
+		sa6 = (struct sockaddr_in6 *)&server->dstaddr;
+		sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
+		break;
+	default:
+		cifs_dbg(FYI, "Bad ss_family (%hu)\n",
+			 server->dstaddr.ss_family);
+		rc = -EINVAL;
+		goto out_err;
+	}
+
+	cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
+	key = request_key(&key_type_logon, desc, "");
+	if (IS_ERR(key)) {
+		if (!ses->domainName) {
+			cifs_dbg(FYI, "domainName is NULL\n");
+			rc = PTR_ERR(key);
+			goto out_err;
+		}
+
+		/* didn't work, try to find a domain key */
+		sprintf(desc, "cifs:d:%s", ses->domainName);
+		cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
+		key = request_key(&key_type_logon, desc, "");
+		if (IS_ERR(key)) {
+			rc = PTR_ERR(key);
+			goto out_err;
+		}
+	}
+
+	down_read(&key->sem);
+	upayload = key->payload.data;
+	if (IS_ERR_OR_NULL(upayload)) {
+		rc = upayload ? PTR_ERR(upayload) : -EINVAL;
+		goto out_key_put;
+	}
+
+	/* find first : in payload */
+	payload = (char *)upayload->data;
+	delim = strnchr(payload, upayload->datalen, ':');
+	cifs_dbg(FYI, "payload=%s\n", payload);
+	if (!delim) {
+		cifs_dbg(FYI, "Unable to find ':' in payload (datalen=%d)\n",
+			 upayload->datalen);
+		rc = -EINVAL;
+		goto out_key_put;
+	}
+
+	len = delim - payload;
+	if (len > CIFS_MAX_USERNAME_LEN || len <= 0) {
+		cifs_dbg(FYI, "Bad value from username search (len=%zd)\n",
+			 len);
+		rc = -EINVAL;
+		goto out_key_put;
+	}
+
+	vol->username = kstrndup(payload, len, GFP_KERNEL);
+	if (!vol->username) {
+		cifs_dbg(FYI, "Unable to allocate %zd bytes for username\n",
+			 len);
+		rc = -ENOMEM;
+		goto out_key_put;
+	}
+	cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username);
+
+	len = key->datalen - (len + 1);
+	if (len > CIFS_MAX_PASSWORD_LEN || len <= 0) {
+		cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len);
+		rc = -EINVAL;
+		kfree(vol->username);
+		vol->username = NULL;
+		goto out_key_put;
+	}
+
+	++delim;
+	vol->password = kstrndup(delim, len, GFP_KERNEL);
+	if (!vol->password) {
+		cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n",
+			 len);
+		rc = -ENOMEM;
+		kfree(vol->username);
+		vol->username = NULL;
+		goto out_key_put;
+	}
+
+out_key_put:
+	up_read(&key->sem);
+	key_put(key);
+out_err:
+	kfree(desc);
+	cifs_dbg(FYI, "%s: returning %d\n", __func__, rc);
+	return rc;
+}
+#else /* ! CONFIG_KEYS */
+static inline int
+cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
+		   struct cifs_ses *ses __attribute__((unused)))
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_KEYS */
+
+static struct cifs_ses *
+cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
+{
+	int rc = -ENOMEM;
+	unsigned int xid;
+	struct cifs_ses *ses;
+	struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
+
+	xid = get_xid();
+
+	ses = cifs_find_smb_ses(server, volume_info);
+	if (ses) {
+		cifs_dbg(FYI, "Existing smb sess found (status=%d)\n",
+			 ses->status);
+
+		mutex_lock(&ses->session_mutex);
+		rc = cifs_negotiate_protocol(xid, ses);
+		if (rc) {
+			mutex_unlock(&ses->session_mutex);
+			/* problem -- put our ses reference */
+			cifs_put_smb_ses(ses);
+			free_xid(xid);
+			return ERR_PTR(rc);
+		}
+		if (ses->need_reconnect) {
+			cifs_dbg(FYI, "Session needs reconnect\n");
+			rc = cifs_setup_session(xid, ses,
+						volume_info->local_nls);
+			if (rc) {
+				mutex_unlock(&ses->session_mutex);
+				/* problem -- put our reference */
+				cifs_put_smb_ses(ses);
+				free_xid(xid);
+				return ERR_PTR(rc);
+			}
+		}
+		mutex_unlock(&ses->session_mutex);
+
+		/* existing SMB ses has a server reference already */
+		cifs_put_tcp_session(server);
+		free_xid(xid);
+		return ses;
+	}
+
+	cifs_dbg(FYI, "Existing smb sess not found\n");
+	ses = sesInfoAlloc();
+	if (ses == NULL)
+		goto get_ses_fail;
+
+	/* new SMB session uses our server ref */
+	ses->server = server;
+	if (server->dstaddr.ss_family == AF_INET6)
+		sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
+	else
+		sprintf(ses->serverName, "%pI4", &addr->sin_addr);
+
+	if (volume_info->username) {
+		ses->user_name = kstrdup(volume_info->username, GFP_KERNEL);
+		if (!ses->user_name)
+			goto get_ses_fail;
+	}
+
+	/* volume_info->password freed at unmount */
+	if (volume_info->password) {
+		ses->password = kstrdup(volume_info->password, GFP_KERNEL);
+		if (!ses->password)
+			goto get_ses_fail;
+	}
+	if (volume_info->domainname) {
+		ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
+		if (!ses->domainName)
+			goto get_ses_fail;
+	}
+	ses->cred_uid = volume_info->cred_uid;
+	ses->linux_uid = volume_info->linux_uid;
+
+	ses->sectype = volume_info->sectype;
+	ses->sign = volume_info->sign;
+
+	mutex_lock(&ses->session_mutex);
+	rc = cifs_negotiate_protocol(xid, ses);
+	if (!rc)
+		rc = cifs_setup_session(xid, ses, volume_info->local_nls);
+	mutex_unlock(&ses->session_mutex);
+	if (rc)
+		goto get_ses_fail;
+
+	/* success, put it on the list */
+	spin_lock(&cifs_tcp_ses_lock);
+	list_add(&ses->smb_ses_list, &server->smb_ses_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	free_xid(xid);
+	return ses;
+
+get_ses_fail:
+	sesInfoFree(ses);
+	free_xid(xid);
+	return ERR_PTR(rc);
+}
+
+static int match_tcon(struct cifs_tcon *tcon, const char *unc)
+{
+	if (tcon->tidStatus == CifsExiting)
+		return 0;
+	if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
+		return 0;
+	return 1;
+}
+
+static struct cifs_tcon *
+cifs_find_tcon(struct cifs_ses *ses, const char *unc)
+{
+	struct list_head *tmp;
+	struct cifs_tcon *tcon;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &ses->tcon_list) {
+		tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
+		if (!match_tcon(tcon, unc))
+			continue;
+		++tcon->tc_count;
+		spin_unlock(&cifs_tcp_ses_lock);
+		return tcon;
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
+
+static void
+cifs_put_tcon(struct cifs_tcon *tcon)
+{
+	unsigned int xid;
+	struct cifs_ses *ses = tcon->ses;
+
+	cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
+	spin_lock(&cifs_tcp_ses_lock);
+	if (--tcon->tc_count > 0) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+
+	list_del_init(&tcon->tcon_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	xid = get_xid();
+	if (ses->server->ops->tree_disconnect)
+		ses->server->ops->tree_disconnect(xid, tcon);
+	_free_xid(xid);
+
+	cifs_fscache_release_super_cookie(tcon);
+	tconInfoFree(tcon);
+	cifs_put_smb_ses(ses);
+}
+
+static struct cifs_tcon *
+cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
+{
+	int rc, xid;
+	struct cifs_tcon *tcon;
+
+	tcon = cifs_find_tcon(ses, volume_info->UNC);
+	if (tcon) {
+		cifs_dbg(FYI, "Found match on UNC path\n");
+		/* existing tcon already has a reference */
+		cifs_put_smb_ses(ses);
+		if (tcon->seal != volume_info->seal)
+			cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n");
+		return tcon;
+	}
+
+	if (!ses->server->ops->tree_connect) {
+		rc = -ENOSYS;
+		goto out_fail;
+	}
+
+	tcon = tconInfoAlloc();
+	if (tcon == NULL) {
+		rc = -ENOMEM;
+		goto out_fail;
+	}
+
+	tcon->ses = ses;
+	if (volume_info->password) {
+		tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
+		if (!tcon->password) {
+			rc = -ENOMEM;
+			goto out_fail;
+		}
+	}
+
+	/*
+	 * BB Do we need to wrap session_mutex around this TCon call and Unix
+	 * SetFS as we do on SessSetup and reconnect?
+	 */
+	xid = get_xid();
+	rc = ses->server->ops->tree_connect(xid, ses, volume_info->UNC, tcon,
+					    volume_info->local_nls);
+	free_xid(xid);
+	cifs_dbg(FYI, "Tcon rc = %d\n", rc);
+	if (rc)
+		goto out_fail;
+
+	if (volume_info->nodfs) {
+		tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
+		cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
+	}
+	tcon->seal = volume_info->seal;
+	/*
+	 * We can have only one retry value for a connection to a share so for
+	 * resources mounted more than once to the same server share the last
+	 * value passed in for the retry flag is used.
+	 */
+	tcon->retry = volume_info->retry;
+	tcon->nocase = volume_info->nocase;
+	tcon->local_lease = volume_info->local_lease;
+	INIT_LIST_HEAD(&tcon->pending_opens);
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_add(&tcon->tcon_list, &ses->tcon_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	cifs_fscache_get_super_cookie(tcon);
+
+	return tcon;
+
+out_fail:
+	tconInfoFree(tcon);
+	return ERR_PTR(rc);
+}
+
+void
+cifs_put_tlink(struct tcon_link *tlink)
+{
+	if (!tlink || IS_ERR(tlink))
+		return;
+
+	if (!atomic_dec_and_test(&tlink->tl_count) ||
+	    test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
+		tlink->tl_time = jiffies;
+		return;
+	}
+
+	if (!IS_ERR(tlink_tcon(tlink)))
+		cifs_put_tcon(tlink_tcon(tlink));
+	kfree(tlink);
+	return;
+}
+
+static inline struct tcon_link *
+cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
+{
+	return cifs_sb->master_tlink;
+}
+
+static int
+compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
+{
+	struct cifs_sb_info *old = CIFS_SB(sb);
+	struct cifs_sb_info *new = mnt_data->cifs_sb;
+
+	if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK))
+		return 0;
+
+	if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) !=
+	    (new->mnt_cifs_flags & CIFS_MOUNT_MASK))
+		return 0;
+
+	/*
+	 * We want to share sb only if we don't specify an r/wsize or
+	 * specified r/wsize is greater than or equal to existing one.
+	 */
+	if (new->wsize && new->wsize < old->wsize)
+		return 0;
+
+	if (new->rsize && new->rsize < old->rsize)
+		return 0;
+
+	if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid))
+		return 0;
+
+	if (old->mnt_file_mode != new->mnt_file_mode ||
+	    old->mnt_dir_mode != new->mnt_dir_mode)
+		return 0;
+
+	if (strcmp(old->local_nls->charset, new->local_nls->charset))
+		return 0;
+
+	if (old->actimeo != new->actimeo)
+		return 0;
+
+	return 1;
+}
+
+int
+cifs_match_super(struct super_block *sb, void *data)
+{
+	struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data;
+	struct smb_vol *volume_info;
+	struct cifs_sb_info *cifs_sb;
+	struct TCP_Server_Info *tcp_srv;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct tcon_link *tlink;
+	int rc = 0;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	cifs_sb = CIFS_SB(sb);
+	tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
+	if (IS_ERR(tlink)) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		return rc;
+	}
+	tcon = tlink_tcon(tlink);
+	ses = tcon->ses;
+	tcp_srv = ses->server;
+
+	volume_info = mnt_data->vol;
+
+	if (!match_server(tcp_srv, volume_info) ||
+	    !match_session(ses, volume_info) ||
+	    !match_tcon(tcon, volume_info->UNC)) {
+		rc = 0;
+		goto out;
+	}
+
+	rc = compare_mount_options(sb, mnt_data);
+out:
+	spin_unlock(&cifs_tcp_ses_lock);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+int
+get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path,
+	     const struct nls_table *nls_codepage, unsigned int *num_referrals,
+	     struct dfs_info3_param **referrals, int remap)
+{
+	char *temp_unc;
+	int rc = 0;
+
+	if (!ses->server->ops->tree_connect || !ses->server->ops->get_dfs_refer)
+		return -ENOSYS;
+
+	*num_referrals = 0;
+	*referrals = NULL;
+
+	if (ses->ipc_tid == 0) {
+		temp_unc = kmalloc(2 /* for slashes */ +
+			strnlen(ses->serverName, SERVER_NAME_LEN_WITH_NULL * 2)
+				+ 1 + 4 /* slash IPC$ */ + 2, GFP_KERNEL);
+		if (temp_unc == NULL)
+			return -ENOMEM;
+		temp_unc[0] = '\\';
+		temp_unc[1] = '\\';
+		strcpy(temp_unc + 2, ses->serverName);
+		strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$");
+		rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL,
+						    nls_codepage);
+		cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid);
+		kfree(temp_unc);
+	}
+	if (rc == 0)
+		rc = ses->server->ops->get_dfs_refer(xid, ses, old_path,
+						     referrals, num_referrals,
+						     nls_codepage, remap);
+	/*
+	 * BB - map targetUNCs to dfs_info3 structures, here or in
+	 * ses->server->ops->get_dfs_refer.
+	 */
+
+	return rc;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key cifs_key[2];
+static struct lock_class_key cifs_slock_key[2];
+
+static inline void
+cifs_reclassify_socket4(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	BUG_ON(sock_owned_by_user(sk));
+	sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS",
+		&cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]);
+}
+
+static inline void
+cifs_reclassify_socket6(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	BUG_ON(sock_owned_by_user(sk));
+	sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS",
+		&cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]);
+}
+#else
+static inline void
+cifs_reclassify_socket4(struct socket *sock)
+{
+}
+
+static inline void
+cifs_reclassify_socket6(struct socket *sock)
+{
+}
+#endif
+
+/* See RFC1001 section 14 on representation of Netbios names */
+static void rfc1002mangle(char *target, char *source, unsigned int length)
+{
+	unsigned int i, j;
+
+	for (i = 0, j = 0; i < (length); i++) {
+		/* mask a nibble at a time and encode */
+		target[j] = 'A' + (0x0F & (source[i] >> 4));
+		target[j+1] = 'A' + (0x0F & source[i]);
+		j += 2;
+	}
+
+}
+
+static int
+bind_socket(struct TCP_Server_Info *server)
+{
+	int rc = 0;
+	if (server->srcaddr.ss_family != AF_UNSPEC) {
+		/* Bind to the specified local IP address */
+		struct socket *socket = server->ssocket;
+		rc = socket->ops->bind(socket,
+				       (struct sockaddr *) &server->srcaddr,
+				       sizeof(server->srcaddr));
+		if (rc < 0) {
+			struct sockaddr_in *saddr4;
+			struct sockaddr_in6 *saddr6;
+			saddr4 = (struct sockaddr_in *)&server->srcaddr;
+			saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
+			if (saddr6->sin6_family == AF_INET6)
+				cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n",
+					 &saddr6->sin6_addr, rc);
+			else
+				cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n",
+					 &saddr4->sin_addr.s_addr, rc);
+		}
+	}
+	return rc;
+}
+
+static int
+ip_rfc1001_connect(struct TCP_Server_Info *server)
+{
+	int rc = 0;
+	/*
+	 * some servers require RFC1001 sessinit before sending
+	 * negprot - BB check reconnection in case where second
+	 * sessinit is sent but no second negprot
+	 */
+	struct rfc1002_session_packet *ses_init_buf;
+	struct smb_hdr *smb_buf;
+	ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
+			       GFP_KERNEL);
+	if (ses_init_buf) {
+		ses_init_buf->trailer.session_req.called_len = 32;
+
+		if (server->server_RFC1001_name &&
+		    server->server_RFC1001_name[0] != 0)
+			rfc1002mangle(ses_init_buf->trailer.
+				      session_req.called_name,
+				      server->server_RFC1001_name,
+				      RFC1001_NAME_LEN_WITH_NULL);
+		else
+			rfc1002mangle(ses_init_buf->trailer.
+				      session_req.called_name,
+				      DEFAULT_CIFS_CALLED_NAME,
+				      RFC1001_NAME_LEN_WITH_NULL);
+
+		ses_init_buf->trailer.session_req.calling_len = 32;
+
+		/*
+		 * calling name ends in null (byte 16) from old smb
+		 * convention.
+		 */
+		if (server->workstation_RFC1001_name[0] != 0)
+			rfc1002mangle(ses_init_buf->trailer.
+				      session_req.calling_name,
+				      server->workstation_RFC1001_name,
+				      RFC1001_NAME_LEN_WITH_NULL);
+		else
+			rfc1002mangle(ses_init_buf->trailer.
+				      session_req.calling_name,
+				      "LINUX_CIFS_CLNT",
+				      RFC1001_NAME_LEN_WITH_NULL);
+
+		ses_init_buf->trailer.session_req.scope1 = 0;
+		ses_init_buf->trailer.session_req.scope2 = 0;
+		smb_buf = (struct smb_hdr *)ses_init_buf;
+
+		/* sizeof RFC1002_SESSION_REQUEST with no scope */
+		smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
+		rc = smb_send(server, smb_buf, 0x44);
+		kfree(ses_init_buf);
+		/*
+		 * RFC1001 layer in at least one server
+		 * requires very short break before negprot
+		 * presumably because not expecting negprot
+		 * to follow so fast.  This is a simple
+		 * solution that works without
+		 * complicating the code and causes no
+		 * significant slowing down on mount
+		 * for everyone else
+		 */
+		usleep_range(1000, 2000);
+	}
+	/*
+	 * else the negprot may still work without this
+	 * even though malloc failed
+	 */
+
+	return rc;
+}
+
+static int
+generic_ip_connect(struct TCP_Server_Info *server)
+{
+	int rc = 0;
+	__be16 sport;
+	int slen, sfamily;
+	struct socket *socket = server->ssocket;
+	struct sockaddr *saddr;
+
+	saddr = (struct sockaddr *) &server->dstaddr;
+
+	if (server->dstaddr.ss_family == AF_INET6) {
+		sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
+		slen = sizeof(struct sockaddr_in6);
+		sfamily = AF_INET6;
+	} else {
+		sport = ((struct sockaddr_in *) saddr)->sin_port;
+		slen = sizeof(struct sockaddr_in);
+		sfamily = AF_INET;
+	}
+
+	if (socket == NULL) {
+		rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
+				   IPPROTO_TCP, &socket, 1);
+		if (rc < 0) {
+			cifs_dbg(VFS, "Error %d creating socket\n", rc);
+			server->ssocket = NULL;
+			return rc;
+		}
+
+		/* BB other socket options to set KEEPALIVE, NODELAY? */
+		cifs_dbg(FYI, "Socket created\n");
+		server->ssocket = socket;
+		socket->sk->sk_allocation = GFP_NOFS;
+		if (sfamily == AF_INET6)
+			cifs_reclassify_socket6(socket);
+		else
+			cifs_reclassify_socket4(socket);
+	}
+
+	rc = bind_socket(server);
+	if (rc < 0)
+		return rc;
+
+	/*
+	 * Eventually check for other socket options to change from
+	 * the default. sock_setsockopt not used because it expects
+	 * user space buffer
+	 */
+	socket->sk->sk_rcvtimeo = 7 * HZ;
+	socket->sk->sk_sndtimeo = 5 * HZ;
+
+	/* make the bufsizes depend on wsize/rsize and max requests */
+	if (server->noautotune) {
+		if (socket->sk->sk_sndbuf < (200 * 1024))
+			socket->sk->sk_sndbuf = 200 * 1024;
+		if (socket->sk->sk_rcvbuf < (140 * 1024))
+			socket->sk->sk_rcvbuf = 140 * 1024;
+	}
+
+	if (server->tcp_nodelay) {
+		int val = 1;
+		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
+				(char *)&val, sizeof(val));
+		if (rc)
+			cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n",
+				 rc);
+	}
+
+	cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n",
+		 socket->sk->sk_sndbuf,
+		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
+
+	rc = socket->ops->connect(socket, saddr, slen, 0);
+	if (rc < 0) {
+		cifs_dbg(FYI, "Error %d connecting to server\n", rc);
+		sock_release(socket);
+		server->ssocket = NULL;
+		return rc;
+	}
+
+	if (sport == htons(RFC1001_PORT))
+		rc = ip_rfc1001_connect(server);
+
+	return rc;
+}
+
+static int
+ip_connect(struct TCP_Server_Info *server)
+{
+	__be16 *sport;
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
+	struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
+
+	if (server->dstaddr.ss_family == AF_INET6)
+		sport = &addr6->sin6_port;
+	else
+		sport = &addr->sin_port;
+
+	if (*sport == 0) {
+		int rc;
+
+		/* try with 445 port at first */
+		*sport = htons(CIFS_PORT);
+
+		rc = generic_ip_connect(server);
+		if (rc >= 0)
+			return rc;
+
+		/* if it failed, try with 139 port */
+		*sport = htons(RFC1001_PORT);
+	}
+
+	return generic_ip_connect(server);
+}
+
+void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
+			  struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info)
+{
+	/* if we are reconnecting then should we check to see if
+	 * any requested capabilities changed locally e.g. via
+	 * remount but we can not do much about it here
+	 * if they have (even if we could detect it by the following)
+	 * Perhaps we could add a backpointer to array of sb from tcon
+	 * or if we change to make all sb to same share the same
+	 * sb as NFS - then we only have one backpointer to sb.
+	 * What if we wanted to mount the server share twice once with
+	 * and once without posixacls or posix paths? */
+	__u64 saved_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+
+	if (vol_info && vol_info->no_linux_ext) {
+		tcon->fsUnixInfo.Capability = 0;
+		tcon->unix_ext = 0; /* Unix Extensions disabled */
+		cifs_dbg(FYI, "Linux protocol extensions disabled\n");
+		return;
+	} else if (vol_info)
+		tcon->unix_ext = 1; /* Unix Extensions supported */
+
+	if (tcon->unix_ext == 0) {
+		cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n");
+		return;
+	}
+
+	if (!CIFSSMBQFSUnixInfo(xid, tcon)) {
+		__u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+		cifs_dbg(FYI, "unix caps which server supports %lld\n", cap);
+		/* check for reconnect case in which we do not
+		   want to change the mount behavior if we can avoid it */
+		if (vol_info == NULL) {
+			/* turn off POSIX ACL and PATHNAMES if not set
+			   originally at mount time */
+			if ((saved_cap & CIFS_UNIX_POSIX_ACL_CAP) == 0)
+				cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
+			if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
+				if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
+					cifs_dbg(VFS, "POSIXPATH support change\n");
+				cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
+			} else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
+				cifs_dbg(VFS, "possible reconnect error\n");
+				cifs_dbg(VFS, "server disabled POSIX path support\n");
+			}
+		}
+
+		if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
+			cifs_dbg(VFS, "per-share encryption not supported yet\n");
+
+		cap &= CIFS_UNIX_CAP_MASK;
+		if (vol_info && vol_info->no_psx_acl)
+			cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
+		else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
+			cifs_dbg(FYI, "negotiated posix acl support\n");
+			if (cifs_sb)
+				cifs_sb->mnt_cifs_flags |=
+					CIFS_MOUNT_POSIXACL;
+		}
+
+		if (vol_info && vol_info->posix_paths == 0)
+			cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
+		else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
+			cifs_dbg(FYI, "negotiate posix pathnames\n");
+			if (cifs_sb)
+				cifs_sb->mnt_cifs_flags |=
+					CIFS_MOUNT_POSIX_PATHS;
+		}
+
+		cifs_dbg(FYI, "Negotiate caps 0x%x\n", (int)cap);
+#ifdef CONFIG_CIFS_DEBUG2
+		if (cap & CIFS_UNIX_FCNTL_CAP)
+			cifs_dbg(FYI, "FCNTL cap\n");
+		if (cap & CIFS_UNIX_EXTATTR_CAP)
+			cifs_dbg(FYI, "EXTATTR cap\n");
+		if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
+			cifs_dbg(FYI, "POSIX path cap\n");
+		if (cap & CIFS_UNIX_XATTR_CAP)
+			cifs_dbg(FYI, "XATTR cap\n");
+		if (cap & CIFS_UNIX_POSIX_ACL_CAP)
+			cifs_dbg(FYI, "POSIX ACL cap\n");
+		if (cap & CIFS_UNIX_LARGE_READ_CAP)
+			cifs_dbg(FYI, "very large read cap\n");
+		if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
+			cifs_dbg(FYI, "very large write cap\n");
+		if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP)
+			cifs_dbg(FYI, "transport encryption cap\n");
+		if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
+			cifs_dbg(FYI, "mandatory transport encryption cap\n");
+#endif /* CIFS_DEBUG2 */
+		if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
+			if (vol_info == NULL) {
+				cifs_dbg(FYI, "resetting capabilities failed\n");
+			} else
+				cifs_dbg(VFS, "Negotiating Unix capabilities with the server failed. Consider mounting with the Unix Extensions disabled if problems are found by specifying the nounix mount option.\n");
+
+		}
+	}
+}
+
+void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+			struct cifs_sb_info *cifs_sb)
+{
+	INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+
+	spin_lock_init(&cifs_sb->tlink_tree_lock);
+	cifs_sb->tlink_tree = RB_ROOT;
+
+	/*
+	 * Temporarily set r/wsize for matching superblock. If we end up using
+	 * new sb then client will later negotiate it downward if needed.
+	 */
+	cifs_sb->rsize = pvolume_info->rsize;
+	cifs_sb->wsize = pvolume_info->wsize;
+
+	cifs_sb->mnt_uid = pvolume_info->linux_uid;
+	cifs_sb->mnt_gid = pvolume_info->linux_gid;
+	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
+	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
+	cifs_dbg(FYI, "file mode: 0x%hx  dir mode: 0x%hx\n",
+		 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
+
+	cifs_sb->actimeo = pvolume_info->actimeo;
+	cifs_sb->local_nls = pvolume_info->local_nls;
+
+	if (pvolume_info->noperm)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
+	if (pvolume_info->setuids)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
+	if (pvolume_info->server_ino)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
+	if (pvolume_info->remap)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SFM_CHR;
+	if (pvolume_info->sfu_remap)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
+	if (pvolume_info->no_xattr)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
+	if (pvolume_info->sfu_emul)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
+	if (pvolume_info->nobrl)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+	if (pvolume_info->nostrictsync)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
+	if (pvolume_info->mand_lock)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
+	if (pvolume_info->rwpidforward)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
+	if (pvolume_info->cifs_acl)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
+	if (pvolume_info->backupuid_specified) {
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID;
+		cifs_sb->mnt_backupuid = pvolume_info->backupuid;
+	}
+	if (pvolume_info->backupgid_specified) {
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID;
+		cifs_sb->mnt_backupgid = pvolume_info->backupgid;
+	}
+	if (pvolume_info->override_uid)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
+	if (pvolume_info->override_gid)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
+	if (pvolume_info->dynperm)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+	if (pvolume_info->fsc)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
+	if (pvolume_info->multiuser)
+		cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
+					    CIFS_MOUNT_NO_PERM);
+	if (pvolume_info->strict_io)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
+	if (pvolume_info->direct_io) {
+		cifs_dbg(FYI, "mounting share using direct i/o\n");
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
+	}
+	if (pvolume_info->mfsymlinks) {
+		if (pvolume_info->sfu_emul) {
+			/*
+			 * Our SFU ("Services for Unix" emulation does not allow
+			 * creating symlinks but does allow reading existing SFU
+			 * symlinks (it does allow both creating and reading SFU
+			 * style mknod and FIFOs though). When "mfsymlinks" and
+			 * "sfu" are both enabled at the same time, it allows
+			 * reading both types of symlinks, but will only create
+			 * them with mfsymlinks format. This allows better
+			 * Apple compatibility (probably better for Samba too)
+			 * while still recognizing old Windows style symlinks.
+			 */
+			cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n");
+		}
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
+	}
+
+	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
+		cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n");
+}
+
+static void
+cleanup_volume_info_contents(struct smb_vol *volume_info)
+{
+	kfree(volume_info->username);
+	kzfree(volume_info->password);
+	kfree(volume_info->UNC);
+	kfree(volume_info->domainname);
+	kfree(volume_info->iocharset);
+	kfree(volume_info->prepath);
+}
+
+void
+cifs_cleanup_volume_info(struct smb_vol *volume_info)
+{
+	if (!volume_info)
+		return;
+	cleanup_volume_info_contents(volume_info);
+	kfree(volume_info);
+}
+
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+/*
+ * cifs_build_path_to_root returns full path to root when we do not have an
+ * exiting connection (tcon)
+ */
+static char *
+build_unc_path_to_root(const struct smb_vol *vol,
+		const struct cifs_sb_info *cifs_sb)
+{
+	char *full_path, *pos;
+	unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+	unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
+
+	full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
+	if (full_path == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	strncpy(full_path, vol->UNC, unc_len);
+	pos = full_path + unc_len;
+
+	if (pplen) {
+		*pos = CIFS_DIR_SEP(cifs_sb);
+		strncpy(pos + 1, vol->prepath, pplen);
+		pos += pplen;
+	}
+
+	*pos = '\0'; /* add trailing null */
+	convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
+	cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path);
+	return full_path;
+}
+
+/*
+ * Perform a dfs referral query for a share and (optionally) prefix
+ *
+ * If a referral is found, cifs_sb->mountdata will be (re-)allocated
+ * to a string containing updated options for the submount.  Otherwise it
+ * will be left untouched.
+ *
+ * Returns the rc from get_dfs_path to the caller, which can be used to
+ * determine whether there were referrals.
+ */
+static int
+expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
+		    struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb,
+		    int check_prefix)
+{
+	int rc;
+	unsigned int num_referrals = 0;
+	struct dfs_info3_param *referrals = NULL;
+	char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
+
+	full_path = build_unc_path_to_root(volume_info, cifs_sb);
+	if (IS_ERR(full_path))
+		return PTR_ERR(full_path);
+
+	/* For DFS paths, skip the first '\' of the UNC */
+	ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
+
+	rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls,
+			  &num_referrals, &referrals, cifs_remap(cifs_sb));
+
+	if (!rc && num_referrals > 0) {
+		char *fake_devname = NULL;
+
+		mdata = cifs_compose_mount_options(cifs_sb->mountdata,
+						   full_path + 1, referrals,
+						   &fake_devname);
+
+		free_dfs_info_array(referrals, num_referrals);
+
+		if (IS_ERR(mdata)) {
+			rc = PTR_ERR(mdata);
+			mdata = NULL;
+		} else {
+			cleanup_volume_info_contents(volume_info);
+			rc = cifs_setup_volume_info(volume_info, mdata,
+							fake_devname);
+		}
+		kfree(fake_devname);
+		kfree(cifs_sb->mountdata);
+		cifs_sb->mountdata = mdata;
+	}
+	kfree(full_path);
+	return rc;
+}
+#endif
+
+static int
+cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
+			const char *devname)
+{
+	int rc = 0;
+
+	if (cifs_parse_mount_options(mount_data, devname, volume_info))
+		return -EINVAL;
+
+	if (volume_info->nullauth) {
+		cifs_dbg(FYI, "Anonymous login\n");
+		kfree(volume_info->username);
+		volume_info->username = NULL;
+	} else if (volume_info->username) {
+		/* BB fixme parse for domain name here */
+		cifs_dbg(FYI, "Username: %s\n", volume_info->username);
+	} else {
+		cifs_dbg(VFS, "No username specified\n");
+	/* In userspace mount helper we can get user name from alternate
+	   locations such as env variables and files on disk */
+		return -EINVAL;
+	}
+
+	/* this is needed for ASCII cp to Unicode converts */
+	if (volume_info->iocharset == NULL) {
+		/* load_nls_default cannot return null */
+		volume_info->local_nls = load_nls_default();
+	} else {
+		volume_info->local_nls = load_nls(volume_info->iocharset);
+		if (volume_info->local_nls == NULL) {
+			cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n",
+				 volume_info->iocharset);
+			return -ELIBACC;
+		}
+	}
+
+	return rc;
+}
+
+struct smb_vol *
+cifs_get_volume_info(char *mount_data, const char *devname)
+{
+	int rc;
+	struct smb_vol *volume_info;
+
+	volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL);
+	if (!volume_info)
+		return ERR_PTR(-ENOMEM);
+
+	rc = cifs_setup_volume_info(volume_info, mount_data, devname);
+	if (rc) {
+		cifs_cleanup_volume_info(volume_info);
+		volume_info = ERR_PTR(rc);
+	}
+
+	return volume_info;
+}
+
+int
+cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
+{
+	int rc;
+	unsigned int xid;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	char   *full_path;
+	struct tcon_link *tlink;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	int referral_walks_count = 0;
+#endif
+
+	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
+	if (rc)
+		return rc;
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+try_mount_again:
+	/* cleanup activities if we're chasing a referral */
+	if (referral_walks_count) {
+		if (tcon)
+			cifs_put_tcon(tcon);
+		else if (ses)
+			cifs_put_smb_ses(ses);
+
+		free_xid(xid);
+	}
+#endif
+	rc = 0;
+	tcon = NULL;
+	ses = NULL;
+	server = NULL;
+	full_path = NULL;
+	tlink = NULL;
+
+	xid = get_xid();
+
+	/* get a reference to a tcp session */
+	server = cifs_get_tcp_session(volume_info);
+	if (IS_ERR(server)) {
+		rc = PTR_ERR(server);
+		bdi_destroy(&cifs_sb->bdi);
+		goto out;
+	}
+
+	/* get a reference to a SMB session */
+	ses = cifs_get_smb_ses(server, volume_info);
+	if (IS_ERR(ses)) {
+		rc = PTR_ERR(ses);
+		ses = NULL;
+		goto mount_fail_check;
+	}
+
+	/* search for existing tcon to this server share */
+	tcon = cifs_get_tcon(ses, volume_info);
+	if (IS_ERR(tcon)) {
+		rc = PTR_ERR(tcon);
+		tcon = NULL;
+		goto remote_path_check;
+	}
+
+	/* tell server which Unix caps we support */
+	if (cap_unix(tcon->ses)) {
+		/* reset of caps checks mount to see if unix extensions
+		   disabled for just this mount */
+		reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info);
+		if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
+		    (le64_to_cpu(tcon->fsUnixInfo.Capability) &
+		     CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
+			rc = -EACCES;
+			goto mount_fail_check;
+		}
+	} else
+		tcon->unix_ext = 0; /* server does not support them */
+
+	/* do not care if a following call succeed - informational */
+	if (!tcon->ipc && server->ops->qfs_tcon)
+		server->ops->qfs_tcon(xid, tcon);
+
+	cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
+	cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
+
+	/* tune readahead according to rsize */
+	cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
+
+remote_path_check:
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	/*
+	 * Perform an unconditional check for whether there are DFS
+	 * referrals for this path without prefix, to provide support
+	 * for DFS referrals from w2k8 servers which don't seem to respond
+	 * with PATH_NOT_COVERED to requests that include the prefix.
+	 * Chase the referral if found, otherwise continue normally.
+	 */
+	if (referral_walks_count == 0) {
+		int refrc = expand_dfs_referral(xid, ses, volume_info, cifs_sb,
+						false);
+		if (!refrc) {
+			referral_walks_count++;
+			goto try_mount_again;
+		}
+	}
+#endif
+
+	/* check if a whole path is not remote */
+	if (!rc && tcon) {
+		if (!server->ops->is_path_accessible) {
+			rc = -ENOSYS;
+			goto mount_fail_check;
+		}
+		/*
+		 * cifs_build_path_to_root works only when we have a valid tcon
+		 */
+		full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
+		if (full_path == NULL) {
+			rc = -ENOMEM;
+			goto mount_fail_check;
+		}
+		rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
+						     full_path);
+		if (rc != 0 && rc != -EREMOTE) {
+			kfree(full_path);
+			goto mount_fail_check;
+		}
+		kfree(full_path);
+	}
+
+	/* get referral if needed */
+	if (rc == -EREMOTE) {
+#ifdef CONFIG_CIFS_DFS_UPCALL
+		if (referral_walks_count > MAX_NESTED_LINKS) {
+			/*
+			 * BB: when we implement proper loop detection,
+			 *     we will remove this check. But now we need it
+			 *     to prevent an indefinite loop if 'DFS tree' is
+			 *     misconfigured (i.e. has loops).
+			 */
+			rc = -ELOOP;
+			goto mount_fail_check;
+		}
+
+		rc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, true);
+
+		if (!rc) {
+			referral_walks_count++;
+			goto try_mount_again;
+		}
+		goto mount_fail_check;
+#else /* No DFS support, return error on mount */
+		rc = -EOPNOTSUPP;
+#endif
+	}
+
+	if (rc)
+		goto mount_fail_check;
+
+	/* now, hang the tcon off of the superblock */
+	tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
+	if (tlink == NULL) {
+		rc = -ENOMEM;
+		goto mount_fail_check;
+	}
+
+	tlink->tl_uid = ses->linux_uid;
+	tlink->tl_tcon = tcon;
+	tlink->tl_time = jiffies;
+	set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
+	set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+
+	cifs_sb->master_tlink = tlink;
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
+				TLINK_IDLE_EXPIRE);
+
+mount_fail_check:
+	/* on error free sesinfo and tcon struct if needed */
+	if (rc) {
+		/* If find_unc succeeded then rc == 0 so we can not end */
+		/* up accidentally freeing someone elses tcon struct */
+		if (tcon)
+			cifs_put_tcon(tcon);
+		else if (ses)
+			cifs_put_smb_ses(ses);
+		else
+			cifs_put_tcp_session(server);
+		bdi_destroy(&cifs_sb->bdi);
+	}
+
+out:
+	free_xid(xid);
+	return rc;
+}
+
+/*
+ * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon
+ * pointer may be NULL.
+ */
+int
+CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
+	 const char *tree, struct cifs_tcon *tcon,
+	 const struct nls_table *nls_codepage)
+{
+	struct smb_hdr *smb_buffer;
+	struct smb_hdr *smb_buffer_response;
+	TCONX_REQ *pSMB;
+	TCONX_RSP *pSMBr;
+	unsigned char *bcc_ptr;
+	int rc = 0;
+	int length;
+	__u16 bytes_left, count;
+
+	if (ses == NULL)
+		return -EIO;
+
+	smb_buffer = cifs_buf_get();
+	if (smb_buffer == NULL)
+		return -ENOMEM;
+
+	smb_buffer_response = smb_buffer;
+
+	header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
+			NULL /*no tid */ , 4 /*wct */ );
+
+	smb_buffer->Mid = get_next_mid(ses->server);
+	smb_buffer->Uid = ses->Suid;
+	pSMB = (TCONX_REQ *) smb_buffer;
+	pSMBr = (TCONX_RSP *) smb_buffer_response;
+
+	pSMB->AndXCommand = 0xFF;
+	pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
+	bcc_ptr = &pSMB->Password[0];
+	if (!tcon || (ses->server->sec_mode & SECMODE_USER)) {
+		pSMB->PasswordLength = cpu_to_le16(1);	/* minimum */
+		*bcc_ptr = 0; /* password is null byte */
+		bcc_ptr++;              /* skip password */
+		/* already aligned so no need to do it below */
+	} else {
+		pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+		/* BB FIXME add code to fail this if NTLMv2 or Kerberos
+		   specified as required (when that support is added to
+		   the vfs in the future) as only NTLM or the much
+		   weaker LANMAN (which we do not send by default) is accepted
+		   by Samba (not sure whether other servers allow
+		   NTLMv2 password here) */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+		if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
+		    (ses->sectype == LANMAN))
+			calc_lanman_hash(tcon->password, ses->server->cryptkey,
+					 ses->server->sec_mode &
+					    SECMODE_PW_ENCRYPT ? true : false,
+					 bcc_ptr);
+		else
+#endif /* CIFS_WEAK_PW_HASH */
+		rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
+					bcc_ptr, nls_codepage);
+		if (rc) {
+			cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n",
+				 __func__, rc);
+			cifs_buf_release(smb_buffer);
+			return rc;
+		}
+
+		bcc_ptr += CIFS_AUTH_RESP_SIZE;
+		if (ses->capabilities & CAP_UNICODE) {
+			/* must align unicode strings */
+			*bcc_ptr = 0; /* null byte password */
+			bcc_ptr++;
+		}
+	}
+
+	if (ses->server->sign)
+		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+	if (ses->capabilities & CAP_STATUS32) {
+		smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
+	}
+	if (ses->capabilities & CAP_DFS) {
+		smb_buffer->Flags2 |= SMBFLG2_DFS;
+	}
+	if (ses->capabilities & CAP_UNICODE) {
+		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
+		length =
+		    cifs_strtoUTF16((__le16 *) bcc_ptr, tree,
+			6 /* max utf8 char length in bytes */ *
+			(/* server len*/ + 256 /* share len */), nls_codepage);
+		bcc_ptr += 2 * length;	/* convert num 16 bit words to bytes */
+		bcc_ptr += 2;	/* skip trailing null */
+	} else {		/* ASCII */
+		strcpy(bcc_ptr, tree);
+		bcc_ptr += strlen(tree) + 1;
+	}
+	strcpy(bcc_ptr, "?????");
+	bcc_ptr += strlen("?????");
+	bcc_ptr += 1;
+	count = bcc_ptr - &pSMB->Password[0];
+	pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu(
+					pSMB->hdr.smb_buf_length) + count);
+	pSMB->ByteCount = cpu_to_le16(count);
+
+	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
+			 0);
+
+	/* above now done in SendReceive */
+	if ((rc == 0) && (tcon != NULL)) {
+		bool is_unicode;
+
+		tcon->tidStatus = CifsGood;
+		tcon->need_reconnect = false;
+		tcon->tid = smb_buffer_response->Tid;
+		bcc_ptr = pByteArea(smb_buffer_response);
+		bytes_left = get_bcc(smb_buffer_response);
+		length = strnlen(bcc_ptr, bytes_left - 2);
+		if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
+			is_unicode = true;
+		else
+			is_unicode = false;
+
+
+		/* skip service field (NB: this field is always ASCII) */
+		if (length == 3) {
+			if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
+			    (bcc_ptr[2] == 'C')) {
+				cifs_dbg(FYI, "IPC connection\n");
+				tcon->ipc = 1;
+			}
+		} else if (length == 2) {
+			if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
+				/* the most common case */
+				cifs_dbg(FYI, "disk share connection\n");
+			}
+		}
+		bcc_ptr += length + 1;
+		bytes_left -= (length + 1);
+		strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
+
+		/* mostly informational -- no need to fail on error here */
+		kfree(tcon->nativeFileSystem);
+		tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,
+						      bytes_left, is_unicode,
+						      nls_codepage);
+
+		cifs_dbg(FYI, "nativeFileSystem=%s\n", tcon->nativeFileSystem);
+
+		if ((smb_buffer_response->WordCount == 3) ||
+			 (smb_buffer_response->WordCount == 7))
+			/* field is in same location */
+			tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
+		else
+			tcon->Flags = 0;
+		cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags);
+	} else if ((rc == 0) && tcon == NULL) {
+		/* all we need to save for IPC$ connection */
+		ses->ipc_tid = smb_buffer_response->Tid;
+	}
+
+	cifs_buf_release(smb_buffer);
+	return rc;
+}
+
+static void delayed_free(struct rcu_head *p)
+{
+	struct cifs_sb_info *sbi = container_of(p, struct cifs_sb_info, rcu);
+	unload_nls(sbi->local_nls);
+	kfree(sbi);
+}
+
+void
+cifs_umount(struct cifs_sb_info *cifs_sb)
+{
+	struct rb_root *root = &cifs_sb->tlink_tree;
+	struct rb_node *node;
+	struct tcon_link *tlink;
+
+	cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
+
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	while ((node = rb_first(root))) {
+		tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+		cifs_get_tlink(tlink);
+		clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+		rb_erase(node, root);
+
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+		cifs_put_tlink(tlink);
+		spin_lock(&cifs_sb->tlink_tree_lock);
+	}
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	bdi_destroy(&cifs_sb->bdi);
+	kfree(cifs_sb->mountdata);
+	call_rcu(&cifs_sb->rcu, delayed_free);
+}
+
+int
+cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
+{
+	int rc = 0;
+	struct TCP_Server_Info *server = ses->server;
+
+	if (!server->ops->need_neg || !server->ops->negotiate)
+		return -ENOSYS;
+
+	/* only send once per connect */
+	if (!server->ops->need_neg(server))
+		return 0;
+
+	set_credits(server, 1);
+
+	rc = server->ops->negotiate(xid, ses);
+	if (rc == 0) {
+		spin_lock(&GlobalMid_Lock);
+		if (server->tcpStatus == CifsNeedNegotiate)
+			server->tcpStatus = CifsGood;
+		else
+			rc = -EHOSTDOWN;
+		spin_unlock(&GlobalMid_Lock);
+	}
+
+	return rc;
+}
+
+int
+cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+		   struct nls_table *nls_info)
+{
+	int rc = -ENOSYS;
+	struct TCP_Server_Info *server = ses->server;
+
+	ses->capabilities = server->capabilities;
+	if (linuxExtEnabled == 0)
+		ses->capabilities &= (~server->vals->cap_unix);
+
+	cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n",
+		 server->sec_mode, server->capabilities, server->timeAdj);
+
+	if (server->ops->sess_setup)
+		rc = server->ops->sess_setup(xid, ses, nls_info);
+
+	if (rc)
+		cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc);
+
+	return rc;
+}
+
+static int
+cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
+{
+	vol->sectype = ses->sectype;
+
+	/* krb5 is special, since we don't need username or pw */
+	if (vol->sectype == Kerberos)
+		return 0;
+
+	return cifs_set_cifscreds(vol, ses);
+}
+
+static struct cifs_tcon *
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
+{
+	int rc;
+	struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon = NULL;
+	struct smb_vol *vol_info;
+
+	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
+	if (vol_info == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	vol_info->local_nls = cifs_sb->local_nls;
+	vol_info->linux_uid = fsuid;
+	vol_info->cred_uid = fsuid;
+	vol_info->UNC = master_tcon->treeName;
+	vol_info->retry = master_tcon->retry;
+	vol_info->nocase = master_tcon->nocase;
+	vol_info->local_lease = master_tcon->local_lease;
+	vol_info->no_linux_ext = !master_tcon->unix_ext;
+	vol_info->sectype = master_tcon->ses->sectype;
+	vol_info->sign = master_tcon->ses->sign;
+
+	rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
+	if (rc) {
+		tcon = ERR_PTR(rc);
+		goto out;
+	}
+
+	/* get a reference for the same TCP session */
+	spin_lock(&cifs_tcp_ses_lock);
+	++master_tcon->ses->server->srv_count;
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
+	if (IS_ERR(ses)) {
+		tcon = (struct cifs_tcon *)ses;
+		cifs_put_tcp_session(master_tcon->ses->server);
+		goto out;
+	}
+
+	tcon = cifs_get_tcon(ses, vol_info);
+	if (IS_ERR(tcon)) {
+		cifs_put_smb_ses(ses);
+		goto out;
+	}
+
+	if (cap_unix(ses))
+		reset_cifs_unix_caps(0, tcon, NULL, vol_info);
+out:
+	kfree(vol_info->username);
+	kfree(vol_info->password);
+	kfree(vol_info);
+
+	return tcon;
+}
+
+struct cifs_tcon *
+cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
+{
+	return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
+}
+
+/* find and return a tlink with given uid */
+static struct tcon_link *
+tlink_rb_search(struct rb_root *root, kuid_t uid)
+{
+	struct rb_node *node = root->rb_node;
+	struct tcon_link *tlink;
+
+	while (node) {
+		tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+
+		if (uid_gt(tlink->tl_uid, uid))
+			node = node->rb_left;
+		else if (uid_lt(tlink->tl_uid, uid))
+			node = node->rb_right;
+		else
+			return tlink;
+	}
+	return NULL;
+}
+
+/* insert a tcon_link into the tree */
+static void
+tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct tcon_link *tlink;
+
+	while (*new) {
+		tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
+		parent = *new;
+
+		if (uid_gt(tlink->tl_uid, new_tlink->tl_uid))
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&new_tlink->tl_rbnode, parent, new);
+	rb_insert_color(&new_tlink->tl_rbnode, root);
+}
+
+/*
+ * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
+ * current task.
+ *
+ * If the superblock doesn't refer to a multiuser mount, then just return
+ * the master tcon for the mount.
+ *
+ * First, search the rbtree for an existing tcon for this fsuid. If one
+ * exists, then check to see if it's pending construction. If it is then wait
+ * for construction to complete. Once it's no longer pending, check to see if
+ * it failed and either return an error or retry construction, depending on
+ * the timeout.
+ *
+ * If one doesn't exist then insert a new tcon_link struct into the tree and
+ * try to construct a new one.
+ */
+struct tcon_link *
+cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
+{
+	int ret;
+	kuid_t fsuid = current_fsuid();
+	struct tcon_link *tlink, *newtlink;
+
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
+
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
+	if (tlink)
+		cifs_get_tlink(tlink);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	if (tlink == NULL) {
+		newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
+		if (newtlink == NULL)
+			return ERR_PTR(-ENOMEM);
+		newtlink->tl_uid = fsuid;
+		newtlink->tl_tcon = ERR_PTR(-EACCES);
+		set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
+		set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
+		cifs_get_tlink(newtlink);
+
+		spin_lock(&cifs_sb->tlink_tree_lock);
+		/* was one inserted after previous search? */
+		tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
+		if (tlink) {
+			cifs_get_tlink(tlink);
+			spin_unlock(&cifs_sb->tlink_tree_lock);
+			kfree(newtlink);
+			goto wait_for_construction;
+		}
+		tlink = newtlink;
+		tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+	} else {
+wait_for_construction:
+		ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
+				  TASK_INTERRUPTIBLE);
+		if (ret) {
+			cifs_put_tlink(tlink);
+			return ERR_PTR(-ERESTARTSYS);
+		}
+
+		/* if it's good, return it */
+		if (!IS_ERR(tlink->tl_tcon))
+			return tlink;
+
+		/* return error if we tried this already recently */
+		if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
+			cifs_put_tlink(tlink);
+			return ERR_PTR(-EACCES);
+		}
+
+		if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
+			goto wait_for_construction;
+	}
+
+	tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
+	clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
+	wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
+
+	if (IS_ERR(tlink->tl_tcon)) {
+		cifs_put_tlink(tlink);
+		return ERR_PTR(-EACCES);
+	}
+
+	return tlink;
+}
+
+/*
+ * periodic workqueue job that scans tcon_tree for a superblock and closes
+ * out tcons.
+ */
+static void
+cifs_prune_tlinks(struct work_struct *work)
+{
+	struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
+						    prune_tlinks.work);
+	struct rb_root *root = &cifs_sb->tlink_tree;
+	struct rb_node *node = rb_first(root);
+	struct rb_node *tmp;
+	struct tcon_link *tlink;
+
+	/*
+	 * Because we drop the spinlock in the loop in order to put the tlink
+	 * it's not guarded against removal of links from the tree. The only
+	 * places that remove entries from the tree are this function and
+	 * umounts. Because this function is non-reentrant and is canceled
+	 * before umount can proceed, this is safe.
+	 */
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	node = rb_first(root);
+	while (node != NULL) {
+		tmp = node;
+		node = rb_next(tmp);
+		tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
+
+		if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
+		    atomic_read(&tlink->tl_count) != 0 ||
+		    time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
+			continue;
+
+		cifs_get_tlink(tlink);
+		clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+		rb_erase(tmp, root);
+
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+		cifs_put_tlink(tlink);
+		spin_lock(&cifs_sb->tlink_tree_lock);
+	}
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
+				TLINK_IDLE_EXPIRE);
+}
diff --git a/kernel/fs/cifs/dir.c b/kernel/fs/cifs/dir.c
new file mode 100644
index 000000000..c3eb998a9
--- /dev/null
+++ b/kernel/fs/cifs/dir.c
@@ -0,0 +1,924 @@
+/*
+ *   fs/cifs/dir.c
+ *
+ *   vfs operations that deal with dentries
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2009
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+
+static void
+renew_parental_timestamps(struct dentry *direntry)
+{
+	/* BB check if there is a way to get the kernel to do this or if we
+	   really need this */
+	do {
+		direntry->d_time = jiffies;
+		direntry = direntry->d_parent;
+	} while (!IS_ROOT(direntry));
+}
+
+char *
+cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
+			struct cifs_tcon *tcon)
+{
+	int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+	int dfsplen;
+	char *full_path = NULL;
+
+	/* if no prefix path, simply set path to the root of share to "" */
+	if (pplen == 0) {
+		full_path = kzalloc(1, GFP_KERNEL);
+		return full_path;
+	}
+
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+	else
+		dfsplen = 0;
+
+	full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
+	if (full_path == NULL)
+		return full_path;
+
+	if (dfsplen)
+		strncpy(full_path, tcon->treeName, dfsplen);
+	full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
+	strncpy(full_path + dfsplen + 1, vol->prepath, pplen);
+	convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
+	full_path[dfsplen + pplen] = 0; /* add trailing null */
+	return full_path;
+}
+
+/* Note: caller must free return buffer */
+char *
+build_path_from_dentry(struct dentry *direntry)
+{
+	struct dentry *temp;
+	int namelen;
+	int dfsplen;
+	char *full_path;
+	char dirsep;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	unsigned seq;
+
+	dirsep = CIFS_DIR_SEP(cifs_sb);
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+	else
+		dfsplen = 0;
+cifs_bp_rename_retry:
+	namelen = dfsplen;
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
+	for (temp = direntry; !IS_ROOT(temp);) {
+		namelen += (1 + temp->d_name.len);
+		temp = temp->d_parent;
+		if (temp == NULL) {
+			cifs_dbg(VFS, "corrupt dentry\n");
+			rcu_read_unlock();
+			return NULL;
+		}
+	}
+	rcu_read_unlock();
+
+	full_path = kmalloc(namelen+1, GFP_KERNEL);
+	if (full_path == NULL)
+		return full_path;
+	full_path[namelen] = 0;	/* trailing null */
+	rcu_read_lock();
+	for (temp = direntry; !IS_ROOT(temp);) {
+		spin_lock(&temp->d_lock);
+		namelen -= 1 + temp->d_name.len;
+		if (namelen < 0) {
+			spin_unlock(&temp->d_lock);
+			break;
+		} else {
+			full_path[namelen] = dirsep;
+			strncpy(full_path + namelen + 1, temp->d_name.name,
+				temp->d_name.len);
+			cifs_dbg(FYI, "name: %s\n", full_path + namelen);
+		}
+		spin_unlock(&temp->d_lock);
+		temp = temp->d_parent;
+		if (temp == NULL) {
+			cifs_dbg(VFS, "corrupt dentry\n");
+			rcu_read_unlock();
+			kfree(full_path);
+			return NULL;
+		}
+	}
+	rcu_read_unlock();
+	if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
+		cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n",
+			 namelen, dfsplen);
+		/* presumably this is only possible if racing with a rename
+		of one of the parent directories  (we can not lock the dentries
+		above us to prevent this, but retrying should be harmless) */
+		kfree(full_path);
+		goto cifs_bp_rename_retry;
+	}
+	/* DIR_SEP already set for byte  0 / vs \ but not for
+	   subsequent slashes in prepath which currently must
+	   be entered the right way - not sure if there is an alternative
+	   since the '\' is a valid posix character so we can not switch
+	   those safely to '/' if any are found in the middle of the prepath */
+	/* BB test paths to Windows with '/' in the midst of prepath */
+
+	if (dfsplen) {
+		strncpy(full_path, tcon->treeName, dfsplen);
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
+			int i;
+			for (i = 0; i < dfsplen; i++) {
+				if (full_path[i] == '\\')
+					full_path[i] = '/';
+			}
+		}
+	}
+	return full_path;
+}
+
+/*
+ * Don't allow the separator character in a path component.
+ * The VFS will not allow "/", but "\" is allowed by posix.
+ */
+static int
+check_name(struct dentry *direntry)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+	int i;
+
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
+		for (i = 0; i < direntry->d_name.len; i++) {
+			if (direntry->d_name.name[i] == '\\') {
+				cifs_dbg(FYI, "Invalid file name\n");
+				return -EINVAL;
+			}
+		}
+	}
+	return 0;
+}
+
+
+/* Inode operations in similar order to how they appear in Linux file fs.h */
+
+static int
+cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
+	       struct tcon_link *tlink, unsigned oflags, umode_t mode,
+	       __u32 *oplock, struct cifs_fid *fid)
+{
+	int rc = -ENOENT;
+	int create_options = CREATE_NOT_DIR;
+	int desired_access;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifs_tcon *tcon = tlink_tcon(tlink);
+	char *full_path = NULL;
+	FILE_ALL_INFO *buf = NULL;
+	struct inode *newinode = NULL;
+	int disposition;
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifs_open_parms oparms;
+
+	*oplock = 0;
+	if (tcon->ses->server->oplocks)
+		*oplock = REQ_OPLOCK;
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = cifs_posix_open(full_path, &newinode, inode->i_sb, mode,
+				     oflags, oplock, &fid->netfid, xid);
+		switch (rc) {
+		case 0:
+			if (newinode == NULL) {
+				/* query inode info */
+				goto cifs_create_get_file_info;
+			}
+
+			if (!S_ISREG(newinode->i_mode)) {
+				/*
+				 * The server may allow us to open things like
+				 * FIFOs, but the client isn't set up to deal
+				 * with that. If it's not a regular file, just
+				 * close it and proceed as if it were a normal
+				 * lookup.
+				 */
+				CIFSSMBClose(xid, tcon, fid->netfid);
+				goto cifs_create_get_file_info;
+			}
+			/* success, no need to query */
+			goto cifs_create_set_dentry;
+
+		case -ENOENT:
+			goto cifs_create_get_file_info;
+
+		case -EIO:
+		case -EINVAL:
+			/*
+			 * EIO could indicate that (posix open) operation is not
+			 * supported, despite what server claimed in capability
+			 * negotiation.
+			 *
+			 * POSIX open in samba versions 3.3.1 and earlier could
+			 * incorrectly fail with invalid parameter.
+			 */
+			tcon->broken_posix_open = true;
+			break;
+
+		case -EREMOTE:
+		case -EOPNOTSUPP:
+			/*
+			 * EREMOTE indicates DFS junction, which is not handled
+			 * in posix open.  If either that or op not supported
+			 * returned, follow the normal lookup.
+			 */
+			break;
+
+		default:
+			goto out;
+		}
+		/*
+		 * fallthrough to retry, using older open call, this is case
+		 * where server does not support this SMB level, and falsely
+		 * claims capability (also get here for DFS case which should be
+		 * rare for path not covered on files)
+		 */
+	}
+
+	desired_access = 0;
+	if (OPEN_FMODE(oflags) & FMODE_READ)
+		desired_access |= GENERIC_READ; /* is this too little? */
+	if (OPEN_FMODE(oflags) & FMODE_WRITE)
+		desired_access |= GENERIC_WRITE;
+
+	disposition = FILE_OVERWRITE_IF;
+	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		disposition = FILE_CREATE;
+	else if ((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
+		disposition = FILE_OVERWRITE_IF;
+	else if ((oflags & O_CREAT) == O_CREAT)
+		disposition = FILE_OPEN_IF;
+	else
+		cifs_dbg(FYI, "Create flag not set in create function\n");
+
+	/*
+	 * BB add processing to set equivalent of mode - e.g. via CreateX with
+	 * ACLs
+	 */
+
+	if (!server->ops->open) {
+		rc = -ENOSYS;
+		goto out;
+	}
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (buf == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * if we're not using unix extensions, see if we need to set
+	 * ATTR_READONLY on the create call
+	 */
+	if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
+		create_options |= CREATE_OPTION_READONLY;
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = server->ops->open(xid, &oparms, oplock, buf);
+	if (rc) {
+		cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
+		goto out;
+	}
+
+	/*
+	 * If Open reported that we actually created a file then we now have to
+	 * set the mode if possible.
+	 */
+	if ((tcon->unix_ext) && (*oplock & CIFS_CREATE_ACTION)) {
+		struct cifs_unix_set_info_args args = {
+				.mode	= mode,
+				.ctime	= NO_CHANGE_64,
+				.atime	= NO_CHANGE_64,
+				.mtime	= NO_CHANGE_64,
+				.device	= 0,
+		};
+
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+			args.uid = current_fsuid();
+			if (inode->i_mode & S_ISGID)
+				args.gid = inode->i_gid;
+			else
+				args.gid = current_fsgid();
+		} else {
+			args.uid = INVALID_UID; /* no change */
+			args.gid = INVALID_GID; /* no change */
+		}
+		CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid,
+				       current->tgid);
+	} else {
+		/*
+		 * BB implement mode setting via Windows security
+		 * descriptors e.g.
+		 */
+		/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
+
+		/* Could set r/o dos attribute if mode & 0222 == 0 */
+	}
+
+cifs_create_get_file_info:
+	/* server might mask mode so we have to query for it */
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb,
+					      xid);
+	else {
+		rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
+					 xid, fid);
+		if (newinode) {
+			if (server->ops->set_lease_key)
+				server->ops->set_lease_key(newinode, fid);
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+				newinode->i_mode = mode;
+			if ((*oplock & CIFS_CREATE_ACTION) &&
+			    (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
+				newinode->i_uid = current_fsuid();
+				if (inode->i_mode & S_ISGID)
+					newinode->i_gid = inode->i_gid;
+				else
+					newinode->i_gid = current_fsgid();
+			}
+		}
+	}
+
+cifs_create_set_dentry:
+	if (rc != 0) {
+		cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n",
+			 rc);
+		if (server->ops->close)
+			server->ops->close(xid, tcon, fid);
+		goto out;
+	}
+	d_drop(direntry);
+	d_add(direntry, newinode);
+
+out:
+	kfree(buf);
+	kfree(full_path);
+	return rc;
+}
+
+int
+cifs_atomic_open(struct inode *inode, struct dentry *direntry,
+		 struct file *file, unsigned oflags, umode_t mode,
+		 int *opened)
+{
+	int rc;
+	unsigned int xid;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct cifs_fid fid;
+	struct cifs_pending_open open;
+	__u32 oplock;
+	struct cifsFileInfo *file_info;
+
+	/*
+	 * Posix open is only called (at lookup time) for file create now. For
+	 * opens (rather than creates), because we do not know if it is a file
+	 * or directory yet, and current Samba no longer allows us to do posix
+	 * open on dirs, we could end up wasting an open call on what turns out
+	 * to be a dir. For file opens, we wait to call posix open till
+	 * cifs_open.  It could be added to atomic_open in the future but the
+	 * performance tradeoff of the extra network request when EISDIR or
+	 * EACCES is returned would have to be weighed against the 50% reduction
+	 * in network traffic in the other paths.
+	 */
+	if (!(oflags & O_CREAT)) {
+		struct dentry *res;
+
+		/*
+		 * Check for hashed negative dentry. We have already revalidated
+		 * the dentry and it is fine. No need to perform another lookup.
+		 */
+		if (!d_unhashed(direntry))
+			return -ENOENT;
+
+		res = cifs_lookup(inode, direntry, 0);
+		if (IS_ERR(res))
+			return PTR_ERR(res);
+
+		return finish_no_open(file, res);
+	}
+
+	rc = check_name(direntry);
+	if (rc)
+		return rc;
+
+	xid = get_xid();
+
+	cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+		 inode, direntry, direntry);
+
+	tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto out_free_xid;
+	}
+
+	tcon = tlink_tcon(tlink);
+	server = tcon->ses->server;
+
+	if (server->ops->new_lease_key)
+		server->ops->new_lease_key(&fid);
+
+	cifs_add_pending_open(&fid, tlink, &open);
+
+	rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
+			    &oplock, &fid);
+
+	if (rc) {
+		cifs_del_pending_open(&open);
+		goto out;
+	}
+
+	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		*opened |= FILE_CREATED;
+
+	rc = finish_open(file, direntry, generic_file_open, opened);
+	if (rc) {
+		if (server->ops->close)
+			server->ops->close(xid, tcon, &fid);
+		cifs_del_pending_open(&open);
+		goto out;
+	}
+
+	if (file->f_flags & O_DIRECT &&
+	    CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+		if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			file->f_op = &cifs_file_direct_nobrl_ops;
+		else
+			file->f_op = &cifs_file_direct_ops;
+		}
+
+	file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
+	if (file_info == NULL) {
+		if (server->ops->close)
+			server->ops->close(xid, tcon, &fid);
+		cifs_del_pending_open(&open);
+		fput(file);
+		rc = -ENOMEM;
+	}
+
+out:
+	cifs_put_tlink(tlink);
+out_free_xid:
+	free_xid(xid);
+	return rc;
+}
+
+int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
+		bool excl)
+{
+	int rc;
+	unsigned int xid = get_xid();
+	/*
+	 * BB below access is probably too much for mknod to request
+	 *    but we have to do query and setpathinfo so requesting
+	 *    less could fail (unless we want to request getatr and setatr
+	 *    permissions (only).  At least for POSIX we do not have to
+	 *    request so much.
+	 */
+	unsigned oflags = O_EXCL | O_CREAT | O_RDWR;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct cifs_fid fid;
+	__u32 oplock;
+
+	cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+		 inode, direntry, direntry);
+
+	tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
+	rc = PTR_ERR(tlink);
+	if (IS_ERR(tlink))
+		goto out_free_xid;
+
+	tcon = tlink_tcon(tlink);
+	server = tcon->ses->server;
+
+	if (server->ops->new_lease_key)
+		server->ops->new_lease_key(&fid);
+
+	rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
+			    &oplock, &fid);
+	if (!rc && server->ops->close)
+		server->ops->close(xid, tcon, &fid);
+
+	cifs_put_tlink(tlink);
+out_free_xid:
+	free_xid(xid);
+	return rc;
+}
+
+int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
+		dev_t device_number)
+{
+	int rc = -EPERM;
+	unsigned int xid;
+	int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct cifs_io_parms io_parms;
+	char *full_path = NULL;
+	struct inode *newinode = NULL;
+	__u32 oplock = 0;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	FILE_ALL_INFO *buf = NULL;
+	unsigned int bytes_written;
+	struct win_dev *pdev;
+	struct kvec iov[2];
+
+	if (!old_valid_dev(device_number))
+		return -EINVAL;
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+
+	tcon = tlink_tcon(tlink);
+
+	xid = get_xid();
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto mknod_out;
+	}
+
+	if (tcon->unix_ext) {
+		struct cifs_unix_set_info_args args = {
+			.mode	= mode & ~current_umask(),
+			.ctime	= NO_CHANGE_64,
+			.atime	= NO_CHANGE_64,
+			.mtime	= NO_CHANGE_64,
+			.device	= device_number,
+		};
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+			args.uid = current_fsuid();
+			args.gid = current_fsgid();
+		} else {
+			args.uid = INVALID_UID; /* no change */
+			args.gid = INVALID_GID; /* no change */
+		}
+		rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+					    cifs_sb->local_nls,
+					    cifs_remap(cifs_sb));
+		if (rc)
+			goto mknod_out;
+
+		rc = cifs_get_inode_info_unix(&newinode, full_path,
+						inode->i_sb, xid);
+
+		if (rc == 0)
+			d_instantiate(direntry, newinode);
+		goto mknod_out;
+	}
+
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+		goto mknod_out;
+
+
+	cifs_dbg(FYI, "sfu compat create special file\n");
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (buf == NULL) {
+		kfree(full_path);
+		rc = -ENOMEM;
+		free_xid(xid);
+		return rc;
+	}
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_WRITE;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_CREATE;
+	oparms.path = full_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	if (tcon->ses->server->oplocks)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+	rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+	if (rc)
+		goto mknod_out;
+
+	/*
+	 * BB Do not bother to decode buf since no local inode yet to put
+	 * timestamps in, but we can reuse it safely.
+	 */
+
+	pdev = (struct win_dev *)buf;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = sizeof(struct win_dev);
+	iov[1].iov_base = buf;
+	iov[1].iov_len = sizeof(struct win_dev);
+	if (S_ISCHR(mode)) {
+		memcpy(pdev->type, "IntxCHR", 8);
+		pdev->major = cpu_to_le64(MAJOR(device_number));
+		pdev->minor = cpu_to_le64(MINOR(device_number));
+		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+							&bytes_written, iov, 1);
+	} else if (S_ISBLK(mode)) {
+		memcpy(pdev->type, "IntxBLK", 8);
+		pdev->major = cpu_to_le64(MAJOR(device_number));
+		pdev->minor = cpu_to_le64(MINOR(device_number));
+		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+							&bytes_written, iov, 1);
+	} /* else if (S_ISFIFO) */
+	tcon->ses->server->ops->close(xid, tcon, &fid);
+	d_drop(direntry);
+
+	/* FIXME: add code here to set EAs */
+
+mknod_out:
+	kfree(full_path);
+	kfree(buf);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+struct dentry *
+cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
+	    unsigned int flags)
+{
+	unsigned int xid;
+	int rc = 0; /* to get around spurious gcc warning, set to zero here */
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *pTcon;
+	struct inode *newInode = NULL;
+	char *full_path = NULL;
+
+	xid = get_xid();
+
+	cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+		 parent_dir_inode, direntry, direntry);
+
+	/* check whether path exists */
+
+	cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		free_xid(xid);
+		return (struct dentry *)tlink;
+	}
+	pTcon = tlink_tcon(tlink);
+
+	rc = check_name(direntry);
+	if (rc)
+		goto lookup_out;
+
+	/* can not grab the rename sem here since it would
+	deadlock in the cases (beginning of sys_rename itself)
+	in which we already have the sb rename sem */
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto lookup_out;
+	}
+
+	if (d_really_is_positive(direntry)) {
+		cifs_dbg(FYI, "non-NULL inode in lookup\n");
+	} else {
+		cifs_dbg(FYI, "NULL inode in lookup\n");
+	}
+	cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
+		 full_path, d_inode(direntry));
+
+	if (pTcon->unix_ext) {
+		rc = cifs_get_inode_info_unix(&newInode, full_path,
+					      parent_dir_inode->i_sb, xid);
+	} else {
+		rc = cifs_get_inode_info(&newInode, full_path, NULL,
+				parent_dir_inode->i_sb, xid, NULL);
+	}
+
+	if ((rc == 0) && (newInode != NULL)) {
+		d_add(direntry, newInode);
+		/* since paths are not looked up by component - the parent
+		   directories are presumed to be good here */
+		renew_parental_timestamps(direntry);
+
+	} else if (rc == -ENOENT) {
+		rc = 0;
+		direntry->d_time = jiffies;
+		d_add(direntry, NULL);
+	/*	if it was once a directory (but how can we tell?) we could do
+		shrink_dcache_parent(direntry); */
+	} else if (rc != -EACCES) {
+		cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
+		/* We special case check for Access Denied - since that
+		is a common return code */
+	}
+
+lookup_out:
+	kfree(full_path);
+	cifs_put_tlink(tlink);
+	free_xid(xid);
+	return ERR_PTR(rc);
+}
+
+static int
+cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
+{
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	if (d_really_is_positive(direntry)) {
+		if (cifs_revalidate_dentry(direntry))
+			return 0;
+		else {
+			/*
+			 * If the inode wasn't known to be a dfs entry when
+			 * the dentry was instantiated, such as when created
+			 * via ->readdir(), it needs to be set now since the
+			 * attributes will have been updated by
+			 * cifs_revalidate_dentry().
+			 */
+			if (IS_AUTOMOUNT(d_inode(direntry)) &&
+			   !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
+				spin_lock(&direntry->d_lock);
+				direntry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+				spin_unlock(&direntry->d_lock);
+			}
+
+			return 1;
+		}
+	}
+
+	/*
+	 * This may be nfsd (or something), anyway, we can't see the
+	 * intent of this. So, since this can be for creation, drop it.
+	 */
+	if (!flags)
+		return 0;
+
+	/*
+	 * Drop the negative dentry, in order to make sure to use the
+	 * case sensitive name which is specified by user if this is
+	 * for creation.
+	 */
+	if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+		return 0;
+
+	if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
+		return 0;
+
+	return 1;
+}
+
+/* static int cifs_d_delete(struct dentry *direntry)
+{
+	int rc = 0;
+
+	cifs_dbg(FYI, "In cifs d_delete, name = %pd\n", direntry);
+
+	return rc;
+}     */
+
+const struct dentry_operations cifs_dentry_ops = {
+	.d_revalidate = cifs_d_revalidate,
+	.d_automount = cifs_dfs_d_automount,
+/* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
+};
+
+static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
+{
+	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
+	unsigned long hash;
+	wchar_t c;
+	int i, charlen;
+
+	hash = init_name_hash();
+	for (i = 0; i < q->len; i += charlen) {
+		charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+		/* error out if we can't convert the character */
+		if (unlikely(charlen < 0))
+			return charlen;
+		hash = partial_name_hash(cifs_toupper(c), hash);
+	}
+	q->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
+	wchar_t c1, c2;
+	int i, l1, l2;
+
+	/*
+	 * We make the assumption here that uppercase characters in the local
+	 * codepage are always the same length as their lowercase counterparts.
+	 *
+	 * If that's ever not the case, then this will fail to match it.
+	 */
+	if (name->len != len)
+		return 1;
+
+	for (i = 0; i < len; i += l1) {
+		/* Convert characters in both strings to UTF-16. */
+		l1 = codepage->char2uni(&str[i], len - i, &c1);
+		l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+
+		/*
+		 * If we can't convert either character, just declare it to
+		 * be 1 byte long and compare the original byte.
+		 */
+		if (unlikely(l1 < 0 && l2 < 0)) {
+			if (str[i] != name->name[i])
+				return 1;
+			l1 = 1;
+			continue;
+		}
+
+		/*
+		 * Here, we again ass|u|me that upper/lowercase versions of
+		 * a character are the same length in the local NLS.
+		 */
+		if (l1 != l2)
+			return 1;
+
+		/* Now compare uppercase versions of these characters */
+		if (cifs_toupper(c1) != cifs_toupper(c2))
+			return 1;
+	}
+
+	return 0;
+}
+
+const struct dentry_operations cifs_ci_dentry_ops = {
+	.d_revalidate = cifs_d_revalidate,
+	.d_hash = cifs_ci_hash,
+	.d_compare = cifs_ci_compare,
+	.d_automount = cifs_dfs_d_automount,
+};
diff --git a/kernel/fs/cifs/dns_resolve.c b/kernel/fs/cifs/dns_resolve.c
new file mode 100644
index 000000000..7ede73065
--- /dev/null
+++ b/kernel/fs/cifs/dns_resolve.c
@@ -0,0 +1,99 @@
+/*
+ *  fs/cifs/dns_resolve.c
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
+ *
+ *   Contains the CIFS DFS upcall routines used for hostname to
+ *   IP address translation.
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/slab.h>
+#include <linux/dns_resolver.h>
+#include "dns_resolve.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+
+/**
+ * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
+ * @unc: UNC path specifying the server (with '/' as delimiter)
+ * @ip_addr: Where to return the IP address.
+ *
+ * The IP address will be returned in string form, and the caller is
+ * responsible for freeing it.
+ *
+ * Returns length of result on success, -ve on error.
+ */
+int
+dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
+{
+	struct sockaddr_storage ss;
+	const char *hostname, *sep;
+	char *name;
+	int len, rc;
+
+	if (!ip_addr || !unc)
+		return -EINVAL;
+
+	len = strlen(unc);
+	if (len < 3) {
+		cifs_dbg(FYI, "%s: unc is too short: %s\n", __func__, unc);
+		return -EINVAL;
+	}
+
+	/* Discount leading slashes for cifs */
+	len -= 2;
+	hostname = unc + 2;
+
+	/* Search for server name delimiter */
+	sep = memchr(hostname, '/', len);
+	if (sep)
+		len = sep - hostname;
+	else
+		cifs_dbg(FYI, "%s: probably server name is whole unc: %s\n",
+			 __func__, unc);
+
+	/* Try to interpret hostname as an IPv4 or IPv6 address */
+	rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
+	if (rc > 0)
+		goto name_is_IP_address;
+
+	/* Perform the upcall */
+	rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
+	if (rc < 0)
+		cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
+			 __func__, len, len, hostname);
+	else
+		cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n",
+			 __func__, len, len, hostname, *ip_addr);
+	return rc;
+
+name_is_IP_address:
+	name = kmalloc(len + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+	memcpy(name, hostname, len);
+	name[len] = 0;
+	cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n",
+		 __func__, name);
+	*ip_addr = name;
+	return 0;
+}
diff --git a/kernel/fs/cifs/dns_resolve.h b/kernel/fs/cifs/dns_resolve.h
new file mode 100644
index 000000000..d3f5d27f4
--- /dev/null
+++ b/kernel/fs/cifs/dns_resolve.h
@@ -0,0 +1,30 @@
+/*
+ *   fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS
+ *                            Handles host name to IP address resolution
+ *
+ *   Copyright (c) International Business Machines  Corp., 2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _DNS_RESOLVE_H
+#define _DNS_RESOLVE_H
+
+#ifdef __KERNEL__
+extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
+#endif /* KERNEL */
+
+#endif /* _DNS_RESOLVE_H */
diff --git a/kernel/fs/cifs/export.c b/kernel/fs/cifs/export.c
new file mode 100644
index 000000000..ce8b7f677
--- /dev/null
+++ b/kernel/fs/cifs/export.c
@@ -0,0 +1,67 @@
+/*
+ *   fs/cifs/export.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2007
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   Common Internet FileSystem (CIFS) client
+ *
+ *   Operations related to support for exporting files via NFSD
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+ /*
+  * See Documentation/filesystems/nfs/Exporting
+  * and examples in fs/exportfs
+  *
+  * Since cifs is a network file system, an "fsid" must be included for
+  * any nfs exports file entries which refer to cifs paths.  In addition
+  * the cifs mount must be mounted with the "serverino" option (ie use stable
+  * server inode numbers instead of locally generated temporary ones).
+  * Although cifs inodes do not use generation numbers (have generation number
+  * of zero) - the inode number alone should be good enough for simple cases
+  * in which users want to export cifs shares with NFS. The decode and encode
+  * could be improved by using a new routine which expects 64 bit inode numbers
+  * instead of the default 32 bit routines in fs/exportfs
+  *
+  */
+
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifsfs.h"
+
+#ifdef CONFIG_CIFS_NFSD_EXPORT
+static struct dentry *cifs_get_parent(struct dentry *dentry)
+{
+	/* BB need to add code here eventually to enable export via NFSD */
+	cifs_dbg(FYI, "get parent for %p\n", dentry);
+	return ERR_PTR(-EACCES);
+}
+
+const struct export_operations cifs_export_ops = {
+	.get_parent = cifs_get_parent,
+/*	Following five export operations are unneeded so far and can default:
+	.get_dentry =
+	.get_name =
+	.find_exported_dentry =
+	.decode_fh =
+	.encode_fs =  */
+};
+
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
+
diff --git a/kernel/fs/cifs/file.c b/kernel/fs/cifs/file.c
new file mode 100644
index 000000000..3f50cee79
--- /dev/null
+++ b/kernel/fs/cifs/file.c
@@ -0,0 +1,3896 @@
+/*
+ *   fs/cifs/file.c
+ *
+ *   vfs operations that deal with files
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Jeremy Allison (jra@samba.org)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/delay.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <asm/div64.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "fscache.h"
+
+
+static inline int cifs_convert_flags(unsigned int flags)
+{
+	if ((flags & O_ACCMODE) == O_RDONLY)
+		return GENERIC_READ;
+	else if ((flags & O_ACCMODE) == O_WRONLY)
+		return GENERIC_WRITE;
+	else if ((flags & O_ACCMODE) == O_RDWR) {
+		/* GENERIC_ALL is too much permission to request
+		   can cause unnecessary access denied on create */
+		/* return GENERIC_ALL; */
+		return (GENERIC_READ | GENERIC_WRITE);
+	}
+
+	return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES |
+		FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA |
+		FILE_READ_DATA);
+}
+
+static u32 cifs_posix_convert_flags(unsigned int flags)
+{
+	u32 posix_flags = 0;
+
+	if ((flags & O_ACCMODE) == O_RDONLY)
+		posix_flags = SMB_O_RDONLY;
+	else if ((flags & O_ACCMODE) == O_WRONLY)
+		posix_flags = SMB_O_WRONLY;
+	else if ((flags & O_ACCMODE) == O_RDWR)
+		posix_flags = SMB_O_RDWR;
+
+	if (flags & O_CREAT) {
+		posix_flags |= SMB_O_CREAT;
+		if (flags & O_EXCL)
+			posix_flags |= SMB_O_EXCL;
+	} else if (flags & O_EXCL)
+		cifs_dbg(FYI, "Application %s pid %d has incorrectly set O_EXCL flag but not O_CREAT on file open. Ignoring O_EXCL\n",
+			 current->comm, current->tgid);
+
+	if (flags & O_TRUNC)
+		posix_flags |= SMB_O_TRUNC;
+	/* be safe and imply O_SYNC for O_DSYNC */
+	if (flags & O_DSYNC)
+		posix_flags |= SMB_O_SYNC;
+	if (flags & O_DIRECTORY)
+		posix_flags |= SMB_O_DIRECTORY;
+	if (flags & O_NOFOLLOW)
+		posix_flags |= SMB_O_NOFOLLOW;
+	if (flags & O_DIRECT)
+		posix_flags |= SMB_O_DIRECT;
+
+	return posix_flags;
+}
+
+static inline int cifs_get_disposition(unsigned int flags)
+{
+	if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		return FILE_CREATE;
+	else if ((flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
+		return FILE_OVERWRITE_IF;
+	else if ((flags & O_CREAT) == O_CREAT)
+		return FILE_OPEN_IF;
+	else if ((flags & O_TRUNC) == O_TRUNC)
+		return FILE_OVERWRITE;
+	else
+		return FILE_OPEN;
+}
+
+int cifs_posix_open(char *full_path, struct inode **pinode,
+			struct super_block *sb, int mode, unsigned int f_flags,
+			__u32 *poplock, __u16 *pnetfid, unsigned int xid)
+{
+	int rc;
+	FILE_UNIX_BASIC_INFO *presp_data;
+	__u32 posix_flags = 0;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_fattr fattr;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+
+	cifs_dbg(FYI, "posix open %s\n", full_path);
+
+	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+	if (presp_data == NULL)
+		return -ENOMEM;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto posix_open_ret;
+	}
+
+	tcon = tlink_tcon(tlink);
+	mode &= ~current_umask();
+
+	posix_flags = cifs_posix_convert_flags(f_flags);
+	rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
+			     poplock, full_path, cifs_sb->local_nls,
+			     cifs_remap(cifs_sb));
+	cifs_put_tlink(tlink);
+
+	if (rc)
+		goto posix_open_ret;
+
+	if (presp_data->Type == cpu_to_le32(-1))
+		goto posix_open_ret; /* open ok, caller does qpathinfo */
+
+	if (!pinode)
+		goto posix_open_ret; /* caller does not need info */
+
+	cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
+
+	/* get new inode and set it up */
+	if (*pinode == NULL) {
+		cifs_fill_uniqueid(sb, &fattr);
+		*pinode = cifs_iget(sb, &fattr);
+		if (!*pinode) {
+			rc = -ENOMEM;
+			goto posix_open_ret;
+		}
+	} else {
+		cifs_fattr_to_inode(*pinode, &fattr);
+	}
+
+posix_open_ret:
+	kfree(presp_data);
+	return rc;
+}
+
+static int
+cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+	     struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock,
+	     struct cifs_fid *fid, unsigned int xid)
+{
+	int rc;
+	int desired_access;
+	int disposition;
+	int create_options = CREATE_NOT_DIR;
+	FILE_ALL_INFO *buf;
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifs_open_parms oparms;
+
+	if (!server->ops->open)
+		return -ENOSYS;
+
+	desired_access = cifs_convert_flags(f_flags);
+
+/*********************************************************************
+ *  open flag mapping table:
+ *
+ *	POSIX Flag            CIFS Disposition
+ *	----------            ----------------
+ *	O_CREAT               FILE_OPEN_IF
+ *	O_CREAT | O_EXCL      FILE_CREATE
+ *	O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
+ *	O_TRUNC               FILE_OVERWRITE
+ *	none of the above     FILE_OPEN
+ *
+ *	Note that there is not a direct match between disposition
+ *	FILE_SUPERSEDE (ie create whether or not file exists although
+ *	O_CREAT | O_TRUNC is similar but truncates the existing
+ *	file rather than creating a new file as FILE_SUPERSEDE does
+ *	(which uses the attributes / metadata passed in on open call)
+ *?
+ *?  O_SYNC is a reasonable match to CIFS writethrough flag
+ *?  and the read write flags match reasonably.  O_LARGEFILE
+ *?  is irrelevant because largefile support is always used
+ *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
+ *	 O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
+ *********************************************************************/
+
+	disposition = cifs_get_disposition(f_flags);
+
+	/* BB pass O_SYNC flag through on file attributes .. BB */
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = server->ops->open(xid, &oparms, oplock, buf);
+
+	if (rc)
+		goto out;
+
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+					      xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
+					 xid, fid);
+
+out:
+	kfree(buf);
+	return rc;
+}
+
+static bool
+cifs_has_mand_locks(struct cifsInodeInfo *cinode)
+{
+	struct cifs_fid_locks *cur;
+	bool has_locks = false;
+
+	down_read(&cinode->lock_sem);
+	list_for_each_entry(cur, &cinode->llist, llist) {
+		if (!list_empty(&cur->locks)) {
+			has_locks = true;
+			break;
+		}
+	}
+	up_read(&cinode->lock_sem);
+	return has_locks;
+}
+
+struct cifsFileInfo *
+cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
+		  struct tcon_link *tlink, __u32 oplock)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = d_inode(dentry);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	struct cifsFileInfo *cfile;
+	struct cifs_fid_locks *fdlocks;
+	struct cifs_tcon *tcon = tlink_tcon(tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+
+	cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+	if (cfile == NULL)
+		return cfile;
+
+	fdlocks = kzalloc(sizeof(struct cifs_fid_locks), GFP_KERNEL);
+	if (!fdlocks) {
+		kfree(cfile);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&fdlocks->locks);
+	fdlocks->cfile = cfile;
+	cfile->llist = fdlocks;
+	down_write(&cinode->lock_sem);
+	list_add(&fdlocks->llist, &cinode->llist);
+	up_write(&cinode->lock_sem);
+
+	cfile->count = 1;
+	cfile->pid = current->tgid;
+	cfile->uid = current_fsuid();
+	cfile->dentry = dget(dentry);
+	cfile->f_flags = file->f_flags;
+	cfile->invalidHandle = false;
+	cfile->tlink = cifs_get_tlink(tlink);
+	INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
+	mutex_init(&cfile->fh_mutex);
+
+	cifs_sb_active(inode->i_sb);
+
+	/*
+	 * If the server returned a read oplock and we have mandatory brlocks,
+	 * set oplock level to None.
+	 */
+	if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) {
+		cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n");
+		oplock = 0;
+	}
+
+	spin_lock(&cifs_file_list_lock);
+	if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
+		oplock = fid->pending_open->oplock;
+	list_del(&fid->pending_open->olist);
+
+	fid->purge_cache = false;
+	server->ops->set_fid(cfile, fid, oplock);
+
+	list_add(&cfile->tlist, &tcon->openFileList);
+	/* if readable file instance put first in list*/
+	if (file->f_mode & FMODE_READ)
+		list_add(&cfile->flist, &cinode->openFileList);
+	else
+		list_add_tail(&cfile->flist, &cinode->openFileList);
+	spin_unlock(&cifs_file_list_lock);
+
+	if (fid->purge_cache)
+		cifs_zap_mapping(inode);
+
+	file->private_data = cfile;
+	return cfile;
+}
+
+struct cifsFileInfo *
+cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+{
+	spin_lock(&cifs_file_list_lock);
+	cifsFileInfo_get_locked(cifs_file);
+	spin_unlock(&cifs_file_list_lock);
+	return cifs_file;
+}
+
+/*
+ * Release a reference on the file private data. This may involve closing
+ * the filehandle out on the server. Must be called without holding
+ * cifs_file_list_lock.
+ */
+void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+	struct inode *inode = d_inode(cifs_file->dentry);
+	struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifsLockInfo *li, *tmp;
+	struct cifs_fid fid;
+	struct cifs_pending_open open;
+	bool oplock_break_cancelled;
+
+	spin_lock(&cifs_file_list_lock);
+	if (--cifs_file->count > 0) {
+		spin_unlock(&cifs_file_list_lock);
+		return;
+	}
+
+	if (server->ops->get_lease_key)
+		server->ops->get_lease_key(inode, &fid);
+
+	/* store open in pending opens to make sure we don't miss lease break */
+	cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open);
+
+	/* remove it from the lists */
+	list_del(&cifs_file->flist);
+	list_del(&cifs_file->tlist);
+
+	if (list_empty(&cifsi->openFileList)) {
+		cifs_dbg(FYI, "closing last open instance for inode %p\n",
+			 d_inode(cifs_file->dentry));
+		/*
+		 * In strict cache mode we need invalidate mapping on the last
+		 * close  because it may cause a error when we open this file
+		 * again and get at least level II oplock.
+		 */
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+			set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags);
+		cifs_set_oplock_level(cifsi, 0);
+	}
+	spin_unlock(&cifs_file_list_lock);
+
+	oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
+
+	if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
+		struct TCP_Server_Info *server = tcon->ses->server;
+		unsigned int xid;
+
+		xid = get_xid();
+		if (server->ops->close)
+			server->ops->close(xid, tcon, &cifs_file->fid);
+		_free_xid(xid);
+	}
+
+	if (oplock_break_cancelled)
+		cifs_done_oplock_break(cifsi);
+
+	cifs_del_pending_open(&open);
+
+	/*
+	 * Delete any outstanding lock records. We'll lose them when the file
+	 * is closed anyway.
+	 */
+	down_write(&cifsi->lock_sem);
+	list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
+		list_del(&li->llist);
+		cifs_del_lock_waiters(li);
+		kfree(li);
+	}
+	list_del(&cifs_file->llist->llist);
+	kfree(cifs_file->llist);
+	up_write(&cifsi->lock_sem);
+
+	cifs_put_tlink(cifs_file->tlink);
+	dput(cifs_file->dentry);
+	cifs_sb_deactive(sb);
+	kfree(cifs_file);
+}
+
+int cifs_open(struct inode *inode, struct file *file)
+
+{
+	int rc = -EACCES;
+	unsigned int xid;
+	__u32 oplock;
+	struct cifs_sb_info *cifs_sb;
+	struct TCP_Server_Info *server;
+	struct cifs_tcon *tcon;
+	struct tcon_link *tlink;
+	struct cifsFileInfo *cfile = NULL;
+	char *full_path = NULL;
+	bool posix_open_ok = false;
+	struct cifs_fid fid;
+	struct cifs_pending_open open;
+
+	xid = get_xid();
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		free_xid(xid);
+		return PTR_ERR(tlink);
+	}
+	tcon = tlink_tcon(tlink);
+	server = tcon->ses->server;
+
+	full_path = build_path_from_dentry(file->f_path.dentry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
+		 inode, file->f_flags, full_path);
+
+	if (file->f_flags & O_DIRECT &&
+	    cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			file->f_op = &cifs_file_direct_nobrl_ops;
+		else
+			file->f_op = &cifs_file_direct_ops;
+	}
+
+	if (server->oplocks)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+
+	if (!tcon->broken_posix_open && tcon->unix_ext &&
+	    cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+				le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		/* can not refresh inode info since size could be stale */
+		rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				file->f_flags, &oplock, &fid.netfid, xid);
+		if (rc == 0) {
+			cifs_dbg(FYI, "posix open succeeded\n");
+			posix_open_ok = true;
+		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+			if (tcon->ses->serverNOS)
+				cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n",
+					 tcon->ses->serverName,
+					 tcon->ses->serverNOS);
+			tcon->broken_posix_open = true;
+		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
+			 (rc != -EOPNOTSUPP)) /* path not found or net err */
+			goto out;
+		/*
+		 * Else fallthrough to retry open the old way on network i/o
+		 * or DFS errors.
+		 */
+	}
+
+	if (server->ops->get_lease_key)
+		server->ops->get_lease_key(inode, &fid);
+
+	cifs_add_pending_open(&fid, tlink, &open);
+
+	if (!posix_open_ok) {
+		if (server->ops->get_lease_key)
+			server->ops->get_lease_key(inode, &fid);
+
+		rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
+				  file->f_flags, &oplock, &fid, xid);
+		if (rc) {
+			cifs_del_pending_open(&open);
+			goto out;
+		}
+	}
+
+	cfile = cifs_new_fileinfo(&fid, file, tlink, oplock);
+	if (cfile == NULL) {
+		if (server->ops->close)
+			server->ops->close(xid, tcon, &fid);
+		cifs_del_pending_open(&open);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	cifs_fscache_set_inode_cookie(inode, file);
+
+	if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
+		/*
+		 * Time to set mode which we can not set earlier due to
+		 * problems creating new read-only files.
+		 */
+		struct cifs_unix_set_info_args args = {
+			.mode	= inode->i_mode,
+			.uid	= INVALID_UID, /* no change */
+			.gid	= INVALID_GID, /* no change */
+			.ctime	= NO_CHANGE_64,
+			.atime	= NO_CHANGE_64,
+			.mtime	= NO_CHANGE_64,
+			.device	= 0,
+		};
+		CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid.netfid,
+				       cfile->pid);
+	}
+
+out:
+	kfree(full_path);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
+
+/*
+ * Try to reacquire byte range locks that were released when session
+ * to server was lost.
+ */
+static int
+cifs_relock_file(struct cifsFileInfo *cfile)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	int rc = 0;
+
+	down_read(&cinode->lock_sem);
+	if (cinode->can_cache_brlcks) {
+		/* can cache locks - no need to relock */
+		up_read(&cinode->lock_sem);
+		return rc;
+	}
+
+	if (cap_unix(tcon->ses) &&
+	    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+	    ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+		rc = cifs_push_posix_locks(cfile);
+	else
+		rc = tcon->ses->server->ops->push_mand_locks(cfile);
+
+	up_read(&cinode->lock_sem);
+	return rc;
+}
+
+static int
+cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
+{
+	int rc = -EACCES;
+	unsigned int xid;
+	__u32 oplock;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct cifsInodeInfo *cinode;
+	struct inode *inode;
+	char *full_path = NULL;
+	int desired_access;
+	int disposition = FILE_OPEN;
+	int create_options = CREATE_NOT_DIR;
+	struct cifs_open_parms oparms;
+
+	xid = get_xid();
+	mutex_lock(&cfile->fh_mutex);
+	if (!cfile->invalidHandle) {
+		mutex_unlock(&cfile->fh_mutex);
+		rc = 0;
+		free_xid(xid);
+		return rc;
+	}
+
+	inode = d_inode(cfile->dentry);
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tcon = tlink_tcon(cfile->tlink);
+	server = tcon->ses->server;
+
+	/*
+	 * Can not grab rename sem here because various ops, including those
+	 * that already have the rename sem can end up causing writepage to get
+	 * called and if the server was down that means we end up here, and we
+	 * can never tell if the caller already has the rename_sem.
+	 */
+	full_path = build_path_from_dentry(cfile->dentry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		mutex_unlock(&cfile->fh_mutex);
+		free_xid(xid);
+		return rc;
+	}
+
+	cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n",
+		 inode, cfile->f_flags, full_path);
+
+	if (tcon->ses->server->oplocks)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+
+	if (tcon->unix_ext && cap_unix(tcon->ses) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+				le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		/*
+		 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
+		 * original open. Must mask them off for a reopen.
+		 */
+		unsigned int oflags = cfile->f_flags &
+						~(O_CREAT | O_EXCL | O_TRUNC);
+
+		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+				     cifs_sb->mnt_file_mode /* ignored */,
+				     oflags, &oplock, &cfile->fid.netfid, xid);
+		if (rc == 0) {
+			cifs_dbg(FYI, "posix reopen succeeded\n");
+			oparms.reconnect = true;
+			goto reopen_success;
+		}
+		/*
+		 * fallthrough to retry open the old way on errors, especially
+		 * in the reconnect path it is important to retry hard
+		 */
+	}
+
+	desired_access = cifs_convert_flags(cfile->f_flags);
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	if (server->ops->get_lease_key)
+		server->ops->get_lease_key(inode, &cfile->fid);
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = &cfile->fid;
+	oparms.reconnect = true;
+
+	/*
+	 * Can not refresh inode by passing in file_info buf to be returned by
+	 * ops->open and then calling get_inode_info with returned buf since
+	 * file might have write behind data that needs to be flushed and server
+	 * version of file size can be stale. If we knew for sure that inode was
+	 * not dirty locally we could do this.
+	 */
+	rc = server->ops->open(xid, &oparms, &oplock, NULL);
+	if (rc == -ENOENT && oparms.reconnect == false) {
+		/* durable handle timeout is expired - open the file again */
+		rc = server->ops->open(xid, &oparms, &oplock, NULL);
+		/* indicate that we need to relock the file */
+		oparms.reconnect = true;
+	}
+
+	if (rc) {
+		mutex_unlock(&cfile->fh_mutex);
+		cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
+		cifs_dbg(FYI, "oplock: %d\n", oplock);
+		goto reopen_error_exit;
+	}
+
+reopen_success:
+	cfile->invalidHandle = false;
+	mutex_unlock(&cfile->fh_mutex);
+	cinode = CIFS_I(inode);
+
+	if (can_flush) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		mapping_set_error(inode->i_mapping, rc);
+
+		if (tcon->unix_ext)
+			rc = cifs_get_inode_info_unix(&inode, full_path,
+						      inode->i_sb, xid);
+		else
+			rc = cifs_get_inode_info(&inode, full_path, NULL,
+						 inode->i_sb, xid, NULL);
+	}
+	/*
+	 * Else we are writing out data to server already and could deadlock if
+	 * we tried to flush data, and since we do not know if we have data that
+	 * would invalidate the current end of file on the server we can not go
+	 * to the server to get the new inode info.
+	 */
+
+	server->ops->set_fid(cfile, &cfile->fid, oplock);
+	if (oparms.reconnect)
+		cifs_relock_file(cfile);
+
+reopen_error_exit:
+	kfree(full_path);
+	free_xid(xid);
+	return rc;
+}
+
+int cifs_close(struct inode *inode, struct file *file)
+{
+	if (file->private_data != NULL) {
+		cifsFileInfo_put(file->private_data);
+		file->private_data = NULL;
+	}
+
+	/* return code from the ->release op is always ignored */
+	return 0;
+}
+
+int cifs_closedir(struct inode *inode, struct file *file)
+{
+	int rc = 0;
+	unsigned int xid;
+	struct cifsFileInfo *cfile = file->private_data;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	char *buf;
+
+	cifs_dbg(FYI, "Closedir inode = 0x%p\n", inode);
+
+	if (cfile == NULL)
+		return rc;
+
+	xid = get_xid();
+	tcon = tlink_tcon(cfile->tlink);
+	server = tcon->ses->server;
+
+	cifs_dbg(FYI, "Freeing private data in close dir\n");
+	spin_lock(&cifs_file_list_lock);
+	if (server->ops->dir_needs_close(cfile)) {
+		cfile->invalidHandle = true;
+		spin_unlock(&cifs_file_list_lock);
+		if (server->ops->close_dir)
+			rc = server->ops->close_dir(xid, tcon, &cfile->fid);
+		else
+			rc = -ENOSYS;
+		cifs_dbg(FYI, "Closing uncompleted readdir with rc %d\n", rc);
+		/* not much we can do if it fails anyway, ignore rc */
+		rc = 0;
+	} else
+		spin_unlock(&cifs_file_list_lock);
+
+	buf = cfile->srch_inf.ntwrk_buf_start;
+	if (buf) {
+		cifs_dbg(FYI, "closedir free smb buf in srch struct\n");
+		cfile->srch_inf.ntwrk_buf_start = NULL;
+		if (cfile->srch_inf.smallBuf)
+			cifs_small_buf_release(buf);
+		else
+			cifs_buf_release(buf);
+	}
+
+	cifs_put_tlink(cfile->tlink);
+	kfree(file->private_data);
+	file->private_data = NULL;
+	/* BB can we lock the filestruct while this is going on? */
+	free_xid(xid);
+	return rc;
+}
+
+static struct cifsLockInfo *
+cifs_lock_init(__u64 offset, __u64 length, __u8 type)
+{
+	struct cifsLockInfo *lock =
+		kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
+	if (!lock)
+		return lock;
+	lock->offset = offset;
+	lock->length = length;
+	lock->type = type;
+	lock->pid = current->tgid;
+	INIT_LIST_HEAD(&lock->blist);
+	init_waitqueue_head(&lock->block_q);
+	return lock;
+}
+
+void
+cifs_del_lock_waiters(struct cifsLockInfo *lock)
+{
+	struct cifsLockInfo *li, *tmp;
+	list_for_each_entry_safe(li, tmp, &lock->blist, blist) {
+		list_del_init(&li->blist);
+		wake_up(&li->block_q);
+	}
+}
+
+#define CIFS_LOCK_OP	0
+#define CIFS_READ_OP	1
+#define CIFS_WRITE_OP	2
+
+/* @rw_check : 0 - no op, 1 - read, 2 - write */
+static bool
+cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
+			    __u64 length, __u8 type, struct cifsFileInfo *cfile,
+			    struct cifsLockInfo **conf_lock, int rw_check)
+{
+	struct cifsLockInfo *li;
+	struct cifsFileInfo *cur_cfile = fdlocks->cfile;
+	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+
+	list_for_each_entry(li, &fdlocks->locks, llist) {
+		if (offset + length <= li->offset ||
+		    offset >= li->offset + li->length)
+			continue;
+		if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid &&
+		    server->ops->compare_fids(cfile, cur_cfile)) {
+			/* shared lock prevents write op through the same fid */
+			if (!(li->type & server->vals->shared_lock_type) ||
+			    rw_check != CIFS_WRITE_OP)
+				continue;
+		}
+		if ((type & server->vals->shared_lock_type) &&
+		    ((server->ops->compare_fids(cfile, cur_cfile) &&
+		     current->tgid == li->pid) || type == li->type))
+			continue;
+		if (conf_lock)
+			*conf_lock = li;
+		return true;
+	}
+	return false;
+}
+
+bool
+cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
+			__u8 type, struct cifsLockInfo **conf_lock,
+			int rw_check)
+{
+	bool rc = false;
+	struct cifs_fid_locks *cur;
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+
+	list_for_each_entry(cur, &cinode->llist, llist) {
+		rc = cifs_find_fid_lock_conflict(cur, offset, length, type,
+						 cfile, conf_lock, rw_check);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
+/*
+ * Check if there is another lock that prevents us to set the lock (mandatory
+ * style). If such a lock exists, update the flock structure with its
+ * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
+ * or leave it the same if we can't. Returns 0 if we don't need to request to
+ * the server or 1 otherwise.
+ */
+static int
+cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
+	       __u8 type, struct file_lock *flock)
+{
+	int rc = 0;
+	struct cifsLockInfo *conf_lock;
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+	bool exist;
+
+	down_read(&cinode->lock_sem);
+
+	exist = cifs_find_lock_conflict(cfile, offset, length, type,
+					&conf_lock, CIFS_LOCK_OP);
+	if (exist) {
+		flock->fl_start = conf_lock->offset;
+		flock->fl_end = conf_lock->offset + conf_lock->length - 1;
+		flock->fl_pid = conf_lock->pid;
+		if (conf_lock->type & server->vals->shared_lock_type)
+			flock->fl_type = F_RDLCK;
+		else
+			flock->fl_type = F_WRLCK;
+	} else if (!cinode->can_cache_brlcks)
+		rc = 1;
+	else
+		flock->fl_type = F_UNLCK;
+
+	up_read(&cinode->lock_sem);
+	return rc;
+}
+
+static void
+cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
+{
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	down_write(&cinode->lock_sem);
+	list_add_tail(&lock->llist, &cfile->llist->locks);
+	up_write(&cinode->lock_sem);
+}
+
+/*
+ * Set the byte-range lock (mandatory style). Returns:
+ * 1) 0, if we set the lock and don't need to request to the server;
+ * 2) 1, if no locks prevent us but we need to request to the server;
+ * 3) -EACCESS, if there is a lock that prevents us and wait is false.
+ */
+static int
+cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
+		 bool wait)
+{
+	struct cifsLockInfo *conf_lock;
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	bool exist;
+	int rc = 0;
+
+try_again:
+	exist = false;
+	down_write(&cinode->lock_sem);
+
+	exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
+					lock->type, &conf_lock, CIFS_LOCK_OP);
+	if (!exist && cinode->can_cache_brlcks) {
+		list_add_tail(&lock->llist, &cfile->llist->locks);
+		up_write(&cinode->lock_sem);
+		return rc;
+	}
+
+	if (!exist)
+		rc = 1;
+	else if (!wait)
+		rc = -EACCES;
+	else {
+		list_add_tail(&lock->blist, &conf_lock->blist);
+		up_write(&cinode->lock_sem);
+		rc = wait_event_interruptible(lock->block_q,
+					(lock->blist.prev == &lock->blist) &&
+					(lock->blist.next == &lock->blist));
+		if (!rc)
+			goto try_again;
+		down_write(&cinode->lock_sem);
+		list_del_init(&lock->blist);
+	}
+
+	up_write(&cinode->lock_sem);
+	return rc;
+}
+
+/*
+ * Check if there is another lock that prevents us to set the lock (posix
+ * style). If such a lock exists, update the flock structure with its
+ * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
+ * or leave it the same if we can't. Returns 0 if we don't need to request to
+ * the server or 1 otherwise.
+ */
+static int
+cifs_posix_lock_test(struct file *file, struct file_lock *flock)
+{
+	int rc = 0;
+	struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
+	unsigned char saved_type = flock->fl_type;
+
+	if ((flock->fl_flags & FL_POSIX) == 0)
+		return 1;
+
+	down_read(&cinode->lock_sem);
+	posix_test_lock(file, flock);
+
+	if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
+		flock->fl_type = saved_type;
+		rc = 1;
+	}
+
+	up_read(&cinode->lock_sem);
+	return rc;
+}
+
+/*
+ * Set the byte-range lock (posix style). Returns:
+ * 1) 0, if we set the lock and don't need to request to the server;
+ * 2) 1, if we need to request to the server;
+ * 3) <0, if the error occurs while setting the lock.
+ */
+static int
+cifs_posix_lock_set(struct file *file, struct file_lock *flock)
+{
+	struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
+	int rc = 1;
+
+	if ((flock->fl_flags & FL_POSIX) == 0)
+		return rc;
+
+try_again:
+	down_write(&cinode->lock_sem);
+	if (!cinode->can_cache_brlcks) {
+		up_write(&cinode->lock_sem);
+		return rc;
+	}
+
+	rc = posix_lock_file(file, flock, NULL);
+	up_write(&cinode->lock_sem);
+	if (rc == FILE_LOCK_DEFERRED) {
+		rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
+		if (!rc)
+			goto try_again;
+		posix_unblock_lock(flock);
+	}
+	return rc;
+}
+
+int
+cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
+{
+	unsigned int xid;
+	int rc = 0, stored_rc;
+	struct cifsLockInfo *li, *tmp;
+	struct cifs_tcon *tcon;
+	unsigned int num, max_num, max_buf;
+	LOCKING_ANDX_RANGE *buf, *cur;
+	int types[] = {LOCKING_ANDX_LARGE_FILES,
+		       LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
+	int i;
+
+	xid = get_xid();
+	tcon = tlink_tcon(cfile->tlink);
+
+	/*
+	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
+	 * and check it for zero before using.
+	 */
+	max_buf = tcon->ses->server->maxBuf;
+	if (!max_buf) {
+		free_xid(xid);
+		return -EINVAL;
+	}
+
+	max_num = (max_buf - sizeof(struct smb_hdr)) /
+						sizeof(LOCKING_ANDX_RANGE);
+	buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+	if (!buf) {
+		free_xid(xid);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < 2; i++) {
+		cur = buf;
+		num = 0;
+		list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
+			if (li->type != types[i])
+				continue;
+			cur->Pid = cpu_to_le16(li->pid);
+			cur->LengthLow = cpu_to_le32((u32)li->length);
+			cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+			cur->OffsetLow = cpu_to_le32((u32)li->offset);
+			cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+			if (++num == max_num) {
+				stored_rc = cifs_lockv(xid, tcon,
+						       cfile->fid.netfid,
+						       (__u8)li->type, 0, num,
+						       buf);
+				if (stored_rc)
+					rc = stored_rc;
+				cur = buf;
+				num = 0;
+			} else
+				cur++;
+		}
+
+		if (num) {
+			stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid,
+					       (__u8)types[i], 0, num, buf);
+			if (stored_rc)
+				rc = stored_rc;
+		}
+	}
+
+	kfree(buf);
+	free_xid(xid);
+	return rc;
+}
+
+struct lock_to_push {
+	struct list_head llist;
+	__u64 offset;
+	__u64 length;
+	__u32 pid;
+	__u16 netfid;
+	__u8 type;
+};
+
+static int
+cifs_push_posix_locks(struct cifsFileInfo *cfile)
+{
+	struct inode *inode = d_inode(cfile->dentry);
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct file_lock *flock;
+	struct file_lock_context *flctx = inode->i_flctx;
+	unsigned int count = 0, i;
+	int rc = 0, xid, type;
+	struct list_head locks_to_send, *el;
+	struct lock_to_push *lck, *tmp;
+	__u64 length;
+
+	xid = get_xid();
+
+	if (!flctx)
+		goto out;
+
+	spin_lock(&flctx->flc_lock);
+	list_for_each(el, &flctx->flc_posix) {
+		count++;
+	}
+	spin_unlock(&flctx->flc_lock);
+
+	INIT_LIST_HEAD(&locks_to_send);
+
+	/*
+	 * Allocating count locks is enough because no FL_POSIX locks can be
+	 * added to the list while we are holding cinode->lock_sem that
+	 * protects locking operations of this inode.
+	 */
+	for (i = 0; i < count; i++) {
+		lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
+		if (!lck) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+		list_add_tail(&lck->llist, &locks_to_send);
+	}
+
+	el = locks_to_send.next;
+	spin_lock(&flctx->flc_lock);
+	list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
+		if (el == &locks_to_send) {
+			/*
+			 * The list ended. We don't have enough allocated
+			 * structures - something is really wrong.
+			 */
+			cifs_dbg(VFS, "Can't push all brlocks!\n");
+			break;
+		}
+		length = 1 + flock->fl_end - flock->fl_start;
+		if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
+			type = CIFS_RDLCK;
+		else
+			type = CIFS_WRLCK;
+		lck = list_entry(el, struct lock_to_push, llist);
+		lck->pid = flock->fl_pid;
+		lck->netfid = cfile->fid.netfid;
+		lck->length = length;
+		lck->type = type;
+		lck->offset = flock->fl_start;
+	}
+	spin_unlock(&flctx->flc_lock);
+
+	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
+		int stored_rc;
+
+		stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid,
+					     lck->offset, lck->length, NULL,
+					     lck->type, 0);
+		if (stored_rc)
+			rc = stored_rc;
+		list_del(&lck->llist);
+		kfree(lck);
+	}
+
+out:
+	free_xid(xid);
+	return rc;
+err_out:
+	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
+		list_del(&lck->llist);
+		kfree(lck);
+	}
+	goto out;
+}
+
+static int
+cifs_push_locks(struct cifsFileInfo *cfile)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	int rc = 0;
+
+	/* we are going to update can_cache_brlcks here - need a write access */
+	down_write(&cinode->lock_sem);
+	if (!cinode->can_cache_brlcks) {
+		up_write(&cinode->lock_sem);
+		return rc;
+	}
+
+	if (cap_unix(tcon->ses) &&
+	    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+	    ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+		rc = cifs_push_posix_locks(cfile);
+	else
+		rc = tcon->ses->server->ops->push_mand_locks(cfile);
+
+	cinode->can_cache_brlcks = false;
+	up_write(&cinode->lock_sem);
+	return rc;
+}
+
+static void
+cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
+		bool *wait_flag, struct TCP_Server_Info *server)
+{
+	if (flock->fl_flags & FL_POSIX)
+		cifs_dbg(FYI, "Posix\n");
+	if (flock->fl_flags & FL_FLOCK)
+		cifs_dbg(FYI, "Flock\n");
+	if (flock->fl_flags & FL_SLEEP) {
+		cifs_dbg(FYI, "Blocking lock\n");
+		*wait_flag = true;
+	}
+	if (flock->fl_flags & FL_ACCESS)
+		cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n");
+	if (flock->fl_flags & FL_LEASE)
+		cifs_dbg(FYI, "Lease on file - not implemented yet\n");
+	if (flock->fl_flags &
+	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP |
+	       FL_ACCESS | FL_LEASE | FL_CLOSE)))
+		cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
+
+	*type = server->vals->large_lock_type;
+	if (flock->fl_type == F_WRLCK) {
+		cifs_dbg(FYI, "F_WRLCK\n");
+		*type |= server->vals->exclusive_lock_type;
+		*lock = 1;
+	} else if (flock->fl_type == F_UNLCK) {
+		cifs_dbg(FYI, "F_UNLCK\n");
+		*type |= server->vals->unlock_lock_type;
+		*unlock = 1;
+		/* Check if unlock includes more than one lock range */
+	} else if (flock->fl_type == F_RDLCK) {
+		cifs_dbg(FYI, "F_RDLCK\n");
+		*type |= server->vals->shared_lock_type;
+		*lock = 1;
+	} else if (flock->fl_type == F_EXLCK) {
+		cifs_dbg(FYI, "F_EXLCK\n");
+		*type |= server->vals->exclusive_lock_type;
+		*lock = 1;
+	} else if (flock->fl_type == F_SHLCK) {
+		cifs_dbg(FYI, "F_SHLCK\n");
+		*type |= server->vals->shared_lock_type;
+		*lock = 1;
+	} else
+		cifs_dbg(FYI, "Unknown type of lock\n");
+}
+
+static int
+cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
+	   bool wait_flag, bool posix_lck, unsigned int xid)
+{
+	int rc = 0;
+	__u64 length = 1 + flock->fl_end - flock->fl_start;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	__u16 netfid = cfile->fid.netfid;
+
+	if (posix_lck) {
+		int posix_lock_type;
+
+		rc = cifs_posix_lock_test(file, flock);
+		if (!rc)
+			return rc;
+
+		if (type & server->vals->shared_lock_type)
+			posix_lock_type = CIFS_RDLCK;
+		else
+			posix_lock_type = CIFS_WRLCK;
+		rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
+				      flock->fl_start, length, flock,
+				      posix_lock_type, wait_flag);
+		return rc;
+	}
+
+	rc = cifs_lock_test(cfile, flock->fl_start, length, type, flock);
+	if (!rc)
+		return rc;
+
+	/* BB we could chain these into one lock request BB */
+	rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type,
+				    1, 0, false);
+	if (rc == 0) {
+		rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
+					    type, 0, 1, false);
+		flock->fl_type = F_UNLCK;
+		if (rc != 0)
+			cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
+				 rc);
+		return 0;
+	}
+
+	if (type & server->vals->shared_lock_type) {
+		flock->fl_type = F_WRLCK;
+		return 0;
+	}
+
+	type &= ~server->vals->exclusive_lock_type;
+
+	rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
+				    type | server->vals->shared_lock_type,
+				    1, 0, false);
+	if (rc == 0) {
+		rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
+			type | server->vals->shared_lock_type, 0, 1, false);
+		flock->fl_type = F_RDLCK;
+		if (rc != 0)
+			cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
+				 rc);
+	} else
+		flock->fl_type = F_WRLCK;
+
+	return 0;
+}
+
+void
+cifs_move_llist(struct list_head *source, struct list_head *dest)
+{
+	struct list_head *li, *tmp;
+	list_for_each_safe(li, tmp, source)
+		list_move(li, dest);
+}
+
+void
+cifs_free_llist(struct list_head *llist)
+{
+	struct cifsLockInfo *li, *tmp;
+	list_for_each_entry_safe(li, tmp, llist, llist) {
+		cifs_del_lock_waiters(li);
+		list_del(&li->llist);
+		kfree(li);
+	}
+}
+
+int
+cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
+		  unsigned int xid)
+{
+	int rc = 0, stored_rc;
+	int types[] = {LOCKING_ANDX_LARGE_FILES,
+		       LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
+	unsigned int i;
+	unsigned int max_num, num, max_buf;
+	LOCKING_ANDX_RANGE *buf, *cur;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	struct cifsLockInfo *li, *tmp;
+	__u64 length = 1 + flock->fl_end - flock->fl_start;
+	struct list_head tmp_llist;
+
+	INIT_LIST_HEAD(&tmp_llist);
+
+	/*
+	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
+	 * and check it for zero before using.
+	 */
+	max_buf = tcon->ses->server->maxBuf;
+	if (!max_buf)
+		return -EINVAL;
+
+	max_num = (max_buf - sizeof(struct smb_hdr)) /
+						sizeof(LOCKING_ANDX_RANGE);
+	buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	down_write(&cinode->lock_sem);
+	for (i = 0; i < 2; i++) {
+		cur = buf;
+		num = 0;
+		list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
+			if (flock->fl_start > li->offset ||
+			    (flock->fl_start + length) <
+			    (li->offset + li->length))
+				continue;
+			if (current->tgid != li->pid)
+				continue;
+			if (types[i] != li->type)
+				continue;
+			if (cinode->can_cache_brlcks) {
+				/*
+				 * We can cache brlock requests - simply remove
+				 * a lock from the file's list.
+				 */
+				list_del(&li->llist);
+				cifs_del_lock_waiters(li);
+				kfree(li);
+				continue;
+			}
+			cur->Pid = cpu_to_le16(li->pid);
+			cur->LengthLow = cpu_to_le32((u32)li->length);
+			cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+			cur->OffsetLow = cpu_to_le32((u32)li->offset);
+			cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+			/*
+			 * We need to save a lock here to let us add it again to
+			 * the file's list if the unlock range request fails on
+			 * the server.
+			 */
+			list_move(&li->llist, &tmp_llist);
+			if (++num == max_num) {
+				stored_rc = cifs_lockv(xid, tcon,
+						       cfile->fid.netfid,
+						       li->type, num, 0, buf);
+				if (stored_rc) {
+					/*
+					 * We failed on the unlock range
+					 * request - add all locks from the tmp
+					 * list to the head of the file's list.
+					 */
+					cifs_move_llist(&tmp_llist,
+							&cfile->llist->locks);
+					rc = stored_rc;
+				} else
+					/*
+					 * The unlock range request succeed -
+					 * free the tmp list.
+					 */
+					cifs_free_llist(&tmp_llist);
+				cur = buf;
+				num = 0;
+			} else
+				cur++;
+		}
+		if (num) {
+			stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid,
+					       types[i], num, 0, buf);
+			if (stored_rc) {
+				cifs_move_llist(&tmp_llist,
+						&cfile->llist->locks);
+				rc = stored_rc;
+			} else
+				cifs_free_llist(&tmp_llist);
+		}
+	}
+
+	up_write(&cinode->lock_sem);
+	kfree(buf);
+	return rc;
+}
+
+static int
+cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
+	   bool wait_flag, bool posix_lck, int lock, int unlock,
+	   unsigned int xid)
+{
+	int rc = 0;
+	__u64 length = 1 + flock->fl_end - flock->fl_start;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct inode *inode = d_inode(cfile->dentry);
+
+	if (posix_lck) {
+		int posix_lock_type;
+
+		rc = cifs_posix_lock_set(file, flock);
+		if (!rc || rc < 0)
+			return rc;
+
+		if (type & server->vals->shared_lock_type)
+			posix_lock_type = CIFS_RDLCK;
+		else
+			posix_lock_type = CIFS_WRLCK;
+
+		if (unlock == 1)
+			posix_lock_type = CIFS_UNLCK;
+
+		rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid,
+				      current->tgid, flock->fl_start, length,
+				      NULL, posix_lock_type, wait_flag);
+		goto out;
+	}
+
+	if (lock) {
+		struct cifsLockInfo *lock;
+
+		lock = cifs_lock_init(flock->fl_start, length, type);
+		if (!lock)
+			return -ENOMEM;
+
+		rc = cifs_lock_add_if(cfile, lock, wait_flag);
+		if (rc < 0) {
+			kfree(lock);
+			return rc;
+		}
+		if (!rc)
+			goto out;
+
+		/*
+		 * Windows 7 server can delay breaking lease from read to None
+		 * if we set a byte-range lock on a file - break it explicitly
+		 * before sending the lock to the server to be sure the next
+		 * read won't conflict with non-overlapted locks due to
+		 * pagereading.
+		 */
+		if (!CIFS_CACHE_WRITE(CIFS_I(inode)) &&
+					CIFS_CACHE_READ(CIFS_I(inode))) {
+			cifs_zap_mapping(inode);
+			cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
+				 inode);
+			CIFS_I(inode)->oplock = 0;
+		}
+
+		rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
+					    type, 1, 0, wait_flag);
+		if (rc) {
+			kfree(lock);
+			return rc;
+		}
+
+		cifs_lock_add(cfile, lock);
+	} else if (unlock)
+		rc = server->ops->mand_unlock_range(cfile, flock, xid);
+
+out:
+	if (flock->fl_flags & FL_POSIX && !rc)
+		rc = posix_lock_file_wait(file, flock);
+	return rc;
+}
+
+int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
+{
+	int rc, xid;
+	int lock = 0, unlock = 0;
+	bool wait_flag = false;
+	bool posix_lck = false;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_tcon *tcon;
+	struct cifsInodeInfo *cinode;
+	struct cifsFileInfo *cfile;
+	__u16 netfid;
+	__u32 type;
+
+	rc = -EACCES;
+	xid = get_xid();
+
+	cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n",
+		 cmd, flock->fl_flags, flock->fl_type,
+		 flock->fl_start, flock->fl_end);
+
+	cfile = (struct cifsFileInfo *)file->private_data;
+	tcon = tlink_tcon(cfile->tlink);
+
+	cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
+			tcon->ses->server);
+
+	cifs_sb = CIFS_FILE_SB(file);
+	netfid = cfile->fid.netfid;
+	cinode = CIFS_I(file_inode(file));
+
+	if (cap_unix(tcon->ses) &&
+	    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+	    ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+		posix_lck = true;
+	/*
+	 * BB add code here to normalize offset and length to account for
+	 * negative length which we can not accept over the wire.
+	 */
+	if (IS_GETLK(cmd)) {
+		rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid);
+		free_xid(xid);
+		return rc;
+	}
+
+	if (!lock && !unlock) {
+		/*
+		 * if no lock or unlock then nothing to do since we do not
+		 * know what it is
+		 */
+		free_xid(xid);
+		return -EOPNOTSUPP;
+	}
+
+	rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock,
+			xid);
+	free_xid(xid);
+	return rc;
+}
+
+/*
+ * update the file size (if needed) after a write. Should be called with
+ * the inode->i_lock held
+ */
+void
+cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
+		      unsigned int bytes_written)
+{
+	loff_t end_of_write = offset + bytes_written;
+
+	if (end_of_write > cifsi->server_eof)
+		cifsi->server_eof = end_of_write;
+}
+
+static ssize_t
+cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
+	   size_t write_size, loff_t *offset)
+{
+	int rc = 0;
+	unsigned int bytes_written = 0;
+	unsigned int total_written;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	unsigned int xid;
+	struct dentry *dentry = open_file->dentry;
+	struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry));
+	struct cifs_io_parms io_parms;
+
+	cifs_sb = CIFS_SB(dentry->d_sb);
+
+	cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n",
+		 write_size, *offset, dentry);
+
+	tcon = tlink_tcon(open_file->tlink);
+	server = tcon->ses->server;
+
+	if (!server->ops->sync_write)
+		return -ENOSYS;
+
+	xid = get_xid();
+
+	for (total_written = 0; write_size > total_written;
+	     total_written += bytes_written) {
+		rc = -EAGAIN;
+		while (rc == -EAGAIN) {
+			struct kvec iov[2];
+			unsigned int len;
+
+			if (open_file->invalidHandle) {
+				/* we could deadlock if we called
+				   filemap_fdatawait from here so tell
+				   reopen_file not to flush data to
+				   server now */
+				rc = cifs_reopen_file(open_file, false);
+				if (rc != 0)
+					break;
+			}
+
+			len = min(server->ops->wp_retry_size(d_inode(dentry)),
+				  (unsigned int)write_size - total_written);
+			/* iov[0] is reserved for smb header */
+			iov[1].iov_base = (char *)write_data + total_written;
+			iov[1].iov_len = len;
+			io_parms.pid = pid;
+			io_parms.tcon = tcon;
+			io_parms.offset = *offset;
+			io_parms.length = len;
+			rc = server->ops->sync_write(xid, &open_file->fid,
+					&io_parms, &bytes_written, iov, 1);
+		}
+		if (rc || (bytes_written == 0)) {
+			if (total_written)
+				break;
+			else {
+				free_xid(xid);
+				return rc;
+			}
+		} else {
+			spin_lock(&d_inode(dentry)->i_lock);
+			cifs_update_eof(cifsi, *offset, bytes_written);
+			spin_unlock(&d_inode(dentry)->i_lock);
+			*offset += bytes_written;
+		}
+	}
+
+	cifs_stats_bytes_written(tcon, total_written);
+
+	if (total_written > 0) {
+		spin_lock(&d_inode(dentry)->i_lock);
+		if (*offset > d_inode(dentry)->i_size)
+			i_size_write(d_inode(dentry), *offset);
+		spin_unlock(&d_inode(dentry)->i_lock);
+	}
+	mark_inode_dirty_sync(d_inode(dentry));
+	free_xid(xid);
+	return total_written;
+}
+
+struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
+					bool fsuid_only)
+{
+	struct cifsFileInfo *open_file = NULL;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+
+	/* only filter by fsuid on multiuser mounts */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		fsuid_only = false;
+
+	spin_lock(&cifs_file_list_lock);
+	/* we could simply get the first_list_entry since write-only entries
+	   are always at the end of the list but since the first entry might
+	   have a close pending, we go through the whole list */
+	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
+		if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
+			continue;
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
+			if (!open_file->invalidHandle) {
+				/* found a good file */
+				/* lock it so it will not be closed on us */
+				cifsFileInfo_get_locked(open_file);
+				spin_unlock(&cifs_file_list_lock);
+				return open_file;
+			} /* else might as well continue, and look for
+			     another, or simply have the caller reopen it
+			     again rather than trying to fix this handle */
+		} else /* write only file */
+			break; /* write only files are last so must be done */
+	}
+	spin_unlock(&cifs_file_list_lock);
+	return NULL;
+}
+
+struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
+					bool fsuid_only)
+{
+	struct cifsFileInfo *open_file, *inv_file = NULL;
+	struct cifs_sb_info *cifs_sb;
+	bool any_available = false;
+	int rc;
+	unsigned int refind = 0;
+
+	/* Having a null inode here (because mapping->host was set to zero by
+	the VFS or MM) should not happen but we had reports of on oops (due to
+	it being zero) during stress testcases so we need to check for it */
+
+	if (cifs_inode == NULL) {
+		cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
+		dump_stack();
+		return NULL;
+	}
+
+	cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+
+	/* only filter by fsuid on multiuser mounts */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		fsuid_only = false;
+
+	spin_lock(&cifs_file_list_lock);
+refind_writable:
+	if (refind > MAX_REOPEN_ATT) {
+		spin_unlock(&cifs_file_list_lock);
+		return NULL;
+	}
+	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
+		if (!any_available && open_file->pid != current->tgid)
+			continue;
+		if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
+			continue;
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
+			if (!open_file->invalidHandle) {
+				/* found a good writable file */
+				cifsFileInfo_get_locked(open_file);
+				spin_unlock(&cifs_file_list_lock);
+				return open_file;
+			} else {
+				if (!inv_file)
+					inv_file = open_file;
+			}
+		}
+	}
+	/* couldn't find useable FH with same pid, try any available */
+	if (!any_available) {
+		any_available = true;
+		goto refind_writable;
+	}
+
+	if (inv_file) {
+		any_available = false;
+		cifsFileInfo_get_locked(inv_file);
+	}
+
+	spin_unlock(&cifs_file_list_lock);
+
+	if (inv_file) {
+		rc = cifs_reopen_file(inv_file, false);
+		if (!rc)
+			return inv_file;
+		else {
+			spin_lock(&cifs_file_list_lock);
+			list_move_tail(&inv_file->flist,
+					&cifs_inode->openFileList);
+			spin_unlock(&cifs_file_list_lock);
+			cifsFileInfo_put(inv_file);
+			spin_lock(&cifs_file_list_lock);
+			++refind;
+			inv_file = NULL;
+			goto refind_writable;
+		}
+	}
+
+	return NULL;
+}
+
+static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
+{
+	struct address_space *mapping = page->mapping;
+	loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	char *write_data;
+	int rc = -EFAULT;
+	int bytes_written = 0;
+	struct inode *inode;
+	struct cifsFileInfo *open_file;
+
+	if (!mapping || !mapping->host)
+		return -EFAULT;
+
+	inode = page->mapping->host;
+
+	offset += (loff_t)from;
+	write_data = kmap(page);
+	write_data += from;
+
+	if ((to > PAGE_CACHE_SIZE) || (from > to)) {
+		kunmap(page);
+		return -EIO;
+	}
+
+	/* racing with truncate? */
+	if (offset > mapping->host->i_size) {
+		kunmap(page);
+		return 0; /* don't care */
+	}
+
+	/* check to make sure that we are not extending the file */
+	if (mapping->host->i_size - offset < (loff_t)to)
+		to = (unsigned)(mapping->host->i_size - offset);
+
+	open_file = find_writable_file(CIFS_I(mapping->host), false);
+	if (open_file) {
+		bytes_written = cifs_write(open_file, open_file->pid,
+					   write_data, to - from, &offset);
+		cifsFileInfo_put(open_file);
+		/* Does mm or vfs already set times? */
+		inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
+		if ((bytes_written > 0) && (offset))
+			rc = 0;
+		else if (bytes_written < 0)
+			rc = bytes_written;
+	} else {
+		cifs_dbg(FYI, "No writeable filehandles for inode\n");
+		rc = -EIO;
+	}
+
+	kunmap(page);
+	return rc;
+}
+
+static struct cifs_writedata *
+wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
+			  pgoff_t end, pgoff_t *index,
+			  unsigned int *found_pages)
+{
+	unsigned int nr_pages;
+	struct page **pages;
+	struct cifs_writedata *wdata;
+
+	wdata = cifs_writedata_alloc((unsigned int)tofind,
+				     cifs_writev_complete);
+	if (!wdata)
+		return NULL;
+
+	/*
+	 * find_get_pages_tag seems to return a max of 256 on each
+	 * iteration, so we must call it several times in order to
+	 * fill the array or the wsize is effectively limited to
+	 * 256 * PAGE_CACHE_SIZE.
+	 */
+	*found_pages = 0;
+	pages = wdata->pages;
+	do {
+		nr_pages = find_get_pages_tag(mapping, index,
+					      PAGECACHE_TAG_DIRTY, tofind,
+					      pages);
+		*found_pages += nr_pages;
+		tofind -= nr_pages;
+		pages += nr_pages;
+	} while (nr_pages && tofind && *index <= end);
+
+	return wdata;
+}
+
+static unsigned int
+wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
+		    struct address_space *mapping,
+		    struct writeback_control *wbc,
+		    pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done)
+{
+	unsigned int nr_pages = 0, i;
+	struct page *page;
+
+	for (i = 0; i < found_pages; i++) {
+		page = wdata->pages[i];
+		/*
+		 * At this point we hold neither mapping->tree_lock nor
+		 * lock on the page itself: the page may be truncated or
+		 * invalidated (changing page->mapping to NULL), or even
+		 * swizzled back from swapper_space to tmpfs file
+		 * mapping
+		 */
+
+		if (nr_pages == 0)
+			lock_page(page);
+		else if (!trylock_page(page))
+			break;
+
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			break;
+		}
+
+		if (!wbc->range_cyclic && page->index > end) {
+			*done = true;
+			unlock_page(page);
+			break;
+		}
+
+		if (*next && (page->index != *next)) {
+			/* Not next consecutive page */
+			unlock_page(page);
+			break;
+		}
+
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(page);
+
+		if (PageWriteback(page) ||
+				!clear_page_dirty_for_io(page)) {
+			unlock_page(page);
+			break;
+		}
+
+		/*
+		 * This actually clears the dirty bit in the radix tree.
+		 * See cifs_writepage() for more commentary.
+		 */
+		set_page_writeback(page);
+		if (page_offset(page) >= i_size_read(mapping->host)) {
+			*done = true;
+			unlock_page(page);
+			end_page_writeback(page);
+			break;
+		}
+
+		wdata->pages[i] = page;
+		*next = page->index + 1;
+		++nr_pages;
+	}
+
+	/* reset index to refind any pages skipped */
+	if (nr_pages == 0)
+		*index = wdata->pages[0]->index + 1;
+
+	/* put any pages we aren't going to use */
+	for (i = nr_pages; i < found_pages; i++) {
+		page_cache_release(wdata->pages[i]);
+		wdata->pages[i] = NULL;
+	}
+
+	return nr_pages;
+}
+
+static int
+wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
+		 struct address_space *mapping, struct writeback_control *wbc)
+{
+	int rc = 0;
+	struct TCP_Server_Info *server;
+	unsigned int i;
+
+	wdata->sync_mode = wbc->sync_mode;
+	wdata->nr_pages = nr_pages;
+	wdata->offset = page_offset(wdata->pages[0]);
+	wdata->pagesz = PAGE_CACHE_SIZE;
+	wdata->tailsz = min(i_size_read(mapping->host) -
+			page_offset(wdata->pages[nr_pages - 1]),
+			(loff_t)PAGE_CACHE_SIZE);
+	wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
+
+	if (wdata->cfile != NULL)
+		cifsFileInfo_put(wdata->cfile);
+	wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
+	if (!wdata->cfile) {
+		cifs_dbg(VFS, "No writable handles for inode\n");
+		rc = -EBADF;
+	} else {
+		wdata->pid = wdata->cfile->pid;
+		server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+		rc = server->ops->async_writev(wdata, cifs_writedata_release);
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		unlock_page(wdata->pages[i]);
+
+	return rc;
+}
+
+static int cifs_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+	struct TCP_Server_Info *server;
+	bool done = false, scanned = false, range_whole = false;
+	pgoff_t end, index;
+	struct cifs_writedata *wdata;
+	int rc = 0;
+
+	/*
+	 * If wsize is smaller than the page cache size, default to writing
+	 * one page at a time via cifs_writepage
+	 */
+	if (cifs_sb->wsize < PAGE_CACHE_SIZE)
+		return generic_writepages(mapping, wbc);
+
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = true;
+		scanned = true;
+	}
+	server = cifs_sb_master_tcon(cifs_sb)->ses->server;
+retry:
+	while (!done && index <= end) {
+		unsigned int i, nr_pages, found_pages, wsize, credits;
+		pgoff_t next = 0, tofind, saved_index = index;
+
+		rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
+						   &wsize, &credits);
+		if (rc)
+			break;
+
+		tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
+
+		wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
+						  &found_pages);
+		if (!wdata) {
+			rc = -ENOMEM;
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		if (found_pages == 0) {
+			kref_put(&wdata->refcount, cifs_writedata_release);
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc,
+					       end, &index, &next, &done);
+
+		/* nothing to write? */
+		if (nr_pages == 0) {
+			kref_put(&wdata->refcount, cifs_writedata_release);
+			add_credits_and_wake_if(server, credits, 0);
+			continue;
+		}
+
+		wdata->credits = credits;
+
+		rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+
+		/* send failure -- clean up the mess */
+		if (rc != 0) {
+			add_credits_and_wake_if(server, wdata->credits, 0);
+			for (i = 0; i < nr_pages; ++i) {
+				if (rc == -EAGAIN)
+					redirty_page_for_writepage(wbc,
+							   wdata->pages[i]);
+				else
+					SetPageError(wdata->pages[i]);
+				end_page_writeback(wdata->pages[i]);
+				page_cache_release(wdata->pages[i]);
+			}
+			if (rc != -EAGAIN)
+				mapping_set_error(mapping, rc);
+		}
+		kref_put(&wdata->refcount, cifs_writedata_release);
+
+		if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) {
+			index = saved_index;
+			continue;
+		}
+
+		wbc->nr_to_write -= nr_pages;
+		if (wbc->nr_to_write <= 0)
+			done = true;
+
+		index = next;
+	}
+
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = true;
+		index = 0;
+		goto retry;
+	}
+
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+
+	return rc;
+}
+
+static int
+cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
+{
+	int rc;
+	unsigned int xid;
+
+	xid = get_xid();
+/* BB add check for wbc flags */
+	page_cache_get(page);
+	if (!PageUptodate(page))
+		cifs_dbg(FYI, "ppw - page not up to date\n");
+
+	/*
+	 * Set the "writeback" flag, and clear "dirty" in the radix tree.
+	 *
+	 * A writepage() implementation always needs to do either this,
+	 * or re-dirty the page with "redirty_page_for_writepage()" in
+	 * the case of a failure.
+	 *
+	 * Just unlocking the page will cause the radix tree tag-bits
+	 * to fail to update with the state of the page correctly.
+	 */
+	set_page_writeback(page);
+retry_write:
+	rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE);
+	if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
+		goto retry_write;
+	else if (rc == -EAGAIN)
+		redirty_page_for_writepage(wbc, page);
+	else if (rc != 0)
+		SetPageError(page);
+	else
+		SetPageUptodate(page);
+	end_page_writeback(page);
+	page_cache_release(page);
+	free_xid(xid);
+	return rc;
+}
+
+static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int rc = cifs_writepage_locked(page, wbc);
+	unlock_page(page);
+	return rc;
+}
+
+static int cifs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	int rc;
+	struct inode *inode = mapping->host;
+	struct cifsFileInfo *cfile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+	__u32 pid;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		pid = cfile->pid;
+	else
+		pid = current->tgid;
+
+	cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n",
+		 page, pos, copied);
+
+	if (PageChecked(page)) {
+		if (copied == len)
+			SetPageUptodate(page);
+		ClearPageChecked(page);
+	} else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+		SetPageUptodate(page);
+
+	if (!PageUptodate(page)) {
+		char *page_data;
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		unsigned int xid;
+
+		xid = get_xid();
+		/* this is probably better than directly calling
+		   partialpage_write since in this function the file handle is
+		   known which we might as well	leverage */
+		/* BB check if anything else missing out of ppw
+		   such as updating last write time */
+		page_data = kmap(page);
+		rc = cifs_write(cfile, pid, page_data + offset, copied, &pos);
+		/* if (rc < 0) should we set writebehind rc? */
+		kunmap(page);
+
+		free_xid(xid);
+	} else {
+		rc = copied;
+		pos += copied;
+		set_page_dirty(page);
+	}
+
+	if (rc > 0) {
+		spin_lock(&inode->i_lock);
+		if (pos > inode->i_size)
+			i_size_write(inode, pos);
+		spin_unlock(&inode->i_lock);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return rc;
+}
+
+int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync)
+{
+	unsigned int xid;
+	int rc = 0;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct cifsFileInfo *smbfile = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (rc)
+		return rc;
+	mutex_lock(&inode->i_mutex);
+
+	xid = get_xid();
+
+	cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n",
+		 file, datasync);
+
+	if (!CIFS_CACHE_READ(CIFS_I(inode))) {
+		rc = cifs_zap_mapping(inode);
+		if (rc) {
+			cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
+			rc = 0; /* don't care about it in fsync */
+		}
+	}
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
+		server = tcon->ses->server;
+		if (server->ops->flush)
+			rc = server->ops->flush(xid, tcon, &smbfile->fid);
+		else
+			rc = -ENOSYS;
+	}
+
+	free_xid(xid);
+	mutex_unlock(&inode->i_mutex);
+	return rc;
+}
+
+int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	unsigned int xid;
+	int rc = 0;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct cifsFileInfo *smbfile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
+	struct inode *inode = file->f_mapping->host;
+
+	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (rc)
+		return rc;
+	mutex_lock(&inode->i_mutex);
+
+	xid = get_xid();
+
+	cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n",
+		 file, datasync);
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
+		server = tcon->ses->server;
+		if (server->ops->flush)
+			rc = server->ops->flush(xid, tcon, &smbfile->fid);
+		else
+			rc = -ENOSYS;
+	}
+
+	free_xid(xid);
+	mutex_unlock(&inode->i_mutex);
+	return rc;
+}
+
+/*
+ * As file closes, flush all cached write data for this inode checking
+ * for write behind errors.
+ */
+int cifs_flush(struct file *file, fl_owner_t id)
+{
+	struct inode *inode = file_inode(file);
+	int rc = 0;
+
+	if (file->f_mode & FMODE_WRITE)
+		rc = filemap_write_and_wait(inode->i_mapping);
+
+	cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc);
+
+	return rc;
+}
+
+static int
+cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
+{
+	int rc = 0;
+	unsigned long i;
+
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+		if (!pages[i]) {
+			/*
+			 * save number of pages we have already allocated and
+			 * return with ENOMEM error
+			 */
+			num_pages = i;
+			rc = -ENOMEM;
+			break;
+		}
+	}
+
+	if (rc) {
+		for (i = 0; i < num_pages; i++)
+			put_page(pages[i]);
+	}
+	return rc;
+}
+
+static inline
+size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
+{
+	size_t num_pages;
+	size_t clen;
+
+	clen = min_t(const size_t, len, wsize);
+	num_pages = DIV_ROUND_UP(clen, PAGE_SIZE);
+
+	if (cur_len)
+		*cur_len = clen;
+
+	return num_pages;
+}
+
+static void
+cifs_uncached_writedata_release(struct kref *refcount)
+{
+	int i;
+	struct cifs_writedata *wdata = container_of(refcount,
+					struct cifs_writedata, refcount);
+
+	for (i = 0; i < wdata->nr_pages; i++)
+		put_page(wdata->pages[i]);
+	cifs_writedata_release(refcount);
+}
+
+static void
+cifs_uncached_writev_complete(struct work_struct *work)
+{
+	struct cifs_writedata *wdata = container_of(work,
+					struct cifs_writedata, work);
+	struct inode *inode = d_inode(wdata->cfile->dentry);
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+	spin_lock(&inode->i_lock);
+	cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
+	if (cifsi->server_eof > inode->i_size)
+		i_size_write(inode, cifsi->server_eof);
+	spin_unlock(&inode->i_lock);
+
+	complete(&wdata->done);
+
+	kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+}
+
+static int
+wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
+		      size_t *len, unsigned long *num_pages)
+{
+	size_t save_len, copied, bytes, cur_len = *len;
+	unsigned long i, nr_pages = *num_pages;
+
+	save_len = cur_len;
+	for (i = 0; i < nr_pages; i++) {
+		bytes = min_t(const size_t, cur_len, PAGE_SIZE);
+		copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
+		cur_len -= copied;
+		/*
+		 * If we didn't copy as much as we expected, then that
+		 * may mean we trod into an unmapped area. Stop copying
+		 * at that point. On the next pass through the big
+		 * loop, we'll likely end up getting a zero-length
+		 * write and bailing out of it.
+		 */
+		if (copied < bytes)
+			break;
+	}
+	cur_len = save_len - cur_len;
+	*len = cur_len;
+
+	/*
+	 * If we have no data to send, then that probably means that
+	 * the copy above failed altogether. That's most likely because
+	 * the address in the iovec was bogus. Return -EFAULT and let
+	 * the caller free anything we allocated and bail out.
+	 */
+	if (!cur_len)
+		return -EFAULT;
+
+	/*
+	 * i + 1 now represents the number of pages we actually used in
+	 * the copy phase above.
+	 */
+	*num_pages = i + 1;
+	return 0;
+}
+
+static int
+cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
+		     struct cifsFileInfo *open_file,
+		     struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
+{
+	int rc = 0;
+	size_t cur_len;
+	unsigned long nr_pages, num_pages, i;
+	struct cifs_writedata *wdata;
+	struct iov_iter saved_from;
+	loff_t saved_offset = offset;
+	pid_t pid;
+	struct TCP_Server_Info *server;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		pid = open_file->pid;
+	else
+		pid = current->tgid;
+
+	server = tlink_tcon(open_file->tlink)->ses->server;
+	memcpy(&saved_from, from, sizeof(struct iov_iter));
+
+	do {
+		unsigned int wsize, credits;
+
+		rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
+						   &wsize, &credits);
+		if (rc)
+			break;
+
+		nr_pages = get_numpages(wsize, len, &cur_len);
+		wdata = cifs_writedata_alloc(nr_pages,
+					     cifs_uncached_writev_complete);
+		if (!wdata) {
+			rc = -ENOMEM;
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
+		if (rc) {
+			kfree(wdata);
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		num_pages = nr_pages;
+		rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
+		if (rc) {
+			for (i = 0; i < nr_pages; i++)
+				put_page(wdata->pages[i]);
+			kfree(wdata);
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		/*
+		 * Bring nr_pages down to the number of pages we actually used,
+		 * and free any pages that we didn't use.
+		 */
+		for ( ; nr_pages > num_pages; nr_pages--)
+			put_page(wdata->pages[nr_pages - 1]);
+
+		wdata->sync_mode = WB_SYNC_ALL;
+		wdata->nr_pages = nr_pages;
+		wdata->offset = (__u64)offset;
+		wdata->cfile = cifsFileInfo_get(open_file);
+		wdata->pid = pid;
+		wdata->bytes = cur_len;
+		wdata->pagesz = PAGE_SIZE;
+		wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
+		wdata->credits = credits;
+
+		if (!wdata->cfile->invalidHandle ||
+		    !cifs_reopen_file(wdata->cfile, false))
+			rc = server->ops->async_writev(wdata,
+					cifs_uncached_writedata_release);
+		if (rc) {
+			add_credits_and_wake_if(server, wdata->credits, 0);
+			kref_put(&wdata->refcount,
+				 cifs_uncached_writedata_release);
+			if (rc == -EAGAIN) {
+				memcpy(from, &saved_from,
+				       sizeof(struct iov_iter));
+				iov_iter_advance(from, offset - saved_offset);
+				continue;
+			}
+			break;
+		}
+
+		list_add_tail(&wdata->list, wdata_list);
+		offset += cur_len;
+		len -= cur_len;
+	} while (len > 0);
+
+	return rc;
+}
+
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t total_written = 0;
+	struct cifsFileInfo *open_file;
+	struct cifs_tcon *tcon;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_writedata *wdata, *tmp;
+	struct list_head wdata_list;
+	struct iov_iter saved_from;
+	int rc;
+
+	/*
+	 * BB - optimize the way when signing is disabled. We can drop this
+	 * extra memory-to-memory copying and use iovec buffers for constructing
+	 * write request.
+	 */
+
+	rc = generic_write_checks(iocb, from);
+	if (rc <= 0)
+		return rc;
+
+	INIT_LIST_HEAD(&wdata_list);
+	cifs_sb = CIFS_FILE_SB(file);
+	open_file = file->private_data;
+	tcon = tlink_tcon(open_file->tlink);
+
+	if (!tcon->ses->server->ops->async_writev)
+		return -ENOSYS;
+
+	memcpy(&saved_from, from, sizeof(struct iov_iter));
+
+	rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from,
+				  open_file, cifs_sb, &wdata_list);
+
+	/*
+	 * If at least one write was successfully sent, then discard any rc
+	 * value from the later writes. If the other write succeeds, then
+	 * we'll end up returning whatever was written. If it fails, then
+	 * we'll get a new rc value from that.
+	 */
+	if (!list_empty(&wdata_list))
+		rc = 0;
+
+	/*
+	 * Wait for and collect replies for any successful sends in order of
+	 * increasing offset. Once an error is hit or we get a fatal signal
+	 * while waiting, then return without waiting for any more replies.
+	 */
+restart_loop:
+	list_for_each_entry_safe(wdata, tmp, &wdata_list, list) {
+		if (!rc) {
+			/* FIXME: freezable too? */
+			rc = wait_for_completion_killable(&wdata->done);
+			if (rc)
+				rc = -EINTR;
+			else if (wdata->result)
+				rc = wdata->result;
+			else
+				total_written += wdata->bytes;
+
+			/* resend call if it's a retryable error */
+			if (rc == -EAGAIN) {
+				struct list_head tmp_list;
+				struct iov_iter tmp_from;
+
+				INIT_LIST_HEAD(&tmp_list);
+				list_del_init(&wdata->list);
+
+				memcpy(&tmp_from, &saved_from,
+				       sizeof(struct iov_iter));
+				iov_iter_advance(&tmp_from,
+						 wdata->offset - iocb->ki_pos);
+
+				rc = cifs_write_from_iter(wdata->offset,
+						wdata->bytes, &tmp_from,
+						open_file, cifs_sb, &tmp_list);
+
+				list_splice(&tmp_list, &wdata_list);
+
+				kref_put(&wdata->refcount,
+					 cifs_uncached_writedata_release);
+				goto restart_loop;
+			}
+		}
+		list_del_init(&wdata->list);
+		kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+	}
+
+	if (unlikely(!total_written))
+		return rc;
+
+	iocb->ki_pos += total_written;
+	set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags);
+	cifs_stats_bytes_written(tcon, total_written);
+	return total_written;
+}
+
+static ssize_t
+cifs_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+	struct inode *inode = file->f_mapping->host;
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+	ssize_t rc;
+
+	/*
+	 * We need to hold the sem to be sure nobody modifies lock list
+	 * with a brlock that prevents writing.
+	 */
+	down_read(&cinode->lock_sem);
+	mutex_lock(&inode->i_mutex);
+
+	rc = generic_write_checks(iocb, from);
+	if (rc <= 0)
+		goto out;
+
+	if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
+				     server->vals->exclusive_lock_type, NULL,
+				     CIFS_WRITE_OP))
+		rc = __generic_file_write_iter(iocb, from);
+	else
+		rc = -EACCES;
+out:
+	mutex_unlock(&inode->i_mutex);
+
+	if (rc > 0) {
+		ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+		if (err < 0)
+			rc = err;
+	}
+	up_read(&cinode->lock_sem);
+	return rc;
+}
+
+ssize_t
+cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *)
+						iocb->ki_filp->private_data;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	ssize_t written;
+
+	written = cifs_get_writer(cinode);
+	if (written)
+		return written;
+
+	if (CIFS_CACHE_WRITE(cinode)) {
+		if (cap_unix(tcon->ses) &&
+		(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
+		  && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+			written = generic_file_write_iter(iocb, from);
+			goto out;
+		}
+		written = cifs_writev(iocb, from);
+		goto out;
+	}
+	/*
+	 * For non-oplocked files in strict cache mode we need to write the data
+	 * to the server exactly from the pos to pos+len-1 rather than flush all
+	 * affected pages because it may cause a error with mandatory locks on
+	 * these pages but not on the region from pos to ppos+len-1.
+	 */
+	written = cifs_user_writev(iocb, from);
+	if (written > 0 && CIFS_CACHE_READ(cinode)) {
+		/*
+		 * Windows 7 server can delay breaking level2 oplock if a write
+		 * request comes - break it on the client to prevent reading
+		 * an old data.
+		 */
+		cifs_zap_mapping(inode);
+		cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+			 inode);
+		cinode->oplock = 0;
+	}
+out:
+	cifs_put_writer(cinode);
+	return written;
+}
+
+static struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+	struct cifs_readdata *rdata;
+
+	rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages),
+			GFP_KERNEL);
+	if (rdata != NULL) {
+		kref_init(&rdata->refcount);
+		INIT_LIST_HEAD(&rdata->list);
+		init_completion(&rdata->done);
+		INIT_WORK(&rdata->work, complete);
+	}
+
+	return rdata;
+}
+
+void
+cifs_readdata_release(struct kref *refcount)
+{
+	struct cifs_readdata *rdata = container_of(refcount,
+					struct cifs_readdata, refcount);
+
+	if (rdata->cfile)
+		cifsFileInfo_put(rdata->cfile);
+
+	kfree(rdata);
+}
+
+static int
+cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages)
+{
+	int rc = 0;
+	struct page *page;
+	unsigned int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+		if (!page) {
+			rc = -ENOMEM;
+			break;
+		}
+		rdata->pages[i] = page;
+	}
+
+	if (rc) {
+		for (i = 0; i < nr_pages; i++) {
+			put_page(rdata->pages[i]);
+			rdata->pages[i] = NULL;
+		}
+	}
+	return rc;
+}
+
+static void
+cifs_uncached_readdata_release(struct kref *refcount)
+{
+	struct cifs_readdata *rdata = container_of(refcount,
+					struct cifs_readdata, refcount);
+	unsigned int i;
+
+	for (i = 0; i < rdata->nr_pages; i++) {
+		put_page(rdata->pages[i]);
+		rdata->pages[i] = NULL;
+	}
+	cifs_readdata_release(refcount);
+}
+
+/**
+ * cifs_readdata_to_iov - copy data from pages in response to an iovec
+ * @rdata:	the readdata response with list of pages holding data
+ * @iter:	destination for our data
+ *
+ * This function copies data from a list of pages in a readdata response into
+ * an array of iovecs. It will first calculate where the data should go
+ * based on the info in the readdata and then copy the data into that spot.
+ */
+static int
+cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
+{
+	size_t remaining = rdata->got_bytes;
+	unsigned int i;
+
+	for (i = 0; i < rdata->nr_pages; i++) {
+		struct page *page = rdata->pages[i];
+		size_t copy = min_t(size_t, remaining, PAGE_SIZE);
+		size_t written = copy_page_to_iter(page, 0, copy, iter);
+		remaining -= written;
+		if (written < copy && iov_iter_count(iter) > 0)
+			break;
+	}
+	return remaining ? -EFAULT : 0;
+}
+
+static void
+cifs_uncached_readv_complete(struct work_struct *work)
+{
+	struct cifs_readdata *rdata = container_of(work,
+						struct cifs_readdata, work);
+
+	complete(&rdata->done);
+	kref_put(&rdata->refcount, cifs_uncached_readdata_release);
+}
+
+static int
+cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
+			struct cifs_readdata *rdata, unsigned int len)
+{
+	int result = 0;
+	unsigned int i;
+	unsigned int nr_pages = rdata->nr_pages;
+	struct kvec iov;
+
+	rdata->got_bytes = 0;
+	rdata->tailsz = PAGE_SIZE;
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = rdata->pages[i];
+
+		if (len >= PAGE_SIZE) {
+			/* enough data to fill the page */
+			iov.iov_base = kmap(page);
+			iov.iov_len = PAGE_SIZE;
+			cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
+				 i, iov.iov_base, iov.iov_len);
+			len -= PAGE_SIZE;
+		} else if (len > 0) {
+			/* enough for partial page, fill and zero the rest */
+			iov.iov_base = kmap(page);
+			iov.iov_len = len;
+			cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
+				 i, iov.iov_base, iov.iov_len);
+			memset(iov.iov_base + len, '\0', PAGE_SIZE - len);
+			rdata->tailsz = len;
+			len = 0;
+		} else {
+			/* no need to hold page hostage */
+			rdata->pages[i] = NULL;
+			rdata->nr_pages--;
+			put_page(page);
+			continue;
+		}
+
+		result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len);
+		kunmap(page);
+		if (result < 0)
+			break;
+
+		rdata->got_bytes += result;
+	}
+
+	return rdata->got_bytes > 0 && result != -ECONNABORTED ?
+						rdata->got_bytes : result;
+}
+
+static int
+cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
+		     struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
+{
+	struct cifs_readdata *rdata;
+	unsigned int npages, rsize, credits;
+	size_t cur_len;
+	int rc;
+	pid_t pid;
+	struct TCP_Server_Info *server;
+
+	server = tlink_tcon(open_file->tlink)->ses->server;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		pid = open_file->pid;
+	else
+		pid = current->tgid;
+
+	do {
+		rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
+						   &rsize, &credits);
+		if (rc)
+			break;
+
+		cur_len = min_t(const size_t, len, rsize);
+		npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
+
+		/* allocate a readdata struct */
+		rdata = cifs_readdata_alloc(npages,
+					    cifs_uncached_readv_complete);
+		if (!rdata) {
+			add_credits_and_wake_if(server, credits, 0);
+			rc = -ENOMEM;
+			break;
+		}
+
+		rc = cifs_read_allocate_pages(rdata, npages);
+		if (rc)
+			goto error;
+
+		rdata->cfile = cifsFileInfo_get(open_file);
+		rdata->nr_pages = npages;
+		rdata->offset = offset;
+		rdata->bytes = cur_len;
+		rdata->pid = pid;
+		rdata->pagesz = PAGE_SIZE;
+		rdata->read_into_pages = cifs_uncached_read_into_pages;
+		rdata->credits = credits;
+
+		if (!rdata->cfile->invalidHandle ||
+		    !cifs_reopen_file(rdata->cfile, true))
+			rc = server->ops->async_readv(rdata);
+error:
+		if (rc) {
+			add_credits_and_wake_if(server, rdata->credits, 0);
+			kref_put(&rdata->refcount,
+				 cifs_uncached_readdata_release);
+			if (rc == -EAGAIN)
+				continue;
+			break;
+		}
+
+		list_add_tail(&rdata->list, rdata_list);
+		offset += cur_len;
+		len -= cur_len;
+	} while (len > 0);
+
+	return rc;
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t rc;
+	size_t len;
+	ssize_t total_read = 0;
+	loff_t offset = iocb->ki_pos;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_tcon *tcon;
+	struct cifsFileInfo *open_file;
+	struct cifs_readdata *rdata, *tmp;
+	struct list_head rdata_list;
+
+	len = iov_iter_count(to);
+	if (!len)
+		return 0;
+
+	INIT_LIST_HEAD(&rdata_list);
+	cifs_sb = CIFS_FILE_SB(file);
+	open_file = file->private_data;
+	tcon = tlink_tcon(open_file->tlink);
+
+	if (!tcon->ses->server->ops->async_readv)
+		return -ENOSYS;
+
+	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+		cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+	rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
+
+	/* if at least one read request send succeeded, then reset rc */
+	if (!list_empty(&rdata_list))
+		rc = 0;
+
+	len = iov_iter_count(to);
+	/* the loop below should proceed in the order of increasing offsets */
+again:
+	list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+		if (!rc) {
+			/* FIXME: freezable sleep too? */
+			rc = wait_for_completion_killable(&rdata->done);
+			if (rc)
+				rc = -EINTR;
+			else if (rdata->result == -EAGAIN) {
+				/* resend call if it's a retryable error */
+				struct list_head tmp_list;
+				unsigned int got_bytes = rdata->got_bytes;
+
+				list_del_init(&rdata->list);
+				INIT_LIST_HEAD(&tmp_list);
+
+				/*
+				 * Got a part of data and then reconnect has
+				 * happened -- fill the buffer and continue
+				 * reading.
+				 */
+				if (got_bytes && got_bytes < rdata->bytes) {
+					rc = cifs_readdata_to_iov(rdata, to);
+					if (rc) {
+						kref_put(&rdata->refcount,
+						cifs_uncached_readdata_release);
+						continue;
+					}
+				}
+
+				rc = cifs_send_async_read(
+						rdata->offset + got_bytes,
+						rdata->bytes - got_bytes,
+						rdata->cfile, cifs_sb,
+						&tmp_list);
+
+				list_splice(&tmp_list, &rdata_list);
+
+				kref_put(&rdata->refcount,
+					 cifs_uncached_readdata_release);
+				goto again;
+			} else if (rdata->result)
+				rc = rdata->result;
+			else
+				rc = cifs_readdata_to_iov(rdata, to);
+
+			/* if there was a short read -- discard anything left */
+			if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
+				rc = -ENODATA;
+		}
+		list_del_init(&rdata->list);
+		kref_put(&rdata->refcount, cifs_uncached_readdata_release);
+	}
+
+	total_read = len - iov_iter_count(to);
+
+	cifs_stats_bytes_read(tcon, total_read);
+
+	/* mask nodata case */
+	if (rc == -ENODATA)
+		rc = 0;
+
+	if (total_read) {
+		iocb->ki_pos += total_read;
+		return total_read;
+	}
+	return rc;
+}
+
+ssize_t
+cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *)
+						iocb->ki_filp->private_data;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	int rc = -EACCES;
+
+	/*
+	 * In strict cache mode we need to read from the server all the time
+	 * if we don't have level II oplock because the server can delay mtime
+	 * change - so we can't make a decision about inode invalidating.
+	 * And we can also fail with pagereading if there are mandatory locks
+	 * on pages affected by this read but not on the region from pos to
+	 * pos+len-1.
+	 */
+	if (!CIFS_CACHE_READ(cinode))
+		return cifs_user_readv(iocb, to);
+
+	if (cap_unix(tcon->ses) &&
+	    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+	    ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+		return generic_file_read_iter(iocb, to);
+
+	/*
+	 * We need to hold the sem to be sure nobody modifies lock list
+	 * with a brlock that prevents reading.
+	 */
+	down_read(&cinode->lock_sem);
+	if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
+				     tcon->ses->server->vals->shared_lock_type,
+				     NULL, CIFS_READ_OP))
+		rc = generic_file_read_iter(iocb, to);
+	up_read(&cinode->lock_sem);
+	return rc;
+}
+
+static ssize_t
+cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
+{
+	int rc = -EACCES;
+	unsigned int bytes_read = 0;
+	unsigned int total_read;
+	unsigned int current_read_size;
+	unsigned int rsize;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	unsigned int xid;
+	char *cur_offset;
+	struct cifsFileInfo *open_file;
+	struct cifs_io_parms io_parms;
+	int buf_type = CIFS_NO_BUFFER;
+	__u32 pid;
+
+	xid = get_xid();
+	cifs_sb = CIFS_FILE_SB(file);
+
+	/* FIXME: set up handlers for larger reads and/or convert to async */
+	rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
+
+	if (file->private_data == NULL) {
+		rc = -EBADF;
+		free_xid(xid);
+		return rc;
+	}
+	open_file = file->private_data;
+	tcon = tlink_tcon(open_file->tlink);
+	server = tcon->ses->server;
+
+	if (!server->ops->sync_read) {
+		free_xid(xid);
+		return -ENOSYS;
+	}
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		pid = open_file->pid;
+	else
+		pid = current->tgid;
+
+	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+		cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+	for (total_read = 0, cur_offset = read_data; read_size > total_read;
+	     total_read += bytes_read, cur_offset += bytes_read) {
+		do {
+			current_read_size = min_t(uint, read_size - total_read,
+						  rsize);
+			/*
+			 * For windows me and 9x we do not want to request more
+			 * than it negotiated since it will refuse the read
+			 * then.
+			 */
+			if ((tcon->ses) && !(tcon->ses->capabilities &
+				tcon->ses->server->vals->cap_large_files)) {
+				current_read_size = min_t(uint,
+					current_read_size, CIFSMaxBufSize);
+			}
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
+				if (rc != 0)
+					break;
+			}
+			io_parms.pid = pid;
+			io_parms.tcon = tcon;
+			io_parms.offset = *offset;
+			io_parms.length = current_read_size;
+			rc = server->ops->sync_read(xid, &open_file->fid, &io_parms,
+						    &bytes_read, &cur_offset,
+						    &buf_type);
+		} while (rc == -EAGAIN);
+
+		if (rc || (bytes_read == 0)) {
+			if (total_read) {
+				break;
+			} else {
+				free_xid(xid);
+				return rc;
+			}
+		} else {
+			cifs_stats_bytes_read(tcon, total_read);
+			*offset += bytes_read;
+		}
+	}
+	free_xid(xid);
+	return total_read;
+}
+
+/*
+ * If the page is mmap'ed into a process' page tables, then we need to make
+ * sure that it doesn't change while being written back.
+ */
+static int
+cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+
+	lock_page(page);
+	return VM_FAULT_LOCKED;
+}
+
+static struct vm_operations_struct cifs_file_vm_ops = {
+	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
+	.page_mkwrite = cifs_page_mkwrite,
+};
+
+int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int rc, xid;
+	struct inode *inode = file_inode(file);
+
+	xid = get_xid();
+
+	if (!CIFS_CACHE_READ(CIFS_I(inode))) {
+		rc = cifs_zap_mapping(inode);
+		if (rc)
+			return rc;
+	}
+
+	rc = generic_file_mmap(file, vma);
+	if (rc == 0)
+		vma->vm_ops = &cifs_file_vm_ops;
+	free_xid(xid);
+	return rc;
+}
+
+int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int rc, xid;
+
+	xid = get_xid();
+	rc = cifs_revalidate_file(file);
+	if (rc) {
+		cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n",
+			 rc);
+		free_xid(xid);
+		return rc;
+	}
+	rc = generic_file_mmap(file, vma);
+	if (rc == 0)
+		vma->vm_ops = &cifs_file_vm_ops;
+	free_xid(xid);
+	return rc;
+}
+
+static void
+cifs_readv_complete(struct work_struct *work)
+{
+	unsigned int i, got_bytes;
+	struct cifs_readdata *rdata = container_of(work,
+						struct cifs_readdata, work);
+
+	got_bytes = rdata->got_bytes;
+	for (i = 0; i < rdata->nr_pages; i++) {
+		struct page *page = rdata->pages[i];
+
+		lru_cache_add_file(page);
+
+		if (rdata->result == 0 ||
+		    (rdata->result == -EAGAIN && got_bytes)) {
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
+
+		unlock_page(page);
+
+		if (rdata->result == 0 ||
+		    (rdata->result == -EAGAIN && got_bytes))
+			cifs_readpage_to_fscache(rdata->mapping->host, page);
+
+		got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
+
+		page_cache_release(page);
+		rdata->pages[i] = NULL;
+	}
+	kref_put(&rdata->refcount, cifs_readdata_release);
+}
+
+static int
+cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
+			struct cifs_readdata *rdata, unsigned int len)
+{
+	int result = 0;
+	unsigned int i;
+	u64 eof;
+	pgoff_t eof_index;
+	unsigned int nr_pages = rdata->nr_pages;
+	struct kvec iov;
+
+	/* determine the eof that the server (probably) has */
+	eof = CIFS_I(rdata->mapping->host)->server_eof;
+	eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+	cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
+
+	rdata->got_bytes = 0;
+	rdata->tailsz = PAGE_CACHE_SIZE;
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = rdata->pages[i];
+
+		if (len >= PAGE_CACHE_SIZE) {
+			/* enough data to fill the page */
+			iov.iov_base = kmap(page);
+			iov.iov_len = PAGE_CACHE_SIZE;
+			cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
+				 i, page->index, iov.iov_base, iov.iov_len);
+			len -= PAGE_CACHE_SIZE;
+		} else if (len > 0) {
+			/* enough for partial page, fill and zero the rest */
+			iov.iov_base = kmap(page);
+			iov.iov_len = len;
+			cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
+				 i, page->index, iov.iov_base, iov.iov_len);
+			memset(iov.iov_base + len,
+				'\0', PAGE_CACHE_SIZE - len);
+			rdata->tailsz = len;
+			len = 0;
+		} else if (page->index > eof_index) {
+			/*
+			 * The VFS will not try to do readahead past the
+			 * i_size, but it's possible that we have outstanding
+			 * writes with gaps in the middle and the i_size hasn't
+			 * caught up yet. Populate those with zeroed out pages
+			 * to prevent the VFS from repeatedly attempting to
+			 * fill them until the writes are flushed.
+			 */
+			zero_user(page, 0, PAGE_CACHE_SIZE);
+			lru_cache_add_file(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+			unlock_page(page);
+			page_cache_release(page);
+			rdata->pages[i] = NULL;
+			rdata->nr_pages--;
+			continue;
+		} else {
+			/* no need to hold page hostage */
+			lru_cache_add_file(page);
+			unlock_page(page);
+			page_cache_release(page);
+			rdata->pages[i] = NULL;
+			rdata->nr_pages--;
+			continue;
+		}
+
+		result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len);
+		kunmap(page);
+		if (result < 0)
+			break;
+
+		rdata->got_bytes += result;
+	}
+
+	return rdata->got_bytes > 0 && result != -ECONNABORTED ?
+						rdata->got_bytes : result;
+}
+
+static int
+readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
+		    unsigned int rsize, struct list_head *tmplist,
+		    unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
+{
+	struct page *page, *tpage;
+	unsigned int expected_index;
+	int rc;
+
+	INIT_LIST_HEAD(tmplist);
+
+	page = list_entry(page_list->prev, struct page, lru);
+
+	/*
+	 * Lock the page and put it in the cache. Since no one else
+	 * should have access to this page, we're safe to simply set
+	 * PG_locked without checking it first.
+	 */
+	__set_page_locked(page);
+	rc = add_to_page_cache_locked(page, mapping,
+				      page->index, GFP_KERNEL);
+
+	/* give up if we can't stick it in the cache */
+	if (rc) {
+		__clear_page_locked(page);
+		return rc;
+	}
+
+	/* move first page to the tmplist */
+	*offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	*bytes = PAGE_CACHE_SIZE;
+	*nr_pages = 1;
+	list_move_tail(&page->lru, tmplist);
+
+	/* now try and add more pages onto the request */
+	expected_index = page->index + 1;
+	list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
+		/* discontinuity ? */
+		if (page->index != expected_index)
+			break;
+
+		/* would this page push the read over the rsize? */
+		if (*bytes + PAGE_CACHE_SIZE > rsize)
+			break;
+
+		__set_page_locked(page);
+		if (add_to_page_cache_locked(page, mapping, page->index,
+								GFP_KERNEL)) {
+			__clear_page_locked(page);
+			break;
+		}
+		list_move_tail(&page->lru, tmplist);
+		(*bytes) += PAGE_CACHE_SIZE;
+		expected_index++;
+		(*nr_pages)++;
+	}
+	return rc;
+}
+
+static int cifs_readpages(struct file *file, struct address_space *mapping,
+	struct list_head *page_list, unsigned num_pages)
+{
+	int rc;
+	struct list_head tmplist;
+	struct cifsFileInfo *open_file = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
+	struct TCP_Server_Info *server;
+	pid_t pid;
+
+	/*
+	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
+	 * immediately if the cookie is negative
+	 *
+	 * After this point, every page in the list might have PG_fscache set,
+	 * so we will need to clean that up off of every page we don't use.
+	 */
+	rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
+					 &num_pages);
+	if (rc == 0)
+		return rc;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		pid = open_file->pid;
+	else
+		pid = current->tgid;
+
+	rc = 0;
+	server = tlink_tcon(open_file->tlink)->ses->server;
+
+	cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
+		 __func__, file, mapping, num_pages);
+
+	/*
+	 * Start with the page at end of list and move it to private
+	 * list. Do the same with any following pages until we hit
+	 * the rsize limit, hit an index discontinuity, or run out of
+	 * pages. Issue the async read and then start the loop again
+	 * until the list is empty.
+	 *
+	 * Note that list order is important. The page_list is in
+	 * the order of declining indexes. When we put the pages in
+	 * the rdata->pages, then we want them in increasing order.
+	 */
+	while (!list_empty(page_list)) {
+		unsigned int i, nr_pages, bytes, rsize;
+		loff_t offset;
+		struct page *page, *tpage;
+		struct cifs_readdata *rdata;
+		unsigned credits;
+
+		rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
+						   &rsize, &credits);
+		if (rc)
+			break;
+
+		/*
+		 * Give up immediately if rsize is too small to read an entire
+		 * page. The VFS will fall back to readpage. We should never
+		 * reach this point however since we set ra_pages to 0 when the
+		 * rsize is smaller than a cache page.
+		 */
+		if (unlikely(rsize < PAGE_CACHE_SIZE)) {
+			add_credits_and_wake_if(server, credits, 0);
+			return 0;
+		}
+
+		rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
+					 &nr_pages, &offset, &bytes);
+		if (rc) {
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
+		if (!rdata) {
+			/* best to give up if we're out of mem */
+			list_for_each_entry_safe(page, tpage, &tmplist, lru) {
+				list_del(&page->lru);
+				lru_cache_add_file(page);
+				unlock_page(page);
+				page_cache_release(page);
+			}
+			rc = -ENOMEM;
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		rdata->cfile = cifsFileInfo_get(open_file);
+		rdata->mapping = mapping;
+		rdata->offset = offset;
+		rdata->bytes = bytes;
+		rdata->pid = pid;
+		rdata->pagesz = PAGE_CACHE_SIZE;
+		rdata->read_into_pages = cifs_readpages_read_into_pages;
+		rdata->credits = credits;
+
+		list_for_each_entry_safe(page, tpage, &tmplist, lru) {
+			list_del(&page->lru);
+			rdata->pages[rdata->nr_pages++] = page;
+		}
+
+		if (!rdata->cfile->invalidHandle ||
+		    !cifs_reopen_file(rdata->cfile, true))
+			rc = server->ops->async_readv(rdata);
+		if (rc) {
+			add_credits_and_wake_if(server, rdata->credits, 0);
+			for (i = 0; i < rdata->nr_pages; i++) {
+				page = rdata->pages[i];
+				lru_cache_add_file(page);
+				unlock_page(page);
+				page_cache_release(page);
+			}
+			/* Fallback to the readpage in error/reconnect cases */
+			kref_put(&rdata->refcount, cifs_readdata_release);
+			break;
+		}
+
+		kref_put(&rdata->refcount, cifs_readdata_release);
+	}
+
+	/* Any pages that have been shown to fscache but didn't get added to
+	 * the pagecache must be uncached before they get returned to the
+	 * allocator.
+	 */
+	cifs_fscache_readpages_cancel(mapping->host, page_list);
+	return rc;
+}
+
+/*
+ * cifs_readpage_worker must be called with the page pinned
+ */
+static int cifs_readpage_worker(struct file *file, struct page *page,
+	loff_t *poffset)
+{
+	char *read_data;
+	int rc;
+
+	/* Is the page cached? */
+	rc = cifs_readpage_from_fscache(file_inode(file), page);
+	if (rc == 0)
+		goto read_complete;
+
+	read_data = kmap(page);
+	/* for reads over a certain size could initiate async read ahead */
+
+	rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset);
+
+	if (rc < 0)
+		goto io_error;
+	else
+		cifs_dbg(FYI, "Bytes read %d\n", rc);
+
+	file_inode(file)->i_atime =
+		current_fs_time(file_inode(file)->i_sb);
+
+	if (PAGE_CACHE_SIZE > rc)
+		memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc);
+
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+
+	/* send this page to the cache */
+	cifs_readpage_to_fscache(file_inode(file), page);
+
+	rc = 0;
+
+io_error:
+	kunmap(page);
+	unlock_page(page);
+
+read_complete:
+	return rc;
+}
+
+static int cifs_readpage(struct file *file, struct page *page)
+{
+	loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	int rc = -EACCES;
+	unsigned int xid;
+
+	xid = get_xid();
+
+	if (file->private_data == NULL) {
+		rc = -EBADF;
+		free_xid(xid);
+		return rc;
+	}
+
+	cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n",
+		 page, (int)offset, (int)offset);
+
+	rc = cifs_readpage_worker(file, page, &offset);
+
+	free_xid(xid);
+	return rc;
+}
+
+static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
+{
+	struct cifsFileInfo *open_file;
+
+	spin_lock(&cifs_file_list_lock);
+	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
+			spin_unlock(&cifs_file_list_lock);
+			return 1;
+		}
+	}
+	spin_unlock(&cifs_file_list_lock);
+	return 0;
+}
+
+/* We do not want to update the file size from server for inodes
+   open for write - to avoid races with writepage extending
+   the file - in the future we could consider allowing
+   refreshing the inode only on increases in the file size
+   but this is tricky to do without racing with writebehind
+   page caching in the current Linux kernel design */
+bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
+{
+	if (!cifsInode)
+		return true;
+
+	if (is_inode_writable(cifsInode)) {
+		/* This inode is open for write at least once */
+		struct cifs_sb_info *cifs_sb;
+
+		cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb);
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			/* since no page cache to corrupt on directio
+			we can change size safely */
+			return true;
+		}
+
+		if (i_size_read(&cifsInode->vfs_inode) < end_of_file)
+			return true;
+
+		return false;
+	} else
+		return true;
+}
+
+static int cifs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int oncethru = 0;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+	loff_t page_start = pos & PAGE_MASK;
+	loff_t i_size;
+	struct page *page;
+	int rc = 0;
+
+	cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
+
+start:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if (PageUptodate(page))
+		goto out;
+
+	/*
+	 * If we write a full page it will be up to date, no need to read from
+	 * the server. If the write is short, we'll end up doing a sync write
+	 * instead.
+	 */
+	if (len == PAGE_CACHE_SIZE)
+		goto out;
+
+	/*
+	 * optimize away the read when we have an oplock, and we're not
+	 * expecting to use any of the data we'd be reading in. That
+	 * is, when the page lies beyond the EOF, or straddles the EOF
+	 * and the write will cover all of the existing data.
+	 */
+	if (CIFS_CACHE_READ(CIFS_I(mapping->host))) {
+		i_size = i_size_read(mapping->host);
+		if (page_start >= i_size ||
+		    (offset == 0 && (pos + len) >= i_size)) {
+			zero_user_segments(page, 0, offset,
+					   offset + len,
+					   PAGE_CACHE_SIZE);
+			/*
+			 * PageChecked means that the parts of the page
+			 * to which we're not writing are considered up
+			 * to date. Once the data is copied to the
+			 * page, it can be set uptodate.
+			 */
+			SetPageChecked(page);
+			goto out;
+		}
+	}
+
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) {
+		/*
+		 * might as well read a page, it is fast enough. If we get
+		 * an error, we don't need to return it. cifs_write_end will
+		 * do a sync write instead since PG_uptodate isn't set.
+		 */
+		cifs_readpage_worker(file, page, &page_start);
+		page_cache_release(page);
+		oncethru = 1;
+		goto start;
+	} else {
+		/* we could try using another file handle if there is one -
+		   but how would we lock it to prevent close of that handle
+		   racing with this read? In any case
+		   this will be written out by write_end so is fine */
+	}
+out:
+	*pagep = page;
+	return rc;
+}
+
+static int cifs_release_page(struct page *page, gfp_t gfp)
+{
+	if (PagePrivate(page))
+		return 0;
+
+	return cifs_fscache_release_page(page, gfp);
+}
+
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
+
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
+		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
+}
+
+static int cifs_launder_page(struct page *page)
+{
+	int rc = 0;
+	loff_t range_start = page_offset(page);
+	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0,
+		.range_start = range_start,
+		.range_end = range_end,
+	};
+
+	cifs_dbg(FYI, "Launder page: %p\n", page);
+
+	if (clear_page_dirty_for_io(page))
+		rc = cifs_writepage_locked(page, &wbc);
+
+	cifs_fscache_invalidate_page(page, page->mapping->host);
+	return rc;
+}
+
+void cifs_oplock_break(struct work_struct *work)
+{
+	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+						  oplock_break);
+	struct inode *inode = d_inode(cfile->dentry);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	int rc = 0;
+
+	wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
+			TASK_UNINTERRUPTIBLE);
+
+	server->ops->downgrade_oplock(server, cinode,
+		test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+
+	if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
+						cifs_has_mand_locks(cinode)) {
+		cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
+			 inode);
+		cinode->oplock = 0;
+	}
+
+	if (inode && S_ISREG(inode->i_mode)) {
+		if (CIFS_CACHE_READ(cinode))
+			break_lease(inode, O_RDONLY);
+		else
+			break_lease(inode, O_WRONLY);
+		rc = filemap_fdatawrite(inode->i_mapping);
+		if (!CIFS_CACHE_READ(cinode)) {
+			rc = filemap_fdatawait(inode->i_mapping);
+			mapping_set_error(inode->i_mapping, rc);
+			cifs_zap_mapping(inode);
+		}
+		cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
+	}
+
+	rc = cifs_push_locks(cfile);
+	if (rc)
+		cifs_dbg(VFS, "Push locks rc = %d\n", rc);
+
+	/*
+	 * releasing stale oplock after recent reconnect of smb session using
+	 * a now incorrect file handle is not a data integrity issue but do
+	 * not bother sending an oplock release if session to server still is
+	 * disconnected since oplock already released by the server
+	 */
+	if (!cfile->oplock_break_cancelled) {
+		rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
+							     cinode);
+		cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
+	}
+	cifs_done_oplock_break(cinode);
+}
+
+/*
+ * The presence of cifs_direct_io() in the address space ops vector
+ * allowes open() O_DIRECT flags which would have failed otherwise.
+ *
+ * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests
+ * so this method should never be called.
+ *
+ * Direct IO is not yet supported in the cached mode. 
+ */
+static ssize_t
+cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+{
+        /*
+         * FIXME
+         * Eventually need to support direct IO for non forcedirectio mounts
+         */
+        return -EINVAL;
+}
+
+
+const struct address_space_operations cifs_addr_ops = {
+	.readpage = cifs_readpage,
+	.readpages = cifs_readpages,
+	.writepage = cifs_writepage,
+	.writepages = cifs_writepages,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	.releasepage = cifs_release_page,
+	.direct_IO = cifs_direct_io,
+	.invalidatepage = cifs_invalidate_page,
+	.launder_page = cifs_launder_page,
+};
+
+/*
+ * cifs_readpages requires the server to support a buffer large enough to
+ * contain the header plus one complete page of data.  Otherwise, we need
+ * to leave cifs_readpages out of the address space operations.
+ */
+const struct address_space_operations cifs_addr_ops_smallbuf = {
+	.readpage = cifs_readpage,
+	.writepage = cifs_writepage,
+	.writepages = cifs_writepages,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	.releasepage = cifs_release_page,
+	.invalidatepage = cifs_invalidate_page,
+	.launder_page = cifs_launder_page,
+};
diff --git a/kernel/fs/cifs/fscache.c b/kernel/fs/cifs/fscache.c
new file mode 100644
index 000000000..8d4b7bc8a
--- /dev/null
+++ b/kernel/fs/cifs/fscache.c
@@ -0,0 +1,242 @@
+/*
+ *   fs/cifs/fscache.c - CIFS filesystem cache interface
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+
+void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
+{
+	server->fscache =
+		fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
+				&cifs_fscache_server_index_def, server, true);
+	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+		 __func__, server, server->fscache);
+}
+
+void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
+{
+	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+		 __func__, server, server->fscache);
+	fscache_relinquish_cookie(server->fscache, 0);
+	server->fscache = NULL;
+}
+
+void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
+{
+	struct TCP_Server_Info *server = tcon->ses->server;
+
+	tcon->fscache =
+		fscache_acquire_cookie(server->fscache,
+				&cifs_fscache_super_index_def, tcon, true);
+	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+		 __func__, server->fscache, tcon->fscache);
+}
+
+void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
+{
+	cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache);
+	fscache_relinquish_cookie(tcon->fscache, 0);
+	tcon->fscache = NULL;
+}
+
+static void cifs_fscache_enable_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	if (cifsi->fscache)
+		return;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
+		cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+				&cifs_fscache_inode_object_def, cifsi, true);
+		cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
+			 __func__, tcon->fscache, cifsi->fscache);
+	}
+}
+
+void cifs_fscache_release_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+	if (cifsi->fscache) {
+		cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
+		fscache_relinquish_cookie(cifsi->fscache, 0);
+		cifsi->fscache = NULL;
+	}
+}
+
+static void cifs_fscache_disable_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+	if (cifsi->fscache) {
+		cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
+		fscache_uncache_all_inode_pages(cifsi->fscache, inode);
+		fscache_relinquish_cookie(cifsi->fscache, 1);
+		cifsi->fscache = NULL;
+	}
+}
+
+void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+		cifs_fscache_disable_inode_cookie(inode);
+	else
+		cifs_fscache_enable_inode_cookie(inode);
+}
+
+void cifs_fscache_reset_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct fscache_cookie *old = cifsi->fscache;
+
+	if (cifsi->fscache) {
+		/* retire the current fscache cache and get a new one */
+		fscache_relinquish_cookie(cifsi->fscache, 1);
+
+		cifsi->fscache = fscache_acquire_cookie(
+					cifs_sb_master_tcon(cifs_sb)->fscache,
+					&cifs_fscache_inode_object_def,
+					cifsi, true);
+		cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
+			 __func__, cifsi->fscache, old);
+	}
+}
+
+int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	if (PageFsCache(page)) {
+		struct inode *inode = page->mapping->host;
+		struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+		cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+			 __func__, page, cifsi->fscache);
+		if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
+			return 0;
+	}
+
+	return 1;
+}
+
+static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
+						int error)
+{
+	cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error);
+	if (!error)
+		SetPageUptodate(page);
+	unlock_page(page);
+}
+
+/*
+ * Retrieve a page from FS-Cache
+ */
+int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+
+	cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
+		 __func__, CIFS_I(inode)->fscache, page, inode);
+	ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
+					 cifs_readpage_from_fscache_complete,
+					 NULL,
+					 GFP_KERNEL);
+	switch (ret) {
+
+	case 0: /* page found in fscache, read submitted */
+		cifs_dbg(FYI, "%s: submitted\n", __func__);
+		return ret;
+	case -ENOBUFS:	/* page won't be cached */
+	case -ENODATA:	/* page not in cache */
+		cifs_dbg(FYI, "%s: %d\n", __func__, ret);
+		return 1;
+
+	default:
+		cifs_dbg(VFS, "unknown error ret = %d\n", ret);
+	}
+	return ret;
+}
+
+/*
+ * Retrieve a set of pages from FS-Cache
+ */
+int __cifs_readpages_from_fscache(struct inode *inode,
+				struct address_space *mapping,
+				struct list_head *pages,
+				unsigned *nr_pages)
+{
+	int ret;
+
+	cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n",
+		 __func__, CIFS_I(inode)->fscache, *nr_pages, inode);
+	ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
+					  pages, nr_pages,
+					  cifs_readpage_from_fscache_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+	switch (ret) {
+	case 0:	/* read submitted to the cache for all pages */
+		cifs_dbg(FYI, "%s: submitted\n", __func__);
+		return ret;
+
+	case -ENOBUFS:	/* some pages are not cached and can't be */
+	case -ENODATA:	/* some pages are not cached */
+		cifs_dbg(FYI, "%s: no page\n", __func__);
+		return 1;
+
+	default:
+		cifs_dbg(FYI, "unknown error ret = %d\n", ret);
+	}
+
+	return ret;
+}
+
+void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+
+	cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
+		 __func__, CIFS_I(inode)->fscache, page, inode);
+	ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
+	if (ret != 0)
+		fscache_uncache_page(CIFS_I(inode)->fscache, page);
+}
+
+void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages)
+{
+	cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n",
+		 __func__, CIFS_I(inode)->fscache, inode);
+	fscache_readpages_cancel(CIFS_I(inode)->fscache, pages);
+}
+
+void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct fscache_cookie *cookie = cifsi->fscache;
+
+	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
+	fscache_wait_on_page_write(cookie, page);
+	fscache_uncache_page(cookie, page);
+}
+
diff --git a/kernel/fs/cifs/fscache.h b/kernel/fs/cifs/fscache.h
new file mode 100644
index 000000000..24794b6cd
--- /dev/null
+++ b/kernel/fs/cifs/fscache.h
@@ -0,0 +1,149 @@
+/*
+ *   fs/cifs/fscache.h - CIFS filesystem cache interface definitions
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _CIFS_FSCACHE_H
+#define _CIFS_FSCACHE_H
+
+#include <linux/fscache.h>
+
+#include "cifsglob.h"
+
+#ifdef CONFIG_CIFS_FSCACHE
+
+extern struct fscache_netfs cifs_fscache_netfs;
+extern const struct fscache_cookie_def cifs_fscache_server_index_def;
+extern const struct fscache_cookie_def cifs_fscache_super_index_def;
+extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
+
+extern int cifs_fscache_register(void);
+extern void cifs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_get_super_cookie(struct cifs_tcon *);
+extern void cifs_fscache_release_super_cookie(struct cifs_tcon *);
+
+extern void cifs_fscache_release_inode_cookie(struct inode *);
+extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void cifs_fscache_reset_inode_cookie(struct inode *);
+
+extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
+extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
+extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
+extern int __cifs_readpages_from_fscache(struct inode *,
+					 struct address_space *,
+					 struct list_head *,
+					 unsigned *);
+extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *);
+
+extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
+
+static inline void cifs_fscache_invalidate_page(struct page *page,
+					       struct inode *inode)
+{
+	if (PageFsCache(page))
+		__cifs_fscache_invalidate_page(page, inode);
+}
+
+static inline int cifs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_readpage_from_fscache(inode, page);
+
+	return -ENOBUFS;
+}
+
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_readpages_from_fscache(inode, mapping, pages,
+						     nr_pages);
+	return -ENOBUFS;
+}
+
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{
+	if (PageFsCache(page))
+		__cifs_readpage_to_fscache(inode, page);
+}
+
+static inline void cifs_fscache_readpages_cancel(struct inode *inode,
+						 struct list_head *pages)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_fscache_readpages_cancel(inode, pages);
+}
+
+#else /* CONFIG_CIFS_FSCACHE */
+static inline int cifs_fscache_register(void) { return 0; }
+static inline void cifs_fscache_unregister(void) {}
+
+static inline void
+cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
+static inline void
+cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
+static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {}
+static inline void
+cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {}
+
+static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
+						 struct file *filp) {}
+static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	return 1; /* May release page */
+}
+
+static inline void cifs_fscache_invalidate_page(struct page *page,
+			struct inode *inode) {}
+static inline int
+cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	return -ENOBUFS;
+}
+
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+			struct page *page) {}
+
+static inline void cifs_fscache_readpages_cancel(struct inode *inode,
+						 struct list_head *pages)
+{
+}
+
+#endif /* CONFIG_CIFS_FSCACHE */
+
+#endif /* _CIFS_FSCACHE_H */
diff --git a/kernel/fs/cifs/inode.c b/kernel/fs/cifs/inode.c
new file mode 100644
index 000000000..f621b44cb
--- /dev/null
+++ b/kernel/fs/cifs/inode.c
@@ -0,0 +1,2454 @@
+/*
+ *   fs/cifs/inode.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/freezer.h>
+#include <asm/div64.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#include "fscache.h"
+
+
+static void cifs_set_ops(struct inode *inode)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &cifs_file_inode_ops;
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				inode->i_fop = &cifs_file_direct_nobrl_ops;
+			else
+				inode->i_fop = &cifs_file_direct_ops;
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				inode->i_fop = &cifs_file_strict_nobrl_ops;
+			else
+				inode->i_fop = &cifs_file_strict_ops;
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			inode->i_fop = &cifs_file_nobrl_ops;
+		else { /* not direct, send byte range locks */
+			inode->i_fop = &cifs_file_ops;
+		}
+
+		/* check if server can support readpages */
+		if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
+				PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
+			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+		else
+			inode->i_data.a_ops = &cifs_addr_ops;
+		break;
+	case S_IFDIR:
+#ifdef CONFIG_CIFS_DFS_UPCALL
+		if (IS_AUTOMOUNT(inode)) {
+			inode->i_op = &cifs_dfs_referral_inode_operations;
+		} else {
+#else /* NO DFS support, treat as a directory */
+		{
+#endif
+			inode->i_op = &cifs_dir_inode_ops;
+			inode->i_fop = &cifs_dir_ops;
+		}
+		break;
+	case S_IFLNK:
+		inode->i_op = &cifs_symlink_inode_ops;
+		break;
+	default:
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	}
+}
+
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cifs_dbg(FYI, "%s: revalidating inode %llu\n",
+		 __func__, cifs_i->uniqueid);
+
+	if (inode->i_state & I_NEW) {
+		cifs_dbg(FYI, "%s: inode %llu is new\n",
+			 __func__, cifs_i->uniqueid);
+		return;
+	}
+
+	/* don't bother with revalidation if we have an oplock */
+	if (CIFS_CACHE_READ(cifs_i)) {
+		cifs_dbg(FYI, "%s: inode %llu is oplocked\n",
+			 __func__, cifs_i->uniqueid);
+		return;
+	}
+
+	 /* revalidate if mtime or size have changed */
+	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	    cifs_i->server_eof == fattr->cf_eof) {
+		cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
+			 __func__, cifs_i->uniqueid);
+		return;
+	}
+
+	cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n",
+		 __func__, cifs_i->uniqueid);
+	set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);
+}
+
+/*
+ * copy nlink to the inode, unless it wasn't provided.  Provide
+ * sane values if we don't have an existing one and none was provided
+ */
+static void
+cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
+{
+	/*
+	 * if we're in a situation where we can't trust what we
+	 * got from the server (readdir, some non-unix cases)
+	 * fake reasonable values
+	 */
+	if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) {
+		/* only provide fake values on a new inode */
+		if (inode->i_state & I_NEW) {
+			if (fattr->cf_cifsattrs & ATTR_DIRECTORY)
+				set_nlink(inode, 2);
+			else
+				set_nlink(inode, 1);
+		}
+		return;
+	}
+
+	/* we trust the server, so update it */
+	set_nlink(inode, fattr->cf_nlink);
+}
+
+/* populate an inode with info from a cifs_fattr struct */
+void
+cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+	cifs_revalidate_cache(inode, fattr);
+
+	spin_lock(&inode->i_lock);
+	inode->i_atime = fattr->cf_atime;
+	inode->i_mtime = fattr->cf_mtime;
+	inode->i_ctime = fattr->cf_ctime;
+	inode->i_rdev = fattr->cf_rdev;
+	cifs_nlink_fattr_to_inode(inode, fattr);
+	inode->i_uid = fattr->cf_uid;
+	inode->i_gid = fattr->cf_gid;
+
+	/* if dynperm is set, don't clobber existing mode */
+	if (inode->i_state & I_NEW ||
+	    !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM))
+		inode->i_mode = fattr->cf_mode;
+
+	cifs_i->cifsAttrs = fattr->cf_cifsattrs;
+
+	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+		cifs_i->time = 0;
+	else
+		cifs_i->time = jiffies;
+
+	if (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING)
+		set_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
+	else
+		clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
+
+	cifs_i->server_eof = fattr->cf_eof;
+	/*
+	 * Can't safely change the file size here if the client is writing to
+	 * it due to potential races.
+	 */
+	if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
+		i_size_write(inode, fattr->cf_eof);
+
+		/*
+		 * i_blocks is not related to (i_size / i_blksize),
+		 * but instead 512 byte (2**9) size is required for
+		 * calculating num blocks.
+		 */
+		inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+		inode->i_flags |= S_AUTOMOUNT;
+	if (inode->i_state & I_NEW)
+		cifs_set_ops(inode);
+}
+
+void
+cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+		return;
+
+	fattr->cf_uniqueid = iunique(sb, ROOT_I);
+}
+
+/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
+void
+cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
+			 struct cifs_sb_info *cifs_sb)
+{
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_uniqueid = le64_to_cpu(info->UniqueId);
+	fattr->cf_bytes = le64_to_cpu(info->NumOfBytes);
+	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+
+	fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime);
+	fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
+	fattr->cf_mode = le64_to_cpu(info->Permissions);
+
+	/*
+	 * Since we set the inode type below we need to mask off
+	 * to avoid strange results if bits set above.
+	 */
+	fattr->cf_mode &= ~S_IFMT;
+	switch (le32_to_cpu(info->Type)) {
+	case UNIX_FILE:
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
+		break;
+	case UNIX_SYMLINK:
+		fattr->cf_mode |= S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
+		break;
+	case UNIX_DIR:
+		fattr->cf_mode |= S_IFDIR;
+		fattr->cf_dtype = DT_DIR;
+		break;
+	case UNIX_CHARDEV:
+		fattr->cf_mode |= S_IFCHR;
+		fattr->cf_dtype = DT_CHR;
+		fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+				       le64_to_cpu(info->DevMinor) & MINORMASK);
+		break;
+	case UNIX_BLOCKDEV:
+		fattr->cf_mode |= S_IFBLK;
+		fattr->cf_dtype = DT_BLK;
+		fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+				       le64_to_cpu(info->DevMinor) & MINORMASK);
+		break;
+	case UNIX_FIFO:
+		fattr->cf_mode |= S_IFIFO;
+		fattr->cf_dtype = DT_FIFO;
+		break;
+	case UNIX_SOCKET:
+		fattr->cf_mode |= S_IFSOCK;
+		fattr->cf_dtype = DT_SOCK;
+		break;
+	default:
+		/* safest to call it a file if we do not know */
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
+		cifs_dbg(FYI, "unknown type %d\n", le32_to_cpu(info->Type));
+		break;
+	}
+
+	fattr->cf_uid = cifs_sb->mnt_uid;
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) {
+		u64 id = le64_to_cpu(info->Uid);
+		if (id < ((uid_t)-1)) {
+			kuid_t uid = make_kuid(&init_user_ns, id);
+			if (uid_valid(uid))
+				fattr->cf_uid = uid;
+		}
+	}
+	
+	fattr->cf_gid = cifs_sb->mnt_gid;
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) {
+		u64 id = le64_to_cpu(info->Gid);
+		if (id < ((gid_t)-1)) {
+			kgid_t gid = make_kgid(&init_user_ns, id);
+			if (gid_valid(gid))
+				fattr->cf_gid = gid;
+		}
+	}
+
+	fattr->cf_nlink = le64_to_cpu(info->Nlinks);
+}
+
+/*
+ * Fill a cifs_fattr struct with fake inode info.
+ *
+ * Needed to setup cifs_fattr data for the directory which is the
+ * junction to the new submount (ie to setup the fake directory
+ * which represents a DFS referral).
+ */
+static void
+cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	cifs_dbg(FYI, "creating fake fattr for DFS referral\n");
+
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
+	fattr->cf_uid = cifs_sb->mnt_uid;
+	fattr->cf_gid = cifs_sb->mnt_gid;
+	fattr->cf_atime = CURRENT_TIME;
+	fattr->cf_ctime = CURRENT_TIME;
+	fattr->cf_mtime = CURRENT_TIME;
+	fattr->cf_nlink = 2;
+	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
+}
+
+static int
+cifs_get_file_info_unix(struct file *filp)
+{
+	int rc;
+	unsigned int xid;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = file_inode(filp);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+
+	xid = get_xid();
+	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	}
+
+	cifs_fattr_to_inode(inode, &fattr);
+	free_xid(xid);
+	return rc;
+}
+
+int cifs_get_inode_info_unix(struct inode **pinode,
+			     const unsigned char *full_path,
+			     struct super_block *sb, unsigned int xid)
+{
+	int rc;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct cifs_tcon *tcon;
+	struct tcon_link *tlink;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	cifs_dbg(FYI, "Getting info on %s\n", full_path);
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	/* could have done a find first instead but this returns more info */
+	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
+				  cifs_sb->local_nls, cifs_remap(cifs_sb));
+	cifs_put_tlink(tlink);
+
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, sb);
+		rc = 0;
+	} else {
+		return rc;
+	}
+
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
+					     full_path);
+		if (tmprc)
+			cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
+	}
+
+	if (*pinode == NULL) {
+		/* get new inode */
+		cifs_fill_uniqueid(sb, &fattr);
+		*pinode = cifs_iget(sb, &fattr);
+		if (!*pinode)
+			rc = -ENOMEM;
+	} else {
+		/* we already have inode, update it */
+
+		/* if uniqueid is different, return error */
+		if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
+		    CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) {
+			rc = -ESTALE;
+			goto cgiiu_exit;
+		}
+
+		/* if filetype is different, return error */
+		if (unlikely(((*pinode)->i_mode & S_IFMT) !=
+		    (fattr.cf_mode & S_IFMT))) {
+			rc = -ESTALE;
+			goto cgiiu_exit;
+		}
+
+		cifs_fattr_to_inode(*pinode, &fattr);
+	}
+
+cgiiu_exit:
+	return rc;
+}
+
+static int
+cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
+	      struct cifs_sb_info *cifs_sb, unsigned int xid)
+{
+	int rc;
+	__u32 oplock;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	struct cifs_io_parms io_parms;
+	char buf[24];
+	unsigned int bytes_read;
+	char *pbuf;
+	int buf_type = CIFS_NO_BUFFER;
+
+	pbuf = buf;
+
+	fattr->cf_mode &= ~S_IFMT;
+
+	if (fattr->cf_eof == 0) {
+		fattr->cf_mode |= S_IFIFO;
+		fattr->cf_dtype = DT_FIFO;
+		return 0;
+	} else if (fattr->cf_eof < 8) {
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
+		return -EINVAL;	 /* EOPNOTSUPP? */
+	}
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_READ;
+	oparms.create_options = CREATE_NOT_DIR;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	if (tcon->ses->server->oplocks)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+	rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL);
+	if (rc) {
+		cifs_dbg(FYI, "check sfu type of %s, open rc = %d\n", path, rc);
+		cifs_put_tlink(tlink);
+		return rc;
+	}
+
+	/* Read header */
+	io_parms.netfid = fid.netfid;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = 24;
+
+	rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms,
+					&bytes_read, &pbuf, &buf_type);
+	if ((rc == 0) && (bytes_read >= 8)) {
+		if (memcmp("IntxBLK", pbuf, 8) == 0) {
+			cifs_dbg(FYI, "Block device\n");
+			fattr->cf_mode |= S_IFBLK;
+			fattr->cf_dtype = DT_BLK;
+			if (bytes_read == 24) {
+				/* we have enough to decode dev num */
+				__u64 mjr; /* major */
+				__u64 mnr; /* minor */
+				mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+				mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+				fattr->cf_rdev = MKDEV(mjr, mnr);
+			}
+		} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
+			cifs_dbg(FYI, "Char device\n");
+			fattr->cf_mode |= S_IFCHR;
+			fattr->cf_dtype = DT_CHR;
+			if (bytes_read == 24) {
+				/* we have enough to decode dev num */
+				__u64 mjr; /* major */
+				__u64 mnr; /* minor */
+				mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+				mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+				fattr->cf_rdev = MKDEV(mjr, mnr);
+			}
+		} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
+			cifs_dbg(FYI, "Symlink\n");
+			fattr->cf_mode |= S_IFLNK;
+			fattr->cf_dtype = DT_LNK;
+		} else {
+			fattr->cf_mode |= S_IFREG; /* file? */
+			fattr->cf_dtype = DT_REG;
+			rc = -EOPNOTSUPP;
+		}
+	} else {
+		fattr->cf_mode |= S_IFREG; /* then it is a file */
+		fattr->cf_dtype = DT_REG;
+		rc = -EOPNOTSUPP; /* or some unknown SFU type */
+	}
+
+	tcon->ses->server->ops->close(xid, tcon, &fid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+#define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID)  /* SETFILEBITS valid bits */
+
+/*
+ * Fetch mode bits as provided by SFU.
+ *
+ * FIXME: Doesn't this clobber the type bit we got from cifs_sfu_type ?
+ */
+static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
+			 struct cifs_sb_info *cifs_sb, unsigned int xid)
+{
+#ifdef CONFIG_CIFS_XATTR
+	ssize_t rc;
+	char ea_value[4];
+	__u32 mode;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	if (tcon->ses->server->ops->query_all_EAs == NULL) {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
+
+	rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
+			"SETFILEBITS", ea_value, 4 /* size of buf */,
+			cifs_sb->local_nls,
+			cifs_remap(cifs_sb));
+	cifs_put_tlink(tlink);
+	if (rc < 0)
+		return (int)rc;
+	else if (rc > 3) {
+		mode = le32_to_cpu(*((__le32 *)ea_value));
+		fattr->cf_mode &= ~SFBITS_MASK;
+		cifs_dbg(FYI, "special bits 0%o org mode 0%o\n",
+			 mode, fattr->cf_mode);
+		fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
+		cifs_dbg(FYI, "special mode bits 0%o\n", mode);
+	}
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
+static void
+cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
+		       struct cifs_sb_info *cifs_sb, bool adjust_tz,
+		       bool symlink)
+{
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
+	if (info->DeletePending)
+		fattr->cf_flags |= CIFS_FATTR_DELETE_PENDING;
+
+	if (info->LastAccessTime)
+		fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+	else
+		fattr->cf_atime = CURRENT_TIME;
+
+	fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
+	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
+
+	if (adjust_tz) {
+		fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
+		fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
+	}
+
+	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
+
+	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+
+	if (symlink) {
+		fattr->cf_mode = S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
+	} else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
+		fattr->cf_dtype = DT_DIR;
+		/*
+		 * Server can return wrong NumberOfLinks value for directories
+		 * when Unix extensions are disabled - fake it.
+		 */
+		if (!tcon->unix_ext)
+			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+	} else {
+		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
+		fattr->cf_dtype = DT_REG;
+
+		/* clear write bits if ATTR_READONLY is set */
+		if (fattr->cf_cifsattrs & ATTR_READONLY)
+			fattr->cf_mode &= ~(S_IWUGO);
+
+		/*
+		 * Don't accept zero nlink from non-unix servers unless
+		 * delete is pending.  Instead mark it as unknown.
+		 */
+		if ((fattr->cf_nlink < 1) && !tcon->unix_ext &&
+		    !info->DeletePending) {
+			cifs_dbg(1, "bogus file nlink value %u\n",
+				fattr->cf_nlink);
+			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+		}
+	}
+
+	fattr->cf_uid = cifs_sb->mnt_uid;
+	fattr->cf_gid = cifs_sb->mnt_gid;
+}
+
+static int
+cifs_get_file_info(struct file *filp)
+{
+	int rc;
+	unsigned int xid;
+	FILE_ALL_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = file_inode(filp);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+
+	if (!server->ops->query_file_info)
+		return -ENOSYS;
+
+	xid = get_xid();
+	rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
+	switch (rc) {
+	case 0:
+		cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false,
+				       false);
+		break;
+	case -EREMOTE:
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+		break;
+	case -EOPNOTSUPP:
+	case -EINVAL:
+		/*
+		 * FIXME: legacy server -- fall back to path-based call?
+		 * for now, just skip revalidating and mark inode for
+		 * immediate reval.
+		 */
+		rc = 0;
+		CIFS_I(inode)->time = 0;
+	default:
+		goto cgfi_exit;
+	}
+
+	/*
+	 * don't bother with SFU junk here -- just mark inode as needing
+	 * revalidation.
+	 */
+	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+	cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+	free_xid(xid);
+	return rc;
+}
+
+int
+cifs_get_inode_info(struct inode **inode, const char *full_path,
+		    FILE_ALL_INFO *data, struct super_block *sb, int xid,
+		    const struct cifs_fid *fid)
+{
+	bool validinum = false;
+	__u16 srchflgs;
+	int rc = 0, tmprc = ENOSYS;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct tcon_link *tlink;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	char *buf = NULL;
+	bool adjust_tz = false;
+	struct cifs_fattr fattr;
+	struct cifs_search_info *srchinf = NULL;
+	bool symlink = false;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+	server = tcon->ses->server;
+
+	cifs_dbg(FYI, "Getting info on %s\n", full_path);
+
+	if ((data == NULL) && (*inode != NULL)) {
+		if (CIFS_CACHE_READ(CIFS_I(*inode))) {
+			cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
+			goto cgii_exit;
+		}
+	}
+
+	/* if inode info is not passed, get it from server */
+	if (data == NULL) {
+		if (!server->ops->query_path_info) {
+			rc = -ENOSYS;
+			goto cgii_exit;
+		}
+		buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+		if (buf == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
+		data = (FILE_ALL_INFO *)buf;
+		rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path,
+						  data, &adjust_tz, &symlink);
+	}
+
+	if (!rc) {
+		cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz,
+				       symlink);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, sb);
+		rc = 0;
+	} else if (rc == -EACCES && backup_cred(cifs_sb)) {
+			srchinf = kzalloc(sizeof(struct cifs_search_info),
+						GFP_KERNEL);
+			if (srchinf == NULL) {
+				rc = -ENOMEM;
+				goto cgii_exit;
+			}
+
+			srchinf->endOfSearch = false;
+			srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+
+			srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
+					CIFS_SEARCH_CLOSE_AT_END |
+					CIFS_SEARCH_BACKUP_SEARCH;
+
+			rc = CIFSFindFirst(xid, tcon, full_path,
+				cifs_sb, NULL, srchflgs, srchinf, false);
+			if (!rc) {
+				data =
+				(FILE_ALL_INFO *)srchinf->srch_entries_start;
+
+				cifs_dir_info_to_fattr(&fattr,
+				(FILE_DIRECTORY_INFO *)data, cifs_sb);
+				fattr.cf_uniqueid = le64_to_cpu(
+				((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
+				validinum = true;
+
+				cifs_buf_release(srchinf->ntwrk_buf_start);
+			}
+			kfree(srchinf);
+			if (rc)
+				goto cgii_exit;
+	} else
+		goto cgii_exit;
+
+	/*
+	 * If an inode wasn't passed in, then get the inode number
+	 *
+	 * Is an i_ino of zero legal? Can we use that to check if the server
+	 * supports returning inode numbers?  Are there other sanity checks we
+	 * can use to ensure that the server is really filling in that field?
+	 */
+	if (*inode == NULL) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+			if (validinum == false) {
+				if (server->ops->get_srv_inum)
+					tmprc = server->ops->get_srv_inum(xid,
+						tcon, cifs_sb, full_path,
+						&fattr.cf_uniqueid, data);
+				if (tmprc) {
+					cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
+						 tmprc);
+					fattr.cf_uniqueid = iunique(sb, ROOT_I);
+					cifs_autodisable_serverino(cifs_sb);
+				}
+			}
+		} else
+			fattr.cf_uniqueid = iunique(sb, ROOT_I);
+	} else
+		fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+
+	/* query for SFU type info if supported and needed */
+	if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
+	    cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+		tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc);
+	}
+
+#ifdef CONFIG_CIFS_ACL
+	/* fill in 0777 bits from ACL */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+		rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid);
+		if (rc) {
+			cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
+				 __func__, rc);
+			goto cgii_exit;
+		}
+	}
+#endif /* CONFIG_CIFS_ACL */
+
+	/* fill in remaining high mode bits e.g. SUID, VTX */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
+		cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
+
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
+					 full_path);
+		if (tmprc)
+			cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
+	}
+
+	if (!*inode) {
+		*inode = cifs_iget(sb, &fattr);
+		if (!*inode)
+			rc = -ENOMEM;
+	} else {
+		/* we already have inode, update it */
+
+		/* if filetype is different, return error */
+		if (unlikely(((*inode)->i_mode & S_IFMT) !=
+		    (fattr.cf_mode & S_IFMT))) {
+			rc = -ESTALE;
+			goto cgii_exit;
+		}
+
+		cifs_fattr_to_inode(*inode, &fattr);
+	}
+
+cgii_exit:
+	kfree(buf);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+static const struct inode_operations cifs_ipc_inode_ops = {
+	.lookup = cifs_lookup,
+};
+
+static int
+cifs_find_inode(struct inode *inode, void *opaque)
+{
+	struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+
+	/* don't match inode with different uniqueid */
+	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
+		return 0;
+
+	/* use createtime like an i_generation field */
+	if (CIFS_I(inode)->createtime != fattr->cf_createtime)
+		return 0;
+
+	/* don't match inode of different type */
+	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
+		return 0;
+
+	/* if it's not a directory or has no dentries, then flag it */
+	if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry))
+		fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
+
+	return 1;
+}
+
+static int
+cifs_init_inode(struct inode *inode, void *opaque)
+{
+	struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+
+	CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
+	CIFS_I(inode)->createtime = fattr->cf_createtime;
+	return 0;
+}
+
+/*
+ * walk dentry list for an inode and report whether it has aliases that
+ * are hashed. We use this to determine if a directory inode can actually
+ * be used.
+ */
+static bool
+inode_has_hashed_dentries(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
+		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
+			spin_unlock(&inode->i_lock);
+			return true;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	return false;
+}
+
+/* Given fattrs, get a corresponding inode */
+struct inode *
+cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
+{
+	unsigned long hash;
+	struct inode *inode;
+
+retry_iget5_locked:
+	cifs_dbg(FYI, "looking for uniqueid=%llu\n", fattr->cf_uniqueid);
+
+	/* hash down to 32-bits on 32-bit arch */
+	hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
+
+	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
+	if (inode) {
+		/* was there a potentially problematic inode collision? */
+		if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
+			fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
+
+			if (inode_has_hashed_dentries(inode)) {
+				cifs_autodisable_serverino(CIFS_SB(sb));
+				iput(inode);
+				fattr->cf_uniqueid = iunique(sb, ROOT_I);
+				goto retry_iget5_locked;
+			}
+		}
+
+		cifs_fattr_to_inode(inode, fattr);
+		if (sb->s_flags & MS_NOATIME)
+			inode->i_flags |= S_NOATIME | S_NOCMTIME;
+		if (inode->i_state & I_NEW) {
+			inode->i_ino = hash;
+#ifdef CONFIG_CIFS_FSCACHE
+			/* initialize per-inode cache cookie pointer */
+			CIFS_I(inode)->fscache = NULL;
+#endif
+			unlock_new_inode(inode);
+		}
+	}
+
+	return inode;
+}
+
+/* gets root inode */
+struct inode *cifs_root_iget(struct super_block *sb)
+{
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct inode *inode = NULL;
+	long rc;
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	xid = get_xid();
+	if (tcon->unix_ext) {
+		rc = cifs_get_inode_info_unix(&inode, "", sb, xid);
+		/* some servers mistakenly claim POSIX support */
+		if (rc != -EOPNOTSUPP)
+			goto iget_no_retry;
+		cifs_dbg(VFS, "server does not support POSIX extensions");
+		tcon->unix_ext = false;
+	}
+
+	rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
+
+iget_no_retry:
+	if (!inode) {
+		inode = ERR_PTR(rc);
+		goto out;
+	}
+
+#ifdef CONFIG_CIFS_FSCACHE
+	/* populate tcon->resource_id */
+	tcon->resource_id = CIFS_I(inode)->uniqueid;
+#endif
+
+	if (rc && tcon->ipc) {
+		cifs_dbg(FYI, "ipc connection - fake read inode\n");
+		spin_lock(&inode->i_lock);
+		inode->i_mode |= S_IFDIR;
+		set_nlink(inode, 2);
+		inode->i_op = &cifs_ipc_inode_ops;
+		inode->i_fop = &simple_dir_operations;
+		inode->i_uid = cifs_sb->mnt_uid;
+		inode->i_gid = cifs_sb->mnt_gid;
+		spin_unlock(&inode->i_lock);
+	} else if (rc) {
+		iget_failed(inode);
+		inode = ERR_PTR(rc);
+	}
+
+out:
+	/* can not call macro free_xid here since in a void func
+	 * TODO: This is no longer true
+	 */
+	_free_xid(xid);
+	return inode;
+}
+
+int
+cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
+		   char *full_path, __u32 dosattr)
+{
+	bool set_time = false;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct TCP_Server_Info *server;
+	FILE_BASIC_INFO	info_buf;
+
+	if (attrs == NULL)
+		return -EINVAL;
+
+	server = cifs_sb_master_tcon(cifs_sb)->ses->server;
+	if (!server->ops->set_file_info)
+		return -ENOSYS;
+
+	if (attrs->ia_valid & ATTR_ATIME) {
+		set_time = true;
+		info_buf.LastAccessTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
+	} else
+		info_buf.LastAccessTime = 0;
+
+	if (attrs->ia_valid & ATTR_MTIME) {
+		set_time = true;
+		info_buf.LastWriteTime =
+		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
+	} else
+		info_buf.LastWriteTime = 0;
+
+	/*
+	 * Samba throws this field away, but windows may actually use it.
+	 * Do not set ctime unless other time stamps are changed explicitly
+	 * (i.e. by utimes()) since we would then have a mix of client and
+	 * server times.
+	 */
+	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
+		cifs_dbg(FYI, "CIFS - CTIME changed\n");
+		info_buf.ChangeTime =
+		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
+	} else
+		info_buf.ChangeTime = 0;
+
+	info_buf.CreationTime = 0;	/* don't change */
+	info_buf.Attributes = cpu_to_le32(dosattr);
+
+	return server->ops->set_file_info(inode, full_path, &info_buf, xid);
+}
+
+/*
+ * Open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
+ * and rename it to a random name that hopefully won't conflict with
+ * anything else.
+ */
+int
+cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
+			   const unsigned int xid)
+{
+	int oplock = 0;
+	int rc;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	struct inode *inode = d_inode(dentry);
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	__u32 dosattr, origattr;
+	FILE_BASIC_INFO *info_buf = NULL;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	/*
+	 * We cannot rename the file if the server doesn't support
+	 * CAP_INFOLEVEL_PASSTHRU
+	 */
+	if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES;
+	oparms.create_options = CREATE_NOT_DIR;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = full_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (rc != 0)
+		goto out;
+
+	origattr = cifsInode->cifsAttrs;
+	if (origattr == 0)
+		origattr |= ATTR_NORMAL;
+
+	dosattr = origattr & ~ATTR_READONLY;
+	if (dosattr == 0)
+		dosattr |= ATTR_NORMAL;
+	dosattr |= ATTR_HIDDEN;
+
+	/* set ATTR_HIDDEN and clear ATTR_READONLY, but only if needed */
+	if (dosattr != origattr) {
+		info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
+		if (info_buf == NULL) {
+			rc = -ENOMEM;
+			goto out_close;
+		}
+		info_buf->Attributes = cpu_to_le32(dosattr);
+		rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
+					current->tgid);
+		/* although we would like to mark the file hidden
+ 		   if that fails we will still try to rename it */
+		if (!rc)
+			cifsInode->cifsAttrs = dosattr;
+		else
+			dosattr = origattr; /* since not able to change them */
+	}
+
+	/* rename the file */
+	rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL,
+				   cifs_sb->local_nls,
+				   cifs_remap(cifs_sb));
+	if (rc != 0) {
+		rc = -EBUSY;
+		goto undo_setattr;
+	}
+
+	/* try to set DELETE_ON_CLOSE */
+	if (!test_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags)) {
+		rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid,
+					       current->tgid);
+		/*
+		 * some samba versions return -ENOENT when we try to set the
+		 * file disposition here. Likely a samba bug, but work around
+		 * it for now. This means that some cifsXXX files may hang
+		 * around after they shouldn't.
+		 *
+		 * BB: remove this hack after more servers have the fix
+		 */
+		if (rc == -ENOENT)
+			rc = 0;
+		else if (rc != 0) {
+			rc = -EBUSY;
+			goto undo_rename;
+		}
+		set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags);
+	}
+
+out_close:
+	CIFSSMBClose(xid, tcon, fid.netfid);
+out:
+	kfree(info_buf);
+	cifs_put_tlink(tlink);
+	return rc;
+
+	/*
+	 * reset everything back to the original state. Don't bother
+	 * dealing with errors here since we can't do anything about
+	 * them anyway.
+	 */
+undo_rename:
+	CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+undo_setattr:
+	if (dosattr != origattr) {
+		info_buf->Attributes = cpu_to_le32(origattr);
+		if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
+					current->tgid))
+			cifsInode->cifsAttrs = origattr;
+	}
+
+	goto out_close;
+}
+
+/* copied from fs/nfs/dir.c with small changes */
+static void
+cifs_drop_nlink(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	if (inode->i_nlink > 0)
+		drop_nlink(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * If d_inode(dentry) is null (usually meaning the cached dentry
+ * is a negative dentry) then we would attempt a standard SMB delete, but
+ * if that fails we can not attempt the fall back mechanisms on EACCESS
+ * but will return the EACCESS to the caller. Note that the VFS does not call
+ * unlink on negative dentries currently.
+ */
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int rc = 0;
+	unsigned int xid;
+	char *full_path = NULL;
+	struct inode *inode = d_inode(dentry);
+	struct cifsInodeInfo *cifs_inode;
+	struct super_block *sb = dir->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct iattr *attrs = NULL;
+	__u32 dosattr = 0, origattr = 0;
+
+	cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+	server = tcon->ses->server;
+
+	xid = get_xid();
+
+	/* Unlink can be called from rename so we can not take the
+	 * sb->s_vfs_rename_mutex here */
+	full_path = build_path_from_dentry(dentry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto unlink_out;
+	}
+
+	if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+				le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
+			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
+			cifs_remap(cifs_sb));
+		cifs_dbg(FYI, "posix del rc %d\n", rc);
+		if ((rc == 0) || (rc == -ENOENT))
+			goto psx_del_no_retry;
+	}
+
+retry_std_delete:
+	if (!server->ops->unlink) {
+		rc = -ENOSYS;
+		goto psx_del_no_retry;
+	}
+
+	rc = server->ops->unlink(xid, tcon, full_path, cifs_sb);
+
+psx_del_no_retry:
+	if (!rc) {
+		if (inode)
+			cifs_drop_nlink(inode);
+	} else if (rc == -ENOENT) {
+		d_drop(dentry);
+	} else if (rc == -EBUSY) {
+		if (server->ops->rename_pending_delete) {
+			rc = server->ops->rename_pending_delete(full_path,
+								dentry, xid);
+			if (rc == 0)
+				cifs_drop_nlink(inode);
+		}
+	} else if ((rc == -EACCES) && (dosattr == 0) && inode) {
+		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+		if (attrs == NULL) {
+			rc = -ENOMEM;
+			goto out_reval;
+		}
+
+		/* try to reset dos attributes */
+		cifs_inode = CIFS_I(inode);
+		origattr = cifs_inode->cifsAttrs;
+		if (origattr == 0)
+			origattr |= ATTR_NORMAL;
+		dosattr = origattr & ~ATTR_READONLY;
+		if (dosattr == 0)
+			dosattr |= ATTR_NORMAL;
+		dosattr |= ATTR_HIDDEN;
+
+		rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+		if (rc != 0)
+			goto out_reval;
+
+		goto retry_std_delete;
+	}
+
+	/* undo the setattr if we errored out and it's needed */
+	if (rc != 0 && dosattr != 0)
+		cifs_set_file_info(inode, attrs, xid, full_path, origattr);
+
+out_reval:
+	if (inode) {
+		cifs_inode = CIFS_I(inode);
+		cifs_inode->time = 0;	/* will force revalidate to get info
+					   when needed */
+		inode->i_ctime = current_fs_time(sb);
+	}
+	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+	cifs_inode = CIFS_I(dir);
+	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
+unlink_out:
+	kfree(full_path);
+	kfree(attrs);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+static int
+cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
+		 const char *full_path, struct cifs_sb_info *cifs_sb,
+		 struct cifs_tcon *tcon, const unsigned int xid)
+{
+	int rc = 0;
+	struct inode *inode = NULL;
+
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb,
+					      xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb,
+					 xid, NULL);
+
+	if (rc)
+		return rc;
+
+	/*
+	 * setting nlink not necessary except in cases where we failed to get it
+	 * from the server or was set bogus. Also, since this is a brand new
+	 * inode, no need to grab the i_lock before setting the i_nlink.
+	 */
+	if (inode->i_nlink < 2)
+		set_nlink(inode, 2);
+	mode &= ~current_umask();
+	/* must turn on setgid bit if parent dir has it */
+	if (parent->i_mode & S_ISGID)
+		mode |= S_ISGID;
+
+	if (tcon->unix_ext) {
+		struct cifs_unix_set_info_args args = {
+			.mode	= mode,
+			.ctime	= NO_CHANGE_64,
+			.atime	= NO_CHANGE_64,
+			.mtime	= NO_CHANGE_64,
+			.device	= 0,
+		};
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+			args.uid = current_fsuid();
+			if (parent->i_mode & S_ISGID)
+				args.gid = parent->i_gid;
+			else
+				args.gid = current_fsgid();
+		} else {
+			args.uid = INVALID_UID; /* no change */
+			args.gid = INVALID_GID; /* no change */
+		}
+		CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+				       cifs_sb->local_nls,
+				       cifs_remap(cifs_sb));
+	} else {
+		struct TCP_Server_Info *server = tcon->ses->server;
+		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
+		    (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo)
+			server->ops->mkdir_setinfo(inode, full_path, cifs_sb,
+						   tcon, xid);
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+			inode->i_mode = (mode | S_IFDIR);
+
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+			inode->i_uid = current_fsuid();
+			if (inode->i_mode & S_ISGID)
+				inode->i_gid = parent->i_gid;
+			else
+				inode->i_gid = current_fsgid();
+		}
+	}
+	d_instantiate(dentry, inode);
+	return rc;
+}
+
+static int
+cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
+		 const char *full_path, struct cifs_sb_info *cifs_sb,
+		 struct cifs_tcon *tcon, const unsigned int xid)
+{
+	int rc = 0;
+	u32 oplock = 0;
+	FILE_UNIX_BASIC_INFO *info = NULL;
+	struct inode *newinode = NULL;
+	struct cifs_fattr fattr;
+
+	info = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+	if (info == NULL) {
+		rc = -ENOMEM;
+		goto posix_mkdir_out;
+	}
+
+	mode &= ~current_umask();
+	rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode,
+			     NULL /* netfid */, info, &oplock, full_path,
+			     cifs_sb->local_nls, cifs_remap(cifs_sb));
+	if (rc == -EOPNOTSUPP)
+		goto posix_mkdir_out;
+	else if (rc) {
+		cifs_dbg(FYI, "posix mkdir returned 0x%x\n", rc);
+		d_drop(dentry);
+		goto posix_mkdir_out;
+	}
+
+	if (info->Type == cpu_to_le32(-1))
+		/* no return info, go query for it */
+		goto posix_mkdir_get_info;
+	/*
+	 * BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if
+	 * need to set uid/gid.
+	 */
+
+	cifs_unix_basic_to_fattr(&fattr, info, cifs_sb);
+	cifs_fill_uniqueid(inode->i_sb, &fattr);
+	newinode = cifs_iget(inode->i_sb, &fattr);
+	if (!newinode)
+		goto posix_mkdir_get_info;
+
+	d_instantiate(dentry, newinode);
+
+#ifdef CONFIG_CIFS_DEBUG2
+	cifs_dbg(FYI, "instantiated dentry %p %pd to inode %p\n",
+		 dentry, dentry, newinode);
+
+	if (newinode->i_nlink != 2)
+		cifs_dbg(FYI, "unexpected number of links %d\n",
+			 newinode->i_nlink);
+#endif
+
+posix_mkdir_out:
+	kfree(info);
+	return rc;
+posix_mkdir_get_info:
+	rc = cifs_mkdir_qinfo(inode, dentry, mode, full_path, cifs_sb, tcon,
+			      xid);
+	goto posix_mkdir_out;
+}
+
+int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
+{
+	int rc = 0;
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	char *full_path;
+
+	cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n",
+		 mode, inode);
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	xid = get_xid();
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto mkdir_out;
+	}
+
+	if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+				le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb,
+				      tcon, xid);
+		if (rc != -EOPNOTSUPP)
+			goto mkdir_out;
+	}
+
+	server = tcon->ses->server;
+
+	if (!server->ops->mkdir) {
+		rc = -ENOSYS;
+		goto mkdir_out;
+	}
+
+	/* BB add setting the equivalent of mode via CreateX w/ACLs */
+	rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
+	if (rc) {
+		cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc);
+		d_drop(direntry);
+		goto mkdir_out;
+	}
+
+	rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon,
+			      xid);
+mkdir_out:
+	/*
+	 * Force revalidate to get parent dir info when needed since cached
+	 * attributes are invalid now.
+	 */
+	CIFS_I(inode)->time = 0;
+	kfree(full_path);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+int cifs_rmdir(struct inode *inode, struct dentry *direntry)
+{
+	int rc = 0;
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	char *full_path = NULL;
+	struct cifsInodeInfo *cifsInode;
+
+	cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode);
+
+	xid = get_xid();
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto rmdir_exit;
+	}
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto rmdir_exit;
+	}
+	tcon = tlink_tcon(tlink);
+	server = tcon->ses->server;
+
+	if (!server->ops->rmdir) {
+		rc = -ENOSYS;
+		cifs_put_tlink(tlink);
+		goto rmdir_exit;
+	}
+
+	rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
+	cifs_put_tlink(tlink);
+
+	if (!rc) {
+		spin_lock(&d_inode(direntry)->i_lock);
+		i_size_write(d_inode(direntry), 0);
+		clear_nlink(d_inode(direntry));
+		spin_unlock(&d_inode(direntry)->i_lock);
+	}
+
+	cifsInode = CIFS_I(d_inode(direntry));
+	/* force revalidate to go get info when needed */
+	cifsInode->time = 0;
+
+	cifsInode = CIFS_I(inode);
+	/*
+	 * Force revalidate to get parent dir info when needed since cached
+	 * attributes are invalid now.
+	 */
+	cifsInode->time = 0;
+
+	d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime =
+		current_fs_time(inode->i_sb);
+
+rmdir_exit:
+	kfree(full_path);
+	free_xid(xid);
+	return rc;
+}
+
+static int
+cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
+	       const char *from_path, struct dentry *to_dentry,
+	       const char *to_path)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	int oplock, rc;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+	server = tcon->ses->server;
+
+	if (!server->ops->rename)
+		return -ENOSYS;
+
+	/* try path-based rename first */
+	rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb);
+
+	/*
+	 * Don't bother with rename by filehandle unless file is busy and
+	 * source. Note that cross directory moves do not work with
+	 * rename by filehandle to various Windows servers.
+	 */
+	if (rc == 0 || rc != -EBUSY)
+		goto do_rename_exit;
+
+	/* open-file renames don't work across directories */
+	if (to_dentry->d_parent != from_dentry->d_parent)
+		goto do_rename_exit;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	/* open the file to be renamed -- we need DELETE perms */
+	oparms.desired_access = DELETE;
+	oparms.create_options = CREATE_NOT_DIR;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = from_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (rc == 0) {
+		rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid,
+				(const char *) to_dentry->d_name.name,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+		CIFSSMBClose(xid, tcon, fid.netfid);
+	}
+do_rename_exit:
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+int
+cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
+	     struct inode *target_dir, struct dentry *target_dentry,
+	     unsigned int flags)
+{
+	char *from_name = NULL;
+	char *to_name = NULL;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
+	FILE_UNIX_BASIC_INFO *info_buf_target;
+	unsigned int xid;
+	int rc, tmprc;
+
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
+	cifs_sb = CIFS_SB(source_dir->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	xid = get_xid();
+
+	/*
+	 * we already have the rename sem so we do not need to
+	 * grab it again here to protect the path integrity
+	 */
+	from_name = build_path_from_dentry(source_dentry);
+	if (from_name == NULL) {
+		rc = -ENOMEM;
+		goto cifs_rename_exit;
+	}
+
+	to_name = build_path_from_dentry(target_dentry);
+	if (to_name == NULL) {
+		rc = -ENOMEM;
+		goto cifs_rename_exit;
+	}
+
+	rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
+			    to_name);
+
+	/*
+	 * No-replace is the natural behavior for CIFS, so skip unlink hacks.
+	 */
+	if (flags & RENAME_NOREPLACE)
+		goto cifs_rename_exit;
+
+	if (rc == -EEXIST && tcon->unix_ext) {
+		/*
+		 * Are src and dst hardlinks of same inode? We can only tell
+		 * with unix extensions enabled.
+		 */
+		info_buf_source =
+			kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
+					GFP_KERNEL);
+		if (info_buf_source == NULL) {
+			rc = -ENOMEM;
+			goto cifs_rename_exit;
+		}
+
+		info_buf_target = info_buf_source + 1;
+		tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name,
+					     info_buf_source,
+					     cifs_sb->local_nls,
+					     cifs_remap(cifs_sb));
+		if (tmprc != 0)
+			goto unlink_target;
+
+		tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name,
+					     info_buf_target,
+					     cifs_sb->local_nls,
+					     cifs_remap(cifs_sb));
+
+		if (tmprc == 0 && (info_buf_source->UniqueId ==
+				   info_buf_target->UniqueId)) {
+			/* same file, POSIX says that this is a noop */
+			rc = 0;
+			goto cifs_rename_exit;
+		}
+	}
+	/*
+	 * else ... BB we could add the same check for Windows by
+	 * checking the UniqueId via FILE_INTERNAL_INFO
+	 */
+
+unlink_target:
+	/* Try unlinking the target dentry if it's not negative */
+	if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
+		if (d_is_dir(target_dentry))
+			tmprc = cifs_rmdir(target_dir, target_dentry);
+		else
+			tmprc = cifs_unlink(target_dir, target_dentry);
+		if (tmprc)
+			goto cifs_rename_exit;
+		rc = cifs_do_rename(xid, source_dentry, from_name,
+				    target_dentry, to_name);
+	}
+
+	/* force revalidate to go get info when needed */
+	CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
+
+	source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
+		target_dir->i_mtime = current_fs_time(source_dir->i_sb);
+
+cifs_rename_exit:
+	kfree(info_buf_source);
+	kfree(from_name);
+	kfree(to_name);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+static bool
+cifs_inode_needs_reval(struct inode *inode)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+	if (CIFS_CACHE_READ(cifs_i))
+		return false;
+
+	if (!lookupCacheEnabled)
+		return true;
+
+	if (cifs_i->time == 0)
+		return true;
+
+	if (!cifs_sb->actimeo)
+		return true;
+
+	if (!time_in_range(jiffies, cifs_i->time,
+				cifs_i->time + cifs_sb->actimeo))
+		return true;
+
+	/* hardlinked files w/ noserverino get "special" treatment */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+	    S_ISREG(inode->i_mode) && inode->i_nlink != 1)
+		return true;
+
+	return false;
+}
+
+/*
+ * Zap the cache. Called when invalid_mapping flag is set.
+ */
+int
+cifs_invalidate_mapping(struct inode *inode)
+{
+	int rc = 0;
+
+	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
+		rc = invalidate_inode_pages2(inode->i_mapping);
+		if (rc)
+			cifs_dbg(VFS, "%s: could not invalidate inode %p\n",
+				 __func__, inode);
+	}
+
+	cifs_fscache_reset_inode_cookie(inode);
+	return rc;
+}
+
+/**
+ * cifs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+static int
+cifs_wait_bit_killable(struct wait_bit_key *key)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	freezable_schedule_unsafe();
+	return 0;
+}
+
+int
+cifs_revalidate_mapping(struct inode *inode)
+{
+	int rc;
+	unsigned long *flags = &CIFS_I(inode)->flags;
+
+	rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
+				     TASK_KILLABLE);
+	if (rc)
+		return rc;
+
+	if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) {
+		rc = cifs_invalidate_mapping(inode);
+		if (rc)
+			set_bit(CIFS_INO_INVALID_MAPPING, flags);
+	}
+
+	clear_bit_unlock(CIFS_INO_LOCK, flags);
+	smp_mb__after_atomic();
+	wake_up_bit(flags, CIFS_INO_LOCK);
+
+	return rc;
+}
+
+int
+cifs_zap_mapping(struct inode *inode)
+{
+	set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
+	return cifs_revalidate_mapping(inode);
+}
+
+int cifs_revalidate_file_attr(struct file *filp)
+{
+	int rc = 0;
+	struct inode *inode = file_inode(filp);
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	if (!cifs_inode_needs_reval(inode))
+		return rc;
+
+	if (tlink_tcon(cfile->tlink)->unix_ext)
+		rc = cifs_get_file_info_unix(filp);
+	else
+		rc = cifs_get_file_info(filp);
+
+	return rc;
+}
+
+int cifs_revalidate_dentry_attr(struct dentry *dentry)
+{
+	unsigned int xid;
+	int rc = 0;
+	struct inode *inode = d_inode(dentry);
+	struct super_block *sb = dentry->d_sb;
+	char *full_path = NULL;
+
+	if (inode == NULL)
+		return -ENOENT;
+
+	if (!cifs_inode_needs_reval(inode))
+		return rc;
+
+	xid = get_xid();
+
+	/* can not safely grab the rename sem here if rename calls revalidate
+	   since that would deadlock */
+	full_path = build_path_from_dentry(dentry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
+		 full_path, inode, inode->i_count.counter,
+		 dentry, dentry->d_time, jiffies);
+
+	if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+					 xid, NULL);
+
+out:
+	kfree(full_path);
+	free_xid(xid);
+	return rc;
+}
+
+int cifs_revalidate_file(struct file *filp)
+{
+	int rc;
+	struct inode *inode = file_inode(filp);
+
+	rc = cifs_revalidate_file_attr(filp);
+	if (rc)
+		return rc;
+
+	return cifs_revalidate_mapping(inode);
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	int rc;
+	struct inode *inode = d_inode(dentry);
+
+	rc = cifs_revalidate_dentry_attr(dentry);
+	if (rc)
+		return rc;
+
+	return cifs_revalidate_mapping(inode);
+}
+
+int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct inode *inode = d_inode(dentry);
+	int rc;
+
+	/*
+	 * We need to be sure that all dirty pages are written and the server
+	 * has actual ctime, mtime and file length.
+	 */
+	if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
+	    inode->i_mapping->nrpages != 0) {
+		rc = filemap_fdatawait(inode->i_mapping);
+		if (rc) {
+			mapping_set_error(inode->i_mapping, rc);
+			return rc;
+		}
+	}
+
+	rc = cifs_revalidate_dentry_attr(dentry);
+	if (rc)
+		return rc;
+
+	generic_fillattr(inode, stat);
+	stat->blksize = CIFS_MAX_MSGSIZE;
+	stat->ino = CIFS_I(inode)->uniqueid;
+
+	/*
+	 * If on a multiuser mount without unix extensions or cifsacl being
+	 * enabled, and the admin hasn't overridden them, set the ownership
+	 * to the fsuid/fsgid of the current process.
+	 */
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+	    !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
+	    !tcon->unix_ext) {
+		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
+			stat->uid = current_fsuid();
+		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
+			stat->gid = current_fsgid();
+	}
+	return rc;
+}
+
+static int cifs_truncate_page(struct address_space *mapping, loff_t from)
+{
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+	struct page *page;
+	int rc = 0;
+
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	unlock_page(page);
+	page_cache_release(page);
+	return rc;
+}
+
+static void cifs_setsize(struct inode *inode, loff_t offset)
+{
+	spin_lock(&inode->i_lock);
+	i_size_write(inode, offset);
+	spin_unlock(&inode->i_lock);
+
+	truncate_pagecache(inode, offset);
+}
+
+static int
+cifs_set_file_size(struct inode *inode, struct iattr *attrs,
+		   unsigned int xid, char *full_path)
+{
+	int rc;
+	struct cifsFileInfo *open_file;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = NULL;
+	struct cifs_tcon *tcon = NULL;
+	struct TCP_Server_Info *server;
+	struct cifs_io_parms io_parms;
+
+	/*
+	 * To avoid spurious oplock breaks from server, in the case of
+	 * inodes that we already have open, avoid doing path based
+	 * setting of file size if we can do it by handle.
+	 * This keeps our caching token (oplock) and avoids timeouts
+	 * when the local oplock break takes longer to flush
+	 * writebehind data than the SMB timeout for the SetPathInfo
+	 * request would allow
+	 */
+	open_file = find_writable_file(cifsInode, true);
+	if (open_file) {
+		tcon = tlink_tcon(open_file->tlink);
+		server = tcon->ses->server;
+		if (server->ops->set_file_size)
+			rc = server->ops->set_file_size(xid, tcon, open_file,
+							attrs->ia_size, false);
+		else
+			rc = -ENOSYS;
+		cifsFileInfo_put(open_file);
+		cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc);
+		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+			unsigned int bytes_written;
+
+			io_parms.netfid = open_file->fid.netfid;
+			io_parms.pid = open_file->pid;
+			io_parms.tcon = tcon;
+			io_parms.offset = 0;
+			io_parms.length = attrs->ia_size;
+			rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
+					  NULL, NULL, 1);
+			cifs_dbg(FYI, "Wrt seteof rc %d\n", rc);
+		}
+	} else
+		rc = -EINVAL;
+
+	if (!rc)
+		goto set_size_out;
+
+	if (tcon == NULL) {
+		tlink = cifs_sb_tlink(cifs_sb);
+		if (IS_ERR(tlink))
+			return PTR_ERR(tlink);
+		tcon = tlink_tcon(tlink);
+		server = tcon->ses->server;
+	}
+
+	/*
+	 * Set file size by pathname rather than by handle either because no
+	 * valid, writeable file handle for it was found or because there was
+	 * an error setting it by handle.
+	 */
+	if (server->ops->set_path_size)
+		rc = server->ops->set_path_size(xid, tcon, full_path,
+						attrs->ia_size, cifs_sb, false);
+	else
+		rc = -ENOSYS;
+	cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
+	if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+		__u16 netfid;
+		int oplock = 0;
+
+		rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN,
+				   GENERIC_WRITE, CREATE_NOT_DIR, &netfid,
+				   &oplock, NULL, cifs_sb->local_nls,
+				   cifs_remap(cifs_sb));
+		if (rc == 0) {
+			unsigned int bytes_written;
+
+			io_parms.netfid = netfid;
+			io_parms.pid = current->tgid;
+			io_parms.tcon = tcon;
+			io_parms.offset = 0;
+			io_parms.length = attrs->ia_size;
+			rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL,
+					  NULL,  1);
+			cifs_dbg(FYI, "wrt seteof rc %d\n", rc);
+			CIFSSMBClose(xid, tcon, netfid);
+		}
+	}
+	if (tlink)
+		cifs_put_tlink(tlink);
+
+set_size_out:
+	if (rc == 0) {
+		cifsInode->server_eof = attrs->ia_size;
+		cifs_setsize(inode, attrs->ia_size);
+		cifs_truncate_page(inode->i_mapping, inode->i_size);
+	}
+
+	return rc;
+}
+
+static int
+cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
+{
+	int rc;
+	unsigned int xid;
+	char *full_path = NULL;
+	struct inode *inode = d_inode(direntry);
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
+	struct cifs_tcon *pTcon;
+	struct cifs_unix_set_info_args *args = NULL;
+	struct cifsFileInfo *open_file;
+
+	cifs_dbg(FYI, "setattr_unix on file %pd attrs->ia_valid=0x%x\n",
+		 direntry, attrs->ia_valid);
+
+	xid = get_xid();
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+		attrs->ia_valid |= ATTR_FORCE;
+
+	rc = inode_change_ok(inode, attrs);
+	if (rc < 0)
+		goto out;
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * Attempt to flush data before changing attributes. We need to do
+	 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
+	 * ownership or mode then we may also need to do this. Here, we take
+	 * the safe way out and just do the flush on all setattr requests. If
+	 * the flush returns error, store it to report later and continue.
+	 *
+	 * BB: This should be smarter. Why bother flushing pages that
+	 * will be truncated anyway? Also, should we error out here if
+	 * the flush returns error?
+	 */
+	rc = filemap_write_and_wait(inode->i_mapping);
+	mapping_set_error(inode->i_mapping, rc);
+	rc = 0;
+
+	if (attrs->ia_valid & ATTR_SIZE) {
+		rc = cifs_set_file_size(inode, attrs, xid, full_path);
+		if (rc != 0)
+			goto out;
+	}
+
+	/* skip mode change if it's just for clearing setuid/setgid */
+	if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
+		attrs->ia_valid &= ~ATTR_MODE;
+
+	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (args == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* set up the struct */
+	if (attrs->ia_valid & ATTR_MODE)
+		args->mode = attrs->ia_mode;
+	else
+		args->mode = NO_CHANGE_64;
+
+	if (attrs->ia_valid & ATTR_UID)
+		args->uid = attrs->ia_uid;
+	else
+		args->uid = INVALID_UID; /* no change */
+
+	if (attrs->ia_valid & ATTR_GID)
+		args->gid = attrs->ia_gid;
+	else
+		args->gid = INVALID_GID; /* no change */
+
+	if (attrs->ia_valid & ATTR_ATIME)
+		args->atime = cifs_UnixTimeToNT(attrs->ia_atime);
+	else
+		args->atime = NO_CHANGE_64;
+
+	if (attrs->ia_valid & ATTR_MTIME)
+		args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime);
+	else
+		args->mtime = NO_CHANGE_64;
+
+	if (attrs->ia_valid & ATTR_CTIME)
+		args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime);
+	else
+		args->ctime = NO_CHANGE_64;
+
+	args->device = 0;
+	open_file = find_writable_file(cifsInode, true);
+	if (open_file) {
+		u16 nfid = open_file->fid.netfid;
+		u32 npid = open_file->pid;
+		pTcon = tlink_tcon(open_file->tlink);
+		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
+		cifsFileInfo_put(open_file);
+	} else {
+		tlink = cifs_sb_tlink(cifs_sb);
+		if (IS_ERR(tlink)) {
+			rc = PTR_ERR(tlink);
+			goto out;
+		}
+		pTcon = tlink_tcon(tlink);
+		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
+				    cifs_sb->local_nls,
+				    cifs_remap(cifs_sb));
+		cifs_put_tlink(tlink);
+	}
+
+	if (rc)
+		goto out;
+
+	if ((attrs->ia_valid & ATTR_SIZE) &&
+	    attrs->ia_size != i_size_read(inode))
+		truncate_setsize(inode, attrs->ia_size);
+
+	setattr_copy(inode, attrs);
+	mark_inode_dirty(inode);
+
+	/* force revalidate when any of these times are set since some
+	   of the fs types (eg ext3, fat) do not have fine enough
+	   time granularity to match protocol, and we do not have a
+	   a way (yet) to query the server fs's time granularity (and
+	   whether it rounds times down).
+	*/
+	if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))
+		cifsInode->time = 0;
+out:
+	kfree(args);
+	kfree(full_path);
+	free_xid(xid);
+	return rc;
+}
+
+static int
+cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
+{
+	unsigned int xid;
+	kuid_t uid = INVALID_UID;
+	kgid_t gid = INVALID_GID;
+	struct inode *inode = d_inode(direntry);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	char *full_path = NULL;
+	int rc = -EACCES;
+	__u32 dosattr = 0;
+	__u64 mode = NO_CHANGE_64;
+
+	xid = get_xid();
+
+	cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n",
+		 direntry, attrs->ia_valid);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+		attrs->ia_valid |= ATTR_FORCE;
+
+	rc = inode_change_ok(inode, attrs);
+	if (rc < 0) {
+		free_xid(xid);
+		return rc;
+	}
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		free_xid(xid);
+		return rc;
+	}
+
+	/*
+	 * Attempt to flush data before changing attributes. We need to do
+	 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
+	 * ownership or mode then we may also need to do this. Here, we take
+	 * the safe way out and just do the flush on all setattr requests. If
+	 * the flush returns error, store it to report later and continue.
+	 *
+	 * BB: This should be smarter. Why bother flushing pages that
+	 * will be truncated anyway? Also, should we error out here if
+	 * the flush returns error?
+	 */
+	rc = filemap_write_and_wait(inode->i_mapping);
+	mapping_set_error(inode->i_mapping, rc);
+	rc = 0;
+
+	if (attrs->ia_valid & ATTR_SIZE) {
+		rc = cifs_set_file_size(inode, attrs, xid, full_path);
+		if (rc != 0)
+			goto cifs_setattr_exit;
+	}
+
+	if (attrs->ia_valid & ATTR_UID)
+		uid = attrs->ia_uid;
+
+	if (attrs->ia_valid & ATTR_GID)
+		gid = attrs->ia_gid;
+
+#ifdef CONFIG_CIFS_ACL
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+		if (uid_valid(uid) || gid_valid(gid)) {
+			rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
+							uid, gid);
+			if (rc) {
+				cifs_dbg(FYI, "%s: Setting id failed with error: %d\n",
+					 __func__, rc);
+				goto cifs_setattr_exit;
+			}
+		}
+	} else
+#endif /* CONFIG_CIFS_ACL */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
+		attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
+
+	/* skip mode change if it's just for clearing setuid/setgid */
+	if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
+		attrs->ia_valid &= ~ATTR_MODE;
+
+	if (attrs->ia_valid & ATTR_MODE) {
+		mode = attrs->ia_mode;
+		rc = 0;
+#ifdef CONFIG_CIFS_ACL
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+			rc = id_mode_to_cifs_acl(inode, full_path, mode,
+						INVALID_UID, INVALID_GID);
+			if (rc) {
+				cifs_dbg(FYI, "%s: Setting ACL failed with error: %d\n",
+					 __func__, rc);
+				goto cifs_setattr_exit;
+			}
+		} else
+#endif /* CONFIG_CIFS_ACL */
+		if (((mode & S_IWUGO) == 0) &&
+		    (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
+
+			dosattr = cifsInode->cifsAttrs | ATTR_READONLY;
+
+			/* fix up mode if we're not using dynperm */
+			if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
+				attrs->ia_mode = inode->i_mode & ~S_IWUGO;
+		} else if ((mode & S_IWUGO) &&
+			   (cifsInode->cifsAttrs & ATTR_READONLY)) {
+
+			dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+			/* Attributes of 0 are ignored */
+			if (dosattr == 0)
+				dosattr |= ATTR_NORMAL;
+
+			/* reset local inode permissions to normal */
+			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) {
+				attrs->ia_mode &= ~(S_IALLUGO);
+				if (S_ISDIR(inode->i_mode))
+					attrs->ia_mode |=
+						cifs_sb->mnt_dir_mode;
+				else
+					attrs->ia_mode |=
+						cifs_sb->mnt_file_mode;
+			}
+		} else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) {
+			/* ignore mode change - ATTR_READONLY hasn't changed */
+			attrs->ia_valid &= ~ATTR_MODE;
+		}
+	}
+
+	if (attrs->ia_valid & (ATTR_MTIME|ATTR_ATIME|ATTR_CTIME) ||
+	    ((attrs->ia_valid & ATTR_MODE) && dosattr)) {
+		rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+		/* BB: check for rc = -EOPNOTSUPP and switch to legacy mode */
+
+		/* Even if error on time set, no sense failing the call if
+		the server would set the time to a reasonable value anyway,
+		and this check ensures that we are not being called from
+		sys_utimes in which case we ought to fail the call back to
+		the user when the server rejects the call */
+		if ((rc) && (attrs->ia_valid &
+				(ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE)))
+			rc = 0;
+	}
+
+	/* do not need local check to inode_check_ok since the server does
+	   that */
+	if (rc)
+		goto cifs_setattr_exit;
+
+	if ((attrs->ia_valid & ATTR_SIZE) &&
+	    attrs->ia_size != i_size_read(inode))
+		truncate_setsize(inode, attrs->ia_size);
+
+	setattr_copy(inode, attrs);
+	mark_inode_dirty(inode);
+
+cifs_setattr_exit:
+	kfree(full_path);
+	free_xid(xid);
+	return rc;
+}
+
+int
+cifs_setattr(struct dentry *direntry, struct iattr *attrs)
+{
+	struct inode *inode = d_inode(direntry);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
+
+	if (pTcon->unix_ext)
+		return cifs_setattr_unix(direntry, attrs);
+
+	return cifs_setattr_nounix(direntry, attrs);
+
+	/* BB: add cifs_setattr_legacy for really old servers */
+}
+
+#if 0
+void cifs_delete_inode(struct inode *inode)
+{
+	cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode);
+	/* may have to add back in if and when safe distributed caching of
+	   directories added e.g. via FindNotify */
+}
+#endif
diff --git a/kernel/fs/cifs/ioctl.c b/kernel/fs/cifs/ioctl.c
new file mode 100644
index 000000000..8b7898b76
--- /dev/null
+++ b/kernel/fs/cifs/ioctl.c
@@ -0,0 +1,217 @@
+/*
+ *   fs/cifs/ioctl.c
+ *
+ *   vfs operations that deal with io control
+ *
+ *   Copyright (C) International Business Machines  Corp., 2005,2013
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifsfs.h"
+
+#define CIFS_IOCTL_MAGIC	0xCF
+#define CIFS_IOC_COPYCHUNK_FILE	_IOW(CIFS_IOCTL_MAGIC, 3, int)
+
+static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+			unsigned long srcfd, u64 off, u64 len, u64 destoff)
+{
+	int rc;
+	struct cifsFileInfo *smb_file_target = dst_file->private_data;
+	struct inode *target_inode = file_inode(dst_file);
+	struct cifs_tcon *target_tcon;
+	struct fd src_file;
+	struct cifsFileInfo *smb_file_src;
+	struct inode *src_inode;
+	struct cifs_tcon *src_tcon;
+
+	cifs_dbg(FYI, "ioctl clone range\n");
+	/* the destination must be opened for writing */
+	if (!(dst_file->f_mode & FMODE_WRITE)) {
+		cifs_dbg(FYI, "file target not open for write\n");
+		return -EINVAL;
+	}
+
+	/* check if target volume is readonly and take reference */
+	rc = mnt_want_write_file(dst_file);
+	if (rc) {
+		cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
+		return rc;
+	}
+
+	src_file = fdget(srcfd);
+	if (!src_file.file) {
+		rc = -EBADF;
+		goto out_drop_write;
+	}
+
+	if ((!src_file.file->private_data) || (!dst_file->private_data)) {
+		rc = -EBADF;
+		cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+		goto out_fput;
+	}
+
+	rc = -EXDEV;
+	smb_file_target = dst_file->private_data;
+	smb_file_src = src_file.file->private_data;
+	src_tcon = tlink_tcon(smb_file_src->tlink);
+	target_tcon = tlink_tcon(smb_file_target->tlink);
+
+	/* check if source and target are on same tree connection */
+	if (src_tcon != target_tcon) {
+		cifs_dbg(VFS, "file copy src and target on different volume\n");
+		goto out_fput;
+	}
+
+	src_inode = file_inode(src_file.file);
+	rc = -EINVAL;
+	if (S_ISDIR(src_inode->i_mode))
+		goto out_fput;
+
+	/*
+	 * Note: cifs case is easier than btrfs since server responsible for
+	 * checks for proper open modes and file type and if it wants
+	 * server could even support copy of range where source = target
+	 */
+	lock_two_nondirectories(target_inode, src_inode);
+
+	/* determine range to clone */
+	rc = -EINVAL;
+	if (off + len > src_inode->i_size || off + len < off)
+		goto out_unlock;
+	if (len == 0)
+		len = src_inode->i_size - off;
+
+	cifs_dbg(FYI, "about to flush pages\n");
+	/* should we flush first and last page first */
+	truncate_inode_pages_range(&target_inode->i_data, destoff,
+				   PAGE_CACHE_ALIGN(destoff + len)-1);
+
+	if (target_tcon->ses->server->ops->clone_range)
+		rc = target_tcon->ses->server->ops->clone_range(xid,
+			smb_file_src, smb_file_target, off, len, destoff);
+
+	/* force revalidate of size and timestamps of target file now
+	   that target is updated on the server */
+	CIFS_I(target_inode)->time = 0;
+out_unlock:
+	/* although unlocking in the reverse order from locking is not
+	   strictly necessary here it is a little cleaner to be consistent */
+	unlock_two_nondirectories(src_inode, target_inode);
+out_fput:
+	fdput(src_file);
+out_drop_write:
+	mnt_drop_write_file(dst_file);
+	return rc;
+}
+
+long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
+{
+	struct inode *inode = file_inode(filep);
+	int rc = -ENOTTY; /* strange error - but the precedent */
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb;
+	struct cifsFileInfo *pSMBFile = filep->private_data;
+	struct cifs_tcon *tcon;
+	__u64	ExtAttrBits = 0;
+	__u64   caps;
+
+	xid = get_xid();
+
+	cifs_dbg(FYI, "ioctl file %p  cmd %u  arg %lu\n", filep, command, arg);
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+
+	switch (command) {
+		case FS_IOC_GETFLAGS:
+			if (pSMBFile == NULL)
+				break;
+			tcon = tlink_tcon(pSMBFile->tlink);
+			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+#ifdef CONFIG_CIFS_POSIX
+			if (CIFS_UNIX_EXTATTR_CAP & caps) {
+				__u64	ExtAttrMask = 0;
+				rc = CIFSGetExtAttr(xid, tcon,
+						    pSMBFile->fid.netfid,
+						    &ExtAttrBits, &ExtAttrMask);
+				if (rc == 0)
+					rc = put_user(ExtAttrBits &
+						FS_FL_USER_VISIBLE,
+						(int __user *)arg);
+				if (rc != EOPNOTSUPP)
+					break;
+			}
+#endif /* CONFIG_CIFS_POSIX */
+			rc = 0;
+			if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) {
+				/* add in the compressed bit */
+				ExtAttrBits = FS_COMPR_FL;
+				rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE,
+					      (int __user *)arg);
+			}
+			break;
+		case FS_IOC_SETFLAGS:
+			if (pSMBFile == NULL)
+				break;
+			tcon = tlink_tcon(pSMBFile->tlink);
+			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+
+			if (get_user(ExtAttrBits, (int __user *)arg)) {
+				rc = -EFAULT;
+				break;
+			}
+
+			/*
+			 * if (CIFS_UNIX_EXTATTR_CAP & caps)
+			 *	rc = CIFSSetExtAttr(xid, tcon,
+			 *		       pSMBFile->fid.netfid,
+			 *		       extAttrBits,
+			 *		       &ExtAttrMask);
+			 * if (rc != EOPNOTSUPP)
+			 *	break;
+			 */
+
+			/* Currently only flag we can set is compressed flag */
+			if ((ExtAttrBits & FS_COMPR_FL) == 0)
+				break;
+
+			/* Try to set compress flag */
+			if (tcon->ses->server->ops->set_compression) {
+				rc = tcon->ses->server->ops->set_compression(
+							xid, tcon, pSMBFile);
+				cifs_dbg(FYI, "set compress flag rc %d\n", rc);
+			}
+			break;
+		case CIFS_IOC_COPYCHUNK_FILE:
+			rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
+			break;
+		default:
+			cifs_dbg(FYI, "unsupported ioctl\n");
+			break;
+	}
+
+	free_xid(xid);
+	return rc;
+}
diff --git a/kernel/fs/cifs/link.c b/kernel/fs/cifs/link.c
new file mode 100644
index 000000000..e6c707cc6
--- /dev/null
+++ b/kernel/fs/cifs/link.c
@@ -0,0 +1,746 @@
+/*
+ *   fs/cifs/link.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2proto.h"
+#endif
+
+/*
+ * M-F Symlink Functions - Begin
+ */
+
+#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
+#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
+#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
+#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
+#define CIFS_MF_SYMLINK_FILE_SIZE \
+	(CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
+
+#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
+#define CIFS_MF_SYMLINK_MD5_FORMAT \
+	"%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
+#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
+	md5_hash[0],  md5_hash[1],  md5_hash[2],  md5_hash[3], \
+	md5_hash[4],  md5_hash[5],  md5_hash[6],  md5_hash[7], \
+	md5_hash[8],  md5_hash[9],  md5_hash[10], md5_hash[11],\
+	md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
+
+static int
+symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
+{
+	int rc;
+	unsigned int size;
+	struct crypto_shash *md5;
+	struct sdesc *sdescmd5;
+
+	md5 = crypto_alloc_shash("md5", 0, 0);
+	if (IS_ERR(md5)) {
+		rc = PTR_ERR(md5);
+		cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n",
+			 __func__, rc);
+		return rc;
+	}
+	size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
+	sdescmd5 = kmalloc(size, GFP_KERNEL);
+	if (!sdescmd5) {
+		rc = -ENOMEM;
+		goto symlink_hash_err;
+	}
+	sdescmd5->shash.tfm = md5;
+	sdescmd5->shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdescmd5->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
+		goto symlink_hash_err;
+	}
+	rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
+		goto symlink_hash_err;
+	}
+	rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+
+symlink_hash_err:
+	crypto_free_shash(md5);
+	kfree(sdescmd5);
+
+	return rc;
+}
+
+static int
+parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
+		 char **_link_str)
+{
+	int rc;
+	unsigned int link_len;
+	const char *md5_str1;
+	const char *link_str;
+	u8 md5_hash[16];
+	char md5_str2[34];
+
+	if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
+		return -EINVAL;
+
+	md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
+	link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
+
+	rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
+	if (rc != 1)
+		return -EINVAL;
+
+	rc = symlink_hash(link_len, link_str, md5_hash);
+	if (rc) {
+		cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
+		return rc;
+	}
+
+	snprintf(md5_str2, sizeof(md5_str2),
+		 CIFS_MF_SYMLINK_MD5_FORMAT,
+		 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+
+	if (strncmp(md5_str1, md5_str2, 17) != 0)
+		return -EINVAL;
+
+	if (_link_str) {
+		*_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
+		if (!*_link_str)
+			return -ENOMEM;
+	}
+
+	*_link_len = link_len;
+	return 0;
+}
+
+static int
+format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
+{
+	int rc;
+	unsigned int link_len;
+	unsigned int ofs;
+	u8 md5_hash[16];
+
+	if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
+		return -EINVAL;
+
+	link_len = strlen(link_str);
+
+	if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+		return -ENAMETOOLONG;
+
+	rc = symlink_hash(link_len, link_str, md5_hash);
+	if (rc) {
+		cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
+		return rc;
+	}
+
+	snprintf(buf, buf_len,
+		 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+		 link_len,
+		 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+
+	ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
+	memcpy(buf + ofs, link_str, link_len);
+
+	ofs += link_len;
+	if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
+		buf[ofs] = '\n';
+		ofs++;
+	}
+
+	while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
+		buf[ofs] = ' ';
+		ofs++;
+	}
+
+	return 0;
+}
+
+bool
+couldbe_mf_symlink(const struct cifs_fattr *fattr)
+{
+	if (!S_ISREG(fattr->cf_mode))
+		/* it's not a symlink */
+		return false;
+
+	if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
+		/* it's not a symlink */
+		return false;
+
+	return true;
+}
+
+static int
+create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+		  struct cifs_sb_info *cifs_sb, const char *fromName,
+		  const char *toName)
+{
+	int rc;
+	u8 *buf;
+	unsigned int bytes_written = 0;
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
+	if (rc)
+		goto out;
+
+	if (tcon->ses->server->ops->create_mf_symlink)
+		rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
+					cifs_sb, fromName, buf, &bytes_written);
+	else
+		rc = -EOPNOTSUPP;
+
+	if (rc)
+		goto out;
+
+	if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
+		rc = -EIO;
+out:
+	kfree(buf);
+	return rc;
+}
+
+static int
+query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+		 struct cifs_sb_info *cifs_sb, const unsigned char *path,
+		 char **symlinkinfo)
+{
+	int rc;
+	u8 *buf = NULL;
+	unsigned int link_len = 0;
+	unsigned int bytes_read = 0;
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (tcon->ses->server->ops->query_mf_symlink)
+		rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
+					      cifs_sb, path, buf, &bytes_read);
+	else
+		rc = -ENOSYS;
+
+	if (rc)
+		goto out;
+
+	if (bytes_read == 0) { /* not a symlink */
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo);
+out:
+	kfree(buf);
+	return rc;
+}
+
+int
+check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+		 struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
+		 const unsigned char *path)
+{
+	int rc;
+	u8 *buf = NULL;
+	unsigned int link_len = 0;
+	unsigned int bytes_read = 0;
+
+	if (!couldbe_mf_symlink(fattr))
+		/* it's not a symlink */
+		return 0;
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (tcon->ses->server->ops->query_mf_symlink)
+		rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
+					      cifs_sb, path, buf, &bytes_read);
+	else
+		rc = -ENOSYS;
+
+	if (rc)
+		goto out;
+
+	if (bytes_read == 0) /* not a symlink */
+		goto out;
+
+	rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL);
+	if (rc == -EINVAL) {
+		/* it's not a symlink */
+		rc = 0;
+		goto out;
+	}
+
+	if (rc != 0)
+		goto out;
+
+	/* it is a symlink */
+	fattr->cf_eof = link_len;
+	fattr->cf_mode &= ~S_IFMT;
+	fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
+	fattr->cf_dtype = DT_LNK;
+out:
+	kfree(buf);
+	return rc;
+}
+
+/*
+ * SMB 1.0 Protocol specific functions
+ */
+
+int
+cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+		      struct cifs_sb_info *cifs_sb, const unsigned char *path,
+		      char *pbuf, unsigned int *pbytes_read)
+{
+	int rc;
+	int oplock = 0;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	struct cifs_io_parms io_parms;
+	int buf_type = CIFS_NO_BUFFER;
+	FILE_ALL_INFO file_info;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_READ;
+	oparms.create_options = CREATE_NOT_DIR;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, &file_info);
+	if (rc)
+		return rc;
+
+	if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+		rc = -ENOENT;
+		/* it's not a symlink */
+		goto out;
+	}
+
+	io_parms.netfid = fid.netfid;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+
+	rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+out:
+	CIFSSMBClose(xid, tcon, fid.netfid);
+	return rc;
+}
+
+int
+cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+		       struct cifs_sb_info *cifs_sb, const unsigned char *path,
+		       char *pbuf, unsigned int *pbytes_written)
+{
+	int rc;
+	int oplock = 0;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	struct cifs_io_parms io_parms;
+	int create_options = CREATE_NOT_DIR;
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_WRITE;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_CREATE;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (rc)
+		return rc;
+
+	io_parms.netfid = fid.netfid;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+
+	rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0);
+	CIFSSMBClose(xid, tcon, fid.netfid);
+	return rc;
+}
+
+/*
+ * SMB 2.1/SMB3 Protocol specific functions
+ */
+#ifdef CONFIG_CIFS_SMB2
+int
+smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+		      struct cifs_sb_info *cifs_sb, const unsigned char *path,
+		      char *pbuf, unsigned int *pbytes_read)
+{
+	int rc;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	struct cifs_io_parms io_parms;
+	int buf_type = CIFS_NO_BUFFER;
+	__le16 *utf16_path;
+	__u8 oplock = SMB2_OPLOCK_LEVEL_II;
+	struct smb2_file_all_info *pfile_info = NULL;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_READ;
+	oparms.create_options = CREATE_NOT_DIR;
+	if (backup_cred(cifs_sb))
+		oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
+	oparms.disposition = FILE_OPEN;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+	if (utf16_path == NULL)
+		return -ENOMEM;
+
+	pfile_info = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
+			     GFP_KERNEL);
+
+	if (pfile_info == NULL) {
+		kfree(utf16_path);
+		return  -ENOMEM;
+	}
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL);
+	if (rc)
+		goto qmf_out_open_fail;
+
+	if (pfile_info->EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+		/* it's not a symlink */
+		rc = -ENOENT; /* Is there a better rc to return? */
+		goto qmf_out;
+	}
+
+	io_parms.netfid = fid.netfid;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+	io_parms.persistent_fid = fid.persistent_fid;
+	io_parms.volatile_fid = fid.volatile_fid;
+	rc = SMB2_read(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+qmf_out:
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+qmf_out_open_fail:
+	kfree(utf16_path);
+	kfree(pfile_info);
+	return rc;
+}
+
+int
+smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+		       struct cifs_sb_info *cifs_sb, const unsigned char *path,
+		       char *pbuf, unsigned int *pbytes_written)
+{
+	int rc;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	struct cifs_io_parms io_parms;
+	int create_options = CREATE_NOT_DIR;
+	__le16 *utf16_path;
+	__u8 oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+	struct kvec iov[2];
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
+
+	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+	if (!utf16_path)
+		return -ENOMEM;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_WRITE;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_CREATE;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+	if (rc) {
+		kfree(utf16_path);
+		return rc;
+	}
+
+	io_parms.netfid = fid.netfid;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+	io_parms.persistent_fid = fid.persistent_fid;
+	io_parms.volatile_fid = fid.volatile_fid;
+
+	/* iov[0] is reserved for smb header */
+	iov[1].iov_base = pbuf;
+	iov[1].iov_len = CIFS_MF_SYMLINK_FILE_SIZE;
+
+	rc = SMB2_write(xid, &io_parms, pbytes_written, iov, 1);
+
+	/* Make sure we wrote all of the symlink data */
+	if ((rc == 0) && (*pbytes_written != CIFS_MF_SYMLINK_FILE_SIZE))
+		rc = -EIO;
+
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+
+	kfree(utf16_path);
+	return rc;
+}
+#endif /* CONFIG_CIFS_SMB2 */
+
+/*
+ * M-F Symlink Functions - End
+ */
+
+int
+cifs_hardlink(struct dentry *old_file, struct inode *inode,
+	      struct dentry *direntry)
+{
+	int rc = -EACCES;
+	unsigned int xid;
+	char *from_name = NULL;
+	char *to_name = NULL;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+	struct cifsInodeInfo *cifsInode;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	xid = get_xid();
+
+	from_name = build_path_from_dentry(old_file);
+	to_name = build_path_from_dentry(direntry);
+	if ((from_name == NULL) || (to_name == NULL)) {
+		rc = -ENOMEM;
+		goto cifs_hl_exit;
+	}
+
+	if (tcon->unix_ext)
+		rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name,
+					    cifs_sb->local_nls,
+					    cifs_remap(cifs_sb));
+	else {
+		server = tcon->ses->server;
+		if (!server->ops->create_hardlink) {
+			rc = -ENOSYS;
+			goto cifs_hl_exit;
+		}
+		rc = server->ops->create_hardlink(xid, tcon, from_name, to_name,
+						  cifs_sb);
+		if ((rc == -EIO) || (rc == -EINVAL))
+			rc = -EOPNOTSUPP;
+	}
+
+	d_drop(direntry);	/* force new lookup from server of target */
+
+	/*
+	 * if source file is cached (oplocked) revalidate will not go to server
+	 * until the file is closed or oplock broken so update nlinks locally
+	 */
+	if (d_really_is_positive(old_file)) {
+		cifsInode = CIFS_I(d_inode(old_file));
+		if (rc == 0) {
+			spin_lock(&d_inode(old_file)->i_lock);
+			inc_nlink(d_inode(old_file));
+			spin_unlock(&d_inode(old_file)->i_lock);
+
+			/*
+			 * parent dir timestamps will update from srv within a
+			 * second, would it really be worth it to set the parent
+			 * dir cifs inode time to zero to force revalidate
+			 * (faster) for it too?
+			 */
+		}
+		/*
+		 * if not oplocked will force revalidate to get info on source
+		 * file from srv.  Note Samba server prior to 4.2 has bug -
+		 * not updating src file ctime on hardlinks but Windows servers
+		 * handle it properly
+		 */
+		cifsInode->time = 0;
+
+		/*
+		 * Will update parent dir timestamps from srv within a second.
+		 * Would it really be worth it to set the parent dir (cifs
+		 * inode) time field to zero to force revalidate on parent
+		 * directory faster ie
+		 *
+		 * CIFS_I(inode)->time = 0;
+		 */
+	}
+
+cifs_hl_exit:
+	kfree(from_name);
+	kfree(to_name);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+void *
+cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
+{
+	struct inode *inode = d_inode(direntry);
+	int rc = -ENOMEM;
+	unsigned int xid;
+	char *full_path = NULL;
+	char *target_path = NULL;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = NULL;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+
+	xid = get_xid();
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	tcon = tlink_tcon(tlink);
+	server = tcon->ses->server;
+
+	full_path = build_path_from_dentry(direntry);
+	if (!full_path)
+		goto out;
+
+	cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
+
+	rc = -EACCES;
+	/*
+	 * First try Minshall+French Symlinks, if configured
+	 * and fallback to UNIX Extensions Symlinks.
+	 */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		rc = query_mf_symlink(xid, tcon, cifs_sb, full_path,
+				      &target_path);
+
+	if (rc != 0 && server->ops->query_symlink)
+		rc = server->ops->query_symlink(xid, tcon, full_path,
+						&target_path, cifs_sb);
+
+	kfree(full_path);
+out:
+	if (rc != 0) {
+		kfree(target_path);
+		target_path = ERR_PTR(rc);
+	}
+
+	free_xid(xid);
+	if (tlink)
+		cifs_put_tlink(tlink);
+	nd_set_link(nd, target_path);
+	return NULL;
+}
+
+int
+cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
+{
+	int rc = -EOPNOTSUPP;
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
+	struct cifs_tcon *pTcon;
+	char *full_path = NULL;
+	struct inode *newinode = NULL;
+
+	xid = get_xid();
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto symlink_exit;
+	}
+	pTcon = tlink_tcon(tlink);
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto symlink_exit;
+	}
+
+	cifs_dbg(FYI, "Full path: %s\n", full_path);
+	cifs_dbg(FYI, "symname is %s\n", symname);
+
+	/* BB what if DFS and this volume is on different share? BB */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
+	else if (pTcon->unix_ext)
+		rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
+					   cifs_sb->local_nls,
+					   cifs_remap(cifs_sb));
+	/* else
+	   rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName,
+					cifs_sb_target->local_nls); */
+
+	if (rc == 0) {
+		if (pTcon->unix_ext)
+			rc = cifs_get_inode_info_unix(&newinode, full_path,
+						      inode->i_sb, xid);
+		else
+			rc = cifs_get_inode_info(&newinode, full_path, NULL,
+						 inode->i_sb, xid, NULL);
+
+		if (rc != 0) {
+			cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n",
+				 rc);
+		} else {
+			d_instantiate(direntry, newinode);
+		}
+	}
+symlink_exit:
+	kfree(full_path);
+	cifs_put_tlink(tlink);
+	free_xid(xid);
+	return rc;
+}
diff --git a/kernel/fs/cifs/misc.c b/kernel/fs/cifs/misc.c
new file mode 100644
index 000000000..8442b8b8e
--- /dev/null
+++ b/kernel/fs/cifs/misc.c
@@ -0,0 +1,641 @@
+/*
+ *   fs/cifs/misc.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/mempool.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "smberr.h"
+#include "nterr.h"
+#include "cifs_unicode.h"
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2pdu.h"
+#endif
+
+extern mempool_t *cifs_sm_req_poolp;
+extern mempool_t *cifs_req_poolp;
+
+/* The xid serves as a useful identifier for each incoming vfs request,
+   in a similar way to the mid which is useful to track each sent smb,
+   and CurrentXid can also provide a running counter (although it
+   will eventually wrap past zero) of the total vfs operations handled
+   since the cifs fs was mounted */
+
+unsigned int
+_get_xid(void)
+{
+	unsigned int xid;
+
+	spin_lock(&GlobalMid_Lock);
+	GlobalTotalActiveXid++;
+
+	/* keep high water mark for number of simultaneous ops in filesystem */
+	if (GlobalTotalActiveXid > GlobalMaxActiveXid)
+		GlobalMaxActiveXid = GlobalTotalActiveXid;
+	if (GlobalTotalActiveXid > 65000)
+		cifs_dbg(FYI, "warning: more than 65000 requests active\n");
+	xid = GlobalCurrentXid++;
+	spin_unlock(&GlobalMid_Lock);
+	return xid;
+}
+
+void
+_free_xid(unsigned int xid)
+{
+	spin_lock(&GlobalMid_Lock);
+	/* if (GlobalTotalActiveXid == 0)
+		BUG(); */
+	GlobalTotalActiveXid--;
+	spin_unlock(&GlobalMid_Lock);
+}
+
+struct cifs_ses *
+sesInfoAlloc(void)
+{
+	struct cifs_ses *ret_buf;
+
+	ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL);
+	if (ret_buf) {
+		atomic_inc(&sesInfoAllocCount);
+		ret_buf->status = CifsNew;
+		++ret_buf->ses_count;
+		INIT_LIST_HEAD(&ret_buf->smb_ses_list);
+		INIT_LIST_HEAD(&ret_buf->tcon_list);
+		mutex_init(&ret_buf->session_mutex);
+	}
+	return ret_buf;
+}
+
+void
+sesInfoFree(struct cifs_ses *buf_to_free)
+{
+	if (buf_to_free == NULL) {
+		cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
+		return;
+	}
+
+	atomic_dec(&sesInfoAllocCount);
+	kfree(buf_to_free->serverOS);
+	kfree(buf_to_free->serverDomain);
+	kfree(buf_to_free->serverNOS);
+	if (buf_to_free->password) {
+		memset(buf_to_free->password, 0, strlen(buf_to_free->password));
+		kfree(buf_to_free->password);
+	}
+	kfree(buf_to_free->user_name);
+	kfree(buf_to_free->domainName);
+	kfree(buf_to_free->auth_key.response);
+	kfree(buf_to_free);
+}
+
+struct cifs_tcon *
+tconInfoAlloc(void)
+{
+	struct cifs_tcon *ret_buf;
+	ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
+	if (ret_buf) {
+		atomic_inc(&tconInfoAllocCount);
+		ret_buf->tidStatus = CifsNew;
+		++ret_buf->tc_count;
+		INIT_LIST_HEAD(&ret_buf->openFileList);
+		INIT_LIST_HEAD(&ret_buf->tcon_list);
+#ifdef CONFIG_CIFS_STATS
+		spin_lock_init(&ret_buf->stat_lock);
+#endif
+	}
+	return ret_buf;
+}
+
+void
+tconInfoFree(struct cifs_tcon *buf_to_free)
+{
+	if (buf_to_free == NULL) {
+		cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
+		return;
+	}
+	atomic_dec(&tconInfoAllocCount);
+	kfree(buf_to_free->nativeFileSystem);
+	if (buf_to_free->password) {
+		memset(buf_to_free->password, 0, strlen(buf_to_free->password));
+		kfree(buf_to_free->password);
+	}
+	kfree(buf_to_free);
+}
+
+struct smb_hdr *
+cifs_buf_get(void)
+{
+	struct smb_hdr *ret_buf = NULL;
+	size_t buf_size = sizeof(struct smb_hdr);
+
+#ifdef CONFIG_CIFS_SMB2
+	/*
+	 * SMB2 header is bigger than CIFS one - no problems to clean some
+	 * more bytes for CIFS.
+	 */
+	buf_size = sizeof(struct smb2_hdr);
+#endif
+	/*
+	 * We could use negotiated size instead of max_msgsize -
+	 * but it may be more efficient to always alloc same size
+	 * albeit slightly larger than necessary and maxbuffersize
+	 * defaults to this and can not be bigger.
+	 */
+	ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
+
+	/* clear the first few header bytes */
+	/* for most paths, more is cleared in header_assemble */
+	if (ret_buf) {
+		memset(ret_buf, 0, buf_size + 3);
+		atomic_inc(&bufAllocCount);
+#ifdef CONFIG_CIFS_STATS2
+		atomic_inc(&totBufAllocCount);
+#endif /* CONFIG_CIFS_STATS2 */
+	}
+
+	return ret_buf;
+}
+
+void
+cifs_buf_release(void *buf_to_free)
+{
+	if (buf_to_free == NULL) {
+		/* cifs_dbg(FYI, "Null buffer passed to cifs_buf_release\n");*/
+		return;
+	}
+	mempool_free(buf_to_free, cifs_req_poolp);
+
+	atomic_dec(&bufAllocCount);
+	return;
+}
+
+struct smb_hdr *
+cifs_small_buf_get(void)
+{
+	struct smb_hdr *ret_buf = NULL;
+
+/* We could use negotiated size instead of max_msgsize -
+   but it may be more efficient to always alloc same size
+   albeit slightly larger than necessary and maxbuffersize
+   defaults to this and can not be bigger */
+	ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
+	if (ret_buf) {
+	/* No need to clear memory here, cleared in header assemble */
+	/*	memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
+		atomic_inc(&smBufAllocCount);
+#ifdef CONFIG_CIFS_STATS2
+		atomic_inc(&totSmBufAllocCount);
+#endif /* CONFIG_CIFS_STATS2 */
+
+	}
+	return ret_buf;
+}
+
+void
+cifs_small_buf_release(void *buf_to_free)
+{
+
+	if (buf_to_free == NULL) {
+		cifs_dbg(FYI, "Null buffer passed to cifs_small_buf_release\n");
+		return;
+	}
+	mempool_free(buf_to_free, cifs_sm_req_poolp);
+
+	atomic_dec(&smBufAllocCount);
+	return;
+}
+
+void
+free_rsp_buf(int resp_buftype, void *rsp)
+{
+	if (resp_buftype == CIFS_SMALL_BUFFER)
+		cifs_small_buf_release(rsp);
+	else if (resp_buftype == CIFS_LARGE_BUFFER)
+		cifs_buf_release(rsp);
+}
+
+/* NB: MID can not be set if treeCon not passed in, in that
+   case it is responsbility of caller to set the mid */
+void
+header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
+		const struct cifs_tcon *treeCon, int word_count
+		/* length of fixed section (word count) in two byte units  */)
+{
+	char *temp = (char *) buffer;
+
+	memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
+
+	buffer->smb_buf_length = cpu_to_be32(
+	    (2 * word_count) + sizeof(struct smb_hdr) -
+	    4 /*  RFC 1001 length field does not count */  +
+	    2 /* for bcc field itself */) ;
+
+	buffer->Protocol[0] = 0xFF;
+	buffer->Protocol[1] = 'S';
+	buffer->Protocol[2] = 'M';
+	buffer->Protocol[3] = 'B';
+	buffer->Command = smb_command;
+	buffer->Flags = 0x00;	/* case sensitive */
+	buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
+	buffer->Pid = cpu_to_le16((__u16)current->tgid);
+	buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
+	if (treeCon) {
+		buffer->Tid = treeCon->tid;
+		if (treeCon->ses) {
+			if (treeCon->ses->capabilities & CAP_UNICODE)
+				buffer->Flags2 |= SMBFLG2_UNICODE;
+			if (treeCon->ses->capabilities & CAP_STATUS32)
+				buffer->Flags2 |= SMBFLG2_ERR_STATUS;
+
+			/* Uid is not converted */
+			buffer->Uid = treeCon->ses->Suid;
+			buffer->Mid = get_next_mid(treeCon->ses->server);
+		}
+		if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
+			buffer->Flags2 |= SMBFLG2_DFS;
+		if (treeCon->nocase)
+			buffer->Flags  |= SMBFLG_CASELESS;
+		if ((treeCon->ses) && (treeCon->ses->server))
+			if (treeCon->ses->server->sign)
+				buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+	}
+
+/*  endian conversion of flags is now done just before sending */
+	buffer->WordCount = (char) word_count;
+	return;
+}
+
+static int
+check_smb_hdr(struct smb_hdr *smb)
+{
+	/* does it have the right SMB "signature" ? */
+	if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
+		cifs_dbg(VFS, "Bad protocol string signature header 0x%x\n",
+			 *(unsigned int *)smb->Protocol);
+		return 1;
+	}
+
+	/* if it's a response then accept */
+	if (smb->Flags & SMBFLG_RESPONSE)
+		return 0;
+
+	/* only one valid case where server sends us request */
+	if (smb->Command == SMB_COM_LOCKING_ANDX)
+		return 0;
+
+	cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
+		 get_mid(smb));
+	return 1;
+}
+
+int
+checkSMB(char *buf, unsigned int total_read)
+{
+	struct smb_hdr *smb = (struct smb_hdr *)buf;
+	__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
+	__u32 clc_len;  /* calculated length */
+	cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n",
+		 total_read, rfclen);
+
+	/* is this frame too small to even get to a BCC? */
+	if (total_read < 2 + sizeof(struct smb_hdr)) {
+		if ((total_read >= sizeof(struct smb_hdr) - 1)
+			    && (smb->Status.CifsError != 0)) {
+			/* it's an error return */
+			smb->WordCount = 0;
+			/* some error cases do not return wct and bcc */
+			return 0;
+		} else if ((total_read == sizeof(struct smb_hdr) + 1) &&
+				(smb->WordCount == 0)) {
+			char *tmp = (char *)smb;
+			/* Need to work around a bug in two servers here */
+			/* First, check if the part of bcc they sent was zero */
+			if (tmp[sizeof(struct smb_hdr)] == 0) {
+				/* some servers return only half of bcc
+				 * on simple responses (wct, bcc both zero)
+				 * in particular have seen this on
+				 * ulogoffX and FindClose. This leaves
+				 * one byte of bcc potentially unitialized
+				 */
+				/* zero rest of bcc */
+				tmp[sizeof(struct smb_hdr)+1] = 0;
+				return 0;
+			}
+			cifs_dbg(VFS, "rcvd invalid byte count (bcc)\n");
+		} else {
+			cifs_dbg(VFS, "Length less than smb header size\n");
+		}
+		return -EIO;
+	}
+
+	/* otherwise, there is enough to get to the BCC */
+	if (check_smb_hdr(smb))
+		return -EIO;
+	clc_len = smbCalcSize(smb);
+
+	if (4 + rfclen != total_read) {
+		cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
+			 rfclen);
+		return -EIO;
+	}
+
+	if (4 + rfclen != clc_len) {
+		__u16 mid = get_mid(smb);
+		/* check if bcc wrapped around for large read responses */
+		if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
+			/* check if lengths match mod 64K */
+			if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
+				return 0; /* bcc wrapped */
+		}
+		cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n",
+			 clc_len, 4 + rfclen, mid);
+
+		if (4 + rfclen < clc_len) {
+			cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n",
+				 rfclen, mid);
+			return -EIO;
+		} else if (rfclen > clc_len + 512) {
+			/*
+			 * Some servers (Windows XP in particular) send more
+			 * data than the lengths in the SMB packet would
+			 * indicate on certain calls (byte range locks and
+			 * trans2 find first calls in particular). While the
+			 * client can handle such a frame by ignoring the
+			 * trailing data, we choose limit the amount of extra
+			 * data to 512 bytes.
+			 */
+			cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n",
+				 rfclen, mid);
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
+bool
+is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
+{
+	struct smb_hdr *buf = (struct smb_hdr *)buffer;
+	struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
+	struct list_head *tmp, *tmp1, *tmp2;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct cifsInodeInfo *pCifsInode;
+	struct cifsFileInfo *netfile;
+
+	cifs_dbg(FYI, "Checking for oplock break or dnotify response\n");
+	if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
+	   (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
+		struct smb_com_transaction_change_notify_rsp *pSMBr =
+			(struct smb_com_transaction_change_notify_rsp *)buf;
+		struct file_notify_information *pnotify;
+		__u32 data_offset = 0;
+		if (get_bcc(buf) > sizeof(struct file_notify_information)) {
+			data_offset = le32_to_cpu(pSMBr->DataOffset);
+
+			pnotify = (struct file_notify_information *)
+				((char *)&pSMBr->hdr.Protocol + data_offset);
+			cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n",
+				 pnotify->FileName, pnotify->Action);
+			/*   cifs_dump_mem("Rcvd notify Data: ",buf,
+				sizeof(struct smb_hdr)+60); */
+			return true;
+		}
+		if (pSMBr->hdr.Status.CifsError) {
+			cifs_dbg(FYI, "notify err 0x%x\n",
+				 pSMBr->hdr.Status.CifsError);
+			return true;
+		}
+		return false;
+	}
+	if (pSMB->hdr.Command != SMB_COM_LOCKING_ANDX)
+		return false;
+	if (pSMB->hdr.Flags & SMBFLG_RESPONSE) {
+		/* no sense logging error on invalid handle on oplock
+		   break - harmless race between close request and oplock
+		   break response is expected from time to time writing out
+		   large dirty files cached on the client */
+		if ((NT_STATUS_INVALID_HANDLE) ==
+		   le32_to_cpu(pSMB->hdr.Status.CifsError)) {
+			cifs_dbg(FYI, "invalid handle on oplock break\n");
+			return true;
+		} else if (ERRbadfid ==
+		   le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
+			return true;
+		} else {
+			return false; /* on valid oplock brk we get "request" */
+		}
+	}
+	if (pSMB->hdr.WordCount != 8)
+		return false;
+
+	cifs_dbg(FYI, "oplock type 0x%x level 0x%x\n",
+		 pSMB->LockType, pSMB->OplockLevel);
+	if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
+		return false;
+
+	/* look up tcon based on tid & uid */
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &srv->smb_ses_list) {
+		ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+		list_for_each(tmp1, &ses->tcon_list) {
+			tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+			if (tcon->tid != buf->Tid)
+				continue;
+
+			cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
+			spin_lock(&cifs_file_list_lock);
+			list_for_each(tmp2, &tcon->openFileList) {
+				netfile = list_entry(tmp2, struct cifsFileInfo,
+						     tlist);
+				if (pSMB->Fid != netfile->fid.netfid)
+					continue;
+
+				cifs_dbg(FYI, "file id match, oplock break\n");
+				pCifsInode = CIFS_I(d_inode(netfile->dentry));
+
+				set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+					&pCifsInode->flags);
+
+				/*
+				 * Set flag if the server downgrades the oplock
+				 * to L2 else clear.
+				 */
+				if (pSMB->OplockLevel)
+					set_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &pCifsInode->flags);
+				else
+					clear_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &pCifsInode->flags);
+
+				queue_work(cifsiod_wq,
+					   &netfile->oplock_break);
+				netfile->oplock_break_cancelled = false;
+
+				spin_unlock(&cifs_file_list_lock);
+				spin_unlock(&cifs_tcp_ses_lock);
+				return true;
+			}
+			spin_unlock(&cifs_file_list_lock);
+			spin_unlock(&cifs_tcp_ses_lock);
+			cifs_dbg(FYI, "No matching file for oplock break\n");
+			return true;
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+	cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
+	return true;
+}
+
+void
+dump_smb(void *buf, int smb_buf_length)
+{
+	if (traceSMB == 0)
+		return;
+
+	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 8, 2, buf,
+		       smb_buf_length, true);
+}
+
+void
+cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
+{
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
+		cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n",
+			 cifs_sb_master_tcon(cifs_sb)->treeName);
+	}
+}
+
+void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
+{
+	oplock &= 0xF;
+
+	if (oplock == OPLOCK_EXCLUSIVE) {
+		cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;
+		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
+	} else if (oplock == OPLOCK_READ) {
+		cinode->oplock = CIFS_CACHE_READ_FLG;
+		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
+	} else
+		cinode->oplock = 0;
+}
+
+/*
+ * We wait for oplock breaks to be processed before we attempt to perform
+ * writes.
+ */
+int cifs_get_writer(struct cifsInodeInfo *cinode)
+{
+	int rc;
+
+start:
+	rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
+			 TASK_KILLABLE);
+	if (rc)
+		return rc;
+
+	spin_lock(&cinode->writers_lock);
+	if (!cinode->writers)
+		set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+	cinode->writers++;
+	/* Check to see if we have started servicing an oplock break */
+	if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) {
+		cinode->writers--;
+		if (cinode->writers == 0) {
+			clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+			wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+		}
+		spin_unlock(&cinode->writers_lock);
+		goto start;
+	}
+	spin_unlock(&cinode->writers_lock);
+	return 0;
+}
+
+void cifs_put_writer(struct cifsInodeInfo *cinode)
+{
+	spin_lock(&cinode->writers_lock);
+	cinode->writers--;
+	if (cinode->writers == 0) {
+		clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+		wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+	}
+	spin_unlock(&cinode->writers_lock);
+}
+
+void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
+{
+	clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+	wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK);
+}
+
+bool
+backup_cred(struct cifs_sb_info *cifs_sb)
+{
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
+		if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid()))
+			return true;
+	}
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
+		if (in_group_p(cifs_sb->mnt_backupgid))
+			return true;
+	}
+
+	return false;
+}
+
+void
+cifs_del_pending_open(struct cifs_pending_open *open)
+{
+	spin_lock(&cifs_file_list_lock);
+	list_del(&open->olist);
+	spin_unlock(&cifs_file_list_lock);
+}
+
+void
+cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink,
+			     struct cifs_pending_open *open)
+{
+#ifdef CONFIG_CIFS_SMB2
+	memcpy(open->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE);
+#endif
+	open->oplock = CIFS_OPLOCK_NO_CHANGE;
+	open->tlink = tlink;
+	fid->pending_open = open;
+	list_add_tail(&open->olist, &tlink_tcon(tlink)->pending_opens);
+}
+
+void
+cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink,
+		      struct cifs_pending_open *open)
+{
+	spin_lock(&cifs_file_list_lock);
+	cifs_add_pending_open_locked(fid, tlink, open);
+	spin_unlock(&cifs_file_list_lock);
+}
diff --git a/kernel/fs/cifs/netmisc.c b/kernel/fs/cifs/netmisc.c
new file mode 100644
index 000000000..abae6dd2c
--- /dev/null
+++ b/kernel/fs/cifs/netmisc.c
@@ -0,0 +1,1014 @@
+/*
+ *   fs/cifs/netmisc.c
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   Error mapping routines from Samba libsmb/errormap.c
+ *   Copyright (C) Andrew Tridgell 2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/net.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <asm/div64.h>
+#include <asm/byteorder.h>
+#include <linux/inet.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "smberr.h"
+#include "cifs_debug.h"
+#include "nterr.h"
+
+struct smb_to_posix_error {
+	__u16 smb_err;
+	int posix_code;
+};
+
+static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
+	{ERRbadfunc, -EINVAL},
+	{ERRbadfile, -ENOENT},
+	{ERRbadpath, -ENOTDIR},
+	{ERRnofids, -EMFILE},
+	{ERRnoaccess, -EACCES},
+	{ERRbadfid, -EBADF},
+	{ERRbadmcb, -EIO},
+	{ERRnomem, -EREMOTEIO},
+	{ERRbadmem, -EFAULT},
+	{ERRbadenv, -EFAULT},
+	{ERRbadformat, -EINVAL},
+	{ERRbadaccess, -EACCES},
+	{ERRbaddata, -EIO},
+	{ERRbaddrive, -ENXIO},
+	{ERRremcd, -EACCES},
+	{ERRdiffdevice, -EXDEV},
+	{ERRnofiles, -ENOENT},
+	{ERRwriteprot, -EROFS},
+	{ERRbadshare, -EBUSY},
+	{ERRlock, -EACCES},
+	{ERRunsup, -EINVAL},
+	{ERRnosuchshare, -ENXIO},
+	{ERRfilexists, -EEXIST},
+	{ERRinvparm, -EINVAL},
+	{ERRdiskfull, -ENOSPC},
+	{ERRinvname, -ENOENT},
+	{ERRinvlevel, -EOPNOTSUPP},
+	{ERRdirnotempty, -ENOTEMPTY},
+	{ERRnotlocked, -ENOLCK},
+	{ERRcancelviolation, -ENOLCK},
+	{ERRalreadyexists, -EEXIST},
+	{ERRmoredata, -EOVERFLOW},
+	{ERReasnotsupported, -EOPNOTSUPP},
+	{ErrQuota, -EDQUOT},
+	{ErrNotALink, -ENOLINK},
+	{ERRnetlogonNotStarted, -ENOPROTOOPT},
+	{ERRsymlink, -EOPNOTSUPP},
+	{ErrTooManyLinks, -EMLINK},
+	{0, 0}
+};
+
+static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
+	{ERRerror, -EIO},
+	{ERRbadpw, -EACCES},  /* was EPERM */
+	{ERRbadtype, -EREMOTE},
+	{ERRaccess, -EACCES},
+	{ERRinvtid, -ENXIO},
+	{ERRinvnetname, -ENXIO},
+	{ERRinvdevice, -ENXIO},
+	{ERRqfull, -ENOSPC},
+	{ERRqtoobig, -ENOSPC},
+	{ERRqeof, -EIO},
+	{ERRinvpfid, -EBADF},
+	{ERRsmbcmd, -EBADRQC},
+	{ERRsrverror, -EIO},
+	{ERRbadBID, -EIO},
+	{ERRfilespecs, -EINVAL},
+	{ERRbadLink, -EIO},
+	{ERRbadpermits, -EINVAL},
+	{ERRbadPID, -ESRCH},
+	{ERRsetattrmode, -EINVAL},
+	{ERRpaused, -EHOSTDOWN},
+	{ERRmsgoff, -EHOSTDOWN},
+	{ERRnoroom, -ENOSPC},
+	{ERRrmuns, -EUSERS},
+	{ERRtimeout, -ETIME},
+	{ERRnoresource, -EREMOTEIO},
+	{ERRtoomanyuids, -EUSERS},
+	{ERRbaduid, -EACCES},
+	{ERRusempx, -EIO},
+	{ERRusestd, -EIO},
+	{ERR_NOTIFY_ENUM_DIR, -ENOBUFS},
+	{ERRnoSuchUser, -EACCES},
+/*	{ERRaccountexpired, -EACCES},
+	{ERRbadclient, -EACCES},
+	{ERRbadLogonTime, -EACCES},
+	{ERRpasswordExpired, -EACCES},*/
+	{ERRaccountexpired, -EKEYEXPIRED},
+	{ERRbadclient, -EACCES},
+	{ERRbadLogonTime, -EACCES},
+	{ERRpasswordExpired, -EKEYEXPIRED},
+
+	{ERRnosupport, -EINVAL},
+	{0, 0}
+};
+
+static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
+	{0, 0}
+};
+
+/*
+ * Convert a string containing text IPv4 or IPv6 address to binary form.
+ *
+ * Returns 0 on failure.
+ */
+static int
+cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
+{
+	int ret = 0;
+
+	/* calculate length by finding first slash or NULL */
+	if (address_family == AF_INET)
+		ret = in4_pton(cp, len, dst, '\\', NULL);
+	else if (address_family == AF_INET6)
+		ret = in6_pton(cp, len, dst , '\\', NULL);
+
+	cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n",
+		 ret, len, len, cp);
+	if (ret > 0)
+		ret = 1;
+	return ret;
+}
+
+/*
+ * Try to convert a string to an IPv4 address and then attempt to convert
+ * it to an IPv6 address if that fails. Set the family field if either
+ * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to
+ * treat the part following it as a numeric sin6_scope_id.
+ *
+ * Returns 0 on failure.
+ */
+int
+cifs_convert_address(struct sockaddr *dst, const char *src, int len)
+{
+	int rc, alen, slen;
+	const char *pct;
+	char scope_id[13];
+	struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
+	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
+
+	/* IPv4 address */
+	if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) {
+		s4->sin_family = AF_INET;
+		return 1;
+	}
+
+	/* attempt to exclude the scope ID from the address part */
+	pct = memchr(src, '%', len);
+	alen = pct ? pct - src : len;
+
+	rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr);
+	if (!rc)
+		return rc;
+
+	s6->sin6_family = AF_INET6;
+	if (pct) {
+		/* grab the scope ID */
+		slen = len - (alen + 1);
+		if (slen <= 0 || slen > 12)
+			return 0;
+		memcpy(scope_id, pct + 1, slen);
+		scope_id[slen] = '\0';
+
+		rc = kstrtouint(scope_id, 0, &s6->sin6_scope_id);
+		rc = (rc == 0) ? 1 : 0;
+	}
+
+	return rc;
+}
+
+void
+cifs_set_port(struct sockaddr *addr, const unsigned short int port)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)addr)->sin_port = htons(port);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
+		break;
+	}
+}
+
+/*****************************************************************************
+convert a NT status code to a dos class/code
+ *****************************************************************************/
+/* NT status -> dos error map */
+static const struct {
+	__u8 dos_class;
+	__u16 dos_code;
+	__u32 ntstatus;
+} ntstatus_to_dos_map[] = {
+	{
+	ERRDOS, ERRgeneral, NT_STATUS_UNSUCCESSFUL}, {
+	ERRDOS, ERRbadfunc, NT_STATUS_NOT_IMPLEMENTED}, {
+	ERRDOS, ERRinvlevel, NT_STATUS_INVALID_INFO_CLASS}, {
+	ERRDOS, 24, NT_STATUS_INFO_LENGTH_MISMATCH}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ACCESS_VIOLATION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_IN_PAGE_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA}, {
+	ERRDOS, ERRbadfid, NT_STATUS_INVALID_HANDLE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_INITIAL_STACK}, {
+	ERRDOS, 193, NT_STATUS_BAD_INITIAL_PC}, {
+	ERRDOS, 87, NT_STATUS_INVALID_CID}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TIMER_NOT_CANCELED}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER}, {
+	ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_DEVICE}, {
+	ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_FILE}, {
+	ERRDOS, ERRbadfunc, NT_STATUS_INVALID_DEVICE_REQUEST}, {
+	ERRDOS, 38, NT_STATUS_END_OF_FILE}, {
+	ERRDOS, 34, NT_STATUS_WRONG_VOLUME}, {
+	ERRDOS, 21, NT_STATUS_NO_MEDIA_IN_DEVICE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_MEDIA}, {
+	ERRDOS, 27, NT_STATUS_NONEXISTENT_SECTOR},
+/*	{ This NT error code was 'sqashed'
+	 from NT_STATUS_MORE_PROCESSING_REQUIRED to NT_STATUS_OK
+	 during the session setup } */
+	{
+	ERRDOS, ERRnomem, NT_STATUS_NO_MEMORY}, {
+	ERRDOS, 487, NT_STATUS_CONFLICTING_ADDRESSES}, {
+	ERRDOS, 487, NT_STATUS_NOT_MAPPED_VIEW}, {
+	ERRDOS, 87, NT_STATUS_UNABLE_TO_FREE_VM}, {
+	ERRDOS, 87, NT_STATUS_UNABLE_TO_DELETE_SECTION}, {
+	ERRDOS, 2142, NT_STATUS_INVALID_SYSTEM_SERVICE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_INSTRUCTION}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_INVALID_LOCK_SEQUENCE}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_INVALID_VIEW_SIZE}, {
+	ERRDOS, 193, NT_STATUS_INVALID_FILE_FOR_SECTION}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_ALREADY_COMMITTED},
+/*	{ This NT error code was 'sqashed'
+	 from NT_STATUS_ACCESS_DENIED to NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE
+	 during the session setup }   */
+	{
+	ERRDOS, ERRnoaccess, NT_STATUS_ACCESS_DENIED}, {
+	ERRDOS, 111, NT_STATUS_BUFFER_TOO_SMALL}, {
+	ERRDOS, ERRbadfid, NT_STATUS_OBJECT_TYPE_MISMATCH}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NONCONTINUABLE_EXCEPTION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_DISPOSITION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNWIND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_STACK}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_UNWIND_TARGET}, {
+	ERRDOS, 158, NT_STATUS_NOT_LOCKED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PARITY_ERROR}, {
+	ERRDOS, 487, NT_STATUS_UNABLE_TO_DECOMMIT_VM}, {
+	ERRDOS, 487, NT_STATUS_NOT_COMMITTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_PORT_ATTRIBUTES}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PORT_MESSAGE_TOO_LONG}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_MIX}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_QUOTA_LOWER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DISK_CORRUPT_ERROR}, {
+	 /* mapping changed since shell does lookup on * expects FileNotFound */
+	ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_INVALID}, {
+	ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_NOT_FOUND}, {
+	ERRDOS, ERRalreadyexists, NT_STATUS_OBJECT_NAME_COLLISION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_HANDLE_NOT_WAITABLE}, {
+	ERRDOS, ERRbadfid, NT_STATUS_PORT_DISCONNECTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DEVICE_ALREADY_ATTACHED}, {
+	ERRDOS, 161, NT_STATUS_OBJECT_PATH_INVALID}, {
+	ERRDOS, ERRbadpath, NT_STATUS_OBJECT_PATH_NOT_FOUND}, {
+	ERRDOS, 161, NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DATA_OVERRUN}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DATA_LATE_ERROR}, {
+	ERRDOS, 23, NT_STATUS_DATA_ERROR}, {
+	ERRDOS, 23, NT_STATUS_CRC_ERROR}, {
+	ERRDOS, ERRnomem, NT_STATUS_SECTION_TOO_BIG}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_PORT_CONNECTION_REFUSED}, {
+	ERRDOS, ERRbadfid, NT_STATUS_INVALID_PORT_HANDLE}, {
+	ERRDOS, ERRbadshare, NT_STATUS_SHARING_VIOLATION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_QUOTA_EXCEEDED}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PAGE_PROTECTION}, {
+	ERRDOS, 288, NT_STATUS_MUTANT_NOT_OWNED}, {
+	ERRDOS, 298, NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, {
+	ERRDOS, 87, NT_STATUS_PORT_ALREADY_SET}, {
+	ERRDOS, 87, NT_STATUS_SECTION_NOT_IMAGE}, {
+	ERRDOS, 156, NT_STATUS_SUSPEND_COUNT_EXCEEDED}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_THREAD_IS_TERMINATING}, {
+	ERRDOS, 87, NT_STATUS_BAD_WORKING_SET_LIMIT}, {
+	ERRDOS, 87, NT_STATUS_INCOMPATIBLE_FILE_MAP}, {
+	ERRDOS, 87, NT_STATUS_SECTION_PROTECTION}, {
+	ERRDOS, ERReasnotsupported, NT_STATUS_EAS_NOT_SUPPORTED}, {
+	ERRDOS, 255, NT_STATUS_EA_TOO_LARGE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NONEXISTENT_EA_ENTRY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_EAS_ON_FILE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_EA_CORRUPT_ERROR}, {
+	ERRDOS, ERRlock, NT_STATUS_FILE_LOCK_CONFLICT}, {
+	ERRDOS, ERRlock, NT_STATUS_LOCK_NOT_GRANTED}, {
+	ERRDOS, ERRbadfile, NT_STATUS_DELETE_PENDING}, {
+	ERRDOS, ERRunsup, NT_STATUS_CTL_FILE_NOT_SUPPORTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNKNOWN_REVISION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_REVISION_MISMATCH}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_OWNER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_PRIMARY_GROUP}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_IMPERSONATION_TOKEN}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CANT_DISABLE_MANDATORY}, {
+	ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, {
+	ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS},
+/*	{ This NT error code was 'sqashed'
+	 from NT_STATUS_NO_SUCH_USER to NT_STATUS_LOGON_FAILURE
+	 during the session setup } */
+	{
+	ERRDOS, ERRnoaccess, NT_STATUS_NO_SUCH_USER}, { /* could map to 2238 */
+	ERRHRD, ERRgeneral, NT_STATUS_GROUP_EXISTS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_GROUP}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_GROUP}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_GROUP}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LAST_ADMIN},
+/*	{ This NT error code was 'sqashed'
+	 from NT_STATUS_WRONG_PASSWORD to NT_STATUS_LOGON_FAILURE
+	 during the session setup } */
+	{
+	ERRSRV, ERRbadpw, NT_STATUS_WRONG_PASSWORD}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_PASSWORD}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PASSWORD_RESTRICTION}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_LOGON_FAILURE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, {
+	ERRSRV, ERRbadLogonTime, NT_STATUS_INVALID_LOGON_HOURS}, {
+	ERRSRV, ERRbadclient, NT_STATUS_INVALID_WORKSTATION}, {
+	ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, {
+	ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_DISABLED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LUIDS_EXHAUSTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_SUB_AUTHORITY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACL}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_SID}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_SECURITY_DESCR}, {
+	ERRDOS, 127, NT_STATUS_PROCEDURE_NOT_FOUND}, {
+	ERRDOS, 193, NT_STATUS_INVALID_IMAGE_FORMAT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_TOKEN}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_INHERITANCE_ACL}, {
+	ERRDOS, 158, NT_STATUS_RANGE_NOT_LOCKED}, {
+	ERRDOS, 112, NT_STATUS_DISK_FULL}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SERVER_DISABLED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SERVER_NOT_DISABLED}, {
+	ERRDOS, 68, NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, {
+	ERRDOS, 259, NT_STATUS_GUIDS_EXHAUSTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_ID_AUTHORITY}, {
+	ERRDOS, 259, NT_STATUS_AGENTS_EXHAUSTED}, {
+	ERRDOS, 154, NT_STATUS_INVALID_VOLUME_LABEL}, {
+	ERRDOS, 14, NT_STATUS_SECTION_NOT_EXTENDED}, {
+	ERRDOS, 487, NT_STATUS_NOT_MAPPED_DATA}, {
+	ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_DATA_NOT_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_NAME_NOT_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DENORMAL_OPERAND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INEXACT_RESULT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INVALID_OPERATION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOAT_OVERFLOW}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOAT_STACK_CHECK}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOAT_UNDERFLOW}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, {
+	ERRDOS, 534, NT_STATUS_INTEGER_OVERFLOW}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PRIVILEGED_INSTRUCTION}, {
+	ERRDOS, ERRnomem, NT_STATUS_TOO_MANY_PAGING_FILES}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FILE_INVALID}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ALLOTTED_SPACE_EXCEEDED},
+/*	{ This NT error code was 'sqashed'
+	 from NT_STATUS_INSUFFICIENT_RESOURCES to
+	 NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */
+	{
+	ERRDOS, ERRnoresource, NT_STATUS_INSUFFICIENT_RESOURCES}, {
+	ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, {
+	ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, {
+	ERRDOS, 21, NT_STATUS_DEVICE_POWER_FAILURE}, {
+	ERRDOS, 487, NT_STATUS_FREE_VM_NOT_AT_BASE}, {
+	ERRDOS, 487, NT_STATUS_MEMORY_NOT_ALLOCATED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_WORKING_SET_QUOTA}, {
+	ERRDOS, 19, NT_STATUS_MEDIA_WRITE_PROTECTED}, {
+	ERRDOS, 21, NT_STATUS_DEVICE_NOT_READY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_GROUP_ATTRIBUTES}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_IMPERSONATION_LEVEL}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CANT_OPEN_ANONYMOUS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_VALIDATION_CLASS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_TOKEN_TYPE}, {
+	ERRDOS, 87, NT_STATUS_BAD_MASTER_BOOT_RECORD}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INSTRUCTION_MISALIGNMENT}, {
+	ERRDOS, ERRpipebusy, NT_STATUS_INSTANCE_NOT_AVAILABLE}, {
+	ERRDOS, ERRpipebusy, NT_STATUS_PIPE_NOT_AVAILABLE}, {
+	ERRDOS, ERRbadpipe, NT_STATUS_INVALID_PIPE_STATE}, {
+	ERRDOS, ERRpipebusy, NT_STATUS_PIPE_BUSY}, {
+	ERRDOS, ERRbadfunc, NT_STATUS_ILLEGAL_FUNCTION}, {
+	ERRDOS, ERRnotconnected, NT_STATUS_PIPE_DISCONNECTED}, {
+	ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_CLOSING}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PIPE_CONNECTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PIPE_LISTENING}, {
+	ERRDOS, ERRbadpipe, NT_STATUS_INVALID_READ_MODE}, {
+	ERRDOS, 121, NT_STATUS_IO_TIMEOUT}, {
+	ERRDOS, 38, NT_STATUS_FILE_FORCED_CLOSED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STARTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STOPPED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_COULD_NOT_INTERPRET}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_FILE_IS_A_DIRECTORY}, {
+	ERRDOS, ERRunsup, NT_STATUS_NOT_SUPPORTED}, {
+	ERRDOS, 51, NT_STATUS_REMOTE_NOT_LISTENING}, {
+	ERRDOS, 52, NT_STATUS_DUPLICATE_NAME}, {
+	ERRDOS, 53, NT_STATUS_BAD_NETWORK_PATH}, {
+	ERRDOS, 54, NT_STATUS_NETWORK_BUSY}, {
+	ERRDOS, 55, NT_STATUS_DEVICE_DOES_NOT_EXIST}, {
+	ERRDOS, 56, NT_STATUS_TOO_MANY_COMMANDS}, {
+	ERRDOS, 57, NT_STATUS_ADAPTER_HARDWARE_ERROR}, {
+	ERRDOS, 58, NT_STATUS_INVALID_NETWORK_RESPONSE}, {
+	ERRDOS, 59, NT_STATUS_UNEXPECTED_NETWORK_ERROR}, {
+	ERRDOS, 60, NT_STATUS_BAD_REMOTE_ADAPTER}, {
+	ERRDOS, 61, NT_STATUS_PRINT_QUEUE_FULL}, {
+	ERRDOS, 62, NT_STATUS_NO_SPOOL_SPACE}, {
+	ERRDOS, 63, NT_STATUS_PRINT_CANCELLED}, {
+	ERRDOS, 64, NT_STATUS_NETWORK_NAME_DELETED}, {
+	ERRDOS, 65, NT_STATUS_NETWORK_ACCESS_DENIED}, {
+	ERRDOS, 66, NT_STATUS_BAD_DEVICE_TYPE}, {
+	ERRDOS, ERRnosuchshare, NT_STATUS_BAD_NETWORK_NAME}, {
+	ERRDOS, 68, NT_STATUS_TOO_MANY_NAMES}, {
+	ERRDOS, 69, NT_STATUS_TOO_MANY_SESSIONS}, {
+	ERRDOS, 70, NT_STATUS_SHARING_PAUSED}, {
+	ERRDOS, 71, NT_STATUS_REQUEST_NOT_ACCEPTED}, {
+	ERRDOS, 72, NT_STATUS_REDIRECTOR_PAUSED}, {
+	ERRDOS, 88, NT_STATUS_NET_WRITE_FAULT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PROFILING_AT_LIMIT}, {
+	ERRDOS, ERRdiffdevice, NT_STATUS_NOT_SAME_DEVICE}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_FILE_RENAMED}, {
+	ERRDOS, 240, NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_SECURITY_ON_OBJECT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CANT_WAIT}, {
+	ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_EMPTY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CANT_TERMINATE_SELF}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_SERVER_STATE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_STATE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_ROLE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_DOMAIN}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_EXISTS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, {
+	ERRDOS, 300, NT_STATUS_OPLOCK_NOT_GRANTED}, {
+	ERRDOS, 301, NT_STATUS_INVALID_OPLOCK_PROTOCOL}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_CORRUPTION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_GENERIC_NOT_MAPPED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_DESCRIPTOR_FORMAT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_USER_BUFFER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_IO_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NOT_LOGON_PROCESS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_EXISTS}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_1}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_2}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_3}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_4}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_5}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_6}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_7}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_8}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_9}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_10}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_11}, {
+	ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_12}, {
+	ERRDOS, ERRbadpath, NT_STATUS_REDIRECTOR_NOT_STARTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_REDIRECTOR_STARTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PACKAGE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_FUNCTION_TABLE}, {
+	ERRDOS, 203, 0xc0000100}, {
+	ERRDOS, 145, NT_STATUS_DIRECTORY_NOT_EMPTY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FILE_CORRUPT_ERROR}, {
+	ERRDOS, 267, NT_STATUS_NOT_A_DIRECTORY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_LOGON_SESSION_STATE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_COLLISION}, {
+	ERRDOS, 206, NT_STATUS_NAME_TOO_LONG}, {
+	ERRDOS, 2401, NT_STATUS_FILES_OPEN}, {
+	ERRDOS, 2404, NT_STATUS_CONNECTION_IN_USE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MESSAGE_NOT_FOUND}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_PROCESS_IS_TERMINATING}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_LOGON_TYPE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_GUID_TRANSLATION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CANNOT_IMPERSONATE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_IMAGE_ALREADY_LOADED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_PRESENT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_NOT_EXIST}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_ALREADY_OWNED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_LID_OWNER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_COMMAND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_LID}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_SELECTOR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_LDT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_SIZE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_OFFSET}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_DESCRIPTOR}, {
+	ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NE_FORMAT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_RXACT_INVALID_STATE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_RXACT_COMMIT_FAILURE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MAPPED_FILE_SIZE_ZERO}, {
+	ERRDOS, ERRnofids, NT_STATUS_TOO_MANY_OPENED_FILES}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CANCELLED}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_CANNOT_DELETE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_COMPUTER_NAME}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_FILE_DELETED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_ACCOUNT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_GROUP}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_USER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MEMBERS_PRIMARY_GROUP}, {
+	ERRDOS, ERRbadfid, NT_STATUS_FILE_CLOSED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_THREADS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_THREAD_NOT_IN_PROCESS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TOKEN_ALREADY_IN_USE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_COMMITMENT_LIMIT}, {
+	ERRDOS, 193, NT_STATUS_INVALID_IMAGE_LE_FORMAT}, {
+	ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NOT_MZ}, {
+	ERRDOS, 193, NT_STATUS_INVALID_IMAGE_PROTECT}, {
+	ERRDOS, 193, NT_STATUS_INVALID_IMAGE_WIN_16}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LOGON_SERVER_CONFLICT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TIME_DIFFERENCE_AT_DC}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SYNCHRONIZATION_REQUIRED}, {
+	ERRDOS, 126, NT_STATUS_DLL_NOT_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_OPEN_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_IO_PRIVILEGE_FAILED}, {
+	ERRDOS, 182, NT_STATUS_ORDINAL_NOT_FOUND}, {
+	ERRDOS, 127, NT_STATUS_ENTRYPOINT_NOT_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CONTROL_C_EXIT}, {
+	ERRDOS, 64, NT_STATUS_LOCAL_DISCONNECT}, {
+	ERRDOS, 64, NT_STATUS_REMOTE_DISCONNECT}, {
+	ERRDOS, 51, NT_STATUS_REMOTE_RESOURCES}, {
+	ERRDOS, 59, NT_STATUS_LINK_FAILED}, {
+	ERRDOS, 59, NT_STATUS_LINK_TIMEOUT}, {
+	ERRDOS, 59, NT_STATUS_INVALID_CONNECTION}, {
+	ERRDOS, 59, NT_STATUS_INVALID_ADDRESS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DLL_INIT_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MISSING_SYSTEMFILE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNHANDLED_EXCEPTION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_APP_INIT_FAILURE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_CREATE_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_PAGEFILE}, {
+	ERRDOS, 124, NT_STATUS_INVALID_LEVEL}, {
+	ERRDOS, 86, NT_STATUS_WRONG_PASSWORD_CORE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, {
+	ERRDOS, 109, NT_STATUS_PIPE_BROKEN}, {
+	ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_CORRUPT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_IO_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_EVENT_PAIR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_VOLUME}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SERIAL_NO_DEVICE_INITED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_ALIAS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_ALIAS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_ALIAS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ALIAS_EXISTS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LOGON_NOT_GRANTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SECRETS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SECRET_TOO_LONG}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FULLSCREEN_MODE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_CONTEXT_IDS}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_LOGON_TYPE_NOT_GRANTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NOT_REGISTRY_FILE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FT_MISSING_MEMBER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_CHARACTER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNMAPPABLE_CHARACTER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNDEFINED_CHARACTER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_VOLUME}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_WRONG_CYLINDER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_UNKNOWN_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_BAD_REGISTERS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DISK_RECALIBRATE_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DISK_OPERATION_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DISK_RESET_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SHARED_IRQ_BUSY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FT_ORPHANING}, {
+	ERRHRD, ERRgeneral, 0xc000016e}, {
+	ERRHRD, ERRgeneral, 0xc000016f}, {
+	ERRHRD, ERRgeneral, 0xc0000170}, {
+	ERRHRD, ERRgeneral, 0xc0000171}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PARTITION_FAILURE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_BLOCK_LENGTH}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_PARTITIONED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_LOCK_MEDIA}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, {
+	ERRHRD, ERRgeneral, NT_STATUS_EOM_OVERFLOW}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_MEDIA}, {
+	ERRHRD, ERRgeneral, 0xc0000179}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_MEMBER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_MEMBER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_KEY_DELETED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_LOG_SPACE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SIDS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_KEY_HAS_CHILDREN}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CHILD_MUST_BE_VOLATILE}, {
+	ERRDOS, 87, NT_STATUS_DEVICE_CONFIGURATION_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DRIVER_INTERNAL_ERROR}, {
+	ERRDOS, 22, NT_STATUS_INVALID_DEVICE_STATE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_IO_DEVICE_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DEVICE_PROTOCOL_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BACKUP_CONTROLLER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LOG_FILE_FULL}, {
+	ERRDOS, 19, NT_STATUS_TOO_LATE}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_LSA_SECRET},
+/*	{ This NT error code was 'sqashed'
+	 from NT_STATUS_NO_TRUST_SAM_ACCOUNT to
+	 NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE during the session setup } */
+	{
+	ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_SAM_ACCOUNT}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_DOMAIN_FAILURE}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CORRUPT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_CANT_START}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_TRUST_FAILURE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MUTANT_LIMIT_EXCEEDED}, {
+	ERRDOS, ERRnetlogonNotStarted, NT_STATUS_NETLOGON_NOT_STARTED}, {
+	ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_EXPIRED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_POSSIBLE_DEADLOCK}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_REMOTE_SESSION_LIMIT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CHANGED}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT},
+/*	{ This NT error code was 'sqashed'
+	 from NT_STATUS_DOMAIN_TRUST_INCONSISTENT to NT_STATUS_LOGON_FAILURE
+	 during the session setup }  */
+	{
+	ERRDOS, ERRnoaccess, NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FS_DRIVER_REQUIRED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, {
+	ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, {
+	ERRDOS, ERRnoresource, NT_STATUS_INSUFF_SERVER_RESOURCES}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, {
+	ERRDOS, 68, NT_STATUS_TOO_MANY_ADDRESSES}, {
+	ERRDOS, 52, NT_STATUS_ADDRESS_ALREADY_EXISTS}, {
+	ERRDOS, 64, NT_STATUS_ADDRESS_CLOSED}, {
+	ERRDOS, 64, NT_STATUS_CONNECTION_DISCONNECTED}, {
+	ERRDOS, 64, NT_STATUS_CONNECTION_RESET}, {
+	ERRDOS, 68, NT_STATUS_TOO_MANY_NODES}, {
+	ERRDOS, 59, NT_STATUS_TRANSACTION_ABORTED}, {
+	ERRDOS, 59, NT_STATUS_TRANSACTION_TIMED_OUT}, {
+	ERRDOS, 59, NT_STATUS_TRANSACTION_NO_RELEASE}, {
+	ERRDOS, 59, NT_STATUS_TRANSACTION_NO_MATCH}, {
+	ERRDOS, 59, NT_STATUS_TRANSACTION_RESPONDED}, {
+	ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_ID}, {
+	ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_TYPE}, {
+	ERRDOS, ERRunsup, NT_STATUS_NOT_SERVER_SESSION}, {
+	ERRDOS, ERRunsup, NT_STATUS_NOT_CLIENT_SESSION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DEBUG_ATTACH_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_SYSTEM_PROCESS_TERMINATED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DATA_NOT_ACCEPTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_BROWSER_SERVERS_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_VDM_HARD_ERROR}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DRIVER_CANCEL_TIMEOUT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_REPLY_MESSAGE_MISMATCH}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MAPPED_ALIGNMENT}, {
+	ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, {
+	ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, {
+	ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW_READ}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FAIL_CHECK}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DUPLICATE_OBJECTID}, {
+	ERRHRD, ERRgeneral, NT_STATUS_OBJECTID_EXISTS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CONVERT_TO_LARGE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_RETRY}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FOUND_OUT_OF_SCOPE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ALLOCATE_BUCKET}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PROPSET_NOT_FOUND}, {
+	ERRHRD, ERRgeneral, NT_STATUS_MARSHALL_OVERFLOW}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_VARIANT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, {
+	ERRDOS, ERRnoaccess, NT_STATUS_ACCOUNT_LOCKED_OUT}, {
+	ERRDOS, ERRbadfid, NT_STATUS_HANDLE_NOT_CLOSABLE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_REFUSED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_GRACEFUL_DISCONNECT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_NOT_ASSOCIATED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_INVALID}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ACTIVE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NETWORK_UNREACHABLE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_HOST_UNREACHABLE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PROTOCOL_UNREACHABLE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PORT_UNREACHABLE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_REQUEST_ABORTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ABORTED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_COMPRESSION_BUFFER}, {
+	ERRHRD, ERRgeneral, NT_STATUS_USER_MAPPED_FILE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_AUDIT_FAILED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_TIMER_RESOLUTION_NOT_SET}, {
+	ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_COUNT_LIMIT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LOGIN_TIME_RESTRICTION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LOGIN_WKSTA_RESTRICTION}, {
+	ERRDOS, 193, NT_STATUS_IMAGE_MP_UP_MISMATCH}, {
+	ERRHRD, ERRgeneral, 0xc000024a}, {
+	ERRHRD, ERRgeneral, 0xc000024b}, {
+	ERRHRD, ERRgeneral, 0xc000024c}, {
+	ERRHRD, ERRgeneral, 0xc000024d}, {
+	ERRHRD, ERRgeneral, 0xc000024e}, {
+	ERRHRD, ERRgeneral, 0xc000024f}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INSUFFICIENT_LOGON_INFO}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_DLL_ENTRYPOINT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_BAD_SERVICE_ENTRYPOINT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LPC_REPLY_LOST}, {
+	ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT1}, {
+	ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT2}, {
+	ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_QUOTA_LIMIT}, {
+	ERRSRV, 3, NT_STATUS_PATH_NOT_COVERED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_NO_CALLBACK_ACTIVE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_LICENSE_QUOTA_EXCEEDED}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_SHORT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_RECENT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PWD_HISTORY_CONFLICT}, {
+	ERRHRD, ERRgeneral, 0xc000025d}, {
+	ERRHRD, ERRgeneral, NT_STATUS_PLUGPLAY_NO_DEVICE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_UNSUPPORTED_COMPRESSION}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_HW_PROFILE}, {
+	ERRHRD, ERRgeneral, NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, {
+	ERRDOS, 182, NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, {
+	ERRDOS, 127, NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, {
+	ERRDOS, 288, NT_STATUS_RESOURCE_NOT_OWNED}, {
+	ERRDOS, ErrTooManyLinks, NT_STATUS_TOO_MANY_LINKS}, {
+	ERRHRD, ERRgeneral, NT_STATUS_QUOTA_LIST_INCONSISTENT}, {
+	ERRHRD, ERRgeneral, NT_STATUS_FILE_IS_OFFLINE}, {
+	ERRDOS, 21, 0xc000026e}, {
+	ERRDOS, 161, 0xc0000281}, {
+	ERRDOS, ERRnoaccess, 0xc000028a}, {
+	ERRDOS, ERRnoaccess, 0xc000028b}, {
+	ERRHRD, ERRgeneral, 0xc000028c}, {
+	ERRDOS, ERRnoaccess, 0xc000028d}, {
+	ERRDOS, ERRnoaccess, 0xc000028e}, {
+	ERRDOS, ERRnoaccess, 0xc000028f}, {
+	ERRDOS, ERRnoaccess, 0xc0000290}, {
+	ERRDOS, ERRbadfunc, 0xc000029c}, {
+	ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, {
+	ERRDOS, ERRinvlevel, 0x007c0001}, {
+	0, 0, 0 }
+};
+
+/*****************************************************************************
+ Print an error message from the status code
+ *****************************************************************************/
+static void
+cifs_print_status(__u32 status_code)
+{
+	int idx = 0;
+
+	while (nt_errs[idx].nt_errstr != NULL) {
+		if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) ==
+		    (status_code & 0xFFFFFF)) {
+			pr_notice("Status code returned 0x%08x %s\n",
+				  status_code, nt_errs[idx].nt_errstr);
+		}
+		idx++;
+	}
+	return;
+}
+
+
+static void
+ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode)
+{
+	int i;
+	if (ntstatus == 0) {
+		*eclass = 0;
+		*ecode = 0;
+		return;
+	}
+	for (i = 0; ntstatus_to_dos_map[i].ntstatus; i++) {
+		if (ntstatus == ntstatus_to_dos_map[i].ntstatus) {
+			*eclass = ntstatus_to_dos_map[i].dos_class;
+			*ecode = ntstatus_to_dos_map[i].dos_code;
+			return;
+		}
+	}
+	*eclass = ERRHRD;
+	*ecode = ERRgeneral;
+}
+
+int
+map_smb_to_linux_error(char *buf, bool logErr)
+{
+	struct smb_hdr *smb = (struct smb_hdr *)buf;
+	unsigned int i;
+	int rc = -EIO;	/* if transport error smb error may not be set */
+	__u8 smberrclass;
+	__u16 smberrcode;
+
+	/* BB if NT Status codes - map NT BB */
+
+	/* old style smb error codes */
+	if (smb->Status.CifsError == 0)
+		return 0;
+
+	if (smb->Flags2 & SMBFLG2_ERR_STATUS) {
+		/* translate the newer STATUS codes to old style SMB errors
+		 * and then to POSIX errors */
+		__u32 err = le32_to_cpu(smb->Status.CifsError);
+		if (logErr && (err != (NT_STATUS_MORE_PROCESSING_REQUIRED)))
+			cifs_print_status(err);
+		else if (cifsFYI & CIFS_RC)
+			cifs_print_status(err);
+		ntstatus_to_dos(err, &smberrclass, &smberrcode);
+	} else {
+		smberrclass = smb->Status.DosError.ErrorClass;
+		smberrcode = le16_to_cpu(smb->Status.DosError.Error);
+	}
+
+	/* old style errors */
+
+	/* DOS class smb error codes - map DOS */
+	if (smberrclass == ERRDOS) {
+		/* 1 byte field no need to byte reverse */
+		for (i = 0;
+		     i <
+		     sizeof(mapping_table_ERRDOS) /
+		     sizeof(struct smb_to_posix_error); i++) {
+			if (mapping_table_ERRDOS[i].smb_err == 0)
+				break;
+			else if (mapping_table_ERRDOS[i].smb_err ==
+								smberrcode) {
+				rc = mapping_table_ERRDOS[i].posix_code;
+				break;
+			}
+			/* else try next error mapping one to see if match */
+		}
+	} else if (smberrclass == ERRSRV) {
+		/* server class of error codes */
+		for (i = 0;
+		     i <
+		     sizeof(mapping_table_ERRSRV) /
+		     sizeof(struct smb_to_posix_error); i++) {
+			if (mapping_table_ERRSRV[i].smb_err == 0)
+				break;
+			else if (mapping_table_ERRSRV[i].smb_err ==
+								smberrcode) {
+				rc = mapping_table_ERRSRV[i].posix_code;
+				break;
+			}
+			/* else try next error mapping to see if match */
+		}
+	}
+	/* else ERRHRD class errors or junk  - return EIO */
+
+	cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n",
+		 le32_to_cpu(smb->Status.CifsError), rc);
+
+	/* generic corrective action e.g. reconnect SMB session on
+	 * ERRbaduid could be added */
+
+	return rc;
+}
+
+/*
+ * calculate the size of the SMB message based on the fixed header
+ * portion, the number of word parameters and the data portion of the message
+ */
+unsigned int
+smbCalcSize(void *buf)
+{
+	struct smb_hdr *ptr = (struct smb_hdr *)buf;
+	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
+		2 /* size of the bcc field */ + get_bcc(ptr));
+}
+
+/* The following are taken from fs/ntfs/util.c */
+
+#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
+
+/*
+ * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
+ * into Unix UTC (based 1970-01-01, in seconds).
+ */
+struct timespec
+cifs_NTtimeToUnix(__le64 ntutc)
+{
+	struct timespec ts;
+	/* BB what about the timezone? BB */
+
+	/* Subtract the NTFS time offset, then convert to 1s intervals. */
+	s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
+	u64 abs_t;
+
+	/*
+	 * Unfortunately can not use normal 64 bit division on 32 bit arch, but
+	 * the alternative, do_div, does not work with negative numbers so have
+	 * to special case them
+	 */
+	if (t < 0) {
+		abs_t = -t;
+		ts.tv_nsec = (long)(do_div(abs_t, 10000000) * 100);
+		ts.tv_nsec = -ts.tv_nsec;
+		ts.tv_sec = -abs_t;
+	} else {
+		abs_t = t;
+		ts.tv_nsec = (long)do_div(abs_t, 10000000) * 100;
+		ts.tv_sec = abs_t;
+	}
+
+	return ts;
+}
+
+/* Convert the Unix UTC into NT UTC. */
+u64
+cifs_UnixTimeToNT(struct timespec t)
+{
+	/* Convert to 100ns intervals and then add the NTFS time offset. */
+	return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET;
+}
+
+static const int total_days_of_prev_months[] = {
+	0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
+};
+
+struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
+{
+	struct timespec ts;
+	int sec, min, days, month, year;
+	u16 date = le16_to_cpu(le_date);
+	u16 time = le16_to_cpu(le_time);
+	SMB_TIME *st = (SMB_TIME *)&time;
+	SMB_DATE *sd = (SMB_DATE *)&date;
+
+	cifs_dbg(FYI, "date %d time %d\n", date, time);
+
+	sec = 2 * st->TwoSeconds;
+	min = st->Minutes;
+	if ((sec > 59) || (min > 59))
+		cifs_dbg(VFS, "illegal time min %d sec %d\n", min, sec);
+	sec += (min * 60);
+	sec += 60 * 60 * st->Hours;
+	if (st->Hours > 24)
+		cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
+	days = sd->Day;
+	month = sd->Month;
+	if ((days > 31) || (month > 12)) {
+		cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
+		if (month > 12)
+			month = 12;
+	}
+	month -= 1;
+	days += total_days_of_prev_months[month];
+	days += 3652; /* account for difference in days between 1980 and 1970 */
+	year = sd->Year;
+	days += year * 365;
+	days += (year/4); /* leap year */
+	/* generalized leap year calculation is more complex, ie no leap year
+	for years/100 except for years/400, but since the maximum number for DOS
+	 year is 2**7, the last year is 1980+127, which means we need only
+	 consider 2 special case years, ie the years 2000 and 2100, and only
+	 adjust for the lack of leap year for the year 2100, as 2000 was a
+	 leap year (divisable by 400) */
+	if (year >= 120)  /* the year 2100 */
+		days = days - 1;  /* do not count leap year for the year 2100 */
+
+	/* adjust for leap year where we are still before leap day */
+	if (year != 120)
+		days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
+	sec += 24 * 60 * 60 * days;
+
+	ts.tv_sec = sec + offset;
+
+	/* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */
+
+	ts.tv_nsec = 0;
+	return ts;
+}
diff --git a/kernel/fs/cifs/nterr.c b/kernel/fs/cifs/nterr.c
new file mode 100644
index 000000000..b6023c646
--- /dev/null
+++ b/kernel/fs/cifs/nterr.c
@@ -0,0 +1,687 @@
+/*
+ *  Unix SMB/Netbios implementation.
+ *  Version 1.9.
+ *  RPC Pipe client / server routines
+ *  Copyright (C) Luke Kenneth Casson Leighton 1997-2001.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* NT error codes - see nterr.h */
+#include <linux/types.h>
+#include <linux/fs.h>
+#include "nterr.h"
+
+const struct nt_err_code_struct nt_errs[] = {
+	{"NT_STATUS_OK", NT_STATUS_OK},
+	{"NT_STATUS_UNSUCCESSFUL", NT_STATUS_UNSUCCESSFUL},
+	{"NT_STATUS_NOT_IMPLEMENTED", NT_STATUS_NOT_IMPLEMENTED},
+	{"NT_STATUS_INVALID_INFO_CLASS", NT_STATUS_INVALID_INFO_CLASS},
+	{"NT_STATUS_INFO_LENGTH_MISMATCH", NT_STATUS_INFO_LENGTH_MISMATCH},
+	{"NT_STATUS_ACCESS_VIOLATION", NT_STATUS_ACCESS_VIOLATION},
+	{"NT_STATUS_BUFFER_OVERFLOW", NT_STATUS_BUFFER_OVERFLOW},
+	{"NT_STATUS_IN_PAGE_ERROR", NT_STATUS_IN_PAGE_ERROR},
+	{"NT_STATUS_PAGEFILE_QUOTA", NT_STATUS_PAGEFILE_QUOTA},
+	{"NT_STATUS_INVALID_HANDLE", NT_STATUS_INVALID_HANDLE},
+	{"NT_STATUS_BAD_INITIAL_STACK", NT_STATUS_BAD_INITIAL_STACK},
+	{"NT_STATUS_BAD_INITIAL_PC", NT_STATUS_BAD_INITIAL_PC},
+	{"NT_STATUS_INVALID_CID", NT_STATUS_INVALID_CID},
+	{"NT_STATUS_TIMER_NOT_CANCELED", NT_STATUS_TIMER_NOT_CANCELED},
+	{"NT_STATUS_INVALID_PARAMETER", NT_STATUS_INVALID_PARAMETER},
+	{"NT_STATUS_NO_SUCH_DEVICE", NT_STATUS_NO_SUCH_DEVICE},
+	{"NT_STATUS_NO_SUCH_FILE", NT_STATUS_NO_SUCH_FILE},
+	{"NT_STATUS_INVALID_DEVICE_REQUEST",
+	 NT_STATUS_INVALID_DEVICE_REQUEST},
+	{"NT_STATUS_END_OF_FILE", NT_STATUS_END_OF_FILE},
+	{"NT_STATUS_WRONG_VOLUME", NT_STATUS_WRONG_VOLUME},
+	{"NT_STATUS_NO_MEDIA_IN_DEVICE", NT_STATUS_NO_MEDIA_IN_DEVICE},
+	{"NT_STATUS_UNRECOGNIZED_MEDIA", NT_STATUS_UNRECOGNIZED_MEDIA},
+	{"NT_STATUS_NONEXISTENT_SECTOR", NT_STATUS_NONEXISTENT_SECTOR},
+	{"NT_STATUS_MORE_PROCESSING_REQUIRED",
+	 NT_STATUS_MORE_PROCESSING_REQUIRED},
+	{"NT_STATUS_NO_MEMORY", NT_STATUS_NO_MEMORY},
+	{"NT_STATUS_CONFLICTING_ADDRESSES",
+	 NT_STATUS_CONFLICTING_ADDRESSES},
+	{"NT_STATUS_NOT_MAPPED_VIEW", NT_STATUS_NOT_MAPPED_VIEW},
+	{"NT_STATUS_UNABLE_TO_FREE_VM", NT_STATUS_UNABLE_TO_FREE_VM},
+	{"NT_STATUS_UNABLE_TO_DELETE_SECTION",
+	 NT_STATUS_UNABLE_TO_DELETE_SECTION},
+	{"NT_STATUS_INVALID_SYSTEM_SERVICE",
+	 NT_STATUS_INVALID_SYSTEM_SERVICE},
+	{"NT_STATUS_ILLEGAL_INSTRUCTION", NT_STATUS_ILLEGAL_INSTRUCTION},
+	{"NT_STATUS_INVALID_LOCK_SEQUENCE",
+	 NT_STATUS_INVALID_LOCK_SEQUENCE},
+	{"NT_STATUS_INVALID_VIEW_SIZE", NT_STATUS_INVALID_VIEW_SIZE},
+	{"NT_STATUS_INVALID_FILE_FOR_SECTION",
+	 NT_STATUS_INVALID_FILE_FOR_SECTION},
+	{"NT_STATUS_ALREADY_COMMITTED", NT_STATUS_ALREADY_COMMITTED},
+	{"NT_STATUS_ACCESS_DENIED", NT_STATUS_ACCESS_DENIED},
+	{"NT_STATUS_BUFFER_TOO_SMALL", NT_STATUS_BUFFER_TOO_SMALL},
+	{"NT_STATUS_OBJECT_TYPE_MISMATCH", NT_STATUS_OBJECT_TYPE_MISMATCH},
+	{"NT_STATUS_NONCONTINUABLE_EXCEPTION",
+	 NT_STATUS_NONCONTINUABLE_EXCEPTION},
+	{"NT_STATUS_INVALID_DISPOSITION", NT_STATUS_INVALID_DISPOSITION},
+	{"NT_STATUS_UNWIND", NT_STATUS_UNWIND},
+	{"NT_STATUS_BAD_STACK", NT_STATUS_BAD_STACK},
+	{"NT_STATUS_INVALID_UNWIND_TARGET",
+	 NT_STATUS_INVALID_UNWIND_TARGET},
+	{"NT_STATUS_NOT_LOCKED", NT_STATUS_NOT_LOCKED},
+	{"NT_STATUS_PARITY_ERROR", NT_STATUS_PARITY_ERROR},
+	{"NT_STATUS_UNABLE_TO_DECOMMIT_VM",
+	 NT_STATUS_UNABLE_TO_DECOMMIT_VM},
+	{"NT_STATUS_NOT_COMMITTED", NT_STATUS_NOT_COMMITTED},
+	{"NT_STATUS_INVALID_PORT_ATTRIBUTES",
+	 NT_STATUS_INVALID_PORT_ATTRIBUTES},
+	{"NT_STATUS_PORT_MESSAGE_TOO_LONG",
+	 NT_STATUS_PORT_MESSAGE_TOO_LONG},
+	{"NT_STATUS_INVALID_PARAMETER_MIX",
+	 NT_STATUS_INVALID_PARAMETER_MIX},
+	{"NT_STATUS_INVALID_QUOTA_LOWER", NT_STATUS_INVALID_QUOTA_LOWER},
+	{"NT_STATUS_DISK_CORRUPT_ERROR", NT_STATUS_DISK_CORRUPT_ERROR},
+	{"NT_STATUS_OBJECT_NAME_INVALID", NT_STATUS_OBJECT_NAME_INVALID},
+	{"NT_STATUS_OBJECT_NAME_NOT_FOUND",
+	 NT_STATUS_OBJECT_NAME_NOT_FOUND},
+	{"NT_STATUS_OBJECT_NAME_COLLISION",
+	 NT_STATUS_OBJECT_NAME_COLLISION},
+	{"NT_STATUS_HANDLE_NOT_WAITABLE", NT_STATUS_HANDLE_NOT_WAITABLE},
+	{"NT_STATUS_PORT_DISCONNECTED", NT_STATUS_PORT_DISCONNECTED},
+	{"NT_STATUS_DEVICE_ALREADY_ATTACHED",
+	 NT_STATUS_DEVICE_ALREADY_ATTACHED},
+	{"NT_STATUS_OBJECT_PATH_INVALID", NT_STATUS_OBJECT_PATH_INVALID},
+	{"NT_STATUS_OBJECT_PATH_NOT_FOUND",
+	 NT_STATUS_OBJECT_PATH_NOT_FOUND},
+	{"NT_STATUS_OBJECT_PATH_SYNTAX_BAD",
+	 NT_STATUS_OBJECT_PATH_SYNTAX_BAD},
+	{"NT_STATUS_DATA_OVERRUN", NT_STATUS_DATA_OVERRUN},
+	{"NT_STATUS_DATA_LATE_ERROR", NT_STATUS_DATA_LATE_ERROR},
+	{"NT_STATUS_DATA_ERROR", NT_STATUS_DATA_ERROR},
+	{"NT_STATUS_CRC_ERROR", NT_STATUS_CRC_ERROR},
+	{"NT_STATUS_SECTION_TOO_BIG", NT_STATUS_SECTION_TOO_BIG},
+	{"NT_STATUS_PORT_CONNECTION_REFUSED",
+	 NT_STATUS_PORT_CONNECTION_REFUSED},
+	{"NT_STATUS_INVALID_PORT_HANDLE", NT_STATUS_INVALID_PORT_HANDLE},
+	{"NT_STATUS_SHARING_VIOLATION", NT_STATUS_SHARING_VIOLATION},
+	{"NT_STATUS_QUOTA_EXCEEDED", NT_STATUS_QUOTA_EXCEEDED},
+	{"NT_STATUS_INVALID_PAGE_PROTECTION",
+	 NT_STATUS_INVALID_PAGE_PROTECTION},
+	{"NT_STATUS_MUTANT_NOT_OWNED", NT_STATUS_MUTANT_NOT_OWNED},
+	{"NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED",
+	 NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED},
+	{"NT_STATUS_PORT_ALREADY_SET", NT_STATUS_PORT_ALREADY_SET},
+	{"NT_STATUS_SECTION_NOT_IMAGE", NT_STATUS_SECTION_NOT_IMAGE},
+	{"NT_STATUS_SUSPEND_COUNT_EXCEEDED",
+	 NT_STATUS_SUSPEND_COUNT_EXCEEDED},
+	{"NT_STATUS_THREAD_IS_TERMINATING",
+	 NT_STATUS_THREAD_IS_TERMINATING},
+	{"NT_STATUS_BAD_WORKING_SET_LIMIT",
+	 NT_STATUS_BAD_WORKING_SET_LIMIT},
+	{"NT_STATUS_INCOMPATIBLE_FILE_MAP",
+	 NT_STATUS_INCOMPATIBLE_FILE_MAP},
+	{"NT_STATUS_SECTION_PROTECTION", NT_STATUS_SECTION_PROTECTION},
+	{"NT_STATUS_EAS_NOT_SUPPORTED", NT_STATUS_EAS_NOT_SUPPORTED},
+	{"NT_STATUS_EA_TOO_LARGE", NT_STATUS_EA_TOO_LARGE},
+	{"NT_STATUS_NONEXISTENT_EA_ENTRY", NT_STATUS_NONEXISTENT_EA_ENTRY},
+	{"NT_STATUS_NO_EAS_ON_FILE", NT_STATUS_NO_EAS_ON_FILE},
+	{"NT_STATUS_EA_CORRUPT_ERROR", NT_STATUS_EA_CORRUPT_ERROR},
+	{"NT_STATUS_FILE_LOCK_CONFLICT", NT_STATUS_FILE_LOCK_CONFLICT},
+	{"NT_STATUS_LOCK_NOT_GRANTED", NT_STATUS_LOCK_NOT_GRANTED},
+	{"NT_STATUS_DELETE_PENDING", NT_STATUS_DELETE_PENDING},
+	{"NT_STATUS_CTL_FILE_NOT_SUPPORTED",
+	 NT_STATUS_CTL_FILE_NOT_SUPPORTED},
+	{"NT_STATUS_UNKNOWN_REVISION", NT_STATUS_UNKNOWN_REVISION},
+	{"NT_STATUS_REVISION_MISMATCH", NT_STATUS_REVISION_MISMATCH},
+	{"NT_STATUS_INVALID_OWNER", NT_STATUS_INVALID_OWNER},
+	{"NT_STATUS_INVALID_PRIMARY_GROUP",
+	 NT_STATUS_INVALID_PRIMARY_GROUP},
+	{"NT_STATUS_NO_IMPERSONATION_TOKEN",
+	 NT_STATUS_NO_IMPERSONATION_TOKEN},
+	{"NT_STATUS_CANT_DISABLE_MANDATORY",
+	 NT_STATUS_CANT_DISABLE_MANDATORY},
+	{"NT_STATUS_NO_LOGON_SERVERS", NT_STATUS_NO_LOGON_SERVERS},
+	{"NT_STATUS_NO_SUCH_LOGON_SESSION",
+	 NT_STATUS_NO_SUCH_LOGON_SESSION},
+	{"NT_STATUS_NO_SUCH_PRIVILEGE", NT_STATUS_NO_SUCH_PRIVILEGE},
+	{"NT_STATUS_PRIVILEGE_NOT_HELD", NT_STATUS_PRIVILEGE_NOT_HELD},
+	{"NT_STATUS_INVALID_ACCOUNT_NAME", NT_STATUS_INVALID_ACCOUNT_NAME},
+	{"NT_STATUS_USER_EXISTS", NT_STATUS_USER_EXISTS},
+	{"NT_STATUS_NO_SUCH_USER", NT_STATUS_NO_SUCH_USER},
+	{"NT_STATUS_GROUP_EXISTS", NT_STATUS_GROUP_EXISTS},
+	{"NT_STATUS_NO_SUCH_GROUP", NT_STATUS_NO_SUCH_GROUP},
+	{"NT_STATUS_MEMBER_IN_GROUP", NT_STATUS_MEMBER_IN_GROUP},
+	{"NT_STATUS_MEMBER_NOT_IN_GROUP", NT_STATUS_MEMBER_NOT_IN_GROUP},
+	{"NT_STATUS_LAST_ADMIN", NT_STATUS_LAST_ADMIN},
+	{"NT_STATUS_WRONG_PASSWORD", NT_STATUS_WRONG_PASSWORD},
+	{"NT_STATUS_ILL_FORMED_PASSWORD", NT_STATUS_ILL_FORMED_PASSWORD},
+	{"NT_STATUS_PASSWORD_RESTRICTION", NT_STATUS_PASSWORD_RESTRICTION},
+	{"NT_STATUS_LOGON_FAILURE", NT_STATUS_LOGON_FAILURE},
+	{"NT_STATUS_ACCOUNT_RESTRICTION", NT_STATUS_ACCOUNT_RESTRICTION},
+	{"NT_STATUS_INVALID_LOGON_HOURS", NT_STATUS_INVALID_LOGON_HOURS},
+	{"NT_STATUS_INVALID_WORKSTATION", NT_STATUS_INVALID_WORKSTATION},
+	{"NT_STATUS_PASSWORD_EXPIRED", NT_STATUS_PASSWORD_EXPIRED},
+	{"NT_STATUS_ACCOUNT_DISABLED", NT_STATUS_ACCOUNT_DISABLED},
+	{"NT_STATUS_NONE_MAPPED", NT_STATUS_NONE_MAPPED},
+	{"NT_STATUS_TOO_MANY_LUIDS_REQUESTED",
+	 NT_STATUS_TOO_MANY_LUIDS_REQUESTED},
+	{"NT_STATUS_LUIDS_EXHAUSTED", NT_STATUS_LUIDS_EXHAUSTED},
+	{"NT_STATUS_INVALID_SUB_AUTHORITY",
+	 NT_STATUS_INVALID_SUB_AUTHORITY},
+	{"NT_STATUS_INVALID_ACL", NT_STATUS_INVALID_ACL},
+	{"NT_STATUS_INVALID_SID", NT_STATUS_INVALID_SID},
+	{"NT_STATUS_INVALID_SECURITY_DESCR",
+	 NT_STATUS_INVALID_SECURITY_DESCR},
+	{"NT_STATUS_PROCEDURE_NOT_FOUND", NT_STATUS_PROCEDURE_NOT_FOUND},
+	{"NT_STATUS_INVALID_IMAGE_FORMAT", NT_STATUS_INVALID_IMAGE_FORMAT},
+	{"NT_STATUS_NO_TOKEN", NT_STATUS_NO_TOKEN},
+	{"NT_STATUS_BAD_INHERITANCE_ACL", NT_STATUS_BAD_INHERITANCE_ACL},
+	{"NT_STATUS_RANGE_NOT_LOCKED", NT_STATUS_RANGE_NOT_LOCKED},
+	{"NT_STATUS_DISK_FULL", NT_STATUS_DISK_FULL},
+	{"NT_STATUS_SERVER_DISABLED", NT_STATUS_SERVER_DISABLED},
+	{"NT_STATUS_SERVER_NOT_DISABLED", NT_STATUS_SERVER_NOT_DISABLED},
+	{"NT_STATUS_TOO_MANY_GUIDS_REQUESTED",
+	 NT_STATUS_TOO_MANY_GUIDS_REQUESTED},
+	{"NT_STATUS_GUIDS_EXHAUSTED", NT_STATUS_GUIDS_EXHAUSTED},
+	{"NT_STATUS_INVALID_ID_AUTHORITY", NT_STATUS_INVALID_ID_AUTHORITY},
+	{"NT_STATUS_AGENTS_EXHAUSTED", NT_STATUS_AGENTS_EXHAUSTED},
+	{"NT_STATUS_INVALID_VOLUME_LABEL", NT_STATUS_INVALID_VOLUME_LABEL},
+	{"NT_STATUS_SECTION_NOT_EXTENDED", NT_STATUS_SECTION_NOT_EXTENDED},
+	{"NT_STATUS_NOT_MAPPED_DATA", NT_STATUS_NOT_MAPPED_DATA},
+	{"NT_STATUS_RESOURCE_DATA_NOT_FOUND",
+	 NT_STATUS_RESOURCE_DATA_NOT_FOUND},
+	{"NT_STATUS_RESOURCE_TYPE_NOT_FOUND",
+	 NT_STATUS_RESOURCE_TYPE_NOT_FOUND},
+	{"NT_STATUS_RESOURCE_NAME_NOT_FOUND",
+	 NT_STATUS_RESOURCE_NAME_NOT_FOUND},
+	{"NT_STATUS_ARRAY_BOUNDS_EXCEEDED",
+	 NT_STATUS_ARRAY_BOUNDS_EXCEEDED},
+	{"NT_STATUS_FLOAT_DENORMAL_OPERAND",
+	 NT_STATUS_FLOAT_DENORMAL_OPERAND},
+	{"NT_STATUS_FLOAT_DIVIDE_BY_ZERO", NT_STATUS_FLOAT_DIVIDE_BY_ZERO},
+	{"NT_STATUS_FLOAT_INEXACT_RESULT", NT_STATUS_FLOAT_INEXACT_RESULT},
+	{"NT_STATUS_FLOAT_INVALID_OPERATION",
+	 NT_STATUS_FLOAT_INVALID_OPERATION},
+	{"NT_STATUS_FLOAT_OVERFLOW", NT_STATUS_FLOAT_OVERFLOW},
+	{"NT_STATUS_FLOAT_STACK_CHECK", NT_STATUS_FLOAT_STACK_CHECK},
+	{"NT_STATUS_FLOAT_UNDERFLOW", NT_STATUS_FLOAT_UNDERFLOW},
+	{"NT_STATUS_INTEGER_DIVIDE_BY_ZERO",
+	 NT_STATUS_INTEGER_DIVIDE_BY_ZERO},
+	{"NT_STATUS_INTEGER_OVERFLOW", NT_STATUS_INTEGER_OVERFLOW},
+	{"NT_STATUS_PRIVILEGED_INSTRUCTION",
+	 NT_STATUS_PRIVILEGED_INSTRUCTION},
+	{"NT_STATUS_TOO_MANY_PAGING_FILES",
+	 NT_STATUS_TOO_MANY_PAGING_FILES},
+	{"NT_STATUS_FILE_INVALID", NT_STATUS_FILE_INVALID},
+	{"NT_STATUS_ALLOTTED_SPACE_EXCEEDED",
+	 NT_STATUS_ALLOTTED_SPACE_EXCEEDED},
+	{"NT_STATUS_INSUFFICIENT_RESOURCES",
+	 NT_STATUS_INSUFFICIENT_RESOURCES},
+	{"NT_STATUS_DFS_EXIT_PATH_FOUND", NT_STATUS_DFS_EXIT_PATH_FOUND},
+	{"NT_STATUS_DEVICE_DATA_ERROR", NT_STATUS_DEVICE_DATA_ERROR},
+	{"NT_STATUS_DEVICE_NOT_CONNECTED", NT_STATUS_DEVICE_NOT_CONNECTED},
+	{"NT_STATUS_DEVICE_POWER_FAILURE", NT_STATUS_DEVICE_POWER_FAILURE},
+	{"NT_STATUS_FREE_VM_NOT_AT_BASE", NT_STATUS_FREE_VM_NOT_AT_BASE},
+	{"NT_STATUS_MEMORY_NOT_ALLOCATED", NT_STATUS_MEMORY_NOT_ALLOCATED},
+	{"NT_STATUS_WORKING_SET_QUOTA", NT_STATUS_WORKING_SET_QUOTA},
+	{"NT_STATUS_MEDIA_WRITE_PROTECTED",
+	 NT_STATUS_MEDIA_WRITE_PROTECTED},
+	{"NT_STATUS_DEVICE_NOT_READY", NT_STATUS_DEVICE_NOT_READY},
+	{"NT_STATUS_INVALID_GROUP_ATTRIBUTES",
+	 NT_STATUS_INVALID_GROUP_ATTRIBUTES},
+	{"NT_STATUS_BAD_IMPERSONATION_LEVEL",
+	 NT_STATUS_BAD_IMPERSONATION_LEVEL},
+	{"NT_STATUS_CANT_OPEN_ANONYMOUS", NT_STATUS_CANT_OPEN_ANONYMOUS},
+	{"NT_STATUS_BAD_VALIDATION_CLASS", NT_STATUS_BAD_VALIDATION_CLASS},
+	{"NT_STATUS_BAD_TOKEN_TYPE", NT_STATUS_BAD_TOKEN_TYPE},
+	{"NT_STATUS_BAD_MASTER_BOOT_RECORD",
+	 NT_STATUS_BAD_MASTER_BOOT_RECORD},
+	{"NT_STATUS_INSTRUCTION_MISALIGNMENT",
+	 NT_STATUS_INSTRUCTION_MISALIGNMENT},
+	{"NT_STATUS_INSTANCE_NOT_AVAILABLE",
+	 NT_STATUS_INSTANCE_NOT_AVAILABLE},
+	{"NT_STATUS_PIPE_NOT_AVAILABLE", NT_STATUS_PIPE_NOT_AVAILABLE},
+	{"NT_STATUS_INVALID_PIPE_STATE", NT_STATUS_INVALID_PIPE_STATE},
+	{"NT_STATUS_PIPE_BUSY", NT_STATUS_PIPE_BUSY},
+	{"NT_STATUS_ILLEGAL_FUNCTION", NT_STATUS_ILLEGAL_FUNCTION},
+	{"NT_STATUS_PIPE_DISCONNECTED", NT_STATUS_PIPE_DISCONNECTED},
+	{"NT_STATUS_PIPE_CLOSING", NT_STATUS_PIPE_CLOSING},
+	{"NT_STATUS_PIPE_CONNECTED", NT_STATUS_PIPE_CONNECTED},
+	{"NT_STATUS_PIPE_LISTENING", NT_STATUS_PIPE_LISTENING},
+	{"NT_STATUS_INVALID_READ_MODE", NT_STATUS_INVALID_READ_MODE},
+	{"NT_STATUS_IO_TIMEOUT", NT_STATUS_IO_TIMEOUT},
+	{"NT_STATUS_FILE_FORCED_CLOSED", NT_STATUS_FILE_FORCED_CLOSED},
+	{"NT_STATUS_PROFILING_NOT_STARTED",
+	 NT_STATUS_PROFILING_NOT_STARTED},
+	{"NT_STATUS_PROFILING_NOT_STOPPED",
+	 NT_STATUS_PROFILING_NOT_STOPPED},
+	{"NT_STATUS_COULD_NOT_INTERPRET", NT_STATUS_COULD_NOT_INTERPRET},
+	{"NT_STATUS_FILE_IS_A_DIRECTORY", NT_STATUS_FILE_IS_A_DIRECTORY},
+	{"NT_STATUS_NOT_SUPPORTED", NT_STATUS_NOT_SUPPORTED},
+	{"NT_STATUS_REMOTE_NOT_LISTENING", NT_STATUS_REMOTE_NOT_LISTENING},
+	{"NT_STATUS_DUPLICATE_NAME", NT_STATUS_DUPLICATE_NAME},
+	{"NT_STATUS_BAD_NETWORK_PATH", NT_STATUS_BAD_NETWORK_PATH},
+	{"NT_STATUS_NETWORK_BUSY", NT_STATUS_NETWORK_BUSY},
+	{"NT_STATUS_DEVICE_DOES_NOT_EXIST",
+	 NT_STATUS_DEVICE_DOES_NOT_EXIST},
+	{"NT_STATUS_TOO_MANY_COMMANDS", NT_STATUS_TOO_MANY_COMMANDS},
+	{"NT_STATUS_ADAPTER_HARDWARE_ERROR",
+	 NT_STATUS_ADAPTER_HARDWARE_ERROR},
+	{"NT_STATUS_INVALID_NETWORK_RESPONSE",
+	 NT_STATUS_INVALID_NETWORK_RESPONSE},
+	{"NT_STATUS_UNEXPECTED_NETWORK_ERROR",
+	 NT_STATUS_UNEXPECTED_NETWORK_ERROR},
+	{"NT_STATUS_BAD_REMOTE_ADAPTER", NT_STATUS_BAD_REMOTE_ADAPTER},
+	{"NT_STATUS_PRINT_QUEUE_FULL", NT_STATUS_PRINT_QUEUE_FULL},
+	{"NT_STATUS_NO_SPOOL_SPACE", NT_STATUS_NO_SPOOL_SPACE},
+	{"NT_STATUS_PRINT_CANCELLED", NT_STATUS_PRINT_CANCELLED},
+	{"NT_STATUS_NETWORK_NAME_DELETED", NT_STATUS_NETWORK_NAME_DELETED},
+	{"NT_STATUS_NETWORK_ACCESS_DENIED",
+	 NT_STATUS_NETWORK_ACCESS_DENIED},
+	{"NT_STATUS_BAD_DEVICE_TYPE", NT_STATUS_BAD_DEVICE_TYPE},
+	{"NT_STATUS_BAD_NETWORK_NAME", NT_STATUS_BAD_NETWORK_NAME},
+	{"NT_STATUS_TOO_MANY_NAMES", NT_STATUS_TOO_MANY_NAMES},
+	{"NT_STATUS_TOO_MANY_SESSIONS", NT_STATUS_TOO_MANY_SESSIONS},
+	{"NT_STATUS_SHARING_PAUSED", NT_STATUS_SHARING_PAUSED},
+	{"NT_STATUS_REQUEST_NOT_ACCEPTED", NT_STATUS_REQUEST_NOT_ACCEPTED},
+	{"NT_STATUS_REDIRECTOR_PAUSED", NT_STATUS_REDIRECTOR_PAUSED},
+	{"NT_STATUS_NET_WRITE_FAULT", NT_STATUS_NET_WRITE_FAULT},
+	{"NT_STATUS_PROFILING_AT_LIMIT", NT_STATUS_PROFILING_AT_LIMIT},
+	{"NT_STATUS_NOT_SAME_DEVICE", NT_STATUS_NOT_SAME_DEVICE},
+	{"NT_STATUS_FILE_RENAMED", NT_STATUS_FILE_RENAMED},
+	{"NT_STATUS_VIRTUAL_CIRCUIT_CLOSED",
+	 NT_STATUS_VIRTUAL_CIRCUIT_CLOSED},
+	{"NT_STATUS_NO_SECURITY_ON_OBJECT",
+	 NT_STATUS_NO_SECURITY_ON_OBJECT},
+	{"NT_STATUS_CANT_WAIT", NT_STATUS_CANT_WAIT},
+	{"NT_STATUS_PIPE_EMPTY", NT_STATUS_PIPE_EMPTY},
+	{"NT_STATUS_CANT_ACCESS_DOMAIN_INFO",
+	 NT_STATUS_CANT_ACCESS_DOMAIN_INFO},
+	{"NT_STATUS_CANT_TERMINATE_SELF", NT_STATUS_CANT_TERMINATE_SELF},
+	{"NT_STATUS_INVALID_SERVER_STATE", NT_STATUS_INVALID_SERVER_STATE},
+	{"NT_STATUS_INVALID_DOMAIN_STATE", NT_STATUS_INVALID_DOMAIN_STATE},
+	{"NT_STATUS_INVALID_DOMAIN_ROLE", NT_STATUS_INVALID_DOMAIN_ROLE},
+	{"NT_STATUS_NO_SUCH_DOMAIN", NT_STATUS_NO_SUCH_DOMAIN},
+	{"NT_STATUS_DOMAIN_EXISTS", NT_STATUS_DOMAIN_EXISTS},
+	{"NT_STATUS_DOMAIN_LIMIT_EXCEEDED",
+	 NT_STATUS_DOMAIN_LIMIT_EXCEEDED},
+	{"NT_STATUS_OPLOCK_NOT_GRANTED", NT_STATUS_OPLOCK_NOT_GRANTED},
+	{"NT_STATUS_INVALID_OPLOCK_PROTOCOL",
+	 NT_STATUS_INVALID_OPLOCK_PROTOCOL},
+	{"NT_STATUS_INTERNAL_DB_CORRUPTION",
+	 NT_STATUS_INTERNAL_DB_CORRUPTION},
+	{"NT_STATUS_INTERNAL_ERROR", NT_STATUS_INTERNAL_ERROR},
+	{"NT_STATUS_GENERIC_NOT_MAPPED", NT_STATUS_GENERIC_NOT_MAPPED},
+	{"NT_STATUS_BAD_DESCRIPTOR_FORMAT",
+	 NT_STATUS_BAD_DESCRIPTOR_FORMAT},
+	{"NT_STATUS_INVALID_USER_BUFFER", NT_STATUS_INVALID_USER_BUFFER},
+	{"NT_STATUS_UNEXPECTED_IO_ERROR", NT_STATUS_UNEXPECTED_IO_ERROR},
+	{"NT_STATUS_UNEXPECTED_MM_CREATE_ERR",
+	 NT_STATUS_UNEXPECTED_MM_CREATE_ERR},
+	{"NT_STATUS_UNEXPECTED_MM_MAP_ERROR",
+	 NT_STATUS_UNEXPECTED_MM_MAP_ERROR},
+	{"NT_STATUS_UNEXPECTED_MM_EXTEND_ERR",
+	 NT_STATUS_UNEXPECTED_MM_EXTEND_ERR},
+	{"NT_STATUS_NOT_LOGON_PROCESS", NT_STATUS_NOT_LOGON_PROCESS},
+	{"NT_STATUS_LOGON_SESSION_EXISTS", NT_STATUS_LOGON_SESSION_EXISTS},
+	{"NT_STATUS_INVALID_PARAMETER_1", NT_STATUS_INVALID_PARAMETER_1},
+	{"NT_STATUS_INVALID_PARAMETER_2", NT_STATUS_INVALID_PARAMETER_2},
+	{"NT_STATUS_INVALID_PARAMETER_3", NT_STATUS_INVALID_PARAMETER_3},
+	{"NT_STATUS_INVALID_PARAMETER_4", NT_STATUS_INVALID_PARAMETER_4},
+	{"NT_STATUS_INVALID_PARAMETER_5", NT_STATUS_INVALID_PARAMETER_5},
+	{"NT_STATUS_INVALID_PARAMETER_6", NT_STATUS_INVALID_PARAMETER_6},
+	{"NT_STATUS_INVALID_PARAMETER_7", NT_STATUS_INVALID_PARAMETER_7},
+	{"NT_STATUS_INVALID_PARAMETER_8", NT_STATUS_INVALID_PARAMETER_8},
+	{"NT_STATUS_INVALID_PARAMETER_9", NT_STATUS_INVALID_PARAMETER_9},
+	{"NT_STATUS_INVALID_PARAMETER_10", NT_STATUS_INVALID_PARAMETER_10},
+	{"NT_STATUS_INVALID_PARAMETER_11", NT_STATUS_INVALID_PARAMETER_11},
+	{"NT_STATUS_INVALID_PARAMETER_12", NT_STATUS_INVALID_PARAMETER_12},
+	{"NT_STATUS_REDIRECTOR_NOT_STARTED",
+	 NT_STATUS_REDIRECTOR_NOT_STARTED},
+	{"NT_STATUS_REDIRECTOR_STARTED", NT_STATUS_REDIRECTOR_STARTED},
+	{"NT_STATUS_STACK_OVERFLOW", NT_STATUS_STACK_OVERFLOW},
+	{"NT_STATUS_NO_SUCH_PACKAGE", NT_STATUS_NO_SUCH_PACKAGE},
+	{"NT_STATUS_BAD_FUNCTION_TABLE", NT_STATUS_BAD_FUNCTION_TABLE},
+	{"NT_STATUS_DIRECTORY_NOT_EMPTY", NT_STATUS_DIRECTORY_NOT_EMPTY},
+	{"NT_STATUS_FILE_CORRUPT_ERROR", NT_STATUS_FILE_CORRUPT_ERROR},
+	{"NT_STATUS_NOT_A_DIRECTORY", NT_STATUS_NOT_A_DIRECTORY},
+	{"NT_STATUS_BAD_LOGON_SESSION_STATE",
+	 NT_STATUS_BAD_LOGON_SESSION_STATE},
+	{"NT_STATUS_LOGON_SESSION_COLLISION",
+	 NT_STATUS_LOGON_SESSION_COLLISION},
+	{"NT_STATUS_NAME_TOO_LONG", NT_STATUS_NAME_TOO_LONG},
+	{"NT_STATUS_FILES_OPEN", NT_STATUS_FILES_OPEN},
+	{"NT_STATUS_CONNECTION_IN_USE", NT_STATUS_CONNECTION_IN_USE},
+	{"NT_STATUS_MESSAGE_NOT_FOUND", NT_STATUS_MESSAGE_NOT_FOUND},
+	{"NT_STATUS_PROCESS_IS_TERMINATING",
+	 NT_STATUS_PROCESS_IS_TERMINATING},
+	{"NT_STATUS_INVALID_LOGON_TYPE", NT_STATUS_INVALID_LOGON_TYPE},
+	{"NT_STATUS_NO_GUID_TRANSLATION", NT_STATUS_NO_GUID_TRANSLATION},
+	{"NT_STATUS_CANNOT_IMPERSONATE", NT_STATUS_CANNOT_IMPERSONATE},
+	{"NT_STATUS_IMAGE_ALREADY_LOADED", NT_STATUS_IMAGE_ALREADY_LOADED},
+	{"NT_STATUS_ABIOS_NOT_PRESENT", NT_STATUS_ABIOS_NOT_PRESENT},
+	{"NT_STATUS_ABIOS_LID_NOT_EXIST", NT_STATUS_ABIOS_LID_NOT_EXIST},
+	{"NT_STATUS_ABIOS_LID_ALREADY_OWNED",
+	 NT_STATUS_ABIOS_LID_ALREADY_OWNED},
+	{"NT_STATUS_ABIOS_NOT_LID_OWNER", NT_STATUS_ABIOS_NOT_LID_OWNER},
+	{"NT_STATUS_ABIOS_INVALID_COMMAND",
+	 NT_STATUS_ABIOS_INVALID_COMMAND},
+	{"NT_STATUS_ABIOS_INVALID_LID", NT_STATUS_ABIOS_INVALID_LID},
+	{"NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE",
+	 NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE},
+	{"NT_STATUS_ABIOS_INVALID_SELECTOR",
+	 NT_STATUS_ABIOS_INVALID_SELECTOR},
+	{"NT_STATUS_NO_LDT", NT_STATUS_NO_LDT},
+	{"NT_STATUS_INVALID_LDT_SIZE", NT_STATUS_INVALID_LDT_SIZE},
+	{"NT_STATUS_INVALID_LDT_OFFSET", NT_STATUS_INVALID_LDT_OFFSET},
+	{"NT_STATUS_INVALID_LDT_DESCRIPTOR",
+	 NT_STATUS_INVALID_LDT_DESCRIPTOR},
+	{"NT_STATUS_INVALID_IMAGE_NE_FORMAT",
+	 NT_STATUS_INVALID_IMAGE_NE_FORMAT},
+	{"NT_STATUS_RXACT_INVALID_STATE", NT_STATUS_RXACT_INVALID_STATE},
+	{"NT_STATUS_RXACT_COMMIT_FAILURE", NT_STATUS_RXACT_COMMIT_FAILURE},
+	{"NT_STATUS_MAPPED_FILE_SIZE_ZERO",
+	 NT_STATUS_MAPPED_FILE_SIZE_ZERO},
+	{"NT_STATUS_TOO_MANY_OPENED_FILES",
+	 NT_STATUS_TOO_MANY_OPENED_FILES},
+	{"NT_STATUS_CANCELLED", NT_STATUS_CANCELLED},
+	{"NT_STATUS_CANNOT_DELETE", NT_STATUS_CANNOT_DELETE},
+	{"NT_STATUS_INVALID_COMPUTER_NAME",
+	 NT_STATUS_INVALID_COMPUTER_NAME},
+	{"NT_STATUS_FILE_DELETED", NT_STATUS_FILE_DELETED},
+	{"NT_STATUS_SPECIAL_ACCOUNT", NT_STATUS_SPECIAL_ACCOUNT},
+	{"NT_STATUS_SPECIAL_GROUP", NT_STATUS_SPECIAL_GROUP},
+	{"NT_STATUS_SPECIAL_USER", NT_STATUS_SPECIAL_USER},
+	{"NT_STATUS_MEMBERS_PRIMARY_GROUP",
+	 NT_STATUS_MEMBERS_PRIMARY_GROUP},
+	{"NT_STATUS_FILE_CLOSED", NT_STATUS_FILE_CLOSED},
+	{"NT_STATUS_TOO_MANY_THREADS", NT_STATUS_TOO_MANY_THREADS},
+	{"NT_STATUS_THREAD_NOT_IN_PROCESS",
+	 NT_STATUS_THREAD_NOT_IN_PROCESS},
+	{"NT_STATUS_TOKEN_ALREADY_IN_USE", NT_STATUS_TOKEN_ALREADY_IN_USE},
+	{"NT_STATUS_PAGEFILE_QUOTA_EXCEEDED",
+	 NT_STATUS_PAGEFILE_QUOTA_EXCEEDED},
+	{"NT_STATUS_COMMITMENT_LIMIT", NT_STATUS_COMMITMENT_LIMIT},
+	{"NT_STATUS_INVALID_IMAGE_LE_FORMAT",
+	 NT_STATUS_INVALID_IMAGE_LE_FORMAT},
+	{"NT_STATUS_INVALID_IMAGE_NOT_MZ", NT_STATUS_INVALID_IMAGE_NOT_MZ},
+	{"NT_STATUS_INVALID_IMAGE_PROTECT",
+	 NT_STATUS_INVALID_IMAGE_PROTECT},
+	{"NT_STATUS_INVALID_IMAGE_WIN_16", NT_STATUS_INVALID_IMAGE_WIN_16},
+	{"NT_STATUS_LOGON_SERVER_CONFLICT",
+	 NT_STATUS_LOGON_SERVER_CONFLICT},
+	{"NT_STATUS_TIME_DIFFERENCE_AT_DC",
+	 NT_STATUS_TIME_DIFFERENCE_AT_DC},
+	{"NT_STATUS_SYNCHRONIZATION_REQUIRED",
+	 NT_STATUS_SYNCHRONIZATION_REQUIRED},
+	{"NT_STATUS_DLL_NOT_FOUND", NT_STATUS_DLL_NOT_FOUND},
+	{"NT_STATUS_OPEN_FAILED", NT_STATUS_OPEN_FAILED},
+	{"NT_STATUS_IO_PRIVILEGE_FAILED", NT_STATUS_IO_PRIVILEGE_FAILED},
+	{"NT_STATUS_ORDINAL_NOT_FOUND", NT_STATUS_ORDINAL_NOT_FOUND},
+	{"NT_STATUS_ENTRYPOINT_NOT_FOUND", NT_STATUS_ENTRYPOINT_NOT_FOUND},
+	{"NT_STATUS_CONTROL_C_EXIT", NT_STATUS_CONTROL_C_EXIT},
+	{"NT_STATUS_LOCAL_DISCONNECT", NT_STATUS_LOCAL_DISCONNECT},
+	{"NT_STATUS_REMOTE_DISCONNECT", NT_STATUS_REMOTE_DISCONNECT},
+	{"NT_STATUS_REMOTE_RESOURCES", NT_STATUS_REMOTE_RESOURCES},
+	{"NT_STATUS_LINK_FAILED", NT_STATUS_LINK_FAILED},
+	{"NT_STATUS_LINK_TIMEOUT", NT_STATUS_LINK_TIMEOUT},
+	{"NT_STATUS_INVALID_CONNECTION", NT_STATUS_INVALID_CONNECTION},
+	{"NT_STATUS_INVALID_ADDRESS", NT_STATUS_INVALID_ADDRESS},
+	{"NT_STATUS_DLL_INIT_FAILED", NT_STATUS_DLL_INIT_FAILED},
+	{"NT_STATUS_MISSING_SYSTEMFILE", NT_STATUS_MISSING_SYSTEMFILE},
+	{"NT_STATUS_UNHANDLED_EXCEPTION", NT_STATUS_UNHANDLED_EXCEPTION},
+	{"NT_STATUS_APP_INIT_FAILURE", NT_STATUS_APP_INIT_FAILURE},
+	{"NT_STATUS_PAGEFILE_CREATE_FAILED",
+	 NT_STATUS_PAGEFILE_CREATE_FAILED},
+	{"NT_STATUS_NO_PAGEFILE", NT_STATUS_NO_PAGEFILE},
+	{"NT_STATUS_INVALID_LEVEL", NT_STATUS_INVALID_LEVEL},
+	{"NT_STATUS_WRONG_PASSWORD_CORE", NT_STATUS_WRONG_PASSWORD_CORE},
+	{"NT_STATUS_ILLEGAL_FLOAT_CONTEXT",
+	 NT_STATUS_ILLEGAL_FLOAT_CONTEXT},
+	{"NT_STATUS_PIPE_BROKEN", NT_STATUS_PIPE_BROKEN},
+	{"NT_STATUS_REGISTRY_CORRUPT", NT_STATUS_REGISTRY_CORRUPT},
+	{"NT_STATUS_REGISTRY_IO_FAILED", NT_STATUS_REGISTRY_IO_FAILED},
+	{"NT_STATUS_NO_EVENT_PAIR", NT_STATUS_NO_EVENT_PAIR},
+	{"NT_STATUS_UNRECOGNIZED_VOLUME", NT_STATUS_UNRECOGNIZED_VOLUME},
+	{"NT_STATUS_SERIAL_NO_DEVICE_INITED",
+	 NT_STATUS_SERIAL_NO_DEVICE_INITED},
+	{"NT_STATUS_NO_SUCH_ALIAS", NT_STATUS_NO_SUCH_ALIAS},
+	{"NT_STATUS_MEMBER_NOT_IN_ALIAS", NT_STATUS_MEMBER_NOT_IN_ALIAS},
+	{"NT_STATUS_MEMBER_IN_ALIAS", NT_STATUS_MEMBER_IN_ALIAS},
+	{"NT_STATUS_ALIAS_EXISTS", NT_STATUS_ALIAS_EXISTS},
+	{"NT_STATUS_LOGON_NOT_GRANTED", NT_STATUS_LOGON_NOT_GRANTED},
+	{"NT_STATUS_TOO_MANY_SECRETS", NT_STATUS_TOO_MANY_SECRETS},
+	{"NT_STATUS_SECRET_TOO_LONG", NT_STATUS_SECRET_TOO_LONG},
+	{"NT_STATUS_INTERNAL_DB_ERROR", NT_STATUS_INTERNAL_DB_ERROR},
+	{"NT_STATUS_FULLSCREEN_MODE", NT_STATUS_FULLSCREEN_MODE},
+	{"NT_STATUS_TOO_MANY_CONTEXT_IDS", NT_STATUS_TOO_MANY_CONTEXT_IDS},
+	{"NT_STATUS_LOGON_TYPE_NOT_GRANTED",
+	 NT_STATUS_LOGON_TYPE_NOT_GRANTED},
+	{"NT_STATUS_NOT_REGISTRY_FILE", NT_STATUS_NOT_REGISTRY_FILE},
+	{"NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED",
+	 NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED},
+	{"NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR",
+	 NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR},
+	{"NT_STATUS_FT_MISSING_MEMBER", NT_STATUS_FT_MISSING_MEMBER},
+	{"NT_STATUS_ILL_FORMED_SERVICE_ENTRY",
+	 NT_STATUS_ILL_FORMED_SERVICE_ENTRY},
+	{"NT_STATUS_ILLEGAL_CHARACTER", NT_STATUS_ILLEGAL_CHARACTER},
+	{"NT_STATUS_UNMAPPABLE_CHARACTER", NT_STATUS_UNMAPPABLE_CHARACTER},
+	{"NT_STATUS_UNDEFINED_CHARACTER", NT_STATUS_UNDEFINED_CHARACTER},
+	{"NT_STATUS_FLOPPY_VOLUME", NT_STATUS_FLOPPY_VOLUME},
+	{"NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND",
+	 NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND},
+	{"NT_STATUS_FLOPPY_WRONG_CYLINDER",
+	 NT_STATUS_FLOPPY_WRONG_CYLINDER},
+	{"NT_STATUS_FLOPPY_UNKNOWN_ERROR", NT_STATUS_FLOPPY_UNKNOWN_ERROR},
+	{"NT_STATUS_FLOPPY_BAD_REGISTERS", NT_STATUS_FLOPPY_BAD_REGISTERS},
+	{"NT_STATUS_DISK_RECALIBRATE_FAILED",
+	 NT_STATUS_DISK_RECALIBRATE_FAILED},
+	{"NT_STATUS_DISK_OPERATION_FAILED",
+	 NT_STATUS_DISK_OPERATION_FAILED},
+	{"NT_STATUS_DISK_RESET_FAILED", NT_STATUS_DISK_RESET_FAILED},
+	{"NT_STATUS_SHARED_IRQ_BUSY", NT_STATUS_SHARED_IRQ_BUSY},
+	{"NT_STATUS_FT_ORPHANING", NT_STATUS_FT_ORPHANING},
+	{"NT_STATUS_PARTITION_FAILURE", NT_STATUS_PARTITION_FAILURE},
+	{"NT_STATUS_INVALID_BLOCK_LENGTH", NT_STATUS_INVALID_BLOCK_LENGTH},
+	{"NT_STATUS_DEVICE_NOT_PARTITIONED",
+	 NT_STATUS_DEVICE_NOT_PARTITIONED},
+	{"NT_STATUS_UNABLE_TO_LOCK_MEDIA", NT_STATUS_UNABLE_TO_LOCK_MEDIA},
+	{"NT_STATUS_UNABLE_TO_UNLOAD_MEDIA",
+	 NT_STATUS_UNABLE_TO_UNLOAD_MEDIA},
+	{"NT_STATUS_EOM_OVERFLOW", NT_STATUS_EOM_OVERFLOW},
+	{"NT_STATUS_NO_MEDIA", NT_STATUS_NO_MEDIA},
+	{"NT_STATUS_NO_SUCH_MEMBER", NT_STATUS_NO_SUCH_MEMBER},
+	{"NT_STATUS_INVALID_MEMBER", NT_STATUS_INVALID_MEMBER},
+	{"NT_STATUS_KEY_DELETED", NT_STATUS_KEY_DELETED},
+	{"NT_STATUS_NO_LOG_SPACE", NT_STATUS_NO_LOG_SPACE},
+	{"NT_STATUS_TOO_MANY_SIDS", NT_STATUS_TOO_MANY_SIDS},
+	{"NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED",
+	 NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED},
+	{"NT_STATUS_KEY_HAS_CHILDREN", NT_STATUS_KEY_HAS_CHILDREN},
+	{"NT_STATUS_CHILD_MUST_BE_VOLATILE",
+	 NT_STATUS_CHILD_MUST_BE_VOLATILE},
+	{"NT_STATUS_DEVICE_CONFIGURATION_ERROR",
+	 NT_STATUS_DEVICE_CONFIGURATION_ERROR},
+	{"NT_STATUS_DRIVER_INTERNAL_ERROR",
+	 NT_STATUS_DRIVER_INTERNAL_ERROR},
+	{"NT_STATUS_INVALID_DEVICE_STATE", NT_STATUS_INVALID_DEVICE_STATE},
+	{"NT_STATUS_IO_DEVICE_ERROR", NT_STATUS_IO_DEVICE_ERROR},
+	{"NT_STATUS_DEVICE_PROTOCOL_ERROR",
+	 NT_STATUS_DEVICE_PROTOCOL_ERROR},
+	{"NT_STATUS_BACKUP_CONTROLLER", NT_STATUS_BACKUP_CONTROLLER},
+	{"NT_STATUS_LOG_FILE_FULL", NT_STATUS_LOG_FILE_FULL},
+	{"NT_STATUS_TOO_LATE", NT_STATUS_TOO_LATE},
+	{"NT_STATUS_NO_TRUST_LSA_SECRET", NT_STATUS_NO_TRUST_LSA_SECRET},
+	{"NT_STATUS_NO_TRUST_SAM_ACCOUNT", NT_STATUS_NO_TRUST_SAM_ACCOUNT},
+	{"NT_STATUS_TRUSTED_DOMAIN_FAILURE",
+	 NT_STATUS_TRUSTED_DOMAIN_FAILURE},
+	{"NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE",
+	 NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE},
+	{"NT_STATUS_EVENTLOG_FILE_CORRUPT",
+	 NT_STATUS_EVENTLOG_FILE_CORRUPT},
+	{"NT_STATUS_EVENTLOG_CANT_START", NT_STATUS_EVENTLOG_CANT_START},
+	{"NT_STATUS_TRUST_FAILURE", NT_STATUS_TRUST_FAILURE},
+	{"NT_STATUS_MUTANT_LIMIT_EXCEEDED",
+	 NT_STATUS_MUTANT_LIMIT_EXCEEDED},
+	{"NT_STATUS_NETLOGON_NOT_STARTED", NT_STATUS_NETLOGON_NOT_STARTED},
+	{"NT_STATUS_ACCOUNT_EXPIRED", NT_STATUS_ACCOUNT_EXPIRED},
+	{"NT_STATUS_POSSIBLE_DEADLOCK", NT_STATUS_POSSIBLE_DEADLOCK},
+	{"NT_STATUS_NETWORK_CREDENTIAL_CONFLICT",
+	 NT_STATUS_NETWORK_CREDENTIAL_CONFLICT},
+	{"NT_STATUS_REMOTE_SESSION_LIMIT", NT_STATUS_REMOTE_SESSION_LIMIT},
+	{"NT_STATUS_EVENTLOG_FILE_CHANGED",
+	 NT_STATUS_EVENTLOG_FILE_CHANGED},
+	{"NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT",
+	 NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT},
+	{"NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT",
+	 NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT},
+	{"NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT",
+	 NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT},
+	{"NT_STATUS_DOMAIN_TRUST_INCONSISTENT",
+	 NT_STATUS_DOMAIN_TRUST_INCONSISTENT},
+	{"NT_STATUS_FS_DRIVER_REQUIRED", NT_STATUS_FS_DRIVER_REQUIRED},
+	{"NT_STATUS_NO_USER_SESSION_KEY", NT_STATUS_NO_USER_SESSION_KEY},
+	{"NT_STATUS_USER_SESSION_DELETED", NT_STATUS_USER_SESSION_DELETED},
+	{"NT_STATUS_RESOURCE_LANG_NOT_FOUND",
+	 NT_STATUS_RESOURCE_LANG_NOT_FOUND},
+	{"NT_STATUS_INSUFF_SERVER_RESOURCES",
+	 NT_STATUS_INSUFF_SERVER_RESOURCES},
+	{"NT_STATUS_INVALID_BUFFER_SIZE", NT_STATUS_INVALID_BUFFER_SIZE},
+	{"NT_STATUS_INVALID_ADDRESS_COMPONENT",
+	 NT_STATUS_INVALID_ADDRESS_COMPONENT},
+	{"NT_STATUS_INVALID_ADDRESS_WILDCARD",
+	 NT_STATUS_INVALID_ADDRESS_WILDCARD},
+	{"NT_STATUS_TOO_MANY_ADDRESSES", NT_STATUS_TOO_MANY_ADDRESSES},
+	{"NT_STATUS_ADDRESS_ALREADY_EXISTS",
+	 NT_STATUS_ADDRESS_ALREADY_EXISTS},
+	{"NT_STATUS_ADDRESS_CLOSED", NT_STATUS_ADDRESS_CLOSED},
+	{"NT_STATUS_CONNECTION_DISCONNECTED",
+	 NT_STATUS_CONNECTION_DISCONNECTED},
+	{"NT_STATUS_CONNECTION_RESET", NT_STATUS_CONNECTION_RESET},
+	{"NT_STATUS_TOO_MANY_NODES", NT_STATUS_TOO_MANY_NODES},
+	{"NT_STATUS_TRANSACTION_ABORTED", NT_STATUS_TRANSACTION_ABORTED},
+	{"NT_STATUS_TRANSACTION_TIMED_OUT",
+	 NT_STATUS_TRANSACTION_TIMED_OUT},
+	{"NT_STATUS_TRANSACTION_NO_RELEASE",
+	 NT_STATUS_TRANSACTION_NO_RELEASE},
+	{"NT_STATUS_TRANSACTION_NO_MATCH", NT_STATUS_TRANSACTION_NO_MATCH},
+	{"NT_STATUS_TRANSACTION_RESPONDED",
+	 NT_STATUS_TRANSACTION_RESPONDED},
+	{"NT_STATUS_TRANSACTION_INVALID_ID",
+	 NT_STATUS_TRANSACTION_INVALID_ID},
+	{"NT_STATUS_TRANSACTION_INVALID_TYPE",
+	 NT_STATUS_TRANSACTION_INVALID_TYPE},
+	{"NT_STATUS_NOT_SERVER_SESSION", NT_STATUS_NOT_SERVER_SESSION},
+	{"NT_STATUS_NOT_CLIENT_SESSION", NT_STATUS_NOT_CLIENT_SESSION},
+	{"NT_STATUS_CANNOT_LOAD_REGISTRY_FILE",
+	 NT_STATUS_CANNOT_LOAD_REGISTRY_FILE},
+	{"NT_STATUS_DEBUG_ATTACH_FAILED", NT_STATUS_DEBUG_ATTACH_FAILED},
+	{"NT_STATUS_SYSTEM_PROCESS_TERMINATED",
+	 NT_STATUS_SYSTEM_PROCESS_TERMINATED},
+	{"NT_STATUS_DATA_NOT_ACCEPTED", NT_STATUS_DATA_NOT_ACCEPTED},
+	{"NT_STATUS_NO_BROWSER_SERVERS_FOUND",
+	 NT_STATUS_NO_BROWSER_SERVERS_FOUND},
+	{"NT_STATUS_VDM_HARD_ERROR", NT_STATUS_VDM_HARD_ERROR},
+	{"NT_STATUS_DRIVER_CANCEL_TIMEOUT",
+	 NT_STATUS_DRIVER_CANCEL_TIMEOUT},
+	{"NT_STATUS_REPLY_MESSAGE_MISMATCH",
+	 NT_STATUS_REPLY_MESSAGE_MISMATCH},
+	{"NT_STATUS_MAPPED_ALIGNMENT", NT_STATUS_MAPPED_ALIGNMENT},
+	{"NT_STATUS_IMAGE_CHECKSUM_MISMATCH",
+	 NT_STATUS_IMAGE_CHECKSUM_MISMATCH},
+	{"NT_STATUS_LOST_WRITEBEHIND_DATA",
+	 NT_STATUS_LOST_WRITEBEHIND_DATA},
+	{"NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID",
+	 NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID},
+	{"NT_STATUS_PASSWORD_MUST_CHANGE", NT_STATUS_PASSWORD_MUST_CHANGE},
+	{"NT_STATUS_NOT_FOUND", NT_STATUS_NOT_FOUND},
+	{"NT_STATUS_NOT_TINY_STREAM", NT_STATUS_NOT_TINY_STREAM},
+	{"NT_STATUS_RECOVERY_FAILURE", NT_STATUS_RECOVERY_FAILURE},
+	{"NT_STATUS_STACK_OVERFLOW_READ", NT_STATUS_STACK_OVERFLOW_READ},
+	{"NT_STATUS_FAIL_CHECK", NT_STATUS_FAIL_CHECK},
+	{"NT_STATUS_DUPLICATE_OBJECTID", NT_STATUS_DUPLICATE_OBJECTID},
+	{"NT_STATUS_OBJECTID_EXISTS", NT_STATUS_OBJECTID_EXISTS},
+	{"NT_STATUS_CONVERT_TO_LARGE", NT_STATUS_CONVERT_TO_LARGE},
+	{"NT_STATUS_RETRY", NT_STATUS_RETRY},
+	{"NT_STATUS_FOUND_OUT_OF_SCOPE", NT_STATUS_FOUND_OUT_OF_SCOPE},
+	{"NT_STATUS_ALLOCATE_BUCKET", NT_STATUS_ALLOCATE_BUCKET},
+	{"NT_STATUS_PROPSET_NOT_FOUND", NT_STATUS_PROPSET_NOT_FOUND},
+	{"NT_STATUS_MARSHALL_OVERFLOW", NT_STATUS_MARSHALL_OVERFLOW},
+	{"NT_STATUS_INVALID_VARIANT", NT_STATUS_INVALID_VARIANT},
+	{"NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND",
+	 NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND},
+	{"NT_STATUS_ACCOUNT_LOCKED_OUT", NT_STATUS_ACCOUNT_LOCKED_OUT},
+	{"NT_STATUS_HANDLE_NOT_CLOSABLE", NT_STATUS_HANDLE_NOT_CLOSABLE},
+	{"NT_STATUS_CONNECTION_REFUSED", NT_STATUS_CONNECTION_REFUSED},
+	{"NT_STATUS_GRACEFUL_DISCONNECT", NT_STATUS_GRACEFUL_DISCONNECT},
+	{"NT_STATUS_ADDRESS_ALREADY_ASSOCIATED",
+	 NT_STATUS_ADDRESS_ALREADY_ASSOCIATED},
+	{"NT_STATUS_ADDRESS_NOT_ASSOCIATED",
+	 NT_STATUS_ADDRESS_NOT_ASSOCIATED},
+	{"NT_STATUS_CONNECTION_INVALID", NT_STATUS_CONNECTION_INVALID},
+	{"NT_STATUS_CONNECTION_ACTIVE", NT_STATUS_CONNECTION_ACTIVE},
+	{"NT_STATUS_NETWORK_UNREACHABLE", NT_STATUS_NETWORK_UNREACHABLE},
+	{"NT_STATUS_HOST_UNREACHABLE", NT_STATUS_HOST_UNREACHABLE},
+	{"NT_STATUS_PROTOCOL_UNREACHABLE", NT_STATUS_PROTOCOL_UNREACHABLE},
+	{"NT_STATUS_PORT_UNREACHABLE", NT_STATUS_PORT_UNREACHABLE},
+	{"NT_STATUS_REQUEST_ABORTED", NT_STATUS_REQUEST_ABORTED},
+	{"NT_STATUS_CONNECTION_ABORTED", NT_STATUS_CONNECTION_ABORTED},
+	{"NT_STATUS_BAD_COMPRESSION_BUFFER",
+	 NT_STATUS_BAD_COMPRESSION_BUFFER},
+	{"NT_STATUS_USER_MAPPED_FILE", NT_STATUS_USER_MAPPED_FILE},
+	{"NT_STATUS_AUDIT_FAILED", NT_STATUS_AUDIT_FAILED},
+	{"NT_STATUS_TIMER_RESOLUTION_NOT_SET",
+	 NT_STATUS_TIMER_RESOLUTION_NOT_SET},
+	{"NT_STATUS_CONNECTION_COUNT_LIMIT",
+	 NT_STATUS_CONNECTION_COUNT_LIMIT},
+	{"NT_STATUS_LOGIN_TIME_RESTRICTION",
+	 NT_STATUS_LOGIN_TIME_RESTRICTION},
+	{"NT_STATUS_LOGIN_WKSTA_RESTRICTION",
+	 NT_STATUS_LOGIN_WKSTA_RESTRICTION},
+	{"NT_STATUS_IMAGE_MP_UP_MISMATCH", NT_STATUS_IMAGE_MP_UP_MISMATCH},
+	{"NT_STATUS_INSUFFICIENT_LOGON_INFO",
+	 NT_STATUS_INSUFFICIENT_LOGON_INFO},
+	{"NT_STATUS_BAD_DLL_ENTRYPOINT", NT_STATUS_BAD_DLL_ENTRYPOINT},
+	{"NT_STATUS_BAD_SERVICE_ENTRYPOINT",
+	 NT_STATUS_BAD_SERVICE_ENTRYPOINT},
+	{"NT_STATUS_LPC_REPLY_LOST", NT_STATUS_LPC_REPLY_LOST},
+	{"NT_STATUS_IP_ADDRESS_CONFLICT1", NT_STATUS_IP_ADDRESS_CONFLICT1},
+	{"NT_STATUS_IP_ADDRESS_CONFLICT2", NT_STATUS_IP_ADDRESS_CONFLICT2},
+	{"NT_STATUS_REGISTRY_QUOTA_LIMIT", NT_STATUS_REGISTRY_QUOTA_LIMIT},
+	{"NT_STATUS_PATH_NOT_COVERED", NT_STATUS_PATH_NOT_COVERED},
+	{"NT_STATUS_NO_CALLBACK_ACTIVE", NT_STATUS_NO_CALLBACK_ACTIVE},
+	{"NT_STATUS_LICENSE_QUOTA_EXCEEDED",
+	 NT_STATUS_LICENSE_QUOTA_EXCEEDED},
+	{"NT_STATUS_PWD_TOO_SHORT", NT_STATUS_PWD_TOO_SHORT},
+	{"NT_STATUS_PWD_TOO_RECENT", NT_STATUS_PWD_TOO_RECENT},
+	{"NT_STATUS_PWD_HISTORY_CONFLICT", NT_STATUS_PWD_HISTORY_CONFLICT},
+	{"NT_STATUS_PLUGPLAY_NO_DEVICE", NT_STATUS_PLUGPLAY_NO_DEVICE},
+	{"NT_STATUS_UNSUPPORTED_COMPRESSION",
+	 NT_STATUS_UNSUPPORTED_COMPRESSION},
+	{"NT_STATUS_INVALID_HW_PROFILE", NT_STATUS_INVALID_HW_PROFILE},
+	{"NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH",
+	 NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH},
+	{"NT_STATUS_DRIVER_ORDINAL_NOT_FOUND",
+	 NT_STATUS_DRIVER_ORDINAL_NOT_FOUND},
+	{"NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND",
+	 NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND},
+	{"NT_STATUS_RESOURCE_NOT_OWNED", NT_STATUS_RESOURCE_NOT_OWNED},
+	{"NT_STATUS_TOO_MANY_LINKS", NT_STATUS_TOO_MANY_LINKS},
+	{"NT_STATUS_QUOTA_LIST_INCONSISTENT",
+	 NT_STATUS_QUOTA_LIST_INCONSISTENT},
+	{"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE},
+	{"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES},
+	{"NT_STATUS_MORE_ENTRIES", NT_STATUS_MORE_ENTRIES},
+	{"NT_STATUS_SOME_UNMAPPED", NT_STATUS_SOME_UNMAPPED},
+	{NULL, 0}
+};
diff --git a/kernel/fs/cifs/nterr.h b/kernel/fs/cifs/nterr.h
new file mode 100644
index 000000000..7a0eae5ae
--- /dev/null
+++ b/kernel/fs/cifs/nterr.h
@@ -0,0 +1,563 @@
+/*
+   Unix SMB/Netbios implementation.
+   Version 1.9.
+   NT error code constants
+   Copyright (C) Andrew Tridgell              1992-2000
+   Copyright (C) John H Terpstra              1996-2000
+   Copyright (C) Luke Kenneth Casson Leighton 1996-2000
+   Copyright (C) Paul Ashton                  1998-2000
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+
+
+#ifndef _NTERR_H
+#define _NTERR_H
+
+struct nt_err_code_struct {
+	char *nt_errstr;
+	__u32 nt_errcode;
+};
+
+extern const struct nt_err_code_struct nt_errs[];
+
+/* Win32 Status codes. */
+#define NT_STATUS_MORE_ENTRIES         0x0105
+#define NT_ERROR_INVALID_PARAMETER     0x0057
+#define NT_ERROR_INSUFFICIENT_BUFFER   0x007a
+#define NT_STATUS_1804                 0x070c
+#define NT_STATUS_NOTIFY_ENUM_DIR      0x010c
+
+/*
+ * Win32 Error codes extracted using a loop in smbclient then printing a netmon
+ * sniff to a file.
+ */
+
+#define NT_STATUS_OK                   0x0000
+#define NT_STATUS_SOME_UNMAPPED        0x0107
+#define NT_STATUS_BUFFER_OVERFLOW  0x80000005
+#define NT_STATUS_NO_MORE_ENTRIES  0x8000001a
+#define NT_STATUS_MEDIA_CHANGED    0x8000001c
+#define NT_STATUS_END_OF_MEDIA     0x8000001e
+#define NT_STATUS_MEDIA_CHECK      0x80000020
+#define NT_STATUS_NO_DATA_DETECTED 0x8000001c
+#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d
+#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288
+#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000288
+#define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001
+#define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002
+#define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003
+#define NT_STATUS_INFO_LENGTH_MISMATCH 0xC0000000 | 0x0004
+#define NT_STATUS_ACCESS_VIOLATION 0xC0000000 | 0x0005
+#define NT_STATUS_IN_PAGE_ERROR 0xC0000000 | 0x0006
+#define NT_STATUS_PAGEFILE_QUOTA 0xC0000000 | 0x0007
+#define NT_STATUS_INVALID_HANDLE 0xC0000000 | 0x0008
+#define NT_STATUS_BAD_INITIAL_STACK 0xC0000000 | 0x0009
+#define NT_STATUS_BAD_INITIAL_PC 0xC0000000 | 0x000a
+#define NT_STATUS_INVALID_CID 0xC0000000 | 0x000b
+#define NT_STATUS_TIMER_NOT_CANCELED 0xC0000000 | 0x000c
+#define NT_STATUS_INVALID_PARAMETER 0xC0000000 | 0x000d
+#define NT_STATUS_NO_SUCH_DEVICE 0xC0000000 | 0x000e
+#define NT_STATUS_NO_SUCH_FILE 0xC0000000 | 0x000f
+#define NT_STATUS_INVALID_DEVICE_REQUEST 0xC0000000 | 0x0010
+#define NT_STATUS_END_OF_FILE 0xC0000000 | 0x0011
+#define NT_STATUS_WRONG_VOLUME 0xC0000000 | 0x0012
+#define NT_STATUS_NO_MEDIA_IN_DEVICE 0xC0000000 | 0x0013
+#define NT_STATUS_UNRECOGNIZED_MEDIA 0xC0000000 | 0x0014
+#define NT_STATUS_NONEXISTENT_SECTOR 0xC0000000 | 0x0015
+#define NT_STATUS_MORE_PROCESSING_REQUIRED 0xC0000000 | 0x0016
+#define NT_STATUS_NO_MEMORY 0xC0000000 | 0x0017
+#define NT_STATUS_CONFLICTING_ADDRESSES 0xC0000000 | 0x0018
+#define NT_STATUS_NOT_MAPPED_VIEW 0xC0000000 | 0x0019
+#define NT_STATUS_UNABLE_TO_FREE_VM 0x80000000 | 0x001a
+#define NT_STATUS_UNABLE_TO_DELETE_SECTION 0xC0000000 | 0x001b
+#define NT_STATUS_INVALID_SYSTEM_SERVICE 0xC0000000 | 0x001c
+#define NT_STATUS_ILLEGAL_INSTRUCTION 0xC0000000 | 0x001d
+#define NT_STATUS_INVALID_LOCK_SEQUENCE 0xC0000000 | 0x001e
+#define NT_STATUS_INVALID_VIEW_SIZE 0xC0000000 | 0x001f
+#define NT_STATUS_INVALID_FILE_FOR_SECTION 0xC0000000 | 0x0020
+#define NT_STATUS_ALREADY_COMMITTED 0xC0000000 | 0x0021
+#define NT_STATUS_ACCESS_DENIED 0xC0000000 | 0x0022
+#define NT_STATUS_BUFFER_TOO_SMALL 0xC0000000 | 0x0023
+#define NT_STATUS_OBJECT_TYPE_MISMATCH 0xC0000000 | 0x0024
+#define NT_STATUS_NONCONTINUABLE_EXCEPTION 0xC0000000 | 0x0025
+#define NT_STATUS_INVALID_DISPOSITION 0xC0000000 | 0x0026
+#define NT_STATUS_UNWIND 0xC0000000 | 0x0027
+#define NT_STATUS_BAD_STACK 0xC0000000 | 0x0028
+#define NT_STATUS_INVALID_UNWIND_TARGET 0xC0000000 | 0x0029
+#define NT_STATUS_NOT_LOCKED 0xC0000000 | 0x002a
+#define NT_STATUS_PARITY_ERROR 0xC0000000 | 0x002b
+#define NT_STATUS_UNABLE_TO_DECOMMIT_VM 0xC0000000 | 0x002c
+#define NT_STATUS_NOT_COMMITTED 0xC0000000 | 0x002d
+#define NT_STATUS_INVALID_PORT_ATTRIBUTES 0xC0000000 | 0x002e
+#define NT_STATUS_PORT_MESSAGE_TOO_LONG 0xC0000000 | 0x002f
+#define NT_STATUS_INVALID_PARAMETER_MIX 0xC0000000 | 0x0030
+#define NT_STATUS_INVALID_QUOTA_LOWER 0xC0000000 | 0x0031
+#define NT_STATUS_DISK_CORRUPT_ERROR 0xC0000000 | 0x0032
+#define NT_STATUS_OBJECT_NAME_INVALID 0xC0000000 | 0x0033
+#define NT_STATUS_OBJECT_NAME_NOT_FOUND 0xC0000000 | 0x0034
+#define NT_STATUS_OBJECT_NAME_COLLISION 0xC0000000 | 0x0035
+#define NT_STATUS_HANDLE_NOT_WAITABLE 0xC0000000 | 0x0036
+#define NT_STATUS_PORT_DISCONNECTED 0xC0000000 | 0x0037
+#define NT_STATUS_DEVICE_ALREADY_ATTACHED 0xC0000000 | 0x0038
+#define NT_STATUS_OBJECT_PATH_INVALID 0xC0000000 | 0x0039
+#define NT_STATUS_OBJECT_PATH_NOT_FOUND 0xC0000000 | 0x003a
+#define NT_STATUS_OBJECT_PATH_SYNTAX_BAD 0xC0000000 | 0x003b
+#define NT_STATUS_DATA_OVERRUN 0xC0000000 | 0x003c
+#define NT_STATUS_DATA_LATE_ERROR 0xC0000000 | 0x003d
+#define NT_STATUS_DATA_ERROR 0xC0000000 | 0x003e
+#define NT_STATUS_CRC_ERROR 0xC0000000 | 0x003f
+#define NT_STATUS_SECTION_TOO_BIG 0xC0000000 | 0x0040
+#define NT_STATUS_PORT_CONNECTION_REFUSED 0xC0000000 | 0x0041
+#define NT_STATUS_INVALID_PORT_HANDLE 0xC0000000 | 0x0042
+#define NT_STATUS_SHARING_VIOLATION 0xC0000000 | 0x0043
+#define NT_STATUS_QUOTA_EXCEEDED 0xC0000000 | 0x0044
+#define NT_STATUS_INVALID_PAGE_PROTECTION 0xC0000000 | 0x0045
+#define NT_STATUS_MUTANT_NOT_OWNED 0xC0000000 | 0x0046
+#define NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED 0xC0000000 | 0x0047
+#define NT_STATUS_PORT_ALREADY_SET 0xC0000000 | 0x0048
+#define NT_STATUS_SECTION_NOT_IMAGE 0xC0000000 | 0x0049
+#define NT_STATUS_SUSPEND_COUNT_EXCEEDED 0xC0000000 | 0x004a
+#define NT_STATUS_THREAD_IS_TERMINATING 0xC0000000 | 0x004b
+#define NT_STATUS_BAD_WORKING_SET_LIMIT 0xC0000000 | 0x004c
+#define NT_STATUS_INCOMPATIBLE_FILE_MAP 0xC0000000 | 0x004d
+#define NT_STATUS_SECTION_PROTECTION 0xC0000000 | 0x004e
+#define NT_STATUS_EAS_NOT_SUPPORTED 0xC0000000 | 0x004f
+#define NT_STATUS_EA_TOO_LARGE 0xC0000000 | 0x0050
+#define NT_STATUS_NONEXISTENT_EA_ENTRY 0xC0000000 | 0x0051
+#define NT_STATUS_NO_EAS_ON_FILE 0xC0000000 | 0x0052
+#define NT_STATUS_EA_CORRUPT_ERROR 0xC0000000 | 0x0053
+#define NT_STATUS_FILE_LOCK_CONFLICT 0xC0000000 | 0x0054
+#define NT_STATUS_LOCK_NOT_GRANTED 0xC0000000 | 0x0055
+#define NT_STATUS_DELETE_PENDING 0xC0000000 | 0x0056
+#define NT_STATUS_CTL_FILE_NOT_SUPPORTED 0xC0000000 | 0x0057
+#define NT_STATUS_UNKNOWN_REVISION 0xC0000000 | 0x0058
+#define NT_STATUS_REVISION_MISMATCH 0xC0000000 | 0x0059
+#define NT_STATUS_INVALID_OWNER 0xC0000000 | 0x005a
+#define NT_STATUS_INVALID_PRIMARY_GROUP 0xC0000000 | 0x005b
+#define NT_STATUS_NO_IMPERSONATION_TOKEN 0xC0000000 | 0x005c
+#define NT_STATUS_CANT_DISABLE_MANDATORY 0xC0000000 | 0x005d
+#define NT_STATUS_NO_LOGON_SERVERS 0xC0000000 | 0x005e
+#define NT_STATUS_NO_SUCH_LOGON_SESSION 0xC0000000 | 0x005f
+#define NT_STATUS_NO_SUCH_PRIVILEGE 0xC0000000 | 0x0060
+#define NT_STATUS_PRIVILEGE_NOT_HELD 0xC0000000 | 0x0061
+#define NT_STATUS_INVALID_ACCOUNT_NAME 0xC0000000 | 0x0062
+#define NT_STATUS_USER_EXISTS 0xC0000000 | 0x0063
+#define NT_STATUS_NO_SUCH_USER 0xC0000000 | 0x0064
+#define NT_STATUS_GROUP_EXISTS 0xC0000000 | 0x0065
+#define NT_STATUS_NO_SUCH_GROUP 0xC0000000 | 0x0066
+#define NT_STATUS_MEMBER_IN_GROUP 0xC0000000 | 0x0067
+#define NT_STATUS_MEMBER_NOT_IN_GROUP 0xC0000000 | 0x0068
+#define NT_STATUS_LAST_ADMIN 0xC0000000 | 0x0069
+#define NT_STATUS_WRONG_PASSWORD 0xC0000000 | 0x006a
+#define NT_STATUS_ILL_FORMED_PASSWORD 0xC0000000 | 0x006b
+#define NT_STATUS_PASSWORD_RESTRICTION 0xC0000000 | 0x006c
+#define NT_STATUS_LOGON_FAILURE 0xC0000000 | 0x006d
+#define NT_STATUS_ACCOUNT_RESTRICTION 0xC0000000 | 0x006e
+#define NT_STATUS_INVALID_LOGON_HOURS 0xC0000000 | 0x006f
+#define NT_STATUS_INVALID_WORKSTATION 0xC0000000 | 0x0070
+#define NT_STATUS_PASSWORD_EXPIRED 0xC0000000 | 0x0071
+#define NT_STATUS_ACCOUNT_DISABLED 0xC0000000 | 0x0072
+#define NT_STATUS_NONE_MAPPED 0xC0000000 | 0x0073
+#define NT_STATUS_TOO_MANY_LUIDS_REQUESTED 0xC0000000 | 0x0074
+#define NT_STATUS_LUIDS_EXHAUSTED 0xC0000000 | 0x0075
+#define NT_STATUS_INVALID_SUB_AUTHORITY 0xC0000000 | 0x0076
+#define NT_STATUS_INVALID_ACL 0xC0000000 | 0x0077
+#define NT_STATUS_INVALID_SID 0xC0000000 | 0x0078
+#define NT_STATUS_INVALID_SECURITY_DESCR 0xC0000000 | 0x0079
+#define NT_STATUS_PROCEDURE_NOT_FOUND 0xC0000000 | 0x007a
+#define NT_STATUS_INVALID_IMAGE_FORMAT 0xC0000000 | 0x007b
+#define NT_STATUS_NO_TOKEN 0xC0000000 | 0x007c
+#define NT_STATUS_BAD_INHERITANCE_ACL 0xC0000000 | 0x007d
+#define NT_STATUS_RANGE_NOT_LOCKED 0xC0000000 | 0x007e
+#define NT_STATUS_DISK_FULL 0xC0000000 | 0x007f
+#define NT_STATUS_SERVER_DISABLED 0xC0000000 | 0x0080
+#define NT_STATUS_SERVER_NOT_DISABLED 0xC0000000 | 0x0081
+#define NT_STATUS_TOO_MANY_GUIDS_REQUESTED 0xC0000000 | 0x0082
+#define NT_STATUS_GUIDS_EXHAUSTED 0xC0000000 | 0x0083
+#define NT_STATUS_INVALID_ID_AUTHORITY 0xC0000000 | 0x0084
+#define NT_STATUS_AGENTS_EXHAUSTED 0xC0000000 | 0x0085
+#define NT_STATUS_INVALID_VOLUME_LABEL 0xC0000000 | 0x0086
+#define NT_STATUS_SECTION_NOT_EXTENDED 0xC0000000 | 0x0087
+#define NT_STATUS_NOT_MAPPED_DATA 0xC0000000 | 0x0088
+#define NT_STATUS_RESOURCE_DATA_NOT_FOUND 0xC0000000 | 0x0089
+#define NT_STATUS_RESOURCE_TYPE_NOT_FOUND 0xC0000000 | 0x008a
+#define NT_STATUS_RESOURCE_NAME_NOT_FOUND 0xC0000000 | 0x008b
+#define NT_STATUS_ARRAY_BOUNDS_EXCEEDED 0xC0000000 | 0x008c
+#define NT_STATUS_FLOAT_DENORMAL_OPERAND 0xC0000000 | 0x008d
+#define NT_STATUS_FLOAT_DIVIDE_BY_ZERO 0xC0000000 | 0x008e
+#define NT_STATUS_FLOAT_INEXACT_RESULT 0xC0000000 | 0x008f
+#define NT_STATUS_FLOAT_INVALID_OPERATION 0xC0000000 | 0x0090
+#define NT_STATUS_FLOAT_OVERFLOW 0xC0000000 | 0x0091
+#define NT_STATUS_FLOAT_STACK_CHECK 0xC0000000 | 0x0092
+#define NT_STATUS_FLOAT_UNDERFLOW 0xC0000000 | 0x0093
+#define NT_STATUS_INTEGER_DIVIDE_BY_ZERO 0xC0000000 | 0x0094
+#define NT_STATUS_INTEGER_OVERFLOW 0xC0000000 | 0x0095
+#define NT_STATUS_PRIVILEGED_INSTRUCTION 0xC0000000 | 0x0096
+#define NT_STATUS_TOO_MANY_PAGING_FILES 0xC0000000 | 0x0097
+#define NT_STATUS_FILE_INVALID 0xC0000000 | 0x0098
+#define NT_STATUS_ALLOTTED_SPACE_EXCEEDED 0xC0000000 | 0x0099
+#define NT_STATUS_INSUFFICIENT_RESOURCES 0xC0000000 | 0x009a
+#define NT_STATUS_DFS_EXIT_PATH_FOUND 0xC0000000 | 0x009b
+#define NT_STATUS_DEVICE_DATA_ERROR 0xC0000000 | 0x009c
+#define NT_STATUS_DEVICE_NOT_CONNECTED 0xC0000000 | 0x009d
+#define NT_STATUS_DEVICE_POWER_FAILURE 0xC0000000 | 0x009e
+#define NT_STATUS_FREE_VM_NOT_AT_BASE 0xC0000000 | 0x009f
+#define NT_STATUS_MEMORY_NOT_ALLOCATED 0xC0000000 | 0x00a0
+#define NT_STATUS_WORKING_SET_QUOTA 0xC0000000 | 0x00a1
+#define NT_STATUS_MEDIA_WRITE_PROTECTED 0xC0000000 | 0x00a2
+#define NT_STATUS_DEVICE_NOT_READY 0xC0000000 | 0x00a3
+#define NT_STATUS_INVALID_GROUP_ATTRIBUTES 0xC0000000 | 0x00a4
+#define NT_STATUS_BAD_IMPERSONATION_LEVEL 0xC0000000 | 0x00a5
+#define NT_STATUS_CANT_OPEN_ANONYMOUS 0xC0000000 | 0x00a6
+#define NT_STATUS_BAD_VALIDATION_CLASS 0xC0000000 | 0x00a7
+#define NT_STATUS_BAD_TOKEN_TYPE 0xC0000000 | 0x00a8
+#define NT_STATUS_BAD_MASTER_BOOT_RECORD 0xC0000000 | 0x00a9
+#define NT_STATUS_INSTRUCTION_MISALIGNMENT 0xC0000000 | 0x00aa
+#define NT_STATUS_INSTANCE_NOT_AVAILABLE 0xC0000000 | 0x00ab
+#define NT_STATUS_PIPE_NOT_AVAILABLE 0xC0000000 | 0x00ac
+#define NT_STATUS_INVALID_PIPE_STATE 0xC0000000 | 0x00ad
+#define NT_STATUS_PIPE_BUSY 0xC0000000 | 0x00ae
+#define NT_STATUS_ILLEGAL_FUNCTION 0xC0000000 | 0x00af
+#define NT_STATUS_PIPE_DISCONNECTED 0xC0000000 | 0x00b0
+#define NT_STATUS_PIPE_CLOSING 0xC0000000 | 0x00b1
+#define NT_STATUS_PIPE_CONNECTED 0xC0000000 | 0x00b2
+#define NT_STATUS_PIPE_LISTENING 0xC0000000 | 0x00b3
+#define NT_STATUS_INVALID_READ_MODE 0xC0000000 | 0x00b4
+#define NT_STATUS_IO_TIMEOUT 0xC0000000 | 0x00b5
+#define NT_STATUS_FILE_FORCED_CLOSED 0xC0000000 | 0x00b6
+#define NT_STATUS_PROFILING_NOT_STARTED 0xC0000000 | 0x00b7
+#define NT_STATUS_PROFILING_NOT_STOPPED 0xC0000000 | 0x00b8
+#define NT_STATUS_COULD_NOT_INTERPRET 0xC0000000 | 0x00b9
+#define NT_STATUS_FILE_IS_A_DIRECTORY 0xC0000000 | 0x00ba
+#define NT_STATUS_NOT_SUPPORTED 0xC0000000 | 0x00bb
+#define NT_STATUS_REMOTE_NOT_LISTENING 0xC0000000 | 0x00bc
+#define NT_STATUS_DUPLICATE_NAME 0xC0000000 | 0x00bd
+#define NT_STATUS_BAD_NETWORK_PATH 0xC0000000 | 0x00be
+#define NT_STATUS_NETWORK_BUSY 0xC0000000 | 0x00bf
+#define NT_STATUS_DEVICE_DOES_NOT_EXIST 0xC0000000 | 0x00c0
+#define NT_STATUS_TOO_MANY_COMMANDS 0xC0000000 | 0x00c1
+#define NT_STATUS_ADAPTER_HARDWARE_ERROR 0xC0000000 | 0x00c2
+#define NT_STATUS_INVALID_NETWORK_RESPONSE 0xC0000000 | 0x00c3
+#define NT_STATUS_UNEXPECTED_NETWORK_ERROR 0xC0000000 | 0x00c4
+#define NT_STATUS_BAD_REMOTE_ADAPTER 0xC0000000 | 0x00c5
+#define NT_STATUS_PRINT_QUEUE_FULL 0xC0000000 | 0x00c6
+#define NT_STATUS_NO_SPOOL_SPACE 0xC0000000 | 0x00c7
+#define NT_STATUS_PRINT_CANCELLED 0xC0000000 | 0x00c8
+#define NT_STATUS_NETWORK_NAME_DELETED 0xC0000000 | 0x00c9
+#define NT_STATUS_NETWORK_ACCESS_DENIED 0xC0000000 | 0x00ca
+#define NT_STATUS_BAD_DEVICE_TYPE 0xC0000000 | 0x00cb
+#define NT_STATUS_BAD_NETWORK_NAME 0xC0000000 | 0x00cc
+#define NT_STATUS_TOO_MANY_NAMES 0xC0000000 | 0x00cd
+#define NT_STATUS_TOO_MANY_SESSIONS 0xC0000000 | 0x00ce
+#define NT_STATUS_SHARING_PAUSED 0xC0000000 | 0x00cf
+#define NT_STATUS_REQUEST_NOT_ACCEPTED 0xC0000000 | 0x00d0
+#define NT_STATUS_REDIRECTOR_PAUSED 0xC0000000 | 0x00d1
+#define NT_STATUS_NET_WRITE_FAULT 0xC0000000 | 0x00d2
+#define NT_STATUS_PROFILING_AT_LIMIT 0xC0000000 | 0x00d3
+#define NT_STATUS_NOT_SAME_DEVICE 0xC0000000 | 0x00d4
+#define NT_STATUS_FILE_RENAMED 0xC0000000 | 0x00d5
+#define NT_STATUS_VIRTUAL_CIRCUIT_CLOSED 0xC0000000 | 0x00d6
+#define NT_STATUS_NO_SECURITY_ON_OBJECT 0xC0000000 | 0x00d7
+#define NT_STATUS_CANT_WAIT 0xC0000000 | 0x00d8
+#define NT_STATUS_PIPE_EMPTY 0xC0000000 | 0x00d9
+#define NT_STATUS_CANT_ACCESS_DOMAIN_INFO 0xC0000000 | 0x00da
+#define NT_STATUS_CANT_TERMINATE_SELF 0xC0000000 | 0x00db
+#define NT_STATUS_INVALID_SERVER_STATE 0xC0000000 | 0x00dc
+#define NT_STATUS_INVALID_DOMAIN_STATE 0xC0000000 | 0x00dd
+#define NT_STATUS_INVALID_DOMAIN_ROLE 0xC0000000 | 0x00de
+#define NT_STATUS_NO_SUCH_DOMAIN 0xC0000000 | 0x00df
+#define NT_STATUS_DOMAIN_EXISTS 0xC0000000 | 0x00e0
+#define NT_STATUS_DOMAIN_LIMIT_EXCEEDED 0xC0000000 | 0x00e1
+#define NT_STATUS_OPLOCK_NOT_GRANTED 0xC0000000 | 0x00e2
+#define NT_STATUS_INVALID_OPLOCK_PROTOCOL 0xC0000000 | 0x00e3
+#define NT_STATUS_INTERNAL_DB_CORRUPTION 0xC0000000 | 0x00e4
+#define NT_STATUS_INTERNAL_ERROR 0xC0000000 | 0x00e5
+#define NT_STATUS_GENERIC_NOT_MAPPED 0xC0000000 | 0x00e6
+#define NT_STATUS_BAD_DESCRIPTOR_FORMAT 0xC0000000 | 0x00e7
+#define NT_STATUS_INVALID_USER_BUFFER 0xC0000000 | 0x00e8
+#define NT_STATUS_UNEXPECTED_IO_ERROR 0xC0000000 | 0x00e9
+#define NT_STATUS_UNEXPECTED_MM_CREATE_ERR 0xC0000000 | 0x00ea
+#define NT_STATUS_UNEXPECTED_MM_MAP_ERROR 0xC0000000 | 0x00eb
+#define NT_STATUS_UNEXPECTED_MM_EXTEND_ERR 0xC0000000 | 0x00ec
+#define NT_STATUS_NOT_LOGON_PROCESS 0xC0000000 | 0x00ed
+#define NT_STATUS_LOGON_SESSION_EXISTS 0xC0000000 | 0x00ee
+#define NT_STATUS_INVALID_PARAMETER_1 0xC0000000 | 0x00ef
+#define NT_STATUS_INVALID_PARAMETER_2 0xC0000000 | 0x00f0
+#define NT_STATUS_INVALID_PARAMETER_3 0xC0000000 | 0x00f1
+#define NT_STATUS_INVALID_PARAMETER_4 0xC0000000 | 0x00f2
+#define NT_STATUS_INVALID_PARAMETER_5 0xC0000000 | 0x00f3
+#define NT_STATUS_INVALID_PARAMETER_6 0xC0000000 | 0x00f4
+#define NT_STATUS_INVALID_PARAMETER_7 0xC0000000 | 0x00f5
+#define NT_STATUS_INVALID_PARAMETER_8 0xC0000000 | 0x00f6
+#define NT_STATUS_INVALID_PARAMETER_9 0xC0000000 | 0x00f7
+#define NT_STATUS_INVALID_PARAMETER_10 0xC0000000 | 0x00f8
+#define NT_STATUS_INVALID_PARAMETER_11 0xC0000000 | 0x00f9
+#define NT_STATUS_INVALID_PARAMETER_12 0xC0000000 | 0x00fa
+#define NT_STATUS_REDIRECTOR_NOT_STARTED 0xC0000000 | 0x00fb
+#define NT_STATUS_REDIRECTOR_STARTED 0xC0000000 | 0x00fc
+#define NT_STATUS_STACK_OVERFLOW 0xC0000000 | 0x00fd
+#define NT_STATUS_NO_SUCH_PACKAGE 0xC0000000 | 0x00fe
+#define NT_STATUS_BAD_FUNCTION_TABLE 0xC0000000 | 0x00ff
+#define NT_STATUS_DIRECTORY_NOT_EMPTY 0xC0000000 | 0x0101
+#define NT_STATUS_FILE_CORRUPT_ERROR 0xC0000000 | 0x0102
+#define NT_STATUS_NOT_A_DIRECTORY 0xC0000000 | 0x0103
+#define NT_STATUS_BAD_LOGON_SESSION_STATE 0xC0000000 | 0x0104
+#define NT_STATUS_LOGON_SESSION_COLLISION 0xC0000000 | 0x0105
+#define NT_STATUS_NAME_TOO_LONG 0xC0000000 | 0x0106
+#define NT_STATUS_FILES_OPEN 0xC0000000 | 0x0107
+#define NT_STATUS_CONNECTION_IN_USE 0xC0000000 | 0x0108
+#define NT_STATUS_MESSAGE_NOT_FOUND 0xC0000000 | 0x0109
+#define NT_STATUS_PROCESS_IS_TERMINATING 0xC0000000 | 0x010a
+#define NT_STATUS_INVALID_LOGON_TYPE 0xC0000000 | 0x010b
+#define NT_STATUS_NO_GUID_TRANSLATION 0xC0000000 | 0x010c
+#define NT_STATUS_CANNOT_IMPERSONATE 0xC0000000 | 0x010d
+#define NT_STATUS_IMAGE_ALREADY_LOADED 0xC0000000 | 0x010e
+#define NT_STATUS_ABIOS_NOT_PRESENT 0xC0000000 | 0x010f
+#define NT_STATUS_ABIOS_LID_NOT_EXIST 0xC0000000 | 0x0110
+#define NT_STATUS_ABIOS_LID_ALREADY_OWNED 0xC0000000 | 0x0111
+#define NT_STATUS_ABIOS_NOT_LID_OWNER 0xC0000000 | 0x0112
+#define NT_STATUS_ABIOS_INVALID_COMMAND 0xC0000000 | 0x0113
+#define NT_STATUS_ABIOS_INVALID_LID 0xC0000000 | 0x0114
+#define NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE 0xC0000000 | 0x0115
+#define NT_STATUS_ABIOS_INVALID_SELECTOR 0xC0000000 | 0x0116
+#define NT_STATUS_NO_LDT 0xC0000000 | 0x0117
+#define NT_STATUS_INVALID_LDT_SIZE 0xC0000000 | 0x0118
+#define NT_STATUS_INVALID_LDT_OFFSET 0xC0000000 | 0x0119
+#define NT_STATUS_INVALID_LDT_DESCRIPTOR 0xC0000000 | 0x011a
+#define NT_STATUS_INVALID_IMAGE_NE_FORMAT 0xC0000000 | 0x011b
+#define NT_STATUS_RXACT_INVALID_STATE 0xC0000000 | 0x011c
+#define NT_STATUS_RXACT_COMMIT_FAILURE 0xC0000000 | 0x011d
+#define NT_STATUS_MAPPED_FILE_SIZE_ZERO 0xC0000000 | 0x011e
+#define NT_STATUS_TOO_MANY_OPENED_FILES 0xC0000000 | 0x011f
+#define NT_STATUS_CANCELLED 0xC0000000 | 0x0120
+#define NT_STATUS_CANNOT_DELETE 0xC0000000 | 0x0121
+#define NT_STATUS_INVALID_COMPUTER_NAME 0xC0000000 | 0x0122
+#define NT_STATUS_FILE_DELETED 0xC0000000 | 0x0123
+#define NT_STATUS_SPECIAL_ACCOUNT 0xC0000000 | 0x0124
+#define NT_STATUS_SPECIAL_GROUP 0xC0000000 | 0x0125
+#define NT_STATUS_SPECIAL_USER 0xC0000000 | 0x0126
+#define NT_STATUS_MEMBERS_PRIMARY_GROUP 0xC0000000 | 0x0127
+#define NT_STATUS_FILE_CLOSED 0xC0000000 | 0x0128
+#define NT_STATUS_TOO_MANY_THREADS 0xC0000000 | 0x0129
+#define NT_STATUS_THREAD_NOT_IN_PROCESS 0xC0000000 | 0x012a
+#define NT_STATUS_TOKEN_ALREADY_IN_USE 0xC0000000 | 0x012b
+#define NT_STATUS_PAGEFILE_QUOTA_EXCEEDED 0xC0000000 | 0x012c
+#define NT_STATUS_COMMITMENT_LIMIT 0xC0000000 | 0x012d
+#define NT_STATUS_INVALID_IMAGE_LE_FORMAT 0xC0000000 | 0x012e
+#define NT_STATUS_INVALID_IMAGE_NOT_MZ 0xC0000000 | 0x012f
+#define NT_STATUS_INVALID_IMAGE_PROTECT 0xC0000000 | 0x0130
+#define NT_STATUS_INVALID_IMAGE_WIN_16 0xC0000000 | 0x0131
+#define NT_STATUS_LOGON_SERVER_CONFLICT 0xC0000000 | 0x0132
+#define NT_STATUS_TIME_DIFFERENCE_AT_DC 0xC0000000 | 0x0133
+#define NT_STATUS_SYNCHRONIZATION_REQUIRED 0xC0000000 | 0x0134
+#define NT_STATUS_DLL_NOT_FOUND 0xC0000000 | 0x0135
+#define NT_STATUS_OPEN_FAILED 0xC0000000 | 0x0136
+#define NT_STATUS_IO_PRIVILEGE_FAILED 0xC0000000 | 0x0137
+#define NT_STATUS_ORDINAL_NOT_FOUND 0xC0000000 | 0x0138
+#define NT_STATUS_ENTRYPOINT_NOT_FOUND 0xC0000000 | 0x0139
+#define NT_STATUS_CONTROL_C_EXIT 0xC0000000 | 0x013a
+#define NT_STATUS_LOCAL_DISCONNECT 0xC0000000 | 0x013b
+#define NT_STATUS_REMOTE_DISCONNECT 0xC0000000 | 0x013c
+#define NT_STATUS_REMOTE_RESOURCES 0xC0000000 | 0x013d
+#define NT_STATUS_LINK_FAILED 0xC0000000 | 0x013e
+#define NT_STATUS_LINK_TIMEOUT 0xC0000000 | 0x013f
+#define NT_STATUS_INVALID_CONNECTION 0xC0000000 | 0x0140
+#define NT_STATUS_INVALID_ADDRESS 0xC0000000 | 0x0141
+#define NT_STATUS_DLL_INIT_FAILED 0xC0000000 | 0x0142
+#define NT_STATUS_MISSING_SYSTEMFILE 0xC0000000 | 0x0143
+#define NT_STATUS_UNHANDLED_EXCEPTION 0xC0000000 | 0x0144
+#define NT_STATUS_APP_INIT_FAILURE 0xC0000000 | 0x0145
+#define NT_STATUS_PAGEFILE_CREATE_FAILED 0xC0000000 | 0x0146
+#define NT_STATUS_NO_PAGEFILE 0xC0000000 | 0x0147
+#define NT_STATUS_INVALID_LEVEL 0xC0000000 | 0x0148
+#define NT_STATUS_WRONG_PASSWORD_CORE 0xC0000000 | 0x0149
+#define NT_STATUS_ILLEGAL_FLOAT_CONTEXT 0xC0000000 | 0x014a
+#define NT_STATUS_PIPE_BROKEN 0xC0000000 | 0x014b
+#define NT_STATUS_REGISTRY_CORRUPT 0xC0000000 | 0x014c
+#define NT_STATUS_REGISTRY_IO_FAILED 0xC0000000 | 0x014d
+#define NT_STATUS_NO_EVENT_PAIR 0xC0000000 | 0x014e
+#define NT_STATUS_UNRECOGNIZED_VOLUME 0xC0000000 | 0x014f
+#define NT_STATUS_SERIAL_NO_DEVICE_INITED 0xC0000000 | 0x0150
+#define NT_STATUS_NO_SUCH_ALIAS 0xC0000000 | 0x0151
+#define NT_STATUS_MEMBER_NOT_IN_ALIAS 0xC0000000 | 0x0152
+#define NT_STATUS_MEMBER_IN_ALIAS 0xC0000000 | 0x0153
+#define NT_STATUS_ALIAS_EXISTS 0xC0000000 | 0x0154
+#define NT_STATUS_LOGON_NOT_GRANTED 0xC0000000 | 0x0155
+#define NT_STATUS_TOO_MANY_SECRETS 0xC0000000 | 0x0156
+#define NT_STATUS_SECRET_TOO_LONG 0xC0000000 | 0x0157
+#define NT_STATUS_INTERNAL_DB_ERROR 0xC0000000 | 0x0158
+#define NT_STATUS_FULLSCREEN_MODE 0xC0000000 | 0x0159
+#define NT_STATUS_TOO_MANY_CONTEXT_IDS 0xC0000000 | 0x015a
+#define NT_STATUS_LOGON_TYPE_NOT_GRANTED 0xC0000000 | 0x015b
+#define NT_STATUS_NOT_REGISTRY_FILE 0xC0000000 | 0x015c
+#define NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED 0xC0000000 | 0x015d
+#define NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR 0xC0000000 | 0x015e
+#define NT_STATUS_FT_MISSING_MEMBER 0xC0000000 | 0x015f
+#define NT_STATUS_ILL_FORMED_SERVICE_ENTRY 0xC0000000 | 0x0160
+#define NT_STATUS_ILLEGAL_CHARACTER 0xC0000000 | 0x0161
+#define NT_STATUS_UNMAPPABLE_CHARACTER 0xC0000000 | 0x0162
+#define NT_STATUS_UNDEFINED_CHARACTER 0xC0000000 | 0x0163
+#define NT_STATUS_FLOPPY_VOLUME 0xC0000000 | 0x0164
+#define NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND 0xC0000000 | 0x0165
+#define NT_STATUS_FLOPPY_WRONG_CYLINDER 0xC0000000 | 0x0166
+#define NT_STATUS_FLOPPY_UNKNOWN_ERROR 0xC0000000 | 0x0167
+#define NT_STATUS_FLOPPY_BAD_REGISTERS 0xC0000000 | 0x0168
+#define NT_STATUS_DISK_RECALIBRATE_FAILED 0xC0000000 | 0x0169
+#define NT_STATUS_DISK_OPERATION_FAILED 0xC0000000 | 0x016a
+#define NT_STATUS_DISK_RESET_FAILED 0xC0000000 | 0x016b
+#define NT_STATUS_SHARED_IRQ_BUSY 0xC0000000 | 0x016c
+#define NT_STATUS_FT_ORPHANING 0xC0000000 | 0x016d
+#define NT_STATUS_PARTITION_FAILURE 0xC0000000 | 0x0172
+#define NT_STATUS_INVALID_BLOCK_LENGTH 0xC0000000 | 0x0173
+#define NT_STATUS_DEVICE_NOT_PARTITIONED 0xC0000000 | 0x0174
+#define NT_STATUS_UNABLE_TO_LOCK_MEDIA 0xC0000000 | 0x0175
+#define NT_STATUS_UNABLE_TO_UNLOAD_MEDIA 0xC0000000 | 0x0176
+#define NT_STATUS_EOM_OVERFLOW 0xC0000000 | 0x0177
+#define NT_STATUS_NO_MEDIA 0xC0000000 | 0x0178
+#define NT_STATUS_NO_SUCH_MEMBER 0xC0000000 | 0x017a
+#define NT_STATUS_INVALID_MEMBER 0xC0000000 | 0x017b
+#define NT_STATUS_KEY_DELETED 0xC0000000 | 0x017c
+#define NT_STATUS_NO_LOG_SPACE 0xC0000000 | 0x017d
+#define NT_STATUS_TOO_MANY_SIDS 0xC0000000 | 0x017e
+#define NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED 0xC0000000 | 0x017f
+#define NT_STATUS_KEY_HAS_CHILDREN 0xC0000000 | 0x0180
+#define NT_STATUS_CHILD_MUST_BE_VOLATILE 0xC0000000 | 0x0181
+#define NT_STATUS_DEVICE_CONFIGURATION_ERROR 0xC0000000 | 0x0182
+#define NT_STATUS_DRIVER_INTERNAL_ERROR 0xC0000000 | 0x0183
+#define NT_STATUS_INVALID_DEVICE_STATE 0xC0000000 | 0x0184
+#define NT_STATUS_IO_DEVICE_ERROR 0xC0000000 | 0x0185
+#define NT_STATUS_DEVICE_PROTOCOL_ERROR 0xC0000000 | 0x0186
+#define NT_STATUS_BACKUP_CONTROLLER 0xC0000000 | 0x0187
+#define NT_STATUS_LOG_FILE_FULL 0xC0000000 | 0x0188
+#define NT_STATUS_TOO_LATE 0xC0000000 | 0x0189
+#define NT_STATUS_NO_TRUST_LSA_SECRET 0xC0000000 | 0x018a
+#define NT_STATUS_NO_TRUST_SAM_ACCOUNT 0xC0000000 | 0x018b
+#define NT_STATUS_TRUSTED_DOMAIN_FAILURE 0xC0000000 | 0x018c
+#define NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE 0xC0000000 | 0x018d
+#define NT_STATUS_EVENTLOG_FILE_CORRUPT 0xC0000000 | 0x018e
+#define NT_STATUS_EVENTLOG_CANT_START 0xC0000000 | 0x018f
+#define NT_STATUS_TRUST_FAILURE 0xC0000000 | 0x0190
+#define NT_STATUS_MUTANT_LIMIT_EXCEEDED 0xC0000000 | 0x0191
+#define NT_STATUS_NETLOGON_NOT_STARTED 0xC0000000 | 0x0192
+#define NT_STATUS_ACCOUNT_EXPIRED 0xC0000000 | 0x0193
+#define NT_STATUS_POSSIBLE_DEADLOCK 0xC0000000 | 0x0194
+#define NT_STATUS_NETWORK_CREDENTIAL_CONFLICT 0xC0000000 | 0x0195
+#define NT_STATUS_REMOTE_SESSION_LIMIT 0xC0000000 | 0x0196
+#define NT_STATUS_EVENTLOG_FILE_CHANGED 0xC0000000 | 0x0197
+#define NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT 0xC0000000 | 0x0198
+#define NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT 0xC0000000 | 0x0199
+#define NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT 0xC0000000 | 0x019a
+#define NT_STATUS_DOMAIN_TRUST_INCONSISTENT 0xC0000000 | 0x019b
+#define NT_STATUS_FS_DRIVER_REQUIRED 0xC0000000 | 0x019c
+#define NT_STATUS_NO_USER_SESSION_KEY 0xC0000000 | 0x0202
+#define NT_STATUS_USER_SESSION_DELETED 0xC0000000 | 0x0203
+#define NT_STATUS_RESOURCE_LANG_NOT_FOUND 0xC0000000 | 0x0204
+#define NT_STATUS_INSUFF_SERVER_RESOURCES 0xC0000000 | 0x0205
+#define NT_STATUS_INVALID_BUFFER_SIZE 0xC0000000 | 0x0206
+#define NT_STATUS_INVALID_ADDRESS_COMPONENT 0xC0000000 | 0x0207
+#define NT_STATUS_INVALID_ADDRESS_WILDCARD 0xC0000000 | 0x0208
+#define NT_STATUS_TOO_MANY_ADDRESSES 0xC0000000 | 0x0209
+#define NT_STATUS_ADDRESS_ALREADY_EXISTS 0xC0000000 | 0x020a
+#define NT_STATUS_ADDRESS_CLOSED 0xC0000000 | 0x020b
+#define NT_STATUS_CONNECTION_DISCONNECTED 0xC0000000 | 0x020c
+#define NT_STATUS_CONNECTION_RESET 0xC0000000 | 0x020d
+#define NT_STATUS_TOO_MANY_NODES 0xC0000000 | 0x020e
+#define NT_STATUS_TRANSACTION_ABORTED 0xC0000000 | 0x020f
+#define NT_STATUS_TRANSACTION_TIMED_OUT 0xC0000000 | 0x0210
+#define NT_STATUS_TRANSACTION_NO_RELEASE 0xC0000000 | 0x0211
+#define NT_STATUS_TRANSACTION_NO_MATCH 0xC0000000 | 0x0212
+#define NT_STATUS_TRANSACTION_RESPONDED 0xC0000000 | 0x0213
+#define NT_STATUS_TRANSACTION_INVALID_ID 0xC0000000 | 0x0214
+#define NT_STATUS_TRANSACTION_INVALID_TYPE 0xC0000000 | 0x0215
+#define NT_STATUS_NOT_SERVER_SESSION 0xC0000000 | 0x0216
+#define NT_STATUS_NOT_CLIENT_SESSION 0xC0000000 | 0x0217
+#define NT_STATUS_CANNOT_LOAD_REGISTRY_FILE 0xC0000000 | 0x0218
+#define NT_STATUS_DEBUG_ATTACH_FAILED 0xC0000000 | 0x0219
+#define NT_STATUS_SYSTEM_PROCESS_TERMINATED 0xC0000000 | 0x021a
+#define NT_STATUS_DATA_NOT_ACCEPTED 0xC0000000 | 0x021b
+#define NT_STATUS_NO_BROWSER_SERVERS_FOUND 0xC0000000 | 0x021c
+#define NT_STATUS_VDM_HARD_ERROR 0xC0000000 | 0x021d
+#define NT_STATUS_DRIVER_CANCEL_TIMEOUT 0xC0000000 | 0x021e
+#define NT_STATUS_REPLY_MESSAGE_MISMATCH 0xC0000000 | 0x021f
+#define NT_STATUS_MAPPED_ALIGNMENT 0xC0000000 | 0x0220
+#define NT_STATUS_IMAGE_CHECKSUM_MISMATCH 0xC0000000 | 0x0221
+#define NT_STATUS_LOST_WRITEBEHIND_DATA 0xC0000000 | 0x0222
+#define NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID 0xC0000000 | 0x0223
+#define NT_STATUS_PASSWORD_MUST_CHANGE 0xC0000000 | 0x0224
+#define NT_STATUS_NOT_FOUND 0xC0000000 | 0x0225
+#define NT_STATUS_NOT_TINY_STREAM 0xC0000000 | 0x0226
+#define NT_STATUS_RECOVERY_FAILURE 0xC0000000 | 0x0227
+#define NT_STATUS_STACK_OVERFLOW_READ 0xC0000000 | 0x0228
+#define NT_STATUS_FAIL_CHECK 0xC0000000 | 0x0229
+#define NT_STATUS_DUPLICATE_OBJECTID 0xC0000000 | 0x022a
+#define NT_STATUS_OBJECTID_EXISTS 0xC0000000 | 0x022b
+#define NT_STATUS_CONVERT_TO_LARGE 0xC0000000 | 0x022c
+#define NT_STATUS_RETRY 0xC0000000 | 0x022d
+#define NT_STATUS_FOUND_OUT_OF_SCOPE 0xC0000000 | 0x022e
+#define NT_STATUS_ALLOCATE_BUCKET 0xC0000000 | 0x022f
+#define NT_STATUS_PROPSET_NOT_FOUND 0xC0000000 | 0x0230
+#define NT_STATUS_MARSHALL_OVERFLOW 0xC0000000 | 0x0231
+#define NT_STATUS_INVALID_VARIANT 0xC0000000 | 0x0232
+#define NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND 0xC0000000 | 0x0233
+#define NT_STATUS_ACCOUNT_LOCKED_OUT 0xC0000000 | 0x0234
+#define NT_STATUS_HANDLE_NOT_CLOSABLE 0xC0000000 | 0x0235
+#define NT_STATUS_CONNECTION_REFUSED 0xC0000000 | 0x0236
+#define NT_STATUS_GRACEFUL_DISCONNECT 0xC0000000 | 0x0237
+#define NT_STATUS_ADDRESS_ALREADY_ASSOCIATED 0xC0000000 | 0x0238
+#define NT_STATUS_ADDRESS_NOT_ASSOCIATED 0xC0000000 | 0x0239
+#define NT_STATUS_CONNECTION_INVALID 0xC0000000 | 0x023a
+#define NT_STATUS_CONNECTION_ACTIVE 0xC0000000 | 0x023b
+#define NT_STATUS_NETWORK_UNREACHABLE 0xC0000000 | 0x023c
+#define NT_STATUS_HOST_UNREACHABLE 0xC0000000 | 0x023d
+#define NT_STATUS_PROTOCOL_UNREACHABLE 0xC0000000 | 0x023e
+#define NT_STATUS_PORT_UNREACHABLE 0xC0000000 | 0x023f
+#define NT_STATUS_REQUEST_ABORTED 0xC0000000 | 0x0240
+#define NT_STATUS_CONNECTION_ABORTED 0xC0000000 | 0x0241
+#define NT_STATUS_BAD_COMPRESSION_BUFFER 0xC0000000 | 0x0242
+#define NT_STATUS_USER_MAPPED_FILE 0xC0000000 | 0x0243
+#define NT_STATUS_AUDIT_FAILED 0xC0000000 | 0x0244
+#define NT_STATUS_TIMER_RESOLUTION_NOT_SET 0xC0000000 | 0x0245
+#define NT_STATUS_CONNECTION_COUNT_LIMIT 0xC0000000 | 0x0246
+#define NT_STATUS_LOGIN_TIME_RESTRICTION 0xC0000000 | 0x0247
+#define NT_STATUS_LOGIN_WKSTA_RESTRICTION 0xC0000000 | 0x0248
+#define NT_STATUS_IMAGE_MP_UP_MISMATCH 0xC0000000 | 0x0249
+#define NT_STATUS_INSUFFICIENT_LOGON_INFO 0xC0000000 | 0x0250
+#define NT_STATUS_BAD_DLL_ENTRYPOINT 0xC0000000 | 0x0251
+#define NT_STATUS_BAD_SERVICE_ENTRYPOINT 0xC0000000 | 0x0252
+#define NT_STATUS_LPC_REPLY_LOST 0xC0000000 | 0x0253
+#define NT_STATUS_IP_ADDRESS_CONFLICT1 0xC0000000 | 0x0254
+#define NT_STATUS_IP_ADDRESS_CONFLICT2 0xC0000000 | 0x0255
+#define NT_STATUS_REGISTRY_QUOTA_LIMIT 0xC0000000 | 0x0256
+#define NT_STATUS_PATH_NOT_COVERED 0xC0000000 | 0x0257
+#define NT_STATUS_NO_CALLBACK_ACTIVE 0xC0000000 | 0x0258
+#define NT_STATUS_LICENSE_QUOTA_EXCEEDED 0xC0000000 | 0x0259
+#define NT_STATUS_PWD_TOO_SHORT 0xC0000000 | 0x025a
+#define NT_STATUS_PWD_TOO_RECENT 0xC0000000 | 0x025b
+#define NT_STATUS_PWD_HISTORY_CONFLICT 0xC0000000 | 0x025c
+#define NT_STATUS_PLUGPLAY_NO_DEVICE 0xC0000000 | 0x025e
+#define NT_STATUS_UNSUPPORTED_COMPRESSION 0xC0000000 | 0x025f
+#define NT_STATUS_INVALID_HW_PROFILE 0xC0000000 | 0x0260
+#define NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH 0xC0000000 | 0x0261
+#define NT_STATUS_DRIVER_ORDINAL_NOT_FOUND 0xC0000000 | 0x0262
+#define NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND 0xC0000000 | 0x0263
+#define NT_STATUS_RESOURCE_NOT_OWNED 0xC0000000 | 0x0264
+#define NT_STATUS_TOO_MANY_LINKS 0xC0000000 | 0x0265
+#define NT_STATUS_QUOTA_LIST_INCONSISTENT 0xC0000000 | 0x0266
+#define NT_STATUS_FILE_IS_OFFLINE 0xC0000000 | 0x0267
+#define NT_STATUS_NO_SUCH_JOB 0xC0000000 | 0xEDE	/* scheduler */
+
+#endif				/* _NTERR_H */
diff --git a/kernel/fs/cifs/ntlmssp.h b/kernel/fs/cifs/ntlmssp.h
new file mode 100644
index 000000000..848249fa1
--- /dev/null
+++ b/kernel/fs/cifs/ntlmssp.h
@@ -0,0 +1,138 @@
+/*
+ *   fs/cifs/ntlmssp.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2007
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define NTLMSSP_SIGNATURE "NTLMSSP"
+/* Message Types */
+#define NtLmNegotiate     cpu_to_le32(1)
+#define NtLmChallenge     cpu_to_le32(2)
+#define NtLmAuthenticate  cpu_to_le32(3)
+#define UnknownMessage    cpu_to_le32(8)
+
+/* Negotiate Flags */
+#define NTLMSSP_NEGOTIATE_UNICODE         0x01 /* Text strings are unicode */
+#define NTLMSSP_NEGOTIATE_OEM             0x02 /* Text strings are in OEM */
+#define NTLMSSP_REQUEST_TARGET            0x04 /* Srv returns its auth realm */
+/* define reserved9                       0x08 */
+#define NTLMSSP_NEGOTIATE_SIGN          0x0010 /* Request signing capability */
+#define NTLMSSP_NEGOTIATE_SEAL          0x0020 /* Request confidentiality */
+#define NTLMSSP_NEGOTIATE_DGRAM         0x0040
+#define NTLMSSP_NEGOTIATE_LM_KEY        0x0080 /* Use LM session key */
+/* defined reserved 8                   0x0100 */
+#define NTLMSSP_NEGOTIATE_NTLM          0x0200 /* NTLM authentication */
+#define NTLMSSP_NEGOTIATE_NT_ONLY       0x0400 /* Lanman not allowed */
+#define NTLMSSP_ANONYMOUS               0x0800
+#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */
+#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000
+#define NTLMSSP_NEGOTIATE_LOCAL_CALL    0x4000 /* client/server same machine */
+#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN   0x8000 /* Sign. All security levels  */
+#define NTLMSSP_TARGET_TYPE_DOMAIN     0x10000
+#define NTLMSSP_TARGET_TYPE_SERVER     0x20000
+#define NTLMSSP_TARGET_TYPE_SHARE      0x40000
+#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/
+/* #define NTLMSSP_REQUEST_INIT_RESP     0x100000 */
+#define NTLMSSP_NEGOTIATE_IDENTIFY    0x100000
+#define NTLMSSP_REQUEST_ACCEPT_RESP   0x200000 /* reserved5 */
+#define NTLMSSP_REQUEST_NON_NT_KEY    0x400000
+#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000
+/* #define reserved4                 0x1000000 */
+#define NTLMSSP_NEGOTIATE_VERSION    0x2000000 /* we do not set */
+/* #define reserved3                 0x4000000 */
+/* #define reserved2                 0x8000000 */
+/* #define reserved1                0x10000000 */
+#define NTLMSSP_NEGOTIATE_128       0x20000000
+#define NTLMSSP_NEGOTIATE_KEY_XCH   0x40000000
+#define NTLMSSP_NEGOTIATE_56        0x80000000
+
+/* Define AV Pair Field IDs */
+enum av_field_type {
+	NTLMSSP_AV_EOL = 0,
+	NTLMSSP_AV_NB_COMPUTER_NAME,
+	NTLMSSP_AV_NB_DOMAIN_NAME,
+	NTLMSSP_AV_DNS_COMPUTER_NAME,
+	NTLMSSP_AV_DNS_DOMAIN_NAME,
+	NTLMSSP_AV_DNS_TREE_NAME,
+	NTLMSSP_AV_FLAGS,
+	NTLMSSP_AV_TIMESTAMP,
+	NTLMSSP_AV_RESTRICTION,
+	NTLMSSP_AV_TARGET_NAME,
+	NTLMSSP_AV_CHANNEL_BINDINGS
+};
+
+/* Although typedefs are not commonly used for structure definitions */
+/* in the Linux kernel, in this particular case they are useful      */
+/* to more closely match the standards document for NTLMSSP from     */
+/* OpenGroup and to make the code more closely match the standard in */
+/* appearance */
+
+typedef struct _SECURITY_BUFFER {
+	__le16 Length;
+	__le16 MaximumLength;
+	__le32 BufferOffset;	/* offset to buffer */
+} __attribute__((packed)) SECURITY_BUFFER;
+
+typedef struct _NEGOTIATE_MESSAGE {
+	__u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
+	__le32 MessageType;     /* NtLmNegotiate = 1 */
+	__le32 NegotiateFlags;
+	SECURITY_BUFFER DomainName;	/* RFC 1001 style and ASCII */
+	SECURITY_BUFFER WorkstationName;	/* RFC 1001 and ASCII */
+	/* SECURITY_BUFFER for version info not present since we
+	   do not set the version is present flag */
+	char DomainString[0];
+	/* followed by WorkstationString */
+} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE;
+
+typedef struct _CHALLENGE_MESSAGE {
+	__u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
+	__le32 MessageType;   /* NtLmChallenge = 2 */
+	SECURITY_BUFFER TargetName;
+	__le32 NegotiateFlags;
+	__u8 Challenge[CIFS_CRYPTO_KEY_SIZE];
+	__u8 Reserved[8];
+	SECURITY_BUFFER TargetInfoArray;
+	/* SECURITY_BUFFER for version info not present since we
+	   do not set the version is present flag */
+} __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE;
+
+typedef struct _AUTHENTICATE_MESSAGE {
+	__u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
+	__le32 MessageType;  /* NtLmsAuthenticate = 3 */
+	SECURITY_BUFFER LmChallengeResponse;
+	SECURITY_BUFFER NtChallengeResponse;
+	SECURITY_BUFFER DomainName;
+	SECURITY_BUFFER UserName;
+	SECURITY_BUFFER WorkstationName;
+	SECURITY_BUFFER SessionKey;
+	__le32 NegotiateFlags;
+	/* SECURITY_BUFFER for version info not present since we
+	   do not set the version is present flag */
+	char UserString[0];
+} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
+
+/*
+ * Size of the session key (crypto key encrypted with the password
+ */
+
+int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses);
+void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses);
+int build_ntlmssp_auth_blob(unsigned char *pbuffer, u16 *buflen,
+			struct cifs_ses *ses,
+			const struct nls_table *nls_cp);
diff --git a/kernel/fs/cifs/readdir.c b/kernel/fs/cifs/readdir.c
new file mode 100644
index 000000000..b1eede367
--- /dev/null
+++ b/kernel/fs/cifs/readdir.c
@@ -0,0 +1,875 @@
+/*
+ *   fs/cifs/readdir.c
+ *
+ *   Directory search handling
+ *
+ *   Copyright (C) International Business Machines  Corp., 2004, 2008
+ *   Copyright (C) Red Hat, Inc., 2011
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifsfs.h"
+
+/*
+ * To be safe - for UCS to UTF-8 with strings loaded with the rare long
+ * characters alloc more to account for such multibyte target UTF-8
+ * characters.
+ */
+#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2)
+
+#ifdef CONFIG_CIFS_DEBUG2
+static void dump_cifs_file_struct(struct file *file, char *label)
+{
+	struct cifsFileInfo *cf;
+
+	if (file) {
+		cf = file->private_data;
+		if (cf == NULL) {
+			cifs_dbg(FYI, "empty cifs private file data\n");
+			return;
+		}
+		if (cf->invalidHandle)
+			cifs_dbg(FYI, "invalid handle\n");
+		if (cf->srch_inf.endOfSearch)
+			cifs_dbg(FYI, "end of search\n");
+		if (cf->srch_inf.emptyDir)
+			cifs_dbg(FYI, "empty dir\n");
+	}
+}
+#else
+static inline void dump_cifs_file_struct(struct file *file, char *label)
+{
+}
+#endif /* DEBUG2 */
+
+/*
+ * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
+ *
+ * Find the dentry that matches "name". If there isn't one, create one. If it's
+ * a negative dentry or the uniqueid or filetype(mode) changed,
+ * then drop it and recreate it.
+ */
+static void
+cifs_prime_dcache(struct dentry *parent, struct qstr *name,
+		    struct cifs_fattr *fattr)
+{
+	struct dentry *dentry, *alias;
+	struct inode *inode;
+	struct super_block *sb = d_inode(parent)->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
+
+	dentry = d_hash_and_lookup(parent, name);
+	if (unlikely(IS_ERR(dentry)))
+		return;
+
+	if (dentry) {
+		inode = d_inode(dentry);
+		if (inode) {
+			if (d_mountpoint(dentry))
+				goto out;
+			/*
+			 * If we're generating inode numbers, then we don't
+			 * want to clobber the existing one with the one that
+			 * the readdir code created.
+			 */
+			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
+				fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
+
+			/* update inode in place
+			 * if both i_ino and i_mode didn't change */
+			if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid &&
+			    (inode->i_mode & S_IFMT) ==
+			    (fattr->cf_mode & S_IFMT)) {
+				cifs_fattr_to_inode(inode, fattr);
+				goto out;
+			}
+		}
+		d_invalidate(dentry);
+		dput(dentry);
+	}
+
+	/*
+	 * If we know that the inode will need to be revalidated immediately,
+	 * then don't create a new dentry for it. We'll end up doing an on
+	 * the wire call either way and this spares us an invalidation.
+	 */
+	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+		return;
+
+	dentry = d_alloc(parent, name);
+	if (!dentry)
+		return;
+
+	inode = cifs_iget(sb, fattr);
+	if (!inode)
+		goto out;
+
+	alias = d_splice_alias(inode, dentry);
+	if (alias && !IS_ERR(alias))
+		dput(alias);
+out:
+	dput(dentry);
+}
+
+static void
+cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
+{
+	fattr->cf_uid = cifs_sb->mnt_uid;
+	fattr->cf_gid = cifs_sb->mnt_gid;
+
+	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
+		fattr->cf_dtype = DT_DIR;
+	} else {
+		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
+		fattr->cf_dtype = DT_REG;
+	}
+
+	/*
+	 * We need to revalidate it further to make a decision about whether it
+	 * is a symbolic link, DFS referral or a reparse point with a direct
+	 * access like junctions, deduplicated files, NFS symlinks.
+	 */
+	if (fattr->cf_cifsattrs & ATTR_REPARSE)
+		fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
+	/* non-unix readdir doesn't provide nlink */
+	fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+
+	if (fattr->cf_cifsattrs & ATTR_READONLY)
+		fattr->cf_mode &= ~S_IWUGO;
+
+	/*
+	 * We of course don't get ACL info in FIND_FIRST/NEXT results, so
+	 * mark it for revalidation so that "ls -l" will look right. It might
+	 * be super-slow, but if we don't do this then the ownership of files
+	 * may look wrong since the inodes may not have timed out by the time
+	 * "ls" does a stat() call on them.
+	 */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+		fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
+	    fattr->cf_cifsattrs & ATTR_SYSTEM) {
+		if (fattr->cf_eof == 0)  {
+			fattr->cf_mode &= ~S_IFMT;
+			fattr->cf_mode |= S_IFIFO;
+			fattr->cf_dtype = DT_FIFO;
+		} else {
+			/*
+			 * trying to get the type and mode via SFU can be slow,
+			 * so just call those regular files for now, and mark
+			 * for reval
+			 */
+			fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+		}
+	}
+}
+
+void
+cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
+		       struct cifs_sb_info *cifs_sb)
+{
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
+	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
+	fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+	fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
+	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
+
+	cifs_fill_common_info(fattr, cifs_sb);
+}
+
+static void
+cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
+		       struct cifs_sb_info *cifs_sb)
+{
+	int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
+
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
+					    info->LastAccessTime, offset);
+	fattr->cf_ctime = cnvrtDosUnixTm(info->LastWriteDate,
+					    info->LastWriteTime, offset);
+	fattr->cf_mtime = cnvrtDosUnixTm(info->LastWriteDate,
+					    info->LastWriteTime, offset);
+
+	fattr->cf_cifsattrs = le16_to_cpu(info->Attributes);
+	fattr->cf_bytes = le32_to_cpu(info->AllocationSize);
+	fattr->cf_eof = le32_to_cpu(info->DataSize);
+
+	cifs_fill_common_info(fattr, cifs_sb);
+}
+
+/* BB eventually need to add the following helper function to
+      resolve NT_STATUS_STOPPED_ON_SYMLINK return code when
+      we try to do FindFirst on (NTFS) directory symlinks */
+/*
+int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
+			     unsigned int xid)
+{
+	__u16 fid;
+	int len;
+	int oplock = 0;
+	int rc;
+	struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb);
+	char *tmpbuffer;
+
+	rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
+			OPEN_REPARSE_POINT, &fid, &oplock, NULL,
+			cifs_sb->local_nls,
+			cifs_remap(cifs_sb);
+	if (!rc) {
+		tmpbuffer = kmalloc(maxpath);
+		rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path,
+				tmpbuffer,
+				maxpath -1,
+				fid,
+				cifs_sb->local_nls);
+		if (CIFSSMBClose(xid, ptcon, fid)) {
+			cifs_dbg(FYI, "Error closing temporary reparsepoint open\n");
+		}
+	}
+}
+ */
+
+static int
+initiate_cifs_search(const unsigned int xid, struct file *file)
+{
+	__u16 search_flags;
+	int rc = 0;
+	char *full_path = NULL;
+	struct cifsFileInfo *cifsFile;
+	struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
+	struct tcon_link *tlink = NULL;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+
+	if (file->private_data == NULL) {
+		tlink = cifs_sb_tlink(cifs_sb);
+		if (IS_ERR(tlink))
+			return PTR_ERR(tlink);
+
+		cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+		if (cifsFile == NULL) {
+			rc = -ENOMEM;
+			goto error_exit;
+		}
+		file->private_data = cifsFile;
+		cifsFile->tlink = cifs_get_tlink(tlink);
+		tcon = tlink_tcon(tlink);
+	} else {
+		cifsFile = file->private_data;
+		tcon = tlink_tcon(cifsFile->tlink);
+	}
+
+	server = tcon->ses->server;
+
+	if (!server->ops->query_dir_first) {
+		rc = -ENOSYS;
+		goto error_exit;
+	}
+
+	cifsFile->invalidHandle = true;
+	cifsFile->srch_inf.endOfSearch = false;
+
+	full_path = build_path_from_dentry(file->f_path.dentry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto error_exit;
+	}
+
+	cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos);
+
+ffirst_retry:
+	/* test for Unix extensions */
+	/* but now check for them on the share/mount not on the SMB session */
+	/* if (cap_unix(tcon->ses) { */
+	if (tcon->unix_ext)
+		cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX;
+	else if ((tcon->ses->capabilities &
+		  tcon->ses->server->vals->cap_nt_find) == 0) {
+		cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD;
+	} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+		cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+	} else /* not srvinos - BB fixme add check for backlevel? */ {
+		cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO;
+	}
+
+	search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
+	if (backup_cred(cifs_sb))
+		search_flags |= CIFS_SEARCH_BACKUP_SEARCH;
+
+	rc = server->ops->query_dir_first(xid, tcon, full_path, cifs_sb,
+					  &cifsFile->fid, search_flags,
+					  &cifsFile->srch_inf);
+
+	if (rc == 0)
+		cifsFile->invalidHandle = false;
+	/* BB add following call to handle readdir on new NTFS symlink errors
+	else if STATUS_STOPPED_ON_SYMLINK
+		call get_symlink_reparse_path and retry with new path */
+	else if ((rc == -EOPNOTSUPP) &&
+		(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
+		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
+		goto ffirst_retry;
+	}
+error_exit:
+	kfree(full_path);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+/* return length of unicode string in bytes */
+static int cifs_unicode_bytelen(const char *str)
+{
+	int len;
+	const __le16 *ustr = (const __le16 *)str;
+
+	for (len = 0; len <= PATH_MAX; len++) {
+		if (ustr[len] == 0)
+			return len << 1;
+	}
+	cifs_dbg(FYI, "Unicode string longer than PATH_MAX found\n");
+	return len << 1;
+}
+
+static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
+{
+	char *new_entry;
+	FILE_DIRECTORY_INFO *pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
+
+	if (level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO *pfData;
+		pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
+
+		new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) +
+				pfData->FileNameLength;
+	} else
+		new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
+	cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry);
+	/* validate that new_entry is not past end of SMB */
+	if (new_entry >= end_of_smb) {
+		cifs_dbg(VFS, "search entry %p began after end of SMB %p old entry %p\n",
+			 new_entry, end_of_smb, old_entry);
+		return NULL;
+	} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
+		    (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
+		  || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
+		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb)))  {
+		cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n",
+			 new_entry, end_of_smb);
+		return NULL;
+	} else
+		return new_entry;
+
+}
+
+struct cifs_dirent {
+	const char	*name;
+	size_t		namelen;
+	u32		resume_key;
+	u64		ino;
+};
+
+static void cifs_fill_dirent_unix(struct cifs_dirent *de,
+		const FILE_UNIX_INFO *info, bool is_unicode)
+{
+	de->name = &info->FileName[0];
+	if (is_unicode)
+		de->namelen = cifs_unicode_bytelen(de->name);
+	else
+		de->namelen = strnlen(de->name, PATH_MAX);
+	de->resume_key = info->ResumeKey;
+	de->ino = le64_to_cpu(info->basic.UniqueId);
+}
+
+static void cifs_fill_dirent_dir(struct cifs_dirent *de,
+		const FILE_DIRECTORY_INFO *info)
+{
+	de->name = &info->FileName[0];
+	de->namelen = le32_to_cpu(info->FileNameLength);
+	de->resume_key = info->FileIndex;
+}
+
+static void cifs_fill_dirent_full(struct cifs_dirent *de,
+		const FILE_FULL_DIRECTORY_INFO *info)
+{
+	de->name = &info->FileName[0];
+	de->namelen = le32_to_cpu(info->FileNameLength);
+	de->resume_key = info->FileIndex;
+}
+
+static void cifs_fill_dirent_search(struct cifs_dirent *de,
+		const SEARCH_ID_FULL_DIR_INFO *info)
+{
+	de->name = &info->FileName[0];
+	de->namelen = le32_to_cpu(info->FileNameLength);
+	de->resume_key = info->FileIndex;
+	de->ino = le64_to_cpu(info->UniqueId);
+}
+
+static void cifs_fill_dirent_both(struct cifs_dirent *de,
+		const FILE_BOTH_DIRECTORY_INFO *info)
+{
+	de->name = &info->FileName[0];
+	de->namelen = le32_to_cpu(info->FileNameLength);
+	de->resume_key = info->FileIndex;
+}
+
+static void cifs_fill_dirent_std(struct cifs_dirent *de,
+		const FIND_FILE_STANDARD_INFO *info)
+{
+	de->name = &info->FileName[0];
+	/* one byte length, no endianess conversion */
+	de->namelen = info->FileNameLength;
+	de->resume_key = info->ResumeKey;
+}
+
+static int cifs_fill_dirent(struct cifs_dirent *de, const void *info,
+		u16 level, bool is_unicode)
+{
+	memset(de, 0, sizeof(*de));
+
+	switch (level) {
+	case SMB_FIND_FILE_UNIX:
+		cifs_fill_dirent_unix(de, info, is_unicode);
+		break;
+	case SMB_FIND_FILE_DIRECTORY_INFO:
+		cifs_fill_dirent_dir(de, info);
+		break;
+	case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
+		cifs_fill_dirent_full(de, info);
+		break;
+	case SMB_FIND_FILE_ID_FULL_DIR_INFO:
+		cifs_fill_dirent_search(de, info);
+		break;
+	case SMB_FIND_FILE_BOTH_DIRECTORY_INFO:
+		cifs_fill_dirent_both(de, info);
+		break;
+	case SMB_FIND_FILE_INFO_STANDARD:
+		cifs_fill_dirent_std(de, info);
+		break;
+	default:
+		cifs_dbg(FYI, "Unknown findfirst level %d\n", level);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define UNICODE_DOT cpu_to_le16(0x2e)
+
+/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */
+static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
+{
+	int rc = 0;
+
+	if (!de->name)
+		return 0;
+
+	if (is_unicode) {
+		__le16 *ufilename = (__le16 *)de->name;
+		if (de->namelen == 2) {
+			/* check for . */
+			if (ufilename[0] == UNICODE_DOT)
+				rc = 1;
+		} else if (de->namelen == 4) {
+			/* check for .. */
+			if (ufilename[0] == UNICODE_DOT &&
+			    ufilename[1] == UNICODE_DOT)
+				rc = 2;
+		}
+	} else /* ASCII */ {
+		if (de->namelen == 1) {
+			if (de->name[0] == '.')
+				rc = 1;
+		} else if (de->namelen == 2) {
+			if (de->name[0] == '.' && de->name[1] == '.')
+				rc = 2;
+		}
+	}
+
+	return rc;
+}
+
+/* Check if directory that we are searching has changed so we can decide
+   whether we can use the cached search results from the previous search */
+static int is_dir_changed(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
+
+	if (cifsInfo->time == 0)
+		return 1; /* directory was changed, perhaps due to unlink */
+	else
+		return 0;
+
+}
+
+static int cifs_save_resume_key(const char *current_entry,
+	struct cifsFileInfo *file_info)
+{
+	struct cifs_dirent de;
+	int rc;
+
+	rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level,
+			      file_info->srch_inf.unicode);
+	if (!rc) {
+		file_info->srch_inf.presume_name = de.name;
+		file_info->srch_inf.resume_name_len = de.namelen;
+		file_info->srch_inf.resume_key = de.resume_key;
+	}
+	return rc;
+}
+
+/*
+ * Find the corresponding entry in the search. Note that the SMB server returns
+ * search entries for . and .. which complicates logic here if we choose to
+ * parse for them and we do not assume that they are located in the findfirst
+ * return buffer. We start counting in the buffer with entry 2 and increment for
+ * every entry (do not increment for . or .. entry).
+ */
+static int
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
+		struct file *file, char **current_entry, int *num_to_ret)
+{
+	__u16 search_flags;
+	int rc = 0;
+	int pos_in_buf = 0;
+	loff_t first_entry_in_buffer;
+	loff_t index_to_find = pos;
+	struct cifsFileInfo *cfile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	/* check if index in the buffer */
+
+	if (!server->ops->query_dir_first || !server->ops->query_dir_next)
+		return -ENOSYS;
+
+	if ((cfile == NULL) || (current_entry == NULL) || (num_to_ret == NULL))
+		return -ENOENT;
+
+	*current_entry = NULL;
+	first_entry_in_buffer = cfile->srch_inf.index_of_last_entry -
+					cfile->srch_inf.entries_in_buffer;
+
+	/*
+	 * If first entry in buf is zero then is first buffer
+	 * in search response data which means it is likely . and ..
+	 * will be in this buffer, although some servers do not return
+	 * . and .. for the root of a drive and for those we need
+	 * to start two entries earlier.
+	 */
+
+	dump_cifs_file_struct(file, "In fce ");
+	if (((index_to_find < cfile->srch_inf.index_of_last_entry) &&
+	     is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) {
+		/* close and restart search */
+		cifs_dbg(FYI, "search backing up - close and restart search\n");
+		spin_lock(&cifs_file_list_lock);
+		if (server->ops->dir_needs_close(cfile)) {
+			cfile->invalidHandle = true;
+			spin_unlock(&cifs_file_list_lock);
+			if (server->ops->close_dir)
+				server->ops->close_dir(xid, tcon, &cfile->fid);
+		} else
+			spin_unlock(&cifs_file_list_lock);
+		if (cfile->srch_inf.ntwrk_buf_start) {
+			cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n");
+			if (cfile->srch_inf.smallBuf)
+				cifs_small_buf_release(cfile->srch_inf.
+						ntwrk_buf_start);
+			else
+				cifs_buf_release(cfile->srch_inf.
+						ntwrk_buf_start);
+			cfile->srch_inf.ntwrk_buf_start = NULL;
+		}
+		rc = initiate_cifs_search(xid, file);
+		if (rc) {
+			cifs_dbg(FYI, "error %d reinitiating a search on rewind\n",
+				 rc);
+			return rc;
+		}
+		/* FindFirst/Next set last_entry to NULL on malformed reply */
+		if (cfile->srch_inf.last_entry)
+			cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
+	}
+
+	search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
+	if (backup_cred(cifs_sb))
+		search_flags |= CIFS_SEARCH_BACKUP_SEARCH;
+
+	while ((index_to_find >= cfile->srch_inf.index_of_last_entry) &&
+	       (rc == 0) && !cfile->srch_inf.endOfSearch) {
+		cifs_dbg(FYI, "calling findnext2\n");
+		rc = server->ops->query_dir_next(xid, tcon, &cfile->fid,
+						 search_flags,
+						 &cfile->srch_inf);
+		/* FindFirst/Next set last_entry to NULL on malformed reply */
+		if (cfile->srch_inf.last_entry)
+			cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
+		if (rc)
+			return -ENOENT;
+	}
+	if (index_to_find < cfile->srch_inf.index_of_last_entry) {
+		/* we found the buffer that contains the entry */
+		/* scan and find it */
+		int i;
+		char *cur_ent;
+		char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
+			server->ops->calc_smb_size(
+					cfile->srch_inf.ntwrk_buf_start);
+
+		cur_ent = cfile->srch_inf.srch_entries_start;
+		first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
+					- cfile->srch_inf.entries_in_buffer;
+		pos_in_buf = index_to_find - first_entry_in_buffer;
+		cifs_dbg(FYI, "found entry - pos_in_buf %d\n", pos_in_buf);
+
+		for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) {
+			/* go entry by entry figuring out which is first */
+			cur_ent = nxt_dir_entry(cur_ent, end_of_smb,
+						cfile->srch_inf.info_level);
+		}
+		if ((cur_ent == NULL) && (i < pos_in_buf)) {
+			/* BB fixme - check if we should flag this error */
+			cifs_dbg(VFS, "reached end of buf searching for pos in buf %d index to find %lld rc %d\n",
+				 pos_in_buf, index_to_find, rc);
+		}
+		rc = 0;
+		*current_entry = cur_ent;
+	} else {
+		cifs_dbg(FYI, "index not in buffer - could not findnext into it\n");
+		return 0;
+	}
+
+	if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) {
+		cifs_dbg(FYI, "can not return entries pos_in_buf beyond last\n");
+		*num_to_ret = 0;
+	} else
+		*num_to_ret = cfile->srch_inf.entries_in_buffer - pos_in_buf;
+
+	return rc;
+}
+
+static int cifs_filldir(char *find_entry, struct file *file,
+		struct dir_context *ctx,
+		char *scratch_buf, unsigned int max_len)
+{
+	struct cifsFileInfo *file_info = file->private_data;
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_dirent de = { NULL, };
+	struct cifs_fattr fattr;
+	struct qstr name;
+	int rc = 0;
+	ino_t ino;
+
+	rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level,
+			      file_info->srch_inf.unicode);
+	if (rc)
+		return rc;
+
+	if (de.namelen > max_len) {
+		cifs_dbg(VFS, "bad search response length %zd past smb end\n",
+			 de.namelen);
+		return -EINVAL;
+	}
+
+	/* skip . and .. since we added them first */
+	if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode))
+		return 0;
+
+	if (file_info->srch_inf.unicode) {
+		struct nls_table *nlt = cifs_sb->local_nls;
+		int map_type;
+
+		map_type = cifs_remap(cifs_sb);
+		name.name = scratch_buf;
+		name.len =
+			cifs_from_utf16((char *)name.name, (__le16 *)de.name,
+					UNICODE_NAME_MAX,
+					min_t(size_t, de.namelen,
+					      (size_t)max_len), nlt, map_type);
+		name.len -= nls_nullsize(nlt);
+	} else {
+		name.name = de.name;
+		name.len = de.namelen;
+	}
+
+	switch (file_info->srch_inf.info_level) {
+	case SMB_FIND_FILE_UNIX:
+		cifs_unix_basic_to_fattr(&fattr,
+					 &((FILE_UNIX_INFO *)find_entry)->basic,
+					 cifs_sb);
+		break;
+	case SMB_FIND_FILE_INFO_STANDARD:
+		cifs_std_info_to_fattr(&fattr,
+				       (FIND_FILE_STANDARD_INFO *)find_entry,
+				       cifs_sb);
+		break;
+	default:
+		cifs_dir_info_to_fattr(&fattr,
+				       (FILE_DIRECTORY_INFO *)find_entry,
+				       cifs_sb);
+		break;
+	}
+
+	if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
+		fattr.cf_uniqueid = de.ino;
+	} else {
+		fattr.cf_uniqueid = iunique(sb, ROOT_I);
+		cifs_autodisable_serverino(cifs_sb);
+	}
+
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
+	    couldbe_mf_symlink(&fattr))
+		/*
+		 * trying to get the type and mode can be slow,
+		 * so just call those regular files for now, and mark
+		 * for reval
+		 */
+		fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+
+	cifs_prime_dcache(file->f_path.dentry, &name, &fattr);
+
+	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
+	return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
+}
+
+
+int cifs_readdir(struct file *file, struct dir_context *ctx)
+{
+	int rc = 0;
+	unsigned int xid;
+	int i;
+	struct cifs_tcon *tcon;
+	struct cifsFileInfo *cifsFile = NULL;
+	char *current_entry;
+	int num_to_fill = 0;
+	char *tmp_buf = NULL;
+	char *end_of_smb;
+	unsigned int max_len;
+
+	xid = get_xid();
+
+	/*
+	 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
+	 * '..'. Otherwise we won't be able to notify VFS in case of failure.
+	 */
+	if (file->private_data == NULL) {
+		rc = initiate_cifs_search(xid, file);
+		cifs_dbg(FYI, "initiate cifs search rc %d\n", rc);
+		if (rc)
+			goto rddir2_exit;
+	}
+
+	if (!dir_emit_dots(file, ctx))
+		goto rddir2_exit;
+
+	/* 1) If search is active,
+		is in current search buffer?
+		if it before then restart search
+		if after then keep searching till find it */
+
+	cifsFile = file->private_data;
+	if (cifsFile->srch_inf.endOfSearch) {
+		if (cifsFile->srch_inf.emptyDir) {
+			cifs_dbg(FYI, "End of search, empty dir\n");
+			rc = 0;
+			goto rddir2_exit;
+		}
+	} /* else {
+		cifsFile->invalidHandle = true;
+		tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+	} */
+
+	tcon = tlink_tcon(cifsFile->tlink);
+	rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+			     &num_to_fill);
+	if (rc) {
+		cifs_dbg(FYI, "fce error %d\n", rc);
+		goto rddir2_exit;
+	} else if (current_entry != NULL) {
+		cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+	} else {
+		cifs_dbg(FYI, "could not find entry\n");
+		goto rddir2_exit;
+	}
+	cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+		 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+	max_len = tcon->ses->server->ops->calc_smb_size(
+			cifsFile->srch_inf.ntwrk_buf_start);
+	end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+	tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+	if (tmp_buf == NULL) {
+		rc = -ENOMEM;
+		goto rddir2_exit;
+	}
+
+	for (i = 0; i < num_to_fill; i++) {
+		if (current_entry == NULL) {
+			/* evaluate whether this case is an error */
+			cifs_dbg(VFS, "past SMB end,  num to fill %d i %d\n",
+				 num_to_fill, i);
+			break;
+		}
+		/*
+		 * if buggy server returns . and .. late do we want to
+		 * check for that here?
+		 */
+		rc = cifs_filldir(current_entry, file, ctx,
+				  tmp_buf, max_len);
+		if (rc) {
+			if (rc > 0)
+				rc = 0;
+			break;
+		}
+
+		ctx->pos++;
+		if (ctx->pos ==
+			cifsFile->srch_inf.index_of_last_entry) {
+			cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+				 ctx->pos, tmp_buf);
+			cifs_save_resume_key(current_entry, cifsFile);
+			break;
+		} else
+			current_entry =
+				nxt_dir_entry(current_entry, end_of_smb,
+					cifsFile->srch_inf.info_level);
+	}
+	kfree(tmp_buf);
+
+rddir2_exit:
+	free_xid(xid);
+	return rc;
+}
diff --git a/kernel/fs/cifs/rfc1002pdu.h b/kernel/fs/cifs/rfc1002pdu.h
new file mode 100644
index 000000000..8b69fcceb
--- /dev/null
+++ b/kernel/fs/cifs/rfc1002pdu.h
@@ -0,0 +1,74 @@
+/*
+ *   fs/cifs/rfc1002pdu.h
+ *
+ *   Protocol Data Unit definitions for RFC 1001/1002 support
+ *
+ *   Copyright (c) International Business Machines  Corp., 2004
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */
+
+	/* RFC 1002 session packet types */
+#define RFC1002_SESSION_MESSAGE 0x00
+#define RFC1002_SESSION_REQUEST  0x81
+#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82
+#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83
+#define RFC1002_RETARGET_SESSION_RESPONSE 0x84
+#define RFC1002_SESSION_KEEP_ALIVE 0x85
+
+	/* RFC 1002 flags (only one defined */
+#define RFC1002_LENGTH_EXTEND 0x80 /* high order bit of length (ie +64K) */
+
+struct rfc1002_session_packet {
+	__u8	type;
+	__u8	flags;
+	__u16	length;
+	union {
+		struct {
+			__u8 called_len;
+			__u8 called_name[32];
+			__u8 scope1; /* null */
+			__u8 calling_len;
+			__u8 calling_name[32];
+			__u8 scope2; /* null */
+		} __attribute__((packed)) session_req;
+		struct {
+			__u32 retarget_ip_addr;
+			__u16 port;
+		} __attribute__((packed)) retarget_resp;
+		__u8 neg_ses_resp_error_code;
+		/* POSITIVE_SESSION_RESPONSE packet does not include trailer.
+		SESSION_KEEP_ALIVE packet also does not include a trailer.
+		Trailer for the SESSION_MESSAGE packet is SMB/CIFS header */
+	} __attribute__((packed)) trailer;
+} __attribute__((packed));
+
+/* Negative Session Response error codes */
+#define RFC1002_NOT_LISTENING_CALLED  0x80 /* not listening on called name */
+#define RFC1002_NOT_LISTENING_CALLING 0x81 /* not listening on calling name */
+#define RFC1002_NOT_PRESENT           0x82 /* called name not present */
+#define RFC1002_INSUFFICIENT_RESOURCE 0x83
+#define RFC1002_UNSPECIFIED_ERROR     0x8F
+
+/* RFC 1002 Datagram service packets are not defined here as they
+are not needed for the network filesystem client unless we plan on
+implementing broadcast resolution of the server ip address (from
+server netbios name). Currently server names are resolved only via DNS
+(tcp name) or ip address or an /etc/hosts equivalent mapping to ip address.*/
+
+#define DEFAULT_CIFS_CALLED_NAME  "*SMBSERVER      "
diff --git a/kernel/fs/cifs/sess.c b/kernel/fs/cifs/sess.c
new file mode 100644
index 000000000..bce6fdcd5
--- /dev/null
+++ b/kernel/fs/cifs/sess.c
@@ -0,0 +1,1442 @@
+/*
+ *   fs/cifs/sess.c
+ *
+ *   SMB/CIFS session setup handling routines
+ *
+ *   Copyright (c) International Business Machines  Corp., 2006, 2009
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "ntlmssp.h"
+#include "nterr.h"
+#include <linux/utsname.h>
+#include <linux/slab.h>
+#include "cifs_spnego.h"
+
+static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
+{
+	__u32 capabilities = 0;
+
+	/* init fields common to all four types of SessSetup */
+	/* Note that offsets for first seven fields in req struct are same  */
+	/*	in CIFS Specs so does not matter which of 3 forms of struct */
+	/*	that we use in next few lines                               */
+	/* Note that header is initialized to zero in header_assemble */
+	pSMB->req.AndXCommand = 0xFF;
+	pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32,
+					CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
+					USHRT_MAX));
+	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+	pSMB->req.VcNumber = cpu_to_le16(1);
+
+	/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
+
+	/* BB verify whether signing required on neg or just on auth frame
+	   (and NTLM case) */
+
+	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
+			CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
+
+	if (ses->server->sign)
+		pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+	if (ses->capabilities & CAP_UNICODE) {
+		pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE;
+		capabilities |= CAP_UNICODE;
+	}
+	if (ses->capabilities & CAP_STATUS32) {
+		pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS;
+		capabilities |= CAP_STATUS32;
+	}
+	if (ses->capabilities & CAP_DFS) {
+		pSMB->req.hdr.Flags2 |= SMBFLG2_DFS;
+		capabilities |= CAP_DFS;
+	}
+	if (ses->capabilities & CAP_UNIX)
+		capabilities |= CAP_UNIX;
+
+	return capabilities;
+}
+
+static void
+unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
+{
+	char *bcc_ptr = *pbcc_area;
+	int bytes_ret = 0;
+
+	/* Copy OS version */
+	bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32,
+				    nls_cp);
+	bcc_ptr += 2 * bytes_ret;
+	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release,
+				    32, nls_cp);
+	bcc_ptr += 2 * bytes_ret;
+	bcc_ptr += 2; /* trailing null */
+
+	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+				    32, nls_cp);
+	bcc_ptr += 2 * bytes_ret;
+	bcc_ptr += 2; /* trailing null */
+
+	*pbcc_area = bcc_ptr;
+}
+
+static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
+				   const struct nls_table *nls_cp)
+{
+	char *bcc_ptr = *pbcc_area;
+	int bytes_ret = 0;
+
+	/* copy domain */
+	if (ses->domainName == NULL) {
+		/* Sending null domain better than using a bogus domain name (as
+		we did briefly in 2.6.18) since server will use its default */
+		*bcc_ptr = 0;
+		*(bcc_ptr+1) = 0;
+		bytes_ret = 0;
+	} else
+		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
+					    CIFS_MAX_DOMAINNAME_LEN, nls_cp);
+	bcc_ptr += 2 * bytes_ret;
+	bcc_ptr += 2;  /* account for null terminator */
+
+	*pbcc_area = bcc_ptr;
+}
+
+
+static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
+				   const struct nls_table *nls_cp)
+{
+	char *bcc_ptr = *pbcc_area;
+	int bytes_ret = 0;
+
+	/* BB FIXME add check that strings total less
+	than 335 or will need to send them as arrays */
+
+	/* unicode strings, must be word aligned before the call */
+/*	if ((long) bcc_ptr % 2)	{
+		*bcc_ptr = 0;
+		bcc_ptr++;
+	} */
+	/* copy user */
+	if (ses->user_name == NULL) {
+		/* null user mount */
+		*bcc_ptr = 0;
+		*(bcc_ptr+1) = 0;
+	} else {
+		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
+					    CIFS_MAX_USERNAME_LEN, nls_cp);
+	}
+	bcc_ptr += 2 * bytes_ret;
+	bcc_ptr += 2; /* account for null termination */
+
+	unicode_domain_string(&bcc_ptr, ses, nls_cp);
+	unicode_oslm_strings(&bcc_ptr, nls_cp);
+
+	*pbcc_area = bcc_ptr;
+}
+
+static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
+				 const struct nls_table *nls_cp)
+{
+	char *bcc_ptr = *pbcc_area;
+
+	/* copy user */
+	/* BB what about null user mounts - check that we do this BB */
+	/* copy user */
+	if (ses->user_name != NULL) {
+		strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN);
+		bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN);
+	}
+	/* else null user mount */
+	*bcc_ptr = 0;
+	bcc_ptr++; /* account for null termination */
+
+	/* copy domain */
+	if (ses->domainName != NULL) {
+		strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+		bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+	} /* else we will send a null domain name
+	     so the server will default to its own domain */
+	*bcc_ptr = 0;
+	bcc_ptr++;
+
+	/* BB check for overflow here */
+
+	strcpy(bcc_ptr, "Linux version ");
+	bcc_ptr += strlen("Linux version ");
+	strcpy(bcc_ptr, init_utsname()->release);
+	bcc_ptr += strlen(init_utsname()->release) + 1;
+
+	strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
+	bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+
+	*pbcc_area = bcc_ptr;
+}
+
+static void
+decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
+		      const struct nls_table *nls_cp)
+{
+	int len;
+	char *data = *pbcc_area;
+
+	cifs_dbg(FYI, "bleft %d\n", bleft);
+
+	kfree(ses->serverOS);
+	ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
+	cifs_dbg(FYI, "serverOS=%s\n", ses->serverOS);
+	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
+	data += len;
+	bleft -= len;
+	if (bleft <= 0)
+		return;
+
+	kfree(ses->serverNOS);
+	ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
+	cifs_dbg(FYI, "serverNOS=%s\n", ses->serverNOS);
+	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
+	data += len;
+	bleft -= len;
+	if (bleft <= 0)
+		return;
+
+	kfree(ses->serverDomain);
+	ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
+	cifs_dbg(FYI, "serverDomain=%s\n", ses->serverDomain);
+
+	return;
+}
+
+static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
+				struct cifs_ses *ses,
+				const struct nls_table *nls_cp)
+{
+	int len;
+	char *bcc_ptr = *pbcc_area;
+
+	cifs_dbg(FYI, "decode sessetup ascii. bleft %d\n", bleft);
+
+	len = strnlen(bcc_ptr, bleft);
+	if (len >= bleft)
+		return;
+
+	kfree(ses->serverOS);
+
+	ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
+	if (ses->serverOS) {
+		strncpy(ses->serverOS, bcc_ptr, len);
+		if (strncmp(ses->serverOS, "OS/2", 4) == 0)
+			cifs_dbg(FYI, "OS/2 server\n");
+	}
+
+	bcc_ptr += len + 1;
+	bleft -= len + 1;
+
+	len = strnlen(bcc_ptr, bleft);
+	if (len >= bleft)
+		return;
+
+	kfree(ses->serverNOS);
+
+	ses->serverNOS = kzalloc(len + 1, GFP_KERNEL);
+	if (ses->serverNOS)
+		strncpy(ses->serverNOS, bcc_ptr, len);
+
+	bcc_ptr += len + 1;
+	bleft -= len + 1;
+
+	len = strnlen(bcc_ptr, bleft);
+	if (len > bleft)
+		return;
+
+	/* No domain field in LANMAN case. Domain is
+	   returned by old servers in the SMB negprot response */
+	/* BB For newer servers which do not support Unicode,
+	   but thus do return domain here we could add parsing
+	   for it later, but it is not very important */
+	cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
+}
+
+int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
+				    struct cifs_ses *ses)
+{
+	unsigned int tioffset; /* challenge message target info area */
+	unsigned int tilen; /* challenge message target info area length  */
+
+	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
+
+	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
+		cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len);
+		return -EINVAL;
+	}
+
+	if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
+		cifs_dbg(VFS, "blob signature incorrect %s\n",
+			 pblob->Signature);
+		return -EINVAL;
+	}
+	if (pblob->MessageType != NtLmChallenge) {
+		cifs_dbg(VFS, "Incorrect message type %d\n",
+			 pblob->MessageType);
+		return -EINVAL;
+	}
+
+	memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
+	/* BB we could decode pblob->NegotiateFlags; some may be useful */
+	/* In particular we can examine sign flags */
+	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
+		we must set the MIC field of the AUTHENTICATE_MESSAGE */
+	ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
+	tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
+	tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
+	if (tioffset > blob_len || tioffset + tilen > blob_len) {
+		cifs_dbg(VFS, "tioffset + tilen too high %u + %u",
+			tioffset, tilen);
+		return -EINVAL;
+	}
+	if (tilen) {
+		ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen,
+						 GFP_KERNEL);
+		if (!ses->auth_key.response) {
+			cifs_dbg(VFS, "Challenge target info alloc failure");
+			return -ENOMEM;
+		}
+		ses->auth_key.len = tilen;
+	}
+
+	return 0;
+}
+
+/* BB Move to ntlmssp.c eventually */
+
+/* We do not malloc the blob, it is passed in pbuffer, because
+   it is fixed size, and small, making this approach cleaner */
+void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
+					 struct cifs_ses *ses)
+{
+	NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
+	__u32 flags;
+
+	memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
+	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
+	sec_blob->MessageType = NtLmNegotiate;
+
+	/* BB is NTLMV2 session security format easier to use here? */
+	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
+		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
+		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+	if (ses->server->sign) {
+		flags |= NTLMSSP_NEGOTIATE_SIGN;
+		if (!ses->server->session_estab ||
+				ses->ntlmssp->sesskey_per_smbsess)
+			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+	}
+
+	sec_blob->NegotiateFlags = cpu_to_le32(flags);
+
+	sec_blob->WorkstationName.BufferOffset = 0;
+	sec_blob->WorkstationName.Length = 0;
+	sec_blob->WorkstationName.MaximumLength = 0;
+
+	/* Domain name is sent on the Challenge not Negotiate NTLMSSP request */
+	sec_blob->DomainName.BufferOffset = 0;
+	sec_blob->DomainName.Length = 0;
+	sec_blob->DomainName.MaximumLength = 0;
+}
+
+/* We do not malloc the blob, it is passed in pbuffer, because its
+   maximum possible size is fixed and small, making this approach cleaner.
+   This function returns the length of the data in the blob */
+int build_ntlmssp_auth_blob(unsigned char *pbuffer,
+					u16 *buflen,
+				   struct cifs_ses *ses,
+				   const struct nls_table *nls_cp)
+{
+	int rc;
+	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
+	__u32 flags;
+	unsigned char *tmp;
+
+	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
+	sec_blob->MessageType = NtLmAuthenticate;
+
+	flags = NTLMSSP_NEGOTIATE_56 |
+		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
+		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
+		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+	if (ses->server->sign) {
+		flags |= NTLMSSP_NEGOTIATE_SIGN;
+		if (!ses->server->session_estab ||
+				ses->ntlmssp->sesskey_per_smbsess)
+			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+	}
+
+	tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
+	sec_blob->NegotiateFlags = cpu_to_le32(flags);
+
+	sec_blob->LmChallengeResponse.BufferOffset =
+				cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
+	sec_blob->LmChallengeResponse.Length = 0;
+	sec_blob->LmChallengeResponse.MaximumLength = 0;
+
+	sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
+	rc = setup_ntlmv2_rsp(ses, nls_cp);
+	if (rc) {
+		cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc);
+		goto setup_ntlmv2_ret;
+	}
+	memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+			ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+	tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+
+	sec_blob->NtChallengeResponse.Length =
+			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+	sec_blob->NtChallengeResponse.MaximumLength =
+			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+
+	if (ses->domainName == NULL) {
+		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->DomainName.Length = 0;
+		sec_blob->DomainName.MaximumLength = 0;
+		tmp += 2;
+	} else {
+		int len;
+		len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
+				      CIFS_MAX_USERNAME_LEN, nls_cp);
+		len *= 2; /* unicode is 2 bytes each */
+		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->DomainName.Length = cpu_to_le16(len);
+		sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
+		tmp += len;
+	}
+
+	if (ses->user_name == NULL) {
+		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->UserName.Length = 0;
+		sec_blob->UserName.MaximumLength = 0;
+		tmp += 2;
+	} else {
+		int len;
+		len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
+				      CIFS_MAX_USERNAME_LEN, nls_cp);
+		len *= 2; /* unicode is 2 bytes each */
+		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->UserName.Length = cpu_to_le16(len);
+		sec_blob->UserName.MaximumLength = cpu_to_le16(len);
+		tmp += len;
+	}
+
+	sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+	sec_blob->WorkstationName.Length = 0;
+	sec_blob->WorkstationName.MaximumLength = 0;
+	tmp += 2;
+
+	if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
+		(ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
+			&& !calc_seckey(ses)) {
+		memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
+		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
+		sec_blob->SessionKey.MaximumLength =
+				cpu_to_le16(CIFS_CPHTXT_SIZE);
+		tmp += CIFS_CPHTXT_SIZE;
+	} else {
+		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->SessionKey.Length = 0;
+		sec_blob->SessionKey.MaximumLength = 0;
+	}
+
+setup_ntlmv2_ret:
+	*buflen = tmp - pbuffer;
+	return rc;
+}
+
+enum securityEnum
+select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
+{
+	switch (server->negflavor) {
+	case CIFS_NEGFLAVOR_EXTENDED:
+		switch (requested) {
+		case Kerberos:
+		case RawNTLMSSP:
+			return requested;
+		case Unspecified:
+			if (server->sec_ntlmssp &&
+			    (global_secflags & CIFSSEC_MAY_NTLMSSP))
+				return RawNTLMSSP;
+			if ((server->sec_kerberos || server->sec_mskerberos) &&
+			    (global_secflags & CIFSSEC_MAY_KRB5))
+				return Kerberos;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	case CIFS_NEGFLAVOR_UNENCAP:
+		switch (requested) {
+		case NTLM:
+		case NTLMv2:
+			return requested;
+		case Unspecified:
+			if (global_secflags & CIFSSEC_MAY_NTLMV2)
+				return NTLMv2;
+			if (global_secflags & CIFSSEC_MAY_NTLM)
+				return NTLM;
+		default:
+			/* Fallthrough to attempt LANMAN authentication next */
+			break;
+		}
+	case CIFS_NEGFLAVOR_LANMAN:
+		switch (requested) {
+		case LANMAN:
+			return requested;
+		case Unspecified:
+			if (global_secflags & CIFSSEC_MAY_LANMAN)
+				return LANMAN;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	default:
+		return Unspecified;
+	}
+}
+
+struct sess_data {
+	unsigned int xid;
+	struct cifs_ses *ses;
+	struct nls_table *nls_cp;
+	void (*func)(struct sess_data *);
+	int result;
+
+	/* we will send the SMB in three pieces:
+	 * a fixed length beginning part, an optional
+	 * SPNEGO blob (which can be zero length), and a
+	 * last part which will include the strings
+	 * and rest of bcc area. This allows us to avoid
+	 * a large buffer 17K allocation
+	 */
+	int buf0_type;
+	struct kvec iov[3];
+};
+
+static int
+sess_alloc_buffer(struct sess_data *sess_data, int wct)
+{
+	int rc;
+	struct cifs_ses *ses = sess_data->ses;
+	struct smb_hdr *smb_buf;
+
+	rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
+				  (void **)&smb_buf);
+
+	if (rc)
+		return rc;
+
+	sess_data->iov[0].iov_base = (char *)smb_buf;
+	sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
+	/*
+	 * This variable will be used to clear the buffer
+	 * allocated above in case of any error in the calling function.
+	 */
+	sess_data->buf0_type = CIFS_SMALL_BUFFER;
+
+	/* 2000 big enough to fit max user, domain, NOS name etc. */
+	sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL);
+	if (!sess_data->iov[2].iov_base) {
+		rc = -ENOMEM;
+		goto out_free_smb_buf;
+	}
+
+	return 0;
+
+out_free_smb_buf:
+	kfree(smb_buf);
+	sess_data->iov[0].iov_base = NULL;
+	sess_data->iov[0].iov_len = 0;
+	sess_data->buf0_type = CIFS_NO_BUFFER;
+	return rc;
+}
+
+static void
+sess_free_buffer(struct sess_data *sess_data)
+{
+
+	free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
+	sess_data->buf0_type = CIFS_NO_BUFFER;
+	kfree(sess_data->iov[2].iov_base);
+}
+
+static int
+sess_establish_session(struct sess_data *sess_data)
+{
+	struct cifs_ses *ses = sess_data->ses;
+
+	mutex_lock(&ses->server->srv_mutex);
+	if (!ses->server->session_estab) {
+		if (ses->server->sign) {
+			ses->server->session_key.response =
+				kmemdup(ses->auth_key.response,
+				ses->auth_key.len, GFP_KERNEL);
+			if (!ses->server->session_key.response) {
+				mutex_unlock(&ses->server->srv_mutex);
+				return -ENOMEM;
+			}
+			ses->server->session_key.len =
+						ses->auth_key.len;
+		}
+		ses->server->sequence_number = 0x2;
+		ses->server->session_estab = true;
+	}
+	mutex_unlock(&ses->server->srv_mutex);
+
+	cifs_dbg(FYI, "CIFS session established successfully\n");
+	spin_lock(&GlobalMid_Lock);
+	ses->status = CifsGood;
+	ses->need_reconnect = false;
+	spin_unlock(&GlobalMid_Lock);
+
+	return 0;
+}
+
+static int
+sess_sendreceive(struct sess_data *sess_data)
+{
+	int rc;
+	struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base;
+	__u16 count;
+
+	count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len;
+	smb_buf->smb_buf_length =
+		cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
+	put_bcc(count, smb_buf);
+
+	rc = SendReceive2(sess_data->xid, sess_data->ses,
+			  sess_data->iov, 3 /* num_iovecs */,
+			  &sess_data->buf0_type,
+			  CIFS_LOG_ERROR);
+
+	return rc;
+}
+
+/*
+ * LANMAN and plaintext are less secure and off by default.
+ * So we make this explicitly be turned on in kconfig (in the
+ * build) and turned on at runtime (changed from the default)
+ * in proc/fs/cifs or via mount parm.  Unfortunately this is
+ * needed for old Win (e.g. Win95), some obscure NAS and OS/2
+ */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+static void
+sess_auth_lanman(struct sess_data *sess_data)
+{
+	int rc = 0;
+	struct smb_hdr *smb_buf;
+	SESSION_SETUP_ANDX *pSMB;
+	char *bcc_ptr;
+	struct cifs_ses *ses = sess_data->ses;
+	char lnm_session_key[CIFS_AUTH_RESP_SIZE];
+	__u32 capabilities;
+	__u16 bytes_remaining;
+
+	/* lanman 2 style sessionsetup */
+	/* wct = 10 */
+	rc = sess_alloc_buffer(sess_data, 10);
+	if (rc)
+		goto out;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	bcc_ptr = sess_data->iov[2].iov_base;
+	capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+	pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
+
+	/* no capabilities flags in old lanman negotiation */
+	pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+
+	/* Calculate hash with password and copy into bcc_ptr.
+	 * Encryption Key (stored as in cryptkey) gets used if the
+	 * security mode bit in Negottiate Protocol response states
+	 * to use challenge/response method (i.e. Password bit is 1).
+	 */
+	rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
+			      ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
+			      true : false, lnm_session_key);
+
+	memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
+	bcc_ptr += CIFS_AUTH_RESP_SIZE;
+
+	/*
+	 * can not sign if LANMAN negotiated so no need
+	 * to calculate signing key? but what if server
+	 * changed to do higher than lanman dialect and
+	 * we reconnected would we ever calc signing_key?
+	 */
+
+	cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
+	/* Unicode not allowed for LANMAN dialects */
+	ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+
+	sess_data->iov[2].iov_len = (long) bcc_ptr -
+			(long) sess_data->iov[2].iov_base;
+
+	rc = sess_sendreceive(sess_data);
+	if (rc)
+		goto out;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+	/* lanman response has a word count of 3 */
+	if (smb_buf->WordCount != 3) {
+		rc = -EIO;
+		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+		goto out;
+	}
+
+	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
+	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+	bytes_remaining = get_bcc(smb_buf);
+	bcc_ptr = pByteArea(smb_buf);
+
+	/* BB check if Unicode and decode strings */
+	if (bytes_remaining == 0) {
+		/* no string area to decode, do nothing */
+	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+		/* unicode string area must be word-aligned */
+		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+			++bcc_ptr;
+			--bytes_remaining;
+		}
+		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+				      sess_data->nls_cp);
+	} else {
+		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+				    sess_data->nls_cp);
+	}
+
+	rc = sess_establish_session(sess_data);
+out:
+	sess_data->result = rc;
+	sess_data->func = NULL;
+	sess_free_buffer(sess_data);
+}
+
+#endif
+
+static void
+sess_auth_ntlm(struct sess_data *sess_data)
+{
+	int rc = 0;
+	struct smb_hdr *smb_buf;
+	SESSION_SETUP_ANDX *pSMB;
+	char *bcc_ptr;
+	struct cifs_ses *ses = sess_data->ses;
+	__u32 capabilities;
+	__u16 bytes_remaining;
+
+	/* old style NTLM sessionsetup */
+	/* wct = 13 */
+	rc = sess_alloc_buffer(sess_data, 13);
+	if (rc)
+		goto out;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	bcc_ptr = sess_data->iov[2].iov_base;
+	capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+	pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+	pSMB->req_no_secext.CaseInsensitivePasswordLength =
+			cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+	pSMB->req_no_secext.CaseSensitivePasswordLength =
+			cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+
+	/* calculate ntlm response and session key */
+	rc = setup_ntlm_response(ses, sess_data->nls_cp);
+	if (rc) {
+		cifs_dbg(VFS, "Error %d during NTLM authentication\n",
+				 rc);
+		goto out;
+	}
+
+	/* copy ntlm response */
+	memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+			CIFS_AUTH_RESP_SIZE);
+	bcc_ptr += CIFS_AUTH_RESP_SIZE;
+	memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+			CIFS_AUTH_RESP_SIZE);
+	bcc_ptr += CIFS_AUTH_RESP_SIZE;
+
+	if (ses->capabilities & CAP_UNICODE) {
+		/* unicode strings must be word aligned */
+		if (sess_data->iov[0].iov_len % 2) {
+			*bcc_ptr = 0;
+			bcc_ptr++;
+		}
+		unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+	} else {
+		ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+	}
+
+
+	sess_data->iov[2].iov_len = (long) bcc_ptr -
+			(long) sess_data->iov[2].iov_base;
+
+	rc = sess_sendreceive(sess_data);
+	if (rc)
+		goto out;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+	if (smb_buf->WordCount != 3) {
+		rc = -EIO;
+		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+		goto out;
+	}
+
+	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
+	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+	bytes_remaining = get_bcc(smb_buf);
+	bcc_ptr = pByteArea(smb_buf);
+
+	/* BB check if Unicode and decode strings */
+	if (bytes_remaining == 0) {
+		/* no string area to decode, do nothing */
+	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+		/* unicode string area must be word-aligned */
+		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+			++bcc_ptr;
+			--bytes_remaining;
+		}
+		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+				      sess_data->nls_cp);
+	} else {
+		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+				    sess_data->nls_cp);
+	}
+
+	rc = sess_establish_session(sess_data);
+out:
+	sess_data->result = rc;
+	sess_data->func = NULL;
+	sess_free_buffer(sess_data);
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+}
+
+static void
+sess_auth_ntlmv2(struct sess_data *sess_data)
+{
+	int rc = 0;
+	struct smb_hdr *smb_buf;
+	SESSION_SETUP_ANDX *pSMB;
+	char *bcc_ptr;
+	struct cifs_ses *ses = sess_data->ses;
+	__u32 capabilities;
+	__u16 bytes_remaining;
+
+	/* old style NTLM sessionsetup */
+	/* wct = 13 */
+	rc = sess_alloc_buffer(sess_data, 13);
+	if (rc)
+		goto out;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	bcc_ptr = sess_data->iov[2].iov_base;
+	capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+	pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+
+	/* LM2 password would be here if we supported it */
+	pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
+
+	/* calculate nlmv2 response and session key */
+	rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
+	if (rc) {
+		cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
+		goto out;
+	}
+
+	memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+			ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+	bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+
+	/* set case sensitive password length after tilen may get
+	 * assigned, tilen is 0 otherwise.
+	 */
+	pSMB->req_no_secext.CaseSensitivePasswordLength =
+		cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+
+	if (ses->capabilities & CAP_UNICODE) {
+		if (sess_data->iov[0].iov_len % 2) {
+			*bcc_ptr = 0;
+			bcc_ptr++;
+		}
+		unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+	} else {
+		ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+	}
+
+
+	sess_data->iov[2].iov_len = (long) bcc_ptr -
+			(long) sess_data->iov[2].iov_base;
+
+	rc = sess_sendreceive(sess_data);
+	if (rc)
+		goto out;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+	if (smb_buf->WordCount != 3) {
+		rc = -EIO;
+		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+		goto out;
+	}
+
+	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
+	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+	bytes_remaining = get_bcc(smb_buf);
+	bcc_ptr = pByteArea(smb_buf);
+
+	/* BB check if Unicode and decode strings */
+	if (bytes_remaining == 0) {
+		/* no string area to decode, do nothing */
+	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+		/* unicode string area must be word-aligned */
+		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+			++bcc_ptr;
+			--bytes_remaining;
+		}
+		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+				      sess_data->nls_cp);
+	} else {
+		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+				    sess_data->nls_cp);
+	}
+
+	rc = sess_establish_session(sess_data);
+out:
+	sess_data->result = rc;
+	sess_data->func = NULL;
+	sess_free_buffer(sess_data);
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+}
+
+#ifdef CONFIG_CIFS_UPCALL
+static void
+sess_auth_kerberos(struct sess_data *sess_data)
+{
+	int rc = 0;
+	struct smb_hdr *smb_buf;
+	SESSION_SETUP_ANDX *pSMB;
+	char *bcc_ptr;
+	struct cifs_ses *ses = sess_data->ses;
+	__u32 capabilities;
+	__u16 bytes_remaining;
+	struct key *spnego_key = NULL;
+	struct cifs_spnego_msg *msg;
+	u16 blob_len;
+
+	/* extended security */
+	/* wct = 12 */
+	rc = sess_alloc_buffer(sess_data, 12);
+	if (rc)
+		goto out;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	bcc_ptr = sess_data->iov[2].iov_base;
+	capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+	spnego_key = cifs_get_spnego_key(ses);
+	if (IS_ERR(spnego_key)) {
+		rc = PTR_ERR(spnego_key);
+		spnego_key = NULL;
+		goto out;
+	}
+
+	msg = spnego_key->payload.data;
+	/*
+	 * check version field to make sure that cifs.upcall is
+	 * sending us a response in an expected form
+	 */
+	if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+		cifs_dbg(VFS,
+		  "incorrect version of cifs.upcall (expected %d but got %d)",
+			      CIFS_SPNEGO_UPCALL_VERSION, msg->version);
+		rc = -EKEYREJECTED;
+		goto out_put_spnego_key;
+	}
+
+	ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+					 GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory",
+				msg->sesskey_len);
+		rc = -ENOMEM;
+		goto out_put_spnego_key;
+	}
+	ses->auth_key.len = msg->sesskey_len;
+
+	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+	capabilities |= CAP_EXTENDED_SECURITY;
+	pSMB->req.Capabilities = cpu_to_le32(capabilities);
+	sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
+	sess_data->iov[1].iov_len = msg->secblob_len;
+	pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
+
+	if (ses->capabilities & CAP_UNICODE) {
+		/* unicode strings must be word aligned */
+		if ((sess_data->iov[0].iov_len
+			+ sess_data->iov[1].iov_len) % 2) {
+			*bcc_ptr = 0;
+			bcc_ptr++;
+		}
+		unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+		unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
+	} else {
+		/* BB: is this right? */
+		ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+	}
+
+	sess_data->iov[2].iov_len = (long) bcc_ptr -
+			(long) sess_data->iov[2].iov_base;
+
+	rc = sess_sendreceive(sess_data);
+	if (rc)
+		goto out_put_spnego_key;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+	if (smb_buf->WordCount != 4) {
+		rc = -EIO;
+		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+		goto out_put_spnego_key;
+	}
+
+	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
+	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+	bytes_remaining = get_bcc(smb_buf);
+	bcc_ptr = pByteArea(smb_buf);
+
+	blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+	if (blob_len > bytes_remaining) {
+		cifs_dbg(VFS, "bad security blob length %d\n",
+				blob_len);
+		rc = -EINVAL;
+		goto out_put_spnego_key;
+	}
+	bcc_ptr += blob_len;
+	bytes_remaining -= blob_len;
+
+	/* BB check if Unicode and decode strings */
+	if (bytes_remaining == 0) {
+		/* no string area to decode, do nothing */
+	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+		/* unicode string area must be word-aligned */
+		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+			++bcc_ptr;
+			--bytes_remaining;
+		}
+		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+				      sess_data->nls_cp);
+	} else {
+		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+				    sess_data->nls_cp);
+	}
+
+	rc = sess_establish_session(sess_data);
+out_put_spnego_key:
+	key_invalidate(spnego_key);
+	key_put(spnego_key);
+out:
+	sess_data->result = rc;
+	sess_data->func = NULL;
+	sess_free_buffer(sess_data);
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+}
+
+#endif /* ! CONFIG_CIFS_UPCALL */
+
+/*
+ * The required kvec buffers have to be allocated before calling this
+ * function.
+ */
+static int
+_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
+{
+	struct smb_hdr *smb_buf;
+	SESSION_SETUP_ANDX *pSMB;
+	struct cifs_ses *ses = sess_data->ses;
+	__u32 capabilities;
+	char *bcc_ptr;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)pSMB;
+
+	capabilities = cifs_ssetup_hdr(ses, pSMB);
+	if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
+		cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
+		return -ENOSYS;
+	}
+
+	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+	capabilities |= CAP_EXTENDED_SECURITY;
+	pSMB->req.Capabilities |= cpu_to_le32(capabilities);
+
+	bcc_ptr = sess_data->iov[2].iov_base;
+	/* unicode strings must be word aligned */
+	if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
+		*bcc_ptr = 0;
+		bcc_ptr++;
+	}
+	unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+
+	sess_data->iov[2].iov_len = (long) bcc_ptr -
+					(long) sess_data->iov[2].iov_base;
+
+	return 0;
+}
+
+static void
+sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data);
+
+static void
+sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
+{
+	int rc;
+	struct smb_hdr *smb_buf;
+	SESSION_SETUP_ANDX *pSMB;
+	struct cifs_ses *ses = sess_data->ses;
+	__u16 bytes_remaining;
+	char *bcc_ptr;
+	u16 blob_len;
+
+	cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n");
+
+	/*
+	 * if memory allocation is successful, caller of this function
+	 * frees it.
+	 */
+	ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+	if (!ses->ntlmssp) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	ses->ntlmssp->sesskey_per_smbsess = false;
+
+	/* wct = 12 */
+	rc = sess_alloc_buffer(sess_data, 12);
+	if (rc)
+		goto out;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+
+	/* Build security blob before we assemble the request */
+	build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses);
+	sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+	sess_data->iov[1].iov_base = pSMB->req.SecurityBlob;
+	pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+
+	rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
+	if (rc)
+		goto out;
+
+	rc = sess_sendreceive(sess_data);
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+	/* If true, rc here is expected and not an error */
+	if (sess_data->buf0_type != CIFS_NO_BUFFER &&
+	    smb_buf->Status.CifsError ==
+			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
+		rc = 0;
+
+	if (rc)
+		goto out;
+
+	cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
+
+	if (smb_buf->WordCount != 4) {
+		rc = -EIO;
+		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+		goto out;
+	}
+
+	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
+	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+	bytes_remaining = get_bcc(smb_buf);
+	bcc_ptr = pByteArea(smb_buf);
+
+	blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+	if (blob_len > bytes_remaining) {
+		cifs_dbg(VFS, "bad security blob length %d\n",
+				blob_len);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
+out:
+	sess_free_buffer(sess_data);
+
+	if (!rc) {
+		sess_data->func = sess_auth_rawntlmssp_authenticate;
+		return;
+	}
+
+	/* Else error. Cleanup */
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+	kfree(ses->ntlmssp);
+	ses->ntlmssp = NULL;
+
+	sess_data->func = NULL;
+	sess_data->result = rc;
+}
+
+static void
+sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
+{
+	int rc;
+	struct smb_hdr *smb_buf;
+	SESSION_SETUP_ANDX *pSMB;
+	struct cifs_ses *ses = sess_data->ses;
+	__u16 bytes_remaining;
+	char *bcc_ptr;
+	char *ntlmsspblob = NULL;
+	u16 blob_len;
+
+	cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n");
+
+	/* wct = 12 */
+	rc = sess_alloc_buffer(sess_data, 12);
+	if (rc)
+		goto out;
+
+	/* Build security blob before we assemble the request */
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)pSMB;
+	/*
+	 * 5 is an empirical value, large enough to hold
+	 * authenticate message plus max 10 of av paris,
+	 * domain, user, workstation names, flags, etc.
+	 */
+	ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE),
+				GFP_KERNEL);
+	if (!ntlmsspblob) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = build_ntlmssp_auth_blob(ntlmsspblob,
+					&blob_len, ses, sess_data->nls_cp);
+	if (rc)
+		goto out_free_ntlmsspblob;
+	sess_data->iov[1].iov_len = blob_len;
+	sess_data->iov[1].iov_base = ntlmsspblob;
+	pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
+	/*
+	 * Make sure that we tell the server that we are using
+	 * the uid that it just gave us back on the response
+	 * (challenge)
+	 */
+	smb_buf->Uid = ses->Suid;
+
+	rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
+	if (rc)
+		goto out_free_ntlmsspblob;
+
+	rc = sess_sendreceive(sess_data);
+	if (rc)
+		goto out_free_ntlmsspblob;
+
+	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+	if (smb_buf->WordCount != 4) {
+		rc = -EIO;
+		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+		goto out_free_ntlmsspblob;
+	}
+
+	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+	if (ses->Suid != smb_buf->Uid) {
+		ses->Suid = smb_buf->Uid;
+		cifs_dbg(FYI, "UID changed! new UID = %llu\n", ses->Suid);
+	}
+
+	bytes_remaining = get_bcc(smb_buf);
+	bcc_ptr = pByteArea(smb_buf);
+	blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+	if (blob_len > bytes_remaining) {
+		cifs_dbg(VFS, "bad security blob length %d\n",
+				blob_len);
+		rc = -EINVAL;
+		goto out_free_ntlmsspblob;
+	}
+	bcc_ptr += blob_len;
+	bytes_remaining -= blob_len;
+
+
+	/* BB check if Unicode and decode strings */
+	if (bytes_remaining == 0) {
+		/* no string area to decode, do nothing */
+	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+		/* unicode string area must be word-aligned */
+		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+			++bcc_ptr;
+			--bytes_remaining;
+		}
+		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+				      sess_data->nls_cp);
+	} else {
+		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+				    sess_data->nls_cp);
+	}
+
+out_free_ntlmsspblob:
+	kfree(ntlmsspblob);
+out:
+	sess_free_buffer(sess_data);
+
+	 if (!rc)
+		rc = sess_establish_session(sess_data);
+
+	/* Cleanup */
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+	kfree(ses->ntlmssp);
+	ses->ntlmssp = NULL;
+
+	sess_data->func = NULL;
+	sess_data->result = rc;
+}
+
+static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
+{
+	int type;
+
+	type = select_sectype(ses->server, ses->sectype);
+	cifs_dbg(FYI, "sess setup type %d\n", type);
+	if (type == Unspecified) {
+		cifs_dbg(VFS,
+			"Unable to select appropriate authentication method!");
+		return -EINVAL;
+	}
+
+	switch (type) {
+	case LANMAN:
+		/* LANMAN and plaintext are less secure and off by default.
+		 * So we make this explicitly be turned on in kconfig (in the
+		 * build) and turned on at runtime (changed from the default)
+		 * in proc/fs/cifs or via mount parm.  Unfortunately this is
+		 * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+		sess_data->func = sess_auth_lanman;
+		break;
+#else
+		return -EOPNOTSUPP;
+#endif
+	case NTLM:
+		sess_data->func = sess_auth_ntlm;
+		break;
+	case NTLMv2:
+		sess_data->func = sess_auth_ntlmv2;
+		break;
+	case Kerberos:
+#ifdef CONFIG_CIFS_UPCALL
+		sess_data->func = sess_auth_kerberos;
+		break;
+#else
+		cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
+		return -ENOSYS;
+		break;
+#endif /* CONFIG_CIFS_UPCALL */
+	case RawNTLMSSP:
+		sess_data->func = sess_auth_rawntlmssp_negotiate;
+		break;
+	default:
+		cifs_dbg(VFS, "secType %d not supported!\n", type);
+		return -ENOSYS;
+	}
+
+	return 0;
+}
+
+int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
+		    const struct nls_table *nls_cp)
+{
+	int rc = 0;
+	struct sess_data *sess_data;
+
+	if (ses == NULL) {
+		WARN(1, "%s: ses == NULL!", __func__);
+		return -EINVAL;
+	}
+
+	sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL);
+	if (!sess_data)
+		return -ENOMEM;
+
+	rc = select_sec(ses, sess_data);
+	if (rc)
+		goto out;
+
+	sess_data->xid = xid;
+	sess_data->ses = ses;
+	sess_data->buf0_type = CIFS_NO_BUFFER;
+	sess_data->nls_cp = (struct nls_table *) nls_cp;
+
+	while (sess_data->func)
+		sess_data->func(sess_data);
+
+	/* Store result before we free sess_data */
+	rc = sess_data->result;
+
+out:
+	kfree(sess_data);
+	return rc;
+}
diff --git a/kernel/fs/cifs/smb1ops.c b/kernel/fs/cifs/smb1ops.c
new file mode 100644
index 000000000..fc537c290
--- /dev/null
+++ b/kernel/fs/cifs/smb1ops.c
@@ -0,0 +1,1116 @@
+/*
+ *  SMB1 (CIFS) version specific operations
+ *
+ *  Copyright (c) 2012, Jeff Layton <jlayton@redhat.com>
+ *
+ *  This library is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License v2 as published
+ *  by the Free Software Foundation.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *  the GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/pagemap.h>
+#include <linux/vfs.h>
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifspdu.h"
+#include "cifs_unicode.h"
+
+/*
+ * An NT cancel request header looks just like the original request except:
+ *
+ * The Command is SMB_COM_NT_CANCEL
+ * The WordCount is zeroed out
+ * The ByteCount is zeroed out
+ *
+ * This function mangles an existing request buffer into a
+ * SMB_COM_NT_CANCEL request and then sends it.
+ */
+static int
+send_nt_cancel(struct TCP_Server_Info *server, void *buf,
+	       struct mid_q_entry *mid)
+{
+	int rc = 0;
+	struct smb_hdr *in_buf = (struct smb_hdr *)buf;
+
+	/* -4 for RFC1001 length and +2 for BCC field */
+	in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4  + 2);
+	in_buf->Command = SMB_COM_NT_CANCEL;
+	in_buf->WordCount = 0;
+	put_bcc(0, in_buf);
+
+	mutex_lock(&server->srv_mutex);
+	rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+	if (rc) {
+		mutex_unlock(&server->srv_mutex);
+		return rc;
+	}
+
+	/*
+	 * The response to this call was already factored into the sequence
+	 * number when the call went out, so we must adjust it back downward
+	 * after signing here.
+	 */
+	--server->sequence_number;
+	rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+	if (rc < 0)
+		server->sequence_number--;
+
+	mutex_unlock(&server->srv_mutex);
+
+	cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
+		 get_mid(in_buf), rc);
+
+	return rc;
+}
+
+static bool
+cifs_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2)
+{
+	return ob1->fid.netfid == ob2->fid.netfid;
+}
+
+static unsigned int
+cifs_read_data_offset(char *buf)
+{
+	READ_RSP *rsp = (READ_RSP *)buf;
+	return le16_to_cpu(rsp->DataOffset);
+}
+
+static unsigned int
+cifs_read_data_length(char *buf)
+{
+	READ_RSP *rsp = (READ_RSP *)buf;
+	return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
+	       le16_to_cpu(rsp->DataLength);
+}
+
+static struct mid_q_entry *
+cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
+{
+	struct smb_hdr *buf = (struct smb_hdr *)buffer;
+	struct mid_q_entry *mid;
+
+	spin_lock(&GlobalMid_Lock);
+	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+		if (compare_mid(mid->mid, buf) &&
+		    mid->mid_state == MID_REQUEST_SUBMITTED &&
+		    le16_to_cpu(mid->command) == buf->Command) {
+			spin_unlock(&GlobalMid_Lock);
+			return mid;
+		}
+	}
+	spin_unlock(&GlobalMid_Lock);
+	return NULL;
+}
+
+static void
+cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add,
+		 const int optype)
+{
+	spin_lock(&server->req_lock);
+	server->credits += add;
+	server->in_flight--;
+	spin_unlock(&server->req_lock);
+	wake_up(&server->request_q);
+}
+
+static void
+cifs_set_credits(struct TCP_Server_Info *server, const int val)
+{
+	spin_lock(&server->req_lock);
+	server->credits = val;
+	server->oplocks = val > 1 ? enable_oplocks : false;
+	spin_unlock(&server->req_lock);
+}
+
+static int *
+cifs_get_credits_field(struct TCP_Server_Info *server, const int optype)
+{
+	return &server->credits;
+}
+
+static unsigned int
+cifs_get_credits(struct mid_q_entry *mid)
+{
+	return 1;
+}
+
+/*
+ * Find a free multiplex id (SMB mid). Otherwise there could be
+ * mid collisions which might cause problems, demultiplexing the
+ * wrong response to this request. Multiplex ids could collide if
+ * one of a series requests takes much longer than the others, or
+ * if a very large number of long lived requests (byte range
+ * locks or FindNotify requests) are pending. No more than
+ * 64K-1 requests can be outstanding at one time. If no
+ * mids are available, return zero. A future optimization
+ * could make the combination of mids and uid the key we use
+ * to demultiplex on (rather than mid alone).
+ * In addition to the above check, the cifs demultiplex
+ * code already used the command code as a secondary
+ * check of the frame and if signing is negotiated the
+ * response would be discarded if the mid were the same
+ * but the signature was wrong. Since the mid is not put in the
+ * pending queue until later (when it is about to be dispatched)
+ * we do have to limit the number of outstanding requests
+ * to somewhat less than 64K-1 although it is hard to imagine
+ * so many threads being in the vfs at one time.
+ */
+static __u64
+cifs_get_next_mid(struct TCP_Server_Info *server)
+{
+	__u64 mid = 0;
+	__u16 last_mid, cur_mid;
+	bool collision;
+
+	spin_lock(&GlobalMid_Lock);
+
+	/* mid is 16 bit only for CIFS/SMB */
+	cur_mid = (__u16)((server->CurrentMid) & 0xffff);
+	/* we do not want to loop forever */
+	last_mid = cur_mid;
+	cur_mid++;
+
+	/*
+	 * This nested loop looks more expensive than it is.
+	 * In practice the list of pending requests is short,
+	 * fewer than 50, and the mids are likely to be unique
+	 * on the first pass through the loop unless some request
+	 * takes longer than the 64 thousand requests before it
+	 * (and it would also have to have been a request that
+	 * did not time out).
+	 */
+	while (cur_mid != last_mid) {
+		struct mid_q_entry *mid_entry;
+		unsigned int num_mids;
+
+		collision = false;
+		if (cur_mid == 0)
+			cur_mid++;
+
+		num_mids = 0;
+		list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
+			++num_mids;
+			if (mid_entry->mid == cur_mid &&
+			    mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
+				/* This mid is in use, try a different one */
+				collision = true;
+				break;
+			}
+		}
+
+		/*
+		 * if we have more than 32k mids in the list, then something
+		 * is very wrong. Possibly a local user is trying to DoS the
+		 * box by issuing long-running calls and SIGKILL'ing them. If
+		 * we get to 2^16 mids then we're in big trouble as this
+		 * function could loop forever.
+		 *
+		 * Go ahead and assign out the mid in this situation, but force
+		 * an eventual reconnect to clean out the pending_mid_q.
+		 */
+		if (num_mids > 32768)
+			server->tcpStatus = CifsNeedReconnect;
+
+		if (!collision) {
+			mid = (__u64)cur_mid;
+			server->CurrentMid = mid;
+			break;
+		}
+		cur_mid++;
+	}
+	spin_unlock(&GlobalMid_Lock);
+	return mid;
+}
+
+/*
+	return codes:
+		0	not a transact2, or all data present
+		>0	transact2 with that much data missing
+		-EINVAL	invalid transact2
+ */
+static int
+check2ndT2(char *buf)
+{
+	struct smb_hdr *pSMB = (struct smb_hdr *)buf;
+	struct smb_t2_rsp *pSMBt;
+	int remaining;
+	__u16 total_data_size, data_in_this_rsp;
+
+	if (pSMB->Command != SMB_COM_TRANSACTION2)
+		return 0;
+
+	/* check for plausible wct, bcc and t2 data and parm sizes */
+	/* check for parm and data offset going beyond end of smb */
+	if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
+		cifs_dbg(FYI, "invalid transact2 word count\n");
+		return -EINVAL;
+	}
+
+	pSMBt = (struct smb_t2_rsp *)pSMB;
+
+	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+	data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+
+	if (total_data_size == data_in_this_rsp)
+		return 0;
+	else if (total_data_size < data_in_this_rsp) {
+		cifs_dbg(FYI, "total data %d smaller than data in frame %d\n",
+			 total_data_size, data_in_this_rsp);
+		return -EINVAL;
+	}
+
+	remaining = total_data_size - data_in_this_rsp;
+
+	cifs_dbg(FYI, "missing %d bytes from transact2, check next response\n",
+		 remaining);
+	if (total_data_size > CIFSMaxBufSize) {
+		cifs_dbg(VFS, "TotalDataSize %d is over maximum buffer %d\n",
+			 total_data_size, CIFSMaxBufSize);
+		return -EINVAL;
+	}
+	return remaining;
+}
+
+static int
+coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
+{
+	struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)second_buf;
+	struct smb_t2_rsp *pSMBt  = (struct smb_t2_rsp *)target_hdr;
+	char *data_area_of_tgt;
+	char *data_area_of_src;
+	int remaining;
+	unsigned int byte_count, total_in_tgt;
+	__u16 tgt_total_cnt, src_total_cnt, total_in_src;
+
+	src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount);
+	tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+
+	if (tgt_total_cnt != src_total_cnt)
+		cifs_dbg(FYI, "total data count of primary and secondary t2 differ source=%hu target=%hu\n",
+			 src_total_cnt, tgt_total_cnt);
+
+	total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+
+	remaining = tgt_total_cnt - total_in_tgt;
+
+	if (remaining < 0) {
+		cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+			 tgt_total_cnt, total_in_tgt);
+		return -EPROTO;
+	}
+
+	if (remaining == 0) {
+		/* nothing to do, ignore */
+		cifs_dbg(FYI, "no more data remains\n");
+		return 0;
+	}
+
+	total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
+	if (remaining < total_in_src)
+		cifs_dbg(FYI, "transact2 2nd response contains too much data\n");
+
+	/* find end of first SMB data area */
+	data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
+				get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
+
+	/* validate target area */
+	data_area_of_src = (char *)&pSMBs->hdr.Protocol +
+				get_unaligned_le16(&pSMBs->t2_rsp.DataOffset);
+
+	data_area_of_tgt += total_in_tgt;
+
+	total_in_tgt += total_in_src;
+	/* is the result too big for the field? */
+	if (total_in_tgt > USHRT_MAX) {
+		cifs_dbg(FYI, "coalesced DataCount too large (%u)\n",
+			 total_in_tgt);
+		return -EPROTO;
+	}
+	put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
+
+	/* fix up the BCC */
+	byte_count = get_bcc(target_hdr);
+	byte_count += total_in_src;
+	/* is the result too big for the field? */
+	if (byte_count > USHRT_MAX) {
+		cifs_dbg(FYI, "coalesced BCC too large (%u)\n", byte_count);
+		return -EPROTO;
+	}
+	put_bcc(byte_count, target_hdr);
+
+	byte_count = be32_to_cpu(target_hdr->smb_buf_length);
+	byte_count += total_in_src;
+	/* don't allow buffer to overflow */
+	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+		cifs_dbg(FYI, "coalesced BCC exceeds buffer size (%u)\n",
+			 byte_count);
+		return -ENOBUFS;
+	}
+	target_hdr->smb_buf_length = cpu_to_be32(byte_count);
+
+	/* copy second buffer into end of first buffer */
+	memcpy(data_area_of_tgt, data_area_of_src, total_in_src);
+
+	if (remaining != total_in_src) {
+		/* more responses to go */
+		cifs_dbg(FYI, "waiting for more secondary responses\n");
+		return 1;
+	}
+
+	/* we are done */
+	cifs_dbg(FYI, "found the last secondary response\n");
+	return 0;
+}
+
+static void
+cifs_downgrade_oplock(struct TCP_Server_Info *server,
+			struct cifsInodeInfo *cinode, bool set_level2)
+{
+	if (set_level2)
+		cifs_set_oplock_level(cinode, OPLOCK_READ);
+	else
+		cifs_set_oplock_level(cinode, 0);
+}
+
+static bool
+cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+		  char *buf, int malformed)
+{
+	if (malformed)
+		return false;
+	if (check2ndT2(buf) <= 0)
+		return false;
+	mid->multiRsp = true;
+	if (mid->resp_buf) {
+		/* merge response - fix up 1st*/
+		malformed = coalesce_t2(buf, mid->resp_buf);
+		if (malformed > 0)
+			return true;
+		/* All parts received or packet is malformed. */
+		mid->multiEnd = true;
+		dequeue_mid(mid, malformed);
+		return true;
+	}
+	if (!server->large_buf) {
+		/*FIXME: switch to already allocated largebuf?*/
+		cifs_dbg(VFS, "1st trans2 resp needs bigbuf\n");
+	} else {
+		/* Have first buffer */
+		mid->resp_buf = buf;
+		mid->large_buf = true;
+		server->bigbuf = NULL;
+	}
+	return true;
+}
+
+static bool
+cifs_need_neg(struct TCP_Server_Info *server)
+{
+	return server->maxBuf == 0;
+}
+
+static int
+cifs_negotiate(const unsigned int xid, struct cifs_ses *ses)
+{
+	int rc;
+	rc = CIFSSMBNegotiate(xid, ses);
+	if (rc == -EAGAIN) {
+		/* retry only once on 1st time connection */
+		set_credits(ses->server, 1);
+		rc = CIFSSMBNegotiate(xid, ses);
+		if (rc == -EAGAIN)
+			rc = -EHOSTDOWN;
+	}
+	return rc;
+}
+
+static unsigned int
+cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+	__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	unsigned int wsize;
+
+	/* start with specified wsize, or default */
+	if (volume_info->wsize)
+		wsize = volume_info->wsize;
+	else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+		wsize = CIFS_DEFAULT_IOSIZE;
+	else
+		wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
+
+	/* can server support 24-bit write sizes? (via UNIX extensions) */
+	if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+		wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE);
+
+	/*
+	 * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set?
+	 * Limit it to max buffer offered by the server, minus the size of the
+	 * WRITEX header, not including the 4 byte RFC1001 length.
+	 */
+	if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
+	    (!(server->capabilities & CAP_UNIX) && server->sign))
+		wsize = min_t(unsigned int, wsize,
+				server->maxBuf - sizeof(WRITE_REQ) + 4);
+
+	/* hard limit of CIFS_MAX_WSIZE */
+	wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
+
+	return wsize;
+}
+
+static unsigned int
+cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+	__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	unsigned int rsize, defsize;
+
+	/*
+	 * Set default value...
+	 *
+	 * HACK alert! Ancient servers have very small buffers. Even though
+	 * MS-CIFS indicates that servers are only limited by the client's
+	 * bufsize for reads, testing against win98se shows that it throws
+	 * INVALID_PARAMETER errors if you try to request too large a read.
+	 * OS/2 just sends back short reads.
+	 *
+	 * If the server doesn't advertise CAP_LARGE_READ_X, then assume that
+	 * it can't handle a read request larger than its MaxBufferSize either.
+	 */
+	if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
+		defsize = CIFS_DEFAULT_IOSIZE;
+	else if (server->capabilities & CAP_LARGE_READ_X)
+		defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
+	else
+		defsize = server->maxBuf - sizeof(READ_RSP);
+
+	rsize = volume_info->rsize ? volume_info->rsize : defsize;
+
+	/*
+	 * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
+	 * the client's MaxBufferSize.
+	 */
+	if (!(server->capabilities & CAP_LARGE_READ_X))
+		rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
+
+	/* hard limit of CIFS_MAX_RSIZE */
+	rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
+
+	return rsize;
+}
+
+static void
+cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	CIFSSMBQFSDeviceInfo(xid, tcon);
+	CIFSSMBQFSAttributeInfo(xid, tcon);
+}
+
+static int
+cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
+			struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+	int rc;
+	FILE_ALL_INFO *file_info;
+
+	file_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (file_info == NULL)
+		return -ENOMEM;
+
+	rc = CIFSSMBQPathInfo(xid, tcon, full_path, file_info,
+			      0 /* not legacy */, cifs_sb->local_nls,
+			      cifs_remap(cifs_sb));
+
+	if (rc == -EOPNOTSUPP || rc == -EINVAL)
+		rc = SMBQueryInformation(xid, tcon, full_path, file_info,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+	kfree(file_info);
+	return rc;
+}
+
+static int
+cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+		     struct cifs_sb_info *cifs_sb, const char *full_path,
+		     FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink)
+{
+	int rc;
+
+	*symlink = false;
+
+	/* could do find first instead but this returns more info */
+	rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */,
+			      cifs_sb->local_nls, cifs_remap(cifs_sb));
+	/*
+	 * BB optimize code so we do not make the above call when server claims
+	 * no NT SMB support and the above call failed at least once - set flag
+	 * in tcon or mount.
+	 */
+	if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
+		rc = SMBQueryInformation(xid, tcon, full_path, data,
+					 cifs_sb->local_nls,
+					 cifs_remap(cifs_sb));
+		*adjustTZ = true;
+	}
+
+	if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) {
+		int tmprc;
+		int oplock = 0;
+		struct cifs_fid fid;
+		struct cifs_open_parms oparms;
+
+		oparms.tcon = tcon;
+		oparms.cifs_sb = cifs_sb;
+		oparms.desired_access = FILE_READ_ATTRIBUTES;
+		oparms.create_options = 0;
+		oparms.disposition = FILE_OPEN;
+		oparms.path = full_path;
+		oparms.fid = &fid;
+		oparms.reconnect = false;
+
+		/* Need to check if this is a symbolic link or not */
+		tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
+		if (tmprc == -EOPNOTSUPP)
+			*symlink = true;
+		else if (tmprc == 0)
+			CIFSSMBClose(xid, tcon, fid.netfid);
+	}
+
+	return rc;
+}
+
+static int
+cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
+		  struct cifs_sb_info *cifs_sb, const char *full_path,
+		  u64 *uniqueid, FILE_ALL_INFO *data)
+{
+	/*
+	 * We can not use the IndexNumber field by default from Windows or
+	 * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA
+	 * CIFS spec claims that this value is unique within the scope of a
+	 * share, and the windows docs hint that it's actually unique
+	 * per-machine.
+	 *
+	 * There may be higher info levels that work but are there Windows
+	 * server or network appliances for which IndexNumber field is not
+	 * guaranteed unique?
+	 */
+	return CIFSGetSrvInodeNumber(xid, tcon, full_path, uniqueid,
+				     cifs_sb->local_nls,
+				     cifs_remap(cifs_sb));
+}
+
+static int
+cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
+		     struct cifs_fid *fid, FILE_ALL_INFO *data)
+{
+	return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
+}
+
+static void
+cifs_clear_stats(struct cifs_tcon *tcon)
+{
+#ifdef CONFIG_CIFS_STATS
+	atomic_set(&tcon->stats.cifs_stats.num_writes, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_reads, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_flushes, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_oplock_brks, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_opens, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_posixopens, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_posixmkdirs, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_closes, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_deletes, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_mkdirs, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_rmdirs, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_renames, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_t2renames, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_ffirst, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_fnext, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_fclose, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_hardlinks, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_symlinks, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_locks, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_acl_get, 0);
+	atomic_set(&tcon->stats.cifs_stats.num_acl_set, 0);
+#endif
+}
+
+static void
+cifs_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
+{
+#ifdef CONFIG_CIFS_STATS
+	seq_printf(m, " Oplocks breaks: %d",
+		   atomic_read(&tcon->stats.cifs_stats.num_oplock_brks));
+	seq_printf(m, "\nReads:  %d Bytes: %llu",
+		   atomic_read(&tcon->stats.cifs_stats.num_reads),
+		   (long long)(tcon->bytes_read));
+	seq_printf(m, "\nWrites: %d Bytes: %llu",
+		   atomic_read(&tcon->stats.cifs_stats.num_writes),
+		   (long long)(tcon->bytes_written));
+	seq_printf(m, "\nFlushes: %d",
+		   atomic_read(&tcon->stats.cifs_stats.num_flushes));
+	seq_printf(m, "\nLocks: %d HardLinks: %d Symlinks: %d",
+		   atomic_read(&tcon->stats.cifs_stats.num_locks),
+		   atomic_read(&tcon->stats.cifs_stats.num_hardlinks),
+		   atomic_read(&tcon->stats.cifs_stats.num_symlinks));
+	seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d",
+		   atomic_read(&tcon->stats.cifs_stats.num_opens),
+		   atomic_read(&tcon->stats.cifs_stats.num_closes),
+		   atomic_read(&tcon->stats.cifs_stats.num_deletes));
+	seq_printf(m, "\nPosix Opens: %d Posix Mkdirs: %d",
+		   atomic_read(&tcon->stats.cifs_stats.num_posixopens),
+		   atomic_read(&tcon->stats.cifs_stats.num_posixmkdirs));
+	seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
+		   atomic_read(&tcon->stats.cifs_stats.num_mkdirs),
+		   atomic_read(&tcon->stats.cifs_stats.num_rmdirs));
+	seq_printf(m, "\nRenames: %d T2 Renames %d",
+		   atomic_read(&tcon->stats.cifs_stats.num_renames),
+		   atomic_read(&tcon->stats.cifs_stats.num_t2renames));
+	seq_printf(m, "\nFindFirst: %d FNext %d FClose %d",
+		   atomic_read(&tcon->stats.cifs_stats.num_ffirst),
+		   atomic_read(&tcon->stats.cifs_stats.num_fnext),
+		   atomic_read(&tcon->stats.cifs_stats.num_fclose));
+#endif
+}
+
+static void
+cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
+		   struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
+		   const unsigned int xid)
+{
+	FILE_BASIC_INFO info;
+	struct cifsInodeInfo *cifsInode;
+	u32 dosattrs;
+	int rc;
+
+	memset(&info, 0, sizeof(info));
+	cifsInode = CIFS_I(inode);
+	dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
+	info.Attributes = cpu_to_le32(dosattrs);
+	rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls,
+				cifs_remap(cifs_sb));
+	if (rc == 0)
+		cifsInode->cifsAttrs = dosattrs;
+}
+
+static int
+cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+	       __u32 *oplock, FILE_ALL_INFO *buf)
+{
+	if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
+		return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
+				     oparms->disposition,
+				     oparms->desired_access,
+				     oparms->create_options,
+				     &oparms->fid->netfid, oplock, buf,
+				     oparms->cifs_sb->local_nls,
+				     cifs_remap(oparms->cifs_sb));
+	return CIFS_open(xid, oparms, oplock, buf);
+}
+
+static void
+cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
+{
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	cfile->fid.netfid = fid->netfid;
+	cifs_set_oplock_level(cinode, oplock);
+	cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
+}
+
+static void
+cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon,
+		struct cifs_fid *fid)
+{
+	CIFSSMBClose(xid, tcon, fid->netfid);
+}
+
+static int
+cifs_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
+		struct cifs_fid *fid)
+{
+	return CIFSSMBFlush(xid, tcon, fid->netfid);
+}
+
+static int
+cifs_sync_read(const unsigned int xid, struct cifs_fid *pfid,
+	       struct cifs_io_parms *parms, unsigned int *bytes_read,
+	       char **buf, int *buf_type)
+{
+	parms->netfid = pfid->netfid;
+	return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type);
+}
+
+static int
+cifs_sync_write(const unsigned int xid, struct cifs_fid *pfid,
+		struct cifs_io_parms *parms, unsigned int *written,
+		struct kvec *iov, unsigned long nr_segs)
+{
+
+	parms->netfid = pfid->netfid;
+	return CIFSSMBWrite2(xid, parms, written, iov, nr_segs);
+}
+
+static int
+smb_set_file_info(struct inode *inode, const char *full_path,
+		  FILE_BASIC_INFO *buf, const unsigned int xid)
+{
+	int oplock = 0;
+	int rc;
+	__u32 netpid;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	struct cifsFileInfo *open_file;
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = NULL;
+	struct cifs_tcon *tcon;
+
+	/* if the file is already open for write, just use that fileid */
+	open_file = find_writable_file(cinode, true);
+	if (open_file) {
+		fid.netfid = open_file->fid.netfid;
+		netpid = open_file->pid;
+		tcon = tlink_tcon(open_file->tlink);
+		goto set_via_filehandle;
+	}
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
+				cifs_remap(cifs_sb));
+	if (rc == 0) {
+		cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
+		goto out;
+	} else if (rc != -EOPNOTSUPP && rc != -EINVAL) {
+		goto out;
+	}
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES;
+	oparms.create_options = CREATE_NOT_DIR;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = full_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (rc != 0) {
+		if (rc == -EIO)
+			rc = -EINVAL;
+		goto out;
+	}
+
+	netpid = current->tgid;
+
+set_via_filehandle:
+	rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid);
+	if (!rc)
+		cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
+
+	if (open_file == NULL)
+		CIFSSMBClose(xid, tcon, fid.netfid);
+	else
+		cifsFileInfo_put(open_file);
+out:
+	if (tlink != NULL)
+		cifs_put_tlink(tlink);
+	return rc;
+}
+
+static int
+cifs_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile)
+{
+	return CIFSSMB_set_compression(xid, tcon, cfile->fid.netfid);
+}
+
+static int
+cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
+		     const char *path, struct cifs_sb_info *cifs_sb,
+		     struct cifs_fid *fid, __u16 search_flags,
+		     struct cifs_search_info *srch_inf)
+{
+	return CIFSFindFirst(xid, tcon, path, cifs_sb,
+			     &fid->netfid, search_flags, srch_inf, true);
+}
+
+static int
+cifs_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon,
+		    struct cifs_fid *fid, __u16 search_flags,
+		    struct cifs_search_info *srch_inf)
+{
+	return CIFSFindNext(xid, tcon, fid->netfid, search_flags, srch_inf);
+}
+
+static int
+cifs_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
+	       struct cifs_fid *fid)
+{
+	return CIFSFindClose(xid, tcon, fid->netfid);
+}
+
+static int
+cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
+		     struct cifsInodeInfo *cinode)
+{
+	return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0,
+			   LOCKING_ANDX_OPLOCK_RELEASE, false,
+			   CIFS_CACHE_READ(cinode) ? 1 : 0);
+}
+
+static int
+cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
+	     struct kstatfs *buf)
+{
+	int rc = -EOPNOTSUPP;
+
+	buf->f_type = CIFS_MAGIC_NUMBER;
+
+	/*
+	 * We could add a second check for a QFS Unix capability bit
+	 */
+	if ((tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability)))
+		rc = CIFSSMBQFSPosixInfo(xid, tcon, buf);
+
+	/*
+	 * Only need to call the old QFSInfo if failed on newer one,
+	 * e.g. by OS/2.
+	 **/
+	if (rc && (tcon->ses->capabilities & CAP_NT_SMBS))
+		rc = CIFSSMBQFSInfo(xid, tcon, buf);
+
+	/*
+	 * Some old Windows servers also do not support level 103, retry with
+	 * older level one if old server failed the previous call or we
+	 * bypassed it because we detected that this was an older LANMAN sess
+	 */
+	if (rc)
+		rc = SMBOldQFSInfo(xid, tcon, buf);
+	return rc;
+}
+
+static int
+cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
+	       __u64 length, __u32 type, int lock, int unlock, bool wait)
+{
+	return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->fid.netfid,
+			   current->tgid, length, offset, unlock, lock,
+			   (__u8)type, wait, 0);
+}
+
+static int
+cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon,
+		       const unsigned char *searchName, char **symlinkinfo,
+		       const struct nls_table *nls_codepage)
+{
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	int rc;
+	unsigned int num_referrals = 0;
+	struct dfs_info3_param *referrals = NULL;
+
+	rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage,
+			  &num_referrals, &referrals, 0);
+
+	if (!rc && num_referrals > 0) {
+		*symlinkinfo = kstrndup(referrals->node_name,
+					strlen(referrals->node_name),
+					GFP_KERNEL);
+		if (!*symlinkinfo)
+			rc = -ENOMEM;
+		free_dfs_info_array(referrals, num_referrals);
+	}
+	return rc;
+#else /* No DFS support */
+	return -EREMOTE;
+#endif
+}
+
+static int
+cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+		   const char *full_path, char **target_path,
+		   struct cifs_sb_info *cifs_sb)
+{
+	int rc;
+	int oplock = 0;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+	/* Check for unix extensions */
+	if (cap_unix(tcon->ses)) {
+		rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
+					     cifs_sb->local_nls,
+					     cifs_remap(cifs_sb));
+		if (rc == -EREMOTE)
+			rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
+						    target_path,
+						    cifs_sb->local_nls);
+
+		goto out;
+	}
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.create_options = OPEN_REPARSE_POINT;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = full_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (rc)
+		goto out;
+
+	rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path,
+				 cifs_sb->local_nls);
+	if (rc)
+		goto out_close;
+
+	convert_delimiter(*target_path, '/');
+out_close:
+	CIFSSMBClose(xid, tcon, fid.netfid);
+out:
+	if (!rc)
+		cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+	return rc;
+}
+
+static bool
+cifs_is_read_op(__u32 oplock)
+{
+	return oplock == OPLOCK_READ;
+}
+
+static unsigned int
+cifs_wp_retry_size(struct inode *inode)
+{
+	return CIFS_SB(inode->i_sb)->wsize;
+}
+
+static bool
+cifs_dir_needs_close(struct cifsFileInfo *cfile)
+{
+	return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
+}
+
+struct smb_version_operations smb1_operations = {
+	.send_cancel = send_nt_cancel,
+	.compare_fids = cifs_compare_fids,
+	.setup_request = cifs_setup_request,
+	.setup_async_request = cifs_setup_async_request,
+	.check_receive = cifs_check_receive,
+	.add_credits = cifs_add_credits,
+	.set_credits = cifs_set_credits,
+	.get_credits_field = cifs_get_credits_field,
+	.get_credits = cifs_get_credits,
+	.wait_mtu_credits = cifs_wait_mtu_credits,
+	.get_next_mid = cifs_get_next_mid,
+	.read_data_offset = cifs_read_data_offset,
+	.read_data_length = cifs_read_data_length,
+	.map_error = map_smb_to_linux_error,
+	.find_mid = cifs_find_mid,
+	.check_message = checkSMB,
+	.dump_detail = cifs_dump_detail,
+	.clear_stats = cifs_clear_stats,
+	.print_stats = cifs_print_stats,
+	.is_oplock_break = is_valid_oplock_break,
+	.downgrade_oplock = cifs_downgrade_oplock,
+	.check_trans2 = cifs_check_trans2,
+	.need_neg = cifs_need_neg,
+	.negotiate = cifs_negotiate,
+	.negotiate_wsize = cifs_negotiate_wsize,
+	.negotiate_rsize = cifs_negotiate_rsize,
+	.sess_setup = CIFS_SessSetup,
+	.logoff = CIFSSMBLogoff,
+	.tree_connect = CIFSTCon,
+	.tree_disconnect = CIFSSMBTDis,
+	.get_dfs_refer = CIFSGetDFSRefer,
+	.qfs_tcon = cifs_qfs_tcon,
+	.is_path_accessible = cifs_is_path_accessible,
+	.query_path_info = cifs_query_path_info,
+	.query_file_info = cifs_query_file_info,
+	.get_srv_inum = cifs_get_srv_inum,
+	.set_path_size = CIFSSMBSetEOF,
+	.set_file_size = CIFSSMBSetFileSize,
+	.set_file_info = smb_set_file_info,
+	.set_compression = cifs_set_compression,
+	.echo = CIFSSMBEcho,
+	.mkdir = CIFSSMBMkDir,
+	.mkdir_setinfo = cifs_mkdir_setinfo,
+	.rmdir = CIFSSMBRmDir,
+	.unlink = CIFSSMBDelFile,
+	.rename_pending_delete = cifs_rename_pending_delete,
+	.rename = CIFSSMBRename,
+	.create_hardlink = CIFSCreateHardLink,
+	.query_symlink = cifs_query_symlink,
+	.open = cifs_open_file,
+	.set_fid = cifs_set_fid,
+	.close = cifs_close_file,
+	.flush = cifs_flush_file,
+	.async_readv = cifs_async_readv,
+	.async_writev = cifs_async_writev,
+	.sync_read = cifs_sync_read,
+	.sync_write = cifs_sync_write,
+	.query_dir_first = cifs_query_dir_first,
+	.query_dir_next = cifs_query_dir_next,
+	.close_dir = cifs_close_dir,
+	.calc_smb_size = smbCalcSize,
+	.oplock_response = cifs_oplock_response,
+	.queryfs = cifs_queryfs,
+	.mand_lock = cifs_mand_lock,
+	.mand_unlock_range = cifs_unlock_range,
+	.push_mand_locks = cifs_push_mandatory_locks,
+	.query_mf_symlink = cifs_query_mf_symlink,
+	.create_mf_symlink = cifs_create_mf_symlink,
+	.is_read_op = cifs_is_read_op,
+	.wp_retry_size = cifs_wp_retry_size,
+	.dir_needs_close = cifs_dir_needs_close,
+#ifdef CONFIG_CIFS_XATTR
+	.query_all_EAs = CIFSSMBQAllEAs,
+	.set_EA = CIFSSMBSetEA,
+#endif /* CIFS_XATTR */
+#ifdef CONFIG_CIFS_ACL
+	.get_acl = get_cifs_acl,
+	.get_acl_by_fid = get_cifs_acl_by_fid,
+	.set_acl = set_cifs_acl,
+#endif /* CIFS_ACL */
+};
+
+struct smb_version_values smb1_values = {
+	.version_string = SMB1_VERSION_STRING,
+	.large_lock_type = LOCKING_ANDX_LARGE_FILES,
+	.exclusive_lock_type = 0,
+	.shared_lock_type = LOCKING_ANDX_SHARED_LOCK,
+	.unlock_lock_type = 0,
+	.header_size = sizeof(struct smb_hdr),
+	.max_header_size = MAX_CIFS_HDR_SIZE,
+	.read_rsp_size = sizeof(READ_RSP),
+	.lock_cmd = cpu_to_le16(SMB_COM_LOCKING_ANDX),
+	.cap_unix = CAP_UNIX,
+	.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
+	.cap_large_files = CAP_LARGE_FILES,
+	.signing_enabled = SECMODE_SIGN_ENABLED,
+	.signing_required = SECMODE_SIGN_REQUIRED,
+};
diff --git a/kernel/fs/cifs/smb2file.c b/kernel/fs/cifs/smb2file.c
new file mode 100644
index 000000000..2ab297dae
--- /dev/null
+++ b/kernel/fs/cifs/smb2file.c
@@ -0,0 +1,265 @@
+/*
+ *   fs/cifs/smb2file.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002, 2011
+ *   Author(s): Steve French (sfrench@us.ibm.com),
+ *              Pavel Shilovsky ((pshilovsky@samba.org) 2012
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#include "fscache.h"
+#include "smb2proto.h"
+
+int
+smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+	       __u32 *oplock, FILE_ALL_INFO *buf)
+{
+	int rc;
+	__le16 *smb2_path;
+	struct smb2_file_all_info *smb2_data = NULL;
+	__u8 smb2_oplock[17];
+	struct cifs_fid *fid = oparms->fid;
+
+	smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
+	if (smb2_path == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
+			    GFP_KERNEL);
+	if (smb2_data == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	oparms->desired_access |= FILE_READ_ATTRIBUTES;
+	*smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
+
+	if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
+		memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);
+
+	rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL);
+	if (rc)
+		goto out;
+
+	if (buf) {
+		/* open response does not have IndexNumber field - get it */
+		rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
+				      fid->volatile_fid,
+				      &smb2_data->IndexNumber);
+		if (rc) {
+			/* let get_inode_info disable server inode numbers */
+			smb2_data->IndexNumber = 0;
+			rc = 0;
+		}
+		move_smb2_info_to_cifs(buf, smb2_data);
+	}
+
+	*oplock = *smb2_oplock;
+out:
+	kfree(smb2_data);
+	kfree(smb2_path);
+	return rc;
+}
+
+int
+smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
+		  const unsigned int xid)
+{
+	int rc = 0, stored_rc;
+	unsigned int max_num, num = 0, max_buf;
+	struct smb2_lock_element *buf, *cur;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	struct cifsLockInfo *li, *tmp;
+	__u64 length = 1 + flock->fl_end - flock->fl_start;
+	struct list_head tmp_llist;
+
+	INIT_LIST_HEAD(&tmp_llist);
+
+	/*
+	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
+	 * and check it for zero before using.
+	 */
+	max_buf = tcon->ses->server->maxBuf;
+	if (!max_buf)
+		return -EINVAL;
+
+	max_num = max_buf / sizeof(struct smb2_lock_element);
+	buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	cur = buf;
+
+	down_write(&cinode->lock_sem);
+	list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
+		if (flock->fl_start > li->offset ||
+		    (flock->fl_start + length) <
+		    (li->offset + li->length))
+			continue;
+		if (current->tgid != li->pid)
+			continue;
+		if (cinode->can_cache_brlcks) {
+			/*
+			 * We can cache brlock requests - simply remove a lock
+			 * from the file's list.
+			 */
+			list_del(&li->llist);
+			cifs_del_lock_waiters(li);
+			kfree(li);
+			continue;
+		}
+		cur->Length = cpu_to_le64(li->length);
+		cur->Offset = cpu_to_le64(li->offset);
+		cur->Flags = cpu_to_le32(SMB2_LOCKFLAG_UNLOCK);
+		/*
+		 * We need to save a lock here to let us add it again to the
+		 * file's list if the unlock range request fails on the server.
+		 */
+		list_move(&li->llist, &tmp_llist);
+		if (++num == max_num) {
+			stored_rc = smb2_lockv(xid, tcon,
+					       cfile->fid.persistent_fid,
+					       cfile->fid.volatile_fid,
+					       current->tgid, num, buf);
+			if (stored_rc) {
+				/*
+				 * We failed on the unlock range request - add
+				 * all locks from the tmp list to the head of
+				 * the file's list.
+				 */
+				cifs_move_llist(&tmp_llist,
+						&cfile->llist->locks);
+				rc = stored_rc;
+			} else
+				/*
+				 * The unlock range request succeed - free the
+				 * tmp list.
+				 */
+				cifs_free_llist(&tmp_llist);
+			cur = buf;
+			num = 0;
+		} else
+			cur++;
+	}
+	if (num) {
+		stored_rc = smb2_lockv(xid, tcon, cfile->fid.persistent_fid,
+				       cfile->fid.volatile_fid, current->tgid,
+				       num, buf);
+		if (stored_rc) {
+			cifs_move_llist(&tmp_llist, &cfile->llist->locks);
+			rc = stored_rc;
+		} else
+			cifs_free_llist(&tmp_llist);
+	}
+	up_write(&cinode->lock_sem);
+
+	kfree(buf);
+	return rc;
+}
+
+static int
+smb2_push_mand_fdlocks(struct cifs_fid_locks *fdlocks, const unsigned int xid,
+		       struct smb2_lock_element *buf, unsigned int max_num)
+{
+	int rc = 0, stored_rc;
+	struct cifsFileInfo *cfile = fdlocks->cfile;
+	struct cifsLockInfo *li;
+	unsigned int num = 0;
+	struct smb2_lock_element *cur = buf;
+	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+
+	list_for_each_entry(li, &fdlocks->locks, llist) {
+		cur->Length = cpu_to_le64(li->length);
+		cur->Offset = cpu_to_le64(li->offset);
+		cur->Flags = cpu_to_le32(li->type |
+						SMB2_LOCKFLAG_FAIL_IMMEDIATELY);
+		if (++num == max_num) {
+			stored_rc = smb2_lockv(xid, tcon,
+					       cfile->fid.persistent_fid,
+					       cfile->fid.volatile_fid,
+					       current->tgid, num, buf);
+			if (stored_rc)
+				rc = stored_rc;
+			cur = buf;
+			num = 0;
+		} else
+			cur++;
+	}
+	if (num) {
+		stored_rc = smb2_lockv(xid, tcon,
+				       cfile->fid.persistent_fid,
+				       cfile->fid.volatile_fid,
+				       current->tgid, num, buf);
+		if (stored_rc)
+			rc = stored_rc;
+	}
+
+	return rc;
+}
+
+int
+smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
+{
+	int rc = 0, stored_rc;
+	unsigned int xid;
+	unsigned int max_num, max_buf;
+	struct smb2_lock_element *buf;
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	struct cifs_fid_locks *fdlocks;
+
+	xid = get_xid();
+
+	/*
+	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
+	 * and check it for zero before using.
+	 */
+	max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
+	if (!max_buf) {
+		free_xid(xid);
+		return -EINVAL;
+	}
+
+	max_num = max_buf / sizeof(struct smb2_lock_element);
+	buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
+	if (!buf) {
+		free_xid(xid);
+		return -ENOMEM;
+	}
+
+	list_for_each_entry(fdlocks, &cinode->llist, llist) {
+		stored_rc = smb2_push_mand_fdlocks(fdlocks, xid, buf, max_num);
+		if (stored_rc)
+			rc = stored_rc;
+	}
+
+	kfree(buf);
+	free_xid(xid);
+	return rc;
+}
diff --git a/kernel/fs/cifs/smb2glob.h b/kernel/fs/cifs/smb2glob.h
new file mode 100644
index 000000000..bc0bb9c34
--- /dev/null
+++ b/kernel/fs/cifs/smb2glob.h
@@ -0,0 +1,63 @@
+/*
+ *   fs/cifs/smb2glob.h
+ *
+ *   Definitions for various global variables and structures
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002, 2011
+ *                 Etersoft, 2012
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Jeremy Allison (jra@samba.org)
+ *              Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ */
+#ifndef _SMB2_GLOB_H
+#define _SMB2_GLOB_H
+
+#define SMB2_MAGIC_NUMBER 0xFE534D42
+
+/*
+ *****************************************************************
+ * Constants go here
+ *****************************************************************
+ */
+
+/*
+ * Identifiers for functions that use the open, operation, close pattern
+ * in smb2inode.c:smb2_open_op_close()
+ */
+#define SMB2_OP_SET_DELETE 1
+#define SMB2_OP_SET_INFO 2
+#define SMB2_OP_QUERY_INFO 3
+#define SMB2_OP_QUERY_DIR 4
+#define SMB2_OP_MKDIR 5
+#define SMB2_OP_RENAME 6
+#define SMB2_OP_DELETE 7
+#define SMB2_OP_HARDLINK 8
+#define SMB2_OP_SET_EOF 9
+
+/* Used when constructing chained read requests. */
+#define CHAINED_REQUEST 1
+#define START_OF_CHAIN 2
+#define END_OF_CHAIN 4
+#define RELATED_REQUEST 8
+
+#define SMB2_SIGNATURE_SIZE (16)
+#define SMB2_NTLMV2_SESSKEY_SIZE (16)
+#define SMB2_HMACSHA256_SIZE (32)
+#define SMB2_CMACAES_SIZE (16)
+#define SMB3_SIGNKEY_SIZE (16)
+
+/* Maximum buffer size value we can send with 1 credit */
+#define SMB2_MAX_BUFFER_SIZE 65536
+
+#endif	/* _SMB2_GLOB_H */
diff --git a/kernel/fs/cifs/smb2inode.c b/kernel/fs/cifs/smb2inode.c
new file mode 100644
index 000000000..899bbc86f
--- /dev/null
+++ b/kernel/fs/cifs/smb2inode.c
@@ -0,0 +1,273 @@
+/*
+ *   fs/cifs/smb2inode.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002, 2011
+ *                 Etersoft, 2012
+ *   Author(s): Pavel Shilovsky (pshilovsky@samba.org),
+ *              Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#include "fscache.h"
+#include "smb2glob.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+
+static int
+smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifs_sb_info *cifs_sb, const char *full_path,
+		   __u32 desired_access, __u32 create_disposition,
+		   __u32 create_options, void *data, int command)
+{
+	int rc, tmprc = 0;
+	__le16 *utf16_path;
+	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+	if (!utf16_path)
+		return -ENOMEM;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = desired_access;
+	oparms.disposition = create_disposition;
+	oparms.create_options = create_options;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+	if (rc) {
+		kfree(utf16_path);
+		return rc;
+	}
+
+	switch (command) {
+	case SMB2_OP_DELETE:
+		break;
+	case SMB2_OP_QUERY_INFO:
+		tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+					fid.volatile_fid,
+					(struct smb2_file_all_info *)data);
+		break;
+	case SMB2_OP_MKDIR:
+		/*
+		 * Directories are created through parameters in the
+		 * SMB2_open() call.
+		 */
+		break;
+	case SMB2_OP_RENAME:
+		tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
+				    fid.volatile_fid, (__le16 *)data);
+		break;
+	case SMB2_OP_HARDLINK:
+		tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
+					  fid.volatile_fid, (__le16 *)data);
+		break;
+	case SMB2_OP_SET_EOF:
+		tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
+				     fid.volatile_fid, current->tgid,
+				     (__le64 *)data, false);
+		break;
+	case SMB2_OP_SET_INFO:
+		tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
+				      fid.volatile_fid,
+				      (FILE_BASIC_INFO *)data);
+		break;
+	default:
+		cifs_dbg(VFS, "Invalid command\n");
+		break;
+	}
+
+	rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	if (tmprc)
+		rc = tmprc;
+	kfree(utf16_path);
+	return rc;
+}
+
+void
+move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src)
+{
+	memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src);
+	dst->CurrentByteOffset = src->CurrentByteOffset;
+	dst->Mode = src->Mode;
+	dst->AlignmentRequirement = src->AlignmentRequirement;
+	dst->IndexNumber1 = 0; /* we don't use it */
+}
+
+int
+smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+		     struct cifs_sb_info *cifs_sb, const char *full_path,
+		     FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink)
+{
+	int rc;
+	struct smb2_file_all_info *smb2_data;
+
+	*adjust_tz = false;
+	*symlink = false;
+
+	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
+			    GFP_KERNEL);
+	if (smb2_data == NULL)
+		return -ENOMEM;
+
+	rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
+				FILE_READ_ATTRIBUTES, FILE_OPEN, 0,
+				smb2_data, SMB2_OP_QUERY_INFO);
+	if (rc == -EOPNOTSUPP) {
+		*symlink = true;
+		/* Failed on a symbolic link - query a reparse point info */
+		rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
+					FILE_READ_ATTRIBUTES, FILE_OPEN,
+					OPEN_REPARSE_POINT, smb2_data,
+					SMB2_OP_QUERY_INFO);
+	}
+	if (rc)
+		goto out;
+
+	move_smb2_info_to_cifs(data, smb2_data);
+out:
+	kfree(smb2_data);
+	return rc;
+}
+
+int
+smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+	   struct cifs_sb_info *cifs_sb)
+{
+	return smb2_open_op_close(xid, tcon, cifs_sb, name,
+				  FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+				  CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
+}
+
+void
+smb2_mkdir_setinfo(struct inode *inode, const char *name,
+		   struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
+		   const unsigned int xid)
+{
+	FILE_BASIC_INFO data;
+	struct cifsInodeInfo *cifs_i;
+	u32 dosattrs;
+	int tmprc;
+
+	memset(&data, 0, sizeof(data));
+	cifs_i = CIFS_I(inode);
+	dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
+	data.Attributes = cpu_to_le32(dosattrs);
+	tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
+				   FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+				   CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
+	if (tmprc == 0)
+		cifs_i->cifsAttrs = dosattrs;
+}
+
+int
+smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+	   struct cifs_sb_info *cifs_sb)
+{
+	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+				  CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
+				  NULL, SMB2_OP_DELETE);
+}
+
+int
+smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+	    struct cifs_sb_info *cifs_sb)
+{
+	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+				  CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+				  NULL, SMB2_OP_DELETE);
+}
+
+static int
+smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
+		   const char *from_name, const char *to_name,
+		   struct cifs_sb_info *cifs_sb, __u32 access, int command)
+{
+	__le16 *smb2_to_name = NULL;
+	int rc;
+
+	smb2_to_name = cifs_convert_path_to_utf16(to_name, cifs_sb);
+	if (smb2_to_name == NULL) {
+		rc = -ENOMEM;
+		goto smb2_rename_path;
+	}
+
+	rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
+				FILE_OPEN, 0, smb2_to_name, command);
+smb2_rename_path:
+	kfree(smb2_to_name);
+	return rc;
+}
+
+int
+smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
+		 const char *from_name, const char *to_name,
+		 struct cifs_sb_info *cifs_sb)
+{
+	return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+				  DELETE, SMB2_OP_RENAME);
+}
+
+int
+smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
+		     const char *from_name, const char *to_name,
+		     struct cifs_sb_info *cifs_sb)
+{
+	return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+				  FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK);
+}
+
+int
+smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
+		   const char *full_path, __u64 size,
+		   struct cifs_sb_info *cifs_sb, bool set_alloc)
+{
+	__le64 eof = cpu_to_le64(size);
+	return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
+				  FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
+				  SMB2_OP_SET_EOF);
+}
+
+int
+smb2_set_file_info(struct inode *inode, const char *full_path,
+		   FILE_BASIC_INFO *buf, const unsigned int xid)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
+	int rc;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
+				FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
+				SMB2_OP_SET_INFO);
+	cifs_put_tlink(tlink);
+	return rc;
+}
diff --git a/kernel/fs/cifs/smb2maperror.c b/kernel/fs/cifs/smb2maperror.c
new file mode 100644
index 000000000..8257a5a97
--- /dev/null
+++ b/kernel/fs/cifs/smb2maperror.c
@@ -0,0 +1,2481 @@
+/*
+ *   fs/smb2/smb2maperror.c
+ *
+ *   Functions which do error mapping of SMB2 status codes to POSIX errors
+ *
+ *   Copyright (C) International Business Machines  Corp., 2009
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/errno.h>
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+#include "smb2status.h"
+
+struct status_to_posix_error {
+	__le32 smb2_status;
+	int posix_error;
+	char *status_string;
+};
+
+static const struct status_to_posix_error smb2_error_map_table[] = {
+	{STATUS_SUCCESS, 0, "STATUS_SUCCESS"},
+	{STATUS_WAIT_0,  0, "STATUS_WAIT_0"},
+	{STATUS_WAIT_1, -EIO, "STATUS_WAIT_1"},
+	{STATUS_WAIT_2, -EIO, "STATUS_WAIT_2"},
+	{STATUS_WAIT_3, -EIO, "STATUS_WAIT_3"},
+	{STATUS_WAIT_63, -EIO, "STATUS_WAIT_63"},
+	{STATUS_ABANDONED, -EIO, "STATUS_ABANDONED"},
+	{STATUS_ABANDONED_WAIT_0, -EIO, "STATUS_ABANDONED_WAIT_0"},
+	{STATUS_ABANDONED_WAIT_63, -EIO, "STATUS_ABANDONED_WAIT_63"},
+	{STATUS_USER_APC, -EIO, "STATUS_USER_APC"},
+	{STATUS_KERNEL_APC, -EIO, "STATUS_KERNEL_APC"},
+	{STATUS_ALERTED, -EIO, "STATUS_ALERTED"},
+	{STATUS_TIMEOUT, -ETIMEDOUT, "STATUS_TIMEOUT"},
+	{STATUS_PENDING, -EIO, "STATUS_PENDING"},
+	{STATUS_REPARSE, -EIO, "STATUS_REPARSE"},
+	{STATUS_MORE_ENTRIES, -EIO, "STATUS_MORE_ENTRIES"},
+	{STATUS_NOT_ALL_ASSIGNED, -EIO, "STATUS_NOT_ALL_ASSIGNED"},
+	{STATUS_SOME_NOT_MAPPED, -EIO, "STATUS_SOME_NOT_MAPPED"},
+	{STATUS_OPLOCK_BREAK_IN_PROGRESS, -EIO,
+	"STATUS_OPLOCK_BREAK_IN_PROGRESS"},
+	{STATUS_VOLUME_MOUNTED, -EIO, "STATUS_VOLUME_MOUNTED"},
+	{STATUS_RXACT_COMMITTED, -EIO, "STATUS_RXACT_COMMITTED"},
+	{STATUS_NOTIFY_CLEANUP, -EIO, "STATUS_NOTIFY_CLEANUP"},
+	{STATUS_NOTIFY_ENUM_DIR, -EIO, "STATUS_NOTIFY_ENUM_DIR"},
+	{STATUS_NO_QUOTAS_FOR_ACCOUNT, -EIO, "STATUS_NO_QUOTAS_FOR_ACCOUNT"},
+	{STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED, -EIO,
+	"STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED"},
+	{STATUS_PAGE_FAULT_TRANSITION, -EIO, "STATUS_PAGE_FAULT_TRANSITION"},
+	{STATUS_PAGE_FAULT_DEMAND_ZERO, -EIO, "STATUS_PAGE_FAULT_DEMAND_ZERO"},
+	{STATUS_PAGE_FAULT_COPY_ON_WRITE, -EIO,
+	"STATUS_PAGE_FAULT_COPY_ON_WRITE"},
+	{STATUS_PAGE_FAULT_GUARD_PAGE, -EIO, "STATUS_PAGE_FAULT_GUARD_PAGE"},
+	{STATUS_PAGE_FAULT_PAGING_FILE, -EIO, "STATUS_PAGE_FAULT_PAGING_FILE"},
+	{STATUS_CACHE_PAGE_LOCKED, -EIO, "STATUS_CACHE_PAGE_LOCKED"},
+	{STATUS_CRASH_DUMP, -EIO, "STATUS_CRASH_DUMP"},
+	{STATUS_BUFFER_ALL_ZEROS, -EIO, "STATUS_BUFFER_ALL_ZEROS"},
+	{STATUS_REPARSE_OBJECT, -EIO, "STATUS_REPARSE_OBJECT"},
+	{STATUS_RESOURCE_REQUIREMENTS_CHANGED, -EIO,
+	"STATUS_RESOURCE_REQUIREMENTS_CHANGED"},
+	{STATUS_TRANSLATION_COMPLETE, -EIO, "STATUS_TRANSLATION_COMPLETE"},
+	{STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY, -EIO,
+	"STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY"},
+	{STATUS_NOTHING_TO_TERMINATE, -EIO, "STATUS_NOTHING_TO_TERMINATE"},
+	{STATUS_PROCESS_NOT_IN_JOB, -EIO, "STATUS_PROCESS_NOT_IN_JOB"},
+	{STATUS_PROCESS_IN_JOB, -EIO, "STATUS_PROCESS_IN_JOB"},
+	{STATUS_VOLSNAP_HIBERNATE_READY, -EIO,
+	"STATUS_VOLSNAP_HIBERNATE_READY"},
+	{STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY, -EIO,
+	"STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY"},
+	{STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED, -EIO,
+	"STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED"},
+	{STATUS_INTERRUPT_STILL_CONNECTED, -EIO,
+	"STATUS_INTERRUPT_STILL_CONNECTED"},
+	{STATUS_PROCESS_CLONED, -EIO, "STATUS_PROCESS_CLONED"},
+	{STATUS_FILE_LOCKED_WITH_ONLY_READERS, -EIO,
+	"STATUS_FILE_LOCKED_WITH_ONLY_READERS"},
+	{STATUS_FILE_LOCKED_WITH_WRITERS, -EIO,
+	"STATUS_FILE_LOCKED_WITH_WRITERS"},
+	{STATUS_RESOURCEMANAGER_READ_ONLY, -EROFS,
+	"STATUS_RESOURCEMANAGER_READ_ONLY"},
+	{STATUS_WAIT_FOR_OPLOCK, -EIO, "STATUS_WAIT_FOR_OPLOCK"},
+	{DBG_EXCEPTION_HANDLED, -EIO, "DBG_EXCEPTION_HANDLED"},
+	{DBG_CONTINUE, -EIO, "DBG_CONTINUE"},
+	{STATUS_FLT_IO_COMPLETE, -EIO, "STATUS_FLT_IO_COMPLETE"},
+	{STATUS_OBJECT_NAME_EXISTS, -EIO, "STATUS_OBJECT_NAME_EXISTS"},
+	{STATUS_THREAD_WAS_SUSPENDED, -EIO, "STATUS_THREAD_WAS_SUSPENDED"},
+	{STATUS_WORKING_SET_LIMIT_RANGE, -EIO,
+	"STATUS_WORKING_SET_LIMIT_RANGE"},
+	{STATUS_IMAGE_NOT_AT_BASE, -EIO, "STATUS_IMAGE_NOT_AT_BASE"},
+	{STATUS_RXACT_STATE_CREATED, -EIO, "STATUS_RXACT_STATE_CREATED"},
+	{STATUS_SEGMENT_NOTIFICATION, -EIO, "STATUS_SEGMENT_NOTIFICATION"},
+	{STATUS_LOCAL_USER_SESSION_KEY, -EIO, "STATUS_LOCAL_USER_SESSION_KEY"},
+	{STATUS_BAD_CURRENT_DIRECTORY, -EIO, "STATUS_BAD_CURRENT_DIRECTORY"},
+	{STATUS_SERIAL_MORE_WRITES, -EIO, "STATUS_SERIAL_MORE_WRITES"},
+	{STATUS_REGISTRY_RECOVERED, -EIO, "STATUS_REGISTRY_RECOVERED"},
+	{STATUS_FT_READ_RECOVERY_FROM_BACKUP, -EIO,
+	"STATUS_FT_READ_RECOVERY_FROM_BACKUP"},
+	{STATUS_FT_WRITE_RECOVERY, -EIO, "STATUS_FT_WRITE_RECOVERY"},
+	{STATUS_SERIAL_COUNTER_TIMEOUT, -ETIMEDOUT,
+	"STATUS_SERIAL_COUNTER_TIMEOUT"},
+	{STATUS_NULL_LM_PASSWORD, -EIO, "STATUS_NULL_LM_PASSWORD"},
+	{STATUS_IMAGE_MACHINE_TYPE_MISMATCH, -EIO,
+	"STATUS_IMAGE_MACHINE_TYPE_MISMATCH"},
+	{STATUS_RECEIVE_PARTIAL, -EIO, "STATUS_RECEIVE_PARTIAL"},
+	{STATUS_RECEIVE_EXPEDITED, -EIO, "STATUS_RECEIVE_EXPEDITED"},
+	{STATUS_RECEIVE_PARTIAL_EXPEDITED, -EIO,
+	"STATUS_RECEIVE_PARTIAL_EXPEDITED"},
+	{STATUS_EVENT_DONE, -EIO, "STATUS_EVENT_DONE"},
+	{STATUS_EVENT_PENDING, -EIO, "STATUS_EVENT_PENDING"},
+	{STATUS_CHECKING_FILE_SYSTEM, -EIO, "STATUS_CHECKING_FILE_SYSTEM"},
+	{STATUS_FATAL_APP_EXIT, -EIO, "STATUS_FATAL_APP_EXIT"},
+	{STATUS_PREDEFINED_HANDLE, -EIO, "STATUS_PREDEFINED_HANDLE"},
+	{STATUS_WAS_UNLOCKED, -EIO, "STATUS_WAS_UNLOCKED"},
+	{STATUS_SERVICE_NOTIFICATION, -EIO, "STATUS_SERVICE_NOTIFICATION"},
+	{STATUS_WAS_LOCKED, -EIO, "STATUS_WAS_LOCKED"},
+	{STATUS_LOG_HARD_ERROR, -EIO, "STATUS_LOG_HARD_ERROR"},
+	{STATUS_ALREADY_WIN32, -EIO, "STATUS_ALREADY_WIN32"},
+	{STATUS_WX86_UNSIMULATE, -EIO, "STATUS_WX86_UNSIMULATE"},
+	{STATUS_WX86_CONTINUE, -EIO, "STATUS_WX86_CONTINUE"},
+	{STATUS_WX86_SINGLE_STEP, -EIO, "STATUS_WX86_SINGLE_STEP"},
+	{STATUS_WX86_BREAKPOINT, -EIO, "STATUS_WX86_BREAKPOINT"},
+	{STATUS_WX86_EXCEPTION_CONTINUE, -EIO,
+	"STATUS_WX86_EXCEPTION_CONTINUE"},
+	{STATUS_WX86_EXCEPTION_LASTCHANCE, -EIO,
+	"STATUS_WX86_EXCEPTION_LASTCHANCE"},
+	{STATUS_WX86_EXCEPTION_CHAIN, -EIO, "STATUS_WX86_EXCEPTION_CHAIN"},
+	{STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE, -EIO,
+	"STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE"},
+	{STATUS_NO_YIELD_PERFORMED, -EIO, "STATUS_NO_YIELD_PERFORMED"},
+	{STATUS_TIMER_RESUME_IGNORED, -EIO, "STATUS_TIMER_RESUME_IGNORED"},
+	{STATUS_ARBITRATION_UNHANDLED, -EIO, "STATUS_ARBITRATION_UNHANDLED"},
+	{STATUS_CARDBUS_NOT_SUPPORTED, -ENOSYS, "STATUS_CARDBUS_NOT_SUPPORTED"},
+	{STATUS_WX86_CREATEWX86TIB, -EIO, "STATUS_WX86_CREATEWX86TIB"},
+	{STATUS_MP_PROCESSOR_MISMATCH, -EIO, "STATUS_MP_PROCESSOR_MISMATCH"},
+	{STATUS_HIBERNATED, -EIO, "STATUS_HIBERNATED"},
+	{STATUS_RESUME_HIBERNATION, -EIO, "STATUS_RESUME_HIBERNATION"},
+	{STATUS_FIRMWARE_UPDATED, -EIO, "STATUS_FIRMWARE_UPDATED"},
+	{STATUS_DRIVERS_LEAKING_LOCKED_PAGES, -EIO,
+	"STATUS_DRIVERS_LEAKING_LOCKED_PAGES"},
+	{STATUS_MESSAGE_RETRIEVED, -EIO, "STATUS_MESSAGE_RETRIEVED"},
+	{STATUS_SYSTEM_POWERSTATE_TRANSITION, -EIO,
+	"STATUS_SYSTEM_POWERSTATE_TRANSITION"},
+	{STATUS_ALPC_CHECK_COMPLETION_LIST, -EIO,
+	"STATUS_ALPC_CHECK_COMPLETION_LIST"},
+	{STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION, -EIO,
+	"STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION"},
+	{STATUS_ACCESS_AUDIT_BY_POLICY, -EIO, "STATUS_ACCESS_AUDIT_BY_POLICY"},
+	{STATUS_ABANDON_HIBERFILE, -EIO, "STATUS_ABANDON_HIBERFILE"},
+	{STATUS_BIZRULES_NOT_ENABLED, -EIO, "STATUS_BIZRULES_NOT_ENABLED"},
+	{STATUS_WAKE_SYSTEM, -EIO, "STATUS_WAKE_SYSTEM"},
+	{STATUS_DS_SHUTTING_DOWN, -EIO, "STATUS_DS_SHUTTING_DOWN"},
+	{DBG_REPLY_LATER, -EIO, "DBG_REPLY_LATER"},
+	{DBG_UNABLE_TO_PROVIDE_HANDLE, -EIO, "DBG_UNABLE_TO_PROVIDE_HANDLE"},
+	{DBG_TERMINATE_THREAD, -EIO, "DBG_TERMINATE_THREAD"},
+	{DBG_TERMINATE_PROCESS, -EIO, "DBG_TERMINATE_PROCESS"},
+	{DBG_CONTROL_C, -EIO, "DBG_CONTROL_C"},
+	{DBG_PRINTEXCEPTION_C, -EIO, "DBG_PRINTEXCEPTION_C"},
+	{DBG_RIPEXCEPTION, -EIO, "DBG_RIPEXCEPTION"},
+	{DBG_CONTROL_BREAK, -EIO, "DBG_CONTROL_BREAK"},
+	{DBG_COMMAND_EXCEPTION, -EIO, "DBG_COMMAND_EXCEPTION"},
+	{RPC_NT_UUID_LOCAL_ONLY, -EIO, "RPC_NT_UUID_LOCAL_ONLY"},
+	{RPC_NT_SEND_INCOMPLETE, -EIO, "RPC_NT_SEND_INCOMPLETE"},
+	{STATUS_CTX_CDM_CONNECT, -EIO, "STATUS_CTX_CDM_CONNECT"},
+	{STATUS_CTX_CDM_DISCONNECT, -EIO, "STATUS_CTX_CDM_DISCONNECT"},
+	{STATUS_SXS_RELEASE_ACTIVATION_CONTEXT, -EIO,
+	"STATUS_SXS_RELEASE_ACTIVATION_CONTEXT"},
+	{STATUS_RECOVERY_NOT_NEEDED, -EIO, "STATUS_RECOVERY_NOT_NEEDED"},
+	{STATUS_RM_ALREADY_STARTED, -EIO, "STATUS_RM_ALREADY_STARTED"},
+	{STATUS_LOG_NO_RESTART, -EIO, "STATUS_LOG_NO_RESTART"},
+	{STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST, -EIO,
+	"STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST"},
+	{STATUS_GRAPHICS_PARTIAL_DATA_POPULATED, -EIO,
+	"STATUS_GRAPHICS_PARTIAL_DATA_POPULATED"},
+	{STATUS_GRAPHICS_DRIVER_MISMATCH, -EIO,
+	"STATUS_GRAPHICS_DRIVER_MISMATCH"},
+	{STATUS_GRAPHICS_MODE_NOT_PINNED, -EIO,
+	"STATUS_GRAPHICS_MODE_NOT_PINNED"},
+	{STATUS_GRAPHICS_NO_PREFERRED_MODE, -EIO,
+	"STATUS_GRAPHICS_NO_PREFERRED_MODE"},
+	{STATUS_GRAPHICS_DATASET_IS_EMPTY, -EIO,
+	"STATUS_GRAPHICS_DATASET_IS_EMPTY"},
+	{STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET, -EIO,
+	"STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET"},
+	{STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED, -EIO,
+	"STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED"},
+	{STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS, -EIO,
+	"STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS"},
+	{STATUS_GRAPHICS_LEADLINK_START_DEFERRED, -EIO,
+	"STATUS_GRAPHICS_LEADLINK_START_DEFERRED"},
+	{STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY, -EIO,
+	"STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY"},
+	{STATUS_GRAPHICS_START_DEFERRED, -EIO,
+	"STATUS_GRAPHICS_START_DEFERRED"},
+	{STATUS_NDIS_INDICATION_REQUIRED, -EIO,
+	"STATUS_NDIS_INDICATION_REQUIRED"},
+	{STATUS_GUARD_PAGE_VIOLATION, -EIO, "STATUS_GUARD_PAGE_VIOLATION"},
+	{STATUS_DATATYPE_MISALIGNMENT, -EIO, "STATUS_DATATYPE_MISALIGNMENT"},
+	{STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
+	{STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
+	{STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
+	{STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
+	{STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
+	{STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
+	{STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
+	{STATUS_GUID_SUBSTITUTION_MADE, -EIO, "STATUS_GUID_SUBSTITUTION_MADE"},
+	{STATUS_PARTIAL_COPY, -EIO, "STATUS_PARTIAL_COPY"},
+	{STATUS_DEVICE_PAPER_EMPTY, -EIO, "STATUS_DEVICE_PAPER_EMPTY"},
+	{STATUS_DEVICE_POWERED_OFF, -EIO, "STATUS_DEVICE_POWERED_OFF"},
+	{STATUS_DEVICE_OFF_LINE, -EIO, "STATUS_DEVICE_OFF_LINE"},
+	{STATUS_DEVICE_BUSY, -EBUSY, "STATUS_DEVICE_BUSY"},
+	{STATUS_NO_MORE_EAS, -EIO, "STATUS_NO_MORE_EAS"},
+	{STATUS_INVALID_EA_NAME, -EINVAL, "STATUS_INVALID_EA_NAME"},
+	{STATUS_EA_LIST_INCONSISTENT, -EIO, "STATUS_EA_LIST_INCONSISTENT"},
+	{STATUS_INVALID_EA_FLAG, -EINVAL, "STATUS_INVALID_EA_FLAG"},
+	{STATUS_VERIFY_REQUIRED, -EIO, "STATUS_VERIFY_REQUIRED"},
+	{STATUS_EXTRANEOUS_INFORMATION, -EIO, "STATUS_EXTRANEOUS_INFORMATION"},
+	{STATUS_RXACT_COMMIT_NECESSARY, -EIO, "STATUS_RXACT_COMMIT_NECESSARY"},
+	{STATUS_NO_MORE_ENTRIES, -EIO, "STATUS_NO_MORE_ENTRIES"},
+	{STATUS_FILEMARK_DETECTED, -EIO, "STATUS_FILEMARK_DETECTED"},
+	{STATUS_MEDIA_CHANGED, -EIO, "STATUS_MEDIA_CHANGED"},
+	{STATUS_BUS_RESET, -EIO, "STATUS_BUS_RESET"},
+	{STATUS_END_OF_MEDIA, -EIO, "STATUS_END_OF_MEDIA"},
+	{STATUS_BEGINNING_OF_MEDIA, -EIO, "STATUS_BEGINNING_OF_MEDIA"},
+	{STATUS_MEDIA_CHECK, -EIO, "STATUS_MEDIA_CHECK"},
+	{STATUS_SETMARK_DETECTED, -EIO, "STATUS_SETMARK_DETECTED"},
+	{STATUS_NO_DATA_DETECTED, -EIO, "STATUS_NO_DATA_DETECTED"},
+	{STATUS_REDIRECTOR_HAS_OPEN_HANDLES, -EIO,
+	"STATUS_REDIRECTOR_HAS_OPEN_HANDLES"},
+	{STATUS_SERVER_HAS_OPEN_HANDLES, -EIO,
+	"STATUS_SERVER_HAS_OPEN_HANDLES"},
+	{STATUS_ALREADY_DISCONNECTED, -EIO, "STATUS_ALREADY_DISCONNECTED"},
+	{STATUS_LONGJUMP, -EIO, "STATUS_LONGJUMP"},
+	{STATUS_CLEANER_CARTRIDGE_INSTALLED, -EIO,
+	"STATUS_CLEANER_CARTRIDGE_INSTALLED"},
+	{STATUS_PLUGPLAY_QUERY_VETOED, -EIO, "STATUS_PLUGPLAY_QUERY_VETOED"},
+	{STATUS_UNWIND_CONSOLIDATE, -EIO, "STATUS_UNWIND_CONSOLIDATE"},
+	{STATUS_REGISTRY_HIVE_RECOVERED, -EIO,
+	"STATUS_REGISTRY_HIVE_RECOVERED"},
+	{STATUS_DLL_MIGHT_BE_INSECURE, -EIO, "STATUS_DLL_MIGHT_BE_INSECURE"},
+	{STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
+	"STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
+	{STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
+	{STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
+	"STATUS_REPARSE_NOT_HANDLED"},
+	{STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
+	"STATUS_DEVICE_REQUIRES_CLEANING"},
+	{STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
+	{STATUS_DATA_LOST_REPAIR, -EIO, "STATUS_DATA_LOST_REPAIR"},
+	{DBG_EXCEPTION_NOT_HANDLED, -EIO, "DBG_EXCEPTION_NOT_HANDLED"},
+	{STATUS_CLUSTER_NODE_ALREADY_UP, -EIO,
+	"STATUS_CLUSTER_NODE_ALREADY_UP"},
+	{STATUS_CLUSTER_NODE_ALREADY_DOWN, -EIO,
+	"STATUS_CLUSTER_NODE_ALREADY_DOWN"},
+	{STATUS_CLUSTER_NETWORK_ALREADY_ONLINE, -EIO,
+	"STATUS_CLUSTER_NETWORK_ALREADY_ONLINE"},
+	{STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE, -EIO,
+	"STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE"},
+	{STATUS_CLUSTER_NODE_ALREADY_MEMBER, -EIO,
+	"STATUS_CLUSTER_NODE_ALREADY_MEMBER"},
+	{STATUS_COULD_NOT_RESIZE_LOG, -EIO, "STATUS_COULD_NOT_RESIZE_LOG"},
+	{STATUS_NO_TXF_METADATA, -EIO, "STATUS_NO_TXF_METADATA"},
+	{STATUS_CANT_RECOVER_WITH_HANDLE_OPEN, -EIO,
+	"STATUS_CANT_RECOVER_WITH_HANDLE_OPEN"},
+	{STATUS_TXF_METADATA_ALREADY_PRESENT, -EIO,
+	"STATUS_TXF_METADATA_ALREADY_PRESENT"},
+	{STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET, -EIO,
+	"STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET"},
+	{STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED, -EIO,
+	"STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED"},
+	{STATUS_FLT_BUFFER_TOO_SMALL, -ENOBUFS, "STATUS_FLT_BUFFER_TOO_SMALL"},
+	{STATUS_FVE_PARTIAL_METADATA, -EIO, "STATUS_FVE_PARTIAL_METADATA"},
+	{STATUS_UNSUCCESSFUL, -EIO, "STATUS_UNSUCCESSFUL"},
+	{STATUS_NOT_IMPLEMENTED, -ENOSYS, "STATUS_NOT_IMPLEMENTED"},
+	{STATUS_INVALID_INFO_CLASS, -EIO, "STATUS_INVALID_INFO_CLASS"},
+	{STATUS_INFO_LENGTH_MISMATCH, -EIO, "STATUS_INFO_LENGTH_MISMATCH"},
+	{STATUS_ACCESS_VIOLATION, -EACCES, "STATUS_ACCESS_VIOLATION"},
+	{STATUS_IN_PAGE_ERROR, -EFAULT, "STATUS_IN_PAGE_ERROR"},
+	{STATUS_PAGEFILE_QUOTA, -EDQUOT, "STATUS_PAGEFILE_QUOTA"},
+	{STATUS_INVALID_HANDLE, -EBADF, "STATUS_INVALID_HANDLE"},
+	{STATUS_BAD_INITIAL_STACK, -EIO, "STATUS_BAD_INITIAL_STACK"},
+	{STATUS_BAD_INITIAL_PC, -EIO, "STATUS_BAD_INITIAL_PC"},
+	{STATUS_INVALID_CID, -EIO, "STATUS_INVALID_CID"},
+	{STATUS_TIMER_NOT_CANCELED, -EIO, "STATUS_TIMER_NOT_CANCELED"},
+	{STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"},
+	{STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"},
+	{STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"},
+	{STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"},
+	{STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"},
+	{STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"},
+	{STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"},
+	{STATUS_UNRECOGNIZED_MEDIA, -EIO, "STATUS_UNRECOGNIZED_MEDIA"},
+	{STATUS_NONEXISTENT_SECTOR, -EIO, "STATUS_NONEXISTENT_SECTOR"},
+	{STATUS_MORE_PROCESSING_REQUIRED, -EIO,
+	"STATUS_MORE_PROCESSING_REQUIRED"},
+	{STATUS_NO_MEMORY, -EREMOTEIO, "STATUS_NO_MEMORY"},
+	{STATUS_CONFLICTING_ADDRESSES, -EADDRINUSE,
+	"STATUS_CONFLICTING_ADDRESSES"},
+	{STATUS_NOT_MAPPED_VIEW, -EIO, "STATUS_NOT_MAPPED_VIEW"},
+	{STATUS_UNABLE_TO_FREE_VM, -EIO, "STATUS_UNABLE_TO_FREE_VM"},
+	{STATUS_UNABLE_TO_DELETE_SECTION, -EIO,
+	"STATUS_UNABLE_TO_DELETE_SECTION"},
+	{STATUS_INVALID_SYSTEM_SERVICE, -EIO, "STATUS_INVALID_SYSTEM_SERVICE"},
+	{STATUS_ILLEGAL_INSTRUCTION, -EIO, "STATUS_ILLEGAL_INSTRUCTION"},
+	{STATUS_INVALID_LOCK_SEQUENCE, -EIO, "STATUS_INVALID_LOCK_SEQUENCE"},
+	{STATUS_INVALID_VIEW_SIZE, -EIO, "STATUS_INVALID_VIEW_SIZE"},
+	{STATUS_INVALID_FILE_FOR_SECTION, -EIO,
+	"STATUS_INVALID_FILE_FOR_SECTION"},
+	{STATUS_ALREADY_COMMITTED, -EIO, "STATUS_ALREADY_COMMITTED"},
+	{STATUS_ACCESS_DENIED, -EACCES, "STATUS_ACCESS_DENIED"},
+	{STATUS_BUFFER_TOO_SMALL, -EIO, "STATUS_BUFFER_TOO_SMALL"},
+	{STATUS_OBJECT_TYPE_MISMATCH, -EIO, "STATUS_OBJECT_TYPE_MISMATCH"},
+	{STATUS_NONCONTINUABLE_EXCEPTION, -EIO,
+	"STATUS_NONCONTINUABLE_EXCEPTION"},
+	{STATUS_INVALID_DISPOSITION, -EIO, "STATUS_INVALID_DISPOSITION"},
+	{STATUS_UNWIND, -EIO, "STATUS_UNWIND"},
+	{STATUS_BAD_STACK, -EIO, "STATUS_BAD_STACK"},
+	{STATUS_INVALID_UNWIND_TARGET, -EIO, "STATUS_INVALID_UNWIND_TARGET"},
+	{STATUS_NOT_LOCKED, -EIO, "STATUS_NOT_LOCKED"},
+	{STATUS_PARITY_ERROR, -EIO, "STATUS_PARITY_ERROR"},
+	{STATUS_UNABLE_TO_DECOMMIT_VM, -EIO, "STATUS_UNABLE_TO_DECOMMIT_VM"},
+	{STATUS_NOT_COMMITTED, -EIO, "STATUS_NOT_COMMITTED"},
+	{STATUS_INVALID_PORT_ATTRIBUTES, -EIO,
+	"STATUS_INVALID_PORT_ATTRIBUTES"},
+	{STATUS_PORT_MESSAGE_TOO_LONG, -EIO, "STATUS_PORT_MESSAGE_TOO_LONG"},
+	{STATUS_INVALID_PARAMETER_MIX, -EINVAL, "STATUS_INVALID_PARAMETER_MIX"},
+	{STATUS_INVALID_QUOTA_LOWER, -EIO, "STATUS_INVALID_QUOTA_LOWER"},
+	{STATUS_DISK_CORRUPT_ERROR, -EIO, "STATUS_DISK_CORRUPT_ERROR"},
+	{STATUS_OBJECT_NAME_INVALID, -ENOENT, "STATUS_OBJECT_NAME_INVALID"},
+	{STATUS_OBJECT_NAME_NOT_FOUND, -ENOENT, "STATUS_OBJECT_NAME_NOT_FOUND"},
+	{STATUS_OBJECT_NAME_COLLISION, -EEXIST, "STATUS_OBJECT_NAME_COLLISION"},
+	{STATUS_PORT_DISCONNECTED, -EIO, "STATUS_PORT_DISCONNECTED"},
+	{STATUS_DEVICE_ALREADY_ATTACHED, -EIO,
+	"STATUS_DEVICE_ALREADY_ATTACHED"},
+	{STATUS_OBJECT_PATH_INVALID, -ENOTDIR, "STATUS_OBJECT_PATH_INVALID"},
+	{STATUS_OBJECT_PATH_NOT_FOUND, -ENOENT, "STATUS_OBJECT_PATH_NOT_FOUND"},
+	{STATUS_OBJECT_PATH_SYNTAX_BAD, -EIO, "STATUS_OBJECT_PATH_SYNTAX_BAD"},
+	{STATUS_DATA_OVERRUN, -EIO, "STATUS_DATA_OVERRUN"},
+	{STATUS_DATA_LATE_ERROR, -EIO, "STATUS_DATA_LATE_ERROR"},
+	{STATUS_DATA_ERROR, -EIO, "STATUS_DATA_ERROR"},
+	{STATUS_CRC_ERROR, -EIO, "STATUS_CRC_ERROR"},
+	{STATUS_SECTION_TOO_BIG, -EIO, "STATUS_SECTION_TOO_BIG"},
+	{STATUS_PORT_CONNECTION_REFUSED, -ECONNREFUSED,
+	"STATUS_PORT_CONNECTION_REFUSED"},
+	{STATUS_INVALID_PORT_HANDLE, -EIO, "STATUS_INVALID_PORT_HANDLE"},
+	{STATUS_SHARING_VIOLATION, -EBUSY, "STATUS_SHARING_VIOLATION"},
+	{STATUS_QUOTA_EXCEEDED, -EDQUOT, "STATUS_QUOTA_EXCEEDED"},
+	{STATUS_INVALID_PAGE_PROTECTION, -EIO,
+	"STATUS_INVALID_PAGE_PROTECTION"},
+	{STATUS_MUTANT_NOT_OWNED, -EIO, "STATUS_MUTANT_NOT_OWNED"},
+	{STATUS_SEMAPHORE_LIMIT_EXCEEDED, -EIO,
+	"STATUS_SEMAPHORE_LIMIT_EXCEEDED"},
+	{STATUS_PORT_ALREADY_SET, -EIO, "STATUS_PORT_ALREADY_SET"},
+	{STATUS_SECTION_NOT_IMAGE, -EIO, "STATUS_SECTION_NOT_IMAGE"},
+	{STATUS_SUSPEND_COUNT_EXCEEDED, -EIO, "STATUS_SUSPEND_COUNT_EXCEEDED"},
+	{STATUS_THREAD_IS_TERMINATING, -EIO, "STATUS_THREAD_IS_TERMINATING"},
+	{STATUS_BAD_WORKING_SET_LIMIT, -EIO, "STATUS_BAD_WORKING_SET_LIMIT"},
+	{STATUS_INCOMPATIBLE_FILE_MAP, -EIO, "STATUS_INCOMPATIBLE_FILE_MAP"},
+	{STATUS_SECTION_PROTECTION, -EIO, "STATUS_SECTION_PROTECTION"},
+	{STATUS_EAS_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_EAS_NOT_SUPPORTED"},
+	{STATUS_EA_TOO_LARGE, -EIO, "STATUS_EA_TOO_LARGE"},
+	{STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"},
+	{STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"},
+	{STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"},
+	{STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"},
+	{STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"},
+	{STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"},
+	{STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS,
+	"STATUS_CTL_FILE_NOT_SUPPORTED"},
+	{STATUS_UNKNOWN_REVISION, -EIO, "STATUS_UNKNOWN_REVISION"},
+	{STATUS_REVISION_MISMATCH, -EIO, "STATUS_REVISION_MISMATCH"},
+	{STATUS_INVALID_OWNER, -EIO, "STATUS_INVALID_OWNER"},
+	{STATUS_INVALID_PRIMARY_GROUP, -EIO, "STATUS_INVALID_PRIMARY_GROUP"},
+	{STATUS_NO_IMPERSONATION_TOKEN, -EIO, "STATUS_NO_IMPERSONATION_TOKEN"},
+	{STATUS_CANT_DISABLE_MANDATORY, -EIO, "STATUS_CANT_DISABLE_MANDATORY"},
+	{STATUS_NO_LOGON_SERVERS, -EIO, "STATUS_NO_LOGON_SERVERS"},
+	{STATUS_NO_SUCH_LOGON_SESSION, -EIO, "STATUS_NO_SUCH_LOGON_SESSION"},
+	{STATUS_NO_SUCH_PRIVILEGE, -EIO, "STATUS_NO_SUCH_PRIVILEGE"},
+	{STATUS_PRIVILEGE_NOT_HELD, -EIO, "STATUS_PRIVILEGE_NOT_HELD"},
+	{STATUS_INVALID_ACCOUNT_NAME, -EIO, "STATUS_INVALID_ACCOUNT_NAME"},
+	{STATUS_USER_EXISTS, -EIO, "STATUS_USER_EXISTS"},
+	{STATUS_NO_SUCH_USER, -EIO, "STATUS_NO_SUCH_USER"},
+	{STATUS_GROUP_EXISTS, -EIO, "STATUS_GROUP_EXISTS"},
+	{STATUS_NO_SUCH_GROUP, -EIO, "STATUS_NO_SUCH_GROUP"},
+	{STATUS_MEMBER_IN_GROUP, -EIO, "STATUS_MEMBER_IN_GROUP"},
+	{STATUS_MEMBER_NOT_IN_GROUP, -EIO, "STATUS_MEMBER_NOT_IN_GROUP"},
+	{STATUS_LAST_ADMIN, -EIO, "STATUS_LAST_ADMIN"},
+	{STATUS_WRONG_PASSWORD, -EACCES, "STATUS_WRONG_PASSWORD"},
+	{STATUS_ILL_FORMED_PASSWORD, -EINVAL, "STATUS_ILL_FORMED_PASSWORD"},
+	{STATUS_PASSWORD_RESTRICTION, -EACCES, "STATUS_PASSWORD_RESTRICTION"},
+	{STATUS_LOGON_FAILURE, -EACCES, "STATUS_LOGON_FAILURE"},
+	{STATUS_ACCOUNT_RESTRICTION, -EACCES, "STATUS_ACCOUNT_RESTRICTION"},
+	{STATUS_INVALID_LOGON_HOURS, -EACCES, "STATUS_INVALID_LOGON_HOURS"},
+	{STATUS_INVALID_WORKSTATION, -EACCES, "STATUS_INVALID_WORKSTATION"},
+	{STATUS_PASSWORD_EXPIRED, -EKEYEXPIRED, "STATUS_PASSWORD_EXPIRED"},
+	{STATUS_ACCOUNT_DISABLED, -EKEYREVOKED, "STATUS_ACCOUNT_DISABLED"},
+	{STATUS_NONE_MAPPED, -EIO, "STATUS_NONE_MAPPED"},
+	{STATUS_TOO_MANY_LUIDS_REQUESTED, -EIO,
+	"STATUS_TOO_MANY_LUIDS_REQUESTED"},
+	{STATUS_LUIDS_EXHAUSTED, -EIO, "STATUS_LUIDS_EXHAUSTED"},
+	{STATUS_INVALID_SUB_AUTHORITY, -EIO, "STATUS_INVALID_SUB_AUTHORITY"},
+	{STATUS_INVALID_ACL, -EIO, "STATUS_INVALID_ACL"},
+	{STATUS_INVALID_SID, -EIO, "STATUS_INVALID_SID"},
+	{STATUS_INVALID_SECURITY_DESCR, -EIO, "STATUS_INVALID_SECURITY_DESCR"},
+	{STATUS_PROCEDURE_NOT_FOUND, -EIO, "STATUS_PROCEDURE_NOT_FOUND"},
+	{STATUS_INVALID_IMAGE_FORMAT, -EIO, "STATUS_INVALID_IMAGE_FORMAT"},
+	{STATUS_NO_TOKEN, -EIO, "STATUS_NO_TOKEN"},
+	{STATUS_BAD_INHERITANCE_ACL, -EIO, "STATUS_BAD_INHERITANCE_ACL"},
+	{STATUS_RANGE_NOT_LOCKED, -EIO, "STATUS_RANGE_NOT_LOCKED"},
+	{STATUS_DISK_FULL, -ENOSPC, "STATUS_DISK_FULL"},
+	{STATUS_SERVER_DISABLED, -EIO, "STATUS_SERVER_DISABLED"},
+	{STATUS_SERVER_NOT_DISABLED, -EIO, "STATUS_SERVER_NOT_DISABLED"},
+	{STATUS_TOO_MANY_GUIDS_REQUESTED, -EIO,
+	"STATUS_TOO_MANY_GUIDS_REQUESTED"},
+	{STATUS_GUIDS_EXHAUSTED, -EIO, "STATUS_GUIDS_EXHAUSTED"},
+	{STATUS_INVALID_ID_AUTHORITY, -EIO, "STATUS_INVALID_ID_AUTHORITY"},
+	{STATUS_AGENTS_EXHAUSTED, -EIO, "STATUS_AGENTS_EXHAUSTED"},
+	{STATUS_INVALID_VOLUME_LABEL, -EIO, "STATUS_INVALID_VOLUME_LABEL"},
+	{STATUS_SECTION_NOT_EXTENDED, -EIO, "STATUS_SECTION_NOT_EXTENDED"},
+	{STATUS_NOT_MAPPED_DATA, -EIO, "STATUS_NOT_MAPPED_DATA"},
+	{STATUS_RESOURCE_DATA_NOT_FOUND, -EIO,
+	"STATUS_RESOURCE_DATA_NOT_FOUND"},
+	{STATUS_RESOURCE_TYPE_NOT_FOUND, -EIO,
+	"STATUS_RESOURCE_TYPE_NOT_FOUND"},
+	{STATUS_RESOURCE_NAME_NOT_FOUND, -EIO,
+	"STATUS_RESOURCE_NAME_NOT_FOUND"},
+	{STATUS_ARRAY_BOUNDS_EXCEEDED, -EIO, "STATUS_ARRAY_BOUNDS_EXCEEDED"},
+	{STATUS_FLOAT_DENORMAL_OPERAND, -EIO, "STATUS_FLOAT_DENORMAL_OPERAND"},
+	{STATUS_FLOAT_DIVIDE_BY_ZERO, -EIO, "STATUS_FLOAT_DIVIDE_BY_ZERO"},
+	{STATUS_FLOAT_INEXACT_RESULT, -EIO, "STATUS_FLOAT_INEXACT_RESULT"},
+	{STATUS_FLOAT_INVALID_OPERATION, -EIO,
+	"STATUS_FLOAT_INVALID_OPERATION"},
+	{STATUS_FLOAT_OVERFLOW, -EIO, "STATUS_FLOAT_OVERFLOW"},
+	{STATUS_FLOAT_STACK_CHECK, -EIO, "STATUS_FLOAT_STACK_CHECK"},
+	{STATUS_FLOAT_UNDERFLOW, -EIO, "STATUS_FLOAT_UNDERFLOW"},
+	{STATUS_INTEGER_DIVIDE_BY_ZERO, -EIO, "STATUS_INTEGER_DIVIDE_BY_ZERO"},
+	{STATUS_INTEGER_OVERFLOW, -EIO, "STATUS_INTEGER_OVERFLOW"},
+	{STATUS_PRIVILEGED_INSTRUCTION, -EIO, "STATUS_PRIVILEGED_INSTRUCTION"},
+	{STATUS_TOO_MANY_PAGING_FILES, -EIO, "STATUS_TOO_MANY_PAGING_FILES"},
+	{STATUS_FILE_INVALID, -EIO, "STATUS_FILE_INVALID"},
+	{STATUS_ALLOTTED_SPACE_EXCEEDED, -EIO,
+	"STATUS_ALLOTTED_SPACE_EXCEEDED"},
+	{STATUS_INSUFFICIENT_RESOURCES, -EREMOTEIO,
+				"STATUS_INSUFFICIENT_RESOURCES"},
+	{STATUS_DFS_EXIT_PATH_FOUND, -EIO, "STATUS_DFS_EXIT_PATH_FOUND"},
+	{STATUS_DEVICE_DATA_ERROR, -EIO, "STATUS_DEVICE_DATA_ERROR"},
+	{STATUS_DEVICE_NOT_CONNECTED, -EIO, "STATUS_DEVICE_NOT_CONNECTED"},
+	{STATUS_DEVICE_POWER_FAILURE, -EIO, "STATUS_DEVICE_POWER_FAILURE"},
+	{STATUS_FREE_VM_NOT_AT_BASE, -EIO, "STATUS_FREE_VM_NOT_AT_BASE"},
+	{STATUS_MEMORY_NOT_ALLOCATED, -EFAULT, "STATUS_MEMORY_NOT_ALLOCATED"},
+	{STATUS_WORKING_SET_QUOTA, -EIO, "STATUS_WORKING_SET_QUOTA"},
+	{STATUS_MEDIA_WRITE_PROTECTED, -EROFS, "STATUS_MEDIA_WRITE_PROTECTED"},
+	{STATUS_DEVICE_NOT_READY, -EIO, "STATUS_DEVICE_NOT_READY"},
+	{STATUS_INVALID_GROUP_ATTRIBUTES, -EIO,
+	"STATUS_INVALID_GROUP_ATTRIBUTES"},
+	{STATUS_BAD_IMPERSONATION_LEVEL, -EIO,
+	"STATUS_BAD_IMPERSONATION_LEVEL"},
+	{STATUS_CANT_OPEN_ANONYMOUS, -EIO, "STATUS_CANT_OPEN_ANONYMOUS"},
+	{STATUS_BAD_VALIDATION_CLASS, -EIO, "STATUS_BAD_VALIDATION_CLASS"},
+	{STATUS_BAD_TOKEN_TYPE, -EIO, "STATUS_BAD_TOKEN_TYPE"},
+	{STATUS_BAD_MASTER_BOOT_RECORD, -EIO, "STATUS_BAD_MASTER_BOOT_RECORD"},
+	{STATUS_INSTRUCTION_MISALIGNMENT, -EIO,
+	"STATUS_INSTRUCTION_MISALIGNMENT"},
+	{STATUS_INSTANCE_NOT_AVAILABLE, -EIO, "STATUS_INSTANCE_NOT_AVAILABLE"},
+	{STATUS_PIPE_NOT_AVAILABLE, -EIO, "STATUS_PIPE_NOT_AVAILABLE"},
+	{STATUS_INVALID_PIPE_STATE, -EIO, "STATUS_INVALID_PIPE_STATE"},
+	{STATUS_PIPE_BUSY, -EBUSY, "STATUS_PIPE_BUSY"},
+	{STATUS_ILLEGAL_FUNCTION, -EIO, "STATUS_ILLEGAL_FUNCTION"},
+	{STATUS_PIPE_DISCONNECTED, -EPIPE, "STATUS_PIPE_DISCONNECTED"},
+	{STATUS_PIPE_CLOSING, -EIO, "STATUS_PIPE_CLOSING"},
+	{STATUS_PIPE_CONNECTED, -EIO, "STATUS_PIPE_CONNECTED"},
+	{STATUS_PIPE_LISTENING, -EIO, "STATUS_PIPE_LISTENING"},
+	{STATUS_INVALID_READ_MODE, -EIO, "STATUS_INVALID_READ_MODE"},
+	{STATUS_IO_TIMEOUT, -ETIMEDOUT, "STATUS_IO_TIMEOUT"},
+	{STATUS_FILE_FORCED_CLOSED, -EIO, "STATUS_FILE_FORCED_CLOSED"},
+	{STATUS_PROFILING_NOT_STARTED, -EIO, "STATUS_PROFILING_NOT_STARTED"},
+	{STATUS_PROFILING_NOT_STOPPED, -EIO, "STATUS_PROFILING_NOT_STOPPED"},
+	{STATUS_COULD_NOT_INTERPRET, -EIO, "STATUS_COULD_NOT_INTERPRET"},
+	{STATUS_FILE_IS_A_DIRECTORY, -EISDIR, "STATUS_FILE_IS_A_DIRECTORY"},
+	{STATUS_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_NOT_SUPPORTED"},
+	{STATUS_REMOTE_NOT_LISTENING, -EHOSTDOWN,
+	"STATUS_REMOTE_NOT_LISTENING"},
+	{STATUS_DUPLICATE_NAME, -ENOTUNIQ, "STATUS_DUPLICATE_NAME"},
+	{STATUS_BAD_NETWORK_PATH, -EINVAL, "STATUS_BAD_NETWORK_PATH"},
+	{STATUS_NETWORK_BUSY, -EBUSY, "STATUS_NETWORK_BUSY"},
+	{STATUS_DEVICE_DOES_NOT_EXIST, -ENODEV, "STATUS_DEVICE_DOES_NOT_EXIST"},
+	{STATUS_TOO_MANY_COMMANDS, -EIO, "STATUS_TOO_MANY_COMMANDS"},
+	{STATUS_ADAPTER_HARDWARE_ERROR, -EIO, "STATUS_ADAPTER_HARDWARE_ERROR"},
+	{STATUS_INVALID_NETWORK_RESPONSE, -EIO,
+	"STATUS_INVALID_NETWORK_RESPONSE"},
+	{STATUS_UNEXPECTED_NETWORK_ERROR, -EIO,
+	"STATUS_UNEXPECTED_NETWORK_ERROR"},
+	{STATUS_BAD_REMOTE_ADAPTER, -EIO, "STATUS_BAD_REMOTE_ADAPTER"},
+	{STATUS_PRINT_QUEUE_FULL, -EIO, "STATUS_PRINT_QUEUE_FULL"},
+	{STATUS_NO_SPOOL_SPACE, -EIO, "STATUS_NO_SPOOL_SPACE"},
+	{STATUS_PRINT_CANCELLED, -EIO, "STATUS_PRINT_CANCELLED"},
+	{STATUS_NETWORK_NAME_DELETED, -EIO, "STATUS_NETWORK_NAME_DELETED"},
+	{STATUS_NETWORK_ACCESS_DENIED, -EACCES, "STATUS_NETWORK_ACCESS_DENIED"},
+	{STATUS_BAD_DEVICE_TYPE, -EIO, "STATUS_BAD_DEVICE_TYPE"},
+	{STATUS_BAD_NETWORK_NAME, -ENOENT, "STATUS_BAD_NETWORK_NAME"},
+	{STATUS_TOO_MANY_NAMES, -EIO, "STATUS_TOO_MANY_NAMES"},
+	{STATUS_TOO_MANY_SESSIONS, -EIO, "STATUS_TOO_MANY_SESSIONS"},
+	{STATUS_SHARING_PAUSED, -EIO, "STATUS_SHARING_PAUSED"},
+	{STATUS_REQUEST_NOT_ACCEPTED, -EIO, "STATUS_REQUEST_NOT_ACCEPTED"},
+	{STATUS_REDIRECTOR_PAUSED, -EIO, "STATUS_REDIRECTOR_PAUSED"},
+	{STATUS_NET_WRITE_FAULT, -EIO, "STATUS_NET_WRITE_FAULT"},
+	{STATUS_PROFILING_AT_LIMIT, -EIO, "STATUS_PROFILING_AT_LIMIT"},
+	{STATUS_NOT_SAME_DEVICE, -EXDEV, "STATUS_NOT_SAME_DEVICE"},
+	{STATUS_FILE_RENAMED, -EIO, "STATUS_FILE_RENAMED"},
+	{STATUS_VIRTUAL_CIRCUIT_CLOSED, -EIO, "STATUS_VIRTUAL_CIRCUIT_CLOSED"},
+	{STATUS_NO_SECURITY_ON_OBJECT, -EIO, "STATUS_NO_SECURITY_ON_OBJECT"},
+	{STATUS_CANT_WAIT, -EIO, "STATUS_CANT_WAIT"},
+	{STATUS_PIPE_EMPTY, -EIO, "STATUS_PIPE_EMPTY"},
+	{STATUS_CANT_ACCESS_DOMAIN_INFO, -EIO,
+	"STATUS_CANT_ACCESS_DOMAIN_INFO"},
+	{STATUS_CANT_TERMINATE_SELF, -EIO, "STATUS_CANT_TERMINATE_SELF"},
+	{STATUS_INVALID_SERVER_STATE, -EIO, "STATUS_INVALID_SERVER_STATE"},
+	{STATUS_INVALID_DOMAIN_STATE, -EIO, "STATUS_INVALID_DOMAIN_STATE"},
+	{STATUS_INVALID_DOMAIN_ROLE, -EIO, "STATUS_INVALID_DOMAIN_ROLE"},
+	{STATUS_NO_SUCH_DOMAIN, -EIO, "STATUS_NO_SUCH_DOMAIN"},
+	{STATUS_DOMAIN_EXISTS, -EIO, "STATUS_DOMAIN_EXISTS"},
+	{STATUS_DOMAIN_LIMIT_EXCEEDED, -EIO, "STATUS_DOMAIN_LIMIT_EXCEEDED"},
+	{STATUS_OPLOCK_NOT_GRANTED, -EIO, "STATUS_OPLOCK_NOT_GRANTED"},
+	{STATUS_INVALID_OPLOCK_PROTOCOL, -EIO,
+	"STATUS_INVALID_OPLOCK_PROTOCOL"},
+	{STATUS_INTERNAL_DB_CORRUPTION, -EIO, "STATUS_INTERNAL_DB_CORRUPTION"},
+	{STATUS_INTERNAL_ERROR, -EIO, "STATUS_INTERNAL_ERROR"},
+	{STATUS_GENERIC_NOT_MAPPED, -EIO, "STATUS_GENERIC_NOT_MAPPED"},
+	{STATUS_BAD_DESCRIPTOR_FORMAT, -EIO, "STATUS_BAD_DESCRIPTOR_FORMAT"},
+	{STATUS_INVALID_USER_BUFFER, -EIO, "STATUS_INVALID_USER_BUFFER"},
+	{STATUS_UNEXPECTED_IO_ERROR, -EIO, "STATUS_UNEXPECTED_IO_ERROR"},
+	{STATUS_UNEXPECTED_MM_CREATE_ERR, -EIO,
+	"STATUS_UNEXPECTED_MM_CREATE_ERR"},
+	{STATUS_UNEXPECTED_MM_MAP_ERROR, -EIO,
+	"STATUS_UNEXPECTED_MM_MAP_ERROR"},
+	{STATUS_UNEXPECTED_MM_EXTEND_ERR, -EIO,
+	"STATUS_UNEXPECTED_MM_EXTEND_ERR"},
+	{STATUS_NOT_LOGON_PROCESS, -EIO, "STATUS_NOT_LOGON_PROCESS"},
+	{STATUS_LOGON_SESSION_EXISTS, -EIO, "STATUS_LOGON_SESSION_EXISTS"},
+	{STATUS_INVALID_PARAMETER_1, -EINVAL, "STATUS_INVALID_PARAMETER_1"},
+	{STATUS_INVALID_PARAMETER_2, -EINVAL, "STATUS_INVALID_PARAMETER_2"},
+	{STATUS_INVALID_PARAMETER_3, -EINVAL, "STATUS_INVALID_PARAMETER_3"},
+	{STATUS_INVALID_PARAMETER_4, -EINVAL, "STATUS_INVALID_PARAMETER_4"},
+	{STATUS_INVALID_PARAMETER_5, -EINVAL, "STATUS_INVALID_PARAMETER_5"},
+	{STATUS_INVALID_PARAMETER_6, -EINVAL, "STATUS_INVALID_PARAMETER_6"},
+	{STATUS_INVALID_PARAMETER_7, -EINVAL, "STATUS_INVALID_PARAMETER_7"},
+	{STATUS_INVALID_PARAMETER_8, -EINVAL, "STATUS_INVALID_PARAMETER_8"},
+	{STATUS_INVALID_PARAMETER_9, -EINVAL, "STATUS_INVALID_PARAMETER_9"},
+	{STATUS_INVALID_PARAMETER_10, -EINVAL, "STATUS_INVALID_PARAMETER_10"},
+	{STATUS_INVALID_PARAMETER_11, -EINVAL, "STATUS_INVALID_PARAMETER_11"},
+	{STATUS_INVALID_PARAMETER_12, -EINVAL, "STATUS_INVALID_PARAMETER_12"},
+	{STATUS_REDIRECTOR_NOT_STARTED, -EIO, "STATUS_REDIRECTOR_NOT_STARTED"},
+	{STATUS_REDIRECTOR_STARTED, -EIO, "STATUS_REDIRECTOR_STARTED"},
+	{STATUS_STACK_OVERFLOW, -EIO, "STATUS_STACK_OVERFLOW"},
+	{STATUS_NO_SUCH_PACKAGE, -EIO, "STATUS_NO_SUCH_PACKAGE"},
+	{STATUS_BAD_FUNCTION_TABLE, -EIO, "STATUS_BAD_FUNCTION_TABLE"},
+	{STATUS_VARIABLE_NOT_FOUND, -EIO, "STATUS_VARIABLE_NOT_FOUND"},
+	{STATUS_DIRECTORY_NOT_EMPTY, -ENOTEMPTY, "STATUS_DIRECTORY_NOT_EMPTY"},
+	{STATUS_FILE_CORRUPT_ERROR, -EIO, "STATUS_FILE_CORRUPT_ERROR"},
+	{STATUS_NOT_A_DIRECTORY, -ENOTDIR, "STATUS_NOT_A_DIRECTORY"},
+	{STATUS_BAD_LOGON_SESSION_STATE, -EIO,
+	"STATUS_BAD_LOGON_SESSION_STATE"},
+	{STATUS_LOGON_SESSION_COLLISION, -EIO,
+	"STATUS_LOGON_SESSION_COLLISION"},
+	{STATUS_NAME_TOO_LONG, -ENAMETOOLONG, "STATUS_NAME_TOO_LONG"},
+	{STATUS_FILES_OPEN, -EIO, "STATUS_FILES_OPEN"},
+	{STATUS_CONNECTION_IN_USE, -EIO, "STATUS_CONNECTION_IN_USE"},
+	{STATUS_MESSAGE_NOT_FOUND, -EIO, "STATUS_MESSAGE_NOT_FOUND"},
+	{STATUS_PROCESS_IS_TERMINATING, -EIO, "STATUS_PROCESS_IS_TERMINATING"},
+	{STATUS_INVALID_LOGON_TYPE, -EIO, "STATUS_INVALID_LOGON_TYPE"},
+	{STATUS_NO_GUID_TRANSLATION, -EIO, "STATUS_NO_GUID_TRANSLATION"},
+	{STATUS_CANNOT_IMPERSONATE, -EIO, "STATUS_CANNOT_IMPERSONATE"},
+	{STATUS_IMAGE_ALREADY_LOADED, -EIO, "STATUS_IMAGE_ALREADY_LOADED"},
+	{STATUS_ABIOS_NOT_PRESENT, -EIO, "STATUS_ABIOS_NOT_PRESENT"},
+	{STATUS_ABIOS_LID_NOT_EXIST, -EIO, "STATUS_ABIOS_LID_NOT_EXIST"},
+	{STATUS_ABIOS_LID_ALREADY_OWNED, -EIO,
+	"STATUS_ABIOS_LID_ALREADY_OWNED"},
+	{STATUS_ABIOS_NOT_LID_OWNER, -EIO, "STATUS_ABIOS_NOT_LID_OWNER"},
+	{STATUS_ABIOS_INVALID_COMMAND, -EIO, "STATUS_ABIOS_INVALID_COMMAND"},
+	{STATUS_ABIOS_INVALID_LID, -EIO, "STATUS_ABIOS_INVALID_LID"},
+	{STATUS_ABIOS_SELECTOR_NOT_AVAILABLE, -EIO,
+	"STATUS_ABIOS_SELECTOR_NOT_AVAILABLE"},
+	{STATUS_ABIOS_INVALID_SELECTOR, -EIO, "STATUS_ABIOS_INVALID_SELECTOR"},
+	{STATUS_NO_LDT, -EIO, "STATUS_NO_LDT"},
+	{STATUS_INVALID_LDT_SIZE, -EIO, "STATUS_INVALID_LDT_SIZE"},
+	{STATUS_INVALID_LDT_OFFSET, -EIO, "STATUS_INVALID_LDT_OFFSET"},
+	{STATUS_INVALID_LDT_DESCRIPTOR, -EIO, "STATUS_INVALID_LDT_DESCRIPTOR"},
+	{STATUS_INVALID_IMAGE_NE_FORMAT, -EIO,
+	"STATUS_INVALID_IMAGE_NE_FORMAT"},
+	{STATUS_RXACT_INVALID_STATE, -EIO, "STATUS_RXACT_INVALID_STATE"},
+	{STATUS_RXACT_COMMIT_FAILURE, -EIO, "STATUS_RXACT_COMMIT_FAILURE"},
+	{STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"},
+	{STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"},
+	{STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"},
+	{STATUS_CANNOT_DELETE, -EACCES, "STATUS_CANNOT_DELETE"},
+	{STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"},
+	{STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"},
+	{STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"},
+	{STATUS_SPECIAL_GROUP, -EIO, "STATUS_SPECIAL_GROUP"},
+	{STATUS_SPECIAL_USER, -EIO, "STATUS_SPECIAL_USER"},
+	{STATUS_MEMBERS_PRIMARY_GROUP, -EIO, "STATUS_MEMBERS_PRIMARY_GROUP"},
+	{STATUS_FILE_CLOSED, -EBADF, "STATUS_FILE_CLOSED"},
+	{STATUS_TOO_MANY_THREADS, -EIO, "STATUS_TOO_MANY_THREADS"},
+	{STATUS_THREAD_NOT_IN_PROCESS, -EIO, "STATUS_THREAD_NOT_IN_PROCESS"},
+	{STATUS_TOKEN_ALREADY_IN_USE, -EIO, "STATUS_TOKEN_ALREADY_IN_USE"},
+	{STATUS_PAGEFILE_QUOTA_EXCEEDED, -EDQUOT,
+	"STATUS_PAGEFILE_QUOTA_EXCEEDED"},
+	{STATUS_COMMITMENT_LIMIT, -EIO, "STATUS_COMMITMENT_LIMIT"},
+	{STATUS_INVALID_IMAGE_LE_FORMAT, -EIO,
+	"STATUS_INVALID_IMAGE_LE_FORMAT"},
+	{STATUS_INVALID_IMAGE_NOT_MZ, -EIO, "STATUS_INVALID_IMAGE_NOT_MZ"},
+	{STATUS_INVALID_IMAGE_PROTECT, -EIO, "STATUS_INVALID_IMAGE_PROTECT"},
+	{STATUS_INVALID_IMAGE_WIN_16, -EIO, "STATUS_INVALID_IMAGE_WIN_16"},
+	{STATUS_LOGON_SERVER_CONFLICT, -EIO, "STATUS_LOGON_SERVER_CONFLICT"},
+	{STATUS_TIME_DIFFERENCE_AT_DC, -EIO, "STATUS_TIME_DIFFERENCE_AT_DC"},
+	{STATUS_SYNCHRONIZATION_REQUIRED, -EIO,
+	"STATUS_SYNCHRONIZATION_REQUIRED"},
+	{STATUS_DLL_NOT_FOUND, -ENOENT, "STATUS_DLL_NOT_FOUND"},
+	{STATUS_OPEN_FAILED, -EIO, "STATUS_OPEN_FAILED"},
+	{STATUS_IO_PRIVILEGE_FAILED, -EIO, "STATUS_IO_PRIVILEGE_FAILED"},
+	{STATUS_ORDINAL_NOT_FOUND, -EIO, "STATUS_ORDINAL_NOT_FOUND"},
+	{STATUS_ENTRYPOINT_NOT_FOUND, -EIO, "STATUS_ENTRYPOINT_NOT_FOUND"},
+	{STATUS_CONTROL_C_EXIT, -EIO, "STATUS_CONTROL_C_EXIT"},
+	{STATUS_LOCAL_DISCONNECT, -EIO, "STATUS_LOCAL_DISCONNECT"},
+	{STATUS_REMOTE_DISCONNECT, -ESHUTDOWN, "STATUS_REMOTE_DISCONNECT"},
+	{STATUS_REMOTE_RESOURCES, -EIO, "STATUS_REMOTE_RESOURCES"},
+	{STATUS_LINK_FAILED, -EXDEV, "STATUS_LINK_FAILED"},
+	{STATUS_LINK_TIMEOUT, -ETIMEDOUT, "STATUS_LINK_TIMEOUT"},
+	{STATUS_INVALID_CONNECTION, -EIO, "STATUS_INVALID_CONNECTION"},
+	{STATUS_INVALID_ADDRESS, -EIO, "STATUS_INVALID_ADDRESS"},
+	{STATUS_DLL_INIT_FAILED, -EIO, "STATUS_DLL_INIT_FAILED"},
+	{STATUS_MISSING_SYSTEMFILE, -EIO, "STATUS_MISSING_SYSTEMFILE"},
+	{STATUS_UNHANDLED_EXCEPTION, -EIO, "STATUS_UNHANDLED_EXCEPTION"},
+	{STATUS_APP_INIT_FAILURE, -EIO, "STATUS_APP_INIT_FAILURE"},
+	{STATUS_PAGEFILE_CREATE_FAILED, -EIO, "STATUS_PAGEFILE_CREATE_FAILED"},
+	{STATUS_NO_PAGEFILE, -EIO, "STATUS_NO_PAGEFILE"},
+	{STATUS_INVALID_LEVEL, -EIO, "STATUS_INVALID_LEVEL"},
+	{STATUS_WRONG_PASSWORD_CORE, -EIO, "STATUS_WRONG_PASSWORD_CORE"},
+	{STATUS_ILLEGAL_FLOAT_CONTEXT, -EIO, "STATUS_ILLEGAL_FLOAT_CONTEXT"},
+	{STATUS_PIPE_BROKEN, -EPIPE, "STATUS_PIPE_BROKEN"},
+	{STATUS_REGISTRY_CORRUPT, -EIO, "STATUS_REGISTRY_CORRUPT"},
+	{STATUS_REGISTRY_IO_FAILED, -EIO, "STATUS_REGISTRY_IO_FAILED"},
+	{STATUS_NO_EVENT_PAIR, -EIO, "STATUS_NO_EVENT_PAIR"},
+	{STATUS_UNRECOGNIZED_VOLUME, -EIO, "STATUS_UNRECOGNIZED_VOLUME"},
+	{STATUS_SERIAL_NO_DEVICE_INITED, -EIO,
+	"STATUS_SERIAL_NO_DEVICE_INITED"},
+	{STATUS_NO_SUCH_ALIAS, -EIO, "STATUS_NO_SUCH_ALIAS"},
+	{STATUS_MEMBER_NOT_IN_ALIAS, -EIO, "STATUS_MEMBER_NOT_IN_ALIAS"},
+	{STATUS_MEMBER_IN_ALIAS, -EIO, "STATUS_MEMBER_IN_ALIAS"},
+	{STATUS_ALIAS_EXISTS, -EIO, "STATUS_ALIAS_EXISTS"},
+	{STATUS_LOGON_NOT_GRANTED, -EIO, "STATUS_LOGON_NOT_GRANTED"},
+	{STATUS_TOO_MANY_SECRETS, -EIO, "STATUS_TOO_MANY_SECRETS"},
+	{STATUS_SECRET_TOO_LONG, -EIO, "STATUS_SECRET_TOO_LONG"},
+	{STATUS_INTERNAL_DB_ERROR, -EIO, "STATUS_INTERNAL_DB_ERROR"},
+	{STATUS_FULLSCREEN_MODE, -EIO, "STATUS_FULLSCREEN_MODE"},
+	{STATUS_TOO_MANY_CONTEXT_IDS, -EIO, "STATUS_TOO_MANY_CONTEXT_IDS"},
+	{STATUS_LOGON_TYPE_NOT_GRANTED, -EIO, "STATUS_LOGON_TYPE_NOT_GRANTED"},
+	{STATUS_NOT_REGISTRY_FILE, -EIO, "STATUS_NOT_REGISTRY_FILE"},
+	{STATUS_NT_CROSS_ENCRYPTION_REQUIRED, -EIO,
+	"STATUS_NT_CROSS_ENCRYPTION_REQUIRED"},
+	{STATUS_DOMAIN_CTRLR_CONFIG_ERROR, -EIO,
+	"STATUS_DOMAIN_CTRLR_CONFIG_ERROR"},
+	{STATUS_FT_MISSING_MEMBER, -EIO, "STATUS_FT_MISSING_MEMBER"},
+	{STATUS_ILL_FORMED_SERVICE_ENTRY, -EIO,
+	"STATUS_ILL_FORMED_SERVICE_ENTRY"},
+	{STATUS_ILLEGAL_CHARACTER, -EIO, "STATUS_ILLEGAL_CHARACTER"},
+	{STATUS_UNMAPPABLE_CHARACTER, -EIO, "STATUS_UNMAPPABLE_CHARACTER"},
+	{STATUS_UNDEFINED_CHARACTER, -EIO, "STATUS_UNDEFINED_CHARACTER"},
+	{STATUS_FLOPPY_VOLUME, -EIO, "STATUS_FLOPPY_VOLUME"},
+	{STATUS_FLOPPY_ID_MARK_NOT_FOUND, -EIO,
+	"STATUS_FLOPPY_ID_MARK_NOT_FOUND"},
+	{STATUS_FLOPPY_WRONG_CYLINDER, -EIO, "STATUS_FLOPPY_WRONG_CYLINDER"},
+	{STATUS_FLOPPY_UNKNOWN_ERROR, -EIO, "STATUS_FLOPPY_UNKNOWN_ERROR"},
+	{STATUS_FLOPPY_BAD_REGISTERS, -EIO, "STATUS_FLOPPY_BAD_REGISTERS"},
+	{STATUS_DISK_RECALIBRATE_FAILED, -EIO,
+	"STATUS_DISK_RECALIBRATE_FAILED"},
+	{STATUS_DISK_OPERATION_FAILED, -EIO, "STATUS_DISK_OPERATION_FAILED"},
+	{STATUS_DISK_RESET_FAILED, -EIO, "STATUS_DISK_RESET_FAILED"},
+	{STATUS_SHARED_IRQ_BUSY, -EBUSY, "STATUS_SHARED_IRQ_BUSY"},
+	{STATUS_FT_ORPHANING, -EIO, "STATUS_FT_ORPHANING"},
+	{STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT, -EIO,
+	"STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT"},
+	{STATUS_PARTITION_FAILURE, -EIO, "STATUS_PARTITION_FAILURE"},
+	{STATUS_INVALID_BLOCK_LENGTH, -EIO, "STATUS_INVALID_BLOCK_LENGTH"},
+	{STATUS_DEVICE_NOT_PARTITIONED, -EIO, "STATUS_DEVICE_NOT_PARTITIONED"},
+	{STATUS_UNABLE_TO_LOCK_MEDIA, -EIO, "STATUS_UNABLE_TO_LOCK_MEDIA"},
+	{STATUS_UNABLE_TO_UNLOAD_MEDIA, -EIO, "STATUS_UNABLE_TO_UNLOAD_MEDIA"},
+	{STATUS_EOM_OVERFLOW, -EIO, "STATUS_EOM_OVERFLOW"},
+	{STATUS_NO_MEDIA, -EIO, "STATUS_NO_MEDIA"},
+	{STATUS_NO_SUCH_MEMBER, -EIO, "STATUS_NO_SUCH_MEMBER"},
+	{STATUS_INVALID_MEMBER, -EIO, "STATUS_INVALID_MEMBER"},
+	{STATUS_KEY_DELETED, -EIO, "STATUS_KEY_DELETED"},
+	{STATUS_NO_LOG_SPACE, -EIO, "STATUS_NO_LOG_SPACE"},
+	{STATUS_TOO_MANY_SIDS, -EIO, "STATUS_TOO_MANY_SIDS"},
+	{STATUS_LM_CROSS_ENCRYPTION_REQUIRED, -EIO,
+	"STATUS_LM_CROSS_ENCRYPTION_REQUIRED"},
+	{STATUS_KEY_HAS_CHILDREN, -EIO, "STATUS_KEY_HAS_CHILDREN"},
+	{STATUS_CHILD_MUST_BE_VOLATILE, -EIO, "STATUS_CHILD_MUST_BE_VOLATILE"},
+	{STATUS_DEVICE_CONFIGURATION_ERROR, -EIO,
+	"STATUS_DEVICE_CONFIGURATION_ERROR"},
+	{STATUS_DRIVER_INTERNAL_ERROR, -EIO, "STATUS_DRIVER_INTERNAL_ERROR"},
+	{STATUS_INVALID_DEVICE_STATE, -EIO, "STATUS_INVALID_DEVICE_STATE"},
+	{STATUS_IO_DEVICE_ERROR, -EIO, "STATUS_IO_DEVICE_ERROR"},
+	{STATUS_DEVICE_PROTOCOL_ERROR, -EIO, "STATUS_DEVICE_PROTOCOL_ERROR"},
+	{STATUS_BACKUP_CONTROLLER, -EIO, "STATUS_BACKUP_CONTROLLER"},
+	{STATUS_LOG_FILE_FULL, -EIO, "STATUS_LOG_FILE_FULL"},
+	{STATUS_TOO_LATE, -EIO, "STATUS_TOO_LATE"},
+	{STATUS_NO_TRUST_LSA_SECRET, -EIO, "STATUS_NO_TRUST_LSA_SECRET"},
+	{STATUS_NO_TRUST_SAM_ACCOUNT, -EIO, "STATUS_NO_TRUST_SAM_ACCOUNT"},
+	{STATUS_TRUSTED_DOMAIN_FAILURE, -EIO, "STATUS_TRUSTED_DOMAIN_FAILURE"},
+	{STATUS_TRUSTED_RELATIONSHIP_FAILURE, -EIO,
+	"STATUS_TRUSTED_RELATIONSHIP_FAILURE"},
+	{STATUS_EVENTLOG_FILE_CORRUPT, -EIO, "STATUS_EVENTLOG_FILE_CORRUPT"},
+	{STATUS_EVENTLOG_CANT_START, -EIO, "STATUS_EVENTLOG_CANT_START"},
+	{STATUS_TRUST_FAILURE, -EIO, "STATUS_TRUST_FAILURE"},
+	{STATUS_MUTANT_LIMIT_EXCEEDED, -EIO, "STATUS_MUTANT_LIMIT_EXCEEDED"},
+	{STATUS_NETLOGON_NOT_STARTED, -EIO, "STATUS_NETLOGON_NOT_STARTED"},
+	{STATUS_ACCOUNT_EXPIRED, -EKEYEXPIRED, "STATUS_ACCOUNT_EXPIRED"},
+	{STATUS_POSSIBLE_DEADLOCK, -EIO, "STATUS_POSSIBLE_DEADLOCK"},
+	{STATUS_NETWORK_CREDENTIAL_CONFLICT, -EIO,
+	"STATUS_NETWORK_CREDENTIAL_CONFLICT"},
+	{STATUS_REMOTE_SESSION_LIMIT, -EIO, "STATUS_REMOTE_SESSION_LIMIT"},
+	{STATUS_EVENTLOG_FILE_CHANGED, -EIO, "STATUS_EVENTLOG_FILE_CHANGED"},
+	{STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT, -EIO,
+	"STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT"},
+	{STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT, -EIO,
+	"STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT"},
+	{STATUS_NOLOGON_SERVER_TRUST_ACCOUNT, -EIO,
+	"STATUS_NOLOGON_SERVER_TRUST_ACCOUNT"},
+	{STATUS_DOMAIN_TRUST_INCONSISTENT, -EIO,
+	"STATUS_DOMAIN_TRUST_INCONSISTENT"},
+	{STATUS_FS_DRIVER_REQUIRED, -EIO, "STATUS_FS_DRIVER_REQUIRED"},
+	{STATUS_IMAGE_ALREADY_LOADED_AS_DLL, -EIO,
+	"STATUS_IMAGE_ALREADY_LOADED_AS_DLL"},
+	{STATUS_NETWORK_OPEN_RESTRICTION, -EIO,
+	"STATUS_NETWORK_OPEN_RESTRICTION"},
+	{STATUS_NO_USER_SESSION_KEY, -EIO, "STATUS_NO_USER_SESSION_KEY"},
+	{STATUS_USER_SESSION_DELETED, -EIO, "STATUS_USER_SESSION_DELETED"},
+	{STATUS_RESOURCE_LANG_NOT_FOUND, -EIO,
+	"STATUS_RESOURCE_LANG_NOT_FOUND"},
+	{STATUS_INSUFF_SERVER_RESOURCES, -EIO,
+	"STATUS_INSUFF_SERVER_RESOURCES"},
+	{STATUS_INVALID_BUFFER_SIZE, -EIO, "STATUS_INVALID_BUFFER_SIZE"},
+	{STATUS_INVALID_ADDRESS_COMPONENT, -EIO,
+	"STATUS_INVALID_ADDRESS_COMPONENT"},
+	{STATUS_INVALID_ADDRESS_WILDCARD, -EIO,
+	"STATUS_INVALID_ADDRESS_WILDCARD"},
+	{STATUS_TOO_MANY_ADDRESSES, -EIO, "STATUS_TOO_MANY_ADDRESSES"},
+	{STATUS_ADDRESS_ALREADY_EXISTS, -EADDRINUSE,
+	"STATUS_ADDRESS_ALREADY_EXISTS"},
+	{STATUS_ADDRESS_CLOSED, -EIO, "STATUS_ADDRESS_CLOSED"},
+	{STATUS_CONNECTION_DISCONNECTED, -ECONNABORTED,
+	"STATUS_CONNECTION_DISCONNECTED"},
+	{STATUS_CONNECTION_RESET, -ENETRESET, "STATUS_CONNECTION_RESET"},
+	{STATUS_TOO_MANY_NODES, -EIO, "STATUS_TOO_MANY_NODES"},
+	{STATUS_TRANSACTION_ABORTED, -EIO, "STATUS_TRANSACTION_ABORTED"},
+	{STATUS_TRANSACTION_TIMED_OUT, -EIO, "STATUS_TRANSACTION_TIMED_OUT"},
+	{STATUS_TRANSACTION_NO_RELEASE, -EIO, "STATUS_TRANSACTION_NO_RELEASE"},
+	{STATUS_TRANSACTION_NO_MATCH, -EIO, "STATUS_TRANSACTION_NO_MATCH"},
+	{STATUS_TRANSACTION_RESPONDED, -EIO, "STATUS_TRANSACTION_RESPONDED"},
+	{STATUS_TRANSACTION_INVALID_ID, -EIO, "STATUS_TRANSACTION_INVALID_ID"},
+	{STATUS_TRANSACTION_INVALID_TYPE, -EIO,
+	"STATUS_TRANSACTION_INVALID_TYPE"},
+	{STATUS_NOT_SERVER_SESSION, -EIO, "STATUS_NOT_SERVER_SESSION"},
+	{STATUS_NOT_CLIENT_SESSION, -EIO, "STATUS_NOT_CLIENT_SESSION"},
+	{STATUS_CANNOT_LOAD_REGISTRY_FILE, -EIO,
+	"STATUS_CANNOT_LOAD_REGISTRY_FILE"},
+	{STATUS_DEBUG_ATTACH_FAILED, -EIO, "STATUS_DEBUG_ATTACH_FAILED"},
+	{STATUS_SYSTEM_PROCESS_TERMINATED, -EIO,
+	"STATUS_SYSTEM_PROCESS_TERMINATED"},
+	{STATUS_DATA_NOT_ACCEPTED, -EIO, "STATUS_DATA_NOT_ACCEPTED"},
+	{STATUS_NO_BROWSER_SERVERS_FOUND, -EIO,
+	"STATUS_NO_BROWSER_SERVERS_FOUND"},
+	{STATUS_VDM_HARD_ERROR, -EIO, "STATUS_VDM_HARD_ERROR"},
+	{STATUS_DRIVER_CANCEL_TIMEOUT, -EIO, "STATUS_DRIVER_CANCEL_TIMEOUT"},
+	{STATUS_REPLY_MESSAGE_MISMATCH, -EIO, "STATUS_REPLY_MESSAGE_MISMATCH"},
+	{STATUS_MAPPED_ALIGNMENT, -EIO, "STATUS_MAPPED_ALIGNMENT"},
+	{STATUS_IMAGE_CHECKSUM_MISMATCH, -EIO,
+	"STATUS_IMAGE_CHECKSUM_MISMATCH"},
+	{STATUS_LOST_WRITEBEHIND_DATA, -EIO, "STATUS_LOST_WRITEBEHIND_DATA"},
+	{STATUS_CLIENT_SERVER_PARAMETERS_INVALID, -EIO,
+	"STATUS_CLIENT_SERVER_PARAMETERS_INVALID"},
+	{STATUS_PASSWORD_MUST_CHANGE, -EIO, "STATUS_PASSWORD_MUST_CHANGE"},
+	{STATUS_NOT_FOUND, -ENOENT, "STATUS_NOT_FOUND"},
+	{STATUS_NOT_TINY_STREAM, -EIO, "STATUS_NOT_TINY_STREAM"},
+	{STATUS_RECOVERY_FAILURE, -EIO, "STATUS_RECOVERY_FAILURE"},
+	{STATUS_STACK_OVERFLOW_READ, -EIO, "STATUS_STACK_OVERFLOW_READ"},
+	{STATUS_FAIL_CHECK, -EIO, "STATUS_FAIL_CHECK"},
+	{STATUS_DUPLICATE_OBJECTID, -EIO, "STATUS_DUPLICATE_OBJECTID"},
+	{STATUS_OBJECTID_EXISTS, -EIO, "STATUS_OBJECTID_EXISTS"},
+	{STATUS_CONVERT_TO_LARGE, -EIO, "STATUS_CONVERT_TO_LARGE"},
+	{STATUS_RETRY, -EAGAIN, "STATUS_RETRY"},
+	{STATUS_FOUND_OUT_OF_SCOPE, -EIO, "STATUS_FOUND_OUT_OF_SCOPE"},
+	{STATUS_ALLOCATE_BUCKET, -EIO, "STATUS_ALLOCATE_BUCKET"},
+	{STATUS_PROPSET_NOT_FOUND, -EIO, "STATUS_PROPSET_NOT_FOUND"},
+	{STATUS_MARSHALL_OVERFLOW, -EIO, "STATUS_MARSHALL_OVERFLOW"},
+	{STATUS_INVALID_VARIANT, -EIO, "STATUS_INVALID_VARIANT"},
+	{STATUS_DOMAIN_CONTROLLER_NOT_FOUND, -EIO,
+	"STATUS_DOMAIN_CONTROLLER_NOT_FOUND"},
+	{STATUS_ACCOUNT_LOCKED_OUT, -EIO, "STATUS_ACCOUNT_LOCKED_OUT"},
+	{STATUS_HANDLE_NOT_CLOSABLE, -EIO, "STATUS_HANDLE_NOT_CLOSABLE"},
+	{STATUS_CONNECTION_REFUSED, -EIO, "STATUS_CONNECTION_REFUSED"},
+	{STATUS_GRACEFUL_DISCONNECT, -EIO, "STATUS_GRACEFUL_DISCONNECT"},
+	{STATUS_ADDRESS_ALREADY_ASSOCIATED, -EIO,
+	"STATUS_ADDRESS_ALREADY_ASSOCIATED"},
+	{STATUS_ADDRESS_NOT_ASSOCIATED, -EIO, "STATUS_ADDRESS_NOT_ASSOCIATED"},
+	{STATUS_CONNECTION_INVALID, -EIO, "STATUS_CONNECTION_INVALID"},
+	{STATUS_CONNECTION_ACTIVE, -EIO, "STATUS_CONNECTION_ACTIVE"},
+	{STATUS_NETWORK_UNREACHABLE, -ENETUNREACH,
+	"STATUS_NETWORK_UNREACHABLE"},
+	{STATUS_HOST_UNREACHABLE, -EHOSTDOWN, "STATUS_HOST_UNREACHABLE"},
+	{STATUS_PROTOCOL_UNREACHABLE, -ENETUNREACH,
+	"STATUS_PROTOCOL_UNREACHABLE"},
+	{STATUS_PORT_UNREACHABLE, -ENETUNREACH, "STATUS_PORT_UNREACHABLE"},
+	{STATUS_REQUEST_ABORTED, -EIO, "STATUS_REQUEST_ABORTED"},
+	{STATUS_CONNECTION_ABORTED, -ECONNABORTED, "STATUS_CONNECTION_ABORTED"},
+	{STATUS_BAD_COMPRESSION_BUFFER, -EIO, "STATUS_BAD_COMPRESSION_BUFFER"},
+	{STATUS_USER_MAPPED_FILE, -EIO, "STATUS_USER_MAPPED_FILE"},
+	{STATUS_AUDIT_FAILED, -EIO, "STATUS_AUDIT_FAILED"},
+	{STATUS_TIMER_RESOLUTION_NOT_SET, -EIO,
+	"STATUS_TIMER_RESOLUTION_NOT_SET"},
+	{STATUS_CONNECTION_COUNT_LIMIT, -EIO, "STATUS_CONNECTION_COUNT_LIMIT"},
+	{STATUS_LOGIN_TIME_RESTRICTION, -EACCES,
+	"STATUS_LOGIN_TIME_RESTRICTION"},
+	{STATUS_LOGIN_WKSTA_RESTRICTION, -EACCES,
+	"STATUS_LOGIN_WKSTA_RESTRICTION"},
+	{STATUS_IMAGE_MP_UP_MISMATCH, -EIO, "STATUS_IMAGE_MP_UP_MISMATCH"},
+	{STATUS_INSUFFICIENT_LOGON_INFO, -EIO,
+	"STATUS_INSUFFICIENT_LOGON_INFO"},
+	{STATUS_BAD_DLL_ENTRYPOINT, -EIO, "STATUS_BAD_DLL_ENTRYPOINT"},
+	{STATUS_BAD_SERVICE_ENTRYPOINT, -EIO, "STATUS_BAD_SERVICE_ENTRYPOINT"},
+	{STATUS_LPC_REPLY_LOST, -EIO, "STATUS_LPC_REPLY_LOST"},
+	{STATUS_IP_ADDRESS_CONFLICT1, -EIO, "STATUS_IP_ADDRESS_CONFLICT1"},
+	{STATUS_IP_ADDRESS_CONFLICT2, -EIO, "STATUS_IP_ADDRESS_CONFLICT2"},
+	{STATUS_REGISTRY_QUOTA_LIMIT, -EDQUOT, "STATUS_REGISTRY_QUOTA_LIMIT"},
+	{STATUS_PATH_NOT_COVERED, -EREMOTE, "STATUS_PATH_NOT_COVERED"},
+	{STATUS_NO_CALLBACK_ACTIVE, -EIO, "STATUS_NO_CALLBACK_ACTIVE"},
+	{STATUS_LICENSE_QUOTA_EXCEEDED, -EACCES,
+	"STATUS_LICENSE_QUOTA_EXCEEDED"},
+	{STATUS_PWD_TOO_SHORT, -EIO, "STATUS_PWD_TOO_SHORT"},
+	{STATUS_PWD_TOO_RECENT, -EIO, "STATUS_PWD_TOO_RECENT"},
+	{STATUS_PWD_HISTORY_CONFLICT, -EIO, "STATUS_PWD_HISTORY_CONFLICT"},
+	{STATUS_PLUGPLAY_NO_DEVICE, -EIO, "STATUS_PLUGPLAY_NO_DEVICE"},
+	{STATUS_UNSUPPORTED_COMPRESSION, -EIO,
+	"STATUS_UNSUPPORTED_COMPRESSION"},
+	{STATUS_INVALID_HW_PROFILE, -EIO, "STATUS_INVALID_HW_PROFILE"},
+	{STATUS_INVALID_PLUGPLAY_DEVICE_PATH, -EIO,
+	"STATUS_INVALID_PLUGPLAY_DEVICE_PATH"},
+	{STATUS_DRIVER_ORDINAL_NOT_FOUND, -EIO,
+	"STATUS_DRIVER_ORDINAL_NOT_FOUND"},
+	{STATUS_DRIVER_ENTRYPOINT_NOT_FOUND, -EIO,
+	"STATUS_DRIVER_ENTRYPOINT_NOT_FOUND"},
+	{STATUS_RESOURCE_NOT_OWNED, -EIO, "STATUS_RESOURCE_NOT_OWNED"},
+	{STATUS_TOO_MANY_LINKS, -EMLINK, "STATUS_TOO_MANY_LINKS"},
+	{STATUS_QUOTA_LIST_INCONSISTENT, -EIO,
+	"STATUS_QUOTA_LIST_INCONSISTENT"},
+	{STATUS_FILE_IS_OFFLINE, -EIO, "STATUS_FILE_IS_OFFLINE"},
+	{STATUS_EVALUATION_EXPIRATION, -EIO, "STATUS_EVALUATION_EXPIRATION"},
+	{STATUS_ILLEGAL_DLL_RELOCATION, -EIO, "STATUS_ILLEGAL_DLL_RELOCATION"},
+	{STATUS_LICENSE_VIOLATION, -EIO, "STATUS_LICENSE_VIOLATION"},
+	{STATUS_DLL_INIT_FAILED_LOGOFF, -EIO, "STATUS_DLL_INIT_FAILED_LOGOFF"},
+	{STATUS_DRIVER_UNABLE_TO_LOAD, -EIO, "STATUS_DRIVER_UNABLE_TO_LOAD"},
+	{STATUS_DFS_UNAVAILABLE, -EIO, "STATUS_DFS_UNAVAILABLE"},
+	{STATUS_VOLUME_DISMOUNTED, -EIO, "STATUS_VOLUME_DISMOUNTED"},
+	{STATUS_WX86_INTERNAL_ERROR, -EIO, "STATUS_WX86_INTERNAL_ERROR"},
+	{STATUS_WX86_FLOAT_STACK_CHECK, -EIO, "STATUS_WX86_FLOAT_STACK_CHECK"},
+	{STATUS_VALIDATE_CONTINUE, -EIO, "STATUS_VALIDATE_CONTINUE"},
+	{STATUS_NO_MATCH, -EIO, "STATUS_NO_MATCH"},
+	{STATUS_NO_MORE_MATCHES, -EIO, "STATUS_NO_MORE_MATCHES"},
+	{STATUS_NOT_A_REPARSE_POINT, -EIO, "STATUS_NOT_A_REPARSE_POINT"},
+	{STATUS_IO_REPARSE_TAG_INVALID, -EIO, "STATUS_IO_REPARSE_TAG_INVALID"},
+	{STATUS_IO_REPARSE_TAG_MISMATCH, -EIO,
+	"STATUS_IO_REPARSE_TAG_MISMATCH"},
+	{STATUS_IO_REPARSE_DATA_INVALID, -EIO,
+	"STATUS_IO_REPARSE_DATA_INVALID"},
+	{STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO,
+	"STATUS_IO_REPARSE_TAG_NOT_HANDLED"},
+	{STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO,
+	"STATUS_REPARSE_POINT_NOT_RESOLVED"},
+	{STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO,
+	"STATUS_DIRECTORY_IS_A_REPARSE_POINT"},
+	{STATUS_RANGE_LIST_CONFLICT, -EIO, "STATUS_RANGE_LIST_CONFLICT"},
+	{STATUS_SOURCE_ELEMENT_EMPTY, -EIO, "STATUS_SOURCE_ELEMENT_EMPTY"},
+	{STATUS_DESTINATION_ELEMENT_FULL, -EIO,
+	"STATUS_DESTINATION_ELEMENT_FULL"},
+	{STATUS_ILLEGAL_ELEMENT_ADDRESS, -EIO,
+	"STATUS_ILLEGAL_ELEMENT_ADDRESS"},
+	{STATUS_MAGAZINE_NOT_PRESENT, -EIO, "STATUS_MAGAZINE_NOT_PRESENT"},
+	{STATUS_REINITIALIZATION_NEEDED, -EIO,
+	"STATUS_REINITIALIZATION_NEEDED"},
+	{STATUS_ENCRYPTION_FAILED, -EIO, "STATUS_ENCRYPTION_FAILED"},
+	{STATUS_DECRYPTION_FAILED, -EIO, "STATUS_DECRYPTION_FAILED"},
+	{STATUS_RANGE_NOT_FOUND, -EIO, "STATUS_RANGE_NOT_FOUND"},
+	{STATUS_NO_RECOVERY_POLICY, -EIO, "STATUS_NO_RECOVERY_POLICY"},
+	{STATUS_NO_EFS, -EIO, "STATUS_NO_EFS"},
+	{STATUS_WRONG_EFS, -EIO, "STATUS_WRONG_EFS"},
+	{STATUS_NO_USER_KEYS, -EIO, "STATUS_NO_USER_KEYS"},
+	{STATUS_FILE_NOT_ENCRYPTED, -EIO, "STATUS_FILE_NOT_ENCRYPTED"},
+	{STATUS_NOT_EXPORT_FORMAT, -EIO, "STATUS_NOT_EXPORT_FORMAT"},
+	{STATUS_FILE_ENCRYPTED, -EIO, "STATUS_FILE_ENCRYPTED"},
+	{STATUS_WMI_GUID_NOT_FOUND, -EIO, "STATUS_WMI_GUID_NOT_FOUND"},
+	{STATUS_WMI_INSTANCE_NOT_FOUND, -EIO, "STATUS_WMI_INSTANCE_NOT_FOUND"},
+	{STATUS_WMI_ITEMID_NOT_FOUND, -EIO, "STATUS_WMI_ITEMID_NOT_FOUND"},
+	{STATUS_WMI_TRY_AGAIN, -EIO, "STATUS_WMI_TRY_AGAIN"},
+	{STATUS_SHARED_POLICY, -EIO, "STATUS_SHARED_POLICY"},
+	{STATUS_POLICY_OBJECT_NOT_FOUND, -EIO,
+	"STATUS_POLICY_OBJECT_NOT_FOUND"},
+	{STATUS_POLICY_ONLY_IN_DS, -EIO, "STATUS_POLICY_ONLY_IN_DS"},
+	{STATUS_VOLUME_NOT_UPGRADED, -EIO, "STATUS_VOLUME_NOT_UPGRADED"},
+	{STATUS_REMOTE_STORAGE_NOT_ACTIVE, -EIO,
+	"STATUS_REMOTE_STORAGE_NOT_ACTIVE"},
+	{STATUS_REMOTE_STORAGE_MEDIA_ERROR, -EIO,
+	"STATUS_REMOTE_STORAGE_MEDIA_ERROR"},
+	{STATUS_NO_TRACKING_SERVICE, -EIO, "STATUS_NO_TRACKING_SERVICE"},
+	{STATUS_SERVER_SID_MISMATCH, -EIO, "STATUS_SERVER_SID_MISMATCH"},
+	{STATUS_DS_NO_ATTRIBUTE_OR_VALUE, -EIO,
+	"STATUS_DS_NO_ATTRIBUTE_OR_VALUE"},
+	{STATUS_DS_INVALID_ATTRIBUTE_SYNTAX, -EIO,
+	"STATUS_DS_INVALID_ATTRIBUTE_SYNTAX"},
+	{STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED, -EIO,
+	"STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED"},
+	{STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS, -EIO,
+	"STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS"},
+	{STATUS_DS_BUSY, -EBUSY, "STATUS_DS_BUSY"},
+	{STATUS_DS_UNAVAILABLE, -EIO, "STATUS_DS_UNAVAILABLE"},
+	{STATUS_DS_NO_RIDS_ALLOCATED, -EIO, "STATUS_DS_NO_RIDS_ALLOCATED"},
+	{STATUS_DS_NO_MORE_RIDS, -EIO, "STATUS_DS_NO_MORE_RIDS"},
+	{STATUS_DS_INCORRECT_ROLE_OWNER, -EIO,
+	"STATUS_DS_INCORRECT_ROLE_OWNER"},
+	{STATUS_DS_RIDMGR_INIT_ERROR, -EIO, "STATUS_DS_RIDMGR_INIT_ERROR"},
+	{STATUS_DS_OBJ_CLASS_VIOLATION, -EIO, "STATUS_DS_OBJ_CLASS_VIOLATION"},
+	{STATUS_DS_CANT_ON_NON_LEAF, -EIO, "STATUS_DS_CANT_ON_NON_LEAF"},
+	{STATUS_DS_CANT_ON_RDN, -EIO, "STATUS_DS_CANT_ON_RDN"},
+	{STATUS_DS_CANT_MOD_OBJ_CLASS, -EIO, "STATUS_DS_CANT_MOD_OBJ_CLASS"},
+	{STATUS_DS_CROSS_DOM_MOVE_FAILED, -EIO,
+	"STATUS_DS_CROSS_DOM_MOVE_FAILED"},
+	{STATUS_DS_GC_NOT_AVAILABLE, -EIO, "STATUS_DS_GC_NOT_AVAILABLE"},
+	{STATUS_DIRECTORY_SERVICE_REQUIRED, -EIO,
+	"STATUS_DIRECTORY_SERVICE_REQUIRED"},
+	{STATUS_REPARSE_ATTRIBUTE_CONFLICT, -EIO,
+	"STATUS_REPARSE_ATTRIBUTE_CONFLICT"},
+	{STATUS_CANT_ENABLE_DENY_ONLY, -EIO, "STATUS_CANT_ENABLE_DENY_ONLY"},
+	{STATUS_FLOAT_MULTIPLE_FAULTS, -EIO, "STATUS_FLOAT_MULTIPLE_FAULTS"},
+	{STATUS_FLOAT_MULTIPLE_TRAPS, -EIO, "STATUS_FLOAT_MULTIPLE_TRAPS"},
+	{STATUS_DEVICE_REMOVED, -EIO, "STATUS_DEVICE_REMOVED"},
+	{STATUS_JOURNAL_DELETE_IN_PROGRESS, -EIO,
+	"STATUS_JOURNAL_DELETE_IN_PROGRESS"},
+	{STATUS_JOURNAL_NOT_ACTIVE, -EIO, "STATUS_JOURNAL_NOT_ACTIVE"},
+	{STATUS_NOINTERFACE, -EIO, "STATUS_NOINTERFACE"},
+	{STATUS_DS_ADMIN_LIMIT_EXCEEDED, -EIO,
+	"STATUS_DS_ADMIN_LIMIT_EXCEEDED"},
+	{STATUS_DRIVER_FAILED_SLEEP, -EIO, "STATUS_DRIVER_FAILED_SLEEP"},
+	{STATUS_MUTUAL_AUTHENTICATION_FAILED, -EIO,
+	"STATUS_MUTUAL_AUTHENTICATION_FAILED"},
+	{STATUS_CORRUPT_SYSTEM_FILE, -EIO, "STATUS_CORRUPT_SYSTEM_FILE"},
+	{STATUS_DATATYPE_MISALIGNMENT_ERROR, -EIO,
+	"STATUS_DATATYPE_MISALIGNMENT_ERROR"},
+	{STATUS_WMI_READ_ONLY, -EROFS, "STATUS_WMI_READ_ONLY"},
+	{STATUS_WMI_SET_FAILURE, -EIO, "STATUS_WMI_SET_FAILURE"},
+	{STATUS_COMMITMENT_MINIMUM, -EIO, "STATUS_COMMITMENT_MINIMUM"},
+	{STATUS_REG_NAT_CONSUMPTION, -EIO, "STATUS_REG_NAT_CONSUMPTION"},
+	{STATUS_TRANSPORT_FULL, -EIO, "STATUS_TRANSPORT_FULL"},
+	{STATUS_DS_SAM_INIT_FAILURE, -EIO, "STATUS_DS_SAM_INIT_FAILURE"},
+	{STATUS_ONLY_IF_CONNECTED, -EIO, "STATUS_ONLY_IF_CONNECTED"},
+	{STATUS_DS_SENSITIVE_GROUP_VIOLATION, -EIO,
+	"STATUS_DS_SENSITIVE_GROUP_VIOLATION"},
+	{STATUS_PNP_RESTART_ENUMERATION, -EIO,
+	"STATUS_PNP_RESTART_ENUMERATION"},
+	{STATUS_JOURNAL_ENTRY_DELETED, -EIO, "STATUS_JOURNAL_ENTRY_DELETED"},
+	{STATUS_DS_CANT_MOD_PRIMARYGROUPID, -EIO,
+	"STATUS_DS_CANT_MOD_PRIMARYGROUPID"},
+	{STATUS_SYSTEM_IMAGE_BAD_SIGNATURE, -EIO,
+	"STATUS_SYSTEM_IMAGE_BAD_SIGNATURE"},
+	{STATUS_PNP_REBOOT_REQUIRED, -EIO, "STATUS_PNP_REBOOT_REQUIRED"},
+	{STATUS_POWER_STATE_INVALID, -EIO, "STATUS_POWER_STATE_INVALID"},
+	{STATUS_DS_INVALID_GROUP_TYPE, -EIO, "STATUS_DS_INVALID_GROUP_TYPE"},
+	{STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN, -EIO,
+	"STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN"},
+	{STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN, -EIO,
+	"STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN"},
+	{STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER, -EIO,
+	"STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER"},
+	{STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER, -EIO,
+	"STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER"},
+	{STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER, -EIO,
+	"STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER"},
+	{STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER, -EIO,
+	"STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER"},
+	{STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER, -EIO,
+	"STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER"},
+	{STATUS_DS_HAVE_PRIMARY_MEMBERS, -EIO,
+	"STATUS_DS_HAVE_PRIMARY_MEMBERS"},
+	{STATUS_WMI_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_WMI_NOT_SUPPORTED"},
+	{STATUS_INSUFFICIENT_POWER, -EIO, "STATUS_INSUFFICIENT_POWER"},
+	{STATUS_SAM_NEED_BOOTKEY_PASSWORD, -EIO,
+	"STATUS_SAM_NEED_BOOTKEY_PASSWORD"},
+	{STATUS_SAM_NEED_BOOTKEY_FLOPPY, -EIO,
+	"STATUS_SAM_NEED_BOOTKEY_FLOPPY"},
+	{STATUS_DS_CANT_START, -EIO, "STATUS_DS_CANT_START"},
+	{STATUS_DS_INIT_FAILURE, -EIO, "STATUS_DS_INIT_FAILURE"},
+	{STATUS_SAM_INIT_FAILURE, -EIO, "STATUS_SAM_INIT_FAILURE"},
+	{STATUS_DS_GC_REQUIRED, -EIO, "STATUS_DS_GC_REQUIRED"},
+	{STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY, -EIO,
+	"STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY"},
+	{STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS, -EIO,
+	"STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS"},
+	{STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED, -EDQUOT,
+	"STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED"},
+	{STATUS_MULTIPLE_FAULT_VIOLATION, -EIO,
+	"STATUS_MULTIPLE_FAULT_VIOLATION"},
+	{STATUS_CURRENT_DOMAIN_NOT_ALLOWED, -EIO,
+	"STATUS_CURRENT_DOMAIN_NOT_ALLOWED"},
+	{STATUS_CANNOT_MAKE, -EIO, "STATUS_CANNOT_MAKE"},
+	{STATUS_SYSTEM_SHUTDOWN, -EIO, "STATUS_SYSTEM_SHUTDOWN"},
+	{STATUS_DS_INIT_FAILURE_CONSOLE, -EIO,
+	"STATUS_DS_INIT_FAILURE_CONSOLE"},
+	{STATUS_DS_SAM_INIT_FAILURE_CONSOLE, -EIO,
+	"STATUS_DS_SAM_INIT_FAILURE_CONSOLE"},
+	{STATUS_UNFINISHED_CONTEXT_DELETED, -EIO,
+	"STATUS_UNFINISHED_CONTEXT_DELETED"},
+	{STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"},
+	{STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"},
+	{STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"},
+	{STATUS_WRONG_CREDENTIAL_HANDLE, -EIO,
+	"STATUS_WRONG_CREDENTIAL_HANDLE"},
+	{STATUS_CRYPTO_SYSTEM_INVALID, -EIO, "STATUS_CRYPTO_SYSTEM_INVALID"},
+	{STATUS_MAX_REFERRALS_EXCEEDED, -EIO, "STATUS_MAX_REFERRALS_EXCEEDED"},
+	{STATUS_MUST_BE_KDC, -EIO, "STATUS_MUST_BE_KDC"},
+	{STATUS_STRONG_CRYPTO_NOT_SUPPORTED, -EIO,
+	"STATUS_STRONG_CRYPTO_NOT_SUPPORTED"},
+	{STATUS_TOO_MANY_PRINCIPALS, -EIO, "STATUS_TOO_MANY_PRINCIPALS"},
+	{STATUS_NO_PA_DATA, -EIO, "STATUS_NO_PA_DATA"},
+	{STATUS_PKINIT_NAME_MISMATCH, -EIO, "STATUS_PKINIT_NAME_MISMATCH"},
+	{STATUS_SMARTCARD_LOGON_REQUIRED, -EIO,
+	"STATUS_SMARTCARD_LOGON_REQUIRED"},
+	{STATUS_KDC_INVALID_REQUEST, -EIO, "STATUS_KDC_INVALID_REQUEST"},
+	{STATUS_KDC_UNABLE_TO_REFER, -EIO, "STATUS_KDC_UNABLE_TO_REFER"},
+	{STATUS_KDC_UNKNOWN_ETYPE, -EIO, "STATUS_KDC_UNKNOWN_ETYPE"},
+	{STATUS_SHUTDOWN_IN_PROGRESS, -EIO, "STATUS_SHUTDOWN_IN_PROGRESS"},
+	{STATUS_SERVER_SHUTDOWN_IN_PROGRESS, -EIO,
+	"STATUS_SERVER_SHUTDOWN_IN_PROGRESS"},
+	{STATUS_NOT_SUPPORTED_ON_SBS, -EOPNOTSUPP,
+	"STATUS_NOT_SUPPORTED_ON_SBS"},
+	{STATUS_WMI_GUID_DISCONNECTED, -EIO, "STATUS_WMI_GUID_DISCONNECTED"},
+	{STATUS_WMI_ALREADY_DISABLED, -EIO, "STATUS_WMI_ALREADY_DISABLED"},
+	{STATUS_WMI_ALREADY_ENABLED, -EIO, "STATUS_WMI_ALREADY_ENABLED"},
+	{STATUS_MFT_TOO_FRAGMENTED, -EIO, "STATUS_MFT_TOO_FRAGMENTED"},
+	{STATUS_COPY_PROTECTION_FAILURE, -EIO,
+	"STATUS_COPY_PROTECTION_FAILURE"},
+	{STATUS_CSS_AUTHENTICATION_FAILURE, -EIO,
+	"STATUS_CSS_AUTHENTICATION_FAILURE"},
+	{STATUS_CSS_KEY_NOT_PRESENT, -EIO, "STATUS_CSS_KEY_NOT_PRESENT"},
+	{STATUS_CSS_KEY_NOT_ESTABLISHED, -EIO,
+	"STATUS_CSS_KEY_NOT_ESTABLISHED"},
+	{STATUS_CSS_SCRAMBLED_SECTOR, -EIO, "STATUS_CSS_SCRAMBLED_SECTOR"},
+	{STATUS_CSS_REGION_MISMATCH, -EIO, "STATUS_CSS_REGION_MISMATCH"},
+	{STATUS_CSS_RESETS_EXHAUSTED, -EIO, "STATUS_CSS_RESETS_EXHAUSTED"},
+	{STATUS_PKINIT_FAILURE, -EIO, "STATUS_PKINIT_FAILURE"},
+	{STATUS_SMARTCARD_SUBSYSTEM_FAILURE, -EIO,
+	"STATUS_SMARTCARD_SUBSYSTEM_FAILURE"},
+	{STATUS_NO_KERB_KEY, -EIO, "STATUS_NO_KERB_KEY"},
+	{STATUS_HOST_DOWN, -EIO, "STATUS_HOST_DOWN"},
+	{STATUS_UNSUPPORTED_PREAUTH, -EIO, "STATUS_UNSUPPORTED_PREAUTH"},
+	{STATUS_EFS_ALG_BLOB_TOO_BIG, -EIO, "STATUS_EFS_ALG_BLOB_TOO_BIG"},
+	{STATUS_PORT_NOT_SET, -EIO, "STATUS_PORT_NOT_SET"},
+	{STATUS_DEBUGGER_INACTIVE, -EIO, "STATUS_DEBUGGER_INACTIVE"},
+	{STATUS_DS_VERSION_CHECK_FAILURE, -EIO,
+	"STATUS_DS_VERSION_CHECK_FAILURE"},
+	{STATUS_AUDITING_DISABLED, -EIO, "STATUS_AUDITING_DISABLED"},
+	{STATUS_PRENT4_MACHINE_ACCOUNT, -EIO, "STATUS_PRENT4_MACHINE_ACCOUNT"},
+	{STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER, -EIO,
+	"STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER"},
+	{STATUS_INVALID_IMAGE_WIN_32, -EIO, "STATUS_INVALID_IMAGE_WIN_32"},
+	{STATUS_INVALID_IMAGE_WIN_64, -EIO, "STATUS_INVALID_IMAGE_WIN_64"},
+	{STATUS_BAD_BINDINGS, -EIO, "STATUS_BAD_BINDINGS"},
+	{STATUS_NETWORK_SESSION_EXPIRED, -EIO,
+	"STATUS_NETWORK_SESSION_EXPIRED"},
+	{STATUS_APPHELP_BLOCK, -EIO, "STATUS_APPHELP_BLOCK"},
+	{STATUS_ALL_SIDS_FILTERED, -EIO, "STATUS_ALL_SIDS_FILTERED"},
+	{STATUS_NOT_SAFE_MODE_DRIVER, -EIO, "STATUS_NOT_SAFE_MODE_DRIVER"},
+	{STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT, -EACCES,
+	"STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT"},
+	{STATUS_ACCESS_DISABLED_BY_POLICY_PATH, -EACCES,
+	"STATUS_ACCESS_DISABLED_BY_POLICY_PATH"},
+	{STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER, -EACCES,
+	"STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER"},
+	{STATUS_ACCESS_DISABLED_BY_POLICY_OTHER, -EACCES,
+	"STATUS_ACCESS_DISABLED_BY_POLICY_OTHER"},
+	{STATUS_FAILED_DRIVER_ENTRY, -EIO, "STATUS_FAILED_DRIVER_ENTRY"},
+	{STATUS_DEVICE_ENUMERATION_ERROR, -EIO,
+	"STATUS_DEVICE_ENUMERATION_ERROR"},
+	{STATUS_MOUNT_POINT_NOT_RESOLVED, -EIO,
+	"STATUS_MOUNT_POINT_NOT_RESOLVED"},
+	{STATUS_INVALID_DEVICE_OBJECT_PARAMETER, -EIO,
+	"STATUS_INVALID_DEVICE_OBJECT_PARAMETER"},
+	{STATUS_MCA_OCCURED, -EIO, "STATUS_MCA_OCCURED"},
+	{STATUS_DRIVER_BLOCKED_CRITICAL, -EIO,
+	"STATUS_DRIVER_BLOCKED_CRITICAL"},
+	{STATUS_DRIVER_BLOCKED, -EIO, "STATUS_DRIVER_BLOCKED"},
+	{STATUS_DRIVER_DATABASE_ERROR, -EIO, "STATUS_DRIVER_DATABASE_ERROR"},
+	{STATUS_SYSTEM_HIVE_TOO_LARGE, -EIO, "STATUS_SYSTEM_HIVE_TOO_LARGE"},
+	{STATUS_INVALID_IMPORT_OF_NON_DLL, -EIO,
+	"STATUS_INVALID_IMPORT_OF_NON_DLL"},
+	{STATUS_NO_SECRETS, -EIO, "STATUS_NO_SECRETS"},
+	{STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY, -EACCES,
+	"STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY"},
+	{STATUS_FAILED_STACK_SWITCH, -EIO, "STATUS_FAILED_STACK_SWITCH"},
+	{STATUS_HEAP_CORRUPTION, -EIO, "STATUS_HEAP_CORRUPTION"},
+	{STATUS_SMARTCARD_WRONG_PIN, -EIO, "STATUS_SMARTCARD_WRONG_PIN"},
+	{STATUS_SMARTCARD_CARD_BLOCKED, -EIO, "STATUS_SMARTCARD_CARD_BLOCKED"},
+	{STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED, -EIO,
+	"STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED"},
+	{STATUS_SMARTCARD_NO_CARD, -EIO, "STATUS_SMARTCARD_NO_CARD"},
+	{STATUS_SMARTCARD_NO_KEY_CONTAINER, -EIO,
+	"STATUS_SMARTCARD_NO_KEY_CONTAINER"},
+	{STATUS_SMARTCARD_NO_CERTIFICATE, -EIO,
+	"STATUS_SMARTCARD_NO_CERTIFICATE"},
+	{STATUS_SMARTCARD_NO_KEYSET, -EIO, "STATUS_SMARTCARD_NO_KEYSET"},
+	{STATUS_SMARTCARD_IO_ERROR, -EIO, "STATUS_SMARTCARD_IO_ERROR"},
+	{STATUS_DOWNGRADE_DETECTED, -EIO, "STATUS_DOWNGRADE_DETECTED"},
+	{STATUS_SMARTCARD_CERT_REVOKED, -EIO, "STATUS_SMARTCARD_CERT_REVOKED"},
+	{STATUS_ISSUING_CA_UNTRUSTED, -EIO, "STATUS_ISSUING_CA_UNTRUSTED"},
+	{STATUS_REVOCATION_OFFLINE_C, -EIO, "STATUS_REVOCATION_OFFLINE_C"},
+	{STATUS_PKINIT_CLIENT_FAILURE, -EIO, "STATUS_PKINIT_CLIENT_FAILURE"},
+	{STATUS_SMARTCARD_CERT_EXPIRED, -EIO, "STATUS_SMARTCARD_CERT_EXPIRED"},
+	{STATUS_DRIVER_FAILED_PRIOR_UNLOAD, -EIO,
+	"STATUS_DRIVER_FAILED_PRIOR_UNLOAD"},
+	{STATUS_SMARTCARD_SILENT_CONTEXT, -EIO,
+	"STATUS_SMARTCARD_SILENT_CONTEXT"},
+	{STATUS_PER_USER_TRUST_QUOTA_EXCEEDED, -EDQUOT,
+	"STATUS_PER_USER_TRUST_QUOTA_EXCEEDED"},
+	{STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED, -EDQUOT,
+	"STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED"},
+	{STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED, -EDQUOT,
+	"STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED"},
+	{STATUS_DS_NAME_NOT_UNIQUE, -EIO, "STATUS_DS_NAME_NOT_UNIQUE"},
+	{STATUS_DS_DUPLICATE_ID_FOUND, -EIO, "STATUS_DS_DUPLICATE_ID_FOUND"},
+	{STATUS_DS_GROUP_CONVERSION_ERROR, -EIO,
+	"STATUS_DS_GROUP_CONVERSION_ERROR"},
+	{STATUS_VOLSNAP_PREPARE_HIBERNATE, -EIO,
+	"STATUS_VOLSNAP_PREPARE_HIBERNATE"},
+	{STATUS_USER2USER_REQUIRED, -EIO, "STATUS_USER2USER_REQUIRED"},
+	{STATUS_STACK_BUFFER_OVERRUN, -EIO, "STATUS_STACK_BUFFER_OVERRUN"},
+	{STATUS_NO_S4U_PROT_SUPPORT, -EIO, "STATUS_NO_S4U_PROT_SUPPORT"},
+	{STATUS_CROSSREALM_DELEGATION_FAILURE, -EIO,
+	"STATUS_CROSSREALM_DELEGATION_FAILURE"},
+	{STATUS_REVOCATION_OFFLINE_KDC, -EIO, "STATUS_REVOCATION_OFFLINE_KDC"},
+	{STATUS_ISSUING_CA_UNTRUSTED_KDC, -EIO,
+	"STATUS_ISSUING_CA_UNTRUSTED_KDC"},
+	{STATUS_KDC_CERT_EXPIRED, -EIO, "STATUS_KDC_CERT_EXPIRED"},
+	{STATUS_KDC_CERT_REVOKED, -EIO, "STATUS_KDC_CERT_REVOKED"},
+	{STATUS_PARAMETER_QUOTA_EXCEEDED, -EDQUOT,
+	"STATUS_PARAMETER_QUOTA_EXCEEDED"},
+	{STATUS_HIBERNATION_FAILURE, -EIO, "STATUS_HIBERNATION_FAILURE"},
+	{STATUS_DELAY_LOAD_FAILED, -EIO, "STATUS_DELAY_LOAD_FAILED"},
+	{STATUS_AUTHENTICATION_FIREWALL_FAILED, -EIO,
+	"STATUS_AUTHENTICATION_FIREWALL_FAILED"},
+	{STATUS_VDM_DISALLOWED, -EIO, "STATUS_VDM_DISALLOWED"},
+	{STATUS_HUNG_DISPLAY_DRIVER_THREAD, -EIO,
+	"STATUS_HUNG_DISPLAY_DRIVER_THREAD"},
+	{STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE, -EIO,
+	"STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE"},
+	{STATUS_INVALID_CRUNTIME_PARAMETER, -EIO,
+	"STATUS_INVALID_CRUNTIME_PARAMETER"},
+	{STATUS_NTLM_BLOCKED, -EIO, "STATUS_NTLM_BLOCKED"},
+	{STATUS_ASSERTION_FAILURE, -EIO, "STATUS_ASSERTION_FAILURE"},
+	{STATUS_VERIFIER_STOP, -EIO, "STATUS_VERIFIER_STOP"},
+	{STATUS_CALLBACK_POP_STACK, -EIO, "STATUS_CALLBACK_POP_STACK"},
+	{STATUS_INCOMPATIBLE_DRIVER_BLOCKED, -EIO,
+	"STATUS_INCOMPATIBLE_DRIVER_BLOCKED"},
+	{STATUS_HIVE_UNLOADED, -EIO, "STATUS_HIVE_UNLOADED"},
+	{STATUS_COMPRESSION_DISABLED, -EIO, "STATUS_COMPRESSION_DISABLED"},
+	{STATUS_FILE_SYSTEM_LIMITATION, -EIO, "STATUS_FILE_SYSTEM_LIMITATION"},
+	{STATUS_INVALID_IMAGE_HASH, -EIO, "STATUS_INVALID_IMAGE_HASH"},
+	{STATUS_NOT_CAPABLE, -EIO, "STATUS_NOT_CAPABLE"},
+	{STATUS_REQUEST_OUT_OF_SEQUENCE, -EIO,
+	"STATUS_REQUEST_OUT_OF_SEQUENCE"},
+	{STATUS_IMPLEMENTATION_LIMIT, -EIO, "STATUS_IMPLEMENTATION_LIMIT"},
+	{STATUS_ELEVATION_REQUIRED, -EIO, "STATUS_ELEVATION_REQUIRED"},
+	{STATUS_BEYOND_VDL, -EIO, "STATUS_BEYOND_VDL"},
+	{STATUS_ENCOUNTERED_WRITE_IN_PROGRESS, -EIO,
+	"STATUS_ENCOUNTERED_WRITE_IN_PROGRESS"},
+	{STATUS_PTE_CHANGED, -EIO, "STATUS_PTE_CHANGED"},
+	{STATUS_PURGE_FAILED, -EIO, "STATUS_PURGE_FAILED"},
+	{STATUS_CRED_REQUIRES_CONFIRMATION, -EIO,
+	"STATUS_CRED_REQUIRES_CONFIRMATION"},
+	{STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE, -EIO,
+	"STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE"},
+	{STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER, -EIO,
+	"STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER"},
+	{STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE, -EIO,
+	"STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE"},
+	{STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE, -EIO,
+	"STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE"},
+	{STATUS_CS_ENCRYPTION_FILE_NOT_CSE, -EIO,
+	"STATUS_CS_ENCRYPTION_FILE_NOT_CSE"},
+	{STATUS_INVALID_LABEL, -EIO, "STATUS_INVALID_LABEL"},
+	{STATUS_DRIVER_PROCESS_TERMINATED, -EIO,
+	"STATUS_DRIVER_PROCESS_TERMINATED"},
+	{STATUS_AMBIGUOUS_SYSTEM_DEVICE, -EIO,
+	"STATUS_AMBIGUOUS_SYSTEM_DEVICE"},
+	{STATUS_SYSTEM_DEVICE_NOT_FOUND, -EIO,
+	"STATUS_SYSTEM_DEVICE_NOT_FOUND"},
+	{STATUS_RESTART_BOOT_APPLICATION, -EIO,
+	"STATUS_RESTART_BOOT_APPLICATION"},
+	{STATUS_INVALID_TASK_NAME, -EIO, "STATUS_INVALID_TASK_NAME"},
+	{STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"},
+	{STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"},
+	{STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"},
+	{STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"},
+	{STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"},
+	{STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"},
+	{STATUS_REQUEST_CANCELED, -EIO, "STATUS_REQUEST_CANCELED"},
+	{STATUS_RECURSIVE_DISPATCH, -EIO, "STATUS_RECURSIVE_DISPATCH"},
+	{STATUS_LPC_RECEIVE_BUFFER_EXPECTED, -EIO,
+	"STATUS_LPC_RECEIVE_BUFFER_EXPECTED"},
+	{STATUS_LPC_INVALID_CONNECTION_USAGE, -EIO,
+	"STATUS_LPC_INVALID_CONNECTION_USAGE"},
+	{STATUS_LPC_REQUESTS_NOT_ALLOWED, -EIO,
+	"STATUS_LPC_REQUESTS_NOT_ALLOWED"},
+	{STATUS_RESOURCE_IN_USE, -EIO, "STATUS_RESOURCE_IN_USE"},
+	{STATUS_HARDWARE_MEMORY_ERROR, -EIO, "STATUS_HARDWARE_MEMORY_ERROR"},
+	{STATUS_THREADPOOL_HANDLE_EXCEPTION, -EIO,
+	"STATUS_THREADPOOL_HANDLE_EXCEPTION"},
+	{STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED, -EIO,
+	"STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED"},
+	{STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED, -EIO,
+	"STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED"},
+	{STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED, -EIO,
+	"STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED"},
+	{STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED, -EIO,
+	"STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED"},
+	{STATUS_THREADPOOL_RELEASED_DURING_OPERATION, -EIO,
+	"STATUS_THREADPOOL_RELEASED_DURING_OPERATION"},
+	{STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING, -EIO,
+	"STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING"},
+	{STATUS_APC_RETURNED_WHILE_IMPERSONATING, -EIO,
+	"STATUS_APC_RETURNED_WHILE_IMPERSONATING"},
+	{STATUS_PROCESS_IS_PROTECTED, -EIO, "STATUS_PROCESS_IS_PROTECTED"},
+	{STATUS_MCA_EXCEPTION, -EIO, "STATUS_MCA_EXCEPTION"},
+	{STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE, -EIO,
+	"STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE"},
+	{STATUS_SYMLINK_CLASS_DISABLED, -EIO, "STATUS_SYMLINK_CLASS_DISABLED"},
+	{STATUS_INVALID_IDN_NORMALIZATION, -EIO,
+	"STATUS_INVALID_IDN_NORMALIZATION"},
+	{STATUS_NO_UNICODE_TRANSLATION, -EIO, "STATUS_NO_UNICODE_TRANSLATION"},
+	{STATUS_ALREADY_REGISTERED, -EIO, "STATUS_ALREADY_REGISTERED"},
+	{STATUS_CONTEXT_MISMATCH, -EIO, "STATUS_CONTEXT_MISMATCH"},
+	{STATUS_PORT_ALREADY_HAS_COMPLETION_LIST, -EIO,
+	"STATUS_PORT_ALREADY_HAS_COMPLETION_LIST"},
+	{STATUS_CALLBACK_RETURNED_THREAD_PRIORITY, -EIO,
+	"STATUS_CALLBACK_RETURNED_THREAD_PRIORITY"},
+	{STATUS_INVALID_THREAD, -EIO, "STATUS_INVALID_THREAD"},
+	{STATUS_CALLBACK_RETURNED_TRANSACTION, -EIO,
+	"STATUS_CALLBACK_RETURNED_TRANSACTION"},
+	{STATUS_CALLBACK_RETURNED_LDR_LOCK, -EIO,
+	"STATUS_CALLBACK_RETURNED_LDR_LOCK"},
+	{STATUS_CALLBACK_RETURNED_LANG, -EIO, "STATUS_CALLBACK_RETURNED_LANG"},
+	{STATUS_CALLBACK_RETURNED_PRI_BACK, -EIO,
+	"STATUS_CALLBACK_RETURNED_PRI_BACK"},
+	{STATUS_CALLBACK_RETURNED_THREAD_AFFINITY, -EIO,
+	"STATUS_CALLBACK_RETURNED_THREAD_AFFINITY"},
+	{STATUS_DISK_REPAIR_DISABLED, -EIO, "STATUS_DISK_REPAIR_DISABLED"},
+	{STATUS_DS_DOMAIN_RENAME_IN_PROGRESS, -EIO,
+	"STATUS_DS_DOMAIN_RENAME_IN_PROGRESS"},
+	{STATUS_DISK_QUOTA_EXCEEDED, -EDQUOT, "STATUS_DISK_QUOTA_EXCEEDED"},
+	{STATUS_CONTENT_BLOCKED, -EIO, "STATUS_CONTENT_BLOCKED"},
+	{STATUS_BAD_CLUSTERS, -EIO, "STATUS_BAD_CLUSTERS"},
+	{STATUS_VOLUME_DIRTY, -EIO, "STATUS_VOLUME_DIRTY"},
+	{STATUS_FILE_CHECKED_OUT, -EIO, "STATUS_FILE_CHECKED_OUT"},
+	{STATUS_CHECKOUT_REQUIRED, -EIO, "STATUS_CHECKOUT_REQUIRED"},
+	{STATUS_BAD_FILE_TYPE, -EIO, "STATUS_BAD_FILE_TYPE"},
+	{STATUS_FILE_TOO_LARGE, -EIO, "STATUS_FILE_TOO_LARGE"},
+	{STATUS_FORMS_AUTH_REQUIRED, -EIO, "STATUS_FORMS_AUTH_REQUIRED"},
+	{STATUS_VIRUS_INFECTED, -EIO, "STATUS_VIRUS_INFECTED"},
+	{STATUS_VIRUS_DELETED, -EIO, "STATUS_VIRUS_DELETED"},
+	{STATUS_BAD_MCFG_TABLE, -EIO, "STATUS_BAD_MCFG_TABLE"},
+	{STATUS_WOW_ASSERTION, -EIO, "STATUS_WOW_ASSERTION"},
+	{STATUS_INVALID_SIGNATURE, -EIO, "STATUS_INVALID_SIGNATURE"},
+	{STATUS_HMAC_NOT_SUPPORTED, -EIO, "STATUS_HMAC_NOT_SUPPORTED"},
+	{STATUS_IPSEC_QUEUE_OVERFLOW, -EIO, "STATUS_IPSEC_QUEUE_OVERFLOW"},
+	{STATUS_ND_QUEUE_OVERFLOW, -EIO, "STATUS_ND_QUEUE_OVERFLOW"},
+	{STATUS_HOPLIMIT_EXCEEDED, -EIO, "STATUS_HOPLIMIT_EXCEEDED"},
+	{STATUS_PROTOCOL_NOT_SUPPORTED, -EOPNOTSUPP,
+	"STATUS_PROTOCOL_NOT_SUPPORTED"},
+	{STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED, -EIO,
+	"STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED"},
+	{STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR, -EIO,
+	"STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR"},
+	{STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR, -EIO,
+	"STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR"},
+	{STATUS_XML_PARSE_ERROR, -EIO, "STATUS_XML_PARSE_ERROR"},
+	{STATUS_XMLDSIG_ERROR, -EIO, "STATUS_XMLDSIG_ERROR"},
+	{STATUS_WRONG_COMPARTMENT, -EIO, "STATUS_WRONG_COMPARTMENT"},
+	{STATUS_AUTHIP_FAILURE, -EIO, "STATUS_AUTHIP_FAILURE"},
+	{DBG_NO_STATE_CHANGE, -EIO, "DBG_NO_STATE_CHANGE"},
+	{DBG_APP_NOT_IDLE, -EIO, "DBG_APP_NOT_IDLE"},
+	{RPC_NT_INVALID_STRING_BINDING, -EIO, "RPC_NT_INVALID_STRING_BINDING"},
+	{RPC_NT_WRONG_KIND_OF_BINDING, -EIO, "RPC_NT_WRONG_KIND_OF_BINDING"},
+	{RPC_NT_INVALID_BINDING, -EIO, "RPC_NT_INVALID_BINDING"},
+	{RPC_NT_PROTSEQ_NOT_SUPPORTED, -EOPNOTSUPP,
+	"RPC_NT_PROTSEQ_NOT_SUPPORTED"},
+	{RPC_NT_INVALID_RPC_PROTSEQ, -EIO, "RPC_NT_INVALID_RPC_PROTSEQ"},
+	{RPC_NT_INVALID_STRING_UUID, -EIO, "RPC_NT_INVALID_STRING_UUID"},
+	{RPC_NT_INVALID_ENDPOINT_FORMAT, -EIO,
+	"RPC_NT_INVALID_ENDPOINT_FORMAT"},
+	{RPC_NT_INVALID_NET_ADDR, -EIO, "RPC_NT_INVALID_NET_ADDR"},
+	{RPC_NT_NO_ENDPOINT_FOUND, -EIO, "RPC_NT_NO_ENDPOINT_FOUND"},
+	{RPC_NT_INVALID_TIMEOUT, -EINVAL, "RPC_NT_INVALID_TIMEOUT"},
+	{RPC_NT_OBJECT_NOT_FOUND, -ENOENT, "RPC_NT_OBJECT_NOT_FOUND"},
+	{RPC_NT_ALREADY_REGISTERED, -EIO, "RPC_NT_ALREADY_REGISTERED"},
+	{RPC_NT_TYPE_ALREADY_REGISTERED, -EIO,
+	"RPC_NT_TYPE_ALREADY_REGISTERED"},
+	{RPC_NT_ALREADY_LISTENING, -EIO, "RPC_NT_ALREADY_LISTENING"},
+	{RPC_NT_NO_PROTSEQS_REGISTERED, -EIO, "RPC_NT_NO_PROTSEQS_REGISTERED"},
+	{RPC_NT_NOT_LISTENING, -EIO, "RPC_NT_NOT_LISTENING"},
+	{RPC_NT_UNKNOWN_MGR_TYPE, -EIO, "RPC_NT_UNKNOWN_MGR_TYPE"},
+	{RPC_NT_UNKNOWN_IF, -EIO, "RPC_NT_UNKNOWN_IF"},
+	{RPC_NT_NO_BINDINGS, -EIO, "RPC_NT_NO_BINDINGS"},
+	{RPC_NT_NO_PROTSEQS, -EIO, "RPC_NT_NO_PROTSEQS"},
+	{RPC_NT_CANT_CREATE_ENDPOINT, -EIO, "RPC_NT_CANT_CREATE_ENDPOINT"},
+	{RPC_NT_OUT_OF_RESOURCES, -EIO, "RPC_NT_OUT_OF_RESOURCES"},
+	{RPC_NT_SERVER_UNAVAILABLE, -EIO, "RPC_NT_SERVER_UNAVAILABLE"},
+	{RPC_NT_SERVER_TOO_BUSY, -EBUSY, "RPC_NT_SERVER_TOO_BUSY"},
+	{RPC_NT_INVALID_NETWORK_OPTIONS, -EIO,
+	"RPC_NT_INVALID_NETWORK_OPTIONS"},
+	{RPC_NT_NO_CALL_ACTIVE, -EIO, "RPC_NT_NO_CALL_ACTIVE"},
+	{RPC_NT_CALL_FAILED, -EIO, "RPC_NT_CALL_FAILED"},
+	{RPC_NT_CALL_FAILED_DNE, -EIO, "RPC_NT_CALL_FAILED_DNE"},
+	{RPC_NT_PROTOCOL_ERROR, -EIO, "RPC_NT_PROTOCOL_ERROR"},
+	{RPC_NT_UNSUPPORTED_TRANS_SYN, -EIO, "RPC_NT_UNSUPPORTED_TRANS_SYN"},
+	{RPC_NT_UNSUPPORTED_TYPE, -EIO, "RPC_NT_UNSUPPORTED_TYPE"},
+	{RPC_NT_INVALID_TAG, -EIO, "RPC_NT_INVALID_TAG"},
+	{RPC_NT_INVALID_BOUND, -EIO, "RPC_NT_INVALID_BOUND"},
+	{RPC_NT_NO_ENTRY_NAME, -EIO, "RPC_NT_NO_ENTRY_NAME"},
+	{RPC_NT_INVALID_NAME_SYNTAX, -EIO, "RPC_NT_INVALID_NAME_SYNTAX"},
+	{RPC_NT_UNSUPPORTED_NAME_SYNTAX, -EIO,
+	"RPC_NT_UNSUPPORTED_NAME_SYNTAX"},
+	{RPC_NT_UUID_NO_ADDRESS, -EIO, "RPC_NT_UUID_NO_ADDRESS"},
+	{RPC_NT_DUPLICATE_ENDPOINT, -ENOTUNIQ, "RPC_NT_DUPLICATE_ENDPOINT"},
+	{RPC_NT_UNKNOWN_AUTHN_TYPE, -EIO, "RPC_NT_UNKNOWN_AUTHN_TYPE"},
+	{RPC_NT_MAX_CALLS_TOO_SMALL, -EIO, "RPC_NT_MAX_CALLS_TOO_SMALL"},
+	{RPC_NT_STRING_TOO_LONG, -EIO, "RPC_NT_STRING_TOO_LONG"},
+	{RPC_NT_PROTSEQ_NOT_FOUND, -EIO, "RPC_NT_PROTSEQ_NOT_FOUND"},
+	{RPC_NT_PROCNUM_OUT_OF_RANGE, -EIO, "RPC_NT_PROCNUM_OUT_OF_RANGE"},
+	{RPC_NT_BINDING_HAS_NO_AUTH, -EIO, "RPC_NT_BINDING_HAS_NO_AUTH"},
+	{RPC_NT_UNKNOWN_AUTHN_SERVICE, -EIO, "RPC_NT_UNKNOWN_AUTHN_SERVICE"},
+	{RPC_NT_UNKNOWN_AUTHN_LEVEL, -EIO, "RPC_NT_UNKNOWN_AUTHN_LEVEL"},
+	{RPC_NT_INVALID_AUTH_IDENTITY, -EIO, "RPC_NT_INVALID_AUTH_IDENTITY"},
+	{RPC_NT_UNKNOWN_AUTHZ_SERVICE, -EIO, "RPC_NT_UNKNOWN_AUTHZ_SERVICE"},
+	{EPT_NT_INVALID_ENTRY, -EIO, "EPT_NT_INVALID_ENTRY"},
+	{EPT_NT_CANT_PERFORM_OP, -EIO, "EPT_NT_CANT_PERFORM_OP"},
+	{EPT_NT_NOT_REGISTERED, -EIO, "EPT_NT_NOT_REGISTERED"},
+	{RPC_NT_NOTHING_TO_EXPORT, -EIO, "RPC_NT_NOTHING_TO_EXPORT"},
+	{RPC_NT_INCOMPLETE_NAME, -EIO, "RPC_NT_INCOMPLETE_NAME"},
+	{RPC_NT_INVALID_VERS_OPTION, -EIO, "RPC_NT_INVALID_VERS_OPTION"},
+	{RPC_NT_NO_MORE_MEMBERS, -EIO, "RPC_NT_NO_MORE_MEMBERS"},
+	{RPC_NT_NOT_ALL_OBJS_UNEXPORTED, -EIO,
+	"RPC_NT_NOT_ALL_OBJS_UNEXPORTED"},
+	{RPC_NT_INTERFACE_NOT_FOUND, -EIO, "RPC_NT_INTERFACE_NOT_FOUND"},
+	{RPC_NT_ENTRY_ALREADY_EXISTS, -EIO, "RPC_NT_ENTRY_ALREADY_EXISTS"},
+	{RPC_NT_ENTRY_NOT_FOUND, -EIO, "RPC_NT_ENTRY_NOT_FOUND"},
+	{RPC_NT_NAME_SERVICE_UNAVAILABLE, -EIO,
+	"RPC_NT_NAME_SERVICE_UNAVAILABLE"},
+	{RPC_NT_INVALID_NAF_ID, -EIO, "RPC_NT_INVALID_NAF_ID"},
+	{RPC_NT_CANNOT_SUPPORT, -EOPNOTSUPP, "RPC_NT_CANNOT_SUPPORT"},
+	{RPC_NT_NO_CONTEXT_AVAILABLE, -EIO, "RPC_NT_NO_CONTEXT_AVAILABLE"},
+	{RPC_NT_INTERNAL_ERROR, -EIO, "RPC_NT_INTERNAL_ERROR"},
+	{RPC_NT_ZERO_DIVIDE, -EIO, "RPC_NT_ZERO_DIVIDE"},
+	{RPC_NT_ADDRESS_ERROR, -EIO, "RPC_NT_ADDRESS_ERROR"},
+	{RPC_NT_FP_DIV_ZERO, -EIO, "RPC_NT_FP_DIV_ZERO"},
+	{RPC_NT_FP_UNDERFLOW, -EIO, "RPC_NT_FP_UNDERFLOW"},
+	{RPC_NT_FP_OVERFLOW, -EIO, "RPC_NT_FP_OVERFLOW"},
+	{RPC_NT_CALL_IN_PROGRESS, -EIO, "RPC_NT_CALL_IN_PROGRESS"},
+	{RPC_NT_NO_MORE_BINDINGS, -EIO, "RPC_NT_NO_MORE_BINDINGS"},
+	{RPC_NT_GROUP_MEMBER_NOT_FOUND, -EIO, "RPC_NT_GROUP_MEMBER_NOT_FOUND"},
+	{EPT_NT_CANT_CREATE, -EIO, "EPT_NT_CANT_CREATE"},
+	{RPC_NT_INVALID_OBJECT, -EIO, "RPC_NT_INVALID_OBJECT"},
+	{RPC_NT_NO_INTERFACES, -EIO, "RPC_NT_NO_INTERFACES"},
+	{RPC_NT_CALL_CANCELLED, -EIO, "RPC_NT_CALL_CANCELLED"},
+	{RPC_NT_BINDING_INCOMPLETE, -EIO, "RPC_NT_BINDING_INCOMPLETE"},
+	{RPC_NT_COMM_FAILURE, -EIO, "RPC_NT_COMM_FAILURE"},
+	{RPC_NT_UNSUPPORTED_AUTHN_LEVEL, -EIO,
+	"RPC_NT_UNSUPPORTED_AUTHN_LEVEL"},
+	{RPC_NT_NO_PRINC_NAME, -EIO, "RPC_NT_NO_PRINC_NAME"},
+	{RPC_NT_NOT_RPC_ERROR, -EIO, "RPC_NT_NOT_RPC_ERROR"},
+	{RPC_NT_SEC_PKG_ERROR, -EIO, "RPC_NT_SEC_PKG_ERROR"},
+	{RPC_NT_NOT_CANCELLED, -EIO, "RPC_NT_NOT_CANCELLED"},
+	{RPC_NT_INVALID_ASYNC_HANDLE, -EIO, "RPC_NT_INVALID_ASYNC_HANDLE"},
+	{RPC_NT_INVALID_ASYNC_CALL, -EIO, "RPC_NT_INVALID_ASYNC_CALL"},
+	{RPC_NT_PROXY_ACCESS_DENIED, -EACCES, "RPC_NT_PROXY_ACCESS_DENIED"},
+	{RPC_NT_NO_MORE_ENTRIES, -EIO, "RPC_NT_NO_MORE_ENTRIES"},
+	{RPC_NT_SS_CHAR_TRANS_OPEN_FAIL, -EIO,
+	"RPC_NT_SS_CHAR_TRANS_OPEN_FAIL"},
+	{RPC_NT_SS_CHAR_TRANS_SHORT_FILE, -EIO,
+	"RPC_NT_SS_CHAR_TRANS_SHORT_FILE"},
+	{RPC_NT_SS_IN_NULL_CONTEXT, -EIO, "RPC_NT_SS_IN_NULL_CONTEXT"},
+	{RPC_NT_SS_CONTEXT_MISMATCH, -EIO, "RPC_NT_SS_CONTEXT_MISMATCH"},
+	{RPC_NT_SS_CONTEXT_DAMAGED, -EIO, "RPC_NT_SS_CONTEXT_DAMAGED"},
+	{RPC_NT_SS_HANDLES_MISMATCH, -EIO, "RPC_NT_SS_HANDLES_MISMATCH"},
+	{RPC_NT_SS_CANNOT_GET_CALL_HANDLE, -EIO,
+	"RPC_NT_SS_CANNOT_GET_CALL_HANDLE"},
+	{RPC_NT_NULL_REF_POINTER, -EIO, "RPC_NT_NULL_REF_POINTER"},
+	{RPC_NT_ENUM_VALUE_OUT_OF_RANGE, -EIO,
+	"RPC_NT_ENUM_VALUE_OUT_OF_RANGE"},
+	{RPC_NT_BYTE_COUNT_TOO_SMALL, -EIO, "RPC_NT_BYTE_COUNT_TOO_SMALL"},
+	{RPC_NT_BAD_STUB_DATA, -EIO, "RPC_NT_BAD_STUB_DATA"},
+	{RPC_NT_INVALID_ES_ACTION, -EIO, "RPC_NT_INVALID_ES_ACTION"},
+	{RPC_NT_WRONG_ES_VERSION, -EIO, "RPC_NT_WRONG_ES_VERSION"},
+	{RPC_NT_WRONG_STUB_VERSION, -EIO, "RPC_NT_WRONG_STUB_VERSION"},
+	{RPC_NT_INVALID_PIPE_OBJECT, -EIO, "RPC_NT_INVALID_PIPE_OBJECT"},
+	{RPC_NT_INVALID_PIPE_OPERATION, -EIO, "RPC_NT_INVALID_PIPE_OPERATION"},
+	{RPC_NT_WRONG_PIPE_VERSION, -EIO, "RPC_NT_WRONG_PIPE_VERSION"},
+	{RPC_NT_PIPE_CLOSED, -EIO, "RPC_NT_PIPE_CLOSED"},
+	{RPC_NT_PIPE_DISCIPLINE_ERROR, -EIO, "RPC_NT_PIPE_DISCIPLINE_ERROR"},
+	{RPC_NT_PIPE_EMPTY, -EIO, "RPC_NT_PIPE_EMPTY"},
+	{STATUS_PNP_BAD_MPS_TABLE, -EIO, "STATUS_PNP_BAD_MPS_TABLE"},
+	{STATUS_PNP_TRANSLATION_FAILED, -EIO, "STATUS_PNP_TRANSLATION_FAILED"},
+	{STATUS_PNP_IRQ_TRANSLATION_FAILED, -EIO,
+	"STATUS_PNP_IRQ_TRANSLATION_FAILED"},
+	{STATUS_PNP_INVALID_ID, -EIO, "STATUS_PNP_INVALID_ID"},
+	{STATUS_IO_REISSUE_AS_CACHED, -EIO, "STATUS_IO_REISSUE_AS_CACHED"},
+	{STATUS_CTX_WINSTATION_NAME_INVALID, -EIO,
+	"STATUS_CTX_WINSTATION_NAME_INVALID"},
+	{STATUS_CTX_INVALID_PD, -EIO, "STATUS_CTX_INVALID_PD"},
+	{STATUS_CTX_PD_NOT_FOUND, -EIO, "STATUS_CTX_PD_NOT_FOUND"},
+	{STATUS_CTX_CLOSE_PENDING, -EIO, "STATUS_CTX_CLOSE_PENDING"},
+	{STATUS_CTX_NO_OUTBUF, -EIO, "STATUS_CTX_NO_OUTBUF"},
+	{STATUS_CTX_MODEM_INF_NOT_FOUND, -EIO,
+	"STATUS_CTX_MODEM_INF_NOT_FOUND"},
+	{STATUS_CTX_INVALID_MODEMNAME, -EIO, "STATUS_CTX_INVALID_MODEMNAME"},
+	{STATUS_CTX_RESPONSE_ERROR, -EIO, "STATUS_CTX_RESPONSE_ERROR"},
+	{STATUS_CTX_MODEM_RESPONSE_TIMEOUT, -ETIMEDOUT,
+	"STATUS_CTX_MODEM_RESPONSE_TIMEOUT"},
+	{STATUS_CTX_MODEM_RESPONSE_NO_CARRIER, -EIO,
+	"STATUS_CTX_MODEM_RESPONSE_NO_CARRIER"},
+	{STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE, -EIO,
+	"STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE"},
+	{STATUS_CTX_MODEM_RESPONSE_BUSY, -EBUSY,
+	"STATUS_CTX_MODEM_RESPONSE_BUSY"},
+	{STATUS_CTX_MODEM_RESPONSE_VOICE, -EIO,
+	"STATUS_CTX_MODEM_RESPONSE_VOICE"},
+	{STATUS_CTX_TD_ERROR, -EIO, "STATUS_CTX_TD_ERROR"},
+	{STATUS_CTX_LICENSE_CLIENT_INVALID, -EIO,
+	"STATUS_CTX_LICENSE_CLIENT_INVALID"},
+	{STATUS_CTX_LICENSE_NOT_AVAILABLE, -EIO,
+	"STATUS_CTX_LICENSE_NOT_AVAILABLE"},
+	{STATUS_CTX_LICENSE_EXPIRED, -EIO, "STATUS_CTX_LICENSE_EXPIRED"},
+	{STATUS_CTX_WINSTATION_NOT_FOUND, -EIO,
+	"STATUS_CTX_WINSTATION_NOT_FOUND"},
+	{STATUS_CTX_WINSTATION_NAME_COLLISION, -EIO,
+	"STATUS_CTX_WINSTATION_NAME_COLLISION"},
+	{STATUS_CTX_WINSTATION_BUSY, -EBUSY, "STATUS_CTX_WINSTATION_BUSY"},
+	{STATUS_CTX_BAD_VIDEO_MODE, -EIO, "STATUS_CTX_BAD_VIDEO_MODE"},
+	{STATUS_CTX_GRAPHICS_INVALID, -EIO, "STATUS_CTX_GRAPHICS_INVALID"},
+	{STATUS_CTX_NOT_CONSOLE, -EIO, "STATUS_CTX_NOT_CONSOLE"},
+	{STATUS_CTX_CLIENT_QUERY_TIMEOUT, -EIO,
+	"STATUS_CTX_CLIENT_QUERY_TIMEOUT"},
+	{STATUS_CTX_CONSOLE_DISCONNECT, -EIO, "STATUS_CTX_CONSOLE_DISCONNECT"},
+	{STATUS_CTX_CONSOLE_CONNECT, -EIO, "STATUS_CTX_CONSOLE_CONNECT"},
+	{STATUS_CTX_SHADOW_DENIED, -EIO, "STATUS_CTX_SHADOW_DENIED"},
+	{STATUS_CTX_WINSTATION_ACCESS_DENIED, -EACCES,
+	"STATUS_CTX_WINSTATION_ACCESS_DENIED"},
+	{STATUS_CTX_INVALID_WD, -EIO, "STATUS_CTX_INVALID_WD"},
+	{STATUS_CTX_WD_NOT_FOUND, -EIO, "STATUS_CTX_WD_NOT_FOUND"},
+	{STATUS_CTX_SHADOW_INVALID, -EIO, "STATUS_CTX_SHADOW_INVALID"},
+	{STATUS_CTX_SHADOW_DISABLED, -EIO, "STATUS_CTX_SHADOW_DISABLED"},
+	{STATUS_RDP_PROTOCOL_ERROR, -EIO, "STATUS_RDP_PROTOCOL_ERROR"},
+	{STATUS_CTX_CLIENT_LICENSE_NOT_SET, -EIO,
+	"STATUS_CTX_CLIENT_LICENSE_NOT_SET"},
+	{STATUS_CTX_CLIENT_LICENSE_IN_USE, -EIO,
+	"STATUS_CTX_CLIENT_LICENSE_IN_USE"},
+	{STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE, -EIO,
+	"STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE"},
+	{STATUS_CTX_SHADOW_NOT_RUNNING, -EIO, "STATUS_CTX_SHADOW_NOT_RUNNING"},
+	{STATUS_CTX_LOGON_DISABLED, -EIO, "STATUS_CTX_LOGON_DISABLED"},
+	{STATUS_CTX_SECURITY_LAYER_ERROR, -EIO,
+	"STATUS_CTX_SECURITY_LAYER_ERROR"},
+	{STATUS_TS_INCOMPATIBLE_SESSIONS, -EIO,
+	"STATUS_TS_INCOMPATIBLE_SESSIONS"},
+	{STATUS_MUI_FILE_NOT_FOUND, -EIO, "STATUS_MUI_FILE_NOT_FOUND"},
+	{STATUS_MUI_INVALID_FILE, -EIO, "STATUS_MUI_INVALID_FILE"},
+	{STATUS_MUI_INVALID_RC_CONFIG, -EIO, "STATUS_MUI_INVALID_RC_CONFIG"},
+	{STATUS_MUI_INVALID_LOCALE_NAME, -EIO,
+	"STATUS_MUI_INVALID_LOCALE_NAME"},
+	{STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME, -EIO,
+	"STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME"},
+	{STATUS_MUI_FILE_NOT_LOADED, -EIO, "STATUS_MUI_FILE_NOT_LOADED"},
+	{STATUS_RESOURCE_ENUM_USER_STOP, -EIO,
+	"STATUS_RESOURCE_ENUM_USER_STOP"},
+	{STATUS_CLUSTER_INVALID_NODE, -EIO, "STATUS_CLUSTER_INVALID_NODE"},
+	{STATUS_CLUSTER_NODE_EXISTS, -EIO, "STATUS_CLUSTER_NODE_EXISTS"},
+	{STATUS_CLUSTER_JOIN_IN_PROGRESS, -EIO,
+	"STATUS_CLUSTER_JOIN_IN_PROGRESS"},
+	{STATUS_CLUSTER_NODE_NOT_FOUND, -EIO, "STATUS_CLUSTER_NODE_NOT_FOUND"},
+	{STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND, -EIO,
+	"STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND"},
+	{STATUS_CLUSTER_NETWORK_EXISTS, -EIO, "STATUS_CLUSTER_NETWORK_EXISTS"},
+	{STATUS_CLUSTER_NETWORK_NOT_FOUND, -EIO,
+	"STATUS_CLUSTER_NETWORK_NOT_FOUND"},
+	{STATUS_CLUSTER_NETINTERFACE_EXISTS, -EIO,
+	"STATUS_CLUSTER_NETINTERFACE_EXISTS"},
+	{STATUS_CLUSTER_NETINTERFACE_NOT_FOUND, -EIO,
+	"STATUS_CLUSTER_NETINTERFACE_NOT_FOUND"},
+	{STATUS_CLUSTER_INVALID_REQUEST, -EIO,
+	"STATUS_CLUSTER_INVALID_REQUEST"},
+	{STATUS_CLUSTER_INVALID_NETWORK_PROVIDER, -EIO,
+	"STATUS_CLUSTER_INVALID_NETWORK_PROVIDER"},
+	{STATUS_CLUSTER_NODE_DOWN, -EIO, "STATUS_CLUSTER_NODE_DOWN"},
+	{STATUS_CLUSTER_NODE_UNREACHABLE, -EIO,
+	"STATUS_CLUSTER_NODE_UNREACHABLE"},
+	{STATUS_CLUSTER_NODE_NOT_MEMBER, -EIO,
+	"STATUS_CLUSTER_NODE_NOT_MEMBER"},
+	{STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS, -EIO,
+	"STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS"},
+	{STATUS_CLUSTER_INVALID_NETWORK, -EIO,
+	"STATUS_CLUSTER_INVALID_NETWORK"},
+	{STATUS_CLUSTER_NO_NET_ADAPTERS, -EIO,
+	"STATUS_CLUSTER_NO_NET_ADAPTERS"},
+	{STATUS_CLUSTER_NODE_UP, -EIO, "STATUS_CLUSTER_NODE_UP"},
+	{STATUS_CLUSTER_NODE_PAUSED, -EIO, "STATUS_CLUSTER_NODE_PAUSED"},
+	{STATUS_CLUSTER_NODE_NOT_PAUSED, -EIO,
+	"STATUS_CLUSTER_NODE_NOT_PAUSED"},
+	{STATUS_CLUSTER_NO_SECURITY_CONTEXT, -EIO,
+	"STATUS_CLUSTER_NO_SECURITY_CONTEXT"},
+	{STATUS_CLUSTER_NETWORK_NOT_INTERNAL, -EIO,
+	"STATUS_CLUSTER_NETWORK_NOT_INTERNAL"},
+	{STATUS_CLUSTER_POISONED, -EIO, "STATUS_CLUSTER_POISONED"},
+	{STATUS_ACPI_INVALID_OPCODE, -EIO, "STATUS_ACPI_INVALID_OPCODE"},
+	{STATUS_ACPI_STACK_OVERFLOW, -EIO, "STATUS_ACPI_STACK_OVERFLOW"},
+	{STATUS_ACPI_ASSERT_FAILED, -EIO, "STATUS_ACPI_ASSERT_FAILED"},
+	{STATUS_ACPI_INVALID_INDEX, -EIO, "STATUS_ACPI_INVALID_INDEX"},
+	{STATUS_ACPI_INVALID_ARGUMENT, -EIO, "STATUS_ACPI_INVALID_ARGUMENT"},
+	{STATUS_ACPI_FATAL, -EIO, "STATUS_ACPI_FATAL"},
+	{STATUS_ACPI_INVALID_SUPERNAME, -EIO, "STATUS_ACPI_INVALID_SUPERNAME"},
+	{STATUS_ACPI_INVALID_ARGTYPE, -EIO, "STATUS_ACPI_INVALID_ARGTYPE"},
+	{STATUS_ACPI_INVALID_OBJTYPE, -EIO, "STATUS_ACPI_INVALID_OBJTYPE"},
+	{STATUS_ACPI_INVALID_TARGETTYPE, -EIO,
+	"STATUS_ACPI_INVALID_TARGETTYPE"},
+	{STATUS_ACPI_INCORRECT_ARGUMENT_COUNT, -EIO,
+	"STATUS_ACPI_INCORRECT_ARGUMENT_COUNT"},
+	{STATUS_ACPI_ADDRESS_NOT_MAPPED, -EIO,
+	"STATUS_ACPI_ADDRESS_NOT_MAPPED"},
+	{STATUS_ACPI_INVALID_EVENTTYPE, -EIO, "STATUS_ACPI_INVALID_EVENTTYPE"},
+	{STATUS_ACPI_HANDLER_COLLISION, -EIO, "STATUS_ACPI_HANDLER_COLLISION"},
+	{STATUS_ACPI_INVALID_DATA, -EIO, "STATUS_ACPI_INVALID_DATA"},
+	{STATUS_ACPI_INVALID_REGION, -EIO, "STATUS_ACPI_INVALID_REGION"},
+	{STATUS_ACPI_INVALID_ACCESS_SIZE, -EIO,
+	"STATUS_ACPI_INVALID_ACCESS_SIZE"},
+	{STATUS_ACPI_ACQUIRE_GLOBAL_LOCK, -EIO,
+	"STATUS_ACPI_ACQUIRE_GLOBAL_LOCK"},
+	{STATUS_ACPI_ALREADY_INITIALIZED, -EIO,
+	"STATUS_ACPI_ALREADY_INITIALIZED"},
+	{STATUS_ACPI_NOT_INITIALIZED, -EIO, "STATUS_ACPI_NOT_INITIALIZED"},
+	{STATUS_ACPI_INVALID_MUTEX_LEVEL, -EIO,
+	"STATUS_ACPI_INVALID_MUTEX_LEVEL"},
+	{STATUS_ACPI_MUTEX_NOT_OWNED, -EIO, "STATUS_ACPI_MUTEX_NOT_OWNED"},
+	{STATUS_ACPI_MUTEX_NOT_OWNER, -EIO, "STATUS_ACPI_MUTEX_NOT_OWNER"},
+	{STATUS_ACPI_RS_ACCESS, -EIO, "STATUS_ACPI_RS_ACCESS"},
+	{STATUS_ACPI_INVALID_TABLE, -EIO, "STATUS_ACPI_INVALID_TABLE"},
+	{STATUS_ACPI_REG_HANDLER_FAILED, -EIO,
+	"STATUS_ACPI_REG_HANDLER_FAILED"},
+	{STATUS_ACPI_POWER_REQUEST_FAILED, -EIO,
+	"STATUS_ACPI_POWER_REQUEST_FAILED"},
+	{STATUS_SXS_SECTION_NOT_FOUND, -EIO, "STATUS_SXS_SECTION_NOT_FOUND"},
+	{STATUS_SXS_CANT_GEN_ACTCTX, -EIO, "STATUS_SXS_CANT_GEN_ACTCTX"},
+	{STATUS_SXS_INVALID_ACTCTXDATA_FORMAT, -EIO,
+	"STATUS_SXS_INVALID_ACTCTXDATA_FORMAT"},
+	{STATUS_SXS_ASSEMBLY_NOT_FOUND, -EIO, "STATUS_SXS_ASSEMBLY_NOT_FOUND"},
+	{STATUS_SXS_MANIFEST_FORMAT_ERROR, -EIO,
+	"STATUS_SXS_MANIFEST_FORMAT_ERROR"},
+	{STATUS_SXS_MANIFEST_PARSE_ERROR, -EIO,
+	"STATUS_SXS_MANIFEST_PARSE_ERROR"},
+	{STATUS_SXS_ACTIVATION_CONTEXT_DISABLED, -EIO,
+	"STATUS_SXS_ACTIVATION_CONTEXT_DISABLED"},
+	{STATUS_SXS_KEY_NOT_FOUND, -EIO, "STATUS_SXS_KEY_NOT_FOUND"},
+	{STATUS_SXS_VERSION_CONFLICT, -EIO, "STATUS_SXS_VERSION_CONFLICT"},
+	{STATUS_SXS_WRONG_SECTION_TYPE, -EIO, "STATUS_SXS_WRONG_SECTION_TYPE"},
+	{STATUS_SXS_THREAD_QUERIES_DISABLED, -EIO,
+	"STATUS_SXS_THREAD_QUERIES_DISABLED"},
+	{STATUS_SXS_ASSEMBLY_MISSING, -EIO, "STATUS_SXS_ASSEMBLY_MISSING"},
+	{STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET, -EIO,
+	"STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET"},
+	{STATUS_SXS_EARLY_DEACTIVATION, -EIO, "STATUS_SXS_EARLY_DEACTIVATION"},
+	{STATUS_SXS_INVALID_DEACTIVATION, -EIO,
+	"STATUS_SXS_INVALID_DEACTIVATION"},
+	{STATUS_SXS_MULTIPLE_DEACTIVATION, -EIO,
+	"STATUS_SXS_MULTIPLE_DEACTIVATION"},
+	{STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY, -EIO,
+	"STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY"},
+	{STATUS_SXS_PROCESS_TERMINATION_REQUESTED, -EIO,
+	"STATUS_SXS_PROCESS_TERMINATION_REQUESTED"},
+	{STATUS_SXS_CORRUPT_ACTIVATION_STACK, -EIO,
+	"STATUS_SXS_CORRUPT_ACTIVATION_STACK"},
+	{STATUS_SXS_CORRUPTION, -EIO, "STATUS_SXS_CORRUPTION"},
+	{STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE, -EIO,
+	"STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE"},
+	{STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME, -EIO,
+	"STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME"},
+	{STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE, -EIO,
+	"STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE"},
+	{STATUS_SXS_IDENTITY_PARSE_ERROR, -EIO,
+	"STATUS_SXS_IDENTITY_PARSE_ERROR"},
+	{STATUS_SXS_COMPONENT_STORE_CORRUPT, -EIO,
+	"STATUS_SXS_COMPONENT_STORE_CORRUPT"},
+	{STATUS_SXS_FILE_HASH_MISMATCH, -EIO, "STATUS_SXS_FILE_HASH_MISMATCH"},
+	{STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT, -EIO,
+	"STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT"},
+	{STATUS_SXS_IDENTITIES_DIFFERENT, -EIO,
+	"STATUS_SXS_IDENTITIES_DIFFERENT"},
+	{STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT, -EIO,
+	"STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT"},
+	{STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY, -EIO,
+	"STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY"},
+	{STATUS_ADVANCED_INSTALLER_FAILED, -EIO,
+	"STATUS_ADVANCED_INSTALLER_FAILED"},
+	{STATUS_XML_ENCODING_MISMATCH, -EIO, "STATUS_XML_ENCODING_MISMATCH"},
+	{STATUS_SXS_MANIFEST_TOO_BIG, -EIO, "STATUS_SXS_MANIFEST_TOO_BIG"},
+	{STATUS_SXS_SETTING_NOT_REGISTERED, -EIO,
+	"STATUS_SXS_SETTING_NOT_REGISTERED"},
+	{STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE, -EIO,
+	"STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE"},
+	{STATUS_SMI_PRIMITIVE_INSTALLER_FAILED, -EIO,
+	"STATUS_SMI_PRIMITIVE_INSTALLER_FAILED"},
+	{STATUS_GENERIC_COMMAND_FAILED, -EIO, "STATUS_GENERIC_COMMAND_FAILED"},
+	{STATUS_SXS_FILE_HASH_MISSING, -EIO, "STATUS_SXS_FILE_HASH_MISSING"},
+	{STATUS_TRANSACTIONAL_CONFLICT, -EIO, "STATUS_TRANSACTIONAL_CONFLICT"},
+	{STATUS_INVALID_TRANSACTION, -EIO, "STATUS_INVALID_TRANSACTION"},
+	{STATUS_TRANSACTION_NOT_ACTIVE, -EIO, "STATUS_TRANSACTION_NOT_ACTIVE"},
+	{STATUS_TM_INITIALIZATION_FAILED, -EIO,
+	"STATUS_TM_INITIALIZATION_FAILED"},
+	{STATUS_RM_NOT_ACTIVE, -EIO, "STATUS_RM_NOT_ACTIVE"},
+	{STATUS_RM_METADATA_CORRUPT, -EIO, "STATUS_RM_METADATA_CORRUPT"},
+	{STATUS_TRANSACTION_NOT_JOINED, -EIO, "STATUS_TRANSACTION_NOT_JOINED"},
+	{STATUS_DIRECTORY_NOT_RM, -EIO, "STATUS_DIRECTORY_NOT_RM"},
+	{STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE, -EIO,
+	"STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE"},
+	{STATUS_LOG_RESIZE_INVALID_SIZE, -EIO,
+	"STATUS_LOG_RESIZE_INVALID_SIZE"},
+	{STATUS_REMOTE_FILE_VERSION_MISMATCH, -EIO,
+	"STATUS_REMOTE_FILE_VERSION_MISMATCH"},
+	{STATUS_CRM_PROTOCOL_ALREADY_EXISTS, -EIO,
+	"STATUS_CRM_PROTOCOL_ALREADY_EXISTS"},
+	{STATUS_TRANSACTION_PROPAGATION_FAILED, -EIO,
+	"STATUS_TRANSACTION_PROPAGATION_FAILED"},
+	{STATUS_CRM_PROTOCOL_NOT_FOUND, -EIO, "STATUS_CRM_PROTOCOL_NOT_FOUND"},
+	{STATUS_TRANSACTION_SUPERIOR_EXISTS, -EIO,
+	"STATUS_TRANSACTION_SUPERIOR_EXISTS"},
+	{STATUS_TRANSACTION_REQUEST_NOT_VALID, -EIO,
+	"STATUS_TRANSACTION_REQUEST_NOT_VALID"},
+	{STATUS_TRANSACTION_NOT_REQUESTED, -EIO,
+	"STATUS_TRANSACTION_NOT_REQUESTED"},
+	{STATUS_TRANSACTION_ALREADY_ABORTED, -EIO,
+	"STATUS_TRANSACTION_ALREADY_ABORTED"},
+	{STATUS_TRANSACTION_ALREADY_COMMITTED, -EIO,
+	"STATUS_TRANSACTION_ALREADY_COMMITTED"},
+	{STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER, -EIO,
+	"STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER"},
+	{STATUS_CURRENT_TRANSACTION_NOT_VALID, -EIO,
+	"STATUS_CURRENT_TRANSACTION_NOT_VALID"},
+	{STATUS_LOG_GROWTH_FAILED, -EIO, "STATUS_LOG_GROWTH_FAILED"},
+	{STATUS_OBJECT_NO_LONGER_EXISTS, -EIO,
+	"STATUS_OBJECT_NO_LONGER_EXISTS"},
+	{STATUS_STREAM_MINIVERSION_NOT_FOUND, -EIO,
+	"STATUS_STREAM_MINIVERSION_NOT_FOUND"},
+	{STATUS_STREAM_MINIVERSION_NOT_VALID, -EIO,
+	"STATUS_STREAM_MINIVERSION_NOT_VALID"},
+	{STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION, -EIO,
+	"STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION"},
+	{STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT, -EIO,
+	"STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT"},
+	{STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS, -EIO,
+	"STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS"},
+	{STATUS_HANDLE_NO_LONGER_VALID, -EIO, "STATUS_HANDLE_NO_LONGER_VALID"},
+	{STATUS_LOG_CORRUPTION_DETECTED, -EIO,
+	"STATUS_LOG_CORRUPTION_DETECTED"},
+	{STATUS_RM_DISCONNECTED, -EIO, "STATUS_RM_DISCONNECTED"},
+	{STATUS_ENLISTMENT_NOT_SUPERIOR, -EIO,
+	"STATUS_ENLISTMENT_NOT_SUPERIOR"},
+	{STATUS_FILE_IDENTITY_NOT_PERSISTENT, -EIO,
+	"STATUS_FILE_IDENTITY_NOT_PERSISTENT"},
+	{STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY, -EIO,
+	"STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY"},
+	{STATUS_CANT_CROSS_RM_BOUNDARY, -EIO, "STATUS_CANT_CROSS_RM_BOUNDARY"},
+	{STATUS_TXF_DIR_NOT_EMPTY, -EIO, "STATUS_TXF_DIR_NOT_EMPTY"},
+	{STATUS_INDOUBT_TRANSACTIONS_EXIST, -EIO,
+	"STATUS_INDOUBT_TRANSACTIONS_EXIST"},
+	{STATUS_TM_VOLATILE, -EIO, "STATUS_TM_VOLATILE"},
+	{STATUS_ROLLBACK_TIMER_EXPIRED, -EIO, "STATUS_ROLLBACK_TIMER_EXPIRED"},
+	{STATUS_TXF_ATTRIBUTE_CORRUPT, -EIO, "STATUS_TXF_ATTRIBUTE_CORRUPT"},
+	{STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION, -EIO,
+	"STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION"},
+	{STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED, -EIO,
+	"STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED"},
+	{STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE, -EIO,
+	"STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE"},
+	{STATUS_TRANSACTION_REQUIRED_PROMOTION, -EIO,
+	"STATUS_TRANSACTION_REQUIRED_PROMOTION"},
+	{STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION, -EIO,
+	"STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION"},
+	{STATUS_TRANSACTIONS_NOT_FROZEN, -EIO,
+	"STATUS_TRANSACTIONS_NOT_FROZEN"},
+	{STATUS_TRANSACTION_FREEZE_IN_PROGRESS, -EIO,
+	"STATUS_TRANSACTION_FREEZE_IN_PROGRESS"},
+	{STATUS_NOT_SNAPSHOT_VOLUME, -EIO, "STATUS_NOT_SNAPSHOT_VOLUME"},
+	{STATUS_NO_SAVEPOINT_WITH_OPEN_FILES, -EIO,
+	"STATUS_NO_SAVEPOINT_WITH_OPEN_FILES"},
+	{STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION, -EIO,
+	"STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION"},
+	{STATUS_TM_IDENTITY_MISMATCH, -EIO, "STATUS_TM_IDENTITY_MISMATCH"},
+	{STATUS_FLOATED_SECTION, -EIO, "STATUS_FLOATED_SECTION"},
+	{STATUS_CANNOT_ACCEPT_TRANSACTED_WORK, -EIO,
+	"STATUS_CANNOT_ACCEPT_TRANSACTED_WORK"},
+	{STATUS_CANNOT_ABORT_TRANSACTIONS, -EIO,
+	"STATUS_CANNOT_ABORT_TRANSACTIONS"},
+	{STATUS_TRANSACTION_NOT_FOUND, -EIO, "STATUS_TRANSACTION_NOT_FOUND"},
+	{STATUS_RESOURCEMANAGER_NOT_FOUND, -EIO,
+	"STATUS_RESOURCEMANAGER_NOT_FOUND"},
+	{STATUS_ENLISTMENT_NOT_FOUND, -EIO, "STATUS_ENLISTMENT_NOT_FOUND"},
+	{STATUS_TRANSACTIONMANAGER_NOT_FOUND, -EIO,
+	"STATUS_TRANSACTIONMANAGER_NOT_FOUND"},
+	{STATUS_TRANSACTIONMANAGER_NOT_ONLINE, -EIO,
+	"STATUS_TRANSACTIONMANAGER_NOT_ONLINE"},
+	{STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION, -EIO,
+	"STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION"},
+	{STATUS_TRANSACTION_NOT_ROOT, -EIO, "STATUS_TRANSACTION_NOT_ROOT"},
+	{STATUS_TRANSACTION_OBJECT_EXPIRED, -EIO,
+	"STATUS_TRANSACTION_OBJECT_EXPIRED"},
+	{STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION, -EIO,
+	"STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION"},
+	{STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED, -EIO,
+	"STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED"},
+	{STATUS_TRANSACTION_RECORD_TOO_LONG, -EIO,
+	"STATUS_TRANSACTION_RECORD_TOO_LONG"},
+	{STATUS_NO_LINK_TRACKING_IN_TRANSACTION, -EIO,
+	"STATUS_NO_LINK_TRACKING_IN_TRANSACTION"},
+	{STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION, -EOPNOTSUPP,
+	"STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION"},
+	{STATUS_TRANSACTION_INTEGRITY_VIOLATED, -EIO,
+	"STATUS_TRANSACTION_INTEGRITY_VIOLATED"},
+	{STATUS_LOG_SECTOR_INVALID, -EIO, "STATUS_LOG_SECTOR_INVALID"},
+	{STATUS_LOG_SECTOR_PARITY_INVALID, -EIO,
+	"STATUS_LOG_SECTOR_PARITY_INVALID"},
+	{STATUS_LOG_SECTOR_REMAPPED, -EIO, "STATUS_LOG_SECTOR_REMAPPED"},
+	{STATUS_LOG_BLOCK_INCOMPLETE, -EIO, "STATUS_LOG_BLOCK_INCOMPLETE"},
+	{STATUS_LOG_INVALID_RANGE, -EIO, "STATUS_LOG_INVALID_RANGE"},
+	{STATUS_LOG_BLOCKS_EXHAUSTED, -EIO, "STATUS_LOG_BLOCKS_EXHAUSTED"},
+	{STATUS_LOG_READ_CONTEXT_INVALID, -EIO,
+	"STATUS_LOG_READ_CONTEXT_INVALID"},
+	{STATUS_LOG_RESTART_INVALID, -EIO, "STATUS_LOG_RESTART_INVALID"},
+	{STATUS_LOG_BLOCK_VERSION, -EIO, "STATUS_LOG_BLOCK_VERSION"},
+	{STATUS_LOG_BLOCK_INVALID, -EIO, "STATUS_LOG_BLOCK_INVALID"},
+	{STATUS_LOG_READ_MODE_INVALID, -EIO, "STATUS_LOG_READ_MODE_INVALID"},
+	{STATUS_LOG_METADATA_CORRUPT, -EIO, "STATUS_LOG_METADATA_CORRUPT"},
+	{STATUS_LOG_METADATA_INVALID, -EIO, "STATUS_LOG_METADATA_INVALID"},
+	{STATUS_LOG_METADATA_INCONSISTENT, -EIO,
+	"STATUS_LOG_METADATA_INCONSISTENT"},
+	{STATUS_LOG_RESERVATION_INVALID, -EIO,
+	"STATUS_LOG_RESERVATION_INVALID"},
+	{STATUS_LOG_CANT_DELETE, -EIO, "STATUS_LOG_CANT_DELETE"},
+	{STATUS_LOG_CONTAINER_LIMIT_EXCEEDED, -EIO,
+	"STATUS_LOG_CONTAINER_LIMIT_EXCEEDED"},
+	{STATUS_LOG_START_OF_LOG, -EIO, "STATUS_LOG_START_OF_LOG"},
+	{STATUS_LOG_POLICY_ALREADY_INSTALLED, -EIO,
+	"STATUS_LOG_POLICY_ALREADY_INSTALLED"},
+	{STATUS_LOG_POLICY_NOT_INSTALLED, -EIO,
+	"STATUS_LOG_POLICY_NOT_INSTALLED"},
+	{STATUS_LOG_POLICY_INVALID, -EIO, "STATUS_LOG_POLICY_INVALID"},
+	{STATUS_LOG_POLICY_CONFLICT, -EIO, "STATUS_LOG_POLICY_CONFLICT"},
+	{STATUS_LOG_PINNED_ARCHIVE_TAIL, -EIO,
+	"STATUS_LOG_PINNED_ARCHIVE_TAIL"},
+	{STATUS_LOG_RECORD_NONEXISTENT, -EIO, "STATUS_LOG_RECORD_NONEXISTENT"},
+	{STATUS_LOG_RECORDS_RESERVED_INVALID, -EIO,
+	"STATUS_LOG_RECORDS_RESERVED_INVALID"},
+	{STATUS_LOG_SPACE_RESERVED_INVALID, -EIO,
+	"STATUS_LOG_SPACE_RESERVED_INVALID"},
+	{STATUS_LOG_TAIL_INVALID, -EIO, "STATUS_LOG_TAIL_INVALID"},
+	{STATUS_LOG_FULL, -EIO, "STATUS_LOG_FULL"},
+	{STATUS_LOG_MULTIPLEXED, -EIO, "STATUS_LOG_MULTIPLEXED"},
+	{STATUS_LOG_DEDICATED, -EIO, "STATUS_LOG_DEDICATED"},
+	{STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS, -EIO,
+	"STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS"},
+	{STATUS_LOG_ARCHIVE_IN_PROGRESS, -EIO,
+	"STATUS_LOG_ARCHIVE_IN_PROGRESS"},
+	{STATUS_LOG_EPHEMERAL, -EIO, "STATUS_LOG_EPHEMERAL"},
+	{STATUS_LOG_NOT_ENOUGH_CONTAINERS, -EIO,
+	"STATUS_LOG_NOT_ENOUGH_CONTAINERS"},
+	{STATUS_LOG_CLIENT_ALREADY_REGISTERED, -EIO,
+	"STATUS_LOG_CLIENT_ALREADY_REGISTERED"},
+	{STATUS_LOG_CLIENT_NOT_REGISTERED, -EIO,
+	"STATUS_LOG_CLIENT_NOT_REGISTERED"},
+	{STATUS_LOG_FULL_HANDLER_IN_PROGRESS, -EIO,
+	"STATUS_LOG_FULL_HANDLER_IN_PROGRESS"},
+	{STATUS_LOG_CONTAINER_READ_FAILED, -EIO,
+	"STATUS_LOG_CONTAINER_READ_FAILED"},
+	{STATUS_LOG_CONTAINER_WRITE_FAILED, -EIO,
+	"STATUS_LOG_CONTAINER_WRITE_FAILED"},
+	{STATUS_LOG_CONTAINER_OPEN_FAILED, -EIO,
+	"STATUS_LOG_CONTAINER_OPEN_FAILED"},
+	{STATUS_LOG_CONTAINER_STATE_INVALID, -EIO,
+	"STATUS_LOG_CONTAINER_STATE_INVALID"},
+	{STATUS_LOG_STATE_INVALID, -EIO, "STATUS_LOG_STATE_INVALID"},
+	{STATUS_LOG_PINNED, -EIO, "STATUS_LOG_PINNED"},
+	{STATUS_LOG_METADATA_FLUSH_FAILED, -EIO,
+	"STATUS_LOG_METADATA_FLUSH_FAILED"},
+	{STATUS_LOG_INCONSISTENT_SECURITY, -EIO,
+	"STATUS_LOG_INCONSISTENT_SECURITY"},
+	{STATUS_LOG_APPENDED_FLUSH_FAILED, -EIO,
+	"STATUS_LOG_APPENDED_FLUSH_FAILED"},
+	{STATUS_LOG_PINNED_RESERVATION, -EIO, "STATUS_LOG_PINNED_RESERVATION"},
+	{STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD, -EIO,
+	"STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD"},
+	{STATUS_FLT_NO_HANDLER_DEFINED, -EIO, "STATUS_FLT_NO_HANDLER_DEFINED"},
+	{STATUS_FLT_CONTEXT_ALREADY_DEFINED, -EIO,
+	"STATUS_FLT_CONTEXT_ALREADY_DEFINED"},
+	{STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST, -EIO,
+	"STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST"},
+	{STATUS_FLT_DISALLOW_FAST_IO, -EIO, "STATUS_FLT_DISALLOW_FAST_IO"},
+	{STATUS_FLT_INVALID_NAME_REQUEST, -EIO,
+	"STATUS_FLT_INVALID_NAME_REQUEST"},
+	{STATUS_FLT_NOT_SAFE_TO_POST_OPERATION, -EIO,
+	"STATUS_FLT_NOT_SAFE_TO_POST_OPERATION"},
+	{STATUS_FLT_NOT_INITIALIZED, -EIO, "STATUS_FLT_NOT_INITIALIZED"},
+	{STATUS_FLT_FILTER_NOT_READY, -EIO, "STATUS_FLT_FILTER_NOT_READY"},
+	{STATUS_FLT_POST_OPERATION_CLEANUP, -EIO,
+	"STATUS_FLT_POST_OPERATION_CLEANUP"},
+	{STATUS_FLT_INTERNAL_ERROR, -EIO, "STATUS_FLT_INTERNAL_ERROR"},
+	{STATUS_FLT_DELETING_OBJECT, -EIO, "STATUS_FLT_DELETING_OBJECT"},
+	{STATUS_FLT_MUST_BE_NONPAGED_POOL, -EIO,
+	"STATUS_FLT_MUST_BE_NONPAGED_POOL"},
+	{STATUS_FLT_DUPLICATE_ENTRY, -EIO, "STATUS_FLT_DUPLICATE_ENTRY"},
+	{STATUS_FLT_CBDQ_DISABLED, -EIO, "STATUS_FLT_CBDQ_DISABLED"},
+	{STATUS_FLT_DO_NOT_ATTACH, -EIO, "STATUS_FLT_DO_NOT_ATTACH"},
+	{STATUS_FLT_DO_NOT_DETACH, -EIO, "STATUS_FLT_DO_NOT_DETACH"},
+	{STATUS_FLT_INSTANCE_ALTITUDE_COLLISION, -EIO,
+	"STATUS_FLT_INSTANCE_ALTITUDE_COLLISION"},
+	{STATUS_FLT_INSTANCE_NAME_COLLISION, -EIO,
+	"STATUS_FLT_INSTANCE_NAME_COLLISION"},
+	{STATUS_FLT_FILTER_NOT_FOUND, -EIO, "STATUS_FLT_FILTER_NOT_FOUND"},
+	{STATUS_FLT_VOLUME_NOT_FOUND, -EIO, "STATUS_FLT_VOLUME_NOT_FOUND"},
+	{STATUS_FLT_INSTANCE_NOT_FOUND, -EIO, "STATUS_FLT_INSTANCE_NOT_FOUND"},
+	{STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND, -EIO,
+	"STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND"},
+	{STATUS_FLT_INVALID_CONTEXT_REGISTRATION, -EIO,
+	"STATUS_FLT_INVALID_CONTEXT_REGISTRATION"},
+	{STATUS_FLT_NAME_CACHE_MISS, -EIO, "STATUS_FLT_NAME_CACHE_MISS"},
+	{STATUS_FLT_NO_DEVICE_OBJECT, -EIO, "STATUS_FLT_NO_DEVICE_OBJECT"},
+	{STATUS_FLT_VOLUME_ALREADY_MOUNTED, -EIO,
+	"STATUS_FLT_VOLUME_ALREADY_MOUNTED"},
+	{STATUS_FLT_ALREADY_ENLISTED, -EIO, "STATUS_FLT_ALREADY_ENLISTED"},
+	{STATUS_FLT_CONTEXT_ALREADY_LINKED, -EIO,
+	"STATUS_FLT_CONTEXT_ALREADY_LINKED"},
+	{STATUS_FLT_NO_WAITER_FOR_REPLY, -EIO,
+	"STATUS_FLT_NO_WAITER_FOR_REPLY"},
+	{STATUS_MONITOR_NO_DESCRIPTOR, -EIO, "STATUS_MONITOR_NO_DESCRIPTOR"},
+	{STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT, -EIO,
+	"STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT"},
+	{STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM, -EIO,
+	"STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM"},
+	{STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK, -EIO,
+	"STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK"},
+	{STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED, -EIO,
+	"STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED"},
+	{STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK, -EIO,
+	"STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK"},
+	{STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK, -EIO,
+	"STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK"},
+	{STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA, -EIO,
+	"STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA"},
+	{STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK, -EIO,
+	"STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK"},
+	{STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER, -EIO,
+	"STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER"},
+	{STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER, -EIO,
+	"STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER"},
+	{STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER, -EIO,
+	"STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER"},
+	{STATUS_GRAPHICS_ADAPTER_WAS_RESET, -EIO,
+	"STATUS_GRAPHICS_ADAPTER_WAS_RESET"},
+	{STATUS_GRAPHICS_INVALID_DRIVER_MODEL, -EIO,
+	"STATUS_GRAPHICS_INVALID_DRIVER_MODEL"},
+	{STATUS_GRAPHICS_PRESENT_MODE_CHANGED, -EIO,
+	"STATUS_GRAPHICS_PRESENT_MODE_CHANGED"},
+	{STATUS_GRAPHICS_PRESENT_OCCLUDED, -EIO,
+	"STATUS_GRAPHICS_PRESENT_OCCLUDED"},
+	{STATUS_GRAPHICS_PRESENT_DENIED, -EIO,
+	"STATUS_GRAPHICS_PRESENT_DENIED"},
+	{STATUS_GRAPHICS_CANNOTCOLORCONVERT, -EIO,
+	"STATUS_GRAPHICS_CANNOTCOLORCONVERT"},
+	{STATUS_GRAPHICS_NO_VIDEO_MEMORY, -EIO,
+	"STATUS_GRAPHICS_NO_VIDEO_MEMORY"},
+	{STATUS_GRAPHICS_CANT_LOCK_MEMORY, -EIO,
+	"STATUS_GRAPHICS_CANT_LOCK_MEMORY"},
+	{STATUS_GRAPHICS_ALLOCATION_BUSY, -EBUSY,
+	"STATUS_GRAPHICS_ALLOCATION_BUSY"},
+	{STATUS_GRAPHICS_TOO_MANY_REFERENCES, -EIO,
+	"STATUS_GRAPHICS_TOO_MANY_REFERENCES"},
+	{STATUS_GRAPHICS_TRY_AGAIN_LATER, -EIO,
+	"STATUS_GRAPHICS_TRY_AGAIN_LATER"},
+	{STATUS_GRAPHICS_TRY_AGAIN_NOW, -EIO, "STATUS_GRAPHICS_TRY_AGAIN_NOW"},
+	{STATUS_GRAPHICS_ALLOCATION_INVALID, -EIO,
+	"STATUS_GRAPHICS_ALLOCATION_INVALID"},
+	{STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE, -EIO,
+	"STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE"},
+	{STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED, -EIO,
+	"STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED"},
+	{STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION, -EIO,
+	"STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION"},
+	{STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE, -EIO,
+	"STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE"},
+	{STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION, -EIO,
+	"STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION"},
+	{STATUS_GRAPHICS_ALLOCATION_CLOSED, -EIO,
+	"STATUS_GRAPHICS_ALLOCATION_CLOSED"},
+	{STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE, -EIO,
+	"STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE"},
+	{STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE, -EIO,
+	"STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE"},
+	{STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE, -EIO,
+	"STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE"},
+	{STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST, -EIO,
+	"STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST"},
+	{STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE, -EIO,
+	"STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE"},
+	{STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY"},
+	{STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_INVALID_VIDPN, -EIO, "STATUS_GRAPHICS_INVALID_VIDPN"},
+	{STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE"},
+	{STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET"},
+	{STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET"},
+	{STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET"},
+	{STATUS_GRAPHICS_INVALID_FREQUENCY, -EIO,
+	"STATUS_GRAPHICS_INVALID_FREQUENCY"},
+	{STATUS_GRAPHICS_INVALID_ACTIVE_REGION, -EIO,
+	"STATUS_GRAPHICS_INVALID_ACTIVE_REGION"},
+	{STATUS_GRAPHICS_INVALID_TOTAL_REGION, -EIO,
+	"STATUS_GRAPHICS_INVALID_TOTAL_REGION"},
+	{STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE"},
+	{STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE"},
+	{STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET, -EIO,
+	"STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET"},
+	{STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY, -EIO,
+	"STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY"},
+	{STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET, -EIO,
+	"STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET"},
+	{STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET"},
+	{STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET"},
+	{STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET, -EIO,
+	"STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET"},
+	{STATUS_GRAPHICS_TARGET_ALREADY_IN_SET, -EIO,
+	"STATUS_GRAPHICS_TARGET_ALREADY_IN_SET"},
+	{STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH"},
+	{STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY, -EIO,
+	"STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY"},
+	{STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET, -EIO,
+	"STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET"},
+	{STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE, -EIO,
+	"STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE"},
+	{STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET, -EIO,
+	"STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET"},
+	{STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET, -EIO,
+	"STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET"},
+	{STATUS_GRAPHICS_STALE_MODESET, -EIO, "STATUS_GRAPHICS_STALE_MODESET"},
+	{STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET, -EIO,
+	"STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET"},
+	{STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE, -EIO,
+	"STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE"},
+	{STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN, -EIO,
+	"STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN"},
+	{STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE, -EIO,
+	"STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE"},
+	{STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION, -EIO,
+	"STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION"},
+	{STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES, -EIO,
+	"STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES"},
+	{STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY, -EIO,
+	"STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY"},
+	{STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE, -EIO,
+	"STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE"},
+	{STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET, -EIO,
+	"STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET"},
+	{STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET, -EIO,
+	"STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET"},
+	{STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR, -EIO,
+	"STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR"},
+	{STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET, -EIO,
+	"STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET"},
+	{STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET, -EIO,
+	"STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET"},
+	{STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE, -EIO,
+	"STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE"},
+	{STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE"},
+	{STATUS_GRAPHICS_RESOURCES_NOT_RELATED, -EIO,
+	"STATUS_GRAPHICS_RESOURCES_NOT_RELATED"},
+	{STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE, -EIO,
+	"STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE"},
+	{STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE, -EIO,
+	"STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE"},
+	{STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET, -EIO,
+	"STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET"},
+	{STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER, -EIO,
+	"STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER"},
+	{STATUS_GRAPHICS_NO_VIDPNMGR, -EIO, "STATUS_GRAPHICS_NO_VIDPNMGR"},
+	{STATUS_GRAPHICS_NO_ACTIVE_VIDPN, -EIO,
+	"STATUS_GRAPHICS_NO_ACTIVE_VIDPN"},
+	{STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY, -EIO,
+	"STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY"},
+	{STATUS_GRAPHICS_MONITOR_NOT_CONNECTED, -EIO,
+	"STATUS_GRAPHICS_MONITOR_NOT_CONNECTED"},
+	{STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY, -EIO,
+	"STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY"},
+	{STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE, -EIO,
+	"STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE"},
+	{STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE, -EIO,
+	"STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE"},
+	{STATUS_GRAPHICS_INVALID_STRIDE, -EIO,
+	"STATUS_GRAPHICS_INVALID_STRIDE"},
+	{STATUS_GRAPHICS_INVALID_PIXELFORMAT, -EIO,
+	"STATUS_GRAPHICS_INVALID_PIXELFORMAT"},
+	{STATUS_GRAPHICS_INVALID_COLORBASIS, -EIO,
+	"STATUS_GRAPHICS_INVALID_COLORBASIS"},
+	{STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE, -EIO,
+	"STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE"},
+	{STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY, -EIO,
+	"STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY"},
+	{STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT, -EIO,
+	"STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT"},
+	{STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE, -EIO,
+	"STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE"},
+	{STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN, -EIO,
+	"STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN"},
+	{STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL, -EIO,
+	"STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL"},
+	{STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION, -EIO,
+	"STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION"},
+	{STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED,
+	-EIO,
+	"STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_INVALID_GAMMA_RAMP, -EIO,
+	"STATUS_GRAPHICS_INVALID_GAMMA_RAMP"},
+	{STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_MODE_NOT_IN_MODESET, -EIO,
+	"STATUS_GRAPHICS_MODE_NOT_IN_MODESET"},
+	{STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON, -EIO,
+	"STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON"},
+	{STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE, -EIO,
+	"STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE"},
+	{STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE, -EIO,
+	"STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE"},
+	{STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS, -EIO,
+	"STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS"},
+	{STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING, -EIO,
+	"STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING"},
+	{STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED, -EIO,
+	"STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED"},
+	{STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS, -EIO,
+	"STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS"},
+	{STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT, -EIO,
+	"STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT"},
+	{STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM, -EIO,
+	"STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM"},
+	{STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN, -EIO,
+	"STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN"},
+	{STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT, -EIO,
+	"STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT"},
+	{STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED, -EIO,
+	"STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED"},
+	{STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION, -EIO,
+	"STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION"},
+	{STATUS_GRAPHICS_INVALID_CLIENT_TYPE, -EIO,
+	"STATUS_GRAPHICS_INVALID_CLIENT_TYPE"},
+	{STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET, -EIO,
+	"STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET"},
+	{STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED, -EIO,
+	"STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED"},
+	{STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER, -EIO,
+	"STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER"},
+	{STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED, -EIO,
+	"STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED"},
+	{STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED, -EIO,
+	"STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED"},
+	{STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY, -EIO,
+	"STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY"},
+	{STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED, -EIO,
+	"STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED"},
+	{STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON, -EIO,
+	"STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON"},
+	{STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE, -EIO,
+	"STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE"},
+	{STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER, -EIO,
+	"STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER"},
+	{STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED, -EIO,
+	"STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED"},
+	{STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS,
+	-EIO,
+	"STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS"},
+	{STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST, -EIO,
+	"STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST"},
+	{STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR, -EIO,
+	"STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR"},
+	{STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS, -EIO,
+	"STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS"},
+	{STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST, -EIO,
+	"STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST"},
+	{STATUS_GRAPHICS_OPM_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_OPM_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_COPP_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_COPP_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_UAB_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_UAB_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS, -EIO,
+	"STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS"},
+	{STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL, -EIO,
+	"STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL"},
+	{STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST, -EIO,
+	"STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST"},
+	{STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME, -EIO,
+	"STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME"},
+	{STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP, -EIO,
+	"STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP"},
+	{STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_OPM_INVALID_POINTER, -EIO,
+	"STATUS_GRAPHICS_OPM_INVALID_POINTER"},
+	{STATUS_GRAPHICS_OPM_INTERNAL_ERROR, -EIO,
+	"STATUS_GRAPHICS_OPM_INTERNAL_ERROR"},
+	{STATUS_GRAPHICS_OPM_INVALID_HANDLE, -EIO,
+	"STATUS_GRAPHICS_OPM_INVALID_HANDLE"},
+	{STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE, -EIO,
+	"STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE"},
+	{STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH, -EIO,
+	"STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH"},
+	{STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED, -EIO,
+	"STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED"},
+	{STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED, -EIO,
+	"STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED"},
+	{STATUS_GRAPHICS_PVP_HFS_FAILED, -EIO,
+	"STATUS_GRAPHICS_PVP_HFS_FAILED"},
+	{STATUS_GRAPHICS_OPM_INVALID_SRM, -EIO,
+	"STATUS_GRAPHICS_OPM_INVALID_SRM"},
+	{STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP, -EIO,
+	"STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP"},
+	{STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP, -EIO,
+	"STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP"},
+	{STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA, -EIO,
+	"STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA"},
+	{STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET, -EIO,
+	"STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET"},
+	{STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH, -EIO,
+	"STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH"},
+	{STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE, -EIO,
+	"STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE"},
+	{STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS, -EIO,
+	"STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS"},
+	{STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS, -EIO,
+	"STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS"},
+	{STATUS_GRAPHICS_I2C_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_I2C_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST, -EIO,
+	"STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST"},
+	{STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA, -EIO,
+	"STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA"},
+	{STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA, -EIO,
+	"STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA"},
+	{STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_DDCCI_INVALID_DATA, -EIO,
+	"STATUS_GRAPHICS_DDCCI_INVALID_DATA"},
+	{STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE,
+	-EIO,
+	"STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE"},
+	{STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING, -EIO,
+	"STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING"},
+	{STATUS_GRAPHICS_MCA_INTERNAL_ERROR, -EIO,
+	"STATUS_GRAPHICS_MCA_INTERNAL_ERROR"},
+	{STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND, -EIO,
+	"STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND"},
+	{STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH, -EIO,
+	"STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH"},
+	{STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM, -EIO,
+	"STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM"},
+	{STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE, -EIO,
+	"STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE"},
+	{STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS, -EIO,
+	"STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS"},
+	{STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED"},
+	{STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME, -EIO,
+	"STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME"},
+	{STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP, -EIO,
+	"STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP"},
+	{STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED, -EIO,
+	"STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED"},
+	{STATUS_GRAPHICS_INVALID_POINTER, -EIO,
+	"STATUS_GRAPHICS_INVALID_POINTER"},
+	{STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE, -EIO,
+	"STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE"},
+	{STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL, -EIO,
+	"STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL"},
+	{STATUS_GRAPHICS_INTERNAL_ERROR, -EIO,
+	"STATUS_GRAPHICS_INTERNAL_ERROR"},
+	{STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS, -EIO,
+	"STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS"},
+	{STATUS_FVE_LOCKED_VOLUME, -EIO, "STATUS_FVE_LOCKED_VOLUME"},
+	{STATUS_FVE_NOT_ENCRYPTED, -EIO, "STATUS_FVE_NOT_ENCRYPTED"},
+	{STATUS_FVE_BAD_INFORMATION, -EIO, "STATUS_FVE_BAD_INFORMATION"},
+	{STATUS_FVE_TOO_SMALL, -EIO, "STATUS_FVE_TOO_SMALL"},
+	{STATUS_FVE_FAILED_WRONG_FS, -EIO, "STATUS_FVE_FAILED_WRONG_FS"},
+	{STATUS_FVE_FAILED_BAD_FS, -EIO, "STATUS_FVE_FAILED_BAD_FS"},
+	{STATUS_FVE_FS_NOT_EXTENDED, -EIO, "STATUS_FVE_FS_NOT_EXTENDED"},
+	{STATUS_FVE_FS_MOUNTED, -EIO, "STATUS_FVE_FS_MOUNTED"},
+	{STATUS_FVE_NO_LICENSE, -EIO, "STATUS_FVE_NO_LICENSE"},
+	{STATUS_FVE_ACTION_NOT_ALLOWED, -EIO, "STATUS_FVE_ACTION_NOT_ALLOWED"},
+	{STATUS_FVE_BAD_DATA, -EIO, "STATUS_FVE_BAD_DATA"},
+	{STATUS_FVE_VOLUME_NOT_BOUND, -EIO, "STATUS_FVE_VOLUME_NOT_BOUND"},
+	{STATUS_FVE_NOT_DATA_VOLUME, -EIO, "STATUS_FVE_NOT_DATA_VOLUME"},
+	{STATUS_FVE_CONV_READ_ERROR, -EIO, "STATUS_FVE_CONV_READ_ERROR"},
+	{STATUS_FVE_CONV_WRITE_ERROR, -EIO, "STATUS_FVE_CONV_WRITE_ERROR"},
+	{STATUS_FVE_OVERLAPPED_UPDATE, -EIO, "STATUS_FVE_OVERLAPPED_UPDATE"},
+	{STATUS_FVE_FAILED_SECTOR_SIZE, -EIO, "STATUS_FVE_FAILED_SECTOR_SIZE"},
+	{STATUS_FVE_FAILED_AUTHENTICATION, -EIO,
+	"STATUS_FVE_FAILED_AUTHENTICATION"},
+	{STATUS_FVE_NOT_OS_VOLUME, -EIO, "STATUS_FVE_NOT_OS_VOLUME"},
+	{STATUS_FVE_KEYFILE_NOT_FOUND, -EIO, "STATUS_FVE_KEYFILE_NOT_FOUND"},
+	{STATUS_FVE_KEYFILE_INVALID, -EIO, "STATUS_FVE_KEYFILE_INVALID"},
+	{STATUS_FVE_KEYFILE_NO_VMK, -EIO, "STATUS_FVE_KEYFILE_NO_VMK"},
+	{STATUS_FVE_TPM_DISABLED, -EIO, "STATUS_FVE_TPM_DISABLED"},
+	{STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO, -EIO,
+	"STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO"},
+	{STATUS_FVE_TPM_INVALID_PCR, -EIO, "STATUS_FVE_TPM_INVALID_PCR"},
+	{STATUS_FVE_TPM_NO_VMK, -EIO, "STATUS_FVE_TPM_NO_VMK"},
+	{STATUS_FVE_PIN_INVALID, -EIO, "STATUS_FVE_PIN_INVALID"},
+	{STATUS_FVE_AUTH_INVALID_APPLICATION, -EIO,
+	"STATUS_FVE_AUTH_INVALID_APPLICATION"},
+	{STATUS_FVE_AUTH_INVALID_CONFIG, -EIO,
+	"STATUS_FVE_AUTH_INVALID_CONFIG"},
+	{STATUS_FVE_DEBUGGER_ENABLED, -EIO, "STATUS_FVE_DEBUGGER_ENABLED"},
+	{STATUS_FVE_DRY_RUN_FAILED, -EIO, "STATUS_FVE_DRY_RUN_FAILED"},
+	{STATUS_FVE_BAD_METADATA_POINTER, -EIO,
+	"STATUS_FVE_BAD_METADATA_POINTER"},
+	{STATUS_FVE_OLD_METADATA_COPY, -EIO, "STATUS_FVE_OLD_METADATA_COPY"},
+	{STATUS_FVE_REBOOT_REQUIRED, -EIO, "STATUS_FVE_REBOOT_REQUIRED"},
+	{STATUS_FVE_RAW_ACCESS, -EIO, "STATUS_FVE_RAW_ACCESS"},
+	{STATUS_FVE_RAW_BLOCKED, -EIO, "STATUS_FVE_RAW_BLOCKED"},
+	{STATUS_FWP_CALLOUT_NOT_FOUND, -EIO, "STATUS_FWP_CALLOUT_NOT_FOUND"},
+	{STATUS_FWP_CONDITION_NOT_FOUND, -EIO,
+	"STATUS_FWP_CONDITION_NOT_FOUND"},
+	{STATUS_FWP_FILTER_NOT_FOUND, -EIO, "STATUS_FWP_FILTER_NOT_FOUND"},
+	{STATUS_FWP_LAYER_NOT_FOUND, -EIO, "STATUS_FWP_LAYER_NOT_FOUND"},
+	{STATUS_FWP_PROVIDER_NOT_FOUND, -EIO, "STATUS_FWP_PROVIDER_NOT_FOUND"},
+	{STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND, -EIO,
+	"STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND"},
+	{STATUS_FWP_SUBLAYER_NOT_FOUND, -EIO, "STATUS_FWP_SUBLAYER_NOT_FOUND"},
+	{STATUS_FWP_NOT_FOUND, -EIO, "STATUS_FWP_NOT_FOUND"},
+	{STATUS_FWP_ALREADY_EXISTS, -EIO, "STATUS_FWP_ALREADY_EXISTS"},
+	{STATUS_FWP_IN_USE, -EIO, "STATUS_FWP_IN_USE"},
+	{STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS, -EIO,
+	"STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS"},
+	{STATUS_FWP_WRONG_SESSION, -EIO, "STATUS_FWP_WRONG_SESSION"},
+	{STATUS_FWP_NO_TXN_IN_PROGRESS, -EIO, "STATUS_FWP_NO_TXN_IN_PROGRESS"},
+	{STATUS_FWP_TXN_IN_PROGRESS, -EIO, "STATUS_FWP_TXN_IN_PROGRESS"},
+	{STATUS_FWP_TXN_ABORTED, -EIO, "STATUS_FWP_TXN_ABORTED"},
+	{STATUS_FWP_SESSION_ABORTED, -EIO, "STATUS_FWP_SESSION_ABORTED"},
+	{STATUS_FWP_INCOMPATIBLE_TXN, -EIO, "STATUS_FWP_INCOMPATIBLE_TXN"},
+	{STATUS_FWP_TIMEOUT, -ETIMEDOUT, "STATUS_FWP_TIMEOUT"},
+	{STATUS_FWP_NET_EVENTS_DISABLED, -EIO,
+	"STATUS_FWP_NET_EVENTS_DISABLED"},
+	{STATUS_FWP_INCOMPATIBLE_LAYER, -EIO, "STATUS_FWP_INCOMPATIBLE_LAYER"},
+	{STATUS_FWP_KM_CLIENTS_ONLY, -EIO, "STATUS_FWP_KM_CLIENTS_ONLY"},
+	{STATUS_FWP_LIFETIME_MISMATCH, -EIO, "STATUS_FWP_LIFETIME_MISMATCH"},
+	{STATUS_FWP_BUILTIN_OBJECT, -EIO, "STATUS_FWP_BUILTIN_OBJECT"},
+	{STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS, -EIO,
+	"STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS"},
+	{STATUS_FWP_TOO_MANY_CALLOUTS, -EIO, "STATUS_FWP_TOO_MANY_CALLOUTS"},
+	{STATUS_FWP_NOTIFICATION_DROPPED, -EIO,
+	"STATUS_FWP_NOTIFICATION_DROPPED"},
+	{STATUS_FWP_TRAFFIC_MISMATCH, -EIO, "STATUS_FWP_TRAFFIC_MISMATCH"},
+	{STATUS_FWP_INCOMPATIBLE_SA_STATE, -EIO,
+	"STATUS_FWP_INCOMPATIBLE_SA_STATE"},
+	{STATUS_FWP_NULL_POINTER, -EIO, "STATUS_FWP_NULL_POINTER"},
+	{STATUS_FWP_INVALID_ENUMERATOR, -EIO, "STATUS_FWP_INVALID_ENUMERATOR"},
+	{STATUS_FWP_INVALID_FLAGS, -EIO, "STATUS_FWP_INVALID_FLAGS"},
+	{STATUS_FWP_INVALID_NET_MASK, -EIO, "STATUS_FWP_INVALID_NET_MASK"},
+	{STATUS_FWP_INVALID_RANGE, -EIO, "STATUS_FWP_INVALID_RANGE"},
+	{STATUS_FWP_INVALID_INTERVAL, -EIO, "STATUS_FWP_INVALID_INTERVAL"},
+	{STATUS_FWP_ZERO_LENGTH_ARRAY, -EIO, "STATUS_FWP_ZERO_LENGTH_ARRAY"},
+	{STATUS_FWP_NULL_DISPLAY_NAME, -EIO, "STATUS_FWP_NULL_DISPLAY_NAME"},
+	{STATUS_FWP_INVALID_ACTION_TYPE, -EIO,
+	"STATUS_FWP_INVALID_ACTION_TYPE"},
+	{STATUS_FWP_INVALID_WEIGHT, -EIO, "STATUS_FWP_INVALID_WEIGHT"},
+	{STATUS_FWP_MATCH_TYPE_MISMATCH, -EIO,
+	"STATUS_FWP_MATCH_TYPE_MISMATCH"},
+	{STATUS_FWP_TYPE_MISMATCH, -EIO, "STATUS_FWP_TYPE_MISMATCH"},
+	{STATUS_FWP_OUT_OF_BOUNDS, -EIO, "STATUS_FWP_OUT_OF_BOUNDS"},
+	{STATUS_FWP_RESERVED, -EIO, "STATUS_FWP_RESERVED"},
+	{STATUS_FWP_DUPLICATE_CONDITION, -EIO,
+	"STATUS_FWP_DUPLICATE_CONDITION"},
+	{STATUS_FWP_DUPLICATE_KEYMOD, -EIO, "STATUS_FWP_DUPLICATE_KEYMOD"},
+	{STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER, -EIO,
+	"STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER"},
+	{STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER, -EIO,
+	"STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER"},
+	{STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER, -EIO,
+	"STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER"},
+	{STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT, -EIO,
+	"STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT"},
+	{STATUS_FWP_INCOMPATIBLE_AUTH_METHOD, -EIO,
+	"STATUS_FWP_INCOMPATIBLE_AUTH_METHOD"},
+	{STATUS_FWP_INCOMPATIBLE_DH_GROUP, -EIO,
+	"STATUS_FWP_INCOMPATIBLE_DH_GROUP"},
+	{STATUS_FWP_EM_NOT_SUPPORTED, -EOPNOTSUPP,
+	"STATUS_FWP_EM_NOT_SUPPORTED"},
+	{STATUS_FWP_NEVER_MATCH, -EIO, "STATUS_FWP_NEVER_MATCH"},
+	{STATUS_FWP_PROVIDER_CONTEXT_MISMATCH, -EIO,
+	"STATUS_FWP_PROVIDER_CONTEXT_MISMATCH"},
+	{STATUS_FWP_INVALID_PARAMETER, -EIO, "STATUS_FWP_INVALID_PARAMETER"},
+	{STATUS_FWP_TOO_MANY_SUBLAYERS, -EIO, "STATUS_FWP_TOO_MANY_SUBLAYERS"},
+	{STATUS_FWP_CALLOUT_NOTIFICATION_FAILED, -EIO,
+	"STATUS_FWP_CALLOUT_NOTIFICATION_FAILED"},
+	{STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG, -EIO,
+	"STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG"},
+	{STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG, -EIO,
+	"STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG"},
+	{STATUS_FWP_TCPIP_NOT_READY, -EIO, "STATUS_FWP_TCPIP_NOT_READY"},
+	{STATUS_FWP_INJECT_HANDLE_CLOSING, -EIO,
+	"STATUS_FWP_INJECT_HANDLE_CLOSING"},
+	{STATUS_FWP_INJECT_HANDLE_STALE, -EIO,
+	"STATUS_FWP_INJECT_HANDLE_STALE"},
+	{STATUS_FWP_CANNOT_PEND, -EIO, "STATUS_FWP_CANNOT_PEND"},
+	{STATUS_NDIS_CLOSING, -EIO, "STATUS_NDIS_CLOSING"},
+	{STATUS_NDIS_BAD_VERSION, -EIO, "STATUS_NDIS_BAD_VERSION"},
+	{STATUS_NDIS_BAD_CHARACTERISTICS, -EIO,
+	"STATUS_NDIS_BAD_CHARACTERISTICS"},
+	{STATUS_NDIS_ADAPTER_NOT_FOUND, -EIO, "STATUS_NDIS_ADAPTER_NOT_FOUND"},
+	{STATUS_NDIS_OPEN_FAILED, -EIO, "STATUS_NDIS_OPEN_FAILED"},
+	{STATUS_NDIS_DEVICE_FAILED, -EIO, "STATUS_NDIS_DEVICE_FAILED"},
+	{STATUS_NDIS_MULTICAST_FULL, -EIO, "STATUS_NDIS_MULTICAST_FULL"},
+	{STATUS_NDIS_MULTICAST_EXISTS, -EIO, "STATUS_NDIS_MULTICAST_EXISTS"},
+	{STATUS_NDIS_MULTICAST_NOT_FOUND, -EIO,
+	"STATUS_NDIS_MULTICAST_NOT_FOUND"},
+	{STATUS_NDIS_REQUEST_ABORTED, -EIO, "STATUS_NDIS_REQUEST_ABORTED"},
+	{STATUS_NDIS_RESET_IN_PROGRESS, -EIO, "STATUS_NDIS_RESET_IN_PROGRESS"},
+	{STATUS_NDIS_INVALID_PACKET, -EIO, "STATUS_NDIS_INVALID_PACKET"},
+	{STATUS_NDIS_INVALID_DEVICE_REQUEST, -EIO,
+	"STATUS_NDIS_INVALID_DEVICE_REQUEST"},
+	{STATUS_NDIS_ADAPTER_NOT_READY, -EIO, "STATUS_NDIS_ADAPTER_NOT_READY"},
+	{STATUS_NDIS_INVALID_LENGTH, -EIO, "STATUS_NDIS_INVALID_LENGTH"},
+	{STATUS_NDIS_INVALID_DATA, -EIO, "STATUS_NDIS_INVALID_DATA"},
+	{STATUS_NDIS_BUFFER_TOO_SHORT, -ENOBUFS,
+	"STATUS_NDIS_BUFFER_TOO_SHORT"},
+	{STATUS_NDIS_INVALID_OID, -EIO, "STATUS_NDIS_INVALID_OID"},
+	{STATUS_NDIS_ADAPTER_REMOVED, -EIO, "STATUS_NDIS_ADAPTER_REMOVED"},
+	{STATUS_NDIS_UNSUPPORTED_MEDIA, -EIO, "STATUS_NDIS_UNSUPPORTED_MEDIA"},
+	{STATUS_NDIS_GROUP_ADDRESS_IN_USE, -EIO,
+	"STATUS_NDIS_GROUP_ADDRESS_IN_USE"},
+	{STATUS_NDIS_FILE_NOT_FOUND, -EIO, "STATUS_NDIS_FILE_NOT_FOUND"},
+	{STATUS_NDIS_ERROR_READING_FILE, -EIO,
+	"STATUS_NDIS_ERROR_READING_FILE"},
+	{STATUS_NDIS_ALREADY_MAPPED, -EIO, "STATUS_NDIS_ALREADY_MAPPED"},
+	{STATUS_NDIS_RESOURCE_CONFLICT, -EIO, "STATUS_NDIS_RESOURCE_CONFLICT"},
+	{STATUS_NDIS_MEDIA_DISCONNECTED, -EIO,
+	"STATUS_NDIS_MEDIA_DISCONNECTED"},
+	{STATUS_NDIS_INVALID_ADDRESS, -EIO, "STATUS_NDIS_INVALID_ADDRESS"},
+	{STATUS_NDIS_PAUSED, -EIO, "STATUS_NDIS_PAUSED"},
+	{STATUS_NDIS_INTERFACE_NOT_FOUND, -EIO,
+	"STATUS_NDIS_INTERFACE_NOT_FOUND"},
+	{STATUS_NDIS_UNSUPPORTED_REVISION, -EIO,
+	"STATUS_NDIS_UNSUPPORTED_REVISION"},
+	{STATUS_NDIS_INVALID_PORT, -EIO, "STATUS_NDIS_INVALID_PORT"},
+	{STATUS_NDIS_INVALID_PORT_STATE, -EIO,
+	"STATUS_NDIS_INVALID_PORT_STATE"},
+	{STATUS_NDIS_LOW_POWER_STATE, -EIO, "STATUS_NDIS_LOW_POWER_STATE"},
+	{STATUS_NDIS_NOT_SUPPORTED, -ENOSYS, "STATUS_NDIS_NOT_SUPPORTED"},
+	{STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED, -EIO,
+	"STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED"},
+	{STATUS_NDIS_DOT11_MEDIA_IN_USE, -EIO,
+	"STATUS_NDIS_DOT11_MEDIA_IN_USE"},
+	{STATUS_NDIS_DOT11_POWER_STATE_INVALID, -EIO,
+	"STATUS_NDIS_DOT11_POWER_STATE_INVALID"},
+	{STATUS_IPSEC_BAD_SPI, -EIO, "STATUS_IPSEC_BAD_SPI"},
+	{STATUS_IPSEC_SA_LIFETIME_EXPIRED, -EIO,
+	"STATUS_IPSEC_SA_LIFETIME_EXPIRED"},
+	{STATUS_IPSEC_WRONG_SA, -EIO, "STATUS_IPSEC_WRONG_SA"},
+	{STATUS_IPSEC_REPLAY_CHECK_FAILED, -EIO,
+	"STATUS_IPSEC_REPLAY_CHECK_FAILED"},
+	{STATUS_IPSEC_INVALID_PACKET, -EIO, "STATUS_IPSEC_INVALID_PACKET"},
+	{STATUS_IPSEC_INTEGRITY_CHECK_FAILED, -EIO,
+	"STATUS_IPSEC_INTEGRITY_CHECK_FAILED"},
+	{STATUS_IPSEC_CLEAR_TEXT_DROP, -EIO, "STATUS_IPSEC_CLEAR_TEXT_DROP"},
+	{0, 0, NULL}
+};
+
+/*****************************************************************************
+ Print an error message from the status code
+ *****************************************************************************/
+static void
+smb2_print_status(__le32 status)
+{
+	int idx = 0;
+
+	while (smb2_error_map_table[idx].status_string != NULL) {
+		if ((smb2_error_map_table[idx].smb2_status) == status) {
+			pr_notice("Status code returned 0x%08x %s\n", status,
+				  smb2_error_map_table[idx].status_string);
+		}
+		idx++;
+	}
+	return;
+}
+
+int
+map_smb2_to_linux_error(char *buf, bool log_err)
+{
+	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+	unsigned int i;
+	int rc = -EIO;
+	__le32 smb2err = hdr->Status;
+
+	if (smb2err == 0)
+		return 0;
+
+	/* mask facility */
+	if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) &&
+	    (smb2err != STATUS_END_OF_FILE))
+		smb2_print_status(smb2err);
+	else if (cifsFYI & CIFS_RC)
+		smb2_print_status(smb2err);
+
+	for (i = 0; i < sizeof(smb2_error_map_table) /
+			sizeof(struct status_to_posix_error); i++) {
+		if (smb2_error_map_table[i].smb2_status == smb2err) {
+			rc = smb2_error_map_table[i].posix_error;
+			break;
+		}
+	}
+
+	/* on error mapping not found  - return EIO */
+
+	cifs_dbg(FYI, "Mapping SMB2 status code %d to POSIX err %d\n",
+		 smb2err, rc);
+
+	return rc;
+}
diff --git a/kernel/fs/cifs/smb2misc.c b/kernel/fs/cifs/smb2misc.c
new file mode 100644
index 000000000..1c5907019
--- /dev/null
+++ b/kernel/fs/cifs/smb2misc.c
@@ -0,0 +1,632 @@
+/*
+ *   fs/cifs/smb2misc.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2011
+ *                 Etersoft, 2012
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/ctype.h>
+#include "smb2pdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "smb2proto.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+#include "smb2status.h"
+
+static int
+check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
+{
+	__u64 wire_mid = le64_to_cpu(hdr->MessageId);
+
+	/*
+	 * Make sure that this really is an SMB, that it is a response,
+	 * and that the message ids match.
+	 */
+	if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
+	    (mid == wire_mid)) {
+		if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
+			return 0;
+		else {
+			/* only one valid case where server sends us request */
+			if (hdr->Command == SMB2_OPLOCK_BREAK)
+				return 0;
+			else
+				cifs_dbg(VFS, "Received Request not response\n");
+		}
+	} else { /* bad signature or mid */
+		if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
+			cifs_dbg(VFS, "Bad protocol string signature header %x\n",
+				 *(unsigned int *) hdr->ProtocolId);
+		if (mid != wire_mid)
+			cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
+				 mid, wire_mid);
+	}
+	cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", wire_mid);
+	return 1;
+}
+
+/*
+ *  The following table defines the expected "StructureSize" of SMB2 responses
+ *  in order by SMB2 command.  This is similar to "wct" in SMB/CIFS responses.
+ *
+ *  Note that commands are defined in smb2pdu.h in le16 but the array below is
+ *  indexed by command in host byte order
+ */
+static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
+	/* SMB2_NEGOTIATE */ cpu_to_le16(65),
+	/* SMB2_SESSION_SETUP */ cpu_to_le16(9),
+	/* SMB2_LOGOFF */ cpu_to_le16(4),
+	/* SMB2_TREE_CONNECT */ cpu_to_le16(16),
+	/* SMB2_TREE_DISCONNECT */ cpu_to_le16(4),
+	/* SMB2_CREATE */ cpu_to_le16(89),
+	/* SMB2_CLOSE */ cpu_to_le16(60),
+	/* SMB2_FLUSH */ cpu_to_le16(4),
+	/* SMB2_READ */ cpu_to_le16(17),
+	/* SMB2_WRITE */ cpu_to_le16(17),
+	/* SMB2_LOCK */ cpu_to_le16(4),
+	/* SMB2_IOCTL */ cpu_to_le16(49),
+	/* BB CHECK this ... not listed in documentation */
+	/* SMB2_CANCEL */ cpu_to_le16(0),
+	/* SMB2_ECHO */ cpu_to_le16(4),
+	/* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9),
+	/* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9),
+	/* SMB2_QUERY_INFO */ cpu_to_le16(9),
+	/* SMB2_SET_INFO */ cpu_to_le16(2),
+	/* BB FIXME can also be 44 for lease break */
+	/* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
+};
+
+int
+smb2_check_message(char *buf, unsigned int length)
+{
+	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+	struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
+	__u64 mid = le64_to_cpu(hdr->MessageId);
+	__u32 len = get_rfc1002_length(buf);
+	__u32 clc_len;  /* calculated length */
+	int command;
+
+	/* BB disable following printk later */
+	cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n",
+		 __func__, length, len);
+
+	/*
+	 * Add function to do table lookup of StructureSize by command
+	 * ie Validate the wct via smb2_struct_sizes table above
+	 */
+
+	if (length < sizeof(struct smb2_pdu)) {
+		if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
+			pdu->StructureSize2 = 0;
+			/*
+			 * As with SMB/CIFS, on some error cases servers may
+			 * not return wct properly
+			 */
+			return 0;
+		} else {
+			cifs_dbg(VFS, "Length less than SMB header size\n");
+		}
+		return 1;
+	}
+	if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) {
+		cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
+			 mid);
+		return 1;
+	}
+
+	if (check_smb2_hdr(hdr, mid))
+		return 1;
+
+	if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
+		cifs_dbg(VFS, "Illegal structure size %u\n",
+			 le16_to_cpu(hdr->StructureSize));
+		return 1;
+	}
+
+	command = le16_to_cpu(hdr->Command);
+	if (command >= NUMBER_OF_SMB2_COMMANDS) {
+		cifs_dbg(VFS, "Illegal SMB2 command %d\n", command);
+		return 1;
+	}
+
+	if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
+		if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 ||
+		    pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
+			/* error packets have 9 byte structure size */
+			cifs_dbg(VFS, "Illegal response size %u for command %d\n",
+				 le16_to_cpu(pdu->StructureSize2), command);
+			return 1;
+		} else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0)
+			   && (le16_to_cpu(pdu->StructureSize2) != 44)
+			   && (le16_to_cpu(pdu->StructureSize2) != 36)) {
+			/* special case for SMB2.1 lease break message */
+			cifs_dbg(VFS, "Illegal response size %d for oplock break\n",
+				 le16_to_cpu(pdu->StructureSize2));
+			return 1;
+		}
+	}
+
+	if (4 + len != length) {
+		cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n",
+			 length, 4 + len, mid);
+		return 1;
+	}
+
+	clc_len = smb2_calc_size(hdr);
+
+	if (4 + len != clc_len) {
+		cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
+			 clc_len, 4 + len, mid);
+		/* create failed on symlink */
+		if (command == SMB2_CREATE_HE &&
+		    hdr->Status == STATUS_STOPPED_ON_SYMLINK)
+			return 0;
+		/* Windows 7 server returns 24 bytes more */
+		if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
+			return 0;
+		/* server can return one byte more due to implied bcc[0] */
+		if (clc_len == 4 + len + 1)
+			return 0;
+
+		/*
+		 * MacOS server pads after SMB2.1 write response with 3 bytes
+		 * of junk. Other servers match RFC1001 len to actual
+		 * SMB2/SMB3 frame length (header + smb2 response specific data)
+		 * Log the server error (once), but allow it and continue
+		 * since the frame is parseable.
+		 */
+		if (clc_len < 4 /* RFC1001 header size */ + len) {
+			printk_once(KERN_WARNING
+				"SMB2 server sent bad RFC1001 len %d not %d\n",
+				len, clc_len - 4);
+			return 0;
+		}
+
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * The size of the variable area depends on the offset and length fields
+ * located in different fields for various SMB2 responses. SMB2 responses
+ * with no variable length info, show an offset of zero for the offset field.
+ */
+static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
+	/* SMB2_NEGOTIATE */ true,
+	/* SMB2_SESSION_SETUP */ true,
+	/* SMB2_LOGOFF */ false,
+	/* SMB2_TREE_CONNECT */	false,
+	/* SMB2_TREE_DISCONNECT */ false,
+	/* SMB2_CREATE */ true,
+	/* SMB2_CLOSE */ false,
+	/* SMB2_FLUSH */ false,
+	/* SMB2_READ */	true,
+	/* SMB2_WRITE */ false,
+	/* SMB2_LOCK */	false,
+	/* SMB2_IOCTL */ true,
+	/* SMB2_CANCEL */ false, /* BB CHECK this not listed in documentation */
+	/* SMB2_ECHO */ false,
+	/* SMB2_QUERY_DIRECTORY */ true,
+	/* SMB2_CHANGE_NOTIFY */ true,
+	/* SMB2_QUERY_INFO */ true,
+	/* SMB2_SET_INFO */ false,
+	/* SMB2_OPLOCK_BREAK */ false
+};
+
+/*
+ * Returns the pointer to the beginning of the data area. Length of the data
+ * area and the offset to it (from the beginning of the smb are also returned.
+ */
+char *
+smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
+{
+	*off = 0;
+	*len = 0;
+
+	/* error responses do not have data area */
+	if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
+	    (((struct smb2_err_rsp *)hdr)->StructureSize) ==
+						SMB2_ERROR_STRUCTURE_SIZE2)
+		return NULL;
+
+	/*
+	 * Following commands have data areas so we have to get the location
+	 * of the data buffer offset and data buffer length for the particular
+	 * command.
+	 */
+	switch (hdr->Command) {
+	case SMB2_NEGOTIATE:
+		*off = le16_to_cpu(
+		    ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset);
+		*len = le16_to_cpu(
+		    ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferLength);
+		break;
+	case SMB2_SESSION_SETUP:
+		*off = le16_to_cpu(
+		    ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferOffset);
+		*len = le16_to_cpu(
+		    ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferLength);
+		break;
+	case SMB2_CREATE:
+		*off = le32_to_cpu(
+		    ((struct smb2_create_rsp *)hdr)->CreateContextsOffset);
+		*len = le32_to_cpu(
+		    ((struct smb2_create_rsp *)hdr)->CreateContextsLength);
+		break;
+	case SMB2_QUERY_INFO:
+		*off = le16_to_cpu(
+		    ((struct smb2_query_info_rsp *)hdr)->OutputBufferOffset);
+		*len = le32_to_cpu(
+		    ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength);
+		break;
+	case SMB2_READ:
+		*off = ((struct smb2_read_rsp *)hdr)->DataOffset;
+		*len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength);
+		break;
+	case SMB2_QUERY_DIRECTORY:
+		*off = le16_to_cpu(
+		  ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset);
+		*len = le32_to_cpu(
+		  ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
+		break;
+	case SMB2_IOCTL:
+		*off = le32_to_cpu(
+		  ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
+		*len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+		break;
+	case SMB2_CHANGE_NOTIFY:
+	default:
+		/* BB FIXME for unimplemented cases above */
+		cifs_dbg(VFS, "no length check for command\n");
+		break;
+	}
+
+	/*
+	 * Invalid length or offset probably means data area is invalid, but
+	 * we have little choice but to ignore the data area in this case.
+	 */
+	if (*off > 4096) {
+		cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off);
+		*len = 0;
+		*off = 0;
+	} else if (*off < 0) {
+		cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n",
+			 *off);
+		*off = 0;
+		*len = 0;
+	} else if (*len < 0) {
+		cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n",
+			 *len);
+		*len = 0;
+	} else if (*len > 128 * 1024) {
+		cifs_dbg(VFS, "data area larger than 128K: %d\n", *len);
+		*len = 0;
+	}
+
+	/* return pointer to beginning of data area, ie offset from SMB start */
+	if ((*off != 0) && (*len != 0))
+		return (char *)(&hdr->ProtocolId[0]) + *off;
+	else
+		return NULL;
+}
+
+/*
+ * Calculate the size of the SMB message based on the fixed header
+ * portion, the number of word parameters and the data portion of the message.
+ */
+unsigned int
+smb2_calc_size(void *buf)
+{
+	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+	struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
+	int offset; /* the offset from the beginning of SMB to data area */
+	int data_length; /* the length of the variable length data area */
+	/* Structure Size has already been checked to make sure it is 64 */
+	int len = 4 + le16_to_cpu(pdu->hdr.StructureSize);
+
+	/*
+	 * StructureSize2, ie length of fixed parameter area has already
+	 * been checked to make sure it is the correct length.
+	 */
+	len += le16_to_cpu(pdu->StructureSize2);
+
+	if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false)
+		goto calc_size_exit;
+
+	smb2_get_data_area_len(&offset, &data_length, hdr);
+	cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset);
+
+	if (data_length > 0) {
+		/*
+		 * Check to make sure that data area begins after fixed area,
+		 * Note that last byte of the fixed area is part of data area
+		 * for some commands, typically those with odd StructureSize,
+		 * so we must add one to the calculation (and 4 to account for
+		 * the size of the RFC1001 hdr.
+		 */
+		if (offset + 4 + 1 < len) {
+			cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n",
+				 offset + 4 + 1, len);
+			data_length = 0;
+		} else {
+			len = 4 + offset + data_length;
+		}
+	}
+calc_size_exit:
+	cifs_dbg(FYI, "SMB2 len %d\n", len);
+	return len;
+}
+
+/* Note: caller must free return buffer */
+__le16 *
+cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
+{
+	int len;
+	const char *start_of_path;
+	__le16 *to;
+	int map_type;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
+		map_type = SFM_MAP_UNI_RSVD;
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
+		map_type = SFU_MAP_UNI_RSVD;
+	else
+		map_type = NO_MAP_UNI_RSVD;
+
+	/* Windows doesn't allow paths beginning with \ */
+	if (from[0] == '\\')
+		start_of_path = from + 1;
+	else
+		start_of_path = from;
+	to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len,
+				   cifs_sb->local_nls, map_type);
+	return to;
+}
+
+__le32
+smb2_get_lease_state(struct cifsInodeInfo *cinode)
+{
+	__le32 lease = 0;
+
+	if (CIFS_CACHE_WRITE(cinode))
+		lease |= SMB2_LEASE_WRITE_CACHING;
+	if (CIFS_CACHE_HANDLE(cinode))
+		lease |= SMB2_LEASE_HANDLE_CACHING;
+	if (CIFS_CACHE_READ(cinode))
+		lease |= SMB2_LEASE_READ_CACHING;
+	return lease;
+}
+
+struct smb2_lease_break_work {
+	struct work_struct lease_break;
+	struct tcon_link *tlink;
+	__u8 lease_key[16];
+	__le32 lease_state;
+};
+
+static void
+cifs_ses_oplock_break(struct work_struct *work)
+{
+	struct smb2_lease_break_work *lw = container_of(work,
+				struct smb2_lease_break_work, lease_break);
+	int rc;
+
+	rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key,
+			      lw->lease_state);
+	cifs_dbg(FYI, "Lease release rc %d\n", rc);
+	cifs_put_tlink(lw->tlink);
+	kfree(lw);
+}
+
+static bool
+smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
+		    struct smb2_lease_break_work *lw)
+{
+	bool found;
+	__u8 lease_state;
+	struct list_head *tmp;
+	struct cifsFileInfo *cfile;
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifs_pending_open *open;
+	struct cifsInodeInfo *cinode;
+	int ack_req = le32_to_cpu(rsp->Flags &
+				  SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
+
+	lease_state = le32_to_cpu(rsp->NewLeaseState);
+
+	list_for_each(tmp, &tcon->openFileList) {
+		cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+		cinode = CIFS_I(d_inode(cfile->dentry));
+
+		if (memcmp(cinode->lease_key, rsp->LeaseKey,
+							SMB2_LEASE_KEY_SIZE))
+			continue;
+
+		cifs_dbg(FYI, "found in the open list\n");
+		cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
+			 le32_to_cpu(rsp->NewLeaseState));
+
+		server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
+
+		if (ack_req)
+			cfile->oplock_break_cancelled = false;
+		else
+			cfile->oplock_break_cancelled = true;
+
+		queue_work(cifsiod_wq, &cfile->oplock_break);
+		kfree(lw);
+		return true;
+	}
+
+	found = false;
+	list_for_each_entry(open, &tcon->pending_opens, olist) {
+		if (memcmp(open->lease_key, rsp->LeaseKey,
+			   SMB2_LEASE_KEY_SIZE))
+			continue;
+
+		if (!found && ack_req) {
+			found = true;
+			memcpy(lw->lease_key, open->lease_key,
+			       SMB2_LEASE_KEY_SIZE);
+			lw->tlink = cifs_get_tlink(open->tlink);
+			queue_work(cifsiod_wq, &lw->lease_break);
+		}
+
+		cifs_dbg(FYI, "found in the pending open list\n");
+		cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
+			 le32_to_cpu(rsp->NewLeaseState));
+
+		open->oplock = lease_state;
+	}
+	return found;
+}
+
+static bool
+smb2_is_valid_lease_break(char *buffer)
+{
+	struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
+	struct list_head *tmp, *tmp1, *tmp2;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct smb2_lease_break_work *lw;
+
+	lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
+	if (!lw)
+		return false;
+
+	INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
+	lw->lease_state = rsp->NewLeaseState;
+
+	cifs_dbg(FYI, "Checking for lease break\n");
+
+	/* look up tcon based on tid & uid */
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &cifs_tcp_ses_list) {
+		server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list);
+
+		list_for_each(tmp1, &server->smb_ses_list) {
+			ses = list_entry(tmp1, struct cifs_ses, smb_ses_list);
+
+			spin_lock(&cifs_file_list_lock);
+			list_for_each(tmp2, &ses->tcon_list) {
+				tcon = list_entry(tmp2, struct cifs_tcon,
+						  tcon_list);
+				cifs_stats_inc(
+				    &tcon->stats.cifs_stats.num_oplock_brks);
+				if (smb2_tcon_has_lease(tcon, rsp, lw)) {
+					spin_unlock(&cifs_file_list_lock);
+					spin_unlock(&cifs_tcp_ses_lock);
+					return true;
+				}
+			}
+			spin_unlock(&cifs_file_list_lock);
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+	kfree(lw);
+	cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
+	return false;
+}
+
+bool
+smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
+{
+	struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
+	struct list_head *tmp, *tmp1, *tmp2;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct cifsInodeInfo *cinode;
+	struct cifsFileInfo *cfile;
+
+	cifs_dbg(FYI, "Checking for oplock break\n");
+
+	if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
+		return false;
+
+	if (rsp->StructureSize !=
+				smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
+		if (le16_to_cpu(rsp->StructureSize) == 44)
+			return smb2_is_valid_lease_break(buffer);
+		else
+			return false;
+	}
+
+	cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
+
+	/* look up tcon based on tid & uid */
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+		list_for_each(tmp1, &ses->tcon_list) {
+			tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+
+			cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
+			spin_lock(&cifs_file_list_lock);
+			list_for_each(tmp2, &tcon->openFileList) {
+				cfile = list_entry(tmp2, struct cifsFileInfo,
+						     tlist);
+				if (rsp->PersistentFid !=
+				    cfile->fid.persistent_fid ||
+				    rsp->VolatileFid !=
+				    cfile->fid.volatile_fid)
+					continue;
+
+				cifs_dbg(FYI, "file id match, oplock break\n");
+				cinode = CIFS_I(d_inode(cfile->dentry));
+
+				if (!CIFS_CACHE_WRITE(cinode) &&
+				    rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE)
+					cfile->oplock_break_cancelled = true;
+				else
+					cfile->oplock_break_cancelled = false;
+
+				set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+					&cinode->flags);
+
+				/*
+				 * Set flag if the server downgrades the oplock
+				 * to L2 else clear.
+				 */
+				if (rsp->OplockLevel)
+					set_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &cinode->flags);
+				else
+					clear_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &cinode->flags);
+
+				queue_work(cifsiod_wq, &cfile->oplock_break);
+
+				spin_unlock(&cifs_file_list_lock);
+				spin_unlock(&cifs_tcp_ses_lock);
+				return true;
+			}
+			spin_unlock(&cifs_file_list_lock);
+			spin_unlock(&cifs_tcp_ses_lock);
+			cifs_dbg(FYI, "No matching file for oplock break\n");
+			return true;
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+	cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
+	return false;
+}
diff --git a/kernel/fs/cifs/smb2ops.c b/kernel/fs/cifs/smb2ops.c
new file mode 100644
index 000000000..54daee5ad
--- /dev/null
+++ b/kernel/fs/cifs/smb2ops.c
@@ -0,0 +1,1716 @@
+/*
+ *  SMB2 version specific operations
+ *
+ *  Copyright (c) 2012, Jeff Layton <jlayton@redhat.com>
+ *
+ *  This library is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License v2 as published
+ *  by the Free Software Foundation.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *  the GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/pagemap.h>
+#include <linux/vfs.h>
+#include <linux/falloc.h>
+#include "cifsglob.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+#include "smb2status.h"
+#include "smb2glob.h"
+
+static int
+change_conf(struct TCP_Server_Info *server)
+{
+	server->credits += server->echo_credits + server->oplock_credits;
+	server->oplock_credits = server->echo_credits = 0;
+	switch (server->credits) {
+	case 0:
+		return -1;
+	case 1:
+		server->echoes = false;
+		server->oplocks = false;
+		cifs_dbg(VFS, "disabling echoes and oplocks\n");
+		break;
+	case 2:
+		server->echoes = true;
+		server->oplocks = false;
+		server->echo_credits = 1;
+		cifs_dbg(FYI, "disabling oplocks\n");
+		break;
+	default:
+		server->echoes = true;
+		server->oplocks = true;
+		server->echo_credits = 1;
+		server->oplock_credits = 1;
+	}
+	server->credits -= server->echo_credits + server->oplock_credits;
+	return 0;
+}
+
+static void
+smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
+		 const int optype)
+{
+	int *val, rc = 0;
+	spin_lock(&server->req_lock);
+	val = server->ops->get_credits_field(server, optype);
+	*val += add;
+	server->in_flight--;
+	if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP)
+		rc = change_conf(server);
+	/*
+	 * Sometimes server returns 0 credits on oplock break ack - we need to
+	 * rebalance credits in this case.
+	 */
+	else if (server->in_flight > 0 && server->oplock_credits == 0 &&
+		 server->oplocks) {
+		if (server->credits > 1) {
+			server->credits--;
+			server->oplock_credits++;
+		}
+	}
+	spin_unlock(&server->req_lock);
+	wake_up(&server->request_q);
+	if (rc)
+		cifs_reconnect(server);
+}
+
+static void
+smb2_set_credits(struct TCP_Server_Info *server, const int val)
+{
+	spin_lock(&server->req_lock);
+	server->credits = val;
+	spin_unlock(&server->req_lock);
+}
+
+static int *
+smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
+{
+	switch (optype) {
+	case CIFS_ECHO_OP:
+		return &server->echo_credits;
+	case CIFS_OBREAK_OP:
+		return &server->oplock_credits;
+	default:
+		return &server->credits;
+	}
+}
+
+static unsigned int
+smb2_get_credits(struct mid_q_entry *mid)
+{
+	return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest);
+}
+
+static int
+smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
+		      unsigned int *num, unsigned int *credits)
+{
+	int rc = 0;
+	unsigned int scredits;
+
+	spin_lock(&server->req_lock);
+	while (1) {
+		if (server->credits <= 0) {
+			spin_unlock(&server->req_lock);
+			cifs_num_waiters_inc(server);
+			rc = wait_event_killable(server->request_q,
+					has_credits(server, &server->credits));
+			cifs_num_waiters_dec(server);
+			if (rc)
+				return rc;
+			spin_lock(&server->req_lock);
+		} else {
+			if (server->tcpStatus == CifsExiting) {
+				spin_unlock(&server->req_lock);
+				return -ENOENT;
+			}
+
+			scredits = server->credits;
+			/* can deadlock with reopen */
+			if (scredits == 1) {
+				*num = SMB2_MAX_BUFFER_SIZE;
+				*credits = 0;
+				break;
+			}
+
+			/* leave one credit for a possible reopen */
+			scredits--;
+			*num = min_t(unsigned int, size,
+				     scredits * SMB2_MAX_BUFFER_SIZE);
+
+			*credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
+			server->credits -= *credits;
+			server->in_flight++;
+			break;
+		}
+	}
+	spin_unlock(&server->req_lock);
+	return rc;
+}
+
+static __u64
+smb2_get_next_mid(struct TCP_Server_Info *server)
+{
+	__u64 mid;
+	/* for SMB2 we need the current value */
+	spin_lock(&GlobalMid_Lock);
+	mid = server->CurrentMid++;
+	spin_unlock(&GlobalMid_Lock);
+	return mid;
+}
+
+static struct mid_q_entry *
+smb2_find_mid(struct TCP_Server_Info *server, char *buf)
+{
+	struct mid_q_entry *mid;
+	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+	__u64 wire_mid = le64_to_cpu(hdr->MessageId);
+
+	spin_lock(&GlobalMid_Lock);
+	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+		if ((mid->mid == wire_mid) &&
+		    (mid->mid_state == MID_REQUEST_SUBMITTED) &&
+		    (mid->command == hdr->Command)) {
+			spin_unlock(&GlobalMid_Lock);
+			return mid;
+		}
+	}
+	spin_unlock(&GlobalMid_Lock);
+	return NULL;
+}
+
+static void
+smb2_dump_detail(void *buf)
+{
+#ifdef CONFIG_CIFS_DEBUG2
+	struct smb2_hdr *smb = (struct smb2_hdr *)buf;
+
+	cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
+		 smb->Command, smb->Status, smb->Flags, smb->MessageId,
+		 smb->ProcessId);
+	cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb));
+#endif
+}
+
+static bool
+smb2_need_neg(struct TCP_Server_Info *server)
+{
+	return server->max_read == 0;
+}
+
+static int
+smb2_negotiate(const unsigned int xid, struct cifs_ses *ses)
+{
+	int rc;
+	ses->server->CurrentMid = 0;
+	rc = SMB2_negotiate(xid, ses);
+	/* BB we probably don't need to retry with modern servers */
+	if (rc == -EAGAIN)
+		rc = -EHOSTDOWN;
+	return rc;
+}
+
+static unsigned int
+smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+	struct TCP_Server_Info *server = tcon->ses->server;
+	unsigned int wsize;
+
+	/* start with specified wsize, or default */
+	wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
+	wsize = min_t(unsigned int, wsize, server->max_write);
+
+	if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+		wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
+
+	return wsize;
+}
+
+static unsigned int
+smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+	struct TCP_Server_Info *server = tcon->ses->server;
+	unsigned int rsize;
+
+	/* start with specified rsize, or default */
+	rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
+	rsize = min_t(unsigned int, rsize, server->max_read);
+
+	if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+		rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
+
+	return rsize;
+}
+
+#ifdef CONFIG_CIFS_STATS2
+static int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	unsigned int ret_data_len = 0;
+	struct network_interface_info_ioctl_rsp *out_buf;
+
+	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+			FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
+			NULL /* no data input */, 0 /* no data input */,
+			(char **)&out_buf, &ret_data_len);
+	if (rc != 0)
+		cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
+	else if (ret_data_len < sizeof(struct network_interface_info_ioctl_rsp)) {
+		cifs_dbg(VFS, "server returned bad net interface info buf\n");
+		rc = -EINVAL;
+	} else {
+		/* Dump info on first interface */
+		cifs_dbg(FYI, "Adapter Capability 0x%x\t",
+			le32_to_cpu(out_buf->Capability));
+		cifs_dbg(FYI, "Link Speed %lld\n",
+			le64_to_cpu(out_buf->LinkSpeed));
+	}
+
+	return rc;
+}
+#endif /* STATS2 */
+
+static void
+smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	__le16 srch_path = 0; /* Null - open root of share */
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	if (rc)
+		return;
+
+#ifdef CONFIG_CIFS_STATS2
+	SMB3_request_interfaces(xid, tcon);
+#endif /* STATS2 */
+
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_ATTRIBUTE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_DEVICE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	return;
+}
+
+static void
+smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	__le16 srch_path = 0; /* Null - open root of share */
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	if (rc)
+		return;
+
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_ATTRIBUTE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_DEVICE_INFORMATION);
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	return;
+}
+
+static int
+smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
+			struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+	int rc;
+	__le16 *utf16_path;
+	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+	if (!utf16_path)
+		return -ENOMEM;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+	if (rc) {
+		kfree(utf16_path);
+		return rc;
+	}
+
+	rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	kfree(utf16_path);
+	return rc;
+}
+
+static int
+smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
+		  struct cifs_sb_info *cifs_sb, const char *full_path,
+		  u64 *uniqueid, FILE_ALL_INFO *data)
+{
+	*uniqueid = le64_to_cpu(data->IndexNumber);
+	return 0;
+}
+
+static int
+smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
+		     struct cifs_fid *fid, FILE_ALL_INFO *data)
+{
+	int rc;
+	struct smb2_file_all_info *smb2_data;
+
+	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
+			    GFP_KERNEL);
+	if (smb2_data == NULL)
+		return -ENOMEM;
+
+	rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid,
+			     smb2_data);
+	if (!rc)
+		move_smb2_info_to_cifs(data, smb2_data);
+	kfree(smb2_data);
+	return rc;
+}
+
+static bool
+smb2_can_echo(struct TCP_Server_Info *server)
+{
+	return server->echoes;
+}
+
+static void
+smb2_clear_stats(struct cifs_tcon *tcon)
+{
+#ifdef CONFIG_CIFS_STATS
+	int i;
+	for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) {
+		atomic_set(&tcon->stats.smb2_stats.smb2_com_sent[i], 0);
+		atomic_set(&tcon->stats.smb2_stats.smb2_com_failed[i], 0);
+	}
+#endif
+}
+
+static void
+smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
+{
+	seq_puts(m, "\n\tShare Capabilities:");
+	if (tcon->capabilities & SMB2_SHARE_CAP_DFS)
+		seq_puts(m, " DFS,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+		seq_puts(m, " CONTINUOUS AVAILABILITY,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT)
+		seq_puts(m, " SCALEOUT,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER)
+		seq_puts(m, " CLUSTER,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC)
+		seq_puts(m, " ASYMMETRIC,");
+	if (tcon->capabilities == 0)
+		seq_puts(m, " None");
+	if (tcon->ss_flags & SSINFO_FLAGS_ALIGNED_DEVICE)
+		seq_puts(m, " Aligned,");
+	if (tcon->ss_flags & SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE)
+		seq_puts(m, " Partition Aligned,");
+	if (tcon->ss_flags & SSINFO_FLAGS_NO_SEEK_PENALTY)
+		seq_puts(m, " SSD,");
+	if (tcon->ss_flags & SSINFO_FLAGS_TRIM_ENABLED)
+		seq_puts(m, " TRIM-support,");
+
+	seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+	if (tcon->perf_sector_size)
+		seq_printf(m, "\tOptimal sector size: 0x%x",
+			   tcon->perf_sector_size);
+}
+
+static void
+smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
+{
+#ifdef CONFIG_CIFS_STATS
+	atomic_t *sent = tcon->stats.smb2_stats.smb2_com_sent;
+	atomic_t *failed = tcon->stats.smb2_stats.smb2_com_failed;
+	seq_printf(m, "\nNegotiates: %d sent %d failed",
+		   atomic_read(&sent[SMB2_NEGOTIATE_HE]),
+		   atomic_read(&failed[SMB2_NEGOTIATE_HE]));
+	seq_printf(m, "\nSessionSetups: %d sent %d failed",
+		   atomic_read(&sent[SMB2_SESSION_SETUP_HE]),
+		   atomic_read(&failed[SMB2_SESSION_SETUP_HE]));
+	seq_printf(m, "\nLogoffs: %d sent %d failed",
+		   atomic_read(&sent[SMB2_LOGOFF_HE]),
+		   atomic_read(&failed[SMB2_LOGOFF_HE]));
+	seq_printf(m, "\nTreeConnects: %d sent %d failed",
+		   atomic_read(&sent[SMB2_TREE_CONNECT_HE]),
+		   atomic_read(&failed[SMB2_TREE_CONNECT_HE]));
+	seq_printf(m, "\nTreeDisconnects: %d sent %d failed",
+		   atomic_read(&sent[SMB2_TREE_DISCONNECT_HE]),
+		   atomic_read(&failed[SMB2_TREE_DISCONNECT_HE]));
+	seq_printf(m, "\nCreates: %d sent %d failed",
+		   atomic_read(&sent[SMB2_CREATE_HE]),
+		   atomic_read(&failed[SMB2_CREATE_HE]));
+	seq_printf(m, "\nCloses: %d sent %d failed",
+		   atomic_read(&sent[SMB2_CLOSE_HE]),
+		   atomic_read(&failed[SMB2_CLOSE_HE]));
+	seq_printf(m, "\nFlushes: %d sent %d failed",
+		   atomic_read(&sent[SMB2_FLUSH_HE]),
+		   atomic_read(&failed[SMB2_FLUSH_HE]));
+	seq_printf(m, "\nReads: %d sent %d failed",
+		   atomic_read(&sent[SMB2_READ_HE]),
+		   atomic_read(&failed[SMB2_READ_HE]));
+	seq_printf(m, "\nWrites: %d sent %d failed",
+		   atomic_read(&sent[SMB2_WRITE_HE]),
+		   atomic_read(&failed[SMB2_WRITE_HE]));
+	seq_printf(m, "\nLocks: %d sent %d failed",
+		   atomic_read(&sent[SMB2_LOCK_HE]),
+		   atomic_read(&failed[SMB2_LOCK_HE]));
+	seq_printf(m, "\nIOCTLs: %d sent %d failed",
+		   atomic_read(&sent[SMB2_IOCTL_HE]),
+		   atomic_read(&failed[SMB2_IOCTL_HE]));
+	seq_printf(m, "\nCancels: %d sent %d failed",
+		   atomic_read(&sent[SMB2_CANCEL_HE]),
+		   atomic_read(&failed[SMB2_CANCEL_HE]));
+	seq_printf(m, "\nEchos: %d sent %d failed",
+		   atomic_read(&sent[SMB2_ECHO_HE]),
+		   atomic_read(&failed[SMB2_ECHO_HE]));
+	seq_printf(m, "\nQueryDirectories: %d sent %d failed",
+		   atomic_read(&sent[SMB2_QUERY_DIRECTORY_HE]),
+		   atomic_read(&failed[SMB2_QUERY_DIRECTORY_HE]));
+	seq_printf(m, "\nChangeNotifies: %d sent %d failed",
+		   atomic_read(&sent[SMB2_CHANGE_NOTIFY_HE]),
+		   atomic_read(&failed[SMB2_CHANGE_NOTIFY_HE]));
+	seq_printf(m, "\nQueryInfos: %d sent %d failed",
+		   atomic_read(&sent[SMB2_QUERY_INFO_HE]),
+		   atomic_read(&failed[SMB2_QUERY_INFO_HE]));
+	seq_printf(m, "\nSetInfos: %d sent %d failed",
+		   atomic_read(&sent[SMB2_SET_INFO_HE]),
+		   atomic_read(&failed[SMB2_SET_INFO_HE]));
+	seq_printf(m, "\nOplockBreaks: %d sent %d failed",
+		   atomic_read(&sent[SMB2_OPLOCK_BREAK_HE]),
+		   atomic_read(&failed[SMB2_OPLOCK_BREAK_HE]));
+#endif
+}
+
+static void
+smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
+{
+	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+
+	cfile->fid.persistent_fid = fid->persistent_fid;
+	cfile->fid.volatile_fid = fid->volatile_fid;
+	server->ops->set_oplock_level(cinode, oplock, fid->epoch,
+				      &fid->purge_cache);
+	cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
+}
+
+static void
+smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon,
+		struct cifs_fid *fid)
+{
+	SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+}
+
+static int
+SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid,
+		     struct copychunk_ioctl *pcchunk)
+{
+	int rc;
+	unsigned int ret_data_len;
+	struct resume_key_req *res_key;
+
+	rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+			FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
+			NULL, 0 /* no input */,
+			(char **)&res_key, &ret_data_len);
+
+	if (rc) {
+		cifs_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
+		goto req_res_key_exit;
+	}
+	if (ret_data_len < sizeof(struct resume_key_req)) {
+		cifs_dbg(VFS, "Invalid refcopy resume key length\n");
+		rc = -EINVAL;
+		goto req_res_key_exit;
+	}
+	memcpy(pcchunk->SourceKey, res_key->ResumeKey, COPY_CHUNK_RES_KEY_SIZE);
+
+req_res_key_exit:
+	kfree(res_key);
+	return rc;
+}
+
+static int
+smb2_clone_range(const unsigned int xid,
+			struct cifsFileInfo *srcfile,
+			struct cifsFileInfo *trgtfile, u64 src_off,
+			u64 len, u64 dest_off)
+{
+	int rc;
+	unsigned int ret_data_len;
+	struct copychunk_ioctl *pcchunk;
+	struct copychunk_ioctl_rsp *retbuf = NULL;
+	struct cifs_tcon *tcon;
+	int chunks_copied = 0;
+	bool chunk_sizes_updated = false;
+
+	pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+
+	if (pcchunk == NULL)
+		return -ENOMEM;
+
+	cifs_dbg(FYI, "in smb2_clone_range - about to call request res key\n");
+	/* Request a key from the server to identify the source of the copy */
+	rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink),
+				srcfile->fid.persistent_fid,
+				srcfile->fid.volatile_fid, pcchunk);
+
+	/* Note: request_res_key sets res_key null only if rc !=0 */
+	if (rc)
+		goto cchunk_out;
+
+	/* For now array only one chunk long, will make more flexible later */
+	pcchunk->ChunkCount = cpu_to_le32(1);
+	pcchunk->Reserved = 0;
+	pcchunk->Reserved2 = 0;
+
+	tcon = tlink_tcon(trgtfile->tlink);
+
+	while (len > 0) {
+		pcchunk->SourceOffset = cpu_to_le64(src_off);
+		pcchunk->TargetOffset = cpu_to_le64(dest_off);
+		pcchunk->Length =
+			cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
+
+		/* Request server copy to target from src identified by key */
+		rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
+			trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
+			true /* is_fsctl */, (char *)pcchunk,
+			sizeof(struct copychunk_ioctl),	(char **)&retbuf,
+			&ret_data_len);
+		if (rc == 0) {
+			if (ret_data_len !=
+					sizeof(struct copychunk_ioctl_rsp)) {
+				cifs_dbg(VFS, "invalid cchunk response size\n");
+				rc = -EIO;
+				goto cchunk_out;
+			}
+			if (retbuf->TotalBytesWritten == 0) {
+				cifs_dbg(FYI, "no bytes copied\n");
+				rc = -EIO;
+				goto cchunk_out;
+			}
+			/*
+			 * Check if server claimed to write more than we asked
+			 */
+			if (le32_to_cpu(retbuf->TotalBytesWritten) >
+			    le32_to_cpu(pcchunk->Length)) {
+				cifs_dbg(VFS, "invalid copy chunk response\n");
+				rc = -EIO;
+				goto cchunk_out;
+			}
+			if (le32_to_cpu(retbuf->ChunksWritten) != 1) {
+				cifs_dbg(VFS, "invalid num chunks written\n");
+				rc = -EIO;
+				goto cchunk_out;
+			}
+			chunks_copied++;
+
+			src_off += le32_to_cpu(retbuf->TotalBytesWritten);
+			dest_off += le32_to_cpu(retbuf->TotalBytesWritten);
+			len -= le32_to_cpu(retbuf->TotalBytesWritten);
+
+			cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n",
+				le32_to_cpu(retbuf->ChunksWritten),
+				le32_to_cpu(retbuf->ChunkBytesWritten),
+				le32_to_cpu(retbuf->TotalBytesWritten));
+		} else if (rc == -EINVAL) {
+			if (ret_data_len != sizeof(struct copychunk_ioctl_rsp))
+				goto cchunk_out;
+
+			cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n",
+				le32_to_cpu(retbuf->ChunksWritten),
+				le32_to_cpu(retbuf->ChunkBytesWritten),
+				le32_to_cpu(retbuf->TotalBytesWritten));
+
+			/*
+			 * Check if this is the first request using these sizes,
+			 * (ie check if copy succeed once with original sizes
+			 * and check if the server gave us different sizes after
+			 * we already updated max sizes on previous request).
+			 * if not then why is the server returning an error now
+			 */
+			if ((chunks_copied != 0) || chunk_sizes_updated)
+				goto cchunk_out;
+
+			/* Check that server is not asking us to grow size */
+			if (le32_to_cpu(retbuf->ChunkBytesWritten) <
+					tcon->max_bytes_chunk)
+				tcon->max_bytes_chunk =
+					le32_to_cpu(retbuf->ChunkBytesWritten);
+			else
+				goto cchunk_out; /* server gave us bogus size */
+
+			/* No need to change MaxChunks since already set to 1 */
+			chunk_sizes_updated = true;
+		} else
+			goto cchunk_out;
+	}
+
+cchunk_out:
+	kfree(pcchunk);
+	return rc;
+}
+
+static int
+smb2_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
+		struct cifs_fid *fid)
+{
+	return SMB2_flush(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+}
+
+static unsigned int
+smb2_read_data_offset(char *buf)
+{
+	struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
+	return rsp->DataOffset;
+}
+
+static unsigned int
+smb2_read_data_length(char *buf)
+{
+	struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
+	return le32_to_cpu(rsp->DataLength);
+}
+
+
+static int
+smb2_sync_read(const unsigned int xid, struct cifs_fid *pfid,
+	       struct cifs_io_parms *parms, unsigned int *bytes_read,
+	       char **buf, int *buf_type)
+{
+	parms->persistent_fid = pfid->persistent_fid;
+	parms->volatile_fid = pfid->volatile_fid;
+	return SMB2_read(xid, parms, bytes_read, buf, buf_type);
+}
+
+static int
+smb2_sync_write(const unsigned int xid, struct cifs_fid *pfid,
+		struct cifs_io_parms *parms, unsigned int *written,
+		struct kvec *iov, unsigned long nr_segs)
+{
+
+	parms->persistent_fid = pfid->persistent_fid;
+	parms->volatile_fid = pfid->volatile_fid;
+	return SMB2_write(xid, parms, written, iov, nr_segs);
+}
+
+/* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */
+static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
+		struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse)
+{
+	struct cifsInodeInfo *cifsi;
+	int rc;
+
+	cifsi = CIFS_I(inode);
+
+	/* if file already sparse don't bother setting sparse again */
+	if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse)
+		return true; /* already sparse */
+
+	if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse)
+		return true; /* already not sparse */
+
+	/*
+	 * Can't check for sparse support on share the usual way via the
+	 * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share
+	 * since Samba server doesn't set the flag on the share, yet
+	 * supports the set sparse FSCTL and returns sparse correctly
+	 * in the file attributes. If we fail setting sparse though we
+	 * mark that server does not support sparse files for this share
+	 * to avoid repeatedly sending the unsupported fsctl to server
+	 * if the file is repeatedly extended.
+	 */
+	if (tcon->broken_sparse_sup)
+		return false;
+
+	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+			cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
+			true /* is_fctl */, &setsparse, 1, NULL, NULL);
+	if (rc) {
+		tcon->broken_sparse_sup = true;
+		cifs_dbg(FYI, "set sparse rc = %d\n", rc);
+		return false;
+	}
+
+	if (setsparse)
+		cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE;
+	else
+		cifsi->cifsAttrs &= (~FILE_ATTRIBUTE_SPARSE_FILE);
+
+	return true;
+}
+
+static int
+smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
+{
+	__le64 eof = cpu_to_le64(size);
+	struct inode *inode;
+
+	/*
+	 * If extending file more than one page make sparse. Many Linux fs
+	 * make files sparse by default when extending via ftruncate
+	 */
+	inode = d_inode(cfile->dentry);
+
+	if (!set_alloc && (size > inode->i_size + 8192)) {
+		__u8 set_sparse = 1;
+
+		/* whether set sparse succeeds or not, extend the file */
+		smb2_set_sparse(xid, tcon, cfile, inode, set_sparse);
+	}
+
+	return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+			    cfile->fid.volatile_fid, cfile->pid, &eof, false);
+}
+
+static int
+smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile)
+{
+	return SMB2_set_compression(xid, tcon, cfile->fid.persistent_fid,
+			    cfile->fid.volatile_fid);
+}
+
+static int
+smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
+		     const char *path, struct cifs_sb_info *cifs_sb,
+		     struct cifs_fid *fid, __u16 search_flags,
+		     struct cifs_search_info *srch_inf)
+{
+	__le16 *utf16_path;
+	int rc;
+	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+
+	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+	if (!utf16_path)
+		return -ENOMEM;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+	kfree(utf16_path);
+	if (rc) {
+		cifs_dbg(VFS, "open dir failed\n");
+		return rc;
+	}
+
+	srch_inf->entries_in_buffer = 0;
+	srch_inf->index_of_last_entry = 0;
+
+	rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
+				  fid->volatile_fid, 0, srch_inf);
+	if (rc) {
+		cifs_dbg(VFS, "query directory failed\n");
+		SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+	}
+	return rc;
+}
+
+static int
+smb2_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon,
+		    struct cifs_fid *fid, __u16 search_flags,
+		    struct cifs_search_info *srch_inf)
+{
+	return SMB2_query_directory(xid, tcon, fid->persistent_fid,
+				    fid->volatile_fid, 0, srch_inf);
+}
+
+static int
+smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
+	       struct cifs_fid *fid)
+{
+	return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+}
+
+/*
+* If we negotiate SMB2 protocol and get STATUS_PENDING - update
+* the number of credits and return true. Otherwise - return false.
+*/
+static bool
+smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
+{
+	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+
+	if (hdr->Status != STATUS_PENDING)
+		return false;
+
+	if (!length) {
+		spin_lock(&server->req_lock);
+		server->credits += le16_to_cpu(hdr->CreditRequest);
+		spin_unlock(&server->req_lock);
+		wake_up(&server->request_q);
+	}
+
+	return true;
+}
+
+static int
+smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
+		     struct cifsInodeInfo *cinode)
+{
+	if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
+		return SMB2_lease_break(0, tcon, cinode->lease_key,
+					smb2_get_lease_state(cinode));
+
+	return SMB2_oplock_break(0, tcon, fid->persistent_fid,
+				 fid->volatile_fid,
+				 CIFS_CACHE_READ(cinode) ? 1 : 0);
+}
+
+static int
+smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
+	     struct kstatfs *buf)
+{
+	int rc;
+	__le16 srch_path = 0; /* Null - open root of share */
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	if (rc)
+		return rc;
+	buf->f_type = SMB2_MAGIC_NUMBER;
+	rc = SMB2_QFS_info(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			   buf);
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	return rc;
+}
+
+static bool
+smb2_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2)
+{
+	return ob1->fid.persistent_fid == ob2->fid.persistent_fid &&
+	       ob1->fid.volatile_fid == ob2->fid.volatile_fid;
+}
+
+static int
+smb2_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
+	       __u64 length, __u32 type, int lock, int unlock, bool wait)
+{
+	if (unlock && !lock)
+		type = SMB2_LOCKFLAG_UNLOCK;
+	return SMB2_lock(xid, tlink_tcon(cfile->tlink),
+			 cfile->fid.persistent_fid, cfile->fid.volatile_fid,
+			 current->tgid, length, offset, type, wait);
+}
+
+static void
+smb2_get_lease_key(struct inode *inode, struct cifs_fid *fid)
+{
+	memcpy(fid->lease_key, CIFS_I(inode)->lease_key, SMB2_LEASE_KEY_SIZE);
+}
+
+static void
+smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid)
+{
+	memcpy(CIFS_I(inode)->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE);
+}
+
+static void
+smb2_new_lease_key(struct cifs_fid *fid)
+{
+	get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE);
+}
+
+static int
+smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+		   const char *full_path, char **target_path,
+		   struct cifs_sb_info *cifs_sb)
+{
+	int rc;
+	__le16 *utf16_path;
+	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+	struct smb2_err_rsp *err_buf = NULL;
+	struct smb2_symlink_err_rsp *symlink;
+	unsigned int sub_len, sub_offset;
+
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+	if (!utf16_path)
+		return -ENOMEM;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf);
+
+	if (!rc || !err_buf) {
+		kfree(utf16_path);
+		return -ENOENT;
+	}
+	/* open must fail on symlink - reset rc */
+	rc = 0;
+	symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
+	sub_len = le16_to_cpu(symlink->SubstituteNameLength);
+	sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
+	*target_path = cifs_strndup_from_utf16(
+				(char *)symlink->PathBuffer + sub_offset,
+				sub_len, true, cifs_sb->local_nls);
+	if (!(*target_path)) {
+		kfree(utf16_path);
+		return -ENOMEM;
+	}
+	convert_delimiter(*target_path, '/');
+	cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+	kfree(utf16_path);
+	return rc;
+}
+
+static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
+			    loff_t offset, loff_t len, bool keep_size)
+{
+	struct inode *inode;
+	struct cifsInodeInfo *cifsi;
+	struct cifsFileInfo *cfile = file->private_data;
+	struct file_zero_data_information fsctl_buf;
+	long rc;
+	unsigned int xid;
+
+	xid = get_xid();
+
+	inode = d_inode(cfile->dentry);
+	cifsi = CIFS_I(inode);
+
+	/* if file not oplocked can't be sure whether asking to extend size */
+	if (!CIFS_CACHE_READ(cifsi))
+		if (keep_size == false)
+			return -EOPNOTSUPP;
+
+	/*
+	 * Must check if file sparse since fallocate -z (zero range) assumes
+	 * non-sparse allocation
+	 */
+	if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE))
+		return -EOPNOTSUPP;
+
+	/*
+	 * need to make sure we are not asked to extend the file since the SMB3
+	 * fsctl does not change the file size. In the future we could change
+	 * this to zero the first part of the range then set the file size
+	 * which for a non sparse file would zero the newly extended range
+	 */
+	if (keep_size == false)
+		if (i_size_read(inode) < offset + len)
+			return -EOPNOTSUPP;
+
+	cifs_dbg(FYI, "offset %lld len %lld", offset, len);
+
+	fsctl_buf.FileOffset = cpu_to_le64(offset);
+	fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
+
+	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+			cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+			true /* is_fctl */, (char *)&fsctl_buf,
+			sizeof(struct file_zero_data_information), NULL, NULL);
+	free_xid(xid);
+	return rc;
+}
+
+static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
+			    loff_t offset, loff_t len)
+{
+	struct inode *inode;
+	struct cifsInodeInfo *cifsi;
+	struct cifsFileInfo *cfile = file->private_data;
+	struct file_zero_data_information fsctl_buf;
+	long rc;
+	unsigned int xid;
+	__u8 set_sparse = 1;
+
+	xid = get_xid();
+
+	inode = d_inode(cfile->dentry);
+	cifsi = CIFS_I(inode);
+
+	/* Need to make file sparse, if not already, before freeing range. */
+	/* Consider adding equivalent for compressed since it could also work */
+	if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse))
+		return -EOPNOTSUPP;
+
+	cifs_dbg(FYI, "offset %lld len %lld", offset, len);
+
+	fsctl_buf.FileOffset = cpu_to_le64(offset);
+	fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
+
+	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+			cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+			true /* is_fctl */, (char *)&fsctl_buf,
+			sizeof(struct file_zero_data_information), NULL, NULL);
+	free_xid(xid);
+	return rc;
+}
+
+static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
+			    loff_t off, loff_t len, bool keep_size)
+{
+	struct inode *inode;
+	struct cifsInodeInfo *cifsi;
+	struct cifsFileInfo *cfile = file->private_data;
+	long rc = -EOPNOTSUPP;
+	unsigned int xid;
+
+	xid = get_xid();
+
+	inode = d_inode(cfile->dentry);
+	cifsi = CIFS_I(inode);
+
+	/* if file not oplocked can't be sure whether asking to extend size */
+	if (!CIFS_CACHE_READ(cifsi))
+		if (keep_size == false)
+			return -EOPNOTSUPP;
+
+	/*
+	 * Files are non-sparse by default so falloc may be a no-op
+	 * Must check if file sparse. If not sparse, and not extending
+	 * then no need to do anything since file already allocated
+	 */
+	if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) {
+		if (keep_size == true)
+			return 0;
+		/* check if extending file */
+		else if (i_size_read(inode) >= off + len)
+			/* not extending file and already not sparse */
+			return 0;
+		/* BB: in future add else clause to extend file */
+		else
+			return -EOPNOTSUPP;
+	}
+
+	if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
+		/*
+		 * Check if falloc starts within first few pages of file
+		 * and ends within a few pages of the end of file to
+		 * ensure that most of file is being forced to be
+		 * fallocated now. If so then setting whole file sparse
+		 * ie potentially making a few extra pages at the beginning
+		 * or end of the file non-sparse via set_sparse is harmless.
+		 */
+		if ((off > 8192) || (off + len + 8192 < i_size_read(inode)))
+			return -EOPNOTSUPP;
+
+		rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
+	}
+	/* BB: else ... in future add code to extend file and set sparse */
+
+
+	free_xid(xid);
+	return rc;
+}
+
+
+static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
+			   loff_t off, loff_t len)
+{
+	/* KEEP_SIZE already checked for by do_fallocate */
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return smb3_punch_hole(file, tcon, off, len);
+	else if (mode & FALLOC_FL_ZERO_RANGE) {
+		if (mode & FALLOC_FL_KEEP_SIZE)
+			return smb3_zero_range(file, tcon, off, len, true);
+		return smb3_zero_range(file, tcon, off, len, false);
+	} else if (mode == FALLOC_FL_KEEP_SIZE)
+		return smb3_simple_falloc(file, tcon, off, len, true);
+	else if (mode == 0)
+		return smb3_simple_falloc(file, tcon, off, len, false);
+
+	return -EOPNOTSUPP;
+}
+
+static void
+smb2_downgrade_oplock(struct TCP_Server_Info *server,
+			struct cifsInodeInfo *cinode, bool set_level2)
+{
+	if (set_level2)
+		server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
+						0, NULL);
+	else
+		server->ops->set_oplock_level(cinode, 0, 0, NULL);
+}
+
+static void
+smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+		      unsigned int epoch, bool *purge_cache)
+{
+	oplock &= 0xFF;
+	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
+		return;
+	if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+		cinode->oplock = CIFS_CACHE_RHW_FLG;
+		cifs_dbg(FYI, "Batch Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
+	} else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+		cinode->oplock = CIFS_CACHE_RW_FLG;
+		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
+	} else if (oplock == SMB2_OPLOCK_LEVEL_II) {
+		cinode->oplock = CIFS_CACHE_READ_FLG;
+		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
+	} else
+		cinode->oplock = 0;
+}
+
+static void
+smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+		       unsigned int epoch, bool *purge_cache)
+{
+	char message[5] = {0};
+
+	oplock &= 0xFF;
+	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
+		return;
+
+	cinode->oplock = 0;
+	if (oplock & SMB2_LEASE_READ_CACHING_HE) {
+		cinode->oplock |= CIFS_CACHE_READ_FLG;
+		strcat(message, "R");
+	}
+	if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) {
+		cinode->oplock |= CIFS_CACHE_HANDLE_FLG;
+		strcat(message, "H");
+	}
+	if (oplock & SMB2_LEASE_WRITE_CACHING_HE) {
+		cinode->oplock |= CIFS_CACHE_WRITE_FLG;
+		strcat(message, "W");
+	}
+	if (!cinode->oplock)
+		strcat(message, "None");
+	cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
+		 &cinode->vfs_inode);
+}
+
+static void
+smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+		      unsigned int epoch, bool *purge_cache)
+{
+	unsigned int old_oplock = cinode->oplock;
+
+	smb21_set_oplock_level(cinode, oplock, epoch, purge_cache);
+
+	if (purge_cache) {
+		*purge_cache = false;
+		if (old_oplock == CIFS_CACHE_READ_FLG) {
+			if (cinode->oplock == CIFS_CACHE_READ_FLG &&
+			    (epoch - cinode->epoch > 0))
+				*purge_cache = true;
+			else if (cinode->oplock == CIFS_CACHE_RH_FLG &&
+				 (epoch - cinode->epoch > 1))
+				*purge_cache = true;
+			else if (cinode->oplock == CIFS_CACHE_RHW_FLG &&
+				 (epoch - cinode->epoch > 1))
+				*purge_cache = true;
+			else if (cinode->oplock == 0 &&
+				 (epoch - cinode->epoch > 0))
+				*purge_cache = true;
+		} else if (old_oplock == CIFS_CACHE_RH_FLG) {
+			if (cinode->oplock == CIFS_CACHE_RH_FLG &&
+			    (epoch - cinode->epoch > 0))
+				*purge_cache = true;
+			else if (cinode->oplock == CIFS_CACHE_RHW_FLG &&
+				 (epoch - cinode->epoch > 1))
+				*purge_cache = true;
+		}
+		cinode->epoch = epoch;
+	}
+}
+
+static bool
+smb2_is_read_op(__u32 oplock)
+{
+	return oplock == SMB2_OPLOCK_LEVEL_II;
+}
+
+static bool
+smb21_is_read_op(__u32 oplock)
+{
+	return (oplock & SMB2_LEASE_READ_CACHING_HE) &&
+	       !(oplock & SMB2_LEASE_WRITE_CACHING_HE);
+}
+
+static __le32
+map_oplock_to_lease(u8 oplock)
+{
+	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
+		return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING;
+	else if (oplock == SMB2_OPLOCK_LEVEL_II)
+		return SMB2_LEASE_READ_CACHING;
+	else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
+		return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING |
+		       SMB2_LEASE_WRITE_CACHING;
+	return 0;
+}
+
+static char *
+smb2_create_lease_buf(u8 *lease_key, u8 oplock)
+{
+	struct create_lease *buf;
+
+	buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
+	buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
+	buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_lease, lcontext));
+	buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_lease, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	/* SMB2_CREATE_REQUEST_LEASE is "RqLs" */
+	buf->Name[0] = 'R';
+	buf->Name[1] = 'q';
+	buf->Name[2] = 'L';
+	buf->Name[3] = 's';
+	return (char *)buf;
+}
+
+static char *
+smb3_create_lease_buf(u8 *lease_key, u8 oplock)
+{
+	struct create_lease_v2 *buf;
+
+	buf = kzalloc(sizeof(struct create_lease_v2), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
+	buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
+	buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_lease_v2, lcontext));
+	buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2));
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_lease_v2, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	/* SMB2_CREATE_REQUEST_LEASE is "RqLs" */
+	buf->Name[0] = 'R';
+	buf->Name[1] = 'q';
+	buf->Name[2] = 'L';
+	buf->Name[3] = 's';
+	return (char *)buf;
+}
+
+static __u8
+smb2_parse_lease_buf(void *buf, unsigned int *epoch)
+{
+	struct create_lease *lc = (struct create_lease *)buf;
+
+	*epoch = 0; /* not used */
+	if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+		return SMB2_OPLOCK_LEVEL_NOCHANGE;
+	return le32_to_cpu(lc->lcontext.LeaseState);
+}
+
+static __u8
+smb3_parse_lease_buf(void *buf, unsigned int *epoch)
+{
+	struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
+
+	*epoch = le16_to_cpu(lc->lcontext.Epoch);
+	if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+		return SMB2_OPLOCK_LEVEL_NOCHANGE;
+	return le32_to_cpu(lc->lcontext.LeaseState);
+}
+
+static unsigned int
+smb2_wp_retry_size(struct inode *inode)
+{
+	return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize,
+		     SMB2_MAX_BUFFER_SIZE);
+}
+
+static bool
+smb2_dir_needs_close(struct cifsFileInfo *cfile)
+{
+	return !cfile->invalidHandle;
+}
+
+struct smb_version_operations smb20_operations = {
+	.compare_fids = smb2_compare_fids,
+	.setup_request = smb2_setup_request,
+	.setup_async_request = smb2_setup_async_request,
+	.check_receive = smb2_check_receive,
+	.add_credits = smb2_add_credits,
+	.set_credits = smb2_set_credits,
+	.get_credits_field = smb2_get_credits_field,
+	.get_credits = smb2_get_credits,
+	.wait_mtu_credits = cifs_wait_mtu_credits,
+	.get_next_mid = smb2_get_next_mid,
+	.read_data_offset = smb2_read_data_offset,
+	.read_data_length = smb2_read_data_length,
+	.map_error = map_smb2_to_linux_error,
+	.find_mid = smb2_find_mid,
+	.check_message = smb2_check_message,
+	.dump_detail = smb2_dump_detail,
+	.clear_stats = smb2_clear_stats,
+	.print_stats = smb2_print_stats,
+	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
+	.need_neg = smb2_need_neg,
+	.negotiate = smb2_negotiate,
+	.negotiate_wsize = smb2_negotiate_wsize,
+	.negotiate_rsize = smb2_negotiate_rsize,
+	.sess_setup = SMB2_sess_setup,
+	.logoff = SMB2_logoff,
+	.tree_connect = SMB2_tcon,
+	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb2_qfs_tcon,
+	.is_path_accessible = smb2_is_path_accessible,
+	.can_echo = smb2_can_echo,
+	.echo = SMB2_echo,
+	.query_path_info = smb2_query_path_info,
+	.get_srv_inum = smb2_get_srv_inum,
+	.query_file_info = smb2_query_file_info,
+	.set_path_size = smb2_set_path_size,
+	.set_file_size = smb2_set_file_size,
+	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
+	.mkdir = smb2_mkdir,
+	.mkdir_setinfo = smb2_mkdir_setinfo,
+	.rmdir = smb2_rmdir,
+	.unlink = smb2_unlink,
+	.rename = smb2_rename_path,
+	.create_hardlink = smb2_create_hardlink,
+	.query_symlink = smb2_query_symlink,
+	.open = smb2_open_file,
+	.set_fid = smb2_set_fid,
+	.close = smb2_close_file,
+	.flush = smb2_flush_file,
+	.async_readv = smb2_async_readv,
+	.async_writev = smb2_async_writev,
+	.sync_read = smb2_sync_read,
+	.sync_write = smb2_sync_write,
+	.query_dir_first = smb2_query_dir_first,
+	.query_dir_next = smb2_query_dir_next,
+	.close_dir = smb2_close_dir,
+	.calc_smb_size = smb2_calc_size,
+	.is_status_pending = smb2_is_status_pending,
+	.oplock_response = smb2_oplock_response,
+	.queryfs = smb2_queryfs,
+	.mand_lock = smb2_mand_lock,
+	.mand_unlock_range = smb2_unlock_range,
+	.push_mand_locks = smb2_push_mandatory_locks,
+	.get_lease_key = smb2_get_lease_key,
+	.set_lease_key = smb2_set_lease_key,
+	.new_lease_key = smb2_new_lease_key,
+	.calc_signature = smb2_calc_signature,
+	.is_read_op = smb2_is_read_op,
+	.set_oplock_level = smb2_set_oplock_level,
+	.create_lease_buf = smb2_create_lease_buf,
+	.parse_lease_buf = smb2_parse_lease_buf,
+	.clone_range = smb2_clone_range,
+	.wp_retry_size = smb2_wp_retry_size,
+	.dir_needs_close = smb2_dir_needs_close,
+};
+
+struct smb_version_operations smb21_operations = {
+	.compare_fids = smb2_compare_fids,
+	.setup_request = smb2_setup_request,
+	.setup_async_request = smb2_setup_async_request,
+	.check_receive = smb2_check_receive,
+	.add_credits = smb2_add_credits,
+	.set_credits = smb2_set_credits,
+	.get_credits_field = smb2_get_credits_field,
+	.get_credits = smb2_get_credits,
+	.wait_mtu_credits = smb2_wait_mtu_credits,
+	.get_next_mid = smb2_get_next_mid,
+	.read_data_offset = smb2_read_data_offset,
+	.read_data_length = smb2_read_data_length,
+	.map_error = map_smb2_to_linux_error,
+	.find_mid = smb2_find_mid,
+	.check_message = smb2_check_message,
+	.dump_detail = smb2_dump_detail,
+	.clear_stats = smb2_clear_stats,
+	.print_stats = smb2_print_stats,
+	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
+	.need_neg = smb2_need_neg,
+	.negotiate = smb2_negotiate,
+	.negotiate_wsize = smb2_negotiate_wsize,
+	.negotiate_rsize = smb2_negotiate_rsize,
+	.sess_setup = SMB2_sess_setup,
+	.logoff = SMB2_logoff,
+	.tree_connect = SMB2_tcon,
+	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb2_qfs_tcon,
+	.is_path_accessible = smb2_is_path_accessible,
+	.can_echo = smb2_can_echo,
+	.echo = SMB2_echo,
+	.query_path_info = smb2_query_path_info,
+	.get_srv_inum = smb2_get_srv_inum,
+	.query_file_info = smb2_query_file_info,
+	.set_path_size = smb2_set_path_size,
+	.set_file_size = smb2_set_file_size,
+	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
+	.mkdir = smb2_mkdir,
+	.mkdir_setinfo = smb2_mkdir_setinfo,
+	.rmdir = smb2_rmdir,
+	.unlink = smb2_unlink,
+	.rename = smb2_rename_path,
+	.create_hardlink = smb2_create_hardlink,
+	.query_symlink = smb2_query_symlink,
+	.query_mf_symlink = smb3_query_mf_symlink,
+	.create_mf_symlink = smb3_create_mf_symlink,
+	.open = smb2_open_file,
+	.set_fid = smb2_set_fid,
+	.close = smb2_close_file,
+	.flush = smb2_flush_file,
+	.async_readv = smb2_async_readv,
+	.async_writev = smb2_async_writev,
+	.sync_read = smb2_sync_read,
+	.sync_write = smb2_sync_write,
+	.query_dir_first = smb2_query_dir_first,
+	.query_dir_next = smb2_query_dir_next,
+	.close_dir = smb2_close_dir,
+	.calc_smb_size = smb2_calc_size,
+	.is_status_pending = smb2_is_status_pending,
+	.oplock_response = smb2_oplock_response,
+	.queryfs = smb2_queryfs,
+	.mand_lock = smb2_mand_lock,
+	.mand_unlock_range = smb2_unlock_range,
+	.push_mand_locks = smb2_push_mandatory_locks,
+	.get_lease_key = smb2_get_lease_key,
+	.set_lease_key = smb2_set_lease_key,
+	.new_lease_key = smb2_new_lease_key,
+	.calc_signature = smb2_calc_signature,
+	.is_read_op = smb21_is_read_op,
+	.set_oplock_level = smb21_set_oplock_level,
+	.create_lease_buf = smb2_create_lease_buf,
+	.parse_lease_buf = smb2_parse_lease_buf,
+	.clone_range = smb2_clone_range,
+	.wp_retry_size = smb2_wp_retry_size,
+	.dir_needs_close = smb2_dir_needs_close,
+};
+
+struct smb_version_operations smb30_operations = {
+	.compare_fids = smb2_compare_fids,
+	.setup_request = smb2_setup_request,
+	.setup_async_request = smb2_setup_async_request,
+	.check_receive = smb2_check_receive,
+	.add_credits = smb2_add_credits,
+	.set_credits = smb2_set_credits,
+	.get_credits_field = smb2_get_credits_field,
+	.get_credits = smb2_get_credits,
+	.wait_mtu_credits = smb2_wait_mtu_credits,
+	.get_next_mid = smb2_get_next_mid,
+	.read_data_offset = smb2_read_data_offset,
+	.read_data_length = smb2_read_data_length,
+	.map_error = map_smb2_to_linux_error,
+	.find_mid = smb2_find_mid,
+	.check_message = smb2_check_message,
+	.dump_detail = smb2_dump_detail,
+	.clear_stats = smb2_clear_stats,
+	.print_stats = smb2_print_stats,
+	.dump_share_caps = smb2_dump_share_caps,
+	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
+	.need_neg = smb2_need_neg,
+	.negotiate = smb2_negotiate,
+	.negotiate_wsize = smb2_negotiate_wsize,
+	.negotiate_rsize = smb2_negotiate_rsize,
+	.sess_setup = SMB2_sess_setup,
+	.logoff = SMB2_logoff,
+	.tree_connect = SMB2_tcon,
+	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb3_qfs_tcon,
+	.is_path_accessible = smb2_is_path_accessible,
+	.can_echo = smb2_can_echo,
+	.echo = SMB2_echo,
+	.query_path_info = smb2_query_path_info,
+	.get_srv_inum = smb2_get_srv_inum,
+	.query_file_info = smb2_query_file_info,
+	.set_path_size = smb2_set_path_size,
+	.set_file_size = smb2_set_file_size,
+	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
+	.mkdir = smb2_mkdir,
+	.mkdir_setinfo = smb2_mkdir_setinfo,
+	.rmdir = smb2_rmdir,
+	.unlink = smb2_unlink,
+	.rename = smb2_rename_path,
+	.create_hardlink = smb2_create_hardlink,
+	.query_symlink = smb2_query_symlink,
+	.query_mf_symlink = smb3_query_mf_symlink,
+	.create_mf_symlink = smb3_create_mf_symlink,
+	.open = smb2_open_file,
+	.set_fid = smb2_set_fid,
+	.close = smb2_close_file,
+	.flush = smb2_flush_file,
+	.async_readv = smb2_async_readv,
+	.async_writev = smb2_async_writev,
+	.sync_read = smb2_sync_read,
+	.sync_write = smb2_sync_write,
+	.query_dir_first = smb2_query_dir_first,
+	.query_dir_next = smb2_query_dir_next,
+	.close_dir = smb2_close_dir,
+	.calc_smb_size = smb2_calc_size,
+	.is_status_pending = smb2_is_status_pending,
+	.oplock_response = smb2_oplock_response,
+	.queryfs = smb2_queryfs,
+	.mand_lock = smb2_mand_lock,
+	.mand_unlock_range = smb2_unlock_range,
+	.push_mand_locks = smb2_push_mandatory_locks,
+	.get_lease_key = smb2_get_lease_key,
+	.set_lease_key = smb2_set_lease_key,
+	.new_lease_key = smb2_new_lease_key,
+	.generate_signingkey = generate_smb3signingkey,
+	.calc_signature = smb3_calc_signature,
+	.is_read_op = smb21_is_read_op,
+	.set_oplock_level = smb3_set_oplock_level,
+	.create_lease_buf = smb3_create_lease_buf,
+	.parse_lease_buf = smb3_parse_lease_buf,
+	.clone_range = smb2_clone_range,
+	.validate_negotiate = smb3_validate_negotiate,
+	.wp_retry_size = smb2_wp_retry_size,
+	.dir_needs_close = smb2_dir_needs_close,
+	.fallocate = smb3_fallocate,
+};
+
+struct smb_version_values smb20_values = {
+	.version_string = SMB20_VERSION_STRING,
+	.protocol_id = SMB20_PROT_ID,
+	.req_capabilities = 0, /* MBZ */
+	.large_lock_type = 0,
+	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+	.header_size = sizeof(struct smb2_hdr),
+	.max_header_size = MAX_SMB2_HDR_SIZE,
+	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.lock_cmd = SMB2_LOCK,
+	.cap_unix = 0,
+	.cap_nt_find = SMB2_NT_FIND,
+	.cap_large_files = SMB2_LARGE_FILES,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease),
+};
+
+struct smb_version_values smb21_values = {
+	.version_string = SMB21_VERSION_STRING,
+	.protocol_id = SMB21_PROT_ID,
+	.req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */
+	.large_lock_type = 0,
+	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+	.header_size = sizeof(struct smb2_hdr),
+	.max_header_size = MAX_SMB2_HDR_SIZE,
+	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.lock_cmd = SMB2_LOCK,
+	.cap_unix = 0,
+	.cap_nt_find = SMB2_NT_FIND,
+	.cap_large_files = SMB2_LARGE_FILES,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease),
+};
+
+struct smb_version_values smb30_values = {
+	.version_string = SMB30_VERSION_STRING,
+	.protocol_id = SMB30_PROT_ID,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.large_lock_type = 0,
+	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+	.header_size = sizeof(struct smb2_hdr),
+	.max_header_size = MAX_SMB2_HDR_SIZE,
+	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.lock_cmd = SMB2_LOCK,
+	.cap_unix = 0,
+	.cap_nt_find = SMB2_NT_FIND,
+	.cap_large_files = SMB2_LARGE_FILES,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease_v2),
+};
+
+struct smb_version_values smb302_values = {
+	.version_string = SMB302_VERSION_STRING,
+	.protocol_id = SMB302_PROT_ID,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.large_lock_type = 0,
+	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+	.header_size = sizeof(struct smb2_hdr),
+	.max_header_size = MAX_SMB2_HDR_SIZE,
+	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.lock_cmd = SMB2_LOCK,
+	.cap_unix = 0,
+	.cap_nt_find = SMB2_NT_FIND,
+	.cap_large_files = SMB2_LARGE_FILES,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease_v2),
+};
diff --git a/kernel/fs/cifs/smb2pdu.c b/kernel/fs/cifs/smb2pdu.c
new file mode 100644
index 000000000..54cbe19d9
--- /dev/null
+++ b/kernel/fs/cifs/smb2pdu.c
@@ -0,0 +1,2661 @@
+/*
+ *   fs/cifs/smb2pdu.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2009, 2013
+ *                 Etersoft, 2012
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ *   Contains the routines for constructing the SMB2 PDUs themselves
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+ /* SMB2 PDU handling routines here - except for leftovers (eg session setup) */
+ /* Note that there are handle based routines which must be		      */
+ /* treated slightly differently for reconnection purposes since we never     */
+ /* want to reuse a stale file handle and only the caller knows the file info */
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/vfs.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/xattr.h>
+#include "smb2pdu.h"
+#include "cifsglob.h"
+#include "cifsacl.h"
+#include "cifsproto.h"
+#include "smb2proto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "ntlmssp.h"
+#include "smb2status.h"
+#include "smb2glob.h"
+#include "cifspdu.h"
+
+/*
+ *  The following table defines the expected "StructureSize" of SMB2 requests
+ *  in order by SMB2 command.  This is similar to "wct" in SMB/CIFS requests.
+ *
+ *  Note that commands are defined in smb2pdu.h in le16 but the array below is
+ *  indexed by command in host byte order.
+ */
+static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
+	/* SMB2_NEGOTIATE */ 36,
+	/* SMB2_SESSION_SETUP */ 25,
+	/* SMB2_LOGOFF */ 4,
+	/* SMB2_TREE_CONNECT */	9,
+	/* SMB2_TREE_DISCONNECT */ 4,
+	/* SMB2_CREATE */ 57,
+	/* SMB2_CLOSE */ 24,
+	/* SMB2_FLUSH */ 24,
+	/* SMB2_READ */	49,
+	/* SMB2_WRITE */ 49,
+	/* SMB2_LOCK */	48,
+	/* SMB2_IOCTL */ 57,
+	/* SMB2_CANCEL */ 4,
+	/* SMB2_ECHO */ 4,
+	/* SMB2_QUERY_DIRECTORY */ 33,
+	/* SMB2_CHANGE_NOTIFY */ 32,
+	/* SMB2_QUERY_INFO */ 41,
+	/* SMB2_SET_INFO */ 33,
+	/* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */
+};
+
+
+static void
+smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
+		  const struct cifs_tcon *tcon)
+{
+	struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
+	char *temp = (char *)hdr;
+	/* lookup word count ie StructureSize from table */
+	__u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_cmd)];
+
+	/*
+	 * smaller than SMALL_BUFFER_SIZE but bigger than fixed area of
+	 * largest operations (Create)
+	 */
+	memset(temp, 0, 256);
+
+	/* Note this is only network field converted to big endian */
+	hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr)
+			- 4 /*  RFC 1001 length field itself not counted */);
+
+	hdr->ProtocolId[0] = 0xFE;
+	hdr->ProtocolId[1] = 'S';
+	hdr->ProtocolId[2] = 'M';
+	hdr->ProtocolId[3] = 'B';
+	hdr->StructureSize = cpu_to_le16(64);
+	hdr->Command = smb2_cmd;
+	hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */
+	hdr->ProcessId = cpu_to_le32((__u16)current->tgid);
+
+	if (!tcon)
+		goto out;
+
+	/* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
+	/* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
+	if ((tcon->ses) && (tcon->ses->server) &&
+	    (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+		hdr->CreditCharge = cpu_to_le16(1);
+	/* else CreditCharge MBZ */
+
+	hdr->TreeId = tcon->tid;
+	/* Uid is not converted */
+	if (tcon->ses)
+		hdr->SessionId = tcon->ses->Suid;
+
+	/*
+	 * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
+	 * to pass the path on the Open SMB prefixed by \\server\share.
+	 * Not sure when we would need to do the augmented path (if ever) and
+	 * setting this flag breaks the SMB2 open operation since it is
+	 * illegal to send an empty path name (without \\server\share prefix)
+	 * when the DFS flag is set in the SMB open header. We could
+	 * consider setting the flag on all operations other than open
+	 * but it is safer to net set it for now.
+	 */
+/*	if (tcon->share_flags & SHI1005_FLAGS_DFS)
+		hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
+
+	if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
+		hdr->Flags |= SMB2_FLAGS_SIGNED;
+out:
+	pdu->StructureSize2 = cpu_to_le16(parmsize);
+	return;
+}
+
+static int
+smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
+{
+	int rc = 0;
+	struct nls_table *nls_codepage;
+	struct cifs_ses *ses;
+	struct TCP_Server_Info *server;
+
+	/*
+	 * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
+	 * check for tcp and smb session status done differently
+	 * for those three - in the calling routine.
+	 */
+	if (tcon == NULL)
+		return rc;
+
+	if (smb2_command == SMB2_TREE_CONNECT)
+		return rc;
+
+	if (tcon->tidStatus == CifsExiting) {
+		/*
+		 * only tree disconnect, open, and write,
+		 * (and ulogoff which does not have tcon)
+		 * are allowed as we start force umount.
+		 */
+		if ((smb2_command != SMB2_WRITE) &&
+		   (smb2_command != SMB2_CREATE) &&
+		   (smb2_command != SMB2_TREE_DISCONNECT)) {
+			cifs_dbg(FYI, "can not send cmd %d while umounting\n",
+				 smb2_command);
+			return -ENODEV;
+		}
+	}
+	if ((!tcon->ses) || (tcon->ses->status == CifsExiting) ||
+	    (!tcon->ses->server))
+		return -EIO;
+
+	ses = tcon->ses;
+	server = ses->server;
+
+	/*
+	 * Give demultiplex thread up to 10 seconds to reconnect, should be
+	 * greater than cifs socket timeout which is 7 seconds
+	 */
+	while (server->tcpStatus == CifsNeedReconnect) {
+		/*
+		 * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE
+		 * here since they are implicitly done when session drops.
+		 */
+		switch (smb2_command) {
+		/*
+		 * BB Should we keep oplock break and add flush to exceptions?
+		 */
+		case SMB2_TREE_DISCONNECT:
+		case SMB2_CANCEL:
+		case SMB2_CLOSE:
+		case SMB2_OPLOCK_BREAK:
+			return -EAGAIN;
+		}
+
+		wait_event_interruptible_timeout(server->response_q,
+			(server->tcpStatus != CifsNeedReconnect), 10 * HZ);
+
+		/* are we still trying to reconnect? */
+		if (server->tcpStatus != CifsNeedReconnect)
+			break;
+
+		/*
+		 * on "soft" mounts we wait once. Hard mounts keep
+		 * retrying until process is killed or server comes
+		 * back on-line
+		 */
+		if (!tcon->retry) {
+			cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
+			return -EHOSTDOWN;
+		}
+	}
+
+	if (!tcon->ses->need_reconnect && !tcon->need_reconnect)
+		return rc;
+
+	nls_codepage = load_nls_default();
+
+	/*
+	 * need to prevent multiple threads trying to simultaneously reconnect
+	 * the same SMB session
+	 */
+	mutex_lock(&tcon->ses->session_mutex);
+	rc = cifs_negotiate_protocol(0, tcon->ses);
+	if (!rc && tcon->ses->need_reconnect)
+		rc = cifs_setup_session(0, tcon->ses, nls_codepage);
+
+	if (rc || !tcon->need_reconnect) {
+		mutex_unlock(&tcon->ses->session_mutex);
+		goto out;
+	}
+
+	cifs_mark_open_files_invalid(tcon);
+	rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage);
+	mutex_unlock(&tcon->ses->session_mutex);
+	cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
+	if (rc)
+		goto out;
+	atomic_inc(&tconInfoReconnectCount);
+out:
+	/*
+	 * Check if handle based operation so we know whether we can continue
+	 * or not without returning to caller to reset file handle.
+	 */
+	/*
+	 * BB Is flush done by server on drop of tcp session? Should we special
+	 * case it and skip above?
+	 */
+	switch (smb2_command) {
+	case SMB2_FLUSH:
+	case SMB2_READ:
+	case SMB2_WRITE:
+	case SMB2_LOCK:
+	case SMB2_IOCTL:
+	case SMB2_QUERY_DIRECTORY:
+	case SMB2_CHANGE_NOTIFY:
+	case SMB2_QUERY_INFO:
+	case SMB2_SET_INFO:
+		return -EAGAIN;
+	}
+	unload_nls(nls_codepage);
+	return rc;
+}
+
+/*
+ * Allocate and return pointer to an SMB request hdr, and set basic
+ * SMB information in the SMB header. If the return code is zero, this
+ * function must have filled in request_buf pointer.
+ */
+static int
+small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
+		void **request_buf)
+{
+	int rc = 0;
+
+	rc = smb2_reconnect(smb2_command, tcon);
+	if (rc)
+		return rc;
+
+	/* BB eventually switch this to SMB2 specific small buf size */
+	*request_buf = cifs_small_buf_get();
+	if (*request_buf == NULL) {
+		/* BB should we add a retry in here if not a writepage? */
+		return -ENOMEM;
+	}
+
+	smb2_hdr_assemble((struct smb2_hdr *) *request_buf, smb2_command, tcon);
+
+	if (tcon != NULL) {
+#ifdef CONFIG_CIFS_STATS2
+		uint16_t com_code = le16_to_cpu(smb2_command);
+		cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]);
+#endif
+		cifs_stats_inc(&tcon->num_smbs_sent);
+	}
+
+	return rc;
+}
+
+/*
+ *
+ *	SMB2 Worker functions follow:
+ *
+ *	The general structure of the worker functions is:
+ *	1) Call smb2_init (assembles SMB2 header)
+ *	2) Initialize SMB2 command specific fields in fixed length area of SMB
+ *	3) Call smb_sendrcv2 (sends request on socket and waits for response)
+ *	4) Decode SMB2 command specific fields in the fixed length area
+ *	5) Decode variable length data area (if any for this SMB2 command type)
+ *	6) Call free smb buffer
+ *	7) return
+ *
+ */
+
+int
+SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
+{
+	struct smb2_negotiate_req *req;
+	struct smb2_negotiate_rsp *rsp;
+	struct kvec iov[1];
+	int rc = 0;
+	int resp_buftype;
+	struct TCP_Server_Info *server = ses->server;
+	int blob_offset, blob_length;
+	char *security_blob;
+	int flags = CIFS_NEG_OP;
+
+	cifs_dbg(FYI, "Negotiate protocol\n");
+
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
+	}
+
+	rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->hdr.SessionId = 0;
+
+	req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
+
+	req->DialectCount = cpu_to_le16(1); /* One vers= at a time for now */
+	inc_rfc1001_len(req, 2);
+
+	/* only one of SMB2 signing flags may be set in SMB2 request */
+	if (ses->sign)
+		req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+	else if (global_secflags & CIFSSEC_MAY_SIGN)
+		req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+	else
+		req->SecurityMode = 0;
+
+	req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
+
+	/* ClientGUID must be zero for SMB2.02 dialect */
+	if (ses->server->vals->protocol_id == SMB20_PROT_ID)
+		memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE);
+	else
+		memcpy(req->ClientGUID, server->client_guid,
+			SMB2_CLIENT_GUID_SIZE);
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags);
+
+	rsp = (struct smb2_negotiate_rsp *)iov[0].iov_base;
+	/*
+	 * No tcon so can't do
+	 * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
+	 */
+	if (rc != 0)
+		goto neg_exit;
+
+	cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode);
+
+	/* BB we may eventually want to match the negotiated vs. requested
+	   dialect, even though we are only requesting one at a time */
+	if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID))
+		cifs_dbg(FYI, "negotiated smb2.0 dialect\n");
+	else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID))
+		cifs_dbg(FYI, "negotiated smb2.1 dialect\n");
+	else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
+		cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
+	else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
+		cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
+	else {
+		cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
+			 le16_to_cpu(rsp->DialectRevision));
+		rc = -EIO;
+		goto neg_exit;
+	}
+	server->dialect = le16_to_cpu(rsp->DialectRevision);
+
+	/* SMB2 only has an extended negflavor */
+	server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+	/* set it to the maximum buffer size value we can send with 1 credit */
+	server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize),
+			       SMB2_MAX_BUFFER_SIZE);
+	server->max_read = le32_to_cpu(rsp->MaxReadSize);
+	server->max_write = le32_to_cpu(rsp->MaxWriteSize);
+	/* BB Do we need to validate the SecurityMode? */
+	server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+	server->capabilities = le32_to_cpu(rsp->Capabilities);
+	/* Internal types */
+	server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
+
+	security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
+					       &rsp->hdr);
+	/*
+	 * See MS-SMB2 section 2.2.4: if no blob, client picks default which
+	 * for us will be
+	 *	ses->sectype = RawNTLMSSP;
+	 * but for time being this is our only auth choice so doesn't matter.
+	 * We just found a server which sets blob length to zero expecting raw.
+	 */
+	if (blob_length == 0)
+		cifs_dbg(FYI, "missing security blob on negprot\n");
+
+	rc = cifs_enable_signing(server, ses->sign);
+#ifdef CONFIG_SMB2_ASN1  /* BB REMOVEME when updated asn1.c ready */
+	if (rc)
+		goto neg_exit;
+	if (blob_length)
+		rc = decode_negTokenInit(security_blob, blob_length, server);
+	if (rc == 1)
+		rc = 0;
+	else if (rc == 0) {
+		rc = -EIO;
+		goto neg_exit;
+	}
+#endif
+
+neg_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
+int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc = 0;
+	struct validate_negotiate_info_req vneg_inbuf;
+	struct validate_negotiate_info_rsp *pneg_rsp;
+	u32 rsplen;
+
+	cifs_dbg(FYI, "validate negotiate\n");
+
+	/*
+	 * validation ioctl must be signed, so no point sending this if we
+	 * can not sign it.  We could eventually change this to selectively
+	 * sign just this, the first and only signed request on a connection.
+	 * This is good enough for now since a user who wants better security
+	 * would also enable signing on the mount. Having validation of
+	 * negotiate info for signed connections helps reduce attack vectors
+	 */
+	if (tcon->ses->server->sign == false)
+		return 0; /* validation requires signing */
+
+	vneg_inbuf.Capabilities =
+			cpu_to_le32(tcon->ses->server->vals->req_capabilities);
+	memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid,
+					SMB2_CLIENT_GUID_SIZE);
+
+	if (tcon->ses->sign)
+		vneg_inbuf.SecurityMode =
+			cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+	else if (global_secflags & CIFSSEC_MAY_SIGN)
+		vneg_inbuf.SecurityMode =
+			cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+	else
+		vneg_inbuf.SecurityMode = 0;
+
+	vneg_inbuf.DialectCount = cpu_to_le16(1);
+	vneg_inbuf.Dialects[0] =
+		cpu_to_le16(tcon->ses->server->vals->protocol_id);
+
+	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+		FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
+		(char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
+		(char **)&pneg_rsp, &rsplen);
+
+	if (rc != 0) {
+		cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
+		return -EIO;
+	}
+
+	if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
+		cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
+		return -EIO;
+	}
+
+	/* check validate negotiate info response matches what we got earlier */
+	if (pneg_rsp->Dialect !=
+			cpu_to_le16(tcon->ses->server->vals->protocol_id))
+		goto vneg_out;
+
+	if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode))
+		goto vneg_out;
+
+	/* do not validate server guid because not saved at negprot time yet */
+
+	if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND |
+	      SMB2_LARGE_FILES) != tcon->ses->server->capabilities)
+		goto vneg_out;
+
+	/* validate negotiate successful */
+	cifs_dbg(FYI, "validate negotiate info successful\n");
+	return 0;
+
+vneg_out:
+	cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
+	return -EIO;
+}
+
+int
+SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
+		const struct nls_table *nls_cp)
+{
+	struct smb2_sess_setup_req *req;
+	struct smb2_sess_setup_rsp *rsp = NULL;
+	struct kvec iov[2];
+	int rc = 0;
+	int resp_buftype = CIFS_NO_BUFFER;
+	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
+	struct TCP_Server_Info *server = ses->server;
+	u16 blob_length = 0;
+	char *security_blob;
+	char *ntlmssp_blob = NULL;
+	bool use_spnego = false; /* else use raw ntlmssp */
+
+	cifs_dbg(FYI, "Session Setup\n");
+
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
+	}
+
+	/*
+	 * If we are here due to reconnect, free per-smb session key
+	 * in case signing was required.
+	 */
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+
+	/*
+	 * If memory allocation is successful, caller of this function
+	 * frees it.
+	 */
+	ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+	if (!ses->ntlmssp)
+		return -ENOMEM;
+	ses->ntlmssp->sesskey_per_smbsess = true;
+
+	/* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
+	ses->sectype = RawNTLMSSP;
+
+ssetup_ntlmssp_authenticate:
+	if (phase == NtLmChallenge)
+		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
+
+	rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->hdr.SessionId = 0; /* First session, not a reauthenticate */
+	req->VcNumber = 0; /* MBZ */
+	/* to enable echos and oplocks */
+	req->hdr.CreditRequest = cpu_to_le16(3);
+
+	/* only one of SMB2 signing flags may be set in SMB2 request */
+	if (server->sign)
+		req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED;
+	else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */
+		req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED;
+	else
+		req->SecurityMode = 0;
+
+	req->Capabilities = 0;
+	req->Channel = 0; /* MBZ */
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field and 1 for pad */
+	iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+	if (phase == NtLmNegotiate) {
+		ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE),
+				       GFP_KERNEL);
+		if (ntlmssp_blob == NULL) {
+			rc = -ENOMEM;
+			goto ssetup_exit;
+		}
+		build_ntlmssp_negotiate_blob(ntlmssp_blob, ses);
+		if (use_spnego) {
+			/* blob_length = build_spnego_ntlmssp_blob(
+					&security_blob,
+					sizeof(struct _NEGOTIATE_MESSAGE),
+					ntlmssp_blob); */
+			/* BB eventually need to add this */
+			cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
+			rc = -EOPNOTSUPP;
+			kfree(ntlmssp_blob);
+			goto ssetup_exit;
+		} else {
+			blob_length = sizeof(struct _NEGOTIATE_MESSAGE);
+			/* with raw NTLMSSP we don't encapsulate in SPNEGO */
+			security_blob = ntlmssp_blob;
+		}
+	} else if (phase == NtLmAuthenticate) {
+		req->hdr.SessionId = ses->Suid;
+		ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500,
+				       GFP_KERNEL);
+		if (ntlmssp_blob == NULL) {
+			rc = -ENOMEM;
+			goto ssetup_exit;
+		}
+		rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses,
+					     nls_cp);
+		if (rc) {
+			cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n",
+				 rc);
+			goto ssetup_exit; /* BB double check error handling */
+		}
+		if (use_spnego) {
+			/* blob_length = build_spnego_ntlmssp_blob(
+							&security_blob,
+							blob_length,
+							ntlmssp_blob); */
+			cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
+			rc = -EOPNOTSUPP;
+			kfree(ntlmssp_blob);
+			goto ssetup_exit;
+		} else {
+			security_blob = ntlmssp_blob;
+		}
+	} else {
+		cifs_dbg(VFS, "illegal ntlmssp phase\n");
+		rc = -EIO;
+		goto ssetup_exit;
+	}
+
+	/* Testing shows that buffer offset must be at location of Buffer[0] */
+	req->SecurityBufferOffset =
+				cpu_to_le16(sizeof(struct smb2_sess_setup_req) -
+					    1 /* pad */ - 4 /* rfc1001 len */);
+	req->SecurityBufferLength = cpu_to_le16(blob_length);
+	iov[1].iov_base = security_blob;
+	iov[1].iov_len = blob_length;
+
+	inc_rfc1001_len(req, blob_length - 1 /* pad */);
+
+	/* BB add code to build os and lm fields */
+
+	rc = SendReceive2(xid, ses, iov, 2, &resp_buftype,
+			  CIFS_LOG_ERROR | CIFS_NEG_OP);
+
+	kfree(security_blob);
+	rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
+	if (resp_buftype != CIFS_NO_BUFFER &&
+	    rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) {
+		if (phase != NtLmNegotiate) {
+			cifs_dbg(VFS, "Unexpected more processing error\n");
+			goto ssetup_exit;
+		}
+		if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 !=
+				le16_to_cpu(rsp->SecurityBufferOffset)) {
+			cifs_dbg(VFS, "Invalid security buffer offset %d\n",
+				 le16_to_cpu(rsp->SecurityBufferOffset));
+			rc = -EIO;
+			goto ssetup_exit;
+		}
+
+		/* NTLMSSP Negotiate sent now processing challenge (response) */
+		phase = NtLmChallenge; /* process ntlmssp challenge */
+		rc = 0; /* MORE_PROCESSING is not an error here but expected */
+		ses->Suid = rsp->hdr.SessionId;
+		rc = decode_ntlmssp_challenge(rsp->Buffer,
+				le16_to_cpu(rsp->SecurityBufferLength), ses);
+	}
+
+	/*
+	 * BB eventually add code for SPNEGO decoding of NtlmChallenge blob,
+	 * but at least the raw NTLMSSP case works.
+	 */
+	/*
+	 * No tcon so can't do
+	 * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
+	 */
+	if (rc != 0)
+		goto ssetup_exit;
+
+	ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+	if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+		cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
+ssetup_exit:
+	free_rsp_buf(resp_buftype, rsp);
+
+	/* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
+	if ((phase == NtLmChallenge) && (rc == 0))
+		goto ssetup_ntlmssp_authenticate;
+
+	if (!rc) {
+		mutex_lock(&server->srv_mutex);
+		if (server->sign && server->ops->generate_signingkey) {
+			rc = server->ops->generate_signingkey(ses);
+			kfree(ses->auth_key.response);
+			ses->auth_key.response = NULL;
+			if (rc) {
+				cifs_dbg(FYI,
+					"SMB3 session key generation failed\n");
+				mutex_unlock(&server->srv_mutex);
+				goto keygen_exit;
+			}
+		}
+		if (!server->session_estab) {
+			server->sequence_number = 0x2;
+			server->session_estab = true;
+		}
+		mutex_unlock(&server->srv_mutex);
+
+		cifs_dbg(FYI, "SMB2/3 session established successfully\n");
+		spin_lock(&GlobalMid_Lock);
+		ses->status = CifsGood;
+		ses->need_reconnect = false;
+		spin_unlock(&GlobalMid_Lock);
+	}
+
+keygen_exit:
+	if (!server->sign) {
+		kfree(ses->auth_key.response);
+		ses->auth_key.response = NULL;
+	}
+	kfree(ses->ntlmssp);
+
+	return rc;
+}
+
+int
+SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
+{
+	struct smb2_logoff_req *req; /* response is also trivial struct */
+	int rc = 0;
+	struct TCP_Server_Info *server;
+
+	cifs_dbg(FYI, "disconnect session %p\n", ses);
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	/* no need to send SMB logoff if uid already closed due to reconnect */
+	if (ses->need_reconnect)
+		goto smb2_session_already_dead;
+
+	rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req);
+	if (rc)
+		return rc;
+
+	 /* since no tcon, smb2_init can not do this, so do here */
+	req->hdr.SessionId = ses->Suid;
+	if (server->sign)
+		req->hdr.Flags |= SMB2_FLAGS_SIGNED;
+
+	rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
+	/*
+	 * No tcon so can't do
+	 * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
+	 */
+
+smb2_session_already_dead:
+	return rc;
+}
+
+static inline void cifs_stats_fail_inc(struct cifs_tcon *tcon, uint16_t code)
+{
+	cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_failed[code]);
+}
+
+#define MAX_SHARENAME_LENGTH (255 /* server */ + 80 /* share */ + 1 /* NULL */)
+
+/* These are similar values to what Windows uses */
+static inline void init_copy_chunk_defaults(struct cifs_tcon *tcon)
+{
+	tcon->max_chunks = 256;
+	tcon->max_bytes_chunk = 1048576;
+	tcon->max_bytes_copy = 16777216;
+}
+
+int
+SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
+	  struct cifs_tcon *tcon, const struct nls_table *cp)
+{
+	struct smb2_tree_connect_req *req;
+	struct smb2_tree_connect_rsp *rsp = NULL;
+	struct kvec iov[2];
+	int rc = 0;
+	int resp_buftype;
+	int unc_path_len;
+	struct TCP_Server_Info *server;
+	__le16 *unc_path = NULL;
+
+	cifs_dbg(FYI, "TCON\n");
+
+	if ((ses->server) && tree)
+		server = ses->server;
+	else
+		return -EIO;
+
+	if (tcon && tcon->bad_network_name)
+		return -ENOENT;
+
+	unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
+	if (unc_path == NULL)
+		return -ENOMEM;
+
+	unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp) + 1;
+	unc_path_len *= 2;
+	if (unc_path_len < 2) {
+		kfree(unc_path);
+		return -EINVAL;
+	}
+
+	rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req);
+	if (rc) {
+		kfree(unc_path);
+		return rc;
+	}
+
+	if (tcon == NULL) {
+		/* since no tcon, smb2_init can not do this, so do here */
+		req->hdr.SessionId = ses->Suid;
+		/* if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED)
+			req->hdr.Flags |= SMB2_FLAGS_SIGNED; */
+	}
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field and 1 for pad */
+	iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+
+	/* Testing shows that buffer offset must be at location of Buffer[0] */
+	req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req)
+			- 1 /* pad */ - 4 /* do not count rfc1001 len field */);
+	req->PathLength = cpu_to_le16(unc_path_len - 2);
+	iov[1].iov_base = unc_path;
+	iov[1].iov_len = unc_path_len;
+
+	inc_rfc1001_len(req, unc_path_len - 1 /* pad */);
+
+	rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0);
+	rsp = (struct smb2_tree_connect_rsp *)iov[0].iov_base;
+
+	if (rc != 0) {
+		if (tcon) {
+			cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE);
+			tcon->need_reconnect = true;
+		}
+		goto tcon_error_exit;
+	}
+
+	if (tcon == NULL) {
+		ses->ipc_tid = rsp->hdr.TreeId;
+		goto tcon_exit;
+	}
+
+	if (rsp->ShareType & SMB2_SHARE_TYPE_DISK)
+		cifs_dbg(FYI, "connection to disk share\n");
+	else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) {
+		tcon->ipc = true;
+		cifs_dbg(FYI, "connection to pipe share\n");
+	} else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) {
+		tcon->print = true;
+		cifs_dbg(FYI, "connection to printer\n");
+	} else {
+		cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType);
+		rc = -EOPNOTSUPP;
+		goto tcon_error_exit;
+	}
+
+	tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
+	tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
+	tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
+	tcon->tidStatus = CifsGood;
+	tcon->need_reconnect = false;
+	tcon->tid = rsp->hdr.TreeId;
+	strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
+
+	if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
+	    ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
+		cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
+	init_copy_chunk_defaults(tcon);
+	if (tcon->ses->server->ops->validate_negotiate)
+		rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
+tcon_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	kfree(unc_path);
+	return rc;
+
+tcon_error_exit:
+	if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
+		cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
+		if (tcon)
+			tcon->bad_network_name = true;
+	}
+	goto tcon_exit;
+}
+
+int
+SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	struct smb2_tree_disconnect_req *req; /* response is trivial */
+	int rc = 0;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+
+	cifs_dbg(FYI, "Tree Disconnect\n");
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
+		return 0;
+
+	rc = small_smb2_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	rc = SendReceiveNoRsp(xid, ses, (char *)&req->hdr, 0);
+	if (rc)
+		cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE);
+
+	return rc;
+}
+
+
+static struct create_durable *
+create_durable_buf(void)
+{
+	struct create_durable *buf;
+
+	buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_durable, Data));
+	buf->ccontext.DataLength = cpu_to_le32(16);
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_durable, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	/* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DHnQ" */
+	buf->Name[0] = 'D';
+	buf->Name[1] = 'H';
+	buf->Name[2] = 'n';
+	buf->Name[3] = 'Q';
+	return buf;
+}
+
+static struct create_durable *
+create_reconnect_durable_buf(struct cifs_fid *fid)
+{
+	struct create_durable *buf;
+
+	buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_durable, Data));
+	buf->ccontext.DataLength = cpu_to_le32(16);
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_durable, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	buf->Data.Fid.PersistentFileId = fid->persistent_fid;
+	buf->Data.Fid.VolatileFileId = fid->volatile_fid;
+	/* SMB2_CREATE_DURABLE_HANDLE_RECONNECT is "DHnC" */
+	buf->Name[0] = 'D';
+	buf->Name[1] = 'H';
+	buf->Name[2] = 'n';
+	buf->Name[3] = 'C';
+	return buf;
+}
+
+static __u8
+parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
+		  unsigned int *epoch)
+{
+	char *data_offset;
+	struct create_context *cc;
+	unsigned int next = 0;
+	char *name;
+
+	data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+	cc = (struct create_context *)data_offset;
+	do {
+		cc = (struct create_context *)((char *)cc + next);
+		name = le16_to_cpu(cc->NameOffset) + (char *)cc;
+		if (le16_to_cpu(cc->NameLength) != 4 ||
+		    strncmp(name, "RqLs", 4)) {
+			next = le32_to_cpu(cc->Next);
+			continue;
+		}
+		return server->ops->parse_lease_buf(cc, epoch);
+	} while (next != 0);
+
+	return 0;
+}
+
+static int
+add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
+		  unsigned int *num_iovec, __u8 *oplock)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock);
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = server->vals->create_lease_size;
+	req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset = cpu_to_le32(
+				sizeof(struct smb2_create_req) - 4 +
+				iov[num - 1].iov_len);
+	le32_add_cpu(&req->CreateContextsLength,
+		     server->vals->create_lease_size);
+	inc_rfc1001_len(&req->hdr, server->vals->create_lease_size);
+	*num_iovec = num + 1;
+	return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+		    struct cifs_open_parms *oparms)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	if (oparms->reconnect) {
+		iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
+		/* indicate that we don't need to relock the file */
+		oparms->reconnect = false;
+	} else
+		iov[num].iov_base = create_durable_buf();
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct create_durable);
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset =
+			cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+								iov[1].iov_len);
+	le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable));
+	inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
+	*num_iovec = num + 1;
+	return 0;
+}
+
+int
+SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
+	  __u8 *oplock, struct smb2_file_all_info *buf,
+	  struct smb2_err_rsp **err_buf)
+{
+	struct smb2_create_req *req;
+	struct smb2_create_rsp *rsp;
+	struct TCP_Server_Info *server;
+	struct cifs_tcon *tcon = oparms->tcon;
+	struct cifs_ses *ses = tcon->ses;
+	struct kvec iov[4];
+	int resp_buftype;
+	int uni_path_len;
+	__le16 *copy_path = NULL;
+	int copy_size;
+	int rc = 0;
+	unsigned int num_iovecs = 2;
+	__u32 file_attributes = 0;
+	char *dhc_buf = NULL, *lc_buf = NULL;
+
+	cifs_dbg(FYI, "create/open\n");
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_CREATE, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	if (oparms->create_options & CREATE_OPTION_READONLY)
+		file_attributes |= ATTR_READONLY;
+	if (oparms->create_options & CREATE_OPTION_SPECIAL)
+		file_attributes |= ATTR_SYSTEM;
+
+	req->ImpersonationLevel = IL_IMPERSONATION;
+	req->DesiredAccess = cpu_to_le32(oparms->desired_access);
+	/* File attributes ignored on open (used in create though) */
+	req->FileAttributes = cpu_to_le32(file_attributes);
+	req->ShareAccess = FILE_SHARE_ALL_LE;
+	req->CreateDisposition = cpu_to_le32(oparms->disposition);
+	req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
+	uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
+	/* do not count rfc1001 len field */
+	req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4);
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	/* MUST set path len (NameLength) to 0 opening root of share */
+	req->NameLength = cpu_to_le16(uni_path_len - 2);
+	/* -1 since last byte is buf[0] which is sent below (path) */
+	iov[0].iov_len--;
+	if (uni_path_len % 8 != 0) {
+		copy_size = uni_path_len / 8 * 8;
+		if (copy_size < uni_path_len)
+			copy_size += 8;
+
+		copy_path = kzalloc(copy_size, GFP_KERNEL);
+		if (!copy_path)
+			return -ENOMEM;
+		memcpy((char *)copy_path, (const char *)path,
+			uni_path_len);
+		uni_path_len = copy_size;
+		path = copy_path;
+	}
+
+	iov[1].iov_len = uni_path_len;
+	iov[1].iov_base = path;
+	/* -1 since last byte is buf[0] which was counted in smb2_buf_len */
+	inc_rfc1001_len(req, uni_path_len - 1);
+
+	if (!server->oplocks)
+		*oplock = SMB2_OPLOCK_LEVEL_NONE;
+
+	if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) ||
+	    *oplock == SMB2_OPLOCK_LEVEL_NONE)
+		req->RequestedOplockLevel = *oplock;
+	else {
+		rc = add_lease_context(server, iov, &num_iovecs, oplock);
+		if (rc) {
+			cifs_small_buf_release(req);
+			kfree(copy_path);
+			return rc;
+		}
+		lc_buf = iov[num_iovecs-1].iov_base;
+	}
+
+	if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+		/* need to set Next field of lease context if we request it */
+		if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
+			struct create_context *ccontext =
+			    (struct create_context *)iov[num_iovecs-1].iov_base;
+			ccontext->Next =
+				cpu_to_le32(server->vals->create_lease_size);
+		}
+		rc = add_durable_context(iov, &num_iovecs, oparms);
+		if (rc) {
+			cifs_small_buf_release(req);
+			kfree(copy_path);
+			kfree(lc_buf);
+			return rc;
+		}
+		dhc_buf = iov[num_iovecs-1].iov_base;
+	}
+
+	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+	rsp = (struct smb2_create_rsp *)iov[0].iov_base;
+
+	if (rc != 0) {
+		cifs_stats_fail_inc(tcon, SMB2_CREATE_HE);
+		if (err_buf)
+			*err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4,
+					   GFP_KERNEL);
+		goto creat_exit;
+	}
+
+	oparms->fid->persistent_fid = rsp->PersistentFileId;
+	oparms->fid->volatile_fid = rsp->VolatileFileId;
+
+	if (buf) {
+		memcpy(buf, &rsp->CreationTime, 32);
+		buf->AllocationSize = rsp->AllocationSize;
+		buf->EndOfFile = rsp->EndofFile;
+		buf->Attributes = rsp->FileAttributes;
+		buf->NumberOfLinks = cpu_to_le32(1);
+		buf->DeletePending = 0;
+	}
+
+	if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
+		*oplock = parse_lease_state(server, rsp, &oparms->fid->epoch);
+	else
+		*oplock = rsp->OplockLevel;
+creat_exit:
+	kfree(copy_path);
+	kfree(lc_buf);
+	kfree(dhc_buf);
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
+/*
+ *	SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+	   u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data,
+	   u32 indatalen, char **out_data, u32 *plen /* returned data len */)
+{
+	struct smb2_ioctl_req *req;
+	struct smb2_ioctl_rsp *rsp;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct kvec iov[2];
+	int resp_buftype;
+	int num_iovecs;
+	int rc = 0;
+
+	cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+	if (out_data != NULL)
+		*out_data = NULL;
+
+	/* zero out returned data len, in case of error */
+	if (plen)
+		*plen = 0;
+
+	if (tcon)
+		ses = tcon->ses;
+	else
+		return -EIO;
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->CtlCode = cpu_to_le32(opcode);
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	if (indatalen) {
+		req->InputCount = cpu_to_le32(indatalen);
+		/* do not set InputOffset if no input data */
+		req->InputOffset =
+		       cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
+		iov[1].iov_base = in_data;
+		iov[1].iov_len = indatalen;
+		num_iovecs = 2;
+	} else
+		num_iovecs = 1;
+
+	req->OutputOffset = 0;
+	req->OutputCount = 0; /* MBZ */
+
+	/*
+	 * Could increase MaxOutputResponse, but that would require more
+	 * than one credit. Windows typically sets this smaller, but for some
+	 * ioctls it may be useful to allow server to send more. No point
+	 * limiting what the server can send as long as fits in one credit
+	 */
+	req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+
+	if (is_fsctl)
+		req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
+	else
+		req->Flags = 0;
+
+	iov[0].iov_base = (char *)req;
+
+	/*
+	 * If no input data, the size of ioctl struct in
+	 * protocol spec still includes a 1 byte data buffer,
+	 * but if input data passed to ioctl, we do not
+	 * want to double count this, so we do not send
+	 * the dummy one byte of data in iovec[0] if sending
+	 * input data (in iovec[1]). We also must add 4 bytes
+	 * in first iovec to allow for rfc1002 length field.
+	 */
+
+	if (indatalen) {
+		iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+		inc_rfc1001_len(req, indatalen - 1);
+	} else
+		iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+
+	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+	rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
+
+	if ((rc != 0) && (rc != -EINVAL)) {
+		cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+		goto ioctl_exit;
+	} else if (rc == -EINVAL) {
+		if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
+		    (opcode != FSCTL_SRV_COPYCHUNK)) {
+			cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+			goto ioctl_exit;
+		}
+	}
+
+	/* check if caller wants to look at return data or just return rc */
+	if ((plen == NULL) || (out_data == NULL))
+		goto ioctl_exit;
+
+	*plen = le32_to_cpu(rsp->OutputCount);
+
+	/* We check for obvious errors in the output buffer length and offset */
+	if (*plen == 0)
+		goto ioctl_exit; /* server returned no data */
+	else if (*plen > 0xFF00) {
+		cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
+		*plen = 0;
+		rc = -EIO;
+		goto ioctl_exit;
+	}
+
+	if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+		cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
+			le32_to_cpu(rsp->OutputOffset));
+		*plen = 0;
+		rc = -EIO;
+		goto ioctl_exit;
+	}
+
+	*out_data = kmalloc(*plen, GFP_KERNEL);
+	if (*out_data == NULL) {
+		rc = -ENOMEM;
+		goto ioctl_exit;
+	}
+
+	memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+	       *plen);
+ioctl_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
+/*
+ *   Individual callers to ioctl worker function follow
+ */
+
+int
+SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid)
+{
+	int rc;
+	struct  compress_ioctl fsctl_input;
+	char *ret_data = NULL;
+
+	fsctl_input.CompressionState =
+			cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+	rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+			FSCTL_SET_COMPRESSION, true /* is_fsctl */,
+			(char *)&fsctl_input /* data input */,
+			2 /* in data len */, &ret_data /* out data */, NULL);
+
+	cifs_dbg(FYI, "set compression rc %d\n", rc);
+
+	return rc;
+}
+
+int
+SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
+	   u64 persistent_fid, u64 volatile_fid)
+{
+	struct smb2_close_req *req;
+	struct smb2_close_rsp *rsp;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+	struct kvec iov[1];
+	int resp_buftype;
+	int rc = 0;
+
+	cifs_dbg(FYI, "Close\n");
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_CLOSE, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
+	rsp = (struct smb2_close_rsp *)iov[0].iov_base;
+
+	if (rc != 0) {
+		cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
+		goto close_exit;
+	}
+
+	/* BB FIXME - decode close response, update inode for caching */
+
+close_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
+static int
+validate_buf(unsigned int offset, unsigned int buffer_length,
+	     struct smb2_hdr *hdr, unsigned int min_buf_size)
+
+{
+	unsigned int smb_len = be32_to_cpu(hdr->smb2_buf_length);
+	char *end_of_smb = smb_len + 4 /* RFC1001 length field */ + (char *)hdr;
+	char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr;
+	char *end_of_buf = begin_of_buf + buffer_length;
+
+
+	if (buffer_length < min_buf_size) {
+		cifs_dbg(VFS, "buffer length %d smaller than minimum size %d\n",
+			 buffer_length, min_buf_size);
+		return -EINVAL;
+	}
+
+	/* check if beyond RFC1001 maximum length */
+	if ((smb_len > 0x7FFFFF) || (buffer_length > 0x7FFFFF)) {
+		cifs_dbg(VFS, "buffer length %d or smb length %d too large\n",
+			 buffer_length, smb_len);
+		return -EINVAL;
+	}
+
+	if ((begin_of_buf > end_of_smb) || (end_of_buf > end_of_smb)) {
+		cifs_dbg(VFS, "illegal server response, bad offset to data\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * If SMB buffer fields are valid, copy into temporary buffer to hold result.
+ * Caller must free buffer.
+ */
+static int
+validate_and_copy_buf(unsigned int offset, unsigned int buffer_length,
+		      struct smb2_hdr *hdr, unsigned int minbufsize,
+		      char *data)
+
+{
+	char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr;
+	int rc;
+
+	if (!data)
+		return -EINVAL;
+
+	rc = validate_buf(offset, buffer_length, hdr, minbufsize);
+	if (rc)
+		return rc;
+
+	memcpy(data, begin_of_buf, buffer_length);
+
+	return 0;
+}
+
+static int
+query_info(const unsigned int xid, struct cifs_tcon *tcon,
+	   u64 persistent_fid, u64 volatile_fid, u8 info_class,
+	   size_t output_len, size_t min_len, void *data)
+{
+	struct smb2_query_info_req *req;
+	struct smb2_query_info_rsp *rsp = NULL;
+	struct kvec iov[2];
+	int rc = 0;
+	int resp_buftype;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+
+	cifs_dbg(FYI, "Query Info\n");
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->InfoType = SMB2_O_INFO_FILE;
+	req->FileInfoClass = info_class;
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+	/* 4 for rfc1002 length field and 1 for Buffer */
+	req->InputBufferOffset =
+		cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
+	req->OutputBufferLength = cpu_to_le32(output_len);
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
+	rsp = (struct smb2_query_info_rsp *)iov[0].iov_base;
+
+	if (rc) {
+		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+		goto qinf_exit;
+	}
+
+	rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset),
+				   le32_to_cpu(rsp->OutputBufferLength),
+				   &rsp->hdr, min_len, data);
+
+qinf_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
+int
+SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+		u64 persistent_fid, u64 volatile_fid,
+		struct smb2_file_all_info *data)
+{
+	return query_info(xid, tcon, persistent_fid, volatile_fid,
+			  FILE_ALL_INFORMATION,
+			  sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
+			  sizeof(struct smb2_file_all_info), data);
+}
+
+int
+SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
+		 u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid)
+{
+	return query_info(xid, tcon, persistent_fid, volatile_fid,
+			  FILE_INTERNAL_INFORMATION,
+			  sizeof(struct smb2_file_internal_info),
+			  sizeof(struct smb2_file_internal_info), uniqueid);
+}
+
+/*
+ * This is a no-op for now. We're not really interested in the reply, but
+ * rather in the fact that the server sent one and that server->lstrp
+ * gets updated.
+ *
+ * FIXME: maybe we should consider checking that the reply matches request?
+ */
+static void
+smb2_echo_callback(struct mid_q_entry *mid)
+{
+	struct TCP_Server_Info *server = mid->callback_data;
+	struct smb2_echo_rsp *smb2 = (struct smb2_echo_rsp *)mid->resp_buf;
+	unsigned int credits_received = 1;
+
+	if (mid->mid_state == MID_RESPONSE_RECEIVED)
+		credits_received = le16_to_cpu(smb2->hdr.CreditRequest);
+
+	DeleteMidQEntry(mid);
+	add_credits(server, credits_received, CIFS_ECHO_OP);
+}
+
+int
+SMB2_echo(struct TCP_Server_Info *server)
+{
+	struct smb2_echo_req *req;
+	int rc = 0;
+	struct kvec iov;
+	struct smb_rqst rqst = { .rq_iov = &iov,
+				 .rq_nvec = 1 };
+
+	cifs_dbg(FYI, "In echo request\n");
+
+	rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req);
+	if (rc)
+		return rc;
+
+	req->hdr.CreditRequest = cpu_to_le16(1);
+
+	iov.iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov.iov_len = get_rfc1002_length(req) + 4;
+
+	rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server,
+			     CIFS_ECHO_OP);
+	if (rc)
+		cifs_dbg(FYI, "Echo request failed: %d\n", rc);
+
+	cifs_small_buf_release(req);
+	return rc;
+}
+
+int
+SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+	   u64 volatile_fid)
+{
+	struct smb2_flush_req *req;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+	struct kvec iov[1];
+	int resp_buftype;
+	int rc = 0;
+
+	cifs_dbg(FYI, "Flush\n");
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
+
+	if (rc != 0)
+		cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
+
+	free_rsp_buf(resp_buftype, iov[0].iov_base);
+	return rc;
+}
+
+/*
+ * To form a chain of read requests, any read requests after the first should
+ * have the end_of_chain boolean set to true.
+ */
+static int
+smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms,
+		  unsigned int remaining_bytes, int request_type)
+{
+	int rc = -EACCES;
+	struct smb2_read_req *req = NULL;
+
+	rc = small_smb2_init(SMB2_READ, io_parms->tcon, (void **) &req);
+	if (rc)
+		return rc;
+	if (io_parms->tcon->ses->server == NULL)
+		return -ECONNABORTED;
+
+	req->hdr.ProcessId = cpu_to_le32(io_parms->pid);
+
+	req->PersistentFileId = io_parms->persistent_fid;
+	req->VolatileFileId = io_parms->volatile_fid;
+	req->ReadChannelInfoOffset = 0; /* reserved */
+	req->ReadChannelInfoLength = 0; /* reserved */
+	req->Channel = 0; /* reserved */
+	req->MinimumCount = 0;
+	req->Length = cpu_to_le32(io_parms->length);
+	req->Offset = cpu_to_le64(io_parms->offset);
+
+	if (request_type & CHAINED_REQUEST) {
+		if (!(request_type & END_OF_CHAIN)) {
+			/* 4 for rfc1002 length field */
+			req->hdr.NextCommand =
+				cpu_to_le32(get_rfc1002_length(req) + 4);
+		} else /* END_OF_CHAIN */
+			req->hdr.NextCommand = 0;
+		if (request_type & RELATED_REQUEST) {
+			req->hdr.Flags |= SMB2_FLAGS_RELATED_OPERATIONS;
+			/*
+			 * Related requests use info from previous read request
+			 * in chain.
+			 */
+			req->hdr.SessionId = 0xFFFFFFFF;
+			req->hdr.TreeId = 0xFFFFFFFF;
+			req->PersistentFileId = 0xFFFFFFFF;
+			req->VolatileFileId = 0xFFFFFFFF;
+		}
+	}
+	if (remaining_bytes > io_parms->length)
+		req->RemainingBytes = cpu_to_le32(remaining_bytes);
+	else
+		req->RemainingBytes = 0;
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+	return rc;
+}
+
+static void
+smb2_readv_callback(struct mid_q_entry *mid)
+{
+	struct cifs_readdata *rdata = mid->callback_data;
+	struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct smb2_hdr *buf = (struct smb2_hdr *)rdata->iov.iov_base;
+	unsigned int credits_received = 1;
+	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
+				 .rq_nvec = 1,
+				 .rq_pages = rdata->pages,
+				 .rq_npages = rdata->nr_pages,
+				 .rq_pagesz = rdata->pagesz,
+				 .rq_tailsz = rdata->tailsz };
+
+	cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+		 __func__, mid->mid, mid->mid_state, rdata->result,
+		 rdata->bytes);
+
+	switch (mid->mid_state) {
+	case MID_RESPONSE_RECEIVED:
+		credits_received = le16_to_cpu(buf->CreditRequest);
+		/* result already set, check signature */
+		if (server->sign) {
+			int rc;
+
+			rc = smb2_verify_signature(&rqst, server);
+			if (rc)
+				cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+					 rc);
+		}
+		/* FIXME: should this be counted toward the initiating task? */
+		task_io_account_read(rdata->got_bytes);
+		cifs_stats_bytes_read(tcon, rdata->got_bytes);
+		break;
+	case MID_REQUEST_SUBMITTED:
+	case MID_RETRY_NEEDED:
+		rdata->result = -EAGAIN;
+		if (server->sign && rdata->got_bytes)
+			/* reset bytes number since we can not check a sign */
+			rdata->got_bytes = 0;
+		/* FIXME: should this be counted toward the initiating task? */
+		task_io_account_read(rdata->got_bytes);
+		cifs_stats_bytes_read(tcon, rdata->got_bytes);
+		break;
+	default:
+		if (rdata->result != -ENODATA)
+			rdata->result = -EIO;
+	}
+
+	if (rdata->result)
+		cifs_stats_fail_inc(tcon, SMB2_READ_HE);
+
+	queue_work(cifsiod_wq, &rdata->work);
+	DeleteMidQEntry(mid);
+	add_credits(server, credits_received, 0);
+}
+
+/* smb2_async_readv - send an async write, and set up mid to handle result */
+int
+smb2_async_readv(struct cifs_readdata *rdata)
+{
+	int rc, flags = 0;
+	struct smb2_hdr *buf;
+	struct cifs_io_parms io_parms;
+	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
+				 .rq_nvec = 1 };
+	struct TCP_Server_Info *server;
+
+	cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
+		 __func__, rdata->offset, rdata->bytes);
+
+	io_parms.tcon = tlink_tcon(rdata->cfile->tlink);
+	io_parms.offset = rdata->offset;
+	io_parms.length = rdata->bytes;
+	io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
+	io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
+	io_parms.pid = rdata->pid;
+
+	server = io_parms.tcon->ses->server;
+
+	rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
+	if (rc) {
+		if (rc == -EAGAIN && rdata->credits) {
+			/* credits was reset by reconnect */
+			rdata->credits = 0;
+			/* reduce in_flight value since we won't send the req */
+			spin_lock(&server->req_lock);
+			server->in_flight--;
+			spin_unlock(&server->req_lock);
+		}
+		return rc;
+	}
+
+	buf = (struct smb2_hdr *)rdata->iov.iov_base;
+	/* 4 for rfc1002 length field */
+	rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+
+	if (rdata->credits) {
+		buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
+						SMB2_MAX_BUFFER_SIZE));
+		spin_lock(&server->req_lock);
+		server->credits += rdata->credits -
+						le16_to_cpu(buf->CreditCharge);
+		spin_unlock(&server->req_lock);
+		wake_up(&server->request_q);
+		flags = CIFS_HAS_CREDITS;
+	}
+
+	kref_get(&rdata->refcount);
+	rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
+			     cifs_readv_receive, smb2_readv_callback,
+			     rdata, flags);
+	if (rc) {
+		kref_put(&rdata->refcount, cifs_readdata_release);
+		cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
+	}
+
+	cifs_small_buf_release(buf);
+	return rc;
+}
+
+int
+SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
+	  unsigned int *nbytes, char **buf, int *buf_type)
+{
+	int resp_buftype, rc = -EACCES;
+	struct smb2_read_rsp *rsp = NULL;
+	struct kvec iov[1];
+
+	*nbytes = 0;
+	rc = smb2_new_read_req(iov, io_parms, 0, 0);
+	if (rc)
+		return rc;
+
+	rc = SendReceive2(xid, io_parms->tcon->ses, iov, 1,
+			  &resp_buftype, CIFS_LOG_ERROR);
+
+	rsp = (struct smb2_read_rsp *)iov[0].iov_base;
+
+	if (rsp->hdr.Status == STATUS_END_OF_FILE) {
+		free_rsp_buf(resp_buftype, iov[0].iov_base);
+		return 0;
+	}
+
+	if (rc) {
+		cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
+		cifs_dbg(VFS, "Send error in read = %d\n", rc);
+	} else {
+		*nbytes = le32_to_cpu(rsp->DataLength);
+		if ((*nbytes > CIFS_MAX_MSGSIZE) ||
+		    (*nbytes > io_parms->length)) {
+			cifs_dbg(FYI, "bad length %d for count %d\n",
+				 *nbytes, io_parms->length);
+			rc = -EIO;
+			*nbytes = 0;
+		}
+	}
+
+	if (*buf) {
+		memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset,
+		       *nbytes);
+		free_rsp_buf(resp_buftype, iov[0].iov_base);
+	} else if (resp_buftype != CIFS_NO_BUFFER) {
+		*buf = iov[0].iov_base;
+		if (resp_buftype == CIFS_SMALL_BUFFER)
+			*buf_type = CIFS_SMALL_BUFFER;
+		else if (resp_buftype == CIFS_LARGE_BUFFER)
+			*buf_type = CIFS_LARGE_BUFFER;
+	}
+	return rc;
+}
+
+/*
+ * Check the mid_state and signature on received buffer (if any), and queue the
+ * workqueue completion task.
+ */
+static void
+smb2_writev_callback(struct mid_q_entry *mid)
+{
+	struct cifs_writedata *wdata = mid->callback_data;
+	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+	unsigned int written;
+	struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
+	unsigned int credits_received = 1;
+
+	switch (mid->mid_state) {
+	case MID_RESPONSE_RECEIVED:
+		credits_received = le16_to_cpu(rsp->hdr.CreditRequest);
+		wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
+		if (wdata->result != 0)
+			break;
+
+		written = le32_to_cpu(rsp->DataLength);
+		/*
+		 * Mask off high 16 bits when bytes written as returned
+		 * by the server is greater than bytes requested by the
+		 * client. OS/2 servers are known to set incorrect
+		 * CountHigh values.
+		 */
+		if (written > wdata->bytes)
+			written &= 0xFFFF;
+
+		if (written < wdata->bytes)
+			wdata->result = -ENOSPC;
+		else
+			wdata->bytes = written;
+		break;
+	case MID_REQUEST_SUBMITTED:
+	case MID_RETRY_NEEDED:
+		wdata->result = -EAGAIN;
+		break;
+	default:
+		wdata->result = -EIO;
+		break;
+	}
+
+	if (wdata->result)
+		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
+
+	queue_work(cifsiod_wq, &wdata->work);
+	DeleteMidQEntry(mid);
+	add_credits(tcon->ses->server, credits_received, 0);
+}
+
+/* smb2_async_writev - send an async write, and set up mid to handle result */
+int
+smb2_async_writev(struct cifs_writedata *wdata,
+		  void (*release)(struct kref *kref))
+{
+	int rc = -EACCES, flags = 0;
+	struct smb2_write_req *req = NULL;
+	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct kvec iov;
+	struct smb_rqst rqst;
+
+	rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
+	if (rc) {
+		if (rc == -EAGAIN && wdata->credits) {
+			/* credits was reset by reconnect */
+			wdata->credits = 0;
+			/* reduce in_flight value since we won't send the req */
+			spin_lock(&server->req_lock);
+			server->in_flight--;
+			spin_unlock(&server->req_lock);
+		}
+		goto async_writev_out;
+	}
+
+	req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
+
+	req->PersistentFileId = wdata->cfile->fid.persistent_fid;
+	req->VolatileFileId = wdata->cfile->fid.volatile_fid;
+	req->WriteChannelInfoOffset = 0;
+	req->WriteChannelInfoLength = 0;
+	req->Channel = 0;
+	req->Offset = cpu_to_le64(wdata->offset);
+	/* 4 for rfc1002 length field */
+	req->DataOffset = cpu_to_le16(
+				offsetof(struct smb2_write_req, Buffer) - 4);
+	req->RemainingBytes = 0;
+
+	/* 4 for rfc1002 length field and 1 for Buffer */
+	iov.iov_len = get_rfc1002_length(req) + 4 - 1;
+	iov.iov_base = req;
+
+	rqst.rq_iov = &iov;
+	rqst.rq_nvec = 1;
+	rqst.rq_pages = wdata->pages;
+	rqst.rq_npages = wdata->nr_pages;
+	rqst.rq_pagesz = wdata->pagesz;
+	rqst.rq_tailsz = wdata->tailsz;
+
+	cifs_dbg(FYI, "async write at %llu %u bytes\n",
+		 wdata->offset, wdata->bytes);
+
+	req->Length = cpu_to_le32(wdata->bytes);
+
+	inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+
+	if (wdata->credits) {
+		req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
+						    SMB2_MAX_BUFFER_SIZE));
+		spin_lock(&server->req_lock);
+		server->credits += wdata->credits -
+					le16_to_cpu(req->hdr.CreditCharge);
+		spin_unlock(&server->req_lock);
+		wake_up(&server->request_q);
+		flags = CIFS_HAS_CREDITS;
+	}
+
+	kref_get(&wdata->refcount);
+	rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata,
+			     flags);
+
+	if (rc) {
+		kref_put(&wdata->refcount, release);
+		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
+	}
+
+async_writev_out:
+	cifs_small_buf_release(req);
+	return rc;
+}
+
+/*
+ * SMB2_write function gets iov pointer to kvec array with n_vec as a length.
+ * The length field from io_parms must be at least 1 and indicates a number of
+ * elements with data to write that begins with position 1 in iov array. All
+ * data length is specified by count.
+ */
+int
+SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
+	   unsigned int *nbytes, struct kvec *iov, int n_vec)
+{
+	int rc = 0;
+	struct smb2_write_req *req = NULL;
+	struct smb2_write_rsp *rsp = NULL;
+	int resp_buftype;
+	*nbytes = 0;
+
+	if (n_vec < 1)
+		return rc;
+
+	rc = small_smb2_init(SMB2_WRITE, io_parms->tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	if (io_parms->tcon->ses->server == NULL)
+		return -ECONNABORTED;
+
+	req->hdr.ProcessId = cpu_to_le32(io_parms->pid);
+
+	req->PersistentFileId = io_parms->persistent_fid;
+	req->VolatileFileId = io_parms->volatile_fid;
+	req->WriteChannelInfoOffset = 0;
+	req->WriteChannelInfoLength = 0;
+	req->Channel = 0;
+	req->Length = cpu_to_le32(io_parms->length);
+	req->Offset = cpu_to_le64(io_parms->offset);
+	/* 4 for rfc1002 length field */
+	req->DataOffset = cpu_to_le16(
+				offsetof(struct smb2_write_req, Buffer) - 4);
+	req->RemainingBytes = 0;
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field and 1 for Buffer */
+	iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+
+	/* length of entire message including data to be written */
+	inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */);
+
+	rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1,
+			  &resp_buftype, 0);
+	rsp = (struct smb2_write_rsp *)iov[0].iov_base;
+
+	if (rc) {
+		cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
+		cifs_dbg(VFS, "Send error in write = %d\n", rc);
+	} else
+		*nbytes = le32_to_cpu(rsp->DataLength);
+
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
+static unsigned int
+num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
+{
+	int len;
+	unsigned int entrycount = 0;
+	unsigned int next_offset = 0;
+	FILE_DIRECTORY_INFO *entryptr;
+
+	if (bufstart == NULL)
+		return 0;
+
+	entryptr = (FILE_DIRECTORY_INFO *)bufstart;
+
+	while (1) {
+		entryptr = (FILE_DIRECTORY_INFO *)
+					((char *)entryptr + next_offset);
+
+		if ((char *)entryptr + size > end_of_buf) {
+			cifs_dbg(VFS, "malformed search entry would overflow\n");
+			break;
+		}
+
+		len = le32_to_cpu(entryptr->FileNameLength);
+		if ((char *)entryptr + len + size > end_of_buf) {
+			cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n",
+				 end_of_buf);
+			break;
+		}
+
+		*lastentry = (char *)entryptr;
+		entrycount++;
+
+		next_offset = le32_to_cpu(entryptr->NextEntryOffset);
+		if (!next_offset)
+			break;
+	}
+
+	return entrycount;
+}
+
+/*
+ * Readdir/FindFirst
+ */
+int
+SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid, int index,
+		     struct cifs_search_info *srch_inf)
+{
+	struct smb2_query_directory_req *req;
+	struct smb2_query_directory_rsp *rsp = NULL;
+	struct kvec iov[2];
+	int rc = 0;
+	int len;
+	int resp_buftype = CIFS_NO_BUFFER;
+	unsigned char *bufptr;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+	__le16 asteriks = cpu_to_le16('*');
+	char *end_of_smb;
+	unsigned int output_size = CIFSMaxBufSize;
+	size_t info_buf_size;
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	switch (srch_inf->info_level) {
+	case SMB_FIND_FILE_DIRECTORY_INFO:
+		req->FileInformationClass = FILE_DIRECTORY_INFORMATION;
+		info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1;
+		break;
+	case SMB_FIND_FILE_ID_FULL_DIR_INFO:
+		req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION;
+		info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
+		break;
+	default:
+		cifs_dbg(VFS, "info level %u isn't supported\n",
+			 srch_inf->info_level);
+		rc = -EINVAL;
+		goto qdir_exit;
+	}
+
+	req->FileIndex = cpu_to_le32(index);
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	len = 0x2;
+	bufptr = req->Buffer;
+	memcpy(bufptr, &asteriks, len);
+
+	req->FileNameOffset =
+		cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1 - 4);
+	req->FileNameLength = cpu_to_le16(len);
+	/*
+	 * BB could be 30 bytes or so longer if we used SMB2 specific
+	 * buffer lengths, but this is safe and close enough.
+	 */
+	output_size = min_t(unsigned int, output_size, server->maxBuf);
+	output_size = min_t(unsigned int, output_size, 2 << 15);
+	req->OutputBufferLength = cpu_to_le32(output_size);
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for RFC1001 length and 1 for Buffer */
+	iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+
+	iov[1].iov_base = (char *)(req->Buffer);
+	iov[1].iov_len = len;
+
+	inc_rfc1001_len(req, len - 1 /* Buffer */);
+
+	rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0);
+	rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
+
+	if (rc) {
+		if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
+			srch_inf->endOfSearch = true;
+			rc = 0;
+		}
+		cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+		goto qdir_exit;
+	}
+
+	rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
+			  le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
+			  info_buf_size);
+	if (rc)
+		goto qdir_exit;
+
+	srch_inf->unicode = true;
+
+	if (srch_inf->ntwrk_buf_start) {
+		if (srch_inf->smallBuf)
+			cifs_small_buf_release(srch_inf->ntwrk_buf_start);
+		else
+			cifs_buf_release(srch_inf->ntwrk_buf_start);
+	}
+	srch_inf->ntwrk_buf_start = (char *)rsp;
+	srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ +
+		(char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset);
+	/* 4 for rfc1002 length field */
+	end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr;
+	srch_inf->entries_in_buffer =
+			num_entries(srch_inf->srch_entries_start, end_of_smb,
+				    &srch_inf->last_entry, info_buf_size);
+	srch_inf->index_of_last_entry += srch_inf->entries_in_buffer;
+	cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n",
+		 srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
+		 srch_inf->srch_entries_start, srch_inf->last_entry);
+	if (resp_buftype == CIFS_LARGE_BUFFER)
+		srch_inf->smallBuf = false;
+	else if (resp_buftype == CIFS_SMALL_BUFFER)
+		srch_inf->smallBuf = true;
+	else
+		cifs_dbg(VFS, "illegal search buffer type\n");
+
+	return rc;
+
+qdir_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
+static int
+send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+	       u64 persistent_fid, u64 volatile_fid, u32 pid, int info_class,
+	       unsigned int num, void **data, unsigned int *size)
+{
+	struct smb2_set_info_req *req;
+	struct smb2_set_info_rsp *rsp = NULL;
+	struct kvec *iov;
+	int rc = 0;
+	int resp_buftype;
+	unsigned int i;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	if (!num)
+		return -EINVAL;
+
+	iov = kmalloc(sizeof(struct kvec) * num, GFP_KERNEL);
+	if (!iov)
+		return -ENOMEM;
+
+	rc = small_smb2_init(SMB2_SET_INFO, tcon, (void **) &req);
+	if (rc) {
+		kfree(iov);
+		return rc;
+	}
+
+	req->hdr.ProcessId = cpu_to_le32(pid);
+
+	req->InfoType = SMB2_O_INFO_FILE;
+	req->FileInfoClass = info_class;
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	/* 4 for RFC1001 length and 1 for Buffer */
+	req->BufferOffset =
+			cpu_to_le16(sizeof(struct smb2_set_info_req) - 1 - 4);
+	req->BufferLength = cpu_to_le32(*size);
+
+	inc_rfc1001_len(req, *size - 1 /* Buffer */);
+
+	memcpy(req->Buffer, *data, *size);
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for RFC1001 length */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	for (i = 1; i < num; i++) {
+		inc_rfc1001_len(req, size[i]);
+		le32_add_cpu(&req->BufferLength, size[i]);
+		iov[i].iov_base = (char *)data[i];
+		iov[i].iov_len = size[i];
+	}
+
+	rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0);
+	rsp = (struct smb2_set_info_rsp *)iov[0].iov_base;
+
+	if (rc != 0)
+		cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
+
+	free_rsp_buf(resp_buftype, rsp);
+	kfree(iov);
+	return rc;
+}
+
+int
+SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
+	    u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
+{
+	struct smb2_file_rename_info info;
+	void **data;
+	unsigned int size[2];
+	int rc;
+	int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
+
+	data = kmalloc(sizeof(void *) * 2, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	info.ReplaceIfExists = 1; /* 1 = replace existing target with new */
+			      /* 0 = fail if target already exists */
+	info.RootDirectory = 0;  /* MBZ for network ops (why does spec say?) */
+	info.FileNameLength = cpu_to_le32(len);
+
+	data[0] = &info;
+	size[0] = sizeof(struct smb2_file_rename_info);
+
+	data[1] = target_file;
+	size[1] = len + 2 /* null */;
+
+	rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
+			   current->tgid, FILE_RENAME_INFORMATION, 2, data,
+			   size);
+	kfree(data);
+	return rc;
+}
+
+int
+SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
+		  u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
+{
+	struct smb2_file_link_info info;
+	void **data;
+	unsigned int size[2];
+	int rc;
+	int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
+
+	data = kmalloc(sizeof(void *) * 2, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	info.ReplaceIfExists = 0; /* 1 = replace existing link with new */
+			      /* 0 = fail if link already exists */
+	info.RootDirectory = 0;  /* MBZ for network ops (why does spec say?) */
+	info.FileNameLength = cpu_to_le32(len);
+
+	data[0] = &info;
+	size[0] = sizeof(struct smb2_file_link_info);
+
+	data[1] = target_file;
+	size[1] = len + 2 /* null */;
+
+	rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
+			   current->tgid, FILE_LINK_INFORMATION, 2, data, size);
+	kfree(data);
+	return rc;
+}
+
+int
+SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+	     u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc)
+{
+	struct smb2_file_eof_info info;
+	void *data;
+	unsigned int size;
+
+	info.EndOfFile = *eof;
+
+	data = &info;
+	size = sizeof(struct smb2_file_eof_info);
+
+	if (is_falloc)
+		return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+			pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size);
+	else
+		return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+			pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size);
+}
+
+int
+SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+	      u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf)
+{
+	unsigned int size;
+	size = sizeof(FILE_BASIC_INFO);
+	return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+			     current->tgid, FILE_BASIC_INFORMATION, 1,
+			     (void **)&buf, &size);
+}
+
+int
+SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
+		  const u64 persistent_fid, const u64 volatile_fid,
+		  __u8 oplock_level)
+{
+	int rc;
+	struct smb2_oplock_break *req = NULL;
+
+	cifs_dbg(FYI, "SMB2_oplock_break\n");
+	rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
+
+	if (rc)
+		return rc;
+
+	req->VolatileFid = volatile_fid;
+	req->PersistentFid = persistent_fid;
+	req->OplockLevel = oplock_level;
+	req->hdr.CreditRequest = cpu_to_le16(1);
+
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP);
+	/* SMB2 buffer freed by function above */
+
+	if (rc) {
+		cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
+		cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc);
+	}
+
+	return rc;
+}
+
+static void
+copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf,
+			struct kstatfs *kst)
+{
+	kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) *
+			  le32_to_cpu(pfs_inf->SectorsPerAllocationUnit);
+	kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits);
+	kst->f_bfree  = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits);
+	kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits);
+	return;
+}
+
+static int
+build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
+		   int outbuf_len, u64 persistent_fid, u64 volatile_fid)
+{
+	int rc;
+	struct smb2_query_info_req *req;
+
+	cifs_dbg(FYI, "Query FSInfo level %d\n", level);
+
+	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->InfoType = SMB2_O_INFO_FILESYSTEM;
+	req->FileInfoClass = level;
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+	/* 4 for rfc1002 length field and 1 for pad */
+	req->InputBufferOffset =
+			cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
+	req->OutputBufferLength = cpu_to_le32(
+		outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4);
+
+	iov->iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov->iov_len = get_rfc1002_length(req) + 4;
+	return 0;
+}
+
+int
+SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
+	      u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata)
+{
+	struct smb2_query_info_rsp *rsp = NULL;
+	struct kvec iov;
+	int rc = 0;
+	int resp_buftype;
+	struct cifs_ses *ses = tcon->ses;
+	struct smb2_fs_full_size_info *info = NULL;
+
+	rc = build_qfs_info_req(&iov, tcon, FS_FULL_SIZE_INFORMATION,
+				sizeof(struct smb2_fs_full_size_info),
+				persistent_fid, volatile_fid);
+	if (rc)
+		return rc;
+
+	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+	if (rc) {
+		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+		goto qfsinf_exit;
+	}
+	rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+
+	info = (struct smb2_fs_full_size_info *)(4 /* RFC1001 len */ +
+		le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr);
+	rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
+			  le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
+			  sizeof(struct smb2_fs_full_size_info));
+	if (!rc)
+		copy_fs_info_to_kstatfs(info, fsdata);
+
+qfsinf_exit:
+	free_rsp_buf(resp_buftype, iov.iov_base);
+	return rc;
+}
+
+int
+SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+	      u64 persistent_fid, u64 volatile_fid, int level)
+{
+	struct smb2_query_info_rsp *rsp = NULL;
+	struct kvec iov;
+	int rc = 0;
+	int resp_buftype, max_len, min_len;
+	struct cifs_ses *ses = tcon->ses;
+	unsigned int rsp_len, offset;
+
+	if (level == FS_DEVICE_INFORMATION) {
+		max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+		min_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+	} else if (level == FS_ATTRIBUTE_INFORMATION) {
+		max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO);
+		min_len = MIN_FS_ATTR_INFO_SIZE;
+	} else if (level == FS_SECTOR_SIZE_INFORMATION) {
+		max_len = sizeof(struct smb3_fs_ss_info);
+		min_len = sizeof(struct smb3_fs_ss_info);
+	} else {
+		cifs_dbg(FYI, "Invalid qfsinfo level %d\n", level);
+		return -EINVAL;
+	}
+
+	rc = build_qfs_info_req(&iov, tcon, level, max_len,
+				persistent_fid, volatile_fid);
+	if (rc)
+		return rc;
+
+	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+	if (rc) {
+		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+		goto qfsattr_exit;
+	}
+	rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+
+	rsp_len = le32_to_cpu(rsp->OutputBufferLength);
+	offset = le16_to_cpu(rsp->OutputBufferOffset);
+	rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len);
+	if (rc)
+		goto qfsattr_exit;
+
+	if (level == FS_ATTRIBUTE_INFORMATION)
+		memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset
+			+ (char *)&rsp->hdr, min_t(unsigned int,
+			rsp_len, max_len));
+	else if (level == FS_DEVICE_INFORMATION)
+		memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset
+			+ (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
+	else if (level == FS_SECTOR_SIZE_INFORMATION) {
+		struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
+			(4 /* RFC1001 len */ + offset + (char *)&rsp->hdr);
+		tcon->ss_flags = le32_to_cpu(ss_info->Flags);
+		tcon->perf_sector_size =
+			le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
+	}
+
+qfsattr_exit:
+	free_rsp_buf(resp_buftype, iov.iov_base);
+	return rc;
+}
+
+int
+smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
+	   const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid,
+	   const __u32 num_lock, struct smb2_lock_element *buf)
+{
+	int rc = 0;
+	struct smb2_lock_req *req = NULL;
+	struct kvec iov[2];
+	int resp_buf_type;
+	unsigned int count;
+
+	cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
+
+	rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->hdr.ProcessId = cpu_to_le32(pid);
+	req->LockCount = cpu_to_le16(num_lock);
+
+	req->PersistentFileId = persist_fid;
+	req->VolatileFileId = volatile_fid;
+
+	count = num_lock * sizeof(struct smb2_lock_element);
+	inc_rfc1001_len(req, count - sizeof(struct smb2_lock_element));
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field and count for all locks */
+	iov[0].iov_len = get_rfc1002_length(req) + 4 - count;
+	iov[1].iov_base = (char *)buf;
+	iov[1].iov_len = count;
+
+	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
+	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
+	if (rc) {
+		cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
+		cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
+	}
+
+	return rc;
+}
+
+int
+SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
+	  const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid,
+	  const __u64 length, const __u64 offset, const __u32 lock_flags,
+	  const bool wait)
+{
+	struct smb2_lock_element lock;
+
+	lock.Offset = cpu_to_le64(offset);
+	lock.Length = cpu_to_le64(length);
+	lock.Flags = cpu_to_le32(lock_flags);
+	if (!wait && lock_flags != SMB2_LOCKFLAG_UNLOCK)
+		lock.Flags |= cpu_to_le32(SMB2_LOCKFLAG_FAIL_IMMEDIATELY);
+
+	return smb2_lockv(xid, tcon, persist_fid, volatile_fid, pid, 1, &lock);
+}
+
+int
+SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
+		 __u8 *lease_key, const __le32 lease_state)
+{
+	int rc;
+	struct smb2_lease_ack *req = NULL;
+
+	cifs_dbg(FYI, "SMB2_lease_break\n");
+	rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
+
+	if (rc)
+		return rc;
+
+	req->hdr.CreditRequest = cpu_to_le16(1);
+	req->StructureSize = cpu_to_le16(36);
+	inc_rfc1001_len(req, 12);
+
+	memcpy(req->LeaseKey, lease_key, 16);
+	req->LeaseState = lease_state;
+
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP);
+	/* SMB2 buffer freed by function above */
+
+	if (rc) {
+		cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
+		cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
+	}
+
+	return rc;
+}
diff --git a/kernel/fs/cifs/smb2pdu.h b/kernel/fs/cifs/smb2pdu.h
new file mode 100644
index 000000000..70867d54f
--- /dev/null
+++ b/kernel/fs/cifs/smb2pdu.h
@@ -0,0 +1,1080 @@
+/*
+ *   fs/cifs/smb2pdu.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2009, 2013
+ *                 Etersoft, 2012
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _SMB2PDU_H
+#define _SMB2PDU_H
+
+#include <net/sock.h>
+
+/*
+ * Note that, due to trying to use names similar to the protocol specifications,
+ * there are many mixed case field names in the structures below.  Although
+ * this does not match typical Linux kernel style, it is necessary to be
+ * be able to match against the protocol specfication.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+ * (ie no useful data other than the SMB error code itself) and are marked such.
+ * Knowing this helps avoid response buffer allocations and copy in some cases.
+ */
+
+/* List of commands in host endian */
+#define SMB2_NEGOTIATE_HE	0x0000
+#define SMB2_SESSION_SETUP_HE	0x0001
+#define SMB2_LOGOFF_HE		0x0002 /* trivial request/resp */
+#define SMB2_TREE_CONNECT_HE	0x0003
+#define SMB2_TREE_DISCONNECT_HE	0x0004 /* trivial req/resp */
+#define SMB2_CREATE_HE		0x0005
+#define SMB2_CLOSE_HE		0x0006
+#define SMB2_FLUSH_HE		0x0007 /* trivial resp */
+#define SMB2_READ_HE		0x0008
+#define SMB2_WRITE_HE		0x0009
+#define SMB2_LOCK_HE		0x000A
+#define SMB2_IOCTL_HE		0x000B
+#define SMB2_CANCEL_HE		0x000C
+#define SMB2_ECHO_HE		0x000D
+#define SMB2_QUERY_DIRECTORY_HE	0x000E
+#define SMB2_CHANGE_NOTIFY_HE	0x000F
+#define SMB2_QUERY_INFO_HE	0x0010
+#define SMB2_SET_INFO_HE	0x0011
+#define SMB2_OPLOCK_BREAK_HE	0x0012
+
+/* The same list in little endian */
+#define SMB2_NEGOTIATE		cpu_to_le16(SMB2_NEGOTIATE_HE)
+#define SMB2_SESSION_SETUP	cpu_to_le16(SMB2_SESSION_SETUP_HE)
+#define SMB2_LOGOFF		cpu_to_le16(SMB2_LOGOFF_HE)
+#define SMB2_TREE_CONNECT	cpu_to_le16(SMB2_TREE_CONNECT_HE)
+#define SMB2_TREE_DISCONNECT	cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
+#define SMB2_CREATE		cpu_to_le16(SMB2_CREATE_HE)
+#define SMB2_CLOSE		cpu_to_le16(SMB2_CLOSE_HE)
+#define SMB2_FLUSH		cpu_to_le16(SMB2_FLUSH_HE)
+#define SMB2_READ		cpu_to_le16(SMB2_READ_HE)
+#define SMB2_WRITE		cpu_to_le16(SMB2_WRITE_HE)
+#define SMB2_LOCK		cpu_to_le16(SMB2_LOCK_HE)
+#define SMB2_IOCTL		cpu_to_le16(SMB2_IOCTL_HE)
+#define SMB2_CANCEL		cpu_to_le16(SMB2_CANCEL_HE)
+#define SMB2_ECHO		cpu_to_le16(SMB2_ECHO_HE)
+#define SMB2_QUERY_DIRECTORY	cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
+#define SMB2_CHANGE_NOTIFY	cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
+#define SMB2_QUERY_INFO		cpu_to_le16(SMB2_QUERY_INFO_HE)
+#define SMB2_SET_INFO		cpu_to_le16(SMB2_SET_INFO_HE)
+#define SMB2_OPLOCK_BREAK	cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+
+#define NUMBER_OF_SMB2_COMMANDS	0x0013
+
+/* BB FIXME - analyze following length BB */
+#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
+
+#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
+
+/*
+ * SMB2 Header Definition
+ *
+ * "MBZ" :  Must be Zero
+ * "BB"  :  BugBug, Something to check/review/analyze later
+ * "PDU" :  "Protocol Data Unit" (ie a network "frame")
+ *
+ */
+
+#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
+
+struct smb2_hdr {
+	__be32 smb2_buf_length;	/* big endian on wire */
+				/* length is only two or three bytes - with
+				 one or two byte type preceding it that MBZ */
+	__u8   ProtocolId[4];	/* 0xFE 'S' 'M' 'B' */
+	__le16 StructureSize;	/* 64 */
+	__le16 CreditCharge;	/* MBZ */
+	__le32 Status;		/* Error from server */
+	__le16 Command;
+	__le16 CreditRequest;  /* CreditResponse */
+	__le32 Flags;
+	__le32 NextCommand;
+	__le64 MessageId;
+	__le32 ProcessId;
+	__u32  TreeId;		/* opaque - so do not make little endian */
+	__u64  SessionId;	/* opaque - so do not make little endian */
+	__u8   Signature[16];
+} __packed;
+
+struct smb2_pdu {
+	struct smb2_hdr hdr;
+	__le16 StructureSize2; /* size of wct area (varies, request specific) */
+} __packed;
+
+struct smb2_transform_hdr {
+	__be32 smb2_buf_length;	/* big endian on wire */
+				/* length is only two or three bytes - with
+				 one or two byte type preceding it that MBZ */
+	__u8   ProtocolId[4];	/* 0xFD 'S' 'M' 'B' */
+	__u8   Signature[16];
+	__u8   Nonce[11];
+	__u8   Reserved[5];
+	__le32 OriginalMessageSize;
+	__u16  Reserved1;
+	__le16 EncryptionAlgorithm;
+	__u64  SessionId;
+} __packed;
+
+/* Encryption Algorithms */
+#define SMB2_ENCRYPTION_AES128_CCM	cpu_to_le16(0x0001)
+
+/*
+ *	SMB2 flag definitions
+ */
+#define SMB2_FLAGS_SERVER_TO_REDIR	cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND	cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS	cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED		cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_DFS_OPERATIONS	cpu_to_le32(0x10000000)
+
+/*
+ *	Definitions for SMB2 Protocol Data Units (network frames)
+ *
+ *  See MS-SMB2.PDF specification for protocol details.
+ *  The Naming convention is the lower case version of the SMB2
+ *  command code name for the struct. Note that structures must be packed.
+ *
+ */
+
+#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
+
+struct smb2_err_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;
+	__le16 Reserved; /* MBZ */
+	__le32 ByteCount;  /* even if zero, at least one byte follows */
+	__u8   ErrorData[1];  /* variable length */
+} __packed;
+
+struct smb2_symlink_err_rsp {
+	__le32 SymLinkLength;
+	__le32 SymLinkErrorTag;
+	__le32 ReparseTag;
+	__le16 ReparseDataLength;
+	__le16 UnparsedPathLength;
+	__le16 SubstituteNameOffset;
+	__le16 SubstituteNameLength;
+	__le16 PrintNameOffset;
+	__le16 PrintNameLength;
+	__le32 Flags;
+	__u8  PathBuffer[0];
+} __packed;
+
+#define SMB2_CLIENT_GUID_SIZE 16
+
+struct smb2_negotiate_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 36 */
+	__le16 DialectCount;
+	__le16 SecurityMode;
+	__le16 Reserved;	/* MBZ */
+	__le32 Capabilities;
+	__u8   ClientGUID[SMB2_CLIENT_GUID_SIZE];
+	__le64 ClientStartTime;	/* MBZ */
+	__le16 Dialects[1]; /* One dialect (vers=) at a time for now */
+} __packed;
+
+/* Dialects */
+#define SMB20_PROT_ID 0x0202
+#define SMB21_PROT_ID 0x0210
+#define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
+#define BAD_PROT_ID   0xFFFF
+
+/* SecurityMode flags */
+#define	SMB2_NEGOTIATE_SIGNING_ENABLED	0x0001
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED	0x0002
+/* Capabilities flags */
+#define SMB2_GLOBAL_CAP_DFS		0x00000001
+#define SMB2_GLOBAL_CAP_LEASING		0x00000002 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_LARGE_MTU	0X00000004 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_MULTI_CHANNEL	0x00000008 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING  0x00000020 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_ENCRYPTION	0x00000040 /* New to SMB3 */
+/* Internal types */
+#define SMB2_NT_FIND			0x00100000
+#define SMB2_LARGE_FILES		0x00200000
+
+struct smb2_negotiate_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 65 */
+	__le16 SecurityMode;
+	__le16 DialectRevision;
+	__le16 Reserved;	/* MBZ */
+	__u8   ServerGUID[16];
+	__le32 Capabilities;
+	__le32 MaxTransactSize;
+	__le32 MaxReadSize;
+	__le32 MaxWriteSize;
+	__le64 SystemTime;	/* MBZ */
+	__le64 ServerStartTime;
+	__le16 SecurityBufferOffset;
+	__le16 SecurityBufferLength;
+	__le32 Reserved2;	/* may be any value, ignore */
+	__u8   Buffer[1];	/* variable length GSS security buffer */
+} __packed;
+
+struct smb2_sess_setup_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 25 */
+	__u8   VcNumber;
+	__u8   SecurityMode;
+	__le32 Capabilities;
+	__le32 Channel;
+	__le16 SecurityBufferOffset;
+	__le16 SecurityBufferLength;
+	__le64 PreviousSessionId;
+	__u8   Buffer[1];	/* variable length GSS security buffer */
+} __packed;
+
+/* Currently defined SessionFlags */
+#define SMB2_SESSION_FLAG_IS_GUEST	0x0001
+#define SMB2_SESSION_FLAG_IS_NULL	0x0002
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA	0x0004
+struct smb2_sess_setup_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 9 */
+	__le16 SessionFlags;
+	__le16 SecurityBufferOffset;
+	__le16 SecurityBufferLength;
+	__u8   Buffer[1];	/* variable length GSS security buffer */
+} __packed;
+
+struct smb2_logoff_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 4 */
+	__le16 Reserved;
+} __packed;
+
+struct smb2_logoff_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 4 */
+	__le16 Reserved;
+} __packed;
+
+struct smb2_tree_connect_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 9 */
+	__le16 Reserved;
+	__le16 PathOffset;
+	__le16 PathLength;
+	__u8   Buffer[1];	/* variable length */
+} __packed;
+
+struct smb2_tree_connect_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 16 */
+	__u8   ShareType;  /* see below */
+	__u8   Reserved;
+	__le32 ShareFlags; /* see below */
+	__le32 Capabilities; /* see below */
+	__le32 MaximalAccess;
+} __packed;
+
+/* Possible ShareType values */
+#define SMB2_SHARE_TYPE_DISK	0x01
+#define SMB2_SHARE_TYPE_PIPE	0x02
+#define	SMB2_SHARE_TYPE_PRINT	0x03
+
+/*
+ * Possible ShareFlags - exactly one and only one of the first 4 caching flags
+ * must be set (any of the remaining, SHI1005, flags may be set individually
+ * or in combination.
+ */
+#define SMB2_SHAREFLAG_MANUAL_CACHING			0x00000000
+#define SMB2_SHAREFLAG_AUTO_CACHING			0x00000010
+#define SMB2_SHAREFLAG_VDO_CACHING			0x00000020
+#define SMB2_SHAREFLAG_NO_CACHING			0x00000030
+#define SHI1005_FLAGS_DFS				0x00000001
+#define SHI1005_FLAGS_DFS_ROOT				0x00000002
+#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS		0x00000100
+#define SHI1005_FLAGS_FORCE_SHARED_DELETE		0x00000200
+#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING		0x00000400
+#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM	0x00000800
+#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK		0x00001000
+#define SHI1005_FLAGS_ENABLE_HASH_V1			0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2			0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA			0x00008000
+#define SHI1005_FLAGS_ALL				0x0000FF33
+
+/* Possible share capabilities */
+#define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT	cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER	cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
+
+struct smb2_tree_disconnect_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 4 */
+	__le16 Reserved;
+} __packed;
+
+struct smb2_tree_disconnect_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 4 */
+	__le16 Reserved;
+} __packed;
+
+/* File Attrubutes */
+#define FILE_ATTRIBUTE_READONLY			0x00000001
+#define FILE_ATTRIBUTE_HIDDEN			0x00000002
+#define FILE_ATTRIBUTE_SYSTEM			0x00000004
+#define FILE_ATTRIBUTE_DIRECTORY		0x00000010
+#define FILE_ATTRIBUTE_ARCHIVE			0x00000020
+#define FILE_ATTRIBUTE_NORMAL			0x00000080
+#define FILE_ATTRIBUTE_TEMPORARY		0x00000100
+#define FILE_ATTRIBUTE_SPARSE_FILE		0x00000200
+#define FILE_ATTRIBUTE_REPARSE_POINT		0x00000400
+#define FILE_ATTRIBUTE_COMPRESSED		0x00000800
+#define FILE_ATTRIBUTE_OFFLINE			0x00001000
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED	0x00002000
+#define FILE_ATTRIBUTE_ENCRYPTED		0x00004000
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM		0x00008000
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA		0x00020000
+
+/* Oplock levels */
+#define SMB2_OPLOCK_LEVEL_NONE		0x00
+#define SMB2_OPLOCK_LEVEL_II		0x01
+#define SMB2_OPLOCK_LEVEL_EXCLUSIVE	0x08
+#define SMB2_OPLOCK_LEVEL_BATCH		0x09
+#define SMB2_OPLOCK_LEVEL_LEASE		0xFF
+/* Non-spec internal type */
+#define SMB2_OPLOCK_LEVEL_NOCHANGE	0x99
+
+/* Desired Access Flags */
+#define FILE_READ_DATA_LE		cpu_to_le32(0x00000001)
+#define FILE_WRITE_DATA_LE		cpu_to_le32(0x00000002)
+#define FILE_APPEND_DATA_LE		cpu_to_le32(0x00000004)
+#define FILE_READ_EA_LE			cpu_to_le32(0x00000008)
+#define FILE_WRITE_EA_LE		cpu_to_le32(0x00000010)
+#define FILE_EXECUTE_LE			cpu_to_le32(0x00000020)
+#define FILE_READ_ATTRIBUTES_LE		cpu_to_le32(0x00000080)
+#define FILE_WRITE_ATTRIBUTES_LE	cpu_to_le32(0x00000100)
+#define FILE_DELETE_LE			cpu_to_le32(0x00010000)
+#define FILE_READ_CONTROL_LE		cpu_to_le32(0x00020000)
+#define FILE_WRITE_DAC_LE		cpu_to_le32(0x00040000)
+#define FILE_WRITE_OWNER_LE		cpu_to_le32(0x00080000)
+#define FILE_SYNCHRONIZE_LE		cpu_to_le32(0x00100000)
+#define FILE_ACCESS_SYSTEM_SECURITY_LE	cpu_to_le32(0x01000000)
+#define FILE_MAXIMAL_ACCESS_LE		cpu_to_le32(0x02000000)
+#define FILE_GENERIC_ALL_LE		cpu_to_le32(0x10000000)
+#define FILE_GENERIC_EXECUTE_LE		cpu_to_le32(0x20000000)
+#define FILE_GENERIC_WRITE_LE		cpu_to_le32(0x40000000)
+#define FILE_GENERIC_READ_LE		cpu_to_le32(0x80000000)
+
+/* ShareAccess Flags */
+#define FILE_SHARE_READ_LE		cpu_to_le32(0x00000001)
+#define FILE_SHARE_WRITE_LE		cpu_to_le32(0x00000002)
+#define FILE_SHARE_DELETE_LE		cpu_to_le32(0x00000004)
+#define FILE_SHARE_ALL_LE		cpu_to_le32(0x00000007)
+
+/* CreateDisposition Flags */
+#define FILE_SUPERSEDE_LE		cpu_to_le32(0x00000000)
+#define FILE_OPEN_LE			cpu_to_le32(0x00000001)
+#define FILE_CREATE_LE			cpu_to_le32(0x00000002)
+#define	FILE_OPEN_IF_LE			cpu_to_le32(0x00000003)
+#define FILE_OVERWRITE_LE		cpu_to_le32(0x00000004)
+#define FILE_OVERWRITE_IF_LE		cpu_to_le32(0x00000005)
+
+/* CreateOptions Flags */
+#define FILE_DIRECTORY_FILE_LE		cpu_to_le32(0x00000001)
+/* same as #define CREATE_NOT_FILE_LE	cpu_to_le32(0x00000001) */
+#define FILE_WRITE_THROUGH_LE		cpu_to_le32(0x00000002)
+#define FILE_SEQUENTIAL_ONLY_LE		cpu_to_le32(0x00000004)
+#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008)
+#define FILE_SYNCHRONOUS_IO_ALERT_LE	cpu_to_le32(0x00000010)
+#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE	cpu_to_le32(0x00000020)
+#define FILE_NON_DIRECTORY_FILE_LE	cpu_to_le32(0x00000040)
+#define FILE_COMPLETE_IF_OPLOCKED_LE	cpu_to_le32(0x00000100)
+#define FILE_NO_EA_KNOWLEDGE_LE		cpu_to_le32(0x00000200)
+#define FILE_RANDOM_ACCESS_LE		cpu_to_le32(0x00000800)
+#define FILE_DELETE_ON_CLOSE_LE		cpu_to_le32(0x00001000)
+#define FILE_OPEN_BY_FILE_ID_LE		cpu_to_le32(0x00002000)
+#define FILE_OPEN_FOR_BACKUP_INTENT_LE	cpu_to_le32(0x00004000)
+#define FILE_NO_COMPRESSION_LE		cpu_to_le32(0x00008000)
+#define FILE_RESERVE_OPFILTER_LE	cpu_to_le32(0x00100000)
+#define FILE_OPEN_REPARSE_POINT_LE	cpu_to_le32(0x00200000)
+#define FILE_OPEN_NO_RECALL_LE		cpu_to_le32(0x00400000)
+#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000)
+
+#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
+			| FILE_READ_ATTRIBUTES_LE)
+#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
+			| FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
+#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
+
+/* Impersonation Levels */
+#define IL_ANONYMOUS		cpu_to_le32(0x00000000)
+#define IL_IDENTIFICATION	cpu_to_le32(0x00000001)
+#define IL_IMPERSONATION	cpu_to_le32(0x00000002)
+#define IL_DELEGATE		cpu_to_le32(0x00000003)
+
+/* Create Context Values */
+#define SMB2_CREATE_EA_BUFFER			"ExtA" /* extended attributes */
+#define SMB2_CREATE_SD_BUFFER			"SecD" /* security descriptor */
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST	"DHnQ"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT	"DHnC"
+#define SMB2_CREATE_ALLOCATION_SIZE		"AISi"
+#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
+#define SMB2_CREATE_TIMEWARP_REQUEST		"TWrp"
+#define SMB2_CREATE_QUERY_ON_DISK_ID		"QFid"
+#define SMB2_CREATE_REQUEST_LEASE		"RqLs"
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2	"DH2Q"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2	"DH2C"
+#define SMB2_CREATE_APP_INSTANCE_ID	0x45BCA66AEFA7F74A9008FA462E144D74
+#define SVHDX_OPEN_DEVICE_CONTEXT	0x83CE6F1AD851E0986E34401CC9BCFCE9
+
+struct smb2_create_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u8   SecurityFlags;
+	__u8   RequestedOplockLevel;
+	__le32 ImpersonationLevel;
+	__le64 SmbCreateFlags;
+	__le64 Reserved;
+	__le32 DesiredAccess;
+	__le32 FileAttributes;
+	__le32 ShareAccess;
+	__le32 CreateDisposition;
+	__le32 CreateOptions;
+	__le16 NameOffset;
+	__le16 NameLength;
+	__le32 CreateContextsOffset;
+	__le32 CreateContextsLength;
+	__u8   Buffer[0];
+} __packed;
+
+struct smb2_create_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 89 */
+	__u8   OplockLevel;
+	__u8   Reserved;
+	__le32 CreateAction;
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le64 AllocationSize;
+	__le64 EndofFile;
+	__le32 FileAttributes;
+	__le32 Reserved2;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 CreateContextsOffset;
+	__le32 CreateContextsLength;
+	__u8   Buffer[1];
+} __packed;
+
+struct create_context {
+	__le32 Next;
+	__le16 NameOffset;
+	__le16 NameLength;
+	__le16 Reserved;
+	__le16 DataOffset;
+	__le32 DataLength;
+	__u8 Buffer[0];
+} __packed;
+
+#define SMB2_LEASE_READ_CACHING_HE	0x01
+#define SMB2_LEASE_HANDLE_CACHING_HE	0x02
+#define SMB2_LEASE_WRITE_CACHING_HE	0x04
+
+#define SMB2_LEASE_NONE			cpu_to_le32(0x00)
+#define SMB2_LEASE_READ_CACHING		cpu_to_le32(0x01)
+#define SMB2_LEASE_HANDLE_CACHING	cpu_to_le32(0x02)
+#define SMB2_LEASE_WRITE_CACHING	cpu_to_le32(0x04)
+
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02)
+
+#define SMB2_LEASE_KEY_SIZE 16
+
+struct lease_context {
+	__le64 LeaseKeyLow;
+	__le64 LeaseKeyHigh;
+	__le32 LeaseState;
+	__le32 LeaseFlags;
+	__le64 LeaseDuration;
+} __packed;
+
+struct lease_context_v2 {
+	__le64 LeaseKeyLow;
+	__le64 LeaseKeyHigh;
+	__le32 LeaseState;
+	__le32 LeaseFlags;
+	__le64 LeaseDuration;
+	__le64 ParentLeaseKeyLow;
+	__le64 ParentLeaseKeyHigh;
+	__le16 Epoch;
+	__le16 Reserved;
+} __packed;
+
+struct create_lease {
+	struct create_context ccontext;
+	__u8   Name[8];
+	struct lease_context lcontext;
+} __packed;
+
+struct create_lease_v2 {
+	struct create_context ccontext;
+	__u8   Name[8];
+	struct lease_context_v2 lcontext;
+	__u8   Pad[4];
+} __packed;
+
+struct create_durable {
+	struct create_context ccontext;
+	__u8   Name[8];
+	union {
+		__u8  Reserved[16];
+		struct {
+			__u64 PersistentFileId;
+			__u64 VolatileFileId;
+		} Fid;
+	} Data;
+} __packed;
+
+#define COPY_CHUNK_RES_KEY_SIZE	24
+struct resume_key_req {
+	char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
+	__le32	ContextLength;	/* MBZ */
+	char	Context[0];	/* ignored, Windows sets to 4 bytes of zero */
+} __packed;
+
+/* this goes in the ioctl buffer when doing a copychunk request */
+struct copychunk_ioctl {
+	char SourceKey[COPY_CHUNK_RES_KEY_SIZE];
+	__le32 ChunkCount; /* we are only sending 1 */
+	__le32 Reserved;
+	/* array will only be one chunk long for us */
+	__le64 SourceOffset;
+	__le64 TargetOffset;
+	__le32 Length; /* how many bytes to copy */
+	__u32 Reserved2;
+} __packed;
+
+/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
+struct file_zero_data_information {
+	__le64	FileOffset;
+	__le64	BeyondFinalZero;
+} __packed;
+
+struct copychunk_ioctl_rsp {
+	__le32 ChunksWritten;
+	__le32 ChunkBytesWritten;
+	__le32 TotalBytesWritten;
+} __packed;
+
+struct validate_negotiate_info_req {
+	__le32 Capabilities;
+	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
+	__le16 SecurityMode;
+	__le16 DialectCount;
+	__le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
+} __packed;
+
+struct validate_negotiate_info_rsp {
+	__le32 Capabilities;
+	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
+	__le16 SecurityMode;
+	__le16 Dialect; /* Dialect in use for the connection */
+} __packed;
+
+#define RSS_CAPABLE	0x00000001
+#define RDMA_CAPABLE	0x00000002
+
+struct network_interface_info_ioctl_rsp {
+	__le32 Next; /* next interface. zero if this is last one */
+	__le32 IfIndex;
+	__le32 Capability; /* RSS or RDMA Capable */
+	__le32 Reserved;
+	__le64 LinkSpeed;
+	char	SockAddr_Storage[128];
+} __packed;
+
+#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
+
+struct compress_ioctl {
+	__le16 CompressionState; /* See cifspdu.h for possible flag values */
+} __packed;
+
+struct smb2_ioctl_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u16 Reserved;
+	__le32 CtlCode;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 InputOffset;
+	__le32 InputCount;
+	__le32 MaxInputResponse;
+	__le32 OutputOffset;
+	__le32 OutputCount;
+	__le32 MaxOutputResponse;
+	__le32 Flags;
+	__u32  Reserved2;
+	__u8   Buffer[0];
+} __packed;
+
+struct smb2_ioctl_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u16 Reserved;
+	__le32 CtlCode;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 InputOffset;
+	__le32 InputCount;
+	__le32 OutputOffset;
+	__le32 OutputCount;
+	__le32 Flags;
+	__u32  Reserved2;
+	/* char * buffer[] */
+} __packed;
+
+/* Currently defined values for close flags */
+#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB	cpu_to_le16(0x0001)
+struct smb2_close_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 24 */
+	__le16 Flags;
+	__le32 Reserved;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+} __packed;
+
+struct smb2_close_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* 60 */
+	__le16 Flags;
+	__le32 Reserved;
+	__le64 CreationTime;
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le64 AllocationSize;	/* Beginning of FILE_STANDARD_INFO equivalent */
+	__le64 EndOfFile;
+	__le32 Attributes;
+} __packed;
+
+struct smb2_flush_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 24 */
+	__le16 Reserved1;
+	__le32 Reserved2;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+} __packed;
+
+struct smb2_flush_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;
+	__le16 Reserved;
+} __packed;
+
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED	0x01
+
+/* Channel field for read and write: exactly one of following flags can be set*/
+#define SMB2_CHANNEL_NONE		0x00000000
+#define SMB2_CHANNEL_RDMA_V1		0x00000001 /* SMB3 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */
+
+struct smb2_read_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 49 */
+	__u8   Padding; /* offset from start of SMB2 header to place read */
+	__u8   Flags; /* MBZ unless SMB3.02 or later */
+	__le32 Length;
+	__le64 Offset;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 MinimumCount;
+	__le32 Channel; /* MBZ except for SMB3 or later */
+	__le32 RemainingBytes;
+	__le16 ReadChannelInfoOffset; /* Reserved MBZ */
+	__le16 ReadChannelInfoLength; /* Reserved MBZ */
+	__u8   Buffer[1];
+} __packed;
+
+struct smb2_read_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 17 */
+	__u8   DataOffset;
+	__u8   Reserved;
+	__le32 DataLength;
+	__le32 DataRemaining;
+	__u32  Reserved2;
+	__u8   Buffer[1];
+} __packed;
+
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH	0x00000001	/* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED	0x00000002	/* SMB3.02 or later */
+
+struct smb2_write_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 49 */
+	__le16 DataOffset; /* offset from start of SMB2 header to write data */
+	__le32 Length;
+	__le64 Offset;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 Channel; /* Reserved MBZ */
+	__le32 RemainingBytes;
+	__le16 WriteChannelInfoOffset; /* Reserved MBZ */
+	__le16 WriteChannelInfoLength; /* Reserved MBZ */
+	__le32 Flags;
+	__u8   Buffer[1];
+} __packed;
+
+struct smb2_write_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 17 */
+	__u8   DataOffset;
+	__u8   Reserved;
+	__le32 DataLength;
+	__le32 DataRemaining;
+	__u32  Reserved2;
+	__u8   Buffer[1];
+} __packed;
+
+#define SMB2_LOCKFLAG_SHARED_LOCK	0x0001
+#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK	0x0002
+#define SMB2_LOCKFLAG_UNLOCK		0x0004
+#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY	0x0010
+
+struct smb2_lock_element {
+	__le64 Offset;
+	__le64 Length;
+	__le32 Flags;
+	__le32 Reserved;
+} __packed;
+
+struct smb2_lock_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 48 */
+	__le16 LockCount;
+	__le32 Reserved;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	/* Followed by at least one */
+	struct smb2_lock_element locks[1];
+} __packed;
+
+struct smb2_lock_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 4 */
+	__le16 Reserved;
+} __packed;
+
+struct smb2_echo_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 4 */
+	__u16  Reserved;
+} __packed;
+
+struct smb2_echo_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 4 */
+	__u16  Reserved;
+} __packed;
+
+/* search (query_directory) Flags field */
+#define SMB2_RESTART_SCANS		0x01
+#define SMB2_RETURN_SINGLE_ENTRY	0x02
+#define SMB2_INDEX_SPECIFIED		0x04
+#define SMB2_REOPEN			0x10
+
+struct smb2_query_directory_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 33 */
+	__u8   FileInformationClass;
+	__u8   Flags;
+	__le32 FileIndex;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le16 FileNameOffset;
+	__le16 FileNameLength;
+	__le32 OutputBufferLength;
+	__u8   Buffer[1];
+} __packed;
+
+struct smb2_query_directory_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 9 */
+	__le16 OutputBufferOffset;
+	__le32 OutputBufferLength;
+	__u8   Buffer[1];
+} __packed;
+
+/* Possible InfoType values */
+#define SMB2_O_INFO_FILE	0x01
+#define SMB2_O_INFO_FILESYSTEM	0x02
+#define SMB2_O_INFO_SECURITY	0x03
+#define SMB2_O_INFO_QUOTA	0x04
+
+/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */
+#define OWNER_SECINFO   0x00000001
+#define GROUP_SECINFO   0x00000002
+#define DACL_SECINFO   0x00000004
+#define SACL_SECINFO   0x00000008
+#define LABEL_SECINFO   0x00000010
+#define ATTRIBUTE_SECINFO   0x00000020
+#define SCOPE_SECINFO   0x00000040
+#define BACKUP_SECINFO   0x00010000
+#define UNPROTECTED_SACL_SECINFO   0x10000000
+#define UNPROTECTED_DACL_SECINFO   0x20000000
+#define PROTECTED_SACL_SECINFO   0x40000000
+#define PROTECTED_DACL_SECINFO   0x80000000
+
+/* Flags used for FileFullEAinfo */
+#define SL_RESTART_SCAN		0x00000001
+#define SL_RETURN_SINGLE_ENTRY	0x00000002
+#define SL_INDEX_SPECIFIED	0x00000004
+
+struct smb2_query_info_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 41 */
+	__u8   InfoType;
+	__u8   FileInfoClass;
+	__le32 OutputBufferLength;
+	__le16 InputBufferOffset;
+	__u16  Reserved;
+	__le32 InputBufferLength;
+	__le32 AdditionalInformation;
+	__le32 Flags;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__u8   Buffer[1];
+} __packed;
+
+struct smb2_query_info_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 9 */
+	__le16 OutputBufferOffset;
+	__le32 OutputBufferLength;
+	__u8   Buffer[1];
+} __packed;
+
+struct smb2_set_info_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 33 */
+	__u8   InfoType;
+	__u8   FileInfoClass;
+	__le32 BufferLength;
+	__le16 BufferOffset;
+	__u16  Reserved;
+	__le32 AdditionalInformation;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__u8   Buffer[1];
+} __packed;
+
+struct smb2_set_info_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 2 */
+} __packed;
+
+struct smb2_oplock_break {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 24 */
+	__u8   OplockLevel;
+	__u8   Reserved;
+	__le32 Reserved2;
+	__u64  PersistentFid;
+	__u64  VolatileFid;
+} __packed;
+
+#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
+
+struct smb2_lease_break {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 44 */
+	__le16 Reserved;
+	__le32 Flags;
+	__u8   LeaseKey[16];
+	__le32 CurrentLeaseState;
+	__le32 NewLeaseState;
+	__le32 BreakReason;
+	__le32 AccessMaskHint;
+	__le32 ShareMaskHint;
+} __packed;
+
+struct smb2_lease_ack {
+	struct smb2_hdr hdr;
+	__le16 StructureSize; /* Must be 36 */
+	__le16 Reserved;
+	__le32 Flags;
+	__u8   LeaseKey[16];
+	__le32 LeaseState;
+	__le64 LeaseDuration;
+} __packed;
+
+/*
+ *	PDU infolevel structure definitions
+ *	BB consider moving to a different header
+ */
+
+/* File System Information Classes */
+#define FS_VOLUME_INFORMATION		1 /* Query */
+#define FS_LABEL_INFORMATION		2 /* Local only */
+#define FS_SIZE_INFORMATION		3 /* Query */
+#define FS_DEVICE_INFORMATION		4 /* Query */
+#define FS_ATTRIBUTE_INFORMATION	5 /* Query */
+#define FS_CONTROL_INFORMATION		6 /* Query, Set */
+#define FS_FULL_SIZE_INFORMATION	7 /* Query */
+#define FS_OBJECT_ID_INFORMATION	8 /* Query, Set */
+#define FS_DRIVER_PATH_INFORMATION	9 /* Local only */
+#define FS_VOLUME_FLAGS_INFORMATION	10 /* Local only */
+#define FS_SECTOR_SIZE_INFORMATION	11 /* SMB3 or later. Query */
+
+struct smb2_fs_full_size_info {
+	__le64 TotalAllocationUnits;
+	__le64 CallerAvailableAllocationUnits;
+	__le64 ActualAvailableAllocationUnits;
+	__le32 SectorsPerAllocationUnit;
+	__le32 BytesPerSector;
+} __packed;
+
+#define SSINFO_FLAGS_ALIGNED_DEVICE		0x00000001
+#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
+#define SSINFO_FLAGS_NO_SEEK_PENALTY		0x00000004
+#define SSINFO_FLAGS_TRIM_ENABLED		0x00000008
+
+/* sector size info struct */
+struct smb3_fs_ss_info {
+	__le32 LogicalBytesPerSector;
+	__le32 PhysicalBytesPerSectorForAtomicity;
+	__le32 PhysicalBytesPerSectorForPerf;
+	__le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity;
+	__le32 Flags;
+	__le32 ByteOffsetForSectorAlignment;
+	__le32 ByteOffsetForPartitionAlignment;
+} __packed;
+
+/* partial list of QUERY INFO levels */
+#define FILE_DIRECTORY_INFORMATION	1
+#define FILE_FULL_DIRECTORY_INFORMATION 2
+#define FILE_BOTH_DIRECTORY_INFORMATION 3
+#define FILE_BASIC_INFORMATION		4
+#define FILE_STANDARD_INFORMATION	5
+#define FILE_INTERNAL_INFORMATION	6
+#define FILE_EA_INFORMATION	        7
+#define FILE_ACCESS_INFORMATION		8
+#define FILE_NAME_INFORMATION		9
+#define FILE_RENAME_INFORMATION		10
+#define FILE_LINK_INFORMATION		11
+#define FILE_NAMES_INFORMATION		12
+#define FILE_DISPOSITION_INFORMATION	13
+#define FILE_POSITION_INFORMATION	14
+#define FILE_FULL_EA_INFORMATION	15
+#define FILE_MODE_INFORMATION		16
+#define FILE_ALIGNMENT_INFORMATION	17
+#define FILE_ALL_INFORMATION		18
+#define FILE_ALLOCATION_INFORMATION	19
+#define FILE_END_OF_FILE_INFORMATION	20
+#define FILE_ALTERNATE_NAME_INFORMATION 21
+#define FILE_STREAM_INFORMATION		22
+#define FILE_PIPE_INFORMATION		23
+#define FILE_PIPE_LOCAL_INFORMATION	24
+#define FILE_PIPE_REMOTE_INFORMATION	25
+#define FILE_MAILSLOT_QUERY_INFORMATION 26
+#define FILE_MAILSLOT_SET_INFORMATION	27
+#define FILE_COMPRESSION_INFORMATION	28
+#define FILE_OBJECT_ID_INFORMATION	29
+/* Number 30 not defined in documents */
+#define FILE_MOVE_CLUSTER_INFORMATION	31
+#define FILE_QUOTA_INFORMATION		32
+#define FILE_REPARSE_POINT_INFORMATION	33
+#define FILE_NETWORK_OPEN_INFORMATION	34
+#define FILE_ATTRIBUTE_TAG_INFORMATION	35
+#define FILE_TRACKING_INFORMATION	36
+#define FILEID_BOTH_DIRECTORY_INFORMATION 37
+#define FILEID_FULL_DIRECTORY_INFORMATION 38
+#define FILE_VALID_DATA_LENGTH_INFORMATION 39
+#define FILE_SHORT_NAME_INFORMATION	40
+#define FILE_SFIO_RESERVE_INFORMATION	44
+#define FILE_SFIO_VOLUME_INFORMATION	45
+#define FILE_HARD_LINK_INFORMATION	46
+#define FILE_NORMALIZED_NAME_INFORMATION 48
+#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
+#define FILE_STANDARD_LINK_INFORMATION	54
+
+struct smb2_file_internal_info {
+	__le64 IndexNumber;
+} __packed; /* level 6 Query */
+
+struct smb2_file_rename_info { /* encoding of request for level 10 */
+	__u8   ReplaceIfExists; /* 1 = replace existing target with new */
+				/* 0 = fail if target already exists */
+	__u8   Reserved[7];
+	__u64  RootDirectory;  /* MBZ for network operations (why says spec?) */
+	__le32 FileNameLength;
+	char   FileName[0];     /* New name to be assigned */
+} __packed; /* level 10 Set */
+
+struct smb2_file_link_info { /* encoding of request for level 11 */
+	__u8   ReplaceIfExists; /* 1 = replace existing link with new */
+				/* 0 = fail if link already exists */
+	__u8   Reserved[7];
+	__u64  RootDirectory;  /* MBZ for network operations (why says spec?) */
+	__le32 FileNameLength;
+	char   FileName[0];     /* Name to be assigned to new link */
+} __packed; /* level 11 Set */
+
+/*
+ * This level 18, although with struct with same name is different from cifs
+ * level 0x107. Level 0x107 has an extra u64 between AccessFlags and
+ * CurrentByteOffset.
+ */
+struct smb2_file_all_info { /* data block encoding of response to level 18 */
+	__le64 CreationTime;	/* Beginning of FILE_BASIC_INFO equivalent */
+	__le64 LastAccessTime;
+	__le64 LastWriteTime;
+	__le64 ChangeTime;
+	__le32 Attributes;
+	__u32  Pad1;		/* End of FILE_BASIC_INFO_INFO equivalent */
+	__le64 AllocationSize;	/* Beginning of FILE_STANDARD_INFO equivalent */
+	__le64 EndOfFile;	/* size ie offset to first free byte in file */
+	__le32 NumberOfLinks;	/* hard links */
+	__u8   DeletePending;
+	__u8   Directory;
+	__u16  Pad2;		/* End of FILE_STANDARD_INFO equivalent */
+	__le64 IndexNumber;
+	__le32 EASize;
+	__le32 AccessFlags;
+	__le64 CurrentByteOffset;
+	__le32 Mode;
+	__le32 AlignmentRequirement;
+	__le32 FileNameLength;
+	char   FileName[1];
+} __packed; /* level 18 Query */
+
+struct smb2_file_eof_info { /* encoding of request for level 10 */
+	__le64 EndOfFile; /* new end of file value */
+} __packed; /* level 20 Set */
+
+#endif				/* _SMB2PDU_H */
diff --git a/kernel/fs/cifs/smb2proto.h b/kernel/fs/cifs/smb2proto.h
new file mode 100644
index 000000000..79dc650c1
--- /dev/null
+++ b/kernel/fs/cifs/smb2proto.h
@@ -0,0 +1,174 @@
+/*
+ *   fs/cifs/smb2proto.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002, 2011
+ *                 Etersoft, 2012
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _SMB2PROTO_H
+#define _SMB2PROTO_H
+#include <linux/nls.h>
+#include <linux/key-type.h>
+
+struct statfs;
+struct smb_rqst;
+
+/*
+ *****************************************************************
+ * All Prototypes
+ *****************************************************************
+ */
+extern int map_smb2_to_linux_error(char *buf, bool log_err);
+extern int smb2_check_message(char *buf, unsigned int length);
+extern unsigned int smb2_calc_size(void *buf);
+extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
+extern __le16 *cifs_convert_path_to_utf16(const char *from,
+					  struct cifs_sb_info *cifs_sb);
+
+extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *);
+extern int smb2_check_receive(struct mid_q_entry *mid,
+			      struct TCP_Server_Info *server, bool log_error);
+extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
+			      struct smb_rqst *rqst);
+extern struct mid_q_entry *smb2_setup_async_request(
+			struct TCP_Server_Info *server, struct smb_rqst *rqst);
+extern int smb2_calc_signature(struct smb_rqst *rqst,
+				struct TCP_Server_Info *server);
+extern int smb3_calc_signature(struct smb_rqst *rqst,
+				struct TCP_Server_Info *server);
+extern void smb2_echo_request(struct work_struct *work);
+extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
+extern bool smb2_is_valid_oplock_break(char *buffer,
+				       struct TCP_Server_Info *srv);
+
+extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
+				   struct smb2_file_all_info *src);
+extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+				struct cifs_sb_info *cifs_sb,
+				const char *full_path, FILE_ALL_INFO *data,
+				bool *adjust_tz, bool *symlink);
+extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
+			      const char *full_path, __u64 size,
+			      struct cifs_sb_info *cifs_sb, bool set_alloc);
+extern int smb2_set_file_info(struct inode *inode, const char *full_path,
+			      FILE_BASIC_INFO *buf, const unsigned int xid);
+extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon,
+		      const char *name, struct cifs_sb_info *cifs_sb);
+extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
+			       struct cifs_sb_info *cifs_sb,
+			       struct cifs_tcon *tcon, const unsigned int xid);
+extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
+		      const char *name, struct cifs_sb_info *cifs_sb);
+extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon,
+		       const char *name, struct cifs_sb_info *cifs_sb);
+extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
+			    const char *from_name, const char *to_name,
+			    struct cifs_sb_info *cifs_sb);
+extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
+				const char *from_name, const char *to_name,
+				struct cifs_sb_info *cifs_sb);
+extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			struct cifs_sb_info *cifs_sb, const unsigned char *path,
+			char *pbuf, unsigned int *pbytes_written);
+extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			  struct cifs_sb_info *cifs_sb,
+			  const unsigned char *path, char *pbuf,
+			  unsigned int *pbytes_read);
+extern int smb2_open_file(const unsigned int xid,
+			  struct cifs_open_parms *oparms,
+			  __u32 *oplock, FILE_ALL_INFO *buf);
+extern int smb2_unlock_range(struct cifsFileInfo *cfile,
+			     struct file_lock *flock, const unsigned int xid);
+extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
+
+/*
+ * SMB2 Worker functions - most of protocol specific implementation details
+ * are contained within these calls.
+ */
+extern int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses);
+extern int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
+			   const struct nls_table *nls_cp);
+extern int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses);
+extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
+		     const char *tree, struct cifs_tcon *tcon,
+		     const struct nls_table *);
+extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
+extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
+		     __le16 *path, __u8 *oplock,
+		     struct smb2_file_all_info *buf,
+		     struct smb2_err_rsp **err_buf);
+extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid, u32 opcode,
+		     bool is_fsctl, char *in_data, u32 indatalen,
+		     char **out_data, u32 *plen /* returned data len */);
+extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
+		      u64 persistent_file_id, u64 volatile_file_id);
+extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
+		      u64 persistent_file_id, u64 volatile_file_id);
+extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+			   u64 persistent_file_id, u64 volatile_file_id,
+			   struct smb2_file_all_info *data);
+extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
+			    u64 persistent_fid, u64 volatile_fid,
+			    __le64 *uniqueid);
+extern int smb2_async_readv(struct cifs_readdata *rdata);
+extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
+		     unsigned int *nbytes, char **buf, int *buf_type);
+extern int smb2_async_writev(struct cifs_writedata *wdata,
+			     void (*release)(struct kref *kref));
+extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
+		      unsigned int *nbytes, struct kvec *iov, int n_vec);
+extern int SMB2_echo(struct TCP_Server_Info *server);
+extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
+				u64 persistent_fid, u64 volatile_fid, int index,
+				struct cifs_search_info *srch_inf);
+extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
+		       u64 persistent_fid, u64 volatile_fid,
+		       __le16 *target_file);
+extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
+			     u64 persistent_fid, u64 volatile_fid,
+			     __le16 *target_file);
+extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
+			u64 persistent_fid, u64 volatile_fid, u32 pid,
+			__le64 *eof, bool is_fallocate);
+extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+			 u64 persistent_fid, u64 volatile_fid,
+			 FILE_BASIC_INFO *buf);
+extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+				u64 persistent_fid, u64 volatile_fid);
+extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
+			     const u64 persistent_fid, const u64 volatile_fid,
+			     const __u8 oplock_level);
+extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
+			 u64 persistent_file_id, u64 volatile_file_id,
+			 struct kstatfs *FSData);
+extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+			 u64 persistent_file_id, u64 volatile_file_id, int lvl);
+extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
+		     const __u64 persist_fid, const __u64 volatile_fid,
+		     const __u32 pid, const __u64 length, const __u64 offset,
+		     const __u32 lockFlags, const bool wait);
+extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
+		      const __u64 persist_fid, const __u64 volatile_fid,
+		      const __u32 pid, const __u32 num_lock,
+		      struct smb2_lock_element *buf);
+extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
+			    __u8 *lease_key, const __le32 lease_state);
+extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
+
+#endif			/* _SMB2PROTO_H */
diff --git a/kernel/fs/cifs/smb2status.h b/kernel/fs/cifs/smb2status.h
new file mode 100644
index 000000000..3d5f62150
--- /dev/null
+++ b/kernel/fs/cifs/smb2status.h
@@ -0,0 +1,1782 @@
+/*
+ *   fs/cifs/smb2status.h
+ *
+ *   SMB2 Status code (network error) definitions
+ *   Definitions are from MS-ERREF
+ *
+ *   Copyright (c) International Business Machines  Corp., 2009,2011
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ *  0 1 2 3 4 5 6 7 8 9 0 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ *  SEV C N <-------Facility--------> <------Error Status Code------>
+ *
+ *  C is set if "customer defined" error, N bit is reserved and MBZ
+ */
+
+#define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000)
+#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001)
+#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002)
+#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003)
+
+struct ntstatus {
+	/* Facility is the high 12 bits of the following field */
+	__le32 Facility; /* low 2 bits Severity, next is Customer, then rsrvd */
+	__le32 Code;
+};
+
+#define STATUS_SUCCESS __constant_cpu_to_le32(0x00000000)
+#define STATUS_WAIT_0 __constant_cpu_to_le32(0x00000000)
+#define STATUS_WAIT_1 __constant_cpu_to_le32(0x00000001)
+#define STATUS_WAIT_2 __constant_cpu_to_le32(0x00000002)
+#define STATUS_WAIT_3 __constant_cpu_to_le32(0x00000003)
+#define STATUS_WAIT_63 __constant_cpu_to_le32(0x0000003F)
+#define STATUS_ABANDONED __constant_cpu_to_le32(0x00000080)
+#define STATUS_ABANDONED_WAIT_0 __constant_cpu_to_le32(0x00000080)
+#define STATUS_ABANDONED_WAIT_63 __constant_cpu_to_le32(0x000000BF)
+#define STATUS_USER_APC __constant_cpu_to_le32(0x000000C0)
+#define STATUS_KERNEL_APC __constant_cpu_to_le32(0x00000100)
+#define STATUS_ALERTED __constant_cpu_to_le32(0x00000101)
+#define STATUS_TIMEOUT __constant_cpu_to_le32(0x00000102)
+#define STATUS_PENDING __constant_cpu_to_le32(0x00000103)
+#define STATUS_REPARSE __constant_cpu_to_le32(0x00000104)
+#define STATUS_MORE_ENTRIES __constant_cpu_to_le32(0x00000105)
+#define STATUS_NOT_ALL_ASSIGNED __constant_cpu_to_le32(0x00000106)
+#define STATUS_SOME_NOT_MAPPED __constant_cpu_to_le32(0x00000107)
+#define STATUS_OPLOCK_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x00000108)
+#define STATUS_VOLUME_MOUNTED __constant_cpu_to_le32(0x00000109)
+#define STATUS_RXACT_COMMITTED __constant_cpu_to_le32(0x0000010A)
+#define STATUS_NOTIFY_CLEANUP __constant_cpu_to_le32(0x0000010B)
+#define STATUS_NOTIFY_ENUM_DIR __constant_cpu_to_le32(0x0000010C)
+#define STATUS_NO_QUOTAS_FOR_ACCOUNT __constant_cpu_to_le32(0x0000010D)
+#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED __constant_cpu_to_le32(0x0000010E)
+#define STATUS_PAGE_FAULT_TRANSITION __constant_cpu_to_le32(0x00000110)
+#define STATUS_PAGE_FAULT_DEMAND_ZERO __constant_cpu_to_le32(0x00000111)
+#define STATUS_PAGE_FAULT_COPY_ON_WRITE __constant_cpu_to_le32(0x00000112)
+#define STATUS_PAGE_FAULT_GUARD_PAGE __constant_cpu_to_le32(0x00000113)
+#define STATUS_PAGE_FAULT_PAGING_FILE __constant_cpu_to_le32(0x00000114)
+#define STATUS_CACHE_PAGE_LOCKED __constant_cpu_to_le32(0x00000115)
+#define STATUS_CRASH_DUMP __constant_cpu_to_le32(0x00000116)
+#define STATUS_BUFFER_ALL_ZEROS __constant_cpu_to_le32(0x00000117)
+#define STATUS_REPARSE_OBJECT __constant_cpu_to_le32(0x00000118)
+#define STATUS_RESOURCE_REQUIREMENTS_CHANGED __constant_cpu_to_le32(0x00000119)
+#define STATUS_TRANSLATION_COMPLETE __constant_cpu_to_le32(0x00000120)
+#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY __constant_cpu_to_le32(0x00000121)
+#define STATUS_NOTHING_TO_TERMINATE __constant_cpu_to_le32(0x00000122)
+#define STATUS_PROCESS_NOT_IN_JOB __constant_cpu_to_le32(0x00000123)
+#define STATUS_PROCESS_IN_JOB __constant_cpu_to_le32(0x00000124)
+#define STATUS_VOLSNAP_HIBERNATE_READY __constant_cpu_to_le32(0x00000125)
+#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY __constant_cpu_to_le32(0x00000126)
+#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED __constant_cpu_to_le32(0x00000127)
+#define STATUS_INTERRUPT_STILL_CONNECTED __constant_cpu_to_le32(0x00000128)
+#define STATUS_PROCESS_CLONED __constant_cpu_to_le32(0x00000129)
+#define STATUS_FILE_LOCKED_WITH_ONLY_READERS __constant_cpu_to_le32(0x0000012A)
+#define STATUS_FILE_LOCKED_WITH_WRITERS __constant_cpu_to_le32(0x0000012B)
+#define STATUS_RESOURCEMANAGER_READ_ONLY __constant_cpu_to_le32(0x00000202)
+#define STATUS_WAIT_FOR_OPLOCK __constant_cpu_to_le32(0x00000367)
+#define DBG_EXCEPTION_HANDLED __constant_cpu_to_le32(0x00010001)
+#define DBG_CONTINUE __constant_cpu_to_le32(0x00010002)
+#define STATUS_FLT_IO_COMPLETE __constant_cpu_to_le32(0x001C0001)
+#define STATUS_OBJECT_NAME_EXISTS __constant_cpu_to_le32(0x40000000)
+#define STATUS_THREAD_WAS_SUSPENDED __constant_cpu_to_le32(0x40000001)
+#define STATUS_WORKING_SET_LIMIT_RANGE __constant_cpu_to_le32(0x40000002)
+#define STATUS_IMAGE_NOT_AT_BASE __constant_cpu_to_le32(0x40000003)
+#define STATUS_RXACT_STATE_CREATED __constant_cpu_to_le32(0x40000004)
+#define STATUS_SEGMENT_NOTIFICATION __constant_cpu_to_le32(0x40000005)
+#define STATUS_LOCAL_USER_SESSION_KEY __constant_cpu_to_le32(0x40000006)
+#define STATUS_BAD_CURRENT_DIRECTORY __constant_cpu_to_le32(0x40000007)
+#define STATUS_SERIAL_MORE_WRITES __constant_cpu_to_le32(0x40000008)
+#define STATUS_REGISTRY_RECOVERED __constant_cpu_to_le32(0x40000009)
+#define STATUS_FT_READ_RECOVERY_FROM_BACKUP __constant_cpu_to_le32(0x4000000A)
+#define STATUS_FT_WRITE_RECOVERY __constant_cpu_to_le32(0x4000000B)
+#define STATUS_SERIAL_COUNTER_TIMEOUT __constant_cpu_to_le32(0x4000000C)
+#define STATUS_NULL_LM_PASSWORD __constant_cpu_to_le32(0x4000000D)
+#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH __constant_cpu_to_le32(0x4000000E)
+#define STATUS_RECEIVE_PARTIAL __constant_cpu_to_le32(0x4000000F)
+#define STATUS_RECEIVE_EXPEDITED __constant_cpu_to_le32(0x40000010)
+#define STATUS_RECEIVE_PARTIAL_EXPEDITED __constant_cpu_to_le32(0x40000011)
+#define STATUS_EVENT_DONE __constant_cpu_to_le32(0x40000012)
+#define STATUS_EVENT_PENDING __constant_cpu_to_le32(0x40000013)
+#define STATUS_CHECKING_FILE_SYSTEM __constant_cpu_to_le32(0x40000014)
+#define STATUS_FATAL_APP_EXIT __constant_cpu_to_le32(0x40000015)
+#define STATUS_PREDEFINED_HANDLE __constant_cpu_to_le32(0x40000016)
+#define STATUS_WAS_UNLOCKED __constant_cpu_to_le32(0x40000017)
+#define STATUS_SERVICE_NOTIFICATION __constant_cpu_to_le32(0x40000018)
+#define STATUS_WAS_LOCKED __constant_cpu_to_le32(0x40000019)
+#define STATUS_LOG_HARD_ERROR __constant_cpu_to_le32(0x4000001A)
+#define STATUS_ALREADY_WIN32 __constant_cpu_to_le32(0x4000001B)
+#define STATUS_WX86_UNSIMULATE __constant_cpu_to_le32(0x4000001C)
+#define STATUS_WX86_CONTINUE __constant_cpu_to_le32(0x4000001D)
+#define STATUS_WX86_SINGLE_STEP __constant_cpu_to_le32(0x4000001E)
+#define STATUS_WX86_BREAKPOINT __constant_cpu_to_le32(0x4000001F)
+#define STATUS_WX86_EXCEPTION_CONTINUE __constant_cpu_to_le32(0x40000020)
+#define STATUS_WX86_EXCEPTION_LASTCHANCE __constant_cpu_to_le32(0x40000021)
+#define STATUS_WX86_EXCEPTION_CHAIN __constant_cpu_to_le32(0x40000022)
+#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE __constant_cpu_to_le32(0x40000023)
+#define STATUS_NO_YIELD_PERFORMED __constant_cpu_to_le32(0x40000024)
+#define STATUS_TIMER_RESUME_IGNORED __constant_cpu_to_le32(0x40000025)
+#define STATUS_ARBITRATION_UNHANDLED __constant_cpu_to_le32(0x40000026)
+#define STATUS_CARDBUS_NOT_SUPPORTED __constant_cpu_to_le32(0x40000027)
+#define STATUS_WX86_CREATEWX86TIB __constant_cpu_to_le32(0x40000028)
+#define STATUS_MP_PROCESSOR_MISMATCH __constant_cpu_to_le32(0x40000029)
+#define STATUS_HIBERNATED __constant_cpu_to_le32(0x4000002A)
+#define STATUS_RESUME_HIBERNATION __constant_cpu_to_le32(0x4000002B)
+#define STATUS_FIRMWARE_UPDATED __constant_cpu_to_le32(0x4000002C)
+#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES __constant_cpu_to_le32(0x4000002D)
+#define STATUS_MESSAGE_RETRIEVED __constant_cpu_to_le32(0x4000002E)
+#define STATUS_SYSTEM_POWERSTATE_TRANSITION __constant_cpu_to_le32(0x4000002F)
+#define STATUS_ALPC_CHECK_COMPLETION_LIST __constant_cpu_to_le32(0x40000030)
+#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION __constant_cpu_to_le32(0x40000031)
+#define STATUS_ACCESS_AUDIT_BY_POLICY __constant_cpu_to_le32(0x40000032)
+#define STATUS_ABANDON_HIBERFILE __constant_cpu_to_le32(0x40000033)
+#define STATUS_BIZRULES_NOT_ENABLED __constant_cpu_to_le32(0x40000034)
+#define STATUS_WAKE_SYSTEM __constant_cpu_to_le32(0x40000294)
+#define STATUS_DS_SHUTTING_DOWN __constant_cpu_to_le32(0x40000370)
+#define DBG_REPLY_LATER __constant_cpu_to_le32(0x40010001)
+#define DBG_UNABLE_TO_PROVIDE_HANDLE __constant_cpu_to_le32(0x40010002)
+#define DBG_TERMINATE_THREAD __constant_cpu_to_le32(0x40010003)
+#define DBG_TERMINATE_PROCESS __constant_cpu_to_le32(0x40010004)
+#define DBG_CONTROL_C __constant_cpu_to_le32(0x40010005)
+#define DBG_PRINTEXCEPTION_C __constant_cpu_to_le32(0x40010006)
+#define DBG_RIPEXCEPTION __constant_cpu_to_le32(0x40010007)
+#define DBG_CONTROL_BREAK __constant_cpu_to_le32(0x40010008)
+#define DBG_COMMAND_EXCEPTION __constant_cpu_to_le32(0x40010009)
+#define RPC_NT_UUID_LOCAL_ONLY __constant_cpu_to_le32(0x40020056)
+#define RPC_NT_SEND_INCOMPLETE __constant_cpu_to_le32(0x400200AF)
+#define STATUS_CTX_CDM_CONNECT __constant_cpu_to_le32(0x400A0004)
+#define STATUS_CTX_CDM_DISCONNECT __constant_cpu_to_le32(0x400A0005)
+#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT __constant_cpu_to_le32(0x4015000D)
+#define STATUS_RECOVERY_NOT_NEEDED __constant_cpu_to_le32(0x40190034)
+#define STATUS_RM_ALREADY_STARTED __constant_cpu_to_le32(0x40190035)
+#define STATUS_LOG_NO_RESTART __constant_cpu_to_le32(0x401A000C)
+#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST __constant_cpu_to_le32(0x401B00EC)
+#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED __constant_cpu_to_le32(0x401E000A)
+#define STATUS_GRAPHICS_DRIVER_MISMATCH __constant_cpu_to_le32(0x401E0117)
+#define STATUS_GRAPHICS_MODE_NOT_PINNED __constant_cpu_to_le32(0x401E0307)
+#define STATUS_GRAPHICS_NO_PREFERRED_MODE __constant_cpu_to_le32(0x401E031E)
+#define STATUS_GRAPHICS_DATASET_IS_EMPTY __constant_cpu_to_le32(0x401E034B)
+#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET __constant_cpu_to_le32(0x401E034C)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED __constant_cpu_to_le32(0x401E0351)
+#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS __constant_cpu_to_le32(0x401E042F)
+#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED __constant_cpu_to_le32(0x401E0437)
+#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY __constant_cpu_to_le32(0x401E0439)
+#define STATUS_GRAPHICS_START_DEFERRED __constant_cpu_to_le32(0x401E043A)
+#define STATUS_NDIS_INDICATION_REQUIRED __constant_cpu_to_le32(0x40230001)
+#define STATUS_GUARD_PAGE_VIOLATION __constant_cpu_to_le32(0x80000001)
+#define STATUS_DATATYPE_MISALIGNMENT __constant_cpu_to_le32(0x80000002)
+#define STATUS_BREAKPOINT __constant_cpu_to_le32(0x80000003)
+#define STATUS_SINGLE_STEP __constant_cpu_to_le32(0x80000004)
+#define STATUS_BUFFER_OVERFLOW __constant_cpu_to_le32(0x80000005)
+#define STATUS_NO_MORE_FILES __constant_cpu_to_le32(0x80000006)
+#define STATUS_WAKE_SYSTEM_DEBUGGER __constant_cpu_to_le32(0x80000007)
+#define STATUS_HANDLES_CLOSED __constant_cpu_to_le32(0x8000000A)
+#define STATUS_NO_INHERITANCE __constant_cpu_to_le32(0x8000000B)
+#define STATUS_GUID_SUBSTITUTION_MADE __constant_cpu_to_le32(0x8000000C)
+#define STATUS_PARTIAL_COPY __constant_cpu_to_le32(0x8000000D)
+#define STATUS_DEVICE_PAPER_EMPTY __constant_cpu_to_le32(0x8000000E)
+#define STATUS_DEVICE_POWERED_OFF __constant_cpu_to_le32(0x8000000F)
+#define STATUS_DEVICE_OFF_LINE __constant_cpu_to_le32(0x80000010)
+#define STATUS_DEVICE_BUSY __constant_cpu_to_le32(0x80000011)
+#define STATUS_NO_MORE_EAS __constant_cpu_to_le32(0x80000012)
+#define STATUS_INVALID_EA_NAME __constant_cpu_to_le32(0x80000013)
+#define STATUS_EA_LIST_INCONSISTENT __constant_cpu_to_le32(0x80000014)
+#define STATUS_INVALID_EA_FLAG __constant_cpu_to_le32(0x80000015)
+#define STATUS_VERIFY_REQUIRED __constant_cpu_to_le32(0x80000016)
+#define STATUS_EXTRANEOUS_INFORMATION __constant_cpu_to_le32(0x80000017)
+#define STATUS_RXACT_COMMIT_NECESSARY __constant_cpu_to_le32(0x80000018)
+#define STATUS_NO_MORE_ENTRIES __constant_cpu_to_le32(0x8000001A)
+#define STATUS_FILEMARK_DETECTED __constant_cpu_to_le32(0x8000001B)
+#define STATUS_MEDIA_CHANGED __constant_cpu_to_le32(0x8000001C)
+#define STATUS_BUS_RESET __constant_cpu_to_le32(0x8000001D)
+#define STATUS_END_OF_MEDIA __constant_cpu_to_le32(0x8000001E)
+#define STATUS_BEGINNING_OF_MEDIA __constant_cpu_to_le32(0x8000001F)
+#define STATUS_MEDIA_CHECK __constant_cpu_to_le32(0x80000020)
+#define STATUS_SETMARK_DETECTED __constant_cpu_to_le32(0x80000021)
+#define STATUS_NO_DATA_DETECTED __constant_cpu_to_le32(0x80000022)
+#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES __constant_cpu_to_le32(0x80000023)
+#define STATUS_SERVER_HAS_OPEN_HANDLES __constant_cpu_to_le32(0x80000024)
+#define STATUS_ALREADY_DISCONNECTED __constant_cpu_to_le32(0x80000025)
+#define STATUS_LONGJUMP __constant_cpu_to_le32(0x80000026)
+#define STATUS_CLEANER_CARTRIDGE_INSTALLED __constant_cpu_to_le32(0x80000027)
+#define STATUS_PLUGPLAY_QUERY_VETOED __constant_cpu_to_le32(0x80000028)
+#define STATUS_UNWIND_CONSOLIDATE __constant_cpu_to_le32(0x80000029)
+#define STATUS_REGISTRY_HIVE_RECOVERED __constant_cpu_to_le32(0x8000002A)
+#define STATUS_DLL_MIGHT_BE_INSECURE __constant_cpu_to_le32(0x8000002B)
+#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE __constant_cpu_to_le32(0x8000002C)
+#define STATUS_STOPPED_ON_SYMLINK __constant_cpu_to_le32(0x8000002D)
+#define STATUS_DEVICE_REQUIRES_CLEANING __constant_cpu_to_le32(0x80000288)
+#define STATUS_DEVICE_DOOR_OPEN __constant_cpu_to_le32(0x80000289)
+#define STATUS_DATA_LOST_REPAIR __constant_cpu_to_le32(0x80000803)
+#define DBG_EXCEPTION_NOT_HANDLED __constant_cpu_to_le32(0x80010001)
+#define STATUS_CLUSTER_NODE_ALREADY_UP __constant_cpu_to_le32(0x80130001)
+#define STATUS_CLUSTER_NODE_ALREADY_DOWN __constant_cpu_to_le32(0x80130002)
+#define STATUS_CLUSTER_NETWORK_ALREADY_ONLINE __constant_cpu_to_le32(0x80130003)
+#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE __constant_cpu_to_le32(0x80130004)
+#define STATUS_CLUSTER_NODE_ALREADY_MEMBER __constant_cpu_to_le32(0x80130005)
+#define STATUS_COULD_NOT_RESIZE_LOG __constant_cpu_to_le32(0x80190009)
+#define STATUS_NO_TXF_METADATA __constant_cpu_to_le32(0x80190029)
+#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN __constant_cpu_to_le32(0x80190031)
+#define STATUS_TXF_METADATA_ALREADY_PRESENT __constant_cpu_to_le32(0x80190041)
+#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET __constant_cpu_to_le32(0x80190042)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED __constant_cpu_to_le32(0x801B00EB)
+#define STATUS_FLT_BUFFER_TOO_SMALL __constant_cpu_to_le32(0x801C0001)
+#define STATUS_FVE_PARTIAL_METADATA __constant_cpu_to_le32(0x80210001)
+#define STATUS_UNSUCCESSFUL __constant_cpu_to_le32(0xC0000001)
+#define STATUS_NOT_IMPLEMENTED __constant_cpu_to_le32(0xC0000002)
+#define STATUS_INVALID_INFO_CLASS __constant_cpu_to_le32(0xC0000003)
+#define STATUS_INFO_LENGTH_MISMATCH __constant_cpu_to_le32(0xC0000004)
+#define STATUS_ACCESS_VIOLATION __constant_cpu_to_le32(0xC0000005)
+#define STATUS_IN_PAGE_ERROR __constant_cpu_to_le32(0xC0000006)
+#define STATUS_PAGEFILE_QUOTA __constant_cpu_to_le32(0xC0000007)
+#define STATUS_INVALID_HANDLE __constant_cpu_to_le32(0xC0000008)
+#define STATUS_BAD_INITIAL_STACK __constant_cpu_to_le32(0xC0000009)
+#define STATUS_BAD_INITIAL_PC __constant_cpu_to_le32(0xC000000A)
+#define STATUS_INVALID_CID __constant_cpu_to_le32(0xC000000B)
+#define STATUS_TIMER_NOT_CANCELED __constant_cpu_to_le32(0xC000000C)
+#define STATUS_INVALID_PARAMETER __constant_cpu_to_le32(0xC000000D)
+#define STATUS_NO_SUCH_DEVICE __constant_cpu_to_le32(0xC000000E)
+#define STATUS_NO_SUCH_FILE __constant_cpu_to_le32(0xC000000F)
+#define STATUS_INVALID_DEVICE_REQUEST __constant_cpu_to_le32(0xC0000010)
+#define STATUS_END_OF_FILE __constant_cpu_to_le32(0xC0000011)
+#define STATUS_WRONG_VOLUME __constant_cpu_to_le32(0xC0000012)
+#define STATUS_NO_MEDIA_IN_DEVICE __constant_cpu_to_le32(0xC0000013)
+#define STATUS_UNRECOGNIZED_MEDIA __constant_cpu_to_le32(0xC0000014)
+#define STATUS_NONEXISTENT_SECTOR __constant_cpu_to_le32(0xC0000015)
+#define STATUS_MORE_PROCESSING_REQUIRED __constant_cpu_to_le32(0xC0000016)
+#define STATUS_NO_MEMORY __constant_cpu_to_le32(0xC0000017)
+#define STATUS_CONFLICTING_ADDRESSES __constant_cpu_to_le32(0xC0000018)
+#define STATUS_NOT_MAPPED_VIEW __constant_cpu_to_le32(0xC0000019)
+#define STATUS_UNABLE_TO_FREE_VM __constant_cpu_to_le32(0xC000001A)
+#define STATUS_UNABLE_TO_DELETE_SECTION __constant_cpu_to_le32(0xC000001B)
+#define STATUS_INVALID_SYSTEM_SERVICE __constant_cpu_to_le32(0xC000001C)
+#define STATUS_ILLEGAL_INSTRUCTION __constant_cpu_to_le32(0xC000001D)
+#define STATUS_INVALID_LOCK_SEQUENCE __constant_cpu_to_le32(0xC000001E)
+#define STATUS_INVALID_VIEW_SIZE __constant_cpu_to_le32(0xC000001F)
+#define STATUS_INVALID_FILE_FOR_SECTION __constant_cpu_to_le32(0xC0000020)
+#define STATUS_ALREADY_COMMITTED __constant_cpu_to_le32(0xC0000021)
+#define STATUS_ACCESS_DENIED __constant_cpu_to_le32(0xC0000022)
+#define STATUS_BUFFER_TOO_SMALL __constant_cpu_to_le32(0xC0000023)
+#define STATUS_OBJECT_TYPE_MISMATCH __constant_cpu_to_le32(0xC0000024)
+#define STATUS_NONCONTINUABLE_EXCEPTION __constant_cpu_to_le32(0xC0000025)
+#define STATUS_INVALID_DISPOSITION __constant_cpu_to_le32(0xC0000026)
+#define STATUS_UNWIND __constant_cpu_to_le32(0xC0000027)
+#define STATUS_BAD_STACK __constant_cpu_to_le32(0xC0000028)
+#define STATUS_INVALID_UNWIND_TARGET __constant_cpu_to_le32(0xC0000029)
+#define STATUS_NOT_LOCKED __constant_cpu_to_le32(0xC000002A)
+#define STATUS_PARITY_ERROR __constant_cpu_to_le32(0xC000002B)
+#define STATUS_UNABLE_TO_DECOMMIT_VM __constant_cpu_to_le32(0xC000002C)
+#define STATUS_NOT_COMMITTED __constant_cpu_to_le32(0xC000002D)
+#define STATUS_INVALID_PORT_ATTRIBUTES __constant_cpu_to_le32(0xC000002E)
+#define STATUS_PORT_MESSAGE_TOO_LONG __constant_cpu_to_le32(0xC000002F)
+#define STATUS_INVALID_PARAMETER_MIX __constant_cpu_to_le32(0xC0000030)
+#define STATUS_INVALID_QUOTA_LOWER __constant_cpu_to_le32(0xC0000031)
+#define STATUS_DISK_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000032)
+#define STATUS_OBJECT_NAME_INVALID __constant_cpu_to_le32(0xC0000033)
+#define STATUS_OBJECT_NAME_NOT_FOUND __constant_cpu_to_le32(0xC0000034)
+#define STATUS_OBJECT_NAME_COLLISION __constant_cpu_to_le32(0xC0000035)
+#define STATUS_PORT_DISCONNECTED __constant_cpu_to_le32(0xC0000037)
+#define STATUS_DEVICE_ALREADY_ATTACHED __constant_cpu_to_le32(0xC0000038)
+#define STATUS_OBJECT_PATH_INVALID __constant_cpu_to_le32(0xC0000039)
+#define STATUS_OBJECT_PATH_NOT_FOUND __constant_cpu_to_le32(0xC000003A)
+#define STATUS_OBJECT_PATH_SYNTAX_BAD __constant_cpu_to_le32(0xC000003B)
+#define STATUS_DATA_OVERRUN __constant_cpu_to_le32(0xC000003C)
+#define STATUS_DATA_LATE_ERROR __constant_cpu_to_le32(0xC000003D)
+#define STATUS_DATA_ERROR __constant_cpu_to_le32(0xC000003E)
+#define STATUS_CRC_ERROR __constant_cpu_to_le32(0xC000003F)
+#define STATUS_SECTION_TOO_BIG __constant_cpu_to_le32(0xC0000040)
+#define STATUS_PORT_CONNECTION_REFUSED __constant_cpu_to_le32(0xC0000041)
+#define STATUS_INVALID_PORT_HANDLE __constant_cpu_to_le32(0xC0000042)
+#define STATUS_SHARING_VIOLATION __constant_cpu_to_le32(0xC0000043)
+#define STATUS_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000044)
+#define STATUS_INVALID_PAGE_PROTECTION __constant_cpu_to_le32(0xC0000045)
+#define STATUS_MUTANT_NOT_OWNED __constant_cpu_to_le32(0xC0000046)
+#define STATUS_SEMAPHORE_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC0000047)
+#define STATUS_PORT_ALREADY_SET __constant_cpu_to_le32(0xC0000048)
+#define STATUS_SECTION_NOT_IMAGE __constant_cpu_to_le32(0xC0000049)
+#define STATUS_SUSPEND_COUNT_EXCEEDED __constant_cpu_to_le32(0xC000004A)
+#define STATUS_THREAD_IS_TERMINATING __constant_cpu_to_le32(0xC000004B)
+#define STATUS_BAD_WORKING_SET_LIMIT __constant_cpu_to_le32(0xC000004C)
+#define STATUS_INCOMPATIBLE_FILE_MAP __constant_cpu_to_le32(0xC000004D)
+#define STATUS_SECTION_PROTECTION __constant_cpu_to_le32(0xC000004E)
+#define STATUS_EAS_NOT_SUPPORTED __constant_cpu_to_le32(0xC000004F)
+#define STATUS_EA_TOO_LARGE __constant_cpu_to_le32(0xC0000050)
+#define STATUS_NONEXISTENT_EA_ENTRY __constant_cpu_to_le32(0xC0000051)
+#define STATUS_NO_EAS_ON_FILE __constant_cpu_to_le32(0xC0000052)
+#define STATUS_EA_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000053)
+#define STATUS_FILE_LOCK_CONFLICT __constant_cpu_to_le32(0xC0000054)
+#define STATUS_LOCK_NOT_GRANTED __constant_cpu_to_le32(0xC0000055)
+#define STATUS_DELETE_PENDING __constant_cpu_to_le32(0xC0000056)
+#define STATUS_CTL_FILE_NOT_SUPPORTED __constant_cpu_to_le32(0xC0000057)
+#define STATUS_UNKNOWN_REVISION __constant_cpu_to_le32(0xC0000058)
+#define STATUS_REVISION_MISMATCH __constant_cpu_to_le32(0xC0000059)
+#define STATUS_INVALID_OWNER __constant_cpu_to_le32(0xC000005A)
+#define STATUS_INVALID_PRIMARY_GROUP __constant_cpu_to_le32(0xC000005B)
+#define STATUS_NO_IMPERSONATION_TOKEN __constant_cpu_to_le32(0xC000005C)
+#define STATUS_CANT_DISABLE_MANDATORY __constant_cpu_to_le32(0xC000005D)
+#define STATUS_NO_LOGON_SERVERS __constant_cpu_to_le32(0xC000005E)
+#define STATUS_NO_SUCH_LOGON_SESSION __constant_cpu_to_le32(0xC000005F)
+#define STATUS_NO_SUCH_PRIVILEGE __constant_cpu_to_le32(0xC0000060)
+#define STATUS_PRIVILEGE_NOT_HELD __constant_cpu_to_le32(0xC0000061)
+#define STATUS_INVALID_ACCOUNT_NAME __constant_cpu_to_le32(0xC0000062)
+#define STATUS_USER_EXISTS __constant_cpu_to_le32(0xC0000063)
+#define STATUS_NO_SUCH_USER __constant_cpu_to_le32(0xC0000064)
+#define STATUS_GROUP_EXISTS __constant_cpu_to_le32(0xC0000065)
+#define STATUS_NO_SUCH_GROUP __constant_cpu_to_le32(0xC0000066)
+#define STATUS_MEMBER_IN_GROUP __constant_cpu_to_le32(0xC0000067)
+#define STATUS_MEMBER_NOT_IN_GROUP __constant_cpu_to_le32(0xC0000068)
+#define STATUS_LAST_ADMIN __constant_cpu_to_le32(0xC0000069)
+#define STATUS_WRONG_PASSWORD __constant_cpu_to_le32(0xC000006A)
+#define STATUS_ILL_FORMED_PASSWORD __constant_cpu_to_le32(0xC000006B)
+#define STATUS_PASSWORD_RESTRICTION __constant_cpu_to_le32(0xC000006C)
+#define STATUS_LOGON_FAILURE __constant_cpu_to_le32(0xC000006D)
+#define STATUS_ACCOUNT_RESTRICTION __constant_cpu_to_le32(0xC000006E)
+#define STATUS_INVALID_LOGON_HOURS __constant_cpu_to_le32(0xC000006F)
+#define STATUS_INVALID_WORKSTATION __constant_cpu_to_le32(0xC0000070)
+#define STATUS_PASSWORD_EXPIRED __constant_cpu_to_le32(0xC0000071)
+#define STATUS_ACCOUNT_DISABLED __constant_cpu_to_le32(0xC0000072)
+#define STATUS_NONE_MAPPED __constant_cpu_to_le32(0xC0000073)
+#define STATUS_TOO_MANY_LUIDS_REQUESTED __constant_cpu_to_le32(0xC0000074)
+#define STATUS_LUIDS_EXHAUSTED __constant_cpu_to_le32(0xC0000075)
+#define STATUS_INVALID_SUB_AUTHORITY __constant_cpu_to_le32(0xC0000076)
+#define STATUS_INVALID_ACL __constant_cpu_to_le32(0xC0000077)
+#define STATUS_INVALID_SID __constant_cpu_to_le32(0xC0000078)
+#define STATUS_INVALID_SECURITY_DESCR __constant_cpu_to_le32(0xC0000079)
+#define STATUS_PROCEDURE_NOT_FOUND __constant_cpu_to_le32(0xC000007A)
+#define STATUS_INVALID_IMAGE_FORMAT __constant_cpu_to_le32(0xC000007B)
+#define STATUS_NO_TOKEN __constant_cpu_to_le32(0xC000007C)
+#define STATUS_BAD_INHERITANCE_ACL __constant_cpu_to_le32(0xC000007D)
+#define STATUS_RANGE_NOT_LOCKED __constant_cpu_to_le32(0xC000007E)
+#define STATUS_DISK_FULL __constant_cpu_to_le32(0xC000007F)
+#define STATUS_SERVER_DISABLED __constant_cpu_to_le32(0xC0000080)
+#define STATUS_SERVER_NOT_DISABLED __constant_cpu_to_le32(0xC0000081)
+#define STATUS_TOO_MANY_GUIDS_REQUESTED __constant_cpu_to_le32(0xC0000082)
+#define STATUS_GUIDS_EXHAUSTED __constant_cpu_to_le32(0xC0000083)
+#define STATUS_INVALID_ID_AUTHORITY __constant_cpu_to_le32(0xC0000084)
+#define STATUS_AGENTS_EXHAUSTED __constant_cpu_to_le32(0xC0000085)
+#define STATUS_INVALID_VOLUME_LABEL __constant_cpu_to_le32(0xC0000086)
+#define STATUS_SECTION_NOT_EXTENDED __constant_cpu_to_le32(0xC0000087)
+#define STATUS_NOT_MAPPED_DATA __constant_cpu_to_le32(0xC0000088)
+#define STATUS_RESOURCE_DATA_NOT_FOUND __constant_cpu_to_le32(0xC0000089)
+#define STATUS_RESOURCE_TYPE_NOT_FOUND __constant_cpu_to_le32(0xC000008A)
+#define STATUS_RESOURCE_NAME_NOT_FOUND __constant_cpu_to_le32(0xC000008B)
+#define STATUS_ARRAY_BOUNDS_EXCEEDED __constant_cpu_to_le32(0xC000008C)
+#define STATUS_FLOAT_DENORMAL_OPERAND __constant_cpu_to_le32(0xC000008D)
+#define STATUS_FLOAT_DIVIDE_BY_ZERO __constant_cpu_to_le32(0xC000008E)
+#define STATUS_FLOAT_INEXACT_RESULT __constant_cpu_to_le32(0xC000008F)
+#define STATUS_FLOAT_INVALID_OPERATION __constant_cpu_to_le32(0xC0000090)
+#define STATUS_FLOAT_OVERFLOW __constant_cpu_to_le32(0xC0000091)
+#define STATUS_FLOAT_STACK_CHECK __constant_cpu_to_le32(0xC0000092)
+#define STATUS_FLOAT_UNDERFLOW __constant_cpu_to_le32(0xC0000093)
+#define STATUS_INTEGER_DIVIDE_BY_ZERO __constant_cpu_to_le32(0xC0000094)
+#define STATUS_INTEGER_OVERFLOW __constant_cpu_to_le32(0xC0000095)
+#define STATUS_PRIVILEGED_INSTRUCTION __constant_cpu_to_le32(0xC0000096)
+#define STATUS_TOO_MANY_PAGING_FILES __constant_cpu_to_le32(0xC0000097)
+#define STATUS_FILE_INVALID __constant_cpu_to_le32(0xC0000098)
+#define STATUS_ALLOTTED_SPACE_EXCEEDED __constant_cpu_to_le32(0xC0000099)
+#define STATUS_INSUFFICIENT_RESOURCES __constant_cpu_to_le32(0xC000009A)
+#define STATUS_DFS_EXIT_PATH_FOUND __constant_cpu_to_le32(0xC000009B)
+#define STATUS_DEVICE_DATA_ERROR __constant_cpu_to_le32(0xC000009C)
+#define STATUS_DEVICE_NOT_CONNECTED __constant_cpu_to_le32(0xC000009D)
+#define STATUS_DEVICE_POWER_FAILURE __constant_cpu_to_le32(0xC000009E)
+#define STATUS_FREE_VM_NOT_AT_BASE __constant_cpu_to_le32(0xC000009F)
+#define STATUS_MEMORY_NOT_ALLOCATED __constant_cpu_to_le32(0xC00000A0)
+#define STATUS_WORKING_SET_QUOTA __constant_cpu_to_le32(0xC00000A1)
+#define STATUS_MEDIA_WRITE_PROTECTED __constant_cpu_to_le32(0xC00000A2)
+#define STATUS_DEVICE_NOT_READY __constant_cpu_to_le32(0xC00000A3)
+#define STATUS_INVALID_GROUP_ATTRIBUTES __constant_cpu_to_le32(0xC00000A4)
+#define STATUS_BAD_IMPERSONATION_LEVEL __constant_cpu_to_le32(0xC00000A5)
+#define STATUS_CANT_OPEN_ANONYMOUS __constant_cpu_to_le32(0xC00000A6)
+#define STATUS_BAD_VALIDATION_CLASS __constant_cpu_to_le32(0xC00000A7)
+#define STATUS_BAD_TOKEN_TYPE __constant_cpu_to_le32(0xC00000A8)
+#define STATUS_BAD_MASTER_BOOT_RECORD __constant_cpu_to_le32(0xC00000A9)
+#define STATUS_INSTRUCTION_MISALIGNMENT __constant_cpu_to_le32(0xC00000AA)
+#define STATUS_INSTANCE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00000AB)
+#define STATUS_PIPE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00000AC)
+#define STATUS_INVALID_PIPE_STATE __constant_cpu_to_le32(0xC00000AD)
+#define STATUS_PIPE_BUSY __constant_cpu_to_le32(0xC00000AE)
+#define STATUS_ILLEGAL_FUNCTION __constant_cpu_to_le32(0xC00000AF)
+#define STATUS_PIPE_DISCONNECTED __constant_cpu_to_le32(0xC00000B0)
+#define STATUS_PIPE_CLOSING __constant_cpu_to_le32(0xC00000B1)
+#define STATUS_PIPE_CONNECTED __constant_cpu_to_le32(0xC00000B2)
+#define STATUS_PIPE_LISTENING __constant_cpu_to_le32(0xC00000B3)
+#define STATUS_INVALID_READ_MODE __constant_cpu_to_le32(0xC00000B4)
+#define STATUS_IO_TIMEOUT __constant_cpu_to_le32(0xC00000B5)
+#define STATUS_FILE_FORCED_CLOSED __constant_cpu_to_le32(0xC00000B6)
+#define STATUS_PROFILING_NOT_STARTED __constant_cpu_to_le32(0xC00000B7)
+#define STATUS_PROFILING_NOT_STOPPED __constant_cpu_to_le32(0xC00000B8)
+#define STATUS_COULD_NOT_INTERPRET __constant_cpu_to_le32(0xC00000B9)
+#define STATUS_FILE_IS_A_DIRECTORY __constant_cpu_to_le32(0xC00000BA)
+#define STATUS_NOT_SUPPORTED __constant_cpu_to_le32(0xC00000BB)
+#define STATUS_REMOTE_NOT_LISTENING __constant_cpu_to_le32(0xC00000BC)
+#define STATUS_DUPLICATE_NAME __constant_cpu_to_le32(0xC00000BD)
+#define STATUS_BAD_NETWORK_PATH __constant_cpu_to_le32(0xC00000BE)
+#define STATUS_NETWORK_BUSY __constant_cpu_to_le32(0xC00000BF)
+#define STATUS_DEVICE_DOES_NOT_EXIST __constant_cpu_to_le32(0xC00000C0)
+#define STATUS_TOO_MANY_COMMANDS __constant_cpu_to_le32(0xC00000C1)
+#define STATUS_ADAPTER_HARDWARE_ERROR __constant_cpu_to_le32(0xC00000C2)
+#define STATUS_INVALID_NETWORK_RESPONSE __constant_cpu_to_le32(0xC00000C3)
+#define STATUS_UNEXPECTED_NETWORK_ERROR __constant_cpu_to_le32(0xC00000C4)
+#define STATUS_BAD_REMOTE_ADAPTER __constant_cpu_to_le32(0xC00000C5)
+#define STATUS_PRINT_QUEUE_FULL __constant_cpu_to_le32(0xC00000C6)
+#define STATUS_NO_SPOOL_SPACE __constant_cpu_to_le32(0xC00000C7)
+#define STATUS_PRINT_CANCELLED __constant_cpu_to_le32(0xC00000C8)
+#define STATUS_NETWORK_NAME_DELETED __constant_cpu_to_le32(0xC00000C9)
+#define STATUS_NETWORK_ACCESS_DENIED __constant_cpu_to_le32(0xC00000CA)
+#define STATUS_BAD_DEVICE_TYPE __constant_cpu_to_le32(0xC00000CB)
+#define STATUS_BAD_NETWORK_NAME __constant_cpu_to_le32(0xC00000CC)
+#define STATUS_TOO_MANY_NAMES __constant_cpu_to_le32(0xC00000CD)
+#define STATUS_TOO_MANY_SESSIONS __constant_cpu_to_le32(0xC00000CE)
+#define STATUS_SHARING_PAUSED __constant_cpu_to_le32(0xC00000CF)
+#define STATUS_REQUEST_NOT_ACCEPTED __constant_cpu_to_le32(0xC00000D0)
+#define STATUS_REDIRECTOR_PAUSED __constant_cpu_to_le32(0xC00000D1)
+#define STATUS_NET_WRITE_FAULT __constant_cpu_to_le32(0xC00000D2)
+#define STATUS_PROFILING_AT_LIMIT __constant_cpu_to_le32(0xC00000D3)
+#define STATUS_NOT_SAME_DEVICE __constant_cpu_to_le32(0xC00000D4)
+#define STATUS_FILE_RENAMED __constant_cpu_to_le32(0xC00000D5)
+#define STATUS_VIRTUAL_CIRCUIT_CLOSED __constant_cpu_to_le32(0xC00000D6)
+#define STATUS_NO_SECURITY_ON_OBJECT __constant_cpu_to_le32(0xC00000D7)
+#define STATUS_CANT_WAIT __constant_cpu_to_le32(0xC00000D8)
+#define STATUS_PIPE_EMPTY __constant_cpu_to_le32(0xC00000D9)
+#define STATUS_CANT_ACCESS_DOMAIN_INFO __constant_cpu_to_le32(0xC00000DA)
+#define STATUS_CANT_TERMINATE_SELF __constant_cpu_to_le32(0xC00000DB)
+#define STATUS_INVALID_SERVER_STATE __constant_cpu_to_le32(0xC00000DC)
+#define STATUS_INVALID_DOMAIN_STATE __constant_cpu_to_le32(0xC00000DD)
+#define STATUS_INVALID_DOMAIN_ROLE __constant_cpu_to_le32(0xC00000DE)
+#define STATUS_NO_SUCH_DOMAIN __constant_cpu_to_le32(0xC00000DF)
+#define STATUS_DOMAIN_EXISTS __constant_cpu_to_le32(0xC00000E0)
+#define STATUS_DOMAIN_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC00000E1)
+#define STATUS_OPLOCK_NOT_GRANTED __constant_cpu_to_le32(0xC00000E2)
+#define STATUS_INVALID_OPLOCK_PROTOCOL __constant_cpu_to_le32(0xC00000E3)
+#define STATUS_INTERNAL_DB_CORRUPTION __constant_cpu_to_le32(0xC00000E4)
+#define STATUS_INTERNAL_ERROR __constant_cpu_to_le32(0xC00000E5)
+#define STATUS_GENERIC_NOT_MAPPED __constant_cpu_to_le32(0xC00000E6)
+#define STATUS_BAD_DESCRIPTOR_FORMAT __constant_cpu_to_le32(0xC00000E7)
+#define STATUS_INVALID_USER_BUFFER __constant_cpu_to_le32(0xC00000E8)
+#define STATUS_UNEXPECTED_IO_ERROR __constant_cpu_to_le32(0xC00000E9)
+#define STATUS_UNEXPECTED_MM_CREATE_ERR __constant_cpu_to_le32(0xC00000EA)
+#define STATUS_UNEXPECTED_MM_MAP_ERROR __constant_cpu_to_le32(0xC00000EB)
+#define STATUS_UNEXPECTED_MM_EXTEND_ERR __constant_cpu_to_le32(0xC00000EC)
+#define STATUS_NOT_LOGON_PROCESS __constant_cpu_to_le32(0xC00000ED)
+#define STATUS_LOGON_SESSION_EXISTS __constant_cpu_to_le32(0xC00000EE)
+#define STATUS_INVALID_PARAMETER_1 __constant_cpu_to_le32(0xC00000EF)
+#define STATUS_INVALID_PARAMETER_2 __constant_cpu_to_le32(0xC00000F0)
+#define STATUS_INVALID_PARAMETER_3 __constant_cpu_to_le32(0xC00000F1)
+#define STATUS_INVALID_PARAMETER_4 __constant_cpu_to_le32(0xC00000F2)
+#define STATUS_INVALID_PARAMETER_5 __constant_cpu_to_le32(0xC00000F3)
+#define STATUS_INVALID_PARAMETER_6 __constant_cpu_to_le32(0xC00000F4)
+#define STATUS_INVALID_PARAMETER_7 __constant_cpu_to_le32(0xC00000F5)
+#define STATUS_INVALID_PARAMETER_8 __constant_cpu_to_le32(0xC00000F6)
+#define STATUS_INVALID_PARAMETER_9 __constant_cpu_to_le32(0xC00000F7)
+#define STATUS_INVALID_PARAMETER_10 __constant_cpu_to_le32(0xC00000F8)
+#define STATUS_INVALID_PARAMETER_11 __constant_cpu_to_le32(0xC00000F9)
+#define STATUS_INVALID_PARAMETER_12 __constant_cpu_to_le32(0xC00000FA)
+#define STATUS_REDIRECTOR_NOT_STARTED __constant_cpu_to_le32(0xC00000FB)
+#define STATUS_REDIRECTOR_STARTED __constant_cpu_to_le32(0xC00000FC)
+#define STATUS_STACK_OVERFLOW __constant_cpu_to_le32(0xC00000FD)
+#define STATUS_NO_SUCH_PACKAGE __constant_cpu_to_le32(0xC00000FE)
+#define STATUS_BAD_FUNCTION_TABLE __constant_cpu_to_le32(0xC00000FF)
+#define STATUS_VARIABLE_NOT_FOUND __constant_cpu_to_le32(0xC0000100)
+#define STATUS_DIRECTORY_NOT_EMPTY __constant_cpu_to_le32(0xC0000101)
+#define STATUS_FILE_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000102)
+#define STATUS_NOT_A_DIRECTORY __constant_cpu_to_le32(0xC0000103)
+#define STATUS_BAD_LOGON_SESSION_STATE __constant_cpu_to_le32(0xC0000104)
+#define STATUS_LOGON_SESSION_COLLISION __constant_cpu_to_le32(0xC0000105)
+#define STATUS_NAME_TOO_LONG __constant_cpu_to_le32(0xC0000106)
+#define STATUS_FILES_OPEN __constant_cpu_to_le32(0xC0000107)
+#define STATUS_CONNECTION_IN_USE __constant_cpu_to_le32(0xC0000108)
+#define STATUS_MESSAGE_NOT_FOUND __constant_cpu_to_le32(0xC0000109)
+#define STATUS_PROCESS_IS_TERMINATING __constant_cpu_to_le32(0xC000010A)
+#define STATUS_INVALID_LOGON_TYPE __constant_cpu_to_le32(0xC000010B)
+#define STATUS_NO_GUID_TRANSLATION __constant_cpu_to_le32(0xC000010C)
+#define STATUS_CANNOT_IMPERSONATE __constant_cpu_to_le32(0xC000010D)
+#define STATUS_IMAGE_ALREADY_LOADED __constant_cpu_to_le32(0xC000010E)
+#define STATUS_ABIOS_NOT_PRESENT __constant_cpu_to_le32(0xC000010F)
+#define STATUS_ABIOS_LID_NOT_EXIST __constant_cpu_to_le32(0xC0000110)
+#define STATUS_ABIOS_LID_ALREADY_OWNED __constant_cpu_to_le32(0xC0000111)
+#define STATUS_ABIOS_NOT_LID_OWNER __constant_cpu_to_le32(0xC0000112)
+#define STATUS_ABIOS_INVALID_COMMAND __constant_cpu_to_le32(0xC0000113)
+#define STATUS_ABIOS_INVALID_LID __constant_cpu_to_le32(0xC0000114)
+#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE __constant_cpu_to_le32(0xC0000115)
+#define STATUS_ABIOS_INVALID_SELECTOR __constant_cpu_to_le32(0xC0000116)
+#define STATUS_NO_LDT __constant_cpu_to_le32(0xC0000117)
+#define STATUS_INVALID_LDT_SIZE __constant_cpu_to_le32(0xC0000118)
+#define STATUS_INVALID_LDT_OFFSET __constant_cpu_to_le32(0xC0000119)
+#define STATUS_INVALID_LDT_DESCRIPTOR __constant_cpu_to_le32(0xC000011A)
+#define STATUS_INVALID_IMAGE_NE_FORMAT __constant_cpu_to_le32(0xC000011B)
+#define STATUS_RXACT_INVALID_STATE __constant_cpu_to_le32(0xC000011C)
+#define STATUS_RXACT_COMMIT_FAILURE __constant_cpu_to_le32(0xC000011D)
+#define STATUS_MAPPED_FILE_SIZE_ZERO __constant_cpu_to_le32(0xC000011E)
+#define STATUS_TOO_MANY_OPENED_FILES __constant_cpu_to_le32(0xC000011F)
+#define STATUS_CANCELLED __constant_cpu_to_le32(0xC0000120)
+#define STATUS_CANNOT_DELETE __constant_cpu_to_le32(0xC0000121)
+#define STATUS_INVALID_COMPUTER_NAME __constant_cpu_to_le32(0xC0000122)
+#define STATUS_FILE_DELETED __constant_cpu_to_le32(0xC0000123)
+#define STATUS_SPECIAL_ACCOUNT __constant_cpu_to_le32(0xC0000124)
+#define STATUS_SPECIAL_GROUP __constant_cpu_to_le32(0xC0000125)
+#define STATUS_SPECIAL_USER __constant_cpu_to_le32(0xC0000126)
+#define STATUS_MEMBERS_PRIMARY_GROUP __constant_cpu_to_le32(0xC0000127)
+#define STATUS_FILE_CLOSED __constant_cpu_to_le32(0xC0000128)
+#define STATUS_TOO_MANY_THREADS __constant_cpu_to_le32(0xC0000129)
+#define STATUS_THREAD_NOT_IN_PROCESS __constant_cpu_to_le32(0xC000012A)
+#define STATUS_TOKEN_ALREADY_IN_USE __constant_cpu_to_le32(0xC000012B)
+#define STATUS_PAGEFILE_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC000012C)
+#define STATUS_COMMITMENT_LIMIT __constant_cpu_to_le32(0xC000012D)
+#define STATUS_INVALID_IMAGE_LE_FORMAT __constant_cpu_to_le32(0xC000012E)
+#define STATUS_INVALID_IMAGE_NOT_MZ __constant_cpu_to_le32(0xC000012F)
+#define STATUS_INVALID_IMAGE_PROTECT __constant_cpu_to_le32(0xC0000130)
+#define STATUS_INVALID_IMAGE_WIN_16 __constant_cpu_to_le32(0xC0000131)
+#define STATUS_LOGON_SERVER_CONFLICT __constant_cpu_to_le32(0xC0000132)
+#define STATUS_TIME_DIFFERENCE_AT_DC __constant_cpu_to_le32(0xC0000133)
+#define STATUS_SYNCHRONIZATION_REQUIRED __constant_cpu_to_le32(0xC0000134)
+#define STATUS_DLL_NOT_FOUND __constant_cpu_to_le32(0xC0000135)
+#define STATUS_OPEN_FAILED __constant_cpu_to_le32(0xC0000136)
+#define STATUS_IO_PRIVILEGE_FAILED __constant_cpu_to_le32(0xC0000137)
+#define STATUS_ORDINAL_NOT_FOUND __constant_cpu_to_le32(0xC0000138)
+#define STATUS_ENTRYPOINT_NOT_FOUND __constant_cpu_to_le32(0xC0000139)
+#define STATUS_CONTROL_C_EXIT __constant_cpu_to_le32(0xC000013A)
+#define STATUS_LOCAL_DISCONNECT __constant_cpu_to_le32(0xC000013B)
+#define STATUS_REMOTE_DISCONNECT __constant_cpu_to_le32(0xC000013C)
+#define STATUS_REMOTE_RESOURCES __constant_cpu_to_le32(0xC000013D)
+#define STATUS_LINK_FAILED __constant_cpu_to_le32(0xC000013E)
+#define STATUS_LINK_TIMEOUT __constant_cpu_to_le32(0xC000013F)
+#define STATUS_INVALID_CONNECTION __constant_cpu_to_le32(0xC0000140)
+#define STATUS_INVALID_ADDRESS __constant_cpu_to_le32(0xC0000141)
+#define STATUS_DLL_INIT_FAILED __constant_cpu_to_le32(0xC0000142)
+#define STATUS_MISSING_SYSTEMFILE __constant_cpu_to_le32(0xC0000143)
+#define STATUS_UNHANDLED_EXCEPTION __constant_cpu_to_le32(0xC0000144)
+#define STATUS_APP_INIT_FAILURE __constant_cpu_to_le32(0xC0000145)
+#define STATUS_PAGEFILE_CREATE_FAILED __constant_cpu_to_le32(0xC0000146)
+#define STATUS_NO_PAGEFILE __constant_cpu_to_le32(0xC0000147)
+#define STATUS_INVALID_LEVEL __constant_cpu_to_le32(0xC0000148)
+#define STATUS_WRONG_PASSWORD_CORE __constant_cpu_to_le32(0xC0000149)
+#define STATUS_ILLEGAL_FLOAT_CONTEXT __constant_cpu_to_le32(0xC000014A)
+#define STATUS_PIPE_BROKEN __constant_cpu_to_le32(0xC000014B)
+#define STATUS_REGISTRY_CORRUPT __constant_cpu_to_le32(0xC000014C)
+#define STATUS_REGISTRY_IO_FAILED __constant_cpu_to_le32(0xC000014D)
+#define STATUS_NO_EVENT_PAIR __constant_cpu_to_le32(0xC000014E)
+#define STATUS_UNRECOGNIZED_VOLUME __constant_cpu_to_le32(0xC000014F)
+#define STATUS_SERIAL_NO_DEVICE_INITED __constant_cpu_to_le32(0xC0000150)
+#define STATUS_NO_SUCH_ALIAS __constant_cpu_to_le32(0xC0000151)
+#define STATUS_MEMBER_NOT_IN_ALIAS __constant_cpu_to_le32(0xC0000152)
+#define STATUS_MEMBER_IN_ALIAS __constant_cpu_to_le32(0xC0000153)
+#define STATUS_ALIAS_EXISTS __constant_cpu_to_le32(0xC0000154)
+#define STATUS_LOGON_NOT_GRANTED __constant_cpu_to_le32(0xC0000155)
+#define STATUS_TOO_MANY_SECRETS __constant_cpu_to_le32(0xC0000156)
+#define STATUS_SECRET_TOO_LONG __constant_cpu_to_le32(0xC0000157)
+#define STATUS_INTERNAL_DB_ERROR __constant_cpu_to_le32(0xC0000158)
+#define STATUS_FULLSCREEN_MODE __constant_cpu_to_le32(0xC0000159)
+#define STATUS_TOO_MANY_CONTEXT_IDS __constant_cpu_to_le32(0xC000015A)
+#define STATUS_LOGON_TYPE_NOT_GRANTED __constant_cpu_to_le32(0xC000015B)
+#define STATUS_NOT_REGISTRY_FILE __constant_cpu_to_le32(0xC000015C)
+#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED __constant_cpu_to_le32(0xC000015D)
+#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR __constant_cpu_to_le32(0xC000015E)
+#define STATUS_FT_MISSING_MEMBER __constant_cpu_to_le32(0xC000015F)
+#define STATUS_ILL_FORMED_SERVICE_ENTRY __constant_cpu_to_le32(0xC0000160)
+#define STATUS_ILLEGAL_CHARACTER __constant_cpu_to_le32(0xC0000161)
+#define STATUS_UNMAPPABLE_CHARACTER __constant_cpu_to_le32(0xC0000162)
+#define STATUS_UNDEFINED_CHARACTER __constant_cpu_to_le32(0xC0000163)
+#define STATUS_FLOPPY_VOLUME __constant_cpu_to_le32(0xC0000164)
+#define STATUS_FLOPPY_ID_MARK_NOT_FOUND __constant_cpu_to_le32(0xC0000165)
+#define STATUS_FLOPPY_WRONG_CYLINDER __constant_cpu_to_le32(0xC0000166)
+#define STATUS_FLOPPY_UNKNOWN_ERROR __constant_cpu_to_le32(0xC0000167)
+#define STATUS_FLOPPY_BAD_REGISTERS __constant_cpu_to_le32(0xC0000168)
+#define STATUS_DISK_RECALIBRATE_FAILED __constant_cpu_to_le32(0xC0000169)
+#define STATUS_DISK_OPERATION_FAILED __constant_cpu_to_le32(0xC000016A)
+#define STATUS_DISK_RESET_FAILED __constant_cpu_to_le32(0xC000016B)
+#define STATUS_SHARED_IRQ_BUSY __constant_cpu_to_le32(0xC000016C)
+#define STATUS_FT_ORPHANING __constant_cpu_to_le32(0xC000016D)
+#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT __constant_cpu_to_le32(0xC000016E)
+#define STATUS_PARTITION_FAILURE __constant_cpu_to_le32(0xC0000172)
+#define STATUS_INVALID_BLOCK_LENGTH __constant_cpu_to_le32(0xC0000173)
+#define STATUS_DEVICE_NOT_PARTITIONED __constant_cpu_to_le32(0xC0000174)
+#define STATUS_UNABLE_TO_LOCK_MEDIA __constant_cpu_to_le32(0xC0000175)
+#define STATUS_UNABLE_TO_UNLOAD_MEDIA __constant_cpu_to_le32(0xC0000176)
+#define STATUS_EOM_OVERFLOW __constant_cpu_to_le32(0xC0000177)
+#define STATUS_NO_MEDIA __constant_cpu_to_le32(0xC0000178)
+#define STATUS_NO_SUCH_MEMBER __constant_cpu_to_le32(0xC000017A)
+#define STATUS_INVALID_MEMBER __constant_cpu_to_le32(0xC000017B)
+#define STATUS_KEY_DELETED __constant_cpu_to_le32(0xC000017C)
+#define STATUS_NO_LOG_SPACE __constant_cpu_to_le32(0xC000017D)
+#define STATUS_TOO_MANY_SIDS __constant_cpu_to_le32(0xC000017E)
+#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED __constant_cpu_to_le32(0xC000017F)
+#define STATUS_KEY_HAS_CHILDREN __constant_cpu_to_le32(0xC0000180)
+#define STATUS_CHILD_MUST_BE_VOLATILE __constant_cpu_to_le32(0xC0000181)
+#define STATUS_DEVICE_CONFIGURATION_ERROR __constant_cpu_to_le32(0xC0000182)
+#define STATUS_DRIVER_INTERNAL_ERROR __constant_cpu_to_le32(0xC0000183)
+#define STATUS_INVALID_DEVICE_STATE __constant_cpu_to_le32(0xC0000184)
+#define STATUS_IO_DEVICE_ERROR __constant_cpu_to_le32(0xC0000185)
+#define STATUS_DEVICE_PROTOCOL_ERROR __constant_cpu_to_le32(0xC0000186)
+#define STATUS_BACKUP_CONTROLLER __constant_cpu_to_le32(0xC0000187)
+#define STATUS_LOG_FILE_FULL __constant_cpu_to_le32(0xC0000188)
+#define STATUS_TOO_LATE __constant_cpu_to_le32(0xC0000189)
+#define STATUS_NO_TRUST_LSA_SECRET __constant_cpu_to_le32(0xC000018A)
+#define STATUS_NO_TRUST_SAM_ACCOUNT __constant_cpu_to_le32(0xC000018B)
+#define STATUS_TRUSTED_DOMAIN_FAILURE __constant_cpu_to_le32(0xC000018C)
+#define STATUS_TRUSTED_RELATIONSHIP_FAILURE __constant_cpu_to_le32(0xC000018D)
+#define STATUS_EVENTLOG_FILE_CORRUPT __constant_cpu_to_le32(0xC000018E)
+#define STATUS_EVENTLOG_CANT_START __constant_cpu_to_le32(0xC000018F)
+#define STATUS_TRUST_FAILURE __constant_cpu_to_le32(0xC0000190)
+#define STATUS_MUTANT_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC0000191)
+#define STATUS_NETLOGON_NOT_STARTED __constant_cpu_to_le32(0xC0000192)
+#define STATUS_ACCOUNT_EXPIRED __constant_cpu_to_le32(0xC0000193)
+#define STATUS_POSSIBLE_DEADLOCK __constant_cpu_to_le32(0xC0000194)
+#define STATUS_NETWORK_CREDENTIAL_CONFLICT __constant_cpu_to_le32(0xC0000195)
+#define STATUS_REMOTE_SESSION_LIMIT __constant_cpu_to_le32(0xC0000196)
+#define STATUS_EVENTLOG_FILE_CHANGED __constant_cpu_to_le32(0xC0000197)
+#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT __constant_cpu_to_le32(0xC0000198)
+#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT __constant_cpu_to_le32(0xC0000199)
+#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT __constant_cpu_to_le32(0xC000019A)
+#define STATUS_DOMAIN_TRUST_INCONSISTENT __constant_cpu_to_le32(0xC000019B)
+#define STATUS_FS_DRIVER_REQUIRED __constant_cpu_to_le32(0xC000019C)
+#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL __constant_cpu_to_le32(0xC000019D)
+#define STATUS_NETWORK_OPEN_RESTRICTION __constant_cpu_to_le32(0xC0000201)
+#define STATUS_NO_USER_SESSION_KEY __constant_cpu_to_le32(0xC0000202)
+#define STATUS_USER_SESSION_DELETED __constant_cpu_to_le32(0xC0000203)
+#define STATUS_RESOURCE_LANG_NOT_FOUND __constant_cpu_to_le32(0xC0000204)
+#define STATUS_INSUFF_SERVER_RESOURCES __constant_cpu_to_le32(0xC0000205)
+#define STATUS_INVALID_BUFFER_SIZE __constant_cpu_to_le32(0xC0000206)
+#define STATUS_INVALID_ADDRESS_COMPONENT __constant_cpu_to_le32(0xC0000207)
+#define STATUS_INVALID_ADDRESS_WILDCARD __constant_cpu_to_le32(0xC0000208)
+#define STATUS_TOO_MANY_ADDRESSES __constant_cpu_to_le32(0xC0000209)
+#define STATUS_ADDRESS_ALREADY_EXISTS __constant_cpu_to_le32(0xC000020A)
+#define STATUS_ADDRESS_CLOSED __constant_cpu_to_le32(0xC000020B)
+#define STATUS_CONNECTION_DISCONNECTED __constant_cpu_to_le32(0xC000020C)
+#define STATUS_CONNECTION_RESET __constant_cpu_to_le32(0xC000020D)
+#define STATUS_TOO_MANY_NODES __constant_cpu_to_le32(0xC000020E)
+#define STATUS_TRANSACTION_ABORTED __constant_cpu_to_le32(0xC000020F)
+#define STATUS_TRANSACTION_TIMED_OUT __constant_cpu_to_le32(0xC0000210)
+#define STATUS_TRANSACTION_NO_RELEASE __constant_cpu_to_le32(0xC0000211)
+#define STATUS_TRANSACTION_NO_MATCH __constant_cpu_to_le32(0xC0000212)
+#define STATUS_TRANSACTION_RESPONDED __constant_cpu_to_le32(0xC0000213)
+#define STATUS_TRANSACTION_INVALID_ID __constant_cpu_to_le32(0xC0000214)
+#define STATUS_TRANSACTION_INVALID_TYPE __constant_cpu_to_le32(0xC0000215)
+#define STATUS_NOT_SERVER_SESSION __constant_cpu_to_le32(0xC0000216)
+#define STATUS_NOT_CLIENT_SESSION __constant_cpu_to_le32(0xC0000217)
+#define STATUS_CANNOT_LOAD_REGISTRY_FILE __constant_cpu_to_le32(0xC0000218)
+#define STATUS_DEBUG_ATTACH_FAILED __constant_cpu_to_le32(0xC0000219)
+#define STATUS_SYSTEM_PROCESS_TERMINATED __constant_cpu_to_le32(0xC000021A)
+#define STATUS_DATA_NOT_ACCEPTED __constant_cpu_to_le32(0xC000021B)
+#define STATUS_NO_BROWSER_SERVERS_FOUND __constant_cpu_to_le32(0xC000021C)
+#define STATUS_VDM_HARD_ERROR __constant_cpu_to_le32(0xC000021D)
+#define STATUS_DRIVER_CANCEL_TIMEOUT __constant_cpu_to_le32(0xC000021E)
+#define STATUS_REPLY_MESSAGE_MISMATCH __constant_cpu_to_le32(0xC000021F)
+#define STATUS_MAPPED_ALIGNMENT __constant_cpu_to_le32(0xC0000220)
+#define STATUS_IMAGE_CHECKSUM_MISMATCH __constant_cpu_to_le32(0xC0000221)
+#define STATUS_LOST_WRITEBEHIND_DATA __constant_cpu_to_le32(0xC0000222)
+#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID __constant_cpu_to_le32(0xC0000223)
+#define STATUS_PASSWORD_MUST_CHANGE __constant_cpu_to_le32(0xC0000224)
+#define STATUS_NOT_FOUND __constant_cpu_to_le32(0xC0000225)
+#define STATUS_NOT_TINY_STREAM __constant_cpu_to_le32(0xC0000226)
+#define STATUS_RECOVERY_FAILURE __constant_cpu_to_le32(0xC0000227)
+#define STATUS_STACK_OVERFLOW_READ __constant_cpu_to_le32(0xC0000228)
+#define STATUS_FAIL_CHECK __constant_cpu_to_le32(0xC0000229)
+#define STATUS_DUPLICATE_OBJECTID __constant_cpu_to_le32(0xC000022A)
+#define STATUS_OBJECTID_EXISTS __constant_cpu_to_le32(0xC000022B)
+#define STATUS_CONVERT_TO_LARGE __constant_cpu_to_le32(0xC000022C)
+#define STATUS_RETRY __constant_cpu_to_le32(0xC000022D)
+#define STATUS_FOUND_OUT_OF_SCOPE __constant_cpu_to_le32(0xC000022E)
+#define STATUS_ALLOCATE_BUCKET __constant_cpu_to_le32(0xC000022F)
+#define STATUS_PROPSET_NOT_FOUND __constant_cpu_to_le32(0xC0000230)
+#define STATUS_MARSHALL_OVERFLOW __constant_cpu_to_le32(0xC0000231)
+#define STATUS_INVALID_VARIANT __constant_cpu_to_le32(0xC0000232)
+#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND __constant_cpu_to_le32(0xC0000233)
+#define STATUS_ACCOUNT_LOCKED_OUT __constant_cpu_to_le32(0xC0000234)
+#define STATUS_HANDLE_NOT_CLOSABLE __constant_cpu_to_le32(0xC0000235)
+#define STATUS_CONNECTION_REFUSED __constant_cpu_to_le32(0xC0000236)
+#define STATUS_GRACEFUL_DISCONNECT __constant_cpu_to_le32(0xC0000237)
+#define STATUS_ADDRESS_ALREADY_ASSOCIATED __constant_cpu_to_le32(0xC0000238)
+#define STATUS_ADDRESS_NOT_ASSOCIATED __constant_cpu_to_le32(0xC0000239)
+#define STATUS_CONNECTION_INVALID __constant_cpu_to_le32(0xC000023A)
+#define STATUS_CONNECTION_ACTIVE __constant_cpu_to_le32(0xC000023B)
+#define STATUS_NETWORK_UNREACHABLE __constant_cpu_to_le32(0xC000023C)
+#define STATUS_HOST_UNREACHABLE __constant_cpu_to_le32(0xC000023D)
+#define STATUS_PROTOCOL_UNREACHABLE __constant_cpu_to_le32(0xC000023E)
+#define STATUS_PORT_UNREACHABLE __constant_cpu_to_le32(0xC000023F)
+#define STATUS_REQUEST_ABORTED __constant_cpu_to_le32(0xC0000240)
+#define STATUS_CONNECTION_ABORTED __constant_cpu_to_le32(0xC0000241)
+#define STATUS_BAD_COMPRESSION_BUFFER __constant_cpu_to_le32(0xC0000242)
+#define STATUS_USER_MAPPED_FILE __constant_cpu_to_le32(0xC0000243)
+#define STATUS_AUDIT_FAILED __constant_cpu_to_le32(0xC0000244)
+#define STATUS_TIMER_RESOLUTION_NOT_SET __constant_cpu_to_le32(0xC0000245)
+#define STATUS_CONNECTION_COUNT_LIMIT __constant_cpu_to_le32(0xC0000246)
+#define STATUS_LOGIN_TIME_RESTRICTION __constant_cpu_to_le32(0xC0000247)
+#define STATUS_LOGIN_WKSTA_RESTRICTION __constant_cpu_to_le32(0xC0000248)
+#define STATUS_IMAGE_MP_UP_MISMATCH __constant_cpu_to_le32(0xC0000249)
+#define STATUS_INSUFFICIENT_LOGON_INFO __constant_cpu_to_le32(0xC0000250)
+#define STATUS_BAD_DLL_ENTRYPOINT __constant_cpu_to_le32(0xC0000251)
+#define STATUS_BAD_SERVICE_ENTRYPOINT __constant_cpu_to_le32(0xC0000252)
+#define STATUS_LPC_REPLY_LOST __constant_cpu_to_le32(0xC0000253)
+#define STATUS_IP_ADDRESS_CONFLICT1 __constant_cpu_to_le32(0xC0000254)
+#define STATUS_IP_ADDRESS_CONFLICT2 __constant_cpu_to_le32(0xC0000255)
+#define STATUS_REGISTRY_QUOTA_LIMIT __constant_cpu_to_le32(0xC0000256)
+#define STATUS_PATH_NOT_COVERED __constant_cpu_to_le32(0xC0000257)
+#define STATUS_NO_CALLBACK_ACTIVE __constant_cpu_to_le32(0xC0000258)
+#define STATUS_LICENSE_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000259)
+#define STATUS_PWD_TOO_SHORT __constant_cpu_to_le32(0xC000025A)
+#define STATUS_PWD_TOO_RECENT __constant_cpu_to_le32(0xC000025B)
+#define STATUS_PWD_HISTORY_CONFLICT __constant_cpu_to_le32(0xC000025C)
+#define STATUS_PLUGPLAY_NO_DEVICE __constant_cpu_to_le32(0xC000025E)
+#define STATUS_UNSUPPORTED_COMPRESSION __constant_cpu_to_le32(0xC000025F)
+#define STATUS_INVALID_HW_PROFILE __constant_cpu_to_le32(0xC0000260)
+#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH __constant_cpu_to_le32(0xC0000261)
+#define STATUS_DRIVER_ORDINAL_NOT_FOUND __constant_cpu_to_le32(0xC0000262)
+#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND __constant_cpu_to_le32(0xC0000263)
+#define STATUS_RESOURCE_NOT_OWNED __constant_cpu_to_le32(0xC0000264)
+#define STATUS_TOO_MANY_LINKS __constant_cpu_to_le32(0xC0000265)
+#define STATUS_QUOTA_LIST_INCONSISTENT __constant_cpu_to_le32(0xC0000266)
+#define STATUS_FILE_IS_OFFLINE __constant_cpu_to_le32(0xC0000267)
+#define STATUS_EVALUATION_EXPIRATION __constant_cpu_to_le32(0xC0000268)
+#define STATUS_ILLEGAL_DLL_RELOCATION __constant_cpu_to_le32(0xC0000269)
+#define STATUS_LICENSE_VIOLATION __constant_cpu_to_le32(0xC000026A)
+#define STATUS_DLL_INIT_FAILED_LOGOFF __constant_cpu_to_le32(0xC000026B)
+#define STATUS_DRIVER_UNABLE_TO_LOAD __constant_cpu_to_le32(0xC000026C)
+#define STATUS_DFS_UNAVAILABLE __constant_cpu_to_le32(0xC000026D)
+#define STATUS_VOLUME_DISMOUNTED __constant_cpu_to_le32(0xC000026E)
+#define STATUS_WX86_INTERNAL_ERROR __constant_cpu_to_le32(0xC000026F)
+#define STATUS_WX86_FLOAT_STACK_CHECK __constant_cpu_to_le32(0xC0000270)
+#define STATUS_VALIDATE_CONTINUE __constant_cpu_to_le32(0xC0000271)
+#define STATUS_NO_MATCH __constant_cpu_to_le32(0xC0000272)
+#define STATUS_NO_MORE_MATCHES __constant_cpu_to_le32(0xC0000273)
+#define STATUS_NOT_A_REPARSE_POINT __constant_cpu_to_le32(0xC0000275)
+#define STATUS_IO_REPARSE_TAG_INVALID __constant_cpu_to_le32(0xC0000276)
+#define STATUS_IO_REPARSE_TAG_MISMATCH __constant_cpu_to_le32(0xC0000277)
+#define STATUS_IO_REPARSE_DATA_INVALID __constant_cpu_to_le32(0xC0000278)
+#define STATUS_IO_REPARSE_TAG_NOT_HANDLED __constant_cpu_to_le32(0xC0000279)
+#define STATUS_REPARSE_POINT_NOT_RESOLVED __constant_cpu_to_le32(0xC0000280)
+#define STATUS_DIRECTORY_IS_A_REPARSE_POINT __constant_cpu_to_le32(0xC0000281)
+#define STATUS_RANGE_LIST_CONFLICT __constant_cpu_to_le32(0xC0000282)
+#define STATUS_SOURCE_ELEMENT_EMPTY __constant_cpu_to_le32(0xC0000283)
+#define STATUS_DESTINATION_ELEMENT_FULL __constant_cpu_to_le32(0xC0000284)
+#define STATUS_ILLEGAL_ELEMENT_ADDRESS __constant_cpu_to_le32(0xC0000285)
+#define STATUS_MAGAZINE_NOT_PRESENT __constant_cpu_to_le32(0xC0000286)
+#define STATUS_REINITIALIZATION_NEEDED __constant_cpu_to_le32(0xC0000287)
+#define STATUS_ENCRYPTION_FAILED __constant_cpu_to_le32(0xC000028A)
+#define STATUS_DECRYPTION_FAILED __constant_cpu_to_le32(0xC000028B)
+#define STATUS_RANGE_NOT_FOUND __constant_cpu_to_le32(0xC000028C)
+#define STATUS_NO_RECOVERY_POLICY __constant_cpu_to_le32(0xC000028D)
+#define STATUS_NO_EFS __constant_cpu_to_le32(0xC000028E)
+#define STATUS_WRONG_EFS __constant_cpu_to_le32(0xC000028F)
+#define STATUS_NO_USER_KEYS __constant_cpu_to_le32(0xC0000290)
+#define STATUS_FILE_NOT_ENCRYPTED __constant_cpu_to_le32(0xC0000291)
+#define STATUS_NOT_EXPORT_FORMAT __constant_cpu_to_le32(0xC0000292)
+#define STATUS_FILE_ENCRYPTED __constant_cpu_to_le32(0xC0000293)
+#define STATUS_WMI_GUID_NOT_FOUND __constant_cpu_to_le32(0xC0000295)
+#define STATUS_WMI_INSTANCE_NOT_FOUND __constant_cpu_to_le32(0xC0000296)
+#define STATUS_WMI_ITEMID_NOT_FOUND __constant_cpu_to_le32(0xC0000297)
+#define STATUS_WMI_TRY_AGAIN __constant_cpu_to_le32(0xC0000298)
+#define STATUS_SHARED_POLICY __constant_cpu_to_le32(0xC0000299)
+#define STATUS_POLICY_OBJECT_NOT_FOUND __constant_cpu_to_le32(0xC000029A)
+#define STATUS_POLICY_ONLY_IN_DS __constant_cpu_to_le32(0xC000029B)
+#define STATUS_VOLUME_NOT_UPGRADED __constant_cpu_to_le32(0xC000029C)
+#define STATUS_REMOTE_STORAGE_NOT_ACTIVE __constant_cpu_to_le32(0xC000029D)
+#define STATUS_REMOTE_STORAGE_MEDIA_ERROR __constant_cpu_to_le32(0xC000029E)
+#define STATUS_NO_TRACKING_SERVICE __constant_cpu_to_le32(0xC000029F)
+#define STATUS_SERVER_SID_MISMATCH __constant_cpu_to_le32(0xC00002A0)
+#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE __constant_cpu_to_le32(0xC00002A1)
+#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX __constant_cpu_to_le32(0xC00002A2)
+#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED __constant_cpu_to_le32(0xC00002A3)
+#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS __constant_cpu_to_le32(0xC00002A4)
+#define STATUS_DS_BUSY __constant_cpu_to_le32(0xC00002A5)
+#define STATUS_DS_UNAVAILABLE __constant_cpu_to_le32(0xC00002A6)
+#define STATUS_DS_NO_RIDS_ALLOCATED __constant_cpu_to_le32(0xC00002A7)
+#define STATUS_DS_NO_MORE_RIDS __constant_cpu_to_le32(0xC00002A8)
+#define STATUS_DS_INCORRECT_ROLE_OWNER __constant_cpu_to_le32(0xC00002A9)
+#define STATUS_DS_RIDMGR_INIT_ERROR __constant_cpu_to_le32(0xC00002AA)
+#define STATUS_DS_OBJ_CLASS_VIOLATION __constant_cpu_to_le32(0xC00002AB)
+#define STATUS_DS_CANT_ON_NON_LEAF __constant_cpu_to_le32(0xC00002AC)
+#define STATUS_DS_CANT_ON_RDN __constant_cpu_to_le32(0xC00002AD)
+#define STATUS_DS_CANT_MOD_OBJ_CLASS __constant_cpu_to_le32(0xC00002AE)
+#define STATUS_DS_CROSS_DOM_MOVE_FAILED __constant_cpu_to_le32(0xC00002AF)
+#define STATUS_DS_GC_NOT_AVAILABLE __constant_cpu_to_le32(0xC00002B0)
+#define STATUS_DIRECTORY_SERVICE_REQUIRED __constant_cpu_to_le32(0xC00002B1)
+#define STATUS_REPARSE_ATTRIBUTE_CONFLICT __constant_cpu_to_le32(0xC00002B2)
+#define STATUS_CANT_ENABLE_DENY_ONLY __constant_cpu_to_le32(0xC00002B3)
+#define STATUS_FLOAT_MULTIPLE_FAULTS __constant_cpu_to_le32(0xC00002B4)
+#define STATUS_FLOAT_MULTIPLE_TRAPS __constant_cpu_to_le32(0xC00002B5)
+#define STATUS_DEVICE_REMOVED __constant_cpu_to_le32(0xC00002B6)
+#define STATUS_JOURNAL_DELETE_IN_PROGRESS __constant_cpu_to_le32(0xC00002B7)
+#define STATUS_JOURNAL_NOT_ACTIVE __constant_cpu_to_le32(0xC00002B8)
+#define STATUS_NOINTERFACE __constant_cpu_to_le32(0xC00002B9)
+#define STATUS_DS_ADMIN_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC00002C1)
+#define STATUS_DRIVER_FAILED_SLEEP __constant_cpu_to_le32(0xC00002C2)
+#define STATUS_MUTUAL_AUTHENTICATION_FAILED __constant_cpu_to_le32(0xC00002C3)
+#define STATUS_CORRUPT_SYSTEM_FILE __constant_cpu_to_le32(0xC00002C4)
+#define STATUS_DATATYPE_MISALIGNMENT_ERROR __constant_cpu_to_le32(0xC00002C5)
+#define STATUS_WMI_READ_ONLY __constant_cpu_to_le32(0xC00002C6)
+#define STATUS_WMI_SET_FAILURE __constant_cpu_to_le32(0xC00002C7)
+#define STATUS_COMMITMENT_MINIMUM __constant_cpu_to_le32(0xC00002C8)
+#define STATUS_REG_NAT_CONSUMPTION __constant_cpu_to_le32(0xC00002C9)
+#define STATUS_TRANSPORT_FULL __constant_cpu_to_le32(0xC00002CA)
+#define STATUS_DS_SAM_INIT_FAILURE __constant_cpu_to_le32(0xC00002CB)
+#define STATUS_ONLY_IF_CONNECTED __constant_cpu_to_le32(0xC00002CC)
+#define STATUS_DS_SENSITIVE_GROUP_VIOLATION __constant_cpu_to_le32(0xC00002CD)
+#define STATUS_PNP_RESTART_ENUMERATION __constant_cpu_to_le32(0xC00002CE)
+#define STATUS_JOURNAL_ENTRY_DELETED __constant_cpu_to_le32(0xC00002CF)
+#define STATUS_DS_CANT_MOD_PRIMARYGROUPID __constant_cpu_to_le32(0xC00002D0)
+#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE __constant_cpu_to_le32(0xC00002D1)
+#define STATUS_PNP_REBOOT_REQUIRED __constant_cpu_to_le32(0xC00002D2)
+#define STATUS_POWER_STATE_INVALID __constant_cpu_to_le32(0xC00002D3)
+#define STATUS_DS_INVALID_GROUP_TYPE __constant_cpu_to_le32(0xC00002D4)
+#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN __constant_cpu_to_le32(0xC00002D5)
+#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN __constant_cpu_to_le32(0xC00002D6)
+#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002D7)
+#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER __constant_cpu_to_le32(0xC00002D8)
+#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002D9)
+#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER __constant_cpu_to_le32(0xC00002DA)
+#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002DB)
+#define STATUS_DS_HAVE_PRIMARY_MEMBERS __constant_cpu_to_le32(0xC00002DC)
+#define STATUS_WMI_NOT_SUPPORTED __constant_cpu_to_le32(0xC00002DD)
+#define STATUS_INSUFFICIENT_POWER __constant_cpu_to_le32(0xC00002DE)
+#define STATUS_SAM_NEED_BOOTKEY_PASSWORD __constant_cpu_to_le32(0xC00002DF)
+#define STATUS_SAM_NEED_BOOTKEY_FLOPPY __constant_cpu_to_le32(0xC00002E0)
+#define STATUS_DS_CANT_START __constant_cpu_to_le32(0xC00002E1)
+#define STATUS_DS_INIT_FAILURE __constant_cpu_to_le32(0xC00002E2)
+#define STATUS_SAM_INIT_FAILURE __constant_cpu_to_le32(0xC00002E3)
+#define STATUS_DS_GC_REQUIRED __constant_cpu_to_le32(0xC00002E4)
+#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY __constant_cpu_to_le32(0xC00002E5)
+#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS __constant_cpu_to_le32(0xC00002E6)
+#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC00002E7)
+#define STATUS_MULTIPLE_FAULT_VIOLATION __constant_cpu_to_le32(0xC00002E8)
+#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED __constant_cpu_to_le32(0xC00002E9)
+#define STATUS_CANNOT_MAKE __constant_cpu_to_le32(0xC00002EA)
+#define STATUS_SYSTEM_SHUTDOWN __constant_cpu_to_le32(0xC00002EB)
+#define STATUS_DS_INIT_FAILURE_CONSOLE __constant_cpu_to_le32(0xC00002EC)
+#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE __constant_cpu_to_le32(0xC00002ED)
+#define STATUS_UNFINISHED_CONTEXT_DELETED __constant_cpu_to_le32(0xC00002EE)
+#define STATUS_NO_TGT_REPLY __constant_cpu_to_le32(0xC00002EF)
+#define STATUS_OBJECTID_NOT_FOUND __constant_cpu_to_le32(0xC00002F0)
+#define STATUS_NO_IP_ADDRESSES __constant_cpu_to_le32(0xC00002F1)
+#define STATUS_WRONG_CREDENTIAL_HANDLE __constant_cpu_to_le32(0xC00002F2)
+#define STATUS_CRYPTO_SYSTEM_INVALID __constant_cpu_to_le32(0xC00002F3)
+#define STATUS_MAX_REFERRALS_EXCEEDED __constant_cpu_to_le32(0xC00002F4)
+#define STATUS_MUST_BE_KDC __constant_cpu_to_le32(0xC00002F5)
+#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED __constant_cpu_to_le32(0xC00002F6)
+#define STATUS_TOO_MANY_PRINCIPALS __constant_cpu_to_le32(0xC00002F7)
+#define STATUS_NO_PA_DATA __constant_cpu_to_le32(0xC00002F8)
+#define STATUS_PKINIT_NAME_MISMATCH __constant_cpu_to_le32(0xC00002F9)
+#define STATUS_SMARTCARD_LOGON_REQUIRED __constant_cpu_to_le32(0xC00002FA)
+#define STATUS_KDC_INVALID_REQUEST __constant_cpu_to_le32(0xC00002FB)
+#define STATUS_KDC_UNABLE_TO_REFER __constant_cpu_to_le32(0xC00002FC)
+#define STATUS_KDC_UNKNOWN_ETYPE __constant_cpu_to_le32(0xC00002FD)
+#define STATUS_SHUTDOWN_IN_PROGRESS __constant_cpu_to_le32(0xC00002FE)
+#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS __constant_cpu_to_le32(0xC00002FF)
+#define STATUS_NOT_SUPPORTED_ON_SBS __constant_cpu_to_le32(0xC0000300)
+#define STATUS_WMI_GUID_DISCONNECTED __constant_cpu_to_le32(0xC0000301)
+#define STATUS_WMI_ALREADY_DISABLED __constant_cpu_to_le32(0xC0000302)
+#define STATUS_WMI_ALREADY_ENABLED __constant_cpu_to_le32(0xC0000303)
+#define STATUS_MFT_TOO_FRAGMENTED __constant_cpu_to_le32(0xC0000304)
+#define STATUS_COPY_PROTECTION_FAILURE __constant_cpu_to_le32(0xC0000305)
+#define STATUS_CSS_AUTHENTICATION_FAILURE __constant_cpu_to_le32(0xC0000306)
+#define STATUS_CSS_KEY_NOT_PRESENT __constant_cpu_to_le32(0xC0000307)
+#define STATUS_CSS_KEY_NOT_ESTABLISHED __constant_cpu_to_le32(0xC0000308)
+#define STATUS_CSS_SCRAMBLED_SECTOR __constant_cpu_to_le32(0xC0000309)
+#define STATUS_CSS_REGION_MISMATCH __constant_cpu_to_le32(0xC000030A)
+#define STATUS_CSS_RESETS_EXHAUSTED __constant_cpu_to_le32(0xC000030B)
+#define STATUS_PKINIT_FAILURE __constant_cpu_to_le32(0xC0000320)
+#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE __constant_cpu_to_le32(0xC0000321)
+#define STATUS_NO_KERB_KEY __constant_cpu_to_le32(0xC0000322)
+#define STATUS_HOST_DOWN __constant_cpu_to_le32(0xC0000350)
+#define STATUS_UNSUPPORTED_PREAUTH __constant_cpu_to_le32(0xC0000351)
+#define STATUS_EFS_ALG_BLOB_TOO_BIG __constant_cpu_to_le32(0xC0000352)
+#define STATUS_PORT_NOT_SET __constant_cpu_to_le32(0xC0000353)
+#define STATUS_DEBUGGER_INACTIVE __constant_cpu_to_le32(0xC0000354)
+#define STATUS_DS_VERSION_CHECK_FAILURE __constant_cpu_to_le32(0xC0000355)
+#define STATUS_AUDITING_DISABLED __constant_cpu_to_le32(0xC0000356)
+#define STATUS_PRENT4_MACHINE_ACCOUNT __constant_cpu_to_le32(0xC0000357)
+#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER __constant_cpu_to_le32(0xC0000358)
+#define STATUS_INVALID_IMAGE_WIN_32 __constant_cpu_to_le32(0xC0000359)
+#define STATUS_INVALID_IMAGE_WIN_64 __constant_cpu_to_le32(0xC000035A)
+#define STATUS_BAD_BINDINGS __constant_cpu_to_le32(0xC000035B)
+#define STATUS_NETWORK_SESSION_EXPIRED __constant_cpu_to_le32(0xC000035C)
+#define STATUS_APPHELP_BLOCK __constant_cpu_to_le32(0xC000035D)
+#define STATUS_ALL_SIDS_FILTERED __constant_cpu_to_le32(0xC000035E)
+#define STATUS_NOT_SAFE_MODE_DRIVER __constant_cpu_to_le32(0xC000035F)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT __constant_cpu_to_le32(0xC0000361)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH __constant_cpu_to_le32(0xC0000362)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER __constant_cpu_to_le32(0xC0000363)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER __constant_cpu_to_le32(0xC0000364)
+#define STATUS_FAILED_DRIVER_ENTRY __constant_cpu_to_le32(0xC0000365)
+#define STATUS_DEVICE_ENUMERATION_ERROR __constant_cpu_to_le32(0xC0000366)
+#define STATUS_MOUNT_POINT_NOT_RESOLVED __constant_cpu_to_le32(0xC0000368)
+#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER __constant_cpu_to_le32(0xC0000369)
+#define STATUS_MCA_OCCURED __constant_cpu_to_le32(0xC000036A)
+#define STATUS_DRIVER_BLOCKED_CRITICAL __constant_cpu_to_le32(0xC000036B)
+#define STATUS_DRIVER_BLOCKED __constant_cpu_to_le32(0xC000036C)
+#define STATUS_DRIVER_DATABASE_ERROR __constant_cpu_to_le32(0xC000036D)
+#define STATUS_SYSTEM_HIVE_TOO_LARGE __constant_cpu_to_le32(0xC000036E)
+#define STATUS_INVALID_IMPORT_OF_NON_DLL __constant_cpu_to_le32(0xC000036F)
+#define STATUS_NO_SECRETS __constant_cpu_to_le32(0xC0000371)
+#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY __constant_cpu_to_le32(0xC0000372)
+#define STATUS_FAILED_STACK_SWITCH __constant_cpu_to_le32(0xC0000373)
+#define STATUS_HEAP_CORRUPTION __constant_cpu_to_le32(0xC0000374)
+#define STATUS_SMARTCARD_WRONG_PIN __constant_cpu_to_le32(0xC0000380)
+#define STATUS_SMARTCARD_CARD_BLOCKED __constant_cpu_to_le32(0xC0000381)
+#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED __constant_cpu_to_le32(0xC0000382)
+#define STATUS_SMARTCARD_NO_CARD __constant_cpu_to_le32(0xC0000383)
+#define STATUS_SMARTCARD_NO_KEY_CONTAINER __constant_cpu_to_le32(0xC0000384)
+#define STATUS_SMARTCARD_NO_CERTIFICATE __constant_cpu_to_le32(0xC0000385)
+#define STATUS_SMARTCARD_NO_KEYSET __constant_cpu_to_le32(0xC0000386)
+#define STATUS_SMARTCARD_IO_ERROR __constant_cpu_to_le32(0xC0000387)
+#define STATUS_DOWNGRADE_DETECTED __constant_cpu_to_le32(0xC0000388)
+#define STATUS_SMARTCARD_CERT_REVOKED __constant_cpu_to_le32(0xC0000389)
+#define STATUS_ISSUING_CA_UNTRUSTED __constant_cpu_to_le32(0xC000038A)
+#define STATUS_REVOCATION_OFFLINE_C __constant_cpu_to_le32(0xC000038B)
+#define STATUS_PKINIT_CLIENT_FAILURE __constant_cpu_to_le32(0xC000038C)
+#define STATUS_SMARTCARD_CERT_EXPIRED __constant_cpu_to_le32(0xC000038D)
+#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD __constant_cpu_to_le32(0xC000038E)
+#define STATUS_SMARTCARD_SILENT_CONTEXT __constant_cpu_to_le32(0xC000038F)
+#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000401)
+#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000402)
+#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000403)
+#define STATUS_DS_NAME_NOT_UNIQUE __constant_cpu_to_le32(0xC0000404)
+#define STATUS_DS_DUPLICATE_ID_FOUND __constant_cpu_to_le32(0xC0000405)
+#define STATUS_DS_GROUP_CONVERSION_ERROR __constant_cpu_to_le32(0xC0000406)
+#define STATUS_VOLSNAP_PREPARE_HIBERNATE __constant_cpu_to_le32(0xC0000407)
+#define STATUS_USER2USER_REQUIRED __constant_cpu_to_le32(0xC0000408)
+#define STATUS_STACK_BUFFER_OVERRUN __constant_cpu_to_le32(0xC0000409)
+#define STATUS_NO_S4U_PROT_SUPPORT __constant_cpu_to_le32(0xC000040A)
+#define STATUS_CROSSREALM_DELEGATION_FAILURE __constant_cpu_to_le32(0xC000040B)
+#define STATUS_REVOCATION_OFFLINE_KDC __constant_cpu_to_le32(0xC000040C)
+#define STATUS_ISSUING_CA_UNTRUSTED_KDC __constant_cpu_to_le32(0xC000040D)
+#define STATUS_KDC_CERT_EXPIRED __constant_cpu_to_le32(0xC000040E)
+#define STATUS_KDC_CERT_REVOKED __constant_cpu_to_le32(0xC000040F)
+#define STATUS_PARAMETER_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000410)
+#define STATUS_HIBERNATION_FAILURE __constant_cpu_to_le32(0xC0000411)
+#define STATUS_DELAY_LOAD_FAILED __constant_cpu_to_le32(0xC0000412)
+#define STATUS_AUTHENTICATION_FIREWALL_FAILED __constant_cpu_to_le32(0xC0000413)
+#define STATUS_VDM_DISALLOWED __constant_cpu_to_le32(0xC0000414)
+#define STATUS_HUNG_DISPLAY_DRIVER_THREAD __constant_cpu_to_le32(0xC0000415)
+#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE __constant_cpu_to_le32(0xC0000416)
+#define STATUS_INVALID_CRUNTIME_PARAMETER __constant_cpu_to_le32(0xC0000417)
+#define STATUS_NTLM_BLOCKED __constant_cpu_to_le32(0xC0000418)
+#define STATUS_ASSERTION_FAILURE __constant_cpu_to_le32(0xC0000420)
+#define STATUS_VERIFIER_STOP __constant_cpu_to_le32(0xC0000421)
+#define STATUS_CALLBACK_POP_STACK __constant_cpu_to_le32(0xC0000423)
+#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED __constant_cpu_to_le32(0xC0000424)
+#define STATUS_HIVE_UNLOADED __constant_cpu_to_le32(0xC0000425)
+#define STATUS_COMPRESSION_DISABLED __constant_cpu_to_le32(0xC0000426)
+#define STATUS_FILE_SYSTEM_LIMITATION __constant_cpu_to_le32(0xC0000427)
+#define STATUS_INVALID_IMAGE_HASH __constant_cpu_to_le32(0xC0000428)
+#define STATUS_NOT_CAPABLE __constant_cpu_to_le32(0xC0000429)
+#define STATUS_REQUEST_OUT_OF_SEQUENCE __constant_cpu_to_le32(0xC000042A)
+#define STATUS_IMPLEMENTATION_LIMIT __constant_cpu_to_le32(0xC000042B)
+#define STATUS_ELEVATION_REQUIRED __constant_cpu_to_le32(0xC000042C)
+#define STATUS_BEYOND_VDL __constant_cpu_to_le32(0xC0000432)
+#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS __constant_cpu_to_le32(0xC0000433)
+#define STATUS_PTE_CHANGED __constant_cpu_to_le32(0xC0000434)
+#define STATUS_PURGE_FAILED __constant_cpu_to_le32(0xC0000435)
+#define STATUS_CRED_REQUIRES_CONFIRMATION __constant_cpu_to_le32(0xC0000440)
+#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE __constant_cpu_to_le32(0xC0000441)
+#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER __constant_cpu_to_le32(0xC0000442)
+#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE __constant_cpu_to_le32(0xC0000443)
+#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE __constant_cpu_to_le32(0xC0000444)
+#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE __constant_cpu_to_le32(0xC0000445)
+#define STATUS_INVALID_LABEL __constant_cpu_to_le32(0xC0000446)
+#define STATUS_DRIVER_PROCESS_TERMINATED __constant_cpu_to_le32(0xC0000450)
+#define STATUS_AMBIGUOUS_SYSTEM_DEVICE __constant_cpu_to_le32(0xC0000451)
+#define STATUS_SYSTEM_DEVICE_NOT_FOUND __constant_cpu_to_le32(0xC0000452)
+#define STATUS_RESTART_BOOT_APPLICATION __constant_cpu_to_le32(0xC0000453)
+#define STATUS_INVALID_TASK_NAME __constant_cpu_to_le32(0xC0000500)
+#define STATUS_INVALID_TASK_INDEX __constant_cpu_to_le32(0xC0000501)
+#define STATUS_THREAD_ALREADY_IN_TASK __constant_cpu_to_le32(0xC0000502)
+#define STATUS_CALLBACK_BYPASS __constant_cpu_to_le32(0xC0000503)
+#define STATUS_PORT_CLOSED __constant_cpu_to_le32(0xC0000700)
+#define STATUS_MESSAGE_LOST __constant_cpu_to_le32(0xC0000701)
+#define STATUS_INVALID_MESSAGE __constant_cpu_to_le32(0xC0000702)
+#define STATUS_REQUEST_CANCELED __constant_cpu_to_le32(0xC0000703)
+#define STATUS_RECURSIVE_DISPATCH __constant_cpu_to_le32(0xC0000704)
+#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED __constant_cpu_to_le32(0xC0000705)
+#define STATUS_LPC_INVALID_CONNECTION_USAGE __constant_cpu_to_le32(0xC0000706)
+#define STATUS_LPC_REQUESTS_NOT_ALLOWED __constant_cpu_to_le32(0xC0000707)
+#define STATUS_RESOURCE_IN_USE __constant_cpu_to_le32(0xC0000708)
+#define STATUS_HARDWARE_MEMORY_ERROR __constant_cpu_to_le32(0xC0000709)
+#define STATUS_THREADPOOL_HANDLE_EXCEPTION __constant_cpu_to_le32(0xC000070A)
+#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070B)
+#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070C)
+#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070D)
+#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070E)
+#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION __constant_cpu_to_le32(0xC000070F)
+#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING __constant_cpu_to_le32(0xC0000710)
+#define STATUS_APC_RETURNED_WHILE_IMPERSONATING __constant_cpu_to_le32(0xC0000711)
+#define STATUS_PROCESS_IS_PROTECTED __constant_cpu_to_le32(0xC0000712)
+#define STATUS_MCA_EXCEPTION __constant_cpu_to_le32(0xC0000713)
+#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE __constant_cpu_to_le32(0xC0000714)
+#define STATUS_SYMLINK_CLASS_DISABLED __constant_cpu_to_le32(0xC0000715)
+#define STATUS_INVALID_IDN_NORMALIZATION __constant_cpu_to_le32(0xC0000716)
+#define STATUS_NO_UNICODE_TRANSLATION __constant_cpu_to_le32(0xC0000717)
+#define STATUS_ALREADY_REGISTERED __constant_cpu_to_le32(0xC0000718)
+#define STATUS_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0000719)
+#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST __constant_cpu_to_le32(0xC000071A)
+#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY __constant_cpu_to_le32(0xC000071B)
+#define STATUS_INVALID_THREAD __constant_cpu_to_le32(0xC000071C)
+#define STATUS_CALLBACK_RETURNED_TRANSACTION __constant_cpu_to_le32(0xC000071D)
+#define STATUS_CALLBACK_RETURNED_LDR_LOCK __constant_cpu_to_le32(0xC000071E)
+#define STATUS_CALLBACK_RETURNED_LANG __constant_cpu_to_le32(0xC000071F)
+#define STATUS_CALLBACK_RETURNED_PRI_BACK __constant_cpu_to_le32(0xC0000720)
+#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY __constant_cpu_to_le32(0xC0000721)
+#define STATUS_DISK_REPAIR_DISABLED __constant_cpu_to_le32(0xC0000800)
+#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS __constant_cpu_to_le32(0xC0000801)
+#define STATUS_DISK_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000802)
+#define STATUS_CONTENT_BLOCKED __constant_cpu_to_le32(0xC0000804)
+#define STATUS_BAD_CLUSTERS __constant_cpu_to_le32(0xC0000805)
+#define STATUS_VOLUME_DIRTY __constant_cpu_to_le32(0xC0000806)
+#define STATUS_FILE_CHECKED_OUT __constant_cpu_to_le32(0xC0000901)
+#define STATUS_CHECKOUT_REQUIRED __constant_cpu_to_le32(0xC0000902)
+#define STATUS_BAD_FILE_TYPE __constant_cpu_to_le32(0xC0000903)
+#define STATUS_FILE_TOO_LARGE __constant_cpu_to_le32(0xC0000904)
+#define STATUS_FORMS_AUTH_REQUIRED __constant_cpu_to_le32(0xC0000905)
+#define STATUS_VIRUS_INFECTED __constant_cpu_to_le32(0xC0000906)
+#define STATUS_VIRUS_DELETED __constant_cpu_to_le32(0xC0000907)
+#define STATUS_BAD_MCFG_TABLE __constant_cpu_to_le32(0xC0000908)
+#define STATUS_WOW_ASSERTION __constant_cpu_to_le32(0xC0009898)
+#define STATUS_INVALID_SIGNATURE __constant_cpu_to_le32(0xC000A000)
+#define STATUS_HMAC_NOT_SUPPORTED __constant_cpu_to_le32(0xC000A001)
+#define STATUS_IPSEC_QUEUE_OVERFLOW __constant_cpu_to_le32(0xC000A010)
+#define STATUS_ND_QUEUE_OVERFLOW __constant_cpu_to_le32(0xC000A011)
+#define STATUS_HOPLIMIT_EXCEEDED __constant_cpu_to_le32(0xC000A012)
+#define STATUS_PROTOCOL_NOT_SUPPORTED __constant_cpu_to_le32(0xC000A013)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED __constant_cpu_to_le32(0xC000A080)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR __constant_cpu_to_le32(0xC000A081)
+#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR __constant_cpu_to_le32(0xC000A082)
+#define STATUS_XML_PARSE_ERROR __constant_cpu_to_le32(0xC000A083)
+#define STATUS_XMLDSIG_ERROR __constant_cpu_to_le32(0xC000A084)
+#define STATUS_WRONG_COMPARTMENT __constant_cpu_to_le32(0xC000A085)
+#define STATUS_AUTHIP_FAILURE __constant_cpu_to_le32(0xC000A086)
+#define DBG_NO_STATE_CHANGE __constant_cpu_to_le32(0xC0010001)
+#define DBG_APP_NOT_IDLE __constant_cpu_to_le32(0xC0010002)
+#define RPC_NT_INVALID_STRING_BINDING __constant_cpu_to_le32(0xC0020001)
+#define RPC_NT_WRONG_KIND_OF_BINDING __constant_cpu_to_le32(0xC0020002)
+#define RPC_NT_INVALID_BINDING __constant_cpu_to_le32(0xC0020003)
+#define RPC_NT_PROTSEQ_NOT_SUPPORTED __constant_cpu_to_le32(0xC0020004)
+#define RPC_NT_INVALID_RPC_PROTSEQ __constant_cpu_to_le32(0xC0020005)
+#define RPC_NT_INVALID_STRING_UUID __constant_cpu_to_le32(0xC0020006)
+#define RPC_NT_INVALID_ENDPOINT_FORMAT __constant_cpu_to_le32(0xC0020007)
+#define RPC_NT_INVALID_NET_ADDR __constant_cpu_to_le32(0xC0020008)
+#define RPC_NT_NO_ENDPOINT_FOUND __constant_cpu_to_le32(0xC0020009)
+#define RPC_NT_INVALID_TIMEOUT __constant_cpu_to_le32(0xC002000A)
+#define RPC_NT_OBJECT_NOT_FOUND __constant_cpu_to_le32(0xC002000B)
+#define RPC_NT_ALREADY_REGISTERED __constant_cpu_to_le32(0xC002000C)
+#define RPC_NT_TYPE_ALREADY_REGISTERED __constant_cpu_to_le32(0xC002000D)
+#define RPC_NT_ALREADY_LISTENING __constant_cpu_to_le32(0xC002000E)
+#define RPC_NT_NO_PROTSEQS_REGISTERED __constant_cpu_to_le32(0xC002000F)
+#define RPC_NT_NOT_LISTENING __constant_cpu_to_le32(0xC0020010)
+#define RPC_NT_UNKNOWN_MGR_TYPE __constant_cpu_to_le32(0xC0020011)
+#define RPC_NT_UNKNOWN_IF __constant_cpu_to_le32(0xC0020012)
+#define RPC_NT_NO_BINDINGS __constant_cpu_to_le32(0xC0020013)
+#define RPC_NT_NO_PROTSEQS __constant_cpu_to_le32(0xC0020014)
+#define RPC_NT_CANT_CREATE_ENDPOINT __constant_cpu_to_le32(0xC0020015)
+#define RPC_NT_OUT_OF_RESOURCES __constant_cpu_to_le32(0xC0020016)
+#define RPC_NT_SERVER_UNAVAILABLE __constant_cpu_to_le32(0xC0020017)
+#define RPC_NT_SERVER_TOO_BUSY __constant_cpu_to_le32(0xC0020018)
+#define RPC_NT_INVALID_NETWORK_OPTIONS __constant_cpu_to_le32(0xC0020019)
+#define RPC_NT_NO_CALL_ACTIVE __constant_cpu_to_le32(0xC002001A)
+#define RPC_NT_CALL_FAILED __constant_cpu_to_le32(0xC002001B)
+#define RPC_NT_CALL_FAILED_DNE __constant_cpu_to_le32(0xC002001C)
+#define RPC_NT_PROTOCOL_ERROR __constant_cpu_to_le32(0xC002001D)
+#define RPC_NT_UNSUPPORTED_TRANS_SYN __constant_cpu_to_le32(0xC002001F)
+#define RPC_NT_UNSUPPORTED_TYPE __constant_cpu_to_le32(0xC0020021)
+#define RPC_NT_INVALID_TAG __constant_cpu_to_le32(0xC0020022)
+#define RPC_NT_INVALID_BOUND __constant_cpu_to_le32(0xC0020023)
+#define RPC_NT_NO_ENTRY_NAME __constant_cpu_to_le32(0xC0020024)
+#define RPC_NT_INVALID_NAME_SYNTAX __constant_cpu_to_le32(0xC0020025)
+#define RPC_NT_UNSUPPORTED_NAME_SYNTAX __constant_cpu_to_le32(0xC0020026)
+#define RPC_NT_UUID_NO_ADDRESS __constant_cpu_to_le32(0xC0020028)
+#define RPC_NT_DUPLICATE_ENDPOINT __constant_cpu_to_le32(0xC0020029)
+#define RPC_NT_UNKNOWN_AUTHN_TYPE __constant_cpu_to_le32(0xC002002A)
+#define RPC_NT_MAX_CALLS_TOO_SMALL __constant_cpu_to_le32(0xC002002B)
+#define RPC_NT_STRING_TOO_LONG __constant_cpu_to_le32(0xC002002C)
+#define RPC_NT_PROTSEQ_NOT_FOUND __constant_cpu_to_le32(0xC002002D)
+#define RPC_NT_PROCNUM_OUT_OF_RANGE __constant_cpu_to_le32(0xC002002E)
+#define RPC_NT_BINDING_HAS_NO_AUTH __constant_cpu_to_le32(0xC002002F)
+#define RPC_NT_UNKNOWN_AUTHN_SERVICE __constant_cpu_to_le32(0xC0020030)
+#define RPC_NT_UNKNOWN_AUTHN_LEVEL __constant_cpu_to_le32(0xC0020031)
+#define RPC_NT_INVALID_AUTH_IDENTITY __constant_cpu_to_le32(0xC0020032)
+#define RPC_NT_UNKNOWN_AUTHZ_SERVICE __constant_cpu_to_le32(0xC0020033)
+#define EPT_NT_INVALID_ENTRY __constant_cpu_to_le32(0xC0020034)
+#define EPT_NT_CANT_PERFORM_OP __constant_cpu_to_le32(0xC0020035)
+#define EPT_NT_NOT_REGISTERED __constant_cpu_to_le32(0xC0020036)
+#define RPC_NT_NOTHING_TO_EXPORT __constant_cpu_to_le32(0xC0020037)
+#define RPC_NT_INCOMPLETE_NAME __constant_cpu_to_le32(0xC0020038)
+#define RPC_NT_INVALID_VERS_OPTION __constant_cpu_to_le32(0xC0020039)
+#define RPC_NT_NO_MORE_MEMBERS __constant_cpu_to_le32(0xC002003A)
+#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED __constant_cpu_to_le32(0xC002003B)
+#define RPC_NT_INTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC002003C)
+#define RPC_NT_ENTRY_ALREADY_EXISTS __constant_cpu_to_le32(0xC002003D)
+#define RPC_NT_ENTRY_NOT_FOUND __constant_cpu_to_le32(0xC002003E)
+#define RPC_NT_NAME_SERVICE_UNAVAILABLE __constant_cpu_to_le32(0xC002003F)
+#define RPC_NT_INVALID_NAF_ID __constant_cpu_to_le32(0xC0020040)
+#define RPC_NT_CANNOT_SUPPORT __constant_cpu_to_le32(0xC0020041)
+#define RPC_NT_NO_CONTEXT_AVAILABLE __constant_cpu_to_le32(0xC0020042)
+#define RPC_NT_INTERNAL_ERROR __constant_cpu_to_le32(0xC0020043)
+#define RPC_NT_ZERO_DIVIDE __constant_cpu_to_le32(0xC0020044)
+#define RPC_NT_ADDRESS_ERROR __constant_cpu_to_le32(0xC0020045)
+#define RPC_NT_FP_DIV_ZERO __constant_cpu_to_le32(0xC0020046)
+#define RPC_NT_FP_UNDERFLOW __constant_cpu_to_le32(0xC0020047)
+#define RPC_NT_FP_OVERFLOW __constant_cpu_to_le32(0xC0020048)
+#define RPC_NT_CALL_IN_PROGRESS __constant_cpu_to_le32(0xC0020049)
+#define RPC_NT_NO_MORE_BINDINGS __constant_cpu_to_le32(0xC002004A)
+#define RPC_NT_GROUP_MEMBER_NOT_FOUND __constant_cpu_to_le32(0xC002004B)
+#define EPT_NT_CANT_CREATE __constant_cpu_to_le32(0xC002004C)
+#define RPC_NT_INVALID_OBJECT __constant_cpu_to_le32(0xC002004D)
+#define RPC_NT_NO_INTERFACES __constant_cpu_to_le32(0xC002004F)
+#define RPC_NT_CALL_CANCELLED __constant_cpu_to_le32(0xC0020050)
+#define RPC_NT_BINDING_INCOMPLETE __constant_cpu_to_le32(0xC0020051)
+#define RPC_NT_COMM_FAILURE __constant_cpu_to_le32(0xC0020052)
+#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL __constant_cpu_to_le32(0xC0020053)
+#define RPC_NT_NO_PRINC_NAME __constant_cpu_to_le32(0xC0020054)
+#define RPC_NT_NOT_RPC_ERROR __constant_cpu_to_le32(0xC0020055)
+#define RPC_NT_SEC_PKG_ERROR __constant_cpu_to_le32(0xC0020057)
+#define RPC_NT_NOT_CANCELLED __constant_cpu_to_le32(0xC0020058)
+#define RPC_NT_INVALID_ASYNC_HANDLE __constant_cpu_to_le32(0xC0020062)
+#define RPC_NT_INVALID_ASYNC_CALL __constant_cpu_to_le32(0xC0020063)
+#define RPC_NT_PROXY_ACCESS_DENIED __constant_cpu_to_le32(0xC0020064)
+#define RPC_NT_NO_MORE_ENTRIES __constant_cpu_to_le32(0xC0030001)
+#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL __constant_cpu_to_le32(0xC0030002)
+#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE __constant_cpu_to_le32(0xC0030003)
+#define RPC_NT_SS_IN_NULL_CONTEXT __constant_cpu_to_le32(0xC0030004)
+#define RPC_NT_SS_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0030005)
+#define RPC_NT_SS_CONTEXT_DAMAGED __constant_cpu_to_le32(0xC0030006)
+#define RPC_NT_SS_HANDLES_MISMATCH __constant_cpu_to_le32(0xC0030007)
+#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE __constant_cpu_to_le32(0xC0030008)
+#define RPC_NT_NULL_REF_POINTER __constant_cpu_to_le32(0xC0030009)
+#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE __constant_cpu_to_le32(0xC003000A)
+#define RPC_NT_BYTE_COUNT_TOO_SMALL __constant_cpu_to_le32(0xC003000B)
+#define RPC_NT_BAD_STUB_DATA __constant_cpu_to_le32(0xC003000C)
+#define RPC_NT_INVALID_ES_ACTION __constant_cpu_to_le32(0xC0030059)
+#define RPC_NT_WRONG_ES_VERSION __constant_cpu_to_le32(0xC003005A)
+#define RPC_NT_WRONG_STUB_VERSION __constant_cpu_to_le32(0xC003005B)
+#define RPC_NT_INVALID_PIPE_OBJECT __constant_cpu_to_le32(0xC003005C)
+#define RPC_NT_INVALID_PIPE_OPERATION __constant_cpu_to_le32(0xC003005D)
+#define RPC_NT_WRONG_PIPE_VERSION __constant_cpu_to_le32(0xC003005E)
+#define RPC_NT_PIPE_CLOSED __constant_cpu_to_le32(0xC003005F)
+#define RPC_NT_PIPE_DISCIPLINE_ERROR __constant_cpu_to_le32(0xC0030060)
+#define RPC_NT_PIPE_EMPTY __constant_cpu_to_le32(0xC0030061)
+#define STATUS_PNP_BAD_MPS_TABLE __constant_cpu_to_le32(0xC0040035)
+#define STATUS_PNP_TRANSLATION_FAILED __constant_cpu_to_le32(0xC0040036)
+#define STATUS_PNP_IRQ_TRANSLATION_FAILED __constant_cpu_to_le32(0xC0040037)
+#define STATUS_PNP_INVALID_ID __constant_cpu_to_le32(0xC0040038)
+#define STATUS_IO_REISSUE_AS_CACHED __constant_cpu_to_le32(0xC0040039)
+#define STATUS_CTX_WINSTATION_NAME_INVALID __constant_cpu_to_le32(0xC00A0001)
+#define STATUS_CTX_INVALID_PD __constant_cpu_to_le32(0xC00A0002)
+#define STATUS_CTX_PD_NOT_FOUND __constant_cpu_to_le32(0xC00A0003)
+#define STATUS_CTX_CLOSE_PENDING __constant_cpu_to_le32(0xC00A0006)
+#define STATUS_CTX_NO_OUTBUF __constant_cpu_to_le32(0xC00A0007)
+#define STATUS_CTX_MODEM_INF_NOT_FOUND __constant_cpu_to_le32(0xC00A0008)
+#define STATUS_CTX_INVALID_MODEMNAME __constant_cpu_to_le32(0xC00A0009)
+#define STATUS_CTX_RESPONSE_ERROR __constant_cpu_to_le32(0xC00A000A)
+#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT __constant_cpu_to_le32(0xC00A000B)
+#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER __constant_cpu_to_le32(0xC00A000C)
+#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE __constant_cpu_to_le32(0xC00A000D)
+#define STATUS_CTX_MODEM_RESPONSE_BUSY __constant_cpu_to_le32(0xC00A000E)
+#define STATUS_CTX_MODEM_RESPONSE_VOICE __constant_cpu_to_le32(0xC00A000F)
+#define STATUS_CTX_TD_ERROR __constant_cpu_to_le32(0xC00A0010)
+#define STATUS_CTX_LICENSE_CLIENT_INVALID __constant_cpu_to_le32(0xC00A0012)
+#define STATUS_CTX_LICENSE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00A0013)
+#define STATUS_CTX_LICENSE_EXPIRED __constant_cpu_to_le32(0xC00A0014)
+#define STATUS_CTX_WINSTATION_NOT_FOUND __constant_cpu_to_le32(0xC00A0015)
+#define STATUS_CTX_WINSTATION_NAME_COLLISION __constant_cpu_to_le32(0xC00A0016)
+#define STATUS_CTX_WINSTATION_BUSY __constant_cpu_to_le32(0xC00A0017)
+#define STATUS_CTX_BAD_VIDEO_MODE __constant_cpu_to_le32(0xC00A0018)
+#define STATUS_CTX_GRAPHICS_INVALID __constant_cpu_to_le32(0xC00A0022)
+#define STATUS_CTX_NOT_CONSOLE __constant_cpu_to_le32(0xC00A0024)
+#define STATUS_CTX_CLIENT_QUERY_TIMEOUT __constant_cpu_to_le32(0xC00A0026)
+#define STATUS_CTX_CONSOLE_DISCONNECT __constant_cpu_to_le32(0xC00A0027)
+#define STATUS_CTX_CONSOLE_CONNECT __constant_cpu_to_le32(0xC00A0028)
+#define STATUS_CTX_SHADOW_DENIED __constant_cpu_to_le32(0xC00A002A)
+#define STATUS_CTX_WINSTATION_ACCESS_DENIED __constant_cpu_to_le32(0xC00A002B)
+#define STATUS_CTX_INVALID_WD __constant_cpu_to_le32(0xC00A002E)
+#define STATUS_CTX_WD_NOT_FOUND __constant_cpu_to_le32(0xC00A002F)
+#define STATUS_CTX_SHADOW_INVALID __constant_cpu_to_le32(0xC00A0030)
+#define STATUS_CTX_SHADOW_DISABLED __constant_cpu_to_le32(0xC00A0031)
+#define STATUS_RDP_PROTOCOL_ERROR __constant_cpu_to_le32(0xC00A0032)
+#define STATUS_CTX_CLIENT_LICENSE_NOT_SET __constant_cpu_to_le32(0xC00A0033)
+#define STATUS_CTX_CLIENT_LICENSE_IN_USE __constant_cpu_to_le32(0xC00A0034)
+#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE __constant_cpu_to_le32(0xC00A0035)
+#define STATUS_CTX_SHADOW_NOT_RUNNING __constant_cpu_to_le32(0xC00A0036)
+#define STATUS_CTX_LOGON_DISABLED __constant_cpu_to_le32(0xC00A0037)
+#define STATUS_CTX_SECURITY_LAYER_ERROR __constant_cpu_to_le32(0xC00A0038)
+#define STATUS_TS_INCOMPATIBLE_SESSIONS __constant_cpu_to_le32(0xC00A0039)
+#define STATUS_MUI_FILE_NOT_FOUND __constant_cpu_to_le32(0xC00B0001)
+#define STATUS_MUI_INVALID_FILE __constant_cpu_to_le32(0xC00B0002)
+#define STATUS_MUI_INVALID_RC_CONFIG __constant_cpu_to_le32(0xC00B0003)
+#define STATUS_MUI_INVALID_LOCALE_NAME __constant_cpu_to_le32(0xC00B0004)
+#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME __constant_cpu_to_le32(0xC00B0005)
+#define STATUS_MUI_FILE_NOT_LOADED __constant_cpu_to_le32(0xC00B0006)
+#define STATUS_RESOURCE_ENUM_USER_STOP __constant_cpu_to_le32(0xC00B0007)
+#define STATUS_CLUSTER_INVALID_NODE __constant_cpu_to_le32(0xC0130001)
+#define STATUS_CLUSTER_NODE_EXISTS __constant_cpu_to_le32(0xC0130002)
+#define STATUS_CLUSTER_JOIN_IN_PROGRESS __constant_cpu_to_le32(0xC0130003)
+#define STATUS_CLUSTER_NODE_NOT_FOUND __constant_cpu_to_le32(0xC0130004)
+#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND __constant_cpu_to_le32(0xC0130005)
+#define STATUS_CLUSTER_NETWORK_EXISTS __constant_cpu_to_le32(0xC0130006)
+#define STATUS_CLUSTER_NETWORK_NOT_FOUND __constant_cpu_to_le32(0xC0130007)
+#define STATUS_CLUSTER_NETINTERFACE_EXISTS __constant_cpu_to_le32(0xC0130008)
+#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC0130009)
+#define STATUS_CLUSTER_INVALID_REQUEST __constant_cpu_to_le32(0xC013000A)
+#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER __constant_cpu_to_le32(0xC013000B)
+#define STATUS_CLUSTER_NODE_DOWN __constant_cpu_to_le32(0xC013000C)
+#define STATUS_CLUSTER_NODE_UNREACHABLE __constant_cpu_to_le32(0xC013000D)
+#define STATUS_CLUSTER_NODE_NOT_MEMBER __constant_cpu_to_le32(0xC013000E)
+#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS __constant_cpu_to_le32(0xC013000F)
+#define STATUS_CLUSTER_INVALID_NETWORK __constant_cpu_to_le32(0xC0130010)
+#define STATUS_CLUSTER_NO_NET_ADAPTERS __constant_cpu_to_le32(0xC0130011)
+#define STATUS_CLUSTER_NODE_UP __constant_cpu_to_le32(0xC0130012)
+#define STATUS_CLUSTER_NODE_PAUSED __constant_cpu_to_le32(0xC0130013)
+#define STATUS_CLUSTER_NODE_NOT_PAUSED __constant_cpu_to_le32(0xC0130014)
+#define STATUS_CLUSTER_NO_SECURITY_CONTEXT __constant_cpu_to_le32(0xC0130015)
+#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL __constant_cpu_to_le32(0xC0130016)
+#define STATUS_CLUSTER_POISONED __constant_cpu_to_le32(0xC0130017)
+#define STATUS_ACPI_INVALID_OPCODE __constant_cpu_to_le32(0xC0140001)
+#define STATUS_ACPI_STACK_OVERFLOW __constant_cpu_to_le32(0xC0140002)
+#define STATUS_ACPI_ASSERT_FAILED __constant_cpu_to_le32(0xC0140003)
+#define STATUS_ACPI_INVALID_INDEX __constant_cpu_to_le32(0xC0140004)
+#define STATUS_ACPI_INVALID_ARGUMENT __constant_cpu_to_le32(0xC0140005)
+#define STATUS_ACPI_FATAL __constant_cpu_to_le32(0xC0140006)
+#define STATUS_ACPI_INVALID_SUPERNAME __constant_cpu_to_le32(0xC0140007)
+#define STATUS_ACPI_INVALID_ARGTYPE __constant_cpu_to_le32(0xC0140008)
+#define STATUS_ACPI_INVALID_OBJTYPE __constant_cpu_to_le32(0xC0140009)
+#define STATUS_ACPI_INVALID_TARGETTYPE __constant_cpu_to_le32(0xC014000A)
+#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT __constant_cpu_to_le32(0xC014000B)
+#define STATUS_ACPI_ADDRESS_NOT_MAPPED __constant_cpu_to_le32(0xC014000C)
+#define STATUS_ACPI_INVALID_EVENTTYPE __constant_cpu_to_le32(0xC014000D)
+#define STATUS_ACPI_HANDLER_COLLISION __constant_cpu_to_le32(0xC014000E)
+#define STATUS_ACPI_INVALID_DATA __constant_cpu_to_le32(0xC014000F)
+#define STATUS_ACPI_INVALID_REGION __constant_cpu_to_le32(0xC0140010)
+#define STATUS_ACPI_INVALID_ACCESS_SIZE __constant_cpu_to_le32(0xC0140011)
+#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK __constant_cpu_to_le32(0xC0140012)
+#define STATUS_ACPI_ALREADY_INITIALIZED __constant_cpu_to_le32(0xC0140013)
+#define STATUS_ACPI_NOT_INITIALIZED __constant_cpu_to_le32(0xC0140014)
+#define STATUS_ACPI_INVALID_MUTEX_LEVEL __constant_cpu_to_le32(0xC0140015)
+#define STATUS_ACPI_MUTEX_NOT_OWNED __constant_cpu_to_le32(0xC0140016)
+#define STATUS_ACPI_MUTEX_NOT_OWNER __constant_cpu_to_le32(0xC0140017)
+#define STATUS_ACPI_RS_ACCESS __constant_cpu_to_le32(0xC0140018)
+#define STATUS_ACPI_INVALID_TABLE __constant_cpu_to_le32(0xC0140019)
+#define STATUS_ACPI_REG_HANDLER_FAILED __constant_cpu_to_le32(0xC0140020)
+#define STATUS_ACPI_POWER_REQUEST_FAILED __constant_cpu_to_le32(0xC0140021)
+#define STATUS_SXS_SECTION_NOT_FOUND __constant_cpu_to_le32(0xC0150001)
+#define STATUS_SXS_CANT_GEN_ACTCTX __constant_cpu_to_le32(0xC0150002)
+#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT __constant_cpu_to_le32(0xC0150003)
+#define STATUS_SXS_ASSEMBLY_NOT_FOUND __constant_cpu_to_le32(0xC0150004)
+#define STATUS_SXS_MANIFEST_FORMAT_ERROR __constant_cpu_to_le32(0xC0150005)
+#define STATUS_SXS_MANIFEST_PARSE_ERROR __constant_cpu_to_le32(0xC0150006)
+#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED __constant_cpu_to_le32(0xC0150007)
+#define STATUS_SXS_KEY_NOT_FOUND __constant_cpu_to_le32(0xC0150008)
+#define STATUS_SXS_VERSION_CONFLICT __constant_cpu_to_le32(0xC0150009)
+#define STATUS_SXS_WRONG_SECTION_TYPE __constant_cpu_to_le32(0xC015000A)
+#define STATUS_SXS_THREAD_QUERIES_DISABLED __constant_cpu_to_le32(0xC015000B)
+#define STATUS_SXS_ASSEMBLY_MISSING __constant_cpu_to_le32(0xC015000C)
+#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET __constant_cpu_to_le32(0xC015000E)
+#define STATUS_SXS_EARLY_DEACTIVATION __constant_cpu_to_le32(0xC015000F)
+#define STATUS_SXS_INVALID_DEACTIVATION __constant_cpu_to_le32(0xC0150010)
+#define STATUS_SXS_MULTIPLE_DEACTIVATION __constant_cpu_to_le32(0xC0150011)
+#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY __constant_cpu_to_le32(0xC0150012)
+#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED __constant_cpu_to_le32(0xC0150013)
+#define STATUS_SXS_CORRUPT_ACTIVATION_STACK __constant_cpu_to_le32(0xC0150014)
+#define STATUS_SXS_CORRUPTION __constant_cpu_to_le32(0xC0150015)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE __constant_cpu_to_le32(0xC0150016)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME __constant_cpu_to_le32(0xC0150017)
+#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE __constant_cpu_to_le32(0xC0150018)
+#define STATUS_SXS_IDENTITY_PARSE_ERROR __constant_cpu_to_le32(0xC0150019)
+#define STATUS_SXS_COMPONENT_STORE_CORRUPT __constant_cpu_to_le32(0xC015001A)
+#define STATUS_SXS_FILE_HASH_MISMATCH __constant_cpu_to_le32(0xC015001B)
+#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT __constant_cpu_to_le32(0xC015001C)
+#define STATUS_SXS_IDENTITIES_DIFFERENT __constant_cpu_to_le32(0xC015001D)
+#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT __constant_cpu_to_le32(0xC015001E)
+#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY __constant_cpu_to_le32(0xC015001F)
+#define STATUS_ADVANCED_INSTALLER_FAILED __constant_cpu_to_le32(0xC0150020)
+#define STATUS_XML_ENCODING_MISMATCH __constant_cpu_to_le32(0xC0150021)
+#define STATUS_SXS_MANIFEST_TOO_BIG __constant_cpu_to_le32(0xC0150022)
+#define STATUS_SXS_SETTING_NOT_REGISTERED __constant_cpu_to_le32(0xC0150023)
+#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE __constant_cpu_to_le32(0xC0150024)
+#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED __constant_cpu_to_le32(0xC0150025)
+#define STATUS_GENERIC_COMMAND_FAILED __constant_cpu_to_le32(0xC0150026)
+#define STATUS_SXS_FILE_HASH_MISSING __constant_cpu_to_le32(0xC0150027)
+#define STATUS_TRANSACTIONAL_CONFLICT __constant_cpu_to_le32(0xC0190001)
+#define STATUS_INVALID_TRANSACTION __constant_cpu_to_le32(0xC0190002)
+#define STATUS_TRANSACTION_NOT_ACTIVE __constant_cpu_to_le32(0xC0190003)
+#define STATUS_TM_INITIALIZATION_FAILED __constant_cpu_to_le32(0xC0190004)
+#define STATUS_RM_NOT_ACTIVE __constant_cpu_to_le32(0xC0190005)
+#define STATUS_RM_METADATA_CORRUPT __constant_cpu_to_le32(0xC0190006)
+#define STATUS_TRANSACTION_NOT_JOINED __constant_cpu_to_le32(0xC0190007)
+#define STATUS_DIRECTORY_NOT_RM __constant_cpu_to_le32(0xC0190008)
+#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE __constant_cpu_to_le32(0xC019000A)
+#define STATUS_LOG_RESIZE_INVALID_SIZE __constant_cpu_to_le32(0xC019000B)
+#define STATUS_REMOTE_FILE_VERSION_MISMATCH __constant_cpu_to_le32(0xC019000C)
+#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS __constant_cpu_to_le32(0xC019000F)
+#define STATUS_TRANSACTION_PROPAGATION_FAILED __constant_cpu_to_le32(0xC0190010)
+#define STATUS_CRM_PROTOCOL_NOT_FOUND __constant_cpu_to_le32(0xC0190011)
+#define STATUS_TRANSACTION_SUPERIOR_EXISTS __constant_cpu_to_le32(0xC0190012)
+#define STATUS_TRANSACTION_REQUEST_NOT_VALID __constant_cpu_to_le32(0xC0190013)
+#define STATUS_TRANSACTION_NOT_REQUESTED __constant_cpu_to_le32(0xC0190014)
+#define STATUS_TRANSACTION_ALREADY_ABORTED __constant_cpu_to_le32(0xC0190015)
+#define STATUS_TRANSACTION_ALREADY_COMMITTED __constant_cpu_to_le32(0xC0190016)
+#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER __constant_cpu_to_le32(0xC0190017)
+#define STATUS_CURRENT_TRANSACTION_NOT_VALID __constant_cpu_to_le32(0xC0190018)
+#define STATUS_LOG_GROWTH_FAILED __constant_cpu_to_le32(0xC0190019)
+#define STATUS_OBJECT_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC0190021)
+#define STATUS_STREAM_MINIVERSION_NOT_FOUND __constant_cpu_to_le32(0xC0190022)
+#define STATUS_STREAM_MINIVERSION_NOT_VALID __constant_cpu_to_le32(0xC0190023)
+#define STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION __constant_cpu_to_le32(0xC0190024)
+#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT __constant_cpu_to_le32(0xC0190025)
+#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS __constant_cpu_to_le32(0xC0190026)
+#define STATUS_HANDLE_NO_LONGER_VALID __constant_cpu_to_le32(0xC0190028)
+#define STATUS_LOG_CORRUPTION_DETECTED __constant_cpu_to_le32(0xC0190030)
+#define STATUS_RM_DISCONNECTED __constant_cpu_to_le32(0xC0190032)
+#define STATUS_ENLISTMENT_NOT_SUPERIOR __constant_cpu_to_le32(0xC0190033)
+#define STATUS_FILE_IDENTITY_NOT_PERSISTENT __constant_cpu_to_le32(0xC0190036)
+#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY __constant_cpu_to_le32(0xC0190037)
+#define STATUS_CANT_CROSS_RM_BOUNDARY __constant_cpu_to_le32(0xC0190038)
+#define STATUS_TXF_DIR_NOT_EMPTY __constant_cpu_to_le32(0xC0190039)
+#define STATUS_INDOUBT_TRANSACTIONS_EXIST __constant_cpu_to_le32(0xC019003A)
+#define STATUS_TM_VOLATILE __constant_cpu_to_le32(0xC019003B)
+#define STATUS_ROLLBACK_TIMER_EXPIRED __constant_cpu_to_le32(0xC019003C)
+#define STATUS_TXF_ATTRIBUTE_CORRUPT __constant_cpu_to_le32(0xC019003D)
+#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC019003E)
+#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED __constant_cpu_to_le32(0xC019003F)
+#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE __constant_cpu_to_le32(0xC0190040)
+#define STATUS_TRANSACTION_REQUIRED_PROMOTION __constant_cpu_to_le32(0xC0190043)
+#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION __constant_cpu_to_le32(0xC0190044)
+#define STATUS_TRANSACTIONS_NOT_FROZEN __constant_cpu_to_le32(0xC0190045)
+#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS __constant_cpu_to_le32(0xC0190046)
+#define STATUS_NOT_SNAPSHOT_VOLUME __constant_cpu_to_le32(0xC0190047)
+#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES __constant_cpu_to_le32(0xC0190048)
+#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC0190049)
+#define STATUS_TM_IDENTITY_MISMATCH __constant_cpu_to_le32(0xC019004A)
+#define STATUS_FLOATED_SECTION __constant_cpu_to_le32(0xC019004B)
+#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK __constant_cpu_to_le32(0xC019004C)
+#define STATUS_CANNOT_ABORT_TRANSACTIONS __constant_cpu_to_le32(0xC019004D)
+#define STATUS_TRANSACTION_NOT_FOUND __constant_cpu_to_le32(0xC019004E)
+#define STATUS_RESOURCEMANAGER_NOT_FOUND __constant_cpu_to_le32(0xC019004F)
+#define STATUS_ENLISTMENT_NOT_FOUND __constant_cpu_to_le32(0xC0190050)
+#define STATUS_TRANSACTIONMANAGER_NOT_FOUND __constant_cpu_to_le32(0xC0190051)
+#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE __constant_cpu_to_le32(0xC0190052)
+#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION __constant_cpu_to_le32(0xC0190053)
+#define STATUS_TRANSACTION_NOT_ROOT __constant_cpu_to_le32(0xC0190054)
+#define STATUS_TRANSACTION_OBJECT_EXPIRED __constant_cpu_to_le32(0xC0190055)
+#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC0190056)
+#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED __constant_cpu_to_le32(0xC0190057)
+#define STATUS_TRANSACTION_RECORD_TOO_LONG __constant_cpu_to_le32(0xC0190058)
+#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION __constant_cpu_to_le32(0xC0190059)
+#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION __constant_cpu_to_le32(0xC019005A)
+#define STATUS_TRANSACTION_INTEGRITY_VIOLATED __constant_cpu_to_le32(0xC019005B)
+#define STATUS_LOG_SECTOR_INVALID __constant_cpu_to_le32(0xC01A0001)
+#define STATUS_LOG_SECTOR_PARITY_INVALID __constant_cpu_to_le32(0xC01A0002)
+#define STATUS_LOG_SECTOR_REMAPPED __constant_cpu_to_le32(0xC01A0003)
+#define STATUS_LOG_BLOCK_INCOMPLETE __constant_cpu_to_le32(0xC01A0004)
+#define STATUS_LOG_INVALID_RANGE __constant_cpu_to_le32(0xC01A0005)
+#define STATUS_LOG_BLOCKS_EXHAUSTED __constant_cpu_to_le32(0xC01A0006)
+#define STATUS_LOG_READ_CONTEXT_INVALID __constant_cpu_to_le32(0xC01A0007)
+#define STATUS_LOG_RESTART_INVALID __constant_cpu_to_le32(0xC01A0008)
+#define STATUS_LOG_BLOCK_VERSION __constant_cpu_to_le32(0xC01A0009)
+#define STATUS_LOG_BLOCK_INVALID __constant_cpu_to_le32(0xC01A000A)
+#define STATUS_LOG_READ_MODE_INVALID __constant_cpu_to_le32(0xC01A000B)
+#define STATUS_LOG_METADATA_CORRUPT __constant_cpu_to_le32(0xC01A000D)
+#define STATUS_LOG_METADATA_INVALID __constant_cpu_to_le32(0xC01A000E)
+#define STATUS_LOG_METADATA_INCONSISTENT __constant_cpu_to_le32(0xC01A000F)
+#define STATUS_LOG_RESERVATION_INVALID __constant_cpu_to_le32(0xC01A0010)
+#define STATUS_LOG_CANT_DELETE __constant_cpu_to_le32(0xC01A0011)
+#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC01A0012)
+#define STATUS_LOG_START_OF_LOG __constant_cpu_to_le32(0xC01A0013)
+#define STATUS_LOG_POLICY_ALREADY_INSTALLED __constant_cpu_to_le32(0xC01A0014)
+#define STATUS_LOG_POLICY_NOT_INSTALLED __constant_cpu_to_le32(0xC01A0015)
+#define STATUS_LOG_POLICY_INVALID __constant_cpu_to_le32(0xC01A0016)
+#define STATUS_LOG_POLICY_CONFLICT __constant_cpu_to_le32(0xC01A0017)
+#define STATUS_LOG_PINNED_ARCHIVE_TAIL __constant_cpu_to_le32(0xC01A0018)
+#define STATUS_LOG_RECORD_NONEXISTENT __constant_cpu_to_le32(0xC01A0019)
+#define STATUS_LOG_RECORDS_RESERVED_INVALID __constant_cpu_to_le32(0xC01A001A)
+#define STATUS_LOG_SPACE_RESERVED_INVALID __constant_cpu_to_le32(0xC01A001B)
+#define STATUS_LOG_TAIL_INVALID __constant_cpu_to_le32(0xC01A001C)
+#define STATUS_LOG_FULL __constant_cpu_to_le32(0xC01A001D)
+#define STATUS_LOG_MULTIPLEXED __constant_cpu_to_le32(0xC01A001E)
+#define STATUS_LOG_DEDICATED __constant_cpu_to_le32(0xC01A001F)
+#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS __constant_cpu_to_le32(0xC01A0020)
+#define STATUS_LOG_ARCHIVE_IN_PROGRESS __constant_cpu_to_le32(0xC01A0021)
+#define STATUS_LOG_EPHEMERAL __constant_cpu_to_le32(0xC01A0022)
+#define STATUS_LOG_NOT_ENOUGH_CONTAINERS __constant_cpu_to_le32(0xC01A0023)
+#define STATUS_LOG_CLIENT_ALREADY_REGISTERED __constant_cpu_to_le32(0xC01A0024)
+#define STATUS_LOG_CLIENT_NOT_REGISTERED __constant_cpu_to_le32(0xC01A0025)
+#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS __constant_cpu_to_le32(0xC01A0026)
+#define STATUS_LOG_CONTAINER_READ_FAILED __constant_cpu_to_le32(0xC01A0027)
+#define STATUS_LOG_CONTAINER_WRITE_FAILED __constant_cpu_to_le32(0xC01A0028)
+#define STATUS_LOG_CONTAINER_OPEN_FAILED __constant_cpu_to_le32(0xC01A0029)
+#define STATUS_LOG_CONTAINER_STATE_INVALID __constant_cpu_to_le32(0xC01A002A)
+#define STATUS_LOG_STATE_INVALID __constant_cpu_to_le32(0xC01A002B)
+#define STATUS_LOG_PINNED __constant_cpu_to_le32(0xC01A002C)
+#define STATUS_LOG_METADATA_FLUSH_FAILED __constant_cpu_to_le32(0xC01A002D)
+#define STATUS_LOG_INCONSISTENT_SECURITY __constant_cpu_to_le32(0xC01A002E)
+#define STATUS_LOG_APPENDED_FLUSH_FAILED __constant_cpu_to_le32(0xC01A002F)
+#define STATUS_LOG_PINNED_RESERVATION __constant_cpu_to_le32(0xC01A0030)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD __constant_cpu_to_le32(0xC01B00EA)
+#define STATUS_FLT_NO_HANDLER_DEFINED __constant_cpu_to_le32(0xC01C0001)
+#define STATUS_FLT_CONTEXT_ALREADY_DEFINED __constant_cpu_to_le32(0xC01C0002)
+#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST __constant_cpu_to_le32(0xC01C0003)
+#define STATUS_FLT_DISALLOW_FAST_IO __constant_cpu_to_le32(0xC01C0004)
+#define STATUS_FLT_INVALID_NAME_REQUEST __constant_cpu_to_le32(0xC01C0005)
+#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION __constant_cpu_to_le32(0xC01C0006)
+#define STATUS_FLT_NOT_INITIALIZED __constant_cpu_to_le32(0xC01C0007)
+#define STATUS_FLT_FILTER_NOT_READY __constant_cpu_to_le32(0xC01C0008)
+#define STATUS_FLT_POST_OPERATION_CLEANUP __constant_cpu_to_le32(0xC01C0009)
+#define STATUS_FLT_INTERNAL_ERROR __constant_cpu_to_le32(0xC01C000A)
+#define STATUS_FLT_DELETING_OBJECT __constant_cpu_to_le32(0xC01C000B)
+#define STATUS_FLT_MUST_BE_NONPAGED_POOL __constant_cpu_to_le32(0xC01C000C)
+#define STATUS_FLT_DUPLICATE_ENTRY __constant_cpu_to_le32(0xC01C000D)
+#define STATUS_FLT_CBDQ_DISABLED __constant_cpu_to_le32(0xC01C000E)
+#define STATUS_FLT_DO_NOT_ATTACH __constant_cpu_to_le32(0xC01C000F)
+#define STATUS_FLT_DO_NOT_DETACH __constant_cpu_to_le32(0xC01C0010)
+#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION __constant_cpu_to_le32(0xC01C0011)
+#define STATUS_FLT_INSTANCE_NAME_COLLISION __constant_cpu_to_le32(0xC01C0012)
+#define STATUS_FLT_FILTER_NOT_FOUND __constant_cpu_to_le32(0xC01C0013)
+#define STATUS_FLT_VOLUME_NOT_FOUND __constant_cpu_to_le32(0xC01C0014)
+#define STATUS_FLT_INSTANCE_NOT_FOUND __constant_cpu_to_le32(0xC01C0015)
+#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND __constant_cpu_to_le32(0xC01C0016)
+#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION __constant_cpu_to_le32(0xC01C0017)
+#define STATUS_FLT_NAME_CACHE_MISS __constant_cpu_to_le32(0xC01C0018)
+#define STATUS_FLT_NO_DEVICE_OBJECT __constant_cpu_to_le32(0xC01C0019)
+#define STATUS_FLT_VOLUME_ALREADY_MOUNTED __constant_cpu_to_le32(0xC01C001A)
+#define STATUS_FLT_ALREADY_ENLISTED __constant_cpu_to_le32(0xC01C001B)
+#define STATUS_FLT_CONTEXT_ALREADY_LINKED __constant_cpu_to_le32(0xC01C001C)
+#define STATUS_FLT_NO_WAITER_FOR_REPLY __constant_cpu_to_le32(0xC01C0020)
+#define STATUS_MONITOR_NO_DESCRIPTOR __constant_cpu_to_le32(0xC01D0001)
+#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT __constant_cpu_to_le32(0xC01D0002)
+#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM __constant_cpu_to_le32(0xC01D0003)
+#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK __constant_cpu_to_le32(0xC01D0004)
+#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED __constant_cpu_to_le32(0xC01D0005)
+#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK __constant_cpu_to_le32(0xC01D0006)
+#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK __constant_cpu_to_le32(0xC01D0007)
+#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA __constant_cpu_to_le32(0xC01D0008)
+#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK __constant_cpu_to_le32(0xC01D0009)
+#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER __constant_cpu_to_le32(0xC01E0000)
+#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER __constant_cpu_to_le32(0xC01E0001)
+#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER __constant_cpu_to_le32(0xC01E0002)
+#define STATUS_GRAPHICS_ADAPTER_WAS_RESET __constant_cpu_to_le32(0xC01E0003)
+#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL __constant_cpu_to_le32(0xC01E0004)
+#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED __constant_cpu_to_le32(0xC01E0005)
+#define STATUS_GRAPHICS_PRESENT_OCCLUDED __constant_cpu_to_le32(0xC01E0006)
+#define STATUS_GRAPHICS_PRESENT_DENIED __constant_cpu_to_le32(0xC01E0007)
+#define STATUS_GRAPHICS_CANNOTCOLORCONVERT __constant_cpu_to_le32(0xC01E0008)
+#define STATUS_GRAPHICS_NO_VIDEO_MEMORY __constant_cpu_to_le32(0xC01E0100)
+#define STATUS_GRAPHICS_CANT_LOCK_MEMORY __constant_cpu_to_le32(0xC01E0101)
+#define STATUS_GRAPHICS_ALLOCATION_BUSY __constant_cpu_to_le32(0xC01E0102)
+#define STATUS_GRAPHICS_TOO_MANY_REFERENCES __constant_cpu_to_le32(0xC01E0103)
+#define STATUS_GRAPHICS_TRY_AGAIN_LATER __constant_cpu_to_le32(0xC01E0104)
+#define STATUS_GRAPHICS_TRY_AGAIN_NOW __constant_cpu_to_le32(0xC01E0105)
+#define STATUS_GRAPHICS_ALLOCATION_INVALID __constant_cpu_to_le32(0xC01E0106)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE __constant_cpu_to_le32(0xC01E0107)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED __constant_cpu_to_le32(0xC01E0108)
+#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION __constant_cpu_to_le32(0xC01E0109)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE __constant_cpu_to_le32(0xC01E0110)
+#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION __constant_cpu_to_le32(0xC01E0111)
+#define STATUS_GRAPHICS_ALLOCATION_CLOSED __constant_cpu_to_le32(0xC01E0112)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE __constant_cpu_to_le32(0xC01E0113)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE __constant_cpu_to_le32(0xC01E0114)
+#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE __constant_cpu_to_le32(0xC01E0115)
+#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST __constant_cpu_to_le32(0xC01E0116)
+#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE __constant_cpu_to_le32(0xC01E0200)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E0300)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0301)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0302)
+#define STATUS_GRAPHICS_INVALID_VIDPN __constant_cpu_to_le32(0xC01E0303)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE __constant_cpu_to_le32(0xC01E0304)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET __constant_cpu_to_le32(0xC01E0305)
+#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0306)
+#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET __constant_cpu_to_le32(0xC01E0308)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET __constant_cpu_to_le32(0xC01E0309)
+#define STATUS_GRAPHICS_INVALID_FREQUENCY __constant_cpu_to_le32(0xC01E030A)
+#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION __constant_cpu_to_le32(0xC01E030B)
+#define STATUS_GRAPHICS_INVALID_TOTAL_REGION __constant_cpu_to_le32(0xC01E030C)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE __constant_cpu_to_le32(0xC01E0310)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE __constant_cpu_to_le32(0xC01E0311)
+#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET __constant_cpu_to_le32(0xC01E0312)
+#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0313)
+#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET __constant_cpu_to_le32(0xC01E0314)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET __constant_cpu_to_le32(0xC01E0315)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET __constant_cpu_to_le32(0xC01E0316)
+#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E0317)
+#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E0318)
+#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH __constant_cpu_to_le32(0xC01E0319)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E031A)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET __constant_cpu_to_le32(0xC01E031B)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE __constant_cpu_to_le32(0xC01E031C)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET __constant_cpu_to_le32(0xC01E031D)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E031F)
+#define STATUS_GRAPHICS_STALE_MODESET __constant_cpu_to_le32(0xC01E0320)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET __constant_cpu_to_le32(0xC01E0321)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE __constant_cpu_to_le32(0xC01E0322)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN __constant_cpu_to_le32(0xC01E0323)
+#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0324)
+#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION __constant_cpu_to_le32(0xC01E0325)
+#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES __constant_cpu_to_le32(0xC01E0326)
+#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0327)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE __constant_cpu_to_le32(0xC01E0328)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET __constant_cpu_to_le32(0xC01E0329)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET __constant_cpu_to_le32(0xC01E032A)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR __constant_cpu_to_le32(0xC01E032B)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET __constant_cpu_to_le32(0xC01E032C)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E032D)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E032E)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE __constant_cpu_to_le32(0xC01E032F)
+#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED __constant_cpu_to_le32(0xC01E0330)
+#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0331)
+#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0332)
+#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET __constant_cpu_to_le32(0xC01E0333)
+#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER __constant_cpu_to_le32(0xC01E0334)
+#define STATUS_GRAPHICS_NO_VIDPNMGR __constant_cpu_to_le32(0xC01E0335)
+#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN __constant_cpu_to_le32(0xC01E0336)
+#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E0337)
+#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED __constant_cpu_to_le32(0xC01E0338)
+#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0339)
+#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE __constant_cpu_to_le32(0xC01E033A)
+#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE __constant_cpu_to_le32(0xC01E033B)
+#define STATUS_GRAPHICS_INVALID_STRIDE __constant_cpu_to_le32(0xC01E033C)
+#define STATUS_GRAPHICS_INVALID_PIXELFORMAT __constant_cpu_to_le32(0xC01E033D)
+#define STATUS_GRAPHICS_INVALID_COLORBASIS __constant_cpu_to_le32(0xC01E033E)
+#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE __constant_cpu_to_le32(0xC01E033F)
+#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0340)
+#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT __constant_cpu_to_le32(0xC01E0341)
+#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE __constant_cpu_to_le32(0xC01E0342)
+#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN __constant_cpu_to_le32(0xC01E0343)
+#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL __constant_cpu_to_le32(0xC01E0344)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION __constant_cpu_to_le32(0xC01E0345)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0346)
+#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP __constant_cpu_to_le32(0xC01E0347)
+#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0348)
+#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0349)
+#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET __constant_cpu_to_le32(0xC01E034A)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON __constant_cpu_to_le32(0xC01E034D)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE __constant_cpu_to_le32(0xC01E034E)
+#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE __constant_cpu_to_le32(0xC01E034F)
+#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS __constant_cpu_to_le32(0xC01E0350)
+#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING __constant_cpu_to_le32(0xC01E0352)
+#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED __constant_cpu_to_le32(0xC01E0353)
+#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS __constant_cpu_to_le32(0xC01E0354)
+#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT __constant_cpu_to_le32(0xC01E0355)
+#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM __constant_cpu_to_le32(0xC01E0356)
+#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN __constant_cpu_to_le32(0xC01E0357)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT __constant_cpu_to_le32(0xC01E0358)
+#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED __constant_cpu_to_le32(0xC01E0359)
+#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION __constant_cpu_to_le32(0xC01E035A)
+#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE __constant_cpu_to_le32(0xC01E035B)
+#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET __constant_cpu_to_le32(0xC01E035C)
+#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED __constant_cpu_to_le32(0xC01E0400)
+#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0401)
+#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER __constant_cpu_to_le32(0xC01E0430)
+#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED __constant_cpu_to_le32(0xC01E0431)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED __constant_cpu_to_le32(0xC01E0432)
+#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY __constant_cpu_to_le32(0xC01E0433)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED __constant_cpu_to_le32(0xC01E0434)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON __constant_cpu_to_le32(0xC01E0435)
+#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE __constant_cpu_to_le32(0xC01E0436)
+#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER __constant_cpu_to_le32(0xC01E0438)
+#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED __constant_cpu_to_le32(0xC01E043B)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS __constant_cpu_to_le32(0xC01E051C)
+#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST __constant_cpu_to_le32(0xC01E051D)
+#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E051E)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS __constant_cpu_to_le32(0xC01E051F)
+#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0520)
+#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST __constant_cpu_to_le32(0xC01E0521)
+#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0500)
+#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0501)
+#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0502)
+#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS __constant_cpu_to_le32(0xC01E0503)
+#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL __constant_cpu_to_le32(0xC01E0504)
+#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST __constant_cpu_to_le32(0xC01E0505)
+#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME __constant_cpu_to_le32(0xC01E0506)
+#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP __constant_cpu_to_le32(0xC01E0507)
+#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0508)
+#define STATUS_GRAPHICS_OPM_INVALID_POINTER __constant_cpu_to_le32(0xC01E050A)
+#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E050B)
+#define STATUS_GRAPHICS_OPM_INVALID_HANDLE __constant_cpu_to_le32(0xC01E050C)
+#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE __constant_cpu_to_le32(0xC01E050D)
+#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH __constant_cpu_to_le32(0xC01E050E)
+#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED __constant_cpu_to_le32(0xC01E050F)
+#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED __constant_cpu_to_le32(0xC01E0510)
+#define STATUS_GRAPHICS_PVP_HFS_FAILED __constant_cpu_to_le32(0xC01E0511)
+#define STATUS_GRAPHICS_OPM_INVALID_SRM __constant_cpu_to_le32(0xC01E0512)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP __constant_cpu_to_le32(0xC01E0513)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP __constant_cpu_to_le32(0xC01E0514)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA __constant_cpu_to_le32(0xC01E0515)
+#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET __constant_cpu_to_le32(0xC01E0516)
+#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH __constant_cpu_to_le32(0xC01E0517)
+#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE __constant_cpu_to_le32(0xC01E0518)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC01E051A)
+#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS __constant_cpu_to_le32(0xC01E051B)
+#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0580)
+#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST __constant_cpu_to_le32(0xC01E0581)
+#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA __constant_cpu_to_le32(0xC01E0582)
+#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA __constant_cpu_to_le32(0xC01E0583)
+#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0584)
+#define STATUS_GRAPHICS_DDCCI_INVALID_DATA __constant_cpu_to_le32(0xC01E0585)
+#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE __constant_cpu_to_le32(0xC01E0586)
+#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING __constant_cpu_to_le32(0xC01E0587)
+#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E0588)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND __constant_cpu_to_le32(0xC01E0589)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH __constant_cpu_to_le32(0xC01E058A)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM __constant_cpu_to_le32(0xC01E058B)
+#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE __constant_cpu_to_le32(0xC01E058C)
+#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC01E058D)
+#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED __constant_cpu_to_le32(0xC01E05E0)
+#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME __constant_cpu_to_le32(0xC01E05E1)
+#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP __constant_cpu_to_le32(0xC01E05E2)
+#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E05E3)
+#define STATUS_GRAPHICS_INVALID_POINTER __constant_cpu_to_le32(0xC01E05E4)
+#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE __constant_cpu_to_le32(0xC01E05E5)
+#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL __constant_cpu_to_le32(0xC01E05E6)
+#define STATUS_GRAPHICS_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E05E7)
+#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS __constant_cpu_to_le32(0xC01E05E8)
+#define STATUS_FVE_LOCKED_VOLUME __constant_cpu_to_le32(0xC0210000)
+#define STATUS_FVE_NOT_ENCRYPTED __constant_cpu_to_le32(0xC0210001)
+#define STATUS_FVE_BAD_INFORMATION __constant_cpu_to_le32(0xC0210002)
+#define STATUS_FVE_TOO_SMALL __constant_cpu_to_le32(0xC0210003)
+#define STATUS_FVE_FAILED_WRONG_FS __constant_cpu_to_le32(0xC0210004)
+#define STATUS_FVE_FAILED_BAD_FS __constant_cpu_to_le32(0xC0210005)
+#define STATUS_FVE_FS_NOT_EXTENDED __constant_cpu_to_le32(0xC0210006)
+#define STATUS_FVE_FS_MOUNTED __constant_cpu_to_le32(0xC0210007)
+#define STATUS_FVE_NO_LICENSE __constant_cpu_to_le32(0xC0210008)
+#define STATUS_FVE_ACTION_NOT_ALLOWED __constant_cpu_to_le32(0xC0210009)
+#define STATUS_FVE_BAD_DATA __constant_cpu_to_le32(0xC021000A)
+#define STATUS_FVE_VOLUME_NOT_BOUND __constant_cpu_to_le32(0xC021000B)
+#define STATUS_FVE_NOT_DATA_VOLUME __constant_cpu_to_le32(0xC021000C)
+#define STATUS_FVE_CONV_READ_ERROR __constant_cpu_to_le32(0xC021000D)
+#define STATUS_FVE_CONV_WRITE_ERROR __constant_cpu_to_le32(0xC021000E)
+#define STATUS_FVE_OVERLAPPED_UPDATE __constant_cpu_to_le32(0xC021000F)
+#define STATUS_FVE_FAILED_SECTOR_SIZE __constant_cpu_to_le32(0xC0210010)
+#define STATUS_FVE_FAILED_AUTHENTICATION __constant_cpu_to_le32(0xC0210011)
+#define STATUS_FVE_NOT_OS_VOLUME __constant_cpu_to_le32(0xC0210012)
+#define STATUS_FVE_KEYFILE_NOT_FOUND __constant_cpu_to_le32(0xC0210013)
+#define STATUS_FVE_KEYFILE_INVALID __constant_cpu_to_le32(0xC0210014)
+#define STATUS_FVE_KEYFILE_NO_VMK __constant_cpu_to_le32(0xC0210015)
+#define STATUS_FVE_TPM_DISABLED __constant_cpu_to_le32(0xC0210016)
+#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO __constant_cpu_to_le32(0xC0210017)
+#define STATUS_FVE_TPM_INVALID_PCR __constant_cpu_to_le32(0xC0210018)
+#define STATUS_FVE_TPM_NO_VMK __constant_cpu_to_le32(0xC0210019)
+#define STATUS_FVE_PIN_INVALID __constant_cpu_to_le32(0xC021001A)
+#define STATUS_FVE_AUTH_INVALID_APPLICATION __constant_cpu_to_le32(0xC021001B)
+#define STATUS_FVE_AUTH_INVALID_CONFIG __constant_cpu_to_le32(0xC021001C)
+#define STATUS_FVE_DEBUGGER_ENABLED __constant_cpu_to_le32(0xC021001D)
+#define STATUS_FVE_DRY_RUN_FAILED __constant_cpu_to_le32(0xC021001E)
+#define STATUS_FVE_BAD_METADATA_POINTER __constant_cpu_to_le32(0xC021001F)
+#define STATUS_FVE_OLD_METADATA_COPY __constant_cpu_to_le32(0xC0210020)
+#define STATUS_FVE_REBOOT_REQUIRED __constant_cpu_to_le32(0xC0210021)
+#define STATUS_FVE_RAW_ACCESS __constant_cpu_to_le32(0xC0210022)
+#define STATUS_FVE_RAW_BLOCKED __constant_cpu_to_le32(0xC0210023)
+#define STATUS_FWP_CALLOUT_NOT_FOUND __constant_cpu_to_le32(0xC0220001)
+#define STATUS_FWP_CONDITION_NOT_FOUND __constant_cpu_to_le32(0xC0220002)
+#define STATUS_FWP_FILTER_NOT_FOUND __constant_cpu_to_le32(0xC0220003)
+#define STATUS_FWP_LAYER_NOT_FOUND __constant_cpu_to_le32(0xC0220004)
+#define STATUS_FWP_PROVIDER_NOT_FOUND __constant_cpu_to_le32(0xC0220005)
+#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND __constant_cpu_to_le32(0xC0220006)
+#define STATUS_FWP_SUBLAYER_NOT_FOUND __constant_cpu_to_le32(0xC0220007)
+#define STATUS_FWP_NOT_FOUND __constant_cpu_to_le32(0xC0220008)
+#define STATUS_FWP_ALREADY_EXISTS __constant_cpu_to_le32(0xC0220009)
+#define STATUS_FWP_IN_USE __constant_cpu_to_le32(0xC022000A)
+#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS __constant_cpu_to_le32(0xC022000B)
+#define STATUS_FWP_WRONG_SESSION __constant_cpu_to_le32(0xC022000C)
+#define STATUS_FWP_NO_TXN_IN_PROGRESS __constant_cpu_to_le32(0xC022000D)
+#define STATUS_FWP_TXN_IN_PROGRESS __constant_cpu_to_le32(0xC022000E)
+#define STATUS_FWP_TXN_ABORTED __constant_cpu_to_le32(0xC022000F)
+#define STATUS_FWP_SESSION_ABORTED __constant_cpu_to_le32(0xC0220010)
+#define STATUS_FWP_INCOMPATIBLE_TXN __constant_cpu_to_le32(0xC0220011)
+#define STATUS_FWP_TIMEOUT __constant_cpu_to_le32(0xC0220012)
+#define STATUS_FWP_NET_EVENTS_DISABLED __constant_cpu_to_le32(0xC0220013)
+#define STATUS_FWP_INCOMPATIBLE_LAYER __constant_cpu_to_le32(0xC0220014)
+#define STATUS_FWP_KM_CLIENTS_ONLY __constant_cpu_to_le32(0xC0220015)
+#define STATUS_FWP_LIFETIME_MISMATCH __constant_cpu_to_le32(0xC0220016)
+#define STATUS_FWP_BUILTIN_OBJECT __constant_cpu_to_le32(0xC0220017)
+#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS __constant_cpu_to_le32(0xC0220018)
+#define STATUS_FWP_TOO_MANY_CALLOUTS __constant_cpu_to_le32(0xC0220018)
+#define STATUS_FWP_NOTIFICATION_DROPPED __constant_cpu_to_le32(0xC0220019)
+#define STATUS_FWP_TRAFFIC_MISMATCH __constant_cpu_to_le32(0xC022001A)
+#define STATUS_FWP_INCOMPATIBLE_SA_STATE __constant_cpu_to_le32(0xC022001B)
+#define STATUS_FWP_NULL_POINTER __constant_cpu_to_le32(0xC022001C)
+#define STATUS_FWP_INVALID_ENUMERATOR __constant_cpu_to_le32(0xC022001D)
+#define STATUS_FWP_INVALID_FLAGS __constant_cpu_to_le32(0xC022001E)
+#define STATUS_FWP_INVALID_NET_MASK __constant_cpu_to_le32(0xC022001F)
+#define STATUS_FWP_INVALID_RANGE __constant_cpu_to_le32(0xC0220020)
+#define STATUS_FWP_INVALID_INTERVAL __constant_cpu_to_le32(0xC0220021)
+#define STATUS_FWP_ZERO_LENGTH_ARRAY __constant_cpu_to_le32(0xC0220022)
+#define STATUS_FWP_NULL_DISPLAY_NAME __constant_cpu_to_le32(0xC0220023)
+#define STATUS_FWP_INVALID_ACTION_TYPE __constant_cpu_to_le32(0xC0220024)
+#define STATUS_FWP_INVALID_WEIGHT __constant_cpu_to_le32(0xC0220025)
+#define STATUS_FWP_MATCH_TYPE_MISMATCH __constant_cpu_to_le32(0xC0220026)
+#define STATUS_FWP_TYPE_MISMATCH __constant_cpu_to_le32(0xC0220027)
+#define STATUS_FWP_OUT_OF_BOUNDS __constant_cpu_to_le32(0xC0220028)
+#define STATUS_FWP_RESERVED __constant_cpu_to_le32(0xC0220029)
+#define STATUS_FWP_DUPLICATE_CONDITION __constant_cpu_to_le32(0xC022002A)
+#define STATUS_FWP_DUPLICATE_KEYMOD __constant_cpu_to_le32(0xC022002B)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER __constant_cpu_to_le32(0xC022002C)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER __constant_cpu_to_le32(0xC022002D)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER __constant_cpu_to_le32(0xC022002E)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT __constant_cpu_to_le32(0xC022002F)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD __constant_cpu_to_le32(0xC0220030)
+#define STATUS_FWP_INCOMPATIBLE_DH_GROUP __constant_cpu_to_le32(0xC0220031)
+#define STATUS_FWP_EM_NOT_SUPPORTED __constant_cpu_to_le32(0xC0220032)
+#define STATUS_FWP_NEVER_MATCH __constant_cpu_to_le32(0xC0220033)
+#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0220034)
+#define STATUS_FWP_INVALID_PARAMETER __constant_cpu_to_le32(0xC0220035)
+#define STATUS_FWP_TOO_MANY_SUBLAYERS __constant_cpu_to_le32(0xC0220036)
+#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED __constant_cpu_to_le32(0xC0220037)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG __constant_cpu_to_le32(0xC0220038)
+#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG __constant_cpu_to_le32(0xC0220039)
+#define STATUS_FWP_TCPIP_NOT_READY __constant_cpu_to_le32(0xC0220100)
+#define STATUS_FWP_INJECT_HANDLE_CLOSING __constant_cpu_to_le32(0xC0220101)
+#define STATUS_FWP_INJECT_HANDLE_STALE __constant_cpu_to_le32(0xC0220102)
+#define STATUS_FWP_CANNOT_PEND __constant_cpu_to_le32(0xC0220103)
+#define STATUS_NDIS_CLOSING __constant_cpu_to_le32(0xC0230002)
+#define STATUS_NDIS_BAD_VERSION __constant_cpu_to_le32(0xC0230004)
+#define STATUS_NDIS_BAD_CHARACTERISTICS __constant_cpu_to_le32(0xC0230005)
+#define STATUS_NDIS_ADAPTER_NOT_FOUND __constant_cpu_to_le32(0xC0230006)
+#define STATUS_NDIS_OPEN_FAILED __constant_cpu_to_le32(0xC0230007)
+#define STATUS_NDIS_DEVICE_FAILED __constant_cpu_to_le32(0xC0230008)
+#define STATUS_NDIS_MULTICAST_FULL __constant_cpu_to_le32(0xC0230009)
+#define STATUS_NDIS_MULTICAST_EXISTS __constant_cpu_to_le32(0xC023000A)
+#define STATUS_NDIS_MULTICAST_NOT_FOUND __constant_cpu_to_le32(0xC023000B)
+#define STATUS_NDIS_REQUEST_ABORTED __constant_cpu_to_le32(0xC023000C)
+#define STATUS_NDIS_RESET_IN_PROGRESS __constant_cpu_to_le32(0xC023000D)
+#define STATUS_NDIS_INVALID_PACKET __constant_cpu_to_le32(0xC023000F)
+#define STATUS_NDIS_INVALID_DEVICE_REQUEST __constant_cpu_to_le32(0xC0230010)
+#define STATUS_NDIS_ADAPTER_NOT_READY __constant_cpu_to_le32(0xC0230011)
+#define STATUS_NDIS_INVALID_LENGTH __constant_cpu_to_le32(0xC0230014)
+#define STATUS_NDIS_INVALID_DATA __constant_cpu_to_le32(0xC0230015)
+#define STATUS_NDIS_BUFFER_TOO_SHORT __constant_cpu_to_le32(0xC0230016)
+#define STATUS_NDIS_INVALID_OID __constant_cpu_to_le32(0xC0230017)
+#define STATUS_NDIS_ADAPTER_REMOVED __constant_cpu_to_le32(0xC0230018)
+#define STATUS_NDIS_UNSUPPORTED_MEDIA __constant_cpu_to_le32(0xC0230019)
+#define STATUS_NDIS_GROUP_ADDRESS_IN_USE __constant_cpu_to_le32(0xC023001A)
+#define STATUS_NDIS_FILE_NOT_FOUND __constant_cpu_to_le32(0xC023001B)
+#define STATUS_NDIS_ERROR_READING_FILE __constant_cpu_to_le32(0xC023001C)
+#define STATUS_NDIS_ALREADY_MAPPED __constant_cpu_to_le32(0xC023001D)
+#define STATUS_NDIS_RESOURCE_CONFLICT __constant_cpu_to_le32(0xC023001E)
+#define STATUS_NDIS_MEDIA_DISCONNECTED __constant_cpu_to_le32(0xC023001F)
+#define STATUS_NDIS_INVALID_ADDRESS __constant_cpu_to_le32(0xC0230022)
+#define STATUS_NDIS_PAUSED __constant_cpu_to_le32(0xC023002A)
+#define STATUS_NDIS_INTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC023002B)
+#define STATUS_NDIS_UNSUPPORTED_REVISION __constant_cpu_to_le32(0xC023002C)
+#define STATUS_NDIS_INVALID_PORT __constant_cpu_to_le32(0xC023002D)
+#define STATUS_NDIS_INVALID_PORT_STATE __constant_cpu_to_le32(0xC023002E)
+#define STATUS_NDIS_LOW_POWER_STATE __constant_cpu_to_le32(0xC023002F)
+#define STATUS_NDIS_NOT_SUPPORTED __constant_cpu_to_le32(0xC02300BB)
+#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED __constant_cpu_to_le32(0xC0232000)
+#define STATUS_NDIS_DOT11_MEDIA_IN_USE __constant_cpu_to_le32(0xC0232001)
+#define STATUS_NDIS_DOT11_POWER_STATE_INVALID __constant_cpu_to_le32(0xC0232002)
+#define STATUS_IPSEC_BAD_SPI __constant_cpu_to_le32(0xC0360001)
+#define STATUS_IPSEC_SA_LIFETIME_EXPIRED __constant_cpu_to_le32(0xC0360002)
+#define STATUS_IPSEC_WRONG_SA __constant_cpu_to_le32(0xC0360003)
+#define STATUS_IPSEC_REPLAY_CHECK_FAILED __constant_cpu_to_le32(0xC0360004)
+#define STATUS_IPSEC_INVALID_PACKET __constant_cpu_to_le32(0xC0360005)
+#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED __constant_cpu_to_le32(0xC0360006)
+#define STATUS_IPSEC_CLEAR_TEXT_DROP __constant_cpu_to_le32(0xC0360007)
diff --git a/kernel/fs/cifs/smb2transport.c b/kernel/fs/cifs/smb2transport.c
new file mode 100644
index 000000000..d4c5b6f10
--- /dev/null
+++ b/kernel/fs/cifs/smb2transport.c
@@ -0,0 +1,612 @@
+/*
+ *   fs/cifs/smb2transport.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002, 2011
+ *                 Etersoft, 2012
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Jeremy Allison (jra@samba.org) 2006
+ *              Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <asm/processor.h>
+#include <linux/mempool.h>
+#include <linux/highmem.h>
+#include "smb2pdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "smb2proto.h"
+#include "cifs_debug.h"
+#include "smb2status.h"
+#include "smb2glob.h"
+
+static int
+smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+	int rc;
+	unsigned int size;
+
+	if (server->secmech.sdeschmacsha256 != NULL)
+		return 0; /* already allocated */
+
+	server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
+	if (IS_ERR(server->secmech.hmacsha256)) {
+		cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
+		rc = PTR_ERR(server->secmech.hmacsha256);
+		server->secmech.hmacsha256 = NULL;
+		return rc;
+	}
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.hmacsha256);
+	server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdeschmacsha256) {
+		crypto_free_shash(server->secmech.hmacsha256);
+		server->secmech.hmacsha256 = NULL;
+		return -ENOMEM;
+	}
+	server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
+	server->secmech.sdeschmacsha256->shash.flags = 0x0;
+
+	return 0;
+}
+
+static int
+smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+	unsigned int size;
+	int rc;
+
+	if (server->secmech.sdesccmacaes != NULL)
+		return 0;  /* already allocated */
+
+	rc = smb2_crypto_shash_allocate(server);
+	if (rc)
+		return rc;
+
+	server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
+	if (IS_ERR(server->secmech.cmacaes)) {
+		cifs_dbg(VFS, "could not allocate crypto cmac-aes");
+		kfree(server->secmech.sdeschmacsha256);
+		server->secmech.sdeschmacsha256 = NULL;
+		crypto_free_shash(server->secmech.hmacsha256);
+		server->secmech.hmacsha256 = NULL;
+		rc = PTR_ERR(server->secmech.cmacaes);
+		server->secmech.cmacaes = NULL;
+		return rc;
+	}
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.cmacaes);
+	server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdesccmacaes) {
+		cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
+		kfree(server->secmech.sdeschmacsha256);
+		server->secmech.sdeschmacsha256 = NULL;
+		crypto_free_shash(server->secmech.hmacsha256);
+		crypto_free_shash(server->secmech.cmacaes);
+		server->secmech.hmacsha256 = NULL;
+		server->secmech.cmacaes = NULL;
+		return -ENOMEM;
+	}
+	server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
+	server->secmech.sdesccmacaes->shash.flags = 0x0;
+
+	return 0;
+}
+
+static struct cifs_ses *
+smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server)
+{
+	struct cifs_ses *ses;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+		if (ses->Suid != smb2hdr->SessionId)
+			continue;
+		spin_unlock(&cifs_tcp_ses_lock);
+		return ses;
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	return NULL;
+}
+
+
+int
+smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+	int i, rc;
+	unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
+	unsigned char *sigptr = smb2_signature;
+	struct kvec *iov = rqst->rq_iov;
+	int n_vec = rqst->rq_nvec;
+	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+	struct cifs_ses *ses;
+
+	ses = smb2_find_smb_ses(smb2_pdu, server);
+	if (!ses) {
+		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+		return 0;
+	}
+
+	memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
+	memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+
+	rc = smb2_crypto_shash_allocate(server);
+	if (rc) {
+		cifs_dbg(VFS, "%s: shah256 alloc failed\n", __func__);
+		return rc;
+	}
+
+	rc = crypto_shash_setkey(server->secmech.hmacsha256,
+		ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+		return rc;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init sha256", __func__);
+		return rc;
+	}
+
+	for (i = 0; i < n_vec; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
+		if (iov[i].iov_base == NULL) {
+			cifs_dbg(VFS, "null iovec entry\n");
+			return -EIO;
+		}
+		/*
+		 * The first entry includes a length field (which does not get
+		 * signed that occupies the first 4 bytes before the header).
+		 */
+		if (i == 0) {
+			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
+				break; /* nothing to sign or corrupt header */
+			rc =
+			crypto_shash_update(
+				&server->secmech.sdeschmacsha256->shash,
+				iov[i].iov_base + 4, iov[i].iov_len - 4);
+		} else {
+			rc =
+			crypto_shash_update(
+				&server->secmech.sdeschmacsha256->shash,
+				iov[i].iov_base, iov[i].iov_len);
+		}
+		if (rc) {
+			cifs_dbg(VFS, "%s: Could not update with payload\n",
+				 __func__);
+			return rc;
+		}
+	}
+
+	/* now hash over the rq_pages array */
+	for (i = 0; i < rqst->rq_npages; i++) {
+		struct kvec p_iov;
+
+		cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+		crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+					p_iov.iov_base, p_iov.iov_len);
+		kunmap(rqst->rq_pages[i]);
+	}
+
+	rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
+				sigptr);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+
+	memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+
+	return rc;
+}
+
+int
+generate_smb3signingkey(struct cifs_ses *ses)
+{
+	unsigned char zero = 0x0;
+	__u8 i[4] = {0, 0, 0, 1};
+	__u8 L[4] = {0, 0, 0, 128};
+	int rc = 0;
+	unsigned char prfhash[SMB2_HMACSHA256_SIZE];
+	unsigned char *hashptr = prfhash;
+
+	memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
+	memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+
+	rc = smb3_crypto_shash_allocate(ses->server);
+	if (rc) {
+		cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_setkey(ses->server->secmech.hmacsha256,
+		ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+				i, 4);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+				"SMB2AESCMAC", 12);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+				&zero, 1);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+				"SmbSign", 8);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+				L, 4);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash,
+				hashptr);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+
+smb3signkey_ret:
+	return rc;
+}
+
+int
+smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+	int i;
+	int rc = 0;
+	unsigned char smb3_signature[SMB2_CMACAES_SIZE];
+	unsigned char *sigptr = smb3_signature;
+	struct kvec *iov = rqst->rq_iov;
+	int n_vec = rqst->rq_nvec;
+	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+	struct cifs_ses *ses;
+
+	ses = smb2_find_smb_ses(smb2_pdu, server);
+	if (!ses) {
+		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+		return 0;
+	}
+
+	memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
+	memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+
+	rc = crypto_shash_setkey(server->secmech.cmacaes,
+		ses->smb3signingkey, SMB2_CMACAES_SIZE);
+
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
+		return rc;
+	}
+
+	/*
+	 * we already allocate sdesccmacaes when we init smb3 signing key,
+	 * so unlike smb2 case we do not have to check here if secmech are
+	 * initialized
+	 */
+	rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
+		return rc;
+	}
+
+	for (i = 0; i < n_vec; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
+		if (iov[i].iov_base == NULL) {
+			cifs_dbg(VFS, "null iovec entry");
+			return -EIO;
+		}
+		/*
+		 * The first entry includes a length field (which does not get
+		 * signed that occupies the first 4 bytes before the header).
+		 */
+		if (i == 0) {
+			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
+				break; /* nothing to sign or corrupt header */
+			rc =
+			crypto_shash_update(
+				&server->secmech.sdesccmacaes->shash,
+				iov[i].iov_base + 4, iov[i].iov_len - 4);
+		} else {
+			rc =
+			crypto_shash_update(
+				&server->secmech.sdesccmacaes->shash,
+				iov[i].iov_base, iov[i].iov_len);
+		}
+		if (rc) {
+			cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
+							__func__);
+			return rc;
+		}
+	}
+
+	/* now hash over the rq_pages array */
+	for (i = 0; i < rqst->rq_npages; i++) {
+		struct kvec p_iov;
+
+		cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+		crypto_shash_update(&server->secmech.sdesccmacaes->shash,
+					p_iov.iov_base, p_iov.iov_len);
+		kunmap(rqst->rq_pages[i]);
+	}
+
+	rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
+						sigptr);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
+
+	memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+
+	return rc;
+}
+
+/* must be called with server->srv_mutex held */
+static int
+smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+	int rc = 0;
+	struct smb2_hdr *smb2_pdu = rqst->rq_iov[0].iov_base;
+
+	if (!(smb2_pdu->Flags & SMB2_FLAGS_SIGNED) ||
+	    server->tcpStatus == CifsNeedNegotiate)
+		return rc;
+
+	if (!server->session_estab) {
+		strncpy(smb2_pdu->Signature, "BSRSPYL", 8);
+		return rc;
+	}
+
+	rc = server->ops->calc_signature(rqst, server);
+
+	return rc;
+}
+
+int
+smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+	unsigned int rc;
+	char server_response_sig[16];
+	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+
+	if ((smb2_pdu->Command == SMB2_NEGOTIATE) ||
+	    (smb2_pdu->Command == SMB2_SESSION_SETUP) ||
+	    (smb2_pdu->Command == SMB2_OPLOCK_BREAK) ||
+	    (!server->session_estab))
+		return 0;
+
+	/*
+	 * BB what if signatures are supposed to be on for session but
+	 * server does not send one? BB
+	 */
+
+	/* Do not need to verify session setups with signature "BSRSPYL " */
+	if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0)
+		cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
+			 smb2_pdu->Command);
+
+	/*
+	 * Save off the origiginal signature so we can modify the smb and check
+	 * our calculated signature against what the server sent.
+	 */
+	memcpy(server_response_sig, smb2_pdu->Signature, SMB2_SIGNATURE_SIZE);
+
+	memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
+
+	mutex_lock(&server->srv_mutex);
+	rc = server->ops->calc_signature(rqst, server);
+	mutex_unlock(&server->srv_mutex);
+
+	if (rc)
+		return rc;
+
+	if (memcmp(server_response_sig, smb2_pdu->Signature,
+		   SMB2_SIGNATURE_SIZE))
+		return -EACCES;
+	else
+		return 0;
+}
+
+/*
+ * Set message id for the request. Should be called after wait_for_free_request
+ * and when srv_mutex is held.
+ */
+static inline void
+smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
+{
+	unsigned int i, num = le16_to_cpu(hdr->CreditCharge);
+
+	hdr->MessageId = get_next_mid64(server);
+	/* skip message numbers according to CreditCharge field */
+	for (i = 1; i < num; i++)
+		get_next_mid(server);
+}
+
+static struct mid_q_entry *
+smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
+		     struct TCP_Server_Info *server)
+{
+	struct mid_q_entry *temp;
+
+	if (server == NULL) {
+		cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
+		return NULL;
+	}
+
+	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
+	if (temp == NULL)
+		return temp;
+	else {
+		memset(temp, 0, sizeof(struct mid_q_entry));
+		temp->mid = le64_to_cpu(smb_buffer->MessageId);
+		temp->pid = current->pid;
+		temp->command = smb_buffer->Command;	/* Always LE */
+		temp->when_alloc = jiffies;
+		temp->server = server;
+
+		/*
+		 * The default is for the mid to be synchronous, so the
+		 * default callback just wakes up the current task.
+		 */
+		temp->callback = cifs_wake_up_task;
+		temp->callback_data = current;
+	}
+
+	atomic_inc(&midCount);
+	temp->mid_state = MID_REQUEST_ALLOCATED;
+	return temp;
+}
+
+static int
+smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
+		   struct mid_q_entry **mid)
+{
+	if (ses->server->tcpStatus == CifsExiting)
+		return -ENOENT;
+
+	if (ses->server->tcpStatus == CifsNeedReconnect) {
+		cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
+		return -EAGAIN;
+	}
+
+	if (ses->status == CifsNew) {
+		if ((buf->Command != SMB2_SESSION_SETUP) &&
+		    (buf->Command != SMB2_NEGOTIATE))
+			return -EAGAIN;
+		/* else ok - we are setting up session */
+	}
+
+	if (ses->status == CifsExiting) {
+		if (buf->Command != SMB2_LOGOFF)
+			return -EAGAIN;
+		/* else ok - we are shutting down the session */
+	}
+
+	*mid = smb2_mid_entry_alloc(buf, ses->server);
+	if (*mid == NULL)
+		return -ENOMEM;
+	spin_lock(&GlobalMid_Lock);
+	list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
+	spin_unlock(&GlobalMid_Lock);
+	return 0;
+}
+
+int
+smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+		   bool log_error)
+{
+	unsigned int len = get_rfc1002_length(mid->resp_buf);
+	struct kvec iov;
+	struct smb_rqst rqst = { .rq_iov = &iov,
+				 .rq_nvec = 1 };
+
+	iov.iov_base = (char *)mid->resp_buf;
+	iov.iov_len = get_rfc1002_length(mid->resp_buf) + 4;
+
+	dump_smb(mid->resp_buf, min_t(u32, 80, len));
+	/* convert the length into a more usable form */
+	if (len > 24 && server->sign) {
+		int rc;
+
+		rc = smb2_verify_signature(&rqst, server);
+		if (rc)
+			cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+				 rc);
+	}
+
+	return map_smb2_to_linux_error(mid->resp_buf, log_error);
+}
+
+struct mid_q_entry *
+smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
+{
+	int rc;
+	struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+	struct mid_q_entry *mid;
+
+	smb2_seq_num_into_buf(ses->server, hdr);
+
+	rc = smb2_get_mid_entry(ses, hdr, &mid);
+	if (rc)
+		return ERR_PTR(rc);
+	rc = smb2_sign_rqst(rqst, ses->server);
+	if (rc) {
+		cifs_delete_mid(mid);
+		return ERR_PTR(rc);
+	}
+	return mid;
+}
+
+struct mid_q_entry *
+smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+	int rc;
+	struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+	struct mid_q_entry *mid;
+
+	smb2_seq_num_into_buf(server, hdr);
+
+	mid = smb2_mid_entry_alloc(hdr, server);
+	if (mid == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	rc = smb2_sign_rqst(rqst, server);
+	if (rc) {
+		DeleteMidQEntry(mid);
+		return ERR_PTR(rc);
+	}
+
+	return mid;
+}
diff --git a/kernel/fs/cifs/smbencrypt.c b/kernel/fs/cifs/smbencrypt.c
new file mode 100644
index 000000000..a4232ec4f
--- /dev/null
+++ b/kernel/fs/cifs/smbencrypt.c
@@ -0,0 +1,249 @@
+/*
+   Unix SMB/Netbios implementation.
+   Version 1.9.
+   SMB parameters and setup
+   Copyright (C) Andrew Tridgell 1992-2000
+   Copyright (C) Luke Kenneth Casson Leighton 1996-2000
+   Modified by Jeremy Allison 1995.
+   Copyright (C) Andrew Bartlett <abartlet@samba.org> 2002-2003
+   Modified by Steve French (sfrench@us.ibm.com) 2002-2003
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifsproto.h"
+
+#ifndef false
+#define false 0
+#endif
+#ifndef true
+#define true 1
+#endif
+
+/* following came from the other byteorder.h to avoid include conflicts */
+#define CVAL(buf,pos) (((unsigned char *)(buf))[pos])
+#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
+#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
+
+static void
+str_to_key(unsigned char *str, unsigned char *key)
+{
+	int i;
+
+	key[0] = str[0] >> 1;
+	key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
+	key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
+	key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
+	key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
+	key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
+	key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
+	key[7] = str[6] & 0x7F;
+	for (i = 0; i < 8; i++)
+		key[i] = (key[i] << 1);
+}
+
+static int
+smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
+{
+	int rc;
+	unsigned char key2[8];
+	struct crypto_blkcipher *tfm_des;
+	struct scatterlist sgin, sgout;
+	struct blkcipher_desc desc;
+
+	str_to_key(key, key2);
+
+	tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm_des)) {
+		rc = PTR_ERR(tfm_des);
+		cifs_dbg(VFS, "could not allocate des crypto API\n");
+		goto smbhash_err;
+	}
+
+	desc.tfm = tfm_des;
+
+	crypto_blkcipher_setkey(tfm_des, key2, 8);
+
+	sg_init_one(&sgin, in, 8);
+	sg_init_one(&sgout, out, 8);
+
+	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
+	if (rc)
+		cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
+
+	crypto_free_blkcipher(tfm_des);
+smbhash_err:
+	return rc;
+}
+
+static int
+E_P16(unsigned char *p14, unsigned char *p16)
+{
+	int rc;
+	unsigned char sp8[8] =
+	    { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
+
+	rc = smbhash(p16, sp8, p14);
+	if (rc)
+		return rc;
+	rc = smbhash(p16 + 8, sp8, p14 + 7);
+	return rc;
+}
+
+static int
+E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
+{
+	int rc;
+
+	rc = smbhash(p24, c8, p21);
+	if (rc)
+		return rc;
+	rc = smbhash(p24 + 8, c8, p21 + 7);
+	if (rc)
+		return rc;
+	rc = smbhash(p24 + 16, c8, p21 + 14);
+	return rc;
+}
+
+/* produce a md4 message digest from data of length n bytes */
+int
+mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
+{
+	int rc;
+	unsigned int size;
+	struct crypto_shash *md4;
+	struct sdesc *sdescmd4;
+
+	md4 = crypto_alloc_shash("md4", 0, 0);
+	if (IS_ERR(md4)) {
+		rc = PTR_ERR(md4);
+		cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n",
+			 __func__, rc);
+		return rc;
+	}
+	size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
+	sdescmd4 = kmalloc(size, GFP_KERNEL);
+	if (!sdescmd4) {
+		rc = -ENOMEM;
+		goto mdfour_err;
+	}
+	sdescmd4->shash.tfm = md4;
+	sdescmd4->shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdescmd4->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__);
+		goto mdfour_err;
+	}
+	rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
+		goto mdfour_err;
+	}
+	rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__);
+
+mdfour_err:
+	crypto_free_shash(md4);
+	kfree(sdescmd4);
+
+	return rc;
+}
+
+/*
+   This implements the X/Open SMB password encryption
+   It takes a password, a 8 byte "crypt key" and puts 24 bytes of
+   encrypted password into p24 */
+/* Note that password must be uppercased and null terminated */
+int
+SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
+{
+	int rc;
+	unsigned char p14[14], p16[16], p21[21];
+
+	memset(p14, '\0', 14);
+	memset(p16, '\0', 16);
+	memset(p21, '\0', 21);
+
+	memcpy(p14, passwd, 14);
+	rc = E_P16(p14, p16);
+	if (rc)
+		return rc;
+
+	memcpy(p21, p16, 16);
+	rc = E_P24(p21, c8, p24);
+
+	return rc;
+}
+
+/*
+ * Creates the MD4 Hash of the users password in NT UNICODE.
+ */
+
+int
+E_md4hash(const unsigned char *passwd, unsigned char *p16,
+	const struct nls_table *codepage)
+{
+	int rc;
+	int len;
+	__le16 wpwd[129];
+
+	/* Password cannot be longer than 128 characters */
+	if (passwd) /* Password must be converted to NT unicode */
+		len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);
+	else {
+		len = 0;
+		*wpwd = 0; /* Ensure string is null terminated */
+	}
+
+	rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
+	memzero_explicit(wpwd, sizeof(wpwd));
+
+	return rc;
+}
+
+/* Does the NT MD4 hash then des encryption. */
+int
+SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
+		const struct nls_table *codepage)
+{
+	int rc;
+	unsigned char p16[16], p21[21];
+
+	memset(p16, '\0', 16);
+	memset(p21, '\0', 21);
+
+	rc = E_md4hash(passwd, p16, codepage);
+	if (rc) {
+		cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
+			 __func__, rc);
+		return rc;
+	}
+	memcpy(p21, p16, 16);
+	rc = E_P24(p21, c8, p24);
+	return rc;
+}
diff --git a/kernel/fs/cifs/smberr.h b/kernel/fs/cifs/smberr.h
new file mode 100644
index 000000000..7f16cb825
--- /dev/null
+++ b/kernel/fs/cifs/smberr.h
@@ -0,0 +1,184 @@
+/*
+ *   fs/cifs/smberr.h
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2004
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   See Error Codes section of the SNIA CIFS Specification
+ *   for more information
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define SUCCESS	0x00	/* The request was successful. */
+#define ERRDOS	0x01	/* Error is from the core DOS operating system set */
+#define ERRSRV	0x02	/* Error is generated by the file server daemon */
+#define ERRHRD	0x03	/* Error is a hardware error. */
+#define ERRCMD	0xFF	/* Command was not in the "SMB" format. */
+
+/* The following error codes may be generated with the SUCCESS error class.*/
+
+/*#define SUCCESS	0	The request was successful. */
+
+/* The following error codes may be generated with the ERRDOS error class.*/
+
+#define ERRbadfunc		1	/* Invalid function. The server did not
+					   recognize or could not perform a
+					   system call generated by the server,
+					   e.g., set the DIRECTORY attribute on
+					   a data file, invalid seek mode. */
+#define ERRbadfile		2	/* File not found. The last component
+					   of a file's pathname could not be
+					   found. */
+#define ERRbadpath		3	/* Directory invalid. A directory
+					   component in a pathname could not be
+					   found. */
+#define ERRnofids		4	/* Too many open files. The server has
+					   no file handles available. */
+#define ERRnoaccess		5	/* Access denied, the client's context
+					   does not permit the requested
+					   function. This includes the
+					   following conditions: invalid rename
+					   command, write to Fid open for read
+					   only, read on Fid open for write
+					   only, attempt to delete a non-empty
+					   directory */
+#define ERRbadfid		6	/* Invalid file handle. The file handle
+					   specified was not recognized by the
+					   server. */
+#define ERRbadmcb		7	/* Memory control blocks destroyed. */
+#define ERRnomem		8	/* Insufficient server memory to
+					   perform the requested function. */
+#define ERRbadmem		9	/* Invalid memory block address. */
+#define ERRbadenv		10	/* Invalid environment. */
+#define ERRbadformat		11	/* Invalid format. */
+#define ERRbadaccess		12	/* Invalid open mode. */
+#define ERRbaddata		13	/* Invalid data (generated only by
+					   IOCTL calls within the server). */
+#define ERRbaddrive		15	/* Invalid drive specified. */
+#define ERRremcd		16	/* A Delete Directory request attempted
+					   to remove the server's current
+					   directory. */
+#define ERRdiffdevice		17	/* Not same device (e.g., a cross
+					   volume rename was attempted */
+#define ERRnofiles		18	/* A File Search command can find no
+					   more files matching the specified
+					   criteria. */
+#define ERRwriteprot		19	/* media is write protected */
+#define ERRgeneral		31
+#define ERRbadshare		32	/* The sharing mode specified for an
+					   Open conflicts with existing FIDs on
+					   the file. */
+#define ERRlock			33	/* A Lock request conflicted with an
+					   existing lock or specified an
+					   invalid mode, or an Unlock requested
+					   attempted to remove a lock held by
+					   another process. */
+#define ERRunsup		50
+#define ERRnosuchshare		67
+#define ERRfilexists		80	/* The file named in the request
+					   already exists. */
+#define ERRinvparm		87
+#define ERRdiskfull		112
+#define ERRinvname		123
+#define ERRinvlevel		124
+#define ERRdirnotempty		145
+#define ERRnotlocked		158
+#define ERRcancelviolation	173
+#define ERRalreadyexists	183
+#define ERRbadpipe		230
+#define ERRpipebusy		231
+#define ERRpipeclosing		232
+#define ERRnotconnected		233
+#define ERRmoredata		234
+#define ERReasnotsupported	282
+#define ErrQuota		0x200	/* The operation would cause a quota
+					   limit to be exceeded. */
+#define ErrNotALink		0x201	/* A link operation was performed on a
+					   pathname that was not a link. */
+
+/* Below errors are used internally (do not come over the wire) for passthrough
+   from STATUS codes to POSIX only  */
+#define ERRsymlink              0xFFFD
+#define ErrTooManyLinks         0xFFFE
+
+/* Following error codes may be generated with the ERRSRV error class.*/
+
+#define ERRerror		1	/* Non-specific error code. It is
+					   returned under the following
+					   conditions: resource other than disk
+					   space exhausted (e.g. TIDs), first
+					   SMB command was not negotiate,
+					   multiple negotiates attempted, and
+					   internal server error. */
+#define ERRbadpw		2	/* Bad password - name/password pair in
+					   a TreeConnect or Session Setup are
+					   invalid. */
+#define ERRbadtype		3	/* used for indicating DFS referral
+					   needed */
+#define ERRaccess		4	/* The client does not have the
+					   necessary access rights within the
+					   specified context for requested
+					   function. */
+#define ERRinvtid		5	/* The Tid specified in a command was
+					   invalid. */
+#define ERRinvnetname		6	/* Invalid network name in tree
+					   connect. */
+#define ERRinvdevice		7	/* Invalid device - printer request
+					   made to non-printer connection or
+					   non-printer request made to printer
+					   connection. */
+#define ERRqfull		49	/* Print queue full (files) -- returned
+					   by open print file. */
+#define ERRqtoobig		50	/* Print queue full -- no space. */
+#define ERRqeof			51	/* EOF on print queue dump */
+#define ERRinvpfid		52	/* Invalid print file FID. */
+#define ERRsmbcmd		64	/* The server did not recognize the
+					   command received. */
+#define ERRsrverror		65	/* The server encountered an internal
+					   error, e.g., system file
+					   unavailable. */
+#define ERRbadBID		66	/* (obsolete) */
+#define ERRfilespecs		67	/* The Fid and pathname parameters
+					   contained an invalid combination of
+					   values. */
+#define ERRbadLink		68	/* (obsolete) */
+#define ERRbadpermits		69	/* The access permissions specified for
+					   a file or directory are not a valid
+					   combination. */
+#define ERRbadPID		70
+#define ERRsetattrmode		71	/* attribute (mode) is invalid */
+#define ERRpaused		81	/* Server is paused */
+#define ERRmsgoff		82	/* reserved - messaging off */
+#define ERRnoroom		83	/* reserved - no room for message */
+#define ERRrmuns		87	/* reserved - too many remote names */
+#define ERRtimeout		88	/* operation timed out */
+#define ERRnoresource		89	/* No resources available for request
+					   */
+#define ERRtoomanyuids		90	/* Too many UIDs active on this session
+					   */
+#define ERRbaduid		91	/* The UID is not known as a valid user
+					   */
+#define ERRusempx		250	/* temporarily unable to use raw */
+#define ERRusestd		251	/* temporarily unable to use either raw
+					   or mpx */
+#define ERR_NOTIFY_ENUM_DIR	1024
+#define ERRnoSuchUser		2238	/* user account does not exist */
+#define ERRaccountexpired	2239
+#define ERRbadclient		2240	/* can not logon from this client */
+#define ERRbadLogonTime		2241	/* logon hours do not allow this */
+#define ERRpasswordExpired	2242
+#define ERRnetlogonNotStarted	2455
+#define ERRnosupport		0xFFFF
diff --git a/kernel/fs/cifs/smbfsctl.h b/kernel/fs/cifs/smbfsctl.h
new file mode 100644
index 000000000..83efa5953
--- /dev/null
+++ b/kernel/fs/cifs/smbfsctl.h
@@ -0,0 +1,121 @@
+/*
+ *   fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2013
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* IOCTL information */
+/*
+ * List of ioctl/fsctl function codes that are or could be useful in the
+ * future to remote clients like cifs or SMB2/SMB3 client.  This is probably
+ * a slightly larger set of fsctls that NTFS local filesystem could handle,
+ * including the seven below that we do not have struct definitions for.
+ * Even with protocol definitions for most of these now available, we still
+ * need to do some experimentation to identify which are practical to do
+ * remotely.  Some of the following, such as the encryption/compression ones
+ * could be invoked from tools via a specialized hook into the VFS rather
+ * than via the standard vfs entry points
+ *
+ * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are
+ * below). Additional detail on less common ones can be found in MS-FSCC
+ * section 2.3.
+ */
+#define FSCTL_DFS_GET_REFERRALS      0x00060194
+#define FSCTL_DFS_GET_REFERRALS_EX   0x000601B0
+#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
+#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
+#define FSCTL_REQUEST_BATCH_OPLOCK   0x00090008
+#define FSCTL_LOCK_VOLUME            0x00090018
+#define FSCTL_UNLOCK_VOLUME          0x0009001C
+#define FSCTL_IS_PATHNAME_VALID      0x0009002C /* BB add struct */
+#define FSCTL_GET_COMPRESSION        0x0009003C /* BB add struct */
+#define FSCTL_SET_COMPRESSION        0x0009C040 /* BB add struct */
+#define FSCTL_QUERY_FAT_BPB          0x00090058 /* BB add struct */
+/* Verify the next FSCTL number, we had it as 0x00090090 before */
+#define FSCTL_FILESYSTEM_GET_STATS   0x00090060 /* BB add struct */
+#define FSCTL_GET_NTFS_VOLUME_DATA   0x00090064 /* BB add struct */
+#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
+#define FSCTL_IS_VOLUME_DIRTY        0x00090078 /* BB add struct */
+#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
+#define FSCTL_REQUEST_FILTER_OPLOCK  0x0009008C
+#define FSCTL_FIND_FILES_BY_SID      0x0009008F /* BB add struct */
+#define FSCTL_SET_OBJECT_ID          0x00090098 /* BB add struct */
+#define FSCTL_GET_OBJECT_ID          0x0009009C /* BB add struct */
+#define FSCTL_DELETE_OBJECT_ID       0x000900A0 /* BB add struct */
+#define FSCTL_SET_REPARSE_POINT      0x000900A4 /* BB add struct */
+#define FSCTL_GET_REPARSE_POINT      0x000900A8 /* BB add struct */
+#define FSCTL_DELETE_REPARSE_POINT   0x000900AC /* BB add struct */
+#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
+#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
+#define FSCTL_SET_SPARSE             0x000900C4 /* BB add struct */
+#define FSCTL_SET_ZERO_DATA          0x000980C8
+#define FSCTL_SET_ENCRYPTION         0x000900D7 /* BB add struct */
+#define FSCTL_ENCRYPTION_FSCTL_IO    0x000900DB /* BB add struct */
+#define FSCTL_WRITE_RAW_ENCRYPTED    0x000900DF /* BB add struct */
+#define FSCTL_READ_RAW_ENCRYPTED     0x000900E3 /* BB add struct */
+#define FSCTL_READ_FILE_USN_DATA     0x000900EB /* BB add struct */
+#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_SIS_COPYFILE           0x00090100 /* BB add struct */
+#define FSCTL_RECALL_FILE            0x00090117 /* BB add struct */
+#define FSCTL_QUERY_SPARING_INFO     0x00090138 /* BB add struct */
+#define FSCTL_SET_ZERO_ON_DEALLOC    0x00090194 /* BB add struct */
+#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
+#define FSCTL_SET_DEFECT_MANAGEMENT  0x00098134 /* BB add struct */
+#define FSCTL_FILE_LEVEL_TRIM        0x00098208 /* BB add struct */
+#define FSCTL_SIS_LINK_FILES         0x0009C104
+#define FSCTL_PIPE_PEEK              0x0011400C /* BB add struct */
+#define FSCTL_PIPE_TRANSCEIVE        0x0011C017 /* BB add struct */
+/* strange that the number for this op is not sequential with previous op */
+#define FSCTL_PIPE_WAIT              0x00110018 /* BB add struct */
+/* Enumerate previous versions of a file */
+#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
+/* Retrieve an opaque file reference for server-side data movement ie copy */
+#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
+#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
+#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
+/* Perform server-side data movement */
+#define FSCTL_SRV_COPYCHUNK 0x001440F2
+#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
+#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
+#define FSCTL_SRV_READ_HASH          0x001441BB /* BB add struct */
+
+/* See FSCC 2.1.2.5 */
+#define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
+#define IO_REPARSE_TAG_HSM           0xC0000004
+#define IO_REPARSE_TAG_SIS           0x80000007
+#define IO_REPARSE_TAG_HSM2          0x80000006
+#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005
+/* Used by the DFS filter. See MS-DFSC */
+#define IO_REPARSE_TAG_DFS           0x8000000A
+/* Used by the DFS filter See MS-DFSC */
+#define IO_REPARSE_TAG_DFSR          0x80000012
+#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B
+/* See section MS-FSCC 2.1.2.4 */
+#define IO_REPARSE_TAG_SYMLINK       0xA000000C
+#define IO_REPARSE_TAG_DEDUP         0x80000013
+#define IO_REPARSE_APPXSTREAM	     0xC0000014
+/* NFS symlinks, Win 8/SMB3 and later */
+#define IO_REPARSE_TAG_NFS           0x80000014
+
+/* fsctl flags */
+/* If Flags is set to this value, the request is an FSCTL not ioctl request */
+#define SMB2_0_IOCTL_IS_FSCTL		0x00000001
+
diff --git a/kernel/fs/cifs/transport.c b/kernel/fs/cifs/transport.c
new file mode 100644
index 000000000..126f46b88
--- /dev/null
+++ b/kernel/fs/cifs/transport.c
@@ -0,0 +1,1113 @@
+/*
+ *   fs/cifs/transport.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *   Jeremy Allison (jra@samba.org) 2006.
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/tcp.h>
+#include <linux/highmem.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <linux/mempool.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+
+void
+cifs_wake_up_task(struct mid_q_entry *mid)
+{
+	wake_up_process(mid->callback_data);
+}
+
+struct mid_q_entry *
+AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
+{
+	struct mid_q_entry *temp;
+
+	if (server == NULL) {
+		cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n");
+		return NULL;
+	}
+
+	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
+	if (temp == NULL)
+		return temp;
+	else {
+		memset(temp, 0, sizeof(struct mid_q_entry));
+		temp->mid = get_mid(smb_buffer);
+		temp->pid = current->pid;
+		temp->command = cpu_to_le16(smb_buffer->Command);
+		cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
+	/*	do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
+		/* when mid allocated can be before when sent */
+		temp->when_alloc = jiffies;
+		temp->server = server;
+
+		/*
+		 * The default is for the mid to be synchronous, so the
+		 * default callback just wakes up the current task.
+		 */
+		temp->callback = cifs_wake_up_task;
+		temp->callback_data = current;
+	}
+
+	atomic_inc(&midCount);
+	temp->mid_state = MID_REQUEST_ALLOCATED;
+	return temp;
+}
+
+void
+DeleteMidQEntry(struct mid_q_entry *midEntry)
+{
+#ifdef CONFIG_CIFS_STATS2
+	__le16 command = midEntry->server->vals->lock_cmd;
+	unsigned long now;
+#endif
+	midEntry->mid_state = MID_FREE;
+	atomic_dec(&midCount);
+	if (midEntry->large_buf)
+		cifs_buf_release(midEntry->resp_buf);
+	else
+		cifs_small_buf_release(midEntry->resp_buf);
+#ifdef CONFIG_CIFS_STATS2
+	now = jiffies;
+	/* commands taking longer than one second are indications that
+	   something is wrong, unless it is quite a slow link or server */
+	if ((now - midEntry->when_alloc) > HZ) {
+		if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) {
+			pr_debug(" CIFS slow rsp: cmd %d mid %llu",
+			       midEntry->command, midEntry->mid);
+			pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
+			       now - midEntry->when_alloc,
+			       now - midEntry->when_sent,
+			       now - midEntry->when_received);
+		}
+	}
+#endif
+	mempool_free(midEntry, cifs_mid_poolp);
+}
+
+void
+cifs_delete_mid(struct mid_q_entry *mid)
+{
+	spin_lock(&GlobalMid_Lock);
+	list_del(&mid->qhead);
+	spin_unlock(&GlobalMid_Lock);
+
+	DeleteMidQEntry(mid);
+}
+
+/*
+ * smb_send_kvec - send an array of kvecs to the server
+ * @server:	Server to send the data to
+ * @iov:	Pointer to array of kvecs
+ * @n_vec:	length of kvec array
+ * @sent:	amount of data sent on socket is stored here
+ *
+ * Our basic "send data to server" function. Should be called with srv_mutex
+ * held. The caller is responsible for handling the results.
+ */
+static int
+smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
+		size_t *sent)
+{
+	int rc = 0;
+	int i = 0;
+	struct msghdr smb_msg;
+	unsigned int remaining;
+	size_t first_vec = 0;
+	struct socket *ssocket = server->ssocket;
+
+	*sent = 0;
+
+	smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
+	smb_msg.msg_namelen = sizeof(struct sockaddr);
+	smb_msg.msg_control = NULL;
+	smb_msg.msg_controllen = 0;
+	if (server->noblocksnd)
+		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+	else
+		smb_msg.msg_flags = MSG_NOSIGNAL;
+
+	remaining = 0;
+	for (i = 0; i < n_vec; i++)
+		remaining += iov[i].iov_len;
+
+	i = 0;
+	while (remaining) {
+		/*
+		 * If blocking send, we try 3 times, since each can block
+		 * for 5 seconds. For nonblocking  we have to try more
+		 * but wait increasing amounts of time allowing time for
+		 * socket to clear.  The overall time we wait in either
+		 * case to send on the socket is about 15 seconds.
+		 * Similarly we wait for 15 seconds for a response from
+		 * the server in SendReceive[2] for the server to send
+		 * a response back for most types of requests (except
+		 * SMB Write past end of file which can be slow, and
+		 * blocking lock operations). NFS waits slightly longer
+		 * than CIFS, but this can make it take longer for
+		 * nonresponsive servers to be detected and 15 seconds
+		 * is more than enough time for modern networks to
+		 * send a packet.  In most cases if we fail to send
+		 * after the retries we will kill the socket and
+		 * reconnect which may clear the network problem.
+		 */
+		rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
+				    n_vec - first_vec, remaining);
+		if (rc == -EAGAIN) {
+			i++;
+			if (i >= 14 || (!server->noblocksnd && (i > 2))) {
+				cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
+					 ssocket);
+				rc = -EAGAIN;
+				break;
+			}
+			msleep(1 << i);
+			continue;
+		}
+
+		if (rc < 0)
+			break;
+
+		/* send was at least partially successful */
+		*sent += rc;
+
+		if (rc == remaining) {
+			remaining = 0;
+			break;
+		}
+
+		if (rc > remaining) {
+			cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining);
+			break;
+		}
+
+		if (rc == 0) {
+			/* should never happen, letting socket clear before
+			   retrying is our only obvious option here */
+			cifs_dbg(VFS, "tcp sent no data\n");
+			msleep(500);
+			continue;
+		}
+
+		remaining -= rc;
+
+		/* the line below resets i */
+		for (i = first_vec; i < n_vec; i++) {
+			if (iov[i].iov_len) {
+				if (rc > iov[i].iov_len) {
+					rc -= iov[i].iov_len;
+					iov[i].iov_len = 0;
+				} else {
+					iov[i].iov_base += rc;
+					iov[i].iov_len -= rc;
+					first_vec = i;
+					break;
+				}
+			}
+		}
+
+		i = 0; /* in case we get ENOSPC on the next send */
+		rc = 0;
+	}
+	return rc;
+}
+
+/**
+ * rqst_page_to_kvec - Turn a slot in the smb_rqst page array into a kvec
+ * @rqst: pointer to smb_rqst
+ * @idx: index into the array of the page
+ * @iov: pointer to struct kvec that will hold the result
+ *
+ * Helper function to convert a slot in the rqst->rq_pages array into a kvec.
+ * The page will be kmapped and the address placed into iov_base. The length
+ * will then be adjusted according to the ptailoff.
+ */
+void
+cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
+			struct kvec *iov)
+{
+	/*
+	 * FIXME: We could avoid this kmap altogether if we used
+	 * kernel_sendpage instead of kernel_sendmsg. That will only
+	 * work if signing is disabled though as sendpage inlines the
+	 * page directly into the fraglist. If userspace modifies the
+	 * page after we calculate the signature, then the server will
+	 * reject it and may break the connection. kernel_sendmsg does
+	 * an extra copy of the data and avoids that issue.
+	 */
+	iov->iov_base = kmap(rqst->rq_pages[idx]);
+
+	/* if last page, don't send beyond this offset into page */
+	if (idx == (rqst->rq_npages - 1))
+		iov->iov_len = rqst->rq_tailsz;
+	else
+		iov->iov_len = rqst->rq_pagesz;
+}
+
+static unsigned long
+rqst_len(struct smb_rqst *rqst)
+{
+	unsigned int i;
+	struct kvec *iov = rqst->rq_iov;
+	unsigned long buflen = 0;
+
+	/* total up iov array first */
+	for (i = 0; i < rqst->rq_nvec; i++)
+		buflen += iov[i].iov_len;
+
+	/* add in the page array if there is one */
+	if (rqst->rq_npages) {
+		buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
+		buflen += rqst->rq_tailsz;
+	}
+
+	return buflen;
+}
+
+static int
+smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+	int rc;
+	struct kvec *iov = rqst->rq_iov;
+	int n_vec = rqst->rq_nvec;
+	unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
+	unsigned long send_length;
+	unsigned int i;
+	size_t total_len = 0, sent;
+	struct socket *ssocket = server->ssocket;
+	int val = 1;
+
+	if (ssocket == NULL)
+		return -ENOTSOCK;
+
+	/* sanity check send length */
+	send_length = rqst_len(rqst);
+	if (send_length != smb_buf_length + 4) {
+		WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n",
+			send_length, smb_buf_length);
+		return -EIO;
+	}
+
+	cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length);
+	dump_smb(iov[0].iov_base, iov[0].iov_len);
+
+	/* cork the socket */
+	kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
+				(char *)&val, sizeof(val));
+
+	rc = smb_send_kvec(server, iov, n_vec, &sent);
+	if (rc < 0)
+		goto uncork;
+
+	total_len += sent;
+
+	/* now walk the page array and send each page in it */
+	for (i = 0; i < rqst->rq_npages; i++) {
+		struct kvec p_iov;
+
+		cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+		rc = smb_send_kvec(server, &p_iov, 1, &sent);
+		kunmap(rqst->rq_pages[i]);
+		if (rc < 0)
+			break;
+
+		total_len += sent;
+	}
+
+uncork:
+	/* uncork it */
+	val = 0;
+	kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
+				(char *)&val, sizeof(val));
+
+	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
+		cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n",
+			 smb_buf_length + 4, total_len);
+		/*
+		 * If we have only sent part of an SMB then the next SMB could
+		 * be taken as the remainder of this one. We need to kill the
+		 * socket so the server throws away the partial SMB
+		 */
+		server->tcpStatus = CifsNeedReconnect;
+	}
+
+	if (rc < 0 && rc != -EINTR)
+		cifs_dbg(VFS, "Error %d sending data on socket to server\n",
+			 rc);
+	else
+		rc = 0;
+
+	return rc;
+}
+
+static int
+smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
+{
+	struct smb_rqst rqst = { .rq_iov = iov,
+				 .rq_nvec = n_vec };
+
+	return smb_send_rqst(server, &rqst);
+}
+
+int
+smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
+	 unsigned int smb_buf_length)
+{
+	struct kvec iov;
+
+	iov.iov_base = smb_buffer;
+	iov.iov_len = smb_buf_length + 4;
+
+	return smb_sendv(server, &iov, 1);
+}
+
+static int
+wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
+		      int *credits)
+{
+	int rc;
+
+	spin_lock(&server->req_lock);
+	if (timeout == CIFS_ASYNC_OP) {
+		/* oplock breaks must not be held up */
+		server->in_flight++;
+		*credits -= 1;
+		spin_unlock(&server->req_lock);
+		return 0;
+	}
+
+	while (1) {
+		if (*credits <= 0) {
+			spin_unlock(&server->req_lock);
+			cifs_num_waiters_inc(server);
+			rc = wait_event_killable(server->request_q,
+						 has_credits(server, credits));
+			cifs_num_waiters_dec(server);
+			if (rc)
+				return rc;
+			spin_lock(&server->req_lock);
+		} else {
+			if (server->tcpStatus == CifsExiting) {
+				spin_unlock(&server->req_lock);
+				return -ENOENT;
+			}
+
+			/*
+			 * Can not count locking commands against total
+			 * as they are allowed to block on server.
+			 */
+
+			/* update # of requests on the wire to server */
+			if (timeout != CIFS_BLOCKING_OP) {
+				*credits -= 1;
+				server->in_flight++;
+			}
+			spin_unlock(&server->req_lock);
+			break;
+		}
+	}
+	return 0;
+}
+
+static int
+wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
+		      const int optype)
+{
+	int *val;
+
+	val = server->ops->get_credits_field(server, optype);
+	/* Since an echo is already inflight, no need to wait to send another */
+	if (*val <= 0 && optype == CIFS_ECHO_OP)
+		return -EAGAIN;
+	return wait_for_free_credits(server, timeout, val);
+}
+
+int
+cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
+		      unsigned int *num, unsigned int *credits)
+{
+	*num = size;
+	*credits = 0;
+	return 0;
+}
+
+static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
+			struct mid_q_entry **ppmidQ)
+{
+	if (ses->server->tcpStatus == CifsExiting) {
+		return -ENOENT;
+	}
+
+	if (ses->server->tcpStatus == CifsNeedReconnect) {
+		cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
+		return -EAGAIN;
+	}
+
+	if (ses->status == CifsNew) {
+		if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
+			(in_buf->Command != SMB_COM_NEGOTIATE))
+			return -EAGAIN;
+		/* else ok - we are setting up session */
+	}
+
+	if (ses->status == CifsExiting) {
+		/* check if SMB session is bad because we are setting it up */
+		if (in_buf->Command != SMB_COM_LOGOFF_ANDX)
+			return -EAGAIN;
+		/* else ok - we are shutting down session */
+	}
+
+	*ppmidQ = AllocMidQEntry(in_buf, ses->server);
+	if (*ppmidQ == NULL)
+		return -ENOMEM;
+	spin_lock(&GlobalMid_Lock);
+	list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
+	spin_unlock(&GlobalMid_Lock);
+	return 0;
+}
+
+static int
+wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
+{
+	int error;
+
+	error = wait_event_freezekillable_unsafe(server->response_q,
+				    midQ->mid_state != MID_REQUEST_SUBMITTED);
+	if (error < 0)
+		return -ERESTARTSYS;
+
+	return 0;
+}
+
+struct mid_q_entry *
+cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+	int rc;
+	struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
+	struct mid_q_entry *mid;
+
+	/* enable signing if server requires it */
+	if (server->sign)
+		hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+	mid = AllocMidQEntry(hdr, server);
+	if (mid == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	rc = cifs_sign_rqst(rqst, server, &mid->sequence_number);
+	if (rc) {
+		DeleteMidQEntry(mid);
+		return ERR_PTR(rc);
+	}
+
+	return mid;
+}
+
+/*
+ * Send a SMB request and set the callback function in the mid to handle
+ * the result. Caller is responsible for dealing with timeouts.
+ */
+int
+cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
+		mid_receive_t *receive, mid_callback_t *callback,
+		void *cbdata, const int flags)
+{
+	int rc, timeout, optype;
+	struct mid_q_entry *mid;
+	unsigned int credits = 0;
+
+	timeout = flags & CIFS_TIMEOUT_MASK;
+	optype = flags & CIFS_OP_MASK;
+
+	if ((flags & CIFS_HAS_CREDITS) == 0) {
+		rc = wait_for_free_request(server, timeout, optype);
+		if (rc)
+			return rc;
+		credits = 1;
+	}
+
+	mutex_lock(&server->srv_mutex);
+	mid = server->ops->setup_async_request(server, rqst);
+	if (IS_ERR(mid)) {
+		mutex_unlock(&server->srv_mutex);
+		add_credits_and_wake_if(server, credits, optype);
+		return PTR_ERR(mid);
+	}
+
+	mid->receive = receive;
+	mid->callback = callback;
+	mid->callback_data = cbdata;
+	mid->mid_state = MID_REQUEST_SUBMITTED;
+
+	/* put it on the pending_mid_q */
+	spin_lock(&GlobalMid_Lock);
+	list_add_tail(&mid->qhead, &server->pending_mid_q);
+	spin_unlock(&GlobalMid_Lock);
+
+
+	cifs_in_send_inc(server);
+	rc = smb_send_rqst(server, rqst);
+	cifs_in_send_dec(server);
+	cifs_save_when_sent(mid);
+
+	if (rc < 0)
+		server->sequence_number -= 2;
+	mutex_unlock(&server->srv_mutex);
+
+	if (rc == 0)
+		return 0;
+
+	cifs_delete_mid(mid);
+	add_credits_and_wake_if(server, credits, optype);
+	return rc;
+}
+
+/*
+ *
+ * Send an SMB Request.  No response info (other than return code)
+ * needs to be parsed.
+ *
+ * flags indicate the type of request buffer and how long to wait
+ * and whether to log NT STATUS code (error) before mapping it to POSIX error
+ *
+ */
+int
+SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
+		 char *in_buf, int flags)
+{
+	int rc;
+	struct kvec iov[1];
+	int resp_buf_type;
+
+	iov[0].iov_base = in_buf;
+	iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
+	flags |= CIFS_NO_RESP;
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
+	cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc);
+
+	return rc;
+}
+
+static int
+cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
+{
+	int rc = 0;
+
+	cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n",
+		 __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state);
+
+	spin_lock(&GlobalMid_Lock);
+	switch (mid->mid_state) {
+	case MID_RESPONSE_RECEIVED:
+		spin_unlock(&GlobalMid_Lock);
+		return rc;
+	case MID_RETRY_NEEDED:
+		rc = -EAGAIN;
+		break;
+	case MID_RESPONSE_MALFORMED:
+		rc = -EIO;
+		break;
+	case MID_SHUTDOWN:
+		rc = -EHOSTDOWN;
+		break;
+	default:
+		list_del_init(&mid->qhead);
+		cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
+			 __func__, mid->mid, mid->mid_state);
+		rc = -EIO;
+	}
+	spin_unlock(&GlobalMid_Lock);
+
+	DeleteMidQEntry(mid);
+	return rc;
+}
+
+static inline int
+send_cancel(struct TCP_Server_Info *server, void *buf, struct mid_q_entry *mid)
+{
+	return server->ops->send_cancel ?
+				server->ops->send_cancel(server, buf, mid) : 0;
+}
+
+int
+cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+		   bool log_error)
+{
+	unsigned int len = get_rfc1002_length(mid->resp_buf) + 4;
+
+	dump_smb(mid->resp_buf, min_t(u32, 92, len));
+
+	/* convert the length into a more usable form */
+	if (server->sign) {
+		struct kvec iov;
+		int rc = 0;
+		struct smb_rqst rqst = { .rq_iov = &iov,
+					 .rq_nvec = 1 };
+
+		iov.iov_base = mid->resp_buf;
+		iov.iov_len = len;
+		/* FIXME: add code to kill session */
+		rc = cifs_verify_signature(&rqst, server,
+					   mid->sequence_number);
+		if (rc)
+			cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+				 rc);
+	}
+
+	/* BB special case reconnect tid and uid here? */
+	return map_smb_to_linux_error(mid->resp_buf, log_error);
+}
+
+struct mid_q_entry *
+cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
+{
+	int rc;
+	struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
+	struct mid_q_entry *mid;
+
+	rc = allocate_mid(ses, hdr, &mid);
+	if (rc)
+		return ERR_PTR(rc);
+	rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number);
+	if (rc) {
+		cifs_delete_mid(mid);
+		return ERR_PTR(rc);
+	}
+	return mid;
+}
+
+int
+SendReceive2(const unsigned int xid, struct cifs_ses *ses,
+	     struct kvec *iov, int n_vec, int *resp_buf_type /* ret */,
+	     const int flags)
+{
+	int rc = 0;
+	int timeout, optype;
+	struct mid_q_entry *midQ;
+	char *buf = iov[0].iov_base;
+	unsigned int credits = 1;
+	struct smb_rqst rqst = { .rq_iov = iov,
+				 .rq_nvec = n_vec };
+
+	timeout = flags & CIFS_TIMEOUT_MASK;
+	optype = flags & CIFS_OP_MASK;
+
+	*resp_buf_type = CIFS_NO_BUFFER;  /* no response buf yet */
+
+	if ((ses == NULL) || (ses->server == NULL)) {
+		cifs_small_buf_release(buf);
+		cifs_dbg(VFS, "Null session\n");
+		return -EIO;
+	}
+
+	if (ses->server->tcpStatus == CifsExiting) {
+		cifs_small_buf_release(buf);
+		return -ENOENT;
+	}
+
+	/*
+	 * Ensure that we do not send more than 50 overlapping requests
+	 * to the same server. We may make this configurable later or
+	 * use ses->maxReq.
+	 */
+
+	rc = wait_for_free_request(ses->server, timeout, optype);
+	if (rc) {
+		cifs_small_buf_release(buf);
+		return rc;
+	}
+
+	/*
+	 * Make sure that we sign in the same order that we send on this socket
+	 * and avoid races inside tcp sendmsg code that could cause corruption
+	 * of smb data.
+	 */
+
+	mutex_lock(&ses->server->srv_mutex);
+
+	midQ = ses->server->ops->setup_request(ses, &rqst);
+	if (IS_ERR(midQ)) {
+		mutex_unlock(&ses->server->srv_mutex);
+		cifs_small_buf_release(buf);
+		/* Update # of requests on wire to server */
+		add_credits(ses->server, 1, optype);
+		return PTR_ERR(midQ);
+	}
+
+	midQ->mid_state = MID_REQUEST_SUBMITTED;
+	cifs_in_send_inc(ses->server);
+	rc = smb_sendv(ses->server, iov, n_vec);
+	cifs_in_send_dec(ses->server);
+	cifs_save_when_sent(midQ);
+
+	if (rc < 0)
+		ses->server->sequence_number -= 2;
+	mutex_unlock(&ses->server->srv_mutex);
+
+	if (rc < 0) {
+		cifs_small_buf_release(buf);
+		goto out;
+	}
+
+	if (timeout == CIFS_ASYNC_OP) {
+		cifs_small_buf_release(buf);
+		goto out;
+	}
+
+	rc = wait_for_response(ses->server, midQ);
+	if (rc != 0) {
+		send_cancel(ses->server, buf, midQ);
+		spin_lock(&GlobalMid_Lock);
+		if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+			midQ->callback = DeleteMidQEntry;
+			spin_unlock(&GlobalMid_Lock);
+			cifs_small_buf_release(buf);
+			add_credits(ses->server, 1, optype);
+			return rc;
+		}
+		spin_unlock(&GlobalMid_Lock);
+	}
+
+	cifs_small_buf_release(buf);
+
+	rc = cifs_sync_mid_result(midQ, ses->server);
+	if (rc != 0) {
+		add_credits(ses->server, 1, optype);
+		return rc;
+	}
+
+	if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) {
+		rc = -EIO;
+		cifs_dbg(FYI, "Bad MID state?\n");
+		goto out;
+	}
+
+	buf = (char *)midQ->resp_buf;
+	iov[0].iov_base = buf;
+	iov[0].iov_len = get_rfc1002_length(buf) + 4;
+	if (midQ->large_buf)
+		*resp_buf_type = CIFS_LARGE_BUFFER;
+	else
+		*resp_buf_type = CIFS_SMALL_BUFFER;
+
+	credits = ses->server->ops->get_credits(midQ);
+
+	rc = ses->server->ops->check_receive(midQ, ses->server,
+					     flags & CIFS_LOG_ERROR);
+
+	/* mark it so buf will not be freed by cifs_delete_mid */
+	if ((flags & CIFS_NO_RESP) == 0)
+		midQ->resp_buf = NULL;
+out:
+	cifs_delete_mid(midQ);
+	add_credits(ses->server, credits, optype);
+
+	return rc;
+}
+
+int
+SendReceive(const unsigned int xid, struct cifs_ses *ses,
+	    struct smb_hdr *in_buf, struct smb_hdr *out_buf,
+	    int *pbytes_returned, const int timeout)
+{
+	int rc = 0;
+	struct mid_q_entry *midQ;
+
+	if (ses == NULL) {
+		cifs_dbg(VFS, "Null smb session\n");
+		return -EIO;
+	}
+	if (ses->server == NULL) {
+		cifs_dbg(VFS, "Null tcp session\n");
+		return -EIO;
+	}
+
+	if (ses->server->tcpStatus == CifsExiting)
+		return -ENOENT;
+
+	/* Ensure that we do not send more than 50 overlapping requests
+	   to the same server. We may make this configurable later or
+	   use ses->maxReq */
+
+	if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
+			MAX_CIFS_HDR_SIZE - 4) {
+		cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+			 be32_to_cpu(in_buf->smb_buf_length));
+		return -EIO;
+	}
+
+	rc = wait_for_free_request(ses->server, timeout, 0);
+	if (rc)
+		return rc;
+
+	/* make sure that we sign in the same order that we send on this socket
+	   and avoid races inside tcp sendmsg code that could cause corruption
+	   of smb data */
+
+	mutex_lock(&ses->server->srv_mutex);
+
+	rc = allocate_mid(ses, in_buf, &midQ);
+	if (rc) {
+		mutex_unlock(&ses->server->srv_mutex);
+		/* Update # of requests on wire to server */
+		add_credits(ses->server, 1, 0);
+		return rc;
+	}
+
+	rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
+	if (rc) {
+		mutex_unlock(&ses->server->srv_mutex);
+		goto out;
+	}
+
+	midQ->mid_state = MID_REQUEST_SUBMITTED;
+
+	cifs_in_send_inc(ses->server);
+	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+	cifs_in_send_dec(ses->server);
+	cifs_save_when_sent(midQ);
+
+	if (rc < 0)
+		ses->server->sequence_number -= 2;
+
+	mutex_unlock(&ses->server->srv_mutex);
+
+	if (rc < 0)
+		goto out;
+
+	if (timeout == CIFS_ASYNC_OP)
+		goto out;
+
+	rc = wait_for_response(ses->server, midQ);
+	if (rc != 0) {
+		send_cancel(ses->server, in_buf, midQ);
+		spin_lock(&GlobalMid_Lock);
+		if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+			/* no longer considered to be "in-flight" */
+			midQ->callback = DeleteMidQEntry;
+			spin_unlock(&GlobalMid_Lock);
+			add_credits(ses->server, 1, 0);
+			return rc;
+		}
+		spin_unlock(&GlobalMid_Lock);
+	}
+
+	rc = cifs_sync_mid_result(midQ, ses->server);
+	if (rc != 0) {
+		add_credits(ses->server, 1, 0);
+		return rc;
+	}
+
+	if (!midQ->resp_buf || !out_buf ||
+	    midQ->mid_state != MID_RESPONSE_RECEIVED) {
+		rc = -EIO;
+		cifs_dbg(VFS, "Bad MID state?\n");
+		goto out;
+	}
+
+	*pbytes_returned = get_rfc1002_length(midQ->resp_buf);
+	memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
+	rc = cifs_check_receive(midQ, ses->server, 0);
+out:
+	cifs_delete_mid(midQ);
+	add_credits(ses->server, 1, 0);
+
+	return rc;
+}
+
+/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
+   blocking lock to return. */
+
+static int
+send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
+			struct smb_hdr *in_buf,
+			struct smb_hdr *out_buf)
+{
+	int bytes_returned;
+	struct cifs_ses *ses = tcon->ses;
+	LOCK_REQ *pSMB = (LOCK_REQ *)in_buf;
+
+	/* We just modify the current in_buf to change
+	   the type of lock from LOCKING_ANDX_SHARED_LOCK
+	   or LOCKING_ANDX_EXCLUSIVE_LOCK to
+	   LOCKING_ANDX_CANCEL_LOCK. */
+
+	pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
+	pSMB->Timeout = 0;
+	pSMB->hdr.Mid = get_next_mid(ses->server);
+
+	return SendReceive(xid, ses, in_buf, out_buf,
+			&bytes_returned, 0);
+}
+
+int
+SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
+	    struct smb_hdr *in_buf, struct smb_hdr *out_buf,
+	    int *pbytes_returned)
+{
+	int rc = 0;
+	int rstart = 0;
+	struct mid_q_entry *midQ;
+	struct cifs_ses *ses;
+
+	if (tcon == NULL || tcon->ses == NULL) {
+		cifs_dbg(VFS, "Null smb session\n");
+		return -EIO;
+	}
+	ses = tcon->ses;
+
+	if (ses->server == NULL) {
+		cifs_dbg(VFS, "Null tcp session\n");
+		return -EIO;
+	}
+
+	if (ses->server->tcpStatus == CifsExiting)
+		return -ENOENT;
+
+	/* Ensure that we do not send more than 50 overlapping requests
+	   to the same server. We may make this configurable later or
+	   use ses->maxReq */
+
+	if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
+			MAX_CIFS_HDR_SIZE - 4) {
+		cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+			 be32_to_cpu(in_buf->smb_buf_length));
+		return -EIO;
+	}
+
+	rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0);
+	if (rc)
+		return rc;
+
+	/* make sure that we sign in the same order that we send on this socket
+	   and avoid races inside tcp sendmsg code that could cause corruption
+	   of smb data */
+
+	mutex_lock(&ses->server->srv_mutex);
+
+	rc = allocate_mid(ses, in_buf, &midQ);
+	if (rc) {
+		mutex_unlock(&ses->server->srv_mutex);
+		return rc;
+	}
+
+	rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
+	if (rc) {
+		cifs_delete_mid(midQ);
+		mutex_unlock(&ses->server->srv_mutex);
+		return rc;
+	}
+
+	midQ->mid_state = MID_REQUEST_SUBMITTED;
+	cifs_in_send_inc(ses->server);
+	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+	cifs_in_send_dec(ses->server);
+	cifs_save_when_sent(midQ);
+
+	if (rc < 0)
+		ses->server->sequence_number -= 2;
+
+	mutex_unlock(&ses->server->srv_mutex);
+
+	if (rc < 0) {
+		cifs_delete_mid(midQ);
+		return rc;
+	}
+
+	/* Wait for a reply - allow signals to interrupt. */
+	rc = wait_event_interruptible(ses->server->response_q,
+		(!(midQ->mid_state == MID_REQUEST_SUBMITTED)) ||
+		((ses->server->tcpStatus != CifsGood) &&
+		 (ses->server->tcpStatus != CifsNew)));
+
+	/* Were we interrupted by a signal ? */
+	if ((rc == -ERESTARTSYS) &&
+		(midQ->mid_state == MID_REQUEST_SUBMITTED) &&
+		((ses->server->tcpStatus == CifsGood) ||
+		 (ses->server->tcpStatus == CifsNew))) {
+
+		if (in_buf->Command == SMB_COM_TRANSACTION2) {
+			/* POSIX lock. We send a NT_CANCEL SMB to cause the
+			   blocking lock to return. */
+			rc = send_cancel(ses->server, in_buf, midQ);
+			if (rc) {
+				cifs_delete_mid(midQ);
+				return rc;
+			}
+		} else {
+			/* Windows lock. We send a LOCKINGX_CANCEL_LOCK
+			   to cause the blocking lock to return. */
+
+			rc = send_lock_cancel(xid, tcon, in_buf, out_buf);
+
+			/* If we get -ENOLCK back the lock may have
+			   already been removed. Don't exit in this case. */
+			if (rc && rc != -ENOLCK) {
+				cifs_delete_mid(midQ);
+				return rc;
+			}
+		}
+
+		rc = wait_for_response(ses->server, midQ);
+		if (rc) {
+			send_cancel(ses->server, in_buf, midQ);
+			spin_lock(&GlobalMid_Lock);
+			if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+				/* no longer considered to be "in-flight" */
+				midQ->callback = DeleteMidQEntry;
+				spin_unlock(&GlobalMid_Lock);
+				return rc;
+			}
+			spin_unlock(&GlobalMid_Lock);
+		}
+
+		/* We got the response - restart system call. */
+		rstart = 1;
+	}
+
+	rc = cifs_sync_mid_result(midQ, ses->server);
+	if (rc != 0)
+		return rc;
+
+	/* rcvd frame is ok */
+	if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
+		rc = -EIO;
+		cifs_dbg(VFS, "Bad MID state?\n");
+		goto out;
+	}
+
+	*pbytes_returned = get_rfc1002_length(midQ->resp_buf);
+	memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
+	rc = cifs_check_receive(midQ, ses->server, 0);
+out:
+	cifs_delete_mid(midQ);
+	if (rstart && rc == -EACCES)
+		return -ERESTARTSYS;
+	return rc;
+}
diff --git a/kernel/fs/cifs/winucase.c b/kernel/fs/cifs/winucase.c
new file mode 100644
index 000000000..1506d4fdd
--- /dev/null
+++ b/kernel/fs/cifs/winucase.c
@@ -0,0 +1,663 @@
+/*
+ * fs/cifs/winucase.c
+ *
+ * Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * The const tables in this file were converted from the following info
+ * provided by Microsoft:
+ *
+ * 3.1.5.3 Mapping UTF-16 Strings to Upper Case:
+ *
+ * http://msdn.microsoft.com/en-us/library/hh877830.aspx
+ * http://www.microsoft.com/en-us/download/details.aspx?displaylang=en&id=10921
+ *
+ * In particular, the table in "Windows 8 Upper Case Mapping Table.txt" was
+ * post-processed using the winucase_convert.pl script.
+ */
+
+#include <linux/nls.h>
+
+wchar_t cifs_toupper(wchar_t in);  /* quiet sparse */
+
+static const wchar_t t2_00[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
+	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
+	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0000,
+	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178,
+};
+
+static const wchar_t t2_01[256] = {
+	0x0000, 0x0100, 0x0000, 0x0102, 0x0000, 0x0104, 0x0000, 0x0106,
+	0x0000, 0x0108, 0x0000, 0x010a, 0x0000, 0x010c, 0x0000, 0x010e,
+	0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0000, 0x0116,
+	0x0000, 0x0118, 0x0000, 0x011a, 0x0000, 0x011c, 0x0000, 0x011e,
+	0x0000, 0x0120, 0x0000, 0x0122, 0x0000, 0x0124, 0x0000, 0x0126,
+	0x0000, 0x0128, 0x0000, 0x012a, 0x0000, 0x012c, 0x0000, 0x012e,
+	0x0000, 0x0000, 0x0000, 0x0132, 0x0000, 0x0134, 0x0000, 0x0136,
+	0x0000, 0x0000, 0x0139, 0x0000, 0x013b, 0x0000, 0x013d, 0x0000,
+	0x013f, 0x0000, 0x0141, 0x0000, 0x0143, 0x0000, 0x0145, 0x0000,
+	0x0147, 0x0000, 0x0000, 0x014a, 0x0000, 0x014c, 0x0000, 0x014e,
+	0x0000, 0x0150, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156,
+	0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x0000, 0x015e,
+	0x0000, 0x0160, 0x0000, 0x0162, 0x0000, 0x0164, 0x0000, 0x0166,
+	0x0000, 0x0168, 0x0000, 0x016a, 0x0000, 0x016c, 0x0000, 0x016e,
+	0x0000, 0x0170, 0x0000, 0x0172, 0x0000, 0x0174, 0x0000, 0x0176,
+	0x0000, 0x0000, 0x0179, 0x0000, 0x017b, 0x0000, 0x017d, 0x0000,
+	0x0243, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0000, 0x0000,
+	0x0187, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x01f6, 0x0000, 0x0000,
+	0x0000, 0x0198, 0x023d, 0x0000, 0x0000, 0x0000, 0x0220, 0x0000,
+	0x0000, 0x01a0, 0x0000, 0x01a2, 0x0000, 0x01a4, 0x0000, 0x0000,
+	0x01a7, 0x0000, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0000,
+	0x01af, 0x0000, 0x0000, 0x0000, 0x01b3, 0x0000, 0x01b5, 0x0000,
+	0x0000, 0x01b8, 0x0000, 0x0000, 0x0000, 0x01bc, 0x0000, 0x01f7,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01c4, 0x0000,
+	0x0000, 0x01c7, 0x0000, 0x0000, 0x01ca, 0x0000, 0x01cd, 0x0000,
+	0x01cf, 0x0000, 0x01d1, 0x0000, 0x01d3, 0x0000, 0x01d5, 0x0000,
+	0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x018e, 0x0000, 0x01de,
+	0x0000, 0x01e0, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6,
+	0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee,
+	0x0000, 0x0000, 0x0000, 0x01f1, 0x0000, 0x01f4, 0x0000, 0x0000,
+	0x0000, 0x01f8, 0x0000, 0x01fa, 0x0000, 0x01fc, 0x0000, 0x01fe,
+};
+
+static const wchar_t t2_02[256] = {
+	0x0000, 0x0200, 0x0000, 0x0202, 0x0000, 0x0204, 0x0000, 0x0206,
+	0x0000, 0x0208, 0x0000, 0x020a, 0x0000, 0x020c, 0x0000, 0x020e,
+	0x0000, 0x0210, 0x0000, 0x0212, 0x0000, 0x0214, 0x0000, 0x0216,
+	0x0000, 0x0218, 0x0000, 0x021a, 0x0000, 0x021c, 0x0000, 0x021e,
+	0x0000, 0x0000, 0x0000, 0x0222, 0x0000, 0x0224, 0x0000, 0x0226,
+	0x0000, 0x0228, 0x0000, 0x022a, 0x0000, 0x022c, 0x0000, 0x022e,
+	0x0000, 0x0230, 0x0000, 0x0232, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x023b, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0241, 0x0000, 0x0000, 0x0000, 0x0000, 0x0246,
+	0x0000, 0x0248, 0x0000, 0x024a, 0x0000, 0x024c, 0x0000, 0x024e,
+	0x2c6f, 0x2c6d, 0x0000, 0x0181, 0x0186, 0x0000, 0x0189, 0x018a,
+	0x0000, 0x018f, 0x0000, 0x0190, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0193, 0x0000, 0x0000, 0x0194, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0197, 0x0196, 0x0000, 0x2c62, 0x0000, 0x0000, 0x0000, 0x019c,
+	0x0000, 0x2c6e, 0x019d, 0x0000, 0x0000, 0x019f, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c64, 0x0000, 0x0000,
+	0x01a6, 0x0000, 0x0000, 0x01a9, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x01b7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_03[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0370, 0x0000, 0x0372, 0x0000, 0x0000, 0x0000, 0x0376,
+	0x0000, 0x0000, 0x0000, 0x03fd, 0x03fe, 0x03ff, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0388, 0x0389, 0x038a,
+	0x0000, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
+	0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
+	0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
+	0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x03cf,
+	0x0000, 0x03d8, 0x0000, 0x03da, 0x0000, 0x03dc, 0x0000, 0x03de,
+	0x0000, 0x03e0, 0x0000, 0x03e2, 0x0000, 0x03e4, 0x0000, 0x03e6,
+	0x0000, 0x03e8, 0x0000, 0x03ea, 0x0000, 0x03ec, 0x0000, 0x03ee,
+	0x0000, 0x0000, 0x03f9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x03f7, 0x0000, 0x0000, 0x03fa, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_04[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+	0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
+	0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+	0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
+	0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
+	0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f,
+	0x0000, 0x0460, 0x0000, 0x0462, 0x0000, 0x0464, 0x0000, 0x0466,
+	0x0000, 0x0468, 0x0000, 0x046a, 0x0000, 0x046c, 0x0000, 0x046e,
+	0x0000, 0x0470, 0x0000, 0x0472, 0x0000, 0x0474, 0x0000, 0x0476,
+	0x0000, 0x0478, 0x0000, 0x047a, 0x0000, 0x047c, 0x0000, 0x047e,
+	0x0000, 0x0480, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x048a, 0x0000, 0x048c, 0x0000, 0x048e,
+	0x0000, 0x0490, 0x0000, 0x0492, 0x0000, 0x0494, 0x0000, 0x0496,
+	0x0000, 0x0498, 0x0000, 0x049a, 0x0000, 0x049c, 0x0000, 0x049e,
+	0x0000, 0x04a0, 0x0000, 0x04a2, 0x0000, 0x04a4, 0x0000, 0x04a6,
+	0x0000, 0x04a8, 0x0000, 0x04aa, 0x0000, 0x04ac, 0x0000, 0x04ae,
+	0x0000, 0x04b0, 0x0000, 0x04b2, 0x0000, 0x04b4, 0x0000, 0x04b6,
+	0x0000, 0x04b8, 0x0000, 0x04ba, 0x0000, 0x04bc, 0x0000, 0x04be,
+	0x0000, 0x0000, 0x04c1, 0x0000, 0x04c3, 0x0000, 0x04c5, 0x0000,
+	0x04c7, 0x0000, 0x04c9, 0x0000, 0x04cb, 0x0000, 0x04cd, 0x04c0,
+	0x0000, 0x04d0, 0x0000, 0x04d2, 0x0000, 0x04d4, 0x0000, 0x04d6,
+	0x0000, 0x04d8, 0x0000, 0x04da, 0x0000, 0x04dc, 0x0000, 0x04de,
+	0x0000, 0x04e0, 0x0000, 0x04e2, 0x0000, 0x04e4, 0x0000, 0x04e6,
+	0x0000, 0x04e8, 0x0000, 0x04ea, 0x0000, 0x04ec, 0x0000, 0x04ee,
+	0x0000, 0x04f0, 0x0000, 0x04f2, 0x0000, 0x04f4, 0x0000, 0x04f6,
+	0x0000, 0x04f8, 0x0000, 0x04fa, 0x0000, 0x04fc, 0x0000, 0x04fe,
+};
+
+static const wchar_t t2_05[256] = {
+	0x0000, 0x0500, 0x0000, 0x0502, 0x0000, 0x0504, 0x0000, 0x0506,
+	0x0000, 0x0508, 0x0000, 0x050a, 0x0000, 0x050c, 0x0000, 0x050e,
+	0x0000, 0x0510, 0x0000, 0x0512, 0x0000, 0x0514, 0x0000, 0x0516,
+	0x0000, 0x0518, 0x0000, 0x051a, 0x0000, 0x051c, 0x0000, 0x051e,
+	0x0000, 0x0520, 0x0000, 0x0522, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537,
+	0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f,
+	0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547,
+	0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f,
+	0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_1d[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0xa77d, 0x0000, 0x0000, 0x0000, 0x2c63, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_1e[256] = {
+	0x0000, 0x1e00, 0x0000, 0x1e02, 0x0000, 0x1e04, 0x0000, 0x1e06,
+	0x0000, 0x1e08, 0x0000, 0x1e0a, 0x0000, 0x1e0c, 0x0000, 0x1e0e,
+	0x0000, 0x1e10, 0x0000, 0x1e12, 0x0000, 0x1e14, 0x0000, 0x1e16,
+	0x0000, 0x1e18, 0x0000, 0x1e1a, 0x0000, 0x1e1c, 0x0000, 0x1e1e,
+	0x0000, 0x1e20, 0x0000, 0x1e22, 0x0000, 0x1e24, 0x0000, 0x1e26,
+	0x0000, 0x1e28, 0x0000, 0x1e2a, 0x0000, 0x1e2c, 0x0000, 0x1e2e,
+	0x0000, 0x1e30, 0x0000, 0x1e32, 0x0000, 0x1e34, 0x0000, 0x1e36,
+	0x0000, 0x1e38, 0x0000, 0x1e3a, 0x0000, 0x1e3c, 0x0000, 0x1e3e,
+	0x0000, 0x1e40, 0x0000, 0x1e42, 0x0000, 0x1e44, 0x0000, 0x1e46,
+	0x0000, 0x1e48, 0x0000, 0x1e4a, 0x0000, 0x1e4c, 0x0000, 0x1e4e,
+	0x0000, 0x1e50, 0x0000, 0x1e52, 0x0000, 0x1e54, 0x0000, 0x1e56,
+	0x0000, 0x1e58, 0x0000, 0x1e5a, 0x0000, 0x1e5c, 0x0000, 0x1e5e,
+	0x0000, 0x1e60, 0x0000, 0x1e62, 0x0000, 0x1e64, 0x0000, 0x1e66,
+	0x0000, 0x1e68, 0x0000, 0x1e6a, 0x0000, 0x1e6c, 0x0000, 0x1e6e,
+	0x0000, 0x1e70, 0x0000, 0x1e72, 0x0000, 0x1e74, 0x0000, 0x1e76,
+	0x0000, 0x1e78, 0x0000, 0x1e7a, 0x0000, 0x1e7c, 0x0000, 0x1e7e,
+	0x0000, 0x1e80, 0x0000, 0x1e82, 0x0000, 0x1e84, 0x0000, 0x1e86,
+	0x0000, 0x1e88, 0x0000, 0x1e8a, 0x0000, 0x1e8c, 0x0000, 0x1e8e,
+	0x0000, 0x1e90, 0x0000, 0x1e92, 0x0000, 0x1e94, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x1ea0, 0x0000, 0x1ea2, 0x0000, 0x1ea4, 0x0000, 0x1ea6,
+	0x0000, 0x1ea8, 0x0000, 0x1eaa, 0x0000, 0x1eac, 0x0000, 0x1eae,
+	0x0000, 0x1eb0, 0x0000, 0x1eb2, 0x0000, 0x1eb4, 0x0000, 0x1eb6,
+	0x0000, 0x1eb8, 0x0000, 0x1eba, 0x0000, 0x1ebc, 0x0000, 0x1ebe,
+	0x0000, 0x1ec0, 0x0000, 0x1ec2, 0x0000, 0x1ec4, 0x0000, 0x1ec6,
+	0x0000, 0x1ec8, 0x0000, 0x1eca, 0x0000, 0x1ecc, 0x0000, 0x1ece,
+	0x0000, 0x1ed0, 0x0000, 0x1ed2, 0x0000, 0x1ed4, 0x0000, 0x1ed6,
+	0x0000, 0x1ed8, 0x0000, 0x1eda, 0x0000, 0x1edc, 0x0000, 0x1ede,
+	0x0000, 0x1ee0, 0x0000, 0x1ee2, 0x0000, 0x1ee4, 0x0000, 0x1ee6,
+	0x0000, 0x1ee8, 0x0000, 0x1eea, 0x0000, 0x1eec, 0x0000, 0x1eee,
+	0x0000, 0x1ef0, 0x0000, 0x1ef2, 0x0000, 0x1ef4, 0x0000, 0x1ef6,
+	0x0000, 0x1ef8, 0x0000, 0x1efa, 0x0000, 0x1efc, 0x0000, 0x1efe,
+};
+
+static const wchar_t t2_1f[256] = {
+	0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x1f59, 0x0000, 0x1f5b, 0x0000, 0x1f5d, 0x0000, 0x1f5f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fda, 0x1fdb,
+	0x1ff8, 0x1ff9, 0x1fea, 0x1feb, 0x1ffa, 0x1ffb, 0x0000, 0x0000,
+	0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fb8, 0x1fb9, 0x0000, 0x1fbc, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x1fcc, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fd8, 0x1fd9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fe8, 0x1fe9, 0x0000, 0x0000, 0x0000, 0x1fec, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x1ffc, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_21[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2132, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167,
+	0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x2183, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_24[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, 0x24bb, 0x24bc, 0x24bd,
+	0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, 0x24c3, 0x24c4, 0x24c5,
+	0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, 0x24cb, 0x24cc, 0x24cd,
+	0x24ce, 0x24cf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_2c[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x2c00, 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07,
+	0x2c08, 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f,
+	0x2c10, 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17,
+	0x2c18, 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f,
+	0x2c20, 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27,
+	0x2c28, 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x0000,
+	0x0000, 0x2c60, 0x0000, 0x0000, 0x0000, 0x023a, 0x023e, 0x0000,
+	0x2c67, 0x0000, 0x2c69, 0x0000, 0x2c6b, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x2c72, 0x0000, 0x0000, 0x2c75, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x2c80, 0x0000, 0x2c82, 0x0000, 0x2c84, 0x0000, 0x2c86,
+	0x0000, 0x2c88, 0x0000, 0x2c8a, 0x0000, 0x2c8c, 0x0000, 0x2c8e,
+	0x0000, 0x2c90, 0x0000, 0x2c92, 0x0000, 0x2c94, 0x0000, 0x2c96,
+	0x0000, 0x2c98, 0x0000, 0x2c9a, 0x0000, 0x2c9c, 0x0000, 0x2c9e,
+	0x0000, 0x2ca0, 0x0000, 0x2ca2, 0x0000, 0x2ca4, 0x0000, 0x2ca6,
+	0x0000, 0x2ca8, 0x0000, 0x2caa, 0x0000, 0x2cac, 0x0000, 0x2cae,
+	0x0000, 0x2cb0, 0x0000, 0x2cb2, 0x0000, 0x2cb4, 0x0000, 0x2cb6,
+	0x0000, 0x2cb8, 0x0000, 0x2cba, 0x0000, 0x2cbc, 0x0000, 0x2cbe,
+	0x0000, 0x2cc0, 0x0000, 0x2cc2, 0x0000, 0x2cc4, 0x0000, 0x2cc6,
+	0x0000, 0x2cc8, 0x0000, 0x2cca, 0x0000, 0x2ccc, 0x0000, 0x2cce,
+	0x0000, 0x2cd0, 0x0000, 0x2cd2, 0x0000, 0x2cd4, 0x0000, 0x2cd6,
+	0x0000, 0x2cd8, 0x0000, 0x2cda, 0x0000, 0x2cdc, 0x0000, 0x2cde,
+	0x0000, 0x2ce0, 0x0000, 0x2ce2, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_2d[256] = {
+	0x10a0, 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7,
+	0x10a8, 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af,
+	0x10b0, 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7,
+	0x10b8, 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf,
+	0x10c0, 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_a6[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0xa640, 0x0000, 0xa642, 0x0000, 0xa644, 0x0000, 0xa646,
+	0x0000, 0xa648, 0x0000, 0xa64a, 0x0000, 0xa64c, 0x0000, 0xa64e,
+	0x0000, 0xa650, 0x0000, 0xa652, 0x0000, 0xa654, 0x0000, 0xa656,
+	0x0000, 0xa658, 0x0000, 0xa65a, 0x0000, 0xa65c, 0x0000, 0xa65e,
+	0x0000, 0x0000, 0x0000, 0xa662, 0x0000, 0xa664, 0x0000, 0xa666,
+	0x0000, 0xa668, 0x0000, 0xa66a, 0x0000, 0xa66c, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0xa680, 0x0000, 0xa682, 0x0000, 0xa684, 0x0000, 0xa686,
+	0x0000, 0xa688, 0x0000, 0xa68a, 0x0000, 0xa68c, 0x0000, 0xa68e,
+	0x0000, 0xa690, 0x0000, 0xa692, 0x0000, 0xa694, 0x0000, 0xa696,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_a7[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0xa722, 0x0000, 0xa724, 0x0000, 0xa726,
+	0x0000, 0xa728, 0x0000, 0xa72a, 0x0000, 0xa72c, 0x0000, 0xa72e,
+	0x0000, 0x0000, 0x0000, 0xa732, 0x0000, 0xa734, 0x0000, 0xa736,
+	0x0000, 0xa738, 0x0000, 0xa73a, 0x0000, 0xa73c, 0x0000, 0xa73e,
+	0x0000, 0xa740, 0x0000, 0xa742, 0x0000, 0xa744, 0x0000, 0xa746,
+	0x0000, 0xa748, 0x0000, 0xa74a, 0x0000, 0xa74c, 0x0000, 0xa74e,
+	0x0000, 0xa750, 0x0000, 0xa752, 0x0000, 0xa754, 0x0000, 0xa756,
+	0x0000, 0xa758, 0x0000, 0xa75a, 0x0000, 0xa75c, 0x0000, 0xa75e,
+	0x0000, 0xa760, 0x0000, 0xa762, 0x0000, 0xa764, 0x0000, 0xa766,
+	0x0000, 0xa768, 0x0000, 0xa76a, 0x0000, 0xa76c, 0x0000, 0xa76e,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0xa779, 0x0000, 0xa77b, 0x0000, 0x0000, 0xa77e,
+	0x0000, 0xa780, 0x0000, 0xa782, 0x0000, 0xa784, 0x0000, 0xa786,
+	0x0000, 0x0000, 0x0000, 0x0000, 0xa78b, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_ff[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27,
+	0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f,
+	0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37,
+	0xff38, 0xff39, 0xff3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t *const toplevel[256] = {
+	t2_00, t2_01, t2_02, t2_03, t2_04, t2_05,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL, t2_1d, t2_1e, t2_1f,
+	NULL, t2_21,  NULL,  NULL, t2_24,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL, t2_2c, t2_2d,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL, t2_a6, t2_a7,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL, t2_ff,
+};
+
+/**
+ * cifs_toupper - convert a wchar_t from lower to uppercase
+ * @in: character to convert from lower to uppercase
+ *
+ * This function consults the static tables above to convert a wchar_t from
+ * lower to uppercase. In the event that there is no mapping, the original
+ * "in" character is returned.
+ */
+wchar_t
+cifs_toupper(wchar_t in)
+{
+	unsigned char idx;
+	const wchar_t *tbl;
+	wchar_t out;
+
+	/* grab upper byte */
+	idx = (in & 0xff00) >> 8;
+
+	/* find pointer to 2nd layer table */
+	tbl = toplevel[idx];
+	if (!tbl)
+		return in;
+
+	/* grab lower byte */
+	idx = in & 0xff;
+
+	/* look up character in table */
+	out = tbl[idx];
+	if (out)
+		return out;
+
+	return in;
+}
diff --git a/kernel/fs/cifs/xattr.c b/kernel/fs/cifs/xattr.c
new file mode 100644
index 000000000..ff9e1f8b1
--- /dev/null
+++ b/kernel/fs/cifs/xattr.c
@@ -0,0 +1,424 @@
+/*
+ *   fs/cifs/xattr.c
+ *
+ *   Copyright (c) International Business Machines  Corp., 2003, 2007
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+
+#define MAX_EA_VALUE_SIZE 65535
+#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
+#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
+
+/* BB need to add server (Samba e.g) support for security and trusted prefix */
+
+int cifs_removexattr(struct dentry *direntry, const char *ea_name)
+{
+	int rc = -EOPNOTSUPP;
+#ifdef CONFIG_CIFS_XATTR
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *pTcon;
+	struct super_block *sb;
+	char *full_path = NULL;
+
+	if (direntry == NULL)
+		return -EIO;
+	if (d_really_is_negative(direntry))
+		return -EIO;
+	sb = d_inode(direntry)->i_sb;
+	if (sb == NULL)
+		return -EIO;
+
+	cifs_sb = CIFS_SB(sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = get_xid();
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto remove_ea_exit;
+	}
+	if (ea_name == NULL) {
+		cifs_dbg(FYI, "Null xattr names not supported\n");
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		&& (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
+		cifs_dbg(FYI,
+			 "illegal xattr request %s (only user namespace supported)\n",
+			 ea_name);
+		/* BB what if no namespace prefix? */
+		/* Should we just pass them to server, except for
+		system and perhaps security prefixes? */
+	} else {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+			goto remove_ea_exit;
+
+		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
+		if (pTcon->ses->server->ops->set_EA)
+			rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+				full_path, ea_name, NULL, (__u16)0,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+	}
+remove_ea_exit:
+	kfree(full_path);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+#endif
+	return rc;
+}
+
+int cifs_setxattr(struct dentry *direntry, const char *ea_name,
+		  const void *ea_value, size_t value_size, int flags)
+{
+	int rc = -EOPNOTSUPP;
+#ifdef CONFIG_CIFS_XATTR
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *pTcon;
+	struct super_block *sb;
+	char *full_path;
+
+	if (direntry == NULL)
+		return -EIO;
+	if (d_really_is_negative(direntry))
+		return -EIO;
+	sb = d_inode(direntry)->i_sb;
+	if (sb == NULL)
+		return -EIO;
+
+	cifs_sb = CIFS_SB(sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = get_xid();
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto set_ea_exit;
+	}
+	/* return dos attributes as pseudo xattr */
+	/* return alt name if available as pseudo attr */
+
+	/* if proc/fs/cifs/streamstoxattr is set then
+		search server for EAs or streams to
+		returns as xattrs */
+	if (value_size > MAX_EA_VALUE_SIZE) {
+		cifs_dbg(FYI, "size of EA value too large\n");
+		rc = -EOPNOTSUPP;
+		goto set_ea_exit;
+	}
+
+	if (ea_name == NULL) {
+		cifs_dbg(FYI, "Null xattr names not supported\n");
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		   == 0) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+			goto set_ea_exit;
+		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
+			cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
+
+		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
+		if (pTcon->ses->server->ops->set_EA)
+			rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+				full_path, ea_name, ea_value, (__u16)value_size,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+	} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
+		   == 0) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+			goto set_ea_exit;
+
+		ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
+		if (pTcon->ses->server->ops->set_EA)
+			rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+				full_path, ea_name, ea_value, (__u16)value_size,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+	} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
+			strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+#ifdef CONFIG_CIFS_ACL
+		struct cifs_ntsd *pacl;
+		pacl = kmalloc(value_size, GFP_KERNEL);
+		if (!pacl) {
+			rc = -ENOMEM;
+		} else {
+			memcpy(pacl, ea_value, value_size);
+			if (pTcon->ses->server->ops->set_acl)
+				rc = pTcon->ses->server->ops->set_acl(pacl,
+						value_size, d_inode(direntry),
+						full_path, CIFS_ACL_DACL);
+			else
+				rc = -EOPNOTSUPP;
+			if (rc == 0) /* force revalidate of the inode */
+				CIFS_I(d_inode(direntry))->time = 0;
+			kfree(pacl);
+		}
+#else
+		cifs_dbg(FYI, "Set CIFS ACL not supported yet\n");
+#endif /* CONFIG_CIFS_ACL */
+	} else {
+		int temp;
+		temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
+			strlen(POSIX_ACL_XATTR_ACCESS));
+		if (temp == 0) {
+#ifdef CONFIG_CIFS_POSIX
+			if (sb->s_flags & MS_POSIXACL)
+				rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
+					ea_value, (const int)value_size,
+					ACL_TYPE_ACCESS, cifs_sb->local_nls,
+					cifs_remap(cifs_sb));
+			cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc);
+#else
+			cifs_dbg(FYI, "set POSIX ACL not supported\n");
+#endif
+		} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
+				   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+#ifdef CONFIG_CIFS_POSIX
+			if (sb->s_flags & MS_POSIXACL)
+				rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
+					ea_value, (const int)value_size,
+					ACL_TYPE_DEFAULT, cifs_sb->local_nls,
+					cifs_remap(cifs_sb));
+			cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc);
+#else
+			cifs_dbg(FYI, "set default POSIX ACL not supported\n");
+#endif
+		} else {
+			cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n",
+				 ea_name);
+		  /* BB what if no namespace prefix? */
+		  /* Should we just pass them to server, except for
+		  system and perhaps security prefixes? */
+		}
+	}
+
+set_ea_exit:
+	kfree(full_path);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+#endif
+	return rc;
+}
+
+ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
+	void *ea_value, size_t buf_size)
+{
+	ssize_t rc = -EOPNOTSUPP;
+#ifdef CONFIG_CIFS_XATTR
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *pTcon;
+	struct super_block *sb;
+	char *full_path;
+
+	if (direntry == NULL)
+		return -EIO;
+	if (d_really_is_negative(direntry))
+		return -EIO;
+	sb = d_inode(direntry)->i_sb;
+	if (sb == NULL)
+		return -EIO;
+
+	cifs_sb = CIFS_SB(sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = get_xid();
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto get_ea_exit;
+	}
+	/* return dos attributes as pseudo xattr */
+	/* return alt name if available as pseudo attr */
+	if (ea_name == NULL) {
+		cifs_dbg(FYI, "Null xattr names not supported\n");
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		   == 0) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+			goto get_ea_exit;
+
+		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
+			cifs_dbg(FYI, "attempt to query cifs inode metadata\n");
+			/* revalidate/getattr then populate from inode */
+		} /* BB add else when above is implemented */
+		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
+		if (pTcon->ses->server->ops->query_all_EAs)
+			rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+				full_path, ea_name, ea_value, buf_size,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+	} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+			goto get_ea_exit;
+
+		ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
+		if (pTcon->ses->server->ops->query_all_EAs)
+			rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+				full_path, ea_name, ea_value, buf_size,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+	} else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
+			  strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
+#ifdef CONFIG_CIFS_POSIX
+		if (sb->s_flags & MS_POSIXACL)
+			rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
+				ea_value, buf_size, ACL_TYPE_ACCESS,
+				cifs_sb->local_nls,
+				cifs_remap(cifs_sb));
+#else
+		cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
+#endif /* CONFIG_CIFS_POSIX */
+	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
+			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+#ifdef CONFIG_CIFS_POSIX
+		if (sb->s_flags & MS_POSIXACL)
+			rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
+				ea_value, buf_size, ACL_TYPE_DEFAULT,
+				cifs_sb->local_nls,
+				cifs_remap(cifs_sb));
+#else
+		cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n");
+#endif /* CONFIG_CIFS_POSIX */
+	} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
+				strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+#ifdef CONFIG_CIFS_ACL
+			u32 acllen;
+			struct cifs_ntsd *pacl;
+
+			if (pTcon->ses->server->ops->get_acl == NULL)
+				goto get_ea_exit; /* rc already EOPNOTSUPP */
+
+			pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
+					d_inode(direntry), full_path, &acllen);
+			if (IS_ERR(pacl)) {
+				rc = PTR_ERR(pacl);
+				cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
+					 __func__, rc);
+			} else {
+				if (ea_value) {
+					if (acllen > buf_size)
+						acllen = -ERANGE;
+					else
+						memcpy(ea_value, pacl, acllen);
+				}
+				rc = acllen;
+				kfree(pacl);
+			}
+#else
+			cifs_dbg(FYI, "Query CIFS ACL not supported yet\n");
+#endif /* CONFIG_CIFS_ACL */
+	} else if (strncmp(ea_name,
+		  XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
+		cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n");
+	} else if (strncmp(ea_name,
+		  XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
+		cifs_dbg(FYI, "Security xattr namespace not supported yet\n");
+	} else
+		cifs_dbg(FYI,
+			 "illegal xattr request %s (only user namespace supported)\n",
+			 ea_name);
+
+	/* We could add an additional check for streams ie
+	    if proc/fs/cifs/streamstoxattr is set then
+		search server for EAs or streams to
+		returns as xattrs */
+
+	if (rc == -EINVAL)
+		rc = -EOPNOTSUPP;
+
+get_ea_exit:
+	kfree(full_path);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+#endif
+	return rc;
+}
+
+ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
+{
+	ssize_t rc = -EOPNOTSUPP;
+#ifdef CONFIG_CIFS_XATTR
+	unsigned int xid;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
+	struct cifs_tcon *pTcon;
+	struct super_block *sb;
+	char *full_path;
+
+	if (direntry == NULL)
+		return -EIO;
+	if (d_really_is_negative(direntry))
+		return -EIO;
+	sb = d_inode(direntry)->i_sb;
+	if (sb == NULL)
+		return -EIO;
+
+	cifs_sb = CIFS_SB(sb);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+		return -EOPNOTSUPP;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = get_xid();
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto list_ea_exit;
+	}
+	/* return dos attributes as pseudo xattr */
+	/* return alt name if available as pseudo attr */
+
+	/* if proc/fs/cifs/streamstoxattr is set then
+		search server for EAs or streams to
+		returns as xattrs */
+
+	if (pTcon->ses->server->ops->query_all_EAs)
+		rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+				full_path, NULL, data, buf_size,
+				cifs_sb->local_nls, cifs_remap(cifs_sb));
+list_ea_exit:
+	kfree(full_path);
+	free_xid(xid);
+	cifs_put_tlink(tlink);
+#endif
+	return rc;
+}
diff --git a/kernel/fs/coda/Kconfig b/kernel/fs/coda/Kconfig
new file mode 100644
index 000000000..c0e5a7fad
--- /dev/null
+++ b/kernel/fs/coda/Kconfig
@@ -0,0 +1,21 @@
+config CODA_FS
+	tristate "Coda file system support (advanced network fs)"
+	depends on INET
+	help
+	  Coda is an advanced network file system, similar to NFS in that it
+	  enables you to mount file systems of a remote server and access them
+	  with regular Unix commands as if they were sitting on your hard
+	  disk.  Coda has several advantages over NFS: support for
+	  disconnected operation (e.g. for laptops), read/write server
+	  replication, security model for authentication and encryption,
+	  persistent client caches and write back caching.
+
+	  If you say Y here, your Linux box will be able to act as a Coda
+	  *client*.  You will need user level code as well, both for the
+	  client and server.  Servers are currently user level, i.e. they need
+	  no kernel support.  Please read
+	  <file:Documentation/filesystems/coda.txt> and check out the Coda
+	  home page <http://www.coda.cs.cmu.edu/>.
+
+	  To compile the coda client support as a module, choose M here: the
+	  module will be called coda.
diff --git a/kernel/fs/coda/Makefile b/kernel/fs/coda/Makefile
new file mode 100644
index 000000000..1bab69a0d
--- /dev/null
+++ b/kernel/fs/coda/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the Linux Coda filesystem routines.
+#
+
+obj-$(CONFIG_CODA_FS) += coda.o
+
+coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \
+	     coda_linux.o symlink.o pioctl.o sysctl.o 
+
+# If you want debugging output, please uncomment the following line.
+
+# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1
diff --git a/kernel/fs/coda/cache.c b/kernel/fs/coda/cache.c
new file mode 100644
index 000000000..5bb630a76
--- /dev/null
+++ b/kernel/fs/coda/cache.c
@@ -0,0 +1,118 @@
+/*
+ * Cache operations for Coda.
+ * For Linux 2.1: (C) 1997 Carnegie Mellon University
+ * For Linux 2.3: (C) 2000 Carnegie Mellon University
+ *
+ * Carnegie Mellon encourages users of this code to contribute improvements
+ * to the Coda project http://www.coda.cs.cmu.edu/ <coda@cs.cmu.edu>.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+#include "coda_linux.h"
+#include "coda_cache.h"
+
+static atomic_t permission_epoch = ATOMIC_INIT(0);
+
+/* replace or extend an acl cache hit */
+void coda_cache_enter(struct inode *inode, int mask)
+{
+	struct coda_inode_info *cii = ITOC(inode);
+
+	spin_lock(&cii->c_lock);
+	cii->c_cached_epoch = atomic_read(&permission_epoch);
+	if (!uid_eq(cii->c_uid, current_fsuid())) {
+		cii->c_uid = current_fsuid();
+                cii->c_cached_perm = mask;
+        } else
+                cii->c_cached_perm |= mask;
+	spin_unlock(&cii->c_lock);
+}
+
+/* remove cached acl from an inode */
+void coda_cache_clear_inode(struct inode *inode)
+{
+	struct coda_inode_info *cii = ITOC(inode);
+	spin_lock(&cii->c_lock);
+	cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
+	spin_unlock(&cii->c_lock);
+}
+
+/* remove all acl caches */
+void coda_cache_clear_all(struct super_block *sb)
+{
+	atomic_inc(&permission_epoch);
+}
+
+
+/* check if the mask has been matched against the acl already */
+int coda_cache_check(struct inode *inode, int mask)
+{
+	struct coda_inode_info *cii = ITOC(inode);
+	int hit;
+	
+	spin_lock(&cii->c_lock);
+	hit = (mask & cii->c_cached_perm) == mask &&
+	    uid_eq(cii->c_uid, current_fsuid()) &&
+	    cii->c_cached_epoch == atomic_read(&permission_epoch);
+	spin_unlock(&cii->c_lock);
+
+	return hit;
+}
+
+
+/* Purging dentries and children */
+/* The following routines drop dentries which are not
+   in use and flag dentries which are in use to be 
+   zapped later.
+
+   The flags are detected by:
+   - coda_dentry_revalidate (for lookups) if the flag is C_PURGE
+   - coda_dentry_delete: to remove dentry from the cache when d_count
+     falls to zero
+   - an inode method coda_revalidate (for attributes) if the 
+     flag is C_VATTR
+*/
+
+/* this won't do any harm: just flag all children */
+static void coda_flag_children(struct dentry *parent, int flag)
+{
+	struct dentry *de;
+
+	spin_lock(&parent->d_lock);
+	list_for_each_entry(de, &parent->d_subdirs, d_child) {
+		/* don't know what to do with negative dentries */
+		if (d_inode(de) ) 
+			coda_flag_inode(d_inode(de), flag);
+	}
+	spin_unlock(&parent->d_lock);
+	return; 
+}
+
+void coda_flag_inode_children(struct inode *inode, int flag)
+{
+	struct dentry *alias_de;
+
+	if ( !inode || !S_ISDIR(inode->i_mode)) 
+		return; 
+
+	alias_de = d_find_alias(inode);
+	if (!alias_de)
+		return;
+	coda_flag_children(alias_de, flag);
+	shrink_dcache_parent(alias_de);
+	dput(alias_de);
+}
+
diff --git a/kernel/fs/coda/cnode.c b/kernel/fs/coda/cnode.c
new file mode 100644
index 000000000..7740b1c87
--- /dev/null
+++ b/kernel/fs/coda/cnode.c
@@ -0,0 +1,168 @@
+/* cnode related routines for the coda kernel code
+   (C) 1996 Peter Braam
+   */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/time.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+#include "coda_linux.h"
+
+static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
+{
+	return memcmp(fid1, fid2, sizeof(*fid1)) == 0;
+}
+
+static const struct inode_operations coda_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= coda_setattr,
+};
+
+/* cnode.c */
+static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
+{
+        coda_vattr_to_iattr(inode, attr);
+
+        if (S_ISREG(inode->i_mode)) {
+                inode->i_op = &coda_file_inode_operations;
+                inode->i_fop = &coda_file_operations;
+        } else if (S_ISDIR(inode->i_mode)) {
+                inode->i_op = &coda_dir_inode_operations;
+                inode->i_fop = &coda_dir_operations;
+        } else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &coda_symlink_inode_operations;
+		inode->i_data.a_ops = &coda_symlink_aops;
+		inode->i_mapping = &inode->i_data;
+	} else
+                init_special_inode(inode, inode->i_mode, huge_decode_dev(attr->va_rdev));
+}
+
+static int coda_test_inode(struct inode *inode, void *data)
+{
+	struct CodaFid *fid = (struct CodaFid *)data;
+	struct coda_inode_info *cii = ITOC(inode);
+	return coda_fideq(&cii->c_fid, fid);
+}
+
+static int coda_set_inode(struct inode *inode, void *data)
+{
+	struct CodaFid *fid = (struct CodaFid *)data;
+	struct coda_inode_info *cii = ITOC(inode);
+	cii->c_fid = *fid;
+	return 0;
+}
+
+struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
+			 struct coda_vattr * attr)
+{
+	struct inode *inode;
+	struct coda_inode_info *cii;
+	unsigned long hash = coda_f2i(fid);
+
+	inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		cii = ITOC(inode);
+		/* we still need to set i_ino for things like stat(2) */
+		inode->i_ino = hash;
+		/* inode is locked and unique, no need to grab cii->c_lock */
+		cii->c_mapcount = 0;
+		unlock_new_inode(inode);
+	}
+
+	/* always replace the attributes, type might have changed */
+	coda_fill_inode(inode, attr);
+	return inode;
+}
+
+/* this is effectively coda_iget:
+   - get attributes (might be cached)
+   - get the inode for the fid using vfs iget
+   - link the two up if this is needed
+   - fill in the attributes
+*/
+struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb)
+{
+        struct coda_vattr attr;
+	struct inode *inode;
+        int error;
+        
+	/* We get inode numbers from Venus -- see venus source */
+	error = venus_getattr(sb, fid, &attr);
+	if (error)
+		return ERR_PTR(error);
+
+	inode = coda_iget(sb, fid, &attr);
+	if (IS_ERR(inode))
+		pr_warn("%s: coda_iget failed\n", __func__);
+	return inode;
+}
+
+
+/* Although we treat Coda file identifiers as immutable, there is one
+ * special case for files created during a disconnection where they may
+ * not be globally unique. When an identifier collision is detected we
+ * first try to flush the cached inode from the kernel and finally
+ * resort to renaming/rehashing in-place. Userspace remembers both old
+ * and new values of the identifier to handle any in-flight upcalls.
+ * The real solution is to use globally unique UUIDs as identifiers, but
+ * retrofitting the existing userspace code for this is non-trivial. */
+void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 
+		      struct CodaFid *newfid)
+{
+	struct coda_inode_info *cii = ITOC(inode);
+	unsigned long hash = coda_f2i(newfid);
+	
+	BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
+
+	/* replace fid and rehash inode */
+	/* XXX we probably need to hold some lock here! */
+	remove_inode_hash(inode);
+	cii->c_fid = *newfid;
+	inode->i_ino = hash;
+	__insert_inode_hash(inode, hash);
+}
+
+/* convert a fid to an inode. */
+struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) 
+{
+	struct inode *inode;
+	unsigned long hash = coda_f2i(fid);
+
+	if ( !sb ) {
+		pr_warn("%s: no sb!\n", __func__);
+		return NULL;
+	}
+
+	inode = ilookup5(sb, hash, coda_test_inode, fid);
+	if ( !inode )
+		return NULL;
+
+	/* we should never see newly created inodes because we intentionally
+	 * fail in the initialization callback */
+	BUG_ON(inode->i_state & I_NEW);
+
+	return inode;
+}
+
+/* the CONTROL inode is made without asking attributes from Venus */
+struct inode *coda_cnode_makectl(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	if (inode) {
+		inode->i_ino = CTL_INO;
+		inode->i_op = &coda_ioctl_inode_operations;
+		inode->i_fop = &coda_ioctl_operations;
+		inode->i_mode = 0444;
+		return inode;
+	}
+	return ERR_PTR(-ENOMEM);
+}
+
diff --git a/kernel/fs/coda/coda_cache.h b/kernel/fs/coda/coda_cache.h
new file mode 100644
index 000000000..c910b5eb1
--- /dev/null
+++ b/kernel/fs/coda/coda_cache.h
@@ -0,0 +1,22 @@
+/* Coda filesystem -- Linux Minicache
+ *
+ * Copyright (C) 1989 - 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon University encourages users of this software to
+ * contribute improvements to the Coda project. Contact Peter Braam
+ * <coda@cs.cmu.edu>
+ */
+
+#ifndef _CFSNC_HEADER_
+#define _CFSNC_HEADER_
+
+/* credential cache */
+void coda_cache_enter(struct inode *inode, int mask);
+void coda_cache_clear_inode(struct inode *);
+void coda_cache_clear_all(struct super_block *sb);
+int coda_cache_check(struct inode *inode, int mask);
+
+/* for downcalls and attributes and lookups */
+void coda_flag_inode_children(struct inode *inode, int flag);
+
+#endif /* _CFSNC_HEADER_ */
diff --git a/kernel/fs/coda/coda_fs_i.h b/kernel/fs/coda/coda_fs_i.h
new file mode 100644
index 000000000..c64075213
--- /dev/null
+++ b/kernel/fs/coda/coda_fs_i.h
@@ -0,0 +1,58 @@
+/*
+ *  coda_fs_i.h
+ *
+ *  Copyright (C) 1998 Carnegie Mellon University
+ *
+ */
+
+#ifndef _LINUX_CODA_FS_I
+#define _LINUX_CODA_FS_I
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/coda.h>
+
+/*
+ * coda fs inode data
+ * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
+ * c_cached_perm.
+ * vfs_inode is set only when the inode is created and never changes.
+ * c_fid is set when the inode is created and should be considered immutable.
+ */
+struct coda_inode_info {
+	struct CodaFid	   c_fid;	/* Coda identifier */
+	u_short	           c_flags;     /* flags (see below) */
+	unsigned int	   c_mapcount;  /* nr of times this inode is mapped */
+	unsigned int	   c_cached_epoch; /* epoch for cached permissions */
+	kuid_t		   c_uid;	/* fsuid for cached permissions */
+	unsigned int       c_cached_perm; /* cached access permissions */
+	spinlock_t	   c_lock;
+	struct inode	   vfs_inode;
+};
+
+/*
+ * coda fs file private data
+ */
+#define CODA_MAGIC 0xC0DAC0DA
+struct coda_file_info {
+	int		   cfi_magic;	  /* magic number */
+	struct file	  *cfi_container; /* container file for this cnode */
+	unsigned int	   cfi_mapcount;  /* nr of times this file is mapped */
+};
+
+#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
+
+/* flags */
+#define C_VATTR       0x1   /* Validity of vattr in inode */
+#define C_FLUSH       0x2   /* used after a flush */
+#define C_DYING       0x4   /* from venus (which died) */
+#define C_PURGE       0x8
+
+struct inode *coda_cnode_make(struct CodaFid *, struct super_block *);
+struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
+struct inode *coda_cnode_makectl(struct super_block *sb);
+struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
+void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
+
+#endif
diff --git a/kernel/fs/coda/coda_int.h b/kernel/fs/coda/coda_int.h
new file mode 100644
index 000000000..381c993b1
--- /dev/null
+++ b/kernel/fs/coda/coda_int.h
@@ -0,0 +1,20 @@
+#ifndef _CODA_INT_
+#define _CODA_INT_
+
+struct dentry;
+struct file;
+
+extern struct file_system_type coda_fs_type;
+extern unsigned long coda_timeout;
+extern int coda_hard;
+extern int coda_fake_statfs;
+
+void coda_destroy_inodecache(void);
+int __init coda_init_inodecache(void);
+int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
+void coda_sysctl_init(void);
+void coda_sysctl_clean(void);
+
+#endif  /*  _CODA_INT_  */
+
+
diff --git a/kernel/fs/coda/coda_linux.c b/kernel/fs/coda/coda_linux.c
new file mode 100644
index 000000000..f1714cfb5
--- /dev/null
+++ b/kernel/fs/coda/coda_linux.c
@@ -0,0 +1,186 @@
+/*
+ * Inode operations for Coda filesystem
+ * Original version: (C) 1996 P. Braam and M. Callahan
+ * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University
+ * 
+ * Carnegie Mellon encourages users to contribute improvements to
+ * the Coda project. Contact Peter Braam (coda@cs.cmu.edu).
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+#include "coda_linux.h"
+
+/* initialize the debugging variables */
+int coda_fake_statfs;
+
+/* print a fid */
+char * coda_f2s(struct CodaFid *f)
+{
+	static char s[60];
+
+ 	sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]);
+
+	return s;
+}
+
+/* recognize special .CONTROL name */
+int coda_iscontrol(const char *name, size_t length)
+{
+	return ((CODA_CONTROLLEN == length) && 
+                (strncmp(name, CODA_CONTROL, CODA_CONTROLLEN) == 0));
+}
+
+unsigned short coda_flags_to_cflags(unsigned short flags)
+{
+	unsigned short coda_flags = 0;
+	
+	if ((flags & O_ACCMODE) == O_RDONLY)
+		coda_flags |= C_O_READ;
+
+	if ((flags & O_ACCMODE) == O_RDWR)
+		coda_flags |= C_O_READ | C_O_WRITE;
+
+	if ((flags & O_ACCMODE) == O_WRONLY)
+		coda_flags |= C_O_WRITE;
+
+	if (flags & O_TRUNC)
+		coda_flags |= C_O_TRUNC;
+
+	if (flags & O_CREAT)
+		coda_flags |= C_O_CREAT;
+
+	if (flags & O_EXCL)
+		coda_flags |= C_O_EXCL;
+
+	return coda_flags;
+}
+
+
+/* utility functions below */
+void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
+{
+        int inode_type;
+        /* inode's i_flags, i_ino are set by iget 
+           XXX: is this all we need ??
+           */
+        switch (attr->va_type) {
+        case C_VNON:
+                inode_type  = 0;
+                break;
+        case C_VREG:
+                inode_type = S_IFREG;
+                break;
+        case C_VDIR:
+                inode_type = S_IFDIR;
+                break;
+        case C_VLNK:
+                inode_type = S_IFLNK;
+                break;
+        default:
+                inode_type = 0;
+        }
+	inode->i_mode |= inode_type;
+
+	if (attr->va_mode != (u_short) -1)
+	        inode->i_mode = attr->va_mode | inode_type;
+        if (attr->va_uid != -1) 
+	        inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid);
+        if (attr->va_gid != -1)
+	        inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid);
+	if (attr->va_nlink != -1)
+		set_nlink(inode, attr->va_nlink);
+	if (attr->va_size != -1)
+	        inode->i_size = attr->va_size;
+	if (attr->va_size != -1)
+		inode->i_blocks = (attr->va_size + 511) >> 9;
+	if (attr->va_atime.tv_sec != -1) 
+	        inode->i_atime = attr->va_atime;
+	if (attr->va_mtime.tv_sec != -1)
+	        inode->i_mtime = attr->va_mtime;
+        if (attr->va_ctime.tv_sec != -1)
+	        inode->i_ctime = attr->va_ctime;
+}
+
+
+/* 
+ * BSD sets attributes that need not be modified to -1. 
+ * Linux uses the valid field to indicate what should be
+ * looked at.  The BSD type field needs to be deduced from linux 
+ * mode.
+ * So we have to do some translations here.
+ */
+
+void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)
+{
+        unsigned int valid;
+
+        /* clean out */        
+	vattr->va_mode = -1;
+        vattr->va_uid = (vuid_t) -1; 
+        vattr->va_gid = (vgid_t) -1;
+        vattr->va_size = (off_t) -1;
+	vattr->va_atime.tv_sec = (time_t) -1;
+	vattr->va_atime.tv_nsec =  (time_t) -1;
+        vattr->va_mtime.tv_sec = (time_t) -1;
+        vattr->va_mtime.tv_nsec = (time_t) -1;
+	vattr->va_ctime.tv_sec = (time_t) -1;
+	vattr->va_ctime.tv_nsec = (time_t) -1;
+        vattr->va_type = C_VNON;
+	vattr->va_fileid = -1;
+	vattr->va_gen = -1;
+	vattr->va_bytes = -1;
+	vattr->va_nlink = -1;
+	vattr->va_blocksize = -1;
+	vattr->va_rdev = -1;
+        vattr->va_flags = 0;
+
+        /* determine the type */
+#if 0
+        mode = iattr->ia_mode;
+                if ( S_ISDIR(mode) ) {
+                vattr->va_type = C_VDIR; 
+        } else if ( S_ISREG(mode) ) {
+                vattr->va_type = C_VREG;
+        } else if ( S_ISLNK(mode) ) {
+                vattr->va_type = C_VLNK;
+        } else {
+                /* don't do others */
+                vattr->va_type = C_VNON;
+        }
+#endif 
+
+        /* set those vattrs that need change */
+        valid = iattr->ia_valid;
+        if ( valid & ATTR_MODE ) {
+                vattr->va_mode = iattr->ia_mode;
+	}
+        if ( valid & ATTR_UID ) {
+                vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid);
+	}
+        if ( valid & ATTR_GID ) {
+                vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid);
+	}
+        if ( valid & ATTR_SIZE ) {
+                vattr->va_size = iattr->ia_size;
+	}
+        if ( valid & ATTR_ATIME ) {
+                vattr->va_atime = iattr->ia_atime;
+	}
+        if ( valid & ATTR_MTIME ) {
+                vattr->va_mtime = iattr->ia_mtime;
+	}
+        if ( valid & ATTR_CTIME ) {
+                vattr->va_ctime = iattr->ia_ctime;
+	}
+}
+
diff --git a/kernel/fs/coda/coda_linux.h b/kernel/fs/coda/coda_linux.h
new file mode 100644
index 000000000..d6f7a76a1
--- /dev/null
+++ b/kernel/fs/coda/coda_linux.h
@@ -0,0 +1,105 @@
+/* 
+ * Coda File System, Linux Kernel module
+ * 
+ * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University
+ * Linux modifications (C) 1996, Peter J. Braam
+ * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon University encourages users of this software to
+ * contribute improvements to the Coda project.
+ */
+
+#ifndef _LINUX_CODA_FS
+#define _LINUX_CODA_FS
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/wait.h>		
+#include <linux/types.h>
+#include <linux/fs.h>
+#include "coda_fs_i.h"
+
+/* operations */
+extern const struct inode_operations coda_dir_inode_operations;
+extern const struct inode_operations coda_file_inode_operations;
+extern const struct inode_operations coda_ioctl_inode_operations;
+
+extern const struct dentry_operations coda_dentry_operations;
+
+extern const struct address_space_operations coda_file_aops;
+extern const struct address_space_operations coda_symlink_aops;
+
+extern const struct file_operations coda_dir_operations;
+extern const struct file_operations coda_file_operations;
+extern const struct file_operations coda_ioctl_operations;
+
+/* operations shared over more than one file */
+int coda_open(struct inode *i, struct file *f);
+int coda_release(struct inode *i, struct file *f);
+int coda_permission(struct inode *inode, int mask);
+int coda_revalidate_inode(struct inode *);
+int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+int coda_setattr(struct dentry *, struct iattr *);
+
+/* this file:  heloers */
+char *coda_f2s(struct CodaFid *f);
+int coda_iscontrol(const char *name, size_t length);
+
+void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
+void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
+unsigned short coda_flags_to_cflags(unsigned short);
+
+/* sysctl.h */
+void coda_sysctl_init(void);
+void coda_sysctl_clean(void);
+
+#define CODA_ALLOC(ptr, cast, size) do { \
+    if (size < PAGE_SIZE) \
+        ptr = kzalloc((unsigned long) size, GFP_KERNEL); \
+    else \
+        ptr = (cast)vzalloc((unsigned long) size); \
+    if (!ptr) \
+	pr_warn("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
+} while (0)
+
+
+#define CODA_FREE(ptr,size) \
+    do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+
+/* inode to cnode access functions */
+
+static inline struct coda_inode_info *ITOC(struct inode *inode)
+{
+	return list_entry(inode, struct coda_inode_info, vfs_inode);
+}
+
+static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
+{
+	return &(ITOC(inode)->c_fid);
+}
+
+static __inline__ char *coda_i2s(struct inode *inode)
+{
+	return coda_f2s(&(ITOC(inode)->c_fid));
+}
+
+/* this will not zap the inode away */
+static __inline__ void coda_flag_inode(struct inode *inode, int flag)
+{
+	struct coda_inode_info *cii = ITOC(inode);
+
+	spin_lock(&cii->c_lock);
+	cii->c_flags |= flag;
+	spin_unlock(&cii->c_lock);
+}		
+
+#endif
diff --git a/kernel/fs/coda/dir.c b/kernel/fs/coda/dir.c
new file mode 100644
index 000000000..fda9f4311
--- /dev/null
+++ b/kernel/fs/coda/dir.c
@@ -0,0 +1,579 @@
+
+/*
+ * Directory operations for Coda filesystem
+ * Original version: (C) 1996 P. Braam and M. Callahan
+ * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University
+ * 
+ * Carnegie Mellon encourages users to contribute improvements to
+ * the Coda project. Contact Peter Braam (coda@cs.cmu.edu).
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/namei.h>
+#include <linux/uaccess.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+#include "coda_linux.h"
+#include "coda_cache.h"
+
+#include "coda_int.h"
+
+/* same as fs/bad_inode.c */
+static int coda_return_EIO(void)
+{
+	return -EIO;
+}
+#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
+
+/* inode operations for directories */
+/* access routines: lookup, readlink, permission */
+static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags)
+{
+	struct super_block *sb = dir->i_sb;
+	const char *name = entry->d_name.name;
+	size_t length = entry->d_name.len;
+	struct inode *inode;
+	int type = 0;
+
+	if (length > CODA_MAXNAMLEN) {
+		pr_err("name too long: lookup, %s (%*s)\n",
+		       coda_i2s(dir), (int)length, name);
+		return ERR_PTR(-ENAMETOOLONG);
+	}
+
+	/* control object, create inode on the fly */
+	if (is_root_inode(dir) && coda_iscontrol(name, length)) {
+		inode = coda_cnode_makectl(sb);
+		type = CODA_NOCACHE;
+	} else {
+		struct CodaFid fid = { { 0, } };
+		int error = venus_lookup(sb, coda_i2f(dir), name, length,
+				     &type, &fid);
+		inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error);
+	}
+
+	if (!IS_ERR(inode) && (type & CODA_NOCACHE))
+		coda_flag_inode(inode, C_VATTR | C_PURGE);
+
+	if (inode == ERR_PTR(-ENOENT))
+		inode = NULL;
+
+	return d_splice_alias(inode, entry);
+}
+
+
+int coda_permission(struct inode *inode, int mask)
+{
+	int error;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
+ 
+	if (!mask)
+		return 0;
+
+	if ((mask & MAY_EXEC) && !execute_ok(inode))
+		return -EACCES;
+
+	if (coda_cache_check(inode, mask))
+		return 0;
+
+	error = venus_access(inode->i_sb, coda_i2f(inode), mask);
+    
+	if (!error)
+		coda_cache_enter(inode, mask);
+
+	return error;
+}
+
+
+static inline void coda_dir_update_mtime(struct inode *dir)
+{
+#ifdef REQUERY_VENUS_FOR_MTIME
+	/* invalidate the directory cnode's attributes so we refetch the
+	 * attributes from venus next time the inode is referenced */
+	coda_flag_inode(dir, C_VATTR);
+#else
+	/* optimistically we can also act as if our nose bleeds. The
+	 * granularity of the mtime is coarse anyways so we might actually be
+	 * right most of the time. Note: we only do this for directories. */
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+#endif
+}
+
+/* we have to wrap inc_nlink/drop_nlink because sometimes userspace uses a
+ * trick to fool GNU find's optimizations. If we can't be sure of the link
+ * (because of volume mount points) we set i_nlink to 1 which forces find
+ * to consider every child as a possible directory. We should also never
+ * see an increment or decrement for deleted directories where i_nlink == 0 */
+static inline void coda_dir_inc_nlink(struct inode *dir)
+{
+	if (dir->i_nlink >= 2)
+		inc_nlink(dir);
+}
+
+static inline void coda_dir_drop_nlink(struct inode *dir)
+{
+	if (dir->i_nlink > 2)
+		drop_nlink(dir);
+}
+
+/* creation routines: create, mknod, mkdir, link, symlink */
+static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool excl)
+{
+	int error;
+	const char *name=de->d_name.name;
+	int length=de->d_name.len;
+	struct inode *inode;
+	struct CodaFid newfid;
+	struct coda_vattr attrs;
+
+	if (is_root_inode(dir) && coda_iscontrol(name, length))
+		return -EPERM;
+
+	error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 
+				0, mode, &newfid, &attrs);
+	if (error)
+		goto err_out;
+
+	inode = coda_iget(dir->i_sb, &newfid, &attrs);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto err_out;
+	}
+
+	/* invalidate the directory cnode's attributes */
+	coda_dir_update_mtime(dir);
+	d_instantiate(de, inode);
+	return 0;
+err_out:
+	d_drop(de);
+	return error;
+}
+
+static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode)
+{
+	struct inode *inode;
+	struct coda_vattr attrs;
+	const char *name = de->d_name.name;
+	int len = de->d_name.len;
+	int error;
+	struct CodaFid newfid;
+
+	if (is_root_inode(dir) && coda_iscontrol(name, len))
+		return -EPERM;
+
+	attrs.va_mode = mode;
+	error = venus_mkdir(dir->i_sb, coda_i2f(dir), 
+			       name, len, &newfid, &attrs);
+	if (error)
+		goto err_out;
+         
+	inode = coda_iget(dir->i_sb, &newfid, &attrs);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto err_out;
+	}
+
+	/* invalidate the directory cnode's attributes */
+	coda_dir_inc_nlink(dir);
+	coda_dir_update_mtime(dir);
+	d_instantiate(de, inode);
+	return 0;
+err_out:
+	d_drop(de);
+	return error;
+}
+
+/* try to make de an entry in dir_inodde linked to source_de */ 
+static int coda_link(struct dentry *source_de, struct inode *dir_inode, 
+	  struct dentry *de)
+{
+	struct inode *inode = d_inode(source_de);
+        const char * name = de->d_name.name;
+	int len = de->d_name.len;
+	int error;
+
+	if (is_root_inode(dir_inode) && coda_iscontrol(name, len))
+		return -EPERM;
+
+	error = venus_link(dir_inode->i_sb, coda_i2f(inode),
+			   coda_i2f(dir_inode), (const char *)name, len);
+	if (error) {
+		d_drop(de);
+		return error;
+	}
+
+	coda_dir_update_mtime(dir_inode);
+	ihold(inode);
+	d_instantiate(de, inode);
+	inc_nlink(inode);
+	return 0;
+}
+
+
+static int coda_symlink(struct inode *dir_inode, struct dentry *de,
+			const char *symname)
+{
+	const char *name = de->d_name.name;
+	int len = de->d_name.len;
+	int symlen;
+	int error;
+
+	if (is_root_inode(dir_inode) && coda_iscontrol(name, len))
+		return -EPERM;
+
+	symlen = strlen(symname);
+	if (symlen > CODA_MAXPATHLEN)
+		return -ENAMETOOLONG;
+
+	/*
+	 * This entry is now negative. Since we do not create
+	 * an inode for the entry we have to drop it.
+	 */
+	d_drop(de);
+	error = venus_symlink(dir_inode->i_sb, coda_i2f(dir_inode), name, len,
+			      symname, symlen);
+
+	/* mtime is no good anymore */
+	if (!error)
+		coda_dir_update_mtime(dir_inode);
+
+	return error;
+}
+
+/* destruction routines: unlink, rmdir */
+static int coda_unlink(struct inode *dir, struct dentry *de)
+{
+        int error;
+	const char *name = de->d_name.name;
+	int len = de->d_name.len;
+
+	error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
+	if (error)
+		return error;
+
+	coda_dir_update_mtime(dir);
+	drop_nlink(d_inode(de));
+	return 0;
+}
+
+static int coda_rmdir(struct inode *dir, struct dentry *de)
+{
+	const char *name = de->d_name.name;
+	int len = de->d_name.len;
+	int error;
+
+	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
+	if (!error) {
+		/* VFS may delete the child */
+		if (d_really_is_positive(de))
+			clear_nlink(d_inode(de));
+
+		/* fix the link count of the parent */
+		coda_dir_drop_nlink(dir);
+		coda_dir_update_mtime(dir);
+	}
+	return error;
+}
+
+/* rename */
+static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	const char *old_name = old_dentry->d_name.name;
+	const char *new_name = new_dentry->d_name.name;
+	int old_length = old_dentry->d_name.len;
+	int new_length = new_dentry->d_name.len;
+	int error;
+
+	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
+			     coda_i2f(new_dir), old_length, new_length,
+			     (const char *) old_name, (const char *)new_name);
+	if (!error) {
+		if (d_really_is_positive(new_dentry)) {
+			if (d_is_dir(new_dentry)) {
+				coda_dir_drop_nlink(old_dir);
+				coda_dir_inc_nlink(new_dir);
+			}
+			coda_dir_update_mtime(old_dir);
+			coda_dir_update_mtime(new_dir);
+			coda_flag_inode(d_inode(new_dentry), C_VATTR);
+		} else {
+			coda_flag_inode(old_dir, C_VATTR);
+			coda_flag_inode(new_dir, C_VATTR);
+		}
+	}
+	return error;
+}
+
+static inline unsigned int CDT2DT(unsigned char cdt)
+{
+	unsigned int dt;
+
+	switch(cdt) {
+	case CDT_UNKNOWN: dt = DT_UNKNOWN; break;
+	case CDT_FIFO:	  dt = DT_FIFO;    break;
+	case CDT_CHR:	  dt = DT_CHR;     break;
+	case CDT_DIR:	  dt = DT_DIR;     break;
+	case CDT_BLK:	  dt = DT_BLK;     break;
+	case CDT_REG:	  dt = DT_REG;     break;
+	case CDT_LNK:	  dt = DT_LNK;     break;
+	case CDT_SOCK:	  dt = DT_SOCK;    break;
+	case CDT_WHT:	  dt = DT_WHT;     break;
+	default:	  dt = DT_UNKNOWN; break;
+	}
+	return dt;
+}
+
+/* support routines */
+static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
+{
+	struct coda_file_info *cfi;
+	struct coda_inode_info *cii;
+	struct file *host_file;
+	struct venus_dirent *vdir;
+	unsigned long vdir_size = offsetof(struct venus_dirent, d_name);
+	unsigned int type;
+	struct qstr name;
+	ino_t ino;
+	int ret;
+
+	cfi = CODA_FTOC(coda_file);
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	host_file = cfi->cfi_container;
+
+	cii = ITOC(file_inode(coda_file));
+
+	vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
+	if (!vdir) return -ENOMEM;
+
+	if (!dir_emit_dots(coda_file, ctx))
+		goto out;
+
+	while (1) {
+		/* read entries from the directory file */
+		ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
+				  sizeof(*vdir));
+		if (ret < 0) {
+			pr_err("%s: read dir %s failed %d\n",
+			       __func__, coda_f2s(&cii->c_fid), ret);
+			break;
+		}
+		if (ret == 0) break; /* end of directory file reached */
+
+		/* catch truncated reads */
+		if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) {
+			pr_err("%s: short read on %s\n",
+			       __func__, coda_f2s(&cii->c_fid));
+			ret = -EBADF;
+			break;
+		}
+		/* validate whether the directory file actually makes sense */
+		if (vdir->d_reclen < vdir_size + vdir->d_namlen) {
+			pr_err("%s: invalid dir %s\n",
+			       __func__, coda_f2s(&cii->c_fid));
+			ret = -EBADF;
+			break;
+		}
+
+		name.len = vdir->d_namlen;
+		name.name = vdir->d_name;
+
+		/* Make sure we skip '.' and '..', we already got those */
+		if (name.name[0] == '.' && (name.len == 1 ||
+		    (name.name[1] == '.' && name.len == 2)))
+			vdir->d_fileno = name.len = 0;
+
+		/* skip null entries */
+		if (vdir->d_fileno && name.len) {
+			ino = vdir->d_fileno;
+			type = CDT2DT(vdir->d_type);
+			if (!dir_emit(ctx, name.name, name.len, ino, type))
+				break;
+		}
+		/* we'll always have progress because d_reclen is unsigned and
+		 * we've already established it is non-zero. */
+		ctx->pos += vdir->d_reclen;
+	}
+out:
+	kfree(vdir);
+	return 0;
+}
+
+/* file operations for directories */
+static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
+{
+	struct coda_file_info *cfi;
+	struct file *host_file;
+	int ret;
+
+	cfi = CODA_FTOC(coda_file);
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	host_file = cfi->cfi_container;
+
+	if (host_file->f_op->iterate) {
+		struct inode *host_inode = file_inode(host_file);
+
+		mutex_lock(&host_inode->i_mutex);
+		ret = -ENOENT;
+		if (!IS_DEADDIR(host_inode)) {
+			ret = host_file->f_op->iterate(host_file, ctx);
+			file_accessed(host_file);
+		}
+		mutex_unlock(&host_inode->i_mutex);
+		return ret;
+	}
+	/* Venus: we must read Venus dirents from a file */
+	return coda_venus_readdir(coda_file, ctx);
+}
+
+/* called when a cache lookup succeeds */
+static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
+{
+	struct inode *inode;
+	struct coda_inode_info *cii;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = d_inode(de);
+	if (!inode || is_root_inode(inode))
+		goto out;
+	if (is_bad_inode(inode))
+		goto bad;
+
+	cii = ITOC(d_inode(de));
+	if (!(cii->c_flags & (C_PURGE | C_FLUSH)))
+		goto out;
+
+	shrink_dcache_parent(de);
+
+	/* propagate for a flush */
+	if (cii->c_flags & C_FLUSH) 
+		coda_flag_inode_children(inode, C_FLUSH);
+
+	if (d_count(de) > 1)
+		/* pretend it's valid, but don't change the flags */
+		goto out;
+
+	/* clear the flags. */
+	spin_lock(&cii->c_lock);
+	cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+	spin_unlock(&cii->c_lock);
+bad:
+	return 0;
+out:
+	return 1;
+}
+
+/*
+ * This is the callback from dput() when d_count is going to 0.
+ * We use this to unhash dentries with bad inodes.
+ */
+static int coda_dentry_delete(const struct dentry * dentry)
+{
+	int flags;
+
+	if (d_really_is_negative(dentry)) 
+		return 0;
+
+	flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE;
+	if (is_bad_inode(d_inode(dentry)) || flags) {
+		return 1;
+	}
+	return 0;
+}
+
+
+
+/*
+ * This is called when we want to check if the inode has
+ * changed on the server.  Coda makes this easy since the
+ * cache manager Venus issues a downcall to the kernel when this 
+ * happens 
+ */
+int coda_revalidate_inode(struct inode *inode)
+{
+	struct coda_vattr attr;
+	int error;
+	int old_mode;
+	ino_t old_ino;
+	struct coda_inode_info *cii = ITOC(inode);
+
+	if (!cii->c_flags)
+		return 0;
+
+	if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
+		error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
+		if (error)
+			return -EIO;
+
+		/* this inode may be lost if:
+		   - it's ino changed 
+		   - type changes must be permitted for repair and
+		   missing mount points.
+		*/
+		old_mode = inode->i_mode;
+		old_ino = inode->i_ino;
+		coda_vattr_to_iattr(inode, &attr);
+
+		if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) {
+			pr_warn("inode %ld, fid %s changed type!\n",
+				inode->i_ino, coda_f2s(&(cii->c_fid)));
+		}
+
+		/* the following can happen when a local fid is replaced 
+		   with a global one, here we lose and declare the inode bad */
+		if (inode->i_ino != old_ino)
+			return -EIO;
+		
+		coda_flag_inode_children(inode, C_FLUSH);
+
+		spin_lock(&cii->c_lock);
+		cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+		spin_unlock(&cii->c_lock);
+	}
+	return 0;
+}
+
+const struct dentry_operations coda_dentry_operations = {
+	.d_revalidate	= coda_dentry_revalidate,
+	.d_delete	= coda_dentry_delete,
+};
+
+const struct inode_operations coda_dir_inode_operations = {
+	.create		= coda_create,
+	.lookup		= coda_lookup,
+	.link		= coda_link,
+	.unlink		= coda_unlink,
+	.symlink	= coda_symlink,
+	.mkdir		= coda_mkdir,
+	.rmdir		= coda_rmdir,
+	.mknod		= CODA_EIO_ERROR,
+	.rename		= coda_rename,
+	.permission	= coda_permission,
+	.getattr	= coda_getattr,
+	.setattr	= coda_setattr,
+};
+
+const struct file_operations coda_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= coda_readdir,
+	.open		= coda_open,
+	.release	= coda_release,
+	.fsync		= coda_fsync,
+};
diff --git a/kernel/fs/coda/file.c b/kernel/fs/coda/file.c
new file mode 100644
index 000000000..1da3805f3
--- /dev/null
+++ b/kernel/fs/coda/file.c
@@ -0,0 +1,230 @@
+/*
+ * File operations for Coda.
+ * Original version: (C) 1996 Peter Braam 
+ * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon encourages users of this code to contribute improvements
+ * to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/cred.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+
+#include "coda_linux.h"
+#include "coda_int.h"
+
+static ssize_t
+coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *coda_file = iocb->ki_filp;
+	struct coda_file_info *cfi = CODA_FTOC(coda_file);
+
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+
+	return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos);
+}
+
+static ssize_t
+coda_file_splice_read(struct file *coda_file, loff_t *ppos,
+		      struct pipe_inode_info *pipe, size_t count,
+		      unsigned int flags)
+{
+	ssize_t (*splice_read)(struct file *, loff_t *,
+			       struct pipe_inode_info *, size_t, unsigned int);
+	struct coda_file_info *cfi;
+	struct file *host_file;
+
+	cfi = CODA_FTOC(coda_file);
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	host_file = cfi->cfi_container;
+
+	splice_read = host_file->f_op->splice_read;
+	if (!splice_read)
+		splice_read = default_file_splice_read;
+
+	return splice_read(host_file, ppos, pipe, count, flags);
+}
+
+static ssize_t
+coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *coda_file = iocb->ki_filp;
+	struct inode *coda_inode = file_inode(coda_file);
+	struct coda_file_info *cfi = CODA_FTOC(coda_file);
+	struct file *host_file;
+	ssize_t ret;
+
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+
+	host_file = cfi->cfi_container;
+	file_start_write(host_file);
+	mutex_lock(&coda_inode->i_mutex);
+	ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
+	coda_inode->i_size = file_inode(host_file)->i_size;
+	coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
+	coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
+	mutex_unlock(&coda_inode->i_mutex);
+	file_end_write(host_file);
+	return ret;
+}
+
+static int
+coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
+{
+	struct coda_file_info *cfi;
+	struct coda_inode_info *cii;
+	struct file *host_file;
+	struct inode *coda_inode, *host_inode;
+
+	cfi = CODA_FTOC(coda_file);
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	host_file = cfi->cfi_container;
+
+	if (!host_file->f_op->mmap)
+		return -ENODEV;
+
+	coda_inode = file_inode(coda_file);
+	host_inode = file_inode(host_file);
+
+	cii = ITOC(coda_inode);
+	spin_lock(&cii->c_lock);
+	coda_file->f_mapping = host_file->f_mapping;
+	if (coda_inode->i_mapping == &coda_inode->i_data)
+		coda_inode->i_mapping = host_inode->i_mapping;
+
+	/* only allow additional mmaps as long as userspace isn't changing
+	 * the container file on us! */
+	else if (coda_inode->i_mapping != host_inode->i_mapping) {
+		spin_unlock(&cii->c_lock);
+		return -EBUSY;
+	}
+
+	/* keep track of how often the coda_inode/host_file has been mmapped */
+	cii->c_mapcount++;
+	cfi->cfi_mapcount++;
+	spin_unlock(&cii->c_lock);
+
+	return host_file->f_op->mmap(host_file, vma);
+}
+
+int coda_open(struct inode *coda_inode, struct file *coda_file)
+{
+	struct file *host_file = NULL;
+	int error;
+	unsigned short flags = coda_file->f_flags & (~O_EXCL);
+	unsigned short coda_flags = coda_flags_to_cflags(flags);
+	struct coda_file_info *cfi;
+
+	cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL);
+	if (!cfi)
+		return -ENOMEM;
+
+	error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
+			   &host_file);
+	if (!host_file)
+		error = -EIO;
+
+	if (error) {
+		kfree(cfi);
+		return error;
+	}
+
+	host_file->f_flags |= coda_file->f_flags & (O_APPEND | O_SYNC);
+
+	cfi->cfi_magic = CODA_MAGIC;
+	cfi->cfi_mapcount = 0;
+	cfi->cfi_container = host_file;
+
+	BUG_ON(coda_file->private_data != NULL);
+	coda_file->private_data = cfi;
+	return 0;
+}
+
+int coda_release(struct inode *coda_inode, struct file *coda_file)
+{
+	unsigned short flags = (coda_file->f_flags) & (~O_EXCL);
+	unsigned short coda_flags = coda_flags_to_cflags(flags);
+	struct coda_file_info *cfi;
+	struct coda_inode_info *cii;
+	struct inode *host_inode;
+	int err;
+
+	cfi = CODA_FTOC(coda_file);
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+
+	err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
+			  coda_flags, coda_file->f_cred->fsuid);
+
+	host_inode = file_inode(cfi->cfi_container);
+	cii = ITOC(coda_inode);
+
+	/* did we mmap this file? */
+	spin_lock(&cii->c_lock);
+	if (coda_inode->i_mapping == &host_inode->i_data) {
+		cii->c_mapcount -= cfi->cfi_mapcount;
+		if (!cii->c_mapcount)
+			coda_inode->i_mapping = &coda_inode->i_data;
+	}
+	spin_unlock(&cii->c_lock);
+
+	fput(cfi->cfi_container);
+	kfree(coda_file->private_data);
+	coda_file->private_data = NULL;
+
+	/* VFS fput ignores the return value from file_operations->release, so
+	 * there is no use returning an error here */
+	return 0;
+}
+
+int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
+{
+	struct file *host_file;
+	struct inode *coda_inode = file_inode(coda_file);
+	struct coda_file_info *cfi;
+	int err;
+
+	if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
+	      S_ISLNK(coda_inode->i_mode)))
+		return -EINVAL;
+
+	err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&coda_inode->i_mutex);
+
+	cfi = CODA_FTOC(coda_file);
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	host_file = cfi->cfi_container;
+
+	err = vfs_fsync(host_file, datasync);
+	if (!err && !datasync)
+		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
+	mutex_unlock(&coda_inode->i_mutex);
+
+	return err;
+}
+
+const struct file_operations coda_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= coda_file_read_iter,
+	.write_iter	= coda_file_write_iter,
+	.mmap		= coda_file_mmap,
+	.open		= coda_open,
+	.release	= coda_release,
+	.fsync		= coda_fsync,
+	.splice_read	= coda_file_splice_read,
+};
+
diff --git a/kernel/fs/coda/inode.c b/kernel/fs/coda/inode.c
new file mode 100644
index 000000000..cac1390b8
--- /dev/null
+++ b/kernel/fs/coda/inode.c
@@ -0,0 +1,333 @@
+/*
+ * Super block/filesystem wide operations
+ *
+ * Copyright (C) 1996 Peter J. Braam <braam@maths.ox.ac.uk> and 
+ * Michael Callahan <callahan@maths.ox.ac.uk> 
+ * 
+ * Rewritten for Linux 2.1.  Peter Braam <braam@cs.cmu.edu>
+ * Copyright (C) Carnegie Mellon University
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include <linux/uaccess.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+#include "coda_linux.h"
+#include "coda_cache.h"
+
+#include "coda_int.h"
+
+/* VFS super_block ops */
+static void coda_evict_inode(struct inode *);
+static void coda_put_super(struct super_block *);
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
+
+static struct kmem_cache * coda_inode_cachep;
+
+static struct inode *coda_alloc_inode(struct super_block *sb)
+{
+	struct coda_inode_info *ei;
+	ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	memset(&ei->c_fid, 0, sizeof(struct CodaFid));
+	ei->c_flags = 0;
+	ei->c_uid = GLOBAL_ROOT_UID;
+	ei->c_cached_perm = 0;
+	spin_lock_init(&ei->c_lock);
+	return &ei->vfs_inode;
+}
+
+static void coda_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(coda_inode_cachep, ITOC(inode));
+}
+
+static void coda_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, coda_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+int __init coda_init_inodecache(void)
+{
+	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
+				sizeof(struct coda_inode_info),
+				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+				init_once);
+	if (coda_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void coda_destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(coda_inode_cachep);
+}
+
+static int coda_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_NOATIME;
+	return 0;
+}
+
+/* exported operations */
+static const struct super_operations coda_super_operations =
+{
+	.alloc_inode	= coda_alloc_inode,
+	.destroy_inode	= coda_destroy_inode,
+	.evict_inode	= coda_evict_inode,
+	.put_super	= coda_put_super,
+	.statfs		= coda_statfs,
+	.remount_fs	= coda_remount,
+};
+
+static int get_device_index(struct coda_mount_data *data)
+{
+	struct fd f;
+	struct inode *inode;
+	int idx;
+
+	if (data == NULL) {
+		pr_warn("%s: Bad mount data\n", __func__);
+		return -1;
+	}
+
+	if (data->version != CODA_MOUNT_VERSION) {
+		pr_warn("%s: Bad mount version\n", __func__);
+		return -1;
+	}
+
+	f = fdget(data->fd);
+	if (!f.file)
+		goto Ebadf;
+	inode = file_inode(f.file);
+	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
+		fdput(f);
+		goto Ebadf;
+	}
+
+	idx = iminor(inode);
+	fdput(f);
+
+	if (idx < 0 || idx >= MAX_CODADEVS) {
+		pr_warn("%s: Bad minor number\n", __func__);
+		return -1;
+	}
+
+	return idx;
+Ebadf:
+	pr_warn("%s: Bad file\n", __func__);
+	return -1;
+}
+
+static int coda_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root = NULL;
+	struct venus_comm *vc;
+	struct CodaFid fid;
+	int error;
+	int idx;
+
+	if (task_active_pid_ns(current) != &init_pid_ns)
+		return -EINVAL;
+
+	idx = get_device_index((struct coda_mount_data *) data);
+
+	/* Ignore errors in data, for backward compatibility */
+	if(idx == -1)
+		idx = 0;
+	
+	pr_info("%s: device index: %i\n", __func__,  idx);
+
+	vc = &coda_comms[idx];
+	mutex_lock(&vc->vc_mutex);
+
+	if (!vc->vc_inuse) {
+		pr_warn("%s: No pseudo device\n", __func__);
+		error = -EINVAL;
+		goto unlock_out;
+	}
+
+	if (vc->vc_sb) {
+		pr_warn("%s: Device already mounted\n", __func__);
+		error = -EBUSY;
+		goto unlock_out;
+	}
+
+	error = bdi_setup_and_register(&vc->bdi, "coda");
+	if (error)
+		goto unlock_out;
+
+	vc->vc_sb = sb;
+	mutex_unlock(&vc->vc_mutex);
+
+	sb->s_fs_info = vc;
+	sb->s_flags |= MS_NOATIME;
+	sb->s_blocksize = 4096;	/* XXXXX  what do we put here?? */
+	sb->s_blocksize_bits = 12;
+	sb->s_magic = CODA_SUPER_MAGIC;
+	sb->s_op = &coda_super_operations;
+	sb->s_d_op = &coda_dentry_operations;
+	sb->s_bdi = &vc->bdi;
+
+	/* get root fid from Venus: this needs the root inode */
+	error = venus_rootfid(sb, &fid);
+	if ( error ) {
+		pr_warn("%s: coda_get_rootfid failed with %d\n",
+			__func__, error);
+		goto error;
+	}
+	pr_info("%s: rootfid is %s\n", __func__, coda_f2s(&fid));
+	
+	/* make root inode */
+        root = coda_cnode_make(&fid, sb);
+        if (IS_ERR(root)) {
+		error = PTR_ERR(root);
+		pr_warn("Failure of coda_cnode_make for root: error %d\n",
+			error);
+		goto error;
+	} 
+
+	pr_info("%s: rootinode is %ld dev %s\n",
+		__func__, root->i_ino, root->i_sb->s_id);
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		error = -EINVAL;
+		goto error;
+	}
+	return 0;
+
+error:
+	mutex_lock(&vc->vc_mutex);
+	bdi_destroy(&vc->bdi);
+	vc->vc_sb = NULL;
+	sb->s_fs_info = NULL;
+unlock_out:
+	mutex_unlock(&vc->vc_mutex);
+	return error;
+}
+
+static void coda_put_super(struct super_block *sb)
+{
+	struct venus_comm *vcp = coda_vcp(sb);
+	mutex_lock(&vcp->vc_mutex);
+	bdi_destroy(&vcp->bdi);
+	vcp->vc_sb = NULL;
+	sb->s_fs_info = NULL;
+	mutex_unlock(&vcp->vc_mutex);
+
+	pr_info("Bye bye.\n");
+}
+
+static void coda_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	coda_cache_clear_inode(inode);
+}
+
+int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	int err = coda_revalidate_inode(d_inode(dentry));
+	if (!err)
+		generic_fillattr(d_inode(dentry), stat);
+	return err;
+}
+
+int coda_setattr(struct dentry *de, struct iattr *iattr)
+{
+	struct inode *inode = d_inode(de);
+	struct coda_vattr vattr;
+	int error;
+
+	memset(&vattr, 0, sizeof(vattr)); 
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	coda_iattr_to_vattr(iattr, &vattr);
+	vattr.va_type = C_VNON; /* cannot set type */
+
+	/* Venus is responsible for truncating the container-file!!! */
+	error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
+
+	if (!error) {
+	        coda_vattr_to_iattr(inode, &vattr); 
+		coda_cache_clear_inode(inode);
+	}
+	return error;
+}
+
+const struct inode_operations coda_file_inode_operations = {
+	.permission	= coda_permission,
+	.getattr	= coda_getattr,
+	.setattr	= coda_setattr,
+};
+
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int error;
+	
+	error = venus_statfs(dentry, buf);
+
+	if (error) {
+		/* fake something like AFS does */
+		buf->f_blocks = 9000000;
+		buf->f_bfree  = 9000000;
+		buf->f_bavail = 9000000;
+		buf->f_files  = 9000000;
+		buf->f_ffree  = 9000000;
+	}
+
+	/* and fill in the rest */
+	buf->f_type = CODA_SUPER_MAGIC;
+	buf->f_bsize = 4096;
+	buf->f_namelen = CODA_MAXNAMLEN;
+
+	return 0; 
+}
+
+/* init_coda: used by filesystems.c to register coda */
+
+static struct dentry *coda_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, coda_fill_super);
+}
+
+struct file_system_type coda_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "coda",
+	.mount		= coda_mount,
+	.kill_sb	= kill_anon_super,
+	.fs_flags	= FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("coda");
+
diff --git a/kernel/fs/coda/pioctl.c b/kernel/fs/coda/pioctl.c
new file mode 100644
index 000000000..f36a4040a
--- /dev/null
+++ b/kernel/fs/coda/pioctl.c
@@ -0,0 +1,90 @@
+/*
+ * Pioctl operations for Coda.
+ * Original version: (C) 1996 Peter Braam
+ * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon encourages users of this code to contribute improvements
+ * to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/namei.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+
+#include "coda_linux.h"
+
+/* pioctl ops */
+static int coda_ioctl_permission(struct inode *inode, int mask);
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data);
+
+/* exported from this file */
+const struct inode_operations coda_ioctl_inode_operations = {
+	.permission	= coda_ioctl_permission,
+	.setattr	= coda_setattr,
+};
+
+const struct file_operations coda_ioctl_operations = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl	= coda_pioctl,
+	.llseek		= noop_llseek,
+};
+
+/* the coda pioctl inode ops */
+static int coda_ioctl_permission(struct inode *inode, int mask)
+{
+	return (mask & MAY_EXEC) ? -EACCES : 0;
+}
+
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data)
+{
+	struct path path;
+	int error;
+	struct PioctlData data;
+	struct inode *inode = file_inode(filp);
+	struct inode *target_inode = NULL;
+	struct coda_inode_info *cnp;
+
+	/* get the Pioctl data arguments from user space */
+	if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
+		return -EINVAL;
+
+	/*
+	 * Look up the pathname. Note that the pathname is in
+	 * user memory, and namei takes care of this
+	 */
+	if (data.follow)
+		error = user_path(data.path, &path);
+	else
+		error = user_lpath(data.path, &path);
+
+	if (error)
+		return error;
+
+	target_inode = d_inode(path.dentry);
+
+	/* return if it is not a Coda inode */
+	if (target_inode->i_sb != inode->i_sb) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	/* now proceed to make the upcall */
+	cnp = ITOC(target_inode);
+
+	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
+out:
+	path_put(&path);
+	return error;
+}
diff --git a/kernel/fs/coda/psdev.c b/kernel/fs/coda/psdev.c
new file mode 100644
index 000000000..822629126
--- /dev/null
+++ b/kernel/fs/coda/psdev.c
@@ -0,0 +1,437 @@
+/*
+ *      	An implementation of a loadable kernel mode driver providing
+ *		multiple kernel/user space bidirectional communications links.
+ *
+ * 		Author: 	Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ * 
+ *              Adapted to become the Linux 2.0 Coda pseudo device
+ *              Peter  Braam  <braam@maths.ox.ac.uk> 
+ *              Michael Callahan <mjc@emmy.smith.edu>           
+ *
+ *              Changes for Linux 2.1
+ *              Copyright (c) 1997 Carnegie-Mellon University
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/device.h>
+#include <linux/pid_namespace.h>
+#include <asm/io.h>
+#include <asm/poll.h>
+#include <linux/uaccess.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+
+#include "coda_linux.h"
+
+#include "coda_int.h"
+
+/* statistics */
+int           coda_hard;         /* allows signals during upcalls */
+unsigned long coda_timeout = 30; /* .. secs, then signals will dequeue */
+
+
+struct venus_comm coda_comms[MAX_CODADEVS];
+static struct class *coda_psdev_class;
+
+/*
+ * Device operations
+ */
+
+static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
+{
+        struct venus_comm *vcp = (struct venus_comm *) file->private_data;
+	unsigned int mask = POLLOUT | POLLWRNORM;
+
+	poll_wait(file, &vcp->vc_waitq, wait);
+	mutex_lock(&vcp->vc_mutex);
+	if (!list_empty(&vcp->vc_pending))
+                mask |= POLLIN | POLLRDNORM;
+	mutex_unlock(&vcp->vc_mutex);
+
+	return mask;
+}
+
+static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
+{
+	unsigned int data;
+
+	switch(cmd) {
+	case CIOC_KERNEL_VERSION:
+		data = CODA_KERNEL_VERSION;
+		return put_user(data, (int __user *) arg);
+	default:
+		return -ENOTTY;
+	}
+
+	return 0;
+}
+
+/*
+ *	Receive a message written by Venus to the psdev
+ */
+ 
+static ssize_t coda_psdev_write(struct file *file, const char __user *buf, 
+				size_t nbytes, loff_t *off)
+{
+        struct venus_comm *vcp = (struct venus_comm *) file->private_data;
+        struct upc_req *req = NULL;
+        struct upc_req *tmp;
+	struct list_head *lh;
+	struct coda_in_hdr hdr;
+	ssize_t retval = 0, count = 0;
+	int error;
+
+        /* Peek at the opcode, uniquefier */
+	if (copy_from_user(&hdr, buf, 2 * sizeof(u_long)))
+	        return -EFAULT;
+
+        if (DOWNCALL(hdr.opcode)) {
+		union outputArgs *dcbuf;
+		int size = sizeof(*dcbuf);
+
+		if  ( nbytes < sizeof(struct coda_out_hdr) ) {
+			pr_warn("coda_downcall opc %d uniq %d, not enough!\n",
+				hdr.opcode, hdr.unique);
+			count = nbytes;
+			goto out;
+		}
+		if ( nbytes > size ) {
+			pr_warn("downcall opc %d, uniq %d, too much!",
+				hdr.opcode, hdr.unique);
+		        nbytes = size;
+		}
+		CODA_ALLOC(dcbuf, union outputArgs *, nbytes);
+		if (copy_from_user(dcbuf, buf, nbytes)) {
+			CODA_FREE(dcbuf, nbytes);
+			retval = -EFAULT;
+			goto out;
+		}
+
+		/* what downcall errors does Venus handle ? */
+		error = coda_downcall(vcp, hdr.opcode, dcbuf);
+
+		CODA_FREE(dcbuf, nbytes);
+		if (error) {
+			pr_warn("%s: coda_downcall error: %d\n",
+				__func__, error);
+			retval = error;
+			goto out;
+		}
+		count = nbytes;
+		goto out;
+	}
+        
+	/* Look for the message on the processing queue. */
+	mutex_lock(&vcp->vc_mutex);
+	list_for_each(lh, &vcp->vc_processing) {
+		tmp = list_entry(lh, struct upc_req , uc_chain);
+		if (tmp->uc_unique == hdr.unique) {
+			req = tmp;
+			list_del(&req->uc_chain);
+			break;
+		}
+	}
+	mutex_unlock(&vcp->vc_mutex);
+
+	if (!req) {
+		pr_warn("%s: msg (%d, %d) not found\n",
+			__func__, hdr.opcode, hdr.unique);
+		retval = -ESRCH;
+		goto out;
+	}
+
+        /* move data into response buffer. */
+	if (req->uc_outSize < nbytes) {
+		pr_warn("%s: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n",
+			__func__, req->uc_outSize, (long)nbytes,
+			hdr.opcode, hdr.unique);
+		nbytes = req->uc_outSize; /* don't have more space! */
+	}
+        if (copy_from_user(req->uc_data, buf, nbytes)) {
+		req->uc_flags |= CODA_REQ_ABORT;
+		wake_up(&req->uc_sleep);
+		retval = -EFAULT;
+		goto out;
+	}
+
+	/* adjust outsize. is this useful ?? */
+	req->uc_outSize = nbytes;
+	req->uc_flags |= CODA_REQ_WRITE;
+	count = nbytes;
+
+	/* Convert filedescriptor into a file handle */
+	if (req->uc_opcode == CODA_OPEN_BY_FD) {
+		struct coda_open_by_fd_out *outp =
+			(struct coda_open_by_fd_out *)req->uc_data;
+		if (!outp->oh.result)
+			outp->fh = fget(outp->fd);
+	}
+
+        wake_up(&req->uc_sleep);
+out:
+        return(count ? count : retval);  
+}
+
+/*
+ *	Read a message from the kernel to Venus
+ */
+
+static ssize_t coda_psdev_read(struct file * file, char __user * buf, 
+			       size_t nbytes, loff_t *off)
+{
+	DECLARE_WAITQUEUE(wait, current);
+        struct venus_comm *vcp = (struct venus_comm *) file->private_data;
+        struct upc_req *req;
+	ssize_t retval = 0, count = 0;
+
+	if (nbytes == 0)
+		return 0;
+
+	mutex_lock(&vcp->vc_mutex);
+
+	add_wait_queue(&vcp->vc_waitq, &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (list_empty(&vcp->vc_pending)) {
+		if (file->f_flags & O_NONBLOCK) {
+			retval = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		mutex_unlock(&vcp->vc_mutex);
+		schedule();
+		mutex_lock(&vcp->vc_mutex);
+	}
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&vcp->vc_waitq, &wait);
+
+	if (retval)
+		goto out;
+
+	req = list_entry(vcp->vc_pending.next, struct upc_req,uc_chain);
+	list_del(&req->uc_chain);
+
+	/* Move the input args into userspace */
+	count = req->uc_inSize;
+	if (nbytes < req->uc_inSize) {
+		pr_warn("%s: Venus read %ld bytes of %d in message\n",
+			__func__, (long)nbytes, req->uc_inSize);
+		count = nbytes;
+        }
+
+	if (copy_to_user(buf, req->uc_data, count))
+	        retval = -EFAULT;
+        
+	/* If request was not a signal, enqueue and don't free */
+	if (!(req->uc_flags & CODA_REQ_ASYNC)) {
+		req->uc_flags |= CODA_REQ_READ;
+		list_add_tail(&(req->uc_chain), &vcp->vc_processing);
+		goto out;
+	}
+
+	CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
+	kfree(req);
+out:
+	mutex_unlock(&vcp->vc_mutex);
+	return (count ? count : retval);
+}
+
+static int coda_psdev_open(struct inode * inode, struct file * file)
+{
+	struct venus_comm *vcp;
+	int idx, err;
+
+	if (task_active_pid_ns(current) != &init_pid_ns)
+		return -EINVAL;
+
+	if (current_user_ns() != &init_user_ns)
+		return -EINVAL;
+
+	idx = iminor(inode);
+	if (idx < 0 || idx >= MAX_CODADEVS)
+		return -ENODEV;
+
+	err = -EBUSY;
+	vcp = &coda_comms[idx];
+	mutex_lock(&vcp->vc_mutex);
+
+	if (!vcp->vc_inuse) {
+		vcp->vc_inuse++;
+
+		INIT_LIST_HEAD(&vcp->vc_pending);
+		INIT_LIST_HEAD(&vcp->vc_processing);
+		init_waitqueue_head(&vcp->vc_waitq);
+		vcp->vc_sb = NULL;
+		vcp->vc_seq = 0;
+
+		file->private_data = vcp;
+		err = 0;
+	}
+
+	mutex_unlock(&vcp->vc_mutex);
+	return err;
+}
+
+
+static int coda_psdev_release(struct inode * inode, struct file * file)
+{
+	struct venus_comm *vcp = (struct venus_comm *) file->private_data;
+	struct upc_req *req, *tmp;
+
+	if (!vcp || !vcp->vc_inuse ) {
+		pr_warn("%s: Not open.\n", __func__);
+		return -1;
+	}
+
+	mutex_lock(&vcp->vc_mutex);
+
+	/* Wakeup clients so they can return. */
+	list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
+		list_del(&req->uc_chain);
+
+		/* Async requests need to be freed here */
+		if (req->uc_flags & CODA_REQ_ASYNC) {
+			CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
+			kfree(req);
+			continue;
+		}
+		req->uc_flags |= CODA_REQ_ABORT;
+		wake_up(&req->uc_sleep);
+	}
+
+	list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) {
+		list_del(&req->uc_chain);
+
+		req->uc_flags |= CODA_REQ_ABORT;
+		wake_up(&req->uc_sleep);
+	}
+
+	file->private_data = NULL;
+	vcp->vc_inuse--;
+	mutex_unlock(&vcp->vc_mutex);
+	return 0;
+}
+
+
+static const struct file_operations coda_psdev_fops = {
+	.owner		= THIS_MODULE,
+	.read		= coda_psdev_read,
+	.write		= coda_psdev_write,
+	.poll		= coda_psdev_poll,
+	.unlocked_ioctl	= coda_psdev_ioctl,
+	.open		= coda_psdev_open,
+	.release	= coda_psdev_release,
+	.llseek		= noop_llseek,
+};
+
+static int init_coda_psdev(void)
+{
+	int i, err = 0;
+	if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) {
+		pr_err("%s: unable to get major %d\n",
+		       __func__, CODA_PSDEV_MAJOR);
+              return -EIO;
+	}
+	coda_psdev_class = class_create(THIS_MODULE, "coda");
+	if (IS_ERR(coda_psdev_class)) {
+		err = PTR_ERR(coda_psdev_class);
+		goto out_chrdev;
+	}		
+	for (i = 0; i < MAX_CODADEVS; i++) {
+		mutex_init(&(&coda_comms[i])->vc_mutex);
+		device_create(coda_psdev_class, NULL,
+			      MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
+	}
+	coda_sysctl_init();
+	goto out;
+
+out_chrdev:
+	unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
+out:
+	return err;
+}
+
+MODULE_AUTHOR("Jan Harkes, Peter J. Braam");
+MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
+MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
+MODULE_LICENSE("GPL");
+MODULE_VERSION("6.6");
+
+static int __init init_coda(void)
+{
+	int status;
+	int i;
+
+	status = coda_init_inodecache();
+	if (status)
+		goto out2;
+	status = init_coda_psdev();
+	if ( status ) {
+		pr_warn("Problem (%d) in init_coda_psdev\n", status);
+		goto out1;
+	}
+	
+	status = register_filesystem(&coda_fs_type);
+	if (status) {
+		pr_warn("failed to register filesystem!\n");
+		goto out;
+	}
+	return 0;
+out:
+	for (i = 0; i < MAX_CODADEVS; i++)
+		device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
+	class_destroy(coda_psdev_class);
+	unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
+	coda_sysctl_clean();
+out1:
+	coda_destroy_inodecache();
+out2:
+	return status;
+}
+
+static void __exit exit_coda(void)
+{
+        int err, i;
+
+	err = unregister_filesystem(&coda_fs_type);
+	if (err != 0)
+		pr_warn("failed to unregister filesystem\n");
+	for (i = 0; i < MAX_CODADEVS; i++)
+		device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
+	class_destroy(coda_psdev_class);
+	unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
+	coda_sysctl_clean();
+	coda_destroy_inodecache();
+}
+
+module_init(init_coda);
+module_exit(exit_coda);
+
diff --git a/kernel/fs/coda/symlink.c b/kernel/fs/coda/symlink.c
new file mode 100644
index 000000000..ab94ef63c
--- /dev/null
+++ b/kernel/fs/coda/symlink.c
@@ -0,0 +1,50 @@
+/*
+ * Symlink inode operations for Coda filesystem
+ * Original version: (C) 1996 P. Braam and M. Callahan
+ * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University
+ * 
+ * Carnegie Mellon encourages users to contribute improvements to
+ * the Coda project. Contact Peter Braam (coda@cs.cmu.edu).
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+
+#include "coda_linux.h"
+
+static int coda_symlink_filler(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int error;
+	struct coda_inode_info *cii;
+	unsigned int len = PAGE_SIZE;
+	char *p = kmap(page);
+
+	cii = ITOC(inode);
+
+	error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
+	if (error)
+		goto fail;
+	SetPageUptodate(page);
+	kunmap(page);
+	unlock_page(page);
+	return 0;
+
+fail:
+	SetPageError(page);
+	kunmap(page);
+	unlock_page(page);
+	return error;
+}
+
+const struct address_space_operations coda_symlink_aops = {
+	.readpage	= coda_symlink_filler,
+};
diff --git a/kernel/fs/coda/sysctl.c b/kernel/fs/coda/sysctl.c
new file mode 100644
index 000000000..34218a8a2
--- /dev/null
+++ b/kernel/fs/coda/sysctl.c
@@ -0,0 +1,73 @@
+/*
+ * Sysctl operations for Coda filesystem
+ * Original version: (C) 1996 P. Braam and M. Callahan
+ * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University
+ * 
+ * Carnegie Mellon encourages users to contribute improvements to
+ * the Coda project. Contact Peter Braam (coda@cs.cmu.edu).
+ */
+
+#include <linux/sysctl.h>
+
+#include "coda_int.h"
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fs_table_header;
+
+static struct ctl_table coda_table[] = {
+	{
+		.procname	= "timeout",
+		.data		= &coda_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "hard",
+		.data		= &coda_hard,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "fake_statfs",
+		.data		= &coda_fake_statfs,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec
+	},
+	{}
+};
+
+static struct ctl_table fs_table[] = {
+	{
+		.procname	= "coda",
+		.mode		= 0555,
+		.child		= coda_table
+	},
+	{}
+};
+
+void coda_sysctl_init(void)
+{
+	if ( !fs_table_header )
+		fs_table_header = register_sysctl_table(fs_table);
+}
+
+void coda_sysctl_clean(void)
+{
+	if ( fs_table_header ) {
+		unregister_sysctl_table(fs_table_header);
+		fs_table_header = NULL;
+	}
+}
+
+#else
+void coda_sysctl_init(void)
+{
+}
+
+void coda_sysctl_clean(void)
+{
+}
+#endif
diff --git a/kernel/fs/coda/upcall.c b/kernel/fs/coda/upcall.c
new file mode 100644
index 000000000..9b1ffaa05
--- /dev/null
+++ b/kernel/fs/coda/upcall.c
@@ -0,0 +1,882 @@
+/*
+ * Mostly platform independent upcall operations to Venus:
+ *  -- upcalls
+ *  -- upcall routines
+ *
+ * Linux 2.0 version
+ * Copyright (C) 1996 Peter J. Braam <braam@maths.ox.ac.uk>, 
+ * Michael Callahan <callahan@maths.ox.ac.uk> 
+ * 
+ * Redone for Linux 2.1
+ * Copyright (C) 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon University encourages users of this code to contribute
+ * improvements to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/vfs.h>
+
+#include <linux/coda.h>
+#include <linux/coda_psdev.h>
+#include "coda_linux.h"
+#include "coda_cache.h"
+
+#include "coda_int.h"
+
+static int coda_upcall(struct venus_comm *vc, int inSize, int *outSize,
+		       union inputArgs *buffer);
+
+static void *alloc_upcall(int opcode, int size)
+{
+	union inputArgs *inp;
+
+	CODA_ALLOC(inp, union inputArgs *, size);
+        if (!inp)
+		return ERR_PTR(-ENOMEM);
+
+        inp->ih.opcode = opcode;
+	inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns);
+	inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns);
+	inp->ih.uid = from_kuid(&init_user_ns, current_fsuid());
+
+	return (void*)inp;
+}
+
+#define UPARG(op)\
+do {\
+	inp = (union inputArgs *)alloc_upcall(op, insize); \
+        if (IS_ERR(inp)) { return PTR_ERR(inp); }\
+        outp = (union outputArgs *)(inp); \
+        outsize = insize; \
+} while (0)
+
+#define INSIZE(tag) sizeof(struct coda_ ## tag ## _in)
+#define OUTSIZE(tag) sizeof(struct coda_ ## tag ## _out)
+#define SIZE(tag)  max_t(unsigned int, INSIZE(tag), OUTSIZE(tag))
+
+
+/* the upcalls */
+int venus_rootfid(struct super_block *sb, struct CodaFid *fidp)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+
+        insize = SIZE(root);
+        UPARG(CODA_ROOT);
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+	if (!error)
+		*fidp = outp->coda_root.VFid;
+
+	CODA_FREE(inp, insize);
+	return error;
+}
+
+int venus_getattr(struct super_block *sb, struct CodaFid *fid, 
+		     struct coda_vattr *attr) 
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+
+        insize = SIZE(getattr); 
+	UPARG(CODA_GETATTR);
+        inp->coda_getattr.VFid = *fid;
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+	if (!error)
+		*attr = outp->coda_getattr.attr;
+
+	CODA_FREE(inp, insize);
+        return error;
+}
+
+int venus_setattr(struct super_block *sb, struct CodaFid *fid, 
+		  struct coda_vattr *vattr)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+	
+	insize = SIZE(setattr);
+	UPARG(CODA_SETATTR);
+
+        inp->coda_setattr.VFid = *fid;
+	inp->coda_setattr.attr = *vattr;
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+
+        CODA_FREE(inp, insize);
+        return error;
+}
+
+int venus_lookup(struct super_block *sb, struct CodaFid *fid, 
+		    const char *name, int length, int * type, 
+		    struct CodaFid *resfid)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+	int offset;
+
+	offset = INSIZE(lookup);
+        insize = max_t(unsigned int, offset + length +1, OUTSIZE(lookup));
+	UPARG(CODA_LOOKUP);
+
+        inp->coda_lookup.VFid = *fid;
+	inp->coda_lookup.name = offset;
+	inp->coda_lookup.flags = CLU_CASE_SENSITIVE;
+        /* send Venus a null terminated string */
+        memcpy((char *)(inp) + offset, name, length);
+        *((char *)inp + offset + length) = '\0';
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+	if (!error) {
+		*resfid = outp->coda_lookup.VFid;
+		*type = outp->coda_lookup.vtype;
+	}
+
+	CODA_FREE(inp, insize);
+	return error;
+}
+
+int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
+		kuid_t uid)
+{
+	union inputArgs *inp;
+	union outputArgs *outp;
+	int insize, outsize, error;
+	
+	insize = SIZE(release);
+	UPARG(CODA_CLOSE);
+	
+	inp->ih.uid = from_kuid(&init_user_ns, uid);
+        inp->coda_close.VFid = *fid;
+        inp->coda_close.flags = flags;
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+
+	CODA_FREE(inp, insize);
+        return error;
+}
+
+int venus_open(struct super_block *sb, struct CodaFid *fid,
+		  int flags, struct file **fh)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+       
+	insize = SIZE(open_by_fd);
+	UPARG(CODA_OPEN_BY_FD);
+
+	inp->coda_open_by_fd.VFid = *fid;
+	inp->coda_open_by_fd.flags = flags;
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+	if (!error)
+		*fh = outp->coda_open_by_fd.fh;
+
+	CODA_FREE(inp, insize);
+	return error;
+}	
+
+int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, 
+		   const char *name, int length, 
+		   struct CodaFid *newfid, struct coda_vattr *attrs)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+        int offset;
+
+	offset = INSIZE(mkdir);
+	insize = max_t(unsigned int, offset + length + 1, OUTSIZE(mkdir));
+	UPARG(CODA_MKDIR);
+
+        inp->coda_mkdir.VFid = *dirfid;
+        inp->coda_mkdir.attr = *attrs;
+	inp->coda_mkdir.name = offset;
+        /* Venus must get null terminated string */
+        memcpy((char *)(inp) + offset, name, length);
+        *((char *)inp + offset + length) = '\0';
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+	if (!error) {
+		*attrs = outp->coda_mkdir.attr;
+		*newfid = outp->coda_mkdir.VFid;
+	}
+
+	CODA_FREE(inp, insize);
+	return error;        
+}
+
+
+int venus_rename(struct super_block *sb, struct CodaFid *old_fid, 
+		 struct CodaFid *new_fid, size_t old_length, 
+		 size_t new_length, const char *old_name, 
+		 const char *new_name)
+{
+	union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error; 
+	int offset, s;
+	
+	offset = INSIZE(rename);
+	insize = max_t(unsigned int, offset + new_length + old_length + 8,
+		     OUTSIZE(rename)); 
+ 	UPARG(CODA_RENAME);
+
+        inp->coda_rename.sourceFid = *old_fid;
+        inp->coda_rename.destFid =  *new_fid;
+        inp->coda_rename.srcname = offset;
+
+        /* Venus must receive an null terminated string */
+        s = ( old_length & ~0x3) +4; /* round up to word boundary */
+        memcpy((char *)(inp) + offset, old_name, old_length);
+        *((char *)inp + offset + old_length) = '\0';
+
+        /* another null terminated string for Venus */
+        offset += s;
+        inp->coda_rename.destname = offset;
+        s = ( new_length & ~0x3) +4; /* round up to word boundary */
+        memcpy((char *)(inp) + offset, new_name, new_length);
+        *((char *)inp + offset + new_length) = '\0';
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+
+	CODA_FREE(inp, insize);
+	return error;
+}
+
+int venus_create(struct super_block *sb, struct CodaFid *dirfid, 
+		 const char *name, int length, int excl, int mode,
+		 struct CodaFid *newfid, struct coda_vattr *attrs) 
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+        int offset;
+
+        offset = INSIZE(create);
+	insize = max_t(unsigned int, offset + length + 1, OUTSIZE(create));
+	UPARG(CODA_CREATE);
+
+        inp->coda_create.VFid = *dirfid;
+        inp->coda_create.attr.va_mode = mode;
+	inp->coda_create.excl = excl;
+        inp->coda_create.mode = mode;
+        inp->coda_create.name = offset;
+
+        /* Venus must get null terminated string */
+        memcpy((char *)(inp) + offset, name, length);
+        *((char *)inp + offset + length) = '\0';
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+	if (!error) {
+		*attrs = outp->coda_create.attr;
+		*newfid = outp->coda_create.VFid;
+	}
+
+	CODA_FREE(inp, insize);
+	return error;        
+}
+
+int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, 
+		    const char *name, int length)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+        int offset;
+
+        offset = INSIZE(rmdir);
+	insize = max_t(unsigned int, offset + length + 1, OUTSIZE(rmdir));
+	UPARG(CODA_RMDIR);
+
+        inp->coda_rmdir.VFid = *dirfid;
+        inp->coda_rmdir.name = offset;
+        memcpy((char *)(inp) + offset, name, length);
+	*((char *)inp + offset + length) = '\0';
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+
+	CODA_FREE(inp, insize);
+	return error;
+}
+
+int venus_remove(struct super_block *sb, struct CodaFid *dirfid, 
+		    const char *name, int length)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int error=0, insize, outsize, offset;
+
+        offset = INSIZE(remove);
+	insize = max_t(unsigned int, offset + length + 1, OUTSIZE(remove));
+	UPARG(CODA_REMOVE);
+
+        inp->coda_remove.VFid = *dirfid;
+        inp->coda_remove.name = offset;
+        memcpy((char *)(inp) + offset, name, length);
+	*((char *)inp + offset + length) = '\0';
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+
+	CODA_FREE(inp, insize);
+	return error;
+}
+
+int venus_readlink(struct super_block *sb, struct CodaFid *fid, 
+		      char *buffer, int *length)
+{ 
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+        int retlen;
+        char *result;
+        
+	insize = max_t(unsigned int,
+		     INSIZE(readlink), OUTSIZE(readlink)+ *length + 1);
+	UPARG(CODA_READLINK);
+
+        inp->coda_readlink.VFid = *fid;
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+	if (!error) {
+		retlen = outp->coda_readlink.count;
+		if ( retlen > *length )
+			retlen = *length;
+		*length = retlen;
+		result =  (char *)outp + (long)outp->coda_readlink.data;
+		memcpy(buffer, result, retlen);
+		*(buffer + retlen) = '\0';
+	}
+
+        CODA_FREE(inp, insize);
+        return error;
+}
+
+
+
+int venus_link(struct super_block *sb, struct CodaFid *fid, 
+		  struct CodaFid *dirfid, const char *name, int len )
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+        int offset;
+
+	offset = INSIZE(link);
+	insize = max_t(unsigned int, offset  + len + 1, OUTSIZE(link));
+        UPARG(CODA_LINK);
+
+        inp->coda_link.sourceFid = *fid;
+        inp->coda_link.destFid = *dirfid;
+        inp->coda_link.tname = offset;
+
+        /* make sure strings are null terminated */
+        memcpy((char *)(inp) + offset, name, len);
+        *((char *)inp + offset + len) = '\0';
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+
+	CODA_FREE(inp, insize);
+        return error;
+}
+
+int venus_symlink(struct super_block *sb, struct CodaFid *fid,
+		     const char *name, int len,
+		     const char *symname, int symlen)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+        int offset, s;
+
+        offset = INSIZE(symlink);
+	insize = max_t(unsigned int, offset + len + symlen + 8, OUTSIZE(symlink));
+	UPARG(CODA_SYMLINK);
+        
+        /*        inp->coda_symlink.attr = *tva; XXXXXX */ 
+        inp->coda_symlink.VFid = *fid;
+
+	/* Round up to word boundary and null terminate */
+        inp->coda_symlink.srcname = offset;
+        s = ( symlen  & ~0x3 ) + 4; 
+        memcpy((char *)(inp) + offset, symname, symlen);
+        *((char *)inp + offset + symlen) = '\0';
+        
+	/* Round up to word boundary and null terminate */
+        offset += s;
+        inp->coda_symlink.tname = offset;
+        s = (len & ~0x3) + 4;
+        memcpy((char *)(inp) + offset, name, len);
+        *((char *)inp + offset + len) = '\0';
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+
+	CODA_FREE(inp, insize);
+        return error;
+}
+
+int venus_fsync(struct super_block *sb, struct CodaFid *fid)
+{
+        union inputArgs *inp;
+        union outputArgs *outp; 
+	int insize, outsize, error;
+	
+	insize=SIZE(fsync);
+	UPARG(CODA_FSYNC);
+
+	inp->coda_fsync.VFid = *fid;
+	error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs),
+			    &outsize, inp);
+
+	CODA_FREE(inp, insize);
+	return error;
+}
+
+int venus_access(struct super_block *sb, struct CodaFid *fid, int mask)
+{
+        union inputArgs *inp;
+        union outputArgs *outp; 
+	int insize, outsize, error;
+
+	insize = SIZE(access);
+	UPARG(CODA_ACCESS);
+
+        inp->coda_access.VFid = *fid;
+        inp->coda_access.flags = mask;
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
+
+	CODA_FREE(inp, insize);
+	return error;
+}
+
+
+int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
+		 unsigned int cmd, struct PioctlData *data)
+{
+        union inputArgs *inp;
+        union outputArgs *outp;  
+	int insize, outsize, error;
+	int iocsize;
+
+	insize = VC_MAXMSGSIZE;
+	UPARG(CODA_IOCTL);
+
+        /* build packet for Venus */
+        if (data->vi.in_size > VC_MAXDATASIZE) {
+		error = -EINVAL;
+		goto exit;
+        }
+
+        if (data->vi.out_size > VC_MAXDATASIZE) {
+		error = -EINVAL;
+		goto exit;
+	}
+
+        inp->coda_ioctl.VFid = *fid;
+    
+        /* the cmd field was mutated by increasing its size field to
+         * reflect the path and follow args. We need to subtract that
+         * out before sending the command to Venus.  */
+        inp->coda_ioctl.cmd = (cmd & ~(PIOCPARM_MASK << 16));	
+        iocsize = ((cmd >> 16) & PIOCPARM_MASK) - sizeof(char *) - sizeof(int);
+        inp->coda_ioctl.cmd |= (iocsize & PIOCPARM_MASK) <<	16;	
+    
+        /* in->coda_ioctl.rwflag = flag; */
+        inp->coda_ioctl.len = data->vi.in_size;
+        inp->coda_ioctl.data = (char *)(INSIZE(ioctl));
+     
+        /* get the data out of user space */
+	if (copy_from_user((char *)inp + (long)inp->coda_ioctl.data,
+			   data->vi.in, data->vi.in_size)) {
+		error = -EINVAL;
+	        goto exit;
+	}
+
+	error = coda_upcall(coda_vcp(sb), SIZE(ioctl) + data->vi.in_size,
+			    &outsize, inp);
+
+        if (error) {
+		pr_warn("%s: Venus returns: %d for %s\n",
+			__func__, error, coda_f2s(fid));
+		goto exit; 
+	}
+
+	if (outsize < (long)outp->coda_ioctl.data + outp->coda_ioctl.len) {
+		error = -EINVAL;
+		goto exit;
+	}
+        
+	/* Copy out the OUT buffer. */
+        if (outp->coda_ioctl.len > data->vi.out_size) {
+		error = -EINVAL;
+		goto exit;
+        }
+
+	/* Copy out the OUT buffer. */
+	if (copy_to_user(data->vi.out,
+			 (char *)outp + (long)outp->coda_ioctl.data,
+			 outp->coda_ioctl.len)) {
+		error = -EFAULT;
+		goto exit;
+	}
+
+ exit:
+	CODA_FREE(inp, insize);
+	return error;
+}
+
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
+{ 
+        union inputArgs *inp;
+        union outputArgs *outp;
+        int insize, outsize, error;
+        
+	insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs));
+	UPARG(CODA_STATFS);
+
+	error = coda_upcall(coda_vcp(dentry->d_sb), insize, &outsize, inp);
+	if (!error) {
+		sfs->f_blocks = outp->coda_statfs.stat.f_blocks;
+		sfs->f_bfree  = outp->coda_statfs.stat.f_bfree;
+		sfs->f_bavail = outp->coda_statfs.stat.f_bavail;
+		sfs->f_files  = outp->coda_statfs.stat.f_files;
+		sfs->f_ffree  = outp->coda_statfs.stat.f_ffree;
+	}
+
+        CODA_FREE(inp, insize);
+        return error;
+}
+
+/*
+ * coda_upcall and coda_downcall routines.
+ */
+static void coda_block_signals(sigset_t *old)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	*old = current->blocked;
+
+	sigfillset(&current->blocked);
+	sigdelset(&current->blocked, SIGKILL);
+	sigdelset(&current->blocked, SIGSTOP);
+	sigdelset(&current->blocked, SIGINT);
+
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+static void coda_unblock_signals(sigset_t *old)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	current->blocked = *old;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/* Don't allow signals to interrupt the following upcalls before venus
+ * has seen them,
+ * - CODA_CLOSE or CODA_RELEASE upcall  (to avoid reference count problems)
+ * - CODA_STORE				(to avoid data loss)
+ */
+#define CODA_INTERRUPTIBLE(r) (!coda_hard && \
+			       (((r)->uc_opcode != CODA_CLOSE && \
+				 (r)->uc_opcode != CODA_STORE && \
+				 (r)->uc_opcode != CODA_RELEASE) || \
+				(r)->uc_flags & CODA_REQ_READ))
+
+static inline void coda_waitfor_upcall(struct venus_comm *vcp,
+				       struct upc_req *req)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	unsigned long timeout = jiffies + coda_timeout * HZ;
+	sigset_t old;
+	int blocked;
+
+	coda_block_signals(&old);
+	blocked = 1;
+
+	add_wait_queue(&req->uc_sleep, &wait);
+	for (;;) {
+		if (CODA_INTERRUPTIBLE(req))
+			set_current_state(TASK_INTERRUPTIBLE);
+		else
+			set_current_state(TASK_UNINTERRUPTIBLE);
+
+		/* got a reply */
+		if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT))
+			break;
+
+		if (blocked && time_after(jiffies, timeout) &&
+		    CODA_INTERRUPTIBLE(req))
+		{
+			coda_unblock_signals(&old);
+			blocked = 0;
+		}
+
+		if (signal_pending(current)) {
+			list_del(&req->uc_chain);
+			break;
+		}
+
+		mutex_unlock(&vcp->vc_mutex);
+		if (blocked)
+			schedule_timeout(HZ);
+		else
+			schedule();
+		mutex_lock(&vcp->vc_mutex);
+	}
+	if (blocked)
+		coda_unblock_signals(&old);
+
+	remove_wait_queue(&req->uc_sleep, &wait);
+	set_current_state(TASK_RUNNING);
+}
+
+
+/*
+ * coda_upcall will return an error in the case of
+ * failed communication with Venus _or_ will peek at Venus
+ * reply and return Venus' error.
+ *
+ * As venus has 2 types of errors, normal errors (positive) and internal
+ * errors (negative), normal errors are negated, while internal errors
+ * are all mapped to -EINTR, while showing a nice warning message. (jh)
+ */
+static int coda_upcall(struct venus_comm *vcp,
+		       int inSize, int *outSize,
+		       union inputArgs *buffer)
+{
+	union outputArgs *out;
+	union inputArgs *sig_inputArgs;
+	struct upc_req *req = NULL, *sig_req;
+	int error;
+
+	mutex_lock(&vcp->vc_mutex);
+
+	if (!vcp->vc_inuse) {
+		pr_notice("Venus dead, not sending upcall\n");
+		error = -ENXIO;
+		goto exit;
+	}
+
+	/* Format the request message. */
+	req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
+	if (!req) {
+		error = -ENOMEM;
+		goto exit;
+	}
+
+	req->uc_data = (void *)buffer;
+	req->uc_flags = 0;
+	req->uc_inSize = inSize;
+	req->uc_outSize = *outSize ? *outSize : inSize;
+	req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode;
+	req->uc_unique = ++vcp->vc_seq;
+	init_waitqueue_head(&req->uc_sleep);
+
+	/* Fill in the common input args. */
+	((union inputArgs *)buffer)->ih.unique = req->uc_unique;
+
+	/* Append msg to pending queue and poke Venus. */
+	list_add_tail(&req->uc_chain, &vcp->vc_pending);
+
+	wake_up_interruptible(&vcp->vc_waitq);
+	/* We can be interrupted while we wait for Venus to process
+	 * our request.  If the interrupt occurs before Venus has read
+	 * the request, we dequeue and return. If it occurs after the
+	 * read but before the reply, we dequeue, send a signal
+	 * message, and return. If it occurs after the reply we ignore
+	 * it. In no case do we want to restart the syscall.  If it
+	 * was interrupted by a venus shutdown (psdev_close), return
+	 * ENODEV.  */
+
+	/* Go to sleep.  Wake up on signals only after the timeout. */
+	coda_waitfor_upcall(vcp, req);
+
+	/* Op went through, interrupt or not... */
+	if (req->uc_flags & CODA_REQ_WRITE) {
+		out = (union outputArgs *)req->uc_data;
+		/* here we map positive Venus errors to kernel errors */
+		error = -out->oh.result;
+		*outSize = req->uc_outSize;
+		goto exit;
+	}
+
+	error = -EINTR;
+	if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) {
+		pr_warn("Unexpected interruption.\n");
+		goto exit;
+	}
+
+	/* Interrupted before venus read it. */
+	if (!(req->uc_flags & CODA_REQ_READ))
+		goto exit;
+
+	/* Venus saw the upcall, make sure we can send interrupt signal */
+	if (!vcp->vc_inuse) {
+		pr_info("Venus dead, not sending signal.\n");
+		goto exit;
+	}
+
+	error = -ENOMEM;
+	sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
+	if (!sig_req) goto exit;
+
+	CODA_ALLOC((sig_req->uc_data), char *, sizeof(struct coda_in_hdr));
+	if (!sig_req->uc_data) {
+		kfree(sig_req);
+		goto exit;
+	}
+
+	error = -EINTR;
+	sig_inputArgs = (union inputArgs *)sig_req->uc_data;
+	sig_inputArgs->ih.opcode = CODA_SIGNAL;
+	sig_inputArgs->ih.unique = req->uc_unique;
+
+	sig_req->uc_flags = CODA_REQ_ASYNC;
+	sig_req->uc_opcode = sig_inputArgs->ih.opcode;
+	sig_req->uc_unique = sig_inputArgs->ih.unique;
+	sig_req->uc_inSize = sizeof(struct coda_in_hdr);
+	sig_req->uc_outSize = sizeof(struct coda_in_hdr);
+
+	/* insert at head of queue! */
+	list_add(&(sig_req->uc_chain), &vcp->vc_pending);
+	wake_up_interruptible(&vcp->vc_waitq);
+
+exit:
+	kfree(req);
+	mutex_unlock(&vcp->vc_mutex);
+	return error;
+}
+
+/*  
+    The statements below are part of the Coda opportunistic
+    programming -- taken from the Mach/BSD kernel code for Coda. 
+    You don't get correct semantics by stating what needs to be
+    done without guaranteeing the invariants needed for it to happen.
+    When will be have time to find out what exactly is going on?  (pjb)
+*/
+
+
+/* 
+ * There are 7 cases where cache invalidations occur.  The semantics
+ *  of each is listed here:
+ *
+ * CODA_FLUSH     -- flush all entries from the name cache and the cnode cache.
+ * CODA_PURGEUSER -- flush all entries from the name cache for a specific user
+ *                  This call is a result of token expiration.
+ *
+ * The next arise as the result of callbacks on a file or directory.
+ * CODA_ZAPFILE   -- flush the cached attributes for a file.
+
+ * CODA_ZAPDIR    -- flush the attributes for the dir and
+ *                  force a new lookup for all the children
+                    of this dir.
+
+ *
+ * The next is a result of Venus detecting an inconsistent file.
+ * CODA_PURGEFID  -- flush the attribute for the file
+ *                  purge it and its children from the dcache
+ *
+ * The last  allows Venus to replace local fids with global ones
+ * during reintegration.
+ *
+ * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
+
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
+{
+	struct inode *inode = NULL;
+	struct CodaFid *fid = NULL, *newfid;
+	struct super_block *sb;
+
+	/* Handle invalidation requests. */
+	mutex_lock(&vcp->vc_mutex);
+	sb = vcp->vc_sb;
+	if (!sb || !sb->s_root)
+		goto unlock_out;
+
+	switch (opcode) {
+	case CODA_FLUSH:
+		coda_cache_clear_all(sb);
+		shrink_dcache_sb(sb);
+		if (d_really_is_positive(sb->s_root))
+			coda_flag_inode(d_inode(sb->s_root), C_FLUSH);
+		break;
+
+	case CODA_PURGEUSER:
+		coda_cache_clear_all(sb);
+		break;
+
+	case CODA_ZAPDIR:
+		fid = &out->coda_zapdir.CodaFid;
+		break;
+
+	case CODA_ZAPFILE:
+		fid = &out->coda_zapfile.CodaFid;
+		break;
+
+	case CODA_PURGEFID:
+		fid = &out->coda_purgefid.CodaFid;
+		break;
+
+	case CODA_REPLACE:
+		fid = &out->coda_replace.OldFid;
+		break;
+	}
+	if (fid)
+		inode = coda_fid_to_inode(fid, sb);
+
+unlock_out:
+	mutex_unlock(&vcp->vc_mutex);
+
+	if (!inode)
+		return 0;
+
+	switch (opcode) {
+	case CODA_ZAPDIR:
+		coda_flag_inode_children(inode, C_PURGE);
+		coda_flag_inode(inode, C_VATTR);
+		break;
+
+	case CODA_ZAPFILE:
+		coda_flag_inode(inode, C_VATTR);
+		break;
+
+	case CODA_PURGEFID:
+		coda_flag_inode_children(inode, C_PURGE);
+
+		/* catch the dentries later if some are still busy */
+		coda_flag_inode(inode, C_PURGE);
+		d_prune_aliases(inode);
+		break;
+
+	case CODA_REPLACE:
+		newfid = &out->coda_replace.NewFid;
+		coda_replace_fid(inode, fid, newfid);
+		break;
+	}
+	iput(inode);
+	return 0;
+}
+
diff --git a/kernel/fs/compat.c b/kernel/fs/compat.c
new file mode 100644
index 000000000..6fd272d45
--- /dev/null
+++ b/kernel/fs/compat.c
@@ -0,0 +1,1481 @@
+/*
+ *  linux/fs/compat.c
+ *
+ *  Kernel compatibililty routines for e.g. 32 bit syscall support
+ *  on 64 bit kernels.
+ *
+ *  Copyright (C) 2002       Stephen Rothwell, IBM Corporation
+ *  Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
+ *  Copyright (C) 1998       Eddie C. Dost  (ecd@skynet.be)
+ *  Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
+ *  Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/fcntl.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/vfs.h>
+#include <linux/ioctl.h>
+#include <linux/init.h>
+#include <linux/ncp_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/syscalls.h>
+#include <linux/ctype.h>
+#include <linux/dirent.h>
+#include <linux/fsnotify.h>
+#include <linux/highuid.h>
+#include <linux/personality.h>
+#include <linux/rwsem.h>
+#include <linux/tsacct_kern.h>
+#include <linux/security.h>
+#include <linux/highmem.h>
+#include <linux/signal.h>
+#include <linux/poll.h>
+#include <linux/mm.h>
+#include <linux/fs_struct.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/aio.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/ioctls.h>
+#include "internal.h"
+
+int compat_log = 1;
+
+int compat_printk(const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	if (!compat_log)
+		return 0;
+	va_start(ap, fmt);
+	ret = vprintk(fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
+/*
+ * Not all architectures have sys_utime, so implement this in terms
+ * of sys_utimes.
+ */
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+		       struct compat_utimbuf __user *, t)
+{
+	struct timespec tv[2];
+
+	if (t) {
+		if (get_user(tv[0].tv_sec, &t->actime) ||
+		    get_user(tv[1].tv_sec, &t->modtime))
+			return -EFAULT;
+		tv[0].tv_nsec = 0;
+		tv[1].tv_nsec = 0;
+	}
+	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
+}
+
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
+{
+	struct timespec tv[2];
+
+	if  (t) {
+		if (compat_get_timespec(&tv[0], &t[0]) ||
+		    compat_get_timespec(&tv[1], &t[1]))
+			return -EFAULT;
+
+		if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
+			return 0;
+	}
+	return do_utimes(dfd, filename, t ? tv : NULL, flags);
+}
+
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
+{
+	struct timespec tv[2];
+
+	if (t) {
+		if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
+		    get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
+		    get_user(tv[1].tv_sec, &t[1].tv_sec) ||
+		    get_user(tv[1].tv_nsec, &t[1].tv_usec))
+			return -EFAULT;
+		if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 ||
+		    tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0)
+			return -EINVAL;
+		tv[0].tv_nsec *= 1000;
+		tv[1].tv_nsec *= 1000;
+	}
+	return do_utimes(dfd, filename, t ? tv : NULL, 0);
+}
+
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
+{
+	return compat_sys_futimesat(AT_FDCWD, filename, t);
+}
+
+static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
+{
+	struct compat_stat tmp;
+
+	if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+		return -EOVERFLOW;
+
+	memset(&tmp, 0, sizeof(tmp));
+	tmp.st_dev = old_encode_dev(stat->dev);
+	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+		return -EOVERFLOW;
+	tmp.st_mode = stat->mode;
+	tmp.st_nlink = stat->nlink;
+	if (tmp.st_nlink != stat->nlink)
+		return -EOVERFLOW;
+	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+	tmp.st_rdev = old_encode_dev(stat->rdev);
+	if ((u64) stat->size > MAX_NON_LFS)
+		return -EOVERFLOW;
+	tmp.st_size = stat->size;
+	tmp.st_atime = stat->atime.tv_sec;
+	tmp.st_atime_nsec = stat->atime.tv_nsec;
+	tmp.st_mtime = stat->mtime.tv_sec;
+	tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+	tmp.st_ctime = stat->ctime.tv_sec;
+	tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+	tmp.st_blocks = stat->blocks;
+	tmp.st_blksize = stat->blksize;
+	return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_stat(filename, &stat);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
+}
+
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
+}
+
+#ifndef __ARCH_WANT_STAT64
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+		       const char __user *, filename,
+		       struct compat_stat __user *, statbuf, int, flag)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+		       struct compat_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error = vfs_fstat(fd, &stat);
+
+	if (!error)
+		error = cp_compat_stat(&stat, statbuf);
+	return error;
+}
+
+static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
+{
+	
+	if (sizeof ubuf->f_blocks == 4) {
+		if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+		     kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+			return -EOVERFLOW;
+		/* f_files and f_ffree may be -1; it's okay
+		 * to stuff that into 32 bits */
+		if (kbuf->f_files != 0xffffffffffffffffULL
+		 && (kbuf->f_files & 0xffffffff00000000ULL))
+			return -EOVERFLOW;
+		if (kbuf->f_ffree != 0xffffffffffffffffULL
+		 && (kbuf->f_ffree & 0xffffffff00000000ULL))
+			return -EOVERFLOW;
+	}
+	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
+	    __put_user(kbuf->f_type, &ubuf->f_type) ||
+	    __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
+	    __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
+	    __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
+	    __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
+	    __put_user(kbuf->f_files, &ubuf->f_files) ||
+	    __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
+	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
+	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
+	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
+	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * The following statfs calls are copies of code from fs/statfs.c and
+ * should be checked against those from time to time
+ */
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
+{
+	struct kstatfs tmp;
+	int error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs(buf, &tmp);
+	return error;
+}
+
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
+{
+	struct kstatfs tmp;
+	int error = fd_statfs(fd, &tmp);
+	if (!error)
+		error = put_compat_statfs(buf, &tmp);
+	return error;
+}
+
+static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
+{
+	if (sizeof ubuf->f_blocks == 4) {
+		if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+		     kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+			return -EOVERFLOW;
+		/* f_files and f_ffree may be -1; it's okay
+		 * to stuff that into 32 bits */
+		if (kbuf->f_files != 0xffffffffffffffffULL
+		 && (kbuf->f_files & 0xffffffff00000000ULL))
+			return -EOVERFLOW;
+		if (kbuf->f_ffree != 0xffffffffffffffffULL
+		 && (kbuf->f_ffree & 0xffffffff00000000ULL))
+			return -EOVERFLOW;
+	}
+	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
+	    __put_user(kbuf->f_type, &ubuf->f_type) ||
+	    __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
+	    __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
+	    __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
+	    __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
+	    __put_user(kbuf->f_files, &ubuf->f_files) ||
+	    __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
+	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
+	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
+	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
+	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+		return -EFAULT;
+	return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+	struct kstatfs tmp;
+	int error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+
+	error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs64(buf, &tmp);
+	return error;
+}
+
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+	struct kstatfs tmp;
+	int error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+
+	error = fd_statfs(fd, &tmp);
+	if (!error)
+		error = put_compat_statfs64(buf, &tmp);
+	return error;
+}
+
+/*
+ * This is a copy of sys_ustat, just dealing with a structure layout.
+ * Given how simple this syscall is that apporach is more maintainable
+ * than the various conversion hacks.
+ */
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
+{
+	struct compat_ustat tmp;
+	struct kstatfs sbuf;
+	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	if (err)
+		return err;
+
+	memset(&tmp, 0, sizeof(struct compat_ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+	if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
+		return -EFAULT;
+	return 0;
+}
+
+static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+{
+	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
+	    __get_user(kfl->l_type, &ufl->l_type) ||
+	    __get_user(kfl->l_whence, &ufl->l_whence) ||
+	    __get_user(kfl->l_start, &ufl->l_start) ||
+	    __get_user(kfl->l_len, &ufl->l_len) ||
+	    __get_user(kfl->l_pid, &ufl->l_pid))
+		return -EFAULT;
+	return 0;
+}
+
+static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+{
+	if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
+	    __put_user(kfl->l_type, &ufl->l_type) ||
+	    __put_user(kfl->l_whence, &ufl->l_whence) ||
+	    __put_user(kfl->l_start, &ufl->l_start) ||
+	    __put_user(kfl->l_len, &ufl->l_len) ||
+	    __put_user(kfl->l_pid, &ufl->l_pid))
+		return -EFAULT;
+	return 0;
+}
+
+#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
+static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+{
+	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
+	    __get_user(kfl->l_type, &ufl->l_type) ||
+	    __get_user(kfl->l_whence, &ufl->l_whence) ||
+	    __get_user(kfl->l_start, &ufl->l_start) ||
+	    __get_user(kfl->l_len, &ufl->l_len) ||
+	    __get_user(kfl->l_pid, &ufl->l_pid))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
+static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+{
+	if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
+	    __put_user(kfl->l_type, &ufl->l_type) ||
+	    __put_user(kfl->l_whence, &ufl->l_whence) ||
+	    __put_user(kfl->l_start, &ufl->l_start) ||
+	    __put_user(kfl->l_len, &ufl->l_len) ||
+	    __put_user(kfl->l_pid, &ufl->l_pid))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+	switch (cmd) {
+	case F_GETLK64:
+		return F_GETLK;
+	case F_SETLK64:
+		return F_SETLK;
+	case F_SETLKW64:
+		return F_SETLKW;
+	}
+
+	return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
+{
+	mm_segment_t old_fs;
+	struct flock f;
+	long ret;
+	unsigned int conv_cmd;
+
+	switch (cmd) {
+	case F_GETLK:
+	case F_SETLK:
+	case F_SETLKW:
+		ret = get_compat_flock(&f, compat_ptr(arg));
+		if (ret != 0)
+			break;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		ret = sys_fcntl(fd, cmd, (unsigned long)&f);
+		set_fs(old_fs);
+		if (cmd == F_GETLK && ret == 0) {
+			/* GETLK was successful and we need to return the data...
+			 * but it needs to fit in the compat structure.
+			 * l_start shouldn't be too big, unless the original
+			 * start + end is greater than COMPAT_OFF_T_MAX, in which
+			 * case the app was asking for trouble, so we return
+			 * -EOVERFLOW in that case.
+			 * l_len could be too big, in which case we just truncate it,
+			 * and only allow the app to see that part of the conflicting
+			 * lock that might make sense to it anyway
+			 */
+
+			if (f.l_start > COMPAT_OFF_T_MAX)
+				ret = -EOVERFLOW;
+			if (f.l_len > COMPAT_OFF_T_MAX)
+				f.l_len = COMPAT_OFF_T_MAX;
+			if (ret == 0)
+				ret = put_compat_flock(&f, compat_ptr(arg));
+		}
+		break;
+
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_OFD_GETLK:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
+		ret = get_compat_flock64(&f, compat_ptr(arg));
+		if (ret != 0)
+			break;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		conv_cmd = convert_fcntl_cmd(cmd);
+		ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
+		set_fs(old_fs);
+		if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
+			/* need to return lock information - see above for commentary */
+			if (f.l_start > COMPAT_LOFF_T_MAX)
+				ret = -EOVERFLOW;
+			if (f.l_len > COMPAT_LOFF_T_MAX)
+				f.l_len = COMPAT_LOFF_T_MAX;
+			if (ret == 0)
+				ret = put_compat_flock64(&f, compat_ptr(arg));
+		}
+		break;
+
+	default:
+		ret = sys_fcntl(fd, cmd, arg);
+		break;
+	}
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
+{
+	switch (cmd) {
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_OFD_GETLK:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
+		return -EINVAL;
+	}
+	return compat_sys_fcntl64(fd, cmd, arg);
+}
+
+COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p)
+{
+	long ret;
+	aio_context_t ctx64;
+
+	mm_segment_t oldfs = get_fs();
+	if (unlikely(get_user(ctx64, ctx32p)))
+		return -EFAULT;
+
+	set_fs(KERNEL_DS);
+	/* The __user pointer cast is valid because of the set_fs() */
+	ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64);
+	set_fs(oldfs);
+	/* truncating is ok because it's a user address */
+	if (!ret)
+		ret = put_user((u32) ctx64, ctx32p);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
+		       compat_long_t, min_nr,
+		       compat_long_t, nr,
+		       struct io_event __user *, events,
+		       struct compat_timespec __user *, timeout)
+{
+	struct timespec t;
+	struct timespec __user *ut = NULL;
+
+	if (timeout) {
+		if (compat_get_timespec(&t, timeout))
+			return -EFAULT;
+
+		ut = compat_alloc_user_space(sizeof(*ut));
+		if (copy_to_user(ut, &t, sizeof(t)) )
+			return -EFAULT;
+	} 
+	return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
+}
+
+/* A write operation does a read from user space and vice versa */
+#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+
+ssize_t compat_rw_copy_check_uvector(int type,
+		const struct compat_iovec __user *uvector, unsigned long nr_segs,
+		unsigned long fast_segs, struct iovec *fast_pointer,
+		struct iovec **ret_pointer)
+{
+	compat_ssize_t tot_len;
+	struct iovec *iov = *ret_pointer = fast_pointer;
+	ssize_t ret = 0;
+	int seg;
+
+	/*
+	 * SuS says "The readv() function *may* fail if the iovcnt argument
+	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+	 * traditionally returned zero for zero segments, so...
+	 */
+	if (nr_segs == 0)
+		goto out;
+
+	ret = -EINVAL;
+	if (nr_segs > UIO_MAXIOV || nr_segs < 0)
+		goto out;
+	if (nr_segs > fast_segs) {
+		ret = -ENOMEM;
+		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+		if (iov == NULL)
+			goto out;
+	}
+	*ret_pointer = iov;
+
+	ret = -EFAULT;
+	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+		goto out;
+
+	/*
+	 * Single unix specification:
+	 * We should -EINVAL if an element length is not >= 0 and fitting an
+	 * ssize_t.
+	 *
+	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
+	 * no overflow possibility.
+	 */
+	tot_len = 0;
+	ret = -EINVAL;
+	for (seg = 0; seg < nr_segs; seg++) {
+		compat_uptr_t buf;
+		compat_ssize_t len;
+
+		if (__get_user(len, &uvector->iov_len) ||
+		   __get_user(buf, &uvector->iov_base)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
+			goto out;
+		if (type >= 0 &&
+		    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (len > MAX_RW_COUNT - tot_len)
+			len = MAX_RW_COUNT - tot_len;
+		tot_len += len;
+		iov->iov_base = compat_ptr(buf);
+		iov->iov_len = (compat_size_t) len;
+		uvector++;
+		iov++;
+	}
+	ret = tot_len;
+
+out:
+	return ret;
+}
+
+static inline long
+copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
+{
+	compat_uptr_t uptr;
+	int i;
+
+	for (i = 0; i < nr; ++i) {
+		if (get_user(uptr, ptr32 + i))
+			return -EFAULT;
+		if (put_user(compat_ptr(uptr), ptr64 + i))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+#define MAX_AIO_SUBMITS 	(PAGE_SIZE/sizeof(struct iocb *))
+
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+		       int, nr, u32 __user *, iocb)
+{
+	struct iocb __user * __user *iocb64; 
+	long ret;
+
+	if (unlikely(nr < 0))
+		return -EINVAL;
+
+	if (nr > MAX_AIO_SUBMITS)
+		nr = MAX_AIO_SUBMITS;
+	
+	iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
+	ret = copy_iocb(nr, iocb, iocb64);
+	if (!ret)
+		ret = do_io_submit(ctx_id, nr, iocb64, 1);
+	return ret;
+}
+
+struct compat_ncp_mount_data {
+	compat_int_t version;
+	compat_uint_t ncp_fd;
+	__compat_uid_t mounted_uid;
+	compat_pid_t wdog_pid;
+	unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
+	compat_uint_t time_out;
+	compat_uint_t retry_count;
+	compat_uint_t flags;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
+	compat_mode_t file_mode;
+	compat_mode_t dir_mode;
+};
+
+struct compat_ncp_mount_data_v4 {
+	compat_int_t version;
+	compat_ulong_t flags;
+	compat_ulong_t mounted_uid;
+	compat_long_t wdog_pid;
+	compat_uint_t ncp_fd;
+	compat_uint_t time_out;
+	compat_uint_t retry_count;
+	compat_ulong_t uid;
+	compat_ulong_t gid;
+	compat_ulong_t file_mode;
+	compat_ulong_t dir_mode;
+};
+
+static void *do_ncp_super_data_conv(void *raw_data)
+{
+	int version = *(unsigned int *)raw_data;
+
+	if (version == 3) {
+		struct compat_ncp_mount_data *c_n = raw_data;
+		struct ncp_mount_data *n = raw_data;
+
+		n->dir_mode = c_n->dir_mode;
+		n->file_mode = c_n->file_mode;
+		n->gid = c_n->gid;
+		n->uid = c_n->uid;
+		memmove (n->mounted_vol, c_n->mounted_vol, (sizeof (c_n->mounted_vol) + 3 * sizeof (unsigned int)));
+		n->wdog_pid = c_n->wdog_pid;
+		n->mounted_uid = c_n->mounted_uid;
+	} else if (version == 4) {
+		struct compat_ncp_mount_data_v4 *c_n = raw_data;
+		struct ncp_mount_data_v4 *n = raw_data;
+
+		n->dir_mode = c_n->dir_mode;
+		n->file_mode = c_n->file_mode;
+		n->gid = c_n->gid;
+		n->uid = c_n->uid;
+		n->retry_count = c_n->retry_count;
+		n->time_out = c_n->time_out;
+		n->ncp_fd = c_n->ncp_fd;
+		n->wdog_pid = c_n->wdog_pid;
+		n->mounted_uid = c_n->mounted_uid;
+		n->flags = c_n->flags;
+	} else if (version != 5) {
+		return NULL;
+	}
+
+	return raw_data;
+}
+
+
+struct compat_nfs_string {
+	compat_uint_t len;
+	compat_uptr_t data;
+};
+
+static inline void compat_nfs_string(struct nfs_string *dst,
+				     struct compat_nfs_string *src)
+{
+	dst->data = compat_ptr(src->data);
+	dst->len = src->len;
+}
+
+struct compat_nfs4_mount_data_v1 {
+	compat_int_t version;
+	compat_int_t flags;
+	compat_int_t rsize;
+	compat_int_t wsize;
+	compat_int_t timeo;
+	compat_int_t retrans;
+	compat_int_t acregmin;
+	compat_int_t acregmax;
+	compat_int_t acdirmin;
+	compat_int_t acdirmax;
+	struct compat_nfs_string client_addr;
+	struct compat_nfs_string mnt_path;
+	struct compat_nfs_string hostname;
+	compat_uint_t host_addrlen;
+	compat_uptr_t host_addr;
+	compat_int_t proto;
+	compat_int_t auth_flavourlen;
+	compat_uptr_t auth_flavours;
+};
+
+static int do_nfs4_super_data_conv(void *raw_data)
+{
+	int version = *(compat_uint_t *) raw_data;
+
+	if (version == 1) {
+		struct compat_nfs4_mount_data_v1 *raw = raw_data;
+		struct nfs4_mount_data *real = raw_data;
+
+		/* copy the fields backwards */
+		real->auth_flavours = compat_ptr(raw->auth_flavours);
+		real->auth_flavourlen = raw->auth_flavourlen;
+		real->proto = raw->proto;
+		real->host_addr = compat_ptr(raw->host_addr);
+		real->host_addrlen = raw->host_addrlen;
+		compat_nfs_string(&real->hostname, &raw->hostname);
+		compat_nfs_string(&real->mnt_path, &raw->mnt_path);
+		compat_nfs_string(&real->client_addr, &raw->client_addr);
+		real->acdirmax = raw->acdirmax;
+		real->acdirmin = raw->acdirmin;
+		real->acregmax = raw->acregmax;
+		real->acregmin = raw->acregmin;
+		real->retrans = raw->retrans;
+		real->timeo = raw->timeo;
+		real->wsize = raw->wsize;
+		real->rsize = raw->rsize;
+		real->flags = raw->flags;
+		real->version = raw->version;
+	}
+
+	return 0;
+}
+
+#define NCPFS_NAME      "ncpfs"
+#define NFS4_NAME	"nfs4"
+
+COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
+		       const char __user *, dir_name,
+		       const char __user *, type, compat_ulong_t, flags,
+		       const void __user *, data)
+{
+	char *kernel_type;
+	unsigned long data_page;
+	char *kernel_dev;
+	int retval;
+
+	kernel_type = copy_mount_string(type);
+	retval = PTR_ERR(kernel_type);
+	if (IS_ERR(kernel_type))
+		goto out;
+
+	kernel_dev = copy_mount_string(dev_name);
+	retval = PTR_ERR(kernel_dev);
+	if (IS_ERR(kernel_dev))
+		goto out1;
+
+	retval = copy_mount_options(data, &data_page);
+	if (retval < 0)
+		goto out2;
+
+	retval = -EINVAL;
+
+	if (kernel_type && data_page) {
+		if (!strcmp(kernel_type, NCPFS_NAME)) {
+			do_ncp_super_data_conv((void *)data_page);
+		} else if (!strcmp(kernel_type, NFS4_NAME)) {
+			if (do_nfs4_super_data_conv((void *) data_page))
+				goto out3;
+		}
+	}
+
+	retval = do_mount(kernel_dev, dir_name, kernel_type,
+			flags, (void*)data_page);
+
+ out3:
+	free_page(data_page);
+ out2:
+	kfree(kernel_dev);
+ out1:
+	kfree(kernel_type);
+ out:
+	return retval;
+}
+
+struct compat_old_linux_dirent {
+	compat_ulong_t	d_ino;
+	compat_ulong_t	d_offset;
+	unsigned short	d_namlen;
+	char		d_name[1];
+};
+
+struct compat_readdir_callback {
+	struct dir_context ctx;
+	struct compat_old_linux_dirent __user *dirent;
+	int result;
+};
+
+static int compat_fillonedir(struct dir_context *ctx, const char *name,
+			     int namlen, loff_t offset, u64 ino,
+			     unsigned int d_type)
+{
+	struct compat_readdir_callback *buf =
+		container_of(ctx, struct compat_readdir_callback, ctx);
+	struct compat_old_linux_dirent __user *dirent;
+	compat_ulong_t d_ino;
+
+	if (buf->result)
+		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->result = -EOVERFLOW;
+		return -EOVERFLOW;
+	}
+	buf->result++;
+	dirent = buf->dirent;
+	if (!access_ok(VERIFY_WRITE, dirent,
+			(unsigned long)(dirent->d_name + namlen + 1) -
+				(unsigned long)dirent))
+		goto efault;
+	if (	__put_user(d_ino, &dirent->d_ino) ||
+		__put_user(offset, &dirent->d_offset) ||
+		__put_user(namlen, &dirent->d_namlen) ||
+		__copy_to_user(dirent->d_name, name, namlen) ||
+		__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	return 0;
+efault:
+	buf->result = -EFAULT;
+	return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+		struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
+{
+	int error;
+	struct fd f = fdget(fd);
+	struct compat_readdir_callback buf = {
+		.ctx.actor = compat_fillonedir,
+		.dirent = dirent
+	};
+
+	if (!f.file)
+		return -EBADF;
+
+	error = iterate_dir(f.file, &buf.ctx);
+	if (buf.result)
+		error = buf.result;
+
+	fdput(f);
+	return error;
+}
+
+struct compat_linux_dirent {
+	compat_ulong_t	d_ino;
+	compat_ulong_t	d_off;
+	unsigned short	d_reclen;
+	char		d_name[1];
+};
+
+struct compat_getdents_callback {
+	struct dir_context ctx;
+	struct compat_linux_dirent __user *current_dir;
+	struct compat_linux_dirent __user *previous;
+	int count;
+	int error;
+};
+
+static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
+		loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct compat_linux_dirent __user * dirent;
+	struct compat_getdents_callback *buf =
+		container_of(ctx, struct compat_getdents_callback, ctx);
+	compat_ulong_t d_ino;
+	int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
+		namlen + 2, sizeof(compat_long_t));
+
+	buf->error = -EINVAL;	/* only used if we fail.. */
+	if (reclen > buf->count)
+		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->error = -EOVERFLOW;
+		return -EOVERFLOW;
+	}
+	dirent = buf->previous;
+	if (dirent) {
+		if (__put_user(offset, &dirent->d_off))
+			goto efault;
+	}
+	dirent = buf->current_dir;
+	if (__put_user(d_ino, &dirent->d_ino))
+		goto efault;
+	if (__put_user(reclen, &dirent->d_reclen))
+		goto efault;
+	if (copy_to_user(dirent->d_name, name, namlen))
+		goto efault;
+	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	if (__put_user(d_type, (char  __user *) dirent + reclen - 1))
+		goto efault;
+	buf->previous = dirent;
+	dirent = (void __user *)dirent + reclen;
+	buf->current_dir = dirent;
+	buf->count -= reclen;
+	return 0;
+efault:
+	buf->error = -EFAULT;
+	return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+		struct compat_linux_dirent __user *, dirent, unsigned int, count)
+{
+	struct fd f;
+	struct compat_linux_dirent __user * lastdirent;
+	struct compat_getdents_callback buf = {
+		.ctx.actor = compat_filldir,
+		.current_dir = dirent,
+		.count = count
+	};
+	int error;
+
+	if (!access_ok(VERIFY_WRITE, dirent, count))
+		return -EFAULT;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	error = iterate_dir(f.file, &buf.ctx);
+	if (error >= 0)
+		error = buf.error;
+	lastdirent = buf.previous;
+	if (lastdirent) {
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
+			error = -EFAULT;
+		else
+			error = count - buf.count;
+	}
+	fdput(f);
+	return error;
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64
+
+struct compat_getdents_callback64 {
+	struct dir_context ctx;
+	struct linux_dirent64 __user *current_dir;
+	struct linux_dirent64 __user *previous;
+	int count;
+	int error;
+};
+
+static int compat_filldir64(struct dir_context *ctx, const char *name,
+			    int namlen, loff_t offset, u64 ino,
+			    unsigned int d_type)
+{
+	struct linux_dirent64 __user *dirent;
+	struct compat_getdents_callback64 *buf =
+		container_of(ctx, struct compat_getdents_callback64, ctx);
+	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
+		sizeof(u64));
+	u64 off;
+
+	buf->error = -EINVAL;	/* only used if we fail.. */
+	if (reclen > buf->count)
+		return -EINVAL;
+	dirent = buf->previous;
+
+	if (dirent) {
+		if (__put_user_unaligned(offset, &dirent->d_off))
+			goto efault;
+	}
+	dirent = buf->current_dir;
+	if (__put_user_unaligned(ino, &dirent->d_ino))
+		goto efault;
+	off = 0;
+	if (__put_user_unaligned(off, &dirent->d_off))
+		goto efault;
+	if (__put_user(reclen, &dirent->d_reclen))
+		goto efault;
+	if (__put_user(d_type, &dirent->d_type))
+		goto efault;
+	if (copy_to_user(dirent->d_name, name, namlen))
+		goto efault;
+	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	buf->previous = dirent;
+	dirent = (void __user *)dirent + reclen;
+	buf->current_dir = dirent;
+	buf->count -= reclen;
+	return 0;
+efault:
+	buf->error = -EFAULT;
+	return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+		struct linux_dirent64 __user *, dirent, unsigned int, count)
+{
+	struct fd f;
+	struct linux_dirent64 __user * lastdirent;
+	struct compat_getdents_callback64 buf = {
+		.ctx.actor = compat_filldir64,
+		.current_dir = dirent,
+		.count = count
+	};
+	int error;
+
+	if (!access_ok(VERIFY_WRITE, dirent, count))
+		return -EFAULT;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	error = iterate_dir(f.file, &buf.ctx);
+	if (error >= 0)
+		error = buf.error;
+	lastdirent = buf.previous;
+	if (lastdirent) {
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
+		if (__put_user_unaligned(d_off, &lastdirent->d_off))
+			error = -EFAULT;
+		else
+			error = count - buf.count;
+	}
+	fdput(f);
+	return error;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
+
+/*
+ * Exactly like fs/open.c:sys_open(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+{
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
+}
+
+/*
+ * Exactly like fs/open.c:sys_openat(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
+{
+	return do_sys_open(dfd, filename, flags, mode);
+}
+
+#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
+
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec ts;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&ts);
+	ts = timespec_sub(*end_time, ts);
+	if (ts.tv_sec < 0)
+		ts.tv_sec = ts.tv_nsec = 0;
+
+	if (timeval) {
+		struct compat_timeval rtv;
+
+		rtv.tv_sec = ts.tv_sec;
+		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+	} else {
+		struct compat_timespec rts;
+
+		rts.tv_sec = ts.tv_sec;
+		rts.tv_nsec = ts.tv_nsec;
+
+		if (!copy_to_user(p, &rts, sizeof(rts)))
+			return ret;
+	}
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
+/*
+ * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
+ * 64-bit unsigned longs.
+ */
+static
+int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+			unsigned long *fdset)
+{
+	nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
+	if (ufdset) {
+		unsigned long odd;
+
+		if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t)))
+			return -EFAULT;
+
+		odd = nr & 1UL;
+		nr &= ~1UL;
+		while (nr) {
+			unsigned long h, l;
+			if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
+				return -EFAULT;
+			ufdset += 2;
+			*fdset++ = h << 32 | l;
+			nr -= 2;
+		}
+		if (odd && __get_user(*fdset, ufdset))
+			return -EFAULT;
+	} else {
+		/* Tricky, must clear full unsigned long in the
+		 * kernel fdset at the end, this makes sure that
+		 * actually happens.
+		 */
+		memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t));
+	}
+	return 0;
+}
+
+static
+int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+		      unsigned long *fdset)
+{
+	unsigned long odd;
+	nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
+
+	if (!ufdset)
+		return 0;
+
+	odd = nr & 1UL;
+	nr &= ~1UL;
+	while (nr) {
+		unsigned long h, l;
+		l = *fdset++;
+		h = l >> 32;
+		if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
+			return -EFAULT;
+		ufdset += 2;
+		nr -= 2;
+	}
+	if (odd && __put_user(*fdset, ufdset))
+		return -EFAULT;
+	return 0;
+}
+
+
+/*
+ * This is a virtual copy of sys_select from fs/select.c and probably
+ * should be compared to it from time to time
+ */
+
+/*
+ * We can actually return ERESTARTSYS instead of EINTR, but I'd
+ * like to be certain this leads to no problems. So I return
+ * EINTR just for safety.
+ *
+ * Update: ERESTARTSYS breaks at least the xview clock binary, so
+ * I'm trying ERESTARTNOHAND which restart only when you want to.
+ */
+int compat_core_sys_select(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct timespec *end_time)
+{
+	fd_set_bits fds;
+	void *bits;
+	int size, max_fds, ret = -EINVAL;
+	struct fdtable *fdt;
+	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
+
+	if (n < 0)
+		goto out_nofds;
+
+	/* max_fds can increase, so grab it once to avoid race */
+	rcu_read_lock();
+	fdt = files_fdtable(current->files);
+	max_fds = fdt->max_fds;
+	rcu_read_unlock();
+	if (n > max_fds)
+		n = max_fds;
+
+	/*
+	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
+	 * since we used fdset we need to allocate memory in units of
+	 * long-words.
+	 */
+	size = FDS_BYTES(n);
+	bits = stack_fds;
+	if (size > sizeof(stack_fds) / 6) {
+		bits = kmalloc(6 * size, GFP_KERNEL);
+		ret = -ENOMEM;
+		if (!bits)
+			goto out_nofds;
+	}
+	fds.in      = (unsigned long *)  bits;
+	fds.out     = (unsigned long *) (bits +   size);
+	fds.ex      = (unsigned long *) (bits + 2*size);
+	fds.res_in  = (unsigned long *) (bits + 3*size);
+	fds.res_out = (unsigned long *) (bits + 4*size);
+	fds.res_ex  = (unsigned long *) (bits + 5*size);
+
+	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
+	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
+	    (ret = compat_get_fd_set(n, exp, fds.ex)))
+		goto out;
+	zero_fd_set(n, fds.res_in);
+	zero_fd_set(n, fds.res_out);
+	zero_fd_set(n, fds.res_ex);
+
+	ret = do_select(n, &fds, end_time);
+
+	if (ret < 0)
+		goto out;
+	if (!ret) {
+		ret = -ERESTARTNOHAND;
+		if (signal_pending(current))
+			goto out;
+		ret = 0;
+	}
+
+	if (compat_set_fd_set(n, inp, fds.res_in) ||
+	    compat_set_fd_set(n, outp, fds.res_out) ||
+	    compat_set_fd_set(n, exp, fds.res_ex))
+		ret = -EFAULT;
+out:
+	if (bits != stack_fds)
+		kfree(bits);
+out_nofds:
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timeval __user *, tvp)
+{
+	struct timespec end_time, *to = NULL;
+	struct compat_timeval tv;
+	int ret;
+
+	if (tvp) {
+		if (copy_from_user(&tv, tvp, sizeof(tv)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to,
+				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
+			return -EINVAL;
+	}
+
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
+
+	return ret;
+}
+
+struct compat_sel_arg_struct {
+	compat_ulong_t n;
+	compat_uptr_t inp;
+	compat_uptr_t outp;
+	compat_uptr_t exp;
+	compat_uptr_t tvp;
+};
+
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
+{
+	struct compat_sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+				 compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
+static long do_compat_pselect(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
+	compat_size_t sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t ksigmask, sigsaved;
+	struct compat_timespec ts;
+	struct timespec end_time, *to = NULL;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
+	}
+
+	if (sigmask) {
+		if (sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &ss32);
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+	if (ret == -ERESTARTNOHAND) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_restore_sigmask();
+		}
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timespec __user *, tsp, void __user *, sig)
+{
+	compat_size_t sigsetsize = 0;
+	compat_uptr_t up = 0;
+
+	if (sig) {
+		if (!access_ok(VERIFY_READ, sig,
+				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
+		    	__get_user(up, (compat_uptr_t __user *)sig) ||
+		    	__get_user(sigsetsize,
+				(compat_size_t __user *)(sig+sizeof(up))))
+			return -EFAULT;
+	}
+	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
+				 sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+	unsigned int,  nfds, struct compat_timespec __user *, tsp,
+	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t ksigmask, sigsaved;
+	struct compat_timespec ts;
+	struct timespec end_time, *to = NULL;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
+	}
+
+	if (sigmask) {
+		if (sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &ss32);
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
+	/* We can restart this syscall, usually */
+	if (ret == -EINTR) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+				sizeof(sigsaved));
+			set_restore_sigmask();
+		}
+		ret = -ERESTARTNOHAND;
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+	return ret;
+}
+
+#ifdef CONFIG_FHANDLE
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+			     struct file_handle __user *, handle, int, flags)
+{
+	return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
diff --git a/kernel/fs/compat_binfmt_elf.c b/kernel/fs/compat_binfmt_elf.c
new file mode 100644
index 000000000..4d24d17bc
--- /dev/null
+++ b/kernel/fs/compat_binfmt_elf.c
@@ -0,0 +1,145 @@
+/*
+ * 32-bit compatibility support for ELF format executables and core dumps.
+ *
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ *
+ * This file is used in a 64-bit kernel that wants to support 32-bit ELF.
+ * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros
+ * used below, with definitions appropriate for 32-bit ABI compatibility.
+ *
+ * We use macros to rename the ABI types and machine-dependent
+ * functions used in binfmt_elf.c to compat versions.
+ */
+
+#include <linux/elfcore-compat.h>
+#include <linux/time.h>
+
+/*
+ * Rename the basic ELF layout types to refer to the 32-bit class of files.
+ */
+#undef	ELF_CLASS
+#define ELF_CLASS	ELFCLASS32
+
+#undef	elfhdr
+#undef	elf_phdr
+#undef	elf_shdr
+#undef	elf_note
+#undef	elf_addr_t
+#define elfhdr		elf32_hdr
+#define elf_phdr	elf32_phdr
+#define elf_shdr	elf32_shdr
+#define elf_note	elf32_note
+#define elf_addr_t	Elf32_Addr
+
+/*
+ * Some data types as stored in coredump.
+ */
+#define user_long_t		compat_long_t
+#define user_siginfo_t		compat_siginfo_t
+#define copy_siginfo_to_user	copy_siginfo_to_user32
+
+/*
+ * The machine-dependent core note format types are defined in elfcore-compat.h,
+ * which requires asm/elf.h to define compat_elf_gregset_t et al.
+ */
+#define elf_prstatus	compat_elf_prstatus
+#define elf_prpsinfo	compat_elf_prpsinfo
+
+/*
+ * Compat version of cputime_to_compat_timeval, perhaps this
+ * should be an inline in <linux/compat.h>.
+ */
+static void cputime_to_compat_timeval(const cputime_t cputime,
+				      struct compat_timeval *value)
+{
+	struct timeval tv;
+	cputime_to_timeval(cputime, &tv);
+	value->tv_sec = tv.tv_sec;
+	value->tv_usec = tv.tv_usec;
+}
+
+#undef cputime_to_timeval
+#define cputime_to_timeval cputime_to_compat_timeval
+
+
+/*
+ * To use this file, asm/elf.h must define compat_elf_check_arch.
+ * The other following macros can be defined if the compat versions
+ * differ from the native ones, or omitted when they match.
+ */
+
+#undef	ELF_ARCH
+#undef	elf_check_arch
+#define	elf_check_arch	compat_elf_check_arch
+
+#ifdef	COMPAT_ELF_PLATFORM
+#undef	ELF_PLATFORM
+#define	ELF_PLATFORM		COMPAT_ELF_PLATFORM
+#endif
+
+#ifdef	COMPAT_ELF_HWCAP
+#undef	ELF_HWCAP
+#define	ELF_HWCAP		COMPAT_ELF_HWCAP
+#endif
+
+#ifdef	COMPAT_ELF_HWCAP2
+#undef	ELF_HWCAP2
+#define	ELF_HWCAP2		COMPAT_ELF_HWCAP2
+#endif
+
+#ifdef	COMPAT_ARCH_DLINFO
+#undef	ARCH_DLINFO
+#define	ARCH_DLINFO		COMPAT_ARCH_DLINFO
+#endif
+
+#ifdef	COMPAT_ELF_ET_DYN_BASE
+#undef	ELF_ET_DYN_BASE
+#define	ELF_ET_DYN_BASE		COMPAT_ELF_ET_DYN_BASE
+#endif
+
+#ifdef COMPAT_ELF_EXEC_PAGESIZE
+#undef	ELF_EXEC_PAGESIZE
+#define	ELF_EXEC_PAGESIZE	COMPAT_ELF_EXEC_PAGESIZE
+#endif
+
+#ifdef	COMPAT_ELF_PLAT_INIT
+#undef	ELF_PLAT_INIT
+#define	ELF_PLAT_INIT		COMPAT_ELF_PLAT_INIT
+#endif
+
+#ifdef	COMPAT_SET_PERSONALITY
+#undef	SET_PERSONALITY
+#define	SET_PERSONALITY		COMPAT_SET_PERSONALITY
+#endif
+
+#ifdef	compat_start_thread
+#undef	start_thread
+#define	start_thread		compat_start_thread
+#endif
+
+#ifdef	compat_arch_setup_additional_pages
+#undef	ARCH_HAS_SETUP_ADDITIONAL_PAGES
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+#undef	arch_setup_additional_pages
+#define	arch_setup_additional_pages compat_arch_setup_additional_pages
+#endif
+
+/*
+ * Rename a few of the symbols that binfmt_elf.c will define.
+ * These are all local so the names don't really matter, but it
+ * might make some debugging less confusing not to duplicate them.
+ */
+#define elf_format		compat_elf_format
+#define init_elf_binfmt		init_compat_elf_binfmt
+#define exit_elf_binfmt		exit_compat_elf_binfmt
+
+/*
+ * We share all the actual code with the native (64-bit) version.
+ */
+#include "binfmt_elf.c"
diff --git a/kernel/fs/compat_ioctl.c b/kernel/fs/compat_ioctl.c
new file mode 100644
index 000000000..6b8e2f091
--- /dev/null
+++ b/kernel/fs/compat_ioctl.c
@@ -0,0 +1,1638 @@
+/*
+ * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
+ *
+ * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
+ * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
+ * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
+ * Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * ioctls.
+ */
+
+#include <linux/joystick.h>
+
+#include <linux/types.h>
+#include <linux/compat.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/compiler.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/ioctl.h>
+#include <linux/if.h>
+#include <linux/if_bridge.h>
+#include <linux/raid/md_u.h>
+#include <linux/kd.h>
+#include <linux/route.h>
+#include <linux/in6.h>
+#include <linux/ipv6_route.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/vt.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/ppp_defs.h>
+#include <linux/ppp-ioctl.h>
+#include <linux/if_pppox.h>
+#include <linux/mtio.h>
+#include <linux/auto_fs.h>
+#include <linux/auto_fs4.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>
+#include <linux/fb.h>
+#include <linux/videodev2.h>
+#include <linux/netdevice.h>
+#include <linux/raw.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/rtc.h>
+#include <linux/pci.h>
+#include <linux/serial.h>
+#include <linux/if_tun.h>
+#include <linux/ctype.h>
+#include <linux/syscalls.h>
+#include <linux/i2c.h>
+#include <linux/i2c-dev.h>
+#include <linux/atalk.h>
+#include <linux/gfp.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_sock.h>
+#include <net/bluetooth/rfcomm.h>
+
+#include <linux/capi.h>
+#include <linux/gigaset_dev.h>
+
+#ifdef CONFIG_BLOCK
+#include <linux/cdrom.h>
+#include <linux/fd.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+#include <scsi/sg.h>
+#endif
+
+#include <asm/uaccess.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_bonding.h>
+#include <linux/watchdog.h>
+
+#include <linux/soundcard.h>
+#include <linux/lp.h>
+#include <linux/ppdev.h>
+
+#include <linux/atm.h>
+#include <linux/atmarp.h>
+#include <linux/atmclip.h>
+#include <linux/atmdev.h>
+#include <linux/atmioc.h>
+#include <linux/atmlec.h>
+#include <linux/atmmpc.h>
+#include <linux/atmsvc.h>
+#include <linux/atm_tcp.h>
+#include <linux/sonet.h>
+#include <linux/atm_suni.h>
+
+#include <linux/usb.h>
+#include <linux/usbdevice_fs.h>
+#include <linux/nbd.h>
+#include <linux/random.h>
+#include <linux/filter.h>
+
+#include <linux/hiddev.h>
+
+#define __DVB_CORE__
+#include <linux/dvb/audio.h>
+#include <linux/dvb/dmx.h>
+#include <linux/dvb/frontend.h>
+#include <linux/dvb/video.h>
+
+#include <linux/sort.h>
+
+#ifdef CONFIG_SPARC
+#include <asm/fbio.h>
+#endif
+
+static int w_long(unsigned int fd, unsigned int cmd,
+		compat_ulong_t __user *argp)
+{
+	mm_segment_t old_fs = get_fs();
+	int err;
+	unsigned long val;
+
+	set_fs (KERNEL_DS);
+	err = sys_ioctl(fd, cmd, (unsigned long)&val);
+	set_fs (old_fs);
+	if (!err && put_user(val, argp))
+		return -EFAULT;
+	return err;
+}
+
+struct compat_video_event {
+	int32_t		type;
+	compat_time_t	timestamp;
+	union {
+	        video_size_t size;
+		unsigned int frame_rate;
+	} u;
+};
+
+static int do_video_get_event(unsigned int fd, unsigned int cmd,
+		struct compat_video_event __user *up)
+{
+	struct video_event kevent;
+	mm_segment_t old_fs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
+	set_fs(old_fs);
+
+	if (!err) {
+		err  = put_user(kevent.type, &up->type);
+		err |= put_user(kevent.timestamp, &up->timestamp);
+		err |= put_user(kevent.u.size.w, &up->u.size.w);
+		err |= put_user(kevent.u.size.h, &up->u.size.h);
+		err |= put_user(kevent.u.size.aspect_ratio,
+				&up->u.size.aspect_ratio);
+		if (err)
+			err = -EFAULT;
+	}
+
+	return err;
+}
+
+struct compat_video_still_picture {
+        compat_uptr_t iFrame;
+        int32_t size;
+};
+
+static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
+	struct compat_video_still_picture __user *up)
+{
+	struct video_still_picture __user *up_native;
+	compat_uptr_t fp;
+	int32_t size;
+	int err;
+
+	err  = get_user(fp, &up->iFrame);
+	err |= get_user(size, &up->size);
+	if (err)
+		return -EFAULT;
+
+	up_native =
+		compat_alloc_user_space(sizeof(struct video_still_picture));
+
+	err =  put_user(compat_ptr(fp), &up_native->iFrame);
+	err |= put_user(size, &up_native->size);
+	if (err)
+		return -EFAULT;
+
+	err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+
+	return err;
+}
+
+struct compat_video_spu_palette {
+	int length;
+	compat_uptr_t palette;
+};
+
+static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
+		struct compat_video_spu_palette __user *up)
+{
+	struct video_spu_palette __user *up_native;
+	compat_uptr_t palp;
+	int length, err;
+
+	err  = get_user(palp, &up->palette);
+	err |= get_user(length, &up->length);
+	if (err)
+		return -EFAULT;
+
+	up_native = compat_alloc_user_space(sizeof(struct video_spu_palette));
+	err  = put_user(compat_ptr(palp), &up_native->palette);
+	err |= put_user(length, &up_native->length);
+	if (err)
+		return -EFAULT;
+
+	err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+
+	return err;
+}
+
+#ifdef CONFIG_BLOCK
+typedef struct sg_io_hdr32 {
+	compat_int_t interface_id;	/* [i] 'S' for SCSI generic (required) */
+	compat_int_t dxfer_direction;	/* [i] data transfer direction  */
+	unsigned char cmd_len;		/* [i] SCSI command length ( <= 16 bytes) */
+	unsigned char mx_sb_len;		/* [i] max length to write to sbp */
+	unsigned short iovec_count;	/* [i] 0 implies no scatter gather */
+	compat_uint_t dxfer_len;		/* [i] byte count of data transfer */
+	compat_uint_t dxferp;		/* [i], [*io] points to data transfer memory
+					      or scatter gather list */
+	compat_uptr_t cmdp;		/* [i], [*i] points to command to perform */
+	compat_uptr_t sbp;		/* [i], [*o] points to sense_buffer memory */
+	compat_uint_t timeout;		/* [i] MAX_UINT->no timeout (unit: millisec) */
+	compat_uint_t flags;		/* [i] 0 -> default, see SG_FLAG... */
+	compat_int_t pack_id;		/* [i->o] unused internally (normally) */
+	compat_uptr_t usr_ptr;		/* [i->o] unused internally */
+	unsigned char status;		/* [o] scsi status */
+	unsigned char masked_status;	/* [o] shifted, masked scsi status */
+	unsigned char msg_status;		/* [o] messaging level data (optional) */
+	unsigned char sb_len_wr;		/* [o] byte count actually written to sbp */
+	unsigned short host_status;	/* [o] errors from host adapter */
+	unsigned short driver_status;	/* [o] errors from software driver */
+	compat_int_t resid;		/* [o] dxfer_len - actual_transferred */
+	compat_uint_t duration;		/* [o] time taken by cmd (unit: millisec) */
+	compat_uint_t info;		/* [o] auxiliary information */
+} sg_io_hdr32_t;  /* 64 bytes long (on sparc32) */
+
+typedef struct sg_iovec32 {
+	compat_uint_t iov_base;
+	compat_uint_t iov_len;
+} sg_iovec32_t;
+
+static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count)
+{
+	sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1);
+	sg_iovec32_t __user *iov32 = dxferp;
+	int i;
+
+	for (i = 0; i < iovec_count; i++) {
+		u32 base, len;
+
+		if (get_user(base, &iov32[i].iov_base) ||
+		    get_user(len, &iov32[i].iov_len) ||
+		    put_user(compat_ptr(base), &iov[i].iov_base) ||
+		    put_user(len, &iov[i].iov_len))
+			return -EFAULT;
+	}
+
+	if (put_user(iov, &sgio->dxferp))
+		return -EFAULT;
+	return 0;
+}
+
+static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
+			sg_io_hdr32_t __user *sgio32)
+{
+	sg_io_hdr_t __user *sgio;
+	u16 iovec_count;
+	u32 data;
+	void __user *dxferp;
+	int err;
+	int interface_id;
+
+	if (get_user(interface_id, &sgio32->interface_id))
+		return -EFAULT;
+	if (interface_id != 'S')
+		return sys_ioctl(fd, cmd, (unsigned long)sgio32);
+
+	if (get_user(iovec_count, &sgio32->iovec_count))
+		return -EFAULT;
+
+	{
+		void __user *top = compat_alloc_user_space(0);
+		void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) +
+				       (iovec_count * sizeof(sg_iovec_t)));
+		if (new > top)
+			return -EINVAL;
+
+		sgio = new;
+	}
+
+	/* Ok, now construct.  */
+	if (copy_in_user(&sgio->interface_id, &sgio32->interface_id,
+			 (2 * sizeof(int)) +
+			 (2 * sizeof(unsigned char)) +
+			 (1 * sizeof(unsigned short)) +
+			 (1 * sizeof(unsigned int))))
+		return -EFAULT;
+
+	if (get_user(data, &sgio32->dxferp))
+		return -EFAULT;
+	dxferp = compat_ptr(data);
+	if (iovec_count) {
+		if (sg_build_iovec(sgio, dxferp, iovec_count))
+			return -EFAULT;
+	} else {
+		if (put_user(dxferp, &sgio->dxferp))
+			return -EFAULT;
+	}
+
+	{
+		unsigned char __user *cmdp;
+		unsigned char __user *sbp;
+
+		if (get_user(data, &sgio32->cmdp))
+			return -EFAULT;
+		cmdp = compat_ptr(data);
+
+		if (get_user(data, &sgio32->sbp))
+			return -EFAULT;
+		sbp = compat_ptr(data);
+
+		if (put_user(cmdp, &sgio->cmdp) ||
+		    put_user(sbp, &sgio->sbp))
+			return -EFAULT;
+	}
+
+	if (copy_in_user(&sgio->timeout, &sgio32->timeout,
+			 3 * sizeof(int)))
+		return -EFAULT;
+
+	if (get_user(data, &sgio32->usr_ptr))
+		return -EFAULT;
+	if (put_user(compat_ptr(data), &sgio->usr_ptr))
+		return -EFAULT;
+
+	err = sys_ioctl(fd, cmd, (unsigned long) sgio);
+
+	if (err >= 0) {
+		void __user *datap;
+
+		if (copy_in_user(&sgio32->pack_id, &sgio->pack_id,
+				 sizeof(int)) ||
+		    get_user(datap, &sgio->usr_ptr) ||
+		    put_user((u32)(unsigned long)datap,
+			     &sgio32->usr_ptr) ||
+		    copy_in_user(&sgio32->status, &sgio->status,
+				 (4 * sizeof(unsigned char)) +
+				 (2 * sizeof(unsigned short)) +
+				 (3 * sizeof(int))))
+			err = -EFAULT;
+	}
+
+	return err;
+}
+
+struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
+	char req_state;
+	char orphan;
+	char sg_io_owned;
+	char problem;
+	int pack_id;
+	compat_uptr_t usr_ptr;
+	unsigned int duration;
+	int unused;
+};
+
+static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
+			compat_sg_req_info __user *o)
+{
+	int err, i;
+	sg_req_info_t __user *r;
+	r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
+	err = sys_ioctl(fd,cmd,(unsigned long)r);
+	if (err < 0)
+		return err;
+	for (i = 0; i < SG_MAX_QUEUE; i++) {
+		void __user *ptr;
+		int d;
+
+		if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) ||
+		    get_user(ptr, &r[i].usr_ptr) ||
+		    get_user(d, &r[i].duration) ||
+		    put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) ||
+		    put_user(d, &o[i].duration))
+			return -EFAULT;
+	}
+	return err;
+}
+#endif /* CONFIG_BLOCK */
+
+struct sock_fprog32 {
+	unsigned short	len;
+	compat_caddr_t	filter;
+};
+
+#define PPPIOCSPASS32	_IOW('t', 71, struct sock_fprog32)
+#define PPPIOCSACTIVE32	_IOW('t', 70, struct sock_fprog32)
+
+static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
+			struct sock_fprog32 __user *u_fprog32)
+{
+	struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
+	void __user *fptr64;
+	u32 fptr32;
+	u16 flen;
+
+	if (get_user(flen, &u_fprog32->len) ||
+	    get_user(fptr32, &u_fprog32->filter))
+		return -EFAULT;
+
+	fptr64 = compat_ptr(fptr32);
+
+	if (put_user(flen, &u_fprog64->len) ||
+	    put_user(fptr64, &u_fprog64->filter))
+		return -EFAULT;
+
+	if (cmd == PPPIOCSPASS32)
+		cmd = PPPIOCSPASS;
+	else
+		cmd = PPPIOCSACTIVE;
+
+	return sys_ioctl(fd, cmd, (unsigned long) u_fprog64);
+}
+
+struct ppp_option_data32 {
+	compat_caddr_t	ptr;
+	u32			length;
+	compat_int_t		transmit;
+};
+#define PPPIOCSCOMPRESS32	_IOW('t', 77, struct ppp_option_data32)
+
+struct ppp_idle32 {
+	compat_time_t xmit_idle;
+	compat_time_t recv_idle;
+};
+#define PPPIOCGIDLE32		_IOR('t', 63, struct ppp_idle32)
+
+static int ppp_gidle(unsigned int fd, unsigned int cmd,
+		struct ppp_idle32 __user *idle32)
+{
+	struct ppp_idle __user *idle;
+	__kernel_time_t xmit, recv;
+	int err;
+
+	idle = compat_alloc_user_space(sizeof(*idle));
+
+	err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
+
+	if (!err) {
+		if (get_user(xmit, &idle->xmit_idle) ||
+		    get_user(recv, &idle->recv_idle) ||
+		    put_user(xmit, &idle32->xmit_idle) ||
+		    put_user(recv, &idle32->recv_idle))
+			err = -EFAULT;
+	}
+	return err;
+}
+
+static int ppp_scompress(unsigned int fd, unsigned int cmd,
+	struct ppp_option_data32 __user *odata32)
+{
+	struct ppp_option_data __user *odata;
+	__u32 data;
+	void __user *datap;
+
+	odata = compat_alloc_user_space(sizeof(*odata));
+
+	if (get_user(data, &odata32->ptr))
+		return -EFAULT;
+
+	datap = compat_ptr(data);
+	if (put_user(datap, &odata->ptr))
+		return -EFAULT;
+
+	if (copy_in_user(&odata->length, &odata32->length,
+			 sizeof(__u32) + sizeof(int)))
+		return -EFAULT;
+
+	return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
+}
+
+#ifdef CONFIG_BLOCK
+struct mtget32 {
+	compat_long_t	mt_type;
+	compat_long_t	mt_resid;
+	compat_long_t	mt_dsreg;
+	compat_long_t	mt_gstat;
+	compat_long_t	mt_erreg;
+	compat_daddr_t	mt_fileno;
+	compat_daddr_t	mt_blkno;
+};
+#define MTIOCGET32	_IOR('m', 2, struct mtget32)
+
+struct mtpos32 {
+	compat_long_t	mt_blkno;
+};
+#define MTIOCPOS32	_IOR('m', 3, struct mtpos32)
+
+static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
+{
+	mm_segment_t old_fs = get_fs();
+	struct mtget get;
+	struct mtget32 __user *umget32;
+	struct mtpos pos;
+	struct mtpos32 __user *upos32;
+	unsigned long kcmd;
+	void *karg;
+	int err = 0;
+
+	switch(cmd) {
+	case MTIOCPOS32:
+		kcmd = MTIOCPOS;
+		karg = &pos;
+		break;
+	default:	/* MTIOCGET32 */
+		kcmd = MTIOCGET;
+		karg = &get;
+		break;
+	}
+	set_fs (KERNEL_DS);
+	err = sys_ioctl (fd, kcmd, (unsigned long)karg);
+	set_fs (old_fs);
+	if (err)
+		return err;
+	switch (cmd) {
+	case MTIOCPOS32:
+		upos32 = argp;
+		err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
+		break;
+	case MTIOCGET32:
+		umget32 = argp;
+		err = __put_user(get.mt_type, &umget32->mt_type);
+		err |= __put_user(get.mt_resid, &umget32->mt_resid);
+		err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
+		err |= __put_user(get.mt_gstat, &umget32->mt_gstat);
+		err |= __put_user(get.mt_erreg, &umget32->mt_erreg);
+		err |= __put_user(get.mt_fileno, &umget32->mt_fileno);
+		err |= __put_user(get.mt_blkno, &umget32->mt_blkno);
+		break;
+	}
+	return err ? -EFAULT: 0;
+}
+
+#endif /* CONFIG_BLOCK */
+
+/* Bluetooth ioctls */
+#define HCIUARTSETPROTO		_IOW('U', 200, int)
+#define HCIUARTGETPROTO		_IOR('U', 201, int)
+#define HCIUARTGETDEVICE	_IOR('U', 202, int)
+#define HCIUARTSETFLAGS		_IOW('U', 203, int)
+#define HCIUARTGETFLAGS		_IOR('U', 204, int)
+
+#define BNEPCONNADD	_IOW('B', 200, int)
+#define BNEPCONNDEL	_IOW('B', 201, int)
+#define BNEPGETCONNLIST	_IOR('B', 210, int)
+#define BNEPGETCONNINFO	_IOR('B', 211, int)
+#define BNEPGETSUPPFEAT	_IOR('B', 212, int)
+
+#define CMTPCONNADD	_IOW('C', 200, int)
+#define CMTPCONNDEL	_IOW('C', 201, int)
+#define CMTPGETCONNLIST	_IOR('C', 210, int)
+#define CMTPGETCONNINFO	_IOR('C', 211, int)
+
+#define HIDPCONNADD	_IOW('H', 200, int)
+#define HIDPCONNDEL	_IOW('H', 201, int)
+#define HIDPGETCONNLIST	_IOR('H', 210, int)
+#define HIDPGETCONNINFO	_IOR('H', 211, int)
+
+
+struct serial_struct32 {
+        compat_int_t    type;
+        compat_int_t    line;
+        compat_uint_t   port;
+        compat_int_t    irq;
+        compat_int_t    flags;
+        compat_int_t    xmit_fifo_size;
+        compat_int_t    custom_divisor;
+        compat_int_t    baud_base;
+        unsigned short  close_delay;
+        char    io_type;
+        char    reserved_char[1];
+        compat_int_t    hub6;
+        unsigned short  closing_wait; /* time to wait before closing */
+        unsigned short  closing_wait2; /* no longer used... */
+        compat_uint_t   iomem_base;
+        unsigned short  iomem_reg_shift;
+        unsigned int    port_high;
+     /* compat_ulong_t  iomap_base FIXME */
+        compat_int_t    reserved[1];
+};
+
+static int serial_struct_ioctl(unsigned fd, unsigned cmd,
+			struct serial_struct32 __user *ss32)
+{
+        typedef struct serial_struct32 SS32;
+        int err;
+        struct serial_struct ss;
+        mm_segment_t oldseg = get_fs();
+        __u32 udata;
+	unsigned int base;
+
+        if (cmd == TIOCSSERIAL) {
+                if (!access_ok(VERIFY_READ, ss32, sizeof(SS32)))
+                        return -EFAULT;
+                if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base)))
+			return -EFAULT;
+                if (__get_user(udata, &ss32->iomem_base))
+			return -EFAULT;
+                ss.iomem_base = compat_ptr(udata);
+                if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
+		    __get_user(ss.port_high, &ss32->port_high))
+			return -EFAULT;
+                ss.iomap_base = 0UL;
+        }
+        set_fs(KERNEL_DS);
+                err = sys_ioctl(fd,cmd,(unsigned long)(&ss));
+        set_fs(oldseg);
+        if (cmd == TIOCGSERIAL && err >= 0) {
+                if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32)))
+                        return -EFAULT;
+                if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base)))
+			return -EFAULT;
+		base = (unsigned long)ss.iomem_base  >> 32 ?
+			0xffffffff : (unsigned)(unsigned long)ss.iomem_base;
+		if (__put_user(base, &ss32->iomem_base) ||
+		    __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
+		    __put_user(ss.port_high, &ss32->port_high))
+			return -EFAULT;
+        }
+        return err;
+}
+
+/*
+ * I2C layer ioctls
+ */
+
+struct i2c_msg32 {
+	u16 addr;
+	u16 flags;
+	u16 len;
+	compat_caddr_t buf;
+};
+
+struct i2c_rdwr_ioctl_data32 {
+	compat_caddr_t msgs; /* struct i2c_msg __user *msgs */
+	u32 nmsgs;
+};
+
+struct i2c_smbus_ioctl_data32 {
+	u8 read_write;
+	u8 command;
+	u32 size;
+	compat_caddr_t data; /* union i2c_smbus_data *data */
+};
+
+struct i2c_rdwr_aligned {
+	struct i2c_rdwr_ioctl_data cmd;
+	struct i2c_msg msgs[0];
+};
+
+static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
+			struct i2c_rdwr_ioctl_data32    __user *udata)
+{
+	struct i2c_rdwr_aligned		__user *tdata;
+	struct i2c_msg			__user *tmsgs;
+	struct i2c_msg32		__user *umsgs;
+	compat_caddr_t			datap;
+	u32				nmsgs;
+	int				i;
+
+	if (get_user(nmsgs, &udata->nmsgs))
+		return -EFAULT;
+	if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS)
+		return -EINVAL;
+
+	if (get_user(datap, &udata->msgs))
+		return -EFAULT;
+	umsgs = compat_ptr(datap);
+
+	tdata = compat_alloc_user_space(sizeof(*tdata) +
+				      nmsgs * sizeof(struct i2c_msg));
+	tmsgs = &tdata->msgs[0];
+
+	if (put_user(nmsgs, &tdata->cmd.nmsgs) ||
+	    put_user(tmsgs, &tdata->cmd.msgs))
+		return -EFAULT;
+
+	for (i = 0; i < nmsgs; i++) {
+		if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16)))
+			return -EFAULT;
+		if (get_user(datap, &umsgs[i].buf) ||
+		    put_user(compat_ptr(datap), &tmsgs[i].buf))
+			return -EFAULT;
+	}
+	return sys_ioctl(fd, cmd, (unsigned long)tdata);
+}
+
+static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
+			struct i2c_smbus_ioctl_data32   __user *udata)
+{
+	struct i2c_smbus_ioctl_data	__user *tdata;
+	compat_caddr_t			datap;
+
+	tdata = compat_alloc_user_space(sizeof(*tdata));
+	if (tdata == NULL)
+		return -ENOMEM;
+	if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata)))
+		return -EFAULT;
+
+	if (!access_ok(VERIFY_READ, udata, sizeof(*udata)))
+		return -EFAULT;
+
+	if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8)))
+		return -EFAULT;
+	if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32)))
+		return -EFAULT;
+	if (__get_user(datap, &udata->data) ||
+	    __put_user(compat_ptr(datap), &tdata->data))
+		return -EFAULT;
+
+	return sys_ioctl(fd, cmd, (unsigned long)tdata);
+}
+
+#define RTC_IRQP_READ32		_IOR('p', 0x0b, compat_ulong_t)
+#define RTC_IRQP_SET32		_IOW('p', 0x0c, compat_ulong_t)
+#define RTC_EPOCH_READ32	_IOR('p', 0x0d, compat_ulong_t)
+#define RTC_EPOCH_SET32		_IOW('p', 0x0e, compat_ulong_t)
+
+static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
+{
+	mm_segment_t oldfs = get_fs();
+	compat_ulong_t val32;
+	unsigned long kval;
+	int ret;
+
+	switch (cmd) {
+	case RTC_IRQP_READ32:
+	case RTC_EPOCH_READ32:
+		set_fs(KERNEL_DS);
+		ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ?
+					RTC_IRQP_READ : RTC_EPOCH_READ,
+					(unsigned long)&kval);
+		set_fs(oldfs);
+		if (ret)
+			return ret;
+		val32 = kval;
+		return put_user(val32, (unsigned int __user *)argp);
+	case RTC_IRQP_SET32:
+		return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
+	case RTC_EPOCH_SET32:
+		return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+/* on ia32 l_start is on a 32-bit boundary */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+struct space_resv_32 {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start	__attribute__((packed));
+			/* len == 0 means until end of file */
+	__s64		l_len __attribute__((packed));
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area */
+};
+
+#define FS_IOC_RESVSP_32		_IOW ('X', 40, struct space_resv_32)
+#define FS_IOC_RESVSP64_32	_IOW ('X', 42, struct space_resv_32)
+
+/* just account for different alignment */
+static int compat_ioctl_preallocate(struct file *file,
+			struct space_resv_32    __user *p32)
+{
+	struct space_resv	__user *p = compat_alloc_user_space(sizeof(*p));
+
+	if (copy_in_user(&p->l_type,	&p32->l_type,	sizeof(s16)) ||
+	    copy_in_user(&p->l_whence,	&p32->l_whence, sizeof(s16)) ||
+	    copy_in_user(&p->l_start,	&p32->l_start,	sizeof(s64)) ||
+	    copy_in_user(&p->l_len,	&p32->l_len,	sizeof(s64)) ||
+	    copy_in_user(&p->l_sysid,	&p32->l_sysid,	sizeof(s32)) ||
+	    copy_in_user(&p->l_pid,	&p32->l_pid,	sizeof(u32)) ||
+	    copy_in_user(&p->l_pad,	&p32->l_pad,	4*sizeof(u32)))
+		return -EFAULT;
+
+	return ioctl_preallocate(file, p);
+}
+#endif
+
+/*
+ * simple reversible transform to make our table more evenly
+ * distributed after sorting.
+ */
+#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
+
+#define COMPATIBLE_IOCTL(cmd) XFORM(cmd),
+/* ioctl should not be warned about even if it's not implemented.
+   Valid reasons to use this:
+   - It is implemented with ->compat_ioctl on some device, but programs
+   call it on others too.
+   - The ioctl is not implemented in the native kernel, but programs
+   call it commonly anyways.
+   Most other reasons are not valid. */
+#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
+
+static unsigned int ioctl_pointer[] = {
+/* compatible ioctls first */
+COMPATIBLE_IOCTL(0x4B50)   /* KDGHWCLK - not in the kernel, but don't complain */
+COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain */
+
+/* Big T */
+COMPATIBLE_IOCTL(TCGETA)
+COMPATIBLE_IOCTL(TCSETA)
+COMPATIBLE_IOCTL(TCSETAW)
+COMPATIBLE_IOCTL(TCSETAF)
+COMPATIBLE_IOCTL(TCSBRK)
+COMPATIBLE_IOCTL(TCXONC)
+COMPATIBLE_IOCTL(TCFLSH)
+COMPATIBLE_IOCTL(TCGETS)
+COMPATIBLE_IOCTL(TCSETS)
+COMPATIBLE_IOCTL(TCSETSW)
+COMPATIBLE_IOCTL(TCSETSF)
+COMPATIBLE_IOCTL(TIOCLINUX)
+COMPATIBLE_IOCTL(TIOCSBRK)
+COMPATIBLE_IOCTL(TIOCGDEV)
+COMPATIBLE_IOCTL(TIOCCBRK)
+COMPATIBLE_IOCTL(TIOCGSID)
+COMPATIBLE_IOCTL(TIOCGICOUNT)
+COMPATIBLE_IOCTL(TIOCGPKT)
+COMPATIBLE_IOCTL(TIOCGPTLCK)
+COMPATIBLE_IOCTL(TIOCGEXCL)
+/* Little t */
+COMPATIBLE_IOCTL(TIOCGETD)
+COMPATIBLE_IOCTL(TIOCSETD)
+COMPATIBLE_IOCTL(TIOCEXCL)
+COMPATIBLE_IOCTL(TIOCNXCL)
+COMPATIBLE_IOCTL(TIOCCONS)
+COMPATIBLE_IOCTL(TIOCGSOFTCAR)
+COMPATIBLE_IOCTL(TIOCSSOFTCAR)
+COMPATIBLE_IOCTL(TIOCSWINSZ)
+COMPATIBLE_IOCTL(TIOCGWINSZ)
+COMPATIBLE_IOCTL(TIOCMGET)
+COMPATIBLE_IOCTL(TIOCMBIC)
+COMPATIBLE_IOCTL(TIOCMBIS)
+COMPATIBLE_IOCTL(TIOCMSET)
+COMPATIBLE_IOCTL(TIOCPKT)
+COMPATIBLE_IOCTL(TIOCNOTTY)
+COMPATIBLE_IOCTL(TIOCSTI)
+COMPATIBLE_IOCTL(TIOCOUTQ)
+COMPATIBLE_IOCTL(TIOCSPGRP)
+COMPATIBLE_IOCTL(TIOCGPGRP)
+COMPATIBLE_IOCTL(TIOCGPTN)
+COMPATIBLE_IOCTL(TIOCSPTLCK)
+COMPATIBLE_IOCTL(TIOCSERGETLSR)
+COMPATIBLE_IOCTL(TIOCSIG)
+#ifdef TIOCSRS485
+COMPATIBLE_IOCTL(TIOCSRS485)
+#endif
+#ifdef TIOCGRS485
+COMPATIBLE_IOCTL(TIOCGRS485)
+#endif
+#ifdef TCGETS2
+COMPATIBLE_IOCTL(TCGETS2)
+COMPATIBLE_IOCTL(TCSETS2)
+COMPATIBLE_IOCTL(TCSETSW2)
+COMPATIBLE_IOCTL(TCSETSF2)
+#endif
+/* Little f */
+COMPATIBLE_IOCTL(FIOCLEX)
+COMPATIBLE_IOCTL(FIONCLEX)
+COMPATIBLE_IOCTL(FIOASYNC)
+COMPATIBLE_IOCTL(FIONBIO)
+COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
+COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
+/* 0x00 */
+COMPATIBLE_IOCTL(FIBMAP)
+COMPATIBLE_IOCTL(FIGETBSZ)
+/* 'X' - originally XFS but some now in the VFS */
+COMPATIBLE_IOCTL(FIFREEZE)
+COMPATIBLE_IOCTL(FITHAW)
+COMPATIBLE_IOCTL(KDGETKEYCODE)
+COMPATIBLE_IOCTL(KDSETKEYCODE)
+COMPATIBLE_IOCTL(KDGKBTYPE)
+COMPATIBLE_IOCTL(KDGETMODE)
+COMPATIBLE_IOCTL(KDGKBMODE)
+COMPATIBLE_IOCTL(KDGKBMETA)
+COMPATIBLE_IOCTL(KDGKBENT)
+COMPATIBLE_IOCTL(KDSKBENT)
+COMPATIBLE_IOCTL(KDGKBSENT)
+COMPATIBLE_IOCTL(KDSKBSENT)
+COMPATIBLE_IOCTL(KDGKBDIACR)
+COMPATIBLE_IOCTL(KDSKBDIACR)
+COMPATIBLE_IOCTL(KDGKBDIACRUC)
+COMPATIBLE_IOCTL(KDSKBDIACRUC)
+COMPATIBLE_IOCTL(KDKBDREP)
+COMPATIBLE_IOCTL(KDGKBLED)
+COMPATIBLE_IOCTL(KDGETLED)
+#ifdef CONFIG_BLOCK
+/* Big S */
+COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
+COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
+COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK)
+COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY)
+COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
+COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
+COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
+COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
+#endif
+/* Big V (don't complain on serial console) */
+IGNORE_IOCTL(VT_OPENQRY)
+IGNORE_IOCTL(VT_GETMODE)
+/* Little p (/dev/rtc, /dev/envctrl, etc.) */
+COMPATIBLE_IOCTL(RTC_AIE_ON)
+COMPATIBLE_IOCTL(RTC_AIE_OFF)
+COMPATIBLE_IOCTL(RTC_UIE_ON)
+COMPATIBLE_IOCTL(RTC_UIE_OFF)
+COMPATIBLE_IOCTL(RTC_PIE_ON)
+COMPATIBLE_IOCTL(RTC_PIE_OFF)
+COMPATIBLE_IOCTL(RTC_WIE_ON)
+COMPATIBLE_IOCTL(RTC_WIE_OFF)
+COMPATIBLE_IOCTL(RTC_ALM_SET)
+COMPATIBLE_IOCTL(RTC_ALM_READ)
+COMPATIBLE_IOCTL(RTC_RD_TIME)
+COMPATIBLE_IOCTL(RTC_SET_TIME)
+COMPATIBLE_IOCTL(RTC_WKALM_SET)
+COMPATIBLE_IOCTL(RTC_WKALM_RD)
+/*
+ * These two are only for the sbus rtc driver, but
+ * hwclock tries them on every rtc device first when
+ * running on sparc.  On other architectures the entries
+ * are useless but harmless.
+ */
+COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */
+COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
+/* Little m */
+COMPATIBLE_IOCTL(MTIOCTOP)
+/* Socket level stuff */
+COMPATIBLE_IOCTL(FIOQSIZE)
+#ifdef CONFIG_BLOCK
+/* md calls this on random blockdevs */
+IGNORE_IOCTL(RAID_VERSION)
+/* qemu/qemu-img might call these two on plain files for probing */
+IGNORE_IOCTL(CDROM_DRIVE_STATUS)
+IGNORE_IOCTL(FDGETPRM32)
+/* SG stuff */
+COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
+COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
+COMPATIBLE_IOCTL(SG_EMULATED_HOST)
+COMPATIBLE_IOCTL(SG_GET_TRANSFORM)
+COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE)
+COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE)
+COMPATIBLE_IOCTL(SG_GET_SCSI_ID)
+COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA)
+COMPATIBLE_IOCTL(SG_GET_LOW_DMA)
+COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID)
+COMPATIBLE_IOCTL(SG_GET_PACK_ID)
+COMPATIBLE_IOCTL(SG_GET_NUM_WAITING)
+COMPATIBLE_IOCTL(SG_SET_DEBUG)
+COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE)
+COMPATIBLE_IOCTL(SG_GET_COMMAND_Q)
+COMPATIBLE_IOCTL(SG_SET_COMMAND_Q)
+COMPATIBLE_IOCTL(SG_GET_VERSION_NUM)
+COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN)
+COMPATIBLE_IOCTL(SG_SCSI_RESET)
+COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
+COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
+COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
+#endif
+/* PPP stuff */
+COMPATIBLE_IOCTL(PPPIOCGFLAGS)
+COMPATIBLE_IOCTL(PPPIOCSFLAGS)
+COMPATIBLE_IOCTL(PPPIOCGASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCSASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCGUNIT)
+COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCGMRU)
+COMPATIBLE_IOCTL(PPPIOCSMRU)
+COMPATIBLE_IOCTL(PPPIOCSMAXCID)
+COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCXFERUNIT)
+/* PPPIOCSCOMPRESS is translated */
+COMPATIBLE_IOCTL(PPPIOCGNPMODE)
+COMPATIBLE_IOCTL(PPPIOCSNPMODE)
+COMPATIBLE_IOCTL(PPPIOCGDEBUG)
+COMPATIBLE_IOCTL(PPPIOCSDEBUG)
+/* PPPIOCSPASS is translated */
+/* PPPIOCSACTIVE is translated */
+/* PPPIOCGIDLE is translated */
+COMPATIBLE_IOCTL(PPPIOCNEWUNIT)
+COMPATIBLE_IOCTL(PPPIOCATTACH)
+COMPATIBLE_IOCTL(PPPIOCDETACH)
+COMPATIBLE_IOCTL(PPPIOCSMRRU)
+COMPATIBLE_IOCTL(PPPIOCCONNECT)
+COMPATIBLE_IOCTL(PPPIOCDISCONN)
+COMPATIBLE_IOCTL(PPPIOCATTCHAN)
+COMPATIBLE_IOCTL(PPPIOCGCHAN)
+COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
+/* PPPOX */
+COMPATIBLE_IOCTL(PPPOEIOCSFWD)
+COMPATIBLE_IOCTL(PPPOEIOCDFWD)
+/* ppdev */
+COMPATIBLE_IOCTL(PPSETMODE)
+COMPATIBLE_IOCTL(PPRSTATUS)
+COMPATIBLE_IOCTL(PPRCONTROL)
+COMPATIBLE_IOCTL(PPWCONTROL)
+COMPATIBLE_IOCTL(PPFCONTROL)
+COMPATIBLE_IOCTL(PPRDATA)
+COMPATIBLE_IOCTL(PPWDATA)
+COMPATIBLE_IOCTL(PPCLAIM)
+COMPATIBLE_IOCTL(PPRELEASE)
+COMPATIBLE_IOCTL(PPYIELD)
+COMPATIBLE_IOCTL(PPEXCL)
+COMPATIBLE_IOCTL(PPDATADIR)
+COMPATIBLE_IOCTL(PPNEGOT)
+COMPATIBLE_IOCTL(PPWCTLONIRQ)
+COMPATIBLE_IOCTL(PPCLRIRQ)
+COMPATIBLE_IOCTL(PPSETPHASE)
+COMPATIBLE_IOCTL(PPGETMODES)
+COMPATIBLE_IOCTL(PPGETMODE)
+COMPATIBLE_IOCTL(PPGETPHASE)
+COMPATIBLE_IOCTL(PPGETFLAGS)
+COMPATIBLE_IOCTL(PPSETFLAGS)
+/* Big A */
+/* sparc only */
+/* Big Q for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE)
+COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS)
+COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL)
+COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE)
+/* Big T for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE)
+COMPATIBLE_IOCTL(SNDCTL_TMR_START)
+COMPATIBLE_IOCTL(SNDCTL_TMR_STOP)
+COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE)
+COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO)
+COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE)
+COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME)
+COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT)
+/* Little m for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME)
+COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE)
+COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD)
+/* Big P for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_DSP_RESET)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED)
+COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE)
+COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS)
+COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER)
+COMPATIBLE_IOCTL(SNDCTL_DSP_POST)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE)
+COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR)
+/* SNDCTL_DSP_MAPINBUF,  XXX needs translation */
+/* SNDCTL_DSP_MAPOUTBUF,  XXX needs translation */
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY)
+COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE)
+COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE)
+COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS)
+COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS)
+COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER)
+/* Big C for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_COPR_RESET)
+COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD)
+COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA)
+COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE)
+COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA)
+COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE)
+COMPATIBLE_IOCTL(SNDCTL_COPR_RUN)
+COMPATIBLE_IOCTL(SNDCTL_COPR_HALT)
+COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG)
+COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG)
+/* Big M for sound/OSS */
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3)
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR))
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE)
+/* SOUND_MIXER_READ_ENHANCE,  same value as READ_MUTE */
+/* SOUND_MIXER_READ_LOUD,  same value as READ_MUTE */
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3)
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR))
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE)
+/* SOUND_MIXER_WRITE_ENHANCE,  same value as WRITE_MUTE */
+/* SOUND_MIXER_WRITE_LOUD,  same value as WRITE_MUTE */
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC)
+COMPATIBLE_IOCTL(SOUND_MIXER_INFO)
+COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO)
+COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS)
+COMPATIBLE_IOCTL(SOUND_MIXER_AGC)
+COMPATIBLE_IOCTL(SOUND_MIXER_3DSE)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
+COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
+COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
+COMPATIBLE_IOCTL(OSS_GETVERSION)
+/* Raw devices */
+COMPATIBLE_IOCTL(RAW_SETBIND)
+COMPATIBLE_IOCTL(RAW_GETBIND)
+/* Watchdog */
+COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
+COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
+COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS)
+COMPATIBLE_IOCTL(WDIOC_GETTEMP)
+COMPATIBLE_IOCTL(WDIOC_SETOPTIONS)
+COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
+COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
+COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
+/* Big R */
+COMPATIBLE_IOCTL(RNDGETENTCNT)
+COMPATIBLE_IOCTL(RNDADDTOENTCNT)
+COMPATIBLE_IOCTL(RNDGETPOOL)
+COMPATIBLE_IOCTL(RNDADDENTROPY)
+COMPATIBLE_IOCTL(RNDZAPENTCNT)
+COMPATIBLE_IOCTL(RNDCLEARPOOL)
+/* Bluetooth */
+COMPATIBLE_IOCTL(HCIDEVUP)
+COMPATIBLE_IOCTL(HCIDEVDOWN)
+COMPATIBLE_IOCTL(HCIDEVRESET)
+COMPATIBLE_IOCTL(HCIDEVRESTAT)
+COMPATIBLE_IOCTL(HCIGETDEVLIST)
+COMPATIBLE_IOCTL(HCIGETDEVINFO)
+COMPATIBLE_IOCTL(HCIGETCONNLIST)
+COMPATIBLE_IOCTL(HCIGETCONNINFO)
+COMPATIBLE_IOCTL(HCIGETAUTHINFO)
+COMPATIBLE_IOCTL(HCISETRAW)
+COMPATIBLE_IOCTL(HCISETSCAN)
+COMPATIBLE_IOCTL(HCISETAUTH)
+COMPATIBLE_IOCTL(HCISETENCRYPT)
+COMPATIBLE_IOCTL(HCISETPTYPE)
+COMPATIBLE_IOCTL(HCISETLINKPOL)
+COMPATIBLE_IOCTL(HCISETLINKMODE)
+COMPATIBLE_IOCTL(HCISETACLMTU)
+COMPATIBLE_IOCTL(HCISETSCOMTU)
+COMPATIBLE_IOCTL(HCIBLOCKADDR)
+COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
+COMPATIBLE_IOCTL(HCIINQUIRY)
+COMPATIBLE_IOCTL(HCIUARTSETPROTO)
+COMPATIBLE_IOCTL(HCIUARTGETPROTO)
+COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
+COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
+COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
+COMPATIBLE_IOCTL(RFCOMMGETDEVINFO)
+COMPATIBLE_IOCTL(RFCOMMSTEALDLC)
+COMPATIBLE_IOCTL(BNEPCONNADD)
+COMPATIBLE_IOCTL(BNEPCONNDEL)
+COMPATIBLE_IOCTL(BNEPGETCONNLIST)
+COMPATIBLE_IOCTL(BNEPGETCONNINFO)
+COMPATIBLE_IOCTL(BNEPGETSUPPFEAT)
+COMPATIBLE_IOCTL(CMTPCONNADD)
+COMPATIBLE_IOCTL(CMTPCONNDEL)
+COMPATIBLE_IOCTL(CMTPGETCONNLIST)
+COMPATIBLE_IOCTL(CMTPGETCONNINFO)
+COMPATIBLE_IOCTL(HIDPCONNADD)
+COMPATIBLE_IOCTL(HIDPCONNDEL)
+COMPATIBLE_IOCTL(HIDPGETCONNLIST)
+COMPATIBLE_IOCTL(HIDPGETCONNINFO)
+/* CAPI */
+COMPATIBLE_IOCTL(CAPI_REGISTER)
+COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER)
+COMPATIBLE_IOCTL(CAPI_GET_VERSION)
+COMPATIBLE_IOCTL(CAPI_GET_SERIAL)
+COMPATIBLE_IOCTL(CAPI_GET_PROFILE)
+COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD)
+COMPATIBLE_IOCTL(CAPI_GET_ERRCODE)
+COMPATIBLE_IOCTL(CAPI_INSTALLED)
+COMPATIBLE_IOCTL(CAPI_GET_FLAGS)
+COMPATIBLE_IOCTL(CAPI_SET_FLAGS)
+COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
+COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
+COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
+/* Siemens Gigaset */
+COMPATIBLE_IOCTL(GIGASET_REDIR)
+COMPATIBLE_IOCTL(GIGASET_CONFIG)
+COMPATIBLE_IOCTL(GIGASET_BRKCHARS)
+COMPATIBLE_IOCTL(GIGASET_VERSION)
+/* Misc. */
+COMPATIBLE_IOCTL(0x41545900)		/* ATYIO_CLKR */
+COMPATIBLE_IOCTL(0x41545901)		/* ATYIO_CLKW */
+COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
+COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
+COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
+COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
+/* NBD */
+COMPATIBLE_IOCTL(NBD_DO_IT)
+COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
+COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
+COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
+COMPATIBLE_IOCTL(NBD_DISCONNECT)
+/* i2c */
+COMPATIBLE_IOCTL(I2C_SLAVE)
+COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
+COMPATIBLE_IOCTL(I2C_TENBIT)
+COMPATIBLE_IOCTL(I2C_PEC)
+COMPATIBLE_IOCTL(I2C_RETRIES)
+COMPATIBLE_IOCTL(I2C_TIMEOUT)
+/* hiddev */
+COMPATIBLE_IOCTL(HIDIOCGVERSION)
+COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
+COMPATIBLE_IOCTL(HIDIOCGDEVINFO)
+COMPATIBLE_IOCTL(HIDIOCGSTRING)
+COMPATIBLE_IOCTL(HIDIOCINITREPORT)
+COMPATIBLE_IOCTL(HIDIOCGREPORT)
+COMPATIBLE_IOCTL(HIDIOCSREPORT)
+COMPATIBLE_IOCTL(HIDIOCGREPORTINFO)
+COMPATIBLE_IOCTL(HIDIOCGFIELDINFO)
+COMPATIBLE_IOCTL(HIDIOCGUSAGE)
+COMPATIBLE_IOCTL(HIDIOCSUSAGE)
+COMPATIBLE_IOCTL(HIDIOCGUCODE)
+COMPATIBLE_IOCTL(HIDIOCGFLAG)
+COMPATIBLE_IOCTL(HIDIOCSFLAG)
+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
+/* dvb */
+COMPATIBLE_IOCTL(AUDIO_STOP)
+COMPATIBLE_IOCTL(AUDIO_PLAY)
+COMPATIBLE_IOCTL(AUDIO_PAUSE)
+COMPATIBLE_IOCTL(AUDIO_CONTINUE)
+COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE)
+COMPATIBLE_IOCTL(AUDIO_SET_MUTE)
+COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC)
+COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE)
+COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT)
+COMPATIBLE_IOCTL(AUDIO_GET_STATUS)
+COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES)
+COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER)
+COMPATIBLE_IOCTL(AUDIO_SET_ID)
+COMPATIBLE_IOCTL(AUDIO_SET_MIXER)
+COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE)
+COMPATIBLE_IOCTL(AUDIO_SET_EXT_ID)
+COMPATIBLE_IOCTL(AUDIO_SET_ATTRIBUTES)
+COMPATIBLE_IOCTL(AUDIO_SET_KARAOKE)
+COMPATIBLE_IOCTL(DMX_START)
+COMPATIBLE_IOCTL(DMX_STOP)
+COMPATIBLE_IOCTL(DMX_SET_FILTER)
+COMPATIBLE_IOCTL(DMX_SET_PES_FILTER)
+COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE)
+COMPATIBLE_IOCTL(DMX_GET_PES_PIDS)
+COMPATIBLE_IOCTL(DMX_GET_CAPS)
+COMPATIBLE_IOCTL(DMX_SET_SOURCE)
+COMPATIBLE_IOCTL(DMX_GET_STC)
+COMPATIBLE_IOCTL(FE_GET_INFO)
+COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD)
+COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD)
+COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY)
+COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST)
+COMPATIBLE_IOCTL(FE_SET_TONE)
+COMPATIBLE_IOCTL(FE_SET_VOLTAGE)
+COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE)
+COMPATIBLE_IOCTL(FE_READ_STATUS)
+COMPATIBLE_IOCTL(FE_READ_BER)
+COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH)
+COMPATIBLE_IOCTL(FE_READ_SNR)
+COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS)
+COMPATIBLE_IOCTL(FE_SET_FRONTEND)
+COMPATIBLE_IOCTL(FE_GET_FRONTEND)
+COMPATIBLE_IOCTL(FE_GET_EVENT)
+COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD)
+COMPATIBLE_IOCTL(VIDEO_STOP)
+COMPATIBLE_IOCTL(VIDEO_PLAY)
+COMPATIBLE_IOCTL(VIDEO_FREEZE)
+COMPATIBLE_IOCTL(VIDEO_CONTINUE)
+COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE)
+COMPATIBLE_IOCTL(VIDEO_SET_BLANK)
+COMPATIBLE_IOCTL(VIDEO_GET_STATUS)
+COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT)
+COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD)
+COMPATIBLE_IOCTL(VIDEO_SLOWMOTION)
+COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES)
+COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER)
+COMPATIBLE_IOCTL(VIDEO_SET_ID)
+COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE)
+COMPATIBLE_IOCTL(VIDEO_SET_FORMAT)
+COMPATIBLE_IOCTL(VIDEO_SET_SYSTEM)
+COMPATIBLE_IOCTL(VIDEO_SET_HIGHLIGHT)
+COMPATIBLE_IOCTL(VIDEO_SET_SPU)
+COMPATIBLE_IOCTL(VIDEO_GET_NAVI)
+COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES)
+COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
+COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE)
+
+/* joystick */
+COMPATIBLE_IOCTL(JSIOCGVERSION)
+COMPATIBLE_IOCTL(JSIOCGAXES)
+COMPATIBLE_IOCTL(JSIOCGBUTTONS)
+COMPATIBLE_IOCTL(JSIOCGNAME(0))
+
+#ifdef TIOCGLTC
+COMPATIBLE_IOCTL(TIOCGLTC)
+COMPATIBLE_IOCTL(TIOCSLTC)
+#endif
+#ifdef TIOCSTART
+/*
+ * For these two we have definitions in ioctls.h and/or termios.h on
+ * some architectures but no actual implemention.  Some applications
+ * like bash call them if they are defined in the headers, so we provide
+ * entries here to avoid syslog message spew.
+ */
+COMPATIBLE_IOCTL(TIOCSTART)
+COMPATIBLE_IOCTL(TIOCSTOP)
+#endif
+
+/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
+   but we don't want warnings on other file systems. So declare
+   them as compatible here. */
+#define VFAT_IOCTL_READDIR_BOTH32       _IOR('r', 1, struct compat_dirent[2])
+#define VFAT_IOCTL_READDIR_SHORT32      _IOR('r', 2, struct compat_dirent[2])
+
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
+
+#ifdef CONFIG_SPARC
+/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
+IGNORE_IOCTL(FBIOGTYPE)
+IGNORE_IOCTL(FBIOSATTR)
+IGNORE_IOCTL(FBIOGATTR)
+IGNORE_IOCTL(FBIOSVIDEO)
+IGNORE_IOCTL(FBIOGVIDEO)
+IGNORE_IOCTL(FBIOSCURPOS)
+IGNORE_IOCTL(FBIOGCURPOS)
+IGNORE_IOCTL(FBIOGCURMAX)
+IGNORE_IOCTL(FBIOPUTCMAP32)
+IGNORE_IOCTL(FBIOGETCMAP32)
+IGNORE_IOCTL(FBIOSCURSOR32)
+IGNORE_IOCTL(FBIOGCURSOR32)
+#endif
+};
+
+/*
+ * Convert common ioctl arguments based on their command number
+ *
+ * Please do not add any code in here. Instead, implement
+ * a compat_ioctl operation in the place that handleѕ the
+ * ioctl for the native case.
+ */
+static long do_ioctl_trans(int fd, unsigned int cmd,
+		 unsigned long arg, struct file *file)
+{
+	void __user *argp = compat_ptr(arg);
+
+	switch (cmd) {
+	case PPPIOCGIDLE32:
+		return ppp_gidle(fd, cmd, argp);
+	case PPPIOCSCOMPRESS32:
+		return ppp_scompress(fd, cmd, argp);
+	case PPPIOCSPASS32:
+	case PPPIOCSACTIVE32:
+		return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
+#ifdef CONFIG_BLOCK
+	case SG_IO:
+		return sg_ioctl_trans(fd, cmd, argp);
+	case SG_GET_REQUEST_TABLE:
+		return sg_grt_trans(fd, cmd, argp);
+	case MTIOCGET32:
+	case MTIOCPOS32:
+		return mt_ioctl_trans(fd, cmd, argp);
+#endif
+	/* Serial */
+	case TIOCGSERIAL:
+	case TIOCSSERIAL:
+		return serial_struct_ioctl(fd, cmd, argp);
+	/* i2c */
+	case I2C_FUNCS:
+		return w_long(fd, cmd, argp);
+	case I2C_RDWR:
+		return do_i2c_rdwr_ioctl(fd, cmd, argp);
+	case I2C_SMBUS:
+		return do_i2c_smbus_ioctl(fd, cmd, argp);
+	/* Not implemented in the native kernel */
+	case RTC_IRQP_READ32:
+	case RTC_IRQP_SET32:
+	case RTC_EPOCH_READ32:
+	case RTC_EPOCH_SET32:
+		return rtc_ioctl(fd, cmd, argp);
+
+	/* dvb */
+	case VIDEO_GET_EVENT:
+		return do_video_get_event(fd, cmd, argp);
+	case VIDEO_STILLPICTURE:
+		return do_video_stillpicture(fd, cmd, argp);
+	case VIDEO_SET_SPU_PALETTE:
+		return do_video_set_spu_palette(fd, cmd, argp);
+	}
+
+	/*
+	 * These take an integer instead of a pointer as 'arg',
+	 * so we must not do a compat_ptr() translation.
+	 */
+	switch (cmd) {
+	/* Big T */
+	case TCSBRKP:
+	case TIOCMIWAIT:
+	case TIOCSCTTY:
+	/* RAID */
+	case HOT_REMOVE_DISK:
+	case HOT_ADD_DISK:
+	case SET_DISK_FAULTY:
+	case SET_BITMAP_FILE:
+	/* Big K */
+	case KDSIGACCEPT:
+	case KIOCSOUND:
+	case KDMKTONE:
+	case KDSETMODE:
+	case KDSKBMODE:
+	case KDSKBMETA:
+	case KDSKBLED:
+	case KDSETLED:
+	/* NBD */
+	case NBD_SET_SOCK:
+	case NBD_SET_BLKSIZE:
+	case NBD_SET_SIZE:
+	case NBD_SET_SIZE_BLOCKS:
+		return do_vfs_ioctl(file, fd, cmd, arg);
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+static int compat_ioctl_check_table(unsigned int xcmd)
+{
+	int i;
+	const int max = ARRAY_SIZE(ioctl_pointer) - 1;
+
+	BUILD_BUG_ON(max >= (1 << 16));
+
+	/* guess initial offset into table, assuming a
+	   normalized distribution */
+	i = ((xcmd >> 16) * max) >> 16;
+
+	/* do linear search up first, until greater or equal */
+	while (ioctl_pointer[i] < xcmd && i < max)
+		i++;
+
+	/* then do linear search down */
+	while (ioctl_pointer[i] > xcmd && i > 0)
+		i--;
+
+	return ioctl_pointer[i] == xcmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg32)
+{
+	unsigned long arg = arg32;
+	struct fd f = fdget(fd);
+	int error = -EBADF;
+	if (!f.file)
+		goto out;
+
+	/* RED-PEN how should LSM module know it's handling 32bit? */
+	error = security_file_ioctl(f.file, cmd, arg);
+	if (error)
+		goto out_fput;
+
+	/*
+	 * To allow the compat_ioctl handlers to be self contained
+	 * we need to check the common ioctls here first.
+	 * Just handle them with the standard handlers below.
+	 */
+	switch (cmd) {
+	case FIOCLEX:
+	case FIONCLEX:
+	case FIONBIO:
+	case FIOASYNC:
+	case FIOQSIZE:
+		break;
+
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+	case FS_IOC_RESVSP_32:
+	case FS_IOC_RESVSP64_32:
+		error = compat_ioctl_preallocate(f.file, compat_ptr(arg));
+		goto out_fput;
+#else
+	case FS_IOC_RESVSP:
+	case FS_IOC_RESVSP64:
+		error = ioctl_preallocate(f.file, compat_ptr(arg));
+		goto out_fput;
+#endif
+
+	case FIBMAP:
+	case FIGETBSZ:
+	case FIONREAD:
+		if (S_ISREG(file_inode(f.file)->i_mode))
+			break;
+		/*FALL THROUGH*/
+
+	default:
+		if (f.file->f_op->compat_ioctl) {
+			error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
+			if (error != -ENOIOCTLCMD)
+				goto out_fput;
+		}
+
+		if (!f.file->f_op->unlocked_ioctl)
+			goto do_ioctl;
+		break;
+	}
+
+	if (compat_ioctl_check_table(XFORM(cmd)))
+		goto found_handler;
+
+	error = do_ioctl_trans(fd, cmd, arg, f.file);
+	if (error == -ENOIOCTLCMD)
+		error = -ENOTTY;
+
+	goto out_fput;
+
+ found_handler:
+	arg = (unsigned long)compat_ptr(arg);
+ do_ioctl:
+	error = do_vfs_ioctl(f.file, fd, cmd, arg);
+ out_fput:
+	fdput(f);
+ out:
+	return error;
+}
+
+static int __init init_sys32_ioctl_cmp(const void *p, const void *q)
+{
+	unsigned int a, b;
+	a = *(unsigned int *)p;
+	b = *(unsigned int *)q;
+	if (a > b)
+		return 1;
+	if (a < b)
+		return -1;
+	return 0;
+}
+
+static int __init init_sys32_ioctl(void)
+{
+	sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer),
+		init_sys32_ioctl_cmp, NULL);
+	return 0;
+}
+__initcall(init_sys32_ioctl);
diff --git a/kernel/fs/configfs/Kconfig b/kernel/fs/configfs/Kconfig
new file mode 100644
index 000000000..9febcdefd
--- /dev/null
+++ b/kernel/fs/configfs/Kconfig
@@ -0,0 +1,11 @@
+config CONFIGFS_FS
+	tristate "Userspace-driven configuration filesystem"
+	select SYSFS
+	help
+	  configfs is a RAM-based filesystem that provides the converse
+	  of sysfs's functionality. Where sysfs is a filesystem-based
+	  view of kernel objects, configfs is a filesystem-based manager
+	  of kernel objects, or config_items.
+
+	  Both sysfs and configfs can and should exist together on the
+	  same system. One is not a replacement for the other.
diff --git a/kernel/fs/configfs/Makefile b/kernel/fs/configfs/Makefile
new file mode 100644
index 000000000..00ffb278e
--- /dev/null
+++ b/kernel/fs/configfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the configfs virtual filesystem
+#
+
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs.o
+
+configfs-objs	:= inode.o file.o dir.o symlink.o mount.o item.o
diff --git a/kernel/fs/configfs/configfs_internal.h b/kernel/fs/configfs/configfs_internal.h
new file mode 100644
index 000000000..b65d1ef53
--- /dev/null
+++ b/kernel/fs/configfs/configfs_internal.h
@@ -0,0 +1,163 @@
+/* -*- mode: c; c-basic-offset:8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs_internal.h - Internal stuff for configfs
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct configfs_dirent {
+	atomic_t		s_count;
+	int			s_dependent_count;
+	struct list_head	s_sibling;
+	struct list_head	s_children;
+	struct list_head	s_links;
+	void			* s_element;
+	int			s_type;
+	umode_t			s_mode;
+	struct dentry		* s_dentry;
+	struct iattr		* s_iattr;
+#ifdef CONFIG_LOCKDEP
+	int			s_depth;
+#endif
+};
+
+#define CONFIGFS_ROOT		0x0001
+#define CONFIGFS_DIR		0x0002
+#define CONFIGFS_ITEM_ATTR	0x0004
+#define CONFIGFS_ITEM_LINK	0x0020
+#define CONFIGFS_USET_DIR	0x0040
+#define CONFIGFS_USET_DEFAULT	0x0080
+#define CONFIGFS_USET_DROPPING	0x0100
+#define CONFIGFS_USET_IN_MKDIR	0x0200
+#define CONFIGFS_USET_CREATING	0x0400
+#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
+
+extern struct mutex configfs_symlink_mutex;
+extern spinlock_t configfs_dirent_lock;
+
+extern struct kmem_cache *configfs_dir_cachep;
+
+extern int configfs_is_root(struct config_item *item);
+
+extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
+extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *));
+
+extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_make_dirent(struct configfs_dirent *,
+				struct dentry *, void *, umode_t, int);
+extern int configfs_dirent_is_ready(struct configfs_dirent *);
+
+extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
+
+extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
+extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
+extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);
+
+extern struct dentry *configfs_pin_fs(void);
+extern void configfs_release_fs(void);
+
+extern struct rw_semaphore configfs_rename_sem;
+extern const struct file_operations configfs_dir_operations;
+extern const struct file_operations configfs_file_operations;
+extern const struct file_operations bin_fops;
+extern const struct inode_operations configfs_dir_inode_operations;
+extern const struct inode_operations configfs_root_inode_operations;
+extern const struct inode_operations configfs_symlink_inode_operations;
+extern const struct dentry_operations configfs_dentry_ops;
+
+extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *symname);
+extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
+
+struct configfs_symlink {
+	struct list_head sl_list;
+	struct config_item *sl_target;
+};
+
+extern int configfs_create_link(struct configfs_symlink *sl,
+				struct dentry *parent,
+				struct dentry *dentry);
+
+static inline struct config_item * to_item(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct config_item *) sd->s_element);
+}
+
+static inline struct configfs_attribute * to_attr(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct configfs_attribute *) sd->s_element);
+}
+
+static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
+{
+	struct config_item * item = NULL;
+
+	spin_lock(&dentry->d_lock);
+	if (!d_unhashed(dentry)) {
+		struct configfs_dirent * sd = dentry->d_fsdata;
+		if (sd->s_type & CONFIGFS_ITEM_LINK) {
+			struct configfs_symlink * sl = sd->s_element;
+			item = config_item_get(sl->sl_target);
+		} else
+			item = config_item_get(sd->s_element);
+	}
+	spin_unlock(&dentry->d_lock);
+
+	return item;
+}
+
+static inline void release_configfs_dirent(struct configfs_dirent * sd)
+{
+	if (!(sd->s_type & CONFIGFS_ROOT)) {
+		kfree(sd->s_iattr);
+		kmem_cache_free(configfs_dir_cachep, sd);
+	}
+}
+
+static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
+{
+	if (sd) {
+		WARN_ON(!atomic_read(&sd->s_count));
+		atomic_inc(&sd->s_count);
+	}
+	return sd;
+}
+
+static inline void configfs_put(struct configfs_dirent * sd)
+{
+	WARN_ON(!atomic_read(&sd->s_count));
+	if (atomic_dec_and_test(&sd->s_count))
+		release_configfs_dirent(sd);
+}
+
diff --git a/kernel/fs/configfs/dir.c b/kernel/fs/configfs/dir.c
new file mode 100644
index 000000000..c81ce7f20
--- /dev/null
+++ b/kernel/fs/configfs/dir.c
@@ -0,0 +1,1724 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c - Operations for configfs directories.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#undef DEBUG
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+DECLARE_RWSEM(configfs_rename_sem);
+/*
+ * Protects mutations of configfs_dirent linkage together with proper i_mutex
+ * Also protects mutations of symlinks linkage to target configfs_dirent
+ * Mutators of configfs_dirent linkage must *both* have the proper inode locked
+ * and configfs_dirent_lock locked, in that order.
+ * This allows one to safely traverse configfs_dirent trees and symlinks without
+ * having to lock inodes.
+ *
+ * Protects setting of CONFIGFS_USET_DROPPING: checking the flag
+ * unlocked is not reliable unless in detach_groups() called from
+ * rmdir()/unregister() and from configfs_attach_group()
+ */
+DEFINE_SPINLOCK(configfs_dirent_lock);
+
+static void configfs_d_iput(struct dentry * dentry,
+			    struct inode * inode)
+{
+	struct configfs_dirent *sd = dentry->d_fsdata;
+
+	if (sd) {
+		/* Coordinate with configfs_readdir */
+		spin_lock(&configfs_dirent_lock);
+		/* Coordinate with configfs_attach_attr where will increase
+		 * sd->s_count and update sd->s_dentry to new allocated one.
+		 * Only set sd->dentry to null when this dentry is the only
+		 * sd owner.
+		 * If not do so, configfs_d_iput may run just after
+		 * configfs_attach_attr and set sd->s_dentry to null
+		 * even it's still in use.
+		 */
+		if (atomic_read(&sd->s_count) <= 2)
+			sd->s_dentry = NULL;
+
+		spin_unlock(&configfs_dirent_lock);
+		configfs_put(sd);
+	}
+	iput(inode);
+}
+
+const struct dentry_operations configfs_dentry_ops = {
+	.d_iput		= configfs_d_iput,
+	.d_delete	= always_delete_dentry,
+};
+
+#ifdef CONFIG_LOCKDEP
+
+/*
+ * Helpers to make lockdep happy with our recursive locking of default groups'
+ * inodes (see configfs_attach_group() and configfs_detach_group()).
+ * We put default groups i_mutexes in separate classes according to their depth
+ * from the youngest non-default group ancestor.
+ *
+ * For a non-default group A having default groups A/B, A/C, and A/C/D, default
+ * groups A/B and A/C will have their inode's mutex in class
+ * default_group_class[0], and default group A/C/D will be in
+ * default_group_class[1].
+ *
+ * The lock classes are declared and assigned in inode.c, according to the
+ * s_depth value.
+ * The s_depth value is initialized to -1, adjusted to >= 0 when attaching
+ * default groups, and reset to -1 when all default groups are attached. During
+ * attachment, if configfs_create() sees s_depth > 0, the lock class of the new
+ * inode's mutex is set to default_group_class[s_depth - 1].
+ */
+
+static void configfs_init_dirent_depth(struct configfs_dirent *sd)
+{
+	sd->s_depth = -1;
+}
+
+static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
+					  struct configfs_dirent *sd)
+{
+	int parent_depth = parent_sd->s_depth;
+
+	if (parent_depth >= 0)
+		sd->s_depth = parent_depth + 1;
+}
+
+static void
+configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
+{
+	/*
+	 * item's i_mutex class is already setup, so s_depth is now only
+	 * used to set new sub-directories s_depth, which is always done
+	 * with item's i_mutex locked.
+	 */
+	/*
+	 *  sd->s_depth == -1 iff we are a non default group.
+	 *  else (we are a default group) sd->s_depth > 0 (see
+	 *  create_dir()).
+	 */
+	if (sd->s_depth == -1)
+		/*
+		 * We are a non default group and we are going to create
+		 * default groups.
+		 */
+		sd->s_depth = 0;
+}
+
+static void
+configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
+{
+	/* We will not create default groups anymore. */
+	sd->s_depth = -1;
+}
+
+#else /* CONFIG_LOCKDEP */
+
+static void configfs_init_dirent_depth(struct configfs_dirent *sd)
+{
+}
+
+static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
+					  struct configfs_dirent *sd)
+{
+}
+
+static void
+configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
+{
+}
+
+static void
+configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
+{
+}
+
+#endif /* CONFIG_LOCKDEP */
+
+/*
+ * Allocates a new configfs_dirent and links it to the parent configfs_dirent
+ */
+static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
+						   void *element, int type)
+{
+	struct configfs_dirent * sd;
+
+	sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL);
+	if (!sd)
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&sd->s_count, 1);
+	INIT_LIST_HEAD(&sd->s_links);
+	INIT_LIST_HEAD(&sd->s_children);
+	sd->s_element = element;
+	sd->s_type = type;
+	configfs_init_dirent_depth(sd);
+	spin_lock(&configfs_dirent_lock);
+	if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
+		spin_unlock(&configfs_dirent_lock);
+		kmem_cache_free(configfs_dir_cachep, sd);
+		return ERR_PTR(-ENOENT);
+	}
+	list_add(&sd->s_sibling, &parent_sd->s_children);
+	spin_unlock(&configfs_dirent_lock);
+
+	return sd;
+}
+
+/*
+ *
+ * Return -EEXIST if there is already a configfs element with the same
+ * name for the same parent.
+ *
+ * called with parent inode's i_mutex held
+ */
+static int configfs_dirent_exists(struct configfs_dirent *parent_sd,
+				  const unsigned char *new)
+{
+	struct configfs_dirent * sd;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_element) {
+			const unsigned char *existing = configfs_get_name(sd);
+			if (strcmp(existing, new))
+				continue;
+			else
+				return -EEXIST;
+		}
+	}
+
+	return 0;
+}
+
+
+int configfs_make_dirent(struct configfs_dirent * parent_sd,
+			 struct dentry * dentry, void * element,
+			 umode_t mode, int type)
+{
+	struct configfs_dirent * sd;
+
+	sd = configfs_new_dirent(parent_sd, element, type);
+	if (IS_ERR(sd))
+		return PTR_ERR(sd);
+
+	sd->s_mode = mode;
+	sd->s_dentry = dentry;
+	if (dentry)
+		dentry->d_fsdata = configfs_get(sd);
+
+	return 0;
+}
+
+static void init_dir(struct inode * inode)
+{
+	inode->i_op = &configfs_dir_inode_operations;
+	inode->i_fop = &configfs_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inc_nlink(inode);
+}
+
+static void configfs_init_file(struct inode * inode)
+{
+	inode->i_size = PAGE_SIZE;
+	inode->i_fop = &configfs_file_operations;
+}
+
+static void init_symlink(struct inode * inode)
+{
+	inode->i_op = &configfs_symlink_inode_operations;
+}
+
+/**
+ *	configfs_create_dir - create a directory for an config_item.
+ *	@item:		config_itemwe're creating directory for.
+ *	@dentry:	config_item's dentry.
+ *
+ *	Note: user-created entries won't be allowed under this new directory
+ *	until it is validated by configfs_dir_set_ready()
+ */
+
+static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
+{
+	int error;
+	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+	struct dentry *p = dentry->d_parent;
+
+	BUG_ON(!item);
+
+	error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name);
+	if (unlikely(error))
+		return error;
+
+	error = configfs_make_dirent(p->d_fsdata, dentry, item, mode,
+				     CONFIGFS_DIR | CONFIGFS_USET_CREATING);
+	if (unlikely(error))
+		return error;
+
+	configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata);
+	error = configfs_create(dentry, mode, init_dir);
+	if (!error) {
+		inc_nlink(d_inode(p));
+		item->ci_dentry = dentry;
+	} else {
+		struct configfs_dirent *sd = dentry->d_fsdata;
+		if (sd) {
+			spin_lock(&configfs_dirent_lock);
+			list_del_init(&sd->s_sibling);
+			spin_unlock(&configfs_dirent_lock);
+			configfs_put(sd);
+		}
+	}
+	return error;
+}
+
+/*
+ * Allow userspace to create new entries under a new directory created with
+ * configfs_create_dir(), and under all of its chidlren directories recursively.
+ * @sd		configfs_dirent of the new directory to validate
+ *
+ * Caller must hold configfs_dirent_lock.
+ */
+static void configfs_dir_set_ready(struct configfs_dirent *sd)
+{
+	struct configfs_dirent *child_sd;
+
+	sd->s_type &= ~CONFIGFS_USET_CREATING;
+	list_for_each_entry(child_sd, &sd->s_children, s_sibling)
+		if (child_sd->s_type & CONFIGFS_USET_CREATING)
+			configfs_dir_set_ready(child_sd);
+}
+
+/*
+ * Check that a directory does not belong to a directory hierarchy being
+ * attached and not validated yet.
+ * @sd		configfs_dirent of the directory to check
+ *
+ * @return	non-zero iff the directory was validated
+ *
+ * Note: takes configfs_dirent_lock, so the result may change from false to true
+ * in two consecutive calls, but never from true to false.
+ */
+int configfs_dirent_is_ready(struct configfs_dirent *sd)
+{
+	int ret;
+
+	spin_lock(&configfs_dirent_lock);
+	ret = !(sd->s_type & CONFIGFS_USET_CREATING);
+	spin_unlock(&configfs_dirent_lock);
+
+	return ret;
+}
+
+int configfs_create_link(struct configfs_symlink *sl,
+			 struct dentry *parent,
+			 struct dentry *dentry)
+{
+	int err = 0;
+	umode_t mode = S_IFLNK | S_IRWXUGO;
+
+	err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
+				   CONFIGFS_ITEM_LINK);
+	if (!err) {
+		err = configfs_create(dentry, mode, init_symlink);
+		if (err) {
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			if (sd) {
+				spin_lock(&configfs_dirent_lock);
+				list_del_init(&sd->s_sibling);
+				spin_unlock(&configfs_dirent_lock);
+				configfs_put(sd);
+			}
+		}
+	}
+	return err;
+}
+
+static void remove_dir(struct dentry * d)
+{
+	struct dentry * parent = dget(d->d_parent);
+	struct configfs_dirent * sd;
+
+	sd = d->d_fsdata;
+	spin_lock(&configfs_dirent_lock);
+	list_del_init(&sd->s_sibling);
+	spin_unlock(&configfs_dirent_lock);
+	configfs_put(sd);
+	if (d_really_is_positive(d))
+		simple_rmdir(d_inode(parent),d);
+
+	pr_debug(" o %pd removing done (%d)\n", d, d_count(d));
+
+	dput(parent);
+}
+
+/**
+ * configfs_remove_dir - remove an config_item's directory.
+ * @item:	config_item we're removing.
+ *
+ * The only thing special about this is that we remove any files in
+ * the directory before we remove the directory, and we've inlined
+ * what used to be configfs_rmdir() below, instead of calling separately.
+ *
+ * Caller holds the mutex of the item's inode
+ */
+
+static void configfs_remove_dir(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+
+	if (!dentry)
+		return;
+
+	remove_dir(dentry);
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+
+/* attaches attribute's configfs_dirent to the dentry corresponding to the
+ * attribute file
+ */
+static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
+{
+	struct configfs_attribute * attr = sd->s_element;
+	int error;
+
+	spin_lock(&configfs_dirent_lock);
+	dentry->d_fsdata = configfs_get(sd);
+	sd->s_dentry = dentry;
+	spin_unlock(&configfs_dirent_lock);
+
+	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
+				configfs_init_file);
+	if (error) {
+		configfs_put(sd);
+		return error;
+	}
+
+	d_rehash(dentry);
+
+	return 0;
+}
+
+static struct dentry * configfs_lookup(struct inode *dir,
+				       struct dentry *dentry,
+				       unsigned int flags)
+{
+	struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
+	struct configfs_dirent * sd;
+	int found = 0;
+	int err;
+
+	/*
+	 * Fake invisibility if dir belongs to a group/default groups hierarchy
+	 * being attached
+	 *
+	 * This forbids userspace to read/write attributes of items which may
+	 * not complete their initialization, since the dentries of the
+	 * attributes won't be instantiated.
+	 */
+	err = -ENOENT;
+	if (!configfs_dirent_is_ready(parent_sd))
+		goto out;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED) {
+			const unsigned char * name = configfs_get_name(sd);
+
+			if (strcmp(name, dentry->d_name.name))
+				continue;
+
+			found = 1;
+			err = configfs_attach_attr(sd, dentry);
+			break;
+		}
+	}
+
+	if (!found) {
+		/*
+		 * If it doesn't exist and it isn't a NOT_PINNED item,
+		 * it must be negative.
+		 */
+		if (dentry->d_name.len > NAME_MAX)
+			return ERR_PTR(-ENAMETOOLONG);
+		d_add(dentry, NULL);
+		return NULL;
+	}
+
+out:
+	return ERR_PTR(err);
+}
+
+/*
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes and are removed by rmdir().  We recurse, setting
+ * CONFIGFS_USET_DROPPING on all children that are candidates for
+ * default detach.
+ * If there is an error, the caller will reset the flags via
+ * configfs_detach_rollback().
+ */
+static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+	int ret;
+
+	/* Mark that we're trying to drop the group */
+	parent_sd->s_type |= CONFIGFS_USET_DROPPING;
+
+	ret = -EBUSY;
+	if (!list_empty(&parent_sd->s_links))
+		goto out;
+
+	ret = 0;
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element ||
+		    (sd->s_type & CONFIGFS_NOT_PINNED))
+			continue;
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			/* Abort if racing with mkdir() */
+			if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
+				if (wait_mutex)
+					*wait_mutex = &d_inode(sd->s_dentry)->i_mutex;
+				return -EAGAIN;
+			}
+
+			/*
+			 * Yup, recursive.  If there's a problem, blame
+			 * deep nesting of default_groups
+			 */
+			ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
+			if (!ret)
+				continue;
+		} else
+			ret = -ENOTEMPTY;
+
+		break;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was
+ * set.
+ */
+static void configfs_detach_rollback(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+
+	parent_sd->s_type &= ~CONFIGFS_USET_DROPPING;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling)
+		if (sd->s_type & CONFIGFS_USET_DEFAULT)
+			configfs_detach_rollback(sd->s_dentry);
+}
+
+static void detach_attrs(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+	struct configfs_dirent * parent_sd;
+	struct configfs_dirent * sd, * tmp;
+
+	if (!dentry)
+		return;
+
+	pr_debug("configfs %s: dropping attrs for  dir\n",
+		 dentry->d_name.name);
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
+			continue;
+		spin_lock(&configfs_dirent_lock);
+		list_del_init(&sd->s_sibling);
+		spin_unlock(&configfs_dirent_lock);
+		configfs_drop_dentry(sd, dentry);
+		configfs_put(sd);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+static int populate_attrs(struct config_item *item)
+{
+	struct config_item_type *t = item->ci_type;
+	struct configfs_attribute *attr;
+	int error = 0;
+	int i;
+
+	if (!t)
+		return -EINVAL;
+	if (t->ct_attrs) {
+		for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+			if ((error = configfs_create_file(item, attr)))
+				break;
+		}
+	}
+
+	if (error)
+		detach_attrs(item);
+
+	return error;
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry);
+static void configfs_detach_group(struct config_item *item);
+
+static void detach_groups(struct config_group *group)
+{
+	struct dentry * dentry = dget(group->cg_item.ci_dentry);
+	struct dentry *child;
+	struct configfs_dirent *parent_sd;
+	struct configfs_dirent *sd, *tmp;
+
+	if (!dentry)
+		return;
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element ||
+		    !(sd->s_type & CONFIGFS_USET_DEFAULT))
+			continue;
+
+		child = sd->s_dentry;
+
+		mutex_lock(&d_inode(child)->i_mutex);
+
+		configfs_detach_group(sd->s_element);
+		d_inode(child)->i_flags |= S_DEAD;
+		dont_mount(child);
+
+		mutex_unlock(&d_inode(child)->i_mutex);
+
+		d_delete(child);
+		dput(child);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+/*
+ * This fakes mkdir(2) on a default_groups[] entry.  It
+ * creates a dentry, attachs it, and then does fixup
+ * on the sd->s_type.
+ *
+ * We could, perhaps, tweak our parent's ->mkdir for a minute and
+ * try using vfs_mkdir.  Just a thought.
+ */
+static int create_default_group(struct config_group *parent_group,
+				struct config_group *group)
+{
+	int ret;
+	struct configfs_dirent *sd;
+	/* We trust the caller holds a reference to parent */
+	struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+
+	ret = -ENOMEM;
+	child = d_alloc_name(parent, group->cg_item.ci_name);
+	if (child) {
+		d_add(child, NULL);
+
+		ret = configfs_attach_group(&parent_group->cg_item,
+					    &group->cg_item, child);
+		if (!ret) {
+			sd = child->d_fsdata;
+			sd->s_type |= CONFIGFS_USET_DEFAULT;
+		} else {
+			BUG_ON(d_inode(child));
+			d_drop(child);
+			dput(child);
+		}
+	}
+
+	return ret;
+}
+
+static int populate_groups(struct config_group *group)
+{
+	struct config_group *new_group;
+	int ret = 0;
+	int i;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+
+			ret = create_default_group(group, new_group);
+			if (ret) {
+				detach_groups(group);
+				break;
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_mutex is held.
+ */
+
+static void unlink_obj(struct config_item *item)
+{
+	struct config_group *group;
+
+	group = item->ci_group;
+	if (group) {
+		list_del_init(&item->ci_entry);
+
+		item->ci_group = NULL;
+		item->ci_parent = NULL;
+
+		/* Drop the reference for ci_entry */
+		config_item_put(item);
+
+		/* Drop the reference for ci_parent */
+		config_group_put(group);
+	}
+}
+
+static void link_obj(struct config_item *parent_item, struct config_item *item)
+{
+	/*
+	 * Parent seems redundant with group, but it makes certain
+	 * traversals much nicer.
+	 */
+	item->ci_parent = parent_item;
+
+	/*
+	 * We hold a reference on the parent for the child's ci_parent
+	 * link.
+	 */
+	item->ci_group = config_group_get(to_config_group(parent_item));
+	list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
+
+	/*
+	 * We hold a reference on the child for ci_entry on the parent's
+	 * cg_children
+	 */
+	config_item_get(item);
+}
+
+static void unlink_group(struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			unlink_group(new_group);
+		}
+	}
+
+	group->cg_subsys = NULL;
+	unlink_obj(&group->cg_item);
+}
+
+static void link_group(struct config_group *parent_group, struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+	struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
+
+	link_obj(&parent_group->cg_item, &group->cg_item);
+
+	if (parent_group->cg_subsys)
+		subsys = parent_group->cg_subsys;
+	else if (configfs_is_root(&parent_group->cg_item))
+		subsys = to_configfs_subsystem(group);
+	else
+		BUG();
+	group->cg_subsys = subsys;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			link_group(group, new_group);
+		}
+	}
+}
+
+/*
+ * The goal is that configfs_attach_item() (and
+ * configfs_attach_group()) can be called from either the VFS or this
+ * module.  That is, they assume that the items have been created,
+ * the dentry allocated, and the dcache is all ready to go.
+ *
+ * If they fail, they must clean up after themselves as if they
+ * had never been called.  The caller (VFS or local function) will
+ * handle cleaning up the dcache bits.
+ *
+ * configfs_detach_group() and configfs_detach_item() behave similarly on
+ * the way out.  They assume that the proper semaphores are held, they
+ * clean up the configfs items, and they expect their callers will
+ * handle the dcache bits.
+ */
+static int configfs_attach_item(struct config_item *parent_item,
+				struct config_item *item,
+				struct dentry *dentry)
+{
+	int ret;
+
+	ret = configfs_create_dir(item, dentry);
+	if (!ret) {
+		ret = populate_attrs(item);
+		if (ret) {
+			/*
+			 * We are going to remove an inode and its dentry but
+			 * the VFS may already have hit and used them. Thus,
+			 * we must lock them as rmdir() would.
+			 */
+			mutex_lock(&d_inode(dentry)->i_mutex);
+			configfs_remove_dir(item);
+			d_inode(dentry)->i_flags |= S_DEAD;
+			dont_mount(dentry);
+			mutex_unlock(&d_inode(dentry)->i_mutex);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+/* Caller holds the mutex of the item's inode */
+static void configfs_detach_item(struct config_item *item)
+{
+	detach_attrs(item);
+	configfs_remove_dir(item);
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry)
+{
+	int ret;
+	struct configfs_dirent *sd;
+
+	ret = configfs_attach_item(parent_item, item, dentry);
+	if (!ret) {
+		sd = dentry->d_fsdata;
+		sd->s_type |= CONFIGFS_USET_DIR;
+
+		/*
+		 * FYI, we're faking mkdir in populate_groups()
+		 * We must lock the group's inode to avoid races with the VFS
+		 * which can already hit the inode and try to add/remove entries
+		 * under it.
+		 *
+		 * We must also lock the inode to remove it safely in case of
+		 * error, as rmdir() would.
+		 */
+		mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+		configfs_adjust_dir_dirent_depth_before_populate(sd);
+		ret = populate_groups(to_config_group(item));
+		if (ret) {
+			configfs_detach_item(item);
+			d_inode(dentry)->i_flags |= S_DEAD;
+			dont_mount(dentry);
+		}
+		configfs_adjust_dir_dirent_depth_after_populate(sd);
+		mutex_unlock(&d_inode(dentry)->i_mutex);
+		if (ret)
+			d_delete(dentry);
+	}
+
+	return ret;
+}
+
+/* Caller holds the mutex of the group's inode */
+static void configfs_detach_group(struct config_item *item)
+{
+	detach_groups(to_config_group(item));
+	configfs_detach_item(item);
+}
+
+/*
+ * After the item has been detached from the filesystem view, we are
+ * ready to tear it out of the hierarchy.  Notify the client before
+ * we do that so they can perform any cleanup that requires
+ * navigating the hierarchy.  A client does not need to provide this
+ * callback.  The subsystem semaphore MUST be held by the caller, and
+ * references must be valid for both items.  It also assumes the
+ * caller has validated ci_type.
+ */
+static void client_disconnect_notify(struct config_item *parent_item,
+				     struct config_item *item)
+{
+	struct config_item_type *type;
+
+	type = parent_item->ci_type;
+	BUG_ON(!type);
+
+	if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
+		type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
+						      item);
+}
+
+/*
+ * Drop the initial reference from make_item()/make_group()
+ * This function assumes that reference is held on item
+ * and that item holds a valid reference to the parent.  Also, it
+ * assumes the caller has validated ci_type.
+ */
+static void client_drop_item(struct config_item *parent_item,
+			     struct config_item *item)
+{
+	struct config_item_type *type;
+
+	type = parent_item->ci_type;
+	BUG_ON(!type);
+
+	/*
+	 * If ->drop_item() exists, it is responsible for the
+	 * config_item_put().
+	 */
+	if (type->ct_group_ops && type->ct_group_ops->drop_item)
+		type->ct_group_ops->drop_item(to_config_group(parent_item),
+					      item);
+	else
+		config_item_put(item);
+}
+
+#ifdef DEBUG
+static void configfs_dump_one(struct configfs_dirent *sd, int level)
+{
+	pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd));
+
+#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type);
+	type_print(CONFIGFS_ROOT);
+	type_print(CONFIGFS_DIR);
+	type_print(CONFIGFS_ITEM_ATTR);
+	type_print(CONFIGFS_ITEM_LINK);
+	type_print(CONFIGFS_USET_DIR);
+	type_print(CONFIGFS_USET_DEFAULT);
+	type_print(CONFIGFS_USET_DROPPING);
+#undef type_print
+}
+
+static int configfs_dump(struct configfs_dirent *sd, int level)
+{
+	struct configfs_dirent *child_sd;
+	int ret = 0;
+
+	configfs_dump_one(sd, level);
+
+	if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
+		return 0;
+
+	list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+		ret = configfs_dump(child_sd, level + 2);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+#endif
+
+
+/*
+ * configfs_depend_item() and configfs_undepend_item()
+ *
+ * WARNING: Do not call these from a configfs callback!
+ *
+ * This describes these functions and their helpers.
+ *
+ * Allow another kernel system to depend on a config_item.  If this
+ * happens, the item cannot go away until the dependent can live without
+ * it.  The idea is to give client modules as simple an interface as
+ * possible.  When a system asks them to depend on an item, they just
+ * call configfs_depend_item().  If the item is live and the client
+ * driver is in good shape, we'll happily do the work for them.
+ *
+ * Why is the locking complex?  Because configfs uses the VFS to handle
+ * all locking, but this function is called outside the normal
+ * VFS->configfs path.  So it must take VFS locks to prevent the
+ * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc).  This is
+ * why you can't call these functions underneath configfs callbacks.
+ *
+ * Note, btw, that this can be called at *any* time, even when a configfs
+ * subsystem isn't registered, or when configfs is loading or unloading.
+ * Just like configfs_register_subsystem().  So we take the same
+ * precautions.  We pin the filesystem.  We lock configfs_dirent_lock.
+ * If we can find the target item in the
+ * configfs tree, it must be part of the subsystem tree as well, so we
+ * do not need the subsystem semaphore.  Holding configfs_dirent_lock helps
+ * locking out mkdir() and rmdir(), who might be racing us.
+ */
+
+/*
+ * configfs_depend_prep()
+ *
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes.  This is similar but not the same to configfs_detach_prep().
+ * Note that configfs_detach_prep() expects the parent to be locked when it
+ * is called, but we lock the parent *inside* configfs_depend_prep().  We
+ * do that so we can unlock it if we find nothing.
+ *
+ * Here we do a depth-first search of the dentry hierarchy looking for
+ * our object.
+ * We deliberately ignore items tagged as dropping since they are virtually
+ * dead, as well as items in the middle of attachment since they virtually
+ * do not exist yet. This completes the locking out of racing mkdir() and
+ * rmdir().
+ * Note: subdirectories in the middle of attachment start with s_type =
+ * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir().  When
+ * CONFIGFS_USET_CREATING is set, we ignore the item.  The actual set of
+ * s_type is in configfs_new_dirent(), which has configfs_dirent_lock.
+ *
+ * If the target is not found, -ENOENT is bubbled up.
+ *
+ * This adds a requirement that all config_items be unique!
+ *
+ * This is recursive.  There isn't
+ * much on the stack, though, so folks that need this function - be careful
+ * about your stack!  Patches will be accepted to make it iterative.
+ */
+static int configfs_depend_prep(struct dentry *origin,
+				struct config_item *target)
+{
+	struct configfs_dirent *child_sd, *sd;
+	int ret = 0;
+
+	BUG_ON(!origin || !origin->d_fsdata);
+	sd = origin->d_fsdata;
+
+	if (sd->s_element == target)  /* Boo-yah */
+		goto out;
+
+	list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+		if ((child_sd->s_type & CONFIGFS_DIR) &&
+		    !(child_sd->s_type & CONFIGFS_USET_DROPPING) &&
+		    !(child_sd->s_type & CONFIGFS_USET_CREATING)) {
+			ret = configfs_depend_prep(child_sd->s_dentry,
+						   target);
+			if (!ret)
+				goto out;  /* Child path boo-yah */
+		}
+	}
+
+	/* We looped all our children and didn't find target */
+	ret = -ENOENT;
+
+out:
+	return ret;
+}
+
+int configfs_depend_item(struct configfs_subsystem *subsys,
+			 struct config_item *target)
+{
+	int ret;
+	struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+	struct config_item *s_item = &subsys->su_group.cg_item;
+	struct dentry *root;
+
+	/*
+	 * Pin the configfs filesystem.  This means we can safely access
+	 * the root of the configfs filesystem.
+	 */
+	root = configfs_pin_fs();
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	/*
+	 * Next, lock the root directory.  We're going to check that the
+	 * subsystem is really registered, and so we need to lock out
+	 * configfs_[un]register_subsystem().
+	 */
+	mutex_lock(&d_inode(root)->i_mutex);
+
+	root_sd = root->d_fsdata;
+
+	list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+		if (p->s_type & CONFIGFS_DIR) {
+			if (p->s_element == s_item) {
+				subsys_sd = p;
+				break;
+			}
+		}
+	}
+
+	if (!subsys_sd) {
+		ret = -ENOENT;
+		goto out_unlock_fs;
+	}
+
+	/* Ok, now we can trust subsys/s_item */
+
+	spin_lock(&configfs_dirent_lock);
+	/* Scan the tree, return 0 if found */
+	ret = configfs_depend_prep(subsys_sd->s_dentry, target);
+	if (ret)
+		goto out_unlock_dirent_lock;
+
+	/*
+	 * We are sure that the item is not about to be removed by rmdir(), and
+	 * not in the middle of attachment by mkdir().
+	 */
+	p = target->ci_dentry->d_fsdata;
+	p->s_dependent_count += 1;
+
+out_unlock_dirent_lock:
+	spin_unlock(&configfs_dirent_lock);
+out_unlock_fs:
+	mutex_unlock(&d_inode(root)->i_mutex);
+
+	/*
+	 * If we succeeded, the fs is pinned via other methods.  If not,
+	 * we're done with it anyway.  So release_fs() is always right.
+	 */
+	configfs_release_fs();
+
+	return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item);
+
+/*
+ * Release the dependent linkage.  This is much simpler than
+ * configfs_depend_item() because we know that that the client driver is
+ * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
+ */
+void configfs_undepend_item(struct configfs_subsystem *subsys,
+			    struct config_item *target)
+{
+	struct configfs_dirent *sd;
+
+	/*
+	 * Since we can trust everything is pinned, we just need
+	 * configfs_dirent_lock.
+	 */
+	spin_lock(&configfs_dirent_lock);
+
+	sd = target->ci_dentry->d_fsdata;
+	BUG_ON(sd->s_dependent_count < 1);
+
+	sd->s_dependent_count -= 1;
+
+	/*
+	 * After this unlock, we cannot trust the item to stay alive!
+	 * DO NOT REFERENCE item after this unlock.
+	 */
+	spin_unlock(&configfs_dirent_lock);
+}
+EXPORT_SYMBOL(configfs_undepend_item);
+
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int ret = 0;
+	int module_got = 0;
+	struct config_group *group = NULL;
+	struct config_item *item = NULL;
+	struct config_item *parent_item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct config_item_type *type;
+	struct module *subsys_owner = NULL, *new_item_owner = NULL;
+	char *name;
+
+	sd = dentry->d_parent->d_fsdata;
+
+	/*
+	 * Fake invisibility if dir belongs to a group/default groups hierarchy
+	 * being attached
+	 */
+	if (!configfs_dirent_is_ready(sd)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (!(sd->s_type & CONFIGFS_USET_DIR)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	/* Get a working ref for the duration of this function */
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!type || !type->ct_group_ops ||
+	    (!type->ct_group_ops->make_group &&
+	     !type->ct_group_ops->make_item)) {
+		ret = -EPERM;  /* Lack-of-mkdir returns -EPERM */
+		goto out_put;
+	}
+
+	/*
+	 * The subsystem may belong to a different module than the item
+	 * being created.  We don't want to safely pin the new item but
+	 * fail to pin the subsystem it sits under.
+	 */
+	if (!subsys->su_group.cg_item.ci_type) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+	subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+	if (!try_module_get(subsys_owner)) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
+	if (!name) {
+		ret = -ENOMEM;
+		goto out_subsys_put;
+	}
+
+	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
+
+	mutex_lock(&subsys->su_mutex);
+	if (type->ct_group_ops->make_group) {
+		group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
+		if (!group)
+			group = ERR_PTR(-ENOMEM);
+		if (!IS_ERR(group)) {
+			link_group(to_config_group(parent_item), group);
+			item = &group->cg_item;
+		} else
+			ret = PTR_ERR(group);
+	} else {
+		item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
+		if (!item)
+			item = ERR_PTR(-ENOMEM);
+		if (!IS_ERR(item))
+			link_obj(parent_item, item);
+		else
+			ret = PTR_ERR(item);
+	}
+	mutex_unlock(&subsys->su_mutex);
+
+	kfree(name);
+	if (ret) {
+		/*
+		 * If ret != 0, then link_obj() was never called.
+		 * There are no extra references to clean up.
+		 */
+		goto out_subsys_put;
+	}
+
+	/*
+	 * link_obj() has been called (via link_group() for groups).
+	 * From here on out, errors must clean that up.
+	 */
+
+	type = item->ci_type;
+	if (!type) {
+		ret = -EINVAL;
+		goto out_unlink;
+	}
+
+	new_item_owner = type->ct_owner;
+	if (!try_module_get(new_item_owner)) {
+		ret = -EINVAL;
+		goto out_unlink;
+	}
+
+	/*
+	 * I hate doing it this way, but if there is
+	 * an error,  module_put() probably should
+	 * happen after any cleanup.
+	 */
+	module_got = 1;
+
+	/*
+	 * Make racing rmdir() fail if it did not tag parent with
+	 * CONFIGFS_USET_DROPPING
+	 * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will
+	 * fail and let rmdir() terminate correctly
+	 */
+	spin_lock(&configfs_dirent_lock);
+	/* This will make configfs_detach_prep() fail */
+	sd->s_type |= CONFIGFS_USET_IN_MKDIR;
+	spin_unlock(&configfs_dirent_lock);
+
+	if (group)
+		ret = configfs_attach_group(parent_item, item, dentry);
+	else
+		ret = configfs_attach_item(parent_item, item, dentry);
+
+	spin_lock(&configfs_dirent_lock);
+	sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
+	if (!ret)
+		configfs_dir_set_ready(dentry->d_fsdata);
+	spin_unlock(&configfs_dirent_lock);
+
+out_unlink:
+	if (ret) {
+		/* Tear down everything we built up */
+		mutex_lock(&subsys->su_mutex);
+
+		client_disconnect_notify(parent_item, item);
+		if (group)
+			unlink_group(group);
+		else
+			unlink_obj(item);
+		client_drop_item(parent_item, item);
+
+		mutex_unlock(&subsys->su_mutex);
+
+		if (module_got)
+			module_put(new_item_owner);
+	}
+
+out_subsys_put:
+	if (ret)
+		module_put(subsys_owner);
+
+out_put:
+	/*
+	 * link_obj()/link_group() took a reference from child->parent,
+	 * so the parent is safely pinned.  We can drop our working
+	 * reference.
+	 */
+	config_item_put(parent_item);
+
+out:
+	return ret;
+}
+
+static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct config_item *parent_item;
+	struct config_item *item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct module *subsys_owner = NULL, *dead_item_owner = NULL;
+	int ret;
+
+	sd = dentry->d_fsdata;
+	if (sd->s_type & CONFIGFS_USET_DEFAULT)
+		return -EPERM;
+
+	/* Get a working ref until we have the child */
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!parent_item->ci_type) {
+		config_item_put(parent_item);
+		return -EINVAL;
+	}
+
+	/* configfs_mkdir() shouldn't have allowed this */
+	BUG_ON(!subsys->su_group.cg_item.ci_type);
+	subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+
+	/*
+	 * Ensure that no racing symlink() will make detach_prep() fail while
+	 * the new link is temporarily attached
+	 */
+	do {
+		struct mutex *wait_mutex;
+
+		mutex_lock(&configfs_symlink_mutex);
+		spin_lock(&configfs_dirent_lock);
+		/*
+		 * Here's where we check for dependents.  We're protected by
+		 * configfs_dirent_lock.
+		 * If no dependent, atomically tag the item as dropping.
+		 */
+		ret = sd->s_dependent_count ? -EBUSY : 0;
+		if (!ret) {
+			ret = configfs_detach_prep(dentry, &wait_mutex);
+			if (ret)
+				configfs_detach_rollback(dentry);
+		}
+		spin_unlock(&configfs_dirent_lock);
+		mutex_unlock(&configfs_symlink_mutex);
+
+		if (ret) {
+			if (ret != -EAGAIN) {
+				config_item_put(parent_item);
+				return ret;
+			}
+
+			/* Wait until the racing operation terminates */
+			mutex_lock(wait_mutex);
+			mutex_unlock(wait_mutex);
+		}
+	} while (ret == -EAGAIN);
+
+	/* Get a working ref for the duration of this function */
+	item = configfs_get_config_item(dentry);
+
+	/* Drop reference from above, item already holds one. */
+	config_item_put(parent_item);
+
+	if (item->ci_type)
+		dead_item_owner = item->ci_type->ct_owner;
+
+	if (sd->s_type & CONFIGFS_USET_DIR) {
+		configfs_detach_group(item);
+
+		mutex_lock(&subsys->su_mutex);
+		client_disconnect_notify(parent_item, item);
+		unlink_group(to_config_group(item));
+	} else {
+		configfs_detach_item(item);
+
+		mutex_lock(&subsys->su_mutex);
+		client_disconnect_notify(parent_item, item);
+		unlink_obj(item);
+	}
+
+	client_drop_item(parent_item, item);
+	mutex_unlock(&subsys->su_mutex);
+
+	/* Drop our reference from above */
+	config_item_put(item);
+
+	module_put(dead_item_owner);
+	module_put(subsys_owner);
+
+	return 0;
+}
+
+const struct inode_operations configfs_dir_inode_operations = {
+	.mkdir		= configfs_mkdir,
+	.rmdir		= configfs_rmdir,
+	.symlink	= configfs_symlink,
+	.unlink		= configfs_unlink,
+	.lookup		= configfs_lookup,
+	.setattr	= configfs_setattr,
+};
+
+const struct inode_operations configfs_root_inode_operations = {
+	.lookup		= configfs_lookup,
+	.setattr	= configfs_setattr,
+};
+
+#if 0
+int configfs_rename_dir(struct config_item * item, const char *new_name)
+{
+	int error = 0;
+	struct dentry * new_dentry, * parent;
+
+	if (!strcmp(config_item_name(item), new_name))
+		return -EINVAL;
+
+	if (!item->parent)
+		return -EINVAL;
+
+	down_write(&configfs_rename_sem);
+	parent = item->parent->dentry;
+
+	mutex_lock(&d_inode(parent)->i_mutex);
+
+	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
+	if (!IS_ERR(new_dentry)) {
+		if (d_really_is_negative(new_dentry)) {
+			error = config_item_set_name(item, "%s", new_name);
+			if (!error) {
+				d_add(new_dentry, NULL);
+				d_move(item->dentry, new_dentry);
+			}
+			else
+				d_delete(new_dentry);
+		} else
+			error = -EEXIST;
+		dput(new_dentry);
+	}
+	mutex_unlock(&d_inode(parent)->i_mutex);
+	up_write(&configfs_rename_sem);
+
+	return error;
+}
+#endif
+
+static int configfs_dir_open(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_path.dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+	int err;
+
+	mutex_lock(&d_inode(dentry)->i_mutex);
+	/*
+	 * Fake invisibility if dir belongs to a group/default groups hierarchy
+	 * being attached
+	 */
+	err = -ENOENT;
+	if (configfs_dirent_is_ready(parent_sd)) {
+		file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
+		if (IS_ERR(file->private_data))
+			err = PTR_ERR(file->private_data);
+		else
+			err = 0;
+	}
+	mutex_unlock(&d_inode(dentry)->i_mutex);
+
+	return err;
+}
+
+static int configfs_dir_close(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_path.dentry;
+	struct configfs_dirent * cursor = file->private_data;
+
+	mutex_lock(&d_inode(dentry)->i_mutex);
+	spin_lock(&configfs_dirent_lock);
+	list_del_init(&cursor->s_sibling);
+	spin_unlock(&configfs_dirent_lock);
+	mutex_unlock(&d_inode(dentry)->i_mutex);
+
+	release_configfs_dirent(cursor);
+
+	return 0;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct configfs_dirent *sd)
+{
+	return (sd->s_mode >> 12) & 15;
+}
+
+static int configfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct super_block *sb = dentry->d_sb;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *cursor = file->private_data;
+	struct list_head *p, *q = &cursor->s_sibling;
+	ino_t ino = 0;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	if (ctx->pos == 2) {
+		spin_lock(&configfs_dirent_lock);
+		list_move(q, &parent_sd->s_children);
+		spin_unlock(&configfs_dirent_lock);
+	}
+	for (p = q->next; p != &parent_sd->s_children; p = p->next) {
+		struct configfs_dirent *next;
+		const char *name;
+		int len;
+		struct inode *inode = NULL;
+
+		next = list_entry(p, struct configfs_dirent, s_sibling);
+		if (!next->s_element)
+			continue;
+
+		name = configfs_get_name(next);
+		len = strlen(name);
+
+		/*
+		 * We'll have a dentry and an inode for
+		 * PINNED items and for open attribute
+		 * files.  We lock here to prevent a race
+		 * with configfs_d_iput() clearing
+		 * s_dentry before calling iput().
+		 *
+		 * Why do we go to the trouble?  If
+		 * someone has an attribute file open,
+		 * the inode number should match until
+		 * they close it.  Beyond that, we don't
+		 * care.
+		 */
+		spin_lock(&configfs_dirent_lock);
+		dentry = next->s_dentry;
+		if (dentry)
+			inode = d_inode(dentry);
+		if (inode)
+			ino = inode->i_ino;
+		spin_unlock(&configfs_dirent_lock);
+		if (!inode)
+			ino = iunique(sb, 2);
+
+		if (!dir_emit(ctx, name, len, ino, dt_type(next)))
+			return 0;
+
+		spin_lock(&configfs_dirent_lock);
+		list_move(q, p);
+		spin_unlock(&configfs_dirent_lock);
+		p = q;
+		ctx->pos++;
+	}
+	return 0;
+}
+
+static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
+{
+	struct dentry * dentry = file->f_path.dentry;
+
+	mutex_lock(&d_inode(dentry)->i_mutex);
+	switch (whence) {
+		case 1:
+			offset += file->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			mutex_unlock(&d_inode(dentry)->i_mutex);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			struct configfs_dirent *cursor = file->private_data;
+			struct list_head *p;
+			loff_t n = file->f_pos - 2;
+
+			spin_lock(&configfs_dirent_lock);
+			list_del(&cursor->s_sibling);
+			p = sd->s_children.next;
+			while (n && p != &sd->s_children) {
+				struct configfs_dirent *next;
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (next->s_element)
+					n--;
+				p = p->next;
+			}
+			list_add_tail(&cursor->s_sibling, p);
+			spin_unlock(&configfs_dirent_lock);
+		}
+	}
+	mutex_unlock(&d_inode(dentry)->i_mutex);
+	return offset;
+}
+
+const struct file_operations configfs_dir_operations = {
+	.open		= configfs_dir_open,
+	.release	= configfs_dir_close,
+	.llseek		= configfs_dir_lseek,
+	.read		= generic_read_dir,
+	.iterate	= configfs_readdir,
+};
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys)
+{
+	int err;
+	struct config_group *group = &subsys->su_group;
+	struct dentry *dentry;
+	struct dentry *root;
+	struct configfs_dirent *sd;
+
+	root = configfs_pin_fs();
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+
+	sd = root->d_fsdata;
+	link_group(to_config_group(sd->s_element), group);
+
+	mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT);
+
+	err = -ENOMEM;
+	dentry = d_alloc_name(root, group->cg_item.ci_name);
+	if (dentry) {
+		d_add(dentry, NULL);
+
+		err = configfs_attach_group(sd->s_element, &group->cg_item,
+					    dentry);
+		if (err) {
+			BUG_ON(d_inode(dentry));
+			d_drop(dentry);
+			dput(dentry);
+		} else {
+			spin_lock(&configfs_dirent_lock);
+			configfs_dir_set_ready(dentry->d_fsdata);
+			spin_unlock(&configfs_dirent_lock);
+		}
+	}
+
+	mutex_unlock(&d_inode(root)->i_mutex);
+
+	if (err) {
+		unlink_group(group);
+		configfs_release_fs();
+	}
+
+	return err;
+}
+
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
+{
+	struct config_group *group = &subsys->su_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+	struct dentry *root = dentry->d_sb->s_root;
+
+	if (dentry->d_parent != root) {
+		pr_err("Tried to unregister non-subsystem!\n");
+		return;
+	}
+
+	mutex_lock_nested(&d_inode(root)->i_mutex,
+			  I_MUTEX_PARENT);
+	mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+	mutex_lock(&configfs_symlink_mutex);
+	spin_lock(&configfs_dirent_lock);
+	if (configfs_detach_prep(dentry, NULL)) {
+		pr_err("Tried to unregister non-empty subsystem!\n");
+	}
+	spin_unlock(&configfs_dirent_lock);
+	mutex_unlock(&configfs_symlink_mutex);
+	configfs_detach_group(&group->cg_item);
+	d_inode(dentry)->i_flags |= S_DEAD;
+	dont_mount(dentry);
+	mutex_unlock(&d_inode(dentry)->i_mutex);
+
+	d_delete(dentry);
+
+	mutex_unlock(&d_inode(root)->i_mutex);
+
+	dput(dentry);
+
+	unlink_group(group);
+	configfs_release_fs();
+}
+
+EXPORT_SYMBOL(configfs_register_subsystem);
+EXPORT_SYMBOL(configfs_unregister_subsystem);
diff --git a/kernel/fs/configfs/file.c b/kernel/fs/configfs/file.c
new file mode 100644
index 000000000..403269ffc
--- /dev/null
+++ b/kernel/fs/configfs/file.c
@@ -0,0 +1,336 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c - operations for regular (text) files.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/*
+ * A simple attribute can only be 4096 characters.  Why 4k?  Because the
+ * original code limited it to PAGE_SIZE.  That's a bad idea, though,
+ * because an attribute of 16k on ia64 won't work on x86.  So we limit to
+ * 4k, our minimum common page size.
+ */
+#define SIMPLE_ATTR_SIZE 4096
+
+struct configfs_buffer {
+	size_t			count;
+	loff_t			pos;
+	char			* page;
+	struct configfs_item_operations	* ops;
+	struct mutex		mutex;
+	int			needs_read_fill;
+};
+
+
+/**
+ *	fill_read_buffer - allocate and fill buffer from item.
+ *	@dentry:	dentry pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Allocate @buffer->page, if it hasn't been already, then call the
+ *	config_item's show() method to fill the buffer with this attribute's
+ *	data.
+ *	This is called only once, on the file's first read.
+ */
+static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+	int ret = 0;
+	ssize_t count;
+
+	if (!buffer->page)
+		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	count = ops->show_attribute(item,attr,buffer->page);
+	buffer->needs_read_fill = 0;
+	BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
+	if (count >= 0)
+		buffer->count = count;
+	else
+		ret = count;
+	return ret;
+}
+
+/**
+ *	configfs_read_file - read an attribute.
+ *	@file:	file pointer.
+ *	@buf:	buffer to fill.
+ *	@count:	number of bytes to read.
+ *	@ppos:	starting offset in file.
+ *
+ *	Userspace wants to read an attribute file. The attribute descriptor
+ *	is in the file's ->d_fsdata. The target item is in the directory's
+ *	->d_fsdata.
+ *
+ *	We call fill_read_buffer() to allocate and fill the buffer from the
+ *	item's show() method exactly once (if the read is happening from
+ *	the beginning of the file). That should fill the entire buffer with
+ *	all the data the item has to offer for that attribute.
+ *	We then call flush_read_buffer() to copy the buffer to userspace
+ *	in the increments specified.
+ */
+
+static ssize_t
+configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+	ssize_t retval = 0;
+
+	mutex_lock(&buffer->mutex);
+	if (buffer->needs_read_fill) {
+		if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
+			goto out;
+	}
+	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
+		 __func__, count, *ppos, buffer->page);
+	retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
+					 buffer->count);
+out:
+	mutex_unlock(&buffer->mutex);
+	return retval;
+}
+
+
+/**
+ *	fill_write_buffer - copy buffer from userspace.
+ *	@buffer:	data buffer for file.
+ *	@buf:		data from user.
+ *	@count:		number of bytes in @userbuf.
+ *
+ *	Allocate @buffer->page if it hasn't been already, then
+ *	copy the user-supplied buffer into it.
+ */
+
+static int
+fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+{
+	int error;
+
+	if (!buffer->page)
+		buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	if (count >= SIMPLE_ATTR_SIZE)
+		count = SIMPLE_ATTR_SIZE - 1;
+	error = copy_from_user(buffer->page,buf,count);
+	buffer->needs_read_fill = 1;
+	/* if buf is assumed to contain a string, terminate it by \0,
+	 * so e.g. sscanf() can scan the string easily */
+	buffer->page[count] = 0;
+	return error ? -EFAULT : count;
+}
+
+
+/**
+ *	flush_write_buffer - push buffer to config_item.
+ *	@dentry:	dentry to the attribute
+ *	@buffer:	data buffer for file.
+ *	@count:		number of bytes
+ *
+ *	Get the correct pointers for the config_item and the attribute we're
+ *	dealing with, then call the store() method for the attribute,
+ *	passing the buffer that we acquired in fill_write_buffer().
+ */
+
+static int
+flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+
+	return ops->store_attribute(item,attr,buffer->page,count);
+}
+
+
+/**
+ *	configfs_write_file - write an attribute.
+ *	@file:	file pointer
+ *	@buf:	data to write
+ *	@count:	number of bytes
+ *	@ppos:	starting offset
+ *
+ *	Similar to configfs_read_file(), though working in the opposite direction.
+ *	We allocate and fill the data from the user in fill_write_buffer(),
+ *	then push it to the config_item in flush_write_buffer().
+ *	There is no easy way for us to know if userspace is only doing a partial
+ *	write, so we don't support them. We expect the entire buffer to come
+ *	on the first write.
+ *	Hint: if you're writing a value, first read the file, modify only the
+ *	the value you're changing, then write entire buffer back.
+ */
+
+static ssize_t
+configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+	ssize_t len;
+
+	mutex_lock(&buffer->mutex);
+	len = fill_write_buffer(buffer, buf, count);
+	if (len > 0)
+		len = flush_write_buffer(file->f_path.dentry, buffer, len);
+	if (len > 0)
+		*ppos += len;
+	mutex_unlock(&buffer->mutex);
+	return len;
+}
+
+static int check_perm(struct inode * inode, struct file * file)
+{
+	struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(file->f_path.dentry);
+	struct configfs_buffer * buffer;
+	struct configfs_item_operations * ops = NULL;
+	int error = 0;
+
+	if (!item || !attr)
+		goto Einval;
+
+	/* Grab the module reference for this attribute if we have one */
+	if (!try_module_get(attr->ca_owner)) {
+		error = -ENODEV;
+		goto Done;
+	}
+
+	if (item->ci_type)
+		ops = item->ci_type->ct_item_ops;
+	else
+		goto Eaccess;
+
+	/* File needs write support.
+	 * The inode's perms must say it's ok,
+	 * and we must have a store method.
+	 */
+	if (file->f_mode & FMODE_WRITE) {
+
+		if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+			goto Eaccess;
+
+	}
+
+	/* File needs read support.
+	 * The inode's perms must say it's ok, and we there
+	 * must be a show method for it.
+	 */
+	if (file->f_mode & FMODE_READ) {
+		if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+			goto Eaccess;
+	}
+
+	/* No error? Great, allocate a buffer for the file, and store it
+	 * it in file->private_data for easy access.
+	 */
+	buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
+	if (!buffer) {
+		error = -ENOMEM;
+		goto Enomem;
+	}
+	mutex_init(&buffer->mutex);
+	buffer->needs_read_fill = 1;
+	buffer->ops = ops;
+	file->private_data = buffer;
+	goto Done;
+
+ Einval:
+	error = -EINVAL;
+	goto Done;
+ Eaccess:
+	error = -EACCES;
+ Enomem:
+	module_put(attr->ca_owner);
+ Done:
+	if (error && item)
+		config_item_put(item);
+	return error;
+}
+
+static int configfs_open_file(struct inode * inode, struct file * filp)
+{
+	return check_perm(inode,filp);
+}
+
+static int configfs_release(struct inode * inode, struct file * filp)
+{
+	struct config_item * item = to_item(filp->f_path.dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(filp->f_path.dentry);
+	struct module * owner = attr->ca_owner;
+	struct configfs_buffer * buffer = filp->private_data;
+
+	if (item)
+		config_item_put(item);
+	/* After this point, attr should not be accessed. */
+	module_put(owner);
+
+	if (buffer) {
+		if (buffer->page)
+			free_page((unsigned long)buffer->page);
+		mutex_destroy(&buffer->mutex);
+		kfree(buffer);
+	}
+	return 0;
+}
+
+const struct file_operations configfs_file_operations = {
+	.read		= configfs_read_file,
+	.write		= configfs_write_file,
+	.llseek		= generic_file_llseek,
+	.open		= configfs_open_file,
+	.release	= configfs_release,
+};
+
+/**
+ *	configfs_create_file - create an attribute file for an item.
+ *	@item:	item we're creating for.
+ *	@attr:	atrribute descriptor.
+ */
+
+int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
+{
+	struct dentry *dir = item->ci_dentry;
+	struct configfs_dirent *parent_sd = dir->d_fsdata;
+	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
+	int error = 0;
+
+	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL);
+	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
+				     CONFIGFS_ITEM_ATTR);
+	mutex_unlock(&d_inode(dir)->i_mutex);
+
+	return error;
+}
+
diff --git a/kernel/fs/configfs/inode.c b/kernel/fs/configfs/inode.c
new file mode 100644
index 000000000..8d89f5fd0
--- /dev/null
+++ b/kernel/fs/configfs/inode.c
@@ -0,0 +1,272 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c - basic inode and dentry operations.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see Documentation/filesystems/configfs/configfs.txt for more
+ * information.
+ */
+
+#undef DEBUG
+
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/sched.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
+#endif
+
+static const struct address_space_operations configfs_aops = {
+	.readpage	= simple_readpage,
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
+};
+
+static const struct inode_operations configfs_inode_operations ={
+	.setattr	= configfs_setattr,
+};
+
+int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
+{
+	struct inode * inode = d_inode(dentry);
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	struct iattr * sd_iattr;
+	unsigned int ia_valid = iattr->ia_valid;
+	int error;
+
+	if (!sd)
+		return -EINVAL;
+
+	sd_iattr = sd->s_iattr;
+	if (!sd_iattr) {
+		/* setting attributes for the first time, allocate now */
+		sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
+		if (!sd_iattr)
+			return -ENOMEM;
+		/* assign default attributes */
+		sd_iattr->ia_mode = sd->s_mode;
+		sd_iattr->ia_uid = GLOBAL_ROOT_UID;
+		sd_iattr->ia_gid = GLOBAL_ROOT_GID;
+		sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
+		sd->s_iattr = sd_iattr;
+	}
+	/* attributes were changed atleast once in past */
+
+	error = simple_setattr(dentry, iattr);
+	if (error)
+		return error;
+
+	if (ia_valid & ATTR_UID)
+		sd_iattr->ia_uid = iattr->ia_uid;
+	if (ia_valid & ATTR_GID)
+		sd_iattr->ia_gid = iattr->ia_gid;
+	if (ia_valid & ATTR_ATIME)
+		sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MTIME)
+		sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_CTIME)
+		sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = iattr->ia_mode;
+
+		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+			mode &= ~S_ISGID;
+		sd_iattr->ia_mode = sd->s_mode = mode;
+	}
+
+	return error;
+}
+
+static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
+{
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+}
+
+static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
+{
+	inode->i_mode = iattr->ia_mode;
+	inode->i_uid = iattr->ia_uid;
+	inode->i_gid = iattr->ia_gid;
+	inode->i_atime = iattr->ia_atime;
+	inode->i_mtime = iattr->ia_mtime;
+	inode->i_ctime = iattr->ia_ctime;
+}
+
+struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
+				 struct super_block *s)
+{
+	struct inode * inode = new_inode(s);
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_mapping->a_ops = &configfs_aops;
+		inode->i_op = &configfs_inode_operations;
+
+		if (sd->s_iattr) {
+			/* sysfs_dirent has non-default attributes
+			 * get them for the new inode from persistent copy
+			 * in sysfs_dirent
+			 */
+			set_inode_attr(inode, sd->s_iattr);
+		} else
+			set_default_inode_attr(inode, mode);
+	}
+	return inode;
+}
+
+#ifdef CONFIG_LOCKDEP
+
+static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
+					  struct inode *inode)
+{
+	int depth = sd->s_depth;
+
+	if (depth > 0) {
+		if (depth <= ARRAY_SIZE(default_group_class)) {
+			lockdep_set_class(&inode->i_mutex,
+					  &default_group_class[depth - 1]);
+		} else {
+			/*
+			 * In practice the maximum level of locking depth is
+			 * already reached. Just inform about possible reasons.
+			 */
+			pr_info("Too many levels of inodes for the locking correctness validator.\n");
+			pr_info("Spurious warnings may appear.\n");
+		}
+	}
+}
+
+#else /* CONFIG_LOCKDEP */
+
+static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
+					  struct inode *inode)
+{
+}
+
+#endif /* CONFIG_LOCKDEP */
+
+int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct inode *))
+{
+	int error = 0;
+	struct inode *inode = NULL;
+	struct configfs_dirent *sd;
+	struct inode *p_inode;
+
+	if (!dentry)
+		return -ENOENT;
+
+	if (d_really_is_positive(dentry))
+		return -EEXIST;
+
+	sd = dentry->d_fsdata;
+	inode = configfs_new_inode(mode, sd, dentry->d_sb);
+	if (!inode)
+		return -ENOMEM;
+
+	p_inode = d_inode(dentry->d_parent);
+	p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+	configfs_set_inode_lock_class(sd, inode);
+
+	init(inode);
+	d_instantiate(dentry, inode);
+	if (S_ISDIR(mode) || S_ISLNK(mode))
+		dget(dentry);  /* pin link and directory dentries in core */
+	return error;
+}
+
+/*
+ * Get the name for corresponding element represented by the given configfs_dirent
+ */
+const unsigned char * configfs_get_name(struct configfs_dirent *sd)
+{
+	struct configfs_attribute *attr;
+
+	BUG_ON(!sd || !sd->s_element);
+
+	/* These always have a dentry, so use that */
+	if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
+		return sd->s_dentry->d_name.name;
+
+	if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+		attr = sd->s_element;
+		return attr->ca_name;
+	}
+	return NULL;
+}
+
+
+/*
+ * Unhashes the dentry corresponding to given configfs_dirent
+ * Called with parent inode's i_mutex held.
+ */
+void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
+{
+	struct dentry * dentry = sd->s_dentry;
+
+	if (dentry) {
+		spin_lock(&dentry->d_lock);
+		if (!d_unhashed(dentry) && d_really_is_positive(dentry)) {
+			dget_dlock(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dentry->d_lock);
+			simple_unlink(d_inode(parent), dentry);
+		} else
+			spin_unlock(&dentry->d_lock);
+	}
+}
+
+void configfs_hash_and_remove(struct dentry * dir, const char * name)
+{
+	struct configfs_dirent * sd;
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+
+	if (d_really_is_negative(dir))
+		/* no inode means this hasn't been made visible yet */
+		return;
+
+	mutex_lock(&d_inode(dir)->i_mutex);
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element)
+			continue;
+		if (!strcmp(configfs_get_name(sd), name)) {
+			spin_lock(&configfs_dirent_lock);
+			list_del_init(&sd->s_sibling);
+			spin_unlock(&configfs_dirent_lock);
+			configfs_drop_dentry(sd, dir);
+			configfs_put(sd);
+			break;
+		}
+	}
+	mutex_unlock(&d_inode(dir)->i_mutex);
+}
diff --git a/kernel/fs/configfs/item.c b/kernel/fs/configfs/item.c
new file mode 100644
index 000000000..e65f9ffbb
--- /dev/null
+++ b/kernel/fs/configfs/item.c
@@ -0,0 +1,214 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * item.c - library routines for handling generic config items
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on kobject:
+ *	kobject is Copyright (c) 2002-2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
+ * critical information about using the config_item interface.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+static inline struct config_item *to_item(struct list_head *entry)
+{
+	return container_of(entry, struct config_item, ci_entry);
+}
+
+/* Evil kernel */
+static void config_item_release(struct kref *kref);
+
+/**
+ *	config_item_init - initialize item.
+ *	@item:	item in question.
+ */
+void config_item_init(struct config_item *item)
+{
+	kref_init(&item->ci_kref);
+	INIT_LIST_HEAD(&item->ci_entry);
+}
+EXPORT_SYMBOL(config_item_init);
+
+/**
+ *	config_item_set_name - Set the name of an item
+ *	@item:	item.
+ *	@fmt:  The vsnprintf()'s format string.
+ *
+ *	If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
+ *	dynamically allocated string that @item->ci_name points to.
+ *	Otherwise, use the static @item->ci_namebuf array.
+ */
+int config_item_set_name(struct config_item *item, const char *fmt, ...)
+{
+	int error = 0;
+	int limit = CONFIGFS_ITEM_NAME_LEN;
+	int need;
+	va_list args;
+	char *name;
+
+	/*
+	 * First, try the static array
+	 */
+	va_start(args, fmt);
+	need = vsnprintf(item->ci_namebuf, limit, fmt, args);
+	va_end(args);
+	if (need < limit)
+		name = item->ci_namebuf;
+	else {
+		/*
+		 * Need more space? Allocate it and try again
+		 */
+		limit = need + 1;
+		name = kmalloc(limit, GFP_KERNEL);
+		if (!name) {
+			error = -ENOMEM;
+			goto Done;
+		}
+		va_start(args, fmt);
+		need = vsnprintf(name, limit, fmt, args);
+		va_end(args);
+
+		/* Still? Give up. */
+		if (need >= limit) {
+			kfree(name);
+			error = -EFAULT;
+			goto Done;
+		}
+	}
+
+	/* Free the old name, if necessary. */
+	if (item->ci_name && item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+
+	/* Now, set the new name */
+	item->ci_name = name;
+ Done:
+	return error;
+}
+EXPORT_SYMBOL(config_item_set_name);
+
+void config_item_init_type_name(struct config_item *item,
+				const char *name,
+				struct config_item_type *type)
+{
+	config_item_set_name(item, name);
+	item->ci_type = type;
+	config_item_init(item);
+}
+EXPORT_SYMBOL(config_item_init_type_name);
+
+void config_group_init_type_name(struct config_group *group, const char *name,
+			 struct config_item_type *type)
+{
+	config_item_set_name(&group->cg_item, name);
+	group->cg_item.ci_type = type;
+	config_group_init(group);
+}
+EXPORT_SYMBOL(config_group_init_type_name);
+
+struct config_item *config_item_get(struct config_item *item)
+{
+	if (item)
+		kref_get(&item->ci_kref);
+	return item;
+}
+EXPORT_SYMBOL(config_item_get);
+
+static void config_item_cleanup(struct config_item *item)
+{
+	struct config_item_type *t = item->ci_type;
+	struct config_group *s = item->ci_group;
+	struct config_item *parent = item->ci_parent;
+
+	pr_debug("config_item %s: cleaning up\n", config_item_name(item));
+	if (item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+	item->ci_name = NULL;
+	if (t && t->ct_item_ops && t->ct_item_ops->release)
+		t->ct_item_ops->release(item);
+	if (s)
+		config_group_put(s);
+	if (parent)
+		config_item_put(parent);
+}
+
+static void config_item_release(struct kref *kref)
+{
+	config_item_cleanup(container_of(kref, struct config_item, ci_kref));
+}
+
+/**
+ *	config_item_put - decrement refcount for item.
+ *	@item:	item.
+ *
+ *	Decrement the refcount, and if 0, call config_item_cleanup().
+ */
+void config_item_put(struct config_item *item)
+{
+	if (item)
+		kref_put(&item->ci_kref, config_item_release);
+}
+EXPORT_SYMBOL(config_item_put);
+
+/**
+ *	config_group_init - initialize a group for use
+ *	@group:	config_group
+ */
+void config_group_init(struct config_group *group)
+{
+	config_item_init(&group->cg_item);
+	INIT_LIST_HEAD(&group->cg_children);
+}
+EXPORT_SYMBOL(config_group_init);
+
+/**
+ *	config_group_find_item - search for item in group.
+ *	@group:	group we're looking in.
+ *	@name:	item's name.
+ *
+ *	Iterate over @group->cg_list, looking for a matching config_item.
+ *	If matching item is found take a reference and return the item.
+ *	Caller must have locked group via @group->cg_subsys->su_mtx.
+ */
+struct config_item *config_group_find_item(struct config_group *group,
+					   const char *name)
+{
+	struct list_head *entry;
+	struct config_item *ret = NULL;
+
+	list_for_each(entry, &group->cg_children) {
+		struct config_item *item = to_item(entry);
+		if (config_item_name(item) &&
+		    !strcmp(config_item_name(item), name)) {
+			ret = config_item_get(item);
+			break;
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL(config_group_find_item);
diff --git a/kernel/fs/configfs/mount.c b/kernel/fs/configfs/mount.c
new file mode 100644
index 000000000..a8f3b589a
--- /dev/null
+++ b/kernel/fs/configfs/mount.c
@@ -0,0 +1,175 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mount.c - operations for initializing and mounting configfs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Random magic number */
+#define CONFIGFS_MAGIC 0x62656570
+
+static struct vfsmount *configfs_mount = NULL;
+struct kmem_cache *configfs_dir_cachep;
+static int configfs_mnt_count = 0;
+
+static const struct super_operations configfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct config_group configfs_root_group = {
+	.cg_item = {
+		.ci_namebuf	= "root",
+		.ci_name	= configfs_root_group.cg_item.ci_namebuf,
+	},
+};
+
+int configfs_is_root(struct config_item *item)
+{
+	return item == &configfs_root_group.cg_item;
+}
+
+static struct configfs_dirent configfs_root = {
+	.s_sibling	= LIST_HEAD_INIT(configfs_root.s_sibling),
+	.s_children	= LIST_HEAD_INIT(configfs_root.s_children),
+	.s_element	= &configfs_root_group.cg_item,
+	.s_type		= CONFIGFS_ROOT,
+	.s_iattr	= NULL,
+};
+
+static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CONFIGFS_MAGIC;
+	sb->s_op = &configfs_ops;
+	sb->s_time_gran = 1;
+
+	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
+				   &configfs_root, sb);
+	if (inode) {
+		inode->i_op = &configfs_root_inode_operations;
+		inode->i_fop = &configfs_dir_operations;
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inc_nlink(inode);
+	} else {
+		pr_debug("could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	root = d_make_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n",__func__);
+		return -ENOMEM;
+	}
+	config_group_init(&configfs_root_group);
+	configfs_root_group.cg_item.ci_dentry = root;
+	root->d_fsdata = &configfs_root;
+	sb->s_root = root;
+	sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
+	return 0;
+}
+
+static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_single(fs_type, flags, data, configfs_fill_super);
+}
+
+static struct file_system_type configfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "configfs",
+	.mount		= configfs_do_mount,
+	.kill_sb	= kill_litter_super,
+};
+MODULE_ALIAS_FS("configfs");
+
+struct dentry *configfs_pin_fs(void)
+{
+	int err = simple_pin_fs(&configfs_fs_type, &configfs_mount,
+			     &configfs_mnt_count);
+	return err ? ERR_PTR(err) : configfs_mount->mnt_root;
+}
+
+void configfs_release_fs(void)
+{
+	simple_release_fs(&configfs_mount, &configfs_mnt_count);
+}
+
+
+static int __init configfs_init(void)
+{
+	int err = -ENOMEM;
+
+	configfs_dir_cachep = kmem_cache_create("configfs_dir_cache",
+						sizeof(struct configfs_dirent),
+						0, 0, NULL);
+	if (!configfs_dir_cachep)
+		goto out;
+
+	err = sysfs_create_mount_point(kernel_kobj, "config");
+	if (err)
+		goto out2;
+
+	err = register_filesystem(&configfs_fs_type);
+	if (err)
+		goto out3;
+
+	return 0;
+out3:
+	pr_err("Unable to register filesystem!\n");
+	sysfs_remove_mount_point(kernel_kobj, "config");
+out2:
+	kmem_cache_destroy(configfs_dir_cachep);
+	configfs_dir_cachep = NULL;
+out:
+	return err;
+}
+
+static void __exit configfs_exit(void)
+{
+	unregister_filesystem(&configfs_fs_type);
+	sysfs_remove_mount_point(kernel_kobj, "config");
+	kmem_cache_destroy(configfs_dir_cachep);
+	configfs_dir_cachep = NULL;
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.0.2");
+MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
+
+core_initcall(configfs_init);
+module_exit(configfs_exit);
diff --git a/kernel/fs/configfs/symlink.c b/kernel/fs/configfs/symlink.c
new file mode 100644
index 000000000..cc9f2546e
--- /dev/null
+++ b/kernel/fs/configfs/symlink.c
@@ -0,0 +1,314 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.c - operations for configfs symlinks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Protects attachments of new symlinks */
+DEFINE_MUTEX(configfs_symlink_mutex);
+
+static int item_depth(struct config_item * item)
+{
+	struct config_item * p = item;
+	int depth = 0;
+	do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
+	return depth;
+}
+
+static int item_path_length(struct config_item * item)
+{
+	struct config_item * p = item;
+	int length = 1;
+	do {
+		length += strlen(config_item_name(p)) + 1;
+		p = p->ci_parent;
+	} while (p && !configfs_is_root(p));
+	return length;
+}
+
+static void fill_item_path(struct config_item * item, char * buffer, int length)
+{
+	struct config_item * p;
+
+	--length;
+	for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
+		int cur = strlen(config_item_name(p));
+
+		/* back up enough to print this bus id with '/' */
+		length -= cur;
+		strncpy(buffer + length,config_item_name(p),cur);
+		*(buffer + --length) = '/';
+	}
+}
+
+static int create_link(struct config_item *parent_item,
+		       struct config_item *item,
+		       struct dentry *dentry)
+{
+	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	int ret;
+
+	ret = -ENOENT;
+	if (!configfs_dirent_is_ready(target_sd))
+		goto out;
+	ret = -ENOMEM;
+	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
+	if (sl) {
+		sl->sl_target = config_item_get(item);
+		spin_lock(&configfs_dirent_lock);
+		if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
+			spin_unlock(&configfs_dirent_lock);
+			config_item_put(item);
+			kfree(sl);
+			return -ENOENT;
+		}
+		list_add(&sl->sl_list, &target_sd->s_links);
+		spin_unlock(&configfs_dirent_lock);
+		ret = configfs_create_link(sl, parent_item->ci_dentry,
+					   dentry);
+		if (ret) {
+			spin_lock(&configfs_dirent_lock);
+			list_del_init(&sl->sl_list);
+			spin_unlock(&configfs_dirent_lock);
+			config_item_put(item);
+			kfree(sl);
+		}
+	}
+
+out:
+	return ret;
+}
+
+
+static int get_target(const char *symname, struct path *path,
+		      struct config_item **target, struct super_block *sb)
+{
+	int ret;
+
+	ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path);
+	if (!ret) {
+		if (path->dentry->d_sb == sb) {
+			*target = configfs_get_config_item(path->dentry);
+			if (!*target) {
+				ret = -ENOENT;
+				path_put(path);
+			}
+		} else {
+			ret = -EPERM;
+			path_put(path);
+		}
+	}
+
+	return ret;
+}
+
+
+int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int ret;
+	struct path path;
+	struct configfs_dirent *sd;
+	struct config_item *parent_item;
+	struct config_item *target_item = NULL;
+	struct config_item_type *type;
+
+	sd = dentry->d_parent->d_fsdata;
+	/*
+	 * Fake invisibility if dir belongs to a group/default groups hierarchy
+	 * being attached
+	 */
+	ret = -ENOENT;
+	if (!configfs_dirent_is_ready(sd))
+		goto out;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	ret = -EPERM;
+	if (!type || !type->ct_item_ops ||
+	    !type->ct_item_ops->allow_link)
+		goto out_put;
+
+	ret = get_target(symname, &path, &target_item, dentry->d_sb);
+	if (ret)
+		goto out_put;
+
+	ret = type->ct_item_ops->allow_link(parent_item, target_item);
+	if (!ret) {
+		mutex_lock(&configfs_symlink_mutex);
+		ret = create_link(parent_item, target_item, dentry);
+		mutex_unlock(&configfs_symlink_mutex);
+		if (ret && type->ct_item_ops->drop_link)
+			type->ct_item_ops->drop_link(parent_item,
+						     target_item);
+	}
+
+	config_item_put(target_item);
+	path_put(&path);
+
+out_put:
+	config_item_put(parent_item);
+
+out:
+	return ret;
+}
+
+int configfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	struct config_item *parent_item;
+	struct config_item_type *type;
+	int ret;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
+		goto out;
+
+	sl = sd->s_element;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	spin_lock(&configfs_dirent_lock);
+	list_del_init(&sd->s_sibling);
+	spin_unlock(&configfs_dirent_lock);
+	configfs_drop_dentry(sd, dentry->d_parent);
+	dput(dentry);
+	configfs_put(sd);
+
+	/*
+	 * drop_link() must be called before
+	 * list_del_init(&sl->sl_list), so that the order of
+	 * drop_link(this, target) and drop_item(target) is preserved.
+	 */
+	if (type && type->ct_item_ops &&
+	    type->ct_item_ops->drop_link)
+		type->ct_item_ops->drop_link(parent_item,
+					       sl->sl_target);
+
+	spin_lock(&configfs_dirent_lock);
+	list_del_init(&sl->sl_list);
+	spin_unlock(&configfs_dirent_lock);
+
+	/* Put reference from create_link() */
+	config_item_put(sl->sl_target);
+	kfree(sl);
+
+	config_item_put(parent_item);
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
+static int configfs_get_target_path(struct config_item * item, struct config_item * target,
+				   char *path)
+{
+	char * s;
+	int depth, size;
+
+	depth = item_depth(item);
+	size = item_path_length(target) + depth * 3 - 1;
+	if (size > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	pr_debug("%s: depth = %d, size = %d\n", __func__, depth, size);
+
+	for (s = path; depth--; s += 3)
+		strcpy(s,"../");
+
+	fill_item_path(target, path, size);
+	pr_debug("%s: path = '%s'\n", __func__, path);
+
+	return 0;
+}
+
+static int configfs_getlink(struct dentry *dentry, char * path)
+{
+	struct config_item *item, *target_item;
+	int error = 0;
+
+	item = configfs_get_config_item(dentry->d_parent);
+	if (!item)
+		return -EINVAL;
+
+	target_item = configfs_get_config_item(dentry);
+	if (!target_item) {
+		config_item_put(item);
+		return -EINVAL;
+	}
+
+	down_read(&configfs_rename_sem);
+	error = configfs_get_target_path(item, target_item, path);
+	up_read(&configfs_rename_sem);
+
+	config_item_put(item);
+	config_item_put(target_item);
+	return error;
+
+}
+
+static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int error = -ENOMEM;
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+	if (page) {
+		error = configfs_getlink(dentry, (char *)page);
+		if (!error) {
+			nd_set_link(nd, (char *)page);
+			return (void *)page;
+		}
+	}
+
+	nd_set_link(nd, ERR_PTR(error));
+	return NULL;
+}
+
+static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *cookie)
+{
+	if (cookie) {
+		unsigned long page = (unsigned long)cookie;
+		free_page(page);
+	}
+}
+
+const struct inode_operations configfs_symlink_inode_operations = {
+	.follow_link = configfs_follow_link,
+	.readlink = generic_readlink,
+	.put_link = configfs_put_link,
+	.setattr = configfs_setattr,
+};
+
diff --git a/kernel/fs/coredump.c b/kernel/fs/coredump.c
new file mode 100644
index 000000000..bbbe139ab
--- /dev/null
+++ b/kernel/fs/coredump.c
@@ -0,0 +1,751 @@
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/swap.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/perf_event.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/key.h>
+#include <linux/personality.h>
+#include <linux/binfmts.h>
+#include <linux/coredump.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/tsacct_kern.h>
+#include <linux/cn_proc.h>
+#include <linux/audit.h>
+#include <linux/tracehook.h>
+#include <linux/kmod.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
+#include <linux/compat.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/tlb.h>
+#include <asm/exec.h>
+
+#include <trace/events/task.h>
+#include "internal.h"
+
+#include <trace/events/sched.h>
+
+int core_uses_pid;
+unsigned int core_pipe_limit;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+static int core_name_size = CORENAME_MAX_SIZE;
+
+struct core_name {
+	char *corename;
+	int used, size;
+};
+
+/* The maximal length of core_pattern is also specified in sysctl.c */
+
+static int expand_corename(struct core_name *cn, int size)
+{
+	char *corename = krealloc(cn->corename, size, GFP_KERNEL);
+
+	if (!corename)
+		return -ENOMEM;
+
+	if (size > core_name_size) /* racy but harmless */
+		core_name_size = size;
+
+	cn->size = ksize(corename);
+	cn->corename = corename;
+	return 0;
+}
+
+static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+{
+	int free, need;
+	va_list arg_copy;
+
+again:
+	free = cn->size - cn->used;
+
+	va_copy(arg_copy, arg);
+	need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
+	va_end(arg_copy);
+
+	if (need < free) {
+		cn->used += need;
+		return 0;
+	}
+
+	if (!expand_corename(cn, cn->size + need - free + 1))
+		goto again;
+
+	return -ENOMEM;
+}
+
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+	va_list arg;
+	int ret;
+
+	va_start(arg, fmt);
+	ret = cn_vprintf(cn, fmt, arg);
+	va_end(arg);
+
+	return ret;
+}
+
+static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+{
+	int cur = cn->used;
+	va_list arg;
+	int ret;
+
+	va_start(arg, fmt);
+	ret = cn_vprintf(cn, fmt, arg);
+	va_end(arg);
+
+	for (; cur < cn->used; ++cur) {
+		if (cn->corename[cur] == '/')
+			cn->corename[cur] = '!';
+	}
+	return ret;
+}
+
+static int cn_print_exe_file(struct core_name *cn)
+{
+	struct file *exe_file;
+	char *pathbuf, *path;
+	int ret;
+
+	exe_file = get_mm_exe_file(current->mm);
+	if (!exe_file)
+		return cn_esc_printf(cn, "%s (path unknown)", current->comm);
+
+	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+	if (!pathbuf) {
+		ret = -ENOMEM;
+		goto put_exe_file;
+	}
+
+	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+	if (IS_ERR(path)) {
+		ret = PTR_ERR(path);
+		goto free_buf;
+	}
+
+	ret = cn_esc_printf(cn, "%s", path);
+
+free_buf:
+	kfree(pathbuf);
+put_exe_file:
+	fput(exe_file);
+	return ret;
+}
+
+/* format_corename will inspect the pattern parameter, and output a
+ * name into corename, which must have space for at least
+ * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
+ */
+static int format_corename(struct core_name *cn, struct coredump_params *cprm)
+{
+	const struct cred *cred = current_cred();
+	const char *pat_ptr = core_pattern;
+	int ispipe = (*pat_ptr == '|');
+	int pid_in_pattern = 0;
+	int err = 0;
+
+	cn->used = 0;
+	cn->corename = NULL;
+	if (expand_corename(cn, core_name_size))
+		return -ENOMEM;
+	cn->corename[0] = '\0';
+
+	if (ispipe)
+		++pat_ptr;
+
+	/* Repeat as long as we have more pattern to process and more output
+	   space */
+	while (*pat_ptr) {
+		if (*pat_ptr != '%') {
+			err = cn_printf(cn, "%c", *pat_ptr++);
+		} else {
+			switch (*++pat_ptr) {
+			/* single % at the end, drop that */
+			case 0:
+				goto out;
+			/* Double percent, output one percent */
+			case '%':
+				err = cn_printf(cn, "%c", '%');
+				break;
+			/* pid */
+			case 'p':
+				pid_in_pattern = 1;
+				err = cn_printf(cn, "%d",
+					      task_tgid_vnr(current));
+				break;
+			/* global pid */
+			case 'P':
+				err = cn_printf(cn, "%d",
+					      task_tgid_nr(current));
+				break;
+			case 'i':
+				err = cn_printf(cn, "%d",
+					      task_pid_vnr(current));
+				break;
+			case 'I':
+				err = cn_printf(cn, "%d",
+					      task_pid_nr(current));
+				break;
+			/* uid */
+			case 'u':
+				err = cn_printf(cn, "%d", cred->uid);
+				break;
+			/* gid */
+			case 'g':
+				err = cn_printf(cn, "%d", cred->gid);
+				break;
+			case 'd':
+				err = cn_printf(cn, "%d",
+					__get_dumpable(cprm->mm_flags));
+				break;
+			/* signal that caused the coredump */
+			case 's':
+				err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
+				break;
+			/* UNIX time of coredump */
+			case 't': {
+				struct timeval tv;
+				do_gettimeofday(&tv);
+				err = cn_printf(cn, "%lu", tv.tv_sec);
+				break;
+			}
+			/* hostname */
+			case 'h':
+				down_read(&uts_sem);
+				err = cn_esc_printf(cn, "%s",
+					      utsname()->nodename);
+				up_read(&uts_sem);
+				break;
+			/* executable */
+			case 'e':
+				err = cn_esc_printf(cn, "%s", current->comm);
+				break;
+			case 'E':
+				err = cn_print_exe_file(cn);
+				break;
+			/* core limit size */
+			case 'c':
+				err = cn_printf(cn, "%lu",
+					      rlimit(RLIMIT_CORE));
+				break;
+			default:
+				break;
+			}
+			++pat_ptr;
+		}
+
+		if (err)
+			return err;
+	}
+
+out:
+	/* Backward compatibility with core_uses_pid:
+	 *
+	 * If core_pattern does not include a %p (as is the default)
+	 * and core_uses_pid is set, then .%pid will be appended to
+	 * the filename. Do not do this for piped commands. */
+	if (!ispipe && !pid_in_pattern && core_uses_pid) {
+		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
+		if (err)
+			return err;
+	}
+	return ispipe;
+}
+
+static int zap_process(struct task_struct *start, int exit_code)
+{
+	struct task_struct *t;
+	int nr = 0;
+
+	start->signal->group_exit_code = exit_code;
+	start->signal->group_stop_count = 0;
+
+	t = start;
+	do {
+		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
+		if (t != current && t->mm) {
+			sigaddset(&t->pending.signal, SIGKILL);
+			signal_wake_up(t, 1);
+			nr++;
+		}
+	} while_each_thread(start, t);
+
+	return nr;
+}
+
+static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+			struct core_state *core_state, int exit_code)
+{
+	struct task_struct *g, *p;
+	unsigned long flags;
+	int nr = -EAGAIN;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!signal_group_exit(tsk->signal)) {
+		mm->core_state = core_state;
+		nr = zap_process(tsk, exit_code);
+		tsk->signal->group_exit_task = tsk;
+		/* ignore all signals except SIGKILL, see prepare_signal() */
+		tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+	if (unlikely(nr < 0))
+		return nr;
+
+	tsk->flags |= PF_DUMPCORE;
+	if (atomic_read(&mm->mm_users) == nr + 1)
+		goto done;
+	/*
+	 * We should find and kill all tasks which use this mm, and we should
+	 * count them correctly into ->nr_threads. We don't take tasklist
+	 * lock, but this is safe wrt:
+	 *
+	 * fork:
+	 *	None of sub-threads can fork after zap_process(leader). All
+	 *	processes which were created before this point should be
+	 *	visible to zap_threads() because copy_process() adds the new
+	 *	process to the tail of init_task.tasks list, and lock/unlock
+	 *	of ->siglock provides a memory barrier.
+	 *
+	 * do_exit:
+	 *	The caller holds mm->mmap_sem. This means that the task which
+	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
+	 *	its ->mm.
+	 *
+	 * de_thread:
+	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
+	 *	we must see either old or new leader, this does not matter.
+	 *	However, it can change p->sighand, so lock_task_sighand(p)
+	 *	must be used. Since p->mm != NULL and we hold ->mmap_sem
+	 *	it can't fail.
+	 *
+	 *	Note also that "g" can be the old leader with ->mm == NULL
+	 *	and already unhashed and thus removed from ->thread_group.
+	 *	This is OK, __unhash_process()->list_del_rcu() does not
+	 *	clear the ->next pointer, we will find the new leader via
+	 *	next_thread().
+	 */
+	rcu_read_lock();
+	for_each_process(g) {
+		if (g == tsk->group_leader)
+			continue;
+		if (g->flags & PF_KTHREAD)
+			continue;
+		p = g;
+		do {
+			if (p->mm) {
+				if (unlikely(p->mm == mm)) {
+					lock_task_sighand(p, &flags);
+					nr += zap_process(p, exit_code);
+					p->signal->flags = SIGNAL_GROUP_EXIT;
+					unlock_task_sighand(p, &flags);
+				}
+				break;
+			}
+		} while_each_thread(g, p);
+	}
+	rcu_read_unlock();
+done:
+	atomic_set(&core_state->nr_threads, nr);
+	return nr;
+}
+
+static int coredump_wait(int exit_code, struct core_state *core_state)
+{
+	struct task_struct *tsk = current;
+	struct mm_struct *mm = tsk->mm;
+	int core_waiters = -EBUSY;
+
+	init_completion(&core_state->startup);
+	core_state->dumper.task = tsk;
+	core_state->dumper.next = NULL;
+
+	down_write(&mm->mmap_sem);
+	if (!mm->core_state)
+		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+	up_write(&mm->mmap_sem);
+
+	if (core_waiters > 0) {
+		struct core_thread *ptr;
+
+		wait_for_completion(&core_state->startup);
+		/*
+		 * Wait for all the threads to become inactive, so that
+		 * all the thread context (extended register state, like
+		 * fpu etc) gets copied to the memory.
+		 */
+		ptr = core_state->dumper.next;
+		while (ptr != NULL) {
+			wait_task_inactive(ptr->task, 0);
+			ptr = ptr->next;
+		}
+	}
+
+	return core_waiters;
+}
+
+static void coredump_finish(struct mm_struct *mm, bool core_dumped)
+{
+	struct core_thread *curr, *next;
+	struct task_struct *task;
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (core_dumped && !__fatal_signal_pending(current))
+		current->signal->group_exit_code |= 0x80;
+	current->signal->group_exit_task = NULL;
+	current->signal->flags = SIGNAL_GROUP_EXIT;
+	spin_unlock_irq(&current->sighand->siglock);
+
+	next = mm->core_state->dumper.next;
+	while ((curr = next) != NULL) {
+		next = curr->next;
+		task = curr->task;
+		/*
+		 * see exit_mm(), curr->task must not see
+		 * ->task == NULL before we read ->next.
+		 */
+		smp_mb();
+		curr->task = NULL;
+		wake_up_process(task);
+	}
+
+	mm->core_state = NULL;
+}
+
+static bool dump_interrupted(void)
+{
+	/*
+	 * SIGKILL or freezing() interrupt the coredumping. Perhaps we
+	 * can do try_to_freeze() and check __fatal_signal_pending(),
+	 * but then we need to teach dump_write() to restart and clear
+	 * TIF_SIGPENDING.
+	 */
+	return signal_pending(current);
+}
+
+static void wait_for_dump_helpers(struct file *file)
+{
+	struct pipe_inode_info *pipe = file->private_data;
+
+	pipe_lock(pipe);
+	pipe->readers++;
+	pipe->writers--;
+	wake_up_interruptible_sync(&pipe->wait);
+	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+	pipe_unlock(pipe);
+
+	/*
+	 * We actually want wait_event_freezable() but then we need
+	 * to clear TIF_SIGPENDING and improve dump_interrupted().
+	 */
+	wait_event_interruptible(pipe->wait, pipe->readers == 1);
+
+	pipe_lock(pipe);
+	pipe->readers--;
+	pipe->writers++;
+	pipe_unlock(pipe);
+}
+
+/*
+ * umh_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace.  Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process.  Returns 0 on success, or
+ * PTR_ERR on failure.
+ * Note that it also sets the core limit to 1.  This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+	struct file *files[2];
+	struct coredump_params *cp = (struct coredump_params *)info->data;
+	int err = create_pipe_files(files, 0);
+	if (err)
+		return err;
+
+	cp->file = files[1];
+
+	err = replace_fd(0, files[0], 0);
+	fput(files[0]);
+	/* and disallow core files too */
+	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+	return err;
+}
+
+void do_coredump(const siginfo_t *siginfo)
+{
+	struct core_state core_state;
+	struct core_name cn;
+	struct mm_struct *mm = current->mm;
+	struct linux_binfmt * binfmt;
+	const struct cred *old_cred;
+	struct cred *cred;
+	int retval = 0;
+	int flag = 0;
+	int ispipe;
+	struct files_struct *displaced;
+	bool need_nonrelative = false;
+	bool core_dumped = false;
+	static atomic_t core_dump_count = ATOMIC_INIT(0);
+	struct coredump_params cprm = {
+		.siginfo = siginfo,
+		.regs = signal_pt_regs(),
+		.limit = rlimit(RLIMIT_CORE),
+		/*
+		 * We must use the same mm->flags while dumping core to avoid
+		 * inconsistency of bit flags, since this flag is not protected
+		 * by any locks.
+		 */
+		.mm_flags = mm->flags,
+	};
+
+	audit_core_dumps(siginfo->si_signo);
+
+	binfmt = mm->binfmt;
+	if (!binfmt || !binfmt->core_dump)
+		goto fail;
+	if (!__get_dumpable(cprm.mm_flags))
+		goto fail;
+
+	cred = prepare_creds();
+	if (!cred)
+		goto fail;
+	/*
+	 * We cannot trust fsuid as being the "true" uid of the process
+	 * nor do we know its entire history. We only know it was tainted
+	 * so we dump it as root in mode 2, and only into a controlled
+	 * environment (pipe handler or fully qualified path).
+	 */
+	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
+		/* Setuid core dump mode */
+		flag = O_EXCL;		/* Stop rewrite attacks */
+		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
+		need_nonrelative = true;
+	}
+
+	retval = coredump_wait(siginfo->si_signo, &core_state);
+	if (retval < 0)
+		goto fail_creds;
+
+	old_cred = override_creds(cred);
+
+	ispipe = format_corename(&cn, &cprm);
+
+	if (ispipe) {
+		int dump_count;
+		char **helper_argv;
+		struct subprocess_info *sub_info;
+
+		if (ispipe < 0) {
+			printk(KERN_WARNING "format_corename failed\n");
+			printk(KERN_WARNING "Aborting core\n");
+			goto fail_unlock;
+		}
+
+		if (cprm.limit == 1) {
+			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
+			 *
+			 * Normally core limits are irrelevant to pipes, since
+			 * we're not writing to the file system, but we use
+			 * cprm.limit of 1 here as a special value, this is a
+			 * consistent way to catch recursive crashes.
+			 * We can still crash if the core_pattern binary sets
+			 * RLIM_CORE = !1, but it runs as root, and can do
+			 * lots of stupid things.
+			 *
+			 * Note that we use task_tgid_vnr here to grab the pid
+			 * of the process group leader.  That way we get the
+			 * right pid if a thread in a multi-threaded
+			 * core_pattern process dies.
+			 */
+			printk(KERN_WARNING
+				"Process %d(%s) has RLIMIT_CORE set to 1\n",
+				task_tgid_vnr(current), current->comm);
+			printk(KERN_WARNING "Aborting core\n");
+			goto fail_unlock;
+		}
+		cprm.limit = RLIM_INFINITY;
+
+		dump_count = atomic_inc_return(&core_dump_count);
+		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+			       task_tgid_vnr(current), current->comm);
+			printk(KERN_WARNING "Skipping core dump\n");
+			goto fail_dropcount;
+		}
+
+		helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
+		if (!helper_argv) {
+			printk(KERN_WARNING "%s failed to allocate memory\n",
+			       __func__);
+			goto fail_dropcount;
+		}
+
+		retval = -ENOMEM;
+		sub_info = call_usermodehelper_setup(helper_argv[0],
+						helper_argv, NULL, GFP_KERNEL,
+						umh_pipe_setup, NULL, &cprm);
+		if (sub_info)
+			retval = call_usermodehelper_exec(sub_info,
+							  UMH_WAIT_EXEC);
+
+		argv_free(helper_argv);
+		if (retval) {
+			printk(KERN_INFO "Core dump to |%s pipe failed\n",
+			       cn.corename);
+			goto close_fail;
+		}
+	} else {
+		struct inode *inode;
+
+		if (cprm.limit < binfmt->min_coredump)
+			goto fail_unlock;
+
+		if (need_nonrelative && cn.corename[0] != '/') {
+			printk(KERN_WARNING "Pid %d(%s) can only dump core "\
+				"to fully qualified path!\n",
+				task_tgid_vnr(current), current->comm);
+			printk(KERN_WARNING "Skipping core dump\n");
+			goto fail_unlock;
+		}
+
+		cprm.file = filp_open(cn.corename,
+				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+				 0600);
+		if (IS_ERR(cprm.file))
+			goto fail_unlock;
+
+		inode = file_inode(cprm.file);
+		if (inode->i_nlink > 1)
+			goto close_fail;
+		if (d_unhashed(cprm.file->f_path.dentry))
+			goto close_fail;
+		/*
+		 * AK: actually i see no reason to not allow this for named
+		 * pipes etc, but keep the previous behaviour for now.
+		 */
+		if (!S_ISREG(inode->i_mode))
+			goto close_fail;
+		/*
+		 * Dont allow local users get cute and trick others to coredump
+		 * into their pre-created files.
+		 */
+		if (!uid_eq(inode->i_uid, current_fsuid()))
+			goto close_fail;
+		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
+			goto close_fail;
+		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+			goto close_fail;
+	}
+
+	/* get us an unshared descriptor table; almost always a no-op */
+	retval = unshare_files(&displaced);
+	if (retval)
+		goto close_fail;
+	if (displaced)
+		put_files_struct(displaced);
+	if (!dump_interrupted()) {
+		file_start_write(cprm.file);
+		core_dumped = binfmt->core_dump(&cprm);
+		file_end_write(cprm.file);
+	}
+	if (ispipe && core_pipe_limit)
+		wait_for_dump_helpers(cprm.file);
+close_fail:
+	if (cprm.file)
+		filp_close(cprm.file, NULL);
+fail_dropcount:
+	if (ispipe)
+		atomic_dec(&core_dump_count);
+fail_unlock:
+	kfree(cn.corename);
+	coredump_finish(mm, core_dumped);
+	revert_creds(old_cred);
+fail_creds:
+	put_cred(cred);
+fail:
+	return;
+}
+
+/*
+ * Core dumping helper functions.  These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
+{
+	struct file *file = cprm->file;
+	loff_t pos = file->f_pos;
+	ssize_t n;
+	if (cprm->written + nr > cprm->limit)
+		return 0;
+	while (nr) {
+		if (dump_interrupted())
+			return 0;
+		n = __kernel_write(file, addr, nr, &pos);
+		if (n <= 0)
+			return 0;
+		file->f_pos = pos;
+		cprm->written += n;
+		nr -= n;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(dump_emit);
+
+int dump_skip(struct coredump_params *cprm, size_t nr)
+{
+	static char zeroes[PAGE_SIZE];
+	struct file *file = cprm->file;
+	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+		if (cprm->written + nr > cprm->limit)
+			return 0;
+		if (dump_interrupted() ||
+		    file->f_op->llseek(file, nr, SEEK_CUR) < 0)
+			return 0;
+		cprm->written += nr;
+		return 1;
+	} else {
+		while (nr > PAGE_SIZE) {
+			if (!dump_emit(cprm, zeroes, PAGE_SIZE))
+				return 0;
+			nr -= PAGE_SIZE;
+		}
+		return dump_emit(cprm, zeroes, nr);
+	}
+}
+EXPORT_SYMBOL(dump_skip);
+
+int dump_align(struct coredump_params *cprm, int align)
+{
+	unsigned mod = cprm->written & (align - 1);
+	if (align & (align - 1))
+		return 0;
+	return mod ? dump_skip(cprm, align - mod) : 1;
+}
+EXPORT_SYMBOL(dump_align);
diff --git a/kernel/fs/cramfs/Kconfig b/kernel/fs/cramfs/Kconfig
new file mode 100644
index 000000000..11b29d491
--- /dev/null
+++ b/kernel/fs/cramfs/Kconfig
@@ -0,0 +1,22 @@
+config CRAMFS
+	tristate "Compressed ROM file system support (cramfs) (OBSOLETE)"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for CramFs (Compressed ROM File
+	  System).  CramFs is designed to be a simple, small, and compressed
+	  file system for ROM based embedded systems.  CramFs is read-only,
+	  limited to 256MB file systems (with 16MB files), and doesn't support
+	  16/32 bits uid/gid, hard links and timestamps.
+
+	  See <file:Documentation/filesystems/cramfs.txt> and
+	  <file:fs/cramfs/README> for further information.
+
+	  To compile this as a module, choose M here: the module will be called
+	  cramfs.  Note that the root file system (the one containing the
+	  directory /) cannot be compiled as a module.
+
+	  This filesystem is obsoleted by SquashFS, which is much better
+	  in terms of performance and features.
+
+	  If unsure, say N.
diff --git a/kernel/fs/cramfs/Makefile b/kernel/fs/cramfs/Makefile
new file mode 100644
index 000000000..92ebb464a
--- /dev/null
+++ b/kernel/fs/cramfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux cramfs routines.
+#
+
+obj-$(CONFIG_CRAMFS) += cramfs.o
+
+cramfs-objs := inode.o uncompress.o
diff --git a/kernel/fs/cramfs/README b/kernel/fs/cramfs/README
new file mode 100644
index 000000000..445d1c2d7
--- /dev/null
+++ b/kernel/fs/cramfs/README
@@ -0,0 +1,168 @@
+Notes on Filesystem Layout
+--------------------------
+
+These notes describe what mkcramfs generates.  Kernel requirements are
+a bit looser, e.g. it doesn't care if the <file_data> items are
+swapped around (though it does care that directory entries (inodes) in
+a given directory are contiguous, as this is used by readdir).
+
+All data is currently in host-endian format; neither mkcramfs nor the
+kernel ever do swabbing.  (See section `Block Size' below.)
+
+<filesystem>:
+	<superblock>
+	<directory_structure>
+	<data>
+
+<superblock>: struct cramfs_super (see cramfs_fs.h).
+
+<directory_structure>:
+	For each file:
+		struct cramfs_inode (see cramfs_fs.h).
+		Filename.  Not generally null-terminated, but it is
+		 null-padded to a multiple of 4 bytes.
+
+The order of inode traversal is described as "width-first" (not to be
+confused with breadth-first); i.e. like depth-first but listing all of
+a directory's entries before recursing down its subdirectories: the
+same order as `ls -AUR' (but without the /^\..*:$/ directory header
+lines); put another way, the same order as `find -type d -exec
+ls -AU1 {} \;'.
+
+Beginning in 2.4.7, directory entries are sorted.  This optimization
+allows cramfs_lookup to return more quickly when a filename does not
+exist, speeds up user-space directory sorts, etc.
+
+<data>:
+	One <file_data> for each file that's either a symlink or a
+	 regular file of non-zero st_size.
+
+<file_data>:
+	nblocks * <block_pointer>
+	 (where nblocks = (st_size - 1) / blksize + 1)
+	nblocks * <block>
+	padding to multiple of 4 bytes
+
+The i'th <block_pointer> for a file stores the byte offset of the
+*end* of the i'th <block> (i.e. one past the last byte, which is the
+same as the start of the (i+1)'th <block> if there is one).  The first
+<block> immediately follows the last <block_pointer> for the file.
+<block_pointer>s are each 32 bits long.
+
+The order of <file_data>'s is a depth-first descent of the directory
+tree, i.e. the same order as `find -size +0 \( -type f -o -type l \)
+-print'.
+
+
+<block>: The i'th <block> is the output of zlib's compress function
+applied to the i'th blksize-sized chunk of the input data.
+(For the last <block> of the file, the input may of course be smaller.)
+Each <block> may be a different size.  (See <block_pointer> above.)
+<block>s are merely byte-aligned, not generally u32-aligned.
+
+
+Holes
+-----
+
+This kernel supports cramfs holes (i.e. [efficient representation of]
+blocks in uncompressed data consisting entirely of NUL bytes), but by
+default mkcramfs doesn't test for & create holes, since cramfs in
+kernels up to at least 2.3.39 didn't support holes.  Run mkcramfs
+with -z if you want it to create files that can have holes in them.
+
+
+Tools
+-----
+
+The cramfs user-space tools, including mkcramfs and cramfsck, are
+located at <http://sourceforge.net/projects/cramfs/>.
+
+
+Future Development
+==================
+
+Block Size
+----------
+
+(Block size in cramfs refers to the size of input data that is
+compressed at a time.  It's intended to be somewhere around
+PAGE_CACHE_SIZE for cramfs_readpage's convenience.)
+
+The superblock ought to indicate the block size that the fs was
+written for, since comments in <linux/pagemap.h> indicate that
+PAGE_CACHE_SIZE may grow in future (if I interpret the comment
+correctly).
+
+Currently, mkcramfs #define's PAGE_CACHE_SIZE as 4096 and uses that
+for blksize, whereas Linux-2.3.39 uses its PAGE_CACHE_SIZE, which in
+turn is defined as PAGE_SIZE (which can be as large as 32KB on arm).
+This discrepancy is a bug, though it's not clear which should be
+changed.
+
+One option is to change mkcramfs to take its PAGE_CACHE_SIZE from
+<asm/page.h>.  Personally I don't like this option, but it does
+require the least amount of change: just change `#define
+PAGE_CACHE_SIZE (4096)' to `#include <asm/page.h>'.  The disadvantage
+is that the generated cramfs cannot always be shared between different
+kernels, not even necessarily kernels of the same architecture if
+PAGE_CACHE_SIZE is subject to change between kernel versions
+(currently possible with arm and ia64).
+
+The remaining options try to make cramfs more sharable.
+
+One part of that is addressing endianness.  The two options here are
+`always use little-endian' (like ext2fs) or `writer chooses
+endianness; kernel adapts at runtime'.  Little-endian wins because of
+code simplicity and little CPU overhead even on big-endian machines.
+
+The cost of swabbing is changing the code to use the le32_to_cpu
+etc. macros as used by ext2fs.  We don't need to swab the compressed
+data, only the superblock, inodes and block pointers.
+
+
+The other part of making cramfs more sharable is choosing a block
+size.  The options are:
+
+  1. Always 4096 bytes.
+
+  2. Writer chooses blocksize; kernel adapts but rejects blocksize >
+     PAGE_CACHE_SIZE.
+
+  3. Writer chooses blocksize; kernel adapts even to blocksize >
+     PAGE_CACHE_SIZE.
+
+It's easy enough to change the kernel to use a smaller value than
+PAGE_CACHE_SIZE: just make cramfs_readpage read multiple blocks.
+
+The cost of option 1 is that kernels with a larger PAGE_CACHE_SIZE
+value don't get as good compression as they can.
+
+The cost of option 2 relative to option 1 is that the code uses
+variables instead of #define'd constants.  The gain is that people
+with kernels having larger PAGE_CACHE_SIZE can make use of that if
+they don't mind their cramfs being inaccessible to kernels with
+smaller PAGE_CACHE_SIZE values.
+
+Option 3 is easy to implement if we don't mind being CPU-inefficient:
+e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which
+must be no larger than 32KB) and discard what it doesn't need.
+Getting readpage to read into all the covered pages is harder.
+
+The main advantage of option 3 over 1, 2, is better compression.  The
+cost is greater complexity.  Probably not worth it, but I hope someone
+will disagree.  (If it is implemented, then I'll re-use that code in
+e2compr.)
+
+
+Another cost of 2 and 3 over 1 is making mkcramfs use a different
+block size, but that just means adding and parsing a -b option.
+
+
+Inode Size
+----------
+
+Given that cramfs will probably be used for CDs etc. as well as just
+silicon ROMs, it might make sense to expand the inode a little from
+its current 12 bytes.  Inodes other than the root inode are followed
+by filename, so the expansion doesn't even have to be a multiple of 4
+bytes.
diff --git a/kernel/fs/cramfs/inode.c b/kernel/fs/cramfs/inode.c
new file mode 100644
index 000000000..355c522f3
--- /dev/null
+++ b/kernel/fs/cramfs/inode.c
@@ -0,0 +1,611 @@
+/*
+ * Compressed rom filesystem for Linux.
+ *
+ * Copyright (C) 1999 Linus Torvalds.
+ *
+ * This file is released under the GPL.
+ */
+
+/*
+ * These are the VFS interfaces to the compressed rom filesystem.
+ * The actual compression is based on zlib, see the other files.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/vfs.h>
+#include <linux/mutex.h>
+#include <uapi/linux/cramfs_fs.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/*
+ * cramfs super-block data in memory
+ */
+struct cramfs_sb_info {
+	unsigned long magic;
+	unsigned long size;
+	unsigned long blocks;
+	unsigned long files;
+	unsigned long flags;
+};
+
+static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static const struct super_operations cramfs_ops;
+static const struct inode_operations cramfs_dir_inode_operations;
+static const struct file_operations cramfs_directory_operations;
+static const struct address_space_operations cramfs_aops;
+
+static DEFINE_MUTEX(read_mutex);
+
+
+/* These macros may change in future, to provide better st_ino semantics. */
+#define OFFSET(x)	((x)->i_ino)
+
+static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset)
+{
+	if (!cino->offset)
+		return offset + 1;
+	if (!cino->size)
+		return offset + 1;
+
+	/*
+	 * The file mode test fixes buggy mkcramfs implementations where
+	 * cramfs_inode->offset is set to a non zero value for entries
+	 * which did not contain data, like devices node and fifos.
+	 */
+	switch (cino->mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		return cino->offset << 2;
+	default:
+		break;
+	}
+	return offset + 1;
+}
+
+static struct inode *get_cramfs_inode(struct super_block *sb,
+	const struct cramfs_inode *cramfs_inode, unsigned int offset)
+{
+	struct inode *inode;
+	static struct timespec zerotime;
+
+	inode = iget_locked(sb, cramino(cramfs_inode, offset));
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	switch (cramfs_inode->mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_fop = &generic_ro_fops;
+		inode->i_data.a_ops = &cramfs_aops;
+		break;
+	case S_IFDIR:
+		inode->i_op = &cramfs_dir_inode_operations;
+		inode->i_fop = &cramfs_directory_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_data.a_ops = &cramfs_aops;
+		break;
+	default:
+		init_special_inode(inode, cramfs_inode->mode,
+				old_decode_dev(cramfs_inode->size));
+	}
+
+	inode->i_mode = cramfs_inode->mode;
+	i_uid_write(inode, cramfs_inode->uid);
+	i_gid_write(inode, cramfs_inode->gid);
+
+	/* if the lower 2 bits are zero, the inode contains data */
+	if (!(inode->i_ino & 3)) {
+		inode->i_size = cramfs_inode->size;
+		inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+	}
+
+	/* Struct copy intentional */
+	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+	/* inode->i_nlink is left 1 - arguably wrong for directories,
+	   but it's the best we can do without reading the directory
+	   contents.  1 yields the right result in GNU find, even
+	   without -noleaf option. */
+
+	unlock_new_inode(inode);
+
+	return inode;
+}
+
+/*
+ * We have our own block cache: don't fill up the buffer cache
+ * with the rom-image, because the way the filesystem is set
+ * up the accesses should be fairly regular and cached in the
+ * page cache and dentry tree anyway..
+ *
+ * This also acts as a way to guarantee contiguous areas of up to
+ * BLKS_PER_BUF*PAGE_CACHE_SIZE, so that the caller doesn't need to
+ * worry about end-of-buffer issues even when decompressing a full
+ * page cache.
+ */
+#define READ_BUFFERS (2)
+/* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */
+#define NEXT_BUFFER(_ix) ((_ix) ^ 1)
+
+/*
+ * BLKS_PER_BUF_SHIFT should be at least 2 to allow for "compressed"
+ * data that takes up more space than the original and with unlucky
+ * alignment.
+ */
+#define BLKS_PER_BUF_SHIFT	(2)
+#define BLKS_PER_BUF		(1 << BLKS_PER_BUF_SHIFT)
+#define BUFFER_SIZE		(BLKS_PER_BUF*PAGE_CACHE_SIZE)
+
+static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE];
+static unsigned buffer_blocknr[READ_BUFFERS];
+static struct super_block *buffer_dev[READ_BUFFERS];
+static int next_buffer;
+
+/*
+ * Returns a pointer to a buffer containing at least LEN bytes of
+ * filesystem starting at byte offset OFFSET into the filesystem.
+ */
+static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len)
+{
+	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+	struct page *pages[BLKS_PER_BUF];
+	unsigned i, blocknr, buffer;
+	unsigned long devsize;
+	char *data;
+
+	if (!len)
+		return NULL;
+	blocknr = offset >> PAGE_CACHE_SHIFT;
+	offset &= PAGE_CACHE_SIZE - 1;
+
+	/* Check if an existing buffer already has the data.. */
+	for (i = 0; i < READ_BUFFERS; i++) {
+		unsigned int blk_offset;
+
+		if (buffer_dev[i] != sb)
+			continue;
+		if (blocknr < buffer_blocknr[i])
+			continue;
+		blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT;
+		blk_offset += offset;
+		if (blk_offset + len > BUFFER_SIZE)
+			continue;
+		return read_buffers[i] + blk_offset;
+	}
+
+	devsize = mapping->host->i_size >> PAGE_CACHE_SHIFT;
+
+	/* Ok, read in BLKS_PER_BUF pages completely first. */
+	for (i = 0; i < BLKS_PER_BUF; i++) {
+		struct page *page = NULL;
+
+		if (blocknr + i < devsize) {
+			page = read_mapping_page(mapping, blocknr + i, NULL);
+			/* synchronous error? */
+			if (IS_ERR(page))
+				page = NULL;
+		}
+		pages[i] = page;
+	}
+
+	for (i = 0; i < BLKS_PER_BUF; i++) {
+		struct page *page = pages[i];
+
+		if (page) {
+			wait_on_page_locked(page);
+			if (!PageUptodate(page)) {
+				/* asynchronous error */
+				page_cache_release(page);
+				pages[i] = NULL;
+			}
+		}
+	}
+
+	buffer = next_buffer;
+	next_buffer = NEXT_BUFFER(buffer);
+	buffer_blocknr[buffer] = blocknr;
+	buffer_dev[buffer] = sb;
+
+	data = read_buffers[buffer];
+	for (i = 0; i < BLKS_PER_BUF; i++) {
+		struct page *page = pages[i];
+
+		if (page) {
+			memcpy(data, kmap(page), PAGE_CACHE_SIZE);
+			kunmap(page);
+			page_cache_release(page);
+		} else
+			memset(data, 0, PAGE_CACHE_SIZE);
+		data += PAGE_CACHE_SIZE;
+	}
+	return read_buffers[buffer] + offset;
+}
+
+static void cramfs_kill_sb(struct super_block *sb)
+{
+	struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+
+	kill_block_super(sb);
+	kfree(sbi);
+}
+
+static int cramfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int i;
+	struct cramfs_super super;
+	unsigned long root_offset;
+	struct cramfs_sb_info *sbi;
+	struct inode *root;
+
+	sb->s_flags |= MS_RDONLY;
+
+	sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	/* Invalidate the read buffers on mount: think disk change.. */
+	mutex_lock(&read_mutex);
+	for (i = 0; i < READ_BUFFERS; i++)
+		buffer_blocknr[i] = -1;
+
+	/* Read the first block and get the superblock from it */
+	memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super));
+	mutex_unlock(&read_mutex);
+
+	/* Do sanity checks on the superblock */
+	if (super.magic != CRAMFS_MAGIC) {
+		/* check for wrong endianness */
+		if (super.magic == CRAMFS_MAGIC_WEND) {
+			if (!silent)
+				pr_err("wrong endianness\n");
+			return -EINVAL;
+		}
+
+		/* check at 512 byte offset */
+		mutex_lock(&read_mutex);
+		memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super));
+		mutex_unlock(&read_mutex);
+		if (super.magic != CRAMFS_MAGIC) {
+			if (super.magic == CRAMFS_MAGIC_WEND && !silent)
+				pr_err("wrong endianness\n");
+			else if (!silent)
+				pr_err("wrong magic\n");
+			return -EINVAL;
+		}
+	}
+
+	/* get feature flags first */
+	if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
+		pr_err("unsupported filesystem features\n");
+		return -EINVAL;
+	}
+
+	/* Check that the root inode is in a sane state */
+	if (!S_ISDIR(super.root.mode)) {
+		pr_err("root is not a directory\n");
+		return -EINVAL;
+	}
+	/* correct strange, hard-coded permissions of mkcramfs */
+	super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+
+	root_offset = super.root.offset << 2;
+	if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
+		sbi->size = super.size;
+		sbi->blocks = super.fsid.blocks;
+		sbi->files = super.fsid.files;
+	} else {
+		sbi->size = 1<<28;
+		sbi->blocks = 0;
+		sbi->files = 0;
+	}
+	sbi->magic = super.magic;
+	sbi->flags = super.flags;
+	if (root_offset == 0)
+		pr_info("empty filesystem");
+	else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
+		 ((root_offset != sizeof(struct cramfs_super)) &&
+		  (root_offset != 512 + sizeof(struct cramfs_super))))
+	{
+		pr_err("bad root offset %lu\n", root_offset);
+		return -EINVAL;
+	}
+
+	/* Set it all up.. */
+	sb->s_op = &cramfs_ops;
+	root = get_cramfs_inode(sb, &super.root, 0);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root)
+		return -ENOMEM;
+	return 0;
+}
+
+static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type = CRAMFS_MAGIC;
+	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_blocks = CRAMFS_SB(sb)->blocks;
+	buf->f_bfree = 0;
+	buf->f_bavail = 0;
+	buf->f_files = CRAMFS_SB(sb)->files;
+	buf->f_ffree = 0;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = CRAMFS_MAXPATHLEN;
+	return 0;
+}
+
+/*
+ * Read a cramfs directory entry.
+ */
+static int cramfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	char *buf;
+	unsigned int offset;
+
+	/* Offset within the thing. */
+	if (ctx->pos >= inode->i_size)
+		return 0;
+	offset = ctx->pos;
+	/* Directory entries are always 4-byte aligned */
+	if (offset & 3)
+		return -EINVAL;
+
+	buf = kmalloc(CRAMFS_MAXPATHLEN, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	while (offset < inode->i_size) {
+		struct cramfs_inode *de;
+		unsigned long nextoffset;
+		char *name;
+		ino_t ino;
+		umode_t mode;
+		int namelen;
+
+		mutex_lock(&read_mutex);
+		de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
+		name = (char *)(de+1);
+
+		/*
+		 * Namelengths on disk are shifted by two
+		 * and the name padded out to 4-byte boundaries
+		 * with zeroes.
+		 */
+		namelen = de->namelen << 2;
+		memcpy(buf, name, namelen);
+		ino = cramino(de, OFFSET(inode) + offset);
+		mode = de->mode;
+		mutex_unlock(&read_mutex);
+		nextoffset = offset + sizeof(*de) + namelen;
+		for (;;) {
+			if (!namelen) {
+				kfree(buf);
+				return -EIO;
+			}
+			if (buf[namelen-1])
+				break;
+			namelen--;
+		}
+		if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
+			break;
+
+		ctx->pos = offset = nextoffset;
+	}
+	kfree(buf);
+	return 0;
+}
+
+/*
+ * Lookup and fill in the inode data..
+ */
+static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	unsigned int offset = 0;
+	struct inode *inode = NULL;
+	int sorted;
+
+	mutex_lock(&read_mutex);
+	sorted = CRAMFS_SB(dir->i_sb)->flags & CRAMFS_FLAG_SORTED_DIRS;
+	while (offset < dir->i_size) {
+		struct cramfs_inode *de;
+		char *name;
+		int namelen, retval;
+		int dir_off = OFFSET(dir) + offset;
+
+		de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
+		name = (char *)(de+1);
+
+		/* Try to take advantage of sorted directories */
+		if (sorted && (dentry->d_name.name[0] < name[0]))
+			break;
+
+		namelen = de->namelen << 2;
+		offset += sizeof(*de) + namelen;
+
+		/* Quick check that the name is roughly the right length */
+		if (((dentry->d_name.len + 3) & ~3) != namelen)
+			continue;
+
+		for (;;) {
+			if (!namelen) {
+				inode = ERR_PTR(-EIO);
+				goto out;
+			}
+			if (name[namelen-1])
+				break;
+			namelen--;
+		}
+		if (namelen != dentry->d_name.len)
+			continue;
+		retval = memcmp(dentry->d_name.name, name, namelen);
+		if (retval > 0)
+			continue;
+		if (!retval) {
+			inode = get_cramfs_inode(dir->i_sb, de, dir_off);
+			break;
+		}
+		/* else (retval < 0) */
+		if (sorted)
+			break;
+	}
+out:
+	mutex_unlock(&read_mutex);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	d_add(dentry, inode);
+	return NULL;
+}
+
+static int cramfs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	u32 maxblock;
+	int bytes_filled;
+	void *pgdata;
+
+	maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	bytes_filled = 0;
+	pgdata = kmap(page);
+
+	if (page->index < maxblock) {
+		struct super_block *sb = inode->i_sb;
+		u32 blkptr_offset = OFFSET(inode) + page->index*4;
+		u32 start_offset, compr_len;
+
+		start_offset = OFFSET(inode) + maxblock*4;
+		mutex_lock(&read_mutex);
+		if (page->index)
+			start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
+				4);
+		compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
+			start_offset);
+		mutex_unlock(&read_mutex);
+
+		if (compr_len == 0)
+			; /* hole */
+		else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
+			pr_err("bad compressed blocksize %u\n",
+				compr_len);
+			goto err;
+		} else {
+			mutex_lock(&read_mutex);
+			bytes_filled = cramfs_uncompress_block(pgdata,
+				 PAGE_CACHE_SIZE,
+				 cramfs_read(sb, start_offset, compr_len),
+				 compr_len);
+			mutex_unlock(&read_mutex);
+			if (unlikely(bytes_filled < 0))
+				goto err;
+		}
+	}
+
+	memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
+	flush_dcache_page(page);
+	kunmap(page);
+	SetPageUptodate(page);
+	unlock_page(page);
+	return 0;
+
+err:
+	kunmap(page);
+	ClearPageUptodate(page);
+	SetPageError(page);
+	unlock_page(page);
+	return 0;
+}
+
+static const struct address_space_operations cramfs_aops = {
+	.readpage = cramfs_readpage
+};
+
+/*
+ * Our operations:
+ */
+
+/*
+ * A directory can only readdir
+ */
+static const struct file_operations cramfs_directory_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= cramfs_readdir,
+};
+
+static const struct inode_operations cramfs_dir_inode_operations = {
+	.lookup		= cramfs_lookup,
+};
+
+static const struct super_operations cramfs_ops = {
+	.remount_fs	= cramfs_remount,
+	.statfs		= cramfs_statfs,
+};
+
+static struct dentry *cramfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
+}
+
+static struct file_system_type cramfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "cramfs",
+	.mount		= cramfs_mount,
+	.kill_sb	= cramfs_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("cramfs");
+
+static int __init init_cramfs_fs(void)
+{
+	int rv;
+
+	rv = cramfs_uncompress_init();
+	if (rv < 0)
+		return rv;
+	rv = register_filesystem(&cramfs_fs_type);
+	if (rv < 0)
+		cramfs_uncompress_exit();
+	return rv;
+}
+
+static void __exit exit_cramfs_fs(void)
+{
+	cramfs_uncompress_exit();
+	unregister_filesystem(&cramfs_fs_type);
+}
+
+module_init(init_cramfs_fs)
+module_exit(exit_cramfs_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/cramfs/internal.h b/kernel/fs/cramfs/internal.h
new file mode 100644
index 000000000..349d71272
--- /dev/null
+++ b/kernel/fs/cramfs/internal.h
@@ -0,0 +1,4 @@
+/* Uncompression interfaces to the underlying zlib */
+int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen);
+int cramfs_uncompress_init(void);
+void cramfs_uncompress_exit(void);
diff --git a/kernel/fs/cramfs/uncompress.c b/kernel/fs/cramfs/uncompress.c
new file mode 100644
index 000000000..ec4f1d4fd
--- /dev/null
+++ b/kernel/fs/cramfs/uncompress.c
@@ -0,0 +1,79 @@
+/*
+ * uncompress.c
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * cramfs interfaces to the uncompression library. There's really just
+ * three entrypoints:
+ *
+ *  - cramfs_uncompress_init() - called to initialize the thing.
+ *  - cramfs_uncompress_exit() - tell me when you're done
+ *  - cramfs_uncompress_block() - uncompress a block.
+ *
+ * NOTE NOTE NOTE! The uncompression is entirely single-threaded. We
+ * only have one stream, and we'll initialize it only once even if it
+ * then is used by multiple filesystems.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/vmalloc.h>
+#include <linux/zlib.h>
+#include "internal.h"
+
+static z_stream stream;
+static int initialized;
+
+/* Returns length of decompressed data. */
+int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
+{
+	int err;
+
+	stream.next_in = src;
+	stream.avail_in = srclen;
+
+	stream.next_out = dst;
+	stream.avail_out = dstlen;
+
+	err = zlib_inflateReset(&stream);
+	if (err != Z_OK) {
+		pr_err("zlib_inflateReset error %d\n", err);
+		zlib_inflateEnd(&stream);
+		zlib_inflateInit(&stream);
+	}
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto err;
+	return stream.total_out;
+
+err:
+	pr_err("Error %d while decompressing!\n", err);
+	pr_err("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
+	return -EIO;
+}
+
+int cramfs_uncompress_init(void)
+{
+	if (!initialized++) {
+		stream.workspace = vmalloc(zlib_inflate_workspacesize());
+		if (!stream.workspace) {
+			initialized = 0;
+			return -ENOMEM;
+		}
+		stream.next_in = NULL;
+		stream.avail_in = 0;
+		zlib_inflateInit(&stream);
+	}
+	return 0;
+}
+
+void cramfs_uncompress_exit(void)
+{
+	if (!--initialized) {
+		zlib_inflateEnd(&stream);
+		vfree(stream.workspace);
+	}
+}
diff --git a/kernel/fs/dax.c b/kernel/fs/dax.c
new file mode 100644
index 000000000..6f65f00e5
--- /dev/null
+++ b/kernel/fs/dax.c
@@ -0,0 +1,550 @@
+/*
+ * fs/dax.c - Direct Access filesystem code
+ * Copyright (c) 2013-2014 Intel Corporation
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/uio.h>
+#include <linux/vmstat.h>
+
+int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+{
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	sector_t sector = block << (inode->i_blkbits - 9);
+
+	might_sleep();
+	do {
+		void *addr;
+		unsigned long pfn;
+		long count;
+
+		count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+		if (count < 0)
+			return count;
+		BUG_ON(size < count);
+		while (count > 0) {
+			unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
+			if (pgsz > count)
+				pgsz = count;
+			if (pgsz < PAGE_SIZE)
+				memset(addr, 0, pgsz);
+			else
+				clear_page(addr);
+			addr += pgsz;
+			size -= pgsz;
+			count -= pgsz;
+			BUG_ON(pgsz & 511);
+			sector += pgsz / 512;
+			cond_resched();
+		}
+	} while (size);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_clear_blocks);
+
+static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+{
+	unsigned long pfn;
+	sector_t sector = bh->b_blocknr << (blkbits - 9);
+	return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
+}
+
+static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
+			loff_t end)
+{
+	loff_t final = end - pos + first; /* The final byte of the buffer */
+
+	if (first > 0)
+		memset(addr, 0, first);
+	if (final < size)
+		memset(addr + final, 0, size - final);
+}
+
+static bool buffer_written(struct buffer_head *bh)
+{
+	return buffer_mapped(bh) && !buffer_unwritten(bh);
+}
+
+/*
+ * When ext4 encounters a hole, it returns without modifying the buffer_head
+ * which means that we can't trust b_size.  To cope with this, we set b_state
+ * to 0 before calling get_block and, if any bit is set, we know we can trust
+ * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
+ * and would save us time calling get_block repeatedly.
+ */
+static bool buffer_size_valid(struct buffer_head *bh)
+{
+	return bh->b_state != 0;
+}
+
+static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
+		      loff_t start, loff_t end, get_block_t get_block,
+		      struct buffer_head *bh)
+{
+	ssize_t retval = 0;
+	loff_t pos = start;
+	loff_t max = start;
+	loff_t bh_max = start;
+	void *addr;
+	bool hole = false;
+
+	if (iov_iter_rw(iter) != WRITE)
+		end = min(end, i_size_read(inode));
+
+	while (pos < end) {
+		unsigned len;
+		if (pos == max) {
+			unsigned blkbits = inode->i_blkbits;
+			sector_t block = pos >> blkbits;
+			unsigned first = pos - (block << blkbits);
+			long size;
+
+			if (pos == bh_max) {
+				bh->b_size = PAGE_ALIGN(end - pos);
+				bh->b_state = 0;
+				retval = get_block(inode, block, bh,
+						   iov_iter_rw(iter) == WRITE);
+				if (retval)
+					break;
+				if (!buffer_size_valid(bh))
+					bh->b_size = 1 << blkbits;
+				bh_max = pos - first + bh->b_size;
+			} else {
+				unsigned done = bh->b_size -
+						(bh_max - (pos - first));
+				bh->b_blocknr += done >> blkbits;
+				bh->b_size -= done;
+			}
+
+			hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
+			if (hole) {
+				addr = NULL;
+				size = bh->b_size - first;
+			} else {
+				retval = dax_get_addr(bh, &addr, blkbits);
+				if (retval < 0)
+					break;
+				if (buffer_unwritten(bh) || buffer_new(bh))
+					dax_new_buf(addr, retval, first, pos,
+									end);
+				addr += first;
+				size = retval - first;
+			}
+			max = min(pos + size, end);
+		}
+
+		if (iov_iter_rw(iter) == WRITE)
+			len = copy_from_iter(addr, max - pos, iter);
+		else if (!hole)
+			len = copy_to_iter(addr, max - pos, iter);
+		else
+			len = iov_iter_zero(max - pos, iter);
+
+		if (!len)
+			break;
+
+		pos += len;
+		addr += len;
+	}
+
+	return (pos == start) ? retval : pos - start;
+}
+
+/**
+ * dax_do_io - Perform I/O to a DAX file
+ * @iocb: The control block for this I/O
+ * @inode: The file which the I/O is directed at
+ * @iter: The addresses to do I/O from or to
+ * @pos: The file offset where the I/O starts
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ * @end_io: A filesystem callback for I/O completion
+ * @flags: See below
+ *
+ * This function uses the same locking scheme as do_blockdev_direct_IO:
+ * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
+ * caller for writes.  For reads, we take and release the i_mutex ourselves.
+ * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
+ * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
+ * is in progress.
+ */
+ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
+		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
+		  dio_iodone_t end_io, int flags)
+{
+	struct buffer_head bh;
+	ssize_t retval = -EINVAL;
+	loff_t end = pos + iov_iter_count(iter);
+
+	memset(&bh, 0, sizeof(bh));
+
+	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
+		struct address_space *mapping = inode->i_mapping;
+		mutex_lock(&inode->i_mutex);
+		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
+		if (retval) {
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
+
+	/* Protects against truncate */
+	inode_dio_begin(inode);
+
+	retval = dax_io(inode, iter, pos, end, get_block, &bh);
+
+	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
+		mutex_unlock(&inode->i_mutex);
+
+	if ((retval > 0) && end_io)
+		end_io(iocb, pos, retval, bh.b_private);
+
+	inode_dio_end(inode);
+ out:
+	return retval;
+}
+EXPORT_SYMBOL_GPL(dax_do_io);
+
+/*
+ * The user has performed a load from a hole in the file.  Allocating
+ * a new page in the file would cause excessive storage usage for
+ * workloads with sparse files.  We allocate a page cache page instead.
+ * We'll kick it out of the page cache if it's ever written to,
+ * otherwise it will simply fall out of the page cache under memory
+ * pressure without ever having been dirtied.
+ */
+static int dax_load_hole(struct address_space *mapping, struct page *page,
+							struct vm_fault *vmf)
+{
+	unsigned long size;
+	struct inode *inode = mapping->host;
+	if (!page)
+		page = find_or_create_page(mapping, vmf->pgoff,
+						GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		return VM_FAULT_OOM;
+	/* Recheck i_size under page lock to avoid truncate race */
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size) {
+		unlock_page(page);
+		page_cache_release(page);
+		return VM_FAULT_SIGBUS;
+	}
+
+	vmf->page = page;
+	return VM_FAULT_LOCKED;
+}
+
+static int copy_user_bh(struct page *to, struct buffer_head *bh,
+			unsigned blkbits, unsigned long vaddr)
+{
+	void *vfrom, *vto;
+	if (dax_get_addr(bh, &vfrom, blkbits) < 0)
+		return -EIO;
+	vto = kmap_atomic(to);
+	copy_user_page(vto, vfrom, vaddr, to);
+	kunmap_atomic(vto);
+	return 0;
+}
+
+static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+			struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct address_space *mapping = inode->i_mapping;
+	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	void *addr;
+	unsigned long pfn;
+	pgoff_t size;
+	int error;
+
+	i_mmap_lock_read(mapping);
+
+	/*
+	 * Check truncate didn't happen while we were allocating a block.
+	 * If it did, this block may or may not be still allocated to the
+	 * file.  We can't tell the filesystem to free it because we can't
+	 * take i_mutex here.  In the worst case, the file still has blocks
+	 * allocated past the end of the file.
+	 */
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (unlikely(vmf->pgoff >= size)) {
+		error = -EIO;
+		goto out;
+	}
+
+	error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
+	if (error < 0)
+		goto out;
+	if (error < PAGE_SIZE) {
+		error = -EIO;
+		goto out;
+	}
+
+	if (buffer_unwritten(bh) || buffer_new(bh))
+		clear_page(addr);
+
+	error = vm_insert_mixed(vma, vaddr, pfn);
+
+ out:
+	i_mmap_unlock_read(mapping);
+
+	if (bh->b_end_io)
+		bh->b_end_io(bh, 1);
+
+	return error;
+}
+
+static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			get_block_t get_block)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct page *page;
+	struct buffer_head bh;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned blkbits = inode->i_blkbits;
+	sector_t block;
+	pgoff_t size;
+	int error;
+	int major = 0;
+
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		return VM_FAULT_SIGBUS;
+
+	memset(&bh, 0, sizeof(bh));
+	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+	bh.b_size = PAGE_SIZE;
+
+ repeat:
+	page = find_get_page(mapping, vmf->pgoff);
+	if (page) {
+		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+			page_cache_release(page);
+			return VM_FAULT_RETRY;
+		}
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
+		}
+		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (unlikely(vmf->pgoff >= size)) {
+			/*
+			 * We have a struct page covering a hole in the file
+			 * from a read fault and we've raced with a truncate
+			 */
+			error = -EIO;
+			goto unlock_page;
+		}
+	}
+
+	error = get_block(inode, block, &bh, 0);
+	if (!error && (bh.b_size < PAGE_SIZE))
+		error = -EIO;		/* fs corruption? */
+	if (error)
+		goto unlock_page;
+
+	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
+		if (vmf->flags & FAULT_FLAG_WRITE) {
+			error = get_block(inode, block, &bh, 1);
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+			if (!error && (bh.b_size < PAGE_SIZE))
+				error = -EIO;
+			if (error)
+				goto unlock_page;
+		} else {
+			return dax_load_hole(mapping, page, vmf);
+		}
+	}
+
+	if (vmf->cow_page) {
+		struct page *new_page = vmf->cow_page;
+		if (buffer_written(&bh))
+			error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+		else
+			clear_user_highpage(new_page, vaddr);
+		if (error)
+			goto unlock_page;
+		vmf->page = page;
+		if (!page) {
+			i_mmap_lock_read(mapping);
+			/* Check we didn't race with truncate */
+			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
+								PAGE_SHIFT;
+			if (vmf->pgoff >= size) {
+				i_mmap_unlock_read(mapping);
+				error = -EIO;
+				goto out;
+			}
+		}
+		return VM_FAULT_LOCKED;
+	}
+
+	/* Check we didn't race with a read fault installing a new page */
+	if (!page && major)
+		page = find_lock_page(mapping, vmf->pgoff);
+
+	if (page) {
+		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+							PAGE_CACHE_SIZE, 0);
+		delete_from_page_cache(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	error = dax_insert_mapping(inode, &bh, vma, vmf);
+
+ out:
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM | major;
+	/* -EBUSY is fine, somebody else faulted on the same PTE */
+	if ((error < 0) && (error != -EBUSY))
+		return VM_FAULT_SIGBUS | major;
+	return VM_FAULT_NOPAGE | major;
+
+ unlock_page:
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	goto out;
+}
+
+/**
+ * dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files.
+ */
+int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			get_block_t get_block)
+{
+	int result;
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+	if (vmf->flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(sb);
+		file_update_time(vma->vm_file);
+	}
+	result = do_dax_fault(vma, vmf, get_block);
+	if (vmf->flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(sb);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(dax_fault);
+
+/**
+ * dax_pfn_mkwrite - handle first write to DAX page
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ *
+ */
+int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+	sb_start_pagefault(sb);
+	file_update_time(vma->vm_file);
+	sb_end_pagefault(sb);
+	return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
+
+/**
+ * dax_zero_page_range - zero a range within a page of a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @length: The number of bytes to zero
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * This function can be called by a filesystem when it is zeroing part of a
+ * page in a DAX file.  This is intended for hole-punch operations.  If
+ * you are truncating a file, the helper function dax_truncate_page() may be
+ * more convenient.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks.  Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ */
+int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
+							get_block_t get_block)
+{
+	struct buffer_head bh;
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	int err;
+
+	/* Block boundary? Nothing to do */
+	if (!length)
+		return 0;
+	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
+
+	memset(&bh, 0, sizeof(bh));
+	bh.b_size = PAGE_CACHE_SIZE;
+	err = get_block(inode, index, &bh, 0);
+	if (err < 0)
+		return err;
+	if (buffer_written(&bh)) {
+		void *addr;
+		err = dax_get_addr(&bh, &addr, inode->i_blkbits);
+		if (err < 0)
+			return err;
+		memset(addr + offset, 0, length);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+/**
+ * dax_truncate_page - handle a partial page being truncated in a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * Similar to block_truncate_page(), this function can be called by a
+ * filesystem when it is truncating a DAX file to handle the partial page.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks.  Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ */
+int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+{
+	unsigned length = PAGE_CACHE_ALIGN(from) - from;
+	return dax_zero_page_range(inode, from, length, get_block);
+}
+EXPORT_SYMBOL_GPL(dax_truncate_page);
diff --git a/kernel/fs/dcache.c b/kernel/fs/dcache.c
new file mode 100644
index 000000000..e1d20e11e
--- /dev/null
+++ b/kernel/fs/dcache.c
@@ -0,0 +1,3459 @@
+/*
+ * fs/dcache.c
+ *
+ * Complete reimplementation
+ * (C) 1997 Thomas Schoebel-Theuer,
+ * with heavy changes by Linus Torvalds
+ */
+
+/*
+ * Notes on the allocation strategy:
+ *
+ * The dcache is a master of the icache - whenever a dcache entry
+ * exists, the inode will always exist. "iput()" is done either when
+ * the dcache entry is deleted or garbage collected.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/cache.h>
+#include <linux/export.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <asm/uaccess.h>
+#include <linux/security.h>
+#include <linux/seqlock.h>
+#include <linux/swap.h>
+#include <linux/bootmem.h>
+#include <linux/fs_struct.h>
+#include <linux/hardirq.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rculist_bl.h>
+#include <linux/prefetch.h>
+#include <linux/ratelimit.h>
+#include <linux/list_lru.h>
+#include <linux/kasan.h>
+
+#include "internal.h"
+#include "mount.h"
+
+/*
+ * Usage:
+ * dcache->d_inode->i_lock protects:
+ *   - i_dentry, d_u.d_alias, d_inode of aliases
+ * dcache_hash_bucket lock protects:
+ *   - the dcache hash table
+ * s_anon bl list spinlock protects:
+ *   - the s_anon list (see __d_drop)
+ * dentry->d_sb->s_dentry_lru_lock protects:
+ *   - the dcache lru lists and counters
+ * d_lock protects:
+ *   - d_flags
+ *   - d_name
+ *   - d_lru
+ *   - d_count
+ *   - d_unhashed()
+ *   - d_parent and d_subdirs
+ *   - childrens' d_child and d_parent
+ *   - d_u.d_alias, d_inode
+ *
+ * Ordering:
+ * dentry->d_inode->i_lock
+ *   dentry->d_lock
+ *     dentry->d_sb->s_dentry_lru_lock
+ *     dcache_hash_bucket lock
+ *     s_anon lock
+ *
+ * If there is an ancestor relationship:
+ * dentry->d_parent->...->d_parent->d_lock
+ *   ...
+ *     dentry->d_parent->d_lock
+ *       dentry->d_lock
+ *
+ * If no ancestor relationship:
+ * if (dentry1 < dentry2)
+ *   dentry1->d_lock
+ *     dentry2->d_lock
+ */
+int sysctl_vfs_cache_pressure __read_mostly = 100;
+EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
+
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
+
+EXPORT_SYMBOL(rename_lock);
+
+static struct kmem_cache *dentry_cache __read_mostly;
+
+/*
+ * This is the single most critical data structure when it comes
+ * to the dcache: the hashtable for lookups. Somebody should try
+ * to make this good - I've just made it work.
+ *
+ * This hash-function tries to avoid losing too many bits of hash
+ * information, yet avoid using a prime hash-size or similar.
+ */
+
+static unsigned int d_hash_mask __read_mostly;
+static unsigned int d_hash_shift __read_mostly;
+
+static struct hlist_bl_head *dentry_hashtable __read_mostly;
+
+static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
+					unsigned int hash)
+{
+	hash += (unsigned long) parent / L1_CACHE_BYTES;
+	return dentry_hashtable + hash_32(hash, d_hash_shift);
+}
+
+/* Statistics gathering. */
+struct dentry_stat_t dentry_stat = {
+	.age_limit = 45,
+};
+
+static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
+static long get_nr_dentry(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static long get_nr_dentry_unused(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
+
+int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
+		   size_t *lenp, loff_t *ppos)
+{
+	dentry_stat.nr_dentry = get_nr_dentry();
+	dentry_stat.nr_unused = get_nr_dentry_unused();
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
+/*
+ * Compare 2 name strings, return 0 if they match, otherwise non-zero.
+ * The strings are both count bytes long, and count is non-zero.
+ */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+#include <asm/word-at-a-time.h>
+/*
+ * NOTE! 'cs' and 'scount' come from a dentry, so it has a
+ * aligned allocation for this particular component. We don't
+ * strictly need the load_unaligned_zeropad() safety, but it
+ * doesn't hurt either.
+ *
+ * In contrast, 'ct' and 'tcount' can be from a pathname, and do
+ * need the careful unaligned handling.
+ */
+static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
+{
+	unsigned long a,b,mask;
+
+	for (;;) {
+		a = *(unsigned long *)cs;
+		b = load_unaligned_zeropad(ct);
+		if (tcount < sizeof(unsigned long))
+			break;
+		if (unlikely(a != b))
+			return 1;
+		cs += sizeof(unsigned long);
+		ct += sizeof(unsigned long);
+		tcount -= sizeof(unsigned long);
+		if (!tcount)
+			return 0;
+	}
+	mask = bytemask_from_count(tcount);
+	return unlikely(!!((a ^ b) & mask));
+}
+
+#else
+
+static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
+{
+	do {
+		if (*cs != *ct)
+			return 1;
+		cs++;
+		ct++;
+		tcount--;
+	} while (tcount);
+	return 0;
+}
+
+#endif
+
+static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
+{
+	const unsigned char *cs;
+	/*
+	 * Be careful about RCU walk racing with rename:
+	 * use ACCESS_ONCE to fetch the name pointer.
+	 *
+	 * NOTE! Even if a rename will mean that the length
+	 * was not loaded atomically, we don't care. The
+	 * RCU walk will check the sequence count eventually,
+	 * and catch it. And we won't overrun the buffer,
+	 * because we're reading the name pointer atomically,
+	 * and a dentry name is guaranteed to be properly
+	 * terminated with a NUL byte.
+	 *
+	 * End result: even if 'len' is wrong, we'll exit
+	 * early because the data cannot match (there can
+	 * be no NUL in the ct/tcount data)
+	 */
+	cs = ACCESS_ONCE(dentry->d_name.name);
+	smp_read_barrier_depends();
+	return dentry_string_cmp(cs, ct, tcount);
+}
+
+struct external_name {
+	union {
+		atomic_t count;
+		struct rcu_head head;
+	} u;
+	unsigned char name[];
+};
+
+static inline struct external_name *external_name(struct dentry *dentry)
+{
+	return container_of(dentry->d_name.name, struct external_name, name[0]);
+}
+
+static void __d_free(struct rcu_head *head)
+{
+	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+
+	kmem_cache_free(dentry_cache, dentry); 
+}
+
+static void __d_free_external(struct rcu_head *head)
+{
+	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+	kfree(external_name(dentry));
+	kmem_cache_free(dentry_cache, dentry); 
+}
+
+static inline int dname_external(const struct dentry *dentry)
+{
+	return dentry->d_name.name != dentry->d_iname;
+}
+
+/*
+ * Make sure other CPUs see the inode attached before the type is set.
+ */
+static inline void __d_set_inode_and_type(struct dentry *dentry,
+					  struct inode *inode,
+					  unsigned type_flags)
+{
+	unsigned flags;
+
+	dentry->d_inode = inode;
+	smp_wmb();
+	flags = READ_ONCE(dentry->d_flags);
+	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+	flags |= type_flags;
+	WRITE_ONCE(dentry->d_flags, flags);
+}
+
+/*
+ * Ideally, we want to make sure that other CPUs see the flags cleared before
+ * the inode is detached, but this is really a violation of RCU principles
+ * since the ordering suggests we should always set inode before flags.
+ *
+ * We should instead replace or discard the entire dentry - but that sucks
+ * performancewise on mass deletion/rename.
+ */
+static inline void __d_clear_type_and_inode(struct dentry *dentry)
+{
+	unsigned flags = READ_ONCE(dentry->d_flags);
+
+	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+	WRITE_ONCE(dentry->d_flags, flags);
+	smp_wmb();
+	dentry->d_inode = NULL;
+}
+
+static void dentry_free(struct dentry *dentry)
+{
+	WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
+	if (unlikely(dname_external(dentry))) {
+		struct external_name *p = external_name(dentry);
+		if (likely(atomic_dec_and_test(&p->u.count))) {
+			call_rcu(&dentry->d_u.d_rcu, __d_free_external);
+			return;
+		}
+	}
+	/* if dentry was never visible to RCU, immediate free is OK */
+	if (!(dentry->d_flags & DCACHE_RCUACCESS))
+		__d_free(&dentry->d_u.d_rcu);
+	else
+		call_rcu(&dentry->d_u.d_rcu, __d_free);
+}
+
+/**
+ * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * @dentry: the target dentry
+ * After this call, in-progress rcu-walk path lookup will fail. This
+ * should be called after unhashing, and after changing d_inode (if
+ * the dentry has not already been unhashed).
+ */
+static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+{
+	assert_spin_locked(&dentry->d_lock);
+	/* Go through a barrier */
+	write_seqcount_barrier(&dentry->d_seq);
+}
+
+/*
+ * Release the dentry's inode, using the filesystem
+ * d_iput() operation if defined. Dentry has no refcount
+ * and is unhashed.
+ */
+static void dentry_iput(struct dentry * dentry)
+	__releases(dentry->d_lock)
+	__releases(dentry->d_inode->i_lock)
+{
+	struct inode *inode = dentry->d_inode;
+	if (inode) {
+		__d_clear_type_and_inode(dentry);
+		hlist_del_init(&dentry->d_u.d_alias);
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&inode->i_lock);
+		if (!inode->i_nlink)
+			fsnotify_inoderemove(inode);
+		if (dentry->d_op && dentry->d_op->d_iput)
+			dentry->d_op->d_iput(dentry, inode);
+		else
+			iput(inode);
+	} else {
+		spin_unlock(&dentry->d_lock);
+	}
+}
+
+/*
+ * Release the dentry's inode, using the filesystem
+ * d_iput() operation if defined. dentry remains in-use.
+ */
+static void dentry_unlink_inode(struct dentry * dentry)
+	__releases(dentry->d_lock)
+	__releases(dentry->d_inode->i_lock)
+{
+	struct inode *inode = dentry->d_inode;
+	__d_clear_type_and_inode(dentry);
+	hlist_del_init(&dentry->d_u.d_alias);
+	dentry_rcuwalk_barrier(dentry);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&inode->i_lock);
+	if (!inode->i_nlink)
+		fsnotify_inoderemove(inode);
+	if (dentry->d_op && dentry->d_op->d_iput)
+		dentry->d_op->d_iput(dentry, inode);
+	else
+		iput(inode);
+}
+
+/*
+ * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+ * is in use - which includes both the "real" per-superblock
+ * LRU list _and_ the DCACHE_SHRINK_LIST use.
+ *
+ * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+ * on the shrink list (ie not on the superblock LRU list).
+ *
+ * The per-cpu "nr_dentry_unused" counters are updated with
+ * the DCACHE_LRU_LIST bit.
+ *
+ * These helper functions make sure we always follow the
+ * rules. d_lock must be held by the caller.
+ */
+#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+static void d_lru_add(struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, 0);
+	dentry->d_flags |= DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+}
+
+static void d_lru_del(struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+}
+
+static void d_shrink_del(struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
+	list_del_init(&dentry->d_lru);
+	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
+	this_cpu_dec(nr_dentry_unused);
+}
+
+static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+{
+	D_FLAG_VERIFY(dentry, 0);
+	list_add(&dentry->d_lru, list);
+	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
+}
+
+/*
+ * These can only be called under the global LRU lock, ie during the
+ * callback for freeing the LRU list. "isolate" removes it from the
+ * LRU lists entirely, while shrink_move moves it to the indicated
+ * private list.
+ */
+static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	list_lru_isolate(lru, &dentry->d_lru);
+}
+
+static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+			      struct list_head *list)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags |= DCACHE_SHRINK_LIST;
+	list_lru_isolate_move(lru, &dentry->d_lru, list);
+}
+
+/*
+ * dentry_lru_(add|del)_list) must be called with d_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+		d_lru_add(dentry);
+}
+
+/**
+ * d_drop - drop a dentry
+ * @dentry: dentry to drop
+ *
+ * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
+ * be found through a VFS lookup any more. Note that this is different from
+ * deleting the dentry - d_delete will try to mark the dentry negative if
+ * possible, giving a successful _negative_ lookup, while d_drop will
+ * just make the cache lookup fail.
+ *
+ * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
+ * reason (NFS timeouts or autofs deletes).
+ *
+ * __d_drop requires dentry->d_lock.
+ */
+void __d_drop(struct dentry *dentry)
+{
+	if (!d_unhashed(dentry)) {
+		struct hlist_bl_head *b;
+		/*
+		 * Hashed dentries are normally on the dentry hashtable,
+		 * with the exception of those newly allocated by
+		 * d_obtain_alias, which are always IS_ROOT:
+		 */
+		if (unlikely(IS_ROOT(dentry)))
+			b = &dentry->d_sb->s_anon;
+		else
+			b = d_hash(dentry->d_parent, dentry->d_name.hash);
+
+		hlist_bl_lock(b);
+		__hlist_bl_del(&dentry->d_hash);
+		dentry->d_hash.pprev = NULL;
+		hlist_bl_unlock(b);
+		dentry_rcuwalk_barrier(dentry);
+	}
+}
+EXPORT_SYMBOL(__d_drop);
+
+void d_drop(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_drop);
+
+static void __dentry_kill(struct dentry *dentry)
+{
+	struct dentry *parent = NULL;
+	bool can_free = true;
+	if (!IS_ROOT(dentry))
+		parent = dentry->d_parent;
+
+	/*
+	 * The dentry is now unrecoverably dead to the world.
+	 */
+	lockref_mark_dead(&dentry->d_lockref);
+
+	/*
+	 * inform the fs via d_prune that this dentry is about to be
+	 * unhashed and destroyed.
+	 */
+	if (dentry->d_flags & DCACHE_OP_PRUNE)
+		dentry->d_op->d_prune(dentry);
+
+	if (dentry->d_flags & DCACHE_LRU_LIST) {
+		if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
+			d_lru_del(dentry);
+	}
+	/* if it was on the hash then remove it */
+	__d_drop(dentry);
+	__list_del_entry(&dentry->d_child);
+	/*
+	 * Inform d_walk() that we are no longer attached to the
+	 * dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+	if (parent)
+		spin_unlock(&parent->d_lock);
+	dentry_iput(dentry);
+	/*
+	 * dentry_iput drops the locks, at which point nobody (except
+	 * transient RCU lookups) can reach this dentry.
+	 */
+	BUG_ON(dentry->d_lockref.count > 0);
+	this_cpu_dec(nr_dentry);
+	if (dentry->d_op && dentry->d_op->d_release)
+		dentry->d_op->d_release(dentry);
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+		dentry->d_flags |= DCACHE_MAY_FREE;
+		can_free = false;
+	}
+	spin_unlock(&dentry->d_lock);
+	if (likely(can_free))
+		dentry_free(dentry);
+}
+
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static struct dentry *dentry_kill(struct dentry *dentry)
+	__releases(dentry->d_lock)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = NULL;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto failed;
+
+	if (!IS_ROOT(dentry)) {
+		parent = dentry->d_parent;
+		if (unlikely(!spin_trylock(&parent->d_lock))) {
+			if (inode)
+				spin_unlock(&inode->i_lock);
+			goto failed;
+		}
+	}
+
+	__dentry_kill(dentry);
+	return parent;
+
+failed:
+	spin_unlock(&dentry->d_lock);
+	cpu_chill();
+	return dentry; /* try again with same dentry */
+}
+
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+	struct dentry *parent = dentry->d_parent;
+	if (IS_ROOT(dentry))
+		return NULL;
+	if (unlikely(dentry->d_lockref.count < 0))
+		return NULL;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return parent;
+	rcu_read_lock();
+	spin_unlock(&dentry->d_lock);
+again:
+	parent = ACCESS_ONCE(dentry->d_parent);
+	spin_lock(&parent->d_lock);
+	/*
+	 * We can't blindly lock dentry until we are sure
+	 * that we won't violate the locking order.
+	 * Any changes of dentry->d_parent must have
+	 * been done with parent->d_lock held, so
+	 * spin_lock() above is enough of a barrier
+	 * for checking if it's still our child.
+	 */
+	if (unlikely(parent != dentry->d_parent)) {
+		spin_unlock(&parent->d_lock);
+		goto again;
+	}
+	rcu_read_unlock();
+	if (parent != dentry)
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	else
+		parent = NULL;
+	return parent;
+}
+
+/*
+ * Try to do a lockless dput(), and return whether that was successful.
+ *
+ * If unsuccessful, we return false, having already taken the dentry lock.
+ *
+ * The caller needs to hold the RCU read lock, so that the dentry is
+ * guaranteed to stay around even if the refcount goes down to zero!
+ */
+static inline bool fast_dput(struct dentry *dentry)
+{
+	int ret;
+	unsigned int d_flags;
+
+	/*
+	 * If we have a d_op->d_delete() operation, we sould not
+	 * let the dentry count go to zero, so use "put__or_lock".
+	 */
+	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
+		return lockref_put_or_lock(&dentry->d_lockref);
+
+	/*
+	 * .. otherwise, we can try to just decrement the
+	 * lockref optimistically.
+	 */
+	ret = lockref_put_return(&dentry->d_lockref);
+
+	/*
+	 * If the lockref_put_return() failed due to the lock being held
+	 * by somebody else, the fast path has failed. We will need to
+	 * get the lock, and then check the count again.
+	 */
+	if (unlikely(ret < 0)) {
+		spin_lock(&dentry->d_lock);
+		if (dentry->d_lockref.count > 1) {
+			dentry->d_lockref.count--;
+			spin_unlock(&dentry->d_lock);
+			return 1;
+		}
+		return 0;
+	}
+
+	/*
+	 * If we weren't the last ref, we're done.
+	 */
+	if (ret)
+		return 1;
+
+	/*
+	 * Careful, careful. The reference count went down
+	 * to zero, but we don't hold the dentry lock, so
+	 * somebody else could get it again, and do another
+	 * dput(), and we need to not race with that.
+	 *
+	 * However, there is a very special and common case
+	 * where we don't care, because there is nothing to
+	 * do: the dentry is still hashed, it does not have
+	 * a 'delete' op, and it's referenced and already on
+	 * the LRU list.
+	 *
+	 * NOTE! Since we aren't locked, these values are
+	 * not "stable". However, it is sufficient that at
+	 * some point after we dropped the reference the
+	 * dentry was hashed and the flags had the proper
+	 * value. Other dentry users may have re-gotten
+	 * a reference to the dentry and change that, but
+	 * our work is done - we can leave the dentry
+	 * around with a zero refcount.
+	 */
+	smp_rmb();
+	d_flags = ACCESS_ONCE(dentry->d_flags);
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST;
+
+	/* Nothing to do? Dropping the reference was all we needed? */
+	if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+		return 1;
+
+	/*
+	 * Not the fast normal case? Get the lock. We've already decremented
+	 * the refcount, but we'll need to re-check the situation after
+	 * getting the lock.
+	 */
+	spin_lock(&dentry->d_lock);
+
+	/*
+	 * Did somebody else grab a reference to it in the meantime, and
+	 * we're no longer the last user after all? Alternatively, somebody
+	 * else could have killed it and marked it dead. Either way, we
+	 * don't need to do anything else.
+	 */
+	if (dentry->d_lockref.count) {
+		spin_unlock(&dentry->d_lock);
+		return 1;
+	}
+
+	/*
+	 * Re-get the reference we optimistically dropped. We hold the
+	 * lock, and we just tested that it was zero, so we can just
+	 * set it to 1.
+	 */
+	dentry->d_lockref.count = 1;
+	return 0;
+}
+
+
+/* 
+ * This is dput
+ *
+ * This is complicated by the fact that we do not want to put
+ * dentries that are no longer on any hash chain on the unused
+ * list: we'd much rather just get rid of them immediately.
+ *
+ * However, that implies that we have to traverse the dentry
+ * tree upwards to the parents which might _also_ now be
+ * scheduled for deletion (it may have been only waiting for
+ * its last child to go away).
+ *
+ * This tail recursion is done by hand as we don't want to depend
+ * on the compiler to always get this right (gcc generally doesn't).
+ * Real recursion would eat up our stack space.
+ */
+
+/*
+ * dput - release a dentry
+ * @dentry: dentry to release 
+ *
+ * Release a dentry. This will drop the usage count and if appropriate
+ * call the dentry unlink method as well as removing it from the queues and
+ * releasing its resources. If the parent dentries were scheduled for release
+ * they too may now get deleted.
+ */
+void dput(struct dentry *dentry)
+{
+	if (unlikely(!dentry))
+		return;
+
+repeat:
+	rcu_read_lock();
+	if (likely(fast_dput(dentry))) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* Slow case: now with the dentry lock held */
+	rcu_read_unlock();
+
+	/* Unreachable? Get rid of it */
+	if (unlikely(d_unhashed(dentry)))
+		goto kill_it;
+
+	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
+		if (dentry->d_op->d_delete(dentry))
+			goto kill_it;
+	}
+
+	if (!(dentry->d_flags & DCACHE_REFERENCED))
+		dentry->d_flags |= DCACHE_REFERENCED;
+	dentry_lru_add(dentry);
+
+	dentry->d_lockref.count--;
+	spin_unlock(&dentry->d_lock);
+	return;
+
+kill_it:
+	dentry = dentry_kill(dentry);
+	if (dentry)
+		goto repeat;
+}
+EXPORT_SYMBOL(dput);
+
+
+/* This must be called with d_lock held */
+static inline void __dget_dlock(struct dentry *dentry)
+{
+	dentry->d_lockref.count++;
+}
+
+static inline void __dget(struct dentry *dentry)
+{
+	lockref_get(&dentry->d_lockref);
+}
+
+struct dentry *dget_parent(struct dentry *dentry)
+{
+	int gotref;
+	struct dentry *ret;
+
+	/*
+	 * Do optimistic parent lookup without any
+	 * locking.
+	 */
+	rcu_read_lock();
+	ret = ACCESS_ONCE(dentry->d_parent);
+	gotref = lockref_get_not_zero(&ret->d_lockref);
+	rcu_read_unlock();
+	if (likely(gotref)) {
+		if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
+			return ret;
+		dput(ret);
+	}
+
+repeat:
+	/*
+	 * Don't need rcu_dereference because we re-check it was correct under
+	 * the lock.
+	 */
+	rcu_read_lock();
+	ret = dentry->d_parent;
+	spin_lock(&ret->d_lock);
+	if (unlikely(ret != dentry->d_parent)) {
+		spin_unlock(&ret->d_lock);
+		rcu_read_unlock();
+		goto repeat;
+	}
+	rcu_read_unlock();
+	BUG_ON(!ret->d_lockref.count);
+	ret->d_lockref.count++;
+	spin_unlock(&ret->d_lock);
+	return ret;
+}
+EXPORT_SYMBOL(dget_parent);
+
+/**
+ * d_find_alias - grab a hashed alias of inode
+ * @inode: inode in question
+ *
+ * If inode has a hashed alias, or is a directory and has any alias,
+ * acquire the reference to alias and return it. Otherwise return NULL.
+ * Notice that if inode is a directory there can be only one alias and
+ * it can be unhashed only if it has no children, or if it is the root
+ * of a filesystem, or if the directory was renamed and d_revalidate
+ * was the first vfs operation to notice.
+ *
+ * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
+ * any other hashed alias over that one.
+ */
+static struct dentry *__d_find_alias(struct inode *inode)
+{
+	struct dentry *alias, *discon_alias;
+
+again:
+	discon_alias = NULL;
+	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+		spin_lock(&alias->d_lock);
+ 		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
+			if (IS_ROOT(alias) &&
+			    (alias->d_flags & DCACHE_DISCONNECTED)) {
+				discon_alias = alias;
+			} else {
+				__dget_dlock(alias);
+				spin_unlock(&alias->d_lock);
+				return alias;
+			}
+		}
+		spin_unlock(&alias->d_lock);
+	}
+	if (discon_alias) {
+		alias = discon_alias;
+		spin_lock(&alias->d_lock);
+		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
+			__dget_dlock(alias);
+			spin_unlock(&alias->d_lock);
+			return alias;
+		}
+		spin_unlock(&alias->d_lock);
+		goto again;
+	}
+	return NULL;
+}
+
+struct dentry *d_find_alias(struct inode *inode)
+{
+	struct dentry *de = NULL;
+
+	if (!hlist_empty(&inode->i_dentry)) {
+		spin_lock(&inode->i_lock);
+		de = __d_find_alias(inode);
+		spin_unlock(&inode->i_lock);
+	}
+	return de;
+}
+EXPORT_SYMBOL(d_find_alias);
+
+/*
+ *	Try to kill dentries associated with this inode.
+ * WARNING: you must own a reference to inode.
+ */
+void d_prune_aliases(struct inode *inode)
+{
+	struct dentry *dentry;
+restart:
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
+		spin_lock(&dentry->d_lock);
+		if (!dentry->d_lockref.count) {
+			struct dentry *parent = lock_parent(dentry);
+			if (likely(!dentry->d_lockref.count)) {
+				__dentry_kill(dentry);
+				dput(parent);
+				goto restart;
+			}
+			if (parent)
+				spin_unlock(&parent->d_lock);
+		}
+		spin_unlock(&dentry->d_lock);
+	}
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(d_prune_aliases);
+
+static void shrink_dentry_list(struct list_head *list)
+{
+	struct dentry *dentry, *parent;
+
+	while (!list_empty(list)) {
+		struct inode *inode;
+		dentry = list_entry(list->prev, struct dentry, d_lru);
+		spin_lock(&dentry->d_lock);
+		parent = lock_parent(dentry);
+
+		/*
+		 * The dispose list is isolated and dentries are not accounted
+		 * to the LRU here, so we can simply remove it from the list
+		 * here regardless of whether it is referenced or not.
+		 */
+		d_shrink_del(dentry);
+
+		/*
+		 * We found an inuse dentry which was not removed from
+		 * the LRU because of laziness during lookup. Do not free it.
+		 */
+		if (dentry->d_lockref.count > 0) {
+			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			continue;
+		}
+
+
+		if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
+			bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
+			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			if (can_free)
+				dentry_free(dentry);
+			continue;
+		}
+
+		inode = dentry->d_inode;
+		if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
+			d_shrink_add(dentry, list);
+			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			continue;
+		}
+
+		__dentry_kill(dentry);
+
+		/*
+		 * We need to prune ancestors too. This is necessary to prevent
+		 * quadratic behavior of shrink_dcache_parent(), but is also
+		 * expected to be beneficial in reducing dentry cache
+		 * fragmentation.
+		 */
+		dentry = parent;
+		while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
+			parent = lock_parent(dentry);
+			if (dentry->d_lockref.count != 1) {
+				dentry->d_lockref.count--;
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				break;
+			}
+			inode = dentry->d_inode;	/* can't be NULL */
+			if (unlikely(!spin_trylock(&inode->i_lock))) {
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				cpu_relax();
+				continue;
+			}
+			__dentry_kill(dentry);
+			dentry = parent;
+		}
+	}
+}
+
+static enum lru_status dentry_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	/*
+	 * Referenced dentries are still in use. If they have active
+	 * counts, just remove them from the LRU. Otherwise give them
+	 * another pass through the LRU.
+	 */
+	if (dentry->d_lockref.count) {
+		d_lru_isolate(lru, dentry);
+		spin_unlock(&dentry->d_lock);
+		return LRU_REMOVED;
+	}
+
+	if (dentry->d_flags & DCACHE_REFERENCED) {
+		dentry->d_flags &= ~DCACHE_REFERENCED;
+		spin_unlock(&dentry->d_lock);
+
+		/*
+		 * The list move itself will be made by the common LRU code. At
+		 * this point, we've dropped the dentry->d_lock but keep the
+		 * lru lock. This is safe to do, since every list movement is
+		 * protected by the lru lock even if both locks are held.
+		 *
+		 * This is guaranteed by the fact that all LRU management
+		 * functions are intermediated by the LRU API calls like
+		 * list_lru_add and list_lru_del. List movement in this file
+		 * only ever occur through this functions or through callbacks
+		 * like this one, that are called from the LRU API.
+		 *
+		 * The only exceptions to this are functions like
+		 * shrink_dentry_list, and code that first checks for the
+		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
+		 * operating only with stack provided lists after they are
+		 * properly isolated from the main list.  It is thus, always a
+		 * local access.
+		 */
+		return LRU_ROTATE;
+	}
+
+	d_lru_shrink_move(lru, dentry, freeable);
+	spin_unlock(&dentry->d_lock);
+
+	return LRU_REMOVED;
+}
+
+/**
+ * prune_dcache_sb - shrink the dcache
+ * @sb: superblock
+ * @sc: shrink control, passed to list_lru_shrink_walk()
+ *
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
+ * function.
+ *
+ * This function may fail to free any resources if all the dentries are in
+ * use.
+ */
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
+{
+	LIST_HEAD(dispose);
+	long freed;
+
+	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+				     dentry_lru_isolate, &dispose);
+	shrink_dentry_list(&dispose);
+	return freed;
+}
+
+static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	d_lru_shrink_move(lru, dentry, freeable);
+	spin_unlock(&dentry->d_lock);
+
+	return LRU_REMOVED;
+}
+
+
+/**
+ * shrink_dcache_sb - shrink dcache for a superblock
+ * @sb: superblock
+ *
+ * Shrink the dcache for the specified super block. This is used to free
+ * the dcache before unmounting a file system.
+ */
+void shrink_dcache_sb(struct super_block *sb)
+{
+	long freed;
+
+	do {
+		LIST_HEAD(dispose);
+
+		freed = list_lru_walk(&sb->s_dentry_lru,
+			dentry_lru_isolate_shrink, &dispose, UINT_MAX);
+
+		this_cpu_sub(nr_dentry_unused, freed);
+		shrink_dentry_list(&dispose);
+	} while (freed > 0);
+}
+EXPORT_SYMBOL(shrink_dcache_sb);
+
+/**
+ * enum d_walk_ret - action to talke during tree walk
+ * @D_WALK_CONTINUE:	contrinue walk
+ * @D_WALK_QUIT:	quit walk
+ * @D_WALK_NORETRY:	quit when retry is needed
+ * @D_WALK_SKIP:	skip this dentry and its children
+ */
+enum d_walk_ret {
+	D_WALK_CONTINUE,
+	D_WALK_QUIT,
+	D_WALK_NORETRY,
+	D_WALK_SKIP,
+};
+
+/**
+ * d_walk - walk the dentry tree
+ * @parent:	start of walk
+ * @data:	data passed to @enter() and @finish()
+ * @enter:	callback when first entering the dentry
+ * @finish:	callback when successfully finished the walk
+ *
+ * The @enter() and @finish() callbacks are called with d_lock held.
+ */
+static void d_walk(struct dentry *parent, void *data,
+		   enum d_walk_ret (*enter)(void *, struct dentry *),
+		   void (*finish)(void *))
+{
+	struct dentry *this_parent;
+	struct list_head *next;
+	unsigned seq = 0;
+	enum d_walk_ret ret;
+	bool retry = true;
+
+again:
+	read_seqbegin_or_lock(&rename_lock, &seq);
+	this_parent = parent;
+	spin_lock(&this_parent->d_lock);
+
+	ret = enter(data, this_parent);
+	switch (ret) {
+	case D_WALK_CONTINUE:
+		break;
+	case D_WALK_QUIT:
+	case D_WALK_SKIP:
+		goto out_unlock;
+	case D_WALK_NORETRY:
+		retry = false;
+		break;
+	}
+repeat:
+	next = this_parent->d_subdirs.next;
+resume:
+	while (next != &this_parent->d_subdirs) {
+		struct list_head *tmp = next;
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		next = tmp->next;
+
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+
+		ret = enter(data, dentry);
+		switch (ret) {
+		case D_WALK_CONTINUE:
+			break;
+		case D_WALK_QUIT:
+			spin_unlock(&dentry->d_lock);
+			goto out_unlock;
+		case D_WALK_NORETRY:
+			retry = false;
+			break;
+		case D_WALK_SKIP:
+			spin_unlock(&dentry->d_lock);
+			continue;
+		}
+
+		if (!list_empty(&dentry->d_subdirs)) {
+			spin_unlock(&this_parent->d_lock);
+			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
+			this_parent = dentry;
+			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
+			goto repeat;
+		}
+		spin_unlock(&dentry->d_lock);
+	}
+	/*
+	 * All done at this level ... ascend and resume the search.
+	 */
+	rcu_read_lock();
+ascend:
+	if (this_parent != parent) {
+		struct dentry *child = this_parent;
+		this_parent = child->d_parent;
+
+		spin_unlock(&child->d_lock);
+		spin_lock(&this_parent->d_lock);
+
+		/* might go back up the wrong parent if we have had a rename. */
+		if (need_seqretry(&rename_lock, seq))
+			goto rename_retry;
+		/* go into the first sibling still alive */
+		do {
+			next = child->d_child.next;
+			if (next == &this_parent->d_subdirs)
+				goto ascend;
+			child = list_entry(next, struct dentry, d_child);
+		} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
+		rcu_read_unlock();
+		goto resume;
+	}
+	if (need_seqretry(&rename_lock, seq))
+		goto rename_retry;
+	rcu_read_unlock();
+	if (finish)
+		finish(data);
+
+out_unlock:
+	spin_unlock(&this_parent->d_lock);
+	done_seqretry(&rename_lock, seq);
+	return;
+
+rename_retry:
+	spin_unlock(&this_parent->d_lock);
+	rcu_read_unlock();
+	BUG_ON(seq & 1);
+	if (!retry)
+		return;
+	seq = 1;
+	goto again;
+}
+
+/*
+ * Search for at least 1 mount point in the dentry's subdirs.
+ * We descend to the next level whenever the d_subdirs
+ * list is non-empty and continue searching.
+ */
+
+static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
+{
+	int *ret = data;
+	if (d_mountpoint(dentry)) {
+		*ret = 1;
+		return D_WALK_QUIT;
+	}
+	return D_WALK_CONTINUE;
+}
+
+/**
+ * have_submounts - check for mounts over a dentry
+ * @parent: dentry to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point
+ */
+int have_submounts(struct dentry *parent)
+{
+	int ret = 0;
+
+	d_walk(parent, &ret, check_mount, NULL);
+
+	return ret;
+}
+EXPORT_SYMBOL(have_submounts);
+
+/*
+ * Called by mount code to set a mountpoint and check if the mountpoint is
+ * reachable (e.g. NFS can unhash a directory dentry and then the complete
+ * subtree can become unreachable).
+ *
+ * Only one of d_invalidate() and d_set_mounted() must succeed.  For
+ * this reason take rename_lock and d_lock on dentry and ancestors.
+ */
+int d_set_mounted(struct dentry *dentry)
+{
+	struct dentry *p;
+	int ret = -ENOENT;
+	write_seqlock(&rename_lock);
+	for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
+		/* Need exclusion wrt. d_invalidate() */
+		spin_lock(&p->d_lock);
+		if (unlikely(d_unhashed(p))) {
+			spin_unlock(&p->d_lock);
+			goto out;
+		}
+		spin_unlock(&p->d_lock);
+	}
+	spin_lock(&dentry->d_lock);
+	if (!d_unlinked(dentry)) {
+		dentry->d_flags |= DCACHE_MOUNTED;
+		ret = 0;
+	}
+ 	spin_unlock(&dentry->d_lock);
+out:
+	write_sequnlock(&rename_lock);
+	return ret;
+}
+
+/*
+ * Search the dentry child list of the specified parent,
+ * and move any unused dentries to the end of the unused
+ * list for prune_dcache(). We descend to the next level
+ * whenever the d_subdirs list is non-empty and continue
+ * searching.
+ *
+ * It returns zero iff there are no unused children,
+ * otherwise  it returns the number of children moved to
+ * the end of the unused list. This may not be the total
+ * number of unused children, because select_parent can
+ * drop the lock and return early due to latency
+ * constraints.
+ */
+
+struct select_data {
+	struct dentry *start;
+	struct list_head dispose;
+	int found;
+};
+
+static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
+{
+	struct select_data *data = _data;
+	enum d_walk_ret ret = D_WALK_CONTINUE;
+
+	if (data->start == dentry)
+		goto out;
+
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+		data->found++;
+	} else {
+		if (dentry->d_flags & DCACHE_LRU_LIST)
+			d_lru_del(dentry);
+		if (!dentry->d_lockref.count) {
+			d_shrink_add(dentry, &data->dispose);
+			data->found++;
+		}
+	}
+	/*
+	 * We can return to the caller if we have found some (this
+	 * ensures forward progress). We'll be coming back to find
+	 * the rest.
+	 */
+	if (!list_empty(&data->dispose))
+		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
+out:
+	return ret;
+}
+
+/**
+ * shrink_dcache_parent - prune dcache
+ * @parent: parent of entries to prune
+ *
+ * Prune the dcache to remove unused children of the parent dentry.
+ */
+void shrink_dcache_parent(struct dentry *parent)
+{
+	for (;;) {
+		struct select_data data;
+
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = parent;
+		data.found = 0;
+
+		d_walk(parent, &data, select_collect, NULL);
+		if (!data.found)
+			break;
+
+		shrink_dentry_list(&data.dispose);
+		cond_resched();
+	}
+}
+EXPORT_SYMBOL(shrink_dcache_parent);
+
+static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
+{
+	/* it has busy descendents; complain about those instead */
+	if (!list_empty(&dentry->d_subdirs))
+		return D_WALK_CONTINUE;
+
+	/* root with refcount 1 is fine */
+	if (dentry == _data && dentry->d_lockref.count == 1)
+		return D_WALK_CONTINUE;
+
+	printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
+			" still in use (%d) [unmount of %s %s]\n",
+		       dentry,
+		       dentry->d_inode ?
+		       dentry->d_inode->i_ino : 0UL,
+		       dentry,
+		       dentry->d_lockref.count,
+		       dentry->d_sb->s_type->name,
+		       dentry->d_sb->s_id);
+	WARN_ON(1);
+	return D_WALK_CONTINUE;
+}
+
+static void do_one_tree(struct dentry *dentry)
+{
+	shrink_dcache_parent(dentry);
+	d_walk(dentry, dentry, umount_check, NULL);
+	d_drop(dentry);
+	dput(dentry);
+}
+
+/*
+ * destroy the dentries attached to a superblock on unmounting
+ */
+void shrink_dcache_for_umount(struct super_block *sb)
+{
+	struct dentry *dentry;
+
+	WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
+
+	dentry = sb->s_root;
+	sb->s_root = NULL;
+	do_one_tree(dentry);
+
+	while (!hlist_bl_empty(&sb->s_anon)) {
+		dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
+		do_one_tree(dentry);
+	}
+}
+
+struct detach_data {
+	struct select_data select;
+	struct dentry *mountpoint;
+};
+static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
+{
+	struct detach_data *data = _data;
+
+	if (d_mountpoint(dentry)) {
+		__dget_dlock(dentry);
+		data->mountpoint = dentry;
+		return D_WALK_QUIT;
+	}
+
+	return select_collect(&data->select, dentry);
+}
+
+static void check_and_drop(void *_data)
+{
+	struct detach_data *data = _data;
+
+	if (!data->mountpoint && !data->select.found)
+		__d_drop(data->select.start);
+}
+
+/**
+ * d_invalidate - detach submounts, prune dcache, and drop
+ * @dentry: dentry to invalidate (aka detach, prune and drop)
+ *
+ * no dcache lock.
+ *
+ * The final d_drop is done as an atomic operation relative to
+ * rename_lock ensuring there are no races with d_set_mounted.  This
+ * ensures there are no unhashed dentries on the path to a mountpoint.
+ */
+void d_invalidate(struct dentry *dentry)
+{
+	/*
+	 * If it's already been dropped, return OK.
+	 */
+	spin_lock(&dentry->d_lock);
+	if (d_unhashed(dentry)) {
+		spin_unlock(&dentry->d_lock);
+		return;
+	}
+	spin_unlock(&dentry->d_lock);
+
+	/* Negative dentries can be dropped without further checks */
+	if (!dentry->d_inode) {
+		d_drop(dentry);
+		return;
+	}
+
+	for (;;) {
+		struct detach_data data;
+
+		data.mountpoint = NULL;
+		INIT_LIST_HEAD(&data.select.dispose);
+		data.select.start = dentry;
+		data.select.found = 0;
+
+		d_walk(dentry, &data, detach_and_collect, check_and_drop);
+
+		if (data.select.found)
+			shrink_dentry_list(&data.select.dispose);
+
+		if (data.mountpoint) {
+			detach_mounts(data.mountpoint);
+			dput(data.mountpoint);
+		}
+
+		if (!data.mountpoint && !data.select.found)
+			break;
+
+		cond_resched();
+	}
+}
+EXPORT_SYMBOL(d_invalidate);
+
+/**
+ * __d_alloc	-	allocate a dcache entry
+ * @sb: filesystem it will belong to
+ * @name: qstr of the name
+ *
+ * Allocates a dentry. It returns %NULL if there is insufficient memory
+ * available. On a success the dentry is returned. The name passed in is
+ * copied and the copy passed in may be reused after this call.
+ */
+ 
+struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
+{
+	struct dentry *dentry;
+	char *dname;
+
+	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+	if (!dentry)
+		return NULL;
+
+	/*
+	 * We guarantee that the inline name is always NUL-terminated.
+	 * This way the memcpy() done by the name switching in rename
+	 * will still always have a NUL at the end, even if we might
+	 * be overwriting an internal NUL character
+	 */
+	dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
+	if (name->len > DNAME_INLINE_LEN-1) {
+		size_t size = offsetof(struct external_name, name[1]);
+		struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+		if (!p) {
+			kmem_cache_free(dentry_cache, dentry); 
+			return NULL;
+		}
+		atomic_set(&p->u.count, 1);
+		dname = p->name;
+		if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
+			kasan_unpoison_shadow(dname,
+				round_up(name->len + 1,	sizeof(unsigned long)));
+	} else  {
+		dname = dentry->d_iname;
+	}	
+
+	dentry->d_name.len = name->len;
+	dentry->d_name.hash = name->hash;
+	memcpy(dname, name->name, name->len);
+	dname[name->len] = 0;
+
+	/* Make sure we always see the terminating NUL character */
+	smp_wmb();
+	dentry->d_name.name = dname;
+
+	dentry->d_lockref.count = 1;
+	dentry->d_flags = 0;
+	spin_lock_init(&dentry->d_lock);
+	seqcount_init(&dentry->d_seq);
+	dentry->d_inode = NULL;
+	dentry->d_parent = dentry;
+	dentry->d_sb = sb;
+	dentry->d_op = NULL;
+	dentry->d_fsdata = NULL;
+	INIT_HLIST_BL_NODE(&dentry->d_hash);
+	INIT_LIST_HEAD(&dentry->d_lru);
+	INIT_LIST_HEAD(&dentry->d_subdirs);
+	INIT_HLIST_NODE(&dentry->d_u.d_alias);
+	INIT_LIST_HEAD(&dentry->d_child);
+	d_set_d_op(dentry, dentry->d_sb->s_d_op);
+
+	this_cpu_inc(nr_dentry);
+
+	return dentry;
+}
+
+/**
+ * d_alloc	-	allocate a dcache entry
+ * @parent: parent of entry to allocate
+ * @name: qstr of the name
+ *
+ * Allocates a dentry. It returns %NULL if there is insufficient memory
+ * available. On a success the dentry is returned. The name passed in is
+ * copied and the copy passed in may be reused after this call.
+ */
+struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
+{
+	struct dentry *dentry = __d_alloc(parent->d_sb, name);
+	if (!dentry)
+		return NULL;
+
+	spin_lock(&parent->d_lock);
+	/*
+	 * don't need child lock because it is not subject
+	 * to concurrency here
+	 */
+	__dget_dlock(parent);
+	dentry->d_parent = parent;
+	list_add(&dentry->d_child, &parent->d_subdirs);
+	spin_unlock(&parent->d_lock);
+
+	return dentry;
+}
+EXPORT_SYMBOL(d_alloc);
+
+/**
+ * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
+ * @sb: the superblock
+ * @name: qstr of the name
+ *
+ * For a filesystem that just pins its dentries in memory and never
+ * performs lookups at all, return an unhashed IS_ROOT dentry.
+ */
+struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
+{
+	return __d_alloc(sb, name);
+}
+EXPORT_SYMBOL(d_alloc_pseudo);
+
+struct dentry *d_alloc_name(struct dentry *parent, const char *name)
+{
+	struct qstr q;
+
+	q.name = name;
+	q.len = strlen(name);
+	q.hash = full_name_hash(q.name, q.len);
+	return d_alloc(parent, &q);
+}
+EXPORT_SYMBOL(d_alloc_name);
+
+void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+{
+	WARN_ON_ONCE(dentry->d_op);
+	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH	|
+				DCACHE_OP_COMPARE	|
+				DCACHE_OP_REVALIDATE	|
+				DCACHE_OP_WEAK_REVALIDATE	|
+				DCACHE_OP_DELETE ));
+	dentry->d_op = op;
+	if (!op)
+		return;
+	if (op->d_hash)
+		dentry->d_flags |= DCACHE_OP_HASH;
+	if (op->d_compare)
+		dentry->d_flags |= DCACHE_OP_COMPARE;
+	if (op->d_revalidate)
+		dentry->d_flags |= DCACHE_OP_REVALIDATE;
+	if (op->d_weak_revalidate)
+		dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
+	if (op->d_delete)
+		dentry->d_flags |= DCACHE_OP_DELETE;
+	if (op->d_prune)
+		dentry->d_flags |= DCACHE_OP_PRUNE;
+
+}
+EXPORT_SYMBOL(d_set_d_op);
+
+
+/*
+ * d_set_fallthru - Mark a dentry as falling through to a lower layer
+ * @dentry - The dentry to mark
+ *
+ * Mark a dentry as falling through to the lower layer (as set with
+ * d_pin_lower()).  This flag may be recorded on the medium.
+ */
+void d_set_fallthru(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_FALLTHRU;
+	spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_set_fallthru);
+
+static unsigned d_flags_for_inode(struct inode *inode)
+{
+	unsigned add_flags = DCACHE_REGULAR_TYPE;
+
+	if (!inode)
+		return DCACHE_MISS_TYPE;
+
+	if (S_ISDIR(inode->i_mode)) {
+		add_flags = DCACHE_DIRECTORY_TYPE;
+		if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
+			if (unlikely(!inode->i_op->lookup))
+				add_flags = DCACHE_AUTODIR_TYPE;
+			else
+				inode->i_opflags |= IOP_LOOKUP;
+		}
+		goto type_determined;
+	}
+
+	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
+		if (unlikely(inode->i_op->follow_link)) {
+			add_flags = DCACHE_SYMLINK_TYPE;
+			goto type_determined;
+		}
+		inode->i_opflags |= IOP_NOFOLLOW;
+	}
+
+	if (unlikely(!S_ISREG(inode->i_mode)))
+		add_flags = DCACHE_SPECIAL_TYPE;
+
+type_determined:
+	if (unlikely(IS_AUTOMOUNT(inode)))
+		add_flags |= DCACHE_NEED_AUTOMOUNT;
+	return add_flags;
+}
+
+static void __d_instantiate(struct dentry *dentry, struct inode *inode)
+{
+	unsigned add_flags = d_flags_for_inode(inode);
+
+	spin_lock(&dentry->d_lock);
+	if (inode)
+		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+	__d_set_inode_and_type(dentry, inode, add_flags);
+	dentry_rcuwalk_barrier(dentry);
+	spin_unlock(&dentry->d_lock);
+	fsnotify_d_instantiate(dentry, inode);
+}
+
+/**
+ * d_instantiate - fill in inode information for a dentry
+ * @entry: dentry to complete
+ * @inode: inode to attach to this dentry
+ *
+ * Fill in inode information in the entry.
+ *
+ * This turns negative dentries into productive full members
+ * of society.
+ *
+ * NOTE! This assumes that the inode count has been incremented
+ * (or otherwise set) by the caller to indicate that it is now
+ * in use by the dcache.
+ */
+ 
+void d_instantiate(struct dentry *entry, struct inode * inode)
+{
+	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
+	if (inode)
+		spin_lock(&inode->i_lock);
+	__d_instantiate(entry, inode);
+	if (inode)
+		spin_unlock(&inode->i_lock);
+	security_d_instantiate(entry, inode);
+}
+EXPORT_SYMBOL(d_instantiate);
+
+/**
+ * d_instantiate_unique - instantiate a non-aliased dentry
+ * @entry: dentry to instantiate
+ * @inode: inode to attach to this dentry
+ *
+ * Fill in inode information in the entry. On success, it returns NULL.
+ * If an unhashed alias of "entry" already exists, then we return the
+ * aliased dentry instead and drop one reference to inode.
+ *
+ * Note that in order to avoid conflicts with rename() etc, the caller
+ * had better be holding the parent directory semaphore.
+ *
+ * This also assumes that the inode count has been incremented
+ * (or otherwise set) by the caller to indicate that it is now
+ * in use by the dcache.
+ */
+static struct dentry *__d_instantiate_unique(struct dentry *entry,
+					     struct inode *inode)
+{
+	struct dentry *alias;
+	int len = entry->d_name.len;
+	const char *name = entry->d_name.name;
+	unsigned int hash = entry->d_name.hash;
+
+	if (!inode) {
+		__d_instantiate(entry, NULL);
+		return NULL;
+	}
+
+	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+		/*
+		 * Don't need alias->d_lock here, because aliases with
+		 * d_parent == entry->d_parent are not subject to name or
+		 * parent changes, because the parent inode i_mutex is held.
+		 */
+		if (alias->d_name.hash != hash)
+			continue;
+		if (alias->d_parent != entry->d_parent)
+			continue;
+		if (alias->d_name.len != len)
+			continue;
+		if (dentry_cmp(alias, name, len))
+			continue;
+		__dget(alias);
+		return alias;
+	}
+
+	__d_instantiate(entry, inode);
+	return NULL;
+}
+
+struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
+{
+	struct dentry *result;
+
+	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
+
+	if (inode)
+		spin_lock(&inode->i_lock);
+	result = __d_instantiate_unique(entry, inode);
+	if (inode)
+		spin_unlock(&inode->i_lock);
+
+	if (!result) {
+		security_d_instantiate(entry, inode);
+		return NULL;
+	}
+
+	BUG_ON(!d_unhashed(result));
+	iput(inode);
+	return result;
+}
+
+EXPORT_SYMBOL(d_instantiate_unique);
+
+/**
+ * d_instantiate_no_diralias - instantiate a non-aliased dentry
+ * @entry: dentry to complete
+ * @inode: inode to attach to this dentry
+ *
+ * Fill in inode information in the entry.  If a directory alias is found, then
+ * return an error (and drop inode).  Together with d_materialise_unique() this
+ * guarantees that a directory inode may never have more than one alias.
+ */
+int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
+{
+	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
+
+	spin_lock(&inode->i_lock);
+	if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
+		spin_unlock(&inode->i_lock);
+		iput(inode);
+		return -EBUSY;
+	}
+	__d_instantiate(entry, inode);
+	spin_unlock(&inode->i_lock);
+	security_d_instantiate(entry, inode);
+
+	return 0;
+}
+EXPORT_SYMBOL(d_instantiate_no_diralias);
+
+struct dentry *d_make_root(struct inode *root_inode)
+{
+	struct dentry *res = NULL;
+
+	if (root_inode) {
+		static const struct qstr name = QSTR_INIT("/", 1);
+
+		res = __d_alloc(root_inode->i_sb, &name);
+		if (res)
+			d_instantiate(res, root_inode);
+		else
+			iput(root_inode);
+	}
+	return res;
+}
+EXPORT_SYMBOL(d_make_root);
+
+static struct dentry * __d_find_any_alias(struct inode *inode)
+{
+	struct dentry *alias;
+
+	if (hlist_empty(&inode->i_dentry))
+		return NULL;
+	alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+	__dget(alias);
+	return alias;
+}
+
+/**
+ * d_find_any_alias - find any alias for a given inode
+ * @inode: inode to find an alias for
+ *
+ * If any aliases exist for the given inode, take and return a
+ * reference for one of them.  If no aliases exist, return %NULL.
+ */
+struct dentry *d_find_any_alias(struct inode *inode)
+{
+	struct dentry *de;
+
+	spin_lock(&inode->i_lock);
+	de = __d_find_any_alias(inode);
+	spin_unlock(&inode->i_lock);
+	return de;
+}
+EXPORT_SYMBOL(d_find_any_alias);
+
+static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
+{
+	static const struct qstr anonstring = QSTR_INIT("/", 1);
+	struct dentry *tmp;
+	struct dentry *res;
+	unsigned add_flags;
+
+	if (!inode)
+		return ERR_PTR(-ESTALE);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	res = d_find_any_alias(inode);
+	if (res)
+		goto out_iput;
+
+	tmp = __d_alloc(inode->i_sb, &anonstring);
+	if (!tmp) {
+		res = ERR_PTR(-ENOMEM);
+		goto out_iput;
+	}
+
+	spin_lock(&inode->i_lock);
+	res = __d_find_any_alias(inode);
+	if (res) {
+		spin_unlock(&inode->i_lock);
+		dput(tmp);
+		goto out_iput;
+	}
+
+	/* attach a disconnected dentry */
+	add_flags = d_flags_for_inode(inode);
+
+	if (disconnected)
+		add_flags |= DCACHE_DISCONNECTED;
+
+	spin_lock(&tmp->d_lock);
+	__d_set_inode_and_type(tmp, inode, add_flags);
+	hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry);
+	hlist_bl_lock(&tmp->d_sb->s_anon);
+	hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
+	hlist_bl_unlock(&tmp->d_sb->s_anon);
+	spin_unlock(&tmp->d_lock);
+	spin_unlock(&inode->i_lock);
+	security_d_instantiate(tmp, inode);
+
+	return tmp;
+
+ out_iput:
+	if (res && !IS_ERR(res))
+		security_d_instantiate(res, inode);
+	iput(inode);
+	return res;
+}
+
+/**
+ * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+ * similar open by handle operations.  The returned dentry may be anonymous,
+ * or may have a full name (if the inode was already in the cache).
+ *
+ * When called on a directory inode, we must ensure that the inode only ever
+ * has one dentry.  If a dentry is found, that is returned instead of
+ * allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  In case of an error the reference on the inode is released.
+ * To make it easier to use in export operations a %NULL or IS_ERR inode may
+ * be passed in and the error will be propagated to the return value,
+ * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
+ */
+struct dentry *d_obtain_alias(struct inode *inode)
+{
+	return __d_obtain_alias(inode, 1);
+}
+EXPORT_SYMBOL(d_obtain_alias);
+
+/**
+ * d_obtain_root - find or allocate a dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain an IS_ROOT dentry for the root of a filesystem.
+ *
+ * We must ensure that directory inodes only ever have one dentry.  If a
+ * dentry is found, that is returned instead of allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  In case of an error the reference on the inode is
+ * released.  A %NULL or IS_ERR inode may be passed in and will be the
+ * error will be propagate to the return value, with a %NULL @inode
+ * replaced by ERR_PTR(-ESTALE).
+ */
+struct dentry *d_obtain_root(struct inode *inode)
+{
+	return __d_obtain_alias(inode, 0);
+}
+EXPORT_SYMBOL(d_obtain_root);
+
+/**
+ * d_add_ci - lookup or allocate new dentry with case-exact name
+ * @inode:  the inode case-insensitive lookup has found
+ * @dentry: the negative dentry that was passed to the parent's lookup func
+ * @name:   the case-exact name to be associated with the returned dentry
+ *
+ * This is to avoid filling the dcache with case-insensitive names to the
+ * same inode, only the actual correct case is stored in the dcache for
+ * case-insensitive filesystems.
+ *
+ * For a case-insensitive lookup match and if the the case-exact dentry
+ * already exists in in the dcache, use it and return it.
+ *
+ * If no entry exists with the exact case name, allocate new dentry with
+ * the exact case, and return the spliced entry.
+ */
+struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
+			struct qstr *name)
+{
+	struct dentry *found;
+	struct dentry *new;
+
+	/*
+	 * First check if a dentry matching the name already exists,
+	 * if not go ahead and create it now.
+	 */
+	found = d_hash_and_lookup(dentry->d_parent, name);
+	if (!found) {
+		new = d_alloc(dentry->d_parent, name);
+		if (!new) {
+			found = ERR_PTR(-ENOMEM);
+		} else {
+			found = d_splice_alias(inode, new);
+			if (found) {
+				dput(new);
+				return found;
+			}
+			return new;
+		}
+	}
+	iput(inode);
+	return found;
+}
+EXPORT_SYMBOL(d_add_ci);
+
+/*
+ * Do the slow-case of the dentry name compare.
+ *
+ * Unlike the dentry_cmp() function, we need to atomically
+ * load the name and length information, so that the
+ * filesystem can rely on them, and can use the 'name' and
+ * 'len' information without worrying about walking off the
+ * end of memory etc.
+ *
+ * Thus the read_seqcount_retry() and the "duplicate" info
+ * in arguments (the low-level filesystem should not look
+ * at the dentry inode or name contents directly, since
+ * rename can change them while we're in RCU mode).
+ */
+enum slow_d_compare {
+	D_COMP_OK,
+	D_COMP_NOMATCH,
+	D_COMP_SEQRETRY,
+};
+
+static noinline enum slow_d_compare slow_dentry_cmp(
+		const struct dentry *parent,
+		struct dentry *dentry,
+		unsigned int seq,
+		const struct qstr *name)
+{
+	int tlen = dentry->d_name.len;
+	const char *tname = dentry->d_name.name;
+
+	if (read_seqcount_retry(&dentry->d_seq, seq)) {
+		cpu_relax();
+		return D_COMP_SEQRETRY;
+	}
+	if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
+		return D_COMP_NOMATCH;
+	return D_COMP_OK;
+}
+
+/**
+ * __d_lookup_rcu - search for a dentry (racy, store-free)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * @seqp: returns d_seq value at the point where the dentry was found
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup_rcu is the dcache lookup function for rcu-walk name
+ * resolution (store-free path walking) design described in
+ * Documentation/filesystems/path-lookup.txt.
+ *
+ * This is not to be used outside core vfs.
+ *
+ * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
+ * held, and rcu_read_lock held. The returned dentry must not be stored into
+ * without taking d_lock and checking d_seq sequence count against @seq
+ * returned here.
+ *
+ * A refcount may be taken on the found dentry with the d_rcu_to_refcount
+ * function.
+ *
+ * Alternatively, __d_lookup_rcu may be called again to look up the child of
+ * the returned dentry, so long as its parent's seqlock is checked after the
+ * child is looked up. Thus, an interlocking stepping of sequence lock checks
+ * is formed, giving integrity down the path walk.
+ *
+ * NOTE! The caller *has* to check the resulting dentry against the sequence
+ * number we've returned before using any of the resulting dentry state!
+ */
+struct dentry *__d_lookup_rcu(const struct dentry *parent,
+				const struct qstr *name,
+				unsigned *seqp)
+{
+	u64 hashlen = name->hash_len;
+	const unsigned char *str = name->name;
+	struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen));
+	struct hlist_bl_node *node;
+	struct dentry *dentry;
+
+	/*
+	 * Note: There is significant duplication with __d_lookup_rcu which is
+	 * required to prevent single threaded performance regressions
+	 * especially on architectures where smp_rmb (in seqcounts) are costly.
+	 * Keep the two functions in sync.
+	 */
+
+	/*
+	 * The hash list is protected using RCU.
+	 *
+	 * Carefully use d_seq when comparing a candidate dentry, to avoid
+	 * races with d_move().
+	 *
+	 * It is possible that concurrent renames can mess up our list
+	 * walk here and result in missing our dentry, resulting in the
+	 * false-negative result. d_lookup() protects against concurrent
+	 * renames using rename_lock seqlock.
+	 *
+	 * See Documentation/filesystems/path-lookup.txt for more details.
+	 */
+	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+		unsigned seq;
+
+seqretry:
+		/*
+		 * The dentry sequence count protects us from concurrent
+		 * renames, and thus protects parent and name fields.
+		 *
+		 * The caller must perform a seqcount check in order
+		 * to do anything useful with the returned dentry.
+		 *
+		 * NOTE! We do a "raw" seqcount_begin here. That means that
+		 * we don't wait for the sequence count to stabilize if it
+		 * is in the middle of a sequence change. If we do the slow
+		 * dentry compare, we will do seqretries until it is stable,
+		 * and if we end up with a successful lookup, we actually
+		 * want to exit RCU lookup anyway.
+		 */
+		seq = raw_seqcount_begin(&dentry->d_seq);
+		if (dentry->d_parent != parent)
+			continue;
+		if (d_unhashed(dentry))
+			continue;
+
+		if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
+			if (dentry->d_name.hash != hashlen_hash(hashlen))
+				continue;
+			*seqp = seq;
+			switch (slow_dentry_cmp(parent, dentry, seq, name)) {
+			case D_COMP_OK:
+				return dentry;
+			case D_COMP_NOMATCH:
+				continue;
+			default:
+				goto seqretry;
+			}
+		}
+
+		if (dentry->d_name.hash_len != hashlen)
+			continue;
+		*seqp = seq;
+		if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
+			return dentry;
+	}
+	return NULL;
+}
+
+/**
+ * d_lookup - search for a dentry
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
+ *
+ * d_lookup searches the children of the parent dentry for the name in
+ * question. If the dentry is found its reference count is incremented and the
+ * dentry is returned. The caller must use dput to free the entry when it has
+ * finished using it. %NULL is returned if the dentry does not exist.
+ */
+struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
+{
+	struct dentry *dentry;
+	unsigned seq;
+
+	do {
+		seq = read_seqbegin(&rename_lock);
+		dentry = __d_lookup(parent, name);
+		if (dentry)
+			break;
+	} while (read_seqretry(&rename_lock, seq));
+	return dentry;
+}
+EXPORT_SYMBOL(d_lookup);
+
+/**
+ * __d_lookup - search for a dentry (racy)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup is like d_lookup, however it may (rarely) return a
+ * false-negative result due to unrelated rename activity.
+ *
+ * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
+ * however it must be used carefully, eg. with a following d_lookup in
+ * the case of failure.
+ *
+ * __d_lookup callers must be commented.
+ */
+struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
+{
+	unsigned int len = name->len;
+	unsigned int hash = name->hash;
+	const unsigned char *str = name->name;
+	struct hlist_bl_head *b = d_hash(parent, hash);
+	struct hlist_bl_node *node;
+	struct dentry *found = NULL;
+	struct dentry *dentry;
+
+	/*
+	 * Note: There is significant duplication with __d_lookup_rcu which is
+	 * required to prevent single threaded performance regressions
+	 * especially on architectures where smp_rmb (in seqcounts) are costly.
+	 * Keep the two functions in sync.
+	 */
+
+	/*
+	 * The hash list is protected using RCU.
+	 *
+	 * Take d_lock when comparing a candidate dentry, to avoid races
+	 * with d_move().
+	 *
+	 * It is possible that concurrent renames can mess up our list
+	 * walk here and result in missing our dentry, resulting in the
+	 * false-negative result. d_lookup() protects against concurrent
+	 * renames using rename_lock seqlock.
+	 *
+	 * See Documentation/filesystems/path-lookup.txt for more details.
+	 */
+	rcu_read_lock();
+	
+	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+
+		if (dentry->d_name.hash != hash)
+			continue;
+
+		spin_lock(&dentry->d_lock);
+		if (dentry->d_parent != parent)
+			goto next;
+		if (d_unhashed(dentry))
+			goto next;
+
+		/*
+		 * It is safe to compare names since d_move() cannot
+		 * change the qstr (protected by d_lock).
+		 */
+		if (parent->d_flags & DCACHE_OP_COMPARE) {
+			int tlen = dentry->d_name.len;
+			const char *tname = dentry->d_name.name;
+			if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
+				goto next;
+		} else {
+			if (dentry->d_name.len != len)
+				goto next;
+			if (dentry_cmp(dentry, str, len))
+				goto next;
+		}
+
+		dentry->d_lockref.count++;
+		found = dentry;
+		spin_unlock(&dentry->d_lock);
+		break;
+next:
+		spin_unlock(&dentry->d_lock);
+ 	}
+ 	rcu_read_unlock();
+
+ 	return found;
+}
+
+/**
+ * d_hash_and_lookup - hash the qstr then search for a dentry
+ * @dir: Directory to search in
+ * @name: qstr of name we wish to find
+ *
+ * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
+ */
+struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
+{
+	/*
+	 * Check for a fs-specific hash function. Note that we must
+	 * calculate the standard hash first, as the d_op->d_hash()
+	 * routine may choose to leave the hash value unchanged.
+	 */
+	name->hash = full_name_hash(name->name, name->len);
+	if (dir->d_flags & DCACHE_OP_HASH) {
+		int err = dir->d_op->d_hash(dir, name);
+		if (unlikely(err < 0))
+			return ERR_PTR(err);
+	}
+	return d_lookup(dir, name);
+}
+EXPORT_SYMBOL(d_hash_and_lookup);
+
+/*
+ * When a file is deleted, we have two options:
+ * - turn this dentry into a negative dentry
+ * - unhash this dentry and free it.
+ *
+ * Usually, we want to just turn this into
+ * a negative dentry, but if anybody else is
+ * currently using the dentry or the inode
+ * we can't do that and we fall back on removing
+ * it from the hash queues and waiting for
+ * it to be deleted later when it has no users
+ */
+ 
+/**
+ * d_delete - delete a dentry
+ * @dentry: The dentry to delete
+ *
+ * Turn the dentry into a negative dentry if possible, otherwise
+ * remove it from the hash queues so it can be deleted later
+ */
+ 
+void d_delete(struct dentry * dentry)
+{
+	struct inode *inode;
+	int isdir = 0;
+	/*
+	 * Are we the only user?
+	 */
+again:
+	spin_lock(&dentry->d_lock);
+	inode = dentry->d_inode;
+	isdir = S_ISDIR(inode->i_mode);
+	if (dentry->d_lockref.count == 1) {
+		if (!spin_trylock(&inode->i_lock)) {
+			spin_unlock(&dentry->d_lock);
+			cpu_chill();
+			goto again;
+		}
+		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
+		dentry_unlink_inode(dentry);
+		fsnotify_nameremove(dentry, isdir);
+		return;
+	}
+
+	if (!d_unhashed(dentry))
+		__d_drop(dentry);
+
+	spin_unlock(&dentry->d_lock);
+
+	fsnotify_nameremove(dentry, isdir);
+}
+EXPORT_SYMBOL(d_delete);
+
+static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
+{
+	BUG_ON(!d_unhashed(entry));
+	hlist_bl_lock(b);
+	entry->d_flags |= DCACHE_RCUACCESS;
+	hlist_bl_add_head_rcu(&entry->d_hash, b);
+	hlist_bl_unlock(b);
+}
+
+static void _d_rehash(struct dentry * entry)
+{
+	__d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
+}
+
+/**
+ * d_rehash	- add an entry back to the hash
+ * @entry: dentry to add to the hash
+ *
+ * Adds a dentry to the hash according to its name.
+ */
+ 
+void d_rehash(struct dentry * entry)
+{
+	spin_lock(&entry->d_lock);
+	_d_rehash(entry);
+	spin_unlock(&entry->d_lock);
+}
+EXPORT_SYMBOL(d_rehash);
+
+/**
+ * dentry_update_name_case - update case insensitive dentry with a new name
+ * @dentry: dentry to be updated
+ * @name: new name
+ *
+ * Update a case insensitive dentry with new case of name.
+ *
+ * dentry must have been returned by d_lookup with name @name. Old and new
+ * name lengths must match (ie. no d_compare which allows mismatched name
+ * lengths).
+ *
+ * Parent inode i_mutex must be held over d_lookup and into this call (to
+ * keep renames and concurrent inserts, and readdir(2) away).
+ */
+void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
+{
+	BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
+	BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
+
+	spin_lock(&dentry->d_lock);
+	write_seqcount_begin(&dentry->d_seq);
+	memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
+	write_seqcount_end(&dentry->d_seq);
+	spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(dentry_update_name_case);
+
+static void swap_names(struct dentry *dentry, struct dentry *target)
+{
+	if (unlikely(dname_external(target))) {
+		if (unlikely(dname_external(dentry))) {
+			/*
+			 * Both external: swap the pointers
+			 */
+			swap(target->d_name.name, dentry->d_name.name);
+		} else {
+			/*
+			 * dentry:internal, target:external.  Steal target's
+			 * storage and make target internal.
+			 */
+			memcpy(target->d_iname, dentry->d_name.name,
+					dentry->d_name.len + 1);
+			dentry->d_name.name = target->d_name.name;
+			target->d_name.name = target->d_iname;
+		}
+	} else {
+		if (unlikely(dname_external(dentry))) {
+			/*
+			 * dentry:external, target:internal.  Give dentry's
+			 * storage to target and make dentry internal
+			 */
+			memcpy(dentry->d_iname, target->d_name.name,
+					target->d_name.len + 1);
+			target->d_name.name = dentry->d_name.name;
+			dentry->d_name.name = dentry->d_iname;
+		} else {
+			/*
+			 * Both are internal.
+			 */
+			unsigned int i;
+			BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+			kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN);
+			kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN);
+			for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
+				swap(((long *) &dentry->d_iname)[i],
+				     ((long *) &target->d_iname)[i]);
+			}
+		}
+	}
+	swap(dentry->d_name.hash_len, target->d_name.hash_len);
+}
+
+static void copy_name(struct dentry *dentry, struct dentry *target)
+{
+	struct external_name *old_name = NULL;
+	if (unlikely(dname_external(dentry)))
+		old_name = external_name(dentry);
+	if (unlikely(dname_external(target))) {
+		atomic_inc(&external_name(target)->u.count);
+		dentry->d_name = target->d_name;
+	} else {
+		memcpy(dentry->d_iname, target->d_name.name,
+				target->d_name.len + 1);
+		dentry->d_name.name = dentry->d_iname;
+		dentry->d_name.hash_len = target->d_name.hash_len;
+	}
+	if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
+		kfree_rcu(old_name, u.head);
+}
+
+static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
+{
+	/*
+	 * XXXX: do we really need to take target->d_lock?
+	 */
+	if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
+		spin_lock(&target->d_parent->d_lock);
+	else {
+		if (d_ancestor(dentry->d_parent, target->d_parent)) {
+			spin_lock(&dentry->d_parent->d_lock);
+			spin_lock_nested(&target->d_parent->d_lock,
+						DENTRY_D_LOCK_NESTED);
+		} else {
+			spin_lock(&target->d_parent->d_lock);
+			spin_lock_nested(&dentry->d_parent->d_lock,
+						DENTRY_D_LOCK_NESTED);
+		}
+	}
+	if (target < dentry) {
+		spin_lock_nested(&target->d_lock, 2);
+		spin_lock_nested(&dentry->d_lock, 3);
+	} else {
+		spin_lock_nested(&dentry->d_lock, 2);
+		spin_lock_nested(&target->d_lock, 3);
+	}
+}
+
+static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
+{
+	if (target->d_parent != dentry->d_parent)
+		spin_unlock(&dentry->d_parent->d_lock);
+	if (target->d_parent != target)
+		spin_unlock(&target->d_parent->d_lock);
+	spin_unlock(&target->d_lock);
+	spin_unlock(&dentry->d_lock);
+}
+
+/*
+ * When switching names, the actual string doesn't strictly have to
+ * be preserved in the target - because we're dropping the target
+ * anyway. As such, we can just do a simple memcpy() to copy over
+ * the new name before we switch, unless we are going to rehash
+ * it.  Note that if we *do* unhash the target, we are not allowed
+ * to rehash it without giving it a new name/hash key - whether
+ * we swap or overwrite the names here, resulting name won't match
+ * the reality in filesystem; it's only there for d_path() purposes.
+ * Note that all of this is happening under rename_lock, so the
+ * any hash lookup seeing it in the middle of manipulations will
+ * be discarded anyway.  So we do not care what happens to the hash
+ * key in that case.
+ */
+/*
+ * __d_move - move a dentry
+ * @dentry: entry to move
+ * @target: new dentry
+ * @exchange: exchange the two dentries
+ *
+ * Update the dcache to reflect the move of a file name. Negative
+ * dcache entries should not be moved in this way. Caller must hold
+ * rename_lock, the i_mutex of the source and target directories,
+ * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
+ */
+static void __d_move(struct dentry *dentry, struct dentry *target,
+		     bool exchange)
+{
+	if (!dentry->d_inode)
+		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
+
+	BUG_ON(d_ancestor(dentry, target));
+	BUG_ON(d_ancestor(target, dentry));
+
+	dentry_lock_for_move(dentry, target);
+
+	write_seqcount_begin(&dentry->d_seq);
+	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
+
+	/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
+
+	/*
+	 * Move the dentry to the target hash queue. Don't bother checking
+	 * for the same hash queue because of how unlikely it is.
+	 */
+	__d_drop(dentry);
+	__d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
+
+	/*
+	 * Unhash the target (d_delete() is not usable here).  If exchanging
+	 * the two dentries, then rehash onto the other's hash queue.
+	 */
+	__d_drop(target);
+	if (exchange) {
+		__d_rehash(target,
+			   d_hash(dentry->d_parent, dentry->d_name.hash));
+	}
+
+	/* Switch the names.. */
+	if (exchange)
+		swap_names(dentry, target);
+	else
+		copy_name(dentry, target);
+
+	/* ... and switch them in the tree */
+	if (IS_ROOT(dentry)) {
+		/* splicing a tree */
+		dentry->d_parent = target->d_parent;
+		target->d_parent = target;
+		list_del_init(&target->d_child);
+		list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+	} else {
+		/* swapping two dentries */
+		swap(dentry->d_parent, target->d_parent);
+		list_move(&target->d_child, &target->d_parent->d_subdirs);
+		list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+		if (exchange)
+			fsnotify_d_move(target);
+		fsnotify_d_move(dentry);
+	}
+
+	write_seqcount_end(&target->d_seq);
+	write_seqcount_end(&dentry->d_seq);
+
+	dentry_unlock_for_move(dentry, target);
+}
+
+/*
+ * d_move - move a dentry
+ * @dentry: entry to move
+ * @target: new dentry
+ *
+ * Update the dcache to reflect the move of a file name. Negative
+ * dcache entries should not be moved in this way. See the locking
+ * requirements for __d_move.
+ */
+void d_move(struct dentry *dentry, struct dentry *target)
+{
+	write_seqlock(&rename_lock);
+	__d_move(dentry, target, false);
+	write_sequnlock(&rename_lock);
+}
+EXPORT_SYMBOL(d_move);
+
+/*
+ * d_exchange - exchange two dentries
+ * @dentry1: first dentry
+ * @dentry2: second dentry
+ */
+void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
+{
+	write_seqlock(&rename_lock);
+
+	WARN_ON(!dentry1->d_inode);
+	WARN_ON(!dentry2->d_inode);
+	WARN_ON(IS_ROOT(dentry1));
+	WARN_ON(IS_ROOT(dentry2));
+
+	__d_move(dentry1, dentry2, true);
+
+	write_sequnlock(&rename_lock);
+}
+
+/**
+ * d_ancestor - search for an ancestor
+ * @p1: ancestor dentry
+ * @p2: child dentry
+ *
+ * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
+ * an ancestor of p2, else NULL.
+ */
+struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
+{
+	struct dentry *p;
+
+	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
+		if (p->d_parent == p1)
+			return p;
+	}
+	return NULL;
+}
+
+/*
+ * This helper attempts to cope with remotely renamed directories
+ *
+ * It assumes that the caller is already holding
+ * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock
+ *
+ * Note: If ever the locking in lock_rename() changes, then please
+ * remember to update this too...
+ */
+static int __d_unalias(struct inode *inode,
+		struct dentry *dentry, struct dentry *alias)
+{
+	struct mutex *m1 = NULL, *m2 = NULL;
+	int ret = -ESTALE;
+
+	/* If alias and dentry share a parent, then no extra locks required */
+	if (alias->d_parent == dentry->d_parent)
+		goto out_unalias;
+
+	/* See lock_rename() */
+	if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
+		goto out_err;
+	m1 = &dentry->d_sb->s_vfs_rename_mutex;
+	if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
+		goto out_err;
+	m2 = &alias->d_parent->d_inode->i_mutex;
+out_unalias:
+	__d_move(alias, dentry, false);
+	ret = 0;
+out_err:
+	spin_unlock(&inode->i_lock);
+	if (m2)
+		mutex_unlock(m2);
+	if (m1)
+		mutex_unlock(m1);
+	return ret;
+}
+
+/**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode:  the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
+ *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned.  Otherwise NULL
+ * is returned.  This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	BUG_ON(!d_unhashed(dentry));
+
+	if (!inode) {
+		__d_instantiate(dentry, NULL);
+		goto out;
+	}
+	spin_lock(&inode->i_lock);
+	if (S_ISDIR(inode->i_mode)) {
+		struct dentry *new = __d_find_any_alias(inode);
+		if (unlikely(new)) {
+			write_seqlock(&rename_lock);
+			if (unlikely(d_ancestor(new, dentry))) {
+				write_sequnlock(&rename_lock);
+				spin_unlock(&inode->i_lock);
+				dput(new);
+				new = ERR_PTR(-ELOOP);
+				pr_warn_ratelimited(
+					"VFS: Lookup of '%s' in %s %s"
+					" would have caused loop\n",
+					dentry->d_name.name,
+					inode->i_sb->s_type->name,
+					inode->i_sb->s_id);
+			} else if (!IS_ROOT(new)) {
+				int err = __d_unalias(inode, dentry, new);
+				write_sequnlock(&rename_lock);
+				if (err) {
+					dput(new);
+					new = ERR_PTR(err);
+				}
+			} else {
+				__d_move(new, dentry, false);
+				write_sequnlock(&rename_lock);
+				spin_unlock(&inode->i_lock);
+				security_d_instantiate(new, inode);
+			}
+			iput(inode);
+			return new;
+		}
+	}
+	/* already taking inode->i_lock, so d_add() by hand */
+	__d_instantiate(dentry, inode);
+	spin_unlock(&inode->i_lock);
+out:
+	security_d_instantiate(dentry, inode);
+	d_rehash(dentry);
+	return NULL;
+}
+EXPORT_SYMBOL(d_splice_alias);
+
+static int prepend(char **buffer, int *buflen, const char *str, int namelen)
+{
+	*buflen -= namelen;
+	if (*buflen < 0)
+		return -ENAMETOOLONG;
+	*buffer -= namelen;
+	memcpy(*buffer, str, namelen);
+	return 0;
+}
+
+/**
+ * prepend_name - prepend a pathname in front of current buffer pointer
+ * @buffer: buffer pointer
+ * @buflen: allocated length of the buffer
+ * @name:   name string and length qstr structure
+ *
+ * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
+ * make sure that either the old or the new name pointer and length are
+ * fetched. However, there may be mismatch between length and pointer.
+ * The length cannot be trusted, we need to copy it byte-by-byte until
+ * the length is reached or a null byte is found. It also prepends "/" at
+ * the beginning of the name. The sequence number check at the caller will
+ * retry it again when a d_move() does happen. So any garbage in the buffer
+ * due to mismatched pointer and length will be discarded.
+ *
+ * Data dependency barrier is needed to make sure that we see that terminating
+ * NUL.  Alpha strikes again, film at 11...
+ */
+static int prepend_name(char **buffer, int *buflen, struct qstr *name)
+{
+	const char *dname = ACCESS_ONCE(name->name);
+	u32 dlen = ACCESS_ONCE(name->len);
+	char *p;
+
+	smp_read_barrier_depends();
+
+	*buflen -= dlen + 1;
+	if (*buflen < 0)
+		return -ENAMETOOLONG;
+	p = *buffer -= dlen + 1;
+	*p++ = '/';
+	while (dlen--) {
+		char c = *dname++;
+		if (!c)
+			break;
+		*p++ = c;
+	}
+	return 0;
+}
+
+/**
+ * prepend_path - Prepend path string to a buffer
+ * @path: the dentry/vfsmount to report
+ * @root: root vfsmnt/dentry
+ * @buffer: pointer to the end of the buffer
+ * @buflen: pointer to buffer length
+ *
+ * The function will first try to write out the pathname without taking any
+ * lock other than the RCU read lock to make sure that dentries won't go away.
+ * It only checks the sequence number of the global rename_lock as any change
+ * in the dentry's d_seq will be preceded by changes in the rename_lock
+ * sequence number. If the sequence number had been changed, it will restart
+ * the whole pathname back-tracing sequence again by taking the rename_lock.
+ * In this case, there is no need to take the RCU read lock as the recursive
+ * parent pointer references will keep the dentry chain alive as long as no
+ * rename operation is performed.
+ */
+static int prepend_path(const struct path *path,
+			const struct path *root,
+			char **buffer, int *buflen)
+{
+	struct dentry *dentry;
+	struct vfsmount *vfsmnt;
+	struct mount *mnt;
+	int error = 0;
+	unsigned seq, m_seq = 0;
+	char *bptr;
+	int blen;
+
+	rcu_read_lock();
+restart_mnt:
+	read_seqbegin_or_lock(&mount_lock, &m_seq);
+	seq = 0;
+	rcu_read_lock();
+restart:
+	bptr = *buffer;
+	blen = *buflen;
+	error = 0;
+	dentry = path->dentry;
+	vfsmnt = path->mnt;
+	mnt = real_mount(vfsmnt);
+	read_seqbegin_or_lock(&rename_lock, &seq);
+	while (dentry != root->dentry || vfsmnt != root->mnt) {
+		struct dentry * parent;
+
+		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+			struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
+			/* Global root? */
+			if (mnt != parent) {
+				dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+				mnt = parent;
+				vfsmnt = &mnt->mnt;
+				continue;
+			}
+			if (!error)
+				error = is_mounted(vfsmnt) ? 1 : 2;
+			break;
+		}
+		parent = dentry->d_parent;
+		prefetch(parent);
+		error = prepend_name(&bptr, &blen, &dentry->d_name);
+		if (error)
+			break;
+
+		dentry = parent;
+	}
+	if (!(seq & 1))
+		rcu_read_unlock();
+	if (need_seqretry(&rename_lock, seq)) {
+		seq = 1;
+		goto restart;
+	}
+	done_seqretry(&rename_lock, seq);
+
+	if (!(m_seq & 1))
+		rcu_read_unlock();
+	if (need_seqretry(&mount_lock, m_seq)) {
+		m_seq = 1;
+		goto restart_mnt;
+	}
+	done_seqretry(&mount_lock, m_seq);
+
+	if (error >= 0 && bptr == *buffer) {
+		if (--blen < 0)
+			error = -ENAMETOOLONG;
+		else
+			*--bptr = '/';
+	}
+	*buffer = bptr;
+	*buflen = blen;
+	return error;
+}
+
+/**
+ * __d_path - return the path of a dentry
+ * @path: the dentry/vfsmount to report
+ * @root: root vfsmnt/dentry
+ * @buf: buffer to return value in
+ * @buflen: buffer length
+ *
+ * Convert a dentry into an ASCII path name.
+ *
+ * Returns a pointer into the buffer or an error code if the
+ * path was too long.
+ *
+ * "buflen" should be positive.
+ *
+ * If the path is not reachable from the supplied root, return %NULL.
+ */
+char *__d_path(const struct path *path,
+	       const struct path *root,
+	       char *buf, int buflen)
+{
+	char *res = buf + buflen;
+	int error;
+
+	prepend(&res, &buflen, "\0", 1);
+	error = prepend_path(path, root, &res, &buflen);
+
+	if (error < 0)
+		return ERR_PTR(error);
+	if (error > 0)
+		return NULL;
+	return res;
+}
+
+char *d_absolute_path(const struct path *path,
+	       char *buf, int buflen)
+{
+	struct path root = {};
+	char *res = buf + buflen;
+	int error;
+
+	prepend(&res, &buflen, "\0", 1);
+	error = prepend_path(path, &root, &res, &buflen);
+
+	if (error > 1)
+		error = -EINVAL;
+	if (error < 0)
+		return ERR_PTR(error);
+	return res;
+}
+
+/*
+ * same as __d_path but appends "(deleted)" for unlinked files.
+ */
+static int path_with_deleted(const struct path *path,
+			     const struct path *root,
+			     char **buf, int *buflen)
+{
+	prepend(buf, buflen, "\0", 1);
+	if (d_unlinked(path->dentry)) {
+		int error = prepend(buf, buflen, " (deleted)", 10);
+		if (error)
+			return error;
+	}
+
+	return prepend_path(path, root, buf, buflen);
+}
+
+static int prepend_unreachable(char **buffer, int *buflen)
+{
+	return prepend(buffer, buflen, "(unreachable)", 13);
+}
+
+static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
+{
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&fs->seq);
+		*root = fs->root;
+	} while (read_seqcount_retry(&fs->seq, seq));
+}
+
+/**
+ * d_path - return the path of a dentry
+ * @path: path to report
+ * @buf: buffer to return value in
+ * @buflen: buffer length
+ *
+ * Convert a dentry into an ASCII path name. If the entry has been deleted
+ * the string " (deleted)" is appended. Note that this is ambiguous.
+ *
+ * Returns a pointer into the buffer or an error code if the path was
+ * too long. Note: Callers should use the returned pointer, not the passed
+ * in buffer, to use the name! The implementation often starts at an offset
+ * into the buffer, and may leave 0 bytes at the start.
+ *
+ * "buflen" should be positive.
+ */
+char *d_path(const struct path *path, char *buf, int buflen)
+{
+	char *res = buf + buflen;
+	struct path root;
+	int error;
+
+	/*
+	 * We have various synthetic filesystems that never get mounted.  On
+	 * these filesystems dentries are never used for lookup purposes, and
+	 * thus don't need to be hashed.  They also don't need a name until a
+	 * user wants to identify the object in /proc/pid/fd/.  The little hack
+	 * below allows us to generate a name for these objects on demand:
+	 *
+	 * Some pseudo inodes are mountable.  When they are mounted
+	 * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
+	 * and instead have d_path return the mounted path.
+	 */
+	if (path->dentry->d_op && path->dentry->d_op->d_dname &&
+	    (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
+		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
+
+	rcu_read_lock();
+	get_fs_root_rcu(current->fs, &root);
+	error = path_with_deleted(path, &root, &res, &buflen);
+	rcu_read_unlock();
+
+	if (error < 0)
+		res = ERR_PTR(error);
+	return res;
+}
+EXPORT_SYMBOL(d_path);
+
+/*
+ * Helper function for dentry_operations.d_dname() members
+ */
+char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
+			const char *fmt, ...)
+{
+	va_list args;
+	char temp[64];
+	int sz;
+
+	va_start(args, fmt);
+	sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1;
+	va_end(args);
+
+	if (sz > sizeof(temp) || sz > buflen)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	buffer += buflen - sz;
+	return memcpy(buffer, temp, sz);
+}
+
+char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	char *end = buffer + buflen;
+	/* these dentries are never renamed, so d_lock is not needed */
+	if (prepend(&end, &buflen, " (deleted)", 11) ||
+	    prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
+	    prepend(&end, &buflen, "/", 1))  
+		end = ERR_PTR(-ENAMETOOLONG);
+	return end;
+}
+EXPORT_SYMBOL(simple_dname);
+
+/*
+ * Write full pathname from the root of the filesystem into the buffer.
+ */
+static char *__dentry_path(struct dentry *d, char *buf, int buflen)
+{
+	struct dentry *dentry;
+	char *end, *retval;
+	int len, seq = 0;
+	int error = 0;
+
+	if (buflen < 2)
+		goto Elong;
+
+	rcu_read_lock();
+restart:
+	dentry = d;
+	end = buf + buflen;
+	len = buflen;
+	prepend(&end, &len, "\0", 1);
+	/* Get '/' right */
+	retval = end-1;
+	*retval = '/';
+	read_seqbegin_or_lock(&rename_lock, &seq);
+	while (!IS_ROOT(dentry)) {
+		struct dentry *parent = dentry->d_parent;
+
+		prefetch(parent);
+		error = prepend_name(&end, &len, &dentry->d_name);
+		if (error)
+			break;
+
+		retval = end;
+		dentry = parent;
+	}
+	if (!(seq & 1))
+		rcu_read_unlock();
+	if (need_seqretry(&rename_lock, seq)) {
+		seq = 1;
+		goto restart;
+	}
+	done_seqretry(&rename_lock, seq);
+	if (error)
+		goto Elong;
+	return retval;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
+{
+	return __dentry_path(dentry, buf, buflen);
+}
+EXPORT_SYMBOL(dentry_path_raw);
+
+char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+	char *p = NULL;
+	char *retval;
+
+	if (d_unlinked(dentry)) {
+		p = buf + buflen;
+		if (prepend(&p, &buflen, "//deleted", 10) != 0)
+			goto Elong;
+		buflen++;
+	}
+	retval = __dentry_path(dentry, buf, buflen);
+	if (!IS_ERR(retval) && p)
+		*p = '/';	/* restore '/' overriden with '\0' */
+	return retval;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
+				    struct path *pwd)
+{
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&fs->seq);
+		*root = fs->root;
+		*pwd = fs->pwd;
+	} while (read_seqcount_retry(&fs->seq, seq));
+}
+
+/*
+ * NOTE! The user-level library version returns a
+ * character pointer. The kernel system call just
+ * returns the length of the buffer filled (which
+ * includes the ending '\0' character), or a negative
+ * error value. So libc would do something like
+ *
+ *	char *getcwd(char * buf, size_t size)
+ *	{
+ *		int retval;
+ *
+ *		retval = sys_getcwd(buf, size);
+ *		if (retval >= 0)
+ *			return buf;
+ *		errno = -retval;
+ *		return NULL;
+ *	}
+ */
+SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
+{
+	int error;
+	struct path pwd, root;
+	char *page = __getname();
+
+	if (!page)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
+
+	error = -ENOENT;
+	if (!d_unlinked(pwd.dentry)) {
+		unsigned long len;
+		char *cwd = page + PATH_MAX;
+		int buflen = PATH_MAX;
+
+		prepend(&cwd, &buflen, "\0", 1);
+		error = prepend_path(&pwd, &root, &cwd, &buflen);
+		rcu_read_unlock();
+
+		if (error < 0)
+			goto out;
+
+		/* Unreachable from current root */
+		if (error > 0) {
+			error = prepend_unreachable(&cwd, &buflen);
+			if (error)
+				goto out;
+		}
+
+		error = -ERANGE;
+		len = PATH_MAX + page - cwd;
+		if (len <= size) {
+			error = len;
+			if (copy_to_user(buf, cwd, len))
+				error = -EFAULT;
+		}
+	} else {
+		rcu_read_unlock();
+	}
+
+out:
+	__putname(page);
+	return error;
+}
+
+/*
+ * Test whether new_dentry is a subdirectory of old_dentry.
+ *
+ * Trivially implemented using the dcache structure
+ */
+
+/**
+ * is_subdir - is new dentry a subdirectory of old_dentry
+ * @new_dentry: new dentry
+ * @old_dentry: old dentry
+ *
+ * Returns 1 if new_dentry is a subdirectory of the parent (at any depth).
+ * Returns 0 otherwise.
+ * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
+ */
+  
+int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
+{
+	int result;
+	unsigned seq;
+
+	if (new_dentry == old_dentry)
+		return 1;
+
+	do {
+		/* for restarting inner loop in case of seq retry */
+		seq = read_seqbegin(&rename_lock);
+		/*
+		 * Need rcu_readlock to protect against the d_parent trashing
+		 * due to d_move
+		 */
+		rcu_read_lock();
+		if (d_ancestor(old_dentry, new_dentry))
+			result = 1;
+		else
+			result = 0;
+		rcu_read_unlock();
+	} while (read_seqretry(&rename_lock, seq));
+
+	return result;
+}
+
+static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
+{
+	struct dentry *root = data;
+	if (dentry != root) {
+		if (d_unhashed(dentry) || !dentry->d_inode)
+			return D_WALK_SKIP;
+
+		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+			dentry->d_flags |= DCACHE_GENOCIDE;
+			dentry->d_lockref.count--;
+		}
+	}
+	return D_WALK_CONTINUE;
+}
+
+void d_genocide(struct dentry *parent)
+{
+	d_walk(parent, parent, d_genocide_kill, NULL);
+}
+
+void d_tmpfile(struct dentry *dentry, struct inode *inode)
+{
+	inode_dec_link_count(inode);
+	BUG_ON(dentry->d_name.name != dentry->d_iname ||
+		!hlist_unhashed(&dentry->d_u.d_alias) ||
+		!d_unlinked(dentry));
+	spin_lock(&dentry->d_parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
+				(unsigned long long)inode->i_ino);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dentry->d_parent->d_lock);
+	d_instantiate(dentry, inode);
+}
+EXPORT_SYMBOL(d_tmpfile);
+
+static __initdata unsigned long dhash_entries;
+static int __init set_dhash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	dhash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("dhash_entries=", set_dhash_entries);
+
+static void __init dcache_init_early(void)
+{
+	unsigned int loop;
+
+	/* If hashes are distributed across NUMA nodes, defer
+	 * hash allocation until vmalloc space is available.
+	 */
+	if (hashdist)
+		return;
+
+	dentry_hashtable =
+		alloc_large_system_hash("Dentry cache",
+					sizeof(struct hlist_bl_head),
+					dhash_entries,
+					13,
+					HASH_EARLY,
+					&d_hash_shift,
+					&d_hash_mask,
+					0,
+					0);
+
+	for (loop = 0; loop < (1U << d_hash_shift); loop++)
+		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
+}
+
+static void __init dcache_init(void)
+{
+	unsigned int loop;
+
+	/* 
+	 * A constructor could be added for stable state like the lists,
+	 * but it is probably not worth it because of the cache nature
+	 * of the dcache. 
+	 */
+	dentry_cache = KMEM_CACHE(dentry,
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+
+	/* Hash may have been set up in dcache_init_early */
+	if (!hashdist)
+		return;
+
+	dentry_hashtable =
+		alloc_large_system_hash("Dentry cache",
+					sizeof(struct hlist_bl_head),
+					dhash_entries,
+					13,
+					0,
+					&d_hash_shift,
+					&d_hash_mask,
+					0,
+					0);
+
+	for (loop = 0; loop < (1U << d_hash_shift); loop++)
+		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
+}
+
+/* SLAB cache for __getname() consumers */
+struct kmem_cache *names_cachep __read_mostly;
+EXPORT_SYMBOL(names_cachep);
+
+EXPORT_SYMBOL(d_genocide);
+
+void __init vfs_caches_init_early(void)
+{
+	dcache_init_early();
+	inode_init_early();
+}
+
+void __init vfs_caches_init(unsigned long mempages)
+{
+	unsigned long reserve;
+
+	/* Base hash sizes on available memory, with a reserve equal to
+           150% of current kernel size */
+
+	reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
+	mempages -= reserve;
+
+	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+	dcache_init();
+	inode_init();
+	files_init(mempages);
+	mnt_init();
+	bdev_cache_init();
+	chrdev_init();
+}
diff --git a/kernel/fs/dcookies.c b/kernel/fs/dcookies.c
new file mode 100644
index 000000000..ac44a69fb
--- /dev/null
+++ b/kernel/fs/dcookies.c
@@ -0,0 +1,350 @@
+/*
+ * dcookies.c
+ *
+ * Copyright 2002 John Levon <levon@movementarian.org>
+ *
+ * Persistent cookie-path mappings. These are used by
+ * profilers to convert a per-task EIP value into something
+ * non-transitory that can be processed at a later date.
+ * This is done by locking the dentry/vfsmnt pair in the
+ * kernel until released by the tasks needing the persistent
+ * objects. The tag is simply an unsigned long that refers
+ * to the pair and can be looked up from userspace.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/mount.h>
+#include <linux/capability.h>
+#include <linux/dcache.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/dcookies.h>
+#include <linux/mutex.h>
+#include <linux/path.h>
+#include <linux/compat.h>
+#include <asm/uaccess.h>
+
+/* The dcookies are allocated from a kmem_cache and
+ * hashed onto a small number of lists. None of the
+ * code here is particularly performance critical
+ */
+struct dcookie_struct {
+	struct path path;
+	struct list_head hash_list;
+};
+
+static LIST_HEAD(dcookie_users);
+static DEFINE_MUTEX(dcookie_mutex);
+static struct kmem_cache *dcookie_cache __read_mostly;
+static struct list_head *dcookie_hashtable __read_mostly;
+static size_t hash_size __read_mostly;
+
+static inline int is_live(void)
+{
+	return !(list_empty(&dcookie_users));
+}
+
+
+/* The dentry is locked, its address will do for the cookie */
+static inline unsigned long dcookie_value(struct dcookie_struct * dcs)
+{
+	return (unsigned long)dcs->path.dentry;
+}
+
+
+static size_t dcookie_hash(unsigned long dcookie)
+{
+	return (dcookie >> L1_CACHE_SHIFT) & (hash_size - 1);
+}
+
+
+static struct dcookie_struct * find_dcookie(unsigned long dcookie)
+{
+	struct dcookie_struct *found = NULL;
+	struct dcookie_struct * dcs;
+	struct list_head * pos;
+	struct list_head * list;
+
+	list = dcookie_hashtable + dcookie_hash(dcookie);
+
+	list_for_each(pos, list) {
+		dcs = list_entry(pos, struct dcookie_struct, hash_list);
+		if (dcookie_value(dcs) == dcookie) {
+			found = dcs;
+			break;
+		}
+	}
+
+	return found;
+}
+
+
+static void hash_dcookie(struct dcookie_struct * dcs)
+{
+	struct list_head * list = dcookie_hashtable + dcookie_hash(dcookie_value(dcs));
+	list_add(&dcs->hash_list, list);
+}
+
+
+static struct dcookie_struct *alloc_dcookie(struct path *path)
+{
+	struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
+							GFP_KERNEL);
+	struct dentry *d;
+	if (!dcs)
+		return NULL;
+
+	d = path->dentry;
+	spin_lock(&d->d_lock);
+	d->d_flags |= DCACHE_COOKIE;
+	spin_unlock(&d->d_lock);
+
+	dcs->path = *path;
+	path_get(path);
+	hash_dcookie(dcs);
+	return dcs;
+}
+
+
+/* This is the main kernel-side routine that retrieves the cookie
+ * value for a dentry/vfsmnt pair.
+ */
+int get_dcookie(struct path *path, unsigned long *cookie)
+{
+	int err = 0;
+	struct dcookie_struct * dcs;
+
+	mutex_lock(&dcookie_mutex);
+
+	if (!is_live()) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (path->dentry->d_flags & DCACHE_COOKIE) {
+		dcs = find_dcookie((unsigned long)path->dentry);
+	} else {
+		dcs = alloc_dcookie(path);
+		if (!dcs) {
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+
+	*cookie = dcookie_value(dcs);
+
+out:
+	mutex_unlock(&dcookie_mutex);
+	return err;
+}
+
+
+/* And here is where the userspace process can look up the cookie value
+ * to retrieve the path.
+ */
+SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len)
+{
+	unsigned long cookie = (unsigned long)cookie64;
+	int err = -EINVAL;
+	char * kbuf;
+	char * path;
+	size_t pathlen;
+	struct dcookie_struct * dcs;
+
+	/* we could leak path information to users
+	 * without dir read permission without this
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&dcookie_mutex);
+
+	if (!is_live()) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (!(dcs = find_dcookie(cookie)))
+		goto out;
+
+	err = -ENOMEM;
+	kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!kbuf)
+		goto out;
+
+	/* FIXME: (deleted) ? */
+	path = d_path(&dcs->path, kbuf, PAGE_SIZE);
+
+	mutex_unlock(&dcookie_mutex);
+
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		goto out_free;
+	}
+
+	err = -ERANGE;
+ 
+	pathlen = kbuf + PAGE_SIZE - path;
+	if (pathlen <= len) {
+		err = pathlen;
+		if (copy_to_user(buf, path, pathlen))
+			err = -EFAULT;
+	}
+
+out_free:
+	kfree(kbuf);
+	return err;
+out:
+	mutex_unlock(&dcookie_mutex);
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len)
+{
+#ifdef __BIG_ENDIAN
+	return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
+#else
+	return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len);
+#endif
+}
+#endif
+
+static int dcookie_init(void)
+{
+	struct list_head * d;
+	unsigned int i, hash_bits;
+	int err = -ENOMEM;
+
+	dcookie_cache = kmem_cache_create("dcookie_cache",
+		sizeof(struct dcookie_struct),
+		0, 0, NULL);
+
+	if (!dcookie_cache)
+		goto out;
+
+	dcookie_hashtable = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!dcookie_hashtable)
+		goto out_kmem;
+
+	err = 0;
+
+	/*
+	 * Find the power-of-two list-heads that can fit into the allocation..
+	 * We don't guarantee that "sizeof(struct list_head)" is necessarily
+	 * a power-of-two.
+	 */
+	hash_size = PAGE_SIZE / sizeof(struct list_head);
+	hash_bits = 0;
+	do {
+		hash_bits++;
+	} while ((hash_size >> hash_bits) != 0);
+	hash_bits--;
+
+	/*
+	 * Re-calculate the actual number of entries and the mask
+	 * from the number of bits we can fit.
+	 */
+	hash_size = 1UL << hash_bits;
+
+	/* And initialize the newly allocated array */
+	d = dcookie_hashtable;
+	i = hash_size;
+	do {
+		INIT_LIST_HEAD(d);
+		d++;
+		i--;
+	} while (i);
+
+out:
+	return err;
+out_kmem:
+	kmem_cache_destroy(dcookie_cache);
+	goto out;
+}
+
+
+static void free_dcookie(struct dcookie_struct * dcs)
+{
+	struct dentry *d = dcs->path.dentry;
+
+	spin_lock(&d->d_lock);
+	d->d_flags &= ~DCACHE_COOKIE;
+	spin_unlock(&d->d_lock);
+
+	path_put(&dcs->path);
+	kmem_cache_free(dcookie_cache, dcs);
+}
+
+
+static void dcookie_exit(void)
+{
+	struct list_head * list;
+	struct list_head * pos;
+	struct list_head * pos2;
+	struct dcookie_struct * dcs;
+	size_t i;
+
+	for (i = 0; i < hash_size; ++i) {
+		list = dcookie_hashtable + i;
+		list_for_each_safe(pos, pos2, list) {
+			dcs = list_entry(pos, struct dcookie_struct, hash_list);
+			list_del(&dcs->hash_list);
+			free_dcookie(dcs);
+		}
+	}
+
+	kfree(dcookie_hashtable);
+	kmem_cache_destroy(dcookie_cache);
+}
+
+
+struct dcookie_user {
+	struct list_head next;
+};
+ 
+struct dcookie_user * dcookie_register(void)
+{
+	struct dcookie_user * user;
+
+	mutex_lock(&dcookie_mutex);
+
+	user = kmalloc(sizeof(struct dcookie_user), GFP_KERNEL);
+	if (!user)
+		goto out;
+
+	if (!is_live() && dcookie_init())
+		goto out_free;
+
+	list_add(&user->next, &dcookie_users);
+
+out:
+	mutex_unlock(&dcookie_mutex);
+	return user;
+out_free:
+	kfree(user);
+	user = NULL;
+	goto out;
+}
+
+
+void dcookie_unregister(struct dcookie_user * user)
+{
+	mutex_lock(&dcookie_mutex);
+
+	list_del(&user->next);
+	kfree(user);
+
+	if (!is_live())
+		dcookie_exit();
+
+	mutex_unlock(&dcookie_mutex);
+}
+
+EXPORT_SYMBOL_GPL(dcookie_register);
+EXPORT_SYMBOL_GPL(dcookie_unregister);
+EXPORT_SYMBOL_GPL(get_dcookie);
diff --git a/kernel/fs/debugfs/Makefile b/kernel/fs/debugfs/Makefile
new file mode 100644
index 000000000..840c45696
--- /dev/null
+++ b/kernel/fs/debugfs/Makefile
@@ -0,0 +1,4 @@
+debugfs-objs	:= inode.o file.o
+
+obj-$(CONFIG_DEBUG_FS)	+= debugfs.o
+
diff --git a/kernel/fs/debugfs/file.c b/kernel/fs/debugfs/file.c
new file mode 100644
index 000000000..830a7e76f
--- /dev/null
+++ b/kernel/fs/debugfs/file.c
@@ -0,0 +1,818 @@
+/*
+ *  file.c - part of debugfs, a tiny little debug file system
+ *
+ *  Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
+ *  Copyright (C) 2004 IBM Inc.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License version
+ *	2 as published by the Free Software Foundation.
+ *
+ *  debugfs is for people to use instead of /proc or /sys.
+ *  See Documentation/DocBook/filesystems for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <linux/device.h>
+
+static ssize_t default_read_file(struct file *file, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	return 0;
+}
+
+static ssize_t default_write_file(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	return count;
+}
+
+const struct file_operations debugfs_file_operations = {
+	.read =		default_read_file,
+	.write =	default_write_file,
+	.open =		simple_open,
+	.llseek =	noop_llseek,
+};
+
+static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	nd_set_link(nd, d_inode(dentry)->i_private);
+	return NULL;
+}
+
+const struct inode_operations debugfs_link_operations = {
+	.readlink       = generic_readlink,
+	.follow_link    = debugfs_follow_link,
+};
+
+static int debugfs_u8_set(void *data, u64 val)
+{
+	*(u8 *)data = val;
+	return 0;
+}
+static int debugfs_u8_get(void *data, u64 *val)
+{
+	*val = *(u8 *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
+
+/**
+ * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_u8(const char *name, umode_t mode,
+				 struct dentry *parent, u8 *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_u8);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u8);
+
+static int debugfs_u16_set(void *data, u64 val)
+{
+	*(u16 *)data = val;
+	return 0;
+}
+static int debugfs_u16_get(void *data, u64 *val)
+{
+	*val = *(u16 *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
+
+/**
+ * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_u16(const char *name, umode_t mode,
+				  struct dentry *parent, u16 *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_u16);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u16);
+
+static int debugfs_u32_set(void *data, u64 val)
+{
+	*(u32 *)data = val;
+	return 0;
+}
+static int debugfs_u32_get(void *data, u64 *val)
+{
+	*val = *(u32 *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
+
+/**
+ * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_u32(const char *name, umode_t mode,
+				 struct dentry *parent, u32 *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_u32);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u32);
+
+static int debugfs_u64_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+	return 0;
+}
+
+static int debugfs_u64_get(void *data, u64 *val)
+{
+	*val = *(u64 *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
+
+/**
+ * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_u64(const char *name, umode_t mode,
+				 struct dentry *parent, u64 *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_u64);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u64);
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+
+/*
+ * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
+ *
+ * These functions are exactly the same as the above functions (but use a hex
+ * output for the decimal challenged). For details look at the above unsigned
+ * decimal functions.
+ */
+
+/**
+ * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_x8(const char *name, umode_t mode,
+				 struct dentry *parent, u8 *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_x8);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x8);
+
+/**
+ * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_x16(const char *name, umode_t mode,
+				 struct dentry *parent, u16 *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_x16);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x16);
+
+/**
+ * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_x32(const char *name, umode_t mode,
+				 struct dentry *parent, u32 *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_x32);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x32);
+
+/**
+ * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_x64(const char *name, umode_t mode,
+				 struct dentry *parent, u64 *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_x64);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x64);
+
+
+static int debugfs_size_t_set(void *data, u64 val)
+{
+	*(size_t *)data = val;
+	return 0;
+}
+static int debugfs_size_t_get(void *data, u64 *val)
+{
+	*val = *(size_t *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
+			"%llu\n");	/* %llu and %zu are more or less the same */
+
+/**
+ * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
+				     struct dentry *parent, size_t *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_size_t);
+
+static int debugfs_atomic_t_set(void *data, u64 val)
+{
+	atomic_set((atomic_t *)data, val);
+	return 0;
+}
+static int debugfs_atomic_t_get(void *data, u64 *val)
+{
+	*val = atomic_read((atomic_t *)data);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+			debugfs_atomic_t_set, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+
+/**
+ * debugfs_create_atomic_t - create a debugfs file that is used to read and
+ * write an atomic_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+				 struct dentry *parent, atomic_t *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value,
+					&fops_atomic_t_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value,
+					&fops_atomic_t_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
+
+static ssize_t read_file_bool(struct file *file, char __user *user_buf,
+			      size_t count, loff_t *ppos)
+{
+	char buf[3];
+	u32 *val = file->private_data;
+
+	if (*val)
+		buf[0] = 'Y';
+	else
+		buf[0] = 'N';
+	buf[1] = '\n';
+	buf[2] = 0x00;
+	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
+			       size_t count, loff_t *ppos)
+{
+	char buf[32];
+	size_t buf_size;
+	bool bv;
+	u32 *val = file->private_data;
+
+	buf_size = min(count, (sizeof(buf)-1));
+	if (copy_from_user(buf, user_buf, buf_size))
+		return -EFAULT;
+
+	buf[buf_size] = '\0';
+	if (strtobool(buf, &bv) == 0)
+		*val = bv;
+
+	return count;
+}
+
+static const struct file_operations fops_bool = {
+	.read =		read_file_bool,
+	.write =	write_file_bool,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+/**
+ * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_bool(const char *name, umode_t mode,
+				   struct dentry *parent, u32 *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_bool);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_bool);
+
+static ssize_t read_file_blob(struct file *file, char __user *user_buf,
+			      size_t count, loff_t *ppos)
+{
+	struct debugfs_blob_wrapper *blob = file->private_data;
+	return simple_read_from_buffer(user_buf, count, ppos, blob->data,
+			blob->size);
+}
+
+static const struct file_operations fops_blob = {
+	.read =		read_file_blob,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+/**
+ * debugfs_create_blob - create a debugfs file that is used to read a binary blob
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer
+ *        to the blob data and the size of the data.
+ *
+ * This function creates a file in debugfs with the given name that exports
+ * @blob->data as a binary blob. If the @mode variable is so set it can be
+ * read from. Writing is not supported.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_blob(const char *name, umode_t mode,
+				   struct dentry *parent,
+				   struct debugfs_blob_wrapper *blob)
+{
+	return debugfs_create_file(name, mode, parent, blob, &fops_blob);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_blob);
+
+struct array_data {
+	void *array;
+	u32 elements;
+};
+
+static size_t u32_format_array(char *buf, size_t bufsize,
+			       u32 *array, int array_size)
+{
+	size_t ret = 0;
+
+	while (--array_size >= 0) {
+		size_t len;
+		char term = array_size ? ' ' : '\n';
+
+		len = snprintf(buf, bufsize, "%u%c", *array++, term);
+		ret += len;
+
+		buf += len;
+		bufsize -= len;
+	}
+	return ret;
+}
+
+static int u32_array_open(struct inode *inode, struct file *file)
+{
+	struct array_data *data = inode->i_private;
+	int size, elements = data->elements;
+	char *buf;
+
+	/*
+	 * Max size:
+	 *  - 10 digits + ' '/'\n' = 11 bytes per number
+	 *  - terminating NUL character
+	 */
+	size = elements*11;
+	buf = kmalloc(size+1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	buf[size] = 0;
+
+	file->private_data = buf;
+	u32_format_array(buf, size, data->array, data->elements);
+
+	return nonseekable_open(inode, file);
+}
+
+static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
+			      loff_t *ppos)
+{
+	size_t size = strlen(file->private_data);
+
+	return simple_read_from_buffer(buf, len, ppos,
+					file->private_data, size);
+}
+
+static int u32_array_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+
+	return 0;
+}
+
+static const struct file_operations u32_array_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = u32_array_open,
+	.release = u32_array_release,
+	.read	 = u32_array_read,
+	.llseek  = no_llseek,
+};
+
+/**
+ * debugfs_create_u32_array - create a debugfs file that is used to read u32
+ * array.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @array: u32 array that provides data.
+ * @elements: total number of elements in the array.
+ *
+ * This function creates a file in debugfs with the given name that exports
+ * @array as data. If the @mode variable is so set it can be read from.
+ * Writing is not supported. Seek within the file is also not supported.
+ * Once array is created its size can not be changed.
+ *
+ * The function returns a pointer to dentry on success. If debugfs is not
+ * enabled in the kernel, the value -%ENODEV will be returned.
+ */
+struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+					    struct dentry *parent,
+					    u32 *array, u32 elements)
+{
+	struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+	if (data == NULL)
+		return NULL;
+
+	data->array = array;
+	data->elements = elements;
+
+	return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
+
+#ifdef CONFIG_HAS_IOMEM
+
+/*
+ * The regset32 stuff is used to print 32-bit registers using the
+ * seq_file utilities. We offer printing a register set in an already-opened
+ * sequential file or create a debugfs file that only prints a regset32.
+ */
+
+/**
+ * debugfs_print_regs32 - use seq_print to describe a set of registers
+ * @s: the seq_file structure being used to generate output
+ * @regs: an array if struct debugfs_reg32 structures
+ * @nregs: the length of the above array
+ * @base: the base address to be used in reading the registers
+ * @prefix: a string to be prefixed to every output line
+ *
+ * This function outputs a text block describing the current values of
+ * some 32-bit hardware registers. It is meant to be used within debugfs
+ * files based on seq_file that need to show registers, intermixed with other
+ * information. The prefix argument may be used to specify a leading string,
+ * because some peripherals have several blocks of identical registers,
+ * for example configuration of dma channels
+ */
+void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
+			  int nregs, void __iomem *base, char *prefix)
+{
+	int i;
+
+	for (i = 0; i < nregs; i++, regs++) {
+		if (prefix)
+			seq_printf(s, "%s", prefix);
+		seq_printf(s, "%s = 0x%08x\n", regs->name,
+			   readl(base + regs->offset));
+		if (seq_has_overflowed(s))
+			break;
+	}
+}
+EXPORT_SYMBOL_GPL(debugfs_print_regs32);
+
+static int debugfs_show_regset32(struct seq_file *s, void *data)
+{
+	struct debugfs_regset32 *regset = s->private;
+
+	debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, "");
+	return 0;
+}
+
+static int debugfs_open_regset32(struct inode *inode, struct file *file)
+{
+	return single_open(file, debugfs_show_regset32, inode->i_private);
+}
+
+static const struct file_operations fops_regset32 = {
+	.open =		debugfs_open_regset32,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+	.release =	single_release,
+};
+
+/**
+ * debugfs_create_regset32 - create a debugfs file that returns register values
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @regset: a pointer to a struct debugfs_regset32, which contains a pointer
+ *          to an array of register definitions, the array size and the base
+ *          address where the register bank is to be found.
+ *
+ * This function creates a file in debugfs with the given name that reports
+ * the names and values of a set of 32-bit registers. If the @mode variable
+ * is so set it can be read from. Writing is not supported.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
+				       struct dentry *parent,
+				       struct debugfs_regset32 *regset)
+{
+	return debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_regset32);
+
+#endif /* CONFIG_HAS_IOMEM */
+
+struct debugfs_devm_entry {
+	int (*read)(struct seq_file *seq, void *data);
+	struct device *dev;
+};
+
+static int debugfs_devm_entry_open(struct inode *inode, struct file *f)
+{
+	struct debugfs_devm_entry *entry = inode->i_private;
+
+	return single_open(f, entry->read, entry->dev);
+}
+
+static const struct file_operations debugfs_devm_entry_ops = {
+	.owner = THIS_MODULE,
+	.open = debugfs_devm_entry_open,
+	.release = single_release,
+	.read = seq_read,
+	.llseek = seq_lseek
+};
+
+/**
+ * debugfs_create_devm_seqfile - create a debugfs file that is bound to device.
+ *
+ * @dev: device related to this debugfs file.
+ * @name: name of the debugfs file.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *	directory dentry if set.  If this parameter is %NULL, then the
+ *	file will be created in the root of the debugfs filesystem.
+ * @read_fn: function pointer called to print the seq_file content.
+ */
+struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
+					   struct dentry *parent,
+					   int (*read_fn)(struct seq_file *s,
+							  void *data))
+{
+	struct debugfs_devm_entry *entry;
+
+	if (IS_ERR(parent))
+		return ERR_PTR(-ENOENT);
+
+	entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	entry->read = read_fn;
+	entry->dev = dev;
+
+	return debugfs_create_file(name, S_IRUGO, parent, entry,
+				   &debugfs_devm_entry_ops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile);
+
diff --git a/kernel/fs/debugfs/inode.c b/kernel/fs/debugfs/inode.c
new file mode 100644
index 000000000..12756040c
--- /dev/null
+++ b/kernel/fs/debugfs/inode.c
@@ -0,0 +1,736 @@
+/*
+ *  inode.c - part of debugfs, a tiny little debug file system
+ *
+ *  Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
+ *  Copyright (C) 2004 IBM Inc.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License version
+ *	2 as published by the Free Software Foundation.
+ *
+ *  debugfs is for people to use instead of /proc or /sys.
+ *  See Documentation/DocBook/kernel-api for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/namei.h>
+#include <linux/debugfs.h>
+#include <linux/fsnotify.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+
+#define DEBUGFS_DEFAULT_MODE	0700
+
+static struct vfsmount *debugfs_mount;
+static int debugfs_mount_count;
+static bool debugfs_registered;
+
+static struct inode *debugfs_get_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	}
+	return inode;
+}
+
+static inline int debugfs_positive(struct dentry *dentry)
+{
+	return d_really_is_positive(dentry) && !d_unhashed(dentry);
+}
+
+struct debugfs_mount_opts {
+	kuid_t uid;
+	kgid_t gid;
+	umode_t mode;
+};
+
+enum {
+	Opt_uid,
+	Opt_gid,
+	Opt_mode,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_mode, "mode=%o"},
+	{Opt_err, NULL}
+};
+
+struct debugfs_fs_info {
+	struct debugfs_mount_opts mount_opts;
+};
+
+static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int token;
+	kuid_t uid;
+	kgid_t gid;
+	char *p;
+
+	opts->mode = DEBUGFS_DEFAULT_MODE;
+
+	while ((p = strsep(&data, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(uid))
+				return -EINVAL;
+			opts->uid = uid;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(gid))
+				return -EINVAL;
+			opts->gid = gid;
+			break;
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->mode = option & S_IALLUGO;
+			break;
+		/*
+		 * We might like to report bad mount options here;
+		 * but traditionally debugfs has ignored all mount options
+		 */
+		}
+	}
+
+	return 0;
+}
+
+static int debugfs_apply_options(struct super_block *sb)
+{
+	struct debugfs_fs_info *fsi = sb->s_fs_info;
+	struct inode *inode = d_inode(sb->s_root);
+	struct debugfs_mount_opts *opts = &fsi->mount_opts;
+
+	inode->i_mode &= ~S_IALLUGO;
+	inode->i_mode |= opts->mode;
+
+	inode->i_uid = opts->uid;
+	inode->i_gid = opts->gid;
+
+	return 0;
+}
+
+static int debugfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	int err;
+	struct debugfs_fs_info *fsi = sb->s_fs_info;
+
+	sync_filesystem(sb);
+	err = debugfs_parse_options(data, &fsi->mount_opts);
+	if (err)
+		goto fail;
+
+	debugfs_apply_options(sb);
+
+fail:
+	return err;
+}
+
+static int debugfs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct debugfs_fs_info *fsi = root->d_sb->s_fs_info;
+	struct debugfs_mount_opts *opts = &fsi->mount_opts;
+
+	if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, opts->uid));
+	if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, opts->gid));
+	if (opts->mode != DEBUGFS_DEFAULT_MODE)
+		seq_printf(m, ",mode=%o", opts->mode);
+
+	return 0;
+}
+
+static void debugfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	if (S_ISLNK(inode->i_mode))
+		kfree(inode->i_private);
+}
+
+static const struct super_operations debugfs_super_operations = {
+	.statfs		= simple_statfs,
+	.remount_fs	= debugfs_remount,
+	.show_options	= debugfs_show_options,
+	.evict_inode	= debugfs_evict_inode,
+};
+
+static struct vfsmount *debugfs_automount(struct path *path)
+{
+	struct vfsmount *(*f)(void *);
+	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
+	return f(d_inode(path->dentry)->i_private);
+}
+
+static const struct dentry_operations debugfs_dops = {
+	.d_delete = always_delete_dentry,
+	.d_automount = debugfs_automount,
+};
+
+static int debug_fill_super(struct super_block *sb, void *data, int silent)
+{
+	static struct tree_descr debug_files[] = {{""}};
+	struct debugfs_fs_info *fsi;
+	int err;
+
+	save_mount_options(sb, data);
+
+	fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL);
+	sb->s_fs_info = fsi;
+	if (!fsi) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = debugfs_parse_options(data, &fsi->mount_opts);
+	if (err)
+		goto fail;
+
+	err  =  simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
+	if (err)
+		goto fail;
+
+	sb->s_op = &debugfs_super_operations;
+	sb->s_d_op = &debugfs_dops;
+
+	debugfs_apply_options(sb);
+
+	return 0;
+
+fail:
+	kfree(fsi);
+	sb->s_fs_info = NULL;
+	return err;
+}
+
+static struct dentry *debug_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data)
+{
+	return mount_single(fs_type, flags, data, debug_fill_super);
+}
+
+static struct file_system_type debug_fs_type = {
+	.owner =	THIS_MODULE,
+	.name =		"debugfs",
+	.mount =	debug_mount,
+	.kill_sb =	kill_litter_super,
+};
+MODULE_ALIAS_FS("debugfs");
+
+static struct dentry *start_creating(const char *name, struct dentry *parent)
+{
+	struct dentry *dentry;
+	int error;
+
+	pr_debug("debugfs: creating file '%s'\n",name);
+
+	if (IS_ERR(parent))
+		return parent;
+
+	error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
+			      &debugfs_mount_count);
+	if (error)
+		return ERR_PTR(error);
+
+	/* If the parent is not specified, we create it in the root.
+	 * We need the root dentry to do this, which is in the super
+	 * block. A pointer to that is in the struct vfsmount that we
+	 * have around.
+	 */
+	if (!parent)
+		parent = debugfs_mount->mnt_root;
+
+	mutex_lock(&d_inode(parent)->i_mutex);
+	dentry = lookup_one_len(name, parent, strlen(name));
+	if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
+		dput(dentry);
+		dentry = ERR_PTR(-EEXIST);
+	}
+	if (IS_ERR(dentry))
+		mutex_unlock(&d_inode(parent)->i_mutex);
+	return dentry;
+}
+
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+	mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+	dput(dentry);
+	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+	return NULL;
+}
+
+static struct dentry *end_creating(struct dentry *dentry)
+{
+	mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+	return dentry;
+}
+
+/**
+ * debugfs_create_file - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *        on.  The inode.i_private pointer will point to this value on
+ *        the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ *        this file.
+ *
+ * This is the basic "create a file" function for debugfs.  It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the debugfs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_file(const char *name, umode_t mode,
+				   struct dentry *parent, void *data,
+				   const struct file_operations *fops)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	if (!(mode & S_IFMT))
+		mode |= S_IFREG;
+	BUG_ON(!S_ISREG(mode));
+	dentry = start_creating(name, parent);
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = mode;
+	inode->i_fop = fops ? fops : &debugfs_file_operations;
+	inode->i_private = data;
+	d_instantiate(dentry, inode);
+	fsnotify_create(d_inode(dentry->d_parent), dentry);
+	return end_creating(dentry);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file);
+
+/**
+ * debugfs_create_file_size - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *        on.  The inode.i_private pointer will point to this value on
+ *        the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ *        this file.
+ * @file_size: initial file size
+ *
+ * This is the basic "create a file" function for debugfs.  It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the debugfs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops,
+					loff_t file_size)
+{
+	struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
+
+	if (de)
+		d_inode(de)->i_size = file_size;
+	return de;
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file_size);
+
+/**
+ * debugfs_create_dir - create a directory in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the directory to
+ *        create.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          directory will be created in the root of the debugfs filesystem.
+ *
+ * This function creates a directory in debugfs with the given name.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
+{
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inc_nlink(inode);
+	d_instantiate(dentry, inode);
+	inc_nlink(d_inode(dentry->d_parent));
+	fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
+	return end_creating(dentry);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_dir);
+
+/**
+ * debugfs_create_automount - create automount point in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @f: function to be called when pathname resolution steps on that one.
+ * @data: opaque argument to pass to f().
+ *
+ * @f should return what ->d_automount() would.
+ */
+struct dentry *debugfs_create_automount(const char *name,
+					struct dentry *parent,
+					struct vfsmount *(*f)(void *),
+					void *data)
+{
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_flags |= S_AUTOMOUNT;
+	inode->i_private = data;
+	dentry->d_fsdata = (void *)f;
+	d_instantiate(dentry, inode);
+	return end_creating(dentry);
+}
+EXPORT_SYMBOL(debugfs_create_automount);
+
+/**
+ * debugfs_create_symlink- create a symbolic link in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the symbolic link to
+ *        create.
+ * @parent: a pointer to the parent dentry for this symbolic link.  This
+ *          should be a directory dentry if set.  If this parameter is NULL,
+ *          then the symbolic link will be created in the root of the debugfs
+ *          filesystem.
+ * @target: a pointer to a string containing the path to the target of the
+ *          symbolic link.
+ *
+ * This function creates a symbolic link with the given name in debugfs that
+ * links to the given target path.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the symbolic
+ * link is to be removed (no automatic cleanup happens if your module is
+ * unloaded, you are responsible here.)  If an error occurs, %NULL will be
+ * returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
+				      const char *target)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	char *link = kstrdup(target, GFP_KERNEL);
+	if (!link)
+		return NULL;
+
+	dentry = start_creating(name, parent);
+	if (IS_ERR(dentry)) {
+		kfree(link);
+		return NULL;
+	}
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode)) {
+		kfree(link);
+		return failed_creating(dentry);
+	}
+	inode->i_mode = S_IFLNK | S_IRWXUGO;
+	inode->i_op = &debugfs_link_operations;
+	inode->i_private = link;
+	d_instantiate(dentry, inode);
+	return end_creating(dentry);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_symlink);
+
+static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
+{
+	int ret = 0;
+
+	if (debugfs_positive(dentry)) {
+		dget(dentry);
+		if (d_is_dir(dentry))
+			ret = simple_rmdir(d_inode(parent), dentry);
+		else
+			simple_unlink(d_inode(parent), dentry);
+		if (!ret)
+			d_delete(dentry);
+		dput(dentry);
+	}
+	return ret;
+}
+
+/**
+ * debugfs_remove - removes a file or directory from the debugfs filesystem
+ * @dentry: a pointer to a the dentry of the file or directory to be
+ *          removed.
+ *
+ * This function removes a file or directory in debugfs that was previously
+ * created with a call to another debugfs function (like
+ * debugfs_create_file() or variants thereof.)
+ *
+ * This function is required to be called in order for the file to be
+ * removed, no automatic cleanup of files will happen when a module is
+ * removed, you are responsible here.
+ */
+void debugfs_remove(struct dentry *dentry)
+{
+	struct dentry *parent;
+	int ret;
+
+	if (IS_ERR_OR_NULL(dentry))
+		return;
+
+	parent = dentry->d_parent;
+	if (!parent || d_really_is_negative(parent))
+		return;
+
+	mutex_lock(&d_inode(parent)->i_mutex);
+	ret = __debugfs_remove(dentry, parent);
+	mutex_unlock(&d_inode(parent)->i_mutex);
+	if (!ret)
+		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+}
+EXPORT_SYMBOL_GPL(debugfs_remove);
+
+/**
+ * debugfs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to a the dentry of the directory to be removed.
+ *
+ * This function recursively removes a directory tree in debugfs that
+ * was previously created with a call to another debugfs function
+ * (like debugfs_create_file() or variants thereof.)
+ *
+ * This function is required to be called in order for the file to be
+ * removed, no automatic cleanup of files will happen when a module is
+ * removed, you are responsible here.
+ */
+void debugfs_remove_recursive(struct dentry *dentry)
+{
+	struct dentry *child, *parent;
+
+	if (IS_ERR_OR_NULL(dentry))
+		return;
+
+	parent = dentry->d_parent;
+	if (!parent || d_really_is_negative(parent))
+		return;
+
+	parent = dentry;
+ down:
+	mutex_lock(&d_inode(parent)->i_mutex);
+ loop:
+	/*
+	 * The parent->d_subdirs is protected by the d_lock. Outside that
+	 * lock, the child can be unlinked and set to be freed which can
+	 * use the d_u.d_child as the rcu head and corrupt this list.
+	 */
+	spin_lock(&parent->d_lock);
+	list_for_each_entry(child, &parent->d_subdirs, d_child) {
+		if (!debugfs_positive(child))
+			continue;
+
+		/* perhaps simple_empty(child) makes more sense */
+		if (!list_empty(&child->d_subdirs)) {
+			spin_unlock(&parent->d_lock);
+			mutex_unlock(&d_inode(parent)->i_mutex);
+			parent = child;
+			goto down;
+		}
+
+		spin_unlock(&parent->d_lock);
+
+		if (!__debugfs_remove(child, parent))
+			simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+
+		/*
+		 * The parent->d_lock protects agaist child from unlinking
+		 * from d_subdirs. When releasing the parent->d_lock we can
+		 * no longer trust that the next pointer is valid.
+		 * Restart the loop. We'll skip this one with the
+		 * debugfs_positive() check.
+		 */
+		goto loop;
+	}
+	spin_unlock(&parent->d_lock);
+
+	mutex_unlock(&d_inode(parent)->i_mutex);
+	child = parent;
+	parent = parent->d_parent;
+	mutex_lock(&d_inode(parent)->i_mutex);
+
+	if (child != dentry)
+		/* go up */
+		goto loop;
+
+	if (!__debugfs_remove(child, parent))
+		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+	mutex_unlock(&d_inode(parent)->i_mutex);
+}
+EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
+
+/**
+ * debugfs_rename - rename a file/directory in the debugfs filesystem
+ * @old_dir: a pointer to the parent dentry for the renamed object. This
+ *          should be a directory dentry.
+ * @old_dentry: dentry of an object to be renamed.
+ * @new_dir: a pointer to the parent dentry where the object should be
+ *          moved. This should be a directory dentry.
+ * @new_name: a pointer to a string containing the target name.
+ *
+ * This function renames a file/directory in debugfs.  The target must not
+ * exist for rename to succeed.
+ *
+ * This function will return a pointer to old_dentry (which is updated to
+ * reflect renaming) if it succeeds. If an error occurs, %NULL will be
+ * returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
+		struct dentry *new_dir, const char *new_name)
+{
+	int error;
+	struct dentry *dentry = NULL, *trap;
+	const char *old_name;
+
+	trap = lock_rename(new_dir, old_dir);
+	/* Source or destination directories don't exist? */
+	if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
+		goto exit;
+	/* Source does not exist, cyclic rename, or mountpoint? */
+	if (d_really_is_negative(old_dentry) || old_dentry == trap ||
+	    d_mountpoint(old_dentry))
+		goto exit;
+	dentry = lookup_one_len(new_name, new_dir, strlen(new_name));
+	/* Lookup failed, cyclic rename or target exists? */
+	if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry))
+		goto exit;
+
+	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
+
+	error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir),
+		dentry);
+	if (error) {
+		fsnotify_oldname_free(old_name);
+		goto exit;
+	}
+	d_move(old_dentry, dentry);
+	fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name,
+		d_is_dir(old_dentry),
+		NULL, old_dentry);
+	fsnotify_oldname_free(old_name);
+	unlock_rename(new_dir, old_dir);
+	dput(dentry);
+	return old_dentry;
+exit:
+	if (dentry && !IS_ERR(dentry))
+		dput(dentry);
+	unlock_rename(new_dir, old_dir);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(debugfs_rename);
+
+/**
+ * debugfs_initialized - Tells whether debugfs has been registered
+ */
+bool debugfs_initialized(void)
+{
+	return debugfs_registered;
+}
+EXPORT_SYMBOL_GPL(debugfs_initialized);
+
+static int __init debugfs_init(void)
+{
+	int retval;
+
+	retval = sysfs_create_mount_point(kernel_kobj, "debug");
+	if (retval)
+		return retval;
+
+	retval = register_filesystem(&debug_fs_type);
+	if (retval)
+		sysfs_remove_mount_point(kernel_kobj, "debug");
+	else
+		debugfs_registered = true;
+
+	return retval;
+}
+core_initcall(debugfs_init);
+
diff --git a/kernel/fs/devpts/Makefile b/kernel/fs/devpts/Makefile
new file mode 100644
index 000000000..236696efc
--- /dev/null
+++ b/kernel/fs/devpts/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux /dev/pts virtual filesystem.
+#
+
+obj-$(CONFIG_UNIX98_PTYS)		+= devpts.o
+
+devpts-$(CONFIG_UNIX98_PTYS)		:= inode.o
diff --git a/kernel/fs/devpts/inode.c b/kernel/fs/devpts/inode.c
new file mode 100644
index 000000000..add566303
--- /dev/null
+++ b/kernel/fs/devpts/inode.c
@@ -0,0 +1,689 @@
+/* -*- linux-c -*- --------------------------------------------------------- *
+ *
+ * linux/fs/devpts/inode.c
+ *
+ *  Copyright 1998-2004 H. Peter Anvin -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/tty.h>
+#include <linux/mutex.h>
+#include <linux/magic.h>
+#include <linux/idr.h>
+#include <linux/devpts_fs.h>
+#include <linux/parser.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+
+#define DEVPTS_DEFAULT_MODE 0600
+/*
+ * ptmx is a new node in /dev/pts and will be unused in legacy (single-
+ * instance) mode. To prevent surprises in user space, set permissions of
+ * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
+ * permissions.
+ */
+#define DEVPTS_DEFAULT_PTMX_MODE 0000
+#define PTMX_MINOR	2
+
+/*
+ * sysctl support for setting limits on the number of Unix98 ptys allocated.
+ * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly.
+ */
+static int pty_limit = NR_UNIX98_PTY_DEFAULT;
+static int pty_reserve = NR_UNIX98_PTY_RESERVE;
+static int pty_limit_min;
+static int pty_limit_max = INT_MAX;
+static int pty_count;
+
+static struct ctl_table pty_table[] = {
+	{
+		.procname	= "max",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.data		= &pty_limit,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &pty_limit_min,
+		.extra2		= &pty_limit_max,
+	}, {
+		.procname	= "reserve",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.data		= &pty_reserve,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &pty_limit_min,
+		.extra2		= &pty_limit_max,
+	}, {
+		.procname	= "nr",
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.data		= &pty_count,
+		.proc_handler	= proc_dointvec,
+	},
+	{}
+};
+
+static struct ctl_table pty_kern_table[] = {
+	{
+		.procname	= "pty",
+		.mode		= 0555,
+		.child		= pty_table,
+	},
+	{}
+};
+
+static struct ctl_table pty_root_table[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= pty_kern_table,
+	},
+	{}
+};
+
+static DEFINE_MUTEX(allocated_ptys_lock);
+
+static struct vfsmount *devpts_mnt;
+
+struct pts_mount_opts {
+	int setuid;
+	int setgid;
+	kuid_t   uid;
+	kgid_t   gid;
+	umode_t mode;
+	umode_t ptmxmode;
+	int newinstance;
+	int max;
+};
+
+enum {
+	Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,  Opt_max,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_mode, "mode=%o"},
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+	{Opt_ptmxmode, "ptmxmode=%o"},
+	{Opt_newinstance, "newinstance"},
+	{Opt_max, "max=%d"},
+#endif
+	{Opt_err, NULL}
+};
+
+struct pts_fs_info {
+	struct ida allocated_ptys;
+	struct pts_mount_opts mount_opts;
+	struct dentry *ptmx_dentry;
+};
+
+static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+{
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+	if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+		return inode->i_sb;
+#endif
+	return devpts_mnt->mnt_sb;
+}
+
+#define PARSE_MOUNT	0
+#define PARSE_REMOUNT	1
+
+/*
+ * parse_mount_options():
+ *	Set @opts to mount options specified in @data. If an option is not
+ *	specified in @data, set it to its default value. The exception is
+ *	'newinstance' option which can only be set/cleared on a mount (i.e.
+ *	cannot be changed during remount).
+ *
+ * Note: @data may be NULL (in which case all options are set to default).
+ */
+static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
+{
+	char *p;
+	kuid_t uid;
+	kgid_t gid;
+
+	opts->setuid  = 0;
+	opts->setgid  = 0;
+	opts->uid     = GLOBAL_ROOT_UID;
+	opts->gid     = GLOBAL_ROOT_GID;
+	opts->mode    = DEVPTS_DEFAULT_MODE;
+	opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+	opts->max     = NR_UNIX98_PTY_MAX;
+
+	/* newinstance makes sense only on initial mount */
+	if (op == PARSE_MOUNT)
+		opts->newinstance = 0;
+
+	while ((p = strsep(&data, ",")) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		int token;
+		int option;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(uid))
+				return -EINVAL;
+			opts->uid = uid;
+			opts->setuid = 1;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(gid))
+				return -EINVAL;
+			opts->gid = gid;
+			opts->setgid = 1;
+			break;
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->mode = option & S_IALLUGO;
+			break;
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+		case Opt_ptmxmode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->ptmxmode = option & S_IALLUGO;
+			break;
+		case Opt_newinstance:
+			/* newinstance makes sense only on initial mount */
+			if (op == PARSE_MOUNT)
+				opts->newinstance = 1;
+			break;
+		case Opt_max:
+			if (match_int(&args[0], &option) ||
+			    option < 0 || option > NR_UNIX98_PTY_MAX)
+				return -EINVAL;
+			opts->max = option;
+			break;
+#endif
+		default:
+			pr_err("called with bogus options\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+static int mknod_ptmx(struct super_block *sb)
+{
+	int mode;
+	int rc = -ENOMEM;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct dentry *root = sb->s_root;
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
+	kuid_t root_uid;
+	kgid_t root_gid;
+
+	root_uid = make_kuid(current_user_ns(), 0);
+	root_gid = make_kgid(current_user_ns(), 0);
+	if (!uid_valid(root_uid) || !gid_valid(root_gid))
+		return -EINVAL;
+
+	mutex_lock(&d_inode(root)->i_mutex);
+
+	/* If we have already created ptmx node, return */
+	if (fsi->ptmx_dentry) {
+		rc = 0;
+		goto out;
+	}
+
+	dentry = d_alloc_name(root, "ptmx");
+	if (!dentry) {
+		pr_err("Unable to alloc dentry for ptmx node\n");
+		goto out;
+	}
+
+	/*
+	 * Create a new 'ptmx' node in this mount of devpts.
+	 */
+	inode = new_inode(sb);
+	if (!inode) {
+		pr_err("Unable to alloc inode for ptmx node\n");
+		dput(dentry);
+		goto out;
+	}
+
+	inode->i_ino = 2;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	mode = S_IFCHR|opts->ptmxmode;
+	init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
+	inode->i_uid = root_uid;
+	inode->i_gid = root_gid;
+
+	d_add(dentry, inode);
+
+	fsi->ptmx_dentry = dentry;
+	rc = 0;
+out:
+	mutex_unlock(&d_inode(root)->i_mutex);
+	return rc;
+}
+
+static void update_ptmx_mode(struct pts_fs_info *fsi)
+{
+	struct inode *inode;
+	if (fsi->ptmx_dentry) {
+		inode = d_inode(fsi->ptmx_dentry);
+		inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
+	}
+}
+#else
+static inline void update_ptmx_mode(struct pts_fs_info *fsi)
+{
+	return;
+}
+#endif
+
+static int devpts_remount(struct super_block *sb, int *flags, char *data)
+{
+	int err;
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
+
+	sync_filesystem(sb);
+	err = parse_mount_options(data, PARSE_REMOUNT, opts);
+
+	/*
+	 * parse_mount_options() restores options to default values
+	 * before parsing and may have changed ptmxmode. So, update the
+	 * mode in the inode too. Bogus options don't fail the remount,
+	 * so do this even on error return.
+	 */
+	update_ptmx_mode(fsi);
+
+	return err;
+}
+
+static int devpts_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct pts_fs_info *fsi = DEVPTS_SB(root->d_sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
+
+	if (opts->setuid)
+		seq_printf(seq, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, opts->uid));
+	if (opts->setgid)
+		seq_printf(seq, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, opts->gid));
+	seq_printf(seq, ",mode=%03o", opts->mode);
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+	seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
+	if (opts->max < NR_UNIX98_PTY_MAX)
+		seq_printf(seq, ",max=%d", opts->max);
+#endif
+
+	return 0;
+}
+
+static const struct super_operations devpts_sops = {
+	.statfs		= simple_statfs,
+	.remount_fs	= devpts_remount,
+	.show_options	= devpts_show_options,
+};
+
+static void *new_pts_fs_info(void)
+{
+	struct pts_fs_info *fsi;
+
+	fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
+	if (!fsi)
+		return NULL;
+
+	ida_init(&fsi->allocated_ptys);
+	fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
+	fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+
+	return fsi;
+}
+
+static int
+devpts_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct inode *inode;
+
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = DEVPTS_SUPER_MAGIC;
+	s->s_op = &devpts_sops;
+	s->s_time_gran = 1;
+
+	s->s_fs_info = new_pts_fs_info();
+	if (!s->s_fs_info)
+		goto fail;
+
+	inode = new_inode(s);
+	if (!inode)
+		goto fail;
+	inode->i_ino = 1;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	set_nlink(inode, 2);
+
+	s->s_root = d_make_root(inode);
+	if (s->s_root)
+		return 0;
+
+	pr_err("get root dentry failed\n");
+
+fail:
+	return -ENOMEM;
+}
+
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+static int compare_init_pts_sb(struct super_block *s, void *p)
+{
+	if (devpts_mnt)
+		return devpts_mnt->mnt_sb == s;
+	return 0;
+}
+
+/*
+ * devpts_mount()
+ *
+ *     If the '-o newinstance' mount option was specified, mount a new
+ *     (private) instance of devpts.  PTYs created in this instance are
+ *     independent of the PTYs in other devpts instances.
+ *
+ *     If the '-o newinstance' option was not specified, mount/remount the
+ *     initial kernel mount of devpts.  This type of mount gives the
+ *     legacy, single-instance semantics.
+ *
+ *     The 'newinstance' option is needed to support multiple namespace
+ *     semantics in devpts while preserving backward compatibility of the
+ *     current 'single-namespace' semantics. i.e all mounts of devpts
+ *     without the 'newinstance' mount option should bind to the initial
+ *     kernel mount, like mount_single().
+ *
+ *     Mounts with 'newinstance' option create a new, private namespace.
+ *
+ *     NOTE:
+ *
+ *     For single-mount semantics, devpts cannot use mount_single(),
+ *     because mount_single()/sget() find and use the super-block from
+ *     the most recent mount of devpts. But that recent mount may be a
+ *     'newinstance' mount and mount_single() would pick the newinstance
+ *     super-block instead of the initial super-block.
+ */
+static struct dentry *devpts_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	int error;
+	struct pts_mount_opts opts;
+	struct super_block *s;
+
+	error = parse_mount_options(data, PARSE_MOUNT, &opts);
+	if (error)
+		return ERR_PTR(error);
+
+	/* Require newinstance for all user namespace mounts to ensure
+	 * the mount options are not changed.
+	 */
+	if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
+		return ERR_PTR(-EINVAL);
+
+	if (opts.newinstance)
+		s = sget(fs_type, NULL, set_anon_super, flags, NULL);
+	else
+		s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
+			 NULL);
+
+	if (IS_ERR(s))
+		return ERR_CAST(s);
+
+	if (!s->s_root) {
+		error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (error)
+			goto out_undo_sget;
+		s->s_flags |= MS_ACTIVE;
+	}
+
+	memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
+
+	error = mknod_ptmx(s);
+	if (error)
+		goto out_undo_sget;
+
+	return dget(s->s_root);
+
+out_undo_sget:
+	deactivate_locked_super(s);
+	return ERR_PTR(error);
+}
+
+#else
+/*
+ * This supports only the legacy single-instance semantics (no
+ * multiple-instance semantics)
+ */
+static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data)
+{
+	return mount_single(fs_type, flags, data, devpts_fill_super);
+}
+#endif
+
+static void devpts_kill_sb(struct super_block *sb)
+{
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+
+	ida_destroy(&fsi->allocated_ptys);
+	kfree(fsi);
+	kill_litter_super(sb);
+}
+
+static struct file_system_type devpts_fs_type = {
+	.name		= "devpts",
+	.mount		= devpts_mount,
+	.kill_sb	= devpts_kill_sb,
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+#endif
+};
+
+/*
+ * The normal naming convention is simply /dev/pts/<number>; this conforms
+ * to the System V naming convention
+ */
+
+int devpts_new_index(struct inode *ptmx_inode)
+{
+	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	int index;
+	int ida_ret;
+
+retry:
+	if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
+		return -ENOMEM;
+
+	mutex_lock(&allocated_ptys_lock);
+	if (pty_count >= pty_limit -
+			(fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+		mutex_unlock(&allocated_ptys_lock);
+		return -ENOSPC;
+	}
+
+	ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
+	if (ida_ret < 0) {
+		mutex_unlock(&allocated_ptys_lock);
+		if (ida_ret == -EAGAIN)
+			goto retry;
+		return -EIO;
+	}
+
+	if (index >= fsi->mount_opts.max) {
+		ida_remove(&fsi->allocated_ptys, index);
+		mutex_unlock(&allocated_ptys_lock);
+		return -ENOSPC;
+	}
+	pty_count++;
+	mutex_unlock(&allocated_ptys_lock);
+	return index;
+}
+
+void devpts_kill_index(struct inode *ptmx_inode, int idx)
+{
+	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+
+	mutex_lock(&allocated_ptys_lock);
+	ida_remove(&fsi->allocated_ptys, idx);
+	pty_count--;
+	mutex_unlock(&allocated_ptys_lock);
+}
+
+/**
+ * devpts_pty_new -- create a new inode in /dev/pts/
+ * @ptmx_inode: inode of the master
+ * @device: major+minor of the node to be created
+ * @index: used as a name of the node
+ * @priv: what's given back by devpts_get_priv
+ *
+ * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
+ */
+struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
+		void *priv)
+{
+	struct dentry *dentry;
+	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+	struct inode *inode;
+	struct dentry *root = sb->s_root;
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
+	char s[12];
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_ino = index + 3;
+	inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
+	inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	init_special_inode(inode, S_IFCHR|opts->mode, device);
+	inode->i_private = priv;
+
+	sprintf(s, "%d", index);
+
+	mutex_lock(&d_inode(root)->i_mutex);
+
+	dentry = d_alloc_name(root, s);
+	if (dentry) {
+		d_add(dentry, inode);
+		fsnotify_create(d_inode(root), dentry);
+	} else {
+		iput(inode);
+		inode = ERR_PTR(-ENOMEM);
+	}
+
+	mutex_unlock(&d_inode(root)->i_mutex);
+
+	return inode;
+}
+
+/**
+ * devpts_get_priv -- get private data for a slave
+ * @pts_inode: inode of the slave
+ *
+ * Returns whatever was passed as priv in devpts_pty_new for a given inode.
+ */
+void *devpts_get_priv(struct inode *pts_inode)
+{
+	struct dentry *dentry;
+	void *priv = NULL;
+
+	BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
+
+	/* Ensure dentry has not been deleted by devpts_pty_kill() */
+	dentry = d_find_alias(pts_inode);
+	if (!dentry)
+		return NULL;
+
+	if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+		priv = pts_inode->i_private;
+
+	dput(dentry);
+
+	return priv;
+}
+
+/**
+ * devpts_pty_kill -- remove inode form /dev/pts/
+ * @inode: inode of the slave to be removed
+ *
+ * This is an inverse operation of devpts_pty_new.
+ */
+void devpts_pty_kill(struct inode *inode)
+{
+	struct super_block *sb = pts_sb_from_inode(inode);
+	struct dentry *root = sb->s_root;
+	struct dentry *dentry;
+
+	BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
+
+	mutex_lock(&d_inode(root)->i_mutex);
+
+	dentry = d_find_alias(inode);
+
+	drop_nlink(inode);
+	d_delete(dentry);
+	dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
+	dput(dentry);		/* d_find_alias above */
+
+	mutex_unlock(&d_inode(root)->i_mutex);
+}
+
+static int __init init_devpts_fs(void)
+{
+	int err = register_filesystem(&devpts_fs_type);
+	struct ctl_table_header *table;
+
+	if (!err) {
+		table = register_sysctl_table(pty_root_table);
+		devpts_mnt = kern_mount(&devpts_fs_type);
+		if (IS_ERR(devpts_mnt)) {
+			err = PTR_ERR(devpts_mnt);
+			unregister_filesystem(&devpts_fs_type);
+			unregister_sysctl_table(table);
+		}
+	}
+	return err;
+}
+module_init(init_devpts_fs)
diff --git a/kernel/fs/direct-io.c b/kernel/fs/direct-io.c
new file mode 100644
index 000000000..745d23426
--- /dev/null
+++ b/kernel/fs/direct-io.c
@@ -0,0 +1,1332 @@
+/*
+ * fs/direct-io.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * O_DIRECT
+ *
+ * 04Jul2002	Andrew Morton
+ *		Initial version
+ * 11Sep2002	janetinc@us.ibm.com
+ * 		added readv/writev support.
+ * 29Oct2002	Andrew Morton
+ *		rewrote bio_add_page() support.
+ * 30Oct2002	pbadari@us.ibm.com
+ *		added support for non-aligned IO.
+ * 06Nov2002	pbadari@us.ibm.com
+ *		added asynchronous IO support.
+ * 21Jul2003	nathans@sgi.com
+ *		added IO completion notifier.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/bio.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/rwsem.h>
+#include <linux/uio.h>
+#include <linux/atomic.h>
+#include <linux/prefetch.h>
+
+/*
+ * How many user pages to map in one call to get_user_pages().  This determines
+ * the size of a structure in the slab cache
+ */
+#define DIO_PAGES	64
+
+/*
+ * This code generally works in units of "dio_blocks".  A dio_block is
+ * somewhere between the hard sector size and the filesystem block size.  it
+ * is determined on a per-invocation basis.   When talking to the filesystem
+ * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
+ * down by dio->blkfactor.  Similarly, fs-blocksize quantities are converted
+ * to bio_block quantities by shifting left by blkfactor.
+ *
+ * If blkfactor is zero then the user's request was aligned to the filesystem's
+ * blocksize.
+ */
+
+/* dio_state only used in the submission path */
+
+struct dio_submit {
+	struct bio *bio;		/* bio under assembly */
+	unsigned blkbits;		/* doesn't change */
+	unsigned blkfactor;		/* When we're using an alignment which
+					   is finer than the filesystem's soft
+					   blocksize, this specifies how much
+					   finer.  blkfactor=2 means 1/4-block
+					   alignment.  Does not change */
+	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
+					   been performed at the start of a
+					   write */
+	int pages_in_io;		/* approximate total IO pages */
+	sector_t block_in_file;		/* Current offset into the underlying
+					   file in dio_block units. */
+	unsigned blocks_available;	/* At block_in_file.  changes */
+	int reap_counter;		/* rate limit reaping */
+	sector_t final_block_in_request;/* doesn't change */
+	int boundary;			/* prev block is at a boundary */
+	get_block_t *get_block;		/* block mapping function */
+	dio_submit_t *submit_io;	/* IO submition function */
+
+	loff_t logical_offset_in_bio;	/* current first logical block in bio */
+	sector_t final_block_in_bio;	/* current final block in bio + 1 */
+	sector_t next_block_for_io;	/* next block to be put under IO,
+					   in dio_blocks units */
+
+	/*
+	 * Deferred addition of a page to the dio.  These variables are
+	 * private to dio_send_cur_page(), submit_page_section() and
+	 * dio_bio_add_page().
+	 */
+	struct page *cur_page;		/* The page */
+	unsigned cur_page_offset;	/* Offset into it, in bytes */
+	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
+	sector_t cur_page_block;	/* Where it starts */
+	loff_t cur_page_fs_offset;	/* Offset in file */
+
+	struct iov_iter *iter;
+	/*
+	 * Page queue.  These variables belong to dio_refill_pages() and
+	 * dio_get_page().
+	 */
+	unsigned head;			/* next page to process */
+	unsigned tail;			/* last valid page + 1 */
+	size_t from, to;
+};
+
+/* dio_state communicated between submission path and end_io */
+struct dio {
+	int flags;			/* doesn't change */
+	int rw;
+	struct inode *inode;
+	loff_t i_size;			/* i_size when submitted */
+	dio_iodone_t *end_io;		/* IO completion function */
+
+	void *private;			/* copy from map_bh.b_private */
+
+	/* BIO completion state */
+	spinlock_t bio_lock;		/* protects BIO fields below */
+	int page_errors;		/* errno from get_user_pages() */
+	int is_async;			/* is IO async ? */
+	bool defer_completion;		/* defer AIO completion to workqueue? */
+	int io_error;			/* IO error in completion path */
+	unsigned long refcount;		/* direct_io_worker() and bios */
+	struct bio *bio_list;		/* singly linked via bi_private */
+	struct task_struct *waiter;	/* waiting task (NULL if none) */
+
+	/* AIO related stuff */
+	struct kiocb *iocb;		/* kiocb */
+	ssize_t result;                 /* IO result */
+
+	/*
+	 * pages[] (and any fields placed after it) are not zeroed out at
+	 * allocation time.  Don't add new fields after pages[] unless you
+	 * wish that they not be zeroed.
+	 */
+	union {
+		struct page *pages[DIO_PAGES];	/* page buffer */
+		struct work_struct complete_work;/* deferred AIO completion */
+	};
+} ____cacheline_aligned_in_smp;
+
+static struct kmem_cache *dio_cache __read_mostly;
+
+/*
+ * How many pages are in the queue?
+ */
+static inline unsigned dio_pages_present(struct dio_submit *sdio)
+{
+	return sdio->tail - sdio->head;
+}
+
+/*
+ * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
+ */
+static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
+{
+	ssize_t ret;
+
+	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
+				&sdio->from);
+
+	if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
+		struct page *page = ZERO_PAGE(0);
+		/*
+		 * A memory fault, but the filesystem has some outstanding
+		 * mapped blocks.  We need to use those blocks up to avoid
+		 * leaking stale data in the file.
+		 */
+		if (dio->page_errors == 0)
+			dio->page_errors = ret;
+		page_cache_get(page);
+		dio->pages[0] = page;
+		sdio->head = 0;
+		sdio->tail = 1;
+		sdio->from = 0;
+		sdio->to = PAGE_SIZE;
+		return 0;
+	}
+
+	if (ret >= 0) {
+		iov_iter_advance(sdio->iter, ret);
+		ret += sdio->from;
+		sdio->head = 0;
+		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
+		sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
+		return 0;
+	}
+	return ret;	
+}
+
+/*
+ * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
+ * buffered inside the dio so that we can call get_user_pages() against a
+ * decent number of pages, less frequently.  To provide nicer use of the
+ * L1 cache.
+ */
+static inline struct page *dio_get_page(struct dio *dio,
+					struct dio_submit *sdio)
+{
+	if (dio_pages_present(sdio) == 0) {
+		int ret;
+
+		ret = dio_refill_pages(dio, sdio);
+		if (ret)
+			return ERR_PTR(ret);
+		BUG_ON(dio_pages_present(sdio) == 0);
+	}
+	return dio->pages[sdio->head];
+}
+
+/**
+ * dio_complete() - called when all DIO BIO I/O has been completed
+ * @offset: the byte offset in the file of the completed operation
+ *
+ * This drops i_dio_count, lets interested parties know that a DIO operation
+ * has completed, and calculates the resulting return code for the operation.
+ *
+ * It lets the filesystem know if it registered an interest earlier via
+ * get_block.  Pass the private field of the map buffer_head so that
+ * filesystems can use it to hold additional state between get_block calls and
+ * dio_complete.
+ */
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
+		bool is_async)
+{
+	ssize_t transferred = 0;
+
+	/*
+	 * AIO submission can race with bio completion to get here while
+	 * expecting to have the last io completed by bio completion.
+	 * In that case -EIOCBQUEUED is in fact not an error we want
+	 * to preserve through this call.
+	 */
+	if (ret == -EIOCBQUEUED)
+		ret = 0;
+
+	if (dio->result) {
+		transferred = dio->result;
+
+		/* Check for short read case */
+		if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
+			transferred = dio->i_size - offset;
+	}
+
+	if (ret == 0)
+		ret = dio->page_errors;
+	if (ret == 0)
+		ret = dio->io_error;
+	if (ret == 0)
+		ret = transferred;
+
+	if (dio->end_io && dio->result)
+		dio->end_io(dio->iocb, offset, transferred, dio->private);
+
+	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
+		inode_dio_end(dio->inode);
+
+	if (is_async) {
+		if (dio->rw & WRITE) {
+			int err;
+
+			err = generic_write_sync(dio->iocb->ki_filp, offset,
+						 transferred);
+			if (err < 0 && ret > 0)
+				ret = err;
+		}
+
+		dio->iocb->ki_complete(dio->iocb, ret, 0);
+	}
+
+	kmem_cache_free(dio_cache, dio);
+	return ret;
+}
+
+static void dio_aio_complete_work(struct work_struct *work)
+{
+	struct dio *dio = container_of(work, struct dio, complete_work);
+
+	dio_complete(dio, dio->iocb->ki_pos, 0, true);
+}
+
+static int dio_bio_complete(struct dio *dio, struct bio *bio);
+
+/*
+ * Asynchronous IO callback. 
+ */
+static void dio_bio_end_aio(struct bio *bio, int error)
+{
+	struct dio *dio = bio->bi_private;
+	unsigned long remaining;
+	unsigned long flags;
+
+	/* cleanup the bio */
+	dio_bio_complete(dio, bio);
+
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	remaining = --dio->refcount;
+	if (remaining == 1 && dio->waiter)
+		wake_up_process(dio->waiter);
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+
+	if (remaining == 0) {
+		if (dio->result && dio->defer_completion) {
+			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
+			queue_work(dio->inode->i_sb->s_dio_done_wq,
+				   &dio->complete_work);
+		} else {
+			dio_complete(dio, dio->iocb->ki_pos, 0, true);
+		}
+	}
+}
+
+/*
+ * The BIO completion handler simply queues the BIO up for the process-context
+ * handler.
+ *
+ * During I/O bi_private points at the dio.  After I/O, bi_private is used to
+ * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ */
+static void dio_bio_end_io(struct bio *bio, int error)
+{
+	struct dio *dio = bio->bi_private;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	bio->bi_private = dio->bio_list;
+	dio->bio_list = bio;
+	if (--dio->refcount == 1 && dio->waiter)
+		wake_up_process(dio->waiter);
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+}
+
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio thats being completed
+ * @error: Error if there was one
+ *
+ * This is meant to be called by any filesystem that uses their own dio_submit_t
+ * so that the DIO specific endio actions are dealt with after the filesystem
+ * has done it's completion work.
+ */
+void dio_end_io(struct bio *bio, int error)
+{
+	struct dio *dio = bio->bi_private;
+
+	if (dio->is_async)
+		dio_bio_end_aio(bio, error);
+	else
+		dio_bio_end_io(bio, error);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
+
+static inline void
+dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
+	      struct block_device *bdev,
+	      sector_t first_sector, int nr_vecs)
+{
+	struct bio *bio;
+
+	/*
+	 * bio_alloc() is guaranteed to return a bio when called with
+	 * __GFP_WAIT and we request a valid number of vectors.
+	 */
+	bio = bio_alloc(GFP_KERNEL, nr_vecs);
+
+	bio->bi_bdev = bdev;
+	bio->bi_iter.bi_sector = first_sector;
+	if (dio->is_async)
+		bio->bi_end_io = dio_bio_end_aio;
+	else
+		bio->bi_end_io = dio_bio_end_io;
+
+	sdio->bio = bio;
+	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
+}
+
+/*
+ * In the AIO read case we speculatively dirty the pages before starting IO.
+ * During IO completion, any of these pages which happen to have been written
+ * back will be redirtied by bio_check_pages_dirty().
+ *
+ * bios hold a dio reference between submit_bio and ->end_io.
+ */
+static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
+{
+	struct bio *bio = sdio->bio;
+	unsigned long flags;
+
+	bio->bi_private = dio;
+
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	dio->refcount++;
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+
+	if (dio->is_async && dio->rw == READ)
+		bio_set_pages_dirty(bio);
+
+	if (sdio->submit_io)
+		sdio->submit_io(dio->rw, bio, dio->inode,
+			       sdio->logical_offset_in_bio);
+	else
+		submit_bio(dio->rw, bio);
+
+	sdio->bio = NULL;
+	sdio->boundary = 0;
+	sdio->logical_offset_in_bio = 0;
+}
+
+/*
+ * Release any resources in case of a failure
+ */
+static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
+{
+	while (sdio->head < sdio->tail)
+		page_cache_release(dio->pages[sdio->head++]);
+}
+
+/*
+ * Wait for the next BIO to complete.  Remove it and return it.  NULL is
+ * returned once all BIOs have been completed.  This must only be called once
+ * all bios have been issued so that dio->refcount can only decrease.  This
+ * requires that that the caller hold a reference on the dio.
+ */
+static struct bio *dio_await_one(struct dio *dio)
+{
+	unsigned long flags;
+	struct bio *bio = NULL;
+
+	spin_lock_irqsave(&dio->bio_lock, flags);
+
+	/*
+	 * Wait as long as the list is empty and there are bios in flight.  bio
+	 * completion drops the count, maybe adds to the list, and wakes while
+	 * holding the bio_lock so we don't need set_current_state()'s barrier
+	 * and can call it after testing our condition.
+	 */
+	while (dio->refcount > 1 && dio->bio_list == NULL) {
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		dio->waiter = current;
+		spin_unlock_irqrestore(&dio->bio_lock, flags);
+		io_schedule();
+		/* wake up sets us TASK_RUNNING */
+		spin_lock_irqsave(&dio->bio_lock, flags);
+		dio->waiter = NULL;
+	}
+	if (dio->bio_list) {
+		bio = dio->bio_list;
+		dio->bio_list = bio->bi_private;
+	}
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+	return bio;
+}
+
+/*
+ * Process one completed BIO.  No locks are held.
+ */
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec;
+	unsigned i;
+
+	if (!uptodate)
+		dio->io_error = -EIO;
+
+	if (dio->is_async && dio->rw == READ) {
+		bio_check_pages_dirty(bio);	/* transfers ownership */
+	} else {
+		bio_for_each_segment_all(bvec, bio, i) {
+			struct page *page = bvec->bv_page;
+
+			if (dio->rw == READ && !PageCompound(page))
+				set_page_dirty_lock(page);
+			page_cache_release(page);
+		}
+		bio_put(bio);
+	}
+	return uptodate ? 0 : -EIO;
+}
+
+/*
+ * Wait on and process all in-flight BIOs.  This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through dio_bio_complete.  IO
+ * errors are propagated through dio->io_error and should be propagated via
+ * dio_complete().
+ */
+static void dio_await_completion(struct dio *dio)
+{
+	struct bio *bio;
+	do {
+		bio = dio_await_one(dio);
+		if (bio)
+			dio_bio_complete(dio, bio);
+	} while (bio);
+}
+
+/*
+ * A really large O_DIRECT read or write can generate a lot of BIOs.  So
+ * to keep the memory consumption sane we periodically reap any completed BIOs
+ * during the BIO generation phase.
+ *
+ * This also helps to limit the peak amount of pinned userspace memory.
+ */
+static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
+{
+	int ret = 0;
+
+	if (sdio->reap_counter++ >= 64) {
+		while (dio->bio_list) {
+			unsigned long flags;
+			struct bio *bio;
+			int ret2;
+
+			spin_lock_irqsave(&dio->bio_lock, flags);
+			bio = dio->bio_list;
+			dio->bio_list = bio->bi_private;
+			spin_unlock_irqrestore(&dio->bio_lock, flags);
+			ret2 = dio_bio_complete(dio, bio);
+			if (ret == 0)
+				ret = ret2;
+		}
+		sdio->reap_counter = 0;
+	}
+	return ret;
+}
+
+/*
+ * Create workqueue for deferred direct IO completions. We allocate the
+ * workqueue when it's first needed. This avoids creating workqueue for
+ * filesystems that don't need it and also allows us to create the workqueue
+ * late enough so the we can include s_id in the name of the workqueue.
+ */
+static int sb_init_dio_done_wq(struct super_block *sb)
+{
+	struct workqueue_struct *old;
+	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
+						      WQ_MEM_RECLAIM, 0,
+						      sb->s_id);
+	if (!wq)
+		return -ENOMEM;
+	/*
+	 * This has to be atomic as more DIOs can race to create the workqueue
+	 */
+	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
+	/* Someone created workqueue before us? Free ours... */
+	if (old)
+		destroy_workqueue(wq);
+	return 0;
+}
+
+static int dio_set_defer_completion(struct dio *dio)
+{
+	struct super_block *sb = dio->inode->i_sb;
+
+	if (dio->defer_completion)
+		return 0;
+	dio->defer_completion = true;
+	if (!sb->s_dio_done_wq)
+		return sb_init_dio_done_wq(sb);
+	return 0;
+}
+
+/*
+ * Call into the fs to map some more disk blocks.  We record the current number
+ * of available blocks at sdio->blocks_available.  These are in units of the
+ * fs blocksize, (1 << inode->i_blkbits).
+ *
+ * The fs is allowed to map lots of blocks at once.  If it wants to do that,
+ * it uses the passed inode-relative block number as the file offset, as usual.
+ *
+ * get_block() is passed the number of i_blkbits-sized blocks which direct_io
+ * has remaining to do.  The fs should not map more than this number of blocks.
+ *
+ * If the fs has mapped a lot of blocks, it should populate bh->b_size to
+ * indicate how much contiguous disk space has been made available at
+ * bh->b_blocknr.
+ *
+ * If *any* of the mapped blocks are new, then the fs must set buffer_new().
+ * This isn't very efficient...
+ *
+ * In the case of filesystem holes: the fs may return an arbitrarily-large
+ * hole by returning an appropriate value in b_size and by clearing
+ * buffer_mapped().  However the direct-io code will only process holes one
+ * block at a time - it will repeatedly call get_block() as it walks the hole.
+ */
+static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
+			   struct buffer_head *map_bh)
+{
+	int ret;
+	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
+	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
+	unsigned long fs_count;	/* Number of filesystem-sized blocks */
+	int create;
+	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
+
+	/*
+	 * If there was a memory error and we've overwritten all the
+	 * mapped blocks then we can now return that memory error
+	 */
+	ret = dio->page_errors;
+	if (ret == 0) {
+		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
+		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
+		fs_endblk = (sdio->final_block_in_request - 1) >>
+					sdio->blkfactor;
+		fs_count = fs_endblk - fs_startblk + 1;
+
+		map_bh->b_state = 0;
+		map_bh->b_size = fs_count << i_blkbits;
+
+		/*
+		 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
+		 * forbid block creations: only overwrites are permitted.
+		 * We will return early to the caller once we see an
+		 * unmapped buffer head returned, and the caller will fall
+		 * back to buffered I/O.
+		 *
+		 * Otherwise the decision is left to the get_blocks method,
+		 * which may decide to handle it or also return an unmapped
+		 * buffer head.
+		 */
+		create = dio->rw & WRITE;
+		if (dio->flags & DIO_SKIP_HOLES) {
+			if (sdio->block_in_file < (i_size_read(dio->inode) >>
+							sdio->blkbits))
+				create = 0;
+		}
+
+		ret = (*sdio->get_block)(dio->inode, fs_startblk,
+						map_bh, create);
+
+		/* Store for completion */
+		dio->private = map_bh->b_private;
+
+		if (ret == 0 && buffer_defer_completion(map_bh))
+			ret = dio_set_defer_completion(dio);
+	}
+	return ret;
+}
+
+/*
+ * There is no bio.  Make one now.
+ */
+static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
+		sector_t start_sector, struct buffer_head *map_bh)
+{
+	sector_t sector;
+	int ret, nr_pages;
+
+	ret = dio_bio_reap(dio, sdio);
+	if (ret)
+		goto out;
+	sector = start_sector << (sdio->blkbits - 9);
+	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
+	BUG_ON(nr_pages <= 0);
+	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
+	sdio->boundary = 0;
+out:
+	return ret;
+}
+
+/*
+ * Attempt to put the current chunk of 'cur_page' into the current BIO.  If
+ * that was successful then update final_block_in_bio and take a ref against
+ * the just-added page.
+ *
+ * Return zero on success.  Non-zero means the caller needs to start a new BIO.
+ */
+static inline int dio_bio_add_page(struct dio_submit *sdio)
+{
+	int ret;
+
+	ret = bio_add_page(sdio->bio, sdio->cur_page,
+			sdio->cur_page_len, sdio->cur_page_offset);
+	if (ret == sdio->cur_page_len) {
+		/*
+		 * Decrement count only, if we are done with this page
+		 */
+		if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
+			sdio->pages_in_io--;
+		page_cache_get(sdio->cur_page);
+		sdio->final_block_in_bio = sdio->cur_page_block +
+			(sdio->cur_page_len >> sdio->blkbits);
+		ret = 0;
+	} else {
+		ret = 1;
+	}
+	return ret;
+}
+		
+/*
+ * Put cur_page under IO.  The section of cur_page which is described by
+ * cur_page_offset,cur_page_len is put into a BIO.  The section of cur_page
+ * starts on-disk at cur_page_block.
+ *
+ * We take a ref against the page here (on behalf of its presence in the bio).
+ *
+ * The caller of this function is responsible for removing cur_page from the
+ * dio, and for dropping the refcount which came from that presence.
+ */
+static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
+		struct buffer_head *map_bh)
+{
+	int ret = 0;
+
+	if (sdio->bio) {
+		loff_t cur_offset = sdio->cur_page_fs_offset;
+		loff_t bio_next_offset = sdio->logical_offset_in_bio +
+			sdio->bio->bi_iter.bi_size;
+
+		/*
+		 * See whether this new request is contiguous with the old.
+		 *
+		 * Btrfs cannot handle having logically non-contiguous requests
+		 * submitted.  For example if you have
+		 *
+		 * Logical:  [0-4095][HOLE][8192-12287]
+		 * Physical: [0-4095]      [4096-8191]
+		 *
+		 * We cannot submit those pages together as one BIO.  So if our
+		 * current logical offset in the file does not equal what would
+		 * be the next logical offset in the bio, submit the bio we
+		 * have.
+		 */
+		if (sdio->final_block_in_bio != sdio->cur_page_block ||
+		    cur_offset != bio_next_offset)
+			dio_bio_submit(dio, sdio);
+	}
+
+	if (sdio->bio == NULL) {
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
+		if (ret)
+			goto out;
+	}
+
+	if (dio_bio_add_page(sdio) != 0) {
+		dio_bio_submit(dio, sdio);
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
+		if (ret == 0) {
+			ret = dio_bio_add_page(sdio);
+			BUG_ON(ret != 0);
+		}
+	}
+out:
+	return ret;
+}
+
+/*
+ * An autonomous function to put a chunk of a page under deferred IO.
+ *
+ * The caller doesn't actually know (or care) whether this piece of page is in
+ * a BIO, or is under IO or whatever.  We just take care of all possible 
+ * situations here.  The separation between the logic of do_direct_IO() and
+ * that of submit_page_section() is important for clarity.  Please don't break.
+ *
+ * The chunk of page starts on-disk at blocknr.
+ *
+ * We perform deferred IO, by recording the last-submitted page inside our
+ * private part of the dio structure.  If possible, we just expand the IO
+ * across that page here.
+ *
+ * If that doesn't work out then we put the old page into the bio and add this
+ * page to the dio instead.
+ */
+static inline int
+submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
+		    unsigned offset, unsigned len, sector_t blocknr,
+		    struct buffer_head *map_bh)
+{
+	int ret = 0;
+
+	if (dio->rw & WRITE) {
+		/*
+		 * Read accounting is performed in submit_bio()
+		 */
+		task_io_account_write(len);
+	}
+
+	/*
+	 * Can we just grow the current page's presence in the dio?
+	 */
+	if (sdio->cur_page == page &&
+	    sdio->cur_page_offset + sdio->cur_page_len == offset &&
+	    sdio->cur_page_block +
+	    (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
+		sdio->cur_page_len += len;
+		goto out;
+	}
+
+	/*
+	 * If there's a deferred page already there then send it.
+	 */
+	if (sdio->cur_page) {
+		ret = dio_send_cur_page(dio, sdio, map_bh);
+		page_cache_release(sdio->cur_page);
+		sdio->cur_page = NULL;
+		if (ret)
+			return ret;
+	}
+
+	page_cache_get(page);		/* It is in dio */
+	sdio->cur_page = page;
+	sdio->cur_page_offset = offset;
+	sdio->cur_page_len = len;
+	sdio->cur_page_block = blocknr;
+	sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
+out:
+	/*
+	 * If sdio->boundary then we want to schedule the IO now to
+	 * avoid metadata seeks.
+	 */
+	if (sdio->boundary) {
+		ret = dio_send_cur_page(dio, sdio, map_bh);
+		dio_bio_submit(dio, sdio);
+		page_cache_release(sdio->cur_page);
+		sdio->cur_page = NULL;
+	}
+	return ret;
+}
+
+/*
+ * Clean any dirty buffers in the blockdev mapping which alias newly-created
+ * file blocks.  Only called for S_ISREG files - blockdevs do not set
+ * buffer_new
+ */
+static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
+{
+	unsigned i;
+	unsigned nblocks;
+
+	nblocks = map_bh->b_size >> dio->inode->i_blkbits;
+
+	for (i = 0; i < nblocks; i++) {
+		unmap_underlying_metadata(map_bh->b_bdev,
+					  map_bh->b_blocknr + i);
+	}
+}
+
+/*
+ * If we are not writing the entire block and get_block() allocated
+ * the block for us, we need to fill-in the unused portion of the
+ * block with zeros. This happens only if user-buffer, fileoffset or
+ * io length is not filesystem block-size multiple.
+ *
+ * `end' is zero if we're doing the start of the IO, 1 at the end of the
+ * IO.
+ */
+static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
+		int end, struct buffer_head *map_bh)
+{
+	unsigned dio_blocks_per_fs_block;
+	unsigned this_chunk_blocks;	/* In dio_blocks */
+	unsigned this_chunk_bytes;
+	struct page *page;
+
+	sdio->start_zero_done = 1;
+	if (!sdio->blkfactor || !buffer_new(map_bh))
+		return;
+
+	dio_blocks_per_fs_block = 1 << sdio->blkfactor;
+	this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
+
+	if (!this_chunk_blocks)
+		return;
+
+	/*
+	 * We need to zero out part of an fs block.  It is either at the
+	 * beginning or the end of the fs block.
+	 */
+	if (end) 
+		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
+
+	this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
+
+	page = ZERO_PAGE(0);
+	if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
+				sdio->next_block_for_io, map_bh))
+		return;
+
+	sdio->next_block_for_io += this_chunk_blocks;
+}
+
+/*
+ * Walk the user pages, and the file, mapping blocks to disk and generating
+ * a sequence of (page,offset,len,block) mappings.  These mappings are injected
+ * into submit_page_section(), which takes care of the next stage of submission
+ *
+ * Direct IO against a blockdev is different from a file.  Because we can
+ * happily perform page-sized but 512-byte aligned IOs.  It is important that
+ * blockdev IO be able to have fine alignment and large sizes.
+ *
+ * So what we do is to permit the ->get_block function to populate bh.b_size
+ * with the size of IO which is permitted at this offset and this i_blkbits.
+ *
+ * For best results, the blockdev should be set up with 512-byte i_blkbits and
+ * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
+ * fine alignment but still allows this function to work in PAGE_SIZE units.
+ */
+static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
+			struct buffer_head *map_bh)
+{
+	const unsigned blkbits = sdio->blkbits;
+	int ret = 0;
+
+	while (sdio->block_in_file < sdio->final_block_in_request) {
+		struct page *page;
+		size_t from, to;
+
+		page = dio_get_page(dio, sdio);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			goto out;
+		}
+		from = sdio->head ? 0 : sdio->from;
+		to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
+		sdio->head++;
+
+		while (from < to) {
+			unsigned this_chunk_bytes;	/* # of bytes mapped */
+			unsigned this_chunk_blocks;	/* # of blocks */
+			unsigned u;
+
+			if (sdio->blocks_available == 0) {
+				/*
+				 * Need to go and map some more disk
+				 */
+				unsigned long blkmask;
+				unsigned long dio_remainder;
+
+				ret = get_more_blocks(dio, sdio, map_bh);
+				if (ret) {
+					page_cache_release(page);
+					goto out;
+				}
+				if (!buffer_mapped(map_bh))
+					goto do_holes;
+
+				sdio->blocks_available =
+						map_bh->b_size >> sdio->blkbits;
+				sdio->next_block_for_io =
+					map_bh->b_blocknr << sdio->blkfactor;
+				if (buffer_new(map_bh))
+					clean_blockdev_aliases(dio, map_bh);
+
+				if (!sdio->blkfactor)
+					goto do_holes;
+
+				blkmask = (1 << sdio->blkfactor) - 1;
+				dio_remainder = (sdio->block_in_file & blkmask);
+
+				/*
+				 * If we are at the start of IO and that IO
+				 * starts partway into a fs-block,
+				 * dio_remainder will be non-zero.  If the IO
+				 * is a read then we can simply advance the IO
+				 * cursor to the first block which is to be
+				 * read.  But if the IO is a write and the
+				 * block was newly allocated we cannot do that;
+				 * the start of the fs block must be zeroed out
+				 * on-disk
+				 */
+				if (!buffer_new(map_bh))
+					sdio->next_block_for_io += dio_remainder;
+				sdio->blocks_available -= dio_remainder;
+			}
+do_holes:
+			/* Handle holes */
+			if (!buffer_mapped(map_bh)) {
+				loff_t i_size_aligned;
+
+				/* AKPM: eargh, -ENOTBLK is a hack */
+				if (dio->rw & WRITE) {
+					page_cache_release(page);
+					return -ENOTBLK;
+				}
+
+				/*
+				 * Be sure to account for a partial block as the
+				 * last block in the file
+				 */
+				i_size_aligned = ALIGN(i_size_read(dio->inode),
+							1 << blkbits);
+				if (sdio->block_in_file >=
+						i_size_aligned >> blkbits) {
+					/* We hit eof */
+					page_cache_release(page);
+					goto out;
+				}
+				zero_user(page, from, 1 << blkbits);
+				sdio->block_in_file++;
+				from += 1 << blkbits;
+				dio->result += 1 << blkbits;
+				goto next_block;
+			}
+
+			/*
+			 * If we're performing IO which has an alignment which
+			 * is finer than the underlying fs, go check to see if
+			 * we must zero out the start of this block.
+			 */
+			if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
+				dio_zero_block(dio, sdio, 0, map_bh);
+
+			/*
+			 * Work out, in this_chunk_blocks, how much disk we
+			 * can add to this page
+			 */
+			this_chunk_blocks = sdio->blocks_available;
+			u = (to - from) >> blkbits;
+			if (this_chunk_blocks > u)
+				this_chunk_blocks = u;
+			u = sdio->final_block_in_request - sdio->block_in_file;
+			if (this_chunk_blocks > u)
+				this_chunk_blocks = u;
+			this_chunk_bytes = this_chunk_blocks << blkbits;
+			BUG_ON(this_chunk_bytes == 0);
+
+			if (this_chunk_blocks == sdio->blocks_available)
+				sdio->boundary = buffer_boundary(map_bh);
+			ret = submit_page_section(dio, sdio, page,
+						  from,
+						  this_chunk_bytes,
+						  sdio->next_block_for_io,
+						  map_bh);
+			if (ret) {
+				page_cache_release(page);
+				goto out;
+			}
+			sdio->next_block_for_io += this_chunk_blocks;
+
+			sdio->block_in_file += this_chunk_blocks;
+			from += this_chunk_bytes;
+			dio->result += this_chunk_bytes;
+			sdio->blocks_available -= this_chunk_blocks;
+next_block:
+			BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
+			if (sdio->block_in_file == sdio->final_block_in_request)
+				break;
+		}
+
+		/* Drop the ref which was taken in get_user_pages() */
+		page_cache_release(page);
+	}
+out:
+	return ret;
+}
+
+static inline int drop_refcount(struct dio *dio)
+{
+	int ret2;
+	unsigned long flags;
+
+	/*
+	 * Sync will always be dropping the final ref and completing the
+	 * operation.  AIO can if it was a broken operation described above or
+	 * in fact if all the bios race to complete before we get here.  In
+	 * that case dio_complete() translates the EIOCBQUEUED into the proper
+	 * return code that the caller will hand to ->complete().
+	 *
+	 * This is managed by the bio_lock instead of being an atomic_t so that
+	 * completion paths can drop their ref and use the remaining count to
+	 * decide to wake the submission path atomically.
+	 */
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	ret2 = --dio->refcount;
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+	return ret2;
+}
+
+/*
+ * This is a library function for use by filesystem drivers.
+ *
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *
+ * To help with locking against truncate we incremented the i_dio_count
+ * counter before starting direct I/O, and decrement it once we are done.
+ * Truncate can wait for it to reach zero to provide exclusion.  It is
+ * expected that filesystem provide exclusion between new direct I/O
+ * and truncates.  For DIO_LOCKING filesystems this is done by i_mutex,
+ * but other filesystems need to take care of this on their own.
+ *
+ * NOTE: if you pass "sdio" to anything by pointer make sure that function
+ * is always inlined. Otherwise gcc is unable to split the structure into
+ * individual fields and will generate much worse code. This is important
+ * for the whole file.
+ */
+static inline ssize_t
+do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
+		      struct block_device *bdev, struct iov_iter *iter,
+		      loff_t offset, get_block_t get_block, dio_iodone_t end_io,
+		      dio_submit_t submit_io, int flags)
+{
+	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
+	unsigned blkbits = i_blkbits;
+	unsigned blocksize_mask = (1 << blkbits) - 1;
+	ssize_t retval = -EINVAL;
+	size_t count = iov_iter_count(iter);
+	loff_t end = offset + count;
+	struct dio *dio;
+	struct dio_submit sdio = { 0, };
+	struct buffer_head map_bh = { 0, };
+	struct blk_plug plug;
+	unsigned long align = offset | iov_iter_alignment(iter);
+
+	/*
+	 * Avoid references to bdev if not absolutely needed to give
+	 * the early prefetch in the caller enough time.
+	 */
+
+	if (align & blocksize_mask) {
+		if (bdev)
+			blkbits = blksize_bits(bdev_logical_block_size(bdev));
+		blocksize_mask = (1 << blkbits) - 1;
+		if (align & blocksize_mask)
+			goto out;
+	}
+
+	/* watch out for a 0 len io from a tricksy fs */
+	if (iov_iter_rw(iter) == READ && !iov_iter_count(iter))
+		return 0;
+
+	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
+	retval = -ENOMEM;
+	if (!dio)
+		goto out;
+	/*
+	 * Believe it or not, zeroing out the page array caused a .5%
+	 * performance regression in a database benchmark.  So, we take
+	 * care to only zero out what's needed.
+	 */
+	memset(dio, 0, offsetof(struct dio, pages));
+
+	dio->flags = flags;
+	if (dio->flags & DIO_LOCKING) {
+		if (iov_iter_rw(iter) == READ) {
+			struct address_space *mapping =
+					iocb->ki_filp->f_mapping;
+
+			/* will be released by direct_io_worker */
+			mutex_lock(&inode->i_mutex);
+
+			retval = filemap_write_and_wait_range(mapping, offset,
+							      end - 1);
+			if (retval) {
+				mutex_unlock(&inode->i_mutex);
+				kmem_cache_free(dio_cache, dio);
+				goto out;
+			}
+		}
+	}
+
+	/*
+	 * For file extending writes updating i_size before data writeouts
+	 * complete can expose uninitialized blocks in dumb filesystems.
+	 * In that case we need to wait for I/O completion even if asked
+	 * for an asynchronous write.
+	 */
+	if (is_sync_kiocb(iocb))
+		dio->is_async = false;
+	else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
+		 iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
+		dio->is_async = false;
+	else
+		dio->is_async = true;
+
+	dio->inode = inode;
+	dio->rw = iov_iter_rw(iter) == WRITE ? WRITE_ODIRECT : READ;
+
+	/*
+	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
+	 * so that we can call ->fsync.
+	 */
+	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
+	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
+	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
+		retval = dio_set_defer_completion(dio);
+		if (retval) {
+			/*
+			 * We grab i_mutex only for reads so we don't have
+			 * to release it here
+			 */
+			kmem_cache_free(dio_cache, dio);
+			goto out;
+		}
+	}
+
+	/*
+	 * Will be decremented at I/O completion time.
+	 */
+	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
+		inode_dio_begin(inode);
+
+	retval = 0;
+	sdio.blkbits = blkbits;
+	sdio.blkfactor = i_blkbits - blkbits;
+	sdio.block_in_file = offset >> blkbits;
+
+	sdio.get_block = get_block;
+	dio->end_io = end_io;
+	sdio.submit_io = submit_io;
+	sdio.final_block_in_bio = -1;
+	sdio.next_block_for_io = -1;
+
+	dio->iocb = iocb;
+	dio->i_size = i_size_read(inode);
+
+	spin_lock_init(&dio->bio_lock);
+	dio->refcount = 1;
+
+	sdio.iter = iter;
+	sdio.final_block_in_request =
+		(offset + iov_iter_count(iter)) >> blkbits;
+
+	/*
+	 * In case of non-aligned buffers, we may need 2 more
+	 * pages since we need to zero out first and last block.
+	 */
+	if (unlikely(sdio.blkfactor))
+		sdio.pages_in_io = 2;
+
+	sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);
+
+	blk_start_plug(&plug);
+
+	retval = do_direct_IO(dio, &sdio, &map_bh);
+	if (retval)
+		dio_cleanup(dio, &sdio);
+
+	if (retval == -ENOTBLK) {
+		/*
+		 * The remaining part of the request will be
+		 * be handled by buffered I/O when we return
+		 */
+		retval = 0;
+	}
+	/*
+	 * There may be some unwritten disk at the end of a part-written
+	 * fs-block-sized block.  Go zero that now.
+	 */
+	dio_zero_block(dio, &sdio, 1, &map_bh);
+
+	if (sdio.cur_page) {
+		ssize_t ret2;
+
+		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
+		if (retval == 0)
+			retval = ret2;
+		page_cache_release(sdio.cur_page);
+		sdio.cur_page = NULL;
+	}
+	if (sdio.bio)
+		dio_bio_submit(dio, &sdio);
+
+	blk_finish_plug(&plug);
+
+	/*
+	 * It is possible that, we return short IO due to end of file.
+	 * In that case, we need to release all the pages we got hold on.
+	 */
+	dio_cleanup(dio, &sdio);
+
+	/*
+	 * All block lookups have been performed. For READ requests
+	 * we can let i_mutex go now that its achieved its purpose
+	 * of protecting us from looking up uninitialized blocks.
+	 */
+	if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
+		mutex_unlock(&dio->inode->i_mutex);
+
+	/*
+	 * The only time we want to leave bios in flight is when a successful
+	 * partial aio read or full aio write have been setup.  In that case
+	 * bio completion will call aio_complete.  The only time it's safe to
+	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+	 * This had *better* be the only place that raises -EIOCBQUEUED.
+	 */
+	BUG_ON(retval == -EIOCBQUEUED);
+	if (dio->is_async && retval == 0 && dio->result &&
+	    (iov_iter_rw(iter) == READ || dio->result == count))
+		retval = -EIOCBQUEUED;
+	else
+		dio_await_completion(dio);
+
+	if (drop_refcount(dio) == 0) {
+		retval = dio_complete(dio, offset, retval, false);
+	} else
+		BUG_ON(retval != -EIOCBQUEUED);
+
+out:
+	return retval;
+}
+
+ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
+			     struct block_device *bdev, struct iov_iter *iter,
+			     loff_t offset, get_block_t get_block,
+			     dio_iodone_t end_io, dio_submit_t submit_io,
+			     int flags)
+{
+	/*
+	 * The block device state is needed in the end to finally
+	 * submit everything.  Since it's likely to be cache cold
+	 * prefetch it here as first thing to hide some of the
+	 * latency.
+	 *
+	 * Attempt to prefetch the pieces we likely need later.
+	 */
+	prefetch(&bdev->bd_disk->part_tbl);
+	prefetch(bdev->bd_queue);
+	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+
+	return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block,
+				     end_io, submit_io, flags);
+}
+
+EXPORT_SYMBOL(__blockdev_direct_IO);
+
+static __init int dio_init(void)
+{
+	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
+	return 0;
+}
+module_init(dio_init)
diff --git a/kernel/fs/dlm/Kconfig b/kernel/fs/dlm/Kconfig
new file mode 100644
index 000000000..e4242c3f8
--- /dev/null
+++ b/kernel/fs/dlm/Kconfig
@@ -0,0 +1,16 @@
+menuconfig DLM
+	tristate "Distributed Lock Manager (DLM)"
+	depends on INET
+	depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
+	select IP_SCTP
+	help
+	A general purpose distributed lock manager for kernel or userspace
+	applications.
+
+config DLM_DEBUG
+	bool "DLM debugging"
+	depends on DLM
+	help
+	Under the debugfs mount point, the name of each lockspace will
+	appear as a file in the "dlm" directory.  The output is the
+	list of resource and locks the local node knows about.
diff --git a/kernel/fs/dlm/Makefile b/kernel/fs/dlm/Makefile
new file mode 100644
index 000000000..ca1c9124c
--- /dev/null
+++ b/kernel/fs/dlm/Makefile
@@ -0,0 +1,21 @@
+obj-$(CONFIG_DLM) +=		dlm.o
+dlm-y :=			ast.o \
+				config.o \
+				dir.o \
+				lock.o \
+				lockspace.o \
+				main.o \
+				member.o \
+				memory.o \
+				midcomms.o \
+				netlink.o \
+				lowcomms.o \
+				plock.o \
+				rcom.o \
+				recover.o \
+				recoverd.o \
+				requestqueue.o \
+				user.o \
+				util.o 
+dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
+
diff --git a/kernel/fs/dlm/ast.c b/kernel/fs/dlm/ast.c
new file mode 100644
index 000000000..dcea1e37a
--- /dev/null
+++ b/kernel/fs/dlm/ast.c
@@ -0,0 +1,314 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lock.h"
+#include "user.h"
+#include "ast.h"
+
+static uint64_t dlm_cb_seq;
+static DEFINE_SPINLOCK(dlm_cb_seq_spin);
+
+static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
+{
+	int i;
+
+	log_print("last_bast %x %llu flags %x mode %d sb %d %x",
+		  lkb->lkb_id,
+		  (unsigned long long)lkb->lkb_last_bast.seq,
+		  lkb->lkb_last_bast.flags,
+		  lkb->lkb_last_bast.mode,
+		  lkb->lkb_last_bast.sb_status,
+		  lkb->lkb_last_bast.sb_flags);
+
+	log_print("last_cast %x %llu flags %x mode %d sb %d %x",
+		  lkb->lkb_id,
+		  (unsigned long long)lkb->lkb_last_cast.seq,
+		  lkb->lkb_last_cast.flags,
+		  lkb->lkb_last_cast.mode,
+		  lkb->lkb_last_cast.sb_status,
+		  lkb->lkb_last_cast.sb_flags);
+
+	for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
+		log_print("cb %x %llu flags %x mode %d sb %d %x",
+			  lkb->lkb_id,
+			  (unsigned long long)lkb->lkb_callbacks[i].seq,
+			  lkb->lkb_callbacks[i].flags,
+			  lkb->lkb_callbacks[i].mode,
+			  lkb->lkb_callbacks[i].sb_status,
+			  lkb->lkb_callbacks[i].sb_flags);
+	}
+}
+
+int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+			 int status, uint32_t sbflags, uint64_t seq)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	uint64_t prev_seq;
+	int prev_mode;
+	int i, rv;
+
+	for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
+		if (lkb->lkb_callbacks[i].seq)
+			continue;
+
+		/*
+		 * Suppress some redundant basts here, do more on removal.
+		 * Don't even add a bast if the callback just before it
+		 * is a bast for the same mode or a more restrictive mode.
+		 * (the addional > PR check is needed for PR/CW inversion)
+		 */
+
+		if ((i > 0) && (flags & DLM_CB_BAST) &&
+		    (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
+
+			prev_seq = lkb->lkb_callbacks[i-1].seq;
+			prev_mode = lkb->lkb_callbacks[i-1].mode;
+
+			if ((prev_mode == mode) ||
+			    (prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
+
+				log_debug(ls, "skip %x add bast %llu mode %d "
+					  "for bast %llu mode %d",
+					  lkb->lkb_id,
+					  (unsigned long long)seq,
+					  mode,
+					  (unsigned long long)prev_seq,
+					  prev_mode);
+				rv = 0;
+				goto out;
+			}
+		}
+
+		lkb->lkb_callbacks[i].seq = seq;
+		lkb->lkb_callbacks[i].flags = flags;
+		lkb->lkb_callbacks[i].mode = mode;
+		lkb->lkb_callbacks[i].sb_status = status;
+		lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
+		rv = 0;
+		break;
+	}
+
+	if (i == DLM_CALLBACKS_SIZE) {
+		log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
+			  lkb->lkb_id, (unsigned long long)seq,
+			  flags, mode, status, sbflags);
+		dlm_dump_lkb_callbacks(lkb);
+		rv = -1;
+		goto out;
+	}
+ out:
+	return rv;
+}
+
+int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			 struct dlm_callback *cb, int *resid)
+{
+	int i, rv;
+
+	*resid = 0;
+
+	if (!lkb->lkb_callbacks[0].seq) {
+		rv = -ENOENT;
+		goto out;
+	}
+
+	/* oldest undelivered cb is callbacks[0] */
+
+	memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
+	memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
+
+	/* shift others down */
+
+	for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
+		if (!lkb->lkb_callbacks[i].seq)
+			break;
+		memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
+		       sizeof(struct dlm_callback));
+		memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
+		(*resid)++;
+	}
+
+	/* if cb is a bast, it should be skipped if the blocking mode is
+	   compatible with the last granted mode */
+
+	if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
+		if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
+			cb->flags |= DLM_CB_SKIP;
+
+			log_debug(ls, "skip %x bast %llu mode %d "
+				  "for cast %llu mode %d",
+				  lkb->lkb_id,
+				  (unsigned long long)cb->seq,
+				  cb->mode,
+				  (unsigned long long)lkb->lkb_last_cast.seq,
+				  lkb->lkb_last_cast.mode);
+			rv = 0;
+			goto out;
+		}
+	}
+
+	if (cb->flags & DLM_CB_CAST) {
+		memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
+		lkb->lkb_last_cast_time = ktime_get();
+	}
+
+	if (cb->flags & DLM_CB_BAST) {
+		memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
+		lkb->lkb_last_bast_time = ktime_get();
+	}
+	rv = 0;
+ out:
+	return rv;
+}
+
+void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
+		uint32_t sbflags)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	uint64_t new_seq, prev_seq;
+	int rv;
+
+	spin_lock(&dlm_cb_seq_spin);
+	new_seq = ++dlm_cb_seq;
+	spin_unlock(&dlm_cb_seq_spin);
+
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq);
+		return;
+	}
+
+	mutex_lock(&lkb->lkb_cb_mutex);
+	prev_seq = lkb->lkb_callbacks[0].seq;
+
+	rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq);
+	if (rv < 0)
+		goto out;
+
+	if (!prev_seq) {
+		kref_get(&lkb->lkb_ref);
+
+		if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
+			mutex_lock(&ls->ls_cb_mutex);
+			list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
+			mutex_unlock(&ls->ls_cb_mutex);
+		} else {
+			queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
+		}
+	}
+ out:
+	mutex_unlock(&lkb->lkb_cb_mutex);
+}
+
+void dlm_callback_work(struct work_struct *work)
+{
+	struct dlm_lkb *lkb = container_of(work, struct dlm_lkb, lkb_cb_work);
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	void (*castfn) (void *astparam);
+	void (*bastfn) (void *astparam, int mode);
+	struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
+	int i, rv, resid;
+
+	memset(&callbacks, 0, sizeof(callbacks));
+
+	mutex_lock(&lkb->lkb_cb_mutex);
+	if (!lkb->lkb_callbacks[0].seq) {
+		/* no callback work exists, shouldn't happen */
+		log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id);
+		dlm_print_lkb(lkb);
+		dlm_dump_lkb_callbacks(lkb);
+	}
+
+	for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
+		rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
+		if (rv < 0)
+			break;
+	}
+
+	if (resid) {
+		/* cbs remain, loop should have removed all, shouldn't happen */
+		log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id,
+			  resid);
+		dlm_print_lkb(lkb);
+		dlm_dump_lkb_callbacks(lkb);
+	}
+	mutex_unlock(&lkb->lkb_cb_mutex);
+
+	castfn = lkb->lkb_astfn;
+	bastfn = lkb->lkb_bastfn;
+
+	for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
+		if (!callbacks[i].seq)
+			break;
+		if (callbacks[i].flags & DLM_CB_SKIP) {
+			continue;
+		} else if (callbacks[i].flags & DLM_CB_BAST) {
+			bastfn(lkb->lkb_astparam, callbacks[i].mode);
+		} else if (callbacks[i].flags & DLM_CB_CAST) {
+			lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
+			lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
+			castfn(lkb->lkb_astparam);
+		}
+	}
+
+	/* undo kref_get from dlm_add_callback, may cause lkb to be freed */
+	dlm_put_lkb(lkb);
+}
+
+int dlm_callback_start(struct dlm_ls *ls)
+{
+	ls->ls_callback_wq = alloc_workqueue("dlm_callback",
+					     WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+	if (!ls->ls_callback_wq) {
+		log_print("can't start dlm_callback workqueue");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void dlm_callback_stop(struct dlm_ls *ls)
+{
+	if (ls->ls_callback_wq)
+		destroy_workqueue(ls->ls_callback_wq);
+}
+
+void dlm_callback_suspend(struct dlm_ls *ls)
+{
+	set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+
+	if (ls->ls_callback_wq)
+		flush_workqueue(ls->ls_callback_wq);
+}
+
+void dlm_callback_resume(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb, *safe;
+	int count = 0;
+
+	clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
+
+	if (!ls->ls_callback_wq)
+		return;
+
+	mutex_lock(&ls->ls_cb_mutex);
+	list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
+		list_del_init(&lkb->lkb_cb_list);
+		queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
+		count++;
+	}
+	mutex_unlock(&ls->ls_cb_mutex);
+
+	if (count)
+		log_rinfo(ls, "dlm_callback_resume %d", count);
+}
+
diff --git a/kernel/fs/dlm/ast.h b/kernel/fs/dlm/ast.h
new file mode 100644
index 000000000..757b551c6
--- /dev/null
+++ b/kernel/fs/dlm/ast.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+
+void dlm_del_ast(struct dlm_lkb *lkb);
+int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+                         int status, uint32_t sbflags, uint64_t seq);
+int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                         struct dlm_callback *cb, int *resid);
+void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
+                uint32_t sbflags);
+
+void dlm_callback_work(struct work_struct *work);
+int dlm_callback_start(struct dlm_ls *ls);
+void dlm_callback_stop(struct dlm_ls *ls);
+void dlm_callback_suspend(struct dlm_ls *ls);
+void dlm_callback_resume(struct dlm_ls *ls);
+
+#endif
+
+
diff --git a/kernel/fs/dlm/config.c b/kernel/fs/dlm/config.c
new file mode 100644
index 000000000..d521bddf8
--- /dev/null
+++ b/kernel/fs/dlm/config.c
@@ -0,0 +1,1040 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/dlmconstants.h>
+#include <net/ipv6.h>
+#include <net/sock.h>
+
+#include "config.h"
+#include "lowcomms.h"
+
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr      (write only)
+ * /config/dlm/<cluster>/comms/<comm>/addr_list (read only)
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
+
+static struct config_group *space_list;
+static struct config_group *comm_list;
+static struct dlm_comm *local_comm;
+static uint32_t dlm_comm_count;
+
+struct dlm_clusters;
+struct dlm_cluster;
+struct dlm_spaces;
+struct dlm_space;
+struct dlm_comms;
+struct dlm_comm;
+struct dlm_nodes;
+struct dlm_node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+
+static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
+			    char *buf);
+static ssize_t store_cluster(struct config_item *i,
+			     struct configfs_attribute *a,
+			     const char *buf, size_t len);
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf);
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len);
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf);
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len);
+
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+				size_t len);
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+				size_t len);
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
+				size_t len);
+static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf);
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+				size_t len);
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+				size_t len);
+
+struct dlm_cluster {
+	struct config_group group;
+	unsigned int cl_tcp_port;
+	unsigned int cl_buffer_size;
+	unsigned int cl_rsbtbl_size;
+	unsigned int cl_recover_timer;
+	unsigned int cl_toss_secs;
+	unsigned int cl_scan_secs;
+	unsigned int cl_log_debug;
+	unsigned int cl_protocol;
+	unsigned int cl_timewarn_cs;
+	unsigned int cl_waitwarn_us;
+	unsigned int cl_new_rsb_count;
+	unsigned int cl_recover_callbacks;
+	char cl_cluster_name[DLM_LOCKSPACE_LEN];
+};
+
+enum {
+	CLUSTER_ATTR_TCP_PORT = 0,
+	CLUSTER_ATTR_BUFFER_SIZE,
+	CLUSTER_ATTR_RSBTBL_SIZE,
+	CLUSTER_ATTR_RECOVER_TIMER,
+	CLUSTER_ATTR_TOSS_SECS,
+	CLUSTER_ATTR_SCAN_SECS,
+	CLUSTER_ATTR_LOG_DEBUG,
+	CLUSTER_ATTR_PROTOCOL,
+	CLUSTER_ATTR_TIMEWARN_CS,
+	CLUSTER_ATTR_WAITWARN_US,
+	CLUSTER_ATTR_NEW_RSB_COUNT,
+	CLUSTER_ATTR_RECOVER_CALLBACKS,
+	CLUSTER_ATTR_CLUSTER_NAME,
+};
+
+struct cluster_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct dlm_cluster *, char *);
+	ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
+};
+
+static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
+{
+	return sprintf(buf, "%s\n", cl->cl_cluster_name);
+}
+
+static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
+					  const char *buf, size_t len)
+{
+	strlcpy(dlm_config.ci_cluster_name, buf,
+				sizeof(dlm_config.ci_cluster_name));
+	strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
+	return len;
+}
+
+static struct cluster_attribute cluster_attr_cluster_name = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "cluster_name",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = cluster_cluster_name_read,
+	.store  = cluster_cluster_name_write,
+};
+
+static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
+			   int *info_field, int check_zero,
+			   const char *buf, size_t len)
+{
+	unsigned int x;
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	rc = kstrtouint(buf, 0, &x);
+	if (rc)
+		return rc;
+
+	if (check_zero && !x)
+		return -EINVAL;
+
+	*cl_field = x;
+	*info_field = x;
+
+	return len;
+}
+
+#define CLUSTER_ATTR(name, check_zero)                                        \
+static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
+{                                                                             \
+	return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name,         \
+			   check_zero, buf, len);                             \
+}                                                                             \
+static ssize_t name##_read(struct dlm_cluster *cl, char *buf)                 \
+{                                                                             \
+	return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name);               \
+}                                                                             \
+static struct cluster_attribute cluster_attr_##name =                         \
+__CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
+
+CLUSTER_ATTR(tcp_port, 1);
+CLUSTER_ATTR(buffer_size, 1);
+CLUSTER_ATTR(rsbtbl_size, 1);
+CLUSTER_ATTR(recover_timer, 1);
+CLUSTER_ATTR(toss_secs, 1);
+CLUSTER_ATTR(scan_secs, 1);
+CLUSTER_ATTR(log_debug, 0);
+CLUSTER_ATTR(protocol, 0);
+CLUSTER_ATTR(timewarn_cs, 1);
+CLUSTER_ATTR(waitwarn_us, 0);
+CLUSTER_ATTR(new_rsb_count, 0);
+CLUSTER_ATTR(recover_callbacks, 0);
+
+static struct configfs_attribute *cluster_attrs[] = {
+	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
+	[CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
+	[CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
+	[CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
+	[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
+	[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
+	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
+	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
+	[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
+	[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
+	[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
+	[CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
+	[CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
+	NULL,
+};
+
+enum {
+	COMM_ATTR_NODEID = 0,
+	COMM_ATTR_LOCAL,
+	COMM_ATTR_ADDR,
+	COMM_ATTR_ADDR_LIST,
+};
+
+struct comm_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct dlm_comm *, char *);
+	ssize_t (*store)(struct dlm_comm *, const char *, size_t);
+};
+
+static struct comm_attribute comm_attr_nodeid = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "nodeid",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = comm_nodeid_read,
+	.store  = comm_nodeid_write,
+};
+
+static struct comm_attribute comm_attr_local = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "local",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = comm_local_read,
+	.store  = comm_local_write,
+};
+
+static struct comm_attribute comm_attr_addr = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "addr",
+                    .ca_mode = S_IWUSR },
+	.store  = comm_addr_write,
+};
+
+static struct comm_attribute comm_attr_addr_list = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "addr_list",
+                    .ca_mode = S_IRUGO },
+	.show   = comm_addr_list_read,
+};
+
+static struct configfs_attribute *comm_attrs[] = {
+	[COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
+	[COMM_ATTR_LOCAL] = &comm_attr_local.attr,
+	[COMM_ATTR_ADDR] = &comm_attr_addr.attr,
+	[COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr,
+	NULL,
+};
+
+enum {
+	NODE_ATTR_NODEID = 0,
+	NODE_ATTR_WEIGHT,
+};
+
+struct node_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct dlm_node *, char *);
+	ssize_t (*store)(struct dlm_node *, const char *, size_t);
+};
+
+static struct node_attribute node_attr_nodeid = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "nodeid",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = node_nodeid_read,
+	.store  = node_nodeid_write,
+};
+
+static struct node_attribute node_attr_weight = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "weight",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = node_weight_read,
+	.store  = node_weight_write,
+};
+
+static struct configfs_attribute *node_attrs[] = {
+	[NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
+	[NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
+	NULL,
+};
+
+struct dlm_clusters {
+	struct configfs_subsystem subsys;
+};
+
+struct dlm_spaces {
+	struct config_group ss_group;
+};
+
+struct dlm_space {
+	struct config_group group;
+	struct list_head members;
+	struct mutex members_lock;
+	int members_count;
+};
+
+struct dlm_comms {
+	struct config_group cs_group;
+};
+
+struct dlm_comm {
+	struct config_item item;
+	int seq;
+	int nodeid;
+	int local;
+	int addr_count;
+	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+struct dlm_nodes {
+	struct config_group ns_group;
+};
+
+struct dlm_node {
+	struct config_item item;
+	struct list_head list; /* space->members */
+	int nodeid;
+	int weight;
+	int new;
+	int comm_seq; /* copy of cm->seq when nd->nodeid is set */
+};
+
+static struct configfs_group_operations clusters_ops = {
+	.make_group = make_cluster,
+	.drop_item = drop_cluster,
+};
+
+static struct configfs_item_operations cluster_ops = {
+	.release = release_cluster,
+	.show_attribute = show_cluster,
+	.store_attribute = store_cluster,
+};
+
+static struct configfs_group_operations spaces_ops = {
+	.make_group = make_space,
+	.drop_item = drop_space,
+};
+
+static struct configfs_item_operations space_ops = {
+	.release = release_space,
+};
+
+static struct configfs_group_operations comms_ops = {
+	.make_item = make_comm,
+	.drop_item = drop_comm,
+};
+
+static struct configfs_item_operations comm_ops = {
+	.release = release_comm,
+	.show_attribute = show_comm,
+	.store_attribute = store_comm,
+};
+
+static struct configfs_group_operations nodes_ops = {
+	.make_item = make_node,
+	.drop_item = drop_node,
+};
+
+static struct configfs_item_operations node_ops = {
+	.release = release_node,
+	.show_attribute = show_node,
+	.store_attribute = store_node,
+};
+
+static struct config_item_type clusters_type = {
+	.ct_group_ops = &clusters_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type cluster_type = {
+	.ct_item_ops = &cluster_ops,
+	.ct_attrs = cluster_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type spaces_type = {
+	.ct_group_ops = &spaces_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type space_type = {
+	.ct_item_ops = &space_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comms_type = {
+	.ct_group_ops = &comms_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comm_type = {
+	.ct_item_ops = &comm_ops,
+	.ct_attrs = comm_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type nodes_type = {
+	.ct_group_ops = &nodes_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type node_type = {
+	.ct_item_ops = &node_ops,
+	.ct_attrs = node_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
+		   NULL;
+}
+
+static struct dlm_space *config_item_to_space(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct dlm_space, group) :
+		   NULL;
+}
+
+static struct dlm_comm *config_item_to_comm(struct config_item *i)
+{
+	return i ? container_of(i, struct dlm_comm, item) : NULL;
+}
+
+static struct dlm_node *config_item_to_node(struct config_item *i)
+{
+	return i ? container_of(i, struct dlm_node, item) : NULL;
+}
+
+static struct config_group *make_cluster(struct config_group *g,
+					 const char *name)
+{
+	struct dlm_cluster *cl = NULL;
+	struct dlm_spaces *sps = NULL;
+	struct dlm_comms *cms = NULL;
+	void *gps = NULL;
+
+	cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
+	gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
+	sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
+	cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
+
+	if (!cl || !gps || !sps || !cms)
+		goto fail;
+
+	config_group_init_type_name(&cl->group, name, &cluster_type);
+	config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
+	config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
+
+	cl->group.default_groups = gps;
+	cl->group.default_groups[0] = &sps->ss_group;
+	cl->group.default_groups[1] = &cms->cs_group;
+	cl->group.default_groups[2] = NULL;
+
+	cl->cl_tcp_port = dlm_config.ci_tcp_port;
+	cl->cl_buffer_size = dlm_config.ci_buffer_size;
+	cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size;
+	cl->cl_recover_timer = dlm_config.ci_recover_timer;
+	cl->cl_toss_secs = dlm_config.ci_toss_secs;
+	cl->cl_scan_secs = dlm_config.ci_scan_secs;
+	cl->cl_log_debug = dlm_config.ci_log_debug;
+	cl->cl_protocol = dlm_config.ci_protocol;
+	cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
+	cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
+	cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
+	cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
+	memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
+	       DLM_LOCKSPACE_LEN);
+
+	space_list = &sps->ss_group;
+	comm_list = &cms->cs_group;
+	return &cl->group;
+
+ fail:
+	kfree(cl);
+	kfree(gps);
+	kfree(sps);
+	kfree(cms);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+	struct dlm_cluster *cl = config_item_to_cluster(i);
+	struct config_item *tmp;
+	int j;
+
+	for (j = 0; cl->group.default_groups[j]; j++) {
+		tmp = &cl->group.default_groups[j]->cg_item;
+		cl->group.default_groups[j] = NULL;
+		config_item_put(tmp);
+	}
+
+	space_list = NULL;
+	comm_list = NULL;
+
+	config_item_put(i);
+}
+
+static void release_cluster(struct config_item *i)
+{
+	struct dlm_cluster *cl = config_item_to_cluster(i);
+	kfree(cl->group.default_groups);
+	kfree(cl);
+}
+
+static struct config_group *make_space(struct config_group *g, const char *name)
+{
+	struct dlm_space *sp = NULL;
+	struct dlm_nodes *nds = NULL;
+	void *gps = NULL;
+
+	sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
+	gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
+	nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
+
+	if (!sp || !gps || !nds)
+		goto fail;
+
+	config_group_init_type_name(&sp->group, name, &space_type);
+	config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+
+	sp->group.default_groups = gps;
+	sp->group.default_groups[0] = &nds->ns_group;
+	sp->group.default_groups[1] = NULL;
+
+	INIT_LIST_HEAD(&sp->members);
+	mutex_init(&sp->members_lock);
+	sp->members_count = 0;
+	return &sp->group;
+
+ fail:
+	kfree(sp);
+	kfree(gps);
+	kfree(nds);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+	struct dlm_space *sp = config_item_to_space(i);
+	struct config_item *tmp;
+	int j;
+
+	/* assert list_empty(&sp->members) */
+
+	for (j = 0; sp->group.default_groups[j]; j++) {
+		tmp = &sp->group.default_groups[j]->cg_item;
+		sp->group.default_groups[j] = NULL;
+		config_item_put(tmp);
+	}
+
+	config_item_put(i);
+}
+
+static void release_space(struct config_item *i)
+{
+	struct dlm_space *sp = config_item_to_space(i);
+	kfree(sp->group.default_groups);
+	kfree(sp);
+}
+
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+	struct dlm_comm *cm;
+
+	cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
+	if (!cm)
+		return ERR_PTR(-ENOMEM);
+
+	config_item_init_type_name(&cm->item, name, &comm_type);
+
+	cm->seq = dlm_comm_count++;
+	if (!cm->seq)
+		cm->seq = dlm_comm_count++;
+
+	cm->nodeid = -1;
+	cm->local = 0;
+	cm->addr_count = 0;
+	return &cm->item;
+}
+
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+	struct dlm_comm *cm = config_item_to_comm(i);
+	if (local_comm == cm)
+		local_comm = NULL;
+	dlm_lowcomms_close(cm->nodeid);
+	while (cm->addr_count--)
+		kfree(cm->addr[cm->addr_count]);
+	config_item_put(i);
+}
+
+static void release_comm(struct config_item *i)
+{
+	struct dlm_comm *cm = config_item_to_comm(i);
+	kfree(cm);
+}
+
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd;
+
+	nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
+	if (!nd)
+		return ERR_PTR(-ENOMEM);
+
+	config_item_init_type_name(&nd->item, name, &node_type);
+	nd->nodeid = -1;
+	nd->weight = 1;  /* default weight of 1 if none is set */
+	nd->new = 1;     /* set to 0 once it's been read by dlm_nodeid_list() */
+
+	mutex_lock(&sp->members_lock);
+	list_add(&nd->list, &sp->members);
+	sp->members_count++;
+	mutex_unlock(&sp->members_lock);
+
+	return &nd->item;
+}
+
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd = config_item_to_node(i);
+
+	mutex_lock(&sp->members_lock);
+	list_del(&nd->list);
+	sp->members_count--;
+	mutex_unlock(&sp->members_lock);
+
+	config_item_put(i);
+}
+
+static void release_node(struct config_item *i)
+{
+	struct dlm_node *nd = config_item_to_node(i);
+	kfree(nd);
+}
+
+static struct dlm_clusters clusters_root = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "dlm",
+				.ci_type = &clusters_type,
+			},
+		},
+	},
+};
+
+int __init dlm_config_init(void)
+{
+	config_group_init(&clusters_root.subsys.su_group);
+	mutex_init(&clusters_root.subsys.su_mutex);
+	return configfs_register_subsystem(&clusters_root.subsys);
+}
+
+void dlm_config_exit(void)
+{
+	configfs_unregister_subsystem(&clusters_root.subsys);
+}
+
+/*
+ * Functions for user space to read/write attributes
+ */
+
+static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
+			    char *buf)
+{
+	struct dlm_cluster *cl = config_item_to_cluster(i);
+	struct cluster_attribute *cla =
+			container_of(a, struct cluster_attribute, attr);
+	return cla->show ? cla->show(cl, buf) : 0;
+}
+
+static ssize_t store_cluster(struct config_item *i,
+			     struct configfs_attribute *a,
+			     const char *buf, size_t len)
+{
+	struct dlm_cluster *cl = config_item_to_cluster(i);
+	struct cluster_attribute *cla =
+		container_of(a, struct cluster_attribute, attr);
+	return cla->store ? cla->store(cl, buf, len) : -EINVAL;
+}
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct dlm_comm *cm = config_item_to_comm(i);
+	struct comm_attribute *cma =
+			container_of(a, struct comm_attribute, attr);
+	return cma->show ? cma->show(cm, buf) : 0;
+}
+
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct dlm_comm *cm = config_item_to_comm(i);
+	struct comm_attribute *cma =
+		container_of(a, struct comm_attribute, attr);
+	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+}
+
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
+{
+	return sprintf(buf, "%d\n", cm->nodeid);
+}
+
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+				 size_t len)
+{
+	int rc = kstrtoint(buf, 0, &cm->nodeid);
+
+	if (rc)
+		return rc;
+	return len;
+}
+
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
+{
+	return sprintf(buf, "%d\n", cm->local);
+}
+
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+				size_t len)
+{
+	int rc = kstrtoint(buf, 0, &cm->local);
+
+	if (rc)
+		return rc;
+	if (cm->local && !local_comm)
+		local_comm = cm;
+	return len;
+}
+
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
+{
+	struct sockaddr_storage *addr;
+	int rv;
+
+	if (len != sizeof(struct sockaddr_storage))
+		return -EINVAL;
+
+	if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+		return -ENOSPC;
+
+	addr = kzalloc(sizeof(*addr), GFP_NOFS);
+	if (!addr)
+		return -ENOMEM;
+
+	memcpy(addr, buf, len);
+
+	rv = dlm_lowcomms_addr(cm->nodeid, addr, len);
+	if (rv) {
+		kfree(addr);
+		return rv;
+	}
+
+	cm->addr[cm->addr_count++] = addr;
+	return len;
+}
+
+static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
+{
+	ssize_t s;
+	ssize_t allowance;
+	int i;
+	struct sockaddr_storage *addr;
+	struct sockaddr_in *addr_in;
+	struct sockaddr_in6 *addr_in6;
+	
+	/* Taken from ip6_addr_string() defined in lib/vsprintf.c */
+	char buf0[sizeof("AF_INET6	xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255\n")];
+	
+
+	/* Derived from SIMPLE_ATTR_SIZE of fs/configfs/file.c */
+	allowance = 4096;
+	buf[0] = '\0';
+
+	for (i = 0; i < cm->addr_count; i++) {
+		addr = cm->addr[i];
+
+		switch(addr->ss_family) {
+		case AF_INET:
+			addr_in = (struct sockaddr_in *)addr;
+			s = sprintf(buf0, "AF_INET	%pI4\n", &addr_in->sin_addr.s_addr);
+			break;
+		case AF_INET6:
+			addr_in6 = (struct sockaddr_in6 *)addr;
+			s = sprintf(buf0, "AF_INET6	%pI6\n", &addr_in6->sin6_addr);
+			break;
+		default:
+			s = sprintf(buf0, "%s\n", "<UNKNOWN>");
+			break;
+		}
+		allowance -= s;
+		if (allowance >= 0)
+			strcat(buf, buf0);
+		else {
+			allowance += s;
+			break;
+		}
+	}
+	return 4096 - allowance;
+}
+
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct dlm_node *nd = config_item_to_node(i);
+	struct node_attribute *nda =
+			container_of(a, struct node_attribute, attr);
+	return nda->show ? nda->show(nd, buf) : 0;
+}
+
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct dlm_node *nd = config_item_to_node(i);
+	struct node_attribute *nda =
+		container_of(a, struct node_attribute, attr);
+	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
+}
+
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
+{
+	return sprintf(buf, "%d\n", nd->nodeid);
+}
+
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+				 size_t len)
+{
+	uint32_t seq = 0;
+	int rc = kstrtoint(buf, 0, &nd->nodeid);
+
+	if (rc)
+		return rc;
+	dlm_comm_seq(nd->nodeid, &seq);
+	nd->comm_seq = seq;
+	return len;
+}
+
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
+{
+	return sprintf(buf, "%d\n", nd->weight);
+}
+
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+				 size_t len)
+{
+	int rc = kstrtoint(buf, 0, &nd->weight);
+
+	if (rc)
+		return rc;
+	return len;
+}
+
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+
+static struct dlm_space *get_space(char *name)
+{
+	struct config_item *i;
+
+	if (!space_list)
+		return NULL;
+
+	mutex_lock(&space_list->cg_subsys->su_mutex);
+	i = config_group_find_item(space_list, name);
+	mutex_unlock(&space_list->cg_subsys->su_mutex);
+
+	return config_item_to_space(i);
+}
+
+static void put_space(struct dlm_space *sp)
+{
+	config_item_put(&sp->group.cg_item);
+}
+
+static struct dlm_comm *get_comm(int nodeid)
+{
+	struct config_item *i;
+	struct dlm_comm *cm = NULL;
+	int found = 0;
+
+	if (!comm_list)
+		return NULL;
+
+	mutex_lock(&clusters_root.subsys.su_mutex);
+
+	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+		cm = config_item_to_comm(i);
+
+		if (cm->nodeid != nodeid)
+			continue;
+		found = 1;
+		config_item_get(i);
+		break;
+	}
+	mutex_unlock(&clusters_root.subsys.su_mutex);
+
+	if (!found)
+		cm = NULL;
+	return cm;
+}
+
+static void put_comm(struct dlm_comm *cm)
+{
+	config_item_put(&cm->item);
+}
+
+/* caller must free mem */
+int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
+		     int *count_out)
+{
+	struct dlm_space *sp;
+	struct dlm_node *nd;
+	struct dlm_config_node *nodes, *node;
+	int rv, count;
+
+	sp = get_space(lsname);
+	if (!sp)
+		return -EEXIST;
+
+	mutex_lock(&sp->members_lock);
+	if (!sp->members_count) {
+		rv = -EINVAL;
+		printk(KERN_ERR "dlm: zero members_count\n");
+		goto out;
+	}
+
+	count = sp->members_count;
+
+	nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
+	if (!nodes) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	node = nodes;
+	list_for_each_entry(nd, &sp->members, list) {
+		node->nodeid = nd->nodeid;
+		node->weight = nd->weight;
+		node->new = nd->new;
+		node->comm_seq = nd->comm_seq;
+		node++;
+
+		nd->new = 0;
+	}
+
+	*count_out = count;
+	*nodes_out = nodes;
+	rv = 0;
+ out:
+	mutex_unlock(&sp->members_lock);
+	put_space(sp);
+	return rv;
+}
+
+int dlm_comm_seq(int nodeid, uint32_t *seq)
+{
+	struct dlm_comm *cm = get_comm(nodeid);
+	if (!cm)
+		return -EEXIST;
+	*seq = cm->seq;
+	put_comm(cm);
+	return 0;
+}
+
+int dlm_our_nodeid(void)
+{
+	return local_comm ? local_comm->nodeid : 0;
+}
+
+/* num 0 is first addr, num 1 is second addr */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+	if (!local_comm)
+		return -1;
+	if (num + 1 > local_comm->addr_count)
+		return -1;
+	memcpy(addr, local_comm->addr[num], sizeof(*addr));
+	return 0;
+}
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT       21064
+#define DEFAULT_BUFFER_SIZE     4096
+#define DEFAULT_RSBTBL_SIZE     1024
+#define DEFAULT_RECOVER_TIMER      5
+#define DEFAULT_TOSS_SECS         10
+#define DEFAULT_SCAN_SECS          5
+#define DEFAULT_LOG_DEBUG          0
+#define DEFAULT_PROTOCOL           0
+#define DEFAULT_TIMEWARN_CS      500 /* 5 sec = 500 centiseconds */
+#define DEFAULT_WAITWARN_US	   0
+#define DEFAULT_NEW_RSB_COUNT    128
+#define DEFAULT_RECOVER_CALLBACKS  0
+#define DEFAULT_CLUSTER_NAME      ""
+
+struct dlm_config_info dlm_config = {
+	.ci_tcp_port = DEFAULT_TCP_PORT,
+	.ci_buffer_size = DEFAULT_BUFFER_SIZE,
+	.ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+	.ci_recover_timer = DEFAULT_RECOVER_TIMER,
+	.ci_toss_secs = DEFAULT_TOSS_SECS,
+	.ci_scan_secs = DEFAULT_SCAN_SECS,
+	.ci_log_debug = DEFAULT_LOG_DEBUG,
+	.ci_protocol = DEFAULT_PROTOCOL,
+	.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
+	.ci_waitwarn_us = DEFAULT_WAITWARN_US,
+	.ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
+	.ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
+	.ci_cluster_name = DEFAULT_CLUSTER_NAME
+};
+
diff --git a/kernel/fs/dlm/config.h b/kernel/fs/dlm/config.h
new file mode 100644
index 000000000..f30697bc2
--- /dev/null
+++ b/kernel/fs/dlm/config.h
@@ -0,0 +1,53 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+struct dlm_config_node {
+	int nodeid;
+	int weight;
+	int new;
+	uint32_t comm_seq;
+};
+
+#define DLM_MAX_ADDR_COUNT 3
+
+struct dlm_config_info {
+	int ci_tcp_port;
+	int ci_buffer_size;
+	int ci_rsbtbl_size;
+	int ci_recover_timer;
+	int ci_toss_secs;
+	int ci_scan_secs;
+	int ci_log_debug;
+	int ci_protocol;
+	int ci_timewarn_cs;
+	int ci_waitwarn_us;
+	int ci_new_rsb_count;
+	int ci_recover_callbacks;
+	char ci_cluster_name[DLM_LOCKSPACE_LEN];
+};
+
+extern struct dlm_config_info dlm_config;
+
+int dlm_config_init(void);
+void dlm_config_exit(void);
+int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
+		     int *count_out);
+int dlm_comm_seq(int nodeid, uint32_t *seq);
+int dlm_our_nodeid(void);
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+
+#endif				/* __CONFIG_DOT_H__ */
+
diff --git a/kernel/fs/dlm/debug_fs.c b/kernel/fs/dlm/debug_fs.c
new file mode 100644
index 000000000..eea64912c
--- /dev/null
+++ b/kernel/fs/dlm/debug_fs.c
@@ -0,0 +1,791 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2009 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+#include "dlm_internal.h"
+#include "lock.h"
+
+#define DLM_DEBUG_BUF_LEN 4096
+static char debug_buf[DLM_DEBUG_BUF_LEN];
+static struct mutex debug_buf_lock;
+
+static struct dentry *dlm_root;
+
+static char *print_lockmode(int mode)
+{
+	switch (mode) {
+	case DLM_LOCK_IV:
+		return "--";
+	case DLM_LOCK_NL:
+		return "NL";
+	case DLM_LOCK_CR:
+		return "CR";
+	case DLM_LOCK_CW:
+		return "CW";
+	case DLM_LOCK_PR:
+		return "PR";
+	case DLM_LOCK_PW:
+		return "PW";
+	case DLM_LOCK_EX:
+		return "EX";
+	default:
+		return "??";
+	}
+}
+
+static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       struct dlm_rsb *res)
+{
+	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+	if (lkb->lkb_status == DLM_LKSTS_CONVERT ||
+	    lkb->lkb_status == DLM_LKSTS_WAITING)
+		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+	if (lkb->lkb_nodeid) {
+		if (lkb->lkb_nodeid != res->res_nodeid)
+			seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+				   lkb->lkb_remid);
+		else
+			seq_printf(s, " Master:     %08x", lkb->lkb_remid);
+	}
+
+	if (lkb->lkb_wait_type)
+		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+	seq_puts(s, "\n");
+}
+
+static void print_format1(struct dlm_rsb *res, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+	int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+
+	lock_rsb(res);
+
+	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+
+	for (i = 0; i < res->res_length; i++) {
+		if (isprint(res->res_name[i]))
+			seq_printf(s, "%c", res->res_name[i]);
+		else
+			seq_printf(s, "%c", '.');
+	}
+
+	if (res->res_nodeid > 0)
+		seq_printf(s, "\"\nLocal Copy, Master is node %d\n",
+			   res->res_nodeid);
+	else if (res->res_nodeid == 0)
+		seq_puts(s, "\"\nMaster Copy\n");
+	else if (res->res_nodeid == -1)
+		seq_printf(s, "\"\nLooking up master (lkid %x)\n",
+			   res->res_first_lkid);
+	else
+		seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid);
+	if (seq_has_overflowed(s))
+		goto out;
+
+	/* Print the LVB: */
+	if (res->res_lvbptr) {
+		seq_puts(s, "LVB: ");
+		for (i = 0; i < lvblen; i++) {
+			if (i == lvblen / 2)
+				seq_puts(s, "\n     ");
+			seq_printf(s, "%02x ",
+				   (unsigned char) res->res_lvbptr[i]);
+		}
+		if (rsb_flag(res, RSB_VALNOTVALID))
+			seq_puts(s, " (INVALID)");
+		seq_puts(s, "\n");
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	root_list = !list_empty(&res->res_root_list);
+	recover_list = !list_empty(&res->res_recover_list);
+
+	if (root_list || recover_list) {
+		seq_printf(s, "Recovery: root %d recover %d flags %lx count %d\n",
+			   root_list, recover_list,
+			   res->res_flags, res->res_recover_locks_count);
+	}
+
+	/* Print the locks attached to this resource */
+	seq_puts(s, "Granted Queue\n");
+	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) {
+		print_format1_lock(s, lkb, res);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	seq_puts(s, "Conversion Queue\n");
+	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) {
+		print_format1_lock(s, lkb, res);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	seq_puts(s, "Waiting Queue\n");
+	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) {
+		print_format1_lock(s, lkb, res);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	if (list_empty(&res->res_lookup))
+		goto out;
+
+	seq_puts(s, "Lookup Queue\n");
+	list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
+		seq_printf(s, "%08x %s",
+			   lkb->lkb_id, print_lockmode(lkb->lkb_rqmode));
+		if (lkb->lkb_wait_type)
+			seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+		seq_puts(s, "\n");
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+ out:
+	unlock_rsb(res);
+}
+
+static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       struct dlm_rsb *r)
+{
+	u64 xid = 0;
+	u64 us;
+
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		if (lkb->lkb_ua)
+			xid = lkb->lkb_ua->xid;
+	}
+
+	/* microseconds since lkb was added to current queue */
+	us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
+
+	/* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
+	   r_nodeid r_len r_name */
+
+	seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
+		   lkb->lkb_id,
+		   lkb->lkb_nodeid,
+		   lkb->lkb_remid,
+		   lkb->lkb_ownpid,
+		   (unsigned long long)xid,
+		   lkb->lkb_exflags,
+		   lkb->lkb_flags,
+		   lkb->lkb_status,
+		   lkb->lkb_grmode,
+		   lkb->lkb_rqmode,
+		   (unsigned long long)us,
+		   r->res_nodeid,
+		   r->res_length,
+		   r->res_name);
+}
+
+static void print_format2(struct dlm_rsb *r, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+
+	lock_rsb(r);
+
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		print_format2_lock(s, lkb, r);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		print_format2_lock(s, lkb, r);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
+		print_format2_lock(s, lkb, r);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+ out:
+	unlock_rsb(r);
+}
+
+static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			      int rsb_lookup)
+{
+	u64 xid = 0;
+
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		if (lkb->lkb_ua)
+			xid = lkb->lkb_ua->xid;
+	}
+
+	seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
+		   lkb->lkb_id,
+		   lkb->lkb_nodeid,
+		   lkb->lkb_remid,
+		   lkb->lkb_ownpid,
+		   (unsigned long long)xid,
+		   lkb->lkb_exflags,
+		   lkb->lkb_flags,
+		   lkb->lkb_status,
+		   lkb->lkb_grmode,
+		   lkb->lkb_rqmode,
+		   lkb->lkb_last_bast.mode,
+		   rsb_lookup,
+		   lkb->lkb_wait_type,
+		   lkb->lkb_lvbseq,
+		   (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+		   (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
+}
+
+static void print_format3(struct dlm_rsb *r, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+	int i, lvblen = r->res_ls->ls_lvblen;
+	int print_name = 1;
+
+	lock_rsb(r);
+
+	seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
+		   r,
+		   r->res_nodeid,
+		   r->res_first_lkid,
+		   r->res_flags,
+		   !list_empty(&r->res_root_list),
+		   !list_empty(&r->res_recover_list),
+		   r->res_recover_locks_count,
+		   r->res_length);
+	if (seq_has_overflowed(s))
+		goto out;
+
+	for (i = 0; i < r->res_length; i++) {
+		if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
+			print_name = 0;
+	}
+
+	seq_puts(s, print_name ? "str " : "hex");
+
+	for (i = 0; i < r->res_length; i++) {
+		if (print_name)
+			seq_printf(s, "%c", r->res_name[i]);
+		else
+			seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
+	}
+	seq_puts(s, "\n");
+	if (seq_has_overflowed(s))
+		goto out;
+
+	if (!r->res_lvbptr)
+		goto do_locks;
+
+	seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
+
+	for (i = 0; i < lvblen; i++)
+		seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
+	seq_puts(s, "\n");
+	if (seq_has_overflowed(s))
+		goto out;
+
+ do_locks:
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		print_format3_lock(s, lkb, 0);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		print_format3_lock(s, lkb, 0);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
+		print_format3_lock(s, lkb, 0);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) {
+		print_format3_lock(s, lkb, 1);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+ out:
+	unlock_rsb(r);
+}
+
+static void print_format4(struct dlm_rsb *r, struct seq_file *s)
+{
+	int our_nodeid = dlm_our_nodeid();
+	int print_name = 1;
+	int i;
+
+	lock_rsb(r);
+
+	seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ",
+		   r,
+		   r->res_nodeid,
+		   r->res_master_nodeid,
+		   r->res_dir_nodeid,
+		   our_nodeid,
+		   r->res_toss_time,
+		   r->res_flags,
+		   r->res_length);
+
+	for (i = 0; i < r->res_length; i++) {
+		if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
+			print_name = 0;
+	}
+
+	seq_puts(s, print_name ? "str " : "hex");
+
+	for (i = 0; i < r->res_length; i++) {
+		if (print_name)
+			seq_printf(s, "%c", r->res_name[i]);
+		else
+			seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
+	}
+	seq_puts(s, "\n");
+
+	unlock_rsb(r);
+}
+
+struct rsbtbl_iter {
+	struct dlm_rsb *rsb;
+	unsigned bucket;
+	int format;
+	int header;
+};
+
+/*
+ * If the buffer is full, seq_printf can be called again, but it
+ * does nothing.  So, the these printing routines periodically check
+ * seq_has_overflowed to avoid wasting too much time trying to print to
+ * a full buffer.
+ */
+
+static int table_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+	struct rsbtbl_iter *ri = iter_ptr;
+
+	switch (ri->format) {
+	case 1:
+		print_format1(ri->rsb, seq);
+		break;
+	case 2:
+		if (ri->header) {
+			seq_puts(seq, "id nodeid remid pid xid exflags flags sts grmode rqmode time_ms r_nodeid r_len r_name\n");
+			ri->header = 0;
+		}
+		print_format2(ri->rsb, seq);
+		break;
+	case 3:
+		if (ri->header) {
+			seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+			ri->header = 0;
+		}
+		print_format3(ri->rsb, seq);
+		break;
+	case 4:
+		if (ri->header) {
+			seq_puts(seq, "version 4 rsb 2\n");
+			ri->header = 0;
+		}
+		print_format4(ri->rsb, seq);
+		break;
+	}
+
+	return 0;
+}
+
+static const struct seq_operations format1_seq_ops;
+static const struct seq_operations format2_seq_ops;
+static const struct seq_operations format3_seq_ops;
+static const struct seq_operations format4_seq_ops;
+
+static void *table_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct rb_root *tree;
+	struct rb_node *node;
+	struct dlm_ls *ls = seq->private;
+	struct rsbtbl_iter *ri;
+	struct dlm_rsb *r;
+	loff_t n = *pos;
+	unsigned bucket, entry;
+	int toss = (seq->op == &format4_seq_ops);
+
+	bucket = n >> 32;
+	entry = n & ((1LL << 32) - 1);
+
+	if (bucket >= ls->ls_rsbtbl_size)
+		return NULL;
+
+	ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
+	if (!ri)
+		return NULL;
+	if (n == 0)
+		ri->header = 1;
+	if (seq->op == &format1_seq_ops)
+		ri->format = 1;
+	if (seq->op == &format2_seq_ops)
+		ri->format = 2;
+	if (seq->op == &format3_seq_ops)
+		ri->format = 3;
+	if (seq->op == &format4_seq_ops)
+		ri->format = 4;
+
+	tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
+
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
+	if (!RB_EMPTY_ROOT(tree)) {
+		for (node = rb_first(tree); node; node = rb_next(node)) {
+			r = rb_entry(node, struct dlm_rsb, res_hashnode);
+			if (!entry--) {
+				dlm_hold_rsb(r);
+				ri->rsb = r;
+				ri->bucket = bucket;
+				spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+				return ri;
+			}
+		}
+	}
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+
+	/*
+	 * move to the first rsb in the next non-empty bucket
+	 */
+
+	/* zero the entry */
+	n &= ~((1LL << 32) - 1);
+
+	while (1) {
+		bucket++;
+		n += 1LL << 32;
+
+		if (bucket >= ls->ls_rsbtbl_size) {
+			kfree(ri);
+			return NULL;
+		}
+		tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
+
+		spin_lock(&ls->ls_rsbtbl[bucket].lock);
+		if (!RB_EMPTY_ROOT(tree)) {
+			node = rb_first(tree);
+			r = rb_entry(node, struct dlm_rsb, res_hashnode);
+			dlm_hold_rsb(r);
+			ri->rsb = r;
+			ri->bucket = bucket;
+			spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+			*pos = n;
+			return ri;
+		}
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+	}
+}
+
+static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
+{
+	struct dlm_ls *ls = seq->private;
+	struct rsbtbl_iter *ri = iter_ptr;
+	struct rb_root *tree;
+	struct rb_node *next;
+	struct dlm_rsb *r, *rp;
+	loff_t n = *pos;
+	unsigned bucket;
+	int toss = (seq->op == &format4_seq_ops);
+
+	bucket = n >> 32;
+
+	/*
+	 * move to the next rsb in the same bucket
+	 */
+
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
+	rp = ri->rsb;
+	next = rb_next(&rp->res_hashnode);
+
+	if (next) {
+		r = rb_entry(next, struct dlm_rsb, res_hashnode);
+		dlm_hold_rsb(r);
+		ri->rsb = r;
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+		dlm_put_rsb(rp);
+		++*pos;
+		return ri;
+	}
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+	dlm_put_rsb(rp);
+
+	/*
+	 * move to the first rsb in the next non-empty bucket
+	 */
+
+	/* zero the entry */
+	n &= ~((1LL << 32) - 1);
+
+	while (1) {
+		bucket++;
+		n += 1LL << 32;
+
+		if (bucket >= ls->ls_rsbtbl_size) {
+			kfree(ri);
+			return NULL;
+		}
+		tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
+
+		spin_lock(&ls->ls_rsbtbl[bucket].lock);
+		if (!RB_EMPTY_ROOT(tree)) {
+			next = rb_first(tree);
+			r = rb_entry(next, struct dlm_rsb, res_hashnode);
+			dlm_hold_rsb(r);
+			ri->rsb = r;
+			ri->bucket = bucket;
+			spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+			*pos = n;
+			return ri;
+		}
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+	}
+}
+
+static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
+{
+	struct rsbtbl_iter *ri = iter_ptr;
+
+	if (ri) {
+		dlm_put_rsb(ri->rsb);
+		kfree(ri);
+	}
+}
+
+static const struct seq_operations format1_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
+};
+
+static const struct seq_operations format2_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
+};
+
+static const struct seq_operations format3_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
+};
+
+static const struct seq_operations format4_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
+};
+
+static const struct file_operations format1_fops;
+static const struct file_operations format2_fops;
+static const struct file_operations format3_fops;
+static const struct file_operations format4_fops;
+
+static int table_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret = -1;
+
+	if (file->f_op == &format1_fops)
+		ret = seq_open(file, &format1_seq_ops);
+	else if (file->f_op == &format2_fops)
+		ret = seq_open(file, &format2_seq_ops);
+	else if (file->f_op == &format3_fops)
+		ret = seq_open(file, &format3_seq_ops);
+	else if (file->f_op == &format4_fops)
+		ret = seq_open(file, &format4_seq_ops);
+
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private; /* the dlm_ls */
+	return 0;
+}
+
+static const struct file_operations format1_fops = {
+	.owner   = THIS_MODULE,
+	.open    = table_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static const struct file_operations format2_fops = {
+	.owner   = THIS_MODULE,
+	.open    = table_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static const struct file_operations format3_fops = {
+	.owner   = THIS_MODULE,
+	.open    = table_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static const struct file_operations format4_fops = {
+	.owner   = THIS_MODULE,
+	.open    = table_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+/*
+ * dump lkb's on the ls_waiters list
+ */
+static ssize_t waiters_read(struct file *file, char __user *userbuf,
+			    size_t count, loff_t *ppos)
+{
+	struct dlm_ls *ls = file->private_data;
+	struct dlm_lkb *lkb;
+	size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
+
+	mutex_lock(&debug_buf_lock);
+	mutex_lock(&ls->ls_waiters_mutex);
+	memset(debug_buf, 0, sizeof(debug_buf));
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
+			       lkb->lkb_id, lkb->lkb_wait_type,
+			       lkb->lkb_nodeid, lkb->lkb_resource->res_name);
+		if (ret >= len - pos)
+			break;
+		pos += ret;
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
+	mutex_unlock(&debug_buf_lock);
+	return rv;
+}
+
+static const struct file_operations waiters_fops = {
+	.owner   = THIS_MODULE,
+	.open    = simple_open,
+	.read    = waiters_read,
+	.llseek  = default_llseek,
+};
+
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+	debugfs_remove(ls->ls_debug_rsb_dentry);
+	debugfs_remove(ls->ls_debug_waiters_dentry);
+	debugfs_remove(ls->ls_debug_locks_dentry);
+	debugfs_remove(ls->ls_debug_all_dentry);
+	debugfs_remove(ls->ls_debug_toss_dentry);
+}
+
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+	char name[DLM_LOCKSPACE_LEN+8];
+
+	/* format 1 */
+
+	ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &format1_fops);
+	if (!ls->ls_debug_rsb_dentry)
+		goto fail;
+
+	/* format 2 */
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
+
+	ls->ls_debug_locks_dentry = debugfs_create_file(name,
+							S_IFREG | S_IRUGO,
+							dlm_root,
+							ls,
+							&format2_fops);
+	if (!ls->ls_debug_locks_dentry)
+		goto fail;
+
+	/* format 3 */
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
+
+	ls->ls_debug_all_dentry = debugfs_create_file(name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &format3_fops);
+	if (!ls->ls_debug_all_dentry)
+		goto fail;
+
+	/* format 4 */
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_toss", ls->ls_name);
+
+	ls->ls_debug_toss_dentry = debugfs_create_file(name,
+						       S_IFREG | S_IRUGO,
+						       dlm_root,
+						       ls,
+						       &format4_fops);
+	if (!ls->ls_debug_toss_dentry)
+		goto fail;
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+
+	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+							  S_IFREG | S_IRUGO,
+							  dlm_root,
+							  ls,
+							  &waiters_fops);
+	if (!ls->ls_debug_waiters_dentry)
+		goto fail;
+
+	return 0;
+
+ fail:
+	dlm_delete_debug_file(ls);
+	return -ENOMEM;
+}
+
+int __init dlm_register_debugfs(void)
+{
+	mutex_init(&debug_buf_lock);
+	dlm_root = debugfs_create_dir("dlm", NULL);
+	return dlm_root ? 0 : -ENOMEM;
+}
+
+void dlm_unregister_debugfs(void)
+{
+	debugfs_remove(dlm_root);
+}
+
diff --git a/kernel/fs/dlm/dir.c b/kernel/fs/dlm/dir.c
new file mode 100644
index 000000000..d975851a7
--- /dev/null
+++ b/kernel/fs/dlm/dir.c
@@ -0,0 +1,308 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
+ * num_nodes to the hash value.  This value in the desired range is used as an
+ * offset into the sorted list of nodeid's to give the particular nodeid.
+ */
+
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
+{
+	uint32_t node;
+
+	if (ls->ls_num_nodes == 1)
+		return dlm_our_nodeid();
+	else {
+		node = (hash >> 16) % ls->ls_total_weight;
+		return ls->ls_node_array[node];
+	}
+}
+
+int dlm_dir_nodeid(struct dlm_rsb *r)
+{
+	return r->res_dir_nodeid;
+}
+
+void dlm_recover_dir_nodeid(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		r->res_dir_nodeid = dlm_hash2nodeid(ls, r->res_hash);
+	}
+	up_read(&ls->ls_root_sem);
+}
+
+int dlm_recover_directory(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	char *b, *last_name = NULL;
+	int error = -ENOMEM, last_len, nodeid, result;
+	uint16_t namelen;
+	unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0;
+
+	log_rinfo(ls, "dlm_recover_directory");
+
+	if (dlm_no_directory(ls))
+		goto out_status;
+
+	last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
+	if (!last_name)
+		goto out;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->nodeid == dlm_our_nodeid())
+			continue;
+
+		memset(last_name, 0, DLM_RESNAME_MAXLEN);
+		last_len = 0;
+
+		for (;;) {
+			int left;
+			error = dlm_recovery_stopped(ls);
+			if (error)
+				goto out_free;
+
+			error = dlm_rcom_names(ls, memb->nodeid,
+					       last_name, last_len);
+			if (error)
+				goto out_free;
+
+			cond_resched();
+
+			/*
+			 * pick namelen/name pairs out of received buffer
+			 */
+
+			b = ls->ls_recover_buf->rc_buf;
+			left = ls->ls_recover_buf->rc_header.h_length;
+			left -= sizeof(struct dlm_rcom);
+
+			for (;;) {
+				__be16 v;
+
+				error = -EINVAL;
+				if (left < sizeof(__be16))
+					goto out_free;
+
+				memcpy(&v, b, sizeof(__be16));
+				namelen = be16_to_cpu(v);
+				b += sizeof(__be16);
+				left -= sizeof(__be16);
+
+				/* namelen of 0xFFFFF marks end of names for
+				   this node; namelen of 0 marks end of the
+				   buffer */
+
+				if (namelen == 0xFFFF)
+					goto done;
+				if (!namelen)
+					break;
+
+				if (namelen > left)
+					goto out_free;
+
+				if (namelen > DLM_RESNAME_MAXLEN)
+					goto out_free;
+
+				error = dlm_master_lookup(ls, memb->nodeid,
+							  b, namelen,
+							  DLM_LU_RECOVER_DIR,
+							  &nodeid, &result);
+				if (error) {
+					log_error(ls, "recover_dir lookup %d",
+						  error);
+					goto out_free;
+				}
+
+				/* The name was found in rsbtbl, but the
+				 * master nodeid is different from
+				 * memb->nodeid which says it is the master.
+				 * This should not happen. */
+
+				if (result == DLM_LU_MATCH &&
+				    nodeid != memb->nodeid) {
+					count_bad++;
+					log_error(ls, "recover_dir lookup %d "
+						  "nodeid %d memb %d bad %u",
+						  result, nodeid, memb->nodeid,
+						  count_bad);
+					print_hex_dump_bytes("dlm_recover_dir ",
+							     DUMP_PREFIX_NONE,
+							     b, namelen);
+				}
+
+				/* The name was found in rsbtbl, and the
+				 * master nodeid matches memb->nodeid. */
+
+				if (result == DLM_LU_MATCH &&
+				    nodeid == memb->nodeid) {
+					count_match++;
+				}
+
+				/* The name was not found in rsbtbl and was
+				 * added with memb->nodeid as the master. */
+
+				if (result == DLM_LU_ADD) {
+					count_add++;
+				}
+
+				last_len = namelen;
+				memcpy(last_name, b, namelen);
+				b += namelen;
+				left -= namelen;
+				count++;
+			}
+		}
+	 done:
+		;
+	}
+
+ out_status:
+	error = 0;
+	dlm_set_recover_status(ls, DLM_RS_DIR);
+
+	log_rinfo(ls, "dlm_recover_directory %u in %u new",
+		  count, count_add);
+ out_free:
+	kfree(last_name);
+ out:
+	return error;
+}
+
+static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
+{
+	struct dlm_rsb *r;
+	uint32_t hash, bucket;
+	int rv;
+
+	hash = jhash(name, len, 0);
+	bucket = hash & (ls->ls_rsbtbl_size - 1);
+
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
+	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, &r);
+	if (rv)
+		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].toss,
+					 name, len, &r);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+
+	if (!rv)
+		return r;
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (len == r->res_length && !memcmp(name, r->res_name, len)) {
+			up_read(&ls->ls_root_sem);
+			log_debug(ls, "find_rsb_root revert to root_list %s",
+				  r->res_name);
+			return r;
+		}
+	}
+	up_read(&ls->ls_root_sem);
+	return NULL;
+}
+
+/* Find the rsb where we left off (or start again), then send rsb names
+   for rsb's we're master of and whose directory node matches the requesting
+   node.  inbuf is the rsb name last sent, inlen is the name's length */
+
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+ 			   char *outbuf, int outlen, int nodeid)
+{
+	struct list_head *list;
+	struct dlm_rsb *r;
+	int offset = 0, dir_nodeid;
+	__be16 be_namelen;
+
+	down_read(&ls->ls_root_sem);
+
+	if (inlen > 1) {
+		r = find_rsb_root(ls, inbuf, inlen);
+		if (!r) {
+			inbuf[inlen - 1] = '\0';
+			log_error(ls, "copy_master_names from %d start %d %s",
+				  nodeid, inlen, inbuf);
+			goto out;
+		}
+		list = r->res_root_list.next;
+	} else {
+		list = ls->ls_root_list.next;
+	}
+
+	for (offset = 0; list != &ls->ls_root_list; list = list->next) {
+		r = list_entry(list, struct dlm_rsb, res_root_list);
+		if (r->res_nodeid)
+			continue;
+
+		dir_nodeid = dlm_dir_nodeid(r);
+		if (dir_nodeid != nodeid)
+			continue;
+
+		/*
+		 * The block ends when we can't fit the following in the
+		 * remaining buffer space:
+		 * namelen (uint16_t) +
+		 * name (r->res_length) +
+		 * end-of-block record 0x0000 (uint16_t)
+		 */
+
+		if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
+			/* Write end-of-block record */
+			be_namelen = cpu_to_be16(0);
+			memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+			offset += sizeof(__be16);
+			ls->ls_recover_dir_sent_msg++;
+			goto out;
+		}
+
+		be_namelen = cpu_to_be16(r->res_length);
+		memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+		offset += sizeof(__be16);
+		memcpy(outbuf + offset, r->res_name, r->res_length);
+		offset += r->res_length;
+		ls->ls_recover_dir_sent_res++;
+	}
+
+	/*
+	 * If we've reached the end of the list (and there's room) write a
+	 * terminating record.
+	 */
+
+	if ((list == &ls->ls_root_list) &&
+	    (offset + sizeof(uint16_t) <= outlen)) {
+		be_namelen = cpu_to_be16(0xFFFF);
+		memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+		offset += sizeof(__be16);
+		ls->ls_recover_dir_sent_msg++;
+	}
+ out:
+	up_read(&ls->ls_root_sem);
+}
+
diff --git a/kernel/fs/dlm/dir.h b/kernel/fs/dlm/dir.h
new file mode 100644
index 000000000..417506344
--- /dev/null
+++ b/kernel/fs/dlm/dir.h
@@ -0,0 +1,25 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+int dlm_dir_nodeid(struct dlm_rsb *rsb);
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
+void dlm_recover_dir_nodeid(struct dlm_ls *ls);
+int dlm_recover_directory(struct dlm_ls *ls);
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+	char *outbuf, int outlen, int nodeid);
+
+#endif				/* __DIR_DOT_H__ */
+
diff --git a/kernel/fs/dlm/dlm_internal.h b/kernel/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000..5eff6ea3e
--- /dev/null
+++ b/kernel/fs/dlm/dlm_internal.h
@@ -0,0 +1,729 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DLM_INTERNAL_DOT_H__
+#define __DLM_INTERNAL_DOT_H__
+
+/*
+ * This is the main header file to be included in each DLM source file.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/idr.h>
+#include <linux/ratelimit.h>
+#include <asm/uaccess.h>
+
+#include <linux/dlm.h>
+#include "config.h"
+
+/* Size of the temp buffer midcomms allocates on the stack.
+   We try to make this large enough so most messages fit.
+   FIXME: should sctp make this unnecessary? */
+
+#define DLM_INBUF_LEN		148
+
+struct dlm_ls;
+struct dlm_lkb;
+struct dlm_rsb;
+struct dlm_member;
+struct dlm_rsbtable;
+struct dlm_recover;
+struct dlm_header;
+struct dlm_message;
+struct dlm_rcom;
+struct dlm_mhandle;
+
+#define log_print(fmt, args...) \
+	printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+#define log_error(ls, fmt, args...) \
+	printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+#define log_rinfo(ls, fmt, args...) \
+	printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args);
+
+#define log_debug(ls, fmt, args...) \
+do { \
+	if (dlm_config.ci_log_debug) \
+		printk(KERN_DEBUG "dlm: %s: " fmt "\n", \
+		       (ls)->ls_name , ##args); \
+} while (0)
+
+#define log_limit(ls, fmt, args...) \
+do { \
+	if (dlm_config.ci_log_debug) \
+		printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \
+			(ls)->ls_name , ##args); \
+} while (0)
+
+#define DLM_ASSERT(x, do) \
+{ \
+  if (!(x)) \
+  { \
+    printk(KERN_ERR "\nDLM:  Assertion failed on line %d of file %s\n" \
+               "DLM:  assertion:  \"%s\"\n" \
+               "DLM:  time = %lu\n", \
+               __LINE__, __FILE__, #x, jiffies); \
+    {do} \
+    printk("\n"); \
+    BUG(); \
+    panic("DLM:  Record message above and reboot.\n"); \
+  } \
+}
+
+
+#define DLM_RTF_SHRINK		0x00000001
+
+struct dlm_rsbtable {
+	struct rb_root		keep;
+	struct rb_root		toss;
+	spinlock_t		lock;
+	uint32_t		flags;
+};
+
+
+/*
+ * Lockspace member (per node in a ls)
+ */
+
+struct dlm_member {
+	struct list_head	list;
+	int			nodeid;
+	int			weight;
+	int			slot;
+	int			slot_prev;
+	int			comm_seq;
+	uint32_t		generation;
+};
+
+/*
+ * Save and manage recovery state for a lockspace.
+ */
+
+struct dlm_recover {
+	struct list_head	list;
+	struct dlm_config_node	*nodes;
+	int			nodes_count;
+	uint64_t		seq;
+};
+
+/*
+ * Pass input args to second stage locking function.
+ */
+
+struct dlm_args {
+	uint32_t		flags;
+	void			(*astfn) (void *astparam);
+	void			*astparam;
+	void			(*bastfn) (void *astparam, int mode);
+	int			mode;
+	struct dlm_lksb		*lksb;
+	unsigned long		timeout;
+};
+
+
+/*
+ * Lock block
+ *
+ * A lock can be one of three types:
+ *
+ * local copy      lock is mastered locally
+ *                 (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
+ * process copy    lock is mastered on a remote node
+ *                 (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
+ * master copy     master node's copy of a lock owned by remote node
+ *                 (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
+ *
+ * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
+ * dlm_unlock.  The dlm does not modify these or use any private flags in
+ * this field; it only contains DLM_LKF_ flags from dlm.h.  These flags
+ * are sent as-is to the remote master when the lock is remote.
+ *
+ * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
+ * Some internal flags are shared between the master and process nodes;
+ * these shared flags are kept in the lower two bytes.  One of these
+ * flags set on the master copy will be propagated to the process copy
+ * and v.v.  Other internal flags are private to the master or process
+ * node (e.g. DLM_IFL_MSTCPY).  These are kept in the high two bytes.
+ *
+ * lkb_sbflags: status block flags.  These flags are copied directly into
+ * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
+ * ast.  All defined in dlm.h with DLM_SBF_ prefix.
+ *
+ * lkb_status: the lock status indicates which rsb queue the lock is
+ * on, grant, convert, or wait.  DLM_LKSTS_ WAITING/GRANTED/CONVERT
+ *
+ * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
+ * reply is needed.  Only set when the lkb is on the lockspace waiters
+ * list awaiting a reply from a remote node.
+ *
+ * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
+ * is a master copy, nodeid specifies the remote lock holder, when the
+ * lkb is a process copy, the nodeid specifies the lock master.
+ */
+
+/* lkb_status */
+
+#define DLM_LKSTS_WAITING	1
+#define DLM_LKSTS_GRANTED	2
+#define DLM_LKSTS_CONVERT	3
+
+/* lkb_flags */
+
+#define DLM_IFL_MSTCPY		0x00010000
+#define DLM_IFL_RESEND		0x00020000
+#define DLM_IFL_DEAD		0x00040000
+#define DLM_IFL_OVERLAP_UNLOCK  0x00080000
+#define DLM_IFL_OVERLAP_CANCEL  0x00100000
+#define DLM_IFL_ENDOFLIFE	0x00200000
+#define DLM_IFL_WATCH_TIMEWARN	0x00400000
+#define DLM_IFL_TIMEOUT_CANCEL	0x00800000
+#define DLM_IFL_DEADLOCK_CANCEL	0x01000000
+#define DLM_IFL_STUB_MS		0x02000000 /* magic number for m_flags */
+#define DLM_IFL_USER		0x00000001
+#define DLM_IFL_ORPHAN		0x00000002
+
+#define DLM_CALLBACKS_SIZE	6
+
+#define DLM_CB_CAST		0x00000001
+#define DLM_CB_BAST		0x00000002
+#define DLM_CB_SKIP		0x00000004
+
+struct dlm_callback {
+	uint64_t		seq;
+	uint32_t		flags;		/* DLM_CBF_ */
+	int			sb_status;	/* copy to lksb status */
+	uint8_t			sb_flags;	/* copy to lksb flags */
+	int8_t			mode; /* rq mode of bast, gr mode of cast */
+};
+
+struct dlm_lkb {
+	struct dlm_rsb		*lkb_resource;	/* the rsb */
+	struct kref		lkb_ref;
+	int			lkb_nodeid;	/* copied from rsb */
+	int			lkb_ownpid;	/* pid of lock owner */
+	uint32_t		lkb_id;		/* our lock ID */
+	uint32_t		lkb_remid;	/* lock ID on remote partner */
+	uint32_t		lkb_exflags;	/* external flags from caller */
+	uint32_t		lkb_sbflags;	/* lksb flags */
+	uint32_t		lkb_flags;	/* internal flags */
+	uint32_t		lkb_lvbseq;	/* lvb sequence number */
+
+	int8_t			lkb_status;     /* granted, waiting, convert */
+	int8_t			lkb_rqmode;	/* requested lock mode */
+	int8_t			lkb_grmode;	/* granted lock mode */
+	int8_t			lkb_highbast;	/* highest mode bast sent for */
+
+	int8_t			lkb_wait_type;	/* type of reply waiting for */
+	int8_t			lkb_wait_count;
+	int			lkb_wait_nodeid; /* for debugging */
+
+	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
+	struct list_head	lkb_rsb_lookup;	/* waiting for rsb lookup */
+	struct list_head	lkb_wait_reply;	/* waiting for remote reply */
+	struct list_head	lkb_ownqueue;	/* list of locks for a process */
+	struct list_head	lkb_time_list;
+	ktime_t			lkb_timestamp;
+	ktime_t			lkb_wait_time;
+	unsigned long		lkb_timeout_cs;
+
+	struct mutex		lkb_cb_mutex;
+	struct work_struct	lkb_cb_work;
+	struct list_head	lkb_cb_list; /* for ls_cb_delay or proc->asts */
+	struct dlm_callback	lkb_callbacks[DLM_CALLBACKS_SIZE];
+	struct dlm_callback	lkb_last_cast;
+	struct dlm_callback	lkb_last_bast;
+	ktime_t			lkb_last_cast_time;	/* for debugging */
+	ktime_t			lkb_last_bast_time;	/* for debugging */
+
+	uint64_t		lkb_recover_seq; /* from ls_recover_seq */
+
+	char			*lkb_lvbptr;
+	struct dlm_lksb		*lkb_lksb;      /* caller's status block */
+	void			(*lkb_astfn) (void *astparam);
+	void			(*lkb_bastfn) (void *astparam, int mode);
+	union {
+		void			*lkb_astparam;	/* caller's ast arg */
+		struct dlm_user_args	*lkb_ua;
+	};
+};
+
+/*
+ * res_master_nodeid is "normal": 0 is unset/invalid, non-zero is the real
+ * nodeid, even when nodeid is our_nodeid.
+ *
+ * res_nodeid is "odd": -1 is unset/invalid, zero means our_nodeid,
+ * greater than zero when another nodeid.
+ *
+ * (TODO: remove res_nodeid and only use res_master_nodeid)
+ */
+
+struct dlm_rsb {
+	struct dlm_ls		*res_ls;	/* the lockspace */
+	struct kref		res_ref;
+	struct mutex		res_mutex;
+	unsigned long		res_flags;
+	int			res_length;	/* length of rsb name */
+	int			res_nodeid;
+	int			res_master_nodeid;
+	int			res_dir_nodeid;
+	int			res_id;		/* for ls_recover_idr */
+	uint32_t                res_lvbseq;
+	uint32_t		res_hash;
+	uint32_t		res_bucket;	/* rsbtbl */
+	unsigned long		res_toss_time;
+	uint32_t		res_first_lkid;
+	struct list_head	res_lookup;	/* lkbs waiting on first */
+	union {
+		struct list_head	res_hashchain;
+		struct rb_node		res_hashnode;	/* rsbtbl */
+	};
+	struct list_head	res_grantqueue;
+	struct list_head	res_convertqueue;
+	struct list_head	res_waitqueue;
+
+	struct list_head	res_root_list;	    /* used for recovery */
+	struct list_head	res_recover_list;   /* used for recovery */
+	int			res_recover_locks_count;
+
+	char			*res_lvbptr;
+	char			res_name[DLM_RESNAME_MAXLEN+1];
+};
+
+/* dlm_master_lookup() flags */
+
+#define DLM_LU_RECOVER_DIR	1
+#define DLM_LU_RECOVER_MASTER	2
+
+/* dlm_master_lookup() results */
+
+#define DLM_LU_MATCH		1
+#define DLM_LU_ADD		2
+
+/* find_rsb() flags */
+
+#define R_REQUEST		0x00000001
+#define R_RECEIVE_REQUEST	0x00000002
+#define R_RECEIVE_RECOVER	0x00000004
+
+/* rsb_flags */
+
+enum rsb_flags {
+	RSB_MASTER_UNCERTAIN,
+	RSB_VALNOTVALID,
+	RSB_VALNOTVALID_PREV,
+	RSB_NEW_MASTER,
+	RSB_NEW_MASTER2,
+	RSB_RECOVER_CONVERT,
+	RSB_RECOVER_GRANT,
+	RSB_RECOVER_LVB_INVAL,
+};
+
+static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	__set_bit(flag, &r->res_flags);
+}
+
+static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	__clear_bit(flag, &r->res_flags);
+}
+
+static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	return test_bit(flag, &r->res_flags);
+}
+
+
+/* dlm_header is first element of all structs sent between nodes */
+
+#define DLM_HEADER_MAJOR	0x00030000
+#define DLM_HEADER_MINOR	0x00000001
+
+#define DLM_HEADER_SLOTS	0x00000001
+
+#define DLM_MSG			1
+#define DLM_RCOM		2
+
+struct dlm_header {
+	uint32_t		h_version;
+	uint32_t		h_lockspace;
+	uint32_t		h_nodeid;	/* nodeid of sender */
+	uint16_t		h_length;
+	uint8_t			h_cmd;		/* DLM_MSG, DLM_RCOM */
+	uint8_t			h_pad;
+};
+
+
+#define DLM_MSG_REQUEST		1
+#define DLM_MSG_CONVERT		2
+#define DLM_MSG_UNLOCK		3
+#define DLM_MSG_CANCEL		4
+#define DLM_MSG_REQUEST_REPLY	5
+#define DLM_MSG_CONVERT_REPLY	6
+#define DLM_MSG_UNLOCK_REPLY	7
+#define DLM_MSG_CANCEL_REPLY	8
+#define DLM_MSG_GRANT		9
+#define DLM_MSG_BAST		10
+#define DLM_MSG_LOOKUP		11
+#define DLM_MSG_REMOVE		12
+#define DLM_MSG_LOOKUP_REPLY	13
+#define DLM_MSG_PURGE		14
+
+struct dlm_message {
+	struct dlm_header	m_header;
+	uint32_t		m_type;		/* DLM_MSG_ */
+	uint32_t		m_nodeid;
+	uint32_t		m_pid;
+	uint32_t		m_lkid;		/* lkid on sender */
+	uint32_t		m_remid;	/* lkid on receiver */
+	uint32_t		m_parent_lkid;
+	uint32_t		m_parent_remid;
+	uint32_t		m_exflags;
+	uint32_t		m_sbflags;
+	uint32_t		m_flags;
+	uint32_t		m_lvbseq;
+	uint32_t		m_hash;
+	int			m_status;
+	int			m_grmode;
+	int			m_rqmode;
+	int			m_bastmode;
+	int			m_asts;
+	int			m_result;	/* 0 or -EXXX */
+	char			m_extra[0];	/* name or lvb */
+};
+
+
+#define DLM_RS_NODES		0x00000001
+#define DLM_RS_NODES_ALL	0x00000002
+#define DLM_RS_DIR		0x00000004
+#define DLM_RS_DIR_ALL		0x00000008
+#define DLM_RS_LOCKS		0x00000010
+#define DLM_RS_LOCKS_ALL	0x00000020
+#define DLM_RS_DONE		0x00000040
+#define DLM_RS_DONE_ALL		0x00000080
+
+#define DLM_RCOM_STATUS		1
+#define DLM_RCOM_NAMES		2
+#define DLM_RCOM_LOOKUP		3
+#define DLM_RCOM_LOCK		4
+#define DLM_RCOM_STATUS_REPLY	5
+#define DLM_RCOM_NAMES_REPLY	6
+#define DLM_RCOM_LOOKUP_REPLY	7
+#define DLM_RCOM_LOCK_REPLY	8
+
+struct dlm_rcom {
+	struct dlm_header	rc_header;
+	uint32_t		rc_type;	/* DLM_RCOM_ */
+	int			rc_result;	/* multi-purpose */
+	uint64_t		rc_id;		/* match reply with request */
+	uint64_t		rc_seq;		/* sender's ls_recover_seq */
+	uint64_t		rc_seq_reply;	/* remote ls_recover_seq */
+	char			rc_buf[0];
+};
+
+union dlm_packet {
+	struct dlm_header	header;		/* common to other two */
+	struct dlm_message	message;
+	struct dlm_rcom		rcom;
+};
+
+#define DLM_RSF_NEED_SLOTS	0x00000001
+
+/* RCOM_STATUS data */
+struct rcom_status {
+	__le32			rs_flags;
+	__le32			rs_unused1;
+	__le64			rs_unused2;
+};
+
+/* RCOM_STATUS_REPLY data */
+struct rcom_config {
+	__le32			rf_lvblen;
+	__le32			rf_lsflags;
+
+	/* DLM_HEADER_SLOTS adds: */
+	__le32			rf_flags;
+	__le16			rf_our_slot;
+	__le16			rf_num_slots;
+	__le32			rf_generation;
+	__le32			rf_unused1;
+	__le64			rf_unused2;
+};
+
+struct rcom_slot {
+	__le32			ro_nodeid;
+	__le16			ro_slot;
+	__le16			ro_unused1;
+	__le64			ro_unused2;
+};
+
+struct rcom_lock {
+	__le32			rl_ownpid;
+	__le32			rl_lkid;
+	__le32			rl_remid;
+	__le32			rl_parent_lkid;
+	__le32			rl_parent_remid;
+	__le32			rl_exflags;
+	__le32			rl_flags;
+	__le32			rl_lvbseq;
+	__le32			rl_result;
+	int8_t			rl_rqmode;
+	int8_t			rl_grmode;
+	int8_t			rl_status;
+	int8_t			rl_asts;
+	__le16			rl_wait_type;
+	__le16			rl_namelen;
+	char			rl_name[DLM_RESNAME_MAXLEN];
+	char			rl_lvb[0];
+};
+
+/*
+ * The max number of resources per rsbtbl bucket that shrink will attempt
+ * to remove in each iteration.
+ */
+
+#define DLM_REMOVE_NAMES_MAX 8
+
+struct dlm_ls {
+	struct list_head	ls_list;	/* list of lockspaces */
+	dlm_lockspace_t		*ls_local_handle;
+	uint32_t		ls_global_id;	/* global unique lockspace ID */
+	uint32_t		ls_generation;
+	uint32_t		ls_exflags;
+	int			ls_lvblen;
+	int			ls_count;	/* refcount of processes in
+						   the dlm using this ls */
+	int			ls_create_count; /* create/release refcount */
+	unsigned long		ls_flags;	/* LSFL_ */
+	unsigned long		ls_scan_time;
+	struct kobject		ls_kobj;
+
+	struct idr		ls_lkbidr;
+	spinlock_t		ls_lkbidr_spin;
+
+	struct dlm_rsbtable	*ls_rsbtbl;
+	uint32_t		ls_rsbtbl_size;
+
+	struct mutex		ls_waiters_mutex;
+	struct list_head	ls_waiters;	/* lkbs needing a reply */
+
+	struct mutex		ls_orphans_mutex;
+	struct list_head	ls_orphans;
+
+	struct mutex		ls_timeout_mutex;
+	struct list_head	ls_timeout;
+
+	spinlock_t		ls_new_rsb_spin;
+	int			ls_new_rsb_count;
+	struct list_head	ls_new_rsb;	/* new rsb structs */
+
+	spinlock_t		ls_remove_spin;
+	char			ls_remove_name[DLM_RESNAME_MAXLEN+1];
+	char			*ls_remove_names[DLM_REMOVE_NAMES_MAX];
+	int			ls_remove_len;
+	int			ls_remove_lens[DLM_REMOVE_NAMES_MAX];
+
+	struct list_head	ls_nodes;	/* current nodes in ls */
+	struct list_head	ls_nodes_gone;	/* dead node list, recovery */
+	int			ls_num_nodes;	/* number of nodes in ls */
+	int			ls_low_nodeid;
+	int			ls_total_weight;
+	int			*ls_node_array;
+
+	int			ls_slot;
+	int			ls_num_slots;
+	int			ls_slots_size;
+	struct dlm_slot		*ls_slots;
+
+	struct dlm_rsb		ls_stub_rsb;	/* for returning errors */
+	struct dlm_lkb		ls_stub_lkb;	/* for returning errors */
+	struct dlm_message	ls_stub_ms;	/* for faking a reply */
+
+	struct dentry		*ls_debug_rsb_dentry; /* debugfs */
+	struct dentry		*ls_debug_waiters_dentry; /* debugfs */
+	struct dentry		*ls_debug_locks_dentry; /* debugfs */
+	struct dentry		*ls_debug_all_dentry; /* debugfs */
+	struct dentry		*ls_debug_toss_dentry; /* debugfs */
+
+	wait_queue_head_t	ls_uevent_wait;	/* user part of join/leave */
+	int			ls_uevent_result;
+	struct completion	ls_members_done;
+	int			ls_members_result;
+
+	struct miscdevice       ls_device;
+
+	struct workqueue_struct	*ls_callback_wq;
+
+	/* recovery related */
+
+	struct mutex		ls_cb_mutex;
+	struct list_head	ls_cb_delay; /* save for queue_work later */
+	struct timer_list	ls_timer;
+	struct task_struct	*ls_recoverd_task;
+	struct mutex		ls_recoverd_active;
+	spinlock_t		ls_recover_lock;
+	unsigned long		ls_recover_begin; /* jiffies timestamp */
+	uint32_t		ls_recover_status; /* DLM_RS_ */
+	uint64_t		ls_recover_seq;
+	struct dlm_recover	*ls_recover_args;
+	struct rw_semaphore	ls_in_recovery;	/* block local requests */
+	struct rw_semaphore	ls_recv_active;	/* block dlm_recv */
+	struct list_head	ls_requestqueue;/* queue remote requests */
+	struct mutex		ls_requestqueue_mutex;
+	struct dlm_rcom		*ls_recover_buf;
+	int			ls_recover_nodeid; /* for debugging */
+	unsigned int		ls_recover_dir_sent_res; /* for log info */
+	unsigned int		ls_recover_dir_sent_msg; /* for log info */
+	unsigned int		ls_recover_locks_in; /* for log info */
+	uint64_t		ls_rcom_seq;
+	spinlock_t		ls_rcom_spin;
+	struct list_head	ls_recover_list;
+	spinlock_t		ls_recover_list_lock;
+	int			ls_recover_list_count;
+	struct idr		ls_recover_idr;
+	spinlock_t		ls_recover_idr_lock;
+	wait_queue_head_t	ls_wait_general;
+	wait_queue_head_t	ls_recover_lock_wait;
+	struct mutex		ls_clear_proc_locks;
+
+	struct list_head	ls_root_list;	/* root resources */
+	struct rw_semaphore	ls_root_sem;	/* protect root_list */
+
+	const struct dlm_lockspace_ops *ls_ops;
+	void			*ls_ops_arg;
+
+	int			ls_namelen;
+	char			ls_name[1];
+};
+
+/*
+ * LSFL_RECOVER_STOP - dlm_ls_stop() sets this to tell dlm recovery routines
+ * that they should abort what they're doing so new recovery can be started.
+ *
+ * LSFL_RECOVER_DOWN - dlm_ls_stop() sets this to tell dlm_recoverd that it
+ * should do down_write() on the in_recovery rw_semaphore. (doing down_write
+ * within dlm_ls_stop causes complaints about the lock acquired/released
+ * in different contexts.)
+ *
+ * LSFL_RECOVER_LOCK - dlm_recoverd holds the in_recovery rw_semaphore.
+ * It sets this after it is done with down_write() on the in_recovery
+ * rw_semaphore and clears it after it has released the rw_semaphore.
+ *
+ * LSFL_RECOVER_WORK - dlm_ls_start() sets this to tell dlm_recoverd that it
+ * should begin recovery of the lockspace.
+ *
+ * LSFL_RUNNING - set when normal locking activity is enabled.
+ * dlm_ls_stop() clears this to tell dlm locking routines that they should
+ * quit what they are doing so recovery can run.  dlm_recoverd sets
+ * this after recovery is finished.
+ */
+
+#define LSFL_RECOVER_STOP	0
+#define LSFL_RECOVER_DOWN	1
+#define LSFL_RECOVER_LOCK	2
+#define LSFL_RECOVER_WORK	3
+#define LSFL_RUNNING		4
+
+#define LSFL_RCOM_READY		5
+#define LSFL_RCOM_WAIT		6
+#define LSFL_UEVENT_WAIT	7
+#define LSFL_TIMEWARN		8
+#define LSFL_CB_DELAY		9
+#define LSFL_NODIR		10
+
+/* much of this is just saving user space pointers associated with the
+   lock that we pass back to the user lib with an ast */
+
+struct dlm_user_args {
+	struct dlm_user_proc	*proc; /* each process that opens the lockspace
+					  device has private data
+					  (dlm_user_proc) on the struct file,
+					  the process's locks point back to it*/
+	struct dlm_lksb		lksb;
+	struct dlm_lksb __user	*user_lksb;
+	void __user		*castparam;
+	void __user		*castaddr;
+	void __user		*bastparam;
+	void __user		*bastaddr;
+	uint64_t		xid;
+};
+
+#define DLM_PROC_FLAGS_CLOSING 1
+#define DLM_PROC_FLAGS_COMPAT  2
+
+/* locks list is kept so we can remove all a process's locks when it
+   exits (or orphan those that are persistent) */
+
+struct dlm_user_proc {
+	dlm_lockspace_t		*lockspace;
+	unsigned long		flags; /* DLM_PROC_FLAGS */
+	struct list_head	asts;
+	spinlock_t		asts_spin;
+	struct list_head	locks;
+	spinlock_t		locks_spin;
+	struct list_head	unlocking;
+	wait_queue_head_t	wait;
+};
+
+static inline int dlm_locking_stopped(struct dlm_ls *ls)
+{
+	return !test_bit(LSFL_RUNNING, &ls->ls_flags);
+}
+
+static inline int dlm_recovery_stopped(struct dlm_ls *ls)
+{
+	return test_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
+}
+
+static inline int dlm_no_directory(struct dlm_ls *ls)
+{
+	return test_bit(LSFL_NODIR, &ls->ls_flags);
+}
+
+int dlm_netlink_init(void);
+void dlm_netlink_exit(void);
+void dlm_timeout_warn(struct dlm_lkb *lkb);
+int dlm_plock_init(void);
+void dlm_plock_exit(void);
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+
+#endif				/* __DLM_INTERNAL_DOT_H__ */
+
diff --git a/kernel/fs/dlm/lock.c b/kernel/fs/dlm/lock.c
new file mode 100644
index 000000000..35502d404
--- /dev/null
+++ b/kernel/fs/dlm/lock.c
@@ -0,0 +1,6304 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* Central locking logic has four stages:
+
+   dlm_lock()
+   dlm_unlock()
+
+   request_lock(ls, lkb)
+   convert_lock(ls, lkb)
+   unlock_lock(ls, lkb)
+   cancel_lock(ls, lkb)
+
+   _request_lock(r, lkb)
+   _convert_lock(r, lkb)
+   _unlock_lock(r, lkb)
+   _cancel_lock(r, lkb)
+
+   do_request(r, lkb)
+   do_convert(r, lkb)
+   do_unlock(r, lkb)
+   do_cancel(r, lkb)
+
+   Stage 1 (lock, unlock) is mainly about checking input args and
+   splitting into one of the four main operations:
+
+       dlm_lock          = request_lock
+       dlm_lock+CONVERT  = convert_lock
+       dlm_unlock        = unlock_lock
+       dlm_unlock+CANCEL = cancel_lock
+
+   Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
+   provided to the next stage.
+
+   Stage 3, _xxxx_lock(), determines if the operation is local or remote.
+   When remote, it calls send_xxxx(), when local it calls do_xxxx().
+
+   Stage 4, do_xxxx(), is the guts of the operation.  It manipulates the
+   given rsb and lkb and queues callbacks.
+
+   For remote operations, send_xxxx() results in the corresponding do_xxxx()
+   function being executed on the remote node.  The connecting send/receive
+   calls on local (L) and remote (R) nodes:
+
+   L: send_xxxx()              ->  R: receive_xxxx()
+                                   R: do_xxxx()
+   L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
+*/
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include "dlm_internal.h"
+#include <linux/dlm_device.h>
+#include "memory.h"
+#include "lowcomms.h"
+#include "requestqueue.h"
+#include "util.h"
+#include "dir.h"
+#include "member.h"
+#include "lockspace.h"
+#include "ast.h"
+#include "lock.h"
+#include "rcom.h"
+#include "recover.h"
+#include "lvb_table.h"
+#include "user.h"
+#include "config.h"
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_remove(struct dlm_rsb *r);
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+				    struct dlm_message *ms);
+static int receive_extralen(struct dlm_message *ms);
+static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
+static void del_timeout(struct dlm_lkb *lkb);
+static void toss_rsb(struct kref *kref);
+
+/*
+ * Lock compatibilty matrix - thanks Steve
+ * UN = Unlocked state. Not really a state, used as a flag
+ * PD = Padding. Used to make the matrix a nice power of two in size
+ * Other states are the same as the VMS DLM.
+ * Usage: matrix[grmode+1][rqmode+1]  (although m[rq+1][gr+1] is the same)
+ */
+
+static const int __dlm_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
+        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
+        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
+        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
+        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
+        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+
+/*
+ * This defines the direction of transfer of LVB data.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ * 1 = LVB is returned to the caller
+ * 0 = LVB is written to the resource
+ * -1 = nothing happens to the LVB
+ */
+
+const int dlm_lvb_operations[8][8] = {
+        /* UN   NL  CR  CW  PR  PW  EX  PD*/
+        {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
+        {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
+        {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
+        {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
+        {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
+        {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
+        {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
+        {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
+};
+
+#define modes_compat(gr, rq) \
+	__dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
+
+int dlm_modes_compat(int mode1, int mode2)
+{
+	return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
+}
+
+/*
+ * Compatibility matrix for conversions with QUECVT set.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ */
+
+static const int __quecvt_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
+        {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
+        {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
+        {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
+        {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
+        {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
+        {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+
+void dlm_print_lkb(struct dlm_lkb *lkb)
+{
+	printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x "
+	       "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n",
+	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
+	       lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
+	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid,
+	       (unsigned long long)lkb->lkb_recover_seq);
+}
+
+static void dlm_print_rsb(struct dlm_rsb *r)
+{
+	printk(KERN_ERR "rsb: nodeid %d master %d dir %d flags %lx first %x "
+	       "rlc %d name %s\n",
+	       r->res_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
+	       r->res_flags, r->res_first_lkid, r->res_recover_locks_count,
+	       r->res_name);
+}
+
+void dlm_dump_rsb(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb;
+
+	dlm_print_rsb(r);
+
+	printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
+	       list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
+	printk(KERN_ERR "rsb lookup list\n");
+	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
+		dlm_print_lkb(lkb);
+	printk(KERN_ERR "rsb grant queue:\n");
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+		dlm_print_lkb(lkb);
+	printk(KERN_ERR "rsb convert queue:\n");
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+		dlm_print_lkb(lkb);
+	printk(KERN_ERR "rsb wait queue:\n");
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+		dlm_print_lkb(lkb);
+}
+
+/* Threads cannot use the lockspace while it's being recovered */
+
+static inline void dlm_lock_recovery(struct dlm_ls *ls)
+{
+	down_read(&ls->ls_in_recovery);
+}
+
+void dlm_unlock_recovery(struct dlm_ls *ls)
+{
+	up_read(&ls->ls_in_recovery);
+}
+
+int dlm_lock_recovery_try(struct dlm_ls *ls)
+{
+	return down_read_trylock(&ls->ls_in_recovery);
+}
+
+static inline int can_be_queued(struct dlm_lkb *lkb)
+{
+	return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
+}
+
+static inline int force_blocking_asts(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
+}
+
+static inline int is_demoted(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
+}
+
+static inline int is_altmode(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_sbflags & DLM_SBF_ALTMODE);
+}
+
+static inline int is_granted(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_status == DLM_LKSTS_GRANTED);
+}
+
+static inline int is_remote(struct dlm_rsb *r)
+{
+	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
+	return !!r->res_nodeid;
+}
+
+static inline int is_process_copy(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
+}
+
+static inline int is_master_copy(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
+}
+
+static inline int middle_conversion(struct dlm_lkb *lkb)
+{
+	if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
+	    (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
+		return 1;
+	return 0;
+}
+
+static inline int down_conversion(struct dlm_lkb *lkb)
+{
+	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
+}
+
+static inline int is_overlap_unlock(struct dlm_lkb *lkb)
+{
+	return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK;
+}
+
+static inline int is_overlap_cancel(struct dlm_lkb *lkb)
+{
+	return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL;
+}
+
+static inline int is_overlap(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK |
+				  DLM_IFL_OVERLAP_CANCEL));
+}
+
+static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	if (is_master_copy(lkb))
+		return;
+
+	del_timeout(lkb);
+
+	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+
+	/* if the operation was a cancel, then return -DLM_ECANCEL, if a
+	   timeout caused the cancel then return -ETIMEDOUT */
+	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
+		lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
+		rv = -ETIMEDOUT;
+	}
+
+	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
+		lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
+		rv = -EDEADLK;
+	}
+
+	dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags);
+}
+
+static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	queue_cast(r, lkb,
+		   is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL);
+}
+
+static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
+{
+	if (is_master_copy(lkb)) {
+		send_bast(r, lkb, rqmode);
+	} else {
+		dlm_add_cb(lkb, DLM_CB_BAST, rqmode, 0, 0);
+	}
+}
+
+/*
+ * Basic operations on rsb's and lkb's
+ */
+
+/* This is only called to add a reference when the code already holds
+   a valid reference to the rsb, so there's no need for locking. */
+
+static inline void hold_rsb(struct dlm_rsb *r)
+{
+	kref_get(&r->res_ref);
+}
+
+void dlm_hold_rsb(struct dlm_rsb *r)
+{
+	hold_rsb(r);
+}
+
+/* When all references to the rsb are gone it's transferred to
+   the tossed list for later disposal. */
+
+static void put_rsb(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	uint32_t bucket = r->res_bucket;
+
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
+	kref_put(&r->res_ref, toss_rsb);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+}
+
+void dlm_put_rsb(struct dlm_rsb *r)
+{
+	put_rsb(r);
+}
+
+static int pre_rsb_struct(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r1, *r2;
+	int count = 0;
+
+	spin_lock(&ls->ls_new_rsb_spin);
+	if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) {
+		spin_unlock(&ls->ls_new_rsb_spin);
+		return 0;
+	}
+	spin_unlock(&ls->ls_new_rsb_spin);
+
+	r1 = dlm_allocate_rsb(ls);
+	r2 = dlm_allocate_rsb(ls);
+
+	spin_lock(&ls->ls_new_rsb_spin);
+	if (r1) {
+		list_add(&r1->res_hashchain, &ls->ls_new_rsb);
+		ls->ls_new_rsb_count++;
+	}
+	if (r2) {
+		list_add(&r2->res_hashchain, &ls->ls_new_rsb);
+		ls->ls_new_rsb_count++;
+	}
+	count = ls->ls_new_rsb_count;
+	spin_unlock(&ls->ls_new_rsb_spin);
+
+	if (!count)
+		return -ENOMEM;
+	return 0;
+}
+
+/* If ls->ls_new_rsb is empty, return -EAGAIN, so the caller can
+   unlock any spinlocks, go back and call pre_rsb_struct again.
+   Otherwise, take an rsb off the list and return it. */
+
+static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
+			  struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r;
+	int count;
+
+	spin_lock(&ls->ls_new_rsb_spin);
+	if (list_empty(&ls->ls_new_rsb)) {
+		count = ls->ls_new_rsb_count;
+		spin_unlock(&ls->ls_new_rsb_spin);
+		log_debug(ls, "find_rsb retry %d %d %s",
+			  count, dlm_config.ci_new_rsb_count, name);
+		return -EAGAIN;
+	}
+
+	r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
+	list_del(&r->res_hashchain);
+	/* Convert the empty list_head to a NULL rb_node for tree usage: */
+	memset(&r->res_hashnode, 0, sizeof(struct rb_node));
+	ls->ls_new_rsb_count--;
+	spin_unlock(&ls->ls_new_rsb_spin);
+
+	r->res_ls = ls;
+	r->res_length = len;
+	memcpy(r->res_name, name, len);
+	mutex_init(&r->res_mutex);
+
+	INIT_LIST_HEAD(&r->res_lookup);
+	INIT_LIST_HEAD(&r->res_grantqueue);
+	INIT_LIST_HEAD(&r->res_convertqueue);
+	INIT_LIST_HEAD(&r->res_waitqueue);
+	INIT_LIST_HEAD(&r->res_root_list);
+	INIT_LIST_HEAD(&r->res_recover_list);
+
+	*r_ret = r;
+	return 0;
+}
+
+static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
+{
+	char maxname[DLM_RESNAME_MAXLEN];
+
+	memset(maxname, 0, DLM_RESNAME_MAXLEN);
+	memcpy(maxname, name, nlen);
+	return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
+}
+
+int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+			struct dlm_rsb **r_ret)
+{
+	struct rb_node *node = tree->rb_node;
+	struct dlm_rsb *r;
+	int rc;
+
+	while (node) {
+		r = rb_entry(node, struct dlm_rsb, res_hashnode);
+		rc = rsb_cmp(r, name, len);
+		if (rc < 0)
+			node = node->rb_left;
+		else if (rc > 0)
+			node = node->rb_right;
+		else
+			goto found;
+	}
+	*r_ret = NULL;
+	return -EBADR;
+
+ found:
+	*r_ret = r;
+	return 0;
+}
+
+static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
+{
+	struct rb_node **newn = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	int rc;
+
+	while (*newn) {
+		struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb,
+					       res_hashnode);
+
+		parent = *newn;
+		rc = rsb_cmp(cur, rsb->res_name, rsb->res_length);
+		if (rc < 0)
+			newn = &parent->rb_left;
+		else if (rc > 0)
+			newn = &parent->rb_right;
+		else {
+			log_print("rsb_insert match");
+			dlm_dump_rsb(rsb);
+			dlm_dump_rsb(cur);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&rsb->res_hashnode, parent, newn);
+	rb_insert_color(&rsb->res_hashnode, tree);
+	return 0;
+}
+
+/*
+ * Find rsb in rsbtbl and potentially create/add one
+ *
+ * Delaying the release of rsb's has a similar benefit to applications keeping
+ * NL locks on an rsb, but without the guarantee that the cached master value
+ * will still be valid when the rsb is reused.  Apps aren't always smart enough
+ * to keep NL locks on an rsb that they may lock again shortly; this can lead
+ * to excessive master lookups and removals if we don't delay the release.
+ *
+ * Searching for an rsb means looking through both the normal list and toss
+ * list.  When found on the toss list the rsb is moved to the normal list with
+ * ref count of 1; when found on normal list the ref count is incremented.
+ *
+ * rsb's on the keep list are being used locally and refcounted.
+ * rsb's on the toss list are not being used locally, and are not refcounted.
+ *
+ * The toss list rsb's were either
+ * - previously used locally but not any more (were on keep list, then
+ *   moved to toss list when last refcount dropped)
+ * - created and put on toss list as a directory record for a lookup
+ *   (we are the dir node for the res, but are not using the res right now,
+ *   but some other node is)
+ *
+ * The purpose of find_rsb() is to return a refcounted rsb for local use.
+ * So, if the given rsb is on the toss list, it is moved to the keep list
+ * before being returned.
+ *
+ * toss_rsb() happens when all local usage of the rsb is done, i.e. no
+ * more refcounts exist, so the rsb is moved from the keep list to the
+ * toss list.
+ *
+ * rsb's on both keep and toss lists are used for doing a name to master
+ * lookups.  rsb's that are in use locally (and being refcounted) are on
+ * the keep list, rsb's that are not in use locally (not refcounted) and
+ * only exist for name/master lookups are on the toss list.
+ *
+ * rsb's on the toss list who's dir_nodeid is not local can have stale
+ * name/master mappings.  So, remote requests on such rsb's can potentially
+ * return with an error, which means the mapping is stale and needs to
+ * be updated with a new lookup.  (The idea behind MASTER UNCERTAIN and
+ * first_lkid is to keep only a single outstanding request on an rsb
+ * while that rsb has a potentially stale master.)
+ */
+
+static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
+			uint32_t hash, uint32_t b,
+			int dir_nodeid, int from_nodeid,
+			unsigned int flags, struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r = NULL;
+	int our_nodeid = dlm_our_nodeid();
+	int from_local = 0;
+	int from_other = 0;
+	int from_dir = 0;
+	int create = 0;
+	int error;
+
+	if (flags & R_RECEIVE_REQUEST) {
+		if (from_nodeid == dir_nodeid)
+			from_dir = 1;
+		else
+			from_other = 1;
+	} else if (flags & R_REQUEST) {
+		from_local = 1;
+	}
+
+	/*
+	 * flags & R_RECEIVE_RECOVER is from dlm_recover_master_copy, so
+	 * from_nodeid has sent us a lock in dlm_recover_locks, believing
+	 * we're the new master.  Our local recovery may not have set
+	 * res_master_nodeid to our_nodeid yet, so allow either.  Don't
+	 * create the rsb; dlm_recover_process_copy() will handle EBADR
+	 * by resending.
+	 *
+	 * If someone sends us a request, we are the dir node, and we do
+	 * not find the rsb anywhere, then recreate it.  This happens if
+	 * someone sends us a request after we have removed/freed an rsb
+	 * from our toss list.  (They sent a request instead of lookup
+	 * because they are using an rsb from their toss list.)
+	 */
+
+	if (from_local || from_dir ||
+	    (from_other && (dir_nodeid == our_nodeid))) {
+		create = 1;
+	}
+
+ retry:
+	if (create) {
+		error = pre_rsb_struct(ls);
+		if (error < 0)
+			goto out;
+	}
+
+	spin_lock(&ls->ls_rsbtbl[b].lock);
+
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+	if (error)
+		goto do_toss;
+	
+	/*
+	 * rsb is active, so we can't check master_nodeid without lock_rsb.
+	 */
+
+	kref_get(&r->res_ref);
+	error = 0;
+	goto out_unlock;
+
+
+ do_toss:
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+	if (error)
+		goto do_new;
+
+	/*
+	 * rsb found inactive (master_nodeid may be out of date unless
+	 * we are the dir_nodeid or were the master)  No other thread
+	 * is using this rsb because it's on the toss list, so we can
+	 * look at or update res_master_nodeid without lock_rsb.
+	 */
+
+	if ((r->res_master_nodeid != our_nodeid) && from_other) {
+		/* our rsb was not master, and another node (not the dir node)
+		   has sent us a request */
+		log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s",
+			  from_nodeid, r->res_master_nodeid, dir_nodeid,
+			  r->res_name);
+		error = -ENOTBLK;
+		goto out_unlock;
+	}
+
+	if ((r->res_master_nodeid != our_nodeid) && from_dir) {
+		/* don't think this should ever happen */
+		log_error(ls, "find_rsb toss from_dir %d master %d",
+			  from_nodeid, r->res_master_nodeid);
+		dlm_print_rsb(r);
+		/* fix it and go on */
+		r->res_master_nodeid = our_nodeid;
+		r->res_nodeid = 0;
+		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = 0;
+	}
+
+	if (from_local && (r->res_master_nodeid != our_nodeid)) {
+		/* Because we have held no locks on this rsb,
+		   res_master_nodeid could have become stale. */
+		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = 0;
+	}
+
+	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+	goto out_unlock;
+
+
+ do_new:
+	/*
+	 * rsb not found
+	 */
+
+	if (error == -EBADR && !create)
+		goto out_unlock;
+
+	error = get_rsb_struct(ls, name, len, &r);
+	if (error == -EAGAIN) {
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		goto retry;
+	}
+	if (error)
+		goto out_unlock;
+
+	r->res_hash = hash;
+	r->res_bucket = b;
+	r->res_dir_nodeid = dir_nodeid;
+	kref_init(&r->res_ref);
+
+	if (from_dir) {
+		/* want to see how often this happens */
+		log_debug(ls, "find_rsb new from_dir %d recreate %s",
+			  from_nodeid, r->res_name);
+		r->res_master_nodeid = our_nodeid;
+		r->res_nodeid = 0;
+		goto out_add;
+	}
+
+	if (from_other && (dir_nodeid != our_nodeid)) {
+		/* should never happen */
+		log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
+			  from_nodeid, dir_nodeid, our_nodeid, r->res_name);
+		dlm_free_rsb(r);
+		r = NULL;
+		error = -ENOTBLK;
+		goto out_unlock;
+	}
+
+	if (from_other) {
+		log_debug(ls, "find_rsb new from_other %d dir %d %s",
+			  from_nodeid, dir_nodeid, r->res_name);
+	}
+
+	if (dir_nodeid == our_nodeid) {
+		/* When we are the dir nodeid, we can set the master
+		   node immediately */
+		r->res_master_nodeid = our_nodeid;
+		r->res_nodeid = 0;
+	} else {
+		/* set_master will send_lookup to dir_nodeid */
+		r->res_master_nodeid = 0;
+		r->res_nodeid = -1;
+	}
+
+ out_add:
+	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+ out_unlock:
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
+ out:
+	*r_ret = r;
+	return error;
+}
+
+/* During recovery, other nodes can send us new MSTCPY locks (from
+   dlm_recover_locks) before we've made ourself master (in
+   dlm_recover_masters). */
+
+static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
+			  uint32_t hash, uint32_t b,
+			  int dir_nodeid, int from_nodeid,
+			  unsigned int flags, struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r = NULL;
+	int our_nodeid = dlm_our_nodeid();
+	int recover = (flags & R_RECEIVE_RECOVER);
+	int error;
+
+ retry:
+	error = pre_rsb_struct(ls);
+	if (error < 0)
+		goto out;
+
+	spin_lock(&ls->ls_rsbtbl[b].lock);
+
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+	if (error)
+		goto do_toss;
+
+	/*
+	 * rsb is active, so we can't check master_nodeid without lock_rsb.
+	 */
+
+	kref_get(&r->res_ref);
+	goto out_unlock;
+
+
+ do_toss:
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+	if (error)
+		goto do_new;
+
+	/*
+	 * rsb found inactive. No other thread is using this rsb because
+	 * it's on the toss list, so we can look at or update
+	 * res_master_nodeid without lock_rsb.
+	 */
+
+	if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) {
+		/* our rsb is not master, and another node has sent us a
+		   request; this should never happen */
+		log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d",
+			  from_nodeid, r->res_master_nodeid, dir_nodeid);
+		dlm_print_rsb(r);
+		error = -ENOTBLK;
+		goto out_unlock;
+	}
+
+	if (!recover && (r->res_master_nodeid != our_nodeid) &&
+	    (dir_nodeid == our_nodeid)) {
+		/* our rsb is not master, and we are dir; may as well fix it;
+		   this should never happen */
+		log_error(ls, "find_rsb toss our %d master %d dir %d",
+			  our_nodeid, r->res_master_nodeid, dir_nodeid);
+		dlm_print_rsb(r);
+		r->res_master_nodeid = our_nodeid;
+		r->res_nodeid = 0;
+	}
+
+	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+	goto out_unlock;
+
+
+ do_new:
+	/*
+	 * rsb not found
+	 */
+
+	error = get_rsb_struct(ls, name, len, &r);
+	if (error == -EAGAIN) {
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		goto retry;
+	}
+	if (error)
+		goto out_unlock;
+
+	r->res_hash = hash;
+	r->res_bucket = b;
+	r->res_dir_nodeid = dir_nodeid;
+	r->res_master_nodeid = dir_nodeid;
+	r->res_nodeid = (dir_nodeid == our_nodeid) ? 0 : dir_nodeid;
+	kref_init(&r->res_ref);
+
+	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+ out_unlock:
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
+ out:
+	*r_ret = r;
+	return error;
+}
+
+static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid,
+		    unsigned int flags, struct dlm_rsb **r_ret)
+{
+	uint32_t hash, b;
+	int dir_nodeid;
+
+	if (len > DLM_RESNAME_MAXLEN)
+		return -EINVAL;
+
+	hash = jhash(name, len, 0);
+	b = hash & (ls->ls_rsbtbl_size - 1);
+
+	dir_nodeid = dlm_hash2nodeid(ls, hash);
+
+	if (dlm_no_directory(ls))
+		return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid,
+				      from_nodeid, flags, r_ret);
+	else
+		return find_rsb_dir(ls, name, len, hash, b, dir_nodeid,
+				      from_nodeid, flags, r_ret);
+}
+
+/* we have received a request and found that res_master_nodeid != our_nodeid,
+   so we need to return an error or make ourself the master */
+
+static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
+				  int from_nodeid)
+{
+	if (dlm_no_directory(ls)) {
+		log_error(ls, "find_rsb keep from_nodeid %d master %d dir %d",
+			  from_nodeid, r->res_master_nodeid,
+			  r->res_dir_nodeid);
+		dlm_print_rsb(r);
+		return -ENOTBLK;
+	}
+
+	if (from_nodeid != r->res_dir_nodeid) {
+		/* our rsb is not master, and another node (not the dir node)
+	   	   has sent us a request.  this is much more common when our
+	   	   master_nodeid is zero, so limit debug to non-zero.  */
+
+		if (r->res_master_nodeid) {
+			log_debug(ls, "validate master from_other %d master %d "
+				  "dir %d first %x %s", from_nodeid,
+				  r->res_master_nodeid, r->res_dir_nodeid,
+				  r->res_first_lkid, r->res_name);
+		}
+		return -ENOTBLK;
+	} else {
+		/* our rsb is not master, but the dir nodeid has sent us a
+	   	   request; this could happen with master 0 / res_nodeid -1 */
+
+		if (r->res_master_nodeid) {
+			log_error(ls, "validate master from_dir %d master %d "
+				  "first %x %s",
+				  from_nodeid, r->res_master_nodeid,
+				  r->res_first_lkid, r->res_name);
+		}
+
+		r->res_master_nodeid = dlm_our_nodeid();
+		r->res_nodeid = 0;
+		return 0;
+	}
+}
+
+/*
+ * We're the dir node for this res and another node wants to know the
+ * master nodeid.  During normal operation (non recovery) this is only
+ * called from receive_lookup(); master lookups when the local node is
+ * the dir node are done by find_rsb().
+ *
+ * normal operation, we are the dir node for a resource
+ * . _request_lock
+ * . set_master
+ * . send_lookup
+ * . receive_lookup
+ * . dlm_master_lookup flags 0
+ *
+ * recover directory, we are rebuilding dir for all resources
+ * . dlm_recover_directory
+ * . dlm_rcom_names
+ *   remote node sends back the rsb names it is master of and we are dir of
+ * . dlm_master_lookup RECOVER_DIR (fix_master 0, from_master 1)
+ *   we either create new rsb setting remote node as master, or find existing
+ *   rsb and set master to be the remote node.
+ *
+ * recover masters, we are finding the new master for resources
+ * . dlm_recover_masters
+ * . recover_master
+ * . dlm_send_rcom_lookup
+ * . receive_rcom_lookup
+ * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0)
+ */
+
+int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
+		      unsigned int flags, int *r_nodeid, int *result)
+{
+	struct dlm_rsb *r = NULL;
+	uint32_t hash, b;
+	int from_master = (flags & DLM_LU_RECOVER_DIR);
+	int fix_master = (flags & DLM_LU_RECOVER_MASTER);
+	int our_nodeid = dlm_our_nodeid();
+	int dir_nodeid, error, toss_list = 0;
+
+	if (len > DLM_RESNAME_MAXLEN)
+		return -EINVAL;
+
+	if (from_nodeid == our_nodeid) {
+		log_error(ls, "dlm_master_lookup from our_nodeid %d flags %x",
+			  our_nodeid, flags);
+		return -EINVAL;
+	}
+
+	hash = jhash(name, len, 0);
+	b = hash & (ls->ls_rsbtbl_size - 1);
+
+	dir_nodeid = dlm_hash2nodeid(ls, hash);
+	if (dir_nodeid != our_nodeid) {
+		log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d",
+			  from_nodeid, dir_nodeid, our_nodeid, hash,
+			  ls->ls_num_nodes);
+		*r_nodeid = -1;
+		return -EINVAL;
+	}
+
+ retry:
+	error = pre_rsb_struct(ls);
+	if (error < 0)
+		return error;
+
+	spin_lock(&ls->ls_rsbtbl[b].lock);
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+	if (!error) {
+		/* because the rsb is active, we need to lock_rsb before
+		   checking/changing re_master_nodeid */
+
+		hold_rsb(r);
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		lock_rsb(r);
+		goto found;
+	}
+
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+	if (error)
+		goto not_found;
+
+	/* because the rsb is inactive (on toss list), it's not refcounted
+	   and lock_rsb is not used, but is protected by the rsbtbl lock */
+
+	toss_list = 1;
+ found:
+	if (r->res_dir_nodeid != our_nodeid) {
+		/* should not happen, but may as well fix it and carry on */
+		log_error(ls, "dlm_master_lookup res_dir %d our %d %s",
+			  r->res_dir_nodeid, our_nodeid, r->res_name);
+		r->res_dir_nodeid = our_nodeid;
+	}
+
+	if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
+		/* Recovery uses this function to set a new master when
+		   the previous master failed.  Setting NEW_MASTER will
+		   force dlm_recover_masters to call recover_master on this
+		   rsb even though the res_nodeid is no longer removed. */
+
+		r->res_master_nodeid = from_nodeid;
+		r->res_nodeid = from_nodeid;
+		rsb_set_flag(r, RSB_NEW_MASTER);
+
+		if (toss_list) {
+			/* I don't think we should ever find it on toss list. */
+			log_error(ls, "dlm_master_lookup fix_master on toss");
+			dlm_dump_rsb(r);
+		}
+	}
+
+	if (from_master && (r->res_master_nodeid != from_nodeid)) {
+		/* this will happen if from_nodeid became master during
+		   a previous recovery cycle, and we aborted the previous
+		   cycle before recovering this master value */
+
+		log_limit(ls, "dlm_master_lookup from_master %d "
+			  "master_nodeid %d res_nodeid %d first %x %s",
+			  from_nodeid, r->res_master_nodeid, r->res_nodeid,
+			  r->res_first_lkid, r->res_name);
+
+		if (r->res_master_nodeid == our_nodeid) {
+			log_error(ls, "from_master %d our_master", from_nodeid);
+			dlm_dump_rsb(r);
+			dlm_send_rcom_lookup_dump(r, from_nodeid);
+			goto out_found;
+		}
+
+		r->res_master_nodeid = from_nodeid;
+		r->res_nodeid = from_nodeid;
+		rsb_set_flag(r, RSB_NEW_MASTER);
+	}
+
+	if (!r->res_master_nodeid) {
+		/* this will happen if recovery happens while we're looking
+		   up the master for this rsb */
+
+		log_debug(ls, "dlm_master_lookup master 0 to %d first %x %s",
+			  from_nodeid, r->res_first_lkid, r->res_name);
+		r->res_master_nodeid = from_nodeid;
+		r->res_nodeid = from_nodeid;
+	}
+
+	if (!from_master && !fix_master &&
+	    (r->res_master_nodeid == from_nodeid)) {
+		/* this can happen when the master sends remove, the dir node
+		   finds the rsb on the keep list and ignores the remove,
+		   and the former master sends a lookup */
+
+		log_limit(ls, "dlm_master_lookup from master %d flags %x "
+			  "first %x %s", from_nodeid, flags,
+			  r->res_first_lkid, r->res_name);
+	}
+
+ out_found:
+	*r_nodeid = r->res_master_nodeid;
+	if (result)
+		*result = DLM_LU_MATCH;
+
+	if (toss_list) {
+		r->res_toss_time = jiffies;
+		/* the rsb was inactive (on toss list) */
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+	} else {
+		/* the rsb was active */
+		unlock_rsb(r);
+		put_rsb(r);
+	}
+	return 0;
+
+ not_found:
+	error = get_rsb_struct(ls, name, len, &r);
+	if (error == -EAGAIN) {
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		goto retry;
+	}
+	if (error)
+		goto out_unlock;
+
+	r->res_hash = hash;
+	r->res_bucket = b;
+	r->res_dir_nodeid = our_nodeid;
+	r->res_master_nodeid = from_nodeid;
+	r->res_nodeid = from_nodeid;
+	kref_init(&r->res_ref);
+	r->res_toss_time = jiffies;
+
+	error = rsb_insert(r, &ls->ls_rsbtbl[b].toss);
+	if (error) {
+		/* should never happen */
+		dlm_free_rsb(r);
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		goto retry;
+	}
+
+	if (result)
+		*result = DLM_LU_ADD;
+	*r_nodeid = from_nodeid;
+	error = 0;
+ out_unlock:
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
+	return error;
+}
+
+static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash)
+{
+	struct rb_node *n;
+	struct dlm_rsb *r;
+	int i;
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		spin_lock(&ls->ls_rsbtbl[i].lock);
+		for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
+			r = rb_entry(n, struct dlm_rsb, res_hashnode);
+			if (r->res_hash == hash)
+				dlm_dump_rsb(r);
+		}
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+}
+
+void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len)
+{
+	struct dlm_rsb *r = NULL;
+	uint32_t hash, b;
+	int error;
+
+	hash = jhash(name, len, 0);
+	b = hash & (ls->ls_rsbtbl_size - 1);
+
+	spin_lock(&ls->ls_rsbtbl[b].lock);
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+	if (!error)
+		goto out_dump;
+
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+	if (error)
+		goto out;
+ out_dump:
+	dlm_dump_rsb(r);
+ out:
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
+}
+
+static void toss_rsb(struct kref *kref)
+{
+	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+	struct dlm_ls *ls = r->res_ls;
+
+	DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
+	kref_init(&r->res_ref);
+	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
+	rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
+	r->res_toss_time = jiffies;
+	ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK;
+	if (r->res_lvbptr) {
+		dlm_free_lvb(r->res_lvbptr);
+		r->res_lvbptr = NULL;
+	}
+}
+
+/* See comment for unhold_lkb */
+
+static void unhold_rsb(struct dlm_rsb *r)
+{
+	int rv;
+	rv = kref_put(&r->res_ref, toss_rsb);
+	DLM_ASSERT(!rv, dlm_dump_rsb(r););
+}
+
+static void kill_rsb(struct kref *kref)
+{
+	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+
+	/* All work is done after the return from kref_put() so we
+	   can release the write_lock before the remove and free. */
+
+	DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
+}
+
+/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
+   The rsb must exist as long as any lkb's for it do. */
+
+static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	hold_rsb(r);
+	lkb->lkb_resource = r;
+}
+
+static void detach_lkb(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_resource) {
+		put_rsb(lkb->lkb_resource);
+		lkb->lkb_resource = NULL;
+	}
+}
+
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+	struct dlm_lkb *lkb;
+	int rv;
+
+	lkb = dlm_allocate_lkb(ls);
+	if (!lkb)
+		return -ENOMEM;
+
+	lkb->lkb_nodeid = -1;
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	kref_init(&lkb->lkb_ref);
+	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
+	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
+	INIT_LIST_HEAD(&lkb->lkb_time_list);
+	INIT_LIST_HEAD(&lkb->lkb_cb_list);
+	mutex_init(&lkb->lkb_cb_mutex);
+	INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
+
+	idr_preload(GFP_NOFS);
+	spin_lock(&ls->ls_lkbidr_spin);
+	rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT);
+	if (rv >= 0)
+		lkb->lkb_id = rv;
+	spin_unlock(&ls->ls_lkbidr_spin);
+	idr_preload_end();
+
+	if (rv < 0) {
+		log_error(ls, "create_lkb idr error %d", rv);
+		return rv;
+	}
+
+	*lkb_ret = lkb;
+	return 0;
+}
+
+static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
+{
+	struct dlm_lkb *lkb;
+
+	spin_lock(&ls->ls_lkbidr_spin);
+	lkb = idr_find(&ls->ls_lkbidr, lkid);
+	if (lkb)
+		kref_get(&lkb->lkb_ref);
+	spin_unlock(&ls->ls_lkbidr_spin);
+
+	*lkb_ret = lkb;
+	return lkb ? 0 : -ENOENT;
+}
+
+static void kill_lkb(struct kref *kref)
+{
+	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+	/* All work is done after the return from kref_put() so we
+	   can release the write_lock before the detach_lkb */
+
+	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+}
+
+/* __put_lkb() is used when an lkb may not have an rsb attached to
+   it so we need to provide the lockspace explicitly */
+
+static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	uint32_t lkid = lkb->lkb_id;
+
+	spin_lock(&ls->ls_lkbidr_spin);
+	if (kref_put(&lkb->lkb_ref, kill_lkb)) {
+		idr_remove(&ls->ls_lkbidr, lkid);
+		spin_unlock(&ls->ls_lkbidr_spin);
+
+		detach_lkb(lkb);
+
+		/* for local/process lkbs, lvbptr points to caller's lksb */
+		if (lkb->lkb_lvbptr && is_master_copy(lkb))
+			dlm_free_lvb(lkb->lkb_lvbptr);
+		dlm_free_lkb(lkb);
+		return 1;
+	} else {
+		spin_unlock(&ls->ls_lkbidr_spin);
+		return 0;
+	}
+}
+
+int dlm_put_lkb(struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls;
+
+	DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
+	DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
+
+	ls = lkb->lkb_resource->res_ls;
+	return __put_lkb(ls, lkb);
+}
+
+/* This is only called to add a reference when the code already holds
+   a valid reference to the lkb, so there's no need for locking. */
+
+static inline void hold_lkb(struct dlm_lkb *lkb)
+{
+	kref_get(&lkb->lkb_ref);
+}
+
+/* This is called when we need to remove a reference and are certain
+   it's not the last ref.  e.g. del_lkb is always called between a
+   find_lkb/put_lkb and is always the inverse of a previous add_lkb.
+   put_lkb would work fine, but would involve unnecessary locking */
+
+static inline void unhold_lkb(struct dlm_lkb *lkb)
+{
+	int rv;
+	rv = kref_put(&lkb->lkb_ref, kill_lkb);
+	DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+}
+
+static void lkb_add_ordered(struct list_head *new, struct list_head *head,
+			    int mode)
+{
+	struct dlm_lkb *lkb = NULL;
+
+	list_for_each_entry(lkb, head, lkb_statequeue)
+		if (lkb->lkb_rqmode < mode)
+			break;
+
+	__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+}
+
+/* add/remove lkb to rsb's grant/convert/wait queue */
+
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
+{
+	kref_get(&lkb->lkb_ref);
+
+	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+
+	lkb->lkb_timestamp = ktime_get();
+
+	lkb->lkb_status = status;
+
+	switch (status) {
+	case DLM_LKSTS_WAITING:
+		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+			list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
+		else
+			list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
+		break;
+	case DLM_LKSTS_GRANTED:
+		/* convention says granted locks kept in order of grmode */
+		lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
+				lkb->lkb_grmode);
+		break;
+	case DLM_LKSTS_CONVERT:
+		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+			list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
+		else
+			list_add_tail(&lkb->lkb_statequeue,
+				      &r->res_convertqueue);
+		break;
+	default:
+		DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
+	}
+}
+
+static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	lkb->lkb_status = 0;
+	list_del(&lkb->lkb_statequeue);
+	unhold_lkb(lkb);
+}
+
+static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
+{
+	hold_lkb(lkb);
+	del_lkb(r, lkb);
+	add_lkb(r, lkb, sts);
+	unhold_lkb(lkb);
+}
+
+static int msg_reply_type(int mstype)
+{
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+		return DLM_MSG_REQUEST_REPLY;
+	case DLM_MSG_CONVERT:
+		return DLM_MSG_CONVERT_REPLY;
+	case DLM_MSG_UNLOCK:
+		return DLM_MSG_UNLOCK_REPLY;
+	case DLM_MSG_CANCEL:
+		return DLM_MSG_CANCEL_REPLY;
+	case DLM_MSG_LOOKUP:
+		return DLM_MSG_LOOKUP_REPLY;
+	}
+	return -1;
+}
+
+static int nodeid_warned(int nodeid, int num_nodes, int *warned)
+{
+	int i;
+
+	for (i = 0; i < num_nodes; i++) {
+		if (!warned[i]) {
+			warned[i] = nodeid;
+			return 0;
+		}
+		if (warned[i] == nodeid)
+			return 1;
+	}
+	return 0;
+}
+
+void dlm_scan_waiters(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	ktime_t zero = ktime_set(0, 0);
+	s64 us;
+	s64 debug_maxus = 0;
+	u32 debug_scanned = 0;
+	u32 debug_expired = 0;
+	int num_nodes = 0;
+	int *warned = NULL;
+
+	if (!dlm_config.ci_waitwarn_us)
+		return;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		if (ktime_equal(lkb->lkb_wait_time, zero))
+			continue;
+
+		debug_scanned++;
+
+		us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
+
+		if (us < dlm_config.ci_waitwarn_us)
+			continue;
+
+		lkb->lkb_wait_time = zero;
+
+		debug_expired++;
+		if (us > debug_maxus)
+			debug_maxus = us;
+
+		if (!num_nodes) {
+			num_nodes = ls->ls_num_nodes;
+			warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL);
+		}
+		if (!warned)
+			continue;
+		if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
+			continue;
+
+		log_error(ls, "waitwarn %x %lld %d us check connection to "
+			  "node %d", lkb->lkb_id, (long long)us,
+			  dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+	kfree(warned);
+
+	if (debug_expired)
+		log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
+			  debug_scanned, debug_expired,
+			  dlm_config.ci_waitwarn_us, (long long)debug_maxus);
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+   a reply from a remote node */
+
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error = 0;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	if (is_overlap_unlock(lkb) ||
+	    (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
+		switch (mstype) {
+		case DLM_MSG_UNLOCK:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			break;
+		case DLM_MSG_CANCEL:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			break;
+		default:
+			error = -EBUSY;
+			goto out;
+		}
+		lkb->lkb_wait_count++;
+		hold_lkb(lkb);
+
+		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
+			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
+			  lkb->lkb_wait_count, lkb->lkb_flags);
+		goto out;
+	}
+
+	DLM_ASSERT(!lkb->lkb_wait_count,
+		   dlm_print_lkb(lkb);
+		   printk("wait_count %d\n", lkb->lkb_wait_count););
+
+	lkb->lkb_wait_count++;
+	lkb->lkb_wait_type = mstype;
+	lkb->lkb_wait_time = ktime_get();
+	lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
+	hold_lkb(lkb);
+	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+ out:
+	if (error)
+		log_error(ls, "addwait error %x %d flags %x %d %d %s",
+			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
+			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
+	mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
+}
+
+/* We clear the RESEND flag because we might be taking an lkb off the waiters
+   list as part of process_requestqueue (e.g. a lookup that has an optimized
+   request reply on the requestqueue) between dlm_recover_waiters_pre() which
+   set RESEND and dlm_recover_waiters_post() */
+
+static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
+				struct dlm_message *ms)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int overlap_done = 0;
+
+	if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
+		log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id);
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		overlap_done = 1;
+		goto out_del;
+	}
+
+	if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
+		log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id);
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		overlap_done = 1;
+		goto out_del;
+	}
+
+	/* Cancel state was preemptively cleared by a successful convert,
+	   see next comment, nothing to do. */
+
+	if ((mstype == DLM_MSG_CANCEL_REPLY) &&
+	    (lkb->lkb_wait_type != DLM_MSG_CANCEL)) {
+		log_debug(ls, "remwait %x cancel_reply wait_type %d",
+			  lkb->lkb_id, lkb->lkb_wait_type);
+		return -1;
+	}
+
+	/* Remove for the convert reply, and premptively remove for the
+	   cancel reply.  A convert has been granted while there's still
+	   an outstanding cancel on it (the cancel is moot and the result
+	   in the cancel reply should be 0).  We preempt the cancel reply
+	   because the app gets the convert result and then can follow up
+	   with another op, like convert.  This subsequent op would see the
+	   lingering state of the cancel and fail with -EBUSY. */
+
+	if ((mstype == DLM_MSG_CONVERT_REPLY) &&
+	    (lkb->lkb_wait_type == DLM_MSG_CONVERT) &&
+	    is_overlap_cancel(lkb) && ms && !ms->m_result) {
+		log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
+			  lkb->lkb_id);
+		lkb->lkb_wait_type = 0;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_wait_count--;
+		goto out_del;
+	}
+
+	/* N.B. type of reply may not always correspond to type of original
+	   msg due to lookup->request optimization, verify others? */
+
+	if (lkb->lkb_wait_type) {
+		lkb->lkb_wait_type = 0;
+		goto out_del;
+	}
+
+	log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait",
+		  lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid,
+		  mstype, lkb->lkb_flags);
+	return -1;
+
+ out_del:
+	/* the force-unlock/cancel has completed and we haven't recvd a reply
+	   to the op that was in progress prior to the unlock/cancel; we
+	   give up on any reply to the earlier op.  FIXME: not sure when/how
+	   this would happen */
+
+	if (overlap_done && lkb->lkb_wait_type) {
+		log_error(ls, "remwait error %x reply %d wait_type %d overlap",
+			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
+		lkb->lkb_wait_count--;
+		lkb->lkb_wait_type = 0;
+	}
+
+	DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););
+
+	lkb->lkb_flags &= ~DLM_IFL_RESEND;
+	lkb->lkb_wait_count--;
+	if (!lkb->lkb_wait_count)
+		list_del_init(&lkb->lkb_wait_reply);
+	unhold_lkb(lkb);
+	return 0;
+}
+
+static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	error = _remove_from_waiters(lkb, mstype, NULL);
+	mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
+}
+
+/* Handles situations where we might be processing a "fake" or "stub" reply in
+   which we can't try to take waiters_mutex again. */
+
+static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error;
+
+	if (ms->m_flags != DLM_IFL_STUB_MS)
+		mutex_lock(&ls->ls_waiters_mutex);
+	error = _remove_from_waiters(lkb, ms->m_type, ms);
+	if (ms->m_flags != DLM_IFL_STUB_MS)
+		mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
+}
+
+/* If there's an rsb for the same resource being removed, ensure
+   that the remove message is sent before the new lookup message.
+   It should be rare to need a delay here, but if not, then it may
+   be worthwhile to add a proper wait mechanism rather than a delay. */
+
+static void wait_pending_remove(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+ restart:
+	spin_lock(&ls->ls_remove_spin);
+	if (ls->ls_remove_len &&
+	    !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) {
+		log_debug(ls, "delay lookup for remove dir %d %s",
+		  	  r->res_dir_nodeid, r->res_name);
+		spin_unlock(&ls->ls_remove_spin);
+		msleep(1);
+		goto restart;
+	}
+	spin_unlock(&ls->ls_remove_spin);
+}
+
+/*
+ * ls_remove_spin protects ls_remove_name and ls_remove_len which are
+ * read by other threads in wait_pending_remove.  ls_remove_names
+ * and ls_remove_lens are only used by the scan thread, so they do
+ * not need protection.
+ */
+
+static void shrink_bucket(struct dlm_ls *ls, int b)
+{
+	struct rb_node *n, *next;
+	struct dlm_rsb *r;
+	char *name;
+	int our_nodeid = dlm_our_nodeid();
+	int remote_count = 0;
+	int need_shrink = 0;
+	int i, len, rv;
+
+	memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);
+
+	spin_lock(&ls->ls_rsbtbl[b].lock);
+
+	if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) {
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		return;
+	}
+
+	for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
+		next = rb_next(n);
+		r = rb_entry(n, struct dlm_rsb, res_hashnode);
+
+		/* If we're the directory record for this rsb, and
+		   we're not the master of it, then we need to wait
+		   for the master node to send us a dir remove for
+		   before removing the dir record. */
+
+		if (!dlm_no_directory(ls) &&
+		    (r->res_master_nodeid != our_nodeid) &&
+		    (dlm_dir_nodeid(r) == our_nodeid)) {
+			continue;
+		}
+
+		need_shrink = 1;
+
+		if (!time_after_eq(jiffies, r->res_toss_time +
+				   dlm_config.ci_toss_secs * HZ)) {
+			continue;
+		}
+
+		if (!dlm_no_directory(ls) &&
+		    (r->res_master_nodeid == our_nodeid) &&
+		    (dlm_dir_nodeid(r) != our_nodeid)) {
+
+			/* We're the master of this rsb but we're not
+			   the directory record, so we need to tell the
+			   dir node to remove the dir record. */
+
+			ls->ls_remove_lens[remote_count] = r->res_length;
+			memcpy(ls->ls_remove_names[remote_count], r->res_name,
+			       DLM_RESNAME_MAXLEN);
+			remote_count++;
+
+			if (remote_count >= DLM_REMOVE_NAMES_MAX)
+				break;
+			continue;
+		}
+
+		if (!kref_put(&r->res_ref, kill_rsb)) {
+			log_error(ls, "tossed rsb in use %s", r->res_name);
+			continue;
+		}
+
+		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+		dlm_free_rsb(r);
+	}
+
+	if (need_shrink)
+		ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK;
+	else
+		ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK;
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
+
+	/*
+	 * While searching for rsb's to free, we found some that require
+	 * remote removal.  We leave them in place and find them again here
+	 * so there is a very small gap between removing them from the toss
+	 * list and sending the removal.  Keeping this gap small is
+	 * important to keep us (the master node) from being out of sync
+	 * with the remote dir node for very long.
+	 *
+	 * From the time the rsb is removed from toss until just after
+	 * send_remove, the rsb name is saved in ls_remove_name.  A new
+	 * lookup checks this to ensure that a new lookup message for the
+	 * same resource name is not sent just before the remove message.
+	 */
+
+	for (i = 0; i < remote_count; i++) {
+		name = ls->ls_remove_names[i];
+		len = ls->ls_remove_lens[i];
+
+		spin_lock(&ls->ls_rsbtbl[b].lock);
+		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+		if (rv) {
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
+			log_debug(ls, "remove_name not toss %s", name);
+			continue;
+		}
+
+		if (r->res_master_nodeid != our_nodeid) {
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
+			log_debug(ls, "remove_name master %d dir %d our %d %s",
+				  r->res_master_nodeid, r->res_dir_nodeid,
+				  our_nodeid, name);
+			continue;
+		}
+
+		if (r->res_dir_nodeid == our_nodeid) {
+			/* should never happen */
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
+			log_error(ls, "remove_name dir %d master %d our %d %s",
+				  r->res_dir_nodeid, r->res_master_nodeid,
+				  our_nodeid, name);
+			continue;
+		}
+
+		if (!time_after_eq(jiffies, r->res_toss_time +
+				   dlm_config.ci_toss_secs * HZ)) {
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
+			log_debug(ls, "remove_name toss_time %lu now %lu %s",
+				  r->res_toss_time, jiffies, name);
+			continue;
+		}
+
+		if (!kref_put(&r->res_ref, kill_rsb)) {
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
+			log_error(ls, "remove_name in use %s", name);
+			continue;
+		}
+
+		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+
+		/* block lookup of same name until we've sent remove */
+		spin_lock(&ls->ls_remove_spin);
+		ls->ls_remove_len = len;
+		memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
+		spin_unlock(&ls->ls_remove_spin);
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+
+		send_remove(r);
+
+		/* allow lookup of name again */
+		spin_lock(&ls->ls_remove_spin);
+		ls->ls_remove_len = 0;
+		memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
+		spin_unlock(&ls->ls_remove_spin);
+
+		dlm_free_rsb(r);
+	}
+}
+
+void dlm_scan_rsbs(struct dlm_ls *ls)
+{
+	int i;
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		shrink_bucket(ls, i);
+		if (dlm_locking_stopped(ls))
+			break;
+		cond_resched();
+	}
+}
+
+static void add_timeout(struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+
+	if (is_master_copy(lkb))
+		return;
+
+	if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
+	    !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
+		lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN;
+		goto add_it;
+	}
+	if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
+		goto add_it;
+	return;
+
+ add_it:
+	DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
+	mutex_lock(&ls->ls_timeout_mutex);
+	hold_lkb(lkb);
+	list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
+	mutex_unlock(&ls->ls_timeout_mutex);
+}
+
+static void del_timeout(struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+
+	mutex_lock(&ls->ls_timeout_mutex);
+	if (!list_empty(&lkb->lkb_time_list)) {
+		list_del_init(&lkb->lkb_time_list);
+		unhold_lkb(lkb);
+	}
+	mutex_unlock(&ls->ls_timeout_mutex);
+}
+
+/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and
+   lkb_lksb_timeout without lock_rsb?  Note: we can't lock timeout_mutex
+   and then lock rsb because of lock ordering in add_timeout.  We may need
+   to specify some special timeout-related bits in the lkb that are just to
+   be accessed under the timeout_mutex. */
+
+void dlm_scan_timeout(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+	int do_cancel, do_warn;
+	s64 wait_us;
+
+	for (;;) {
+		if (dlm_locking_stopped(ls))
+			break;
+
+		do_cancel = 0;
+		do_warn = 0;
+		mutex_lock(&ls->ls_timeout_mutex);
+		list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
+
+			wait_us = ktime_to_us(ktime_sub(ktime_get(),
+					      		lkb->lkb_timestamp));
+
+			if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
+			    wait_us >= (lkb->lkb_timeout_cs * 10000))
+				do_cancel = 1;
+
+			if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
+			    wait_us >= dlm_config.ci_timewarn_cs * 10000)
+				do_warn = 1;
+
+			if (!do_cancel && !do_warn)
+				continue;
+			hold_lkb(lkb);
+			break;
+		}
+		mutex_unlock(&ls->ls_timeout_mutex);
+
+		if (!do_cancel && !do_warn)
+			break;
+
+		r = lkb->lkb_resource;
+		hold_rsb(r);
+		lock_rsb(r);
+
+		if (do_warn) {
+			/* clear flag so we only warn once */
+			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
+			if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT))
+				del_timeout(lkb);
+			dlm_timeout_warn(lkb);
+		}
+
+		if (do_cancel) {
+			log_debug(ls, "timeout cancel %x node %d %s",
+				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
+			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
+			lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL;
+			del_timeout(lkb);
+			_cancel_lock(r, lkb);
+		}
+
+		unlock_rsb(r);
+		unhold_rsb(r);
+		dlm_put_lkb(lkb);
+	}
+}
+
+/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping
+   dlm_recoverd before checking/setting ls_recover_begin. */
+
+void dlm_adjust_timeouts(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
+
+	ls->ls_recover_begin = 0;
+	mutex_lock(&ls->ls_timeout_mutex);
+	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
+		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
+	mutex_unlock(&ls->ls_timeout_mutex);
+
+	if (!dlm_config.ci_waitwarn_us)
+		return;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		if (ktime_to_us(lkb->lkb_wait_time))
+			lkb->lkb_wait_time = ktime_get();
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+}
+
+/* lkb is master or local copy */
+
+static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int b, len = r->res_ls->ls_lvblen;
+
+	/* b=1 lvb returned to caller
+	   b=0 lvb written to rsb or invalidated
+	   b=-1 do nothing */
+
+	b =  dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+
+	if (b == 1) {
+		if (!lkb->lkb_lvbptr)
+			return;
+
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			return;
+
+		if (!r->res_lvbptr)
+			return;
+
+		memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
+		lkb->lkb_lvbseq = r->res_lvbseq;
+
+	} else if (b == 0) {
+		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+			rsb_set_flag(r, RSB_VALNOTVALID);
+			return;
+		}
+
+		if (!lkb->lkb_lvbptr)
+			return;
+
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			return;
+
+		if (!r->res_lvbptr)
+			r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
+
+		if (!r->res_lvbptr)
+			return;
+
+		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
+		r->res_lvbseq++;
+		lkb->lkb_lvbseq = r->res_lvbseq;
+		rsb_clear_flag(r, RSB_VALNOTVALID);
+	}
+
+	if (rsb_flag(r, RSB_VALNOTVALID))
+		lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
+}
+
+static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_grmode < DLM_LOCK_PW)
+		return;
+
+	if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+		rsb_set_flag(r, RSB_VALNOTVALID);
+		return;
+	}
+
+	if (!lkb->lkb_lvbptr)
+		return;
+
+	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return;
+
+	if (!r->res_lvbptr)
+		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
+
+	if (!r->res_lvbptr)
+		return;
+
+	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+	r->res_lvbseq++;
+	rsb_clear_flag(r, RSB_VALNOTVALID);
+}
+
+/* lkb is process copy (pc) */
+
+static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			    struct dlm_message *ms)
+{
+	int b;
+
+	if (!lkb->lkb_lvbptr)
+		return;
+
+	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return;
+
+	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+	if (b == 1) {
+		int len = receive_extralen(ms);
+		if (len > r->res_ls->ls_lvblen)
+			len = r->res_ls->ls_lvblen;
+		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+		lkb->lkb_lvbseq = ms->m_lvbseq;
+	}
+}
+
+/* Manipulate lkb's on rsb's convert/granted/waiting queues
+   remove_lock -- used for unlock, removes lkb from granted
+   revert_lock -- used for cancel, moves lkb from convert to granted
+   grant_lock  -- used for request and convert, adds lkb to granted or
+                  moves lkb from convert or waiting to granted
+
+   Each of these is used for master or local copy lkb's.  There is
+   also a _pc() variation used to make the corresponding change on
+   a process copy (pc) lkb. */
+
+static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	del_lkb(r, lkb);
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	/* this unhold undoes the original ref from create_lkb()
+	   so this leads to the lkb being freed */
+	unhold_lkb(lkb);
+}
+
+static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	set_lvb_unlock(r, lkb);
+	_remove_lock(r, lkb);
+}
+
+static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	_remove_lock(r, lkb);
+}
+
+/* returns: 0 did nothing
+	    1 moved lock to granted
+	   -1 removed lock */
+
+static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int rv = 0;
+
+	lkb->lkb_rqmode = DLM_LOCK_IV;
+
+	switch (lkb->lkb_status) {
+	case DLM_LKSTS_GRANTED:
+		break;
+	case DLM_LKSTS_CONVERT:
+		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+		rv = 1;
+		break;
+	case DLM_LKSTS_WAITING:
+		del_lkb(r, lkb);
+		lkb->lkb_grmode = DLM_LOCK_IV;
+		/* this unhold undoes the original ref from create_lkb()
+		   so this leads to the lkb being freed */
+		unhold_lkb(lkb);
+		rv = -1;
+		break;
+	default:
+		log_print("invalid status for revert %d", lkb->lkb_status);
+	}
+	return rv;
+}
+
+static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	return revert_lock(r, lkb);
+}
+
+static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_grmode != lkb->lkb_rqmode) {
+		lkb->lkb_grmode = lkb->lkb_rqmode;
+		if (lkb->lkb_status)
+			move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+		else
+			add_lkb(r, lkb, DLM_LKSTS_GRANTED);
+	}
+
+	lkb->lkb_rqmode = DLM_LOCK_IV;
+	lkb->lkb_highbast = 0;
+}
+
+static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	set_lvb_lock(r, lkb);
+	_grant_lock(r, lkb);
+}
+
+static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			  struct dlm_message *ms)
+{
+	set_lvb_lock_pc(r, lkb, ms);
+	_grant_lock(r, lkb);
+}
+
+/* called by grant_pending_locks() which means an async grant message must
+   be sent to the requesting node in addition to granting the lock if the
+   lkb belongs to a remote node. */
+
+static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	grant_lock(r, lkb);
+	if (is_master_copy(lkb))
+		send_grant(r, lkb);
+	else
+		queue_cast(r, lkb, 0);
+}
+
+/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
+   change the granted/requested modes.  We're munging things accordingly in
+   the process copy.
+   CONVDEADLK: our grmode may have been forced down to NL to resolve a
+   conversion deadlock
+   ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
+   compatible with other granted locks */
+
+static void munge_demoted(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
+		log_print("munge_demoted %x invalid modes gr %d rq %d",
+			  lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
+		return;
+	}
+
+	lkb->lkb_grmode = DLM_LOCK_NL;
+}
+
+static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	if (ms->m_type != DLM_MSG_REQUEST_REPLY &&
+	    ms->m_type != DLM_MSG_GRANT) {
+		log_print("munge_altmode %x invalid reply type %d",
+			  lkb->lkb_id, ms->m_type);
+		return;
+	}
+
+	if (lkb->lkb_exflags & DLM_LKF_ALTPR)
+		lkb->lkb_rqmode = DLM_LOCK_PR;
+	else if (lkb->lkb_exflags & DLM_LKF_ALTCW)
+		lkb->lkb_rqmode = DLM_LOCK_CW;
+	else {
+		log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
+		dlm_print_lkb(lkb);
+	}
+}
+
+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
+{
+	struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
+					   lkb_statequeue);
+	if (lkb->lkb_id == first->lkb_id)
+		return 1;
+
+	return 0;
+}
+
+/* Check if the given lkb conflicts with another lkb on the queue. */
+
+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *this;
+
+	list_for_each_entry(this, head, lkb_statequeue) {
+		if (this == lkb)
+			continue;
+		if (!modes_compat(this, lkb))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * "A conversion deadlock arises with a pair of lock requests in the converting
+ * queue for one resource.  The granted mode of each lock blocks the requested
+ * mode of the other lock."
+ *
+ * Part 2: if the granted mode of lkb is preventing an earlier lkb in the
+ * convert queue from being granted, then deadlk/demote lkb.
+ *
+ * Example:
+ * Granted Queue: empty
+ * Convert Queue: NL->EX (first lock)
+ *                PR->EX (second lock)
+ *
+ * The first lock can't be granted because of the granted mode of the second
+ * lock and the second lock can't be granted because it's not first in the
+ * list.  We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we
+ * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK
+ * flag set and return DEMOTED in the lksb flags.
+ *
+ * Originally, this function detected conv-deadlk in a more limited scope:
+ * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or
+ * - if lkb1 was the first entry in the queue (not just earlier), and was
+ *   blocked by the granted mode of lkb2, and there was nothing on the
+ *   granted queue preventing lkb1 from being granted immediately, i.e.
+ *   lkb2 was the only thing preventing lkb1 from being granted.
+ *
+ * That second condition meant we'd only say there was conv-deadlk if
+ * resolving it (by demotion) would lead to the first lock on the convert
+ * queue being granted right away.  It allowed conversion deadlocks to exist
+ * between locks on the convert queue while they couldn't be granted anyway.
+ *
+ * Now, we detect and take action on conversion deadlocks immediately when
+ * they're created, even if they may not be immediately consequential.  If
+ * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted
+ * mode that would prevent lkb1's conversion from being granted, we do a
+ * deadlk/demote on lkb2 right away and don't let it onto the convert queue.
+ * I think this means that the lkb_is_ahead condition below should always
+ * be zero, i.e. there will never be conv-deadlk between two locks that are
+ * both already on the convert queue.
+ */
+
+static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2)
+{
+	struct dlm_lkb *lkb1;
+	int lkb_is_ahead = 0;
+
+	list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) {
+		if (lkb1 == lkb2) {
+			lkb_is_ahead = 1;
+			continue;
+		}
+
+		if (!lkb_is_ahead) {
+			if (!modes_compat(lkb2, lkb1))
+				return 1;
+		} else {
+			if (!modes_compat(lkb2, lkb1) &&
+			    !modes_compat(lkb1, lkb2))
+				return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Return 1 if the lock can be granted, 0 otherwise.
+ * Also detect and resolve conversion deadlocks.
+ *
+ * lkb is the lock to be granted
+ *
+ * now is 1 if the function is being called in the context of the
+ * immediate request, it is 0 if called later, after the lock has been
+ * queued.
+ *
+ * recover is 1 if dlm_recover_grant() is trying to grant conversions
+ * after recovery.
+ *
+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
+ */
+
+static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
+			   int recover)
+{
+	int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
+
+	/*
+	 * 6-10: Version 5.4 introduced an option to address the phenomenon of
+	 * a new request for a NL mode lock being blocked.
+	 *
+	 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
+	 * request, then it would be granted.  In essence, the use of this flag
+	 * tells the Lock Manager to expedite theis request by not considering
+	 * what may be in the CONVERTING or WAITING queues...  As of this
+	 * writing, the EXPEDITE flag can be used only with new requests for NL
+	 * mode locks.  This flag is not valid for conversion requests.
+	 *
+	 * A shortcut.  Earlier checks return an error if EXPEDITE is used in a
+	 * conversion or used with a non-NL requested mode.  We also know an
+	 * EXPEDITE request is always granted immediately, so now must always
+	 * be 1.  The full condition to grant an expedite request: (now &&
+	 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
+	 * therefore be shortened to just checking the flag.
+	 */
+
+	if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
+		return 1;
+
+	/*
+	 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
+	 * added to the remaining conditions.
+	 */
+
+	if (queue_conflict(&r->res_grantqueue, lkb))
+		return 0;
+
+	/*
+	 * 6-3: By default, a conversion request is immediately granted if the
+	 * requested mode is compatible with the modes of all other granted
+	 * locks
+	 */
+
+	if (queue_conflict(&r->res_convertqueue, lkb))
+		return 0;
+
+	/*
+	 * The RECOVER_GRANT flag means dlm_recover_grant() is granting
+	 * locks for a recovered rsb, on which lkb's have been rebuilt.
+	 * The lkb's may have been rebuilt on the queues in a different
+	 * order than they were in on the previous master.  So, granting
+	 * queued conversions in order after recovery doesn't make sense
+	 * since the order hasn't been preserved anyway.  The new order
+	 * could also have created a new "in place" conversion deadlock.
+	 * (e.g. old, failed master held granted EX, with PR->EX, NL->EX.
+	 * After recovery, there would be no granted locks, and possibly
+	 * NL->EX, PR->EX, an in-place conversion deadlock.)  So, after
+	 * recovery, grant conversions without considering order.
+	 */
+
+	if (conv && recover)
+		return 1;
+
+	/*
+	 * 6-5: But the default algorithm for deciding whether to grant or
+	 * queue conversion requests does not by itself guarantee that such
+	 * requests are serviced on a "first come first serve" basis.  This, in
+	 * turn, can lead to a phenomenon known as "indefinate postponement".
+	 *
+	 * 6-7: This issue is dealt with by using the optional QUECVT flag with
+	 * the system service employed to request a lock conversion.  This flag
+	 * forces certain conversion requests to be queued, even if they are
+	 * compatible with the granted modes of other locks on the same
+	 * resource.  Thus, the use of this flag results in conversion requests
+	 * being ordered on a "first come first servce" basis.
+	 *
+	 * DCT: This condition is all about new conversions being able to occur
+	 * "in place" while the lock remains on the granted queue (assuming
+	 * nothing else conflicts.)  IOW if QUECVT isn't set, a conversion
+	 * doesn't _have_ to go onto the convert queue where it's processed in
+	 * order.  The "now" variable is necessary to distinguish converts
+	 * being received and processed for the first time now, because once a
+	 * convert is moved to the conversion queue the condition below applies
+	 * requiring fifo granting.
+	 */
+
+	if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
+		return 1;
+
+	/*
+	 * Even if the convert is compat with all granted locks,
+	 * QUECVT forces it behind other locks on the convert queue.
+	 */
+
+	if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) {
+		if (list_empty(&r->res_convertqueue))
+			return 1;
+		else
+			return 0;
+	}
+
+	/*
+	 * The NOORDER flag is set to avoid the standard vms rules on grant
+	 * order.
+	 */
+
+	if (lkb->lkb_exflags & DLM_LKF_NOORDER)
+		return 1;
+
+	/*
+	 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
+	 * granted until all other conversion requests ahead of it are granted
+	 * and/or canceled.
+	 */
+
+	if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
+		return 1;
+
+	/*
+	 * 6-4: By default, a new request is immediately granted only if all
+	 * three of the following conditions are satisfied when the request is
+	 * issued:
+	 * - The queue of ungranted conversion requests for the resource is
+	 *   empty.
+	 * - The queue of ungranted new requests for the resource is empty.
+	 * - The mode of the new request is compatible with the most
+	 *   restrictive mode of all granted locks on the resource.
+	 */
+
+	if (now && !conv && list_empty(&r->res_convertqueue) &&
+	    list_empty(&r->res_waitqueue))
+		return 1;
+
+	/*
+	 * 6-4: Once a lock request is in the queue of ungranted new requests,
+	 * it cannot be granted until the queue of ungranted conversion
+	 * requests is empty, all ungranted new requests ahead of it are
+	 * granted and/or canceled, and it is compatible with the granted mode
+	 * of the most restrictive lock granted on the resource.
+	 */
+
+	if (!now && !conv && list_empty(&r->res_convertqueue) &&
+	    first_in_list(lkb, &r->res_waitqueue))
+		return 1;
+
+	return 0;
+}
+
+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
+			  int recover, int *err)
+{
+	int rv;
+	int8_t alt = 0, rqmode = lkb->lkb_rqmode;
+	int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV);
+
+	if (err)
+		*err = 0;
+
+	rv = _can_be_granted(r, lkb, now, recover);
+	if (rv)
+		goto out;
+
+	/*
+	 * The CONVDEADLK flag is non-standard and tells the dlm to resolve
+	 * conversion deadlocks by demoting grmode to NL, otherwise the dlm
+	 * cancels one of the locks.
+	 */
+
+	if (is_convert && can_be_queued(lkb) &&
+	    conversion_deadlock_detect(r, lkb)) {
+		if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) {
+			lkb->lkb_grmode = DLM_LOCK_NL;
+			lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
+		} else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
+			if (err)
+				*err = -EDEADLK;
+			else {
+				log_print("can_be_granted deadlock %x now %d",
+					  lkb->lkb_id, now);
+				dlm_dump_rsb(r);
+			}
+		}
+		goto out;
+	}
+
+	/*
+	 * The ALTPR and ALTCW flags are non-standard and tell the dlm to try
+	 * to grant a request in a mode other than the normal rqmode.  It's a
+	 * simple way to provide a big optimization to applications that can
+	 * use them.
+	 */
+
+	if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR))
+		alt = DLM_LOCK_PR;
+	else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW))
+		alt = DLM_LOCK_CW;
+
+	if (alt) {
+		lkb->lkb_rqmode = alt;
+		rv = _can_be_granted(r, lkb, now, 0);
+		if (rv)
+			lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
+		else
+			lkb->lkb_rqmode = rqmode;
+	}
+ out:
+	return rv;
+}
+
+/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock
+   for locks pending on the convert list.  Once verified (watch for these
+   log_prints), we should be able to just call _can_be_granted() and not
+   bother with the demote/deadlk cases here (and there's no easy way to deal
+   with a deadlk here, we'd have to generate something like grant_lock with
+   the deadlk error.) */
+
+/* Returns the highest requested mode of all blocked conversions; sets
+   cw if there's a blocked conversion to DLM_LOCK_CW. */
+
+static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw,
+				 unsigned int *count)
+{
+	struct dlm_lkb *lkb, *s;
+	int recover = rsb_flag(r, RSB_RECOVER_GRANT);
+	int hi, demoted, quit, grant_restart, demote_restart;
+	int deadlk;
+
+	quit = 0;
+ restart:
+	grant_restart = 0;
+	demote_restart = 0;
+	hi = DLM_LOCK_IV;
+
+	list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
+		demoted = is_demoted(lkb);
+		deadlk = 0;
+
+		if (can_be_granted(r, lkb, 0, recover, &deadlk)) {
+			grant_lock_pending(r, lkb);
+			grant_restart = 1;
+			if (count)
+				(*count)++;
+			continue;
+		}
+
+		if (!demoted && is_demoted(lkb)) {
+			log_print("WARN: pending demoted %x node %d %s",
+				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
+			demote_restart = 1;
+			continue;
+		}
+
+		if (deadlk) {
+			log_print("WARN: pending deadlock %x node %d %s",
+				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
+			dlm_dump_rsb(r);
+			continue;
+		}
+
+		hi = max_t(int, lkb->lkb_rqmode, hi);
+
+		if (cw && lkb->lkb_rqmode == DLM_LOCK_CW)
+			*cw = 1;
+	}
+
+	if (grant_restart)
+		goto restart;
+	if (demote_restart && !quit) {
+		quit = 1;
+		goto restart;
+	}
+
+	return max_t(int, high, hi);
+}
+
+static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw,
+			      unsigned int *count)
+{
+	struct dlm_lkb *lkb, *s;
+
+	list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
+		if (can_be_granted(r, lkb, 0, 0, NULL)) {
+			grant_lock_pending(r, lkb);
+			if (count)
+				(*count)++;
+		} else {
+			high = max_t(int, lkb->lkb_rqmode, high);
+			if (lkb->lkb_rqmode == DLM_LOCK_CW)
+				*cw = 1;
+		}
+	}
+
+	return high;
+}
+
+/* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked
+   on either the convert or waiting queue.
+   high is the largest rqmode of all locks blocked on the convert or
+   waiting queue. */
+
+static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw)
+{
+	if (gr->lkb_grmode == DLM_LOCK_PR && cw) {
+		if (gr->lkb_highbast < DLM_LOCK_EX)
+			return 1;
+		return 0;
+	}
+
+	if (gr->lkb_highbast < high &&
+	    !__dlm_compat_matrix[gr->lkb_grmode+1][high+1])
+		return 1;
+	return 0;
+}
+
+static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count)
+{
+	struct dlm_lkb *lkb, *s;
+	int high = DLM_LOCK_IV;
+	int cw = 0;
+
+	if (!is_master(r)) {
+		log_print("grant_pending_locks r nodeid %d", r->res_nodeid);
+		dlm_dump_rsb(r);
+		return;
+	}
+
+	high = grant_pending_convert(r, high, &cw, count);
+	high = grant_pending_wait(r, high, &cw, count);
+
+	if (high == DLM_LOCK_IV)
+		return;
+
+	/*
+	 * If there are locks left on the wait/convert queue then send blocking
+	 * ASTs to granted locks based on the largest requested mode (high)
+	 * found above.
+	 */
+
+	list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
+		if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) {
+			if (cw && high == DLM_LOCK_PR &&
+			    lkb->lkb_grmode == DLM_LOCK_PR)
+				queue_bast(r, lkb, DLM_LOCK_CW);
+			else
+				queue_bast(r, lkb, high);
+			lkb->lkb_highbast = high;
+		}
+	}
+}
+
+static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq)
+{
+	if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) ||
+	    (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) {
+		if (gr->lkb_highbast < DLM_LOCK_EX)
+			return 1;
+		return 0;
+	}
+
+	if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq))
+		return 1;
+	return 0;
+}
+
+static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
+			    struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *gr;
+
+	list_for_each_entry(gr, head, lkb_statequeue) {
+		/* skip self when sending basts to convertqueue */
+		if (gr == lkb)
+			continue;
+		if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
+			queue_bast(r, gr, lkb->lkb_rqmode);
+			gr->lkb_highbast = lkb->lkb_rqmode;
+		}
+	}
+}
+
+static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	send_bast_queue(r, &r->res_grantqueue, lkb);
+}
+
+static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	send_bast_queue(r, &r->res_grantqueue, lkb);
+	send_bast_queue(r, &r->res_convertqueue, lkb);
+}
+
+/* set_master(r, lkb) -- set the master nodeid of a resource
+
+   The purpose of this function is to set the nodeid field in the given
+   lkb using the nodeid field in the given rsb.  If the rsb's nodeid is
+   known, it can just be copied to the lkb and the function will return
+   0.  If the rsb's nodeid is _not_ known, it needs to be looked up
+   before it can be copied to the lkb.
+
+   When the rsb nodeid is being looked up remotely, the initial lkb
+   causing the lookup is kept on the ls_waiters list waiting for the
+   lookup reply.  Other lkb's waiting for the same rsb lookup are kept
+   on the rsb's res_lookup list until the master is verified.
+
+   Return values:
+   0: nodeid is set in rsb/lkb and the caller should go ahead and use it
+   1: the rsb master is not available and the lkb has been placed on
+      a wait queue
+*/
+
+static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int our_nodeid = dlm_our_nodeid();
+
+	if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
+		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = lkb->lkb_id;
+		lkb->lkb_nodeid = r->res_nodeid;
+		return 0;
+	}
+
+	if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
+		list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
+		return 1;
+	}
+
+	if (r->res_master_nodeid == our_nodeid) {
+		lkb->lkb_nodeid = 0;
+		return 0;
+	}
+
+	if (r->res_master_nodeid) {
+		lkb->lkb_nodeid = r->res_master_nodeid;
+		return 0;
+	}
+
+	if (dlm_dir_nodeid(r) == our_nodeid) {
+		/* This is a somewhat unusual case; find_rsb will usually
+		   have set res_master_nodeid when dir nodeid is local, but
+		   there are cases where we become the dir node after we've
+		   past find_rsb and go through _request_lock again.
+		   confirm_master() or process_lookup_list() needs to be
+		   called after this. */
+		log_debug(r->res_ls, "set_master %x self master %d dir %d %s",
+			  lkb->lkb_id, r->res_master_nodeid, r->res_dir_nodeid,
+			  r->res_name);
+		r->res_master_nodeid = our_nodeid;
+		r->res_nodeid = 0;
+		lkb->lkb_nodeid = 0;
+		return 0;
+	}
+
+	wait_pending_remove(r);
+
+	r->res_first_lkid = lkb->lkb_id;
+	send_lookup(r, lkb);
+	return 1;
+}
+
+static void process_lookup_list(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
+		list_del_init(&lkb->lkb_rsb_lookup);
+		_request_lock(r, lkb);
+		schedule();
+	}
+}
+
+/* confirm_master -- confirm (or deny) an rsb's master nodeid */
+
+static void confirm_master(struct dlm_rsb *r, int error)
+{
+	struct dlm_lkb *lkb;
+
+	if (!r->res_first_lkid)
+		return;
+
+	switch (error) {
+	case 0:
+	case -EINPROGRESS:
+		r->res_first_lkid = 0;
+		process_lookup_list(r);
+		break;
+
+	case -EAGAIN:
+	case -EBADR:
+	case -ENOTBLK:
+		/* the remote request failed and won't be retried (it was
+		   a NOQUEUE, or has been canceled/unlocked); make a waiting
+		   lkb the first_lkid */
+
+		r->res_first_lkid = 0;
+
+		if (!list_empty(&r->res_lookup)) {
+			lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
+					 lkb_rsb_lookup);
+			list_del_init(&lkb->lkb_rsb_lookup);
+			r->res_first_lkid = lkb->lkb_id;
+			_request_lock(r, lkb);
+		}
+		break;
+
+	default:
+		log_error(r->res_ls, "confirm_master unknown error %d", error);
+	}
+}
+
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+			 int namelen, unsigned long timeout_cs,
+			 void (*ast) (void *astparam),
+			 void *astparam,
+			 void (*bast) (void *astparam, int mode),
+			 struct dlm_args *args)
+{
+	int rv = -EINVAL;
+
+	/* check for invalid arg usage */
+
+	if (mode < 0 || mode > DLM_LOCK_EX)
+		goto out;
+
+	if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
+		goto out;
+
+	if (flags & DLM_LKF_CANCEL)
+		goto out;
+
+	if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
+		goto out;
+
+	if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
+		goto out;
+
+	if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
+		goto out;
+
+	if (!ast || !lksb)
+		goto out;
+
+	if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
+		goto out;
+
+	if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
+		goto out;
+
+	/* these args will be copied to the lkb in validate_lock_args,
+	   it cannot be done now because when converting locks, fields in
+	   an active lkb cannot be modified before locking the rsb */
+
+	args->flags = flags;
+	args->astfn = ast;
+	args->astparam = astparam;
+	args->bastfn = bast;
+	args->timeout = timeout_cs;
+	args->mode = mode;
+	args->lksb = lksb;
+	rv = 0;
+ out:
+	return rv;
+}
+
+static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
+{
+	if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
+ 		      DLM_LKF_FORCEUNLOCK))
+		return -EINVAL;
+
+	if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK)
+		return -EINVAL;
+
+	args->flags = flags;
+	args->astparam = astarg;
+	return 0;
+}
+
+static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			      struct dlm_args *args)
+{
+	int rv = -EINVAL;
+
+	if (args->flags & DLM_LKF_CONVERT) {
+		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+			goto out;
+
+		if (args->flags & DLM_LKF_QUECVT &&
+		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+			goto out;
+
+		rv = -EBUSY;
+		if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+			goto out;
+
+		if (lkb->lkb_wait_type)
+			goto out;
+
+		if (is_overlap(lkb))
+			goto out;
+	}
+
+	lkb->lkb_exflags = args->flags;
+	lkb->lkb_sbflags = 0;
+	lkb->lkb_astfn = args->astfn;
+	lkb->lkb_astparam = args->astparam;
+	lkb->lkb_bastfn = args->bastfn;
+	lkb->lkb_rqmode = args->mode;
+	lkb->lkb_lksb = args->lksb;
+	lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
+	lkb->lkb_ownpid = (int) current->pid;
+	lkb->lkb_timeout_cs = args->timeout;
+	rv = 0;
+ out:
+	if (rv)
+		log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
+			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
+			  lkb->lkb_status, lkb->lkb_wait_type,
+			  lkb->lkb_resource->res_name);
+	return rv;
+}
+
+/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
+   for success */
+
+/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
+   because there may be a lookup in progress and it's valid to do
+   cancel/unlockf on it */
+
+static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int rv = -EINVAL;
+
+	if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
+		log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
+		dlm_print_lkb(lkb);
+		goto out;
+	}
+
+	/* an lkb may still exist even though the lock is EOL'ed due to a
+	   cancel, unlock or failed noqueue request; an app can't use these
+	   locks; return same error as if the lkid had not been found at all */
+
+	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
+		log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
+		rv = -ENOENT;
+		goto out;
+	}
+
+	/* an lkb may be waiting for an rsb lookup to complete where the
+	   lookup was initiated by another lock */
+
+	if (!list_empty(&lkb->lkb_rsb_lookup)) {
+		if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
+			log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
+			list_del_init(&lkb->lkb_rsb_lookup);
+			queue_cast(lkb->lkb_resource, lkb,
+				   args->flags & DLM_LKF_CANCEL ?
+				   -DLM_ECANCEL : -DLM_EUNLOCK);
+			unhold_lkb(lkb); /* undoes create_lkb() */
+		}
+		/* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
+		rv = -EBUSY;
+		goto out;
+	}
+
+	/* cancel not allowed with another cancel/unlock in progress */
+
+	if (args->flags & DLM_LKF_CANCEL) {
+		if (lkb->lkb_exflags & DLM_LKF_CANCEL)
+			goto out;
+
+		if (is_overlap(lkb))
+			goto out;
+
+		/* don't let scand try to do a cancel */
+		del_timeout(lkb);
+
+		if (lkb->lkb_flags & DLM_IFL_RESEND) {
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			rv = -EBUSY;
+			goto out;
+		}
+
+		/* there's nothing to cancel */
+		if (lkb->lkb_status == DLM_LKSTS_GRANTED &&
+		    !lkb->lkb_wait_type) {
+			rv = -EBUSY;
+			goto out;
+		}
+
+		switch (lkb->lkb_wait_type) {
+		case DLM_MSG_LOOKUP:
+		case DLM_MSG_REQUEST:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			rv = -EBUSY;
+			goto out;
+		case DLM_MSG_UNLOCK:
+		case DLM_MSG_CANCEL:
+			goto out;
+		}
+		/* add_to_waiters() will set OVERLAP_CANCEL */
+		goto out_ok;
+	}
+
+	/* do we need to allow a force-unlock if there's a normal unlock
+	   already in progress?  in what conditions could the normal unlock
+	   fail such that we'd want to send a force-unlock to be sure? */
+
+	if (args->flags & DLM_LKF_FORCEUNLOCK) {
+		if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
+			goto out;
+
+		if (is_overlap_unlock(lkb))
+			goto out;
+
+		/* don't let scand try to do a cancel */
+		del_timeout(lkb);
+
+		if (lkb->lkb_flags & DLM_IFL_RESEND) {
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			rv = -EBUSY;
+			goto out;
+		}
+
+		switch (lkb->lkb_wait_type) {
+		case DLM_MSG_LOOKUP:
+		case DLM_MSG_REQUEST:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			rv = -EBUSY;
+			goto out;
+		case DLM_MSG_UNLOCK:
+			goto out;
+		}
+		/* add_to_waiters() will set OVERLAP_UNLOCK */
+		goto out_ok;
+	}
+
+	/* normal unlock not allowed if there's any op in progress */
+	rv = -EBUSY;
+	if (lkb->lkb_wait_type || lkb->lkb_wait_count)
+		goto out;
+
+ out_ok:
+	/* an overlapping op shouldn't blow away exflags from other op */
+	lkb->lkb_exflags |= args->flags;
+	lkb->lkb_sbflags = 0;
+	lkb->lkb_astparam = args->astparam;
+	rv = 0;
+ out:
+	if (rv)
+		log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
+			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
+			  args->flags, lkb->lkb_wait_type,
+			  lkb->lkb_resource->res_name);
+	return rv;
+}
+
+/*
+ * Four stage 4 varieties:
+ * do_request(), do_convert(), do_unlock(), do_cancel()
+ * These are called on the master node for the given lock and
+ * from the central locking logic.
+ */
+
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error = 0;
+
+	if (can_be_granted(r, lkb, 1, 0, NULL)) {
+		grant_lock(r, lkb);
+		queue_cast(r, lkb, 0);
+		goto out;
+	}
+
+	if (can_be_queued(lkb)) {
+		error = -EINPROGRESS;
+		add_lkb(r, lkb, DLM_LKSTS_WAITING);
+		add_timeout(lkb);
+		goto out;
+	}
+
+	error = -EAGAIN;
+	queue_cast(r, lkb, -EAGAIN);
+ out:
+	return error;
+}
+
+static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error = 0;
+	int deadlk = 0;
+
+	/* changing an existing lock may allow others to be granted */
+
+	if (can_be_granted(r, lkb, 1, 0, &deadlk)) {
+		grant_lock(r, lkb);
+		queue_cast(r, lkb, 0);
+		goto out;
+	}
+
+	/* can_be_granted() detected that this lock would block in a conversion
+	   deadlock, so we leave it on the granted queue and return EDEADLK in
+	   the ast for the convert. */
+
+	if (deadlk) {
+		/* it's left on the granted queue */
+		revert_lock(r, lkb);
+		queue_cast(r, lkb, -EDEADLK);
+		error = -EDEADLK;
+		goto out;
+	}
+
+	/* is_demoted() means the can_be_granted() above set the grmode
+	   to NL, and left us on the granted queue.  This auto-demotion
+	   (due to CONVDEADLK) might mean other locks, and/or this lock, are
+	   now grantable.  We have to try to grant other converting locks
+	   before we try again to grant this one. */
+
+	if (is_demoted(lkb)) {
+		grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL);
+		if (_can_be_granted(r, lkb, 1, 0)) {
+			grant_lock(r, lkb);
+			queue_cast(r, lkb, 0);
+			goto out;
+		}
+		/* else fall through and move to convert queue */
+	}
+
+	if (can_be_queued(lkb)) {
+		error = -EINPROGRESS;
+		del_lkb(r, lkb);
+		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+		add_timeout(lkb);
+		goto out;
+	}
+
+	error = -EAGAIN;
+	queue_cast(r, lkb, -EAGAIN);
+ out:
+	return error;
+}
+
+static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case 0:
+		grant_pending_locks(r, NULL);
+		/* grant_pending_locks also sends basts */
+		break;
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	remove_lock(r, lkb);
+	queue_cast(r, lkb, -DLM_EUNLOCK);
+	return -DLM_EUNLOCK;
+}
+
+static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	grant_pending_locks(r, NULL);
+}
+
+/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
+
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	error = revert_lock(r, lkb);
+	if (error) {
+		queue_cast(r, lkb, -DLM_ECANCEL);
+		return -DLM_ECANCEL;
+	}
+	return 0;
+}
+
+static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	if (error)
+		grant_pending_locks(r, NULL);
+}
+
+/*
+ * Four stage 3 varieties:
+ * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+ */
+
+/* add a new lkb to a possibly new rsb, called by requesting process */
+
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	/* set_master: sets lkb nodeid from r */
+
+	error = set_master(r, lkb);
+	if (error < 0)
+		goto out;
+	if (error) {
+		error = 0;
+		goto out;
+	}
+
+	if (is_remote(r)) {
+		/* receive_request() calls do_request() on remote node */
+		error = send_request(r, lkb);
+	} else {
+		error = do_request(r, lkb);
+		/* for remote locks the request_reply is sent
+		   between do_request and do_request_effects */
+		do_request_effects(r, lkb, error);
+	}
+ out:
+	return error;
+}
+
+/* change some property of an existing lkb, e.g. mode */
+
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	if (is_remote(r)) {
+		/* receive_convert() calls do_convert() on remote node */
+		error = send_convert(r, lkb);
+	} else {
+		error = do_convert(r, lkb);
+		/* for remote locks the convert_reply is sent
+		   between do_convert and do_convert_effects */
+		do_convert_effects(r, lkb, error);
+	}
+
+	return error;
+}
+
+/* remove an existing lkb from the granted queue */
+
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	if (is_remote(r)) {
+		/* receive_unlock() calls do_unlock() on remote node */
+		error = send_unlock(r, lkb);
+	} else {
+		error = do_unlock(r, lkb);
+		/* for remote locks the unlock_reply is sent
+		   between do_unlock and do_unlock_effects */
+		do_unlock_effects(r, lkb, error);
+	}
+
+	return error;
+}
+
+/* remove an existing lkb from the convert or wait queue */
+
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	if (is_remote(r)) {
+		/* receive_cancel() calls do_cancel() on remote node */
+		error = send_cancel(r, lkb);
+	} else {
+		error = do_cancel(r, lkb);
+		/* for remote locks the cancel_reply is sent
+		   between do_cancel and do_cancel_effects */
+		do_cancel_effects(r, lkb, error);
+	}
+
+	return error;
+}
+
+/*
+ * Four stage 2 varieties:
+ * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
+ */
+
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
+			int len, struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	error = validate_lock_args(ls, lkb, args);
+	if (error)
+		return error;
+
+	error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
+	if (error)
+		return error;
+
+	lock_rsb(r);
+
+	attach_lkb(r, lkb);
+	lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
+
+	error = _request_lock(r, lkb);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_lock_args(ls, lkb, args);
+	if (error)
+		goto out;
+
+	error = _convert_lock(r, lkb);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_unlock_args(lkb, args);
+	if (error)
+		goto out;
+
+	error = _unlock_lock(r, lkb);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_unlock_args(lkb, args);
+	if (error)
+		goto out;
+
+	error = _cancel_lock(r, lkb);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+/*
+ * Two stage 1 varieties:  dlm_lock() and dlm_unlock()
+ */
+
+int dlm_lock(dlm_lockspace_t *lockspace,
+	     int mode,
+	     struct dlm_lksb *lksb,
+	     uint32_t flags,
+	     void *name,
+	     unsigned int namelen,
+	     uint32_t parent_lkid,
+	     void (*ast) (void *astarg),
+	     void *astarg,
+	     void (*bast) (void *astarg, int mode))
+{
+	struct dlm_ls *ls;
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error, convert = flags & DLM_LKF_CONVERT;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	dlm_lock_recovery(ls);
+
+	if (convert)
+		error = find_lkb(ls, lksb->sb_lkid, &lkb);
+	else
+		error = create_lkb(ls, &lkb);
+
+	if (error)
+		goto out;
+
+	error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
+			      astarg, bast, &args);
+	if (error)
+		goto out_put;
+
+	if (convert)
+		error = convert_lock(ls, lkb, &args);
+	else
+		error = request_lock(ls, lkb, name, namelen, &args);
+
+	if (error == -EINPROGRESS)
+		error = 0;
+ out_put:
+	if (convert || error)
+		__put_lkb(ls, lkb);
+	if (error == -EAGAIN || error == -EDEADLK)
+		error = 0;
+ out:
+	dlm_unlock_recovery(ls);
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+int dlm_unlock(dlm_lockspace_t *lockspace,
+	       uint32_t lkid,
+	       uint32_t flags,
+	       struct dlm_lksb *lksb,
+	       void *astarg)
+{
+	struct dlm_ls *ls;
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	dlm_lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	error = set_unlock_args(flags, astarg, &args);
+	if (error)
+		goto out_put;
+
+	if (flags & DLM_LKF_CANCEL)
+		error = cancel_lock(ls, lkb, &args);
+	else
+		error = unlock_lock(ls, lkb, &args);
+
+	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
+		error = 0;
+	if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
+		error = 0;
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	dlm_unlock_recovery(ls);
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+/*
+ * send/receive routines for remote operations and replies
+ *
+ * send_args
+ * send_common
+ * send_request			receive_request
+ * send_convert			receive_convert
+ * send_unlock			receive_unlock
+ * send_cancel			receive_cancel
+ * send_grant			receive_grant
+ * send_bast			receive_bast
+ * send_lookup			receive_lookup
+ * send_remove			receive_remove
+ *
+ * 				send_common_reply
+ * receive_request_reply	send_request_reply
+ * receive_convert_reply	send_convert_reply
+ * receive_unlock_reply		send_unlock_reply
+ * receive_cancel_reply		send_cancel_reply
+ * receive_lookup_reply		send_lookup_reply
+ */
+
+static int _create_message(struct dlm_ls *ls, int mb_len,
+			   int to_nodeid, int mstype,
+			   struct dlm_message **ms_ret,
+			   struct dlm_mhandle **mh_ret)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	char *mb;
+
+	/* get_buffer gives us a message handle (mh) that we need to
+	   pass into lowcomms_commit and a message buffer (mb) that we
+	   write our data into */
+
+	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
+	if (!mh)
+		return -ENOBUFS;
+
+	memset(mb, 0, mb_len);
+
+	ms = (struct dlm_message *) mb;
+
+	ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	ms->m_header.h_lockspace = ls->ls_global_id;
+	ms->m_header.h_nodeid = dlm_our_nodeid();
+	ms->m_header.h_length = mb_len;
+	ms->m_header.h_cmd = DLM_MSG;
+
+	ms->m_type = mstype;
+
+	*mh_ret = mh;
+	*ms_ret = ms;
+	return 0;
+}
+
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			  int to_nodeid, int mstype,
+			  struct dlm_message **ms_ret,
+			  struct dlm_mhandle **mh_ret)
+{
+	int mb_len = sizeof(struct dlm_message);
+
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+	case DLM_MSG_REMOVE:
+		mb_len += r->res_length;
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (lkb && lkb->lkb_lvbptr)
+			mb_len += r->res_ls->ls_lvblen;
+		break;
+	}
+
+	return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
+			       ms_ret, mh_ret);
+}
+
+/* further lowcomms enhancements or alternate implementations may make
+   the return value from this function useful at some point */
+
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+{
+	dlm_message_out(ms);
+	dlm_lowcomms_commit_buffer(mh);
+	return 0;
+}
+
+static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
+		      struct dlm_message *ms)
+{
+	ms->m_nodeid   = lkb->lkb_nodeid;
+	ms->m_pid      = lkb->lkb_ownpid;
+	ms->m_lkid     = lkb->lkb_id;
+	ms->m_remid    = lkb->lkb_remid;
+	ms->m_exflags  = lkb->lkb_exflags;
+	ms->m_sbflags  = lkb->lkb_sbflags;
+	ms->m_flags    = lkb->lkb_flags;
+	ms->m_lvbseq   = lkb->lkb_lvbseq;
+	ms->m_status   = lkb->lkb_status;
+	ms->m_grmode   = lkb->lkb_grmode;
+	ms->m_rqmode   = lkb->lkb_rqmode;
+	ms->m_hash     = r->res_hash;
+
+	/* m_result and m_bastmode are set from function args,
+	   not from lkb fields */
+
+	if (lkb->lkb_bastfn)
+		ms->m_asts |= DLM_CB_BAST;
+	if (lkb->lkb_astfn)
+		ms->m_asts |= DLM_CB_CAST;
+
+	/* compare with switch in create_message; send_remove() doesn't
+	   use send_args() */
+
+	switch (ms->m_type) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+		memcpy(ms->m_extra, r->res_name, r->res_length);
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (!lkb->lkb_lvbptr)
+			break;
+		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+		break;
+	}
+}
+
+static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = r->res_nodeid;
+
+	error = add_to_waiters(lkb, mstype, to_nodeid);
+	if (error)
+		return error;
+
+	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+	if (error)
+		goto fail;
+
+	send_args(r, lkb, ms);
+
+	error = send_message(mh, ms);
+	if (error)
+		goto fail;
+	return 0;
+
+ fail:
+	remove_from_waiters(lkb, msg_reply_type(mstype));
+	return error;
+}
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	return send_common(r, lkb, DLM_MSG_REQUEST);
+}
+
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	error = send_common(r, lkb, DLM_MSG_CONVERT);
+
+	/* down conversions go without a reply from the master */
+	if (!error && down_conversion(lkb)) {
+		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+		r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
+		r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
+		r->res_ls->ls_stub_ms.m_result = 0;
+		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
+	}
+
+	return error;
+}
+
+/* FIXME: if this lkb is the only lock we hold on the rsb, then set
+   MASTER_UNCERTAIN to force the next request on the rsb to confirm
+   that the master is still correct. */
+
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	return send_common(r, lkb, DLM_MSG_UNLOCK);
+}
+
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	return send_common(r, lkb, DLM_MSG_CANCEL);
+}
+
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = lkb->lkb_nodeid;
+
+	error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
+	if (error)
+		goto out;
+
+	send_args(r, lkb, ms);
+
+	ms->m_result = 0;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = lkb->lkb_nodeid;
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
+	if (error)
+		goto out;
+
+	send_args(r, lkb, ms);
+
+	ms->m_bastmode = mode;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = dlm_dir_nodeid(r);
+
+	error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
+	if (error)
+		return error;
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+	if (error)
+		goto fail;
+
+	send_args(r, lkb, ms);
+
+	error = send_message(mh, ms);
+	if (error)
+		goto fail;
+	return 0;
+
+ fail:
+	remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
+	return error;
+}
+
+static int send_remove(struct dlm_rsb *r)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = dlm_dir_nodeid(r);
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+	if (error)
+		goto out;
+
+	memcpy(ms->m_extra, r->res_name, r->res_length);
+	ms->m_hash = r->res_hash;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			     int mstype, int rv)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = lkb->lkb_nodeid;
+
+	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+	if (error)
+		goto out;
+
+	send_args(r, lkb, ms);
+
+	ms->m_result = rv;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
+}
+
+static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
+}
+
+static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
+}
+
+static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
+}
+
+static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
+			     int ret_nodeid, int rv)
+{
+	struct dlm_rsb *r = &ls->ls_stub_rsb;
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int error, nodeid = ms_in->m_header.h_nodeid;
+
+	error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
+	if (error)
+		goto out;
+
+	ms->m_lkid = ms_in->m_lkid;
+	ms->m_result = rv;
+	ms->m_nodeid = ret_nodeid;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+/* which args we save from a received message depends heavily on the type
+   of message, unlike the send side where we can safely send everything about
+   the lkb for any type of message */
+
+static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	lkb->lkb_exflags = ms->m_exflags;
+	lkb->lkb_sbflags = ms->m_sbflags;
+	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+		         (ms->m_flags & 0x0000FFFF);
+}
+
+static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	if (ms->m_flags == DLM_IFL_STUB_MS)
+		return;
+
+	lkb->lkb_sbflags = ms->m_sbflags;
+	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+		         (ms->m_flags & 0x0000FFFF);
+}
+
+static int receive_extralen(struct dlm_message *ms)
+{
+	return (ms->m_header.h_length - sizeof(struct dlm_message));
+}
+
+static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_message *ms)
+{
+	int len;
+
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		if (!lkb->lkb_lvbptr)
+			lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+		len = receive_extralen(ms);
+		if (len > ls->ls_lvblen)
+			len = ls->ls_lvblen;
+		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+	}
+	return 0;
+}
+
+static void fake_bastfn(void *astparam, int mode)
+{
+	log_print("fake_bastfn should not be called");
+}
+
+static void fake_astfn(void *astparam)
+{
+	log_print("fake_astfn should not be called");
+}
+
+static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				struct dlm_message *ms)
+{
+	lkb->lkb_nodeid = ms->m_header.h_nodeid;
+	lkb->lkb_ownpid = ms->m_pid;
+	lkb->lkb_remid = ms->m_lkid;
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	lkb->lkb_rqmode = ms->m_rqmode;
+
+	lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
+	lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
+
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		/* lkb was just created so there won't be an lvb yet */
+		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				struct dlm_message *ms)
+{
+	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+		return -EBUSY;
+
+	if (receive_lvb(ls, lkb, ms))
+		return -ENOMEM;
+
+	lkb->lkb_rqmode = ms->m_rqmode;
+	lkb->lkb_lvbseq = ms->m_lvbseq;
+
+	return 0;
+}
+
+static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			       struct dlm_message *ms)
+{
+	if (receive_lvb(ls, lkb, ms))
+		return -ENOMEM;
+	return 0;
+}
+
+/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
+   uses to send a reply and that the remote end uses to process the reply. */
+
+static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb = &ls->ls_stub_lkb;
+	lkb->lkb_nodeid = ms->m_header.h_nodeid;
+	lkb->lkb_remid = ms->m_lkid;
+}
+
+/* This is called after the rsb is locked so that we can safely inspect
+   fields in the lkb. */
+
+static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	int from = ms->m_header.h_nodeid;
+	int error = 0;
+
+	switch (ms->m_type) {
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_CANCEL:
+		if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
+			error = -EINVAL;
+		break;
+
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_UNLOCK_REPLY:
+	case DLM_MSG_CANCEL_REPLY:
+	case DLM_MSG_GRANT:
+	case DLM_MSG_BAST:
+		if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
+			error = -EINVAL;
+		break;
+
+	case DLM_MSG_REQUEST_REPLY:
+		if (!is_process_copy(lkb))
+			error = -EINVAL;
+		else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
+			error = -EINVAL;
+		break;
+
+	default:
+		error = -EINVAL;
+	}
+
+	if (error)
+		log_error(lkb->lkb_resource->res_ls,
+			  "ignore invalid message %d from %d %x %x %x %d",
+			  ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
+			  lkb->lkb_flags, lkb->lkb_nodeid);
+	return error;
+}
+
+static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
+{
+	char name[DLM_RESNAME_MAXLEN + 1];
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	struct dlm_rsb *r;
+	uint32_t hash, b;
+	int rv, dir_nodeid;
+
+	memset(name, 0, sizeof(name));
+	memcpy(name, ms_name, len);
+
+	hash = jhash(name, len, 0);
+	b = hash & (ls->ls_rsbtbl_size - 1);
+
+	dir_nodeid = dlm_hash2nodeid(ls, hash);
+
+	log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name);
+
+	spin_lock(&ls->ls_rsbtbl[b].lock);
+	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+	if (!rv) {
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		log_error(ls, "repeat_remove on keep %s", name);
+		return;
+	}
+
+	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+	if (!rv) {
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		log_error(ls, "repeat_remove on toss %s", name);
+		return;
+	}
+
+	/* use ls->remove_name2 to avoid conflict with shrink? */
+
+	spin_lock(&ls->ls_remove_spin);
+	ls->ls_remove_len = len;
+	memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
+	spin_unlock(&ls->ls_remove_spin);
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
+
+	rv = _create_message(ls, sizeof(struct dlm_message) + len,
+			     dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+	if (rv)
+		return;
+
+	memcpy(ms->m_extra, name, len);
+	ms->m_hash = hash;
+
+	send_message(mh, ms);
+
+	spin_lock(&ls->ls_remove_spin);
+	ls->ls_remove_len = 0;
+	memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
+	spin_unlock(&ls->ls_remove_spin);
+}
+
+static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int from_nodeid;
+	int error, namelen = 0;
+
+	from_nodeid = ms->m_header.h_nodeid;
+
+	error = create_lkb(ls, &lkb);
+	if (error)
+		goto fail;
+
+	receive_flags(lkb, ms);
+	lkb->lkb_flags |= DLM_IFL_MSTCPY;
+	error = receive_request_args(ls, lkb, ms);
+	if (error) {
+		__put_lkb(ls, lkb);
+		goto fail;
+	}
+
+	/* The dir node is the authority on whether we are the master
+	   for this rsb or not, so if the master sends us a request, we should
+	   recreate the rsb if we've destroyed it.   This race happens when we
+	   send a remove message to the dir node at the same time that the dir
+	   node sends us a request for the rsb. */
+
+	namelen = receive_extralen(ms);
+
+	error = find_rsb(ls, ms->m_extra, namelen, from_nodeid,
+			 R_RECEIVE_REQUEST, &r);
+	if (error) {
+		__put_lkb(ls, lkb);
+		goto fail;
+	}
+
+	lock_rsb(r);
+
+	if (r->res_master_nodeid != dlm_our_nodeid()) {
+		error = validate_master_nodeid(ls, r, from_nodeid);
+		if (error) {
+			unlock_rsb(r);
+			put_rsb(r);
+			__put_lkb(ls, lkb);
+			goto fail;
+		}
+	}
+
+	attach_lkb(r, lkb);
+	error = do_request(r, lkb);
+	send_request_reply(r, lkb, error);
+	do_request_effects(r, lkb, error);
+
+	unlock_rsb(r);
+	put_rsb(r);
+
+	if (error == -EINPROGRESS)
+		error = 0;
+	if (error)
+		dlm_put_lkb(lkb);
+	return 0;
+
+ fail:
+	/* TODO: instead of returning ENOTBLK, add the lkb to res_lookup
+	   and do this receive_request again from process_lookup_list once
+	   we get the lookup reply.  This would avoid a many repeated
+	   ENOTBLK request failures when the lookup reply designating us
+	   as master is delayed. */
+
+	/* We could repeatedly return -EBADR here if our send_remove() is
+	   delayed in being sent/arriving/being processed on the dir node.
+	   Another node would repeatedly lookup up the master, and the dir
+	   node would continue returning our nodeid until our send_remove
+	   took effect.
+
+	   We send another remove message in case our previous send_remove
+	   was lost/ignored/missed somehow. */
+
+	if (error != -ENOTBLK) {
+		log_limit(ls, "receive_request %x from %d %d",
+			  ms->m_lkid, from_nodeid, error);
+	}
+
+	if (namelen && error == -EBADR) {
+		send_repeat_remove(ls, ms->m_extra, namelen);
+		msleep(1000);
+	}
+
+	setup_stub_lkb(ls, ms);
+	send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+	return error;
+}
+
+static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error, reply = 1;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		goto fail;
+
+	if (lkb->lkb_remid != ms->m_lkid) {
+		log_error(ls, "receive_convert %x remid %x recover_seq %llu "
+			  "remote %d %x", lkb->lkb_id, lkb->lkb_remid,
+			  (unsigned long long)lkb->lkb_recover_seq,
+			  ms->m_header.h_nodeid, ms->m_lkid);
+		error = -ENOENT;
+		goto fail;
+	}
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	receive_flags(lkb, ms);
+
+	error = receive_convert_args(ls, lkb, ms);
+	if (error) {
+		send_convert_reply(r, lkb, error);
+		goto out;
+	}
+
+	reply = !down_conversion(lkb);
+
+	error = do_convert(r, lkb);
+	if (reply)
+		send_convert_reply(r, lkb, error);
+	do_convert_effects(r, lkb, error);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return 0;
+
+ fail:
+	setup_stub_lkb(ls, ms);
+	send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+	return error;
+}
+
+static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		goto fail;
+
+	if (lkb->lkb_remid != ms->m_lkid) {
+		log_error(ls, "receive_unlock %x remid %x remote %d %x",
+			  lkb->lkb_id, lkb->lkb_remid,
+			  ms->m_header.h_nodeid, ms->m_lkid);
+		error = -ENOENT;
+		goto fail;
+	}
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	receive_flags(lkb, ms);
+
+	error = receive_unlock_args(ls, lkb, ms);
+	if (error) {
+		send_unlock_reply(r, lkb, error);
+		goto out;
+	}
+
+	error = do_unlock(r, lkb);
+	send_unlock_reply(r, lkb, error);
+	do_unlock_effects(r, lkb, error);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return 0;
+
+ fail:
+	setup_stub_lkb(ls, ms);
+	send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+	return error;
+}
+
+static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		goto fail;
+
+	receive_flags(lkb, ms);
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	error = do_cancel(r, lkb);
+	send_cancel_reply(r, lkb, error);
+	do_cancel_effects(r, lkb, error);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return 0;
+
+ fail:
+	setup_stub_lkb(ls, ms);
+	send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+	return error;
+}
+
+static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		return error;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	receive_flags_reply(lkb, ms);
+	if (is_altmode(lkb))
+		munge_altmode(lkb, ms);
+	grant_lock_pc(r, lkb, ms);
+	queue_cast(r, lkb, 0);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		return error;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	queue_bast(r, lkb, ms->m_bastmode);
+	lkb->lkb_highbast = ms->m_bastmode;
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	int len, error, ret_nodeid, from_nodeid, our_nodeid;
+
+	from_nodeid = ms->m_header.h_nodeid;
+	our_nodeid = dlm_our_nodeid();
+
+	len = receive_extralen(ms);
+
+	error = dlm_master_lookup(ls, from_nodeid, ms->m_extra, len, 0,
+				  &ret_nodeid, NULL);
+
+	/* Optimization: we're master so treat lookup as a request */
+	if (!error && ret_nodeid == our_nodeid) {
+		receive_request(ls, ms);
+		return;
+	}
+	send_lookup_reply(ls, ms, ret_nodeid, error);
+}
+
+static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	char name[DLM_RESNAME_MAXLEN+1];
+	struct dlm_rsb *r;
+	uint32_t hash, b;
+	int rv, len, dir_nodeid, from_nodeid;
+
+	from_nodeid = ms->m_header.h_nodeid;
+
+	len = receive_extralen(ms);
+
+	if (len > DLM_RESNAME_MAXLEN) {
+		log_error(ls, "receive_remove from %d bad len %d",
+			  from_nodeid, len);
+		return;
+	}
+
+	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+	if (dir_nodeid != dlm_our_nodeid()) {
+		log_error(ls, "receive_remove from %d bad nodeid %d",
+			  from_nodeid, dir_nodeid);
+		return;
+	}
+
+	/* Look for name on rsbtbl.toss, if it's there, kill it.
+	   If it's on rsbtbl.keep, it's being used, and we should ignore this
+	   message.  This is an expected race between the dir node sending a
+	   request to the master node at the same time as the master node sends
+	   a remove to the dir node.  The resolution to that race is for the
+	   dir node to ignore the remove message, and the master node to
+	   recreate the master rsb when it gets a request from the dir node for
+	   an rsb it doesn't have. */
+
+	memset(name, 0, sizeof(name));
+	memcpy(name, ms->m_extra, len);
+
+	hash = jhash(name, len, 0);
+	b = hash & (ls->ls_rsbtbl_size - 1);
+
+	spin_lock(&ls->ls_rsbtbl[b].lock);
+
+	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+	if (rv) {
+		/* verify the rsb is on keep list per comment above */
+		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+		if (rv) {
+			/* should not happen */
+			log_error(ls, "receive_remove from %d not found %s",
+				  from_nodeid, name);
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
+			return;
+		}
+		if (r->res_master_nodeid != from_nodeid) {
+			/* should not happen */
+			log_error(ls, "receive_remove keep from %d master %d",
+				  from_nodeid, r->res_master_nodeid);
+			dlm_print_rsb(r);
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
+			return;
+		}
+
+		log_debug(ls, "receive_remove from %d master %d first %x %s",
+			  from_nodeid, r->res_master_nodeid, r->res_first_lkid,
+			  name);
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		return;
+	}
+
+	if (r->res_master_nodeid != from_nodeid) {
+		log_error(ls, "receive_remove toss from %d master %d",
+			  from_nodeid, r->res_master_nodeid);
+		dlm_print_rsb(r);
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		return;
+	}
+
+	if (kref_put(&r->res_ref, kill_rsb)) {
+		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		dlm_free_rsb(r);
+	} else {
+		log_error(ls, "receive_remove from %d rsb ref error",
+			  from_nodeid);
+		dlm_print_rsb(r);
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+	}
+}
+
+static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	do_purge(ls, ms->m_nodeid, ms->m_pid);
+}
+
+static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error, mstype, result;
+	int from_nodeid = ms->m_header.h_nodeid;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		return error;
+
+	r = lkb->lkb_resource;
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	mstype = lkb->lkb_wait_type;
+	error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
+	if (error) {
+		log_error(ls, "receive_request_reply %x remote %d %x result %d",
+			  lkb->lkb_id, from_nodeid, ms->m_lkid, ms->m_result);
+		dlm_dump_rsb(r);
+		goto out;
+	}
+
+	/* Optimization: the dir node was also the master, so it took our
+	   lookup as a request and sent request reply instead of lookup reply */
+	if (mstype == DLM_MSG_LOOKUP) {
+		r->res_master_nodeid = from_nodeid;
+		r->res_nodeid = from_nodeid;
+		lkb->lkb_nodeid = from_nodeid;
+	}
+
+	/* this is the value returned from do_request() on the master */
+	result = ms->m_result;
+
+	switch (result) {
+	case -EAGAIN:
+		/* request would block (be queued) on remote master */
+		queue_cast(r, lkb, -EAGAIN);
+		confirm_master(r, -EAGAIN);
+		unhold_lkb(lkb); /* undoes create_lkb() */
+		break;
+
+	case -EINPROGRESS:
+	case 0:
+		/* request was queued or granted on remote master */
+		receive_flags_reply(lkb, ms);
+		lkb->lkb_remid = ms->m_lkid;
+		if (is_altmode(lkb))
+			munge_altmode(lkb, ms);
+		if (result) {
+			add_lkb(r, lkb, DLM_LKSTS_WAITING);
+			add_timeout(lkb);
+		} else {
+			grant_lock_pc(r, lkb, ms);
+			queue_cast(r, lkb, 0);
+		}
+		confirm_master(r, result);
+		break;
+
+	case -EBADR:
+	case -ENOTBLK:
+		/* find_rsb failed to find rsb or rsb wasn't master */
+		log_limit(ls, "receive_request_reply %x from %d %d "
+			  "master %d dir %d first %x %s", lkb->lkb_id,
+			  from_nodeid, result, r->res_master_nodeid,
+			  r->res_dir_nodeid, r->res_first_lkid, r->res_name);
+
+		if (r->res_dir_nodeid != dlm_our_nodeid() &&
+		    r->res_master_nodeid != dlm_our_nodeid()) {
+			/* cause _request_lock->set_master->send_lookup */
+			r->res_master_nodeid = 0;
+			r->res_nodeid = -1;
+			lkb->lkb_nodeid = -1;
+		}
+
+		if (is_overlap(lkb)) {
+			/* we'll ignore error in cancel/unlock reply */
+			queue_cast_overlap(r, lkb);
+			confirm_master(r, result);
+			unhold_lkb(lkb); /* undoes create_lkb() */
+		} else {
+			_request_lock(r, lkb);
+
+			if (r->res_master_nodeid == dlm_our_nodeid())
+				confirm_master(r, 0);
+		}
+		break;
+
+	default:
+		log_error(ls, "receive_request_reply %x error %d",
+			  lkb->lkb_id, result);
+	}
+
+	if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) {
+		log_debug(ls, "receive_request_reply %x result %d unlock",
+			  lkb->lkb_id, result);
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		send_unlock(r, lkb);
+	} else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) {
+		log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		send_cancel(r, lkb);
+	} else {
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+	}
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+				    struct dlm_message *ms)
+{
+	/* this is the value returned from do_convert() on the master */
+	switch (ms->m_result) {
+	case -EAGAIN:
+		/* convert would block (be queued) on remote master */
+		queue_cast(r, lkb, -EAGAIN);
+		break;
+
+	case -EDEADLK:
+		receive_flags_reply(lkb, ms);
+		revert_lock_pc(r, lkb);
+		queue_cast(r, lkb, -EDEADLK);
+		break;
+
+	case -EINPROGRESS:
+		/* convert was queued on remote master */
+		receive_flags_reply(lkb, ms);
+		if (is_demoted(lkb))
+			munge_demoted(lkb);
+		del_lkb(r, lkb);
+		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+		add_timeout(lkb);
+		break;
+
+	case 0:
+		/* convert was granted on remote master */
+		receive_flags_reply(lkb, ms);
+		if (is_demoted(lkb))
+			munge_demoted(lkb);
+		grant_lock_pc(r, lkb, ms);
+		queue_cast(r, lkb, 0);
+		break;
+
+	default:
+		log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d",
+			  lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid,
+			  ms->m_result);
+		dlm_print_rsb(r);
+		dlm_print_lkb(lkb);
+	}
+}
+
+static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int error;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
+
+	__receive_convert_reply(r, lkb, ms);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+}
+
+static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		return error;
+
+	_receive_convert_reply(lkb, ms);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int error;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
+
+	/* this is the value returned from do_unlock() on the master */
+
+	switch (ms->m_result) {
+	case -DLM_EUNLOCK:
+		receive_flags_reply(lkb, ms);
+		remove_lock_pc(r, lkb);
+		queue_cast(r, lkb, -DLM_EUNLOCK);
+		break;
+	case -ENOENT:
+		break;
+	default:
+		log_error(r->res_ls, "receive_unlock_reply %x error %d",
+			  lkb->lkb_id, ms->m_result);
+	}
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+}
+
+static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		return error;
+
+	_receive_unlock_reply(lkb, ms);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int error;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
+
+	/* this is the value returned from do_cancel() on the master */
+
+	switch (ms->m_result) {
+	case -DLM_ECANCEL:
+		receive_flags_reply(lkb, ms);
+		revert_lock_pc(r, lkb);
+		queue_cast(r, lkb, -DLM_ECANCEL);
+		break;
+	case 0:
+		break;
+	default:
+		log_error(r->res_ls, "receive_cancel_reply %x error %d",
+			  lkb->lkb_id, ms->m_result);
+	}
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+}
+
+static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		return error;
+
+	_receive_cancel_reply(lkb, ms);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error, ret_nodeid;
+	int do_lookup_list = 0;
+
+	error = find_lkb(ls, ms->m_lkid, &lkb);
+	if (error) {
+		log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid);
+		return;
+	}
+
+	/* ms->m_result is the value returned by dlm_master_lookup on dir node
+	   FIXME: will a non-zero error ever be returned? */
+
+	r = lkb->lkb_resource;
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
+	if (error)
+		goto out;
+
+	ret_nodeid = ms->m_nodeid;
+
+	/* We sometimes receive a request from the dir node for this
+	   rsb before we've received the dir node's loookup_reply for it.
+	   The request from the dir node implies we're the master, so we set
+	   ourself as master in receive_request_reply, and verify here that
+	   we are indeed the master. */
+
+	if (r->res_master_nodeid && (r->res_master_nodeid != ret_nodeid)) {
+		/* This should never happen */
+		log_error(ls, "receive_lookup_reply %x from %d ret %d "
+			  "master %d dir %d our %d first %x %s",
+			  lkb->lkb_id, ms->m_header.h_nodeid, ret_nodeid,
+			  r->res_master_nodeid, r->res_dir_nodeid,
+			  dlm_our_nodeid(), r->res_first_lkid, r->res_name);
+	}
+
+	if (ret_nodeid == dlm_our_nodeid()) {
+		r->res_master_nodeid = ret_nodeid;
+		r->res_nodeid = 0;
+		do_lookup_list = 1;
+		r->res_first_lkid = 0;
+	} else if (ret_nodeid == -1) {
+		/* the remote node doesn't believe it's the dir node */
+		log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid",
+			  lkb->lkb_id, ms->m_header.h_nodeid);
+		r->res_master_nodeid = 0;
+		r->res_nodeid = -1;
+		lkb->lkb_nodeid = -1;
+	} else {
+		/* set_master() will set lkb_nodeid from r */
+		r->res_master_nodeid = ret_nodeid;
+		r->res_nodeid = ret_nodeid;
+	}
+
+	if (is_overlap(lkb)) {
+		log_debug(ls, "receive_lookup_reply %x unlock %x",
+			  lkb->lkb_id, lkb->lkb_flags);
+		queue_cast_overlap(r, lkb);
+		unhold_lkb(lkb); /* undoes create_lkb() */
+		goto out_list;
+	}
+
+	_request_lock(r, lkb);
+
+ out_list:
+	if (do_lookup_list)
+		process_lookup_list(r);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+}
+
+static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
+			     uint32_t saved_seq)
+{
+	int error = 0, noent = 0;
+
+	if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
+		log_limit(ls, "receive %d from non-member %d %x %x %d",
+			  ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
+			  ms->m_remid, ms->m_result);
+		return;
+	}
+
+	switch (ms->m_type) {
+
+	/* messages sent to a master node */
+
+	case DLM_MSG_REQUEST:
+		error = receive_request(ls, ms);
+		break;
+
+	case DLM_MSG_CONVERT:
+		error = receive_convert(ls, ms);
+		break;
+
+	case DLM_MSG_UNLOCK:
+		error = receive_unlock(ls, ms);
+		break;
+
+	case DLM_MSG_CANCEL:
+		noent = 1;
+		error = receive_cancel(ls, ms);
+		break;
+
+	/* messages sent from a master node (replies to above) */
+
+	case DLM_MSG_REQUEST_REPLY:
+		error = receive_request_reply(ls, ms);
+		break;
+
+	case DLM_MSG_CONVERT_REPLY:
+		error = receive_convert_reply(ls, ms);
+		break;
+
+	case DLM_MSG_UNLOCK_REPLY:
+		error = receive_unlock_reply(ls, ms);
+		break;
+
+	case DLM_MSG_CANCEL_REPLY:
+		error = receive_cancel_reply(ls, ms);
+		break;
+
+	/* messages sent from a master node (only two types of async msg) */
+
+	case DLM_MSG_GRANT:
+		noent = 1;
+		error = receive_grant(ls, ms);
+		break;
+
+	case DLM_MSG_BAST:
+		noent = 1;
+		error = receive_bast(ls, ms);
+		break;
+
+	/* messages sent to a dir node */
+
+	case DLM_MSG_LOOKUP:
+		receive_lookup(ls, ms);
+		break;
+
+	case DLM_MSG_REMOVE:
+		receive_remove(ls, ms);
+		break;
+
+	/* messages sent from a dir node (remove has no reply) */
+
+	case DLM_MSG_LOOKUP_REPLY:
+		receive_lookup_reply(ls, ms);
+		break;
+
+	/* other messages */
+
+	case DLM_MSG_PURGE:
+		receive_purge(ls, ms);
+		break;
+
+	default:
+		log_error(ls, "unknown message type %d", ms->m_type);
+	}
+
+	/*
+	 * When checking for ENOENT, we're checking the result of
+	 * find_lkb(m_remid):
+	 *
+	 * The lock id referenced in the message wasn't found.  This may
+	 * happen in normal usage for the async messages and cancel, so
+	 * only use log_debug for them.
+	 *
+	 * Some errors are expected and normal.
+	 */
+
+	if (error == -ENOENT && noent) {
+		log_debug(ls, "receive %d no %x remote %d %x saved_seq %u",
+			  ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
+			  ms->m_lkid, saved_seq);
+	} else if (error == -ENOENT) {
+		log_error(ls, "receive %d no %x remote %d %x saved_seq %u",
+			  ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
+			  ms->m_lkid, saved_seq);
+
+		if (ms->m_type == DLM_MSG_CONVERT)
+			dlm_dump_rsb_hash(ls, ms->m_hash);
+	}
+
+	if (error == -EINVAL) {
+		log_error(ls, "receive %d inval from %d lkid %x remid %x "
+			  "saved_seq %u",
+			  ms->m_type, ms->m_header.h_nodeid,
+			  ms->m_lkid, ms->m_remid, saved_seq);
+	}
+}
+
+/* If the lockspace is in recovery mode (locking stopped), then normal
+   messages are saved on the requestqueue for processing after recovery is
+   done.  When not in recovery mode, we wait for dlm_recoverd to drain saved
+   messages off the requestqueue before we process new ones. This occurs right
+   after recovery completes when we transition from saving all messages on
+   requestqueue, to processing all the saved messages, to processing new
+   messages as they arrive. */
+
+static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
+				int nodeid)
+{
+	if (dlm_locking_stopped(ls)) {
+		/* If we were a member of this lockspace, left, and rejoined,
+		   other nodes may still be sending us messages from the
+		   lockspace generation before we left. */
+		if (!ls->ls_generation) {
+			log_limit(ls, "receive %d from %d ignore old gen",
+				  ms->m_type, nodeid);
+			return;
+		}
+
+		dlm_add_requestqueue(ls, nodeid, ms);
+	} else {
+		dlm_wait_requestqueue(ls);
+		_receive_message(ls, ms, 0);
+	}
+}
+
+/* This is called by dlm_recoverd to process messages that were saved on
+   the requestqueue. */
+
+void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
+			       uint32_t saved_seq)
+{
+	_receive_message(ls, ms, saved_seq);
+}
+
+/* This is called by the midcomms layer when something is received for
+   the lockspace.  It could be either a MSG (normal message sent as part of
+   standard locking activity) or an RCOM (recovery message sent as part of
+   lockspace recovery). */
+
+void dlm_receive_buffer(union dlm_packet *p, int nodeid)
+{
+	struct dlm_header *hd = &p->header;
+	struct dlm_ls *ls;
+	int type = 0;
+
+	switch (hd->h_cmd) {
+	case DLM_MSG:
+		dlm_message_in(&p->message);
+		type = p->message.m_type;
+		break;
+	case DLM_RCOM:
+		dlm_rcom_in(&p->rcom);
+		type = p->rcom.rc_type;
+		break;
+	default:
+		log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
+		return;
+	}
+
+	if (hd->h_nodeid != nodeid) {
+		log_print("invalid h_nodeid %d from %d lockspace %x",
+			  hd->h_nodeid, nodeid, hd->h_lockspace);
+		return;
+	}
+
+	ls = dlm_find_lockspace_global(hd->h_lockspace);
+	if (!ls) {
+		if (dlm_config.ci_log_debug) {
+			printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
+				"%u from %d cmd %d type %d\n",
+				hd->h_lockspace, nodeid, hd->h_cmd, type);
+		}
+
+		if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
+			dlm_send_ls_not_ready(nodeid, &p->rcom);
+		return;
+	}
+
+	/* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to
+	   be inactive (in this ls) before transitioning to recovery mode */
+
+	down_read(&ls->ls_recv_active);
+	if (hd->h_cmd == DLM_MSG)
+		dlm_receive_message(ls, &p->message, nodeid);
+	else
+		dlm_receive_rcom(ls, &p->rcom, nodeid);
+	up_read(&ls->ls_recv_active);
+
+	dlm_put_lockspace(ls);
+}
+
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				   struct dlm_message *ms_stub)
+{
+	if (middle_conversion(lkb)) {
+		hold_lkb(lkb);
+		memset(ms_stub, 0, sizeof(struct dlm_message));
+		ms_stub->m_flags = DLM_IFL_STUB_MS;
+		ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
+		ms_stub->m_result = -EINPROGRESS;
+		ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+		_receive_convert_reply(lkb, ms_stub);
+
+		/* Same special case as in receive_rcom_lock_args() */
+		lkb->lkb_grmode = DLM_LOCK_IV;
+		rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
+		unhold_lkb(lkb);
+
+	} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
+		lkb->lkb_flags |= DLM_IFL_RESEND;
+	}
+
+	/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
+	   conversions are async; there's no reply from the remote master */
+}
+
+/* A waiting lkb needs recovery if the master node has failed, or
+   the master node is changing (only when no directory is used) */
+
+static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				 int dir_nodeid)
+{
+	if (dlm_no_directory(ls))
+		return 1;
+
+	if (dlm_is_removed(ls, lkb->lkb_wait_nodeid))
+		return 1;
+
+	return 0;
+}
+
+/* Recovery for locks that are waiting for replies from nodes that are now
+   gone.  We can just complete unlocks and cancels by faking a reply from the
+   dead node.  Requests and up-conversions we flag to be resent after
+   recovery.  Down-conversions can just be completed with a fake reply like
+   unlocks.  Conversions between PR and CW need special attention. */
+
+void dlm_recover_waiters_pre(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb, *safe;
+	struct dlm_message *ms_stub;
+	int wait_type, stub_unlock_result, stub_cancel_result;
+	int dir_nodeid;
+
+	ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
+	if (!ms_stub) {
+		log_error(ls, "dlm_recover_waiters_pre no mem");
+		return;
+	}
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+
+		dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource);
+
+		/* exclude debug messages about unlocks because there can be so
+		   many and they aren't very interesting */
+
+		if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
+			log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
+				  "lkb_nodeid %d wait_nodeid %d dir_nodeid %d",
+				  lkb->lkb_id,
+				  lkb->lkb_remid,
+				  lkb->lkb_wait_type,
+				  lkb->lkb_resource->res_nodeid,
+				  lkb->lkb_nodeid,
+				  lkb->lkb_wait_nodeid,
+				  dir_nodeid);
+		}
+
+		/* all outstanding lookups, regardless of destination  will be
+		   resent after recovery is done */
+
+		if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
+			lkb->lkb_flags |= DLM_IFL_RESEND;
+			continue;
+		}
+
+		if (!waiter_needs_recovery(ls, lkb, dir_nodeid))
+			continue;
+
+		wait_type = lkb->lkb_wait_type;
+		stub_unlock_result = -DLM_EUNLOCK;
+		stub_cancel_result = -DLM_ECANCEL;
+
+		/* Main reply may have been received leaving a zero wait_type,
+		   but a reply for the overlapping op may not have been
+		   received.  In that case we need to fake the appropriate
+		   reply for the overlap op. */
+
+		if (!wait_type) {
+			if (is_overlap_cancel(lkb)) {
+				wait_type = DLM_MSG_CANCEL;
+				if (lkb->lkb_grmode == DLM_LOCK_IV)
+					stub_cancel_result = 0;
+			}
+			if (is_overlap_unlock(lkb)) {
+				wait_type = DLM_MSG_UNLOCK;
+				if (lkb->lkb_grmode == DLM_LOCK_IV)
+					stub_unlock_result = -ENOENT;
+			}
+
+			log_debug(ls, "rwpre overlap %x %x %d %d %d",
+				  lkb->lkb_id, lkb->lkb_flags, wait_type,
+				  stub_cancel_result, stub_unlock_result);
+		}
+
+		switch (wait_type) {
+
+		case DLM_MSG_REQUEST:
+			lkb->lkb_flags |= DLM_IFL_RESEND;
+			break;
+
+		case DLM_MSG_CONVERT:
+			recover_convert_waiter(ls, lkb, ms_stub);
+			break;
+
+		case DLM_MSG_UNLOCK:
+			hold_lkb(lkb);
+			memset(ms_stub, 0, sizeof(struct dlm_message));
+			ms_stub->m_flags = DLM_IFL_STUB_MS;
+			ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
+			ms_stub->m_result = stub_unlock_result;
+			ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+			_receive_unlock_reply(lkb, ms_stub);
+			dlm_put_lkb(lkb);
+			break;
+
+		case DLM_MSG_CANCEL:
+			hold_lkb(lkb);
+			memset(ms_stub, 0, sizeof(struct dlm_message));
+			ms_stub->m_flags = DLM_IFL_STUB_MS;
+			ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
+			ms_stub->m_result = stub_cancel_result;
+			ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+			_receive_cancel_reply(lkb, ms_stub);
+			dlm_put_lkb(lkb);
+			break;
+
+		default:
+			log_error(ls, "invalid lkb wait_type %d %d",
+				  lkb->lkb_wait_type, wait_type);
+		}
+		schedule();
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+	kfree(ms_stub);
+}
+
+static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	int found = 0;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		if (lkb->lkb_flags & DLM_IFL_RESEND) {
+			hold_lkb(lkb);
+			found = 1;
+			break;
+		}
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	if (!found)
+		lkb = NULL;
+	return lkb;
+}
+
+/* Deal with lookups and lkb's marked RESEND from _pre.  We may now be the
+   master or dir-node for r.  Processing the lkb may result in it being placed
+   back on waiters. */
+
+/* We do this after normal locking has been enabled and any saved messages
+   (in requestqueue) have been processed.  We should be confident that at
+   this point we won't get or process a reply to any of these waiting
+   operations.  But, new ops may be coming in on the rsbs/locks here from
+   userspace or remotely. */
+
+/* there may have been an overlap unlock/cancel prior to recovery or after
+   recovery.  if before, the lkb may still have a pos wait_count; if after, the
+   overlap flag would just have been set and nothing new sent.  we can be
+   confident here than any replies to either the initial op or overlap ops
+   prior to recovery have been received. */
+
+int dlm_recover_waiters_post(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error = 0, mstype, err, oc, ou;
+
+	while (1) {
+		if (dlm_locking_stopped(ls)) {
+			log_debug(ls, "recover_waiters_post aborted");
+			error = -EINTR;
+			break;
+		}
+
+		lkb = find_resend_waiter(ls);
+		if (!lkb)
+			break;
+
+		r = lkb->lkb_resource;
+		hold_rsb(r);
+		lock_rsb(r);
+
+		mstype = lkb->lkb_wait_type;
+		oc = is_overlap_cancel(lkb);
+		ou = is_overlap_unlock(lkb);
+		err = 0;
+
+		log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
+			  "lkb_nodeid %d wait_nodeid %d dir_nodeid %d "
+			  "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype,
+			  r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
+			  dlm_dir_nodeid(r), oc, ou);
+
+		/* At this point we assume that we won't get a reply to any
+		   previous op or overlap op on this lock.  First, do a big
+		   remove_from_waiters() for all previous ops. */
+
+		lkb->lkb_flags &= ~DLM_IFL_RESEND;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_wait_type = 0;
+		lkb->lkb_wait_count = 0;
+		mutex_lock(&ls->ls_waiters_mutex);
+		list_del_init(&lkb->lkb_wait_reply);
+		mutex_unlock(&ls->ls_waiters_mutex);
+		unhold_lkb(lkb); /* for waiters list */
+
+		if (oc || ou) {
+			/* do an unlock or cancel instead of resending */
+			switch (mstype) {
+			case DLM_MSG_LOOKUP:
+			case DLM_MSG_REQUEST:
+				queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
+							-DLM_ECANCEL);
+				unhold_lkb(lkb); /* undoes create_lkb() */
+				break;
+			case DLM_MSG_CONVERT:
+				if (oc) {
+					queue_cast(r, lkb, -DLM_ECANCEL);
+				} else {
+					lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
+					_unlock_lock(r, lkb);
+				}
+				break;
+			default:
+				err = 1;
+			}
+		} else {
+			switch (mstype) {
+			case DLM_MSG_LOOKUP:
+			case DLM_MSG_REQUEST:
+				_request_lock(r, lkb);
+				if (is_master(r))
+					confirm_master(r, 0);
+				break;
+			case DLM_MSG_CONVERT:
+				_convert_lock(r, lkb);
+				break;
+			default:
+				err = 1;
+			}
+		}
+
+		if (err) {
+			log_error(ls, "waiter %x msg %d r_nodeid %d "
+				  "dir_nodeid %d overlap %d %d",
+				  lkb->lkb_id, mstype, r->res_nodeid,
+				  dlm_dir_nodeid(r), oc, ou);
+		}
+		unlock_rsb(r);
+		put_rsb(r);
+		dlm_put_lkb(lkb);
+	}
+
+	return error;
+}
+
+static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r,
+			      struct list_head *list)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
+		if (!is_master_copy(lkb))
+			continue;
+
+		/* don't purge lkbs we've added in recover_master_copy for
+		   the current recovery seq */
+
+		if (lkb->lkb_recover_seq == ls->ls_recover_seq)
+			continue;
+
+		del_lkb(r, lkb);
+
+		/* this put should free the lkb */
+		if (!dlm_put_lkb(lkb))
+			log_error(ls, "purged mstcpy lkb not released");
+	}
+}
+
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+
+	purge_mstcpy_list(ls, r, &r->res_grantqueue);
+	purge_mstcpy_list(ls, r, &r->res_convertqueue);
+	purge_mstcpy_list(ls, r, &r->res_waitqueue);
+}
+
+static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
+			    struct list_head *list,
+			    int nodeid_gone, unsigned int *count)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
+		if (!is_master_copy(lkb))
+			continue;
+
+		if ((lkb->lkb_nodeid == nodeid_gone) ||
+		    dlm_is_removed(ls, lkb->lkb_nodeid)) {
+
+			/* tell recover_lvb to invalidate the lvb
+			   because a node holding EX/PW failed */
+			if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
+			    (lkb->lkb_grmode >= DLM_LOCK_PW)) {
+				rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
+			}
+
+			del_lkb(r, lkb);
+
+			/* this put should free the lkb */
+			if (!dlm_put_lkb(lkb))
+				log_error(ls, "purged dead lkb not released");
+
+			rsb_set_flag(r, RSB_RECOVER_GRANT);
+
+			(*count)++;
+		}
+	}
+}
+
+/* Get rid of locks held by nodes that are gone. */
+
+void dlm_recover_purge(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	struct dlm_member *memb;
+	int nodes_count = 0;
+	int nodeid_gone = 0;
+	unsigned int lkb_count = 0;
+
+	/* cache one removed nodeid to optimize the common
+	   case of a single node removed */
+
+	list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+		nodes_count++;
+		nodeid_gone = memb->nodeid;
+	}
+
+	if (!nodes_count)
+		return;
+
+	down_write(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		hold_rsb(r);
+		lock_rsb(r);
+		if (is_master(r)) {
+			purge_dead_list(ls, r, &r->res_grantqueue,
+					nodeid_gone, &lkb_count);
+			purge_dead_list(ls, r, &r->res_convertqueue,
+					nodeid_gone, &lkb_count);
+			purge_dead_list(ls, r, &r->res_waitqueue,
+					nodeid_gone, &lkb_count);
+		}
+		unlock_rsb(r);
+		unhold_rsb(r);
+		cond_resched();
+	}
+	up_write(&ls->ls_root_sem);
+
+	if (lkb_count)
+		log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
+			  lkb_count, nodes_count);
+}
+
+static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket)
+{
+	struct rb_node *n;
+	struct dlm_rsb *r;
+
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
+	for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
+		r = rb_entry(n, struct dlm_rsb, res_hashnode);
+
+		if (!rsb_flag(r, RSB_RECOVER_GRANT))
+			continue;
+		if (!is_master(r)) {
+			rsb_clear_flag(r, RSB_RECOVER_GRANT);
+			continue;
+		}
+		hold_rsb(r);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+		return r;
+	}
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+	return NULL;
+}
+
+/*
+ * Attempt to grant locks on resources that we are the master of.
+ * Locks may have become grantable during recovery because locks
+ * from departed nodes have been purged (or not rebuilt), allowing
+ * previously blocked locks to now be granted.  The subset of rsb's
+ * we are interested in are those with lkb's on either the convert or
+ * waiting queues.
+ *
+ * Simplest would be to go through each master rsb and check for non-empty
+ * convert or waiting queues, and attempt to grant on those rsbs.
+ * Checking the queues requires lock_rsb, though, for which we'd need
+ * to release the rsbtbl lock.  This would make iterating through all
+ * rsb's very inefficient.  So, we rely on earlier recovery routines
+ * to set RECOVER_GRANT on any rsb's that we should attempt to grant
+ * locks for.
+ */
+
+void dlm_recover_grant(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int bucket = 0;
+	unsigned int count = 0;
+	unsigned int rsb_count = 0;
+	unsigned int lkb_count = 0;
+
+	while (1) {
+		r = find_grant_rsb(ls, bucket);
+		if (!r) {
+			if (bucket == ls->ls_rsbtbl_size - 1)
+				break;
+			bucket++;
+			continue;
+		}
+		rsb_count++;
+		count = 0;
+		lock_rsb(r);
+		/* the RECOVER_GRANT flag is checked in the grant path */
+		grant_pending_locks(r, &count);
+		rsb_clear_flag(r, RSB_RECOVER_GRANT);
+		lkb_count += count;
+		confirm_master(r, 0);
+		unlock_rsb(r);
+		put_rsb(r);
+		cond_resched();
+	}
+
+	if (lkb_count)
+		log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
+			  lkb_count, rsb_count);
+}
+
+static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
+					 uint32_t remid)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, head, lkb_statequeue) {
+		if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
+			return lkb;
+	}
+	return NULL;
+}
+
+static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
+				    uint32_t remid)
+{
+	struct dlm_lkb *lkb;
+
+	lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
+	if (lkb)
+		return lkb;
+	lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
+	if (lkb)
+		return lkb;
+	lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
+	if (lkb)
+		return lkb;
+	return NULL;
+}
+
+/* needs at least dlm_rcom + rcom_lock */
+static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				  struct dlm_rsb *r, struct dlm_rcom *rc)
+{
+	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+
+	lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+	lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
+	lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
+	lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
+	lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF;
+	lkb->lkb_flags |= DLM_IFL_MSTCPY;
+	lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq);
+	lkb->lkb_rqmode = rl->rl_rqmode;
+	lkb->lkb_grmode = rl->rl_grmode;
+	/* don't set lkb_status because add_lkb wants to itself */
+
+	lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
+	lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
+
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
+			 sizeof(struct rcom_lock);
+		if (lvblen > ls->ls_lvblen)
+			return -EINVAL;
+		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+		memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
+	}
+
+	/* Conversions between PR and CW (middle modes) need special handling.
+	   The real granted mode of these converting locks cannot be determined
+	   until all locks have been rebuilt on the rsb (recover_conversion) */
+
+	if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) &&
+	    middle_conversion(lkb)) {
+		rl->rl_status = DLM_LKSTS_CONVERT;
+		lkb->lkb_grmode = DLM_LOCK_IV;
+		rsb_set_flag(r, RSB_RECOVER_CONVERT);
+	}
+
+	return 0;
+}
+
+/* This lkb may have been recovered in a previous aborted recovery so we need
+   to check if the rsb already has an lkb with the given remote nodeid/lkid.
+   If so we just send back a standard reply.  If not, we create a new lkb with
+   the given values and send back our lkid.  We send back our lkid by sending
+   back the rcom_lock struct we got but with the remid field filled in. */
+
+/* needs at least dlm_rcom + rcom_lock */
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+	uint32_t remid = 0;
+	int from_nodeid = rc->rc_header.h_nodeid;
+	int error;
+
+	if (rl->rl_parent_lkid) {
+		error = -EOPNOTSUPP;
+		goto out;
+	}
+
+	remid = le32_to_cpu(rl->rl_lkid);
+
+	/* In general we expect the rsb returned to be R_MASTER, but we don't
+	   have to require it.  Recovery of masters on one node can overlap
+	   recovery of locks on another node, so one node can send us MSTCPY
+	   locks before we've made ourselves master of this rsb.  We can still
+	   add new MSTCPY locks that we receive here without any harm; when
+	   we make ourselves master, dlm_recover_masters() won't touch the
+	   MSTCPY locks we've received early. */
+
+	error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen),
+			 from_nodeid, R_RECEIVE_RECOVER, &r);
+	if (error)
+		goto out;
+
+	lock_rsb(r);
+
+	if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) {
+		log_error(ls, "dlm_recover_master_copy remote %d %x not dir",
+			  from_nodeid, remid);
+		error = -EBADR;
+		goto out_unlock;
+	}
+
+	lkb = search_remid(r, from_nodeid, remid);
+	if (lkb) {
+		error = -EEXIST;
+		goto out_remid;
+	}
+
+	error = create_lkb(ls, &lkb);
+	if (error)
+		goto out_unlock;
+
+	error = receive_rcom_lock_args(ls, lkb, r, rc);
+	if (error) {
+		__put_lkb(ls, lkb);
+		goto out_unlock;
+	}
+
+	attach_lkb(r, lkb);
+	add_lkb(r, lkb, rl->rl_status);
+	error = 0;
+	ls->ls_recover_locks_in++;
+
+	if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
+		rsb_set_flag(r, RSB_RECOVER_GRANT);
+
+ out_remid:
+	/* this is the new value returned to the lock holder for
+	   saving in its process-copy lkb */
+	rl->rl_remid = cpu_to_le32(lkb->lkb_id);
+
+	lkb->lkb_recover_seq = ls->ls_recover_seq;
+
+ out_unlock:
+	unlock_rsb(r);
+	put_rsb(r);
+ out:
+	if (error && error != -EEXIST)
+		log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
+			  from_nodeid, remid, error);
+	rl->rl_result = cpu_to_le32(error);
+	return error;
+}
+
+/* needs at least dlm_rcom + rcom_lock */
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+	uint32_t lkid, remid;
+	int error, result;
+
+	lkid = le32_to_cpu(rl->rl_lkid);
+	remid = le32_to_cpu(rl->rl_remid);
+	result = le32_to_cpu(rl->rl_result);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error) {
+		log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d",
+			  lkid, rc->rc_header.h_nodeid, remid, result);
+		return error;
+	}
+
+	r = lkb->lkb_resource;
+	hold_rsb(r);
+	lock_rsb(r);
+
+	if (!is_process_copy(lkb)) {
+		log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d",
+			  lkid, rc->rc_header.h_nodeid, remid, result);
+		dlm_dump_rsb(r);
+		unlock_rsb(r);
+		put_rsb(r);
+		dlm_put_lkb(lkb);
+		return -EINVAL;
+	}
+
+	switch (result) {
+	case -EBADR:
+		/* There's a chance the new master received our lock before
+		   dlm_recover_master_reply(), this wouldn't happen if we did
+		   a barrier between recover_masters and recover_locks. */
+
+		log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d",
+			  lkid, rc->rc_header.h_nodeid, remid, result);
+	
+		dlm_send_rcom_lock(r, lkb);
+		goto out;
+	case -EEXIST:
+	case 0:
+		lkb->lkb_remid = remid;
+		break;
+	default:
+		log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk",
+			  lkid, rc->rc_header.h_nodeid, remid, result);
+	}
+
+	/* an ack for dlm_recover_locks() which waits for replies from
+	   all the locks it sends to new masters */
+	dlm_recovered_lock(r);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+
+	return 0;
+}
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+		     int mode, uint32_t flags, void *name, unsigned int namelen,
+		     unsigned long timeout_cs)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error;
+
+	dlm_lock_recovery(ls);
+
+	error = create_lkb(ls, &lkb);
+	if (error) {
+		kfree(ua);
+		goto out;
+	}
+
+	if (flags & DLM_LKF_VALBLK) {
+		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
+		if (!ua->lksb.sb_lvbptr) {
+			kfree(ua);
+			__put_lkb(ls, lkb);
+			error = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
+	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
+	   lock and that lkb_astparam is the dlm_user_args structure. */
+
+	error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
+			      fake_astfn, ua, fake_bastfn, &args);
+	lkb->lkb_flags |= DLM_IFL_USER;
+
+	if (error) {
+		__put_lkb(ls, lkb);
+		goto out;
+	}
+
+	error = request_lock(ls, lkb, name, namelen, &args);
+
+	switch (error) {
+	case 0:
+		break;
+	case -EINPROGRESS:
+		error = 0;
+		break;
+	case -EAGAIN:
+		error = 0;
+		/* fall through */
+	default:
+		__put_lkb(ls, lkb);
+		goto out;
+	}
+
+	/* add this new lkb to the per-process list of locks */
+	spin_lock(&ua->proc->locks_spin);
+	hold_lkb(lkb);
+	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+	spin_unlock(&ua->proc->locks_spin);
+ out:
+	dlm_unlock_recovery(ls);
+	return error;
+}
+
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
+		     unsigned long timeout_cs)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	dlm_lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	/* user can change the params on its lock when it converts it, or
+	   add an lvb that didn't exist before */
+
+	ua = lkb->lkb_ua;
+
+	if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
+		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
+		if (!ua->lksb.sb_lvbptr) {
+			error = -ENOMEM;
+			goto out_put;
+		}
+	}
+	if (lvb_in && ua->lksb.sb_lvbptr)
+		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+
+	ua->xid = ua_tmp->xid;
+	ua->castparam = ua_tmp->castparam;
+	ua->castaddr = ua_tmp->castaddr;
+	ua->bastparam = ua_tmp->bastparam;
+	ua->bastaddr = ua_tmp->bastaddr;
+	ua->user_lksb = ua_tmp->user_lksb;
+
+	error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
+			      fake_astfn, ua, fake_bastfn, &args);
+	if (error)
+		goto out_put;
+
+	error = convert_lock(ls, lkb, &args);
+
+	if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
+		error = 0;
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	dlm_unlock_recovery(ls);
+	kfree(ua_tmp);
+	return error;
+}
+
+/*
+ * The caller asks for an orphan lock on a given resource with a given mode.
+ * If a matching lock exists, it's moved to the owner's list of locks and
+ * the lkid is returned.
+ */
+
+int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		     int mode, uint32_t flags, void *name, unsigned int namelen,
+		     unsigned long timeout_cs, uint32_t *lkid)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_user_args *ua;
+	int found_other_mode = 0;
+	int found = 0;
+	int rv = 0;
+
+	mutex_lock(&ls->ls_orphans_mutex);
+	list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) {
+		if (lkb->lkb_resource->res_length != namelen)
+			continue;
+		if (memcmp(lkb->lkb_resource->res_name, name, namelen))
+			continue;
+		if (lkb->lkb_grmode != mode) {
+			found_other_mode = 1;
+			continue;
+		}
+
+		found = 1;
+		list_del_init(&lkb->lkb_ownqueue);
+		lkb->lkb_flags &= ~DLM_IFL_ORPHAN;
+		*lkid = lkb->lkb_id;
+		break;
+	}
+	mutex_unlock(&ls->ls_orphans_mutex);
+
+	if (!found && found_other_mode) {
+		rv = -EAGAIN;
+		goto out;
+	}
+
+	if (!found) {
+		rv = -ENOENT;
+		goto out;
+	}
+
+	lkb->lkb_exflags = flags;
+	lkb->lkb_ownpid = (int) current->pid;
+
+	ua = lkb->lkb_ua;
+
+	ua->proc = ua_tmp->proc;
+	ua->xid = ua_tmp->xid;
+	ua->castparam = ua_tmp->castparam;
+	ua->castaddr = ua_tmp->castaddr;
+	ua->bastparam = ua_tmp->bastparam;
+	ua->bastaddr = ua_tmp->bastaddr;
+	ua->user_lksb = ua_tmp->user_lksb;
+
+	/*
+	 * The lkb reference from the ls_orphans list was not
+	 * removed above, and is now considered the reference
+	 * for the proc locks list.
+	 */
+
+	spin_lock(&ua->proc->locks_spin);
+	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+	spin_unlock(&ua->proc->locks_spin);
+ out:
+	kfree(ua_tmp);
+	return rv;
+}
+
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		    uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	dlm_lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	ua = lkb->lkb_ua;
+
+	if (lvb_in && ua->lksb.sb_lvbptr)
+		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+	if (ua_tmp->castparam)
+		ua->castparam = ua_tmp->castparam;
+	ua->user_lksb = ua_tmp->user_lksb;
+
+	error = set_unlock_args(flags, ua, &args);
+	if (error)
+		goto out_put;
+
+	error = unlock_lock(ls, lkb, &args);
+
+	if (error == -DLM_EUNLOCK)
+		error = 0;
+	/* from validate_unlock_args() */
+	if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
+		error = 0;
+	if (error)
+		goto out_put;
+
+	spin_lock(&ua->proc->locks_spin);
+	/* dlm_user_add_cb() may have already taken lkb off the proc list */
+	if (!list_empty(&lkb->lkb_ownqueue))
+		list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
+	spin_unlock(&ua->proc->locks_spin);
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	dlm_unlock_recovery(ls);
+	kfree(ua_tmp);
+	return error;
+}
+
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		    uint32_t flags, uint32_t lkid)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	dlm_lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	ua = lkb->lkb_ua;
+	if (ua_tmp->castparam)
+		ua->castparam = ua_tmp->castparam;
+	ua->user_lksb = ua_tmp->user_lksb;
+
+	error = set_unlock_args(flags, ua, &args);
+	if (error)
+		goto out_put;
+
+	error = cancel_lock(ls, lkb, &args);
+
+	if (error == -DLM_ECANCEL)
+		error = 0;
+	/* from validate_unlock_args() */
+	if (error == -EBUSY)
+		error = 0;
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	dlm_unlock_recovery(ls);
+	kfree(ua_tmp);
+	return error;
+}
+
+int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	struct dlm_rsb *r;
+	int error;
+
+	dlm_lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	ua = lkb->lkb_ua;
+
+	error = set_unlock_args(flags, ua, &args);
+	if (error)
+		goto out_put;
+
+	/* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */
+
+	r = lkb->lkb_resource;
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_unlock_args(lkb, &args);
+	if (error)
+		goto out_r;
+	lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL;
+
+	error = _cancel_lock(r, lkb);
+ out_r:
+	unlock_rsb(r);
+	put_rsb(r);
+
+	if (error == -DLM_ECANCEL)
+		error = 0;
+	/* from validate_unlock_args() */
+	if (error == -EBUSY)
+		error = 0;
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	dlm_unlock_recovery(ls);
+	return error;
+}
+
+/* lkb's that are removed from the waiters list by revert are just left on the
+   orphans list with the granted orphan locks, to be freed by purge */
+
+static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	struct dlm_args args;
+	int error;
+
+	hold_lkb(lkb); /* reference for the ls_orphans list */
+	mutex_lock(&ls->ls_orphans_mutex);
+	list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
+	mutex_unlock(&ls->ls_orphans_mutex);
+
+	set_unlock_args(0, lkb->lkb_ua, &args);
+
+	error = cancel_lock(ls, lkb, &args);
+	if (error == -DLM_ECANCEL)
+		error = 0;
+	return error;
+}
+
+/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
+   granted.  Regardless of what rsb queue the lock is on, it's removed and
+   freed.  The IVVALBLK flag causes the lvb on the resource to be invalidated
+   if our lock is PW/EX (it's ignored if our granted mode is smaller.) */
+
+static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	struct dlm_args args;
+	int error;
+
+	set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
+			lkb->lkb_ua, &args);
+
+	error = unlock_lock(ls, lkb, &args);
+	if (error == -DLM_EUNLOCK)
+		error = 0;
+	return error;
+}
+
+/* We have to release clear_proc_locks mutex before calling unlock_proc_lock()
+   (which does lock_rsb) due to deadlock with receiving a message that does
+   lock_rsb followed by dlm_user_add_cb() */
+
+static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
+				     struct dlm_user_proc *proc)
+{
+	struct dlm_lkb *lkb = NULL;
+
+	mutex_lock(&ls->ls_clear_proc_locks);
+	if (list_empty(&proc->locks))
+		goto out;
+
+	lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
+	list_del_init(&lkb->lkb_ownqueue);
+
+	if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
+		lkb->lkb_flags |= DLM_IFL_ORPHAN;
+	else
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+ out:
+	mutex_unlock(&ls->ls_clear_proc_locks);
+	return lkb;
+}
+
+/* The ls_clear_proc_locks mutex protects against dlm_user_add_cb() which
+   1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
+   which we clear here. */
+
+/* proc CLOSING flag is set so no more device_reads should look at proc->asts
+   list, and no more device_writes should add lkb's to proc->locks list; so we
+   shouldn't need to take asts_spin or locks_spin here.  this assumes that
+   device reads/writes/closes are serialized -- FIXME: we may need to serialize
+   them ourself. */
+
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	dlm_lock_recovery(ls);
+
+	while (1) {
+		lkb = del_proc_lock(ls, proc);
+		if (!lkb)
+			break;
+		del_timeout(lkb);
+		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
+			orphan_proc_lock(ls, lkb);
+		else
+			unlock_proc_lock(ls, lkb);
+
+		/* this removes the reference for the proc->locks list
+		   added by dlm_user_request, it may result in the lkb
+		   being freed */
+
+		dlm_put_lkb(lkb);
+	}
+
+	mutex_lock(&ls->ls_clear_proc_locks);
+
+	/* in-progress unlocks */
+	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
+		list_del_init(&lkb->lkb_ownqueue);
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+		dlm_put_lkb(lkb);
+	}
+
+	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
+		memset(&lkb->lkb_callbacks, 0,
+		       sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+		list_del_init(&lkb->lkb_cb_list);
+		dlm_put_lkb(lkb);
+	}
+
+	mutex_unlock(&ls->ls_clear_proc_locks);
+	dlm_unlock_recovery(ls);
+}
+
+static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	while (1) {
+		lkb = NULL;
+		spin_lock(&proc->locks_spin);
+		if (!list_empty(&proc->locks)) {
+			lkb = list_entry(proc->locks.next, struct dlm_lkb,
+					 lkb_ownqueue);
+			list_del_init(&lkb->lkb_ownqueue);
+		}
+		spin_unlock(&proc->locks_spin);
+
+		if (!lkb)
+			break;
+
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+		unlock_proc_lock(ls, lkb);
+		dlm_put_lkb(lkb); /* ref from proc->locks list */
+	}
+
+	spin_lock(&proc->locks_spin);
+	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
+		list_del_init(&lkb->lkb_ownqueue);
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+		dlm_put_lkb(lkb);
+	}
+	spin_unlock(&proc->locks_spin);
+
+	spin_lock(&proc->asts_spin);
+	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
+		memset(&lkb->lkb_callbacks, 0,
+		       sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+		list_del_init(&lkb->lkb_cb_list);
+		dlm_put_lkb(lkb);
+	}
+	spin_unlock(&proc->asts_spin);
+}
+
+/* pid of 0 means purge all orphans */
+
+static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	mutex_lock(&ls->ls_orphans_mutex);
+	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
+		if (pid && lkb->lkb_ownpid != pid)
+			continue;
+		unlock_proc_lock(ls, lkb);
+		list_del_init(&lkb->lkb_ownqueue);
+		dlm_put_lkb(lkb);
+	}
+	mutex_unlock(&ls->ls_orphans_mutex);
+}
+
+static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int error;
+
+	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
+				DLM_MSG_PURGE, &ms, &mh);
+	if (error)
+		return error;
+	ms->m_nodeid = nodeid;
+	ms->m_pid = pid;
+
+	return send_message(mh, ms);
+}
+
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+		   int nodeid, int pid)
+{
+	int error = 0;
+
+	if (nodeid && (nodeid != dlm_our_nodeid())) {
+		error = send_purge(ls, nodeid, pid);
+	} else {
+		dlm_lock_recovery(ls);
+		if (pid == current->pid)
+			purge_proc_locks(ls, proc);
+		else
+			do_purge(ls, nodeid, pid);
+		dlm_unlock_recovery(ls);
+	}
+	return error;
+}
+
diff --git a/kernel/fs/dlm/lock.h b/kernel/fs/dlm/lock.h
new file mode 100644
index 000000000..ed8ebd3a8
--- /dev/null
+++ b/kernel/fs/dlm/lock.h
@@ -0,0 +1,80 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCK_DOT_H__
+#define __LOCK_DOT_H__
+
+void dlm_dump_rsb(struct dlm_rsb *r);
+void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len);
+void dlm_print_lkb(struct dlm_lkb *lkb);
+void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
+			       uint32_t saved_seq);
+void dlm_receive_buffer(union dlm_packet *p, int nodeid);
+int dlm_modes_compat(int mode1, int mode2);
+void dlm_put_rsb(struct dlm_rsb *r);
+void dlm_hold_rsb(struct dlm_rsb *r);
+int dlm_put_lkb(struct dlm_lkb *lkb);
+void dlm_scan_rsbs(struct dlm_ls *ls);
+int dlm_lock_recovery_try(struct dlm_ls *ls);
+void dlm_unlock_recovery(struct dlm_ls *ls);
+void dlm_scan_waiters(struct dlm_ls *ls);
+void dlm_scan_timeout(struct dlm_ls *ls);
+void dlm_adjust_timeouts(struct dlm_ls *ls);
+int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
+		      unsigned int flags, int *r_nodeid, int *result);
+
+int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+			struct dlm_rsb **r_ret);
+
+void dlm_recover_purge(struct dlm_ls *ls);
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
+void dlm_recover_grant(struct dlm_ls *ls);
+int dlm_recover_waiters_post(struct dlm_ls *ls);
+void dlm_recover_waiters_pre(struct dlm_ls *ls);
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+	uint32_t flags, void *name, unsigned int namelen,
+	unsigned long timeout_cs);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+	int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
+	unsigned long timeout_cs);
+int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+	int mode, uint32_t flags, void *name, unsigned int namelen,
+	unsigned long timeout_cs, uint32_t *lkid);
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+	uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_cancel(struct dlm_ls *ls,  struct dlm_user_args *ua_tmp,
+	uint32_t flags, uint32_t lkid);
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+	int nodeid, int pid);
+int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid);
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
+
+static inline int is_master(struct dlm_rsb *r)
+{
+	return !r->res_nodeid;
+}
+
+static inline void lock_rsb(struct dlm_rsb *r)
+{
+	mutex_lock(&r->res_mutex);
+}
+
+static inline void unlock_rsb(struct dlm_rsb *r)
+{
+	mutex_unlock(&r->res_mutex);
+}
+
+#endif
+
diff --git a/kernel/fs/dlm/lockspace.c b/kernel/fs/dlm/lockspace.c
new file mode 100644
index 000000000..f3e72787e
--- /dev/null
+++ b/kernel/fs/dlm/lockspace.c
@@ -0,0 +1,917 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "dir.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "recover.h"
+#include "requestqueue.h"
+#include "user.h"
+#include "ast.h"
+
+static int			ls_count;
+static struct mutex		ls_lock;
+static struct list_head		lslist;
+static spinlock_t		lslist_lock;
+static struct task_struct *	scand_task;
+
+
+static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	ssize_t ret = len;
+	int n;
+	int rc = kstrtoint(buf, 0, &n);
+
+	if (rc)
+		return rc;
+	ls = dlm_find_lockspace_local(ls->ls_local_handle);
+	if (!ls)
+		return -EINVAL;
+
+	switch (n) {
+	case 0:
+		dlm_ls_stop(ls);
+		break;
+	case 1:
+		dlm_ls_start(ls);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	dlm_put_lockspace(ls);
+	return ret;
+}
+
+static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	int rc = kstrtoint(buf, 0, &ls->ls_uevent_result);
+
+	if (rc)
+		return rc;
+	set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
+	wake_up(&ls->ls_uevent_wait);
+	return len;
+}
+
+static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
+}
+
+static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	int rc = kstrtouint(buf, 0, &ls->ls_global_id);
+
+	if (rc)
+		return rc;
+	return len;
+}
+
+static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", dlm_no_directory(ls));
+}
+
+static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	int val;
+	int rc = kstrtoint(buf, 0, &val);
+
+	if (rc)
+		return rc;
+	if (val == 1)
+		set_bit(LSFL_NODIR, &ls->ls_flags);
+	return len;
+}
+
+static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
+{
+	uint32_t status = dlm_recover_status(ls);
+	return snprintf(buf, PAGE_SIZE, "%x\n", status);
+}
+
+static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
+}
+
+struct dlm_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct dlm_ls *, char *);
+	ssize_t (*store)(struct dlm_ls *, const char *, size_t);
+};
+
+static struct dlm_attr dlm_attr_control = {
+	.attr  = {.name = "control", .mode = S_IWUSR},
+	.store = dlm_control_store
+};
+
+static struct dlm_attr dlm_attr_event = {
+	.attr  = {.name = "event_done", .mode = S_IWUSR},
+	.store = dlm_event_store
+};
+
+static struct dlm_attr dlm_attr_id = {
+	.attr  = {.name = "id", .mode = S_IRUGO | S_IWUSR},
+	.show  = dlm_id_show,
+	.store = dlm_id_store
+};
+
+static struct dlm_attr dlm_attr_nodir = {
+	.attr  = {.name = "nodir", .mode = S_IRUGO | S_IWUSR},
+	.show  = dlm_nodir_show,
+	.store = dlm_nodir_store
+};
+
+static struct dlm_attr dlm_attr_recover_status = {
+	.attr  = {.name = "recover_status", .mode = S_IRUGO},
+	.show  = dlm_recover_status_show
+};
+
+static struct dlm_attr dlm_attr_recover_nodeid = {
+	.attr  = {.name = "recover_nodeid", .mode = S_IRUGO},
+	.show  = dlm_recover_nodeid_show
+};
+
+static struct attribute *dlm_attrs[] = {
+	&dlm_attr_control.attr,
+	&dlm_attr_event.attr,
+	&dlm_attr_id.attr,
+	&dlm_attr_nodir.attr,
+	&dlm_attr_recover_status.attr,
+	&dlm_attr_recover_nodeid.attr,
+	NULL,
+};
+
+static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
+			     char *buf)
+{
+	struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
+	struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+	return a->show ? a->show(ls, buf) : 0;
+}
+
+static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
+			      const char *buf, size_t len)
+{
+	struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
+	struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+	return a->store ? a->store(ls, buf, len) : len;
+}
+
+static void lockspace_kobj_release(struct kobject *k)
+{
+	struct dlm_ls *ls  = container_of(k, struct dlm_ls, ls_kobj);
+	kfree(ls);
+}
+
+static const struct sysfs_ops dlm_attr_ops = {
+	.show  = dlm_attr_show,
+	.store = dlm_attr_store,
+};
+
+static struct kobj_type dlm_ktype = {
+	.default_attrs = dlm_attrs,
+	.sysfs_ops     = &dlm_attr_ops,
+	.release       = lockspace_kobj_release,
+};
+
+static struct kset *dlm_kset;
+
+static int do_uevent(struct dlm_ls *ls, int in)
+{
+	int error;
+
+	if (in)
+		kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
+	else
+		kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
+
+	log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
+
+	/* dlm_controld will see the uevent, do the necessary group management
+	   and then write to sysfs to wake us */
+
+	error = wait_event_interruptible(ls->ls_uevent_wait,
+			test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
+
+	log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result);
+
+	if (error)
+		goto out;
+
+	error = ls->ls_uevent_result;
+ out:
+	if (error)
+		log_error(ls, "group %s failed %d %d", in ? "join" : "leave",
+			  error, ls->ls_uevent_result);
+	return error;
+}
+
+static int dlm_uevent(struct kset *kset, struct kobject *kobj,
+		      struct kobj_uevent_env *env)
+{
+	struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
+
+	add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name);
+	return 0;
+}
+
+static struct kset_uevent_ops dlm_uevent_ops = {
+	.uevent = dlm_uevent,
+};
+
+int __init dlm_lockspace_init(void)
+{
+	ls_count = 0;
+	mutex_init(&ls_lock);
+	INIT_LIST_HEAD(&lslist);
+	spin_lock_init(&lslist_lock);
+
+	dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
+	if (!dlm_kset) {
+		printk(KERN_WARNING "%s: can not create kset\n", __func__);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void dlm_lockspace_exit(void)
+{
+	kset_unregister(dlm_kset);
+}
+
+static struct dlm_ls *find_ls_to_scan(void)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (time_after_eq(jiffies, ls->ls_scan_time +
+					    dlm_config.ci_scan_secs * HZ)) {
+			spin_unlock(&lslist_lock);
+			return ls;
+		}
+	}
+	spin_unlock(&lslist_lock);
+	return NULL;
+}
+
+static int dlm_scand(void *data)
+{
+	struct dlm_ls *ls;
+
+	while (!kthread_should_stop()) {
+		ls = find_ls_to_scan();
+		if (ls) {
+			if (dlm_lock_recovery_try(ls)) {
+				ls->ls_scan_time = jiffies;
+				dlm_scan_rsbs(ls);
+				dlm_scan_timeout(ls);
+				dlm_scan_waiters(ls);
+				dlm_unlock_recovery(ls);
+			} else {
+				ls->ls_scan_time += HZ;
+			}
+			continue;
+		}
+		schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
+	}
+	return 0;
+}
+
+static int dlm_scand_start(void)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	p = kthread_run(dlm_scand, NULL, "dlm_scand");
+	if (IS_ERR(p))
+		error = PTR_ERR(p);
+	else
+		scand_task = p;
+	return error;
+}
+
+static void dlm_scand_stop(void)
+{
+	kthread_stop(scand_task);
+}
+
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (ls->ls_global_id == id) {
+			ls->ls_count++;
+			goto out;
+		}
+	}
+	ls = NULL;
+ out:
+	spin_unlock(&lslist_lock);
+	return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (ls->ls_local_handle == lockspace) {
+			ls->ls_count++;
+			goto out;
+		}
+	}
+	ls = NULL;
+ out:
+	spin_unlock(&lslist_lock);
+	return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_device(int minor)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (ls->ls_device.minor == minor) {
+			ls->ls_count++;
+			goto out;
+		}
+	}
+	ls = NULL;
+ out:
+	spin_unlock(&lslist_lock);
+	return ls;
+}
+
+void dlm_put_lockspace(struct dlm_ls *ls)
+{
+	spin_lock(&lslist_lock);
+	ls->ls_count--;
+	spin_unlock(&lslist_lock);
+}
+
+static void remove_lockspace(struct dlm_ls *ls)
+{
+	for (;;) {
+		spin_lock(&lslist_lock);
+		if (ls->ls_count == 0) {
+			WARN_ON(ls->ls_create_count != 0);
+			list_del(&ls->ls_list);
+			spin_unlock(&lslist_lock);
+			return;
+		}
+		spin_unlock(&lslist_lock);
+		ssleep(1);
+	}
+}
+
+static int threads_start(void)
+{
+	int error;
+
+	error = dlm_scand_start();
+	if (error) {
+		log_print("cannot start dlm_scand thread %d", error);
+		goto fail;
+	}
+
+	/* Thread for sending/receiving messages for all lockspace's */
+	error = dlm_lowcomms_start();
+	if (error) {
+		log_print("cannot start dlm lowcomms %d", error);
+		goto scand_fail;
+	}
+
+	return 0;
+
+ scand_fail:
+	dlm_scand_stop();
+ fail:
+	return error;
+}
+
+static void threads_stop(void)
+{
+	dlm_scand_stop();
+	dlm_lowcomms_stop();
+}
+
+static int new_lockspace(const char *name, const char *cluster,
+			 uint32_t flags, int lvblen,
+			 const struct dlm_lockspace_ops *ops, void *ops_arg,
+			 int *ops_result, dlm_lockspace_t **lockspace)
+{
+	struct dlm_ls *ls;
+	int i, size, error;
+	int do_unreg = 0;
+	int namelen = strlen(name);
+
+	if (namelen > DLM_LOCKSPACE_LEN)
+		return -EINVAL;
+
+	if (!lvblen || (lvblen % 8))
+		return -EINVAL;
+
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+
+	if (!dlm_user_daemon_available()) {
+		log_print("dlm user daemon not available");
+		error = -EUNATCH;
+		goto out;
+	}
+
+	if (ops && ops_result) {
+	       	if (!dlm_config.ci_recover_callbacks)
+			*ops_result = -EOPNOTSUPP;
+		else
+			*ops_result = 0;
+	}
+
+	if (dlm_config.ci_recover_callbacks && cluster &&
+	    strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) {
+		log_print("dlm cluster name %s mismatch %s",
+			  dlm_config.ci_cluster_name, cluster);
+		error = -EBADR;
+		goto out;
+	}
+
+	error = 0;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		WARN_ON(ls->ls_create_count <= 0);
+		if (ls->ls_namelen != namelen)
+			continue;
+		if (memcmp(ls->ls_name, name, namelen))
+			continue;
+		if (flags & DLM_LSFL_NEWEXCL) {
+			error = -EEXIST;
+			break;
+		}
+		ls->ls_create_count++;
+		*lockspace = ls;
+		error = 1;
+		break;
+	}
+	spin_unlock(&lslist_lock);
+
+	if (error)
+		goto out;
+
+	error = -ENOMEM;
+
+	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
+	if (!ls)
+		goto out;
+	memcpy(ls->ls_name, name, namelen);
+	ls->ls_namelen = namelen;
+	ls->ls_lvblen = lvblen;
+	ls->ls_count = 0;
+	ls->ls_flags = 0;
+	ls->ls_scan_time = jiffies;
+
+	if (ops && dlm_config.ci_recover_callbacks) {
+		ls->ls_ops = ops;
+		ls->ls_ops_arg = ops_arg;
+	}
+
+	if (flags & DLM_LSFL_TIMEWARN)
+		set_bit(LSFL_TIMEWARN, &ls->ls_flags);
+
+	/* ls_exflags are forced to match among nodes, and we don't
+	   need to require all nodes to have some flags set */
+	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
+				    DLM_LSFL_NEWEXCL));
+
+	size = dlm_config.ci_rsbtbl_size;
+	ls->ls_rsbtbl_size = size;
+
+	ls->ls_rsbtbl = vmalloc(sizeof(struct dlm_rsbtable) * size);
+	if (!ls->ls_rsbtbl)
+		goto out_lsfree;
+	for (i = 0; i < size; i++) {
+		ls->ls_rsbtbl[i].keep.rb_node = NULL;
+		ls->ls_rsbtbl[i].toss.rb_node = NULL;
+		spin_lock_init(&ls->ls_rsbtbl[i].lock);
+	}
+
+	spin_lock_init(&ls->ls_remove_spin);
+
+	for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
+		ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
+						 GFP_KERNEL);
+		if (!ls->ls_remove_names[i])
+			goto out_rsbtbl;
+	}
+
+	idr_init(&ls->ls_lkbidr);
+	spin_lock_init(&ls->ls_lkbidr_spin);
+
+	INIT_LIST_HEAD(&ls->ls_waiters);
+	mutex_init(&ls->ls_waiters_mutex);
+	INIT_LIST_HEAD(&ls->ls_orphans);
+	mutex_init(&ls->ls_orphans_mutex);
+	INIT_LIST_HEAD(&ls->ls_timeout);
+	mutex_init(&ls->ls_timeout_mutex);
+
+	INIT_LIST_HEAD(&ls->ls_new_rsb);
+	spin_lock_init(&ls->ls_new_rsb_spin);
+
+	INIT_LIST_HEAD(&ls->ls_nodes);
+	INIT_LIST_HEAD(&ls->ls_nodes_gone);
+	ls->ls_num_nodes = 0;
+	ls->ls_low_nodeid = 0;
+	ls->ls_total_weight = 0;
+	ls->ls_node_array = NULL;
+
+	memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
+	ls->ls_stub_rsb.res_ls = ls;
+
+	ls->ls_debug_rsb_dentry = NULL;
+	ls->ls_debug_waiters_dentry = NULL;
+
+	init_waitqueue_head(&ls->ls_uevent_wait);
+	ls->ls_uevent_result = 0;
+	init_completion(&ls->ls_members_done);
+	ls->ls_members_result = -1;
+
+	mutex_init(&ls->ls_cb_mutex);
+	INIT_LIST_HEAD(&ls->ls_cb_delay);
+
+	ls->ls_recoverd_task = NULL;
+	mutex_init(&ls->ls_recoverd_active);
+	spin_lock_init(&ls->ls_recover_lock);
+	spin_lock_init(&ls->ls_rcom_spin);
+	get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t));
+	ls->ls_recover_status = 0;
+	ls->ls_recover_seq = 0;
+	ls->ls_recover_args = NULL;
+	init_rwsem(&ls->ls_in_recovery);
+	init_rwsem(&ls->ls_recv_active);
+	INIT_LIST_HEAD(&ls->ls_requestqueue);
+	mutex_init(&ls->ls_requestqueue_mutex);
+	mutex_init(&ls->ls_clear_proc_locks);
+
+	ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
+	if (!ls->ls_recover_buf)
+		goto out_lkbidr;
+
+	ls->ls_slot = 0;
+	ls->ls_num_slots = 0;
+	ls->ls_slots_size = 0;
+	ls->ls_slots = NULL;
+
+	INIT_LIST_HEAD(&ls->ls_recover_list);
+	spin_lock_init(&ls->ls_recover_list_lock);
+	idr_init(&ls->ls_recover_idr);
+	spin_lock_init(&ls->ls_recover_idr_lock);
+	ls->ls_recover_list_count = 0;
+	ls->ls_local_handle = ls;
+	init_waitqueue_head(&ls->ls_wait_general);
+	INIT_LIST_HEAD(&ls->ls_root_list);
+	init_rwsem(&ls->ls_root_sem);
+
+	spin_lock(&lslist_lock);
+	ls->ls_create_count = 1;
+	list_add(&ls->ls_list, &lslist);
+	spin_unlock(&lslist_lock);
+
+	if (flags & DLM_LSFL_FS) {
+		error = dlm_callback_start(ls);
+		if (error) {
+			log_error(ls, "can't start dlm_callback %d", error);
+			goto out_delist;
+		}
+	}
+
+	init_waitqueue_head(&ls->ls_recover_lock_wait);
+
+	/*
+	 * Once started, dlm_recoverd first looks for ls in lslist, then
+	 * initializes ls_in_recovery as locked in "down" mode.  We need
+	 * to wait for the wakeup from dlm_recoverd because in_recovery
+	 * has to start out in down mode.
+	 */
+
+	error = dlm_recoverd_start(ls);
+	if (error) {
+		log_error(ls, "can't start dlm_recoverd %d", error);
+		goto out_callback;
+	}
+
+	wait_event(ls->ls_recover_lock_wait,
+		   test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
+
+	ls->ls_kobj.kset = dlm_kset;
+	error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
+				     "%s", ls->ls_name);
+	if (error)
+		goto out_recoverd;
+	kobject_uevent(&ls->ls_kobj, KOBJ_ADD);
+
+	/* let kobject handle freeing of ls if there's an error */
+	do_unreg = 1;
+
+	/* This uevent triggers dlm_controld in userspace to add us to the
+	   group of nodes that are members of this lockspace (managed by the
+	   cluster infrastructure.)  Once it's done that, it tells us who the
+	   current lockspace members are (via configfs) and then tells the
+	   lockspace to start running (via sysfs) in dlm_ls_start(). */
+
+	error = do_uevent(ls, 1);
+	if (error)
+		goto out_recoverd;
+
+	wait_for_completion(&ls->ls_members_done);
+	error = ls->ls_members_result;
+	if (error)
+		goto out_members;
+
+	dlm_create_debug_file(ls);
+
+	log_rinfo(ls, "join complete");
+	*lockspace = ls;
+	return 0;
+
+ out_members:
+	do_uevent(ls, 0);
+	dlm_clear_members(ls);
+	kfree(ls->ls_node_array);
+ out_recoverd:
+	dlm_recoverd_stop(ls);
+ out_callback:
+	dlm_callback_stop(ls);
+ out_delist:
+	spin_lock(&lslist_lock);
+	list_del(&ls->ls_list);
+	spin_unlock(&lslist_lock);
+	idr_destroy(&ls->ls_recover_idr);
+	kfree(ls->ls_recover_buf);
+ out_lkbidr:
+	idr_destroy(&ls->ls_lkbidr);
+	for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
+		if (ls->ls_remove_names[i])
+			kfree(ls->ls_remove_names[i]);
+	}
+ out_rsbtbl:
+	vfree(ls->ls_rsbtbl);
+ out_lsfree:
+	if (do_unreg)
+		kobject_put(&ls->ls_kobj);
+	else
+		kfree(ls);
+ out:
+	module_put(THIS_MODULE);
+	return error;
+}
+
+int dlm_new_lockspace(const char *name, const char *cluster,
+		      uint32_t flags, int lvblen,
+		      const struct dlm_lockspace_ops *ops, void *ops_arg,
+		      int *ops_result, dlm_lockspace_t **lockspace)
+{
+	int error = 0;
+
+	mutex_lock(&ls_lock);
+	if (!ls_count)
+		error = threads_start();
+	if (error)
+		goto out;
+
+	error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg,
+			      ops_result, lockspace);
+	if (!error)
+		ls_count++;
+	if (error > 0)
+		error = 0;
+	if (!ls_count)
+		threads_stop();
+ out:
+	mutex_unlock(&ls_lock);
+	return error;
+}
+
+static int lkb_idr_is_local(int id, void *p, void *data)
+{
+	struct dlm_lkb *lkb = p;
+
+	return lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV;
+}
+
+static int lkb_idr_is_any(int id, void *p, void *data)
+{
+	return 1;
+}
+
+static int lkb_idr_free(int id, void *p, void *data)
+{
+	struct dlm_lkb *lkb = p;
+
+	if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
+		dlm_free_lvb(lkb->lkb_lvbptr);
+
+	dlm_free_lkb(lkb);
+	return 0;
+}
+
+/* NOTE: We check the lkbidr here rather than the resource table.
+   This is because there may be LKBs queued as ASTs that have been unlinked
+   from their RSBs and are pending deletion once the AST has been delivered */
+
+static int lockspace_busy(struct dlm_ls *ls, int force)
+{
+	int rv;
+
+	spin_lock(&ls->ls_lkbidr_spin);
+	if (force == 0) {
+		rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_any, ls);
+	} else if (force == 1) {
+		rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_local, ls);
+	} else {
+		rv = 0;
+	}
+	spin_unlock(&ls->ls_lkbidr_spin);
+	return rv;
+}
+
+static int release_lockspace(struct dlm_ls *ls, int force)
+{
+	struct dlm_rsb *rsb;
+	struct rb_node *n;
+	int i, busy, rv;
+
+	busy = lockspace_busy(ls, force);
+
+	spin_lock(&lslist_lock);
+	if (ls->ls_create_count == 1) {
+		if (busy) {
+			rv = -EBUSY;
+		} else {
+			/* remove_lockspace takes ls off lslist */
+			ls->ls_create_count = 0;
+			rv = 0;
+		}
+	} else if (ls->ls_create_count > 1) {
+		rv = --ls->ls_create_count;
+	} else {
+		rv = -EINVAL;
+	}
+	spin_unlock(&lslist_lock);
+
+	if (rv) {
+		log_debug(ls, "release_lockspace no remove %d", rv);
+		return rv;
+	}
+
+	dlm_device_deregister(ls);
+
+	if (force < 3 && dlm_user_daemon_available())
+		do_uevent(ls, 0);
+
+	dlm_recoverd_stop(ls);
+
+	dlm_callback_stop(ls);
+
+	remove_lockspace(ls);
+
+	dlm_delete_debug_file(ls);
+
+	kfree(ls->ls_recover_buf);
+
+	/*
+	 * Free all lkb's in idr
+	 */
+
+	idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls);
+	idr_destroy(&ls->ls_lkbidr);
+
+	/*
+	 * Free all rsb's on rsbtbl[] lists
+	 */
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) {
+			rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+			rb_erase(n, &ls->ls_rsbtbl[i].keep);
+			dlm_free_rsb(rsb);
+		}
+
+		while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) {
+			rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+			rb_erase(n, &ls->ls_rsbtbl[i].toss);
+			dlm_free_rsb(rsb);
+		}
+	}
+
+	vfree(ls->ls_rsbtbl);
+
+	for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++)
+		kfree(ls->ls_remove_names[i]);
+
+	while (!list_empty(&ls->ls_new_rsb)) {
+		rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb,
+				       res_hashchain);
+		list_del(&rsb->res_hashchain);
+		dlm_free_rsb(rsb);
+	}
+
+	/*
+	 * Free structures on any other lists
+	 */
+
+	dlm_purge_requestqueue(ls);
+	kfree(ls->ls_recover_args);
+	dlm_clear_members(ls);
+	dlm_clear_members_gone(ls);
+	kfree(ls->ls_node_array);
+	log_rinfo(ls, "release_lockspace final free");
+	kobject_put(&ls->ls_kobj);
+	/* The ls structure will be freed when the kobject is done with */
+
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+/*
+ * Called when a system has released all its locks and is not going to use the
+ * lockspace any longer.  We free everything we're managing for this lockspace.
+ * Remaining nodes will go through the recovery process as if we'd died.  The
+ * lockspace must continue to function as usual, participating in recoveries,
+ * until this returns.
+ *
+ * Force has 4 possible values:
+ * 0 - don't destroy locksapce if it has any LKBs
+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
+ * 2 - destroy lockspace regardless of LKBs
+ * 3 - destroy lockspace as part of a forced shutdown
+ */
+
+int dlm_release_lockspace(void *lockspace, int force)
+{
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+	dlm_put_lockspace(ls);
+
+	mutex_lock(&ls_lock);
+	error = release_lockspace(ls, force);
+	if (!error)
+		ls_count--;
+	if (!ls_count)
+		threads_stop();
+	mutex_unlock(&ls_lock);
+
+	return error;
+}
+
+void dlm_stop_lockspaces(void)
+{
+	struct dlm_ls *ls;
+	int count;
+
+ restart:
+	count = 0;
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
+			count++;
+			continue;
+		}
+		spin_unlock(&lslist_lock);
+		log_error(ls, "no userland control daemon, stopping lockspace");
+		dlm_ls_stop(ls);
+		goto restart;
+	}
+	spin_unlock(&lslist_lock);
+
+	if (count)
+		log_print("dlm user daemon left %d lockspaces", count);
+}
+
diff --git a/kernel/fs/dlm/lockspace.h b/kernel/fs/dlm/lockspace.h
new file mode 100644
index 000000000..f879f8790
--- /dev/null
+++ b/kernel/fs/dlm/lockspace.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCKSPACE_DOT_H__
+#define __LOCKSPACE_DOT_H__
+
+int dlm_lockspace_init(void);
+void dlm_lockspace_exit(void);
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
+struct dlm_ls *dlm_find_lockspace_local(void *id);
+struct dlm_ls *dlm_find_lockspace_device(int minor);
+void dlm_put_lockspace(struct dlm_ls *ls);
+void dlm_stop_lockspaces(void);
+
+#endif				/* __LOCKSPACE_DOT_H__ */
+
diff --git a/kernel/fs/dlm/lowcomms.c b/kernel/fs/dlm/lowcomms.c
new file mode 100644
index 000000000..d08e079ea
--- /dev/null
+++ b/kernel/fs/dlm/lowcomms.c
@@ -0,0 +1,1830 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is its
+ * responsibility. It is this layer's
+ * responsibility to resolve these into IP address or
+ * whatever it needs for inter-node communication.
+ *
+ * The comms level is two kernel threads that deal mainly with
+ * the receiving of messages from other nodes and passing them
+ * up to the mid-level comms layer (which understands the
+ * message format) for execution by the locking core, and
+ * a send thread which does all the setting up of connections
+ * to remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * lowcomms will choose to use either TCP or SCTP as its transport layer
+ * depending on the configuration variable 'protocol'. This should be set
+ * to 0 (default) for TCP or 1 for SCTP. It should be configured using a
+ * cluster-wide mechanism as it must be the same on all nodes of the cluster
+ * for the DLM to function.
+ *
+ */
+
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/sctp.h>
+#include <linux/slab.h>
+#include <net/sctp/sctp.h>
+#include <net/ipv6.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "config.h"
+
+#define NEEDED_RMEM (4*1024*1024)
+#define CONN_HASH_SIZE 32
+
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT 25
+
+struct cbuf {
+	unsigned int base;
+	unsigned int len;
+	unsigned int mask;
+};
+
+static void cbuf_add(struct cbuf *cb, int n)
+{
+	cb->len += n;
+}
+
+static int cbuf_data(struct cbuf *cb)
+{
+	return ((cb->base + cb->len) & cb->mask);
+}
+
+static void cbuf_init(struct cbuf *cb, int size)
+{
+	cb->base = cb->len = 0;
+	cb->mask = size-1;
+}
+
+static void cbuf_eat(struct cbuf *cb, int n)
+{
+	cb->len  -= n;
+	cb->base += n;
+	cb->base &= cb->mask;
+}
+
+static bool cbuf_empty(struct cbuf *cb)
+{
+	return cb->len == 0;
+}
+
+struct connection {
+	struct socket *sock;	/* NULL if not connected */
+	uint32_t nodeid;	/* So we know who we are in the list */
+	struct mutex sock_mutex;
+	unsigned long flags;
+#define CF_READ_PENDING 1
+#define CF_WRITE_PENDING 2
+#define CF_CONNECT_PENDING 3
+#define CF_INIT_PENDING 4
+#define CF_IS_OTHERCON 5
+#define CF_CLOSE 6
+#define CF_APP_LIMITED 7
+	struct list_head writequeue;  /* List of outgoing writequeue_entries */
+	spinlock_t writequeue_lock;
+	int (*rx_action) (struct connection *);	/* What to do when active */
+	void (*connect_action) (struct connection *);	/* What to do to connect */
+	struct page *rx_page;
+	struct cbuf cb;
+	int retries;
+#define MAX_CONNECT_RETRIES 3
+	int sctp_assoc;
+	struct hlist_node list;
+	struct connection *othercon;
+	struct work_struct rwork; /* Receive workqueue */
+	struct work_struct swork; /* Send workqueue */
+	bool try_new_addr;
+};
+#define sock2con(x) ((struct connection *)(x)->sk_user_data)
+
+/* An entry waiting to be sent */
+struct writequeue_entry {
+	struct list_head list;
+	struct page *page;
+	int offset;
+	int len;
+	int end;
+	int users;
+	struct connection *con;
+};
+
+struct dlm_node_addr {
+	struct list_head list;
+	int nodeid;
+	int addr_count;
+	int curr_addr_index;
+	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+static LIST_HEAD(dlm_node_addrs);
+static DEFINE_SPINLOCK(dlm_node_addrs_spin);
+
+static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static int dlm_local_count;
+static int dlm_allow_conn;
+
+/* Work queues */
+static struct workqueue_struct *recv_workqueue;
+static struct workqueue_struct *send_workqueue;
+
+static struct hlist_head connection_hash[CONN_HASH_SIZE];
+static DEFINE_MUTEX(connections_lock);
+static struct kmem_cache *con_cache;
+
+static void process_recv_sockets(struct work_struct *work);
+static void process_send_sockets(struct work_struct *work);
+
+
+/* This is deliberately very simple because most clusters have simple
+   sequential nodeids, so we should be able to go straight to a connection
+   struct in the array */
+static inline int nodeid_hash(int nodeid)
+{
+	return nodeid & (CONN_HASH_SIZE-1);
+}
+
+static struct connection *__find_con(int nodeid)
+{
+	int r;
+	struct connection *con;
+
+	r = nodeid_hash(nodeid);
+
+	hlist_for_each_entry(con, &connection_hash[r], list) {
+		if (con->nodeid == nodeid)
+			return con;
+	}
+	return NULL;
+}
+
+/*
+ * If 'allocation' is zero then we don't attempt to create a new
+ * connection structure for this node.
+ */
+static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
+{
+	struct connection *con = NULL;
+	int r;
+
+	con = __find_con(nodeid);
+	if (con || !alloc)
+		return con;
+
+	con = kmem_cache_zalloc(con_cache, alloc);
+	if (!con)
+		return NULL;
+
+	r = nodeid_hash(nodeid);
+	hlist_add_head(&con->list, &connection_hash[r]);
+
+	con->nodeid = nodeid;
+	mutex_init(&con->sock_mutex);
+	INIT_LIST_HEAD(&con->writequeue);
+	spin_lock_init(&con->writequeue_lock);
+	INIT_WORK(&con->swork, process_send_sockets);
+	INIT_WORK(&con->rwork, process_recv_sockets);
+
+	/* Setup action pointers for child sockets */
+	if (con->nodeid) {
+		struct connection *zerocon = __find_con(0);
+
+		con->connect_action = zerocon->connect_action;
+		if (!con->rx_action)
+			con->rx_action = zerocon->rx_action;
+	}
+
+	return con;
+}
+
+/* Loop round all connections */
+static void foreach_conn(void (*conn_func)(struct connection *c))
+{
+	int i;
+	struct hlist_node *n;
+	struct connection *con;
+
+	for (i = 0; i < CONN_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(con, n, &connection_hash[i], list)
+			conn_func(con);
+	}
+}
+
+static struct connection *nodeid2con(int nodeid, gfp_t allocation)
+{
+	struct connection *con;
+
+	mutex_lock(&connections_lock);
+	con = __nodeid2con(nodeid, allocation);
+	mutex_unlock(&connections_lock);
+
+	return con;
+}
+
+/* This is a bit drastic, but only called when things go wrong */
+static struct connection *assoc2con(int assoc_id)
+{
+	int i;
+	struct connection *con;
+
+	mutex_lock(&connections_lock);
+
+	for (i = 0 ; i < CONN_HASH_SIZE; i++) {
+		hlist_for_each_entry(con, &connection_hash[i], list) {
+			if (con->sctp_assoc == assoc_id) {
+				mutex_unlock(&connections_lock);
+				return con;
+			}
+		}
+	}
+	mutex_unlock(&connections_lock);
+	return NULL;
+}
+
+static struct dlm_node_addr *find_node_addr(int nodeid)
+{
+	struct dlm_node_addr *na;
+
+	list_for_each_entry(na, &dlm_node_addrs, list) {
+		if (na->nodeid == nodeid)
+			return na;
+	}
+	return NULL;
+}
+
+static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+	switch (x->ss_family) {
+	case AF_INET: {
+		struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+		struct sockaddr_in *siny = (struct sockaddr_in *)y;
+		if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+			return 0;
+		if (sinx->sin_port != siny->sin_port)
+			return 0;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+		struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+		if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+			return 0;
+		if (sinx->sin6_port != siny->sin6_port)
+			return 0;
+		break;
+	}
+	default:
+		return 0;
+	}
+	return 1;
+}
+
+static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
+			  struct sockaddr *sa_out, bool try_new_addr)
+{
+	struct sockaddr_storage sas;
+	struct dlm_node_addr *na;
+
+	if (!dlm_local_count)
+		return -1;
+
+	spin_lock(&dlm_node_addrs_spin);
+	na = find_node_addr(nodeid);
+	if (na && na->addr_count) {
+		if (try_new_addr) {
+			na->curr_addr_index++;
+			if (na->curr_addr_index == na->addr_count)
+				na->curr_addr_index = 0;
+		}
+
+		memcpy(&sas, na->addr[na->curr_addr_index ],
+			sizeof(struct sockaddr_storage));
+	}
+	spin_unlock(&dlm_node_addrs_spin);
+
+	if (!na)
+		return -EEXIST;
+
+	if (!na->addr_count)
+		return -ENOENT;
+
+	if (sas_out)
+		memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
+
+	if (!sa_out)
+		return 0;
+
+	if (dlm_local_addr[0]->ss_family == AF_INET) {
+		struct sockaddr_in *in4  = (struct sockaddr_in *) &sas;
+		struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
+		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
+	} else {
+		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &sas;
+		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out;
+		ret6->sin6_addr = in6->sin6_addr;
+	}
+
+	return 0;
+}
+
+static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+	struct dlm_node_addr *na;
+	int rv = -EEXIST;
+	int addr_i;
+
+	spin_lock(&dlm_node_addrs_spin);
+	list_for_each_entry(na, &dlm_node_addrs, list) {
+		if (!na->addr_count)
+			continue;
+
+		for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
+			if (addr_compare(na->addr[addr_i], addr)) {
+				*nodeid = na->nodeid;
+				rv = 0;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	spin_unlock(&dlm_node_addrs_spin);
+	return rv;
+}
+
+int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
+{
+	struct sockaddr_storage *new_addr;
+	struct dlm_node_addr *new_node, *na;
+
+	new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
+	if (!new_node)
+		return -ENOMEM;
+
+	new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
+	if (!new_addr) {
+		kfree(new_node);
+		return -ENOMEM;
+	}
+
+	memcpy(new_addr, addr, len);
+
+	spin_lock(&dlm_node_addrs_spin);
+	na = find_node_addr(nodeid);
+	if (!na) {
+		new_node->nodeid = nodeid;
+		new_node->addr[0] = new_addr;
+		new_node->addr_count = 1;
+		list_add(&new_node->list, &dlm_node_addrs);
+		spin_unlock(&dlm_node_addrs_spin);
+		return 0;
+	}
+
+	if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
+		spin_unlock(&dlm_node_addrs_spin);
+		kfree(new_addr);
+		kfree(new_node);
+		return -ENOSPC;
+	}
+
+	na->addr[na->addr_count++] = new_addr;
+	spin_unlock(&dlm_node_addrs_spin);
+	kfree(new_node);
+	return 0;
+}
+
+/* Data available on socket or listen socket received a connect */
+static void lowcomms_data_ready(struct sock *sk)
+{
+	struct connection *con = sock2con(sk);
+	if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
+		queue_work(recv_workqueue, &con->rwork);
+}
+
+static void lowcomms_write_space(struct sock *sk)
+{
+	struct connection *con = sock2con(sk);
+
+	if (!con)
+		return;
+
+	clear_bit(SOCK_NOSPACE, &con->sock->flags);
+
+	if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+		con->sock->sk->sk_write_pending--;
+		clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+	}
+
+	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+		queue_work(send_workqueue, &con->swork);
+}
+
+static inline void lowcomms_connect_sock(struct connection *con)
+{
+	if (test_bit(CF_CLOSE, &con->flags))
+		return;
+	if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
+		queue_work(send_workqueue, &con->swork);
+}
+
+static void lowcomms_state_change(struct sock *sk)
+{
+	if (sk->sk_state == TCP_ESTABLISHED)
+		lowcomms_write_space(sk);
+}
+
+int dlm_lowcomms_connect_node(int nodeid)
+{
+	struct connection *con;
+
+	/* with sctp there's no connecting without sending */
+	if (dlm_config.ci_protocol != 0)
+		return 0;
+
+	if (nodeid == dlm_our_nodeid())
+		return 0;
+
+	con = nodeid2con(nodeid, GFP_NOFS);
+	if (!con)
+		return -ENOMEM;
+	lowcomms_connect_sock(con);
+	return 0;
+}
+
+/* Make a socket active */
+static void add_sock(struct socket *sock, struct connection *con)
+{
+	con->sock = sock;
+
+	/* Install a data_ready callback */
+	con->sock->sk->sk_data_ready = lowcomms_data_ready;
+	con->sock->sk->sk_write_space = lowcomms_write_space;
+	con->sock->sk->sk_state_change = lowcomms_state_change;
+	con->sock->sk->sk_user_data = con;
+	con->sock->sk->sk_allocation = GFP_NOFS;
+}
+
+/* Add the port number to an IPv6 or 4 sockaddr and return the address
+   length */
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+			  int *addr_len)
+{
+	saddr->ss_family =  dlm_local_addr[0]->ss_family;
+	if (saddr->ss_family == AF_INET) {
+		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
+		in4_addr->sin_port = cpu_to_be16(port);
+		*addr_len = sizeof(struct sockaddr_in);
+		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
+	} else {
+		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
+		in6_addr->sin6_port = cpu_to_be16(port);
+		*addr_len = sizeof(struct sockaddr_in6);
+	}
+	memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
+}
+
+/* Close a remote connection and tidy up */
+static void close_connection(struct connection *con, bool and_other)
+{
+	mutex_lock(&con->sock_mutex);
+
+	if (con->sock) {
+		sock_release(con->sock);
+		con->sock = NULL;
+	}
+	if (con->othercon && and_other) {
+		/* Will only re-enter once. */
+		close_connection(con->othercon, false);
+	}
+	if (con->rx_page) {
+		__free_page(con->rx_page);
+		con->rx_page = NULL;
+	}
+
+	con->retries = 0;
+	mutex_unlock(&con->sock_mutex);
+}
+
+/* We only send shutdown messages to nodes that are not part of the cluster */
+static void sctp_send_shutdown(sctp_assoc_t associd)
+{
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	int ret;
+	struct connection *con;
+
+	con = nodeid2con(0,0);
+	BUG_ON(con == NULL);
+
+	outmessage.msg_name = NULL;
+	outmessage.msg_namelen = 0;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	sinfo = CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+
+	sinfo->sinfo_flags |= MSG_EOF;
+	sinfo->sinfo_assoc_id = associd;
+
+	ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0);
+
+	if (ret != 0)
+		log_print("send EOF to node failed: %d", ret);
+}
+
+static void sctp_init_failed_foreach(struct connection *con)
+{
+
+	/*
+	 * Don't try to recover base con and handle race where the
+	 * other node's assoc init creates a assoc and we get that
+	 * notification, then we get a notification that our attempt
+	 * failed due. This happens when we are still trying the primary
+	 * address, but the other node has already tried secondary addrs
+	 * and found one that worked.
+	 */
+	if (!con->nodeid || con->sctp_assoc)
+		return;
+
+	log_print("Retrying SCTP association init for node %d\n", con->nodeid);
+
+	con->try_new_addr = true;
+	con->sctp_assoc = 0;
+	if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
+		if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+			queue_work(send_workqueue, &con->swork);
+	}
+}
+
+/* INIT failed but we don't know which node...
+   restart INIT on all pending nodes */
+static void sctp_init_failed(void)
+{
+	mutex_lock(&connections_lock);
+
+	foreach_conn(sctp_init_failed_foreach);
+
+	mutex_unlock(&connections_lock);
+}
+
+static void retry_failed_sctp_send(struct connection *recv_con,
+				   struct sctp_send_failed *sn_send_failed,
+				   char *buf)
+{
+	int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
+	struct dlm_mhandle *mh;
+	struct connection *con;
+	char *retry_buf;
+	int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
+
+	log_print("Retry sending %d bytes to node id %d", len, nodeid);
+	
+	if (!nodeid) {
+		log_print("Shouldn't resend data via listening connection.");
+		return;
+	}
+
+	con = nodeid2con(nodeid, 0);
+	if (!con) {
+		log_print("Could not look up con for nodeid %d\n",
+			  nodeid);
+		return;
+	}
+
+	mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
+	if (!mh) {
+		log_print("Could not allocate buf for retry.");
+		return;
+	}
+	memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
+	dlm_lowcomms_commit_buffer(mh);
+
+	/*
+	 * If we got a assoc changed event before the send failed event then
+	 * we only need to retry the send.
+	 */
+	if (con->sctp_assoc) {
+		if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+			queue_work(send_workqueue, &con->swork);
+	} else
+		sctp_init_failed_foreach(con);
+}
+
+/* Something happened to an association */
+static void process_sctp_notification(struct connection *con,
+				      struct msghdr *msg, char *buf)
+{
+	union sctp_notification *sn = (union sctp_notification *)buf;
+	struct linger linger;
+
+	switch (sn->sn_header.sn_type) {
+	case SCTP_SEND_FAILED:
+		retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
+		break;
+	case SCTP_ASSOC_CHANGE:
+		switch (sn->sn_assoc_change.sac_state) {
+		case SCTP_COMM_UP:
+		case SCTP_RESTART:
+		{
+			/* Check that the new node is in the lockspace */
+			struct sctp_prim prim;
+			int nodeid;
+			int prim_len, ret;
+			int addr_len;
+			struct connection *new_con;
+
+			/*
+			 * We get this before any data for an association.
+			 * We verify that the node is in the cluster and
+			 * then peel off a socket for it.
+			 */
+			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+				log_print("COMM_UP for invalid assoc ID %d",
+					 (int)sn->sn_assoc_change.sac_assoc_id);
+				sctp_init_failed();
+				return;
+			}
+			memset(&prim, 0, sizeof(struct sctp_prim));
+			prim_len = sizeof(struct sctp_prim);
+			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+
+			ret = kernel_getsockopt(con->sock,
+						IPPROTO_SCTP,
+						SCTP_PRIMARY_ADDR,
+						(char*)&prim,
+						&prim_len);
+			if (ret < 0) {
+				log_print("getsockopt/sctp_primary_addr on "
+					  "new assoc %d failed : %d",
+					  (int)sn->sn_assoc_change.sac_assoc_id,
+					  ret);
+
+				/* Retry INIT later */
+				new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
+				if (new_con)
+					clear_bit(CF_CONNECT_PENDING, &con->flags);
+				return;
+			}
+			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+			if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+				unsigned char *b=(unsigned char *)&prim.ssp_addr;
+				log_print("reject connect from unknown addr");
+				print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, 
+						     b, sizeof(struct sockaddr_storage));
+				sctp_send_shutdown(prim.ssp_assoc_id);
+				return;
+			}
+
+			new_con = nodeid2con(nodeid, GFP_NOFS);
+			if (!new_con)
+				return;
+
+			/* Peel off a new sock */
+			lock_sock(con->sock->sk);
+			ret = sctp_do_peeloff(con->sock->sk,
+				sn->sn_assoc_change.sac_assoc_id,
+				&new_con->sock);
+			release_sock(con->sock->sk);
+			if (ret < 0) {
+				log_print("Can't peel off a socket for "
+					  "connection %d to node %d: err=%d",
+					  (int)sn->sn_assoc_change.sac_assoc_id,
+					  nodeid, ret);
+				return;
+			}
+			add_sock(new_con->sock, new_con);
+
+			linger.l_onoff = 1;
+			linger.l_linger = 0;
+			ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
+						(char *)&linger, sizeof(linger));
+			if (ret < 0)
+				log_print("set socket option SO_LINGER failed");
+
+			log_print("connecting to %d sctp association %d",
+				 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
+
+			new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
+			new_con->try_new_addr = false;
+			/* Send any pending writes */
+			clear_bit(CF_CONNECT_PENDING, &new_con->flags);
+			clear_bit(CF_INIT_PENDING, &new_con->flags);
+			if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
+				queue_work(send_workqueue, &new_con->swork);
+			}
+			if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags))
+				queue_work(recv_workqueue, &new_con->rwork);
+		}
+		break;
+
+		case SCTP_COMM_LOST:
+		case SCTP_SHUTDOWN_COMP:
+		{
+			con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
+			if (con) {
+				con->sctp_assoc = 0;
+			}
+		}
+		break;
+
+		case SCTP_CANT_STR_ASSOC:
+		{
+			/* Will retry init when we get the send failed notification */
+			log_print("Can't start SCTP association - retrying");
+		}
+		break;
+
+		default:
+			log_print("unexpected SCTP assoc change id=%d state=%d",
+				  (int)sn->sn_assoc_change.sac_assoc_id,
+				  sn->sn_assoc_change.sac_state);
+		}
+	default:
+		; /* fall through */
+	}
+}
+
+/* Data received from remote end */
+static int receive_from_sock(struct connection *con)
+{
+	int ret = 0;
+	struct msghdr msg = {};
+	struct kvec iov[2];
+	unsigned len;
+	int r;
+	int call_again_soon = 0;
+	int nvec;
+	char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+
+	mutex_lock(&con->sock_mutex);
+
+	if (con->sock == NULL) {
+		ret = -EAGAIN;
+		goto out_close;
+	}
+
+	if (con->rx_page == NULL) {
+		/*
+		 * This doesn't need to be atomic, but I think it should
+		 * improve performance if it is.
+		 */
+		con->rx_page = alloc_page(GFP_ATOMIC);
+		if (con->rx_page == NULL)
+			goto out_resched;
+		cbuf_init(&con->cb, PAGE_CACHE_SIZE);
+	}
+
+	/* Only SCTP needs these really */
+	memset(&incmsg, 0, sizeof(incmsg));
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+
+	/*
+	 * iov[0] is the bit of the circular buffer between the current end
+	 * point (cb.base + cb.len) and the end of the buffer.
+	 */
+	iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
+	iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
+	iov[1].iov_len = 0;
+	nvec = 1;
+
+	/*
+	 * iov[1] is the bit of the circular buffer between the start of the
+	 * buffer and the start of the currently used section (cb.base)
+	 */
+	if (cbuf_data(&con->cb) >= con->cb.base) {
+		iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb);
+		iov[1].iov_len = con->cb.base;
+		iov[1].iov_base = page_address(con->rx_page);
+		nvec = 2;
+	}
+	len = iov[0].iov_len + iov[1].iov_len;
+
+	r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len,
+			       MSG_DONTWAIT | MSG_NOSIGNAL);
+	if (ret <= 0)
+		goto out_close;
+
+	/* Process SCTP notifications */
+	if (msg.msg_flags & MSG_NOTIFICATION) {
+		msg.msg_control = incmsg;
+		msg.msg_controllen = sizeof(incmsg);
+
+		process_sctp_notification(con, &msg,
+				page_address(con->rx_page) + con->cb.base);
+		mutex_unlock(&con->sock_mutex);
+		return 0;
+	}
+	BUG_ON(con->nodeid == 0);
+
+	if (ret == len)
+		call_again_soon = 1;
+	cbuf_add(&con->cb, ret);
+	ret = dlm_process_incoming_buffer(con->nodeid,
+					  page_address(con->rx_page),
+					  con->cb.base, con->cb.len,
+					  PAGE_CACHE_SIZE);
+	if (ret == -EBADMSG) {
+		log_print("lowcomms: addr=%p, base=%u, len=%u, "
+			  "iov_len=%u, iov_base[0]=%p, read=%d",
+			  page_address(con->rx_page), con->cb.base, con->cb.len,
+			  len, iov[0].iov_base, r);
+	}
+	if (ret < 0)
+		goto out_close;
+	cbuf_eat(&con->cb, ret);
+
+	if (cbuf_empty(&con->cb) && !call_again_soon) {
+		__free_page(con->rx_page);
+		con->rx_page = NULL;
+	}
+
+	if (call_again_soon)
+		goto out_resched;
+	mutex_unlock(&con->sock_mutex);
+	return 0;
+
+out_resched:
+	if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
+		queue_work(recv_workqueue, &con->rwork);
+	mutex_unlock(&con->sock_mutex);
+	return -EAGAIN;
+
+out_close:
+	mutex_unlock(&con->sock_mutex);
+	if (ret != -EAGAIN) {
+		close_connection(con, false);
+		/* Reconnect when there is something to send */
+	}
+	/* Don't return success if we really got EOF */
+	if (ret == 0)
+		ret = -EAGAIN;
+
+	return ret;
+}
+
+/* Listening socket is busy, accept a connection */
+static int tcp_accept_from_sock(struct connection *con)
+{
+	int result;
+	struct sockaddr_storage peeraddr;
+	struct socket *newsock;
+	int len;
+	int nodeid;
+	struct connection *newcon;
+	struct connection *addcon;
+
+	mutex_lock(&connections_lock);
+	if (!dlm_allow_conn) {
+		mutex_unlock(&connections_lock);
+		return -1;
+	}
+	mutex_unlock(&connections_lock);
+
+	memset(&peeraddr, 0, sizeof(peeraddr));
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
+				  IPPROTO_TCP, &newsock);
+	if (result < 0)
+		return -ENOMEM;
+
+	mutex_lock_nested(&con->sock_mutex, 0);
+
+	result = -ENOTCONN;
+	if (con->sock == NULL)
+		goto accept_err;
+
+	newsock->type = con->sock->type;
+	newsock->ops = con->sock->ops;
+
+	result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
+	if (result < 0)
+		goto accept_err;
+
+	/* Get the connected socket's peer */
+	memset(&peeraddr, 0, sizeof(peeraddr));
+	if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
+				  &len, 2)) {
+		result = -ECONNABORTED;
+		goto accept_err;
+	}
+
+	/* Get the new node's NODEID */
+	make_sockaddr(&peeraddr, 0, &len);
+	if (addr_to_nodeid(&peeraddr, &nodeid)) {
+		unsigned char *b=(unsigned char *)&peeraddr;
+		log_print("connect from non cluster node");
+		print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, 
+				     b, sizeof(struct sockaddr_storage));
+		sock_release(newsock);
+		mutex_unlock(&con->sock_mutex);
+		return -1;
+	}
+
+	log_print("got connection from %d", nodeid);
+
+	/*  Check to see if we already have a connection to this node. This
+	 *  could happen if the two nodes initiate a connection at roughly
+	 *  the same time and the connections cross on the wire.
+	 *  In this case we store the incoming one in "othercon"
+	 */
+	newcon = nodeid2con(nodeid, GFP_NOFS);
+	if (!newcon) {
+		result = -ENOMEM;
+		goto accept_err;
+	}
+	mutex_lock_nested(&newcon->sock_mutex, 1);
+	if (newcon->sock) {
+		struct connection *othercon = newcon->othercon;
+
+		if (!othercon) {
+			othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
+			if (!othercon) {
+				log_print("failed to allocate incoming socket");
+				mutex_unlock(&newcon->sock_mutex);
+				result = -ENOMEM;
+				goto accept_err;
+			}
+			othercon->nodeid = nodeid;
+			othercon->rx_action = receive_from_sock;
+			mutex_init(&othercon->sock_mutex);
+			INIT_WORK(&othercon->swork, process_send_sockets);
+			INIT_WORK(&othercon->rwork, process_recv_sockets);
+			set_bit(CF_IS_OTHERCON, &othercon->flags);
+		}
+		if (!othercon->sock) {
+			newcon->othercon = othercon;
+			othercon->sock = newsock;
+			newsock->sk->sk_user_data = othercon;
+			add_sock(newsock, othercon);
+			addcon = othercon;
+		}
+		else {
+			printk("Extra connection from node %d attempted\n", nodeid);
+			result = -EAGAIN;
+			mutex_unlock(&newcon->sock_mutex);
+			goto accept_err;
+		}
+	}
+	else {
+		newsock->sk->sk_user_data = newcon;
+		newcon->rx_action = receive_from_sock;
+		add_sock(newsock, newcon);
+		addcon = newcon;
+	}
+
+	mutex_unlock(&newcon->sock_mutex);
+
+	/*
+	 * Add it to the active queue in case we got data
+	 * between processing the accept adding the socket
+	 * to the read_sockets list
+	 */
+	if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
+		queue_work(recv_workqueue, &addcon->rwork);
+	mutex_unlock(&con->sock_mutex);
+
+	return 0;
+
+accept_err:
+	mutex_unlock(&con->sock_mutex);
+	sock_release(newsock);
+
+	if (result != -EAGAIN)
+		log_print("error accepting connection from node: %d", result);
+	return result;
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+	__free_page(e->page);
+	kfree(e);
+}
+
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+	e->offset += completed;
+	e->len -= completed;
+
+	if (e->len == 0 && e->users == 0) {
+		list_del(&e->list);
+		free_entry(e);
+	}
+}
+
+/* Initiate an SCTP association.
+   This is a special case of send_to_sock() in that we don't yet have a
+   peeled-off socket for this association, so we use the listening socket
+   and add the primary IP address of the remote node.
+ */
+static void sctp_init_assoc(struct connection *con)
+{
+	struct sockaddr_storage rem_addr;
+	char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	struct connection *base_con;
+	struct writequeue_entry *e;
+	int len, offset;
+	int ret;
+	int addrlen;
+	struct kvec iov[1];
+
+	mutex_lock(&con->sock_mutex);
+	if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
+		goto unlock;
+
+	if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
+			   con->try_new_addr)) {
+		log_print("no address for nodeid %d", con->nodeid);
+		goto unlock;
+	}
+	base_con = nodeid2con(0, 0);
+	BUG_ON(base_con == NULL);
+
+	make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
+
+	outmessage.msg_name = &rem_addr;
+	outmessage.msg_namelen = addrlen;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	spin_lock(&con->writequeue_lock);
+
+	if (list_empty(&con->writequeue)) {
+		spin_unlock(&con->writequeue_lock);
+		log_print("writequeue empty for nodeid %d", con->nodeid);
+		goto unlock;
+	}
+
+	e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
+	len = e->len;
+	offset = e->offset;
+
+	/* Send the first block off the write queue */
+	iov[0].iov_base = page_address(e->page)+offset;
+	iov[0].iov_len = len;
+	spin_unlock(&con->writequeue_lock);
+
+	if (rem_addr.ss_family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
+		log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
+	} else {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
+		log_print("Trying to connect to %pI6", &sin6->sin6_addr);
+	}
+
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	sinfo->sinfo_flags |= SCTP_ADDR_OVER;
+
+	ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
+	if (ret < 0) {
+		log_print("Send first packet to node %d failed: %d",
+			  con->nodeid, ret);
+
+		/* Try again later */
+		clear_bit(CF_CONNECT_PENDING, &con->flags);
+		clear_bit(CF_INIT_PENDING, &con->flags);
+	}
+	else {
+		spin_lock(&con->writequeue_lock);
+		writequeue_entry_complete(e, ret);
+		spin_unlock(&con->writequeue_lock);
+	}
+
+unlock:
+	mutex_unlock(&con->sock_mutex);
+}
+
+/* Connect a new socket to its peer */
+static void tcp_connect_to_sock(struct connection *con)
+{
+	struct sockaddr_storage saddr, src_addr;
+	int addr_len;
+	struct socket *sock = NULL;
+	int one = 1;
+	int result;
+
+	if (con->nodeid == 0) {
+		log_print("attempt to connect sock 0 foiled");
+		return;
+	}
+
+	mutex_lock(&con->sock_mutex);
+	if (con->retries++ > MAX_CONNECT_RETRIES)
+		goto out;
+
+	/* Some odd races can cause double-connects, ignore them */
+	if (con->sock)
+		goto out;
+
+	/* Create a socket to communicate with */
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
+				  IPPROTO_TCP, &sock);
+	if (result < 0)
+		goto out_err;
+
+	memset(&saddr, 0, sizeof(saddr));
+	result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
+	if (result < 0) {
+		log_print("no address for nodeid %d", con->nodeid);
+		goto out_err;
+	}
+
+	sock->sk->sk_user_data = con;
+	con->rx_action = receive_from_sock;
+	con->connect_action = tcp_connect_to_sock;
+	add_sock(sock, con);
+
+	/* Bind to our cluster-known address connecting to avoid
+	   routing problems */
+	memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+	make_sockaddr(&src_addr, 0, &addr_len);
+	result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
+				 addr_len);
+	if (result < 0) {
+		log_print("could not bind for connect: %d", result);
+		/* This *may* not indicate a critical error */
+	}
+
+	make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
+
+	log_print("connecting to %d", con->nodeid);
+
+	/* Turn off Nagle's algorithm */
+	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+			  sizeof(one));
+
+	result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
+				   O_NONBLOCK);
+	if (result == -EINPROGRESS)
+		result = 0;
+	if (result == 0)
+		goto out;
+
+out_err:
+	if (con->sock) {
+		sock_release(con->sock);
+		con->sock = NULL;
+	} else if (sock) {
+		sock_release(sock);
+	}
+	/*
+	 * Some errors are fatal and this list might need adjusting. For other
+	 * errors we try again until the max number of retries is reached.
+	 */
+	if (result != -EHOSTUNREACH &&
+	    result != -ENETUNREACH &&
+	    result != -ENETDOWN && 
+	    result != -EINVAL &&
+	    result != -EPROTONOSUPPORT) {
+		log_print("connect %d try %d error %d", con->nodeid,
+			  con->retries, result);
+		mutex_unlock(&con->sock_mutex);
+		msleep(1000);
+		lowcomms_connect_sock(con);
+		return;
+	}
+out:
+	mutex_unlock(&con->sock_mutex);
+	return;
+}
+
+static struct socket *tcp_create_listen_sock(struct connection *con,
+					     struct sockaddr_storage *saddr)
+{
+	struct socket *sock = NULL;
+	int result = 0;
+	int one = 1;
+	int addr_len;
+
+	if (dlm_local_addr[0]->ss_family == AF_INET)
+		addr_len = sizeof(struct sockaddr_in);
+	else
+		addr_len = sizeof(struct sockaddr_in6);
+
+	/* Create a socket to communicate with */
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
+				  IPPROTO_TCP, &sock);
+	if (result < 0) {
+		log_print("Can't create listening comms socket");
+		goto create_out;
+	}
+
+	/* Turn off Nagle's algorithm */
+	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+			  sizeof(one));
+
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+				   (char *)&one, sizeof(one));
+
+	if (result < 0) {
+		log_print("Failed to set SO_REUSEADDR on socket: %d", result);
+	}
+	con->rx_action = tcp_accept_from_sock;
+	con->connect_action = tcp_connect_to_sock;
+
+	/* Bind to our port */
+	make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
+	result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
+	if (result < 0) {
+		log_print("Can't bind to port %d", dlm_config.ci_tcp_port);
+		sock_release(sock);
+		sock = NULL;
+		con->sock = NULL;
+		goto create_out;
+	}
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+				 (char *)&one, sizeof(one));
+	if (result < 0) {
+		log_print("Set keepalive failed: %d", result);
+	}
+
+	result = sock->ops->listen(sock, 5);
+	if (result < 0) {
+		log_print("Can't listen on port %d", dlm_config.ci_tcp_port);
+		sock_release(sock);
+		sock = NULL;
+		goto create_out;
+	}
+
+create_out:
+	return sock;
+}
+
+/* Get local addresses */
+static void init_local(void)
+{
+	struct sockaddr_storage sas, *addr;
+	int i;
+
+	dlm_local_count = 0;
+	for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) {
+		if (dlm_our_addr(&sas, i))
+			break;
+
+		addr = kmalloc(sizeof(*addr), GFP_NOFS);
+		if (!addr)
+			break;
+		memcpy(addr, &sas, sizeof(*addr));
+		dlm_local_addr[dlm_local_count++] = addr;
+	}
+}
+
+/* Bind to an IP address. SCTP allows multiple address so it can do
+   multi-homing */
+static int add_sctp_bind_addr(struct connection *sctp_con,
+			      struct sockaddr_storage *addr,
+			      int addr_len, int num)
+{
+	int result = 0;
+
+	if (num == 1)
+		result = kernel_bind(sctp_con->sock,
+				     (struct sockaddr *) addr,
+				     addr_len);
+	else
+		result = kernel_setsockopt(sctp_con->sock, SOL_SCTP,
+					   SCTP_SOCKOPT_BINDX_ADD,
+					   (char *)addr, addr_len);
+
+	if (result < 0)
+		log_print("Can't bind to port %d addr number %d",
+			  dlm_config.ci_tcp_port, num);
+
+	return result;
+}
+
+/* Initialise SCTP socket and bind to all interfaces */
+static int sctp_listen_for_all(void)
+{
+	struct socket *sock = NULL;
+	struct sockaddr_storage localaddr;
+	struct sctp_event_subscribe subscribe;
+	int result = -EINVAL, num = 1, i, addr_len;
+	struct connection *con = nodeid2con(0, GFP_NOFS);
+	int bufsize = NEEDED_RMEM;
+	int one = 1;
+
+	if (!con)
+		return -ENOMEM;
+
+	log_print("Using SCTP for communications");
+
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
+				  IPPROTO_SCTP, &sock);
+	if (result < 0) {
+		log_print("Can't create comms socket, check SCTP is loaded");
+		goto out;
+	}
+
+	/* Listen for events */
+	memset(&subscribe, 0, sizeof(subscribe));
+	subscribe.sctp_data_io_event = 1;
+	subscribe.sctp_association_event = 1;
+	subscribe.sctp_send_failure_event = 1;
+	subscribe.sctp_shutdown_event = 1;
+	subscribe.sctp_partial_delivery_event = 1;
+
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE,
+				 (char *)&bufsize, sizeof(bufsize));
+	if (result)
+		log_print("Error increasing buffer space on socket %d", result);
+
+	result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+				   (char *)&subscribe, sizeof(subscribe));
+	if (result < 0) {
+		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+			  result);
+		goto create_delsock;
+	}
+
+	result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
+				   sizeof(one));
+	if (result < 0)
+		log_print("Could not set SCTP NODELAY error %d\n", result);
+
+	/* Init con struct */
+	sock->sk->sk_user_data = con;
+	con->sock = sock;
+	con->sock->sk->sk_data_ready = lowcomms_data_ready;
+	con->rx_action = receive_from_sock;
+	con->connect_action = sctp_init_assoc;
+
+	/* Bind to all interfaces. */
+	for (i = 0; i < dlm_local_count; i++) {
+		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+		make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
+
+		result = add_sctp_bind_addr(con, &localaddr, addr_len, num);
+		if (result)
+			goto create_delsock;
+		++num;
+	}
+
+	result = sock->ops->listen(sock, 5);
+	if (result < 0) {
+		log_print("Can't set socket listening");
+		goto create_delsock;
+	}
+
+	return 0;
+
+create_delsock:
+	sock_release(sock);
+	con->sock = NULL;
+out:
+	return result;
+}
+
+static int tcp_listen_for_all(void)
+{
+	struct socket *sock = NULL;
+	struct connection *con = nodeid2con(0, GFP_NOFS);
+	int result = -EINVAL;
+
+	if (!con)
+		return -ENOMEM;
+
+	/* We don't support multi-homed hosts */
+	if (dlm_local_addr[1] != NULL) {
+		log_print("TCP protocol can't handle multi-homed hosts, "
+			  "try SCTP");
+		return -EINVAL;
+	}
+
+	log_print("Using TCP for communications");
+
+	sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
+	if (sock) {
+		add_sock(sock, con);
+		result = 0;
+	}
+	else {
+		result = -EADDRINUSE;
+	}
+
+	return result;
+}
+
+
+
+static struct writequeue_entry *new_writequeue_entry(struct connection *con,
+						     gfp_t allocation)
+{
+	struct writequeue_entry *entry;
+
+	entry = kmalloc(sizeof(struct writequeue_entry), allocation);
+	if (!entry)
+		return NULL;
+
+	entry->page = alloc_page(allocation);
+	if (!entry->page) {
+		kfree(entry);
+		return NULL;
+	}
+
+	entry->offset = 0;
+	entry->len = 0;
+	entry->end = 0;
+	entry->users = 0;
+	entry->con = con;
+
+	return entry;
+}
+
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
+{
+	struct connection *con;
+	struct writequeue_entry *e;
+	int offset = 0;
+
+	con = nodeid2con(nodeid, allocation);
+	if (!con)
+		return NULL;
+
+	spin_lock(&con->writequeue_lock);
+	e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
+	if ((&e->list == &con->writequeue) ||
+	    (PAGE_CACHE_SIZE - e->end < len)) {
+		e = NULL;
+	} else {
+		offset = e->end;
+		e->end += len;
+		e->users++;
+	}
+	spin_unlock(&con->writequeue_lock);
+
+	if (e) {
+	got_one:
+		*ppc = page_address(e->page) + offset;
+		return e;
+	}
+
+	e = new_writequeue_entry(con, allocation);
+	if (e) {
+		spin_lock(&con->writequeue_lock);
+		offset = e->end;
+		e->end += len;
+		e->users++;
+		list_add_tail(&e->list, &con->writequeue);
+		spin_unlock(&con->writequeue_lock);
+		goto got_one;
+	}
+	return NULL;
+}
+
+void dlm_lowcomms_commit_buffer(void *mh)
+{
+	struct writequeue_entry *e = (struct writequeue_entry *)mh;
+	struct connection *con = e->con;
+	int users;
+
+	spin_lock(&con->writequeue_lock);
+	users = --e->users;
+	if (users)
+		goto out;
+	e->len = e->end - e->offset;
+	spin_unlock(&con->writequeue_lock);
+
+	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
+		queue_work(send_workqueue, &con->swork);
+	}
+	return;
+
+out:
+	spin_unlock(&con->writequeue_lock);
+	return;
+}
+
+/* Send a message */
+static void send_to_sock(struct connection *con)
+{
+	int ret = 0;
+	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+	struct writequeue_entry *e;
+	int len, offset;
+	int count = 0;
+
+	mutex_lock(&con->sock_mutex);
+	if (con->sock == NULL)
+		goto out_connect;
+
+	spin_lock(&con->writequeue_lock);
+	for (;;) {
+		e = list_entry(con->writequeue.next, struct writequeue_entry,
+			       list);
+		if ((struct list_head *) e == &con->writequeue)
+			break;
+
+		len = e->len;
+		offset = e->offset;
+		BUG_ON(len == 0 && e->users == 0);
+		spin_unlock(&con->writequeue_lock);
+
+		ret = 0;
+		if (len) {
+			ret = kernel_sendpage(con->sock, e->page, offset, len,
+					      msg_flags);
+			if (ret == -EAGAIN || ret == 0) {
+				if (ret == -EAGAIN &&
+				    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+				    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+					/* Notify TCP that we're limited by the
+					 * application window size.
+					 */
+					set_bit(SOCK_NOSPACE, &con->sock->flags);
+					con->sock->sk->sk_write_pending++;
+				}
+				cond_resched();
+				goto out;
+			} else if (ret < 0)
+				goto send_error;
+		}
+
+		/* Don't starve people filling buffers */
+		if (++count >= MAX_SEND_MSG_COUNT) {
+			cond_resched();
+			count = 0;
+		}
+
+		spin_lock(&con->writequeue_lock);
+		writequeue_entry_complete(e, ret);
+	}
+	spin_unlock(&con->writequeue_lock);
+out:
+	mutex_unlock(&con->sock_mutex);
+	return;
+
+send_error:
+	mutex_unlock(&con->sock_mutex);
+	close_connection(con, false);
+	lowcomms_connect_sock(con);
+	return;
+
+out_connect:
+	mutex_unlock(&con->sock_mutex);
+	if (!test_bit(CF_INIT_PENDING, &con->flags))
+		lowcomms_connect_sock(con);
+}
+
+static void clean_one_writequeue(struct connection *con)
+{
+	struct writequeue_entry *e, *safe;
+
+	spin_lock(&con->writequeue_lock);
+	list_for_each_entry_safe(e, safe, &con->writequeue, list) {
+		list_del(&e->list);
+		free_entry(e);
+	}
+	spin_unlock(&con->writequeue_lock);
+}
+
+/* Called from recovery when it knows that a node has
+   left the cluster */
+int dlm_lowcomms_close(int nodeid)
+{
+	struct connection *con;
+	struct dlm_node_addr *na;
+
+	log_print("closing connection to node %d", nodeid);
+	con = nodeid2con(nodeid, 0);
+	if (con) {
+		clear_bit(CF_CONNECT_PENDING, &con->flags);
+		clear_bit(CF_WRITE_PENDING, &con->flags);
+		set_bit(CF_CLOSE, &con->flags);
+		if (cancel_work_sync(&con->swork))
+			log_print("canceled swork for node %d", nodeid);
+		if (cancel_work_sync(&con->rwork))
+			log_print("canceled rwork for node %d", nodeid);
+		clean_one_writequeue(con);
+		close_connection(con, true);
+	}
+
+	spin_lock(&dlm_node_addrs_spin);
+	na = find_node_addr(nodeid);
+	if (na) {
+		list_del(&na->list);
+		while (na->addr_count--)
+			kfree(na->addr[na->addr_count]);
+		kfree(na);
+	}
+	spin_unlock(&dlm_node_addrs_spin);
+
+	return 0;
+}
+
+/* Receive workqueue function */
+static void process_recv_sockets(struct work_struct *work)
+{
+	struct connection *con = container_of(work, struct connection, rwork);
+	int err;
+
+	clear_bit(CF_READ_PENDING, &con->flags);
+	do {
+		err = con->rx_action(con);
+	} while (!err);
+}
+
+/* Send workqueue function */
+static void process_send_sockets(struct work_struct *work)
+{
+	struct connection *con = container_of(work, struct connection, swork);
+
+	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+		con->connect_action(con);
+		set_bit(CF_WRITE_PENDING, &con->flags);
+	}
+	if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
+		send_to_sock(con);
+}
+
+
+/* Discard all entries on the write queues */
+static void clean_writequeues(void)
+{
+	foreach_conn(clean_one_writequeue);
+}
+
+static void work_stop(void)
+{
+	destroy_workqueue(recv_workqueue);
+	destroy_workqueue(send_workqueue);
+}
+
+static int work_start(void)
+{
+	recv_workqueue = alloc_workqueue("dlm_recv",
+					 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+	if (!recv_workqueue) {
+		log_print("can't start dlm_recv");
+		return -ENOMEM;
+	}
+
+	send_workqueue = alloc_workqueue("dlm_send",
+					 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+	if (!send_workqueue) {
+		log_print("can't start dlm_send");
+		destroy_workqueue(recv_workqueue);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void stop_conn(struct connection *con)
+{
+	con->flags |= 0x0F;
+	if (con->sock && con->sock->sk)
+		con->sock->sk->sk_user_data = NULL;
+}
+
+static void free_conn(struct connection *con)
+{
+	close_connection(con, true);
+	if (con->othercon)
+		kmem_cache_free(con_cache, con->othercon);
+	hlist_del(&con->list);
+	kmem_cache_free(con_cache, con);
+}
+
+void dlm_lowcomms_stop(void)
+{
+	/* Set all the flags to prevent any
+	   socket activity.
+	*/
+	mutex_lock(&connections_lock);
+	dlm_allow_conn = 0;
+	foreach_conn(stop_conn);
+	mutex_unlock(&connections_lock);
+
+	work_stop();
+
+	mutex_lock(&connections_lock);
+	clean_writequeues();
+
+	foreach_conn(free_conn);
+
+	mutex_unlock(&connections_lock);
+	kmem_cache_destroy(con_cache);
+}
+
+int dlm_lowcomms_start(void)
+{
+	int error = -EINVAL;
+	struct connection *con;
+	int i;
+
+	for (i = 0; i < CONN_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&connection_hash[i]);
+
+	init_local();
+	if (!dlm_local_count) {
+		error = -ENOTCONN;
+		log_print("no local IP address has been set");
+		goto fail;
+	}
+
+	error = -ENOMEM;
+	con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
+				      __alignof__(struct connection), 0,
+				      NULL);
+	if (!con_cache)
+		goto fail;
+
+	error = work_start();
+	if (error)
+		goto fail_destroy;
+
+	dlm_allow_conn = 1;
+
+	/* Start listening */
+	if (dlm_config.ci_protocol == 0)
+		error = tcp_listen_for_all();
+	else
+		error = sctp_listen_for_all();
+	if (error)
+		goto fail_unlisten;
+
+	return 0;
+
+fail_unlisten:
+	dlm_allow_conn = 0;
+	con = nodeid2con(0,0);
+	if (con) {
+		close_connection(con, false);
+		kmem_cache_free(con_cache, con);
+	}
+fail_destroy:
+	kmem_cache_destroy(con_cache);
+fail:
+	return error;
+}
+
+void dlm_lowcomms_exit(void)
+{
+	struct dlm_node_addr *na, *safe;
+
+	spin_lock(&dlm_node_addrs_spin);
+	list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) {
+		list_del(&na->list);
+		while (na->addr_count--)
+			kfree(na->addr[na->addr_count]);
+		kfree(na);
+	}
+	spin_unlock(&dlm_node_addrs_spin);
+}
diff --git a/kernel/fs/dlm/lowcomms.h b/kernel/fs/dlm/lowcomms.h
new file mode 100644
index 000000000..67462e54f
--- /dev/null
+++ b/kernel/fs/dlm/lowcomms.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOWCOMMS_DOT_H__
+#define __LOWCOMMS_DOT_H__
+
+int dlm_lowcomms_start(void);
+void dlm_lowcomms_stop(void);
+void dlm_lowcomms_exit(void);
+int dlm_lowcomms_close(int nodeid);
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
+void dlm_lowcomms_commit_buffer(void *mh);
+int dlm_lowcomms_connect_node(int nodeid);
+int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
+
+#endif				/* __LOWCOMMS_DOT_H__ */
+
diff --git a/kernel/fs/dlm/lvb_table.h b/kernel/fs/dlm/lvb_table.h
new file mode 100644
index 000000000..cc3e92f3f
--- /dev/null
+++ b/kernel/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LVB_TABLE_DOT_H__
+#define __LVB_TABLE_DOT_H__
+
+extern const int dlm_lvb_operations[8][8];
+
+#endif
diff --git a/kernel/fs/dlm/main.c b/kernel/fs/dlm/main.c
new file mode 100644
index 000000000..079c0bd71
--- /dev/null
+++ b/kernel/fs/dlm/main.c
@@ -0,0 +1,97 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "user.h"
+#include "memory.h"
+#include "config.h"
+#include "lowcomms.h"
+
+static int __init init_dlm(void)
+{
+	int error;
+
+	error = dlm_memory_init();
+	if (error)
+		goto out;
+
+	error = dlm_lockspace_init();
+	if (error)
+		goto out_mem;
+
+	error = dlm_config_init();
+	if (error)
+		goto out_lockspace;
+
+	error = dlm_register_debugfs();
+	if (error)
+		goto out_config;
+
+	error = dlm_user_init();
+	if (error)
+		goto out_debug;
+
+	error = dlm_netlink_init();
+	if (error)
+		goto out_user;
+
+	error = dlm_plock_init();
+	if (error)
+		goto out_netlink;
+
+	printk("DLM installed\n");
+
+	return 0;
+
+ out_netlink:
+	dlm_netlink_exit();
+ out_user:
+	dlm_user_exit();
+ out_debug:
+	dlm_unregister_debugfs();
+ out_config:
+	dlm_config_exit();
+ out_lockspace:
+	dlm_lockspace_exit();
+ out_mem:
+	dlm_memory_exit();
+ out:
+	return error;
+}
+
+static void __exit exit_dlm(void)
+{
+	dlm_plock_exit();
+	dlm_netlink_exit();
+	dlm_user_exit();
+	dlm_config_exit();
+	dlm_memory_exit();
+	dlm_lockspace_exit();
+	dlm_lowcomms_exit();
+	dlm_unregister_debugfs();
+}
+
+module_init(init_dlm);
+module_exit(exit_dlm);
+
+MODULE_DESCRIPTION("Distributed Lock Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL_GPL(dlm_new_lockspace);
+EXPORT_SYMBOL_GPL(dlm_release_lockspace);
+EXPORT_SYMBOL_GPL(dlm_lock);
+EXPORT_SYMBOL_GPL(dlm_unlock);
+
diff --git a/kernel/fs/dlm/member.c b/kernel/fs/dlm/member.c
new file mode 100644
index 000000000..9c47f1c14
--- /dev/null
+++ b/kernel/fs/dlm/member.c
@@ -0,0 +1,722 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2011 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "recover.h"
+#include "rcom.h"
+#include "config.h"
+#include "lowcomms.h"
+
+int dlm_slots_version(struct dlm_header *h)
+{
+	if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS)
+		return 0;
+	return 1;
+}
+
+void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
+		   struct dlm_member *memb)
+{
+	struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
+
+	if (!dlm_slots_version(&rc->rc_header))
+		return;
+
+	memb->slot = le16_to_cpu(rf->rf_our_slot);
+	memb->generation = le32_to_cpu(rf->rf_generation);
+}
+
+void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct dlm_slot *slot;
+	struct rcom_slot *ro;
+	int i;
+
+	ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
+
+	/* ls_slots array is sparse, but not rcom_slots */
+
+	for (i = 0; i < ls->ls_slots_size; i++) {
+		slot = &ls->ls_slots[i];
+		if (!slot->nodeid)
+			continue;
+		ro->ro_nodeid = cpu_to_le32(slot->nodeid);
+		ro->ro_slot = cpu_to_le16(slot->slot);
+		ro++;
+	}
+}
+
+#define SLOT_DEBUG_LINE 128
+
+static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
+		      struct rcom_slot *ro0, struct dlm_slot *array,
+		      int array_size)
+{
+	char line[SLOT_DEBUG_LINE];
+	int len = SLOT_DEBUG_LINE - 1;
+	int pos = 0;
+	int ret, i;
+
+	memset(line, 0, sizeof(line));
+
+	if (array) {
+		for (i = 0; i < array_size; i++) {
+			if (!array[i].nodeid)
+				continue;
+
+			ret = snprintf(line + pos, len - pos, " %d:%d",
+				       array[i].slot, array[i].nodeid);
+			if (ret >= len - pos)
+				break;
+			pos += ret;
+		}
+	} else if (ro0) {
+		for (i = 0; i < num_slots; i++) {
+			ret = snprintf(line + pos, len - pos, " %d:%d",
+				       ro0[i].ro_slot, ro0[i].ro_nodeid);
+			if (ret >= len - pos)
+				break;
+			pos += ret;
+		}
+	}
+
+	log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line);
+}
+
+int dlm_slots_copy_in(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_rcom *rc = ls->ls_recover_buf;
+	struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
+	struct rcom_slot *ro0, *ro;
+	int our_nodeid = dlm_our_nodeid();
+	int i, num_slots;
+	uint32_t gen;
+
+	if (!dlm_slots_version(&rc->rc_header))
+		return -1;
+
+	gen = le32_to_cpu(rf->rf_generation);
+	if (gen <= ls->ls_generation) {
+		log_error(ls, "dlm_slots_copy_in gen %u old %u",
+			  gen, ls->ls_generation);
+	}
+	ls->ls_generation = gen;
+
+	num_slots = le16_to_cpu(rf->rf_num_slots);
+	if (!num_slots)
+		return -1;
+
+	ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
+
+	for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
+		ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid);
+		ro->ro_slot = le16_to_cpu(ro->ro_slot);
+	}
+
+	log_slots(ls, gen, num_slots, ro0, NULL, 0);
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
+			if (ro->ro_nodeid != memb->nodeid)
+				continue;
+			memb->slot = ro->ro_slot;
+			memb->slot_prev = memb->slot;
+			break;
+		}
+
+		if (memb->nodeid == our_nodeid) {
+			if (ls->ls_slot && ls->ls_slot != memb->slot) {
+				log_error(ls, "dlm_slots_copy_in our slot "
+					  "changed %d %d", ls->ls_slot,
+					  memb->slot);
+				return -1;
+			}
+
+			if (!ls->ls_slot)
+				ls->ls_slot = memb->slot;
+		}
+
+		if (!memb->slot) {
+			log_error(ls, "dlm_slots_copy_in nodeid %d no slot",
+				   memb->nodeid);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* for any nodes that do not support slots, we will not have set memb->slot
+   in wait_status_all(), so memb->slot will remain -1, and we will not
+   assign slots or set ls_num_slots here */
+
+int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
+		     struct dlm_slot **slots_out, uint32_t *gen_out)
+{
+	struct dlm_member *memb;
+	struct dlm_slot *array;
+	int our_nodeid = dlm_our_nodeid();
+	int array_size, max_slots, i;
+	int need = 0;
+	int max = 0;
+	int num = 0;
+	uint32_t gen = 0;
+
+	/* our own memb struct will have slot -1 gen 0 */
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->nodeid == our_nodeid) {
+			memb->slot = ls->ls_slot;
+			memb->generation = ls->ls_generation;
+			break;
+		}
+	}
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->generation > gen)
+			gen = memb->generation;
+
+		/* node doesn't support slots */
+
+		if (memb->slot == -1)
+			return -1;
+
+		/* node needs a slot assigned */
+
+		if (!memb->slot)
+			need++;
+
+		/* node has a slot assigned */
+
+		num++;
+
+		if (!max || max < memb->slot)
+			max = memb->slot;
+
+		/* sanity check, once slot is assigned it shouldn't change */
+
+		if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) {
+			log_error(ls, "nodeid %d slot changed %d %d",
+				  memb->nodeid, memb->slot_prev, memb->slot);
+			return -1;
+		}
+		memb->slot_prev = memb->slot;
+	}
+
+	array_size = max + need;
+
+	array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS);
+	if (!array)
+		return -ENOMEM;
+
+	num = 0;
+
+	/* fill in slots (offsets) that are used */
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (!memb->slot)
+			continue;
+
+		if (memb->slot > array_size) {
+			log_error(ls, "invalid slot number %d", memb->slot);
+			kfree(array);
+			return -1;
+		}
+
+		array[memb->slot - 1].nodeid = memb->nodeid;
+		array[memb->slot - 1].slot = memb->slot;
+		num++;
+	}
+
+	/* assign new slots from unused offsets */
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->slot)
+			continue;
+
+		for (i = 0; i < array_size; i++) {
+			if (array[i].nodeid)
+				continue;
+
+			memb->slot = i + 1;
+			memb->slot_prev = memb->slot;
+			array[i].nodeid = memb->nodeid;
+			array[i].slot = memb->slot;
+			num++;
+
+			if (!ls->ls_slot && memb->nodeid == our_nodeid)
+				ls->ls_slot = memb->slot;
+			break;
+		}
+
+		if (!memb->slot) {
+			log_error(ls, "no free slot found");
+			kfree(array);
+			return -1;
+		}
+	}
+
+	gen++;
+
+	log_slots(ls, gen, num, NULL, array, array_size);
+
+	max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
+		     sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
+
+	if (num > max_slots) {
+		log_error(ls, "num_slots %d exceeds max_slots %d",
+			  num, max_slots);
+		kfree(array);
+		return -1;
+	}
+
+	*gen_out = gen;
+	*slots_out = array;
+	*slots_size = array_size;
+	*num_slots = num;
+	return 0;
+}
+
+static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
+{
+	struct dlm_member *memb = NULL;
+	struct list_head *tmp;
+	struct list_head *newlist = &new->list;
+	struct list_head *head = &ls->ls_nodes;
+
+	list_for_each(tmp, head) {
+		memb = list_entry(tmp, struct dlm_member, list);
+		if (new->nodeid < memb->nodeid)
+			break;
+	}
+
+	if (!memb)
+		list_add_tail(newlist, head);
+	else {
+		/* FIXME: can use list macro here */
+		newlist->prev = tmp->prev;
+		newlist->next = tmp;
+		tmp->prev->next = newlist;
+		tmp->prev = newlist;
+	}
+}
+
+static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
+{
+	struct dlm_member *memb;
+	int error;
+
+	memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
+	if (!memb)
+		return -ENOMEM;
+
+	error = dlm_lowcomms_connect_node(node->nodeid);
+	if (error < 0) {
+		kfree(memb);
+		return error;
+	}
+
+	memb->nodeid = node->nodeid;
+	memb->weight = node->weight;
+	memb->comm_seq = node->comm_seq;
+	add_ordered_member(ls, memb);
+	ls->ls_num_nodes++;
+	return 0;
+}
+
+static struct dlm_member *find_memb(struct list_head *head, int nodeid)
+{
+	struct dlm_member *memb;
+
+	list_for_each_entry(memb, head, list) {
+		if (memb->nodeid == nodeid)
+			return memb;
+	}
+	return NULL;
+}
+
+int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+	if (find_memb(&ls->ls_nodes, nodeid))
+		return 1;
+	return 0;
+}
+
+int dlm_is_removed(struct dlm_ls *ls, int nodeid)
+{
+	if (find_memb(&ls->ls_nodes_gone, nodeid))
+		return 1;
+	return 0;
+}
+
+static void clear_memb_list(struct list_head *head)
+{
+	struct dlm_member *memb;
+
+	while (!list_empty(head)) {
+		memb = list_entry(head->next, struct dlm_member, list);
+		list_del(&memb->list);
+		kfree(memb);
+	}
+}
+
+void dlm_clear_members(struct dlm_ls *ls)
+{
+	clear_memb_list(&ls->ls_nodes);
+	ls->ls_num_nodes = 0;
+}
+
+void dlm_clear_members_gone(struct dlm_ls *ls)
+{
+	clear_memb_list(&ls->ls_nodes_gone);
+}
+
+static void make_member_array(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	int i, w, x = 0, total = 0, all_zero = 0, *array;
+
+	kfree(ls->ls_node_array);
+	ls->ls_node_array = NULL;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->weight)
+			total += memb->weight;
+	}
+
+	/* all nodes revert to weight of 1 if all have weight 0 */
+
+	if (!total) {
+		total = ls->ls_num_nodes;
+		all_zero = 1;
+	}
+
+	ls->ls_total_weight = total;
+
+	array = kmalloc(sizeof(int) * total, GFP_NOFS);
+	if (!array)
+		return;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (!all_zero && !memb->weight)
+			continue;
+
+		if (all_zero)
+			w = 1;
+		else
+			w = memb->weight;
+
+		DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
+
+		for (i = 0; i < w; i++)
+			array[x++] = memb->nodeid;
+	}
+
+	ls->ls_node_array = array;
+}
+
+/* send a status request to all members just to establish comms connections */
+
+static int ping_members(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	int error = 0;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		error = dlm_recovery_stopped(ls);
+		if (error)
+			break;
+		error = dlm_rcom_status(ls, memb->nodeid, 0);
+		if (error)
+			break;
+	}
+	if (error)
+		log_rinfo(ls, "ping_members aborted %d last nodeid %d",
+			  error, ls->ls_recover_nodeid);
+	return error;
+}
+
+static void dlm_lsop_recover_prep(struct dlm_ls *ls)
+{
+	if (!ls->ls_ops || !ls->ls_ops->recover_prep)
+		return;
+	ls->ls_ops->recover_prep(ls->ls_ops_arg);
+}
+
+static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
+{
+	struct dlm_slot slot;
+	uint32_t seq;
+	int error;
+
+	if (!ls->ls_ops || !ls->ls_ops->recover_slot)
+		return;
+
+	/* if there is no comms connection with this node
+	   or the present comms connection is newer
+	   than the one when this member was added, then
+	   we consider the node to have failed (versus
+	   being removed due to dlm_release_lockspace) */
+
+	error = dlm_comm_seq(memb->nodeid, &seq);
+
+	if (!error && seq == memb->comm_seq)
+		return;
+
+	slot.nodeid = memb->nodeid;
+	slot.slot = memb->slot;
+
+	ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot);
+}
+
+void dlm_lsop_recover_done(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_slot *slots;
+	int i, num;
+
+	if (!ls->ls_ops || !ls->ls_ops->recover_done)
+		return;
+
+	num = ls->ls_num_nodes;
+
+	slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL);
+	if (!slots)
+		return;
+
+	i = 0;
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (i == num) {
+			log_error(ls, "dlm_lsop_recover_done bad num %d", num);
+			goto out;
+		}
+		slots[i].nodeid = memb->nodeid;
+		slots[i].slot = memb->slot;
+		i++;
+	}
+
+	ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num,
+				 ls->ls_slot, ls->ls_generation);
+ out:
+	kfree(slots);
+}
+
+static struct dlm_config_node *find_config_node(struct dlm_recover *rv,
+						int nodeid)
+{
+	int i;
+
+	for (i = 0; i < rv->nodes_count; i++) {
+		if (rv->nodes[i].nodeid == nodeid)
+			return &rv->nodes[i];
+	}
+	return NULL;
+}
+
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
+{
+	struct dlm_member *memb, *safe;
+	struct dlm_config_node *node;
+	int i, error, neg = 0, low = -1;
+
+	/* previously removed members that we've not finished removing need to
+	   count as a negative change so the "neg" recovery steps will happen */
+
+	list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+		log_rinfo(ls, "prev removed member %d", memb->nodeid);
+		neg++;
+	}
+
+	/* move departed members from ls_nodes to ls_nodes_gone */
+
+	list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
+		node = find_config_node(rv, memb->nodeid);
+		if (node && !node->new)
+			continue;
+
+		if (!node) {
+			log_rinfo(ls, "remove member %d", memb->nodeid);
+		} else {
+			/* removed and re-added */
+			log_rinfo(ls, "remove member %d comm_seq %u %u",
+				  memb->nodeid, memb->comm_seq, node->comm_seq);
+		}
+
+		neg++;
+		list_move(&memb->list, &ls->ls_nodes_gone);
+		ls->ls_num_nodes--;
+		dlm_lsop_recover_slot(ls, memb);
+	}
+
+	/* add new members to ls_nodes */
+
+	for (i = 0; i < rv->nodes_count; i++) {
+		node = &rv->nodes[i];
+		if (dlm_is_member(ls, node->nodeid))
+			continue;
+		dlm_add_member(ls, node);
+		log_rinfo(ls, "add member %d", node->nodeid);
+	}
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (low == -1 || memb->nodeid < low)
+			low = memb->nodeid;
+	}
+	ls->ls_low_nodeid = low;
+
+	make_member_array(ls);
+	*neg_out = neg;
+
+	error = ping_members(ls);
+	if (!error || error == -EPROTO) {
+		/* new_lockspace() may be waiting to know if the config
+		   is good or bad */
+		ls->ls_members_result = error;
+		complete(&ls->ls_members_done);
+	}
+
+	log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
+	return error;
+}
+
+/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before
+   dlm_ls_start() is called on any of them to start the new recovery. */
+
+int dlm_ls_stop(struct dlm_ls *ls)
+{
+	int new;
+
+	/*
+	 * Prevent dlm_recv from being in the middle of something when we do
+	 * the stop.  This includes ensuring dlm_recv isn't processing a
+	 * recovery message (rcom), while dlm_recoverd is aborting and
+	 * resetting things from an in-progress recovery.  i.e. we want
+	 * dlm_recoverd to abort its recovery without worrying about dlm_recv
+	 * processing an rcom at the same time.  Stopping dlm_recv also makes
+	 * it easy for dlm_receive_message() to check locking stopped and add a
+	 * message to the requestqueue without races.
+	 */
+
+	down_write(&ls->ls_recv_active);
+
+	/*
+	 * Abort any recovery that's in progress (see RECOVER_STOP,
+	 * dlm_recovery_stopped()) and tell any other threads running in the
+	 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
+	 */
+
+	spin_lock(&ls->ls_recover_lock);
+	set_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
+	new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
+	ls->ls_recover_seq++;
+	spin_unlock(&ls->ls_recover_lock);
+
+	/*
+	 * Let dlm_recv run again, now any normal messages will be saved on the
+	 * requestqueue for later.
+	 */
+
+	up_write(&ls->ls_recv_active);
+
+	/*
+	 * This in_recovery lock does two things:
+	 * 1) Keeps this function from returning until all threads are out
+	 *    of locking routines and locking is truly stopped.
+	 * 2) Keeps any new requests from being processed until it's unlocked
+	 *    when recovery is complete.
+	 */
+
+	if (new) {
+		set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags);
+		wake_up_process(ls->ls_recoverd_task);
+		wait_event(ls->ls_recover_lock_wait,
+			   test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
+	}
+
+	/*
+	 * The recoverd suspend/resume makes sure that dlm_recoverd (if
+	 * running) has noticed RECOVER_STOP above and quit processing the
+	 * previous recovery.
+	 */
+
+	dlm_recoverd_suspend(ls);
+
+	spin_lock(&ls->ls_recover_lock);
+	kfree(ls->ls_slots);
+	ls->ls_slots = NULL;
+	ls->ls_num_slots = 0;
+	ls->ls_slots_size = 0;
+	ls->ls_recover_status = 0;
+	spin_unlock(&ls->ls_recover_lock);
+
+	dlm_recoverd_resume(ls);
+
+	if (!ls->ls_recover_begin)
+		ls->ls_recover_begin = jiffies;
+
+	dlm_lsop_recover_prep(ls);
+	return 0;
+}
+
+int dlm_ls_start(struct dlm_ls *ls)
+{
+	struct dlm_recover *rv = NULL, *rv_old;
+	struct dlm_config_node *nodes;
+	int error, count;
+
+	rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
+	if (!rv)
+		return -ENOMEM;
+
+	error = dlm_config_nodes(ls->ls_name, &nodes, &count);
+	if (error < 0)
+		goto fail;
+
+	spin_lock(&ls->ls_recover_lock);
+
+	/* the lockspace needs to be stopped before it can be started */
+
+	if (!dlm_locking_stopped(ls)) {
+		spin_unlock(&ls->ls_recover_lock);
+		log_error(ls, "start ignored: lockspace running");
+		error = -EINVAL;
+		goto fail;
+	}
+
+	rv->nodes = nodes;
+	rv->nodes_count = count;
+	rv->seq = ++ls->ls_recover_seq;
+	rv_old = ls->ls_recover_args;
+	ls->ls_recover_args = rv;
+	spin_unlock(&ls->ls_recover_lock);
+
+	if (rv_old) {
+		log_error(ls, "unused recovery %llx %d",
+			  (unsigned long long)rv_old->seq, rv_old->nodes_count);
+		kfree(rv_old->nodes);
+		kfree(rv_old);
+	}
+
+	set_bit(LSFL_RECOVER_WORK, &ls->ls_flags);
+	wake_up_process(ls->ls_recoverd_task);
+	return 0;
+
+ fail:
+	kfree(rv);
+	kfree(nodes);
+	return error;
+}
+
diff --git a/kernel/fs/dlm/member.h b/kernel/fs/dlm/member.h
new file mode 100644
index 000000000..3deb70661
--- /dev/null
+++ b/kernel/fs/dlm/member.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2011 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMBER_DOT_H__
+#define __MEMBER_DOT_H__
+
+int dlm_ls_stop(struct dlm_ls *ls);
+int dlm_ls_start(struct dlm_ls *ls);
+void dlm_clear_members(struct dlm_ls *ls);
+void dlm_clear_members_gone(struct dlm_ls *ls);
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
+int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+int dlm_is_member(struct dlm_ls *ls, int nodeid);
+int dlm_slots_version(struct dlm_header *h);
+void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
+		   struct dlm_member *memb);
+void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_slots_copy_in(struct dlm_ls *ls);
+int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
+		     struct dlm_slot **slots_out, uint32_t *gen_out);
+void dlm_lsop_recover_done(struct dlm_ls *ls);
+
+#endif                          /* __MEMBER_DOT_H__ */
+
diff --git a/kernel/fs/dlm/memory.c b/kernel/fs/dlm/memory.c
new file mode 100644
index 000000000..7cd24bccd
--- /dev/null
+++ b/kernel/fs/dlm/memory.c
@@ -0,0 +1,96 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "config.h"
+#include "memory.h"
+
+static struct kmem_cache *lkb_cache;
+static struct kmem_cache *rsb_cache;
+
+
+int __init dlm_memory_init(void)
+{
+	lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
+				__alignof__(struct dlm_lkb), 0, NULL);
+	if (!lkb_cache)
+		return -ENOMEM;
+
+	rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb),
+				__alignof__(struct dlm_rsb), 0, NULL);
+	if (!rsb_cache) {
+		kmem_cache_destroy(lkb_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void dlm_memory_exit(void)
+{
+	if (lkb_cache)
+		kmem_cache_destroy(lkb_cache);
+	if (rsb_cache)
+		kmem_cache_destroy(rsb_cache);
+}
+
+char *dlm_allocate_lvb(struct dlm_ls *ls)
+{
+	char *p;
+
+	p = kzalloc(ls->ls_lvblen, GFP_NOFS);
+	return p;
+}
+
+void dlm_free_lvb(char *p)
+{
+	kfree(p);
+}
+
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+
+	r = kmem_cache_zalloc(rsb_cache, GFP_NOFS);
+	return r;
+}
+
+void dlm_free_rsb(struct dlm_rsb *r)
+{
+	if (r->res_lvbptr)
+		dlm_free_lvb(r->res_lvbptr);
+	kmem_cache_free(rsb_cache, r);
+}
+
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+
+	lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
+	return lkb;
+}
+
+void dlm_free_lkb(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		struct dlm_user_args *ua;
+		ua = lkb->lkb_ua;
+		if (ua) {
+			if (ua->lksb.sb_lvbptr)
+				kfree(ua->lksb.sb_lvbptr);
+			kfree(ua);
+		}
+	}
+	kmem_cache_free(lkb_cache, lkb);
+}
+
diff --git a/kernel/fs/dlm/memory.h b/kernel/fs/dlm/memory.h
new file mode 100644
index 000000000..177c11cbb
--- /dev/null
+++ b/kernel/fs/dlm/memory.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMORY_DOT_H__
+#define __MEMORY_DOT_H__
+
+int dlm_memory_init(void);
+void dlm_memory_exit(void);
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls);
+void dlm_free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
+void dlm_free_lkb(struct dlm_lkb *l);
+char *dlm_allocate_lvb(struct dlm_ls *ls);
+void dlm_free_lvb(char *l);
+
+#endif		/* __MEMORY_DOT_H__ */
+
diff --git a/kernel/fs/dlm/midcomms.c b/kernel/fs/dlm/midcomms.c
new file mode 100644
index 000000000..f3396c622
--- /dev/null
+++ b/kernel/fs/dlm/midcomms.c
@@ -0,0 +1,137 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * midcomms.c
+ *
+ * This is the appallingly named "mid-level" comms layer.
+ *
+ * Its purpose is to take packets from the "real" comms layer,
+ * split them up into packets and pass them to the interested
+ * part of the locking mechanism.
+ *
+ * It also takes messages from the locking layer, formats them
+ * into packets and sends them to the comms layer.
+ */
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "lock.h"
+#include "midcomms.h"
+
+
+static void copy_from_cb(void *dst, const void *base, unsigned offset,
+			 unsigned len, unsigned limit)
+{
+	unsigned copy = len;
+
+	if ((copy + offset) > limit)
+		copy = limit - offset;
+	memcpy(dst, base + offset, copy);
+	len -= copy;
+	if (len)
+		memcpy(dst + copy, base, len);
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ *
+ * Only complete messages are processed here, any "spare" bytes from
+ * the end of a buffer are saved and tacked onto the front of the next
+ * message that comes in. I doubt this will happen very often but we
+ * need to be able to cope with it and I don't want the task to be waiting
+ * for packets to come in when there is useful work to be done.
+ */
+
+int dlm_process_incoming_buffer(int nodeid, const void *base,
+				unsigned offset, unsigned len, unsigned limit)
+{
+	union {
+		unsigned char __buf[DLM_INBUF_LEN];
+		/* this is to force proper alignment on some arches */
+		union dlm_packet p;
+	} __tmp;
+	union dlm_packet *p = &__tmp.p;
+	int ret = 0;
+	int err = 0;
+	uint16_t msglen;
+	uint32_t lockspace;
+
+	while (len > sizeof(struct dlm_header)) {
+
+		/* Copy just the header to check the total length.  The
+		   message may wrap around the end of the buffer back to the
+		   start, so we need to use a temp buffer and copy_from_cb. */
+
+		copy_from_cb(p, base, offset, sizeof(struct dlm_header),
+			     limit);
+
+		msglen = le16_to_cpu(p->header.h_length);
+		lockspace = p->header.h_lockspace;
+
+		err = -EINVAL;
+		if (msglen < sizeof(struct dlm_header))
+			break;
+		if (p->header.h_cmd == DLM_MSG) {
+			if (msglen < sizeof(struct dlm_message))
+				break;
+		} else {
+			if (msglen < sizeof(struct dlm_rcom))
+				break;
+		}
+		err = -E2BIG;
+		if (msglen > dlm_config.ci_buffer_size) {
+			log_print("message size %d from %d too big, buf len %d",
+				  msglen, nodeid, len);
+			break;
+		}
+		err = 0;
+
+		/* If only part of the full message is contained in this
+		   buffer, then do nothing and wait for lowcomms to call
+		   us again later with more data.  We return 0 meaning
+		   we've consumed none of the input buffer. */
+
+		if (msglen > len)
+			break;
+
+		/* Allocate a larger temp buffer if the full message won't fit
+		   in the buffer on the stack (which should work for most
+		   ordinary messages). */
+
+		if (msglen > sizeof(__tmp) && p == &__tmp.p) {
+			p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
+			if (p == NULL)
+				return ret;
+		}
+
+		copy_from_cb(p, base, offset, msglen, limit);
+
+		BUG_ON(lockspace != p->header.h_lockspace);
+
+		ret += msglen;
+		offset += msglen;
+		offset &= (limit - 1);
+		len -= msglen;
+
+		dlm_receive_buffer(p, nodeid);
+	}
+
+	if (p != &__tmp.p)
+		kfree(p);
+
+	return err ? err : ret;
+}
+
diff --git a/kernel/fs/dlm/midcomms.h b/kernel/fs/dlm/midcomms.h
new file mode 100644
index 000000000..95852a5f1
--- /dev/null
+++ b/kernel/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MIDCOMMS_DOT_H__
+#define __MIDCOMMS_DOT_H__
+
+int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
+				unsigned len, unsigned limit);
+
+#endif				/* __MIDCOMMS_DOT_H__ */
+
diff --git a/kernel/fs/dlm/netlink.c b/kernel/fs/dlm/netlink.c
new file mode 100644
index 000000000..1e6e22713
--- /dev/null
+++ b/kernel/fs/dlm/netlink.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <net/genetlink.h>
+#include <linux/dlm.h>
+#include <linux/dlm_netlink.h>
+#include <linux/gfp.h>
+
+#include "dlm_internal.h"
+
+static uint32_t dlm_nl_seqnum;
+static uint32_t listener_nlportid;
+
+static struct genl_family family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= DLM_GENL_NAME,
+	.version	= DLM_GENL_VERSION,
+};
+
+static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
+{
+	struct sk_buff *skb;
+	void *data;
+
+	skb = genlmsg_new(size, GFP_NOFS);
+	if (!skb)
+		return -ENOMEM;
+
+	/* add the message headers */
+	data = genlmsg_put(skb, 0, dlm_nl_seqnum++, &family, 0, cmd);
+	if (!data) {
+		nlmsg_free(skb);
+		return -EINVAL;
+	}
+
+	*skbp = skb;
+	return 0;
+}
+
+static struct dlm_lock_data *mk_data(struct sk_buff *skb)
+{
+	struct nlattr *ret;
+
+	ret = nla_reserve(skb, DLM_TYPE_LOCK, sizeof(struct dlm_lock_data));
+	if (!ret)
+		return NULL;
+	return nla_data(ret);
+}
+
+static int send_data(struct sk_buff *skb)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	void *data = genlmsg_data(genlhdr);
+
+	genlmsg_end(skb, data);
+
+	return genlmsg_unicast(&init_net, skb, listener_nlportid);
+}
+
+static int user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	listener_nlportid = info->snd_portid;
+	printk("user_cmd nlpid %u\n", listener_nlportid);
+	return 0;
+}
+
+static struct genl_ops dlm_nl_ops[] = {
+	{
+		.cmd	= DLM_CMD_HELLO,
+		.doit	= user_cmd,
+	},
+};
+
+int __init dlm_netlink_init(void)
+{
+	return genl_register_family_with_ops(&family, dlm_nl_ops);
+}
+
+void dlm_netlink_exit(void)
+{
+	genl_unregister_family(&family);
+}
+
+static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+
+	memset(data, 0, sizeof(struct dlm_lock_data));
+
+	data->version = DLM_LOCK_DATA_VERSION;
+	data->nodeid = lkb->lkb_nodeid;
+	data->ownpid = lkb->lkb_ownpid;
+	data->id = lkb->lkb_id;
+	data->remid = lkb->lkb_remid;
+	data->status = lkb->lkb_status;
+	data->grmode = lkb->lkb_grmode;
+	data->rqmode = lkb->lkb_rqmode;
+	if (lkb->lkb_ua)
+		data->xid = lkb->lkb_ua->xid;
+	if (r) {
+		data->lockspace_id = r->res_ls->ls_global_id;
+		data->resource_namelen = r->res_length;
+		memcpy(data->resource_name, r->res_name, r->res_length);
+	}
+}
+
+void dlm_timeout_warn(struct dlm_lkb *lkb)
+{
+	struct sk_buff *uninitialized_var(send_skb);
+	struct dlm_lock_data *data;
+	size_t size;
+	int rv;
+
+	size = nla_total_size(sizeof(struct dlm_lock_data)) +
+	       nla_total_size(0); /* why this? */
+
+	rv = prepare_data(DLM_CMD_TIMEOUT, &send_skb, size);
+	if (rv < 0)
+		return;
+
+	data = mk_data(send_skb);
+	if (!data) {
+		nlmsg_free(send_skb);
+		return;
+	}
+
+	fill_data(data, lkb);
+
+	send_data(send_skb);
+}
+
diff --git a/kernel/fs/dlm/plock.c b/kernel/fs/dlm/plock.c
new file mode 100644
index 000000000..e0ab3a93e
--- /dev/null
+++ b/kernel/fs/dlm/plock.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
+#include <linux/slab.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+
+static spinlock_t ops_lock;
+static struct list_head send_list;
+static struct list_head recv_list;
+static wait_queue_head_t send_wq;
+static wait_queue_head_t recv_wq;
+
+struct plock_op {
+	struct list_head list;
+	int done;
+	struct dlm_plock_info info;
+};
+
+struct plock_xop {
+	struct plock_op xop;
+	int (*callback)(struct file_lock *fl, int result);
+	void *fl;
+	void *file;
+	struct file_lock flc;
+};
+
+
+static inline void set_version(struct dlm_plock_info *info)
+{
+	info->version[0] = DLM_PLOCK_VERSION_MAJOR;
+	info->version[1] = DLM_PLOCK_VERSION_MINOR;
+	info->version[2] = DLM_PLOCK_VERSION_PATCH;
+}
+
+static int check_version(struct dlm_plock_info *info)
+{
+	if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
+	    (DLM_PLOCK_VERSION_MINOR < info->version[1])) {
+		log_print("plock device version mismatch: "
+			  "kernel (%u.%u.%u), user (%u.%u.%u)",
+			  DLM_PLOCK_VERSION_MAJOR,
+			  DLM_PLOCK_VERSION_MINOR,
+			  DLM_PLOCK_VERSION_PATCH,
+			  info->version[0],
+			  info->version[1],
+			  info->version[2]);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void send_op(struct plock_op *op)
+{
+	set_version(&op->info);
+	INIT_LIST_HEAD(&op->list);
+	spin_lock(&ops_lock);
+	list_add_tail(&op->list, &send_list);
+	spin_unlock(&ops_lock);
+	wake_up(&send_wq);
+}
+
+/* If a process was killed while waiting for the only plock on a file,
+   locks_remove_posix will not see any lock on the file so it won't
+   send an unlock-close to us to pass on to userspace to clean up the
+   abandoned waiter.  So, we have to insert the unlock-close when the
+   lock call is interrupted. */
+
+static void do_unlock_close(struct dlm_ls *ls, u64 number,
+			    struct file *file, struct file_lock *fl)
+{
+	struct plock_op *op;
+
+	op = kzalloc(sizeof(*op), GFP_NOFS);
+	if (!op)
+		return;
+
+	op->info.optype		= DLM_PLOCK_OP_UNLOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= 0;
+	op->info.end		= OFFSET_MAX;
+	if (fl->fl_lmops && fl->fl_lmops->lm_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+
+	op->info.flags |= DLM_PLOCK_FL_CLOSE;
+	send_op(op);
+}
+
+int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		   int cmd, struct file_lock *fl)
+{
+	struct dlm_ls *ls;
+	struct plock_op *op;
+	struct plock_xop *xop;
+	int rv;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	xop = kzalloc(sizeof(*xop), GFP_NOFS);
+	if (!xop) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	op = &xop->xop;
+	op->info.optype		= DLM_PLOCK_OP_LOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.ex		= (fl->fl_type == F_WRLCK);
+	op->info.wait		= IS_SETLKW(cmd);
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
+		/* fl_owner is lockd which doesn't distinguish
+		   processes on the nfs client */
+		op->info.owner	= (__u64) fl->fl_pid;
+		xop->callback	= fl->fl_lmops->lm_grant;
+		locks_init_lock(&xop->flc);
+		locks_copy_lock(&xop->flc, fl);
+		xop->fl		= fl;
+		xop->file	= file;
+	} else {
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+		xop->callback	= NULL;
+	}
+
+	send_op(op);
+
+	if (xop->callback == NULL) {
+		rv = wait_event_killable(recv_wq, (op->done != 0));
+		if (rv == -ERESTARTSYS) {
+			log_debug(ls, "dlm_posix_lock: wait killed %llx",
+				  (unsigned long long)number);
+			spin_lock(&ops_lock);
+			list_del(&op->list);
+			spin_unlock(&ops_lock);
+			kfree(xop);
+			do_unlock_close(ls, number, file, fl);
+			goto out;
+		}
+	} else {
+		rv = FILE_LOCK_DEFERRED;
+		goto out;
+	}
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		log_error(ls, "dlm_posix_lock: op on list %llx",
+			  (unsigned long long)number);
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	rv = op->info.rv;
+
+	if (!rv) {
+		if (posix_lock_file_wait(file, fl) < 0)
+			log_error(ls, "dlm_posix_lock: vfs lock error %llx",
+				  (unsigned long long)number);
+	}
+
+	kfree(xop);
+out:
+	dlm_put_lockspace(ls);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_lock);
+
+/* Returns failure iff a successful lock operation should be canceled */
+static int dlm_plock_callback(struct plock_op *op)
+{
+	struct file *file;
+	struct file_lock *fl;
+	struct file_lock *flc;
+	int (*notify)(struct file_lock *fl, int result) = NULL;
+	struct plock_xop *xop = (struct plock_xop *)op;
+	int rv = 0;
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		log_print("dlm_plock_callback: op on list %llx",
+			  (unsigned long long)op->info.number);
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	/* check if the following 2 are still valid or make a copy */
+	file = xop->file;
+	flc = &xop->flc;
+	fl = xop->fl;
+	notify = xop->callback;
+
+	if (op->info.rv) {
+		notify(fl, op->info.rv);
+		goto out;
+	}
+
+	/* got fs lock; bookkeep locally as well: */
+	flc->fl_flags &= ~FL_SLEEP;
+	if (posix_lock_file(file, flc, NULL)) {
+		/*
+		 * This can only happen in the case of kmalloc() failure.
+		 * The filesystem's own lock is the authoritative lock,
+		 * so a failure to get the lock locally is not a disaster.
+		 * As long as the fs cannot reliably cancel locks (especially
+		 * in a low-memory situation), we're better off ignoring
+		 * this failure than trying to recover.
+		 */
+		log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p",
+			  (unsigned long long)op->info.number, file, fl);
+	}
+
+	rv = notify(fl, 0);
+	if (rv) {
+		/* XXX: We need to cancel the fs lock here: */
+		log_print("dlm_plock_callback: lock granted after lock request "
+			  "failed; dangling lock!\n");
+		goto out;
+	}
+
+out:
+	kfree(xop);
+	return rv;
+}
+
+int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		     struct file_lock *fl)
+{
+	struct dlm_ls *ls;
+	struct plock_op *op;
+	int rv;
+	unsigned char fl_flags = fl->fl_flags;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	op = kzalloc(sizeof(*op), GFP_NOFS);
+	if (!op) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	/* cause the vfs unlock to return ENOENT if lock is not found */
+	fl->fl_flags |= FL_EXISTS;
+
+	rv = posix_lock_file_wait(file, fl);
+	if (rv == -ENOENT) {
+		rv = 0;
+		goto out_free;
+	}
+	if (rv < 0) {
+		log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx",
+			  rv, (unsigned long long)number);
+	}
+
+	op->info.optype		= DLM_PLOCK_OP_UNLOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	if (fl->fl_lmops && fl->fl_lmops->lm_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+
+	if (fl->fl_flags & FL_CLOSE) {
+		op->info.flags |= DLM_PLOCK_FL_CLOSE;
+		send_op(op);
+		rv = 0;
+		goto out;
+	}
+
+	send_op(op);
+	wait_event(recv_wq, (op->done != 0));
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		log_error(ls, "dlm_posix_unlock: op on list %llx",
+			  (unsigned long long)number);
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	rv = op->info.rv;
+
+	if (rv == -ENOENT)
+		rv = 0;
+
+out_free:
+	kfree(op);
+out:
+	dlm_put_lockspace(ls);
+	fl->fl_flags = fl_flags;
+	return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_unlock);
+
+int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		  struct file_lock *fl)
+{
+	struct dlm_ls *ls;
+	struct plock_op *op;
+	int rv;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	op = kzalloc(sizeof(*op), GFP_NOFS);
+	if (!op) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	op->info.optype		= DLM_PLOCK_OP_GET;
+	op->info.pid		= fl->fl_pid;
+	op->info.ex		= (fl->fl_type == F_WRLCK);
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	if (fl->fl_lmops && fl->fl_lmops->lm_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+
+	send_op(op);
+	wait_event(recv_wq, (op->done != 0));
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		log_error(ls, "dlm_posix_get: op on list %llx",
+			  (unsigned long long)number);
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	/* info.rv from userspace is 1 for conflict, 0 for no-conflict,
+	   -ENOENT if there are no locks on the file */
+
+	rv = op->info.rv;
+
+	fl->fl_type = F_UNLCK;
+	if (rv == -ENOENT)
+		rv = 0;
+	else if (rv > 0) {
+		locks_init_lock(fl);
+		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+		fl->fl_flags = FL_POSIX;
+		fl->fl_pid = op->info.pid;
+		fl->fl_start = op->info.start;
+		fl->fl_end = op->info.end;
+		rv = 0;
+	}
+
+	kfree(op);
+out:
+	dlm_put_lockspace(ls);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_get);
+
+/* a read copies out one plock request from the send list */
+static ssize_t dev_read(struct file *file, char __user *u, size_t count,
+			loff_t *ppos)
+{
+	struct dlm_plock_info info;
+	struct plock_op *op = NULL;
+
+	if (count < sizeof(info))
+		return -EINVAL;
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&send_list)) {
+		op = list_entry(send_list.next, struct plock_op, list);
+		if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+			list_del(&op->list);
+		else
+			list_move(&op->list, &recv_list);
+		memcpy(&info, &op->info, sizeof(info));
+	}
+	spin_unlock(&ops_lock);
+
+	if (!op)
+		return -EAGAIN;
+
+	/* there is no need to get a reply from userspace for unlocks
+	   that were generated by the vfs cleaning up for a close
+	   (the process did not make an unlock call). */
+
+	if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+		kfree(op);
+
+	if (copy_to_user(u, &info, sizeof(info)))
+		return -EFAULT;
+	return sizeof(info);
+}
+
+/* a write copies in one plock result that should match a plock_op
+   on the recv list */
+static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
+			 loff_t *ppos)
+{
+	struct dlm_plock_info info;
+	struct plock_op *op;
+	int found = 0, do_callback = 0;
+
+	if (count != sizeof(info))
+		return -EINVAL;
+
+	if (copy_from_user(&info, u, sizeof(info)))
+		return -EFAULT;
+
+	if (check_version(&info))
+		return -EINVAL;
+
+	spin_lock(&ops_lock);
+	list_for_each_entry(op, &recv_list, list) {
+		if (op->info.fsid == info.fsid &&
+		    op->info.number == info.number &&
+		    op->info.owner == info.owner) {
+			struct plock_xop *xop = (struct plock_xop *)op;
+			list_del_init(&op->list);
+			memcpy(&op->info, &info, sizeof(info));
+			if (xop->callback)
+				do_callback = 1;
+			else
+				op->done = 1;
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&ops_lock);
+
+	if (found) {
+		if (do_callback)
+			dlm_plock_callback(op);
+		else
+			wake_up(&recv_wq);
+	} else
+		log_print("dev_write no op %x %llx", info.fsid,
+			  (unsigned long long)info.number);
+	return count;
+}
+
+static unsigned int dev_poll(struct file *file, poll_table *wait)
+{
+	unsigned int mask = 0;
+
+	poll_wait(file, &send_wq, wait);
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&send_list))
+		mask = POLLIN | POLLRDNORM;
+	spin_unlock(&ops_lock);
+
+	return mask;
+}
+
+static const struct file_operations dev_fops = {
+	.read    = dev_read,
+	.write   = dev_write,
+	.poll    = dev_poll,
+	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
+};
+
+static struct miscdevice plock_dev_misc = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = DLM_PLOCK_MISC_NAME,
+	.fops = &dev_fops
+};
+
+int dlm_plock_init(void)
+{
+	int rv;
+
+	spin_lock_init(&ops_lock);
+	INIT_LIST_HEAD(&send_list);
+	INIT_LIST_HEAD(&recv_list);
+	init_waitqueue_head(&send_wq);
+	init_waitqueue_head(&recv_wq);
+
+	rv = misc_register(&plock_dev_misc);
+	if (rv)
+		log_print("dlm_plock_init: misc_register failed %d", rv);
+	return rv;
+}
+
+void dlm_plock_exit(void)
+{
+	if (misc_deregister(&plock_dev_misc) < 0)
+		log_print("dlm_plock_exit: misc_deregister failed");
+}
+
diff --git a/kernel/fs/dlm/rcom.c b/kernel/fs/dlm/rcom.c
new file mode 100644
index 000000000..f3f5e72a2
--- /dev/null
+++ b/kernel/fs/dlm/rcom.c
@@ -0,0 +1,656 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "rcom.h"
+#include "recover.h"
+#include "dir.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "util.h"
+
+static int rcom_response(struct dlm_ls *ls)
+{
+	return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
+}
+
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+		       struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	char *mb;
+	int mb_len = sizeof(struct dlm_rcom) + len;
+
+	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
+	if (!mh) {
+		log_print("create_rcom to %d type %d len %d ENOBUFS",
+			  to_nodeid, type, len);
+		return -ENOBUFS;
+	}
+	memset(mb, 0, mb_len);
+
+	rc = (struct dlm_rcom *) mb;
+
+	rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	rc->rc_header.h_lockspace = ls->ls_global_id;
+	rc->rc_header.h_nodeid = dlm_our_nodeid();
+	rc->rc_header.h_length = mb_len;
+	rc->rc_header.h_cmd = DLM_RCOM;
+
+	rc->rc_type = type;
+
+	spin_lock(&ls->ls_recover_lock);
+	rc->rc_seq = ls->ls_recover_seq;
+	spin_unlock(&ls->ls_recover_lock);
+
+	*mh_ret = mh;
+	*rc_ret = rc;
+	return 0;
+}
+
+static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
+		      struct dlm_rcom *rc)
+{
+	dlm_rcom_out(rc);
+	dlm_lowcomms_commit_buffer(mh);
+}
+
+static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
+			    uint32_t flags)
+{
+	rs->rs_flags = cpu_to_le32(flags);
+}
+
+/* When replying to a status request, a node also sends back its
+   configuration values.  The requesting node then checks that the remote
+   node is configured the same way as itself. */
+
+static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf,
+			    uint32_t num_slots)
+{
+	rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
+	rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
+
+	rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
+	rf->rf_num_slots = cpu_to_le16(num_slots);
+	rf->rf_generation =  cpu_to_le32(ls->ls_generation);
+}
+
+static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
+{
+	struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
+
+	if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
+		log_error(ls, "version mismatch: %x nodeid %d: %x",
+			  DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
+			  rc->rc_header.h_version);
+		return -EPROTO;
+	}
+
+	if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen ||
+	    le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) {
+		log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
+			  ls->ls_lvblen, ls->ls_exflags, nodeid,
+			  le32_to_cpu(rf->rf_lvblen),
+			  le32_to_cpu(rf->rf_lsflags));
+		return -EPROTO;
+	}
+	return 0;
+}
+
+static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq)
+{
+	spin_lock(&ls->ls_rcom_spin);
+	*new_seq = ++ls->ls_rcom_seq;
+	set_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
+	spin_unlock(&ls->ls_rcom_spin);
+}
+
+static void disallow_sync_reply(struct dlm_ls *ls)
+{
+	spin_lock(&ls->ls_rcom_spin);
+	clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
+	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	spin_unlock(&ls->ls_rcom_spin);
+}
+
+/*
+ * low nodeid gathers one slot value at a time from each node.
+ * it sets need_slots=0, and saves rf_our_slot returned from each
+ * rcom_config.
+ *
+ * other nodes gather all slot values at once from the low nodeid.
+ * they set need_slots=1, and ignore the rf_our_slot returned from each
+ * rcom_config.  they use the rf_num_slots returned from the low
+ * node's rcom_config.
+ */
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error = 0;
+
+	ls->ls_recover_nodeid = nodeid;
+
+	if (nodeid == dlm_our_nodeid()) {
+		rc = ls->ls_recover_buf;
+		rc->rc_result = dlm_recover_status(ls);
+		goto out;
+	}
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
+			    sizeof(struct rcom_status), &rc, &mh);
+	if (error)
+		goto out;
+
+	set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
+
+	allow_sync_reply(ls, &rc->rc_id);
+	memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
+
+	send_rcom(ls, mh, rc);
+
+	error = dlm_wait_function(ls, &rcom_response);
+	disallow_sync_reply(ls);
+	if (error)
+		goto out;
+
+	rc = ls->ls_recover_buf;
+
+	if (rc->rc_result == -ESRCH) {
+		/* we pretend the remote lockspace exists with 0 status */
+		log_debug(ls, "remote node %d not ready", nodeid);
+		rc->rc_result = 0;
+		error = 0;
+	} else {
+		error = check_rcom_config(ls, rc, nodeid);
+	}
+
+	/* the caller looks at rc_result for the remote recovery status */
+ out:
+	return error;
+}
+
+static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	struct rcom_status *rs;
+	uint32_t status;
+	int nodeid = rc_in->rc_header.h_nodeid;
+	int len = sizeof(struct rcom_config);
+	int num_slots = 0;
+	int error;
+
+	if (!dlm_slots_version(&rc_in->rc_header)) {
+		status = dlm_recover_status(ls);
+		goto do_create;
+	}
+
+	rs = (struct rcom_status *)rc_in->rc_buf;
+
+	if (!(le32_to_cpu(rs->rs_flags) & DLM_RSF_NEED_SLOTS)) {
+		status = dlm_recover_status(ls);
+		goto do_create;
+	}
+
+	spin_lock(&ls->ls_recover_lock);
+	status = ls->ls_recover_status;
+	num_slots = ls->ls_num_slots;
+	spin_unlock(&ls->ls_recover_lock);
+	len += num_slots * sizeof(struct rcom_slot);
+
+ do_create:
+	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
+			    len, &rc, &mh);
+	if (error)
+		return;
+
+	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
+	rc->rc_result = status;
+
+	set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
+
+	if (!num_slots)
+		goto do_send;
+
+	spin_lock(&ls->ls_recover_lock);
+	if (ls->ls_num_slots != num_slots) {
+		spin_unlock(&ls->ls_recover_lock);
+		log_debug(ls, "receive_rcom_status num_slots %d to %d",
+			  num_slots, ls->ls_num_slots);
+		rc->rc_result = 0;
+		set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0);
+		goto do_send;
+	}
+
+	dlm_slots_copy_out(ls, rc);
+	spin_unlock(&ls->ls_recover_lock);
+
+ do_send:
+	send_rcom(ls, mh, rc);
+}
+
+static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	spin_lock(&ls->ls_rcom_spin);
+	if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) ||
+	    rc_in->rc_id != ls->ls_rcom_seq) {
+		log_debug(ls, "reject reply %d from %d seq %llx expect %llx",
+			  rc_in->rc_type, rc_in->rc_header.h_nodeid,
+			  (unsigned long long)rc_in->rc_id,
+			  (unsigned long long)ls->ls_rcom_seq);
+		goto out;
+	}
+	memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+	set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
+	wake_up(&ls->ls_wait_general);
+ out:
+	spin_unlock(&ls->ls_rcom_spin);
+}
+
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error = 0;
+
+	ls->ls_recover_nodeid = nodeid;
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
+	if (error)
+		goto out;
+	memcpy(rc->rc_buf, last_name, last_len);
+
+	allow_sync_reply(ls, &rc->rc_id);
+	memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
+
+	send_rcom(ls, mh, rc);
+
+	error = dlm_wait_function(ls, &rcom_response);
+	disallow_sync_reply(ls);
+ out:
+	return error;
+}
+
+static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error, inlen, outlen, nodeid;
+
+	nodeid = rc_in->rc_header.h_nodeid;
+	inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+	outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom);
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
+	if (error)
+		return;
+	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
+
+	dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
+			      nodeid);
+	send_rcom(ls, mh, rc);
+}
+
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	struct dlm_ls *ls = r->res_ls;
+	int error;
+
+	error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
+			    &rc, &mh);
+	if (error)
+		goto out;
+	memcpy(rc->rc_buf, r->res_name, r->res_length);
+	rc->rc_id = (unsigned long) r->res_id;
+
+	send_rcom(ls, mh, rc);
+ out:
+	return error;
+}
+
+int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	struct dlm_ls *ls = r->res_ls;
+	int error;
+
+	error = create_rcom(ls, to_nodeid, DLM_RCOM_LOOKUP, r->res_length,
+			    &rc, &mh);
+	if (error)
+		goto out;
+	memcpy(rc->rc_buf, r->res_name, r->res_length);
+	rc->rc_id = 0xFFFFFFFF;
+
+	send_rcom(ls, mh, rc);
+ out:
+	return error;
+}
+
+static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
+	int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
+	if (error)
+		return;
+
+	if (rc_in->rc_id == 0xFFFFFFFF) {
+		log_error(ls, "receive_rcom_lookup dump from %d", nodeid);
+		dlm_dump_rsb_name(ls, rc_in->rc_buf, len);
+		return;
+	}
+
+	error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len,
+				  DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL);
+	if (error)
+		ret_nodeid = error;
+	rc->rc_result = ret_nodeid;
+	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
+
+	send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	dlm_recover_master_reply(ls, rc_in);
+}
+
+static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			   struct rcom_lock *rl)
+{
+	memset(rl, 0, sizeof(*rl));
+
+	rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid);
+	rl->rl_lkid = cpu_to_le32(lkb->lkb_id);
+	rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags);
+	rl->rl_flags = cpu_to_le32(lkb->lkb_flags);
+	rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq);
+	rl->rl_rqmode = lkb->lkb_rqmode;
+	rl->rl_grmode = lkb->lkb_grmode;
+	rl->rl_status = lkb->lkb_status;
+	rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type);
+
+	if (lkb->lkb_bastfn)
+		rl->rl_asts |= DLM_CB_BAST;
+	if (lkb->lkb_astfn)
+		rl->rl_asts |= DLM_CB_CAST;
+
+	rl->rl_namelen = cpu_to_le16(r->res_length);
+	memcpy(rl->rl_name, r->res_name, r->res_length);
+
+	/* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
+	   If so, receive_rcom_lock_args() won't take this copy. */
+
+	if (lkb->lkb_lvbptr)
+		memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+}
+
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls = r->res_ls;
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	struct rcom_lock *rl;
+	int error, len = sizeof(struct rcom_lock);
+
+	if (lkb->lkb_lvbptr)
+		len += ls->ls_lvblen;
+
+	error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
+	if (error)
+		goto out;
+
+	rl = (struct rcom_lock *) rc->rc_buf;
+	pack_rcom_lock(r, lkb, rl);
+	rc->rc_id = (unsigned long) r;
+
+	send_rcom(ls, mh, rc);
+ out:
+	return error;
+}
+
+/* needs at least dlm_rcom + rcom_lock */
+static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error, nodeid = rc_in->rc_header.h_nodeid;
+
+	dlm_recover_master_copy(ls, rc_in);
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
+			    sizeof(struct rcom_lock), &rc, &mh);
+	if (error)
+		return;
+
+	/* We send back the same rcom_lock struct we received, but
+	   dlm_recover_master_copy() has filled in rl_remid and rl_result */
+
+	memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
+	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
+
+	send_rcom(ls, mh, rc);
+}
+
+/* If the lockspace doesn't exist then still send a status message
+   back; it's possible that it just doesn't have its global_id yet. */
+
+int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct rcom_config *rf;
+	struct dlm_mhandle *mh;
+	char *mb;
+	int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
+
+	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb);
+	if (!mh)
+		return -ENOBUFS;
+	memset(mb, 0, mb_len);
+
+	rc = (struct dlm_rcom *) mb;
+
+	rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
+	rc->rc_header.h_nodeid = dlm_our_nodeid();
+	rc->rc_header.h_length = mb_len;
+	rc->rc_header.h_cmd = DLM_RCOM;
+
+	rc->rc_type = DLM_RCOM_STATUS_REPLY;
+	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
+	rc->rc_result = -ESRCH;
+
+	rf = (struct rcom_config *) rc->rc_buf;
+	rf->rf_lvblen = cpu_to_le32(~0U);
+
+	dlm_rcom_out(rc);
+	dlm_lowcomms_commit_buffer(mh);
+
+	return 0;
+}
+
+/*
+ * Ignore messages for stage Y before we set
+ * recover_status bit for stage X:
+ *
+ * recover_status = 0
+ *
+ * dlm_recover_members()
+ * - send nothing
+ * - recv nothing
+ * - ignore NAMES, NAMES_REPLY
+ * - ignore LOOKUP, LOOKUP_REPLY
+ * - ignore LOCK, LOCK_REPLY
+ *
+ * recover_status |= NODES
+ *
+ * dlm_recover_members_wait()
+ *
+ * dlm_recover_directory()
+ * - send NAMES
+ * - recv NAMES_REPLY
+ * - ignore LOOKUP, LOOKUP_REPLY
+ * - ignore LOCK, LOCK_REPLY
+ *
+ * recover_status |= DIR
+ *
+ * dlm_recover_directory_wait()
+ *
+ * dlm_recover_masters()
+ * - send LOOKUP
+ * - recv LOOKUP_REPLY
+ *
+ * dlm_recover_locks()
+ * - send LOCKS
+ * - recv LOCKS_REPLY
+ *
+ * recover_status |= LOCKS
+ *
+ * dlm_recover_locks_wait()
+ *
+ * recover_status |= DONE
+ */
+
+/* Called by dlm_recv; corresponds to dlm_receive_message() but special
+   recovery-only comms are sent through here. */
+
+void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
+{
+	int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock);
+	int stop, reply = 0, names = 0, lookup = 0, lock = 0;
+	uint32_t status;
+	uint64_t seq;
+
+	switch (rc->rc_type) {
+	case DLM_RCOM_STATUS_REPLY:
+		reply = 1;
+		break;
+	case DLM_RCOM_NAMES:
+		names = 1;
+		break;
+	case DLM_RCOM_NAMES_REPLY:
+		names = 1;
+		reply = 1;
+		break;
+	case DLM_RCOM_LOOKUP:
+		lookup = 1;
+		break;
+	case DLM_RCOM_LOOKUP_REPLY:
+		lookup = 1;
+		reply = 1;
+		break;
+	case DLM_RCOM_LOCK:
+		lock = 1;
+		break;
+	case DLM_RCOM_LOCK_REPLY:
+		lock = 1;
+		reply = 1;
+		break;
+	};
+
+	spin_lock(&ls->ls_recover_lock);
+	status = ls->ls_recover_status;
+	stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
+	seq = ls->ls_recover_seq;
+	spin_unlock(&ls->ls_recover_lock);
+
+	if (stop && (rc->rc_type != DLM_RCOM_STATUS))
+		goto ignore;
+
+	if (reply && (rc->rc_seq_reply != seq))
+		goto ignore;
+
+	if (!(status & DLM_RS_NODES) && (names || lookup || lock))
+		goto ignore;
+
+	if (!(status & DLM_RS_DIR) && (lookup || lock))
+		goto ignore;
+
+	switch (rc->rc_type) {
+	case DLM_RCOM_STATUS:
+		receive_rcom_status(ls, rc);
+		break;
+
+	case DLM_RCOM_NAMES:
+		receive_rcom_names(ls, rc);
+		break;
+
+	case DLM_RCOM_LOOKUP:
+		receive_rcom_lookup(ls, rc);
+		break;
+
+	case DLM_RCOM_LOCK:
+		if (rc->rc_header.h_length < lock_size)
+			goto Eshort;
+		receive_rcom_lock(ls, rc);
+		break;
+
+	case DLM_RCOM_STATUS_REPLY:
+		receive_sync_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_NAMES_REPLY:
+		receive_sync_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_LOOKUP_REPLY:
+		receive_rcom_lookup_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_LOCK_REPLY:
+		if (rc->rc_header.h_length < lock_size)
+			goto Eshort;
+		dlm_recover_process_copy(ls, rc);
+		break;
+
+	default:
+		log_error(ls, "receive_rcom bad type %d", rc->rc_type);
+	}
+	return;
+
+ignore:
+	log_limit(ls, "dlm_receive_rcom ignore msg %d "
+		  "from %d %llu %llu recover seq %llu sts %x gen %u",
+		   rc->rc_type,
+		   nodeid,
+		   (unsigned long long)rc->rc_seq,
+		   (unsigned long long)rc->rc_seq_reply,
+		   (unsigned long long)seq,
+		   status, ls->ls_generation);
+	return;
+Eshort:
+	log_error(ls, "recovery message %d from %d is too short",
+		  rc->rc_type, nodeid);
+}
+
diff --git a/kernel/fs/dlm/rcom.h b/kernel/fs/dlm/rcom.h
new file mode 100644
index 000000000..f8e243463
--- /dev/null
+++ b/kernel/fs/dlm/rcom.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RCOM_DOT_H__
+#define __RCOM_DOT_H__
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags);
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
+int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid);
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid);
+int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);
+
+#endif
+
diff --git a/kernel/fs/dlm/recover.c b/kernel/fs/dlm/recover.c
new file mode 100644
index 000000000..eaea789bf
--- /dev/null
+++ b/kernel/fs/dlm/recover.c
@@ -0,0 +1,955 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "dir.h"
+#include "config.h"
+#include "ast.h"
+#include "memory.h"
+#include "rcom.h"
+#include "lock.h"
+#include "lowcomms.h"
+#include "member.h"
+#include "recover.h"
+
+
+/*
+ * Recovery waiting routines: these functions wait for a particular reply from
+ * a remote node, or for the remote node to report a certain status.  They need
+ * to abort if the lockspace is stopped indicating a node has failed (perhaps
+ * the one being waited for).
+ */
+
+/*
+ * Wait until given function returns non-zero or lockspace is stopped
+ * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes).  When another
+ * function thinks it could have completed the waited-on task, they should wake
+ * up ls_wait_general to get an immediate response rather than waiting for the
+ * timeout.  This uses a timeout so it can check periodically if the wait
+ * should abort due to node failure (which doesn't cause a wake_up).
+ * This should only be called by the dlm_recoverd thread.
+ */
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
+{
+	int error = 0;
+	int rv;
+
+	while (1) {
+		rv = wait_event_timeout(ls->ls_wait_general,
+					testfn(ls) || dlm_recovery_stopped(ls),
+					dlm_config.ci_recover_timer * HZ);
+		if (rv)
+			break;
+	}
+
+	if (dlm_recovery_stopped(ls)) {
+		log_debug(ls, "dlm_wait_function aborted");
+		error = -EINTR;
+	}
+	return error;
+}
+
+/*
+ * An efficient way for all nodes to wait for all others to have a certain
+ * status.  The node with the lowest nodeid polls all the others for their
+ * status (wait_status_all) and all the others poll the node with the low id
+ * for its accumulated result (wait_status_low).  When all nodes have set
+ * status flag X, then status flag X_ALL will be set on the low nodeid.
+ */
+
+uint32_t dlm_recover_status(struct dlm_ls *ls)
+{
+	uint32_t status;
+	spin_lock(&ls->ls_recover_lock);
+	status = ls->ls_recover_status;
+	spin_unlock(&ls->ls_recover_lock);
+	return status;
+}
+
+static void _set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+	ls->ls_recover_status |= status;
+}
+
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+	spin_lock(&ls->ls_recover_lock);
+	_set_recover_status(ls, status);
+	spin_unlock(&ls->ls_recover_lock);
+}
+
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
+			   int save_slots)
+{
+	struct dlm_rcom *rc = ls->ls_recover_buf;
+	struct dlm_member *memb;
+	int error = 0, delay;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		delay = 0;
+		for (;;) {
+			if (dlm_recovery_stopped(ls)) {
+				error = -EINTR;
+				goto out;
+			}
+
+			error = dlm_rcom_status(ls, memb->nodeid, 0);
+			if (error)
+				goto out;
+
+			if (save_slots)
+				dlm_slot_save(ls, rc, memb);
+
+			if (rc->rc_result & wait_status)
+				break;
+			if (delay < 1000)
+				delay += 20;
+			msleep(delay);
+		}
+	}
+ out:
+	return error;
+}
+
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
+			   uint32_t status_flags)
+{
+	struct dlm_rcom *rc = ls->ls_recover_buf;
+	int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
+
+	for (;;) {
+		if (dlm_recovery_stopped(ls)) {
+			error = -EINTR;
+			goto out;
+		}
+
+		error = dlm_rcom_status(ls, nodeid, status_flags);
+		if (error)
+			break;
+
+		if (rc->rc_result & wait_status)
+			break;
+		if (delay < 1000)
+			delay += 20;
+		msleep(delay);
+	}
+ out:
+	return error;
+}
+
+static int wait_status(struct dlm_ls *ls, uint32_t status)
+{
+	uint32_t status_all = status << 1;
+	int error;
+
+	if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+		error = wait_status_all(ls, status, 0);
+		if (!error)
+			dlm_set_recover_status(ls, status_all);
+	} else
+		error = wait_status_low(ls, status_all, 0);
+
+	return error;
+}
+
+int dlm_recover_members_wait(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_slot *slots;
+	int num_slots, slots_size;
+	int error, rv;
+	uint32_t gen;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		memb->slot = -1;
+		memb->generation = 0;
+	}
+
+	if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+		error = wait_status_all(ls, DLM_RS_NODES, 1);
+		if (error)
+			goto out;
+
+		/* slots array is sparse, slots_size may be > num_slots */
+
+		rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
+		if (!rv) {
+			spin_lock(&ls->ls_recover_lock);
+			_set_recover_status(ls, DLM_RS_NODES_ALL);
+			ls->ls_num_slots = num_slots;
+			ls->ls_slots_size = slots_size;
+			ls->ls_slots = slots;
+			ls->ls_generation = gen;
+			spin_unlock(&ls->ls_recover_lock);
+		} else {
+			dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
+		}
+	} else {
+		error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS);
+		if (error)
+			goto out;
+
+		dlm_slots_copy_in(ls);
+	}
+ out:
+	return error;
+}
+
+int dlm_recover_directory_wait(struct dlm_ls *ls)
+{
+	return wait_status(ls, DLM_RS_DIR);
+}
+
+int dlm_recover_locks_wait(struct dlm_ls *ls)
+{
+	return wait_status(ls, DLM_RS_LOCKS);
+}
+
+int dlm_recover_done_wait(struct dlm_ls *ls)
+{
+	return wait_status(ls, DLM_RS_DONE);
+}
+
+/*
+ * The recover_list contains all the rsb's for which we've requested the new
+ * master nodeid.  As replies are returned from the resource directories the
+ * rsb's are removed from the list.  When the list is empty we're done.
+ *
+ * The recover_list is later similarly used for all rsb's for which we've sent
+ * new lkb's and need to receive new corresponding lkid's.
+ *
+ * We use the address of the rsb struct as a simple local identifier for the
+ * rsb so we can match an rcom reply with the rsb it was sent for.
+ */
+
+static int recover_list_empty(struct dlm_ls *ls)
+{
+	int empty;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	empty = list_empty(&ls->ls_recover_list);
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	return empty;
+}
+
+static void recover_list_add(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	if (list_empty(&r->res_recover_list)) {
+		list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
+		ls->ls_recover_list_count++;
+		dlm_hold_rsb(r);
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static void recover_list_del(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_del_init(&r->res_recover_list);
+	ls->ls_recover_list_count--;
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	dlm_put_rsb(r);
+}
+
+static void recover_list_clear(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r, *s;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
+		list_del_init(&r->res_recover_list);
+		r->res_recover_locks_count = 0;
+		dlm_put_rsb(r);
+		ls->ls_recover_list_count--;
+	}
+
+	if (ls->ls_recover_list_count != 0) {
+		log_error(ls, "warning: recover_list_count %d",
+			  ls->ls_recover_list_count);
+		ls->ls_recover_list_count = 0;
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static int recover_idr_empty(struct dlm_ls *ls)
+{
+	int empty = 1;
+
+	spin_lock(&ls->ls_recover_idr_lock);
+	if (ls->ls_recover_list_count)
+		empty = 0;
+	spin_unlock(&ls->ls_recover_idr_lock);
+
+	return empty;
+}
+
+static int recover_idr_add(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int rv;
+
+	idr_preload(GFP_NOFS);
+	spin_lock(&ls->ls_recover_idr_lock);
+	if (r->res_id) {
+		rv = -1;
+		goto out_unlock;
+	}
+	rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT);
+	if (rv < 0)
+		goto out_unlock;
+
+	r->res_id = rv;
+	ls->ls_recover_list_count++;
+	dlm_hold_rsb(r);
+	rv = 0;
+out_unlock:
+	spin_unlock(&ls->ls_recover_idr_lock);
+	idr_preload_end();
+	return rv;
+}
+
+static void recover_idr_del(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+
+	spin_lock(&ls->ls_recover_idr_lock);
+	idr_remove(&ls->ls_recover_idr, r->res_id);
+	r->res_id = 0;
+	ls->ls_recover_list_count--;
+	spin_unlock(&ls->ls_recover_idr_lock);
+
+	dlm_put_rsb(r);
+}
+
+static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id)
+{
+	struct dlm_rsb *r;
+
+	spin_lock(&ls->ls_recover_idr_lock);
+	r = idr_find(&ls->ls_recover_idr, (int)id);
+	spin_unlock(&ls->ls_recover_idr_lock);
+	return r;
+}
+
+static void recover_idr_clear(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int id;
+
+	spin_lock(&ls->ls_recover_idr_lock);
+
+	idr_for_each_entry(&ls->ls_recover_idr, r, id) {
+		idr_remove(&ls->ls_recover_idr, id);
+		r->res_id = 0;
+		r->res_recover_locks_count = 0;
+		ls->ls_recover_list_count--;
+
+		dlm_put_rsb(r);
+	}
+
+	if (ls->ls_recover_list_count != 0) {
+		log_error(ls, "warning: recover_list_count %d",
+			  ls->ls_recover_list_count);
+		ls->ls_recover_list_count = 0;
+	}
+	spin_unlock(&ls->ls_recover_idr_lock);
+}
+
+
+/* Master recovery: find new master node for rsb's that were
+   mastered on nodes that have been removed.
+
+   dlm_recover_masters
+   recover_master
+   dlm_send_rcom_lookup            ->  receive_rcom_lookup
+                                       dlm_dir_lookup
+   receive_rcom_lookup_reply       <-
+   dlm_recover_master_reply
+   set_new_master
+   set_master_lkbs
+   set_lock_master
+*/
+
+/*
+ * Set the lock master for all LKBs in a lock queue
+ * If we are the new master of the rsb, we may have received new
+ * MSTCPY locks from other nodes already which we need to ignore
+ * when setting the new nodeid.
+ */
+
+static void set_lock_master(struct list_head *queue, int nodeid)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, queue, lkb_statequeue) {
+		if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) {
+			lkb->lkb_nodeid = nodeid;
+			lkb->lkb_remid = 0;
+		}
+	}
+}
+
+static void set_master_lkbs(struct dlm_rsb *r)
+{
+	set_lock_master(&r->res_grantqueue, r->res_nodeid);
+	set_lock_master(&r->res_convertqueue, r->res_nodeid);
+	set_lock_master(&r->res_waitqueue, r->res_nodeid);
+}
+
+/*
+ * Propagate the new master nodeid to locks
+ * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
+ * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which
+ * rsb's to consider.
+ */
+
+static void set_new_master(struct dlm_rsb *r)
+{
+	set_master_lkbs(r);
+	rsb_set_flag(r, RSB_NEW_MASTER);
+	rsb_set_flag(r, RSB_NEW_MASTER2);
+}
+
+/*
+ * We do async lookups on rsb's that need new masters.  The rsb's
+ * waiting for a lookup reply are kept on the recover_list.
+ *
+ * Another node recovering the master may have sent us a rcom lookup,
+ * and our dlm_master_lookup() set it as the new master, along with
+ * NEW_MASTER so that we'll recover it here (this implies dir_nodeid
+ * equals our_nodeid below).
+ */
+
+static int recover_master(struct dlm_rsb *r, unsigned int *count)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int our_nodeid, dir_nodeid;
+	int is_removed = 0;
+	int error;
+
+	if (is_master(r))
+		return 0;
+
+	is_removed = dlm_is_removed(ls, r->res_nodeid);
+
+	if (!is_removed && !rsb_flag(r, RSB_NEW_MASTER))
+		return 0;
+
+	our_nodeid = dlm_our_nodeid();
+	dir_nodeid = dlm_dir_nodeid(r);
+
+	if (dir_nodeid == our_nodeid) {
+		if (is_removed) {
+			r->res_master_nodeid = our_nodeid;
+			r->res_nodeid = 0;
+		}
+
+		/* set master of lkbs to ourself when is_removed, or to
+		   another new master which we set along with NEW_MASTER
+		   in dlm_master_lookup */
+		set_new_master(r);
+		error = 0;
+	} else {
+		recover_idr_add(r);
+		error = dlm_send_rcom_lookup(r, dir_nodeid);
+	}
+
+	(*count)++;
+	return error;
+}
+
+/*
+ * All MSTCPY locks are purged and rebuilt, even if the master stayed the same.
+ * This is necessary because recovery can be started, aborted and restarted,
+ * causing the master nodeid to briefly change during the aborted recovery, and
+ * change back to the original value in the second recovery.  The MSTCPY locks
+ * may or may not have been purged during the aborted recovery.  Another node
+ * with an outstanding request in waiters list and a request reply saved in the
+ * requestqueue, cannot know whether it should ignore the reply and resend the
+ * request, or accept the reply and complete the request.  It must do the
+ * former if the remote node purged MSTCPY locks, and it must do the later if
+ * the remote node did not.  This is solved by always purging MSTCPY locks, in
+ * which case, the request reply would always be ignored and the request
+ * resent.
+ */
+
+static int recover_master_static(struct dlm_rsb *r, unsigned int *count)
+{
+	int dir_nodeid = dlm_dir_nodeid(r);
+	int new_master = dir_nodeid;
+
+	if (dir_nodeid == dlm_our_nodeid())
+		new_master = 0;
+
+	dlm_purge_mstcpy_locks(r);
+	r->res_master_nodeid = dir_nodeid;
+	r->res_nodeid = new_master;
+	set_new_master(r);
+	(*count)++;
+	return 0;
+}
+
+/*
+ * Go through local root resources and for each rsb which has a master which
+ * has departed, get the new master nodeid from the directory.  The dir will
+ * assign mastery to the first node to look up the new master.  That means
+ * we'll discover in this lookup if we're the new master of any rsb's.
+ *
+ * We fire off all the dir lookup requests individually and asynchronously to
+ * the correct dir node.
+ */
+
+int dlm_recover_masters(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	unsigned int total = 0;
+	unsigned int count = 0;
+	int nodir = dlm_no_directory(ls);
+	int error;
+
+	log_rinfo(ls, "dlm_recover_masters");
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (dlm_recovery_stopped(ls)) {
+			up_read(&ls->ls_root_sem);
+			error = -EINTR;
+			goto out;
+		}
+
+		lock_rsb(r);
+		if (nodir)
+			error = recover_master_static(r, &count);
+		else
+			error = recover_master(r, &count);
+		unlock_rsb(r);
+		cond_resched();
+		total++;
+
+		if (error) {
+			up_read(&ls->ls_root_sem);
+			goto out;
+		}
+	}
+	up_read(&ls->ls_root_sem);
+
+	log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
+
+	error = dlm_wait_function(ls, &recover_idr_empty);
+ out:
+	if (error)
+		recover_idr_clear(ls);
+	return error;
+}
+
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct dlm_rsb *r;
+	int ret_nodeid, new_master;
+
+	r = recover_idr_find(ls, rc->rc_id);
+	if (!r) {
+		log_error(ls, "dlm_recover_master_reply no id %llx",
+			  (unsigned long long)rc->rc_id);
+		goto out;
+	}
+
+	ret_nodeid = rc->rc_result;
+
+	if (ret_nodeid == dlm_our_nodeid())
+		new_master = 0;
+	else
+		new_master = ret_nodeid;
+
+	lock_rsb(r);
+	r->res_master_nodeid = ret_nodeid;
+	r->res_nodeid = new_master;
+	set_new_master(r);
+	unlock_rsb(r);
+	recover_idr_del(r);
+
+	if (recover_idr_empty(ls))
+		wake_up(&ls->ls_wait_general);
+ out:
+	return 0;
+}
+
+
+/* Lock recovery: rebuild the process-copy locks we hold on a
+   remastered rsb on the new rsb master.
+
+   dlm_recover_locks
+   recover_locks
+   recover_locks_queue
+   dlm_send_rcom_lock              ->  receive_rcom_lock
+                                       dlm_recover_master_copy
+   receive_rcom_lock_reply         <-
+   dlm_recover_process_copy
+*/
+
+
+/*
+ * keep a count of the number of lkb's we send to the new master; when we get
+ * an equal number of replies then recovery for the rsb is done
+ */
+
+static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
+{
+	struct dlm_lkb *lkb;
+	int error = 0;
+
+	list_for_each_entry(lkb, head, lkb_statequeue) {
+	   	error = dlm_send_rcom_lock(r, lkb);
+		if (error)
+			break;
+		r->res_recover_locks_count++;
+	}
+
+	return error;
+}
+
+static int recover_locks(struct dlm_rsb *r)
+{
+	int error = 0;
+
+	lock_rsb(r);
+
+	DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
+
+	error = recover_locks_queue(r, &r->res_grantqueue);
+	if (error)
+		goto out;
+	error = recover_locks_queue(r, &r->res_convertqueue);
+	if (error)
+		goto out;
+	error = recover_locks_queue(r, &r->res_waitqueue);
+	if (error)
+		goto out;
+
+	if (r->res_recover_locks_count)
+		recover_list_add(r);
+	else
+		rsb_clear_flag(r, RSB_NEW_MASTER);
+ out:
+	unlock_rsb(r);
+	return error;
+}
+
+int dlm_recover_locks(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int error, count = 0;
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (is_master(r)) {
+			rsb_clear_flag(r, RSB_NEW_MASTER);
+			continue;
+		}
+
+		if (!rsb_flag(r, RSB_NEW_MASTER))
+			continue;
+
+		if (dlm_recovery_stopped(ls)) {
+			error = -EINTR;
+			up_read(&ls->ls_root_sem);
+			goto out;
+		}
+
+		error = recover_locks(r);
+		if (error) {
+			up_read(&ls->ls_root_sem);
+			goto out;
+		}
+
+		count += r->res_recover_locks_count;
+	}
+	up_read(&ls->ls_root_sem);
+
+	log_rinfo(ls, "dlm_recover_locks %d out", count);
+
+	error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+	if (error)
+		recover_list_clear(ls);
+	return error;
+}
+
+void dlm_recovered_lock(struct dlm_rsb *r)
+{
+	DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
+
+	r->res_recover_locks_count--;
+	if (!r->res_recover_locks_count) {
+		rsb_clear_flag(r, RSB_NEW_MASTER);
+		recover_list_del(r);
+	}
+
+	if (recover_list_empty(r->res_ls))
+		wake_up(&r->res_ls->ls_wait_general);
+}
+
+/*
+ * The lvb needs to be recovered on all master rsb's.  This includes setting
+ * the VALNOTVALID flag if necessary, and determining the correct lvb contents
+ * based on the lvb's of the locks held on the rsb.
+ *
+ * RSB_VALNOTVALID is set in two cases:
+ *
+ * 1. we are master, but not new, and we purged an EX/PW lock held by a
+ * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
+ *
+ * 2. we are a new master, and there are only NL/CR locks left.
+ * (We could probably improve this by only invaliding in this way when
+ * the previous master left uncleanly.  VMS docs mention that.)
+ *
+ * The LVB contents are only considered for changing when this is a new master
+ * of the rsb (NEW_MASTER2).  Then, the rsb's lvb is taken from any lkb with
+ * mode > CR.  If no lkb's exist with mode above CR, the lvb contents are taken
+ * from the lkb with the largest lvb sequence number.
+ */
+
+static void recover_lvb(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb, *high_lkb = NULL;
+	uint32_t high_seq = 0;
+	int lock_lvb_exists = 0;
+	int big_lock_exists = 0;
+	int lvblen = r->res_ls->ls_lvblen;
+
+	if (!rsb_flag(r, RSB_NEW_MASTER2) &&
+	    rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
+		/* case 1 above */
+		rsb_set_flag(r, RSB_VALNOTVALID);
+		return;
+	}
+
+	if (!rsb_flag(r, RSB_NEW_MASTER2))
+		return;
+
+	/* we are the new master, so figure out if VALNOTVALID should
+	   be set, and set the rsb lvb from the best lkb available. */
+
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			continue;
+
+		lock_lvb_exists = 1;
+
+		if (lkb->lkb_grmode > DLM_LOCK_CR) {
+			big_lock_exists = 1;
+			goto setflag;
+		}
+
+		if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
+			high_lkb = lkb;
+			high_seq = lkb->lkb_lvbseq;
+		}
+	}
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			continue;
+
+		lock_lvb_exists = 1;
+
+		if (lkb->lkb_grmode > DLM_LOCK_CR) {
+			big_lock_exists = 1;
+			goto setflag;
+		}
+
+		if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
+			high_lkb = lkb;
+			high_seq = lkb->lkb_lvbseq;
+		}
+	}
+
+ setflag:
+	if (!lock_lvb_exists)
+		goto out;
+
+	/* lvb is invalidated if only NL/CR locks remain */
+	if (!big_lock_exists)
+		rsb_set_flag(r, RSB_VALNOTVALID);
+
+	if (!r->res_lvbptr) {
+		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
+		if (!r->res_lvbptr)
+			goto out;
+	}
+
+	if (big_lock_exists) {
+		r->res_lvbseq = lkb->lkb_lvbseq;
+		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
+	} else if (high_lkb) {
+		r->res_lvbseq = high_lkb->lkb_lvbseq;
+		memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
+	} else {
+		r->res_lvbseq = 0;
+		memset(r->res_lvbptr, 0, lvblen);
+	}
+ out:
+	return;
+}
+
+/* All master rsb's flagged RECOVER_CONVERT need to be looked at.  The locks
+   converting PR->CW or CW->PR need to have their lkb_grmode set. */
+
+static void recover_conversion(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	struct dlm_lkb *lkb;
+	int grmode = -1;
+
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		if (lkb->lkb_grmode == DLM_LOCK_PR ||
+		    lkb->lkb_grmode == DLM_LOCK_CW) {
+			grmode = lkb->lkb_grmode;
+			break;
+		}
+	}
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		if (lkb->lkb_grmode != DLM_LOCK_IV)
+			continue;
+		if (grmode == -1) {
+			log_debug(ls, "recover_conversion %x set gr to rq %d",
+				  lkb->lkb_id, lkb->lkb_rqmode);
+			lkb->lkb_grmode = lkb->lkb_rqmode;
+		} else {
+			log_debug(ls, "recover_conversion %x set gr %d",
+				  lkb->lkb_id, grmode);
+			lkb->lkb_grmode = grmode;
+		}
+	}
+}
+
+/* We've become the new master for this rsb and waiting/converting locks may
+   need to be granted in dlm_recover_grant() due to locks that may have
+   existed from a removed node. */
+
+static void recover_grant(struct dlm_rsb *r)
+{
+	if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
+		rsb_set_flag(r, RSB_RECOVER_GRANT);
+}
+
+void dlm_recover_rsbs(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	unsigned int count = 0;
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		lock_rsb(r);
+		if (is_master(r)) {
+			if (rsb_flag(r, RSB_RECOVER_CONVERT))
+				recover_conversion(r);
+
+			/* recover lvb before granting locks so the updated
+			   lvb/VALNOTVALID is presented in the completion */
+			recover_lvb(r);
+
+			if (rsb_flag(r, RSB_NEW_MASTER2))
+				recover_grant(r);
+			count++;
+		} else {
+			rsb_clear_flag(r, RSB_VALNOTVALID);
+		}
+		rsb_clear_flag(r, RSB_RECOVER_CONVERT);
+		rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
+		rsb_clear_flag(r, RSB_NEW_MASTER2);
+		unlock_rsb(r);
+	}
+	up_read(&ls->ls_root_sem);
+
+	if (count)
+		log_rinfo(ls, "dlm_recover_rsbs %d done", count);
+}
+
+/* Create a single list of all root rsb's to be used during recovery */
+
+int dlm_create_root_list(struct dlm_ls *ls)
+{
+	struct rb_node *n;
+	struct dlm_rsb *r;
+	int i, error = 0;
+
+	down_write(&ls->ls_root_sem);
+	if (!list_empty(&ls->ls_root_list)) {
+		log_error(ls, "root list not empty");
+		error = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		spin_lock(&ls->ls_rsbtbl[i].lock);
+		for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
+			r = rb_entry(n, struct dlm_rsb, res_hashnode);
+			list_add(&r->res_root_list, &ls->ls_root_list);
+			dlm_hold_rsb(r);
+		}
+
+		if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[i].toss))
+			log_error(ls, "dlm_create_root_list toss not empty");
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+ out:
+	up_write(&ls->ls_root_sem);
+	return error;
+}
+
+void dlm_release_root_list(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r, *safe;
+
+	down_write(&ls->ls_root_sem);
+	list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
+		list_del_init(&r->res_root_list);
+		dlm_put_rsb(r);
+	}
+	up_write(&ls->ls_root_sem);
+}
+
+void dlm_clear_toss(struct dlm_ls *ls)
+{
+	struct rb_node *n, *next;
+	struct dlm_rsb *r;
+	unsigned int count = 0;
+	int i;
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		spin_lock(&ls->ls_rsbtbl[i].lock);
+		for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) {
+			next = rb_next(n);
+			r = rb_entry(n, struct dlm_rsb, res_hashnode);
+			rb_erase(n, &ls->ls_rsbtbl[i].toss);
+			dlm_free_rsb(r);
+			count++;
+		}
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+
+	if (count)
+		log_rinfo(ls, "dlm_clear_toss %u done", count);
+}
+
diff --git a/kernel/fs/dlm/recover.h b/kernel/fs/dlm/recover.h
new file mode 100644
index 000000000..d8c8738c7
--- /dev/null
+++ b/kernel/fs/dlm/recover.h
@@ -0,0 +1,34 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVER_DOT_H__
+#define __RECOVER_DOT_H__
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
+uint32_t dlm_recover_status(struct dlm_ls *ls);
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
+int dlm_recover_members_wait(struct dlm_ls *ls);
+int dlm_recover_directory_wait(struct dlm_ls *ls);
+int dlm_recover_locks_wait(struct dlm_ls *ls);
+int dlm_recover_done_wait(struct dlm_ls *ls);
+int dlm_recover_masters(struct dlm_ls *ls);
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_locks(struct dlm_ls *ls);
+void dlm_recovered_lock(struct dlm_rsb *r);
+int dlm_create_root_list(struct dlm_ls *ls);
+void dlm_release_root_list(struct dlm_ls *ls);
+void dlm_clear_toss(struct dlm_ls *ls);
+void dlm_recover_rsbs(struct dlm_ls *ls);
+
+#endif				/* __RECOVER_DOT_H__ */
+
diff --git a/kernel/fs/dlm/recoverd.c b/kernel/fs/dlm/recoverd.c
new file mode 100644
index 000000000..6859b4bf9
--- /dev/null
+++ b/kernel/fs/dlm/recoverd.c
@@ -0,0 +1,342 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "dir.h"
+#include "ast.h"
+#include "recover.h"
+#include "lowcomms.h"
+#include "lock.h"
+#include "requestqueue.h"
+#include "recoverd.h"
+
+
+/* If the start for which we're re-enabling locking (seq) has been superseded
+   by a newer stop (ls_recover_seq), we need to leave locking disabled.
+
+   We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
+   locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
+   enables locking and clears the requestqueue between a and b. */
+
+static int enable_locking(struct dlm_ls *ls, uint64_t seq)
+{
+	int error = -EINTR;
+
+	down_write(&ls->ls_recv_active);
+
+	spin_lock(&ls->ls_recover_lock);
+	if (ls->ls_recover_seq == seq) {
+		set_bit(LSFL_RUNNING, &ls->ls_flags);
+		/* unblocks processes waiting to enter the dlm */
+		up_write(&ls->ls_in_recovery);
+		clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
+		error = 0;
+	}
+	spin_unlock(&ls->ls_recover_lock);
+
+	up_write(&ls->ls_recv_active);
+	return error;
+}
+
+static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
+{
+	unsigned long start;
+	int error, neg = 0;
+
+	log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
+
+	mutex_lock(&ls->ls_recoverd_active);
+
+	dlm_callback_suspend(ls);
+
+	dlm_clear_toss(ls);
+
+	/*
+	 * This list of root rsb's will be the basis of most of the recovery
+	 * routines.
+	 */
+
+	dlm_create_root_list(ls);
+
+	/*
+	 * Add or remove nodes from the lockspace's ls_nodes list.
+	 */
+
+	error = dlm_recover_members(ls, rv, &neg);
+	if (error) {
+		log_rinfo(ls, "dlm_recover_members error %d", error);
+		goto fail;
+	}
+
+	dlm_recover_dir_nodeid(ls);
+
+	ls->ls_recover_dir_sent_res = 0;
+	ls->ls_recover_dir_sent_msg = 0;
+	ls->ls_recover_locks_in = 0;
+
+	dlm_set_recover_status(ls, DLM_RS_NODES);
+
+	error = dlm_recover_members_wait(ls);
+	if (error) {
+		log_rinfo(ls, "dlm_recover_members_wait error %d", error);
+		goto fail;
+	}
+
+	start = jiffies;
+
+	/*
+	 * Rebuild our own share of the directory by collecting from all other
+	 * nodes their master rsb names that hash to us.
+	 */
+
+	error = dlm_recover_directory(ls);
+	if (error) {
+		log_rinfo(ls, "dlm_recover_directory error %d", error);
+		goto fail;
+	}
+
+	dlm_set_recover_status(ls, DLM_RS_DIR);
+
+	error = dlm_recover_directory_wait(ls);
+	if (error) {
+		log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
+		goto fail;
+	}
+
+	log_rinfo(ls, "dlm_recover_directory %u out %u messages",
+		  ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
+
+	/*
+	 * We may have outstanding operations that are waiting for a reply from
+	 * a failed node.  Mark these to be resent after recovery.  Unlock and
+	 * cancel ops can just be completed.
+	 */
+
+	dlm_recover_waiters_pre(ls);
+
+	error = dlm_recovery_stopped(ls);
+	if (error)
+		goto fail;
+
+	if (neg || dlm_no_directory(ls)) {
+		/*
+		 * Clear lkb's for departed nodes.
+		 */
+
+		dlm_recover_purge(ls);
+
+		/*
+		 * Get new master nodeid's for rsb's that were mastered on
+		 * departed nodes.
+		 */
+
+		error = dlm_recover_masters(ls);
+		if (error) {
+			log_rinfo(ls, "dlm_recover_masters error %d", error);
+			goto fail;
+		}
+
+		/*
+		 * Send our locks on remastered rsb's to the new masters.
+		 */
+
+		error = dlm_recover_locks(ls);
+		if (error) {
+			log_rinfo(ls, "dlm_recover_locks error %d", error);
+			goto fail;
+		}
+
+		dlm_set_recover_status(ls, DLM_RS_LOCKS);
+
+		error = dlm_recover_locks_wait(ls);
+		if (error) {
+			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
+			goto fail;
+		}
+
+		log_rinfo(ls, "dlm_recover_locks %u in",
+			  ls->ls_recover_locks_in);
+
+		/*
+		 * Finalize state in master rsb's now that all locks can be
+		 * checked.  This includes conversion resolution and lvb
+		 * settings.
+		 */
+
+		dlm_recover_rsbs(ls);
+	} else {
+		/*
+		 * Other lockspace members may be going through the "neg" steps
+		 * while also adding us to the lockspace, in which case they'll
+		 * be doing the recover_locks (RS_LOCKS) barrier.
+		 */
+		dlm_set_recover_status(ls, DLM_RS_LOCKS);
+
+		error = dlm_recover_locks_wait(ls);
+		if (error) {
+			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
+			goto fail;
+		}
+	}
+
+	dlm_release_root_list(ls);
+
+	/*
+	 * Purge directory-related requests that are saved in requestqueue.
+	 * All dir requests from before recovery are invalid now due to the dir
+	 * rebuild and will be resent by the requesting nodes.
+	 */
+
+	dlm_purge_requestqueue(ls);
+
+	dlm_set_recover_status(ls, DLM_RS_DONE);
+
+	error = dlm_recover_done_wait(ls);
+	if (error) {
+		log_rinfo(ls, "dlm_recover_done_wait error %d", error);
+		goto fail;
+	}
+
+	dlm_clear_members_gone(ls);
+
+	dlm_adjust_timeouts(ls);
+
+	dlm_callback_resume(ls);
+
+	error = enable_locking(ls, rv->seq);
+	if (error) {
+		log_rinfo(ls, "enable_locking error %d", error);
+		goto fail;
+	}
+
+	error = dlm_process_requestqueue(ls);
+	if (error) {
+		log_rinfo(ls, "dlm_process_requestqueue error %d", error);
+		goto fail;
+	}
+
+	error = dlm_recover_waiters_post(ls);
+	if (error) {
+		log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
+		goto fail;
+	}
+
+	dlm_recover_grant(ls);
+
+	log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
+		  (unsigned long long)rv->seq, ls->ls_generation,
+		  jiffies_to_msecs(jiffies - start));
+	mutex_unlock(&ls->ls_recoverd_active);
+
+	dlm_lsop_recover_done(ls);
+	return 0;
+
+ fail:
+	dlm_release_root_list(ls);
+	log_rinfo(ls, "dlm_recover %llu error %d",
+		  (unsigned long long)rv->seq, error);
+	mutex_unlock(&ls->ls_recoverd_active);
+	return error;
+}
+
+/* The dlm_ls_start() that created the rv we take here may already have been
+   stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
+   flag set. */
+
+static void do_ls_recovery(struct dlm_ls *ls)
+{
+	struct dlm_recover *rv = NULL;
+
+	spin_lock(&ls->ls_recover_lock);
+	rv = ls->ls_recover_args;
+	ls->ls_recover_args = NULL;
+	if (rv && ls->ls_recover_seq == rv->seq)
+		clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
+	spin_unlock(&ls->ls_recover_lock);
+
+	if (rv) {
+		ls_recover(ls, rv);
+		kfree(rv->nodes);
+		kfree(rv);
+	}
+}
+
+static int dlm_recoverd(void *arg)
+{
+	struct dlm_ls *ls;
+
+	ls = dlm_find_lockspace_local(arg);
+	if (!ls) {
+		log_print("dlm_recoverd: no lockspace %p", arg);
+		return -1;
+	}
+
+	down_write(&ls->ls_in_recovery);
+	set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
+	wake_up(&ls->ls_recover_lock_wait);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
+		    !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags))
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
+			down_write(&ls->ls_in_recovery);
+			set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
+			wake_up(&ls->ls_recover_lock_wait);
+		}
+
+		if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
+			do_ls_recovery(ls);
+	}
+
+	if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
+		up_write(&ls->ls_in_recovery);
+
+	dlm_put_lockspace(ls);
+	return 0;
+}
+
+int dlm_recoverd_start(struct dlm_ls *ls)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
+	if (IS_ERR(p))
+		error = PTR_ERR(p);
+	else
+                ls->ls_recoverd_task = p;
+	return error;
+}
+
+void dlm_recoverd_stop(struct dlm_ls *ls)
+{
+	kthread_stop(ls->ls_recoverd_task);
+}
+
+void dlm_recoverd_suspend(struct dlm_ls *ls)
+{
+	wake_up(&ls->ls_wait_general);
+	mutex_lock(&ls->ls_recoverd_active);
+}
+
+void dlm_recoverd_resume(struct dlm_ls *ls)
+{
+	mutex_unlock(&ls->ls_recoverd_active);
+}
+
diff --git a/kernel/fs/dlm/recoverd.h b/kernel/fs/dlm/recoverd.h
new file mode 100644
index 000000000..885607973
--- /dev/null
+++ b/kernel/fs/dlm/recoverd.h
@@ -0,0 +1,23 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVERD_DOT_H__
+#define __RECOVERD_DOT_H__
+
+void dlm_recoverd_stop(struct dlm_ls *ls);
+int dlm_recoverd_start(struct dlm_ls *ls);
+void dlm_recoverd_suspend(struct dlm_ls *ls);
+void dlm_recoverd_resume(struct dlm_ls *ls);
+
+#endif				/* __RECOVERD_DOT_H__ */
+
diff --git a/kernel/fs/dlm/requestqueue.c b/kernel/fs/dlm/requestqueue.c
new file mode 100644
index 000000000..1695f1b0d
--- /dev/null
+++ b/kernel/fs/dlm/requestqueue.c
@@ -0,0 +1,171 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "member.h"
+#include "lock.h"
+#include "dir.h"
+#include "config.h"
+#include "requestqueue.h"
+
+struct rq_entry {
+	struct list_head list;
+	uint32_t recover_seq;
+	int nodeid;
+	struct dlm_message request;
+};
+
+/*
+ * Requests received while the lockspace is in recovery get added to the
+ * request queue and processed when recovery is complete.  This happens when
+ * the lockspace is suspended on some nodes before it is on others, or the
+ * lockspace is enabled on some while still suspended on others.
+ */
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
+{
+	struct rq_entry *e;
+	int length = ms->m_header.h_length - sizeof(struct dlm_message);
+
+	e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
+	if (!e) {
+		log_print("dlm_add_requestqueue: out of memory len %d", length);
+		return;
+	}
+
+	e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
+	e->nodeid = nodeid;
+	memcpy(&e->request, ms, ms->m_header.h_length);
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+	list_add_tail(&e->list, &ls->ls_requestqueue);
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
+/*
+ * Called by dlm_recoverd to process normal messages saved while recovery was
+ * happening.  Normal locking has been enabled before this is called.  dlm_recv
+ * upon receiving a message, will wait for all saved messages to be drained
+ * here before processing the message it got.  If a new dlm_ls_stop() arrives
+ * while we're processing these saved messages, it may block trying to suspend
+ * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue.  In that
+ * case, we don't abort since locking_stopped is still 0.  If dlm_recv is not
+ * waiting for us, then this processing may be aborted due to locking_stopped.
+ */
+
+int dlm_process_requestqueue(struct dlm_ls *ls)
+{
+	struct rq_entry *e;
+	struct dlm_message *ms;
+	int error = 0;
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+
+	for (;;) {
+		if (list_empty(&ls->ls_requestqueue)) {
+			mutex_unlock(&ls->ls_requestqueue_mutex);
+			error = 0;
+			break;
+		}
+		e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
+		mutex_unlock(&ls->ls_requestqueue_mutex);
+
+		ms = &e->request;
+
+		log_limit(ls, "dlm_process_requestqueue msg %d from %d "
+			  "lkid %x remid %x result %d seq %u",
+			  ms->m_type, ms->m_header.h_nodeid,
+			  ms->m_lkid, ms->m_remid, ms->m_result,
+			  e->recover_seq);
+
+		dlm_receive_message_saved(ls, &e->request, e->recover_seq);
+
+		mutex_lock(&ls->ls_requestqueue_mutex);
+		list_del(&e->list);
+		kfree(e);
+
+		if (dlm_locking_stopped(ls)) {
+			log_debug(ls, "process_requestqueue abort running");
+			mutex_unlock(&ls->ls_requestqueue_mutex);
+			error = -EINTR;
+			break;
+		}
+		schedule();
+	}
+
+	return error;
+}
+
+/*
+ * After recovery is done, locking is resumed and dlm_recoverd takes all the
+ * saved requests and processes them as they would have been by dlm_recv.  At
+ * the same time, dlm_recv will start receiving new requests from remote nodes.
+ * We want to delay dlm_recv processing new requests until dlm_recoverd has
+ * finished processing the old saved requests.  We don't check for locking
+ * stopped here because dlm_ls_stop won't stop locking until it's suspended us
+ * (dlm_recv).
+ */
+
+void dlm_wait_requestqueue(struct dlm_ls *ls)
+{
+	for (;;) {
+		mutex_lock(&ls->ls_requestqueue_mutex);
+		if (list_empty(&ls->ls_requestqueue))
+			break;
+		mutex_unlock(&ls->ls_requestqueue_mutex);
+		schedule();
+	}
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
+static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
+{
+	uint32_t type = ms->m_type;
+
+	/* the ls is being cleaned up and freed by release_lockspace */
+	if (!ls->ls_count)
+		return 1;
+
+	if (dlm_is_removed(ls, nodeid))
+		return 1;
+
+	/* directory operations are always purged because the directory is
+	   always rebuilt during recovery and the lookups resent */
+
+	if (type == DLM_MSG_REMOVE ||
+	    type == DLM_MSG_LOOKUP ||
+	    type == DLM_MSG_LOOKUP_REPLY)
+		return 1;
+
+	if (!dlm_no_directory(ls))
+		return 0;
+
+	return 1;
+}
+
+void dlm_purge_requestqueue(struct dlm_ls *ls)
+{
+	struct dlm_message *ms;
+	struct rq_entry *e, *safe;
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+	list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
+		ms =  &e->request;
+
+		if (purge_request(ls, ms, e->nodeid)) {
+			list_del(&e->list);
+			kfree(e);
+		}
+	}
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
diff --git a/kernel/fs/dlm/requestqueue.h b/kernel/fs/dlm/requestqueue.h
new file mode 100644
index 000000000..10ce449b7
--- /dev/null
+++ b/kernel/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __REQUESTQUEUE_DOT_H__
+#define __REQUESTQUEUE_DOT_H__
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms);
+int dlm_process_requestqueue(struct dlm_ls *ls);
+void dlm_wait_requestqueue(struct dlm_ls *ls);
+void dlm_purge_requestqueue(struct dlm_ls *ls);
+
+#endif
+
diff --git a/kernel/fs/dlm/user.c b/kernel/fs/dlm/user.c
new file mode 100644
index 000000000..fb85f32e9
--- /dev/null
+++ b/kernel/fs/dlm/user.c
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/dlm.h>
+#include <linux/dlm_device.h>
+#include <linux/slab.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "lvb_table.h"
+#include "user.h"
+#include "ast.h"
+
+static const char name_prefix[] = "dlm";
+static const struct file_operations device_fops;
+static atomic_t dlm_monitor_opened;
+static int dlm_monitor_unused = 1;
+
+#ifdef CONFIG_COMPAT
+
+struct dlm_lock_params32 {
+	__u8 mode;
+	__u8 namelen;
+	__u16 unused;
+	__u32 flags;
+	__u32 lkid;
+	__u32 parent;
+	__u64 xid;
+	__u64 timeout;
+	__u32 castparam;
+	__u32 castaddr;
+	__u32 bastparam;
+	__u32 bastaddr;
+	__u32 lksb;
+	char lvb[DLM_USER_LVB_LEN];
+	char name[0];
+};
+
+struct dlm_write_request32 {
+	__u32 version[3];
+	__u8 cmd;
+	__u8 is64bit;
+	__u8 unused[2];
+
+	union  {
+		struct dlm_lock_params32 lock;
+		struct dlm_lspace_params lspace;
+		struct dlm_purge_params purge;
+	} i;
+};
+
+struct dlm_lksb32 {
+	__u32 sb_status;
+	__u32 sb_lkid;
+	__u8 sb_flags;
+	__u32 sb_lvbptr;
+};
+
+struct dlm_lock_result32 {
+	__u32 version[3];
+	__u32 length;
+	__u32 user_astaddr;
+	__u32 user_astparam;
+	__u32 user_lksb;
+	struct dlm_lksb32 lksb;
+	__u8 bast_mode;
+	__u8 unused[3];
+	/* Offsets may be zero if no data is present */
+	__u32 lvb_offset;
+};
+
+static void compat_input(struct dlm_write_request *kb,
+			 struct dlm_write_request32 *kb32,
+			 int namelen)
+{
+	kb->version[0] = kb32->version[0];
+	kb->version[1] = kb32->version[1];
+	kb->version[2] = kb32->version[2];
+
+	kb->cmd = kb32->cmd;
+	kb->is64bit = kb32->is64bit;
+	if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
+	    kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
+		kb->i.lspace.flags = kb32->i.lspace.flags;
+		kb->i.lspace.minor = kb32->i.lspace.minor;
+		memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen);
+	} else if (kb->cmd == DLM_USER_PURGE) {
+		kb->i.purge.nodeid = kb32->i.purge.nodeid;
+		kb->i.purge.pid = kb32->i.purge.pid;
+	} else {
+		kb->i.lock.mode = kb32->i.lock.mode;
+		kb->i.lock.namelen = kb32->i.lock.namelen;
+		kb->i.lock.flags = kb32->i.lock.flags;
+		kb->i.lock.lkid = kb32->i.lock.lkid;
+		kb->i.lock.parent = kb32->i.lock.parent;
+		kb->i.lock.xid = kb32->i.lock.xid;
+		kb->i.lock.timeout = kb32->i.lock.timeout;
+		kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
+		kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
+		kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
+		kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
+		kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
+		memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
+		memcpy(kb->i.lock.name, kb32->i.lock.name, namelen);
+	}
+}
+
+static void compat_output(struct dlm_lock_result *res,
+			  struct dlm_lock_result32 *res32)
+{
+	res32->version[0] = res->version[0];
+	res32->version[1] = res->version[1];
+	res32->version[2] = res->version[2];
+
+	res32->user_astaddr = (__u32)(long)res->user_astaddr;
+	res32->user_astparam = (__u32)(long)res->user_astparam;
+	res32->user_lksb = (__u32)(long)res->user_lksb;
+	res32->bast_mode = res->bast_mode;
+
+	res32->lvb_offset = res->lvb_offset;
+	res32->length = res->length;
+
+	res32->lksb.sb_status = res->lksb.sb_status;
+	res32->lksb.sb_flags = res->lksb.sb_flags;
+	res32->lksb.sb_lkid = res->lksb.sb_lkid;
+	res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
+}
+#endif
+
+/* Figure out if this lock is at the end of its life and no longer
+   available for the application to use.  The lkb still exists until
+   the final ast is read.  A lock becomes EOL in three situations:
+     1. a noqueue request fails with EAGAIN
+     2. an unlock completes with EUNLOCK
+     3. a cancel of a waiting request completes with ECANCEL/EDEADLK
+   An EOL lock needs to be removed from the process's list of locks.
+   And we can't allow any new operation on an EOL lock.  This is
+   not related to the lifetime of the lkb struct which is managed
+   entirely by refcount. */
+
+static int lkb_is_endoflife(int mode, int status)
+{
+	switch (status) {
+	case -DLM_EUNLOCK:
+		return 1;
+	case -DLM_ECANCEL:
+	case -ETIMEDOUT:
+	case -EDEADLK:
+	case -EAGAIN:
+		if (mode == DLM_LOCK_IV)
+			return 1;
+		break;
+	}
+	return 0;
+}
+
+/* we could possibly check if the cancel of an orphan has resulted in the lkb
+   being removed and then remove that lkb from the orphans list and free it */
+
+void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
+		      int status, uint32_t sbflags, uint64_t seq)
+{
+	struct dlm_ls *ls;
+	struct dlm_user_args *ua;
+	struct dlm_user_proc *proc;
+	int rv;
+
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
+		return;
+
+	ls = lkb->lkb_resource->res_ls;
+	mutex_lock(&ls->ls_clear_proc_locks);
+
+	/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
+	   can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
+	   lkb->ua so we can't try to use it.  This second check is necessary
+	   for cases where a completion ast is received for an operation that
+	   began before clear_proc_locks did its cancel/unlock. */
+
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
+		goto out;
+
+	DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb););
+	ua = lkb->lkb_ua;
+	proc = ua->proc;
+
+	if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL)
+		goto out;
+
+	if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status))
+		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
+
+	spin_lock(&proc->asts_spin);
+
+	rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
+	if (rv < 0) {
+		spin_unlock(&proc->asts_spin);
+		goto out;
+	}
+
+	if (list_empty(&lkb->lkb_cb_list)) {
+		kref_get(&lkb->lkb_ref);
+		list_add_tail(&lkb->lkb_cb_list, &proc->asts);
+		wake_up_interruptible(&proc->wait);
+	}
+	spin_unlock(&proc->asts_spin);
+
+	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
+		/* N.B. spin_lock locks_spin, not asts_spin */
+		spin_lock(&proc->locks_spin);
+		if (!list_empty(&lkb->lkb_ownqueue)) {
+			list_del_init(&lkb->lkb_ownqueue);
+			dlm_put_lkb(lkb);
+		}
+		spin_unlock(&proc->locks_spin);
+	}
+ out:
+	mutex_unlock(&ls->ls_clear_proc_locks);
+}
+
+static int device_user_lock(struct dlm_user_proc *proc,
+			    struct dlm_lock_params *params)
+{
+	struct dlm_ls *ls;
+	struct dlm_user_args *ua;
+	uint32_t lkid;
+	int error = -ENOMEM;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	if (!params->castaddr || !params->lksb) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
+	if (!ua)
+		goto out;
+	ua->proc = proc;
+	ua->user_lksb = params->lksb;
+	ua->castparam = params->castparam;
+	ua->castaddr = params->castaddr;
+	ua->bastparam = params->bastparam;
+	ua->bastaddr = params->bastaddr;
+	ua->xid = params->xid;
+
+	if (params->flags & DLM_LKF_CONVERT) {
+		error = dlm_user_convert(ls, ua,
+				         params->mode, params->flags,
+				         params->lkid, params->lvb,
+					 (unsigned long) params->timeout);
+	} else if (params->flags & DLM_LKF_ORPHAN) {
+		error = dlm_user_adopt_orphan(ls, ua,
+					 params->mode, params->flags,
+					 params->name, params->namelen,
+					 (unsigned long) params->timeout,
+					 &lkid);
+		if (!error)
+			error = lkid;
+	} else {
+		error = dlm_user_request(ls, ua,
+					 params->mode, params->flags,
+					 params->name, params->namelen,
+					 (unsigned long) params->timeout);
+		if (!error)
+			error = ua->lksb.sb_lkid;
+	}
+ out:
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+static int device_user_unlock(struct dlm_user_proc *proc,
+			      struct dlm_lock_params *params)
+{
+	struct dlm_ls *ls;
+	struct dlm_user_args *ua;
+	int error = -ENOMEM;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
+	if (!ua)
+		goto out;
+	ua->proc = proc;
+	ua->user_lksb = params->lksb;
+	ua->castparam = params->castparam;
+	ua->castaddr = params->castaddr;
+
+	if (params->flags & DLM_LKF_CANCEL)
+		error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
+	else
+		error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
+					params->lvb);
+ out:
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+static int device_user_deadlock(struct dlm_user_proc *proc,
+				struct dlm_lock_params *params)
+{
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = dlm_user_deadlock(ls, params->flags, params->lkid);
+
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+static int dlm_device_register(struct dlm_ls *ls, char *name)
+{
+	int error, len;
+
+	/* The device is already registered.  This happens when the
+	   lockspace is created multiple times from userspace. */
+	if (ls->ls_device.name)
+		return 0;
+
+	error = -ENOMEM;
+	len = strlen(name) + strlen(name_prefix) + 2;
+	ls->ls_device.name = kzalloc(len, GFP_NOFS);
+	if (!ls->ls_device.name)
+		goto fail;
+
+	snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
+		 name);
+	ls->ls_device.fops = &device_fops;
+	ls->ls_device.minor = MISC_DYNAMIC_MINOR;
+
+	error = misc_register(&ls->ls_device);
+	if (error) {
+		kfree(ls->ls_device.name);
+	}
+fail:
+	return error;
+}
+
+int dlm_device_deregister(struct dlm_ls *ls)
+{
+	int error;
+
+	/* The device is not registered.  This happens when the lockspace
+	   was never used from userspace, or when device_create_lockspace()
+	   calls dlm_release_lockspace() after the register fails. */
+	if (!ls->ls_device.name)
+		return 0;
+
+	error = misc_deregister(&ls->ls_device);
+	if (!error)
+		kfree(ls->ls_device.name);
+	return error;
+}
+
+static int device_user_purge(struct dlm_user_proc *proc,
+			     struct dlm_purge_params *params)
+{
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = dlm_user_purge(ls, proc, params->nodeid, params->pid);
+
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	error = dlm_new_lockspace(params->name, NULL, params->flags,
+				  DLM_USER_LVB_LEN, NULL, NULL, NULL,
+				  &lockspace);
+	if (error)
+		return error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = dlm_device_register(ls, params->name);
+	dlm_put_lockspace(ls);
+
+	if (error)
+		dlm_release_lockspace(lockspace, 0);
+	else
+		error = ls->ls_device.minor;
+
+	return error;
+}
+
+static int device_remove_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error, force = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ls = dlm_find_lockspace_device(params->minor);
+	if (!ls)
+		return -ENOENT;
+
+	if (params->flags & DLM_USER_LSFLG_FORCEFREE)
+		force = 2;
+
+	lockspace = ls->ls_local_handle;
+	dlm_put_lockspace(ls);
+
+	/* The final dlm_release_lockspace waits for references to go to
+	   zero, so all processes will need to close their device for the
+	   ls before the release will proceed.  release also calls the
+	   device_deregister above.  Converting a positive return value
+	   from release to zero means that userspace won't know when its
+	   release was the final one, but it shouldn't need to know. */
+
+	error = dlm_release_lockspace(lockspace, force);
+	if (error > 0)
+		error = 0;
+	return error;
+}
+
+/* Check the user's version matches ours */
+static int check_version(struct dlm_write_request *req)
+{
+	if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
+	    (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
+	     req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
+
+		printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
+		       "user (%d.%d.%d) kernel (%d.%d.%d)\n",
+		       current->comm,
+		       task_pid_nr(current),
+		       req->version[0],
+		       req->version[1],
+		       req->version[2],
+		       DLM_DEVICE_VERSION_MAJOR,
+		       DLM_DEVICE_VERSION_MINOR,
+		       DLM_DEVICE_VERSION_PATCH);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * device_write
+ *
+ *   device_user_lock
+ *     dlm_user_request -> request_lock
+ *     dlm_user_convert -> convert_lock
+ *
+ *   device_user_unlock
+ *     dlm_user_unlock -> unlock_lock
+ *     dlm_user_cancel -> cancel_lock
+ *
+ *   device_create_lockspace
+ *     dlm_new_lockspace
+ *
+ *   device_remove_lockspace
+ *     dlm_release_lockspace
+ */
+
+/* a write to a lockspace device is a lock or unlock request, a write
+   to the control device is to create/remove a lockspace */
+
+static ssize_t device_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	struct dlm_write_request *kbuf;
+	int error;
+
+#ifdef CONFIG_COMPAT
+	if (count < sizeof(struct dlm_write_request32))
+#else
+	if (count < sizeof(struct dlm_write_request))
+#endif
+		return -EINVAL;
+
+	/*
+	 * can't compare against COMPAT/dlm_write_request32 because
+	 * we don't yet know if is64bit is zero
+	 */
+	if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
+		return -EINVAL;
+
+	kbuf = kzalloc(count + 1, GFP_NOFS);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, buf, count)) {
+		error = -EFAULT;
+		goto out_free;
+	}
+
+	if (check_version(kbuf)) {
+		error = -EBADE;
+		goto out_free;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (!kbuf->is64bit) {
+		struct dlm_write_request32 *k32buf;
+		int namelen = 0;
+
+		if (count > sizeof(struct dlm_write_request32))
+			namelen = count - sizeof(struct dlm_write_request32);
+
+		k32buf = (struct dlm_write_request32 *)kbuf;
+
+		/* add 1 after namelen so that the name string is terminated */
+		kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
+			       GFP_NOFS);
+		if (!kbuf) {
+			kfree(k32buf);
+			return -ENOMEM;
+		}
+
+		if (proc)
+			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
+
+		compat_input(kbuf, k32buf, namelen);
+		kfree(k32buf);
+	}
+#endif
+
+	/* do we really need this? can a write happen after a close? */
+	if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
+	    (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) {
+		error = -EINVAL;
+		goto out_free;
+	}
+
+	error = -EINVAL;
+
+	switch (kbuf->cmd)
+	{
+	case DLM_USER_LOCK:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_free;
+		}
+		error = device_user_lock(proc, &kbuf->i.lock);
+		break;
+
+	case DLM_USER_UNLOCK:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_free;
+		}
+		error = device_user_unlock(proc, &kbuf->i.lock);
+		break;
+
+	case DLM_USER_DEADLOCK:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_free;
+		}
+		error = device_user_deadlock(proc, &kbuf->i.lock);
+		break;
+
+	case DLM_USER_CREATE_LOCKSPACE:
+		if (proc) {
+			log_print("create/remove only on control device");
+			goto out_free;
+		}
+		error = device_create_lockspace(&kbuf->i.lspace);
+		break;
+
+	case DLM_USER_REMOVE_LOCKSPACE:
+		if (proc) {
+			log_print("create/remove only on control device");
+			goto out_free;
+		}
+		error = device_remove_lockspace(&kbuf->i.lspace);
+		break;
+
+	case DLM_USER_PURGE:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_free;
+		}
+		error = device_user_purge(proc, &kbuf->i.purge);
+		break;
+
+	default:
+		log_print("Unknown command passed to DLM device : %d\n",
+			  kbuf->cmd);
+	}
+
+ out_free:
+	kfree(kbuf);
+	return error;
+}
+
+/* Every process that opens the lockspace device has its own "proc" structure
+   hanging off the open file that's used to keep track of locks owned by the
+   process and asts that need to be delivered to the process. */
+
+static int device_open(struct inode *inode, struct file *file)
+{
+	struct dlm_user_proc *proc;
+	struct dlm_ls *ls;
+
+	ls = dlm_find_lockspace_device(iminor(inode));
+	if (!ls)
+		return -ENOENT;
+
+	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
+	if (!proc) {
+		dlm_put_lockspace(ls);
+		return -ENOMEM;
+	}
+
+	proc->lockspace = ls->ls_local_handle;
+	INIT_LIST_HEAD(&proc->asts);
+	INIT_LIST_HEAD(&proc->locks);
+	INIT_LIST_HEAD(&proc->unlocking);
+	spin_lock_init(&proc->asts_spin);
+	spin_lock_init(&proc->locks_spin);
+	init_waitqueue_head(&proc->wait);
+	file->private_data = proc;
+
+	return 0;
+}
+
+static int device_close(struct inode *inode, struct file *file)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	struct dlm_ls *ls;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
+
+	dlm_clear_proc_locks(ls, proc);
+
+	/* at this point no more lkb's should exist for this lockspace,
+	   so there's no chance of dlm_user_add_ast() being called and
+	   looking for lkb->ua->proc */
+
+	kfree(proc);
+	file->private_data = NULL;
+
+	dlm_put_lockspace(ls);
+	dlm_put_lockspace(ls);  /* for the find in device_open() */
+
+	/* FIXME: AUTOFREE: if this ls is no longer used do
+	   device_remove_lockspace() */
+
+	return 0;
+}
+
+static int copy_result_to_user(struct dlm_user_args *ua, int compat,
+			       uint32_t flags, int mode, int copy_lvb,
+			       char __user *buf, size_t count)
+{
+#ifdef CONFIG_COMPAT
+	struct dlm_lock_result32 result32;
+#endif
+	struct dlm_lock_result result;
+	void *resultptr;
+	int error=0;
+	int len;
+	int struct_len;
+
+	memset(&result, 0, sizeof(struct dlm_lock_result));
+	result.version[0] = DLM_DEVICE_VERSION_MAJOR;
+	result.version[1] = DLM_DEVICE_VERSION_MINOR;
+	result.version[2] = DLM_DEVICE_VERSION_PATCH;
+	memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
+	result.user_lksb = ua->user_lksb;
+
+	/* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
+	   in a conversion unless the conversion is successful.  See code
+	   in dlm_user_convert() for updating ua from ua_tmp.  OpenVMS, though,
+	   notes that a new blocking AST address and parameter are set even if
+	   the conversion fails, so maybe we should just do that. */
+
+	if (flags & DLM_CB_BAST) {
+		result.user_astaddr = ua->bastaddr;
+		result.user_astparam = ua->bastparam;
+		result.bast_mode = mode;
+	} else {
+		result.user_astaddr = ua->castaddr;
+		result.user_astparam = ua->castparam;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (compat)
+		len = sizeof(struct dlm_lock_result32);
+	else
+#endif
+		len = sizeof(struct dlm_lock_result);
+	struct_len = len;
+
+	/* copy lvb to userspace if there is one, it's been updated, and
+	   the user buffer has space for it */
+
+	if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) {
+		if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
+				 DLM_USER_LVB_LEN)) {
+			error = -EFAULT;
+			goto out;
+		}
+
+		result.lvb_offset = len;
+		len += DLM_USER_LVB_LEN;
+	}
+
+	result.length = len;
+	resultptr = &result;
+#ifdef CONFIG_COMPAT
+	if (compat) {
+		compat_output(&result, &result32);
+		resultptr = &result32;
+	}
+#endif
+
+	if (copy_to_user(buf, resultptr, struct_len))
+		error = -EFAULT;
+	else
+		error = len;
+ out:
+	return error;
+}
+
+static int copy_version_to_user(char __user *buf, size_t count)
+{
+	struct dlm_device_version ver;
+
+	memset(&ver, 0, sizeof(struct dlm_device_version));
+	ver.version[0] = DLM_DEVICE_VERSION_MAJOR;
+	ver.version[1] = DLM_DEVICE_VERSION_MINOR;
+	ver.version[2] = DLM_DEVICE_VERSION_PATCH;
+
+	if (copy_to_user(buf, &ver, sizeof(struct dlm_device_version)))
+		return -EFAULT;
+	return sizeof(struct dlm_device_version);
+}
+
+/* a read returns a single ast described in a struct dlm_lock_result */
+
+static ssize_t device_read(struct file *file, char __user *buf, size_t count,
+			   loff_t *ppos)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	struct dlm_lkb *lkb;
+	DECLARE_WAITQUEUE(wait, current);
+	struct dlm_callback cb;
+	int rv, resid, copy_lvb = 0;
+
+	if (count == sizeof(struct dlm_device_version)) {
+		rv = copy_version_to_user(buf, count);
+		return rv;
+	}
+
+	if (!proc) {
+		log_print("non-version read from control device %zu", count);
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (count < sizeof(struct dlm_lock_result32))
+#else
+	if (count < sizeof(struct dlm_lock_result))
+#endif
+		return -EINVAL;
+
+ try_another:
+
+	/* do we really need this? can a read happen after a close? */
+	if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
+		return -EINVAL;
+
+	spin_lock(&proc->asts_spin);
+	if (list_empty(&proc->asts)) {
+		if (file->f_flags & O_NONBLOCK) {
+			spin_unlock(&proc->asts_spin);
+			return -EAGAIN;
+		}
+
+		add_wait_queue(&proc->wait, &wait);
+
+	repeat:
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (list_empty(&proc->asts) && !signal_pending(current)) {
+			spin_unlock(&proc->asts_spin);
+			schedule();
+			spin_lock(&proc->asts_spin);
+			goto repeat;
+		}
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&proc->wait, &wait);
+
+		if (signal_pending(current)) {
+			spin_unlock(&proc->asts_spin);
+			return -ERESTARTSYS;
+		}
+	}
+
+	/* if we empty lkb_callbacks, we don't want to unlock the spinlock
+	   without removing lkb_cb_list; so empty lkb_cb_list is always
+	   consistent with empty lkb_callbacks */
+
+	lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list);
+
+	rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
+	if (rv < 0) {
+		/* this shouldn't happen; lkb should have been removed from
+		   list when resid was zero */
+		log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
+		list_del_init(&lkb->lkb_cb_list);
+		spin_unlock(&proc->asts_spin);
+		/* removes ref for proc->asts, may cause lkb to be freed */
+		dlm_put_lkb(lkb);
+		goto try_another;
+	}
+	if (!resid)
+		list_del_init(&lkb->lkb_cb_list);
+	spin_unlock(&proc->asts_spin);
+
+	if (cb.flags & DLM_CB_SKIP) {
+		/* removes ref for proc->asts, may cause lkb to be freed */
+		if (!resid)
+			dlm_put_lkb(lkb);
+		goto try_another;
+	}
+
+	if (cb.flags & DLM_CB_CAST) {
+		int old_mode, new_mode;
+
+		old_mode = lkb->lkb_last_cast.mode;
+		new_mode = cb.mode;
+
+		if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
+		    dlm_lvb_operations[old_mode + 1][new_mode + 1])
+			copy_lvb = 1;
+
+		lkb->lkb_lksb->sb_status = cb.sb_status;
+		lkb->lkb_lksb->sb_flags = cb.sb_flags;
+	}
+
+	rv = copy_result_to_user(lkb->lkb_ua,
+				 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				 cb.flags, cb.mode, copy_lvb, buf, count);
+
+	/* removes ref for proc->asts, may cause lkb to be freed */
+	if (!resid)
+		dlm_put_lkb(lkb);
+
+	return rv;
+}
+
+static unsigned int device_poll(struct file *file, poll_table *wait)
+{
+	struct dlm_user_proc *proc = file->private_data;
+
+	poll_wait(file, &proc->wait, wait);
+
+	spin_lock(&proc->asts_spin);
+	if (!list_empty(&proc->asts)) {
+		spin_unlock(&proc->asts_spin);
+		return POLLIN | POLLRDNORM;
+	}
+	spin_unlock(&proc->asts_spin);
+	return 0;
+}
+
+int dlm_user_daemon_available(void)
+{
+	/* dlm_controld hasn't started (or, has started, but not
+	   properly populated configfs) */
+
+	if (!dlm_our_nodeid())
+		return 0;
+
+	/* This is to deal with versions of dlm_controld that don't
+	   know about the monitor device.  We assume that if the
+	   dlm_controld was started (above), but the monitor device
+	   was never opened, that it's an old version.  dlm_controld
+	   should open the monitor device before populating configfs. */
+
+	if (dlm_monitor_unused)
+		return 1;
+
+	return atomic_read(&dlm_monitor_opened) ? 1 : 0;
+}
+
+static int ctl_device_open(struct inode *inode, struct file *file)
+{
+	file->private_data = NULL;
+	return 0;
+}
+
+static int ctl_device_close(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static int monitor_device_open(struct inode *inode, struct file *file)
+{
+	atomic_inc(&dlm_monitor_opened);
+	dlm_monitor_unused = 0;
+	return 0;
+}
+
+static int monitor_device_close(struct inode *inode, struct file *file)
+{
+	if (atomic_dec_and_test(&dlm_monitor_opened))
+		dlm_stop_lockspaces();
+	return 0;
+}
+
+static const struct file_operations device_fops = {
+	.open    = device_open,
+	.release = device_close,
+	.read    = device_read,
+	.write   = device_write,
+	.poll    = device_poll,
+	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
+};
+
+static const struct file_operations ctl_device_fops = {
+	.open    = ctl_device_open,
+	.release = ctl_device_close,
+	.read    = device_read,
+	.write   = device_write,
+	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
+};
+
+static struct miscdevice ctl_device = {
+	.name  = "dlm-control",
+	.fops  = &ctl_device_fops,
+	.minor = MISC_DYNAMIC_MINOR,
+};
+
+static const struct file_operations monitor_device_fops = {
+	.open    = monitor_device_open,
+	.release = monitor_device_close,
+	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
+};
+
+static struct miscdevice monitor_device = {
+	.name  = "dlm-monitor",
+	.fops  = &monitor_device_fops,
+	.minor = MISC_DYNAMIC_MINOR,
+};
+
+int __init dlm_user_init(void)
+{
+	int error;
+
+	atomic_set(&dlm_monitor_opened, 0);
+
+	error = misc_register(&ctl_device);
+	if (error) {
+		log_print("misc_register failed for control device");
+		goto out;
+	}
+
+	error = misc_register(&monitor_device);
+	if (error) {
+		log_print("misc_register failed for monitor device");
+		misc_deregister(&ctl_device);
+	}
+ out:
+	return error;
+}
+
+void dlm_user_exit(void)
+{
+	misc_deregister(&ctl_device);
+	misc_deregister(&monitor_device);
+}
+
diff --git a/kernel/fs/dlm/user.h b/kernel/fs/dlm/user.h
new file mode 100644
index 000000000..00499ab88
--- /dev/null
+++ b/kernel/fs/dlm/user.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __USER_DOT_H__
+#define __USER_DOT_H__
+
+void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
+                      int status, uint32_t sbflags, uint64_t seq);
+int dlm_user_init(void);
+void dlm_user_exit(void);
+int dlm_device_deregister(struct dlm_ls *ls);
+int dlm_user_daemon_available(void);
+
+#endif
diff --git a/kernel/fs/dlm/util.c b/kernel/fs/dlm/util.c
new file mode 100644
index 000000000..e36520af7
--- /dev/null
+++ b/kernel/fs/dlm/util.c
@@ -0,0 +1,154 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "rcom.h"
+#include "util.h"
+
+#define DLM_ERRNO_EDEADLK		35
+#define DLM_ERRNO_EBADR			53
+#define DLM_ERRNO_EBADSLT		57
+#define DLM_ERRNO_EPROTO		71
+#define DLM_ERRNO_EOPNOTSUPP		95
+#define DLM_ERRNO_ETIMEDOUT	       110
+#define DLM_ERRNO_EINPROGRESS	       115
+
+static void header_out(struct dlm_header *hd)
+{
+	hd->h_version		= cpu_to_le32(hd->h_version);
+	hd->h_lockspace		= cpu_to_le32(hd->h_lockspace);
+	hd->h_nodeid		= cpu_to_le32(hd->h_nodeid);
+	hd->h_length		= cpu_to_le16(hd->h_length);
+}
+
+static void header_in(struct dlm_header *hd)
+{
+	hd->h_version		= le32_to_cpu(hd->h_version);
+	hd->h_lockspace		= le32_to_cpu(hd->h_lockspace);
+	hd->h_nodeid		= le32_to_cpu(hd->h_nodeid);
+	hd->h_length		= le16_to_cpu(hd->h_length);
+}
+
+/* higher errno values are inconsistent across architectures, so select
+   one set of values for on the wire */
+
+static int to_dlm_errno(int err)
+{
+	switch (err) {
+	case -EDEADLK:
+		return -DLM_ERRNO_EDEADLK;
+	case -EBADR:
+		return -DLM_ERRNO_EBADR;
+	case -EBADSLT:
+		return -DLM_ERRNO_EBADSLT;
+	case -EPROTO:
+		return -DLM_ERRNO_EPROTO;
+	case -EOPNOTSUPP:
+		return -DLM_ERRNO_EOPNOTSUPP;
+	case -ETIMEDOUT:
+		return -DLM_ERRNO_ETIMEDOUT;
+	case -EINPROGRESS:
+		return -DLM_ERRNO_EINPROGRESS;
+	}
+	return err;
+}
+
+static int from_dlm_errno(int err)
+{
+	switch (err) {
+	case -DLM_ERRNO_EDEADLK:
+		return -EDEADLK;
+	case -DLM_ERRNO_EBADR:
+		return -EBADR;
+	case -DLM_ERRNO_EBADSLT:
+		return -EBADSLT;
+	case -DLM_ERRNO_EPROTO:
+		return -EPROTO;
+	case -DLM_ERRNO_EOPNOTSUPP:
+		return -EOPNOTSUPP;
+	case -DLM_ERRNO_ETIMEDOUT:
+		return -ETIMEDOUT;
+	case -DLM_ERRNO_EINPROGRESS:
+		return -EINPROGRESS;
+	}
+	return err;
+}
+
+void dlm_message_out(struct dlm_message *ms)
+{
+	header_out(&ms->m_header);
+
+	ms->m_type		= cpu_to_le32(ms->m_type);
+	ms->m_nodeid		= cpu_to_le32(ms->m_nodeid);
+	ms->m_pid		= cpu_to_le32(ms->m_pid);
+	ms->m_lkid		= cpu_to_le32(ms->m_lkid);
+	ms->m_remid		= cpu_to_le32(ms->m_remid);
+	ms->m_parent_lkid	= cpu_to_le32(ms->m_parent_lkid);
+	ms->m_parent_remid	= cpu_to_le32(ms->m_parent_remid);
+	ms->m_exflags		= cpu_to_le32(ms->m_exflags);
+	ms->m_sbflags		= cpu_to_le32(ms->m_sbflags);
+	ms->m_flags		= cpu_to_le32(ms->m_flags);
+	ms->m_lvbseq		= cpu_to_le32(ms->m_lvbseq);
+	ms->m_hash		= cpu_to_le32(ms->m_hash);
+	ms->m_status		= cpu_to_le32(ms->m_status);
+	ms->m_grmode		= cpu_to_le32(ms->m_grmode);
+	ms->m_rqmode		= cpu_to_le32(ms->m_rqmode);
+	ms->m_bastmode		= cpu_to_le32(ms->m_bastmode);
+	ms->m_asts		= cpu_to_le32(ms->m_asts);
+	ms->m_result		= cpu_to_le32(to_dlm_errno(ms->m_result));
+}
+
+void dlm_message_in(struct dlm_message *ms)
+{
+	header_in(&ms->m_header);
+
+	ms->m_type		= le32_to_cpu(ms->m_type);
+	ms->m_nodeid		= le32_to_cpu(ms->m_nodeid);
+	ms->m_pid		= le32_to_cpu(ms->m_pid);
+	ms->m_lkid		= le32_to_cpu(ms->m_lkid);
+	ms->m_remid		= le32_to_cpu(ms->m_remid);
+	ms->m_parent_lkid	= le32_to_cpu(ms->m_parent_lkid);
+	ms->m_parent_remid	= le32_to_cpu(ms->m_parent_remid);
+	ms->m_exflags		= le32_to_cpu(ms->m_exflags);
+	ms->m_sbflags		= le32_to_cpu(ms->m_sbflags);
+	ms->m_flags		= le32_to_cpu(ms->m_flags);
+	ms->m_lvbseq		= le32_to_cpu(ms->m_lvbseq);
+	ms->m_hash		= le32_to_cpu(ms->m_hash);
+	ms->m_status		= le32_to_cpu(ms->m_status);
+	ms->m_grmode		= le32_to_cpu(ms->m_grmode);
+	ms->m_rqmode		= le32_to_cpu(ms->m_rqmode);
+	ms->m_bastmode		= le32_to_cpu(ms->m_bastmode);
+	ms->m_asts		= le32_to_cpu(ms->m_asts);
+	ms->m_result		= from_dlm_errno(le32_to_cpu(ms->m_result));
+}
+
+void dlm_rcom_out(struct dlm_rcom *rc)
+{
+	header_out(&rc->rc_header);
+
+	rc->rc_type		= cpu_to_le32(rc->rc_type);
+	rc->rc_result		= cpu_to_le32(rc->rc_result);
+	rc->rc_id		= cpu_to_le64(rc->rc_id);
+	rc->rc_seq		= cpu_to_le64(rc->rc_seq);
+	rc->rc_seq_reply	= cpu_to_le64(rc->rc_seq_reply);
+}
+
+void dlm_rcom_in(struct dlm_rcom *rc)
+{
+	header_in(&rc->rc_header);
+
+	rc->rc_type		= le32_to_cpu(rc->rc_type);
+	rc->rc_result		= le32_to_cpu(rc->rc_result);
+	rc->rc_id		= le64_to_cpu(rc->rc_id);
+	rc->rc_seq		= le64_to_cpu(rc->rc_seq);
+	rc->rc_seq_reply	= le64_to_cpu(rc->rc_seq_reply);
+}
diff --git a/kernel/fs/dlm/util.h b/kernel/fs/dlm/util.h
new file mode 100644
index 000000000..2b2591516
--- /dev/null
+++ b/kernel/fs/dlm/util.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+void dlm_message_out(struct dlm_message *ms);
+void dlm_message_in(struct dlm_message *ms);
+void dlm_rcom_out(struct dlm_rcom *rc);
+void dlm_rcom_in(struct dlm_rcom *rc);
+
+#endif
+
diff --git a/kernel/fs/drop_caches.c b/kernel/fs/drop_caches.c
new file mode 100644
index 000000000..5718cb9f7
--- /dev/null
+++ b/kernel/fs/drop_caches.c
@@ -0,0 +1,67 @@
+/*
+ * Implement the manual drop-all-pagecache function
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+#include <linux/sysctl.h>
+#include <linux/gfp.h>
+#include "internal.h"
+
+/* A global variable is a bit ugly, but it keeps the code simple */
+int sysctl_drop_caches;
+
+static void drop_pagecache_sb(struct super_block *sb, void *unused)
+{
+	struct inode *inode, *toput_inode = NULL;
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (inode->i_mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
+		invalidate_mapping_pages(inode->i_mapping, 0, -1);
+		iput(toput_inode);
+		toput_inode = inode;
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+	iput(toput_inode);
+}
+
+int drop_caches_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		static int stfu;
+
+		if (sysctl_drop_caches & 1) {
+			iterate_supers(drop_pagecache_sb, NULL);
+			count_vm_event(DROP_PAGECACHE);
+		}
+		if (sysctl_drop_caches & 2) {
+			drop_slab();
+			count_vm_event(DROP_SLAB);
+		}
+		if (!stfu) {
+			pr_info("%s (%d): drop_caches: %d\n",
+				current->comm, task_pid_nr(current),
+				sysctl_drop_caches);
+		}
+		stfu |= sysctl_drop_caches & 4;
+	}
+	return 0;
+}
diff --git a/kernel/fs/ecryptfs/Kconfig b/kernel/fs/ecryptfs/Kconfig
new file mode 100644
index 000000000..434aa313f
--- /dev/null
+++ b/kernel/fs/ecryptfs/Kconfig
@@ -0,0 +1,22 @@
+config ECRYPT_FS
+	tristate "eCrypt filesystem layer support"
+	depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
+	select CRYPTO_ECB
+	select CRYPTO_CBC
+	select CRYPTO_MD5
+	help
+	  Encrypted filesystem that operates on the VFS layer.  See
+	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
+	  eCryptfs.  Userspace components are required and can be
+	  obtained from <http://ecryptfs.sf.net>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ecryptfs.
+
+config ECRYPT_FS_MESSAGING
+	bool "Enable notifications for userspace key wrap/unwrap"
+	depends on ECRYPT_FS
+	help
+	  Enables the /dev/ecryptfs entry for use by ecryptfsd. This allows
+	  for userspace to wrap/unwrap file encryption keys by other
+	  backends, like OpenSSL.
diff --git a/kernel/fs/ecryptfs/Makefile b/kernel/fs/ecryptfs/Makefile
new file mode 100644
index 000000000..49678a699
--- /dev/null
+++ b/kernel/fs/ecryptfs/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the Linux eCryptfs
+#
+
+obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
+
+ecryptfs-y := dentry.o file.o inode.o main.o super.o mmap.o read_write.o \
+	      crypto.o keystore.o kthread.o debug.o
+
+ecryptfs-$(CONFIG_ECRYPT_FS_MESSAGING) += messaging.o miscdev.o
diff --git a/kernel/fs/ecryptfs/crypto.c b/kernel/fs/ecryptfs/crypto.c
new file mode 100644
index 000000000..97315f2f6
--- /dev/null
+++ b/kernel/fs/ecryptfs/crypto.c
@@ -0,0 +1,2166 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2007 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *   		Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/random.h>
+#include <linux/compiler.h>
+#include <linux/key.h>
+#include <linux/namei.h>
+#include <linux/crypto.h>
+#include <linux/file.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include "ecryptfs_kernel.h"
+
+#define DECRYPT		0
+#define ENCRYPT		1
+
+/**
+ * ecryptfs_to_hex
+ * @dst: Buffer to take hex character representation of contents of
+ *       src; must be at least of size (src_size * 2)
+ * @src: Buffer to be converted to a hex string respresentation
+ * @src_size: number of bytes to convert
+ */
+void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
+{
+	int x;
+
+	for (x = 0; x < src_size; x++)
+		sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
+}
+
+/**
+ * ecryptfs_from_hex
+ * @dst: Buffer to take the bytes from src hex; must be at least of
+ *       size (src_size / 2)
+ * @src: Buffer to be converted from a hex string respresentation to raw value
+ * @dst_size: size of dst buffer, or number of hex characters pairs to convert
+ */
+void ecryptfs_from_hex(char *dst, char *src, int dst_size)
+{
+	int x;
+	char tmp[3] = { 0, };
+
+	for (x = 0; x < dst_size; x++) {
+		tmp[0] = src[x * 2];
+		tmp[1] = src[x * 2 + 1];
+		dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16);
+	}
+}
+
+/**
+ * ecryptfs_calculate_md5 - calculates the md5 of @src
+ * @dst: Pointer to 16 bytes of allocated memory
+ * @crypt_stat: Pointer to crypt_stat struct for the current inode
+ * @src: Data to be md5'd
+ * @len: Length of @src
+ *
+ * Uses the allocated crypto context that crypt_stat references to
+ * generate the MD5 sum of the contents of src.
+ */
+static int ecryptfs_calculate_md5(char *dst,
+				  struct ecryptfs_crypt_stat *crypt_stat,
+				  char *src, int len)
+{
+	struct scatterlist sg;
+	struct hash_desc desc = {
+		.tfm = crypt_stat->hash_tfm,
+		.flags = CRYPTO_TFM_REQ_MAY_SLEEP
+	};
+	int rc = 0;
+
+	mutex_lock(&crypt_stat->cs_hash_tfm_mutex);
+	sg_init_one(&sg, (u8 *)src, len);
+	if (!desc.tfm) {
+		desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0,
+					     CRYPTO_ALG_ASYNC);
+		if (IS_ERR(desc.tfm)) {
+			rc = PTR_ERR(desc.tfm);
+			ecryptfs_printk(KERN_ERR, "Error attempting to "
+					"allocate crypto context; rc = [%d]\n",
+					rc);
+			goto out;
+		}
+		crypt_stat->hash_tfm = desc.tfm;
+	}
+	rc = crypto_hash_init(&desc);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error initializing crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out;
+	}
+	rc = crypto_hash_update(&desc, &sg, len);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error updating crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out;
+	}
+	rc = crypto_hash_final(&desc, dst);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error finalizing crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out;
+	}
+out:
+	mutex_unlock(&crypt_stat->cs_hash_tfm_mutex);
+	return rc;
+}
+
+static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name,
+						  char *cipher_name,
+						  char *chaining_modifier)
+{
+	int cipher_name_len = strlen(cipher_name);
+	int chaining_modifier_len = strlen(chaining_modifier);
+	int algified_name_len;
+	int rc;
+
+	algified_name_len = (chaining_modifier_len + cipher_name_len + 3);
+	(*algified_name) = kmalloc(algified_name_len, GFP_KERNEL);
+	if (!(*algified_name)) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	snprintf((*algified_name), algified_name_len, "%s(%s)",
+		 chaining_modifier, cipher_name);
+	rc = 0;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_derive_iv
+ * @iv: destination for the derived iv vale
+ * @crypt_stat: Pointer to crypt_stat struct for the current inode
+ * @offset: Offset of the extent whose IV we are to derive
+ *
+ * Generate the initialization vector from the given root IV and page
+ * offset.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+		       loff_t offset)
+{
+	int rc = 0;
+	char dst[MD5_DIGEST_SIZE];
+	char src[ECRYPTFS_MAX_IV_BYTES + 16];
+
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "root iv:\n");
+		ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes);
+	}
+	/* TODO: It is probably secure to just cast the least
+	 * significant bits of the root IV into an unsigned long and
+	 * add the offset to that rather than go through all this
+	 * hashing business. -Halcrow */
+	memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes);
+	memset((src + crypt_stat->iv_bytes), 0, 16);
+	snprintf((src + crypt_stat->iv_bytes), 16, "%lld", offset);
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "source:\n");
+		ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16));
+	}
+	rc = ecryptfs_calculate_md5(dst, crypt_stat, src,
+				    (crypt_stat->iv_bytes + 16));
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
+				"MD5 while generating IV for a page\n");
+		goto out;
+	}
+	memcpy(iv, dst, crypt_stat->iv_bytes);
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "derived iv:\n");
+		ecryptfs_dump_hex(iv, crypt_stat->iv_bytes);
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_init_crypt_stat
+ * @crypt_stat: Pointer to the crypt_stat struct to initialize.
+ *
+ * Initialize the crypt_stat structure.
+ */
+void
+ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
+	INIT_LIST_HEAD(&crypt_stat->keysig_list);
+	mutex_init(&crypt_stat->keysig_list_mutex);
+	mutex_init(&crypt_stat->cs_mutex);
+	mutex_init(&crypt_stat->cs_tfm_mutex);
+	mutex_init(&crypt_stat->cs_hash_tfm_mutex);
+	crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED;
+}
+
+/**
+ * ecryptfs_destroy_crypt_stat
+ * @crypt_stat: Pointer to the crypt_stat struct to initialize.
+ *
+ * Releases all memory associated with a crypt_stat struct.
+ */
+void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
+
+	if (crypt_stat->tfm)
+		crypto_free_ablkcipher(crypt_stat->tfm);
+	if (crypt_stat->hash_tfm)
+		crypto_free_hash(crypt_stat->hash_tfm);
+	list_for_each_entry_safe(key_sig, key_sig_tmp,
+				 &crypt_stat->keysig_list, crypt_stat_list) {
+		list_del(&key_sig->crypt_stat_list);
+		kmem_cache_free(ecryptfs_key_sig_cache, key_sig);
+	}
+	memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
+}
+
+void ecryptfs_destroy_mount_crypt_stat(
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	struct ecryptfs_global_auth_tok *auth_tok, *auth_tok_tmp;
+
+	if (!(mount_crypt_stat->flags & ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED))
+		return;
+	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	list_for_each_entry_safe(auth_tok, auth_tok_tmp,
+				 &mount_crypt_stat->global_auth_tok_list,
+				 mount_crypt_stat_list) {
+		list_del(&auth_tok->mount_crypt_stat_list);
+		if (auth_tok->global_auth_tok_key
+		    && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
+			key_put(auth_tok->global_auth_tok_key);
+		kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok);
+	}
+	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat));
+}
+
+/**
+ * virt_to_scatterlist
+ * @addr: Virtual address
+ * @size: Size of data; should be an even multiple of the block size
+ * @sg: Pointer to scatterlist array; set to NULL to obtain only
+ *      the number of scatterlist structs required in array
+ * @sg_size: Max array size
+ *
+ * Fills in a scatterlist array with page references for a passed
+ * virtual address.
+ *
+ * Returns the number of scatterlist structs in array used
+ */
+int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
+			int sg_size)
+{
+	int i = 0;
+	struct page *pg;
+	int offset;
+	int remainder_of_page;
+
+	sg_init_table(sg, sg_size);
+
+	while (size > 0 && i < sg_size) {
+		pg = virt_to_page(addr);
+		offset = offset_in_page(addr);
+		sg_set_page(&sg[i], pg, 0, offset);
+		remainder_of_page = PAGE_CACHE_SIZE - offset;
+		if (size >= remainder_of_page) {
+			sg[i].length = remainder_of_page;
+			addr += remainder_of_page;
+			size -= remainder_of_page;
+		} else {
+			sg[i].length = size;
+			addr += size;
+			size = 0;
+		}
+		i++;
+	}
+	if (size > 0)
+		return -ENOMEM;
+	return i;
+}
+
+struct extent_crypt_result {
+	struct completion completion;
+	int rc;
+};
+
+static void extent_crypt_complete(struct crypto_async_request *req, int rc)
+{
+	struct extent_crypt_result *ecr = req->data;
+
+	if (rc == -EINPROGRESS)
+		return;
+
+	ecr->rc = rc;
+	complete(&ecr->completion);
+}
+
+/**
+ * crypt_scatterlist
+ * @crypt_stat: Pointer to the crypt_stat struct to initialize.
+ * @dst_sg: Destination of the data after performing the crypto operation
+ * @src_sg: Data to be encrypted or decrypted
+ * @size: Length of data
+ * @iv: IV to use
+ * @op: ENCRYPT or DECRYPT to indicate the desired operation
+ *
+ * Returns the number of bytes encrypted or decrypted; negative value on error
+ */
+static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
+			     struct scatterlist *dst_sg,
+			     struct scatterlist *src_sg, int size,
+			     unsigned char *iv, int op)
+{
+	struct ablkcipher_request *req = NULL;
+	struct extent_crypt_result ecr;
+	int rc = 0;
+
+	BUG_ON(!crypt_stat || !crypt_stat->tfm
+	       || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
+				crypt_stat->key_size);
+		ecryptfs_dump_hex(crypt_stat->key,
+				  crypt_stat->key_size);
+	}
+
+	init_completion(&ecr.completion);
+
+	mutex_lock(&crypt_stat->cs_tfm_mutex);
+	req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
+	if (!req) {
+		mutex_unlock(&crypt_stat->cs_tfm_mutex);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	ablkcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			extent_crypt_complete, &ecr);
+	/* Consider doing this once, when the file is opened */
+	if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
+		rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
+					      crypt_stat->key_size);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR,
+					"Error setting key; rc = [%d]\n",
+					rc);
+			mutex_unlock(&crypt_stat->cs_tfm_mutex);
+			rc = -EINVAL;
+			goto out;
+		}
+		crypt_stat->flags |= ECRYPTFS_KEY_SET;
+	}
+	mutex_unlock(&crypt_stat->cs_tfm_mutex);
+	ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
+	rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
+			     crypto_ablkcipher_decrypt(req);
+	if (rc == -EINPROGRESS || rc == -EBUSY) {
+		struct extent_crypt_result *ecr = req->base.data;
+
+		wait_for_completion(&ecr->completion);
+		rc = ecr->rc;
+		reinit_completion(&ecr->completion);
+	}
+out:
+	ablkcipher_request_free(req);
+	return rc;
+}
+
+/**
+ * lower_offset_for_page
+ *
+ * Convert an eCryptfs page index into a lower byte offset
+ */
+static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
+				    struct page *page)
+{
+	return ecryptfs_lower_header_size(crypt_stat) +
+	       ((loff_t)page->index << PAGE_CACHE_SHIFT);
+}
+
+/**
+ * crypt_extent
+ * @crypt_stat: crypt_stat containing cryptographic context for the
+ *              encryption operation
+ * @dst_page: The page to write the result into
+ * @src_page: The page to read from
+ * @extent_offset: Page extent offset for use in generating IV
+ * @op: ENCRYPT or DECRYPT to indicate the desired operation
+ *
+ * Encrypts or decrypts one extent of data.
+ *
+ * Return zero on success; non-zero otherwise
+ */
+static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
+			struct page *dst_page,
+			struct page *src_page,
+			unsigned long extent_offset, int op)
+{
+	pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
+	loff_t extent_base;
+	char extent_iv[ECRYPTFS_MAX_IV_BYTES];
+	struct scatterlist src_sg, dst_sg;
+	size_t extent_size = crypt_stat->extent_size;
+	int rc;
+
+	extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
+	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
+				(extent_base + extent_offset));
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+			"extent [0x%.16llx]; rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
+		goto out;
+	}
+
+	sg_init_table(&src_sg, 1);
+	sg_init_table(&dst_sg, 1);
+
+	sg_set_page(&src_sg, src_page, extent_size,
+		    extent_offset * extent_size);
+	sg_set_page(&dst_sg, dst_page, extent_size,
+		    extent_offset * extent_size);
+
+	rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size,
+			       extent_iv, op);
+	if (rc < 0) {
+		printk(KERN_ERR "%s: Error attempting to crypt page with "
+		       "page_index = [%ld], extent_offset = [%ld]; "
+		       "rc = [%d]\n", __func__, page_index, extent_offset, rc);
+		goto out;
+	}
+	rc = 0;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_encrypt_page
+ * @page: Page mapped from the eCryptfs inode for the file; contains
+ *        decrypted content that needs to be encrypted (to a temporary
+ *        page; not in place) and written out to the lower file
+ *
+ * Encrypt an eCryptfs page. This is done on a per-extent basis. Note
+ * that eCryptfs pages may straddle the lower pages -- for instance,
+ * if the file was created on a machine with an 8K page size
+ * (resulting in an 8K header), and then the file is copied onto a
+ * host with a 32K page size, then when reading page 0 of the eCryptfs
+ * file, 24K of page 0 of the lower file will be read and decrypted,
+ * and then 8K of page 1 of the lower file will be read and decrypted.
+ *
+ * Returns zero on success; negative on error
+ */
+int ecryptfs_encrypt_page(struct page *page)
+{
+	struct inode *ecryptfs_inode;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	char *enc_extent_virt;
+	struct page *enc_extent_page = NULL;
+	loff_t extent_offset;
+	loff_t lower_offset;
+	int rc = 0;
+
+	ecryptfs_inode = page->mapping->host;
+	crypt_stat =
+		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
+	enc_extent_page = alloc_page(GFP_USER);
+	if (!enc_extent_page) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR, "Error allocating memory for "
+				"encrypted extent\n");
+		goto out;
+	}
+
+	for (extent_offset = 0;
+	     extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
+	     extent_offset++) {
+		rc = crypt_extent(crypt_stat, enc_extent_page, page,
+				  extent_offset, ENCRYPT);
+		if (rc) {
+			printk(KERN_ERR "%s: Error encrypting extent; "
+			       "rc = [%d]\n", __func__, rc);
+			goto out;
+		}
+	}
+
+	lower_offset = lower_offset_for_page(crypt_stat, page);
+	enc_extent_virt = kmap(enc_extent_page);
+	rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
+				  PAGE_CACHE_SIZE);
+	kunmap(enc_extent_page);
+	if (rc < 0) {
+		ecryptfs_printk(KERN_ERR,
+			"Error attempting to write lower page; rc = [%d]\n",
+			rc);
+		goto out;
+	}
+	rc = 0;
+out:
+	if (enc_extent_page) {
+		__free_page(enc_extent_page);
+	}
+	return rc;
+}
+
+/**
+ * ecryptfs_decrypt_page
+ * @page: Page mapped from the eCryptfs inode for the file; data read
+ *        and decrypted from the lower file will be written into this
+ *        page
+ *
+ * Decrypt an eCryptfs page. This is done on a per-extent basis. Note
+ * that eCryptfs pages may straddle the lower pages -- for instance,
+ * if the file was created on a machine with an 8K page size
+ * (resulting in an 8K header), and then the file is copied onto a
+ * host with a 32K page size, then when reading page 0 of the eCryptfs
+ * file, 24K of page 0 of the lower file will be read and decrypted,
+ * and then 8K of page 1 of the lower file will be read and decrypted.
+ *
+ * Returns zero on success; negative on error
+ */
+int ecryptfs_decrypt_page(struct page *page)
+{
+	struct inode *ecryptfs_inode;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	char *page_virt;
+	unsigned long extent_offset;
+	loff_t lower_offset;
+	int rc = 0;
+
+	ecryptfs_inode = page->mapping->host;
+	crypt_stat =
+		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
+
+	lower_offset = lower_offset_for_page(crypt_stat, page);
+	page_virt = kmap(page);
+	rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
+				 ecryptfs_inode);
+	kunmap(page);
+	if (rc < 0) {
+		ecryptfs_printk(KERN_ERR,
+			"Error attempting to read lower page; rc = [%d]\n",
+			rc);
+		goto out;
+	}
+
+	for (extent_offset = 0;
+	     extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
+	     extent_offset++) {
+		rc = crypt_extent(crypt_stat, page, page,
+				  extent_offset, DECRYPT);
+		if (rc) {
+			printk(KERN_ERR "%s: Error encrypting extent; "
+			       "rc = [%d]\n", __func__, rc);
+			goto out;
+		}
+	}
+out:
+	return rc;
+}
+
+#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
+
+/**
+ * ecryptfs_init_crypt_ctx
+ * @crypt_stat: Uninitialized crypt stats structure
+ *
+ * Initialize the crypto context.
+ *
+ * TODO: Performance: Keep a cache of initialized cipher contexts;
+ * only init if needed
+ */
+int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	char *full_alg_name;
+	int rc = -EINVAL;
+
+	ecryptfs_printk(KERN_DEBUG,
+			"Initializing cipher [%s]; strlen = [%d]; "
+			"key_size_bits = [%zd]\n",
+			crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
+			crypt_stat->key_size << 3);
+	mutex_lock(&crypt_stat->cs_tfm_mutex);
+	if (crypt_stat->tfm) {
+		rc = 0;
+		goto out_unlock;
+	}
+	rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name,
+						    crypt_stat->cipher, "cbc");
+	if (rc)
+		goto out_unlock;
+	crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0);
+	if (IS_ERR(crypt_stat->tfm)) {
+		rc = PTR_ERR(crypt_stat->tfm);
+		crypt_stat->tfm = NULL;
+		ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
+				"Error initializing cipher [%s]\n",
+				full_alg_name);
+		goto out_free;
+	}
+	crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	rc = 0;
+out_free:
+	kfree(full_alg_name);
+out_unlock:
+	mutex_unlock(&crypt_stat->cs_tfm_mutex);
+	return rc;
+}
+
+static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	int extent_size_tmp;
+
+	crypt_stat->extent_mask = 0xFFFFFFFF;
+	crypt_stat->extent_shift = 0;
+	if (crypt_stat->extent_size == 0)
+		return;
+	extent_size_tmp = crypt_stat->extent_size;
+	while ((extent_size_tmp & 0x01) == 0) {
+		extent_size_tmp >>= 1;
+		crypt_stat->extent_mask <<= 1;
+		crypt_stat->extent_shift++;
+	}
+}
+
+void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	/* Default values; may be overwritten as we are parsing the
+	 * packets. */
+	crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
+	set_extent_mask_and_shift(crypt_stat);
+	crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+		crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+	else {
+		if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
+			crypt_stat->metadata_size =
+				ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+		else
+			crypt_stat->metadata_size = PAGE_CACHE_SIZE;
+	}
+}
+
+/**
+ * ecryptfs_compute_root_iv
+ * @crypt_stats
+ *
+ * On error, sets the root IV to all 0's.
+ */
+int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	int rc = 0;
+	char dst[MD5_DIGEST_SIZE];
+
+	BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE);
+	BUG_ON(crypt_stat->iv_bytes <= 0);
+	if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_WARNING, "Session key not valid; "
+				"cannot generate root IV\n");
+		goto out;
+	}
+	rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key,
+				    crypt_stat->key_size);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
+				"MD5 while generating root IV\n");
+		goto out;
+	}
+	memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
+out:
+	if (rc) {
+		memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes);
+		crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING;
+	}
+	return rc;
+}
+
+static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	get_random_bytes(crypt_stat->key, crypt_stat->key_size);
+	crypt_stat->flags |= ECRYPTFS_KEY_VALID;
+	ecryptfs_compute_root_iv(crypt_stat);
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n");
+		ecryptfs_dump_hex(crypt_stat->key,
+				  crypt_stat->key_size);
+	}
+}
+
+/**
+ * ecryptfs_copy_mount_wide_flags_to_inode_flags
+ * @crypt_stat: The inode's cryptographic context
+ * @mount_crypt_stat: The mount point's cryptographic context
+ *
+ * This function propagates the mount-wide flags to individual inode
+ * flags.
+ */
+static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED)
+		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
+		crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+		crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
+		if (mount_crypt_stat->flags
+		    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
+			crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
+		else if (mount_crypt_stat->flags
+			 & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
+			crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
+	}
+}
+
+static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	struct ecryptfs_global_auth_tok *global_auth_tok;
+	int rc = 0;
+
+	mutex_lock(&crypt_stat->keysig_list_mutex);
+	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+
+	list_for_each_entry(global_auth_tok,
+			    &mount_crypt_stat->global_auth_tok_list,
+			    mount_crypt_stat_list) {
+		if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK)
+			continue;
+		rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
+		if (rc) {
+			printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
+			goto out;
+		}
+	}
+
+out:
+	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	mutex_unlock(&crypt_stat->keysig_list_mutex);
+	return rc;
+}
+
+/**
+ * ecryptfs_set_default_crypt_stat_vals
+ * @crypt_stat: The inode's cryptographic context
+ * @mount_crypt_stat: The mount point's cryptographic context
+ *
+ * Default values in the event that policy does not override them.
+ */
+static void ecryptfs_set_default_crypt_stat_vals(
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
+						      mount_crypt_stat);
+	ecryptfs_set_default_sizes(crypt_stat);
+	strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER);
+	crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES;
+	crypt_stat->flags &= ~(ECRYPTFS_KEY_VALID);
+	crypt_stat->file_version = ECRYPTFS_FILE_VERSION;
+	crypt_stat->mount_crypt_stat = mount_crypt_stat;
+}
+
+/**
+ * ecryptfs_new_file_context
+ * @ecryptfs_inode: The eCryptfs inode
+ *
+ * If the crypto context for the file has not yet been established,
+ * this is where we do that.  Establishing a new crypto context
+ * involves the following decisions:
+ *  - What cipher to use?
+ *  - What set of authentication tokens to use?
+ * Here we just worry about getting enough information into the
+ * authentication tokens so that we know that they are available.
+ * We associate the available authentication tokens with the new file
+ * via the set of signatures in the crypt_stat struct.  Later, when
+ * the headers are actually written out, we may again defer to
+ * userspace to perform the encryption of the session key; for the
+ * foreseeable future, this will be the case with public key packets.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_new_file_context(struct inode *ecryptfs_inode)
+{
+	struct ecryptfs_crypt_stat *crypt_stat =
+	    &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+	    &ecryptfs_superblock_to_private(
+		    ecryptfs_inode->i_sb)->mount_crypt_stat;
+	int cipher_name_len;
+	int rc = 0;
+
+	ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat);
+	crypt_stat->flags |= (ECRYPTFS_ENCRYPTED | ECRYPTFS_KEY_VALID);
+	ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
+						      mount_crypt_stat);
+	rc = ecryptfs_copy_mount_wide_sigs_to_inode_sigs(crypt_stat,
+							 mount_crypt_stat);
+	if (rc) {
+		printk(KERN_ERR "Error attempting to copy mount-wide key sigs "
+		       "to the inode key sigs; rc = [%d]\n", rc);
+		goto out;
+	}
+	cipher_name_len =
+		strlen(mount_crypt_stat->global_default_cipher_name);
+	memcpy(crypt_stat->cipher,
+	       mount_crypt_stat->global_default_cipher_name,
+	       cipher_name_len);
+	crypt_stat->cipher[cipher_name_len] = '\0';
+	crypt_stat->key_size =
+		mount_crypt_stat->global_default_cipher_key_size;
+	ecryptfs_generate_new_key(crypt_stat);
+	rc = ecryptfs_init_crypt_ctx(crypt_stat);
+	if (rc)
+		ecryptfs_printk(KERN_ERR, "Error initializing cryptographic "
+				"context for cipher [%s]: rc = [%d]\n",
+				crypt_stat->cipher, rc);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_validate_marker - check for the ecryptfs marker
+ * @data: The data block in which to check
+ *
+ * Returns zero if marker found; -EINVAL if not found
+ */
+static int ecryptfs_validate_marker(char *data)
+{
+	u32 m_1, m_2;
+
+	m_1 = get_unaligned_be32(data);
+	m_2 = get_unaligned_be32(data + 4);
+	if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
+		return 0;
+	ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
+			"MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
+			MAGIC_ECRYPTFS_MARKER);
+	ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
+			"[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
+	return -EINVAL;
+}
+
+struct ecryptfs_flag_map_elem {
+	u32 file_flag;
+	u32 local_flag;
+};
+
+/* Add support for additional flags by adding elements here. */
+static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
+	{0x00000001, ECRYPTFS_ENABLE_HMAC},
+	{0x00000002, ECRYPTFS_ENCRYPTED},
+	{0x00000004, ECRYPTFS_METADATA_IN_XATTR},
+	{0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
+};
+
+/**
+ * ecryptfs_process_flags
+ * @crypt_stat: The cryptographic context
+ * @page_virt: Source data to be parsed
+ * @bytes_read: Updated with the number of bytes read
+ *
+ * Returns zero on success; non-zero if the flag set is invalid
+ */
+static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
+				  char *page_virt, int *bytes_read)
+{
+	int rc = 0;
+	int i;
+	u32 flags;
+
+	flags = get_unaligned_be32(page_virt);
+	for (i = 0; i < ((sizeof(ecryptfs_flag_map)
+			  / sizeof(struct ecryptfs_flag_map_elem))); i++)
+		if (flags & ecryptfs_flag_map[i].file_flag) {
+			crypt_stat->flags |= ecryptfs_flag_map[i].local_flag;
+		} else
+			crypt_stat->flags &= ~(ecryptfs_flag_map[i].local_flag);
+	/* Version is in top 8 bits of the 32-bit flag vector */
+	crypt_stat->file_version = ((flags >> 24) & 0xFF);
+	(*bytes_read) = 4;
+	return rc;
+}
+
+/**
+ * write_ecryptfs_marker
+ * @page_virt: The pointer to in a page to begin writing the marker
+ * @written: Number of bytes written
+ *
+ * Marker = 0x3c81b7f5
+ */
+static void write_ecryptfs_marker(char *page_virt, size_t *written)
+{
+	u32 m_1, m_2;
+
+	get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+	m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER);
+	put_unaligned_be32(m_1, page_virt);
+	page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2);
+	put_unaligned_be32(m_2, page_virt);
+	(*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
+}
+
+void ecryptfs_write_crypt_stat_flags(char *page_virt,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     size_t *written)
+{
+	u32 flags = 0;
+	int i;
+
+	for (i = 0; i < ((sizeof(ecryptfs_flag_map)
+			  / sizeof(struct ecryptfs_flag_map_elem))); i++)
+		if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag)
+			flags |= ecryptfs_flag_map[i].file_flag;
+	/* Version is in top 8 bits of the 32-bit flag vector */
+	flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000);
+	put_unaligned_be32(flags, page_virt);
+	(*written) = 4;
+}
+
+struct ecryptfs_cipher_code_str_map_elem {
+	char cipher_str[16];
+	u8 cipher_code;
+};
+
+/* Add support for additional ciphers by adding elements here. The
+ * cipher_code is whatever OpenPGP applicatoins use to identify the
+ * ciphers. List in order of probability. */
+static struct ecryptfs_cipher_code_str_map_elem
+ecryptfs_cipher_code_str_map[] = {
+	{"aes",RFC2440_CIPHER_AES_128 },
+	{"blowfish", RFC2440_CIPHER_BLOWFISH},
+	{"des3_ede", RFC2440_CIPHER_DES3_EDE},
+	{"cast5", RFC2440_CIPHER_CAST_5},
+	{"twofish", RFC2440_CIPHER_TWOFISH},
+	{"cast6", RFC2440_CIPHER_CAST_6},
+	{"aes", RFC2440_CIPHER_AES_192},
+	{"aes", RFC2440_CIPHER_AES_256}
+};
+
+/**
+ * ecryptfs_code_for_cipher_string
+ * @cipher_name: The string alias for the cipher
+ * @key_bytes: Length of key in bytes; used for AES code selection
+ *
+ * Returns zero on no match, or the cipher code on match
+ */
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
+{
+	int i;
+	u8 code = 0;
+	struct ecryptfs_cipher_code_str_map_elem *map =
+		ecryptfs_cipher_code_str_map;
+
+	if (strcmp(cipher_name, "aes") == 0) {
+		switch (key_bytes) {
+		case 16:
+			code = RFC2440_CIPHER_AES_128;
+			break;
+		case 24:
+			code = RFC2440_CIPHER_AES_192;
+			break;
+		case 32:
+			code = RFC2440_CIPHER_AES_256;
+		}
+	} else {
+		for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
+			if (strcmp(cipher_name, map[i].cipher_str) == 0) {
+				code = map[i].cipher_code;
+				break;
+			}
+	}
+	return code;
+}
+
+/**
+ * ecryptfs_cipher_code_to_string
+ * @str: Destination to write out the cipher name
+ * @cipher_code: The code to convert to cipher name string
+ *
+ * Returns zero on success
+ */
+int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code)
+{
+	int rc = 0;
+	int i;
+
+	str[0] = '\0';
+	for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
+		if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code)
+			strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str);
+	if (str[0] == '\0') {
+		ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: "
+				"[%d]\n", cipher_code);
+		rc = -EINVAL;
+	}
+	return rc;
+}
+
+int ecryptfs_read_and_validate_header_region(struct inode *inode)
+{
+	u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+	u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
+	int rc;
+
+	rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
+				 inode);
+	if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+		return rc >= 0 ? -EINVAL : rc;
+	rc = ecryptfs_validate_marker(marker);
+	if (!rc)
+		ecryptfs_i_size_init(file_size, inode);
+	return rc;
+}
+
+void
+ecryptfs_write_header_metadata(char *virt,
+			       struct ecryptfs_crypt_stat *crypt_stat,
+			       size_t *written)
+{
+	u32 header_extent_size;
+	u16 num_header_extents_at_front;
+
+	header_extent_size = (u32)crypt_stat->extent_size;
+	num_header_extents_at_front =
+		(u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
+	put_unaligned_be32(header_extent_size, virt);
+	virt += 4;
+	put_unaligned_be16(num_header_extents_at_front, virt);
+	(*written) = 6;
+}
+
+struct kmem_cache *ecryptfs_header_cache;
+
+/**
+ * ecryptfs_write_headers_virt
+ * @page_virt: The virtual address to write the headers to
+ * @max: The size of memory allocated at page_virt
+ * @size: Set to the number of bytes written by this function
+ * @crypt_stat: The cryptographic context
+ * @ecryptfs_dentry: The eCryptfs dentry
+ *
+ * Format version: 1
+ *
+ *   Header Extent:
+ *     Octets 0-7:        Unencrypted file size (big-endian)
+ *     Octets 8-15:       eCryptfs special marker
+ *     Octets 16-19:      Flags
+ *      Octet 16:         File format version number (between 0 and 255)
+ *      Octets 17-18:     Reserved
+ *      Octet 19:         Bit 1 (lsb): Reserved
+ *                        Bit 2: Encrypted?
+ *                        Bits 3-8: Reserved
+ *     Octets 20-23:      Header extent size (big-endian)
+ *     Octets 24-25:      Number of header extents at front of file
+ *                        (big-endian)
+ *     Octet  26:         Begin RFC 2440 authentication token packet set
+ *   Data Extent 0:
+ *     Lower data (CBC encrypted)
+ *   Data Extent 1:
+ *     Lower data (CBC encrypted)
+ *   ...
+ *
+ * Returns zero on success
+ */
+static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
+				       size_t *size,
+				       struct ecryptfs_crypt_stat *crypt_stat,
+				       struct dentry *ecryptfs_dentry)
+{
+	int rc;
+	size_t written;
+	size_t offset;
+
+	offset = ECRYPTFS_FILE_SIZE_BYTES;
+	write_ecryptfs_marker((page_virt + offset), &written);
+	offset += written;
+	ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
+					&written);
+	offset += written;
+	ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
+				       &written);
+	offset += written;
+	rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
+					      ecryptfs_dentry, &written,
+					      max - offset);
+	if (rc)
+		ecryptfs_printk(KERN_WARNING, "Error generating key packet "
+				"set; rc = [%d]\n", rc);
+	if (size) {
+		offset += written;
+		*size = offset;
+	}
+	return rc;
+}
+
+static int
+ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
+				    char *virt, size_t virt_len)
+{
+	int rc;
+
+	rc = ecryptfs_write_lower(ecryptfs_inode, virt,
+				  0, virt_len);
+	if (rc < 0)
+		printk(KERN_ERR "%s: Error attempting to write header "
+		       "information to lower file; rc = [%d]\n", __func__, rc);
+	else
+		rc = 0;
+	return rc;
+}
+
+static int
+ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
+				 char *page_virt, size_t size)
+{
+	int rc;
+
+	rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt,
+			       size, 0);
+	return rc;
+}
+
+static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
+					       unsigned int order)
+{
+	struct page *page;
+
+	page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+	if (page)
+		return (unsigned long) page_address(page);
+	return 0;
+}
+
+/**
+ * ecryptfs_write_metadata
+ * @ecryptfs_dentry: The eCryptfs dentry, which should be negative
+ * @ecryptfs_inode: The newly created eCryptfs inode
+ *
+ * Write the file headers out.  This will likely involve a userspace
+ * callout, in which the session key is encrypted with one or more
+ * public keys and/or the passphrase necessary to do the encryption is
+ * retrieved via a prompt.  Exactly what happens at this point should
+ * be policy-dependent.
+ *
+ * Returns zero on success; non-zero on error
+ */
+int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
+			    struct inode *ecryptfs_inode)
+{
+	struct ecryptfs_crypt_stat *crypt_stat =
+		&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	unsigned int order;
+	char *virt;
+	size_t virt_len;
+	size_t size = 0;
+	int rc = 0;
+
+	if (likely(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+		if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) {
+			printk(KERN_ERR "Key is invalid; bailing out\n");
+			rc = -EINVAL;
+			goto out;
+		}
+	} else {
+		printk(KERN_WARNING "%s: Encrypted flag not set\n",
+		       __func__);
+		rc = -EINVAL;
+		goto out;
+	}
+	virt_len = crypt_stat->metadata_size;
+	order = get_order(virt_len);
+	/* Released in this function */
+	virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
+	if (!virt) {
+		printk(KERN_ERR "%s: Out of memory\n", __func__);
+		rc = -ENOMEM;
+		goto out;
+	}
+	/* Zeroed page ensures the in-header unencrypted i_size is set to 0 */
+	rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat,
+					 ecryptfs_dentry);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
+		       __func__, rc);
+		goto out_free;
+	}
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+		rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
+						      size);
+	else
+		rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
+							 virt_len);
+	if (rc) {
+		printk(KERN_ERR "%s: Error writing metadata out to lower file; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_free;
+	}
+out_free:
+	free_pages((unsigned long)virt, order);
+out:
+	return rc;
+}
+
+#define ECRYPTFS_DONT_VALIDATE_HEADER_SIZE 0
+#define ECRYPTFS_VALIDATE_HEADER_SIZE 1
+static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
+				 char *virt, int *bytes_read,
+				 int validate_header_size)
+{
+	int rc = 0;
+	u32 header_extent_size;
+	u16 num_header_extents_at_front;
+
+	header_extent_size = get_unaligned_be32(virt);
+	virt += sizeof(__be32);
+	num_header_extents_at_front = get_unaligned_be16(virt);
+	crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
+				     * (size_t)header_extent_size));
+	(*bytes_read) = (sizeof(__be32) + sizeof(__be16));
+	if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
+	    && (crypt_stat->metadata_size
+		< ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
+		rc = -EINVAL;
+		printk(KERN_WARNING "Invalid header size: [%zd]\n",
+		       crypt_stat->metadata_size);
+	}
+	return rc;
+}
+
+/**
+ * set_default_header_data
+ * @crypt_stat: The cryptographic context
+ *
+ * For version 0 file format; this function is only for backwards
+ * compatibility for files created with the prior versions of
+ * eCryptfs.
+ */
+static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+}
+
+void ecryptfs_i_size_init(const char *page_virt, struct inode *inode)
+{
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	u64 file_size;
+
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	mount_crypt_stat =
+		&ecryptfs_superblock_to_private(inode->i_sb)->mount_crypt_stat;
+	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
+		file_size = i_size_read(ecryptfs_inode_to_lower(inode));
+		if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+			file_size += crypt_stat->metadata_size;
+	} else
+		file_size = get_unaligned_be64(page_virt);
+	i_size_write(inode, (loff_t)file_size);
+	crypt_stat->flags |= ECRYPTFS_I_SIZE_INITIALIZED;
+}
+
+/**
+ * ecryptfs_read_headers_virt
+ * @page_virt: The virtual address into which to read the headers
+ * @crypt_stat: The cryptographic context
+ * @ecryptfs_dentry: The eCryptfs dentry
+ * @validate_header_size: Whether to validate the header size while reading
+ *
+ * Read/parse the header data. The header format is detailed in the
+ * comment block for the ecryptfs_write_headers_virt() function.
+ *
+ * Returns zero on success
+ */
+static int ecryptfs_read_headers_virt(char *page_virt,
+				      struct ecryptfs_crypt_stat *crypt_stat,
+				      struct dentry *ecryptfs_dentry,
+				      int validate_header_size)
+{
+	int rc = 0;
+	int offset;
+	int bytes_read;
+
+	ecryptfs_set_default_sizes(crypt_stat);
+	crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
+		ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	offset = ECRYPTFS_FILE_SIZE_BYTES;
+	rc = ecryptfs_validate_marker(page_virt + offset);
+	if (rc)
+		goto out;
+	if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
+		ecryptfs_i_size_init(page_virt, d_inode(ecryptfs_dentry));
+	offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
+	rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset),
+				    &bytes_read);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error processing flags\n");
+		goto out;
+	}
+	if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) {
+		ecryptfs_printk(KERN_WARNING, "File version is [%d]; only "
+				"file version [%d] is supported by this "
+				"version of eCryptfs\n",
+				crypt_stat->file_version,
+				ECRYPTFS_SUPPORTED_FILE_VERSION);
+		rc = -EINVAL;
+		goto out;
+	}
+	offset += bytes_read;
+	if (crypt_stat->file_version >= 1) {
+		rc = parse_header_metadata(crypt_stat, (page_virt + offset),
+					   &bytes_read, validate_header_size);
+		if (rc) {
+			ecryptfs_printk(KERN_WARNING, "Error reading header "
+					"metadata; rc = [%d]\n", rc);
+		}
+		offset += bytes_read;
+	} else
+		set_default_header_data(crypt_stat);
+	rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset),
+				       ecryptfs_dentry);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_read_xattr_region
+ * @page_virt: The vitual address into which to read the xattr data
+ * @ecryptfs_inode: The eCryptfs inode
+ *
+ * Attempts to read the crypto metadata from the extended attribute
+ * region of the lower file.
+ *
+ * Returns zero on success; non-zero on error
+ */
+int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode)
+{
+	struct dentry *lower_dentry =
+		ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry;
+	ssize_t size;
+	int rc = 0;
+
+	size = ecryptfs_getxattr_lower(lower_dentry, ECRYPTFS_XATTR_NAME,
+				       page_virt, ECRYPTFS_DEFAULT_EXTENT_SIZE);
+	if (size < 0) {
+		if (unlikely(ecryptfs_verbosity > 0))
+			printk(KERN_INFO "Error attempting to read the [%s] "
+			       "xattr from the lower file; return value = "
+			       "[%zd]\n", ECRYPTFS_XATTR_NAME, size);
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
+					    struct inode *inode)
+{
+	u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+	u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
+	int rc;
+
+	rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
+				     ECRYPTFS_XATTR_NAME, file_size,
+				     ECRYPTFS_SIZE_AND_MARKER_BYTES);
+	if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+		return rc >= 0 ? -EINVAL : rc;
+	rc = ecryptfs_validate_marker(marker);
+	if (!rc)
+		ecryptfs_i_size_init(file_size, inode);
+	return rc;
+}
+
+/**
+ * ecryptfs_read_metadata
+ *
+ * Common entry point for reading file metadata. From here, we could
+ * retrieve the header information from the header region of the file,
+ * the xattr region of the file, or some other repostory that is
+ * stored separately from the file itself. The current implementation
+ * supports retrieving the metadata information from the file contents
+ * and from the xattr region.
+ *
+ * Returns zero if valid headers found and parsed; non-zero otherwise
+ */
+int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
+{
+	int rc;
+	char *page_virt;
+	struct inode *ecryptfs_inode = d_inode(ecryptfs_dentry);
+	struct ecryptfs_crypt_stat *crypt_stat =
+	    &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(
+			ecryptfs_dentry->d_sb)->mount_crypt_stat;
+
+	ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
+						      mount_crypt_stat);
+	/* Read the first page from the underlying file */
+	page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER);
+	if (!page_virt) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "%s: Unable to allocate page_virt\n",
+		       __func__);
+		goto out;
+	}
+	rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size,
+				 ecryptfs_inode);
+	if (rc >= 0)
+		rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
+						ecryptfs_dentry,
+						ECRYPTFS_VALIDATE_HEADER_SIZE);
+	if (rc) {
+		/* metadata is not in the file header, so try xattrs */
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
+		rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
+		if (rc) {
+			printk(KERN_DEBUG "Valid eCryptfs headers not found in "
+			       "file header region or xattr region, inode %lu\n",
+				ecryptfs_inode->i_ino);
+			rc = -EINVAL;
+			goto out;
+		}
+		rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
+						ecryptfs_dentry,
+						ECRYPTFS_DONT_VALIDATE_HEADER_SIZE);
+		if (rc) {
+			printk(KERN_DEBUG "Valid eCryptfs headers not found in "
+			       "file xattr region either, inode %lu\n",
+				ecryptfs_inode->i_ino);
+			rc = -EINVAL;
+		}
+		if (crypt_stat->mount_crypt_stat->flags
+		    & ECRYPTFS_XATTR_METADATA_ENABLED) {
+			crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+		} else {
+			printk(KERN_WARNING "Attempt to access file with "
+			       "crypto metadata only in the extended attribute "
+			       "region, but eCryptfs was mounted without "
+			       "xattr support enabled. eCryptfs will not treat "
+			       "this like an encrypted file, inode %lu\n",
+				ecryptfs_inode->i_ino);
+			rc = -EINVAL;
+		}
+	}
+out:
+	if (page_virt) {
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
+		kmem_cache_free(ecryptfs_header_cache, page_virt);
+	}
+	return rc;
+}
+
+/**
+ * ecryptfs_encrypt_filename - encrypt filename
+ *
+ * CBC-encrypts the filename. We do not want to encrypt the same
+ * filename with the same key and IV, which may happen with hard
+ * links, so we prepend random bits to each filename.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int
+ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
+			  struct ecryptfs_crypt_stat *crypt_stat,
+			  struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	int rc = 0;
+
+	filename->encrypted_filename = NULL;
+	filename->encrypted_filename_size = 0;
+	if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+	    || (mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+		size_t packet_size;
+		size_t remaining_bytes;
+
+		rc = ecryptfs_write_tag_70_packet(
+			NULL, NULL,
+			&filename->encrypted_filename_size,
+			mount_crypt_stat, NULL,
+			filename->filename_size);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to get packet "
+			       "size for tag 72; rc = [%d]\n", __func__,
+			       rc);
+			filename->encrypted_filename_size = 0;
+			goto out;
+		}
+		filename->encrypted_filename =
+			kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
+		if (!filename->encrypted_filename) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%zd] bytes\n", __func__,
+			       filename->encrypted_filename_size);
+			rc = -ENOMEM;
+			goto out;
+		}
+		remaining_bytes = filename->encrypted_filename_size;
+		rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
+						  &remaining_bytes,
+						  &packet_size,
+						  mount_crypt_stat,
+						  filename->filename,
+						  filename->filename_size);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to generate "
+			       "tag 70 packet; rc = [%d]\n", __func__,
+			       rc);
+			kfree(filename->encrypted_filename);
+			filename->encrypted_filename = NULL;
+			filename->encrypted_filename_size = 0;
+			goto out;
+		}
+		filename->encrypted_filename_size = packet_size;
+	} else {
+		printk(KERN_ERR "%s: No support for requested filename "
+		       "encryption method in this release\n", __func__);
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
+				  const char *name, size_t name_size)
+{
+	int rc = 0;
+
+	(*copied_name) = kmalloc((name_size + 1), GFP_KERNEL);
+	if (!(*copied_name)) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	memcpy((void *)(*copied_name), (void *)name, name_size);
+	(*copied_name)[(name_size)] = '\0';	/* Only for convenience
+						 * in printing out the
+						 * string in debug
+						 * messages */
+	(*copied_name_size) = name_size;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_process_key_cipher - Perform key cipher initialization.
+ * @key_tfm: Crypto context for key material, set by this function
+ * @cipher_name: Name of the cipher
+ * @key_size: Size of the key in bytes
+ *
+ * Returns zero on success. Any crypto_tfm structs allocated here
+ * should be released by other functions, such as on a superblock put
+ * event, regardless of whether this function succeeds for fails.
+ */
+static int
+ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
+			    char *cipher_name, size_t *key_size)
+{
+	char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
+	char *full_alg_name = NULL;
+	int rc;
+
+	*key_tfm = NULL;
+	if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
+		      "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
+		goto out;
+	}
+	rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, cipher_name,
+						    "ecb");
+	if (rc)
+		goto out;
+	*key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(*key_tfm)) {
+		rc = PTR_ERR(*key_tfm);
+		printk(KERN_ERR "Unable to allocate crypto cipher with name "
+		       "[%s]; rc = [%d]\n", full_alg_name, rc);
+		goto out;
+	}
+	crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	if (*key_size == 0) {
+		struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm);
+
+		*key_size = alg->max_keysize;
+	}
+	get_random_bytes(dummy_key, *key_size);
+	rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
+	if (rc) {
+		printk(KERN_ERR "Error attempting to set key of size [%zd] for "
+		       "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
+		       rc);
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	kfree(full_alg_name);
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_key_tfm_cache;
+static struct list_head key_tfm_list;
+struct mutex key_tfm_list_mutex;
+
+int __init ecryptfs_init_crypto(void)
+{
+	mutex_init(&key_tfm_list_mutex);
+	INIT_LIST_HEAD(&key_tfm_list);
+	return 0;
+}
+
+/**
+ * ecryptfs_destroy_crypto - free all cached key_tfms on key_tfm_list
+ *
+ * Called only at module unload time
+ */
+int ecryptfs_destroy_crypto(void)
+{
+	struct ecryptfs_key_tfm *key_tfm, *key_tfm_tmp;
+
+	mutex_lock(&key_tfm_list_mutex);
+	list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list,
+				 key_tfm_list) {
+		list_del(&key_tfm->key_tfm_list);
+		if (key_tfm->key_tfm)
+			crypto_free_blkcipher(key_tfm->key_tfm);
+		kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm);
+	}
+	mutex_unlock(&key_tfm_list_mutex);
+	return 0;
+}
+
+int
+ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
+			 size_t key_size)
+{
+	struct ecryptfs_key_tfm *tmp_tfm;
+	int rc = 0;
+
+	BUG_ON(!mutex_is_locked(&key_tfm_list_mutex));
+
+	tmp_tfm = kmem_cache_alloc(ecryptfs_key_tfm_cache, GFP_KERNEL);
+	if (key_tfm != NULL)
+		(*key_tfm) = tmp_tfm;
+	if (!tmp_tfm) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "Error attempting to allocate from "
+		       "ecryptfs_key_tfm_cache\n");
+		goto out;
+	}
+	mutex_init(&tmp_tfm->key_tfm_mutex);
+	strncpy(tmp_tfm->cipher_name, cipher_name,
+		ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+	tmp_tfm->cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
+	tmp_tfm->key_size = key_size;
+	rc = ecryptfs_process_key_cipher(&tmp_tfm->key_tfm,
+					 tmp_tfm->cipher_name,
+					 &tmp_tfm->key_size);
+	if (rc) {
+		printk(KERN_ERR "Error attempting to initialize key TFM "
+		       "cipher with name = [%s]; rc = [%d]\n",
+		       tmp_tfm->cipher_name, rc);
+		kmem_cache_free(ecryptfs_key_tfm_cache, tmp_tfm);
+		if (key_tfm != NULL)
+			(*key_tfm) = NULL;
+		goto out;
+	}
+	list_add(&tmp_tfm->key_tfm_list, &key_tfm_list);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_tfm_exists - Search for existing tfm for cipher_name.
+ * @cipher_name: the name of the cipher to search for
+ * @key_tfm: set to corresponding tfm if found
+ *
+ * Searches for cached key_tfm matching @cipher_name
+ * Must be called with &key_tfm_list_mutex held
+ * Returns 1 if found, with @key_tfm set
+ * Returns 0 if not found, with @key_tfm set to NULL
+ */
+int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm)
+{
+	struct ecryptfs_key_tfm *tmp_key_tfm;
+
+	BUG_ON(!mutex_is_locked(&key_tfm_list_mutex));
+
+	list_for_each_entry(tmp_key_tfm, &key_tfm_list, key_tfm_list) {
+		if (strcmp(tmp_key_tfm->cipher_name, cipher_name) == 0) {
+			if (key_tfm)
+				(*key_tfm) = tmp_key_tfm;
+			return 1;
+		}
+	}
+	if (key_tfm)
+		(*key_tfm) = NULL;
+	return 0;
+}
+
+/**
+ * ecryptfs_get_tfm_and_mutex_for_cipher_name
+ *
+ * @tfm: set to cached tfm found, or new tfm created
+ * @tfm_mutex: set to mutex for cached tfm found, or new tfm created
+ * @cipher_name: the name of the cipher to search for and/or add
+ *
+ * Sets pointers to @tfm & @tfm_mutex matching @cipher_name.
+ * Searches for cached item first, and creates new if not found.
+ * Returns 0 on success, non-zero if adding new cipher failed
+ */
+int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
+					       struct mutex **tfm_mutex,
+					       char *cipher_name)
+{
+	struct ecryptfs_key_tfm *key_tfm;
+	int rc = 0;
+
+	(*tfm) = NULL;
+	(*tfm_mutex) = NULL;
+
+	mutex_lock(&key_tfm_list_mutex);
+	if (!ecryptfs_tfm_exists(cipher_name, &key_tfm)) {
+		rc = ecryptfs_add_new_key_tfm(&key_tfm, cipher_name, 0);
+		if (rc) {
+			printk(KERN_ERR "Error adding new key_tfm to list; "
+					"rc = [%d]\n", rc);
+			goto out;
+		}
+	}
+	(*tfm) = key_tfm->key_tfm;
+	(*tfm_mutex) = &key_tfm->key_tfm_mutex;
+out:
+	mutex_unlock(&key_tfm_list_mutex);
+	return rc;
+}
+
+/* 64 characters forming a 6-bit target field */
+static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
+						 "EFGHIJKLMNOPQRST"
+						 "UVWXYZabcdefghij"
+						 "klmnopqrstuvwxyz");
+
+/* We could either offset on every reverse map or just pad some 0x00's
+ * at the front here */
+static const unsigned char filename_rev_map[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
+	0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
+	0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
+	0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
+	0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
+	0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
+	0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
+	0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
+	0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
+	0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
+	0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */
+};
+
+/**
+ * ecryptfs_encode_for_filename
+ * @dst: Destination location for encoded filename
+ * @dst_size: Size of the encoded filename in bytes
+ * @src: Source location for the filename to encode
+ * @src_size: Size of the source in bytes
+ */
+static void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
+				  unsigned char *src, size_t src_size)
+{
+	size_t num_blocks;
+	size_t block_num = 0;
+	size_t dst_offset = 0;
+	unsigned char last_block[3];
+
+	if (src_size == 0) {
+		(*dst_size) = 0;
+		goto out;
+	}
+	num_blocks = (src_size / 3);
+	if ((src_size % 3) == 0) {
+		memcpy(last_block, (&src[src_size - 3]), 3);
+	} else {
+		num_blocks++;
+		last_block[2] = 0x00;
+		switch (src_size % 3) {
+		case 1:
+			last_block[0] = src[src_size - 1];
+			last_block[1] = 0x00;
+			break;
+		case 2:
+			last_block[0] = src[src_size - 2];
+			last_block[1] = src[src_size - 1];
+		}
+	}
+	(*dst_size) = (num_blocks * 4);
+	if (!dst)
+		goto out;
+	while (block_num < num_blocks) {
+		unsigned char *src_block;
+		unsigned char dst_block[4];
+
+		if (block_num == (num_blocks - 1))
+			src_block = last_block;
+		else
+			src_block = &src[block_num * 3];
+		dst_block[0] = ((src_block[0] >> 2) & 0x3F);
+		dst_block[1] = (((src_block[0] << 4) & 0x30)
+				| ((src_block[1] >> 4) & 0x0F));
+		dst_block[2] = (((src_block[1] << 2) & 0x3C)
+				| ((src_block[2] >> 6) & 0x03));
+		dst_block[3] = (src_block[2] & 0x3F);
+		dst[dst_offset++] = portable_filename_chars[dst_block[0]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[1]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[2]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[3]];
+		block_num++;
+	}
+out:
+	return;
+}
+
+static size_t ecryptfs_max_decoded_size(size_t encoded_size)
+{
+	/* Not exact; conservatively long. Every block of 4
+	 * encoded characters decodes into a block of 3
+	 * decoded characters. This segment of code provides
+	 * the caller with the maximum amount of allocated
+	 * space that @dst will need to point to in a
+	 * subsequent call. */
+	return ((encoded_size + 1) * 3) / 4;
+}
+
+/**
+ * ecryptfs_decode_from_filename
+ * @dst: If NULL, this function only sets @dst_size and returns. If
+ *       non-NULL, this function decodes the encoded octets in @src
+ *       into the memory that @dst points to.
+ * @dst_size: Set to the size of the decoded string.
+ * @src: The encoded set of octets to decode.
+ * @src_size: The size of the encoded set of octets to decode.
+ */
+static void
+ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
+			      const unsigned char *src, size_t src_size)
+{
+	u8 current_bit_offset = 0;
+	size_t src_byte_offset = 0;
+	size_t dst_byte_offset = 0;
+
+	if (dst == NULL) {
+		(*dst_size) = ecryptfs_max_decoded_size(src_size);
+		goto out;
+	}
+	while (src_byte_offset < src_size) {
+		unsigned char src_byte =
+				filename_rev_map[(int)src[src_byte_offset]];
+
+		switch (current_bit_offset) {
+		case 0:
+			dst[dst_byte_offset] = (src_byte << 2);
+			current_bit_offset = 6;
+			break;
+		case 6:
+			dst[dst_byte_offset++] |= (src_byte >> 4);
+			dst[dst_byte_offset] = ((src_byte & 0xF)
+						 << 4);
+			current_bit_offset = 4;
+			break;
+		case 4:
+			dst[dst_byte_offset++] |= (src_byte >> 2);
+			dst[dst_byte_offset] = (src_byte << 6);
+			current_bit_offset = 2;
+			break;
+		case 2:
+			dst[dst_byte_offset++] |= (src_byte);
+			current_bit_offset = 0;
+			break;
+		}
+		src_byte_offset++;
+	}
+	(*dst_size) = dst_byte_offset;
+out:
+	return;
+}
+
+/**
+ * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
+ * @crypt_stat: The crypt_stat struct associated with the file anem to encode
+ * @name: The plaintext name
+ * @length: The length of the plaintext
+ * @encoded_name: The encypted name
+ *
+ * Encrypts and encodes a filename into something that constitutes a
+ * valid filename for a filesystem, with printable characters.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * Returns zero on success; non-zero on otherwise
+ */
+int ecryptfs_encrypt_and_encode_filename(
+	char **encoded_name,
+	size_t *encoded_name_size,
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	const char *name, size_t name_size)
+{
+	size_t encoded_name_no_prefix_size;
+	int rc = 0;
+
+	(*encoded_name) = NULL;
+	(*encoded_name_size) = 0;
+	if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+	    || (mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
+		struct ecryptfs_filename *filename;
+
+		filename = kzalloc(sizeof(*filename), GFP_KERNEL);
+		if (!filename) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kzalloc [%zd] bytes\n", __func__,
+			       sizeof(*filename));
+			rc = -ENOMEM;
+			goto out;
+		}
+		filename->filename = (char *)name;
+		filename->filename_size = name_size;
+		rc = ecryptfs_encrypt_filename(filename, crypt_stat,
+					       mount_crypt_stat);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to encrypt "
+			       "filename; rc = [%d]\n", __func__, rc);
+			kfree(filename);
+			goto out;
+		}
+		ecryptfs_encode_for_filename(
+			NULL, &encoded_name_no_prefix_size,
+			filename->encrypted_filename,
+			filename->encrypted_filename_size);
+		if ((crypt_stat && (crypt_stat->flags
+				    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+		    || (mount_crypt_stat
+			&& (mount_crypt_stat->flags
+			    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+			(*encoded_name_size) =
+				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+		else
+			(*encoded_name_size) =
+				(ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+		(*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
+		if (!(*encoded_name)) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kzalloc [%zd] bytes\n", __func__,
+			       (*encoded_name_size));
+			rc = -ENOMEM;
+			kfree(filename->encrypted_filename);
+			kfree(filename);
+			goto out;
+		}
+		if ((crypt_stat && (crypt_stat->flags
+				    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+		    || (mount_crypt_stat
+			&& (mount_crypt_stat->flags
+			    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+			memcpy((*encoded_name),
+			       ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			       ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
+			ecryptfs_encode_for_filename(
+			    ((*encoded_name)
+			     + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
+			    &encoded_name_no_prefix_size,
+			    filename->encrypted_filename,
+			    filename->encrypted_filename_size);
+			(*encoded_name_size) =
+				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+			(*encoded_name)[(*encoded_name_size)] = '\0';
+		} else {
+			rc = -EOPNOTSUPP;
+		}
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to encode "
+			       "encrypted filename; rc = [%d]\n", __func__,
+			       rc);
+			kfree((*encoded_name));
+			(*encoded_name) = NULL;
+			(*encoded_name_size) = 0;
+		}
+		kfree(filename->encrypted_filename);
+		kfree(filename);
+	} else {
+		rc = ecryptfs_copy_filename(encoded_name,
+					    encoded_name_size,
+					    name, name_size);
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
+ * @plaintext_name: The plaintext name
+ * @plaintext_name_size: The plaintext name size
+ * @ecryptfs_dir_dentry: eCryptfs directory dentry
+ * @name: The filename in cipher text
+ * @name_size: The cipher text name size
+ *
+ * Decrypts and decodes the filename.
+ *
+ * Returns zero on error; non-zero otherwise
+ */
+int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
+					 size_t *plaintext_name_size,
+					 struct super_block *sb,
+					 const char *name, size_t name_size)
+{
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
+	char *decoded_name;
+	size_t decoded_name_size;
+	size_t packet_size;
+	int rc = 0;
+
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
+	    && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
+	    && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
+		const char *orig_name = name;
+		size_t orig_name_size = name_size;
+
+		name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+		name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+		ecryptfs_decode_from_filename(NULL, &decoded_name_size,
+					      name, name_size);
+		decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
+		if (!decoded_name) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%zd] bytes\n", __func__,
+			       decoded_name_size);
+			rc = -ENOMEM;
+			goto out;
+		}
+		ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
+					      name, name_size);
+		rc = ecryptfs_parse_tag_70_packet(plaintext_name,
+						  plaintext_name_size,
+						  &packet_size,
+						  mount_crypt_stat,
+						  decoded_name,
+						  decoded_name_size);
+		if (rc) {
+			printk(KERN_INFO "%s: Could not parse tag 70 packet "
+			       "from filename; copying through filename "
+			       "as-is\n", __func__);
+			rc = ecryptfs_copy_filename(plaintext_name,
+						    plaintext_name_size,
+						    orig_name, orig_name_size);
+			goto out_free;
+		}
+	} else {
+		rc = ecryptfs_copy_filename(plaintext_name,
+					    plaintext_name_size,
+					    name, name_size);
+		goto out;
+	}
+out_free:
+	kfree(decoded_name);
+out:
+	return rc;
+}
+
+#define ENC_NAME_MAX_BLOCKLEN_8_OR_16	143
+
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+			   struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	struct blkcipher_desc desc;
+	struct mutex *tfm_mutex;
+	size_t cipher_blocksize;
+	int rc;
+
+	if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
+		(*namelen) = lower_namelen;
+		return 0;
+	}
+
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+			mount_crypt_stat->global_default_fn_cipher_name);
+	if (unlikely(rc)) {
+		(*namelen) = 0;
+		return rc;
+	}
+
+	mutex_lock(tfm_mutex);
+	cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm);
+	mutex_unlock(tfm_mutex);
+
+	/* Return an exact amount for the common cases */
+	if (lower_namelen == NAME_MAX
+	    && (cipher_blocksize == 8 || cipher_blocksize == 16)) {
+		(*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16;
+		return 0;
+	}
+
+	/* Return a safe estimate for the uncommon cases */
+	(*namelen) = lower_namelen;
+	(*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+	/* Since this is the max decoded size, subtract 1 "decoded block" len */
+	(*namelen) = ecryptfs_max_decoded_size(*namelen) - 3;
+	(*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE;
+	(*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES;
+	/* Worst case is that the filename is padded nearly a full block size */
+	(*namelen) -= cipher_blocksize - 1;
+
+	if ((*namelen) < 0)
+		(*namelen) = 0;
+
+	return 0;
+}
diff --git a/kernel/fs/ecryptfs/debug.c b/kernel/fs/ecryptfs/debug.c
new file mode 100644
index 000000000..3d2bdf546
--- /dev/null
+++ b/kernel/fs/ecryptfs/debug.c
@@ -0,0 +1,121 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * Functions only useful for debugging.
+ *
+ * Copyright (C) 2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_dump_auth_tok - debug function to print auth toks
+ *
+ * This function will print the contents of an ecryptfs authentication
+ * token.
+ */
+void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok)
+{
+	char salt[ECRYPTFS_SALT_SIZE * 2 + 1];
+	char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
+
+	ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n",
+			auth_tok);
+	if (auth_tok->flags & ECRYPTFS_PRIVATE_KEY) {
+		ecryptfs_printk(KERN_DEBUG, " * private key type\n");
+	} else {
+		ecryptfs_printk(KERN_DEBUG, " * passphrase type\n");
+		ecryptfs_to_hex(salt, auth_tok->token.password.salt,
+				ECRYPTFS_SALT_SIZE);
+		salt[ECRYPTFS_SALT_SIZE * 2] = '\0';
+		ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt);
+		if (auth_tok->token.password.flags &
+		    ECRYPTFS_PERSISTENT_PASSWORD) {
+			ecryptfs_printk(KERN_DEBUG, " * persistent\n");
+		}
+		memcpy(sig, auth_tok->token.password.signature,
+		       ECRYPTFS_SIG_SIZE_HEX);
+		sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+		ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig);
+	}
+	ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n",
+			auth_tok->session_key.flags);
+	if (auth_tok->session_key.flags
+	    & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT)
+		ecryptfs_printk(KERN_DEBUG,
+				" * Userspace decrypt request set\n");
+	if (auth_tok->session_key.flags
+	    & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT)
+		ecryptfs_printk(KERN_DEBUG,
+				" * Userspace encrypt request set\n");
+	if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) {
+		ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n");
+		ecryptfs_printk(KERN_DEBUG,
+				" * session_key.decrypted_key_size = [0x%x]\n",
+				auth_tok->session_key.decrypted_key_size);
+		ecryptfs_printk(KERN_DEBUG, " * Decrypted session key "
+				"dump:\n");
+		if (ecryptfs_verbosity > 0)
+			ecryptfs_dump_hex(auth_tok->session_key.decrypted_key,
+					  ECRYPTFS_DEFAULT_KEY_BYTES);
+	}
+	if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) {
+		ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n");
+		ecryptfs_printk(KERN_DEBUG,
+				" * session_key.encrypted_key_size = [0x%x]\n",
+				auth_tok->session_key.encrypted_key_size);
+		ecryptfs_printk(KERN_DEBUG, " * Encrypted session key "
+				"dump:\n");
+		if (ecryptfs_verbosity > 0)
+			ecryptfs_dump_hex(auth_tok->session_key.encrypted_key,
+					  auth_tok->session_key.
+					  encrypted_key_size);
+	}
+}
+
+/**
+ * ecryptfs_dump_hex - debug hex printer
+ * @data: string of bytes to be printed
+ * @bytes: number of bytes to print
+ *
+ * Dump hexadecimal representation of char array
+ */
+void ecryptfs_dump_hex(char *data, int bytes)
+{
+	int i = 0;
+	int add_newline = 1;
+
+	if (ecryptfs_verbosity < 1)
+		return;
+	if (bytes != 0) {
+		printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]);
+		i++;
+	}
+	while (i < bytes) {
+		printk("0x%.2x.", (unsigned char)data[i]);
+		i++;
+		if (i % 16 == 0) {
+			printk("\n");
+			add_newline = 0;
+		} else
+			add_newline = 1;
+	}
+	if (add_newline)
+		printk("\n");
+}
+
diff --git a/kernel/fs/ecryptfs/dentry.c b/kernel/fs/ecryptfs/dentry.c
new file mode 100644
index 000000000..8db0b4644
--- /dev/null
+++ b/kernel/fs/ecryptfs/dentry.c
@@ -0,0 +1,92 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/fs_stack.h>
+#include <linux/slab.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_d_revalidate - revalidate an ecryptfs dentry
+ * @dentry: The ecryptfs dentry
+ * @flags: lookup flags
+ *
+ * Called when the VFS needs to revalidate a dentry. This
+ * is called whenever a name lookup finds a dentry in the
+ * dcache. Most filesystems leave this as NULL, because all their
+ * dentries in the dcache are valid.
+ *
+ * Returns 1 if valid, 0 otherwise.
+ *
+ */
+static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	int rc;
+
+	if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE))
+		return 1;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+	if (d_really_is_positive(dentry)) {
+		struct inode *lower_inode =
+			ecryptfs_inode_to_lower(d_inode(dentry));
+
+		fsstack_copy_attr_all(d_inode(dentry), lower_inode);
+	}
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_dentry_info_cache;
+
+static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(ecryptfs_dentry_info_cache,
+		container_of(head, struct ecryptfs_dentry_info, rcu));
+}
+
+/**
+ * ecryptfs_d_release
+ * @dentry: The ecryptfs dentry
+ *
+ * Called when a dentry is really deallocated.
+ */
+static void ecryptfs_d_release(struct dentry *dentry)
+{
+	struct ecryptfs_dentry_info *p = dentry->d_fsdata;
+	if (p) {
+		path_put(&p->lower_path);
+		call_rcu(&p->rcu, ecryptfs_dentry_free_rcu);
+	}
+}
+
+const struct dentry_operations ecryptfs_dops = {
+	.d_revalidate = ecryptfs_d_revalidate,
+	.d_release = ecryptfs_d_release,
+};
diff --git a/kernel/fs/ecryptfs/ecryptfs_kernel.h b/kernel/fs/ecryptfs/ecryptfs_kernel.h
new file mode 100644
index 000000000..5ba029e62
--- /dev/null
+++ b/kernel/fs/ecryptfs/ecryptfs_kernel.h
@@ -0,0 +1,721 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * Kernel declarations.
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2008 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *              Trevor S. Highland <trevor.highland@gmail.com>
+ *              Tyler Hicks <tyhicks@ou.edu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#ifndef ECRYPTFS_KERNEL_H
+#define ECRYPTFS_KERNEL_H
+
+#include <keys/user-type.h>
+#include <keys/encrypted-type.h>
+#include <linux/fs.h>
+#include <linux/fs_stack.h>
+#include <linux/namei.h>
+#include <linux/scatterlist.h>
+#include <linux/hash.h>
+#include <linux/nsproxy.h>
+#include <linux/backing-dev.h>
+#include <linux/ecryptfs.h>
+#include <linux/crypto.h>
+
+#define ECRYPTFS_DEFAULT_IV_BYTES 16
+#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
+#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192
+#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32
+#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ
+#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3)
+#define ECRYPTFS_DEFAULT_NUM_USERS 4
+#define ECRYPTFS_MAX_NUM_USERS 32768
+#define ECRYPTFS_XATTR_NAME "user.ecryptfs"
+
+void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok);
+extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size);
+extern void ecryptfs_from_hex(char *dst, char *src, int dst_size);
+
+struct ecryptfs_key_record {
+	unsigned char type;
+	size_t enc_key_size;
+	unsigned char sig[ECRYPTFS_SIG_SIZE];
+	unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
+};
+
+struct ecryptfs_auth_tok_list {
+	struct ecryptfs_auth_tok *auth_tok;
+	struct list_head list;
+};
+
+struct ecryptfs_crypt_stat;
+struct ecryptfs_mount_crypt_stat;
+
+struct ecryptfs_page_crypt_context {
+	struct page *page;
+#define ECRYPTFS_PREPARE_COMMIT_MODE 0
+#define ECRYPTFS_WRITEPAGE_MODE      1
+	unsigned int mode;
+	union {
+		struct file *lower_file;
+		struct writeback_control *wbc;
+	} param;
+};
+
+#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
+static inline struct ecryptfs_auth_tok *
+ecryptfs_get_encrypted_key_payload_data(struct key *key)
+{
+	if (key->type == &key_type_encrypted)
+		return (struct ecryptfs_auth_tok *)
+			(&((struct encrypted_key_payload *)key->payload.data)->payload_data);
+	else
+		return NULL;
+}
+
+static inline struct key *ecryptfs_get_encrypted_key(char *sig)
+{
+	return request_key(&key_type_encrypted, sig, NULL);
+}
+
+#else
+static inline struct ecryptfs_auth_tok *
+ecryptfs_get_encrypted_key_payload_data(struct key *key)
+{
+	return NULL;
+}
+
+static inline struct key *ecryptfs_get_encrypted_key(char *sig)
+{
+	return ERR_PTR(-ENOKEY);
+}
+
+#endif /* CONFIG_ENCRYPTED_KEYS */
+
+static inline struct ecryptfs_auth_tok *
+ecryptfs_get_key_payload_data(struct key *key)
+{
+	struct ecryptfs_auth_tok *auth_tok;
+
+	auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
+	if (!auth_tok)
+		return (struct ecryptfs_auth_tok *)
+			(((struct user_key_payload *)key->payload.data)->data);
+	else
+		return auth_tok;
+}
+
+#define ECRYPTFS_MAX_KEYSET_SIZE 1024
+#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31
+#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
+#define ECRYPTFS_MAX_IV_BYTES 16	/* 128 bits */
+#define ECRYPTFS_SALT_BYTES 2
+#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
+#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8	/* 4*2 */
+#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64))
+#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \
+					+ MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
+#define ECRYPTFS_DEFAULT_CIPHER "aes"
+#define ECRYPTFS_DEFAULT_KEY_BYTES 16
+#define ECRYPTFS_DEFAULT_HASH "md5"
+#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
+#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
+#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
+#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
+#define ECRYPTFS_TAG_64_PACKET_TYPE 0x40
+#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
+#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
+#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
+#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
+					  * as dentry name */
+#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
+					  * metadata */
+#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
+					  * dentry name */
+#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
+					  * metadata */
+#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */
+#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to
+				     * ecryptfs_parse_packet_length() and
+				     * ecryptfs_write_packet_length()
+				     */
+/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
+ * ECRYPTFS_MAX_IV_BYTES */
+#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
+#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
+#define MD5_DIGEST_SIZE 16
+#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \
+					   + ECRYPTFS_SIG_SIZE + 1 + 1)
+#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \
+					   + ECRYPTFS_SIG_SIZE + 1 + 1)
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
+#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
+
+#ifdef CONFIG_ECRYPT_FS_MESSAGING
+# define ECRYPTFS_VERSIONING_MASK_MESSAGING (ECRYPTFS_VERSIONING_DEVMISC \
+					     | ECRYPTFS_VERSIONING_PUBKEY)
+#else
+# define ECRYPTFS_VERSIONING_MASK_MESSAGING 0
+#endif
+
+#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
+				  | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
+				  | ECRYPTFS_VERSIONING_XATTR \
+				  | ECRYPTFS_VERSIONING_MULTKEY \
+				  | ECRYPTFS_VERSIONING_MASK_MESSAGING \
+				  | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
+struct ecryptfs_key_sig {
+	struct list_head crypt_stat_list;
+	char keysig[ECRYPTFS_SIG_SIZE_HEX + 1];
+};
+
+struct ecryptfs_filename {
+	struct list_head crypt_stat_list;
+#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001
+	u32 flags;
+	u32 seq_no;
+	char *filename;
+	char *encrypted_filename;
+	size_t filename_size;
+	size_t encrypted_filename_size;
+	char fnek_sig[ECRYPTFS_SIG_SIZE_HEX];
+	char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1];
+};
+
+/**
+ * This is the primary struct associated with each encrypted file.
+ *
+ * TODO: cache align/pack?
+ */
+struct ecryptfs_crypt_stat {
+#define ECRYPTFS_STRUCT_INITIALIZED   0x00000001
+#define ECRYPTFS_POLICY_APPLIED       0x00000002
+#define ECRYPTFS_ENCRYPTED            0x00000004
+#define ECRYPTFS_SECURITY_WARNING     0x00000008
+#define ECRYPTFS_ENABLE_HMAC          0x00000010
+#define ECRYPTFS_ENCRYPT_IV_PAGES     0x00000020
+#define ECRYPTFS_KEY_VALID            0x00000040
+#define ECRYPTFS_METADATA_IN_XATTR    0x00000080
+#define ECRYPTFS_VIEW_AS_ENCRYPTED    0x00000100
+#define ECRYPTFS_KEY_SET              0x00000200
+#define ECRYPTFS_ENCRYPT_FILENAMES    0x00000400
+#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800
+#define ECRYPTFS_ENCFN_USE_FEK        0x00001000
+#define ECRYPTFS_UNLINK_SIGS          0x00002000
+#define ECRYPTFS_I_SIZE_INITIALIZED   0x00004000
+	u32 flags;
+	unsigned int file_version;
+	size_t iv_bytes;
+	size_t metadata_size;
+	size_t extent_size; /* Data extent size; default is 4096 */
+	size_t key_size;
+	size_t extent_shift;
+	unsigned int extent_mask;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	struct crypto_ablkcipher *tfm;
+	struct crypto_hash *hash_tfm; /* Crypto context for generating
+				       * the initialization vectors */
+	unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+	unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
+	unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
+	struct list_head keysig_list;
+	struct mutex keysig_list_mutex;
+	struct mutex cs_tfm_mutex;
+	struct mutex cs_hash_tfm_mutex;
+	struct mutex cs_mutex;
+};
+
+/* inode private data. */
+struct ecryptfs_inode_info {
+	struct inode vfs_inode;
+	struct inode *wii_inode;
+	struct mutex lower_file_mutex;
+	atomic_t lower_file_count;
+	struct file *lower_file;
+	struct ecryptfs_crypt_stat crypt_stat;
+};
+
+/* dentry private data. Each dentry must keep track of a lower
+ * vfsmount too. */
+struct ecryptfs_dentry_info {
+	struct path lower_path;
+	union {
+		struct ecryptfs_crypt_stat *crypt_stat;
+		struct rcu_head rcu;
+	};
+};
+
+/**
+ * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint
+ * @flags: Status flags
+ * @mount_crypt_stat_list: These auth_toks hang off the mount-wide
+ *                         cryptographic context. Every time a new
+ *                         inode comes into existence, eCryptfs copies
+ *                         the auth_toks on that list to the set of
+ *                         auth_toks on the inode's crypt_stat
+ * @global_auth_tok_key: The key from the user's keyring for the sig
+ * @global_auth_tok: The key contents
+ * @sig: The key identifier
+ *
+ * ecryptfs_global_auth_tok structs refer to authentication token keys
+ * in the user keyring that apply to newly created files. A list of
+ * these objects hangs off of the mount_crypt_stat struct for any
+ * given eCryptfs mount. This struct maintains a reference to both the
+ * key contents and the key itself so that the key can be put on
+ * unmount.
+ */
+struct ecryptfs_global_auth_tok {
+#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001
+#define ECRYPTFS_AUTH_TOK_FNEK    0x00000002
+	u32 flags;
+	struct list_head mount_crypt_stat_list;
+	struct key *global_auth_tok_key;
+	unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
+};
+
+/**
+ * ecryptfs_key_tfm - Persistent key tfm
+ * @key_tfm: crypto API handle to the key
+ * @key_size: Key size in bytes
+ * @key_tfm_mutex: Mutex to ensure only one operation in eCryptfs is
+ *                 using the persistent TFM at any point in time
+ * @key_tfm_list: Handle to hang this off the module-wide TFM list
+ * @cipher_name: String name for the cipher for this TFM
+ *
+ * Typically, eCryptfs will use the same ciphers repeatedly throughout
+ * the course of its operations. In order to avoid unnecessarily
+ * destroying and initializing the same cipher repeatedly, eCryptfs
+ * keeps a list of crypto API contexts around to use when needed.
+ */
+struct ecryptfs_key_tfm {
+	struct crypto_blkcipher *key_tfm;
+	size_t key_size;
+	struct mutex key_tfm_mutex;
+	struct list_head key_tfm_list;
+	unsigned char cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+};
+
+extern struct mutex key_tfm_list_mutex;
+
+/**
+ * This struct is to enable a mount-wide passphrase/salt combo. This
+ * is more or less a stopgap to provide similar functionality to other
+ * crypto filesystems like EncFS or CFS until full policy support is
+ * implemented in eCryptfs.
+ */
+struct ecryptfs_mount_crypt_stat {
+	/* Pointers to memory we do not own, do not free these */
+#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001
+#define ECRYPTFS_XATTR_METADATA_ENABLED        0x00000002
+#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED        0x00000004
+#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED  0x00000008
+#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES      0x00000010
+#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
+#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
+#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY    0x00000080
+	u32 flags;
+	struct list_head global_auth_tok_list;
+	struct mutex global_auth_tok_list_mutex;
+	size_t global_default_cipher_key_size;
+	size_t global_default_fn_cipher_key_bytes;
+	unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
+						 + 1];
+	unsigned char global_default_fn_cipher_name[
+		ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+	char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
+};
+
+/* superblock private data. */
+struct ecryptfs_sb_info {
+	struct super_block *wsi_sb;
+	struct ecryptfs_mount_crypt_stat mount_crypt_stat;
+	struct backing_dev_info bdi;
+};
+
+/* file private data. */
+struct ecryptfs_file_info {
+	struct file *wfi_file;
+	struct ecryptfs_crypt_stat *crypt_stat;
+};
+
+/* auth_tok <=> encrypted_session_key mappings */
+struct ecryptfs_auth_tok_list_item {
+	unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES];
+	struct list_head list;
+	struct ecryptfs_auth_tok auth_tok;
+};
+
+struct ecryptfs_message {
+	/* Can never be greater than ecryptfs_message_buf_len */
+	/* Used to find the parent msg_ctx */
+	/* Inherits from msg_ctx->index */
+	u32 index;
+	u32 data_len;
+	u8 data[];
+};
+
+struct ecryptfs_msg_ctx {
+#define ECRYPTFS_MSG_CTX_STATE_FREE     0x01
+#define ECRYPTFS_MSG_CTX_STATE_PENDING  0x02
+#define ECRYPTFS_MSG_CTX_STATE_DONE     0x03
+#define ECRYPTFS_MSG_CTX_STATE_NO_REPLY 0x04
+	u8 state;
+#define ECRYPTFS_MSG_HELO 100
+#define ECRYPTFS_MSG_QUIT 101
+#define ECRYPTFS_MSG_REQUEST 102
+#define ECRYPTFS_MSG_RESPONSE 103
+	u8 type;
+	u32 index;
+	/* Counter converts to a sequence number. Each message sent
+	 * out for which we expect a response has an associated
+	 * sequence number. The response must have the same sequence
+	 * number as the counter for the msg_stc for the message to be
+	 * valid. */
+	u32 counter;
+	size_t msg_size;
+	struct ecryptfs_message *msg;
+	struct task_struct *task;
+	struct list_head node;
+	struct list_head daemon_out_list;
+	struct mutex mux;
+};
+
+struct ecryptfs_daemon {
+#define ECRYPTFS_DAEMON_IN_READ      0x00000001
+#define ECRYPTFS_DAEMON_IN_POLL      0x00000002
+#define ECRYPTFS_DAEMON_ZOMBIE       0x00000004
+#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008
+	u32 flags;
+	u32 num_queued_msg_ctx;
+	struct file *file;
+	struct mutex mux;
+	struct list_head msg_ctx_out_queue;
+	wait_queue_head_t wait;
+	struct hlist_node euid_chain;
+};
+
+#ifdef CONFIG_ECRYPT_FS_MESSAGING
+extern struct mutex ecryptfs_daemon_hash_mux;
+#endif
+
+static inline size_t
+ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+		return 0;
+	return crypt_stat->metadata_size;
+}
+
+static inline struct ecryptfs_file_info *
+ecryptfs_file_to_private(struct file *file)
+{
+	return file->private_data;
+}
+
+static inline void
+ecryptfs_set_file_private(struct file *file,
+			  struct ecryptfs_file_info *file_info)
+{
+	file->private_data = file_info;
+}
+
+static inline struct file *ecryptfs_file_to_lower(struct file *file)
+{
+	return ((struct ecryptfs_file_info *)file->private_data)->wfi_file;
+}
+
+static inline void
+ecryptfs_set_file_lower(struct file *file, struct file *lower_file)
+{
+	((struct ecryptfs_file_info *)file->private_data)->wfi_file =
+		lower_file;
+}
+
+static inline struct ecryptfs_inode_info *
+ecryptfs_inode_to_private(struct inode *inode)
+{
+	return container_of(inode, struct ecryptfs_inode_info, vfs_inode);
+}
+
+static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode)
+{
+	return ecryptfs_inode_to_private(inode)->wii_inode;
+}
+
+static inline void
+ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode)
+{
+	ecryptfs_inode_to_private(inode)->wii_inode = lower_inode;
+}
+
+static inline struct ecryptfs_sb_info *
+ecryptfs_superblock_to_private(struct super_block *sb)
+{
+	return (struct ecryptfs_sb_info *)sb->s_fs_info;
+}
+
+static inline void
+ecryptfs_set_superblock_private(struct super_block *sb,
+				struct ecryptfs_sb_info *sb_info)
+{
+	sb->s_fs_info = sb_info;
+}
+
+static inline struct super_block *
+ecryptfs_superblock_to_lower(struct super_block *sb)
+{
+	return ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb;
+}
+
+static inline void
+ecryptfs_set_superblock_lower(struct super_block *sb,
+			      struct super_block *lower_sb)
+{
+	((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb;
+}
+
+static inline struct ecryptfs_dentry_info *
+ecryptfs_dentry_to_private(struct dentry *dentry)
+{
+	return (struct ecryptfs_dentry_info *)dentry->d_fsdata;
+}
+
+static inline void
+ecryptfs_set_dentry_private(struct dentry *dentry,
+			    struct ecryptfs_dentry_info *dentry_info)
+{
+	dentry->d_fsdata = dentry_info;
+}
+
+static inline struct dentry *
+ecryptfs_dentry_to_lower(struct dentry *dentry)
+{
+	return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
+}
+
+static inline struct vfsmount *
+ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
+{
+	return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt;
+}
+
+static inline struct path *
+ecryptfs_dentry_to_lower_path(struct dentry *dentry)
+{
+	return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
+}
+
+#define ecryptfs_printk(type, fmt, arg...) \
+        __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+__printf(1, 2)
+void __ecryptfs_printk(const char *fmt, ...);
+
+extern const struct file_operations ecryptfs_main_fops;
+extern const struct file_operations ecryptfs_dir_fops;
+extern const struct inode_operations ecryptfs_main_iops;
+extern const struct inode_operations ecryptfs_dir_iops;
+extern const struct inode_operations ecryptfs_symlink_iops;
+extern const struct super_operations ecryptfs_sops;
+extern const struct dentry_operations ecryptfs_dops;
+extern const struct address_space_operations ecryptfs_aops;
+extern int ecryptfs_verbosity;
+extern unsigned int ecryptfs_message_buf_len;
+extern signed long ecryptfs_message_wait_timeout;
+extern unsigned int ecryptfs_number_of_users;
+
+extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
+extern struct kmem_cache *ecryptfs_file_info_cache;
+extern struct kmem_cache *ecryptfs_dentry_info_cache;
+extern struct kmem_cache *ecryptfs_inode_info_cache;
+extern struct kmem_cache *ecryptfs_sb_info_cache;
+extern struct kmem_cache *ecryptfs_header_cache;
+extern struct kmem_cache *ecryptfs_xattr_cache;
+extern struct kmem_cache *ecryptfs_key_record_cache;
+extern struct kmem_cache *ecryptfs_key_sig_cache;
+extern struct kmem_cache *ecryptfs_global_auth_tok_cache;
+extern struct kmem_cache *ecryptfs_key_tfm_cache;
+
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+				 struct super_block *sb);
+void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
+int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
+			     struct inode *ecryptfs_inode);
+int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
+					 size_t *decrypted_name_size,
+					 struct super_block *sb,
+					 const char *name, size_t name_size);
+int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
+int ecryptfs_encrypt_and_encode_filename(
+	char **encoded_name,
+	size_t *encoded_name_size,
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	const char *name, size_t name_size);
+struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
+void ecryptfs_dump_hex(char *data, int bytes);
+int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
+			int sg_size);
+int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_rotate_iv(unsigned char *iv);
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_destroy_mount_crypt_stat(
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
+int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
+int ecryptfs_encrypt_page(struct page *page);
+int ecryptfs_decrypt_page(struct page *page);
+int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
+			    struct inode *ecryptfs_inode);
+int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
+int ecryptfs_new_file_context(struct inode *ecryptfs_inode);
+void ecryptfs_write_crypt_stat_flags(char *page_virt,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     size_t *written);
+int ecryptfs_read_and_validate_header_region(struct inode *inode);
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
+					    struct inode *inode);
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
+int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
+void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_generate_key_packet_set(char *dest_base,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     struct dentry *ecryptfs_dentry,
+				     size_t *len, size_t max);
+int
+ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
+			  unsigned char *src, struct dentry *ecryptfs_dentry);
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
+ssize_t
+ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
+			void *value, size_t size);
+int
+ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		  size_t size, int flags);
+int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
+#ifdef CONFIG_ECRYPT_FS_MESSAGING
+int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
+			      struct ecryptfs_message *msg, u32 seq);
+int ecryptfs_send_message(char *data, int data_len,
+			  struct ecryptfs_msg_ctx **msg_ctx);
+int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
+			       struct ecryptfs_message **emsg);
+int ecryptfs_init_messaging(void);
+void ecryptfs_release_messaging(void);
+#else
+static inline int ecryptfs_init_messaging(void)
+{
+	return 0;
+}
+static inline void ecryptfs_release_messaging(void)
+{ }
+static inline int ecryptfs_send_message(char *data, int data_len,
+					struct ecryptfs_msg_ctx **msg_ctx)
+{
+	return -ENOTCONN;
+}
+static inline int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
+					     struct ecryptfs_message **emsg)
+{
+	return -ENOMSG;
+}
+#endif
+
+void
+ecryptfs_write_header_metadata(char *virt,
+			       struct ecryptfs_crypt_stat *crypt_stat,
+			       size_t *written);
+int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig);
+int
+ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			   char *sig, u32 global_auth_tok_flags);
+int ecryptfs_get_global_auth_tok_for_sig(
+	struct ecryptfs_global_auth_tok **global_auth_tok,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig);
+int
+ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
+			 size_t key_size);
+int ecryptfs_init_crypto(void);
+int ecryptfs_destroy_crypto(void);
+int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm);
+int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
+					       struct mutex **tfm_mutex,
+					       char *cipher_name);
+int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
+				      struct ecryptfs_auth_tok **auth_tok,
+				      char *sig);
+int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
+			 loff_t offset, size_t size);
+int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
+				      struct page *page_for_lower,
+				      size_t offset_in_page, size_t size);
+int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
+int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
+			struct inode *ecryptfs_inode);
+int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
+				     pgoff_t page_index,
+				     size_t offset_in_page, size_t size,
+				     struct inode *ecryptfs_inode);
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
+int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
+				 size_t *length_size);
+int ecryptfs_write_packet_length(char *dest, size_t size,
+				 size_t *packet_size_length);
+#ifdef CONFIG_ECRYPT_FS_MESSAGING
+int ecryptfs_init_ecryptfs_miscdev(void);
+void ecryptfs_destroy_ecryptfs_miscdev(void);
+int ecryptfs_send_miscdev(char *data, size_t data_size,
+			  struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
+			  u16 msg_flags, struct ecryptfs_daemon *daemon);
+void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);
+int
+ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file);
+int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
+int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon);
+#endif
+int ecryptfs_init_kthread(void);
+void ecryptfs_destroy_kthread(void);
+int ecryptfs_privileged_open(struct file **lower_file,
+			     struct dentry *lower_dentry,
+			     struct vfsmount *lower_mnt,
+			     const struct cred *cred);
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode);
+void ecryptfs_put_lower_file(struct inode *inode);
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *filename, size_t filename_size);
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *data, size_t max_packet_size);
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+			   struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+		       loff_t offset);
+
+#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/kernel/fs/ecryptfs/file.c b/kernel/fs/ecryptfs/file.c
new file mode 100644
index 000000000..72afcc629
--- /dev/null
+++ b/kernel/fs/ecryptfs/file.c
@@ -0,0 +1,375 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2007 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
+ *   		Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#include <linux/compat.h>
+#include <linux/fs_stack.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_read_update_atime
+ *
+ * generic_file_read updates the atime of upper layer inode.  But, it
+ * doesn't give us a chance to update the atime of the lower layer
+ * inode.  This function is a wrapper to generic_file_read.  It
+ * updates the atime of the lower level inode if generic_file_read
+ * returns without any errors. This is to be used only for file reads.
+ * The function to be used for directory reads is ecryptfs_read.
+ */
+static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
+				struct iov_iter *to)
+{
+	ssize_t rc;
+	struct path *path;
+	struct file *file = iocb->ki_filp;
+
+	rc = generic_file_read_iter(iocb, to);
+	if (rc >= 0) {
+		path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
+		touch_atime(path);
+	}
+	return rc;
+}
+
+struct ecryptfs_getdents_callback {
+	struct dir_context ctx;
+	struct dir_context *caller;
+	struct super_block *sb;
+	int filldir_called;
+	int entries_written;
+};
+
+/* Inspired by generic filldir in fs/readdir.c */
+static int
+ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
+		 int lower_namelen, loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct ecryptfs_getdents_callback *buf =
+		container_of(ctx, struct ecryptfs_getdents_callback, ctx);
+	size_t name_size;
+	char *name;
+	int rc;
+
+	buf->filldir_called++;
+	rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
+						  buf->sb, lower_name,
+						  lower_namelen);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to decode and decrypt "
+		       "filename [%s]; rc = [%d]\n", __func__, lower_name,
+		       rc);
+		goto out;
+	}
+	buf->caller->pos = buf->ctx.pos;
+	rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
+	kfree(name);
+	if (!rc)
+		buf->entries_written++;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_readdir
+ * @file: The eCryptfs directory file
+ * @ctx: The actor to feed the entries to
+ */
+static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	int rc;
+	struct file *lower_file;
+	struct inode *inode = file_inode(file);
+	struct ecryptfs_getdents_callback buf = {
+		.ctx.actor = ecryptfs_filldir,
+		.caller = ctx,
+		.sb = inode->i_sb,
+	};
+	lower_file = ecryptfs_file_to_lower(file);
+	lower_file->f_pos = ctx->pos;
+	rc = iterate_dir(lower_file, &buf.ctx);
+	ctx->pos = buf.ctx.pos;
+	if (rc < 0)
+		goto out;
+	if (buf.filldir_called && !buf.entries_written)
+		goto out;
+	if (rc >= 0)
+		fsstack_copy_attr_atime(inode,
+					file_inode(lower_file));
+out:
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_file_info_cache;
+
+static int read_or_initialize_metadata(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	int rc;
+
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+						inode->i_sb)->mount_crypt_stat;
+	mutex_lock(&crypt_stat->cs_mutex);
+
+	if (crypt_stat->flags & ECRYPTFS_POLICY_APPLIED &&
+	    crypt_stat->flags & ECRYPTFS_KEY_VALID) {
+		rc = 0;
+		goto out;
+	}
+
+	rc = ecryptfs_read_metadata(dentry);
+	if (!rc)
+		goto out;
+
+	if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) {
+		crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED
+				       | ECRYPTFS_ENCRYPTED);
+		rc = 0;
+		goto out;
+	}
+
+	if (!(mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) &&
+	    !i_size_read(ecryptfs_inode_to_lower(inode))) {
+		rc = ecryptfs_initialize_file(dentry, inode);
+		if (!rc)
+			goto out;
+	}
+
+	rc = -EIO;
+out:
+	mutex_unlock(&crypt_stat->cs_mutex);
+	return rc;
+}
+
+/**
+ * ecryptfs_open
+ * @inode: inode speciying file to open
+ * @file: Structure to return filled in
+ *
+ * Opens the file specified by inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_open(struct inode *inode, struct file *file)
+{
+	int rc = 0;
+	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	struct dentry *ecryptfs_dentry = file->f_path.dentry;
+	/* Private value of ecryptfs_dentry allocated in
+	 * ecryptfs_lookup() */
+	struct ecryptfs_file_info *file_info;
+
+	/* Released in ecryptfs_release or end of function if failure */
+	file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
+	ecryptfs_set_file_private(file, file_info);
+	if (!file_info) {
+		ecryptfs_printk(KERN_ERR,
+				"Error attempting to allocate memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	mutex_lock(&crypt_stat->cs_mutex);
+	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) {
+		ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n");
+		/* Policy code enabled in future release */
+		crypt_stat->flags |= (ECRYPTFS_POLICY_APPLIED
+				      | ECRYPTFS_ENCRYPTED);
+	}
+	mutex_unlock(&crypt_stat->cs_mutex);
+	rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+			"the lower file for the dentry with name "
+			"[%pd]; rc = [%d]\n", __func__,
+			ecryptfs_dentry, rc);
+		goto out_free;
+	}
+	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
+	    == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
+		rc = -EPERM;
+		printk(KERN_WARNING "%s: Lower file is RO; eCryptfs "
+		       "file must hence be opened RO\n", __func__);
+		goto out_put;
+	}
+	ecryptfs_set_file_lower(
+		file, ecryptfs_inode_to_private(inode)->lower_file);
+	if (d_is_dir(ecryptfs_dentry)) {
+		ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
+		mutex_lock(&crypt_stat->cs_mutex);
+		crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
+		mutex_unlock(&crypt_stat->cs_mutex);
+		rc = 0;
+		goto out;
+	}
+	rc = read_or_initialize_metadata(ecryptfs_dentry);
+	if (rc)
+		goto out_put;
+	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
+			"[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
+			(unsigned long long)i_size_read(inode));
+	goto out;
+out_put:
+	ecryptfs_put_lower_file(inode);
+out_free:
+	kmem_cache_free(ecryptfs_file_info_cache,
+			ecryptfs_file_to_private(file));
+out:
+	return rc;
+}
+
+static int ecryptfs_flush(struct file *file, fl_owner_t td)
+{
+	struct file *lower_file = ecryptfs_file_to_lower(file);
+
+	if (lower_file->f_op->flush) {
+		filemap_write_and_wait(file->f_mapping);
+		return lower_file->f_op->flush(lower_file, td);
+	}
+
+	return 0;
+}
+
+static int ecryptfs_release(struct inode *inode, struct file *file)
+{
+	ecryptfs_put_lower_file(inode);
+	kmem_cache_free(ecryptfs_file_info_cache,
+			ecryptfs_file_to_private(file));
+	return 0;
+}
+
+static int
+ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	int rc;
+
+	rc = filemap_write_and_wait(file->f_mapping);
+	if (rc)
+		return rc;
+
+	return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
+}
+
+static int ecryptfs_fasync(int fd, struct file *file, int flag)
+{
+	int rc = 0;
+	struct file *lower_file = NULL;
+
+	lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file->f_op->fasync)
+		rc = lower_file->f_op->fasync(fd, lower_file, flag);
+	return rc;
+}
+
+static long
+ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct file *lower_file = ecryptfs_file_to_lower(file);
+	long rc = -ENOTTY;
+
+	if (!lower_file->f_op->unlocked_ioctl)
+		return rc;
+
+	switch (cmd) {
+	case FITRIM:
+	case FS_IOC_GETFLAGS:
+	case FS_IOC_SETFLAGS:
+	case FS_IOC_GETVERSION:
+	case FS_IOC_SETVERSION:
+		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
+		fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
+
+		return rc;
+	default:
+		return rc;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long
+ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct file *lower_file = ecryptfs_file_to_lower(file);
+	long rc = -ENOIOCTLCMD;
+
+	if (!lower_file->f_op->compat_ioctl)
+		return rc;
+
+	switch (cmd) {
+	case FITRIM:
+	case FS_IOC32_GETFLAGS:
+	case FS_IOC32_SETFLAGS:
+	case FS_IOC32_GETVERSION:
+	case FS_IOC32_SETVERSION:
+		rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
+		fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
+
+		return rc;
+	default:
+		return rc;
+	}
+}
+#endif
+
+const struct file_operations ecryptfs_dir_fops = {
+	.iterate = ecryptfs_readdir,
+	.read = generic_read_dir,
+	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = ecryptfs_compat_ioctl,
+#endif
+	.open = ecryptfs_open,
+	.flush = ecryptfs_flush,
+	.release = ecryptfs_release,
+	.fsync = ecryptfs_fsync,
+	.fasync = ecryptfs_fasync,
+	.splice_read = generic_file_splice_read,
+	.llseek = default_llseek,
+};
+
+const struct file_operations ecryptfs_main_fops = {
+	.llseek = generic_file_llseek,
+	.read_iter = ecryptfs_read_update_atime,
+	.write_iter = generic_file_write_iter,
+	.iterate = ecryptfs_readdir,
+	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = ecryptfs_compat_ioctl,
+#endif
+	.mmap = generic_file_mmap,
+	.open = ecryptfs_open,
+	.flush = ecryptfs_flush,
+	.release = ecryptfs_release,
+	.fsync = ecryptfs_fsync,
+	.fasync = ecryptfs_fasync,
+	.splice_read = generic_file_splice_read,
+};
diff --git a/kernel/fs/ecryptfs/inode.c b/kernel/fs/ecryptfs/inode.c
new file mode 100644
index 000000000..fc850b55d
--- /dev/null
+++ b/kernel/fs/ecryptfs/inode.c
@@ -0,0 +1,1138 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2007 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *              Michael C. Thompsion <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/crypto.h>
+#include <linux/fs_stack.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <asm/unaligned.h>
+#include "ecryptfs_kernel.h"
+
+static struct dentry *lock_parent(struct dentry *dentry)
+{
+	struct dentry *dir;
+
+	dir = dget_parent(dentry);
+	mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT);
+	return dir;
+}
+
+static void unlock_dir(struct dentry *dir)
+{
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	dput(dir);
+}
+
+static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
+{
+	return ecryptfs_inode_to_lower(inode) == lower_inode;
+}
+
+static int ecryptfs_inode_set(struct inode *inode, void *opaque)
+{
+	struct inode *lower_inode = opaque;
+
+	ecryptfs_set_inode_lower(inode, lower_inode);
+	fsstack_copy_attr_all(inode, lower_inode);
+	/* i_size will be overwritten for encrypted regular files */
+	fsstack_copy_inode_size(inode, lower_inode);
+	inode->i_ino = lower_inode->i_ino;
+	inode->i_version++;
+	inode->i_mapping->a_ops = &ecryptfs_aops;
+
+	if (S_ISLNK(inode->i_mode))
+		inode->i_op = &ecryptfs_symlink_iops;
+	else if (S_ISDIR(inode->i_mode))
+		inode->i_op = &ecryptfs_dir_iops;
+	else
+		inode->i_op = &ecryptfs_main_iops;
+
+	if (S_ISDIR(inode->i_mode))
+		inode->i_fop = &ecryptfs_dir_fops;
+	else if (special_file(inode->i_mode))
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+	else
+		inode->i_fop = &ecryptfs_main_fops;
+
+	return 0;
+}
+
+static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
+					  struct super_block *sb)
+{
+	struct inode *inode;
+
+	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
+		return ERR_PTR(-EXDEV);
+	if (!igrab(lower_inode))
+		return ERR_PTR(-ESTALE);
+	inode = iget5_locked(sb, (unsigned long)lower_inode,
+			     ecryptfs_inode_test, ecryptfs_inode_set,
+			     lower_inode);
+	if (!inode) {
+		iput(lower_inode);
+		return ERR_PTR(-EACCES);
+	}
+	if (!(inode->i_state & I_NEW))
+		iput(lower_inode);
+
+	return inode;
+}
+
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+				 struct super_block *sb)
+{
+	struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
+
+	if (!IS_ERR(inode) && (inode->i_state & I_NEW))
+		unlock_new_inode(inode);
+
+	return inode;
+}
+
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_interpose(struct dentry *lower_dentry,
+			      struct dentry *dentry, struct super_block *sb)
+{
+	struct inode *inode = ecryptfs_get_inode(d_inode(lower_dentry), sb);
+
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	d_instantiate(dentry, inode);
+
+	return 0;
+}
+
+static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
+			      struct inode *inode)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
+	struct dentry *lower_dir_dentry;
+	int rc;
+
+	dget(lower_dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+	if (rc) {
+		printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
+		goto out_unlock;
+	}
+	fsstack_copy_attr_times(dir, lower_dir_inode);
+	set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
+	inode->i_ctime = dir->i_ctime;
+	d_drop(dentry);
+out_unlock:
+	unlock_dir(lower_dir_dentry);
+	dput(lower_dentry);
+	return rc;
+}
+
+/**
+ * ecryptfs_do_create
+ * @directory_inode: inode of the new file's dentry's parent in ecryptfs
+ * @ecryptfs_dentry: New file's dentry in ecryptfs
+ * @mode: The mode of the new file
+ * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
+ *
+ * Creates the underlying file and the eCryptfs inode which will link to
+ * it. It will also update the eCryptfs directory inode to mimic the
+ * stat of the lower directory inode.
+ *
+ * Returns the new eCryptfs inode on success; an ERR_PTR on error condition
+ */
+static struct inode *
+ecryptfs_do_create(struct inode *directory_inode,
+		   struct dentry *ecryptfs_dentry, umode_t mode)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+	struct inode *inode;
+
+	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true);
+	if (rc) {
+		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
+		       "rc = [%d]\n", __func__, rc);
+		inode = ERR_PTR(rc);
+		goto out_lock;
+	}
+	inode = __ecryptfs_get_inode(d_inode(lower_dentry),
+				     directory_inode->i_sb);
+	if (IS_ERR(inode)) {
+		vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL);
+		goto out_lock;
+	}
+	fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
+	fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry));
+out_lock:
+	unlock_dir(lower_dir_dentry);
+	return inode;
+}
+
+/**
+ * ecryptfs_initialize_file
+ *
+ * Cause the file to be changed from a basic empty file to an ecryptfs
+ * file with a header and first data page.
+ *
+ * Returns zero on success
+ */
+int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
+			     struct inode *ecryptfs_inode)
+{
+	struct ecryptfs_crypt_stat *crypt_stat =
+		&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	int rc = 0;
+
+	if (S_ISDIR(ecryptfs_inode->i_mode)) {
+		ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
+		crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
+		goto out;
+	}
+	ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
+	rc = ecryptfs_new_file_context(ecryptfs_inode);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error creating new file "
+				"context; rc = [%d]\n", rc);
+		goto out;
+	}
+	rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+			"the lower file for the dentry with name "
+			"[%pd]; rc = [%d]\n", __func__,
+			ecryptfs_dentry, rc);
+		goto out;
+	}
+	rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
+	if (rc)
+		printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
+	ecryptfs_put_lower_file(ecryptfs_inode);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_create
+ * @dir: The inode of the directory in which to create the file.
+ * @dentry: The eCryptfs dentry
+ * @mode: The mode of the new file.
+ *
+ * Creates a new file.
+ *
+ * Returns zero on success; non-zero on error condition
+ */
+static int
+ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
+		umode_t mode, bool excl)
+{
+	struct inode *ecryptfs_inode;
+	int rc;
+
+	ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
+					    mode);
+	if (unlikely(IS_ERR(ecryptfs_inode))) {
+		ecryptfs_printk(KERN_WARNING, "Failed to create file in"
+				"lower filesystem\n");
+		rc = PTR_ERR(ecryptfs_inode);
+		goto out;
+	}
+	/* At this point, a file exists on "disk"; we need to make sure
+	 * that this on disk file is prepared to be an ecryptfs file */
+	rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode);
+	if (rc) {
+		ecryptfs_do_unlink(directory_inode, ecryptfs_dentry,
+				   ecryptfs_inode);
+		make_bad_inode(ecryptfs_inode);
+		unlock_new_inode(ecryptfs_inode);
+		iput(ecryptfs_inode);
+		goto out;
+	}
+	unlock_new_inode(ecryptfs_inode);
+	d_instantiate(ecryptfs_dentry, ecryptfs_inode);
+out:
+	return rc;
+}
+
+static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
+{
+	struct ecryptfs_crypt_stat *crypt_stat;
+	int rc;
+
+	rc = ecryptfs_get_lower_file(dentry, inode);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+			"the lower file for the dentry with name "
+			"[%pd]; rc = [%d]\n", __func__,
+			dentry, rc);
+		return rc;
+	}
+
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	/* TODO: lock for crypt_stat comparison */
+	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+		ecryptfs_set_default_sizes(crypt_stat);
+
+	rc = ecryptfs_read_and_validate_header_region(inode);
+	ecryptfs_put_lower_file(inode);
+	if (rc) {
+		rc = ecryptfs_read_and_validate_xattr_region(dentry, inode);
+		if (!rc)
+			crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+	}
+
+	/* Must return 0 to allow non-eCryptfs files to be looked up, too */
+	return 0;
+}
+
+/**
+ * ecryptfs_lookup_interpose - Dentry interposition for a lookup
+ */
+static int ecryptfs_lookup_interpose(struct dentry *dentry,
+				     struct dentry *lower_dentry,
+				     struct inode *dir_inode)
+{
+	struct inode *inode, *lower_inode = d_inode(lower_dentry);
+	struct ecryptfs_dentry_info *dentry_info;
+	struct vfsmount *lower_mnt;
+	int rc = 0;
+
+	dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+	if (!dentry_info) {
+		printk(KERN_ERR "%s: Out of memory whilst attempting "
+		       "to allocate ecryptfs_dentry_info struct\n",
+			__func__);
+		dput(lower_dentry);
+		return -ENOMEM;
+	}
+
+	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
+	fsstack_copy_attr_atime(dir_inode, d_inode(lower_dentry->d_parent));
+	BUG_ON(!d_count(lower_dentry));
+
+	ecryptfs_set_dentry_private(dentry, dentry_info);
+	dentry_info->lower_path.mnt = lower_mnt;
+	dentry_info->lower_path.dentry = lower_dentry;
+
+	if (d_really_is_negative(lower_dentry)) {
+		/* We want to add because we couldn't find in lower */
+		d_add(dentry, NULL);
+		return 0;
+	}
+	inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb);
+	if (IS_ERR(inode)) {
+		printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n",
+		       __func__, PTR_ERR(inode));
+		return PTR_ERR(inode);
+	}
+	if (S_ISREG(inode->i_mode)) {
+		rc = ecryptfs_i_size_read(dentry, inode);
+		if (rc) {
+			make_bad_inode(inode);
+			return rc;
+		}
+	}
+
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+	d_add(dentry, inode);
+
+	return rc;
+}
+
+/**
+ * ecryptfs_lookup
+ * @ecryptfs_dir_inode: The eCryptfs directory inode
+ * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
+ * @ecryptfs_nd: nameidata; may be NULL
+ *
+ * Find a file on disk. If the file does not exist, then we'll add it to the
+ * dentry cache and continue on to read it from the disk.
+ */
+static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
+				      struct dentry *ecryptfs_dentry,
+				      unsigned int flags)
+{
+	char *encrypted_and_encoded_name = NULL;
+	size_t encrypted_and_encoded_name_size;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
+	struct dentry *lower_dir_dentry, *lower_dentry;
+	int rc = 0;
+
+	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+	mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
+	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+				      lower_dir_dentry,
+				      ecryptfs_dentry->d_name.len);
+	mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
+	if (IS_ERR(lower_dentry)) {
+		rc = PTR_ERR(lower_dentry);
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+				"[%d] on lower_dentry = [%pd]\n", __func__, rc,
+				ecryptfs_dentry);
+		goto out;
+	}
+	if (d_really_is_positive(lower_dentry))
+		goto interpose;
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+				ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	if (!(mount_crypt_stat
+	    && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
+		goto interpose;
+	dput(lower_dentry);
+	rc = ecryptfs_encrypt_and_encode_filename(
+		&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
+		NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+		ecryptfs_dentry->d_name.len);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to encrypt and encode "
+		       "filename; rc = [%d]\n", __func__, rc);
+		goto out;
+	}
+	mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
+	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+				      lower_dir_dentry,
+				      encrypted_and_encoded_name_size);
+	mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
+	if (IS_ERR(lower_dentry)) {
+		rc = PTR_ERR(lower_dentry);
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+				"[%d] on lower_dentry = [%s]\n", __func__, rc,
+				encrypted_and_encoded_name);
+		goto out;
+	}
+interpose:
+	rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
+				       ecryptfs_dir_inode);
+out:
+	kfree(encrypted_and_encoded_name);
+	return ERR_PTR(rc);
+}
+
+static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
+			 struct dentry *new_dentry)
+{
+	struct dentry *lower_old_dentry;
+	struct dentry *lower_new_dentry;
+	struct dentry *lower_dir_dentry;
+	u64 file_size_save;
+	int rc;
+
+	file_size_save = i_size_read(d_inode(old_dentry));
+	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
+	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
+	dget(lower_old_dentry);
+	dget(lower_new_dentry);
+	lower_dir_dentry = lock_parent(lower_new_dentry);
+	rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
+		      lower_new_dentry, NULL);
+	if (rc || d_really_is_negative(lower_new_dentry))
+		goto out_lock;
+	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
+	if (rc)
+		goto out_lock;
+	fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+	fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+	set_nlink(d_inode(old_dentry),
+		  ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink);
+	i_size_write(d_inode(new_dentry), file_size_save);
+out_lock:
+	unlock_dir(lower_dir_dentry);
+	dput(lower_new_dentry);
+	dput(lower_old_dentry);
+	return rc;
+}
+
+static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	return ecryptfs_do_unlink(dir, dentry, d_inode(dentry));
+}
+
+static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *symname)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+	char *encoded_symname;
+	size_t encoded_symlen;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	dget(lower_dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+		dir->i_sb)->mount_crypt_stat;
+	rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
+						  &encoded_symlen,
+						  NULL,
+						  mount_crypt_stat, symname,
+						  strlen(symname));
+	if (rc)
+		goto out_lock;
+	rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry,
+			 encoded_symname);
+	kfree(encoded_symname);
+	if (rc || d_really_is_negative(lower_dentry))
+		goto out_lock;
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
+	if (rc)
+		goto out_lock;
+	fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+	fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+out_lock:
+	unlock_dir(lower_dir_dentry);
+	dput(lower_dentry);
+	if (d_really_is_negative(dentry))
+		d_drop(dentry);
+	return rc;
+}
+
+static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode);
+	if (rc || d_really_is_negative(lower_dentry))
+		goto out;
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
+	if (rc)
+		goto out;
+	fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+	fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+	set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
+out:
+	unlock_dir(lower_dir_dentry);
+	if (d_really_is_negative(dentry))
+		d_drop(dentry);
+	return rc;
+}
+
+static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+	int rc;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	dget(dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	dget(lower_dentry);
+	rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry);
+	dput(lower_dentry);
+	if (!rc && d_really_is_positive(dentry))
+		clear_nlink(d_inode(dentry));
+	fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+	set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
+	unlock_dir(lower_dir_dentry);
+	if (!rc)
+		d_drop(dentry);
+	dput(dentry);
+	return rc;
+}
+
+static int
+ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev);
+	if (rc || d_really_is_negative(lower_dentry))
+		goto out;
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
+	if (rc)
+		goto out;
+	fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+	fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+out:
+	unlock_dir(lower_dir_dentry);
+	if (d_really_is_negative(dentry))
+		d_drop(dentry);
+	return rc;
+}
+
+static int
+ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	int rc;
+	struct dentry *lower_old_dentry;
+	struct dentry *lower_new_dentry;
+	struct dentry *lower_old_dir_dentry;
+	struct dentry *lower_new_dir_dentry;
+	struct dentry *trap = NULL;
+	struct inode *target_inode;
+
+	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
+	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
+	dget(lower_old_dentry);
+	dget(lower_new_dentry);
+	lower_old_dir_dentry = dget_parent(lower_old_dentry);
+	lower_new_dir_dentry = dget_parent(lower_new_dentry);
+	target_inode = d_inode(new_dentry);
+	trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+	/* source should not be ancestor of target */
+	if (trap == lower_old_dentry) {
+		rc = -EINVAL;
+		goto out_lock;
+	}
+	/* target should not be ancestor of source */
+	if (trap == lower_new_dentry) {
+		rc = -ENOTEMPTY;
+		goto out_lock;
+	}
+	rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry,
+			d_inode(lower_new_dir_dentry), lower_new_dentry,
+			NULL, 0);
+	if (rc)
+		goto out_lock;
+	if (target_inode)
+		fsstack_copy_attr_all(target_inode,
+				      ecryptfs_inode_to_lower(target_inode));
+	fsstack_copy_attr_all(new_dir, d_inode(lower_new_dir_dentry));
+	if (new_dir != old_dir)
+		fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
+out_lock:
+	unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+	dput(lower_new_dir_dentry);
+	dput(lower_old_dir_dentry);
+	dput(lower_new_dentry);
+	dput(lower_old_dentry);
+	return rc;
+}
+
+static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	char *lower_buf;
+	char *buf;
+	mm_segment_t old_fs;
+	int rc;
+
+	lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!lower_buf)
+		return ERR_PTR(-ENOMEM);
+	old_fs = get_fs();
+	set_fs(get_ds());
+	rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry,
+						   (char __user *)lower_buf,
+						   PATH_MAX);
+	set_fs(old_fs);
+	if (rc < 0)
+		goto out;
+	rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb,
+						  lower_buf, rc);
+out:
+	kfree(lower_buf);
+	return rc ? ERR_PTR(rc) : buf;
+}
+
+static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	size_t len;
+	char *buf = ecryptfs_readlink_lower(dentry, &len);
+	if (IS_ERR(buf))
+		goto out;
+	fsstack_copy_attr_atime(d_inode(dentry),
+				d_inode(ecryptfs_dentry_to_lower(dentry)));
+	buf[len] = '\0';
+out:
+	nd_set_link(nd, buf);
+	return NULL;
+}
+
+/**
+ * upper_size_to_lower_size
+ * @crypt_stat: Crypt_stat associated with file
+ * @upper_size: Size of the upper file
+ *
+ * Calculate the required size of the lower file based on the
+ * specified size of the upper file. This calculation is based on the
+ * number of headers in the underlying file and the extent size.
+ *
+ * Returns Calculated size of the lower file.
+ */
+static loff_t
+upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
+			 loff_t upper_size)
+{
+	loff_t lower_size;
+
+	lower_size = ecryptfs_lower_header_size(crypt_stat);
+	if (upper_size != 0) {
+		loff_t num_extents;
+
+		num_extents = upper_size >> crypt_stat->extent_shift;
+		if (upper_size & ~crypt_stat->extent_mask)
+			num_extents++;
+		lower_size += (num_extents * crypt_stat->extent_size);
+	}
+	return lower_size;
+}
+
+/**
+ * truncate_upper
+ * @dentry: The ecryptfs layer dentry
+ * @ia: Address of the ecryptfs inode's attributes
+ * @lower_ia: Address of the lower inode's attributes
+ *
+ * Function to handle truncations modifying the size of the file. Note
+ * that the file sizes are interpolated. When expanding, we are simply
+ * writing strings of 0's out. When truncating, we truncate the upper
+ * inode and update the lower_ia according to the page index
+ * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return,
+ * the caller must use lower_ia in a call to notify_change() to perform
+ * the truncation of the lower inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int truncate_upper(struct dentry *dentry, struct iattr *ia,
+			  struct iattr *lower_ia)
+{
+	int rc = 0;
+	struct inode *inode = d_inode(dentry);
+	struct ecryptfs_crypt_stat *crypt_stat;
+	loff_t i_size = i_size_read(inode);
+	loff_t lower_size_before_truncate;
+	loff_t lower_size_after_truncate;
+
+	if (unlikely((ia->ia_size == i_size))) {
+		lower_ia->ia_valid &= ~ATTR_SIZE;
+		return 0;
+	}
+	rc = ecryptfs_get_lower_file(dentry, inode);
+	if (rc)
+		return rc;
+	crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat;
+	/* Switch on growing or shrinking file */
+	if (ia->ia_size > i_size) {
+		char zero[] = { 0x00 };
+
+		lower_ia->ia_valid &= ~ATTR_SIZE;
+		/* Write a single 0 at the last position of the file;
+		 * this triggers code that will fill in 0's throughout
+		 * the intermediate portion of the previous end of the
+		 * file and the new and of the file */
+		rc = ecryptfs_write(inode, zero,
+				    (ia->ia_size - 1), 1);
+	} else { /* ia->ia_size < i_size_read(inode) */
+		/* We're chopping off all the pages down to the page
+		 * in which ia->ia_size is located. Fill in the end of
+		 * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
+		 * PAGE_CACHE_SIZE with zeros. */
+		size_t num_zeros = (PAGE_CACHE_SIZE
+				    - (ia->ia_size & ~PAGE_CACHE_MASK));
+
+		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+			truncate_setsize(inode, ia->ia_size);
+			lower_ia->ia_size = ia->ia_size;
+			lower_ia->ia_valid |= ATTR_SIZE;
+			goto out;
+		}
+		if (num_zeros) {
+			char *zeros_virt;
+
+			zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
+			if (!zeros_virt) {
+				rc = -ENOMEM;
+				goto out;
+			}
+			rc = ecryptfs_write(inode, zeros_virt,
+					    ia->ia_size, num_zeros);
+			kfree(zeros_virt);
+			if (rc) {
+				printk(KERN_ERR "Error attempting to zero out "
+				       "the remainder of the end page on "
+				       "reducing truncate; rc = [%d]\n", rc);
+				goto out;
+			}
+		}
+		truncate_setsize(inode, ia->ia_size);
+		rc = ecryptfs_write_inode_size_to_metadata(inode);
+		if (rc) {
+			printk(KERN_ERR	"Problem with "
+			       "ecryptfs_write_inode_size_to_metadata; "
+			       "rc = [%d]\n", rc);
+			goto out;
+		}
+		/* We are reducing the size of the ecryptfs file, and need to
+		 * know if we need to reduce the size of the lower file. */
+		lower_size_before_truncate =
+		    upper_size_to_lower_size(crypt_stat, i_size);
+		lower_size_after_truncate =
+		    upper_size_to_lower_size(crypt_stat, ia->ia_size);
+		if (lower_size_after_truncate < lower_size_before_truncate) {
+			lower_ia->ia_size = lower_size_after_truncate;
+			lower_ia->ia_valid |= ATTR_SIZE;
+		} else
+			lower_ia->ia_valid &= ~ATTR_SIZE;
+	}
+out:
+	ecryptfs_put_lower_file(inode);
+	return rc;
+}
+
+static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset)
+{
+	struct ecryptfs_crypt_stat *crypt_stat;
+	loff_t lower_oldsize, lower_newsize;
+
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	lower_oldsize = upper_size_to_lower_size(crypt_stat,
+						 i_size_read(inode));
+	lower_newsize = upper_size_to_lower_size(crypt_stat, offset);
+	if (lower_newsize > lower_oldsize) {
+		/*
+		 * The eCryptfs inode and the new *lower* size are mixed here
+		 * because we may not have the lower i_mutex held and/or it may
+		 * not be appropriate to call inode_newsize_ok() with inodes
+		 * from other filesystems.
+		 */
+		return inode_newsize_ok(inode, lower_newsize);
+	}
+
+	return 0;
+}
+
+/**
+ * ecryptfs_truncate
+ * @dentry: The ecryptfs layer dentry
+ * @new_length: The length to expand the file to
+ *
+ * Simple function that handles the truncation of an eCryptfs inode and
+ * its corresponding lower inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+{
+	struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length };
+	struct iattr lower_ia = { .ia_valid = 0 };
+	int rc;
+
+	rc = ecryptfs_inode_newsize_ok(d_inode(dentry), new_length);
+	if (rc)
+		return rc;
+
+	rc = truncate_upper(dentry, &ia, &lower_ia);
+	if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
+		struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+		mutex_lock(&d_inode(lower_dentry)->i_mutex);
+		rc = notify_change(lower_dentry, &lower_ia, NULL);
+		mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+	}
+	return rc;
+}
+
+static int
+ecryptfs_permission(struct inode *inode, int mask)
+{
+	return inode_permission(ecryptfs_inode_to_lower(inode), mask);
+}
+
+/**
+ * ecryptfs_setattr
+ * @dentry: dentry handle to the inode to modify
+ * @ia: Structure with flags of what to change and values
+ *
+ * Updates the metadata of an inode. If the update is to the size
+ * i.e. truncation, then ecryptfs_truncate will handle the size modification
+ * of both the ecryptfs inode and the lower inode.
+ *
+ * All other metadata changes will be passed right to the lower filesystem,
+ * and we will just update our inode to look like the lower.
+ */
+static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+	struct iattr lower_ia;
+	struct inode *inode;
+	struct inode *lower_inode;
+	struct ecryptfs_crypt_stat *crypt_stat;
+
+	crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat;
+	if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
+		ecryptfs_init_crypt_stat(crypt_stat);
+	inode = d_inode(dentry);
+	lower_inode = ecryptfs_inode_to_lower(inode);
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	mutex_lock(&crypt_stat->cs_mutex);
+	if (d_is_dir(dentry))
+		crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
+	else if (d_is_reg(dentry)
+		 && (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)
+		     || !(crypt_stat->flags & ECRYPTFS_KEY_VALID))) {
+		struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+
+		mount_crypt_stat = &ecryptfs_superblock_to_private(
+			dentry->d_sb)->mount_crypt_stat;
+		rc = ecryptfs_get_lower_file(dentry, inode);
+		if (rc) {
+			mutex_unlock(&crypt_stat->cs_mutex);
+			goto out;
+		}
+		rc = ecryptfs_read_metadata(dentry);
+		ecryptfs_put_lower_file(inode);
+		if (rc) {
+			if (!(mount_crypt_stat->flags
+			      & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
+				rc = -EIO;
+				printk(KERN_WARNING "Either the lower file "
+				       "is not in a valid eCryptfs format, "
+				       "or the key could not be retrieved. "
+				       "Plaintext passthrough mode is not "
+				       "enabled; returning -EIO\n");
+				mutex_unlock(&crypt_stat->cs_mutex);
+				goto out;
+			}
+			rc = 0;
+			crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED
+					       | ECRYPTFS_ENCRYPTED);
+		}
+	}
+	mutex_unlock(&crypt_stat->cs_mutex);
+
+	rc = inode_change_ok(inode, ia);
+	if (rc)
+		goto out;
+	if (ia->ia_valid & ATTR_SIZE) {
+		rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size);
+		if (rc)
+			goto out;
+	}
+
+	memcpy(&lower_ia, ia, sizeof(lower_ia));
+	if (ia->ia_valid & ATTR_FILE)
+		lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
+	if (ia->ia_valid & ATTR_SIZE) {
+		rc = truncate_upper(dentry, ia, &lower_ia);
+		if (rc < 0)
+			goto out;
+	}
+
+	/*
+	 * mode change is for clearing setuid/setgid bits. Allow lower fs
+	 * to interpret this in its own way.
+	 */
+	if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+		lower_ia.ia_valid &= ~ATTR_MODE;
+
+	mutex_lock(&d_inode(lower_dentry)->i_mutex);
+	rc = notify_change(lower_dentry, &lower_ia, NULL);
+	mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+out:
+	fsstack_copy_attr_all(inode, lower_inode);
+	return rc;
+}
+
+static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat)
+{
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	int rc = 0;
+
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+						dentry->d_sb)->mount_crypt_stat;
+	generic_fillattr(d_inode(dentry), stat);
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+		char *target;
+		size_t targetsiz;
+
+		target = ecryptfs_readlink_lower(dentry, &targetsiz);
+		if (!IS_ERR(target)) {
+			kfree(target);
+			stat->size = targetsiz;
+		} else {
+			rc = PTR_ERR(target);
+		}
+	}
+	return rc;
+}
+
+static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			    struct kstat *stat)
+{
+	struct kstat lower_stat;
+	int rc;
+
+	rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat);
+	if (!rc) {
+		fsstack_copy_attr_all(d_inode(dentry),
+				      ecryptfs_inode_to_lower(d_inode(dentry)));
+		generic_fillattr(d_inode(dentry), stat);
+		stat->blocks = lower_stat.blocks;
+	}
+	return rc;
+}
+
+int
+ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		  size_t size, int flags)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!d_inode(lower_dentry)->i_op->setxattr) {
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+
+	rc = vfs_setxattr(lower_dentry, name, value, size, flags);
+	if (!rc && d_really_is_positive(dentry))
+		fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry));
+out:
+	return rc;
+}
+
+ssize_t
+ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
+			void *value, size_t size)
+{
+	int rc = 0;
+
+	if (!d_inode(lower_dentry)->i_op->getxattr) {
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+	mutex_lock(&d_inode(lower_dentry)->i_mutex);
+	rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value,
+						   size);
+	mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+out:
+	return rc;
+}
+
+static ssize_t
+ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value,
+		  size_t size)
+{
+	return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), name,
+				       value, size);
+}
+
+static ssize_t
+ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!d_inode(lower_dentry)->i_op->listxattr) {
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+	mutex_lock(&d_inode(lower_dentry)->i_mutex);
+	rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size);
+	mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+out:
+	return rc;
+}
+
+static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!d_inode(lower_dentry)->i_op->removexattr) {
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+	mutex_lock(&d_inode(lower_dentry)->i_mutex);
+	rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name);
+	mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+out:
+	return rc;
+}
+
+const struct inode_operations ecryptfs_symlink_iops = {
+	.readlink = generic_readlink,
+	.follow_link = ecryptfs_follow_link,
+	.put_link = kfree_put_link,
+	.permission = ecryptfs_permission,
+	.setattr = ecryptfs_setattr,
+	.getattr = ecryptfs_getattr_link,
+	.setxattr = ecryptfs_setxattr,
+	.getxattr = ecryptfs_getxattr,
+	.listxattr = ecryptfs_listxattr,
+	.removexattr = ecryptfs_removexattr
+};
+
+const struct inode_operations ecryptfs_dir_iops = {
+	.create = ecryptfs_create,
+	.lookup = ecryptfs_lookup,
+	.link = ecryptfs_link,
+	.unlink = ecryptfs_unlink,
+	.symlink = ecryptfs_symlink,
+	.mkdir = ecryptfs_mkdir,
+	.rmdir = ecryptfs_rmdir,
+	.mknod = ecryptfs_mknod,
+	.rename = ecryptfs_rename,
+	.permission = ecryptfs_permission,
+	.setattr = ecryptfs_setattr,
+	.setxattr = ecryptfs_setxattr,
+	.getxattr = ecryptfs_getxattr,
+	.listxattr = ecryptfs_listxattr,
+	.removexattr = ecryptfs_removexattr
+};
+
+const struct inode_operations ecryptfs_main_iops = {
+	.permission = ecryptfs_permission,
+	.setattr = ecryptfs_setattr,
+	.getattr = ecryptfs_getattr,
+	.setxattr = ecryptfs_setxattr,
+	.getxattr = ecryptfs_getxattr,
+	.listxattr = ecryptfs_listxattr,
+	.removexattr = ecryptfs_removexattr
+};
diff --git a/kernel/fs/ecryptfs/keystore.c b/kernel/fs/ecryptfs/keystore.c
new file mode 100644
index 000000000..6bd67e201
--- /dev/null
+++ b/kernel/fs/ecryptfs/keystore.c
@@ -0,0 +1,2529 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * In-kernel key management code.  Includes functions to parse and
+ * write authentication token-related packets with the underlying
+ * file.
+ *
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
+ *              Michael C. Thompson <mcthomps@us.ibm.com>
+ *              Trevor S. Highland <trevor.highland@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * request_key returned an error instead of a valid key address;
+ * determine the type of error, make appropriate log entries, and
+ * return an error code.
+ */
+static int process_request_key_err(long err_code)
+{
+	int rc = 0;
+
+	switch (err_code) {
+	case -ENOKEY:
+		ecryptfs_printk(KERN_WARNING, "No key\n");
+		rc = -ENOENT;
+		break;
+	case -EKEYEXPIRED:
+		ecryptfs_printk(KERN_WARNING, "Key expired\n");
+		rc = -ETIME;
+		break;
+	case -EKEYREVOKED:
+		ecryptfs_printk(KERN_WARNING, "Key revoked\n");
+		rc = -EINVAL;
+		break;
+	default:
+		ecryptfs_printk(KERN_WARNING, "Unknown error code: "
+				"[0x%.16lx]\n", err_code);
+		rc = -EINVAL;
+	}
+	return rc;
+}
+
+static int process_find_global_auth_tok_for_sig_err(int err_code)
+{
+	int rc = err_code;
+
+	switch (err_code) {
+	case -ENOENT:
+		ecryptfs_printk(KERN_WARNING, "Missing auth tok\n");
+		break;
+	case -EINVAL:
+		ecryptfs_printk(KERN_WARNING, "Invalid auth tok\n");
+		break;
+	default:
+		rc = process_request_key_err(err_code);
+		break;
+	}
+	return rc;
+}
+
+/**
+ * ecryptfs_parse_packet_length
+ * @data: Pointer to memory containing length at offset
+ * @size: This function writes the decoded size to this memory
+ *        address; zero on error
+ * @length_size: The number of bytes occupied by the encoded length
+ *
+ * Returns zero on success; non-zero on error
+ */
+int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
+				 size_t *length_size)
+{
+	int rc = 0;
+
+	(*length_size) = 0;
+	(*size) = 0;
+	if (data[0] < 192) {
+		/* One-byte length */
+		(*size) = data[0];
+		(*length_size) = 1;
+	} else if (data[0] < 224) {
+		/* Two-byte length */
+		(*size) = (data[0] - 192) * 256;
+		(*size) += data[1] + 192;
+		(*length_size) = 2;
+	} else if (data[0] == 255) {
+		/* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
+		ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
+				"supported\n");
+		rc = -EINVAL;
+		goto out;
+	} else {
+		ecryptfs_printk(KERN_ERR, "Error parsing packet length\n");
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_write_packet_length
+ * @dest: The byte array target into which to write the length. Must
+ *        have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated.
+ * @size: The length to write.
+ * @packet_size_length: The number of bytes used to encode the packet
+ *                      length is written to this address.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+int ecryptfs_write_packet_length(char *dest, size_t size,
+				 size_t *packet_size_length)
+{
+	int rc = 0;
+
+	if (size < 192) {
+		dest[0] = size;
+		(*packet_size_length) = 1;
+	} else if (size < 65536) {
+		dest[0] = (((size - 192) / 256) + 192);
+		dest[1] = ((size - 192) % 256);
+		(*packet_size_length) = 2;
+	} else {
+		/* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_WARNING,
+				"Unsupported packet size: [%zd]\n", size);
+	}
+	return rc;
+}
+
+static int
+write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key,
+		    char **packet, size_t *packet_len)
+{
+	size_t i = 0;
+	size_t data_len;
+	size_t packet_size_len;
+	char *message;
+	int rc;
+
+	/*
+	 *              ***** TAG 64 Packet Format *****
+	 *    | Content Type                       | 1 byte       |
+	 *    | Key Identifier Size                | 1 or 2 bytes |
+	 *    | Key Identifier                     | arbitrary    |
+	 *    | Encrypted File Encryption Key Size | 1 or 2 bytes |
+	 *    | Encrypted File Encryption Key      | arbitrary    |
+	 */
+	data_len = (5 + ECRYPTFS_SIG_SIZE_HEX
+		    + session_key->encrypted_key_size);
+	*packet = kmalloc(data_len, GFP_KERNEL);
+	message = *packet;
+	if (!message) {
+		ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	message[i++] = ECRYPTFS_TAG_64_PACKET_TYPE;
+	rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX,
+					  &packet_size_len);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet "
+				"header; cannot generate packet length\n");
+		goto out;
+	}
+	i += packet_size_len;
+	memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX);
+	i += ECRYPTFS_SIG_SIZE_HEX;
+	rc = ecryptfs_write_packet_length(&message[i],
+					  session_key->encrypted_key_size,
+					  &packet_size_len);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet "
+				"header; cannot generate packet length\n");
+		goto out;
+	}
+	i += packet_size_len;
+	memcpy(&message[i], session_key->encrypted_key,
+	       session_key->encrypted_key_size);
+	i += session_key->encrypted_key_size;
+	*packet_len = i;
+out:
+	return rc;
+}
+
+static int
+parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code,
+		    struct ecryptfs_message *msg)
+{
+	size_t i = 0;
+	char *data;
+	size_t data_len;
+	size_t m_size;
+	size_t message_len;
+	u16 checksum = 0;
+	u16 expected_checksum = 0;
+	int rc;
+
+	/*
+	 *              ***** TAG 65 Packet Format *****
+	 *         | Content Type             | 1 byte       |
+	 *         | Status Indicator         | 1 byte       |
+	 *         | File Encryption Key Size | 1 or 2 bytes |
+	 *         | File Encryption Key      | arbitrary    |
+	 */
+	message_len = msg->data_len;
+	data = msg->data;
+	if (message_len < 4) {
+		rc = -EIO;
+		goto out;
+	}
+	if (data[i++] != ECRYPTFS_TAG_65_PACKET_TYPE) {
+		ecryptfs_printk(KERN_ERR, "Type should be ECRYPTFS_TAG_65\n");
+		rc = -EIO;
+		goto out;
+	}
+	if (data[i++]) {
+		ecryptfs_printk(KERN_ERR, "Status indicator has non-zero value "
+				"[%d]\n", data[i-1]);
+		rc = -EIO;
+		goto out;
+	}
+	rc = ecryptfs_parse_packet_length(&data[i], &m_size, &data_len);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error parsing packet length; "
+				"rc = [%d]\n", rc);
+		goto out;
+	}
+	i += data_len;
+	if (message_len < (i + m_size)) {
+		ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd "
+				"is shorter than expected\n");
+		rc = -EIO;
+		goto out;
+	}
+	if (m_size < 3) {
+		ecryptfs_printk(KERN_ERR,
+				"The decrypted key is not long enough to "
+				"include a cipher code and checksum\n");
+		rc = -EIO;
+		goto out;
+	}
+	*cipher_code = data[i++];
+	/* The decrypted key includes 1 byte cipher code and 2 byte checksum */
+	session_key->decrypted_key_size = m_size - 3;
+	if (session_key->decrypted_key_size > ECRYPTFS_MAX_KEY_BYTES) {
+		ecryptfs_printk(KERN_ERR, "key_size [%d] larger than "
+				"the maximum key size [%d]\n",
+				session_key->decrypted_key_size,
+				ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
+		rc = -EIO;
+		goto out;
+	}
+	memcpy(session_key->decrypted_key, &data[i],
+	       session_key->decrypted_key_size);
+	i += session_key->decrypted_key_size;
+	expected_checksum += (unsigned char)(data[i++]) << 8;
+	expected_checksum += (unsigned char)(data[i++]);
+	for (i = 0; i < session_key->decrypted_key_size; i++)
+		checksum += session_key->decrypted_key[i];
+	if (expected_checksum != checksum) {
+		ecryptfs_printk(KERN_ERR, "Invalid checksum for file "
+				"encryption  key; expected [%x]; calculated "
+				"[%x]\n", expected_checksum, checksum);
+		rc = -EIO;
+	}
+out:
+	return rc;
+}
+
+
+static int
+write_tag_66_packet(char *signature, u8 cipher_code,
+		    struct ecryptfs_crypt_stat *crypt_stat, char **packet,
+		    size_t *packet_len)
+{
+	size_t i = 0;
+	size_t j;
+	size_t data_len;
+	size_t checksum = 0;
+	size_t packet_size_len;
+	char *message;
+	int rc;
+
+	/*
+	 *              ***** TAG 66 Packet Format *****
+	 *         | Content Type             | 1 byte       |
+	 *         | Key Identifier Size      | 1 or 2 bytes |
+	 *         | Key Identifier           | arbitrary    |
+	 *         | File Encryption Key Size | 1 or 2 bytes |
+	 *         | File Encryption Key      | arbitrary    |
+	 */
+	data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + crypt_stat->key_size);
+	*packet = kmalloc(data_len, GFP_KERNEL);
+	message = *packet;
+	if (!message) {
+		ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	message[i++] = ECRYPTFS_TAG_66_PACKET_TYPE;
+	rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX,
+					  &packet_size_len);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet "
+				"header; cannot generate packet length\n");
+		goto out;
+	}
+	i += packet_size_len;
+	memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX);
+	i += ECRYPTFS_SIG_SIZE_HEX;
+	/* The encrypted key includes 1 byte cipher code and 2 byte checksum */
+	rc = ecryptfs_write_packet_length(&message[i], crypt_stat->key_size + 3,
+					  &packet_size_len);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet "
+				"header; cannot generate packet length\n");
+		goto out;
+	}
+	i += packet_size_len;
+	message[i++] = cipher_code;
+	memcpy(&message[i], crypt_stat->key, crypt_stat->key_size);
+	i += crypt_stat->key_size;
+	for (j = 0; j < crypt_stat->key_size; j++)
+		checksum += crypt_stat->key[j];
+	message[i++] = (checksum / 256) % 256;
+	message[i++] = (checksum % 256);
+	*packet_len = i;
+out:
+	return rc;
+}
+
+static int
+parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
+		    struct ecryptfs_message *msg)
+{
+	size_t i = 0;
+	char *data;
+	size_t data_len;
+	size_t message_len;
+	int rc;
+
+	/*
+	 *              ***** TAG 65 Packet Format *****
+	 *    | Content Type                       | 1 byte       |
+	 *    | Status Indicator                   | 1 byte       |
+	 *    | Encrypted File Encryption Key Size | 1 or 2 bytes |
+	 *    | Encrypted File Encryption Key      | arbitrary    |
+	 */
+	message_len = msg->data_len;
+	data = msg->data;
+	/* verify that everything through the encrypted FEK size is present */
+	if (message_len < 4) {
+		rc = -EIO;
+		printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
+		       "message length is [%d]\n", __func__, message_len, 4);
+		goto out;
+	}
+	if (data[i++] != ECRYPTFS_TAG_67_PACKET_TYPE) {
+		rc = -EIO;
+		printk(KERN_ERR "%s: Type should be ECRYPTFS_TAG_67\n",
+		       __func__);
+		goto out;
+	}
+	if (data[i++]) {
+		rc = -EIO;
+		printk(KERN_ERR "%s: Status indicator has non zero "
+		       "value [%d]\n", __func__, data[i-1]);
+
+		goto out;
+	}
+	rc = ecryptfs_parse_packet_length(&data[i], &key_rec->enc_key_size,
+					  &data_len);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error parsing packet length; "
+				"rc = [%d]\n", rc);
+		goto out;
+	}
+	i += data_len;
+	if (message_len < (i + key_rec->enc_key_size)) {
+		rc = -EIO;
+		printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
+		       __func__, message_len, (i + key_rec->enc_key_size));
+		goto out;
+	}
+	if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+		rc = -EIO;
+		printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
+		       "the maximum key size [%d]\n", __func__,
+		       key_rec->enc_key_size,
+		       ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
+		goto out;
+	}
+	memcpy(key_rec->enc_key, &data[i], key_rec->enc_key_size);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_verify_version
+ * @version: The version number to confirm
+ *
+ * Returns zero on good version; non-zero otherwise
+ */
+static int ecryptfs_verify_version(u16 version)
+{
+	int rc = 0;
+	unsigned char major;
+	unsigned char minor;
+
+	major = ((version >> 8) & 0xFF);
+	minor = (version & 0xFF);
+	if (major != ECRYPTFS_VERSION_MAJOR) {
+		ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
+				"Expected [%d]; got [%d]\n",
+				ECRYPTFS_VERSION_MAJOR, major);
+		rc = -EINVAL;
+		goto out;
+	}
+	if (minor != ECRYPTFS_VERSION_MINOR) {
+		ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
+				"Expected [%d]; got [%d]\n",
+				ECRYPTFS_VERSION_MINOR, minor);
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_verify_auth_tok_from_key
+ * @auth_tok_key: key containing the authentication token
+ * @auth_tok: authentication token
+ *
+ * Returns zero on valid auth tok; -EINVAL otherwise
+ */
+static int
+ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key,
+				  struct ecryptfs_auth_tok **auth_tok)
+{
+	int rc = 0;
+
+	(*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key);
+	if (ecryptfs_verify_version((*auth_tok)->version)) {
+		printk(KERN_ERR "Data structure version mismatch. Userspace "
+		       "tools must match eCryptfs kernel module with major "
+		       "version [%d] and minor version [%d]\n",
+		       ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR);
+		rc = -EINVAL;
+		goto out;
+	}
+	if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
+	    && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
+		printk(KERN_ERR "Invalid auth_tok structure "
+		       "returned from key query\n");
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+static int
+ecryptfs_find_global_auth_tok_for_sig(
+	struct key **auth_tok_key,
+	struct ecryptfs_auth_tok **auth_tok,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
+{
+	struct ecryptfs_global_auth_tok *walker;
+	int rc = 0;
+
+	(*auth_tok_key) = NULL;
+	(*auth_tok) = NULL;
+	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	list_for_each_entry(walker,
+			    &mount_crypt_stat->global_auth_tok_list,
+			    mount_crypt_stat_list) {
+		if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX))
+			continue;
+
+		if (walker->flags & ECRYPTFS_AUTH_TOK_INVALID) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		rc = key_validate(walker->global_auth_tok_key);
+		if (rc) {
+			if (rc == -EKEYEXPIRED)
+				goto out;
+			goto out_invalid_auth_tok;
+		}
+
+		down_write(&(walker->global_auth_tok_key->sem));
+		rc = ecryptfs_verify_auth_tok_from_key(
+				walker->global_auth_tok_key, auth_tok);
+		if (rc)
+			goto out_invalid_auth_tok_unlock;
+
+		(*auth_tok_key) = walker->global_auth_tok_key;
+		key_get(*auth_tok_key);
+		goto out;
+	}
+	rc = -ENOENT;
+	goto out;
+out_invalid_auth_tok_unlock:
+	up_write(&(walker->global_auth_tok_key->sem));
+out_invalid_auth_tok:
+	printk(KERN_WARNING "Invalidating auth tok with sig = [%s]\n", sig);
+	walker->flags |= ECRYPTFS_AUTH_TOK_INVALID;
+	key_put(walker->global_auth_tok_key);
+	walker->global_auth_tok_key = NULL;
+out:
+	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	return rc;
+}
+
+/**
+ * ecryptfs_find_auth_tok_for_sig
+ * @auth_tok: Set to the matching auth_tok; NULL if not found
+ * @crypt_stat: inode crypt_stat crypto context
+ * @sig: Sig of auth_tok to find
+ *
+ * For now, this function simply looks at the registered auth_tok's
+ * linked off the mount_crypt_stat, so all the auth_toks that can be
+ * used must be registered at mount time. This function could
+ * potentially try a lot harder to find auth_tok's (e.g., by calling
+ * out to ecryptfsd to dynamically retrieve an auth_tok object) so
+ * that static registration of auth_tok's will no longer be necessary.
+ *
+ * Returns zero on no error; non-zero on error
+ */
+static int
+ecryptfs_find_auth_tok_for_sig(
+	struct key **auth_tok_key,
+	struct ecryptfs_auth_tok **auth_tok,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	char *sig)
+{
+	int rc = 0;
+
+	rc = ecryptfs_find_global_auth_tok_for_sig(auth_tok_key, auth_tok,
+						   mount_crypt_stat, sig);
+	if (rc == -ENOENT) {
+		/* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
+		 * mount_crypt_stat structure, we prevent to use auth toks that
+		 * are not inserted through the ecryptfs_add_global_auth_tok
+		 * function.
+		 */
+		if (mount_crypt_stat->flags
+				& ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+			return -EINVAL;
+
+		rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
+						       sig);
+	}
+	return rc;
+}
+
+/**
+ * write_tag_70_packet can gobble a lot of stack space. We stuff most
+ * of the function's parameters in a kmalloc'd struct to help reduce
+ * eCryptfs' overall stack usage.
+ */
+struct ecryptfs_write_tag_70_packet_silly_stack {
+	u8 cipher_code;
+	size_t max_packet_size;
+	size_t packet_size_len;
+	size_t block_aligned_filename_size;
+	size_t block_size;
+	size_t i;
+	size_t j;
+	size_t num_rand_bytes;
+	struct mutex *tfm_mutex;
+	char *block_aligned_filename;
+	struct ecryptfs_auth_tok *auth_tok;
+	struct scatterlist src_sg[2];
+	struct scatterlist dst_sg[2];
+	struct blkcipher_desc desc;
+	char iv[ECRYPTFS_MAX_IV_BYTES];
+	char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
+	char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
+	struct hash_desc hash_desc;
+	struct scatterlist hash_sg;
+};
+
+/**
+ * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
+ * @filename: NULL-terminated filename string
+ *
+ * This is the simplest mechanism for achieving filename encryption in
+ * eCryptfs. It encrypts the given filename with the mount-wide
+ * filename encryption key (FNEK) and stores it in a packet to @dest,
+ * which the callee will encode and write directly into the dentry
+ * name.
+ */
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *filename, size_t filename_size)
+{
+	struct ecryptfs_write_tag_70_packet_silly_stack *s;
+	struct key *auth_tok_key = NULL;
+	int rc = 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		rc = -ENOMEM;
+		goto out;
+	}
+	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	(*packet_size) = 0;
+	rc = ecryptfs_find_auth_tok_for_sig(
+		&auth_tok_key,
+		&s->auth_tok, mount_crypt_stat,
+		mount_crypt_stat->global_default_fnek_sig);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to find auth tok for "
+		       "fnek sig [%s]; rc = [%d]\n", __func__,
+		       mount_crypt_stat->global_default_fnek_sig, rc);
+		goto out;
+	}
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
+		&s->desc.tfm,
+		&s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Internal error whilst attempting to get "
+		       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+		       mount_crypt_stat->global_default_fn_cipher_name, rc);
+		goto out;
+	}
+	mutex_lock(s->tfm_mutex);
+	s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
+	/* Plus one for the \0 separator between the random prefix
+	 * and the plaintext filename */
+	s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
+	s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
+	if ((s->block_aligned_filename_size % s->block_size) != 0) {
+		s->num_rand_bytes += (s->block_size
+				      - (s->block_aligned_filename_size
+					 % s->block_size));
+		s->block_aligned_filename_size = (s->num_rand_bytes
+						  + filename_size);
+	}
+	/* Octet 0: Tag 70 identifier
+	 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+	 *              and block-aligned encrypted filename size)
+	 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+	 * Octet N2-N3: Cipher identifier (1 octet)
+	 * Octets N3-N4: Block-aligned encrypted filename
+	 *  - Consists of a minimum number of random characters, a \0
+	 *    separator, and then the filename */
+	s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE
+			      + s->block_aligned_filename_size);
+	if (dest == NULL) {
+		(*packet_size) = s->max_packet_size;
+		goto out_unlock;
+	}
+	if (s->max_packet_size > (*remaining_bytes)) {
+		printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
+		       "[%zd] available\n", __func__, s->max_packet_size,
+		       (*remaining_bytes));
+		rc = -EINVAL;
+		goto out_unlock;
+	}
+	s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
+					    GFP_KERNEL);
+	if (!s->block_aligned_filename) {
+		printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+		       "kzalloc [%zd] bytes\n", __func__,
+		       s->block_aligned_filename_size);
+		rc = -ENOMEM;
+		goto out_unlock;
+	}
+	s->i = 0;
+	dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
+	rc = ecryptfs_write_packet_length(&dest[s->i],
+					  (ECRYPTFS_SIG_SIZE
+					   + 1 /* Cipher code */
+					   + s->block_aligned_filename_size),
+					  &s->packet_size_len);
+	if (rc) {
+		printk(KERN_ERR "%s: Error generating tag 70 packet "
+		       "header; cannot generate packet length; rc = [%d]\n",
+		       __func__, rc);
+		goto out_free_unlock;
+	}
+	s->i += s->packet_size_len;
+	ecryptfs_from_hex(&dest[s->i],
+			  mount_crypt_stat->global_default_fnek_sig,
+			  ECRYPTFS_SIG_SIZE);
+	s->i += ECRYPTFS_SIG_SIZE;
+	s->cipher_code = ecryptfs_code_for_cipher_string(
+		mount_crypt_stat->global_default_fn_cipher_name,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (s->cipher_code == 0) {
+		printk(KERN_WARNING "%s: Unable to generate code for "
+		       "cipher [%s] with key bytes [%zd]\n", __func__,
+		       mount_crypt_stat->global_default_fn_cipher_name,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	dest[s->i++] = s->cipher_code;
+	/* TODO: Support other key modules than passphrase for
+	 * filename encryption */
+	if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
+		rc = -EOPNOTSUPP;
+		printk(KERN_INFO "%s: Filename encryption only supports "
+		       "password tokens\n", __func__);
+		goto out_free_unlock;
+	}
+	sg_init_one(
+		&s->hash_sg,
+		(u8 *)s->auth_tok->token.password.session_key_encryption_key,
+		s->auth_tok->token.password.session_key_encryption_key_bytes);
+	s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
+					     CRYPTO_ALG_ASYNC);
+	if (IS_ERR(s->hash_desc.tfm)) {
+			rc = PTR_ERR(s->hash_desc.tfm);
+			printk(KERN_ERR "%s: Error attempting to "
+			       "allocate hash crypto context; rc = [%d]\n",
+			       __func__, rc);
+			goto out_free_unlock;
+	}
+	rc = crypto_hash_init(&s->hash_desc);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error initializing crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_hash_update(
+		&s->hash_desc, &s->hash_sg,
+		s->auth_tok->token.password.session_key_encryption_key_bytes);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error updating crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_hash_final(&s->hash_desc, s->hash);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error finalizing crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
+		s->block_aligned_filename[s->j] =
+			s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
+		if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
+		    == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
+			sg_init_one(&s->hash_sg, (u8 *)s->hash,
+				    ECRYPTFS_TAG_70_DIGEST_SIZE);
+			rc = crypto_hash_init(&s->hash_desc);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error initializing crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
+						ECRYPTFS_TAG_70_DIGEST_SIZE);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error updating crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error finalizing crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			memcpy(s->hash, s->tmp_hash,
+			       ECRYPTFS_TAG_70_DIGEST_SIZE);
+		}
+		if (s->block_aligned_filename[s->j] == '\0')
+			s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
+	}
+	memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
+	       filename_size);
+	rc = virt_to_scatterlist(s->block_aligned_filename,
+				 s->block_aligned_filename_size, s->src_sg, 2);
+	if (rc < 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert filename memory to scatterlist; rc = [%d]. "
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_release_free_unlock;
+	}
+	rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
+				 s->dst_sg, 2);
+	if (rc < 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert encrypted filename memory to scatterlist; "
+		       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
+		       __func__, rc, s->block_aligned_filename_size);
+		goto out_release_free_unlock;
+	}
+	/* The characters in the first block effectively do the job
+	 * of the IV here, so we just use 0's for the IV. Note the
+	 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+	 * >= ECRYPTFS_MAX_IV_BYTES. */
+	memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+	s->desc.info = s->iv;
+	rc = crypto_blkcipher_setkey(
+		s->desc.tfm,
+		s->auth_tok->token.password.session_key_encryption_key,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (rc < 0) {
+		printk(KERN_ERR "%s: Error setting key for crypto context; "
+		       "rc = [%d]. s->auth_tok->token.password.session_key_"
+		       "encryption_key = [0x%p]; mount_crypt_stat->"
+		       "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
+		       rc,
+		       s->auth_tok->token.password.session_key_encryption_key,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg,
+					 s->block_aligned_filename_size);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to encrypt filename; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_release_free_unlock;
+	}
+	s->i += s->block_aligned_filename_size;
+	(*packet_size) = s->i;
+	(*remaining_bytes) -= (*packet_size);
+out_release_free_unlock:
+	crypto_free_hash(s->hash_desc.tfm);
+out_free_unlock:
+	kzfree(s->block_aligned_filename);
+out_unlock:
+	mutex_unlock(s->tfm_mutex);
+out:
+	if (auth_tok_key) {
+		up_write(&(auth_tok_key->sem));
+		key_put(auth_tok_key);
+	}
+	kfree(s);
+	return rc;
+}
+
+struct ecryptfs_parse_tag_70_packet_silly_stack {
+	u8 cipher_code;
+	size_t max_packet_size;
+	size_t packet_size_len;
+	size_t parsed_tag_70_packet_size;
+	size_t block_aligned_filename_size;
+	size_t block_size;
+	size_t i;
+	struct mutex *tfm_mutex;
+	char *decrypted_filename;
+	struct ecryptfs_auth_tok *auth_tok;
+	struct scatterlist src_sg[2];
+	struct scatterlist dst_sg[2];
+	struct blkcipher_desc desc;
+	char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
+	char iv[ECRYPTFS_MAX_IV_BYTES];
+	char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+};
+
+/**
+ * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
+ * @filename: This function kmalloc's the memory for the filename
+ * @filename_size: This function sets this to the amount of memory
+ *                 kmalloc'd for the filename
+ * @packet_size: This function sets this to the the number of octets
+ *               in the packet parsed
+ * @mount_crypt_stat: The mount-wide cryptographic context
+ * @data: The memory location containing the start of the tag 70
+ *        packet
+ * @max_packet_size: The maximum legal size of the packet to be parsed
+ *                   from @data
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *data, size_t max_packet_size)
+{
+	struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+	struct key *auth_tok_key = NULL;
+	int rc = 0;
+
+	(*packet_size) = 0;
+	(*filename_size) = 0;
+	(*filename) = NULL;
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		rc = -ENOMEM;
+		goto out;
+	}
+	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
+		printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
+		       "at least [%d]\n", __func__, max_packet_size,
+		       ECRYPTFS_TAG_70_MIN_METADATA_SIZE);
+		rc = -EINVAL;
+		goto out;
+	}
+	/* Octet 0: Tag 70 identifier
+	 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+	 *              and block-aligned encrypted filename size)
+	 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+	 * Octet N2-N3: Cipher identifier (1 octet)
+	 * Octets N3-N4: Block-aligned encrypted filename
+	 *  - Consists of a minimum number of random numbers, a \0
+	 *    separator, and then the filename */
+	if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
+		printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
+		       "tag [0x%.2x]\n", __func__,
+		       data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
+					  &s->parsed_tag_70_packet_size,
+					  &s->packet_size_len);
+	if (rc) {
+		printk(KERN_WARNING "%s: Error parsing packet length; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out;
+	}
+	s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
+					  - ECRYPTFS_SIG_SIZE - 1);
+	if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
+	    > max_packet_size) {
+		printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
+		       "size is [%zd]\n", __func__, max_packet_size,
+		       (1 + s->packet_size_len + 1
+			+ s->block_aligned_filename_size));
+		rc = -EINVAL;
+		goto out;
+	}
+	(*packet_size) += s->packet_size_len;
+	ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
+			ECRYPTFS_SIG_SIZE);
+	s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+	(*packet_size) += ECRYPTFS_SIG_SIZE;
+	s->cipher_code = data[(*packet_size)++];
+	rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
+	if (rc) {
+		printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
+		       __func__, s->cipher_code);
+		goto out;
+	}
+	rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+					    &s->auth_tok, mount_crypt_stat,
+					    s->fnek_sig_hex);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to find auth tok for "
+		       "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
+		       rc);
+		goto out;
+	}
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
+							&s->tfm_mutex,
+							s->cipher_string);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Internal error whilst attempting to get "
+		       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+		       s->cipher_string, rc);
+		goto out;
+	}
+	mutex_lock(s->tfm_mutex);
+	rc = virt_to_scatterlist(&data[(*packet_size)],
+				 s->block_aligned_filename_size, s->src_sg, 2);
+	if (rc < 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert encrypted filename memory to scatterlist; "
+		       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
+		       __func__, rc, s->block_aligned_filename_size);
+		goto out_unlock;
+	}
+	(*packet_size) += s->block_aligned_filename_size;
+	s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
+					GFP_KERNEL);
+	if (!s->decrypted_filename) {
+		printk(KERN_ERR "%s: Out of memory whilst attempting to "
+		       "kmalloc [%zd] bytes\n", __func__,
+		       s->block_aligned_filename_size);
+		rc = -ENOMEM;
+		goto out_unlock;
+	}
+	rc = virt_to_scatterlist(s->decrypted_filename,
+				 s->block_aligned_filename_size, s->dst_sg, 2);
+	if (rc < 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert decrypted filename memory to scatterlist; "
+		       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
+		       __func__, rc, s->block_aligned_filename_size);
+		goto out_free_unlock;
+	}
+	/* The characters in the first block effectively do the job of
+	 * the IV here, so we just use 0's for the IV. Note the
+	 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+	 * >= ECRYPTFS_MAX_IV_BYTES. */
+	memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+	s->desc.info = s->iv;
+	/* TODO: Support other key modules than passphrase for
+	 * filename encryption */
+	if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
+		rc = -EOPNOTSUPP;
+		printk(KERN_INFO "%s: Filename encryption only supports "
+		       "password tokens\n", __func__);
+		goto out_free_unlock;
+	}
+	rc = crypto_blkcipher_setkey(
+		s->desc.tfm,
+		s->auth_tok->token.password.session_key_encryption_key,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (rc < 0) {
+		printk(KERN_ERR "%s: Error setting key for crypto context; "
+		       "rc = [%d]. s->auth_tok->token.password.session_key_"
+		       "encryption_key = [0x%p]; mount_crypt_stat->"
+		       "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
+		       rc,
+		       s->auth_tok->token.password.session_key_encryption_key,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		goto out_free_unlock;
+	}
+	rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg,
+					 s->block_aligned_filename_size);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to decrypt filename; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_free_unlock;
+	}
+	s->i = 0;
+	while (s->decrypted_filename[s->i] != '\0'
+	       && s->i < s->block_aligned_filename_size)
+		s->i++;
+	if (s->i == s->block_aligned_filename_size) {
+		printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
+		       "find valid separator between random characters and "
+		       "the filename\n", __func__);
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	s->i++;
+	(*filename_size) = (s->block_aligned_filename_size - s->i);
+	if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) {
+		printk(KERN_WARNING "%s: Filename size is [%zd], which is "
+		       "invalid\n", __func__, (*filename_size));
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	(*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
+	if (!(*filename)) {
+		printk(KERN_ERR "%s: Out of memory whilst attempting to "
+		       "kmalloc [%zd] bytes\n", __func__,
+		       ((*filename_size) + 1));
+		rc = -ENOMEM;
+		goto out_free_unlock;
+	}
+	memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
+	(*filename)[(*filename_size)] = '\0';
+out_free_unlock:
+	kfree(s->decrypted_filename);
+out_unlock:
+	mutex_unlock(s->tfm_mutex);
+out:
+	if (rc) {
+		(*packet_size) = 0;
+		(*filename_size) = 0;
+		(*filename) = NULL;
+	}
+	if (auth_tok_key) {
+		up_write(&(auth_tok_key->sem));
+		key_put(auth_tok_key);
+	}
+	kfree(s);
+	return rc;
+}
+
+static int
+ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
+{
+	int rc = 0;
+
+	(*sig) = NULL;
+	switch (auth_tok->token_type) {
+	case ECRYPTFS_PASSWORD:
+		(*sig) = auth_tok->token.password.signature;
+		break;
+	case ECRYPTFS_PRIVATE_KEY:
+		(*sig) = auth_tok->token.private_key.signature;
+		break;
+	default:
+		printk(KERN_ERR "Cannot get sig for auth_tok of type [%d]\n",
+		       auth_tok->token_type);
+		rc = -EINVAL;
+	}
+	return rc;
+}
+
+/**
+ * decrypt_pki_encrypted_session_key - Decrypt the session key with the given auth_tok.
+ * @auth_tok: The key authentication token used to decrypt the session key
+ * @crypt_stat: The cryptographic context
+ *
+ * Returns zero on success; non-zero error otherwise.
+ */
+static int
+decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
+				  struct ecryptfs_crypt_stat *crypt_stat)
+{
+	u8 cipher_code = 0;
+	struct ecryptfs_msg_ctx *msg_ctx;
+	struct ecryptfs_message *msg = NULL;
+	char *auth_tok_sig;
+	char *payload = NULL;
+	size_t payload_len = 0;
+	int rc;
+
+	rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok);
+	if (rc) {
+		printk(KERN_ERR "Unrecognized auth tok type: [%d]\n",
+		       auth_tok->token_type);
+		goto out;
+	}
+	rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key),
+				 &payload, &payload_len);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n");
+		goto out;
+	}
+	rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error sending message to "
+				"ecryptfsd: %d\n", rc);
+		goto out;
+	}
+	rc = ecryptfs_wait_for_response(msg_ctx, &msg);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Failed to receive tag 65 packet "
+				"from the user space daemon\n");
+		rc = -EIO;
+		goto out;
+	}
+	rc = parse_tag_65_packet(&(auth_tok->session_key),
+				 &cipher_code, msg);
+	if (rc) {
+		printk(KERN_ERR "Failed to parse tag 65 packet; rc = [%d]\n",
+		       rc);
+		goto out;
+	}
+	auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY;
+	memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key,
+	       auth_tok->session_key.decrypted_key_size);
+	crypt_stat->key_size = auth_tok->session_key.decrypted_key_size;
+	rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n",
+				cipher_code)
+		goto out;
+	}
+	crypt_stat->flags |= ECRYPTFS_KEY_VALID;
+	if (ecryptfs_verbosity > 0) {
+		ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n");
+		ecryptfs_dump_hex(crypt_stat->key,
+				  crypt_stat->key_size);
+	}
+out:
+	kfree(msg);
+	kfree(payload);
+	return rc;
+}
+
+static void wipe_auth_tok_list(struct list_head *auth_tok_list_head)
+{
+	struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+	struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp;
+
+	list_for_each_entry_safe(auth_tok_list_item, auth_tok_list_item_tmp,
+				 auth_tok_list_head, list) {
+		list_del(&auth_tok_list_item->list);
+		kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
+				auth_tok_list_item);
+	}
+}
+
+struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
+
+/**
+ * parse_tag_1_packet
+ * @crypt_stat: The cryptographic context to modify based on packet contents
+ * @data: The raw bytes of the packet.
+ * @auth_tok_list: eCryptfs parses packets into authentication tokens;
+ *                 a new authentication token will be placed at the
+ *                 end of this list for this packet.
+ * @new_auth_tok: Pointer to a pointer to memory that this function
+ *                allocates; sets the memory address of the pointer to
+ *                NULL on error. This object is added to the
+ *                auth_tok_list.
+ * @packet_size: This function writes the size of the parsed packet
+ *               into this memory location; zero on error.
+ * @max_packet_size: The maximum allowable packet size
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat,
+		   unsigned char *data, struct list_head *auth_tok_list,
+		   struct ecryptfs_auth_tok **new_auth_tok,
+		   size_t *packet_size, size_t max_packet_size)
+{
+	size_t body_size;
+	struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+	size_t length_size;
+	int rc = 0;
+
+	(*packet_size) = 0;
+	(*new_auth_tok) = NULL;
+	/**
+	 * This format is inspired by OpenPGP; see RFC 2440
+	 * packet tag 1
+	 *
+	 * Tag 1 identifier (1 byte)
+	 * Max Tag 1 packet size (max 3 bytes)
+	 * Version (1 byte)
+	 * Key identifier (8 bytes; ECRYPTFS_SIG_SIZE)
+	 * Cipher identifier (1 byte)
+	 * Encrypted key size (arbitrary)
+	 *
+	 * 12 bytes minimum packet size
+	 */
+	if (unlikely(max_packet_size < 12)) {
+		printk(KERN_ERR "Invalid max packet size; must be >=12\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	if (data[(*packet_size)++] != ECRYPTFS_TAG_1_PACKET_TYPE) {
+		printk(KERN_ERR "Enter w/ first byte != 0x%.2x\n",
+		       ECRYPTFS_TAG_1_PACKET_TYPE);
+		rc = -EINVAL;
+		goto out;
+	}
+	/* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or
+	 * at end of function upon failure */
+	auth_tok_list_item =
+		kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache,
+				  GFP_KERNEL);
+	if (!auth_tok_list_item) {
+		printk(KERN_ERR "Unable to allocate memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	(*new_auth_tok) = &auth_tok_list_item->auth_tok;
+	rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size,
+					  &length_size);
+	if (rc) {
+		printk(KERN_WARNING "Error parsing packet length; "
+		       "rc = [%d]\n", rc);
+		goto out_free;
+	}
+	if (unlikely(body_size < (ECRYPTFS_SIG_SIZE + 2))) {
+		printk(KERN_WARNING "Invalid body size ([%td])\n", body_size);
+		rc = -EINVAL;
+		goto out_free;
+	}
+	(*packet_size) += length_size;
+	if (unlikely((*packet_size) + body_size > max_packet_size)) {
+		printk(KERN_WARNING "Packet size exceeds max\n");
+		rc = -EINVAL;
+		goto out_free;
+	}
+	if (unlikely(data[(*packet_size)++] != 0x03)) {
+		printk(KERN_WARNING "Unknown version number [%d]\n",
+		       data[(*packet_size) - 1]);
+		rc = -EINVAL;
+		goto out_free;
+	}
+	ecryptfs_to_hex((*new_auth_tok)->token.private_key.signature,
+			&data[(*packet_size)], ECRYPTFS_SIG_SIZE);
+	*packet_size += ECRYPTFS_SIG_SIZE;
+	/* This byte is skipped because the kernel does not need to
+	 * know which public key encryption algorithm was used */
+	(*packet_size)++;
+	(*new_auth_tok)->session_key.encrypted_key_size =
+		body_size - (ECRYPTFS_SIG_SIZE + 2);
+	if ((*new_auth_tok)->session_key.encrypted_key_size
+	    > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+		printk(KERN_WARNING "Tag 1 packet contains key larger "
+		       "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES");
+		rc = -EINVAL;
+		goto out;
+	}
+	memcpy((*new_auth_tok)->session_key.encrypted_key,
+	       &data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2)));
+	(*packet_size) += (*new_auth_tok)->session_key.encrypted_key_size;
+	(*new_auth_tok)->session_key.flags &=
+		~ECRYPTFS_CONTAINS_DECRYPTED_KEY;
+	(*new_auth_tok)->session_key.flags |=
+		ECRYPTFS_CONTAINS_ENCRYPTED_KEY;
+	(*new_auth_tok)->token_type = ECRYPTFS_PRIVATE_KEY;
+	(*new_auth_tok)->flags = 0;
+	(*new_auth_tok)->session_key.flags &=
+		~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT);
+	(*new_auth_tok)->session_key.flags &=
+		~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT);
+	list_add(&auth_tok_list_item->list, auth_tok_list);
+	goto out;
+out_free:
+	(*new_auth_tok) = NULL;
+	memset(auth_tok_list_item, 0,
+	       sizeof(struct ecryptfs_auth_tok_list_item));
+	kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
+			auth_tok_list_item);
+out:
+	if (rc)
+		(*packet_size) = 0;
+	return rc;
+}
+
+/**
+ * parse_tag_3_packet
+ * @crypt_stat: The cryptographic context to modify based on packet
+ *              contents.
+ * @data: The raw bytes of the packet.
+ * @auth_tok_list: eCryptfs parses packets into authentication tokens;
+ *                 a new authentication token will be placed at the end
+ *                 of this list for this packet.
+ * @new_auth_tok: Pointer to a pointer to memory that this function
+ *                allocates; sets the memory address of the pointer to
+ *                NULL on error. This object is added to the
+ *                auth_tok_list.
+ * @packet_size: This function writes the size of the parsed packet
+ *               into this memory location; zero on error.
+ * @max_packet_size: maximum number of bytes to parse
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
+		   unsigned char *data, struct list_head *auth_tok_list,
+		   struct ecryptfs_auth_tok **new_auth_tok,
+		   size_t *packet_size, size_t max_packet_size)
+{
+	size_t body_size;
+	struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+	size_t length_size;
+	int rc = 0;
+
+	(*packet_size) = 0;
+	(*new_auth_tok) = NULL;
+	/**
+	 *This format is inspired by OpenPGP; see RFC 2440
+	 * packet tag 3
+	 *
+	 * Tag 3 identifier (1 byte)
+	 * Max Tag 3 packet size (max 3 bytes)
+	 * Version (1 byte)
+	 * Cipher code (1 byte)
+	 * S2K specifier (1 byte)
+	 * Hash identifier (1 byte)
+	 * Salt (ECRYPTFS_SALT_SIZE)
+	 * Hash iterations (1 byte)
+	 * Encrypted key (arbitrary)
+	 *
+	 * (ECRYPTFS_SALT_SIZE + 7) minimum packet size
+	 */
+	if (max_packet_size < (ECRYPTFS_SALT_SIZE + 7)) {
+		printk(KERN_ERR "Max packet size too large\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) {
+		printk(KERN_ERR "First byte != 0x%.2x; invalid packet\n",
+		       ECRYPTFS_TAG_3_PACKET_TYPE);
+		rc = -EINVAL;
+		goto out;
+	}
+	/* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or
+	 * at end of function upon failure */
+	auth_tok_list_item =
+	    kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache, GFP_KERNEL);
+	if (!auth_tok_list_item) {
+		printk(KERN_ERR "Unable to allocate memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	(*new_auth_tok) = &auth_tok_list_item->auth_tok;
+	rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size,
+					  &length_size);
+	if (rc) {
+		printk(KERN_WARNING "Error parsing packet length; rc = [%d]\n",
+		       rc);
+		goto out_free;
+	}
+	if (unlikely(body_size < (ECRYPTFS_SALT_SIZE + 5))) {
+		printk(KERN_WARNING "Invalid body size ([%td])\n", body_size);
+		rc = -EINVAL;
+		goto out_free;
+	}
+	(*packet_size) += length_size;
+	if (unlikely((*packet_size) + body_size > max_packet_size)) {
+		printk(KERN_ERR "Packet size exceeds max\n");
+		rc = -EINVAL;
+		goto out_free;
+	}
+	(*new_auth_tok)->session_key.encrypted_key_size =
+		(body_size - (ECRYPTFS_SALT_SIZE + 5));
+	if ((*new_auth_tok)->session_key.encrypted_key_size
+	    > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+		printk(KERN_WARNING "Tag 3 packet contains key larger "
+		       "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
+		rc = -EINVAL;
+		goto out_free;
+	}
+	if (unlikely(data[(*packet_size)++] != 0x04)) {
+		printk(KERN_WARNING "Unknown version number [%d]\n",
+		       data[(*packet_size) - 1]);
+		rc = -EINVAL;
+		goto out_free;
+	}
+	rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher,
+					    (u16)data[(*packet_size)]);
+	if (rc)
+		goto out_free;
+	/* A little extra work to differentiate among the AES key
+	 * sizes; see RFC2440 */
+	switch(data[(*packet_size)++]) {
+	case RFC2440_CIPHER_AES_192:
+		crypt_stat->key_size = 24;
+		break;
+	default:
+		crypt_stat->key_size =
+			(*new_auth_tok)->session_key.encrypted_key_size;
+	}
+	rc = ecryptfs_init_crypt_ctx(crypt_stat);
+	if (rc)
+		goto out_free;
+	if (unlikely(data[(*packet_size)++] != 0x03)) {
+		printk(KERN_WARNING "Only S2K ID 3 is currently supported\n");
+		rc = -ENOSYS;
+		goto out_free;
+	}
+	/* TODO: finish the hash mapping */
+	switch (data[(*packet_size)++]) {
+	case 0x01: /* See RFC2440 for these numbers and their mappings */
+		/* Choose MD5 */
+		memcpy((*new_auth_tok)->token.password.salt,
+		       &data[(*packet_size)], ECRYPTFS_SALT_SIZE);
+		(*packet_size) += ECRYPTFS_SALT_SIZE;
+		/* This conversion was taken straight from RFC2440 */
+		(*new_auth_tok)->token.password.hash_iterations =
+			((u32) 16 + (data[(*packet_size)] & 15))
+				<< ((data[(*packet_size)] >> 4) + 6);
+		(*packet_size)++;
+		/* Friendly reminder:
+		 * (*new_auth_tok)->session_key.encrypted_key_size =
+		 *         (body_size - (ECRYPTFS_SALT_SIZE + 5)); */
+		memcpy((*new_auth_tok)->session_key.encrypted_key,
+		       &data[(*packet_size)],
+		       (*new_auth_tok)->session_key.encrypted_key_size);
+		(*packet_size) +=
+			(*new_auth_tok)->session_key.encrypted_key_size;
+		(*new_auth_tok)->session_key.flags &=
+			~ECRYPTFS_CONTAINS_DECRYPTED_KEY;
+		(*new_auth_tok)->session_key.flags |=
+			ECRYPTFS_CONTAINS_ENCRYPTED_KEY;
+		(*new_auth_tok)->token.password.hash_algo = 0x01; /* MD5 */
+		break;
+	default:
+		ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: "
+				"[%d]\n", data[(*packet_size) - 1]);
+		rc = -ENOSYS;
+		goto out_free;
+	}
+	(*new_auth_tok)->token_type = ECRYPTFS_PASSWORD;
+	/* TODO: Parametarize; we might actually want userspace to
+	 * decrypt the session key. */
+	(*new_auth_tok)->session_key.flags &=
+			    ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT);
+	(*new_auth_tok)->session_key.flags &=
+			    ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT);
+	list_add(&auth_tok_list_item->list, auth_tok_list);
+	goto out;
+out_free:
+	(*new_auth_tok) = NULL;
+	memset(auth_tok_list_item, 0,
+	       sizeof(struct ecryptfs_auth_tok_list_item));
+	kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
+			auth_tok_list_item);
+out:
+	if (rc)
+		(*packet_size) = 0;
+	return rc;
+}
+
+/**
+ * parse_tag_11_packet
+ * @data: The raw bytes of the packet
+ * @contents: This function writes the data contents of the literal
+ *            packet into this memory location
+ * @max_contents_bytes: The maximum number of bytes that this function
+ *                      is allowed to write into contents
+ * @tag_11_contents_size: This function writes the size of the parsed
+ *                        contents into this memory location; zero on
+ *                        error
+ * @packet_size: This function writes the size of the parsed packet
+ *               into this memory location; zero on error
+ * @max_packet_size: maximum number of bytes to parse
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+parse_tag_11_packet(unsigned char *data, unsigned char *contents,
+		    size_t max_contents_bytes, size_t *tag_11_contents_size,
+		    size_t *packet_size, size_t max_packet_size)
+{
+	size_t body_size;
+	size_t length_size;
+	int rc = 0;
+
+	(*packet_size) = 0;
+	(*tag_11_contents_size) = 0;
+	/* This format is inspired by OpenPGP; see RFC 2440
+	 * packet tag 11
+	 *
+	 * Tag 11 identifier (1 byte)
+	 * Max Tag 11 packet size (max 3 bytes)
+	 * Binary format specifier (1 byte)
+	 * Filename length (1 byte)
+	 * Filename ("_CONSOLE") (8 bytes)
+	 * Modification date (4 bytes)
+	 * Literal data (arbitrary)
+	 *
+	 * We need at least 16 bytes of data for the packet to even be
+	 * valid.
+	 */
+	if (max_packet_size < 16) {
+		printk(KERN_ERR "Maximum packet size too small\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) {
+		printk(KERN_WARNING "Invalid tag 11 packet format\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size,
+					  &length_size);
+	if (rc) {
+		printk(KERN_WARNING "Invalid tag 11 packet format\n");
+		goto out;
+	}
+	if (body_size < 14) {
+		printk(KERN_WARNING "Invalid body size ([%td])\n", body_size);
+		rc = -EINVAL;
+		goto out;
+	}
+	(*packet_size) += length_size;
+	(*tag_11_contents_size) = (body_size - 14);
+	if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) {
+		printk(KERN_ERR "Packet size exceeds max\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
+		printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
+		       "expected size\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	if (data[(*packet_size)++] != 0x62) {
+		printk(KERN_WARNING "Unrecognizable packet\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	if (data[(*packet_size)++] != 0x08) {
+		printk(KERN_WARNING "Unrecognizable packet\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	(*packet_size) += 12; /* Ignore filename and modification date */
+	memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size));
+	(*packet_size) += (*tag_11_contents_size);
+out:
+	if (rc) {
+		(*packet_size) = 0;
+		(*tag_11_contents_size) = 0;
+	}
+	return rc;
+}
+
+int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
+				      struct ecryptfs_auth_tok **auth_tok,
+				      char *sig)
+{
+	int rc = 0;
+
+	(*auth_tok_key) = request_key(&key_type_user, sig, NULL);
+	if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
+		(*auth_tok_key) = ecryptfs_get_encrypted_key(sig);
+		if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
+			printk(KERN_ERR "Could not find key with description: [%s]\n",
+			      sig);
+			rc = process_request_key_err(PTR_ERR(*auth_tok_key));
+			(*auth_tok_key) = NULL;
+			goto out;
+		}
+	}
+	down_write(&(*auth_tok_key)->sem);
+	rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok);
+	if (rc) {
+		up_write(&(*auth_tok_key)->sem);
+		key_put(*auth_tok_key);
+		(*auth_tok_key) = NULL;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
+ * @auth_tok: The passphrase authentication token to use to encrypt the FEK
+ * @crypt_stat: The cryptographic context
+ *
+ * Returns zero on success; non-zero error otherwise
+ */
+static int
+decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
+					 struct ecryptfs_crypt_stat *crypt_stat)
+{
+	struct scatterlist dst_sg[2];
+	struct scatterlist src_sg[2];
+	struct mutex *tfm_mutex;
+	struct blkcipher_desc desc = {
+		.flags = CRYPTO_TFM_REQ_MAY_SLEEP
+	};
+	int rc = 0;
+
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(
+			KERN_DEBUG, "Session key encryption key (size [%d]):\n",
+			auth_tok->token.password.session_key_encryption_key_bytes);
+		ecryptfs_dump_hex(
+			auth_tok->token.password.session_key_encryption_key,
+			auth_tok->token.password.session_key_encryption_key_bytes);
+	}
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+							crypt_stat->cipher);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Internal error whilst attempting to get "
+		       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+		       crypt_stat->cipher, rc);
+		goto out;
+	}
+	rc = virt_to_scatterlist(auth_tok->session_key.encrypted_key,
+				 auth_tok->session_key.encrypted_key_size,
+				 src_sg, 2);
+	if (rc < 1 || rc > 2) {
+		printk(KERN_ERR "Internal error whilst attempting to convert "
+			"auth_tok->session_key.encrypted_key to scatterlist; "
+			"expected rc = 1; got rc = [%d]. "
+		       "auth_tok->session_key.encrypted_key_size = [%d]\n", rc,
+			auth_tok->session_key.encrypted_key_size);
+		goto out;
+	}
+	auth_tok->session_key.decrypted_key_size =
+		auth_tok->session_key.encrypted_key_size;
+	rc = virt_to_scatterlist(auth_tok->session_key.decrypted_key,
+				 auth_tok->session_key.decrypted_key_size,
+				 dst_sg, 2);
+	if (rc < 1 || rc > 2) {
+		printk(KERN_ERR "Internal error whilst attempting to convert "
+			"auth_tok->session_key.decrypted_key to scatterlist; "
+			"expected rc = 1; got rc = [%d]\n", rc);
+		goto out;
+	}
+	mutex_lock(tfm_mutex);
+	rc = crypto_blkcipher_setkey(
+		desc.tfm, auth_tok->token.password.session_key_encryption_key,
+		crypt_stat->key_size);
+	if (unlikely(rc < 0)) {
+		mutex_unlock(tfm_mutex);
+		printk(KERN_ERR "Error setting key for crypto context\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg,
+				      auth_tok->session_key.encrypted_key_size);
+	mutex_unlock(tfm_mutex);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc);
+		goto out;
+	}
+	auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY;
+	memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key,
+	       auth_tok->session_key.decrypted_key_size);
+	crypt_stat->flags |= ECRYPTFS_KEY_VALID;
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
+				crypt_stat->key_size);
+		ecryptfs_dump_hex(crypt_stat->key,
+				  crypt_stat->key_size);
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_parse_packet_set
+ * @crypt_stat: The cryptographic context
+ * @src: Virtual address of region of memory containing the packets
+ * @ecryptfs_dentry: The eCryptfs dentry associated with the packet set
+ *
+ * Get crypt_stat to have the file's session key if the requisite key
+ * is available to decrypt the session key.
+ *
+ * Returns Zero if a valid authentication token was retrieved and
+ * processed; negative value for file not encrypted or for error
+ * conditions.
+ */
+int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
+			      unsigned char *src,
+			      struct dentry *ecryptfs_dentry)
+{
+	size_t i = 0;
+	size_t found_auth_tok;
+	size_t next_packet_is_auth_tok_packet;
+	struct list_head auth_tok_list;
+	struct ecryptfs_auth_tok *matching_auth_tok;
+	struct ecryptfs_auth_tok *candidate_auth_tok;
+	char *candidate_auth_tok_sig;
+	size_t packet_size;
+	struct ecryptfs_auth_tok *new_auth_tok;
+	unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE];
+	struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+	size_t tag_11_contents_size;
+	size_t tag_11_packet_size;
+	struct key *auth_tok_key = NULL;
+	int rc = 0;
+
+	INIT_LIST_HEAD(&auth_tok_list);
+	/* Parse the header to find as many packets as we can; these will be
+	 * added the our &auth_tok_list */
+	next_packet_is_auth_tok_packet = 1;
+	while (next_packet_is_auth_tok_packet) {
+		size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i);
+
+		switch (src[i]) {
+		case ECRYPTFS_TAG_3_PACKET_TYPE:
+			rc = parse_tag_3_packet(crypt_stat,
+						(unsigned char *)&src[i],
+						&auth_tok_list, &new_auth_tok,
+						&packet_size, max_packet_size);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "Error parsing "
+						"tag 3 packet\n");
+				rc = -EIO;
+				goto out_wipe_list;
+			}
+			i += packet_size;
+			rc = parse_tag_11_packet((unsigned char *)&src[i],
+						 sig_tmp_space,
+						 ECRYPTFS_SIG_SIZE,
+						 &tag_11_contents_size,
+						 &tag_11_packet_size,
+						 max_packet_size);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "No valid "
+						"(ecryptfs-specific) literal "
+						"packet containing "
+						"authentication token "
+						"signature found after "
+						"tag 3 packet\n");
+				rc = -EIO;
+				goto out_wipe_list;
+			}
+			i += tag_11_packet_size;
+			if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
+				ecryptfs_printk(KERN_ERR, "Expected "
+						"signature of size [%d]; "
+						"read size [%zd]\n",
+						ECRYPTFS_SIG_SIZE,
+						tag_11_contents_size);
+				rc = -EIO;
+				goto out_wipe_list;
+			}
+			ecryptfs_to_hex(new_auth_tok->token.password.signature,
+					sig_tmp_space, tag_11_contents_size);
+			new_auth_tok->token.password.signature[
+				ECRYPTFS_PASSWORD_SIG_SIZE] = '\0';
+			crypt_stat->flags |= ECRYPTFS_ENCRYPTED;
+			break;
+		case ECRYPTFS_TAG_1_PACKET_TYPE:
+			rc = parse_tag_1_packet(crypt_stat,
+						(unsigned char *)&src[i],
+						&auth_tok_list, &new_auth_tok,
+						&packet_size, max_packet_size);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "Error parsing "
+						"tag 1 packet\n");
+				rc = -EIO;
+				goto out_wipe_list;
+			}
+			i += packet_size;
+			crypt_stat->flags |= ECRYPTFS_ENCRYPTED;
+			break;
+		case ECRYPTFS_TAG_11_PACKET_TYPE:
+			ecryptfs_printk(KERN_WARNING, "Invalid packet set "
+					"(Tag 11 not allowed by itself)\n");
+			rc = -EIO;
+			goto out_wipe_list;
+		default:
+			ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
+					"of the file header; hex value of "
+					"character is [0x%.2x]\n", i, src[i]);
+			next_packet_is_auth_tok_packet = 0;
+		}
+	}
+	if (list_empty(&auth_tok_list)) {
+		printk(KERN_ERR "The lower file appears to be a non-encrypted "
+		       "eCryptfs file; this is not supported in this version "
+		       "of the eCryptfs kernel module\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	/* auth_tok_list contains the set of authentication tokens
+	 * parsed from the metadata. We need to find a matching
+	 * authentication token that has the secret component(s)
+	 * necessary to decrypt the EFEK in the auth_tok parsed from
+	 * the metadata. There may be several potential matches, but
+	 * just one will be sufficient to decrypt to get the FEK. */
+find_next_matching_auth_tok:
+	found_auth_tok = 0;
+	list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
+		candidate_auth_tok = &auth_tok_list_item->auth_tok;
+		if (unlikely(ecryptfs_verbosity > 0)) {
+			ecryptfs_printk(KERN_DEBUG,
+					"Considering cadidate auth tok:\n");
+			ecryptfs_dump_auth_tok(candidate_auth_tok);
+		}
+		rc = ecryptfs_get_auth_tok_sig(&candidate_auth_tok_sig,
+					       candidate_auth_tok);
+		if (rc) {
+			printk(KERN_ERR
+			       "Unrecognized candidate auth tok type: [%d]\n",
+			       candidate_auth_tok->token_type);
+			rc = -EINVAL;
+			goto out_wipe_list;
+		}
+		rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+					       &matching_auth_tok,
+					       crypt_stat->mount_crypt_stat,
+					       candidate_auth_tok_sig);
+		if (!rc) {
+			found_auth_tok = 1;
+			goto found_matching_auth_tok;
+		}
+	}
+	if (!found_auth_tok) {
+		ecryptfs_printk(KERN_ERR, "Could not find a usable "
+				"authentication token\n");
+		rc = -EIO;
+		goto out_wipe_list;
+	}
+found_matching_auth_tok:
+	if (candidate_auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) {
+		memcpy(&(candidate_auth_tok->token.private_key),
+		       &(matching_auth_tok->token.private_key),
+		       sizeof(struct ecryptfs_private_key));
+		up_write(&(auth_tok_key->sem));
+		key_put(auth_tok_key);
+		rc = decrypt_pki_encrypted_session_key(candidate_auth_tok,
+						       crypt_stat);
+	} else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) {
+		memcpy(&(candidate_auth_tok->token.password),
+		       &(matching_auth_tok->token.password),
+		       sizeof(struct ecryptfs_password));
+		up_write(&(auth_tok_key->sem));
+		key_put(auth_tok_key);
+		rc = decrypt_passphrase_encrypted_session_key(
+			candidate_auth_tok, crypt_stat);
+	} else {
+		up_write(&(auth_tok_key->sem));
+		key_put(auth_tok_key);
+		rc = -EINVAL;
+	}
+	if (rc) {
+		struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp;
+
+		ecryptfs_printk(KERN_WARNING, "Error decrypting the "
+				"session key for authentication token with sig "
+				"[%.*s]; rc = [%d]. Removing auth tok "
+				"candidate from the list and searching for "
+				"the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
+				candidate_auth_tok_sig,	rc);
+		list_for_each_entry_safe(auth_tok_list_item,
+					 auth_tok_list_item_tmp,
+					 &auth_tok_list, list) {
+			if (candidate_auth_tok
+			    == &auth_tok_list_item->auth_tok) {
+				list_del(&auth_tok_list_item->list);
+				kmem_cache_free(
+					ecryptfs_auth_tok_list_item_cache,
+					auth_tok_list_item);
+				goto find_next_matching_auth_tok;
+			}
+		}
+		BUG();
+	}
+	rc = ecryptfs_compute_root_iv(crypt_stat);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error computing "
+				"the root IV\n");
+		goto out_wipe_list;
+	}
+	rc = ecryptfs_init_crypt_ctx(crypt_stat);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error initializing crypto "
+				"context for cipher [%s]; rc = [%d]\n",
+				crypt_stat->cipher, rc);
+	}
+out_wipe_list:
+	wipe_auth_tok_list(&auth_tok_list);
+out:
+	return rc;
+}
+
+static int
+pki_encrypt_session_key(struct key *auth_tok_key,
+			struct ecryptfs_auth_tok *auth_tok,
+			struct ecryptfs_crypt_stat *crypt_stat,
+			struct ecryptfs_key_record *key_rec)
+{
+	struct ecryptfs_msg_ctx *msg_ctx = NULL;
+	char *payload = NULL;
+	size_t payload_len = 0;
+	struct ecryptfs_message *msg;
+	int rc;
+
+	rc = write_tag_66_packet(auth_tok->token.private_key.signature,
+				 ecryptfs_code_for_cipher_string(
+					 crypt_stat->cipher,
+					 crypt_stat->key_size),
+				 crypt_stat, &payload, &payload_len);
+	up_write(&(auth_tok_key->sem));
+	key_put(auth_tok_key);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
+		goto out;
+	}
+	rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error sending message to "
+				"ecryptfsd: %d\n", rc);
+		goto out;
+	}
+	rc = ecryptfs_wait_for_response(msg_ctx, &msg);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Failed to receive tag 67 packet "
+				"from the user space daemon\n");
+		rc = -EIO;
+		goto out;
+	}
+	rc = parse_tag_67_packet(key_rec, msg);
+	if (rc)
+		ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n");
+	kfree(msg);
+out:
+	kfree(payload);
+	return rc;
+}
+/**
+ * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet
+ * @dest: Buffer into which to write the packet
+ * @remaining_bytes: Maximum number of bytes that can be writtn
+ * @auth_tok_key: The authentication token key to unlock and put when done with
+ *                @auth_tok
+ * @auth_tok: The authentication token used for generating the tag 1 packet
+ * @crypt_stat: The cryptographic context
+ * @key_rec: The key record struct for the tag 1 packet
+ * @packet_size: This function will write the number of bytes that end
+ *               up constituting the packet; set to zero on error
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+write_tag_1_packet(char *dest, size_t *remaining_bytes,
+		   struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok,
+		   struct ecryptfs_crypt_stat *crypt_stat,
+		   struct ecryptfs_key_record *key_rec, size_t *packet_size)
+{
+	size_t i;
+	size_t encrypted_session_key_valid = 0;
+	size_t packet_size_length;
+	size_t max_packet_size;
+	int rc = 0;
+
+	(*packet_size) = 0;
+	ecryptfs_from_hex(key_rec->sig, auth_tok->token.private_key.signature,
+			  ECRYPTFS_SIG_SIZE);
+	encrypted_session_key_valid = 0;
+	for (i = 0; i < crypt_stat->key_size; i++)
+		encrypted_session_key_valid |=
+			auth_tok->session_key.encrypted_key[i];
+	if (encrypted_session_key_valid) {
+		memcpy(key_rec->enc_key,
+		       auth_tok->session_key.encrypted_key,
+		       auth_tok->session_key.encrypted_key_size);
+		up_write(&(auth_tok_key->sem));
+		key_put(auth_tok_key);
+		goto encrypted_session_key_set;
+	}
+	if (auth_tok->session_key.encrypted_key_size == 0)
+		auth_tok->session_key.encrypted_key_size =
+			auth_tok->token.private_key.key_size;
+	rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat,
+				     key_rec);
+	if (rc) {
+		printk(KERN_ERR "Failed to encrypt session key via a key "
+		       "module; rc = [%d]\n", rc);
+		goto out;
+	}
+	if (ecryptfs_verbosity > 0) {
+		ecryptfs_printk(KERN_DEBUG, "Encrypted key:\n");
+		ecryptfs_dump_hex(key_rec->enc_key, key_rec->enc_key_size);
+	}
+encrypted_session_key_set:
+	/* This format is inspired by OpenPGP; see RFC 2440
+	 * packet tag 1 */
+	max_packet_size = (1                         /* Tag 1 identifier */
+			   + 3                       /* Max Tag 1 packet size */
+			   + 1                       /* Version */
+			   + ECRYPTFS_SIG_SIZE       /* Key identifier */
+			   + 1                       /* Cipher identifier */
+			   + key_rec->enc_key_size); /* Encrypted key size */
+	if (max_packet_size > (*remaining_bytes)) {
+		printk(KERN_ERR "Packet length larger than maximum allowable; "
+		       "need up to [%td] bytes, but there are only [%td] "
+		       "available\n", max_packet_size, (*remaining_bytes));
+		rc = -EINVAL;
+		goto out;
+	}
+	dest[(*packet_size)++] = ECRYPTFS_TAG_1_PACKET_TYPE;
+	rc = ecryptfs_write_packet_length(&dest[(*packet_size)],
+					  (max_packet_size - 4),
+					  &packet_size_length);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating tag 1 packet "
+				"header; cannot generate packet length\n");
+		goto out;
+	}
+	(*packet_size) += packet_size_length;
+	dest[(*packet_size)++] = 0x03; /* version 3 */
+	memcpy(&dest[(*packet_size)], key_rec->sig, ECRYPTFS_SIG_SIZE);
+	(*packet_size) += ECRYPTFS_SIG_SIZE;
+	dest[(*packet_size)++] = RFC2440_CIPHER_RSA;
+	memcpy(&dest[(*packet_size)], key_rec->enc_key,
+	       key_rec->enc_key_size);
+	(*packet_size) += key_rec->enc_key_size;
+out:
+	if (rc)
+		(*packet_size) = 0;
+	else
+		(*remaining_bytes) -= (*packet_size);
+	return rc;
+}
+
+/**
+ * write_tag_11_packet
+ * @dest: Target into which Tag 11 packet is to be written
+ * @remaining_bytes: Maximum packet length
+ * @contents: Byte array of contents to copy in
+ * @contents_length: Number of bytes in contents
+ * @packet_length: Length of the Tag 11 packet written; zero on error
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+write_tag_11_packet(char *dest, size_t *remaining_bytes, char *contents,
+		    size_t contents_length, size_t *packet_length)
+{
+	size_t packet_size_length;
+	size_t max_packet_size;
+	int rc = 0;
+
+	(*packet_length) = 0;
+	/* This format is inspired by OpenPGP; see RFC 2440
+	 * packet tag 11 */
+	max_packet_size = (1                   /* Tag 11 identifier */
+			   + 3                 /* Max Tag 11 packet size */
+			   + 1                 /* Binary format specifier */
+			   + 1                 /* Filename length */
+			   + 8                 /* Filename ("_CONSOLE") */
+			   + 4                 /* Modification date */
+			   + contents_length); /* Literal data */
+	if (max_packet_size > (*remaining_bytes)) {
+		printk(KERN_ERR "Packet length larger than maximum allowable; "
+		       "need up to [%td] bytes, but there are only [%td] "
+		       "available\n", max_packet_size, (*remaining_bytes));
+		rc = -EINVAL;
+		goto out;
+	}
+	dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE;
+	rc = ecryptfs_write_packet_length(&dest[(*packet_length)],
+					  (max_packet_size - 4),
+					  &packet_size_length);
+	if (rc) {
+		printk(KERN_ERR "Error generating tag 11 packet header; cannot "
+		       "generate packet length. rc = [%d]\n", rc);
+		goto out;
+	}
+	(*packet_length) += packet_size_length;
+	dest[(*packet_length)++] = 0x62; /* binary data format specifier */
+	dest[(*packet_length)++] = 8;
+	memcpy(&dest[(*packet_length)], "_CONSOLE", 8);
+	(*packet_length) += 8;
+	memset(&dest[(*packet_length)], 0x00, 4);
+	(*packet_length) += 4;
+	memcpy(&dest[(*packet_length)], contents, contents_length);
+	(*packet_length) += contents_length;
+ out:
+	if (rc)
+		(*packet_length) = 0;
+	else
+		(*remaining_bytes) -= (*packet_length);
+	return rc;
+}
+
+/**
+ * write_tag_3_packet
+ * @dest: Buffer into which to write the packet
+ * @remaining_bytes: Maximum number of bytes that can be written
+ * @auth_tok: Authentication token
+ * @crypt_stat: The cryptographic context
+ * @key_rec: encrypted key
+ * @packet_size: This function will write the number of bytes that end
+ *               up constituting the packet; set to zero on error
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+write_tag_3_packet(char *dest, size_t *remaining_bytes,
+		   struct ecryptfs_auth_tok *auth_tok,
+		   struct ecryptfs_crypt_stat *crypt_stat,
+		   struct ecryptfs_key_record *key_rec, size_t *packet_size)
+{
+	size_t i;
+	size_t encrypted_session_key_valid = 0;
+	char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
+	struct scatterlist dst_sg[2];
+	struct scatterlist src_sg[2];
+	struct mutex *tfm_mutex = NULL;
+	u8 cipher_code;
+	size_t packet_size_length;
+	size_t max_packet_size;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		crypt_stat->mount_crypt_stat;
+	struct blkcipher_desc desc = {
+		.tfm = NULL,
+		.flags = CRYPTO_TFM_REQ_MAY_SLEEP
+	};
+	int rc = 0;
+
+	(*packet_size) = 0;
+	ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature,
+			  ECRYPTFS_SIG_SIZE);
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+							crypt_stat->cipher);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Internal error whilst attempting to get "
+		       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+		       crypt_stat->cipher, rc);
+		goto out;
+	}
+	if (mount_crypt_stat->global_default_cipher_key_size == 0) {
+		struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm);
+
+		printk(KERN_WARNING "No key size specified at mount; "
+		       "defaulting to [%d]\n", alg->max_keysize);
+		mount_crypt_stat->global_default_cipher_key_size =
+			alg->max_keysize;
+	}
+	if (crypt_stat->key_size == 0)
+		crypt_stat->key_size =
+			mount_crypt_stat->global_default_cipher_key_size;
+	if (auth_tok->session_key.encrypted_key_size == 0)
+		auth_tok->session_key.encrypted_key_size =
+			crypt_stat->key_size;
+	if (crypt_stat->key_size == 24
+	    && strcmp("aes", crypt_stat->cipher) == 0) {
+		memset((crypt_stat->key + 24), 0, 8);
+		auth_tok->session_key.encrypted_key_size = 32;
+	} else
+		auth_tok->session_key.encrypted_key_size = crypt_stat->key_size;
+	key_rec->enc_key_size =
+		auth_tok->session_key.encrypted_key_size;
+	encrypted_session_key_valid = 0;
+	for (i = 0; i < auth_tok->session_key.encrypted_key_size; i++)
+		encrypted_session_key_valid |=
+			auth_tok->session_key.encrypted_key[i];
+	if (encrypted_session_key_valid) {
+		ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
+				"using auth_tok->session_key.encrypted_key, "
+				"where key_rec->enc_key_size = [%zd]\n",
+				key_rec->enc_key_size);
+		memcpy(key_rec->enc_key,
+		       auth_tok->session_key.encrypted_key,
+		       key_rec->enc_key_size);
+		goto encrypted_session_key_set;
+	}
+	if (auth_tok->token.password.flags &
+	    ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET) {
+		ecryptfs_printk(KERN_DEBUG, "Using previously generated "
+				"session key encryption key of size [%d]\n",
+				auth_tok->token.password.
+				session_key_encryption_key_bytes);
+		memcpy(session_key_encryption_key,
+		       auth_tok->token.password.session_key_encryption_key,
+		       crypt_stat->key_size);
+		ecryptfs_printk(KERN_DEBUG,
+				"Cached session key encryption key:\n");
+		if (ecryptfs_verbosity > 0)
+			ecryptfs_dump_hex(session_key_encryption_key, 16);
+	}
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n");
+		ecryptfs_dump_hex(session_key_encryption_key, 16);
+	}
+	rc = virt_to_scatterlist(crypt_stat->key, key_rec->enc_key_size,
+				 src_sg, 2);
+	if (rc < 1 || rc > 2) {
+		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
+				"for crypt_stat session key; expected rc = 1; "
+				"got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
+				rc, key_rec->enc_key_size);
+		rc = -ENOMEM;
+		goto out;
+	}
+	rc = virt_to_scatterlist(key_rec->enc_key, key_rec->enc_key_size,
+				 dst_sg, 2);
+	if (rc < 1 || rc > 2) {
+		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
+				"for crypt_stat encrypted session key; "
+				"expected rc = 1; got rc = [%d]. "
+				"key_rec->enc_key_size = [%zd]\n", rc,
+				key_rec->enc_key_size);
+		rc = -ENOMEM;
+		goto out;
+	}
+	mutex_lock(tfm_mutex);
+	rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key,
+				     crypt_stat->key_size);
+	if (rc < 0) {
+		mutex_unlock(tfm_mutex);
+		ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
+				"context; rc = [%d]\n", rc);
+		goto out;
+	}
+	rc = 0;
+	ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
+			crypt_stat->key_size);
+	rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
+				      (*key_rec).enc_key_size);
+	mutex_unlock(tfm_mutex);
+	if (rc) {
+		printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc);
+		goto out;
+	}
+	ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
+	if (ecryptfs_verbosity > 0) {
+		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
+				key_rec->enc_key_size);
+		ecryptfs_dump_hex(key_rec->enc_key,
+				  key_rec->enc_key_size);
+	}
+encrypted_session_key_set:
+	/* This format is inspired by OpenPGP; see RFC 2440
+	 * packet tag 3 */
+	max_packet_size = (1                         /* Tag 3 identifier */
+			   + 3                       /* Max Tag 3 packet size */
+			   + 1                       /* Version */
+			   + 1                       /* Cipher code */
+			   + 1                       /* S2K specifier */
+			   + 1                       /* Hash identifier */
+			   + ECRYPTFS_SALT_SIZE      /* Salt */
+			   + 1                       /* Hash iterations */
+			   + key_rec->enc_key_size); /* Encrypted key size */
+	if (max_packet_size > (*remaining_bytes)) {
+		printk(KERN_ERR "Packet too large; need up to [%td] bytes, but "
+		       "there are only [%td] available\n", max_packet_size,
+		       (*remaining_bytes));
+		rc = -EINVAL;
+		goto out;
+	}
+	dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE;
+	/* Chop off the Tag 3 identifier(1) and Tag 3 packet size(3)
+	 * to get the number of octets in the actual Tag 3 packet */
+	rc = ecryptfs_write_packet_length(&dest[(*packet_size)],
+					  (max_packet_size - 4),
+					  &packet_size_length);
+	if (rc) {
+		printk(KERN_ERR "Error generating tag 3 packet header; cannot "
+		       "generate packet length. rc = [%d]\n", rc);
+		goto out;
+	}
+	(*packet_size) += packet_size_length;
+	dest[(*packet_size)++] = 0x04; /* version 4 */
+	/* TODO: Break from RFC2440 so that arbitrary ciphers can be
+	 * specified with strings */
+	cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
+						      crypt_stat->key_size);
+	if (cipher_code == 0) {
+		ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
+				"cipher [%s]\n", crypt_stat->cipher);
+		rc = -EINVAL;
+		goto out;
+	}
+	dest[(*packet_size)++] = cipher_code;
+	dest[(*packet_size)++] = 0x03;	/* S2K */
+	dest[(*packet_size)++] = 0x01;	/* MD5 (TODO: parameterize) */
+	memcpy(&dest[(*packet_size)], auth_tok->token.password.salt,
+	       ECRYPTFS_SALT_SIZE);
+	(*packet_size) += ECRYPTFS_SALT_SIZE;	/* salt */
+	dest[(*packet_size)++] = 0x60;	/* hash iterations (65536) */
+	memcpy(&dest[(*packet_size)], key_rec->enc_key,
+	       key_rec->enc_key_size);
+	(*packet_size) += key_rec->enc_key_size;
+out:
+	if (rc)
+		(*packet_size) = 0;
+	else
+		(*remaining_bytes) -= (*packet_size);
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_key_record_cache;
+
+/**
+ * ecryptfs_generate_key_packet_set
+ * @dest_base: Virtual address from which to write the key record set
+ * @crypt_stat: The cryptographic context from which the
+ *              authentication tokens will be retrieved
+ * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat
+ *                   for the global parameters
+ * @len: The amount written
+ * @max: The maximum amount of data allowed to be written
+ *
+ * Generates a key packet set and writes it to the virtual address
+ * passed in.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+int
+ecryptfs_generate_key_packet_set(char *dest_base,
+				 struct ecryptfs_crypt_stat *crypt_stat,
+				 struct dentry *ecryptfs_dentry, size_t *len,
+				 size_t max)
+{
+	struct ecryptfs_auth_tok *auth_tok;
+	struct key *auth_tok_key = NULL;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(
+			ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	size_t written;
+	struct ecryptfs_key_record *key_rec;
+	struct ecryptfs_key_sig *key_sig;
+	int rc = 0;
+
+	(*len) = 0;
+	mutex_lock(&crypt_stat->keysig_list_mutex);
+	key_rec = kmem_cache_alloc(ecryptfs_key_record_cache, GFP_KERNEL);
+	if (!key_rec) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	list_for_each_entry(key_sig, &crypt_stat->keysig_list,
+			    crypt_stat_list) {
+		memset(key_rec, 0, sizeof(*key_rec));
+		rc = ecryptfs_find_global_auth_tok_for_sig(&auth_tok_key,
+							   &auth_tok,
+							   mount_crypt_stat,
+							   key_sig->keysig);
+		if (rc) {
+			printk(KERN_WARNING "Unable to retrieve auth tok with "
+			       "sig = [%s]\n", key_sig->keysig);
+			rc = process_find_global_auth_tok_for_sig_err(rc);
+			goto out_free;
+		}
+		if (auth_tok->token_type == ECRYPTFS_PASSWORD) {
+			rc = write_tag_3_packet((dest_base + (*len)),
+						&max, auth_tok,
+						crypt_stat, key_rec,
+						&written);
+			up_write(&(auth_tok_key->sem));
+			key_put(auth_tok_key);
+			if (rc) {
+				ecryptfs_printk(KERN_WARNING, "Error "
+						"writing tag 3 packet\n");
+				goto out_free;
+			}
+			(*len) += written;
+			/* Write auth tok signature packet */
+			rc = write_tag_11_packet((dest_base + (*len)), &max,
+						 key_rec->sig,
+						 ECRYPTFS_SIG_SIZE, &written);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "Error writing "
+						"auth tok signature packet\n");
+				goto out_free;
+			}
+			(*len) += written;
+		} else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) {
+			rc = write_tag_1_packet(dest_base + (*len), &max,
+						auth_tok_key, auth_tok,
+						crypt_stat, key_rec, &written);
+			if (rc) {
+				ecryptfs_printk(KERN_WARNING, "Error "
+						"writing tag 1 packet\n");
+				goto out_free;
+			}
+			(*len) += written;
+		} else {
+			up_write(&(auth_tok_key->sem));
+			key_put(auth_tok_key);
+			ecryptfs_printk(KERN_WARNING, "Unsupported "
+					"authentication token type\n");
+			rc = -EINVAL;
+			goto out_free;
+		}
+	}
+	if (likely(max > 0)) {
+		dest_base[(*len)] = 0x00;
+	} else {
+		ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n");
+		rc = -EIO;
+	}
+out_free:
+	kmem_cache_free(ecryptfs_key_record_cache, key_rec);
+out:
+	if (rc)
+		(*len) = 0;
+	mutex_unlock(&crypt_stat->keysig_list_mutex);
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_key_sig_cache;
+
+int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
+{
+	struct ecryptfs_key_sig *new_key_sig;
+
+	new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL);
+	if (!new_key_sig) {
+		printk(KERN_ERR
+		       "Error allocating from ecryptfs_key_sig_cache\n");
+		return -ENOMEM;
+	}
+	memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
+	new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+	/* Caller must hold keysig_list_mutex */
+	list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list);
+
+	return 0;
+}
+
+struct kmem_cache *ecryptfs_global_auth_tok_cache;
+
+int
+ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *sig, u32 global_auth_tok_flags)
+{
+	struct ecryptfs_global_auth_tok *new_auth_tok;
+	int rc = 0;
+
+	new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache,
+					GFP_KERNEL);
+	if (!new_auth_tok) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "Error allocating from "
+		       "ecryptfs_global_auth_tok_cache\n");
+		goto out;
+	}
+	memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX);
+	new_auth_tok->flags = global_auth_tok_flags;
+	new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	list_add(&new_auth_tok->mount_crypt_stat_list,
+		 &mount_crypt_stat->global_auth_tok_list);
+	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+out:
+	return rc;
+}
+
diff --git a/kernel/fs/ecryptfs/kthread.c b/kernel/fs/ecryptfs/kthread.c
new file mode 100644
index 000000000..866bb18ef
--- /dev/null
+++ b/kernel/fs/ecryptfs/kthread.c
@@ -0,0 +1,172 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 2008 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/mount.h>
+#include "ecryptfs_kernel.h"
+
+struct ecryptfs_open_req {
+	struct file **lower_file;
+	struct path path;
+	struct completion done;
+	struct list_head kthread_ctl_list;
+};
+
+static struct ecryptfs_kthread_ctl {
+#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001
+	u32 flags;
+	struct mutex mux;
+	struct list_head req_list;
+	wait_queue_head_t wait;
+} ecryptfs_kthread_ctl;
+
+static struct task_struct *ecryptfs_kthread;
+
+/**
+ * ecryptfs_threadfn
+ * @ignored: ignored
+ *
+ * The eCryptfs kernel thread that has the responsibility of getting
+ * the lower file with RW permissions.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_threadfn(void *ignored)
+{
+	set_freezable();
+	while (1)  {
+		struct ecryptfs_open_req *req;
+
+		wait_event_freezable(
+			ecryptfs_kthread_ctl.wait,
+			(!list_empty(&ecryptfs_kthread_ctl.req_list)
+			 || kthread_should_stop()));
+		mutex_lock(&ecryptfs_kthread_ctl.mux);
+		if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) {
+			mutex_unlock(&ecryptfs_kthread_ctl.mux);
+			goto out;
+		}
+		while (!list_empty(&ecryptfs_kthread_ctl.req_list)) {
+			req = list_first_entry(&ecryptfs_kthread_ctl.req_list,
+					       struct ecryptfs_open_req,
+					       kthread_ctl_list);
+			list_del(&req->kthread_ctl_list);
+			*req->lower_file = dentry_open(&req->path,
+				(O_RDWR | O_LARGEFILE), current_cred());
+			complete(&req->done);
+		}
+		mutex_unlock(&ecryptfs_kthread_ctl.mux);
+	}
+out:
+	return 0;
+}
+
+int __init ecryptfs_init_kthread(void)
+{
+	int rc = 0;
+
+	mutex_init(&ecryptfs_kthread_ctl.mux);
+	init_waitqueue_head(&ecryptfs_kthread_ctl.wait);
+	INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list);
+	ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL,
+				       "ecryptfs-kthread");
+	if (IS_ERR(ecryptfs_kthread)) {
+		rc = PTR_ERR(ecryptfs_kthread);
+		printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]"
+		       "\n", __func__, rc);
+	}
+	return rc;
+}
+
+void ecryptfs_destroy_kthread(void)
+{
+	struct ecryptfs_open_req *req, *tmp;
+
+	mutex_lock(&ecryptfs_kthread_ctl.mux);
+	ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE;
+	list_for_each_entry_safe(req, tmp, &ecryptfs_kthread_ctl.req_list,
+				 kthread_ctl_list) {
+		list_del(&req->kthread_ctl_list);
+		*req->lower_file = ERR_PTR(-EIO);
+		complete(&req->done);
+	}
+	mutex_unlock(&ecryptfs_kthread_ctl.mux);
+	kthread_stop(ecryptfs_kthread);
+	wake_up(&ecryptfs_kthread_ctl.wait);
+}
+
+/**
+ * ecryptfs_privileged_open
+ * @lower_file: Result of dentry_open by root on lower dentry
+ * @lower_dentry: Lower dentry for file to open
+ * @lower_mnt: Lower vfsmount for file to open
+ *
+ * This function gets a r/w file opened againt the lower dentry.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_privileged_open(struct file **lower_file,
+			     struct dentry *lower_dentry,
+			     struct vfsmount *lower_mnt,
+			     const struct cred *cred)
+{
+	struct ecryptfs_open_req req;
+	int flags = O_LARGEFILE;
+	int rc = 0;
+
+	init_completion(&req.done);
+	req.lower_file = lower_file;
+	req.path.dentry = lower_dentry;
+	req.path.mnt = lower_mnt;
+
+	/* Corresponding dput() and mntput() are done when the
+	 * lower file is fput() when all eCryptfs files for the inode are
+	 * released. */
+	flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR;
+	(*lower_file) = dentry_open(&req.path, flags, cred);
+	if (!IS_ERR(*lower_file))
+		goto out;
+	if ((flags & O_ACCMODE) == O_RDONLY) {
+		rc = PTR_ERR((*lower_file));
+		goto out;
+	}
+	mutex_lock(&ecryptfs_kthread_ctl.mux);
+	if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) {
+		rc = -EIO;
+		mutex_unlock(&ecryptfs_kthread_ctl.mux);
+		printk(KERN_ERR "%s: We are in the middle of shutting down; "
+		       "aborting privileged request to open lower file\n",
+			__func__);
+		goto out;
+	}
+	list_add_tail(&req.kthread_ctl_list, &ecryptfs_kthread_ctl.req_list);
+	mutex_unlock(&ecryptfs_kthread_ctl.mux);
+	wake_up(&ecryptfs_kthread_ctl.wait);
+	wait_for_completion(&req.done);
+	if (IS_ERR(*lower_file))
+		rc = PTR_ERR(*lower_file);
+out:
+	return rc;
+}
diff --git a/kernel/fs/ecryptfs/main.c b/kernel/fs/ecryptfs/main.c
new file mode 100644
index 000000000..4f4d0474b
--- /dev/null
+++ b/kernel/fs/ecryptfs/main.c
@@ -0,0 +1,906 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2007 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *              Michael C. Thompson <mcthomps@us.ibm.com>
+ *              Tyler Hicks <tyhicks@ou.edu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/skbuff.h>
+#include <linux/crypto.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/key.h>
+#include <linux/parser.h>
+#include <linux/fs_stack.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * Module parameter that defines the ecryptfs_verbosity level.
+ */
+int ecryptfs_verbosity = 0;
+
+module_param(ecryptfs_verbosity, int, 0);
+MODULE_PARM_DESC(ecryptfs_verbosity,
+		 "Initial verbosity level (0 or 1; defaults to "
+		 "0, which is Quiet)");
+
+/**
+ * Module parameter that defines the number of message buffer elements
+ */
+unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS;
+
+module_param(ecryptfs_message_buf_len, uint, 0);
+MODULE_PARM_DESC(ecryptfs_message_buf_len,
+		 "Number of message buffer elements");
+
+/**
+ * Module parameter that defines the maximum guaranteed amount of time to wait
+ * for a response from ecryptfsd.  The actual sleep time will be, more than
+ * likely, a small amount greater than this specified value, but only less if
+ * the message successfully arrives.
+ */
+signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ;
+
+module_param(ecryptfs_message_wait_timeout, long, 0);
+MODULE_PARM_DESC(ecryptfs_message_wait_timeout,
+		 "Maximum number of seconds that an operation will "
+		 "sleep while waiting for a message response from "
+		 "userspace");
+
+/**
+ * Module parameter that is an estimate of the maximum number of users
+ * that will be concurrently using eCryptfs. Set this to the right
+ * value to balance performance and memory use.
+ */
+unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS;
+
+module_param(ecryptfs_number_of_users, uint, 0);
+MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of "
+		 "concurrent users of eCryptfs");
+
+void __ecryptfs_printk(const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	if (fmt[1] == '7') { /* KERN_DEBUG */
+		if (ecryptfs_verbosity >= 1)
+			vprintk(fmt, args);
+	} else
+		vprintk(fmt, args);
+	va_end(args);
+}
+
+/**
+ * ecryptfs_init_lower_file
+ * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with
+ *                   the lower dentry and the lower mount set
+ *
+ * eCryptfs only ever keeps a single open file for every lower
+ * inode. All I/O operations to the lower inode occur through that
+ * file. When the first eCryptfs dentry that interposes with the first
+ * lower dentry for that inode is created, this function creates the
+ * lower file struct and associates it with the eCryptfs
+ * inode. When all eCryptfs files associated with the inode are released, the
+ * file is closed.
+ *
+ * The lower file will be opened with read/write permissions, if
+ * possible. Otherwise, it is opened read-only.
+ *
+ * This function does nothing if a lower file is already
+ * associated with the eCryptfs inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_init_lower_file(struct dentry *dentry,
+				    struct file **lower_file)
+{
+	const struct cred *cred = current_cred();
+	struct path *path = ecryptfs_dentry_to_lower_path(dentry);
+	int rc;
+
+	rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
+				      cred);
+	if (rc) {
+		printk(KERN_ERR "Error opening lower file "
+		       "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
+		       "rc = [%d]\n", path->dentry, path->mnt, rc);
+		(*lower_file) = NULL;
+	}
+	return rc;
+}
+
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode)
+{
+	struct ecryptfs_inode_info *inode_info;
+	int count, rc = 0;
+
+	inode_info = ecryptfs_inode_to_private(inode);
+	mutex_lock(&inode_info->lower_file_mutex);
+	count = atomic_inc_return(&inode_info->lower_file_count);
+	if (WARN_ON_ONCE(count < 1))
+		rc = -EINVAL;
+	else if (count == 1) {
+		rc = ecryptfs_init_lower_file(dentry,
+					      &inode_info->lower_file);
+		if (rc)
+			atomic_set(&inode_info->lower_file_count, 0);
+	}
+	mutex_unlock(&inode_info->lower_file_mutex);
+	return rc;
+}
+
+void ecryptfs_put_lower_file(struct inode *inode)
+{
+	struct ecryptfs_inode_info *inode_info;
+
+	inode_info = ecryptfs_inode_to_private(inode);
+	if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count,
+				      &inode_info->lower_file_mutex)) {
+		filemap_write_and_wait(inode->i_mapping);
+		fput(inode_info->lower_file);
+		inode_info->lower_file = NULL;
+		mutex_unlock(&inode_info->lower_file_mutex);
+	}
+}
+
+enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
+       ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
+       ecryptfs_opt_ecryptfs_key_bytes,
+       ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
+       ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
+       ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
+       ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
+       ecryptfs_opt_check_dev_ruid,
+       ecryptfs_opt_err };
+
+static const match_table_t tokens = {
+	{ecryptfs_opt_sig, "sig=%s"},
+	{ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
+	{ecryptfs_opt_cipher, "cipher=%s"},
+	{ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"},
+	{ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"},
+	{ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
+	{ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
+	{ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
+	{ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
+	{ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
+	{ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
+	{ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
+	{ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
+	{ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"},
+	{ecryptfs_opt_err, NULL}
+};
+
+static int ecryptfs_init_global_auth_toks(
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	struct ecryptfs_global_auth_tok *global_auth_tok;
+	struct ecryptfs_auth_tok *auth_tok;
+	int rc = 0;
+
+	list_for_each_entry(global_auth_tok,
+			    &mount_crypt_stat->global_auth_tok_list,
+			    mount_crypt_stat_list) {
+		rc = ecryptfs_keyring_auth_tok_for_sig(
+			&global_auth_tok->global_auth_tok_key, &auth_tok,
+			global_auth_tok->sig);
+		if (rc) {
+			printk(KERN_ERR "Could not find valid key in user "
+			       "session keyring for sig specified in mount "
+			       "option: [%s]\n", global_auth_tok->sig);
+			global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID;
+			goto out;
+		} else {
+			global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID;
+			up_write(&(global_auth_tok->global_auth_tok_key)->sem);
+		}
+	}
+out:
+	return rc;
+}
+
+static void ecryptfs_init_mount_crypt_stat(
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	memset((void *)mount_crypt_stat, 0,
+	       sizeof(struct ecryptfs_mount_crypt_stat));
+	INIT_LIST_HEAD(&mount_crypt_stat->global_auth_tok_list);
+	mutex_init(&mount_crypt_stat->global_auth_tok_list_mutex);
+	mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED;
+}
+
+/**
+ * ecryptfs_parse_options
+ * @sb: The ecryptfs super block
+ * @options: The options passed to the kernel
+ * @check_ruid: set to 1 if device uid should be checked against the ruid
+ *
+ * Parse mount options:
+ * debug=N 	   - ecryptfs_verbosity level for debug output
+ * sig=XXX	   - description(signature) of the key to use
+ *
+ * Returns the dentry object of the lower-level (lower/interposed)
+ * directory; We want to mount our stackable file system on top of
+ * that lower directory.
+ *
+ * The signature of the key to use must be the description of a key
+ * already in the keyring. Mounting will fail if the key can not be
+ * found.
+ *
+ * Returns zero on success; non-zero on error
+ */
+static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
+				  uid_t *check_ruid)
+{
+	char *p;
+	int rc = 0;
+	int sig_set = 0;
+	int cipher_name_set = 0;
+	int fn_cipher_name_set = 0;
+	int cipher_key_bytes;
+	int cipher_key_bytes_set = 0;
+	int fn_cipher_key_bytes;
+	int fn_cipher_key_bytes_set = 0;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&sbi->mount_crypt_stat;
+	substring_t args[MAX_OPT_ARGS];
+	int token;
+	char *sig_src;
+	char *cipher_name_dst;
+	char *cipher_name_src;
+	char *fn_cipher_name_dst;
+	char *fn_cipher_name_src;
+	char *fnek_dst;
+	char *fnek_src;
+	char *cipher_key_bytes_src;
+	char *fn_cipher_key_bytes_src;
+	u8 cipher_code;
+
+	*check_ruid = 0;
+
+	if (!options) {
+		rc = -EINVAL;
+		goto out;
+	}
+	ecryptfs_init_mount_crypt_stat(mount_crypt_stat);
+	while ((p = strsep(&options, ",")) != NULL) {
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case ecryptfs_opt_sig:
+		case ecryptfs_opt_ecryptfs_sig:
+			sig_src = args[0].from;
+			rc = ecryptfs_add_global_auth_tok(mount_crypt_stat,
+							  sig_src, 0);
+			if (rc) {
+				printk(KERN_ERR "Error attempting to register "
+				       "global sig; rc = [%d]\n", rc);
+				goto out;
+			}
+			sig_set = 1;
+			break;
+		case ecryptfs_opt_cipher:
+		case ecryptfs_opt_ecryptfs_cipher:
+			cipher_name_src = args[0].from;
+			cipher_name_dst =
+				mount_crypt_stat->
+				global_default_cipher_name;
+			strncpy(cipher_name_dst, cipher_name_src,
+				ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+			cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
+			cipher_name_set = 1;
+			break;
+		case ecryptfs_opt_ecryptfs_key_bytes:
+			cipher_key_bytes_src = args[0].from;
+			cipher_key_bytes =
+				(int)simple_strtol(cipher_key_bytes_src,
+						   &cipher_key_bytes_src, 0);
+			mount_crypt_stat->global_default_cipher_key_size =
+				cipher_key_bytes;
+			cipher_key_bytes_set = 1;
+			break;
+		case ecryptfs_opt_passthrough:
+			mount_crypt_stat->flags |=
+				ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED;
+			break;
+		case ecryptfs_opt_xattr_metadata:
+			mount_crypt_stat->flags |=
+				ECRYPTFS_XATTR_METADATA_ENABLED;
+			break;
+		case ecryptfs_opt_encrypted_view:
+			mount_crypt_stat->flags |=
+				ECRYPTFS_XATTR_METADATA_ENABLED;
+			mount_crypt_stat->flags |=
+				ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
+			break;
+		case ecryptfs_opt_fnek_sig:
+			fnek_src = args[0].from;
+			fnek_dst =
+				mount_crypt_stat->global_default_fnek_sig;
+			strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
+			mount_crypt_stat->global_default_fnek_sig[
+				ECRYPTFS_SIG_SIZE_HEX] = '\0';
+			rc = ecryptfs_add_global_auth_tok(
+				mount_crypt_stat,
+				mount_crypt_stat->global_default_fnek_sig,
+				ECRYPTFS_AUTH_TOK_FNEK);
+			if (rc) {
+				printk(KERN_ERR "Error attempting to register "
+				       "global fnek sig [%s]; rc = [%d]\n",
+				       mount_crypt_stat->global_default_fnek_sig,
+				       rc);
+				goto out;
+			}
+			mount_crypt_stat->flags |=
+				(ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
+				 | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
+			break;
+		case ecryptfs_opt_fn_cipher:
+			fn_cipher_name_src = args[0].from;
+			fn_cipher_name_dst =
+				mount_crypt_stat->global_default_fn_cipher_name;
+			strncpy(fn_cipher_name_dst, fn_cipher_name_src,
+				ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+			mount_crypt_stat->global_default_fn_cipher_name[
+				ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
+			fn_cipher_name_set = 1;
+			break;
+		case ecryptfs_opt_fn_cipher_key_bytes:
+			fn_cipher_key_bytes_src = args[0].from;
+			fn_cipher_key_bytes =
+				(int)simple_strtol(fn_cipher_key_bytes_src,
+						   &fn_cipher_key_bytes_src, 0);
+			mount_crypt_stat->global_default_fn_cipher_key_bytes =
+				fn_cipher_key_bytes;
+			fn_cipher_key_bytes_set = 1;
+			break;
+		case ecryptfs_opt_unlink_sigs:
+			mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
+			break;
+		case ecryptfs_opt_mount_auth_tok_only:
+			mount_crypt_stat->flags |=
+				ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
+			break;
+		case ecryptfs_opt_check_dev_ruid:
+			*check_ruid = 1;
+			break;
+		case ecryptfs_opt_err:
+		default:
+			printk(KERN_WARNING
+			       "%s: eCryptfs: unrecognized option [%s]\n",
+			       __func__, p);
+		}
+	}
+	if (!sig_set) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_ERR, "You must supply at least one valid "
+				"auth tok signature as a mount "
+				"parameter; see the eCryptfs README\n");
+		goto out;
+	}
+	if (!cipher_name_set) {
+		int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
+
+		BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+		strcpy(mount_crypt_stat->global_default_cipher_name,
+		       ECRYPTFS_DEFAULT_CIPHER);
+	}
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !fn_cipher_name_set)
+		strcpy(mount_crypt_stat->global_default_fn_cipher_name,
+		       mount_crypt_stat->global_default_cipher_name);
+	if (!cipher_key_bytes_set)
+		mount_crypt_stat->global_default_cipher_key_size = 0;
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !fn_cipher_key_bytes_set)
+		mount_crypt_stat->global_default_fn_cipher_key_bytes =
+			mount_crypt_stat->global_default_cipher_key_size;
+
+	cipher_code = ecryptfs_code_for_cipher_string(
+		mount_crypt_stat->global_default_cipher_name,
+		mount_crypt_stat->global_default_cipher_key_size);
+	if (!cipher_code) {
+		ecryptfs_printk(KERN_ERR,
+				"eCryptfs doesn't support cipher: %s",
+				mount_crypt_stat->global_default_cipher_name);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	mutex_lock(&key_tfm_list_mutex);
+	if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
+				 NULL)) {
+		rc = ecryptfs_add_new_key_tfm(
+			NULL, mount_crypt_stat->global_default_cipher_name,
+			mount_crypt_stat->global_default_cipher_key_size);
+		if (rc) {
+			printk(KERN_ERR "Error attempting to initialize "
+			       "cipher with name = [%s] and key size = [%td]; "
+			       "rc = [%d]\n",
+			       mount_crypt_stat->global_default_cipher_name,
+			       mount_crypt_stat->global_default_cipher_key_size,
+			       rc);
+			rc = -EINVAL;
+			mutex_unlock(&key_tfm_list_mutex);
+			goto out;
+		}
+	}
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !ecryptfs_tfm_exists(
+		    mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
+		rc = ecryptfs_add_new_key_tfm(
+			NULL, mount_crypt_stat->global_default_fn_cipher_name,
+			mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		if (rc) {
+			printk(KERN_ERR "Error attempting to initialize "
+			       "cipher with name = [%s] and key size = [%td]; "
+			       "rc = [%d]\n",
+			       mount_crypt_stat->global_default_fn_cipher_name,
+			       mount_crypt_stat->global_default_fn_cipher_key_bytes,
+			       rc);
+			rc = -EINVAL;
+			mutex_unlock(&key_tfm_list_mutex);
+			goto out;
+		}
+	}
+	mutex_unlock(&key_tfm_list_mutex);
+	rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
+	if (rc)
+		printk(KERN_WARNING "One or more global auth toks could not "
+		       "properly register; rc = [%d]\n", rc);
+out:
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_sb_info_cache;
+static struct file_system_type ecryptfs_fs_type;
+
+/**
+ * ecryptfs_get_sb
+ * @fs_type
+ * @flags
+ * @dev_name: The path to mount over
+ * @raw_data: The options passed into the kernel
+ */
+static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *raw_data)
+{
+	struct super_block *s;
+	struct ecryptfs_sb_info *sbi;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	struct ecryptfs_dentry_info *root_info;
+	const char *err = "Getting sb failed";
+	struct inode *inode;
+	struct path path;
+	uid_t check_ruid;
+	int rc;
+
+	sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
+	if (!sbi) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid);
+	if (rc) {
+		err = "Error parsing options";
+		goto out;
+	}
+	mount_crypt_stat = &sbi->mount_crypt_stat;
+
+	s = sget(fs_type, NULL, set_anon_super, flags, NULL);
+	if (IS_ERR(s)) {
+		rc = PTR_ERR(s);
+		goto out;
+	}
+
+	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
+	if (rc)
+		goto out1;
+
+	ecryptfs_set_superblock_private(s, sbi);
+	s->s_bdi = &sbi->bdi;
+
+	/* ->kill_sb() will take care of sbi after that point */
+	sbi = NULL;
+	s->s_op = &ecryptfs_sops;
+	s->s_d_op = &ecryptfs_dops;
+
+	err = "Reading sb failed";
+	rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "kern_path() failed\n");
+		goto out1;
+	}
+	if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Mount on filesystem of type "
+			"eCryptfs explicitly disallowed due to "
+			"known incompatibilities\n");
+		goto out_free;
+	}
+
+	if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) {
+		rc = -EPERM;
+		printk(KERN_ERR "Mount of device (uid: %d) not owned by "
+		       "requested user (uid: %d)\n",
+			i_uid_read(d_inode(path.dentry)),
+			from_kuid(&init_user_ns, current_uid()));
+		goto out_free;
+	}
+
+	ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
+
+	/**
+	 * Set the POSIX ACL flag based on whether they're enabled in the lower
+	 * mount.
+	 */
+	s->s_flags = flags & ~MS_POSIXACL;
+	s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL;
+
+	/**
+	 * Force a read-only eCryptfs mount when:
+	 *   1) The lower mount is ro
+	 *   2) The ecryptfs_encrypted_view mount option is specified
+	 */
+	if (path.dentry->d_sb->s_flags & MS_RDONLY ||
+	    mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
+		s->s_flags |= MS_RDONLY;
+
+	s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
+	s->s_blocksize = path.dentry->d_sb->s_blocksize;
+	s->s_magic = ECRYPTFS_SUPER_MAGIC;
+	s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
+
+	rc = -EINVAL;
+	if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+		pr_err("eCryptfs: maximum fs stacking depth exceeded\n");
+		goto out_free;
+	}
+
+	inode = ecryptfs_get_inode(d_inode(path.dentry), s);
+	rc = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_free;
+
+	s->s_root = d_make_root(inode);
+	if (!s->s_root) {
+		rc = -ENOMEM;
+		goto out_free;
+	}
+
+	rc = -ENOMEM;
+	root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+	if (!root_info)
+		goto out_free;
+
+	/* ->kill_sb() will take care of root_info */
+	ecryptfs_set_dentry_private(s->s_root, root_info);
+	root_info->lower_path = path;
+
+	s->s_flags |= MS_ACTIVE;
+	return dget(s->s_root);
+
+out_free:
+	path_put(&path);
+out1:
+	deactivate_locked_super(s);
+out:
+	if (sbi) {
+		ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
+		kmem_cache_free(ecryptfs_sb_info_cache, sbi);
+	}
+	printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
+	return ERR_PTR(rc);
+}
+
+/**
+ * ecryptfs_kill_block_super
+ * @sb: The ecryptfs super block
+ *
+ * Used to bring the superblock down and free the private data.
+ */
+static void ecryptfs_kill_block_super(struct super_block *sb)
+{
+	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
+	kill_anon_super(sb);
+	if (!sb_info)
+		return;
+	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
+	bdi_destroy(&sb_info->bdi);
+	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
+}
+
+static struct file_system_type ecryptfs_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "ecryptfs",
+	.mount = ecryptfs_mount,
+	.kill_sb = ecryptfs_kill_block_super,
+	.fs_flags = 0
+};
+MODULE_ALIAS_FS("ecryptfs");
+
+/**
+ * inode_info_init_once
+ *
+ * Initializes the ecryptfs_inode_info_cache when it is created
+ */
+static void
+inode_info_init_once(void *vptr)
+{
+	struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static struct ecryptfs_cache_info {
+	struct kmem_cache **cache;
+	const char *name;
+	size_t size;
+	void (*ctor)(void *obj);
+} ecryptfs_cache_infos[] = {
+	{
+		.cache = &ecryptfs_auth_tok_list_item_cache,
+		.name = "ecryptfs_auth_tok_list_item",
+		.size = sizeof(struct ecryptfs_auth_tok_list_item),
+	},
+	{
+		.cache = &ecryptfs_file_info_cache,
+		.name = "ecryptfs_file_cache",
+		.size = sizeof(struct ecryptfs_file_info),
+	},
+	{
+		.cache = &ecryptfs_dentry_info_cache,
+		.name = "ecryptfs_dentry_info_cache",
+		.size = sizeof(struct ecryptfs_dentry_info),
+	},
+	{
+		.cache = &ecryptfs_inode_info_cache,
+		.name = "ecryptfs_inode_cache",
+		.size = sizeof(struct ecryptfs_inode_info),
+		.ctor = inode_info_init_once,
+	},
+	{
+		.cache = &ecryptfs_sb_info_cache,
+		.name = "ecryptfs_sb_cache",
+		.size = sizeof(struct ecryptfs_sb_info),
+	},
+	{
+		.cache = &ecryptfs_header_cache,
+		.name = "ecryptfs_headers",
+		.size = PAGE_CACHE_SIZE,
+	},
+	{
+		.cache = &ecryptfs_xattr_cache,
+		.name = "ecryptfs_xattr_cache",
+		.size = PAGE_CACHE_SIZE,
+	},
+	{
+		.cache = &ecryptfs_key_record_cache,
+		.name = "ecryptfs_key_record_cache",
+		.size = sizeof(struct ecryptfs_key_record),
+	},
+	{
+		.cache = &ecryptfs_key_sig_cache,
+		.name = "ecryptfs_key_sig_cache",
+		.size = sizeof(struct ecryptfs_key_sig),
+	},
+	{
+		.cache = &ecryptfs_global_auth_tok_cache,
+		.name = "ecryptfs_global_auth_tok_cache",
+		.size = sizeof(struct ecryptfs_global_auth_tok),
+	},
+	{
+		.cache = &ecryptfs_key_tfm_cache,
+		.name = "ecryptfs_key_tfm_cache",
+		.size = sizeof(struct ecryptfs_key_tfm),
+	},
+};
+
+static void ecryptfs_free_kmem_caches(void)
+{
+	int i;
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+
+	for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
+		struct ecryptfs_cache_info *info;
+
+		info = &ecryptfs_cache_infos[i];
+		if (*(info->cache))
+			kmem_cache_destroy(*(info->cache));
+	}
+}
+
+/**
+ * ecryptfs_init_kmem_caches
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_init_kmem_caches(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
+		struct ecryptfs_cache_info *info;
+
+		info = &ecryptfs_cache_infos[i];
+		*(info->cache) = kmem_cache_create(info->name, info->size,
+				0, SLAB_HWCACHE_ALIGN, info->ctor);
+		if (!*(info->cache)) {
+			ecryptfs_free_kmem_caches();
+			ecryptfs_printk(KERN_WARNING, "%s: "
+					"kmem_cache_create failed\n",
+					info->name);
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+static struct kobject *ecryptfs_kobj;
+
+static ssize_t version_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buff)
+{
+	return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
+}
+
+static struct kobj_attribute version_attr = __ATTR_RO(version);
+
+static struct attribute *attributes[] = {
+	&version_attr.attr,
+	NULL,
+};
+
+static struct attribute_group attr_group = {
+	.attrs = attributes,
+};
+
+static int do_sysfs_registration(void)
+{
+	int rc;
+
+	ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj);
+	if (!ecryptfs_kobj) {
+		printk(KERN_ERR "Unable to create ecryptfs kset\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	rc = sysfs_create_group(ecryptfs_kobj, &attr_group);
+	if (rc) {
+		printk(KERN_ERR
+		       "Unable to create ecryptfs version attributes\n");
+		kobject_put(ecryptfs_kobj);
+	}
+out:
+	return rc;
+}
+
+static void do_sysfs_unregistration(void)
+{
+	sysfs_remove_group(ecryptfs_kobj, &attr_group);
+	kobject_put(ecryptfs_kobj);
+}
+
+static int __init ecryptfs_init(void)
+{
+	int rc;
+
+	if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
+				"larger than the host's page size, and so "
+				"eCryptfs cannot run on this system. The "
+				"default eCryptfs extent size is [%u] bytes; "
+				"the page size is [%lu] bytes.\n",
+				ECRYPTFS_DEFAULT_EXTENT_SIZE,
+				(unsigned long)PAGE_CACHE_SIZE);
+		goto out;
+	}
+	rc = ecryptfs_init_kmem_caches();
+	if (rc) {
+		printk(KERN_ERR
+		       "Failed to allocate one or more kmem_cache objects\n");
+		goto out;
+	}
+	rc = do_sysfs_registration();
+	if (rc) {
+		printk(KERN_ERR "sysfs registration failed\n");
+		goto out_free_kmem_caches;
+	}
+	rc = ecryptfs_init_kthread();
+	if (rc) {
+		printk(KERN_ERR "%s: kthread initialization failed; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_do_sysfs_unregistration;
+	}
+	rc = ecryptfs_init_messaging();
+	if (rc) {
+		printk(KERN_ERR "Failure occurred while attempting to "
+				"initialize the communications channel to "
+				"ecryptfsd\n");
+		goto out_destroy_kthread;
+	}
+	rc = ecryptfs_init_crypto();
+	if (rc) {
+		printk(KERN_ERR "Failure whilst attempting to init crypto; "
+		       "rc = [%d]\n", rc);
+		goto out_release_messaging;
+	}
+	rc = register_filesystem(&ecryptfs_fs_type);
+	if (rc) {
+		printk(KERN_ERR "Failed to register filesystem\n");
+		goto out_destroy_crypto;
+	}
+	if (ecryptfs_verbosity > 0)
+		printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values "
+			"will be written to the syslog!\n", ecryptfs_verbosity);
+
+	goto out;
+out_destroy_crypto:
+	ecryptfs_destroy_crypto();
+out_release_messaging:
+	ecryptfs_release_messaging();
+out_destroy_kthread:
+	ecryptfs_destroy_kthread();
+out_do_sysfs_unregistration:
+	do_sysfs_unregistration();
+out_free_kmem_caches:
+	ecryptfs_free_kmem_caches();
+out:
+	return rc;
+}
+
+static void __exit ecryptfs_exit(void)
+{
+	int rc;
+
+	rc = ecryptfs_destroy_crypto();
+	if (rc)
+		printk(KERN_ERR "Failure whilst attempting to destroy crypto; "
+		       "rc = [%d]\n", rc);
+	ecryptfs_release_messaging();
+	ecryptfs_destroy_kthread();
+	do_sysfs_unregistration();
+	unregister_filesystem(&ecryptfs_fs_type);
+	ecryptfs_free_kmem_caches();
+}
+
+MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>");
+MODULE_DESCRIPTION("eCryptfs");
+
+MODULE_LICENSE("GPL");
+
+module_init(ecryptfs_init)
+module_exit(ecryptfs_exit)
diff --git a/kernel/fs/ecryptfs/messaging.c b/kernel/fs/ecryptfs/messaging.c
new file mode 100644
index 000000000..286f10b03
--- /dev/null
+++ b/kernel/fs/ecryptfs/messaging.c
@@ -0,0 +1,468 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 2004-2008 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
+ *		Tyler Hicks <tyhicks@ou.edu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/nsproxy.h>
+#include "ecryptfs_kernel.h"
+
+static LIST_HEAD(ecryptfs_msg_ctx_free_list);
+static LIST_HEAD(ecryptfs_msg_ctx_alloc_list);
+static struct mutex ecryptfs_msg_ctx_lists_mux;
+
+static struct hlist_head *ecryptfs_daemon_hash;
+struct mutex ecryptfs_daemon_hash_mux;
+static int ecryptfs_hash_bits;
+#define ecryptfs_current_euid_hash(uid) \
+	hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits)
+
+static u32 ecryptfs_msg_counter;
+static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
+
+/**
+ * ecryptfs_acquire_free_msg_ctx
+ * @msg_ctx: The context that was acquired from the free list
+ *
+ * Acquires a context element from the free list and locks the mutex
+ * on the context.  Sets the msg_ctx task to current.  Returns zero on
+ * success; non-zero on error or upon failure to acquire a free
+ * context element.  Must be called with ecryptfs_msg_ctx_lists_mux
+ * held.
+ */
+static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx)
+{
+	struct list_head *p;
+	int rc;
+
+	if (list_empty(&ecryptfs_msg_ctx_free_list)) {
+		printk(KERN_WARNING "%s: The eCryptfs free "
+		       "context list is empty.  It may be helpful to "
+		       "specify the ecryptfs_message_buf_len "
+		       "parameter to be greater than the current "
+		       "value of [%d]\n", __func__, ecryptfs_message_buf_len);
+		rc = -ENOMEM;
+		goto out;
+	}
+	list_for_each(p, &ecryptfs_msg_ctx_free_list) {
+		*msg_ctx = list_entry(p, struct ecryptfs_msg_ctx, node);
+		if (mutex_trylock(&(*msg_ctx)->mux)) {
+			(*msg_ctx)->task = current;
+			rc = 0;
+			goto out;
+		}
+	}
+	rc = -ENOMEM;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_msg_ctx_free_to_alloc
+ * @msg_ctx: The context to move from the free list to the alloc list
+ *
+ * Must be called with ecryptfs_msg_ctx_lists_mux held.
+ */
+static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx)
+{
+	list_move(&msg_ctx->node, &ecryptfs_msg_ctx_alloc_list);
+	msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_PENDING;
+	msg_ctx->counter = ++ecryptfs_msg_counter;
+}
+
+/**
+ * ecryptfs_msg_ctx_alloc_to_free
+ * @msg_ctx: The context to move from the alloc list to the free list
+ *
+ * Must be called with ecryptfs_msg_ctx_lists_mux held.
+ */
+void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)
+{
+	list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list);
+	kfree(msg_ctx->msg);
+	msg_ctx->msg = NULL;
+	msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE;
+}
+
+/**
+ * ecryptfs_find_daemon_by_euid
+ * @daemon: If return value is zero, points to the desired daemon pointer
+ *
+ * Must be called with ecryptfs_daemon_hash_mux held.
+ *
+ * Search the hash list for the current effective user id.
+ *
+ * Returns zero if the user id exists in the list; non-zero otherwise.
+ */
+int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon)
+{
+	int rc;
+
+	hlist_for_each_entry(*daemon,
+			    &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()],
+			    euid_chain) {
+		if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) {
+			rc = 0;
+			goto out;
+		}
+	}
+	rc = -EINVAL;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
+ * @daemon: Pointer to set to newly allocated daemon struct
+ * @file: File used when opening /dev/ecryptfs
+ *
+ * Must be called ceremoniously while in possession of
+ * ecryptfs_sacred_daemon_hash_mux
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int
+ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file)
+{
+	int rc = 0;
+
+	(*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
+	if (!(*daemon)) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
+		       "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
+		goto out;
+	}
+	(*daemon)->file = file;
+	mutex_init(&(*daemon)->mux);
+	INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue);
+	init_waitqueue_head(&(*daemon)->wait);
+	(*daemon)->num_queued_msg_ctx = 0;
+	hlist_add_head(&(*daemon)->euid_chain,
+		       &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()]);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_exorcise_daemon - Destroy the daemon struct
+ *
+ * Must be called ceremoniously while in possession of
+ * ecryptfs_daemon_hash_mux and the daemon's own mux.
+ */
+int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
+{
+	struct ecryptfs_msg_ctx *msg_ctx, *msg_ctx_tmp;
+	int rc = 0;
+
+	mutex_lock(&daemon->mux);
+	if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ)
+	    || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) {
+		rc = -EBUSY;
+		mutex_unlock(&daemon->mux);
+		goto out;
+	}
+	list_for_each_entry_safe(msg_ctx, msg_ctx_tmp,
+				 &daemon->msg_ctx_out_queue, daemon_out_list) {
+		list_del(&msg_ctx->daemon_out_list);
+		daemon->num_queued_msg_ctx--;
+		printk(KERN_WARNING "%s: Warning: dropping message that is in "
+		       "the out queue of a dying daemon\n", __func__);
+		ecryptfs_msg_ctx_alloc_to_free(msg_ctx);
+	}
+	hlist_del(&daemon->euid_chain);
+	mutex_unlock(&daemon->mux);
+	kzfree(daemon);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_process_reponse
+ * @msg: The ecryptfs message received; the caller should sanity check
+ *       msg->data_len and free the memory
+ * @seq: The sequence number of the message; must match the sequence
+ *       number for the existing message context waiting for this
+ *       response
+ *
+ * Processes a response message after sending an operation request to
+ * userspace. Some other process is awaiting this response. Before
+ * sending out its first communications, the other process allocated a
+ * msg_ctx from the ecryptfs_msg_ctx_arr at a particular index. The
+ * response message contains this index so that we can copy over the
+ * response message into the msg_ctx that the process holds a
+ * reference to. The other process is going to wake up, check to see
+ * that msg_ctx->state == ECRYPTFS_MSG_CTX_STATE_DONE, and then
+ * proceed to read off and process the response message. Returns zero
+ * upon delivery to desired context element; non-zero upon delivery
+ * failure or error.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
+			      struct ecryptfs_message *msg, u32 seq)
+{
+	struct ecryptfs_msg_ctx *msg_ctx;
+	size_t msg_size;
+	int rc;
+
+	if (msg->index >= ecryptfs_message_buf_len) {
+		rc = -EINVAL;
+		printk(KERN_ERR "%s: Attempt to reference "
+		       "context buffer at index [%d]; maximum "
+		       "allowable is [%d]\n", __func__, msg->index,
+		       (ecryptfs_message_buf_len - 1));
+		goto out;
+	}
+	msg_ctx = &ecryptfs_msg_ctx_arr[msg->index];
+	mutex_lock(&msg_ctx->mux);
+	if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) {
+		rc = -EINVAL;
+		printk(KERN_WARNING "%s: Desired context element is not "
+		       "pending a response\n", __func__);
+		goto unlock;
+	} else if (msg_ctx->counter != seq) {
+		rc = -EINVAL;
+		printk(KERN_WARNING "%s: Invalid message sequence; "
+		       "expected [%d]; received [%d]\n", __func__,
+		       msg_ctx->counter, seq);
+		goto unlock;
+	}
+	msg_size = (sizeof(*msg) + msg->data_len);
+	msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
+	if (!msg_ctx->msg) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
+		       "GFP_KERNEL memory\n", __func__, msg_size);
+		goto unlock;
+	}
+	msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
+	wake_up_process(msg_ctx->task);
+	rc = 0;
+unlock:
+	mutex_unlock(&msg_ctx->mux);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_send_message_locked
+ * @data: The data to send
+ * @data_len: The length of data
+ * @msg_ctx: The message context allocated for the send
+ *
+ * Must be called with ecryptfs_daemon_hash_mux held.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int
+ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
+			     struct ecryptfs_msg_ctx **msg_ctx)
+{
+	struct ecryptfs_daemon *daemon;
+	int rc;
+
+	rc = ecryptfs_find_daemon_by_euid(&daemon);
+	if (rc) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	mutex_lock(&ecryptfs_msg_ctx_lists_mux);
+	rc = ecryptfs_acquire_free_msg_ctx(msg_ctx);
+	if (rc) {
+		mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
+		printk(KERN_WARNING "%s: Could not claim a free "
+		       "context element\n", __func__);
+		goto out;
+	}
+	ecryptfs_msg_ctx_free_to_alloc(*msg_ctx);
+	mutex_unlock(&(*msg_ctx)->mux);
+	mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
+	rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0,
+				   daemon);
+	if (rc)
+		printk(KERN_ERR "%s: Error attempting to send message to "
+		       "userspace daemon; rc = [%d]\n", __func__, rc);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_send_message
+ * @data: The data to send
+ * @data_len: The length of data
+ * @msg_ctx: The message context allocated for the send
+ *
+ * Grabs ecryptfs_daemon_hash_mux.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_send_message(char *data, int data_len,
+			  struct ecryptfs_msg_ctx **msg_ctx)
+{
+	int rc;
+
+	mutex_lock(&ecryptfs_daemon_hash_mux);
+	rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST,
+					  msg_ctx);
+	mutex_unlock(&ecryptfs_daemon_hash_mux);
+	return rc;
+}
+
+/**
+ * ecryptfs_wait_for_response
+ * @msg_ctx: The context that was assigned when sending a message
+ * @msg: The incoming message from userspace; not set if rc != 0
+ *
+ * Sleeps until awaken by ecryptfs_receive_message or until the amount
+ * of time exceeds ecryptfs_message_wait_timeout.  If zero is
+ * returned, msg will point to a valid message from userspace; a
+ * non-zero value is returned upon failure to receive a message or an
+ * error occurs. Callee must free @msg on success.
+ */
+int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
+			       struct ecryptfs_message **msg)
+{
+	signed long timeout = ecryptfs_message_wait_timeout * HZ;
+	int rc = 0;
+
+sleep:
+	timeout = schedule_timeout_interruptible(timeout);
+	mutex_lock(&ecryptfs_msg_ctx_lists_mux);
+	mutex_lock(&msg_ctx->mux);
+	if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_DONE) {
+		if (timeout) {
+			mutex_unlock(&msg_ctx->mux);
+			mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
+			goto sleep;
+		}
+		rc = -ENOMSG;
+	} else {
+		*msg = msg_ctx->msg;
+		msg_ctx->msg = NULL;
+	}
+	ecryptfs_msg_ctx_alloc_to_free(msg_ctx);
+	mutex_unlock(&msg_ctx->mux);
+	mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
+	return rc;
+}
+
+int __init ecryptfs_init_messaging(void)
+{
+	int i;
+	int rc = 0;
+
+	if (ecryptfs_number_of_users > ECRYPTFS_MAX_NUM_USERS) {
+		ecryptfs_number_of_users = ECRYPTFS_MAX_NUM_USERS;
+		printk(KERN_WARNING "%s: Specified number of users is "
+		       "too large, defaulting to [%d] users\n", __func__,
+		       ecryptfs_number_of_users);
+	}
+	mutex_init(&ecryptfs_daemon_hash_mux);
+	mutex_lock(&ecryptfs_daemon_hash_mux);
+	ecryptfs_hash_bits = 1;
+	while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
+		ecryptfs_hash_bits++;
+	ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
+					* (1 << ecryptfs_hash_bits)),
+				       GFP_KERNEL);
+	if (!ecryptfs_daemon_hash) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
+		mutex_unlock(&ecryptfs_daemon_hash_mux);
+		goto out;
+	}
+	for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
+		INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
+	mutex_unlock(&ecryptfs_daemon_hash_mux);
+	ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
+					* ecryptfs_message_buf_len),
+				       GFP_KERNEL);
+	if (!ecryptfs_msg_ctx_arr) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
+		goto out;
+	}
+	mutex_init(&ecryptfs_msg_ctx_lists_mux);
+	mutex_lock(&ecryptfs_msg_ctx_lists_mux);
+	ecryptfs_msg_counter = 0;
+	for (i = 0; i < ecryptfs_message_buf_len; i++) {
+		INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].node);
+		INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].daemon_out_list);
+		mutex_init(&ecryptfs_msg_ctx_arr[i].mux);
+		mutex_lock(&ecryptfs_msg_ctx_arr[i].mux);
+		ecryptfs_msg_ctx_arr[i].index = i;
+		ecryptfs_msg_ctx_arr[i].state = ECRYPTFS_MSG_CTX_STATE_FREE;
+		ecryptfs_msg_ctx_arr[i].counter = 0;
+		ecryptfs_msg_ctx_arr[i].task = NULL;
+		ecryptfs_msg_ctx_arr[i].msg = NULL;
+		list_add_tail(&ecryptfs_msg_ctx_arr[i].node,
+			      &ecryptfs_msg_ctx_free_list);
+		mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
+	}
+	mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
+	rc = ecryptfs_init_ecryptfs_miscdev();
+	if (rc)
+		ecryptfs_release_messaging();
+out:
+	return rc;
+}
+
+void ecryptfs_release_messaging(void)
+{
+	if (ecryptfs_msg_ctx_arr) {
+		int i;
+
+		mutex_lock(&ecryptfs_msg_ctx_lists_mux);
+		for (i = 0; i < ecryptfs_message_buf_len; i++) {
+			mutex_lock(&ecryptfs_msg_ctx_arr[i].mux);
+			kfree(ecryptfs_msg_ctx_arr[i].msg);
+			mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
+		}
+		kfree(ecryptfs_msg_ctx_arr);
+		mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
+	}
+	if (ecryptfs_daemon_hash) {
+		struct ecryptfs_daemon *daemon;
+		int i;
+
+		mutex_lock(&ecryptfs_daemon_hash_mux);
+		for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
+			int rc;
+
+			hlist_for_each_entry(daemon,
+					     &ecryptfs_daemon_hash[i],
+					     euid_chain) {
+				rc = ecryptfs_exorcise_daemon(daemon);
+				if (rc)
+					printk(KERN_ERR "%s: Error whilst "
+					       "attempting to destroy daemon; "
+					       "rc = [%d]. Dazed and confused, "
+					       "but trying to continue.\n",
+					       __func__, rc);
+			}
+		}
+		kfree(ecryptfs_daemon_hash);
+		mutex_unlock(&ecryptfs_daemon_hash_mux);
+	}
+	ecryptfs_destroy_ecryptfs_miscdev();
+	return;
+}
diff --git a/kernel/fs/ecryptfs/miscdev.c b/kernel/fs/ecryptfs/miscdev.c
new file mode 100644
index 000000000..e4141f257
--- /dev/null
+++ b/kernel/fs/ecryptfs/miscdev.c
@@ -0,0 +1,511 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 2008 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include "ecryptfs_kernel.h"
+
+static atomic_t ecryptfs_num_miscdev_opens;
+
+/**
+ * ecryptfs_miscdev_poll
+ * @file: dev file
+ * @pt: dev poll table (ignored)
+ *
+ * Returns the poll mask
+ */
+static unsigned int
+ecryptfs_miscdev_poll(struct file *file, poll_table *pt)
+{
+	struct ecryptfs_daemon *daemon = file->private_data;
+	unsigned int mask = 0;
+
+	mutex_lock(&daemon->mux);
+	if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
+		printk(KERN_WARNING "%s: Attempt to poll on zombified "
+		       "daemon\n", __func__);
+		goto out_unlock_daemon;
+	}
+	if (daemon->flags & ECRYPTFS_DAEMON_IN_READ)
+		goto out_unlock_daemon;
+	if (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)
+		goto out_unlock_daemon;
+	daemon->flags |= ECRYPTFS_DAEMON_IN_POLL;
+	mutex_unlock(&daemon->mux);
+	poll_wait(file, &daemon->wait, pt);
+	mutex_lock(&daemon->mux);
+	if (!list_empty(&daemon->msg_ctx_out_queue))
+		mask |= POLLIN | POLLRDNORM;
+out_unlock_daemon:
+	daemon->flags &= ~ECRYPTFS_DAEMON_IN_POLL;
+	mutex_unlock(&daemon->mux);
+	return mask;
+}
+
+/**
+ * ecryptfs_miscdev_open
+ * @inode: inode of miscdev handle (ignored)
+ * @file: file for miscdev handle
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int
+ecryptfs_miscdev_open(struct inode *inode, struct file *file)
+{
+	struct ecryptfs_daemon *daemon = NULL;
+	int rc;
+
+	mutex_lock(&ecryptfs_daemon_hash_mux);
+	rc = ecryptfs_find_daemon_by_euid(&daemon);
+	if (!rc) {
+		rc = -EINVAL;
+		goto out_unlock_daemon_list;
+	}
+	rc = ecryptfs_spawn_daemon(&daemon, file);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to spawn daemon; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_unlock_daemon_list;
+	}
+	mutex_lock(&daemon->mux);
+	if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) {
+		rc = -EBUSY;
+		goto out_unlock_daemon;
+	}
+	daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN;
+	file->private_data = daemon;
+	atomic_inc(&ecryptfs_num_miscdev_opens);
+out_unlock_daemon:
+	mutex_unlock(&daemon->mux);
+out_unlock_daemon_list:
+	mutex_unlock(&ecryptfs_daemon_hash_mux);
+	return rc;
+}
+
+/**
+ * ecryptfs_miscdev_release
+ * @inode: inode of fs/ecryptfs/euid handle (ignored)
+ * @file: file for fs/ecryptfs/euid handle
+ *
+ * This keeps the daemon registered until the daemon sends another
+ * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int
+ecryptfs_miscdev_release(struct inode *inode, struct file *file)
+{
+	struct ecryptfs_daemon *daemon = file->private_data;
+	int rc;
+
+	mutex_lock(&daemon->mux);
+	BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN));
+	daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN;
+	atomic_dec(&ecryptfs_num_miscdev_opens);
+	mutex_unlock(&daemon->mux);
+
+	mutex_lock(&ecryptfs_daemon_hash_mux);
+	rc = ecryptfs_exorcise_daemon(daemon);
+	mutex_unlock(&ecryptfs_daemon_hash_mux);
+	if (rc) {
+		printk(KERN_CRIT "%s: Fatal error whilst attempting to "
+		       "shut down daemon; rc = [%d]. Please report this "
+		       "bug.\n", __func__, rc);
+		BUG();
+	}
+	return rc;
+}
+
+/**
+ * ecryptfs_send_miscdev
+ * @data: Data to send to daemon; may be NULL
+ * @data_size: Amount of data to send to daemon
+ * @msg_ctx: Message context, which is used to handle the reply. If
+ *           this is NULL, then we do not expect a reply.
+ * @msg_type: Type of message
+ * @msg_flags: Flags for message
+ * @daemon: eCryptfs daemon object
+ *
+ * Add msg_ctx to queue and then, if it exists, notify the blocked
+ * miscdevess about the data being available. Must be called with
+ * ecryptfs_daemon_hash_mux held.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_send_miscdev(char *data, size_t data_size,
+			  struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
+			  u16 msg_flags, struct ecryptfs_daemon *daemon)
+{
+	struct ecryptfs_message *msg;
+
+	msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL);
+	if (!msg) {
+		printk(KERN_ERR "%s: Out of memory whilst attempting "
+		       "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
+		       (sizeof(*msg) + data_size));
+		return -ENOMEM;
+	}
+
+	mutex_lock(&msg_ctx->mux);
+	msg_ctx->msg = msg;
+	msg_ctx->msg->index = msg_ctx->index;
+	msg_ctx->msg->data_len = data_size;
+	msg_ctx->type = msg_type;
+	memcpy(msg_ctx->msg->data, data, data_size);
+	msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
+	list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue);
+	mutex_unlock(&msg_ctx->mux);
+
+	mutex_lock(&daemon->mux);
+	daemon->num_queued_msg_ctx++;
+	wake_up_interruptible(&daemon->wait);
+	mutex_unlock(&daemon->mux);
+
+	return 0;
+}
+
+/*
+ * miscdevfs packet format:
+ *  Octet 0: Type
+ *  Octets 1-4: network byte order msg_ctx->counter
+ *  Octets 5-N0: Size of struct ecryptfs_message to follow
+ *  Octets N0-N1: struct ecryptfs_message (including data)
+ *
+ *  Octets 5-N1 not written if the packet type does not include a message
+ */
+#define PKT_TYPE_SIZE		1
+#define PKT_CTR_SIZE		4
+#define MIN_NON_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE)
+#define MIN_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE \
+				 + ECRYPTFS_MIN_PKT_LEN_SIZE)
+/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */
+#define MAX_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE \
+				 + ECRYPTFS_MAX_PKT_LEN_SIZE \
+				 + sizeof(struct ecryptfs_message) \
+				 + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)
+#define PKT_TYPE_OFFSET		0
+#define PKT_CTR_OFFSET		PKT_TYPE_SIZE
+#define PKT_LEN_OFFSET		(PKT_TYPE_SIZE + PKT_CTR_SIZE)
+
+/**
+ * ecryptfs_miscdev_read - format and send message from queue
+ * @file: miscdevfs handle
+ * @buf: User buffer into which to copy the next message on the daemon queue
+ * @count: Amount of space available in @buf
+ * @ppos: Offset in file (ignored)
+ *
+ * Pulls the most recent message from the daemon queue, formats it for
+ * being sent via a miscdevfs handle, and copies it into @buf
+ *
+ * Returns the number of bytes copied into the user buffer
+ */
+static ssize_t
+ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
+		      loff_t *ppos)
+{
+	struct ecryptfs_daemon *daemon = file->private_data;
+	struct ecryptfs_msg_ctx *msg_ctx;
+	size_t packet_length_size;
+	char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
+	size_t i;
+	size_t total_length;
+	int rc;
+
+	mutex_lock(&daemon->mux);
+	if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
+		rc = 0;
+		printk(KERN_WARNING "%s: Attempt to read from zombified "
+		       "daemon\n", __func__);
+		goto out_unlock_daemon;
+	}
+	if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) {
+		rc = 0;
+		goto out_unlock_daemon;
+	}
+	/* This daemon will not go away so long as this flag is set */
+	daemon->flags |= ECRYPTFS_DAEMON_IN_READ;
+check_list:
+	if (list_empty(&daemon->msg_ctx_out_queue)) {
+		mutex_unlock(&daemon->mux);
+		rc = wait_event_interruptible(
+			daemon->wait, !list_empty(&daemon->msg_ctx_out_queue));
+		mutex_lock(&daemon->mux);
+		if (rc < 0) {
+			rc = 0;
+			goto out_unlock_daemon;
+		}
+	}
+	if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
+		rc = 0;
+		goto out_unlock_daemon;
+	}
+	if (list_empty(&daemon->msg_ctx_out_queue)) {
+		/* Something else jumped in since the
+		 * wait_event_interruptable() and removed the
+		 * message from the queue; try again */
+		goto check_list;
+	}
+	msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue,
+				   struct ecryptfs_msg_ctx, daemon_out_list);
+	BUG_ON(!msg_ctx);
+	mutex_lock(&msg_ctx->mux);
+	if (msg_ctx->msg) {
+		rc = ecryptfs_write_packet_length(packet_length,
+						  msg_ctx->msg_size,
+						  &packet_length_size);
+		if (rc) {
+			rc = 0;
+			printk(KERN_WARNING "%s: Error writing packet length; "
+			       "rc = [%d]\n", __func__, rc);
+			goto out_unlock_msg_ctx;
+		}
+	} else {
+		packet_length_size = 0;
+		msg_ctx->msg_size = 0;
+	}
+	total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size
+			+ msg_ctx->msg_size);
+	if (count < total_length) {
+		rc = 0;
+		printk(KERN_WARNING "%s: Only given user buffer of "
+		       "size [%zd], but we need [%zd] to read the "
+		       "pending message\n", __func__, count, total_length);
+		goto out_unlock_msg_ctx;
+	}
+	rc = -EFAULT;
+	if (put_user(msg_ctx->type, buf))
+		goto out_unlock_msg_ctx;
+	if (put_user(cpu_to_be32(msg_ctx->counter),
+		     (__be32 __user *)(&buf[PKT_CTR_OFFSET])))
+		goto out_unlock_msg_ctx;
+	i = PKT_TYPE_SIZE + PKT_CTR_SIZE;
+	if (msg_ctx->msg) {
+		if (copy_to_user(&buf[i], packet_length, packet_length_size))
+			goto out_unlock_msg_ctx;
+		i += packet_length_size;
+		if (copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size))
+			goto out_unlock_msg_ctx;
+		i += msg_ctx->msg_size;
+	}
+	rc = i;
+	list_del(&msg_ctx->daemon_out_list);
+	kfree(msg_ctx->msg);
+	msg_ctx->msg = NULL;
+	/* We do not expect a reply from the userspace daemon for any
+	 * message type other than ECRYPTFS_MSG_REQUEST */
+	if (msg_ctx->type != ECRYPTFS_MSG_REQUEST)
+		ecryptfs_msg_ctx_alloc_to_free(msg_ctx);
+out_unlock_msg_ctx:
+	mutex_unlock(&msg_ctx->mux);
+out_unlock_daemon:
+	daemon->flags &= ~ECRYPTFS_DAEMON_IN_READ;
+	mutex_unlock(&daemon->mux);
+	return rc;
+}
+
+/**
+ * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon
+ * @data: Bytes comprising struct ecryptfs_message
+ * @data_size: sizeof(struct ecryptfs_message) + data len
+ * @seq: Sequence number for miscdev response packet
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_miscdev_response(struct ecryptfs_daemon *daemon, char *data,
+				     size_t data_size, u32 seq)
+{
+	struct ecryptfs_message *msg = (struct ecryptfs_message *)data;
+	int rc;
+
+	if ((sizeof(*msg) + msg->data_len) != data_size) {
+		printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
+		       "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
+		       (sizeof(*msg) + msg->data_len), data_size);
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = ecryptfs_process_response(daemon, msg, seq);
+	if (rc)
+		printk(KERN_ERR
+		       "Error processing response message; rc = [%d]\n", rc);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_miscdev_write - handle write to daemon miscdev handle
+ * @file: File for misc dev handle
+ * @buf: Buffer containing user data
+ * @count: Amount of data in @buf
+ * @ppos: Pointer to offset in file (ignored)
+ *
+ * Returns the number of bytes read from @buf
+ */
+static ssize_t
+ecryptfs_miscdev_write(struct file *file, const char __user *buf,
+		       size_t count, loff_t *ppos)
+{
+	__be32 counter_nbo;
+	u32 seq;
+	size_t packet_size, packet_size_length;
+	char *data;
+	unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
+	ssize_t rc;
+
+	if (count == 0) {
+		return 0;
+	} else if (count == MIN_NON_MSG_PKT_SIZE) {
+		/* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
+		goto memdup;
+	} else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) {
+		printk(KERN_WARNING "%s: Acceptable packet size range is "
+		       "[%d-%zu], but amount of data written is [%zu].",
+		       __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET],
+			   sizeof(packet_size_peek))) {
+		printk(KERN_WARNING "%s: Error while inspecting packet size\n",
+		       __func__);
+		return -EFAULT;
+	}
+
+	rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size,
+					  &packet_size_length);
+	if (rc) {
+		printk(KERN_WARNING "%s: Error parsing packet length; "
+		       "rc = [%zd]\n", __func__, rc);
+		return rc;
+	}
+
+	if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size)
+	    != count) {
+		printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__,
+		       packet_size);
+		return -EINVAL;
+	}
+
+memdup:
+	data = memdup_user(buf, count);
+	if (IS_ERR(data)) {
+		printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
+		       __func__, PTR_ERR(data));
+		return PTR_ERR(data);
+	}
+	switch (data[PKT_TYPE_OFFSET]) {
+	case ECRYPTFS_MSG_RESPONSE:
+		if (count < (MIN_MSG_PKT_SIZE
+			     + sizeof(struct ecryptfs_message))) {
+			printk(KERN_WARNING "%s: Minimum acceptable packet "
+			       "size is [%zd], but amount of data written is "
+			       "only [%zd]. Discarding response packet.\n",
+			       __func__,
+			       (MIN_MSG_PKT_SIZE
+				+ sizeof(struct ecryptfs_message)), count);
+			rc = -EINVAL;
+			goto out_free;
+		}
+		memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
+		seq = be32_to_cpu(counter_nbo);
+		rc = ecryptfs_miscdev_response(file->private_data,
+				&data[PKT_LEN_OFFSET + packet_size_length],
+				packet_size, seq);
+		if (rc) {
+			printk(KERN_WARNING "%s: Failed to deliver miscdev "
+			       "response to requesting operation; rc = [%zd]\n",
+			       __func__, rc);
+			goto out_free;
+		}
+		break;
+	case ECRYPTFS_MSG_HELO:
+	case ECRYPTFS_MSG_QUIT:
+		break;
+	default:
+		ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
+				"message of unrecognized type [%d]\n",
+				data[0]);
+		rc = -EINVAL;
+		goto out_free;
+	}
+	rc = count;
+out_free:
+	kfree(data);
+	return rc;
+}
+
+
+static const struct file_operations ecryptfs_miscdev_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ecryptfs_miscdev_open,
+	.poll    = ecryptfs_miscdev_poll,
+	.read    = ecryptfs_miscdev_read,
+	.write   = ecryptfs_miscdev_write,
+	.release = ecryptfs_miscdev_release,
+	.llseek  = noop_llseek,
+};
+
+static struct miscdevice ecryptfs_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name  = "ecryptfs",
+	.fops  = &ecryptfs_miscdev_fops
+};
+
+/**
+ * ecryptfs_init_ecryptfs_miscdev
+ *
+ * Messages sent to the userspace daemon from the kernel are placed on
+ * a queue associated with the daemon. The next read against the
+ * miscdev handle by that daemon will return the oldest message placed
+ * on the message queue for the daemon.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int __init ecryptfs_init_ecryptfs_miscdev(void)
+{
+	int rc;
+
+	atomic_set(&ecryptfs_num_miscdev_opens, 0);
+	rc = misc_register(&ecryptfs_miscdev);
+	if (rc)
+		printk(KERN_ERR "%s: Failed to register miscellaneous device "
+		       "for communications with userspace daemons; rc = [%d]\n",
+		       __func__, rc);
+	return rc;
+}
+
+/**
+ * ecryptfs_destroy_ecryptfs_miscdev
+ *
+ * All of the daemons must be exorcised prior to calling this
+ * function.
+ */
+void ecryptfs_destroy_ecryptfs_miscdev(void)
+{
+	BUG_ON(atomic_read(&ecryptfs_num_miscdev_opens) != 0);
+	misc_deregister(&ecryptfs_miscdev);
+}
diff --git a/kernel/fs/ecryptfs/mmap.c b/kernel/fs/ecryptfs/mmap.c
new file mode 100644
index 000000000..cf2085229
--- /dev/null
+++ b/kernel/fs/ecryptfs/mmap.c
@@ -0,0 +1,561 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * This is where eCryptfs coordinates the symmetric encryption and
+ * decryption of the file data as it passes between the lower
+ * encrypted file and the upper decrypted file.
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2007 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/page-flags.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_get_locked_page
+ *
+ * Get one page from cache or lower f/s, return error otherwise.
+ *
+ * Returns locked and up-to-date page (if ok), with increased
+ * refcnt.
+ */
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
+{
+	struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
+	if (!IS_ERR(page))
+		lock_page(page);
+	return page;
+}
+
+/**
+ * ecryptfs_writepage
+ * @page: Page that is locked before this call is made
+ *
+ * Returns zero on success; non-zero otherwise
+ *
+ * This is where we encrypt the data and pass the encrypted data to
+ * the lower filesystem.  In OpenPGP-compatible mode, we operate on
+ * entire underlying packets.
+ */
+static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int rc;
+
+	rc = ecryptfs_encrypt_page(page);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error encrypting "
+				"page (upper index [0x%.16lx])\n", page->index);
+		ClearPageUptodate(page);
+		goto out;
+	}
+	SetPageUptodate(page);
+out:
+	unlock_page(page);
+	return rc;
+}
+
+static void strip_xattr_flag(char *page_virt,
+			     struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
+		size_t written;
+
+		crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
+		ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
+						&written);
+		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+	}
+}
+
+/**
+ *   Header Extent:
+ *     Octets 0-7:        Unencrypted file size (big-endian)
+ *     Octets 8-15:       eCryptfs special marker
+ *     Octets 16-19:      Flags
+ *      Octet 16:         File format version number (between 0 and 255)
+ *      Octets 17-18:     Reserved
+ *      Octet 19:         Bit 1 (lsb): Reserved
+ *                        Bit 2: Encrypted?
+ *                        Bits 3-8: Reserved
+ *     Octets 20-23:      Header extent size (big-endian)
+ *     Octets 24-25:      Number of header extents at front of file
+ *                        (big-endian)
+ *     Octet  26:         Begin RFC 2440 authentication token packet set
+ */
+
+/**
+ * ecryptfs_copy_up_encrypted_with_header
+ * @page: Sort of a ``virtual'' representation of the encrypted lower
+ *        file. The actual lower file does not have the metadata in
+ *        the header. This is locked.
+ * @crypt_stat: The eCryptfs inode's cryptographic context
+ *
+ * The ``view'' is the version of the file that userspace winds up
+ * seeing, with the header information inserted.
+ */
+static int
+ecryptfs_copy_up_encrypted_with_header(struct page *page,
+				       struct ecryptfs_crypt_stat *crypt_stat)
+{
+	loff_t extent_num_in_page = 0;
+	loff_t num_extents_per_page = (PAGE_CACHE_SIZE
+				       / crypt_stat->extent_size);
+	int rc = 0;
+
+	while (extent_num_in_page < num_extents_per_page) {
+		loff_t view_extent_num = ((((loff_t)page->index)
+					   * num_extents_per_page)
+					  + extent_num_in_page);
+		size_t num_header_extents_at_front =
+			(crypt_stat->metadata_size / crypt_stat->extent_size);
+
+		if (view_extent_num < num_header_extents_at_front) {
+			/* This is a header extent */
+			char *page_virt;
+
+			page_virt = kmap_atomic(page);
+			memset(page_virt, 0, PAGE_CACHE_SIZE);
+			/* TODO: Support more than one header extent */
+			if (view_extent_num == 0) {
+				size_t written;
+
+				rc = ecryptfs_read_xattr_region(
+					page_virt, page->mapping->host);
+				strip_xattr_flag(page_virt + 16, crypt_stat);
+				ecryptfs_write_header_metadata(page_virt + 20,
+							       crypt_stat,
+							       &written);
+			}
+			kunmap_atomic(page_virt);
+			flush_dcache_page(page);
+			if (rc) {
+				printk(KERN_ERR "%s: Error reading xattr "
+				       "region; rc = [%d]\n", __func__, rc);
+				goto out;
+			}
+		} else {
+			/* This is an encrypted data extent */
+			loff_t lower_offset =
+				((view_extent_num * crypt_stat->extent_size)
+				 - crypt_stat->metadata_size);
+
+			rc = ecryptfs_read_lower_page_segment(
+				page, (lower_offset >> PAGE_CACHE_SHIFT),
+				(lower_offset & ~PAGE_CACHE_MASK),
+				crypt_stat->extent_size, page->mapping->host);
+			if (rc) {
+				printk(KERN_ERR "%s: Error attempting to read "
+				       "extent at offset [%lld] in the lower "
+				       "file; rc = [%d]\n", __func__,
+				       lower_offset, rc);
+				goto out;
+			}
+		}
+		extent_num_in_page++;
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_readpage
+ * @file: An eCryptfs file
+ * @page: Page from eCryptfs inode mapping into which to stick the read data
+ *
+ * Read in a page, decrypting if necessary.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int ecryptfs_readpage(struct file *file, struct page *page)
+{
+	struct ecryptfs_crypt_stat *crypt_stat =
+		&ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
+	int rc = 0;
+
+	if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+		rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
+						      PAGE_CACHE_SIZE,
+						      page->mapping->host);
+	} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
+		if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
+			rc = ecryptfs_copy_up_encrypted_with_header(page,
+								    crypt_stat);
+			if (rc) {
+				printk(KERN_ERR "%s: Error attempting to copy "
+				       "the encrypted content from the lower "
+				       "file whilst inserting the metadata "
+				       "from the xattr into the header; rc = "
+				       "[%d]\n", __func__, rc);
+				goto out;
+			}
+
+		} else {
+			rc = ecryptfs_read_lower_page_segment(
+				page, page->index, 0, PAGE_CACHE_SIZE,
+				page->mapping->host);
+			if (rc) {
+				printk(KERN_ERR "Error reading page; rc = "
+				       "[%d]\n", rc);
+				goto out;
+			}
+		}
+	} else {
+		rc = ecryptfs_decrypt_page(page);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error decrypting page; "
+					"rc = [%d]\n", rc);
+			goto out;
+		}
+	}
+out:
+	if (rc)
+		ClearPageUptodate(page);
+	else
+		SetPageUptodate(page);
+	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
+			page->index);
+	unlock_page(page);
+	return rc;
+}
+
+/**
+ * Called with lower inode mutex held.
+ */
+static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
+{
+	struct inode *inode = page->mapping->host;
+	int end_byte_in_page;
+
+	if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index)
+		goto out;
+	end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
+	if (to > end_byte_in_page)
+		end_byte_in_page = to;
+	zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE);
+out:
+	return 0;
+}
+
+/**
+ * ecryptfs_write_begin
+ * @file: The eCryptfs file
+ * @mapping: The eCryptfs object
+ * @pos: The file offset at which to start writing
+ * @len: Length of the write
+ * @flags: Various flags
+ * @pagep: Pointer to return the page
+ * @fsdata: Pointer to return fs data (unused)
+ *
+ * This function must zero any hole we create
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_write_begin(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct page *page;
+	loff_t prev_page_end_size;
+	int rc = 0;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
+	if (!PageUptodate(page)) {
+		struct ecryptfs_crypt_stat *crypt_stat =
+			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
+
+		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+			rc = ecryptfs_read_lower_page_segment(
+				page, index, 0, PAGE_CACHE_SIZE, mapping->host);
+			if (rc) {
+				printk(KERN_ERR "%s: Error attemping to read "
+				       "lower page segment; rc = [%d]\n",
+				       __func__, rc);
+				ClearPageUptodate(page);
+				goto out;
+			} else
+				SetPageUptodate(page);
+		} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
+			if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
+				rc = ecryptfs_copy_up_encrypted_with_header(
+					page, crypt_stat);
+				if (rc) {
+					printk(KERN_ERR "%s: Error attempting "
+					       "to copy the encrypted content "
+					       "from the lower file whilst "
+					       "inserting the metadata from "
+					       "the xattr into the header; rc "
+					       "= [%d]\n", __func__, rc);
+					ClearPageUptodate(page);
+					goto out;
+				}
+				SetPageUptodate(page);
+			} else {
+				rc = ecryptfs_read_lower_page_segment(
+					page, index, 0, PAGE_CACHE_SIZE,
+					mapping->host);
+				if (rc) {
+					printk(KERN_ERR "%s: Error reading "
+					       "page; rc = [%d]\n",
+					       __func__, rc);
+					ClearPageUptodate(page);
+					goto out;
+				}
+				SetPageUptodate(page);
+			}
+		} else {
+			if (prev_page_end_size
+			    >= i_size_read(page->mapping->host)) {
+				zero_user(page, 0, PAGE_CACHE_SIZE);
+				SetPageUptodate(page);
+			} else if (len < PAGE_CACHE_SIZE) {
+				rc = ecryptfs_decrypt_page(page);
+				if (rc) {
+					printk(KERN_ERR "%s: Error decrypting "
+					       "page at index [%ld]; "
+					       "rc = [%d]\n",
+					       __func__, page->index, rc);
+					ClearPageUptodate(page);
+					goto out;
+				}
+				SetPageUptodate(page);
+			}
+		}
+	}
+	/* If creating a page or more of holes, zero them out via truncate.
+	 * Note, this will increase i_size. */
+	if (index != 0) {
+		if (prev_page_end_size > i_size_read(page->mapping->host)) {
+			rc = ecryptfs_truncate(file->f_path.dentry,
+					       prev_page_end_size);
+			if (rc) {
+				printk(KERN_ERR "%s: Error on attempt to "
+				       "truncate to (higher) offset [%lld];"
+				       " rc = [%d]\n", __func__,
+				       prev_page_end_size, rc);
+				goto out;
+			}
+		}
+	}
+	/* Writing to a new page, and creating a small hole from start
+	 * of page?  Zero it out. */
+	if ((i_size_read(mapping->host) == prev_page_end_size)
+	    && (pos != 0))
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+out:
+	if (unlikely(rc)) {
+		unlock_page(page);
+		page_cache_release(page);
+		*pagep = NULL;
+	}
+	return rc;
+}
+
+/**
+ * ecryptfs_write_inode_size_to_header
+ *
+ * Writes the lower file size to the first 8 bytes of the header.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
+{
+	char *file_size_virt;
+	int rc;
+
+	file_size_virt = kmalloc(sizeof(u64), GFP_KERNEL);
+	if (!file_size_virt) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt);
+	rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
+				  sizeof(u64));
+	kfree(file_size_virt);
+	if (rc < 0)
+		printk(KERN_ERR "%s: Error writing file size to header; "
+		       "rc = [%d]\n", __func__, rc);
+	else
+		rc = 0;
+out:
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_xattr_cache;
+
+static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
+{
+	ssize_t size;
+	void *xattr_virt;
+	struct dentry *lower_dentry =
+		ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry;
+	struct inode *lower_inode = d_inode(lower_dentry);
+	int rc;
+
+	if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) {
+		printk(KERN_WARNING
+		       "No support for setting xattr in lower filesystem\n");
+		rc = -ENOSYS;
+		goto out;
+	}
+	xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL);
+	if (!xattr_virt) {
+		printk(KERN_ERR "Out of memory whilst attempting to write "
+		       "inode size to xattr\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	mutex_lock(&lower_inode->i_mutex);
+	size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
+					   xattr_virt, PAGE_CACHE_SIZE);
+	if (size < 0)
+		size = 8;
+	put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
+	rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
+					 xattr_virt, size, 0);
+	mutex_unlock(&lower_inode->i_mutex);
+	if (rc)
+		printk(KERN_ERR "Error whilst attempting to write inode size "
+		       "to lower file xattr; rc = [%d]\n", rc);
+	kmem_cache_free(ecryptfs_xattr_cache, xattr_virt);
+out:
+	return rc;
+}
+
+int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
+{
+	struct ecryptfs_crypt_stat *crypt_stat;
+
+	crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+		return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode);
+	else
+		return ecryptfs_write_inode_size_to_header(ecryptfs_inode);
+}
+
+/**
+ * ecryptfs_write_end
+ * @file: The eCryptfs file object
+ * @mapping: The eCryptfs object
+ * @pos: The file position
+ * @len: The length of the data (unused)
+ * @copied: The amount of data copied
+ * @page: The eCryptfs page
+ * @fsdata: The fsdata (unused)
+ */
+static int ecryptfs_write_end(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + copied;
+	struct inode *ecryptfs_inode = mapping->host;
+	struct ecryptfs_crypt_stat *crypt_stat =
+		&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	int rc;
+
+	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
+			"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
+	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
+						       to);
+		if (!rc) {
+			rc = copied;
+			fsstack_copy_inode_size(ecryptfs_inode,
+				ecryptfs_inode_to_lower(ecryptfs_inode));
+		}
+		goto out;
+	}
+	if (!PageUptodate(page)) {
+		if (copied < PAGE_CACHE_SIZE) {
+			rc = 0;
+			goto out;
+		}
+		SetPageUptodate(page);
+	}
+	/* Fills in zeros if 'to' goes beyond inode size */
+	rc = fill_zeros_to_end_of_page(page, to);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
+			"zeros in page with index = [0x%.16lx]\n", index);
+		goto out;
+	}
+	rc = ecryptfs_encrypt_page(page);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
+				"index [0x%.16lx])\n", index);
+		goto out;
+	}
+	if (pos + copied > i_size_read(ecryptfs_inode)) {
+		i_size_write(ecryptfs_inode, pos + copied);
+		ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
+			"[0x%.16llx]\n",
+			(unsigned long long)i_size_read(ecryptfs_inode));
+	}
+	rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
+	if (rc)
+		printk(KERN_ERR "Error writing inode size to metadata; "
+		       "rc = [%d]\n", rc);
+	else
+		rc = copied;
+out:
+	unlock_page(page);
+	page_cache_release(page);
+	return rc;
+}
+
+static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
+{
+	int rc = 0;
+	struct inode *inode;
+	struct inode *lower_inode;
+
+	inode = (struct inode *)mapping->host;
+	lower_inode = ecryptfs_inode_to_lower(inode);
+	if (lower_inode->i_mapping->a_ops->bmap)
+		rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping,
+							 block);
+	return rc;
+}
+
+const struct address_space_operations ecryptfs_aops = {
+	.writepage = ecryptfs_writepage,
+	.readpage = ecryptfs_readpage,
+	.write_begin = ecryptfs_write_begin,
+	.write_end = ecryptfs_write_end,
+	.bmap = ecryptfs_bmap,
+};
diff --git a/kernel/fs/ecryptfs/read_write.c b/kernel/fs/ecryptfs/read_write.c
new file mode 100644
index 000000000..09fe62227
--- /dev/null
+++ b/kernel/fs/ecryptfs/read_write.c
@@ -0,0 +1,273 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 2007 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_write_lower
+ * @ecryptfs_inode: The eCryptfs inode
+ * @data: Data to write
+ * @offset: Byte offset in the lower file to which to write the data
+ * @size: Number of bytes from @data to write at @offset in the lower
+ *        file
+ *
+ * Write data to the lower file.
+ *
+ * Returns bytes written on success; less than zero on error
+ */
+int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
+			 loff_t offset, size_t size)
+{
+	struct file *lower_file;
+	ssize_t rc;
+
+	lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
+	if (!lower_file)
+		return -EIO;
+	rc = kernel_write(lower_file, data, size, offset);
+	mark_inode_dirty_sync(ecryptfs_inode);
+	return rc;
+}
+
+/**
+ * ecryptfs_write_lower_page_segment
+ * @ecryptfs_inode: The eCryptfs inode
+ * @page_for_lower: The page containing the data to be written to the
+ *                  lower file
+ * @offset_in_page: The offset in the @page_for_lower from which to
+ *                  start writing the data
+ * @size: The amount of data from @page_for_lower to write to the
+ *        lower file
+ *
+ * Determines the byte offset in the file for the given page and
+ * offset within the page, maps the page, and makes the call to write
+ * the contents of @page_for_lower to the lower inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
+				      struct page *page_for_lower,
+				      size_t offset_in_page, size_t size)
+{
+	char *virt;
+	loff_t offset;
+	int rc;
+
+	offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT)
+		  + offset_in_page);
+	virt = kmap(page_for_lower);
+	rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
+	if (rc > 0)
+		rc = 0;
+	kunmap(page_for_lower);
+	return rc;
+}
+
+/**
+ * ecryptfs_write
+ * @ecryptfs_inode: The eCryptfs file into which to write
+ * @data: Virtual address where data to write is located
+ * @offset: Offset in the eCryptfs file at which to begin writing the
+ *          data from @data
+ * @size: The number of bytes to write from @data
+ *
+ * Write an arbitrary amount of data to an arbitrary location in the
+ * eCryptfs inode page cache. This is done on a page-by-page, and then
+ * by an extent-by-extent, basis; individual extents are encrypted and
+ * written to the lower page cache (via VFS writes). This function
+ * takes care of all the address translation to locations in the lower
+ * filesystem; it also handles truncate events, writing out zeros
+ * where necessary.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
+		   size_t size)
+{
+	struct page *ecryptfs_page;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	char *ecryptfs_page_virt;
+	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
+	loff_t data_offset = 0;
+	loff_t pos;
+	int rc = 0;
+
+	crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	/*
+	 * if we are writing beyond current size, then start pos
+	 * at the current size - we'll fill in zeros from there.
+	 */
+	if (offset > ecryptfs_file_size)
+		pos = ecryptfs_file_size;
+	else
+		pos = offset;
+	while (pos < (offset + size)) {
+		pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
+		size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
+		size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
+		loff_t total_remaining_bytes = ((offset + size) - pos);
+
+		if (fatal_signal_pending(current)) {
+			rc = -EINTR;
+			break;
+		}
+
+		if (num_bytes > total_remaining_bytes)
+			num_bytes = total_remaining_bytes;
+		if (pos < offset) {
+			/* remaining zeros to write, up to destination offset */
+			loff_t total_remaining_zeros = (offset - pos);
+
+			if (num_bytes > total_remaining_zeros)
+				num_bytes = total_remaining_zeros;
+		}
+		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
+							 ecryptfs_page_idx);
+		if (IS_ERR(ecryptfs_page)) {
+			rc = PTR_ERR(ecryptfs_page);
+			printk(KERN_ERR "%s: Error getting page at "
+			       "index [%ld] from eCryptfs inode "
+			       "mapping; rc = [%d]\n", __func__,
+			       ecryptfs_page_idx, rc);
+			goto out;
+		}
+		ecryptfs_page_virt = kmap_atomic(ecryptfs_page);
+
+		/*
+		 * pos: where we're now writing, offset: where the request was
+		 * If current pos is before request, we are filling zeros
+		 * If we are at or beyond request, we are writing the *data*
+		 * If we're in a fresh page beyond eof, zero it in either case
+		 */
+		if (pos < offset || !start_offset_in_page) {
+			/* We are extending past the previous end of the file.
+			 * Fill in zero values to the end of the page */
+			memset(((char *)ecryptfs_page_virt
+				+ start_offset_in_page), 0,
+				PAGE_CACHE_SIZE - start_offset_in_page);
+		}
+
+		/* pos >= offset, we are now writing the data request */
+		if (pos >= offset) {
+			memcpy(((char *)ecryptfs_page_virt
+				+ start_offset_in_page),
+			       (data + data_offset), num_bytes);
+			data_offset += num_bytes;
+		}
+		kunmap_atomic(ecryptfs_page_virt);
+		flush_dcache_page(ecryptfs_page);
+		SetPageUptodate(ecryptfs_page);
+		unlock_page(ecryptfs_page);
+		if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
+			rc = ecryptfs_encrypt_page(ecryptfs_page);
+		else
+			rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
+						ecryptfs_page,
+						start_offset_in_page,
+						data_offset);
+		page_cache_release(ecryptfs_page);
+		if (rc) {
+			printk(KERN_ERR "%s: Error encrypting "
+			       "page; rc = [%d]\n", __func__, rc);
+			goto out;
+		}
+		pos += num_bytes;
+	}
+	if (pos > ecryptfs_file_size) {
+		i_size_write(ecryptfs_inode, pos);
+		if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
+			int rc2;
+
+			rc2 = ecryptfs_write_inode_size_to_metadata(
+								ecryptfs_inode);
+			if (rc2) {
+				printk(KERN_ERR	"Problem with "
+				       "ecryptfs_write_inode_size_to_metadata; "
+				       "rc = [%d]\n", rc2);
+				if (!rc)
+					rc = rc2;
+				goto out;
+			}
+		}
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_read_lower
+ * @data: The read data is stored here by this function
+ * @offset: Byte offset in the lower file from which to read the data
+ * @size: Number of bytes to read from @offset of the lower file and
+ *        store into @data
+ * @ecryptfs_inode: The eCryptfs inode
+ *
+ * Read @size bytes of data at byte offset @offset from the lower
+ * inode into memory location @data.
+ *
+ * Returns bytes read on success; 0 on EOF; less than zero on error
+ */
+int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
+			struct inode *ecryptfs_inode)
+{
+	struct file *lower_file;
+	lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
+	if (!lower_file)
+		return -EIO;
+	return kernel_read(lower_file, offset, data, size);
+}
+
+/**
+ * ecryptfs_read_lower_page_segment
+ * @page_for_ecryptfs: The page into which data for eCryptfs will be
+ *                     written
+ * @offset_in_page: Offset in @page_for_ecryptfs from which to start
+ *                  writing
+ * @size: The number of bytes to write into @page_for_ecryptfs
+ * @ecryptfs_inode: The eCryptfs inode
+ *
+ * Determines the byte offset in the file for the given page and
+ * offset within the page, maps the page, and makes the call to read
+ * the contents of @page_for_ecryptfs from the lower inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
+				     pgoff_t page_index,
+				     size_t offset_in_page, size_t size,
+				     struct inode *ecryptfs_inode)
+{
+	char *virt;
+	loff_t offset;
+	int rc;
+
+	offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
+	virt = kmap(page_for_ecryptfs);
+	rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
+	if (rc > 0)
+		rc = 0;
+	kunmap(page_for_ecryptfs);
+	flush_dcache_page(page_for_ecryptfs);
+	return rc;
+}
diff --git a/kernel/fs/ecryptfs/super.c b/kernel/fs/ecryptfs/super.c
new file mode 100644
index 000000000..afa1b81c3
--- /dev/null
+++ b/kernel/fs/ecryptfs/super.c
@@ -0,0 +1,191 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *              Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/key.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/file.h>
+#include <linux/crypto.h>
+#include <linux/statfs.h>
+#include <linux/magic.h>
+#include "ecryptfs_kernel.h"
+
+struct kmem_cache *ecryptfs_inode_info_cache;
+
+/**
+ * ecryptfs_alloc_inode - allocate an ecryptfs inode
+ * @sb: Pointer to the ecryptfs super block
+ *
+ * Called to bring an inode into existence.
+ *
+ * Only handle allocation, setting up structures should be done in
+ * ecryptfs_read_inode. This is because the kernel, between now and
+ * then, will 0 out the private data pointer.
+ *
+ * Returns a pointer to a newly allocated inode, NULL otherwise
+ */
+static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
+{
+	struct ecryptfs_inode_info *inode_info;
+	struct inode *inode = NULL;
+
+	inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL);
+	if (unlikely(!inode_info))
+		goto out;
+	ecryptfs_init_crypt_stat(&inode_info->crypt_stat);
+	mutex_init(&inode_info->lower_file_mutex);
+	atomic_set(&inode_info->lower_file_count, 0);
+	inode_info->lower_file = NULL;
+	inode = &inode_info->vfs_inode;
+out:
+	return inode;
+}
+
+static void ecryptfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct ecryptfs_inode_info *inode_info;
+	inode_info = ecryptfs_inode_to_private(inode);
+
+	kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+}
+
+/**
+ * ecryptfs_destroy_inode
+ * @inode: The ecryptfs inode
+ *
+ * This is used during the final destruction of the inode.  All
+ * allocation of memory related to the inode, including allocated
+ * memory in the crypt_stat struct, will be released here.
+ * There should be no chance that this deallocation will be missed.
+ */
+static void ecryptfs_destroy_inode(struct inode *inode)
+{
+	struct ecryptfs_inode_info *inode_info;
+
+	inode_info = ecryptfs_inode_to_private(inode);
+	BUG_ON(inode_info->lower_file);
+	ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
+	call_rcu(&inode->i_rcu, ecryptfs_i_callback);
+}
+
+/**
+ * ecryptfs_statfs
+ * @sb: The ecryptfs super block
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics. Currently, we let this pass right through
+ * to the lower filesystem and take no action ourselves.
+ */
+static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	int rc;
+
+	if (!lower_dentry->d_sb->s_op->statfs)
+		return -ENOSYS;
+
+	rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+	if (rc)
+		return rc;
+
+	buf->f_type = ECRYPTFS_SUPER_MAGIC;
+	rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen,
+	       &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat);
+
+	return rc;
+}
+
+/**
+ * ecryptfs_evict_inode
+ * @inode - The ecryptfs inode
+ *
+ * Called by iput() when the inode reference count reached zero
+ * and the inode is not hashed anywhere.  Used to clear anything
+ * that needs to be, before the inode is completely destroyed and put
+ * on the inode free list. We use this to drop out reference to the
+ * lower inode.
+ */
+static void ecryptfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	iput(ecryptfs_inode_to_lower(inode));
+}
+
+/**
+ * ecryptfs_show_options
+ *
+ * Prints the mount options for a given superblock.
+ * Returns zero; does not fail.
+ */
+static int ecryptfs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
+	struct ecryptfs_global_auth_tok *walker;
+
+	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	list_for_each_entry(walker,
+			    &mount_crypt_stat->global_auth_tok_list,
+			    mount_crypt_stat_list) {
+		if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK)
+			seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig);
+		else
+			seq_printf(m, ",ecryptfs_sig=%s", walker->sig);
+	}
+	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+
+	seq_printf(m, ",ecryptfs_cipher=%s",
+		mount_crypt_stat->global_default_cipher_name);
+
+	if (mount_crypt_stat->global_default_cipher_key_size)
+		seq_printf(m, ",ecryptfs_key_bytes=%zd",
+			   mount_crypt_stat->global_default_cipher_key_size);
+	if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)
+		seq_printf(m, ",ecryptfs_passthrough");
+	if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED)
+		seq_printf(m, ",ecryptfs_xattr_metadata");
+	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
+		seq_printf(m, ",ecryptfs_encrypted_view");
+	if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
+		seq_printf(m, ",ecryptfs_unlink_sigs");
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+		seq_printf(m, ",ecryptfs_mount_auth_tok_only");
+
+	return 0;
+}
+
+const struct super_operations ecryptfs_sops = {
+	.alloc_inode = ecryptfs_alloc_inode,
+	.destroy_inode = ecryptfs_destroy_inode,
+	.statfs = ecryptfs_statfs,
+	.remount_fs = NULL,
+	.evict_inode = ecryptfs_evict_inode,
+	.show_options = ecryptfs_show_options
+};
diff --git a/kernel/fs/efivarfs/Kconfig b/kernel/fs/efivarfs/Kconfig
new file mode 100644
index 000000000..c2499ef17
--- /dev/null
+++ b/kernel/fs/efivarfs/Kconfig
@@ -0,0 +1,13 @@
+config EFIVAR_FS
+	tristate "EFI Variable filesystem"
+	depends on EFI
+	default m
+	help
+	  efivarfs is a replacement filesystem for the old EFI
+	  variable support via sysfs, as it doesn't suffer from the
+	  same 1024-byte variable size limit.
+
+	  To compile this file system support as a module, choose M
+	  here. The module will be called efivarfs.
+
+	  If unsure, say N.
diff --git a/kernel/fs/efivarfs/Makefile b/kernel/fs/efivarfs/Makefile
new file mode 100644
index 000000000..955d47817
--- /dev/null
+++ b/kernel/fs/efivarfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the efivarfs filesystem
+#
+
+obj-$(CONFIG_EFIVAR_FS)		+= efivarfs.o
+
+efivarfs-objs			:= inode.o file.o super.o
diff --git a/kernel/fs/efivarfs/file.c b/kernel/fs/efivarfs/file.c
new file mode 100644
index 000000000..90001da9a
--- /dev/null
+++ b/kernel/fs/efivarfs/file.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+static ssize_t efivarfs_file_write(struct file *file,
+		const char __user *userbuf, size_t count, loff_t *ppos)
+{
+	struct efivar_entry *var = file->private_data;
+	void *data;
+	u32 attributes;
+	struct inode *inode = file->f_mapping->host;
+	unsigned long datasize = count - sizeof(attributes);
+	ssize_t bytes;
+	bool set = false;
+
+	if (count < sizeof(attributes))
+		return -EINVAL;
+
+	if (copy_from_user(&attributes, userbuf, sizeof(attributes)))
+		return -EFAULT;
+
+	if (attributes & ~(EFI_VARIABLE_MASK))
+		return -EINVAL;
+
+	data = memdup_user(userbuf + sizeof(attributes), datasize);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	bytes = efivar_entry_set_get_size(var, attributes, &datasize,
+					  data, &set);
+	if (!set && bytes) {
+		if (bytes == -ENOENT)
+			bytes = -EIO;
+		goto out;
+	}
+
+	if (bytes == -ENOENT) {
+		drop_nlink(inode);
+		d_delete(file->f_path.dentry);
+		dput(file->f_path.dentry);
+	} else {
+		mutex_lock(&inode->i_mutex);
+		i_size_write(inode, datasize + sizeof(attributes));
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	bytes = count;
+
+out:
+	kfree(data);
+
+	return bytes;
+}
+
+static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
+		size_t count, loff_t *ppos)
+{
+	struct efivar_entry *var = file->private_data;
+	unsigned long datasize = 0;
+	u32 attributes;
+	void *data;
+	ssize_t size = 0;
+	int err;
+
+	err = efivar_entry_size(var, &datasize);
+
+	/*
+	 * efivarfs represents uncommitted variables with
+	 * zero-length files. Reading them should return EOF.
+	 */
+	if (err == -ENOENT)
+		return 0;
+	else if (err)
+		return err;
+
+	data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL);
+
+	if (!data)
+		return -ENOMEM;
+
+	size = efivar_entry_get(var, &attributes, &datasize,
+				data + sizeof(attributes));
+	if (size)
+		goto out_free;
+
+	memcpy(data, &attributes, sizeof(attributes));
+	size = simple_read_from_buffer(userbuf, count, ppos,
+				       data, datasize + sizeof(attributes));
+out_free:
+	kfree(data);
+
+	return size;
+}
+
+const struct file_operations efivarfs_file_operations = {
+	.open	= simple_open,
+	.read	= efivarfs_file_read,
+	.write	= efivarfs_file_write,
+	.llseek	= no_llseek,
+};
diff --git a/kernel/fs/efivarfs/inode.c b/kernel/fs/efivarfs/inode.c
new file mode 100644
index 000000000..3381b9da9
--- /dev/null
+++ b/kernel/fs/efivarfs/inode.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+struct inode *efivarfs_get_inode(struct super_block *sb,
+				const struct inode *dir, int mode, dev_t dev)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_mode = mode;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		switch (mode & S_IFMT) {
+		case S_IFREG:
+			inode->i_fop = &efivarfs_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_op = &efivarfs_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+			inc_nlink(inode);
+			break;
+		}
+	}
+	return inode;
+}
+
+/*
+ * Return true if 'str' is a valid efivarfs filename of the form,
+ *
+ *	VariableName-12345678-1234-1234-1234-1234567891bc
+ */
+bool efivarfs_valid_name(const char *str, int len)
+{
+	static const char dashes[EFI_VARIABLE_GUID_LEN] = {
+		[8] = 1, [13] = 1, [18] = 1, [23] = 1
+	};
+	const char *s = str + len - EFI_VARIABLE_GUID_LEN;
+	int i;
+
+	/*
+	 * We need a GUID, plus at least one letter for the variable name,
+	 * plus the '-' separator
+	 */
+	if (len < EFI_VARIABLE_GUID_LEN + 2)
+		return false;
+
+	/* GUID must be preceded by a '-' */
+	if (*(s - 1) != '-')
+		return false;
+
+	/*
+	 * Validate that 's' is of the correct format, e.g.
+	 *
+	 *	12345678-1234-1234-1234-123456789abc
+	 */
+	for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) {
+		if (dashes[i]) {
+			if (*s++ != '-')
+				return false;
+		} else {
+			if (!isxdigit(*s++))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
+{
+	guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]);
+	guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]);
+	guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]);
+	guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]);
+	guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]);
+	guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]);
+	guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]);
+	guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]);
+	guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]);
+	guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]);
+	guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]);
+	guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]);
+	guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]);
+	guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]);
+	guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]);
+	guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]);
+}
+
+static int efivarfs_create(struct inode *dir, struct dentry *dentry,
+			  umode_t mode, bool excl)
+{
+	struct inode *inode;
+	struct efivar_entry *var;
+	int namelen, i = 0, err = 0;
+
+	if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len))
+		return -EINVAL;
+
+	inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0);
+	if (!inode)
+		return -ENOMEM;
+
+	var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
+	if (!var) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* length of the variable name itself: remove GUID and separator */
+	namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
+
+	efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
+			&var->var.VendorGuid);
+
+	for (i = 0; i < namelen; i++)
+		var->var.VariableName[i] = dentry->d_name.name[i];
+
+	var->var.VariableName[i] = '\0';
+
+	inode->i_private = var;
+
+	efivar_entry_add(var, &efivarfs_list);
+	d_instantiate(dentry, inode);
+	dget(dentry);
+out:
+	if (err) {
+		kfree(var);
+		iput(inode);
+	}
+	return err;
+}
+
+static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct efivar_entry *var = d_inode(dentry)->i_private;
+
+	if (efivar_entry_delete(var))
+		return -EINVAL;
+
+	drop_nlink(d_inode(dentry));
+	dput(dentry);
+	return 0;
+};
+
+const struct inode_operations efivarfs_dir_inode_operations = {
+	.lookup = simple_lookup,
+	.unlink = efivarfs_unlink,
+	.create = efivarfs_create,
+};
diff --git a/kernel/fs/efivarfs/internal.h b/kernel/fs/efivarfs/internal.h
new file mode 100644
index 000000000..b5ff16add
--- /dev/null
+++ b/kernel/fs/efivarfs/internal.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef EFIVAR_FS_INTERNAL_H
+#define EFIVAR_FS_INTERNAL_H
+
+#include <linux/list.h>
+
+extern const struct file_operations efivarfs_file_operations;
+extern const struct inode_operations efivarfs_dir_inode_operations;
+extern bool efivarfs_valid_name(const char *str, int len);
+extern struct inode *efivarfs_get_inode(struct super_block *sb,
+			const struct inode *dir, int mode, dev_t dev);
+
+extern struct list_head efivarfs_list;
+
+#endif /* EFIVAR_FS_INTERNAL_H */
diff --git a/kernel/fs/efivarfs/super.c b/kernel/fs/efivarfs/super.c
new file mode 100644
index 000000000..86a212182
--- /dev/null
+++ b/kernel/fs/efivarfs/super.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ctype.h>
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/ucs2_string.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+
+#include "internal.h"
+
+LIST_HEAD(efivarfs_list);
+
+static void efivarfs_evict_inode(struct inode *inode)
+{
+	clear_inode(inode);
+}
+
+static const struct super_operations efivarfs_ops = {
+	.statfs = simple_statfs,
+	.drop_inode = generic_delete_inode,
+	.evict_inode = efivarfs_evict_inode,
+	.show_options = generic_show_options,
+};
+
+static struct super_block *efivarfs_sb;
+
+/*
+ * Compare two efivarfs file names.
+ *
+ * An efivarfs filename is composed of two parts,
+ *
+ *	1. A case-sensitive variable name
+ *	2. A case-insensitive GUID
+ *
+ * So we need to perform a case-sensitive match on part 1 and a
+ * case-insensitive match on part 2.
+ */
+static int efivarfs_d_compare(const struct dentry *parent,
+			      const struct dentry *dentry,
+			      unsigned int len, const char *str,
+			      const struct qstr *name)
+{
+	int guid = len - EFI_VARIABLE_GUID_LEN;
+
+	if (name->len != len)
+		return 1;
+
+	/* Case-sensitive compare for the variable name */
+	if (memcmp(str, name->name, guid))
+		return 1;
+
+	/* Case-insensitive compare for the GUID */
+	return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN);
+}
+
+static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	unsigned long hash = init_name_hash();
+	const unsigned char *s = qstr->name;
+	unsigned int len = qstr->len;
+
+	if (!efivarfs_valid_name(s, len))
+		return -EINVAL;
+
+	while (len-- > EFI_VARIABLE_GUID_LEN)
+		hash = partial_name_hash(*s++, hash);
+
+	/* GUID is case-insensitive. */
+	while (len--)
+		hash = partial_name_hash(tolower(*s++), hash);
+
+	qstr->hash = end_name_hash(hash);
+	return 0;
+}
+
+static const struct dentry_operations efivarfs_d_ops = {
+	.d_compare = efivarfs_d_compare,
+	.d_hash = efivarfs_d_hash,
+	.d_delete = always_delete_dentry,
+};
+
+static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
+{
+	struct dentry *d;
+	struct qstr q;
+	int err;
+
+	q.name = name;
+	q.len = strlen(name);
+
+	err = efivarfs_d_hash(NULL, &q);
+	if (err)
+		return ERR_PTR(err);
+
+	d = d_alloc(parent, &q);
+	if (d)
+		return d;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
+			     unsigned long name_size, void *data)
+{
+	struct super_block *sb = (struct super_block *)data;
+	struct efivar_entry *entry;
+	struct inode *inode = NULL;
+	struct dentry *dentry, *root = sb->s_root;
+	unsigned long size = 0;
+	char *name;
+	int len, i;
+	int err = -ENOMEM;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return err;
+
+	memcpy(entry->var.VariableName, name16, name_size);
+	memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
+
+	len = ucs2_strlen(entry->var.VariableName);
+
+	/* name, plus '-', plus GUID, plus NUL*/
+	name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL);
+	if (!name)
+		goto fail;
+
+	for (i = 0; i < len; i++)
+		name[i] = entry->var.VariableName[i] & 0xFF;
+
+	name[len] = '-';
+
+	efi_guid_to_str(&entry->var.VendorGuid, name + len + 1);
+
+	name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
+
+	inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0);
+	if (!inode)
+		goto fail_name;
+
+	dentry = efivarfs_alloc_dentry(root, name);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto fail_inode;
+	}
+
+	/* copied by the above to local storage in the dentry. */
+	kfree(name);
+
+	efivar_entry_size(entry, &size);
+	efivar_entry_add(entry, &efivarfs_list);
+
+	mutex_lock(&inode->i_mutex);
+	inode->i_private = entry;
+	i_size_write(inode, size + sizeof(entry->var.Attributes));
+	mutex_unlock(&inode->i_mutex);
+	d_add(dentry, inode);
+
+	return 0;
+
+fail_inode:
+	iput(inode);
+fail_name:
+	kfree(name);
+fail:
+	kfree(entry);
+	return err;
+}
+
+static int efivarfs_destroy(struct efivar_entry *entry, void *data)
+{
+	efivar_entry_remove(entry);
+	kfree(entry);
+	return 0;
+}
+
+static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode = NULL;
+	struct dentry *root;
+	int err;
+
+	efivarfs_sb = sb;
+
+	sb->s_maxbytes          = MAX_LFS_FILESIZE;
+	sb->s_blocksize         = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits    = PAGE_CACHE_SHIFT;
+	sb->s_magic             = EFIVARFS_MAGIC;
+	sb->s_op                = &efivarfs_ops;
+	sb->s_d_op		= &efivarfs_d_ops;
+	sb->s_time_gran         = 1;
+
+	inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+	if (!inode)
+		return -ENOMEM;
+	inode->i_op = &efivarfs_dir_inode_operations;
+
+	root = d_make_root(inode);
+	sb->s_root = root;
+	if (!root)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&efivarfs_list);
+
+	err = efivar_init(efivarfs_callback, (void *)sb, false,
+			  true, &efivarfs_list);
+	if (err)
+		__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+
+	return err;
+}
+
+static struct dentry *efivarfs_mount(struct file_system_type *fs_type,
+				    int flags, const char *dev_name, void *data)
+{
+	return mount_single(fs_type, flags, data, efivarfs_fill_super);
+}
+
+static void efivarfs_kill_sb(struct super_block *sb)
+{
+	kill_litter_super(sb);
+	efivarfs_sb = NULL;
+
+	/* Remove all entries and destroy */
+	__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+}
+
+static struct file_system_type efivarfs_type = {
+	.owner   = THIS_MODULE,
+	.name    = "efivarfs",
+	.mount   = efivarfs_mount,
+	.kill_sb = efivarfs_kill_sb,
+};
+
+static __init int efivarfs_init(void)
+{
+	if (!efi_enabled(EFI_RUNTIME_SERVICES))
+		return -ENODEV;
+
+	if (!efivars_kobject())
+		return -ENODEV;
+
+	return register_filesystem(&efivarfs_type);
+}
+
+static __exit void efivarfs_exit(void)
+{
+	unregister_filesystem(&efivarfs_type);
+}
+
+MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr");
+MODULE_DESCRIPTION("EFI Variable Filesystem");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS("efivarfs");
+
+module_init(efivarfs_init);
+module_exit(efivarfs_exit);
diff --git a/kernel/fs/efs/Kconfig b/kernel/fs/efs/Kconfig
new file mode 100644
index 000000000..d020e3c30
--- /dev/null
+++ b/kernel/fs/efs/Kconfig
@@ -0,0 +1,14 @@
+config EFS_FS
+	tristate "EFS file system support (read only)"
+	depends on BLOCK
+	help
+	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
+	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
+	  uses the XFS file system for hard disk partitions however).
+
+	  This implementation only offers read-only access. If you don't know
+	  what all this is about, it's safe to say N. For more information
+	  about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
+
+	  To compile the EFS file system support as a module, choose M here: the
+	  module will be called efs.
diff --git a/kernel/fs/efs/Makefile b/kernel/fs/efs/Makefile
new file mode 100644
index 000000000..963543d46
--- /dev/null
+++ b/kernel/fs/efs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux efs-filesystem routines.
+#
+
+obj-$(CONFIG_EFS_FS) += efs.o
+
+efs-objs := super.o inode.o namei.o dir.o file.o symlink.o
diff --git a/kernel/fs/efs/dir.c b/kernel/fs/efs/dir.c
new file mode 100644
index 000000000..ce63b24f7
--- /dev/null
+++ b/kernel/fs/efs/dir.c
@@ -0,0 +1,103 @@
+/*
+ * dir.c
+ *
+ * Copyright (c) 1999 Al Smith
+ */
+
+#include <linux/buffer_head.h>
+#include "efs.h"
+
+static int efs_readdir(struct file *, struct dir_context *);
+
+const struct file_operations efs_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= efs_readdir,
+};
+
+const struct inode_operations efs_dir_inode_operations = {
+	.lookup		= efs_lookup,
+};
+
+static int efs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	efs_block_t		block;
+	int			slot;
+
+	if (inode->i_size & (EFS_DIRBSIZE-1))
+		pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n",
+			__func__);
+
+	/* work out where this entry can be found */
+	block = ctx->pos >> EFS_DIRBSIZE_BITS;
+
+	/* each block contains at most 256 slots */
+	slot  = ctx->pos & 0xff;
+
+	/* look at all blocks */
+	while (block < inode->i_blocks) {
+		struct efs_dir		*dirblock;
+		struct buffer_head *bh;
+
+		/* read the dir block */
+		bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
+
+		if (!bh) {
+			pr_err("%s(): failed to read dir block %d\n",
+			       __func__, block);
+			break;
+		}
+
+		dirblock = (struct efs_dir *) bh->b_data; 
+
+		if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
+			pr_err("%s(): invalid directory block\n", __func__);
+			brelse(bh);
+			break;
+		}
+
+		for (; slot < dirblock->slots; slot++) {
+			struct efs_dentry *dirslot;
+			efs_ino_t inodenum;
+			const char *nameptr;
+			int namelen;
+
+			if (dirblock->space[slot] == 0)
+				continue;
+
+			dirslot  = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
+
+			inodenum = be32_to_cpu(dirslot->inode);
+			namelen  = dirslot->namelen;
+			nameptr  = dirslot->name;
+			pr_debug("%s(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n",
+				 __func__, block, slot, dirblock->slots-1,
+				 inodenum, nameptr, namelen);
+			if (!namelen)
+				continue;
+			/* found the next entry */
+			ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
+
+			/* sanity check */
+			if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
+				pr_warn("directory entry %d exceeds directory block\n",
+					slot);
+				continue;
+			}
+
+			/* copy filename and data in dirslot */
+			if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) {
+				brelse(bh);
+				return 0;
+			}
+		}
+		brelse(bh);
+
+		slot = 0;
+		block++;
+	}
+	ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
+	return 0;
+}
+
diff --git a/kernel/fs/efs/efs.h b/kernel/fs/efs/efs.h
new file mode 100644
index 000000000..5bbf96121
--- /dev/null
+++ b/kernel/fs/efs/efs.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 1999 Al Smith
+ *
+ * Portions derived from work (c) 1995,1996 Christian Vogelgsang.
+ * Portions derived from IRIX header files (c) 1988 Silicon Graphics
+ */
+#ifndef _EFS_EFS_H_
+#define _EFS_EFS_H_
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+
+#define EFS_VERSION "1.0a"
+
+static const char cprt[] = "EFS: "EFS_VERSION" - (c) 1999 Al Smith <Al.Smith@aeschi.ch.eu.org>";
+
+
+/* 1 block is 512 bytes */
+#define	EFS_BLOCKSIZE_BITS	9
+#define	EFS_BLOCKSIZE		(1 << EFS_BLOCKSIZE_BITS)
+
+typedef	int32_t		efs_block_t;
+typedef uint32_t	efs_ino_t;
+
+#define	EFS_DIRECTEXTENTS	12
+
+/*
+ * layout of an extent, in memory and on disk. 8 bytes exactly.
+ */
+typedef union extent_u {
+	unsigned char raw[8];
+	struct extent_s {
+		unsigned int	ex_magic:8;	/* magic # (zero) */
+		unsigned int	ex_bn:24;	/* basic block */
+		unsigned int	ex_length:8;	/* numblocks in this extent */
+		unsigned int	ex_offset:24;	/* logical offset into file */
+	} cooked;
+} efs_extent;
+
+typedef struct edevs {
+	__be16		odev;
+	__be32		ndev;
+} efs_devs;
+
+/*
+ * extent based filesystem inode as it appears on disk.  The efs inode
+ * is exactly 128 bytes long.
+ */
+struct	efs_dinode {
+	__be16		di_mode;	/* mode and type of file */
+	__be16		di_nlink;	/* number of links to file */
+	__be16		di_uid;		/* owner's user id */
+	__be16		di_gid;		/* owner's group id */
+	__be32		di_size;	/* number of bytes in file */
+	__be32		di_atime;	/* time last accessed */
+	__be32		di_mtime;	/* time last modified */
+	__be32		di_ctime;	/* time created */
+	__be32		di_gen;		/* generation number */
+	__be16		di_numextents;	/* # of extents */
+	u_char		di_version;	/* version of inode */
+	u_char		di_spare;	/* spare - used by AFS */
+	union di_addr {
+		efs_extent	di_extents[EFS_DIRECTEXTENTS];
+		efs_devs	di_dev;	/* device for IFCHR/IFBLK */
+	} di_u;
+};
+
+/* efs inode storage in memory */
+struct efs_inode_info {
+	int		numextents;
+	int		lastextent;
+
+	efs_extent	extents[EFS_DIRECTEXTENTS];
+	struct inode	vfs_inode;
+};
+
+#include <linux/efs_fs_sb.h>
+
+#define EFS_DIRBSIZE_BITS	EFS_BLOCKSIZE_BITS
+#define EFS_DIRBSIZE		(1 << EFS_DIRBSIZE_BITS)
+
+struct efs_dentry {
+	__be32		inode;
+	unsigned char	namelen;
+	char		name[3];
+};
+
+#define EFS_DENTSIZE	(sizeof(struct efs_dentry) - 3 + 1)
+#define EFS_MAXNAMELEN  ((1 << (sizeof(char) * 8)) - 1)
+
+#define EFS_DIRBLK_HEADERSIZE	4
+#define EFS_DIRBLK_MAGIC	0xbeef	/* moo */
+
+struct efs_dir {
+	__be16	magic;
+	unsigned char	firstused;
+	unsigned char	slots;
+
+	unsigned char	space[EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE];
+};
+
+#define EFS_MAXENTS \
+	((EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE) / \
+	 (EFS_DENTSIZE + sizeof(char)))
+
+#define EFS_SLOTAT(dir, slot) EFS_REALOFF((dir)->space[slot])
+
+#define EFS_REALOFF(offset) ((offset << 1))
+
+
+static inline struct efs_inode_info *INODE_INFO(struct inode *inode)
+{
+	return container_of(inode, struct efs_inode_info, vfs_inode);
+}
+
+static inline struct efs_sb_info *SUPER_INFO(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+struct statfs;
+struct fid;
+
+extern const struct inode_operations efs_dir_inode_operations;
+extern const struct file_operations efs_dir_operations;
+extern const struct address_space_operations efs_symlink_aops;
+
+extern struct inode *efs_iget(struct super_block *, unsigned long);
+extern efs_block_t efs_map_block(struct inode *, efs_block_t);
+extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+
+extern struct dentry *efs_lookup(struct inode *, struct dentry *, unsigned int);
+extern struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type);
+extern struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type);
+extern struct dentry *efs_get_parent(struct dentry *);
+extern int efs_bmap(struct inode *, int);
+
+#endif /* _EFS_EFS_H_ */
diff --git a/kernel/fs/efs/file.c b/kernel/fs/efs/file.c
new file mode 100644
index 000000000..a37dcee46
--- /dev/null
+++ b/kernel/fs/efs/file.c
@@ -0,0 +1,56 @@
+/*
+ * file.c
+ *
+ * Copyright (c) 1999 Al Smith
+ *
+ * Portions derived from work (c) 1995,1996 Christian Vogelgsang.
+ */
+
+#include <linux/buffer_head.h>
+#include "efs.h"
+
+int efs_get_block(struct inode *inode, sector_t iblock,
+		  struct buffer_head *bh_result, int create)
+{
+	int error = -EROFS;
+	long phys;
+
+	if (create)
+		return error;
+	if (iblock >= inode->i_blocks) {
+#ifdef DEBUG
+		/*
+		 * i have no idea why this happens as often as it does
+		 */
+		pr_warn("%s(): block %d >= %ld (filesize %ld)\n",
+			__func__, block, inode->i_blocks, inode->i_size);
+#endif
+		return 0;
+	}
+	phys = efs_map_block(inode, iblock);
+	if (phys)
+		map_bh(bh_result, inode->i_sb, phys);
+	return 0;
+}
+
+int efs_bmap(struct inode *inode, efs_block_t block) {
+
+	if (block < 0) {
+		pr_warn("%s(): block < 0\n", __func__);
+		return 0;
+	}
+
+	/* are we about to read past the end of a file ? */
+	if (!(block < inode->i_blocks)) {
+#ifdef DEBUG
+		/*
+		 * i have no idea why this happens as often as it does
+		 */
+		pr_warn("%s(): block %d >= %ld (filesize %ld)\n",
+			__func__, block, inode->i_blocks, inode->i_size);
+#endif
+		return 0;
+	}
+
+	return efs_map_block(inode, block);
+}
diff --git a/kernel/fs/efs/inode.c b/kernel/fs/efs/inode.c
new file mode 100644
index 000000000..079d20306
--- /dev/null
+++ b/kernel/fs/efs/inode.c
@@ -0,0 +1,311 @@
+/*
+ * inode.c
+ *
+ * Copyright (c) 1999 Al Smith
+ *
+ * Portions derived from work (c) 1995,1996 Christian Vogelgsang,
+ *              and from work (c) 1998 Mike Shaver.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include "efs.h"
+#include <linux/efs_fs_sb.h>
+
+static int efs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page,efs_get_block);
+}
+static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,efs_get_block);
+}
+static const struct address_space_operations efs_aops = {
+	.readpage = efs_readpage,
+	.bmap = _efs_bmap
+};
+
+static inline void extent_copy(efs_extent *src, efs_extent *dst) {
+	/*
+	 * this is slightly evil. it doesn't just copy
+	 * efs_extent from src to dst, it also mangles
+	 * the bits so that dst ends up in cpu byte-order.
+	 */
+
+	dst->cooked.ex_magic  =  (unsigned int) src->raw[0];
+	dst->cooked.ex_bn     = ((unsigned int) src->raw[1] << 16) |
+				((unsigned int) src->raw[2] <<  8) |
+				((unsigned int) src->raw[3] <<  0);
+	dst->cooked.ex_length =  (unsigned int) src->raw[4];
+	dst->cooked.ex_offset = ((unsigned int) src->raw[5] << 16) |
+				((unsigned int) src->raw[6] <<  8) |
+				((unsigned int) src->raw[7] <<  0);
+	return;
+}
+
+struct inode *efs_iget(struct super_block *super, unsigned long ino)
+{
+	int i, inode_index;
+	dev_t device;
+	u32 rdev;
+	struct buffer_head *bh;
+	struct efs_sb_info    *sb = SUPER_INFO(super);
+	struct efs_inode_info *in;
+	efs_block_t block, offset;
+	struct efs_dinode *efs_inode;
+	struct inode *inode;
+
+	inode = iget_locked(super, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	in = INODE_INFO(inode);
+
+	/*
+	** EFS layout:
+	**
+	** |   cylinder group    |   cylinder group    |   cylinder group ..etc
+	** |inodes|data          |inodes|data          |inodes|data       ..etc
+	**
+	** work out the inode block index, (considering initially that the
+	** inodes are stored as consecutive blocks). then work out the block
+	** number of that inode given the above layout, and finally the
+	** offset of the inode within that block.
+	*/
+
+	inode_index = inode->i_ino /
+		(EFS_BLOCKSIZE / sizeof(struct efs_dinode));
+
+	block = sb->fs_start + sb->first_block + 
+		(sb->group_size * (inode_index / sb->inode_blocks)) +
+		(inode_index % sb->inode_blocks);
+
+	offset = (inode->i_ino %
+			(EFS_BLOCKSIZE / sizeof(struct efs_dinode))) *
+		sizeof(struct efs_dinode);
+
+	bh = sb_bread(inode->i_sb, block);
+	if (!bh) {
+		pr_warn("%s() failed at block %d\n", __func__, block);
+		goto read_inode_error;
+	}
+
+	efs_inode = (struct efs_dinode *) (bh->b_data + offset);
+    
+	inode->i_mode  = be16_to_cpu(efs_inode->di_mode);
+	set_nlink(inode, be16_to_cpu(efs_inode->di_nlink));
+	i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid));
+	i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid));
+	inode->i_size  = be32_to_cpu(efs_inode->di_size);
+	inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime);
+	inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime);
+	inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime);
+	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+
+	/* this is the number of blocks in the file */
+	if (inode->i_size == 0) {
+		inode->i_blocks = 0;
+	} else {
+		inode->i_blocks = ((inode->i_size - 1) >> EFS_BLOCKSIZE_BITS) + 1;
+	}
+
+	rdev = be16_to_cpu(efs_inode->di_u.di_dev.odev);
+	if (rdev == 0xffff) {
+		rdev = be32_to_cpu(efs_inode->di_u.di_dev.ndev);
+		if (sysv_major(rdev) > 0xfff)
+			device = 0;
+		else
+			device = MKDEV(sysv_major(rdev), sysv_minor(rdev));
+	} else
+		device = old_decode_dev(rdev);
+
+	/* get the number of extents for this object */
+	in->numextents = be16_to_cpu(efs_inode->di_numextents);
+	in->lastextent = 0;
+
+	/* copy the extents contained within the inode to memory */
+	for(i = 0; i < EFS_DIRECTEXTENTS; i++) {
+		extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i]));
+		if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) {
+			pr_warn("extent %d has bad magic number in inode %lu\n",
+				i, inode->i_ino);
+			brelse(bh);
+			goto read_inode_error;
+		}
+	}
+
+	brelse(bh);
+	pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n",
+		 inode->i_ino, in->numextents, inode->i_mode);
+	switch (inode->i_mode & S_IFMT) {
+		case S_IFDIR: 
+			inode->i_op = &efs_dir_inode_operations; 
+			inode->i_fop = &efs_dir_operations; 
+			break;
+		case S_IFREG:
+			inode->i_fop = &generic_ro_fops;
+			inode->i_data.a_ops = &efs_aops;
+			break;
+		case S_IFLNK:
+			inode->i_op = &page_symlink_inode_operations;
+			inode->i_data.a_ops = &efs_symlink_aops;
+			break;
+		case S_IFCHR:
+		case S_IFBLK:
+		case S_IFIFO:
+			init_special_inode(inode, inode->i_mode, device);
+			break;
+		default:
+			pr_warn("unsupported inode mode %o\n", inode->i_mode);
+			goto read_inode_error;
+			break;
+	}
+
+	unlock_new_inode(inode);
+	return inode;
+        
+read_inode_error:
+	pr_warn("failed to read inode %lu\n", inode->i_ino);
+	iget_failed(inode);
+	return ERR_PTR(-EIO);
+}
+
+static inline efs_block_t
+efs_extent_check(efs_extent *ptr, efs_block_t block, struct efs_sb_info *sb) {
+	efs_block_t start;
+	efs_block_t length;
+	efs_block_t offset;
+
+	/*
+	 * given an extent and a logical block within a file,
+	 * can this block be found within this extent ?
+	 */
+	start  = ptr->cooked.ex_bn;
+	length = ptr->cooked.ex_length;
+	offset = ptr->cooked.ex_offset;
+
+	if ((block >= offset) && (block < offset+length)) {
+		return(sb->fs_start + start + block - offset);
+	} else {
+		return 0;
+	}
+}
+
+efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
+	struct efs_sb_info    *sb = SUPER_INFO(inode->i_sb);
+	struct efs_inode_info *in = INODE_INFO(inode);
+	struct buffer_head    *bh = NULL;
+
+	int cur, last, first = 1;
+	int ibase, ioffset, dirext, direxts, indext, indexts;
+	efs_block_t iblock, result = 0, lastblock = 0;
+	efs_extent ext, *exts;
+
+	last = in->lastextent;
+
+	if (in->numextents <= EFS_DIRECTEXTENTS) {
+		/* first check the last extent we returned */
+		if ((result = efs_extent_check(&in->extents[last], block, sb)))
+			return result;
+    
+		/* if we only have one extent then nothing can be found */
+		if (in->numextents == 1) {
+			pr_err("%s() failed to map (1 extent)\n", __func__);
+			return 0;
+		}
+
+		direxts = in->numextents;
+
+		/*
+		 * check the stored extents in the inode
+		 * start with next extent and check forwards
+		 */
+		for(dirext = 1; dirext < direxts; dirext++) {
+			cur = (last + dirext) % in->numextents;
+			if ((result = efs_extent_check(&in->extents[cur], block, sb))) {
+				in->lastextent = cur;
+				return result;
+			}
+		}
+
+		pr_err("%s() failed to map block %u (dir)\n", __func__, block);
+		return 0;
+	}
+
+	pr_debug("%s(): indirect search for logical block %u\n",
+		 __func__, block);
+	direxts = in->extents[0].cooked.ex_offset;
+	indexts = in->numextents;
+
+	for(indext = 0; indext < indexts; indext++) {
+		cur = (last + indext) % indexts;
+
+		/*
+		 * work out which direct extent contains `cur'.
+		 *
+		 * also compute ibase: i.e. the number of the first
+		 * indirect extent contained within direct extent `cur'.
+		 *
+		 */
+		ibase = 0;
+		for(dirext = 0; cur < ibase && dirext < direxts; dirext++) {
+			ibase += in->extents[dirext].cooked.ex_length *
+				(EFS_BLOCKSIZE / sizeof(efs_extent));
+		}
+
+		if (dirext == direxts) {
+			/* should never happen */
+			pr_err("couldn't find direct extent for indirect extent %d (block %u)\n",
+			       cur, block);
+			if (bh) brelse(bh);
+			return 0;
+		}
+		
+		/* work out block number and offset of this indirect extent */
+		iblock = sb->fs_start + in->extents[dirext].cooked.ex_bn +
+			(cur - ibase) /
+			(EFS_BLOCKSIZE / sizeof(efs_extent));
+		ioffset = (cur - ibase) %
+			(EFS_BLOCKSIZE / sizeof(efs_extent));
+
+		if (first || lastblock != iblock) {
+			if (bh) brelse(bh);
+
+			bh = sb_bread(inode->i_sb, iblock);
+			if (!bh) {
+				pr_err("%s() failed at block %d\n",
+				       __func__, iblock);
+				return 0;
+			}
+			pr_debug("%s(): read indirect extent block %d\n",
+				 __func__, iblock);
+			first = 0;
+			lastblock = iblock;
+		}
+
+		exts = (efs_extent *) bh->b_data;
+
+		extent_copy(&(exts[ioffset]), &ext);
+
+		if (ext.cooked.ex_magic != 0) {
+			pr_err("extent %d has bad magic number in block %d\n",
+			       cur, iblock);
+			if (bh) brelse(bh);
+			return 0;
+		}
+
+		if ((result = efs_extent_check(&ext, block, sb))) {
+			if (bh) brelse(bh);
+			in->lastextent = cur;
+			return result;
+		}
+	}
+	if (bh) brelse(bh);
+	pr_err("%s() failed to map block %u (indir)\n", __func__, block);
+	return 0;
+}  
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/efs/namei.c b/kernel/fs/efs/namei.c
new file mode 100644
index 000000000..40ba9cc41
--- /dev/null
+++ b/kernel/fs/efs/namei.c
@@ -0,0 +1,119 @@
+/*
+ * namei.c
+ *
+ * Copyright (c) 1999 Al Smith
+ *
+ * Portions derived from work (c) 1995,1996 Christian Vogelgsang.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/exportfs.h>
+#include "efs.h"
+
+
+static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
+{
+	struct buffer_head *bh;
+
+	int			slot, namelen;
+	char			*nameptr;
+	struct efs_dir		*dirblock;
+	struct efs_dentry	*dirslot;
+	efs_ino_t		inodenum;
+	efs_block_t		block;
+ 
+	if (inode->i_size & (EFS_DIRBSIZE-1))
+		pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n",
+			__func__);
+
+	for(block = 0; block < inode->i_blocks; block++) {
+
+		bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
+		if (!bh) {
+			pr_err("%s(): failed to read dir block %d\n",
+			       __func__, block);
+			return 0;
+		}
+    
+		dirblock = (struct efs_dir *) bh->b_data;
+
+		if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
+			pr_err("%s(): invalid directory block\n", __func__);
+			brelse(bh);
+			return 0;
+		}
+
+		for (slot = 0; slot < dirblock->slots; slot++) {
+			dirslot  = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
+
+			namelen  = dirslot->namelen;
+			nameptr  = dirslot->name;
+
+			if ((namelen == len) && (!memcmp(name, nameptr, len))) {
+				inodenum = be32_to_cpu(dirslot->inode);
+				brelse(bh);
+				return inodenum;
+			}
+		}
+		brelse(bh);
+	}
+	return 0;
+}
+
+struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	efs_ino_t inodenum;
+	struct inode *inode = NULL;
+
+	inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len);
+	if (inodenum)
+		inode = efs_iget(dir->i_sb, inodenum);
+
+	return d_splice_alias(inode, dentry);
+}
+
+static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino,
+		u32 generation)
+{
+	struct inode *inode;
+
+	if (ino == 0)
+		return ERR_PTR(-ESTALE);
+	inode = efs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    efs_nfs_get_inode);
+}
+
+struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    efs_nfs_get_inode);
+}
+
+struct dentry *efs_get_parent(struct dentry *child)
+{
+	struct dentry *parent = ERR_PTR(-ENOENT);
+	efs_ino_t ino;
+
+	ino = efs_find_entry(d_inode(child), "..", 2);
+	if (ino)
+		parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino));
+
+	return parent;
+}
diff --git a/kernel/fs/efs/super.c b/kernel/fs/efs/super.c
new file mode 100644
index 000000000..7fca462ea
--- /dev/null
+++ b/kernel/fs/efs/super.c
@@ -0,0 +1,353 @@
+/*
+ * super.c
+ *
+ * Copyright (c) 1999 Al Smith
+ *
+ * Portions derived from work (c) 1995,1996 Christian Vogelgsang.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/exportfs.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+
+#include "efs.h"
+#include <linux/efs_vh.h>
+#include <linux/efs_fs_sb.h>
+
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int efs_fill_super(struct super_block *s, void *d, int silent);
+
+static struct dentry *efs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
+}
+
+static void efs_kill_sb(struct super_block *s)
+{
+	struct efs_sb_info *sbi = SUPER_INFO(s);
+	kill_block_super(s);
+	kfree(sbi);
+}
+
+static struct file_system_type efs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "efs",
+	.mount		= efs_mount,
+	.kill_sb	= efs_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("efs");
+
+static struct pt_types sgi_pt_types[] = {
+	{0x00,		"SGI vh"},
+	{0x01,		"SGI trkrepl"},
+	{0x02,		"SGI secrepl"},
+	{0x03,		"SGI raw"},
+	{0x04,		"SGI bsd"},
+	{SGI_SYSV,	"SGI sysv"},
+	{0x06,		"SGI vol"},
+	{SGI_EFS,	"SGI efs"},
+	{0x08,		"SGI lv"},
+	{0x09,		"SGI rlv"},
+	{0x0A,		"SGI xfs"},
+	{0x0B,		"SGI xfslog"},
+	{0x0C,		"SGI xlv"},
+	{0x82,		"Linux swap"},
+	{0x83,		"Linux native"},
+	{0,		NULL}
+};
+
+
+static struct kmem_cache * efs_inode_cachep;
+
+static struct inode *efs_alloc_inode(struct super_block *sb)
+{
+	struct efs_inode_info *ei;
+	ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void efs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
+}
+
+static void efs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, efs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
+				sizeof(struct efs_inode_info),
+				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+				init_once);
+	if (efs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(efs_inode_cachep);
+}
+
+static int efs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+static const struct super_operations efs_superblock_operations = {
+	.alloc_inode	= efs_alloc_inode,
+	.destroy_inode	= efs_destroy_inode,
+	.statfs		= efs_statfs,
+	.remount_fs	= efs_remount,
+};
+
+static const struct export_operations efs_export_ops = {
+	.fh_to_dentry	= efs_fh_to_dentry,
+	.fh_to_parent	= efs_fh_to_parent,
+	.get_parent	= efs_get_parent,
+};
+
+static int __init init_efs_fs(void) {
+	int err;
+	pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n");
+	err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&efs_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	return err;
+}
+
+static void __exit exit_efs_fs(void) {
+	unregister_filesystem(&efs_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_efs_fs)
+module_exit(exit_efs_fs)
+
+static efs_block_t efs_validate_vh(struct volume_header *vh) {
+	int		i;
+	__be32		cs, *ui;
+	int		csum;
+	efs_block_t	sblock = 0; /* shuts up gcc */
+	struct pt_types	*pt_entry;
+	int		pt_type, slice = -1;
+
+	if (be32_to_cpu(vh->vh_magic) != VHMAGIC) {
+		/*
+		 * assume that we're dealing with a partition and allow
+		 * read_super() to try and detect a valid superblock
+		 * on the next block.
+		 */
+		return 0;
+	}
+
+	ui = ((__be32 *) (vh + 1)) - 1;
+	for(csum = 0; ui >= ((__be32 *) vh);) {
+		cs = *ui--;
+		csum += be32_to_cpu(cs);
+	}
+	if (csum) {
+		pr_warn("SGI disklabel: checksum bad, label corrupted\n");
+		return 0;
+	}
+
+#ifdef DEBUG
+	pr_debug("bf: \"%16s\"\n", vh->vh_bootfile);
+
+	for(i = 0; i < NVDIR; i++) {
+		int	j;
+		char	name[VDNAMESIZE+1];
+
+		for(j = 0; j < VDNAMESIZE; j++) {
+			name[j] = vh->vh_vd[i].vd_name[j];
+		}
+		name[j] = (char) 0;
+
+		if (name[0]) {
+			pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n",
+				name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn),
+				(int) be32_to_cpu(vh->vh_vd[i].vd_nbytes));
+		}
+	}
+#endif
+
+	for(i = 0; i < NPARTAB; i++) {
+		pt_type = (int) be32_to_cpu(vh->vh_pt[i].pt_type);
+		for(pt_entry = sgi_pt_types; pt_entry->pt_name; pt_entry++) {
+			if (pt_type == pt_entry->pt_type) break;
+		}
+#ifdef DEBUG
+		if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) {
+			pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n",
+				 i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn),
+				 (int)be32_to_cpu(vh->vh_pt[i].pt_nblks),
+				 pt_type, (pt_entry->pt_name) ?
+				 pt_entry->pt_name : "unknown");
+		}
+#endif
+		if (IS_EFS(pt_type)) {
+			sblock = be32_to_cpu(vh->vh_pt[i].pt_firstlbn);
+			slice = i;
+		}
+	}
+
+	if (slice == -1) {
+		pr_notice("partition table contained no EFS partitions\n");
+#ifdef DEBUG
+	} else {
+		pr_info("using slice %d (type %s, offset 0x%x)\n", slice,
+			(pt_entry->pt_name) ? pt_entry->pt_name : "unknown",
+			sblock);
+#endif
+	}
+	return sblock;
+}
+
+static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) {
+
+	if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic)))
+		return -1;
+
+	sb->fs_magic     = be32_to_cpu(super->fs_magic);
+	sb->total_blocks = be32_to_cpu(super->fs_size);
+	sb->first_block  = be32_to_cpu(super->fs_firstcg);
+	sb->group_size   = be32_to_cpu(super->fs_cgfsize);
+	sb->data_free    = be32_to_cpu(super->fs_tfree);
+	sb->inode_free   = be32_to_cpu(super->fs_tinode);
+	sb->inode_blocks = be16_to_cpu(super->fs_cgisize);
+	sb->total_groups = be16_to_cpu(super->fs_ncg);
+    
+	return 0;    
+}
+
+static int efs_fill_super(struct super_block *s, void *d, int silent)
+{
+	struct efs_sb_info *sb;
+	struct buffer_head *bh;
+	struct inode *root;
+
+ 	sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
+	if (!sb)
+		return -ENOMEM;
+	s->s_fs_info = sb;
+ 
+	s->s_magic		= EFS_SUPER_MAGIC;
+	if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
+		pr_err("device does not support %d byte blocks\n",
+			EFS_BLOCKSIZE);
+		return -EINVAL;
+	}
+  
+	/* read the vh (volume header) block */
+	bh = sb_bread(s, 0);
+
+	if (!bh) {
+		pr_err("cannot read volume header\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * if this returns zero then we didn't find any partition table.
+	 * this isn't (yet) an error - just assume for the moment that
+	 * the device is valid and go on to search for a superblock.
+	 */
+	sb->fs_start = efs_validate_vh((struct volume_header *) bh->b_data);
+	brelse(bh);
+
+	if (sb->fs_start == -1) {
+		return -EINVAL;
+	}
+
+	bh = sb_bread(s, sb->fs_start + EFS_SUPER);
+	if (!bh) {
+		pr_err("cannot read superblock\n");
+		return -EINVAL;
+	}
+		
+	if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
+#ifdef DEBUG
+		pr_warn("invalid superblock at block %u\n",
+			sb->fs_start + EFS_SUPER);
+#endif
+		brelse(bh);
+		return -EINVAL;
+	}
+	brelse(bh);
+
+	if (!(s->s_flags & MS_RDONLY)) {
+#ifdef DEBUG
+		pr_info("forcing read-only mode\n");
+#endif
+		s->s_flags |= MS_RDONLY;
+	}
+	s->s_op   = &efs_superblock_operations;
+	s->s_export_op = &efs_export_ops;
+	root = efs_iget(s, EFS_ROOTINODE);
+	if (IS_ERR(root)) {
+		pr_err("get root inode failed\n");
+		return PTR_ERR(root);
+	}
+
+	s->s_root = d_make_root(root);
+	if (!(s->s_root)) {
+		pr_err("get root dentry failed\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
+	struct super_block *sb = dentry->d_sb;
+	struct efs_sb_info *sbi = SUPER_INFO(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type    = EFS_SUPER_MAGIC;	/* efs magic number */
+	buf->f_bsize   = EFS_BLOCKSIZE;		/* blocksize */
+	buf->f_blocks  = sbi->total_groups *	/* total data blocks */
+			(sbi->group_size - sbi->inode_blocks);
+	buf->f_bfree   = sbi->data_free;	/* free data blocks */
+	buf->f_bavail  = sbi->data_free;	/* free blocks for non-root */
+	buf->f_files   = sbi->total_groups *	/* total inodes */
+			sbi->inode_blocks *
+			(EFS_BLOCKSIZE / sizeof(struct efs_dinode));
+	buf->f_ffree   = sbi->inode_free;	/* free inodes */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = EFS_MAXNAMELEN;	/* max filename length */
+
+	return 0;
+}
+
diff --git a/kernel/fs/efs/symlink.c b/kernel/fs/efs/symlink.c
new file mode 100644
index 000000000..75117d0da
--- /dev/null
+++ b/kernel/fs/efs/symlink.c
@@ -0,0 +1,54 @@
+/*
+ * symlink.c
+ *
+ * Copyright (c) 1999 Al Smith
+ *
+ * Portions derived from work (c) 1995,1996 Christian Vogelgsang.
+ */
+
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include "efs.h"
+
+static int efs_symlink_readpage(struct file *file, struct page *page)
+{
+	char *link = kmap(page);
+	struct buffer_head * bh;
+	struct inode * inode = page->mapping->host;
+	efs_block_t size = inode->i_size;
+	int err;
+  
+	err = -ENAMETOOLONG;
+	if (size > 2 * EFS_BLOCKSIZE)
+		goto fail;
+  
+	/* read first 512 bytes of link target */
+	err = -EIO;
+	bh = sb_bread(inode->i_sb, efs_bmap(inode, 0));
+	if (!bh)
+		goto fail;
+	memcpy(link, bh->b_data, (size > EFS_BLOCKSIZE) ? EFS_BLOCKSIZE : size);
+	brelse(bh);
+	if (size > EFS_BLOCKSIZE) {
+		bh = sb_bread(inode->i_sb, efs_bmap(inode, 1));
+		if (!bh)
+			goto fail;
+		memcpy(link + EFS_BLOCKSIZE, bh->b_data, size - EFS_BLOCKSIZE);
+		brelse(bh);
+	}
+	link[size] = '\0';
+	SetPageUptodate(page);
+	kunmap(page);
+	unlock_page(page);
+	return 0;
+fail:
+	SetPageError(page);
+	kunmap(page);
+	unlock_page(page);
+	return err;
+}
+
+const struct address_space_operations efs_symlink_aops = {
+	.readpage	= efs_symlink_readpage
+};
diff --git a/kernel/fs/eventfd.c b/kernel/fs/eventfd.c
new file mode 100644
index 000000000..8d0c0df01
--- /dev/null
+++ b/kernel/fs/eventfd.c
@@ -0,0 +1,449 @@
+/*
+ *  fs/eventfd.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+struct eventfd_ctx {
+	struct kref kref;
+	wait_queue_head_t wqh;
+	/*
+	 * Every time that a write(2) is performed on an eventfd, the
+	 * value of the __u64 being written is added to "count" and a
+	 * wakeup is performed on "wqh". A read(2) will return the "count"
+	 * value to userspace, and will reset "count" to zero. The kernel
+	 * side eventfd_signal() also, adds to the "count" counter and
+	 * issue a wakeup.
+	 */
+	__u64 count;
+	unsigned int flags;
+};
+
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns the amount by which the counter was incrememnted.  This will be less
+ * than @n if the counter has overflowed.
+ */
+__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	if (ULLONG_MAX - ctx->count < n)
+		n = ULLONG_MAX - ctx->count;
+	ctx->count += n;
+	if (waitqueue_active(&ctx->wqh))
+		wake_up_locked_poll(&ctx->wqh, POLLIN);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return n;
+}
+EXPORT_SYMBOL_GPL(eventfd_signal);
+
+static void eventfd_free_ctx(struct eventfd_ctx *ctx)
+{
+	kfree(ctx);
+}
+
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	eventfd_free_ctx(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired either
+ * with eventfd_ctx_get() or eventfd_ctx_fdget().
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
+
+static int eventfd_release(struct inode *inode, struct file *file)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
+	return 0;
+}
+
+static unsigned int eventfd_poll(struct file *file, poll_table *wait)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	unsigned int events = 0;
+	u64 count;
+
+	poll_wait(file, &ctx->wqh, wait);
+	smp_rmb();
+	count = ctx->count;
+
+	if (count > 0)
+		events |= POLLIN;
+	if (count == ULLONG_MAX)
+		events |= POLLERR;
+	if (ULLONG_MAX - 1 > count)
+		events |= POLLOUT;
+
+	return events;
+}
+
+static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
+{
+	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+	ctx->count -= *cnt;
+}
+
+/**
+ * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
+ * @ctx: [in] Pointer to eventfd context.
+ * @wait: [in] Wait queue to be removed.
+ * @cnt: [out] Pointer to the 64-bit counter value.
+ *
+ * Returns %0 if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked.
+ *
+ * This is used to atomically remove a wait queue entry from the eventfd wait
+ * queue head, and read/reset the counter value.
+ */
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+				  __u64 *cnt)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	eventfd_ctx_do_read(ctx, cnt);
+	__remove_wait_queue(&ctx->wqh, wait);
+	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
+		wake_up_locked_poll(&ctx->wqh, POLLOUT);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return *cnt != 0 ? 0 : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
+
+/**
+ * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
+ * @ctx: [in] Pointer to eventfd context.
+ * @no_wait: [in] Different from zero if the operation should not block.
+ * @cnt: [out] Pointer to the 64-bit counter value.
+ *
+ * Returns %0 if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
+ * -ERESTARTSYS : A signal interrupted the wait operation.
+ *
+ * If @no_wait is zero, the function might sleep until the eventfd internal
+ * counter becomes greater than zero.
+ */
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
+{
+	ssize_t res;
+	DECLARE_WAITQUEUE(wait, current);
+
+	spin_lock_irq(&ctx->wqh.lock);
+	*cnt = 0;
+	res = -EAGAIN;
+	if (ctx->count > 0)
+		res = 0;
+	else if (!no_wait) {
+		__add_wait_queue(&ctx->wqh, &wait);
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (ctx->count > 0) {
+				res = 0;
+				break;
+			}
+			if (signal_pending(current)) {
+				res = -ERESTARTSYS;
+				break;
+			}
+			spin_unlock_irq(&ctx->wqh.lock);
+			schedule();
+			spin_lock_irq(&ctx->wqh.lock);
+		}
+		__remove_wait_queue(&ctx->wqh, &wait);
+		__set_current_state(TASK_RUNNING);
+	}
+	if (likely(res == 0)) {
+		eventfd_ctx_do_read(ctx, cnt);
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh, POLLOUT);
+	}
+	spin_unlock_irq(&ctx->wqh.lock);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_read);
+
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	__u64 cnt;
+
+	if (count < sizeof(cnt))
+		return -EINVAL;
+	res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
+	if (res < 0)
+		return res;
+
+	return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
+}
+
+static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
+			     loff_t *ppos)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	__u64 ucnt;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (count < sizeof(ucnt))
+		return -EINVAL;
+	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
+		return -EFAULT;
+	if (ucnt == ULLONG_MAX)
+		return -EINVAL;
+	spin_lock_irq(&ctx->wqh.lock);
+	res = -EAGAIN;
+	if (ULLONG_MAX - ctx->count > ucnt)
+		res = sizeof(ucnt);
+	else if (!(file->f_flags & O_NONBLOCK)) {
+		__add_wait_queue(&ctx->wqh, &wait);
+		for (res = 0;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (ULLONG_MAX - ctx->count > ucnt) {
+				res = sizeof(ucnt);
+				break;
+			}
+			if (signal_pending(current)) {
+				res = -ERESTARTSYS;
+				break;
+			}
+			spin_unlock_irq(&ctx->wqh.lock);
+			schedule();
+			spin_lock_irq(&ctx->wqh.lock);
+		}
+		__remove_wait_queue(&ctx->wqh, &wait);
+		__set_current_state(TASK_RUNNING);
+	}
+	if (likely(res > 0)) {
+		ctx->count += ucnt;
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh, POLLIN);
+	}
+	spin_unlock_irq(&ctx->wqh.lock);
+
+	return res;
+}
+
+#ifdef CONFIG_PROC_FS
+static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct eventfd_ctx *ctx = f->private_data;
+
+	spin_lock_irq(&ctx->wqh.lock);
+	seq_printf(m, "eventfd-count: %16llx\n",
+		   (unsigned long long)ctx->count);
+	spin_unlock_irq(&ctx->wqh.lock);
+}
+#endif
+
+static const struct file_operations eventfd_fops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= eventfd_show_fdinfo,
+#endif
+	.release	= eventfd_release,
+	.poll		= eventfd_poll,
+	.read		= eventfd_read,
+	.write		= eventfd_write,
+	.llseek		= noop_llseek,
+};
+
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the eventfd file structure in case of success, or the
+ * following error pointer:
+ *
+ * -EBADF    : Invalid @fd file descriptor.
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
+struct file *eventfd_fget(int fd)
+{
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return ERR_PTR(-EBADF);
+	if (file->f_op != &eventfd_fops) {
+		fput(file);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return file;
+}
+EXPORT_SYMBOL_GPL(eventfd_fget);
+
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointers returned by the following functions:
+ *
+ * eventfd_fget
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct eventfd_ctx *ctx;
+	struct fd f = fdget(fd);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+	ctx = eventfd_ctx_fileget(f.file);
+	fdput(f);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointer:
+ *
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+	if (file->f_op != &eventfd_fops)
+		return ERR_PTR(-EINVAL);
+
+	return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
+
+/**
+ * eventfd_file_create - Creates an eventfd file pointer.
+ * @count: Initial eventfd counter value.
+ * @flags: Flags for the eventfd file.
+ *
+ * This function creates an eventfd file pointer, w/out installing it into
+ * the fd table. This is useful when the eventfd file is used during the
+ * initialization of data structures that require extra setup after the eventfd
+ * creation. So the eventfd creation is split into the file pointer creation
+ * phase, and the file descriptor installation phase.
+ * In this way races with userspace closing the newly installed file descriptor
+ * can be avoided.
+ * Returns an eventfd file pointer, or a proper error pointer.
+ */
+struct file *eventfd_file_create(unsigned int count, int flags)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	/* Check the EFD_* constants for consistency.  */
+	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~EFD_FLAGS_SET)
+		return ERR_PTR(-EINVAL);
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&ctx->kref);
+	init_waitqueue_head(&ctx->wqh);
+	ctx->count = count;
+	ctx->flags = flags;
+
+	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
+				  O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
+	if (IS_ERR(file))
+		eventfd_free_ctx(ctx);
+
+	return file;
+}
+
+SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
+{
+	int fd, error;
+	struct file *file;
+
+	error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
+	if (error < 0)
+		return error;
+	fd = error;
+
+	file = eventfd_file_create(count, flags);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
+	fd_install(fd, file);
+
+	return fd;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+
+	return error;
+}
+
+SYSCALL_DEFINE1(eventfd, unsigned int, count)
+{
+	return sys_eventfd2(count, 0);
+}
+
diff --git a/kernel/fs/eventpoll.c b/kernel/fs/eventpoll.c
new file mode 100644
index 000000000..d0c12504d
--- /dev/null
+++ b/kernel/fs/eventpoll.c
@@ -0,0 +1,2133 @@
+/*
+ *  fs/eventpoll.c (Efficient event retrieval implementation)
+ *  Copyright (C) 2001,...,2009	 Davide Libenzi
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/spinlock.h>
+#include <linux/syscalls.h>
+#include <linux/rbtree.h>
+#include <linux/wait.h>
+#include <linux/eventpoll.h>
+#include <linux/mount.h>
+#include <linux/bitops.h>
+#include <linux/mutex.h>
+#include <linux/anon_inodes.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/mman.h>
+#include <linux/atomic.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/compat.h>
+#include <linux/rculist.h>
+
+/*
+ * LOCKING:
+ * There are three level of locking required by epoll :
+ *
+ * 1) epmutex (mutex)
+ * 2) ep->mtx (mutex)
+ * 3) ep->lock (spinlock)
+ *
+ * The acquire order is the one listed above, from 1 to 3.
+ * We need a spinlock (ep->lock) because we manipulate objects
+ * from inside the poll callback, that might be triggered from
+ * a wake_up() that in turn might be called from IRQ context.
+ * So we can't sleep inside the poll callback and hence we need
+ * a spinlock. During the event transfer loop (from kernel to
+ * user space) we could end up sleeping due a copy_to_user(), so
+ * we need a lock that will allow us to sleep. This lock is a
+ * mutex (ep->mtx). It is acquired during the event transfer loop,
+ * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
+ * Then we also need a global mutex to serialize eventpoll_release_file()
+ * and ep_free().
+ * This mutex is acquired by ep_free() during the epoll file
+ * cleanup path and it is also acquired by eventpoll_release_file()
+ * if a file has been pushed inside an epoll set and it is then
+ * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
+ * It is necessary to acquire multiple "ep->mtx"es at once in the
+ * case when one epoll fd is added to another. In this case, we
+ * always acquire the locks in the order of nesting (i.e. after
+ * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
+ * before e2->mtx). Since we disallow cycles of epoll file
+ * descriptors, this ensures that the mutexes are well-ordered. In
+ * order to communicate this nesting to lockdep, when walking a tree
+ * of epoll file descriptors, we use the current recursion depth as
+ * the lockdep subkey.
+ * It is possible to drop the "ep->mtx" and to use the global
+ * mutex "epmutex" (together with "ep->lock") to have it working,
+ * but having "ep->mtx" will make the interface more scalable.
+ * Events that require holding "epmutex" are very rare, while for
+ * normal operations the epoll private "ep->mtx" will guarantee
+ * a better scalability.
+ */
+
+/* Epoll private bits inside the event mask */
+#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
+
+/* Maximum number of nesting allowed inside epoll sets */
+#define EP_MAX_NESTS 4
+
+#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
+
+#define EP_UNACTIVE_PTR ((void *) -1L)
+
+#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
+
+struct epoll_filefd {
+	struct file *file;
+	int fd;
+} __packed;
+
+/*
+ * Structure used to track possible nested calls, for too deep recursions
+ * and loop cycles.
+ */
+struct nested_call_node {
+	struct list_head llink;
+	void *cookie;
+	void *ctx;
+};
+
+/*
+ * This structure is used as collector for nested calls, to check for
+ * maximum recursion dept and loop cycles.
+ */
+struct nested_calls {
+	struct list_head tasks_call_list;
+	spinlock_t lock;
+};
+
+/*
+ * Each file descriptor added to the eventpoll interface will
+ * have an entry of this type linked to the "rbr" RB tree.
+ * Avoid increasing the size of this struct, there can be many thousands
+ * of these on a server and we do not want this to take another cache line.
+ */
+struct epitem {
+	union {
+		/* RB tree node links this structure to the eventpoll RB tree */
+		struct rb_node rbn;
+		/* Used to free the struct epitem */
+		struct rcu_head rcu;
+	};
+
+	/* List header used to link this structure to the eventpoll ready list */
+	struct list_head rdllink;
+
+	/*
+	 * Works together "struct eventpoll"->ovflist in keeping the
+	 * single linked chain of items.
+	 */
+	struct epitem *next;
+
+	/* The file descriptor information this item refers to */
+	struct epoll_filefd ffd;
+
+	/* Number of active wait queue attached to poll operations */
+	int nwait;
+
+	/* List containing poll wait queues */
+	struct list_head pwqlist;
+
+	/* The "container" of this item */
+	struct eventpoll *ep;
+
+	/* List header used to link this item to the "struct file" items list */
+	struct list_head fllink;
+
+	/* wakeup_source used when EPOLLWAKEUP is set */
+	struct wakeup_source __rcu *ws;
+
+	/* The structure that describe the interested events and the source fd */
+	struct epoll_event event;
+};
+
+/*
+ * This structure is stored inside the "private_data" member of the file
+ * structure and represents the main data structure for the eventpoll
+ * interface.
+ */
+struct eventpoll {
+	/* Protect the access to this structure */
+	spinlock_t lock;
+
+	/*
+	 * This mutex is used to ensure that files are not removed
+	 * while epoll is using them. This is held during the event
+	 * collection loop, the file cleanup path, the epoll file exit
+	 * code and the ctl operations.
+	 */
+	struct mutex mtx;
+
+	/* Wait queue used by sys_epoll_wait() */
+	wait_queue_head_t wq;
+
+	/* Wait queue used by file->poll() */
+	wait_queue_head_t poll_wait;
+
+	/* List of ready file descriptors */
+	struct list_head rdllist;
+
+	/* RB tree root used to store monitored fd structs */
+	struct rb_root rbr;
+
+	/*
+	 * This is a single linked list that chains all the "struct epitem" that
+	 * happened while transferring ready events to userspace w/out
+	 * holding ->lock.
+	 */
+	struct epitem *ovflist;
+
+	/* wakeup_source used when ep_scan_ready_list is running */
+	struct wakeup_source *ws;
+
+	/* The user that created the eventpoll descriptor */
+	struct user_struct *user;
+
+	struct file *file;
+
+	/* used to optimize loop detection check */
+	int visited;
+	struct list_head visited_list_link;
+};
+
+/* Wait structure used by the poll hooks */
+struct eppoll_entry {
+	/* List header used to link this structure to the "struct epitem" */
+	struct list_head llink;
+
+	/* The "base" pointer is set to the container "struct epitem" */
+	struct epitem *base;
+
+	/*
+	 * Wait queue item that will be linked to the target file wait
+	 * queue head.
+	 */
+	wait_queue_t wait;
+
+	/* The wait queue head that linked the "wait" wait queue item */
+	wait_queue_head_t *whead;
+};
+
+/* Wrapper struct used by poll queueing */
+struct ep_pqueue {
+	poll_table pt;
+	struct epitem *epi;
+};
+
+/* Used by the ep_send_events() function as callback private data */
+struct ep_send_events_data {
+	int maxevents;
+	struct epoll_event __user *events;
+};
+
+/*
+ * Configuration options available inside /proc/sys/fs/epoll/
+ */
+/* Maximum number of epoll watched descriptors, per user */
+static long max_user_watches __read_mostly;
+
+/*
+ * This mutex is used to serialize ep_free() and eventpoll_release_file().
+ */
+static DEFINE_MUTEX(epmutex);
+
+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
+/* Used for safe wake up implementation */
+static struct nested_calls poll_safewake_ncalls;
+
+/* Used to call file's f_op->poll() under the nested calls boundaries */
+static struct nested_calls poll_readywalk_ncalls;
+
+/* Slab cache used to allocate "struct epitem" */
+static struct kmem_cache *epi_cache __read_mostly;
+
+/* Slab cache used to allocate "struct eppoll_entry" */
+static struct kmem_cache *pwq_cache __read_mostly;
+
+/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
+static LIST_HEAD(visited_list);
+
+/*
+ * List of files with newly added links, where we may need to limit the number
+ * of emanating paths. Protected by the epmutex.
+ */
+static LIST_HEAD(tfile_check_list);
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static long zero;
+static long long_max = LONG_MAX;
+
+struct ctl_table epoll_table[] = {
+	{
+		.procname	= "max_user_watches",
+		.data		= &max_user_watches,
+		.maxlen		= sizeof(max_user_watches),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &long_max,
+	},
+	{ }
+};
+#endif /* CONFIG_SYSCTL */
+
+static const struct file_operations eventpoll_fops;
+
+static inline int is_file_epoll(struct file *f)
+{
+	return f->f_op == &eventpoll_fops;
+}
+
+/* Setup the structure that is used as key for the RB tree */
+static inline void ep_set_ffd(struct epoll_filefd *ffd,
+			      struct file *file, int fd)
+{
+	ffd->file = file;
+	ffd->fd = fd;
+}
+
+/* Compare RB tree keys */
+static inline int ep_cmp_ffd(struct epoll_filefd *p1,
+			     struct epoll_filefd *p2)
+{
+	return (p1->file > p2->file ? +1:
+	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
+}
+
+/* Tells us if the item is currently linked */
+static inline int ep_is_linked(struct list_head *p)
+{
+	return !list_empty(p);
+}
+
+static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
+{
+	return container_of(p, struct eppoll_entry, wait);
+}
+
+/* Get the "struct epitem" from a wait queue pointer */
+static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
+{
+	return container_of(p, struct eppoll_entry, wait)->base;
+}
+
+/* Get the "struct epitem" from an epoll queue wrapper */
+static inline struct epitem *ep_item_from_epqueue(poll_table *p)
+{
+	return container_of(p, struct ep_pqueue, pt)->epi;
+}
+
+/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
+static inline int ep_op_has_event(int op)
+{
+	return op != EPOLL_CTL_DEL;
+}
+
+/* Initialize the poll safe wake up structure */
+static void ep_nested_calls_init(struct nested_calls *ncalls)
+{
+	INIT_LIST_HEAD(&ncalls->tasks_call_list);
+	spin_lock_init(&ncalls->lock);
+}
+
+/**
+ * ep_events_available - Checks if ready events might be available.
+ *
+ * @ep: Pointer to the eventpoll context.
+ *
+ * Returns: Returns a value different than zero if ready events are available,
+ *          or zero otherwise.
+ */
+static inline int ep_events_available(struct eventpoll *ep)
+{
+	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+}
+
+/**
+ * ep_call_nested - Perform a bound (possibly) nested call, by checking
+ *                  that the recursion limit is not exceeded, and that
+ *                  the same nested call (by the meaning of same cookie) is
+ *                  no re-entered.
+ *
+ * @ncalls: Pointer to the nested_calls structure to be used for this call.
+ * @max_nests: Maximum number of allowed nesting calls.
+ * @nproc: Nested call core function pointer.
+ * @priv: Opaque data to be passed to the @nproc callback.
+ * @cookie: Cookie to be used to identify this nested call.
+ * @ctx: This instance context.
+ *
+ * Returns: Returns the code returned by the @nproc callback, or -1 if
+ *          the maximum recursion limit has been exceeded.
+ */
+static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+			  int (*nproc)(void *, void *, int), void *priv,
+			  void *cookie, void *ctx)
+{
+	int error, call_nests = 0;
+	unsigned long flags;
+	struct list_head *lsthead = &ncalls->tasks_call_list;
+	struct nested_call_node *tncur;
+	struct nested_call_node tnode;
+
+	spin_lock_irqsave(&ncalls->lock, flags);
+
+	/*
+	 * Try to see if the current task is already inside this wakeup call.
+	 * We use a list here, since the population inside this set is always
+	 * very much limited.
+	 */
+	list_for_each_entry(tncur, lsthead, llink) {
+		if (tncur->ctx == ctx &&
+		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
+			/*
+			 * Ops ... loop detected or maximum nest level reached.
+			 * We abort this wake by breaking the cycle itself.
+			 */
+			error = -1;
+			goto out_unlock;
+		}
+	}
+
+	/* Add the current task and cookie to the list */
+	tnode.ctx = ctx;
+	tnode.cookie = cookie;
+	list_add(&tnode.llink, lsthead);
+
+	spin_unlock_irqrestore(&ncalls->lock, flags);
+
+	/* Call the nested function */
+	error = (*nproc)(priv, cookie, call_nests);
+
+	/* Remove the current task from the list */
+	spin_lock_irqsave(&ncalls->lock, flags);
+	list_del(&tnode.llink);
+out_unlock:
+	spin_unlock_irqrestore(&ncalls->lock, flags);
+
+	return error;
+}
+
+/*
+ * As described in commit 0ccf831cb lockdep: annotate epoll
+ * the use of wait queues used by epoll is done in a very controlled
+ * manner. Wake ups can nest inside each other, but are never done
+ * with the same locking. For example:
+ *
+ *   dfd = socket(...);
+ *   efd1 = epoll_create();
+ *   efd2 = epoll_create();
+ *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
+ *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
+ *
+ * When a packet arrives to the device underneath "dfd", the net code will
+ * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
+ * callback wakeup entry on that queue, and the wake_up() performed by the
+ * "dfd" net code will end up in ep_poll_callback(). At this point epoll
+ * (efd1) notices that it may have some event ready, so it needs to wake up
+ * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
+ * that ends up in another wake_up(), after having checked about the
+ * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
+ * avoid stack blasting.
+ *
+ * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
+ * this special case of epoll.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+				     unsigned long events, int subclass)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
+	wake_up_locked_poll(wqueue, events);
+	spin_unlock_irqrestore(&wqueue->lock, flags);
+}
+#else
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+				     unsigned long events, int subclass)
+{
+	wake_up_poll(wqueue, events);
+}
+#endif
+
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
+{
+	ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
+			  1 + call_nests);
+	return 0;
+}
+
+/*
+ * Perform a safe wake up of the poll wait list. The problem is that
+ * with the new callback'd wake up system, it is possible that the
+ * poll callback is reentered from inside the call to wake_up() done
+ * on the poll wait queue head. The rule is that we cannot reenter the
+ * wake up code from the same task more than EP_MAX_NESTS times,
+ * and we cannot reenter the same wait queue head at all. This will
+ * enable to have a hierarchy of epoll file descriptor of no more than
+ * EP_MAX_NESTS deep.
+ */
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+	int this_cpu = get_cpu_light();
+
+	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+
+	put_cpu_light();
+}
+
+static void ep_remove_wait_queue(struct eppoll_entry *pwq)
+{
+	wait_queue_head_t *whead;
+
+	rcu_read_lock();
+	/* If it is cleared by POLLFREE, it should be rcu-safe */
+	whead = rcu_dereference(pwq->whead);
+	if (whead)
+		remove_wait_queue(whead, &pwq->wait);
+	rcu_read_unlock();
+}
+
+/*
+ * This function unregisters poll callbacks from the associated file
+ * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
+ * ep_free).
+ */
+static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
+{
+	struct list_head *lsthead = &epi->pwqlist;
+	struct eppoll_entry *pwq;
+
+	while (!list_empty(lsthead)) {
+		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
+
+		list_del(&pwq->llink);
+		ep_remove_wait_queue(pwq);
+		kmem_cache_free(pwq_cache, pwq);
+	}
+}
+
+/* call only when ep->mtx is held */
+static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
+{
+	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
+}
+
+/* call only when ep->mtx is held */
+static inline void ep_pm_stay_awake(struct epitem *epi)
+{
+	struct wakeup_source *ws = ep_wakeup_source(epi);
+
+	if (ws)
+		__pm_stay_awake(ws);
+}
+
+static inline bool ep_has_wakeup_source(struct epitem *epi)
+{
+	return rcu_access_pointer(epi->ws) ? true : false;
+}
+
+/* call when ep->mtx cannot be held (ep_poll_callback) */
+static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
+{
+	struct wakeup_source *ws;
+
+	rcu_read_lock();
+	ws = rcu_dereference(epi->ws);
+	if (ws)
+		__pm_stay_awake(ws);
+	rcu_read_unlock();
+}
+
+/**
+ * ep_scan_ready_list - Scans the ready list in a way that makes possible for
+ *                      the scan code, to call f_op->poll(). Also allows for
+ *                      O(NumReady) performance.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @sproc: Pointer to the scan callback.
+ * @priv: Private opaque data passed to the @sproc callback.
+ * @depth: The current depth of recursive f_op->poll calls.
+ * @ep_locked: caller already holds ep->mtx
+ *
+ * Returns: The same integer error code returned by the @sproc callback.
+ */
+static int ep_scan_ready_list(struct eventpoll *ep,
+			      int (*sproc)(struct eventpoll *,
+					   struct list_head *, void *),
+			      void *priv, int depth, bool ep_locked)
+{
+	int error, pwake = 0;
+	unsigned long flags;
+	struct epitem *epi, *nepi;
+	LIST_HEAD(txlist);
+
+	/*
+	 * We need to lock this because we could be hit by
+	 * eventpoll_release_file() and epoll_ctl().
+	 */
+
+	if (!ep_locked)
+		mutex_lock_nested(&ep->mtx, depth);
+
+	/*
+	 * Steal the ready list, and re-init the original one to the
+	 * empty list. Also, set ep->ovflist to NULL so that events
+	 * happening while looping w/out locks, are not lost. We cannot
+	 * have the poll callback to queue directly on ep->rdllist,
+	 * because we want the "sproc" callback to be able to do it
+	 * in a lockless way.
+	 */
+	spin_lock_irqsave(&ep->lock, flags);
+	list_splice_init(&ep->rdllist, &txlist);
+	ep->ovflist = NULL;
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	/*
+	 * Now call the callback function.
+	 */
+	error = (*sproc)(ep, &txlist, priv);
+
+	spin_lock_irqsave(&ep->lock, flags);
+	/*
+	 * During the time we spent inside the "sproc" callback, some
+	 * other events might have been queued by the poll callback.
+	 * We re-insert them inside the main ready-list here.
+	 */
+	for (nepi = ep->ovflist; (epi = nepi) != NULL;
+	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+		/*
+		 * We need to check if the item is already in the list.
+		 * During the "sproc" callback execution time, items are
+		 * queued into ->ovflist but the "txlist" might already
+		 * contain them, and the list_splice() below takes care of them.
+		 */
+		if (!ep_is_linked(&epi->rdllink)) {
+			list_add_tail(&epi->rdllink, &ep->rdllist);
+			ep_pm_stay_awake(epi);
+		}
+	}
+	/*
+	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
+	 * releasing the lock, events will be queued in the normal way inside
+	 * ep->rdllist.
+	 */
+	ep->ovflist = EP_UNACTIVE_PTR;
+
+	/*
+	 * Quickly re-inject items left on "txlist".
+	 */
+	list_splice(&txlist, &ep->rdllist);
+	__pm_relax(ep->ws);
+
+	if (!list_empty(&ep->rdllist)) {
+		/*
+		 * Wake up (if active) both the eventpoll wait list and
+		 * the ->poll() wait list (delayed after we release the lock).
+		 */
+		if (waitqueue_active(&ep->wq))
+			wake_up_locked(&ep->wq);
+		if (waitqueue_active(&ep->poll_wait))
+			pwake++;
+	}
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	if (!ep_locked)
+		mutex_unlock(&ep->mtx);
+
+	/* We have to call this outside the lock */
+	if (pwake)
+		ep_poll_safewake(&ep->poll_wait);
+
+	return error;
+}
+
+static void epi_rcu_free(struct rcu_head *head)
+{
+	struct epitem *epi = container_of(head, struct epitem, rcu);
+	kmem_cache_free(epi_cache, epi);
+}
+
+/*
+ * Removes a "struct epitem" from the eventpoll RB tree and deallocates
+ * all the associated resources. Must be called with "mtx" held.
+ */
+static int ep_remove(struct eventpoll *ep, struct epitem *epi)
+{
+	unsigned long flags;
+	struct file *file = epi->ffd.file;
+
+	/*
+	 * Removes poll wait queue hooks. We _have_ to do this without holding
+	 * the "ep->lock" otherwise a deadlock might occur. This because of the
+	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
+	 * queue head lock when unregistering the wait queue. The wakeup callback
+	 * will run by holding the wait queue head lock and will call our callback
+	 * that will try to get "ep->lock".
+	 */
+	ep_unregister_pollwait(ep, epi);
+
+	/* Remove the current item from the list of epoll hooks */
+	spin_lock(&file->f_lock);
+	list_del_rcu(&epi->fllink);
+	spin_unlock(&file->f_lock);
+
+	rb_erase(&epi->rbn, &ep->rbr);
+
+	spin_lock_irqsave(&ep->lock, flags);
+	if (ep_is_linked(&epi->rdllink))
+		list_del_init(&epi->rdllink);
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	wakeup_source_unregister(ep_wakeup_source(epi));
+	/*
+	 * At this point it is safe to free the eventpoll item. Use the union
+	 * field epi->rcu, since we are trying to minimize the size of
+	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
+	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
+	 * use of the rbn field.
+	 */
+	call_rcu(&epi->rcu, epi_rcu_free);
+
+	atomic_long_dec(&ep->user->epoll_watches);
+
+	return 0;
+}
+
+static void ep_free(struct eventpoll *ep)
+{
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	/* We need to release all tasks waiting for these file */
+	if (waitqueue_active(&ep->poll_wait))
+		ep_poll_safewake(&ep->poll_wait);
+
+	/*
+	 * We need to lock this because we could be hit by
+	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
+	 * We do not need to hold "ep->mtx" here because the epoll file
+	 * is on the way to be removed and no one has references to it
+	 * anymore. The only hit might come from eventpoll_release_file() but
+	 * holding "epmutex" is sufficient here.
+	 */
+	mutex_lock(&epmutex);
+
+	/*
+	 * Walks through the whole tree by unregistering poll callbacks.
+	 */
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+
+		ep_unregister_pollwait(ep, epi);
+		cond_resched();
+	}
+
+	/*
+	 * Walks through the whole tree by freeing each "struct epitem". At this
+	 * point we are sure no poll callbacks will be lingering around, and also by
+	 * holding "epmutex" we can be sure that no file cleanup code will hit
+	 * us during this operation. So we can avoid the lock on "ep->lock".
+	 * We do not need to lock ep->mtx, either, we only do it to prevent
+	 * a lockdep warning.
+	 */
+	mutex_lock(&ep->mtx);
+	while ((rbp = rb_first(&ep->rbr)) != NULL) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		ep_remove(ep, epi);
+		cond_resched();
+	}
+	mutex_unlock(&ep->mtx);
+
+	mutex_unlock(&epmutex);
+	mutex_destroy(&ep->mtx);
+	free_uid(ep->user);
+	wakeup_source_unregister(ep->ws);
+	kfree(ep);
+}
+
+static int ep_eventpoll_release(struct inode *inode, struct file *file)
+{
+	struct eventpoll *ep = file->private_data;
+
+	if (ep)
+		ep_free(ep);
+
+	return 0;
+}
+
+static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+{
+	pt->_key = epi->event.events;
+
+	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+}
+
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+			       void *priv)
+{
+	struct epitem *epi, *tmp;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
+
+	list_for_each_entry_safe(epi, tmp, head, rdllink) {
+		if (ep_item_poll(epi, &pt))
+			return POLLIN | POLLRDNORM;
+		else {
+			/*
+			 * Item has been dropped into the ready list by the poll
+			 * callback, but it's not actually ready, as far as
+			 * caller requested events goes. We can remove it here.
+			 */
+			__pm_relax(ep_wakeup_source(epi));
+			list_del_init(&epi->rdllink);
+		}
+	}
+
+	return 0;
+}
+
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+				 poll_table *pt);
+
+struct readyevents_arg {
+	struct eventpoll *ep;
+	bool locked;
+};
+
+static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
+{
+	struct readyevents_arg *arg = priv;
+
+	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
+				  call_nests + 1, arg->locked);
+}
+
+static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
+{
+	int pollflags;
+	struct eventpoll *ep = file->private_data;
+	struct readyevents_arg arg;
+
+	/*
+	 * During ep_insert() we already hold the ep->mtx for the tfile.
+	 * Prevent re-aquisition.
+	 */
+	arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
+	arg.ep = ep;
+
+	/* Insert inside our poll wait queue */
+	poll_wait(file, &ep->poll_wait, wait);
+
+	/*
+	 * Proceed to find out if wanted events are really available inside
+	 * the ready list. This need to be done under ep_call_nested()
+	 * supervision, since the call to f_op->poll() done on listed files
+	 * could re-enter here.
+	 */
+	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
+				   ep_poll_readyevents_proc, &arg, ep, current);
+
+	return pollflags != -1 ? pollflags : 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static void ep_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct eventpoll *ep = f->private_data;
+	struct rb_node *rbp;
+
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
+
+		seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
+			   epi->ffd.fd, epi->event.events,
+			   (long long)epi->event.data);
+		if (seq_has_overflowed(m))
+			break;
+	}
+	mutex_unlock(&ep->mtx);
+}
+#endif
+
+/* File callbacks that implement the eventpoll file behaviour */
+static const struct file_operations eventpoll_fops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= ep_show_fdinfo,
+#endif
+	.release	= ep_eventpoll_release,
+	.poll		= ep_eventpoll_poll,
+	.llseek		= noop_llseek,
+};
+
+/*
+ * This is called from eventpoll_release() to unlink files from the eventpoll
+ * interface. We need to have this facility to cleanup correctly files that are
+ * closed without being removed from the eventpoll interface.
+ */
+void eventpoll_release_file(struct file *file)
+{
+	struct eventpoll *ep;
+	struct epitem *epi, *next;
+
+	/*
+	 * We don't want to get "file->f_lock" because it is not
+	 * necessary. It is not necessary because we're in the "struct file"
+	 * cleanup path, and this means that no one is using this file anymore.
+	 * So, for example, epoll_ctl() cannot hit here since if we reach this
+	 * point, the file counter already went to zero and fget() would fail.
+	 * The only hit might come from ep_free() but by holding the mutex
+	 * will correctly serialize the operation. We do need to acquire
+	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
+	 * from anywhere but ep_free().
+	 *
+	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
+	 */
+	mutex_lock(&epmutex);
+	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
+		ep = epi->ep;
+		mutex_lock_nested(&ep->mtx, 0);
+		ep_remove(ep, epi);
+		mutex_unlock(&ep->mtx);
+	}
+	mutex_unlock(&epmutex);
+}
+
+static int ep_alloc(struct eventpoll **pep)
+{
+	int error;
+	struct user_struct *user;
+	struct eventpoll *ep;
+
+	user = get_current_user();
+	error = -ENOMEM;
+	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	if (unlikely(!ep))
+		goto free_uid;
+
+	spin_lock_init(&ep->lock);
+	mutex_init(&ep->mtx);
+	init_waitqueue_head(&ep->wq);
+	init_waitqueue_head(&ep->poll_wait);
+	INIT_LIST_HEAD(&ep->rdllist);
+	ep->rbr = RB_ROOT;
+	ep->ovflist = EP_UNACTIVE_PTR;
+	ep->user = user;
+
+	*pep = ep;
+
+	return 0;
+
+free_uid:
+	free_uid(user);
+	return error;
+}
+
+/*
+ * Search the file inside the eventpoll tree. The RB tree operations
+ * are protected by the "mtx" mutex, and ep_find() must be called with
+ * "mtx" held.
+ */
+static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
+{
+	int kcmp;
+	struct rb_node *rbp;
+	struct epitem *epi, *epir = NULL;
+	struct epoll_filefd ffd;
+
+	ep_set_ffd(&ffd, file, fd);
+	for (rbp = ep->rbr.rb_node; rbp; ) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
+		if (kcmp > 0)
+			rbp = rbp->rb_right;
+		else if (kcmp < 0)
+			rbp = rbp->rb_left;
+		else {
+			epir = epi;
+			break;
+		}
+	}
+
+	return epir;
+}
+
+/*
+ * This is the callback that is passed to the wait queue wakeup
+ * mechanism. It is called by the stored file descriptors when they
+ * have events to report.
+ */
+static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	int pwake = 0;
+	unsigned long flags;
+	struct epitem *epi = ep_item_from_wait(wait);
+	struct eventpoll *ep = epi->ep;
+
+	if ((unsigned long)key & POLLFREE) {
+		ep_pwq_from_wait(wait)->whead = NULL;
+		/*
+		 * whead = NULL above can race with ep_remove_wait_queue()
+		 * which can do another remove_wait_queue() after us, so we
+		 * can't use __remove_wait_queue(). whead->lock is held by
+		 * the caller.
+		 */
+		list_del_init(&wait->task_list);
+	}
+
+	spin_lock_irqsave(&ep->lock, flags);
+
+	/*
+	 * If the event mask does not contain any poll(2) event, we consider the
+	 * descriptor to be disabled. This condition is likely the effect of the
+	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
+	 * until the next EPOLL_CTL_MOD will be issued.
+	 */
+	if (!(epi->event.events & ~EP_PRIVATE_BITS))
+		goto out_unlock;
+
+	/*
+	 * Check the events coming with the callback. At this stage, not
+	 * every device reports the events in the "key" parameter of the
+	 * callback. We need to be able to handle both cases here, hence the
+	 * test for "key" != NULL before the event match test.
+	 */
+	if (key && !((unsigned long) key & epi->event.events))
+		goto out_unlock;
+
+	/*
+	 * If we are transferring events to userspace, we can hold no locks
+	 * (because we're accessing user memory, and because of linux f_op->poll()
+	 * semantics). All the events that happen during that period of time are
+	 * chained in ep->ovflist and requeued later on.
+	 */
+	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
+		if (epi->next == EP_UNACTIVE_PTR) {
+			epi->next = ep->ovflist;
+			ep->ovflist = epi;
+			if (epi->ws) {
+				/*
+				 * Activate ep->ws since epi->ws may get
+				 * deactivated at any time.
+				 */
+				__pm_stay_awake(ep->ws);
+			}
+
+		}
+		goto out_unlock;
+	}
+
+	/* If this file is already in the ready list we exit soon */
+	if (!ep_is_linked(&epi->rdllink)) {
+		list_add_tail(&epi->rdllink, &ep->rdllist);
+		ep_pm_stay_awake_rcu(epi);
+	}
+
+	/*
+	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
+	 * wait list.
+	 */
+	if (waitqueue_active(&ep->wq))
+		wake_up_locked(&ep->wq);
+	if (waitqueue_active(&ep->poll_wait))
+		pwake++;
+
+out_unlock:
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	/* We have to call this outside the lock */
+	if (pwake)
+		ep_poll_safewake(&ep->poll_wait);
+
+	return 1;
+}
+
+/*
+ * This is the callback that is used to add our wait queue to the
+ * target file wakeup lists.
+ */
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+				 poll_table *pt)
+{
+	struct epitem *epi = ep_item_from_epqueue(pt);
+	struct eppoll_entry *pwq;
+
+	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
+		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
+		pwq->whead = whead;
+		pwq->base = epi;
+		add_wait_queue(whead, &pwq->wait);
+		list_add_tail(&pwq->llink, &epi->pwqlist);
+		epi->nwait++;
+	} else {
+		/* We have to signal that an error occurred */
+		epi->nwait = -1;
+	}
+}
+
+static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
+{
+	int kcmp;
+	struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
+	struct epitem *epic;
+
+	while (*p) {
+		parent = *p;
+		epic = rb_entry(parent, struct epitem, rbn);
+		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
+		if (kcmp > 0)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&epi->rbn, parent, p);
+	rb_insert_color(&epi->rbn, &ep->rbr);
+}
+
+
+
+#define PATH_ARR_SIZE 5
+/*
+ * These are the number paths of length 1 to 5, that we are allowing to emanate
+ * from a single file of interest. For example, we allow 1000 paths of length
+ * 1, to emanate from each file of interest. This essentially represents the
+ * potential wakeup paths, which need to be limited in order to avoid massive
+ * uncontrolled wakeup storms. The common use case should be a single ep which
+ * is connected to n file sources. In this case each file source has 1 path
+ * of length 1. Thus, the numbers below should be more than sufficient. These
+ * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
+ * and delete can't add additional paths. Protected by the epmutex.
+ */
+static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
+static int path_count[PATH_ARR_SIZE];
+
+static int path_count_inc(int nests)
+{
+	/* Allow an arbitrary number of depth 1 paths */
+	if (nests == 0)
+		return 0;
+
+	if (++path_count[nests] > path_limits[nests])
+		return -1;
+	return 0;
+}
+
+static void path_count_init(void)
+{
+	int i;
+
+	for (i = 0; i < PATH_ARR_SIZE; i++)
+		path_count[i] = 0;
+}
+
+static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
+{
+	int error = 0;
+	struct file *file = priv;
+	struct file *child_file;
+	struct epitem *epi;
+
+	/* CTL_DEL can remove links here, but that can't increase our count */
+	rcu_read_lock();
+	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
+		child_file = epi->ep->file;
+		if (is_file_epoll(child_file)) {
+			if (list_empty(&child_file->f_ep_links)) {
+				if (path_count_inc(call_nests)) {
+					error = -1;
+					break;
+				}
+			} else {
+				error = ep_call_nested(&poll_loop_ncalls,
+							EP_MAX_NESTS,
+							reverse_path_check_proc,
+							child_file, child_file,
+							current);
+			}
+			if (error != 0)
+				break;
+		} else {
+			printk(KERN_ERR "reverse_path_check_proc: "
+				"file is not an ep!\n");
+		}
+	}
+	rcu_read_unlock();
+	return error;
+}
+
+/**
+ * reverse_path_check - The tfile_check_list is list of file *, which have
+ *                      links that are proposed to be newly added. We need to
+ *                      make sure that those added links don't add too many
+ *                      paths such that we will spend all our time waking up
+ *                      eventpoll objects.
+ *
+ * Returns: Returns zero if the proposed links don't create too many paths,
+ *	    -1 otherwise.
+ */
+static int reverse_path_check(void)
+{
+	int error = 0;
+	struct file *current_file;
+
+	/* let's call this for all tfiles */
+	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
+		path_count_init();
+		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+					reverse_path_check_proc, current_file,
+					current_file, current);
+		if (error)
+			break;
+	}
+	return error;
+}
+
+static int ep_create_wakeup_source(struct epitem *epi)
+{
+	const char *name;
+	struct wakeup_source *ws;
+
+	if (!epi->ep->ws) {
+		epi->ep->ws = wakeup_source_register("eventpoll");
+		if (!epi->ep->ws)
+			return -ENOMEM;
+	}
+
+	name = epi->ffd.file->f_path.dentry->d_name.name;
+	ws = wakeup_source_register(name);
+
+	if (!ws)
+		return -ENOMEM;
+	rcu_assign_pointer(epi->ws, ws);
+
+	return 0;
+}
+
+/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
+static noinline void ep_destroy_wakeup_source(struct epitem *epi)
+{
+	struct wakeup_source *ws = ep_wakeup_source(epi);
+
+	RCU_INIT_POINTER(epi->ws, NULL);
+
+	/*
+	 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
+	 * used internally by wakeup_source_remove, too (called by
+	 * wakeup_source_unregister), so we cannot use call_rcu
+	 */
+	synchronize_rcu();
+	wakeup_source_unregister(ws);
+}
+
+/*
+ * Must be called with "mtx" held.
+ */
+static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+		     struct file *tfile, int fd, int full_check)
+{
+	int error, revents, pwake = 0;
+	unsigned long flags;
+	long user_watches;
+	struct epitem *epi;
+	struct ep_pqueue epq;
+
+	user_watches = atomic_long_read(&ep->user->epoll_watches);
+	if (unlikely(user_watches >= max_user_watches))
+		return -ENOSPC;
+	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
+		return -ENOMEM;
+
+	/* Item initialization follow here ... */
+	INIT_LIST_HEAD(&epi->rdllink);
+	INIT_LIST_HEAD(&epi->fllink);
+	INIT_LIST_HEAD(&epi->pwqlist);
+	epi->ep = ep;
+	ep_set_ffd(&epi->ffd, tfile, fd);
+	epi->event = *event;
+	epi->nwait = 0;
+	epi->next = EP_UNACTIVE_PTR;
+	if (epi->event.events & EPOLLWAKEUP) {
+		error = ep_create_wakeup_source(epi);
+		if (error)
+			goto error_create_wakeup_source;
+	} else {
+		RCU_INIT_POINTER(epi->ws, NULL);
+	}
+
+	/* Initialize the poll table using the queue callback */
+	epq.epi = epi;
+	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
+
+	/*
+	 * Attach the item to the poll hooks and get current event bits.
+	 * We can safely use the file* here because its usage count has
+	 * been increased by the caller of this function. Note that after
+	 * this operation completes, the poll callback can start hitting
+	 * the new item.
+	 */
+	revents = ep_item_poll(epi, &epq.pt);
+
+	/*
+	 * We have to check if something went wrong during the poll wait queue
+	 * install process. Namely an allocation for a wait queue failed due
+	 * high memory pressure.
+	 */
+	error = -ENOMEM;
+	if (epi->nwait < 0)
+		goto error_unregister;
+
+	/* Add the current item to the list of active epoll hook for this file */
+	spin_lock(&tfile->f_lock);
+	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
+	spin_unlock(&tfile->f_lock);
+
+	/*
+	 * Add the current item to the RB tree. All RB tree operations are
+	 * protected by "mtx", and ep_insert() is called with "mtx" held.
+	 */
+	ep_rbtree_insert(ep, epi);
+
+	/* now check if we've created too many backpaths */
+	error = -EINVAL;
+	if (full_check && reverse_path_check())
+		goto error_remove_epi;
+
+	/* We have to drop the new item inside our item list to keep track of it */
+	spin_lock_irqsave(&ep->lock, flags);
+
+	/* If the file is already "ready" we drop it inside the ready list */
+	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
+		list_add_tail(&epi->rdllink, &ep->rdllist);
+		ep_pm_stay_awake(epi);
+
+		/* Notify waiting tasks that events are available */
+		if (waitqueue_active(&ep->wq))
+			wake_up_locked(&ep->wq);
+		if (waitqueue_active(&ep->poll_wait))
+			pwake++;
+	}
+
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	atomic_long_inc(&ep->user->epoll_watches);
+
+	/* We have to call this outside the lock */
+	if (pwake)
+		ep_poll_safewake(&ep->poll_wait);
+
+	return 0;
+
+error_remove_epi:
+	spin_lock(&tfile->f_lock);
+	list_del_rcu(&epi->fllink);
+	spin_unlock(&tfile->f_lock);
+
+	rb_erase(&epi->rbn, &ep->rbr);
+
+error_unregister:
+	ep_unregister_pollwait(ep, epi);
+
+	/*
+	 * We need to do this because an event could have been arrived on some
+	 * allocated wait queue. Note that we don't care about the ep->ovflist
+	 * list, since that is used/cleaned only inside a section bound by "mtx".
+	 * And ep_insert() is called with "mtx" held.
+	 */
+	spin_lock_irqsave(&ep->lock, flags);
+	if (ep_is_linked(&epi->rdllink))
+		list_del_init(&epi->rdllink);
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	wakeup_source_unregister(ep_wakeup_source(epi));
+
+error_create_wakeup_source:
+	kmem_cache_free(epi_cache, epi);
+
+	return error;
+}
+
+/*
+ * Modify the interest event mask by dropping an event if the new mask
+ * has a match in the current file status. Must be called with "mtx" held.
+ */
+static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
+{
+	int pwake = 0;
+	unsigned int revents;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
+
+	/*
+	 * Set the new event interest mask before calling f_op->poll();
+	 * otherwise we might miss an event that happens between the
+	 * f_op->poll() call and the new event set registering.
+	 */
+	epi->event.events = event->events; /* need barrier below */
+	epi->event.data = event->data; /* protected by mtx */
+	if (epi->event.events & EPOLLWAKEUP) {
+		if (!ep_has_wakeup_source(epi))
+			ep_create_wakeup_source(epi);
+	} else if (ep_has_wakeup_source(epi)) {
+		ep_destroy_wakeup_source(epi);
+	}
+
+	/*
+	 * The following barrier has two effects:
+	 *
+	 * 1) Flush epi changes above to other CPUs.  This ensures
+	 *    we do not miss events from ep_poll_callback if an
+	 *    event occurs immediately after we call f_op->poll().
+	 *    We need this because we did not take ep->lock while
+	 *    changing epi above (but ep_poll_callback does take
+	 *    ep->lock).
+	 *
+	 * 2) We also need to ensure we do not miss _past_ events
+	 *    when calling f_op->poll().  This barrier also
+	 *    pairs with the barrier in wq_has_sleeper (see
+	 *    comments for wq_has_sleeper).
+	 *
+	 * This barrier will now guarantee ep_poll_callback or f_op->poll
+	 * (or both) will notice the readiness of an item.
+	 */
+	smp_mb();
+
+	/*
+	 * Get current event bits. We can safely use the file* here because
+	 * its usage count has been increased by the caller of this function.
+	 */
+	revents = ep_item_poll(epi, &pt);
+
+	/*
+	 * If the item is "hot" and it is not registered inside the ready
+	 * list, push it inside.
+	 */
+	if (revents & event->events) {
+		spin_lock_irq(&ep->lock);
+		if (!ep_is_linked(&epi->rdllink)) {
+			list_add_tail(&epi->rdllink, &ep->rdllist);
+			ep_pm_stay_awake(epi);
+
+			/* Notify waiting tasks that events are available */
+			if (waitqueue_active(&ep->wq))
+				wake_up_locked(&ep->wq);
+			if (waitqueue_active(&ep->poll_wait))
+				pwake++;
+		}
+		spin_unlock_irq(&ep->lock);
+	}
+
+	/* We have to call this outside the lock */
+	if (pwake)
+		ep_poll_safewake(&ep->poll_wait);
+
+	return 0;
+}
+
+static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
+			       void *priv)
+{
+	struct ep_send_events_data *esed = priv;
+	int eventcnt;
+	unsigned int revents;
+	struct epitem *epi;
+	struct epoll_event __user *uevent;
+	struct wakeup_source *ws;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
+
+	/*
+	 * We can loop without lock because we are passed a task private list.
+	 * Items cannot vanish during the loop because ep_scan_ready_list() is
+	 * holding "mtx" during this call.
+	 */
+	for (eventcnt = 0, uevent = esed->events;
+	     !list_empty(head) && eventcnt < esed->maxevents;) {
+		epi = list_first_entry(head, struct epitem, rdllink);
+
+		/*
+		 * Activate ep->ws before deactivating epi->ws to prevent
+		 * triggering auto-suspend here (in case we reactive epi->ws
+		 * below).
+		 *
+		 * This could be rearranged to delay the deactivation of epi->ws
+		 * instead, but then epi->ws would temporarily be out of sync
+		 * with ep_is_linked().
+		 */
+		ws = ep_wakeup_source(epi);
+		if (ws) {
+			if (ws->active)
+				__pm_stay_awake(ep->ws);
+			__pm_relax(ws);
+		}
+
+		list_del_init(&epi->rdllink);
+
+		revents = ep_item_poll(epi, &pt);
+
+		/*
+		 * If the event mask intersect the caller-requested one,
+		 * deliver the event to userspace. Again, ep_scan_ready_list()
+		 * is holding "mtx", so no operations coming from userspace
+		 * can change the item.
+		 */
+		if (revents) {
+			if (__put_user(revents, &uevent->events) ||
+			    __put_user(epi->event.data, &uevent->data)) {
+				list_add(&epi->rdllink, head);
+				ep_pm_stay_awake(epi);
+				return eventcnt ? eventcnt : -EFAULT;
+			}
+			eventcnt++;
+			uevent++;
+			if (epi->event.events & EPOLLONESHOT)
+				epi->event.events &= EP_PRIVATE_BITS;
+			else if (!(epi->event.events & EPOLLET)) {
+				/*
+				 * If this file has been added with Level
+				 * Trigger mode, we need to insert back inside
+				 * the ready list, so that the next call to
+				 * epoll_wait() will check again the events
+				 * availability. At this point, no one can insert
+				 * into ep->rdllist besides us. The epoll_ctl()
+				 * callers are locked out by
+				 * ep_scan_ready_list() holding "mtx" and the
+				 * poll callback will queue them in ep->ovflist.
+				 */
+				list_add_tail(&epi->rdllink, &ep->rdllist);
+				ep_pm_stay_awake(epi);
+			}
+		}
+	}
+
+	return eventcnt;
+}
+
+static int ep_send_events(struct eventpoll *ep,
+			  struct epoll_event __user *events, int maxevents)
+{
+	struct ep_send_events_data esed;
+
+	esed.maxevents = maxevents;
+	esed.events = events;
+
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
+}
+
+static inline struct timespec ep_set_mstimeout(long ms)
+{
+	struct timespec now, ts = {
+		.tv_sec = ms / MSEC_PER_SEC,
+		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
+	};
+
+	ktime_get_ts(&now);
+	return timespec_add_safe(now, ts);
+}
+
+/**
+ * ep_poll - Retrieves ready events, and delivers them to the caller supplied
+ *           event buffer.
+ *
+ * @ep: Pointer to the eventpoll context.
+ * @events: Pointer to the userspace buffer where the ready events should be
+ *          stored.
+ * @maxevents: Size (in terms of number of events) of the caller event buffer.
+ * @timeout: Maximum timeout for the ready events fetch operation, in
+ *           milliseconds. If the @timeout is zero, the function will not block,
+ *           while if the @timeout is less than zero, the function will block
+ *           until at least one event has been retrieved (or an error
+ *           occurred).
+ *
+ * Returns: Returns the number of ready events which have been fetched, or an
+ *          error code, in case of error.
+ */
+static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
+		   int maxevents, long timeout)
+{
+	int res = 0, eavail, timed_out = 0;
+	unsigned long flags;
+	long slack = 0;
+	wait_queue_t wait;
+	ktime_t expires, *to = NULL;
+
+	if (timeout > 0) {
+		struct timespec end_time = ep_set_mstimeout(timeout);
+
+		slack = select_estimate_accuracy(&end_time);
+		to = &expires;
+		*to = timespec_to_ktime(end_time);
+	} else if (timeout == 0) {
+		/*
+		 * Avoid the unnecessary trip to the wait queue loop, if the
+		 * caller specified a non blocking operation.
+		 */
+		timed_out = 1;
+		spin_lock_irqsave(&ep->lock, flags);
+		goto check_events;
+	}
+
+fetch_events:
+	spin_lock_irqsave(&ep->lock, flags);
+
+	if (!ep_events_available(ep)) {
+		/*
+		 * We don't have any available event to return to the caller.
+		 * We need to sleep here, and we will be wake up by
+		 * ep_poll_callback() when events will become available.
+		 */
+		init_waitqueue_entry(&wait, current);
+		__add_wait_queue_exclusive(&ep->wq, &wait);
+
+		for (;;) {
+			/*
+			 * We don't want to sleep if the ep_poll_callback() sends us
+			 * a wakeup in between. That's why we set the task state
+			 * to TASK_INTERRUPTIBLE before doing the checks.
+			 */
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (ep_events_available(ep) || timed_out)
+				break;
+			if (signal_pending(current)) {
+				res = -EINTR;
+				break;
+			}
+
+			spin_unlock_irqrestore(&ep->lock, flags);
+			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+				timed_out = 1;
+
+			spin_lock_irqsave(&ep->lock, flags);
+		}
+
+		__remove_wait_queue(&ep->wq, &wait);
+		__set_current_state(TASK_RUNNING);
+	}
+check_events:
+	/* Is it worth to try to dig for events ? */
+	eavail = ep_events_available(ep);
+
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	/*
+	 * Try to transfer events to user space. In case we get 0 events and
+	 * there's still timeout left over, we go trying again in search of
+	 * more luck.
+	 */
+	if (!res && eavail &&
+	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
+		goto fetch_events;
+
+	return res;
+}
+
+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ *                      API, to verify that adding an epoll file inside another
+ *                      epoll structure, does not violate the constraints, in
+ *                      terms of closed loops, or too deep chains (which can
+ *                      result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ *          data structure pointer.
+ * @call_nests: Current dept of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+	int error = 0;
+	struct file *file = priv;
+	struct eventpoll *ep = file->private_data;
+	struct eventpoll *ep_tovisit;
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	mutex_lock_nested(&ep->mtx, call_nests + 1);
+	ep->visited = 1;
+	list_add(&ep->visited_list_link, &visited_list);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (unlikely(is_file_epoll(epi->ffd.file))) {
+			ep_tovisit = epi->ffd.file->private_data;
+			if (ep_tovisit->visited)
+				continue;
+			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+					ep_loop_check_proc, epi->ffd.file,
+					ep_tovisit, current);
+			if (error != 0)
+				break;
+		} else {
+			/*
+			 * If we've reached a file that is not associated with
+			 * an ep, then we need to check if the newly added
+			 * links are going to add too many wakeup paths. We do
+			 * this by adding it to the tfile_check_list, if it's
+			 * not already there, and calling reverse_path_check()
+			 * during ep_insert().
+			 */
+			if (list_empty(&epi->ffd.file->f_tfile_llink))
+				list_add(&epi->ffd.file->f_tfile_llink,
+					 &tfile_check_list);
+		}
+	}
+	mutex_unlock(&ep->mtx);
+
+	return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ *                 another epoll file (represented by @ep) does not create
+ *                 closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+	int ret;
+	struct eventpoll *ep_cur, *ep_next;
+
+	ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+			      ep_loop_check_proc, file, ep, current);
+	/* clear visited list */
+	list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
+							visited_list_link) {
+		ep_cur->visited = 0;
+		list_del(&ep_cur->visited_list_link);
+	}
+	return ret;
+}
+
+static void clear_tfile_check_list(void)
+{
+	struct file *file;
+
+	/* first clear the tfile_check_list */
+	while (!list_empty(&tfile_check_list)) {
+		file = list_first_entry(&tfile_check_list, struct file,
+					f_tfile_llink);
+		list_del_init(&file->f_tfile_llink);
+	}
+	INIT_LIST_HEAD(&tfile_check_list);
+}
+
+/*
+ * Open an eventpoll file descriptor.
+ */
+SYSCALL_DEFINE1(epoll_create1, int, flags)
+{
+	int error, fd;
+	struct eventpoll *ep = NULL;
+	struct file *file;
+
+	/* Check the EPOLL_* constant for consistency.  */
+	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
+
+	if (flags & ~EPOLL_CLOEXEC)
+		return -EINVAL;
+	/*
+	 * Create the internal data structure ("struct eventpoll").
+	 */
+	error = ep_alloc(&ep);
+	if (error < 0)
+		return error;
+	/*
+	 * Creates all the items needed to setup an eventpoll file. That is,
+	 * a file structure and a free file descriptor.
+	 */
+	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
+	if (fd < 0) {
+		error = fd;
+		goto out_free_ep;
+	}
+	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
+				 O_RDWR | (flags & O_CLOEXEC));
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto out_free_fd;
+	}
+	ep->file = file;
+	fd_install(fd, file);
+	return fd;
+
+out_free_fd:
+	put_unused_fd(fd);
+out_free_ep:
+	ep_free(ep);
+	return error;
+}
+
+SYSCALL_DEFINE1(epoll_create, int, size)
+{
+	if (size <= 0)
+		return -EINVAL;
+
+	return sys_epoll_create1(0);
+}
+
+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+		struct epoll_event __user *, event)
+{
+	int error;
+	int full_check = 0;
+	struct fd f, tf;
+	struct eventpoll *ep;
+	struct epitem *epi;
+	struct epoll_event epds;
+	struct eventpoll *tep = NULL;
+
+	error = -EFAULT;
+	if (ep_op_has_event(op) &&
+	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
+		goto error_return;
+
+	error = -EBADF;
+	f = fdget(epfd);
+	if (!f.file)
+		goto error_return;
+
+	/* Get the "struct file *" for the target file */
+	tf = fdget(fd);
+	if (!tf.file)
+		goto error_fput;
+
+	/* The target file descriptor must support poll */
+	error = -EPERM;
+	if (!tf.file->f_op->poll)
+		goto error_tgt_fput;
+
+	/* Check if EPOLLWAKEUP is allowed */
+	if (ep_op_has_event(op))
+		ep_take_care_of_epollwakeup(&epds);
+
+	/*
+	 * We have to check that the file structure underneath the file descriptor
+	 * the user passed to us _is_ an eventpoll file. And also we do not permit
+	 * adding an epoll file descriptor inside itself.
+	 */
+	error = -EINVAL;
+	if (f.file == tf.file || !is_file_epoll(f.file))
+		goto error_tgt_fput;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = f.file->private_data;
+
+	/*
+	 * When we insert an epoll file descriptor, inside another epoll file
+	 * descriptor, there is the change of creating closed loops, which are
+	 * better be handled here, than in more critical paths. While we are
+	 * checking for loops we also determine the list of files reachable
+	 * and hang them on the tfile_check_list, so we can check that we
+	 * haven't created too many possible wakeup paths.
+	 *
+	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
+	 * the epoll file descriptor is attaching directly to a wakeup source,
+	 * unless the epoll file descriptor is nested. The purpose of taking the
+	 * 'epmutex' on add is to prevent complex toplogies such as loops and
+	 * deep wakeup paths from forming in parallel through multiple
+	 * EPOLL_CTL_ADD operations.
+	 */
+	mutex_lock_nested(&ep->mtx, 0);
+	if (op == EPOLL_CTL_ADD) {
+		if (!list_empty(&f.file->f_ep_links) ||
+						is_file_epoll(tf.file)) {
+			full_check = 1;
+			mutex_unlock(&ep->mtx);
+			mutex_lock(&epmutex);
+			if (is_file_epoll(tf.file)) {
+				error = -ELOOP;
+				if (ep_loop_check(ep, tf.file) != 0) {
+					clear_tfile_check_list();
+					goto error_tgt_fput;
+				}
+			} else
+				list_add(&tf.file->f_tfile_llink,
+							&tfile_check_list);
+			mutex_lock_nested(&ep->mtx, 0);
+			if (is_file_epoll(tf.file)) {
+				tep = tf.file->private_data;
+				mutex_lock_nested(&tep->mtx, 1);
+			}
+		}
+	}
+
+	/*
+	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
+	 * above, we can be sure to be able to use the item looked up by
+	 * ep_find() till we release the mutex.
+	 */
+	epi = ep_find(ep, tf.file, fd);
+
+	error = -EINVAL;
+	switch (op) {
+	case EPOLL_CTL_ADD:
+		if (!epi) {
+			epds.events |= POLLERR | POLLHUP;
+			error = ep_insert(ep, &epds, tf.file, fd, full_check);
+		} else
+			error = -EEXIST;
+		if (full_check)
+			clear_tfile_check_list();
+		break;
+	case EPOLL_CTL_DEL:
+		if (epi)
+			error = ep_remove(ep, epi);
+		else
+			error = -ENOENT;
+		break;
+	case EPOLL_CTL_MOD:
+		if (epi) {
+			epds.events |= POLLERR | POLLHUP;
+			error = ep_modify(ep, epi, &epds);
+		} else
+			error = -ENOENT;
+		break;
+	}
+	if (tep != NULL)
+		mutex_unlock(&tep->mtx);
+	mutex_unlock(&ep->mtx);
+
+error_tgt_fput:
+	if (full_check)
+		mutex_unlock(&epmutex);
+
+	fdput(tf);
+error_fput:
+	fdput(f);
+error_return:
+
+	return error;
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_wait(2).
+ */
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
+{
+	int error;
+	struct fd f;
+	struct eventpoll *ep;
+
+	/* The maximum number of event must be greater than zero */
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+		return -EINVAL;
+
+	/* Verify that the area passed by the user is writeable */
+	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
+		return -EFAULT;
+
+	/* Get the "struct file *" for the eventpoll file */
+	f = fdget(epfd);
+	if (!f.file)
+		return -EBADF;
+
+	/*
+	 * We have to check that the file structure underneath the fd
+	 * the user passed to us _is_ an eventpoll file.
+	 */
+	error = -EINVAL;
+	if (!is_file_epoll(f.file))
+		goto error_fput;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = f.file->private_data;
+
+	/* Time to fish for events ... */
+	error = ep_poll(ep, events, maxevents, timeout);
+
+error_fput:
+	fdput(f);
+	return error;
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
+{
+	int error;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
+	}
+
+	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (error == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+			       sizeof(sigsaved));
+			set_restore_sigmask();
+		} else
+			set_current_blocked(&sigsaved);
+	}
+
+	return error;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+			struct epoll_event __user *, events,
+			int, maxevents, int, timeout,
+			const compat_sigset_t __user *, sigmask,
+			compat_size_t, sigsetsize)
+{
+	long err;
+	compat_sigset_t csigmask;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &csigmask);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
+	}
+
+	err = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (err == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+			       sizeof(sigsaved));
+			set_restore_sigmask();
+		} else
+			set_current_blocked(&sigsaved);
+	}
+
+	return err;
+}
+#endif
+
+static int __init eventpoll_init(void)
+{
+	struct sysinfo si;
+
+	si_meminfo(&si);
+	/*
+	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
+	 */
+	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
+		EP_ITEM_COST;
+	BUG_ON(max_user_watches < 0);
+
+	/*
+	 * Initialize the structure used to perform epoll file descriptor
+	 * inclusion loops checks.
+	 */
+	ep_nested_calls_init(&poll_loop_ncalls);
+
+	/* Initialize the structure used to perform safe poll wait head wake ups */
+	ep_nested_calls_init(&poll_safewake_ncalls);
+
+	/* Initialize the structure used to perform file's f_op->poll() calls */
+	ep_nested_calls_init(&poll_readywalk_ncalls);
+
+	/*
+	 * We can have many thousands of epitems, so prevent this from
+	 * using an extra cache line on 64-bit (and smaller) CPUs
+	 */
+	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
+
+	/* Allocates slab cache used to allocate "struct epitem" items */
+	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+	/* Allocates slab cache used to allocate "struct eppoll_entry" */
+	pwq_cache = kmem_cache_create("eventpoll_pwq",
+			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
+
+	return 0;
+}
+fs_initcall(eventpoll_init);
diff --git a/kernel/fs/exec.c b/kernel/fs/exec.c
new file mode 100644
index 000000000..0e7125be0
--- /dev/null
+++ b/kernel/fs/exec.c
@@ -0,0 +1,1747 @@
+/*
+ *  linux/fs/exec.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * #!-checking implemented by tytso.
+ */
+/*
+ * Demand-loading implemented 01.12.91 - no need to read anything but
+ * the header into memory. The inode of the executable is put into
+ * "current->executable", and page faults do the actual loading. Clean.
+ *
+ * Once more I can proudly say that linux stood up to being changed: it
+ * was less than 2 hours work to get demand-loading completely implemented.
+ *
+ * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
+ * current->executable is only used by the procfs.  This allows a dispatch
+ * table to check for several different types  of binary formats.  We keep
+ * trying until we recognize the file or we run out of supported binary
+ * formats. 
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/swap.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/perf_event.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/key.h>
+#include <linux/personality.h>
+#include <linux/binfmts.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/tsacct_kern.h>
+#include <linux/cn_proc.h>
+#include <linux/audit.h>
+#include <linux/tracehook.h>
+#include <linux/kmod.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
+#include <linux/compat.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/tlb.h>
+
+#include <trace/events/task.h>
+#include "internal.h"
+
+#include <trace/events/sched.h>
+
+int suid_dumpable = 0;
+
+static LIST_HEAD(formats);
+static DEFINE_RWLOCK(binfmt_lock);
+
+void __register_binfmt(struct linux_binfmt * fmt, int insert)
+{
+	BUG_ON(!fmt);
+	if (WARN_ON(!fmt->load_binary))
+		return;
+	write_lock(&binfmt_lock);
+	insert ? list_add(&fmt->lh, &formats) :
+		 list_add_tail(&fmt->lh, &formats);
+	write_unlock(&binfmt_lock);
+}
+
+EXPORT_SYMBOL(__register_binfmt);
+
+void unregister_binfmt(struct linux_binfmt * fmt)
+{
+	write_lock(&binfmt_lock);
+	list_del(&fmt->lh);
+	write_unlock(&binfmt_lock);
+}
+
+EXPORT_SYMBOL(unregister_binfmt);
+
+static inline void put_binfmt(struct linux_binfmt * fmt)
+{
+	module_put(fmt->module);
+}
+
+#ifdef CONFIG_USELIB
+/*
+ * Note that a shared library must be both readable and executable due to
+ * security reasons.
+ *
+ * Also note that we take the address to load from from the file itself.
+ */
+SYSCALL_DEFINE1(uselib, const char __user *, library)
+{
+	struct linux_binfmt *fmt;
+	struct file *file;
+	struct filename *tmp = getname(library);
+	int error = PTR_ERR(tmp);
+	static const struct open_flags uselib_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = LOOKUP_FOLLOW,
+	};
+
+	if (IS_ERR(tmp))
+		goto out;
+
+	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
+	putname(tmp);
+	error = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+
+	error = -EINVAL;
+	if (!S_ISREG(file_inode(file)->i_mode))
+		goto exit;
+
+	error = -EACCES;
+	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+		goto exit;
+
+	fsnotify_open(file);
+
+	error = -ENOEXEC;
+
+	read_lock(&binfmt_lock);
+	list_for_each_entry(fmt, &formats, lh) {
+		if (!fmt->load_shlib)
+			continue;
+		if (!try_module_get(fmt->module))
+			continue;
+		read_unlock(&binfmt_lock);
+		error = fmt->load_shlib(file);
+		read_lock(&binfmt_lock);
+		put_binfmt(fmt);
+		if (error != -ENOEXEC)
+			break;
+	}
+	read_unlock(&binfmt_lock);
+exit:
+	fput(file);
+out:
+  	return error;
+}
+#endif /* #ifdef CONFIG_USELIB */
+
+#ifdef CONFIG_MMU
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+	struct mm_struct *mm = current->mm;
+	long diff = (long)(pages - bprm->vma_pages);
+
+	if (!mm || !diff)
+		return;
+
+	bprm->vma_pages = pages;
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+}
+
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+		int write)
+{
+	struct page *page;
+	int ret;
+
+#ifdef CONFIG_STACK_GROWSUP
+	if (write) {
+		ret = expand_downwards(bprm->vma, pos);
+		if (ret < 0)
+			return NULL;
+	}
+#endif
+	ret = get_user_pages(current, bprm->mm, pos,
+			1, write, 1, &page, NULL);
+	if (ret <= 0)
+		return NULL;
+
+	if (write) {
+		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
+		struct rlimit *rlim;
+
+		acct_arg_size(bprm, size / PAGE_SIZE);
+
+		/*
+		 * We've historically supported up to 32 pages (ARG_MAX)
+		 * of argument strings even with small stacks
+		 */
+		if (size <= ARG_MAX)
+			return page;
+
+		/*
+		 * Limit to 1/4-th the stack size for the argv+env strings.
+		 * This ensures that:
+		 *  - the remaining binfmt code will not run out of stack space,
+		 *  - the program will have a reasonable amount of stack left
+		 *    to work from.
+		 */
+		rlim = current->signal->rlim;
+		if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
+			put_page(page);
+			return NULL;
+		}
+	}
+
+	return page;
+}
+
+static void put_arg_page(struct page *page)
+{
+	put_page(page);
+}
+
+static void free_arg_page(struct linux_binprm *bprm, int i)
+{
+}
+
+static void free_arg_pages(struct linux_binprm *bprm)
+{
+}
+
+static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
+		struct page *page)
+{
+	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
+}
+
+static int __bprm_mm_init(struct linux_binprm *bprm)
+{
+	int err;
+	struct vm_area_struct *vma = NULL;
+	struct mm_struct *mm = bprm->mm;
+
+	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	if (!vma)
+		return -ENOMEM;
+
+	down_write(&mm->mmap_sem);
+	vma->vm_mm = mm;
+
+	/*
+	 * Place the stack at the largest stack address the architecture
+	 * supports. Later, we'll move this to an appropriate place. We don't
+	 * use STACK_TOP because that can depend on attributes which aren't
+	 * configured yet.
+	 */
+	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+	vma->vm_end = STACK_TOP_MAX;
+	vma->vm_start = vma->vm_end - PAGE_SIZE;
+	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+	err = insert_vm_struct(mm, vma);
+	if (err)
+		goto err;
+
+	mm->stack_vm = mm->total_vm = 1;
+	arch_bprm_mm_init(mm, vma);
+	up_write(&mm->mmap_sem);
+	bprm->p = vma->vm_end - sizeof(void *);
+	return 0;
+err:
+	up_write(&mm->mmap_sem);
+	bprm->vma = NULL;
+	kmem_cache_free(vm_area_cachep, vma);
+	return err;
+}
+
+static bool valid_arg_len(struct linux_binprm *bprm, long len)
+{
+	return len <= MAX_ARG_STRLEN;
+}
+
+#else
+
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+}
+
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+		int write)
+{
+	struct page *page;
+
+	page = bprm->page[pos / PAGE_SIZE];
+	if (!page && write) {
+		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
+		if (!page)
+			return NULL;
+		bprm->page[pos / PAGE_SIZE] = page;
+	}
+
+	return page;
+}
+
+static void put_arg_page(struct page *page)
+{
+}
+
+static void free_arg_page(struct linux_binprm *bprm, int i)
+{
+	if (bprm->page[i]) {
+		__free_page(bprm->page[i]);
+		bprm->page[i] = NULL;
+	}
+}
+
+static void free_arg_pages(struct linux_binprm *bprm)
+{
+	int i;
+
+	for (i = 0; i < MAX_ARG_PAGES; i++)
+		free_arg_page(bprm, i);
+}
+
+static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
+		struct page *page)
+{
+}
+
+static int __bprm_mm_init(struct linux_binprm *bprm)
+{
+	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
+	return 0;
+}
+
+static bool valid_arg_len(struct linux_binprm *bprm, long len)
+{
+	return len <= bprm->p;
+}
+
+#endif /* CONFIG_MMU */
+
+/*
+ * Create a new mm_struct and populate it with a temporary stack
+ * vm_area_struct.  We don't have enough context at this point to set the stack
+ * flags, permissions, and offset, so we use temporary values.  We'll update
+ * them later in setup_arg_pages().
+ */
+static int bprm_mm_init(struct linux_binprm *bprm)
+{
+	int err;
+	struct mm_struct *mm = NULL;
+
+	bprm->mm = mm = mm_alloc();
+	err = -ENOMEM;
+	if (!mm)
+		goto err;
+
+	err = __bprm_mm_init(bprm);
+	if (err)
+		goto err;
+
+	return 0;
+
+err:
+	if (mm) {
+		bprm->mm = NULL;
+		mmdrop(mm);
+	}
+
+	return err;
+}
+
+struct user_arg_ptr {
+#ifdef CONFIG_COMPAT
+	bool is_compat;
+#endif
+	union {
+		const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+		const compat_uptr_t __user *compat;
+#endif
+	} ptr;
+};
+
+static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
+{
+	const char __user *native;
+
+#ifdef CONFIG_COMPAT
+	if (unlikely(argv.is_compat)) {
+		compat_uptr_t compat;
+
+		if (get_user(compat, argv.ptr.compat + nr))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(compat);
+	}
+#endif
+
+	if (get_user(native, argv.ptr.native + nr))
+		return ERR_PTR(-EFAULT);
+
+	return native;
+}
+
+/*
+ * count() counts the number of strings in array ARGV.
+ */
+static int count(struct user_arg_ptr argv, int max)
+{
+	int i = 0;
+
+	if (argv.ptr.native != NULL) {
+		for (;;) {
+			const char __user *p = get_user_arg_ptr(argv, i);
+
+			if (!p)
+				break;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
+			if (i >= max)
+				return -E2BIG;
+			++i;
+
+			if (fatal_signal_pending(current))
+				return -ERESTARTNOHAND;
+			cond_resched();
+		}
+	}
+	return i;
+}
+
+/*
+ * 'copy_strings()' copies argument/environment strings from the old
+ * processes's memory to the new process's stack.  The call to get_user_pages()
+ * ensures the destination page is created and not swapped out.
+ */
+static int copy_strings(int argc, struct user_arg_ptr argv,
+			struct linux_binprm *bprm)
+{
+	struct page *kmapped_page = NULL;
+	char *kaddr = NULL;
+	unsigned long kpos = 0;
+	int ret;
+
+	while (argc-- > 0) {
+		const char __user *str;
+		int len;
+		unsigned long pos;
+
+		ret = -EFAULT;
+		str = get_user_arg_ptr(argv, argc);
+		if (IS_ERR(str))
+			goto out;
+
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
+			goto out;
+
+		/* We're going to work our way backwords. */
+		pos = bprm->p;
+		str += len;
+		bprm->p -= len;
+
+		while (len > 0) {
+			int offset, bytes_to_copy;
+
+			if (fatal_signal_pending(current)) {
+				ret = -ERESTARTNOHAND;
+				goto out;
+			}
+			cond_resched();
+
+			offset = pos % PAGE_SIZE;
+			if (offset == 0)
+				offset = PAGE_SIZE;
+
+			bytes_to_copy = offset;
+			if (bytes_to_copy > len)
+				bytes_to_copy = len;
+
+			offset -= bytes_to_copy;
+			pos -= bytes_to_copy;
+			str -= bytes_to_copy;
+			len -= bytes_to_copy;
+
+			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
+				struct page *page;
+
+				page = get_arg_page(bprm, pos, 1);
+				if (!page) {
+					ret = -E2BIG;
+					goto out;
+				}
+
+				if (kmapped_page) {
+					flush_kernel_dcache_page(kmapped_page);
+					kunmap(kmapped_page);
+					put_arg_page(kmapped_page);
+				}
+				kmapped_page = page;
+				kaddr = kmap(kmapped_page);
+				kpos = pos & PAGE_MASK;
+				flush_arg_page(bprm, kpos, kmapped_page);
+			}
+			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
+				ret = -EFAULT;
+				goto out;
+			}
+		}
+	}
+	ret = 0;
+out:
+	if (kmapped_page) {
+		flush_kernel_dcache_page(kmapped_page);
+		kunmap(kmapped_page);
+		put_arg_page(kmapped_page);
+	}
+	return ret;
+}
+
+/*
+ * Like copy_strings, but get argv and its values from kernel memory.
+ */
+int copy_strings_kernel(int argc, const char *const *__argv,
+			struct linux_binprm *bprm)
+{
+	int r;
+	mm_segment_t oldfs = get_fs();
+	struct user_arg_ptr argv = {
+		.ptr.native = (const char __user *const  __user *)__argv,
+	};
+
+	set_fs(KERNEL_DS);
+	r = copy_strings(argc, argv, bprm);
+	set_fs(oldfs);
+
+	return r;
+}
+EXPORT_SYMBOL(copy_strings_kernel);
+
+#ifdef CONFIG_MMU
+
+/*
+ * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
+ * the binfmt code determines where the new stack should reside, we shift it to
+ * its final location.  The process proceeds as follows:
+ *
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges.  This ensures the
+ *    arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
+ */
+static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long old_start = vma->vm_start;
+	unsigned long old_end = vma->vm_end;
+	unsigned long length = old_end - old_start;
+	unsigned long new_start = old_start - shift;
+	unsigned long new_end = old_end - shift;
+	struct mmu_gather tlb;
+
+	BUG_ON(new_start > new_end);
+
+	/*
+	 * ensure there are no vmas between where we want to go
+	 * and where we are
+	 */
+	if (vma != find_vma(mm, new_start))
+		return -EFAULT;
+
+	/*
+	 * cover the whole range: [new_start, old_end)
+	 */
+	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
+		return -ENOMEM;
+
+	/*
+	 * move the page tables downwards, on failure we rely on
+	 * process cleanup to remove whatever mess we made.
+	 */
+	if (length != move_page_tables(vma, old_start,
+				       vma, new_start, length, false))
+		return -ENOMEM;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, old_start, old_end);
+	if (new_end > old_start) {
+		/*
+		 * when the old and new regions overlap clear from new_end.
+		 */
+		free_pgd_range(&tlb, new_end, old_end, new_end,
+			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+	} else {
+		/*
+		 * otherwise, clean from old_start; this is done to not touch
+		 * the address space in [new_end, old_start) some architectures
+		 * have constraints on va-space that make this illegal (IA64) -
+		 * for the others its just a little faster.
+		 */
+		free_pgd_range(&tlb, old_start, old_end, new_end,
+			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+	}
+	tlb_finish_mmu(&tlb, old_start, old_end);
+
+	/*
+	 * Shrink the vma to just the new range.  Always succeeds.
+	 */
+	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
+
+	return 0;
+}
+
+/*
+ * Finalizes the stack vm_area_struct. The flags and permissions are updated,
+ * the stack is optionally relocated, and some extra space is added.
+ */
+int setup_arg_pages(struct linux_binprm *bprm,
+		    unsigned long stack_top,
+		    int executable_stack)
+{
+	unsigned long ret;
+	unsigned long stack_shift;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma = bprm->vma;
+	struct vm_area_struct *prev = NULL;
+	unsigned long vm_flags;
+	unsigned long stack_base;
+	unsigned long stack_size;
+	unsigned long stack_expand;
+	unsigned long rlim_stack;
+
+#ifdef CONFIG_STACK_GROWSUP
+	/* Limit stack size */
+	stack_base = rlimit_max(RLIMIT_STACK);
+	if (stack_base > STACK_SIZE_MAX)
+		stack_base = STACK_SIZE_MAX;
+
+	/* Add space for stack randomization. */
+	stack_base += (STACK_RND_MASK << PAGE_SHIFT);
+
+	/* Make sure we didn't let the argument array grow too large. */
+	if (vma->vm_end - vma->vm_start > stack_base)
+		return -ENOMEM;
+
+	stack_base = PAGE_ALIGN(stack_top - stack_base);
+
+	stack_shift = vma->vm_start - stack_base;
+	mm->arg_start = bprm->p - stack_shift;
+	bprm->p = vma->vm_end - stack_shift;
+#else
+	stack_top = arch_align_stack(stack_top);
+	stack_top = PAGE_ALIGN(stack_top);
+
+	if (unlikely(stack_top < mmap_min_addr) ||
+	    unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
+		return -ENOMEM;
+
+	stack_shift = vma->vm_end - stack_top;
+
+	bprm->p -= stack_shift;
+	mm->arg_start = bprm->p;
+#endif
+
+	if (bprm->loader)
+		bprm->loader -= stack_shift;
+	bprm->exec -= stack_shift;
+
+	down_write(&mm->mmap_sem);
+	vm_flags = VM_STACK_FLAGS;
+
+	/*
+	 * Adjust stack execute permissions; explicitly enable for
+	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
+	 * (arch default) otherwise.
+	 */
+	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
+		vm_flags |= VM_EXEC;
+	else if (executable_stack == EXSTACK_DISABLE_X)
+		vm_flags &= ~VM_EXEC;
+	vm_flags |= mm->def_flags;
+	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
+
+	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
+			vm_flags);
+	if (ret)
+		goto out_unlock;
+	BUG_ON(prev != vma);
+
+	/* Move stack pages down in memory. */
+	if (stack_shift) {
+		ret = shift_arg_pages(vma, stack_shift);
+		if (ret)
+			goto out_unlock;
+	}
+
+	/* mprotect_fixup is overkill to remove the temporary stack flags */
+	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
+
+	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
+	stack_size = vma->vm_end - vma->vm_start;
+	/*
+	 * Align this down to a page boundary as expand_stack
+	 * will align it up.
+	 */
+	rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
+#ifdef CONFIG_STACK_GROWSUP
+	if (stack_size + stack_expand > rlim_stack)
+		stack_base = vma->vm_start + rlim_stack;
+	else
+		stack_base = vma->vm_end + stack_expand;
+#else
+	if (stack_size + stack_expand > rlim_stack)
+		stack_base = vma->vm_end - rlim_stack;
+	else
+		stack_base = vma->vm_start - stack_expand;
+#endif
+	current->mm->start_stack = bprm->p;
+	ret = expand_stack(vma, stack_base);
+	if (ret)
+		ret = -EFAULT;
+
+out_unlock:
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+EXPORT_SYMBOL(setup_arg_pages);
+
+#endif /* CONFIG_MMU */
+
+static struct file *do_open_execat(int fd, struct filename *name, int flags)
+{
+	struct file *file;
+	int err;
+	struct open_flags open_exec_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = LOOKUP_FOLLOW,
+	};
+
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return ERR_PTR(-EINVAL);
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_EMPTY_PATH)
+		open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
+
+	file = do_filp_open(fd, name, &open_exec_flags);
+	if (IS_ERR(file))
+		goto out;
+
+	err = -EACCES;
+	if (!S_ISREG(file_inode(file)->i_mode))
+		goto exit;
+
+	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+		goto exit;
+
+	err = deny_write_access(file);
+	if (err)
+		goto exit;
+
+	if (name->name[0] != '\0')
+		fsnotify_open(file);
+
+out:
+	return file;
+
+exit:
+	fput(file);
+	return ERR_PTR(err);
+}
+
+struct file *open_exec(const char *name)
+{
+	struct filename *filename = getname_kernel(name);
+	struct file *f = ERR_CAST(filename);
+
+	if (!IS_ERR(filename)) {
+		f = do_open_execat(AT_FDCWD, filename, 0);
+		putname(filename);
+	}
+	return f;
+}
+EXPORT_SYMBOL(open_exec);
+
+int kernel_read(struct file *file, loff_t offset,
+		char *addr, unsigned long count)
+{
+	mm_segment_t old_fs;
+	loff_t pos = offset;
+	int result;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	result = vfs_read(file, (void __user *)addr, count, &pos);
+	set_fs(old_fs);
+	return result;
+}
+
+EXPORT_SYMBOL(kernel_read);
+
+ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
+{
+	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
+	if (res > 0)
+		flush_icache_range(addr, addr + len);
+	return res;
+}
+EXPORT_SYMBOL(read_code);
+
+static int exec_mmap(struct mm_struct *mm)
+{
+	struct task_struct *tsk;
+	struct mm_struct *old_mm, *active_mm;
+
+	/* Notify parent that we're no longer interested in the old VM */
+	tsk = current;
+	old_mm = current->mm;
+	mm_release(tsk, old_mm);
+
+	if (old_mm) {
+		sync_mm_rss(old_mm);
+		/*
+		 * Make sure that if there is a core dump in progress
+		 * for the old mm, we get out and die instead of going
+		 * through with the exec.  We must hold mmap_sem around
+		 * checking core_state and changing tsk->mm.
+		 */
+		down_read(&old_mm->mmap_sem);
+		if (unlikely(old_mm->core_state)) {
+			up_read(&old_mm->mmap_sem);
+			return -EINTR;
+		}
+	}
+	task_lock(tsk);
+	preempt_disable_rt();
+	active_mm = tsk->active_mm;
+	tsk->mm = mm;
+	tsk->active_mm = mm;
+	activate_mm(active_mm, mm);
+	tsk->mm->vmacache_seqnum = 0;
+	vmacache_flush(tsk);
+	preempt_enable_rt();
+	task_unlock(tsk);
+	if (old_mm) {
+		up_read(&old_mm->mmap_sem);
+		BUG_ON(active_mm != old_mm);
+		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
+		mm_update_next_owner(old_mm);
+		mmput(old_mm);
+		return 0;
+	}
+	mmdrop(active_mm);
+	return 0;
+}
+
+/*
+ * This function makes sure the current process has its own signal table,
+ * so that flush_signal_handlers can later reset the handlers without
+ * disturbing other processes.  (Other processes might share the signal
+ * table via the CLONE_SIGHAND option to clone().)
+ */
+static int de_thread(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	struct sighand_struct *oldsighand = tsk->sighand;
+	spinlock_t *lock = &oldsighand->siglock;
+
+	if (thread_group_empty(tsk))
+		goto no_thread_group;
+
+	/*
+	 * Kill all other threads in the thread group.
+	 */
+	spin_lock_irq(lock);
+	if (signal_group_exit(sig)) {
+		/*
+		 * Another group action in progress, just
+		 * return so that the signal is processed.
+		 */
+		spin_unlock_irq(lock);
+		return -EAGAIN;
+	}
+
+	sig->group_exit_task = tsk;
+	sig->notify_count = zap_other_threads(tsk);
+	if (!thread_group_leader(tsk))
+		sig->notify_count--;
+
+	while (sig->notify_count) {
+		__set_current_state(TASK_KILLABLE);
+		spin_unlock_irq(lock);
+		schedule();
+		if (unlikely(__fatal_signal_pending(tsk)))
+			goto killed;
+		spin_lock_irq(lock);
+	}
+	spin_unlock_irq(lock);
+
+	/*
+	 * At this point all other threads have exited, all we have to
+	 * do is to wait for the thread group leader to become inactive,
+	 * and to assume its PID:
+	 */
+	if (!thread_group_leader(tsk)) {
+		struct task_struct *leader = tsk->group_leader;
+
+		for (;;) {
+			threadgroup_change_begin(tsk);
+			write_lock_irq(&tasklist_lock);
+			/*
+			 * Do this under tasklist_lock to ensure that
+			 * exit_notify() can't miss ->group_exit_task
+			 */
+			sig->notify_count = -1;
+			if (likely(leader->exit_state))
+				break;
+			__set_current_state(TASK_KILLABLE);
+			write_unlock_irq(&tasklist_lock);
+			threadgroup_change_end(tsk);
+			schedule();
+			if (unlikely(__fatal_signal_pending(tsk)))
+				goto killed;
+		}
+
+		/*
+		 * The only record we have of the real-time age of a
+		 * process, regardless of execs it's done, is start_time.
+		 * All the past CPU time is accumulated in signal_struct
+		 * from sister threads now dead.  But in this non-leader
+		 * exec, nothing survives from the original leader thread,
+		 * whose birth marks the true age of this process now.
+		 * When we take on its identity by switching to its PID, we
+		 * also take its birthdate (always earlier than our own).
+		 */
+		tsk->start_time = leader->start_time;
+		tsk->real_start_time = leader->real_start_time;
+
+		BUG_ON(!same_thread_group(leader, tsk));
+		BUG_ON(has_group_leader_pid(tsk));
+		/*
+		 * An exec() starts a new thread group with the
+		 * TGID of the previous thread group. Rehash the
+		 * two threads with a switched PID, and release
+		 * the former thread group leader:
+		 */
+
+		/* Become a process group leader with the old leader's pid.
+		 * The old leader becomes a thread of the this thread group.
+		 * Note: The old leader also uses this pid until release_task
+		 *       is called.  Odd but simple and correct.
+		 */
+		tsk->pid = leader->pid;
+		change_pid(tsk, PIDTYPE_PID, task_pid(leader));
+		transfer_pid(leader, tsk, PIDTYPE_PGID);
+		transfer_pid(leader, tsk, PIDTYPE_SID);
+
+		list_replace_rcu(&leader->tasks, &tsk->tasks);
+		list_replace_init(&leader->sibling, &tsk->sibling);
+
+		tsk->group_leader = tsk;
+		leader->group_leader = tsk;
+
+		tsk->exit_signal = SIGCHLD;
+		leader->exit_signal = -1;
+
+		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
+		leader->exit_state = EXIT_DEAD;
+
+		/*
+		 * We are going to release_task()->ptrace_unlink() silently,
+		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
+		 * the tracer wont't block again waiting for this thread.
+		 */
+		if (unlikely(leader->ptrace))
+			__wake_up_parent(leader, leader->parent);
+		write_unlock_irq(&tasklist_lock);
+		threadgroup_change_end(tsk);
+
+		release_task(leader);
+	}
+
+	sig->group_exit_task = NULL;
+	sig->notify_count = 0;
+
+no_thread_group:
+	/* we have changed execution domain */
+	tsk->exit_signal = SIGCHLD;
+
+	exit_itimers(sig);
+	flush_itimer_signals();
+
+	if (atomic_read(&oldsighand->count) != 1) {
+		struct sighand_struct *newsighand;
+		/*
+		 * This ->sighand is shared with the CLONE_SIGHAND
+		 * but not CLONE_THREAD task, switch to the new one.
+		 */
+		newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+		if (!newsighand)
+			return -ENOMEM;
+
+		atomic_set(&newsighand->count, 1);
+		memcpy(newsighand->action, oldsighand->action,
+		       sizeof(newsighand->action));
+
+		write_lock_irq(&tasklist_lock);
+		spin_lock(&oldsighand->siglock);
+		rcu_assign_pointer(tsk->sighand, newsighand);
+		spin_unlock(&oldsighand->siglock);
+		write_unlock_irq(&tasklist_lock);
+
+		__cleanup_sighand(oldsighand);
+	}
+
+	BUG_ON(!thread_group_leader(tsk));
+	return 0;
+
+killed:
+	/* protects against exit_notify() and __exit_signal() */
+	read_lock(&tasklist_lock);
+	sig->group_exit_task = NULL;
+	sig->notify_count = 0;
+	read_unlock(&tasklist_lock);
+	return -EAGAIN;
+}
+
+char *get_task_comm(char *buf, struct task_struct *tsk)
+{
+	/* buf must be at least sizeof(tsk->comm) in size */
+	task_lock(tsk);
+	strncpy(buf, tsk->comm, sizeof(tsk->comm));
+	task_unlock(tsk);
+	return buf;
+}
+EXPORT_SYMBOL_GPL(get_task_comm);
+
+/*
+ * These functions flushes out all traces of the currently running executable
+ * so that a new one can be started
+ */
+
+void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
+{
+	task_lock(tsk);
+	trace_task_rename(tsk, buf);
+	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
+	task_unlock(tsk);
+	perf_event_comm(tsk, exec);
+}
+
+int flush_old_exec(struct linux_binprm * bprm)
+{
+	int retval;
+
+	/*
+	 * Make sure we have a private signal table and that
+	 * we are unassociated from the previous thread group.
+	 */
+	retval = de_thread(current);
+	if (retval)
+		goto out;
+
+	/*
+	 * Must be called _before_ exec_mmap() as bprm->mm is
+	 * not visibile until then. This also enables the update
+	 * to be lockless.
+	 */
+	set_mm_exe_file(bprm->mm, bprm->file);
+
+	/*
+	 * Release all of the old mmap stuff
+	 */
+	acct_arg_size(bprm, 0);
+	retval = exec_mmap(bprm->mm);
+	if (retval)
+		goto out;
+
+	bprm->mm = NULL;		/* We're using it now */
+
+	set_fs(USER_DS);
+	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+					PF_NOFREEZE | PF_NO_SETAFFINITY);
+	flush_thread();
+	current->personality &= ~bprm->per_clear;
+
+	return 0;
+
+out:
+	return retval;
+}
+EXPORT_SYMBOL(flush_old_exec);
+
+void would_dump(struct linux_binprm *bprm, struct file *file)
+{
+	if (inode_permission(file_inode(file), MAY_READ) < 0)
+		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+}
+EXPORT_SYMBOL(would_dump);
+
+void setup_new_exec(struct linux_binprm * bprm)
+{
+	arch_pick_mmap_layout(current->mm);
+
+	/* This is the point of no return */
+	current->sas_ss_sp = current->sas_ss_size = 0;
+
+	if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
+		set_dumpable(current->mm, SUID_DUMP_USER);
+	else
+		set_dumpable(current->mm, suid_dumpable);
+
+	perf_event_exec();
+	__set_task_comm(current, kbasename(bprm->filename), true);
+
+	/* Set the new mm task size. We have to do that late because it may
+	 * depend on TIF_32BIT which is only updated in flush_thread() on
+	 * some architectures like powerpc
+	 */
+	current->mm->task_size = TASK_SIZE;
+
+	/* install the new credentials */
+	if (!uid_eq(bprm->cred->uid, current_euid()) ||
+	    !gid_eq(bprm->cred->gid, current_egid())) {
+		current->pdeath_signal = 0;
+	} else {
+		would_dump(bprm, bprm->file);
+		if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
+			set_dumpable(current->mm, suid_dumpable);
+	}
+
+	/* An exec changes our domain. We are no longer part of the thread
+	   group */
+	current->self_exec_id++;
+	flush_signal_handlers(current, 0);
+	do_close_on_exec(current->files);
+}
+EXPORT_SYMBOL(setup_new_exec);
+
+/*
+ * Prepare credentials and lock ->cred_guard_mutex.
+ * install_exec_creds() commits the new creds and drops the lock.
+ * Or, if exec fails before, free_bprm() should release ->cred and
+ * and unlock.
+ */
+int prepare_bprm_creds(struct linux_binprm *bprm)
+{
+	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
+		return -ERESTARTNOINTR;
+
+	bprm->cred = prepare_exec_creds();
+	if (likely(bprm->cred))
+		return 0;
+
+	mutex_unlock(&current->signal->cred_guard_mutex);
+	return -ENOMEM;
+}
+
+static void free_bprm(struct linux_binprm *bprm)
+{
+	free_arg_pages(bprm);
+	if (bprm->cred) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		abort_creds(bprm->cred);
+	}
+	if (bprm->file) {
+		allow_write_access(bprm->file);
+		fput(bprm->file);
+	}
+	/* If a binfmt changed the interp, free it. */
+	if (bprm->interp != bprm->filename)
+		kfree(bprm->interp);
+	kfree(bprm);
+}
+
+int bprm_change_interp(char *interp, struct linux_binprm *bprm)
+{
+	/* If a binfmt changed the interp, free it first. */
+	if (bprm->interp != bprm->filename)
+		kfree(bprm->interp);
+	bprm->interp = kstrdup(interp, GFP_KERNEL);
+	if (!bprm->interp)
+		return -ENOMEM;
+	return 0;
+}
+EXPORT_SYMBOL(bprm_change_interp);
+
+/*
+ * install the new credentials for this executable
+ */
+void install_exec_creds(struct linux_binprm *bprm)
+{
+	security_bprm_committing_creds(bprm);
+
+	commit_creds(bprm->cred);
+	bprm->cred = NULL;
+
+	/*
+	 * Disable monitoring for regular users
+	 * when executing setuid binaries. Must
+	 * wait until new credentials are committed
+	 * by commit_creds() above
+	 */
+	if (get_dumpable(current->mm) != SUID_DUMP_USER)
+		perf_event_exit_task(current);
+	/*
+	 * cred_guard_mutex must be held at least to this point to prevent
+	 * ptrace_attach() from altering our determination of the task's
+	 * credentials; any time after this it may be unlocked.
+	 */
+	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->signal->cred_guard_mutex);
+}
+EXPORT_SYMBOL(install_exec_creds);
+
+/*
+ * determine how safe it is to execute the proposed program
+ * - the caller must hold ->cred_guard_mutex to protect against
+ *   PTRACE_ATTACH or seccomp thread-sync
+ */
+static void check_unsafe_exec(struct linux_binprm *bprm)
+{
+	struct task_struct *p = current, *t;
+	unsigned n_fs;
+
+	if (p->ptrace) {
+		if (p->ptrace & PT_PTRACE_CAP)
+			bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
+		else
+			bprm->unsafe |= LSM_UNSAFE_PTRACE;
+	}
+
+	/*
+	 * This isn't strictly necessary, but it makes it harder for LSMs to
+	 * mess up.
+	 */
+	if (task_no_new_privs(current))
+		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+
+	t = p;
+	n_fs = 1;
+	spin_lock(&p->fs->lock);
+	rcu_read_lock();
+	while_each_thread(p, t) {
+		if (t->fs == p->fs)
+			n_fs++;
+	}
+	rcu_read_unlock();
+
+	if (p->fs->users > n_fs)
+		bprm->unsafe |= LSM_UNSAFE_SHARE;
+	else
+		p->fs->in_exec = 1;
+	spin_unlock(&p->fs->lock);
+}
+
+static void bprm_fill_uid(struct linux_binprm *bprm)
+{
+	struct inode *inode;
+	unsigned int mode;
+	kuid_t uid;
+	kgid_t gid;
+
+	/* clear any previous set[ug]id data from a previous binary */
+	bprm->cred->euid = current_euid();
+	bprm->cred->egid = current_egid();
+
+	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+		return;
+
+	if (task_no_new_privs(current))
+		return;
+
+	inode = file_inode(bprm->file);
+	mode = READ_ONCE(inode->i_mode);
+	if (!(mode & (S_ISUID|S_ISGID)))
+		return;
+
+	/* Be careful if suid/sgid is set */
+	mutex_lock(&inode->i_mutex);
+
+	/* reload atomically mode/uid/gid now that lock held */
+	mode = inode->i_mode;
+	uid = inode->i_uid;
+	gid = inode->i_gid;
+	mutex_unlock(&inode->i_mutex);
+
+	/* We ignore suid/sgid if there are no mappings for them in the ns */
+	if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
+		 !kgid_has_mapping(bprm->cred->user_ns, gid))
+		return;
+
+	if (mode & S_ISUID) {
+		bprm->per_clear |= PER_CLEAR_ON_SETID;
+		bprm->cred->euid = uid;
+	}
+
+	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+		bprm->per_clear |= PER_CLEAR_ON_SETID;
+		bprm->cred->egid = gid;
+	}
+}
+
+/*
+ * Fill the binprm structure from the inode.
+ * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ *
+ * This may be called multiple times for binary chains (scripts for example).
+ */
+int prepare_binprm(struct linux_binprm *bprm)
+{
+	int retval;
+
+	bprm_fill_uid(bprm);
+
+	/* fill in binprm security blob */
+	retval = security_bprm_set_creds(bprm);
+	if (retval)
+		return retval;
+	bprm->cred_prepared = 1;
+
+	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
+	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
+}
+
+EXPORT_SYMBOL(prepare_binprm);
+
+/*
+ * Arguments are '\0' separated strings found at the location bprm->p
+ * points to; chop off the first by relocating brpm->p to right after
+ * the first '\0' encountered.
+ */
+int remove_arg_zero(struct linux_binprm *bprm)
+{
+	int ret = 0;
+	unsigned long offset;
+	char *kaddr;
+	struct page *page;
+
+	if (!bprm->argc)
+		return 0;
+
+	do {
+		offset = bprm->p & ~PAGE_MASK;
+		page = get_arg_page(bprm, bprm->p, 0);
+		if (!page) {
+			ret = -EFAULT;
+			goto out;
+		}
+		kaddr = kmap_atomic(page);
+
+		for (; offset < PAGE_SIZE && kaddr[offset];
+				offset++, bprm->p++)
+			;
+
+		kunmap_atomic(kaddr);
+		put_arg_page(page);
+
+		if (offset == PAGE_SIZE)
+			free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
+	} while (offset == PAGE_SIZE);
+
+	bprm->p++;
+	bprm->argc--;
+	ret = 0;
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(remove_arg_zero);
+
+#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
+/*
+ * cycle the list of binary formats handler, until one recognizes the image
+ */
+int search_binary_handler(struct linux_binprm *bprm)
+{
+	bool need_retry = IS_ENABLED(CONFIG_MODULES);
+	struct linux_binfmt *fmt;
+	int retval;
+
+	/* This allows 4 levels of binfmt rewrites before failing hard. */
+	if (bprm->recursion_depth > 5)
+		return -ELOOP;
+
+	retval = security_bprm_check(bprm);
+	if (retval)
+		return retval;
+
+	retval = -ENOENT;
+ retry:
+	read_lock(&binfmt_lock);
+	list_for_each_entry(fmt, &formats, lh) {
+		if (!try_module_get(fmt->module))
+			continue;
+		read_unlock(&binfmt_lock);
+		bprm->recursion_depth++;
+		retval = fmt->load_binary(bprm);
+		read_lock(&binfmt_lock);
+		put_binfmt(fmt);
+		bprm->recursion_depth--;
+		if (retval < 0 && !bprm->mm) {
+			/* we got to flush_old_exec() and failed after it */
+			read_unlock(&binfmt_lock);
+			force_sigsegv(SIGSEGV, current);
+			return retval;
+		}
+		if (retval != -ENOEXEC || !bprm->file) {
+			read_unlock(&binfmt_lock);
+			return retval;
+		}
+	}
+	read_unlock(&binfmt_lock);
+
+	if (need_retry) {
+		if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
+		    printable(bprm->buf[2]) && printable(bprm->buf[3]))
+			return retval;
+		if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
+			return retval;
+		need_retry = false;
+		goto retry;
+	}
+
+	return retval;
+}
+EXPORT_SYMBOL(search_binary_handler);
+
+static int exec_binprm(struct linux_binprm *bprm)
+{
+	pid_t old_pid, old_vpid;
+	int ret;
+
+	/* Need to fetch pid before load_binary changes it */
+	old_pid = current->pid;
+	rcu_read_lock();
+	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
+	rcu_read_unlock();
+
+	ret = search_binary_handler(bprm);
+	if (ret >= 0) {
+		audit_bprm(bprm);
+		trace_sched_process_exec(current, old_pid, bprm);
+		ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
+		proc_exec_connector(current);
+	}
+
+	return ret;
+}
+
+/*
+ * sys_execve() executes a new program.
+ */
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
+{
+	char *pathbuf = NULL;
+	struct linux_binprm *bprm;
+	struct file *file;
+	struct files_struct *displaced;
+	int retval;
+
+	if (IS_ERR(filename))
+		return PTR_ERR(filename);
+
+	/*
+	 * We move the actual failure in case of RLIMIT_NPROC excess from
+	 * set*uid() to execve() because too many poorly written programs
+	 * don't check setuid() return code.  Here we additionally recheck
+	 * whether NPROC limit is still exceeded.
+	 */
+	if ((current->flags & PF_NPROC_EXCEEDED) &&
+	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
+		retval = -EAGAIN;
+		goto out_ret;
+	}
+
+	/* We're below the limit (still or again), so we don't want to make
+	 * further execve() calls fail. */
+	current->flags &= ~PF_NPROC_EXCEEDED;
+
+	retval = unshare_files(&displaced);
+	if (retval)
+		goto out_ret;
+
+	retval = -ENOMEM;
+	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+	if (!bprm)
+		goto out_files;
+
+	retval = prepare_bprm_creds(bprm);
+	if (retval)
+		goto out_free;
+
+	check_unsafe_exec(bprm);
+	current->in_execve = 1;
+
+	file = do_open_execat(fd, filename, flags);
+	retval = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out_unmark;
+
+	sched_exec();
+
+	bprm->file = file;
+	if (fd == AT_FDCWD || filename->name[0] == '/') {
+		bprm->filename = filename->name;
+	} else {
+		if (filename->name[0] == '\0')
+			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
+		else
+			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
+					    fd, filename->name);
+		if (!pathbuf) {
+			retval = -ENOMEM;
+			goto out_unmark;
+		}
+		/*
+		 * Record that a name derived from an O_CLOEXEC fd will be
+		 * inaccessible after exec. Relies on having exclusive access to
+		 * current->files (due to unshare_files above).
+		 */
+		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
+			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+		bprm->filename = pathbuf;
+	}
+	bprm->interp = bprm->filename;
+
+	retval = bprm_mm_init(bprm);
+	if (retval)
+		goto out_unmark;
+
+	bprm->argc = count(argv, MAX_ARG_STRINGS);
+	if ((retval = bprm->argc) < 0)
+		goto out;
+
+	bprm->envc = count(envp, MAX_ARG_STRINGS);
+	if ((retval = bprm->envc) < 0)
+		goto out;
+
+	retval = prepare_binprm(bprm);
+	if (retval < 0)
+		goto out;
+
+	retval = copy_strings_kernel(1, &bprm->filename, bprm);
+	if (retval < 0)
+		goto out;
+
+	bprm->exec = bprm->p;
+	retval = copy_strings(bprm->envc, envp, bprm);
+	if (retval < 0)
+		goto out;
+
+	retval = copy_strings(bprm->argc, argv, bprm);
+	if (retval < 0)
+		goto out;
+
+	retval = exec_binprm(bprm);
+	if (retval < 0)
+		goto out;
+
+	/* execve succeeded */
+	current->fs->in_exec = 0;
+	current->in_execve = 0;
+	acct_update_integrals(current);
+	task_numa_free(current);
+	free_bprm(bprm);
+	kfree(pathbuf);
+	putname(filename);
+	if (displaced)
+		put_files_struct(displaced);
+	return retval;
+
+out:
+	if (bprm->mm) {
+		acct_arg_size(bprm, 0);
+		mmput(bprm->mm);
+	}
+
+out_unmark:
+	current->fs->in_exec = 0;
+	current->in_execve = 0;
+
+out_free:
+	free_bprm(bprm);
+	kfree(pathbuf);
+
+out_files:
+	if (displaced)
+		reset_files_struct(displaced);
+out_ret:
+	putname(filename);
+	return retval;
+}
+
+int do_execve(struct filename *filename,
+	const char __user *const __user *__argv,
+	const char __user *const __user *__envp)
+{
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+int do_execveat(int fd, struct filename *filename,
+		const char __user *const __user *__argv,
+		const char __user *const __user *__envp,
+		int flags)
+{
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+
+	return do_execveat_common(fd, filename, argv, envp, flags);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_do_execve(struct filename *filename,
+	const compat_uptr_t __user *__argv,
+	const compat_uptr_t __user *__envp)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
+	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+static int compat_do_execveat(int fd, struct filename *filename,
+			      const compat_uptr_t __user *__argv,
+			      const compat_uptr_t __user *__envp,
+			      int flags)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
+	return do_execveat_common(fd, filename, argv, envp, flags);
+}
+#endif
+
+void set_binfmt(struct linux_binfmt *new)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+
+	mm->binfmt = new;
+	if (new)
+		__module_get(new->module);
+}
+EXPORT_SYMBOL(set_binfmt);
+
+/*
+ * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
+ */
+void set_dumpable(struct mm_struct *mm, int value)
+{
+	unsigned long old, new;
+
+	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
+		return;
+
+	do {
+		old = ACCESS_ONCE(mm->flags);
+		new = (old & ~MMF_DUMPABLE_MASK) | value;
+	} while (cmpxchg(&mm->flags, old, new) != old);
+}
+
+SYSCALL_DEFINE3(execve,
+		const char __user *, filename,
+		const char __user *const __user *, argv,
+		const char __user *const __user *, envp)
+{
+	return do_execve(getname(filename), argv, envp);
+}
+
+SYSCALL_DEFINE5(execveat,
+		int, fd, const char __user *, filename,
+		const char __user *const __user *, argv,
+		const char __user *const __user *, envp,
+		int, flags)
+{
+	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+	return do_execveat(fd,
+			   getname_flags(filename, lookup_flags, NULL),
+			   argv, envp, flags);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
+	const compat_uptr_t __user *, argv,
+	const compat_uptr_t __user *, envp)
+{
+	return compat_do_execve(getname(filename), argv, envp);
+}
+
+COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
+		       const char __user *, filename,
+		       const compat_uptr_t __user *, argv,
+		       const compat_uptr_t __user *, envp,
+		       int,  flags)
+{
+	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+	return compat_do_execveat(fd,
+				  getname_flags(filename, lookup_flags, NULL),
+				  argv, envp, flags);
+}
+#endif
diff --git a/kernel/fs/exofs/BUGS b/kernel/fs/exofs/BUGS
new file mode 100644
index 000000000..1b2d4c63a
--- /dev/null
+++ b/kernel/fs/exofs/BUGS
@@ -0,0 +1,3 @@
+- Out-of-space may cause a severe problem if the object (and directory entry)
+  were written, but the inode attributes failed. Then if the filesystem was
+  unmounted and mounted the kernel can get into an endless loop doing a readdir.
diff --git a/kernel/fs/exofs/Kbuild b/kernel/fs/exofs/Kbuild
new file mode 100644
index 000000000..b47c7b8dc
--- /dev/null
+++ b/kernel/fs/exofs/Kbuild
@@ -0,0 +1,20 @@
+#
+# Kbuild for the EXOFS module
+#
+# Copyright (C) 2008 Panasas Inc.  All rights reserved.
+#
+# Authors:
+#   Boaz Harrosh <ooo@electrozaur.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2
+#
+# Kbuild - Gets included from the Kernels Makefile and build system
+#
+
+# ore module library
+libore-y := ore.o ore_raid.o
+obj-$(CONFIG_ORE) += libore.o
+
+exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o
+obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/kernel/fs/exofs/Kconfig b/kernel/fs/exofs/Kconfig
new file mode 100644
index 000000000..86194b2f7
--- /dev/null
+++ b/kernel/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
+config EXOFS_FS
+	tristate "exofs: OSD based file system support"
+	depends on SCSI_OSD_ULD
+	help
+	  EXOFS is a file system that uses an OSD storage device,
+	  as its backing storage.
+
+# Debugging-related stuff
+config EXOFS_DEBUG
+	bool "Enable debugging"
+	depends on EXOFS_FS
+	help
+	  This option enables EXOFS debug prints.
diff --git a/kernel/fs/exofs/Kconfig.ore b/kernel/fs/exofs/Kconfig.ore
new file mode 100644
index 000000000..2daf2329c
--- /dev/null
+++ b/kernel/fs/exofs/Kconfig.ore
@@ -0,0 +1,14 @@
+# ORE - Objects Raid Engine (libore.ko)
+#
+# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is like been
+# selected by any of the users.
+config ORE
+	tristate
+	depends on EXOFS_FS || PNFS_OBJLAYOUT
+	select ASYNC_XOR
+	select RAID6_PQ
+	select ASYNC_PQ
+	default SCSI_OSD_ULD
diff --git a/kernel/fs/exofs/common.h b/kernel/fs/exofs/common.h
new file mode 100644
index 000000000..7d88ef566
--- /dev/null
+++ b/kernel/fs/exofs/common.h
@@ -0,0 +1,262 @@
+/*
+ * common.h - Common definitions for both Kernel and user-mode utilities
+ *
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include <linux/types.h>
+
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_MIN_PID   0x10000	/* Smallest partition ID */
+#define EXOFS_OBJ_OFF	0x10000	/* offset for objects */
+#define EXOFS_SUPER_ID	0x10000	/* object ID for on-disk superblock */
+#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
+#define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
+
+/* exofs Application specific page/attribute */
+/* Inode attrs */
+# define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define EXOFS_ATTR_INODE_DATA	1
+# define EXOFS_ATTR_INODE_FILE_LAYOUT	2
+# define EXOFS_ATTR_INODE_DIR_LAYOUT	3
+/* Partition attrs */
+# define EXOFS_APAGE_SB_DATA	(0xF0000000U + 3)
+# define EXOFS_ATTR_SB_STATS	1
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number.  This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+	EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+					(1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
+	EXOFS_MAX_ID	 = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT	12
+#define EXOFS_BLKSIZE	(1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC	0x5DF5
+
+/*
+ * The file system control block - stored in object EXOFS_SUPER_ID's data.
+ * This is where the in-memory superblock is stored on disk.
+ */
+enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
+struct exofs_fscb {
+	__le64  s_nextid;	/* Only used after mkfs */
+	__le64  s_numfiles;	/* Only used after mkfs */
+	__le32	s_version;	/* == EXOFS_FSCB_VER */
+	__le16  s_magic;	/* Magic signature */
+	__le16  s_newfs;	/* Non-zero if this is a new fs */
+
+	/* From here on it's a static part, only written by mkexofs */
+	__le64	s_dev_table_oid;   /* Resurved, not used */
+	__le64	s_dev_table_count; /* == 0 means no dev_table */
+} __packed;
+
+/*
+ * This struct is set on the FS partition's attributes.
+ * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
+ * with the create command, to atomically persist the sb writeable information.
+ */
+struct exofs_sb_stats {
+	__le64  s_nextid;	/* Highest object ID used */
+	__le64  s_numfiles;	/* Number of files on fs */
+} __packed;
+
+/*
+ * Describes the raid used in the FS. It is part of the device table.
+ * This here is taken from the pNFS-objects definition. In exofs we
+ * use one raid policy through-out the filesystem. (NOTE: the funny
+ * alignment at beginning. We take care of it at exofs_device_table.
+ */
+struct exofs_dt_data_map {
+	__le32	cb_num_comps;
+	__le64	cb_stripe_unit;
+	__le32	cb_group_width;
+	__le32	cb_group_depth;
+	__le32	cb_mirror_cnt;
+	__le32	cb_raid_algorithm;
+} __packed;
+
+/*
+ * This is an osd device information descriptor. It is a single entry in
+ * the exofs device table. It describes an osd target lun which
+ * contains data belonging to this FS. (Same partition_id on all devices)
+ */
+struct exofs_dt_device_info {
+	__le32	systemid_len;
+	u8	systemid[OSD_SYSTEMID_LEN];
+	__le64	long_name_offset;	/* If !0 then offset-in-file */
+	__le32	osdname_len;		/* */
+	u8	osdname[44];		/* Embbeded, Usually an asci uuid */
+} __packed;
+
+/*
+ * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
+ * It contains the raid used for this multy-device FS and an array of
+ * participating devices.
+ */
+struct exofs_device_table {
+	__le32				dt_version;	/* == EXOFS_DT_VER */
+	struct exofs_dt_data_map	dt_data_map;	/* Raid policy to use */
+
+	/* Resurved space For future use. Total includeing this:
+	 * (8 * sizeof(le64))
+	 */
+	__le64				__Resurved[4];
+
+	__le64				dt_num_devices;	/* Array size */
+	struct exofs_dt_device_info	dt_dev_table[];	/* Array of devices */
+} __packed;
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA		5
+
+/*
+ * The file control block - stored in an object's attributes.  This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+	__le64  i_size;			/* Size of the file */
+	__le16  i_mode;         	/* File mode */
+	__le16  i_links_count;  	/* Links count */
+	__le32  i_uid;          	/* Owner Uid */
+	__le32  i_gid;          	/* Group Id */
+	__le32  i_atime;        	/* Access time */
+	__le32  i_ctime;        	/* Creation time */
+	__le32  i_mtime;        	/* Modification time */
+	__le32  i_flags;        	/* File flags (unused for now)*/
+	__le32  i_generation;   	/* File version (for NFS) */
+	__le32  i_data[EXOFS_IDATA];	/* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE	sizeof(struct exofs_fcb)
+
+/* This is the Attribute the fcb is stored in */
+static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_DATA,
+	EXOFS_INO_ATTR_SIZE);
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN	255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+	__le64		inode_no;		/* inode number           */
+	__le16		rec_len;		/* directory entry length */
+	u8		name_len;		/* name length            */
+	u8		file_type;		/* umm...file type        */
+	char		name[EXOFS_NAME_LEN];	/* file name              */
+};
+
+enum {
+	EXOFS_FT_UNKNOWN,
+	EXOFS_FT_REG_FILE,
+	EXOFS_FT_DIR,
+	EXOFS_FT_CHRDEV,
+	EXOFS_FT_BLKDEV,
+	EXOFS_FT_FIFO,
+	EXOFS_FT_SOCK,
+	EXOFS_FT_SYMLINK,
+	EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD			4
+#define EXOFS_DIR_ROUND			(EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
+	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+/*
+ * The on-disk (optional) layout structure.
+ * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
+ * attribute, attached to any inode, usually to a directory.
+ */
+
+enum exofs_inode_layout_gen_functions {
+	LAYOUT_MOVING_WINDOW = 0,
+	LAYOUT_IMPLICT = 1,
+};
+
+struct exofs_on_disk_inode_layout {
+	__le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
+	__le16 pad;
+	union {
+		/* gen_func == LAYOUT_MOVING_WINDOW (default) */
+		struct exofs_layout_sliding_window {
+			__le32 num_devices; /* first n devices in global-table*/
+		} sliding_window __packed;
+
+		/* gen_func == LAYOUT_IMPLICT */
+		struct exofs_layout_implict_list {
+			struct exofs_dt_data_map data_map;
+			/* Variable array of size data_map.cb_num_comps. These
+			 * are device indexes of the devices in the global table
+			 */
+			__le32 dev_indexes[];
+		} implict __packed;
+	};
+} __packed;
+
+static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
+{
+	return sizeof(struct exofs_on_disk_inode_layout) +
+		max_devs * sizeof(__le32);
+}
+
+#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/kernel/fs/exofs/dir.c b/kernel/fs/exofs/dir.c
new file mode 100644
index 000000000..4deb0b05b
--- /dev/null
+++ b/kernel/fs/exofs/dir.c
@@ -0,0 +1,667 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline unsigned exofs_chunk_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;
+}
+
+static inline void exofs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+/* Accesses dir's inode->i_size must be called under inode lock */
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	loff_t last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
+}
+
+static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	int err = 0;
+
+	dir->i_version++;
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	if (pos+len > dir->i_size) {
+		i_size_write(dir, pos+len);
+		mark_inode_dirty(dir);
+	}
+	set_page_dirty(page);
+
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+
+	return err;
+}
+
+static void exofs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	unsigned chunk_size = exofs_chunk_size(dir);
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct exofs_dir_entry *p;
+	char *error;
+
+	/* if the page is the last one in the directory */
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (chunk_size - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
+	}
+	for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct exofs_dir_entry *)(kaddr + offs);
+		rec_len = le16_to_cpu(p->rec_len);
+
+		if (rec_len < EXOFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+			goto Espan;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+Ebadsize:
+	EXOFS_ERR("ERROR [exofs_check_page]: "
+		"size of directory(0x%lx) is not a multiple of chunk size\n",
+		dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+bad_entry:
+	EXOFS_ERR(
+		"ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
+		"offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
+		dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		_LLU(le64_to_cpu(p->inode_no)),
+		rec_len, p->name_len);
+	goto fail;
+Eend:
+	p = (struct exofs_dir_entry *)(kaddr + offs);
+	EXOFS_ERR("ERROR [exofs_check_page]: "
+		"entry in directory(0x%lx) spans the page boundary"
+		"offset=%lu, inode=0x%llx\n",
+		dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		_LLU(le64_to_cpu(p->inode_no)));
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
+
+static struct page *exofs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageChecked(page))
+			exofs_check_page(page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	exofs_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+static inline int exofs_match(int len, const unsigned char *name,
+					struct exofs_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode_no)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+static inline
+struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
+{
+	return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+}
+
+static inline unsigned
+exofs_validate_entry(char *base, unsigned offset, unsigned mask)
+{
+	struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
+	struct exofs_dir_entry *p =
+			(struct exofs_dir_entry *)(base + (offset&mask));
+	while ((char *)p < (char *)de) {
+		if (p->rec_len == 0)
+			break;
+		p = exofs_next_entry(p);
+	}
+	return (char *)p - base;
+}
+
+static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
+	[EXOFS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[EXOFS_FT_REG_FILE]	= DT_REG,
+	[EXOFS_FT_DIR]		= DT_DIR,
+	[EXOFS_FT_CHRDEV]	= DT_CHR,
+	[EXOFS_FT_BLKDEV]	= DT_BLK,
+	[EXOFS_FT_FIFO]		= DT_FIFO,
+	[EXOFS_FT_SOCK]		= DT_SOCK,
+	[EXOFS_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXOFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXOFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXOFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXOFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXOFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXOFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXOFS_FT_SYMLINK,
+};
+
+static inline
+void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
+{
+	umode_t mode = inode->i_mode;
+	de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static int
+exofs_readdir(struct file *file, struct dir_context *ctx)
+{
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = dir_pages(inode);
+	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
+	int need_revalidate = (file->f_version != inode->i_version);
+
+	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
+		return 0;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct exofs_dir_entry *de;
+		struct page *page = exofs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
+				  inode->i_ino);
+			ctx->pos += PAGE_CACHE_SIZE - offset;
+			return PTR_ERR(page);
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = exofs_validate_entry(kaddr, offset,
+								chunk_mask);
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			file->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (struct exofs_dir_entry *)(kaddr + offset);
+		limit = kaddr + exofs_last_byte(inode, n) -
+							EXOFS_DIR_REC_LEN(1);
+		for (; (char *)de <= limit; de = exofs_next_entry(de)) {
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: "
+				     "zero-length entry in directory(0x%lx)\n",
+				     inode->i_ino);
+				exofs_put_page(page);
+				return -EIO;
+			}
+			if (de->inode_no) {
+				unsigned char t;
+
+				if (de->file_type < EXOFS_FT_MAX)
+					t = exofs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
+
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le64_to_cpu(de->inode_no),
+						t)) {
+					exofs_put_page(page);
+					return 0;
+				}
+			}
+			ctx->pos += le16_to_cpu(de->rec_len);
+		}
+		exofs_put_page(page);
+	}
+	return 0;
+}
+
+struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
+			struct dentry *dentry, struct page **res_page)
+{
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	struct exofs_i_info *oi = exofs_i(dir);
+	struct exofs_dir_entry *de;
+
+	if (npages == 0)
+		goto out;
+
+	*res_page = NULL;
+
+	start = oi->i_dir_start_lookup;
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = exofs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct exofs_dir_entry *) kaddr;
+			kaddr += exofs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->rec_len == 0) {
+					EXOFS_ERR("ERROR: zero-length entry in "
+						  "directory(0x%lx)\n",
+						  dir->i_ino);
+					exofs_put_page(page);
+					goto out;
+				}
+				if (exofs_match(namelen, name, de))
+					goto found;
+				de = exofs_next_entry(de);
+			}
+			exofs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	oi->i_dir_start_lookup = n;
+	return de;
+}
+
+struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
+{
+	struct page *page = exofs_get_page(dir, 0);
+	struct exofs_dir_entry *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = exofs_next_entry(
+				(struct exofs_dir_entry *)page_address(page));
+		*p = page;
+	}
+	return de;
+}
+
+ino_t exofs_parent_ino(struct dentry *child)
+{
+	struct page *page;
+	struct exofs_dir_entry *de;
+	ino_t ino;
+
+	de = exofs_dotdot(d_inode(child), &page);
+	if (!de)
+		return 0;
+
+	ino = le64_to_cpu(de->inode_no);
+	exofs_put_page(page);
+	return ino;
+}
+
+ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct exofs_dir_entry *de;
+	struct page *page;
+
+	de = exofs_find_entry(dir, dentry, &page);
+	if (de) {
+		res = le64_to_cpu(de->inode_no);
+		exofs_put_page(page);
+	}
+	return res;
+}
+
+int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
+			struct page *page, struct inode *inode)
+{
+	loff_t pos = page_offset(page) +
+			(char *) de - (char *) page_address(page);
+	unsigned len = le16_to_cpu(de->rec_len);
+	int err;
+
+	lock_page(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, len,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	if (err)
+		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
+			  err);
+
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+	if (likely(!err))
+		err = exofs_commit_chunk(page, pos, len);
+	exofs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+	return err;
+}
+
+int exofs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned chunk_size = exofs_chunk_size(dir);
+	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct exofs_dir_entry *de;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	loff_t pos;
+	int err;
+
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = exofs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + exofs_last_byte(dir, n);
+		de = (struct exofs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				name_len = 0;
+				rec_len = chunk_size;
+				de->rec_len = cpu_to_le16(chunk_size);
+				de->inode_no = 0;
+				goto got_it;
+			}
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: exofs_add_link: "
+				      "zero-length entry in directory(0x%lx)\n",
+				      inode->i_ino);
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (exofs_match(namelen, name, de))
+				goto out_unlock;
+			name_len = EXOFS_DIR_REC_LEN(de->name_len);
+			rec_len = le16_to_cpu(de->rec_len);
+			if (!de->inode_no && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct exofs_dir_entry *) ((char *) de + rec_len);
+		}
+		unlock_page(page);
+		exofs_put_page(page);
+	}
+
+	EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
+		  dentry, inode->i_ino);
+	return -EINVAL;
+
+got_it:
+	pos = page_offset(page) +
+		(char *)de - (char *)page_address(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
+							&page, NULL);
+	if (err)
+		goto out_unlock;
+	if (de->inode_no) {
+		struct exofs_dir_entry *de1 =
+			(struct exofs_dir_entry *)((char *)de + name_len);
+		de1->rec_len = cpu_to_le16(rec_len - name_len);
+		de->rec_len = cpu_to_le16(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+	err = exofs_commit_chunk(page, pos, rec_len);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+	sbi->s_numfiles++;
+
+out_put:
+	exofs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	char *kaddr = page_address(page);
+	unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
+	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	loff_t pos;
+	struct exofs_dir_entry *pde = NULL;
+	struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
+	int err;
+
+	while (de < dir) {
+		if (de->rec_len == 0) {
+			EXOFS_ERR("ERROR: exofs_delete_entry:"
+				  "zero-length entry in directory(0x%lx)\n",
+				  inode->i_ino);
+			err = -EIO;
+			goto out;
+		}
+		pde = de;
+		de = exofs_next_entry(de);
+	}
+	if (pde)
+		from = (char *)pde - (char *)page_address(page);
+	pos = page_offset(page) + from;
+	lock_page(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
+							&page, NULL);
+	if (err)
+		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
+			  err);
+	if (pde)
+		pde->rec_len = cpu_to_le16(to - from);
+	dir->inode_no = 0;
+	if (likely(!err))
+		err = exofs_commit_chunk(page, pos, to - from);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	sbi->s_numfiles--;
+out:
+	exofs_put_page(page);
+	return err;
+}
+
+/* kept aligned on 4 bytes */
+#define THIS_DIR ".\0\0"
+#define PARENT_DIR "..\0"
+
+int exofs_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
+	unsigned chunk_size = exofs_chunk_size(inode);
+	struct exofs_dir_entry *de;
+	int err;
+	void *kaddr;
+
+	if (!page)
+		return -ENOMEM;
+
+	err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
+							&page, NULL);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+	kaddr = kmap_atomic(page);
+	de = (struct exofs_dir_entry *)kaddr;
+	de->name_len = 1;
+	de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
+	memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+
+	de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
+	de->inode_no = cpu_to_le64(parent->i_ino);
+	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
+	exofs_set_de_type(de, inode);
+	kunmap_atomic(kaddr);
+	err = exofs_commit_chunk(page, 0, chunk_size);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+int exofs_empty_dir(struct inode *inode)
+{
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct exofs_dir_entry *de;
+		page = exofs_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct exofs_dir_entry *)kaddr;
+		kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: exofs_empty_dir: "
+					  "zero-length directory entry"
+					  "kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
+			}
+			if (de->inode_no != 0) {
+				/* check for . and .. */
+				if (de->name[0] != '.')
+					goto not_empty;
+				if (de->name_len > 2)
+					goto not_empty;
+				if (de->name_len < 2) {
+					if (le64_to_cpu(de->inode_no) !=
+					    inode->i_ino)
+						goto not_empty;
+				} else if (de->name[1] != '.')
+					goto not_empty;
+			}
+			de = exofs_next_entry(de);
+		}
+		exofs_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	exofs_put_page(page);
+	return 0;
+}
+
+const struct file_operations exofs_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= exofs_readdir,
+};
diff --git a/kernel/fs/exofs/exofs.h b/kernel/fs/exofs/exofs.h
new file mode 100644
index 000000000..ad9cac670
--- /dev/null
+++ b/kernel/fs/exofs/exofs.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/backing-dev.h>
+#include <scsi/osd_ore.h>
+
+#include "common.h"
+
+#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define EXOFS_DBGMSG(fmt, a...) \
+	printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define EXOFS_DBGMSG(fmt, a...) \
+	do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+struct exofs_dev {
+	struct ore_dev ored;
+	unsigned did;
+	unsigned urilen;
+	uint8_t *uri;
+	struct kobject ed_kobj;
+};
+/*
+ * our extension to the in-memory superblock
+ */
+struct exofs_sb_info {
+	struct backing_dev_info bdi;		/* register our bdi with VFS  */
+	struct exofs_sb_stats s_ess;		/* Written often, pre-allocate*/
+	int		s_timeout;		/* timeout for OSD operations */
+	uint64_t	s_nextid;		/* highest object ID used     */
+	uint32_t	s_numfiles;		/* number of files on fs      */
+	spinlock_t	s_next_gen_lock;	/* spinlock for gen # update  */
+	u32		s_next_generation;	/* next gen # to use          */
+	atomic_t	s_curr_pending;		/* number of pending commands */
+
+	struct ore_layout	layout;		/* Default files layout       */
+	struct ore_comp one_comp;		/* id & cred of partition id=0*/
+	struct ore_components oc;		/* comps for the partition    */
+	struct kobject	s_kobj;			/* holds per-sbi kobject      */
+};
+
+/*
+ * our extension to the in-memory inode
+ */
+struct exofs_i_info {
+	struct inode   vfs_inode;          /* normal in-memory inode          */
+	wait_queue_head_t i_wq;            /* wait queue for inode            */
+	unsigned long  i_flags;            /* various atomic flags            */
+	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
+	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
+	uint64_t       i_commit_size;      /* the object's written length     */
+	struct ore_comp one_comp;	   /* same component for all devices  */
+	struct ore_components oc;	   /* inode view of the device table  */
+};
+
+static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
+{
+	return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
+}
+
+/*
+ * our inode flags
+ */
+#define OBJ_2BCREATED	0	/* object will be created soon*/
+#define OBJ_CREATED	1	/* object has been created on the osd*/
+
+static inline int obj_2bcreated(struct exofs_i_info *oi)
+{
+	return test_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline void set_obj_2bcreated(struct exofs_i_info *oi)
+{
+	set_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline int obj_created(struct exofs_i_info *oi)
+{
+	return test_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+static inline void set_obj_created(struct exofs_i_info *oi)
+{
+	set_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi);
+static inline int wait_obj_created(struct exofs_i_info *oi)
+{
+	if (likely(obj_created(oi)))
+		return 0;
+
+	return __exofs_wait_obj_created(oi);
+}
+
+/*
+ * get to our inode from the vfs inode
+ */
+static inline struct exofs_i_info *exofs_i(struct inode *inode)
+{
+	return container_of(inode, struct exofs_i_info, vfs_inode);
+}
+
+/*
+ * Maximum count of links to a file
+ */
+#define EXOFS_LINK_MAX           32000
+
+/*************************
+ * function declarations *
+ *************************/
+
+/* inode.c               */
+unsigned exofs_max_io_pages(struct ore_layout *layout,
+			    unsigned expected_pages);
+int exofs_setattr(struct dentry *, struct iattr *);
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata);
+extern struct inode *exofs_iget(struct super_block *, unsigned long);
+struct inode *exofs_new_inode(struct inode *, umode_t);
+extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
+extern void exofs_evict_inode(struct inode *);
+
+/* dir.c:                */
+int exofs_add_link(struct dentry *, struct inode *);
+ino_t exofs_inode_by_name(struct inode *, struct dentry *);
+int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
+int exofs_make_empty(struct inode *, struct inode *);
+struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
+					 struct page **);
+int exofs_empty_dir(struct inode *);
+struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
+ino_t exofs_parent_ino(struct dentry *child);
+int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
+		    struct inode *);
+
+/* super.c               */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+			   const struct osd_obj_id *obj);
+int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
+
+/* sys.c                 */
+int exofs_sysfs_init(void);
+void exofs_sysfs_uninit(void);
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+		       struct exofs_dt_device_info *dt_dev);
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
+int exofs_sysfs_odev_add(struct exofs_dev *edev,
+			 struct exofs_sb_info *sbi);
+void exofs_sysfs_dbg_print(void);
+
+/*********************
+ * operation vectors *
+ *********************/
+/* dir.c:            */
+extern const struct file_operations exofs_dir_operations;
+
+/* file.c            */
+extern const struct inode_operations exofs_file_inode_operations;
+extern const struct file_operations exofs_file_operations;
+
+/* inode.c           */
+extern const struct address_space_operations exofs_aops;
+
+/* namei.c           */
+extern const struct inode_operations exofs_dir_inode_operations;
+extern const struct inode_operations exofs_special_inode_operations;
+
+/* symlink.c         */
+extern const struct inode_operations exofs_symlink_inode_operations;
+extern const struct inode_operations exofs_fast_symlink_inode_operations;
+
+/* exofs_init_comps will initialize an ore_components device array
+ * pointing to a single ore_comp struct, and a round-robin view
+ * of the device table.
+ * The first device of each inode is the [inode->ino % num_devices]
+ * and the rest of the devices sequentially following where the
+ * first device is after the last device.
+ * It is assumed that the global device array at @sbi is twice
+ * bigger and that the device table repeats twice.
+ * See: exofs_read_lookup_dev_table()
+ */
+static inline void exofs_init_comps(struct ore_components *oc,
+				    struct ore_comp *one_comp,
+				    struct exofs_sb_info *sbi, osd_id oid)
+{
+	unsigned dev_mod = (unsigned)oid, first_dev;
+
+	one_comp->obj.partition = sbi->one_comp.obj.partition;
+	one_comp->obj.id = oid;
+	exofs_make_credential(one_comp->cred, &one_comp->obj);
+
+	oc->first_dev = 0;
+	oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
+							sbi->layout.group_count;
+	oc->single_comp = EC_SINGLE_COMP;
+	oc->comps = one_comp;
+
+	/* Round robin device view of the table */
+	first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
+	oc->ods = &sbi->oc.ods[first_dev];
+}
+
+#endif
diff --git a/kernel/fs/exofs/file.c b/kernel/fs/exofs/file.c
new file mode 100644
index 000000000..906de66e8
--- /dev/null
+++ b/kernel/fs/exofs/file.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "exofs.h"
+
+static int exofs_release_file(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+/* exofs_file_fsync - flush the inode to disk
+ *
+ *   Note, in exofs all metadata is written as part of inode, regardless.
+ *   The writeout is synchronous
+ */
+static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			    int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int ret;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = sync_inode_metadata(filp->f_mapping->host, 1);
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+static int exofs_flush(struct file *file, fl_owner_t id)
+{
+	int ret = vfs_fsync(file, 0);
+	/* TODO: Flush the OSD target */
+	return ret;
+}
+
+const struct file_operations exofs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.open		= generic_file_open,
+	.release	= exofs_release_file,
+	.fsync		= exofs_file_fsync,
+	.flush		= exofs_flush,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+};
+
+const struct inode_operations exofs_file_inode_operations = {
+	.setattr	= exofs_setattr,
+};
diff --git a/kernel/fs/exofs/inode.c b/kernel/fs/exofs/inode.c
new file mode 100644
index 000000000..786e4cc8c
--- /dev/null
+++ b/kernel/fs/exofs/inode.c
@@ -0,0 +1,1524 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/slab.h>
+
+#include "exofs.h"
+
+#define EXOFS_DBGMSG2(M...) do {} while (0)
+
+unsigned exofs_max_io_pages(struct ore_layout *layout,
+			    unsigned expected_pages)
+{
+	unsigned pages = min_t(unsigned, expected_pages,
+			       layout->max_io_length / PAGE_SIZE);
+
+	return pages;
+}
+
+struct page_collect {
+	struct exofs_sb_info *sbi;
+	struct inode *inode;
+	unsigned expected_pages;
+	struct ore_io_state *ios;
+
+	struct page **pages;
+	unsigned alloc_pages;
+	unsigned nr_pages;
+	unsigned long length;
+	loff_t pg_first; /* keep 64bit also in 32-arches */
+	bool read_4_write; /* This means two things: that the read is sync
+			    * And the pages should not be unlocked.
+			    */
+	struct page *that_locked_page;
+};
+
+static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
+		       struct inode *inode)
+{
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+
+	pcol->sbi = sbi;
+	pcol->inode = inode;
+	pcol->expected_pages = expected_pages;
+
+	pcol->ios = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
+	pcol->nr_pages = 0;
+	pcol->length = 0;
+	pcol->pg_first = -1;
+	pcol->read_4_write = false;
+	pcol->that_locked_page = NULL;
+}
+
+static void _pcol_reset(struct page_collect *pcol)
+{
+	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
+
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
+	pcol->nr_pages = 0;
+	pcol->length = 0;
+	pcol->pg_first = -1;
+	pcol->ios = NULL;
+	pcol->that_locked_page = NULL;
+
+	/* this is probably the end of the loop but in writes
+	 * it might not end here. don't be left with nothing
+	 */
+	if (!pcol->expected_pages)
+		pcol->expected_pages =
+				exofs_max_io_pages(&pcol->sbi->layout, ~0);
+}
+
+static int pcol_try_alloc(struct page_collect *pcol)
+{
+	unsigned pages;
+
+	/* TODO: easily support bio chaining */
+	pages =  exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
+
+	for (; pages; pages >>= 1) {
+		pcol->pages = kmalloc(pages * sizeof(struct page *),
+				      GFP_KERNEL);
+		if (likely(pcol->pages)) {
+			pcol->alloc_pages = pages;
+			return 0;
+		}
+	}
+
+	EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
+		  pcol->expected_pages);
+	return -ENOMEM;
+}
+
+static void pcol_free(struct page_collect *pcol)
+{
+	kfree(pcol->pages);
+	pcol->pages = NULL;
+
+	if (pcol->ios) {
+		ore_put_io_state(pcol->ios);
+		pcol->ios = NULL;
+	}
+}
+
+static int pcol_add_page(struct page_collect *pcol, struct page *page,
+			 unsigned len)
+{
+	if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
+		return -ENOMEM;
+
+	pcol->pages[pcol->nr_pages++] = page;
+	pcol->length += len;
+	return 0;
+}
+
+enum {PAGE_WAS_NOT_IN_IO = 17};
+static int update_read_page(struct page *page, int ret)
+{
+	switch (ret) {
+	case 0:
+		/* Everything is OK */
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		break;
+	case -EFAULT:
+		/* In this case we were trying to read something that wasn't on
+		 * disk yet - return a page full of zeroes.  This should be OK,
+		 * because the object should be empty (if there was a write
+		 * before this read, the read would be waiting with the page
+		 * locked */
+		clear_highpage(page);
+
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		EXOFS_DBGMSG("recovered read error\n");
+		/* fall through */
+	case PAGE_WAS_NOT_IN_IO:
+		ret = 0; /* recovered error */
+		break;
+	default:
+		SetPageError(page);
+	}
+	return ret;
+}
+
+static void update_write_page(struct page *page, int ret)
+{
+	if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
+		return; /* don't pass start don't collect $200 */
+
+	if (ret) {
+		mapping_set_error(page->mapping, ret);
+		SetPageError(page);
+	}
+	end_page_writeback(page);
+}
+
+/* Called at the end of reads, to optionally unlock pages and update their
+ * status.
+ */
+static int __readpages_done(struct page_collect *pcol)
+{
+	int i;
+	u64 good_bytes;
+	u64 length = 0;
+	int ret = ore_check_io(pcol->ios, NULL);
+
+	if (likely(!ret)) {
+		good_bytes = pcol->length;
+		ret = PAGE_WAS_NOT_IN_IO;
+	} else {
+		good_bytes = 0;
+	}
+
+	EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
+		     " length=0x%lx nr_pages=%u\n",
+		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+		     pcol->nr_pages);
+
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
+		struct inode *inode = page->mapping->host;
+		int page_stat;
+
+		if (inode != pcol->inode)
+			continue; /* osd might add more pages at end */
+
+		if (likely(length < good_bytes))
+			page_stat = 0;
+		else
+			page_stat = ret;
+
+		EXOFS_DBGMSG2("    readpages_done(0x%lx, 0x%lx) %s\n",
+			  inode->i_ino, page->index,
+			  page_stat ? "bad_bytes" : "good_bytes");
+
+		ret = update_read_page(page, page_stat);
+		if (!pcol->read_4_write)
+			unlock_page(page);
+		length += PAGE_SIZE;
+	}
+
+	pcol_free(pcol);
+	EXOFS_DBGMSG2("readpages_done END\n");
+	return ret;
+}
+
+/* callback of async reads */
+static void readpages_done(struct ore_io_state *ios, void *p)
+{
+	struct page_collect *pcol = p;
+
+	__readpages_done(pcol);
+	atomic_dec(&pcol->sbi->s_curr_pending);
+	kfree(pcol);
+}
+
+static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
+{
+	int i;
+
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
+
+		if (rw == READ)
+			update_read_page(page, ret);
+		else
+			update_write_page(page, ret);
+
+		unlock_page(page);
+	}
+}
+
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+	struct page_collect *pcol_src, struct page_collect *pcol)
+{
+	/* length was wrong or offset was not page aligned */
+	BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+	if (pcol_src->nr_pages > ios->nr_pages) {
+		struct page **src_page;
+		unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+		unsigned long len_less = pcol_src->length - ios->length;
+		unsigned i;
+		int ret;
+
+		/* This IO was trimmed */
+		pcol_src->nr_pages = ios->nr_pages;
+		pcol_src->length = ios->length;
+
+		/* Left over pages are passed to the next io */
+		pcol->expected_pages += pages_less;
+		pcol->nr_pages = pages_less;
+		pcol->length = len_less;
+		src_page = pcol_src->pages + pcol_src->nr_pages;
+		pcol->pg_first = (*src_page)->index;
+
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			return ret;
+
+		for (i = 0; i < pages_less; ++i)
+			pcol->pages[i] = *src_page++;
+
+		EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+			"pages_less=0x%x expected_pages=0x%x "
+			"next_offset=0x%llx next_len=0x%lx\n",
+			pcol_src->nr_pages, pages_less, pcol->expected_pages,
+			pcol->pg_first * PAGE_SIZE, pcol->length);
+	}
+	return 0;
+}
+
+static int read_exec(struct page_collect *pcol)
+{
+	struct exofs_i_info *oi = exofs_i(pcol->inode);
+	struct ore_io_state *ios;
+	struct page_collect *pcol_copy = NULL;
+	int ret;
+
+	if (!pcol->pages)
+		return 0;
+
+	if (!pcol->ios) {
+		int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
+					     pcol->pg_first << PAGE_CACHE_SHIFT,
+					     pcol->length, &pcol->ios);
+
+		if (ret)
+			return ret;
+	}
+
+	ios = pcol->ios;
+	ios->pages = pcol->pages;
+
+	if (pcol->read_4_write) {
+		ore_read(pcol->ios);
+		return __readpages_done(pcol);
+	}
+
+	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+	if (!pcol_copy) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	*pcol_copy = *pcol;
+	ios->done = readpages_done;
+	ios->private = pcol_copy;
+
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
+	ret = ore_read(ios);
+	if (unlikely(ret))
+		goto err;
+
+	atomic_inc(&pcol->sbi->s_curr_pending);
+
+	return 0;
+
+err:
+	if (!pcol_copy) /* Failed before ownership transfer */
+		pcol_copy = pcol;
+	_unlock_pcol_pages(pcol_copy, ret, READ);
+	pcol_free(pcol_copy);
+	kfree(pcol_copy);
+
+	return ret;
+}
+
+/* readpage_strip is called either directly from readpage() or by the VFS from
+ * within read_cache_pages(), to add one more page to be read. It will try to
+ * collect as many contiguous pages as posible. If a discontinuity is
+ * encountered, or it runs out of resources, it will submit the previous segment
+ * and will start a new collection. Eventually caller must submit the last
+ * segment if present.
+ */
+static int readpage_strip(void *data, struct page *page)
+{
+	struct page_collect *pcol = data;
+	struct inode *inode = pcol->inode;
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	size_t len;
+	int ret;
+
+	BUG_ON(!PageLocked(page));
+
+	/* FIXME: Just for debugging, will be removed */
+	if (PageUptodate(page))
+		EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
+			  page->index);
+
+	pcol->that_locked_page = page;
+
+	if (page->index < end_index)
+		len = PAGE_CACHE_SIZE;
+	else if (page->index == end_index)
+		len = i_size & ~PAGE_CACHE_MASK;
+	else
+		len = 0;
+
+	if (!len || !obj_created(oi)) {
+		/* this will be out of bounds, or doesn't exist yet.
+		 * Current page is cleared and the request is split
+		 */
+		clear_highpage(page);
+
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+
+		if (!pcol->read_4_write)
+			unlock_page(page);
+		EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
+			     "read_4_write=%d index=0x%lx end_index=0x%lx "
+			     "splitting\n", inode->i_ino, len,
+			     pcol->read_4_write, page->index, end_index);
+
+		return read_exec(pcol);
+	}
+
+try_again:
+
+	if (unlikely(pcol->pg_first == -1)) {
+		pcol->pg_first = page->index;
+	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+		   page->index)) {
+		/* Discontinuity detected, split the request */
+		ret = read_exec(pcol);
+		if (unlikely(ret))
+			goto fail;
+		goto try_again;
+	}
+
+	if (!pcol->pages) {
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			goto fail;
+	}
+
+	if (len != PAGE_CACHE_SIZE)
+		zero_user(page, len, PAGE_CACHE_SIZE - len);
+
+	EXOFS_DBGMSG2("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+		     inode->i_ino, page->index, len);
+
+	ret = pcol_add_page(pcol, page, len);
+	if (ret) {
+		EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
+			  "this_len=0x%zx nr_pages=%u length=0x%lx\n",
+			  page, len, pcol->nr_pages, pcol->length);
+
+		/* split the request, and start again with current page */
+		ret = read_exec(pcol);
+		if (unlikely(ret))
+			goto fail;
+
+		goto try_again;
+	}
+
+	return 0;
+
+fail:
+	/* SetPageError(page); ??? */
+	unlock_page(page);
+	return ret;
+}
+
+static int exofs_readpages(struct file *file, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, nr_pages, mapping->host);
+
+	ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
+	if (ret) {
+		EXOFS_ERR("read_cache_pages => %d\n", ret);
+		return ret;
+	}
+
+	ret = read_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
+	return read_exec(&pcol);
+}
+
+static int _readpage(struct page *page, bool read_4_write)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, 1, page->mapping->host);
+
+	pcol.read_4_write = read_4_write;
+	ret = readpage_strip(&pcol, page);
+	if (ret) {
+		EXOFS_ERR("_readpage => %d\n", ret);
+		return ret;
+	}
+
+	return read_exec(&pcol);
+}
+
+/*
+ * We don't need the file
+ */
+static int exofs_readpage(struct file *file, struct page *page)
+{
+	return _readpage(page, false);
+}
+
+/* Callback for osd_write. All writes are asynchronous */
+static void writepages_done(struct ore_io_state *ios, void *p)
+{
+	struct page_collect *pcol = p;
+	int i;
+	u64  good_bytes;
+	u64  length = 0;
+	int ret = ore_check_io(ios, NULL);
+
+	atomic_dec(&pcol->sbi->s_curr_pending);
+
+	if (likely(!ret)) {
+		good_bytes = pcol->length;
+		ret = PAGE_WAS_NOT_IN_IO;
+	} else {
+		good_bytes = 0;
+	}
+
+	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
+		     " length=0x%lx nr_pages=%u\n",
+		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+		     pcol->nr_pages);
+
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
+		struct inode *inode = page->mapping->host;
+		int page_stat;
+
+		if (inode != pcol->inode)
+			continue; /* osd might add more pages to a bio */
+
+		if (likely(length < good_bytes))
+			page_stat = 0;
+		else
+			page_stat = ret;
+
+		update_write_page(page, page_stat);
+		unlock_page(page);
+		EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+			     inode->i_ino, page->index, page_stat);
+
+		length += PAGE_SIZE;
+	}
+
+	pcol_free(pcol);
+	kfree(pcol);
+	EXOFS_DBGMSG2("writepages_done END\n");
+}
+
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+	struct page_collect *pcol = priv;
+	pgoff_t index = offset / PAGE_SIZE;
+
+	if (!pcol->that_locked_page ||
+	    (pcol->that_locked_page->index != index)) {
+		struct page *page;
+		loff_t i_size = i_size_read(pcol->inode);
+
+		if (offset >= i_size) {
+			*uptodate = true;
+			EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
+			return ZERO_PAGE(0);
+		}
+
+		page =  find_get_page(pcol->inode->i_mapping, index);
+		if (!page) {
+			page = find_or_create_page(pcol->inode->i_mapping,
+						   index, GFP_NOFS);
+			if (unlikely(!page)) {
+				EXOFS_DBGMSG("grab_cache_page Failed "
+					"index=0x%llx\n", _LLU(index));
+				return NULL;
+			}
+			unlock_page(page);
+		}
+		if (PageDirty(page) || PageWriteback(page))
+			*uptodate = true;
+		else
+			*uptodate = PageUptodate(page);
+		EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
+		return page;
+	} else {
+		EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
+			     pcol->that_locked_page->index);
+		*uptodate = true;
+		return pcol->that_locked_page;
+	}
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+	struct page_collect *pcol = priv;
+
+	if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
+		EXOFS_DBGMSG2("index=0x%lx\n", page->index);
+		page_cache_release(page);
+		return;
+	}
+	EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
+		     ZERO_PAGE(0) == page ? -1 : page->index);
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+	.get_page = &__r4w_get_page,
+	.put_page = &__r4w_put_page,
+};
+
+static int write_exec(struct page_collect *pcol)
+{
+	struct exofs_i_info *oi = exofs_i(pcol->inode);
+	struct ore_io_state *ios;
+	struct page_collect *pcol_copy = NULL;
+	int ret;
+
+	if (!pcol->pages)
+		return 0;
+
+	BUG_ON(pcol->ios);
+	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
+				 pcol->pg_first << PAGE_CACHE_SHIFT,
+				 pcol->length, &pcol->ios);
+	if (unlikely(ret))
+		goto err;
+
+	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+	if (!pcol_copy) {
+		EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	*pcol_copy = *pcol;
+
+	ios = pcol->ios;
+	ios->pages = pcol_copy->pages;
+	ios->done = writepages_done;
+	ios->r4w = &_r4w_op;
+	ios->private = pcol_copy;
+
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
+	ret = ore_write(ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("write_exec: ore_write() Failed\n");
+		goto err;
+	}
+
+	atomic_inc(&pcol->sbi->s_curr_pending);
+	return 0;
+
+err:
+	if (!pcol_copy) /* Failed before ownership transfer */
+		pcol_copy = pcol;
+	_unlock_pcol_pages(pcol_copy, ret, WRITE);
+	pcol_free(pcol_copy);
+	kfree(pcol_copy);
+
+	return ret;
+}
+
+/* writepage_strip is called either directly from writepage() or by the VFS from
+ * within write_cache_pages(), to add one more page to be written to storage.
+ * It will try to collect as many contiguous pages as possible. If a
+ * discontinuity is encountered or it runs out of resources it will submit the
+ * previous segment and will start a new collection.
+ * Eventually caller must submit the last segment if present.
+ */
+static int writepage_strip(struct page *page,
+			   struct writeback_control *wbc_unused, void *data)
+{
+	struct page_collect *pcol = data;
+	struct inode *inode = pcol->inode;
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	size_t len;
+	int ret;
+
+	BUG_ON(!PageLocked(page));
+
+	ret = wait_obj_created(oi);
+	if (unlikely(ret))
+		goto fail;
+
+	if (page->index < end_index)
+		/* in this case, the page is within the limits of the file */
+		len = PAGE_CACHE_SIZE;
+	else {
+		len = i_size & ~PAGE_CACHE_MASK;
+
+		if (page->index > end_index || !len) {
+			/* in this case, the page is outside the limits
+			 * (truncate in progress)
+			 */
+			ret = write_exec(pcol);
+			if (unlikely(ret))
+				goto fail;
+			if (PageError(page))
+				ClearPageError(page);
+			unlock_page(page);
+			EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
+				     "outside the limits\n",
+				     inode->i_ino, page->index);
+			return 0;
+		}
+	}
+
+try_again:
+
+	if (unlikely(pcol->pg_first == -1)) {
+		pcol->pg_first = page->index;
+	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+		   page->index)) {
+		/* Discontinuity detected, split the request */
+		ret = write_exec(pcol);
+		if (unlikely(ret))
+			goto fail;
+
+		EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
+			     inode->i_ino, page->index);
+		goto try_again;
+	}
+
+	if (!pcol->pages) {
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			goto fail;
+	}
+
+	EXOFS_DBGMSG2("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+		     inode->i_ino, page->index, len);
+
+	ret = pcol_add_page(pcol, page, len);
+	if (unlikely(ret)) {
+		EXOFS_DBGMSG2("Failed pcol_add_page "
+			     "nr_pages=%u total_length=0x%lx\n",
+			     pcol->nr_pages, pcol->length);
+
+		/* split the request, next loop will start again */
+		ret = write_exec(pcol);
+		if (unlikely(ret)) {
+			EXOFS_DBGMSG("write_exec failed => %d", ret);
+			goto fail;
+		}
+
+		goto try_again;
+	}
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	return 0;
+
+fail:
+	EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
+		     inode->i_ino, page->index, ret);
+	set_bit(AS_EIO, &page->mapping->flags);
+	unlock_page(page);
+	return ret;
+}
+
+static int exofs_writepages(struct address_space *mapping,
+		       struct writeback_control *wbc)
+{
+	struct page_collect pcol;
+	long start, end, expected_pages;
+	int ret;
+
+	start = wbc->range_start >> PAGE_CACHE_SHIFT;
+	end = (wbc->range_end == LLONG_MAX) ?
+			start + mapping->nrpages :
+			wbc->range_end >> PAGE_CACHE_SHIFT;
+
+	if (start || end)
+		expected_pages = end - start + 1;
+	else
+		expected_pages = mapping->nrpages;
+
+	if (expected_pages < 32L)
+		expected_pages = 32L;
+
+	EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+		     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
+		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
+		     mapping->nrpages, start, end, expected_pages);
+
+	_pcol_init(&pcol, expected_pages, mapping->host);
+
+	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
+	if (unlikely(ret)) {
+		EXOFS_ERR("write_cache_pages => %d\n", ret);
+		return ret;
+	}
+
+	ret = write_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		return write_exec(&pcol); /* pump the last reminder */
+	} else if (pcol.nr_pages) {
+		/* not SYNC let the reminder join the next writeout */
+		unsigned i;
+
+		for (i = 0; i < pcol.nr_pages; i++) {
+			struct page *page = pcol.pages[i];
+
+			end_page_writeback(page);
+			set_page_dirty(page);
+			unlock_page(page);
+		}
+	}
+	return 0;
+}
+
+/*
+static int exofs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, 1, page->mapping->host);
+
+	ret = writepage_strip(page, NULL, &pcol);
+	if (ret) {
+		EXOFS_ERR("exofs_writepage => %d\n", ret);
+		return ret;
+	}
+
+	return write_exec(&pcol);
+}
+*/
+/* i_mutex held using inode->i_size directly */
+static void _write_failed(struct inode *inode, loff_t to)
+{
+	if (to > inode->i_size)
+		truncate_pagecache(inode, inode->i_size);
+}
+
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	int ret = 0;
+	struct page *page;
+
+	page = *pagep;
+	if (page == NULL) {
+		ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
+					 fsdata);
+		if (ret) {
+			EXOFS_DBGMSG("simple_write_begin failed\n");
+			goto out;
+		}
+
+		page = *pagep;
+	}
+
+	 /* read modify write */
+	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+		loff_t i_size = i_size_read(mapping->host);
+		pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+		size_t rlen;
+
+		if (page->index < end_index)
+			rlen = PAGE_CACHE_SIZE;
+		else if (page->index == end_index)
+			rlen = i_size & ~PAGE_CACHE_MASK;
+		else
+			rlen = 0;
+
+		if (!rlen) {
+			clear_highpage(page);
+			SetPageUptodate(page);
+			goto out;
+		}
+
+		ret = _readpage(page, true);
+		if (ret) {
+			/*SetPageError was done by _readpage. Is it ok?*/
+			unlock_page(page);
+			EXOFS_DBGMSG("__readpage failed\n");
+		}
+	}
+out:
+	if (unlikely(ret))
+		_write_failed(mapping->host, pos + len);
+
+	return ret;
+}
+
+static int exofs_write_begin_export(struct file *file,
+		struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	*pagep = NULL;
+
+	return exofs_write_begin(file, mapping, pos, len, flags, pagep,
+					fsdata);
+}
+
+static int exofs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	/* According to comment in simple_write_end i_mutex is held */
+	loff_t i_size = inode->i_size;
+	int ret;
+
+	ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata);
+	if (unlikely(ret))
+		_write_failed(inode, pos + len);
+
+	/* TODO: once simple_write_end marks inode dirty remove */
+	if (i_size != inode->i_size)
+		mark_inode_dirty(inode);
+	return ret;
+}
+
+static int exofs_releasepage(struct page *page, gfp_t gfp)
+{
+	EXOFS_DBGMSG("page 0x%lx\n", page->index);
+	WARN_ON(1);
+	return 0;
+}
+
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
+{
+	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+		     page->index, offset, length);
+	WARN_ON(1);
+}
+
+
+ /* TODO: Should be easy enough to do proprly */
+static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			       loff_t offset)
+{
+	return 0;
+}
+
+const struct address_space_operations exofs_aops = {
+	.readpage	= exofs_readpage,
+	.readpages	= exofs_readpages,
+	.writepage	= NULL,
+	.writepages	= exofs_writepages,
+	.write_begin	= exofs_write_begin_export,
+	.write_end	= exofs_write_end,
+	.releasepage	= exofs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.invalidatepage = exofs_invalidatepage,
+
+	/* Not implemented Yet */
+	.bmap		= NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
+	.direct_IO	= exofs_direct_IO,
+
+	/* With these NULL has special meaning or default is not exported */
+	.migratepage	= NULL,
+	.launder_page	= NULL,
+	.is_partially_uptodate = NULL,
+	.error_remove_page = NULL,
+};
+
+/******************************************************************************
+ * INODE OPERATIONS
+ *****************************************************************************/
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int exofs_inode_is_fast_symlink(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+
+	return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
+}
+
+static int _do_truncate(struct inode *inode, loff_t newsize)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	int ret;
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
+	if (likely(!ret))
+		truncate_setsize(inode, newsize);
+
+	EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
+		     inode->i_ino, newsize, ret);
+	return ret;
+}
+
+/*
+ * Set inode attributes - update size attribute on OSD if needed,
+ *                        otherwise just call generic functions.
+ */
+int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	/* if we are about to modify an object, and it hasn't been
+	 * created yet, wait
+	 */
+	error = wait_obj_created(exofs_i(inode));
+	if (unlikely(error))
+		return error;
+
+	error = inode_change_ok(inode, iattr);
+	if (unlikely(error))
+		return error;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		error = _do_truncate(inode, iattr->ia_size);
+		if (unlikely(error))
+			return error;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_FILE_LAYOUT,
+	0);
+static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_DIR_LAYOUT,
+	0);
+
+/*
+ * Read the Linux inode info from the OSD, and return it as is. In exofs the
+ * inode info is in an application specific page/attribute of the osd-object.
+ */
+static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
+		    struct exofs_fcb *inode)
+{
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_attr attrs[] = {
+		[0] = g_attr_inode_data,
+		[1] = g_attr_inode_file_layout,
+		[2] = g_attr_inode_dir_layout,
+	};
+	struct ore_io_state *ios;
+	struct exofs_on_disk_inode_layout *layout;
+	int ret;
+
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
+		return ret;
+	}
+
+	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = ore_read(ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
+			  _LLU(oi->one_comp.obj.id), ret);
+		memset(inode, 0, sizeof(*inode));
+		inode->i_mode = 0040000 | (0777 & ~022);
+		/* If object is lost on target we might as well enable it's
+		 * delete.
+		 */
+		ret = 0;
+		goto out;
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
+		goto out;
+	}
+	WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
+	memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
+
+	ret = extract_attr_from_ios(ios, &attrs[1]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
+		goto out;
+	}
+	if (attrs[1].len) {
+		layout = attrs[1].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported files layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[2]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
+		goto out;
+	}
+	if (attrs[2].len) {
+		layout = attrs[2].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported meta-data layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
+
+out:
+	ore_put_io_state(ios);
+	return ret;
+}
+
+static void __oi_init(struct exofs_i_info *oi)
+{
+	init_waitqueue_head(&oi->i_wq);
+	oi->i_flags = 0;
+}
+/*
+ * Fill in an inode read from the OSD and set it up for use
+ */
+struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct exofs_i_info *oi;
+	struct exofs_fcb fcb;
+	struct inode *inode;
+	int ret;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	oi = exofs_i(inode);
+	__oi_init(oi);
+	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
+			 exofs_oi_objno(oi));
+
+	/* read the inode from the osd */
+	ret = exofs_get_inode(sb, oi, &fcb);
+	if (ret)
+		goto bad_inode;
+
+	set_obj_created(oi);
+
+	/* copy stuff from on-disk struct to in-memory struct */
+	inode->i_mode = le16_to_cpu(fcb.i_mode);
+	i_uid_write(inode, le32_to_cpu(fcb.i_uid));
+	i_gid_write(inode, le32_to_cpu(fcb.i_gid));
+	set_nlink(inode, le16_to_cpu(fcb.i_links_count));
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
+	inode->i_ctime.tv_nsec =
+		inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+	oi->i_commit_size = le64_to_cpu(fcb.i_size);
+	i_size_write(inode, oi->i_commit_size);
+	inode->i_blkbits = EXOFS_BLKSHIFT;
+	inode->i_generation = le32_to_cpu(fcb.i_generation);
+
+	oi->i_dir_start_lookup = 0;
+
+	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (fcb.i_data[0])
+			inode->i_rdev =
+				old_decode_dev(le32_to_cpu(fcb.i_data[0]));
+		else
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(fcb.i_data[1]));
+	} else {
+		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
+	}
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &exofs_file_inode_operations;
+		inode->i_fop = &exofs_file_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &exofs_dir_inode_operations;
+		inode->i_fop = &exofs_dir_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (exofs_inode_is_fast_symlink(inode))
+			inode->i_op = &exofs_fast_symlink_inode_operations;
+		else {
+			inode->i_op = &exofs_symlink_inode_operations;
+			inode->i_mapping->a_ops = &exofs_aops;
+		}
+	} else {
+		inode->i_op = &exofs_special_inode_operations;
+		if (fcb.i_data[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(fcb.i_data[0])));
+		else
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(fcb.i_data[1])));
+	}
+
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi)
+{
+	if (!obj_created(oi)) {
+		EXOFS_DBGMSG("!obj_created\n");
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+		EXOFS_DBGMSG("wait_event done\n");
+	}
+	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
+}
+
+/*
+ * Callback function from exofs_new_inode().  The important thing is that we
+ * set the obj_created flag so that other methods know that the object exists on
+ * the OSD.
+ */
+static void create_done(struct ore_io_state *ios, void *p)
+{
+	struct inode *inode = p;
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	int ret;
+
+	ret = ore_check_io(ios, NULL);
+	ore_put_io_state(ios);
+
+	atomic_dec(&sbi->s_curr_pending);
+
+	if (unlikely(ret)) {
+		EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
+			  _LLU(exofs_oi_objno(oi)),
+			  _LLU(oi->one_comp.obj.partition));
+		/*TODO: When FS is corrupted creation can fail, object already
+		 * exist. Get rid of this asynchronous creation, if exist
+		 * increment the obj counter and try the next object. Until we
+		 * succeed. All these dangling objects will be made into lost
+		 * files by chkfs.exofs
+		 */
+	}
+
+	set_obj_created(oi);
+
+	wake_up(&oi->i_wq);
+}
+
+/*
+ * Set up a new inode and create an object for it on the OSD
+ */
+struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct inode *inode;
+	struct exofs_i_info *oi;
+	struct ore_io_state *ios;
+	int ret;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	oi = exofs_i(inode);
+	__oi_init(oi);
+
+	set_obj_2bcreated(oi);
+
+	inode_init_owner(inode, dir, mode);
+	inode->i_ino = sbi->s_nextid++;
+	inode->i_blkbits = EXOFS_BLKSHIFT;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	oi->i_commit_size = inode->i_size = 0;
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+	insert_inode_hash(inode);
+
+	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
+			 exofs_oi_objno(oi));
+	exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
+
+	mark_inode_dirty(inode);
+
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
+		return ERR_PTR(ret);
+	}
+
+	ios->done = create_done;
+	ios->private = inode;
+
+	ret = ore_create(ios);
+	if (ret) {
+		ore_put_io_state(ios);
+		return ERR_PTR(ret);
+	}
+	atomic_inc(&sbi->s_curr_pending);
+
+	return inode;
+}
+
+/*
+ * struct to pass two arguments to update_inode's callback
+ */
+struct updatei_args {
+	struct exofs_sb_info	*sbi;
+	struct exofs_fcb	fcb;
+};
+
+/*
+ * Callback function from exofs_update_inode().
+ */
+static void updatei_done(struct ore_io_state *ios, void *p)
+{
+	struct updatei_args *args = p;
+
+	ore_put_io_state(ios);
+
+	atomic_dec(&args->sbi->s_curr_pending);
+
+	kfree(args);
+}
+
+/*
+ * Write the inode to the OSD.  Just fill up the struct, and set the attribute
+ * synchronously or asynchronously depending on the do_sync flag.
+ */
+static int exofs_update_inode(struct inode *inode, int do_sync)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct ore_io_state *ios;
+	struct osd_attr attr;
+	struct exofs_fcb *fcb;
+	struct updatei_args *args;
+	int ret;
+
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!args) {
+		EXOFS_DBGMSG("Failed kzalloc of args\n");
+		return -ENOMEM;
+	}
+
+	fcb = &args->fcb;
+
+	fcb->i_mode = cpu_to_le16(inode->i_mode);
+	fcb->i_uid = cpu_to_le32(i_uid_read(inode));
+	fcb->i_gid = cpu_to_le32(i_gid_read(inode));
+	fcb->i_links_count = cpu_to_le16(inode->i_nlink);
+	fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	oi->i_commit_size = i_size_read(inode);
+	fcb->i_size = cpu_to_le64(oi->i_commit_size);
+	fcb->i_generation = cpu_to_le32(inode->i_generation);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			fcb->i_data[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			fcb->i_data[1] = 0;
+		} else {
+			fcb->i_data[0] = 0;
+			fcb->i_data[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			fcb->i_data[2] = 0;
+		}
+	} else
+		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
+
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
+		goto free_args;
+	}
+
+	attr = g_attr_inode_data;
+	attr.val_ptr = fcb;
+	ios->out_attr_len = 1;
+	ios->out_attr = &attr;
+
+	wait_obj_created(oi);
+
+	if (!do_sync) {
+		args->sbi = sbi;
+		ios->done = updatei_done;
+		ios->private = args;
+	}
+
+	ret = ore_write(ios);
+	if (!do_sync && !ret) {
+		atomic_inc(&sbi->s_curr_pending);
+		goto out; /* deallocation in updatei_done */
+	}
+
+	ore_put_io_state(ios);
+free_args:
+	kfree(args);
+out:
+	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
+		     inode->i_ino, do_sync, ret);
+	return ret;
+}
+
+int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	/* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
+	return exofs_update_inode(inode, 1);
+}
+
+/*
+ * Callback function from exofs_delete_inode() - don't have much cleaning up to
+ * do.
+ */
+static void delete_done(struct ore_io_state *ios, void *p)
+{
+	struct exofs_sb_info *sbi = p;
+
+	ore_put_io_state(ios);
+
+	atomic_dec(&sbi->s_curr_pending);
+}
+
+/*
+ * Called when the refcount of an inode reaches zero.  We remove the object
+ * from the OSD here.  We make sure the object was created before we try and
+ * delete it.
+ */
+void exofs_evict_inode(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct ore_io_state *ios;
+	int ret;
+
+	truncate_inode_pages_final(&inode->i_data);
+
+	/* TODO: should do better here */
+	if (inode->i_nlink || is_bad_inode(inode))
+		goto no_delete;
+
+	inode->i_size = 0;
+	clear_inode(inode);
+
+	/* if we are deleting an obj that hasn't been created yet, wait.
+	 * This also makes sure that create_done cannot be called with an
+	 * already evicted inode.
+	 */
+	wait_obj_created(oi);
+	/* ignore the error, attempt a remove anyway */
+
+	/* Now Remove the OSD objects */
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
+		return;
+	}
+
+	ios->done = delete_done;
+	ios->private = sbi;
+
+	ret = ore_remove(ios);
+	if (ret) {
+		EXOFS_ERR("%s: ore_remove failed\n", __func__);
+		ore_put_io_state(ios);
+		return;
+	}
+	atomic_inc(&sbi->s_curr_pending);
+
+	return;
+
+no_delete:
+	clear_inode(inode);
+}
diff --git a/kernel/fs/exofs/namei.c b/kernel/fs/exofs/namei.c
new file mode 100644
index 000000000..5ae25e431
--- /dev/null
+++ b/kernel/fs/exofs/namei.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = exofs_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
+}
+
+static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct inode *inode;
+	ino_t ino;
+
+	if (dentry->d_name.len > EXOFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = exofs_inode_by_name(dir, dentry);
+	inode = ino ? exofs_iget(dir->i_sb, ino) : NULL;
+	return d_splice_alias(inode, dentry);
+}
+
+static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			 bool excl)
+{
+	struct inode *inode = exofs_new_inode(dir, mode);
+	int err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &exofs_file_inode_operations;
+		inode->i_fop = &exofs_file_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+		mark_inode_dirty(inode);
+		err = exofs_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		       dev_t rdev)
+{
+	struct inode *inode;
+	int err;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	inode = exofs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+		mark_inode_dirty(inode);
+		err = exofs_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int exofs_symlink(struct inode *dir, struct dentry *dentry,
+			  const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned l = strlen(symname)+1;
+	struct inode *inode;
+	struct exofs_i_info *oi;
+
+	if (l > sb->s_blocksize)
+		goto out;
+
+	inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	oi = exofs_i(inode);
+	if (l > sizeof(oi->i_data)) {
+		/* slow symlink */
+		inode->i_op = &exofs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+		memset(oi->i_data, 0, sizeof(oi->i_data));
+
+		err = page_symlink(inode, symname, l);
+		if (err)
+			goto out_fail;
+	} else {
+		/* fast symlink */
+		inode->i_op = &exofs_fast_symlink_inode_operations;
+		memcpy(oi->i_data, symname, l);
+		inode->i_size = l-1;
+	}
+	mark_inode_dirty(inode);
+
+	err = exofs_add_nondir(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	iput(inode);
+	goto out;
+}
+
+static int exofs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+
+	inode->i_ctime = CURRENT_TIME;
+	inode_inc_link_count(inode);
+	ihold(inode);
+
+	return exofs_add_nondir(dentry, inode);
+}
+
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	int err;
+
+	inode_inc_link_count(dir);
+
+	inode = exofs_new_inode(dir, S_IFDIR | mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	inode->i_op = &exofs_dir_inode_operations;
+	inode->i_fop = &exofs_dir_operations;
+	inode->i_mapping->a_ops = &exofs_aops;
+
+	inode_inc_link_count(inode);
+
+	err = exofs_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = exofs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	d_instantiate(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	inode_dec_link_count(inode);
+	iput(inode);
+out_dir:
+	inode_dec_link_count(dir);
+	goto out;
+}
+
+static int exofs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct exofs_dir_entry *de;
+	struct page *page;
+	int err = -ENOENT;
+
+	de = exofs_find_entry(dir, dentry, &page);
+	if (!de)
+		goto out;
+
+	err = exofs_delete_entry(de, page);
+	if (err)
+		goto out;
+
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+	err = 0;
+out:
+	return err;
+}
+
+static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	int err = -ENOTEMPTY;
+
+	if (exofs_empty_dir(inode)) {
+		err = exofs_unlink(dir, dentry);
+		if (!err) {
+			inode->i_size = 0;
+			inode_dec_link_count(inode);
+			inode_dec_link_count(dir);
+		}
+	}
+	return err;
+}
+
+static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct page *dir_page = NULL;
+	struct exofs_dir_entry *dir_de = NULL;
+	struct page *old_page;
+	struct exofs_dir_entry *old_de;
+	int err = -ENOENT;
+
+	old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = exofs_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page *new_page;
+		struct exofs_dir_entry *new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !exofs_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
+		if (!new_de)
+			goto out_dir;
+		err = exofs_set_link(new_dir, new_de, new_page, old_inode);
+		new_inode->i_ctime = CURRENT_TIME;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+		if (err)
+			goto out_dir;
+	} else {
+		err = exofs_add_link(new_dentry, old_inode);
+		if (err)
+			goto out_dir;
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+
+	exofs_delete_entry(old_de, old_page);
+	mark_inode_dirty(old_inode);
+
+	if (dir_de) {
+		err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
+		inode_dec_link_count(old_dir);
+		if (err)
+			goto out_dir;
+	}
+	return 0;
+
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	return err;
+}
+
+const struct inode_operations exofs_dir_inode_operations = {
+	.create 	= exofs_create,
+	.lookup 	= exofs_lookup,
+	.link   	= exofs_link,
+	.unlink 	= exofs_unlink,
+	.symlink	= exofs_symlink,
+	.mkdir  	= exofs_mkdir,
+	.rmdir  	= exofs_rmdir,
+	.mknod  	= exofs_mknod,
+	.rename 	= exofs_rename,
+	.setattr	= exofs_setattr,
+};
+
+const struct inode_operations exofs_special_inode_operations = {
+	.setattr	= exofs_setattr,
+};
diff --git a/kernel/fs/exofs/ore.c b/kernel/fs/exofs/ore.c
new file mode 100644
index 000000000..7bd8ac8df
--- /dev/null
+++ b/kernel/fs/exofs/ore.c
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <asm/div64.h>
+#include <linux/lcm.h>
+
+#include "ore_raid.h"
+
+MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
+MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
+MODULE_LICENSE("GPL");
+
+/* ore_verify_layout does a couple of things:
+ * 1. Given a minimum number of needed parameters fixes up the rest of the
+ *    members to be operatonals for the ore. The needed parameters are those
+ *    that are defined by the pnfs-objects layout STD.
+ * 2. Check to see if the current ore code actually supports these parameters
+ *    for example stripe_unit must be a multple of the system PAGE_SIZE,
+ *    and etc...
+ * 3. Cache some havily used calculations that will be needed by users.
+ */
+
+enum { BIO_MAX_PAGES_KMALLOC =
+		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
+
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
+{
+	u64 stripe_length;
+
+	switch (layout->raid_algorithm) {
+	case PNFS_OSD_RAID_0:
+		layout->parity = 0;
+		break;
+	case PNFS_OSD_RAID_5:
+		layout->parity = 1;
+		break;
+	case PNFS_OSD_RAID_PQ:
+		layout->parity = 2;
+		break;
+	case PNFS_OSD_RAID_4:
+	default:
+		ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
+			layout->raid_algorithm);
+		return -EINVAL;
+	}
+	if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
+		ORE_ERR("Stripe Unit(0x%llx)"
+			  " must be Multples of PAGE_SIZE(0x%lx)\n",
+			  _LLU(layout->stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+	if (layout->group_width) {
+		if (!layout->group_depth) {
+			ORE_ERR("group_depth == 0 && group_width != 0\n");
+			return -EINVAL;
+		}
+		if (total_comps < (layout->group_width * layout->mirrors_p1)) {
+			ORE_ERR("Data Map wrong, "
+				"numdevs=%d < group_width=%d * mirrors=%d\n",
+				total_comps, layout->group_width,
+				layout->mirrors_p1);
+			return -EINVAL;
+		}
+		layout->group_count = total_comps / layout->mirrors_p1 /
+						layout->group_width;
+	} else {
+		if (layout->group_depth) {
+			printk(KERN_NOTICE "Warning: group_depth ignored "
+				"group_width == 0 && group_depth == %lld\n",
+				_LLU(layout->group_depth));
+		}
+		layout->group_width = total_comps / layout->mirrors_p1;
+		layout->group_depth = -1;
+		layout->group_count = 1;
+	}
+
+	stripe_length = (u64)layout->group_width * layout->stripe_unit;
+	if (stripe_length >= (1ULL << 32)) {
+		ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
+			_LLU(stripe_length));
+		return -EINVAL;
+	}
+
+	layout->max_io_length =
+		(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
+					(layout->group_width - layout->parity);
+	if (layout->parity) {
+		unsigned stripe_length =
+				(layout->group_width - layout->parity) *
+				layout->stripe_unit;
+
+		layout->max_io_length /= stripe_length;
+		layout->max_io_length *= stripe_length;
+	}
+	ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
+
+	return 0;
+}
+EXPORT_SYMBOL(ore_verify_layout);
+
+static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
+{
+	return ios->oc->comps[index & ios->oc->single_comp].cred;
+}
+
+static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
+{
+	return &ios->oc->comps[index & ios->oc->single_comp].obj;
+}
+
+static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
+{
+	ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
+		    ios->oc->first_dev, ios->oc->numdevs, index,
+		    ios->oc->ods);
+
+	return ore_comp_dev(ios->oc, index);
+}
+
+int  _ore_get_io_state(struct ore_layout *layout,
+			struct ore_components *oc, unsigned numdevs,
+			unsigned sgs_per_dev, unsigned num_par_pages,
+			struct ore_io_state **pios)
+{
+	struct ore_io_state *ios;
+	struct page **pages;
+	struct osd_sg_entry *sgilist;
+	struct __alloc_all_io_state {
+		struct ore_io_state ios;
+		struct ore_per_dev_state per_dev[numdevs];
+		union {
+			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+			struct page *pages[num_par_pages];
+		};
+	} *_aios;
+
+	if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
+		_aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
+		if (unlikely(!_aios)) {
+			ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
+				   sizeof(*_aios));
+			*pios = NULL;
+			return -ENOMEM;
+		}
+		pages = num_par_pages ? _aios->pages : NULL;
+		sgilist = sgs_per_dev ? _aios->sglist : NULL;
+		ios = &_aios->ios;
+	} else {
+		struct __alloc_small_io_state {
+			struct ore_io_state ios;
+			struct ore_per_dev_state per_dev[numdevs];
+		} *_aio_small;
+		union __extra_part {
+			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+			struct page *pages[num_par_pages];
+		} *extra_part;
+
+		_aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
+		if (unlikely(!_aio_small)) {
+			ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
+				   sizeof(*_aio_small));
+			*pios = NULL;
+			return -ENOMEM;
+		}
+		extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
+		if (unlikely(!extra_part)) {
+			ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
+				   sizeof(*extra_part));
+			kfree(_aio_small);
+			*pios = NULL;
+			return -ENOMEM;
+		}
+
+		pages = num_par_pages ? extra_part->pages : NULL;
+		sgilist = sgs_per_dev ? extra_part->sglist : NULL;
+		/* In this case the per_dev[0].sgilist holds the pointer to
+		 * be freed
+		 */
+		ios = &_aio_small->ios;
+		ios->extra_part_alloc = true;
+	}
+
+	if (pages) {
+		ios->parity_pages = pages;
+		ios->max_par_pages = num_par_pages;
+	}
+	if (sgilist) {
+		unsigned d;
+
+		for (d = 0; d < numdevs; ++d) {
+			ios->per_dev[d].sglist = sgilist;
+			sgilist += sgs_per_dev;
+		}
+		ios->sgs_per_dev = sgs_per_dev;
+	}
+
+	ios->layout = layout;
+	ios->oc = oc;
+	*pios = ios;
+	return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used becase it
+ * allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less then @length bytes do to alignmets
+ * and constrains as follows:
+ * - The IO cannot cross group boundary.
+ * - In raid5/6 The end of the IO must align at end of a stripe eg.
+ *   (@offset + @length) % strip_size == 0. Or the complete range is within a
+ *   single stripe.
+ * - Memory condition only permitted a shorter IO. (A user can use @length=~0
+ *   And check the returned ios->length for max_io_size.)
+ *
+ * The caller must check returned ios->length (and/or ios->nr_pages) and
+ * re-issue these pages that fall outside of ios->length
+ */
+int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
+		      bool is_reading, u64 offset, u64 length,
+		      struct ore_io_state **pios)
+{
+	struct ore_io_state *ios;
+	unsigned numdevs = layout->group_width * layout->mirrors_p1;
+	unsigned sgs_per_dev = 0, max_par_pages = 0;
+	int ret;
+
+	if (layout->parity && length) {
+		unsigned data_devs = layout->group_width - layout->parity;
+		unsigned stripe_size = layout->stripe_unit * data_devs;
+		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+		u32 remainder;
+		u64 num_stripes;
+		u64 num_raid_units;
+
+		num_stripes = div_u64_rem(length, stripe_size, &remainder);
+		if (remainder)
+			++num_stripes;
+
+		num_raid_units =  num_stripes * layout->parity;
+
+		if (is_reading) {
+			/* For reads add per_dev sglist array */
+			/* TODO: Raid 6 we need twice more. Actually:
+			*         num_stripes / LCMdP(W,P);
+			*         if (W%P != 0) num_stripes *= parity;
+			*/
+
+			/* first/last seg is split */
+			num_raid_units += layout->group_width;
+			sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
+		} else {
+			/* For Writes add parity pages array. */
+			max_par_pages = num_raid_units * pages_in_unit *
+						sizeof(struct page *);
+		}
+	}
+
+	ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
+				pios);
+	if (unlikely(ret))
+		return ret;
+
+	ios = *pios;
+	ios->reading = is_reading;
+	ios->offset = offset;
+
+	if (length) {
+		ore_calc_stripe_info(layout, offset, length, &ios->si);
+		ios->length = ios->si.length;
+		ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
+				 ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+		if (layout->parity)
+			_ore_post_alloc_raid_stuff(ios);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ore_get_rw_state);
+
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wastful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
+int  ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
+		      struct ore_io_state **pios)
+{
+	return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
+}
+EXPORT_SYMBOL(ore_get_io_state);
+
+void ore_put_io_state(struct ore_io_state *ios)
+{
+	if (ios) {
+		unsigned i;
+
+		for (i = 0; i < ios->numdevs; i++) {
+			struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+
+			if (per_dev->or)
+				osd_end_request(per_dev->or);
+			if (per_dev->bio)
+				bio_put(per_dev->bio);
+		}
+
+		_ore_free_raid_stuff(ios);
+		kfree(ios);
+	}
+}
+EXPORT_SYMBOL(ore_put_io_state);
+
+static void _sync_done(struct ore_io_state *ios, void *p)
+{
+	struct completion *waiting = p;
+
+	complete(waiting);
+}
+
+static void _last_io(struct kref *kref)
+{
+	struct ore_io_state *ios = container_of(
+					kref, struct ore_io_state, kref);
+
+	ios->done(ios, ios->private);
+}
+
+static void _done_io(struct osd_request *or, void *p)
+{
+	struct ore_io_state *ios = p;
+
+	kref_put(&ios->kref, _last_io);
+}
+
+int ore_io_execute(struct ore_io_state *ios)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	bool sync = (ios->done == NULL);
+	int i, ret;
+
+	if (sync) {
+		ios->done = _sync_done;
+		ios->private = &wait;
+	}
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_request *or = ios->per_dev[i].or;
+		if (unlikely(!or))
+			continue;
+
+		ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
+		if (unlikely(ret)) {
+			ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
+				     ret);
+			return ret;
+		}
+	}
+
+	kref_init(&ios->kref);
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_request *or = ios->per_dev[i].or;
+		if (unlikely(!or))
+			continue;
+
+		kref_get(&ios->kref);
+		osd_execute_request_async(or, _done_io, ios);
+	}
+
+	kref_put(&ios->kref, _last_io);
+	ret = 0;
+
+	if (sync) {
+		wait_for_completion(&wait);
+		ret = ore_check_io(ios, NULL);
+	}
+	return ret;
+}
+
+static void _clear_bio(struct bio *bio)
+{
+	struct bio_vec *bv;
+	unsigned i;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		unsigned this_count = bv->bv_len;
+
+		if (likely(PAGE_SIZE == this_count))
+			clear_highpage(bv->bv_page);
+		else
+			zero_user(bv->bv_page, bv->bv_offset, this_count);
+	}
+}
+
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
+{
+	enum osd_err_priority acumulated_osd_err = 0;
+	int acumulated_lin_err = 0;
+	int i;
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_sense_info osi;
+		struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+		struct osd_request *or = per_dev->or;
+		int ret;
+
+		if (unlikely(!or))
+			continue;
+
+		ret = osd_req_decode_sense(or, &osi);
+		if (likely(!ret))
+			continue;
+
+		if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
+		    per_dev->bio) {
+			/* start read offset passed endof file.
+			 * Note: if we do not have bio it means read-attributes
+			 * In this case we should return error to caller.
+			 */
+			_clear_bio(per_dev->bio);
+			ORE_DBGMSG("start read offset passed end of file "
+				"offset=0x%llx, length=0x%llx\n",
+				_LLU(per_dev->offset),
+				_LLU(per_dev->length));
+
+			continue; /* we recovered */
+		}
+
+		if (on_dev_error) {
+			u64 residual = ios->reading ?
+					or->in.residual : or->out.residual;
+			u64 offset = (ios->offset + ios->length) - residual;
+			unsigned dev = per_dev->dev - ios->oc->first_dev;
+			struct ore_dev *od = ios->oc->ods[dev];
+
+			on_dev_error(ios, od, dev, osi.osd_err_pri,
+				     offset, residual);
+		}
+		if (osi.osd_err_pri >= acumulated_osd_err) {
+			acumulated_osd_err = osi.osd_err_pri;
+			acumulated_lin_err = ret;
+		}
+	}
+
+	return acumulated_lin_err;
+}
+EXPORT_SYMBOL(ore_check_io);
+
+/*
+ * L - logical offset into the file
+ *
+ * D - number of Data devices
+ *	D = group_width - parity
+ *
+ * U - The number of bytes in a stripe within a group
+ *	U =  stripe_unit * D
+ *
+ * T - The number of bytes striped within a group of component objects
+ *     (before advancing to the next group)
+ *	T = U * group_depth
+ *
+ * S - The number of bytes striped across all component objects
+ *     before the pattern repeats
+ *	S = T * group_count
+ *
+ * M - The "major" (i.e., across all components) cycle number
+ *	M = L / S
+ *
+ * G - Counts the groups from the beginning of the major cycle
+ *	G = (L - (M * S)) / T	[or (L % S) / T]
+ *
+ * H - The byte offset within the group
+ *	H = (L - (M * S)) % T	[or (L % S) % T]
+ *
+ * N - The "minor" (i.e., across the group) stripe number
+ *	N = H / U
+ *
+ * C - The component index coresponding to L
+ *
+ *	C = (H - (N * U)) / stripe_unit + G * D
+ *	[or (L % U) / stripe_unit + G * D]
+ *
+ * O - The component offset coresponding to L
+ *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ *
+ * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
+ *          divide by parity
+ *	LCMdP = lcm(group_width, parity) / parity
+ *
+ * R - The parity Rotation stripe
+ *     (Note parity cycle always starts at a group's boundary)
+ *	R = N % LCMdP
+ *
+ * I = the first parity device index
+ *	I = (group_width + group_width - R*parity - parity) % group_width
+ *
+ * Craid - The component index Rotated
+ *	Craid = (group_width + C - R*parity) % group_width
+ *      (We add the group_width to avoid negative numbers modulo math)
+ */
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+			  u64 length, struct ore_striping_info *si)
+{
+	u32	stripe_unit = layout->stripe_unit;
+	u32	group_width = layout->group_width;
+	u64	group_depth = layout->group_depth;
+	u32	parity      = layout->parity;
+
+	u32	D = group_width - parity;
+	u32	U = D * stripe_unit;
+	u64	T = U * group_depth;
+	u64	S = T * layout->group_count;
+	u64	M = div64_u64(file_offset, S);
+
+	/*
+	G = (L - (M * S)) / T
+	H = (L - (M * S)) % T
+	*/
+	u64	LmodS = file_offset - M * S;
+	u32	G = div64_u64(LmodS, T);
+	u64	H = LmodS - G * T;
+
+	u32	N = div_u64(H, U);
+	u32	Nlast;
+
+	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
+	u32	C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+	u32 first_dev = C - C % group_width;
+
+	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+
+	si->obj_offset = si->unit_off + (N * stripe_unit) +
+				  (M * group_depth * stripe_unit);
+	si->cur_comp = C - first_dev;
+	si->cur_pg = si->unit_off / PAGE_SIZE;
+
+	if (parity) {
+		u32 LCMdP = lcm(group_width, parity) / parity;
+		/* R     = N % LCMdP; */
+		u32 RxP   = (N % LCMdP) * parity;
+
+		si->par_dev = (group_width + group_width - parity - RxP) %
+			      group_width + first_dev;
+		si->dev = (group_width + group_width + C - RxP) %
+			  group_width + first_dev;
+		si->bytes_in_stripe = U;
+		si->first_stripe_start = M * S + G * T + N * U;
+	} else {
+		/* Make the math correct see _prepare_one_group */
+		si->par_dev = group_width;
+		si->dev = C;
+	}
+
+	si->dev *= layout->mirrors_p1;
+	si->par_dev *= layout->mirrors_p1;
+	si->offset = file_offset;
+	si->length = T - H;
+	if (si->length > length)
+		si->length = length;
+
+	Nlast = div_u64(H + si->length + U - 1, U);
+	si->maxdevUnits = Nlast - N;
+
+	si->M = M;
+}
+EXPORT_SYMBOL(ore_calc_stripe_info);
+
+int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
+			 unsigned pgbase, struct page **pages,
+			 struct ore_per_dev_state *per_dev, int cur_len)
+{
+	unsigned pg = *cur_pg;
+	struct request_queue *q =
+			osd_request_queue(_ios_od(ios, per_dev->dev));
+	unsigned len = cur_len;
+	int ret;
+
+	if (per_dev->bio == NULL) {
+		unsigned bio_size;
+
+		if (!ios->reading) {
+			bio_size = ios->si.maxdevUnits;
+		} else {
+			bio_size = (ios->si.maxdevUnits + 1) *
+			     (ios->layout->group_width - ios->layout->parity) /
+			     ios->layout->group_width;
+		}
+		bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
+
+		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+		if (unlikely(!per_dev->bio)) {
+			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+				     bio_size);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	while (cur_len > 0) {
+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+		unsigned added_len;
+
+		cur_len -= pglen;
+
+		added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
+					    pglen, pgbase);
+		if (unlikely(pglen != added_len)) {
+			/* If bi_vcnt == bi_max then this is a SW BUG */
+			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
+				   "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
+				   per_dev->bio->bi_vcnt,
+				   per_dev->bio->bi_max_vecs,
+				   BIO_MAX_PAGES_KMALLOC, cur_len);
+			ret = -ENOMEM;
+			goto out;
+		}
+		_add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
+
+		pgbase = 0;
+		++pg;
+	}
+	BUG_ON(cur_len);
+
+	per_dev->length += len;
+	*cur_pg = pg;
+	ret = 0;
+out:	/* we fail the complete unit on an error eg don't advance
+	 * per_dev->length and cur_pg. This means that we might have a bigger
+	 * bio than the CDB requested length (per_dev->length). That's fine
+	 * only the oposite is fatal.
+	 */
+	return ret;
+}
+
+static int _add_parity_units(struct ore_io_state *ios,
+			     struct ore_striping_info *si,
+			     unsigned dev, unsigned first_dev,
+			     unsigned mirrors_p1, unsigned devs_in_group,
+			     unsigned cur_len)
+{
+	unsigned do_parity;
+	int ret = 0;
+
+	for (do_parity = ios->layout->parity; do_parity; --do_parity) {
+		struct ore_per_dev_state *per_dev;
+
+		per_dev = &ios->per_dev[dev - first_dev];
+		if (!per_dev->length && !per_dev->offset) {
+			/* Only/always the parity unit of the first
+			 * stripe will be empty. So this is a chance to
+			 * initialize the per_dev info.
+			 */
+			per_dev->dev = dev;
+			per_dev->offset = si->obj_offset - si->unit_off;
+		}
+
+		ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
+					   do_parity == 1);
+		if (unlikely(ret))
+				break;
+
+		if (do_parity != 1) {
+			dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
+			si->cur_comp = (si->cur_comp + 1) %
+						       ios->layout->group_width;
+		}
+	}
+
+	return ret;
+}
+
+static int _prepare_for_striping(struct ore_io_state *ios)
+{
+	struct ore_striping_info *si = &ios->si;
+	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned mirrors_p1 = ios->layout->mirrors_p1;
+	unsigned group_width = ios->layout->group_width;
+	unsigned devs_in_group = group_width * mirrors_p1;
+	unsigned dev = si->dev;
+	unsigned first_dev = dev - (dev % devs_in_group);
+	unsigned cur_pg = ios->pages_consumed;
+	u64 length = ios->length;
+	int ret = 0;
+
+	if (!ios->pages) {
+		ios->numdevs = ios->layout->mirrors_p1;
+		return 0;
+	}
+
+	BUG_ON(length > si->length);
+
+	while (length) {
+		struct ore_per_dev_state *per_dev =
+						&ios->per_dev[dev - first_dev];
+		unsigned cur_len, page_off = 0;
+
+		if (!per_dev->length && !per_dev->offset) {
+			/* First time initialize the per_dev info. */
+			per_dev->dev = dev;
+			if (dev == si->dev) {
+				WARN_ON(dev == si->par_dev);
+				per_dev->offset = si->obj_offset;
+				cur_len = stripe_unit - si->unit_off;
+				page_off = si->unit_off & ~PAGE_MASK;
+				BUG_ON(page_off && (page_off != ios->pgbase));
+			} else {
+				per_dev->offset = si->obj_offset - si->unit_off;
+				cur_len = stripe_unit;
+			}
+		} else {
+			cur_len = stripe_unit;
+		}
+		if (cur_len >= length)
+			cur_len = length;
+
+		ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
+					   per_dev, cur_len);
+		if (unlikely(ret))
+			goto out;
+
+		length -= cur_len;
+
+		dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
+		si->cur_comp = (si->cur_comp + 1) % group_width;
+		if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
+			if (!length && ios->sp2d) {
+				/* If we are writing and this is the very last
+				 * stripe. then operate on parity dev.
+				 */
+				dev = si->par_dev;
+				/* If last stripe operate on parity comp */
+				si->cur_comp = group_width - ios->layout->parity;
+			}
+
+			/* In writes cur_len just means if it's the
+			 * last one. See _ore_add_parity_unit.
+			 */
+			ret = _add_parity_units(ios, si, dev, first_dev,
+						mirrors_p1, devs_in_group,
+						ios->sp2d ? length : cur_len);
+			if (unlikely(ret))
+					goto out;
+
+			/* Rotate next par_dev backwards with wraping */
+			si->par_dev = (devs_in_group + si->par_dev -
+				       ios->layout->parity * mirrors_p1) %
+				      devs_in_group + first_dev;
+			/* Next stripe, start fresh */
+			si->cur_comp = 0;
+			si->cur_pg = 0;
+			si->obj_offset += cur_len;
+			si->unit_off = 0;
+		}
+	}
+out:
+	ios->numdevs = devs_in_group;
+	ios->pages_consumed = cur_pg;
+	return ret;
+}
+
+int ore_create(struct ore_io_state *ios)
+{
+	int i, ret;
+
+	for (i = 0; i < ios->oc->numdevs; i++) {
+		struct osd_request *or;
+
+		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+		if (unlikely(!or)) {
+			ORE_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ios->per_dev[i].or = or;
+		ios->numdevs++;
+
+		osd_req_create_object(or, _ios_obj(ios, i));
+	}
+	ret = ore_io_execute(ios);
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(ore_create);
+
+int ore_remove(struct ore_io_state *ios)
+{
+	int i, ret;
+
+	for (i = 0; i < ios->oc->numdevs; i++) {
+		struct osd_request *or;
+
+		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+		if (unlikely(!or)) {
+			ORE_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ios->per_dev[i].or = or;
+		ios->numdevs++;
+
+		osd_req_remove_object(or, _ios_obj(ios, i));
+	}
+	ret = ore_io_execute(ios);
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(ore_remove);
+
+static int _write_mirror(struct ore_io_state *ios, int cur_comp)
+{
+	struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
+	unsigned dev = ios->per_dev[cur_comp].dev;
+	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+	int ret = 0;
+
+	if (ios->pages && !master_dev->length)
+		return 0; /* Just an empty slot */
+
+	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+		struct osd_request *or;
+
+		or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
+		if (unlikely(!or)) {
+			ORE_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto out;
+		}
+		per_dev->or = or;
+
+		if (ios->pages) {
+			struct bio *bio;
+
+			if (per_dev != master_dev) {
+				bio = bio_clone_kmalloc(master_dev->bio,
+							GFP_KERNEL);
+				if (unlikely(!bio)) {
+					ORE_DBGMSG(
+					      "Failed to allocate BIO size=%u\n",
+					      master_dev->bio->bi_max_vecs);
+					ret = -ENOMEM;
+					goto out;
+				}
+
+				bio->bi_bdev = NULL;
+				bio->bi_next = NULL;
+				per_dev->offset = master_dev->offset;
+				per_dev->length = master_dev->length;
+				per_dev->bio =  bio;
+				per_dev->dev = dev;
+			} else {
+				bio = master_dev->bio;
+				/* FIXME: bio_set_dir() */
+				bio->bi_rw |= REQ_WRITE;
+			}
+
+			osd_req_write(or, _ios_obj(ios, cur_comp),
+				      per_dev->offset, bio, per_dev->length);
+			ORE_DBGMSG("write(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(_ios_obj(ios, cur_comp)->id),
+				     _LLU(per_dev->offset),
+				     _LLU(per_dev->length), dev);
+		} else if (ios->kern_buff) {
+			per_dev->offset = ios->si.obj_offset;
+			per_dev->dev = ios->si.dev + dev;
+
+			/* no cross device without page array */
+			BUG_ON((ios->layout->group_width > 1) &&
+			       (ios->si.unit_off + ios->length >
+				ios->layout->stripe_unit));
+
+			ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
+						 per_dev->offset,
+						 ios->kern_buff, ios->length);
+			if (unlikely(ret))
+				goto out;
+			ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(_ios_obj(ios, cur_comp)->id),
+				     _LLU(per_dev->offset),
+				     _LLU(ios->length), per_dev->dev);
+		} else {
+			osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
+			ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
+				     _LLU(_ios_obj(ios, cur_comp)->id),
+				     ios->out_attr_len, dev);
+		}
+
+		if (ios->out_attr)
+			osd_req_add_set_attr_list(or, ios->out_attr,
+						  ios->out_attr_len);
+
+		if (ios->in_attr)
+			osd_req_add_get_attr_list(or, ios->in_attr,
+						  ios->in_attr_len);
+	}
+
+out:
+	return ret;
+}
+
+int ore_write(struct ore_io_state *ios)
+{
+	int i;
+	int ret;
+
+	if (unlikely(ios->sp2d && !ios->r4w)) {
+		/* A library is attempting a RAID-write without providing
+		 * a pages lock interface.
+		 */
+		WARN_ON_ONCE(1);
+		return -ENOTSUPP;
+	}
+
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _write_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = ore_io_execute(ios);
+	return ret;
+}
+EXPORT_SYMBOL(ore_write);
+
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
+{
+	struct osd_request *or;
+	struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+	struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
+	unsigned first_dev = (unsigned)obj->id;
+
+	if (ios->pages && !per_dev->length)
+		return 0; /* Just an empty slot */
+
+	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
+	or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
+	if (unlikely(!or)) {
+		ORE_ERR("%s: osd_start_request failed\n", __func__);
+		return -ENOMEM;
+	}
+	per_dev->or = or;
+
+	if (ios->pages) {
+		if (per_dev->cur_sg) {
+			/* finalize the last sg_entry */
+			_ore_add_sg_seg(per_dev, 0, false);
+			if (unlikely(!per_dev->cur_sg))
+				return 0; /* Skip parity only device */
+
+			osd_req_read_sg(or, obj, per_dev->bio,
+					per_dev->sglist, per_dev->cur_sg);
+		} else {
+			/* The no raid case */
+			osd_req_read(or, obj, per_dev->offset,
+				     per_dev->bio, per_dev->length);
+		}
+
+		ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
+			     " dev=%d sg_len=%d\n", _LLU(obj->id),
+			     _LLU(per_dev->offset), _LLU(per_dev->length),
+			     first_dev, per_dev->cur_sg);
+	} else {
+		BUG_ON(ios->kern_buff);
+
+		osd_req_get_attributes(or, obj);
+		ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
+			      _LLU(obj->id),
+			      ios->in_attr_len, first_dev);
+	}
+	if (ios->out_attr)
+		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
+
+	if (ios->in_attr)
+		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
+
+	return 0;
+}
+
+int ore_read(struct ore_io_state *ios)
+{
+	int i;
+	int ret;
+
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _ore_read_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = ore_io_execute(ios);
+	return ret;
+}
+EXPORT_SYMBOL(ore_read);
+
+int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
+{
+	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+	void *iter = NULL;
+	int nelem;
+
+	do {
+		nelem = 1;
+		osd_req_decode_get_attr_list(ios->per_dev[0].or,
+					     &cur_attr, &nelem, &iter);
+		if ((cur_attr.attr_page == attr->attr_page) &&
+		    (cur_attr.attr_id == attr->attr_id)) {
+			attr->len = cur_attr.len;
+			attr->val_ptr = cur_attr.val_ptr;
+			return 0;
+		}
+	} while (iter);
+
+	return -EIO;
+}
+EXPORT_SYMBOL(extract_attr_from_ios);
+
+static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
+			     struct osd_attr *attr)
+{
+	int last_comp = cur_comp + ios->layout->mirrors_p1;
+
+	for (; cur_comp < last_comp; ++cur_comp) {
+		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+		struct osd_request *or;
+
+		or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
+		if (unlikely(!or)) {
+			ORE_ERR("%s: osd_start_request failed\n", __func__);
+			return -ENOMEM;
+		}
+		per_dev->or = or;
+
+		osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
+		osd_req_add_set_attr_list(or, attr, 1);
+	}
+
+	return 0;
+}
+
+struct _trunc_info {
+	struct ore_striping_info si;
+	u64 prev_group_obj_off;
+	u64 next_group_obj_off;
+
+	unsigned first_group_dev;
+	unsigned nex_group_dev;
+};
+
+static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
+			     struct _trunc_info *ti)
+{
+	unsigned stripe_unit = layout->stripe_unit;
+
+	ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
+
+	ti->prev_group_obj_off = ti->si.M * stripe_unit;
+	ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
+
+	ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
+	ti->nex_group_dev = ti->first_group_dev + layout->group_width;
+}
+
+int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
+		   u64 size)
+{
+	struct ore_io_state *ios;
+	struct exofs_trunc_attr {
+		struct osd_attr attr;
+		__be64 newsize;
+	} *size_attrs;
+	struct _trunc_info ti;
+	int i, ret;
+
+	ret = ore_get_io_state(layout, oc, &ios);
+	if (unlikely(ret))
+		return ret;
+
+	_calc_trunk_info(ios->layout, size, &ti);
+
+	size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
+			     GFP_KERNEL);
+	if (unlikely(!size_attrs)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ios->numdevs = ios->oc->numdevs;
+
+	for (i = 0; i < ios->numdevs; ++i) {
+		struct exofs_trunc_attr *size_attr = &size_attrs[i];
+		u64 obj_size;
+
+		if (i < ti.first_group_dev)
+			obj_size = ti.prev_group_obj_off;
+		else if (i >= ti.nex_group_dev)
+			obj_size = ti.next_group_obj_off;
+		else if (i < ti.si.dev) /* dev within this group */
+			obj_size = ti.si.obj_offset +
+				      ios->layout->stripe_unit - ti.si.unit_off;
+		else if (i == ti.si.dev)
+			obj_size = ti.si.obj_offset;
+		else /* i > ti.dev */
+			obj_size = ti.si.obj_offset - ti.si.unit_off;
+
+		size_attr->newsize = cpu_to_be64(obj_size);
+		size_attr->attr = g_attr_logical_length;
+		size_attr->attr.val_ptr = &size_attr->newsize;
+
+		ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
+			     _LLU(oc->comps->obj.id), _LLU(obj_size), i);
+		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
+					&size_attr->attr);
+		if (unlikely(ret))
+			goto out;
+	}
+	ret = ore_io_execute(ios);
+
+out:
+	kfree(size_attrs);
+	ore_put_io_state(ios);
+	return ret;
+}
+EXPORT_SYMBOL(ore_truncate);
+
+const struct osd_attr g_attr_logical_length = ATTR_DEF(
+	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+EXPORT_SYMBOL(g_attr_logical_length);
diff --git a/kernel/fs/exofs/ore_raid.c b/kernel/fs/exofs/ore_raid.c
new file mode 100644
index 000000000..27cbdb697
--- /dev/null
+++ b/kernel/fs/exofs/ore_raid.c
@@ -0,0 +1,721 @@
+/*
+ * Copyright (C) 2011
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ *	"Free Software Foundation <info@fsf.org>"
+ */
+
+#include <linux/gfp.h>
+#include <linux/async_tx.h>
+
+#include "ore_raid.h"
+
+#undef ORE_DBGMSG2
+#define ORE_DBGMSG2 ORE_DBGMSG
+
+static struct page *_raid_page_alloc(void)
+{
+	return alloc_page(GFP_KERNEL);
+}
+
+static void _raid_page_free(struct page *p)
+{
+	__free_page(p);
+}
+
+/* This struct is forward declare in ore_io_state, but is private to here.
+ * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
+ *
+ * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
+ * Ascending page index access is sp2d(p-minor, c-major). But storage is
+ * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
+ * API.
+ */
+struct __stripe_pages_2d {
+	/* Cache some hot path repeated calculations */
+	unsigned parity;
+	unsigned data_devs;
+	unsigned pages_in_unit;
+
+	bool needed ;
+
+	/* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
+	struct __1_page_stripe {
+		bool alloc;
+		unsigned write_count;
+		struct async_submit_ctl submit;
+		struct dma_async_tx_descriptor *tx;
+
+		/* The size of this array is data_devs + parity */
+		struct page **pages;
+		struct page **scribble;
+		/* bool array, size of this array is data_devs */
+		char *page_is_read;
+	} _1p_stripes[];
+};
+
+/* This can get bigger then a page. So support multiple page allocations
+ * _sp2d_free should be called even if _sp2d_alloc fails (by returning
+ * none-zero).
+ */
+static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
+		       unsigned parity, struct __stripe_pages_2d **psp2d)
+{
+	struct __stripe_pages_2d *sp2d;
+	unsigned data_devs = group_width - parity;
+	struct _alloc_all_bytes {
+		struct __alloc_stripe_pages_2d {
+			struct __stripe_pages_2d sp2d;
+			struct __1_page_stripe _1p_stripes[pages_in_unit];
+		} __asp2d;
+		struct __alloc_1p_arrays {
+			struct page *pages[group_width];
+			struct page *scribble[group_width];
+			char page_is_read[data_devs];
+		} __a1pa[pages_in_unit];
+	} *_aab;
+	struct __alloc_1p_arrays *__a1pa;
+	struct __alloc_1p_arrays *__a1pa_end;
+	const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
+	unsigned num_a1pa, alloc_size, i;
+
+	/* FIXME: check these numbers in ore_verify_layout */
+	BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
+	BUG_ON(sizeof__a1pa > PAGE_SIZE);
+
+	if (sizeof(*_aab) > PAGE_SIZE) {
+		num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
+		alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
+	} else {
+		num_a1pa = pages_in_unit;
+		alloc_size = sizeof(*_aab);
+	}
+
+	_aab = kzalloc(alloc_size, GFP_KERNEL);
+	if (unlikely(!_aab)) {
+		ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
+		return -ENOMEM;
+	}
+
+	sp2d = &_aab->__asp2d.sp2d;
+	*psp2d = sp2d; /* From here Just call _sp2d_free */
+
+	__a1pa = _aab->__a1pa;
+	__a1pa_end = __a1pa + num_a1pa;
+
+	for (i = 0; i < pages_in_unit; ++i) {
+		if (unlikely(__a1pa >= __a1pa_end)) {
+			num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
+							pages_in_unit - i);
+
+			__a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL);
+			if (unlikely(!__a1pa)) {
+				ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
+					   num_a1pa);
+				return -ENOMEM;
+			}
+			__a1pa_end = __a1pa + num_a1pa;
+			/* First *pages is marked for kfree of the buffer */
+			sp2d->_1p_stripes[i].alloc = true;
+		}
+
+		sp2d->_1p_stripes[i].pages = __a1pa->pages;
+		sp2d->_1p_stripes[i].scribble = __a1pa->scribble ;
+		sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
+		++__a1pa;
+	}
+
+	sp2d->parity = parity;
+	sp2d->data_devs = data_devs;
+	sp2d->pages_in_unit = pages_in_unit;
+	return 0;
+}
+
+static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
+			const struct _ore_r4w_op *r4w, void *priv)
+{
+	unsigned data_devs = sp2d->data_devs;
+	unsigned group_width = data_devs + sp2d->parity;
+	int p, c;
+
+	if (!sp2d->needed)
+		return;
+
+	for (c = data_devs - 1; c >= 0; --c)
+		for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+			struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+			if (_1ps->page_is_read[c]) {
+				struct page *page = _1ps->pages[c];
+
+				r4w->put_page(priv, page);
+				_1ps->page_is_read[c] = false;
+			}
+		}
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
+		_1ps->write_count = 0;
+		_1ps->tx = NULL;
+	}
+
+	sp2d->needed = false;
+}
+
+static void _sp2d_free(struct __stripe_pages_2d *sp2d)
+{
+	unsigned i;
+
+	if (!sp2d)
+		return;
+
+	for (i = 0; i < sp2d->pages_in_unit; ++i) {
+		if (sp2d->_1p_stripes[i].alloc)
+			kfree(sp2d->_1p_stripes[i].pages);
+	}
+
+	kfree(sp2d);
+}
+
+static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
+{
+	unsigned p;
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (_1ps->write_count)
+			return p;
+	}
+
+	return ~0;
+}
+
+static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
+{
+	int p;
+
+	for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (_1ps->write_count)
+			return p;
+	}
+
+	return ~0;
+}
+
+static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
+{
+	unsigned p;
+	unsigned tx_flags = ASYNC_TX_ACK;
+
+	if (sp2d->parity == 1)
+		tx_flags |= ASYNC_TX_XOR_ZERO_DST;
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (!_1ps->write_count)
+			continue;
+
+		init_async_submit(&_1ps->submit, tx_flags,
+			NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
+
+		if (sp2d->parity == 1)
+			_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
+						_1ps->pages, 0, sp2d->data_devs,
+						PAGE_SIZE, &_1ps->submit);
+		else /* parity == 2 */
+			_1ps->tx = async_gen_syndrome(_1ps->pages, 0,
+						sp2d->data_devs + sp2d->parity,
+						PAGE_SIZE, &_1ps->submit);
+	}
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+		/* NOTE: We wait for HW synchronously (I don't have such HW
+		 * to test with.) Is parallelism needed with today's multi
+		 * cores?
+		 */
+		async_tx_issue_pending(_1ps->tx);
+	}
+}
+
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+		       struct ore_striping_info *si, struct page *page)
+{
+	struct __1_page_stripe *_1ps;
+
+	sp2d->needed = true;
+
+	_1ps = &sp2d->_1p_stripes[si->cur_pg];
+	_1ps->pages[si->cur_comp] = page;
+	++_1ps->write_count;
+
+	si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
+	/* si->cur_comp is advanced outside at main loop */
+}
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+		     bool not_last)
+{
+	struct osd_sg_entry *sge;
+
+	ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
+		     "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
+		     per_dev->dev, cur_len, not_last, per_dev->cur_sg,
+		     _LLU(per_dev->offset), per_dev->length,
+		     per_dev->last_sgs_total);
+
+	if (!per_dev->cur_sg) {
+		sge = per_dev->sglist;
+
+		/* First time we prepare two entries */
+		if (per_dev->length) {
+			++per_dev->cur_sg;
+			sge->offset = per_dev->offset;
+			sge->len = per_dev->length;
+		} else {
+			/* Here the parity is the first unit of this object.
+			 * This happens every time we reach a parity device on
+			 * the same stripe as the per_dev->offset. We need to
+			 * just skip this unit.
+			 */
+			per_dev->offset += cur_len;
+			return;
+		}
+	} else {
+		/* finalize the last one */
+		sge = &per_dev->sglist[per_dev->cur_sg - 1];
+		sge->len = per_dev->length - per_dev->last_sgs_total;
+	}
+
+	if (not_last) {
+		/* Partly prepare the next one */
+		struct osd_sg_entry *next_sge = sge + 1;
+
+		++per_dev->cur_sg;
+		next_sge->offset = sge->offset + sge->len + cur_len;
+		/* Save cur len so we know how mutch was added next time */
+		per_dev->last_sgs_total = per_dev->length;
+		next_sge->len = 0;
+	} else if (!sge->len) {
+		/* Optimize for when the last unit is a parity */
+		--per_dev->cur_sg;
+	}
+}
+
+static int _alloc_read_4_write(struct ore_io_state *ios)
+{
+	struct ore_layout *layout = ios->layout;
+	int ret;
+	/* We want to only read those pages not in cache so worst case
+	 * is a stripe populated with every other page
+	 */
+	unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
+
+	ret = _ore_get_io_state(layout, ios->oc,
+				layout->group_width * layout->mirrors_p1,
+				sgs_per_dev, 0, &ios->ios_read_4_write);
+	return ret;
+}
+
+/* @si contains info of the to-be-inserted page. Update of @si should be
+ * maintained by caller. Specificaly si->dev, si->obj_offset, ...
+ */
+static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
+		       struct page *page, unsigned pg_len)
+{
+	struct request_queue *q;
+	struct ore_per_dev_state *per_dev;
+	struct ore_io_state *read_ios;
+	unsigned first_dev = si->dev - (si->dev %
+			  (ios->layout->group_width * ios->layout->mirrors_p1));
+	unsigned comp = si->dev - first_dev;
+	unsigned added_len;
+
+	if (!ios->ios_read_4_write) {
+		int ret = _alloc_read_4_write(ios);
+
+		if (unlikely(ret))
+			return ret;
+	}
+
+	read_ios = ios->ios_read_4_write;
+	read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
+
+	per_dev = &read_ios->per_dev[comp];
+	if (!per_dev->length) {
+		per_dev->bio = bio_kmalloc(GFP_KERNEL,
+					   ios->sp2d->pages_in_unit);
+		if (unlikely(!per_dev->bio)) {
+			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+				     ios->sp2d->pages_in_unit);
+			return -ENOMEM;
+		}
+		per_dev->offset = si->obj_offset;
+		per_dev->dev = si->dev;
+	} else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
+		u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
+
+		_ore_add_sg_seg(per_dev, gap, true);
+	}
+	q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
+	added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
+				    si->obj_offset % PAGE_SIZE);
+	if (unlikely(added_len != pg_len)) {
+		ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
+			      per_dev->bio->bi_vcnt);
+		return -ENOMEM;
+	}
+
+	per_dev->length += pg_len;
+	return 0;
+}
+
+/* read the beginning of an unaligned first page */
+static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
+{
+	struct ore_striping_info si;
+	unsigned pg_len;
+
+	ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
+
+	pg_len = si.obj_offset % PAGE_SIZE;
+	si.obj_offset -= pg_len;
+
+	ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
+		   _LLU(si.obj_offset), pg_len, page->index, si.dev);
+
+	return _add_to_r4w(ios, &si, page, pg_len);
+}
+
+/* read the end of an incomplete last page */
+static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
+{
+	struct ore_striping_info si;
+	struct page *page;
+	unsigned pg_len, p, c;
+
+	ore_calc_stripe_info(ios->layout, *offset, 0, &si);
+
+	p = si.cur_pg;
+	c = si.cur_comp;
+	page = ios->sp2d->_1p_stripes[p].pages[c];
+
+	pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
+	*offset += pg_len;
+
+	ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
+		   p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
+
+	BUG_ON(!page);
+
+	return _add_to_r4w(ios, &si, page, pg_len);
+}
+
+static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
+{
+	struct bio_vec *bv;
+	unsigned i, d;
+
+	/* loop on all devices all pages */
+	for (d = 0; d < ios->numdevs; d++) {
+		struct bio *bio = ios->per_dev[d].bio;
+
+		if (!bio)
+			continue;
+
+		bio_for_each_segment_all(bv, bio, i) {
+			struct page *page = bv->bv_page;
+
+			SetPageUptodate(page);
+			if (PageError(page))
+				ClearPageError(page);
+		}
+	}
+}
+
+/* read_4_write is hacked to read the start of the first stripe and/or
+ * the end of the last stripe. If needed, with an sg-gap at each device/page.
+ * It is assumed to be called after the to_be_written pages of the first stripe
+ * are populating ios->sp2d[][]
+ *
+ * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
+ * These pages are held at sp2d[p].pages[c] but with
+ * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
+ * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
+ * @uptodate=true, so we don't need to read it, only unlock, after IO.
+ *
+ * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
+ * to-be-written count, we should consider the xor-in-place mode.
+ * need_to_read_pages_count is the actual number of pages not present in cache.
+ * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
+ * approximation? In this mode the read pages are put in the empty places of
+ * ios->sp2d[p][*], xor is calculated the same way. These pages are
+ * allocated/freed and don't go through cache
+ */
+static int _read_4_write_first_stripe(struct ore_io_state *ios)
+{
+	struct ore_striping_info read_si;
+	struct __stripe_pages_2d *sp2d = ios->sp2d;
+	u64 offset = ios->si.first_stripe_start;
+	unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+
+	if (offset == ios->offset) /* Go to start collect $200 */
+		goto read_last_stripe;
+
+	min_p = _sp2d_min_pg(sp2d);
+	max_p = _sp2d_max_pg(sp2d);
+
+	ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
+		   offset, ios->offset, min_p, max_p);
+
+	for (c = 0; ; c++) {
+		ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+		read_si.obj_offset += min_p * PAGE_SIZE;
+		offset += min_p * PAGE_SIZE;
+		for (p = min_p; p <= max_p; p++) {
+			struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+			struct page **pp = &_1ps->pages[c];
+			bool uptodate;
+
+			if (*pp) {
+				if (ios->offset % PAGE_SIZE)
+					/* Read the remainder of the page */
+					_add_to_r4w_first_page(ios, *pp);
+				/* to-be-written pages start here */
+				goto read_last_stripe;
+			}
+
+			*pp = ios->r4w->get_page(ios->private, offset,
+						 &uptodate);
+			if (unlikely(!*pp))
+				return -ENOMEM;
+
+			if (!uptodate)
+				_add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
+
+			/* Mark read-pages to be cache_released */
+			_1ps->page_is_read[c] = true;
+			read_si.obj_offset += PAGE_SIZE;
+			offset += PAGE_SIZE;
+		}
+		offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
+	}
+
+read_last_stripe:
+	return 0;
+}
+
+static int _read_4_write_last_stripe(struct ore_io_state *ios)
+{
+	struct ore_striping_info read_si;
+	struct __stripe_pages_2d *sp2d = ios->sp2d;
+	u64 offset;
+	u64 last_stripe_end;
+	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+	unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+
+	offset = ios->offset + ios->length;
+	if (offset % PAGE_SIZE)
+		_add_to_r4w_last_page(ios, &offset);
+		/* offset will be aligned to next page */
+
+	last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
+				 * bytes_in_stripe;
+	if (offset == last_stripe_end) /* Optimize for the aligned case */
+		goto read_it;
+
+	ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+	p = read_si.cur_pg;
+	c = read_si.cur_comp;
+
+	if (min_p == sp2d->pages_in_unit) {
+		/* Didn't do it yet */
+		min_p = _sp2d_min_pg(sp2d);
+		max_p = _sp2d_max_pg(sp2d);
+	}
+
+	ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
+		   offset, last_stripe_end, min_p, max_p);
+
+	while (offset < last_stripe_end) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if ((min_p <= p) && (p <= max_p)) {
+			struct page *page;
+			bool uptodate;
+
+			BUG_ON(_1ps->pages[c]);
+			page = ios->r4w->get_page(ios->private, offset,
+						  &uptodate);
+			if (unlikely(!page))
+				return -ENOMEM;
+
+			_1ps->pages[c] = page;
+			/* Mark read-pages to be cache_released */
+			_1ps->page_is_read[c] = true;
+			if (!uptodate)
+				_add_to_r4w(ios, &read_si, page, PAGE_SIZE);
+		}
+
+		offset += PAGE_SIZE;
+		if (p == (sp2d->pages_in_unit - 1)) {
+			++c;
+			p = 0;
+			ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+		} else {
+			read_si.obj_offset += PAGE_SIZE;
+			++p;
+		}
+	}
+
+read_it:
+	return 0;
+}
+
+static int _read_4_write_execute(struct ore_io_state *ios)
+{
+	struct ore_io_state *ios_read;
+	unsigned i;
+	int ret;
+
+	ios_read = ios->ios_read_4_write;
+	if (!ios_read)
+		return 0;
+
+	/* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
+	 * to check for per_dev->bio
+	 */
+	ios_read->pages = ios->pages;
+
+	/* Now read these devices */
+	for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
+		ret = _ore_read_mirror(ios_read, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = ore_io_execute(ios_read); /* Synchronus execution */
+	if (unlikely(ret)) {
+		ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
+		return ret;
+	}
+
+	_mark_read4write_pages_uptodate(ios_read, ret);
+	ore_put_io_state(ios_read);
+	ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
+	return 0;
+}
+
+/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
+int _ore_add_parity_unit(struct ore_io_state *ios,
+			    struct ore_striping_info *si,
+			    struct ore_per_dev_state *per_dev,
+			    unsigned cur_len, bool do_xor)
+{
+	if (ios->reading) {
+		if (per_dev->cur_sg >= ios->sgs_per_dev) {
+			ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
+				per_dev->cur_sg, ios->sgs_per_dev);
+			return -ENOMEM;
+		}
+		_ore_add_sg_seg(per_dev, cur_len, true);
+	} else {
+		struct __stripe_pages_2d *sp2d = ios->sp2d;
+		struct page **pages = ios->parity_pages + ios->cur_par_page;
+		unsigned num_pages;
+		unsigned array_start = 0;
+		unsigned i;
+		int ret;
+
+		si->cur_pg = _sp2d_min_pg(sp2d);
+		num_pages  = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
+
+		if (!per_dev->length) {
+			per_dev->offset += si->cur_pg * PAGE_SIZE;
+			/* If first stripe, Read in all read4write pages
+			 * (if needed) before we calculate the first parity.
+			 */
+			if (do_xor)
+				_read_4_write_first_stripe(ios);
+		}
+		if (!cur_len && do_xor)
+			/* If last stripe r4w pages of last stripe */
+			_read_4_write_last_stripe(ios);
+		_read_4_write_execute(ios);
+
+		for (i = 0; i < num_pages; i++) {
+			pages[i] = _raid_page_alloc();
+			if (unlikely(!pages[i]))
+				return -ENOMEM;
+
+			++(ios->cur_par_page);
+		}
+
+		BUG_ON(si->cur_comp < sp2d->data_devs);
+		BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
+
+		ret = _ore_add_stripe_unit(ios,  &array_start, 0, pages,
+					   per_dev, num_pages * PAGE_SIZE);
+		if (unlikely(ret))
+			return ret;
+
+		if (do_xor) {
+			_gen_xor_unit(sp2d);
+			_sp2d_reset(sp2d, ios->r4w, ios->private);
+		}
+	}
+	return 0;
+}
+
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
+{
+	if (ios->parity_pages) {
+		struct ore_layout *layout = ios->layout;
+		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+
+		if (_sp2d_alloc(pages_in_unit, layout->group_width,
+				layout->parity, &ios->sp2d)) {
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+void _ore_free_raid_stuff(struct ore_io_state *ios)
+{
+	if (ios->sp2d) { /* writing and raid */
+		unsigned i;
+
+		for (i = 0; i < ios->cur_par_page; i++) {
+			struct page *page = ios->parity_pages[i];
+
+			if (page)
+				_raid_page_free(page);
+		}
+		if (ios->extra_part_alloc)
+			kfree(ios->parity_pages);
+		/* If IO returned an error pages might need unlocking */
+		_sp2d_reset(ios->sp2d, ios->r4w, ios->private);
+		_sp2d_free(ios->sp2d);
+	} else {
+		/* Will only be set if raid reading && sglist is big */
+		if (ios->extra_part_alloc)
+			kfree(ios->per_dev[0].sglist);
+	}
+	if (ios->ios_read_4_write)
+		ore_put_io_state(ios->ios_read_4_write);
+}
diff --git a/kernel/fs/exofs/ore_raid.h b/kernel/fs/exofs/ore_raid.h
new file mode 100644
index 000000000..a6e746775
--- /dev/null
+++ b/kernel/fs/exofs/ore_raid.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) from 2011
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ *	"Free Software Foundation <info@fsf.org>"
+ */
+
+#include <scsi/osd_ore.h>
+
+#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define ORE_DBGMSG(fmt, a...) \
+	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define ORE_DBGMSG(fmt, a...) \
+	do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+#define ORE_DBGMSG2(M...) do {} while (0)
+/* #define ORE_DBGMSG2 ORE_DBGMSG */
+
+/* ios_raid.c stuff needed by ios.c */
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
+void _ore_free_raid_stuff(struct ore_io_state *ios);
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+		 bool not_last);
+int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
+		     struct ore_per_dev_state *per_dev, unsigned cur_len,
+		     bool do_xor);
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+		       struct ore_striping_info *si, struct page *page);
+static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
+				struct ore_striping_info *si, struct page *page)
+{
+	if (!sp2d) /* Inline the fast path */
+		return; /* Hay no raid stuff */
+	_ore_add_stripe_page(sp2d, si, page);
+}
+
+/* ios.c stuff needed by ios_raid.c */
+int  _ore_get_io_state(struct ore_layout *layout,
+			struct ore_components *oc, unsigned numdevs,
+			unsigned sgs_per_dev, unsigned num_par_pages,
+			struct ore_io_state **pios);
+int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
+		unsigned pgbase, struct page **pages,
+		struct ore_per_dev_state *per_dev, int cur_len);
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
+int ore_io_execute(struct ore_io_state *ios);
diff --git a/kernel/fs/exofs/super.c b/kernel/fs/exofs/super.c
new file mode 100644
index 000000000..b795c567b
--- /dev/null
+++ b/kernel/fs/exofs/super.c
@@ -0,0 +1,1049 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/exportfs.h>
+#include <linux/slab.h>
+
+#include "exofs.h"
+
+#define EXOFS_DBGMSG2(M...) do {} while (0)
+
+/******************************************************************************
+ * MOUNT OPTIONS
+ *****************************************************************************/
+
+/*
+ * struct to hold what we get from mount options
+ */
+struct exofs_mountopt {
+	bool is_osdname;
+	const char *dev_name;
+	uint64_t pid;
+	int timeout;
+};
+
+/*
+ * exofs-specific mount-time options.
+ */
+enum { Opt_name, Opt_pid, Opt_to, Opt_err };
+
+/*
+ * Our mount-time options.  These should ideally be 64-bit unsigned, but the
+ * kernel's parsing functions do not currently support that.  32-bit should be
+ * sufficient for most applications now.
+ */
+static match_table_t tokens = {
+	{Opt_name, "osdname=%s"},
+	{Opt_pid, "pid=%u"},
+	{Opt_to, "to=%u"},
+	{Opt_err, NULL}
+};
+
+/*
+ * The main option parsing method.  Also makes sure that all of the mandatory
+ * mount options were set.
+ */
+static int parse_options(char *options, struct exofs_mountopt *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	bool s_pid = false;
+
+	EXOFS_DBGMSG("parse_options %s\n", options);
+	/* defaults */
+	memset(opts, 0, sizeof(*opts));
+	opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		char str[32];
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_name:
+			opts->dev_name = match_strdup(&args[0]);
+			if (unlikely(!opts->dev_name)) {
+				EXOFS_ERR("Error allocating dev_name");
+				return -ENOMEM;
+			}
+			opts->is_osdname = true;
+			break;
+		case Opt_pid:
+			if (0 == match_strlcpy(str, &args[0], sizeof(str)))
+				return -EINVAL;
+			opts->pid = simple_strtoull(str, NULL, 0);
+			if (opts->pid < EXOFS_MIN_PID) {
+				EXOFS_ERR("Partition ID must be >= %u",
+					  EXOFS_MIN_PID);
+				return -EINVAL;
+			}
+			s_pid = 1;
+			break;
+		case Opt_to:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (option <= 0) {
+				EXOFS_ERR("Timout must be > 0");
+				return -EINVAL;
+			}
+			opts->timeout = option * HZ;
+			break;
+		}
+	}
+
+	if (!s_pid) {
+		EXOFS_ERR("Need to specify the following options:\n");
+		EXOFS_ERR("    -o pid=pid_no_to_use\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/******************************************************************************
+ * INODE CACHE
+ *****************************************************************************/
+
+/*
+ * Our inode cache.  Isn't it pretty?
+ */
+static struct kmem_cache *exofs_inode_cachep;
+
+/*
+ * Allocate an inode in the cache
+ */
+static struct inode *exofs_alloc_inode(struct super_block *sb)
+{
+	struct exofs_i_info *oi;
+
+	oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
+	if (!oi)
+		return NULL;
+
+	oi->vfs_inode.i_version = 1;
+	return &oi->vfs_inode;
+}
+
+static void exofs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
+
+/*
+ * Remove an inode from the cache
+ */
+static void exofs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, exofs_i_callback);
+}
+
+/*
+ * Initialize the inode
+ */
+static void exofs_init_once(void *foo)
+{
+	struct exofs_i_info *oi = foo;
+
+	inode_init_once(&oi->vfs_inode);
+}
+
+/*
+ * Create and initialize the inode cache
+ */
+static int init_inodecache(void)
+{
+	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
+				sizeof(struct exofs_i_info), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				exofs_init_once);
+	if (exofs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Destroy the inode cache
+ */
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(exofs_inode_cachep);
+}
+
+/******************************************************************************
+ * Some osd helpers
+ *****************************************************************************/
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+		    u64 offset, void *p, unsigned length)
+{
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+/*	struct osd_sense_info osi = {.key = 0};*/
+	int ret;
+
+	if (unlikely(!or)) {
+		EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
+		return -ENOMEM;
+	}
+	ret = osd_req_read_kern(or, obj, offset, p, length);
+	if (unlikely(ret)) {
+		EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
+		goto out;
+	}
+
+	ret = osd_finalize_request(or, 0, cred, NULL);
+	if (unlikely(ret)) {
+		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
+		goto out;
+	}
+
+	ret = osd_execute_request(or);
+	if (unlikely(ret))
+		EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+	/* osd_req_decode_sense(or, ret); */
+
+out:
+	osd_end_request(or);
+	EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
+		      "length=0x%llx dev=%p ret=>%d\n",
+		      _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
+	return ret;
+}
+
+static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
+	EXOFS_APAGE_SB_DATA,
+	EXOFS_ATTR_SB_STATS,
+	sizeof(struct exofs_sb_stats));
+
+static int __sbi_read_stats(struct exofs_sb_info *sbi)
+{
+	struct osd_attr attrs[] = {
+		[0] = g_attr_sb_stats,
+	};
+	struct ore_io_state *ios;
+	int ret;
+
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
+		return ret;
+	}
+
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = ore_read(ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("Error reading super_block stats => %d\n", ret);
+		goto out;
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
+		goto out;
+	}
+	if (attrs[0].len) {
+		struct exofs_sb_stats *ess;
+
+		if (unlikely(attrs[0].len != sizeof(*ess))) {
+			EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
+				  "size(%d) != expected(%zd)\n",
+				  __func__, attrs[0].len, sizeof(*ess));
+			goto out;
+		}
+
+		ess = attrs[0].val_ptr;
+		sbi->s_nextid = le64_to_cpu(ess->s_nextid);
+		sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
+	}
+
+out:
+	ore_put_io_state(ios);
+	return ret;
+}
+
+static void stats_done(struct ore_io_state *ios, void *p)
+{
+	ore_put_io_state(ios);
+	/* Good thanks nothing to do anymore */
+}
+
+/* Asynchronously write the stats attribute */
+int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
+{
+	struct osd_attr attrs[] = {
+		[0] = g_attr_sb_stats,
+	};
+	struct ore_io_state *ios;
+	int ret;
+
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
+		return ret;
+	}
+
+	sbi->s_ess.s_nextid   = cpu_to_le64(sbi->s_nextid);
+	sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
+	attrs[0].val_ptr = &sbi->s_ess;
+
+
+	ios->done = stats_done;
+	ios->private = sbi;
+	ios->out_attr = attrs;
+	ios->out_attr_len = ARRAY_SIZE(attrs);
+
+	ret = ore_write(ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: ore_write failed.\n", __func__);
+		ore_put_io_state(ios);
+	}
+
+	return ret;
+}
+
+/******************************************************************************
+ * SUPERBLOCK FUNCTIONS
+ *****************************************************************************/
+static const struct super_operations exofs_sops;
+static const struct export_operations exofs_export_ops;
+
+/*
+ * Write the superblock to the OSD
+ */
+static int exofs_sync_fs(struct super_block *sb, int wait)
+{
+	struct exofs_sb_info *sbi;
+	struct exofs_fscb *fscb;
+	struct ore_comp one_comp;
+	struct ore_components oc;
+	struct ore_io_state *ios;
+	int ret = -ENOMEM;
+
+	fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
+	if (unlikely(!fscb))
+		return -ENOMEM;
+
+	sbi = sb->s_fs_info;
+
+	/* NOTE: We no longer dirty the super_block anywhere in exofs. The
+	 * reason we write the fscb here on unmount is so we can stay backwards
+	 * compatible with fscb->s_version == 1. (What we are not compatible
+	 * with is if a new version FS crashed and then we try to mount an old
+	 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
+	 * the writeable info is set in exofs_sbi_write_stats() above.
+	 */
+
+	exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
+
+	ret = ore_get_io_state(&sbi->layout, &oc, &ios);
+	if (unlikely(ret))
+		goto out;
+
+	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
+	memset(fscb, 0, ios->length);
+	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
+	fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
+	fscb->s_magic = cpu_to_le16(sb->s_magic);
+	fscb->s_newfs = 0;
+	fscb->s_version = EXOFS_FSCB_VER;
+
+	ios->offset = 0;
+	ios->kern_buff = fscb;
+
+	ret = ore_write(ios);
+	if (unlikely(ret))
+		EXOFS_ERR("%s: ore_write failed.\n", __func__);
+
+out:
+	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
+	ore_put_io_state(ios);
+	kfree(fscb);
+	return ret;
+}
+
+static void _exofs_print_device(const char *msg, const char *dev_path,
+				struct osd_dev *od, u64 pid)
+{
+	const struct osd_dev_info *odi = osduld_device_info(od);
+
+	printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
+		msg, dev_path ?: "", odi->osdname, _LLU(pid));
+}
+
+static void exofs_free_sbi(struct exofs_sb_info *sbi)
+{
+	unsigned numdevs = sbi->oc.numdevs;
+
+	while (numdevs) {
+		unsigned i = --numdevs;
+		struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
+
+		if (od) {
+			ore_comp_set_dev(&sbi->oc, i, NULL);
+			osduld_put_device(od);
+		}
+	}
+	kfree(sbi->oc.ods);
+	kfree(sbi);
+}
+
+/*
+ * This function is called when the vfs is freeing the superblock.  We just
+ * need to free our own part.
+ */
+static void exofs_put_super(struct super_block *sb)
+{
+	int num_pend;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+
+	/* make sure there are no pending commands */
+	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
+	     num_pend = atomic_read(&sbi->s_curr_pending)) {
+		wait_queue_head_t wq;
+
+		printk(KERN_NOTICE "%s: !!Pending operations in flight. "
+		       "This is a BUG. please report to osd-dev@open-osd.org\n",
+		       __func__);
+		init_waitqueue_head(&wq);
+		wait_event_timeout(wq,
+				  (atomic_read(&sbi->s_curr_pending) == 0),
+				  msecs_to_jiffies(100));
+	}
+
+	_exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
+			    sbi->one_comp.obj.partition);
+
+	exofs_sysfs_sb_del(sbi);
+	bdi_destroy(&sbi->bdi);
+	exofs_free_sbi(sbi);
+	sb->s_fs_info = NULL;
+}
+
+static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
+				    struct exofs_device_table *dt)
+{
+	int ret;
+
+	sbi->layout.stripe_unit =
+				le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
+	sbi->layout.group_width =
+				le32_to_cpu(dt->dt_data_map.cb_group_width);
+	sbi->layout.group_depth =
+				le32_to_cpu(dt->dt_data_map.cb_group_depth);
+	sbi->layout.mirrors_p1  =
+				le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
+	sbi->layout.raid_algorithm  =
+				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
+
+	ret = ore_verify_layout(numdevs, &sbi->layout);
+
+	EXOFS_DBGMSG("exofs: layout: "
+		"num_comps=%u stripe_unit=0x%x group_width=%u "
+		"group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
+		numdevs,
+		sbi->layout.stripe_unit,
+		sbi->layout.group_width,
+		_LLU(sbi->layout.group_depth),
+		sbi->layout.mirrors_p1,
+		sbi->layout.raid_algorithm);
+	return ret;
+}
+
+static unsigned __ra_pages(struct ore_layout *layout)
+{
+	const unsigned _MIN_RA = 32; /* min 128K read-ahead */
+	unsigned ra_pages = layout->group_width * layout->stripe_unit /
+				PAGE_SIZE;
+	unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
+
+	ra_pages *= 2; /* two stripes */
+	if (ra_pages < _MIN_RA)
+		ra_pages = roundup(_MIN_RA, ra_pages / 2);
+
+	if (ra_pages > max_io_pages)
+		ra_pages = max_io_pages;
+
+	return ra_pages;
+}
+
+/* @odi is valid only as long as @fscb_dev is valid */
+static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
+			     struct osd_dev_info *odi)
+{
+	odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
+	if (likely(odi->systemid_len))
+		memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
+
+	odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
+	odi->osdname = dt_dev->osdname;
+
+	/* FIXME support long names. Will need a _put function */
+	if (dt_dev->long_name_offset)
+		return -EINVAL;
+
+	/* Make sure osdname is printable!
+	 * mkexofs should give us space for a null-terminator else the
+	 * device-table is invalid.
+	 */
+	if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
+		odi->osdname_len = sizeof(dt_dev->osdname) - 1;
+	dt_dev->osdname[odi->osdname_len] = 0;
+
+	/* If it's all zeros something is bad we read past end-of-obj */
+	return !(odi->systemid_len || odi->osdname_len);
+}
+
+static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+		      struct exofs_dev **peds)
+{
+	struct __alloc_ore_devs_and_exofs_devs {
+		/* Twice bigger table: See exofs_init_comps() and comment at
+		 * exofs_read_lookup_dev_table()
+		 */
+		struct ore_dev *oreds[numdevs * 2 - 1];
+		struct exofs_dev eds[numdevs];
+	} *aoded;
+	struct exofs_dev *eds;
+	unsigned i;
+
+	aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
+	if (unlikely(!aoded)) {
+		EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
+			  numdevs);
+		return -ENOMEM;
+	}
+
+	sbi->oc.ods = aoded->oreds;
+	*peds = eds = aoded->eds;
+	for (i = 0; i < numdevs; ++i)
+		aoded->oreds[i] = &eds[i].ored;
+	return 0;
+}
+
+static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
+				       struct osd_dev *fscb_od,
+				       unsigned table_count)
+{
+	struct ore_comp comp;
+	struct exofs_device_table *dt;
+	struct exofs_dev *eds;
+	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
+					     sizeof(*dt);
+	unsigned numdevs, i;
+	int ret;
+
+	dt = kmalloc(table_bytes, GFP_KERNEL);
+	if (unlikely(!dt)) {
+		EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
+			  table_bytes);
+		return -ENOMEM;
+	}
+
+	sbi->oc.numdevs = 0;
+
+	comp.obj.partition = sbi->one_comp.obj.partition;
+	comp.obj.id = EXOFS_DEVTABLE_ID;
+	exofs_make_credential(comp.cred, &comp.obj);
+
+	ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
+			      table_bytes);
+	if (unlikely(ret)) {
+		EXOFS_ERR("ERROR: reading device table\n");
+		goto out;
+	}
+
+	numdevs = le64_to_cpu(dt->dt_num_devices);
+	if (unlikely(!numdevs)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	WARN_ON(table_count != numdevs);
+
+	ret = _read_and_match_data_map(sbi, numdevs, dt);
+	if (unlikely(ret))
+		goto out;
+
+	ret = __alloc_dev_table(sbi, numdevs, &eds);
+	if (unlikely(ret))
+		goto out;
+	/* exofs round-robins the device table view according to inode
+	 * number. We hold a: twice bigger table hence inodes can point
+	 * to any device and have a sequential view of the table
+	 * starting at this device. See exofs_init_comps()
+	 */
+	memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
+		(numdevs - 1) * sizeof(sbi->oc.ods[0]));
+
+	/* create sysfs subdir under which we put the device table
+	 * And cluster layout. A Superblock is identified by the string:
+	 *	"dev[0].osdname"_"pid"
+	 */
+	exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
+
+	for (i = 0; i < numdevs; i++) {
+		struct exofs_fscb fscb;
+		struct osd_dev_info odi;
+		struct osd_dev *od;
+
+		if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
+			EXOFS_ERR("ERROR: Read all-zeros device entry\n");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
+		       i, odi.osdname);
+
+		/* the exofs id is currently the table index */
+		eds[i].did = i;
+
+		/* On all devices the device table is identical. The user can
+		 * specify any one of the participating devices on the command
+		 * line. We always keep them in device-table order.
+		 */
+		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
+			eds[i].ored.od = fscb_od;
+			++sbi->oc.numdevs;
+			fscb_od = NULL;
+			exofs_sysfs_odev_add(&eds[i], sbi);
+			continue;
+		}
+
+		od = osduld_info_lookup(&odi);
+		if (IS_ERR(od)) {
+			ret = PTR_ERR(od);
+			EXOFS_ERR("ERROR: device requested is not found "
+				  "osd_name-%s =>%d\n", odi.osdname, ret);
+			goto out;
+		}
+
+		eds[i].ored.od = od;
+		++sbi->oc.numdevs;
+
+		/* Read the fscb of the other devices to make sure the FS
+		 * partition is there.
+		 */
+		ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
+				      sizeof(fscb));
+		if (unlikely(ret)) {
+			EXOFS_ERR("ERROR: Malformed participating device "
+				  "error reading fscb osd_name-%s\n",
+				  odi.osdname);
+			goto out;
+		}
+		exofs_sysfs_odev_add(&eds[i], sbi);
+
+		/* TODO: verify other information is correct and FS-uuid
+		 *	 matches. Benny what did you say about device table
+		 *	 generation and old devices?
+		 */
+	}
+
+out:
+	kfree(dt);
+	if (unlikely(fscb_od && !ret)) {
+			EXOFS_ERR("ERROR: Bad device-table container device not present\n");
+			osduld_put_device(fscb_od);
+			return -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Read the superblock from the OSD and fill in the fields
+ */
+static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root;
+	struct exofs_mountopt *opts = data;
+	struct exofs_sb_info *sbi;	/*extended info                  */
+	struct osd_dev *od;		/* Master device                 */
+	struct exofs_fscb fscb;		/*on-disk superblock info        */
+	struct ore_comp comp;
+	unsigned table_count;
+	int ret;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	/* use mount options to fill superblock */
+	if (opts->is_osdname) {
+		struct osd_dev_info odi = {.systemid_len = 0};
+
+		odi.osdname_len = strlen(opts->dev_name);
+		odi.osdname = (u8 *)opts->dev_name;
+		od = osduld_info_lookup(&odi);
+		kfree(opts->dev_name);
+		opts->dev_name = NULL;
+	} else {
+		od = osduld_path_lookup(opts->dev_name);
+	}
+	if (IS_ERR(od)) {
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	/* Default layout in case we do not have a device-table */
+	sbi->layout.stripe_unit = PAGE_SIZE;
+	sbi->layout.mirrors_p1 = 1;
+	sbi->layout.group_width = 1;
+	sbi->layout.group_depth = -1;
+	sbi->layout.group_count = 1;
+	sbi->s_timeout = opts->timeout;
+
+	sbi->one_comp.obj.partition = opts->pid;
+	sbi->one_comp.obj.id = 0;
+	exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
+	sbi->oc.single_comp = EC_SINGLE_COMP;
+	sbi->oc.comps = &sbi->one_comp;
+
+	/* fill in some other data by hand */
+	memset(sb->s_id, 0, sizeof(sb->s_id));
+	strcpy(sb->s_id, "exofs");
+	sb->s_blocksize = EXOFS_BLKSIZE;
+	sb->s_blocksize_bits = EXOFS_BLKSHIFT;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_max_links = EXOFS_LINK_MAX;
+	atomic_set(&sbi->s_curr_pending, 0);
+	sb->s_bdev = NULL;
+	sb->s_dev = 0;
+
+	comp.obj.partition = sbi->one_comp.obj.partition;
+	comp.obj.id = EXOFS_SUPER_ID;
+	exofs_make_credential(comp.cred, &comp.obj);
+
+	ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
+	if (unlikely(ret))
+		goto free_sbi;
+
+	sb->s_magic = le16_to_cpu(fscb.s_magic);
+	/* NOTE: we read below to be backward compatible with old versions */
+	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
+	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
+
+	/* make sure what we read from the object store is correct */
+	if (sb->s_magic != EXOFS_SUPER_MAGIC) {
+		if (!silent)
+			EXOFS_ERR("ERROR: Bad magic value\n");
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+	if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
+		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
+			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	/* start generation numbers from a random point */
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	table_count = le64_to_cpu(fscb.s_dev_table_count);
+	if (table_count) {
+		ret = exofs_read_lookup_dev_table(sbi, od, table_count);
+		if (unlikely(ret))
+			goto free_sbi;
+	} else {
+		struct exofs_dev *eds;
+
+		ret = __alloc_dev_table(sbi, 1, &eds);
+		if (unlikely(ret))
+			goto free_sbi;
+
+		ore_comp_set_dev(&sbi->oc, 0, od);
+		sbi->oc.numdevs = 1;
+	}
+
+	__sbi_read_stats(sbi);
+
+	/* set up operation vectors */
+	sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
+	sb->s_bdi = &sbi->bdi;
+	sb->s_fs_info = sbi;
+	sb->s_op = &exofs_sops;
+	sb->s_export_op = &exofs_export_ops;
+	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
+	if (IS_ERR(root)) {
+		EXOFS_ERR("ERROR: exofs_iget failed\n");
+		ret = PTR_ERR(root);
+		goto free_sbi;
+	}
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		EXOFS_ERR("ERROR: get root inode failed\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+
+	if (!S_ISDIR(root->i_mode)) {
+		dput(sb->s_root);
+		sb->s_root = NULL;
+		EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
+		       root->i_mode);
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	ret = bdi_setup_and_register(&sbi->bdi, "exofs");
+	if (ret) {
+		EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
+		dput(sb->s_root);
+		sb->s_root = NULL;
+		goto free_sbi;
+	}
+
+	exofs_sysfs_dbg_print();
+	_exofs_print_device("Mounting", opts->dev_name,
+			    ore_comp_dev(&sbi->oc, 0),
+			    sbi->one_comp.obj.partition);
+	return 0;
+
+free_sbi:
+	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
+		  opts->dev_name, sbi->one_comp.obj.partition, ret);
+	exofs_free_sbi(sbi);
+	return ret;
+}
+
+/*
+ * Set up the superblock (calls exofs_fill_super eventually)
+ */
+static struct dentry *exofs_mount(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data)
+{
+	struct exofs_mountopt opts;
+	int ret;
+
+	ret = parse_options(data, &opts);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!opts.dev_name)
+		opts.dev_name = dev_name;
+	return mount_nodev(type, flags, &opts, exofs_fill_super);
+}
+
+/*
+ * Return information about the file system state in the buffer.  This is used
+ * by the 'df' command, for example.
+ */
+static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct ore_io_state *ios;
+	struct osd_attr attrs[] = {
+		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
+			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
+		ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
+			OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
+	};
+	uint64_t capacity = ULLONG_MAX;
+	uint64_t used = ULLONG_MAX;
+	int ret;
+
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
+	if (ret) {
+		EXOFS_DBGMSG("ore_get_io_state failed.\n");
+		return ret;
+	}
+
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = ore_read(ios);
+	if (unlikely(ret))
+		goto out;
+
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (likely(!ret)) {
+		capacity = get_unaligned_be64(attrs[0].val_ptr);
+		if (unlikely(!capacity))
+			capacity = ULLONG_MAX;
+	} else
+		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
+
+	ret = extract_attr_from_ios(ios, &attrs[1]);
+	if (likely(!ret))
+		used = get_unaligned_be64(attrs[1].val_ptr);
+	else
+		EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
+
+	/* fill in the stats buffer */
+	buf->f_type = EXOFS_SUPER_MAGIC;
+	buf->f_bsize = EXOFS_BLKSIZE;
+	buf->f_blocks = capacity >> 9;
+	buf->f_bfree = (capacity - used) >> 9;
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = sbi->s_numfiles;
+	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
+	buf->f_namelen = EXOFS_NAME_LEN;
+
+out:
+	ore_put_io_state(ios);
+	return ret;
+}
+
+static const struct super_operations exofs_sops = {
+	.alloc_inode    = exofs_alloc_inode,
+	.destroy_inode  = exofs_destroy_inode,
+	.write_inode    = exofs_write_inode,
+	.evict_inode    = exofs_evict_inode,
+	.put_super      = exofs_put_super,
+	.sync_fs	= exofs_sync_fs,
+	.statfs         = exofs_statfs,
+};
+
+/******************************************************************************
+ * EXPORT OPERATIONS
+ *****************************************************************************/
+
+static struct dentry *exofs_get_parent(struct dentry *child)
+{
+	unsigned long ino = exofs_parent_ino(child);
+
+	if (!ino)
+		return ERR_PTR(-ESTALE);
+
+	return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino));
+}
+
+static struct inode *exofs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	inode = exofs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		/* we didn't find the right inode.. */
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    exofs_nfs_get_inode);
+}
+
+static struct dentry *exofs_fh_to_parent(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    exofs_nfs_get_inode);
+}
+
+static const struct export_operations exofs_export_ops = {
+	.fh_to_dentry = exofs_fh_to_dentry,
+	.fh_to_parent = exofs_fh_to_parent,
+	.get_parent = exofs_get_parent,
+};
+
+/******************************************************************************
+ * INSMOD/RMMOD
+ *****************************************************************************/
+
+/*
+ * struct that describes this file system
+ */
+static struct file_system_type exofs_type = {
+	.owner          = THIS_MODULE,
+	.name           = "exofs",
+	.mount          = exofs_mount,
+	.kill_sb        = generic_shutdown_super,
+};
+MODULE_ALIAS_FS("exofs");
+
+static int __init init_exofs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&exofs_type);
+	if (err)
+		goto out_d;
+
+	/* We don't fail if sysfs creation failed */
+	exofs_sysfs_init();
+
+	return 0;
+out_d:
+	destroy_inodecache();
+out:
+	return err;
+}
+
+static void __exit exit_exofs(void)
+{
+	exofs_sysfs_uninit();
+	unregister_filesystem(&exofs_type);
+	destroy_inodecache();
+}
+
+MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
+MODULE_DESCRIPTION("exofs");
+MODULE_LICENSE("GPL");
+
+module_init(init_exofs)
+module_exit(exit_exofs)
diff --git a/kernel/fs/exofs/symlink.c b/kernel/fs/exofs/symlink.c
new file mode 100644
index 000000000..6f6f3a4c1
--- /dev/null
+++ b/kernel/fs/exofs/symlink.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/namei.h>
+
+#include "exofs.h"
+
+static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct exofs_i_info *oi = exofs_i(d_inode(dentry));
+
+	nd_set_link(nd, (char *)oi->i_data);
+	return NULL;
+}
+
+const struct inode_operations exofs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
+
+const struct inode_operations exofs_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= exofs_follow_link,
+};
diff --git a/kernel/fs/exofs/sys.c b/kernel/fs/exofs/sys.c
new file mode 100644
index 000000000..5e6a2c0a1
--- /dev/null
+++ b/kernel/fs/exofs/sys.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2012
+ * Sachin Bhamare <sbhamare@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License 2 as published by
+ * the Free Software Foundation.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the:
+ *	Free Software Foundation <licensing@fsf.org>
+ */
+
+#include <linux/kobject.h>
+#include <linux/device.h>
+
+#include "exofs.h"
+
+struct odev_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct exofs_dev *, char *);
+	ssize_t (*store)(struct exofs_dev *, const char *, size_t);
+};
+
+static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr,
+		char *buf)
+{
+	struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+	struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+	return a->show ? a->show(edp, buf) : 0;
+}
+
+static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr,
+		const char *buf, size_t len)
+{
+	struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+	struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+	return a->store ? a->store(edp, buf, len) : len;
+}
+
+static const struct sysfs_ops odev_attr_ops = {
+	.show  = odev_attr_show,
+	.store = odev_attr_store,
+};
+
+
+static struct kset *exofs_kset;
+
+static ssize_t osdname_show(struct exofs_dev *edp, char *buf)
+{
+	struct osd_dev *odev = edp->ored.od;
+	const struct osd_dev_info *odi = osduld_device_info(odev);
+
+	return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname);
+}
+
+static ssize_t systemid_show(struct exofs_dev *edp, char *buf)
+{
+	struct osd_dev *odev = edp->ored.od;
+	const struct osd_dev_info *odi = osduld_device_info(odev);
+
+	memcpy(buf, odi->systemid, odi->systemid_len);
+	return odi->systemid_len;
+}
+
+static ssize_t uri_show(struct exofs_dev *edp, char *buf)
+{
+	return snprintf(buf, edp->urilen, "%s", edp->uri);
+}
+
+static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
+{
+	uint8_t *new_uri;
+
+	edp->urilen = strlen(buf) + 1;
+	new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
+	if (new_uri == NULL)
+		return -ENOMEM;
+	edp->uri = new_uri;
+	strncpy(edp->uri, buf, edp->urilen);
+	return edp->urilen;
+}
+
+#define OSD_ATTR(name, mode, show, store) \
+	static struct odev_attr odev_attr_##name = \
+					__ATTR(name, mode, show, store)
+
+OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL);
+OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL);
+OSD_ATTR(uri, S_IRWXU, uri_show, uri_store);
+
+static struct attribute *odev_attrs[] = {
+	&odev_attr_osdname.attr,
+	&odev_attr_systemid.attr,
+	&odev_attr_uri.attr,
+	NULL,
+};
+
+static struct kobj_type odev_ktype = {
+	.default_attrs	= odev_attrs,
+	.sysfs_ops	= &odev_attr_ops,
+};
+
+static struct kobj_type uuid_ktype = {
+};
+
+void exofs_sysfs_dbg_print(void)
+{
+#ifdef CONFIG_EXOFS_DEBUG
+	struct kobject *k_name, *k_tmp;
+
+	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+		printk(KERN_INFO "%s: name %s ref %d\n",
+			__func__, kobject_name(k_name),
+			(int)atomic_read(&k_name->kref.refcount));
+	}
+#endif
+}
+/*
+ * This function removes all kobjects under exofs_kset
+ * At the end of it, exofs_kset kobject will have a refcount
+ * of 1 which gets decremented only on exofs module unload
+ */
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi)
+{
+	struct kobject *k_name, *k_tmp;
+	struct kobject *s_kobj = &sbi->s_kobj;
+
+	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+		/* Remove all that are children of this SBI */
+		if (k_name->parent == s_kobj)
+			kobject_put(k_name);
+	}
+	kobject_put(s_kobj);
+}
+
+/*
+ * This function creates sysfs entries to hold the current exofs cluster
+ * instance (uniquely identified by osdname,pid tuple).
+ * This function gets called once per exofs mount instance.
+ */
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+		       struct exofs_dt_device_info *dt_dev)
+{
+	struct kobject *s_kobj;
+	int retval = 0;
+	uint64_t pid = sbi->one_comp.obj.partition;
+
+	/* allocate new uuid dirent */
+	s_kobj = &sbi->s_kobj;
+	s_kobj->kset = exofs_kset;
+	retval = kobject_init_and_add(s_kobj, &uuid_ktype,
+			&exofs_kset->kobj,  "%s_%llx", dt_dev->osdname, pid);
+	if (retval) {
+		EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+			  "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi)
+{
+	struct kobject *d_kobj;
+	int retval = 0;
+
+	/* create osd device group which contains following attributes
+	 * osdname, systemid & uri
+	 */
+	d_kobj = &edev->ed_kobj;
+	d_kobj->kset = exofs_kset;
+	retval = kobject_init_and_add(d_kobj, &odev_ktype,
+			&sbi->s_kobj, "dev%u", edev->did);
+	if (retval) {
+		EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+				"device dev%u\n", edev->did);
+		return retval;
+	}
+	return 0;
+}
+
+int exofs_sysfs_init(void)
+{
+	exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj);
+	if (!exofs_kset) {
+		EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void exofs_sysfs_uninit(void)
+{
+	kset_unregister(exofs_kset);
+}
diff --git a/kernel/fs/exportfs/Makefile b/kernel/fs/exportfs/Makefile
new file mode 100644
index 000000000..d7c5d4ddb
--- /dev/null
+++ b/kernel/fs/exportfs/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the filesystem export support routines.
+
+obj-$(CONFIG_EXPORTFS) += exportfs.o
+
+exportfs-objs := expfs.o
diff --git a/kernel/fs/exportfs/expfs.c b/kernel/fs/exportfs/expfs.c
new file mode 100644
index 000000000..714cd37a6
--- /dev/null
+++ b/kernel/fs/exportfs/expfs.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (C) Neil Brown 2002
+ * Copyright (C) Christoph Hellwig 2007
+ *
+ * This file contains the code mapping from inodes to NFS file handles,
+ * and for mapping back from file handles to dentries.
+ *
+ * For details on why we do all the strange and hairy things in here
+ * take a look at Documentation/filesystems/nfs/Exporting.
+ */
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#define dprintk(fmt, args...) do{}while(0)
+
+
+static int get_name(const struct path *path, char *name, struct dentry *child);
+
+
+static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir,
+		char *name, struct dentry *child)
+{
+	const struct export_operations *nop = dir->d_sb->s_export_op;
+	struct path path = {.mnt = mnt, .dentry = dir};
+
+	if (nop->get_name)
+		return nop->get_name(dir, name, child);
+	else
+		return get_name(&path, name, child);
+}
+
+/*
+ * Check if the dentry or any of it's aliases is acceptable.
+ */
+static struct dentry *
+find_acceptable_alias(struct dentry *result,
+		int (*acceptable)(void *context, struct dentry *dentry),
+		void *context)
+{
+	struct dentry *dentry, *toput = NULL;
+	struct inode *inode;
+
+	if (acceptable(context, result))
+		return result;
+
+	inode = result->d_inode;
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
+		dget(dentry);
+		spin_unlock(&inode->i_lock);
+		if (toput)
+			dput(toput);
+		if (dentry != result && acceptable(context, dentry)) {
+			dput(result);
+			return dentry;
+		}
+		spin_lock(&inode->i_lock);
+		toput = dentry;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (toput)
+		dput(toput);
+	return NULL;
+}
+
+static bool dentry_connected(struct dentry *dentry)
+{
+	dget(dentry);
+	while (dentry->d_flags & DCACHE_DISCONNECTED) {
+		struct dentry *parent = dget_parent(dentry);
+
+		dput(dentry);
+		if (IS_ROOT(dentry)) {
+			dput(parent);
+			return false;
+		}
+		dentry = parent;
+	}
+	dput(dentry);
+	return true;
+}
+
+static void clear_disconnected(struct dentry *dentry)
+{
+	dget(dentry);
+	while (dentry->d_flags & DCACHE_DISCONNECTED) {
+		struct dentry *parent = dget_parent(dentry);
+
+		WARN_ON_ONCE(IS_ROOT(dentry));
+
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags &= ~DCACHE_DISCONNECTED;
+		spin_unlock(&dentry->d_lock);
+
+		dput(dentry);
+		dentry = parent;
+	}
+	dput(dentry);
+}
+
+/*
+ * Reconnect a directory dentry with its parent.
+ *
+ * This can return a dentry, or NULL, or an error.
+ *
+ * In the first case the returned dentry is the parent of the given
+ * dentry, and may itself need to be reconnected to its parent.
+ *
+ * In the NULL case, a concurrent VFS operation has either renamed or
+ * removed this directory.  The concurrent operation has reconnected our
+ * dentry, so we no longer need to.
+ */
+static struct dentry *reconnect_one(struct vfsmount *mnt,
+		struct dentry *dentry, char *nbuf)
+{
+	struct dentry *parent;
+	struct dentry *tmp;
+	int err;
+
+	parent = ERR_PTR(-EACCES);
+	mutex_lock(&dentry->d_inode->i_mutex);
+	if (mnt->mnt_sb->s_export_op->get_parent)
+		parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+
+	if (IS_ERR(parent)) {
+		dprintk("%s: get_parent of %ld failed, err %d\n",
+			__func__, dentry->d_inode->i_ino, PTR_ERR(parent));
+		return parent;
+	}
+
+	dprintk("%s: find name of %lu in %lu\n", __func__,
+		dentry->d_inode->i_ino, parent->d_inode->i_ino);
+	err = exportfs_get_name(mnt, parent, nbuf, dentry);
+	if (err == -ENOENT)
+		goto out_reconnected;
+	if (err)
+		goto out_err;
+	dprintk("%s: found name: %s\n", __func__, nbuf);
+	mutex_lock(&parent->d_inode->i_mutex);
+	tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
+	mutex_unlock(&parent->d_inode->i_mutex);
+	if (IS_ERR(tmp)) {
+		dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
+		goto out_err;
+	}
+	if (tmp != dentry) {
+		dput(tmp);
+		goto out_reconnected;
+	}
+	dput(tmp);
+	if (IS_ROOT(dentry)) {
+		err = -ESTALE;
+		goto out_err;
+	}
+	return parent;
+
+out_err:
+	dput(parent);
+	return ERR_PTR(err);
+out_reconnected:
+	dput(parent);
+	/*
+	 * Someone must have renamed our entry into another parent, in
+	 * which case it has been reconnected by the rename.
+	 *
+	 * Or someone removed it entirely, in which case filehandle
+	 * lookup will succeed but the directory is now IS_DEAD and
+	 * subsequent operations on it will fail.
+	 *
+	 * Alternatively, maybe there was no race at all, and the
+	 * filesystem is just corrupt and gave us a parent that doesn't
+	 * actually contain any entry pointing to this inode.  So,
+	 * double check that this worked and return -ESTALE if not:
+	 */
+	if (!dentry_connected(dentry))
+		return ERR_PTR(-ESTALE);
+	return NULL;
+}
+
+/*
+ * Make sure target_dir is fully connected to the dentry tree.
+ *
+ * On successful return, DCACHE_DISCONNECTED will be cleared on
+ * target_dir, and target_dir->d_parent->...->d_parent will reach the
+ * root of the filesystem.
+ *
+ * Whenever DCACHE_DISCONNECTED is unset, target_dir is fully connected.
+ * But the converse is not true: target_dir may have DCACHE_DISCONNECTED
+ * set but already be connected.  In that case we'll verify the
+ * connection to root and then clear the flag.
+ *
+ * Note that target_dir could be removed by a concurrent operation.  In
+ * that case reconnect_path may still succeed with target_dir fully
+ * connected, but further operations using the filehandle will fail when
+ * necessary (due to S_DEAD being set on the directory).
+ */
+static int
+reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
+{
+	struct dentry *dentry, *parent;
+
+	dentry = dget(target_dir);
+
+	while (dentry->d_flags & DCACHE_DISCONNECTED) {
+		BUG_ON(dentry == mnt->mnt_sb->s_root);
+
+		if (IS_ROOT(dentry))
+			parent = reconnect_one(mnt, dentry, nbuf);
+		else
+			parent = dget_parent(dentry);
+
+		if (!parent)
+			break;
+		dput(dentry);
+		if (IS_ERR(parent))
+			return PTR_ERR(parent);
+		dentry = parent;
+	}
+	dput(dentry);
+	clear_disconnected(target_dir);
+	return 0;
+}
+
+struct getdents_callback {
+	struct dir_context ctx;
+	char *name;		/* name that was found. It already points to a
+				   buffer NAME_MAX+1 is size */
+	u64 ino;		/* the inum we are looking for */
+	int found;		/* inode matched? */
+	int sequence;		/* sequence counter */
+};
+
+/*
+ * A rather strange filldir function to capture
+ * the name matching the specified inode number.
+ */
+static int filldir_one(struct dir_context *ctx, const char *name, int len,
+			loff_t pos, u64 ino, unsigned int d_type)
+{
+	struct getdents_callback *buf =
+		container_of(ctx, struct getdents_callback, ctx);
+	int result = 0;
+
+	buf->sequence++;
+	if (buf->ino == ino && len <= NAME_MAX) {
+		memcpy(buf->name, name, len);
+		buf->name[len] = '\0';
+		buf->found = 1;
+		result = -1;
+	}
+	return result;
+}
+
+/**
+ * get_name - default export_operations->get_name function
+ * @path:   the directory in which to find a name
+ * @name:   a pointer to a %NAME_MAX+1 char buffer to store the name
+ * @child:  the dentry for the child directory.
+ *
+ * calls readdir on the parent until it finds an entry with
+ * the same inode number as the child, and returns that.
+ */
+static int get_name(const struct path *path, char *name, struct dentry *child)
+{
+	const struct cred *cred = current_cred();
+	struct inode *dir = path->dentry->d_inode;
+	int error;
+	struct file *file;
+	struct kstat stat;
+	struct path child_path = {
+		.mnt = path->mnt,
+		.dentry = child,
+	};
+	struct getdents_callback buffer = {
+		.ctx.actor = filldir_one,
+		.name = name,
+	};
+
+	error = -ENOTDIR;
+	if (!dir || !S_ISDIR(dir->i_mode))
+		goto out;
+	error = -EINVAL;
+	if (!dir->i_fop)
+		goto out;
+	/*
+	 * inode->i_ino is unsigned long, kstat->ino is u64, so the
+	 * former would be insufficient on 32-bit hosts when the
+	 * filesystem supports 64-bit inode numbers.  So we need to
+	 * actually call ->getattr, not just read i_ino:
+	 */
+	error = vfs_getattr_nosec(&child_path, &stat);
+	if (error)
+		return error;
+	buffer.ino = stat.ino;
+	/*
+	 * Open the directory ...
+	 */
+	file = dentry_open(path, O_RDONLY, cred);
+	error = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+
+	error = -EINVAL;
+	if (!file->f_op->iterate)
+		goto out_close;
+
+	buffer.sequence = 0;
+	while (1) {
+		int old_seq = buffer.sequence;
+
+		error = iterate_dir(file, &buffer.ctx);
+		if (buffer.found) {
+			error = 0;
+			break;
+		}
+
+		if (error < 0)
+			break;
+
+		error = -ENOENT;
+		if (old_seq == buffer.sequence)
+			break;
+	}
+
+out_close:
+	fput(file);
+out:
+	return error;
+}
+
+/**
+ * export_encode_fh - default export_operations->encode_fh function
+ * @inode:   the object to encode
+ * @fid:     where to store the file handle fragment
+ * @max_len: maximum length to store there
+ * @parent:  parent directory inode, if wanted
+ *
+ * This default encode_fh function assumes that the 32 inode number
+ * is suitable for locating an inode, and that the generation number
+ * can be used to check that it is still valid.  It places them in the
+ * filehandle fragment where export_decode_fh expects to find them.
+ */
+static int export_encode_fh(struct inode *inode, struct fid *fid,
+		int *max_len, struct inode *parent)
+{
+	int len = *max_len;
+	int type = FILEID_INO32_GEN;
+
+	if (parent && (len < 4)) {
+		*max_len = 4;
+		return FILEID_INVALID;
+	} else if (len < 2) {
+		*max_len = 2;
+		return FILEID_INVALID;
+	}
+
+	len = 2;
+	fid->i32.ino = inode->i_ino;
+	fid->i32.gen = inode->i_generation;
+	if (parent) {
+		fid->i32.parent_ino = parent->i_ino;
+		fid->i32.parent_gen = parent->i_generation;
+		len = 4;
+		type = FILEID_INO32_GEN_PARENT;
+	}
+	*max_len = len;
+	return type;
+}
+
+int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
+			     int *max_len, struct inode *parent)
+{
+	const struct export_operations *nop = inode->i_sb->s_export_op;
+
+	if (nop && nop->encode_fh)
+		return nop->encode_fh(inode, fid->raw, max_len, parent);
+
+	return export_encode_fh(inode, fid, max_len, parent);
+}
+EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
+
+int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
+		int connectable)
+{
+	int error;
+	struct dentry *p = NULL;
+	struct inode *inode = dentry->d_inode, *parent = NULL;
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		p = dget_parent(dentry);
+		/*
+		 * note that while p might've ceased to be our parent already,
+		 * it's still pinned by and still positive.
+		 */
+		parent = p->d_inode;
+	}
+
+	error = exportfs_encode_inode_fh(inode, fid, max_len, parent);
+	dput(p);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(exportfs_encode_fh);
+
+struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
+		int fh_len, int fileid_type,
+		int (*acceptable)(void *, struct dentry *), void *context)
+{
+	const struct export_operations *nop = mnt->mnt_sb->s_export_op;
+	struct dentry *result, *alias;
+	char nbuf[NAME_MAX+1];
+	int err;
+
+	/*
+	 * Try to get any dentry for the given file handle from the filesystem.
+	 */
+	if (!nop || !nop->fh_to_dentry)
+		return ERR_PTR(-ESTALE);
+	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
+	if (!result)
+		result = ERR_PTR(-ESTALE);
+	if (IS_ERR(result))
+		return result;
+
+	if (d_is_dir(result)) {
+		/*
+		 * This request is for a directory.
+		 *
+		 * On the positive side there is only one dentry for each
+		 * directory inode.  On the negative side this implies that we
+		 * to ensure our dentry is connected all the way up to the
+		 * filesystem root.
+		 */
+		if (result->d_flags & DCACHE_DISCONNECTED) {
+			err = reconnect_path(mnt, result, nbuf);
+			if (err)
+				goto err_result;
+		}
+
+		if (!acceptable(context, result)) {
+			err = -EACCES;
+			goto err_result;
+		}
+
+		return result;
+	} else {
+		/*
+		 * It's not a directory.  Life is a little more complicated.
+		 */
+		struct dentry *target_dir, *nresult;
+
+		/*
+		 * See if either the dentry we just got from the filesystem
+		 * or any alias for it is acceptable.  This is always true
+		 * if this filesystem is exported without the subtreecheck
+		 * option.  If the filesystem is exported with the subtree
+		 * check option there's a fair chance we need to look at
+		 * the parent directory in the file handle and make sure
+		 * it's connected to the filesystem root.
+		 */
+		alias = find_acceptable_alias(result, acceptable, context);
+		if (alias)
+			return alias;
+
+		/*
+		 * Try to extract a dentry for the parent directory from the
+		 * file handle.  If this fails we'll have to give up.
+		 */
+		err = -ESTALE;
+		if (!nop->fh_to_parent)
+			goto err_result;
+
+		target_dir = nop->fh_to_parent(mnt->mnt_sb, fid,
+				fh_len, fileid_type);
+		if (!target_dir)
+			goto err_result;
+		err = PTR_ERR(target_dir);
+		if (IS_ERR(target_dir))
+			goto err_result;
+
+		/*
+		 * And as usual we need to make sure the parent directory is
+		 * connected to the filesystem root.  The VFS really doesn't
+		 * like disconnected directories..
+		 */
+		err = reconnect_path(mnt, target_dir, nbuf);
+		if (err) {
+			dput(target_dir);
+			goto err_result;
+		}
+
+		/*
+		 * Now that we've got both a well-connected parent and a
+		 * dentry for the inode we're after, make sure that our
+		 * inode is actually connected to the parent.
+		 */
+		err = exportfs_get_name(mnt, target_dir, nbuf, result);
+		if (!err) {
+			mutex_lock(&target_dir->d_inode->i_mutex);
+			nresult = lookup_one_len(nbuf, target_dir,
+						 strlen(nbuf));
+			mutex_unlock(&target_dir->d_inode->i_mutex);
+			if (!IS_ERR(nresult)) {
+				if (nresult->d_inode) {
+					dput(result);
+					result = nresult;
+				} else
+					dput(nresult);
+			}
+		}
+
+		/*
+		 * At this point we are done with the parent, but it's pinned
+		 * by the child dentry anyway.
+		 */
+		dput(target_dir);
+
+		/*
+		 * And finally make sure the dentry is actually acceptable
+		 * to NFSD.
+		 */
+		alias = find_acceptable_alias(result, acceptable, context);
+		if (!alias) {
+			err = -EACCES;
+			goto err_result;
+		}
+
+		return alias;
+	}
+
+ err_result:
+	dput(result);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(exportfs_decode_fh);
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/ext2/Kconfig b/kernel/fs/ext2/Kconfig
new file mode 100644
index 000000000..c634874e1
--- /dev/null
+++ b/kernel/fs/ext2/Kconfig
@@ -0,0 +1,44 @@
+config EXT2_FS
+	tristate "Second extended fs support"
+	help
+	  Ext2 is a standard Linux file system for hard disks.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ext2.
+
+	  If unsure, say Y.
+
+config EXT2_FS_XATTR
+	bool "Ext2 extended attributes"
+	depends on EXT2_FS
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config EXT2_FS_POSIX_ACL
+	bool "Ext2 POSIX Access Control Lists"
+	depends on EXT2_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT2_FS_SECURITY
+	bool "Ext2 Security Labels"
+	depends on EXT2_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext2 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/kernel/fs/ext2/Makefile b/kernel/fs/ext2/Makefile
new file mode 100644
index 000000000..445b0e996
--- /dev/null
+++ b/kernel/fs/ext2/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the linux ext2-filesystem routines.
+#
+
+obj-$(CONFIG_EXT2_FS) += ext2.o
+
+ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
+	  ioctl.o namei.o super.o symlink.o
+
+ext2-$(CONFIG_EXT2_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
+ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
+ext2-$(CONFIG_EXT2_FS_SECURITY)	 += xattr_security.o
diff --git a/kernel/fs/ext2/acl.c b/kernel/fs/ext2/acl.c
new file mode 100644
index 000000000..27695e6f4
--- /dev/null
+++ b/kernel/fs/ext2/acl.c
@@ -0,0 +1,257 @@
+/*
+ * linux/fs/ext2/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include "ext2.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *
+ext2_acl_from_disk(const void *value, size_t size)
+{
+	const char *end = (char *)value + size;
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(ext2_acl_header))
+		 return ERR_PTR(-EINVAL);
+	if (((ext2_acl_header *)value)->a_version !=
+	    cpu_to_le32(EXT2_ACL_VERSION))
+		return ERR_PTR(-EINVAL);
+	value = (char *)value + sizeof(ext2_acl_header);
+	count = ext2_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n=0; n < count; n++) {
+		ext2_acl_entry *entry =
+			(ext2_acl_entry *)value;
+		if ((char *)value + sizeof(ext2_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		switch(acl->a_entries[n].e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				value = (char *)value +
+					sizeof(ext2_acl_entry_short);
+				break;
+
+			case ACL_USER:
+				value = (char *)value + sizeof(ext2_acl_entry);
+				if ((char *)value > end)
+					goto fail;
+				acl->a_entries[n].e_uid =
+					make_kuid(&init_user_ns,
+						  le32_to_cpu(entry->e_id));
+				break;
+			case ACL_GROUP:
+				value = (char *)value + sizeof(ext2_acl_entry);
+				if ((char *)value > end)
+					goto fail;
+				acl->a_entries[n].e_gid =
+					make_kgid(&init_user_ns,
+						  le32_to_cpu(entry->e_id));
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *
+ext2_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+	ext2_acl_header *ext_acl;
+	char *e;
+	size_t n;
+
+	*size = ext2_acl_size(acl->a_count);
+	ext_acl = kmalloc(sizeof(ext2_acl_header) + acl->a_count *
+			sizeof(ext2_acl_entry), GFP_KERNEL);
+	if (!ext_acl)
+		return ERR_PTR(-ENOMEM);
+	ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION);
+	e = (char *)ext_acl + sizeof(ext2_acl_header);
+	for (n=0; n < acl->a_count; n++) {
+		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+		ext2_acl_entry *entry = (ext2_acl_entry *)e;
+		entry->e_tag  = cpu_to_le16(acl_e->e_tag);
+		entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch(acl_e->e_tag) {
+			case ACL_USER:
+				entry->e_id = cpu_to_le32(
+					from_kuid(&init_user_ns, acl_e->e_uid));
+				e += sizeof(ext2_acl_entry);
+				break;
+			case ACL_GROUP:
+				entry->e_id = cpu_to_le32(
+					from_kgid(&init_user_ns, acl_e->e_gid));
+				e += sizeof(ext2_acl_entry);
+				break;
+
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				e += sizeof(ext2_acl_entry_short);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	return (char *)ext_acl;
+
+fail:
+	kfree(ext_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * inode->i_mutex: don't care
+ */
+struct posix_acl *
+ext2_get_acl(struct inode *inode, int type)
+{
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = ext2_xattr_get(inode, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ext2_xattr_get(inode, name_index, "", value, retval);
+	}
+	if (retval > 0)
+		acl = ext2_acl_from_disk(value, retval);
+	else if (retval == -ENODATA || retval == -ENOSYS)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+/*
+ * inode->i_mutex: down
+ */
+int
+ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int error;
+
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
+			if (acl) {
+				error = posix_acl_equiv_mode(acl, &inode->i_mode);
+				if (error < 0)
+					return error;
+				else {
+					inode->i_ctime = CURRENT_TIME_SEC;
+					mark_inode_dirty(inode);
+					if (error == 0)
+						acl = NULL;
+				}
+			}
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+			if (!S_ISDIR(inode->i_mode))
+				return acl ? -EACCES : 0;
+			break;
+
+		default:
+			return -EINVAL;
+	}
+ 	if (acl) {
+		value = ext2_acl_to_disk(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	error = ext2_xattr_set(inode, name_index, "", value, size, 0);
+
+	kfree(value);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+	return error;
+}
+
+/*
+ * Initialize the ACLs of a new inode. Called from ext2_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
+ */
+int
+ext2_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *default_acl, *acl;
+	int error;
+
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (error)
+		return error;
+
+	if (default_acl) {
+		error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		posix_acl_release(acl);
+	}
+	return error;
+}
diff --git a/kernel/fs/ext2/acl.h b/kernel/fs/ext2/acl.h
new file mode 100644
index 000000000..44937f9fc
--- /dev/null
+++ b/kernel/fs/ext2/acl.h
@@ -0,0 +1,71 @@
+/*
+  File: fs/ext2/acl.h
+
+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define EXT2_ACL_VERSION	0x0001
+
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} ext2_acl_entry;
+
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+} ext2_acl_entry_short;
+
+typedef struct {
+	__le32		a_version;
+} ext2_acl_header;
+
+static inline size_t ext2_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(ext2_acl_header) +
+		       count * sizeof(ext2_acl_entry_short);
+	} else {
+		return sizeof(ext2_acl_header) +
+		       4 * sizeof(ext2_acl_entry_short) +
+		       (count - 4) * sizeof(ext2_acl_entry);
+	}
+}
+
+static inline int ext2_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(ext2_acl_header);
+	s = size - 4 * sizeof(ext2_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(ext2_acl_entry_short))
+			return -1;
+		return size / sizeof(ext2_acl_entry_short);
+	} else {
+		if (s % sizeof(ext2_acl_entry))
+			return -1;
+		return s / sizeof(ext2_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+
+/* acl.c */
+extern struct posix_acl *ext2_get_acl(struct inode *inode, int type);
+extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int ext2_init_acl (struct inode *, struct inode *);
+
+#else
+#include <linux/sched.h>
+#define ext2_get_acl	NULL
+#define ext2_set_acl	NULL
+
+static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif
+
diff --git a/kernel/fs/ext2/balloc.c b/kernel/fs/ext2/balloc.c
new file mode 100644
index 000000000..9f9992b37
--- /dev/null
+++ b/kernel/fs/ext2/balloc.c
@@ -0,0 +1,1536 @@
+/*
+ *  linux/fs/ext2/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include "ext2.h"
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/buffer_head.h>
+#include <linux/capability.h>
+
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * The free blocks are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.  The descriptors are loaded in memory
+ * when a file system is mounted (see ext2_fill_super).
+ */
+
+
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
+struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
+					     unsigned int block_group,
+					     struct buffer_head ** bh)
+{
+	unsigned long group_desc;
+	unsigned long offset;
+	struct ext2_group_desc * desc;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+	if (block_group >= sbi->s_groups_count) {
+		ext2_error (sb, "ext2_get_group_desc",
+			    "block_group >= groups_count - "
+			    "block_group = %d, groups_count = %lu",
+			    block_group, sbi->s_groups_count);
+
+		return NULL;
+	}
+
+	group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb);
+	offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1);
+	if (!sbi->s_group_desc[group_desc]) {
+		ext2_error (sb, "ext2_get_group_desc",
+			    "Group descriptor not loaded - "
+			    "block_group = %d, group_desc = %lu, desc = %lu",
+			     block_group, group_desc, offset);
+		return NULL;
+	}
+
+	desc = (struct ext2_group_desc *) sbi->s_group_desc[group_desc]->b_data;
+	if (bh)
+		*bh = sbi->s_group_desc[group_desc];
+	return desc + offset;
+}
+
+static int ext2_valid_block_bitmap(struct super_block *sb,
+					struct ext2_group_desc *desc,
+					unsigned int block_group,
+					struct buffer_head *bh)
+{
+	ext2_grpblk_t offset;
+	ext2_grpblk_t next_zero_bit;
+	ext2_fsblk_t bitmap_blk;
+	ext2_fsblk_t group_first_block;
+
+	group_first_block = ext2_group_first_block_no(sb, block_group);
+
+	/* check whether block bitmap block number is set */
+	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
+	offset = bitmap_blk - group_first_block;
+	if (!ext2_test_bit(offset, bh->b_data))
+		/* bad block bitmap */
+		goto err_out;
+
+	/* check whether the inode bitmap block number is set */
+	bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
+	offset = bitmap_blk - group_first_block;
+	if (!ext2_test_bit(offset, bh->b_data))
+		/* bad block bitmap */
+		goto err_out;
+
+	/* check whether the inode table block number is set */
+	bitmap_blk = le32_to_cpu(desc->bg_inode_table);
+	offset = bitmap_blk - group_first_block;
+	next_zero_bit = ext2_find_next_zero_bit(bh->b_data,
+				offset + EXT2_SB(sb)->s_itb_per_group,
+				offset);
+	if (next_zero_bit >= offset + EXT2_SB(sb)->s_itb_per_group)
+		/* good bitmap for inode tables */
+		return 1;
+
+err_out:
+	ext2_error(sb, __func__,
+			"Invalid block bitmap - "
+			"block_group = %d, block = %lu",
+			block_group, bitmap_blk);
+	return 0;
+}
+
+/*
+ * Read the bitmap for a given block_group,and validate the
+ * bits for block/inode/inode tables are set in the bitmaps
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+static struct buffer_head *
+read_block_bitmap(struct super_block *sb, unsigned int block_group)
+{
+	struct ext2_group_desc * desc;
+	struct buffer_head * bh = NULL;
+	ext2_fsblk_t bitmap_blk;
+
+	desc = ext2_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return NULL;
+	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
+	bh = sb_getblk(sb, bitmap_blk);
+	if (unlikely(!bh)) {
+		ext2_error(sb, __func__,
+			    "Cannot read block bitmap - "
+			    "block_group = %d, block_bitmap = %u",
+			    block_group, le32_to_cpu(desc->bg_block_bitmap));
+		return NULL;
+	}
+	if (likely(bh_uptodate_or_lock(bh)))
+		return bh;
+
+	if (bh_submit_read(bh) < 0) {
+		brelse(bh);
+		ext2_error(sb, __func__,
+			    "Cannot read block bitmap - "
+			    "block_group = %d, block_bitmap = %u",
+			    block_group, le32_to_cpu(desc->bg_block_bitmap));
+		return NULL;
+	}
+
+	ext2_valid_block_bitmap(sb, desc, block_group, bh);
+	/*
+	 * file system mounted not to panic on error, continue with corrupt
+	 * bitmap
+	 */
+	return bh;
+}
+
+static void group_adjust_blocks(struct super_block *sb, int group_no,
+	struct ext2_group_desc *desc, struct buffer_head *bh, int count)
+{
+	if (count) {
+		struct ext2_sb_info *sbi = EXT2_SB(sb);
+		unsigned free_blocks;
+
+		spin_lock(sb_bgl_lock(sbi, group_no));
+		free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+		desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+		spin_unlock(sb_bgl_lock(sbi, group_no));
+		mark_buffer_dirty(bh);
+	}
+}
+
+/*
+ * The reservation window structure operations
+ * --------------------------------------------
+ * Operations include:
+ * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
+ *
+ * We use a red-black tree to represent per-filesystem reservation
+ * windows.
+ *
+ */
+
+/**
+ * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
+ * @rb_root:		root of per-filesystem reservation rb tree
+ * @verbose:		verbose mode
+ * @fn:			function which wishes to dump the reservation map
+ *
+ * If verbose is turned on, it will print the whole block reservation
+ * windows(start, end). Otherwise, it will only print out the "bad" windows,
+ * those windows that overlap with their immediate neighbors.
+ */
+#if 1
+static void __rsv_window_dump(struct rb_root *root, int verbose,
+			      const char *fn)
+{
+	struct rb_node *n;
+	struct ext2_reserve_window_node *rsv, *prev;
+	int bad;
+
+restart:
+	n = rb_first(root);
+	bad = 0;
+	prev = NULL;
+
+	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
+	while (n) {
+		rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node);
+		if (verbose)
+			printk("reservation window 0x%p "
+				"start: %lu, end: %lu\n",
+				rsv, rsv->rsv_start, rsv->rsv_end);
+		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
+			printk("Bad reservation %p (start >= end)\n",
+			       rsv);
+			bad = 1;
+		}
+		if (prev && prev->rsv_end >= rsv->rsv_start) {
+			printk("Bad reservation %p (prev->end >= start)\n",
+			       rsv);
+			bad = 1;
+		}
+		if (bad) {
+			if (!verbose) {
+				printk("Restarting reservation walk in verbose mode\n");
+				verbose = 1;
+				goto restart;
+			}
+		}
+		n = rb_next(n);
+		prev = rsv;
+	}
+	printk("Window map complete.\n");
+	BUG_ON(bad);
+}
+#define rsv_window_dump(root, verbose) \
+	__rsv_window_dump((root), (verbose), __func__)
+#else
+#define rsv_window_dump(root, verbose) do {} while (0)
+#endif
+
+/**
+ * goal_in_my_reservation()
+ * @rsv:		inode's reservation window
+ * @grp_goal:		given goal block relative to the allocation block group
+ * @group:		the current allocation block group
+ * @sb:			filesystem super block
+ *
+ * Test if the given goal block (group relative) is within the file's
+ * own block reservation window range.
+ *
+ * If the reservation window is outside the goal allocation group, return 0;
+ * grp_goal (given goal block) could be -1, which means no specific
+ * goal block. In this case, always return 1.
+ * If the goal block is within the reservation window, return 1;
+ * otherwise, return 0;
+ */
+static int
+goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal,
+			unsigned int group, struct super_block * sb)
+{
+	ext2_fsblk_t group_first_block, group_last_block;
+
+	group_first_block = ext2_group_first_block_no(sb, group);
+	group_last_block = group_first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1;
+
+	if ((rsv->_rsv_start > group_last_block) ||
+	    (rsv->_rsv_end < group_first_block))
+		return 0;
+	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+		|| (grp_goal + group_first_block > rsv->_rsv_end)))
+		return 0;
+	return 1;
+}
+
+/**
+ * search_reserve_window()
+ * @rb_root:		root of reservation tree
+ * @goal:		target allocation block
+ *
+ * Find the reserved window which includes the goal, or the previous one
+ * if the goal is not in any window.
+ * Returns NULL if there are no windows or if all windows start after the goal.
+ */
+static struct ext2_reserve_window_node *
+search_reserve_window(struct rb_root *root, ext2_fsblk_t goal)
+{
+	struct rb_node *n = root->rb_node;
+	struct ext2_reserve_window_node *rsv;
+
+	if (!n)
+		return NULL;
+
+	do {
+		rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node);
+
+		if (goal < rsv->rsv_start)
+			n = n->rb_left;
+		else if (goal > rsv->rsv_end)
+			n = n->rb_right;
+		else
+			return rsv;
+	} while (n);
+	/*
+	 * We've fallen off the end of the tree: the goal wasn't inside
+	 * any particular node.  OK, the previous node must be to one
+	 * side of the interval containing the goal.  If it's the RHS,
+	 * we need to back up one.
+	 */
+	if (rsv->rsv_start > goal) {
+		n = rb_prev(&rsv->rsv_node);
+		rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node);
+	}
+	return rsv;
+}
+
+/*
+ * ext2_rsv_window_add() -- Insert a window to the block reservation rb tree.
+ * @sb:			super block
+ * @rsv:		reservation window to add
+ *
+ * Must be called with rsv_lock held.
+ */
+void ext2_rsv_window_add(struct super_block *sb,
+		    struct ext2_reserve_window_node *rsv)
+{
+	struct rb_root *root = &EXT2_SB(sb)->s_rsv_window_root;
+	struct rb_node *node = &rsv->rsv_node;
+	ext2_fsblk_t start = rsv->rsv_start;
+
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct ext2_reserve_window_node *this;
+
+	while (*p)
+	{
+		parent = *p;
+		this = rb_entry(parent, struct ext2_reserve_window_node, rsv_node);
+
+		if (start < this->rsv_start)
+			p = &(*p)->rb_left;
+		else if (start > this->rsv_end)
+			p = &(*p)->rb_right;
+		else {
+			rsv_window_dump(root, 1);
+			BUG();
+		}
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+}
+
+/**
+ * rsv_window_remove() -- unlink a window from the reservation rb tree
+ * @sb:			super block
+ * @rsv:		reservation window to remove
+ *
+ * Mark the block reservation window as not allocated, and unlink it
+ * from the filesystem reservation window rb tree. Must be called with
+ * rsv_lock held.
+ */
+static void rsv_window_remove(struct super_block *sb,
+			      struct ext2_reserve_window_node *rsv)
+{
+	rsv->rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED;
+	rsv->rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED;
+	rsv->rsv_alloc_hit = 0;
+	rb_erase(&rsv->rsv_node, &EXT2_SB(sb)->s_rsv_window_root);
+}
+
+/*
+ * rsv_is_empty() -- Check if the reservation window is allocated.
+ * @rsv:		given reservation window to check
+ *
+ * returns 1 if the end block is EXT2_RESERVE_WINDOW_NOT_ALLOCATED.
+ */
+static inline int rsv_is_empty(struct ext2_reserve_window *rsv)
+{
+	/* a valid reservation end block could not be 0 */
+	return (rsv->_rsv_end == EXT2_RESERVE_WINDOW_NOT_ALLOCATED);
+}
+
+/**
+ * ext2_init_block_alloc_info()
+ * @inode:		file inode structure
+ *
+ * Allocate and initialize the  reservation window structure, and
+ * link the window to the ext2 inode structure at last
+ *
+ * The reservation window structure is only dynamically allocated
+ * and linked to ext2 inode the first time the open file
+ * needs a new block. So, before every ext2_new_block(s) call, for
+ * regular files, we should check whether the reservation window
+ * structure exists or not. In the latter case, this function is called.
+ * Fail to do so will result in block reservation being turned off for that
+ * open file.
+ *
+ * This function is called from ext2_get_blocks_handle(), also called
+ * when setting the reservation window size through ioctl before the file
+ * is open for write (needs block allocation).
+ *
+ * Needs truncate_mutex protection prior to calling this function.
+ */
+void ext2_init_block_alloc_info(struct inode *inode)
+{
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	struct ext2_block_alloc_info *block_i;
+	struct super_block *sb = inode->i_sb;
+
+	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
+	if (block_i) {
+		struct ext2_reserve_window_node *rsv = &block_i->rsv_window_node;
+
+		rsv->rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED;
+		rsv->rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED;
+
+	 	/*
+		 * if filesystem is mounted with NORESERVATION, the goal
+		 * reservation window size is set to zero to indicate
+		 * block reservation is off
+		 */
+		if (!test_opt(sb, RESERVATION))
+			rsv->rsv_goal_size = 0;
+		else
+			rsv->rsv_goal_size = EXT2_DEFAULT_RESERVE_BLOCKS;
+		rsv->rsv_alloc_hit = 0;
+		block_i->last_alloc_logical_block = 0;
+		block_i->last_alloc_physical_block = 0;
+	}
+	ei->i_block_alloc_info = block_i;
+}
+
+/**
+ * ext2_discard_reservation()
+ * @inode:		inode
+ *
+ * Discard(free) block reservation window on last file close, or truncate
+ * or at last iput().
+ *
+ * It is being called in three cases:
+ * 	ext2_release_file(): last writer closes the file
+ * 	ext2_clear_inode(): last iput(), when nobody links to this file.
+ * 	ext2_truncate(): when the block indirect map is about to change.
+ */
+void ext2_discard_reservation(struct inode *inode)
+{
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info;
+	struct ext2_reserve_window_node *rsv;
+	spinlock_t *rsv_lock = &EXT2_SB(inode->i_sb)->s_rsv_window_lock;
+
+	if (!block_i)
+		return;
+
+	rsv = &block_i->rsv_window_node;
+	if (!rsv_is_empty(&rsv->rsv_window)) {
+		spin_lock(rsv_lock);
+		if (!rsv_is_empty(&rsv->rsv_window))
+			rsv_window_remove(inode->i_sb, rsv);
+		spin_unlock(rsv_lock);
+	}
+}
+
+/**
+ * ext2_free_blocks() -- Free given blocks and update quota and i_blocks
+ * @inode:		inode
+ * @block:		start physical block to free
+ * @count:		number of blocks to free
+ */
+void ext2_free_blocks (struct inode * inode, unsigned long block,
+		       unsigned long count)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head * bh2;
+	unsigned long block_group;
+	unsigned long bit;
+	unsigned long i;
+	unsigned long overflow;
+	struct super_block * sb = inode->i_sb;
+	struct ext2_sb_info * sbi = EXT2_SB(sb);
+	struct ext2_group_desc * desc;
+	struct ext2_super_block * es = sbi->s_es;
+	unsigned freed = 0, group_freed;
+
+	if (block < le32_to_cpu(es->s_first_data_block) ||
+	    block + count < block ||
+	    block + count > le32_to_cpu(es->s_blocks_count)) {
+		ext2_error (sb, "ext2_free_blocks",
+			    "Freeing blocks not in datazone - "
+			    "block = %lu, count = %lu", block, count);
+		goto error_return;
+	}
+
+	ext2_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
+
+do_more:
+	overflow = 0;
+	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
+		      EXT2_BLOCKS_PER_GROUP(sb);
+	bit = (block - le32_to_cpu(es->s_first_data_block)) %
+		      EXT2_BLOCKS_PER_GROUP(sb);
+	/*
+	 * Check to see if we are freeing blocks across a group
+	 * boundary.
+	 */
+	if (bit + count > EXT2_BLOCKS_PER_GROUP(sb)) {
+		overflow = bit + count - EXT2_BLOCKS_PER_GROUP(sb);
+		count -= overflow;
+	}
+	brelse(bitmap_bh);
+	bitmap_bh = read_block_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+
+	desc = ext2_get_group_desc (sb, block_group, &bh2);
+	if (!desc)
+		goto error_return;
+
+	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
+	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
+	    in_range (block, le32_to_cpu(desc->bg_inode_table),
+		      sbi->s_itb_per_group) ||
+	    in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
+		      sbi->s_itb_per_group)) {
+		ext2_error (sb, "ext2_free_blocks",
+			    "Freeing blocks in system zones - "
+			    "Block = %lu, count = %lu",
+			    block, count);
+		goto error_return;
+	}
+
+	for (i = 0, group_freed = 0; i < count; i++) {
+		if (!ext2_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+						bit + i, bitmap_bh->b_data)) {
+			ext2_error(sb, __func__,
+				"bit already cleared for block %lu", block + i);
+		} else {
+			group_freed++;
+		}
+	}
+
+	mark_buffer_dirty(bitmap_bh);
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		sync_dirty_buffer(bitmap_bh);
+
+	group_adjust_blocks(sb, block_group, desc, bh2, group_freed);
+	freed += group_freed;
+
+	if (overflow) {
+		block += count;
+		count = overflow;
+		goto do_more;
+	}
+error_return:
+	brelse(bitmap_bh);
+	if (freed) {
+		percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+		dquot_free_block_nodirty(inode, freed);
+		mark_inode_dirty(inode);
+	}
+}
+
+/**
+ * bitmap_search_next_usable_block()
+ * @start:		the starting block (group relative) of the search
+ * @bh:			bufferhead contains the block group bitmap
+ * @maxblocks:		the ending block (group relative) of the reservation
+ *
+ * The bitmap search --- search forward through the actual bitmap on disk until
+ * we find a bit free.
+ */
+static ext2_grpblk_t
+bitmap_search_next_usable_block(ext2_grpblk_t start, struct buffer_head *bh,
+					ext2_grpblk_t maxblocks)
+{
+	ext2_grpblk_t next;
+
+	next = ext2_find_next_zero_bit(bh->b_data, maxblocks, start);
+	if (next >= maxblocks)
+		return -1;
+	return next;
+}
+
+/**
+ * find_next_usable_block()
+ * @start:		the starting block (group relative) to find next
+ * 			allocatable block in bitmap.
+ * @bh:			bufferhead contains the block group bitmap
+ * @maxblocks:		the ending block (group relative) for the search
+ *
+ * Find an allocatable block in a bitmap.  We perform the "most
+ * appropriate allocation" algorithm of looking for a free block near
+ * the initial goal; then for a free byte somewhere in the bitmap;
+ * then for any free bit in the bitmap.
+ */
+static ext2_grpblk_t
+find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+{
+	ext2_grpblk_t here, next;
+	char *p, *r;
+
+	if (start > 0) {
+		/*
+		 * The goal was occupied; search forward for a free 
+		 * block within the next XX blocks.
+		 *
+		 * end_goal is more or less random, but it has to be
+		 * less than EXT2_BLOCKS_PER_GROUP. Aligning up to the
+		 * next 64-bit boundary is simple..
+		 */
+		ext2_grpblk_t end_goal = (start + 63) & ~63;
+		if (end_goal > maxblocks)
+			end_goal = maxblocks;
+		here = ext2_find_next_zero_bit(bh->b_data, end_goal, start);
+		if (here < end_goal)
+			return here;
+		ext2_debug("Bit not found near goal\n");
+	}
+
+	here = start;
+	if (here < 0)
+		here = 0;
+
+	p = ((char *)bh->b_data) + (here >> 3);
+	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
+	next = (r - ((char *)bh->b_data)) << 3;
+
+	if (next < maxblocks && next >= here)
+		return next;
+
+	here = bitmap_search_next_usable_block(here, bh, maxblocks);
+	return here;
+}
+
+/**
+ * ext2_try_to_allocate()
+ * @sb:			superblock
+ * @group:		given allocation block group
+ * @bitmap_bh:		bufferhead holds the block bitmap
+ * @grp_goal:		given target block within the group
+ * @count:		target number of blocks to allocate
+ * @my_rsv:		reservation window
+ *
+ * Attempt to allocate blocks within a give range. Set the range of allocation
+ * first, then find the first free bit(s) from the bitmap (within the range),
+ * and at last, allocate the blocks by claiming the found free bit as allocated.
+ *
+ * To set the range of this allocation:
+ * 	if there is a reservation window, only try to allocate block(s)
+ * 	from the file's own reservation window;
+ * 	Otherwise, the allocation range starts from the give goal block,
+ * 	ends at the block group's last block.
+ *
+ * If we failed to allocate the desired block then we may end up crossing to a
+ * new bitmap.
+ */
+static int
+ext2_try_to_allocate(struct super_block *sb, int group,
+			struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal,
+			unsigned long *count,
+			struct ext2_reserve_window *my_rsv)
+{
+	ext2_fsblk_t group_first_block;
+       	ext2_grpblk_t start, end;
+	unsigned long num = 0;
+
+	/* we do allocation within the reservation window if we have a window */
+	if (my_rsv) {
+		group_first_block = ext2_group_first_block_no(sb, group);
+		if (my_rsv->_rsv_start >= group_first_block)
+			start = my_rsv->_rsv_start - group_first_block;
+		else
+			/* reservation window cross group boundary */
+			start = 0;
+		end = my_rsv->_rsv_end - group_first_block + 1;
+		if (end > EXT2_BLOCKS_PER_GROUP(sb))
+			/* reservation window crosses group boundary */
+			end = EXT2_BLOCKS_PER_GROUP(sb);
+		if ((start <= grp_goal) && (grp_goal < end))
+			start = grp_goal;
+		else
+			grp_goal = -1;
+	} else {
+		if (grp_goal > 0)
+			start = grp_goal;
+		else
+			start = 0;
+		end = EXT2_BLOCKS_PER_GROUP(sb);
+	}
+
+	BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb));
+
+repeat:
+	if (grp_goal < 0) {
+		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		if (grp_goal < 0)
+			goto fail_access;
+		if (!my_rsv) {
+			int i;
+
+			for (i = 0; i < 7 && grp_goal > start &&
+					!ext2_test_bit(grp_goal - 1,
+					     		bitmap_bh->b_data);
+			     		i++, grp_goal--)
+				;
+		}
+	}
+	start = grp_goal;
+
+	if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal,
+			       				bitmap_bh->b_data)) {
+		/*
+		 * The block was allocated by another thread, or it was
+		 * allocated and then freed by another thread
+		 */
+		start++;
+		grp_goal++;
+		if (start >= end)
+			goto fail_access;
+		goto repeat;
+	}
+	num++;
+	grp_goal++;
+	while (num < *count && grp_goal < end
+		&& !ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group),
+					grp_goal, bitmap_bh->b_data)) {
+		num++;
+		grp_goal++;
+	}
+	*count = num;
+	return grp_goal - num;
+fail_access:
+	*count = num;
+	return -1;
+}
+
+/**
+ * 	find_next_reservable_window():
+ *		find a reservable space within the given range.
+ *		It does not allocate the reservation window for now:
+ *		alloc_new_reservation() will do the work later.
+ *
+ * 	@search_head: the head of the searching list;
+ *		This is not necessarily the list head of the whole filesystem
+ *
+ *		We have both head and start_block to assist the search
+ *		for the reservable space. The list starts from head,
+ *		but we will shift to the place where start_block is,
+ *		then start from there, when looking for a reservable space.
+ *
+ * 	@size: the target new reservation window size
+ *
+ * 	@group_first_block: the first block we consider to start
+ *			the real search from
+ *
+ * 	@last_block:
+ *		the maximum block number that our goal reservable space
+ *		could start from. This is normally the last block in this
+ *		group. The search will end when we found the start of next
+ *		possible reservable space is out of this boundary.
+ *		This could handle the cross boundary reservation window
+ *		request.
+ *
+ * 	basically we search from the given range, rather than the whole
+ * 	reservation double linked list, (start_block, last_block)
+ * 	to find a free region that is of my size and has not
+ * 	been reserved.
+ *
+ */
+static int find_next_reservable_window(
+				struct ext2_reserve_window_node *search_head,
+				struct ext2_reserve_window_node *my_rsv,
+				struct super_block * sb,
+				ext2_fsblk_t start_block,
+				ext2_fsblk_t last_block)
+{
+	struct rb_node *next;
+	struct ext2_reserve_window_node *rsv, *prev;
+	ext2_fsblk_t cur;
+	int size = my_rsv->rsv_goal_size;
+
+	/* TODO: make the start of the reservation window byte-aligned */
+	/* cur = *start_block & ~7;*/
+	cur = start_block;
+	rsv = search_head;
+	if (!rsv)
+		return -1;
+
+	while (1) {
+		if (cur <= rsv->rsv_end)
+			cur = rsv->rsv_end + 1;
+
+		/* TODO?
+		 * in the case we could not find a reservable space
+		 * that is what is expected, during the re-search, we could
+		 * remember what's the largest reservable space we could have
+		 * and return that one.
+		 *
+		 * For now it will fail if we could not find the reservable
+		 * space with expected-size (or more)...
+		 */
+		if (cur > last_block)
+			return -1;		/* fail */
+
+		prev = rsv;
+		next = rb_next(&rsv->rsv_node);
+		rsv = rb_entry(next,struct ext2_reserve_window_node,rsv_node);
+
+		/*
+		 * Reached the last reservation, we can just append to the
+		 * previous one.
+		 */
+		if (!next)
+			break;
+
+		if (cur + size <= rsv->rsv_start) {
+			/*
+			 * Found a reserveable space big enough.  We could
+			 * have a reservation across the group boundary here
+		 	 */
+			break;
+		}
+	}
+	/*
+	 * we come here either :
+	 * when we reach the end of the whole list,
+	 * and there is empty reservable space after last entry in the list.
+	 * append it to the end of the list.
+	 *
+	 * or we found one reservable space in the middle of the list,
+	 * return the reservation window that we could append to.
+	 * succeed.
+	 */
+
+	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
+		rsv_window_remove(sb, my_rsv);
+
+	/*
+	 * Let's book the whole available window for now.  We will check the
+	 * disk bitmap later and then, if there are free blocks then we adjust
+	 * the window size if it's larger than requested.
+	 * Otherwise, we will remove this node from the tree next time
+	 * call find_next_reservable_window.
+	 */
+	my_rsv->rsv_start = cur;
+	my_rsv->rsv_end = cur + size - 1;
+	my_rsv->rsv_alloc_hit = 0;
+
+	if (prev != my_rsv)
+		ext2_rsv_window_add(sb, my_rsv);
+
+	return 0;
+}
+
+/**
+ * 	alloc_new_reservation()--allocate a new reservation window
+ *
+ *		To make a new reservation, we search part of the filesystem
+ *		reservation list (the list that inside the group). We try to
+ *		allocate a new reservation window near the allocation goal,
+ *		or the beginning of the group, if there is no goal.
+ *
+ *		We first find a reservable space after the goal, then from
+ *		there, we check the bitmap for the first free block after
+ *		it. If there is no free block until the end of group, then the
+ *		whole group is full, we failed. Otherwise, check if the free
+ *		block is inside the expected reservable space, if so, we
+ *		succeed.
+ *		If the first free block is outside the reservable space, then
+ *		start from the first free block, we search for next available
+ *		space, and go on.
+ *
+ *	on succeed, a new reservation will be found and inserted into the list
+ *	It contains at least one free block, and it does not overlap with other
+ *	reservation windows.
+ *
+ *	failed: we failed to find a reservation window in this group
+ *
+ *	@rsv: the reservation
+ *
+ *	@grp_goal: The goal (group-relative).  It is where the search for a
+ *		free reservable space should start from.
+ *		if we have a goal(goal >0 ), then start from there,
+ *		no goal(goal = -1), we start from the first block
+ *		of the group.
+ *
+ *	@sb: the super block
+ *	@group: the group we are trying to allocate in
+ *	@bitmap_bh: the block group block bitmap
+ *
+ */
+static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv,
+		ext2_grpblk_t grp_goal, struct super_block *sb,
+		unsigned int group, struct buffer_head *bitmap_bh)
+{
+	struct ext2_reserve_window_node *search_head;
+	ext2_fsblk_t group_first_block, group_end_block, start_block;
+	ext2_grpblk_t first_free_block;
+	struct rb_root *fs_rsv_root = &EXT2_SB(sb)->s_rsv_window_root;
+	unsigned long size;
+	int ret;
+	spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock;
+
+	group_first_block = ext2_group_first_block_no(sb, group);
+	group_end_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1);
+
+	if (grp_goal < 0)
+		start_block = group_first_block;
+	else
+		start_block = grp_goal + group_first_block;
+
+	size = my_rsv->rsv_goal_size;
+
+	if (!rsv_is_empty(&my_rsv->rsv_window)) {
+		/*
+		 * if the old reservation is cross group boundary
+		 * and if the goal is inside the old reservation window,
+		 * we will come here when we just failed to allocate from
+		 * the first part of the window. We still have another part
+		 * that belongs to the next group. In this case, there is no
+		 * point to discard our window and try to allocate a new one
+		 * in this group(which will fail). we should
+		 * keep the reservation window, just simply move on.
+		 *
+		 * Maybe we could shift the start block of the reservation
+		 * window to the first block of next group.
+		 */
+
+		if ((my_rsv->rsv_start <= group_end_block) &&
+				(my_rsv->rsv_end > group_end_block) &&
+				(start_block >= my_rsv->rsv_start))
+			return -1;
+
+		if ((my_rsv->rsv_alloc_hit >
+		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
+			/*
+			 * if the previously allocation hit ratio is
+			 * greater than 1/2, then we double the size of
+			 * the reservation window the next time,
+			 * otherwise we keep the same size window
+			 */
+			size = size * 2;
+			if (size > EXT2_MAX_RESERVE_BLOCKS)
+				size = EXT2_MAX_RESERVE_BLOCKS;
+			my_rsv->rsv_goal_size= size;
+		}
+	}
+
+	spin_lock(rsv_lock);
+	/*
+	 * shift the search start to the window near the goal block
+	 */
+	search_head = search_reserve_window(fs_rsv_root, start_block);
+
+	/*
+	 * find_next_reservable_window() simply finds a reservable window
+	 * inside the given range(start_block, group_end_block).
+	 *
+	 * To make sure the reservation window has a free bit inside it, we
+	 * need to check the bitmap after we found a reservable window.
+	 */
+retry:
+	ret = find_next_reservable_window(search_head, my_rsv, sb,
+						start_block, group_end_block);
+
+	if (ret == -1) {
+		if (!rsv_is_empty(&my_rsv->rsv_window))
+			rsv_window_remove(sb, my_rsv);
+		spin_unlock(rsv_lock);
+		return -1;
+	}
+
+	/*
+	 * On success, find_next_reservable_window() returns the
+	 * reservation window where there is a reservable space after it.
+	 * Before we reserve this reservable space, we need
+	 * to make sure there is at least a free block inside this region.
+	 *
+	 * Search the first free bit on the block bitmap.  Search starts from
+	 * the start block of the reservable space we just found.
+	 */
+	spin_unlock(rsv_lock);
+	first_free_block = bitmap_search_next_usable_block(
+			my_rsv->rsv_start - group_first_block,
+			bitmap_bh, group_end_block - group_first_block + 1);
+
+	if (first_free_block < 0) {
+		/*
+		 * no free block left on the bitmap, no point
+		 * to reserve the space. return failed.
+		 */
+		spin_lock(rsv_lock);
+		if (!rsv_is_empty(&my_rsv->rsv_window))
+			rsv_window_remove(sb, my_rsv);
+		spin_unlock(rsv_lock);
+		return -1;		/* failed */
+	}
+
+	start_block = first_free_block + group_first_block;
+	/*
+	 * check if the first free block is within the
+	 * free space we just reserved
+	 */
+	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
+		return 0;		/* success */
+	/*
+	 * if the first free bit we found is out of the reservable space
+	 * continue search for next reservable space,
+	 * start from where the free block is,
+	 * we also shift the list head to where we stopped last time
+	 */
+	search_head = my_rsv;
+	spin_lock(rsv_lock);
+	goto retry;
+}
+
+/**
+ * try_to_extend_reservation()
+ * @my_rsv:		given reservation window
+ * @sb:			super block
+ * @size:		the delta to extend
+ *
+ * Attempt to expand the reservation window large enough to have
+ * required number of free blocks
+ *
+ * Since ext2_try_to_allocate() will always allocate blocks within
+ * the reservation window range, if the window size is too small,
+ * multiple blocks allocation has to stop at the end of the reservation
+ * window. To make this more efficient, given the total number of
+ * blocks needed and the current size of the window, we try to
+ * expand the reservation window size if necessary on a best-effort
+ * basis before ext2_new_blocks() tries to allocate blocks.
+ */
+static void try_to_extend_reservation(struct ext2_reserve_window_node *my_rsv,
+			struct super_block *sb, int size)
+{
+	struct ext2_reserve_window_node *next_rsv;
+	struct rb_node *next;
+	spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock;
+
+	if (!spin_trylock(rsv_lock))
+		return;
+
+	next = rb_next(&my_rsv->rsv_node);
+
+	if (!next)
+		my_rsv->rsv_end += size;
+	else {
+		next_rsv = rb_entry(next, struct ext2_reserve_window_node, rsv_node);
+
+		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
+			my_rsv->rsv_end += size;
+		else
+			my_rsv->rsv_end = next_rsv->rsv_start - 1;
+	}
+	spin_unlock(rsv_lock);
+}
+
+/**
+ * ext2_try_to_allocate_with_rsv()
+ * @sb:			superblock
+ * @group:		given allocation block group
+ * @bitmap_bh:		bufferhead holds the block bitmap
+ * @grp_goal:		given target block within the group
+ * @count:		target number of blocks to allocate
+ * @my_rsv:		reservation window
+ *
+ * This is the main function used to allocate a new block and its reservation
+ * window.
+ *
+ * Each time when a new block allocation is need, first try to allocate from
+ * its own reservation.  If it does not have a reservation window, instead of
+ * looking for a free bit on bitmap first, then look up the reservation list to
+ * see if it is inside somebody else's reservation window, we try to allocate a
+ * reservation window for it starting from the goal first. Then do the block
+ * allocation within the reservation window.
+ *
+ * This will avoid keeping on searching the reservation list again and
+ * again when somebody is looking for a free block (without
+ * reservation), and there are lots of free blocks, but they are all
+ * being reserved.
+ *
+ * We use a red-black tree for the per-filesystem reservation list.
+ */
+static ext2_grpblk_t
+ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group,
+			struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal,
+			struct ext2_reserve_window_node * my_rsv,
+			unsigned long *count)
+{
+	ext2_fsblk_t group_first_block, group_last_block;
+	ext2_grpblk_t ret = 0;
+	unsigned long num = *count;
+
+	/*
+	 * we don't deal with reservation when
+	 * filesystem is mounted without reservation
+	 * or the file is not a regular file
+	 * or last attempt to allocate a block with reservation turned on failed
+	 */
+	if (my_rsv == NULL) {
+		return ext2_try_to_allocate(sb, group, bitmap_bh,
+						grp_goal, count, NULL);
+	}
+	/*
+	 * grp_goal is a group relative block number (if there is a goal)
+	 * 0 <= grp_goal < EXT2_BLOCKS_PER_GROUP(sb)
+	 * first block is a filesystem wide block number
+	 * first block is the block number of the first block in this group
+	 */
+	group_first_block = ext2_group_first_block_no(sb, group);
+	group_last_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1);
+
+	/*
+	 * Basically we will allocate a new block from inode's reservation
+	 * window.
+	 *
+	 * We need to allocate a new reservation window, if:
+	 * a) inode does not have a reservation window; or
+	 * b) last attempt to allocate a block from existing reservation
+	 *    failed; or
+	 * c) we come here with a goal and with a reservation window
+	 *
+	 * We do not need to allocate a new reservation window if we come here
+	 * at the beginning with a goal and the goal is inside the window, or
+	 * we don't have a goal but already have a reservation window.
+	 * then we could go to allocate from the reservation window directly.
+	 */
+	while (1) {
+		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
+			!goal_in_my_reservation(&my_rsv->rsv_window,
+						grp_goal, group, sb)) {
+			if (my_rsv->rsv_goal_size < *count)
+				my_rsv->rsv_goal_size = *count;
+			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
+							group, bitmap_bh);
+			if (ret < 0)
+				break;			/* failed */
+
+			if (!goal_in_my_reservation(&my_rsv->rsv_window,
+							grp_goal, group, sb))
+				grp_goal = -1;
+		} else if (grp_goal >= 0) {
+			int curr = my_rsv->rsv_end -
+					(grp_goal + group_first_block) + 1;
+
+			if (curr < *count)
+				try_to_extend_reservation(my_rsv, sb,
+							*count - curr);
+		}
+
+		if ((my_rsv->rsv_start > group_last_block) ||
+				(my_rsv->rsv_end < group_first_block)) {
+			rsv_window_dump(&EXT2_SB(sb)->s_rsv_window_root, 1);
+			BUG();
+		}
+		ret = ext2_try_to_allocate(sb, group, bitmap_bh, grp_goal,
+					   &num, &my_rsv->rsv_window);
+		if (ret >= 0) {
+			my_rsv->rsv_alloc_hit += num;
+			*count = num;
+			break;				/* succeed */
+		}
+		num = *count;
+	}
+	return ret;
+}
+
+/**
+ * ext2_has_free_blocks()
+ * @sbi:		in-core super block structure.
+ *
+ * Check if filesystem has at least 1 free block available for allocation.
+ */
+static int ext2_has_free_blocks(struct ext2_sb_info *sbi)
+{
+	ext2_fsblk_t free_blocks, root_blocks;
+
+	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
+	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+		!uid_eq(sbi->s_resuid, current_fsuid()) &&
+		(gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
+		 !in_group_p (sbi->s_resgid))) {
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * ext2_new_blocks() -- core block(s) allocation function
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		target number of blocks to allocate
+ * @errp:		error code
+ *
+ * ext2_new_blocks uses a goal block to assist allocation.  If the goal is
+ * free, or there is a free block within 32 blocks of the goal, that block
+ * is allocated.  Otherwise a forward search is made for a free block; within 
+ * each block group the search first looks for an entire free byte in the block
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
+ */
+ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
+		    unsigned long *count, int *errp)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gdp_bh;
+	int group_no;
+	int goal_group;
+	ext2_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
+	ext2_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
+	ext2_fsblk_t ret_block;		/* filesyetem-wide allocated block */
+	int bgi;			/* blockgroup iteration index */
+	int performed_allocation = 0;
+	ext2_grpblk_t free_blocks;	/* number of free blocks in a group */
+	struct super_block *sb;
+	struct ext2_group_desc *gdp;
+	struct ext2_super_block *es;
+	struct ext2_sb_info *sbi;
+	struct ext2_reserve_window_node *my_rsv = NULL;
+	struct ext2_block_alloc_info *block_i;
+	unsigned short windowsz = 0;
+	unsigned long ngroups;
+	unsigned long num = *count;
+	int ret;
+
+	*errp = -ENOSPC;
+	sb = inode->i_sb;
+
+	/*
+	 * Check quota for allocation of this block.
+	 */
+	ret = dquot_alloc_block(inode, num);
+	if (ret) {
+		*errp = ret;
+		return 0;
+	}
+
+	sbi = EXT2_SB(sb);
+	es = EXT2_SB(sb)->s_es;
+	ext2_debug("goal=%lu.\n", goal);
+	/*
+	 * Allocate a block from reservation only when
+	 * filesystem is mounted with reservation(default,-o reservation), and
+	 * it's a regular file, and
+	 * the desired window size is greater than 0 (One could use ioctl
+	 * command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn off
+	 * reservation on that particular file)
+	 */
+	block_i = EXT2_I(inode)->i_block_alloc_info;
+	if (block_i) {
+		windowsz = block_i->rsv_window_node.rsv_goal_size;
+		if (windowsz > 0)
+			my_rsv = &block_i->rsv_window_node;
+	}
+
+	if (!ext2_has_free_blocks(sbi)) {
+		*errp = -ENOSPC;
+		goto out;
+	}
+
+	/*
+	 * First, test whether the goal block is free.
+	 */
+	if (goal < le32_to_cpu(es->s_first_data_block) ||
+	    goal >= le32_to_cpu(es->s_blocks_count))
+		goal = le32_to_cpu(es->s_first_data_block);
+	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
+			EXT2_BLOCKS_PER_GROUP(sb);
+	goal_group = group_no;
+retry_alloc:
+	gdp = ext2_get_group_desc(sb, group_no, &gdp_bh);
+	if (!gdp)
+		goto io_error;
+
+	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+	/*
+	 * if there is not enough free blocks to make a new resevation
+	 * turn off reservation for this allocation
+	 */
+	if (my_rsv && (free_blocks < windowsz)
+		&& (free_blocks > 0)
+		&& (rsv_is_empty(&my_rsv->rsv_window)))
+		my_rsv = NULL;
+
+	if (free_blocks > 0) {
+		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
+				EXT2_BLOCKS_PER_GROUP(sb));
+		bitmap_bh = read_block_bitmap(sb, group_no);
+		if (!bitmap_bh)
+			goto io_error;
+		grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no,
+					bitmap_bh, grp_target_blk,
+					my_rsv, &num);
+		if (grp_alloc_blk >= 0)
+			goto allocated;
+	}
+
+	ngroups = EXT2_SB(sb)->s_groups_count;
+	smp_rmb();
+
+	/*
+	 * Now search the rest of the groups.  We assume that
+	 * group_no and gdp correctly point to the last group visited.
+	 */
+	for (bgi = 0; bgi < ngroups; bgi++) {
+		group_no++;
+		if (group_no >= ngroups)
+			group_no = 0;
+		gdp = ext2_get_group_desc(sb, group_no, &gdp_bh);
+		if (!gdp)
+			goto io_error;
+
+		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
+		 * skip this group if the number of
+		 * free blocks is less than half of the reservation
+		 * window size.
+		 */
+		if (my_rsv && (free_blocks <= (windowsz/2)))
+			continue;
+
+		brelse(bitmap_bh);
+		bitmap_bh = read_block_bitmap(sb, group_no);
+		if (!bitmap_bh)
+			goto io_error;
+		/*
+		 * try to allocate block(s) from this group, without a goal(-1).
+		 */
+		grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no,
+					bitmap_bh, -1, my_rsv, &num);
+		if (grp_alloc_blk >= 0)
+			goto allocated;
+	}
+	/*
+	 * We may end up a bogus earlier ENOSPC error due to
+	 * filesystem is "full" of reservations, but
+	 * there maybe indeed free blocks available on disk
+	 * In this case, we just forget about the reservations
+	 * just do block allocation as without reservations.
+	 */
+	if (my_rsv) {
+		my_rsv = NULL;
+		windowsz = 0;
+		group_no = goal_group;
+		goto retry_alloc;
+	}
+	/* No space left on the device */
+	*errp = -ENOSPC;
+	goto out;
+
+allocated:
+
+	ext2_debug("using block group %d(%d)\n",
+			group_no, gdp->bg_free_blocks_count);
+
+	ret_block = grp_alloc_blk + ext2_group_first_block_no(sb, group_no);
+
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
+		      EXT2_SB(sb)->s_itb_per_group) ||
+	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+		      EXT2_SB(sb)->s_itb_per_group)) {
+		ext2_error(sb, "ext2_new_blocks",
+			    "Allocating block in system zone - "
+			    "blocks from "E2FSBLK", length %lu",
+			    ret_block, num);
+		/*
+		 * ext2_try_to_allocate marked the blocks we allocated as in
+		 * use.  So we may want to selectively mark some of the blocks
+		 * as free
+		 */
+		goto retry_alloc;
+	}
+
+	performed_allocation = 1;
+
+	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
+		ext2_error(sb, "ext2_new_blocks",
+			    "block("E2FSBLK") >= blocks count(%d) - "
+			    "block_group = %d, es == %p ", ret_block,
+			le32_to_cpu(es->s_blocks_count), group_no, es);
+		goto out;
+	}
+
+	group_adjust_blocks(sb, group_no, gdp, gdp_bh, -num);
+	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+	mark_buffer_dirty(bitmap_bh);
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		sync_dirty_buffer(bitmap_bh);
+
+	*errp = 0;
+	brelse(bitmap_bh);
+	if (num < *count) {
+		dquot_free_block_nodirty(inode, *count-num);
+		mark_inode_dirty(inode);
+		*count = num;
+	}
+	return ret_block;
+
+io_error:
+	*errp = -EIO;
+out:
+	/*
+	 * Undo the block allocation
+	 */
+	if (!performed_allocation) {
+		dquot_free_block_nodirty(inode, *count);
+		mark_inode_dirty(inode);
+	}
+	brelse(bitmap_bh);
+	return 0;
+}
+
+ext2_fsblk_t ext2_new_block(struct inode *inode, unsigned long goal, int *errp)
+{
+	unsigned long count = 1;
+
+	return ext2_new_blocks(inode, goal, &count, errp);
+}
+
+#ifdef EXT2FS_DEBUG
+
+unsigned long ext2_count_free(struct buffer_head *map, unsigned int numchars)
+{
+	return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars);
+}
+
+#endif  /*  EXT2FS_DEBUG  */
+
+unsigned long ext2_count_free_blocks (struct super_block * sb)
+{
+	struct ext2_group_desc * desc;
+	unsigned long desc_count = 0;
+	int i;
+#ifdef EXT2FS_DEBUG
+	unsigned long bitmap_count, x;
+	struct ext2_super_block *es;
+
+	es = EXT2_SB(sb)->s_es;
+	desc_count = 0;
+	bitmap_count = 0;
+	desc = NULL;
+	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+		struct buffer_head *bitmap_bh;
+		desc = ext2_get_group_desc (sb, i, NULL);
+		if (!desc)
+			continue;
+		desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+		bitmap_bh = read_block_bitmap(sb, i);
+		if (!bitmap_bh)
+			continue;
+		
+		x = ext2_count_free(bitmap_bh, sb->s_blocksize);
+		printk ("group %d: stored = %d, counted = %lu\n",
+			i, le16_to_cpu(desc->bg_free_blocks_count), x);
+		bitmap_count += x;
+		brelse(bitmap_bh);
+	}
+	printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
+		(long)le32_to_cpu(es->s_free_blocks_count),
+		desc_count, bitmap_count);
+	return bitmap_count;
+#else
+        for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+                desc = ext2_get_group_desc (sb, i, NULL);
+                if (!desc)
+                        continue;
+                desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+	}
+	return desc_count;
+#endif
+}
+
+static inline int test_root(int a, int b)
+{
+	int num = b;
+
+	while (a > num)
+		num *= b;
+	return num == a;
+}
+
+static int ext2_group_sparse(int group)
+{
+	if (group <= 1)
+		return 1;
+	return (test_root(group, 3) || test_root(group, 5) ||
+		test_root(group, 7));
+}
+
+/**
+ *	ext2_bg_has_super - number of blocks used by the superblock in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the superblock (primary or backup)
+ *	in this group.  Currently this will be only 0 or 1.
+ */
+int ext2_bg_has_super(struct super_block *sb, int group)
+{
+	if (EXT2_HAS_RO_COMPAT_FEATURE(sb,EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
+	    !ext2_group_sparse(group))
+		return 0;
+	return 1;
+}
+
+/**
+ *	ext2_bg_num_gdb - number of blocks used by the group table in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the group descriptor table
+ *	(primary or backup) in this group.  In the future there may be a
+ *	different number of descriptor blocks in each group.
+ */
+unsigned long ext2_bg_num_gdb(struct super_block *sb, int group)
+{
+	return ext2_bg_has_super(sb, group) ? EXT2_SB(sb)->s_gdb_count : 0;
+}
+
diff --git a/kernel/fs/ext2/dir.c b/kernel/fs/ext2/dir.c
new file mode 100644
index 000000000..796b491e6
--- /dev/null
+++ b/kernel/fs/ext2/dir.c
@@ -0,0 +1,730 @@
+/*
+ *  linux/fs/ext2/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/dir.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext2 directory handling functions
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * All code that works with directory layout had been switched to pagecache
+ * and moved here. AV
+ */
+
+#include "ext2.h"
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+typedef struct ext2_dir_entry_2 ext2_dirent;
+
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
+static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
+{
+	unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len == EXT2_MAX_REC_LEN)
+		return 1 << 16;
+#endif
+	return len;
+}
+
+static inline __le16 ext2_rec_len_to_disk(unsigned len)
+{
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len == (1 << 16))
+		return cpu_to_le16(EXT2_MAX_REC_LEN);
+	else
+		BUG_ON(len > (1 << 16));
+#endif
+	return cpu_to_le16(len);
+}
+
+/*
+ * ext2 uses block-sized chunks. Arguably, sector-sized ones would be
+ * more robust, but we have what we have
+ */
+static inline unsigned ext2_chunk_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;
+}
+
+static inline void ext2_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
+
+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned
+ext2_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	unsigned last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
+}
+
+static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	int err = 0;
+
+	dir->i_version++;
+	block_write_end(NULL, mapping, pos, len, len, page, NULL);
+
+	if (pos+len > dir->i_size) {
+		i_size_write(dir, pos+len);
+		mark_inode_dirty(dir);
+	}
+
+	if (IS_DIRSYNC(dir)) {
+		err = write_one_page(page, 1);
+		if (!err)
+			err = sync_inode_metadata(dir, 1);
+	} else {
+		unlock_page(page);
+	}
+
+	return err;
+}
+
+static void ext2_check_page(struct page *page, int quiet)
+{
+	struct inode *dir = page->mapping->host;
+	struct super_block *sb = dir->i_sb;
+	unsigned chunk_size = ext2_chunk_size(dir);
+	char *kaddr = page_address(page);
+	u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	ext2_dirent *p;
+	char *error;
+
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (chunk_size - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
+	}
+	for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) {
+		p = (ext2_dirent *)(kaddr + offs);
+		rec_len = ext2_rec_len_from_disk(p->rec_len);
+
+		if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
+			goto Eshort;
+		if (unlikely(rec_len & 3))
+			goto Ealign;
+		if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
+			goto Enamelen;
+		if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
+			goto Espan;
+		if (unlikely(le32_to_cpu(p->inode) > max_inumber))
+			goto Einumber;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+	/* Too bad, we had an error */
+
+Ebadsize:
+	if (!quiet)
+		ext2_error(sb, __func__,
+			"size of directory #%lu is not a multiple "
+			"of chunk size", dir->i_ino);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+Einumber:
+	error = "inode out of bounds";
+bad_entry:
+	if (!quiet)
+		ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
+			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+			dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+			(unsigned long) le32_to_cpu(p->inode),
+			rec_len, p->name_len);
+	goto fail;
+Eend:
+	if (!quiet) {
+		p = (ext2_dirent *)(kaddr + offs);
+		ext2_error(sb, "ext2_check_page",
+			"entry in directory #%lu spans the page boundary"
+			"offset=%lu, inode=%lu",
+			dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+			(unsigned long) le32_to_cpu(p->inode));
+	}
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
+
+static struct page * ext2_get_page(struct inode *dir, unsigned long n,
+				   int quiet)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageChecked(page))
+			ext2_check_page(page, quiet);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	ext2_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * NOTE! unlike strncmp, ext2_match returns 1 for success, 0 for failure.
+ *
+ * len <= EXT2_NAME_LEN and de != NULL are guaranteed by caller.
+ */
+static inline int ext2_match (int len, const char * const name,
+					struct ext2_dir_entry_2 * de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline ext2_dirent *ext2_next_entry(ext2_dirent *p)
+{
+	return (ext2_dirent *)((char *)p +
+			ext2_rec_len_from_disk(p->rec_len));
+}
+
+static inline unsigned 
+ext2_validate_entry(char *base, unsigned offset, unsigned mask)
+{
+	ext2_dirent *de = (ext2_dirent*)(base + offset);
+	ext2_dirent *p = (ext2_dirent*)(base + (offset&mask));
+	while ((char*)p < (char*)de) {
+		if (p->rec_len == 0)
+			break;
+		p = ext2_next_entry(p);
+	}
+	return (char *)p - base;
+}
+
+static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
+	[EXT2_FT_UNKNOWN]	= DT_UNKNOWN,
+	[EXT2_FT_REG_FILE]	= DT_REG,
+	[EXT2_FT_DIR]		= DT_DIR,
+	[EXT2_FT_CHRDEV]	= DT_CHR,
+	[EXT2_FT_BLKDEV]	= DT_BLK,
+	[EXT2_FT_FIFO]		= DT_FIFO,
+	[EXT2_FT_SOCK]		= DT_SOCK,
+	[EXT2_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXT2_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXT2_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXT2_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXT2_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXT2_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXT2_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXT2_FT_SYMLINK,
+};
+
+static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
+{
+	umode_t mode = inode->i_mode;
+	if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
+		de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+	else
+		de->file_type = 0;
+}
+
+static int
+ext2_readdir(struct file *file, struct dir_context *ctx)
+{
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = dir_pages(inode);
+	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
+	unsigned char *types = NULL;
+	int need_revalidate = file->f_version != inode->i_version;
+
+	if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
+		return 0;
+
+	if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
+		types = ext2_filetype_table;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		ext2_dirent *de;
+		struct page *page = ext2_get_page(inode, n, 0);
+
+		if (IS_ERR(page)) {
+			ext2_error(sb, __func__,
+				   "bad page in #%lu",
+				   inode->i_ino);
+			ctx->pos += PAGE_CACHE_SIZE - offset;
+			return PTR_ERR(page);
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = ext2_validate_entry(kaddr, offset, chunk_mask);
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			file->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (ext2_dirent *)(kaddr+offset);
+		limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1);
+		for ( ;(char*)de <= limit; de = ext2_next_entry(de)) {
+			if (de->rec_len == 0) {
+				ext2_error(sb, __func__,
+					"zero-length directory entry");
+				ext2_put_page(page);
+				return -EIO;
+			}
+			if (de->inode) {
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (types && de->file_type < EXT2_FT_MAX)
+					d_type = types[de->file_type];
+
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le32_to_cpu(de->inode),
+						d_type)) {
+					ext2_put_page(page);
+					return 0;
+				}
+			}
+			ctx->pos += ext2_rec_len_from_disk(de->rec_len);
+		}
+		ext2_put_page(page);
+	}
+	return 0;
+}
+
+/*
+ *	ext2_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the page in which the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
+			struct qstr *child, struct page ** res_page)
+{
+	const char *name = child->name;
+	int namelen = child->len;
+	unsigned reclen = EXT2_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	struct ext2_inode_info *ei = EXT2_I(dir);
+	ext2_dirent * de;
+	int dir_has_error = 0;
+
+	if (npages == 0)
+		goto out;
+
+	/* OFFSET_CACHE */
+	*res_page = NULL;
+
+	start = ei->i_dir_start_lookup;
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = ext2_get_page(dir, n, dir_has_error);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (ext2_dirent *) kaddr;
+			kaddr += ext2_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->rec_len == 0) {
+					ext2_error(dir->i_sb, __func__,
+						"zero-length directory entry");
+					ext2_put_page(page);
+					goto out;
+				}
+				if (ext2_match (namelen, name, de))
+					goto found;
+				de = ext2_next_entry(de);
+			}
+			ext2_put_page(page);
+		} else
+			dir_has_error = 1;
+
+		if (++n >= npages)
+			n = 0;
+		/* next page is past the blocks we've got */
+		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
+			ext2_error(dir->i_sb, __func__,
+				"dir %lu size %lld exceeds block count %llu",
+				dir->i_ino, dir->i_size,
+				(unsigned long long)dir->i_blocks);
+			goto out;
+		}
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	ei->i_dir_start_lookup = n;
+	return de;
+}
+
+struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
+{
+	struct page *page = ext2_get_page(dir, 0, 0);
+	ext2_dirent *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = ext2_next_entry((ext2_dirent *) page_address(page));
+		*p = page;
+	}
+	return de;
+}
+
+ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
+{
+	ino_t res = 0;
+	struct ext2_dir_entry_2 *de;
+	struct page *page;
+	
+	de = ext2_find_entry (dir, child, &page);
+	if (de) {
+		res = le32_to_cpu(de->inode);
+		ext2_put_page(page);
+	}
+	return res;
+}
+
+static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	return __block_write_begin(page, pos, len, ext2_get_block);
+}
+
+/* Releases the page */
+void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
+		   struct page *page, struct inode *inode, int update_times)
+{
+	loff_t pos = page_offset(page) +
+			(char *) de - (char *) page_address(page);
+	unsigned len = ext2_rec_len_from_disk(de->rec_len);
+	int err;
+
+	lock_page(page);
+	err = ext2_prepare_chunk(page, pos, len);
+	BUG_ON(err);
+	de->inode = cpu_to_le32(inode->i_ino);
+	ext2_set_de_type(de, inode);
+	err = ext2_commit_chunk(page, pos, len);
+	ext2_put_page(page);
+	if (update_times)
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
+	mark_inode_dirty(dir);
+}
+
+/*
+ *	Parent is locked.
+ */
+int ext2_add_link (struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned chunk_size = ext2_chunk_size(dir);
+	unsigned reclen = EXT2_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	ext2_dirent * de;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	loff_t pos;
+	int err;
+
+	/*
+	 * We take care of directory expansion in the same loop.
+	 * This code plays outside i_size, so it locks the page
+	 * to protect that region.
+	 */
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = ext2_get_page(dir, n, 0);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + ext2_last_byte(dir, n);
+		de = (ext2_dirent *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				/* We hit i_size */
+				name_len = 0;
+				rec_len = chunk_size;
+				de->rec_len = ext2_rec_len_to_disk(chunk_size);
+				de->inode = 0;
+				goto got_it;
+			}
+			if (de->rec_len == 0) {
+				ext2_error(dir->i_sb, __func__,
+					"zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (ext2_match (namelen, name, de))
+				goto out_unlock;
+			name_len = EXT2_DIR_REC_LEN(de->name_len);
+			rec_len = ext2_rec_len_from_disk(de->rec_len);
+			if (!de->inode && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (ext2_dirent *) ((char *) de + rec_len);
+		}
+		unlock_page(page);
+		ext2_put_page(page);
+	}
+	BUG();
+	return -EINVAL;
+
+got_it:
+	pos = page_offset(page) +
+		(char*)de - (char*)page_address(page);
+	err = ext2_prepare_chunk(page, pos, rec_len);
+	if (err)
+		goto out_unlock;
+	if (de->inode) {
+		ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
+		de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
+		de->rec_len = ext2_rec_len_to_disk(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode = cpu_to_le32(inode->i_ino);
+	ext2_set_de_type (de, inode);
+	err = ext2_commit_chunk(page, pos, rec_len);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
+	mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	ext2_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+/*
+ * ext2_delete_entry deletes a directory entry by merging it with the
+ * previous entry. Page is up-to-date. Releases the page.
+ */
+int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
+{
+	struct inode *inode = page->mapping->host;
+	char *kaddr = page_address(page);
+	unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1);
+	unsigned to = ((char *)dir - kaddr) +
+				ext2_rec_len_from_disk(dir->rec_len);
+	loff_t pos;
+	ext2_dirent * pde = NULL;
+	ext2_dirent * de = (ext2_dirent *) (kaddr + from);
+	int err;
+
+	while ((char*)de < (char*)dir) {
+		if (de->rec_len == 0) {
+			ext2_error(inode->i_sb, __func__,
+				"zero-length directory entry");
+			err = -EIO;
+			goto out;
+		}
+		pde = de;
+		de = ext2_next_entry(de);
+	}
+	if (pde)
+		from = (char*)pde - (char*)page_address(page);
+	pos = page_offset(page) + from;
+	lock_page(page);
+	err = ext2_prepare_chunk(page, pos, to - from);
+	BUG_ON(err);
+	if (pde)
+		pde->rec_len = ext2_rec_len_to_disk(to - from);
+	dir->inode = 0;
+	err = ext2_commit_chunk(page, pos, to - from);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
+	mark_inode_dirty(inode);
+out:
+	ext2_put_page(page);
+	return err;
+}
+
+/*
+ * Set the first fragment of directory.
+ */
+int ext2_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
+	unsigned chunk_size = ext2_chunk_size(inode);
+	struct ext2_dir_entry_2 * de;
+	int err;
+	void *kaddr;
+
+	if (!page)
+		return -ENOMEM;
+
+	err = ext2_prepare_chunk(page, 0, chunk_size);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+	kaddr = kmap_atomic(page);
+	memset(kaddr, 0, chunk_size);
+	de = (struct ext2_dir_entry_2 *)kaddr;
+	de->name_len = 1;
+	de->rec_len = ext2_rec_len_to_disk(EXT2_DIR_REC_LEN(1));
+	memcpy (de->name, ".\0\0", 4);
+	de->inode = cpu_to_le32(inode->i_ino);
+	ext2_set_de_type (de, inode);
+
+	de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	de->rec_len = ext2_rec_len_to_disk(chunk_size - EXT2_DIR_REC_LEN(1));
+	de->inode = cpu_to_le32(parent->i_ino);
+	memcpy (de->name, "..\0", 4);
+	ext2_set_de_type (de, inode);
+	kunmap_atomic(kaddr);
+	err = ext2_commit_chunk(page, 0, chunk_size);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int ext2_empty_dir (struct inode * inode)
+{
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+	int dir_has_error = 0;
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		ext2_dirent * de;
+		page = ext2_get_page(inode, i, dir_has_error);
+
+		if (IS_ERR(page)) {
+			dir_has_error = 1;
+			continue;
+		}
+
+		kaddr = page_address(page);
+		de = (ext2_dirent *)kaddr;
+		kaddr += ext2_last_byte(inode, i) - EXT2_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->rec_len == 0) {
+				ext2_error(inode->i_sb, __func__,
+					"zero-length directory entry");
+				printk("kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
+			}
+			if (de->inode != 0) {
+				/* check for . and .. */
+				if (de->name[0] != '.')
+					goto not_empty;
+				if (de->name_len > 2)
+					goto not_empty;
+				if (de->name_len < 2) {
+					if (de->inode !=
+					    cpu_to_le32(inode->i_ino))
+						goto not_empty;
+				} else if (de->name[1] != '.')
+					goto not_empty;
+			}
+			de = ext2_next_entry(de);
+		}
+		ext2_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	ext2_put_page(page);
+	return 0;
+}
+
+const struct file_operations ext2_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= ext2_readdir,
+	.unlocked_ioctl = ext2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext2_compat_ioctl,
+#endif
+	.fsync		= ext2_fsync,
+};
diff --git a/kernel/fs/ext2/ext2.h b/kernel/fs/ext2/ext2.h
new file mode 100644
index 000000000..8d15febd0
--- /dev/null
+++ b/kernel/fs/ext2/ext2.h
@@ -0,0 +1,820 @@
+/*
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/include/linux/minix_fs.h
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+#include <linux/fs.h>
+#include <linux/ext2_fs.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
+#include <linux/rbtree.h>
+
+/* XXX Here for now... not interested in restructing headers JUST now */
+
+/* data type for block offset of block group */
+typedef int ext2_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long ext2_fsblk_t;
+
+#define E2FSBLK "%lu"
+
+struct ext2_reserve_window {
+	ext2_fsblk_t		_rsv_start;	/* First byte reserved */
+	ext2_fsblk_t		_rsv_end;	/* Last byte reserved or 0 */
+};
+
+struct ext2_reserve_window_node {
+	struct rb_node	 	rsv_node;
+	__u32			rsv_goal_size;
+	__u32			rsv_alloc_hit;
+	struct ext2_reserve_window	rsv_window;
+};
+
+struct ext2_block_alloc_info {
+	/* information about reservation window */
+	struct ext2_reserve_window_node	rsv_window_node;
+	/*
+	 * was i_next_alloc_block in ext2_inode_info
+	 * is the logical (file-relative) number of the
+	 * most-recently-allocated block in this file.
+	 * We use this for detecting linearly ascending allocation requests.
+	 */
+	__u32			last_alloc_logical_block;
+	/*
+	 * Was i_next_alloc_goal in ext2_inode_info
+	 * is the *physical* companion to i_next_alloc_block.
+	 * it the the physical block number of the block which was most-recentl
+	 * allocated to this file.  This give us the goal (target) for the next
+	 * allocation when we detect linearly ascending requests.
+	 */
+	ext2_fsblk_t		last_alloc_physical_block;
+};
+
+#define rsv_start rsv_window._rsv_start
+#define rsv_end rsv_window._rsv_end
+
+/*
+ * second extended-fs super-block data in memory
+ */
+struct ext2_sb_info {
+	unsigned long s_frag_size;	/* Size of a fragment in bytes */
+	unsigned long s_frags_per_block;/* Number of fragments per block */
+	unsigned long s_inodes_per_block;/* Number of inodes per block */
+	unsigned long s_frags_per_group;/* Number of fragments in a group */
+	unsigned long s_blocks_per_group;/* Number of blocks in a group */
+	unsigned long s_inodes_per_group;/* Number of inodes in a group */
+	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
+	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
+	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
+	unsigned long s_groups_count;	/* Number of groups in the fs */
+	unsigned long s_overhead_last;  /* Last calculated overhead */
+	unsigned long s_blocks_last;    /* Last seen block count */
+	struct buffer_head * s_sbh;	/* Buffer containing the super block */
+	struct ext2_super_block * s_es;	/* Pointer to the super block in the buffer */
+	struct buffer_head ** s_group_desc;
+	unsigned long  s_mount_opt;
+	unsigned long s_sb_block;
+	kuid_t s_resuid;
+	kgid_t s_resgid;
+	unsigned short s_mount_state;
+	unsigned short s_pad;
+	int s_addr_per_block_bits;
+	int s_desc_per_block_bits;
+	int s_inode_size;
+	int s_first_ino;
+	spinlock_t s_next_gen_lock;
+	u32 s_next_generation;
+	unsigned long s_dir_count;
+	u8 *s_debts;
+	struct percpu_counter s_freeblocks_counter;
+	struct percpu_counter s_freeinodes_counter;
+	struct percpu_counter s_dirs_counter;
+	struct blockgroup_lock *s_blockgroup_lock;
+	/* root of the per fs reservation window tree */
+	spinlock_t s_rsv_window_lock;
+	struct rb_root s_rsv_window_root;
+	struct ext2_reserve_window_node s_rsv_window_head;
+	/*
+	 * s_lock protects against concurrent modifications of s_mount_state,
+	 * s_blocks_last, s_overhead_last and the content of superblock's
+	 * buffer pointed to by sbi->s_es.
+	 *
+	 * Note: It is used in ext2_show_options() to provide a consistent view
+	 * of the mount options.
+	 */
+	spinlock_t s_lock;
+};
+
+static inline spinlock_t *
+sb_bgl_lock(struct ext2_sb_info *sbi, unsigned int block_group)
+{
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
+}
+
+/*
+ * Define EXT2FS_DEBUG to produce debug messages
+ */
+#undef EXT2FS_DEBUG
+
+/*
+ * Define EXT2_RESERVATION to reserve data blocks for expanding files
+ */
+#define EXT2_DEFAULT_RESERVE_BLOCKS     8
+/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
+#define EXT2_MAX_RESERVE_BLOCKS         1027
+#define EXT2_RESERVE_WINDOW_NOT_ALLOCATED 0
+/*
+ * The second extended file system version
+ */
+#define EXT2FS_DATE		"95/08/09"
+#define EXT2FS_VERSION		"0.5b"
+
+/*
+ * Debug code
+ */
+#ifdef EXT2FS_DEBUG
+#	define ext2_debug(f, a...)	{ \
+					printk ("EXT2-fs DEBUG (%s, %d): %s:", \
+						__FILE__, __LINE__, __func__); \
+				  	printk (f, ## a); \
+					}
+#else
+#	define ext2_debug(f, a...)	/**/
+#endif
+
+/*
+ * Special inode numbers
+ */
+#define	EXT2_BAD_INO		 1	/* Bad blocks inode */
+#define EXT2_ROOT_INO		 2	/* Root inode */
+#define EXT2_BOOT_LOADER_INO	 5	/* Boot loader inode */
+#define EXT2_UNDEL_DIR_INO	 6	/* Undelete directory inode */
+
+/* First non-reserved inode for old ext2 filesystems */
+#define EXT2_GOOD_OLD_FIRST_INO	11
+
+static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT2_MIN_BLOCK_SIZE		1024
+#define	EXT2_MAX_BLOCK_SIZE		4096
+#define EXT2_MIN_BLOCK_LOG_SIZE		  10
+#define EXT2_BLOCK_SIZE(s)		((s)->s_blocksize)
+#define	EXT2_ADDR_PER_BLOCK(s)		(EXT2_BLOCK_SIZE(s) / sizeof (__u32))
+#define EXT2_BLOCK_SIZE_BITS(s)		((s)->s_blocksize_bits)
+#define	EXT2_ADDR_PER_BLOCK_BITS(s)	(EXT2_SB(s)->s_addr_per_block_bits)
+#define EXT2_INODE_SIZE(s)		(EXT2_SB(s)->s_inode_size)
+#define EXT2_FIRST_INO(s)		(EXT2_SB(s)->s_first_ino)
+
+/*
+ * Macro-instructions used to manage fragments
+ */
+#define EXT2_MIN_FRAG_SIZE		1024
+#define	EXT2_MAX_FRAG_SIZE		4096
+#define EXT2_MIN_FRAG_LOG_SIZE		  10
+#define EXT2_FRAG_SIZE(s)		(EXT2_SB(s)->s_frag_size)
+#define EXT2_FRAGS_PER_BLOCK(s)		(EXT2_SB(s)->s_frags_per_block)
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext2_group_desc
+{
+	__le32	bg_block_bitmap;		/* Blocks bitmap block */
+	__le32	bg_inode_bitmap;		/* Inodes bitmap block */
+	__le32	bg_inode_table;		/* Inodes table block */
+	__le16	bg_free_blocks_count;	/* Free blocks count */
+	__le16	bg_free_inodes_count;	/* Free inodes count */
+	__le16	bg_used_dirs_count;	/* Directories count */
+	__le16	bg_pad;
+	__le32	bg_reserved[3];
+};
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT2_BLOCKS_PER_GROUP(s)	(EXT2_SB(s)->s_blocks_per_group)
+#define EXT2_DESC_PER_BLOCK(s)		(EXT2_SB(s)->s_desc_per_block)
+#define EXT2_INODES_PER_GROUP(s)	(EXT2_SB(s)->s_inodes_per_group)
+#define EXT2_DESC_PER_BLOCK_BITS(s)	(EXT2_SB(s)->s_desc_per_block_bits)
+
+/*
+ * Constants relative to the data blocks
+ */
+#define	EXT2_NDIR_BLOCKS		12
+#define	EXT2_IND_BLOCK			EXT2_NDIR_BLOCKS
+#define	EXT2_DIND_BLOCK			(EXT2_IND_BLOCK + 1)
+#define	EXT2_TIND_BLOCK			(EXT2_DIND_BLOCK + 1)
+#define	EXT2_N_BLOCKS			(EXT2_TIND_BLOCK + 1)
+
+/*
+ * Inode flags (GETFLAGS/SETFLAGS)
+ */
+#define	EXT2_SECRM_FL			FS_SECRM_FL	/* Secure deletion */
+#define	EXT2_UNRM_FL			FS_UNRM_FL	/* Undelete */
+#define	EXT2_COMPR_FL			FS_COMPR_FL	/* Compress file */
+#define EXT2_SYNC_FL			FS_SYNC_FL	/* Synchronous updates */
+#define EXT2_IMMUTABLE_FL		FS_IMMUTABLE_FL	/* Immutable file */
+#define EXT2_APPEND_FL			FS_APPEND_FL	/* writes to file may only append */
+#define EXT2_NODUMP_FL			FS_NODUMP_FL	/* do not dump file */
+#define EXT2_NOATIME_FL			FS_NOATIME_FL	/* do not update atime */
+/* Reserved for compression usage... */
+#define EXT2_DIRTY_FL			FS_DIRTY_FL
+#define EXT2_COMPRBLK_FL		FS_COMPRBLK_FL	/* One or more compressed clusters */
+#define EXT2_NOCOMP_FL			FS_NOCOMP_FL	/* Don't compress */
+#define EXT2_ECOMPR_FL			FS_ECOMPR_FL	/* Compression error */
+/* End compression flags --- maybe not all used */	
+#define EXT2_BTREE_FL			FS_BTREE_FL	/* btree format dir */
+#define EXT2_INDEX_FL			FS_INDEX_FL	/* hash-indexed directory */
+#define EXT2_IMAGIC_FL			FS_IMAGIC_FL	/* AFS directory */
+#define EXT2_JOURNAL_DATA_FL		FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define EXT2_NOTAIL_FL			FS_NOTAIL_FL	/* file tail should not be merged */
+#define EXT2_DIRSYNC_FL			FS_DIRSYNC_FL	/* dirsync behaviour (directories only) */
+#define EXT2_TOPDIR_FL			FS_TOPDIR_FL	/* Top of directory hierarchies*/
+#define EXT2_RESERVED_FL		FS_RESERVED_FL	/* reserved for ext2 lib */
+
+#define EXT2_FL_USER_VISIBLE		FS_FL_USER_VISIBLE	/* User visible flags */
+#define EXT2_FL_USER_MODIFIABLE		FS_FL_USER_MODIFIABLE	/* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\
+			   EXT2_SYNC_FL | EXT2_NODUMP_FL |\
+			   EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\
+			   EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\
+			   EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT2_REG_FLMASK (~(EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT2_OTHER_FLMASK (EXT2_NODUMP_FL | EXT2_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & EXT2_REG_FLMASK;
+	else
+		return flags & EXT2_OTHER_FLMASK;
+}
+
+/*
+ * ioctl commands
+ */
+#define	EXT2_IOC_GETFLAGS		FS_IOC_GETFLAGS
+#define	EXT2_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define	EXT2_IOC_GETVERSION		FS_IOC_GETVERSION
+#define	EXT2_IOC_SETVERSION		FS_IOC_SETVERSION
+#define	EXT2_IOC_GETRSVSZ		_IOR('f', 5, long)
+#define	EXT2_IOC_SETRSVSZ		_IOW('f', 6, long)
+
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT2_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
+#define EXT2_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
+#define EXT2_IOC32_GETVERSION		FS_IOC32_GETVERSION
+#define EXT2_IOC32_SETVERSION		FS_IOC32_SETVERSION
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext2_inode {
+	__le16	i_mode;		/* File mode */
+	__le16	i_uid;		/* Low 16 bits of Owner Uid */
+	__le32	i_size;		/* Size in bytes */
+	__le32	i_atime;	/* Access time */
+	__le32	i_ctime;	/* Creation time */
+	__le32	i_mtime;	/* Modification time */
+	__le32	i_dtime;	/* Deletion Time */
+	__le16	i_gid;		/* Low 16 bits of Group Id */
+	__le16	i_links_count;	/* Links count */
+	__le32	i_blocks;	/* Blocks count */
+	__le32	i_flags;	/* File flags */
+	union {
+		struct {
+			__le32  l_i_reserved1;
+		} linux1;
+		struct {
+			__le32  h_i_translator;
+		} hurd1;
+		struct {
+			__le32  m_i_reserved1;
+		} masix1;
+	} osd1;				/* OS dependent 1 */
+	__le32	i_block[EXT2_N_BLOCKS];/* Pointers to blocks */
+	__le32	i_generation;	/* File version (for NFS) */
+	__le32	i_file_acl;	/* File ACL */
+	__le32	i_dir_acl;	/* Directory ACL */
+	__le32	i_faddr;	/* Fragment address */
+	union {
+		struct {
+			__u8	l_i_frag;	/* Fragment number */
+			__u8	l_i_fsize;	/* Fragment size */
+			__u16	i_pad1;
+			__le16	l_i_uid_high;	/* these 2 fields    */
+			__le16	l_i_gid_high;	/* were reserved2[0] */
+			__u32	l_i_reserved2;
+		} linux2;
+		struct {
+			__u8	h_i_frag;	/* Fragment number */
+			__u8	h_i_fsize;	/* Fragment size */
+			__le16	h_i_mode_high;
+			__le16	h_i_uid_high;
+			__le16	h_i_gid_high;
+			__le32	h_i_author;
+		} hurd2;
+		struct {
+			__u8	m_i_frag;	/* Fragment number */
+			__u8	m_i_fsize;	/* Fragment size */
+			__u16	m_pad1;
+			__u32	m_i_reserved2[2];
+		} masix2;
+	} osd2;				/* OS dependent 2 */
+};
+
+#define i_size_high	i_dir_acl
+
+#define i_reserved1	osd1.linux1.l_i_reserved1
+#define i_frag		osd2.linux2.l_i_frag
+#define i_fsize		osd2.linux2.l_i_fsize
+#define i_uid_low	i_uid
+#define i_gid_low	i_gid
+#define i_uid_high	osd2.linux2.l_i_uid_high
+#define i_gid_high	osd2.linux2.l_i_gid_high
+#define i_reserved2	osd2.linux2.l_i_reserved2
+
+/*
+ * File system states
+ */
+#define	EXT2_VALID_FS			0x0001	/* Unmounted cleanly */
+#define	EXT2_ERROR_FS			0x0002	/* Errors detected */
+
+/*
+ * Mount flags
+ */
+#define EXT2_MOUNT_CHECK		0x000001  /* Do mount-time checks */
+#define EXT2_MOUNT_OLDALLOC		0x000002  /* Don't use the new Orlov allocator */
+#define EXT2_MOUNT_GRPID		0x000004  /* Create files with directory's group */
+#define EXT2_MOUNT_DEBUG		0x000008  /* Some debugging messages */
+#define EXT2_MOUNT_ERRORS_CONT		0x000010  /* Continue on errors */
+#define EXT2_MOUNT_ERRORS_RO		0x000020  /* Remount fs ro on errors */
+#define EXT2_MOUNT_ERRORS_PANIC		0x000040  /* Panic on errors */
+#define EXT2_MOUNT_MINIX_DF		0x000080  /* Mimics the Minix statfs */
+#define EXT2_MOUNT_NOBH			0x000100  /* No buffer_heads */
+#define EXT2_MOUNT_NO_UID32		0x000200  /* Disable 32-bit UIDs */
+#define EXT2_MOUNT_XATTR_USER		0x004000  /* Extended user attributes */
+#define EXT2_MOUNT_POSIX_ACL		0x008000  /* POSIX Access Control Lists */
+#define EXT2_MOUNT_XIP			0x010000  /* Obsolete, use DAX */
+#define EXT2_MOUNT_USRQUOTA		0x020000  /* user quota */
+#define EXT2_MOUNT_GRPQUOTA		0x040000  /* group quota */
+#define EXT2_MOUNT_RESERVATION		0x080000  /* Preallocation */
+#ifdef CONFIG_FS_DAX
+#define EXT2_MOUNT_DAX			0x100000  /* Direct Access */
+#else
+#define EXT2_MOUNT_DAX			0
+#endif
+
+
+#define clear_opt(o, opt)		o &= ~EXT2_MOUNT_##opt
+#define set_opt(o, opt)			o |= EXT2_MOUNT_##opt
+#define test_opt(sb, opt)		(EXT2_SB(sb)->s_mount_opt & \
+					 EXT2_MOUNT_##opt)
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT2_DFL_MAX_MNT_COUNT		20	/* Allow 20 mounts */
+#define EXT2_DFL_CHECKINTERVAL		0	/* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ */
+#define EXT2_ERRORS_CONTINUE		1	/* Continue execution */
+#define EXT2_ERRORS_RO			2	/* Remount fs read-only */
+#define EXT2_ERRORS_PANIC		3	/* Panic */
+#define EXT2_ERRORS_DEFAULT		EXT2_ERRORS_CONTINUE
+
+/*
+ * Structure of the super block
+ */
+struct ext2_super_block {
+	__le32	s_inodes_count;		/* Inodes count */
+	__le32	s_blocks_count;		/* Blocks count */
+	__le32	s_r_blocks_count;	/* Reserved blocks count */
+	__le32	s_free_blocks_count;	/* Free blocks count */
+	__le32	s_free_inodes_count;	/* Free inodes count */
+	__le32	s_first_data_block;	/* First Data Block */
+	__le32	s_log_block_size;	/* Block size */
+	__le32	s_log_frag_size;	/* Fragment size */
+	__le32	s_blocks_per_group;	/* # Blocks per group */
+	__le32	s_frags_per_group;	/* # Fragments per group */
+	__le32	s_inodes_per_group;	/* # Inodes per group */
+	__le32	s_mtime;		/* Mount time */
+	__le32	s_wtime;		/* Write time */
+	__le16	s_mnt_count;		/* Mount count */
+	__le16	s_max_mnt_count;	/* Maximal mount count */
+	__le16	s_magic;		/* Magic signature */
+	__le16	s_state;		/* File system state */
+	__le16	s_errors;		/* Behaviour when detecting errors */
+	__le16	s_minor_rev_level; 	/* minor revision level */
+	__le32	s_lastcheck;		/* time of last check */
+	__le32	s_checkinterval;	/* max. time between checks */
+	__le32	s_creator_os;		/* OS */
+	__le32	s_rev_level;		/* Revision level */
+	__le16	s_def_resuid;		/* Default uid for reserved blocks */
+	__le16	s_def_resgid;		/* Default gid for reserved blocks */
+	/*
+	 * These fields are for EXT2_DYNAMIC_REV superblocks only.
+	 *
+	 * Note: the difference between the compatible feature set and
+	 * the incompatible feature set is that if there is a bit set
+	 * in the incompatible feature set that the kernel doesn't
+	 * know about, it should refuse to mount the filesystem.
+	 * 
+	 * e2fsck's requirements are more strict; if it doesn't know
+	 * about a feature in either the compatible or incompatible
+	 * feature set, it must abort and not try to meddle with
+	 * things it doesn't understand...
+	 */
+	__le32	s_first_ino; 		/* First non-reserved inode */
+	__le16   s_inode_size; 		/* size of inode structure */
+	__le16	s_block_group_nr; 	/* block group # of this superblock */
+	__le32	s_feature_compat; 	/* compatible feature set */
+	__le32	s_feature_incompat; 	/* incompatible feature set */
+	__le32	s_feature_ro_compat; 	/* readonly-compatible feature set */
+	__u8	s_uuid[16];		/* 128-bit uuid for volume */
+	char	s_volume_name[16]; 	/* volume name */
+	char	s_last_mounted[64]; 	/* directory where last mounted */
+	__le32	s_algorithm_usage_bitmap; /* For compression */
+	/*
+	 * Performance hints.  Directory preallocation should only
+	 * happen if the EXT2_COMPAT_PREALLOC flag is on.
+	 */
+	__u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
+	__u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
+	__u16	s_padding1;
+	/*
+	 * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+	 */
+	__u8	s_journal_uuid[16];	/* uuid of journal superblock */
+	__u32	s_journal_inum;		/* inode number of journal file */
+	__u32	s_journal_dev;		/* device number of journal file */
+	__u32	s_last_orphan;		/* start of list of inodes to delete */
+	__u32	s_hash_seed[4];		/* HTREE hash seed */
+	__u8	s_def_hash_version;	/* Default hash version to use */
+	__u8	s_reserved_char_pad;
+	__u16	s_reserved_word_pad;
+	__le32	s_default_mount_opts;
+ 	__le32	s_first_meta_bg; 	/* First metablock block group */
+	__u32	s_reserved[190];	/* Padding to the end of the block */
+};
+
+/*
+ * Codes for operating systems
+ */
+#define EXT2_OS_LINUX		0
+#define EXT2_OS_HURD		1
+#define EXT2_OS_MASIX		2
+#define EXT2_OS_FREEBSD		3
+#define EXT2_OS_LITES		4
+
+/*
+ * Revision levels
+ */
+#define EXT2_GOOD_OLD_REV	0	/* The good old (original) format */
+#define EXT2_DYNAMIC_REV	1 	/* V2 format w/ dynamic inode sizes */
+
+#define EXT2_CURRENT_REV	EXT2_GOOD_OLD_REV
+#define EXT2_MAX_SUPP_REV	EXT2_DYNAMIC_REV
+
+#define EXT2_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ */
+
+#define EXT2_HAS_COMPAT_FEATURE(sb,mask)			\
+	( EXT2_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+#define EXT2_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+	( EXT2_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+#define EXT2_HAS_INCOMPAT_FEATURE(sb,mask)			\
+	( EXT2_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+#define EXT2_SET_COMPAT_FEATURE(sb,mask)			\
+	EXT2_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+#define EXT2_SET_RO_COMPAT_FEATURE(sb,mask)			\
+	EXT2_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+#define EXT2_SET_INCOMPAT_FEATURE(sb,mask)			\
+	EXT2_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+#define EXT2_CLEAR_COMPAT_FEATURE(sb,mask)			\
+	EXT2_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+#define EXT2_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+	EXT2_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+#define EXT2_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+	EXT2_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+#define EXT2_FEATURE_COMPAT_DIR_PREALLOC	0x0001
+#define EXT2_FEATURE_COMPAT_IMAGIC_INODES	0x0002
+#define EXT3_FEATURE_COMPAT_HAS_JOURNAL		0x0004
+#define EXT2_FEATURE_COMPAT_EXT_ATTR		0x0008
+#define EXT2_FEATURE_COMPAT_RESIZE_INO		0x0010
+#define EXT2_FEATURE_COMPAT_DIR_INDEX		0x0020
+#define EXT2_FEATURE_COMPAT_ANY			0xffffffff
+
+#define EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
+#define EXT2_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
+#define EXT2_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
+#define EXT2_FEATURE_RO_COMPAT_ANY		0xffffffff
+
+#define EXT2_FEATURE_INCOMPAT_COMPRESSION	0x0001
+#define EXT2_FEATURE_INCOMPAT_FILETYPE		0x0002
+#define EXT3_FEATURE_INCOMPAT_RECOVER		0x0004
+#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008
+#define EXT2_FEATURE_INCOMPAT_META_BG		0x0010
+#define EXT2_FEATURE_INCOMPAT_ANY		0xffffffff
+
+#define EXT2_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT2_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT2_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT2_FEATURE_RO_COMPAT_BTREE_DIR)
+#define EXT2_FEATURE_RO_COMPAT_UNSUPPORTED	~EXT2_FEATURE_RO_COMPAT_SUPP
+#define EXT2_FEATURE_INCOMPAT_UNSUPPORTED	~EXT2_FEATURE_INCOMPAT_SUPP
+
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define	EXT2_DEF_RESUID		0
+#define	EXT2_DEF_RESGID		0
+
+/*
+ * Default mount options
+ */
+#define EXT2_DEFM_DEBUG		0x0001
+#define EXT2_DEFM_BSDGROUPS	0x0002
+#define EXT2_DEFM_XATTR_USER	0x0004
+#define EXT2_DEFM_ACL		0x0008
+#define EXT2_DEFM_UID16		0x0010
+    /* Not used by ext2, but reserved for use by ext3 */
+#define EXT3_DEFM_JMODE		0x0060 
+#define EXT3_DEFM_JMODE_DATA	0x0020
+#define EXT3_DEFM_JMODE_ORDERED	0x0040
+#define EXT3_DEFM_JMODE_WBACK	0x0060
+
+/*
+ * Structure of a directory entry
+ */
+
+struct ext2_dir_entry {
+	__le32	inode;			/* Inode number */
+	__le16	rec_len;		/* Directory entry length */
+	__le16	name_len;		/* Name length */
+	char	name[];			/* File name, up to EXT2_NAME_LEN */
+};
+
+/*
+ * The new version of the directory entry.  Since EXT2 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct ext2_dir_entry_2 {
+	__le32	inode;			/* Inode number */
+	__le16	rec_len;		/* Directory entry length */
+	__u8	name_len;		/* Name length */
+	__u8	file_type;
+	char	name[];			/* File name, up to EXT2_NAME_LEN */
+};
+
+/*
+ * Ext2 directory file types.  Only the low 3 bits are used.  The
+ * other bits are reserved for now.
+ */
+enum {
+	EXT2_FT_UNKNOWN		= 0,
+	EXT2_FT_REG_FILE	= 1,
+	EXT2_FT_DIR		= 2,
+	EXT2_FT_CHRDEV		= 3,
+	EXT2_FT_BLKDEV		= 4,
+	EXT2_FT_FIFO		= 5,
+	EXT2_FT_SOCK		= 6,
+	EXT2_FT_SYMLINK		= 7,
+	EXT2_FT_MAX
+};
+
+/*
+ * EXT2_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define EXT2_DIR_PAD		 	4
+#define EXT2_DIR_ROUND 			(EXT2_DIR_PAD - 1)
+#define EXT2_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT2_DIR_ROUND) & \
+					 ~EXT2_DIR_ROUND)
+#define EXT2_MAX_REC_LEN		((1<<16)-1)
+
+static inline void verify_offsets(void)
+{
+#define A(x,y) BUILD_BUG_ON(x != offsetof(struct ext2_super_block, y));
+	A(EXT2_SB_MAGIC_OFFSET, s_magic);
+	A(EXT2_SB_BLOCKS_OFFSET, s_blocks_count);
+	A(EXT2_SB_BSIZE_OFFSET, s_log_block_size);
+#undef A
+}
+
+/*
+ * ext2 mount options
+ */
+struct ext2_mount_options {
+	unsigned long s_mount_opt;
+	kuid_t s_resuid;
+	kgid_t s_resgid;
+};
+
+/*
+ * second extended file system inode data in memory
+ */
+struct ext2_inode_info {
+	__le32	i_data[15];
+	__u32	i_flags;
+	__u32	i_faddr;
+	__u8	i_frag_no;
+	__u8	i_frag_size;
+	__u16	i_state;
+	__u32	i_file_acl;
+	__u32	i_dir_acl;
+	__u32	i_dtime;
+
+	/*
+	 * i_block_group is the number of the block group which contains
+	 * this file's inode.  Constant across the lifetime of the inode,
+	 * it is used for making block allocation decisions - we try to
+	 * place a file's data blocks near its inode block, and new inodes
+	 * near to their parent directory's inode.
+	 */
+	__u32	i_block_group;
+
+	/* block reservation info */
+	struct ext2_block_alloc_info *i_block_alloc_info;
+
+	__u32	i_dir_start_lookup;
+#ifdef CONFIG_EXT2_FS_XATTR
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_mutex even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
+	rwlock_t i_meta_lock;
+
+	/*
+	 * truncate_mutex is for serialising ext2_truncate() against
+	 * ext2_getblock().  It also protects the internals of the inode's
+	 * reservation data structures: ext2_reserve_window and
+	 * ext2_reserve_window_node.
+	 */
+	struct mutex truncate_mutex;
+	struct inode	vfs_inode;
+	struct list_head i_orphan;	/* unlinked but open inodes */
+#ifdef CONFIG_QUOTA
+	struct dquot *i_dquot[MAXQUOTAS];
+#endif
+};
+
+/*
+ * Inode dynamic state flags
+ */
+#define EXT2_STATE_NEW			0x00000001 /* inode is newly created */
+
+
+/*
+ * Function prototypes
+ */
+
+/*
+ * Ok, these declarations are also in <linux/kernel.h> but none of the
+ * ext2 source programs needs to include it so they are duplicated here.
+ */
+
+static inline struct ext2_inode_info *EXT2_I(struct inode *inode)
+{
+	return container_of(inode, struct ext2_inode_info, vfs_inode);
+}
+
+/* balloc.c */
+extern int ext2_bg_has_super(struct super_block *sb, int group);
+extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group);
+extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *);
+extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long,
+				unsigned long *, int *);
+extern void ext2_free_blocks (struct inode *, unsigned long,
+			      unsigned long);
+extern unsigned long ext2_count_free_blocks (struct super_block *);
+extern unsigned long ext2_count_dirs (struct super_block *);
+extern void ext2_check_blocks_bitmap (struct super_block *);
+extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
+						    unsigned int block_group,
+						    struct buffer_head ** bh);
+extern void ext2_discard_reservation (struct inode *);
+extern int ext2_should_retry_alloc(struct super_block *sb, int *retries);
+extern void ext2_init_block_alloc_info(struct inode *);
+extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv);
+
+/* dir.c */
+extern int ext2_add_link (struct dentry *, struct inode *);
+extern ino_t ext2_inode_by_name(struct inode *, struct qstr *);
+extern int ext2_make_empty(struct inode *, struct inode *);
+extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **);
+extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
+extern int ext2_empty_dir (struct inode *);
+extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
+extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
+
+/* ialloc.c */
+extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *);
+extern void ext2_free_inode (struct inode *);
+extern unsigned long ext2_count_free_inodes (struct super_block *);
+extern void ext2_check_inodes_bitmap (struct super_block *);
+extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
+
+/* inode.c */
+extern struct inode *ext2_iget (struct super_block *, unsigned long);
+extern int ext2_write_inode (struct inode *, struct writeback_control *);
+extern void ext2_evict_inode(struct inode *);
+extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern int ext2_setattr (struct dentry *, struct iattr *);
+extern void ext2_set_inode_flags(struct inode *inode);
+extern void ext2_get_inode_flags(struct ext2_inode_info *);
+extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len);
+
+/* ioctl.c */
+extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
+extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* namei.c */
+struct dentry *ext2_get_parent(struct dentry *child);
+
+/* super.c */
+extern __printf(3, 4)
+void ext2_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext2_msg(struct super_block *, const char *, const char *, ...);
+extern void ext2_update_dynamic_rev (struct super_block *sb);
+extern void ext2_write_super (struct super_block *);
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations ext2_dir_operations;
+
+/* file.c */
+extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync);
+extern const struct inode_operations ext2_file_inode_operations;
+extern const struct file_operations ext2_file_operations;
+
+/* inode.c */
+extern const struct address_space_operations ext2_aops;
+extern const struct address_space_operations ext2_nobh_aops;
+
+/* namei.c */
+extern const struct inode_operations ext2_dir_inode_operations;
+extern const struct inode_operations ext2_special_inode_operations;
+
+/* symlink.c */
+extern const struct inode_operations ext2_fast_symlink_inode_operations;
+extern const struct inode_operations ext2_symlink_inode_operations;
+
+static inline ext2_fsblk_t
+ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
+{
+	return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
+		le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
+}
+
+#define ext2_set_bit	__test_and_set_bit_le
+#define ext2_clear_bit	__test_and_clear_bit_le
+#define ext2_test_bit	test_bit_le
+#define ext2_find_first_zero_bit	find_first_zero_bit_le
+#define ext2_find_next_zero_bit		find_next_zero_bit_le
diff --git a/kernel/fs/ext2/file.c b/kernel/fs/ext2/file.c
new file mode 100644
index 000000000..3a0a6c640
--- /dev/null
+++ b/kernel/fs/ext2/file.c
@@ -0,0 +1,121 @@
+/*
+ *  linux/fs/ext2/file.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/file.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext2 fs regular file handling primitives
+ *
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ * 	(jj@sunsite.ms.mff.cuni.cz)
+ */
+
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include "ext2.h"
+#include "xattr.h"
+#include "acl.h"
+
+#ifdef CONFIG_FS_DAX
+static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_fault(vma, vmf, ext2_get_block);
+}
+
+static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_mkwrite(vma, vmf, ext2_get_block);
+}
+
+static const struct vm_operations_struct ext2_dax_vm_ops = {
+	.fault		= ext2_dax_fault,
+	.page_mkwrite	= ext2_dax_mkwrite,
+	.pfn_mkwrite	= dax_pfn_mkwrite,
+};
+
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!IS_DAX(file_inode(file)))
+		return generic_file_mmap(file, vma);
+
+	file_accessed(file);
+	vma->vm_ops = &ext2_dax_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
+}
+#else
+#define ext2_file_mmap	generic_file_mmap
+#endif
+
+/*
+ * Called when filp is released. This happens when all file descriptors
+ * for a single struct file are closed. Note that different open() calls
+ * for the same file yield different struct file structures.
+ */
+static int ext2_release_file (struct inode * inode, struct file * filp)
+{
+	if (filp->f_mode & FMODE_WRITE) {
+		mutex_lock(&EXT2_I(inode)->truncate_mutex);
+		ext2_discard_reservation(inode);
+		mutex_unlock(&EXT2_I(inode)->truncate_mutex);
+	}
+	return 0;
+}
+
+int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	int ret;
+	struct super_block *sb = file->f_mapping->host->i_sb;
+	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+
+	ret = generic_file_fsync(file, start, end, datasync);
+	if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
+		/* We don't really know where the IO error happened... */
+		ext2_error(sb, __func__,
+			   "detected IO error when writing metadata buffers");
+		ret = -EIO;
+	}
+	return ret;
+}
+
+/*
+ * We have mostly NULL's here: the current defaults are ok for
+ * the ext2 filesystem.
+ */
+const struct file_operations ext2_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.unlocked_ioctl = ext2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext2_compat_ioctl,
+#endif
+	.mmap		= ext2_file_mmap,
+	.open		= dquot_file_open,
+	.release	= ext2_release_file,
+	.fsync		= ext2_fsync,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+};
+
+const struct inode_operations ext2_file_inode_operations = {
+#ifdef CONFIG_EXT2_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext2_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.setattr	= ext2_setattr,
+	.get_acl	= ext2_get_acl,
+	.set_acl	= ext2_set_acl,
+	.fiemap		= ext2_fiemap,
+};
diff --git a/kernel/fs/ext2/ialloc.c b/kernel/fs/ext2/ialloc.c
new file mode 100644
index 000000000..5c04a0dde
--- /dev/null
+++ b/kernel/fs/ext2/ialloc.c
@@ -0,0 +1,675 @@
+/*
+ *  linux/fs/ext2/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  BSD ufs-inspired inode and directory allocation by 
+ *  Stephen Tweedie (sct@dcs.ed.ac.uk), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/quotaops.h>
+#include <linux/sched.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/random.h>
+#include "ext2.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * ialloc.c contains the inodes allocation and deallocation routines
+ */
+
+/*
+ * The free inodes are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.
+ */
+
+
+/*
+ * Read the inode allocation bitmap for a given block_group, reading
+ * into the specified slot in the superblock's bitmap cache.
+ *
+ * Return buffer_head of bitmap on success or NULL.
+ */
+static struct buffer_head *
+read_inode_bitmap(struct super_block * sb, unsigned long block_group)
+{
+	struct ext2_group_desc *desc;
+	struct buffer_head *bh = NULL;
+
+	desc = ext2_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		goto error_out;
+
+	bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
+	if (!bh)
+		ext2_error(sb, "read_inode_bitmap",
+			    "Cannot read inode bitmap - "
+			    "block_group = %lu, inode_bitmap = %u",
+			    block_group, le32_to_cpu(desc->bg_inode_bitmap));
+error_out:
+	return bh;
+}
+
+static void ext2_release_inode(struct super_block *sb, int group, int dir)
+{
+	struct ext2_group_desc * desc;
+	struct buffer_head *bh;
+
+	desc = ext2_get_group_desc(sb, group, &bh);
+	if (!desc) {
+		ext2_error(sb, "ext2_release_inode",
+			"can't get descriptor for group %d", group);
+		return;
+	}
+
+	spin_lock(sb_bgl_lock(EXT2_SB(sb), group));
+	le16_add_cpu(&desc->bg_free_inodes_count, 1);
+	if (dir)
+		le16_add_cpu(&desc->bg_used_dirs_count, -1);
+	spin_unlock(sb_bgl_lock(EXT2_SB(sb), group));
+	if (dir)
+		percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter);
+	mark_buffer_dirty(bh);
+}
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ *
+ * HOWEVER: we must make sure that we get no aliases,
+ * which means that we have to call "clear_inode()"
+ * _before_ we mark the inode not in use in the inode
+ * bitmaps. Otherwise a newly created file might use
+ * the same inode number (not actually the same pointer
+ * though), and then we'd have two inodes sharing the
+ * same inode number and space on the harddisk.
+ */
+void ext2_free_inode (struct inode * inode)
+{
+	struct super_block * sb = inode->i_sb;
+	int is_directory;
+	unsigned long ino;
+	struct buffer_head *bitmap_bh;
+	unsigned long block_group;
+	unsigned long bit;
+	struct ext2_super_block * es;
+
+	ino = inode->i_ino;
+	ext2_debug ("freeing inode %lu\n", ino);
+
+	/*
+	 * Note: we must free any quota before locking the superblock,
+	 * as writing the quota to disk may need the lock as well.
+	 */
+	/* Quota is already initialized in iput() */
+	dquot_free_inode(inode);
+	dquot_drop(inode);
+
+	es = EXT2_SB(sb)->s_es;
+	is_directory = S_ISDIR(inode->i_mode);
+
+	if (ino < EXT2_FIRST_INO(sb) ||
+	    ino > le32_to_cpu(es->s_inodes_count)) {
+		ext2_error (sb, "ext2_free_inode",
+			    "reserved or nonexistent inode %lu", ino);
+		return;
+	}
+	block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
+	bitmap_bh = read_inode_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		return;
+
+	/* Ok, now we can actually update the inode bitmaps.. */
+	if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
+				bit, (void *) bitmap_bh->b_data))
+		ext2_error (sb, "ext2_free_inode",
+			      "bit already cleared for inode %lu", ino);
+	else
+		ext2_release_inode(sb, block_group, is_directory);
+	mark_buffer_dirty(bitmap_bh);
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		sync_dirty_buffer(bitmap_bh);
+
+	brelse(bitmap_bh);
+}
+
+/*
+ * We perform asynchronous prereading of the new inode's inode block when
+ * we create the inode, in the expectation that the inode will be written
+ * back soon.  There are two reasons:
+ *
+ * - When creating a large number of files, the async prereads will be
+ *   nicely merged into large reads
+ * - When writing out a large number of inodes, we don't need to keep on
+ *   stalling the writes while we read the inode block.
+ *
+ * FIXME: ext2_get_group_desc() needs to be simplified.
+ */
+static void ext2_preread_inode(struct inode *inode)
+{
+	unsigned long block_group;
+	unsigned long offset;
+	unsigned long block;
+	struct ext2_group_desc * gdp;
+	struct backing_dev_info *bdi;
+
+	bdi = inode_to_bdi(inode);
+	if (bdi_read_congested(bdi))
+		return;
+	if (bdi_write_congested(bdi))
+		return;
+
+	block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
+	gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL);
+	if (gdp == NULL)
+		return;
+
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) *
+				EXT2_INODE_SIZE(inode->i_sb);
+	block = le32_to_cpu(gdp->bg_inode_table) +
+				(offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb));
+	sb_breadahead(inode->i_sb, block);
+}
+
+/*
+ * There are two policies for allocating an inode.  If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory\'s block
+ * group to find a free inode.
+ */
+static int find_group_dir(struct super_block *sb, struct inode *parent)
+{
+	int ngroups = EXT2_SB(sb)->s_groups_count;
+	int avefreei = ext2_count_free_inodes(sb) / ngroups;
+	struct ext2_group_desc *desc, *best_desc = NULL;
+	int group, best_group = -1;
+
+	for (group = 0; group < ngroups; group++) {
+		desc = ext2_get_group_desc (sb, group, NULL);
+		if (!desc || !desc->bg_free_inodes_count)
+			continue;
+		if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+			continue;
+		if (!best_desc || 
+		    (le16_to_cpu(desc->bg_free_blocks_count) >
+		     le16_to_cpu(best_desc->bg_free_blocks_count))) {
+			best_group = group;
+			best_desc = desc;
+		}
+	}
+	if (!best_desc)
+		return -1;
+
+	return best_group;
+}
+
+/* 
+ * Orlov's allocator for directories. 
+ * 
+ * We always try to spread first-level directories.
+ *
+ * If there are blockgroups with both free inodes and free blocks counts 
+ * not worse than average we return one with smallest directory count. 
+ * Otherwise we simply return a random group. 
+ * 
+ * For the rest rules look so: 
+ * 
+ * It's OK to put directory into a group unless 
+ * it has too many directories already (max_dirs) or 
+ * it has too few free inodes left (min_inodes) or 
+ * it has too few free blocks left (min_blocks) or 
+ * it's already running too large debt (max_debt). 
+ * Parent's group is preferred, if it doesn't satisfy these 
+ * conditions we search cyclically through the rest. If none 
+ * of the groups look good we just look for a group with more 
+ * free inodes than average (starting at parent's group). 
+ * 
+ * Debt is incremented each time we allocate a directory and decremented 
+ * when we allocate an inode, within 0--255. 
+ */ 
+
+#define INODE_COST 64
+#define BLOCK_COST 256
+
+static int find_group_orlov(struct super_block *sb, struct inode *parent)
+{
+	int parent_group = EXT2_I(parent)->i_block_group;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_super_block *es = sbi->s_es;
+	int ngroups = sbi->s_groups_count;
+	int inodes_per_group = EXT2_INODES_PER_GROUP(sb);
+	int freei;
+	int avefreei;
+	int free_blocks;
+	int avefreeb;
+	int blocks_per_dir;
+	int ndirs;
+	int max_debt, max_dirs, min_blocks, min_inodes;
+	int group = -1, i;
+	struct ext2_group_desc *desc;
+
+	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+	avefreei = freei / ngroups;
+	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	avefreeb = free_blocks / ngroups;
+	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+
+	if ((parent == d_inode(sb->s_root)) ||
+	    (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) {
+		struct ext2_group_desc *best_desc = NULL;
+		int best_ndir = inodes_per_group;
+		int best_group = -1;
+
+		group = prandom_u32();
+		parent_group = (unsigned)group % ngroups;
+		for (i = 0; i < ngroups; i++) {
+			group = (parent_group + i) % ngroups;
+			desc = ext2_get_group_desc (sb, group, NULL);
+			if (!desc || !desc->bg_free_inodes_count)
+				continue;
+			if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+				continue;
+			if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+				continue;
+			if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+				continue;
+			best_group = group;
+			best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+			best_desc = desc;
+		}
+		if (best_group >= 0) {
+			desc = best_desc;
+			group = best_group;
+			goto found;
+		}
+		goto fallback;
+	}
+
+	if (ndirs == 0)
+		ndirs = 1;	/* percpu_counters are approximate... */
+
+	blocks_per_dir = (le32_to_cpu(es->s_blocks_count)-free_blocks) / ndirs;
+
+	max_dirs = ndirs / ngroups + inodes_per_group / 16;
+	min_inodes = avefreei - inodes_per_group / 4;
+	min_blocks = avefreeb - EXT2_BLOCKS_PER_GROUP(sb) / 4;
+
+	max_debt = EXT2_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
+	if (max_debt * INODE_COST > inodes_per_group)
+		max_debt = inodes_per_group / INODE_COST;
+	if (max_debt > 255)
+		max_debt = 255;
+	if (max_debt == 0)
+		max_debt = 1;
+
+	for (i = 0; i < ngroups; i++) {
+		group = (parent_group + i) % ngroups;
+		desc = ext2_get_group_desc (sb, group, NULL);
+		if (!desc || !desc->bg_free_inodes_count)
+			continue;
+		if (sbi->s_debts[group] >= max_debt)
+			continue;
+		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+			continue;
+		if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+			continue;
+		if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+			continue;
+		goto found;
+	}
+
+fallback:
+	for (i = 0; i < ngroups; i++) {
+		group = (parent_group + i) % ngroups;
+		desc = ext2_get_group_desc (sb, group, NULL);
+		if (!desc || !desc->bg_free_inodes_count)
+			continue;
+		if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+			goto found;
+	}
+
+	if (avefreei) {
+		/*
+		 * The free-inodes counter is approximate, and for really small
+		 * filesystems the above test can fail to find any blockgroups
+		 */
+		avefreei = 0;
+		goto fallback;
+	}
+
+	return -1;
+
+found:
+	return group;
+}
+
+static int find_group_other(struct super_block *sb, struct inode *parent)
+{
+	int parent_group = EXT2_I(parent)->i_block_group;
+	int ngroups = EXT2_SB(sb)->s_groups_count;
+	struct ext2_group_desc *desc;
+	int group, i;
+
+	/*
+	 * Try to place the inode in its parent directory
+	 */
+	group = parent_group;
+	desc = ext2_get_group_desc (sb, group, NULL);
+	if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+			le16_to_cpu(desc->bg_free_blocks_count))
+		goto found;
+
+	/*
+	 * We're going to place this inode in a different blockgroup from its
+	 * parent.  We want to cause files in a common directory to all land in
+	 * the same blockgroup.  But we want files which are in a different
+	 * directory which shares a blockgroup with our parent to land in a
+	 * different blockgroup.
+	 *
+	 * So add our directory's i_ino into the starting point for the hash.
+	 */
+	group = (group + parent->i_ino) % ngroups;
+
+	/*
+	 * Use a quadratic hash to find a group with a free inode and some
+	 * free blocks.
+	 */
+	for (i = 1; i < ngroups; i <<= 1) {
+		group += i;
+		if (group >= ngroups)
+			group -= ngroups;
+		desc = ext2_get_group_desc (sb, group, NULL);
+		if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+				le16_to_cpu(desc->bg_free_blocks_count))
+			goto found;
+	}
+
+	/*
+	 * That failed: try linear search for a free inode, even if that group
+	 * has no free blocks.
+	 */
+	group = parent_group;
+	for (i = 0; i < ngroups; i++) {
+		if (++group >= ngroups)
+			group = 0;
+		desc = ext2_get_group_desc (sb, group, NULL);
+		if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+			goto found;
+	}
+
+	return -1;
+
+found:
+	return group;
+}
+
+struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
+			     const struct qstr *qstr)
+{
+	struct super_block *sb;
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bh2;
+	int group, i;
+	ino_t ino = 0;
+	struct inode * inode;
+	struct ext2_group_desc *gdp;
+	struct ext2_super_block *es;
+	struct ext2_inode_info *ei;
+	struct ext2_sb_info *sbi;
+	int err;
+
+	sb = dir->i_sb;
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	ei = EXT2_I(inode);
+	sbi = EXT2_SB(sb);
+	es = sbi->s_es;
+	if (S_ISDIR(mode)) {
+		if (test_opt(sb, OLDALLOC))
+			group = find_group_dir(sb, dir);
+		else
+			group = find_group_orlov(sb, dir);
+	} else 
+		group = find_group_other(sb, dir);
+
+	if (group == -1) {
+		err = -ENOSPC;
+		goto fail;
+	}
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		gdp = ext2_get_group_desc(sb, group, &bh2);
+		brelse(bitmap_bh);
+		bitmap_bh = read_inode_bitmap(sb, group);
+		if (!bitmap_bh) {
+			err = -EIO;
+			goto fail;
+		}
+		ino = 0;
+
+repeat_in_this_group:
+		ino = ext2_find_next_zero_bit((unsigned long *)bitmap_bh->b_data,
+					      EXT2_INODES_PER_GROUP(sb), ino);
+		if (ino >= EXT2_INODES_PER_GROUP(sb)) {
+			/*
+			 * Rare race: find_group_xx() decided that there were
+			 * free inodes in this group, but by the time we tried
+			 * to allocate one, they're all gone.  This can also
+			 * occur because the counters which find_group_orlov()
+			 * uses are approximate.  So just go and search the
+			 * next block group.
+			 */
+			if (++group == sbi->s_groups_count)
+				group = 0;
+			continue;
+		}
+		if (ext2_set_bit_atomic(sb_bgl_lock(sbi, group),
+						ino, bitmap_bh->b_data)) {
+			/* we lost this inode */
+			if (++ino >= EXT2_INODES_PER_GROUP(sb)) {
+				/* this group is exhausted, try next group */
+				if (++group == sbi->s_groups_count)
+					group = 0;
+				continue;
+			}
+			/* try to find free inode in the same group */
+			goto repeat_in_this_group;
+		}
+		goto got;
+	}
+
+	/*
+	 * Scanned all blockgroups.
+	 */
+	err = -ENOSPC;
+	goto fail;
+got:
+	mark_buffer_dirty(bitmap_bh);
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		sync_dirty_buffer(bitmap_bh);
+	brelse(bitmap_bh);
+
+	ino += group * EXT2_INODES_PER_GROUP(sb) + 1;
+	if (ino < EXT2_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+		ext2_error (sb, "ext2_new_inode",
+			    "reserved inode or inode > inodes count - "
+			    "block_group = %d,inode=%lu", group,
+			    (unsigned long) ino);
+		err = -EIO;
+		goto fail;
+	}
+
+	percpu_counter_add(&sbi->s_freeinodes_counter, -1);
+	if (S_ISDIR(mode))
+		percpu_counter_inc(&sbi->s_dirs_counter);
+
+	spin_lock(sb_bgl_lock(sbi, group));
+	le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+	if (S_ISDIR(mode)) {
+		if (sbi->s_debts[group] < 255)
+			sbi->s_debts[group]++;
+		le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+	} else {
+		if (sbi->s_debts[group])
+			sbi->s_debts[group]--;
+	}
+	spin_unlock(sb_bgl_lock(sbi, group));
+
+	mark_buffer_dirty(bh2);
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
+	} else
+		inode_init_owner(inode, dir, mode);
+
+	inode->i_ino = ino;
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	memset(ei->i_data, 0, sizeof(ei->i_data));
+	ei->i_flags =
+		ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
+	ei->i_faddr = 0;
+	ei->i_frag_no = 0;
+	ei->i_frag_size = 0;
+	ei->i_file_acl = 0;
+	ei->i_dir_acl = 0;
+	ei->i_dtime = 0;
+	ei->i_block_alloc_info = NULL;
+	ei->i_block_group = group;
+	ei->i_dir_start_lookup = 0;
+	ei->i_state = EXT2_STATE_NEW;
+	ext2_set_inode_flags(inode);
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+	if (insert_inode_locked(inode) < 0) {
+		ext2_error(sb, "ext2_new_inode",
+			   "inode number already in use - inode=%lu",
+			   (unsigned long) ino);
+		err = -EIO;
+		goto fail;
+	}
+
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
+		goto fail_drop;
+
+	err = ext2_init_acl(inode, dir);
+	if (err)
+		goto fail_free_drop;
+
+	err = ext2_init_security(inode, dir, qstr);
+	if (err)
+		goto fail_free_drop;
+
+	mark_inode_dirty(inode);
+	ext2_debug("allocating inode %lu\n", inode->i_ino);
+	ext2_preread_inode(inode);
+	return inode;
+
+fail_free_drop:
+	dquot_free_inode(inode);
+
+fail_drop:
+	dquot_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return ERR_PTR(err);
+
+fail:
+	make_bad_inode(inode);
+	iput(inode);
+	return ERR_PTR(err);
+}
+
+unsigned long ext2_count_free_inodes (struct super_block * sb)
+{
+	struct ext2_group_desc *desc;
+	unsigned long desc_count = 0;
+	int i;	
+
+#ifdef EXT2FS_DEBUG
+	struct ext2_super_block *es;
+	unsigned long bitmap_count = 0;
+	struct buffer_head *bitmap_bh = NULL;
+
+	es = EXT2_SB(sb)->s_es;
+	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+		unsigned x;
+
+		desc = ext2_get_group_desc (sb, i, NULL);
+		if (!desc)
+			continue;
+		desc_count += le16_to_cpu(desc->bg_free_inodes_count);
+		brelse(bitmap_bh);
+		bitmap_bh = read_inode_bitmap(sb, i);
+		if (!bitmap_bh)
+			continue;
+
+		x = ext2_count_free(bitmap_bh, EXT2_INODES_PER_GROUP(sb) / 8);
+		printk("group %d: stored = %d, counted = %u\n",
+			i, le16_to_cpu(desc->bg_free_inodes_count), x);
+		bitmap_count += x;
+	}
+	brelse(bitmap_bh);
+	printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
+		(unsigned long)
+		percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter),
+		desc_count, bitmap_count);
+	return desc_count;
+#else
+	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+		desc = ext2_get_group_desc (sb, i, NULL);
+		if (!desc)
+			continue;
+		desc_count += le16_to_cpu(desc->bg_free_inodes_count);
+	}
+	return desc_count;
+#endif
+}
+
+/* Called at mount-time, super-block is locked */
+unsigned long ext2_count_dirs (struct super_block * sb)
+{
+	unsigned long count = 0;
+	int i;
+
+	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+		struct ext2_group_desc *gdp = ext2_get_group_desc (sb, i, NULL);
+		if (!gdp)
+			continue;
+		count += le16_to_cpu(gdp->bg_used_dirs_count);
+	}
+	return count;
+}
+
diff --git a/kernel/fs/ext2/inode.c b/kernel/fs/ext2/inode.c
new file mode 100644
index 000000000..f460ae36d
--- /dev/null
+++ b/kernel/fs/ext2/inode.c
@@ -0,0 +1,1573 @@
+/*
+ *  linux/fs/ext2/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Goal-directed block allocation by Stephen Tweedie
+ * 	(sct@dcs.ed.ac.uk), 1993, 1998
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ * 	(jj@sunsite.ms.mff.cuni.cz)
+ *
+ *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
+ */
+
+#include <linux/time.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/fiemap.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include "ext2.h"
+#include "acl.h"
+#include "xattr.h"
+
+static int __ext2_write_inode(struct inode *inode, int do_sync);
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int ext2_inode_is_fast_symlink(struct inode *inode)
+{
+	int ea_blocks = EXT2_I(inode)->i_file_acl ?
+		(inode->i_sb->s_blocksize >> 9) : 0;
+
+	return (S_ISLNK(inode->i_mode) &&
+		inode->i_blocks - ea_blocks == 0);
+}
+
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
+
+static void ext2_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		ext2_truncate_blocks(inode, inode->i_size);
+	}
+}
+
+/*
+ * Called at the last iput() if i_nlink is zero.
+ */
+void ext2_evict_inode(struct inode * inode)
+{
+	struct ext2_block_alloc_info *rsv;
+	int want_delete = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		want_delete = 1;
+		dquot_initialize(inode);
+	} else {
+		dquot_drop(inode);
+	}
+
+	truncate_inode_pages_final(&inode->i_data);
+
+	if (want_delete) {
+		sb_start_intwrite(inode->i_sb);
+		/* set dtime */
+		EXT2_I(inode)->i_dtime	= get_seconds();
+		mark_inode_dirty(inode);
+		__ext2_write_inode(inode, inode_needs_sync(inode));
+		/* truncate to 0 */
+		inode->i_size = 0;
+		if (inode->i_blocks)
+			ext2_truncate_blocks(inode, 0);
+		ext2_xattr_delete_inode(inode);
+	}
+
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+
+	ext2_discard_reservation(inode);
+	rsv = EXT2_I(inode)->i_block_alloc_info;
+	EXT2_I(inode)->i_block_alloc_info = NULL;
+	if (unlikely(rsv))
+		kfree(rsv);
+
+	if (want_delete) {
+		ext2_free_inode(inode);
+		sb_end_intwrite(inode->i_sb);
+	}
+}
+
+typedef struct {
+	__le32	*p;
+	__le32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+	p->key = *(p->p = v);
+	p->bh = bh;
+}
+
+static inline int verify_chain(Indirect *from, Indirect *to)
+{
+	while (from <= to && from->key == *from->p)
+		from++;
+	return (from > to);
+}
+
+/**
+ *	ext2_block_to_path - parse the block number into array of offsets
+ *	@inode: inode in question (we are only interested in its superblock)
+ *	@i_block: block number to be parsed
+ *	@offsets: array to store the offsets in
+ *      @boundary: set this non-zero if the referred-to block is likely to be
+ *             followed (on disk) by an indirect block.
+ *	To store the locations of file's data ext2 uses a data structure common
+ *	for UNIX filesystems - tree of pointers anchored in the inode, with
+ *	data blocks at leaves and indirect blocks in intermediate nodes.
+ *	This function translates the block number into path in that tree -
+ *	return value is the path length and @offsets[n] is the offset of
+ *	pointer to (n+1)th node in the nth one. If @block is out of range
+ *	(negative or too large) warning is printed and zero returned.
+ *
+ *	Note: function doesn't find node addresses, so no IO is needed. All
+ *	we need to know is the capacity of indirect blocks (taken from the
+ *	inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext2_block_to_path(struct inode *inode,
+			long i_block, int offsets[4], int *boundary)
+{
+	int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb);
+	int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	const long direct_blocks = EXT2_NDIR_BLOCKS,
+		indirect_blocks = ptrs,
+		double_blocks = (1 << (ptrs_bits * 2));
+	int n = 0;
+	int final = 0;
+
+	if (i_block < 0) {
+		ext2_msg(inode->i_sb, KERN_WARNING,
+			"warning: %s: block < 0", __func__);
+	} else if (i_block < direct_blocks) {
+		offsets[n++] = i_block;
+		final = direct_blocks;
+	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
+		offsets[n++] = EXT2_IND_BLOCK;
+		offsets[n++] = i_block;
+		final = ptrs;
+	} else if ((i_block -= indirect_blocks) < double_blocks) {
+		offsets[n++] = EXT2_DIND_BLOCK;
+		offsets[n++] = i_block >> ptrs_bits;
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+		offsets[n++] = EXT2_TIND_BLOCK;
+		offsets[n++] = i_block >> (ptrs_bits * 2);
+		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else {
+		ext2_msg(inode->i_sb, KERN_WARNING,
+			"warning: %s: block is too big", __func__);
+	}
+	if (boundary)
+		*boundary = final - 1 - (i_block & (ptrs - 1));
+
+	return n;
+}
+
+/**
+ *	ext2_get_branch - read the chain of indirect blocks leading to data
+ *	@inode: inode in question
+ *	@depth: depth of the chain (1 - direct pointer, etc.)
+ *	@offsets: offsets of pointers in inode/indirect blocks
+ *	@chain: place to store the result
+ *	@err: here we store the error value
+ *
+ *	Function fills the array of triples <key, p, bh> and returns %NULL
+ *	if everything went OK or the pointer to the last filled triple
+ *	(incomplete one) otherwise. Upon the return chain[i].key contains
+ *	the number of (i+1)-th block in the chain (as it is stored in memory,
+ *	i.e. little-endian 32-bit), chain[i].p contains the address of that
+ *	number (it points into struct inode for i==0 and into the bh->b_data
+ *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ *	block for i>0 and NULL for i==0. In other words, it holds the block
+ *	numbers of the chain, addresses they were taken from (and where we can
+ *	verify that chain did not change) and buffer_heads hosting these
+ *	numbers.
+ *
+ *	Function stops when it stumbles upon zero pointer (absent block)
+ *		(pointer to last triple returned, *@err == 0)
+ *	or when it gets an IO error reading an indirect block
+ *		(ditto, *@err == -EIO)
+ *	or when it notices that chain had been changed while it was reading
+ *		(ditto, *@err == -EAGAIN)
+ *	or when it reads all @depth-1 indirect blocks successfully and finds
+ *	the whole chain, all way to the data (returns %NULL, *err == 0).
+ */
+static Indirect *ext2_get_branch(struct inode *inode,
+				 int depth,
+				 int *offsets,
+				 Indirect chain[4],
+				 int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	Indirect *p = chain;
+	struct buffer_head *bh;
+
+	*err = 0;
+	/* i_data is not going away, no lock needed */
+	add_chain (chain, NULL, EXT2_I(inode)->i_data + *offsets);
+	if (!p->key)
+		goto no_block;
+	while (--depth) {
+		bh = sb_bread(sb, le32_to_cpu(p->key));
+		if (!bh)
+			goto failure;
+		read_lock(&EXT2_I(inode)->i_meta_lock);
+		if (!verify_chain(chain, p))
+			goto changed;
+		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
+		read_unlock(&EXT2_I(inode)->i_meta_lock);
+		if (!p->key)
+			goto no_block;
+	}
+	return NULL;
+
+changed:
+	read_unlock(&EXT2_I(inode)->i_meta_lock);
+	brelse(bh);
+	*err = -EAGAIN;
+	goto no_block;
+failure:
+	*err = -EIO;
+no_block:
+	return p;
+}
+
+/**
+ *	ext2_find_near - find a place for allocation with sufficient locality
+ *	@inode: owner
+ *	@ind: descriptor of indirect block.
+ *
+ *	This function returns the preferred place for block allocation.
+ *	It is used when heuristic for sequential allocation fails.
+ *	Rules are:
+ *	  + if there is a block to the left of our position - allocate near it.
+ *	  + if pointer will live in indirect block - allocate near that block.
+ *	  + if pointer will live in inode - allocate in the same cylinder group.
+ *
+ * In the latter case we colour the starting block by the callers PID to
+ * prevent it from clashing with concurrent allocations for a different inode
+ * in the same block group.   The PID is used here so that functionally related
+ * files will be close-by on-disk.
+ *
+ *	Caller must make sure that @ind is valid and will stay that way.
+ */
+
+static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind)
+{
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
+	__le32 *p;
+	ext2_fsblk_t bg_start;
+	ext2_fsblk_t colour;
+
+	/* Try to find previous block */
+	for (p = ind->p - 1; p >= start; p--)
+		if (*p)
+			return le32_to_cpu(*p);
+
+	/* No such thing, so let's try location of indirect block */
+	if (ind->bh)
+		return ind->bh->b_blocknr;
+
+	/*
+	 * It is going to be referred from inode itself? OK, just put it into
+	 * the same cylinder group then.
+	 */
+	bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group);
+	colour = (current->pid % 16) *
+			(EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	return bg_start + colour;
+}
+
+/**
+ *	ext2_find_goal - find a preferred place for allocation.
+ *	@inode: owner
+ *	@block:  block we want
+ *	@partial: pointer to the last triple within a chain
+ *
+ *	Returns preferred place for a block (the goal).
+ */
+
+static inline ext2_fsblk_t ext2_find_goal(struct inode *inode, long block,
+					  Indirect *partial)
+{
+	struct ext2_block_alloc_info *block_i;
+
+	block_i = EXT2_I(inode)->i_block_alloc_info;
+
+	/*
+	 * try the heuristic for sequential allocation,
+	 * failing that at least try to get decent locality.
+	 */
+	if (block_i && (block == block_i->last_alloc_logical_block + 1)
+		&& (block_i->last_alloc_physical_block != 0)) {
+		return block_i->last_alloc_physical_block + 1;
+	}
+
+	return ext2_find_near(inode, partial);
+}
+
+/**
+ *	ext2_blks_to_allocate: Look up the block map and count the number
+ *	of direct blocks need to be allocated for the given branch.
+ *
+ * 	@branch: chain of indirect blocks
+ *	@k: number of blocks need for indirect blocks
+ *	@blks: number of data blocks to be mapped.
+ *	@blocks_to_boundary:  the offset in the indirect block
+ *
+ *	return the total number of blocks to be allocate, including the
+ *	direct and indirect blocks.
+ */
+static int
+ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks,
+		int blocks_to_boundary)
+{
+	unsigned long count = 0;
+
+	/*
+	 * Simple case, [t,d]Indirect block(s) has not allocated yet
+	 * then it's clear blocks on that path have not allocated
+	 */
+	if (k > 0) {
+		/* right now don't hanel cross boundary allocation */
+		if (blks < blocks_to_boundary + 1)
+			count += blks;
+		else
+			count += blocks_to_boundary + 1;
+		return count;
+	}
+
+	count++;
+	while (count < blks && count <= blocks_to_boundary
+		&& le32_to_cpu(*(branch[0].p + count)) == 0) {
+		count++;
+	}
+	return count;
+}
+
+/**
+ *	ext2_alloc_blocks: multiple allocate blocks needed for a branch
+ *	@indirect_blks: the number of blocks need to allocate for indirect
+ *			blocks
+ *
+ *	@new_blocks: on return it will store the new block numbers for
+ *	the indirect blocks(if needed) and the first direct block,
+ *	@blks:	on return it will store the total number of allocated
+ *		direct blocks
+ */
+static int ext2_alloc_blocks(struct inode *inode,
+			ext2_fsblk_t goal, int indirect_blks, int blks,
+			ext2_fsblk_t new_blocks[4], int *err)
+{
+	int target, i;
+	unsigned long count = 0;
+	int index = 0;
+	ext2_fsblk_t current_block = 0;
+	int ret = 0;
+
+	/*
+	 * Here we try to allocate the requested multiple blocks at once,
+	 * on a best-effort basis.
+	 * To build a branch, we should allocate blocks for
+	 * the indirect blocks(if not allocated yet), and at least
+	 * the first direct block of this branch.  That's the
+	 * minimum number of blocks need to allocate(required)
+	 */
+	target = blks + indirect_blks;
+
+	while (1) {
+		count = target;
+		/* allocating blocks for indirect blocks and direct blocks */
+		current_block = ext2_new_blocks(inode,goal,&count,err);
+		if (*err)
+			goto failed_out;
+
+		target -= count;
+		/* allocate blocks for indirect blocks */
+		while (index < indirect_blks && count) {
+			new_blocks[index++] = current_block++;
+			count--;
+		}
+
+		if (count > 0)
+			break;
+	}
+
+	/* save the new block number for the first direct block */
+	new_blocks[index] = current_block;
+
+	/* total number of blocks allocated for direct blocks */
+	ret = count;
+	*err = 0;
+	return ret;
+failed_out:
+	for (i = 0; i <index; i++)
+		ext2_free_blocks(inode, new_blocks[i], 1);
+	if (index)
+		mark_inode_dirty(inode);
+	return ret;
+}
+
+/**
+ *	ext2_alloc_branch - allocate and set up a chain of blocks.
+ *	@inode: owner
+ *	@num: depth of the chain (number of blocks to allocate)
+ *	@offsets: offsets (in the blocks) to store the pointers to next.
+ *	@branch: place to store the chain in.
+ *
+ *	This function allocates @num blocks, zeroes out all but the last one,
+ *	links them into chain and (if we are synchronous) writes them to disk.
+ *	In other words, it prepares a branch that can be spliced onto the
+ *	inode. It stores the information about that chain in the branch[], in
+ *	the same format as ext2_get_branch() would do. We are calling it after
+ *	we had read the existing part of chain and partial points to the last
+ *	triple of that (one with zero ->key). Upon the exit we have the same
+ *	picture as after the successful ext2_get_block(), except that in one
+ *	place chain is disconnected - *branch->p is still zero (we did not
+ *	set the last link), but branch->key contains the number that should
+ *	be placed into *branch->p to fill that gap.
+ *
+ *	If allocation fails we free all blocks we've allocated (and forget
+ *	their buffer_heads) and return the error value the from failed
+ *	ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ *	as described above and return 0.
+ */
+
+static int ext2_alloc_branch(struct inode *inode,
+			int indirect_blks, int *blks, ext2_fsblk_t goal,
+			int *offsets, Indirect *branch)
+{
+	int blocksize = inode->i_sb->s_blocksize;
+	int i, n = 0;
+	int err = 0;
+	struct buffer_head *bh;
+	int num;
+	ext2_fsblk_t new_blocks[4];
+	ext2_fsblk_t current_block;
+
+	num = ext2_alloc_blocks(inode, goal, indirect_blks,
+				*blks, new_blocks, &err);
+	if (err)
+		return err;
+
+	branch[0].key = cpu_to_le32(new_blocks[0]);
+	/*
+	 * metadata blocks and data blocks are allocated.
+	 */
+	for (n = 1; n <= indirect_blks;  n++) {
+		/*
+		 * Get buffer_head for parent block, zero it out
+		 * and set the pointer to new one, then send
+		 * parent to disk.
+		 */
+		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		branch[n].bh = bh;
+		lock_buffer(bh);
+		memset(bh->b_data, 0, blocksize);
+		branch[n].p = (__le32 *) bh->b_data + offsets[n];
+		branch[n].key = cpu_to_le32(new_blocks[n]);
+		*branch[n].p = branch[n].key;
+		if ( n == indirect_blks) {
+			current_block = new_blocks[n];
+			/*
+			 * End of chain, update the last new metablock of
+			 * the chain to point to the new allocated
+			 * data blocks numbers
+			 */
+			for (i=1; i < num; i++)
+				*(branch[n].p + i) = cpu_to_le32(++current_block);
+		}
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		mark_buffer_dirty_inode(bh, inode);
+		/* We used to sync bh here if IS_SYNC(inode).
+		 * But we now rely upon generic_write_sync()
+		 * and b_inode_buffers.  But not for directories.
+		 */
+		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+			sync_dirty_buffer(bh);
+	}
+	*blks = num;
+	return err;
+
+failed:
+	for (i = 1; i < n; i++)
+		bforget(branch[i].bh);
+	for (i = 0; i < indirect_blks; i++)
+		ext2_free_blocks(inode, new_blocks[i], 1);
+	ext2_free_blocks(inode, new_blocks[i], num);
+	return err;
+}
+
+/**
+ * ext2_splice_branch - splice the allocated branch onto inode.
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @where: location of missing link
+ * @num:   number of indirect blocks we are adding
+ * @blks:  number of direct blocks we are adding
+ *
+ * This function fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
+ */
+static void ext2_splice_branch(struct inode *inode,
+			long block, Indirect *where, int num, int blks)
+{
+	int i;
+	struct ext2_block_alloc_info *block_i;
+	ext2_fsblk_t current_block;
+
+	block_i = EXT2_I(inode)->i_block_alloc_info;
+
+	/* XXX LOCKING probably should have i_meta_lock ?*/
+	/* That's it */
+
+	*where->p = where->key;
+
+	/*
+	 * Update the host buffer_head or inode to point to more just allocated
+	 * direct blocks blocks
+	 */
+	if (num == 0 && blks > 1) {
+		current_block = le32_to_cpu(where->key) + 1;
+		for (i = 1; i < blks; i++)
+			*(where->p + i ) = cpu_to_le32(current_block++);
+	}
+
+	/*
+	 * update the most recently allocated logical & physical block
+	 * in i_block_alloc_info, to assist find the proper goal block for next
+	 * allocation
+	 */
+	if (block_i) {
+		block_i->last_alloc_logical_block = block + blks - 1;
+		block_i->last_alloc_physical_block =
+				le32_to_cpu(where[num].key) + blks - 1;
+	}
+
+	/* We are done with atomic stuff, now do the rest of housekeeping */
+
+	/* had we spliced it onto indirect block? */
+	if (where->bh)
+		mark_buffer_dirty_inode(where->bh, inode);
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+}
+
+/*
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ *
+ * `handle' can be NULL if create == 0.
+ *
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ */
+static int ext2_get_blocks(struct inode *inode,
+			   sector_t iblock, unsigned long maxblocks,
+			   struct buffer_head *bh_result,
+			   int create)
+{
+	int err = -EIO;
+	int offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	ext2_fsblk_t goal;
+	int indirect_blks;
+	int blocks_to_boundary = 0;
+	int depth;
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	int count = 0;
+	ext2_fsblk_t first_block = 0;
+
+	BUG_ON(maxblocks == 0);
+
+	depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
+
+	if (depth == 0)
+		return (err);
+
+	partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+	/* Simplest case - block found, no allocation needed */
+	if (!partial) {
+		first_block = le32_to_cpu(chain[depth - 1].key);
+		clear_buffer_new(bh_result); /* What's this do? */
+		count++;
+		/*map more blocks*/
+		while (count < maxblocks && count <= blocks_to_boundary) {
+			ext2_fsblk_t blk;
+
+			if (!verify_chain(chain, chain + depth - 1)) {
+				/*
+				 * Indirect block might be removed by
+				 * truncate while we were reading it.
+				 * Handling of that case: forget what we've
+				 * got now, go to reread.
+				 */
+				err = -EAGAIN;
+				count = 0;
+				break;
+			}
+			blk = le32_to_cpu(*(chain[depth-1].p + count));
+			if (blk == first_block + count)
+				count++;
+			else
+				break;
+		}
+		if (err != -EAGAIN)
+			goto got_it;
+	}
+
+	/* Next simple case - plain lookup or failed read of indirect block */
+	if (!create || err == -EIO)
+		goto cleanup;
+
+	mutex_lock(&ei->truncate_mutex);
+	/*
+	 * If the indirect block is missing while we are reading
+	 * the chain(ext2_get_branch() returns -EAGAIN err), or
+	 * if the chain has been changed after we grab the semaphore,
+	 * (either because another process truncated this branch, or
+	 * another get_block allocated this branch) re-grab the chain to see if
+	 * the request block has been allocated or not.
+	 *
+	 * Since we already block the truncate/other get_block
+	 * at this point, we will have the current copy of the chain when we
+	 * splice the branch into the tree.
+	 */
+	if (err == -EAGAIN || !verify_chain(chain, partial)) {
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+		partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+		if (!partial) {
+			count++;
+			mutex_unlock(&ei->truncate_mutex);
+			if (err)
+				goto cleanup;
+			clear_buffer_new(bh_result);
+			goto got_it;
+		}
+	}
+
+	/*
+	 * Okay, we need to do block allocation.  Lazily initialize the block
+	 * allocation info here if necessary
+	*/
+	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
+		ext2_init_block_alloc_info(inode);
+
+	goal = ext2_find_goal(inode, iblock, partial);
+
+	/* the number of blocks need to allocate for [d,t]indirect blocks */
+	indirect_blks = (chain + depth) - partial - 1;
+	/*
+	 * Next look up the indirect map to count the totoal number of
+	 * direct blocks to allocate for this branch.
+	 */
+	count = ext2_blks_to_allocate(partial, indirect_blks,
+					maxblocks, blocks_to_boundary);
+	/*
+	 * XXX ???? Block out ext2_truncate while we alter the tree
+	 */
+	err = ext2_alloc_branch(inode, indirect_blks, &count, goal,
+				offsets + (partial - chain), partial);
+
+	if (err) {
+		mutex_unlock(&ei->truncate_mutex);
+		goto cleanup;
+	}
+
+	if (IS_DAX(inode)) {
+		/*
+		 * block must be initialised before we put it in the tree
+		 * so that it's not found by another thread before it's
+		 * initialised
+		 */
+		err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
+						1 << inode->i_blkbits);
+		if (err) {
+			mutex_unlock(&ei->truncate_mutex);
+			goto cleanup;
+		}
+	}
+
+	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
+	mutex_unlock(&ei->truncate_mutex);
+	set_buffer_new(bh_result);
+got_it:
+	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	if (count > blocks_to_boundary)
+		set_buffer_boundary(bh_result);
+	err = count;
+	/* Clean up and exit */
+	partial = chain + depth - 1;	/* the whole chain */
+cleanup:
+	while (partial > chain) {
+		brelse(partial->bh);
+		partial--;
+	}
+	return err;
+}
+
+int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+{
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int ret = ext2_get_blocks(inode, iblock, max_blocks,
+			      bh_result, create);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+	return ret;
+
+}
+
+int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext2_get_block);
+}
+
+static int ext2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, ext2_get_block, wbc);
+}
+
+static int ext2_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, ext2_get_block);
+}
+
+static int
+ext2_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
+}
+
+static int
+ext2_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				ext2_get_block);
+	if (ret < 0)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
+}
+
+static int ext2_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	int ret;
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (ret < len)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
+}
+
+static int
+ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
+			       ext2_get_block);
+	if (ret < 0)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
+}
+
+static int ext2_nobh_writepage(struct page *page,
+			struct writeback_control *wbc)
+{
+	return nobh_writepage(page, ext2_get_block, wbc);
+}
+
+static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,ext2_get_block);
+}
+
+static ssize_t
+ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	if (IS_DAX(inode))
+		ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
+				DIO_LOCKING);
+	else
+		ret = blockdev_direct_IO(iocb, inode, iter, offset,
+					 ext2_get_block);
+	if (ret < 0 && iov_iter_rw(iter) == WRITE)
+		ext2_write_failed(mapping, offset + count);
+	return ret;
+}
+
+static int
+ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, ext2_get_block);
+}
+
+const struct address_space_operations ext2_aops = {
+	.readpage		= ext2_readpage,
+	.readpages		= ext2_readpages,
+	.writepage		= ext2_writepage,
+	.write_begin		= ext2_write_begin,
+	.write_end		= ext2_write_end,
+	.bmap			= ext2_bmap,
+	.direct_IO		= ext2_direct_IO,
+	.writepages		= ext2_writepages,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate	= block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
+
+const struct address_space_operations ext2_nobh_aops = {
+	.readpage		= ext2_readpage,
+	.readpages		= ext2_readpages,
+	.writepage		= ext2_nobh_writepage,
+	.write_begin		= ext2_nobh_write_begin,
+	.write_end		= nobh_write_end,
+	.bmap			= ext2_bmap,
+	.direct_IO		= ext2_direct_IO,
+	.writepages		= ext2_writepages,
+	.migratepage		= buffer_migrate_page,
+	.error_remove_page	= generic_error_remove_page,
+};
+
+/*
+ * Probably it should be a library function... search for first non-zero word
+ * or memcmp with zero_page, whatever is better for particular architecture.
+ * Linus?
+ */
+static inline int all_zeroes(__le32 *p, __le32 *q)
+{
+	while (p < q)
+		if (*p++)
+			return 0;
+	return 1;
+}
+
+/**
+ *	ext2_find_shared - find the indirect blocks for partial truncation.
+ *	@inode:	  inode in question
+ *	@depth:	  depth of the affected branch
+ *	@offsets: offsets of pointers in that branch (see ext2_block_to_path)
+ *	@chain:	  place to store the pointers to partial indirect blocks
+ *	@top:	  place to the (detached) top of branch
+ *
+ *	This is a helper function used by ext2_truncate().
+ *
+ *	When we do truncate() we may have to clean the ends of several indirect
+ *	blocks but leave the blocks themselves alive. Block is partially
+ *	truncated if some data below the new i_size is referred from it (and
+ *	it is on the path to the first completely truncated data block, indeed).
+ *	We have to free the top of that path along with everything to the right
+ *	of the path. Since no allocation past the truncation point is possible
+ *	until ext2_truncate() finishes, we may safely do the latter, but top
+ *	of branch may require special attention - pageout below the truncation
+ *	point might try to populate it.
+ *
+ *	We atomically detach the top of branch from the tree, store the block
+ *	number of its root in *@top, pointers to buffer_heads of partially
+ *	truncated blocks - in @chain[].bh and pointers to their last elements
+ *	that should not be removed - in @chain[].p. Return value is the pointer
+ *	to last filled element of @chain.
+ *
+ *	The work left to caller to do the actual freeing of subtrees:
+ *		a) free the subtree starting from *@top
+ *		b) free the subtrees whose roots are stored in
+ *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ *		c) free the subtrees growing from the inode past the @chain[0].p
+ *			(no partially truncated stuff there).
+ */
+
+static Indirect *ext2_find_shared(struct inode *inode,
+				int depth,
+				int offsets[4],
+				Indirect chain[4],
+				__le32 *top)
+{
+	Indirect *partial, *p;
+	int k, err;
+
+	*top = 0;
+	for (k = depth; k > 1 && !offsets[k-1]; k--)
+		;
+	partial = ext2_get_branch(inode, k, offsets, chain, &err);
+	if (!partial)
+		partial = chain + k-1;
+	/*
+	 * If the branch acquired continuation since we've looked at it -
+	 * fine, it should all survive and (new) top doesn't belong to us.
+	 */
+	write_lock(&EXT2_I(inode)->i_meta_lock);
+	if (!partial->key && *partial->p) {
+		write_unlock(&EXT2_I(inode)->i_meta_lock);
+		goto no_top;
+	}
+	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
+		;
+	/*
+	 * OK, we've found the last block that must survive. The rest of our
+	 * branch should be detached before unlocking. However, if that rest
+	 * of branch is all ours and does not grow immediately from the inode
+	 * it's easier to cheat and just decrement partial->p.
+	 */
+	if (p == chain + k - 1 && p > chain) {
+		p->p--;
+	} else {
+		*top = *p->p;
+		*p->p = 0;
+	}
+	write_unlock(&EXT2_I(inode)->i_meta_lock);
+
+	while(partial > p)
+	{
+		brelse(partial->bh);
+		partial--;
+	}
+no_top:
+	return partial;
+}
+
+/**
+ *	ext2_free_data - free a list of data blocks
+ *	@inode:	inode we are dealing with
+ *	@p:	array of block numbers
+ *	@q:	points immediately past the end of array
+ *
+ *	We are freeing all blocks referred from that array (numbers are
+ *	stored as little-endian 32-bit) and updating @inode->i_blocks
+ *	appropriately.
+ */
+static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
+{
+	unsigned long block_to_free = 0, count = 0;
+	unsigned long nr;
+
+	for ( ; p < q ; p++) {
+		nr = le32_to_cpu(*p);
+		if (nr) {
+			*p = 0;
+			/* accumulate blocks to free if they're contiguous */
+			if (count == 0)
+				goto free_this;
+			else if (block_to_free == nr - count)
+				count++;
+			else {
+				ext2_free_blocks (inode, block_to_free, count);
+				mark_inode_dirty(inode);
+			free_this:
+				block_to_free = nr;
+				count = 1;
+			}
+		}
+	}
+	if (count > 0) {
+		ext2_free_blocks (inode, block_to_free, count);
+		mark_inode_dirty(inode);
+	}
+}
+
+/**
+ *	ext2_free_branches - free an array of branches
+ *	@inode:	inode we are dealing with
+ *	@p:	array of block numbers
+ *	@q:	pointer immediately past the end of array
+ *	@depth:	depth of the branches to free
+ *
+ *	We are freeing all blocks referred from these branches (numbers are
+ *	stored as little-endian 32-bit) and updating @inode->i_blocks
+ *	appropriately.
+ */
+static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth)
+{
+	struct buffer_head * bh;
+	unsigned long nr;
+
+	if (depth--) {
+		int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);
+		for ( ; p < q ; p++) {
+			nr = le32_to_cpu(*p);
+			if (!nr)
+				continue;
+			*p = 0;
+			bh = sb_bread(inode->i_sb, nr);
+			/*
+			 * A read failure? Report error and clear slot
+			 * (should be rare).
+			 */ 
+			if (!bh) {
+				ext2_error(inode->i_sb, "ext2_free_branches",
+					"Read failure, inode=%ld, block=%ld",
+					inode->i_ino, nr);
+				continue;
+			}
+			ext2_free_branches(inode,
+					   (__le32*)bh->b_data,
+					   (__le32*)bh->b_data + addr_per_block,
+					   depth);
+			bforget(bh);
+			ext2_free_blocks(inode, nr, 1);
+			mark_inode_dirty(inode);
+		}
+	} else
+		ext2_free_data(inode, p, q);
+}
+
+static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
+{
+	__le32 *i_data = EXT2_I(inode)->i_data;
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);
+	int offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	__le32 nr = 0;
+	int n;
+	long iblock;
+	unsigned blocksize;
+	blocksize = inode->i_sb->s_blocksize;
+	iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
+
+	n = ext2_block_to_path(inode, iblock, offsets, NULL);
+	if (n == 0)
+		return;
+
+	/*
+	 * From here we block out all ext2_get_block() callers who want to
+	 * modify the block allocation tree.
+	 */
+	mutex_lock(&ei->truncate_mutex);
+
+	if (n == 1) {
+		ext2_free_data(inode, i_data+offsets[0],
+					i_data + EXT2_NDIR_BLOCKS);
+		goto do_indirects;
+	}
+
+	partial = ext2_find_shared(inode, n, offsets, chain, &nr);
+	/* Kill the top of shared branch (already detached) */
+	if (nr) {
+		if (partial == chain)
+			mark_inode_dirty(inode);
+		else
+			mark_buffer_dirty_inode(partial->bh, inode);
+		ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
+	}
+	/* Clear the ends of indirect blocks on the shared branch */
+	while (partial > chain) {
+		ext2_free_branches(inode,
+				   partial->p + 1,
+				   (__le32*)partial->bh->b_data+addr_per_block,
+				   (chain+n-1) - partial);
+		mark_buffer_dirty_inode(partial->bh, inode);
+		brelse (partial->bh);
+		partial--;
+	}
+do_indirects:
+	/* Kill the remaining (whole) subtrees */
+	switch (offsets[0]) {
+		default:
+			nr = i_data[EXT2_IND_BLOCK];
+			if (nr) {
+				i_data[EXT2_IND_BLOCK] = 0;
+				mark_inode_dirty(inode);
+				ext2_free_branches(inode, &nr, &nr+1, 1);
+			}
+		case EXT2_IND_BLOCK:
+			nr = i_data[EXT2_DIND_BLOCK];
+			if (nr) {
+				i_data[EXT2_DIND_BLOCK] = 0;
+				mark_inode_dirty(inode);
+				ext2_free_branches(inode, &nr, &nr+1, 2);
+			}
+		case EXT2_DIND_BLOCK:
+			nr = i_data[EXT2_TIND_BLOCK];
+			if (nr) {
+				i_data[EXT2_TIND_BLOCK] = 0;
+				mark_inode_dirty(inode);
+				ext2_free_branches(inode, &nr, &nr+1, 3);
+			}
+		case EXT2_TIND_BLOCK:
+			;
+	}
+
+	ext2_discard_reservation(inode);
+
+	mutex_unlock(&ei->truncate_mutex);
+}
+
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
+{
+	/*
+	 * XXX: it seems like a bug here that we don't allow
+	 * IS_APPEND inode to have blocks-past-i_size trimmed off.
+	 * review and fix this.
+	 *
+	 * Also would be nice to be able to handle IO errors and such,
+	 * but that's probably too much to ask.
+	 */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return;
+	if (ext2_inode_is_fast_symlink(inode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+	__ext2_truncate_blocks(inode, offset);
+}
+
+static int ext2_setsize(struct inode *inode, loff_t newsize)
+{
+	int error;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return -EINVAL;
+	if (ext2_inode_is_fast_symlink(inode))
+		return -EINVAL;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	inode_dio_wait(inode);
+
+	if (IS_DAX(inode))
+		error = dax_truncate_page(inode, newsize, ext2_get_block);
+	else if (test_opt(inode->i_sb, NOBH))
+		error = nobh_truncate_page(inode->i_mapping,
+				newsize, ext2_get_block);
+	else
+		error = block_truncate_page(inode->i_mapping,
+				newsize, ext2_get_block);
+	if (error)
+		return error;
+
+	truncate_setsize(inode, newsize);
+	__ext2_truncate_blocks(inode, newsize);
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	if (inode_needs_sync(inode)) {
+		sync_mapping_buffers(inode->i_mapping);
+		sync_inode_metadata(inode, 1);
+	} else {
+		mark_inode_dirty(inode);
+	}
+
+	return 0;
+}
+
+static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
+					struct buffer_head **p)
+{
+	struct buffer_head * bh;
+	unsigned long block_group;
+	unsigned long block;
+	unsigned long offset;
+	struct ext2_group_desc * gdp;
+
+	*p = NULL;
+	if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) ||
+	    ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
+		goto Einval;
+
+	block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
+	gdp = ext2_get_group_desc(sb, block_group, NULL);
+	if (!gdp)
+		goto Egdp;
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb);
+	block = le32_to_cpu(gdp->bg_inode_table) +
+		(offset >> EXT2_BLOCK_SIZE_BITS(sb));
+	if (!(bh = sb_bread(sb, block)))
+		goto Eio;
+
+	*p = bh;
+	offset &= (EXT2_BLOCK_SIZE(sb) - 1);
+	return (struct ext2_inode *) (bh->b_data + offset);
+
+Einval:
+	ext2_error(sb, "ext2_get_inode", "bad inode number: %lu",
+		   (unsigned long) ino);
+	return ERR_PTR(-EINVAL);
+Eio:
+	ext2_error(sb, "ext2_get_inode",
+		   "unable to read inode block - inode=%lu, block=%lu",
+		   (unsigned long) ino, block);
+Egdp:
+	return ERR_PTR(-EIO);
+}
+
+void ext2_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = EXT2_I(inode)->i_flags;
+
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
+				S_DIRSYNC | S_DAX);
+	if (flags & EXT2_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+	if (flags & EXT2_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (flags & EXT2_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (flags & EXT2_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+	if (flags & EXT2_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+	if (test_opt(inode->i_sb, DAX))
+		inode->i_flags |= S_DAX;
+}
+
+/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
+void ext2_get_inode_flags(struct ext2_inode_info *ei)
+{
+	unsigned int flags = ei->vfs_inode.i_flags;
+
+	ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
+			EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		ei->i_flags |= EXT2_SYNC_FL;
+	if (flags & S_APPEND)
+		ei->i_flags |= EXT2_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		ei->i_flags |= EXT2_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		ei->i_flags |= EXT2_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		ei->i_flags |= EXT2_DIRSYNC_FL;
+}
+
+struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
+{
+	struct ext2_inode_info *ei;
+	struct buffer_head * bh;
+	struct ext2_inode *raw_inode;
+	struct inode *inode;
+	long ret = -EIO;
+	int n;
+	uid_t i_uid;
+	gid_t i_gid;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	ei = EXT2_I(inode);
+	ei->i_block_alloc_info = NULL;
+
+	raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
+	if (IS_ERR(raw_inode)) {
+		ret = PTR_ERR(raw_inode);
+ 		goto bad_inode;
+	}
+
+	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+	if (!(test_opt (inode->i_sb, NO_UID32))) {
+		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+	}
+	i_uid_write(inode, i_uid);
+	i_gid_write(inode, i_gid);
+	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
+	inode->i_size = le32_to_cpu(raw_inode->i_size);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
+	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+	/* We now have enough fields to check if the inode was active or not.
+	 * This is needed because nfsd might try to access dead inodes
+	 * the test is that same one that e2fsck uses
+	 * NeilBrown 1999oct15
+	 */
+	if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) {
+		/* this inode is deleted */
+		brelse (bh);
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
+	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
+	ei->i_frag_no = raw_inode->i_frag;
+	ei->i_frag_size = raw_inode->i_fsize;
+	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+	ei->i_dir_acl = 0;
+	if (S_ISREG(inode->i_mode))
+		inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
+	else
+		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+	ei->i_dtime = 0;
+	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+	ei->i_state = 0;
+	ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
+	ei->i_dir_start_lookup = 0;
+
+	/*
+	 * NOTE! The in-memory inode i_data array is in little-endian order
+	 * even on big-endian machines: we do NOT byteswap the block numbers!
+	 */
+	for (n = 0; n < EXT2_N_BLOCKS; n++)
+		ei->i_data[n] = raw_inode->i_block[n];
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &ext2_file_inode_operations;
+		if (test_opt(inode->i_sb, NOBH)) {
+			inode->i_mapping->a_ops = &ext2_nobh_aops;
+			inode->i_fop = &ext2_file_operations;
+		} else {
+			inode->i_mapping->a_ops = &ext2_aops;
+			inode->i_fop = &ext2_file_operations;
+		}
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &ext2_dir_inode_operations;
+		inode->i_fop = &ext2_dir_operations;
+		if (test_opt(inode->i_sb, NOBH))
+			inode->i_mapping->a_ops = &ext2_nobh_aops;
+		else
+			inode->i_mapping->a_ops = &ext2_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (ext2_inode_is_fast_symlink(inode)) {
+			inode->i_op = &ext2_fast_symlink_inode_operations;
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
+			inode->i_op = &ext2_symlink_inode_operations;
+			if (test_opt(inode->i_sb, NOBH))
+				inode->i_mapping->a_ops = &ext2_nobh_aops;
+			else
+				inode->i_mapping->a_ops = &ext2_aops;
+		}
+	} else {
+		inode->i_op = &ext2_special_inode_operations;
+		if (raw_inode->i_block[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
+		else 
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	}
+	brelse (bh);
+	ext2_set_inode_flags(inode);
+	unlock_new_inode(inode);
+	return inode;
+	
+bad_inode:
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+
+static int __ext2_write_inode(struct inode *inode, int do_sync)
+{
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	struct super_block *sb = inode->i_sb;
+	ino_t ino = inode->i_ino;
+	uid_t uid = i_uid_read(inode);
+	gid_t gid = i_gid_read(inode);
+	struct buffer_head * bh;
+	struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh);
+	int n;
+	int err = 0;
+
+	if (IS_ERR(raw_inode))
+ 		return -EIO;
+
+	/* For fields not not tracking in the in-memory inode,
+	 * initialise them to zero for new inodes. */
+	if (ei->i_state & EXT2_STATE_NEW)
+		memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
+
+	ext2_get_inode_flags(ei);
+	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+	if (!(test_opt(sb, NO_UID32))) {
+		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
+		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
+/*
+ * Fix up interoperability with old kernels. Otherwise, old inodes get
+ * re-used with the upper 16 bits of the uid/gid intact
+ */
+		if (!ei->i_dtime) {
+			raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid));
+			raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid));
+		} else {
+			raw_inode->i_uid_high = 0;
+			raw_inode->i_gid_high = 0;
+		}
+	} else {
+		raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid));
+		raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid));
+		raw_inode->i_uid_high = 0;
+		raw_inode->i_gid_high = 0;
+	}
+	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+	raw_inode->i_size = cpu_to_le32(inode->i_size);
+	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+
+	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
+	raw_inode->i_frag = ei->i_frag_no;
+	raw_inode->i_fsize = ei->i_frag_size;
+	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+	if (!S_ISREG(inode->i_mode))
+		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
+	else {
+		raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32);
+		if (inode->i_size > 0x7fffffffULL) {
+			if (!EXT2_HAS_RO_COMPAT_FEATURE(sb,
+					EXT2_FEATURE_RO_COMPAT_LARGE_FILE) ||
+			    EXT2_SB(sb)->s_es->s_rev_level ==
+					cpu_to_le32(EXT2_GOOD_OLD_REV)) {
+			       /* If this is the first large file
+				* created, add a flag to the superblock.
+				*/
+				spin_lock(&EXT2_SB(sb)->s_lock);
+				ext2_update_dynamic_rev(sb);
+				EXT2_SET_RO_COMPAT_FEATURE(sb,
+					EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
+				spin_unlock(&EXT2_SB(sb)->s_lock);
+				ext2_write_super(sb);
+			}
+		}
+	}
+	
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			raw_inode->i_block[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			raw_inode->i_block[1] = 0;
+		} else {
+			raw_inode->i_block[0] = 0;
+			raw_inode->i_block[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			raw_inode->i_block[2] = 0;
+		}
+	} else for (n = 0; n < EXT2_N_BLOCKS; n++)
+		raw_inode->i_block[n] = ei->i_data[n];
+	mark_buffer_dirty(bh);
+	if (do_sync) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			printk ("IO error syncing ext2 inode [%s:%08lx]\n",
+				sb->s_id, (unsigned long) ino);
+			err = -EIO;
+		}
+	}
+	ei->i_state &= ~EXT2_STATE_NEW;
+	brelse (bh);
+	return err;
+}
+
+int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
+int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	if (is_quota_modification(inode, iattr))
+		dquot_initialize(inode);
+	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
+	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
+		error = dquot_transfer(inode, iattr);
+		if (error)
+			return error;
+	}
+	if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
+		error = ext2_setsize(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+	setattr_copy(inode, iattr);
+	if (iattr->ia_valid & ATTR_MODE)
+		error = posix_acl_chmod(inode, inode->i_mode);
+	mark_inode_dirty(inode);
+
+	return error;
+}
diff --git a/kernel/fs/ext2/ioctl.c b/kernel/fs/ext2/ioctl.c
new file mode 100644
index 000000000..5d46c0986
--- /dev/null
+++ b/kernel/fs/ext2/ioctl.c
@@ -0,0 +1,188 @@
+/*
+ * linux/fs/ext2/ioctl.c
+ *
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include "ext2.h"
+#include <linux/capability.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+
+
+long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	unsigned int flags;
+	unsigned short rsv_window_size;
+	int ret;
+
+	ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+	switch (cmd) {
+	case EXT2_IOC_GETFLAGS:
+		ext2_get_inode_flags(ei);
+		flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *) arg);
+	case EXT2_IOC_SETFLAGS: {
+		unsigned int oldflags;
+
+		ret = mnt_want_write_file(filp);
+		if (ret)
+			return ret;
+
+		if (!inode_owner_or_capable(inode)) {
+			ret = -EACCES;
+			goto setflags_out;
+		}
+
+		if (get_user(flags, (int __user *) arg)) {
+			ret = -EFAULT;
+			goto setflags_out;
+		}
+
+		flags = ext2_mask_flags(inode->i_mode, flags);
+
+		mutex_lock(&inode->i_mutex);
+		/* Is it quota file? Do not allow user to mess with it */
+		if (IS_NOQUOTA(inode)) {
+			mutex_unlock(&inode->i_mutex);
+			ret = -EPERM;
+			goto setflags_out;
+		}
+		oldflags = ei->i_flags;
+
+		/*
+		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+		 * the relevant capability.
+		 *
+		 * This test looks nicer. Thanks to Pauline Middelink
+		 */
+		if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				mutex_unlock(&inode->i_mutex);
+				ret = -EPERM;
+				goto setflags_out;
+			}
+		}
+
+		flags = flags & EXT2_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
+		ei->i_flags = flags;
+
+		ext2_set_inode_flags(inode);
+		inode->i_ctime = CURRENT_TIME_SEC;
+		mutex_unlock(&inode->i_mutex);
+
+		mark_inode_dirty(inode);
+setflags_out:
+		mnt_drop_write_file(filp);
+		return ret;
+	}
+	case EXT2_IOC_GETVERSION:
+		return put_user(inode->i_generation, (int __user *) arg);
+	case EXT2_IOC_SETVERSION: {
+		__u32 generation;
+
+		if (!inode_owner_or_capable(inode))
+			return -EPERM;
+		ret = mnt_want_write_file(filp);
+		if (ret)
+			return ret;
+		if (get_user(generation, (int __user *) arg)) {
+			ret = -EFAULT;
+			goto setversion_out;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_generation = generation;
+		mutex_unlock(&inode->i_mutex);
+
+		mark_inode_dirty(inode);
+setversion_out:
+		mnt_drop_write_file(filp);
+		return ret;
+	}
+	case EXT2_IOC_GETRSVSZ:
+		if (test_opt(inode->i_sb, RESERVATION)
+			&& S_ISREG(inode->i_mode)
+			&& ei->i_block_alloc_info) {
+			rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
+			return put_user(rsv_window_size, (int __user *)arg);
+		}
+		return -ENOTTY;
+	case EXT2_IOC_SETRSVSZ: {
+
+		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
+			return -ENOTTY;
+
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		if (get_user(rsv_window_size, (int __user *)arg))
+			return -EFAULT;
+
+		ret = mnt_want_write_file(filp);
+		if (ret)
+			return ret;
+
+		if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS)
+			rsv_window_size = EXT2_MAX_RESERVE_BLOCKS;
+
+		/*
+		 * need to allocate reservation structure for this inode
+		 * before set the window size
+		 */
+		/*
+		 * XXX What lock should protect the rsv_goal_size?
+		 * Accessed in ext2_get_block only.  ext3 uses i_truncate.
+		 */
+		mutex_lock(&ei->truncate_mutex);
+		if (!ei->i_block_alloc_info)
+			ext2_init_block_alloc_info(inode);
+
+		if (ei->i_block_alloc_info){
+			struct ext2_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
+			rsv->rsv_goal_size = rsv_window_size;
+		}
+		mutex_unlock(&ei->truncate_mutex);
+		mnt_drop_write_file(filp);
+		return 0;
+	}
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long ext2_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case EXT2_IOC32_GETFLAGS:
+		cmd = EXT2_IOC_GETFLAGS;
+		break;
+	case EXT2_IOC32_SETFLAGS:
+		cmd = EXT2_IOC_SETFLAGS;
+		break;
+	case EXT2_IOC32_GETVERSION:
+		cmd = EXT2_IOC_GETVERSION;
+		break;
+	case EXT2_IOC32_SETVERSION:
+		cmd = EXT2_IOC_SETVERSION;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return ext2_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
diff --git a/kernel/fs/ext2/namei.c b/kernel/fs/ext2/namei.c
new file mode 100644
index 000000000..3e074a9cc
--- /dev/null
+++ b/kernel/fs/ext2/namei.c
@@ -0,0 +1,431 @@
+/*
+ * linux/fs/ext2/namei.c
+ *
+ * Rewrite to pagecache. Almost all code had been changed, so blame me
+ * if the things go wrong. Please, send bug reports to
+ * viro@parcelfarce.linux.theplanet.co.uk
+ *
+ * Stuff here is basically a glue between the VFS and generic UNIXish
+ * filesystem that keeps everything in pagecache. All knowledge of the
+ * directory layout is in fs/ext2/dir.c - it turned out to be easily separatable
+ * and it's easier to debug that way. In principle we might want to
+ * generalize that a bit and turn it into a library. Or not.
+ *
+ * The only non-static object here is ext2_dir_inode_operations.
+ *
+ * TODO: get rid of kmap() use, add readahead.
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include "ext2.h"
+#include "xattr.h"
+#include "acl.h"
+
+static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = ext2_add_link(dentry, inode);
+	if (!err) {
+		unlock_new_inode(inode);
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+/*
+ * Methods themselves.
+ */
+
+static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
+{
+	struct inode * inode;
+	ino_t ino;
+	
+	if (dentry->d_name.len > EXT2_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = ext2_inode_by_name(dir, &dentry->d_name);
+	inode = NULL;
+	if (ino) {
+		inode = ext2_iget(dir->i_sb, ino);
+		if (inode == ERR_PTR(-ESTALE)) {
+			ext2_error(dir->i_sb, __func__,
+					"deleted inode referenced: %lu",
+					(unsigned long) ino);
+			return ERR_PTR(-EIO);
+		}
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+struct dentry *ext2_get_parent(struct dentry *child)
+{
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+	return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino));
+} 
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate(). 
+ */
+static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl)
+{
+	struct inode *inode;
+
+	dquot_initialize(dir);
+
+	inode = ext2_new_inode(dir, mode, &dentry->d_name);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &ext2_file_inode_operations;
+	if (test_opt(inode->i_sb, NOBH)) {
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+		inode->i_fop = &ext2_file_operations;
+	} else {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_file_operations;
+	}
+	mark_inode_dirty(inode);
+	return ext2_add_nondir(dentry, inode);
+}
+
+static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = ext2_new_inode(dir, mode, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &ext2_file_inode_operations;
+	if (test_opt(inode->i_sb, NOBH)) {
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+		inode->i_fop = &ext2_file_operations;
+	} else {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_file_operations;
+	}
+	mark_inode_dirty(inode);
+	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+}
+
+static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	struct inode * inode;
+	int err;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	dquot_initialize(dir);
+
+	inode = ext2_new_inode (dir, mode, &dentry->d_name);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+#ifdef CONFIG_EXT2_FS_XATTR
+		inode->i_op = &ext2_special_inode_operations;
+#endif
+		mark_inode_dirty(inode);
+		err = ext2_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int ext2_symlink (struct inode * dir, struct dentry * dentry,
+	const char * symname)
+{
+	struct super_block * sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned l = strlen(symname)+1;
+	struct inode * inode;
+
+	if (l > sb->s_blocksize)
+		goto out;
+
+	dquot_initialize(dir);
+
+	inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	if (l > sizeof (EXT2_I(inode)->i_data)) {
+		/* slow symlink */
+		inode->i_op = &ext2_symlink_inode_operations;
+		if (test_opt(inode->i_sb, NOBH))
+			inode->i_mapping->a_ops = &ext2_nobh_aops;
+		else
+			inode->i_mapping->a_ops = &ext2_aops;
+		err = page_symlink(inode, symname, l);
+		if (err)
+			goto out_fail;
+	} else {
+		/* fast symlink */
+		inode->i_op = &ext2_fast_symlink_inode_operations;
+		memcpy((char*)(EXT2_I(inode)->i_data),symname,l);
+		inode->i_size = l-1;
+	}
+	mark_inode_dirty(inode);
+
+	err = ext2_add_nondir(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
+	iput (inode);
+	goto out;
+}
+
+static int ext2_link (struct dentry * old_dentry, struct inode * dir,
+	struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+	int err;
+
+	dquot_initialize(dir);
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	inode_inc_link_count(inode);
+	ihold(inode);
+
+	err = ext2_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
+}
+
+static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+{
+	struct inode * inode;
+	int err;
+
+	dquot_initialize(dir);
+
+	inode_inc_link_count(dir);
+
+	inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	inode->i_op = &ext2_dir_inode_operations;
+	inode->i_fop = &ext2_dir_operations;
+	if (test_opt(inode->i_sb, NOBH))
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+	else
+		inode->i_mapping->a_ops = &ext2_aops;
+
+	inode_inc_link_count(inode);
+
+	err = ext2_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = ext2_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+out_dir:
+	inode_dec_link_count(dir);
+	goto out;
+}
+
+static int ext2_unlink(struct inode * dir, struct dentry *dentry)
+{
+	struct inode * inode = d_inode(dentry);
+	struct ext2_dir_entry_2 * de;
+	struct page * page;
+	int err = -ENOENT;
+
+	dquot_initialize(dir);
+
+	de = ext2_find_entry (dir, &dentry->d_name, &page);
+	if (!de)
+		goto out;
+
+	err = ext2_delete_entry (de, page);
+	if (err)
+		goto out;
+
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+	err = 0;
+out:
+	return err;
+}
+
+static int ext2_rmdir (struct inode * dir, struct dentry *dentry)
+{
+	struct inode * inode = d_inode(dentry);
+	int err = -ENOTEMPTY;
+
+	if (ext2_empty_dir(inode)) {
+		err = ext2_unlink(dir, dentry);
+		if (!err) {
+			inode->i_size = 0;
+			inode_dec_link_count(inode);
+			inode_dec_link_count(dir);
+		}
+	}
+	return err;
+}
+
+static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
+	struct inode * new_dir,	struct dentry * new_dentry )
+{
+	struct inode * old_inode = d_inode(old_dentry);
+	struct inode * new_inode = d_inode(new_dentry);
+	struct page * dir_page = NULL;
+	struct ext2_dir_entry_2 * dir_de = NULL;
+	struct page * old_page;
+	struct ext2_dir_entry_2 * old_de;
+	int err = -ENOENT;
+
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
+	old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = ext2_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page *new_page;
+		struct ext2_dir_entry_2 *new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !ext2_empty_dir (new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
+		if (!new_de)
+			goto out_dir;
+		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
+		new_inode->i_ctime = CURRENT_TIME_SEC;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+	} else {
+		err = ext2_add_link(new_dentry, old_inode);
+		if (err)
+			goto out_dir;
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+ 	 * rename.
+	 */
+	old_inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(old_inode);
+
+	ext2_delete_entry (old_de, old_page);
+
+	if (dir_de) {
+		if (old_dir != new_dir)
+			ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
+		else {
+			kunmap(dir_page);
+			page_cache_release(dir_page);
+		}
+		inode_dec_link_count(old_dir);
+	}
+	return 0;
+
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	return err;
+}
+
+const struct inode_operations ext2_dir_inode_operations = {
+	.create		= ext2_create,
+	.lookup		= ext2_lookup,
+	.link		= ext2_link,
+	.unlink		= ext2_unlink,
+	.symlink	= ext2_symlink,
+	.mkdir		= ext2_mkdir,
+	.rmdir		= ext2_rmdir,
+	.mknod		= ext2_mknod,
+	.rename		= ext2_rename,
+#ifdef CONFIG_EXT2_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext2_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.setattr	= ext2_setattr,
+	.get_acl	= ext2_get_acl,
+	.set_acl	= ext2_set_acl,
+	.tmpfile	= ext2_tmpfile,
+};
+
+const struct inode_operations ext2_special_inode_operations = {
+#ifdef CONFIG_EXT2_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext2_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.setattr	= ext2_setattr,
+	.get_acl	= ext2_get_acl,
+	.set_acl	= ext2_set_acl,
+};
diff --git a/kernel/fs/ext2/super.c b/kernel/fs/ext2/super.c
new file mode 100644
index 000000000..d0e746e96
--- /dev/null
+++ b/kernel/fs/ext2/super.c
@@ -0,0 +1,1580 @@
+/*
+ *  linux/fs/ext2/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/log2.h>
+#include <linux/quotaops.h>
+#include <asm/uaccess.h>
+#include "ext2.h"
+#include "xattr.h"
+#include "acl.h"
+
+static void ext2_sync_super(struct super_block *sb,
+			    struct ext2_super_block *es, int wait);
+static int ext2_remount (struct super_block * sb, int * flags, char * data);
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext2_sync_fs(struct super_block *sb, int wait);
+static int ext2_freeze(struct super_block *sb);
+static int ext2_unfreeze(struct super_block *sb);
+
+void ext2_error(struct super_block *sb, const char *function,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_super_block *es = sbi->s_es;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		spin_lock(&sbi->s_lock);
+		sbi->s_mount_state |= EXT2_ERROR_FS;
+		es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
+	}
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("EXT2-fs: panic from previous error\n");
+	if (test_opt(sb, ERRORS_RO)) {
+		ext2_msg(sb, KERN_CRIT,
+			     "error: remounting filesystem read-only");
+		sb->s_flags |= MS_RDONLY;
+	}
+}
+
+void ext2_msg(struct super_block *sb, const char *prefix,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+
+	va_end(args);
+}
+
+/*
+ * This must be called with sbi->s_lock held.
+ */
+void ext2_update_dynamic_rev(struct super_block *sb)
+{
+	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+
+	if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV)
+		return;
+
+	ext2_msg(sb, KERN_WARNING,
+		     "warning: updating to rev %d because of "
+		     "new feature flag, running e2fsck is recommended",
+		     EXT2_DYNAMIC_REV);
+
+	es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO);
+	es->s_inode_size = cpu_to_le16(EXT2_GOOD_OLD_INODE_SIZE);
+	es->s_rev_level = cpu_to_le32(EXT2_DYNAMIC_REV);
+	/* leave es->s_feature_*compat flags alone */
+	/* es->s_uuid will be set by e2fsck if empty */
+
+	/*
+	 * The rest of the superblock fields should be zero, and if not it
+	 * means they are likely already in use, so leave them alone.  We
+	 * can leave it up to e2fsck to clean up any inconsistencies there.
+	 */
+}
+
+static void ext2_put_super (struct super_block * sb)
+{
+	int db_count;
+	int i;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
+	ext2_xattr_put_super(sb);
+	if (!(sb->s_flags & MS_RDONLY)) {
+		struct ext2_super_block *es = sbi->s_es;
+
+		spin_lock(&sbi->s_lock);
+		es->s_state = cpu_to_le16(sbi->s_mount_state);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
+	}
+	db_count = sbi->s_gdb_count;
+	for (i = 0; i < db_count; i++)
+		if (sbi->s_group_desc[i])
+			brelse (sbi->s_group_desc[i]);
+	kfree(sbi->s_group_desc);
+	kfree(sbi->s_debts);
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	brelse (sbi->s_sbh);
+	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
+	kfree(sbi);
+}
+
+static struct kmem_cache * ext2_inode_cachep;
+
+static struct inode *ext2_alloc_inode(struct super_block *sb)
+{
+	struct ext2_inode_info *ei;
+	ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	ei->i_block_alloc_info = NULL;
+	ei->vfs_inode.i_version = 1;
+#ifdef CONFIG_QUOTA
+	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
+#endif
+
+	return &ei->vfs_inode;
+}
+
+static void ext2_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
+}
+
+static void ext2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ext2_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
+
+	rwlock_init(&ei->i_meta_lock);
+#ifdef CONFIG_EXT2_FS_XATTR
+	init_rwsem(&ei->xattr_sem);
+#endif
+	mutex_init(&ei->truncate_mutex);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
+					     sizeof(struct ext2_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (ext2_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ext2_inode_cachep);
+}
+
+static int ext2_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_super_block *es = sbi->s_es;
+	unsigned long def_mount_opts;
+
+	spin_lock(&sbi->s_lock);
+	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+
+	if (sbi->s_sb_block != 1)
+		seq_printf(seq, ",sb=%lu", sbi->s_sb_block);
+	if (test_opt(sb, MINIX_DF))
+		seq_puts(seq, ",minixdf");
+	if (test_opt(sb, GRPID))
+		seq_puts(seq, ",grpid");
+	if (!test_opt(sb, GRPID) && (def_mount_opts & EXT2_DEFM_BSDGROUPS))
+		seq_puts(seq, ",nogrpid");
+	if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT2_DEF_RESUID)) ||
+	    le16_to_cpu(es->s_def_resuid) != EXT2_DEF_RESUID) {
+		seq_printf(seq, ",resuid=%u",
+				from_kuid_munged(&init_user_ns, sbi->s_resuid));
+	}
+	if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT2_DEF_RESGID)) ||
+	    le16_to_cpu(es->s_def_resgid) != EXT2_DEF_RESGID) {
+		seq_printf(seq, ",resgid=%u",
+				from_kgid_munged(&init_user_ns, sbi->s_resgid));
+	}
+	if (test_opt(sb, ERRORS_RO)) {
+		int def_errors = le16_to_cpu(es->s_errors);
+
+		if (def_errors == EXT2_ERRORS_PANIC ||
+		    def_errors == EXT2_ERRORS_CONTINUE) {
+			seq_puts(seq, ",errors=remount-ro");
+		}
+	}
+	if (test_opt(sb, ERRORS_CONT))
+		seq_puts(seq, ",errors=continue");
+	if (test_opt(sb, ERRORS_PANIC))
+		seq_puts(seq, ",errors=panic");
+	if (test_opt(sb, NO_UID32))
+		seq_puts(seq, ",nouid32");
+	if (test_opt(sb, DEBUG))
+		seq_puts(seq, ",debug");
+	if (test_opt(sb, OLDALLOC))
+		seq_puts(seq, ",oldalloc");
+
+#ifdef CONFIG_EXT2_FS_XATTR
+	if (test_opt(sb, XATTR_USER))
+		seq_puts(seq, ",user_xattr");
+	if (!test_opt(sb, XATTR_USER) &&
+	    (def_mount_opts & EXT2_DEFM_XATTR_USER)) {
+		seq_puts(seq, ",nouser_xattr");
+	}
+#endif
+
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+	if (test_opt(sb, POSIX_ACL))
+		seq_puts(seq, ",acl");
+	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT2_DEFM_ACL))
+		seq_puts(seq, ",noacl");
+#endif
+
+	if (test_opt(sb, NOBH))
+		seq_puts(seq, ",nobh");
+
+#if defined(CONFIG_QUOTA)
+	if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+
+#ifdef CONFIG_FS_DAX
+	if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
+		seq_puts(seq, ",xip");
+	if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
+		seq_puts(seq, ",dax");
+#endif
+
+	if (!test_opt(sb, RESERVATION))
+		seq_puts(seq, ",noreservation");
+
+	spin_unlock(&sbi->s_lock);
+	return 0;
+}
+
+#ifdef CONFIG_QUOTA
+static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off);
+static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
+static struct dquot **ext2_get_dquots(struct inode *inode)
+{
+	return EXT2_I(inode)->i_dquot;
+}
+#endif
+
+static const struct super_operations ext2_sops = {
+	.alloc_inode	= ext2_alloc_inode,
+	.destroy_inode	= ext2_destroy_inode,
+	.write_inode	= ext2_write_inode,
+	.evict_inode	= ext2_evict_inode,
+	.put_super	= ext2_put_super,
+	.sync_fs	= ext2_sync_fs,
+	.freeze_fs	= ext2_freeze,
+	.unfreeze_fs	= ext2_unfreeze,
+	.statfs		= ext2_statfs,
+	.remount_fs	= ext2_remount,
+	.show_options	= ext2_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read	= ext2_quota_read,
+	.quota_write	= ext2_quota_write,
+	.get_dquots	= ext2_get_dquots,
+#endif
+};
+
+static struct inode *ext2_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	if (ino < EXT2_FIRST_INO(sb) && ino != EXT2_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+	if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
+		return ERR_PTR(-ESTALE);
+
+	/*
+	 * ext2_iget isn't quite right if the inode is currently unallocated!
+	 * However ext2_iget currently does appropriate checks to handle stale
+	 * inodes so everything is OK.
+	 */
+	inode = ext2_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		/* we didn't find the right inode.. */
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *ext2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    ext2_nfs_get_inode);
+}
+
+static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    ext2_nfs_get_inode);
+}
+
+static const struct export_operations ext2_export_ops = {
+	.fh_to_dentry = ext2_fh_to_dentry,
+	.fh_to_parent = ext2_fh_to_parent,
+	.get_parent = ext2_get_parent,
+};
+
+static unsigned long get_sb_block(void **data)
+{
+	unsigned long 	sb_block;
+	char 		*options = (char *) *data;
+
+	if (!options || strncmp(options, "sb=", 3) != 0)
+		return 1;	/* Default location */
+	options += 3;
+	sb_block = simple_strtoul(options, &options, 0);
+	if (*options && *options != ',') {
+		printk("EXT2-fs: Invalid sb specification: %s\n",
+		       (char *) *data);
+		return 1;
+	}
+	if (*options == ',')
+		options++;
+	*data = (void *) options;
+	return sb_block;
+}
+
+enum {
+	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
+	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
+	Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug,
+	Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
+	Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
+	Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
+};
+
+static const match_table_t tokens = {
+	{Opt_bsd_df, "bsddf"},
+	{Opt_minix_df, "minixdf"},
+	{Opt_grpid, "grpid"},
+	{Opt_grpid, "bsdgroups"},
+	{Opt_nogrpid, "nogrpid"},
+	{Opt_nogrpid, "sysvgroups"},
+	{Opt_resgid, "resgid=%u"},
+	{Opt_resuid, "resuid=%u"},
+	{Opt_sb, "sb=%u"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_nouid32, "nouid32"},
+	{Opt_nocheck, "check=none"},
+	{Opt_nocheck, "nocheck"},
+	{Opt_debug, "debug"},
+	{Opt_oldalloc, "oldalloc"},
+	{Opt_orlov, "orlov"},
+	{Opt_nobh, "nobh"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_xip, "xip"},
+	{Opt_dax, "dax"},
+	{Opt_grpquota, "grpquota"},
+	{Opt_ignore, "noquota"},
+	{Opt_quota, "quota"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_reservation, "reservation"},
+	{Opt_noreservation, "noreservation"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, struct super_block *sb)
+{
+	char *p;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	kuid_t uid;
+	kgid_t gid;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep (&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_bsd_df:
+			clear_opt (sbi->s_mount_opt, MINIX_DF);
+			break;
+		case Opt_minix_df:
+			set_opt (sbi->s_mount_opt, MINIX_DF);
+			break;
+		case Opt_grpid:
+			set_opt (sbi->s_mount_opt, GRPID);
+			break;
+		case Opt_nogrpid:
+			clear_opt (sbi->s_mount_opt, GRPID);
+			break;
+		case Opt_resuid:
+			if (match_int(&args[0], &option))
+				return 0;
+			uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(uid)) {
+				ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option);
+				return 0;
+
+			}
+			sbi->s_resuid = uid;
+			break;
+		case Opt_resgid:
+			if (match_int(&args[0], &option))
+				return 0;
+			gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(gid)) {
+				ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option);
+				return 0;
+			}
+			sbi->s_resgid = gid;
+			break;
+		case Opt_sb:
+			/* handled by get_sb_block() instead of here */
+			/* *sb_block = match_int(&args[0]); */
+			break;
+		case Opt_err_panic:
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			set_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			break;
+		case Opt_err_ro:
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_RO);
+			break;
+		case Opt_err_cont:
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_CONT);
+			break;
+		case Opt_nouid32:
+			set_opt (sbi->s_mount_opt, NO_UID32);
+			break;
+		case Opt_nocheck:
+			clear_opt (sbi->s_mount_opt, CHECK);
+			break;
+		case Opt_debug:
+			set_opt (sbi->s_mount_opt, DEBUG);
+			break;
+		case Opt_oldalloc:
+			set_opt (sbi->s_mount_opt, OLDALLOC);
+			break;
+		case Opt_orlov:
+			clear_opt (sbi->s_mount_opt, OLDALLOC);
+			break;
+		case Opt_nobh:
+			set_opt (sbi->s_mount_opt, NOBH);
+			break;
+#ifdef CONFIG_EXT2_FS_XATTR
+		case Opt_user_xattr:
+			set_opt (sbi->s_mount_opt, XATTR_USER);
+			break;
+		case Opt_nouser_xattr:
+			clear_opt (sbi->s_mount_opt, XATTR_USER);
+			break;
+#else
+		case Opt_user_xattr:
+		case Opt_nouser_xattr:
+			ext2_msg(sb, KERN_INFO, "(no)user_xattr options"
+				"not supported");
+			break;
+#endif
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+		case Opt_acl:
+			set_opt(sbi->s_mount_opt, POSIX_ACL);
+			break;
+		case Opt_noacl:
+			clear_opt(sbi->s_mount_opt, POSIX_ACL);
+			break;
+#else
+		case Opt_acl:
+		case Opt_noacl:
+			ext2_msg(sb, KERN_INFO,
+				"(no)acl options not supported");
+			break;
+#endif
+		case Opt_xip:
+			ext2_msg(sb, KERN_INFO, "use dax instead of xip");
+			set_opt(sbi->s_mount_opt, XIP);
+			/* Fall through */
+		case Opt_dax:
+#ifdef CONFIG_FS_DAX
+			set_opt(sbi->s_mount_opt, DAX);
+#else
+			ext2_msg(sb, KERN_INFO, "dax option not supported");
+#endif
+			break;
+
+#if defined(CONFIG_QUOTA)
+		case Opt_quota:
+		case Opt_usrquota:
+			set_opt(sbi->s_mount_opt, USRQUOTA);
+			break;
+
+		case Opt_grpquota:
+			set_opt(sbi->s_mount_opt, GRPQUOTA);
+			break;
+#else
+		case Opt_quota:
+		case Opt_usrquota:
+		case Opt_grpquota:
+			ext2_msg(sb, KERN_INFO,
+				"quota operations not supported");
+			break;
+#endif
+
+		case Opt_reservation:
+			set_opt(sbi->s_mount_opt, RESERVATION);
+			ext2_msg(sb, KERN_INFO, "reservations ON");
+			break;
+		case Opt_noreservation:
+			clear_opt(sbi->s_mount_opt, RESERVATION);
+			ext2_msg(sb, KERN_INFO, "reservations OFF");
+			break;
+		case Opt_ignore:
+			break;
+		default:
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int ext2_setup_super (struct super_block * sb,
+			      struct ext2_super_block * es,
+			      int read_only)
+{
+	int res = 0;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+	if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) {
+		ext2_msg(sb, KERN_ERR,
+			"error: revision level too high, "
+			"forcing read-only mode");
+		res = MS_RDONLY;
+	}
+	if (read_only)
+		return res;
+	if (!(sbi->s_mount_state & EXT2_VALID_FS))
+		ext2_msg(sb, KERN_WARNING,
+			"warning: mounting unchecked fs, "
+			"running e2fsck is recommended");
+	else if ((sbi->s_mount_state & EXT2_ERROR_FS))
+		ext2_msg(sb, KERN_WARNING,
+			"warning: mounting fs with errors, "
+			"running e2fsck is recommended");
+	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+		 le16_to_cpu(es->s_mnt_count) >=
+		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+		ext2_msg(sb, KERN_WARNING,
+			"warning: maximal mount count reached, "
+			"running e2fsck is recommended");
+	else if (le32_to_cpu(es->s_checkinterval) &&
+		(le32_to_cpu(es->s_lastcheck) +
+			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+		ext2_msg(sb, KERN_WARNING,
+			"warning: checktime reached, "
+			"running e2fsck is recommended");
+	if (!le16_to_cpu(es->s_max_mnt_count))
+		es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
+	le16_add_cpu(&es->s_mnt_count, 1);
+	if (test_opt (sb, DEBUG))
+		ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
+			"bpg=%lu, ipg=%lu, mo=%04lx]",
+			EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize,
+			sbi->s_frag_size,
+			sbi->s_groups_count,
+			EXT2_BLOCKS_PER_GROUP(sb),
+			EXT2_INODES_PER_GROUP(sb),
+			sbi->s_mount_opt);
+	return res;
+}
+
+static int ext2_check_descriptors(struct super_block *sb)
+{
+	int i;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+	ext2_debug ("Checking group descriptors");
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL);
+		ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i);
+		ext2_fsblk_t last_block;
+
+		if (i == sbi->s_groups_count - 1)
+			last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
+		else
+			last_block = first_block +
+				(EXT2_BLOCKS_PER_GROUP(sb) - 1);
+
+		if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
+		    le32_to_cpu(gdp->bg_block_bitmap) > last_block)
+		{
+			ext2_error (sb, "ext2_check_descriptors",
+				    "Block bitmap for group %d"
+				    " not in group (block %lu)!",
+				    i, (unsigned long) le32_to_cpu(gdp->bg_block_bitmap));
+			return 0;
+		}
+		if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
+		    le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
+		{
+			ext2_error (sb, "ext2_check_descriptors",
+				    "Inode bitmap for group %d"
+				    " not in group (block %lu)!",
+				    i, (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap));
+			return 0;
+		}
+		if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
+		    le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 >
+		    last_block)
+		{
+			ext2_error (sb, "ext2_check_descriptors",
+				    "Inode table for group %d"
+				    " not in group (block %lu)!",
+				    i, (unsigned long) le32_to_cpu(gdp->bg_inode_table));
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Maximal file size.  There is a direct, and {,double-,triple-}indirect
+ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
+ * We need to be 1 filesystem block less than the 2^32 sector limit.
+ */
+static loff_t ext2_max_size(int bits)
+{
+	loff_t res = EXT2_NDIR_BLOCKS;
+	int meta_blocks;
+	loff_t upper_limit;
+
+	/* This is calculated to be the largest file size for a
+	 * dense, file such that the total number of
+	 * sectors in the file, including data and all indirect blocks,
+	 * does not exceed 2^32 -1
+	 * __u32 i_blocks representing the total number of
+	 * 512 bytes blocks of the file
+	 */
+	upper_limit = (1LL << 32) - 1;
+
+	/* total blocks in file system block size */
+	upper_limit >>= (bits - 9);
+
+
+	/* indirect blocks */
+	meta_blocks = 1;
+	/* double indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2));
+	/* tripple indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+
+	upper_limit -= meta_blocks;
+	upper_limit <<= bits;
+
+	res += 1LL << (bits-2);
+	res += 1LL << (2*(bits-2));
+	res += 1LL << (3*(bits-2));
+	res <<= bits;
+	if (res > upper_limit)
+		res = upper_limit;
+
+	if (res > MAX_LFS_FILESIZE)
+		res = MAX_LFS_FILESIZE;
+
+	return res;
+}
+
+static unsigned long descriptor_loc(struct super_block *sb,
+				    unsigned long logic_sb_block,
+				    int nr)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	unsigned long bg, first_meta_bg;
+	int has_super = 0;
+	
+	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+
+	if (!EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_META_BG) ||
+	    nr < first_meta_bg)
+		return (logic_sb_block + nr + 1);
+	bg = sbi->s_desc_per_block * nr;
+	if (ext2_bg_has_super(sb, bg))
+		has_super = 1;
+
+	return ext2_group_first_block_no(sb, bg) + has_super;
+}
+
+static int ext2_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct buffer_head * bh;
+	struct ext2_sb_info * sbi;
+	struct ext2_super_block * es;
+	struct inode *root;
+	unsigned long block;
+	unsigned long sb_block = get_sb_block(&data);
+	unsigned long logic_sb_block;
+	unsigned long offset = 0;
+	unsigned long def_mount_opts;
+	long ret = -EINVAL;
+	int blocksize = BLOCK_SIZE;
+	int db_count;
+	int i, j;
+	__le32 features;
+	int err;
+
+	err = -ENOMEM;
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		goto failed;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		goto failed;
+	}
+	sb->s_fs_info = sbi;
+	sbi->s_sb_block = sb_block;
+
+	spin_lock_init(&sbi->s_lock);
+
+	/*
+	 * See what the current blocksize for the device is, and
+	 * use that as the blocksize.  Otherwise (or if the blocksize
+	 * is smaller than the default) use the default.
+	 * This is important for devices that have a hardware
+	 * sectorsize that is larger than the default.
+	 */
+	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
+	if (!blocksize) {
+		ext2_msg(sb, KERN_ERR, "error: unable to set blocksize");
+		goto failed_sbi;
+	}
+
+	/*
+	 * If the superblock doesn't start on a hardware sector boundary,
+	 * calculate the offset.  
+	 */
+	if (blocksize != BLOCK_SIZE) {
+		logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize;
+		offset = (sb_block*BLOCK_SIZE) % blocksize;
+	} else {
+		logic_sb_block = sb_block;
+	}
+
+	if (!(bh = sb_bread(sb, logic_sb_block))) {
+		ext2_msg(sb, KERN_ERR, "error: unable to read superblock");
+		goto failed_sbi;
+	}
+	/*
+	 * Note: s_es must be initialized as soon as possible because
+	 *       some ext2 macro-instructions depend on its value
+	 */
+	es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
+	sbi->s_es = es;
+	sb->s_magic = le16_to_cpu(es->s_magic);
+
+	if (sb->s_magic != EXT2_SUPER_MAGIC)
+		goto cantfind_ext2;
+
+	/* Set defaults before we parse the mount options */
+	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+	if (def_mount_opts & EXT2_DEFM_DEBUG)
+		set_opt(sbi->s_mount_opt, DEBUG);
+	if (def_mount_opts & EXT2_DEFM_BSDGROUPS)
+		set_opt(sbi->s_mount_opt, GRPID);
+	if (def_mount_opts & EXT2_DEFM_UID16)
+		set_opt(sbi->s_mount_opt, NO_UID32);
+#ifdef CONFIG_EXT2_FS_XATTR
+	if (def_mount_opts & EXT2_DEFM_XATTR_USER)
+		set_opt(sbi->s_mount_opt, XATTR_USER);
+#endif
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+	if (def_mount_opts & EXT2_DEFM_ACL)
+		set_opt(sbi->s_mount_opt, POSIX_ACL);
+#endif
+	
+	if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC)
+		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE)
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_RO);
+
+	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+	
+	set_opt(sbi->s_mount_opt, RESERVATION);
+
+	if (!parse_options((char *) data, sb))
+		goto failed_mount;
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
+		 MS_POSIXACL : 0);
+
+	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
+	    (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
+	     EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+	     EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+		ext2_msg(sb, KERN_WARNING,
+			"warning: feature flags set on rev 0 fs, "
+			"running e2fsck is recommended");
+	/*
+	 * Check feature flags regardless of the revision level, since we
+	 * previously didn't change the revision level when setting the flags,
+	 * so there is a chance incompat flags are set on a rev 0 filesystem.
+	 */
+	features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP);
+	if (features) {
+		ext2_msg(sb, KERN_ERR,	"error: couldn't mount because of "
+		       "unsupported optional features (%x)",
+			le32_to_cpu(features));
+		goto failed_mount;
+	}
+	if (!(sb->s_flags & MS_RDONLY) &&
+	    (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){
+		ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of "
+		       "unsupported optional features (%x)",
+		       le32_to_cpu(features));
+		goto failed_mount;
+	}
+
+	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+
+	if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
+		if (blocksize != PAGE_SIZE) {
+			ext2_msg(sb, KERN_ERR,
+					"error: unsupported blocksize for dax");
+			goto failed_mount;
+		}
+		if (!sb->s_bdev->bd_disk->fops->direct_access) {
+			ext2_msg(sb, KERN_ERR,
+					"error: device does not support dax");
+			goto failed_mount;
+		}
+	}
+
+	/* If the blocksize doesn't match, re-read the thing.. */
+	if (sb->s_blocksize != blocksize) {
+		brelse(bh);
+
+		if (!sb_set_blocksize(sb, blocksize)) {
+			ext2_msg(sb, KERN_ERR,
+				"error: bad blocksize %d", blocksize);
+			goto failed_sbi;
+		}
+
+		logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize;
+		offset = (sb_block*BLOCK_SIZE) % blocksize;
+		bh = sb_bread(sb, logic_sb_block);
+		if(!bh) {
+			ext2_msg(sb, KERN_ERR, "error: couldn't read"
+				"superblock on 2nd try");
+			goto failed_sbi;
+		}
+		es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
+		sbi->s_es = es;
+		if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
+			ext2_msg(sb, KERN_ERR, "error: magic mismatch");
+			goto failed_mount;
+		}
+	}
+
+	sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits);
+	sb->s_max_links = EXT2_LINK_MAX;
+
+	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) {
+		sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE;
+		sbi->s_first_ino = EXT2_GOOD_OLD_FIRST_INO;
+	} else {
+		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+		if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) ||
+		    !is_power_of_2(sbi->s_inode_size) ||
+		    (sbi->s_inode_size > blocksize)) {
+			ext2_msg(sb, KERN_ERR,
+				"error: unsupported inode size: %d",
+				sbi->s_inode_size);
+			goto failed_mount;
+		}
+	}
+
+	sbi->s_frag_size = EXT2_MIN_FRAG_SIZE <<
+				   le32_to_cpu(es->s_log_frag_size);
+	if (sbi->s_frag_size == 0)
+		goto cantfind_ext2;
+	sbi->s_frags_per_block = sb->s_blocksize / sbi->s_frag_size;
+
+	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
+	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+
+	if (EXT2_INODE_SIZE(sb) == 0)
+		goto cantfind_ext2;
+	sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
+	if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
+		goto cantfind_ext2;
+	sbi->s_itb_per_group = sbi->s_inodes_per_group /
+					sbi->s_inodes_per_block;
+	sbi->s_desc_per_block = sb->s_blocksize /
+					sizeof (struct ext2_group_desc);
+	sbi->s_sbh = bh;
+	sbi->s_mount_state = le16_to_cpu(es->s_state);
+	sbi->s_addr_per_block_bits =
+		ilog2 (EXT2_ADDR_PER_BLOCK(sb));
+	sbi->s_desc_per_block_bits =
+		ilog2 (EXT2_DESC_PER_BLOCK(sb));
+
+	if (sb->s_magic != EXT2_SUPER_MAGIC)
+		goto cantfind_ext2;
+
+	if (sb->s_blocksize != bh->b_size) {
+		if (!silent)
+			ext2_msg(sb, KERN_ERR, "error: unsupported blocksize");
+		goto failed_mount;
+	}
+
+	if (sb->s_blocksize != sbi->s_frag_size) {
+		ext2_msg(sb, KERN_ERR,
+			"error: fragsize %lu != blocksize %lu"
+			"(not supported yet)",
+			sbi->s_frag_size, sb->s_blocksize);
+		goto failed_mount;
+	}
+
+	if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
+		ext2_msg(sb, KERN_ERR,
+			"error: #blocks per group too big: %lu",
+			sbi->s_blocks_per_group);
+		goto failed_mount;
+	}
+	if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
+		ext2_msg(sb, KERN_ERR,
+			"error: #fragments per group too big: %lu",
+			sbi->s_frags_per_group);
+		goto failed_mount;
+	}
+	if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
+		ext2_msg(sb, KERN_ERR,
+			"error: #inodes per group too big: %lu",
+			sbi->s_inodes_per_group);
+		goto failed_mount;
+	}
+
+	if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
+		goto cantfind_ext2;
+ 	sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
+ 				le32_to_cpu(es->s_first_data_block) - 1)
+ 					/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
+	db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
+		   EXT2_DESC_PER_BLOCK(sb);
+	sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
+	if (sbi->s_group_desc == NULL) {
+		ext2_msg(sb, KERN_ERR, "error: not enough memory");
+		goto failed_mount;
+	}
+	bgl_lock_init(sbi->s_blockgroup_lock);
+	sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
+	if (!sbi->s_debts) {
+		ext2_msg(sb, KERN_ERR, "error: not enough memory");
+		goto failed_mount_group_desc;
+	}
+	for (i = 0; i < db_count; i++) {
+		block = descriptor_loc(sb, logic_sb_block, i);
+		sbi->s_group_desc[i] = sb_bread(sb, block);
+		if (!sbi->s_group_desc[i]) {
+			for (j = 0; j < i; j++)
+				brelse (sbi->s_group_desc[j]);
+			ext2_msg(sb, KERN_ERR,
+				"error: unable to read group descriptors");
+			goto failed_mount_group_desc;
+		}
+	}
+	if (!ext2_check_descriptors (sb)) {
+		ext2_msg(sb, KERN_ERR, "group descriptors corrupted");
+		goto failed_mount2;
+	}
+	sbi->s_gdb_count = db_count;
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	/* per fileystem reservation list head & lock */
+	spin_lock_init(&sbi->s_rsv_window_lock);
+	sbi->s_rsv_window_root = RB_ROOT;
+	/*
+	 * Add a single, static dummy reservation to the start of the
+	 * reservation window list --- it gives us a placeholder for
+	 * append-at-start-of-list which makes the allocation logic
+	 * _much_ simpler.
+	 */
+	sbi->s_rsv_window_head.rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED;
+	sbi->s_rsv_window_head.rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED;
+	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
+	sbi->s_rsv_window_head.rsv_goal_size = 0;
+	ext2_rsv_window_add(sb, &sbi->s_rsv_window_head);
+
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+				ext2_count_free_blocks(sb), GFP_KERNEL);
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext2_count_free_inodes(sb), GFP_KERNEL);
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+				ext2_count_dirs(sb), GFP_KERNEL);
+	}
+	if (err) {
+		ext2_msg(sb, KERN_ERR, "error: insufficient memory");
+		goto failed_mount3;
+	}
+	/*
+	 * set up enough so that it can read an inode
+	 */
+	sb->s_op = &ext2_sops;
+	sb->s_export_op = &ext2_export_ops;
+	sb->s_xattr = ext2_xattr_handlers;
+
+#ifdef CONFIG_QUOTA
+	sb->dq_op = &dquot_operations;
+	sb->s_qcop = &dquot_quotactl_ops;
+	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+#endif
+
+	root = ext2_iget(sb, EXT2_ROOT_INO);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto failed_mount3;
+	}
+	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+		iput(root);
+		ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
+		goto failed_mount3;
+	}
+
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		ext2_msg(sb, KERN_ERR, "error: get root inode failed");
+		ret = -ENOMEM;
+		goto failed_mount3;
+	}
+	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
+		ext2_msg(sb, KERN_WARNING,
+			"warning: mounting ext3 filesystem as ext2");
+	if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
+		sb->s_flags |= MS_RDONLY;
+	ext2_write_super(sb);
+	return 0;
+
+cantfind_ext2:
+	if (!silent)
+		ext2_msg(sb, KERN_ERR,
+			"error: can't find an ext2 filesystem on dev %s.",
+			sb->s_id);
+	goto failed_mount;
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+failed_mount2:
+	for (i = 0; i < db_count; i++)
+		brelse(sbi->s_group_desc[i]);
+failed_mount_group_desc:
+	kfree(sbi->s_group_desc);
+	kfree(sbi->s_debts);
+failed_mount:
+	brelse(bh);
+failed_sbi:
+	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
+	kfree(sbi);
+failed:
+	return ret;
+}
+
+static void ext2_clear_super_error(struct super_block *sb)
+{
+	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
+
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		ext2_msg(sb, KERN_ERR,
+		       "previous I/O error to superblock detected\n");
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
+}
+
+static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+			    int wait)
+{
+	ext2_clear_super_error(sb);
+	spin_lock(&EXT2_SB(sb)->s_lock);
+	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
+	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
+	es->s_wtime = cpu_to_le32(get_seconds());
+	/* unlock before we do IO */
+	spin_unlock(&EXT2_SB(sb)->s_lock);
+	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
+	if (wait)
+		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
+}
+
+/*
+ * In the second extended file system, it is not necessary to
+ * write the super block since we use a mapping of the
+ * disk super block in a buffer.
+ *
+ * However, this function is still used to set the fs valid
+ * flags to 0.  We need to set this flag to 0 since the fs
+ * may have been checked while mounted and e2fsck may have
+ * set s_state to EXT2_VALID_FS after some corrections.
+ */
+static int ext2_sync_fs(struct super_block *sb, int wait)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+
+	/*
+	 * Write quota structures to quota file, sync_blockdev() will write
+	 * them to disk later
+	 */
+	dquot_writeback_dquots(sb, -1);
+
+	spin_lock(&sbi->s_lock);
+	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
+		ext2_debug("setting valid to 0\n");
+		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
+	}
+	spin_unlock(&sbi->s_lock);
+	ext2_sync_super(sb, es, wait);
+	return 0;
+}
+
+static int ext2_freeze(struct super_block *sb)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+	/*
+	 * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared
+	 * because we have unattached inodes and thus filesystem is not fully
+	 * consistent.
+	 */
+	if (atomic_long_read(&sb->s_remove_count)) {
+		ext2_sync_fs(sb, 1);
+		return 0;
+	}
+	/* Set EXT2_FS_VALID flag */
+	spin_lock(&sbi->s_lock);
+	sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state);
+	spin_unlock(&sbi->s_lock);
+	ext2_sync_super(sb, sbi->s_es, 1);
+
+	return 0;
+}
+
+static int ext2_unfreeze(struct super_block *sb)
+{
+	/* Just write sb to clear EXT2_VALID_FS flag */
+	ext2_write_super(sb);
+
+	return 0;
+}
+
+void ext2_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		ext2_sync_fs(sb, 1);
+}
+
+static int ext2_remount (struct super_block * sb, int * flags, char * data)
+{
+	struct ext2_sb_info * sbi = EXT2_SB(sb);
+	struct ext2_super_block * es;
+	struct ext2_mount_options old_opts;
+	unsigned long old_sb_flags;
+	int err;
+
+	sync_filesystem(sb);
+	spin_lock(&sbi->s_lock);
+
+	/* Store the old options */
+	old_sb_flags = sb->s_flags;
+	old_opts.s_mount_opt = sbi->s_mount_opt;
+	old_opts.s_resuid = sbi->s_resuid;
+	old_opts.s_resgid = sbi->s_resgid;
+
+	/*
+	 * Allow the "check" option to be passed as a remount option.
+	 */
+	if (!parse_options(data, sb)) {
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+	es = sbi->s_es;
+	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
+		ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
+			 "dax flag with busy inodes while remounting");
+		sbi->s_mount_opt ^= EXT2_MOUNT_DAX;
+	}
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		spin_unlock(&sbi->s_lock);
+		return 0;
+	}
+	if (*flags & MS_RDONLY) {
+		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
+		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
+			spin_unlock(&sbi->s_lock);
+			return 0;
+		}
+
+		/*
+		 * OK, we are remounting a valid rw partition rdonly, so set
+		 * the rdonly flag and then mark the partition as valid again.
+		 */
+		es->s_state = cpu_to_le16(sbi->s_mount_state);
+		es->s_mtime = cpu_to_le32(get_seconds());
+		spin_unlock(&sbi->s_lock);
+
+		err = dquot_suspend(sb, -1);
+		if (err < 0) {
+			spin_lock(&sbi->s_lock);
+			goto restore_opts;
+		}
+
+		ext2_sync_super(sb, es, 1);
+	} else {
+		__le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
+					       ~EXT2_FEATURE_RO_COMPAT_SUPP);
+		if (ret) {
+			ext2_msg(sb, KERN_WARNING,
+				"warning: couldn't remount RDWR because of "
+				"unsupported optional features (%x).",
+				le32_to_cpu(ret));
+			err = -EROFS;
+			goto restore_opts;
+		}
+		/*
+		 * Mounting a RDONLY partition read-write, so reread and
+		 * store the current valid flag.  (It may have been changed
+		 * by e2fsck since we originally mounted the partition.)
+		 */
+		sbi->s_mount_state = le16_to_cpu(es->s_state);
+		if (!ext2_setup_super (sb, es, 0))
+			sb->s_flags &= ~MS_RDONLY;
+		spin_unlock(&sbi->s_lock);
+
+		ext2_write_super(sb);
+
+		dquot_resume(sb, -1);
+	}
+
+	return 0;
+restore_opts:
+	sbi->s_mount_opt = old_opts.s_mount_opt;
+	sbi->s_resuid = old_opts.s_resuid;
+	sbi->s_resgid = old_opts.s_resgid;
+	sb->s_flags = old_sb_flags;
+	spin_unlock(&sbi->s_lock);
+	return err;
+}
+
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_super_block *es = sbi->s_es;
+	u64 fsid;
+
+	spin_lock(&sbi->s_lock);
+
+	if (test_opt (sb, MINIX_DF))
+		sbi->s_overhead_last = 0;
+	else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
+		unsigned long i, overhead = 0;
+		smp_rmb();
+
+		/*
+		 * Compute the overhead (FS structures). This is constant
+		 * for a given filesystem unless the number of block groups
+		 * changes so we cache the previous value until it does.
+		 */
+
+		/*
+		 * All of the blocks before first_data_block are
+		 * overhead
+		 */
+		overhead = le32_to_cpu(es->s_first_data_block);
+
+		/*
+		 * Add the overhead attributed to the superblock and
+		 * block group descriptors.  If the sparse superblocks
+		 * feature is turned on, then not all groups have this.
+		 */
+		for (i = 0; i < sbi->s_groups_count; i++)
+			overhead += ext2_bg_has_super(sb, i) +
+				ext2_bg_num_gdb(sb, i);
+
+		/*
+		 * Every block group has an inode bitmap, a block
+		 * bitmap, and an inode table.
+		 */
+		overhead += (sbi->s_groups_count *
+			     (2 + sbi->s_itb_per_group));
+		sbi->s_overhead_last = overhead;
+		smp_wmb();
+		sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
+	}
+
+	buf->f_type = EXT2_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
+	buf->f_bfree = ext2_count_free_blocks(sb);
+	es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
+	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
+	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
+		buf->f_bavail = 0;
+	buf->f_files = le32_to_cpu(es->s_inodes_count);
+	buf->f_ffree = ext2_count_free_inodes(sb);
+	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
+	buf->f_namelen = EXT2_NAME_LEN;
+	fsid = le64_to_cpup((void *)es->s_uuid) ^
+	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	spin_unlock(&sbi->s_lock);
+	return 0;
+}
+
+static struct dentry *ext2_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+}
+
+#ifdef CONFIG_QUOTA
+
+/* Read data from quotafile - avoid pagecache and such because we cannot afford
+ * acquiring the locks... As quota files are never truncated and quota code
+ * itself serializes the operations (and no one else should touch the files)
+ * we don't have to be afraid of races */
+static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	sector_t blk = off >> EXT2_BLOCK_SIZE_BITS(sb);
+	int err = 0;
+	int offset = off & (sb->s_blocksize - 1);
+	int tocopy;
+	size_t toread;
+	struct buffer_head tmp_bh;
+	struct buffer_head *bh;
+	loff_t i_size = i_size_read(inode);
+
+	if (off > i_size)
+		return 0;
+	if (off+len > i_size)
+		len = i_size-off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = sb->s_blocksize - offset < toread ?
+				sb->s_blocksize - offset : toread;
+
+		tmp_bh.b_state = 0;
+		tmp_bh.b_size = sb->s_blocksize;
+		err = ext2_get_block(inode, blk, &tmp_bh, 0);
+		if (err < 0)
+			return err;
+		if (!buffer_mapped(&tmp_bh))	/* A hole? */
+			memset(data, 0, tocopy);
+		else {
+			bh = sb_bread(sb, tmp_bh.b_blocknr);
+			if (!bh)
+				return -EIO;
+			memcpy(data, bh->b_data+offset, tocopy);
+			brelse(bh);
+		}
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write to quotafile */
+static ssize_t ext2_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	sector_t blk = off >> EXT2_BLOCK_SIZE_BITS(sb);
+	int err = 0;
+	int offset = off & (sb->s_blocksize - 1);
+	int tocopy;
+	size_t towrite = len;
+	struct buffer_head tmp_bh;
+	struct buffer_head *bh;
+
+	while (towrite > 0) {
+		tocopy = sb->s_blocksize - offset < towrite ?
+				sb->s_blocksize - offset : towrite;
+
+		tmp_bh.b_state = 0;
+		tmp_bh.b_size = sb->s_blocksize;
+		err = ext2_get_block(inode, blk, &tmp_bh, 1);
+		if (err < 0)
+			goto out;
+		if (offset || tocopy != EXT2_BLOCK_SIZE(sb))
+			bh = sb_bread(sb, tmp_bh.b_blocknr);
+		else
+			bh = sb_getblk(sb, tmp_bh.b_blocknr);
+		if (unlikely(!bh)) {
+			err = -EIO;
+			goto out;
+		}
+		lock_buffer(bh);
+		memcpy(bh->b_data+offset, data, tocopy);
+		flush_dcache_page(bh->b_page);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		brelse(bh);
+		offset = 0;
+		towrite -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+out:
+	if (len == towrite)
+		return err;
+	if (inode->i_size < off+len-towrite)
+		i_size_write(inode, off+len-towrite);
+	inode->i_version++;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	return len - towrite;
+}
+
+#endif
+
+static struct file_system_type ext2_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext2",
+	.mount		= ext2_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ext2");
+
+static int __init init_ext2_fs(void)
+{
+	int err = init_ext2_xattr();
+	if (err)
+		return err;
+	err = init_inodecache();
+	if (err)
+		goto out1;
+        err = register_filesystem(&ext2_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	exit_ext2_xattr();
+	return err;
+}
+
+static void __exit exit_ext2_fs(void)
+{
+	unregister_filesystem(&ext2_fs_type);
+	destroy_inodecache();
+	exit_ext2_xattr();
+}
+
+MODULE_AUTHOR("Remy Card and others");
+MODULE_DESCRIPTION("Second Extended Filesystem");
+MODULE_LICENSE("GPL");
+module_init(init_ext2_fs)
+module_exit(exit_ext2_fs)
diff --git a/kernel/fs/ext2/symlink.c b/kernel/fs/ext2/symlink.c
new file mode 100644
index 000000000..20608f17c
--- /dev/null
+++ b/kernel/fs/ext2/symlink.c
@@ -0,0 +1,54 @@
+/*
+ *  linux/fs/ext2/symlink.c
+ *
+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/symlink.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext2 symlink handling code
+ */
+
+#include "ext2.h"
+#include "xattr.h"
+#include <linux/namei.h>
+
+static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ext2_inode_info *ei = EXT2_I(d_inode(dentry));
+	nd_set_link(nd, (char *)ei->i_data);
+	return NULL;
+}
+
+const struct inode_operations ext2_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= ext2_setattr,
+#ifdef CONFIG_EXT2_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext2_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
+ 
+const struct inode_operations ext2_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= ext2_follow_link,
+	.setattr	= ext2_setattr,
+#ifdef CONFIG_EXT2_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext2_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
diff --git a/kernel/fs/ext2/xattr.c b/kernel/fs/ext2/xattr.c
new file mode 100644
index 000000000..0b6bfd3a3
--- /dev/null
+++ b/kernel/fs/ext2/xattr.c
@@ -0,0 +1,1029 @@
+/*
+ * linux/fs/ext2/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Extended attributes for symlinks and special files added per
+ *  suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ *  Red Hat Inc.
+ *
+ */
+
+/*
+ * Extended attributes are stored on disk blocks allocated outside of
+ * any inode. The i_file_acl field is then made to point to this allocated
+ * block. If all extended attributes of an inode are identical, these
+ * inodes may share the same extended attribute block. Such situations
+ * are automatically detected by keeping a cache of recent attribute block
+ * numbers and hashes over the block's contents in memory.
+ *
+ *
+ * Extended attribute block layout:
+ *
+ *   +------------------+
+ *   | header           |
+ *   | entry 1          | |
+ *   | entry 2          | | growing downwards
+ *   | entry 3          | v
+ *   | four null bytes  |
+ *   | . . .            |
+ *   | value 1          | ^
+ *   | value 3          | | growing upwards
+ *   | value 2          | |
+ *   +------------------+
+ *
+ * The block header is followed by multiple entry descriptors. These entry
+ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
+ * byte boundaries. The entry descriptors are sorted by attribute name,
+ * so that two extended attribute blocks can be compared efficiently.
+ *
+ * Attribute values are aligned to the end of the block, stored in
+ * no specific order. They are also padded to EXT2_XATTR_PAD byte
+ * boundaries. No additional gaps are left between them.
+ *
+ * Locking strategy
+ * ----------------
+ * EXT2_I(inode)->i_file_acl is protected by EXT2_I(inode)->xattr_sem.
+ * EA blocks are only changed if they are exclusive to an inode, so
+ * holding xattr_sem also means that nothing but the EA block's reference
+ * count will change. Multiple writers to an EA block are synchronized
+ * by the bh lock. No more than a single bh lock is held at any time
+ * to avoid deadlocks.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/mbcache.h>
+#include <linux/quotaops.h>
+#include <linux/rwsem.h>
+#include <linux/security.h>
+#include "ext2.h"
+#include "xattr.h"
+#include "acl.h"
+
+#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr))
+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#ifdef EXT2_XATTR_DEBUG
+# define ea_idebug(inode, f...) do { \
+		printk(KERN_DEBUG "inode %s:%ld: ", \
+			inode->i_sb->s_id, inode->i_ino); \
+		printk(f); \
+		printk("\n"); \
+	} while (0)
+# define ea_bdebug(bh, f...) do { \
+		char b[BDEVNAME_SIZE]; \
+		printk(KERN_DEBUG "block %s:%lu: ", \
+			bdevname(bh->b_bdev, b), \
+			(unsigned long) bh->b_blocknr); \
+		printk(f); \
+		printk("\n"); \
+	} while (0)
+#else
+# define ea_idebug(f...)
+# define ea_bdebug(f...)
+#endif
+
+static int ext2_xattr_set2(struct inode *, struct buffer_head *,
+			   struct ext2_xattr_header *);
+
+static int ext2_xattr_cache_insert(struct buffer_head *);
+static struct buffer_head *ext2_xattr_cache_find(struct inode *,
+						 struct ext2_xattr_header *);
+static void ext2_xattr_rehash(struct ext2_xattr_header *,
+			      struct ext2_xattr_entry *);
+
+static struct mb_cache *ext2_xattr_cache;
+
+static const struct xattr_handler *ext2_xattr_handler_map[] = {
+	[EXT2_XATTR_INDEX_USER]		     = &ext2_xattr_user_handler,
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
+	[EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
+#endif
+	[EXT2_XATTR_INDEX_TRUSTED]	     = &ext2_xattr_trusted_handler,
+#ifdef CONFIG_EXT2_FS_SECURITY
+	[EXT2_XATTR_INDEX_SECURITY]	     = &ext2_xattr_security_handler,
+#endif
+};
+
+const struct xattr_handler *ext2_xattr_handlers[] = {
+	&ext2_xattr_user_handler,
+	&ext2_xattr_trusted_handler,
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+#ifdef CONFIG_EXT2_FS_SECURITY
+	&ext2_xattr_security_handler,
+#endif
+	NULL
+};
+
+static inline const struct xattr_handler *
+ext2_xattr_handler(int name_index)
+{
+	const struct xattr_handler *handler = NULL;
+
+	if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
+		handler = ext2_xattr_handler_map[name_index];
+	return handler;
+}
+
+/*
+ * ext2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+int
+ext2_xattr_get(struct inode *inode, int name_index, const char *name,
+	       void *buffer, size_t buffer_size)
+{
+	struct buffer_head *bh = NULL;
+	struct ext2_xattr_entry *entry;
+	size_t name_len, size;
+	char *end;
+	int error;
+
+	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
+		  name_index, name, buffer, (long)buffer_size);
+
+	if (name == NULL)
+		return -EINVAL;
+	name_len = strlen(name);
+	if (name_len > 255)
+		return -ERANGE;
+
+	down_read(&EXT2_I(inode)->xattr_sem);
+	error = -ENODATA;
+	if (!EXT2_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
+	error = -EIO;
+	if (!bh)
+		goto cleanup;
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
+	end = bh->b_data + bh->b_size;
+	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
+	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
+bad_block:	ext2_error(inode->i_sb, "ext2_xattr_get",
+			"inode %ld: bad block %d", inode->i_ino,
+			EXT2_I(inode)->i_file_acl);
+		error = -EIO;
+		goto cleanup;
+	}
+
+	/* find named attribute */
+	entry = FIRST_ENTRY(bh);
+	while (!IS_LAST_ENTRY(entry)) {
+		struct ext2_xattr_entry *next =
+			EXT2_XATTR_NEXT(entry);
+		if ((char *)next >= end)
+			goto bad_block;
+		if (name_index == entry->e_name_index &&
+		    name_len == entry->e_name_len &&
+		    memcmp(name, entry->e_name, name_len) == 0)
+			goto found;
+		entry = next;
+	}
+	if (ext2_xattr_cache_insert(bh))
+		ea_idebug(inode, "cache insert failed");
+	error = -ENODATA;
+	goto cleanup;
+found:
+	/* check the buffer size */
+	if (entry->e_value_block != 0)
+		goto bad_block;
+	size = le32_to_cpu(entry->e_value_size);
+	if (size > inode->i_sb->s_blocksize ||
+	    le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
+		goto bad_block;
+
+	if (ext2_xattr_cache_insert(bh))
+		ea_idebug(inode, "cache insert failed");
+	if (buffer) {
+		error = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		/* return value of attribute */
+		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
+			size);
+	}
+	error = size;
+
+cleanup:
+	brelse(bh);
+	up_read(&EXT2_I(inode)->xattr_sem);
+
+	return error;
+}
+
+/*
+ * ext2_xattr_list()
+ *
+ * Copy a list of attribute names into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+static int
+ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct buffer_head *bh = NULL;
+	struct ext2_xattr_entry *entry;
+	char *end;
+	size_t rest = buffer_size;
+	int error;
+
+	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
+		  buffer, (long)buffer_size);
+
+	down_read(&EXT2_I(inode)->xattr_sem);
+	error = 0;
+	if (!EXT2_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
+	error = -EIO;
+	if (!bh)
+		goto cleanup;
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
+	end = bh->b_data + bh->b_size;
+	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
+	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
+bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
+			"inode %ld: bad block %d", inode->i_ino,
+			EXT2_I(inode)->i_file_acl);
+		error = -EIO;
+		goto cleanup;
+	}
+
+	/* check the on-disk data structure */
+	entry = FIRST_ENTRY(bh);
+	while (!IS_LAST_ENTRY(entry)) {
+		struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(entry);
+
+		if ((char *)next >= end)
+			goto bad_block;
+		entry = next;
+	}
+	if (ext2_xattr_cache_insert(bh))
+		ea_idebug(inode, "cache insert failed");
+
+	/* list the attribute names */
+	for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
+	     entry = EXT2_XATTR_NEXT(entry)) {
+		const struct xattr_handler *handler =
+			ext2_xattr_handler(entry->e_name_index);
+
+		if (handler) {
+			size_t size = handler->list(dentry, buffer, rest,
+						    entry->e_name,
+						    entry->e_name_len,
+						    handler->flags);
+			if (buffer) {
+				if (size > rest) {
+					error = -ERANGE;
+					goto cleanup;
+				}
+				buffer += size;
+			}
+			rest -= size;
+		}
+	}
+	error = buffer_size - rest;  /* total size */
+
+cleanup:
+	brelse(bh);
+	up_read(&EXT2_I(inode)->xattr_sem);
+
+	return error;
+}
+
+/*
+ * Inode operation listxattr()
+ *
+ * d_inode(dentry)->i_mutex: don't care
+ */
+ssize_t
+ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	return ext2_xattr_list(dentry, buffer, size);
+}
+
+/*
+ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is
+ * not set, set it.
+ */
+static void ext2_xattr_update_super_block(struct super_block *sb)
+{
+	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
+		return;
+
+	spin_lock(&EXT2_SB(sb)->s_lock);
+	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
+	spin_unlock(&EXT2_SB(sb)->s_lock);
+	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
+}
+
+/*
+ * ext2_xattr_set()
+ *
+ * Create, replace or remove an extended attribute for this inode.  Value
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int
+ext2_xattr_set(struct inode *inode, int name_index, const char *name,
+	       const void *value, size_t value_len, int flags)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh = NULL;
+	struct ext2_xattr_header *header = NULL;
+	struct ext2_xattr_entry *here, *last;
+	size_t name_len, free, min_offs = sb->s_blocksize;
+	int not_found = 1, error;
+	char *end;
+	
+	/*
+	 * header -- Points either into bh, or to a temporarily
+	 *           allocated buffer.
+	 * here -- The named entry found, or the place for inserting, within
+	 *         the block pointed to by header.
+	 * last -- Points right after the last named entry within the block
+	 *         pointed to by header.
+	 * min_offs -- The offset of the first value (values are aligned
+	 *             towards the end of the block).
+	 * end -- Points right after the block pointed to by header.
+	 */
+	
+	ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
+		  name_index, name, value, (long)value_len);
+
+	if (value == NULL)
+		value_len = 0;
+	if (name == NULL)
+		return -EINVAL;
+	name_len = strlen(name);
+	if (name_len > 255 || value_len > sb->s_blocksize)
+		return -ERANGE;
+	down_write(&EXT2_I(inode)->xattr_sem);
+	if (EXT2_I(inode)->i_file_acl) {
+		/* The inode already has an extended attribute block. */
+		bh = sb_bread(sb, EXT2_I(inode)->i_file_acl);
+		error = -EIO;
+		if (!bh)
+			goto cleanup;
+		ea_bdebug(bh, "b_count=%d, refcount=%d",
+			atomic_read(&(bh->b_count)),
+			le32_to_cpu(HDR(bh)->h_refcount));
+		header = HDR(bh);
+		end = bh->b_data + bh->b_size;
+		if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
+		    header->h_blocks != cpu_to_le32(1)) {
+bad_block:		ext2_error(sb, "ext2_xattr_set",
+				"inode %ld: bad block %d", inode->i_ino, 
+				   EXT2_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		/* Find the named attribute. */
+		here = FIRST_ENTRY(bh);
+		while (!IS_LAST_ENTRY(here)) {
+			struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here);
+			if ((char *)next >= end)
+				goto bad_block;
+			if (!here->e_value_block && here->e_value_size) {
+				size_t offs = le16_to_cpu(here->e_value_offs);
+				if (offs < min_offs)
+					min_offs = offs;
+			}
+			not_found = name_index - here->e_name_index;
+			if (!not_found)
+				not_found = name_len - here->e_name_len;
+			if (!not_found)
+				not_found = memcmp(name, here->e_name,name_len);
+			if (not_found <= 0)
+				break;
+			here = next;
+		}
+		last = here;
+		/* We still need to compute min_offs and last. */
+		while (!IS_LAST_ENTRY(last)) {
+			struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last);
+			if ((char *)next >= end)
+				goto bad_block;
+			if (!last->e_value_block && last->e_value_size) {
+				size_t offs = le16_to_cpu(last->e_value_offs);
+				if (offs < min_offs)
+					min_offs = offs;
+			}
+			last = next;
+		}
+
+		/* Check whether we have enough space left. */
+		free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
+	} else {
+		/* We will use a new extended attribute block. */
+		free = sb->s_blocksize -
+			sizeof(struct ext2_xattr_header) - sizeof(__u32);
+		here = last = NULL;  /* avoid gcc uninitialized warning. */
+	}
+
+	if (not_found) {
+		/* Request to remove a nonexistent attribute? */
+		error = -ENODATA;
+		if (flags & XATTR_REPLACE)
+			goto cleanup;
+		error = 0;
+		if (value == NULL)
+			goto cleanup;
+	} else {
+		/* Request to create an existing attribute? */
+		error = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto cleanup;
+		if (!here->e_value_block && here->e_value_size) {
+			size_t size = le32_to_cpu(here->e_value_size);
+
+			if (le16_to_cpu(here->e_value_offs) + size > 
+			    sb->s_blocksize || size > sb->s_blocksize)
+				goto bad_block;
+			free += EXT2_XATTR_SIZE(size);
+		}
+		free += EXT2_XATTR_LEN(name_len);
+	}
+	error = -ENOSPC;
+	if (free < EXT2_XATTR_LEN(name_len) + EXT2_XATTR_SIZE(value_len))
+		goto cleanup;
+
+	/* Here we know that we can set the new attribute. */
+
+	if (header) {
+		struct mb_cache_entry *ce;
+
+		/* assert(header == HDR(bh)); */
+		ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
+					bh->b_blocknr);
+		lock_buffer(bh);
+		if (header->h_refcount == cpu_to_le32(1)) {
+			ea_bdebug(bh, "modifying in-place");
+			if (ce)
+				mb_cache_entry_free(ce);
+			/* keep the buffer locked while modifying it. */
+		} else {
+			int offset;
+
+			if (ce)
+				mb_cache_entry_release(ce);
+			unlock_buffer(bh);
+			ea_bdebug(bh, "cloning");
+			header = kmalloc(bh->b_size, GFP_KERNEL);
+			error = -ENOMEM;
+			if (header == NULL)
+				goto cleanup;
+			memcpy(header, HDR(bh), bh->b_size);
+			header->h_refcount = cpu_to_le32(1);
+
+			offset = (char *)here - bh->b_data;
+			here = ENTRY((char *)header + offset);
+			offset = (char *)last - bh->b_data;
+			last = ENTRY((char *)header + offset);
+		}
+	} else {
+		/* Allocate a buffer where we construct the new block. */
+		header = kzalloc(sb->s_blocksize, GFP_KERNEL);
+		error = -ENOMEM;
+		if (header == NULL)
+			goto cleanup;
+		end = (char *)header + sb->s_blocksize;
+		header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
+		header->h_blocks = header->h_refcount = cpu_to_le32(1);
+		last = here = ENTRY(header+1);
+	}
+
+	/* Iff we are modifying the block in-place, bh is locked here. */
+
+	if (not_found) {
+		/* Insert the new name. */
+		size_t size = EXT2_XATTR_LEN(name_len);
+		size_t rest = (char *)last - (char *)here;
+		memmove((char *)here + size, here, rest);
+		memset(here, 0, size);
+		here->e_name_index = name_index;
+		here->e_name_len = name_len;
+		memcpy(here->e_name, name, name_len);
+	} else {
+		if (!here->e_value_block && here->e_value_size) {
+			char *first_val = (char *)header + min_offs;
+			size_t offs = le16_to_cpu(here->e_value_offs);
+			char *val = (char *)header + offs;
+			size_t size = EXT2_XATTR_SIZE(
+				le32_to_cpu(here->e_value_size));
+
+			if (size == EXT2_XATTR_SIZE(value_len)) {
+				/* The old and the new value have the same
+				   size. Just replace. */
+				here->e_value_size = cpu_to_le32(value_len);
+				memset(val + size - EXT2_XATTR_PAD, 0,
+				       EXT2_XATTR_PAD); /* Clear pad bytes. */
+				memcpy(val, value, value_len);
+				goto skip_replace;
+			}
+
+			/* Remove the old value. */
+			memmove(first_val + size, first_val, val - first_val);
+			memset(first_val, 0, size);
+			here->e_value_offs = 0;
+			min_offs += size;
+
+			/* Adjust all value offsets. */
+			last = ENTRY(header+1);
+			while (!IS_LAST_ENTRY(last)) {
+				size_t o = le16_to_cpu(last->e_value_offs);
+				if (!last->e_value_block && o < offs)
+					last->e_value_offs =
+						cpu_to_le16(o + size);
+				last = EXT2_XATTR_NEXT(last);
+			}
+		}
+		if (value == NULL) {
+			/* Remove the old name. */
+			size_t size = EXT2_XATTR_LEN(name_len);
+			last = ENTRY((char *)last - size);
+			memmove(here, (char*)here + size,
+				(char*)last - (char*)here);
+			memset(last, 0, size);
+		}
+	}
+
+	if (value != NULL) {
+		/* Insert the new value. */
+		here->e_value_size = cpu_to_le32(value_len);
+		if (value_len) {
+			size_t size = EXT2_XATTR_SIZE(value_len);
+			char *val = (char *)header + min_offs - size;
+			here->e_value_offs =
+				cpu_to_le16((char *)val - (char *)header);
+			memset(val + size - EXT2_XATTR_PAD, 0,
+			       EXT2_XATTR_PAD); /* Clear the pad bytes. */
+			memcpy(val, value, value_len);
+		}
+	}
+
+skip_replace:
+	if (IS_LAST_ENTRY(ENTRY(header+1))) {
+		/* This block is now empty. */
+		if (bh && header == HDR(bh))
+			unlock_buffer(bh);  /* we were modifying in-place. */
+		error = ext2_xattr_set2(inode, bh, NULL);
+	} else {
+		ext2_xattr_rehash(header, here);
+		if (bh && header == HDR(bh))
+			unlock_buffer(bh);  /* we were modifying in-place. */
+		error = ext2_xattr_set2(inode, bh, header);
+	}
+
+cleanup:
+	brelse(bh);
+	if (!(bh && header == HDR(bh)))
+		kfree(header);
+	up_write(&EXT2_I(inode)->xattr_sem);
+
+	return error;
+}
+
+/*
+ * Second half of ext2_xattr_set(): Update the file system.
+ */
+static int
+ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
+		struct ext2_xattr_header *header)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *new_bh = NULL;
+	int error;
+
+	if (header) {
+		new_bh = ext2_xattr_cache_find(inode, header);
+		if (new_bh) {
+			/* We found an identical block in the cache. */
+			if (new_bh == old_bh) {
+				ea_bdebug(new_bh, "keeping this block");
+			} else {
+				/* The old block is released after updating
+				   the inode.  */
+				ea_bdebug(new_bh, "reusing block");
+
+				error = dquot_alloc_block(inode, 1);
+				if (error) {
+					unlock_buffer(new_bh);
+					goto cleanup;
+				}
+				le32_add_cpu(&HDR(new_bh)->h_refcount, 1);
+				ea_bdebug(new_bh, "refcount now=%d",
+					le32_to_cpu(HDR(new_bh)->h_refcount));
+			}
+			unlock_buffer(new_bh);
+		} else if (old_bh && header == HDR(old_bh)) {
+			/* Keep this block. No need to lock the block as we
+			   don't need to change the reference count. */
+			new_bh = old_bh;
+			get_bh(new_bh);
+			ext2_xattr_cache_insert(new_bh);
+		} else {
+			/* We need to allocate a new block */
+			ext2_fsblk_t goal = ext2_group_first_block_no(sb,
+						EXT2_I(inode)->i_block_group);
+			int block = ext2_new_block(inode, goal, &error);
+			if (error)
+				goto cleanup;
+			ea_idebug(inode, "creating block %d", block);
+
+			new_bh = sb_getblk(sb, block);
+			if (unlikely(!new_bh)) {
+				ext2_free_blocks(inode, block, 1);
+				mark_inode_dirty(inode);
+				error = -ENOMEM;
+				goto cleanup;
+			}
+			lock_buffer(new_bh);
+			memcpy(new_bh->b_data, header, new_bh->b_size);
+			set_buffer_uptodate(new_bh);
+			unlock_buffer(new_bh);
+			ext2_xattr_cache_insert(new_bh);
+			
+			ext2_xattr_update_super_block(sb);
+		}
+		mark_buffer_dirty(new_bh);
+		if (IS_SYNC(inode)) {
+			sync_dirty_buffer(new_bh);
+			error = -EIO;
+			if (buffer_req(new_bh) && !buffer_uptodate(new_bh))
+				goto cleanup;
+		}
+	}
+
+	/* Update the inode. */
+	EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
+	inode->i_ctime = CURRENT_TIME_SEC;
+	if (IS_SYNC(inode)) {
+		error = sync_inode_metadata(inode, 1);
+		/* In case sync failed due to ENOSPC the inode was actually
+		 * written (only some dirty data were not) so we just proceed
+		 * as if nothing happened and cleanup the unused block */
+		if (error && error != -ENOSPC) {
+			if (new_bh && new_bh != old_bh) {
+				dquot_free_block_nodirty(inode, 1);
+				mark_inode_dirty(inode);
+			}
+			goto cleanup;
+		}
+	} else
+		mark_inode_dirty(inode);
+
+	error = 0;
+	if (old_bh && old_bh != new_bh) {
+		struct mb_cache_entry *ce;
+
+		/*
+		 * If there was an old block and we are no longer using it,
+		 * release the old block.
+		 */
+		ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
+					old_bh->b_blocknr);
+		lock_buffer(old_bh);
+		if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
+			/* Free the old block. */
+			if (ce)
+				mb_cache_entry_free(ce);
+			ea_bdebug(old_bh, "freeing");
+			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
+			mark_inode_dirty(inode);
+			/* We let our caller release old_bh, so we
+			 * need to duplicate the buffer before. */
+			get_bh(old_bh);
+			bforget(old_bh);
+		} else {
+			/* Decrement the refcount only. */
+			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
+			if (ce)
+				mb_cache_entry_release(ce);
+			dquot_free_block_nodirty(inode, 1);
+			mark_inode_dirty(inode);
+			mark_buffer_dirty(old_bh);
+			ea_bdebug(old_bh, "refcount now=%d",
+				le32_to_cpu(HDR(old_bh)->h_refcount));
+		}
+		unlock_buffer(old_bh);
+	}
+
+cleanup:
+	brelse(new_bh);
+
+	return error;
+}
+
+/*
+ * ext2_xattr_delete_inode()
+ *
+ * Free extended attribute resources associated with this inode. This
+ * is called immediately before an inode is freed.
+ */
+void
+ext2_xattr_delete_inode(struct inode *inode)
+{
+	struct buffer_head *bh = NULL;
+	struct mb_cache_entry *ce;
+
+	down_write(&EXT2_I(inode)->xattr_sem);
+	if (!EXT2_I(inode)->i_file_acl)
+		goto cleanup;
+	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
+	if (!bh) {
+		ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
+			"inode %ld: block %d read error", inode->i_ino,
+			EXT2_I(inode)->i_file_acl);
+		goto cleanup;
+	}
+	ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
+	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
+	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
+		ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
+			"inode %ld: bad block %d", inode->i_ino,
+			EXT2_I(inode)->i_file_acl);
+		goto cleanup;
+	}
+	ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
+	lock_buffer(bh);
+	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
+		if (ce)
+			mb_cache_entry_free(ce);
+		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
+		get_bh(bh);
+		bforget(bh);
+		unlock_buffer(bh);
+	} else {
+		le32_add_cpu(&HDR(bh)->h_refcount, -1);
+		if (ce)
+			mb_cache_entry_release(ce);
+		ea_bdebug(bh, "refcount now=%d",
+			le32_to_cpu(HDR(bh)->h_refcount));
+		unlock_buffer(bh);
+		mark_buffer_dirty(bh);
+		if (IS_SYNC(inode))
+			sync_dirty_buffer(bh);
+		dquot_free_block_nodirty(inode, 1);
+	}
+	EXT2_I(inode)->i_file_acl = 0;
+
+cleanup:
+	brelse(bh);
+	up_write(&EXT2_I(inode)->xattr_sem);
+}
+
+/*
+ * ext2_xattr_put_super()
+ *
+ * This is called when a file system is unmounted.
+ */
+void
+ext2_xattr_put_super(struct super_block *sb)
+{
+	mb_cache_shrink(sb->s_bdev);
+}
+
+
+/*
+ * ext2_xattr_cache_insert()
+ *
+ * Create a new entry in the extended attribute cache, and insert
+ * it unless such an entry is already in the cache.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+static int
+ext2_xattr_cache_insert(struct buffer_head *bh)
+{
+	__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
+	struct mb_cache_entry *ce;
+	int error;
+
+	ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
+	if (!ce)
+		return -ENOMEM;
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+	if (error) {
+		mb_cache_entry_free(ce);
+		if (error == -EBUSY) {
+			ea_bdebug(bh, "already in cache (%d cache entries)",
+				atomic_read(&ext2_xattr_cache->c_entry_count));
+			error = 0;
+		}
+	} else {
+		ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
+			  atomic_read(&ext2_xattr_cache->c_entry_count));
+		mb_cache_entry_release(ce);
+	}
+	return error;
+}
+
+/*
+ * ext2_xattr_cmp()
+ *
+ * Compare two extended attribute blocks for equality.
+ *
+ * Returns 0 if the blocks are equal, 1 if they differ, and
+ * a negative error number on errors.
+ */
+static int
+ext2_xattr_cmp(struct ext2_xattr_header *header1,
+	       struct ext2_xattr_header *header2)
+{
+	struct ext2_xattr_entry *entry1, *entry2;
+
+	entry1 = ENTRY(header1+1);
+	entry2 = ENTRY(header2+1);
+	while (!IS_LAST_ENTRY(entry1)) {
+		if (IS_LAST_ENTRY(entry2))
+			return 1;
+		if (entry1->e_hash != entry2->e_hash ||
+		    entry1->e_name_index != entry2->e_name_index ||
+		    entry1->e_name_len != entry2->e_name_len ||
+		    entry1->e_value_size != entry2->e_value_size ||
+		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
+			return 1;
+		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
+			return -EIO;
+		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
+			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
+			   le32_to_cpu(entry1->e_value_size)))
+			return 1;
+
+		entry1 = EXT2_XATTR_NEXT(entry1);
+		entry2 = EXT2_XATTR_NEXT(entry2);
+	}
+	if (!IS_LAST_ENTRY(entry2))
+		return 1;
+	return 0;
+}
+
+/*
+ * ext2_xattr_cache_find()
+ *
+ * Find an identical extended attribute block.
+ *
+ * Returns a locked buffer head to the block found, or NULL if such
+ * a block was not found or an error occurred.
+ */
+static struct buffer_head *
+ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
+{
+	__u32 hash = le32_to_cpu(header->h_hash);
+	struct mb_cache_entry *ce;
+
+	if (!header->h_hash)
+		return NULL;  /* never share */
+	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
+again:
+	ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
+	while (ce) {
+		struct buffer_head *bh;
+
+		if (IS_ERR(ce)) {
+			if (PTR_ERR(ce) == -EAGAIN)
+				goto again;
+			break;
+		}
+
+		bh = sb_bread(inode->i_sb, ce->e_block);
+		if (!bh) {
+			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
+				"inode %ld: block %ld read error",
+				inode->i_ino, (unsigned long) ce->e_block);
+		} else {
+			lock_buffer(bh);
+			if (le32_to_cpu(HDR(bh)->h_refcount) >
+				   EXT2_XATTR_REFCOUNT_MAX) {
+				ea_idebug(inode, "block %ld refcount %d>%d",
+					  (unsigned long) ce->e_block,
+					  le32_to_cpu(HDR(bh)->h_refcount),
+					  EXT2_XATTR_REFCOUNT_MAX);
+			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
+				ea_bdebug(bh, "b_count=%d",
+					  atomic_read(&(bh->b_count)));
+				mb_cache_entry_release(ce);
+				return bh;
+			}
+			unlock_buffer(bh);
+			brelse(bh);
+		}
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+	}
+	return NULL;
+}
+
+#define NAME_HASH_SHIFT 5
+#define VALUE_HASH_SHIFT 16
+
+/*
+ * ext2_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header,
+					 struct ext2_xattr_entry *entry)
+{
+	__u32 hash = 0;
+	char *name = entry->e_name;
+	int n;
+
+	for (n=0; n < entry->e_name_len; n++) {
+		hash = (hash << NAME_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
+		       *name++;
+	}
+
+	if (entry->e_value_block == 0 && entry->e_value_size != 0) {
+		__le32 *value = (__le32 *)((char *)header +
+			le16_to_cpu(entry->e_value_offs));
+		for (n = (le32_to_cpu(entry->e_value_size) +
+		     EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) {
+			hash = (hash << VALUE_HASH_SHIFT) ^
+			       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+			       le32_to_cpu(*value++);
+		}
+	}
+	entry->e_hash = cpu_to_le32(hash);
+}
+
+#undef NAME_HASH_SHIFT
+#undef VALUE_HASH_SHIFT
+
+#define BLOCK_HASH_SHIFT 16
+
+/*
+ * ext2_xattr_rehash()
+ *
+ * Re-compute the extended attribute hash value after an entry has changed.
+ */
+static void ext2_xattr_rehash(struct ext2_xattr_header *header,
+			      struct ext2_xattr_entry *entry)
+{
+	struct ext2_xattr_entry *here;
+	__u32 hash = 0;
+	
+	ext2_xattr_hash_entry(header, entry);
+	here = ENTRY(header+1);
+	while (!IS_LAST_ENTRY(here)) {
+		if (!here->e_hash) {
+			/* Block is not shared if an entry's hash value == 0 */
+			hash = 0;
+			break;
+		}
+		hash = (hash << BLOCK_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
+		       le32_to_cpu(here->e_hash);
+		here = EXT2_XATTR_NEXT(here);
+	}
+	header->h_hash = cpu_to_le32(hash);
+}
+
+#undef BLOCK_HASH_SHIFT
+
+int __init
+init_ext2_xattr(void)
+{
+	ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
+	if (!ext2_xattr_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+exit_ext2_xattr(void)
+{
+	mb_cache_destroy(ext2_xattr_cache);
+}
diff --git a/kernel/fs/ext2/xattr.h b/kernel/fs/ext2/xattr.h
new file mode 100644
index 000000000..60edf2986
--- /dev/null
+++ b/kernel/fs/ext2/xattr.h
@@ -0,0 +1,125 @@
+/*
+  File: linux/ext2_xattr.h
+
+  On-disk format of extended attributes for the ext2 filesystem.
+
+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define EXT2_XATTR_MAGIC		0xEA020000
+
+/* Maximum number of references to one attribute block */
+#define EXT2_XATTR_REFCOUNT_MAX		1024
+
+/* Name indexes */
+#define EXT2_XATTR_INDEX_USER			1
+#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS	2
+#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT	3
+#define EXT2_XATTR_INDEX_TRUSTED		4
+#define	EXT2_XATTR_INDEX_LUSTRE			5
+#define EXT2_XATTR_INDEX_SECURITY	        6
+
+struct ext2_xattr_header {
+	__le32	h_magic;	/* magic number for identification */
+	__le32	h_refcount;	/* reference count */
+	__le32	h_blocks;	/* number of disk blocks used */
+	__le32	h_hash;		/* hash value of all attributes */
+	__u32	h_reserved[4];	/* zero right now */
+};
+
+struct ext2_xattr_entry {
+	__u8	e_name_len;	/* length of name */
+	__u8	e_name_index;	/* attribute name index */
+	__le16	e_value_offs;	/* offset in disk block of value */
+	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
+	__le32	e_value_size;	/* size of attribute value */
+	__le32	e_hash;		/* hash value of name and value */
+	char	e_name[0];	/* attribute name */
+};
+
+#define EXT2_XATTR_PAD_BITS		2
+#define EXT2_XATTR_PAD		(1<<EXT2_XATTR_PAD_BITS)
+#define EXT2_XATTR_ROUND		(EXT2_XATTR_PAD-1)
+#define EXT2_XATTR_LEN(name_len) \
+	(((name_len) + EXT2_XATTR_ROUND + \
+	sizeof(struct ext2_xattr_entry)) & ~EXT2_XATTR_ROUND)
+#define EXT2_XATTR_NEXT(entry) \
+	( (struct ext2_xattr_entry *)( \
+	  (char *)(entry) + EXT2_XATTR_LEN((entry)->e_name_len)) )
+#define EXT2_XATTR_SIZE(size) \
+	(((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
+
+# ifdef CONFIG_EXT2_FS_XATTR
+
+extern const struct xattr_handler ext2_xattr_user_handler;
+extern const struct xattr_handler ext2_xattr_trusted_handler;
+extern const struct xattr_handler ext2_xattr_security_handler;
+
+extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
+
+extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
+
+extern void ext2_xattr_delete_inode(struct inode *);
+extern void ext2_xattr_put_super(struct super_block *);
+
+extern int init_ext2_xattr(void);
+extern void exit_ext2_xattr(void);
+
+extern const struct xattr_handler *ext2_xattr_handlers[];
+
+# else  /* CONFIG_EXT2_FS_XATTR */
+
+static inline int
+ext2_xattr_get(struct inode *inode, int name_index,
+	       const char *name, void *buffer, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int
+ext2_xattr_set(struct inode *inode, int name_index, const char *name,
+	       const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void
+ext2_xattr_delete_inode(struct inode *inode)
+{
+}
+
+static inline void
+ext2_xattr_put_super(struct super_block *sb)
+{
+}
+
+static inline int
+init_ext2_xattr(void)
+{
+	return 0;
+}
+
+static inline void
+exit_ext2_xattr(void)
+{
+}
+
+#define ext2_xattr_handlers NULL
+
+# endif  /* CONFIG_EXT2_FS_XATTR */
+
+#ifdef CONFIG_EXT2_FS_SECURITY
+extern int ext2_init_security(struct inode *inode, struct inode *dir,
+			      const struct qstr *qstr);
+#else
+static inline int ext2_init_security(struct inode *inode, struct inode *dir,
+				     const struct qstr *qstr)
+{
+	return 0;
+}
+#endif
diff --git a/kernel/fs/ext2/xattr_security.c b/kernel/fs/ext2/xattr_security.c
new file mode 100644
index 000000000..702fc6840
--- /dev/null
+++ b/kernel/fs/ext2/xattr_security.c
@@ -0,0 +1,74 @@
+/*
+ * linux/fs/ext2/xattr_security.c
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include "ext2.h"
+#include <linux/security.h>
+#include "xattr.h"
+
+static size_t
+ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
+			 const char *name, size_t name_len, int type)
+{
+	const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext2_xattr_security_get(struct dentry *dentry, const char *name,
+		       void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+			      buffer, size);
+}
+
+static int
+ext2_xattr_security_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+			      value, size, flags);
+}
+
+static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+			   void *fs_info)
+{
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
+				     xattr->name, xattr->value,
+				     xattr->value_len, 0);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+int
+ext2_init_security(struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &ext2_initxattrs, NULL);
+}
+
+const struct xattr_handler ext2_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= ext2_xattr_security_list,
+	.get	= ext2_xattr_security_get,
+	.set	= ext2_xattr_security_set,
+};
diff --git a/kernel/fs/ext2/xattr_trusted.c b/kernel/fs/ext2/xattr_trusted.c
new file mode 100644
index 000000000..42b6e9874
--- /dev/null
+++ b/kernel/fs/ext2/xattr_trusted.c
@@ -0,0 +1,54 @@
+/*
+ * linux/fs/ext2/xattr_trusted.c
+ * Handler for trusted extended attributes.
+ *
+ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include "ext2.h"
+#include "xattr.h"
+
+static size_t
+ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t name_len, int type)
+{
+	const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+			      buffer, size);
+}
+
+static int
+ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+			      value, size, flags);
+}
+
+const struct xattr_handler ext2_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ext2_xattr_trusted_list,
+	.get	= ext2_xattr_trusted_get,
+	.set	= ext2_xattr_trusted_set,
+};
diff --git a/kernel/fs/ext2/xattr_user.c b/kernel/fs/ext2/xattr_user.c
new file mode 100644
index 000000000..ecdc46051
--- /dev/null
+++ b/kernel/fs/ext2/xattr_user.c
@@ -0,0 +1,61 @@
+/*
+ * linux/fs/ext2/xattr_user.c
+ * Handler for extended user attributes.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include "ext2.h"
+#include "xattr.h"
+
+static size_t
+ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext2_xattr_user_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+			      name, buffer, size);
+}
+
+static int
+ext2_xattr_user_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+
+	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+			      name, value, size, flags);
+}
+
+const struct xattr_handler ext2_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ext2_xattr_user_list,
+	.get	= ext2_xattr_user_get,
+	.set	= ext2_xattr_user_set,
+};
diff --git a/kernel/fs/ext3/Kconfig b/kernel/fs/ext3/Kconfig
new file mode 100644
index 000000000..e8c6ba0e4
--- /dev/null
+++ b/kernel/fs/ext3/Kconfig
@@ -0,0 +1,89 @@
+config EXT3_FS
+	tristate "Ext3 journalling file system support"
+	select JBD
+	help
+	  This is the journalling version of the Second extended file system
+	  (often called ext3), the de facto standard Linux file system
+	  (method to organize files on a storage device) for hard disks.
+
+	  The journalling code included in this driver means you do not have
+	  to run e2fsck (file system checker) on your file systems after a
+	  crash.  The journal keeps track of any changes that were being made
+	  at the time the system crashed, and can ensure that your file system
+	  is consistent without the need for a lengthy check.
+
+	  Other than adding the journal to the file system, the on-disk format
+	  of ext3 is identical to ext2.  It is possible to freely switch
+	  between using the ext3 driver and the ext2 driver, as long as the
+	  file system has been cleanly unmounted, or e2fsck is run on the file
+	  system.
+
+	  To add a journal on an existing ext2 file system or change the
+	  behavior of ext3 file systems, you can use the tune2fs utility ("man
+	  tune2fs").  To modify attributes of files and directories on ext3
+	  file systems, use chattr ("man chattr").  You need to be using
+	  e2fsprogs version 1.20 or later in order to create ext3 journals
+	  (available at <http://sourceforge.net/projects/e2fsprogs/>).
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ext3.
+
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3"
+	depends on EXT3_FS
+	default y
+	help
+	  The journal mode options for ext3 have different tradeoffs
+	  between when data is guaranteed to be on disk and
+	  performance.	The use of "data=writeback" can cause
+	  unwritten data to appear in files after an system crash or
+	  power failure, which can be a security issue.	 However,
+	  "data=ordered" mode can also result in major performance
+	  problems, including seconds-long delays before an fsync()
+	  call returns.	 For details, see:
+
+	  http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
+
+	  If you have been historically happy with ext3's performance,
+	  data=ordered mode will be a safe choice and you should
+	  answer 'y' here.  If you understand the reliability and data
+	  privacy issues of data=writeback and are willing to make
+	  that trade off, answer 'n'.
+
+config EXT3_FS_XATTR
+	bool "Ext3 extended attributes"
+	depends on EXT3_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on ext3.
+
+config EXT3_FS_POSIX_ACL
+	bool "Ext3 POSIX Access Control Lists"
+	depends on EXT3_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT3_FS_SECURITY
+	bool "Ext3 Security Labels"
+	depends on EXT3_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext3 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/kernel/fs/ext3/Makefile b/kernel/fs/ext3/Makefile
new file mode 100644
index 000000000..e77766a8b
--- /dev/null
+++ b/kernel/fs/ext3/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the linux ext3-filesystem routines.
+#
+
+obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ext3-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+	   ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o
+
+ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
+ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+ext3-$(CONFIG_EXT3_FS_SECURITY)	 += xattr_security.o
diff --git a/kernel/fs/ext3/acl.c b/kernel/fs/ext3/acl.c
new file mode 100644
index 000000000..8bbaf5bcf
--- /dev/null
+++ b/kernel/fs/ext3/acl.c
@@ -0,0 +1,281 @@
+/*
+ * linux/fs/ext3/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ */
+
+#include "ext3.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *
+ext3_acl_from_disk(const void *value, size_t size)
+{
+	const char *end = (char *)value + size;
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(ext3_acl_header))
+		 return ERR_PTR(-EINVAL);
+	if (((ext3_acl_header *)value)->a_version !=
+	    cpu_to_le32(EXT3_ACL_VERSION))
+		return ERR_PTR(-EINVAL);
+	value = (char *)value + sizeof(ext3_acl_header);
+	count = ext3_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n=0; n < count; n++) {
+		ext3_acl_entry *entry =
+			(ext3_acl_entry *)value;
+		if ((char *)value + sizeof(ext3_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		switch(acl->a_entries[n].e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				value = (char *)value +
+					sizeof(ext3_acl_entry_short);
+				break;
+
+			case ACL_USER:
+				value = (char *)value + sizeof(ext3_acl_entry);
+				if ((char *)value > end)
+					goto fail;
+				acl->a_entries[n].e_uid =
+					make_kuid(&init_user_ns,
+						  le32_to_cpu(entry->e_id));
+				break;
+			case ACL_GROUP:
+				value = (char *)value + sizeof(ext3_acl_entry);
+				if ((char *)value > end)
+					goto fail;
+				acl->a_entries[n].e_gid =
+					make_kgid(&init_user_ns,
+						  le32_to_cpu(entry->e_id));
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *
+ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+	ext3_acl_header *ext_acl;
+	char *e;
+	size_t n;
+
+	*size = ext3_acl_size(acl->a_count);
+	ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
+			sizeof(ext3_acl_entry), GFP_NOFS);
+	if (!ext_acl)
+		return ERR_PTR(-ENOMEM);
+	ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
+	e = (char *)ext_acl + sizeof(ext3_acl_header);
+	for (n=0; n < acl->a_count; n++) {
+		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+		ext3_acl_entry *entry = (ext3_acl_entry *)e;
+		entry->e_tag  = cpu_to_le16(acl_e->e_tag);
+		entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch(acl_e->e_tag) {
+			case ACL_USER:
+				entry->e_id = cpu_to_le32(
+					from_kuid(&init_user_ns, acl_e->e_uid));
+				e += sizeof(ext3_acl_entry);
+				break;
+			case ACL_GROUP:
+				entry->e_id = cpu_to_le32(
+					from_kgid(&init_user_ns, acl_e->e_gid));
+				e += sizeof(ext3_acl_entry);
+				break;
+
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				e += sizeof(ext3_acl_entry_short);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	return (char *)ext_acl;
+
+fail:
+	kfree(ext_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Inode operation get_posix_acl().
+ *
+ * inode->i_mutex: don't care
+ */
+struct posix_acl *
+ext3_get_acl(struct inode *inode, int type)
+{
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+
+	retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ext3_xattr_get(inode, name_index, "", value, retval);
+	}
+	if (retval > 0)
+		acl = ext3_acl_from_disk(value, retval);
+	else if (retval == -ENODATA || retval == -ENOSYS)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * inode->i_mutex: down unless called from ext3_new_inode
+ */
+static int
+__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
+	     struct posix_acl *acl)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int error;
+
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
+			if (acl) {
+				error = posix_acl_equiv_mode(acl, &inode->i_mode);
+				if (error < 0)
+					return error;
+				else {
+					inode->i_ctime = CURRENT_TIME_SEC;
+					ext3_mark_inode_dirty(handle, inode);
+					if (error == 0)
+						acl = NULL;
+				}
+			}
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
+			if (!S_ISDIR(inode->i_mode))
+				return acl ? -EACCES : 0;
+			break;
+
+		default:
+			return -EINVAL;
+	}
+	if (acl) {
+		value = ext3_acl_to_disk(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	error = ext3_xattr_set_handle(handle, inode, name_index, "",
+				      value, size, 0);
+
+	kfree(value);
+
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
+	return error;
+}
+
+int
+ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	handle_t *handle;
+	int error, retries = 0;
+
+retry:
+	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	error = __ext3_set_acl(handle, inode, type, acl);
+	ext3_journal_stop(handle);
+	if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+	return error;
+}
+
+/*
+ * Initialize the ACLs of a new inode. Called from ext3_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
+ */
+int
+ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *default_acl, *acl;
+	int error;
+
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (error)
+		return error;
+
+	if (default_acl) {
+		error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+				       default_acl);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
+					       acl);
+		posix_acl_release(acl);
+	}
+	return error;
+}
diff --git a/kernel/fs/ext3/acl.h b/kernel/fs/ext3/acl.h
new file mode 100644
index 000000000..ea1c69eda
--- /dev/null
+++ b/kernel/fs/ext3/acl.h
@@ -0,0 +1,72 @@
+/*
+  File: fs/ext3/acl.h
+
+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define EXT3_ACL_VERSION	0x0001
+
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} ext3_acl_entry;
+
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+} ext3_acl_entry_short;
+
+typedef struct {
+	__le32		a_version;
+} ext3_acl_header;
+
+static inline size_t ext3_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(ext3_acl_header) +
+		       count * sizeof(ext3_acl_entry_short);
+	} else {
+		return sizeof(ext3_acl_header) +
+		       4 * sizeof(ext3_acl_entry_short) +
+		       (count - 4) * sizeof(ext3_acl_entry);
+	}
+}
+
+static inline int ext3_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(ext3_acl_header);
+	s = size - 4 * sizeof(ext3_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(ext3_acl_entry_short))
+			return -1;
+		return size / sizeof(ext3_acl_entry_short);
+	} else {
+		if (s % sizeof(ext3_acl_entry))
+			return -1;
+		return s / sizeof(ext3_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+
+/* acl.c */
+extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
+extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
+
+#else  /* CONFIG_EXT3_FS_POSIX_ACL */
+#include <linux/sched.h>
+#define ext3_get_acl NULL
+#define ext3_set_acl NULL
+
+static inline int
+ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif  /* CONFIG_EXT3_FS_POSIX_ACL */
+
diff --git a/kernel/fs/ext3/balloc.c b/kernel/fs/ext3/balloc.c
new file mode 100644
index 000000000..158b5d4ce
--- /dev/null
+++ b/kernel/fs/ext3/balloc.c
@@ -0,0 +1,2158 @@
+/*
+ *  linux/fs/ext3/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/quotaops.h>
+#include <linux/blkdev.h>
+#include "ext3.h"
+
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * The free blocks are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.  The descriptors are loaded in memory
+ * when a file system is mounted (see ext3_fill_super).
+ */
+
+
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
+/*
+ * Calculate the block group number and offset, given a block number
+ */
+static void ext3_get_group_no_and_offset(struct super_block *sb,
+	ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
+{
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+	if (offsetp)
+		*offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
+	if (blockgrpp)
+		*blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
+}
+
+/**
+ * ext3_get_group_desc() -- load group descriptor from disk
+ * @sb:			super block
+ * @block_group:	given block group
+ * @bh:			pointer to the buffer head to store the block
+ *			group descriptor
+ */
+struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+					     unsigned int block_group,
+					     struct buffer_head ** bh)
+{
+	unsigned long group_desc;
+	unsigned long offset;
+	struct ext3_group_desc * desc;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	if (block_group >= sbi->s_groups_count) {
+		ext3_error (sb, "ext3_get_group_desc",
+			    "block_group >= groups_count - "
+			    "block_group = %d, groups_count = %lu",
+			    block_group, sbi->s_groups_count);
+
+		return NULL;
+	}
+	smp_rmb();
+
+	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
+	offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
+	if (!sbi->s_group_desc[group_desc]) {
+		ext3_error (sb, "ext3_get_group_desc",
+			    "Group descriptor not loaded - "
+			    "block_group = %d, group_desc = %lu, desc = %lu",
+			     block_group, group_desc, offset);
+		return NULL;
+	}
+
+	desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
+	if (bh)
+		*bh = sbi->s_group_desc[group_desc];
+	return desc + offset;
+}
+
+static int ext3_valid_block_bitmap(struct super_block *sb,
+					struct ext3_group_desc *desc,
+					unsigned int block_group,
+					struct buffer_head *bh)
+{
+	ext3_grpblk_t offset;
+	ext3_grpblk_t next_zero_bit;
+	ext3_fsblk_t bitmap_blk;
+	ext3_fsblk_t group_first_block;
+
+	group_first_block = ext3_group_first_block_no(sb, block_group);
+
+	/* check whether block bitmap block number is set */
+	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
+	offset = bitmap_blk - group_first_block;
+	if (!ext3_test_bit(offset, bh->b_data))
+		/* bad block bitmap */
+		goto err_out;
+
+	/* check whether the inode bitmap block number is set */
+	bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
+	offset = bitmap_blk - group_first_block;
+	if (!ext3_test_bit(offset, bh->b_data))
+		/* bad block bitmap */
+		goto err_out;
+
+	/* check whether the inode table block number is set */
+	bitmap_blk = le32_to_cpu(desc->bg_inode_table);
+	offset = bitmap_blk - group_first_block;
+	next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
+				offset + EXT3_SB(sb)->s_itb_per_group,
+				offset);
+	if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
+		/* good bitmap for inode tables */
+		return 1;
+
+err_out:
+	ext3_error(sb, __func__,
+			"Invalid block bitmap - "
+			"block_group = %d, block = %lu",
+			block_group, bitmap_blk);
+	return 0;
+}
+
+/**
+ * read_block_bitmap()
+ * @sb:			super block
+ * @block_group:	given block group
+ *
+ * Read the bitmap for a given block_group,and validate the
+ * bits for block/inode/inode tables are set in the bitmaps
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+static struct buffer_head *
+read_block_bitmap(struct super_block *sb, unsigned int block_group)
+{
+	struct ext3_group_desc * desc;
+	struct buffer_head * bh = NULL;
+	ext3_fsblk_t bitmap_blk;
+
+	desc = ext3_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return NULL;
+	trace_ext3_read_block_bitmap(sb, block_group);
+	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
+	bh = sb_getblk(sb, bitmap_blk);
+	if (unlikely(!bh)) {
+		ext3_error(sb, __func__,
+			    "Cannot read block bitmap - "
+			    "block_group = %d, block_bitmap = %u",
+			    block_group, le32_to_cpu(desc->bg_block_bitmap));
+		return NULL;
+	}
+	if (likely(bh_uptodate_or_lock(bh)))
+		return bh;
+
+	if (bh_submit_read(bh) < 0) {
+		brelse(bh);
+		ext3_error(sb, __func__,
+			    "Cannot read block bitmap - "
+			    "block_group = %d, block_bitmap = %u",
+			    block_group, le32_to_cpu(desc->bg_block_bitmap));
+		return NULL;
+	}
+	ext3_valid_block_bitmap(sb, desc, block_group, bh);
+	/*
+	 * file system mounted not to panic on error, continue with corrupt
+	 * bitmap
+	 */
+	return bh;
+}
+/*
+ * The reservation window structure operations
+ * --------------------------------------------
+ * Operations include:
+ * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
+ *
+ * We use a red-black tree to represent per-filesystem reservation
+ * windows.
+ *
+ */
+
+/**
+ * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
+ * @rb_root:		root of per-filesystem reservation rb tree
+ * @verbose:		verbose mode
+ * @fn:			function which wishes to dump the reservation map
+ *
+ * If verbose is turned on, it will print the whole block reservation
+ * windows(start, end).	Otherwise, it will only print out the "bad" windows,
+ * those windows that overlap with their immediate neighbors.
+ */
+#if 1
+static void __rsv_window_dump(struct rb_root *root, int verbose,
+			      const char *fn)
+{
+	struct rb_node *n;
+	struct ext3_reserve_window_node *rsv, *prev;
+	int bad;
+
+restart:
+	n = rb_first(root);
+	bad = 0;
+	prev = NULL;
+
+	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
+	while (n) {
+		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
+		if (verbose)
+			printk("reservation window 0x%p "
+			       "start:  %lu, end:  %lu\n",
+			       rsv, rsv->rsv_start, rsv->rsv_end);
+		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
+			printk("Bad reservation %p (start >= end)\n",
+			       rsv);
+			bad = 1;
+		}
+		if (prev && prev->rsv_end >= rsv->rsv_start) {
+			printk("Bad reservation %p (prev->end >= start)\n",
+			       rsv);
+			bad = 1;
+		}
+		if (bad) {
+			if (!verbose) {
+				printk("Restarting reservation walk in verbose mode\n");
+				verbose = 1;
+				goto restart;
+			}
+		}
+		n = rb_next(n);
+		prev = rsv;
+	}
+	printk("Window map complete.\n");
+	BUG_ON(bad);
+}
+#define rsv_window_dump(root, verbose) \
+	__rsv_window_dump((root), (verbose), __func__)
+#else
+#define rsv_window_dump(root, verbose) do {} while (0)
+#endif
+
+/**
+ * goal_in_my_reservation()
+ * @rsv:		inode's reservation window
+ * @grp_goal:		given goal block relative to the allocation block group
+ * @group:		the current allocation block group
+ * @sb:			filesystem super block
+ *
+ * Test if the given goal block (group relative) is within the file's
+ * own block reservation window range.
+ *
+ * If the reservation window is outside the goal allocation group, return 0;
+ * grp_goal (given goal block) could be -1, which means no specific
+ * goal block. In this case, always return 1.
+ * If the goal block is within the reservation window, return 1;
+ * otherwise, return 0;
+ */
+static int
+goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
+			unsigned int group, struct super_block * sb)
+{
+	ext3_fsblk_t group_first_block, group_last_block;
+
+	group_first_block = ext3_group_first_block_no(sb, group);
+	group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
+
+	if ((rsv->_rsv_start > group_last_block) ||
+	    (rsv->_rsv_end < group_first_block))
+		return 0;
+	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+		|| (grp_goal + group_first_block > rsv->_rsv_end)))
+		return 0;
+	return 1;
+}
+
+/**
+ * search_reserve_window()
+ * @rb_root:		root of reservation tree
+ * @goal:		target allocation block
+ *
+ * Find the reserved window which includes the goal, or the previous one
+ * if the goal is not in any window.
+ * Returns NULL if there are no windows or if all windows start after the goal.
+ */
+static struct ext3_reserve_window_node *
+search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
+{
+	struct rb_node *n = root->rb_node;
+	struct ext3_reserve_window_node *rsv;
+
+	if (!n)
+		return NULL;
+
+	do {
+		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
+
+		if (goal < rsv->rsv_start)
+			n = n->rb_left;
+		else if (goal > rsv->rsv_end)
+			n = n->rb_right;
+		else
+			return rsv;
+	} while (n);
+	/*
+	 * We've fallen off the end of the tree: the goal wasn't inside
+	 * any particular node.  OK, the previous node must be to one
+	 * side of the interval containing the goal.  If it's the RHS,
+	 * we need to back up one.
+	 */
+	if (rsv->rsv_start > goal) {
+		n = rb_prev(&rsv->rsv_node);
+		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
+	}
+	return rsv;
+}
+
+/**
+ * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
+ * @sb:			super block
+ * @rsv:		reservation window to add
+ *
+ * Must be called with rsv_lock hold.
+ */
+void ext3_rsv_window_add(struct super_block *sb,
+		    struct ext3_reserve_window_node *rsv)
+{
+	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
+	struct rb_node *node = &rsv->rsv_node;
+	ext3_fsblk_t start = rsv->rsv_start;
+
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct ext3_reserve_window_node *this;
+
+	trace_ext3_rsv_window_add(sb, rsv);
+	while (*p)
+	{
+		parent = *p;
+		this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
+
+		if (start < this->rsv_start)
+			p = &(*p)->rb_left;
+		else if (start > this->rsv_end)
+			p = &(*p)->rb_right;
+		else {
+			rsv_window_dump(root, 1);
+			BUG();
+		}
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+}
+
+/**
+ * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
+ * @sb:			super block
+ * @rsv:		reservation window to remove
+ *
+ * Mark the block reservation window as not allocated, and unlink it
+ * from the filesystem reservation window rb tree. Must be called with
+ * rsv_lock hold.
+ */
+static void rsv_window_remove(struct super_block *sb,
+			      struct ext3_reserve_window_node *rsv)
+{
+	rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+	rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+	rsv->rsv_alloc_hit = 0;
+	rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
+}
+
+/*
+ * rsv_is_empty() -- Check if the reservation window is allocated.
+ * @rsv:		given reservation window to check
+ *
+ * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
+ */
+static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
+{
+	/* a valid reservation end block could not be 0 */
+	return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+}
+
+/**
+ * ext3_init_block_alloc_info()
+ * @inode:		file inode structure
+ *
+ * Allocate and initialize the	reservation window structure, and
+ * link the window to the ext3 inode structure at last
+ *
+ * The reservation window structure is only dynamically allocated
+ * and linked to ext3 inode the first time the open file
+ * needs a new block. So, before every ext3_new_block(s) call, for
+ * regular files, we should check whether the reservation window
+ * structure exists or not. In the latter case, this function is called.
+ * Fail to do so will result in block reservation being turned off for that
+ * open file.
+ *
+ * This function is called from ext3_get_blocks_handle(), also called
+ * when setting the reservation window size through ioctl before the file
+ * is open for write (needs block allocation).
+ *
+ * Needs truncate_mutex protection prior to call this function.
+ */
+void ext3_init_block_alloc_info(struct inode *inode)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_block_alloc_info *block_i;
+	struct super_block *sb = inode->i_sb;
+
+	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
+	if (block_i) {
+		struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;
+
+		rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+		rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+
+		/*
+		 * if filesystem is mounted with NORESERVATION, the goal
+		 * reservation window size is set to zero to indicate
+		 * block reservation is off
+		 */
+		if (!test_opt(sb, RESERVATION))
+			rsv->rsv_goal_size = 0;
+		else
+			rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
+		rsv->rsv_alloc_hit = 0;
+		block_i->last_alloc_logical_block = 0;
+		block_i->last_alloc_physical_block = 0;
+	}
+	ei->i_block_alloc_info = block_i;
+}
+
+/**
+ * ext3_discard_reservation()
+ * @inode:		inode
+ *
+ * Discard(free) block reservation window on last file close, or truncate
+ * or at last iput().
+ *
+ * It is being called in three cases:
+ *	ext3_release_file(): last writer close the file
+ *	ext3_clear_inode(): last iput(), when nobody link to this file.
+ *	ext3_truncate(): when the block indirect map is about to change.
+ *
+ */
+void ext3_discard_reservation(struct inode *inode)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
+	struct ext3_reserve_window_node *rsv;
+	spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
+
+	if (!block_i)
+		return;
+
+	rsv = &block_i->rsv_window_node;
+	if (!rsv_is_empty(&rsv->rsv_window)) {
+		spin_lock(rsv_lock);
+		if (!rsv_is_empty(&rsv->rsv_window)) {
+			trace_ext3_discard_reservation(inode, rsv);
+			rsv_window_remove(inode->i_sb, rsv);
+		}
+		spin_unlock(rsv_lock);
+	}
+}
+
+/**
+ * ext3_free_blocks_sb() -- Free given blocks and update quota
+ * @handle:			handle to this transaction
+ * @sb:				super block
+ * @block:			start physical block to free
+ * @count:			number of blocks to free
+ * @pdquot_freed_blocks:	pointer to quota
+ */
+void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
+			 ext3_fsblk_t block, unsigned long count,
+			 unsigned long *pdquot_freed_blocks)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gd_bh;
+	unsigned long block_group;
+	ext3_grpblk_t bit;
+	unsigned long i;
+	unsigned long overflow;
+	struct ext3_group_desc * desc;
+	struct ext3_super_block * es;
+	struct ext3_sb_info *sbi;
+	int err = 0, ret;
+	ext3_grpblk_t group_freed;
+
+	*pdquot_freed_blocks = 0;
+	sbi = EXT3_SB(sb);
+	es = sbi->s_es;
+	if (block < le32_to_cpu(es->s_first_data_block) ||
+	    block + count < block ||
+	    block + count > le32_to_cpu(es->s_blocks_count)) {
+		ext3_error (sb, "ext3_free_blocks",
+			    "Freeing blocks not in datazone - "
+			    "block = "E3FSBLK", count = %lu", block, count);
+		goto error_return;
+	}
+
+	ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
+
+do_more:
+	overflow = 0;
+	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
+		      EXT3_BLOCKS_PER_GROUP(sb);
+	bit = (block - le32_to_cpu(es->s_first_data_block)) %
+		      EXT3_BLOCKS_PER_GROUP(sb);
+	/*
+	 * Check to see if we are freeing blocks across a group
+	 * boundary.
+	 */
+	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
+		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
+		count -= overflow;
+	}
+	brelse(bitmap_bh);
+	bitmap_bh = read_block_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+	desc = ext3_get_group_desc (sb, block_group, &gd_bh);
+	if (!desc)
+		goto error_return;
+
+	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
+	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
+	    in_range (block, le32_to_cpu(desc->bg_inode_table),
+		      sbi->s_itb_per_group) ||
+	    in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
+		      sbi->s_itb_per_group)) {
+		ext3_error (sb, "ext3_free_blocks",
+			    "Freeing blocks in system zones - "
+			    "Block = "E3FSBLK", count = %lu",
+			    block, count);
+		goto error_return;
+	}
+
+	/*
+	 * We are about to start releasing blocks in the bitmap,
+	 * so we need undo access.
+	 */
+	/* @@@ check errors */
+	BUFFER_TRACE(bitmap_bh, "getting undo access");
+	err = ext3_journal_get_undo_access(handle, bitmap_bh);
+	if (err)
+		goto error_return;
+
+	/*
+	 * We are about to modify some metadata.  Call the journal APIs
+	 * to unshare ->b_data if a currently-committing transaction is
+	 * using it
+	 */
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, gd_bh);
+	if (err)
+		goto error_return;
+
+	jbd_lock_bh_state(bitmap_bh);
+
+	for (i = 0, group_freed = 0; i < count; i++) {
+		/*
+		 * An HJ special.  This is expensive...
+		 */
+#ifdef CONFIG_JBD_DEBUG
+		jbd_unlock_bh_state(bitmap_bh);
+		{
+			struct buffer_head *debug_bh;
+			debug_bh = sb_find_get_block(sb, block + i);
+			if (debug_bh) {
+				BUFFER_TRACE(debug_bh, "Deleted!");
+				if (!bh2jh(bitmap_bh)->b_committed_data)
+					BUFFER_TRACE(debug_bh,
+						"No committed data in bitmap");
+				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
+				__brelse(debug_bh);
+			}
+		}
+		jbd_lock_bh_state(bitmap_bh);
+#endif
+		if (need_resched()) {
+			jbd_unlock_bh_state(bitmap_bh);
+			cond_resched();
+			jbd_lock_bh_state(bitmap_bh);
+		}
+		/* @@@ This prevents newly-allocated data from being
+		 * freed and then reallocated within the same
+		 * transaction.
+		 *
+		 * Ideally we would want to allow that to happen, but to
+		 * do so requires making journal_forget() capable of
+		 * revoking the queued write of a data block, which
+		 * implies blocking on the journal lock.  *forget()
+		 * cannot block due to truncate races.
+		 *
+		 * Eventually we can fix this by making journal_forget()
+		 * return a status indicating whether or not it was able
+		 * to revoke the buffer.  On successful revoke, it is
+		 * safe not to set the allocation bit in the committed
+		 * bitmap, because we know that there is no outstanding
+		 * activity on the buffer any more and so it is safe to
+		 * reallocate it.
+		 */
+		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
+		J_ASSERT_BH(bitmap_bh,
+				bh2jh(bitmap_bh)->b_committed_data != NULL);
+		ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
+				bh2jh(bitmap_bh)->b_committed_data);
+
+		/*
+		 * We clear the bit in the bitmap after setting the committed
+		 * data bit, because this is the reverse order to that which
+		 * the allocator uses.
+		 */
+		BUFFER_TRACE(bitmap_bh, "clear bit");
+		if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+						bit + i, bitmap_bh->b_data)) {
+			jbd_unlock_bh_state(bitmap_bh);
+			ext3_error(sb, __func__,
+				"bit already cleared for block "E3FSBLK,
+				 block + i);
+			jbd_lock_bh_state(bitmap_bh);
+			BUFFER_TRACE(bitmap_bh, "bit already cleared");
+		} else {
+			group_freed++;
+		}
+	}
+	jbd_unlock_bh_state(bitmap_bh);
+
+	spin_lock(sb_bgl_lock(sbi, block_group));
+	le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	percpu_counter_add(&sbi->s_freeblocks_counter, count);
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+	ret = ext3_journal_dirty_metadata(handle, gd_bh);
+	if (!err) err = ret;
+	*pdquot_freed_blocks += group_freed;
+
+	if (overflow && !err) {
+		block += count;
+		count = overflow;
+		goto do_more;
+	}
+
+error_return:
+	brelse(bitmap_bh);
+	ext3_std_error(sb, err);
+	return;
+}
+
+/**
+ * ext3_free_blocks() -- Free given blocks and update quota
+ * @handle:		handle for this transaction
+ * @inode:		inode
+ * @block:		start physical block to free
+ * @count:		number of blocks to count
+ */
+void ext3_free_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t block, unsigned long count)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned long dquot_freed_blocks;
+
+	trace_ext3_free_blocks(inode, block, count);
+	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+	if (dquot_freed_blocks)
+		dquot_free_block(inode, dquot_freed_blocks);
+	return;
+}
+
+/**
+ * ext3_test_allocatable()
+ * @nr:			given allocation block group
+ * @bh:			bufferhead contains the bitmap of the given block group
+ *
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy.  This
+ * prevents deletes from freeing up the page for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes.
+ */
+static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
+{
+	int ret;
+	struct journal_head *jh = bh2jh(bh);
+
+	if (ext3_test_bit(nr, bh->b_data))
+		return 0;
+
+	jbd_lock_bh_state(bh);
+	if (!jh->b_committed_data)
+		ret = 1;
+	else
+		ret = !ext3_test_bit(nr, jh->b_committed_data);
+	jbd_unlock_bh_state(bh);
+	return ret;
+}
+
+/**
+ * bitmap_search_next_usable_block()
+ * @start:		the starting block (group relative) of the search
+ * @bh:			bufferhead contains the block group bitmap
+ * @maxblocks:		the ending block (group relative) of the reservation
+ *
+ * The bitmap search --- search forward alternately through the actual
+ * bitmap on disk and the last-committed copy in journal, until we find a
+ * bit free in both bitmaps.
+ */
+static ext3_grpblk_t
+bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+					ext3_grpblk_t maxblocks)
+{
+	ext3_grpblk_t next;
+	struct journal_head *jh = bh2jh(bh);
+
+	while (start < maxblocks) {
+		next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
+		if (next >= maxblocks)
+			return -1;
+		if (ext3_test_allocatable(next, bh))
+			return next;
+		jbd_lock_bh_state(bh);
+		if (jh->b_committed_data)
+			start = ext3_find_next_zero_bit(jh->b_committed_data,
+							maxblocks, next);
+		jbd_unlock_bh_state(bh);
+	}
+	return -1;
+}
+
+/**
+ * find_next_usable_block()
+ * @start:		the starting block (group relative) to find next
+ *			allocatable block in bitmap.
+ * @bh:			bufferhead contains the block group bitmap
+ * @maxblocks:		the ending block (group relative) for the search
+ *
+ * Find an allocatable block in a bitmap.  We honor both the bitmap and
+ * its last-committed copy (if that exists), and perform the "most
+ * appropriate allocation" algorithm of looking for a free block near
+ * the initial goal; then for a free byte somewhere in the bitmap; then
+ * for any free bit in the bitmap.
+ */
+static ext3_grpblk_t
+find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+			ext3_grpblk_t maxblocks)
+{
+	ext3_grpblk_t here, next;
+	char *p, *r;
+
+	if (start > 0) {
+		/*
+		 * The goal was occupied; search forward for a free
+		 * block within the next XX blocks.
+		 *
+		 * end_goal is more or less random, but it has to be
+		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
+		 * next 64-bit boundary is simple..
+		 */
+		ext3_grpblk_t end_goal = (start + 63) & ~63;
+		if (end_goal > maxblocks)
+			end_goal = maxblocks;
+		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
+		if (here < end_goal && ext3_test_allocatable(here, bh))
+			return here;
+		ext3_debug("Bit not found near goal\n");
+	}
+
+	here = start;
+	if (here < 0)
+		here = 0;
+
+	p = bh->b_data + (here >> 3);
+	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
+	next = (r - bh->b_data) << 3;
+
+	if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
+		return next;
+
+	/*
+	 * The bitmap search --- search forward alternately through the actual
+	 * bitmap and the last-committed copy until we find a bit free in
+	 * both
+	 */
+	here = bitmap_search_next_usable_block(here, bh, maxblocks);
+	return here;
+}
+
+/**
+ * claim_block()
+ * @lock:		the spin lock for this block group
+ * @block:		the free block (group relative) to allocate
+ * @bh:			the buffer_head contains the block group bitmap
+ *
+ * We think we can allocate this block in this bitmap.  Try to set the bit.
+ * If that succeeds then check that nobody has allocated and then freed the
+ * block since we saw that is was not marked in b_committed_data.  If it _was_
+ * allocated and freed then clear the bit in the bitmap again and return
+ * zero (failure).
+ */
+static inline int
+claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
+{
+	struct journal_head *jh = bh2jh(bh);
+	int ret;
+
+	if (ext3_set_bit_atomic(lock, block, bh->b_data))
+		return 0;
+	jbd_lock_bh_state(bh);
+	if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
+		ext3_clear_bit_atomic(lock, block, bh->b_data);
+		ret = 0;
+	} else {
+		ret = 1;
+	}
+	jbd_unlock_bh_state(bh);
+	return ret;
+}
+
+/**
+ * ext3_try_to_allocate()
+ * @sb:			superblock
+ * @handle:		handle to this transaction
+ * @group:		given allocation block group
+ * @bitmap_bh:		bufferhead holds the block bitmap
+ * @grp_goal:		given target block within the group
+ * @count:		target number of blocks to allocate
+ * @my_rsv:		reservation window
+ *
+ * Attempt to allocate blocks within a give range. Set the range of allocation
+ * first, then find the first free bit(s) from the bitmap (within the range),
+ * and at last, allocate the blocks by claiming the found free bit as allocated.
+ *
+ * To set the range of this allocation:
+ *	if there is a reservation window, only try to allocate block(s) from the
+ *	file's own reservation window;
+ *	Otherwise, the allocation range starts from the give goal block, ends at
+ *	the block group's last block.
+ *
+ * If we failed to allocate the desired block then we may end up crossing to a
+ * new bitmap.  In that case we must release write access to the old one via
+ * ext3_journal_release_buffer(), else we'll run out of credits.
+ */
+static ext3_grpblk_t
+ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
+			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
+			unsigned long *count, struct ext3_reserve_window *my_rsv)
+{
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t start, end;
+	unsigned long num = 0;
+
+	/* we do allocation within the reservation window if we have a window */
+	if (my_rsv) {
+		group_first_block = ext3_group_first_block_no(sb, group);
+		if (my_rsv->_rsv_start >= group_first_block)
+			start = my_rsv->_rsv_start - group_first_block;
+		else
+			/* reservation window cross group boundary */
+			start = 0;
+		end = my_rsv->_rsv_end - group_first_block + 1;
+		if (end > EXT3_BLOCKS_PER_GROUP(sb))
+			/* reservation window crosses group boundary */
+			end = EXT3_BLOCKS_PER_GROUP(sb);
+		if ((start <= grp_goal) && (grp_goal < end))
+			start = grp_goal;
+		else
+			grp_goal = -1;
+	} else {
+		if (grp_goal > 0)
+			start = grp_goal;
+		else
+			start = 0;
+		end = EXT3_BLOCKS_PER_GROUP(sb);
+	}
+
+	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
+
+repeat:
+	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
+		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		if (grp_goal < 0)
+			goto fail_access;
+		if (!my_rsv) {
+			int i;
+
+			for (i = 0; i < 7 && grp_goal > start &&
+					ext3_test_allocatable(grp_goal - 1,
+								bitmap_bh);
+					i++, grp_goal--)
+				;
+		}
+	}
+	start = grp_goal;
+
+	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
+		grp_goal, bitmap_bh)) {
+		/*
+		 * The block was allocated by another thread, or it was
+		 * allocated and then freed by another thread
+		 */
+		start++;
+		grp_goal++;
+		if (start >= end)
+			goto fail_access;
+		goto repeat;
+	}
+	num++;
+	grp_goal++;
+	while (num < *count && grp_goal < end
+		&& ext3_test_allocatable(grp_goal, bitmap_bh)
+		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group),
+				grp_goal, bitmap_bh)) {
+		num++;
+		grp_goal++;
+	}
+	*count = num;
+	return grp_goal - num;
+fail_access:
+	*count = num;
+	return -1;
+}
+
+/**
+ *	find_next_reservable_window():
+ *		find a reservable space within the given range.
+ *		It does not allocate the reservation window for now:
+ *		alloc_new_reservation() will do the work later.
+ *
+ *	@search_head: the head of the searching list;
+ *		This is not necessarily the list head of the whole filesystem
+ *
+ *		We have both head and start_block to assist the search
+ *		for the reservable space. The list starts from head,
+ *		but we will shift to the place where start_block is,
+ *		then start from there, when looking for a reservable space.
+ *
+ *	@my_rsv: the reservation window
+ *
+ *	@sb: the super block
+ *
+ *	@start_block: the first block we consider to start
+ *			the real search from
+ *
+ *	@last_block:
+ *		the maximum block number that our goal reservable space
+ *		could start from. This is normally the last block in this
+ *		group. The search will end when we found the start of next
+ *		possible reservable space is out of this boundary.
+ *		This could handle the cross boundary reservation window
+ *		request.
+ *
+ *	basically we search from the given range, rather than the whole
+ *	reservation double linked list, (start_block, last_block)
+ *	to find a free region that is of my size and has not
+ *	been reserved.
+ *
+ */
+static int find_next_reservable_window(
+				struct ext3_reserve_window_node *search_head,
+				struct ext3_reserve_window_node *my_rsv,
+				struct super_block * sb,
+				ext3_fsblk_t start_block,
+				ext3_fsblk_t last_block)
+{
+	struct rb_node *next;
+	struct ext3_reserve_window_node *rsv, *prev;
+	ext3_fsblk_t cur;
+	int size = my_rsv->rsv_goal_size;
+
+	/* TODO: make the start of the reservation window byte-aligned */
+	/* cur = *start_block & ~7;*/
+	cur = start_block;
+	rsv = search_head;
+	if (!rsv)
+		return -1;
+
+	while (1) {
+		if (cur <= rsv->rsv_end)
+			cur = rsv->rsv_end + 1;
+
+		/* TODO?
+		 * in the case we could not find a reservable space
+		 * that is what is expected, during the re-search, we could
+		 * remember what's the largest reservable space we could have
+		 * and return that one.
+		 *
+		 * For now it will fail if we could not find the reservable
+		 * space with expected-size (or more)...
+		 */
+		if (cur > last_block)
+			return -1;		/* fail */
+
+		prev = rsv;
+		next = rb_next(&rsv->rsv_node);
+		rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node);
+
+		/*
+		 * Reached the last reservation, we can just append to the
+		 * previous one.
+		 */
+		if (!next)
+			break;
+
+		if (cur + size <= rsv->rsv_start) {
+			/*
+			 * Found a reserveable space big enough.  We could
+			 * have a reservation across the group boundary here
+			 */
+			break;
+		}
+	}
+	/*
+	 * we come here either :
+	 * when we reach the end of the whole list,
+	 * and there is empty reservable space after last entry in the list.
+	 * append it to the end of the list.
+	 *
+	 * or we found one reservable space in the middle of the list,
+	 * return the reservation window that we could append to.
+	 * succeed.
+	 */
+
+	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
+		rsv_window_remove(sb, my_rsv);
+
+	/*
+	 * Let's book the whole available window for now.  We will check the
+	 * disk bitmap later and then, if there are free blocks then we adjust
+	 * the window size if it's larger than requested.
+	 * Otherwise, we will remove this node from the tree next time
+	 * call find_next_reservable_window.
+	 */
+	my_rsv->rsv_start = cur;
+	my_rsv->rsv_end = cur + size - 1;
+	my_rsv->rsv_alloc_hit = 0;
+
+	if (prev != my_rsv)
+		ext3_rsv_window_add(sb, my_rsv);
+
+	return 0;
+}
+
+/**
+ *	alloc_new_reservation()--allocate a new reservation window
+ *
+ *		To make a new reservation, we search part of the filesystem
+ *		reservation list (the list that inside the group). We try to
+ *		allocate a new reservation window near the allocation goal,
+ *		or the beginning of the group, if there is no goal.
+ *
+ *		We first find a reservable space after the goal, then from
+ *		there, we check the bitmap for the first free block after
+ *		it. If there is no free block until the end of group, then the
+ *		whole group is full, we failed. Otherwise, check if the free
+ *		block is inside the expected reservable space, if so, we
+ *		succeed.
+ *		If the first free block is outside the reservable space, then
+ *		start from the first free block, we search for next available
+ *		space, and go on.
+ *
+ *	on succeed, a new reservation will be found and inserted into the list
+ *	It contains at least one free block, and it does not overlap with other
+ *	reservation windows.
+ *
+ *	failed: we failed to find a reservation window in this group
+ *
+ *	@my_rsv: the reservation window
+ *
+ *	@grp_goal: The goal (group-relative).  It is where the search for a
+ *		free reservable space should start from.
+ *		if we have a grp_goal(grp_goal >0 ), then start from there,
+ *		no grp_goal(grp_goal = -1), we start from the first block
+ *		of the group.
+ *
+ *	@sb: the super block
+ *	@group: the group we are trying to allocate in
+ *	@bitmap_bh: the block group block bitmap
+ *
+ */
+static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
+		ext3_grpblk_t grp_goal, struct super_block *sb,
+		unsigned int group, struct buffer_head *bitmap_bh)
+{
+	struct ext3_reserve_window_node *search_head;
+	ext3_fsblk_t group_first_block, group_end_block, start_block;
+	ext3_grpblk_t first_free_block;
+	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
+	unsigned long size;
+	int ret;
+	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
+
+	group_first_block = ext3_group_first_block_no(sb, group);
+	group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
+
+	if (grp_goal < 0)
+		start_block = group_first_block;
+	else
+		start_block = grp_goal + group_first_block;
+
+	trace_ext3_alloc_new_reservation(sb, start_block);
+	size = my_rsv->rsv_goal_size;
+
+	if (!rsv_is_empty(&my_rsv->rsv_window)) {
+		/*
+		 * if the old reservation is cross group boundary
+		 * and if the goal is inside the old reservation window,
+		 * we will come here when we just failed to allocate from
+		 * the first part of the window. We still have another part
+		 * that belongs to the next group. In this case, there is no
+		 * point to discard our window and try to allocate a new one
+		 * in this group(which will fail). we should
+		 * keep the reservation window, just simply move on.
+		 *
+		 * Maybe we could shift the start block of the reservation
+		 * window to the first block of next group.
+		 */
+
+		if ((my_rsv->rsv_start <= group_end_block) &&
+				(my_rsv->rsv_end > group_end_block) &&
+				(start_block >= my_rsv->rsv_start))
+			return -1;
+
+		if ((my_rsv->rsv_alloc_hit >
+		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
+			/*
+			 * if the previously allocation hit ratio is
+			 * greater than 1/2, then we double the size of
+			 * the reservation window the next time,
+			 * otherwise we keep the same size window
+			 */
+			size = size * 2;
+			if (size > EXT3_MAX_RESERVE_BLOCKS)
+				size = EXT3_MAX_RESERVE_BLOCKS;
+			my_rsv->rsv_goal_size= size;
+		}
+	}
+
+	spin_lock(rsv_lock);
+	/*
+	 * shift the search start to the window near the goal block
+	 */
+	search_head = search_reserve_window(fs_rsv_root, start_block);
+
+	/*
+	 * find_next_reservable_window() simply finds a reservable window
+	 * inside the given range(start_block, group_end_block).
+	 *
+	 * To make sure the reservation window has a free bit inside it, we
+	 * need to check the bitmap after we found a reservable window.
+	 */
+retry:
+	ret = find_next_reservable_window(search_head, my_rsv, sb,
+						start_block, group_end_block);
+
+	if (ret == -1) {
+		if (!rsv_is_empty(&my_rsv->rsv_window))
+			rsv_window_remove(sb, my_rsv);
+		spin_unlock(rsv_lock);
+		return -1;
+	}
+
+	/*
+	 * On success, find_next_reservable_window() returns the
+	 * reservation window where there is a reservable space after it.
+	 * Before we reserve this reservable space, we need
+	 * to make sure there is at least a free block inside this region.
+	 *
+	 * searching the first free bit on the block bitmap and copy of
+	 * last committed bitmap alternatively, until we found a allocatable
+	 * block. Search start from the start block of the reservable space
+	 * we just found.
+	 */
+	spin_unlock(rsv_lock);
+	first_free_block = bitmap_search_next_usable_block(
+			my_rsv->rsv_start - group_first_block,
+			bitmap_bh, group_end_block - group_first_block + 1);
+
+	if (first_free_block < 0) {
+		/*
+		 * no free block left on the bitmap, no point
+		 * to reserve the space. return failed.
+		 */
+		spin_lock(rsv_lock);
+		if (!rsv_is_empty(&my_rsv->rsv_window))
+			rsv_window_remove(sb, my_rsv);
+		spin_unlock(rsv_lock);
+		return -1;		/* failed */
+	}
+
+	start_block = first_free_block + group_first_block;
+	/*
+	 * check if the first free block is within the
+	 * free space we just reserved
+	 */
+	if (start_block >= my_rsv->rsv_start &&
+	    start_block <= my_rsv->rsv_end) {
+		trace_ext3_reserved(sb, start_block, my_rsv);
+		return 0;		/* success */
+	}
+	/*
+	 * if the first free bit we found is out of the reservable space
+	 * continue search for next reservable space,
+	 * start from where the free block is,
+	 * we also shift the list head to where we stopped last time
+	 */
+	search_head = my_rsv;
+	spin_lock(rsv_lock);
+	goto retry;
+}
+
+/**
+ * try_to_extend_reservation()
+ * @my_rsv:		given reservation window
+ * @sb:			super block
+ * @size:		the delta to extend
+ *
+ * Attempt to expand the reservation window large enough to have
+ * required number of free blocks
+ *
+ * Since ext3_try_to_allocate() will always allocate blocks within
+ * the reservation window range, if the window size is too small,
+ * multiple blocks allocation has to stop at the end of the reservation
+ * window. To make this more efficient, given the total number of
+ * blocks needed and the current size of the window, we try to
+ * expand the reservation window size if necessary on a best-effort
+ * basis before ext3_new_blocks() tries to allocate blocks,
+ */
+static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
+			struct super_block *sb, int size)
+{
+	struct ext3_reserve_window_node *next_rsv;
+	struct rb_node *next;
+	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
+
+	if (!spin_trylock(rsv_lock))
+		return;
+
+	next = rb_next(&my_rsv->rsv_node);
+
+	if (!next)
+		my_rsv->rsv_end += size;
+	else {
+		next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
+
+		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
+			my_rsv->rsv_end += size;
+		else
+			my_rsv->rsv_end = next_rsv->rsv_start - 1;
+	}
+	spin_unlock(rsv_lock);
+}
+
+/**
+ * ext3_try_to_allocate_with_rsv()
+ * @sb:			superblock
+ * @handle:		handle to this transaction
+ * @group:		given allocation block group
+ * @bitmap_bh:		bufferhead holds the block bitmap
+ * @grp_goal:		given target block within the group
+ * @my_rsv:		reservation window
+ * @count:		target number of blocks to allocate
+ * @errp:		pointer to store the error code
+ *
+ * This is the main function used to allocate a new block and its reservation
+ * window.
+ *
+ * Each time when a new block allocation is need, first try to allocate from
+ * its own reservation.  If it does not have a reservation window, instead of
+ * looking for a free bit on bitmap first, then look up the reservation list to
+ * see if it is inside somebody else's reservation window, we try to allocate a
+ * reservation window for it starting from the goal first. Then do the block
+ * allocation within the reservation window.
+ *
+ * This will avoid keeping on searching the reservation list again and
+ * again when somebody is looking for a free block (without
+ * reservation), and there are lots of free blocks, but they are all
+ * being reserved.
+ *
+ * We use a red-black tree for the per-filesystem reservation list.
+ *
+ */
+static ext3_grpblk_t
+ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
+			unsigned int group, struct buffer_head *bitmap_bh,
+			ext3_grpblk_t grp_goal,
+			struct ext3_reserve_window_node * my_rsv,
+			unsigned long *count, int *errp)
+{
+	ext3_fsblk_t group_first_block, group_last_block;
+	ext3_grpblk_t ret = 0;
+	int fatal;
+	unsigned long num = *count;
+
+	*errp = 0;
+
+	/*
+	 * Make sure we use undo access for the bitmap, because it is critical
+	 * that we do the frozen_data COW on bitmap buffers in all cases even
+	 * if the buffer is in BJ_Forget state in the committing transaction.
+	 */
+	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+	fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
+	if (fatal) {
+		*errp = fatal;
+		return -1;
+	}
+
+	/*
+	 * we don't deal with reservation when
+	 * filesystem is mounted without reservation
+	 * or the file is not a regular file
+	 * or last attempt to allocate a block with reservation turned on failed
+	 */
+	if (my_rsv == NULL ) {
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
+						grp_goal, count, NULL);
+		goto out;
+	}
+	/*
+	 * grp_goal is a group relative block number (if there is a goal)
+	 * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
+	 * first block is a filesystem wide block number
+	 * first block is the block number of the first block in this group
+	 */
+	group_first_block = ext3_group_first_block_no(sb, group);
+	group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
+
+	/*
+	 * Basically we will allocate a new block from inode's reservation
+	 * window.
+	 *
+	 * We need to allocate a new reservation window, if:
+	 * a) inode does not have a reservation window; or
+	 * b) last attempt to allocate a block from existing reservation
+	 *    failed; or
+	 * c) we come here with a goal and with a reservation window
+	 *
+	 * We do not need to allocate a new reservation window if we come here
+	 * at the beginning with a goal and the goal is inside the window, or
+	 * we don't have a goal but already have a reservation window.
+	 * then we could go to allocate from the reservation window directly.
+	 */
+	while (1) {
+		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
+			!goal_in_my_reservation(&my_rsv->rsv_window,
+						grp_goal, group, sb)) {
+			if (my_rsv->rsv_goal_size < *count)
+				my_rsv->rsv_goal_size = *count;
+			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
+							group, bitmap_bh);
+			if (ret < 0)
+				break;			/* failed */
+
+			if (!goal_in_my_reservation(&my_rsv->rsv_window,
+							grp_goal, group, sb))
+				grp_goal = -1;
+		} else if (grp_goal >= 0) {
+			int curr = my_rsv->rsv_end -
+					(grp_goal + group_first_block) + 1;
+
+			if (curr < *count)
+				try_to_extend_reservation(my_rsv, sb,
+							*count - curr);
+		}
+
+		if ((my_rsv->rsv_start > group_last_block) ||
+				(my_rsv->rsv_end < group_first_block)) {
+			rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
+			BUG();
+		}
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
+					   grp_goal, &num, &my_rsv->rsv_window);
+		if (ret >= 0) {
+			my_rsv->rsv_alloc_hit += num;
+			*count = num;
+			break;				/* succeed */
+		}
+		num = *count;
+	}
+out:
+	if (ret >= 0) {
+		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
+					"bitmap block");
+		fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
+		if (fatal) {
+			*errp = fatal;
+			return -1;
+		}
+		return ret;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+	ext3_journal_release_buffer(handle, bitmap_bh);
+	return ret;
+}
+
+/**
+ * ext3_has_free_blocks()
+ * @sbi:		in-core super block structure.
+ *
+ * Check if filesystem has at least 1 free block available for allocation.
+ */
+static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
+{
+	ext3_fsblk_t free_blocks, root_blocks;
+
+	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
+	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+		!use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) &&
+		(gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
+		 !in_group_p (sbi->s_resgid))) {
+		return 0;
+	}
+	return 1;
+}
+
+/**
+ * ext3_should_retry_alloc()
+ * @sb:			super block
+ * @retries		number of attemps has been made
+ *
+ * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
+ * it is profitable to retry the operation, this function will wait
+ * for the current or committing transaction to complete, and then
+ * return TRUE.
+ *
+ * if the total number of retries exceed three times, return FALSE.
+ */
+int ext3_should_retry_alloc(struct super_block *sb, int *retries)
+{
+	if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
+		return 0;
+
+	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+
+	return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
+}
+
+/**
+ * ext3_new_blocks() -- core block(s) allocation function
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		target number of blocks to allocate
+ * @errp:		error code
+ *
+ * ext3_new_blocks uses a goal block to assist allocation.  It tries to
+ * allocate block(s) from the block group contains the goal block first. If that
+ * fails, it will try to allocate block(s) from other block groups without
+ * any specific goal block.
+ *
+ */
+ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, unsigned long *count, int *errp)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gdp_bh;
+	int group_no;
+	int goal_group;
+	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
+	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
+	ext3_fsblk_t ret_block;		/* filesyetem-wide allocated block */
+	int bgi;			/* blockgroup iteration index */
+	int fatal = 0, err;
+	int performed_allocation = 0;
+	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
+	struct super_block *sb;
+	struct ext3_group_desc *gdp;
+	struct ext3_super_block *es;
+	struct ext3_sb_info *sbi;
+	struct ext3_reserve_window_node *my_rsv = NULL;
+	struct ext3_block_alloc_info *block_i;
+	unsigned short windowsz = 0;
+#ifdef EXT3FS_DEBUG
+	static int goal_hits, goal_attempts;
+#endif
+	unsigned long ngroups;
+	unsigned long num = *count;
+
+	*errp = -ENOSPC;
+	sb = inode->i_sb;
+
+	/*
+	 * Check quota for allocation of this block.
+	 */
+	err = dquot_alloc_block(inode, num);
+	if (err) {
+		*errp = err;
+		return 0;
+	}
+
+	trace_ext3_request_blocks(inode, goal, num);
+
+	sbi = EXT3_SB(sb);
+	es = sbi->s_es;
+	ext3_debug("goal=%lu.\n", goal);
+	/*
+	 * Allocate a block from reservation only when
+	 * filesystem is mounted with reservation(default,-o reservation), and
+	 * it's a regular file, and
+	 * the desired window size is greater than 0 (One could use ioctl
+	 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
+	 * reservation on that particular file)
+	 */
+	block_i = EXT3_I(inode)->i_block_alloc_info;
+	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
+		my_rsv = &block_i->rsv_window_node;
+
+	if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
+		*errp = -ENOSPC;
+		goto out;
+	}
+
+	/*
+	 * First, test whether the goal block is free.
+	 */
+	if (goal < le32_to_cpu(es->s_first_data_block) ||
+	    goal >= le32_to_cpu(es->s_blocks_count))
+		goal = le32_to_cpu(es->s_first_data_block);
+	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
+			EXT3_BLOCKS_PER_GROUP(sb);
+	goal_group = group_no;
+retry_alloc:
+	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
+	if (!gdp)
+		goto io_error;
+
+	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+	/*
+	 * if there is not enough free blocks to make a new resevation
+	 * turn off reservation for this allocation
+	 */
+	if (my_rsv && (free_blocks < windowsz)
+		&& (free_blocks > 0)
+		&& (rsv_is_empty(&my_rsv->rsv_window)))
+		my_rsv = NULL;
+
+	if (free_blocks > 0) {
+		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
+				EXT3_BLOCKS_PER_GROUP(sb));
+		bitmap_bh = read_block_bitmap(sb, group_no);
+		if (!bitmap_bh)
+			goto io_error;
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, grp_target_blk,
+					my_rsv,	&num, &fatal);
+		if (fatal)
+			goto out;
+		if (grp_alloc_blk >= 0)
+			goto allocated;
+	}
+
+	ngroups = EXT3_SB(sb)->s_groups_count;
+	smp_rmb();
+
+	/*
+	 * Now search the rest of the groups.  We assume that
+	 * group_no and gdp correctly point to the last group visited.
+	 */
+	for (bgi = 0; bgi < ngroups; bgi++) {
+		group_no++;
+		if (group_no >= ngroups)
+			group_no = 0;
+		gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
+		if (!gdp)
+			goto io_error;
+		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
+		 * skip this group if the number of
+		 * free blocks is less than half of the reservation
+		 * window size.
+		 */
+		if (my_rsv && (free_blocks <= (windowsz/2)))
+			continue;
+
+		brelse(bitmap_bh);
+		bitmap_bh = read_block_bitmap(sb, group_no);
+		if (!bitmap_bh)
+			goto io_error;
+		/*
+		 * try to allocate block(s) from this group, without a goal(-1).
+		 */
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, -1, my_rsv,
+					&num, &fatal);
+		if (fatal)
+			goto out;
+		if (grp_alloc_blk >= 0)
+			goto allocated;
+	}
+	/*
+	 * We may end up a bogus earlier ENOSPC error due to
+	 * filesystem is "full" of reservations, but
+	 * there maybe indeed free blocks available on disk
+	 * In this case, we just forget about the reservations
+	 * just do block allocation as without reservations.
+	 */
+	if (my_rsv) {
+		my_rsv = NULL;
+		windowsz = 0;
+		group_no = goal_group;
+		goto retry_alloc;
+	}
+	/* No space left on the device */
+	*errp = -ENOSPC;
+	goto out;
+
+allocated:
+
+	ext3_debug("using block group %d(%d)\n",
+			group_no, gdp->bg_free_blocks_count);
+
+	BUFFER_TRACE(gdp_bh, "get_write_access");
+	fatal = ext3_journal_get_write_access(handle, gdp_bh);
+	if (fatal)
+		goto out;
+
+	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
+
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
+		      EXT3_SB(sb)->s_itb_per_group) ||
+	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+		      EXT3_SB(sb)->s_itb_per_group)) {
+		ext3_error(sb, "ext3_new_block",
+			    "Allocating block in system zone - "
+			    "blocks from "E3FSBLK", length %lu",
+			     ret_block, num);
+		/*
+		 * claim_block() marked the blocks we allocated as in use. So we
+		 * may want to selectively mark some of the blocks as free.
+		 */
+		goto retry_alloc;
+	}
+
+	performed_allocation = 1;
+
+#ifdef CONFIG_JBD_DEBUG
+	{
+		struct buffer_head *debug_bh;
+
+		/* Record bitmap buffer state in the newly allocated block */
+		debug_bh = sb_find_get_block(sb, ret_block);
+		if (debug_bh) {
+			BUFFER_TRACE(debug_bh, "state when allocated");
+			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
+			brelse(debug_bh);
+		}
+	}
+	jbd_lock_bh_state(bitmap_bh);
+	spin_lock(sb_bgl_lock(sbi, group_no));
+	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
+		int i;
+
+		for (i = 0; i < num; i++) {
+			if (ext3_test_bit(grp_alloc_blk+i,
+					bh2jh(bitmap_bh)->b_committed_data)) {
+				printk("%s: block was unexpectedly set in "
+					"b_committed_data\n", __func__);
+			}
+		}
+	}
+	ext3_debug("found bit %d\n", grp_alloc_blk);
+	spin_unlock(sb_bgl_lock(sbi, group_no));
+	jbd_unlock_bh_state(bitmap_bh);
+#endif
+
+	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
+		ext3_error(sb, "ext3_new_block",
+			    "block("E3FSBLK") >= blocks count(%d) - "
+			    "block_group = %d, es == %p ", ret_block,
+			le32_to_cpu(es->s_blocks_count), group_no, es);
+		goto out;
+	}
+
+	/*
+	 * It is up to the caller to add the new buffer to a journal
+	 * list of some description.  We don't know in advance whether
+	 * the caller wants to use it as metadata or data.
+	 */
+	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
+			ret_block, goal_hits, goal_attempts);
+
+	spin_lock(sb_bgl_lock(sbi, group_no));
+	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
+	spin_unlock(sb_bgl_lock(sbi, group_no));
+	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
+	fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
+	if (fatal)
+		goto out;
+
+	*errp = 0;
+	brelse(bitmap_bh);
+
+	if (num < *count) {
+		dquot_free_block(inode, *count-num);
+		*count = num;
+	}
+
+	trace_ext3_allocate_blocks(inode, goal, num,
+				   (unsigned long long)ret_block);
+
+	return ret_block;
+
+io_error:
+	*errp = -EIO;
+out:
+	if (fatal) {
+		*errp = fatal;
+		ext3_std_error(sb, fatal);
+	}
+	/*
+	 * Undo the block allocation
+	 */
+	if (!performed_allocation)
+		dquot_free_block(inode, *count);
+	brelse(bitmap_bh);
+	return 0;
+}
+
+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int *errp)
+{
+	unsigned long count = 1;
+
+	return ext3_new_blocks(handle, inode, goal, &count, errp);
+}
+
+/**
+ * ext3_count_free_blocks() -- count filesystem free blocks
+ * @sb:		superblock
+ *
+ * Adds up the number of free blocks from each block group.
+ */
+ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
+{
+	ext3_fsblk_t desc_count;
+	struct ext3_group_desc *gdp;
+	int i;
+	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
+#ifdef EXT3FS_DEBUG
+	struct ext3_super_block *es;
+	ext3_fsblk_t bitmap_count;
+	unsigned long x;
+	struct buffer_head *bitmap_bh = NULL;
+
+	es = EXT3_SB(sb)->s_es;
+	desc_count = 0;
+	bitmap_count = 0;
+	gdp = NULL;
+
+	smp_rmb();
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext3_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+		brelse(bitmap_bh);
+		bitmap_bh = read_block_bitmap(sb, i);
+		if (bitmap_bh == NULL)
+			continue;
+
+		x = ext3_count_free(bitmap_bh, sb->s_blocksize);
+		printk("group %d: stored = %d, counted = %lu\n",
+			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+		bitmap_count += x;
+	}
+	brelse(bitmap_bh);
+	printk("ext3_count_free_blocks: stored = "E3FSBLK
+		", computed = "E3FSBLK", "E3FSBLK"\n",
+	       (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count),
+		desc_count, bitmap_count);
+	return bitmap_count;
+#else
+	desc_count = 0;
+	smp_rmb();
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext3_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+	}
+
+	return desc_count;
+#endif
+}
+
+static inline int test_root(int a, int b)
+{
+	int num = b;
+
+	while (a > num)
+		num *= b;
+	return num == a;
+}
+
+static int ext3_group_sparse(int group)
+{
+	if (group <= 1)
+		return 1;
+	if (!(group & 1))
+		return 0;
+	return (test_root(group, 7) || test_root(group, 5) ||
+		test_root(group, 3));
+}
+
+/**
+ *	ext3_bg_has_super - number of blocks used by the superblock in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the superblock (primary or backup)
+ *	in this group.  Currently this will be only 0 or 1.
+ */
+int ext3_bg_has_super(struct super_block *sb, int group)
+{
+	if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
+				EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
+			!ext3_group_sparse(group))
+		return 0;
+	return 1;
+}
+
+static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
+{
+	unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
+	unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb);
+	unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1;
+
+	if (group == first || group == first + 1 || group == last)
+		return 1;
+	return 0;
+}
+
+static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
+{
+	return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0;
+}
+
+/**
+ *	ext3_bg_num_gdb - number of blocks used by the group table in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the group descriptor table
+ *	(primary or backup) in this group.  In the future there may be a
+ *	different number of descriptor blocks in each group.
+ */
+unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
+{
+	unsigned long first_meta_bg =
+			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
+	unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
+
+	if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) ||
+			metagroup < first_meta_bg)
+		return ext3_bg_num_gdb_nometa(sb,group);
+
+	return ext3_bg_num_gdb_meta(sb,group);
+
+}
+
+/**
+ * ext3_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @group:		allocation group to trim
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @gdp:		allocation group description structure
+ * @minblocks:		minimum extent block count
+ *
+ * ext3_trim_all_free walks through group's block bitmap searching for free
+ * blocks. When the free block is found, it tries to allocate this block and
+ * consequent free block to get the biggest free extent possible, until it
+ * reaches any used block. Then issue a TRIM command on this extent and free
+ * the extent in the block bitmap. This is done until whole group is scanned.
+ */
+static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
+					unsigned int group,
+					ext3_grpblk_t start, ext3_grpblk_t max,
+					ext3_grpblk_t minblocks)
+{
+	handle_t *handle;
+	ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
+	ext3_fsblk_t discard_block;
+	struct ext3_sb_info *sbi;
+	struct buffer_head *gdp_bh, *bitmap_bh = NULL;
+	struct ext3_group_desc *gdp;
+	int err = 0, ret = 0;
+
+	/*
+	 * We will update one block bitmap, and one group descriptor
+	 */
+	handle = ext3_journal_start_sb(sb, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	bitmap_bh = read_block_bitmap(sb, group);
+	if (!bitmap_bh) {
+		err = -EIO;
+		goto err_out;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "getting undo access");
+	err = ext3_journal_get_undo_access(handle, bitmap_bh);
+	if (err)
+		goto err_out;
+
+	gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+	if (!gdp) {
+		err = -EIO;
+		goto err_out;
+	}
+
+	BUFFER_TRACE(gdp_bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, gdp_bh);
+	if (err)
+		goto err_out;
+
+	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+	sbi = EXT3_SB(sb);
+
+	 /* Walk through the whole group */
+	while (start <= max) {
+		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
+		if (start < 0)
+			break;
+		next = start;
+
+		/*
+		 * Allocate contiguous free extents by setting bits in the
+		 * block bitmap
+		 */
+		while (next <= max
+			&& claim_block(sb_bgl_lock(sbi, group),
+					next, bitmap_bh)) {
+			next++;
+		}
+
+		 /* We did not claim any blocks */
+		if (next == start)
+			continue;
+
+		discard_block = (ext3_fsblk_t)start +
+				ext3_group_first_block_no(sb, group);
+
+		/* Update counters */
+		spin_lock(sb_bgl_lock(sbi, group));
+		le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
+		spin_unlock(sb_bgl_lock(sbi, group));
+		percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
+
+		free_blocks -= next - start;
+		/* Do not issue a TRIM on extents smaller than minblocks */
+		if ((next - start) < minblocks)
+			goto free_extent;
+
+		trace_ext3_discard_blocks(sb, discard_block, next - start);
+		 /* Send the TRIM command down to the device */
+		err = sb_issue_discard(sb, discard_block, next - start,
+				       GFP_NOFS, 0);
+		count += (next - start);
+free_extent:
+		freed = 0;
+
+		/*
+		 * Clear bits in the bitmap
+		 */
+		for (bit = start; bit < next; bit++) {
+			BUFFER_TRACE(bitmap_bh, "clear bit");
+			if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
+						bit, bitmap_bh->b_data)) {
+				ext3_error(sb, __func__,
+					"bit already cleared for block "E3FSBLK,
+					 (unsigned long)bit);
+				BUFFER_TRACE(bitmap_bh, "bit already cleared");
+			} else {
+				freed++;
+			}
+		}
+
+		/* Update couters */
+		spin_lock(sb_bgl_lock(sbi, group));
+		le16_add_cpu(&gdp->bg_free_blocks_count, freed);
+		spin_unlock(sb_bgl_lock(sbi, group));
+		percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+
+		start = next;
+		if (err < 0) {
+			if (err != -EOPNOTSUPP)
+				ext3_warning(sb, __func__, "Discard command "
+					     "returned error %d\n", err);
+			break;
+		}
+
+		if (fatal_signal_pending(current)) {
+			err = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+
+		/* No more suitable extents */
+		if (free_blocks < minblocks)
+			break;
+	}
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
+	if (!err)
+		err = ret;
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
+	ret = ext3_journal_dirty_metadata(handle, gdp_bh);
+	if (!err)
+		err = ret;
+
+	ext3_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+err_out:
+	if (err)
+		count = err;
+	ext3_journal_stop(handle);
+	brelse(bitmap_bh);
+
+	return count;
+}
+
+/**
+ * ext3_trim_fs() -- trim ioctl handle function
+ * @sb:			superblock for filesystem
+ * @start:		First Byte to trim
+ * @len:		number of Bytes to trim from start
+ * @minlen:		minimum extent length in Bytes
+ *
+ * ext3_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext3_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	ext3_grpblk_t last_block, first_block;
+	unsigned long group, first_group, last_group;
+	struct ext3_group_desc *gdp;
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+	uint64_t start, minlen, end, trimmed = 0;
+	ext3_fsblk_t first_data_blk =
+			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
+	ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
+	int ret = 0;
+
+	start = range->start >> sb->s_blocksize_bits;
+	end = start + (range->len >> sb->s_blocksize_bits) - 1;
+	minlen = range->minlen >> sb->s_blocksize_bits;
+
+	if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
+	    start >= max_blks ||
+	    range->len < sb->s_blocksize)
+		return -EINVAL;
+	if (end >= max_blks)
+		end = max_blks - 1;
+	if (end <= first_data_blk)
+		goto out;
+	if (start < first_data_blk)
+		start = first_data_blk;
+
+	smp_rmb();
+
+	/* Determine first and last group to examine based on start and len */
+	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
+				     &first_group, &first_block);
+	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
+				     &last_group, &last_block);
+
+	/* end now represents the last block to discard in this group */
+	end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
+
+	for (group = first_group; group <= last_group; group++) {
+		gdp = ext3_get_group_desc(sb, group, NULL);
+		if (!gdp)
+			break;
+
+		/*
+		 * For all the groups except the last one, last block will
+		 * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
+		 * change it for the last group, note that last_block is
+		 * already computed earlier by ext3_get_group_no_and_offset()
+		 */
+		if (group == last_group)
+			end = last_block;
+
+		if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
+			ret = ext3_trim_all_free(sb, group, first_block,
+						 end, minlen);
+			if (ret < 0)
+				break;
+			trimmed += ret;
+		}
+
+		/*
+		 * For every group except the first one, we are sure
+		 * that the first block to discard will be block #0.
+		 */
+		first_block = 0;
+	}
+
+	if (ret > 0)
+		ret = 0;
+
+out:
+	range->len = trimmed * sb->s_blocksize;
+	return ret;
+}
diff --git a/kernel/fs/ext3/bitmap.c b/kernel/fs/ext3/bitmap.c
new file mode 100644
index 000000000..ef9c643e8
--- /dev/null
+++ b/kernel/fs/ext3/bitmap.c
@@ -0,0 +1,20 @@
+/*
+ *  linux/fs/ext3/bitmap.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include "ext3.h"
+
+#ifdef EXT3FS_DEBUG
+
+unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
+{
+	return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars);
+}
+
+#endif  /*  EXT3FS_DEBUG  */
+
diff --git a/kernel/fs/ext3/dir.c b/kernel/fs/ext3/dir.c
new file mode 100644
index 000000000..17742eed2
--- /dev/null
+++ b/kernel/fs/ext3/dir.c
@@ -0,0 +1,537 @@
+/*
+ *  linux/fs/ext3/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/dir.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext3 directory handling functions
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * Hash Tree Directory indexing (c) 2001  Daniel Phillips
+ *
+ */
+
+#include <linux/compat.h>
+#include "ext3.h"
+
+static unsigned char ext3_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ext3_dx_readdir(struct file *, struct dir_context *);
+
+static unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
+	    (filetype >= EXT3_FT_MAX))
+		return DT_UNKNOWN;
+
+	return (ext3_filetype_table[filetype]);
+}
+
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+		     EXT3_FEATURE_COMPAT_DIR_INDEX) &&
+	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+		return 1;
+
+	return 0;
+}
+
+int ext3_check_dir_entry (const char * function, struct inode * dir,
+			  struct ext3_dir_entry_2 * de,
+			  struct buffer_head * bh,
+			  unsigned long offset)
+{
+	const char * error_msg = NULL;
+	const int rlen = ext3_rec_len_from_disk(de->rec_len);
+
+	if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
+		error_msg = "rec_len is smaller than minimal";
+	else if (unlikely(rlen % 4 != 0))
+		error_msg = "rec_len % 4 != 0";
+	else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
+		error_msg = "rec_len is too small for name_len";
+	else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
+		error_msg = "directory entry across blocks";
+	else if (unlikely(le32_to_cpu(de->inode) >
+			le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
+		error_msg = "inode out of bounds";
+
+	if (unlikely(error_msg != NULL))
+		ext3_error (dir->i_sb, function,
+			"bad entry in directory #%lu: %s - "
+			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+			dir->i_ino, error_msg, offset,
+			(unsigned long) le32_to_cpu(de->inode),
+			rlen, de->name_len);
+
+	return error_msg == NULL ? 1 : 0;
+}
+
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
+{
+	unsigned long offset;
+	int i;
+	struct ext3_dir_entry_2 *de;
+	int err;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	int dir_has_error = 0;
+
+	if (is_dx_dir(inode)) {
+		err = ext3_dx_readdir(file, ctx);
+		if (err != ERR_BAD_DX_DIR)
+			return err;
+		/*
+		 * We don't set the inode dirty flag since it's not
+		 * critical that it get flushed back to the disk.
+		 */
+		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
+	}
+	offset = ctx->pos & (sb->s_blocksize - 1);
+
+	while (ctx->pos < inode->i_size) {
+		unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
+		struct buffer_head map_bh;
+		struct buffer_head *bh = NULL;
+
+		map_bh.b_state = 0;
+		err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
+		if (err > 0) {
+			pgoff_t index = map_bh.b_blocknr >>
+					(PAGE_CACHE_SHIFT - inode->i_blkbits);
+			if (!ra_has_index(&file->f_ra, index))
+				page_cache_sync_readahead(
+					sb->s_bdev->bd_inode->i_mapping,
+					&file->f_ra, file,
+					index, 1);
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			bh = ext3_bread(NULL, inode, blk, 0, &err);
+		}
+
+		/*
+		 * We ignore I/O errors on directories so users have a chance
+		 * of recovering data when there's a bad sector
+		 */
+		if (!bh) {
+			if (!dir_has_error) {
+				ext3_error(sb, __func__, "directory #%lu "
+					"contains a hole at offset %lld",
+					inode->i_ino, ctx->pos);
+				dir_has_error = 1;
+			}
+			/* corrupt size?  Maybe no more blocks to read */
+			if (ctx->pos > inode->i_blocks << 9)
+				break;
+			ctx->pos += sb->s_blocksize - offset;
+			continue;
+		}
+
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (offset && file->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ext3_dir_entry_2 *)
+					(bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (ext3_rec_len_from_disk(de->rec_len) <
+						EXT3_DIR_REC_LEN(1))
+					break;
+				i += ext3_rec_len_from_disk(de->rec_len);
+			}
+			offset = i;
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
+				| offset;
+			file->f_version = inode->i_version;
+		}
+
+		while (ctx->pos < inode->i_size
+		       && offset < sb->s_blocksize) {
+			de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
+			if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
+						   bh, offset)) {
+				/* On error, skip the to the
+                                   next block. */
+				ctx->pos = (ctx->pos |
+						(sb->s_blocksize - 1)) + 1;
+				break;
+			}
+			offset += ext3_rec_len_from_disk(de->rec_len);
+			if (le32_to_cpu(de->inode)) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+					      le32_to_cpu(de->inode),
+					      get_dtype(sb, de->file_type))) {
+					brelse(bh);
+					return 0;
+				}
+			}
+			ctx->pos += ext3_rec_len_from_disk(de->rec_len);
+		}
+		offset = 0;
+		brelse (bh);
+		if (ctx->pos < inode->i_size)
+			if (!dir_relax(inode))
+				return 0;
+	}
+	return 0;
+}
+
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+	return is_compat_task();
+#else
+	return (BITS_PER_LONG == 32);
+#endif
+}
+
+/*
+ * These functions convert from the major/minor hash to an f_pos
+ * value for dx directories
+ *
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return major >> 1;
+	else
+		return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return (pos << 1) & 0xffffffff;
+	else
+		return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return 0;
+	else
+		return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext3_get_htree_eof(struct file *filp)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return EXT3_HTREE_EOF_32BIT;
+	else
+		return EXT3_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
+ * non-htree and htree directories, where the "offset" is in terms
+ * of the filename hash value instead of the byte offset.
+ *
+ * Because we may return a 64-bit hash that is well beyond s_maxbytes,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case.
+ *
+ * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
+ *       will be invalid once the directory was converted into a dx directory
+ */
+static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	int dx_dir = is_dx_dir(inode);
+	loff_t htree_max = ext3_get_htree_eof(file);
+
+	if (likely(dx_dir))
+		return generic_file_llseek_size(file, offset, whence,
+					        htree_max, htree_max);
+	else
+		return generic_file_llseek(file, offset, whence);
+}
+
+/*
+ * This structure holds the nodes of the red-black tree used to store
+ * the directory entry in hash order.
+ */
+struct fname {
+	__u32		hash;
+	__u32		minor_hash;
+	struct rb_node	rb_hash;
+	struct fname	*next;
+	__u32		inode;
+	__u8		name_len;
+	__u8		file_type;
+	char		name[0];
+};
+
+/*
+ * This functoin implements a non-recursive way of freeing all of the
+ * nodes in the red-black tree.
+ */
+static void free_rb_tree_fname(struct rb_root *root)
+{
+	struct fname *fname, *next;
+
+	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
+		do {
+			struct fname *old = fname;
+			fname = fname->next;
+			kfree(old);
+		} while (fname);
+
+	*root = RB_ROOT;
+}
+
+static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
+							   loff_t pos)
+{
+	struct dir_private_info *p;
+
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	if (!p)
+		return NULL;
+	p->curr_hash = pos2maj_hash(filp, pos);
+	p->curr_minor_hash = pos2min_hash(filp, pos);
+	return p;
+}
+
+void ext3_htree_free_dir_info(struct dir_private_info *p)
+{
+	free_rb_tree_fname(&p->root);
+	kfree(p);
+}
+
+/*
+ * Given a directory entry, enter it into the fname rb tree.
+ */
+int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+			     __u32 minor_hash,
+			     struct ext3_dir_entry_2 *dirent)
+{
+	struct rb_node **p, *parent = NULL;
+	struct fname * fname, *new_fn;
+	struct dir_private_info *info;
+	int len;
+
+	info = (struct dir_private_info *) dir_file->private_data;
+	p = &info->root.rb_node;
+
+	/* Create and allocate the fname structure */
+	len = sizeof(struct fname) + dirent->name_len + 1;
+	new_fn = kzalloc(len, GFP_KERNEL);
+	if (!new_fn)
+		return -ENOMEM;
+	new_fn->hash = hash;
+	new_fn->minor_hash = minor_hash;
+	new_fn->inode = le32_to_cpu(dirent->inode);
+	new_fn->name_len = dirent->name_len;
+	new_fn->file_type = dirent->file_type;
+	memcpy(new_fn->name, dirent->name, dirent->name_len);
+	new_fn->name[dirent->name_len] = 0;
+
+	while (*p) {
+		parent = *p;
+		fname = rb_entry(parent, struct fname, rb_hash);
+
+		/*
+		 * If the hash and minor hash match up, then we put
+		 * them on a linked list.  This rarely happens...
+		 */
+		if ((new_fn->hash == fname->hash) &&
+		    (new_fn->minor_hash == fname->minor_hash)) {
+			new_fn->next = fname->next;
+			fname->next = new_fn;
+			return 0;
+		}
+
+		if (new_fn->hash < fname->hash)
+			p = &(*p)->rb_left;
+		else if (new_fn->hash > fname->hash)
+			p = &(*p)->rb_right;
+		else if (new_fn->minor_hash < fname->minor_hash)
+			p = &(*p)->rb_left;
+		else /* if (new_fn->minor_hash > fname->minor_hash) */
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&new_fn->rb_hash, parent, p);
+	rb_insert_color(&new_fn->rb_hash, &info->root);
+	return 0;
+}
+
+
+
+/*
+ * This is a helper function for ext3_dx_readdir.  It calls filldir
+ * for all entres on the fname linked list.  (Normally there is only
+ * one entry on the linked list, unless there are 62 bit hash collisions.)
+ */
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
+{
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+
+	if (!fname) {
+		printk("call_filldir: called with null fname?!?\n");
+		return true;
+	}
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
+	while (fname) {
+		if (!dir_emit(ctx, fname->name, fname->name_len,
+				fname->inode,
+				get_dtype(sb, fname->file_type))) {
+			info->extra_fname = fname;
+			return false;
+		}
+		fname = fname->next;
+	}
+	return true;
+}
+
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct fname *fname;
+	int	ret;
+
+	if (!info) {
+		info = ext3_htree_create_dir_info(file, ctx->pos);
+		if (!info)
+			return -ENOMEM;
+		file->private_data = info;
+	}
+
+	if (ctx->pos == ext3_get_htree_eof(file))
+		return 0;	/* EOF */
+
+	/* Some one has messed with f_pos; reset the world */
+	if (info->last_pos != ctx->pos) {
+		free_rb_tree_fname(&info->root);
+		info->curr_node = NULL;
+		info->extra_fname = NULL;
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
+	}
+
+	/*
+	 * If there are any leftover names on the hash collision
+	 * chain, return them first.
+	 */
+	if (info->extra_fname) {
+		if (!call_filldir(file, ctx, info->extra_fname))
+			goto finished;
+		info->extra_fname = NULL;
+		goto next_node;
+	} else if (!info->curr_node)
+		info->curr_node = rb_first(&info->root);
+
+	while (1) {
+		/*
+		 * Fill the rbtree if we have no more entries,
+		 * or the inode has changed since we last read in the
+		 * cached entries.
+		 */
+		if ((!info->curr_node) ||
+		    (file->f_version != inode->i_version)) {
+			info->curr_node = NULL;
+			free_rb_tree_fname(&info->root);
+			file->f_version = inode->i_version;
+			ret = ext3_htree_fill_tree(file, info->curr_hash,
+						   info->curr_minor_hash,
+						   &info->next_hash);
+			if (ret < 0)
+				return ret;
+			if (ret == 0) {
+				ctx->pos = ext3_get_htree_eof(file);
+				break;
+			}
+			info->curr_node = rb_first(&info->root);
+		}
+
+		fname = rb_entry(info->curr_node, struct fname, rb_hash);
+		info->curr_hash = fname->hash;
+		info->curr_minor_hash = fname->minor_hash;
+		if (!call_filldir(file, ctx, fname))
+			break;
+	next_node:
+		info->curr_node = rb_next(info->curr_node);
+		if (info->curr_node) {
+			fname = rb_entry(info->curr_node, struct fname,
+					 rb_hash);
+			info->curr_hash = fname->hash;
+			info->curr_minor_hash = fname->minor_hash;
+		} else {
+			if (info->next_hash == ~0) {
+				ctx->pos = ext3_get_htree_eof(file);
+				break;
+			}
+			info->curr_hash = info->next_hash;
+			info->curr_minor_hash = 0;
+		}
+	}
+finished:
+	info->last_pos = ctx->pos;
+	return 0;
+}
+
+static int ext3_release_dir (struct inode * inode, struct file * filp)
+{
+       if (filp->private_data)
+		ext3_htree_free_dir_info(filp->private_data);
+
+	return 0;
+}
+
+const struct file_operations ext3_dir_operations = {
+	.llseek		= ext3_dir_llseek,
+	.read		= generic_read_dir,
+	.iterate	= ext3_readdir,
+	.unlocked_ioctl = ext3_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext3_compat_ioctl,
+#endif
+	.fsync		= ext3_sync_file,
+	.release	= ext3_release_dir,
+};
diff --git a/kernel/fs/ext3/ext3.h b/kernel/fs/ext3/ext3.h
new file mode 100644
index 000000000..f483a80b3
--- /dev/null
+++ b/kernel/fs/ext3/ext3.h
@@ -0,0 +1,1332 @@
+/*
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/include/linux/minix_fs.h
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/magic.h>
+#include <linux/bug.h>
+#include <linux/blockgroup_lock.h>
+
+/*
+ * The second extended filesystem constants/structures
+ */
+
+/*
+ * Define EXT3FS_DEBUG to produce debug messages
+ */
+#undef EXT3FS_DEBUG
+
+/*
+ * Define EXT3_RESERVATION to reserve data blocks for expanding files
+ */
+#define EXT3_DEFAULT_RESERVE_BLOCKS     8
+/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
+#define EXT3_MAX_RESERVE_BLOCKS         1027
+#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
+
+/*
+ * Debug code
+ */
+#ifdef EXT3FS_DEBUG
+#define ext3_debug(f, a...)						\
+	do {								\
+		printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:",	\
+			__FILE__, __LINE__, __func__);		\
+		printk (KERN_DEBUG f, ## a);				\
+	} while (0)
+#else
+#define ext3_debug(f, a...)	do {} while (0)
+#endif
+
+/*
+ * Special inodes numbers
+ */
+#define	EXT3_BAD_INO		 1	/* Bad blocks inode */
+#define EXT3_ROOT_INO		 2	/* Root inode */
+#define EXT3_BOOT_LOADER_INO	 5	/* Boot loader inode */
+#define EXT3_UNDEL_DIR_INO	 6	/* Undelete directory inode */
+#define EXT3_RESIZE_INO		 7	/* Reserved group descriptors inode */
+#define EXT3_JOURNAL_INO	 8	/* Journal inode */
+
+/* First non-reserved inode for old ext3 filesystems */
+#define EXT3_GOOD_OLD_FIRST_INO	11
+
+/*
+ * Maximal count of links to a file
+ */
+#define EXT3_LINK_MAX		32000
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT3_MIN_BLOCK_SIZE		1024
+#define	EXT3_MAX_BLOCK_SIZE		65536
+#define EXT3_MIN_BLOCK_LOG_SIZE		10
+#define EXT3_BLOCK_SIZE(s)		((s)->s_blocksize)
+#define	EXT3_ADDR_PER_BLOCK(s)		(EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+#define EXT3_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
+#define	EXT3_ADDR_PER_BLOCK_BITS(s)	(EXT3_SB(s)->s_addr_per_block_bits)
+#define EXT3_INODE_SIZE(s)		(EXT3_SB(s)->s_inode_size)
+#define EXT3_FIRST_INO(s)		(EXT3_SB(s)->s_first_ino)
+
+/*
+ * Macro-instructions used to manage fragments
+ */
+#define EXT3_MIN_FRAG_SIZE		1024
+#define	EXT3_MAX_FRAG_SIZE		4096
+#define EXT3_MIN_FRAG_LOG_SIZE		  10
+#define EXT3_FRAG_SIZE(s)		(EXT3_SB(s)->s_frag_size)
+#define EXT3_FRAGS_PER_BLOCK(s)		(EXT3_SB(s)->s_frags_per_block)
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext3_group_desc
+{
+	__le32	bg_block_bitmap;		/* Blocks bitmap block */
+	__le32	bg_inode_bitmap;		/* Inodes bitmap block */
+	__le32	bg_inode_table;		/* Inodes table block */
+	__le16	bg_free_blocks_count;	/* Free blocks count */
+	__le16	bg_free_inodes_count;	/* Free inodes count */
+	__le16	bg_used_dirs_count;	/* Directories count */
+	__u16	bg_pad;
+	__le32	bg_reserved[3];
+};
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT3_BLOCKS_PER_GROUP(s)	(EXT3_SB(s)->s_blocks_per_group)
+#define EXT3_DESC_PER_BLOCK(s)		(EXT3_SB(s)->s_desc_per_block)
+#define EXT3_INODES_PER_GROUP(s)	(EXT3_SB(s)->s_inodes_per_group)
+#define EXT3_DESC_PER_BLOCK_BITS(s)	(EXT3_SB(s)->s_desc_per_block_bits)
+
+/*
+ * Constants relative to the data blocks
+ */
+#define	EXT3_NDIR_BLOCKS		12
+#define	EXT3_IND_BLOCK			EXT3_NDIR_BLOCKS
+#define	EXT3_DIND_BLOCK			(EXT3_IND_BLOCK + 1)
+#define	EXT3_TIND_BLOCK			(EXT3_DIND_BLOCK + 1)
+#define	EXT3_N_BLOCKS			(EXT3_TIND_BLOCK + 1)
+
+/*
+ * Inode flags
+ */
+#define	EXT3_SECRM_FL			0x00000001 /* Secure deletion */
+#define	EXT3_UNRM_FL			0x00000002 /* Undelete */
+#define	EXT3_COMPR_FL			0x00000004 /* Compress file */
+#define EXT3_SYNC_FL			0x00000008 /* Synchronous updates */
+#define EXT3_IMMUTABLE_FL		0x00000010 /* Immutable file */
+#define EXT3_APPEND_FL			0x00000020 /* writes to file may only append */
+#define EXT3_NODUMP_FL			0x00000040 /* do not dump file */
+#define EXT3_NOATIME_FL			0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define EXT3_DIRTY_FL			0x00000100
+#define EXT3_COMPRBLK_FL		0x00000200 /* One or more compressed clusters */
+#define EXT3_NOCOMPR_FL			0x00000400 /* Don't compress */
+#define EXT3_ECOMPR_FL			0x00000800 /* Compression error */
+/* End compression flags --- maybe not all used */
+#define EXT3_INDEX_FL			0x00001000 /* hash-indexed directory */
+#define EXT3_IMAGIC_FL			0x00002000 /* AFS directory */
+#define EXT3_JOURNAL_DATA_FL		0x00004000 /* file data should be journaled */
+#define EXT3_NOTAIL_FL			0x00008000 /* file tail should not be merged */
+#define EXT3_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
+#define EXT3_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
+#define EXT3_RESERVED_FL		0x80000000 /* reserved for ext3 lib */
+
+#define EXT3_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
+#define EXT3_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
+			   EXT3_SYNC_FL | EXT3_NODUMP_FL |\
+			   EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
+			   EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
+			   EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & EXT3_REG_FLMASK;
+	else
+		return flags & EXT3_OTHER_FLMASK;
+}
+
+/* Used to pass group descriptor data when online resize is done */
+struct ext3_new_group_input {
+	__u32 group;            /* Group number for this data */
+	__u32 block_bitmap;     /* Absolute block number of block bitmap */
+	__u32 inode_bitmap;     /* Absolute block number of inode bitmap */
+	__u32 inode_table;      /* Absolute block number of inode table start */
+	__u32 blocks_count;     /* Total number of blocks in this group */
+	__u16 reserved_blocks;  /* Number of reserved blocks in this group */
+	__u16 unused;
+};
+
+/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
+struct ext3_new_group_data {
+	__u32 group;
+	__u32 block_bitmap;
+	__u32 inode_bitmap;
+	__u32 inode_table;
+	__u32 blocks_count;
+	__u16 reserved_blocks;
+	__u16 unused;
+	__u32 free_blocks_count;
+};
+
+
+/*
+ * ioctl commands
+ */
+#define	EXT3_IOC_GETFLAGS		FS_IOC_GETFLAGS
+#define	EXT3_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define	EXT3_IOC_GETVERSION		_IOR('f', 3, long)
+#define	EXT3_IOC_SETVERSION		_IOW('f', 4, long)
+#define EXT3_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
+#define EXT3_IOC_GROUP_ADD		_IOW('f', 8,struct ext3_new_group_input)
+#define	EXT3_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
+#define	EXT3_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
+#ifdef CONFIG_JBD_DEBUG
+#define EXT3_IOC_WAIT_FOR_READONLY	_IOR('f', 99, long)
+#endif
+#define EXT3_IOC_GETRSVSZ		_IOR('f', 5, long)
+#define EXT3_IOC_SETRSVSZ		_IOW('f', 6, long)
+
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT3_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
+#define EXT3_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
+#define EXT3_IOC32_GETVERSION		_IOR('f', 3, int)
+#define EXT3_IOC32_SETVERSION		_IOW('f', 4, int)
+#define EXT3_IOC32_GETRSVSZ		_IOR('f', 5, int)
+#define EXT3_IOC32_SETRSVSZ		_IOW('f', 6, int)
+#define EXT3_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
+#ifdef CONFIG_JBD_DEBUG
+#define EXT3_IOC32_WAIT_FOR_READONLY	_IOR('f', 99, int)
+#endif
+#define EXT3_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
+#define EXT3_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
+
+/* Number of supported quota types */
+#define EXT3_MAXQUOTAS 2
+
+/*
+ *  Mount options
+ */
+struct ext3_mount_options {
+	unsigned long s_mount_opt;
+	kuid_t s_resuid;
+	kgid_t s_resgid;
+	unsigned long s_commit_interval;
+#ifdef CONFIG_QUOTA
+	int s_jquota_fmt;
+	char *s_qf_names[EXT3_MAXQUOTAS];
+#endif
+};
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext3_inode {
+	__le16	i_mode;		/* File mode */
+	__le16	i_uid;		/* Low 16 bits of Owner Uid */
+	__le32	i_size;		/* Size in bytes */
+	__le32	i_atime;	/* Access time */
+	__le32	i_ctime;	/* Creation time */
+	__le32	i_mtime;	/* Modification time */
+	__le32	i_dtime;	/* Deletion Time */
+	__le16	i_gid;		/* Low 16 bits of Group Id */
+	__le16	i_links_count;	/* Links count */
+	__le32	i_blocks;	/* Blocks count */
+	__le32	i_flags;	/* File flags */
+	union {
+		struct {
+			__u32  l_i_reserved1;
+		} linux1;
+		struct {
+			__u32  h_i_translator;
+		} hurd1;
+		struct {
+			__u32  m_i_reserved1;
+		} masix1;
+	} osd1;				/* OS dependent 1 */
+	__le32	i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
+	__le32	i_generation;	/* File version (for NFS) */
+	__le32	i_file_acl;	/* File ACL */
+	__le32	i_dir_acl;	/* Directory ACL */
+	__le32	i_faddr;	/* Fragment address */
+	union {
+		struct {
+			__u8	l_i_frag;	/* Fragment number */
+			__u8	l_i_fsize;	/* Fragment size */
+			__u16	i_pad1;
+			__le16	l_i_uid_high;	/* these 2 fields    */
+			__le16	l_i_gid_high;	/* were reserved2[0] */
+			__u32	l_i_reserved2;
+		} linux2;
+		struct {
+			__u8	h_i_frag;	/* Fragment number */
+			__u8	h_i_fsize;	/* Fragment size */
+			__u16	h_i_mode_high;
+			__u16	h_i_uid_high;
+			__u16	h_i_gid_high;
+			__u32	h_i_author;
+		} hurd2;
+		struct {
+			__u8	m_i_frag;	/* Fragment number */
+			__u8	m_i_fsize;	/* Fragment size */
+			__u16	m_pad1;
+			__u32	m_i_reserved2[2];
+		} masix2;
+	} osd2;				/* OS dependent 2 */
+	__le16	i_extra_isize;
+	__le16	i_pad1;
+};
+
+#define i_size_high	i_dir_acl
+
+#define i_reserved1	osd1.linux1.l_i_reserved1
+#define i_frag		osd2.linux2.l_i_frag
+#define i_fsize		osd2.linux2.l_i_fsize
+#define i_uid_low	i_uid
+#define i_gid_low	i_gid
+#define i_uid_high	osd2.linux2.l_i_uid_high
+#define i_gid_high	osd2.linux2.l_i_gid_high
+#define i_reserved2	osd2.linux2.l_i_reserved2
+
+/*
+ * File system states
+ */
+#define	EXT3_VALID_FS			0x0001	/* Unmounted cleanly */
+#define	EXT3_ERROR_FS			0x0002	/* Errors detected */
+#define	EXT3_ORPHAN_FS			0x0004	/* Orphans being recovered */
+
+/*
+ * Misc. filesystem flags
+ */
+#define EXT2_FLAGS_SIGNED_HASH		0x0001  /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH	0x0002  /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
+
+/*
+ * Mount flags
+ */
+#define EXT3_MOUNT_CHECK		0x00001	/* Do mount-time checks */
+/* EXT3_MOUNT_OLDALLOC was there */
+#define EXT3_MOUNT_GRPID		0x00004	/* Create files with directory's group */
+#define EXT3_MOUNT_DEBUG		0x00008	/* Some debugging messages */
+#define EXT3_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
+#define EXT3_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
+#define EXT3_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
+#define EXT3_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
+#define EXT3_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
+#define EXT3_MOUNT_ABORT		0x00200	/* Fatal error detected */
+#define EXT3_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
+#define EXT3_MOUNT_JOURNAL_DATA		0x00400	/* Write data to journal */
+#define EXT3_MOUNT_ORDERED_DATA		0x00800	/* Flush data before commit */
+#define EXT3_MOUNT_WRITEBACK_DATA	0x00C00	/* No data ordering */
+#define EXT3_MOUNT_UPDATE_JOURNAL	0x01000	/* Update the journal format */
+#define EXT3_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
+#define EXT3_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
+#define EXT3_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
+#define EXT3_MOUNT_RESERVATION		0x10000	/* Preallocation */
+#define EXT3_MOUNT_BARRIER		0x20000 /* Use block barriers */
+#define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
+#define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
+#define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT	0x400000 /* Abort on file data write
+						  * error in ordered mode */
+
+/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+#ifndef _LINUX_EXT2_FS_H
+#define clear_opt(o, opt)		o &= ~EXT3_MOUNT_##opt
+#define set_opt(o, opt)			o |= EXT3_MOUNT_##opt
+#define test_opt(sb, opt)		(EXT3_SB(sb)->s_mount_opt & \
+					 EXT3_MOUNT_##opt)
+#else
+#define EXT2_MOUNT_NOLOAD		EXT3_MOUNT_NOLOAD
+#define EXT2_MOUNT_ABORT		EXT3_MOUNT_ABORT
+#define EXT2_MOUNT_DATA_FLAGS		EXT3_MOUNT_DATA_FLAGS
+#endif
+
+#define ext3_set_bit			__set_bit_le
+#define ext3_set_bit_atomic		ext2_set_bit_atomic
+#define ext3_clear_bit			__clear_bit_le
+#define ext3_clear_bit_atomic		ext2_clear_bit_atomic
+#define ext3_test_bit			test_bit_le
+#define ext3_find_next_zero_bit		find_next_zero_bit_le
+
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT3_DFL_MAX_MNT_COUNT		20	/* Allow 20 mounts */
+#define EXT3_DFL_CHECKINTERVAL		0	/* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ */
+#define EXT3_ERRORS_CONTINUE		1	/* Continue execution */
+#define EXT3_ERRORS_RO			2	/* Remount fs read-only */
+#define EXT3_ERRORS_PANIC		3	/* Panic */
+#define EXT3_ERRORS_DEFAULT		EXT3_ERRORS_CONTINUE
+
+/*
+ * Structure of the super block
+ */
+struct ext3_super_block {
+/*00*/	__le32	s_inodes_count;		/* Inodes count */
+	__le32	s_blocks_count;		/* Blocks count */
+	__le32	s_r_blocks_count;	/* Reserved blocks count */
+	__le32	s_free_blocks_count;	/* Free blocks count */
+/*10*/	__le32	s_free_inodes_count;	/* Free inodes count */
+	__le32	s_first_data_block;	/* First Data Block */
+	__le32	s_log_block_size;	/* Block size */
+	__le32	s_log_frag_size;	/* Fragment size */
+/*20*/	__le32	s_blocks_per_group;	/* # Blocks per group */
+	__le32	s_frags_per_group;	/* # Fragments per group */
+	__le32	s_inodes_per_group;	/* # Inodes per group */
+	__le32	s_mtime;		/* Mount time */
+/*30*/	__le32	s_wtime;		/* Write time */
+	__le16	s_mnt_count;		/* Mount count */
+	__le16	s_max_mnt_count;	/* Maximal mount count */
+	__le16	s_magic;		/* Magic signature */
+	__le16	s_state;		/* File system state */
+	__le16	s_errors;		/* Behaviour when detecting errors */
+	__le16	s_minor_rev_level;	/* minor revision level */
+/*40*/	__le32	s_lastcheck;		/* time of last check */
+	__le32	s_checkinterval;	/* max. time between checks */
+	__le32	s_creator_os;		/* OS */
+	__le32	s_rev_level;		/* Revision level */
+/*50*/	__le16	s_def_resuid;		/* Default uid for reserved blocks */
+	__le16	s_def_resgid;		/* Default gid for reserved blocks */
+	/*
+	 * These fields are for EXT3_DYNAMIC_REV superblocks only.
+	 *
+	 * Note: the difference between the compatible feature set and
+	 * the incompatible feature set is that if there is a bit set
+	 * in the incompatible feature set that the kernel doesn't
+	 * know about, it should refuse to mount the filesystem.
+	 *
+	 * e2fsck's requirements are more strict; if it doesn't know
+	 * about a feature in either the compatible or incompatible
+	 * feature set, it must abort and not try to meddle with
+	 * things it doesn't understand...
+	 */
+	__le32	s_first_ino;		/* First non-reserved inode */
+	__le16   s_inode_size;		/* size of inode structure */
+	__le16	s_block_group_nr;	/* block group # of this superblock */
+	__le32	s_feature_compat;	/* compatible feature set */
+/*60*/	__le32	s_feature_incompat;	/* incompatible feature set */
+	__le32	s_feature_ro_compat;	/* readonly-compatible feature set */
+/*68*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
+/*78*/	char	s_volume_name[16];	/* volume name */
+/*88*/	char	s_last_mounted[64];	/* directory where last mounted */
+/*C8*/	__le32	s_algorithm_usage_bitmap; /* For compression */
+	/*
+	 * Performance hints.  Directory preallocation should only
+	 * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+	 */
+	__u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
+	__u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
+	__le16	s_reserved_gdt_blocks;	/* Per group desc for online growth */
+	/*
+	 * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+	 */
+/*D0*/	__u8	s_journal_uuid[16];	/* uuid of journal superblock */
+/*E0*/	__le32	s_journal_inum;		/* inode number of journal file */
+	__le32	s_journal_dev;		/* device number of journal file */
+	__le32	s_last_orphan;		/* start of list of inodes to delete */
+	__le32	s_hash_seed[4];		/* HTREE hash seed */
+	__u8	s_def_hash_version;	/* Default hash version to use */
+	__u8	s_reserved_char_pad;
+	__u16	s_reserved_word_pad;
+	__le32	s_default_mount_opts;
+	__le32	s_first_meta_bg;	/* First metablock block group */
+	__le32	s_mkfs_time;		/* When the filesystem was created */
+	__le32	s_jnl_blocks[17];	/* Backup of the journal inode */
+	/* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
+/*150*/	__le32	s_blocks_count_hi;	/* Blocks count */
+	__le32	s_r_blocks_count_hi;	/* Reserved blocks count */
+	__le32	s_free_blocks_count_hi;	/* Free blocks count */
+	__le16	s_min_extra_isize;	/* All inodes have at least # bytes */
+	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
+	__le32	s_flags;		/* Miscellaneous flags */
+	__le16  s_raid_stride;		/* RAID stride */
+	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le64  s_mmp_block;            /* Block for multi-mount protection */
+	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
+	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
+	__u8	s_reserved_char_pad2;
+	__le16  s_reserved_pad;
+	__u32   s_reserved[162];        /* Padding to the end of the block */
+};
+
+/* data type for block offset of block group */
+typedef int ext3_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long ext3_fsblk_t;
+
+#define E3FSBLK "%lu"
+
+struct ext3_reserve_window {
+	ext3_fsblk_t	_rsv_start;	/* First byte reserved */
+	ext3_fsblk_t	_rsv_end;	/* Last byte reserved or 0 */
+};
+
+struct ext3_reserve_window_node {
+	struct rb_node		rsv_node;
+	__u32			rsv_goal_size;
+	__u32			rsv_alloc_hit;
+	struct ext3_reserve_window	rsv_window;
+};
+
+struct ext3_block_alloc_info {
+	/* information about reservation window */
+	struct ext3_reserve_window_node	rsv_window_node;
+	/*
+	 * was i_next_alloc_block in ext3_inode_info
+	 * is the logical (file-relative) number of the
+	 * most-recently-allocated block in this file.
+	 * We use this for detecting linearly ascending allocation requests.
+	 */
+	__u32                   last_alloc_logical_block;
+	/*
+	 * Was i_next_alloc_goal in ext3_inode_info
+	 * is the *physical* companion to i_next_alloc_block.
+	 * it the physical block number of the block which was most-recentl
+	 * allocated to this file.  This give us the goal (target) for the next
+	 * allocation when we detect linearly ascending requests.
+	 */
+	ext3_fsblk_t		last_alloc_physical_block;
+};
+
+#define rsv_start rsv_window._rsv_start
+#define rsv_end rsv_window._rsv_end
+
+/*
+ * third extended file system inode data in memory
+ */
+struct ext3_inode_info {
+	__le32	i_data[15];	/* unconverted */
+	__u32	i_flags;
+#ifdef EXT3_FRAGMENTS
+	__u32	i_faddr;
+	__u8	i_frag_no;
+	__u8	i_frag_size;
+#endif
+	ext3_fsblk_t	i_file_acl;
+	__u32	i_dir_acl;
+	__u32	i_dtime;
+
+	/*
+	 * i_block_group is the number of the block group which contains
+	 * this file's inode.  Constant across the lifetime of the inode,
+	 * it is ued for making block allocation decisions - we try to
+	 * place a file's data blocks near its inode block, and new inodes
+	 * near to their parent directory's inode.
+	 */
+	__u32	i_block_group;
+	unsigned long	i_state_flags;	/* Dynamic state flags for ext3 */
+
+	/* block reservation info */
+	struct ext3_block_alloc_info *i_block_alloc_info;
+
+	__u32	i_dir_start_lookup;
+#ifdef CONFIG_EXT3_FS_XATTR
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_mutex even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
+
+	struct list_head i_orphan;	/* unlinked but open inodes */
+
+	/*
+	 * i_disksize keeps track of what the inode size is ON DISK, not
+	 * in memory.  During truncate, i_size is set to the new size by
+	 * the VFS prior to calling ext3_truncate(), but the filesystem won't
+	 * set i_disksize to 0 until the truncate is actually under way.
+	 *
+	 * The intent is that i_disksize always represents the blocks which
+	 * are used by this file.  This allows recovery to restart truncate
+	 * on orphans if we crash during truncate.  We actually write i_disksize
+	 * into the on-disk inode when writing inodes out, instead of i_size.
+	 *
+	 * The only time when i_disksize and i_size may be different is when
+	 * a truncate is in progress.  The only things which change i_disksize
+	 * are ext3_get_block (growth) and ext3_truncate (shrinkth).
+	 */
+	loff_t	i_disksize;
+
+	/* on-disk additional length */
+	__u16 i_extra_isize;
+
+	/*
+	 * truncate_mutex is for serialising ext3_truncate() against
+	 * ext3_getblock().  In the 2.4 ext2 design, great chunks of inode's
+	 * data tree are chopped off during truncate. We can't do that in
+	 * ext3 because whenever we perform intermediate commits during
+	 * truncate, the inode and all the metadata blocks *must* be in a
+	 * consistent state which allows truncation of the orphans to restart
+	 * during recovery.  Hence we must fix the get_block-vs-truncate race
+	 * by other means, so we have truncate_mutex.
+	 */
+	struct mutex truncate_mutex;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	atomic_t i_sync_tid;
+	atomic_t i_datasync_tid;
+
+#ifdef CONFIG_QUOTA
+	struct dquot *i_dquot[MAXQUOTAS];
+#endif
+
+	struct inode vfs_inode;
+};
+
+/*
+ * third extended-fs super-block data in memory
+ */
+struct ext3_sb_info {
+	unsigned long s_frag_size;	/* Size of a fragment in bytes */
+	unsigned long s_frags_per_block;/* Number of fragments per block */
+	unsigned long s_inodes_per_block;/* Number of inodes per block */
+	unsigned long s_frags_per_group;/* Number of fragments in a group */
+	unsigned long s_blocks_per_group;/* Number of blocks in a group */
+	unsigned long s_inodes_per_group;/* Number of inodes in a group */
+	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
+	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
+	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
+	unsigned long s_groups_count;	/* Number of groups in the fs */
+	unsigned long s_overhead_last;  /* Last calculated overhead */
+	unsigned long s_blocks_last;    /* Last seen block count */
+	struct buffer_head * s_sbh;	/* Buffer containing the super block */
+	struct ext3_super_block * s_es;	/* Pointer to the super block in the buffer */
+	struct buffer_head ** s_group_desc;
+	unsigned long  s_mount_opt;
+	ext3_fsblk_t s_sb_block;
+	kuid_t s_resuid;
+	kgid_t s_resgid;
+	unsigned short s_mount_state;
+	unsigned short s_pad;
+	int s_addr_per_block_bits;
+	int s_desc_per_block_bits;
+	int s_inode_size;
+	int s_first_ino;
+	spinlock_t s_next_gen_lock;
+	u32 s_next_generation;
+	u32 s_hash_seed[4];
+	int s_def_hash_version;
+	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
+	struct percpu_counter s_freeblocks_counter;
+	struct percpu_counter s_freeinodes_counter;
+	struct percpu_counter s_dirs_counter;
+	struct blockgroup_lock *s_blockgroup_lock;
+
+	/* root of the per fs reservation window tree */
+	spinlock_t s_rsv_window_lock;
+	struct rb_root s_rsv_window_root;
+	struct ext3_reserve_window_node s_rsv_window_head;
+
+	/* Journaling */
+	struct inode * s_journal_inode;
+	struct journal_s * s_journal;
+	struct list_head s_orphan;
+	struct mutex s_orphan_lock;
+	struct mutex s_resize_lock;
+	unsigned long s_commit_interval;
+	struct block_device *journal_bdev;
+#ifdef CONFIG_QUOTA
+	char *s_qf_names[EXT3_MAXQUOTAS];	/* Names of quota files with journalled quota */
+	int s_jquota_fmt;			/* Format of quota to use */
+#endif
+};
+
+static inline spinlock_t *
+sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
+{
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
+}
+
+static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
+{
+	return container_of(inode, struct ext3_inode_info, vfs_inode);
+}
+
+static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
+{
+	return ino == EXT3_ROOT_INO ||
+		ino == EXT3_JOURNAL_INO ||
+		ino == EXT3_RESIZE_INO ||
+		(ino >= EXT3_FIRST_INO(sb) &&
+		 ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
+}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+	EXT3_STATE_JDATA,		/* journaled data exists */
+	EXT3_STATE_NEW,			/* inode is newly created */
+	EXT3_STATE_XATTR,		/* has in-inode xattrs */
+	EXT3_STATE_FLUSH_ON_CLOSE,	/* flush dirty pages on close */
+};
+
+static inline int ext3_test_inode_state(struct inode *inode, int bit)
+{
+	return test_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+static inline void ext3_set_inode_state(struct inode *inode, int bit)
+{
+	set_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+static inline void ext3_clear_inode_state(struct inode *inode, int bit)
+{
+	clear_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
+
+/*
+ * Codes for operating systems
+ */
+#define EXT3_OS_LINUX		0
+#define EXT3_OS_HURD		1
+#define EXT3_OS_MASIX		2
+#define EXT3_OS_FREEBSD		3
+#define EXT3_OS_LITES		4
+
+/*
+ * Revision levels
+ */
+#define EXT3_GOOD_OLD_REV	0	/* The good old (original) format */
+#define EXT3_DYNAMIC_REV	1	/* V2 format w/ dynamic inode sizes */
+
+#define EXT3_CURRENT_REV	EXT3_GOOD_OLD_REV
+#define EXT3_MAX_SUPP_REV	EXT3_DYNAMIC_REV
+
+#define EXT3_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ */
+
+#define EXT3_HAS_COMPAT_FEATURE(sb,mask)			\
+	( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+	( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask)			\
+	( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+#define EXT3_SET_COMPAT_FEATURE(sb,mask)			\
+	EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask)			\
+	EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+#define EXT3_SET_INCOMPAT_FEATURE(sb,mask)			\
+	EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask)			\
+	EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+	EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+	EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+#define EXT3_FEATURE_COMPAT_DIR_PREALLOC	0x0001
+#define EXT3_FEATURE_COMPAT_IMAGIC_INODES	0x0002
+#define EXT3_FEATURE_COMPAT_HAS_JOURNAL		0x0004
+#define EXT3_FEATURE_COMPAT_EXT_ATTR		0x0008
+#define EXT3_FEATURE_COMPAT_RESIZE_INODE	0x0010
+#define EXT3_FEATURE_COMPAT_DIR_INDEX		0x0020
+
+#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
+#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
+#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
+
+#define EXT3_FEATURE_INCOMPAT_COMPRESSION	0x0001
+#define EXT3_FEATURE_INCOMPAT_FILETYPE		0x0002
+#define EXT3_FEATURE_INCOMPAT_RECOVER		0x0004 /* Needs recovery */
+#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008 /* Journal device */
+#define EXT3_FEATURE_INCOMPAT_META_BG		0x0010
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT3_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT3_FEATURE_INCOMPAT_RECOVER| \
+					 EXT3_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define	EXT3_DEF_RESUID		0
+#define	EXT3_DEF_RESGID		0
+
+/*
+ * Default mount options
+ */
+#define EXT3_DEFM_DEBUG		0x0001
+#define EXT3_DEFM_BSDGROUPS	0x0002
+#define EXT3_DEFM_XATTR_USER	0x0004
+#define EXT3_DEFM_ACL		0x0008
+#define EXT3_DEFM_UID16		0x0010
+#define EXT3_DEFM_JMODE		0x0060
+#define EXT3_DEFM_JMODE_DATA	0x0020
+#define EXT3_DEFM_JMODE_ORDERED	0x0040
+#define EXT3_DEFM_JMODE_WBACK	0x0060
+
+/*
+ * Structure of a directory entry
+ */
+#define EXT3_NAME_LEN 255
+
+struct ext3_dir_entry {
+	__le32	inode;			/* Inode number */
+	__le16	rec_len;		/* Directory entry length */
+	__le16	name_len;		/* Name length */
+	char	name[EXT3_NAME_LEN];	/* File name */
+};
+
+/*
+ * The new version of the directory entry.  Since EXT3 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct ext3_dir_entry_2 {
+	__le32	inode;			/* Inode number */
+	__le16	rec_len;		/* Directory entry length */
+	__u8	name_len;		/* Name length */
+	__u8	file_type;
+	char	name[EXT3_NAME_LEN];	/* File name */
+};
+
+/*
+ * Ext3 directory file types.  Only the low 3 bits are used.  The
+ * other bits are reserved for now.
+ */
+#define EXT3_FT_UNKNOWN		0
+#define EXT3_FT_REG_FILE	1
+#define EXT3_FT_DIR		2
+#define EXT3_FT_CHRDEV		3
+#define EXT3_FT_BLKDEV		4
+#define EXT3_FT_FIFO		5
+#define EXT3_FT_SOCK		6
+#define EXT3_FT_SYMLINK		7
+
+#define EXT3_FT_MAX		8
+
+/*
+ * EXT3_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define EXT3_DIR_PAD			4
+#define EXT3_DIR_ROUND			(EXT3_DIR_PAD - 1)
+#define EXT3_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT3_DIR_ROUND) & \
+					 ~EXT3_DIR_ROUND)
+#define EXT3_MAX_REC_LEN		((1<<16)-1)
+
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
+static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
+{
+	unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len == EXT3_MAX_REC_LEN)
+		return 1 << 16;
+#endif
+	return len;
+}
+
+static inline __le16 ext3_rec_len_to_disk(unsigned len)
+{
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len == (1 << 16))
+		return cpu_to_le16(EXT3_MAX_REC_LEN);
+	else if (len > (1 << 16))
+		BUG();
+#endif
+	return cpu_to_le16(len);
+}
+
+/*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+ */
+
+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
+				      EXT3_FEATURE_COMPAT_DIR_INDEX) && \
+		      (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
+#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+/* Legal values for the dx_root hash_version field: */
+
+#define DX_HASH_LEGACY		0
+#define DX_HASH_HALF_MD4	1
+#define DX_HASH_TEA		2
+#define DX_HASH_LEGACY_UNSIGNED	3
+#define DX_HASH_HALF_MD4_UNSIGNED	4
+#define DX_HASH_TEA_UNSIGNED		5
+
+/* hash info structure used by the directory hash */
+struct dx_hash_info
+{
+	u32		hash;
+	u32		minor_hash;
+	int		hash_version;
+	u32		*seed;
+};
+
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT3_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
+#define EXT3_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)
+
+
+/*
+ * Control parameters used by ext3_htree_next_block
+ */
+#define HASH_NB_ALWAYS		1
+
+
+/*
+ * Describe an inode's exact location on disk and in memory
+ */
+struct ext3_iloc
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	unsigned long block_group;
+};
+
+static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
+{
+	return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
+}
+
+/*
+ * This structure is stuffed into the struct file's private_data field
+ * for directories.  It is where we put information so that we can do
+ * readdir operations in hash tree order.
+ */
+struct dir_private_info {
+	struct rb_root	root;
+	struct rb_node	*curr_node;
+	struct fname	*extra_fname;
+	loff_t		last_pos;
+	__u32		curr_hash;
+	__u32		curr_minor_hash;
+	__u32		next_hash;
+};
+
+/* calculate the first block number of the group */
+static inline ext3_fsblk_t
+ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
+{
+	return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
+		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
+}
+
+/*
+ * Special error return code only used by dx_probe() and its callers.
+ */
+#define ERR_BAD_DX_DIR	-75000
+
+/*
+ * Function prototypes
+ */
+
+/*
+ * Ok, these declarations are also in <linux/kernel.h> but none of the
+ * ext3 source programs needs to include it so they are duplicated here.
+ */
+# define NORET_TYPE    /**/
+# define ATTRIB_NORET  __attribute__((noreturn))
+# define NORET_AND     noreturn,
+
+/* balloc.c */
+extern int ext3_bg_has_super(struct super_block *sb, int group);
+extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int *errp);
+extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, unsigned long *count, int *errp);
+extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
+			ext3_fsblk_t block, unsigned long count);
+extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
+				 ext3_fsblk_t block, unsigned long count,
+				unsigned long *pdquot_freed_blocks);
+extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
+extern void ext3_check_blocks_bitmap (struct super_block *);
+extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+						    unsigned int block_group,
+						    struct buffer_head ** bh);
+extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
+extern void ext3_init_block_alloc_info(struct inode *);
+extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
+extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
+
+/* dir.c */
+extern int ext3_check_dir_entry(const char *, struct inode *,
+				struct ext3_dir_entry_2 *,
+				struct buffer_head *, unsigned long);
+extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+				    __u32 minor_hash,
+				    struct ext3_dir_entry_2 *dirent);
+extern void ext3_htree_free_dir_info(struct dir_private_info *p);
+
+/* fsync.c */
+extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
+
+/* hash.c */
+extern int ext3fs_dirhash(const char *name, int len, struct
+			  dx_hash_info *hinfo);
+
+/* ialloc.c */
+extern struct inode * ext3_new_inode (handle_t *, struct inode *,
+				      const struct qstr *, umode_t);
+extern void ext3_free_inode (handle_t *, struct inode *);
+extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+extern unsigned long ext3_count_free_inodes (struct super_block *);
+extern unsigned long ext3_count_dirs (struct super_block *);
+extern void ext3_check_inodes_bitmap (struct super_block *);
+extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+
+/* inode.c */
+int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
+		struct buffer_head *bh, ext3_fsblk_t blocknr);
+struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
+	sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
+	int create);
+
+extern struct inode *ext3_iget(struct super_block *, unsigned long);
+extern int  ext3_write_inode (struct inode *, struct writeback_control *);
+extern int  ext3_setattr (struct dentry *, struct iattr *);
+extern void ext3_evict_inode (struct inode *);
+extern int  ext3_sync_inode (handle_t *, struct inode *);
+extern void ext3_discard_reservation (struct inode *);
+extern void ext3_dirty_inode(struct inode *, int);
+extern int ext3_change_inode_journal_flag(struct inode *, int);
+extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
+extern int ext3_can_truncate(struct inode *inode);
+extern void ext3_truncate(struct inode *inode);
+extern void ext3_set_inode_flags(struct inode *);
+extern void ext3_get_inode_flags(struct ext3_inode_info *);
+extern void ext3_set_aops(struct inode *inode);
+extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len);
+
+/* ioctl.c */
+extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
+extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* namei.c */
+extern int ext3_orphan_add(handle_t *, struct inode *);
+extern int ext3_orphan_del(handle_t *, struct inode *);
+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+				__u32 start_minor_hash, __u32 *next_hash);
+
+/* resize.c */
+extern int ext3_group_add(struct super_block *sb,
+				struct ext3_new_group_data *input);
+extern int ext3_group_extend(struct super_block *sb,
+				struct ext3_super_block *es,
+				ext3_fsblk_t n_blocks_count);
+
+/* super.c */
+extern __printf(3, 4)
+void ext3_error(struct super_block *, const char *, const char *, ...);
+extern void __ext3_std_error (struct super_block *, const char *, int);
+extern __printf(3, 4)
+void ext3_abort(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext3_warning(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext3_msg(struct super_block *, const char *, const char *, ...);
+extern void ext3_update_dynamic_rev (struct super_block *sb);
+
+#define ext3_std_error(sb, errno)				\
+do {								\
+	if ((errno))						\
+		__ext3_std_error((sb), __func__, (errno));	\
+} while (0)
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations ext3_dir_operations;
+
+/* file.c */
+extern const struct inode_operations ext3_file_inode_operations;
+extern const struct file_operations ext3_file_operations;
+
+/* namei.c */
+extern const struct inode_operations ext3_dir_inode_operations;
+extern const struct inode_operations ext3_special_inode_operations;
+
+/* symlink.c */
+extern const struct inode_operations ext3_symlink_inode_operations;
+extern const struct inode_operations ext3_fast_symlink_inode_operations;
+
+#define EXT3_JOURNAL(inode)	(EXT3_SB((inode)->i_sb)->s_journal)
+
+/* Define the number of blocks we need to account to a transaction to
+ * modify one block of data.
+ *
+ * We may have to touch one inode, one bitmap buffer, up to three
+ * indirection blocks, the group and superblock summaries, and the data
+ * block to complete the transaction.  */
+
+#define EXT3_SINGLEDATA_TRANS_BLOCKS	8U
+
+/* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+
+#define EXT3_XATTR_TRANS_BLOCKS		6U
+
+/* Define the minimum size for a transaction which modifies data.  This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota).  The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+#define EXT3_DATA_TRANS_BLOCKS(sb)	(EXT3_SINGLEDATA_TRANS_BLOCKS + \
+					 EXT3_XATTR_TRANS_BLOCKS - 2 + \
+					 EXT3_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/* Delete operations potentially hit one directory's namespace plus an
+ * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
+ * generous.  We can grow the delete transaction later if necessary. */
+
+#define EXT3_DELETE_TRANS_BLOCKS(sb)   (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64)
+
+/* Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction.  For unbounded transactions such as
+ * write(2) and truncate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go. */
+
+#define EXT3_MAX_TRANS_DATA		64U
+
+/* We break up a large truncate or write transaction once the handle's
+ * buffer credits gets this low, we need either to extend the
+ * transaction or to start a new one.  Reserve enough space here for
+ * inode, bitmap, superblock, group and indirection updates for at least
+ * one block, plus two quota updates.  Quota allocations are not
+ * needed. */
+
+#define EXT3_RESERVE_TRANS_BLOCKS	12U
+
+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS	8
+
+#ifdef CONFIG_QUOTA
+/* Amount of blocks needed for quota update - we know that the structure was
+ * allocated so we need to update only inode+data */
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+/* Amount of blocks needed for quota insert/delete - we do some block writes
+ * but inode, sb and group updates are done only once */
+#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
+		(EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
+#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
+		(EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
+#else
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
+#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
+#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
+#endif
+#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
+
+int
+ext3_mark_iloc_dirty(handle_t *handle,
+		     struct inode *inode,
+		     struct ext3_iloc *iloc);
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh.  This _must_ be cleaned up later.
+ */
+
+int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+			struct ext3_iloc *iloc);
+
+int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+/*
+ * Wrapper functions with which ext3 calls into JBD.  The intent here is
+ * to allow these to be turned into appropriate stubs so ext3 can control
+ * ext2 filesystems, so ext2+ext3 systems only nee one fs.  This work hasn't
+ * been done yet.
+ */
+
+static inline void ext3_journal_release_buffer(handle_t *handle,
+						struct buffer_head *bh)
+{
+	journal_release_buffer(handle, bh);
+}
+
+void ext3_journal_abort_handle(const char *caller, const char *err_fn,
+		struct buffer_head *bh, handle_t *handle, int err);
+
+int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+				struct buffer_head *bh);
+
+int __ext3_journal_get_write_access(const char *where, handle_t *handle,
+				struct buffer_head *bh);
+
+int __ext3_journal_forget(const char *where, handle_t *handle,
+				struct buffer_head *bh);
+
+int __ext3_journal_revoke(const char *where, handle_t *handle,
+				unsigned long blocknr, struct buffer_head *bh);
+
+int __ext3_journal_get_create_access(const char *where,
+				handle_t *handle, struct buffer_head *bh);
+
+int __ext3_journal_dirty_metadata(const char *where,
+				handle_t *handle, struct buffer_head *bh);
+
+#define ext3_journal_get_undo_access(handle, bh) \
+	__ext3_journal_get_undo_access(__func__, (handle), (bh))
+#define ext3_journal_get_write_access(handle, bh) \
+	__ext3_journal_get_write_access(__func__, (handle), (bh))
+#define ext3_journal_revoke(handle, blocknr, bh) \
+	__ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
+#define ext3_journal_get_create_access(handle, bh) \
+	__ext3_journal_get_create_access(__func__, (handle), (bh))
+#define ext3_journal_dirty_metadata(handle, bh) \
+	__ext3_journal_dirty_metadata(__func__, (handle), (bh))
+#define ext3_journal_forget(handle, bh) \
+	__ext3_journal_forget(__func__, (handle), (bh))
+
+int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+
+handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks);
+int __ext3_journal_stop(const char *where, handle_t *handle);
+
+static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
+{
+	return ext3_journal_start_sb(inode->i_sb, nblocks);
+}
+
+#define ext3_journal_stop(handle) \
+	__ext3_journal_stop(__func__, (handle))
+
+static inline handle_t *ext3_journal_current_handle(void)
+{
+	return journal_current_handle();
+}
+
+static inline int ext3_journal_extend(handle_t *handle, int nblocks)
+{
+	return journal_extend(handle, nblocks);
+}
+
+static inline int ext3_journal_restart(handle_t *handle, int nblocks)
+{
+	return journal_restart(handle, nblocks);
+}
+
+static inline int ext3_journal_blocks_per_page(struct inode *inode)
+{
+	return journal_blocks_per_page(inode);
+}
+
+static inline int ext3_journal_force_commit(journal_t *journal)
+{
+	return journal_force_commit(journal);
+}
+
+/* super.c */
+int ext3_force_commit(struct super_block *sb);
+
+static inline int ext3_should_journal_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 1;
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+		return 1;
+	if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+		return 1;
+	return 0;
+}
+
+static inline int ext3_should_order_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+		return 0;
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
+		return 1;
+	return 0;
+}
+
+static inline int ext3_should_writeback_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+		return 0;
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
+		return 1;
+	return 0;
+}
+
+#include <trace/events/ext3.h>
diff --git a/kernel/fs/ext3/ext3_jbd.c b/kernel/fs/ext3/ext3_jbd.c
new file mode 100644
index 000000000..785a3261a
--- /dev/null
+++ b/kernel/fs/ext3/ext3_jbd.c
@@ -0,0 +1,59 @@
+/*
+ * Interface between ext3 and JBD
+ */
+
+#include "ext3.h"
+
+int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = journal_get_undo_access(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __func__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_get_write_access(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = journal_get_write_access(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __func__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_forget(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = journal_forget(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __func__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_revoke(const char *where, handle_t *handle,
+				unsigned long blocknr, struct buffer_head *bh)
+{
+	int err = journal_revoke(handle, blocknr, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __func__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_get_create_access(const char *where,
+				handle_t *handle, struct buffer_head *bh)
+{
+	int err = journal_get_create_access(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __func__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_dirty_metadata(const char *where,
+				handle_t *handle, struct buffer_head *bh)
+{
+	int err = journal_dirty_metadata(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __func__, bh, handle,err);
+	return err;
+}
diff --git a/kernel/fs/ext3/file.c b/kernel/fs/ext3/file.c
new file mode 100644
index 000000000..3b8f650de
--- /dev/null
+++ b/kernel/fs/ext3/file.c
@@ -0,0 +1,79 @@
+/*
+ *  linux/fs/ext3/file.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/file.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext3 fs regular file handling primitives
+ *
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ *	(jj@sunsite.ms.mff.cuni.cz)
+ */
+
+#include <linux/quotaops.h>
+#include "ext3.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Called when an inode is released. Note that this is different
+ * from ext3_file_open: open gets called at every open, but release
+ * gets called only when /all/ the files are closed.
+ */
+static int ext3_release_file (struct inode * inode, struct file * filp)
+{
+	if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
+		filemap_flush(inode->i_mapping);
+		ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
+	}
+	/* if we are the last writer on the inode, drop the block reservation */
+	if ((filp->f_mode & FMODE_WRITE) &&
+			(atomic_read(&inode->i_writecount) == 1))
+	{
+		mutex_lock(&EXT3_I(inode)->truncate_mutex);
+		ext3_discard_reservation(inode);
+		mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+	}
+	if (is_dx(inode) && filp->private_data)
+		ext3_htree_free_dir_info(filp->private_data);
+
+	return 0;
+}
+
+const struct file_operations ext3_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.unlocked_ioctl	= ext3_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext3_compat_ioctl,
+#endif
+	.mmap		= generic_file_mmap,
+	.open		= dquot_file_open,
+	.release	= ext3_release_file,
+	.fsync		= ext3_sync_file,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+};
+
+const struct inode_operations ext3_file_inode_operations = {
+	.setattr	= ext3_setattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext3_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
+	.fiemap		= ext3_fiemap,
+};
+
diff --git a/kernel/fs/ext3/fsync.c b/kernel/fs/ext3/fsync.c
new file mode 100644
index 000000000..1cb9c7e10
--- /dev/null
+++ b/kernel/fs/ext3/fsync.c
@@ -0,0 +1,109 @@
+/*
+ *  linux/fs/ext3/fsync.c
+ *
+ *  Copyright (C) 1993  Stephen Tweedie (sct@redhat.com)
+ *  from
+ *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
+ *                      Laboratoire MASI - Institut Blaise Pascal
+ *                      Universite Pierre et Marie Curie (Paris VI)
+ *  from
+ *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext3fs fsync primitive
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ *  Removed unnecessary code duplication for little endian machines
+ *  and excessive __inline__s.
+ *        Andi Kleen, 1997
+ *
+ * Major simplications and cleanup - we only need to do the metadata, because
+ * we can depend on generic_block_fdatasync() to sync the data blocks.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include "ext3.h"
+
+/*
+ * akpm: A new design for ext3_sync_file().
+ *
+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+ * There cannot be a transaction open by this task.
+ * Another task could have dirtied this inode.  Its data can be in any
+ * state in the journalling system.
+ *
+ * What we do is just kick off a commit and wait on it.  This will snapshot the
+ * inode to disk.
+ */
+
+int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
+	int ret, needs_barrier = 0;
+	tid_t commit_tid;
+
+	trace_ext3_sync_file_enter(file, datasync);
+
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated state */
+		smp_rmb();
+		if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+			return -EROFS;
+		return 0;
+	}
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		goto out;
+
+	J_ASSERT(ext3_journal_current_handle() == NULL);
+
+	/*
+	 * data=writeback,ordered:
+	 *  The caller's filemap_fdatawrite()/wait will sync the data.
+	 *  Metadata is in the journal, we wait for a proper transaction
+	 *  to commit here.
+	 *
+	 * data=journal:
+	 *  filemap_fdatawrite won't do anything (the buffers are clean).
+	 *  ext3_force_commit will write the file data into the journal and
+	 *  will wait on that.
+	 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
+	 *  (they were dirtied by commit).  But that's OK - the blocks are
+	 *  safe in-journal, which is all fsync() needs to ensure.
+	 */
+	if (ext3_should_journal_data(inode)) {
+		ret = ext3_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	if (datasync)
+		commit_tid = atomic_read(&ei->i_datasync_tid);
+	else
+		commit_tid = atomic_read(&ei->i_sync_tid);
+
+	if (test_opt(inode->i_sb, BARRIER) &&
+	    !journal_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = 1;
+	log_start_commit(journal, commit_tid);
+	ret = log_wait_commit(journal, commit_tid);
+
+	/*
+	 * In case we didn't commit a transaction, we have to flush
+	 * disk caches manually so that data really is on persistent
+	 * storage
+	 */
+	if (needs_barrier) {
+		int err;
+
+		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		if (!ret)
+			ret = err;
+	}
+out:
+	trace_ext3_sync_file_exit(inode, ret);
+	return ret;
+}
diff --git a/kernel/fs/ext3/hash.c b/kernel/fs/ext3/hash.c
new file mode 100644
index 000000000..ede315cdf
--- /dev/null
+++ b/kernel/fs/ext3/hash.c
@@ -0,0 +1,206 @@
+/*
+ *  linux/fs/ext3/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This file is released under the GPL v2.
+ *
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ */
+
+#include "ext3.h"
+#include <linux/cryptohash.h>
+
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+	__u32	sum = 0;
+	__u32	b0 = buf[0], b1 = buf[1];
+	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
+	int	n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while(--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+
+/* The old legacy hash */
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
+{
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const unsigned char *ucp = (const unsigned char *) name;
+
+	while (len--) {
+		hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const signed char *scp = (const signed char *) name;
+
+	while (len--) {
+		hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
+
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return hash0 << 1;
+}
+
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+	const signed char *scp = (const signed char *) msg;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = ((int) scp[i]) + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+	const unsigned char *ucp = (const unsigned char *) msg;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i=0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = ((int) ucp[i]) + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+/*
+ * Returns the hash of a filename.  If len is 0 and name is NULL, then
+ * this function can be used to test whether or not a hash version is
+ * supported.
+ *
+ * The seed is an 4 longword (32 bits) "secret" which can be used to
+ * uniquify a hash.  If the seed is all zero's, then some default seed
+ * may be used.
+ *
+ * A particular hash version specifies whether or not the seed is
+ * represented, and whether or not the returned hash is 32 bits or 64
+ * bits.  32 bit hashes will return 0 for the minor hash.
+ */
+int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
+{
+	__u32	hash;
+	__u32	minor_hash = 0;
+	const char	*p;
+	int		i;
+	__u32		in[8], buf[4];
+	void		(*str2hashbuf)(const char *, int, __u32 *, int) =
+				str2hashbuf_signed;
+
+	/* Initialize the default seed for the hash checksum functions */
+	buf[0] = 0x67452301;
+	buf[1] = 0xefcdab89;
+	buf[2] = 0x98badcfe;
+	buf[3] = 0x10325476;
+
+	/* Check to see if the seed is all zero's */
+	if (hinfo->seed) {
+		for (i=0; i < 4; i++) {
+			if (hinfo->seed[i])
+				break;
+		}
+		if (i < 4)
+			memcpy(buf, hinfo->seed, sizeof(buf));
+	}
+
+	switch (hinfo->hash_version) {
+	case DX_HASH_LEGACY_UNSIGNED:
+		hash = dx_hack_hash_unsigned(name, len);
+		break;
+	case DX_HASH_LEGACY:
+		hash = dx_hack_hash_signed(name, len);
+		break;
+	case DX_HASH_HALF_MD4_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
+	case DX_HASH_HALF_MD4:
+		p = name;
+		while (len > 0) {
+			(*str2hashbuf)(p, len, in, 8);
+			half_md4_transform(buf, in);
+			len -= 32;
+			p += 32;
+		}
+		minor_hash = buf[2];
+		hash = buf[1];
+		break;
+	case DX_HASH_TEA_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
+	case DX_HASH_TEA:
+		p = name;
+		while (len > 0) {
+			(*str2hashbuf)(p, len, in, 4);
+			TEA_transform(buf, in);
+			len -= 16;
+			p += 16;
+		}
+		hash = buf[0];
+		minor_hash = buf[1];
+		break;
+	default:
+		hinfo->hash = 0;
+		return -1;
+	}
+	hash = hash & ~1;
+	if (hash == (EXT3_HTREE_EOF_32BIT << 1))
+		hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
+	hinfo->hash = hash;
+	hinfo->minor_hash = minor_hash;
+	return 0;
+}
diff --git a/kernel/fs/ext3/ialloc.c b/kernel/fs/ext3/ialloc.c
new file mode 100644
index 000000000..3ad242e58
--- /dev/null
+++ b/kernel/fs/ext3/ialloc.c
@@ -0,0 +1,706 @@
+/*
+ *  linux/fs/ext3/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  BSD ufs-inspired inode and directory allocation by
+ *  Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/quotaops.h>
+#include <linux/random.h>
+
+#include "ext3.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * ialloc.c contains the inodes allocation and deallocation routines
+ */
+
+/*
+ * The free inodes are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.
+ */
+
+
+/*
+ * Read the inode allocation bitmap for a given block_group, reading
+ * into the specified slot in the superblock's bitmap cache.
+ *
+ * Return buffer_head of bitmap on success or NULL.
+ */
+static struct buffer_head *
+read_inode_bitmap(struct super_block * sb, unsigned long block_group)
+{
+	struct ext3_group_desc *desc;
+	struct buffer_head *bh = NULL;
+
+	desc = ext3_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		goto error_out;
+
+	bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
+	if (!bh)
+		ext3_error(sb, "read_inode_bitmap",
+			    "Cannot read inode bitmap - "
+			    "block_group = %lu, inode_bitmap = %u",
+			    block_group, le32_to_cpu(desc->bg_inode_bitmap));
+error_out:
+	return bh;
+}
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ *
+ * HOWEVER: we must make sure that we get no aliases,
+ * which means that we have to call "clear_inode()"
+ * _before_ we mark the inode not in use in the inode
+ * bitmaps. Otherwise a newly created file might use
+ * the same inode number (not actually the same pointer
+ * though), and then we'd have two inodes sharing the
+ * same inode number and space on the harddisk.
+ */
+void ext3_free_inode (handle_t *handle, struct inode * inode)
+{
+	struct super_block * sb = inode->i_sb;
+	int is_directory;
+	unsigned long ino;
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bh2;
+	unsigned long block_group;
+	unsigned long bit;
+	struct ext3_group_desc * gdp;
+	struct ext3_super_block * es;
+	struct ext3_sb_info *sbi;
+	int fatal = 0, err;
+
+	if (atomic_read(&inode->i_count) > 1) {
+		printk ("ext3_free_inode: inode has count=%d\n",
+					atomic_read(&inode->i_count));
+		return;
+	}
+	if (inode->i_nlink) {
+		printk ("ext3_free_inode: inode has nlink=%d\n",
+			inode->i_nlink);
+		return;
+	}
+	if (!sb) {
+		printk("ext3_free_inode: inode on nonexistent device\n");
+		return;
+	}
+	sbi = EXT3_SB(sb);
+
+	ino = inode->i_ino;
+	ext3_debug ("freeing inode %lu\n", ino);
+	trace_ext3_free_inode(inode);
+
+	is_directory = S_ISDIR(inode->i_mode);
+
+	es = EXT3_SB(sb)->s_es;
+	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+		ext3_error (sb, "ext3_free_inode",
+			    "reserved or nonexistent inode %lu", ino);
+		goto error_return;
+	}
+	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+	bitmap_bh = read_inode_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+
+	BUFFER_TRACE(bitmap_bh, "get_write_access");
+	fatal = ext3_journal_get_write_access(handle, bitmap_bh);
+	if (fatal)
+		goto error_return;
+
+	/* Ok, now we can actually update the inode bitmaps.. */
+	if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+					bit, bitmap_bh->b_data))
+		ext3_error (sb, "ext3_free_inode",
+			      "bit already cleared for inode %lu", ino);
+	else {
+		gdp = ext3_get_group_desc (sb, block_group, &bh2);
+
+		BUFFER_TRACE(bh2, "get_write_access");
+		fatal = ext3_journal_get_write_access(handle, bh2);
+		if (fatal) goto error_return;
+
+		if (gdp) {
+			spin_lock(sb_bgl_lock(sbi, block_group));
+			le16_add_cpu(&gdp->bg_free_inodes_count, 1);
+			if (is_directory)
+				le16_add_cpu(&gdp->bg_used_dirs_count, -1);
+			spin_unlock(sb_bgl_lock(sbi, block_group));
+			percpu_counter_inc(&sbi->s_freeinodes_counter);
+			if (is_directory)
+				percpu_counter_dec(&sbi->s_dirs_counter);
+
+		}
+		BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+		err = ext3_journal_dirty_metadata(handle, bh2);
+		if (!fatal) fatal = err;
+	}
+	BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
+	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+	if (!fatal)
+		fatal = err;
+
+error_return:
+	brelse(bitmap_bh);
+	ext3_std_error(sb, fatal);
+}
+
+/*
+ * Orlov's allocator for directories.
+ *
+ * We always try to spread first-level directories.
+ *
+ * If there are blockgroups with both free inodes and free blocks counts
+ * not worse than average we return one with smallest directory count.
+ * Otherwise we simply return a random group.
+ *
+ * For the rest rules look so:
+ *
+ * It's OK to put directory into a group unless
+ * it has too many directories already (max_dirs) or
+ * it has too few free inodes left (min_inodes) or
+ * it has too few free blocks left (min_blocks).
+ * Parent's group is preferred, if it doesn't satisfy these
+ * conditions we search cyclically through the rest. If none
+ * of the groups look good we just look for a group with more
+ * free inodes than average (starting at parent's group).
+ *
+ * Debt is incremented each time we allocate a directory and decremented
+ * when we allocate an inode, within 0--255.
+ */
+
+static int find_group_orlov(struct super_block *sb, struct inode *parent)
+{
+	int parent_group = EXT3_I(parent)->i_block_group;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int ngroups = sbi->s_groups_count;
+	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
+	unsigned int freei, avefreei;
+	ext3_fsblk_t freeb, avefreeb;
+	unsigned int ndirs;
+	int max_dirs, min_inodes;
+	ext3_grpblk_t min_blocks;
+	int group = -1, i;
+	struct ext3_group_desc *desc;
+
+	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+	avefreei = freei / ngroups;
+	freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	avefreeb = freeb / ngroups;
+	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+
+	if ((parent == d_inode(sb->s_root)) ||
+	    (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
+		int best_ndir = inodes_per_group;
+		int best_group = -1;
+
+		group = prandom_u32();
+		parent_group = (unsigned)group % ngroups;
+		for (i = 0; i < ngroups; i++) {
+			group = (parent_group + i) % ngroups;
+			desc = ext3_get_group_desc (sb, group, NULL);
+			if (!desc || !desc->bg_free_inodes_count)
+				continue;
+			if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+				continue;
+			if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+				continue;
+			if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+				continue;
+			best_group = group;
+			best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+		}
+		if (best_group >= 0)
+			return best_group;
+		goto fallback;
+	}
+
+	max_dirs = ndirs / ngroups + inodes_per_group / 16;
+	min_inodes = avefreei - inodes_per_group / 4;
+	min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
+
+	for (i = 0; i < ngroups; i++) {
+		group = (parent_group + i) % ngroups;
+		desc = ext3_get_group_desc (sb, group, NULL);
+		if (!desc || !desc->bg_free_inodes_count)
+			continue;
+		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+			continue;
+		if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+			continue;
+		if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+			continue;
+		return group;
+	}
+
+fallback:
+	for (i = 0; i < ngroups; i++) {
+		group = (parent_group + i) % ngroups;
+		desc = ext3_get_group_desc (sb, group, NULL);
+		if (!desc || !desc->bg_free_inodes_count)
+			continue;
+		if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+			return group;
+	}
+
+	if (avefreei) {
+		/*
+		 * The free-inodes counter is approximate, and for really small
+		 * filesystems the above test can fail to find any blockgroups
+		 */
+		avefreei = 0;
+		goto fallback;
+	}
+
+	return -1;
+}
+
+static int find_group_other(struct super_block *sb, struct inode *parent)
+{
+	int parent_group = EXT3_I(parent)->i_block_group;
+	int ngroups = EXT3_SB(sb)->s_groups_count;
+	struct ext3_group_desc *desc;
+	int group, i;
+
+	/*
+	 * Try to place the inode in its parent directory
+	 */
+	group = parent_group;
+	desc = ext3_get_group_desc (sb, group, NULL);
+	if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+			le16_to_cpu(desc->bg_free_blocks_count))
+		return group;
+
+	/*
+	 * We're going to place this inode in a different blockgroup from its
+	 * parent.  We want to cause files in a common directory to all land in
+	 * the same blockgroup.  But we want files which are in a different
+	 * directory which shares a blockgroup with our parent to land in a
+	 * different blockgroup.
+	 *
+	 * So add our directory's i_ino into the starting point for the hash.
+	 */
+	group = (group + parent->i_ino) % ngroups;
+
+	/*
+	 * Use a quadratic hash to find a group with a free inode and some free
+	 * blocks.
+	 */
+	for (i = 1; i < ngroups; i <<= 1) {
+		group += i;
+		if (group >= ngroups)
+			group -= ngroups;
+		desc = ext3_get_group_desc (sb, group, NULL);
+		if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+				le16_to_cpu(desc->bg_free_blocks_count))
+			return group;
+	}
+
+	/*
+	 * That failed: try linear search for a free inode, even if that group
+	 * has no free blocks.
+	 */
+	group = parent_group;
+	for (i = 0; i < ngroups; i++) {
+		if (++group >= ngroups)
+			group = 0;
+		desc = ext3_get_group_desc (sb, group, NULL);
+		if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+			return group;
+	}
+
+	return -1;
+}
+
+/*
+ * There are two policies for allocating an inode.  If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
+			     const struct qstr *qstr, umode_t mode)
+{
+	struct super_block *sb;
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bh2;
+	int group;
+	unsigned long ino = 0;
+	struct inode * inode;
+	struct ext3_group_desc * gdp = NULL;
+	struct ext3_super_block * es;
+	struct ext3_inode_info *ei;
+	struct ext3_sb_info *sbi;
+	int err = 0;
+	struct inode *ret;
+	int i;
+
+	/* Cannot create files in a deleted directory */
+	if (!dir || !dir->i_nlink)
+		return ERR_PTR(-EPERM);
+
+	sb = dir->i_sb;
+	trace_ext3_request_inode(dir, mode);
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	ei = EXT3_I(inode);
+
+	sbi = EXT3_SB(sb);
+	es = sbi->s_es;
+	if (S_ISDIR(mode))
+		group = find_group_orlov(sb, dir);
+	else
+		group = find_group_other(sb, dir);
+
+	err = -ENOSPC;
+	if (group == -1)
+		goto out;
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		err = -EIO;
+
+		gdp = ext3_get_group_desc(sb, group, &bh2);
+		if (!gdp)
+			goto fail;
+
+		brelse(bitmap_bh);
+		bitmap_bh = read_inode_bitmap(sb, group);
+		if (!bitmap_bh)
+			goto fail;
+
+		ino = 0;
+
+repeat_in_this_group:
+		ino = ext3_find_next_zero_bit((unsigned long *)
+				bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
+		if (ino < EXT3_INODES_PER_GROUP(sb)) {
+
+			BUFFER_TRACE(bitmap_bh, "get_write_access");
+			err = ext3_journal_get_write_access(handle, bitmap_bh);
+			if (err)
+				goto fail;
+
+			if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
+						ino, bitmap_bh->b_data)) {
+				/* we won it */
+				BUFFER_TRACE(bitmap_bh,
+					"call ext3_journal_dirty_metadata");
+				err = ext3_journal_dirty_metadata(handle,
+								bitmap_bh);
+				if (err)
+					goto fail;
+				goto got;
+			}
+			/* we lost it */
+			journal_release_buffer(handle, bitmap_bh);
+
+			if (++ino < EXT3_INODES_PER_GROUP(sb))
+				goto repeat_in_this_group;
+		}
+
+		/*
+		 * This case is possible in concurrent environment.  It is very
+		 * rare.  We cannot repeat the find_group_xxx() call because
+		 * that will simply return the same blockgroup, because the
+		 * group descriptor metadata has not yet been updated.
+		 * So we just go onto the next blockgroup.
+		 */
+		if (++group == sbi->s_groups_count)
+			group = 0;
+	}
+	err = -ENOSPC;
+	goto out;
+
+got:
+	ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
+	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+		ext3_error (sb, "ext3_new_inode",
+			    "reserved inode or inode > inodes count - "
+			    "block_group = %d, inode=%lu", group, ino);
+		err = -EIO;
+		goto fail;
+	}
+
+	BUFFER_TRACE(bh2, "get_write_access");
+	err = ext3_journal_get_write_access(handle, bh2);
+	if (err) goto fail;
+	spin_lock(sb_bgl_lock(sbi, group));
+	le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+	if (S_ISDIR(mode)) {
+		le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+	}
+	spin_unlock(sb_bgl_lock(sbi, group));
+	BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+	err = ext3_journal_dirty_metadata(handle, bh2);
+	if (err) goto fail;
+
+	percpu_counter_dec(&sbi->s_freeinodes_counter);
+	if (S_ISDIR(mode))
+		percpu_counter_inc(&sbi->s_dirs_counter);
+
+
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
+	} else
+		inode_init_owner(inode, dir, mode);
+
+	inode->i_ino = ino;
+	/* This is the optimal IO size (for stat), not the fs block size */
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+
+	memset(ei->i_data, 0, sizeof(ei->i_data));
+	ei->i_dir_start_lookup = 0;
+	ei->i_disksize = 0;
+
+	ei->i_flags =
+		ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
+#ifdef EXT3_FRAGMENTS
+	ei->i_faddr = 0;
+	ei->i_frag_no = 0;
+	ei->i_frag_size = 0;
+#endif
+	ei->i_file_acl = 0;
+	ei->i_dir_acl = 0;
+	ei->i_dtime = 0;
+	ei->i_block_alloc_info = NULL;
+	ei->i_block_group = group;
+
+	ext3_set_inode_flags(inode);
+	if (IS_DIRSYNC(inode))
+		handle->h_sync = 1;
+	if (insert_inode_locked(inode) < 0) {
+		/*
+		 * Likely a bitmap corruption causing inode to be allocated
+		 * twice.
+		 */
+		err = -EIO;
+		goto fail;
+	}
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+
+	ei->i_state_flags = 0;
+	ext3_set_inode_state(inode, EXT3_STATE_NEW);
+
+	/* See comment in ext3_iget for explanation */
+	if (ino >= EXT3_FIRST_INO(sb) + 1 &&
+	    EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
+		ei->i_extra_isize =
+			sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
+	} else {
+		ei->i_extra_isize = 0;
+	}
+
+	ret = inode;
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
+		goto fail_drop;
+
+	err = ext3_init_acl(handle, inode, dir);
+	if (err)
+		goto fail_free_drop;
+
+	err = ext3_init_security(handle, inode, dir, qstr);
+	if (err)
+		goto fail_free_drop;
+
+	err = ext3_mark_inode_dirty(handle, inode);
+	if (err) {
+		ext3_std_error(sb, err);
+		goto fail_free_drop;
+	}
+
+	ext3_debug("allocating inode %lu\n", inode->i_ino);
+	trace_ext3_allocate_inode(inode, dir, mode);
+	goto really_out;
+fail:
+	ext3_std_error(sb, err);
+out:
+	iput(inode);
+	ret = ERR_PTR(err);
+really_out:
+	brelse(bitmap_bh);
+	return ret;
+
+fail_free_drop:
+	dquot_free_inode(inode);
+
+fail_drop:
+	dquot_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	brelse(bitmap_bh);
+	return ERR_PTR(err);
+}
+
+/* Verify that we are loading a valid orphan from disk */
+struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
+{
+	unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
+	unsigned long block_group;
+	int bit;
+	struct buffer_head *bitmap_bh;
+	struct inode *inode = NULL;
+	long err = -EIO;
+
+	/* Error cases - e2fsck has already cleaned up for us */
+	if (ino > max_ino) {
+		ext3_warning(sb, __func__,
+			     "bad orphan ino %lu!  e2fsck was run?", ino);
+		goto error;
+	}
+
+	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+	bitmap_bh = read_inode_bitmap(sb, block_group);
+	if (!bitmap_bh) {
+		ext3_warning(sb, __func__,
+			     "inode bitmap error for orphan %lu", ino);
+		goto error;
+	}
+
+	/* Having the inode bit set should be a 100% indicator that this
+	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
+	 * inodes that were being truncated, so we can't check i_nlink==0.
+	 */
+	if (!ext3_test_bit(bit, bitmap_bh->b_data))
+		goto bad_orphan;
+
+	inode = ext3_iget(sb, ino);
+	if (IS_ERR(inode))
+		goto iget_failed;
+
+	/*
+	 * If the orphans has i_nlinks > 0 then it should be able to be
+	 * truncated, otherwise it won't be removed from the orphan list
+	 * during processing and an infinite loop will result.
+	 */
+	if (inode->i_nlink && !ext3_can_truncate(inode))
+		goto bad_orphan;
+
+	if (NEXT_ORPHAN(inode) > max_ino)
+		goto bad_orphan;
+	brelse(bitmap_bh);
+	return inode;
+
+iget_failed:
+	err = PTR_ERR(inode);
+	inode = NULL;
+bad_orphan:
+	ext3_warning(sb, __func__,
+		     "bad orphan inode %lu!  e2fsck was run?", ino);
+	printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
+	       bit, (unsigned long long)bitmap_bh->b_blocknr,
+	       ext3_test_bit(bit, bitmap_bh->b_data));
+	printk(KERN_NOTICE "inode=%p\n", inode);
+	if (inode) {
+		printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+		       is_bad_inode(inode));
+		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+		       NEXT_ORPHAN(inode));
+		printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
+		/* Avoid freeing blocks if we got a bad deleted inode */
+		if (inode->i_nlink == 0)
+			inode->i_blocks = 0;
+		iput(inode);
+	}
+	brelse(bitmap_bh);
+error:
+	return ERR_PTR(err);
+}
+
+unsigned long ext3_count_free_inodes (struct super_block * sb)
+{
+	unsigned long desc_count;
+	struct ext3_group_desc *gdp;
+	int i;
+#ifdef EXT3FS_DEBUG
+	struct ext3_super_block *es;
+	unsigned long bitmap_count, x;
+	struct buffer_head *bitmap_bh = NULL;
+
+	es = EXT3_SB(sb)->s_es;
+	desc_count = 0;
+	bitmap_count = 0;
+	gdp = NULL;
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+		gdp = ext3_get_group_desc (sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+		brelse(bitmap_bh);
+		bitmap_bh = read_inode_bitmap(sb, i);
+		if (!bitmap_bh)
+			continue;
+
+		x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8);
+		printk("group %d: stored = %d, counted = %lu\n",
+			i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+		bitmap_count += x;
+	}
+	brelse(bitmap_bh);
+	printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
+		le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+	return desc_count;
+#else
+	desc_count = 0;
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+		gdp = ext3_get_group_desc (sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+		cond_resched();
+	}
+	return desc_count;
+#endif
+}
+
+/* Called at mount-time, super-block is locked */
+unsigned long ext3_count_dirs (struct super_block * sb)
+{
+	unsigned long count = 0;
+	int i;
+
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+		struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL);
+		if (!gdp)
+			continue;
+		count += le16_to_cpu(gdp->bg_used_dirs_count);
+	}
+	return count;
+}
+
diff --git a/kernel/fs/ext3/inode.c b/kernel/fs/ext3/inode.c
new file mode 100644
index 000000000..2ee2dc435
--- /dev/null
+++ b/kernel/fs/ext3/inode.c
@@ -0,0 +1,3573 @@
+/*
+ *  linux/fs/ext3/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Goal-directed block allocation by Stephen Tweedie
+ *	(sct@redhat.com), 1993, 1998
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ *	(jj@sunsite.ms.mff.cuni.cz)
+ *
+ *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
+ */
+
+#include <linux/highuid.h>
+#include <linux/quotaops.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include "ext3.h"
+#include "xattr.h"
+#include "acl.h"
+
+static int ext3_writepage_trans_blocks(struct inode *inode);
+static int ext3_block_truncate_page(struct inode *inode, loff_t from);
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static int ext3_inode_is_fast_symlink(struct inode *inode)
+{
+	int ea_blocks = EXT3_I(inode)->i_file_acl ?
+		(inode->i_sb->s_blocksize >> 9) : 0;
+
+	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+}
+
+/*
+ * The ext3 forget function must perform a revoke if we are freeing data
+ * which has been journaled.  Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ */
+int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
+			struct buffer_head *bh, ext3_fsblk_t blocknr)
+{
+	int err;
+
+	might_sleep();
+
+	trace_ext3_forget(inode, is_metadata, blocknr);
+	BUFFER_TRACE(bh, "enter");
+
+	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+		  "data mode %lx\n",
+		  bh, is_metadata, inode->i_mode,
+		  test_opt(inode->i_sb, DATA_FLAGS));
+
+	/* Never use the revoke function if we are doing full data
+	 * journaling: there is no need to, and a V1 superblock won't
+	 * support it.  Otherwise, only skip the revoke on un-journaled
+	 * data blocks. */
+
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
+	    (!is_metadata && !ext3_should_journal_data(inode))) {
+		if (bh) {
+			BUFFER_TRACE(bh, "call journal_forget");
+			return ext3_journal_forget(handle, bh);
+		}
+		return 0;
+	}
+
+	/*
+	 * data!=journal && (is_metadata || should_journal_data(inode))
+	 */
+	BUFFER_TRACE(bh, "call ext3_journal_revoke");
+	err = ext3_journal_revoke(handle, blocknr, bh);
+	if (err)
+		ext3_abort(inode->i_sb, __func__,
+			   "error %d when attempting revoke", err);
+	BUFFER_TRACE(bh, "exit");
+	return err;
+}
+
+/*
+ * Work out how many blocks we need to proceed with the next chunk of a
+ * truncate transaction.
+ */
+static unsigned long blocks_for_truncate(struct inode *inode)
+{
+	unsigned long needed;
+
+	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+
+	/* Give ourselves just enough room to cope with inodes in which
+	 * i_blocks is corrupt: we've seen disk corruptions in the past
+	 * which resulted in random data in an inode which looked enough
+	 * like a regular file for ext3 to try to delete it.  Things
+	 * will go a bit crazy if that happens, but at least we should
+	 * try not to panic the whole kernel. */
+	if (needed < 2)
+		needed = 2;
+
+	/* But we need to bound the transaction so we don't overflow the
+	 * journal. */
+	if (needed > EXT3_MAX_TRANS_DATA)
+		needed = EXT3_MAX_TRANS_DATA;
+
+	return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+}
+
+/*
+ * Truncate transactions can be complex and absolutely huge.  So we need to
+ * be able to restart the transaction at a conventient checkpoint to make
+ * sure we don't overflow the journal.
+ *
+ * start_transaction gets us a new handle for a truncate transaction,
+ * and extend_transaction tries to extend the existing one a bit.  If
+ * extend fails, we need to propagate the failure up and restart the
+ * transaction in the top-level truncate loop. --sct
+ */
+static handle_t *start_transaction(struct inode *inode)
+{
+	handle_t *result;
+
+	result = ext3_journal_start(inode, blocks_for_truncate(inode));
+	if (!IS_ERR(result))
+		return result;
+
+	ext3_std_error(inode->i_sb, PTR_ERR(result));
+	return result;
+}
+
+/*
+ * Try to extend this transaction for the purposes of truncation.
+ *
+ * Returns 0 if we managed to create more room.  If we can't create more
+ * room, and the transaction must be restarted we return 1.
+ */
+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+{
+	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
+		return 0;
+	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
+		return 0;
+	return 1;
+}
+
+/*
+ * Restart the transaction associated with *handle.  This does a commit,
+ * so before we call here everything must be consistently dirtied against
+ * this transaction.
+ */
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
+{
+	int ret;
+
+	jbd_debug(2, "restarting handle %p\n", handle);
+	/*
+	 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+	 * At this moment, get_block can be called only for blocks inside
+	 * i_size since page cache has been already dropped and writes are
+	 * blocked by i_mutex. So we can safely drop the truncate_mutex.
+	 */
+	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+	ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+	mutex_lock(&EXT3_I(inode)->truncate_mutex);
+	return ret;
+}
+
+/*
+ * Called at inode eviction from icache
+ */
+void ext3_evict_inode (struct inode *inode)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_block_alloc_info *rsv;
+	handle_t *handle;
+	int want_delete = 0;
+
+	trace_ext3_evict_inode(inode);
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		dquot_initialize(inode);
+		want_delete = 1;
+	}
+
+	/*
+	 * When journalling data dirty buffers are tracked only in the journal.
+	 * So although mm thinks everything is clean and ready for reaping the
+	 * inode might still have some pages to write in the running
+	 * transaction or waiting to be checkpointed. Thus calling
+	 * journal_invalidatepage() (via truncate_inode_pages()) to discard
+	 * these buffers can cause data loss. Also even if we did not discard
+	 * these buffers, we would have no way to find them after the inode
+	 * is reaped and thus user could see stale data if he tries to read
+	 * them before the transaction is checkpointed. So be careful and
+	 * force everything to disk here... We use ei->i_datasync_tid to
+	 * store the newest transaction containing inode's data.
+	 *
+	 * Note that directories do not have this problem because they don't
+	 * use page cache.
+	 *
+	 * The s_journal check handles the case when ext3_get_journal() fails
+	 * and puts the journal inode.
+	 */
+	if (inode->i_nlink && ext3_should_journal_data(inode) &&
+	    EXT3_SB(inode->i_sb)->s_journal &&
+	    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+	    inode->i_ino != EXT3_JOURNAL_INO) {
+		tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
+		journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
+
+		log_start_commit(journal, commit_tid);
+		log_wait_commit(journal, commit_tid);
+		filemap_write_and_wait(&inode->i_data);
+	}
+	truncate_inode_pages_final(&inode->i_data);
+
+	ext3_discard_reservation(inode);
+	rsv = ei->i_block_alloc_info;
+	ei->i_block_alloc_info = NULL;
+	if (unlikely(rsv))
+		kfree(rsv);
+
+	if (!want_delete)
+		goto no_delete;
+
+	handle = start_transaction(inode);
+	if (IS_ERR(handle)) {
+		/*
+		 * If we're going to skip the normal cleanup, we still need to
+		 * make sure that the in-core orphan linked list is properly
+		 * cleaned up.
+		 */
+		ext3_orphan_del(NULL, inode);
+		goto no_delete;
+	}
+
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+	inode->i_size = 0;
+	if (inode->i_blocks)
+		ext3_truncate(inode);
+	/*
+	 * Kill off the orphan record created when the inode lost the last
+	 * link.  Note that ext3_orphan_del() has to be able to cope with the
+	 * deletion of a non-existent orphan - ext3_truncate() could
+	 * have removed the record.
+	 */
+	ext3_orphan_del(handle, inode);
+	ei->i_dtime = get_seconds();
+
+	/*
+	 * One subtle ordering requirement: if anything has gone wrong
+	 * (transaction abort, IO errors, whatever), then we can still
+	 * do these next steps (the fs will already have been marked as
+	 * having errors), but we can't free the inode if the mark_dirty
+	 * fails.
+	 */
+	if (ext3_mark_inode_dirty(handle, inode)) {
+		/* If that failed, just dquot_drop() and be done with that */
+		dquot_drop(inode);
+		clear_inode(inode);
+	} else {
+		ext3_xattr_delete_inode(handle, inode);
+		dquot_free_inode(inode);
+		dquot_drop(inode);
+		clear_inode(inode);
+		ext3_free_inode(handle, inode);
+	}
+	ext3_journal_stop(handle);
+	return;
+no_delete:
+	clear_inode(inode);
+	dquot_drop(inode);
+}
+
+typedef struct {
+	__le32	*p;
+	__le32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+	p->key = *(p->p = v);
+	p->bh = bh;
+}
+
+static int verify_chain(Indirect *from, Indirect *to)
+{
+	while (from <= to && from->key == *from->p)
+		from++;
+	return (from > to);
+}
+
+/**
+ *	ext3_block_to_path - parse the block number into array of offsets
+ *	@inode: inode in question (we are only interested in its superblock)
+ *	@i_block: block number to be parsed
+ *	@offsets: array to store the offsets in
+ *      @boundary: set this non-zero if the referred-to block is likely to be
+ *             followed (on disk) by an indirect block.
+ *
+ *	To store the locations of file's data ext3 uses a data structure common
+ *	for UNIX filesystems - tree of pointers anchored in the inode, with
+ *	data blocks at leaves and indirect blocks in intermediate nodes.
+ *	This function translates the block number into path in that tree -
+ *	return value is the path length and @offsets[n] is the offset of
+ *	pointer to (n+1)th node in the nth one. If @block is out of range
+ *	(negative or too large) warning is printed and zero returned.
+ *
+ *	Note: function doesn't find node addresses, so no IO is needed. All
+ *	we need to know is the capacity of indirect blocks (taken from the
+ *	inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext3_block_to_path(struct inode *inode,
+			long i_block, int offsets[4], int *boundary)
+{
+	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	const long direct_blocks = EXT3_NDIR_BLOCKS,
+		indirect_blocks = ptrs,
+		double_blocks = (1 << (ptrs_bits * 2));
+	int n = 0;
+	int final = 0;
+
+	if (i_block < 0) {
+		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
+	} else if (i_block < direct_blocks) {
+		offsets[n++] = i_block;
+		final = direct_blocks;
+	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
+		offsets[n++] = EXT3_IND_BLOCK;
+		offsets[n++] = i_block;
+		final = ptrs;
+	} else if ((i_block -= indirect_blocks) < double_blocks) {
+		offsets[n++] = EXT3_DIND_BLOCK;
+		offsets[n++] = i_block >> ptrs_bits;
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+		offsets[n++] = EXT3_TIND_BLOCK;
+		offsets[n++] = i_block >> (ptrs_bits * 2);
+		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else {
+		ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
+	}
+	if (boundary)
+		*boundary = final - 1 - (i_block & (ptrs - 1));
+	return n;
+}
+
+/**
+ *	ext3_get_branch - read the chain of indirect blocks leading to data
+ *	@inode: inode in question
+ *	@depth: depth of the chain (1 - direct pointer, etc.)
+ *	@offsets: offsets of pointers in inode/indirect blocks
+ *	@chain: place to store the result
+ *	@err: here we store the error value
+ *
+ *	Function fills the array of triples <key, p, bh> and returns %NULL
+ *	if everything went OK or the pointer to the last filled triple
+ *	(incomplete one) otherwise. Upon the return chain[i].key contains
+ *	the number of (i+1)-th block in the chain (as it is stored in memory,
+ *	i.e. little-endian 32-bit), chain[i].p contains the address of that
+ *	number (it points into struct inode for i==0 and into the bh->b_data
+ *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ *	block for i>0 and NULL for i==0. In other words, it holds the block
+ *	numbers of the chain, addresses they were taken from (and where we can
+ *	verify that chain did not change) and buffer_heads hosting these
+ *	numbers.
+ *
+ *	Function stops when it stumbles upon zero pointer (absent block)
+ *		(pointer to last triple returned, *@err == 0)
+ *	or when it gets an IO error reading an indirect block
+ *		(ditto, *@err == -EIO)
+ *	or when it notices that chain had been changed while it was reading
+ *		(ditto, *@err == -EAGAIN)
+ *	or when it reads all @depth-1 indirect blocks successfully and finds
+ *	the whole chain, all way to the data (returns %NULL, *err == 0).
+ */
+static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
+				 Indirect chain[4], int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	Indirect *p = chain;
+	struct buffer_head *bh;
+
+	*err = 0;
+	/* i_data is not going away, no lock needed */
+	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
+	if (!p->key)
+		goto no_block;
+	while (--depth) {
+		bh = sb_bread(sb, le32_to_cpu(p->key));
+		if (!bh)
+			goto failure;
+		/* Reader: pointers */
+		if (!verify_chain(chain, p))
+			goto changed;
+		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
+		/* Reader: end */
+		if (!p->key)
+			goto no_block;
+	}
+	return NULL;
+
+changed:
+	brelse(bh);
+	*err = -EAGAIN;
+	goto no_block;
+failure:
+	*err = -EIO;
+no_block:
+	return p;
+}
+
+/**
+ *	ext3_find_near - find a place for allocation with sufficient locality
+ *	@inode: owner
+ *	@ind: descriptor of indirect block.
+ *
+ *	This function returns the preferred place for block allocation.
+ *	It is used when heuristic for sequential allocation fails.
+ *	Rules are:
+ *	  + if there is a block to the left of our position - allocate near it.
+ *	  + if pointer will live in indirect block - allocate near that block.
+ *	  + if pointer will live in inode - allocate in the same
+ *	    cylinder group.
+ *
+ * In the latter case we colour the starting block by the callers PID to
+ * prevent it from clashing with concurrent allocations for a different inode
+ * in the same block group.   The PID is used here so that functionally related
+ * files will be close-by on-disk.
+ *
+ *	Caller must make sure that @ind is valid and will stay that way.
+ */
+static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
+	__le32 *p;
+	ext3_fsblk_t bg_start;
+	ext3_grpblk_t colour;
+
+	/* Try to find previous block */
+	for (p = ind->p - 1; p >= start; p--) {
+		if (*p)
+			return le32_to_cpu(*p);
+	}
+
+	/* No such thing, so let's try location of indirect block */
+	if (ind->bh)
+		return ind->bh->b_blocknr;
+
+	/*
+	 * It is going to be referred to from the inode itself? OK, just put it
+	 * into the same cylinder group then.
+	 */
+	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
+	colour = (current->pid % 16) *
+			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	return bg_start + colour;
+}
+
+/**
+ *	ext3_find_goal - find a preferred place for allocation.
+ *	@inode: owner
+ *	@block:  block we want
+ *	@partial: pointer to the last triple within a chain
+ *
+ *	Normally this function find the preferred place for block allocation,
+ *	returns it.
+ */
+
+static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
+				   Indirect *partial)
+{
+	struct ext3_block_alloc_info *block_i;
+
+	block_i =  EXT3_I(inode)->i_block_alloc_info;
+
+	/*
+	 * try the heuristic for sequential allocation,
+	 * failing that at least try to get decent locality.
+	 */
+	if (block_i && (block == block_i->last_alloc_logical_block + 1)
+		&& (block_i->last_alloc_physical_block != 0)) {
+		return block_i->last_alloc_physical_block + 1;
+	}
+
+	return ext3_find_near(inode, partial);
+}
+
+/**
+ *	ext3_blks_to_allocate - Look up the block map and count the number
+ *	of direct blocks need to be allocated for the given branch.
+ *
+ *	@branch: chain of indirect blocks
+ *	@k: number of blocks need for indirect blocks
+ *	@blks: number of data blocks to be mapped.
+ *	@blocks_to_boundary:  the offset in the indirect block
+ *
+ *	return the total number of blocks to be allocate, including the
+ *	direct and indirect blocks.
+ */
+static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+		int blocks_to_boundary)
+{
+	unsigned long count = 0;
+
+	/*
+	 * Simple case, [t,d]Indirect block(s) has not allocated yet
+	 * then it's clear blocks on that path have not allocated
+	 */
+	if (k > 0) {
+		/* right now we don't handle cross boundary allocation */
+		if (blks < blocks_to_boundary + 1)
+			count += blks;
+		else
+			count += blocks_to_boundary + 1;
+		return count;
+	}
+
+	count++;
+	while (count < blks && count <= blocks_to_boundary &&
+		le32_to_cpu(*(branch[0].p + count)) == 0) {
+		count++;
+	}
+	return count;
+}
+
+/**
+ *	ext3_alloc_blocks - multiple allocate blocks needed for a branch
+ *	@handle: handle for this transaction
+ *	@inode: owner
+ *	@goal: preferred place for allocation
+ *	@indirect_blks: the number of blocks need to allocate for indirect
+ *			blocks
+ *	@blks:	number of blocks need to allocated for direct blocks
+ *	@new_blocks: on return it will store the new block numbers for
+ *	the indirect blocks(if needed) and the first direct block,
+ *	@err: here we store the error value
+ *
+ *	return the number of direct blocks allocated
+ */
+static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[4], int *err)
+{
+	int target, i;
+	unsigned long count = 0;
+	int index = 0;
+	ext3_fsblk_t current_block = 0;
+	int ret = 0;
+
+	/*
+	 * Here we try to allocate the requested multiple blocks at once,
+	 * on a best-effort basis.
+	 * To build a branch, we should allocate blocks for
+	 * the indirect blocks(if not allocated yet), and at least
+	 * the first direct block of this branch.  That's the
+	 * minimum number of blocks need to allocate(required)
+	 */
+	target = blks + indirect_blks;
+
+	while (1) {
+		count = target;
+		/* allocating blocks for indirect blocks and direct blocks */
+		current_block = ext3_new_blocks(handle,inode,goal,&count,err);
+		if (*err)
+			goto failed_out;
+
+		target -= count;
+		/* allocate blocks for indirect blocks */
+		while (index < indirect_blks && count) {
+			new_blocks[index++] = current_block++;
+			count--;
+		}
+
+		if (count > 0)
+			break;
+	}
+
+	/* save the new block number for the first direct block */
+	new_blocks[index] = current_block;
+
+	/* total number of blocks allocated for direct blocks */
+	ret = count;
+	*err = 0;
+	return ret;
+failed_out:
+	for (i = 0; i <index; i++)
+		ext3_free_blocks(handle, inode, new_blocks[i], 1);
+	return ret;
+}
+
+/**
+ *	ext3_alloc_branch - allocate and set up a chain of blocks.
+ *	@handle: handle for this transaction
+ *	@inode: owner
+ *	@indirect_blks: number of allocated indirect blocks
+ *	@blks: number of allocated direct blocks
+ *	@goal: preferred place for allocation
+ *	@offsets: offsets (in the blocks) to store the pointers to next.
+ *	@branch: place to store the chain in.
+ *
+ *	This function allocates blocks, zeroes out all but the last one,
+ *	links them into chain and (if we are synchronous) writes them to disk.
+ *	In other words, it prepares a branch that can be spliced onto the
+ *	inode. It stores the information about that chain in the branch[], in
+ *	the same format as ext3_get_branch() would do. We are calling it after
+ *	we had read the existing part of chain and partial points to the last
+ *	triple of that (one with zero ->key). Upon the exit we have the same
+ *	picture as after the successful ext3_get_block(), except that in one
+ *	place chain is disconnected - *branch->p is still zero (we did not
+ *	set the last link), but branch->key contains the number that should
+ *	be placed into *branch->p to fill that gap.
+ *
+ *	If allocation fails we free all blocks we've allocated (and forget
+ *	their buffer_heads) and return the error value the from failed
+ *	ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ *	as described above and return 0.
+ */
+static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
+			int indirect_blks, int *blks, ext3_fsblk_t goal,
+			int *offsets, Indirect *branch)
+{
+	int blocksize = inode->i_sb->s_blocksize;
+	int i, n = 0;
+	int err = 0;
+	struct buffer_head *bh;
+	int num;
+	ext3_fsblk_t new_blocks[4];
+	ext3_fsblk_t current_block;
+
+	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
+				*blks, new_blocks, &err);
+	if (err)
+		return err;
+
+	branch[0].key = cpu_to_le32(new_blocks[0]);
+	/*
+	 * metadata blocks and data blocks are allocated.
+	 */
+	for (n = 1; n <= indirect_blks;  n++) {
+		/*
+		 * Get buffer_head for parent block, zero it out
+		 * and set the pointer to new one, then send
+		 * parent to disk.
+		 */
+		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		branch[n].bh = bh;
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		err = ext3_journal_get_create_access(handle, bh);
+		if (err) {
+			unlock_buffer(bh);
+			brelse(bh);
+			goto failed;
+		}
+
+		memset(bh->b_data, 0, blocksize);
+		branch[n].p = (__le32 *) bh->b_data + offsets[n];
+		branch[n].key = cpu_to_le32(new_blocks[n]);
+		*branch[n].p = branch[n].key;
+		if ( n == indirect_blks) {
+			current_block = new_blocks[n];
+			/*
+			 * End of chain, update the last new metablock of
+			 * the chain to point to the new allocated
+			 * data blocks numbers
+			 */
+			for (i=1; i < num; i++)
+				*(branch[n].p + i) = cpu_to_le32(++current_block);
+		}
+		BUFFER_TRACE(bh, "marking uptodate");
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+		err = ext3_journal_dirty_metadata(handle, bh);
+		if (err)
+			goto failed;
+	}
+	*blks = num;
+	return err;
+failed:
+	/* Allocation failed, free what we already allocated */
+	for (i = 1; i <= n ; i++) {
+		BUFFER_TRACE(branch[i].bh, "call journal_forget");
+		ext3_journal_forget(handle, branch[i].bh);
+	}
+	for (i = 0; i < indirect_blks; i++)
+		ext3_free_blocks(handle, inode, new_blocks[i], 1);
+
+	ext3_free_blocks(handle, inode, new_blocks[i], num);
+
+	return err;
+}
+
+/**
+ * ext3_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @where: location of missing link
+ * @num:   number of indirect blocks we are adding
+ * @blks:  number of direct blocks we are adding
+ *
+ * This function fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
+ */
+static int ext3_splice_branch(handle_t *handle, struct inode *inode,
+			long block, Indirect *where, int num, int blks)
+{
+	int i;
+	int err = 0;
+	struct ext3_block_alloc_info *block_i;
+	ext3_fsblk_t current_block;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct timespec now;
+
+	block_i = ei->i_block_alloc_info;
+	/*
+	 * If we're splicing into a [td]indirect block (as opposed to the
+	 * inode) then we need to get write access to the [td]indirect block
+	 * before the splice.
+	 */
+	if (where->bh) {
+		BUFFER_TRACE(where->bh, "get_write_access");
+		err = ext3_journal_get_write_access(handle, where->bh);
+		if (err)
+			goto err_out;
+	}
+	/* That's it */
+
+	*where->p = where->key;
+
+	/*
+	 * Update the host buffer_head or inode to point to more just allocated
+	 * direct blocks blocks
+	 */
+	if (num == 0 && blks > 1) {
+		current_block = le32_to_cpu(where->key) + 1;
+		for (i = 1; i < blks; i++)
+			*(where->p + i ) = cpu_to_le32(current_block++);
+	}
+
+	/*
+	 * update the most recently allocated logical & physical block
+	 * in i_block_alloc_info, to assist find the proper goal block for next
+	 * allocation
+	 */
+	if (block_i) {
+		block_i->last_alloc_logical_block = block + blks - 1;
+		block_i->last_alloc_physical_block =
+				le32_to_cpu(where[num].key) + blks - 1;
+	}
+
+	/* We are done with atomic stuff, now do the rest of housekeeping */
+	now = CURRENT_TIME_SEC;
+	if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
+		inode->i_ctime = now;
+		ext3_mark_inode_dirty(handle, inode);
+	}
+	/* ext3_mark_inode_dirty already updated i_sync_tid */
+	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
+
+	/* had we spliced it onto indirect block? */
+	if (where->bh) {
+		/*
+		 * If we spliced it onto an indirect block, we haven't
+		 * altered the inode.  Note however that if it is being spliced
+		 * onto an indirect block at the very end of the file (the
+		 * file is growing) then we *will* alter the inode to reflect
+		 * the new i_size.  But that is not done here - it is done in
+		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
+		 */
+		jbd_debug(5, "splicing indirect only\n");
+		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
+		err = ext3_journal_dirty_metadata(handle, where->bh);
+		if (err)
+			goto err_out;
+	} else {
+		/*
+		 * OK, we spliced it into the inode itself on a direct block.
+		 * Inode was dirtied above.
+		 */
+		jbd_debug(5, "splicing direct\n");
+	}
+	return err;
+
+err_out:
+	for (i = 1; i <= num; i++) {
+		BUFFER_TRACE(where[i].bh, "call journal_forget");
+		ext3_journal_forget(handle, where[i].bh);
+		ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
+	}
+	ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
+
+	return err;
+}
+
+/*
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ *
+ * `handle' can be NULL if create == 0.
+ *
+ * The BKL may not be held on entry here.  Be sure to take it early.
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ */
+int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
+		sector_t iblock, unsigned long maxblocks,
+		struct buffer_head *bh_result,
+		int create)
+{
+	int err = -EIO;
+	int offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	ext3_fsblk_t goal;
+	int indirect_blks;
+	int blocks_to_boundary = 0;
+	int depth;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	int count = 0;
+	ext3_fsblk_t first_block = 0;
+
+
+	trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
+	J_ASSERT(handle != NULL || create == 0);
+	depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
+
+	if (depth == 0)
+		goto out;
+
+	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+
+	/* Simplest case - block found, no allocation needed */
+	if (!partial) {
+		first_block = le32_to_cpu(chain[depth - 1].key);
+		clear_buffer_new(bh_result);
+		count++;
+		/*map more blocks*/
+		while (count < maxblocks && count <= blocks_to_boundary) {
+			ext3_fsblk_t blk;
+
+			if (!verify_chain(chain, chain + depth - 1)) {
+				/*
+				 * Indirect block might be removed by
+				 * truncate while we were reading it.
+				 * Handling of that case: forget what we've
+				 * got now. Flag the err as EAGAIN, so it
+				 * will reread.
+				 */
+				err = -EAGAIN;
+				count = 0;
+				break;
+			}
+			blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+			if (blk == first_block + count)
+				count++;
+			else
+				break;
+		}
+		if (err != -EAGAIN)
+			goto got_it;
+	}
+
+	/* Next simple case - plain lookup or failed read of indirect block */
+	if (!create || err == -EIO)
+		goto cleanup;
+
+	/*
+	 * Block out ext3_truncate while we alter the tree
+	 */
+	mutex_lock(&ei->truncate_mutex);
+
+	/*
+	 * If the indirect block is missing while we are reading
+	 * the chain(ext3_get_branch() returns -EAGAIN err), or
+	 * if the chain has been changed after we grab the semaphore,
+	 * (either because another process truncated this branch, or
+	 * another get_block allocated this branch) re-grab the chain to see if
+	 * the request block has been allocated or not.
+	 *
+	 * Since we already block the truncate/other get_block
+	 * at this point, we will have the current copy of the chain when we
+	 * splice the branch into the tree.
+	 */
+	if (err == -EAGAIN || !verify_chain(chain, partial)) {
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+		if (!partial) {
+			count++;
+			mutex_unlock(&ei->truncate_mutex);
+			if (err)
+				goto cleanup;
+			clear_buffer_new(bh_result);
+			goto got_it;
+		}
+	}
+
+	/*
+	 * Okay, we need to do block allocation.  Lazily initialize the block
+	 * allocation info here if necessary
+	*/
+	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
+		ext3_init_block_alloc_info(inode);
+
+	goal = ext3_find_goal(inode, iblock, partial);
+
+	/* the number of blocks need to allocate for [d,t]indirect blocks */
+	indirect_blks = (chain + depth) - partial - 1;
+
+	/*
+	 * Next look up the indirect map to count the totoal number of
+	 * direct blocks to allocate for this branch.
+	 */
+	count = ext3_blks_to_allocate(partial, indirect_blks,
+					maxblocks, blocks_to_boundary);
+	err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
+				offsets + (partial - chain), partial);
+
+	/*
+	 * The ext3_splice_branch call will free and forget any buffers
+	 * on the new chain if there is a failure, but that risks using
+	 * up transaction credits, especially for bitmaps where the
+	 * credits cannot be returned.  Can we handle this somehow?  We
+	 * may need to return -EAGAIN upwards in the worst case.  --sct
+	 */
+	if (!err)
+		err = ext3_splice_branch(handle, inode, iblock,
+					partial, indirect_blks, count);
+	mutex_unlock(&ei->truncate_mutex);
+	if (err)
+		goto cleanup;
+
+	set_buffer_new(bh_result);
+got_it:
+	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	if (count > blocks_to_boundary)
+		set_buffer_boundary(bh_result);
+	err = count;
+	/* Clean up and exit */
+	partial = chain + depth - 1;	/* the whole chain */
+cleanup:
+	while (partial > chain) {
+		BUFFER_TRACE(partial->bh, "call brelse");
+		brelse(partial->bh);
+		partial--;
+	}
+	BUFFER_TRACE(bh_result, "returned");
+out:
+	trace_ext3_get_blocks_exit(inode, iblock,
+				   depth ? le32_to_cpu(chain[depth-1].key) : 0,
+				   count, err);
+	return err;
+}
+
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+/*
+ * Number of credits we need for writing DIO_MAX_BLOCKS:
+ * We need sb + group descriptor + bitmap + inode -> 4
+ * For B blocks with A block pointers per block we need:
+ * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
+ * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ */
+#define DIO_CREDITS 25
+
+static int ext3_get_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	handle_t *handle = ext3_journal_current_handle();
+	int ret = 0, started = 0;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+	if (create && !handle) {	/* Direct IO write... */
+		if (max_blocks > DIO_MAX_BLOCKS)
+			max_blocks = DIO_MAX_BLOCKS;
+		handle = ext3_journal_start(inode, DIO_CREDITS +
+				EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		started = 1;
+	}
+
+	ret = ext3_get_blocks_handle(handle, inode, iblock,
+					max_blocks, bh_result, create);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+	if (started)
+		ext3_journal_stop(handle);
+out:
+	return ret;
+}
+
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext3_get_block);
+}
+
+/*
+ * `handle' can be NULL if create is zero
+ */
+struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
+				long block, int create, int *errp)
+{
+	struct buffer_head dummy;
+	int fatal = 0, err;
+
+	J_ASSERT(handle != NULL || create == 0);
+
+	dummy.b_state = 0;
+	dummy.b_blocknr = -1000;
+	buffer_trace_init(&dummy.b_history);
+	err = ext3_get_blocks_handle(handle, inode, block, 1,
+					&dummy, create);
+	/*
+	 * ext3_get_blocks_handle() returns number of blocks
+	 * mapped. 0 in case of a HOLE.
+	 */
+	if (err > 0) {
+		WARN_ON(err > 1);
+		err = 0;
+	}
+	*errp = err;
+	if (!err && buffer_mapped(&dummy)) {
+		struct buffer_head *bh;
+		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+		if (unlikely(!bh)) {
+			*errp = -ENOMEM;
+			goto err;
+		}
+		if (buffer_new(&dummy)) {
+			J_ASSERT(create != 0);
+			J_ASSERT(handle != NULL);
+
+			/*
+			 * Now that we do not always journal data, we should
+			 * keep in mind whether this should always journal the
+			 * new buffer as metadata.  For now, regular file
+			 * writes use ext3_get_block instead, so it's not a
+			 * problem.
+			 */
+			lock_buffer(bh);
+			BUFFER_TRACE(bh, "call get_create_access");
+			fatal = ext3_journal_get_create_access(handle, bh);
+			if (!fatal && !buffer_uptodate(bh)) {
+				memset(bh->b_data,0,inode->i_sb->s_blocksize);
+				set_buffer_uptodate(bh);
+			}
+			unlock_buffer(bh);
+			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+			err = ext3_journal_dirty_metadata(handle, bh);
+			if (!fatal)
+				fatal = err;
+		} else {
+			BUFFER_TRACE(bh, "not a new buffer");
+		}
+		if (fatal) {
+			*errp = fatal;
+			brelse(bh);
+			bh = NULL;
+		}
+		return bh;
+	}
+err:
+	return NULL;
+}
+
+struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
+			       int block, int create, int *err)
+{
+	struct buffer_head * bh;
+
+	bh = ext3_getblk(handle, inode, block, create, err);
+	if (!bh)
+		return bh;
+	if (bh_uptodate_or_lock(bh))
+		return bh;
+	get_bh(bh);
+	bh->b_end_io = end_buffer_read_sync;
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return bh;
+	put_bh(bh);
+	*err = -EIO;
+	return NULL;
+}
+
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+		block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/*
+ * To preserve ordering, it is essential that the hole instantiation and
+ * the data write be encapsulated in a single transaction.  We cannot
+ * close off a transaction and start a new one between the ext3_get_block()
+ * and the commit_write().  So doing the journal_start at the start of
+ * prepare_write() is the right place.
+ *
+ * Also, this function can nest inside ext3_writepage() ->
+ * block_write_full_page(). In that case, we *know* that ext3_writepage()
+ * has generated enough buffer credits to do the whole page.  So we won't
+ * block on the journal in that case, which is good, because the caller may
+ * be PF_MEMALLOC.
+ *
+ * By accident, ext3 can be reentered when a transaction is open via
+ * quota file writes.  If we were to commit the transaction while thus
+ * reentered, there can be a deadlock - we would be holding a quota
+ * lock, and the commit would never complete if another thread had a
+ * transaction open and was blocking on the quota lock - a ranking
+ * violation.
+ *
+ * So what we do is to rely on the fact that journal_stop/journal_start
+ * will _not_ run commit under these circumstances because handle->h_ref
+ * is elevated.  We'll still have enough credits for the tiny quotafile
+ * write.
+ */
+static int do_journal_get_write_access(handle_t *handle,
+					struct buffer_head *bh)
+{
+	int dirty = buffer_dirty(bh);
+	int ret;
+
+	if (!buffer_mapped(bh) || buffer_freed(bh))
+		return 0;
+	/*
+	 * __block_prepare_write() could have dirtied some buffers. Clean
+	 * the dirty bit as jbd2_journal_get_write_access() could complain
+	 * otherwise about fs integrity issues. Setting of the dirty bit
+	 * by __block_prepare_write() isn't a real problem here as we clear
+	 * the bit before releasing a page lock and thus writeback cannot
+	 * ever write the buffer.
+	 */
+	if (dirty)
+		clear_buffer_dirty(bh);
+	ret = ext3_journal_get_write_access(handle, bh);
+	if (!ret && dirty)
+		ret = ext3_journal_dirty_metadata(handle, bh);
+	return ret;
+}
+
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext3_truncate_failed_write(struct inode *inode)
+{
+	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	ext3_truncate(inode);
+}
+
+/*
+ * Truncate blocks that were not used by direct IO write. We have to zero out
+ * the last file block as well because direct IO might have written to it.
+ */
+static void ext3_truncate_failed_direct_write(struct inode *inode)
+{
+	ext3_block_truncate_page(inode, inode->i_size);
+	ext3_truncate(inode);
+}
+
+static int ext3_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	int ret;
+	handle_t *handle;
+	int retries = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned from, to;
+	/* Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason */
+	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
+
+	trace_ext3_write_begin(inode, pos, len, flags);
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+retry:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	handle = ext3_journal_start(inode, needed_blocks);
+	if (IS_ERR(handle)) {
+		unlock_page(page);
+		page_cache_release(page);
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	ret = __block_write_begin(page, pos, len, ext3_get_block);
+	if (ret)
+		goto write_begin_failed;
+
+	if (ext3_should_journal_data(inode)) {
+		ret = walk_page_buffers(handle, page_buffers(page),
+				from, to, NULL, do_journal_get_write_access);
+	}
+write_begin_failed:
+	if (ret) {
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before truncate
+		 * finishes. Do this only if ext3_can_truncate() agrees so
+		 * that orphan processing code is happy.
+		 */
+		if (pos + len > inode->i_size && ext3_can_truncate(inode))
+			ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		unlock_page(page);
+		page_cache_release(page);
+		if (pos + len > inode->i_size)
+			ext3_truncate_failed_write(inode);
+	}
+	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+out:
+	return ret;
+}
+
+
+int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
+{
+	int err = journal_dirty_data(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(__func__, __func__,
+						bh, handle, err);
+	return err;
+}
+
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * Write could have mapped the buffer but it didn't copy the data in
+	 * yet. So avoid filing such buffer into a transaction.
+	 */
+	if (buffer_mapped(bh) && buffer_uptodate(bh))
+		return ext3_journal_dirty_data(handle, bh);
+	return 0;
+}
+
+/* For write_end() in data=journal mode */
+static int write_end_fn(handle_t *handle, struct buffer_head *bh)
+{
+	if (!buffer_mapped(bh) || buffer_freed(bh))
+		return 0;
+	set_buffer_uptodate(bh);
+	return ext3_journal_dirty_metadata(handle, bh);
+}
+
+/*
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
+ */
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
+{
+	/* What matters to us is i_disksize. We don't write i_size anywhere */
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	if (pos + copied > EXT3_I(inode)->i_disksize) {
+		EXT3_I(inode)->i_disksize = pos + copied;
+		mark_inode_dirty(inode);
+	}
+}
+
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext3 never places buffers on inode->i_mapping->private_list.  metadata
+ * buffers are managed internally.
+ */
+static int ext3_ordered_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	handle_t *handle = ext3_journal_current_handle();
+	struct inode *inode = file->f_mapping->host;
+	unsigned from, to;
+	int ret = 0, ret2;
+
+	trace_ext3_ordered_write_end(inode, pos, len, copied);
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + copied;
+	ret = walk_page_buffers(handle, page_buffers(page),
+		from, to, NULL, journal_dirty_data_fn);
+
+	if (ret == 0)
+		update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
+		ext3_orphan_add(handle, inode);
+	ret2 = ext3_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (pos + len > inode->i_size)
+		ext3_truncate_failed_write(inode);
+	return ret ? ret : copied;
+}
+
+static int ext3_writeback_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	handle_t *handle = ext3_journal_current_handle();
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	trace_ext3_writeback_write_end(inode, pos, len, copied);
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
+		ext3_orphan_add(handle, inode);
+	ret = ext3_journal_stop(handle);
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (pos + len > inode->i_size)
+		ext3_truncate_failed_write(inode);
+	return ret ? ret : copied;
+}
+
+static int ext3_journalled_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	handle_t *handle = ext3_journal_current_handle();
+	struct inode *inode = mapping->host;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	int ret = 0, ret2;
+	int partial = 0;
+	unsigned from, to;
+
+	trace_ext3_journalled_write_end(inode, pos, len, copied);
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+	if (copied < len) {
+		if (!PageUptodate(page))
+			copied = 0;
+		page_zero_new_buffers(page, from + copied, to);
+		to = from + copied;
+	}
+
+	ret = walk_page_buffers(handle, page_buffers(page), from,
+				to, &partial, write_end_fn);
+	if (!partial)
+		SetPageUptodate(page);
+
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
+		ext3_orphan_add(handle, inode);
+	ext3_set_inode_state(inode, EXT3_STATE_JDATA);
+	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
+	if (inode->i_size > ei->i_disksize) {
+		ei->i_disksize = inode->i_size;
+		ret2 = ext3_mark_inode_dirty(handle, inode);
+		if (!ret)
+			ret = ret2;
+	}
+
+	ret2 = ext3_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (pos + len > inode->i_size)
+		ext3_truncate_failed_write(inode);
+	return ret ? ret : copied;
+}
+
+/*
+ * bmap() is special.  It gets used by applications such as lilo and by
+ * the swapper to find the on-disk block of a specific piece of data.
+ *
+ * Naturally, this is dangerous if the block concerned is still in the
+ * journal.  If somebody makes a swapfile on an ext3 data-journaling
+ * filesystem and enables swap, then they may get a nasty shock when the
+ * data getting swapped to that swapfile suddenly gets overwritten by
+ * the original zero's written out previously to the journal and
+ * awaiting writeback in the kernel's buffer cache.
+ *
+ * So, if we see any bmap calls here on a modified, data-journaled file,
+ * take extra steps to flush any blocks which might be in the cache.
+ */
+static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+	journal_t *journal;
+	int err;
+
+	if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
+		/*
+		 * This is a REALLY heavyweight approach, but the use of
+		 * bmap on dirty files is expected to be extremely rare:
+		 * only if we run lilo or swapon on a freshly made file
+		 * do we expect this to happen.
+		 *
+		 * (bmap requires CAP_SYS_RAWIO so this does not
+		 * represent an unprivileged user DOS attack --- we'd be
+		 * in trouble if mortal users could trigger this path at
+		 * will.)
+		 *
+		 * NB. EXT3_STATE_JDATA is not set on files other than
+		 * regular files.  If somebody wants to bmap a directory
+		 * or symlink and gets confused because the buffer
+		 * hasn't yet been flushed to disk, they deserve
+		 * everything they get.
+		 */
+
+		ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
+		journal = EXT3_JOURNAL(inode);
+		journal_lock_updates(journal);
+		err = journal_flush(journal);
+		journal_unlock_updates(journal);
+
+		if (err)
+			return 0;
+	}
+
+	return generic_block_bmap(mapping,block,ext3_get_block);
+}
+
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+	get_bh(bh);
+	return 0;
+}
+
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+	put_bh(bh);
+	return 0;
+}
+
+static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+	return !buffer_mapped(bh);
+}
+
+/*
+ * Note that whenever we need to map blocks we start a transaction even if
+ * we're not journalling data.  This is to preserve ordering: any hole
+ * instantiation within __block_write_full_page -> ext3_get_block() should be
+ * journalled along with the data so we don't crash and then get metadata which
+ * refers to old data.
+ *
+ * In all journalling modes block_write_full_page() will start the I/O.
+ *
+ * We don't honour synchronous mounts for writepage().  That would be
+ * disastrous.  Any write() or metadata operation will sync the fs for
+ * us.
+ */
+static int ext3_ordered_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *page_bufs;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	J_ASSERT(PageLocked(page));
+	/*
+	 * We don't want to warn for emergency remount. The condition is
+	 * ordered to avoid dereferencing inode->i_sb in non-error case to
+	 * avoid slow-downs.
+	 */
+	WARN_ON_ONCE(IS_RDONLY(inode) &&
+		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
+
+	/*
+	 * We give up here if we're reentered, because it might be for a
+	 * different filesystem.
+	 */
+	if (ext3_journal_current_handle())
+		goto out_fail;
+
+	trace_ext3_ordered_writepage(page);
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				(1 << BH_Dirty)|(1 << BH_Uptodate));
+		page_bufs = page_buffers(page);
+	} else {
+		page_bufs = page_buffers(page);
+		if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+				       NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
+	}
+	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
+
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_fail;
+	}
+
+	walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, bget_one);
+
+	ret = block_write_full_page(page, ext3_get_block, wbc);
+
+	/*
+	 * The page can become unlocked at any point now, and
+	 * truncate can then come in and change things.  So we
+	 * can't touch *page from now on.  But *page_bufs is
+	 * safe due to elevated refcount.
+	 */
+
+	/*
+	 * And attach them to the current transaction.  But only if
+	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
+	 * and generally junk.
+	 */
+	if (ret == 0)
+		ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+					NULL, journal_dirty_data_fn);
+	walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, bput_one);
+	err = ext3_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+
+out_fail:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return ret;
+}
+
+static int ext3_writeback_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	J_ASSERT(PageLocked(page));
+	/*
+	 * We don't want to warn for emergency remount. The condition is
+	 * ordered to avoid dereferencing inode->i_sb in non-error case to
+	 * avoid slow-downs.
+	 */
+	WARN_ON_ONCE(IS_RDONLY(inode) &&
+		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
+
+	if (ext3_journal_current_handle())
+		goto out_fail;
+
+	trace_ext3_writeback_writepage(page);
+	if (page_has_buffers(page)) {
+		if (!walk_page_buffers(NULL, page_buffers(page), 0,
+				      PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
+	}
+
+	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_fail;
+	}
+
+	ret = block_write_full_page(page, ext3_get_block, wbc);
+
+	err = ext3_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+
+out_fail:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return ret;
+}
+
+static int ext3_journalled_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	J_ASSERT(PageLocked(page));
+	/*
+	 * We don't want to warn for emergency remount. The condition is
+	 * ordered to avoid dereferencing inode->i_sb in non-error case to
+	 * avoid slow-downs.
+	 */
+	WARN_ON_ONCE(IS_RDONLY(inode) &&
+		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
+
+	trace_ext3_journalled_writepage(page);
+	if (!page_has_buffers(page) || PageChecked(page)) {
+		if (ext3_journal_current_handle())
+			goto no_write;
+
+		handle = ext3_journal_start(inode,
+					    ext3_writepage_trans_blocks(inode));
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto no_write;
+		}
+		/*
+		 * It's mmapped pagecache.  Add buffers and journal it.  There
+		 * doesn't seem much point in redirtying the page here.
+		 */
+		ClearPageChecked(page);
+		ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
+					  ext3_get_block);
+		if (ret != 0) {
+			ext3_journal_stop(handle);
+			goto out_unlock;
+		}
+		ret = walk_page_buffers(handle, page_buffers(page), 0,
+			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+
+		err = walk_page_buffers(handle, page_buffers(page), 0,
+				PAGE_CACHE_SIZE, NULL, write_end_fn);
+		if (ret == 0)
+			ret = err;
+		ext3_set_inode_state(inode, EXT3_STATE_JDATA);
+		atomic_set(&EXT3_I(inode)->i_datasync_tid,
+			   handle->h_transaction->t_tid);
+		unlock_page(page);
+		err = ext3_journal_stop(handle);
+		if (!ret)
+			ret = err;
+	} else {
+		/*
+		 * It is a page full of checkpoint-mode buffers. Go and write
+		 * them. They should have been already mapped when they went
+		 * to the journal so provide NULL get_block function to catch
+		 * errors.
+		 */
+		ret = block_write_full_page(page, NULL, wbc);
+	}
+out:
+	return ret;
+
+no_write:
+	redirty_page_for_writepage(wbc, page);
+out_unlock:
+	unlock_page(page);
+	goto out;
+}
+
+static int ext3_readpage(struct file *file, struct page *page)
+{
+	trace_ext3_readpage(page);
+	return mpage_readpage(page, ext3_get_block);
+}
+
+static int
+ext3_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
+}
+
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
+{
+	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+
+	trace_ext3_invalidatepage(page, offset, length);
+
+	/*
+	 * If it's a full truncate we just forget about the pending dirtying
+	 */
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
+		ClearPageChecked(page);
+
+	journal_invalidatepage(journal, page, offset, length);
+}
+
+static int ext3_releasepage(struct page *page, gfp_t wait)
+{
+	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+
+	trace_ext3_releasepage(page);
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+	return journal_try_to_free_buffers(journal, page, wait);
+}
+
+/*
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ * If the O_DIRECT write is intantiating holes inside i_size and the machine
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
+ */
+static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			      loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	handle_t *handle;
+	ssize_t ret;
+	int orphan = 0;
+	size_t count = iov_iter_count(iter);
+	int retries = 0;
+
+	trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+
+	if (iov_iter_rw(iter) == WRITE) {
+		loff_t final_size = offset + count;
+
+		if (final_size > inode->i_size) {
+			/* Credits for sb + inode write */
+			handle = ext3_journal_start(inode, 2);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				goto out;
+			}
+			ret = ext3_orphan_add(handle, inode);
+			if (ret) {
+				ext3_journal_stop(handle);
+				goto out;
+			}
+			orphan = 1;
+			ei->i_disksize = inode->i_size;
+			ext3_journal_stop(handle);
+		}
+	}
+
+retry:
+	ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block);
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + count;
+
+		if (end > isize)
+			ext3_truncate_failed_direct_write(inode);
+	}
+	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	if (orphan) {
+		int err;
+
+		/* Credits for sb + inode write */
+		handle = ext3_journal_start(inode, 2);
+		if (IS_ERR(handle)) {
+			/* This is really bad luck. We've written the data
+			 * but cannot extend i_size. Truncate allocated blocks
+			 * and pretend the write failed... */
+			ext3_truncate_failed_direct_write(inode);
+			ret = PTR_ERR(handle);
+			if (inode->i_nlink)
+				ext3_orphan_del(NULL, inode);
+			goto out;
+		}
+		if (inode->i_nlink)
+			ext3_orphan_del(handle, inode);
+		if (ret > 0) {
+			loff_t end = offset + ret;
+			if (end > inode->i_size) {
+				ei->i_disksize = end;
+				i_size_write(inode, end);
+				/*
+				 * We're going to return a positive `ret'
+				 * here due to non-zero-length I/O, so there's
+				 * no way of reporting error returns from
+				 * ext3_mark_inode_dirty() to userspace.  So
+				 * ignore it.
+				 */
+				ext3_mark_inode_dirty(handle, inode);
+			}
+		}
+		err = ext3_journal_stop(handle);
+		if (ret == 0)
+			ret = err;
+	}
+out:
+	trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
+	return ret;
+}
+
+/*
+ * Pages can be marked dirty completely asynchronously from ext3's journalling
+ * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
+ * much here because ->set_page_dirty is called under VFS locks.  The page is
+ * not necessarily locked.
+ *
+ * We cannot just dirty the page and leave attached buffers clean, because the
+ * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
+ * or jbddirty because all the journalling code will explode.
+ *
+ * So what we do is to mark the page "pending dirty" and next time writepage
+ * is called, propagate that into the buffers appropriately.
+ */
+static int ext3_journalled_set_page_dirty(struct page *page)
+{
+	SetPageChecked(page);
+	return __set_page_dirty_nobuffers(page);
+}
+
+static const struct address_space_operations ext3_ordered_aops = {
+	.readpage		= ext3_readpage,
+	.readpages		= ext3_readpages,
+	.writepage		= ext3_ordered_writepage,
+	.write_begin		= ext3_write_begin,
+	.write_end		= ext3_ordered_write_end,
+	.bmap			= ext3_bmap,
+	.invalidatepage		= ext3_invalidatepage,
+	.releasepage		= ext3_releasepage,
+	.direct_IO		= ext3_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
+	.error_remove_page	= generic_error_remove_page,
+};
+
+static const struct address_space_operations ext3_writeback_aops = {
+	.readpage		= ext3_readpage,
+	.readpages		= ext3_readpages,
+	.writepage		= ext3_writeback_writepage,
+	.write_begin		= ext3_write_begin,
+	.write_end		= ext3_writeback_write_end,
+	.bmap			= ext3_bmap,
+	.invalidatepage		= ext3_invalidatepage,
+	.releasepage		= ext3_releasepage,
+	.direct_IO		= ext3_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
+
+static const struct address_space_operations ext3_journalled_aops = {
+	.readpage		= ext3_readpage,
+	.readpages		= ext3_readpages,
+	.writepage		= ext3_journalled_writepage,
+	.write_begin		= ext3_write_begin,
+	.write_end		= ext3_journalled_write_end,
+	.set_page_dirty		= ext3_journalled_set_page_dirty,
+	.bmap			= ext3_bmap,
+	.invalidatepage		= ext3_invalidatepage,
+	.releasepage		= ext3_releasepage,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
+
+void ext3_set_aops(struct inode *inode)
+{
+	if (ext3_should_order_data(inode))
+		inode->i_mapping->a_ops = &ext3_ordered_aops;
+	else if (ext3_should_writeback_data(inode))
+		inode->i_mapping->a_ops = &ext3_writeback_aops;
+	else
+		inode->i_mapping->a_ops = &ext3_journalled_aops;
+}
+
+/*
+ * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+static int ext3_block_truncate_page(struct inode *inode, loff_t from)
+{
+	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+	unsigned blocksize, iblock, length, pos;
+	struct page *page;
+	handle_t *handle = NULL;
+	struct buffer_head *bh;
+	int err = 0;
+
+	/* Truncated on block boundary - nothing to do */
+	blocksize = inode->i_sb->s_blocksize;
+	if ((from & (blocksize - 1)) == 0)
+		return 0;
+
+	page = grab_cache_page(inode->i_mapping, index);
+	if (!page)
+		return -ENOMEM;
+	length = blocksize - (offset & (blocksize - 1));
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	err = 0;
+	if (buffer_freed(bh)) {
+		BUFFER_TRACE(bh, "freed: skip");
+		goto unlock;
+	}
+
+	if (!buffer_mapped(bh)) {
+		BUFFER_TRACE(bh, "unmapped");
+		ext3_get_block(inode, iblock, bh, 0);
+		/* unmapped? It's a hole - nothing to do */
+		if (!buffer_mapped(bh)) {
+			BUFFER_TRACE(bh, "still unmapped");
+			goto unlock;
+		}
+	}
+
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!bh_uptodate_or_lock(bh)) {
+		err = bh_submit_read(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (err)
+			goto unlock;
+	}
+
+	/* data=writeback mode doesn't need transaction to zero-out data */
+	if (!ext3_should_writeback_data(inode)) {
+		/* We journal at most one block */
+		handle = ext3_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			err = PTR_ERR(handle);
+			goto unlock;
+		}
+	}
+
+	if (ext3_should_journal_data(inode)) {
+		BUFFER_TRACE(bh, "get write access");
+		err = ext3_journal_get_write_access(handle, bh);
+		if (err)
+			goto stop;
+	}
+
+	zero_user(page, offset, length);
+	BUFFER_TRACE(bh, "zeroed end of block");
+
+	err = 0;
+	if (ext3_should_journal_data(inode)) {
+		err = ext3_journal_dirty_metadata(handle, bh);
+	} else {
+		if (ext3_should_order_data(inode))
+			err = ext3_journal_dirty_data(handle, bh);
+		mark_buffer_dirty(bh);
+	}
+stop:
+	if (handle)
+		ext3_journal_stop(handle);
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * Probably it should be a library function... search for first non-zero word
+ * or memcmp with zero_page, whatever is better for particular architecture.
+ * Linus?
+ */
+static inline int all_zeroes(__le32 *p, __le32 *q)
+{
+	while (p < q)
+		if (*p++)
+			return 0;
+	return 1;
+}
+
+/**
+ *	ext3_find_shared - find the indirect blocks for partial truncation.
+ *	@inode:	  inode in question
+ *	@depth:	  depth of the affected branch
+ *	@offsets: offsets of pointers in that branch (see ext3_block_to_path)
+ *	@chain:	  place to store the pointers to partial indirect blocks
+ *	@top:	  place to the (detached) top of branch
+ *
+ *	This is a helper function used by ext3_truncate().
+ *
+ *	When we do truncate() we may have to clean the ends of several
+ *	indirect blocks but leave the blocks themselves alive. Block is
+ *	partially truncated if some data below the new i_size is referred
+ *	from it (and it is on the path to the first completely truncated
+ *	data block, indeed).  We have to free the top of that path along
+ *	with everything to the right of the path. Since no allocation
+ *	past the truncation point is possible until ext3_truncate()
+ *	finishes, we may safely do the latter, but top of branch may
+ *	require special attention - pageout below the truncation point
+ *	might try to populate it.
+ *
+ *	We atomically detach the top of branch from the tree, store the
+ *	block number of its root in *@top, pointers to buffer_heads of
+ *	partially truncated blocks - in @chain[].bh and pointers to
+ *	their last elements that should not be removed - in
+ *	@chain[].p. Return value is the pointer to last filled element
+ *	of @chain.
+ *
+ *	The work left to caller to do the actual freeing of subtrees:
+ *		a) free the subtree starting from *@top
+ *		b) free the subtrees whose roots are stored in
+ *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ *		c) free the subtrees growing from the inode past the @chain[0].
+ *			(no partially truncated stuff there).  */
+
+static Indirect *ext3_find_shared(struct inode *inode, int depth,
+			int offsets[4], Indirect chain[4], __le32 *top)
+{
+	Indirect *partial, *p;
+	int k, err;
+
+	*top = 0;
+	/* Make k index the deepest non-null offset + 1 */
+	for (k = depth; k > 1 && !offsets[k-1]; k--)
+		;
+	partial = ext3_get_branch(inode, k, offsets, chain, &err);
+	/* Writer: pointers */
+	if (!partial)
+		partial = chain + k-1;
+	/*
+	 * If the branch acquired continuation since we've looked at it -
+	 * fine, it should all survive and (new) top doesn't belong to us.
+	 */
+	if (!partial->key && *partial->p)
+		/* Writer: end */
+		goto no_top;
+	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
+		;
+	/*
+	 * OK, we've found the last block that must survive. The rest of our
+	 * branch should be detached before unlocking. However, if that rest
+	 * of branch is all ours and does not grow immediately from the inode
+	 * it's easier to cheat and just decrement partial->p.
+	 */
+	if (p == chain + k - 1 && p > chain) {
+		p->p--;
+	} else {
+		*top = *p->p;
+		/* Nope, don't do this in ext3.  Must leave the tree intact */
+#if 0
+		*p->p = 0;
+#endif
+	}
+	/* Writer: end */
+
+	while(partial > p) {
+		brelse(partial->bh);
+		partial--;
+	}
+no_top:
+	return partial;
+}
+
+/*
+ * Zero a number of block pointers in either an inode or an indirect block.
+ * If we restart the transaction we must again get write access to the
+ * indirect block for further modification.
+ *
+ * We release `count' blocks on disk, but (last - first) may be greater
+ * than `count' because there can be holes in there.
+ */
+static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
+		struct buffer_head *bh, ext3_fsblk_t block_to_free,
+		unsigned long count, __le32 *first, __le32 *last)
+{
+	__le32 *p;
+	if (try_to_extend_transaction(handle, inode)) {
+		if (bh) {
+			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+			if (ext3_journal_dirty_metadata(handle, bh))
+				return;
+		}
+		ext3_mark_inode_dirty(handle, inode);
+		truncate_restart_transaction(handle, inode);
+		if (bh) {
+			BUFFER_TRACE(bh, "retaking write access");
+			if (ext3_journal_get_write_access(handle, bh))
+				return;
+		}
+	}
+
+	/*
+	 * Any buffers which are on the journal will be in memory. We find
+	 * them on the hash table so journal_revoke() will run journal_forget()
+	 * on them.  We've already detached each block from the file, so
+	 * bforget() in journal_forget() should be safe.
+	 *
+	 * AKPM: turn on bforget in journal_forget()!!!
+	 */
+	for (p = first; p < last; p++) {
+		u32 nr = le32_to_cpu(*p);
+		if (nr) {
+			struct buffer_head *bh;
+
+			*p = 0;
+			bh = sb_find_get_block(inode->i_sb, nr);
+			ext3_forget(handle, 0, inode, bh, nr);
+		}
+	}
+
+	ext3_free_blocks(handle, inode, block_to_free, count);
+}
+
+/**
+ * ext3_free_data - free a list of data blocks
+ * @handle:	handle for this transaction
+ * @inode:	inode we are dealing with
+ * @this_bh:	indirect buffer_head which contains *@first and *@last
+ * @first:	array of block numbers
+ * @last:	points immediately past the end of array
+ *
+ * We are freeing all blocks referred from that array (numbers are stored as
+ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+ *
+ * We accumulate contiguous runs of blocks to free.  Conveniently, if these
+ * blocks are contiguous then releasing them at one time will only affect one
+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+ * actually use a lot of journal space.
+ *
+ * @this_bh will be %NULL if @first and @last point into the inode's direct
+ * block pointers.
+ */
+static void ext3_free_data(handle_t *handle, struct inode *inode,
+			   struct buffer_head *this_bh,
+			   __le32 *first, __le32 *last)
+{
+	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
+	unsigned long count = 0;	    /* Number of blocks in the run */
+	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
+					       corresponding to
+					       block_to_free */
+	ext3_fsblk_t nr;		    /* Current block # */
+	__le32 *p;			    /* Pointer into inode/ind
+					       for current block */
+	int err;
+
+	if (this_bh) {				/* For indirect block */
+		BUFFER_TRACE(this_bh, "get_write_access");
+		err = ext3_journal_get_write_access(handle, this_bh);
+		/* Important: if we can't update the indirect pointers
+		 * to the blocks, we can't free them. */
+		if (err)
+			return;
+	}
+
+	for (p = first; p < last; p++) {
+		nr = le32_to_cpu(*p);
+		if (nr) {
+			/* accumulate blocks to free if they're contiguous */
+			if (count == 0) {
+				block_to_free = nr;
+				block_to_free_p = p;
+				count = 1;
+			} else if (nr == block_to_free + count) {
+				count++;
+			} else {
+				ext3_clear_blocks(handle, inode, this_bh,
+						  block_to_free,
+						  count, block_to_free_p, p);
+				block_to_free = nr;
+				block_to_free_p = p;
+				count = 1;
+			}
+		}
+	}
+
+	if (count > 0)
+		ext3_clear_blocks(handle, inode, this_bh, block_to_free,
+				  count, block_to_free_p, p);
+
+	if (this_bh) {
+		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
+
+		/*
+		 * The buffer head should have an attached journal head at this
+		 * point. However, if the data is corrupted and an indirect
+		 * block pointed to itself, it would have been detached when
+		 * the block was cleared. Check for this instead of OOPSing.
+		 */
+		if (bh2jh(this_bh))
+			ext3_journal_dirty_metadata(handle, this_bh);
+		else
+			ext3_error(inode->i_sb, "ext3_free_data",
+				   "circular indirect block detected, "
+				   "inode=%lu, block=%llu",
+				   inode->i_ino,
+				   (unsigned long long)this_bh->b_blocknr);
+	}
+}
+
+/**
+ *	ext3_free_branches - free an array of branches
+ *	@handle: JBD handle for this transaction
+ *	@inode:	inode we are dealing with
+ *	@parent_bh: the buffer_head which contains *@first and *@last
+ *	@first:	array of block numbers
+ *	@last:	pointer immediately past the end of array
+ *	@depth:	depth of the branches to free
+ *
+ *	We are freeing all blocks referred from these branches (numbers are
+ *	stored as little-endian 32-bit) and updating @inode->i_blocks
+ *	appropriately.
+ */
+static void ext3_free_branches(handle_t *handle, struct inode *inode,
+			       struct buffer_head *parent_bh,
+			       __le32 *first, __le32 *last, int depth)
+{
+	ext3_fsblk_t nr;
+	__le32 *p;
+
+	if (is_handle_aborted(handle))
+		return;
+
+	if (depth--) {
+		struct buffer_head *bh;
+		int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+		p = last;
+		while (--p >= first) {
+			nr = le32_to_cpu(*p);
+			if (!nr)
+				continue;		/* A hole */
+
+			/* Go read the buffer for the next level down */
+			bh = sb_bread(inode->i_sb, nr);
+
+			/*
+			 * A read failure? Report error and clear slot
+			 * (should be rare).
+			 */
+			if (!bh) {
+				ext3_error(inode->i_sb, "ext3_free_branches",
+					   "Read failure, inode=%lu, block="E3FSBLK,
+					   inode->i_ino, nr);
+				continue;
+			}
+
+			/* This zaps the entire block.  Bottom up. */
+			BUFFER_TRACE(bh, "free child branches");
+			ext3_free_branches(handle, inode, bh,
+					   (__le32*)bh->b_data,
+					   (__le32*)bh->b_data + addr_per_block,
+					   depth);
+
+			/*
+			 * Everything below this this pointer has been
+			 * released.  Now let this top-of-subtree go.
+			 *
+			 * We want the freeing of this indirect block to be
+			 * atomic in the journal with the updating of the
+			 * bitmap block which owns it.  So make some room in
+			 * the journal.
+			 *
+			 * We zero the parent pointer *after* freeing its
+			 * pointee in the bitmaps, so if extend_transaction()
+			 * for some reason fails to put the bitmap changes and
+			 * the release into the same transaction, recovery
+			 * will merely complain about releasing a free block,
+			 * rather than leaking blocks.
+			 */
+			if (is_handle_aborted(handle))
+				return;
+			if (try_to_extend_transaction(handle, inode)) {
+				ext3_mark_inode_dirty(handle, inode);
+				truncate_restart_transaction(handle, inode);
+			}
+
+			/*
+			 * We've probably journalled the indirect block several
+			 * times during the truncate.  But it's no longer
+			 * needed and we now drop it from the transaction via
+			 * journal_revoke().
+			 *
+			 * That's easy if it's exclusively part of this
+			 * transaction.  But if it's part of the committing
+			 * transaction then journal_forget() will simply
+			 * brelse() it.  That means that if the underlying
+			 * block is reallocated in ext3_get_block(),
+			 * unmap_underlying_metadata() will find this block
+			 * and will try to get rid of it.  damn, damn. Thus
+			 * we don't allow a block to be reallocated until
+			 * a transaction freeing it has fully committed.
+			 *
+			 * We also have to make sure journal replay after a
+			 * crash does not overwrite non-journaled data blocks
+			 * with old metadata when the block got reallocated for
+			 * data.  Thus we have to store a revoke record for a
+			 * block in the same transaction in which we free the
+			 * block.
+			 */
+			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+
+			ext3_free_blocks(handle, inode, nr, 1);
+
+			if (parent_bh) {
+				/*
+				 * The block which we have just freed is
+				 * pointed to by an indirect block: journal it
+				 */
+				BUFFER_TRACE(parent_bh, "get_write_access");
+				if (!ext3_journal_get_write_access(handle,
+								   parent_bh)){
+					*p = 0;
+					BUFFER_TRACE(parent_bh,
+					"call ext3_journal_dirty_metadata");
+					ext3_journal_dirty_metadata(handle,
+								    parent_bh);
+				}
+			}
+		}
+	} else {
+		/* We have reached the bottom of the tree. */
+		BUFFER_TRACE(parent_bh, "free data blocks");
+		ext3_free_data(handle, inode, parent_bh, first, last);
+	}
+}
+
+int ext3_can_truncate(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode))
+		return 1;
+	if (S_ISDIR(inode->i_mode))
+		return 1;
+	if (S_ISLNK(inode->i_mode))
+		return !ext3_inode_is_fast_symlink(inode);
+	return 0;
+}
+
+/*
+ * ext3_truncate()
+ *
+ * We block out ext3_get_block() block instantiations across the entire
+ * transaction, and VFS/VM ensures that ext3_truncate() cannot run
+ * simultaneously on behalf of the same inode.
+ *
+ * As we work through the truncate and commit bits of it to the journal there
+ * is one core, guiding principle: the file's tree must always be consistent on
+ * disk.  We must be able to restart the truncate after a crash.
+ *
+ * The file's tree may be transiently inconsistent in memory (although it
+ * probably isn't), but whenever we close off and commit a journal transaction,
+ * the contents of (the filesystem + the journal) must be consistent and
+ * restartable.  It's pretty simple, really: bottom up, right to left (although
+ * left-to-right works OK too).
+ *
+ * Note that at recovery time, journal replay occurs *before* the restart of
+ * truncate against the orphan inode list.
+ *
+ * The committed inode has the new, desired i_size (which is the same as
+ * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
+ * that this inode's truncate did not complete and it will again call
+ * ext3_truncate() to have another go.  So there will be instantiated blocks
+ * to the right of the truncation point in a crashed ext3 filesystem.  But
+ * that's fine - as long as they are linked from the inode, the post-crash
+ * ext3_truncate() run will find them and release them.
+ */
+void ext3_truncate(struct inode *inode)
+{
+	handle_t *handle;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	__le32 *i_data = ei->i_data;
+	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+	int offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	__le32 nr = 0;
+	int n;
+	long last_block;
+	unsigned blocksize = inode->i_sb->s_blocksize;
+
+	trace_ext3_truncate_enter(inode);
+
+	if (!ext3_can_truncate(inode))
+		goto out_notrans;
+
+	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+		ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
+
+	handle = start_transaction(inode);
+	if (IS_ERR(handle))
+		goto out_notrans;
+
+	last_block = (inode->i_size + blocksize-1)
+					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+	n = ext3_block_to_path(inode, last_block, offsets, NULL);
+	if (n == 0)
+		goto out_stop;	/* error */
+
+	/*
+	 * OK.  This truncate is going to happen.  We add the inode to the
+	 * orphan list, so that if this truncate spans multiple transactions,
+	 * and we crash, we will resume the truncate when the filesystem
+	 * recovers.  It also marks the inode dirty, to catch the new size.
+	 *
+	 * Implication: the file must always be in a sane, consistent
+	 * truncatable state while each transaction commits.
+	 */
+	if (ext3_orphan_add(handle, inode))
+		goto out_stop;
+
+	/*
+	 * The orphan list entry will now protect us from any crash which
+	 * occurs before the truncate completes, so it is now safe to propagate
+	 * the new, shorter inode size (held for now in i_size) into the
+	 * on-disk inode. We do this via i_disksize, which is the value which
+	 * ext3 *really* writes onto the disk inode.
+	 */
+	ei->i_disksize = inode->i_size;
+
+	/*
+	 * From here we block out all ext3_get_block() callers who want to
+	 * modify the block allocation tree.
+	 */
+	mutex_lock(&ei->truncate_mutex);
+
+	if (n == 1) {		/* direct blocks */
+		ext3_free_data(handle, inode, NULL, i_data+offsets[0],
+			       i_data + EXT3_NDIR_BLOCKS);
+		goto do_indirects;
+	}
+
+	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
+	/* Kill the top of shared branch (not detached) */
+	if (nr) {
+		if (partial == chain) {
+			/* Shared branch grows from the inode */
+			ext3_free_branches(handle, inode, NULL,
+					   &nr, &nr+1, (chain+n-1) - partial);
+			*partial->p = 0;
+			/*
+			 * We mark the inode dirty prior to restart,
+			 * and prior to stop.  No need for it here.
+			 */
+		} else {
+			/* Shared branch grows from an indirect block */
+			ext3_free_branches(handle, inode, partial->bh,
+					partial->p,
+					partial->p+1, (chain+n-1) - partial);
+		}
+	}
+	/* Clear the ends of indirect blocks on the shared branch */
+	while (partial > chain) {
+		ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
+				   (__le32*)partial->bh->b_data+addr_per_block,
+				   (chain+n-1) - partial);
+		BUFFER_TRACE(partial->bh, "call brelse");
+		brelse (partial->bh);
+		partial--;
+	}
+do_indirects:
+	/* Kill the remaining (whole) subtrees */
+	switch (offsets[0]) {
+	default:
+		nr = i_data[EXT3_IND_BLOCK];
+		if (nr) {
+			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+			i_data[EXT3_IND_BLOCK] = 0;
+		}
+	case EXT3_IND_BLOCK:
+		nr = i_data[EXT3_DIND_BLOCK];
+		if (nr) {
+			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+			i_data[EXT3_DIND_BLOCK] = 0;
+		}
+	case EXT3_DIND_BLOCK:
+		nr = i_data[EXT3_TIND_BLOCK];
+		if (nr) {
+			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+			i_data[EXT3_TIND_BLOCK] = 0;
+		}
+	case EXT3_TIND_BLOCK:
+		;
+	}
+
+	ext3_discard_reservation(inode);
+
+	mutex_unlock(&ei->truncate_mutex);
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	ext3_mark_inode_dirty(handle, inode);
+
+	/*
+	 * In a multi-transaction truncate, we only make the final transaction
+	 * synchronous
+	 */
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+out_stop:
+	/*
+	 * If this was a simple ftruncate(), and the file will remain alive
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext3_evict_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(handle, inode);
+
+	ext3_journal_stop(handle);
+	trace_ext3_truncate_exit(inode);
+	return;
+out_notrans:
+	/*
+	 * Delete the inode from orphan list so that it doesn't stay there
+	 * forever and trigger assertion on umount.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(NULL, inode);
+	trace_ext3_truncate_exit(inode);
+}
+
+static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
+		unsigned long ino, struct ext3_iloc *iloc)
+{
+	unsigned long block_group;
+	unsigned long offset;
+	ext3_fsblk_t block;
+	struct ext3_group_desc *gdp;
+
+	if (!ext3_valid_inum(sb, ino)) {
+		/*
+		 * This error is already checked for in namei.c unless we are
+		 * looking at an NFS filehandle, in which case no error
+		 * report is needed
+		 */
+		return 0;
+	}
+
+	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+	gdp = ext3_get_group_desc(sb, block_group, NULL);
+	if (!gdp)
+		return 0;
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
+		EXT3_INODE_SIZE(sb);
+	block = le32_to_cpu(gdp->bg_inode_table) +
+		(offset >> EXT3_BLOCK_SIZE_BITS(sb));
+
+	iloc->block_group = block_group;
+	iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
+	return block;
+}
+
+/*
+ * ext3_get_inode_loc returns with an extra refcount against the inode's
+ * underlying buffer_head on success. If 'in_mem' is true, we have all
+ * data in memory that is needed to recreate the on-disk version of this
+ * inode.
+ */
+static int __ext3_get_inode_loc(struct inode *inode,
+				struct ext3_iloc *iloc, int in_mem)
+{
+	ext3_fsblk_t block;
+	struct buffer_head *bh;
+
+	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
+	if (!block)
+		return -EIO;
+
+	bh = sb_getblk(inode->i_sb, block);
+	if (unlikely(!bh)) {
+		ext3_error (inode->i_sb, "ext3_get_inode_loc",
+				"unable to read inode block - "
+				"inode=%lu, block="E3FSBLK,
+				 inode->i_ino, block);
+		return -ENOMEM;
+	}
+	if (!buffer_uptodate(bh)) {
+		lock_buffer(bh);
+
+		/*
+		 * If the buffer has the write error flag, we have failed
+		 * to write out another inode in the same block.  In this
+		 * case, we don't have to read the block because we may
+		 * read the old inode data successfully.
+		 */
+		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+			set_buffer_uptodate(bh);
+
+		if (buffer_uptodate(bh)) {
+			/* someone brought it uptodate while we waited */
+			unlock_buffer(bh);
+			goto has_buffer;
+		}
+
+		/*
+		 * If we have all information of the inode in memory and this
+		 * is the only valid inode in the block, we need not read the
+		 * block.
+		 */
+		if (in_mem) {
+			struct buffer_head *bitmap_bh;
+			struct ext3_group_desc *desc;
+			int inodes_per_buffer;
+			int inode_offset, i;
+			int block_group;
+			int start;
+
+			block_group = (inode->i_ino - 1) /
+					EXT3_INODES_PER_GROUP(inode->i_sb);
+			inodes_per_buffer = bh->b_size /
+				EXT3_INODE_SIZE(inode->i_sb);
+			inode_offset = ((inode->i_ino - 1) %
+					EXT3_INODES_PER_GROUP(inode->i_sb));
+			start = inode_offset & ~(inodes_per_buffer - 1);
+
+			/* Is the inode bitmap in cache? */
+			desc = ext3_get_group_desc(inode->i_sb,
+						block_group, NULL);
+			if (!desc)
+				goto make_io;
+
+			bitmap_bh = sb_getblk(inode->i_sb,
+					le32_to_cpu(desc->bg_inode_bitmap));
+			if (unlikely(!bitmap_bh))
+				goto make_io;
+
+			/*
+			 * If the inode bitmap isn't in cache then the
+			 * optimisation may end up performing two reads instead
+			 * of one, so skip it.
+			 */
+			if (!buffer_uptodate(bitmap_bh)) {
+				brelse(bitmap_bh);
+				goto make_io;
+			}
+			for (i = start; i < start + inodes_per_buffer; i++) {
+				if (i == inode_offset)
+					continue;
+				if (ext3_test_bit(i, bitmap_bh->b_data))
+					break;
+			}
+			brelse(bitmap_bh);
+			if (i == start + inodes_per_buffer) {
+				/* all other inodes are free, so skip I/O */
+				memset(bh->b_data, 0, bh->b_size);
+				set_buffer_uptodate(bh);
+				unlock_buffer(bh);
+				goto has_buffer;
+			}
+		}
+
+make_io:
+		/*
+		 * There are other valid inodes in the buffer, this inode
+		 * has in-inode xattrs, or we don't have this inode in memory.
+		 * Read the block from disk.
+		 */
+		trace_ext3_load_inode(inode);
+		get_bh(bh);
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ | REQ_META | REQ_PRIO, bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			ext3_error(inode->i_sb, "ext3_get_inode_loc",
+					"unable to read inode block - "
+					"inode=%lu, block="E3FSBLK,
+					inode->i_ino, block);
+			brelse(bh);
+			return -EIO;
+		}
+	}
+has_buffer:
+	iloc->bh = bh;
+	return 0;
+}
+
+int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
+{
+	/* We have all inode data except xattrs in memory here. */
+	return __ext3_get_inode_loc(inode, iloc,
+		!ext3_test_inode_state(inode, EXT3_STATE_XATTR));
+}
+
+void ext3_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = EXT3_I(inode)->i_flags;
+
+	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	if (flags & EXT3_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+	if (flags & EXT3_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (flags & EXT3_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (flags & EXT3_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+	if (flags & EXT3_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+}
+
+/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
+void ext3_get_inode_flags(struct ext3_inode_info *ei)
+{
+	unsigned int flags = ei->vfs_inode.i_flags;
+
+	ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
+			EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		ei->i_flags |= EXT3_SYNC_FL;
+	if (flags & S_APPEND)
+		ei->i_flags |= EXT3_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		ei->i_flags |= EXT3_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		ei->i_flags |= EXT3_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		ei->i_flags |= EXT3_DIRSYNC_FL;
+}
+
+struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
+{
+	struct ext3_iloc iloc;
+	struct ext3_inode *raw_inode;
+	struct ext3_inode_info *ei;
+	struct buffer_head *bh;
+	struct inode *inode;
+	journal_t *journal = EXT3_SB(sb)->s_journal;
+	transaction_t *transaction;
+	long ret;
+	int block;
+	uid_t i_uid;
+	gid_t i_gid;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	ei = EXT3_I(inode);
+	ei->i_block_alloc_info = NULL;
+
+	ret = __ext3_get_inode_loc(inode, &iloc, 0);
+	if (ret < 0)
+		goto bad_inode;
+	bh = iloc.bh;
+	raw_inode = ext3_raw_inode(&iloc);
+	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+	if(!(test_opt (inode->i_sb, NO_UID32))) {
+		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+	}
+	i_uid_write(inode, i_uid);
+	i_gid_write(inode, i_gid);
+	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
+	inode->i_size = le32_to_cpu(raw_inode->i_size);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
+	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+
+	ei->i_state_flags = 0;
+	ei->i_dir_start_lookup = 0;
+	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+	/* We now have enough fields to check if the inode was active or not.
+	 * This is needed because nfsd might try to access dead inodes
+	 * the test is that same one that e2fsck uses
+	 * NeilBrown 1999oct15
+	 */
+	if (inode->i_nlink == 0) {
+		if (inode->i_mode == 0 ||
+		    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
+			/* this inode is deleted */
+			brelse (bh);
+			ret = -ESTALE;
+			goto bad_inode;
+		}
+		/* The only unlinked inodes we let through here have
+		 * valid i_mode and are being read by the orphan
+		 * recovery code: that's fine, we're about to complete
+		 * the process of deleting those. */
+	}
+	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
+	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+#ifdef EXT3_FRAGMENTS
+	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
+	ei->i_frag_no = raw_inode->i_frag;
+	ei->i_frag_size = raw_inode->i_fsize;
+#endif
+	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+	if (!S_ISREG(inode->i_mode)) {
+		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+	} else {
+		inode->i_size |=
+			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
+	}
+	ei->i_disksize = inode->i_size;
+	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+	ei->i_block_group = iloc.block_group;
+	/*
+	 * NOTE! The in-memory inode i_data array is in little-endian order
+	 * even on big-endian machines: we do NOT byteswap the block numbers!
+	 */
+	for (block = 0; block < EXT3_N_BLOCKS; block++)
+		ei->i_data[block] = raw_inode->i_block[block];
+	INIT_LIST_HEAD(&ei->i_orphan);
+
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		tid_t tid;
+
+		spin_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		spin_unlock(&journal->j_state_lock);
+		atomic_set(&ei->i_sync_tid, tid);
+		atomic_set(&ei->i_datasync_tid, tid);
+	}
+
+	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
+	    EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
+		/*
+		 * When mke2fs creates big inodes it does not zero out
+		 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
+		 * so ignore those first few inodes.
+		 */
+		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+		if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+		    EXT3_INODE_SIZE(inode->i_sb)) {
+			brelse (bh);
+			ret = -EIO;
+			goto bad_inode;
+		}
+		if (ei->i_extra_isize == 0) {
+			/* The extra space is currently unused. Use it. */
+			ei->i_extra_isize = sizeof(struct ext3_inode) -
+					    EXT3_GOOD_OLD_INODE_SIZE;
+		} else {
+			__le32 *magic = (void *)raw_inode +
+					EXT3_GOOD_OLD_INODE_SIZE +
+					ei->i_extra_isize;
+			if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
+				 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
+		}
+	} else
+		ei->i_extra_isize = 0;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &ext3_file_inode_operations;
+		inode->i_fop = &ext3_file_operations;
+		ext3_set_aops(inode);
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &ext3_dir_inode_operations;
+		inode->i_fop = &ext3_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (ext3_inode_is_fast_symlink(inode)) {
+			inode->i_op = &ext3_fast_symlink_inode_operations;
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
+			inode->i_op = &ext3_symlink_inode_operations;
+			ext3_set_aops(inode);
+		}
+	} else {
+		inode->i_op = &ext3_special_inode_operations;
+		if (raw_inode->i_block[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
+		else
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	}
+	brelse (iloc.bh);
+	ext3_set_inode_flags(inode);
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Post the struct inode info into an on-disk inode location in the
+ * buffer-cache.  This gobbles the caller's reference to the
+ * buffer_head in the inode location struct.
+ *
+ * The caller must have write access to iloc->bh.
+ */
+static int ext3_do_update_inode(handle_t *handle,
+				struct inode *inode,
+				struct ext3_iloc *iloc)
+{
+	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct buffer_head *bh = iloc->bh;
+	int err = 0, rc, block;
+	int need_datasync = 0;
+	__le32 disksize;
+	uid_t i_uid;
+	gid_t i_gid;
+
+again:
+	/* we can't allow multiple procs in here at once, its a bit racey */
+	lock_buffer(bh);
+
+	/* For fields not not tracking in the in-memory inode,
+	 * initialise them to zero for new inodes. */
+	if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
+		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
+
+	ext3_get_inode_flags(ei);
+	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+	i_uid = i_uid_read(inode);
+	i_gid = i_gid_read(inode);
+	if(!(test_opt(inode->i_sb, NO_UID32))) {
+		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
+/*
+ * Fix up interoperability with old kernels. Otherwise, old inodes get
+ * re-used with the upper 16 bits of the uid/gid intact
+ */
+		if(!ei->i_dtime) {
+			raw_inode->i_uid_high =
+				cpu_to_le16(high_16_bits(i_uid));
+			raw_inode->i_gid_high =
+				cpu_to_le16(high_16_bits(i_gid));
+		} else {
+			raw_inode->i_uid_high = 0;
+			raw_inode->i_gid_high = 0;
+		}
+	} else {
+		raw_inode->i_uid_low =
+			cpu_to_le16(fs_high2lowuid(i_uid));
+		raw_inode->i_gid_low =
+			cpu_to_le16(fs_high2lowgid(i_gid));
+		raw_inode->i_uid_high = 0;
+		raw_inode->i_gid_high = 0;
+	}
+	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+	disksize = cpu_to_le32(ei->i_disksize);
+	if (disksize != raw_inode->i_size) {
+		need_datasync = 1;
+		raw_inode->i_size = disksize;
+	}
+	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+#ifdef EXT3_FRAGMENTS
+	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
+	raw_inode->i_frag = ei->i_frag_no;
+	raw_inode->i_fsize = ei->i_frag_size;
+#endif
+	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+	if (!S_ISREG(inode->i_mode)) {
+		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
+	} else {
+		disksize = cpu_to_le32(ei->i_disksize >> 32);
+		if (disksize != raw_inode->i_size_high) {
+			raw_inode->i_size_high = disksize;
+			need_datasync = 1;
+		}
+		if (ei->i_disksize > 0x7fffffffULL) {
+			struct super_block *sb = inode->i_sb;
+			if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
+					EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
+			    EXT3_SB(sb)->s_es->s_rev_level ==
+					cpu_to_le32(EXT3_GOOD_OLD_REV)) {
+			       /* If this is the first large file
+				* created, add a flag to the superblock.
+				*/
+				unlock_buffer(bh);
+				err = ext3_journal_get_write_access(handle,
+						EXT3_SB(sb)->s_sbh);
+				if (err)
+					goto out_brelse;
+
+				ext3_update_dynamic_rev(sb);
+				EXT3_SET_RO_COMPAT_FEATURE(sb,
+					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
+				handle->h_sync = 1;
+				err = ext3_journal_dirty_metadata(handle,
+						EXT3_SB(sb)->s_sbh);
+				/* get our lock and start over */
+				goto again;
+			}
+		}
+	}
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			raw_inode->i_block[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			raw_inode->i_block[1] = 0;
+		} else {
+			raw_inode->i_block[0] = 0;
+			raw_inode->i_block[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			raw_inode->i_block[2] = 0;
+		}
+	} else for (block = 0; block < EXT3_N_BLOCKS; block++)
+		raw_inode->i_block[block] = ei->i_data[block];
+
+	if (ei->i_extra_isize)
+		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+
+	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+	unlock_buffer(bh);
+	rc = ext3_journal_dirty_metadata(handle, bh);
+	if (!err)
+		err = rc;
+	ext3_clear_inode_state(inode, EXT3_STATE_NEW);
+
+	atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
+	if (need_datasync)
+		atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
+out_brelse:
+	brelse (bh);
+	ext3_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * ext3_write_inode()
+ *
+ * We are called from a few places:
+ *
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
+ *   Here, there will be no transaction running. We wait for any running
+ *   transaction to commit.
+ *
+ * - Within flush work (for sys_sync(), kupdate and such).
+ *   We wait on commit, if told to.
+ *
+ * - Within iput_final() -> write_inode_now()
+ *   We wait on commit, if told to.
+ *
+ * In all cases it is actually safe for us to return without doing anything,
+ * because the inode has been copied into a raw inode buffer in
+ * ext3_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
+ * writeback.
+ *
+ * Note that we are absolutely dependent upon all inode dirtiers doing the
+ * right thing: they *must* call mark_inode_dirty() after dirtying info in
+ * which we are interested.
+ *
+ * It would be a bug for them to not do this.  The code:
+ *
+ *	mark_inode_dirty(inode)
+ *	stuff();
+ *	inode->i_size = expr;
+ *
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost.  Plus the inode will no longer be on the
+ * superblock's dirty inode list.
+ */
+int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
+		return 0;
+
+	if (ext3_journal_current_handle()) {
+		jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+		dump_stack();
+		return -EIO;
+	}
+
+	/*
+	 * No need to force transaction in WB_SYNC_NONE mode. Also
+	 * ext3_sync_fs() will force the commit after everything is
+	 * written.
+	 */
+	if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
+		return 0;
+
+	return ext3_force_commit(inode->i_sb);
+}
+
+/*
+ * ext3_setattr()
+ *
+ * Called from notify_change.
+ *
+ * We want to trap VFS attempts to truncate the file as soon as
+ * possible.  In particular, we want to make sure that when the VFS
+ * shrinks i_size, we put the inode on the orphan list and modify
+ * i_disksize immediately, so that during the subsequent flushing of
+ * dirty pages and freeing of disk blocks, we can guarantee that any
+ * commit will leave the blocks being flushed in an unused state on
+ * disk.  (On recovery, the inode will get truncated and the blocks will
+ * be freed, so we have a strong guarantee that no future commit will
+ * leave these blocks visible to the user.)
+ *
+ * Called with inode->sem down.
+ */
+int ext3_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error, rc = 0;
+	const unsigned int ia_valid = attr->ia_valid;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if (is_quota_modification(inode, attr))
+		dquot_initialize(inode);
+	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+		handle_t *handle;
+
+		/* (user+group)*(old+new) structure, inode write (sb,
+		 * inode block, ? - but truncate inode update has it) */
+		handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+					EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
+		if (IS_ERR(handle)) {
+			error = PTR_ERR(handle);
+			goto err_out;
+		}
+		error = dquot_transfer(inode, attr);
+		if (error) {
+			ext3_journal_stop(handle);
+			return error;
+		}
+		/* Update corresponding info in inode so that everything is in
+		 * one transaction */
+		if (attr->ia_valid & ATTR_UID)
+			inode->i_uid = attr->ia_uid;
+		if (attr->ia_valid & ATTR_GID)
+			inode->i_gid = attr->ia_gid;
+		error = ext3_mark_inode_dirty(handle, inode);
+		ext3_journal_stop(handle);
+	}
+
+	if (attr->ia_valid & ATTR_SIZE)
+		inode_dio_wait(inode);
+
+	if (S_ISREG(inode->i_mode) &&
+	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+		handle_t *handle;
+
+		handle = ext3_journal_start(inode, 3);
+		if (IS_ERR(handle)) {
+			error = PTR_ERR(handle);
+			goto err_out;
+		}
+
+		error = ext3_orphan_add(handle, inode);
+		if (error) {
+			ext3_journal_stop(handle);
+			goto err_out;
+		}
+		EXT3_I(inode)->i_disksize = attr->ia_size;
+		error = ext3_mark_inode_dirty(handle, inode);
+		ext3_journal_stop(handle);
+		if (error) {
+			/* Some hard fs error must have happened. Bail out. */
+			ext3_orphan_del(NULL, inode);
+			goto err_out;
+		}
+		rc = ext3_block_truncate_page(inode, attr->ia_size);
+		if (rc) {
+			/* Cleanup orphan list and exit */
+			handle = ext3_journal_start(inode, 3);
+			if (IS_ERR(handle)) {
+				ext3_orphan_del(NULL, inode);
+				goto err_out;
+			}
+			ext3_orphan_del(handle, inode);
+			ext3_journal_stop(handle);
+			goto err_out;
+		}
+	}
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		truncate_setsize(inode, attr->ia_size);
+		ext3_truncate(inode);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	if (ia_valid & ATTR_MODE)
+		rc = posix_acl_chmod(inode, inode->i_mode);
+
+err_out:
+	ext3_std_error(inode->i_sb, error);
+	if (!error)
+		error = rc;
+	return error;
+}
+
+
+/*
+ * How many blocks doth make a writepage()?
+ *
+ * With N blocks per page, it may be:
+ * N data blocks
+ * 2 indirect block
+ * 2 dindirect
+ * 1 tindirect
+ * N+5 bitmap blocks (from the above)
+ * N+5 group descriptor summary blocks
+ * 1 inode block
+ * 1 superblock.
+ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
+ *
+ * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
+ *
+ * With ordered or writeback data it's the same, less the N data blocks.
+ *
+ * If the inode's direct blocks can hold an integral number of pages then a
+ * page cannot straddle two indirect blocks, and we can only touch one indirect
+ * and dindirect block, and the "5" above becomes "3".
+ *
+ * This still overestimates under most circumstances.  If we were to pass the
+ * start and end offsets in here as well we could do block_to_path() on each
+ * block and work out the exact number of indirects which are touched.  Pah.
+ */
+
+static int ext3_writepage_trans_blocks(struct inode *inode)
+{
+	int bpp = ext3_journal_blocks_per_page(inode);
+	int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+	int ret;
+
+	if (ext3_should_journal_data(inode))
+		ret = 3 * (bpp + indirects) + 2;
+	else
+		ret = 2 * (bpp + indirects) + indirects + 2;
+
+#ifdef CONFIG_QUOTA
+	/* We know that structure was already allocated during dquot_initialize so
+	 * we will be updating only the data blocks + inodes */
+	ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+	return ret;
+}
+
+/*
+ * The caller must have previously called ext3_reserve_inode_write().
+ * Give this, we know that the caller already has write access to iloc->bh.
+ */
+int ext3_mark_iloc_dirty(handle_t *handle,
+		struct inode *inode, struct ext3_iloc *iloc)
+{
+	int err = 0;
+
+	/* the do_update_inode consumes one bh->b_count */
+	get_bh(iloc->bh);
+
+	/* ext3_do_update_inode() does journal_dirty_metadata */
+	err = ext3_do_update_inode(handle, inode, iloc);
+	put_bh(iloc->bh);
+	return err;
+}
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh.  This _must_ be cleaned up later.
+ */
+
+int
+ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+			 struct ext3_iloc *iloc)
+{
+	int err = 0;
+	if (handle) {
+		err = ext3_get_inode_loc(inode, iloc);
+		if (!err) {
+			BUFFER_TRACE(iloc->bh, "get_write_access");
+			err = ext3_journal_get_write_access(handle, iloc->bh);
+			if (err) {
+				brelse(iloc->bh);
+				iloc->bh = NULL;
+			}
+		}
+	}
+	ext3_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * What we do here is to mark the in-core inode as clean with respect to inode
+ * dirtiness (it may still be data-dirty).
+ * This means that the in-core inode may be reaped by prune_icache
+ * without having to perform any I/O.  This is a very good thing,
+ * because *any* task may call prune_icache - even ones which
+ * have a transaction open against a different journal.
+ *
+ * Is this cheating?  Not really.  Sure, we haven't written the
+ * inode out, but prune_icache isn't a user-visible syncing function.
+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
+ * we start and wait on commits.
+ */
+int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
+{
+	struct ext3_iloc iloc;
+	int err;
+
+	might_sleep();
+	trace_ext3_mark_inode_dirty(inode, _RET_IP_);
+	err = ext3_reserve_inode_write(handle, inode, &iloc);
+	if (!err)
+		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+	return err;
+}
+
+/*
+ * ext3_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We're really interested in the case where a file is being extended.
+ * i_size has been changed by generic_commit_write() and we thus need
+ * to include the updated inode in the current transaction.
+ *
+ * Also, dquot_alloc_space() will always dirty the inode when blocks
+ * are allocated to the file.
+ *
+ * If the inode is marked synchronous, we don't honour that here - doing
+ * so would cause a commit on atime updates, which we don't bother doing.
+ * We handle synchronous inodes at the highest possible level.
+ */
+void ext3_dirty_inode(struct inode *inode, int flags)
+{
+	handle_t *current_handle = ext3_journal_current_handle();
+	handle_t *handle;
+
+	handle = ext3_journal_start(inode, 2);
+	if (IS_ERR(handle))
+		goto out;
+	if (current_handle &&
+		current_handle->h_transaction != handle->h_transaction) {
+		/* This task has a transaction open against a different fs */
+		printk(KERN_EMERG "%s: transactions do not match!\n",
+		       __func__);
+	} else {
+		jbd_debug(5, "marking dirty.  outer handle=%p\n",
+				current_handle);
+		ext3_mark_inode_dirty(handle, inode);
+	}
+	ext3_journal_stop(handle);
+out:
+	return;
+}
+
+#if 0
+/*
+ * Bind an inode's backing buffer_head into this transaction, to prevent
+ * it from being flushed to disk early.  Unlike
+ * ext3_reserve_inode_write, this leaves behind no bh reference and
+ * returns no iloc structure, so the caller needs to repeat the iloc
+ * lookup to mark the inode dirty later.
+ */
+static int ext3_pin_inode(handle_t *handle, struct inode *inode)
+{
+	struct ext3_iloc iloc;
+
+	int err = 0;
+	if (handle) {
+		err = ext3_get_inode_loc(inode, &iloc);
+		if (!err) {
+			BUFFER_TRACE(iloc.bh, "get_write_access");
+			err = journal_get_write_access(handle, iloc.bh);
+			if (!err)
+				err = ext3_journal_dirty_metadata(handle,
+								  iloc.bh);
+			brelse(iloc.bh);
+		}
+	}
+	ext3_std_error(inode->i_sb, err);
+	return err;
+}
+#endif
+
+int ext3_change_inode_journal_flag(struct inode *inode, int val)
+{
+	journal_t *journal;
+	handle_t *handle;
+	int err;
+
+	/*
+	 * We have to be very careful here: changing a data block's
+	 * journaling status dynamically is dangerous.  If we write a
+	 * data block to the journal, change the status and then delete
+	 * that block, we risk forgetting to revoke the old log record
+	 * from the journal and so a subsequent replay can corrupt data.
+	 * So, first we make sure that the journal is empty and that
+	 * nobody is changing anything.
+	 */
+
+	journal = EXT3_JOURNAL(inode);
+	if (is_journal_aborted(journal))
+		return -EROFS;
+
+	journal_lock_updates(journal);
+	journal_flush(journal);
+
+	/*
+	 * OK, there are no updates running now, and all cached data is
+	 * synced to disk.  We are now in a completely consistent state
+	 * which doesn't have anything in the journal, and we know that
+	 * no filesystem updates are running, so it is safe to modify
+	 * the inode's in-core data-journaling state flag now.
+	 */
+
+	if (val)
+		EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
+	else
+		EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
+	ext3_set_aops(inode);
+
+	journal_unlock_updates(journal);
+
+	/* Finally we can mark the inode as dirty. */
+
+	handle = ext3_journal_start(inode, 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	err = ext3_mark_inode_dirty(handle, inode);
+	handle->h_sync = 1;
+	ext3_journal_stop(handle);
+	ext3_std_error(inode->i_sb, err);
+
+	return err;
+}
diff --git a/kernel/fs/ext3/ioctl.c b/kernel/fs/ext3/ioctl.c
new file mode 100644
index 000000000..4d96e9a64
--- /dev/null
+++ b/kernel/fs/ext3/ioctl.c
@@ -0,0 +1,327 @@
+/*
+ * linux/fs/ext3/ioctl.c
+ *
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/mount.h>
+#include <linux/compat.h>
+#include <asm/uaccess.h>
+#include "ext3.h"
+
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	unsigned int flags;
+	unsigned short rsv_window_size;
+
+	ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+	switch (cmd) {
+	case EXT3_IOC_GETFLAGS:
+		ext3_get_inode_flags(ei);
+		flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *) arg);
+	case EXT3_IOC_SETFLAGS: {
+		handle_t *handle = NULL;
+		int err;
+		struct ext3_iloc iloc;
+		unsigned int oldflags;
+		unsigned int jflag;
+
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+
+		flags = ext3_mask_flags(inode->i_mode, flags);
+
+		mutex_lock(&inode->i_mutex);
+
+		/* Is it quota file? Do not allow user to mess with it */
+		err = -EPERM;
+		if (IS_NOQUOTA(inode))
+			goto flags_out;
+
+		oldflags = ei->i_flags;
+
+		/* The JOURNAL_DATA flag is modifiable only by root */
+		jflag = flags & EXT3_JOURNAL_DATA_FL;
+
+		/*
+		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+		 * the relevant capability.
+		 *
+		 * This test looks nicer. Thanks to Pauline Middelink
+		 */
+		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
+			if (!capable(CAP_LINUX_IMMUTABLE))
+				goto flags_out;
+		}
+
+		/*
+		 * The JOURNAL_DATA flag can only be changed by
+		 * the relevant capability.
+		 */
+		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
+			if (!capable(CAP_SYS_RESOURCE))
+				goto flags_out;
+		}
+
+		handle = ext3_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto flags_out;
+		}
+		if (IS_SYNC(inode))
+			handle->h_sync = 1;
+		err = ext3_reserve_inode_write(handle, inode, &iloc);
+		if (err)
+			goto flags_err;
+
+		flags = flags & EXT3_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
+		ei->i_flags = flags;
+
+		ext3_set_inode_flags(inode);
+		inode->i_ctime = CURRENT_TIME_SEC;
+
+		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+		ext3_journal_stop(handle);
+		if (err)
+			goto flags_out;
+
+		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
+			err = ext3_change_inode_journal_flag(inode, jflag);
+flags_out:
+		mutex_unlock(&inode->i_mutex);
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case EXT3_IOC_GETVERSION:
+	case EXT3_IOC_GETVERSION_OLD:
+		return put_user(inode->i_generation, (int __user *) arg);
+	case EXT3_IOC_SETVERSION:
+	case EXT3_IOC_SETVERSION_OLD: {
+		handle_t *handle;
+		struct ext3_iloc iloc;
+		__u32 generation;
+		int err;
+
+		if (!inode_owner_or_capable(inode))
+			return -EPERM;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		if (get_user(generation, (int __user *) arg)) {
+			err = -EFAULT;
+			goto setversion_out;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		handle = ext3_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto unlock_out;
+		}
+		err = ext3_reserve_inode_write(handle, inode, &iloc);
+		if (err == 0) {
+			inode->i_ctime = CURRENT_TIME_SEC;
+			inode->i_generation = generation;
+			err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+		}
+		ext3_journal_stop(handle);
+
+unlock_out:
+		mutex_unlock(&inode->i_mutex);
+setversion_out:
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case EXT3_IOC_GETRSVSZ:
+		if (test_opt(inode->i_sb, RESERVATION)
+			&& S_ISREG(inode->i_mode)
+			&& ei->i_block_alloc_info) {
+			rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
+			return put_user(rsv_window_size, (int __user *)arg);
+		}
+		return -ENOTTY;
+	case EXT3_IOC_SETRSVSZ: {
+		int err;
+
+		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
+			return -ENOTTY;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+
+		if (!inode_owner_or_capable(inode)) {
+			err = -EACCES;
+			goto setrsvsz_out;
+		}
+
+		if (get_user(rsv_window_size, (int __user *)arg)) {
+			err = -EFAULT;
+			goto setrsvsz_out;
+		}
+
+		if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
+			rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
+
+		/*
+		 * need to allocate reservation structure for this inode
+		 * before set the window size
+		 */
+		mutex_lock(&ei->truncate_mutex);
+		if (!ei->i_block_alloc_info)
+			ext3_init_block_alloc_info(inode);
+
+		if (ei->i_block_alloc_info){
+			struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
+			rsv->rsv_goal_size = rsv_window_size;
+		}
+		mutex_unlock(&ei->truncate_mutex);
+setrsvsz_out:
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case EXT3_IOC_GROUP_EXTEND: {
+		ext3_fsblk_t n_blocks_count;
+		struct super_block *sb = inode->i_sb;
+		int err, err2;
+
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+
+		if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+			err = -EFAULT;
+			goto group_extend_out;
+		}
+		err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
+		journal_lock_updates(EXT3_SB(sb)->s_journal);
+		err2 = journal_flush(EXT3_SB(sb)->s_journal);
+		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
+group_extend_out:
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case EXT3_IOC_GROUP_ADD: {
+		struct ext3_new_group_data input;
+		struct super_block *sb = inode->i_sb;
+		int err, err2;
+
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+
+		if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
+				sizeof(input))) {
+			err = -EFAULT;
+			goto group_add_out;
+		}
+
+		err = ext3_group_add(sb, &input);
+		journal_lock_updates(EXT3_SB(sb)->s_journal);
+		err2 = journal_flush(EXT3_SB(sb)->s_journal);
+		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
+group_add_out:
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case FITRIM: {
+
+		struct super_block *sb = inode->i_sb;
+		struct fstrim_range range;
+		int ret = 0;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+				   sizeof(range)))
+			return -EFAULT;
+
+		ret = ext3_trim_fs(sb, &range);
+		if (ret < 0)
+			return ret;
+
+		if (copy_to_user((struct fstrim_range __user *)arg, &range,
+				 sizeof(range)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case EXT3_IOC32_GETFLAGS:
+		cmd = EXT3_IOC_GETFLAGS;
+		break;
+	case EXT3_IOC32_SETFLAGS:
+		cmd = EXT3_IOC_SETFLAGS;
+		break;
+	case EXT3_IOC32_GETVERSION:
+		cmd = EXT3_IOC_GETVERSION;
+		break;
+	case EXT3_IOC32_SETVERSION:
+		cmd = EXT3_IOC_SETVERSION;
+		break;
+	case EXT3_IOC32_GROUP_EXTEND:
+		cmd = EXT3_IOC_GROUP_EXTEND;
+		break;
+	case EXT3_IOC32_GETVERSION_OLD:
+		cmd = EXT3_IOC_GETVERSION_OLD;
+		break;
+	case EXT3_IOC32_SETVERSION_OLD:
+		cmd = EXT3_IOC_SETVERSION_OLD;
+		break;
+#ifdef CONFIG_JBD_DEBUG
+	case EXT3_IOC32_WAIT_FOR_READONLY:
+		cmd = EXT3_IOC_WAIT_FOR_READONLY;
+		break;
+#endif
+	case EXT3_IOC32_GETRSVSZ:
+		cmd = EXT3_IOC_GETRSVSZ;
+		break;
+	case EXT3_IOC32_SETRSVSZ:
+		cmd = EXT3_IOC_SETRSVSZ;
+		break;
+	case EXT3_IOC_GROUP_ADD:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
diff --git a/kernel/fs/ext3/namei.c b/kernel/fs/ext3/namei.c
new file mode 100644
index 000000000..4264b9bd0
--- /dev/null
+++ b/kernel/fs/ext3/namei.c
@@ -0,0 +1,2585 @@
+/*
+ *  linux/fs/ext3/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  Directory entry file type support and forward compatibility hooks
+ *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+ *  Hash Tree Directory indexing (c)
+ *	Daniel Phillips, 2001
+ *  Hash Tree Directory indexing porting
+ *	Christopher Li, 2002
+ *  Hash Tree Directory indexing cleanup
+ *	Theodore Ts'o, 2002
+ */
+
+#include <linux/quotaops.h>
+#include "ext3.h"
+#include "namei.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * define how far ahead to read directories while searching them.
+ */
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+
+static struct buffer_head *ext3_append(handle_t *handle,
+					struct inode *inode,
+					u32 *block, int *err)
+{
+	struct buffer_head *bh;
+
+	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+	if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
+		inode->i_size += inode->i_sb->s_blocksize;
+		EXT3_I(inode)->i_disksize = inode->i_size;
+		*err = ext3_journal_get_write_access(handle, bh);
+		if (*err) {
+			brelse(bh);
+			bh = NULL;
+		}
+	}
+	return bh;
+}
+
+#ifndef assert
+#define assert(test) J_ASSERT(test)
+#endif
+
+#ifdef DX_DEBUG
+#define dxtrace(command) command
+#else
+#define dxtrace(command)
+#endif
+
+struct fake_dirent
+{
+	__le32 inode;
+	__le16 rec_len;
+	u8 name_len;
+	u8 file_type;
+};
+
+struct dx_countlimit
+{
+	__le16 limit;
+	__le16 count;
+};
+
+struct dx_entry
+{
+	__le32 hash;
+	__le32 block;
+};
+
+/*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero.  Therefore, the
+ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+ */
+
+struct dx_root
+{
+	struct fake_dirent dot;
+	char dot_name[4];
+	struct fake_dirent dotdot;
+	char dotdot_name[4];
+	struct dx_root_info
+	{
+		__le32 reserved_zero;
+		u8 hash_version;
+		u8 info_length; /* 8 */
+		u8 indirect_levels;
+		u8 unused_flags;
+	}
+	info;
+	struct dx_entry	entries[0];
+};
+
+struct dx_node
+{
+	struct fake_dirent fake;
+	struct dx_entry	entries[0];
+};
+
+
+struct dx_frame
+{
+	struct buffer_head *bh;
+	struct dx_entry *entries;
+	struct dx_entry *at;
+};
+
+struct dx_map_entry
+{
+	u32 hash;
+	u16 offs;
+	u16 size;
+};
+
+static inline unsigned dx_get_block (struct dx_entry *entry);
+static void dx_set_block (struct dx_entry *entry, unsigned value);
+static inline unsigned dx_get_hash (struct dx_entry *entry);
+static void dx_set_hash (struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count (struct dx_entry *entries);
+static unsigned dx_get_limit (struct dx_entry *entries);
+static void dx_set_count (struct dx_entry *entries, unsigned value);
+static void dx_set_limit (struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit (struct inode *dir);
+static struct dx_frame *dx_probe(struct qstr *entry,
+				 struct inode *dir,
+				 struct dx_hash_info *hinfo,
+				 struct dx_frame *frame,
+				 int *err);
+static void dx_release (struct dx_frame *frames);
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
+			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+		struct dx_map_entry *offsets, int count);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+				 struct dx_frame *frame,
+				 struct dx_frame *frames,
+				 __u32 *start_hash);
+static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
+			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
+			int *err);
+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode);
+
+/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext3_dir_entry_2 *
+ext3_next_entry(struct ext3_dir_entry_2 *p)
+{
+	return (struct ext3_dir_entry_2 *)((char *)p +
+		ext3_rec_len_from_disk(p->rec_len));
+}
+
+/*
+ * Future: use high four bits of block for coalesce-on-delete flags
+ * Mask them off for now.
+ */
+
+static inline unsigned dx_get_block (struct dx_entry *entry)
+{
+	return le32_to_cpu(entry->block) & 0x00ffffff;
+}
+
+static inline void dx_set_block (struct dx_entry *entry, unsigned value)
+{
+	entry->block = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_hash (struct dx_entry *entry)
+{
+	return le32_to_cpu(entry->hash);
+}
+
+static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+{
+	entry->hash = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_count (struct dx_entry *entries)
+{
+	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+}
+
+static inline unsigned dx_get_limit (struct dx_entry *entries)
+{
+	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+}
+
+static inline void dx_set_count (struct dx_entry *entries, unsigned value)
+{
+	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+}
+
+static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+{
+	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+}
+
+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
+{
+	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
+		EXT3_DIR_REC_LEN(2) - infosize;
+	return entry_space / sizeof(struct dx_entry);
+}
+
+static inline unsigned dx_node_limit (struct inode *dir)
+{
+	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
+	return entry_space / sizeof(struct dx_entry);
+}
+
+/*
+ * Debug
+ */
+#ifdef DX_DEBUG
+static void dx_show_index (char * label, struct dx_entry *entries)
+{
+        int i, n = dx_get_count (entries);
+        printk("%s index ", label);
+        for (i = 0; i < n; i++)
+        {
+                printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
+        }
+        printk("\n");
+}
+
+struct stats
+{
+	unsigned names;
+	unsigned space;
+	unsigned bcount;
+};
+
+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
+				 int size, int show_names)
+{
+	unsigned names = 0, space = 0;
+	char *base = (char *) de;
+	struct dx_hash_info h = *hinfo;
+
+	printk("names: ");
+	while ((char *) de < base + size)
+	{
+		if (de->inode)
+		{
+			if (show_names)
+			{
+				int len = de->name_len;
+				char *name = de->name;
+				while (len--) printk("%c", *name++);
+				ext3fs_dirhash(de->name, de->name_len, &h);
+				printk(":%x.%u ", h.hash,
+				       (unsigned) ((char *) de - base));
+			}
+			space += EXT3_DIR_REC_LEN(de->name_len);
+			names++;
+		}
+		de = ext3_next_entry(de);
+	}
+	printk("(%i)\n", names);
+	return (struct stats) { names, space, 1 };
+}
+
+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+			     struct dx_entry *entries, int levels)
+{
+	unsigned blocksize = dir->i_sb->s_blocksize;
+	unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+	unsigned bcount = 0;
+	struct buffer_head *bh;
+	int err;
+	printk("%i indexed blocks...\n", count);
+	for (i = 0; i < count; i++, entries++)
+	{
+		u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
+		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
+		struct stats stats;
+		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
+		if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
+		stats = levels?
+		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
+		   dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
+		names += stats.names;
+		space += stats.space;
+		bcount += stats.bcount;
+		brelse (bh);
+	}
+	if (bcount)
+		printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
+			names, space/bcount,(space/bcount)*100/blocksize);
+	return (struct stats) { names, space, bcount};
+}
+#endif /* DX_DEBUG */
+
+/*
+ * Probe for a directory leaf block to search.
+ *
+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * error in the directory index, and the caller should fall back to
+ * searching the directory normally.  The callers of dx_probe **MUST**
+ * check for this error code, and make sure it never gets reflected
+ * back to userspace.
+ */
+static struct dx_frame *
+dx_probe(struct qstr *entry, struct inode *dir,
+	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+{
+	unsigned count, indirect;
+	struct dx_entry *at, *entries, *p, *q, *m;
+	struct dx_root *root;
+	struct buffer_head *bh;
+	struct dx_frame *frame = frame_in;
+	u32 hash;
+
+	frame->bh = NULL;
+	if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+	root = (struct dx_root *) bh->b_data;
+	if (root->info.hash_version != DX_HASH_TEA &&
+	    root->info.hash_version != DX_HASH_HALF_MD4 &&
+	    root->info.hash_version != DX_HASH_LEGACY) {
+		ext3_warning(dir->i_sb, __func__,
+			     "Unrecognised inode hash code %d",
+			     root->info.hash_version);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+	hinfo->hash_version = root->info.hash_version;
+	if (hinfo->hash_version <= DX_HASH_TEA)
+		hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
+	hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+	if (entry)
+		ext3fs_dirhash(entry->name, entry->len, hinfo);
+	hash = hinfo->hash;
+
+	if (root->info.unused_flags & 1) {
+		ext3_warning(dir->i_sb, __func__,
+			     "Unimplemented inode hash flags: %#06x",
+			     root->info.unused_flags);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
+	if ((indirect = root->info.indirect_levels) > 1) {
+		ext3_warning(dir->i_sb, __func__,
+			     "Unimplemented inode hash depth: %#06x",
+			     root->info.indirect_levels);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
+	entries = (struct dx_entry *) (((char *)&root->info) +
+				       root->info.info_length);
+
+	if (dx_get_limit(entries) != dx_root_limit(dir,
+						   root->info.info_length)) {
+		ext3_warning(dir->i_sb, __func__,
+			     "dx entry: limit != root limit");
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
+	dxtrace (printk("Look up %x", hash));
+	while (1)
+	{
+		count = dx_get_count(entries);
+		if (!count || count > dx_get_limit(entries)) {
+			ext3_warning(dir->i_sb, __func__,
+				     "dx entry: no count or count > limit");
+			brelse(bh);
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
+
+		p = entries + 1;
+		q = entries + count - 1;
+		while (p <= q)
+		{
+			m = p + (q - p)/2;
+			dxtrace(printk("."));
+			if (dx_get_hash(m) > hash)
+				q = m - 1;
+			else
+				p = m + 1;
+		}
+
+		if (0) // linear search cross check
+		{
+			unsigned n = count - 1;
+			at = entries;
+			while (n--)
+			{
+				dxtrace(printk(","));
+				if (dx_get_hash(++at) > hash)
+				{
+					at--;
+					break;
+				}
+			}
+			assert (at == p - 1);
+		}
+
+		at = p - 1;
+		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+		frame->bh = bh;
+		frame->entries = entries;
+		frame->at = at;
+		if (!indirect--) return frame;
+		if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
+		at = entries = ((struct dx_node *) bh->b_data)->entries;
+		if (dx_get_limit(entries) != dx_node_limit (dir)) {
+			ext3_warning(dir->i_sb, __func__,
+				     "dx entry: limit != node limit");
+			brelse(bh);
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
+		frame++;
+		frame->bh = NULL;
+	}
+fail2:
+	while (frame >= frame_in) {
+		brelse(frame->bh);
+		frame--;
+	}
+fail:
+	if (*err == ERR_BAD_DX_DIR)
+		ext3_warning(dir->i_sb, __func__,
+			     "Corrupt dir inode %ld, running e2fsck is "
+			     "recommended.", dir->i_ino);
+	return NULL;
+}
+
+static void dx_release (struct dx_frame *frames)
+{
+	if (frames[0].bh == NULL)
+		return;
+
+	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+		brelse(frames[1].bh);
+	brelse(frames[0].bh);
+}
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+ * should be necessary.  Whether or not the search is necessary is
+ * controlled by the hash parameter.  If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value.  This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not.  If there is an error reading one of the
+ * index blocks, it will a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */
+static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+				 struct dx_frame *frame,
+				 struct dx_frame *frames,
+				 __u32 *start_hash)
+{
+	struct dx_frame *p;
+	struct buffer_head *bh;
+	int err, num_frames = 0;
+	__u32 bhash;
+
+	p = frame;
+	/*
+	 * Find the next leaf page by incrementing the frame pointer.
+	 * If we run out of entries in the interior node, loop around and
+	 * increment pointer in the parent node.  When we break out of
+	 * this loop, num_frames indicates the number of interior
+	 * nodes need to be read.
+	 */
+	while (1) {
+		if (++(p->at) < p->entries + dx_get_count(p->entries))
+			break;
+		if (p == frames)
+			return 0;
+		num_frames++;
+		p--;
+	}
+
+	/*
+	 * If the hash is 1, then continue only if the next page has a
+	 * continuation hash of any value.  This is used for readdir
+	 * handling.  Otherwise, check to see if the hash matches the
+	 * desired contiuation hash.  If it doesn't, return since
+	 * there's no point to read in the successive index pages.
+	 */
+	bhash = dx_get_hash(p->at);
+	if (start_hash)
+		*start_hash = bhash;
+	if ((hash & 1) == 0) {
+		if ((bhash & ~1) != hash)
+			return 0;
+	}
+	/*
+	 * If the hash is HASH_NB_ALWAYS, we always go to the next
+	 * block so no check is necessary
+	 */
+	while (num_frames--) {
+		if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
+					  0, &err)))
+			return err; /* Failure */
+		p++;
+		brelse (p->bh);
+		p->bh = bh;
+		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+	}
+	return 1;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory block.  It returns the number directory entries loaded
+ * into the tree.  If there is an error it is returned in err.
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+				  struct inode *dir, int block,
+				  struct dx_hash_info *hinfo,
+				  __u32 start_hash, __u32 start_minor_hash)
+{
+	struct buffer_head *bh;
+	struct ext3_dir_entry_2 *de, *top;
+	int err = 0, count = 0;
+
+	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
+
+	if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
+		return err;
+
+	de = (struct ext3_dir_entry_2 *) bh->b_data;
+	top = (struct ext3_dir_entry_2 *) ((char *) de +
+					   dir->i_sb->s_blocksize -
+					   EXT3_DIR_REC_LEN(0));
+	for (; de < top; de = ext3_next_entry(de)) {
+		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+						+((char *)de - bh->b_data))) {
+			/* silently ignore the rest of the block */
+			break;
+		}
+		ext3fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		if ((err = ext3_htree_store_dirent(dir_file,
+				   hinfo->hash, hinfo->minor_hash, de)) != 0) {
+			brelse(bh);
+			return err;
+		}
+		count++;
+	}
+	brelse(bh);
+	return count;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory.  We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ */
+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+			 __u32 start_minor_hash, __u32 *next_hash)
+{
+	struct dx_hash_info hinfo;
+	struct ext3_dir_entry_2 *de;
+	struct dx_frame frames[2], *frame;
+	struct inode *dir;
+	int block, err;
+	int count = 0;
+	int ret;
+	__u32 hashval;
+
+	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+		       start_minor_hash));
+	dir = file_inode(dir_file);
+	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT3_SB(dir->i_sb)->s_hash_unsigned;
+		hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+					       start_hash, start_minor_hash);
+		*next_hash = ~0;
+		return count;
+	}
+	hinfo.hash = start_hash;
+	hinfo.minor_hash = 0;
+	frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
+	if (!frame)
+		return err;
+
+	/* Add '.' and '..' from the htree header */
+	if (!start_hash && !start_minor_hash) {
+		de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
+		if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+			goto errout;
+		count++;
+	}
+	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
+		de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
+		de = ext3_next_entry(de);
+		if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
+			goto errout;
+		count++;
+	}
+
+	while (1) {
+		block = dx_get_block(frame->at);
+		ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+					     start_hash, start_minor_hash);
+		if (ret < 0) {
+			err = ret;
+			goto errout;
+		}
+		count += ret;
+		hashval = ~0;
+		ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
+					    frame, frames, &hashval);
+		*next_hash = hashval;
+		if (ret < 0) {
+			err = ret;
+			goto errout;
+		}
+		/*
+		 * Stop if:  (a) there are no more entries, or
+		 * (b) we have inserted at least one entry and the
+		 * next hash value is not a continuation
+		 */
+		if ((ret == 0) ||
+		    (count && ((hashval & 1) == 0)))
+			break;
+	}
+	dx_release(frames);
+	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+		       count, *next_hash));
+	return count;
+errout:
+	dx_release(frames);
+	return (err);
+}
+
+
+/*
+ * Directory block splitting, compacting
+ */
+
+/*
+ * Create map of hash values, offsets, and sizes, stored at end of block.
+ * Returns number of entries mapped.
+ */
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
+		struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+{
+	int count = 0;
+	char *base = (char *) de;
+	struct dx_hash_info h = *hinfo;
+
+	while ((char *) de < base + blocksize)
+	{
+		if (de->name_len && de->inode) {
+			ext3fs_dirhash(de->name, de->name_len, &h);
+			map_tail--;
+			map_tail->hash = h.hash;
+			map_tail->offs = (u16) ((char *) de - base);
+			map_tail->size = le16_to_cpu(de->rec_len);
+			count++;
+			cond_resched();
+		}
+		/* XXX: do we need to check rec_len == 0 case? -Chris */
+		de = ext3_next_entry(de);
+	}
+	return count;
+}
+
+/* Sort map by hash value */
+static void dx_sort_map (struct dx_map_entry *map, unsigned count)
+{
+        struct dx_map_entry *p, *q, *top = map + count - 1;
+        int more;
+        /* Combsort until bubble sort doesn't suck */
+        while (count > 2)
+	{
+                count = count*10/13;
+                if (count - 9 < 2) /* 9, 10 -> 11 */
+                        count = 11;
+                for (p = top, q = p - count; q >= map; p--, q--)
+                        if (p->hash < q->hash)
+                                swap(*p, *q);
+        }
+        /* Garden variety bubble sort */
+        do {
+                more = 0;
+                q = top;
+                while (q-- > map)
+		{
+                        if (q[1].hash >= q[0].hash)
+				continue;
+                        swap(*(q+1), *q);
+                        more = 1;
+		}
+	} while(more);
+}
+
+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
+{
+	struct dx_entry *entries = frame->entries;
+	struct dx_entry *old = frame->at, *new = old + 1;
+	int count = dx_get_count(entries);
+
+	assert(count < dx_get_limit(entries));
+	assert(old < entries + count);
+	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
+	dx_set_hash(new, hash);
+	dx_set_block(new, block);
+	dx_set_count(entries, count + 1);
+}
+
+static void ext3_update_dx_flag(struct inode *inode)
+{
+	if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+				     EXT3_FEATURE_COMPAT_DIR_INDEX))
+		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
+}
+
+/*
+ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+ *
+ * `len <= EXT3_NAME_LEN' is guaranteed by caller.
+ * `de != NULL' is guaranteed by caller.
+ */
+static inline int ext3_match (int len, const char * const name,
+			      struct ext3_dir_entry_2 * de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static inline int search_dirblock(struct buffer_head * bh,
+				  struct inode *dir,
+				  struct qstr *child,
+				  unsigned long offset,
+				  struct ext3_dir_entry_2 ** res_dir)
+{
+	struct ext3_dir_entry_2 * de;
+	char * dlimit;
+	int de_len;
+	const char *name = child->name;
+	int namelen = child->len;
+
+	de = (struct ext3_dir_entry_2 *) bh->b_data;
+	dlimit = bh->b_data + dir->i_sb->s_blocksize;
+	while ((char *) de < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		if ((char *) de + namelen <= dlimit &&
+		    ext3_match (namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!ext3_check_dir_entry("ext3_find_entry",
+						  dir, de, bh, offset))
+				return -1;
+			*res_dir = de;
+			return 1;
+		}
+		/* prevent looping on a bad block */
+		de_len = ext3_rec_len_from_disk(de->rec_len);
+		if (de_len <= 0)
+			return -1;
+		offset += de_len;
+		de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
+	}
+	return 0;
+}
+
+
+/*
+ *	ext3_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the cache buffer in which the entry was found, and the entry
+ * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * entry - you'll have to do that yourself if you want to.
+ *
+ * The returned buffer_head has ->b_count elevated.  The caller is expected
+ * to brelse() it when appropriate.
+ */
+static struct buffer_head *ext3_find_entry(struct inode *dir,
+					struct qstr *entry,
+					struct ext3_dir_entry_2 **res_dir)
+{
+	struct super_block * sb;
+	struct buffer_head * bh_use[NAMEI_RA_SIZE];
+	struct buffer_head * bh, *ret = NULL;
+	unsigned long start, block, b;
+	const u8 *name = entry->name;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	int nblocks, i, err;
+	int namelen;
+
+	*res_dir = NULL;
+	sb = dir->i_sb;
+	namelen = entry->len;
+	if (namelen > EXT3_NAME_LEN)
+		return NULL;
+	if ((namelen <= 2) && (name[0] == '.') &&
+	    (name[1] == '.' || name[1] == 0)) {
+		/*
+		 * "." or ".." will only be in the first block
+		 * NFS may look up ".."; "." should be handled by the VFS
+		 */
+		block = start = 0;
+		nblocks = 1;
+		goto restart;
+	}
+	if (is_dx(dir)) {
+		bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
+		/*
+		 * On success, or if the error was file not found,
+		 * return.  Otherwise, fall back to doing a search the
+		 * old fashioned way.
+		 */
+		if (bh || (err != ERR_BAD_DX_DIR))
+			return bh;
+		dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
+	}
+	nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+	start = EXT3_I(dir)->i_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+				bh = ext3_getblk(NULL, dir, b++, 0, &err);
+				bh_use[ra_max] = bh;
+				if (bh && !bh_uptodate_or_lock(bh)) {
+					get_bh(bh);
+					bh->b_end_io = end_buffer_read_sync;
+					submit_bh(READ | REQ_META | REQ_PRIO,
+						  bh);
+				}
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* read error, skip block & hope for the best */
+			ext3_error(sb, __func__, "reading directory #%lu "
+				   "offset %lu", dir->i_ino, block);
+			brelse(bh);
+			goto next;
+		}
+		i = search_dirblock(bh, dir, entry,
+			    block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+		if (i == 1) {
+			EXT3_I(dir)->i_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse (bh_use[ra_ptr]);
+	return ret;
+}
+
+static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
+			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
+			int *err)
+{
+	struct super_block *sb = dir->i_sb;
+	struct dx_hash_info	hinfo;
+	struct dx_frame frames[2], *frame;
+	struct buffer_head *bh;
+	unsigned long block;
+	int retval;
+
+	if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
+		return NULL;
+	do {
+		block = dx_get_block(frame->at);
+		if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
+			goto errout;
+
+		retval = search_dirblock(bh, dir, entry,
+					 block << EXT3_BLOCK_SIZE_BITS(sb),
+					 res_dir);
+		if (retval == 1) {
+			dx_release(frames);
+			return bh;
+		}
+		brelse(bh);
+		if (retval == -1) {
+			*err = ERR_BAD_DX_DIR;
+			goto errout;
+		}
+
+		/* Check to see if we should continue to search */
+		retval = ext3_htree_next_block(dir, hinfo.hash, frame,
+					       frames, NULL);
+		if (retval < 0) {
+			ext3_warning(sb, __func__,
+			     "error reading index page in directory #%lu",
+			     dir->i_ino);
+			*err = retval;
+			goto errout;
+		}
+	} while (retval == 1);
+
+	*err = -ENOENT;
+errout:
+	dxtrace(printk("%s not found\n", entry->name));
+	dx_release (frames);
+	return NULL;
+}
+
+static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
+{
+	struct inode * inode;
+	struct ext3_dir_entry_2 * de;
+	struct buffer_head * bh;
+
+	if (dentry->d_name.len > EXT3_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
+	inode = NULL;
+	if (bh) {
+		unsigned long ino = le32_to_cpu(de->inode);
+		brelse (bh);
+		if (!ext3_valid_inum(dir->i_sb, ino)) {
+			ext3_error(dir->i_sb, "ext3_lookup",
+				   "bad inode number: %lu", ino);
+			return ERR_PTR(-EIO);
+		}
+		inode = ext3_iget(dir->i_sb, ino);
+		if (inode == ERR_PTR(-ESTALE)) {
+			ext3_error(dir->i_sb, __func__,
+					"deleted inode referenced: %lu",
+					ino);
+			return ERR_PTR(-EIO);
+		}
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+
+struct dentry *ext3_get_parent(struct dentry *child)
+{
+	unsigned long ino;
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	struct ext3_dir_entry_2 * de;
+	struct buffer_head *bh;
+
+	bh = ext3_find_entry(d_inode(child), &dotdot, &de);
+	if (!bh)
+		return ERR_PTR(-ENOENT);
+	ino = le32_to_cpu(de->inode);
+	brelse(bh);
+
+	if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) {
+		ext3_error(d_inode(child)->i_sb, "ext3_get_parent",
+			   "bad inode number: %lu", ino);
+		return ERR_PTR(-EIO);
+	}
+
+	return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino));
+}
+
+#define S_SHIFT 12
+static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXT3_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXT3_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXT3_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXT3_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXT3_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXT3_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXT3_FT_SYMLINK,
+};
+
+static inline void ext3_set_de_type(struct super_block *sb,
+				struct ext3_dir_entry_2 *de,
+				umode_t mode) {
+	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
+		de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+/*
+ * Move count entries from end of map between two memory locations.
+ * Returns pointer to last entry moved.
+ */
+static struct ext3_dir_entry_2 *
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+{
+	unsigned rec_len = 0;
+
+	while (count--) {
+		struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
+		rec_len = EXT3_DIR_REC_LEN(de->name_len);
+		memcpy (to, de, rec_len);
+		((struct ext3_dir_entry_2 *) to)->rec_len =
+				ext3_rec_len_to_disk(rec_len);
+		de->inode = 0;
+		map++;
+		to += rec_len;
+	}
+	return (struct ext3_dir_entry_2 *) (to - rec_len);
+}
+
+/*
+ * Compact each dir entry in the range to the minimal rec_len.
+ * Returns pointer to last entry in range.
+ */
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
+{
+	struct ext3_dir_entry_2 *next, *to, *prev;
+	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
+	unsigned rec_len = 0;
+
+	prev = to = de;
+	while ((char *)de < base + blocksize) {
+		next = ext3_next_entry(de);
+		if (de->inode && de->name_len) {
+			rec_len = EXT3_DIR_REC_LEN(de->name_len);
+			if (de > to)
+				memmove(to, de, rec_len);
+			to->rec_len = ext3_rec_len_to_disk(rec_len);
+			prev = to;
+			to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
+		}
+		de = next;
+	}
+	return prev;
+}
+
+/*
+ * Split a full leaf block to make room for a new dir entry.
+ * Allocate a new block, and move entries so that they are approx. equally full.
+ * Returns pointer to de in block into which the new entry will be inserted.
+ */
+static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+			struct buffer_head **bh,struct dx_frame *frame,
+			struct dx_hash_info *hinfo, int *error)
+{
+	unsigned blocksize = dir->i_sb->s_blocksize;
+	unsigned count, continued;
+	struct buffer_head *bh2;
+	u32 newblock;
+	u32 hash2;
+	struct dx_map_entry *map;
+	char *data1 = (*bh)->b_data, *data2;
+	unsigned split, move, size;
+	struct ext3_dir_entry_2 *de = NULL, *de2;
+	int	err = 0, i;
+
+	bh2 = ext3_append (handle, dir, &newblock, &err);
+	if (!(bh2)) {
+		brelse(*bh);
+		*bh = NULL;
+		goto errout;
+	}
+
+	BUFFER_TRACE(*bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, *bh);
+	if (err)
+		goto journal_error;
+
+	BUFFER_TRACE(frame->bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, frame->bh);
+	if (err)
+		goto journal_error;
+
+	data2 = bh2->b_data;
+
+	/* create map in the end of data2 block */
+	map = (struct dx_map_entry *) (data2 + blocksize);
+	count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
+			     blocksize, hinfo, map);
+	map -= count;
+	dx_sort_map (map, count);
+	/* Split the existing block in the middle, size-wise */
+	size = 0;
+	move = 0;
+	for (i = count-1; i >= 0; i--) {
+		/* is more than half of this entry in 2nd half of the block? */
+		if (size + map[i].size/2 > blocksize/2)
+			break;
+		size += map[i].size;
+		move++;
+	}
+	/* map index at which we will split */
+	split = count - move;
+	hash2 = map[split].hash;
+	continued = hash2 == map[split - 1].hash;
+	dxtrace(printk("Split block %i at %x, %i/%i\n",
+		dx_get_block(frame->at), hash2, split, count-split));
+
+	/* Fancy dance to stay within two buffers */
+	de2 = dx_move_dirents(data1, data2, map + split, count - split);
+	de = dx_pack_dirents(data1,blocksize);
+	de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
+	de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
+	dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
+	dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
+
+	/* Which block gets the new entry? */
+	if (hinfo->hash >= hash2)
+	{
+		swap(*bh, bh2);
+		de = de2;
+	}
+	dx_insert_block (frame, hash2 + continued, newblock);
+	err = ext3_journal_dirty_metadata (handle, bh2);
+	if (err)
+		goto journal_error;
+	err = ext3_journal_dirty_metadata (handle, frame->bh);
+	if (err)
+		goto journal_error;
+	brelse (bh2);
+	dxtrace(dx_show_index ("frame", frame->entries));
+	return de;
+
+journal_error:
+	brelse(*bh);
+	brelse(bh2);
+	*bh = NULL;
+	ext3_std_error(dir->i_sb, err);
+errout:
+	*error = err;
+	return NULL;
+}
+
+
+/*
+ * Add a new entry into a directory (leaf) block.  If de is non-NULL,
+ * it points to a directory entry which is guaranteed to be large
+ * enough for new directory entry.  If de is NULL, then
+ * add_dirent_to_buf will attempt search the directory block for
+ * space.  It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+ *
+ * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
+ * all other cases bh is released.
+ */
+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode, struct ext3_dir_entry_2 *de,
+			     struct buffer_head * bh)
+{
+	struct inode	*dir = d_inode(dentry->d_parent);
+	const char	*name = dentry->d_name.name;
+	int		namelen = dentry->d_name.len;
+	unsigned long	offset = 0;
+	unsigned short	reclen;
+	int		nlen, rlen, err;
+	char		*top;
+
+	reclen = EXT3_DIR_REC_LEN(namelen);
+	if (!de) {
+		de = (struct ext3_dir_entry_2 *)bh->b_data;
+		top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+		while ((char *) de <= top) {
+			if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
+						  bh, offset)) {
+				brelse (bh);
+				return -EIO;
+			}
+			if (ext3_match (namelen, name, de)) {
+				brelse (bh);
+				return -EEXIST;
+			}
+			nlen = EXT3_DIR_REC_LEN(de->name_len);
+			rlen = ext3_rec_len_from_disk(de->rec_len);
+			if ((de->inode? rlen - nlen: rlen) >= reclen)
+				break;
+			de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
+			offset += rlen;
+		}
+		if ((char *) de > top)
+			return -ENOSPC;
+	}
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, bh);
+	if (err) {
+		ext3_std_error(dir->i_sb, err);
+		brelse(bh);
+		return err;
+	}
+
+	/* By now the buffer is marked for journaling */
+	nlen = EXT3_DIR_REC_LEN(de->name_len);
+	rlen = ext3_rec_len_from_disk(de->rec_len);
+	if (de->inode) {
+		struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
+		de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
+		de->rec_len = ext3_rec_len_to_disk(nlen);
+		de = de1;
+	}
+	de->file_type = EXT3_FT_UNKNOWN;
+	if (inode) {
+		de->inode = cpu_to_le32(inode->i_ino);
+		ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+	} else
+		de->inode = 0;
+	de->name_len = namelen;
+	memcpy (de->name, name, namelen);
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 *
+	 * XXX similarly, too many callers depend on
+	 * ext3_new_inode() setting the times, but error
+	 * recovery deletes the inode, so the worst that can
+	 * happen is that the times are slightly out of date
+	 * and/or different from the directory change time.
+	 */
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	ext3_update_dx_flag(dir);
+	dir->i_version++;
+	ext3_mark_inode_dirty(handle, dir);
+	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+	err = ext3_journal_dirty_metadata(handle, bh);
+	if (err)
+		ext3_std_error(dir->i_sb, err);
+	brelse(bh);
+	return 0;
+}
+
+/*
+ * This converts a one block unindexed directory to a 3 block indexed
+ * directory, and adds the dentry to the indexed directory.
+ */
+static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+			    struct inode *inode, struct buffer_head *bh)
+{
+	struct inode	*dir = d_inode(dentry->d_parent);
+	const char	*name = dentry->d_name.name;
+	int		namelen = dentry->d_name.len;
+	struct buffer_head *bh2;
+	struct dx_root	*root;
+	struct dx_frame	frames[2], *frame;
+	struct dx_entry *entries;
+	struct ext3_dir_entry_2	*de, *de2;
+	char		*data1, *top;
+	unsigned	len;
+	int		retval;
+	unsigned	blocksize;
+	struct dx_hash_info hinfo;
+	u32		block;
+	struct fake_dirent *fde;
+
+	blocksize =  dir->i_sb->s_blocksize;
+	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
+	retval = ext3_journal_get_write_access(handle, bh);
+	if (retval) {
+		ext3_std_error(dir->i_sb, retval);
+		brelse(bh);
+		return retval;
+	}
+	root = (struct dx_root *) bh->b_data;
+
+	/* The 0th block becomes the root, move the dirents out */
+	fde = &root->dotdot;
+	de = (struct ext3_dir_entry_2 *)((char *)fde +
+			ext3_rec_len_from_disk(fde->rec_len));
+	if ((char *) de >= (((char *) root) + blocksize)) {
+		ext3_error(dir->i_sb, __func__,
+			   "invalid rec_len for '..' in inode %lu",
+			   dir->i_ino);
+		brelse(bh);
+		return -EIO;
+	}
+	len = ((char *) root) + blocksize - (char *) de;
+
+	bh2 = ext3_append (handle, dir, &block, &retval);
+	if (!(bh2)) {
+		brelse(bh);
+		return retval;
+	}
+	EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
+	data1 = bh2->b_data;
+
+	memcpy (data1, de, len);
+	de = (struct ext3_dir_entry_2 *) data1;
+	top = data1 + len;
+	while ((char *)(de2 = ext3_next_entry(de)) < top)
+		de = de2;
+	de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
+	/* Initialize the root; the dot dirents already exist */
+	de = (struct ext3_dir_entry_2 *) (&root->dotdot);
+	de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
+	memset (&root->info, 0, sizeof(root->info));
+	root->info.info_length = sizeof(root->info);
+	root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+	entries = root->entries;
+	dx_set_block (entries, 1);
+	dx_set_count (entries, 1);
+	dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
+
+	/* Initialize as for dx_probe */
+	hinfo.hash_version = root->info.hash_version;
+	if (hinfo.hash_version <= DX_HASH_TEA)
+		hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
+	hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+	ext3fs_dirhash(name, namelen, &hinfo);
+	frame = frames;
+	frame->entries = entries;
+	frame->at = entries;
+	frame->bh = bh;
+	bh = bh2;
+	/*
+	 * Mark buffers dirty here so that if do_split() fails we write a
+	 * consistent set of buffers to disk.
+	 */
+	ext3_journal_dirty_metadata(handle, frame->bh);
+	ext3_journal_dirty_metadata(handle, bh);
+	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+	if (!de) {
+		ext3_mark_inode_dirty(handle, dir);
+		dx_release(frames);
+		return retval;
+	}
+	dx_release(frames);
+
+	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+}
+
+/*
+ *	ext3_add_entry()
+ *
+ * adds a file entry to the specified directory, using the same
+ * semantics as ext3_find_entry(). It returns NULL if it failed.
+ *
+ * NOTE!! The inode part of 'de' is left at 0 - which means you
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+	struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct buffer_head * bh;
+	struct ext3_dir_entry_2 *de;
+	struct super_block * sb;
+	int	retval;
+	int	dx_fallback=0;
+	unsigned blocksize;
+	u32 block, blocks;
+
+	sb = dir->i_sb;
+	blocksize = sb->s_blocksize;
+	if (!dentry->d_name.len)
+		return -EINVAL;
+	if (is_dx(dir)) {
+		retval = ext3_dx_add_entry(handle, dentry, inode);
+		if (!retval || (retval != ERR_BAD_DX_DIR))
+			return retval;
+		EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
+		dx_fallback++;
+		ext3_mark_inode_dirty(handle, dir);
+	}
+	blocks = dir->i_size >> sb->s_blocksize_bits;
+	for (block = 0; block < blocks; block++) {
+		if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
+			return retval;
+
+		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+		if (retval != -ENOSPC)
+			return retval;
+
+		if (blocks == 1 && !dx_fallback &&
+		    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+			return make_indexed_dir(handle, dentry, inode, bh);
+		brelse(bh);
+	}
+	bh = ext3_append(handle, dir, &block, &retval);
+	if (!bh)
+		return retval;
+	de = (struct ext3_dir_entry_2 *) bh->b_data;
+	de->inode = 0;
+	de->rec_len = ext3_rec_len_to_disk(blocksize);
+	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+}
+
+/*
+ * Returns 0 for success, or a negative error value
+ */
+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode)
+{
+	struct dx_frame frames[2], *frame;
+	struct dx_entry *entries, *at;
+	struct dx_hash_info hinfo;
+	struct buffer_head * bh;
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct super_block * sb = dir->i_sb;
+	struct ext3_dir_entry_2 *de;
+	int err;
+
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+	if (!frame)
+		return err;
+	entries = frame->entries;
+	at = frame->at;
+
+	if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
+		goto cleanup;
+
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, bh);
+	if (err)
+		goto journal_error;
+
+	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+	if (err != -ENOSPC) {
+		bh = NULL;
+		goto cleanup;
+	}
+
+	/* Block full, should compress but for now just split */
+	dxtrace(printk("using %u of %u node entries\n",
+		       dx_get_count(entries), dx_get_limit(entries)));
+	/* Need to split index? */
+	if (dx_get_count(entries) == dx_get_limit(entries)) {
+		u32 newblock;
+		unsigned icount = dx_get_count(entries);
+		int levels = frame - frames;
+		struct dx_entry *entries2;
+		struct dx_node *node2;
+		struct buffer_head *bh2;
+
+		if (levels && (dx_get_count(frames->entries) ==
+			       dx_get_limit(frames->entries))) {
+			ext3_warning(sb, __func__,
+				     "Directory index full!");
+			err = -ENOSPC;
+			goto cleanup;
+		}
+		bh2 = ext3_append (handle, dir, &newblock, &err);
+		if (!(bh2))
+			goto cleanup;
+		node2 = (struct dx_node *)(bh2->b_data);
+		entries2 = node2->entries;
+		memset(&node2->fake, 0, sizeof(struct fake_dirent));
+		node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
+		BUFFER_TRACE(frame->bh, "get_write_access");
+		err = ext3_journal_get_write_access(handle, frame->bh);
+		if (err)
+			goto journal_error;
+		if (levels) {
+			unsigned icount1 = icount/2, icount2 = icount - icount1;
+			unsigned hash2 = dx_get_hash(entries + icount1);
+			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+
+			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+			err = ext3_journal_get_write_access(handle,
+							     frames[0].bh);
+			if (err)
+				goto journal_error;
+
+			memcpy ((char *) entries2, (char *) (entries + icount1),
+				icount2 * sizeof(struct dx_entry));
+			dx_set_count (entries, icount1);
+			dx_set_count (entries2, icount2);
+			dx_set_limit (entries2, dx_node_limit(dir));
+
+			/* Which index block gets the new entry? */
+			if (at - entries >= icount1) {
+				frame->at = at = at - entries - icount1 + entries2;
+				frame->entries = entries = entries2;
+				swap(frame->bh, bh2);
+			}
+			dx_insert_block (frames + 0, hash2, newblock);
+			dxtrace(dx_show_index ("node", frames[1].entries));
+			dxtrace(dx_show_index ("node",
+			       ((struct dx_node *) bh2->b_data)->entries));
+			err = ext3_journal_dirty_metadata(handle, bh2);
+			if (err)
+				goto journal_error;
+			brelse (bh2);
+		} else {
+			dxtrace(printk("Creating second level index...\n"));
+			memcpy((char *) entries2, (char *) entries,
+			       icount * sizeof(struct dx_entry));
+			dx_set_limit(entries2, dx_node_limit(dir));
+
+			/* Set up root */
+			dx_set_count(entries, 1);
+			dx_set_block(entries + 0, newblock);
+			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+
+			/* Add new access path frame */
+			frame = frames + 1;
+			frame->at = at = at - entries + entries2;
+			frame->entries = entries = entries2;
+			frame->bh = bh2;
+			err = ext3_journal_get_write_access(handle,
+							     frame->bh);
+			if (err)
+				goto journal_error;
+		}
+		err = ext3_journal_dirty_metadata(handle, frames[0].bh);
+		if (err)
+			goto journal_error;
+	}
+	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+	if (!de)
+		goto cleanup;
+	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	bh = NULL;
+	goto cleanup;
+
+journal_error:
+	ext3_std_error(dir->i_sb, err);
+cleanup:
+	if (bh)
+		brelse(bh);
+	dx_release(frames);
+	return err;
+}
+
+/*
+ * ext3_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+static int ext3_delete_entry (handle_t *handle,
+			      struct inode * dir,
+			      struct ext3_dir_entry_2 * de_del,
+			      struct buffer_head * bh)
+{
+	struct ext3_dir_entry_2 * de, * pde;
+	int i;
+
+	i = 0;
+	pde = NULL;
+	de = (struct ext3_dir_entry_2 *) bh->b_data;
+	while (i < bh->b_size) {
+		if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
+			return -EIO;
+		if (de == de_del)  {
+			int err;
+
+			BUFFER_TRACE(bh, "get_write_access");
+			err = ext3_journal_get_write_access(handle, bh);
+			if (err)
+				goto journal_error;
+
+			if (pde)
+				pde->rec_len = ext3_rec_len_to_disk(
+					ext3_rec_len_from_disk(pde->rec_len) +
+					ext3_rec_len_from_disk(de->rec_len));
+			else
+				de->inode = 0;
+			dir->i_version++;
+			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+			err = ext3_journal_dirty_metadata(handle, bh);
+			if (err) {
+journal_error:
+				ext3_std_error(dir->i_sb, err);
+				return err;
+			}
+			return 0;
+		}
+		i += ext3_rec_len_from_disk(de->rec_len);
+		pde = de;
+		de = ext3_next_entry(de);
+	}
+	return -ENOENT;
+}
+
+static int ext3_add_nondir(handle_t *handle,
+		struct dentry *dentry, struct inode *inode)
+{
+	int err = ext3_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext3_mark_inode_dirty(handle, inode);
+		unlock_new_inode(inode);
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	drop_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
+		bool excl)
+{
+	handle_t *handle;
+	struct inode * inode;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext3_file_inode_operations;
+		inode->i_fop = &ext3_file_operations;
+		ext3_set_aops(inode);
+		err = ext3_add_nondir(handle, dentry, inode);
+	}
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+static int ext3_mknod (struct inode * dir, struct dentry *dentry,
+			umode_t mode, dev_t rdev)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+#ifdef CONFIG_EXT3_FS_XATTR
+		inode->i_op = &ext3_special_inode_operations;
+#endif
+		err = ext3_add_nondir(handle, dentry, inode);
+	}
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  4 + EXT3_XATTR_TRANS_BLOCKS);
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	inode = ext3_new_inode (handle, dir, NULL, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext3_file_inode_operations;
+		inode->i_fop = &ext3_file_operations;
+		ext3_set_aops(inode);
+		d_tmpfile(dentry, inode);
+		err = ext3_orphan_add(handle, inode);
+		if (err)
+			goto err_unlock_inode;
+		mark_inode_dirty(inode);
+		unlock_new_inode(inode);
+	}
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+err_unlock_inode:
+	ext3_journal_stop(handle);
+	unlock_new_inode(inode);
+	return err;
+}
+
+static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode * inode;
+	struct buffer_head * dir_block = NULL;
+	struct ext3_dir_entry_2 * de;
+	int err, retries = 0;
+
+	if (dir->i_nlink >= EXT3_LINK_MAX)
+		return -EMLINK;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_stop;
+
+	inode->i_op = &ext3_dir_inode_operations;
+	inode->i_fop = &ext3_dir_operations;
+	inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+	if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
+		goto out_clear_inode;
+
+	BUFFER_TRACE(dir_block, "get_write_access");
+	err = ext3_journal_get_write_access(handle, dir_block);
+	if (err)
+		goto out_clear_inode;
+
+	de = (struct ext3_dir_entry_2 *) dir_block->b_data;
+	de->inode = cpu_to_le32(inode->i_ino);
+	de->name_len = 1;
+	de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
+	strcpy (de->name, ".");
+	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
+	de = ext3_next_entry(de);
+	de->inode = cpu_to_le32(dir->i_ino);
+	de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
+					EXT3_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy (de->name, "..");
+	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
+	set_nlink(inode, 2);
+	BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+	err = ext3_journal_dirty_metadata(handle, dir_block);
+	if (err)
+		goto out_clear_inode;
+
+	err = ext3_mark_inode_dirty(handle, inode);
+	if (!err)
+		err = ext3_add_entry (handle, dentry, inode);
+
+	if (err) {
+out_clear_inode:
+		clear_nlink(inode);
+		unlock_new_inode(inode);
+		ext3_mark_inode_dirty(handle, inode);
+		iput (inode);
+		goto out_stop;
+	}
+	inc_nlink(dir);
+	ext3_update_dx_flag(dir);
+	err = ext3_mark_inode_dirty(handle, dir);
+	if (err)
+		goto out_clear_inode;
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+out_stop:
+	brelse(dir_block);
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+static int empty_dir (struct inode * inode)
+{
+	unsigned long offset;
+	struct buffer_head * bh;
+	struct ext3_dir_entry_2 * de, * de1;
+	struct super_block * sb;
+	int err = 0;
+
+	sb = inode->i_sb;
+	if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
+	    !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
+		if (err)
+			ext3_error(inode->i_sb, __func__,
+				   "error %d reading directory #%lu offset 0",
+				   err, inode->i_ino);
+		else
+			ext3_warning(inode->i_sb, __func__,
+				     "bad directory (dir #%lu) - no data block",
+				     inode->i_ino);
+		return 1;
+	}
+	de = (struct ext3_dir_entry_2 *) bh->b_data;
+	de1 = ext3_next_entry(de);
+	if (le32_to_cpu(de->inode) != inode->i_ino ||
+			!le32_to_cpu(de1->inode) ||
+			strcmp (".", de->name) ||
+			strcmp ("..", de1->name)) {
+		ext3_warning (inode->i_sb, "empty_dir",
+			      "bad directory (dir #%lu) - no `.' or `..'",
+			      inode->i_ino);
+		brelse (bh);
+		return 1;
+	}
+	offset = ext3_rec_len_from_disk(de->rec_len) +
+			ext3_rec_len_from_disk(de1->rec_len);
+	de = ext3_next_entry(de1);
+	while (offset < inode->i_size ) {
+		if (!bh ||
+			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			err = 0;
+			brelse (bh);
+			if (!(bh = ext3_dir_bread (NULL, inode,
+				offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
+				if (err)
+					ext3_error(sb, __func__,
+						   "error %d reading directory"
+						   " #%lu offset %lu",
+						   err, inode->i_ino, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct ext3_dir_entry_2 *) bh->b_data;
+		}
+		if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
+			de = (struct ext3_dir_entry_2 *)(bh->b_data +
+							 sb->s_blocksize);
+			offset = (offset | (sb->s_blocksize - 1)) + 1;
+			continue;
+		}
+		if (le32_to_cpu(de->inode)) {
+			brelse (bh);
+			return 0;
+		}
+		offset += ext3_rec_len_from_disk(de->rec_len);
+		de = ext3_next_entry(de);
+	}
+	brelse (bh);
+	return 1;
+}
+
+/* ext3_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext3_orphan_cleanup().
+ */
+int ext3_orphan_add(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext3_iloc iloc;
+	int err = 0, rc;
+
+	mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
+	if (!list_empty(&EXT3_I(inode)->i_orphan))
+		goto out_unlock;
+
+	/* Orphan handling is only valid for files with data blocks
+	 * being truncated, or files being unlinked. */
+
+	/* @@@ FIXME: Observation from aviro:
+	 * I think I can trigger J_ASSERT in ext3_orphan_add().  We block
+	 * here (on s_orphan_lock), so race with ext3_link() which might bump
+	 * ->i_nlink. For, say it, character device. Not a regular file,
+	 * not a directory, not a symlink and ->i_nlink > 0.
+	 *
+	 * tytso, 4/25/2009: I'm not sure how that could happen;
+	 * shouldn't the fs core protect us from these sort of
+	 * unlink()/link() races?
+	 */
+	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+	if (err)
+		goto out_unlock;
+
+	err = ext3_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_unlock;
+
+	/* Insert this inode at the head of the on-disk orphan list... */
+	NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
+	EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
+	if (!err)
+		err = rc;
+
+	/* Only add to the head of the in-memory list if all the
+	 * previous operations succeeded.  If the orphan_add is going to
+	 * fail (possibly taking the journal offline), we can't risk
+	 * leaving the inode on the orphan list: stray orphan-list
+	 * entries can cause panics at unmount time.
+	 *
+	 * This is safe: on error we're going to ignore the orphan list
+	 * anyway on the next recovery. */
+	if (!err)
+		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+
+	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+	jbd_debug(4, "orphan inode %lu will point to %d\n",
+			inode->i_ino, NEXT_ORPHAN(inode));
+out_unlock:
+	mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
+	ext3_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * ext3_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ */
+int ext3_orphan_del(handle_t *handle, struct inode *inode)
+{
+	struct list_head *prev;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_sb_info *sbi;
+	unsigned long ino_next;
+	struct ext3_iloc iloc;
+	int err = 0;
+
+	mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
+	if (list_empty(&ei->i_orphan))
+		goto out;
+
+	ino_next = NEXT_ORPHAN(inode);
+	prev = ei->i_orphan.prev;
+	sbi = EXT3_SB(inode->i_sb);
+
+	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+	list_del_init(&ei->i_orphan);
+
+	/* If we're on an error path, we may not have a valid
+	 * transaction handle with which to update the orphan list on
+	 * disk, but we still need to remove the inode from the linked
+	 * list in memory. */
+	if (!handle)
+		goto out;
+
+	err = ext3_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_err;
+
+	if (prev == &sbi->s_orphan) {
+		jbd_debug(4, "superblock will point to %lu\n", ino_next);
+		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+		err = ext3_journal_get_write_access(handle, sbi->s_sbh);
+		if (err)
+			goto out_brelse;
+		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+		err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+	} else {
+		struct ext3_iloc iloc2;
+		struct inode *i_prev =
+			&list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
+
+		jbd_debug(4, "orphan inode %lu will point to %lu\n",
+			  i_prev->i_ino, ino_next);
+		err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
+		if (err)
+			goto out_brelse;
+		NEXT_ORPHAN(i_prev) = ino_next;
+		err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
+	}
+	if (err)
+		goto out_brelse;
+	NEXT_ORPHAN(inode) = 0;
+	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+
+out_err:
+	ext3_std_error(inode->i_sb, err);
+out:
+	mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
+	return err;
+
+out_brelse:
+	brelse(iloc.bh);
+	goto out_err;
+}
+
+static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode * inode;
+	struct buffer_head * bh;
+	struct ext3_dir_entry_2 * de;
+	handle_t *handle;
+
+	/* Initialize quotas before so that eventual writes go in
+	 * separate transaction */
+	dquot_initialize(dir);
+	dquot_initialize(d_inode(dentry));
+
+	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	retval = -ENOENT;
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
+	if (!bh)
+		goto end_rmdir;
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = d_inode(dentry);
+
+	retval = -EIO;
+	if (le32_to_cpu(de->inode) != inode->i_ino)
+		goto end_rmdir;
+
+	retval = -ENOTEMPTY;
+	if (!empty_dir (inode))
+		goto end_rmdir;
+
+	retval = ext3_delete_entry(handle, dir, de, bh);
+	if (retval)
+		goto end_rmdir;
+	if (inode->i_nlink != 2)
+		ext3_warning (inode->i_sb, "ext3_rmdir",
+			      "empty directory has nlink!=2 (%d)",
+			      inode->i_nlink);
+	inode->i_version++;
+	clear_nlink(inode);
+	/* There's no need to set i_disksize: the fact that i_nlink is
+	 * zero will ensure that the right thing happens during any
+	 * recovery. */
+	inode->i_size = 0;
+	ext3_orphan_add(handle, inode);
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	ext3_mark_inode_dirty(handle, inode);
+	drop_nlink(dir);
+	ext3_update_dx_flag(dir);
+	ext3_mark_inode_dirty(handle, dir);
+
+end_rmdir:
+	ext3_journal_stop(handle);
+	brelse (bh);
+	return retval;
+}
+
+static int ext3_unlink(struct inode * dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode * inode;
+	struct buffer_head * bh;
+	struct ext3_dir_entry_2 * de;
+	handle_t *handle;
+
+	trace_ext3_unlink_enter(dir, dentry);
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	dquot_initialize(dir);
+	dquot_initialize(d_inode(dentry));
+
+	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	retval = -ENOENT;
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
+	if (!bh)
+		goto end_unlink;
+
+	inode = d_inode(dentry);
+
+	retval = -EIO;
+	if (le32_to_cpu(de->inode) != inode->i_ino)
+		goto end_unlink;
+
+	if (!inode->i_nlink) {
+		ext3_warning (inode->i_sb, "ext3_unlink",
+			      "Deleting nonexistent file (%lu), %d",
+			      inode->i_ino, inode->i_nlink);
+		set_nlink(inode, 1);
+	}
+	retval = ext3_delete_entry(handle, dir, de, bh);
+	if (retval)
+		goto end_unlink;
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	ext3_update_dx_flag(dir);
+	ext3_mark_inode_dirty(handle, dir);
+	drop_nlink(inode);
+	if (!inode->i_nlink)
+		ext3_orphan_add(handle, inode);
+	inode->i_ctime = dir->i_ctime;
+	ext3_mark_inode_dirty(handle, inode);
+	retval = 0;
+
+end_unlink:
+	ext3_journal_stop(handle);
+	brelse (bh);
+	trace_ext3_unlink_exit(dentry, retval);
+	return retval;
+}
+
+static int ext3_symlink (struct inode * dir,
+		struct dentry *dentry, const char * symname)
+{
+	handle_t *handle;
+	struct inode * inode;
+	int l, err, retries = 0;
+	int credits;
+
+	l = strlen(symname)+1;
+	if (l > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+	dquot_initialize(dir);
+
+	if (l > EXT3_N_BLOCKS * 4) {
+		/*
+		 * For non-fast symlinks, we just allocate inode and put it on
+		 * orphan list in the first transaction => we need bitmap,
+		 * group descriptor, sb, inode block, quota blocks, and
+		 * possibly selinux xattr blocks.
+		 */
+		credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  EXT3_XATTR_TRANS_BLOCKS;
+	} else {
+		/*
+		 * Fast symlink. We have to add entry to directory
+		 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+		 * allocate new inode (bitmap, group descriptor, inode block,
+		 * quota blocks, sb is already counted in previous macros).
+		 */
+		credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+			  EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+			  EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	}
+retry:
+	handle = ext3_journal_start(dir, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_stop;
+
+	if (l > EXT3_N_BLOCKS * 4) {
+		inode->i_op = &ext3_symlink_inode_operations;
+		ext3_set_aops(inode);
+		/*
+		 * We cannot call page_symlink() with transaction started
+		 * because it calls into ext3_write_begin() which acquires page
+		 * lock which ranks below transaction start (and it can also
+		 * wait for journal commit if we are running out of space). So
+		 * we have to stop transaction now and restart it when symlink
+		 * contents is written. 
+		 *
+		 * To keep fs consistent in case of crash, we have to put inode
+		 * to orphan list in the mean time.
+		 */
+		drop_nlink(inode);
+		err = ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		if (err)
+			goto err_drop_inode;
+		err = __page_symlink(inode, symname, l, 1);
+		if (err)
+			goto err_drop_inode;
+		/*
+		 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+		 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+		 */
+		handle = ext3_journal_start(dir,
+				EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+				EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err_drop_inode;
+		}
+		set_nlink(inode, 1);
+		err = ext3_orphan_del(handle, inode);
+		if (err) {
+			ext3_journal_stop(handle);
+			drop_nlink(inode);
+			goto err_drop_inode;
+		}
+	} else {
+		inode->i_op = &ext3_fast_symlink_inode_operations;
+		memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+		inode->i_size = l-1;
+	}
+	EXT3_I(inode)->i_disksize = inode->i_size;
+	err = ext3_add_nondir(handle, dentry, inode);
+out_stop:
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+err_drop_inode:
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+static int ext3_link (struct dentry * old_dentry,
+		struct inode * dir, struct dentry *dentry)
+{
+	handle_t *handle;
+	struct inode *inode = d_inode(old_dentry);
+	int err, retries = 0;
+
+	if (inode->i_nlink >= EXT3_LINK_MAX)
+		return -EMLINK;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	inc_nlink(inode);
+	ihold(inode);
+
+	err = ext3_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext3_mark_inode_dirty(handle, inode);
+		/* this can happen only for tmpfile being
+		 * linked the first time
+		 */
+		if (inode->i_nlink == 1)
+			ext3_orphan_del(handle, inode);
+		d_instantiate(dentry, inode);
+	} else {
+		drop_nlink(inode);
+		iput(inode);
+	}
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+#define PARENT_INO(buffer) \
+	(ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
+
+/*
+ * Anybody can rename anything with this: the permission checks are left to the
+ * higher-level routines.
+ */
+static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
+			   struct inode * new_dir,struct dentry *new_dentry)
+{
+	handle_t *handle;
+	struct inode * old_inode, * new_inode;
+	struct buffer_head * old_bh, * new_bh, * dir_bh;
+	struct ext3_dir_entry_2 * old_de, * new_de;
+	int retval, flush_file = 0;
+
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
+	old_bh = new_bh = dir_bh = NULL;
+
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	if (d_really_is_positive(new_dentry))
+		dquot_initialize(d_inode(new_dentry));
+	handle = ext3_journal_start(old_dir, 2 *
+					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+		handle->h_sync = 1;
+
+	old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	old_inode = d_inode(old_dentry);
+	retval = -ENOENT;
+	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+		goto end_rename;
+
+	new_inode = d_inode(new_dentry);
+	new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
+	if (new_bh) {
+		if (!new_inode) {
+			brelse (new_bh);
+			new_bh = NULL;
+		}
+	}
+	if (S_ISDIR(old_inode->i_mode)) {
+		if (new_inode) {
+			retval = -ENOTEMPTY;
+			if (!empty_dir (new_inode))
+				goto end_rename;
+		}
+		retval = -EIO;
+		dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
+		if (!dir_bh)
+			goto end_rename;
+		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+			goto end_rename;
+		retval = -EMLINK;
+		if (!new_inode && new_dir!=old_dir &&
+				new_dir->i_nlink >= EXT3_LINK_MAX)
+			goto end_rename;
+	}
+	if (!new_bh) {
+		retval = ext3_add_entry (handle, new_dentry, old_inode);
+		if (retval)
+			goto end_rename;
+	} else {
+		BUFFER_TRACE(new_bh, "get write access");
+		retval = ext3_journal_get_write_access(handle, new_bh);
+		if (retval)
+			goto journal_error;
+		new_de->inode = cpu_to_le32(old_inode->i_ino);
+		if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+					      EXT3_FEATURE_INCOMPAT_FILETYPE))
+			new_de->file_type = old_de->file_type;
+		new_dir->i_version++;
+		new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
+		ext3_mark_inode_dirty(handle, new_dir);
+		BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
+		retval = ext3_journal_dirty_metadata(handle, new_bh);
+		if (retval)
+			goto journal_error;
+		brelse(new_bh);
+		new_bh = NULL;
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old_inode->i_ctime = CURRENT_TIME_SEC;
+	ext3_mark_inode_dirty(handle, old_inode);
+
+	/*
+	 * ok, that's it
+	 */
+	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
+	    old_de->name_len != old_dentry->d_name.len ||
+	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
+	    (retval = ext3_delete_entry(handle, old_dir,
+					old_de, old_bh)) == -ENOENT) {
+		/* old_de could have moved from under us during htree split, so
+		 * make sure that we are deleting the right entry.  We might
+		 * also be pointing to a stale entry in the unused part of
+		 * old_bh so just checking inum and the name isn't enough. */
+		struct buffer_head *old_bh2;
+		struct ext3_dir_entry_2 *old_de2;
+
+		old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
+					  &old_de2);
+		if (old_bh2) {
+			retval = ext3_delete_entry(handle, old_dir,
+						   old_de2, old_bh2);
+			brelse(old_bh2);
+		}
+	}
+	if (retval) {
+		ext3_warning(old_dir->i_sb, "ext3_rename",
+				"Deleting old file (%lu), %d, error=%d",
+				old_dir->i_ino, old_dir->i_nlink, retval);
+	}
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		new_inode->i_ctime = CURRENT_TIME_SEC;
+	}
+	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+	ext3_update_dx_flag(old_dir);
+	if (dir_bh) {
+		BUFFER_TRACE(dir_bh, "get_write_access");
+		retval = ext3_journal_get_write_access(handle, dir_bh);
+		if (retval)
+			goto journal_error;
+		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+		BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+		retval = ext3_journal_dirty_metadata(handle, dir_bh);
+		if (retval) {
+journal_error:
+			ext3_std_error(new_dir->i_sb, retval);
+			goto end_rename;
+		}
+		drop_nlink(old_dir);
+		if (new_inode) {
+			drop_nlink(new_inode);
+		} else {
+			inc_nlink(new_dir);
+			ext3_update_dx_flag(new_dir);
+			ext3_mark_inode_dirty(handle, new_dir);
+		}
+	}
+	ext3_mark_inode_dirty(handle, old_dir);
+	if (new_inode) {
+		ext3_mark_inode_dirty(handle, new_inode);
+		if (!new_inode->i_nlink)
+			ext3_orphan_add(handle, new_inode);
+		if (ext3_should_writeback_data(new_inode))
+			flush_file = 1;
+	}
+	retval = 0;
+
+end_rename:
+	brelse (dir_bh);
+	brelse (old_bh);
+	brelse (new_bh);
+	ext3_journal_stop(handle);
+	if (retval == 0 && flush_file)
+		filemap_flush(old_inode->i_mapping);
+	return retval;
+}
+
+/*
+ * directories can handle most operations...
+ */
+const struct inode_operations ext3_dir_inode_operations = {
+	.create		= ext3_create,
+	.lookup		= ext3_lookup,
+	.link		= ext3_link,
+	.unlink		= ext3_unlink,
+	.symlink	= ext3_symlink,
+	.mkdir		= ext3_mkdir,
+	.rmdir		= ext3_rmdir,
+	.mknod		= ext3_mknod,
+	.tmpfile	= ext3_tmpfile,
+	.rename		= ext3_rename,
+	.setattr	= ext3_setattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext3_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
+};
+
+const struct inode_operations ext3_special_inode_operations = {
+	.setattr	= ext3_setattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext3_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
+};
diff --git a/kernel/fs/ext3/namei.h b/kernel/fs/ext3/namei.h
new file mode 100644
index 000000000..46304d8c9
--- /dev/null
+++ b/kernel/fs/ext3/namei.h
@@ -0,0 +1,27 @@
+/*  linux/fs/ext3/namei.h
+ *
+ * Copyright (C) 2005 Simtec Electronics
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+*/
+
+extern struct dentry *ext3_get_parent(struct dentry *child);
+
+static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
+						 struct inode *inode,
+						 int block, int create,
+						 int *err)
+{
+	struct buffer_head *bh;
+
+	bh = ext3_bread(handle, inode, block, create, err);
+
+	if (!bh && !(*err)) {
+		*err = -EIO;
+		ext3_error(inode->i_sb, __func__,
+			   "Directory hole detected on inode %lu\n",
+			   inode->i_ino);
+		return NULL;
+	}
+	return bh;
+}
diff --git a/kernel/fs/ext3/resize.c b/kernel/fs/ext3/resize.c
new file mode 100644
index 000000000..271056555
--- /dev/null
+++ b/kernel/fs/ext3/resize.c
@@ -0,0 +1,1117 @@
+/*
+ *  linux/fs/ext3/resize.c
+ *
+ * Support for resizing an ext3 filesystem while it is mounted.
+ *
+ * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
+ *
+ * This could probably be made into a module, because it is not often in use.
+ */
+
+
+#define EXT3FS_DEBUG
+
+#include "ext3.h"
+
+
+#define outside(b, first, last)	((b) < (first) || (b) >= (last))
+#define inside(b, first, last)	((b) >= (first) && (b) < (last))
+
+static int verify_group_input(struct super_block *sb,
+			      struct ext3_new_group_data *input)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_super_block *es = sbi->s_es;
+	ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
+	ext3_fsblk_t end = start + input->blocks_count;
+	unsigned group = input->group;
+	ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
+	unsigned overhead = ext3_bg_has_super(sb, group) ?
+		(1 + ext3_bg_num_gdb(sb, group) +
+		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+	ext3_fsblk_t metaend = start + overhead;
+	struct buffer_head *bh = NULL;
+	ext3_grpblk_t free_blocks_count;
+	int err = -EINVAL;
+
+	input->free_blocks_count = free_blocks_count =
+		input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks "
+		       "(%d free, %u reserved)\n",
+		       ext3_bg_has_super(sb, input->group) ? "normal" :
+		       "no-super", input->group, input->blocks_count,
+		       free_blocks_count, input->reserved_blocks);
+
+	if (group != sbi->s_groups_count)
+		ext3_warning(sb, __func__,
+			     "Cannot add at group %u (only %lu groups)",
+			     input->group, sbi->s_groups_count);
+	else if ((start - le32_to_cpu(es->s_first_data_block)) %
+		 EXT3_BLOCKS_PER_GROUP(sb))
+		ext3_warning(sb, __func__, "Last group not full");
+	else if (input->reserved_blocks > input->blocks_count / 5)
+		ext3_warning(sb, __func__, "Reserved blocks too high (%u)",
+			     input->reserved_blocks);
+	else if (free_blocks_count < 0)
+		ext3_warning(sb, __func__, "Bad blocks count %u",
+			     input->blocks_count);
+	else if (!(bh = sb_bread(sb, end - 1)))
+		ext3_warning(sb, __func__,
+			     "Cannot read last block ("E3FSBLK")",
+			     end - 1);
+	else if (outside(input->block_bitmap, start, end))
+		ext3_warning(sb, __func__,
+			     "Block bitmap not in group (block %u)",
+			     input->block_bitmap);
+	else if (outside(input->inode_bitmap, start, end))
+		ext3_warning(sb, __func__,
+			     "Inode bitmap not in group (block %u)",
+			     input->inode_bitmap);
+	else if (outside(input->inode_table, start, end) ||
+	         outside(itend - 1, start, end))
+		ext3_warning(sb, __func__,
+			     "Inode table not in group (blocks %u-"E3FSBLK")",
+			     input->inode_table, itend - 1);
+	else if (input->inode_bitmap == input->block_bitmap)
+		ext3_warning(sb, __func__,
+			     "Block bitmap same as inode bitmap (%u)",
+			     input->block_bitmap);
+	else if (inside(input->block_bitmap, input->inode_table, itend))
+		ext3_warning(sb, __func__,
+			     "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
+			     input->block_bitmap, input->inode_table, itend-1);
+	else if (inside(input->inode_bitmap, input->inode_table, itend))
+		ext3_warning(sb, __func__,
+			     "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
+			     input->inode_bitmap, input->inode_table, itend-1);
+	else if (inside(input->block_bitmap, start, metaend))
+		ext3_warning(sb, __func__,
+			     "Block bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
+			     input->block_bitmap, start, metaend - 1);
+	else if (inside(input->inode_bitmap, start, metaend))
+		ext3_warning(sb, __func__,
+			     "Inode bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
+			     input->inode_bitmap, start, metaend - 1);
+	else if (inside(input->inode_table, start, metaend) ||
+	         inside(itend - 1, start, metaend))
+		ext3_warning(sb, __func__,
+			     "Inode table (%u-"E3FSBLK") overlaps"
+			     "GDT table ("E3FSBLK"-"E3FSBLK")",
+			     input->inode_table, itend - 1, start, metaend - 1);
+	else
+		err = 0;
+	brelse(bh);
+
+	return err;
+}
+
+static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
+				  ext3_fsblk_t blk)
+{
+	struct buffer_head *bh;
+	int err;
+
+	bh = sb_getblk(sb, blk);
+	if (unlikely(!bh))
+		return ERR_PTR(-ENOMEM);
+	if ((err = ext3_journal_get_write_access(handle, bh))) {
+		brelse(bh);
+		bh = ERR_PTR(err);
+	} else {
+		lock_buffer(bh);
+		memset(bh->b_data, 0, sb->s_blocksize);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+	}
+
+	return bh;
+}
+
+/*
+ * To avoid calling the atomic setbit hundreds or thousands of times, we only
+ * need to use it within a single byte (to ensure we get endianness right).
+ * We can use memset for the rest of the bitmap as there are no other users.
+ */
+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+{
+	int i;
+
+	if (start_bit >= end_bit)
+		return;
+
+	ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
+	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
+		ext3_set_bit(i, bitmap);
+	if (i < end_bit)
+		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
+}
+
+/*
+ * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for block_bitmap modifications.
+ */
+static int extend_or_restart_transaction(handle_t *handle, int thresh,
+					 struct buffer_head *bh)
+{
+	int err;
+
+	if (handle->h_buffer_credits >= thresh)
+		return 0;
+
+	err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA);
+	if (err < 0)
+		return err;
+	if (err) {
+		err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA);
+		if (err)
+			return err;
+		err = ext3_journal_get_write_access(handle, bh);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up the block and inode bitmaps, and the inode table for the new group.
+ * This doesn't need to be part of the main transaction, since we are only
+ * changing blocks outside the actual filesystem.  We still do journaling to
+ * ensure the recovery is correct in case of a failure just after resize.
+ * If any part of this fails, we simply abort the resize.
+ */
+static int setup_new_group_blocks(struct super_block *sb,
+				  struct ext3_new_group_data *input)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
+	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
+		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
+	unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
+	struct buffer_head *bh;
+	handle_t *handle;
+	ext3_fsblk_t block;
+	ext3_grpblk_t bit;
+	int i;
+	int err = 0, err2;
+
+	/* This transaction may be extended/restarted along the way */
+	handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	mutex_lock(&sbi->s_resize_lock);
+	if (input->group != sbi->s_groups_count) {
+		err = -EBUSY;
+		goto exit_journal;
+	}
+
+	if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
+		err = PTR_ERR(bh);
+		goto exit_journal;
+	}
+
+	if (ext3_bg_has_super(sb, input->group)) {
+		ext3_debug("mark backup superblock %#04lx (+0)\n", start);
+		ext3_set_bit(0, bh->b_data);
+	}
+
+	/* Copy all of the GDT blocks into the backup in this group */
+	for (i = 0, bit = 1, block = start + 1;
+	     i < gdblocks; i++, block++, bit++) {
+		struct buffer_head *gdb;
+
+		ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
+
+		err = extend_or_restart_transaction(handle, 1, bh);
+		if (err)
+			goto exit_bh;
+
+		gdb = sb_getblk(sb, block);
+		if (unlikely(!gdb)) {
+			err = -ENOMEM;
+			goto exit_bh;
+		}
+		if ((err = ext3_journal_get_write_access(handle, gdb))) {
+			brelse(gdb);
+			goto exit_bh;
+		}
+		lock_buffer(gdb);
+		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
+		set_buffer_uptodate(gdb);
+		unlock_buffer(gdb);
+		err = ext3_journal_dirty_metadata(handle, gdb);
+		if (err) {
+			brelse(gdb);
+			goto exit_bh;
+		}
+		ext3_set_bit(bit, bh->b_data);
+		brelse(gdb);
+	}
+
+	/* Zero out all of the reserved backup group descriptor table blocks */
+	for (i = 0, bit = gdblocks + 1, block = start + bit;
+	     i < reserved_gdb; i++, block++, bit++) {
+		struct buffer_head *gdb;
+
+		ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit);
+
+		err = extend_or_restart_transaction(handle, 1, bh);
+		if (err)
+			goto exit_bh;
+
+		if (IS_ERR(gdb = bclean(handle, sb, block))) {
+			err = PTR_ERR(gdb);
+			goto exit_bh;
+		}
+		err = ext3_journal_dirty_metadata(handle, gdb);
+		if (err) {
+			brelse(gdb);
+			goto exit_bh;
+		}
+		ext3_set_bit(bit, bh->b_data);
+		brelse(gdb);
+	}
+	ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
+		   input->block_bitmap - start);
+	ext3_set_bit(input->block_bitmap - start, bh->b_data);
+	ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
+		   input->inode_bitmap - start);
+	ext3_set_bit(input->inode_bitmap - start, bh->b_data);
+
+	/* Zero out all of the inode table blocks */
+	for (i = 0, block = input->inode_table, bit = block - start;
+	     i < sbi->s_itb_per_group; i++, bit++, block++) {
+		struct buffer_head *it;
+
+		ext3_debug("clear inode block %#04lx (+%d)\n", block, bit);
+
+		err = extend_or_restart_transaction(handle, 1, bh);
+		if (err)
+			goto exit_bh;
+
+		if (IS_ERR(it = bclean(handle, sb, block))) {
+			err = PTR_ERR(it);
+			goto exit_bh;
+		}
+		err = ext3_journal_dirty_metadata(handle, it);
+		if (err) {
+			brelse(it);
+			goto exit_bh;
+		}
+		brelse(it);
+		ext3_set_bit(bit, bh->b_data);
+	}
+
+	err = extend_or_restart_transaction(handle, 2, bh);
+	if (err)
+		goto exit_bh;
+
+	mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
+			bh->b_data);
+	err = ext3_journal_dirty_metadata(handle, bh);
+	if (err)
+		goto exit_bh;
+	brelse(bh);
+
+	/* Mark unused entries in inode bitmap used */
+	ext3_debug("clear inode bitmap %#04x (+%ld)\n",
+		   input->inode_bitmap, input->inode_bitmap - start);
+	if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
+		err = PTR_ERR(bh);
+		goto exit_journal;
+	}
+
+	mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
+			bh->b_data);
+	err = ext3_journal_dirty_metadata(handle, bh);
+exit_bh:
+	brelse(bh);
+
+exit_journal:
+	mutex_unlock(&sbi->s_resize_lock);
+	if ((err2 = ext3_journal_stop(handle)) && !err)
+		err = err2;
+
+	return err;
+}
+
+/*
+ * Iterate through the groups which hold BACKUP superblock/GDT copies in an
+ * ext3 filesystem.  The counters should be initialized to 1, 5, and 7 before
+ * calling this for the first time.  In a sparse filesystem it will be the
+ * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
+ * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
+ */
+static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
+				  unsigned *five, unsigned *seven)
+{
+	unsigned *min = three;
+	int mult = 3;
+	unsigned ret;
+
+	if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
+					EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+		ret = *min;
+		*min += 1;
+		return ret;
+	}
+
+	if (*five < *min) {
+		min = five;
+		mult = 5;
+	}
+	if (*seven < *min) {
+		min = seven;
+		mult = 7;
+	}
+
+	ret = *min;
+	*min *= mult;
+
+	return ret;
+}
+
+/*
+ * Check that all of the backup GDT blocks are held in the primary GDT block.
+ * It is assumed that they are stored in group order.  Returns the number of
+ * groups in current filesystem that have BACKUPS, or -ve error code.
+ */
+static int verify_reserved_gdb(struct super_block *sb,
+			       struct buffer_head *primary)
+{
+	const ext3_fsblk_t blk = primary->b_blocknr;
+	const unsigned long end = EXT3_SB(sb)->s_groups_count;
+	unsigned three = 1;
+	unsigned five = 5;
+	unsigned seven = 7;
+	unsigned grp;
+	__le32 *p = (__le32 *)primary->b_data;
+	int gdbackups = 0;
+
+	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
+		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
+			ext3_warning(sb, __func__,
+				     "reserved GDT "E3FSBLK
+				     " missing grp %d ("E3FSBLK")",
+				     blk, grp,
+				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
+			return -EINVAL;
+		}
+		if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb))
+			return -EFBIG;
+	}
+
+	return gdbackups;
+}
+
+/*
+ * Called when we need to bring a reserved group descriptor table block into
+ * use from the resize inode.  The primary copy of the new GDT block currently
+ * is an indirect block (under the double indirect block in the resize inode).
+ * The new backup GDT blocks will be stored as leaf blocks in this indirect
+ * block, in group order.  Even though we know all the block numbers we need,
+ * we check to ensure that the resize inode has actually reserved these blocks.
+ *
+ * Don't need to update the block bitmaps because the blocks are still in use.
+ *
+ * We get all of the error cases out of the way, so that we are sure to not
+ * fail once we start modifying the data on disk, because JBD has no rollback.
+ */
+static int add_new_gdb(handle_t *handle, struct inode *inode,
+		       struct ext3_new_group_data *input,
+		       struct buffer_head **primary)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+	unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
+	ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	struct buffer_head **o_group_desc, **n_group_desc;
+	struct buffer_head *dind;
+	int gdbackups;
+	struct ext3_iloc iloc;
+	__le32 *data;
+	int err;
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG
+		       "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n",
+		       gdb_num);
+
+	/*
+	 * If we are not using the primary superblock/GDT copy don't resize,
+	 * because the user tools have no way of handling this.  Probably a
+	 * bad time to do it anyways.
+	 */
+	if (EXT3_SB(sb)->s_sbh->b_blocknr !=
+	    le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
+		ext3_warning(sb, __func__,
+			"won't resize using backup superblock at %llu",
+			(unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
+		return -EPERM;
+	}
+
+	*primary = sb_bread(sb, gdblock);
+	if (!*primary)
+		return -EIO;
+
+	if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
+		err = gdbackups;
+		goto exit_bh;
+	}
+
+	data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
+	dind = sb_bread(sb, le32_to_cpu(*data));
+	if (!dind) {
+		err = -EIO;
+		goto exit_bh;
+	}
+
+	data = (__le32 *)dind->b_data;
+	if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
+		ext3_warning(sb, __func__,
+			     "new group %u GDT block "E3FSBLK" not reserved",
+			     input->group, gdblock);
+		err = -EINVAL;
+		goto exit_dind;
+	}
+
+	if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh)))
+		goto exit_dind;
+
+	if ((err = ext3_journal_get_write_access(handle, *primary)))
+		goto exit_sbh;
+
+	if ((err = ext3_journal_get_write_access(handle, dind)))
+		goto exit_primary;
+
+	/* ext3_reserve_inode_write() gets a reference on the iloc */
+	if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
+		goto exit_dindj;
+
+	n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+			GFP_NOFS);
+	if (!n_group_desc) {
+		err = -ENOMEM;
+		ext3_warning (sb, __func__,
+			      "not enough memory for %lu groups", gdb_num + 1);
+		goto exit_inode;
+	}
+
+	/*
+	 * Finally, we have all of the possible failures behind us...
+	 *
+	 * Remove new GDT block from inode double-indirect block and clear out
+	 * the new GDT block for use (which also "frees" the backup GDT blocks
+	 * from the reserved inode).  We don't need to change the bitmaps for
+	 * these blocks, because they are marked as in-use from being in the
+	 * reserved inode, and will become GDT blocks (primary and backup).
+	 */
+	data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
+	err = ext3_journal_dirty_metadata(handle, dind);
+	if (err)
+		goto exit_group_desc;
+	brelse(dind);
+	dind = NULL;
+	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
+	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+	if (err)
+		goto exit_group_desc;
+	memset((*primary)->b_data, 0, sb->s_blocksize);
+	err = ext3_journal_dirty_metadata(handle, *primary);
+	if (err)
+		goto exit_group_desc;
+
+	o_group_desc = EXT3_SB(sb)->s_group_desc;
+	memcpy(n_group_desc, o_group_desc,
+	       EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	n_group_desc[gdb_num] = *primary;
+	EXT3_SB(sb)->s_group_desc = n_group_desc;
+	EXT3_SB(sb)->s_gdb_count++;
+	kfree(o_group_desc);
+
+	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	if (err)
+		goto exit_inode;
+
+	return 0;
+
+exit_group_desc:
+	kfree(n_group_desc);
+exit_inode:
+	//ext3_journal_release_buffer(handle, iloc.bh);
+	brelse(iloc.bh);
+exit_dindj:
+	//ext3_journal_release_buffer(handle, dind);
+exit_primary:
+	//ext3_journal_release_buffer(handle, *primary);
+exit_sbh:
+	//ext3_journal_release_buffer(handle, *primary);
+exit_dind:
+	brelse(dind);
+exit_bh:
+	brelse(*primary);
+
+	ext3_debug("leaving with error %d\n", err);
+	return err;
+}
+
+/*
+ * Called when we are adding a new group which has a backup copy of each of
+ * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
+ * We need to add these reserved backup GDT blocks to the resize inode, so
+ * that they are kept for future resizing and not allocated to files.
+ *
+ * Each reserved backup GDT block will go into a different indirect block.
+ * The indirect blocks are actually the primary reserved GDT blocks,
+ * so we know in advance what their block numbers are.  We only get the
+ * double-indirect block to verify it is pointing to the primary reserved
+ * GDT blocks so we don't overwrite a data block by accident.  The reserved
+ * backup GDT blocks are stored in their reserved primary GDT block.
+ */
+static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
+			      struct ext3_new_group_data *input)
+{
+	struct super_block *sb = inode->i_sb;
+	int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks);
+	struct buffer_head **primary;
+	struct buffer_head *dind;
+	struct ext3_iloc iloc;
+	ext3_fsblk_t blk;
+	__le32 *data, *end;
+	int gdbackups = 0;
+	int res, i;
+	int err;
+
+	primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
+	if (!primary)
+		return -ENOMEM;
+
+	data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
+	dind = sb_bread(sb, le32_to_cpu(*data));
+	if (!dind) {
+		err = -EIO;
+		goto exit_free;
+	}
+
+	blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
+	data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count %
+					 EXT3_ADDR_PER_BLOCK(sb));
+	end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
+
+	/* Get each reserved primary GDT block and verify it holds backups */
+	for (res = 0; res < reserved_gdb; res++, blk++) {
+		if (le32_to_cpu(*data) != blk) {
+			ext3_warning(sb, __func__,
+				     "reserved block "E3FSBLK
+				     " not at offset %ld",
+				     blk,
+				     (long)(data - (__le32 *)dind->b_data));
+			err = -EINVAL;
+			goto exit_bh;
+		}
+		primary[res] = sb_bread(sb, blk);
+		if (!primary[res]) {
+			err = -EIO;
+			goto exit_bh;
+		}
+		if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+			brelse(primary[res]);
+			err = gdbackups;
+			goto exit_bh;
+		}
+		if (++data >= end)
+			data = (__le32 *)dind->b_data;
+	}
+
+	for (i = 0; i < reserved_gdb; i++) {
+		if ((err = ext3_journal_get_write_access(handle, primary[i]))) {
+			/*
+			int j;
+			for (j = 0; j < i; j++)
+				ext3_journal_release_buffer(handle, primary[j]);
+			 */
+			goto exit_bh;
+		}
+	}
+
+	if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
+		goto exit_bh;
+
+	/*
+	 * Finally we can add each of the reserved backup GDT blocks from
+	 * the new group to its reserved primary GDT block.
+	 */
+	blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
+	for (i = 0; i < reserved_gdb; i++) {
+		int err2;
+		data = (__le32 *)primary[i]->b_data;
+		/* printk("reserving backup %lu[%u] = %lu\n",
+		       primary[i]->b_blocknr, gdbackups,
+		       blk + primary[i]->b_blocknr); */
+		data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
+		err2 = ext3_journal_dirty_metadata(handle, primary[i]);
+		if (!err)
+			err = err2;
+	}
+	inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
+	ext3_mark_iloc_dirty(handle, inode, &iloc);
+
+exit_bh:
+	while (--res >= 0)
+		brelse(primary[res]);
+	brelse(dind);
+
+exit_free:
+	kfree(primary);
+
+	return err;
+}
+
+/*
+ * Update the backup copies of the ext3 metadata.  These don't need to be part
+ * of the main resize transaction, because e2fsck will re-write them if there
+ * is a problem (basically only OOM will cause a problem).  However, we
+ * _should_ update the backups if possible, in case the primary gets trashed
+ * for some reason and we need to run e2fsck from a backup superblock.  The
+ * important part is that the new block and inode counts are in the backup
+ * superblocks, and the location of the new group metadata in the GDT backups.
+ *
+ * We do not need take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
+ */
+static void update_backups(struct super_block *sb,
+			   int blk_off, char *data, int size)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	const unsigned long last = sbi->s_groups_count;
+	const int bpg = EXT3_BLOCKS_PER_GROUP(sb);
+	unsigned three = 1;
+	unsigned five = 5;
+	unsigned seven = 7;
+	unsigned group;
+	int rest = sb->s_blocksize - size;
+	handle_t *handle;
+	int err = 0, err2;
+
+	handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
+	if (IS_ERR(handle)) {
+		group = 1;
+		err = PTR_ERR(handle);
+		goto exit_err;
+	}
+
+	while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) {
+		struct buffer_head *bh;
+
+		/* Out of journal space, and can't get more - abort - so sad */
+		if (handle->h_buffer_credits == 0 &&
+		    ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) &&
+		    (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA)))
+			break;
+
+		bh = sb_getblk(sb, group * bpg + blk_off);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			break;
+		}
+		ext3_debug("update metadata backup %#04lx\n",
+			  (unsigned long)bh->b_blocknr);
+		if ((err = ext3_journal_get_write_access(handle, bh))) {
+			brelse(bh);
+			break;
+		}
+		lock_buffer(bh);
+		memcpy(bh->b_data, data, size);
+		if (rest)
+			memset(bh->b_data + size, 0, rest);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		err = ext3_journal_dirty_metadata(handle, bh);
+		brelse(bh);
+		if (err)
+			break;
+	}
+	if ((err2 = ext3_journal_stop(handle)) && !err)
+		err = err2;
+
+	/*
+	 * Ugh! Need to have e2fsck write the backup copies.  It is too
+	 * late to revert the resize, we shouldn't fail just because of
+	 * the backup copies (they are only needed in case of corruption).
+	 *
+	 * However, if we got here we have a journal problem too, so we
+	 * can't really start a transaction to mark the superblock.
+	 * Chicken out and just set the flag on the hope it will be written
+	 * to disk, and if not - we will simply wait until next fsck.
+	 */
+exit_err:
+	if (err) {
+		ext3_warning(sb, __func__,
+			     "can't update backup for group %d (err %d), "
+			     "forcing fsck on next reboot", group, err);
+		sbi->s_mount_state &= ~EXT3_VALID_FS;
+		sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
+		mark_buffer_dirty(sbi->s_sbh);
+	}
+}
+
+/* Add group descriptor data to an existing or new group descriptor block.
+ * Ensure we handle all possible error conditions _before_ we start modifying
+ * the filesystem, because we cannot abort the transaction and not have it
+ * write the data to disk.
+ *
+ * If we are on a GDT block boundary, we need to get the reserved GDT block.
+ * Otherwise, we may need to add backup GDT blocks for a sparse group.
+ *
+ * We only need to hold the superblock lock while we are actually adding
+ * in the new group's counts to the superblock.  Prior to that we have
+ * not really "added" the group at all.  We re-check that we are still
+ * adding in the last group in case things have changed since verifying.
+ */
+int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_super_block *es = sbi->s_es;
+	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
+		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+	struct buffer_head *primary = NULL;
+	struct ext3_group_desc *gdp;
+	struct inode *inode = NULL;
+	handle_t *handle;
+	int gdb_off, gdb_num;
+	int err, err2;
+
+	gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
+	gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb);
+
+	if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
+					EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+		ext3_warning(sb, __func__,
+			     "Can't resize non-sparse filesystem further");
+		return -EPERM;
+	}
+
+	if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
+	    le32_to_cpu(es->s_blocks_count)) {
+		ext3_warning(sb, __func__, "blocks_count overflow\n");
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
+	    le32_to_cpu(es->s_inodes_count)) {
+		ext3_warning(sb, __func__, "inodes_count overflow\n");
+		return -EINVAL;
+	}
+
+	if (reserved_gdb || gdb_off == 0) {
+		if (!EXT3_HAS_COMPAT_FEATURE(sb,
+					     EXT3_FEATURE_COMPAT_RESIZE_INODE)
+		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
+			ext3_warning(sb, __func__,
+				     "No reserved GDT blocks, can't resize");
+			return -EPERM;
+		}
+		inode = ext3_iget(sb, EXT3_RESIZE_INO);
+		if (IS_ERR(inode)) {
+			ext3_warning(sb, __func__,
+				     "Error opening resize inode");
+			return PTR_ERR(inode);
+		}
+	}
+
+	if ((err = verify_group_input(sb, input)))
+		goto exit_put;
+
+	if ((err = setup_new_group_blocks(sb, input)))
+		goto exit_put;
+
+	/*
+	 * We will always be modifying at least the superblock and a GDT
+	 * block.  If we are adding a group past the last current GDT block,
+	 * we will also modify the inode and the dindirect block.  If we
+	 * are adding a group with superblock/GDT backups  we will also
+	 * modify each of the reserved GDT dindirect blocks.
+	 */
+	handle = ext3_journal_start_sb(sb,
+				       ext3_bg_has_super(sb, input->group) ?
+				       3 + reserved_gdb : 4);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit_put;
+	}
+
+	mutex_lock(&sbi->s_resize_lock);
+	if (input->group != sbi->s_groups_count) {
+		ext3_warning(sb, __func__,
+			     "multiple resizers run on filesystem!");
+		err = -EBUSY;
+		goto exit_journal;
+	}
+
+	if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh)))
+		goto exit_journal;
+
+	/*
+	 * We will only either add reserved group blocks to a backup group
+	 * or remove reserved blocks for the first group in a new group block.
+	 * Doing both would be mean more complex code, and sane people don't
+	 * use non-sparse filesystems anymore.  This is already checked above.
+	 */
+	if (gdb_off) {
+		primary = sbi->s_group_desc[gdb_num];
+		if ((err = ext3_journal_get_write_access(handle, primary)))
+			goto exit_journal;
+
+		if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) &&
+		    (err = reserve_backup_gdb(handle, inode, input)))
+			goto exit_journal;
+	} else if ((err = add_new_gdb(handle, inode, input, &primary)))
+		goto exit_journal;
+
+	/*
+	 * OK, now we've set up the new group.  Time to make it active.
+	 *
+	 * We do not lock all allocations via s_resize_lock
+	 * so we have to be safe wrt. concurrent accesses the group
+	 * data.  So we need to be careful to set all of the relevant
+	 * group descriptor data etc. *before* we enable the group.
+	 *
+	 * The key field here is sbi->s_groups_count: as long as
+	 * that retains its old value, nobody is going to access the new
+	 * group.
+	 *
+	 * So first we update all the descriptor metadata for the new
+	 * group; then we update the total disk blocks count; then we
+	 * update the groups count to enable the group; then finally we
+	 * update the free space counts so that the system can start
+	 * using the new disk blocks.
+	 */
+
+	/* Update group descriptor block for new group */
+	gdp = (struct ext3_group_desc *)primary->b_data + gdb_off;
+
+	gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap);
+	gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap);
+	gdp->bg_inode_table = cpu_to_le32(input->inode_table);
+	gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
+	gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb));
+
+	/*
+	 * Make the new blocks and inodes valid next.  We do this before
+	 * increasing the group count so that once the group is enabled,
+	 * all of its blocks and inodes are already valid.
+	 *
+	 * We always allocate group-by-group, then block-by-block or
+	 * inode-by-inode within a group, so enabling these
+	 * blocks/inodes before the group is live won't actually let us
+	 * allocate the new space yet.
+	 */
+	le32_add_cpu(&es->s_blocks_count, input->blocks_count);
+	le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb));
+
+	/*
+	 * We need to protect s_groups_count against other CPUs seeing
+	 * inconsistent state in the superblock.
+	 *
+	 * The precise rules we use are:
+	 *
+	 * * Writers of s_groups_count *must* hold s_resize_lock
+	 * AND
+	 * * Writers must perform a smp_wmb() after updating all dependent
+	 *   data and before modifying the groups count
+	 *
+	 * * Readers must hold s_resize_lock over the access
+	 * OR
+	 * * Readers must perform an smp_rmb() after reading the groups count
+	 *   and before reading any dependent data.
+	 *
+	 * NB. These rules can be relaxed when checking the group count
+	 * while freeing data, as we can only allocate from a block
+	 * group after serialising against the group count, and we can
+	 * only then free after serialising in turn against that
+	 * allocation.
+	 */
+	smp_wmb();
+
+	/* Update the global fs size fields */
+	sbi->s_groups_count++;
+
+	err = ext3_journal_dirty_metadata(handle, primary);
+	if (err)
+		goto exit_journal;
+
+	/* Update the reserved block counts only once the new group is
+	 * active. */
+	le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks);
+
+	/* Update the free space counts */
+	percpu_counter_add(&sbi->s_freeblocks_counter,
+			   input->free_blocks_count);
+	percpu_counter_add(&sbi->s_freeinodes_counter,
+			   EXT3_INODES_PER_GROUP(sb));
+
+	err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+
+exit_journal:
+	mutex_unlock(&sbi->s_resize_lock);
+	if ((err2 = ext3_journal_stop(handle)) && !err)
+		err = err2;
+	if (!err) {
+		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext3_super_block));
+		update_backups(sb, primary->b_blocknr, primary->b_data,
+			       primary->b_size);
+	}
+exit_put:
+	iput(inode);
+	return err;
+} /* ext3_group_add */
+
+/* Extend the filesystem to the new number of blocks specified.  This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group.  It can be accessed via ioctl, or by "remount,resize=<size>"
+ * for emergencies (because it has no dependencies on reserved blocks).
+ *
+ * If we _really_ wanted, we could use default values to call ext3_group_add()
+ * allow the "remount" trick to work for arbitrary resizing, assuming enough
+ * GDT blocks are reserved to grow to the desired size.
+ */
+int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
+		      ext3_fsblk_t n_blocks_count)
+{
+	ext3_fsblk_t o_blocks_count;
+	ext3_grpblk_t last;
+	ext3_grpblk_t add;
+	struct buffer_head * bh;
+	handle_t *handle;
+	int err;
+	unsigned long freed_blocks;
+
+	/* We don't need to worry about locking wrt other resizers just
+	 * yet: we're going to revalidate es->s_blocks_count after
+	 * taking the s_resize_lock below. */
+	o_blocks_count = le32_to_cpu(es->s_blocks_count);
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
+		       " up to "E3FSBLK" blocks\n",
+		       o_blocks_count, n_blocks_count);
+
+	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
+		return 0;
+
+	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+			" too large to resize to "E3FSBLK" blocks safely\n",
+			sb->s_id, n_blocks_count);
+		if (sizeof(sector_t) < 8)
+			ext3_warning(sb, __func__,
+			"CONFIG_LBDAF not enabled\n");
+		return -EINVAL;
+	}
+
+	if (n_blocks_count < o_blocks_count) {
+		ext3_warning(sb, __func__,
+			     "can't shrink FS - resize aborted");
+		return -EBUSY;
+	}
+
+	/* Handle the remaining blocks in the last group only. */
+	last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) %
+		EXT3_BLOCKS_PER_GROUP(sb);
+
+	if (last == 0) {
+		ext3_warning(sb, __func__,
+			     "need to use ext2online to resize further");
+		return -EPERM;
+	}
+
+	add = EXT3_BLOCKS_PER_GROUP(sb) - last;
+
+	if (o_blocks_count + add < o_blocks_count) {
+		ext3_warning(sb, __func__, "blocks_count overflow");
+		return -EINVAL;
+	}
+
+	if (o_blocks_count + add > n_blocks_count)
+		add = n_blocks_count - o_blocks_count;
+
+	if (o_blocks_count + add < n_blocks_count)
+		ext3_warning(sb, __func__,
+			     "will only finish group ("E3FSBLK
+			     " blocks, %u new)",
+			     o_blocks_count + add, add);
+
+	/* See if the device is actually as big as what was requested */
+	bh = sb_bread(sb, o_blocks_count + add -1);
+	if (!bh) {
+		ext3_warning(sb, __func__,
+			     "can't read last block, resize aborted");
+		return -ENOSPC;
+	}
+	brelse(bh);
+
+	/* We will update the superblock, one block bitmap, and
+	 * one group descriptor via ext3_free_blocks().
+	 */
+	handle = ext3_journal_start_sb(sb, 3);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		ext3_warning(sb, __func__, "error %d on journal start",err);
+		goto exit_put;
+	}
+
+	mutex_lock(&EXT3_SB(sb)->s_resize_lock);
+	if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
+		ext3_warning(sb, __func__,
+			     "multiple resizers run on filesystem!");
+		mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+		ext3_journal_stop(handle);
+		err = -EBUSY;
+		goto exit_put;
+	}
+
+	if ((err = ext3_journal_get_write_access(handle,
+						 EXT3_SB(sb)->s_sbh))) {
+		ext3_warning(sb, __func__,
+			     "error %d on journal write access", err);
+		mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+		ext3_journal_stop(handle);
+		goto exit_put;
+	}
+	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+	if (err) {
+		ext3_warning(sb, __func__,
+			     "error %d on journal dirty metadata", err);
+		ext3_journal_stop(handle);
+		goto exit_put;
+	}
+	ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
+		   o_blocks_count, o_blocks_count + add);
+	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
+		   o_blocks_count, o_blocks_count + add);
+	if ((err = ext3_journal_stop(handle)))
+		goto exit_put;
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n",
+		       le32_to_cpu(es->s_blocks_count));
+	update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es,
+		       sizeof(struct ext3_super_block));
+exit_put:
+	return err;
+} /* ext3_group_extend */
diff --git a/kernel/fs/ext3/super.c b/kernel/fs/ext3/super.c
new file mode 100644
index 000000000..a9312f0a5
--- /dev/null
+++ b/kernel/fs/ext3/super.c
@@ -0,0 +1,3165 @@
+/*
+ *  linux/fs/ext3/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/exportfs.h>
+#include <linux/statfs.h>
+#include <linux/random.h>
+#include <linux/mount.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+#include <linux/log2.h>
+#include <linux/cleancache.h>
+#include <linux/namei.h>
+
+#include <asm/uaccess.h>
+
+#define CREATE_TRACE_POINTS
+
+#include "ext3.h"
+#include "xattr.h"
+#include "acl.h"
+#include "namei.h"
+
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
+static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
+			     unsigned long journal_devnum);
+static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
+			       unsigned int);
+static int ext3_commit_super(struct super_block *sb,
+			       struct ext3_super_block *es,
+			       int sync);
+static void ext3_mark_recovery_complete(struct super_block * sb,
+					struct ext3_super_block * es);
+static void ext3_clear_journal_err(struct super_block * sb,
+				   struct ext3_super_block * es);
+static int ext3_sync_fs(struct super_block *sb, int wait);
+static const char *ext3_decode_error(struct super_block * sb, int errno,
+				     char nbuf[16]);
+static int ext3_remount (struct super_block * sb, int * flags, char * data);
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext3_unfreeze(struct super_block *sb);
+static int ext3_freeze(struct super_block *sb);
+
+/*
+ * Wrappers for journal_start/end.
+ */
+handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
+{
+	journal_t *journal;
+
+	if (sb->s_flags & MS_RDONLY)
+		return ERR_PTR(-EROFS);
+
+	/* Special case here: if the journal has aborted behind our
+	 * backs (eg. EIO in the commit thread), then we still need to
+	 * take the FS itself readonly cleanly. */
+	journal = EXT3_SB(sb)->s_journal;
+	if (is_journal_aborted(journal)) {
+		ext3_abort(sb, __func__,
+			   "Detected aborted journal");
+		return ERR_PTR(-EROFS);
+	}
+
+	return journal_start(journal, nblocks);
+}
+
+int __ext3_journal_stop(const char *where, handle_t *handle)
+{
+	struct super_block *sb;
+	int err;
+	int rc;
+
+	sb = handle->h_transaction->t_journal->j_private;
+	err = handle->h_err;
+	rc = journal_stop(handle);
+
+	if (!err)
+		err = rc;
+	if (err)
+		__ext3_std_error(sb, where, err);
+	return err;
+}
+
+void ext3_journal_abort_handle(const char *caller, const char *err_fn,
+		struct buffer_head *bh, handle_t *handle, int err)
+{
+	char nbuf[16];
+	const char *errstr = ext3_decode_error(NULL, err, nbuf);
+
+	if (bh)
+		BUFFER_TRACE(bh, "abort");
+
+	if (!handle->h_err)
+		handle->h_err = err;
+
+	if (is_handle_aborted(handle))
+		return;
+
+	printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
+		caller, errstr, err_fn);
+
+	journal_abort_handle(handle);
+}
+
+void ext3_msg(struct super_block *sb, const char *prefix,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+
+	va_end(args);
+}
+
+/* Deal with the reporting of failure conditions on a filesystem such as
+ * inconsistencies detected or read IO failures.
+ *
+ * On ext2, we can store the error state of the filesystem in the
+ * superblock.  That is not possible on ext3, because we may have other
+ * write ordering constraints on the superblock which prevent us from
+ * writing it out straight away; and given that the journal is about to
+ * be aborted, we can't rely on the current, or future, transactions to
+ * write out the superblock safely.
+ *
+ * We'll just use the journal_abort() error code to record an error in
+ * the journal instead.  On recovery, the journal will complain about
+ * that error until we've noted it down and cleared it.
+ */
+
+static void ext3_handle_error(struct super_block *sb)
+{
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+	es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	if (!test_opt (sb, ERRORS_CONT)) {
+		journal_t *journal = EXT3_SB(sb)->s_journal;
+
+		set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+		if (journal)
+			journal_abort(journal, -EIO);
+	}
+	if (test_opt (sb, ERRORS_RO)) {
+		ext3_msg(sb, KERN_CRIT,
+			"error: remounting filesystem read-only");
+		/*
+		 * Make sure updated value of ->s_mount_state will be visible
+		 * before ->s_flags update.
+		 */
+		smp_wmb();
+		sb->s_flags |= MS_RDONLY;
+	}
+	ext3_commit_super(sb, es, 1);
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("EXT3-fs (%s): panic forced after error\n",
+			sb->s_id);
+}
+
+void ext3_error(struct super_block *sb, const char *function,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
+	va_end(args);
+
+	ext3_handle_error(sb);
+}
+
+static const char *ext3_decode_error(struct super_block * sb, int errno,
+				     char nbuf[16])
+{
+	char *errstr = NULL;
+
+	switch (errno) {
+	case -EIO:
+		errstr = "IO failure";
+		break;
+	case -ENOMEM:
+		errstr = "Out of memory";
+		break;
+	case -EROFS:
+		if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
+			errstr = "Journal has aborted";
+		else
+			errstr = "Readonly filesystem";
+		break;
+	default:
+		/* If the caller passed in an extra buffer for unknown
+		 * errors, textualise them now.  Else we just return
+		 * NULL. */
+		if (nbuf) {
+			/* Check for truncated error codes... */
+			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+				errstr = nbuf;
+		}
+		break;
+	}
+
+	return errstr;
+}
+
+/* __ext3_std_error decodes expected errors from journaling functions
+ * automatically and invokes the appropriate error response.  */
+
+void __ext3_std_error (struct super_block * sb, const char * function,
+		       int errno)
+{
+	char nbuf[16];
+	const char *errstr;
+
+	/* Special case: if the error is EROFS, and we're not already
+	 * inside a transaction, then there's really no point in logging
+	 * an error. */
+	if (errno == -EROFS && journal_current_handle() == NULL &&
+	    (sb->s_flags & MS_RDONLY))
+		return;
+
+	errstr = ext3_decode_error(sb, errno, nbuf);
+	ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
+
+	ext3_handle_error(sb);
+}
+
+/*
+ * ext3_abort is a much stronger failure handler than ext3_error.  The
+ * abort function may be used to deal with unrecoverable failures such
+ * as journal IO errors or ENOMEM at a critical moment in log management.
+ *
+ * We unconditionally force the filesystem into an ABORT|READONLY state,
+ * unless the error response on the fs has been set to panic in which
+ * case we take the easy way out and panic immediately.
+ */
+
+void ext3_abort(struct super_block *sb, const char *function,
+		 const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("EXT3-fs: panic from previous error\n");
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	ext3_msg(sb, KERN_CRIT,
+		"error: remounting filesystem read-only");
+	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+	set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+	/*
+	 * Make sure updated value of ->s_mount_state will be visible
+	 * before ->s_flags update.
+	 */
+	smp_wmb();
+	sb->s_flags |= MS_RDONLY;
+
+	if (EXT3_SB(sb)->s_journal)
+		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+}
+
+void ext3_warning(struct super_block *sb, const char *function,
+		  const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
+	va_end(args);
+}
+
+void ext3_update_dynamic_rev(struct super_block *sb)
+{
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+	if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
+		return;
+
+	ext3_msg(sb, KERN_WARNING,
+		"warning: updating to rev %d because of "
+		"new feature flag, running e2fsck is recommended",
+		EXT3_DYNAMIC_REV);
+
+	es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
+	es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
+	es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
+	/* leave es->s_feature_*compat flags alone */
+	/* es->s_uuid will be set by e2fsck if empty */
+
+	/*
+	 * The rest of the superblock fields should be zero, and if not it
+	 * means they are likely already in use, so leave them alone.  We
+	 * can leave it up to e2fsck to clean up any inconsistencies there.
+	 */
+}
+
+/*
+ * Open the external journal device
+ */
+static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
+{
+	struct block_device *bdev;
+	char b[BDEVNAME_SIZE];
+
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
+	if (IS_ERR(bdev))
+		goto fail;
+	return bdev;
+
+fail:
+	ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
+		__bdevname(dev, b), PTR_ERR(bdev));
+
+	return NULL;
+}
+
+/*
+ * Release the journal device
+ */
+static void ext3_blkdev_put(struct block_device *bdev)
+{
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
+{
+	struct block_device *bdev;
+	bdev = sbi->journal_bdev;
+	if (bdev) {
+		ext3_blkdev_put(bdev);
+		sbi->journal_bdev = NULL;
+	}
+}
+
+static inline struct inode *orphan_list_entry(struct list_head *l)
+{
+	return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode;
+}
+
+static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
+{
+	struct list_head *l;
+
+	ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
+	       le32_to_cpu(sbi->s_es->s_last_orphan));
+
+	ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
+	list_for_each(l, &sbi->s_orphan) {
+		struct inode *inode = orphan_list_entry(l);
+		ext3_msg(sb, KERN_ERR, "  "
+		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
+		       inode->i_sb->s_id, inode->i_ino, inode,
+		       inode->i_mode, inode->i_nlink,
+		       NEXT_ORPHAN(inode));
+	}
+}
+
+static void ext3_put_super (struct super_block * sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_super_block *es = sbi->s_es;
+	int i, err;
+
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+	ext3_xattr_put_super(sb);
+	err = journal_destroy(sbi->s_journal);
+	sbi->s_journal = NULL;
+	if (err < 0)
+		ext3_abort(sb, __func__, "Couldn't clean up the journal");
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+		es->s_state = cpu_to_le16(sbi->s_mount_state);
+		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
+		mark_buffer_dirty(sbi->s_sbh);
+		ext3_commit_super(sb, es, 1);
+	}
+
+	for (i = 0; i < sbi->s_gdb_count; i++)
+		brelse(sbi->s_group_desc[i]);
+	kfree(sbi->s_group_desc);
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	brelse(sbi->s_sbh);
+#ifdef CONFIG_QUOTA
+	for (i = 0; i < EXT3_MAXQUOTAS; i++)
+		kfree(sbi->s_qf_names[i]);
+#endif
+
+	/* Debugging code just in case the in-memory inode orphan list
+	 * isn't empty.  The on-disk one can be non-empty if we've
+	 * detected an error and taken the fs readonly, but the
+	 * in-memory list had better be clean by this point. */
+	if (!list_empty(&sbi->s_orphan))
+		dump_orphan_list(sb, sbi);
+	J_ASSERT(list_empty(&sbi->s_orphan));
+
+	invalidate_bdev(sb->s_bdev);
+	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+		/*
+		 * Invalidate the journal device's buffers.  We don't want them
+		 * floating about in memory - the physical journal device may
+		 * hotswapped, and it breaks the `ro-after' testing code.
+		 */
+		sync_blockdev(sbi->journal_bdev);
+		invalidate_bdev(sbi->journal_bdev);
+		ext3_blkdev_remove(sbi);
+	}
+	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
+	mutex_destroy(&sbi->s_orphan_lock);
+	mutex_destroy(&sbi->s_resize_lock);
+	kfree(sbi);
+}
+
+static struct kmem_cache *ext3_inode_cachep;
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ */
+static struct inode *ext3_alloc_inode(struct super_block *sb)
+{
+	struct ext3_inode_info *ei;
+
+	ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+	ei->i_block_alloc_info = NULL;
+	ei->vfs_inode.i_version = 1;
+	atomic_set(&ei->i_datasync_tid, 0);
+	atomic_set(&ei->i_sync_tid, 0);
+#ifdef CONFIG_QUOTA
+	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
+#endif
+
+	return &ei->vfs_inode;
+}
+
+static int ext3_drop_inode(struct inode *inode)
+{
+	int drop = generic_drop_inode(inode);
+
+	trace_ext3_drop_inode(inode, drop);
+	return drop;
+}
+
+static void ext3_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
+}
+
+static void ext3_destroy_inode(struct inode *inode)
+{
+	if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
+		printk("EXT3 Inode %p: orphan list check failed!\n",
+			EXT3_I(inode));
+		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
+				EXT3_I(inode), sizeof(struct ext3_inode_info),
+				false);
+		dump_stack();
+	}
+	call_rcu(&inode->i_rcu, ext3_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
+
+	INIT_LIST_HEAD(&ei->i_orphan);
+#ifdef CONFIG_EXT3_FS_XATTR
+	init_rwsem(&ei->xattr_sem);
+#endif
+	mutex_init(&ei->truncate_mutex);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
+					     sizeof(struct ext3_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (ext3_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ext3_inode_cachep);
+}
+
+static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	if (sbi->s_jquota_fmt) {
+		char *fmtname = "";
+
+		switch (sbi->s_jquota_fmt) {
+		case QFMT_VFS_OLD:
+			fmtname = "vfsold";
+			break;
+		case QFMT_VFS_V0:
+			fmtname = "vfsv0";
+			break;
+		case QFMT_VFS_V1:
+			fmtname = "vfsv1";
+			break;
+		}
+		seq_printf(seq, ",jqfmt=%s", fmtname);
+	}
+
+	if (sbi->s_qf_names[USRQUOTA])
+		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+	if (sbi->s_qf_names[GRPQUOTA])
+		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+
+	if (test_opt(sb, USRQUOTA))
+		seq_puts(seq, ",usrquota");
+
+	if (test_opt(sb, GRPQUOTA))
+		seq_puts(seq, ",grpquota");
+#endif
+}
+
+static char *data_mode_string(unsigned long mode)
+{
+	switch (mode) {
+	case EXT3_MOUNT_JOURNAL_DATA:
+		return "journal";
+	case EXT3_MOUNT_ORDERED_DATA:
+		return "ordered";
+	case EXT3_MOUNT_WRITEBACK_DATA:
+		return "writeback";
+	}
+	return "unknown";
+}
+
+/*
+ * Show an option if
+ *  - it's set to a non-default value OR
+ *  - if the per-sb default is different from the global default
+ */
+static int ext3_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_super_block *es = sbi->s_es;
+	unsigned long def_mount_opts;
+
+	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+
+	if (sbi->s_sb_block != 1)
+		seq_printf(seq, ",sb=%lu", sbi->s_sb_block);
+	if (test_opt(sb, MINIX_DF))
+		seq_puts(seq, ",minixdf");
+	if (test_opt(sb, GRPID))
+		seq_puts(seq, ",grpid");
+	if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS))
+		seq_puts(seq, ",nogrpid");
+	if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) ||
+	    le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) {
+		seq_printf(seq, ",resuid=%u",
+				from_kuid_munged(&init_user_ns, sbi->s_resuid));
+	}
+	if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) ||
+	    le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) {
+		seq_printf(seq, ",resgid=%u",
+				from_kgid_munged(&init_user_ns, sbi->s_resgid));
+	}
+	if (test_opt(sb, ERRORS_RO)) {
+		int def_errors = le16_to_cpu(es->s_errors);
+
+		if (def_errors == EXT3_ERRORS_PANIC ||
+		    def_errors == EXT3_ERRORS_CONTINUE) {
+			seq_puts(seq, ",errors=remount-ro");
+		}
+	}
+	if (test_opt(sb, ERRORS_CONT))
+		seq_puts(seq, ",errors=continue");
+	if (test_opt(sb, ERRORS_PANIC))
+		seq_puts(seq, ",errors=panic");
+	if (test_opt(sb, NO_UID32))
+		seq_puts(seq, ",nouid32");
+	if (test_opt(sb, DEBUG))
+		seq_puts(seq, ",debug");
+#ifdef CONFIG_EXT3_FS_XATTR
+	if (test_opt(sb, XATTR_USER))
+		seq_puts(seq, ",user_xattr");
+	if (!test_opt(sb, XATTR_USER) &&
+	    (def_mount_opts & EXT3_DEFM_XATTR_USER)) {
+		seq_puts(seq, ",nouser_xattr");
+	}
+#endif
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+	if (test_opt(sb, POSIX_ACL))
+		seq_puts(seq, ",acl");
+	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL))
+		seq_puts(seq, ",noacl");
+#endif
+	if (!test_opt(sb, RESERVATION))
+		seq_puts(seq, ",noreservation");
+	if (sbi->s_commit_interval) {
+		seq_printf(seq, ",commit=%u",
+			   (unsigned) (sbi->s_commit_interval / HZ));
+	}
+
+	/*
+	 * Always display barrier state so it's clear what the status is.
+	 */
+	seq_puts(seq, ",barrier=");
+	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
+	seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
+	if (test_opt(sb, NOLOAD))
+		seq_puts(seq, ",norecovery");
+
+	ext3_show_quota_options(seq, sb);
+
+	return 0;
+}
+
+
+static struct inode *ext3_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+	if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
+		return ERR_PTR(-ESTALE);
+
+	/* iget isn't really right if the inode is currently unallocated!!
+	 *
+	 * ext3_read_inode will return a bad_inode if the inode had been
+	 * deleted, so we should be safe.
+	 *
+	 * Currently we don't know the generation for parent directory, so
+	 * a generation of 0 means "accept any"
+	 */
+	inode = ext3_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    ext3_nfs_get_inode);
+}
+
+static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    ext3_nfs_get_inode);
+}
+
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
+{
+	journal_t *journal = EXT3_SB(sb)->s_journal;
+
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+	if (journal)
+		return journal_try_to_free_buffers(journal, page, 
+						   wait & ~__GFP_WAIT);
+	return try_to_free_buffers(page);
+}
+
+#ifdef CONFIG_QUOTA
+#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
+#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+
+static int ext3_write_dquot(struct dquot *dquot);
+static int ext3_acquire_dquot(struct dquot *dquot);
+static int ext3_release_dquot(struct dquot *dquot);
+static int ext3_mark_dquot_dirty(struct dquot *dquot);
+static int ext3_write_info(struct super_block *sb, int type);
+static int ext3_quota_on(struct super_block *sb, int type, int format_id,
+			 struct path *path);
+static int ext3_quota_on_mount(struct super_block *sb, int type);
+static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off);
+static ssize_t ext3_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off);
+static struct dquot **ext3_get_dquots(struct inode *inode)
+{
+	return EXT3_I(inode)->i_dquot;
+}
+
+static const struct dquot_operations ext3_quota_operations = {
+	.write_dquot	= ext3_write_dquot,
+	.acquire_dquot	= ext3_acquire_dquot,
+	.release_dquot	= ext3_release_dquot,
+	.mark_dirty	= ext3_mark_dquot_dirty,
+	.write_info	= ext3_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
+};
+
+static const struct quotactl_ops ext3_qctl_operations = {
+	.quota_on	= ext3_quota_on,
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_state	= dquot_get_state,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
+};
+#endif
+
+static const struct super_operations ext3_sops = {
+	.alloc_inode	= ext3_alloc_inode,
+	.destroy_inode	= ext3_destroy_inode,
+	.write_inode	= ext3_write_inode,
+	.dirty_inode	= ext3_dirty_inode,
+	.drop_inode	= ext3_drop_inode,
+	.evict_inode	= ext3_evict_inode,
+	.put_super	= ext3_put_super,
+	.sync_fs	= ext3_sync_fs,
+	.freeze_fs	= ext3_freeze,
+	.unfreeze_fs	= ext3_unfreeze,
+	.statfs		= ext3_statfs,
+	.remount_fs	= ext3_remount,
+	.show_options	= ext3_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read	= ext3_quota_read,
+	.quota_write	= ext3_quota_write,
+	.get_dquots	= ext3_get_dquots,
+#endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
+static const struct export_operations ext3_export_ops = {
+	.fh_to_dentry = ext3_fh_to_dentry,
+	.fh_to_parent = ext3_fh_to_parent,
+	.get_parent = ext3_get_parent,
+};
+
+enum {
+	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
+	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+	Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+	Opt_journal_path,
+	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
+	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
+	Opt_resize, Opt_usrquota, Opt_grpquota
+};
+
+static const match_table_t tokens = {
+	{Opt_bsd_df, "bsddf"},
+	{Opt_minix_df, "minixdf"},
+	{Opt_grpid, "grpid"},
+	{Opt_grpid, "bsdgroups"},
+	{Opt_nogrpid, "nogrpid"},
+	{Opt_nogrpid, "sysvgroups"},
+	{Opt_resgid, "resgid=%u"},
+	{Opt_resuid, "resuid=%u"},
+	{Opt_sb, "sb=%u"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_nouid32, "nouid32"},
+	{Opt_nocheck, "nocheck"},
+	{Opt_nocheck, "check=none"},
+	{Opt_debug, "debug"},
+	{Opt_oldalloc, "oldalloc"},
+	{Opt_orlov, "orlov"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_reservation, "reservation"},
+	{Opt_noreservation, "noreservation"},
+	{Opt_noload, "noload"},
+	{Opt_noload, "norecovery"},
+	{Opt_nobh, "nobh"},
+	{Opt_bh, "bh"},
+	{Opt_commit, "commit=%u"},
+	{Opt_journal_update, "journal=update"},
+	{Opt_journal_inum, "journal=%u"},
+	{Opt_journal_dev, "journal_dev=%u"},
+	{Opt_journal_path, "journal_path=%s"},
+	{Opt_abort, "abort"},
+	{Opt_data_journal, "data=journal"},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
+	{Opt_offusrjquota, "usrjquota="},
+	{Opt_usrjquota, "usrjquota=%s"},
+	{Opt_offgrpjquota, "grpjquota="},
+	{Opt_grpjquota, "grpjquota=%s"},
+	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
+	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
+	{Opt_grpquota, "grpquota"},
+	{Opt_noquota, "noquota"},
+	{Opt_quota, "quota"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_barrier, "barrier=%u"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
+	{Opt_resize, "resize"},
+	{Opt_err, NULL},
+};
+
+static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
+{
+	ext3_fsblk_t	sb_block;
+	char		*options = (char *) *data;
+
+	if (!options || strncmp(options, "sb=", 3) != 0)
+		return 1;	/* Default location */
+	options += 3;
+	/*todo: use simple_strtoll with >32bit ext3 */
+	sb_block = simple_strtoul(options, &options, 0);
+	if (*options && *options != ',') {
+		ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",
+		       (char *) *data);
+		return 1;
+	}
+	if (*options == ',')
+		options++;
+	*data = (void *) options;
+	return sb_block;
+}
+
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	char *qname;
+
+	if (sb_any_quota_loaded(sb) &&
+		!sbi->s_qf_names[qtype]) {
+		ext3_msg(sb, KERN_ERR,
+			"Cannot change journaled "
+			"quota options when quota turned on");
+		return 0;
+	}
+	qname = match_strdup(args);
+	if (!qname) {
+		ext3_msg(sb, KERN_ERR,
+			"Not enough memory for storing quotafile name");
+		return 0;
+	}
+	if (sbi->s_qf_names[qtype]) {
+		int same = !strcmp(sbi->s_qf_names[qtype], qname);
+
+		kfree(qname);
+		if (!same) {
+			ext3_msg(sb, KERN_ERR,
+				 "%s quota file already specified",
+				 QTYPE2NAME(qtype));
+		}
+		return same;
+	}
+	if (strchr(qname, '/')) {
+		ext3_msg(sb, KERN_ERR,
+			"quotafile must be on filesystem root");
+		kfree(qname);
+		return 0;
+	}
+	sbi->s_qf_names[qtype] = qname;
+	set_opt(sbi->s_mount_opt, QUOTA);
+	return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype) {
+
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	if (sb_any_quota_loaded(sb) &&
+		sbi->s_qf_names[qtype]) {
+		ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+			" when quota turned on");
+		return 0;
+	}
+	if (sbi->s_qf_names[qtype]) {
+		kfree(sbi->s_qf_names[qtype]);
+		sbi->s_qf_names[qtype] = NULL;
+	}
+	return 1;
+}
+#endif
+
+static int parse_options (char *options, struct super_block *sb,
+			  unsigned int *inum, unsigned long *journal_devnum,
+			  ext3_fsblk_t *n_blocks_count, int is_remount)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	char * p;
+	substring_t args[MAX_OPT_ARGS];
+	int data_opt = 0;
+	int option;
+	kuid_t uid;
+	kgid_t gid;
+	char *journal_path;
+	struct inode *journal_inode;
+	struct path path;
+	int error;
+
+#ifdef CONFIG_QUOTA
+	int qfmt;
+#endif
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep (&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = NULL;
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_bsd_df:
+			clear_opt (sbi->s_mount_opt, MINIX_DF);
+			break;
+		case Opt_minix_df:
+			set_opt (sbi->s_mount_opt, MINIX_DF);
+			break;
+		case Opt_grpid:
+			set_opt (sbi->s_mount_opt, GRPID);
+			break;
+		case Opt_nogrpid:
+			clear_opt (sbi->s_mount_opt, GRPID);
+			break;
+		case Opt_resuid:
+			if (match_int(&args[0], &option))
+				return 0;
+			uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(uid)) {
+				ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
+				return 0;
+
+			}
+			sbi->s_resuid = uid;
+			break;
+		case Opt_resgid:
+			if (match_int(&args[0], &option))
+				return 0;
+			gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(gid)) {
+				ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
+				return 0;
+			}
+			sbi->s_resgid = gid;
+			break;
+		case Opt_sb:
+			/* handled by get_sb_block() instead of here */
+			/* *sb_block = match_int(&args[0]); */
+			break;
+		case Opt_err_panic:
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			set_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			break;
+		case Opt_err_ro:
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_RO);
+			break;
+		case Opt_err_cont:
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_CONT);
+			break;
+		case Opt_nouid32:
+			set_opt (sbi->s_mount_opt, NO_UID32);
+			break;
+		case Opt_nocheck:
+			clear_opt (sbi->s_mount_opt, CHECK);
+			break;
+		case Opt_debug:
+			set_opt (sbi->s_mount_opt, DEBUG);
+			break;
+		case Opt_oldalloc:
+			ext3_msg(sb, KERN_WARNING,
+				"Ignoring deprecated oldalloc option");
+			break;
+		case Opt_orlov:
+			ext3_msg(sb, KERN_WARNING,
+				"Ignoring deprecated orlov option");
+			break;
+#ifdef CONFIG_EXT3_FS_XATTR
+		case Opt_user_xattr:
+			set_opt (sbi->s_mount_opt, XATTR_USER);
+			break;
+		case Opt_nouser_xattr:
+			clear_opt (sbi->s_mount_opt, XATTR_USER);
+			break;
+#else
+		case Opt_user_xattr:
+		case Opt_nouser_xattr:
+			ext3_msg(sb, KERN_INFO,
+				"(no)user_xattr options not supported");
+			break;
+#endif
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+		case Opt_acl:
+			set_opt(sbi->s_mount_opt, POSIX_ACL);
+			break;
+		case Opt_noacl:
+			clear_opt(sbi->s_mount_opt, POSIX_ACL);
+			break;
+#else
+		case Opt_acl:
+		case Opt_noacl:
+			ext3_msg(sb, KERN_INFO,
+				"(no)acl options not supported");
+			break;
+#endif
+		case Opt_reservation:
+			set_opt(sbi->s_mount_opt, RESERVATION);
+			break;
+		case Opt_noreservation:
+			clear_opt(sbi->s_mount_opt, RESERVATION);
+			break;
+		case Opt_journal_update:
+			/* @@@ FIXME */
+			/* Eventually we will want to be able to create
+			   a journal file here.  For now, only allow the
+			   user to specify an existing inode to be the
+			   journal file. */
+			if (is_remount) {
+				ext3_msg(sb, KERN_ERR, "error: cannot specify "
+					"journal on remount");
+				return 0;
+			}
+			set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
+			break;
+		case Opt_journal_inum:
+			if (is_remount) {
+				ext3_msg(sb, KERN_ERR, "error: cannot specify "
+				       "journal on remount");
+				return 0;
+			}
+			if (match_int(&args[0], &option))
+				return 0;
+			*inum = option;
+			break;
+		case Opt_journal_dev:
+			if (is_remount) {
+				ext3_msg(sb, KERN_ERR, "error: cannot specify "
+				       "journal on remount");
+				return 0;
+			}
+			if (match_int(&args[0], &option))
+				return 0;
+			*journal_devnum = option;
+			break;
+		case Opt_journal_path:
+			if (is_remount) {
+				ext3_msg(sb, KERN_ERR, "error: cannot specify "
+				       "journal on remount");
+				return 0;
+			}
+
+			journal_path = match_strdup(&args[0]);
+			if (!journal_path) {
+				ext3_msg(sb, KERN_ERR, "error: could not dup "
+					"journal device string");
+				return 0;
+			}
+
+			error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+			if (error) {
+				ext3_msg(sb, KERN_ERR, "error: could not find "
+					"journal device path: error %d", error);
+				kfree(journal_path);
+				return 0;
+			}
+
+			journal_inode = d_inode(path.dentry);
+			if (!S_ISBLK(journal_inode->i_mode)) {
+				ext3_msg(sb, KERN_ERR, "error: journal path %s "
+					"is not a block device", journal_path);
+				path_put(&path);
+				kfree(journal_path);
+				return 0;
+			}
+
+			*journal_devnum = new_encode_dev(journal_inode->i_rdev);
+			path_put(&path);
+			kfree(journal_path);
+			break;
+		case Opt_noload:
+			set_opt (sbi->s_mount_opt, NOLOAD);
+			break;
+		case Opt_commit:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = JBD_DEFAULT_MAX_COMMIT_AGE;
+			sbi->s_commit_interval = HZ * option;
+			break;
+		case Opt_data_journal:
+			data_opt = EXT3_MOUNT_JOURNAL_DATA;
+			goto datacheck;
+		case Opt_data_ordered:
+			data_opt = EXT3_MOUNT_ORDERED_DATA;
+			goto datacheck;
+		case Opt_data_writeback:
+			data_opt = EXT3_MOUNT_WRITEBACK_DATA;
+		datacheck:
+			if (is_remount) {
+				if (test_opt(sb, DATA_FLAGS) == data_opt)
+					break;
+				ext3_msg(sb, KERN_ERR,
+					"error: cannot change "
+					"data mode on remount. The filesystem "
+					"is mounted in data=%s mode and you "
+					"try to remount it in data=%s mode.",
+					data_mode_string(test_opt(sb,
+							DATA_FLAGS)),
+					data_mode_string(data_opt));
+				return 0;
+			} else {
+				clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+				sbi->s_mount_opt |= data_opt;
+			}
+			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+#ifdef CONFIG_QUOTA
+		case Opt_usrjquota:
+			if (!set_qf_name(sb, USRQUOTA, &args[0]))
+				return 0;
+			break;
+		case Opt_grpjquota:
+			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
+				return 0;
+			break;
+		case Opt_offusrjquota:
+			if (!clear_qf_name(sb, USRQUOTA))
+				return 0;
+			break;
+		case Opt_offgrpjquota:
+			if (!clear_qf_name(sb, GRPQUOTA))
+				return 0;
+			break;
+		case Opt_jqfmt_vfsold:
+			qfmt = QFMT_VFS_OLD;
+			goto set_qf_format;
+		case Opt_jqfmt_vfsv0:
+			qfmt = QFMT_VFS_V0;
+			goto set_qf_format;
+		case Opt_jqfmt_vfsv1:
+			qfmt = QFMT_VFS_V1;
+set_qf_format:
+			if (sb_any_quota_loaded(sb) &&
+			    sbi->s_jquota_fmt != qfmt) {
+				ext3_msg(sb, KERN_ERR, "error: cannot change "
+					"journaled quota options when "
+					"quota turned on.");
+				return 0;
+			}
+			sbi->s_jquota_fmt = qfmt;
+			break;
+		case Opt_quota:
+		case Opt_usrquota:
+			set_opt(sbi->s_mount_opt, QUOTA);
+			set_opt(sbi->s_mount_opt, USRQUOTA);
+			break;
+		case Opt_grpquota:
+			set_opt(sbi->s_mount_opt, QUOTA);
+			set_opt(sbi->s_mount_opt, GRPQUOTA);
+			break;
+		case Opt_noquota:
+			if (sb_any_quota_loaded(sb)) {
+				ext3_msg(sb, KERN_ERR, "error: cannot change "
+					"quota options when quota turned on.");
+				return 0;
+			}
+			clear_opt(sbi->s_mount_opt, QUOTA);
+			clear_opt(sbi->s_mount_opt, USRQUOTA);
+			clear_opt(sbi->s_mount_opt, GRPQUOTA);
+			break;
+#else
+		case Opt_quota:
+		case Opt_usrquota:
+		case Opt_grpquota:
+			ext3_msg(sb, KERN_ERR,
+				"error: quota options not supported.");
+			break;
+		case Opt_usrjquota:
+		case Opt_grpjquota:
+		case Opt_offusrjquota:
+		case Opt_offgrpjquota:
+		case Opt_jqfmt_vfsold:
+		case Opt_jqfmt_vfsv0:
+		case Opt_jqfmt_vfsv1:
+			ext3_msg(sb, KERN_ERR,
+				"error: journaled quota options not "
+				"supported.");
+			break;
+		case Opt_noquota:
+			break;
+#endif
+		case Opt_abort:
+			set_opt(sbi->s_mount_opt, ABORT);
+			break;
+		case Opt_nobarrier:
+			clear_opt(sbi->s_mount_opt, BARRIER);
+			break;
+		case Opt_barrier:
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
+			if (option)
+				set_opt(sbi->s_mount_opt, BARRIER);
+			else
+				clear_opt(sbi->s_mount_opt, BARRIER);
+			break;
+		case Opt_ignore:
+			break;
+		case Opt_resize:
+			if (!is_remount) {
+				ext3_msg(sb, KERN_ERR,
+					"error: resize option only available "
+					"for remount");
+				return 0;
+			}
+			if (match_int(&args[0], &option) != 0)
+				return 0;
+			*n_blocks_count = option;
+			break;
+		case Opt_nobh:
+			ext3_msg(sb, KERN_WARNING,
+				"warning: ignoring deprecated nobh option");
+			break;
+		case Opt_bh:
+			ext3_msg(sb, KERN_WARNING,
+				"warning: ignoring deprecated bh option");
+			break;
+		default:
+			ext3_msg(sb, KERN_ERR,
+				"error: unrecognized mount option \"%s\" "
+				"or missing value", p);
+			return 0;
+		}
+	}
+#ifdef CONFIG_QUOTA
+	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
+			clear_opt(sbi->s_mount_opt, USRQUOTA);
+		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
+			clear_opt(sbi->s_mount_opt, GRPQUOTA);
+
+		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
+			ext3_msg(sb, KERN_ERR, "error: old and new quota "
+					"format mixing.");
+			return 0;
+		}
+
+		if (!sbi->s_jquota_fmt) {
+			ext3_msg(sb, KERN_ERR, "error: journaled quota format "
+					"not specified.");
+			return 0;
+		}
+	}
+#endif
+	return 1;
+}
+
+static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
+			    int read_only)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int res = 0;
+
+	if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
+		ext3_msg(sb, KERN_ERR,
+			"error: revision level too high, "
+			"forcing read-only mode");
+		res = MS_RDONLY;
+	}
+	if (read_only)
+		return res;
+	if (!(sbi->s_mount_state & EXT3_VALID_FS))
+		ext3_msg(sb, KERN_WARNING,
+			"warning: mounting unchecked fs, "
+			"running e2fsck is recommended");
+	else if ((sbi->s_mount_state & EXT3_ERROR_FS))
+		ext3_msg(sb, KERN_WARNING,
+			"warning: mounting fs with errors, "
+			"running e2fsck is recommended");
+	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
+		 le16_to_cpu(es->s_mnt_count) >=
+			le16_to_cpu(es->s_max_mnt_count))
+		ext3_msg(sb, KERN_WARNING,
+			"warning: maximal mount count reached, "
+			"running e2fsck is recommended");
+	else if (le32_to_cpu(es->s_checkinterval) &&
+		(le32_to_cpu(es->s_lastcheck) +
+			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+		ext3_msg(sb, KERN_WARNING,
+			"warning: checktime reached, "
+			"running e2fsck is recommended");
+#if 0
+		/* @@@ We _will_ want to clear the valid bit if we find
+                   inconsistencies, to force a fsck at reboot.  But for
+                   a plain journaled filesystem we can keep it set as
+                   valid forever! :) */
+	es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
+#endif
+	if (!le16_to_cpu(es->s_max_mnt_count))
+		es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
+	le16_add_cpu(&es->s_mnt_count, 1);
+	es->s_mtime = cpu_to_le32(get_seconds());
+	ext3_update_dynamic_rev(sb);
+	EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+
+	ext3_commit_super(sb, es, 1);
+	if (test_opt(sb, DEBUG))
+		ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
+				"bpg=%lu, ipg=%lu, mo=%04lx]",
+			sb->s_blocksize,
+			sbi->s_groups_count,
+			EXT3_BLOCKS_PER_GROUP(sb),
+			EXT3_INODES_PER_GROUP(sb),
+			sbi->s_mount_opt);
+
+	if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+		char b[BDEVNAME_SIZE];
+		ext3_msg(sb, KERN_INFO, "using external journal on %s",
+			bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
+	} else {
+		ext3_msg(sb, KERN_INFO, "using internal journal");
+	}
+	cleancache_init_fs(sb);
+	return res;
+}
+
+/* Called at mount-time, super-block is locked */
+static int ext3_check_descriptors(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int i;
+
+	ext3_debug ("Checking group descriptors");
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
+		ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i);
+		ext3_fsblk_t last_block;
+
+		if (i == sbi->s_groups_count - 1)
+			last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
+		else
+			last_block = first_block +
+				(EXT3_BLOCKS_PER_GROUP(sb) - 1);
+
+		if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
+		    le32_to_cpu(gdp->bg_block_bitmap) > last_block)
+		{
+			ext3_error (sb, "ext3_check_descriptors",
+				    "Block bitmap for group %d"
+				    " not in group (block %lu)!",
+				    i, (unsigned long)
+					le32_to_cpu(gdp->bg_block_bitmap));
+			return 0;
+		}
+		if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
+		    le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
+		{
+			ext3_error (sb, "ext3_check_descriptors",
+				    "Inode bitmap for group %d"
+				    " not in group (block %lu)!",
+				    i, (unsigned long)
+					le32_to_cpu(gdp->bg_inode_bitmap));
+			return 0;
+		}
+		if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
+		    le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 >
+		    last_block)
+		{
+			ext3_error (sb, "ext3_check_descriptors",
+				    "Inode table for group %d"
+				    " not in group (block %lu)!",
+				    i, (unsigned long)
+					le32_to_cpu(gdp->bg_inode_table));
+			return 0;
+		}
+	}
+
+	sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb));
+	sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb));
+	return 1;
+}
+
+
+/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash.  We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode.  The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext3_free_inode().  The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+static void ext3_orphan_cleanup (struct super_block * sb,
+				 struct ext3_super_block * es)
+{
+	unsigned int s_flags = sb->s_flags;
+	int nr_orphans = 0, nr_truncates = 0;
+#ifdef CONFIG_QUOTA
+	int i;
+#endif
+	if (!es->s_last_orphan) {
+		jbd_debug(4, "no orphan inodes to clean up\n");
+		return;
+	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		ext3_msg(sb, KERN_ERR, "error: write access "
+			"unavailable, skipping orphan cleanup.");
+		return;
+	}
+
+	/* Check if feature set allows readwrite operations */
+	if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
+		ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+			 "unknown ROCOMPAT features");
+		return;
+	}
+
+	if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
+		/* don't clear list on RO mount w/ errors */
+		if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
+			jbd_debug(1, "Errors on filesystem, "
+				  "clearing orphan list.\n");
+			es->s_last_orphan = 0;
+		}
+		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+		return;
+	}
+
+	if (s_flags & MS_RDONLY) {
+		ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
+		sb->s_flags &= ~MS_RDONLY;
+	}
+#ifdef CONFIG_QUOTA
+	/* Needed for iput() to work correctly and not trash data */
+	sb->s_flags |= MS_ACTIVE;
+	/* Turn on quotas so that they are updated correctly */
+	for (i = 0; i < EXT3_MAXQUOTAS; i++) {
+		if (EXT3_SB(sb)->s_qf_names[i]) {
+			int ret = ext3_quota_on_mount(sb, i);
+			if (ret < 0)
+				ext3_msg(sb, KERN_ERR,
+					"error: cannot turn on journaled "
+					"quota: %d", ret);
+		}
+	}
+#endif
+
+	while (es->s_last_orphan) {
+		struct inode *inode;
+
+		inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+		if (IS_ERR(inode)) {
+			es->s_last_orphan = 0;
+			break;
+		}
+
+		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+		dquot_initialize(inode);
+		if (inode->i_nlink) {
+			printk(KERN_DEBUG
+				"%s: truncating inode %lu to %Ld bytes\n",
+				__func__, inode->i_ino, inode->i_size);
+			jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+				  inode->i_ino, inode->i_size);
+			ext3_truncate(inode);
+			nr_truncates++;
+		} else {
+			printk(KERN_DEBUG
+				"%s: deleting unreferenced inode %lu\n",
+				__func__, inode->i_ino);
+			jbd_debug(2, "deleting unreferenced inode %lu\n",
+				  inode->i_ino);
+			nr_orphans++;
+		}
+		iput(inode);  /* The delete magic happens here! */
+	}
+
+#define PLURAL(x) (x), ((x)==1) ? "" : "s"
+
+	if (nr_orphans)
+		ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+		       PLURAL(nr_orphans));
+	if (nr_truncates)
+		ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+		       PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+	/* Turn quotas off */
+	for (i = 0; i < EXT3_MAXQUOTAS; i++) {
+		if (sb_dqopt(sb)->files[i])
+			dquot_quota_off(sb, i);
+	}
+#endif
+	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+}
+
+/*
+ * Maximal file size.  There is a direct, and {,double-,triple-}indirect
+ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
+ * We need to be 1 filesystem block less than the 2^32 sector limit.
+ */
+static loff_t ext3_max_size(int bits)
+{
+	loff_t res = EXT3_NDIR_BLOCKS;
+	int meta_blocks;
+	loff_t upper_limit;
+
+	/* This is calculated to be the largest file size for a
+	 * dense, file such that the total number of
+	 * sectors in the file, including data and all indirect blocks,
+	 * does not exceed 2^32 -1
+	 * __u32 i_blocks representing the total number of
+	 * 512 bytes blocks of the file
+	 */
+	upper_limit = (1LL << 32) - 1;
+
+	/* total blocks in file system block size */
+	upper_limit >>= (bits - 9);
+
+
+	/* indirect blocks */
+	meta_blocks = 1;
+	/* double indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2));
+	/* tripple indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+
+	upper_limit -= meta_blocks;
+	upper_limit <<= bits;
+
+	res += 1LL << (bits-2);
+	res += 1LL << (2*(bits-2));
+	res += 1LL << (3*(bits-2));
+	res <<= bits;
+	if (res > upper_limit)
+		res = upper_limit;
+
+	if (res > MAX_LFS_FILESIZE)
+		res = MAX_LFS_FILESIZE;
+
+	return res;
+}
+
+static ext3_fsblk_t descriptor_loc(struct super_block *sb,
+				    ext3_fsblk_t logic_sb_block,
+				    int nr)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	unsigned long bg, first_meta_bg;
+	int has_super = 0;
+
+	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+
+	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
+	    nr < first_meta_bg)
+		return (logic_sb_block + nr + 1);
+	bg = sbi->s_desc_per_block * nr;
+	if (ext3_bg_has_super(sb, bg))
+		has_super = 1;
+	return (has_super + ext3_group_first_block_no(sb, bg));
+}
+
+
+static int ext3_fill_super (struct super_block *sb, void *data, int silent)
+{
+	struct buffer_head * bh;
+	struct ext3_super_block *es = NULL;
+	struct ext3_sb_info *sbi;
+	ext3_fsblk_t block;
+	ext3_fsblk_t sb_block = get_sb_block(&data, sb);
+	ext3_fsblk_t logic_sb_block;
+	unsigned long offset = 0;
+	unsigned int journal_inum = 0;
+	unsigned long journal_devnum = 0;
+	unsigned long def_mount_opts;
+	struct inode *root;
+	int blocksize;
+	int hblock;
+	int db_count;
+	int i;
+	int needs_recovery;
+	int ret = -EINVAL;
+	__le32 features;
+	int err;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		return -ENOMEM;
+	}
+	sb->s_fs_info = sbi;
+	sbi->s_sb_block = sb_block;
+
+	blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
+	if (!blocksize) {
+		ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
+		goto out_fail;
+	}
+
+	/*
+	 * The ext3 superblock will not be buffer aligned for other than 1kB
+	 * block sizes.  We need to calculate the offset from buffer start.
+	 */
+	if (blocksize != EXT3_MIN_BLOCK_SIZE) {
+		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
+		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
+	} else {
+		logic_sb_block = sb_block;
+	}
+
+	if (!(bh = sb_bread(sb, logic_sb_block))) {
+		ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
+		goto out_fail;
+	}
+	/*
+	 * Note: s_es must be initialized as soon as possible because
+	 *       some ext3 macro-instructions depend on its value
+	 */
+	es = (struct ext3_super_block *) (bh->b_data + offset);
+	sbi->s_es = es;
+	sb->s_magic = le16_to_cpu(es->s_magic);
+	if (sb->s_magic != EXT3_SUPER_MAGIC)
+		goto cantfind_ext3;
+
+	/* Set defaults before we parse the mount options */
+	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+	if (def_mount_opts & EXT3_DEFM_DEBUG)
+		set_opt(sbi->s_mount_opt, DEBUG);
+	if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
+		set_opt(sbi->s_mount_opt, GRPID);
+	if (def_mount_opts & EXT3_DEFM_UID16)
+		set_opt(sbi->s_mount_opt, NO_UID32);
+#ifdef CONFIG_EXT3_FS_XATTR
+	if (def_mount_opts & EXT3_DEFM_XATTR_USER)
+		set_opt(sbi->s_mount_opt, XATTR_USER);
+#endif
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+	if (def_mount_opts & EXT3_DEFM_ACL)
+		set_opt(sbi->s_mount_opt, POSIX_ACL);
+#endif
+	if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
+		set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+	else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
+		set_opt(sbi->s_mount_opt, ORDERED_DATA);
+	else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
+		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+
+	if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
+		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE)
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_RO);
+
+	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+
+	/* enable barriers by default */
+	set_opt(sbi->s_mount_opt, BARRIER);
+	set_opt(sbi->s_mount_opt, RESERVATION);
+
+	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
+			    NULL, 0))
+		goto failed_mount;
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+
+	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
+	    (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
+	     EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+	     EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+		ext3_msg(sb, KERN_WARNING,
+			"warning: feature flags set on rev 0 fs, "
+			"running e2fsck is recommended");
+	/*
+	 * Check feature flags regardless of the revision level, since we
+	 * previously didn't change the revision level when setting the flags,
+	 * so there is a chance incompat flags are set on a rev 0 filesystem.
+	 */
+	features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
+	if (features) {
+		ext3_msg(sb, KERN_ERR,
+			"error: couldn't mount because of unsupported "
+			"optional features (%x)", le32_to_cpu(features));
+		goto failed_mount;
+	}
+	features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
+	if (!(sb->s_flags & MS_RDONLY) && features) {
+		ext3_msg(sb, KERN_ERR,
+			"error: couldn't mount RDWR because of unsupported "
+			"optional features (%x)", le32_to_cpu(features));
+		goto failed_mount;
+	}
+	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+	if (blocksize < EXT3_MIN_BLOCK_SIZE ||
+	    blocksize > EXT3_MAX_BLOCK_SIZE) {
+		ext3_msg(sb, KERN_ERR,
+			"error: couldn't mount because of unsupported "
+			"filesystem blocksize %d", blocksize);
+		goto failed_mount;
+	}
+
+	hblock = bdev_logical_block_size(sb->s_bdev);
+	if (sb->s_blocksize != blocksize) {
+		/*
+		 * Make sure the blocksize for the filesystem is larger
+		 * than the hardware sectorsize for the machine.
+		 */
+		if (blocksize < hblock) {
+			ext3_msg(sb, KERN_ERR,
+				"error: fsblocksize %d too small for "
+				"hardware sectorsize %d", blocksize, hblock);
+			goto failed_mount;
+		}
+
+		brelse (bh);
+		if (!sb_set_blocksize(sb, blocksize)) {
+			ext3_msg(sb, KERN_ERR,
+				"error: bad blocksize %d", blocksize);
+			goto out_fail;
+		}
+		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
+		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
+		bh = sb_bread(sb, logic_sb_block);
+		if (!bh) {
+			ext3_msg(sb, KERN_ERR,
+			       "error: can't read superblock on 2nd try");
+			goto failed_mount;
+		}
+		es = (struct ext3_super_block *)(bh->b_data + offset);
+		sbi->s_es = es;
+		if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
+			ext3_msg(sb, KERN_ERR,
+				"error: magic mismatch");
+			goto failed_mount;
+		}
+	}
+
+	sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
+
+	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
+		sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
+		sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
+	} else {
+		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+		if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
+		    (!is_power_of_2(sbi->s_inode_size)) ||
+		    (sbi->s_inode_size > blocksize)) {
+			ext3_msg(sb, KERN_ERR,
+				"error: unsupported inode size: %d",
+				sbi->s_inode_size);
+			goto failed_mount;
+		}
+	}
+	sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
+				   le32_to_cpu(es->s_log_frag_size);
+	if (blocksize != sbi->s_frag_size) {
+		ext3_msg(sb, KERN_ERR,
+		       "error: fragsize %lu != blocksize %u (unsupported)",
+		       sbi->s_frag_size, blocksize);
+		goto failed_mount;
+	}
+	sbi->s_frags_per_block = 1;
+	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
+	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+	if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
+		goto cantfind_ext3;
+	sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
+	if (sbi->s_inodes_per_block == 0)
+		goto cantfind_ext3;
+	sbi->s_itb_per_group = sbi->s_inodes_per_group /
+					sbi->s_inodes_per_block;
+	sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
+	sbi->s_sbh = bh;
+	sbi->s_mount_state = le16_to_cpu(es->s_state);
+	sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
+	sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
+	for (i=0; i < 4; i++)
+		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
+	sbi->s_def_hash_version = es->s_def_hash_version;
+	i = le32_to_cpu(es->s_flags);
+	if (i & EXT2_FLAGS_UNSIGNED_HASH)
+		sbi->s_hash_unsigned = 3;
+	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+		sbi->s_hash_unsigned = 3;
+#else
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+	}
+
+	if (sbi->s_blocks_per_group > blocksize * 8) {
+		ext3_msg(sb, KERN_ERR,
+			"#blocks per group too big: %lu",
+			sbi->s_blocks_per_group);
+		goto failed_mount;
+	}
+	if (sbi->s_frags_per_group > blocksize * 8) {
+		ext3_msg(sb, KERN_ERR,
+			"error: #fragments per group too big: %lu",
+			sbi->s_frags_per_group);
+		goto failed_mount;
+	}
+	if (sbi->s_inodes_per_group > blocksize * 8) {
+		ext3_msg(sb, KERN_ERR,
+			"error: #inodes per group too big: %lu",
+			sbi->s_inodes_per_group);
+		goto failed_mount;
+	}
+
+	err = generic_check_addressable(sb->s_blocksize_bits,
+					le32_to_cpu(es->s_blocks_count));
+	if (err) {
+		ext3_msg(sb, KERN_ERR,
+			"error: filesystem is too large to mount safely");
+		if (sizeof(sector_t) < 8)
+			ext3_msg(sb, KERN_ERR,
+				"error: CONFIG_LBDAF not enabled");
+		ret = err;
+		goto failed_mount;
+	}
+
+	if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
+		goto cantfind_ext3;
+	sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
+			       le32_to_cpu(es->s_first_data_block) - 1)
+				       / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
+	db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
+	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
+				    GFP_KERNEL);
+	if (sbi->s_group_desc == NULL) {
+		ext3_msg(sb, KERN_ERR,
+			"error: not enough memory");
+		ret = -ENOMEM;
+		goto failed_mount;
+	}
+
+	bgl_lock_init(sbi->s_blockgroup_lock);
+
+	for (i = 0; i < db_count; i++) {
+		block = descriptor_loc(sb, logic_sb_block, i);
+		sbi->s_group_desc[i] = sb_bread(sb, block);
+		if (!sbi->s_group_desc[i]) {
+			ext3_msg(sb, KERN_ERR,
+				"error: can't read group descriptor %d", i);
+			db_count = i;
+			goto failed_mount2;
+		}
+	}
+	if (!ext3_check_descriptors (sb)) {
+		ext3_msg(sb, KERN_ERR,
+			"error: group descriptors corrupted");
+		goto failed_mount2;
+	}
+	sbi->s_gdb_count = db_count;
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	/* per fileystem reservation list head & lock */
+	spin_lock_init(&sbi->s_rsv_window_lock);
+	sbi->s_rsv_window_root = RB_ROOT;
+	/* Add a single, static dummy reservation to the start of the
+	 * reservation window list --- it gives us a placeholder for
+	 * append-at-start-of-list which makes the allocation logic
+	 * _much_ simpler. */
+	sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+	sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
+	sbi->s_rsv_window_head.rsv_goal_size = 0;
+	ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
+
+	/*
+	 * set up enough so that it can read an inode
+	 */
+	sb->s_op = &ext3_sops;
+	sb->s_export_op = &ext3_export_ops;
+	sb->s_xattr = ext3_xattr_handlers;
+#ifdef CONFIG_QUOTA
+	sb->s_qcop = &ext3_qctl_operations;
+	sb->dq_op = &ext3_quota_operations;
+	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+#endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+	mutex_init(&sbi->s_orphan_lock);
+	mutex_init(&sbi->s_resize_lock);
+
+	sb->s_root = NULL;
+
+	needs_recovery = (es->s_last_orphan != 0 ||
+			  EXT3_HAS_INCOMPAT_FEATURE(sb,
+				    EXT3_FEATURE_INCOMPAT_RECOVER));
+
+	/*
+	 * The first inode we look at is the journal inode.  Don't try
+	 * root first: it may be modified in the journal!
+	 */
+	if (!test_opt(sb, NOLOAD) &&
+	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+		if (ext3_load_journal(sb, es, journal_devnum))
+			goto failed_mount2;
+	} else if (journal_inum) {
+		if (ext3_create_journal(sb, es, journal_inum))
+			goto failed_mount2;
+	} else {
+		if (!silent)
+			ext3_msg(sb, KERN_ERR,
+				"error: no journal found. "
+				"mounting ext3 over ext2?");
+		goto failed_mount2;
+	}
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+			ext3_count_free_blocks(sb), GFP_KERNEL);
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext3_count_free_inodes(sb), GFP_KERNEL);
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+				ext3_count_dirs(sb), GFP_KERNEL);
+	}
+	if (err) {
+		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
+		ret = err;
+		goto failed_mount3;
+	}
+
+	/* We have now updated the journal if required, so we can
+	 * validate the data journaling mode. */
+	switch (test_opt(sb, DATA_FLAGS)) {
+	case 0:
+		/* No mode set, assume a default based on the journal
+                   capabilities: ORDERED_DATA if the journal can
+                   cope, else JOURNAL_DATA */
+		if (journal_check_available_features
+		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
+			set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
+		else
+			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+		break;
+
+	case EXT3_MOUNT_ORDERED_DATA:
+	case EXT3_MOUNT_WRITEBACK_DATA:
+		if (!journal_check_available_features
+		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
+			ext3_msg(sb, KERN_ERR,
+				"error: journal does not support "
+				"requested data journaling mode");
+			goto failed_mount3;
+		}
+	default:
+		break;
+	}
+
+	/*
+	 * The journal_load will have done any necessary log recovery,
+	 * so we can safely mount the rest of the filesystem now.
+	 */
+
+	root = ext3_iget(sb, EXT3_ROOT_INO);
+	if (IS_ERR(root)) {
+		ext3_msg(sb, KERN_ERR, "error: get root inode failed");
+		ret = PTR_ERR(root);
+		goto failed_mount3;
+	}
+	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+		iput(root);
+		ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
+		goto failed_mount3;
+	}
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
+		ret = -ENOMEM;
+		goto failed_mount3;
+	}
+
+	if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+		sb->s_flags |= MS_RDONLY;
+
+	EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
+	ext3_orphan_cleanup(sb, es);
+	EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
+	if (needs_recovery) {
+		ext3_mark_recovery_complete(sb, es);
+		ext3_msg(sb, KERN_INFO, "recovery complete");
+	}
+	ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
+		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
+		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+		"writeback");
+
+	return 0;
+
+cantfind_ext3:
+	if (!silent)
+		ext3_msg(sb, KERN_INFO,
+			"error: can't find ext3 filesystem on dev %s.",
+		       sb->s_id);
+	goto failed_mount;
+
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	journal_destroy(sbi->s_journal);
+failed_mount2:
+	for (i = 0; i < db_count; i++)
+		brelse(sbi->s_group_desc[i]);
+	kfree(sbi->s_group_desc);
+failed_mount:
+#ifdef CONFIG_QUOTA
+	for (i = 0; i < EXT3_MAXQUOTAS; i++)
+		kfree(sbi->s_qf_names[i]);
+#endif
+	ext3_blkdev_remove(sbi);
+	brelse(bh);
+out_fail:
+	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
+	kfree(sbi);
+	return ret;
+}
+
+/*
+ * Setup any per-fs journal parameters now.  We'll do this both on
+ * initial mount, once the journal has been initialised but before we've
+ * done any recovery; and again on any subsequent remount.
+ */
+static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	if (sbi->s_commit_interval)
+		journal->j_commit_interval = sbi->s_commit_interval;
+	/* We could also set up an ext3-specific default for the commit
+	 * interval here, but for now we'll just fall back to the jbd
+	 * default. */
+
+	spin_lock(&journal->j_state_lock);
+	if (test_opt(sb, BARRIER))
+		journal->j_flags |= JFS_BARRIER;
+	else
+		journal->j_flags &= ~JFS_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
+	spin_unlock(&journal->j_state_lock);
+}
+
+static journal_t *ext3_get_journal(struct super_block *sb,
+				   unsigned int journal_inum)
+{
+	struct inode *journal_inode;
+	journal_t *journal;
+
+	/* First, test for the existence of a valid inode on disk.  Bad
+	 * things happen if we iget() an unused inode, as the subsequent
+	 * iput() will try to delete it. */
+
+	journal_inode = ext3_iget(sb, journal_inum);
+	if (IS_ERR(journal_inode)) {
+		ext3_msg(sb, KERN_ERR, "error: no journal found");
+		return NULL;
+	}
+	if (!journal_inode->i_nlink) {
+		make_bad_inode(journal_inode);
+		iput(journal_inode);
+		ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
+		return NULL;
+	}
+
+	jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+		  journal_inode, journal_inode->i_size);
+	if (!S_ISREG(journal_inode->i_mode)) {
+		ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
+		iput(journal_inode);
+		return NULL;
+	}
+
+	journal = journal_init_inode(journal_inode);
+	if (!journal) {
+		ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
+		iput(journal_inode);
+		return NULL;
+	}
+	journal->j_private = sb;
+	ext3_init_journal_params(sb, journal);
+	return journal;
+}
+
+static journal_t *ext3_get_dev_journal(struct super_block *sb,
+				       dev_t j_dev)
+{
+	struct buffer_head * bh;
+	journal_t *journal;
+	ext3_fsblk_t start;
+	ext3_fsblk_t len;
+	int hblock, blocksize;
+	ext3_fsblk_t sb_block;
+	unsigned long offset;
+	struct ext3_super_block * es;
+	struct block_device *bdev;
+
+	bdev = ext3_blkdev_get(j_dev, sb);
+	if (bdev == NULL)
+		return NULL;
+
+	blocksize = sb->s_blocksize;
+	hblock = bdev_logical_block_size(bdev);
+	if (blocksize < hblock) {
+		ext3_msg(sb, KERN_ERR,
+			"error: blocksize too small for journal device");
+		goto out_bdev;
+	}
+
+	sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
+	offset = EXT3_MIN_BLOCK_SIZE % blocksize;
+	set_blocksize(bdev, blocksize);
+	if (!(bh = __bread(bdev, sb_block, blocksize))) {
+		ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
+			"external journal");
+		goto out_bdev;
+	}
+
+	es = (struct ext3_super_block *) (bh->b_data + offset);
+	if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
+	    !(le32_to_cpu(es->s_feature_incompat) &
+	      EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
+		ext3_msg(sb, KERN_ERR, "error: external journal has "
+			"bad superblock");
+		brelse(bh);
+		goto out_bdev;
+	}
+
+	if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
+		ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
+		brelse(bh);
+		goto out_bdev;
+	}
+
+	len = le32_to_cpu(es->s_blocks_count);
+	start = sb_block + 1;
+	brelse(bh);	/* we're done with the superblock */
+
+	journal = journal_init_dev(bdev, sb->s_bdev,
+					start, len, blocksize);
+	if (!journal) {
+		ext3_msg(sb, KERN_ERR,
+			"error: failed to create device journal");
+		goto out_bdev;
+	}
+	journal->j_private = sb;
+	if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
+		if (bh_submit_read(journal->j_sb_buffer)) {
+			ext3_msg(sb, KERN_ERR, "I/O error on journal device");
+			goto out_journal;
+		}
+	}
+	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
+		ext3_msg(sb, KERN_ERR,
+			"error: external journal has more than one "
+			"user (unsupported) - %d",
+			be32_to_cpu(journal->j_superblock->s_nr_users));
+		goto out_journal;
+	}
+	EXT3_SB(sb)->journal_bdev = bdev;
+	ext3_init_journal_params(sb, journal);
+	return journal;
+out_journal:
+	journal_destroy(journal);
+out_bdev:
+	ext3_blkdev_put(bdev);
+	return NULL;
+}
+
+static int ext3_load_journal(struct super_block *sb,
+			     struct ext3_super_block *es,
+			     unsigned long journal_devnum)
+{
+	journal_t *journal;
+	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
+	dev_t journal_dev;
+	int err = 0;
+	int really_read_only;
+
+	if (journal_devnum &&
+	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+		ext3_msg(sb, KERN_INFO, "external journal device major/minor "
+			"numbers have changed");
+		journal_dev = new_decode_dev(journal_devnum);
+	} else
+		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
+
+	really_read_only = bdev_read_only(sb->s_bdev);
+
+	/*
+	 * Are we loading a blank journal or performing recovery after a
+	 * crash?  For recovery, we need to check in advance whether we
+	 * can get read-write access to the device.
+	 */
+
+	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
+		if (sb->s_flags & MS_RDONLY) {
+			ext3_msg(sb, KERN_INFO,
+				"recovery required on readonly filesystem");
+			if (really_read_only) {
+				ext3_msg(sb, KERN_ERR, "error: write access "
+					"unavailable, cannot proceed");
+				return -EROFS;
+			}
+			ext3_msg(sb, KERN_INFO,
+				"write access will be enabled during recovery");
+		}
+	}
+
+	if (journal_inum && journal_dev) {
+		ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
+		       "and inode journals");
+		return -EINVAL;
+	}
+
+	if (journal_inum) {
+		if (!(journal = ext3_get_journal(sb, journal_inum)))
+			return -EINVAL;
+	} else {
+		if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
+			return -EINVAL;
+	}
+
+	if (!(journal->j_flags & JFS_BARRIER))
+		printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
+
+	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
+		err = journal_update_format(journal);
+		if (err)  {
+			ext3_msg(sb, KERN_ERR, "error updating journal");
+			journal_destroy(journal);
+			return err;
+		}
+	}
+
+	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
+		err = journal_wipe(journal, !really_read_only);
+	if (!err)
+		err = journal_load(journal);
+
+	if (err) {
+		ext3_msg(sb, KERN_ERR, "error loading journal");
+		journal_destroy(journal);
+		return err;
+	}
+
+	EXT3_SB(sb)->s_journal = journal;
+	ext3_clear_journal_err(sb, es);
+
+	if (!really_read_only && journal_devnum &&
+	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+		es->s_journal_dev = cpu_to_le32(journal_devnum);
+
+		/* Make sure we flush the recovery flag to disk. */
+		ext3_commit_super(sb, es, 1);
+	}
+
+	return 0;
+}
+
+static int ext3_create_journal(struct super_block *sb,
+			       struct ext3_super_block *es,
+			       unsigned int journal_inum)
+{
+	journal_t *journal;
+	int err;
+
+	if (sb->s_flags & MS_RDONLY) {
+		ext3_msg(sb, KERN_ERR,
+			"error: readonly filesystem when trying to "
+			"create journal");
+		return -EROFS;
+	}
+
+	journal = ext3_get_journal(sb, journal_inum);
+	if (!journal)
+		return -EINVAL;
+
+	ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
+	       journal_inum);
+
+	err = journal_create(journal);
+	if (err) {
+		ext3_msg(sb, KERN_ERR, "error creating journal");
+		journal_destroy(journal);
+		return -EIO;
+	}
+
+	EXT3_SB(sb)->s_journal = journal;
+
+	ext3_update_dynamic_rev(sb);
+	EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+	EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
+
+	es->s_journal_inum = cpu_to_le32(journal_inum);
+
+	/* Make sure we flush the recovery flag to disk. */
+	ext3_commit_super(sb, es, 1);
+
+	return 0;
+}
+
+static int ext3_commit_super(struct super_block *sb,
+			       struct ext3_super_block *es,
+			       int sync)
+{
+	struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
+	int error = 0;
+
+	if (!sbh)
+		return error;
+
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		ext3_msg(sb, KERN_ERR, "previous I/O error to "
+		       "superblock detected");
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
+	/*
+	 * If the file system is mounted read-only, don't update the
+	 * superblock write time.  This avoids updating the superblock
+	 * write time when we are mounting the root file system
+	 * read/only but we need to replay the journal; at that point,
+	 * for people who are east of GMT and who make their clock
+	 * tick in localtime for Windows bug-for-bug compatibility,
+	 * the clock is set in the future, and this will cause e2fsck
+	 * to complain and force a full file system check.
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
+		es->s_wtime = cpu_to_le32(get_seconds());
+	es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
+	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
+	BUFFER_TRACE(sbh, "marking dirty");
+	mark_buffer_dirty(sbh);
+	if (sync) {
+		error = sync_dirty_buffer(sbh);
+		if (buffer_write_io_error(sbh)) {
+			ext3_msg(sb, KERN_ERR, "I/O error while writing "
+			       "superblock");
+			clear_buffer_write_io_error(sbh);
+			set_buffer_uptodate(sbh);
+		}
+	}
+	return error;
+}
+
+
+/*
+ * Have we just finished recovery?  If so, and if we are mounting (or
+ * remounting) the filesystem readonly, then we will end up with a
+ * consistent fs on disk.  Record that fact.
+ */
+static void ext3_mark_recovery_complete(struct super_block * sb,
+					struct ext3_super_block * es)
+{
+	journal_t *journal = EXT3_SB(sb)->s_journal;
+
+	journal_lock_updates(journal);
+	if (journal_flush(journal) < 0)
+		goto out;
+
+	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
+	    sb->s_flags & MS_RDONLY) {
+		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+		ext3_commit_super(sb, es, 1);
+	}
+
+out:
+	journal_unlock_updates(journal);
+}
+
+/*
+ * If we are mounting (or read-write remounting) a filesystem whose journal
+ * has recorded an error from a previous lifetime, move that error to the
+ * main filesystem now.
+ */
+static void ext3_clear_journal_err(struct super_block *sb,
+				   struct ext3_super_block *es)
+{
+	journal_t *journal;
+	int j_errno;
+	const char *errstr;
+
+	journal = EXT3_SB(sb)->s_journal;
+
+	/*
+	 * Now check for any error status which may have been recorded in the
+	 * journal by a prior ext3_error() or ext3_abort()
+	 */
+
+	j_errno = journal_errno(journal);
+	if (j_errno) {
+		char nbuf[16];
+
+		errstr = ext3_decode_error(sb, j_errno, nbuf);
+		ext3_warning(sb, __func__, "Filesystem error recorded "
+			     "from previous mount: %s", errstr);
+		ext3_warning(sb, __func__, "Marking fs in need of "
+			     "filesystem check.");
+
+		EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+		es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+		ext3_commit_super (sb, es, 1);
+
+		journal_clear_err(journal);
+	}
+}
+
+/*
+ * Force the running and committing transactions to commit,
+ * and wait on the commit.
+ */
+int ext3_force_commit(struct super_block *sb)
+{
+	journal_t *journal;
+	int ret;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	journal = EXT3_SB(sb)->s_journal;
+	ret = ext3_journal_force_commit(journal);
+	return ret;
+}
+
+static int ext3_sync_fs(struct super_block *sb, int wait)
+{
+	tid_t target;
+
+	trace_ext3_sync_fs(sb, wait);
+	/*
+	 * Writeback quota in non-journalled quota case - journalled quota has
+	 * no dirty dquots
+	 */
+	dquot_writeback_dquots(sb, -1);
+	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
+		if (wait)
+			log_wait_commit(EXT3_SB(sb)->s_journal, target);
+	}
+	return 0;
+}
+
+/*
+ * LVM calls this function before a (read-only) snapshot is created.  This
+ * gives us a chance to flush the journal completely and mark the fs clean.
+ */
+static int ext3_freeze(struct super_block *sb)
+{
+	int error = 0;
+	journal_t *journal;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		journal = EXT3_SB(sb)->s_journal;
+
+		/* Now we set up the journal barrier. */
+		journal_lock_updates(journal);
+
+		/*
+		 * We don't want to clear needs_recovery flag when we failed
+		 * to flush the journal.
+		 */
+		error = journal_flush(journal);
+		if (error < 0)
+			goto out;
+
+		/* Journal blocked and flushed, clear needs_recovery flag. */
+		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+		error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+		if (error)
+			goto out;
+	}
+	return 0;
+
+out:
+	journal_unlock_updates(journal);
+	return error;
+}
+
+/*
+ * Called by LVM after the snapshot is done.  We need to reset the RECOVER
+ * flag here, even though the filesystem is not technically dirty yet.
+ */
+static int ext3_unfreeze(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/* Reser the needs_recovery flag before the fs is unlocked. */
+		EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+		ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+	}
+	return 0;
+}
+
+static int ext3_remount (struct super_block * sb, int * flags, char * data)
+{
+	struct ext3_super_block * es;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	ext3_fsblk_t n_blocks_count = 0;
+	unsigned long old_sb_flags;
+	struct ext3_mount_options old_opts;
+	int enable_quota = 0;
+	int err;
+#ifdef CONFIG_QUOTA
+	int i;
+#endif
+
+	sync_filesystem(sb);
+
+	/* Store the original options */
+	old_sb_flags = sb->s_flags;
+	old_opts.s_mount_opt = sbi->s_mount_opt;
+	old_opts.s_resuid = sbi->s_resuid;
+	old_opts.s_resgid = sbi->s_resgid;
+	old_opts.s_commit_interval = sbi->s_commit_interval;
+#ifdef CONFIG_QUOTA
+	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
+	for (i = 0; i < EXT3_MAXQUOTAS; i++)
+		if (sbi->s_qf_names[i]) {
+			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+							 GFP_KERNEL);
+			if (!old_opts.s_qf_names[i]) {
+				int j;
+
+				for (j = 0; j < i; j++)
+					kfree(old_opts.s_qf_names[j]);
+				return -ENOMEM;
+			}
+		} else
+			old_opts.s_qf_names[i] = NULL;
+#endif
+
+	/*
+	 * Allow the "check" option to be passed as a remount option.
+	 */
+	if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
+	if (test_opt(sb, ABORT))
+		ext3_abort(sb, __func__, "Abort forced by user");
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+
+	es = sbi->s_es;
+
+	ext3_init_journal_params(sb, sbi->s_journal);
+
+	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
+		n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
+		if (test_opt(sb, ABORT)) {
+			err = -EROFS;
+			goto restore_opts;
+		}
+
+		if (*flags & MS_RDONLY) {
+			err = dquot_suspend(sb, -1);
+			if (err < 0)
+				goto restore_opts;
+
+			/*
+			 * First of all, the unconditional stuff we have to do
+			 * to disable replay of the journal when we next remount
+			 */
+			sb->s_flags |= MS_RDONLY;
+
+			/*
+			 * OK, test if we are remounting a valid rw partition
+			 * readonly, and if so set the rdonly flag and then
+			 * mark the partition as valid again.
+			 */
+			if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
+			    (sbi->s_mount_state & EXT3_VALID_FS))
+				es->s_state = cpu_to_le16(sbi->s_mount_state);
+
+			ext3_mark_recovery_complete(sb, es);
+		} else {
+			__le32 ret;
+			if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
+					~EXT3_FEATURE_RO_COMPAT_SUPP))) {
+				ext3_msg(sb, KERN_WARNING,
+					"warning: couldn't remount RDWR "
+					"because of unsupported optional "
+					"features (%x)", le32_to_cpu(ret));
+				err = -EROFS;
+				goto restore_opts;
+			}
+
+			/*
+			 * If we have an unprocessed orphan list hanging
+			 * around from a previously readonly bdev mount,
+			 * require a full umount & mount for now.
+			 */
+			if (es->s_last_orphan) {
+				ext3_msg(sb, KERN_WARNING, "warning: couldn't "
+				       "remount RDWR because of unprocessed "
+				       "orphan inode list.  Please "
+				       "umount & mount instead.");
+				err = -EINVAL;
+				goto restore_opts;
+			}
+
+			/*
+			 * Mounting a RDONLY partition read-write, so reread
+			 * and store the current valid flag.  (It may have
+			 * been changed by e2fsck since we originally mounted
+			 * the partition.)
+			 */
+			ext3_clear_journal_err(sb, es);
+			sbi->s_mount_state = le16_to_cpu(es->s_state);
+			if ((err = ext3_group_extend(sb, es, n_blocks_count)))
+				goto restore_opts;
+			if (!ext3_setup_super (sb, es, 0))
+				sb->s_flags &= ~MS_RDONLY;
+			enable_quota = 1;
+		}
+	}
+#ifdef CONFIG_QUOTA
+	/* Release old quota file names */
+	for (i = 0; i < EXT3_MAXQUOTAS; i++)
+		kfree(old_opts.s_qf_names[i]);
+#endif
+	if (enable_quota)
+		dquot_resume(sb, -1);
+	return 0;
+restore_opts:
+	sb->s_flags = old_sb_flags;
+	sbi->s_mount_opt = old_opts.s_mount_opt;
+	sbi->s_resuid = old_opts.s_resuid;
+	sbi->s_resgid = old_opts.s_resgid;
+	sbi->s_commit_interval = old_opts.s_commit_interval;
+#ifdef CONFIG_QUOTA
+	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
+	for (i = 0; i < EXT3_MAXQUOTAS; i++) {
+		kfree(sbi->s_qf_names[i]);
+		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
+	}
+#endif
+	return err;
+}
+
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_super_block *es = sbi->s_es;
+	u64 fsid;
+
+	if (test_opt(sb, MINIX_DF)) {
+		sbi->s_overhead_last = 0;
+	} else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
+		unsigned long ngroups = sbi->s_groups_count, i;
+		ext3_fsblk_t overhead = 0;
+		smp_rmb();
+
+		/*
+		 * Compute the overhead (FS structures).  This is constant
+		 * for a given filesystem unless the number of block groups
+		 * changes so we cache the previous value until it does.
+		 */
+
+		/*
+		 * All of the blocks before first_data_block are
+		 * overhead
+		 */
+		overhead = le32_to_cpu(es->s_first_data_block);
+
+		/*
+		 * Add the overhead attributed to the superblock and
+		 * block group descriptors.  If the sparse superblocks
+		 * feature is turned on, then not all groups have this.
+		 */
+		for (i = 0; i < ngroups; i++) {
+			overhead += ext3_bg_has_super(sb, i) +
+				ext3_bg_num_gdb(sb, i);
+			cond_resched();
+		}
+
+		/*
+		 * Every block group has an inode bitmap, a block
+		 * bitmap, and an inode table.
+		 */
+		overhead += ngroups * (2 + sbi->s_itb_per_group);
+
+		/* Add the internal journal blocks as well */
+		if (sbi->s_journal && !sbi->journal_bdev)
+			overhead += sbi->s_journal->j_maxlen;
+
+		sbi->s_overhead_last = overhead;
+		smp_wmb();
+		sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
+	}
+
+	buf->f_type = EXT3_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
+	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
+		buf->f_bavail = 0;
+	buf->f_files = le32_to_cpu(es->s_inodes_count);
+	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
+	buf->f_namelen = EXT3_NAME_LEN;
+	fsid = le64_to_cpup((void *)es->s_uuid) ^
+	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	return 0;
+}
+
+/* Helper function for writing quotas on sync - we need to start transaction before quota file
+ * is locked for write. Otherwise the are possible deadlocks:
+ * Process 1                         Process 2
+ * ext3_create()                     quota_sync()
+ *   journal_start()                   write_dquot()
+ *   dquot_initialize()                       down(dqio_mutex)
+ *     down(dqio_mutex)                    journal_start()
+ *
+ */
+
+#ifdef CONFIG_QUOTA
+
+static inline struct inode *dquot_to_inode(struct dquot *dquot)
+{
+	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
+}
+
+static int ext3_write_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+	struct inode *inode;
+
+	inode = dquot_to_inode(dquot);
+	handle = ext3_journal_start(inode,
+					EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_commit(dquot);
+	err = ext3_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext3_acquire_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+
+	handle = ext3_journal_start(dquot_to_inode(dquot),
+					EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_acquire(dquot);
+	err = ext3_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext3_release_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+
+	handle = ext3_journal_start(dquot_to_inode(dquot),
+					EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle)) {
+		/* Release dquot anyway to avoid endless cycle in dqput() */
+		dquot_release(dquot);
+		return PTR_ERR(handle);
+	}
+	ret = dquot_release(dquot);
+	err = ext3_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext3_mark_dquot_dirty(struct dquot *dquot)
+{
+	/* Are we journaling quotas? */
+	if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
+	    EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+		dquot_mark_dquot_dirty(dquot);
+		return ext3_write_dquot(dquot);
+	} else {
+		return dquot_mark_dquot_dirty(dquot);
+	}
+}
+
+static int ext3_write_info(struct super_block *sb, int type)
+{
+	int ret, err;
+	handle_t *handle;
+
+	/* Data block + inode block */
+	handle = ext3_journal_start(d_inode(sb->s_root), 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_commit_info(sb, type);
+	err = ext3_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+/*
+ * Turn on quotas during mount time - we need to find
+ * the quota file and such...
+ */
+static int ext3_quota_on_mount(struct super_block *sb, int type)
+{
+	return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
+					EXT3_SB(sb)->s_jquota_fmt, type);
+}
+
+/*
+ * Standard function to be called on quota_on
+ */
+static int ext3_quota_on(struct super_block *sb, int type, int format_id,
+			 struct path *path)
+{
+	int err;
+
+	if (!test_opt(sb, QUOTA))
+		return -EINVAL;
+
+	/* Quotafile not on the same filesystem? */
+	if (path->dentry->d_sb != sb)
+		return -EXDEV;
+	/* Journaling quota? */
+	if (EXT3_SB(sb)->s_qf_names[type]) {
+		/* Quotafile not of fs root? */
+		if (path->dentry->d_parent != sb->s_root)
+			ext3_msg(sb, KERN_WARNING,
+				"warning: Quota file not on filesystem root. "
+				"Journaled quota will not work.");
+	}
+
+	/*
+	 * When we journal data on quota file, we have to flush journal to see
+	 * all updates to the file when we bypass pagecache...
+	 */
+	if (ext3_should_journal_data(d_inode(path->dentry))) {
+		/*
+		 * We don't need to lock updates but journal_flush() could
+		 * otherwise be livelocked...
+		 */
+		journal_lock_updates(EXT3_SB(sb)->s_journal);
+		err = journal_flush(EXT3_SB(sb)->s_journal);
+		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err)
+			return err;
+	}
+
+	return dquot_quota_on(sb, type, format_id, path);
+}
+
+/* Read data from quotafile - avoid pagecache and such because we cannot afford
+ * acquiring the locks... As quota files are never truncated and quota code
+ * itself serializes the operations (and no one else should touch the files)
+ * we don't have to be afraid of races */
+static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
+	int err = 0;
+	int offset = off & (sb->s_blocksize - 1);
+	int tocopy;
+	size_t toread;
+	struct buffer_head *bh;
+	loff_t i_size = i_size_read(inode);
+
+	if (off > i_size)
+		return 0;
+	if (off+len > i_size)
+		len = i_size-off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = sb->s_blocksize - offset < toread ?
+				sb->s_blocksize - offset : toread;
+		bh = ext3_bread(NULL, inode, blk, 0, &err);
+		if (err)
+			return err;
+		if (!bh)	/* A hole? */
+			memset(data, 0, tocopy);
+		else
+			memcpy(data, bh->b_data+offset, tocopy);
+		brelse(bh);
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+static ssize_t ext3_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
+	int err = 0;
+	int offset = off & (sb->s_blocksize - 1);
+	int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
+	struct buffer_head *bh;
+	handle_t *handle = journal_current_handle();
+
+	if (!handle) {
+		ext3_msg(sb, KERN_WARNING,
+			"warning: quota write (off=%llu, len=%llu)"
+			" cancelled because transaction is not started.",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	bh = ext3_bread(handle, inode, blk, 1, &err);
+	if (!bh)
+		goto out;
+	if (journal_quota) {
+		err = ext3_journal_get_write_access(handle, bh);
+		if (err) {
+			brelse(bh);
+			goto out;
+		}
+	}
+	lock_buffer(bh);
+	memcpy(bh->b_data+offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	if (journal_quota)
+		err = ext3_journal_dirty_metadata(handle, bh);
+	else {
+		/* Always do at least ordered writes for quotas */
+		err = ext3_journal_dirty_data(handle, bh);
+		mark_buffer_dirty(bh);
+	}
+	brelse(bh);
+out:
+	if (err)
+		return err;
+	if (inode->i_size < off + len) {
+		i_size_write(inode, off + len);
+		EXT3_I(inode)->i_disksize = inode->i_size;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	ext3_mark_inode_dirty(handle, inode);
+	return len;
+}
+
+#endif
+
+static struct dentry *ext3_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
+}
+
+static struct file_system_type ext3_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext3",
+	.mount		= ext3_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ext3");
+
+static int __init init_ext3_fs(void)
+{
+	int err = init_ext3_xattr();
+	if (err)
+		return err;
+	err = init_inodecache();
+	if (err)
+		goto out1;
+        err = register_filesystem(&ext3_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	exit_ext3_xattr();
+	return err;
+}
+
+static void __exit exit_ext3_fs(void)
+{
+	unregister_filesystem(&ext3_fs_type);
+	destroy_inodecache();
+	exit_ext3_xattr();
+}
+
+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+MODULE_LICENSE("GPL");
+module_init(init_ext3_fs)
+module_exit(exit_ext3_fs)
diff --git a/kernel/fs/ext3/symlink.c b/kernel/fs/ext3/symlink.c
new file mode 100644
index 000000000..ea96df3c5
--- /dev/null
+++ b/kernel/fs/ext3/symlink.c
@@ -0,0 +1,54 @@
+/*
+ *  linux/fs/ext3/symlink.c
+ *
+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/symlink.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext3 symlink handling code
+ */
+
+#include <linux/namei.h>
+#include "ext3.h"
+#include "xattr.h"
+
+static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ext3_inode_info *ei = EXT3_I(d_inode(dentry));
+	nd_set_link(nd, (char*)ei->i_data);
+	return NULL;
+}
+
+const struct inode_operations ext3_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= ext3_setattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext3_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
+
+const struct inode_operations ext3_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= ext3_follow_link,
+	.setattr	= ext3_setattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext3_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
diff --git a/kernel/fs/ext3/xattr.c b/kernel/fs/ext3/xattr.c
new file mode 100644
index 000000000..7cf36501c
--- /dev/null
+++ b/kernel/fs/ext3/xattr.c
@@ -0,0 +1,1330 @@
+/*
+ * linux/fs/ext3/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
+ * Extended attributes for symlinks and special files added per
+ *  suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ *  Red Hat Inc.
+ * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
+ *  and Andreas Gruenbacher <agruen@suse.de>.
+ */
+
+/*
+ * Extended attributes are stored directly in inodes (on file systems with
+ * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
+ * field contains the block number if an inode uses an additional block. All
+ * attributes must fit in the inode and one additional block. Blocks that
+ * contain the identical set of attributes may be shared among several inodes.
+ * Identical blocks are detected by keeping a cache of blocks that have
+ * recently been accessed.
+ *
+ * The attributes in inodes and on blocks have a different header; the entries
+ * are stored in the same format:
+ *
+ *   +------------------+
+ *   | header           |
+ *   | entry 1          | |
+ *   | entry 2          | | growing downwards
+ *   | entry 3          | v
+ *   | four null bytes  |
+ *   | . . .            |
+ *   | value 1          | ^
+ *   | value 3          | | growing upwards
+ *   | value 2          | |
+ *   +------------------+
+ *
+ * The header is followed by multiple entry descriptors. In disk blocks, the
+ * entry descriptors are kept sorted. In inodes, they are unsorted. The
+ * attribute values are aligned to the end of the block in no specific order.
+ *
+ * Locking strategy
+ * ----------------
+ * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
+ * EA blocks are only changed if they are exclusive to an inode, so
+ * holding xattr_sem also means that nothing but the EA block's reference
+ * count can change. Multiple writers to the same block are synchronized
+ * by the buffer lock.
+ */
+
+#include "ext3.h"
+#include <linux/mbcache.h>
+#include <linux/quotaops.h>
+#include "xattr.h"
+#include "acl.h"
+
+#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define IHDR(inode, raw_inode) \
+	((struct ext3_xattr_ibody_header *) \
+		((void *)raw_inode + \
+		 EXT3_GOOD_OLD_INODE_SIZE + \
+		 EXT3_I(inode)->i_extra_isize))
+#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1))
+
+#ifdef EXT3_XATTR_DEBUG
+# define ea_idebug(inode, f...) do { \
+		printk(KERN_DEBUG "inode %s:%lu: ", \
+			inode->i_sb->s_id, inode->i_ino); \
+		printk(f); \
+		printk("\n"); \
+	} while (0)
+# define ea_bdebug(bh, f...) do { \
+		char b[BDEVNAME_SIZE]; \
+		printk(KERN_DEBUG "block %s:%lu: ", \
+			bdevname(bh->b_bdev, b), \
+			(unsigned long) bh->b_blocknr); \
+		printk(f); \
+		printk("\n"); \
+	} while (0)
+#else
+# define ea_idebug(f...)
+# define ea_bdebug(f...)
+#endif
+
+static void ext3_xattr_cache_insert(struct buffer_head *);
+static struct buffer_head *ext3_xattr_cache_find(struct inode *,
+						 struct ext3_xattr_header *,
+						 struct mb_cache_entry **);
+static void ext3_xattr_rehash(struct ext3_xattr_header *,
+			      struct ext3_xattr_entry *);
+static int ext3_xattr_list(struct dentry *dentry, char *buffer,
+			   size_t buffer_size);
+
+static struct mb_cache *ext3_xattr_cache;
+
+static const struct xattr_handler *ext3_xattr_handler_map[] = {
+	[EXT3_XATTR_INDEX_USER]		     = &ext3_xattr_user_handler,
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
+	[EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
+#endif
+	[EXT3_XATTR_INDEX_TRUSTED]	     = &ext3_xattr_trusted_handler,
+#ifdef CONFIG_EXT3_FS_SECURITY
+	[EXT3_XATTR_INDEX_SECURITY]	     = &ext3_xattr_security_handler,
+#endif
+};
+
+const struct xattr_handler *ext3_xattr_handlers[] = {
+	&ext3_xattr_user_handler,
+	&ext3_xattr_trusted_handler,
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+#ifdef CONFIG_EXT3_FS_SECURITY
+	&ext3_xattr_security_handler,
+#endif
+	NULL
+};
+
+static inline const struct xattr_handler *
+ext3_xattr_handler(int name_index)
+{
+	const struct xattr_handler *handler = NULL;
+
+	if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
+		handler = ext3_xattr_handler_map[name_index];
+	return handler;
+}
+
+/*
+ * Inode operation listxattr()
+ *
+ * d_inode(dentry)->i_mutex: don't care
+ */
+ssize_t
+ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	return ext3_xattr_list(dentry, buffer, size);
+}
+
+static int
+ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end)
+{
+	while (!IS_LAST_ENTRY(entry)) {
+		struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry);
+		if ((void *)next >= end)
+			return -EIO;
+		entry = next;
+	}
+	return 0;
+}
+
+static inline int
+ext3_xattr_check_block(struct buffer_head *bh)
+{
+	int error;
+
+	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
+	    BHDR(bh)->h_blocks != cpu_to_le32(1))
+		return -EIO;
+	error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+	return error;
+}
+
+static inline int
+ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size)
+{
+	size_t value_size = le32_to_cpu(entry->e_value_size);
+
+	if (entry->e_value_block != 0 || value_size > size ||
+	    le16_to_cpu(entry->e_value_offs) + value_size > size)
+		return -EIO;
+	return 0;
+}
+
+static int
+ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
+		      const char *name, size_t size, int sorted)
+{
+	struct ext3_xattr_entry *entry;
+	size_t name_len;
+	int cmp = 1;
+
+	if (name == NULL)
+		return -EINVAL;
+	name_len = strlen(name);
+	entry = *pentry;
+	for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
+		cmp = name_index - entry->e_name_index;
+		if (!cmp)
+			cmp = name_len - entry->e_name_len;
+		if (!cmp)
+			cmp = memcmp(name, entry->e_name, name_len);
+		if (cmp <= 0 && (sorted || cmp == 0))
+			break;
+	}
+	*pentry = entry;
+	if (!cmp && ext3_xattr_check_entry(entry, size))
+			return -EIO;
+	return cmp ? -ENODATA : 0;
+}
+
+static int
+ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
+		     void *buffer, size_t buffer_size)
+{
+	struct buffer_head *bh = NULL;
+	struct ext3_xattr_entry *entry;
+	size_t size;
+	int error;
+
+	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
+		  name_index, name, buffer, (long)buffer_size);
+
+	error = -ENODATA;
+	if (!EXT3_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
+	if (!bh)
+		goto cleanup;
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+	if (ext3_xattr_check_block(bh)) {
+bad_block:	ext3_error(inode->i_sb, __func__,
+			   "inode %lu: bad block "E3FSBLK, inode->i_ino,
+			   EXT3_I(inode)->i_file_acl);
+		error = -EIO;
+		goto cleanup;
+	}
+	ext3_xattr_cache_insert(bh);
+	entry = BFIRST(bh);
+	error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
+	if (error == -EIO)
+		goto bad_block;
+	if (error)
+		goto cleanup;
+	size = le32_to_cpu(entry->e_value_size);
+	if (buffer) {
+		error = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
+		       size);
+	}
+	error = size;
+
+cleanup:
+	brelse(bh);
+	return error;
+}
+
+static int
+ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
+		     void *buffer, size_t buffer_size)
+{
+	struct ext3_xattr_ibody_header *header;
+	struct ext3_xattr_entry *entry;
+	struct ext3_inode *raw_inode;
+	struct ext3_iloc iloc;
+	size_t size;
+	void *end;
+	int error;
+
+	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
+		return -ENODATA;
+	error = ext3_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+	raw_inode = ext3_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+	end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
+	error = ext3_xattr_check_names(entry, end);
+	if (error)
+		goto cleanup;
+	error = ext3_xattr_find_entry(&entry, name_index, name,
+				      end - (void *)entry, 0);
+	if (error)
+		goto cleanup;
+	size = le32_to_cpu(entry->e_value_size);
+	if (buffer) {
+		error = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		memcpy(buffer, (void *)IFIRST(header) +
+		       le16_to_cpu(entry->e_value_offs), size);
+	}
+	error = size;
+
+cleanup:
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * ext3_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+int
+ext3_xattr_get(struct inode *inode, int name_index, const char *name,
+	       void *buffer, size_t buffer_size)
+{
+	int error;
+
+	down_read(&EXT3_I(inode)->xattr_sem);
+	error = ext3_xattr_ibody_get(inode, name_index, name, buffer,
+				     buffer_size);
+	if (error == -ENODATA)
+		error = ext3_xattr_block_get(inode, name_index, name, buffer,
+					     buffer_size);
+	up_read(&EXT3_I(inode)->xattr_sem);
+	return error;
+}
+
+static int
+ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
+			char *buffer, size_t buffer_size)
+{
+	size_t rest = buffer_size;
+
+	for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
+		const struct xattr_handler *handler =
+			ext3_xattr_handler(entry->e_name_index);
+
+		if (handler) {
+			size_t size = handler->list(dentry, buffer, rest,
+						    entry->e_name,
+						    entry->e_name_len,
+						    handler->flags);
+			if (buffer) {
+				if (size > rest)
+					return -ERANGE;
+				buffer += size;
+			}
+			rest -= size;
+		}
+	}
+	return buffer_size - rest;
+}
+
+static int
+ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct buffer_head *bh = NULL;
+	int error;
+
+	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
+		  buffer, (long)buffer_size);
+
+	error = 0;
+	if (!EXT3_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
+	error = -EIO;
+	if (!bh)
+		goto cleanup;
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+	if (ext3_xattr_check_block(bh)) {
+		ext3_error(inode->i_sb, __func__,
+			   "inode %lu: bad block "E3FSBLK, inode->i_ino,
+			   EXT3_I(inode)->i_file_acl);
+		error = -EIO;
+		goto cleanup;
+	}
+	ext3_xattr_cache_insert(bh);
+	error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
+
+cleanup:
+	brelse(bh);
+
+	return error;
+}
+
+static int
+ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ext3_xattr_ibody_header *header;
+	struct ext3_inode *raw_inode;
+	struct ext3_iloc iloc;
+	void *end;
+	int error;
+
+	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
+		return 0;
+	error = ext3_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+	raw_inode = ext3_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
+	error = ext3_xattr_check_names(IFIRST(header), end);
+	if (error)
+		goto cleanup;
+	error = ext3_xattr_list_entries(dentry, IFIRST(header),
+					buffer, buffer_size);
+
+cleanup:
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * ext3_xattr_list()
+ *
+ * Copy a list of attribute names into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+static int
+ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	int i_error, b_error;
+
+	down_read(&EXT3_I(d_inode(dentry))->xattr_sem);
+	i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
+	if (i_error < 0) {
+		b_error = 0;
+	} else {
+		if (buffer) {
+			buffer += i_error;
+			buffer_size -= i_error;
+		}
+		b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
+		if (b_error < 0)
+			i_error = 0;
+	}
+	up_read(&EXT3_I(d_inode(dentry))->xattr_sem);
+	return i_error + b_error;
+}
+
+/*
+ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
+ * not set, set it.
+ */
+static void ext3_xattr_update_super_block(handle_t *handle,
+					  struct super_block *sb)
+{
+	if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
+		return;
+
+	if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
+		EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
+		ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	}
+}
+
+/*
+ * Release the xattr block BH: If the reference count is > 1, decrement
+ * it; otherwise free the block.
+ */
+static void
+ext3_xattr_release_block(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh)
+{
+	struct mb_cache_entry *ce = NULL;
+	int error = 0;
+
+	ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
+	error = ext3_journal_get_write_access(handle, bh);
+	if (error)
+		 goto out;
+
+	lock_buffer(bh);
+
+	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+		ea_bdebug(bh, "refcount now=0; freeing");
+		if (ce)
+			mb_cache_entry_free(ce);
+		ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
+		get_bh(bh);
+		ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+	} else {
+		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+		error = ext3_journal_dirty_metadata(handle, bh);
+		if (IS_SYNC(inode))
+			handle->h_sync = 1;
+		dquot_free_block(inode, 1);
+		ea_bdebug(bh, "refcount now=%d; releasing",
+			  le32_to_cpu(BHDR(bh)->h_refcount));
+		if (ce)
+			mb_cache_entry_release(ce);
+	}
+	unlock_buffer(bh);
+out:
+	ext3_std_error(inode->i_sb, error);
+	return;
+}
+
+struct ext3_xattr_info {
+	int name_index;
+	const char *name;
+	const void *value;
+	size_t value_len;
+};
+
+struct ext3_xattr_search {
+	struct ext3_xattr_entry *first;
+	void *base;
+	void *end;
+	struct ext3_xattr_entry *here;
+	int not_found;
+};
+
+static int
+ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s)
+{
+	struct ext3_xattr_entry *last;
+	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+
+	/* Compute min_offs and last. */
+	last = s->first;
+	for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) {
+		if (!last->e_value_block && last->e_value_size) {
+			size_t offs = le16_to_cpu(last->e_value_offs);
+			if (offs < min_offs)
+				min_offs = offs;
+		}
+	}
+	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+	if (!s->not_found) {
+		if (!s->here->e_value_block && s->here->e_value_size) {
+			size_t size = le32_to_cpu(s->here->e_value_size);
+			free += EXT3_XATTR_SIZE(size);
+		}
+		free += EXT3_XATTR_LEN(name_len);
+	}
+	if (i->value) {
+		if (free < EXT3_XATTR_LEN(name_len) +
+			   EXT3_XATTR_SIZE(i->value_len))
+			return -ENOSPC;
+	}
+
+	if (i->value && s->not_found) {
+		/* Insert the new name. */
+		size_t size = EXT3_XATTR_LEN(name_len);
+		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
+		memmove((void *)s->here + size, s->here, rest);
+		memset(s->here, 0, size);
+		s->here->e_name_index = i->name_index;
+		s->here->e_name_len = name_len;
+		memcpy(s->here->e_name, i->name, name_len);
+	} else {
+		if (!s->here->e_value_block && s->here->e_value_size) {
+			void *first_val = s->base + min_offs;
+			size_t offs = le16_to_cpu(s->here->e_value_offs);
+			void *val = s->base + offs;
+			size_t size = EXT3_XATTR_SIZE(
+				le32_to_cpu(s->here->e_value_size));
+
+			if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) {
+				/* The old and the new value have the same
+				   size. Just replace. */
+				s->here->e_value_size =
+					cpu_to_le32(i->value_len);
+				memset(val + size - EXT3_XATTR_PAD, 0,
+				       EXT3_XATTR_PAD); /* Clear pad bytes. */
+				memcpy(val, i->value, i->value_len);
+				return 0;
+			}
+
+			/* Remove the old value. */
+			memmove(first_val + size, first_val, val - first_val);
+			memset(first_val, 0, size);
+			s->here->e_value_size = 0;
+			s->here->e_value_offs = 0;
+			min_offs += size;
+
+			/* Adjust all value offsets. */
+			last = s->first;
+			while (!IS_LAST_ENTRY(last)) {
+				size_t o = le16_to_cpu(last->e_value_offs);
+				if (!last->e_value_block &&
+				    last->e_value_size && o < offs)
+					last->e_value_offs =
+						cpu_to_le16(o + size);
+				last = EXT3_XATTR_NEXT(last);
+			}
+		}
+		if (!i->value) {
+			/* Remove the old name. */
+			size_t size = EXT3_XATTR_LEN(name_len);
+			last = ENTRY((void *)last - size);
+			memmove(s->here, (void *)s->here + size,
+				(void *)last - (void *)s->here + sizeof(__u32));
+			memset(last, 0, size);
+		}
+	}
+
+	if (i->value) {
+		/* Insert the new value. */
+		s->here->e_value_size = cpu_to_le32(i->value_len);
+		if (i->value_len) {
+			size_t size = EXT3_XATTR_SIZE(i->value_len);
+			void *val = s->base + min_offs - size;
+			s->here->e_value_offs = cpu_to_le16(min_offs - size);
+			memset(val + size - EXT3_XATTR_PAD, 0,
+			       EXT3_XATTR_PAD); /* Clear the pad bytes. */
+			memcpy(val, i->value, i->value_len);
+		}
+	}
+	return 0;
+}
+
+struct ext3_xattr_block_find {
+	struct ext3_xattr_search s;
+	struct buffer_head *bh;
+};
+
+static int
+ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
+		      struct ext3_xattr_block_find *bs)
+{
+	struct super_block *sb = inode->i_sb;
+	int error;
+
+	ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
+		  i->name_index, i->name, i->value, (long)i->value_len);
+
+	if (EXT3_I(inode)->i_file_acl) {
+		/* The inode already has an extended attribute block. */
+		bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
+		error = -EIO;
+		if (!bs->bh)
+			goto cleanup;
+		ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
+			atomic_read(&(bs->bh->b_count)),
+			le32_to_cpu(BHDR(bs->bh)->h_refcount));
+		if (ext3_xattr_check_block(bs->bh)) {
+			ext3_error(sb, __func__,
+				"inode %lu: bad block "E3FSBLK, inode->i_ino,
+				EXT3_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		/* Find the named attribute. */
+		bs->s.base = BHDR(bs->bh);
+		bs->s.first = BFIRST(bs->bh);
+		bs->s.end = bs->bh->b_data + bs->bh->b_size;
+		bs->s.here = bs->s.first;
+		error = ext3_xattr_find_entry(&bs->s.here, i->name_index,
+					      i->name, bs->bh->b_size, 1);
+		if (error && error != -ENODATA)
+			goto cleanup;
+		bs->s.not_found = error;
+	}
+	error = 0;
+
+cleanup:
+	return error;
+}
+
+static int
+ext3_xattr_block_set(handle_t *handle, struct inode *inode,
+		     struct ext3_xattr_info *i,
+		     struct ext3_xattr_block_find *bs)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *new_bh = NULL;
+	struct ext3_xattr_search *s = &bs->s;
+	struct mb_cache_entry *ce = NULL;
+	int error = 0;
+
+#define header(x) ((struct ext3_xattr_header *)(x))
+
+	if (i->value && i->value_len > sb->s_blocksize)
+		return -ENOSPC;
+	if (s->base) {
+		ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
+					bs->bh->b_blocknr);
+		error = ext3_journal_get_write_access(handle, bs->bh);
+		if (error)
+			goto cleanup;
+		lock_buffer(bs->bh);
+
+		if (header(s->base)->h_refcount == cpu_to_le32(1)) {
+			if (ce) {
+				mb_cache_entry_free(ce);
+				ce = NULL;
+			}
+			ea_bdebug(bs->bh, "modifying in-place");
+			error = ext3_xattr_set_entry(i, s);
+			if (!error) {
+				if (!IS_LAST_ENTRY(s->first))
+					ext3_xattr_rehash(header(s->base),
+							  s->here);
+				ext3_xattr_cache_insert(bs->bh);
+			}
+			unlock_buffer(bs->bh);
+			if (error == -EIO)
+				goto bad_block;
+			if (!error)
+				error = ext3_journal_dirty_metadata(handle,
+								    bs->bh);
+			if (error)
+				goto cleanup;
+			goto inserted;
+		} else {
+			int offset = (char *)s->here - bs->bh->b_data;
+
+			unlock_buffer(bs->bh);
+			journal_release_buffer(handle, bs->bh);
+
+			if (ce) {
+				mb_cache_entry_release(ce);
+				ce = NULL;
+			}
+			ea_bdebug(bs->bh, "cloning");
+			s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
+			error = -ENOMEM;
+			if (s->base == NULL)
+				goto cleanup;
+			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
+			s->first = ENTRY(header(s->base)+1);
+			header(s->base)->h_refcount = cpu_to_le32(1);
+			s->here = ENTRY(s->base + offset);
+			s->end = s->base + bs->bh->b_size;
+		}
+	} else {
+		/* Allocate a buffer where we construct the new block. */
+		s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
+		/* assert(header == s->base) */
+		error = -ENOMEM;
+		if (s->base == NULL)
+			goto cleanup;
+		header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
+		header(s->base)->h_blocks = cpu_to_le32(1);
+		header(s->base)->h_refcount = cpu_to_le32(1);
+		s->first = ENTRY(header(s->base)+1);
+		s->here = ENTRY(header(s->base)+1);
+		s->end = s->base + sb->s_blocksize;
+	}
+
+	error = ext3_xattr_set_entry(i, s);
+	if (error == -EIO)
+		goto bad_block;
+	if (error)
+		goto cleanup;
+	if (!IS_LAST_ENTRY(s->first))
+		ext3_xattr_rehash(header(s->base), s->here);
+
+inserted:
+	if (!IS_LAST_ENTRY(s->first)) {
+		new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce);
+		if (new_bh) {
+			/* We found an identical block in the cache. */
+			if (new_bh == bs->bh)
+				ea_bdebug(new_bh, "keeping");
+			else {
+				/* The old block is released after updating
+				   the inode. */
+				error = dquot_alloc_block(inode, 1);
+				if (error)
+					goto cleanup;
+				error = ext3_journal_get_write_access(handle,
+								      new_bh);
+				if (error)
+					goto cleanup_dquot;
+				lock_buffer(new_bh);
+				le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
+				ea_bdebug(new_bh, "reusing; refcount now=%d",
+					le32_to_cpu(BHDR(new_bh)->h_refcount));
+				unlock_buffer(new_bh);
+				error = ext3_journal_dirty_metadata(handle,
+								    new_bh);
+				if (error)
+					goto cleanup_dquot;
+			}
+			mb_cache_entry_release(ce);
+			ce = NULL;
+		} else if (bs->bh && s->base == bs->bh->b_data) {
+			/* We were modifying this block in-place. */
+			ea_bdebug(bs->bh, "keeping this block");
+			new_bh = bs->bh;
+			get_bh(new_bh);
+		} else {
+			/* We need to allocate a new block */
+			ext3_fsblk_t goal = ext3_group_first_block_no(sb,
+						EXT3_I(inode)->i_block_group);
+			ext3_fsblk_t block;
+
+			/*
+			 * Protect us agaist concurrent allocations to the
+			 * same inode from ext3_..._writepage(). Reservation
+			 * code does not expect racing allocations.
+			 */
+			mutex_lock(&EXT3_I(inode)->truncate_mutex);
+			block = ext3_new_block(handle, inode, goal, &error);
+			mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+			if (error)
+				goto cleanup;
+			ea_idebug(inode, "creating block %d", block);
+
+			new_bh = sb_getblk(sb, block);
+			if (unlikely(!new_bh)) {
+getblk_failed:
+				ext3_free_blocks(handle, inode, block, 1);
+				error = -ENOMEM;
+				goto cleanup;
+			}
+			lock_buffer(new_bh);
+			error = ext3_journal_get_create_access(handle, new_bh);
+			if (error) {
+				unlock_buffer(new_bh);
+				goto getblk_failed;
+			}
+			memcpy(new_bh->b_data, s->base, new_bh->b_size);
+			set_buffer_uptodate(new_bh);
+			unlock_buffer(new_bh);
+			ext3_xattr_cache_insert(new_bh);
+			error = ext3_journal_dirty_metadata(handle, new_bh);
+			if (error)
+				goto cleanup;
+		}
+	}
+
+	/* Update the inode. */
+	EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
+
+	/* Drop the previous xattr block. */
+	if (bs->bh && bs->bh != new_bh)
+		ext3_xattr_release_block(handle, inode, bs->bh);
+	error = 0;
+
+cleanup:
+	if (ce)
+		mb_cache_entry_release(ce);
+	brelse(new_bh);
+	if (!(bs->bh && s->base == bs->bh->b_data))
+		kfree(s->base);
+
+	return error;
+
+cleanup_dquot:
+	dquot_free_block(inode, 1);
+	goto cleanup;
+
+bad_block:
+	ext3_error(inode->i_sb, __func__,
+		   "inode %lu: bad block "E3FSBLK, inode->i_ino,
+		   EXT3_I(inode)->i_file_acl);
+	goto cleanup;
+
+#undef header
+}
+
+struct ext3_xattr_ibody_find {
+	struct ext3_xattr_search s;
+	struct ext3_iloc iloc;
+};
+
+static int
+ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
+		      struct ext3_xattr_ibody_find *is)
+{
+	struct ext3_xattr_ibody_header *header;
+	struct ext3_inode *raw_inode;
+	int error;
+
+	if (EXT3_I(inode)->i_extra_isize == 0)
+		return 0;
+	raw_inode = ext3_raw_inode(&is->iloc);
+	header = IHDR(inode, raw_inode);
+	is->s.base = is->s.first = IFIRST(header);
+	is->s.here = is->s.first;
+	is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
+	if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
+		error = ext3_xattr_check_names(IFIRST(header), is->s.end);
+		if (error)
+			return error;
+		/* Find the named attribute. */
+		error = ext3_xattr_find_entry(&is->s.here, i->name_index,
+					      i->name, is->s.end -
+					      (void *)is->s.base, 0);
+		if (error && error != -ENODATA)
+			return error;
+		is->s.not_found = error;
+	}
+	return 0;
+}
+
+static int
+ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
+		     struct ext3_xattr_info *i,
+		     struct ext3_xattr_ibody_find *is)
+{
+	struct ext3_xattr_ibody_header *header;
+	struct ext3_xattr_search *s = &is->s;
+	int error;
+
+	if (EXT3_I(inode)->i_extra_isize == 0)
+		return -ENOSPC;
+	error = ext3_xattr_set_entry(i, s);
+	if (error)
+		return error;
+	header = IHDR(inode, ext3_raw_inode(&is->iloc));
+	if (!IS_LAST_ENTRY(s->first)) {
+		header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
+		ext3_set_inode_state(inode, EXT3_STATE_XATTR);
+	} else {
+		header->h_magic = cpu_to_le32(0);
+		ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
+	}
+	return 0;
+}
+
+/*
+ * ext3_xattr_set_handle()
+ *
+ * Create, replace or remove an extended attribute for this inode.  Value
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int
+ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+		      const char *name, const void *value, size_t value_len,
+		      int flags)
+{
+	struct ext3_xattr_info i = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+
+	};
+	struct ext3_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext3_xattr_block_find bs = {
+		.s = { .not_found = -ENODATA, },
+	};
+	int error;
+
+	if (!name)
+		return -EINVAL;
+	if (strlen(name) > 255)
+		return -ERANGE;
+	down_write(&EXT3_I(inode)->xattr_sem);
+	error = ext3_get_inode_loc(inode, &is.iloc);
+	if (error)
+		goto cleanup;
+
+	error = ext3_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto cleanup;
+
+	if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
+		struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
+		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
+		ext3_clear_inode_state(inode, EXT3_STATE_NEW);
+	}
+
+	error = ext3_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto cleanup;
+	if (is.s.not_found)
+		error = ext3_xattr_block_find(inode, &i, &bs);
+	if (error)
+		goto cleanup;
+	if (is.s.not_found && bs.s.not_found) {
+		error = -ENODATA;
+		if (flags & XATTR_REPLACE)
+			goto cleanup;
+		error = 0;
+		if (!value)
+			goto cleanup;
+	} else {
+		error = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto cleanup;
+	}
+	if (!value) {
+		if (!is.s.not_found)
+			error = ext3_xattr_ibody_set(handle, inode, &i, &is);
+		else if (!bs.s.not_found)
+			error = ext3_xattr_block_set(handle, inode, &i, &bs);
+	} else {
+		error = ext3_xattr_ibody_set(handle, inode, &i, &is);
+		if (!error && !bs.s.not_found) {
+			i.value = NULL;
+			error = ext3_xattr_block_set(handle, inode, &i, &bs);
+		} else if (error == -ENOSPC) {
+			if (EXT3_I(inode)->i_file_acl && !bs.s.base) {
+				error = ext3_xattr_block_find(inode, &i, &bs);
+				if (error)
+					goto cleanup;
+			}
+			error = ext3_xattr_block_set(handle, inode, &i, &bs);
+			if (error)
+				goto cleanup;
+			if (!is.s.not_found) {
+				i.value = NULL;
+				error = ext3_xattr_ibody_set(handle, inode, &i,
+							     &is);
+			}
+		}
+	}
+	if (!error) {
+		ext3_xattr_update_super_block(handle, inode->i_sb);
+		inode->i_ctime = CURRENT_TIME_SEC;
+		error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
+		/*
+		 * The bh is consumed by ext3_mark_iloc_dirty, even with
+		 * error != 0.
+		 */
+		is.iloc.bh = NULL;
+		if (IS_SYNC(inode))
+			handle->h_sync = 1;
+	}
+
+cleanup:
+	brelse(is.iloc.bh);
+	brelse(bs.bh);
+	up_write(&EXT3_I(inode)->xattr_sem);
+	return error;
+}
+
+/*
+ * ext3_xattr_set()
+ *
+ * Like ext3_xattr_set_handle, but start from an inode. This extended
+ * attribute modification is a filesystem transaction by itself.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int
+ext3_xattr_set(struct inode *inode, int name_index, const char *name,
+	       const void *value, size_t value_len, int flags)
+{
+	handle_t *handle;
+	int error, retries = 0;
+
+retry:
+	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle)) {
+		error = PTR_ERR(handle);
+	} else {
+		int error2;
+
+		error = ext3_xattr_set_handle(handle, inode, name_index, name,
+					      value, value_len, flags);
+		error2 = ext3_journal_stop(handle);
+		if (error == -ENOSPC &&
+		    ext3_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+		if (error == 0)
+			error = error2;
+	}
+
+	return error;
+}
+
+/*
+ * ext3_xattr_delete_inode()
+ *
+ * Free extended attribute resources associated with this inode. This
+ * is called immediately before an inode is freed. We have exclusive
+ * access to the inode.
+ */
+void
+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+	struct buffer_head *bh = NULL;
+
+	if (!EXT3_I(inode)->i_file_acl)
+		goto cleanup;
+	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
+	if (!bh) {
+		ext3_error(inode->i_sb, __func__,
+			"inode %lu: block "E3FSBLK" read error", inode->i_ino,
+			EXT3_I(inode)->i_file_acl);
+		goto cleanup;
+	}
+	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
+	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
+		ext3_error(inode->i_sb, __func__,
+			"inode %lu: bad block "E3FSBLK, inode->i_ino,
+			EXT3_I(inode)->i_file_acl);
+		goto cleanup;
+	}
+	ext3_xattr_release_block(handle, inode, bh);
+	EXT3_I(inode)->i_file_acl = 0;
+
+cleanup:
+	brelse(bh);
+}
+
+/*
+ * ext3_xattr_put_super()
+ *
+ * This is called when a file system is unmounted.
+ */
+void
+ext3_xattr_put_super(struct super_block *sb)
+{
+	mb_cache_shrink(sb->s_bdev);
+}
+
+/*
+ * ext3_xattr_cache_insert()
+ *
+ * Create a new entry in the extended attribute cache, and insert
+ * it unless such an entry is already in the cache.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+static void
+ext3_xattr_cache_insert(struct buffer_head *bh)
+{
+	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+	struct mb_cache_entry *ce;
+	int error;
+
+	ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS);
+	if (!ce) {
+		ea_bdebug(bh, "out of memory");
+		return;
+	}
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+	if (error) {
+		mb_cache_entry_free(ce);
+		if (error == -EBUSY) {
+			ea_bdebug(bh, "already in cache");
+			error = 0;
+		}
+	} else {
+		ea_bdebug(bh, "inserting [%x]", (int)hash);
+		mb_cache_entry_release(ce);
+	}
+}
+
+/*
+ * ext3_xattr_cmp()
+ *
+ * Compare two extended attribute blocks for equality.
+ *
+ * Returns 0 if the blocks are equal, 1 if they differ, and
+ * a negative error number on errors.
+ */
+static int
+ext3_xattr_cmp(struct ext3_xattr_header *header1,
+	       struct ext3_xattr_header *header2)
+{
+	struct ext3_xattr_entry *entry1, *entry2;
+
+	entry1 = ENTRY(header1+1);
+	entry2 = ENTRY(header2+1);
+	while (!IS_LAST_ENTRY(entry1)) {
+		if (IS_LAST_ENTRY(entry2))
+			return 1;
+		if (entry1->e_hash != entry2->e_hash ||
+		    entry1->e_name_index != entry2->e_name_index ||
+		    entry1->e_name_len != entry2->e_name_len ||
+		    entry1->e_value_size != entry2->e_value_size ||
+		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
+			return 1;
+		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
+			return -EIO;
+		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
+			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
+			   le32_to_cpu(entry1->e_value_size)))
+			return 1;
+
+		entry1 = EXT3_XATTR_NEXT(entry1);
+		entry2 = EXT3_XATTR_NEXT(entry2);
+	}
+	if (!IS_LAST_ENTRY(entry2))
+		return 1;
+	return 0;
+}
+
+/*
+ * ext3_xattr_cache_find()
+ *
+ * Find an identical extended attribute block.
+ *
+ * Returns a pointer to the block found, or NULL if such a block was
+ * not found or an error occurred.
+ */
+static struct buffer_head *
+ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
+		      struct mb_cache_entry **pce)
+{
+	__u32 hash = le32_to_cpu(header->h_hash);
+	struct mb_cache_entry *ce;
+
+	if (!header->h_hash)
+		return NULL;  /* never share */
+	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
+again:
+	ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
+	while (ce) {
+		struct buffer_head *bh;
+
+		if (IS_ERR(ce)) {
+			if (PTR_ERR(ce) == -EAGAIN)
+				goto again;
+			break;
+		}
+		bh = sb_bread(inode->i_sb, ce->e_block);
+		if (!bh) {
+			ext3_error(inode->i_sb, __func__,
+				"inode %lu: block %lu read error",
+				inode->i_ino, (unsigned long) ce->e_block);
+		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
+				EXT3_XATTR_REFCOUNT_MAX) {
+			ea_idebug(inode, "block %lu refcount %d>=%d",
+				  (unsigned long) ce->e_block,
+				  le32_to_cpu(BHDR(bh)->h_refcount),
+					  EXT3_XATTR_REFCOUNT_MAX);
+		} else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) {
+			*pce = ce;
+			return bh;
+		}
+		brelse(bh);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+	}
+	return NULL;
+}
+
+#define NAME_HASH_SHIFT 5
+#define VALUE_HASH_SHIFT 16
+
+/*
+ * ext3_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
+					 struct ext3_xattr_entry *entry)
+{
+	__u32 hash = 0;
+	char *name = entry->e_name;
+	int n;
+
+	for (n=0; n < entry->e_name_len; n++) {
+		hash = (hash << NAME_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
+		       *name++;
+	}
+
+	if (entry->e_value_block == 0 && entry->e_value_size != 0) {
+		__le32 *value = (__le32 *)((char *)header +
+			le16_to_cpu(entry->e_value_offs));
+		for (n = (le32_to_cpu(entry->e_value_size) +
+		     EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
+			hash = (hash << VALUE_HASH_SHIFT) ^
+			       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+			       le32_to_cpu(*value++);
+		}
+	}
+	entry->e_hash = cpu_to_le32(hash);
+}
+
+#undef NAME_HASH_SHIFT
+#undef VALUE_HASH_SHIFT
+
+#define BLOCK_HASH_SHIFT 16
+
+/*
+ * ext3_xattr_rehash()
+ *
+ * Re-compute the extended attribute hash value after an entry has changed.
+ */
+static void ext3_xattr_rehash(struct ext3_xattr_header *header,
+			      struct ext3_xattr_entry *entry)
+{
+	struct ext3_xattr_entry *here;
+	__u32 hash = 0;
+
+	ext3_xattr_hash_entry(header, entry);
+	here = ENTRY(header+1);
+	while (!IS_LAST_ENTRY(here)) {
+		if (!here->e_hash) {
+			/* Block is not shared if an entry's hash value == 0 */
+			hash = 0;
+			break;
+		}
+		hash = (hash << BLOCK_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
+		       le32_to_cpu(here->e_hash);
+		here = EXT3_XATTR_NEXT(here);
+	}
+	header->h_hash = cpu_to_le32(hash);
+}
+
+#undef BLOCK_HASH_SHIFT
+
+int __init
+init_ext3_xattr(void)
+{
+	ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
+	if (!ext3_xattr_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+exit_ext3_xattr(void)
+{
+	if (ext3_xattr_cache)
+		mb_cache_destroy(ext3_xattr_cache);
+	ext3_xattr_cache = NULL;
+}
diff --git a/kernel/fs/ext3/xattr.h b/kernel/fs/ext3/xattr.h
new file mode 100644
index 000000000..32e93ebf8
--- /dev/null
+++ b/kernel/fs/ext3/xattr.h
@@ -0,0 +1,136 @@
+/*
+  File: fs/ext3/xattr.h
+
+  On-disk format of extended attributes for the ext3 filesystem.
+
+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define EXT3_XATTR_MAGIC		0xEA020000
+
+/* Maximum number of references to one attribute block */
+#define EXT3_XATTR_REFCOUNT_MAX		1024
+
+/* Name indexes */
+#define EXT3_XATTR_INDEX_USER			1
+#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS	2
+#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT	3
+#define EXT3_XATTR_INDEX_TRUSTED		4
+#define	EXT3_XATTR_INDEX_LUSTRE			5
+#define EXT3_XATTR_INDEX_SECURITY	        6
+
+struct ext3_xattr_header {
+	__le32	h_magic;	/* magic number for identification */
+	__le32	h_refcount;	/* reference count */
+	__le32	h_blocks;	/* number of disk blocks used */
+	__le32	h_hash;		/* hash value of all attributes */
+	__u32	h_reserved[4];	/* zero right now */
+};
+
+struct ext3_xattr_ibody_header {
+	__le32	h_magic;	/* magic number for identification */
+};
+
+struct ext3_xattr_entry {
+	__u8	e_name_len;	/* length of name */
+	__u8	e_name_index;	/* attribute name index */
+	__le16	e_value_offs;	/* offset in disk block of value */
+	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
+	__le32	e_value_size;	/* size of attribute value */
+	__le32	e_hash;		/* hash value of name and value */
+	char	e_name[0];	/* attribute name */
+};
+
+#define EXT3_XATTR_PAD_BITS		2
+#define EXT3_XATTR_PAD		(1<<EXT3_XATTR_PAD_BITS)
+#define EXT3_XATTR_ROUND		(EXT3_XATTR_PAD-1)
+#define EXT3_XATTR_LEN(name_len) \
+	(((name_len) + EXT3_XATTR_ROUND + \
+	sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
+#define EXT3_XATTR_NEXT(entry) \
+	( (struct ext3_xattr_entry *)( \
+	  (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
+#define EXT3_XATTR_SIZE(size) \
+	(((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
+
+# ifdef CONFIG_EXT3_FS_XATTR
+
+extern const struct xattr_handler ext3_xattr_user_handler;
+extern const struct xattr_handler ext3_xattr_trusted_handler;
+extern const struct xattr_handler ext3_xattr_security_handler;
+
+extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
+
+extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
+extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+
+extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
+extern void ext3_xattr_put_super(struct super_block *);
+
+extern int init_ext3_xattr(void);
+extern void exit_ext3_xattr(void);
+
+extern const struct xattr_handler *ext3_xattr_handlers[];
+
+# else  /* CONFIG_EXT3_FS_XATTR */
+
+static inline int
+ext3_xattr_get(struct inode *inode, int name_index, const char *name,
+	       void *buffer, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int
+ext3_xattr_set(struct inode *inode, int name_index, const char *name,
+	       const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int
+ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+	       const char *name, const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void
+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+}
+
+static inline void
+ext3_xattr_put_super(struct super_block *sb)
+{
+}
+
+static inline int
+init_ext3_xattr(void)
+{
+	return 0;
+}
+
+static inline void
+exit_ext3_xattr(void)
+{
+}
+
+#define ext3_xattr_handlers	NULL
+
+# endif  /* CONFIG_EXT3_FS_XATTR */
+
+#ifdef CONFIG_EXT3_FS_SECURITY
+extern int ext3_init_security(handle_t *handle, struct inode *inode,
+			      struct inode *dir, const struct qstr *qstr);
+#else
+static inline int ext3_init_security(handle_t *handle, struct inode *inode,
+				     struct inode *dir, const struct qstr *qstr)
+{
+	return 0;
+}
+#endif
diff --git a/kernel/fs/ext3/xattr_security.c b/kernel/fs/ext3/xattr_security.c
new file mode 100644
index 000000000..c9506d5e3
--- /dev/null
+++ b/kernel/fs/ext3/xattr_security.c
@@ -0,0 +1,78 @@
+/*
+ * linux/fs/ext3/xattr_security.c
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include <linux/security.h>
+#include "ext3.h"
+#include "xattr.h"
+
+static size_t
+ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
+			 const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext3_xattr_security_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
+			      name, buffer, size);
+}
+
+static int
+ext3_xattr_security_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
+			      name, value, size, flags);
+}
+
+static int ext3_initxattrs(struct inode *inode,
+			   const struct xattr *xattr_array,
+			   void *fs_info)
+{
+	const struct xattr *xattr;
+	handle_t *handle = fs_info;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ext3_xattr_set_handle(handle, inode,
+					    EXT3_XATTR_INDEX_SECURITY,
+					    xattr->name, xattr->value,
+					    xattr->value_len, 0);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &ext3_initxattrs, handle);
+}
+
+const struct xattr_handler ext3_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= ext3_xattr_security_list,
+	.get	= ext3_xattr_security_get,
+	.set	= ext3_xattr_security_set,
+};
diff --git a/kernel/fs/ext3/xattr_trusted.c b/kernel/fs/ext3/xattr_trusted.c
new file mode 100644
index 000000000..206cc66dc
--- /dev/null
+++ b/kernel/fs/ext3/xattr_trusted.c
@@ -0,0 +1,54 @@
+/*
+ * linux/fs/ext3/xattr_trusted.c
+ * Handler for trusted extended attributes.
+ *
+ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include "ext3.h"
+#include "xattr.h"
+
+static size_t
+ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
+		       void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED,
+			      name, buffer, size);
+}
+
+static int
+ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name,
+			      value, size, flags);
+}
+
+const struct xattr_handler ext3_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ext3_xattr_trusted_list,
+	.get	= ext3_xattr_trusted_get,
+	.set	= ext3_xattr_trusted_set,
+};
diff --git a/kernel/fs/ext3/xattr_user.c b/kernel/fs/ext3/xattr_user.c
new file mode 100644
index 000000000..021508ad1
--- /dev/null
+++ b/kernel/fs/ext3/xattr_user.c
@@ -0,0 +1,58 @@
+/*
+ * linux/fs/ext3/xattr_user.c
+ * Handler for extended user attributes.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include "ext3.h"
+#include "xattr.h"
+
+static size_t
+ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
+		size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+	return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER,
+			      name, buffer, size);
+}
+
+static int
+ext3_xattr_user_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+	return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER,
+			      name, value, size, flags);
+}
+
+const struct xattr_handler ext3_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ext3_xattr_user_list,
+	.get	= ext3_xattr_user_get,
+	.set	= ext3_xattr_user_set,
+};
diff --git a/kernel/fs/ext4/Kconfig b/kernel/fs/ext4/Kconfig
new file mode 100644
index 000000000..024f2284d
--- /dev/null
+++ b/kernel/fs/ext4/Kconfig
@@ -0,0 +1,97 @@
+config EXT4_FS
+	tristate "The Extended 4 (ext4) filesystem"
+	select JBD2
+	select CRC16
+	select CRYPTO
+	select CRYPTO_CRC32C
+	help
+	  This is the next generation of the ext3 filesystem.
+
+	  Unlike the change from ext2 filesystem to ext3 filesystem,
+	  the on-disk format of ext4 is not forwards compatible with
+	  ext3; it is based on extent maps and it supports 48-bit
+	  physical block numbers.  The ext4 filesystem also supports delayed
+	  allocation, persistent preallocation, high resolution time stamps,
+	  and a number of other features to improve performance and speed
+	  up fsck time.  For more information, please see the web pages at
+	  http://ext4.wiki.kernel.org.
+
+	  The ext4 filesystem will support mounting an ext3
+	  filesystem; while there will be some performance gains from
+	  the delayed allocation and inode table readahead, the best
+	  performance gains will require enabling ext4 features in the
+	  filesystem, or formatting a new filesystem as an ext4
+	  filesystem initially.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called ext4.
+
+	  If unsure, say N.
+
+config EXT4_USE_FOR_EXT23
+	bool "Use ext4 for ext2/ext3 file systems"
+	depends on EXT4_FS
+	depends on EXT3_FS=n || EXT2_FS=n
+	default y
+	help
+	  Allow the ext4 file system driver code to be used for ext2 or
+	  ext3 file system mounts.  This allows users to reduce their
+	  compiled kernel size by using one file system driver for
+	  ext2, ext3, and ext4 file systems.
+
+config EXT4_FS_POSIX_ACL
+	bool "Ext4 POSIX Access Control Lists"
+	depends on EXT4_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT4_FS_SECURITY
+	bool "Ext4 Security Labels"
+	depends on EXT4_FS
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext4 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config EXT4_ENCRYPTION
+	tristate "Ext4 Encryption"
+	depends on EXT4_FS
+	select CRYPTO_AES
+	select CRYPTO_CBC
+	select CRYPTO_ECB
+	select CRYPTO_XTS
+	select CRYPTO_CTS
+	select CRYPTO_SHA256
+	select KEYS
+	select ENCRYPTED_KEYS
+	help
+	  Enable encryption of ext4 files and directories.  This
+	  feature is similar to ecryptfs, but it is more memory
+	  efficient since it avoids caching the encrypted and
+	  decrypted pages in the page cache.
+
+config EXT4_FS_ENCRYPTION
+	bool
+	default y
+	depends on EXT4_ENCRYPTION
+
+config EXT4_DEBUG
+	bool "EXT4 debugging support"
+	depends on EXT4_FS
+	help
+	  Enables run-time debugging support for the ext4 filesystem.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with a command such as:
+		echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/kernel/fs/ext4/Makefile b/kernel/fs/ext4/Makefile
new file mode 100644
index 000000000..75285ea9a
--- /dev/null
+++ b/kernel/fs/ext4/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for the linux ext4-filesystem routines.
+#
+
+obj-$(CONFIG_EXT4_FS) += ext4.o
+
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
+		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+		xattr_trusted.o inline.o readpage.o
+
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
+ext4-$(CONFIG_EXT4_FS_ENCRYPTION)	+= crypto_policy.o crypto.o \
+		crypto_key.o crypto_fname.o
diff --git a/kernel/fs/ext4/acl.c b/kernel/fs/ext4/acl.c
new file mode 100644
index 000000000..69b1e7302
--- /dev/null
+++ b/kernel/fs/ext4/acl.c
@@ -0,0 +1,283 @@
+/*
+ * linux/fs/ext4/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ */
+
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *
+ext4_acl_from_disk(const void *value, size_t size)
+{
+	const char *end = (char *)value + size;
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(ext4_acl_header))
+		 return ERR_PTR(-EINVAL);
+	if (((ext4_acl_header *)value)->a_version !=
+	    cpu_to_le32(EXT4_ACL_VERSION))
+		return ERR_PTR(-EINVAL);
+	value = (char *)value + sizeof(ext4_acl_header);
+	count = ext4_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		ext4_acl_entry *entry =
+			(ext4_acl_entry *)value;
+		if ((char *)value + sizeof(ext4_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+
+		switch (acl->a_entries[n].e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			value = (char *)value +
+				sizeof(ext4_acl_entry_short);
+			break;
+
+		case ACL_USER:
+			value = (char *)value + sizeof(ext4_acl_entry);
+			if ((char *)value > end)
+				goto fail;
+			acl->a_entries[n].e_uid =
+				make_kuid(&init_user_ns,
+					  le32_to_cpu(entry->e_id));
+			break;
+		case ACL_GROUP:
+			value = (char *)value + sizeof(ext4_acl_entry);
+			if ((char *)value > end)
+				goto fail;
+			acl->a_entries[n].e_gid =
+				make_kgid(&init_user_ns,
+					  le32_to_cpu(entry->e_id));
+			break;
+
+		default:
+			goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *
+ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+	ext4_acl_header *ext_acl;
+	char *e;
+	size_t n;
+
+	*size = ext4_acl_size(acl->a_count);
+	ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
+			sizeof(ext4_acl_entry), GFP_NOFS);
+	if (!ext_acl)
+		return ERR_PTR(-ENOMEM);
+	ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
+	e = (char *)ext_acl + sizeof(ext4_acl_header);
+	for (n = 0; n < acl->a_count; n++) {
+		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+		ext4_acl_entry *entry = (ext4_acl_entry *)e;
+		entry->e_tag  = cpu_to_le16(acl_e->e_tag);
+		entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+				from_kuid(&init_user_ns, acl_e->e_uid));
+			e += sizeof(ext4_acl_entry);
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+				from_kgid(&init_user_ns, acl_e->e_gid));
+			e += sizeof(ext4_acl_entry);
+			break;
+
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			e += sizeof(ext4_acl_entry_short);
+			break;
+
+		default:
+			goto fail;
+		}
+	}
+	return (char *)ext_acl;
+
+fail:
+	kfree(ext_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Inode operation get_posix_acl().
+ *
+ * inode->i_mutex: don't care
+ */
+struct posix_acl *
+ext4_get_acl(struct inode *inode, int type)
+{
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ext4_xattr_get(inode, name_index, "", value, retval);
+	}
+	if (retval > 0)
+		acl = ext4_acl_from_disk(value, retval);
+	else if (retval == -ENODATA || retval == -ENOSYS)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * inode->i_mutex: down unless called from ext4_new_inode
+ */
+static int
+__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+	     struct posix_acl *acl)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int error;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			error = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (error < 0)
+				return error;
+			else {
+				inode->i_ctime = ext4_current_time(inode);
+				ext4_mark_inode_dirty(handle, inode);
+				if (error == 0)
+					acl = NULL;
+			}
+		}
+		break;
+
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (acl) {
+		value = ext4_acl_to_disk(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	error = ext4_xattr_set_handle(handle, inode, name_index, "",
+				      value, size, 0);
+
+	kfree(value);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
+	return error;
+}
+
+int
+ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	handle_t *handle;
+	int error, retries = 0;
+
+retry:
+	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+				    ext4_jbd2_credits_xattr(inode));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	error = __ext4_set_acl(handle, inode, type, acl);
+	ext4_journal_stop(handle);
+	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+	return error;
+}
+
+/*
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
+ */
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *default_acl, *acl;
+	int error;
+
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (error)
+		return error;
+
+	if (default_acl) {
+		error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+				       default_acl);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
+					       acl);
+		posix_acl_release(acl);
+	}
+	return error;
+}
diff --git a/kernel/fs/ext4/acl.h b/kernel/fs/ext4/acl.h
new file mode 100644
index 000000000..da2c79577
--- /dev/null
+++ b/kernel/fs/ext4/acl.h
@@ -0,0 +1,72 @@
+/*
+  File: fs/ext4/acl.h
+
+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define EXT4_ACL_VERSION	0x0001
+
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} ext4_acl_entry;
+
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+} ext4_acl_entry_short;
+
+typedef struct {
+	__le32		a_version;
+} ext4_acl_header;
+
+static inline size_t ext4_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(ext4_acl_header) +
+		       count * sizeof(ext4_acl_entry_short);
+	} else {
+		return sizeof(ext4_acl_header) +
+		       4 * sizeof(ext4_acl_entry_short) +
+		       (count - 4) * sizeof(ext4_acl_entry);
+	}
+}
+
+static inline int ext4_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(ext4_acl_header);
+	s = size - 4 * sizeof(ext4_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(ext4_acl_entry_short))
+			return -1;
+		return size / sizeof(ext4_acl_entry_short);
+	} else {
+		if (s % sizeof(ext4_acl_entry))
+			return -1;
+		return s / sizeof(ext4_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+
+/* acl.c */
+struct posix_acl *ext4_get_acl(struct inode *inode, int type);
+int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
+
+#else  /* CONFIG_EXT4_FS_POSIX_ACL */
+#include <linux/sched.h>
+#define ext4_get_acl NULL
+#define ext4_set_acl NULL
+
+static inline int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif  /* CONFIG_EXT4_FS_POSIX_ACL */
+
diff --git a/kernel/fs/ext4/balloc.c b/kernel/fs/ext4/balloc.c
new file mode 100644
index 000000000..955bf49a7
--- /dev/null
+++ b/kernel/fs/ext4/balloc.c
@@ -0,0 +1,880 @@
+/*
+ *  linux/fs/ext4/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "mballoc.h"
+
+#include <trace/events/ext4.h>
+
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+					    ext4_group_t block_group);
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+				   ext4_fsblk_t block)
+{
+	ext4_group_t group;
+
+	if (test_opt2(sb, STD_GROUP_SIZE))
+		group = (block -
+			 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
+			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+	else
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+	return group;
+}
+
+/*
+ * Calculate the block group number and offset into the block/cluster
+ * allocation bitmap, given a block number
+ */
+void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+		ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	ext4_grpblk_t offset;
+
+	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
+		EXT4_SB(sb)->s_cluster_bits;
+	if (offsetp)
+		*offsetp = offset;
+	if (blockgrpp)
+		*blockgrpp = blocknr;
+
+}
+
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+				      ext4_fsblk_t block,
+				      ext4_group_t block_group)
+{
+	ext4_group_t actual_group;
+
+	actual_group = ext4_get_group_number(sb, block);
+	return (actual_group == block_group) ? 1 : 0;
+}
+
+/* Return the number of clusters used for file system metadata; this
+ * represents the overhead needed by the file system.
+ */
+static unsigned ext4_num_overhead_clusters(struct super_block *sb,
+					   ext4_group_t block_group,
+					   struct ext4_group_desc *gdp)
+{
+	unsigned num_clusters;
+	int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+	ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
+	ext4_fsblk_t itbl_blk;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* This is the number of clusters used by the superblock,
+	 * block group descriptors, and reserved block group
+	 * descriptor blocks */
+	num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+
+	/*
+	 * For the allocation bitmaps and inode table, we first need
+	 * to check to see if the block is in the block group.  If it
+	 * is, then check to see if the cluster is already accounted
+	 * for in the clusters used for the base metadata cluster, or
+	 * if we can increment the base metadata cluster to include
+	 * that block.  Otherwise, we will have to track the cluster
+	 * used for the allocation bitmap or inode table explicitly.
+	 * Normally all of these blocks are contiguous, so the special
+	 * case handling shouldn't be necessary except for *very*
+	 * unusual file system layouts.
+	 */
+	if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
+		block_cluster = EXT4_B2C(sbi,
+					 ext4_block_bitmap(sb, gdp) - start);
+		if (block_cluster < num_clusters)
+			block_cluster = -1;
+		else if (block_cluster == num_clusters) {
+			num_clusters++;
+			block_cluster = -1;
+		}
+	}
+
+	if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
+		inode_cluster = EXT4_B2C(sbi,
+					 ext4_inode_bitmap(sb, gdp) - start);
+		if (inode_cluster < num_clusters)
+			inode_cluster = -1;
+		else if (inode_cluster == num_clusters) {
+			num_clusters++;
+			inode_cluster = -1;
+		}
+	}
+
+	itbl_blk = ext4_inode_table(sb, gdp);
+	for (i = 0; i < sbi->s_itb_per_group; i++) {
+		if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
+			c = EXT4_B2C(sbi, itbl_blk + i - start);
+			if ((c < num_clusters) || (c == inode_cluster) ||
+			    (c == block_cluster) || (c == itbl_cluster))
+				continue;
+			if (c == num_clusters) {
+				num_clusters++;
+				continue;
+			}
+			num_clusters++;
+			itbl_cluster = c;
+		}
+	}
+
+	if (block_cluster != -1)
+		num_clusters++;
+	if (inode_cluster != -1)
+		num_clusters++;
+
+	return num_clusters;
+}
+
+static unsigned int num_clusters_in_group(struct super_block *sb,
+					  ext4_group_t block_group)
+{
+	unsigned int blocks;
+
+	if (block_group == ext4_get_groups_count(sb) - 1) {
+		/*
+		 * Even though mke2fs always initializes the first and
+		 * last group, just in case some other tool was used,
+		 * we need to make sure we calculate the right free
+		 * blocks.
+		 */
+		blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+			ext4_group_first_block_no(sb, block_group);
+	} else
+		blocks = EXT4_BLOCKS_PER_GROUP(sb);
+	return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+}
+
+/* Initializes an uninitialized block bitmap */
+static int ext4_init_block_bitmap(struct super_block *sb,
+				   struct buffer_head *bh,
+				   ext4_group_t block_group,
+				   struct ext4_group_desc *gdp)
+{
+	unsigned int bit, bit_max;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t start, tmp;
+	int flex_bg = 0;
+	struct ext4_group_info *grp;
+
+	J_ASSERT_BH(bh, buffer_locked(bh));
+
+	/* If checksum is bad mark all blocks used to prevent allocation
+	 * essentially implementing a per-group read-only flag. */
+	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
+		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, gdp);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
+		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+		return -EIO;
+	}
+	memset(bh->b_data, 0, sb->s_blocksize);
+
+	bit_max = ext4_num_base_meta_clusters(sb, block_group);
+	for (bit = 0; bit < bit_max; bit++)
+		ext4_set_bit(bit, bh->b_data);
+
+	start = ext4_group_first_block_no(sb, block_group);
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+		flex_bg = 1;
+
+	/* Set bits for block and inode bitmaps, and inode table */
+	tmp = ext4_block_bitmap(sb, gdp);
+	if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+		ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+
+	tmp = ext4_inode_bitmap(sb, gdp);
+	if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+		ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+
+	tmp = ext4_inode_table(sb, gdp);
+	for (; tmp < ext4_inode_table(sb, gdp) +
+		     sbi->s_itb_per_group; tmp++) {
+		if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+			ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+	}
+
+	/*
+	 * Also if the number of blocks within the group is less than
+	 * the blocksize * 8 ( which is the size of bitmap ), set rest
+	 * of the block bitmap to 1
+	 */
+	ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
+			     sb->s_blocksize * 8, bh->b_data);
+	ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
+	ext4_group_desc_csum_set(sb, block_group, gdp);
+	return 0;
+}
+
+/* Return the number of free blocks in a block group.  It is used when
+ * the block bitmap is uninitialized, so we can't just count the bits
+ * in the bitmap. */
+unsigned ext4_free_clusters_after_init(struct super_block *sb,
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
+{
+	return num_clusters_in_group(sb, block_group) - 
+		ext4_num_overhead_clusters(sb, block_group, gdp);
+}
+
+/*
+ * The free blocks are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.  The descriptors are loaded in memory
+ * when a file system is mounted (see ext4_fill_super).
+ */
+
+/**
+ * ext4_get_group_desc() -- load group descriptor from disk
+ * @sb:			super block
+ * @block_group:	given block group
+ * @bh:			pointer to the buffer head to store the block
+ *			group descriptor
+ */
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
+					     ext4_group_t block_group,
+					     struct buffer_head **bh)
+{
+	unsigned int group_desc;
+	unsigned int offset;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_group_desc *desc;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (block_group >= ngroups) {
+		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+			   " groups_count = %u", block_group, ngroups);
+
+		return NULL;
+	}
+
+	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+	if (!sbi->s_group_desc[group_desc]) {
+		ext4_error(sb, "Group descriptor not loaded - "
+			   "block_group = %u, group_desc = %u, desc = %u",
+			   block_group, group_desc, offset);
+		return NULL;
+	}
+
+	desc = (struct ext4_group_desc *)(
+		(__u8 *)sbi->s_group_desc[group_desc]->b_data +
+		offset * EXT4_DESC_SIZE(sb));
+	if (bh)
+		*bh = sbi->s_group_desc[group_desc];
+	return desc;
+}
+
+/*
+ * Return the block number which was discovered to be invalid, or 0 if
+ * the block bitmap is valid.
+ */
+static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
+					    struct ext4_group_desc *desc,
+					    ext4_group_t block_group,
+					    struct buffer_head *bh)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_grpblk_t offset;
+	ext4_grpblk_t next_zero_bit;
+	ext4_fsblk_t blk;
+	ext4_fsblk_t group_first_block;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		/* with FLEX_BG, the inode/block bitmaps and itable
+		 * blocks may not be in the group at all
+		 * so the bitmap validation will be skipped for those groups
+		 * or it has to also read the block group where the bitmaps
+		 * are located to verify they are set.
+		 */
+		return 0;
+	}
+	group_first_block = ext4_group_first_block_no(sb, block_group);
+
+	/* check whether block bitmap block number is set */
+	blk = ext4_block_bitmap(sb, desc);
+	offset = blk - group_first_block;
+	if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
+		/* bad block bitmap */
+		return blk;
+
+	/* check whether the inode bitmap block number is set */
+	blk = ext4_inode_bitmap(sb, desc);
+	offset = blk - group_first_block;
+	if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
+		/* bad block bitmap */
+		return blk;
+
+	/* check whether the inode table block number is set */
+	blk = ext4_inode_table(sb, desc);
+	offset = blk - group_first_block;
+	next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
+			EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group),
+			EXT4_B2C(sbi, offset));
+	if (next_zero_bit <
+	    EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group))
+		/* bad bitmap for inode tables */
+		return blk;
+	return 0;
+}
+
+static void ext4_validate_block_bitmap(struct super_block *sb,
+				       struct ext4_group_desc *desc,
+				       ext4_group_t block_group,
+				       struct buffer_head *bh)
+{
+	ext4_fsblk_t	blk;
+	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (buffer_verified(bh))
+		return;
+
+	ext4_lock_group(sb, block_group);
+	blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+	if (unlikely(blk != 0)) {
+		ext4_unlock_group(sb, block_group);
+		ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+			   block_group, blk);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
+		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+		return;
+	}
+	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+			desc, bh))) {
+		ext4_unlock_group(sb, block_group);
+		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
+		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+		return;
+	}
+	set_buffer_verified(bh);
+	ext4_unlock_group(sb, block_group);
+}
+
+/**
+ * ext4_read_block_bitmap_nowait()
+ * @sb:			super block
+ * @block_group:	given block group
+ *
+ * Read the bitmap for a given block_group,and validate the
+ * bits for block/inode/inode tables are set in the bitmaps
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+struct buffer_head *
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+{
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh;
+	ext4_fsblk_t bitmap_blk;
+
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return NULL;
+	bitmap_blk = ext4_block_bitmap(sb, desc);
+	bh = sb_getblk(sb, bitmap_blk);
+	if (unlikely(!bh)) {
+		ext4_error(sb, "Cannot get buffer for block bitmap - "
+			   "block_group = %u, block_bitmap = %llu",
+			   block_group, bitmap_blk);
+		return NULL;
+	}
+
+	if (bitmap_uptodate(bh))
+		goto verify;
+
+	lock_buffer(bh);
+	if (bitmap_uptodate(bh)) {
+		unlock_buffer(bh);
+		goto verify;
+	}
+	ext4_lock_group(sb, block_group);
+	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		int err;
+
+		err = ext4_init_block_bitmap(sb, bh, block_group, desc);
+		set_bitmap_uptodate(bh);
+		set_buffer_uptodate(bh);
+		ext4_unlock_group(sb, block_group);
+		unlock_buffer(bh);
+		if (err)
+			ext4_error(sb, "Checksum bad for grp %u", block_group);
+		return bh;
+	}
+	ext4_unlock_group(sb, block_group);
+	if (buffer_uptodate(bh)) {
+		/*
+		 * if not uninit if bh is uptodate,
+		 * bitmap is also uptodate
+		 */
+		set_bitmap_uptodate(bh);
+		unlock_buffer(bh);
+		goto verify;
+	}
+	/*
+	 * submit the buffer_head for reading
+	 */
+	set_buffer_new(bh);
+	trace_ext4_read_block_bitmap_load(sb, block_group);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
+	return bh;
+verify:
+	ext4_validate_block_bitmap(sb, desc, block_group, bh);
+	if (buffer_verified(bh))
+		return bh;
+	put_bh(bh);
+	return NULL;
+}
+
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+			   struct buffer_head *bh)
+{
+	struct ext4_group_desc *desc;
+
+	if (!buffer_new(bh))
+		return 0;
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return 1;
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
+		ext4_error(sb, "Cannot read block bitmap - "
+			   "block_group = %u, block_bitmap = %llu",
+			   block_group, (unsigned long long) bh->b_blocknr);
+		return 1;
+	}
+	clear_buffer_new(bh);
+	/* Panic or remount fs read-only if block bitmap is invalid */
+	ext4_validate_block_bitmap(sb, desc, block_group, bh);
+	/* ...but check for error just in case errors=continue. */
+	return !buffer_verified(bh);
+}
+
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	if (!bh)
+		return NULL;
+	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+		put_bh(bh);
+		return NULL;
+	}
+	return bh;
+}
+
+/**
+ * ext4_has_free_clusters()
+ * @sbi:	in-core super block structure.
+ * @nclusters:	number of needed blocks
+ * @flags:	flags from ext4_mb_new_blocks()
+ *
+ * Check if filesystem has nclusters free & available for allocation.
+ * On success return 1, return 0 on failure.
+ */
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+				  s64 nclusters, unsigned int flags)
+{
+	s64 free_clusters, dirty_clusters, rsv, resv_clusters;
+	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
+
+	free_clusters  = percpu_counter_read_positive(fcc);
+	dirty_clusters = percpu_counter_read_positive(dcc);
+	resv_clusters = atomic64_read(&sbi->s_resv_clusters);
+
+	/*
+	 * r_blocks_count should always be multiple of the cluster ratio so
+	 * we are safe to do a plane bit shift only.
+	 */
+	rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+	      resv_clusters;
+
+	if (free_clusters - (nclusters + rsv + dirty_clusters) <
+					EXT4_FREECLUSTERS_WATERMARK) {
+		free_clusters  = percpu_counter_sum_positive(fcc);
+		dirty_clusters = percpu_counter_sum_positive(dcc);
+	}
+	/* Check whether we have space after accounting for current
+	 * dirty clusters & root reserved clusters.
+	 */
+	if (free_clusters >= (rsv + nclusters + dirty_clusters))
+		return 1;
+
+	/* Hm, nope.  Are (enough) root reserved clusters available? */
+	if (uid_eq(sbi->s_resuid, current_fsuid()) ||
+	    (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
+	    capable(CAP_SYS_RESOURCE) ||
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
+		if (free_clusters >= (nclusters + dirty_clusters +
+				      resv_clusters))
+			return 1;
+	}
+	/* No free blocks. Let's see if we can dip into reserved pool */
+	if (flags & EXT4_MB_USE_RESERVED) {
+		if (free_clusters >= (nclusters + dirty_clusters))
+			return 1;
+	}
+
+	return 0;
+}
+
+int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+			     s64 nclusters, unsigned int flags)
+{
+	if (ext4_has_free_clusters(sbi, nclusters, flags)) {
+		percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
+		return 0;
+	} else
+		return -ENOSPC;
+}
+
+/**
+ * ext4_should_retry_alloc()
+ * @sb:			super block
+ * @retries		number of attemps has been made
+ *
+ * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
+ * it is profitable to retry the operation, this function will wait
+ * for the current or committing transaction to complete, and then
+ * return TRUE.
+ *
+ * if the total number of retries exceed three times, return FALSE.
+ */
+int ext4_should_retry_alloc(struct super_block *sb, int *retries)
+{
+	if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
+	    (*retries)++ > 3 ||
+	    !EXT4_SB(sb)->s_journal)
+		return 0;
+
+	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+
+	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+}
+
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block(filesystem wide)
+ * @count:		pointer to total number of clusters needed
+ * @errp:               error code
+ *
+ * Return 1st allocated block number on success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
+{
+	struct ext4_allocation_request ar;
+	ext4_fsblk_t ret;
+
+	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+	ar.inode = inode;
+	ar.goal = goal;
+	ar.len = count ? *count : 1;
+	ar.flags = flags;
+
+	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	if (count)
+		*count = ar.len;
+	/*
+	 * Account for the allocated meta blocks.  We will never
+	 * fail EDQUOT for metdata, but we do account for it.
+	 */
+	if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
+		dquot_alloc_block_nofail(inode,
+				EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
+	}
+	return ret;
+}
+
+/**
+ * ext4_count_free_clusters() -- count filesystem free clusters
+ * @sb:		superblock
+ *
+ * Adds up the number of free clusters from each block group.
+ */
+ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
+{
+	ext4_fsblk_t desc_count;
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_group_info *grp;
+#ifdef EXT4FS_DEBUG
+	struct ext4_super_block *es;
+	ext4_fsblk_t bitmap_count;
+	unsigned int x;
+	struct buffer_head *bitmap_bh = NULL;
+
+	es = EXT4_SB(sb)->s_es;
+	desc_count = 0;
+	bitmap_count = 0;
+	gdp = NULL;
+
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		grp = NULL;
+		if (EXT4_SB(sb)->s_group_info)
+			grp = ext4_get_group_info(sb, i);
+		if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			desc_count += ext4_free_group_clusters(sb, gdp);
+		brelse(bitmap_bh);
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
+		if (bitmap_bh == NULL)
+			continue;
+
+		x = ext4_count_free(bitmap_bh->b_data,
+				    EXT4_CLUSTERS_PER_GROUP(sb) / 8);
+		printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
+			i, ext4_free_group_clusters(sb, gdp), x);
+		bitmap_count += x;
+	}
+	brelse(bitmap_bh);
+	printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
+	       ", computed = %llu, %llu\n",
+	       EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
+	       desc_count, bitmap_count);
+	return bitmap_count;
+#else
+	desc_count = 0;
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		grp = NULL;
+		if (EXT4_SB(sb)->s_group_info)
+			grp = ext4_get_group_info(sb, i);
+		if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			desc_count += ext4_free_group_clusters(sb, gdp);
+	}
+
+	return desc_count;
+#endif
+}
+
+static inline int test_root(ext4_group_t a, int b)
+{
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
+}
+
+/**
+ *	ext4_bg_has_super - number of blocks used by the superblock in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the superblock (primary or backup)
+ *	in this group.  Currently this will be only 0 or 1.
+ */
+int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	if (group == 0)
+		return 1;
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+		if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
+		    group == le32_to_cpu(es->s_backup_bgs[1]))
+			return 1;
+		return 0;
+	}
+	if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+		return 1;
+	if (!(group & 1))
+		return 0;
+	if (test_root(group, 3) || (test_root(group, 5)) ||
+	    test_root(group, 7))
+		return 1;
+
+	return 0;
+}
+
+static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
+					ext4_group_t group)
+{
+	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
+	ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
+	ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
+
+	if (group == first || group == first + 1 || group == last)
+		return 1;
+	return 0;
+}
+
+static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
+					ext4_group_t group)
+{
+	if (!ext4_bg_has_super(sb, group))
+		return 0;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+		return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+	else
+		return EXT4_SB(sb)->s_gdb_count;
+}
+
+/**
+ *	ext4_bg_num_gdb - number of blocks used by the group table in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the group descriptor table
+ *	(primary or backup) in this group.  In the future there may be a
+ *	different number of descriptor blocks in each group.
+ */
+unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
+{
+	unsigned long first_meta_bg =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
+			metagroup < first_meta_bg)
+		return ext4_bg_num_gdb_nometa(sb, group);
+
+	return ext4_bg_num_gdb_meta(sb,group);
+
+}
+
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+				     ext4_group_t block_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned num;
+
+	/* Check for superblock and gdt backups in this group */
+	num = ext4_bg_has_super(sb, block_group);
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+	    block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+			  sbi->s_desc_per_block) {
+		if (num) {
+			num += ext4_bg_num_gdb(sb, block_group);
+			num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+		}
+	} else { /* For META_BG_BLOCK_GROUPS */
+		num += ext4_bg_num_gdb(sb, block_group);
+	}
+	return EXT4_NUM_B2C(sbi, num);
+}
+/**
+ *	ext4_inode_to_goal_block - return a hint for block allocation
+ *	@inode: inode for block allocation
+ *
+ *	Return the ideal location to start allocating blocks for a
+ *	newly created inode.
+ */
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_group_t block_group;
+	ext4_grpblk_t colour;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
+	ext4_fsblk_t bg_start;
+	ext4_fsblk_t last_block;
+
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
+	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
+	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+		colour = (current->pid % 16) *
+			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	else
+		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+	return bg_start + colour;
+}
+
diff --git a/kernel/fs/ext4/bitmap.c b/kernel/fs/ext4/bitmap.c
new file mode 100644
index 000000000..4a606afb1
--- /dev/null
+++ b/kernel/fs/ext4/bitmap.c
@@ -0,0 +1,97 @@
+/*
+ *  linux/fs/ext4/bitmap.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/buffer_head.h>
+#include "ext4.h"
+
+unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
+{
+	return numchars * BITS_PER_BYTE - memweight(bitmap, numchars);
+}
+
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+				  struct ext4_group_desc *gdp,
+				  struct buffer_head *bh, int sz)
+{
+	__u32 hi;
+	__u32 provided, calculated;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (!ext4_has_metadata_csum(sb))
+		return 1;
+
+	provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
+	calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+	if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
+		hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
+		provided |= (hi << 16);
+	} else
+		calculated &= 0xFFFF;
+
+	return provided == calculated;
+}
+
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+				struct ext4_group_desc *gdp,
+				struct buffer_head *bh, int sz)
+{
+	__u32 csum;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (!ext4_has_metadata_csum(sb))
+		return;
+
+	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+	gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+	if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+		gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
+
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+				  struct ext4_group_desc *gdp,
+				  struct buffer_head *bh)
+{
+	__u32 hi;
+	__u32 provided, calculated;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
+
+	if (!ext4_has_metadata_csum(sb))
+		return 1;
+
+	provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
+	calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+	if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
+		hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
+		provided |= (hi << 16);
+	} else
+		calculated &= 0xFFFF;
+
+	if (provided == calculated)
+		return 1;
+
+	return 0;
+}
+
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+				struct ext4_group_desc *gdp,
+				struct buffer_head *bh)
+{
+	int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
+	__u32 csum;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (!ext4_has_metadata_csum(sb))
+		return;
+
+	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+	gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+	if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
+		gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
diff --git a/kernel/fs/ext4/block_validity.c b/kernel/fs/ext4/block_validity.c
new file mode 100644
index 000000000..3522340c7
--- /dev/null
+++ b/kernel/fs/ext4/block_validity.c
@@ -0,0 +1,242 @@
+/*
+ *  linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include "ext4.h"
+
+struct ext4_system_zone {
+	struct rb_node	node;
+	ext4_fsblk_t	start_blk;
+	unsigned int	count;
+};
+
+static struct kmem_cache *ext4_system_zone_cachep;
+
+int __init ext4_init_system_zone(void)
+{
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
+	if (ext4_system_zone_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void ext4_exit_system_zone(void)
+{
+	kmem_cache_destroy(ext4_system_zone_cachep);
+}
+
+static inline int can_merge(struct ext4_system_zone *entry1,
+		     struct ext4_system_zone *entry2)
+{
+	if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+		return 1;
+	return 0;
+}
+
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+			   ext4_fsblk_t start_blk,
+			   unsigned int count)
+{
+	struct ext4_system_zone *new_entry = NULL, *entry;
+	struct rb_node **n = &sbi->system_blks.rb_node, *node;
+	struct rb_node *parent = NULL, *new_node = NULL;
+
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct ext4_system_zone, node);
+		if (start_blk < entry->start_blk)
+			n = &(*n)->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = &(*n)->rb_right;
+		else {
+			if (start_blk + count > (entry->start_blk +
+						 entry->count))
+				entry->count = (start_blk + count -
+						entry->start_blk);
+			new_node = *n;
+			new_entry = rb_entry(new_node, struct ext4_system_zone,
+					     node);
+			break;
+		}
+	}
+
+	if (!new_entry) {
+		new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+					     GFP_KERNEL);
+		if (!new_entry)
+			return -ENOMEM;
+		new_entry->start_blk = start_blk;
+		new_entry->count = count;
+		new_node = &new_entry->node;
+
+		rb_link_node(new_node, parent, n);
+		rb_insert_color(new_node, &sbi->system_blks);
+	}
+
+	/* Can we merge to the left? */
+	node = rb_prev(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		if (can_merge(entry, new_entry)) {
+			new_entry->start_blk = entry->start_blk;
+			new_entry->count += entry->count;
+			rb_erase(node, &sbi->system_blks);
+			kmem_cache_free(ext4_system_zone_cachep, entry);
+		}
+	}
+
+	/* Can we merge to the right? */
+	node = rb_next(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		if (can_merge(new_entry, entry)) {
+			new_entry->count += entry->count;
+			rb_erase(node, &sbi->system_blks);
+			kmem_cache_free(ext4_system_zone_cachep, entry);
+		}
+	}
+	return 0;
+}
+
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+	struct rb_node *node;
+	struct ext4_system_zone *entry;
+	int first = 1;
+
+	printk(KERN_INFO "System zones: ");
+	node = rb_first(&sbi->system_blks);
+	while (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		printk("%s%llu-%llu", first ? "" : ", ",
+		       entry->start_blk, entry->start_blk + entry->count - 1);
+		first = 0;
+		node = rb_next(node);
+	}
+	printk("\n");
+}
+
+int ext4_setup_system_zone(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	int flex_size = ext4_flex_bg_size(sbi);
+	int ret;
+
+	if (!test_opt(sb, BLOCK_VALIDITY)) {
+		if (EXT4_SB(sb)->system_blks.rb_node)
+			ext4_release_system_zone(sb);
+		return 0;
+	}
+	if (EXT4_SB(sb)->system_blks.rb_node)
+		return 0;
+
+	for (i=0; i < ngroups; i++) {
+		if (ext4_bg_has_super(sb, i) &&
+		    ((i < 5) || ((i % flex_size) == 0)))
+			add_system_zone(sbi, ext4_group_first_block_no(sb, i),
+					ext4_bg_num_gdb(sb, i) + 1);
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+				sbi->s_itb_per_group);
+		if (ret)
+			return ret;
+	}
+
+	if (test_opt(sb, DEBUG))
+		debug_print_tree(EXT4_SB(sb));
+	return 0;
+}
+
+/* Called when the filesystem is unmounted */
+void ext4_release_system_zone(struct super_block *sb)
+{
+	struct ext4_system_zone	*entry, *n;
+
+	rbtree_postorder_for_each_entry_safe(entry, n,
+			&EXT4_SB(sb)->system_blks, node)
+		kmem_cache_free(ext4_system_zone_cachep, entry);
+
+	EXT4_SB(sb)->system_blks = RB_ROOT;
+}
+
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+			  unsigned int count)
+{
+	struct ext4_system_zone *entry;
+	struct rb_node *n = sbi->system_blks.rb_node;
+
+	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count < start_blk) ||
+	    (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+		sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+		return 0;
+	}
+	while (n) {
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		if (start_blk + count - 1 < entry->start_blk)
+			n = n->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = n->rb_right;
+		else {
+			sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+int ext4_check_blockref(const char *function, unsigned int line,
+			struct inode *inode, __le32 *p, unsigned int max)
+{
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	__le32 *bref = p;
+	unsigned int blk;
+
+	while (bref < p+max) {
+		blk = le32_to_cpu(*bref++);
+		if (blk &&
+		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+						    blk, 1))) {
+			es->s_last_error_block = cpu_to_le64(blk);
+			ext4_error_inode(inode, function, line, blk,
+					 "invalid block");
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
diff --git a/kernel/fs/ext4/crypto.c b/kernel/fs/ext4/crypto.c
new file mode 100644
index 000000000..8ff15273a
--- /dev/null
+++ b/kernel/fs/ext4/crypto.c
@@ -0,0 +1,558 @@
+/*
+ * linux/fs/ext4/crypto.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption functions for ext4
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ *	Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ *	Ildar Muslukhov, 2014
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/user-type.h>
+#include <keys/encrypted-type.h>
+#include <linux/crypto.h>
+#include <linux/ecryptfs.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+
+#include "ext4_extents.h"
+#include "xattr.h"
+
+/* Encryption added and removed here! (L: */
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+		 "Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+		 "Number of crypto contexts to preallocate");
+
+static mempool_t *ext4_bounce_page_pool;
+
+static LIST_HEAD(ext4_free_crypto_ctxs);
+static DEFINE_SPINLOCK(ext4_crypto_ctx_lock);
+
+/**
+ * ext4_release_crypto_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
+{
+	unsigned long flags;
+
+	if (ctx->bounce_page) {
+		if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL)
+			__free_page(ctx->bounce_page);
+		else
+			mempool_free(ctx->bounce_page, ext4_bounce_page_pool);
+		ctx->bounce_page = NULL;
+	}
+	ctx->control_page = NULL;
+	if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+		if (ctx->tfm)
+			crypto_free_tfm(ctx->tfm);
+		kfree(ctx);
+	} else {
+		spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
+		list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
+		spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
+	}
+}
+
+/**
+ * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context
+ * @mask: The allocation mask.
+ *
+ * Return: An allocated and initialized encryption context on success. An error
+ * value or NULL otherwise.
+ */
+static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask)
+{
+	struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx),
+					      mask);
+
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+	return ctx;
+}
+
+/**
+ * ext4_get_crypto_ctx() - Gets an encryption context
+ * @inode:       The inode for which we are doing the crypto
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; error
+ * value or NULL otherwise.
+ */
+struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
+{
+	struct ext4_crypto_ctx *ctx = NULL;
+	int res = 0;
+	unsigned long flags;
+	struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key;
+
+	if (!ext4_read_workqueue)
+		ext4_init_crypto();
+
+	/*
+	 * We first try getting the ctx from a free list because in
+	 * the common case the ctx will have an allocated and
+	 * initialized crypto tfm, so it's probably a worthwhile
+	 * optimization. For the bounce page, we first try getting it
+	 * from the kernel allocator because that's just about as fast
+	 * as getting it from a list and because a cache of free pages
+	 * should generally be a "last resort" option for a filesystem
+	 * to be able to do its job.
+	 */
+	spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
+	ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs,
+				       struct ext4_crypto_ctx, free_list);
+	if (ctx)
+		list_del(&ctx->free_list);
+	spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
+	if (!ctx) {
+		ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS);
+		if (IS_ERR(ctx)) {
+			res = PTR_ERR(ctx);
+			goto out;
+		}
+		ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
+	} else {
+		ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
+	}
+
+	/* Allocate a new Crypto API context if we don't already have
+	 * one or if it isn't the right mode. */
+	BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID);
+	if (ctx->tfm && (ctx->mode != key->mode)) {
+		crypto_free_tfm(ctx->tfm);
+		ctx->tfm = NULL;
+		ctx->mode = EXT4_ENCRYPTION_MODE_INVALID;
+	}
+	if (!ctx->tfm) {
+		switch (key->mode) {
+		case EXT4_ENCRYPTION_MODE_AES_256_XTS:
+			ctx->tfm = crypto_ablkcipher_tfm(
+				crypto_alloc_ablkcipher("xts(aes)", 0, 0));
+			break;
+		case EXT4_ENCRYPTION_MODE_AES_256_GCM:
+			/* TODO(mhalcrow): AEAD w/ gcm(aes);
+			 * crypto_aead_setauthsize() */
+			ctx->tfm = ERR_PTR(-ENOTSUPP);
+			break;
+		default:
+			BUG();
+		}
+		if (IS_ERR_OR_NULL(ctx->tfm)) {
+			res = PTR_ERR(ctx->tfm);
+			ctx->tfm = NULL;
+			goto out;
+		}
+		ctx->mode = key->mode;
+	}
+	BUG_ON(key->size != ext4_encryption_key_size(key->mode));
+
+	/* There shouldn't be a bounce page attached to the crypto
+	 * context at this point. */
+	BUG_ON(ctx->bounce_page);
+
+out:
+	if (res) {
+		if (!IS_ERR_OR_NULL(ctx))
+			ext4_release_crypto_ctx(ctx);
+		ctx = ERR_PTR(res);
+	}
+	return ctx;
+}
+
+struct workqueue_struct *ext4_read_workqueue;
+static DEFINE_MUTEX(crypto_init);
+
+/**
+ * ext4_exit_crypto() - Shutdown the ext4 encryption system
+ */
+void ext4_exit_crypto(void)
+{
+	struct ext4_crypto_ctx *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) {
+		if (pos->bounce_page) {
+			if (pos->flags &
+			    EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) {
+				__free_page(pos->bounce_page);
+			} else {
+				mempool_free(pos->bounce_page,
+					     ext4_bounce_page_pool);
+			}
+		}
+		if (pos->tfm)
+			crypto_free_tfm(pos->tfm);
+		kfree(pos);
+	}
+	INIT_LIST_HEAD(&ext4_free_crypto_ctxs);
+	if (ext4_bounce_page_pool)
+		mempool_destroy(ext4_bounce_page_pool);
+	ext4_bounce_page_pool = NULL;
+	if (ext4_read_workqueue)
+		destroy_workqueue(ext4_read_workqueue);
+	ext4_read_workqueue = NULL;
+}
+
+/**
+ * ext4_init_crypto() - Set up for ext4 encryption.
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int ext4_init_crypto(void)
+{
+	int i, res;
+
+	mutex_lock(&crypto_init);
+	if (ext4_read_workqueue)
+		goto already_initialized;
+	ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0);
+	if (!ext4_read_workqueue) {
+		res = -ENOMEM;
+		goto fail;
+	}
+
+	for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+		struct ext4_crypto_ctx *ctx;
+
+		ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL);
+		if (IS_ERR(ctx)) {
+			res = PTR_ERR(ctx);
+			goto fail;
+		}
+		list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
+	}
+
+	ext4_bounce_page_pool =
+		mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+	if (!ext4_bounce_page_pool) {
+		res = -ENOMEM;
+		goto fail;
+	}
+already_initialized:
+	mutex_unlock(&crypto_init);
+	return 0;
+fail:
+	ext4_exit_crypto();
+	mutex_unlock(&crypto_init);
+	return res;
+}
+
+void ext4_restore_control_page(struct page *data_page)
+{
+	struct ext4_crypto_ctx *ctx =
+		(struct ext4_crypto_ctx *)page_private(data_page);
+
+	set_page_private(data_page, (unsigned long)NULL);
+	ClearPagePrivate(data_page);
+	unlock_page(data_page);
+	ext4_release_crypto_ctx(ctx);
+}
+
+/**
+ * ext4_crypt_complete() - The completion callback for page encryption
+ * @req: The asynchronous encryption request context
+ * @res: The result of the encryption operation
+ */
+static void ext4_crypt_complete(struct crypto_async_request *req, int res)
+{
+	struct ext4_completion_result *ecr = req->data;
+
+	if (res == -EINPROGRESS)
+		return;
+	ecr->res = res;
+	complete(&ecr->completion);
+}
+
+typedef enum {
+	EXT4_DECRYPT = 0,
+	EXT4_ENCRYPT,
+} ext4_direction_t;
+
+static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
+			    struct inode *inode,
+			    ext4_direction_t rw,
+			    pgoff_t index,
+			    struct page *src_page,
+			    struct page *dest_page)
+
+{
+	u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
+	struct ablkcipher_request *req = NULL;
+	DECLARE_EXT4_COMPLETION_RESULT(ecr);
+	struct scatterlist dst, src;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm);
+	int res = 0;
+
+	BUG_ON(!ctx->tfm);
+	BUG_ON(ctx->mode != ei->i_encryption_key.mode);
+
+	if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) {
+		printk_ratelimited(KERN_ERR
+				   "%s: unsupported crypto algorithm: %d\n",
+				   __func__, ctx->mode);
+		return -ENOTSUPP;
+	}
+
+	crypto_ablkcipher_clear_flags(atfm, ~0);
+	crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+
+	res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw,
+				       ei->i_encryption_key.size);
+	if (res) {
+		printk_ratelimited(KERN_ERR
+				   "%s: crypto_ablkcipher_setkey() failed\n",
+				   __func__);
+		return res;
+	}
+	req = ablkcipher_request_alloc(atfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(KERN_ERR
+				   "%s: crypto_request_alloc() failed\n",
+				   __func__);
+		return -ENOMEM;
+	}
+	ablkcipher_request_set_callback(
+		req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+		ext4_crypt_complete, &ecr);
+
+	BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index));
+	memcpy(xts_tweak, &index, sizeof(index));
+	memset(&xts_tweak[sizeof(index)], 0,
+	       EXT4_XTS_TWEAK_SIZE - sizeof(index));
+
+	sg_init_table(&dst, 1);
+	sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+	sg_init_table(&src, 1);
+	sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
+	ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+				     xts_tweak);
+	if (rw == EXT4_DECRYPT)
+		res = crypto_ablkcipher_decrypt(req);
+	else
+		res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	ablkcipher_request_free(req);
+	if (res) {
+		printk_ratelimited(
+			KERN_ERR
+			"%s: crypto_ablkcipher_encrypt() returned %d\n",
+			__func__, res);
+		return res;
+	}
+	return 0;
+}
+
+/**
+ * ext4_encrypt() - Encrypts a page
+ * @inode:          The inode for which the encryption should take place
+ * @plaintext_page: The page to encrypt. Must be locked.
+ *
+ * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
+ * encryption context.
+ *
+ * Called on the page write path.  The caller must call
+ * ext4_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * Return: An allocated page with the encrypted content on success. Else, an
+ * error value or NULL.
+ */
+struct page *ext4_encrypt(struct inode *inode,
+			  struct page *plaintext_page)
+{
+	struct ext4_crypto_ctx *ctx;
+	struct page *ciphertext_page = NULL;
+	int err;
+
+	BUG_ON(!PageLocked(plaintext_page));
+
+	ctx = ext4_get_crypto_ctx(inode);
+	if (IS_ERR(ctx))
+		return (struct page *) ctx;
+
+	/* The encryption operation will require a bounce page. */
+	ciphertext_page = alloc_page(GFP_NOFS);
+	if (!ciphertext_page) {
+		/* This is a potential bottleneck, but at least we'll have
+		 * forward progress. */
+		ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
+						 GFP_NOFS);
+		if (WARN_ON_ONCE(!ciphertext_page)) {
+			ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
+							 GFP_NOFS | __GFP_WAIT);
+		}
+		ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
+	} else {
+		ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
+	}
+	ctx->bounce_page = ciphertext_page;
+	ctx->control_page = plaintext_page;
+	err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
+			       plaintext_page, ciphertext_page);
+	if (err) {
+		ext4_release_crypto_ctx(ctx);
+		return ERR_PTR(err);
+	}
+	SetPagePrivate(ciphertext_page);
+	set_page_private(ciphertext_page, (unsigned long)ctx);
+	lock_page(ciphertext_page);
+	return ciphertext_page;
+}
+
+/**
+ * ext4_decrypt() - Decrypts a page in-place
+ * @ctx:  The encryption context.
+ * @page: The page to decrypt. Must be locked.
+ *
+ * Decrypts page in-place using the ctx encryption context.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	return ext4_page_crypto(ctx, page->mapping->host,
+				EXT4_DECRYPT, page->index, page, page);
+}
+
+/*
+ * Convenience function which takes care of allocating and
+ * deallocating the encryption context
+ */
+int ext4_decrypt_one(struct inode *inode, struct page *page)
+{
+	int ret;
+
+	struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
+
+	if (!ctx)
+		return -ENOMEM;
+	ret = ext4_decrypt(ctx, page);
+	ext4_release_crypto_ctx(ctx);
+	return ret;
+}
+
+int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+	struct ext4_crypto_ctx	*ctx;
+	struct page		*ciphertext_page = NULL;
+	struct bio		*bio;
+	ext4_lblk_t		lblk = ex->ee_block;
+	ext4_fsblk_t		pblk = ext4_ext_pblock(ex);
+	unsigned int		len = ext4_ext_get_actual_len(ex);
+	int			err = 0;
+
+	BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+
+	ctx = ext4_get_crypto_ctx(inode);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ciphertext_page = alloc_page(GFP_NOFS);
+	if (!ciphertext_page) {
+		/* This is a potential bottleneck, but at least we'll have
+		 * forward progress. */
+		ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
+						 GFP_NOFS);
+		if (WARN_ON_ONCE(!ciphertext_page)) {
+			ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
+							 GFP_NOFS | __GFP_WAIT);
+		}
+		ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
+	} else {
+		ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
+	}
+	ctx->bounce_page = ciphertext_page;
+
+	while (len--) {
+		err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
+				       ZERO_PAGE(0), ciphertext_page);
+		if (err)
+			goto errout;
+
+		bio = bio_alloc(GFP_KERNEL, 1);
+		if (!bio) {
+			err = -ENOMEM;
+			goto errout;
+		}
+		bio->bi_bdev = inode->i_sb->s_bdev;
+		bio->bi_iter.bi_sector = pblk;
+		err = bio_add_page(bio, ciphertext_page,
+				   inode->i_sb->s_blocksize, 0);
+		if (err) {
+			bio_put(bio);
+			goto errout;
+		}
+		err = submit_bio_wait(WRITE, bio);
+		if (err)
+			goto errout;
+	}
+	err = 0;
+errout:
+	ext4_release_crypto_ctx(ctx);
+	return err;
+}
+
+bool ext4_valid_contents_enc_mode(uint32_t mode)
+{
+	return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS);
+}
+
+/**
+ * ext4_validate_encryption_key_size() - Validate the encryption key size
+ * @mode: The key mode.
+ * @size: The key size to validate.
+ *
+ * Return: The validated key size for @mode. Zero if invalid.
+ */
+uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
+{
+	if (size == ext4_encryption_key_size(mode))
+		return size;
+	return 0;
+}
diff --git a/kernel/fs/ext4/crypto_fname.c b/kernel/fs/ext4/crypto_fname.c
new file mode 100644
index 000000000..fded02f72
--- /dev/null
+++ b/kernel/fs/ext4/crypto_fname.c
@@ -0,0 +1,719 @@
+/*
+ * linux/fs/ext4/crypto_fname.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains functions for filename crypto management in ext4
+ *
+ * Written by Uday Savagaonkar, 2014.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/crypto.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+
+#include "ext4.h"
+#include "ext4_crypto.h"
+#include "xattr.h"
+
+/**
+ * ext4_dir_crypt_complete() -
+ */
+static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res)
+{
+	struct ext4_completion_result *ecr = req->data;
+
+	if (res == -EINPROGRESS)
+		return;
+	ecr->res = res;
+	complete(&ecr->completion);
+}
+
+bool ext4_valid_filenames_enc_mode(uint32_t mode)
+{
+	return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS);
+}
+
+/**
+ * ext4_fname_encrypt() -
+ *
+ * This function encrypts the input filename, and returns the length of the
+ * ciphertext. Errors are returned as negative numbers.  We trust the caller to
+ * allocate sufficient memory to oname string.
+ */
+static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
+			      const struct qstr *iname,
+			      struct ext4_str *oname)
+{
+	u32 ciphertext_len;
+	struct ablkcipher_request *req = NULL;
+	DECLARE_EXT4_COMPLETION_RESULT(ecr);
+	struct crypto_ablkcipher *tfm = ctx->ctfm;
+	int res = 0;
+	char iv[EXT4_CRYPTO_BLOCK_SIZE];
+	struct scatterlist sg[1];
+	int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
+	char *workbuf;
+
+	if (iname->len <= 0 || iname->len > ctx->lim)
+		return -EIO;
+
+	ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ?
+		EXT4_CRYPTO_BLOCK_SIZE : iname->len;
+	ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
+	ciphertext_len = (ciphertext_len > ctx->lim)
+			? ctx->lim : ciphertext_len;
+
+	/* Allocate request */
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(
+		    KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
+		return -ENOMEM;
+	}
+	ablkcipher_request_set_callback(req,
+		CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+		ext4_dir_crypt_complete, &ecr);
+
+	/* Map the workpage */
+	workbuf = kmap(ctx->workpage);
+
+	/* Copy the input */
+	memcpy(workbuf, iname->name, iname->len);
+	if (iname->len < ciphertext_len)
+		memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
+
+	/* Initialize IV */
+	memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
+
+	/* Create encryption request */
+	sg_init_table(sg, 1);
+	sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
+	ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv);
+	res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	if (res >= 0) {
+		/* Copy the result to output */
+		memcpy(oname->name, workbuf, ciphertext_len);
+		res = ciphertext_len;
+	}
+	kunmap(ctx->workpage);
+	ablkcipher_request_free(req);
+	if (res < 0) {
+		printk_ratelimited(
+		    KERN_ERR "%s: Error (error code %d)\n", __func__, res);
+	}
+	oname->len = ciphertext_len;
+	return res;
+}
+
+/*
+ * ext4_fname_decrypt()
+ *	This function decrypts the input filename, and returns
+ *	the length of the plaintext.
+ *	Errors are returned as negative numbers.
+ *	We trust the caller to allocate sufficient memory to oname string.
+ */
+static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx,
+			      const struct ext4_str *iname,
+			      struct ext4_str *oname)
+{
+	struct ext4_str tmp_in[2], tmp_out[1];
+	struct ablkcipher_request *req = NULL;
+	DECLARE_EXT4_COMPLETION_RESULT(ecr);
+	struct scatterlist sg[1];
+	struct crypto_ablkcipher *tfm = ctx->ctfm;
+	int res = 0;
+	char iv[EXT4_CRYPTO_BLOCK_SIZE];
+	char *workbuf;
+
+	if (iname->len <= 0 || iname->len > ctx->lim)
+		return -EIO;
+
+	tmp_in[0].name = iname->name;
+	tmp_in[0].len = iname->len;
+	tmp_out[0].name = oname->name;
+
+	/* Allocate request */
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(
+		    KERN_ERR "%s: crypto_request_alloc() failed\n",  __func__);
+		return -ENOMEM;
+	}
+	ablkcipher_request_set_callback(req,
+		CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+		ext4_dir_crypt_complete, &ecr);
+
+	/* Map the workpage */
+	workbuf = kmap(ctx->workpage);
+
+	/* Copy the input */
+	memcpy(workbuf, iname->name, iname->len);
+
+	/* Initialize IV */
+	memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
+
+	/* Create encryption request */
+	sg_init_table(sg, 1);
+	sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
+	ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv);
+	res = crypto_ablkcipher_decrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	if (res >= 0) {
+		/* Copy the result to output */
+		memcpy(oname->name, workbuf, iname->len);
+		res = iname->len;
+	}
+	kunmap(ctx->workpage);
+	ablkcipher_request_free(req);
+	if (res < 0) {
+		printk_ratelimited(
+		    KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n",
+		    __func__, res);
+		return res;
+	}
+
+	oname->len = strnlen(oname->name, iname->len);
+	return oname->len;
+}
+
+static const char *lookup_table =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+/**
+ * ext4_fname_encode_digest() -
+ *
+ * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * The encoded string is roughly 4/3 times the size of the input string.
+ */
+static int digest_encode(const char *src, int len, char *dst)
+{
+	int i = 0, bits = 0, ac = 0;
+	char *cp = dst;
+
+	while (i < len) {
+		ac += (((unsigned char) src[i]) << bits);
+		bits += 8;
+		do {
+			*cp++ = lookup_table[ac & 0x3f];
+			ac >>= 6;
+			bits -= 6;
+		} while (bits >= 6);
+		i++;
+	}
+	if (bits)
+		*cp++ = lookup_table[ac & 0x3f];
+	return cp - dst;
+}
+
+static int digest_decode(const char *src, int len, char *dst)
+{
+	int i = 0, bits = 0, ac = 0;
+	const char *p;
+	char *cp = dst;
+
+	while (i < len) {
+		p = strchr(lookup_table, src[i]);
+		if (p == NULL || src[i] == 0)
+			return -2;
+		ac += (p - lookup_table) << bits;
+		bits += 6;
+		if (bits >= 8) {
+			*cp++ = ac & 0xff;
+			ac >>= 8;
+			bits -= 8;
+		}
+		i++;
+	}
+	if (ac)
+		return -1;
+	return cp - dst;
+}
+
+/**
+ * ext4_free_fname_crypto_ctx() -
+ *
+ * Frees up a crypto context.
+ */
+void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx)
+{
+	if (ctx == NULL || IS_ERR(ctx))
+		return;
+
+	if (ctx->ctfm && !IS_ERR(ctx->ctfm))
+		crypto_free_ablkcipher(ctx->ctfm);
+	if (ctx->htfm && !IS_ERR(ctx->htfm))
+		crypto_free_hash(ctx->htfm);
+	if (ctx->workpage && !IS_ERR(ctx->workpage))
+		__free_page(ctx->workpage);
+	kfree(ctx);
+}
+
+/**
+ * ext4_put_fname_crypto_ctx() -
+ *
+ * Return: The crypto context onto free list. If the free list is above a
+ * threshold, completely frees up the context, and returns the memory.
+ *
+ * TODO: Currently we directly free the crypto context. Eventually we should
+ * add code it to return to free list. Such an approach will increase
+ * efficiency of directory lookup.
+ */
+void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx)
+{
+	if (*ctx == NULL || IS_ERR(*ctx))
+		return;
+	ext4_free_fname_crypto_ctx(*ctx);
+	*ctx = NULL;
+}
+
+/**
+ * ext4_search_fname_crypto_ctx() -
+ */
+static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx(
+		const struct ext4_encryption_key *key)
+{
+	return NULL;
+}
+
+/**
+ * ext4_alloc_fname_crypto_ctx() -
+ */
+struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx(
+	const struct ext4_encryption_key *key)
+{
+	struct ext4_fname_crypto_ctx *ctx;
+
+	ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS);
+	if (ctx == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) {
+		/* This will automatically set key mode to invalid
+		 * As enum for ENCRYPTION_MODE_INVALID is zero */
+		memset(&ctx->key, 0, sizeof(ctx->key));
+	} else {
+		memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key));
+	}
+	ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode)
+		? 0 : 1;
+	ctx->ctfm_key_is_ready = 0;
+	ctx->ctfm = NULL;
+	ctx->htfm = NULL;
+	ctx->workpage = NULL;
+	return ctx;
+}
+
+/**
+ * ext4_get_fname_crypto_ctx() -
+ *
+ * Allocates a free crypto context and initializes it to hold
+ * the crypto material for the inode.
+ *
+ * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise.
+ */
+struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(
+	struct inode *inode, u32 max_ciphertext_len)
+{
+	struct ext4_fname_crypto_ctx *ctx;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int res;
+
+	/* Check if the crypto policy is set on the inode */
+	res = ext4_encrypted_inode(inode);
+	if (res == 0)
+		return NULL;
+
+	if (!ext4_has_encryption_key(inode))
+		ext4_generate_encryption_key(inode);
+
+	/* Get a crypto context based on the key.
+	 * A new context is allocated if no context matches the requested key.
+	 */
+	ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key));
+	if (ctx == NULL)
+		ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key));
+	if (IS_ERR(ctx))
+		return ctx;
+
+	ctx->flags = ei->i_crypt_policy_flags;
+	if (ctx->has_valid_key) {
+		if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) {
+			printk_once(KERN_WARNING
+				    "ext4: unsupported key mode %d\n",
+				    ctx->key.mode);
+			return ERR_PTR(-ENOKEY);
+		}
+
+		/* As a first cut, we will allocate new tfm in every call.
+		 * later, we will keep the tfm around, in case the key gets
+		 * re-used */
+		if (ctx->ctfm == NULL) {
+			ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))",
+					0, 0);
+		}
+		if (IS_ERR(ctx->ctfm)) {
+			res = PTR_ERR(ctx->ctfm);
+			printk(
+			    KERN_DEBUG "%s: error (%d) allocating crypto tfm\n",
+			    __func__, res);
+			ctx->ctfm = NULL;
+			ext4_put_fname_crypto_ctx(&ctx);
+			return ERR_PTR(res);
+		}
+		if (ctx->ctfm == NULL) {
+			printk(
+			    KERN_DEBUG "%s: could not allocate crypto tfm\n",
+			    __func__);
+			ext4_put_fname_crypto_ctx(&ctx);
+			return ERR_PTR(-ENOMEM);
+		}
+		if (ctx->workpage == NULL)
+			ctx->workpage = alloc_page(GFP_NOFS);
+		if (IS_ERR(ctx->workpage)) {
+			res = PTR_ERR(ctx->workpage);
+			printk(
+			    KERN_DEBUG "%s: error (%d) allocating work page\n",
+			    __func__, res);
+			ctx->workpage = NULL;
+			ext4_put_fname_crypto_ctx(&ctx);
+			return ERR_PTR(res);
+		}
+		if (ctx->workpage == NULL) {
+			printk(
+			    KERN_DEBUG "%s: could not allocate work page\n",
+			    __func__);
+			ext4_put_fname_crypto_ctx(&ctx);
+			return ERR_PTR(-ENOMEM);
+		}
+		ctx->lim = max_ciphertext_len;
+		crypto_ablkcipher_clear_flags(ctx->ctfm, ~0);
+		crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm),
+			CRYPTO_TFM_REQ_WEAK_KEY);
+
+		/* If we are lucky, we will get a context that is already
+		 * set up with the right key. Else, we will have to
+		 * set the key */
+		if (!ctx->ctfm_key_is_ready) {
+			/* Since our crypto objectives for filename encryption
+			 * are pretty weak,
+			 * we directly use the inode master key */
+			res = crypto_ablkcipher_setkey(ctx->ctfm,
+					ctx->key.raw, ctx->key.size);
+			if (res) {
+				ext4_put_fname_crypto_ctx(&ctx);
+				return ERR_PTR(-EIO);
+			}
+			ctx->ctfm_key_is_ready = 1;
+		} else {
+			/* In the current implementation, key should never be
+			 * marked "ready" for a context that has just been
+			 * allocated. So we should never reach here */
+			 BUG();
+		}
+	}
+	if (ctx->htfm == NULL)
+		ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(ctx->htfm)) {
+		res = PTR_ERR(ctx->htfm);
+		printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n",
+			__func__, res);
+		ctx->htfm = NULL;
+		ext4_put_fname_crypto_ctx(&ctx);
+		return ERR_PTR(res);
+	}
+	if (ctx->htfm == NULL) {
+		printk(KERN_DEBUG "%s: could not allocate hash tfm\n",
+				__func__);
+		ext4_put_fname_crypto_ctx(&ctx);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return ctx;
+}
+
+/**
+ * ext4_fname_crypto_round_up() -
+ *
+ * Return: The next multiple of block size
+ */
+u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
+{
+	return ((size+blksize-1)/blksize)*blksize;
+}
+
+/**
+ * ext4_fname_crypto_namelen_on_disk() -
+ */
+int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
+				      u32 namelen)
+{
+	u32 ciphertext_len;
+	int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
+
+	if (ctx == NULL)
+		return -EIO;
+	if (!(ctx->has_valid_key))
+		return -EACCES;
+	ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ?
+		EXT4_CRYPTO_BLOCK_SIZE : namelen;
+	ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
+	ciphertext_len = (ciphertext_len > ctx->lim)
+			? ctx->lim : ciphertext_len;
+	return (int) ciphertext_len;
+}
+
+/**
+ * ext4_fname_crypto_alloc_obuff() -
+ *
+ * Allocates an output buffer that is sufficient for the crypto operation
+ * specified by the context and the direction.
+ */
+int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
+				   u32 ilen, struct ext4_str *crypto_str)
+{
+	unsigned int olen;
+	int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
+
+	if (!ctx)
+		return -EIO;
+	if (padding < EXT4_CRYPTO_BLOCK_SIZE)
+		padding = EXT4_CRYPTO_BLOCK_SIZE;
+	olen = ext4_fname_crypto_round_up(ilen, padding);
+	crypto_str->len = olen;
+	if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
+		olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
+	/* Allocated buffer can hold one more character to null-terminate the
+	 * string */
+	crypto_str->name = kmalloc(olen+1, GFP_NOFS);
+	if (!(crypto_str->name))
+		return -ENOMEM;
+	return 0;
+}
+
+/**
+ * ext4_fname_crypto_free_buffer() -
+ *
+ * Frees the buffer allocated for crypto operation.
+ */
+void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str)
+{
+	if (!crypto_str)
+		return;
+	kfree(crypto_str->name);
+	crypto_str->name = NULL;
+}
+
+/**
+ * ext4_fname_disk_to_usr() - converts a filename from disk space to user space
+ */
+int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+			    struct dx_hash_info *hinfo,
+			    const struct ext4_str *iname,
+			    struct ext4_str *oname)
+{
+	char buf[24];
+	int ret;
+
+	if (ctx == NULL)
+		return -EIO;
+	if (iname->len < 3) {
+		/*Check for . and .. */
+		if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') {
+			oname->name[0] = '.';
+			oname->name[iname->len-1] = '.';
+			oname->len = iname->len;
+			return oname->len;
+		}
+	}
+	if (ctx->has_valid_key)
+		return ext4_fname_decrypt(ctx, iname, oname);
+
+	if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
+		ret = digest_encode(iname->name, iname->len, oname->name);
+		oname->len = ret;
+		return ret;
+	}
+	if (hinfo) {
+		memcpy(buf, &hinfo->hash, 4);
+		memcpy(buf+4, &hinfo->minor_hash, 4);
+	} else
+		memset(buf, 0, 8);
+	memcpy(buf + 8, iname->name + iname->len - 16, 16);
+	oname->name[0] = '_';
+	ret = digest_encode(buf, 24, oname->name+1);
+	oname->len = ret + 1;
+	return ret + 1;
+}
+
+int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+			   struct dx_hash_info *hinfo,
+			   const struct ext4_dir_entry_2 *de,
+			   struct ext4_str *oname)
+{
+	struct ext4_str iname = {.name = (unsigned char *) de->name,
+				 .len = de->name_len };
+
+	return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname);
+}
+
+
+/**
+ * ext4_fname_usr_to_disk() - converts a filename from user space to disk space
+ */
+int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
+			   const struct qstr *iname,
+			   struct ext4_str *oname)
+{
+	int res;
+
+	if (ctx == NULL)
+		return -EIO;
+	if (iname->len < 3) {
+		/*Check for . and .. */
+		if (iname->name[0] == '.' &&
+				iname->name[iname->len-1] == '.') {
+			oname->name[0] = '.';
+			oname->name[iname->len-1] = '.';
+			oname->len = iname->len;
+			return oname->len;
+		}
+	}
+	if (ctx->has_valid_key) {
+		res = ext4_fname_encrypt(ctx, iname, oname);
+		return res;
+	}
+	/* Without a proper key, a user is not allowed to modify the filenames
+	 * in a directory. Consequently, a user space name cannot be mapped to
+	 * a disk-space name */
+	return -EACCES;
+}
+
+/*
+ * Calculate the htree hash from a filename from user space
+ */
+int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
+			    const struct qstr *iname,
+			    struct dx_hash_info *hinfo)
+{
+	struct ext4_str tmp;
+	int ret = 0;
+	char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1];
+
+	if (!ctx ||
+	    ((iname->name[0] == '.') &&
+	     ((iname->len == 1) ||
+	      ((iname->name[1] == '.') && (iname->len == 2))))) {
+		ext4fs_dirhash(iname->name, iname->len, hinfo);
+		return 0;
+	}
+
+	if (!ctx->has_valid_key && iname->name[0] == '_') {
+		if (iname->len != 33)
+			return -ENOENT;
+		ret = digest_decode(iname->name+1, iname->len, buf);
+		if (ret != 24)
+			return -ENOENT;
+		memcpy(&hinfo->hash, buf, 4);
+		memcpy(&hinfo->minor_hash, buf + 4, 4);
+		return 0;
+	}
+
+	if (!ctx->has_valid_key && iname->name[0] != '_') {
+		if (iname->len > 43)
+			return -ENOENT;
+		ret = digest_decode(iname->name, iname->len, buf);
+		ext4fs_dirhash(buf, ret, hinfo);
+		return 0;
+	}
+
+	/* First encrypt the plaintext name */
+	ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp);
+	if (ret < 0)
+		return ret;
+
+	ret = ext4_fname_encrypt(ctx, iname, &tmp);
+	if (ret >= 0) {
+		ext4fs_dirhash(tmp.name, tmp.len, hinfo);
+		ret = 0;
+	}
+
+	ext4_fname_crypto_free_buffer(&tmp);
+	return ret;
+}
+
+int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr,
+		     int len, const char * const name,
+		     struct ext4_dir_entry_2 *de)
+{
+	int ret = -ENOENT;
+	int bigname = (*name == '_');
+
+	if (ctx->has_valid_key) {
+		if (cstr->name == NULL) {
+			struct qstr istr;
+
+			ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr);
+			if (ret < 0)
+				goto errout;
+			istr.name = name;
+			istr.len = len;
+			ret = ext4_fname_encrypt(ctx, &istr, cstr);
+			if (ret < 0)
+				goto errout;
+		}
+	} else {
+		if (cstr->name == NULL) {
+			cstr->name = kmalloc(32, GFP_KERNEL);
+			if (cstr->name == NULL)
+				return -ENOMEM;
+			if ((bigname && (len != 33)) ||
+			    (!bigname && (len > 43)))
+				goto errout;
+			ret = digest_decode(name+bigname, len-bigname,
+					    cstr->name);
+			if (ret < 0) {
+				ret = -ENOENT;
+				goto errout;
+			}
+			cstr->len = ret;
+		}
+		if (bigname) {
+			if (de->name_len < 16)
+				return 0;
+			ret = memcmp(de->name + de->name_len - 16,
+				     cstr->name + 8, 16);
+			return (ret == 0) ? 1 : 0;
+		}
+	}
+	if (de->name_len != cstr->len)
+		return 0;
+	ret = memcmp(de->name, cstr->name, cstr->len);
+	return (ret == 0) ? 1 : 0;
+errout:
+	kfree(cstr->name);
+	cstr->name = NULL;
+	return ret;
+}
diff --git a/kernel/fs/ext4/crypto_key.c b/kernel/fs/ext4/crypto_key.c
new file mode 100644
index 000000000..52170d0b7
--- /dev/null
+++ b/kernel/fs/ext4/crypto_key.c
@@ -0,0 +1,166 @@
+/*
+ * linux/fs/ext4/crypto_key.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions for ext4
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/keyctl.h>
+
+#include "ext4.h"
+#include "xattr.h"
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+	struct ext4_completion_result *ecr = req->data;
+
+	if (rc == -EINPROGRESS)
+		return;
+
+	ecr->res = rc;
+	complete(&ecr->completion);
+}
+
+/**
+ * ext4_derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivatio.
+ * @source_key:   Source key to which to apply derivation.
+ * @derived_key:  Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
+			       char source_key[EXT4_AES_256_XTS_KEY_SIZE],
+			       char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
+{
+	int res = 0;
+	struct ablkcipher_request *req = NULL;
+	DECLARE_EXT4_COMPLETION_RESULT(ecr);
+	struct scatterlist src_sg, dst_sg;
+	struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
+								0);
+
+	if (IS_ERR(tfm)) {
+		res = PTR_ERR(tfm);
+		tfm = NULL;
+		goto out;
+	}
+	crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		res = -ENOMEM;
+		goto out;
+	}
+	ablkcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			derive_crypt_complete, &ecr);
+	res = crypto_ablkcipher_setkey(tfm, deriving_key,
+				       EXT4_AES_128_ECB_KEY_SIZE);
+	if (res < 0)
+		goto out;
+	sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
+	sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
+				     EXT4_AES_256_XTS_KEY_SIZE, NULL);
+	res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+
+out:
+	if (req)
+		ablkcipher_request_free(req);
+	if (tfm)
+		crypto_free_ablkcipher(tfm);
+	return res;
+}
+
+/**
+ * ext4_generate_encryption_key() - generates an encryption key
+ * @inode: The inode to generate the encryption key for.
+ */
+int ext4_generate_encryption_key(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_encryption_key *crypt_key = &ei->i_encryption_key;
+	char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
+				 (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1];
+	struct key *keyring_key = NULL;
+	struct ext4_encryption_key *master_key;
+	struct ext4_encryption_context ctx;
+	struct user_key_payload *ukp;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+				 &ctx, sizeof(ctx));
+
+	if (res != sizeof(ctx)) {
+		if (res > 0)
+			res = -EINVAL;
+		goto out;
+	}
+	res = 0;
+
+	ei->i_crypt_policy_flags = ctx.flags;
+	if (S_ISREG(inode->i_mode))
+		crypt_key->mode = ctx.contents_encryption_mode;
+	else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		crypt_key->mode = ctx.filenames_encryption_mode;
+	else {
+		printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n");
+		BUG();
+	}
+	crypt_key->size = ext4_encryption_key_size(crypt_key->mode);
+	BUG_ON(!crypt_key->size);
+	if (DUMMY_ENCRYPTION_ENABLED(sbi)) {
+		memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE);
+		goto out;
+	}
+	memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX,
+	       EXT4_KEY_DESC_PREFIX_SIZE);
+	sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE,
+		"%*phN", EXT4_KEY_DESCRIPTOR_SIZE,
+		ctx.master_key_descriptor);
+	full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
+			    (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0';
+	keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+	if (IS_ERR(keyring_key)) {
+		res = PTR_ERR(keyring_key);
+		keyring_key = NULL;
+		goto out;
+	}
+	BUG_ON(keyring_key->type != &key_type_logon);
+	ukp = ((struct user_key_payload *)keyring_key->payload.data);
+	if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
+		res = -EINVAL;
+		goto out;
+	}
+	master_key = (struct ext4_encryption_key *)ukp->data;
+	BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
+		     EXT4_KEY_DERIVATION_NONCE_SIZE);
+	BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
+	res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw);
+out:
+	if (keyring_key)
+		key_put(keyring_key);
+	if (res < 0)
+		crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID;
+	return res;
+}
+
+int ext4_has_encryption_key(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_encryption_key *crypt_key = &ei->i_encryption_key;
+
+	return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID);
+}
diff --git a/kernel/fs/ext4/crypto_policy.c b/kernel/fs/ext4/crypto_policy.c
new file mode 100644
index 000000000..a6d6291ae
--- /dev/null
+++ b/kernel/fs/ext4/crypto_policy.c
@@ -0,0 +1,198 @@
+/*
+ * linux/fs/ext4/crypto_policy.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption policy functions for ext4
+ *
+ * Written by Michael Halcrow, 2015.
+ */
+
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "ext4.h"
+#include "xattr.h"
+
+static int ext4_inode_has_encryption_context(struct inode *inode)
+{
+	int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0);
+	return (res > 0);
+}
+
+/*
+ * check whether the policy is consistent with the encryption context
+ * for the inode
+ */
+static int ext4_is_encryption_context_consistent_with_policy(
+	struct inode *inode, const struct ext4_encryption_policy *policy)
+{
+	struct ext4_encryption_context ctx;
+	int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+				 sizeof(ctx));
+	if (res != sizeof(ctx))
+		return 0;
+	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
+			EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
+		(ctx.flags ==
+		 policy->flags) &&
+		(ctx.contents_encryption_mode ==
+		 policy->contents_encryption_mode) &&
+		(ctx.filenames_encryption_mode ==
+		 policy->filenames_encryption_mode));
+}
+
+static int ext4_create_encryption_context_from_policy(
+	struct inode *inode, const struct ext4_encryption_policy *policy)
+{
+	struct ext4_encryption_context ctx;
+	int res = 0;
+
+	ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
+	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+	       EXT4_KEY_DESCRIPTOR_SIZE);
+	if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) {
+		printk(KERN_WARNING
+		       "%s: Invalid contents encryption mode %d\n", __func__,
+			policy->contents_encryption_mode);
+		return -EINVAL;
+	}
+	if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
+		printk(KERN_WARNING
+		       "%s: Invalid filenames encryption mode %d\n", __func__,
+			policy->filenames_encryption_mode);
+		return -EINVAL;
+	}
+	if (policy->flags & ~EXT4_POLICY_FLAGS_VALID)
+		return -EINVAL;
+	ctx.contents_encryption_mode = policy->contents_encryption_mode;
+	ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+	ctx.flags = policy->flags;
+	BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE);
+	get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
+
+	res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+			     sizeof(ctx), 0);
+	if (!res)
+		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+	return res;
+}
+
+int ext4_process_policy(const struct ext4_encryption_policy *policy,
+			struct inode *inode)
+{
+	if (policy->version != 0)
+		return -EINVAL;
+
+	if (!ext4_inode_has_encryption_context(inode)) {
+		if (!ext4_empty_dir(inode))
+			return -ENOTEMPTY;
+		return ext4_create_encryption_context_from_policy(inode,
+								  policy);
+	}
+
+	if (ext4_is_encryption_context_consistent_with_policy(inode, policy))
+		return 0;
+
+	printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
+	       __func__);
+	return -EINVAL;
+}
+
+int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy)
+{
+	struct ext4_encryption_context ctx;
+
+	int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+				 &ctx, sizeof(ctx));
+	if (res != sizeof(ctx))
+		return -ENOENT;
+	if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1)
+		return -EINVAL;
+	policy->version = 0;
+	policy->contents_encryption_mode = ctx.contents_encryption_mode;
+	policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+	policy->flags = ctx.flags;
+	memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+	       EXT4_KEY_DESCRIPTOR_SIZE);
+	return 0;
+}
+
+int ext4_is_child_context_consistent_with_parent(struct inode *parent,
+						 struct inode *child)
+{
+	struct ext4_encryption_context parent_ctx, child_ctx;
+	int res;
+
+	if ((parent == NULL) || (child == NULL)) {
+		pr_err("parent %p child %p\n", parent, child);
+		BUG_ON(1);
+	}
+	/* no restrictions if the parent directory is not encrypted */
+	if (!ext4_encrypted_inode(parent))
+		return 1;
+	res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
+			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+			     &parent_ctx, sizeof(parent_ctx));
+	if (res != sizeof(parent_ctx))
+		return 0;
+	/* if the child directory is not encrypted, this is always a problem */
+	if (!ext4_encrypted_inode(child))
+		return 0;
+	res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION,
+			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+			     &child_ctx, sizeof(child_ctx));
+	if (res != sizeof(child_ctx))
+		return 0;
+	return (memcmp(parent_ctx.master_key_descriptor,
+		       child_ctx.master_key_descriptor,
+		       EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
+		(parent_ctx.contents_encryption_mode ==
+		 child_ctx.contents_encryption_mode) &&
+		(parent_ctx.filenames_encryption_mode ==
+		 child_ctx.filenames_encryption_mode));
+}
+
+/**
+ * ext4_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child:  Child inode that inherits the context from @parent.
+ *
+ * Return: Zero on success, non-zero otherwise
+ */
+int ext4_inherit_context(struct inode *parent, struct inode *child)
+{
+	struct ext4_encryption_context ctx;
+	int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
+				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+				 &ctx, sizeof(ctx));
+
+	if (res != sizeof(ctx)) {
+		if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) {
+			ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
+			ctx.contents_encryption_mode =
+				EXT4_ENCRYPTION_MODE_AES_256_XTS;
+			ctx.filenames_encryption_mode =
+				EXT4_ENCRYPTION_MODE_AES_256_CTS;
+			ctx.flags = 0;
+			memset(ctx.master_key_descriptor, 0x42,
+			       EXT4_KEY_DESCRIPTOR_SIZE);
+			res = 0;
+		} else {
+			goto out;
+		}
+	}
+	get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
+	res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION,
+			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+			     sizeof(ctx), 0);
+out:
+	if (!res)
+		ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT);
+	return res;
+}
diff --git a/kernel/fs/ext4/dir.c b/kernel/fs/ext4/dir.c
new file mode 100644
index 000000000..5665d82d2
--- /dev/null
+++ b/kernel/fs/ext4/dir.c
@@ -0,0 +1,644 @@
+/*
+ *  linux/fs/ext4/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/dir.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4 directory handling functions
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * Hash Tree Directory indexing (c) 2001  Daniel Phillips
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include "ext4.h"
+#include "xattr.h"
+
+static int ext4_dx_readdir(struct file *, struct dir_context *);
+
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+		     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+	     ext4_has_inline_data(inode)))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
+ */
+int __ext4_check_dir_entry(const char *function, unsigned int line,
+			   struct inode *dir, struct file *filp,
+			   struct ext4_dir_entry_2 *de,
+			   struct buffer_head *bh, char *buf, int size,
+			   unsigned int offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
+
+	if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
+		error_msg = "rec_len is smaller than minimal";
+	else if (unlikely(rlen % 4 != 0))
+		error_msg = "rec_len % 4 != 0";
+	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
+		error_msg = "rec_len is too small for name_len";
+	else if (unlikely(((char *) de - buf) + rlen > size))
+		error_msg = "directory entry across range";
+	else if (unlikely(le32_to_cpu(de->inode) >
+			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
+		error_msg = "inode out of bounds";
+	else
+		return 0;
+
+	if (filp)
+		ext4_error_file(filp, function, line, bh->b_blocknr,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset % size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+	else
+		ext4_error_inode(dir, function, line, bh->b_blocknr,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset % size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+
+	return 1;
+}
+
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
+{
+	unsigned int offset;
+	int i;
+	struct ext4_dir_entry_2 *de;
+	int err;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh = NULL;
+	int dir_has_error = 0;
+	struct ext4_fname_crypto_ctx *enc_ctx = NULL;
+	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+
+	if (is_dx_dir(inode)) {
+		err = ext4_dx_readdir(file, ctx);
+		if (err != ERR_BAD_DX_DIR) {
+			return err;
+		}
+		/*
+		 * We don't set the inode dirty flag since it's not
+		 * critical that it get flushed back to the disk.
+		 */
+		ext4_clear_inode_flag(file_inode(file),
+				      EXT4_INODE_INDEX);
+	}
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+		err = ext4_read_inline_dir(file, ctx,
+					   &has_inline_data);
+		if (has_inline_data)
+			return err;
+	}
+
+	enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN);
+	if (IS_ERR(enc_ctx))
+		return PTR_ERR(enc_ctx);
+	if (enc_ctx) {
+		err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN,
+						     &fname_crypto_str);
+		if (err < 0) {
+			ext4_put_fname_crypto_ctx(&enc_ctx);
+			return err;
+		}
+	}
+
+	offset = ctx->pos & (sb->s_blocksize - 1);
+
+	while (ctx->pos < inode->i_size) {
+		struct ext4_map_blocks map;
+
+		map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		map.m_len = 1;
+		err = ext4_map_blocks(NULL, inode, &map, 0);
+		if (err > 0) {
+			pgoff_t index = map.m_pblk >>
+					(PAGE_CACHE_SHIFT - inode->i_blkbits);
+			if (!ra_has_index(&file->f_ra, index))
+				page_cache_sync_readahead(
+					sb->s_bdev->bd_inode->i_mapping,
+					&file->f_ra, file,
+					index, 1);
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			bh = ext4_bread(NULL, inode, map.m_lblk, 0);
+			if (IS_ERR(bh))
+				return PTR_ERR(bh);
+		}
+
+		if (!bh) {
+			if (!dir_has_error) {
+				EXT4_ERROR_FILE(file, 0,
+						"directory contains a "
+						"hole at offset %llu",
+					   (unsigned long long) ctx->pos);
+				dir_has_error = 1;
+			}
+			/* corrupt size?  Maybe no more blocks to read */
+			if (ctx->pos > inode->i_blocks << 9)
+				break;
+			ctx->pos += sb->s_blocksize - offset;
+			continue;
+		}
+
+		/* Check the checksum */
+		if (!buffer_verified(bh) &&
+		    !ext4_dirent_csum_verify(inode,
+				(struct ext4_dir_entry *)bh->b_data)) {
+			EXT4_ERROR_FILE(file, 0, "directory fails checksum "
+					"at offset %llu",
+					(unsigned long long)ctx->pos);
+			ctx->pos += sb->s_blocksize - offset;
+			brelse(bh);
+			bh = NULL;
+			continue;
+		}
+		set_buffer_verified(bh);
+
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (file->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ext4_dir_entry_2 *)
+					(bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
+					break;
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
+			}
+			offset = i;
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
+				| offset;
+			file->f_version = inode->i_version;
+		}
+
+		while (ctx->pos < inode->i_size
+		       && offset < sb->s_blocksize) {
+			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
+			if (ext4_check_dir_entry(inode, file, de, bh,
+						 bh->b_data, bh->b_size,
+						 offset)) {
+				/*
+				 * On error, skip to the next block
+				 */
+				ctx->pos = (ctx->pos |
+						(sb->s_blocksize - 1)) + 1;
+				break;
+			}
+			offset += ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize);
+			if (le32_to_cpu(de->inode)) {
+				if (enc_ctx == NULL) {
+					/* Directory is not encrypted */
+					if (!dir_emit(ctx, de->name,
+					    de->name_len,
+					    le32_to_cpu(de->inode),
+					    get_dtype(sb, de->file_type)))
+						goto done;
+				} else {
+					/* Directory is encrypted */
+					err = ext4_fname_disk_to_usr(enc_ctx,
+						NULL, de, &fname_crypto_str);
+					if (err < 0)
+						goto errout;
+					if (!dir_emit(ctx,
+					    fname_crypto_str.name, err,
+					    le32_to_cpu(de->inode),
+					    get_dtype(sb, de->file_type)))
+						goto done;
+				}
+			}
+			ctx->pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
+		}
+		if ((ctx->pos < inode->i_size) && !dir_relax(inode))
+			goto done;
+		brelse(bh);
+		bh = NULL;
+		offset = 0;
+	}
+done:
+	err = 0;
+errout:
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	ext4_put_fname_crypto_ctx(&enc_ctx);
+	ext4_fname_crypto_free_buffer(&fname_crypto_str);
+#endif
+	brelse(bh);
+	return err;
+}
+
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+	return is_compat_task();
+#else
+	return (BITS_PER_LONG == 32);
+#endif
+}
+
+/*
+ * These functions convert from the major/minor hash to an f_pos
+ * value for dx directories
+ *
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return major >> 1;
+	else
+		return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return (pos << 1) & 0xffffffff;
+	else
+		return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return 0;
+	else
+		return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext4_get_htree_eof(struct file *filp)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return EXT4_HTREE_EOF_32BIT;
+	else
+		return EXT4_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
+ * directories, where the "offset" is in terms of the filename hash
+ * value instead of the byte offset.
+ *
+ * Because we may return a 64-bit hash that is well beyond offset limits,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case.
+ *
+ * For non-htree, ext4_llseek already chooses the proper max offset.
+ */
+static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	int dx_dir = is_dx_dir(inode);
+	loff_t htree_max = ext4_get_htree_eof(file);
+
+	if (likely(dx_dir))
+		return generic_file_llseek_size(file, offset, whence,
+						    htree_max, htree_max);
+	else
+		return ext4_llseek(file, offset, whence);
+}
+
+/*
+ * This structure holds the nodes of the red-black tree used to store
+ * the directory entry in hash order.
+ */
+struct fname {
+	__u32		hash;
+	__u32		minor_hash;
+	struct rb_node	rb_hash;
+	struct fname	*next;
+	__u32		inode;
+	__u8		name_len;
+	__u8		file_type;
+	char		name[0];
+};
+
+/*
+ * This functoin implements a non-recursive way of freeing all of the
+ * nodes in the red-black tree.
+ */
+static void free_rb_tree_fname(struct rb_root *root)
+{
+	struct fname *fname, *next;
+
+	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
+		while (fname) {
+			struct fname *old = fname;
+			fname = fname->next;
+			kfree(old);
+		}
+
+	*root = RB_ROOT;
+}
+
+
+static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
+							   loff_t pos)
+{
+	struct dir_private_info *p;
+
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	if (!p)
+		return NULL;
+	p->curr_hash = pos2maj_hash(filp, pos);
+	p->curr_minor_hash = pos2min_hash(filp, pos);
+	return p;
+}
+
+void ext4_htree_free_dir_info(struct dir_private_info *p)
+{
+	free_rb_tree_fname(&p->root);
+	kfree(p);
+}
+
+/*
+ * Given a directory entry, enter it into the fname rb tree.
+ *
+ * When filename encryption is enabled, the dirent will hold the
+ * encrypted filename, while the htree will hold decrypted filename.
+ * The decrypted filename is passed in via ent_name.  parameter.
+ */
+int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+			     __u32 minor_hash,
+			    struct ext4_dir_entry_2 *dirent,
+			    struct ext4_str *ent_name)
+{
+	struct rb_node **p, *parent = NULL;
+	struct fname *fname, *new_fn;
+	struct dir_private_info *info;
+	int len;
+
+	info = dir_file->private_data;
+	p = &info->root.rb_node;
+
+	/* Create and allocate the fname structure */
+	len = sizeof(struct fname) + ent_name->len + 1;
+	new_fn = kzalloc(len, GFP_KERNEL);
+	if (!new_fn)
+		return -ENOMEM;
+	new_fn->hash = hash;
+	new_fn->minor_hash = minor_hash;
+	new_fn->inode = le32_to_cpu(dirent->inode);
+	new_fn->name_len = ent_name->len;
+	new_fn->file_type = dirent->file_type;
+	memcpy(new_fn->name, ent_name->name, ent_name->len);
+	new_fn->name[ent_name->len] = 0;
+
+	while (*p) {
+		parent = *p;
+		fname = rb_entry(parent, struct fname, rb_hash);
+
+		/*
+		 * If the hash and minor hash match up, then we put
+		 * them on a linked list.  This rarely happens...
+		 */
+		if ((new_fn->hash == fname->hash) &&
+		    (new_fn->minor_hash == fname->minor_hash)) {
+			new_fn->next = fname->next;
+			fname->next = new_fn;
+			return 0;
+		}
+
+		if (new_fn->hash < fname->hash)
+			p = &(*p)->rb_left;
+		else if (new_fn->hash > fname->hash)
+			p = &(*p)->rb_right;
+		else if (new_fn->minor_hash < fname->minor_hash)
+			p = &(*p)->rb_left;
+		else /* if (new_fn->minor_hash > fname->minor_hash) */
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&new_fn->rb_hash, parent, p);
+	rb_insert_color(&new_fn->rb_hash, &info->root);
+	return 0;
+}
+
+
+
+/*
+ * This is a helper function for ext4_dx_readdir.  It calls filldir
+ * for all entres on the fname linked list.  (Normally there is only
+ * one entry on the linked list, unless there are 62 bit hash collisions.)
+ */
+static int call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
+{
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+
+	if (!fname) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+			 "called with null fname?!?", __func__, __LINE__,
+			 inode->i_ino, current->comm);
+		return 0;
+	}
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
+	while (fname) {
+		if (!dir_emit(ctx, fname->name,
+				fname->name_len,
+				fname->inode,
+				get_dtype(sb, fname->file_type))) {
+			info->extra_fname = fname;
+			return 1;
+		}
+		fname = fname->next;
+	}
+	return 0;
+}
+
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct fname *fname;
+	int	ret;
+
+	if (!info) {
+		info = ext4_htree_create_dir_info(file, ctx->pos);
+		if (!info)
+			return -ENOMEM;
+		file->private_data = info;
+	}
+
+	if (ctx->pos == ext4_get_htree_eof(file))
+		return 0;	/* EOF */
+
+	/* Some one has messed with f_pos; reset the world */
+	if (info->last_pos != ctx->pos) {
+		free_rb_tree_fname(&info->root);
+		info->curr_node = NULL;
+		info->extra_fname = NULL;
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
+	}
+
+	/*
+	 * If there are any leftover names on the hash collision
+	 * chain, return them first.
+	 */
+	if (info->extra_fname) {
+		if (call_filldir(file, ctx, info->extra_fname))
+			goto finished;
+		info->extra_fname = NULL;
+		goto next_node;
+	} else if (!info->curr_node)
+		info->curr_node = rb_first(&info->root);
+
+	while (1) {
+		/*
+		 * Fill the rbtree if we have no more entries,
+		 * or the inode has changed since we last read in the
+		 * cached entries.
+		 */
+		if ((!info->curr_node) ||
+		    (file->f_version != inode->i_version)) {
+			info->curr_node = NULL;
+			free_rb_tree_fname(&info->root);
+			file->f_version = inode->i_version;
+			ret = ext4_htree_fill_tree(file, info->curr_hash,
+						   info->curr_minor_hash,
+						   &info->next_hash);
+			if (ret < 0)
+				return ret;
+			if (ret == 0) {
+				ctx->pos = ext4_get_htree_eof(file);
+				break;
+			}
+			info->curr_node = rb_first(&info->root);
+		}
+
+		fname = rb_entry(info->curr_node, struct fname, rb_hash);
+		info->curr_hash = fname->hash;
+		info->curr_minor_hash = fname->minor_hash;
+		if (call_filldir(file, ctx, fname))
+			break;
+	next_node:
+		info->curr_node = rb_next(info->curr_node);
+		if (info->curr_node) {
+			fname = rb_entry(info->curr_node, struct fname,
+					 rb_hash);
+			info->curr_hash = fname->hash;
+			info->curr_minor_hash = fname->minor_hash;
+		} else {
+			if (info->next_hash == ~0) {
+				ctx->pos = ext4_get_htree_eof(file);
+				break;
+			}
+			info->curr_hash = info->next_hash;
+			info->curr_minor_hash = 0;
+		}
+	}
+finished:
+	info->last_pos = ctx->pos;
+	return 0;
+}
+
+static int ext4_release_dir(struct inode *inode, struct file *filp)
+{
+	if (filp->private_data)
+		ext4_htree_free_dir_info(filp->private_data);
+
+	return 0;
+}
+
+int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
+		      int buf_size)
+{
+	struct ext4_dir_entry_2 *de;
+	int nlen, rlen;
+	unsigned int offset = 0;
+	char *top;
+
+	de = (struct ext4_dir_entry_2 *)buf;
+	top = buf + buf_size;
+	while ((char *) de < top) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+					 buf, buf_size, offset))
+			return -EIO;
+		nlen = EXT4_DIR_REC_LEN(de->name_len);
+		rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+		de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+		offset += rlen;
+	}
+	if ((char *) de > top)
+		return -EIO;
+
+	return 0;
+}
+
+const struct file_operations ext4_dir_operations = {
+	.llseek		= ext4_dir_llseek,
+	.read		= generic_read_dir,
+	.iterate	= ext4_readdir,
+	.unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext4_compat_ioctl,
+#endif
+	.fsync		= ext4_sync_file,
+	.release	= ext4_release_dir,
+};
diff --git a/kernel/fs/ext4/ext4.h b/kernel/fs/ext4/ext4.h
new file mode 100644
index 000000000..9a83f149a
--- /dev/null
+++ b/kernel/fs/ext4/ext4.h
@@ -0,0 +1,2998 @@
+/*
+ *  ext4.h
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/include/linux/minix_fs.h
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#ifndef _EXT4_H
+#define _EXT4_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/magic.h>
+#include <linux/jbd2.h>
+#include <linux/quota.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
+#include <crypto/hash.h>
+#include <linux/falloc.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
+
+/*
+ * The fourth extended filesystem constants/structures
+ */
+
+/*
+ * Define EXT4FS_DEBUG to produce debug messages
+ */
+#undef EXT4FS_DEBUG
+
+/*
+ * Debug code
+ */
+#ifdef EXT4FS_DEBUG
+#define ext4_debug(f, a...)						\
+	do {								\
+		printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
+			__FILE__, __LINE__, __func__);			\
+		printk(KERN_DEBUG f, ## a);				\
+	} while (0)
+#else
+#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+	ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
+
+#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)			\
+	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
+
+#define EXT4_ERROR_FILE(file, block, fmt, a...)				\
+	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
+
+/* data type for block offset of block group */
+typedef int ext4_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long long ext4_fsblk_t;
+
+/* data type for file logical block number */
+typedef __u32 ext4_lblk_t;
+
+/* data type for block group number */
+typedef unsigned int ext4_group_t;
+
+/*
+ * Flags used in mballoc's allocation_context flags field.
+ *
+ * Also used to show what's going on for debugging purposes when the
+ * flag field is exported via the traceport interface
+ */
+
+/* prefer goal again. length */
+#define EXT4_MB_HINT_MERGE		0x0001
+/* blocks already reserved */
+#define EXT4_MB_HINT_RESERVED		0x0002
+/* metadata is being allocated */
+#define EXT4_MB_HINT_METADATA		0x0004
+/* first blocks in the file */
+#define EXT4_MB_HINT_FIRST		0x0008
+/* search for the best chunk */
+#define EXT4_MB_HINT_BEST		0x0010
+/* data is being allocated */
+#define EXT4_MB_HINT_DATA		0x0020
+/* don't preallocate (for tails) */
+#define EXT4_MB_HINT_NOPREALLOC		0x0040
+/* allocate for locality group */
+#define EXT4_MB_HINT_GROUP_ALLOC	0x0080
+/* allocate goal blocks or none */
+#define EXT4_MB_HINT_GOAL_ONLY		0x0100
+/* goal is meaningful */
+#define EXT4_MB_HINT_TRY_GOAL		0x0200
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED	0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC		0x0800
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED		0x2000
+
+struct ext4_allocation_request {
+	/* target inode for block we're allocating */
+	struct inode *inode;
+	/* how many blocks we want to allocate */
+	unsigned int len;
+	/* logical block in target inode */
+	ext4_lblk_t logical;
+	/* the closest logical allocated block to the left */
+	ext4_lblk_t lleft;
+	/* the closest logical allocated block to the right */
+	ext4_lblk_t lright;
+	/* phys. target (a hint) */
+	ext4_fsblk_t goal;
+	/* phys. block for the closest logical allocated block to the left */
+	ext4_fsblk_t pleft;
+	/* phys. block for the closest logical allocated block to the right */
+	ext4_fsblk_t pright;
+	/* flags. see above EXT4_MB_HINT_* */
+	unsigned int flags;
+};
+
+/*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks().  It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW		(1 << BH_New)
+#define EXT4_MAP_MAPPED		(1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY	(1 << BH_Boundary)
+#define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+
+struct ext4_map_blocks {
+	ext4_fsblk_t m_pblk;
+	ext4_lblk_t m_lblk;
+	unsigned int m_len;
+	unsigned int m_flags;
+};
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define	EXT4_IO_END_UNWRITTEN	0x0001
+
+/*
+ * For converting unwritten extents on a work queue. 'handle' is used for
+ * buffered writeback.
+ */
+typedef struct ext4_io_end {
+	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
+	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
+	unsigned int		flag;		/* unwritten or not */
+	loff_t			offset;		/* offset in the file */
+	ssize_t			size;		/* size of the extent */
+	atomic_t		count;		/* reference counter */
+} ext4_io_end_t;
+
+struct ext4_io_submit {
+	int			io_op;
+	struct bio		*io_bio;
+	ext4_io_end_t		*io_end;
+	sector_t		io_next_block;
+};
+
+/*
+ * Special inodes numbers
+ */
+#define	EXT4_BAD_INO		 1	/* Bad blocks inode */
+#define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
+#define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
+#define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
+#define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
+#define EXT4_JOURNAL_INO	 8	/* Journal inode */
+
+/* First non-reserved inode for old ext4 filesystems */
+#define EXT4_GOOD_OLD_FIRST_INO	11
+
+/*
+ * Maximal count of links to a file
+ */
+#define EXT4_LINK_MAX		65000
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT4_MIN_BLOCK_SIZE		1024
+#define	EXT4_MAX_BLOCK_SIZE		65536
+#define EXT4_MIN_BLOCK_LOG_SIZE		10
+#define EXT4_MAX_BLOCK_LOG_SIZE		16
+#ifdef __KERNEL__
+# define EXT4_BLOCK_SIZE(s)		((s)->s_blocksize)
+#else
+# define EXT4_BLOCK_SIZE(s)		(EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+#endif
+#define	EXT4_ADDR_PER_BLOCK(s)		(EXT4_BLOCK_SIZE(s) / sizeof(__u32))
+#define EXT4_CLUSTER_SIZE(s)		(EXT4_BLOCK_SIZE(s) << \
+					 EXT4_SB(s)->s_cluster_bits)
+#ifdef __KERNEL__
+# define EXT4_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
+# define EXT4_CLUSTER_BITS(s)		(EXT4_SB(s)->s_cluster_bits)
+#else
+# define EXT4_BLOCK_SIZE_BITS(s)	((s)->s_log_block_size + 10)
+#endif
+#ifdef __KERNEL__
+#define	EXT4_ADDR_PER_BLOCK_BITS(s)	(EXT4_SB(s)->s_addr_per_block_bits)
+#define EXT4_INODE_SIZE(s)		(EXT4_SB(s)->s_inode_size)
+#define EXT4_FIRST_INO(s)		(EXT4_SB(s)->s_first_ino)
+#else
+#define EXT4_INODE_SIZE(s)	(((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
+				 EXT4_GOOD_OLD_INODE_SIZE : \
+				 (s)->s_inode_size)
+#define EXT4_FIRST_INO(s)	(((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
+				 EXT4_GOOD_OLD_FIRST_INO : \
+				 (s)->s_first_ino)
+#endif
+#define EXT4_BLOCK_ALIGN(size, blkbits)		ALIGN((size), (1 << (blkbits)))
+
+/* Translate a block number to a cluster number */
+#define EXT4_B2C(sbi, blk)	((blk) >> (sbi)->s_cluster_bits)
+/* Translate a cluster number to a block number */
+#define EXT4_C2B(sbi, cluster)	((cluster) << (sbi)->s_cluster_bits)
+/* Translate # of blks to # of clusters */
+#define EXT4_NUM_B2C(sbi, blks)	(((blks) + (sbi)->s_cluster_ratio - 1) >> \
+				 (sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) &				\
+				  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) &				\
+				  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) &				\
+				 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) &				\
+				 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext4_group_desc
+{
+	__le32	bg_block_bitmap_lo;	/* Blocks bitmap block */
+	__le32	bg_inode_bitmap_lo;	/* Inodes bitmap block */
+	__le32	bg_inode_table_lo;	/* Inodes table block */
+	__le16	bg_free_blocks_count_lo;/* Free blocks count */
+	__le16	bg_free_inodes_count_lo;/* Free inodes count */
+	__le16	bg_used_dirs_count_lo;	/* Directories count */
+	__le16	bg_flags;		/* EXT4_BG_flags (INODE_UNINIT, etc) */
+	__le32  bg_exclude_bitmap_lo;   /* Exclude bitmap for snapshots */
+	__le16  bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
+	__le16  bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
+	__le16  bg_itable_unused_lo;	/* Unused inodes count */
+	__le16  bg_checksum;		/* crc16(sb_uuid+group+desc) */
+	__le32	bg_block_bitmap_hi;	/* Blocks bitmap block MSB */
+	__le32	bg_inode_bitmap_hi;	/* Inodes bitmap block MSB */
+	__le32	bg_inode_table_hi;	/* Inodes table block MSB */
+	__le16	bg_free_blocks_count_hi;/* Free blocks count MSB */
+	__le16	bg_free_inodes_count_hi;/* Free inodes count MSB */
+	__le16	bg_used_dirs_count_hi;	/* Directories count MSB */
+	__le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
+	__le32  bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
+	__le16  bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
+	__le16  bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
+	__u32   bg_reserved;
+};
+
+#define EXT4_BG_INODE_BITMAP_CSUM_HI_END	\
+	(offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
+	 sizeof(__le16))
+#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END	\
+	(offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
+	 sizeof(__le16))
+
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+	atomic64_t	free_clusters;
+	atomic_t	free_inodes;
+	atomic_t	used_dirs;
+};
+
+#define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
+#define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
+#define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT4_MIN_DESC_SIZE		32
+#define EXT4_MIN_DESC_SIZE_64BIT	64
+#define	EXT4_MAX_DESC_SIZE		EXT4_MIN_BLOCK_SIZE
+#define EXT4_DESC_SIZE(s)		(EXT4_SB(s)->s_desc_size)
+#ifdef __KERNEL__
+# define EXT4_BLOCKS_PER_GROUP(s)	(EXT4_SB(s)->s_blocks_per_group)
+# define EXT4_CLUSTERS_PER_GROUP(s)	(EXT4_SB(s)->s_clusters_per_group)
+# define EXT4_DESC_PER_BLOCK(s)		(EXT4_SB(s)->s_desc_per_block)
+# define EXT4_INODES_PER_GROUP(s)	(EXT4_SB(s)->s_inodes_per_group)
+# define EXT4_DESC_PER_BLOCK_BITS(s)	(EXT4_SB(s)->s_desc_per_block_bits)
+#else
+# define EXT4_BLOCKS_PER_GROUP(s)	((s)->s_blocks_per_group)
+# define EXT4_DESC_PER_BLOCK(s)		(EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
+# define EXT4_INODES_PER_GROUP(s)	((s)->s_inodes_per_group)
+#endif
+
+/*
+ * Constants relative to the data blocks
+ */
+#define	EXT4_NDIR_BLOCKS		12
+#define	EXT4_IND_BLOCK			EXT4_NDIR_BLOCKS
+#define	EXT4_DIND_BLOCK			(EXT4_IND_BLOCK + 1)
+#define	EXT4_TIND_BLOCK			(EXT4_DIND_BLOCK + 1)
+#define	EXT4_N_BLOCKS			(EXT4_TIND_BLOCK + 1)
+
+/*
+ * Inode flags
+ */
+#define	EXT4_SECRM_FL			0x00000001 /* Secure deletion */
+#define	EXT4_UNRM_FL			0x00000002 /* Undelete */
+#define	EXT4_COMPR_FL			0x00000004 /* Compress file */
+#define EXT4_SYNC_FL			0x00000008 /* Synchronous updates */
+#define EXT4_IMMUTABLE_FL		0x00000010 /* Immutable file */
+#define EXT4_APPEND_FL			0x00000020 /* writes to file may only append */
+#define EXT4_NODUMP_FL			0x00000040 /* do not dump file */
+#define EXT4_NOATIME_FL			0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define EXT4_DIRTY_FL			0x00000100
+#define EXT4_COMPRBLK_FL		0x00000200 /* One or more compressed clusters */
+#define EXT4_NOCOMPR_FL			0x00000400 /* Don't compress */
+	/* nb: was previously EXT2_ECOMPR_FL */
+#define EXT4_ENCRYPT_FL			0x00000800 /* encrypted file */
+/* End compression flags --- maybe not all used */
+#define EXT4_INDEX_FL			0x00001000 /* hash-indexed directory */
+#define EXT4_IMAGIC_FL			0x00002000 /* AFS directory */
+#define EXT4_JOURNAL_DATA_FL		0x00004000 /* file data should be journaled */
+#define EXT4_NOTAIL_FL			0x00008000 /* file tail should not be merged */
+#define EXT4_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
+#define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
+#define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
+#define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
+#define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
+
+#define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE		0x004380FF /* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+			   EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & EXT4_REG_FLMASK;
+	else
+		return flags & EXT4_OTHER_FLMASK;
+}
+
+/*
+ * Inode flags used for atomic set/get
+ */
+enum {
+	EXT4_INODE_SECRM	= 0,	/* Secure deletion */
+	EXT4_INODE_UNRM		= 1,	/* Undelete */
+	EXT4_INODE_COMPR	= 2,	/* Compress file */
+	EXT4_INODE_SYNC		= 3,	/* Synchronous updates */
+	EXT4_INODE_IMMUTABLE	= 4,	/* Immutable file */
+	EXT4_INODE_APPEND	= 5,	/* writes to file may only append */
+	EXT4_INODE_NODUMP	= 6,	/* do not dump file */
+	EXT4_INODE_NOATIME	= 7,	/* do not update atime */
+/* Reserved for compression usage... */
+	EXT4_INODE_DIRTY	= 8,
+	EXT4_INODE_COMPRBLK	= 9,	/* One or more compressed clusters */
+	EXT4_INODE_NOCOMPR	= 10,	/* Don't compress */
+	EXT4_INODE_ENCRYPT	= 11,	/* Encrypted file */
+/* End compression flags --- maybe not all used */
+	EXT4_INODE_INDEX	= 12,	/* hash-indexed directory */
+	EXT4_INODE_IMAGIC	= 13,	/* AFS directory */
+	EXT4_INODE_JOURNAL_DATA	= 14,	/* file data should be journaled */
+	EXT4_INODE_NOTAIL	= 15,	/* file tail should not be merged */
+	EXT4_INODE_DIRSYNC	= 16,	/* dirsync behaviour (directories only) */
+	EXT4_INODE_TOPDIR	= 17,	/* Top of directory hierarchies*/
+	EXT4_INODE_HUGE_FILE	= 18,	/* Set to each huge file */
+	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
+	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
+	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
+	EXT4_INODE_INLINE_DATA	= 28,	/* Data in inode. */
+	EXT4_INODE_RESERVED	= 31,	/* reserved for ext4 lib */
+};
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
+ * any extra space in the compiled kernel image, otherwise, the build will fail.
+ * It's important that these values are the same, since we are using
+ * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
+ * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
+ * values found in ext2, ext3 and ext4 filesystems, and of course the values
+ * defined in e2fsprogs.
+ *
+ * It's not paranoia if the Murphy's Law really *is* out to get you.  :-)
+ */
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
+
+static inline void ext4_check_flag_values(void)
+{
+	CHECK_FLAG_VALUE(SECRM);
+	CHECK_FLAG_VALUE(UNRM);
+	CHECK_FLAG_VALUE(COMPR);
+	CHECK_FLAG_VALUE(SYNC);
+	CHECK_FLAG_VALUE(IMMUTABLE);
+	CHECK_FLAG_VALUE(APPEND);
+	CHECK_FLAG_VALUE(NODUMP);
+	CHECK_FLAG_VALUE(NOATIME);
+	CHECK_FLAG_VALUE(DIRTY);
+	CHECK_FLAG_VALUE(COMPRBLK);
+	CHECK_FLAG_VALUE(NOCOMPR);
+	CHECK_FLAG_VALUE(ENCRYPT);
+	CHECK_FLAG_VALUE(INDEX);
+	CHECK_FLAG_VALUE(IMAGIC);
+	CHECK_FLAG_VALUE(JOURNAL_DATA);
+	CHECK_FLAG_VALUE(NOTAIL);
+	CHECK_FLAG_VALUE(DIRSYNC);
+	CHECK_FLAG_VALUE(TOPDIR);
+	CHECK_FLAG_VALUE(HUGE_FILE);
+	CHECK_FLAG_VALUE(EXTENTS);
+	CHECK_FLAG_VALUE(EA_INODE);
+	CHECK_FLAG_VALUE(EOFBLOCKS);
+	CHECK_FLAG_VALUE(INLINE_DATA);
+	CHECK_FLAG_VALUE(RESERVED);
+}
+
+/* Used to pass group descriptor data when online resize is done */
+struct ext4_new_group_input {
+	__u32 group;		/* Group number for this data */
+	__u64 block_bitmap;	/* Absolute block number of block bitmap */
+	__u64 inode_bitmap;	/* Absolute block number of inode bitmap */
+	__u64 inode_table;	/* Absolute block number of inode table start */
+	__u32 blocks_count;	/* Total number of blocks in this group */
+	__u16 reserved_blocks;	/* Number of reserved blocks in this group */
+	__u16 unused;
+};
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+	u32 group;
+	compat_u64 block_bitmap;
+	compat_u64 inode_bitmap;
+	compat_u64 inode_table;
+	u32 blocks_count;
+	u16 reserved_blocks;
+	u16 unused;
+};
+#endif
+
+/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
+struct ext4_new_group_data {
+	__u32 group;
+	__u64 block_bitmap;
+	__u64 inode_bitmap;
+	__u64 inode_table;
+	__u32 blocks_count;
+	__u16 reserved_blocks;
+	__u16 unused;
+	__u32 free_blocks_count;
+};
+
+/* Indexes used to index group tables in ext4_new_group_data */
+enum {
+	BLOCK_BITMAP = 0,	/* block bitmap */
+	INODE_BITMAP,		/* inode bitmap */
+	INODE_TABLE,		/* inode tables */
+	GROUP_TABLE_COUNT,
+};
+
+/*
+ * Flags used by ext4_map_blocks()
+ */
+	/* Allocate any needed blocks and/or convert an unwritten
+	   extent to be an initialized ext4 */
+#define EXT4_GET_BLOCKS_CREATE			0x0001
+	/* Request the creation of an unwritten extent */
+#define EXT4_GET_BLOCKS_UNWRIT_EXT		0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT	(EXT4_GET_BLOCKS_UNWRIT_EXT|\
+						 EXT4_GET_BLOCKS_CREATE)
+	/* Caller is from the delayed allocation writeout path
+	 * finally doing the actual allocation of delayed blocks */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
+	/* caller is from the direct IO path, request to creation of an
+	unwritten extents if not allocated, split the unwritten
+	extent if blocks has been preallocated already*/
+#define EXT4_GET_BLOCKS_PRE_IO			0x0008
+#define EXT4_GET_BLOCKS_CONVERT			0x0010
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
+					 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+	/* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
+					 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+	/* Eventual metadata allocation (due to growing extent tree)
+	 * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
+	/* Request will not result in inode size update (user for fallocate) */
+#define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
+	/* Do not take i_data_sem locking in ext4_map_blocks */
+#define EXT4_GET_BLOCKS_NO_LOCK			0x0100
+	/* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0200
+
+/*
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*.  They are used by ext4_find_extent(),
+ * read_extent_tree_block(), ext4_split_extent_at(),
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
+ */
+#define EXT4_EX_NOCACHE				0x40000000
+#define EXT4_EX_FORCE_CACHE			0x20000000
+
+/*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA	0x0001
+#define EXT4_FREE_BLOCKS_FORGET		0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED	0x0004
+#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE	0x0008
+#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER	0x0010
+#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
+
+/* Encryption algorithms */
+#define EXT4_ENCRYPTION_MODE_INVALID		0
+#define EXT4_ENCRYPTION_MODE_AES_256_XTS	1
+#define EXT4_ENCRYPTION_MODE_AES_256_GCM	2
+#define EXT4_ENCRYPTION_MODE_AES_256_CBC	3
+#define EXT4_ENCRYPTION_MODE_AES_256_CTS	4
+
+#include "ext4_crypto.h"
+
+/*
+ * ioctl commands
+ */
+#define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
+#define	EXT4_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define	EXT4_IOC_GETVERSION		_IOR('f', 3, long)
+#define	EXT4_IOC_SETVERSION		_IOW('f', 4, long)
+#define	EXT4_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
+#define	EXT4_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
+#define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
+#define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
+#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
+#define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
+#define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
+ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
+#define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)
+#define EXT4_IOC_SET_ENCRYPTION_POLICY	_IOR('f', 19, struct ext4_encryption_policy)
+#define EXT4_IOC_GET_ENCRYPTION_PWSALT	_IOW('f', 20, __u8[16])
+#define EXT4_IOC_GET_ENCRYPTION_POLICY	_IOW('f', 21, struct ext4_encryption_policy)
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT4_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
+#define EXT4_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
+#define EXT4_IOC32_GETVERSION		_IOR('f', 3, int)
+#define EXT4_IOC32_SETVERSION		_IOW('f', 4, int)
+#define EXT4_IOC32_GETRSVSZ		_IOR('f', 5, int)
+#define EXT4_IOC32_SETRSVSZ		_IOW('f', 6, int)
+#define EXT4_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD		_IOW('f', 8, struct compat_ext4_new_group_input)
+#define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
+#define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
+#endif
+
+/* Max physical block we can address w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext4_inode {
+	__le16	i_mode;		/* File mode */
+	__le16	i_uid;		/* Low 16 bits of Owner Uid */
+	__le32	i_size_lo;	/* Size in bytes */
+	__le32	i_atime;	/* Access time */
+	__le32	i_ctime;	/* Inode Change time */
+	__le32	i_mtime;	/* Modification time */
+	__le32	i_dtime;	/* Deletion Time */
+	__le16	i_gid;		/* Low 16 bits of Group Id */
+	__le16	i_links_count;	/* Links count */
+	__le32	i_blocks_lo;	/* Blocks count */
+	__le32	i_flags;	/* File flags */
+	union {
+		struct {
+			__le32  l_i_version;
+		} linux1;
+		struct {
+			__u32  h_i_translator;
+		} hurd1;
+		struct {
+			__u32  m_i_reserved1;
+		} masix1;
+	} osd1;				/* OS dependent 1 */
+	__le32	i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
+	__le32	i_generation;	/* File version (for NFS) */
+	__le32	i_file_acl_lo;	/* File ACL */
+	__le32	i_size_high;
+	__le32	i_obso_faddr;	/* Obsoleted fragment address */
+	union {
+		struct {
+			__le16	l_i_blocks_high; /* were l_i_reserved1 */
+			__le16	l_i_file_acl_high;
+			__le16	l_i_uid_high;	/* these 2 fields */
+			__le16	l_i_gid_high;	/* were reserved2[0] */
+			__le16	l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
+			__le16	l_i_reserved;
+		} linux2;
+		struct {
+			__le16	h_i_reserved1;	/* Obsoleted fragment number/size which are removed in ext4 */
+			__u16	h_i_mode_high;
+			__u16	h_i_uid_high;
+			__u16	h_i_gid_high;
+			__u32	h_i_author;
+		} hurd2;
+		struct {
+			__le16	h_i_reserved1;	/* Obsoleted fragment number/size which are removed in ext4 */
+			__le16	m_i_file_acl_high;
+			__u32	m_i_reserved2[2];
+		} masix2;
+	} osd2;				/* OS dependent 2 */
+	__le16	i_extra_isize;
+	__le16	i_checksum_hi;	/* crc32c(uuid+inum+inode) BE */
+	__le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
+	__le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
+	__le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
+	__le32  i_crtime;       /* File Creation time */
+	__le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
+	__le32  i_version_hi;	/* high 32 bits for 64-bit version */
+};
+
+struct move_extent {
+	__u32 reserved;		/* should be zero */
+	__u32 donor_fd;		/* donor file descriptor */
+	__u64 orig_start;	/* logical start offset in block for orig */
+	__u64 donor_start;	/* logical start offset in block for donor */
+	__u64 len;		/* block length to be moved */
+	__u64 moved_len;	/* moved block length */
+};
+
+#define EXT4_EPOCH_BITS 2
+#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
+#define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
+
+/*
+ * Extended fields will fit into an inode if the filesystem was formatted
+ * with large inodes (-I 256 or larger) and there are not currently any EAs
+ * consuming all of the available space. For new inodes we always reserve
+ * enough space for the kernel's known extended fields, but for inodes
+ * created with an old kernel this might not have been the case. None of
+ * the extended inode fields is critical for correct filesystem operation.
+ * This macro checks if a certain field fits in the inode. Note that
+ * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
+ */
+#define EXT4_FITS_IN_INODE(ext4_inode, einode, field)	\
+	((offsetof(typeof(*ext4_inode), field) +	\
+	  sizeof((ext4_inode)->field))			\
+	<= (EXT4_GOOD_OLD_INODE_SIZE +			\
+	    (einode)->i_extra_isize))			\
+
+static inline __le32 ext4_encode_extra_time(struct timespec *time)
+{
+       return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
+			   (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+                          ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
+}
+
+static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
+{
+       if (sizeof(time->tv_sec) > 4)
+	       time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
+			       << 32;
+       time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+}
+
+#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)			       \
+do {									       \
+	(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec);	       \
+	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
+		(raw_inode)->xtime ## _extra =				       \
+				ext4_encode_extra_time(&(inode)->xtime);       \
+} while (0)
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)			       \
+do {									       \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))		       \
+		(raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec);      \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
+		(raw_inode)->xtime ## _extra =				       \
+				ext4_encode_extra_time(&(einode)->xtime);      \
+} while (0)
+
+#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)			       \
+do {									       \
+	(inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime);       \
+	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
+		ext4_decode_extra_time(&(inode)->xtime,			       \
+				       raw_inode->xtime ## _extra);	       \
+	else								       \
+		(inode)->xtime.tv_nsec = 0;				       \
+} while (0)
+
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)			       \
+do {									       \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))		       \
+		(einode)->xtime.tv_sec = 				       \
+			(signed)le32_to_cpu((raw_inode)->xtime);	       \
+	else								       \
+		(einode)->xtime.tv_sec = 0;				       \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
+		ext4_decode_extra_time(&(einode)->xtime,		       \
+				       raw_inode->xtime ## _extra);	       \
+	else								       \
+		(einode)->xtime.tv_nsec = 0;				       \
+} while (0)
+
+#define i_disk_version osd1.linux1.l_i_version
+
+#if defined(__KERNEL__) || defined(__linux__)
+#define i_reserved1	osd1.linux1.l_i_reserved1
+#define i_file_acl_high	osd2.linux2.l_i_file_acl_high
+#define i_blocks_high	osd2.linux2.l_i_blocks_high
+#define i_uid_low	i_uid
+#define i_gid_low	i_gid
+#define i_uid_high	osd2.linux2.l_i_uid_high
+#define i_gid_high	osd2.linux2.l_i_gid_high
+#define i_checksum_lo	osd2.linux2.l_i_checksum_lo
+
+#elif defined(__GNU__)
+
+#define i_translator	osd1.hurd1.h_i_translator
+#define i_uid_high	osd2.hurd2.h_i_uid_high
+#define i_gid_high	osd2.hurd2.h_i_gid_high
+#define i_author	osd2.hurd2.h_i_author
+
+#elif defined(__masix__)
+
+#define i_reserved1	osd1.masix1.m_i_reserved1
+#define i_file_acl_high	osd2.masix2.m_i_file_acl_high
+#define i_reserved2	osd2.masix2.m_i_reserved2
+
+#endif /* defined(__KERNEL__) || defined(__linux__) */
+
+#include "extents_status.h"
+
+/*
+ * fourth extended file system inode data in memory
+ */
+struct ext4_inode_info {
+	__le32	i_data[15];	/* unconverted */
+	__u32	i_dtime;
+	ext4_fsblk_t	i_file_acl;
+
+	/*
+	 * i_block_group is the number of the block group which contains
+	 * this file's inode.  Constant across the lifetime of the inode,
+	 * it is ued for making block allocation decisions - we try to
+	 * place a file's data blocks near its inode block, and new inodes
+	 * near to their parent directory's inode.
+	 */
+	ext4_group_t	i_block_group;
+	ext4_lblk_t	i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
+	unsigned long	i_state_flags;		/* Dynamic state flags */
+#endif
+	unsigned long	i_flags;
+
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_mutex even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+
+	struct list_head i_orphan;	/* unlinked but open inodes */
+
+	/*
+	 * i_disksize keeps track of what the inode size is ON DISK, not
+	 * in memory.  During truncate, i_size is set to the new size by
+	 * the VFS prior to calling ext4_truncate(), but the filesystem won't
+	 * set i_disksize to 0 until the truncate is actually under way.
+	 *
+	 * The intent is that i_disksize always represents the blocks which
+	 * are used by this file.  This allows recovery to restart truncate
+	 * on orphans if we crash during truncate.  We actually write i_disksize
+	 * into the on-disk inode when writing inodes out, instead of i_size.
+	 *
+	 * The only time when i_disksize and i_size may be different is when
+	 * a truncate is in progress.  The only things which change i_disksize
+	 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
+	 */
+	loff_t	i_disksize;
+
+	/*
+	 * i_data_sem is for serialising ext4_truncate() against
+	 * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
+	 * data tree are chopped off during truncate. We can't do that in
+	 * ext4 because whenever we perform intermediate commits during
+	 * truncate, the inode and all the metadata blocks *must* be in a
+	 * consistent state which allows truncation of the orphans to restart
+	 * during recovery.  Hence we must fix the get_block-vs-truncate race
+	 * by other means, so we have i_data_sem.
+	 */
+	struct rw_semaphore i_data_sem;
+	struct inode vfs_inode;
+	struct jbd2_inode *jinode;
+
+	spinlock_t i_raw_lock;	/* protects updates to the raw inode */
+
+	/*
+	 * File creation time. Its function is same as that of
+	 * struct timespec i_{a,c,m}time in the generic inode.
+	 */
+	struct timespec i_crtime;
+
+	/* mballoc */
+	struct list_head i_prealloc_list;
+	spinlock_t i_prealloc_lock;
+
+	/* extents status tree */
+	struct ext4_es_tree i_es_tree;
+	rwlock_t i_es_lock;
+	struct list_head i_es_list;
+	unsigned int i_es_all_nr;	/* protected by i_es_lock */
+	unsigned int i_es_shk_nr;	/* protected by i_es_lock */
+	ext4_lblk_t i_es_shrink_lblk;	/* Offset where we start searching for
+					   extents to shrink. Protected by
+					   i_es_lock  */
+
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
+	/* allocation reservation info for delalloc */
+	/* In case of bigalloc, these refer to clusters rather than blocks */
+	unsigned int i_reserved_data_blocks;
+	unsigned int i_reserved_meta_blocks;
+	unsigned int i_allocated_meta_blocks;
+	ext4_lblk_t i_da_metadata_calc_last_lblock;
+	int i_da_metadata_calc_len;
+
+	/* on-disk additional length */
+	__u16 i_extra_isize;
+	char i_crypt_policy_flags;
+
+	/* Indicate the inline data space. */
+	u16 i_inline_off;
+	u16 i_inline_size;
+
+#ifdef CONFIG_QUOTA
+	/* quota space reservation, managed internally by quota code */
+	qsize_t i_reserved_quota;
+#endif
+
+	/* Lock protecting lists below */
+	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
+	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+	struct work_struct i_rsv_conversion_work;
+
+	spinlock_t i_block_reservation_lock;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
+
+#ifdef CONFIG_QUOTA
+	struct dquot *i_dquot[MAXQUOTAS];
+#endif
+
+	/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
+	__u32 i_csum_seed;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	/* Encryption params */
+	struct ext4_encryption_key i_encryption_key;
+#endif
+};
+
+/*
+ * File system states
+ */
+#define	EXT4_VALID_FS			0x0001	/* Unmounted cleanly */
+#define	EXT4_ERROR_FS			0x0002	/* Errors detected */
+#define	EXT4_ORPHAN_FS			0x0004	/* Orphans being recovered */
+
+/*
+ * Misc. filesystem flags
+ */
+#define EXT2_FLAGS_SIGNED_HASH		0x0001  /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH	0x0002  /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
+
+/*
+ * Mount flags set via mount options or defaults
+ */
+#define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
+#define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
+#define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
+#define EXT4_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
+#define EXT4_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK		0x00070
+#define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
+#define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
+#ifdef CONFIG_FS_DAX
+#define EXT4_MOUNT_DAX			0x00200	/* Direct Access */
+#else
+#define EXT4_MOUNT_DAX			0
+#endif
+#define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
+#define EXT4_MOUNT_JOURNAL_DATA		0x00400	/* Write data to journal */
+#define EXT4_MOUNT_ORDERED_DATA		0x00800	/* Flush data before commit */
+#define EXT4_MOUNT_WRITEBACK_DATA	0x00C00	/* No data ordering */
+#define EXT4_MOUNT_UPDATE_JOURNAL	0x01000	/* Update the journal format */
+#define EXT4_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
+#define EXT4_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
+#define EXT4_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC	0x10000	/* No auto delalloc mapping */
+#define EXT4_MOUNT_BARRIER		0x20000 /* Use block barriers */
+#define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
+#define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
+#define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
+#define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
+#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
+#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
+#define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
+#define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
+
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
+#define EXT4_MOUNT2_EXPLICIT_DELALLOC	0x00000001 /* User explicitly
+						      specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
+						      size of blocksize * 8
+						      blocks */
+#define EXT4_MOUNT2_HURD_COMPAT		0x00000004 /* Support HURD-castrated
+						      file systems */
+
+#define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
+						~EXT4_MOUNT_##opt
+#define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
+						EXT4_MOUNT_##opt
+#define test_opt(sb, opt)		(EXT4_SB(sb)->s_mount_opt & \
+					 EXT4_MOUNT_##opt)
+
+#define clear_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 &= \
+						~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 |= \
+						EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
+					 EXT4_MOUNT2_##opt)
+
+#define ext4_test_and_set_bit		__test_and_set_bit_le
+#define ext4_set_bit			__set_bit_le
+#define ext4_set_bit_atomic		ext2_set_bit_atomic
+#define ext4_test_and_clear_bit		__test_and_clear_bit_le
+#define ext4_clear_bit			__clear_bit_le
+#define ext4_clear_bit_atomic		ext2_clear_bit_atomic
+#define ext4_test_bit			test_bit_le
+#define ext4_find_next_zero_bit		find_next_zero_bit_le
+#define ext4_find_next_bit		find_next_bit_le
+
+extern void ext4_set_bits(void *bm, int cur, int len);
+
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT4_DFL_MAX_MNT_COUNT		20	/* Allow 20 mounts */
+#define EXT4_DFL_CHECKINTERVAL		0	/* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ */
+#define EXT4_ERRORS_CONTINUE		1	/* Continue execution */
+#define EXT4_ERRORS_RO			2	/* Remount fs read-only */
+#define EXT4_ERRORS_PANIC		3	/* Panic */
+#define EXT4_ERRORS_DEFAULT		EXT4_ERRORS_CONTINUE
+
+/* Metadata checksum algorithm codes */
+#define EXT4_CRC32C_CHKSUM		1
+
+/*
+ * Structure of the super block
+ */
+struct ext4_super_block {
+/*00*/	__le32	s_inodes_count;		/* Inodes count */
+	__le32	s_blocks_count_lo;	/* Blocks count */
+	__le32	s_r_blocks_count_lo;	/* Reserved blocks count */
+	__le32	s_free_blocks_count_lo;	/* Free blocks count */
+/*10*/	__le32	s_free_inodes_count;	/* Free inodes count */
+	__le32	s_first_data_block;	/* First Data Block */
+	__le32	s_log_block_size;	/* Block size */
+	__le32	s_log_cluster_size;	/* Allocation cluster size */
+/*20*/	__le32	s_blocks_per_group;	/* # Blocks per group */
+	__le32	s_clusters_per_group;	/* # Clusters per group */
+	__le32	s_inodes_per_group;	/* # Inodes per group */
+	__le32	s_mtime;		/* Mount time */
+/*30*/	__le32	s_wtime;		/* Write time */
+	__le16	s_mnt_count;		/* Mount count */
+	__le16	s_max_mnt_count;	/* Maximal mount count */
+	__le16	s_magic;		/* Magic signature */
+	__le16	s_state;		/* File system state */
+	__le16	s_errors;		/* Behaviour when detecting errors */
+	__le16	s_minor_rev_level;	/* minor revision level */
+/*40*/	__le32	s_lastcheck;		/* time of last check */
+	__le32	s_checkinterval;	/* max. time between checks */
+	__le32	s_creator_os;		/* OS */
+	__le32	s_rev_level;		/* Revision level */
+/*50*/	__le16	s_def_resuid;		/* Default uid for reserved blocks */
+	__le16	s_def_resgid;		/* Default gid for reserved blocks */
+	/*
+	 * These fields are for EXT4_DYNAMIC_REV superblocks only.
+	 *
+	 * Note: the difference between the compatible feature set and
+	 * the incompatible feature set is that if there is a bit set
+	 * in the incompatible feature set that the kernel doesn't
+	 * know about, it should refuse to mount the filesystem.
+	 *
+	 * e2fsck's requirements are more strict; if it doesn't know
+	 * about a feature in either the compatible or incompatible
+	 * feature set, it must abort and not try to meddle with
+	 * things it doesn't understand...
+	 */
+	__le32	s_first_ino;		/* First non-reserved inode */
+	__le16  s_inode_size;		/* size of inode structure */
+	__le16	s_block_group_nr;	/* block group # of this superblock */
+	__le32	s_feature_compat;	/* compatible feature set */
+/*60*/	__le32	s_feature_incompat;	/* incompatible feature set */
+	__le32	s_feature_ro_compat;	/* readonly-compatible feature set */
+/*68*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
+/*78*/	char	s_volume_name[16];	/* volume name */
+/*88*/	char	s_last_mounted[64];	/* directory where last mounted */
+/*C8*/	__le32	s_algorithm_usage_bitmap; /* For compression */
+	/*
+	 * Performance hints.  Directory preallocation should only
+	 * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+	 */
+	__u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
+	__u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
+	__le16	s_reserved_gdt_blocks;	/* Per group desc for online growth */
+	/*
+	 * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
+	 */
+/*D0*/	__u8	s_journal_uuid[16];	/* uuid of journal superblock */
+/*E0*/	__le32	s_journal_inum;		/* inode number of journal file */
+	__le32	s_journal_dev;		/* device number of journal file */
+	__le32	s_last_orphan;		/* start of list of inodes to delete */
+	__le32	s_hash_seed[4];		/* HTREE hash seed */
+	__u8	s_def_hash_version;	/* Default hash version to use */
+	__u8	s_jnl_backup_type;
+	__le16  s_desc_size;		/* size of group descriptor */
+/*100*/	__le32	s_default_mount_opts;
+	__le32	s_first_meta_bg;	/* First metablock block group */
+	__le32	s_mkfs_time;		/* When the filesystem was created */
+	__le32	s_jnl_blocks[17];	/* Backup of the journal inode */
+	/* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
+/*150*/	__le32	s_blocks_count_hi;	/* Blocks count */
+	__le32	s_r_blocks_count_hi;	/* Reserved blocks count */
+	__le32	s_free_blocks_count_hi;	/* Free blocks count */
+	__le16	s_min_extra_isize;	/* All inodes have at least # bytes */
+	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
+	__le32	s_flags;		/* Miscellaneous flags */
+	__le16  s_raid_stride;		/* RAID stride */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
+	__le64  s_mmp_block;            /* Block for multi-mount protection */
+	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
+	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
+	__u8	s_checksum_type;	/* metadata checksum algorithm used */
+	__u8	s_encryption_level;	/* versioning level for encryption */
+	__u8	s_reserved_pad;		/* Padding to next 32bits */
+	__le64	s_kbytes_written;	/* nr of lifetime kilobytes written */
+	__le32	s_snapshot_inum;	/* Inode number of active snapshot */
+	__le32	s_snapshot_id;		/* sequential ID of active snapshot */
+	__le64	s_snapshot_r_blocks_count; /* reserved blocks for active
+					      snapshot's future use */
+	__le32	s_snapshot_list;	/* inode number of the head of the
+					   on-disk snapshot list */
+#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
+	__le32	s_error_count;		/* number of fs errors */
+	__le32	s_first_error_time;	/* first time an error happened */
+	__le32	s_first_error_ino;	/* inode involved in first error */
+	__le64	s_first_error_block;	/* block involved of first error */
+	__u8	s_first_error_func[32];	/* function where the error happened */
+	__le32	s_first_error_line;	/* line number where error happened */
+	__le32	s_last_error_time;	/* most recent time of an error */
+	__le32	s_last_error_ino;	/* inode involved in last error */
+	__le32	s_last_error_line;	/* line number where error happened */
+	__le64	s_last_error_block;	/* block involved of last error */
+	__u8	s_last_error_func[32];	/* function where the error happened */
+#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
+	__u8	s_mount_opts[64];
+	__le32	s_usr_quota_inum;	/* inode for tracking user quota */
+	__le32	s_grp_quota_inum;	/* inode for tracking group quota */
+	__le32	s_overhead_clusters;	/* overhead blocks/clusters in fs */
+	__le32	s_backup_bgs[2];	/* groups with sparse_super2 SBs */
+	__u8	s_encrypt_algos[4];	/* Encryption algorithms in use  */
+	__u8	s_encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
+	__le32	s_lpf_ino;		/* Location of the lost+found inode */
+	__le32	s_reserved[100];	/* Padding to the end of the block */
+	__le32	s_checksum;		/* crc32c(superblock) */
+};
+
+#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
+
+#ifdef __KERNEL__
+
+/*
+ * run-time mount flags
+ */
+#define EXT4_MF_MNTDIR_SAMPLED		0x0001
+#define EXT4_MF_FS_ABORTED		0x0002	/* Fatal error detected */
+#define EXT4_MF_TEST_DUMMY_ENCRYPTION	0x0004
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
+						EXT4_MF_TEST_DUMMY_ENCRYPTION))
+#else
+#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
+#endif
+
+/* Number of quota types we support */
+#define EXT4_MAXQUOTAS 2
+
+/*
+ * fourth extended-fs super-block data in memory
+ */
+struct ext4_sb_info {
+	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
+	unsigned long s_inodes_per_block;/* Number of inodes per block */
+	unsigned long s_blocks_per_group;/* Number of blocks in a group */
+	unsigned long s_clusters_per_group; /* Number of clusters in a group */
+	unsigned long s_inodes_per_group;/* Number of inodes in a group */
+	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
+	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
+	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
+	ext4_group_t s_groups_count;	/* Number of groups in the fs */
+	ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
+	unsigned long s_overhead;  /* # of fs overhead clusters */
+	unsigned int s_cluster_ratio;	/* Number of blocks per cluster */
+	unsigned int s_cluster_bits;	/* log2 of s_cluster_ratio */
+	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
+	struct buffer_head * s_sbh;	/* Buffer containing the super block */
+	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
+	struct buffer_head **s_group_desc;
+	unsigned int s_mount_opt;
+	unsigned int s_mount_opt2;
+	unsigned int s_mount_flags;
+	unsigned int s_def_mount_opt;
+	ext4_fsblk_t s_sb_block;
+	atomic64_t s_resv_clusters;
+	kuid_t s_resuid;
+	kgid_t s_resgid;
+	unsigned short s_mount_state;
+	unsigned short s_pad;
+	int s_addr_per_block_bits;
+	int s_desc_per_block_bits;
+	int s_inode_size;
+	int s_first_ino;
+	unsigned int s_inode_readahead_blks;
+	unsigned int s_inode_goal;
+	spinlock_t s_next_gen_lock;
+	u32 s_next_generation;
+	u32 s_hash_seed[4];
+	int s_def_hash_version;
+	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
+	struct percpu_counter s_freeclusters_counter;
+	struct percpu_counter s_freeinodes_counter;
+	struct percpu_counter s_dirs_counter;
+	struct percpu_counter s_dirtyclusters_counter;
+	struct blockgroup_lock *s_blockgroup_lock;
+	struct proc_dir_entry *s_proc;
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
+	struct super_block *s_sb;
+
+	/* Journaling */
+	struct journal_s *s_journal;
+	struct list_head s_orphan;
+	struct mutex s_orphan_lock;
+	unsigned long s_resize_flags;		/* Flags indicating if there
+						   is a resizer */
+	unsigned long s_commit_interval;
+	u32 s_max_batch_time;
+	u32 s_min_batch_time;
+	struct block_device *journal_bdev;
+#ifdef CONFIG_QUOTA
+	char *s_qf_names[EXT4_MAXQUOTAS];	/* Names of quota files with journalled quota */
+	int s_jquota_fmt;			/* Format of quota to use */
+#endif
+	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+	struct rb_root system_blks;
+
+#ifdef EXTENTS_STATS
+	/* ext4 extents stats */
+	unsigned long s_ext_min;
+	unsigned long s_ext_max;
+	unsigned long s_depth_max;
+	spinlock_t s_ext_stats_lock;
+	unsigned long s_ext_blocks;
+	unsigned long s_ext_extents;
+#endif
+
+	/* for buddy allocator */
+	struct ext4_group_info ***s_group_info;
+	struct inode *s_buddy_cache;
+	spinlock_t s_md_lock;
+	unsigned short *s_mb_offsets;
+	unsigned int *s_mb_maxs;
+	unsigned int s_group_info_size;
+
+	/* tunables */
+	unsigned long s_stripe;
+	unsigned int s_mb_stream_request;
+	unsigned int s_mb_max_to_scan;
+	unsigned int s_mb_min_to_scan;
+	unsigned int s_mb_stats;
+	unsigned int s_mb_order2_reqs;
+	unsigned int s_mb_group_prealloc;
+	unsigned int s_max_dir_size_kb;
+	/* where last allocation was done - for stream allocation */
+	unsigned long s_mb_last_group;
+	unsigned long s_mb_last_start;
+
+	/* stats for buddy allocator */
+	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
+	atomic_t s_bal_success;	/* we found long enough chunks */
+	atomic_t s_bal_allocated;	/* in blocks */
+	atomic_t s_bal_ex_scanned;	/* total extents scanned */
+	atomic_t s_bal_goals;	/* goal hits */
+	atomic_t s_bal_breaks;	/* too long searches */
+	atomic_t s_bal_2orders;	/* 2^order hits */
+	spinlock_t s_bal_lock;
+	unsigned long s_mb_buddies_generated;
+	unsigned long long s_mb_generation_time;
+	atomic_t s_mb_lost_chunks;
+	atomic_t s_mb_preallocated;
+	atomic_t s_mb_discarded;
+	atomic_t s_lock_busy;
+
+	/* locality groups */
+	struct ext4_locality_group __percpu *s_locality_groups;
+
+	/* for write statistics */
+	unsigned long s_sectors_written_start;
+	u64 s_kbytes_written;
+
+	/* the size of zero-out chunk */
+	unsigned int s_extent_max_zeroout_kb;
+
+	unsigned int s_log_groups_per_flex;
+	struct flex_groups *s_flex_groups;
+	ext4_group_t s_flex_groups_allocated;
+
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
+
+	/* timer for periodic error stats printing */
+	struct timer_list s_err_report;
+
+	/* Lazy inode table initialization info */
+	struct ext4_li_request *s_li_request;
+	/* Wait multiplier for lazy initialization thread */
+	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
+
+	/* record the last minlen when FITRIM is called. */
+	atomic_t s_last_trim_minblks;
+
+	/* Reference to checksum algorithm driver via cryptoapi */
+	struct crypto_shash *s_chksum_driver;
+
+	/* Precomputed FS UUID checksum for seeding other checksums */
+	__u32 s_csum_seed;
+
+	/* Reclaim extents from extent status tree */
+	struct shrinker s_es_shrinker;
+	struct list_head s_es_list;	/* List of inodes with reclaimable extents */
+	long s_es_nr_inode;
+	struct ext4_es_stats s_es_stats;
+	struct mb_cache *s_mb_cache;
+	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+
+	/* Ratelimit ext4 messages. */
+	struct ratelimit_state s_err_ratelimit_state;
+	struct ratelimit_state s_warning_ratelimit_state;
+	struct ratelimit_state s_msg_ratelimit_state;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	/* Encryption */
+	uint32_t s_file_encryption_mode;
+	uint32_t s_dir_encryption_mode;
+#endif
+};
+
+static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
+{
+	return container_of(inode, struct ext4_inode_info, vfs_inode);
+}
+
+static inline struct timespec ext4_current_time(struct inode *inode)
+{
+	return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
+		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+}
+
+static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
+{
+	return ino == EXT4_ROOT_INO ||
+		ino == EXT4_USR_QUOTA_INO ||
+		ino == EXT4_GRP_QUOTA_INO ||
+		ino == EXT4_BOOT_LOADER_INO ||
+		ino == EXT4_JOURNAL_INO ||
+		ino == EXT4_RESIZE_INO ||
+		(ino >= EXT4_FIRST_INO(sb) &&
+		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
+}
+
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+					      struct ext4_io_end *io_end)
+{
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		io_end->flag |= EXT4_IO_END_UNWRITTEN;
+		atomic_inc(&EXT4_I(inode)->i_unwritten);
+	}
+}
+
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+	return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+	inode->i_private = io;
+}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+	EXT4_STATE_JDATA,		/* journaled data exists */
+	EXT4_STATE_NEW,			/* inode is newly created */
+	EXT4_STATE_XATTR,		/* has in-inode xattrs */
+	EXT4_STATE_NO_EXPAND,		/* No space for expansion */
+	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
+	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
+	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+	EXT4_STATE_NEWENTRY,		/* File just added to dir */
+	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
+					   nolocking */
+	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
+	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
+	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
+};
+
+#define EXT4_INODE_BIT_FNS(name, field, offset)				\
+static inline int ext4_test_inode_##name(struct inode *inode, int bit)	\
+{									\
+	return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);	\
+}									\
+static inline void ext4_set_inode_##name(struct inode *inode, int bit)	\
+{									\
+	set_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
+}									\
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{									\
+	clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
+}
+
+/* Add these declarations here only so that these functions can be
+ * found by name.  Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_flag(struct inode *inode, int bit);
+static inline void ext4_set_inode_flag(struct inode *inode, int bit);
+static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+
+/* Add these declarations here only so that these functions can be
+ * found by name.  Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_state(struct inode *inode, int bit);
+static inline void ext4_set_inode_state(struct inode *inode, int bit);
+static inline void ext4_clear_inode_state(struct inode *inode, int bit);
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	(ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	/* We depend on the fact that callers will set i_flags */
+}
+#endif
+#else
+/* Assume that user mode programs are passing in an ext4fs superblock, not
+ * a kernel struct super_block.  This will allow us to call the feature-test
+ * macros from user land. */
+#define EXT4_SB(sb)	(sb)
+#endif
+
+/*
+ * Returns true if the inode is inode is encrypted
+ */
+static inline int ext4_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
+#else
+	return 0;
+#endif
+}
+
+#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
+
+/*
+ * Codes for operating systems
+ */
+#define EXT4_OS_LINUX		0
+#define EXT4_OS_HURD		1
+#define EXT4_OS_MASIX		2
+#define EXT4_OS_FREEBSD		3
+#define EXT4_OS_LITES		4
+
+/*
+ * Revision levels
+ */
+#define EXT4_GOOD_OLD_REV	0	/* The good old (original) format */
+#define EXT4_DYNAMIC_REV	1	/* V2 format w/ dynamic inode sizes */
+
+#define EXT4_CURRENT_REV	EXT4_GOOD_OLD_REV
+#define EXT4_MAX_SUPP_REV	EXT4_DYNAMIC_REV
+
+#define EXT4_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ */
+
+#define EXT4_HAS_COMPAT_FEATURE(sb,mask)			\
+	((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
+#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+	((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
+#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask)			\
+	((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
+#define EXT4_SET_COMPAT_FEATURE(sb,mask)			\
+	EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask)			\
+	EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+#define EXT4_SET_INCOMPAT_FEATURE(sb,mask)			\
+	EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask)			\
+	EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+	EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+	EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+#define EXT4_FEATURE_COMPAT_DIR_PREALLOC	0x0001
+#define EXT4_FEATURE_COMPAT_IMAGIC_INODES	0x0002
+#define EXT4_FEATURE_COMPAT_HAS_JOURNAL		0x0004
+#define EXT4_FEATURE_COMPAT_EXT_ATTR		0x0008
+#define EXT4_FEATURE_COMPAT_RESIZE_INODE	0x0010
+#define EXT4_FEATURE_COMPAT_DIR_INDEX		0x0020
+#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2	0x0200
+
+#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
+#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
+#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
+#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE        0x0008
+#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
+#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
+#define EXT4_FEATURE_RO_COMPAT_BIGALLOC		0x0200
+/*
+ * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM).  When
+ * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
+ * all other data structures' checksums.  However, the METADATA_CSUM and
+ * GDT_CSUM bits are mutually exclusive.
+ */
+#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM	0x0400
+#define EXT4_FEATURE_RO_COMPAT_READONLY		0x1000
+
+#define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
+#define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
+#define EXT4_FEATURE_INCOMPAT_RECOVER		0x0004 /* Needs recovery */
+#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008 /* Journal device */
+#define EXT4_FEATURE_INCOMPAT_META_BG		0x0010
+#define EXT4_FEATURE_INCOMPAT_EXTENTS		0x0040 /* extents support */
+#define EXT4_FEATURE_INCOMPAT_64BIT		0x0080
+#define EXT4_FEATURE_INCOMPAT_MMP               0x0100
+#define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
+#define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINE_DATA	0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_ENCRYPT		0x10000
+
+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG| \
+					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
+					 EXT4_FEATURE_INCOMPAT_64BIT| \
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP | \
+					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
+					 EXT4_FEATURE_INCOMPAT_ENCRYPT)
+#define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
+					 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
+					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+					 EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
+					 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
+					 EXT4_FEATURE_RO_COMPAT_QUOTA)
+
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define	EXT4_DEF_RESUID		0
+#define	EXT4_DEF_RESGID		0
+
+#define EXT4_DEF_INODE_READAHEAD_BLKS	32
+
+/*
+ * Default mount options
+ */
+#define EXT4_DEFM_DEBUG		0x0001
+#define EXT4_DEFM_BSDGROUPS	0x0002
+#define EXT4_DEFM_XATTR_USER	0x0004
+#define EXT4_DEFM_ACL		0x0008
+#define EXT4_DEFM_UID16		0x0010
+#define EXT4_DEFM_JMODE		0x0060
+#define EXT4_DEFM_JMODE_DATA	0x0020
+#define EXT4_DEFM_JMODE_ORDERED	0x0040
+#define EXT4_DEFM_JMODE_WBACK	0x0060
+#define EXT4_DEFM_NOBARRIER	0x0100
+#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
+#define EXT4_DEFM_DISCARD	0x0400
+#define EXT4_DEFM_NODELALLOC	0x0800
+
+/*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME	0
+#define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
+
+/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME	4
+
+/*
+ * Structure of a directory entry
+ */
+#define EXT4_NAME_LEN 255
+
+struct ext4_dir_entry {
+	__le32	inode;			/* Inode number */
+	__le16	rec_len;		/* Directory entry length */
+	__le16	name_len;		/* Name length */
+	char	name[EXT4_NAME_LEN];	/* File name */
+};
+
+/*
+ * The new version of the directory entry.  Since EXT4 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct ext4_dir_entry_2 {
+	__le32	inode;			/* Inode number */
+	__le16	rec_len;		/* Directory entry length */
+	__u8	name_len;		/* Name length */
+	__u8	file_type;
+	char	name[EXT4_NAME_LEN];	/* File name */
+};
+
+/*
+ * This is a bogus directory entry at the end of each leaf block that
+ * records checksums.
+ */
+struct ext4_dir_entry_tail {
+	__le32	det_reserved_zero1;	/* Pretend to be unused */
+	__le16	det_rec_len;		/* 12 */
+	__u8	det_reserved_zero2;	/* Zero name length */
+	__u8	det_reserved_ft;	/* 0xDE, fake file type */
+	__le32	det_checksum;		/* crc32c(uuid+inum+dirblock) */
+};
+
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+	((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+					((blocksize) - \
+					 sizeof(struct ext4_dir_entry_tail))))
+
+/*
+ * Ext4 directory file types.  Only the low 3 bits are used.  The
+ * other bits are reserved for now.
+ */
+#define EXT4_FT_UNKNOWN		0
+#define EXT4_FT_REG_FILE	1
+#define EXT4_FT_DIR		2
+#define EXT4_FT_CHRDEV		3
+#define EXT4_FT_BLKDEV		4
+#define EXT4_FT_FIFO		5
+#define EXT4_FT_SOCK		6
+#define EXT4_FT_SYMLINK		7
+
+#define EXT4_FT_MAX		8
+
+#define EXT4_FT_DIR_CSUM	0xDE
+
+/*
+ * EXT4_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define EXT4_DIR_PAD			4
+#define EXT4_DIR_ROUND			(EXT4_DIR_PAD - 1)
+#define EXT4_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT4_DIR_ROUND) & \
+					 ~EXT4_DIR_ROUND)
+#define EXT4_MAX_REC_LEN		((1<<16)-1)
+
+/*
+ * If we ever get support for fs block sizes > page_size, we'll need
+ * to remove the #if statements in the next two functions...
+ */
+static inline unsigned int
+ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+	unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len == EXT4_MAX_REC_LEN || len == 0)
+		return blocksize;
+	return (len & 65532) | ((len & 3) << 16);
+#else
+	return len;
+#endif
+}
+
+static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+		BUG();
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len < 65536)
+		return cpu_to_le16(len);
+	if (len == blocksize) {
+		if (blocksize == 65536)
+			return cpu_to_le16(EXT4_MAX_REC_LEN);
+		else
+			return cpu_to_le16(0);
+	}
+	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+#else
+	return cpu_to_le16(len);
+#endif
+}
+
+/*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+ */
+
+#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
+				      EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+		    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
+#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
+#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+/* Legal values for the dx_root hash_version field: */
+
+#define DX_HASH_LEGACY		0
+#define DX_HASH_HALF_MD4	1
+#define DX_HASH_TEA		2
+#define DX_HASH_LEGACY_UNSIGNED	3
+#define DX_HASH_HALF_MD4_UNSIGNED	4
+#define DX_HASH_TEA_UNSIGNED		5
+
+static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
+			      const void *address, unsigned int length)
+{
+	struct {
+		struct shash_desc shash;
+		char ctx[4];
+	} desc;
+	int err;
+
+	BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));
+
+	desc.shash.tfm = sbi->s_chksum_driver;
+	desc.shash.flags = 0;
+	*(u32 *)desc.ctx = crc;
+
+	err = crypto_shash_update(&desc.shash, address, length);
+	BUG_ON(err);
+
+	return *(u32 *)desc.ctx;
+}
+
+#ifdef __KERNEL__
+
+/* hash info structure used by the directory hash */
+struct dx_hash_info
+{
+	u32		hash;
+	u32		minor_hash;
+	int		hash_version;
+	u32		*seed;
+};
+
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
+#define EXT4_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)
+
+
+/*
+ * Control parameters used by ext4_htree_next_block
+ */
+#define HASH_NB_ALWAYS		1
+
+
+/*
+ * Describe an inode's exact location on disk and in memory
+ */
+struct ext4_iloc
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	ext4_group_t block_group;
+};
+
+static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
+{
+	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
+}
+
+/*
+ * This structure is stuffed into the struct file's private_data field
+ * for directories.  It is where we put information so that we can do
+ * readdir operations in hash tree order.
+ */
+struct dir_private_info {
+	struct rb_root	root;
+	struct rb_node	*curr_node;
+	struct fname	*extra_fname;
+	loff_t		last_pos;
+	__u32		curr_hash;
+	__u32		curr_minor_hash;
+	__u32		next_hash;
+};
+
+/* calculate the first block number of the group */
+static inline ext4_fsblk_t
+ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
+{
+	return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+}
+
+/*
+ * Special error return code only used by dx_probe() and its callers.
+ */
+#define ERR_BAD_DX_DIR	(-(MAX_ERRNO - 1))
+
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT			10
+#define EXT4_DEF_LI_MAX_START_DELAY		5
+#define EXT4_LAZYINIT_QUIT			0x0001
+#define EXT4_LAZYINIT_RUNNING			0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+	unsigned long		li_state;
+	struct list_head	li_request_list;
+	struct mutex		li_list_mtx;
+};
+
+struct ext4_li_request {
+	struct super_block	*lr_super;
+	struct ext4_sb_info	*lr_sbi;
+	ext4_group_t		lr_next_group;
+	struct list_head	lr_request;
+	unsigned long		lr_next_sched;
+	unsigned long		lr_timeout;
+};
+
+struct ext4_features {
+	struct kobject f_kobj;
+	struct completion f_kobj_unregister;
+};
+
+/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[226];
+	__le32	mmp_checksum;		/* crc32c(uuid+mmp_block) */
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb;  /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
+
+/*
+ * Function prototypes
+ */
+
+/*
+ * Ok, these declarations are also in <linux/kernel.h> but none of the
+ * ext4 source programs needs to include it so they are duplicated here.
+ */
+# define NORET_TYPE	/**/
+# define ATTRIB_NORET	__attribute__((noreturn))
+# define NORET_AND	noreturn,
+
+/* bitmap.c */
+extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+				struct ext4_group_desc *gdp,
+				struct buffer_head *bh, int sz);
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+				  struct ext4_group_desc *gdp,
+				  struct buffer_head *bh, int sz);
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+				struct ext4_group_desc *gdp,
+				struct buffer_head *bh);
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+				  struct ext4_group_desc *gdp,
+				  struct buffer_head *bh);
+
+/* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+					 ext4_fsblk_t blocknr,
+					 ext4_group_t *blockgrpp,
+					 ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+					  ext4_fsblk_t block);
+
+extern unsigned int ext4_block_group(struct super_block *sb,
+			ext4_fsblk_t blocknr);
+extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
+			ext4_fsblk_t blocknr);
+extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
+extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
+			ext4_group_t group);
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+				    s64 nclusters, unsigned int flags);
+extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
+extern void ext4_check_blocks_bitmap(struct super_block *);
+extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+						    ext4_group_t block_group,
+						    struct buffer_head ** bh);
+extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+						ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+				  ext4_group_t block_group,
+				  struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+						  ext4_group_t block_group);
+extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
+					      ext4_group_t block_group,
+					      struct ext4_group_desc *gdp);
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
+
+/* crypto_policy.c */
+int ext4_is_child_context_consistent_with_parent(struct inode *parent,
+						 struct inode *child);
+int ext4_inherit_context(struct inode *parent, struct inode *child);
+void ext4_to_hex(char *dst, char *src, size_t src_size);
+int ext4_process_policy(const struct ext4_encryption_policy *policy,
+			struct inode *inode);
+int ext4_get_policy(struct inode *inode,
+		    struct ext4_encryption_policy *policy);
+
+/* crypto.c */
+bool ext4_valid_contents_enc_mode(uint32_t mode);
+uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size);
+extern struct workqueue_struct *ext4_read_workqueue;
+struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode);
+void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
+void ext4_restore_control_page(struct page *data_page);
+struct page *ext4_encrypt(struct inode *inode,
+			  struct page *plaintext_page);
+int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page);
+int ext4_decrypt_one(struct inode *inode, struct page *page);
+int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+int ext4_init_crypto(void);
+void ext4_exit_crypto(void);
+static inline int ext4_sb_has_crypto(struct super_block *sb)
+{
+	return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+}
+#else
+static inline int ext4_init_crypto(void) { return 0; }
+static inline void ext4_exit_crypto(void) { }
+static inline int ext4_sb_has_crypto(struct super_block *sb)
+{
+	return 0;
+}
+#endif
+
+/* crypto_fname.c */
+bool ext4_valid_filenames_enc_mode(uint32_t mode);
+u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
+int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
+				   u32 ilen, struct ext4_str *crypto_str);
+int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+			    struct dx_hash_info *hinfo,
+			    const struct ext4_str *iname,
+			    struct ext4_str *oname);
+int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+			   struct dx_hash_info *hinfo,
+			   const struct ext4_dir_entry_2 *de,
+			   struct ext4_str *oname);
+int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
+			   const struct qstr *iname,
+			   struct ext4_str *oname);
+int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
+			   const struct qstr *iname,
+			   struct dx_hash_info *hinfo);
+int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
+				      u32 namelen);
+int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr,
+		     int len, const char * const name,
+		     struct ext4_dir_entry_2 *de);
+
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx);
+struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode,
+							u32 max_len);
+void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str);
+#else
+static inline
+void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { }
+static inline
+struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode,
+							u32 max_len)
+{
+	return NULL;
+}
+static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { }
+#endif
+
+
+/* crypto_key.c */
+int ext4_generate_encryption_key(struct inode *inode);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+int ext4_has_encryption_key(struct inode *inode);
+#else
+static inline int ext4_has_encryption_key(struct inode *inode)
+{
+	return 0;
+}
+#endif
+
+
+/* dir.c */
+extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+				  struct file *,
+				  struct ext4_dir_entry_2 *,
+				  struct buffer_head *, char *, int,
+				  unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset)	\
+	unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+					(de), (bh), (buf), (size), (offset)))
+extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+				__u32 minor_hash,
+				struct ext4_dir_entry_2 *dirent,
+				struct ext4_str *ent_name);
+extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+			     struct buffer_head *bh,
+			     void *buf, int buf_size,
+			     const char *name, int namelen,
+			     struct ext4_dir_entry_2 **dest_de);
+int ext4_insert_dentry(struct inode *dir,
+			struct inode *inode,
+			struct ext4_dir_entry_2 *de,
+			int buf_size,
+		       const struct qstr *iname,
+			const char *name, int namelen);
+static inline void ext4_update_dx_flag(struct inode *inode)
+{
+	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+				     EXT4_FEATURE_COMPAT_DIR_INDEX))
+		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+static unsigned char ext4_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+	    (filetype >= EXT4_FT_MAX))
+		return DT_UNKNOWN;
+
+	return ext4_filetype_table[filetype];
+}
+extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
+			     void *buf, int buf_size);
+
+/* fsync.c */
+extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+
+/* hash.c */
+extern int ext4fs_dirhash(const char *name, int len, struct
+			  dx_hash_info *hinfo);
+
+/* ialloc.c */
+extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+				      const struct qstr *qstr, __u32 goal,
+				      uid_t *owner, int handle_type,
+				      unsigned int line_no, int nblocks);
+
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+	__ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
+			 0, 0, 0)
+#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+				    type, nblocks)		    \
+	__ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+			 (type), __LINE__, (nblocks))
+
+
+extern void ext4_free_inode(handle_t *, struct inode *);
+extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+extern unsigned long ext4_count_free_inodes(struct super_block *);
+extern unsigned long ext4_count_dirs(struct super_block *);
+extern void ext4_check_inodes_bitmap(struct super_block *);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+				 ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
+
+/* mballoc.c */
+extern long ext4_mb_stats;
+extern long ext4_mb_max_to_scan;
+extern int ext4_mb_init(struct super_block *);
+extern int ext4_mb_release(struct super_block *);
+extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+				struct ext4_allocation_request *, int *);
+extern int ext4_mb_reserve_blocks(struct super_block *, int);
+extern void ext4_discard_preallocations(struct inode *);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+			     struct buffer_head *bh, ext4_fsblk_t block,
+			     unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+				   ext4_group_t ngroups);
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
+		ext4_group_t i, struct ext4_group_desc *desc);
+extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
+/* inode.c */
+int ext4_inode_is_fast_symlink(struct inode *inode);
+struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
+struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+			 struct buffer_head *bh_result, int create);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int create);
+int ext4_walk_page_buffers(handle_t *handle,
+			   struct buffer_head *head,
+			   unsigned from,
+			   unsigned to,
+			   int *partial,
+			   int (*fn)(handle_t *handle,
+				     struct buffer_head *bh));
+int do_journal_get_write_access(handle_t *handle,
+				struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA	 2
+
+extern struct inode *ext4_iget(struct super_block *, unsigned long);
+extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
+extern int  ext4_write_inode(struct inode *, struct writeback_control *);
+extern int  ext4_setattr(struct dentry *, struct iattr *);
+extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				struct kstat *stat);
+extern void ext4_evict_inode(struct inode *);
+extern void ext4_clear_inode(struct inode *);
+extern int  ext4_sync_inode(handle_t *, struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
+extern int ext4_change_inode_journal_flag(struct inode *, int);
+extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_inode_attach_jinode(struct inode *inode);
+extern int ext4_can_truncate(struct inode *inode);
+extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
+extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
+extern void ext4_set_aops(struct inode *inode);
+extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern void ext4_da_update_reserve_space(struct inode *inode,
+					int used, int quota_claim);
+
+/* indirect.c */
+extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+				struct ext4_map_blocks *map, int flags);
+extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+				  loff_t offset);
+extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
+				 ext4_lblk_t start, ext4_lblk_t end);
+
+/* ioctl.c */
+extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
+extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* migrate.c */
+extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
+
+/* namei.c */
+extern int ext4_dirent_csum_verify(struct inode *inode,
+				   struct ext4_dir_entry *dirent);
+extern int ext4_orphan_add(handle_t *, struct inode *);
+extern int ext4_orphan_del(handle_t *, struct inode *);
+extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+				__u32 start_minor_hash, __u32 *next_hash);
+extern int search_dir(struct buffer_head *bh,
+		      char *search_buf,
+		      int buf_size,
+		      struct inode *dir,
+		      const struct qstr *d_name,
+		      unsigned int offset,
+		      struct ext4_dir_entry_2 **res_dir);
+extern int ext4_generic_delete_entry(handle_t *handle,
+				     struct inode *dir,
+				     struct ext4_dir_entry_2 *de_del,
+				     struct buffer_head *bh,
+				     void *entry_buf,
+				     int buf_size,
+				     int csum_size);
+extern int ext4_empty_dir(struct inode *inode);
+
+/* resize.c */
+extern int ext4_group_add(struct super_block *sb,
+				struct ext4_new_group_data *input);
+extern int ext4_group_extend(struct super_block *sb,
+				struct ext4_super_block *es,
+				ext4_fsblk_t n_blocks_count);
+extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
+
+/* super.c */
+extern int ext4_calculate_overhead(struct super_block *sb);
+extern void ext4_superblock_csum_set(struct super_block *sb);
+extern void *ext4_kvmalloc(size_t size, gfp_t flags);
+extern void *ext4_kvzalloc(size_t size, gfp_t flags);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+				    ext4_group_t ngroup);
+extern const char *ext4_decode_error(struct super_block *sb, int errno,
+				     char nbuf[16]);
+
+extern __printf(4, 5)
+void __ext4_error(struct super_block *, const char *, unsigned int,
+		  const char *, ...);
+extern __printf(5, 6)
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+		      const char *, ...);
+extern __printf(5, 6)
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+		     const char *, ...);
+extern void __ext4_std_error(struct super_block *, const char *,
+			     unsigned int, int);
+extern __printf(4, 5)
+void __ext4_abort(struct super_block *, const char *, unsigned int,
+		  const char *, ...);
+extern __printf(4, 5)
+void __ext4_warning(struct super_block *, const char *, unsigned int,
+		    const char *, ...);
+extern __printf(3, 4)
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+extern __printf(7, 8)
+void __ext4_grp_locked_error(const char *, unsigned int,
+			     struct super_block *, ext4_group_t,
+			     unsigned long, ext4_fsblk_t,
+			     const char *, ...);
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)					\
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)					\
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)					\
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)				\
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_inode(inode, "", 0, block, " ");			\
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_file(file, "", 0, block, " ");			\
+} while (0)
+#define ext4_error(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error(sb, "", 0, " ");					\
+} while (0)
+#define ext4_abort(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_abort(sb, "", 0, " ");					\
+} while (0)
+#define ext4_warning(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning(sb, "", 0, " ");					\
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_msg(sb, "", " ");					\
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);				\
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\
+} while (0)
+
+#endif
+
+extern void ext4_update_dynamic_rev(struct super_block *sb);
+extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
+					__u32 compat);
+extern int ext4_update_rocompat_feature(handle_t *handle,
+					struct super_block *sb,	__u32 rocompat);
+extern int ext4_update_incompat_feature(handle_t *handle,
+					struct super_block *sb,	__u32 incompat);
+extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+				      struct ext4_group_desc *bg);
+extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
+				      struct ext4_group_desc *bg);
+extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+				     struct ext4_group_desc *bg);
+extern __u32 ext4_free_group_clusters(struct super_block *sb,
+				      struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+				 struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+				struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+				   struct ext4_group_desc *bg);
+extern void ext4_block_bitmap_set(struct super_block *sb,
+				  struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_inode_bitmap_set(struct super_block *sb,
+				  struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_inode_table_set(struct super_block *sb,
+				 struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_group_clusters_set(struct super_block *sb,
+					 struct ext4_group_desc *bg,
+					 __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+				struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+				struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+				   struct ext4_group_desc *bg, __u32 count);
+extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
+				       struct ext4_group_desc *gdp);
+extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
+				     struct ext4_group_desc *gdp);
+extern int ext4_register_li_request(struct super_block *sb,
+				    ext4_group_t first_not_zeroed);
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+	return EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					  EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
+	       (EXT4_SB(sb)->s_chksum_driver != NULL);
+}
+
+static inline int ext4_has_metadata_csum(struct super_block *sb)
+{
+	WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+		     !EXT4_SB(sb)->s_chksum_driver);
+
+	return (EXT4_SB(sb)->s_chksum_driver != NULL);
+}
+static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
+{
+	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
+		le32_to_cpu(es->s_blocks_count_lo);
+}
+
+static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
+{
+	return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
+		le32_to_cpu(es->s_r_blocks_count_lo);
+}
+
+static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
+{
+	return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) |
+		le32_to_cpu(es->s_free_blocks_count_lo);
+}
+
+static inline void ext4_blocks_count_set(struct ext4_super_block *es,
+					 ext4_fsblk_t blk)
+{
+	es->s_blocks_count_lo = cpu_to_le32((u32)blk);
+	es->s_blocks_count_hi = cpu_to_le32(blk >> 32);
+}
+
+static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
+					      ext4_fsblk_t blk)
+{
+	es->s_free_blocks_count_lo = cpu_to_le32((u32)blk);
+	es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32);
+}
+
+static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
+					   ext4_fsblk_t blk)
+{
+	es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
+	es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+}
+
+static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+{
+	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+		return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+			le32_to_cpu(raw_inode->i_size_lo);
+	else
+		return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+}
+
+static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+{
+	raw_inode->i_size_lo = cpu_to_le32(i_size);
+	raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
+}
+
+static inline
+struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+					    ext4_group_t group)
+{
+	 struct ext4_group_info ***grp_info;
+	 long indexv, indexh;
+	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
+	 grp_info = EXT4_SB(sb)->s_group_info;
+	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+	 return grp_info[indexv][indexh];
+}
+
+/*
+ * Reading s_groups_count requires using smp_rmb() afterwards.  See
+ * the locking protocol documented in the comments of ext4_group_add()
+ * in resize.c
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+	ext4_group_t	ngroups = EXT4_SB(sb)->s_groups_count;
+
+	smp_rmb();
+	return ngroups;
+}
+
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+					     ext4_group_t block_group)
+{
+	return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+	return 1 << sbi->s_log_groups_per_flex;
+}
+
+#define ext4_std_error(sb, errno)				\
+do {								\
+	if ((errno))						\
+		__ext4_std_error((sb), __func__, __LINE__, (errno));	\
+} while (0)
+
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate percpu_counter_batch clusters in their local
+ * counters. So we need to make sure we have free clusters more
+ * than percpu_counter_batch  * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
+#else
+#define EXT4_FREECLUSTERS_WATERMARK 0
+#endif
+
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
+static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+	WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+		     !mutex_is_locked(&inode->i_mutex));
+	down_write(&EXT4_I(inode)->i_data_sem);
+	if (newsize > EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = newsize;
+	up_write(&EXT4_I(inode)->i_data_sem);
+}
+
+/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
+static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
+{
+	int changed = 0;
+
+	if (newsize > inode->i_size) {
+		i_size_write(inode, newsize);
+		changed = 1;
+	}
+	if (newsize > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, newsize);
+		changed |= 2;
+	}
+	return changed;
+}
+
+struct ext4_group_info {
+	unsigned long   bb_state;
+	struct rb_root  bb_free_root;
+	ext4_grpblk_t	bb_first_free;	/* first free block */
+	ext4_grpblk_t	bb_free;	/* total free blocks */
+	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
+	ext4_grpblk_t	bb_largest_free_order;/* order of largest frag in BG */
+	struct          list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+	void            *bb_bitmap;
+#endif
+	struct rw_semaphore alloc_sem;
+	ext4_grpblk_t	bb_counters[];	/* Nr of free power-of-two-block
+					 * regions, index is order.
+					 * bb_counters[3] = 5 means
+					 * 5 free 8-block regions. */
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT		0
+#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	2
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	3
+
+#define EXT4_MB_GRP_NEED_INIT(grp)	\
+	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp)	\
+	(test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)	\
+	(test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+
+#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
+	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
+	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
+	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+
+#define EXT4_MAX_CONTENTION		8
+#define EXT4_CONTENTION_THRESHOLD	2
+
+static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+					      ext4_group_t group)
+{
+	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
+}
+
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+	return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+	spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+	if (spin_trylock(lock))
+		/*
+		 * We're able to grab the lock right away, so drop the
+		 * lock contention counter.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+	else {
+		/*
+		 * The lock is busy, so bump the contention counter,
+		 * and then wait on the spin lock.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+				  EXT4_MAX_CONTENTION);
+		spin_lock(lock);
+	}
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+					ext4_group_t group)
+{
+	spin_unlock(ext4_group_lock_ptr(sb, group));
+}
+
+/*
+ * Block validity checking
+ */
+#define ext4_check_indirect_blockref(inode, bh)				\
+	ext4_check_blockref(__func__, __LINE__, inode,			\
+			    (__le32 *)(bh)->b_data,			\
+			    EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_ind_check_inode(inode)					\
+	ext4_check_blockref(__func__, __LINE__, inode,			\
+			    EXT4_I(inode)->i_data,			\
+			    EXT4_NDIR_BLOCKS)
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations ext4_dir_operations;
+
+/* file.c */
+extern const struct inode_operations ext4_file_inode_operations;
+extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+
+/* inline.c */
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data_nolock(struct inode *inode);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+				 unsigned int len);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+
+extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+extern int ext4_try_to_write_inline_data(struct address_space *mapping,
+					 struct inode *inode,
+					 loff_t pos, unsigned len,
+					 unsigned flags,
+					 struct page **pagep);
+extern int ext4_write_inline_data_end(struct inode *inode,
+				      loff_t pos, unsigned len,
+				      unsigned copied,
+				      struct page *page);
+extern struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+				  unsigned len,
+				  struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+					   struct inode *inode,
+					   loff_t pos, unsigned len,
+					   unsigned flags,
+					   struct page **pagep,
+					   void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+					 unsigned len, unsigned copied,
+					 struct page *page);
+extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+				     struct inode *inode);
+extern int ext4_try_create_inline_dir(handle_t *handle,
+				      struct inode *parent,
+				      struct inode *inode);
+extern int ext4_read_inline_dir(struct file *filp,
+				struct dir_context *ctx,
+				int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+				   struct inode *dir, ext4_lblk_t block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash,
+				   int *has_inline_data);
+extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+					const struct qstr *d_name,
+					struct ext4_dir_entry_2 **res_dir,
+					int *has_inline_data);
+extern int ext4_delete_inline_entry(handle_t *handle,
+				    struct inode *dir,
+				    struct ext4_dir_entry_2 *de_del,
+				    struct buffer_head *bh,
+				    int *has_inline_data);
+extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+					struct ext4_dir_entry_2 **parent_de,
+					int *retval);
+extern int ext4_inline_data_fiemap(struct inode *inode,
+				   struct fiemap_extent_info *fieinfo,
+				   int *has_inline, __u64 start, __u64 len);
+extern int ext4_try_to_evict_inline_data(handle_t *handle,
+					 struct inode *inode,
+					 int needed);
+extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+
+extern int ext4_convert_inline_data(struct inode *inode);
+
+static inline int ext4_has_inline_data(struct inode *inode)
+{
+	return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+	       EXT4_I(inode)->i_inline_off;
+}
+
+/* namei.c */
+extern const struct inode_operations ext4_dir_inode_operations;
+extern const struct inode_operations ext4_special_inode_operations;
+extern struct dentry *ext4_get_parent(struct dentry *child);
+extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+				 struct ext4_dir_entry_2 *de,
+				 int blocksize, int csum_size,
+				 unsigned int parent_ino, int dotdot_real_len);
+extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+				   unsigned int blocksize);
+extern int ext4_handle_dirty_dirent_node(handle_t *handle,
+					 struct inode *inode,
+					 struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+				struct ext4_dir_entry_2 *de,
+				umode_t mode) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+/* readpages.c */
+extern int ext4_mpage_readpages(struct address_space *mapping,
+				struct list_head *pages, struct page *page,
+				unsigned nr_pages);
+
+/* symlink.c */
+extern const struct inode_operations ext4_symlink_inode_operations;
+extern const struct inode_operations ext4_fast_symlink_inode_operations;
+
+/* block_validity */
+extern void ext4_release_system_zone(struct super_block *sb);
+extern int ext4_setup_system_zone(struct super_block *sb);
+extern int __init ext4_init_system_zone(void);
+extern void ext4_exit_system_zone(void);
+extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
+				 ext4_fsblk_t start_blk,
+				 unsigned int count);
+extern int ext4_check_blockref(const char *, unsigned int,
+			       struct inode *, __le32 *, unsigned int);
+
+/* extents.c */
+struct ext4_ext_path;
+struct ext4_extent;
+
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS	0xffffffff
+
+extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
+extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+			       struct ext4_map_blocks *map, int flags);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end);
+extern void ext4_ext_init(struct super_block *);
+extern void ext4_ext_release(struct super_block *);
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
+			  loff_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+			   struct ext4_map_blocks *map, int flags);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+					 ext4_lblk_t lblocks);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+						   int num,
+						   struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+				      struct ext4_extent *ex1,
+				      struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *,
+				  struct ext4_ext_path **,
+				  struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
+					      struct ext4_ext_path **,
+					      int flags);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_range(struct inode *inode,
+				    ext4_lblk_t lblk_start,
+				    ext4_lblk_t lblk_end);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+			__u64 start, __u64 len);
+extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
+				struct inode *inode2, ext4_lblk_t lblk1,
+			     ext4_lblk_t lblk2,  ext4_lblk_t count,
+			     int mark_unwritten,int *err);
+
+/* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+					    struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+					  struct inode *donor_inode);
+extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
+			     __u64 start_orig, __u64 start_donor,
+			     __u64 len, __u64 *moved_len);
+
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+			       struct page *page,
+			       int len,
+			       struct writeback_control *wbc,
+			       bool keep_towrite);
+
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
+/*
+ * Add new method to test whether block and inode bitmaps are properly
+ * initialized. With uninit_bg reading the block from disk is not enough
+ * to mark the bitmap uptodate. We need to also zero-out the bitmap
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+	return (buffer_uptodate(bh) &&
+			test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
+}
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+	set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
+
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+	ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+	smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb();
+	ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ		37
+#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
+					    EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
+					     EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
+#define EXT4_RESIZING	0
+extern int ext4_resize_begin(struct super_block *sb);
+extern void ext4_resize_end(struct super_block *sb);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* _EXT4_H */
diff --git a/kernel/fs/ext4/ext4_crypto.h b/kernel/fs/ext4/ext4_crypto.h
new file mode 100644
index 000000000..d75159c10
--- /dev/null
+++ b/kernel/fs/ext4/ext4_crypto.h
@@ -0,0 +1,156 @@
+/*
+ * linux/fs/ext4/ext4_crypto.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption header content for ext4
+ *
+ * Written by Michael Halcrow, 2015.
+ */
+
+#ifndef _EXT4_CRYPTO_H
+#define _EXT4_CRYPTO_H
+
+#include <linux/fs.h>
+
+#define EXT4_KEY_DESCRIPTOR_SIZE 8
+
+/* Policy provided via an ioctl on the topmost directory */
+struct ext4_encryption_policy {
+	char version;
+	char contents_encryption_mode;
+	char filenames_encryption_mode;
+	char flags;
+	char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
+} __attribute__((__packed__));
+
+#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1
+#define EXT4_KEY_DERIVATION_NONCE_SIZE 16
+
+#define EXT4_POLICY_FLAGS_PAD_4		0x00
+#define EXT4_POLICY_FLAGS_PAD_8		0x01
+#define EXT4_POLICY_FLAGS_PAD_16	0x02
+#define EXT4_POLICY_FLAGS_PAD_32	0x03
+#define EXT4_POLICY_FLAGS_PAD_MASK	0x03
+#define EXT4_POLICY_FLAGS_VALID		0x03
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ *  1 byte: Protector format (1 = this version)
+ *  1 byte: File contents encryption mode
+ *  1 byte: File names encryption mode
+ *  1 byte: Reserved
+ *  8 bytes: Master Key descriptor
+ *  16 bytes: Encryption Key derivation nonce
+ */
+struct ext4_encryption_context {
+	char format;
+	char contents_encryption_mode;
+	char filenames_encryption_mode;
+	char flags;
+	char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
+	char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE];
+} __attribute__((__packed__));
+
+/* Encryption parameters */
+#define EXT4_XTS_TWEAK_SIZE 16
+#define EXT4_AES_128_ECB_KEY_SIZE 16
+#define EXT4_AES_256_GCM_KEY_SIZE 32
+#define EXT4_AES_256_CBC_KEY_SIZE 32
+#define EXT4_AES_256_CTS_KEY_SIZE 32
+#define EXT4_AES_256_XTS_KEY_SIZE 64
+#define EXT4_MAX_KEY_SIZE 64
+
+#define EXT4_KEY_DESC_PREFIX "ext4:"
+#define EXT4_KEY_DESC_PREFIX_SIZE 5
+
+struct ext4_encryption_key {
+	uint32_t mode;
+	char raw[EXT4_MAX_KEY_SIZE];
+	uint32_t size;
+};
+
+#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL             0x00000001
+#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL     0x00000002
+
+struct ext4_crypto_ctx {
+	struct crypto_tfm *tfm;         /* Crypto API context */
+	struct page *bounce_page;       /* Ciphertext page on write path */
+	struct page *control_page;      /* Original page on write path */
+	struct bio *bio;                /* The bio for this context */
+	struct work_struct work;        /* Work queue for read complete path */
+	struct list_head free_list;     /* Free list */
+	int flags;                      /* Flags */
+	int mode;                       /* Encryption mode for tfm */
+};
+
+struct ext4_completion_result {
+	struct completion completion;
+	int res;
+};
+
+#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \
+	struct ext4_completion_result ecr = { \
+		COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+static inline int ext4_encryption_key_size(int mode)
+{
+	switch (mode) {
+	case EXT4_ENCRYPTION_MODE_AES_256_XTS:
+		return EXT4_AES_256_XTS_KEY_SIZE;
+	case EXT4_ENCRYPTION_MODE_AES_256_GCM:
+		return EXT4_AES_256_GCM_KEY_SIZE;
+	case EXT4_ENCRYPTION_MODE_AES_256_CBC:
+		return EXT4_AES_256_CBC_KEY_SIZE;
+	case EXT4_ENCRYPTION_MODE_AES_256_CTS:
+		return EXT4_AES_256_CTS_KEY_SIZE;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+#define EXT4_FNAME_NUM_SCATTER_ENTRIES	4
+#define EXT4_CRYPTO_BLOCK_SIZE		16
+#define EXT4_FNAME_CRYPTO_DIGEST_SIZE	32
+
+struct ext4_str {
+	unsigned char *name;
+	u32 len;
+};
+
+struct ext4_fname_crypto_ctx {
+	u32 lim;
+	char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE];
+	struct crypto_ablkcipher *ctfm;
+	struct crypto_hash *htfm;
+	struct page *workpage;
+	struct ext4_encryption_key key;
+	unsigned flags : 8;
+	unsigned has_valid_key : 1;
+	unsigned ctfm_key_is_ready : 1;
+};
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct ext4_encrypted_symlink_data {
+	__le16 len;
+	char encrypted_path[1];
+} __attribute__((__packed__));
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 encrypted_symlink_data_len(u32 l)
+{
+	if (l < EXT4_CRYPTO_BLOCK_SIZE)
+		l = EXT4_CRYPTO_BLOCK_SIZE;
+	return (l + sizeof(struct ext4_encrypted_symlink_data) - 1);
+}
+
+#endif	/* _EXT4_CRYPTO_H */
diff --git a/kernel/fs/ext4/ext4_extents.h b/kernel/fs/ext4/ext4_extents.h
new file mode 100644
index 000000000..3c9381547
--- /dev/null
+++ b/kernel/fs/ext4/ext4_extents.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
+ */
+
+#ifndef _EXT4_EXTENTS
+#define _EXT4_EXTENTS
+
+#include "ext4.h"
+
+/*
+ * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks
+ * becomes very small, so index split, in-depth growing and
+ * other hard changes happen much more often.
+ * This is for debug purposes only.
+ */
+#define AGGRESSIVE_TEST_
+
+/*
+ * With EXTENTS_STATS defined, the number of blocks and extents
+ * are collected in the truncate path. They'll be shown at
+ * umount time.
+ */
+#define EXTENTS_STATS__
+
+/*
+ * If CHECK_BINSEARCH is defined, then the results of the binary search
+ * will also be checked by linear search.
+ */
+#define CHECK_BINSEARCH__
+
+/*
+ * If EXT_STATS is defined then stats numbers are collected.
+ * These number will be displayed at umount time.
+ */
+#define EXT_STATS_
+
+
+/*
+ * ext4_inode has i_block array (60 bytes total).
+ * The first 12 bytes store ext4_extent_header;
+ * the remainder stores an array of ext4_extent.
+ * For non-inode extent blocks, ext4_extent_tail
+ * follows the array.
+ */
+
+/*
+ * This is the extent tail on-disk structure.
+ * All other extent structures are 12 bytes long.  It turns out that
+ * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
+ * covers all valid ext4 block sizes.  Therefore, this tail structure can be
+ * crammed into the end of the block without having to rebalance the tree.
+ */
+struct ext4_extent_tail {
+	__le32	et_checksum;	/* crc32c(uuid+inum+extent_block) */
+};
+
+/*
+ * This is the extent on-disk structure.
+ * It's used at the bottom of the tree.
+ */
+struct ext4_extent {
+	__le32	ee_block;	/* first logical block extent covers */
+	__le16	ee_len;		/* number of blocks covered by extent */
+	__le16	ee_start_hi;	/* high 16 bits of physical block */
+	__le32	ee_start_lo;	/* low 32 bits of physical block */
+};
+
+/*
+ * This is index on-disk structure.
+ * It's used at all the levels except the bottom.
+ */
+struct ext4_extent_idx {
+	__le32	ei_block;	/* index covers logical blocks from 'block' */
+	__le32	ei_leaf_lo;	/* pointer to the physical block of the next *
+				 * level. leaf or next index could be there */
+	__le16	ei_leaf_hi;	/* high 16 bits of physical block */
+	__u16	ei_unused;
+};
+
+/*
+ * Each block (leaves and indexes), even inode-stored has header.
+ */
+struct ext4_extent_header {
+	__le16	eh_magic;	/* probably will support different formats */
+	__le16	eh_entries;	/* number of valid entries */
+	__le16	eh_max;		/* capacity of store in entries */
+	__le16	eh_depth;	/* has tree real underlying blocks? */
+	__le32	eh_generation;	/* generation of the tree */
+};
+
+#define EXT4_EXT_MAGIC		cpu_to_le16(0xf30a)
+
+#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
+	(sizeof(struct ext4_extent_header) + \
+	 (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
+
+static inline struct ext4_extent_tail *
+find_ext4_extent_tail(struct ext4_extent_header *eh)
+{
+	return (struct ext4_extent_tail *)(((void *)eh) +
+					   EXT4_EXTENT_TAIL_OFFSET(eh));
+}
+
+/*
+ * Array of ext4_ext_path contains path to some extent.
+ * Creation/lookup routines use it for traversal/splitting/etc.
+ * Truncate uses it to simulate recursive walking.
+ */
+struct ext4_ext_path {
+	ext4_fsblk_t			p_block;
+	__u16				p_depth;
+	__u16				p_maxdepth;
+	struct ext4_extent		*p_ext;
+	struct ext4_extent_idx		*p_idx;
+	struct ext4_extent_header	*p_hdr;
+	struct buffer_head		*p_bh;
+};
+
+/*
+ * structure for external API
+ */
+
+/*
+ * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
+ * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
+ * MSB of ee_len field in the extent datastructure to signify if this
+ * particular extent is an initialized extent or an unwritten (i.e.
+ * preallocated).
+ * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
+ * unwritten extent.
+ * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
+ * unwritten one. In other words, if MSB of ee_len is set, it is an
+ * unwritten extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an unwritten extent of zero length and
+ * thus we make it as a special case of initialized extent with 0x8000 length.
+ * This way we get better extent-to-group alignment for initialized extents.
+ * Hence, the maximum number of blocks we can have in an *initialized*
+ * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
+ */
+#define EXT_INIT_MAX_LEN	(1UL << 15)
+#define EXT_UNWRITTEN_MAX_LEN	(EXT_INIT_MAX_LEN - 1)
+
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+	((struct ext4_extent *) (((char *) (__hdr__)) +		\
+				 sizeof(struct ext4_extent_header)))
+#define EXT_FIRST_INDEX(__hdr__) \
+	((struct ext4_extent_idx *) (((char *) (__hdr__)) +	\
+				     sizeof(struct ext4_extent_header)))
+#define EXT_HAS_FREE_INDEX(__path__) \
+	(le16_to_cpu((__path__)->p_hdr->eh_entries) \
+				     < le16_to_cpu((__path__)->p_hdr->eh_max))
+#define EXT_LAST_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
+#define EXT_LAST_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+
+static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
+{
+	return (struct ext4_extent_header *) EXT4_I(inode)->i_data;
+}
+
+static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
+{
+	return (struct ext4_extent_header *) bh->b_data;
+}
+
+static inline unsigned short ext_depth(struct inode *inode)
+{
+	return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
+}
+
+static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
+{
+	/* We can not have an unwritten extent of zero length! */
+	BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);
+	ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
+}
+
+static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
+{
+	/* Extent with ee_len of 0x8000 is treated as an initialized extent */
+	return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN);
+}
+
+static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
+{
+	return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ?
+		le16_to_cpu(ext->ee_len) :
+		(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
+}
+
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ex->ee_start_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ix->ei_leaf_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+					 ext4_fsblk_t pb)
+{
+	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				      0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+					 ext4_fsblk_t pb)
+{
+	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				     0xffff);
+}
+
+#define ext4_ext_dirty(handle, inode, path) \
+		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path);
+
+#endif /* _EXT4_EXTENTS */
+
diff --git a/kernel/fs/ext4/ext4_jbd2.c b/kernel/fs/ext4/ext4_jbd2.c
new file mode 100644
index 000000000..d41843181
--- /dev/null
+++ b/kernel/fs/ext4/ext4_jbd2.c
@@ -0,0 +1,327 @@
+/*
+ * Interface between ext4 and JBD
+ */
+
+#include "ext4_jbd2.h"
+
+#include <trace/events/ext4.h>
+
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
+{
+	handle_t *handle = current->journal_info;
+	unsigned long ref_cnt = (unsigned long)handle;
+
+	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+	ref_cnt++;
+	handle = (handle_t *)ref_cnt;
+
+	current->journal_info = handle;
+	return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+	unsigned long ref_cnt = (unsigned long)handle;
+
+	BUG_ON(ref_cnt == 0);
+
+	ref_cnt--;
+	handle = (handle_t *)ref_cnt;
+
+	current->journal_info = handle;
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ */
+static int ext4_journal_check_start(struct super_block *sb)
+{
+	journal_t *journal;
+
+	might_sleep();
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
+	journal = EXT4_SB(sb)->s_journal;
+	/*
+	 * Special case here: if the journal has aborted behind our
+	 * backs (eg. EIO in the commit thread), then we still need to
+	 * take the FS itself readonly cleanly.
+	 */
+	if (journal && is_journal_aborted(journal)) {
+		ext4_abort(sb, "Detected aborted journal");
+		return -EROFS;
+	}
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
+}
+
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
+{
+	struct super_block *sb;
+	int err;
+	int rc;
+
+	if (!ext4_handle_valid(handle)) {
+		ext4_put_nojournal(handle);
+		return 0;
+	}
+
+	if (!handle->h_transaction) {
+		err = jbd2_journal_stop(handle);
+		return handle->h_err ? handle->h_err : err;
+	}
+
+	sb = handle->h_transaction->t_journal->j_private;
+	err = handle->h_err;
+	rc = jbd2_journal_stop(handle);
+
+	if (!err)
+		err = rc;
+	if (err)
+		__ext4_std_error(sb, where, line, err);
+	return err;
+}
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
+static void ext4_journal_abort_handle(const char *caller, unsigned int line,
+				      const char *err_fn,
+				      struct buffer_head *bh,
+				      handle_t *handle, int err)
+{
+	char nbuf[16];
+	const char *errstr = ext4_decode_error(NULL, err, nbuf);
+
+	BUG_ON(!ext4_handle_valid(handle));
+
+	if (bh)
+		BUFFER_TRACE(bh, "abort");
+
+	if (!handle->h_err)
+		handle->h_err = err;
+
+	if (is_handle_aborted(handle))
+		return;
+
+	printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
+	       caller, line, errstr, err_fn);
+
+	jbd2_journal_abort_handle(handle);
+}
+
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
+				    handle_t *handle, struct buffer_head *bh)
+{
+	int err = 0;
+
+	might_sleep();
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_get_write_access(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, line, __func__, bh,
+						  handle, err);
+	}
+	return err;
+}
+
+/*
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled.  Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
+ */
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
+		  int is_metadata, struct inode *inode,
+		  struct buffer_head *bh, ext4_fsblk_t blocknr)
+{
+	int err;
+
+	might_sleep();
+
+	trace_ext4_forget(inode, is_metadata, blocknr);
+	BUFFER_TRACE(bh, "enter");
+
+	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+		  "data mode %x\n",
+		  bh, is_metadata, inode->i_mode,
+		  test_opt(inode->i_sb, DATA_FLAGS));
+
+	/* In the no journal case, we can just do a bforget and return */
+	if (!ext4_handle_valid(handle)) {
+		bforget(bh);
+		return 0;
+	}
+
+	/* Never use the revoke function if we are doing full data
+	 * journaling: there is no need to, and a V1 superblock won't
+	 * support it.  Otherwise, only skip the revoke on un-journaled
+	 * data blocks. */
+
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+	    (!is_metadata && !ext4_should_journal_data(inode))) {
+		if (bh) {
+			BUFFER_TRACE(bh, "call jbd2_journal_forget");
+			err = jbd2_journal_forget(handle, bh);
+			if (err)
+				ext4_journal_abort_handle(where, line, __func__,
+							  bh, handle, err);
+			return err;
+		}
+		return 0;
+	}
+
+	/*
+	 * data!=journal && (is_metadata || should_journal_data(inode))
+	 */
+	BUFFER_TRACE(bh, "call jbd2_journal_revoke");
+	err = jbd2_journal_revoke(handle, blocknr, bh);
+	if (err) {
+		ext4_journal_abort_handle(where, line, __func__,
+					  bh, handle, err);
+		__ext4_abort(inode->i_sb, where, line,
+			   "error %d when attempting revoke", err);
+	}
+	BUFFER_TRACE(bh, "exit");
+	return err;
+}
+
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
+				handle_t *handle, struct buffer_head *bh)
+{
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_get_create_access(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, line, __func__,
+						  bh, handle, err);
+	}
+	return err;
+}
+
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
+				 handle_t *handle, struct inode *inode,
+				 struct buffer_head *bh)
+{
+	int err = 0;
+
+	might_sleep();
+
+	set_buffer_meta(bh);
+	set_buffer_prio(bh);
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_dirty_metadata(handle, bh);
+		/* Errors can only happen due to aborted journal or a nasty bug */
+		if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
+			ext4_journal_abort_handle(where, line, __func__, bh,
+						  handle, err);
+			if (inode == NULL) {
+				pr_err("EXT4: jbd2_journal_dirty_metadata "
+				       "failed: handle type %u started at "
+				       "line %u, credits %u/%u, errcode %d",
+				       handle->h_type,
+				       handle->h_line_no,
+				       handle->h_requested_credits,
+				       handle->h_buffer_credits, err);
+				return err;
+			}
+			ext4_error_inode(inode, where, line,
+					 bh->b_blocknr,
+					 "journal_dirty_metadata failed: "
+					 "handle type %u started at line %u, "
+					 "credits %u/%u, errcode %d",
+					 handle->h_type,
+					 handle->h_line_no,
+					 handle->h_requested_credits,
+					 handle->h_buffer_credits, err);
+		}
+	} else {
+		if (inode)
+			mark_buffer_dirty_inode(bh, inode);
+		else
+			mark_buffer_dirty(bh);
+		if (inode && inode_needs_sync(inode)) {
+			sync_dirty_buffer(bh);
+			if (buffer_req(bh) && !buffer_uptodate(bh)) {
+				struct ext4_super_block *es;
+
+				es = EXT4_SB(inode->i_sb)->s_es;
+				es->s_last_error_block =
+					cpu_to_le64(bh->b_blocknr);
+				ext4_error_inode(inode, where, line,
+						 bh->b_blocknr,
+					"IO error syncing itable block");
+				err = -EIO;
+			}
+		}
+	}
+	return err;
+}
+
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+			      handle_t *handle, struct super_block *sb)
+{
+	struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
+	int err = 0;
+
+	ext4_superblock_csum_set(sb);
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_dirty_metadata(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, line, __func__,
+						  bh, handle, err);
+	} else
+		mark_buffer_dirty(bh);
+	return err;
+}
diff --git a/kernel/fs/ext4/ext4_jbd2.h b/kernel/fs/ext4/ext4_jbd2.h
new file mode 100644
index 000000000..9c5b49fb2
--- /dev/null
+++ b/kernel/fs/ext4/ext4_jbd2.h
@@ -0,0 +1,450 @@
+/*
+ * ext4_jbd2.h
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Ext4-specific journaling extensions.
+ */
+
+#ifndef _EXT4_JBD2_H
+#define _EXT4_JBD2_H
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include "ext4.h"
+
+#define EXT4_JOURNAL(inode)	(EXT4_SB((inode)->i_sb)->s_journal)
+
+/* Define the number of blocks we need to account to a transaction to
+ * modify one block of data.
+ *
+ * We may have to touch one inode, one bitmap buffer, up to three
+ * indirection blocks, the group and superblock summaries, and the data
+ * block to complete the transaction.
+ *
+ * For extents-enabled fs we may have to allocate and modify up to
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
+
+#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
+	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
+	 ? 20U : 8U)
+
+/* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+
+#define EXT4_XATTR_TRANS_BLOCKS		6U
+
+/* Define the minimum size for a transaction which modifies data.  This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota).  The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+#define EXT4_DATA_TRANS_BLOCKS(sb)	(EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
+					 EXT4_XATTR_TRANS_BLOCKS - 2 + \
+					 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/*
+ * Define the number of metadata blocks we need to account to modify data.
+ *
+ * This include super block, inode block, quota blocks and xattr blocks
+ */
+#define EXT4_META_TRANS_BLOCKS(sb)	(EXT4_XATTR_TRANS_BLOCKS + \
+					EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/* Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction.  For unbounded transactions such as
+ * write(2) and truncate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go. */
+
+#define EXT4_MAX_TRANS_DATA		64U
+
+/* We break up a large truncate or write transaction once the handle's
+ * buffer credits gets this low, we need either to extend the
+ * transaction or to start a new one.  Reserve enough space here for
+ * inode, bitmap, superblock, group and indirection updates for at least
+ * one block, plus two quota updates.  Quota allocations are not
+ * needed. */
+
+#define EXT4_RESERVE_TRANS_BLOCKS	12U
+
+#define EXT4_INDEX_EXTRA_TRANS_BLOCKS	8
+
+#ifdef CONFIG_QUOTA
+/* Amount of blocks needed for quota update - we know that the structure was
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+		EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+		1 : 0)
+/* Amount of blocks needed for quota insert/delete - we do some block writes
+ * but inode, sb and group updates are done only once */
+#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+		EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+		(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+		 +3+DQUOT_INIT_REWRITE) : 0)
+
+#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+		EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+		(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+		 +3+DQUOT_DEL_REWRITE) : 0)
+#else
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
+#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
+#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
+#endif
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+
+static inline int ext4_jbd2_credits_xattr(struct inode *inode)
+{
+	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
+	/*
+	 * In case of inline data, we may push out the data to a block,
+	 * so we need to reserve credits for this eventuality
+	 */
+	if (ext4_has_inline_data(inode))
+		credits += ext4_writepage_trans_blocks(inode) + 1;
+	return credits;
+}
+
+
+/*
+ * Ext4 handle operation types -- for logging purposes
+ */
+#define EXT4_HT_MISC             0
+#define EXT4_HT_INODE            1
+#define EXT4_HT_WRITE_PAGE       2
+#define EXT4_HT_MAP_BLOCKS       3
+#define EXT4_HT_DIR              4
+#define EXT4_HT_TRUNCATE         5
+#define EXT4_HT_QUOTA            6
+#define EXT4_HT_RESIZE           7
+#define EXT4_HT_MIGRATE          8
+#define EXT4_HT_MOVE_EXTENTS     9
+#define EXT4_HT_XATTR           10
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
+
+/**
+ *   struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ *   This struct is a 'seed' structure for a using with your own callback
+ *   structs. If you are using callbacks you must allocate one of these
+ *   or another struct of your own definition which has this struct
+ *   as it's first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+	/* list information for other callbacks attached to the same handle */
+	struct list_head jce_list;
+
+	/*  Function to call with this callback structure */
+	void (*jce_func)(struct super_block *sb,
+			 struct ext4_journal_cb_entry *jce, int error);
+
+	/* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ *        @sb: superblock of current filesystem for transaction
+ *        @jce: returned journal callback data
+ *        @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+			void (*func)(struct super_block *sb,
+				     struct ext4_journal_cb_entry *jce,
+				     int rc),
+			struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	/* Add the jce to transaction's private list */
+	jce->jce_func = func;
+	spin_lock(&sbi->s_md_lock);
+	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ * Return true if object was successfully removed
+ */
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
+{
+	bool deleted;
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	spin_lock(&sbi->s_md_lock);
+	deleted = !list_empty(&jce->jce_list);
+	list_del_init(&jce->jce_list);
+	spin_unlock(&sbi->s_md_lock);
+	return deleted;
+}
+
+int
+ext4_mark_iloc_dirty(handle_t *handle,
+		     struct inode *inode,
+		     struct ext4_iloc *iloc);
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh.  This _must_ be cleaned up later.
+ */
+
+int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
+			struct ext4_iloc *iloc);
+
+int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+/*
+ * Wrapper functions with which ext4 calls into JBD.
+ */
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
+				    handle_t *handle, struct buffer_head *bh);
+
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
+		  int is_metadata, struct inode *inode,
+		  struct buffer_head *bh, ext4_fsblk_t blocknr);
+
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
+				handle_t *handle, struct buffer_head *bh);
+
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
+				 handle_t *handle, struct inode *inode,
+				 struct buffer_head *bh);
+
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+			      handle_t *handle, struct super_block *sb);
+
+#define ext4_journal_get_write_access(handle, bh) \
+	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
+#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
+	__ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
+		      (bh), (block_nr))
+#define ext4_journal_get_create_access(handle, bh) \
+	__ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
+#define ext4_handle_dirty_metadata(handle, inode, bh) \
+	__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
+				     (bh))
+#define ext4_handle_dirty_super(handle, sb) \
+	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks);
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
+
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
+
+/* Note:  Do not use this for NULL handles.  This is only to determine if
+ * a properly allocated handle is using a journal or not. */
+static inline int ext4_handle_valid(handle_t *handle)
+{
+	if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
+		return 0;
+	return 1;
+}
+
+static inline void ext4_handle_sync(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		handle->h_sync = 1;
+}
+
+static inline int ext4_handle_is_aborted(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		return is_handle_aborted(handle);
+	return 0;
+}
+
+static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+{
+	if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
+		return 0;
+	return 1;
+}
+
+#define ext4_journal_start_sb(sb, type, nblocks)			\
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start(inode, type, nblocks)			\
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+
+static inline handle_t *__ext4_journal_start(struct inode *inode,
+					     unsigned int line, int type,
+					     int blocks, int rsv_blocks)
+{
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
+}
+
+#define ext4_journal_stop(handle) \
+	__ext4_journal_stop(__func__, __LINE__, (handle))
+
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
+static inline handle_t *ext4_journal_current_handle(void)
+{
+	return journal_current_handle();
+}
+
+static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+{
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_extend(handle, nblocks);
+	return 0;
+}
+
+static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+{
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_restart(handle, nblocks);
+	return 0;
+}
+
+static inline int ext4_journal_blocks_per_page(struct inode *inode)
+{
+	if (EXT4_JOURNAL(inode) != NULL)
+		return jbd2_journal_blocks_per_page(inode);
+	return 0;
+}
+
+static inline int ext4_journal_force_commit(journal_t *journal)
+{
+	if (journal)
+		return jbd2_journal_force_commit(journal);
+	return 0;
+}
+
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+	return 0;
+}
+
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+						 struct inode *inode,
+						 int datasync)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		if (datasync)
+			ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+}
+
+/* super.c */
+int ext4_force_commit(struct super_block *sb);
+
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE	0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE	0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE	0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
+{
+	if (EXT4_JOURNAL(inode) == NULL)
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	/* We do not support data journalling with delayed allocation */
+	if (!S_ISREG(inode->i_mode) ||
+	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+	    !test_opt(inode->i_sb, DELALLOC))
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	else
+		BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
+}
+
+static inline int ext4_should_order_data(struct inode *inode)
+{
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
+}
+
+static inline int ext4_should_writeback_data(struct inode *inode)
+{
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
+}
+
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads.  This only works for extent-based
+ * files, and it doesn't work if data journaling is enabled, since the
+ * dioread_nolock code uses b_private to pass information back to the
+ * I/O completion handler, and this conflicts with the jbd's use of
+ * b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+	if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return 0;
+	if (ext4_should_journal_data(inode))
+		return 0;
+	return 1;
+}
+
+#endif	/* _EXT4_JBD2_H */
diff --git a/kernel/fs/ext4/extents.c b/kernel/fs/ext4/extents.c
new file mode 100644
index 000000000..e003a1e81
--- /dev/null
+++ b/kernel/fs/ext4/extents.c
@@ -0,0 +1,5707 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * Architecture independence:
+ *   Copyright (c) 2005, Bull S.A.
+ *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
+ */
+
+/*
+ * Extents support for EXT4
+ *
+ * TODO:
+ *   - ext4*_error() should be used in some situations
+ *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
+ *   - smart tree reduction
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/fiemap.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "xattr.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
+					due to ENOSPC */
+#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
+#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */
+
+#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
+#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */
+
+static __le32 ext4_extent_block_csum(struct inode *inode,
+				     struct ext4_extent_header *eh)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	__u32 csum;
+
+	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+			   EXT4_EXTENT_TAIL_OFFSET(eh));
+	return cpu_to_le32(csum);
+}
+
+static int ext4_extent_block_csum_verify(struct inode *inode,
+					 struct ext4_extent_header *eh)
+{
+	struct ext4_extent_tail *et;
+
+	if (!ext4_has_metadata_csum(inode->i_sb))
+		return 1;
+
+	et = find_ext4_extent_tail(eh);
+	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
+		return 0;
+	return 1;
+}
+
+static void ext4_extent_block_csum_set(struct inode *inode,
+				       struct ext4_extent_header *eh)
+{
+	struct ext4_extent_tail *et;
+
+	if (!ext4_has_metadata_csum(inode->i_sb))
+		return;
+
+	et = find_ext4_extent_tail(eh);
+	et->et_checksum = ext4_extent_block_csum(inode, eh);
+}
+
+static int ext4_split_extent(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path **ppath,
+				struct ext4_map_blocks *map,
+				int split_flag,
+				int flags);
+
+static int ext4_split_extent_at(handle_t *handle,
+			     struct inode *inode,
+			     struct ext4_ext_path **ppath,
+			     ext4_lblk_t split,
+			     int split_flag,
+			     int flags);
+
+static int ext4_find_delayed_extent(struct inode *inode,
+				    struct extent_status *newes);
+
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+					    struct inode *inode,
+					    int needed)
+{
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+	if (handle->h_buffer_credits > needed)
+		return 0;
+	err = ext4_journal_extend(handle, needed);
+	if (err <= 0)
+		return err;
+	err = ext4_truncate_restart_trans(handle, inode, needed);
+	if (err == 0)
+		err = -EAGAIN;
+
+	return err;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ */
+static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path *path)
+{
+	if (path->p_bh) {
+		/* path points to block */
+		BUFFER_TRACE(path->p_bh, "get_write_access");
+		return ext4_journal_get_write_access(handle, path->p_bh);
+	}
+	/* path points to leaf/index in inode body */
+	/* we use in-core data, no need to protect them */
+	return 0;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ *  - EIO
+ */
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path)
+{
+	int err;
+
+	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
+	if (path->p_bh) {
+		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
+		/* path points to block */
+		err = __ext4_handle_dirty_metadata(where, line, handle,
+						   inode, path->p_bh);
+	} else {
+		/* path points to leaf/index in inode body */
+		err = ext4_mark_inode_dirty(handle, inode);
+	}
+	return err;
+}
+
+static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
+			      struct ext4_ext_path *path,
+			      ext4_lblk_t block)
+{
+	if (path) {
+		int depth = path->p_depth;
+		struct ext4_extent *ex;
+
+		/*
+		 * Try to predict block placement assuming that we are
+		 * filling in a file which will eventually be
+		 * non-sparse --- i.e., in the case of libbfd writing
+		 * an ELF object sections out-of-order but in a way
+		 * the eventually results in a contiguous object or
+		 * executable file, or some database extending a table
+		 * space file.  However, this is actually somewhat
+		 * non-ideal if we are writing a sparse file such as
+		 * qemu or KVM writing a raw image file that is going
+		 * to stay fairly sparse, since it will end up
+		 * fragmenting the file system's free space.  Maybe we
+		 * should have some hueristics or some way to allow
+		 * userspace to pass a hint to file system,
+		 * especially if the latter case turns out to be
+		 * common.
+		 */
+		ex = path[depth].p_ext;
+		if (ex) {
+			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+			if (block > ext_block)
+				return ext_pblk + (block - ext_block);
+			else
+				return ext_pblk - (ext_block - block);
+		}
+
+		/* it looks like index is empty;
+		 * try to find starting block from index itself */
+		if (path[depth].p_bh)
+			return path[depth].p_bh->b_blocknr;
+	}
+
+	/* OK. use inode's group */
+	return ext4_inode_to_goal_block(inode);
+}
+
+/*
+ * Allocation for a meta data block
+ */
+static ext4_fsblk_t
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
+			struct ext4_ext_path *path,
+			struct ext4_extent *ex, int *err, unsigned int flags)
+{
+	ext4_fsblk_t goal, newblock;
+
+	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, err);
+	return newblock;
+}
+
+static inline int ext4_ext_space_block(struct inode *inode, int check)
+{
+	int size;
+
+	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+			/ sizeof(struct ext4_extent);
+#ifdef AGGRESSIVE_TEST
+	if (!check && size > 6)
+		size = 6;
+#endif
+	return size;
+}
+
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
+{
+	int size;
+
+	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+			/ sizeof(struct ext4_extent_idx);
+#ifdef AGGRESSIVE_TEST
+	if (!check && size > 5)
+		size = 5;
+#endif
+	return size;
+}
+
+static inline int ext4_ext_space_root(struct inode *inode, int check)
+{
+	int size;
+
+	size = sizeof(EXT4_I(inode)->i_data);
+	size -= sizeof(struct ext4_extent_header);
+	size /= sizeof(struct ext4_extent);
+#ifdef AGGRESSIVE_TEST
+	if (!check && size > 3)
+		size = 3;
+#endif
+	return size;
+}
+
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
+{
+	int size;
+
+	size = sizeof(EXT4_I(inode)->i_data);
+	size -= sizeof(struct ext4_extent_header);
+	size /= sizeof(struct ext4_extent_idx);
+#ifdef AGGRESSIVE_TEST
+	if (!check && size > 4)
+		size = 4;
+#endif
+	return size;
+}
+
+static inline int
+ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
+			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+			   int nofail)
+{
+	struct ext4_ext_path *path = *ppath;
+	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
+
+	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
+			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
+			EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
+			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
+}
+
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int idxs;
+
+	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+		/ sizeof(struct ext4_extent_idx));
+
+	/*
+	 * If the new delayed allocation block is contiguous with the
+	 * previous da block, it can share index blocks with the
+	 * previous block, so we only need to allocate a new index
+	 * block every idxs leaf blocks.  At ldxs**2 blocks, we need
+	 * an additional index block, and at ldxs**3 blocks, yet
+	 * another index blocks.
+	 */
+	if (ei->i_da_metadata_calc_len &&
+	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+		int num = 0;
+
+		if ((ei->i_da_metadata_calc_len % idxs) == 0)
+			num++;
+		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
+			num++;
+		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
+			num++;
+			ei->i_da_metadata_calc_len = 0;
+		} else
+			ei->i_da_metadata_calc_len++;
+		ei->i_da_metadata_calc_last_lblock++;
+		return num;
+	}
+
+	/*
+	 * In the worst case we need a new set of index blocks at
+	 * every level of the inode's extent tree.
+	 */
+	ei->i_da_metadata_calc_len = 1;
+	ei->i_da_metadata_calc_last_lblock = lblock;
+	return ext_depth(inode) + 1;
+}
+
+static int
+ext4_ext_max_entries(struct inode *inode, int depth)
+{
+	int max;
+
+	if (depth == ext_depth(inode)) {
+		if (depth == 0)
+			max = ext4_ext_space_root(inode, 1);
+		else
+			max = ext4_ext_space_root_idx(inode, 1);
+	} else {
+		if (depth == 0)
+			max = ext4_ext_space_block(inode, 1);
+		else
+			max = ext4_ext_space_block_idx(inode, 1);
+	}
+
+	return max;
+}
+
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+	ext4_fsblk_t block = ext4_ext_pblock(ext);
+	int len = ext4_ext_get_actual_len(ext);
+	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+	ext4_lblk_t last = lblock + len - 1;
+
+	if (len == 0 || lblock > last)
+		return 0;
+	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+				struct ext4_extent_idx *ext_idx)
+{
+	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
+
+	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+				struct ext4_extent_header *eh,
+				int depth)
+{
+	unsigned short entries;
+	if (eh->eh_entries == 0)
+		return 1;
+
+	entries = le16_to_cpu(eh->eh_entries);
+
+	if (depth == 0) {
+		/* leaf entries */
+		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+		ext4_fsblk_t pblock = 0;
+		ext4_lblk_t lblock = 0;
+		ext4_lblk_t prev = 0;
+		int len = 0;
+		while (entries) {
+			if (!ext4_valid_extent(inode, ext))
+				return 0;
+
+			/* Check for overlapping extents */
+			lblock = le32_to_cpu(ext->ee_block);
+			len = ext4_ext_get_actual_len(ext);
+			if ((lblock <= prev) && prev) {
+				pblock = ext4_ext_pblock(ext);
+				es->s_last_error_block = cpu_to_le64(pblock);
+				return 0;
+			}
+			ext++;
+			entries--;
+			prev = lblock + len - 1;
+		}
+	} else {
+		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+		while (entries) {
+			if (!ext4_valid_extent_idx(inode, ext_idx))
+				return 0;
+			ext_idx++;
+			entries--;
+		}
+	}
+	return 1;
+}
+
+static int __ext4_ext_check(const char *function, unsigned int line,
+			    struct inode *inode, struct ext4_extent_header *eh,
+			    int depth, ext4_fsblk_t pblk)
+{
+	const char *error_msg;
+	int max = 0;
+
+	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
+		error_msg = "invalid magic";
+		goto corrupted;
+	}
+	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
+		error_msg = "unexpected eh_depth";
+		goto corrupted;
+	}
+	if (unlikely(eh->eh_max == 0)) {
+		error_msg = "invalid eh_max";
+		goto corrupted;
+	}
+	max = ext4_ext_max_entries(inode, depth);
+	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
+		error_msg = "too large eh_max";
+		goto corrupted;
+	}
+	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
+		error_msg = "invalid eh_entries";
+		goto corrupted;
+	}
+	if (!ext4_valid_extent_entries(inode, eh, depth)) {
+		error_msg = "invalid extent entries";
+		goto corrupted;
+	}
+	/* Verify checksum on non-root extent tree nodes */
+	if (ext_depth(inode) != depth &&
+	    !ext4_extent_block_csum_verify(inode, eh)) {
+		error_msg = "extent tree corrupted";
+		goto corrupted;
+	}
+	return 0;
+
+corrupted:
+	ext4_error_inode(inode, function, line, 0,
+			 "pblk %llu bad header/extent: %s - magic %x, "
+			 "entries %u, max %u(%u), depth %u(%u)",
+			 (unsigned long long) pblk, error_msg,
+			 le16_to_cpu(eh->eh_magic),
+			 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+			 max, le16_to_cpu(eh->eh_depth), depth);
+	return -EIO;
+}
+
+#define ext4_ext_check(inode, eh, depth, pblk)			\
+	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
+}
+
+static struct buffer_head *
+__read_extent_tree_block(const char *function, unsigned int line,
+			 struct inode *inode, ext4_fsblk_t pblk, int depth,
+			 int flags)
+{
+	struct buffer_head		*bh;
+	int				err;
+
+	bh = sb_getblk(inode->i_sb, pblk);
+	if (unlikely(!bh))
+		return ERR_PTR(-ENOMEM);
+
+	if (!bh_uptodate_or_lock(bh)) {
+		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
+		err = bh_submit_read(bh);
+		if (err < 0)
+			goto errout;
+	}
+	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
+		return bh;
+	err = __ext4_ext_check(function, line, inode,
+			       ext_block_hdr(bh), depth, pblk);
+	if (err)
+		goto errout;
+	set_buffer_verified(bh);
+	/*
+	 * If this is a leaf block, cache all of its entries
+	 */
+	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
+		struct ext4_extent_header *eh = ext_block_hdr(bh);
+		struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+		ext4_lblk_t prev = 0;
+		int i;
+
+		for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+			unsigned int status = EXTENT_STATUS_WRITTEN;
+			ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+			int len = ext4_ext_get_actual_len(ex);
+
+			if (prev && (prev != lblk))
+				ext4_es_cache_extent(inode, prev,
+						     lblk - prev, ~0,
+						     EXTENT_STATUS_HOLE);
+
+			if (ext4_ext_is_unwritten(ex))
+				status = EXTENT_STATUS_UNWRITTEN;
+			ext4_es_cache_extent(inode, lblk, len,
+					     ext4_ext_pblock(ex), status);
+			prev = lblk + len;
+		}
+	}
+	return bh;
+errout:
+	put_bh(bh);
+	return ERR_PTR(err);
+
+}
+
+#define read_extent_tree_block(inode, pblk, depth, flags)		\
+	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
+				 (depth), (flags))
+
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_ext_path *path = NULL;
+	struct buffer_head *bh;
+	int i = 0, depth, ret = 0;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return 0;	/* not an extent-mapped inode */
+
+	down_read(&ei->i_data_sem);
+	depth = ext_depth(inode);
+
+	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+		       GFP_NOFS);
+	if (path == NULL) {
+		up_read(&ei->i_data_sem);
+		return -ENOMEM;
+	}
+
+	/* Don't cache anything if there are no external extent blocks */
+	if (depth == 0)
+		goto out;
+	path[0].p_hdr = ext_inode_hdr(inode);
+	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+	if (ret)
+		goto out;
+	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+	while (i >= 0) {
+		/*
+		 * If this is a leaf block or we've reached the end of
+		 * the index block, go up
+		 */
+		if ((i == depth) ||
+		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			continue;
+		}
+		bh = read_extent_tree_block(inode,
+					    ext4_idx_pblock(path[i].p_idx++),
+					    depth - i - 1,
+					    EXT4_EX_FORCE_CACHE);
+		if (IS_ERR(bh)) {
+			ret = PTR_ERR(bh);
+			break;
+		}
+		i++;
+		path[i].p_bh = bh;
+		path[i].p_hdr = ext_block_hdr(bh);
+		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+	}
+	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+	up_read(&ei->i_data_sem);
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	return ret;
+}
+
+#ifdef EXT_DEBUG
+static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
+{
+	int k, l = path->p_depth;
+
+	ext_debug("path:");
+	for (k = 0; k <= l; k++, path++) {
+		if (path->p_idx) {
+		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
+			    ext4_idx_pblock(path->p_idx));
+		} else if (path->p_ext) {
+			ext_debug("  %d:[%d]%d:%llu ",
+				  le32_to_cpu(path->p_ext->ee_block),
+				  ext4_ext_is_unwritten(path->p_ext),
+				  ext4_ext_get_actual_len(path->p_ext),
+				  ext4_ext_pblock(path->p_ext));
+		} else
+			ext_debug("  []");
+	}
+	ext_debug("\n");
+}
+
+static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent_header *eh;
+	struct ext4_extent *ex;
+	int i;
+
+	if (!path)
+		return;
+
+	eh = path[depth].p_hdr;
+	ex = EXT_FIRST_EXTENT(eh);
+
+	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
+	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
+		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+			  ext4_ext_is_unwritten(ex),
+			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
+	}
+	ext_debug("\n");
+}
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+			ext4_fsblk_t newblock, int level)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+
+	if (depth != level) {
+		struct ext4_extent_idx *idx;
+		idx = path[level].p_idx;
+		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+			ext_debug("%d: move %d:%llu in new index %llu\n", level,
+					le32_to_cpu(idx->ei_block),
+					ext4_idx_pblock(idx),
+					newblock);
+			idx++;
+		}
+
+		return;
+	}
+
+	ex = path[depth].p_ext;
+	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_pblock(ex),
+				ext4_ext_is_unwritten(ex),
+				ext4_ext_get_actual_len(ex),
+				newblock);
+		ex++;
+	}
+}
+
+#else
+#define ext4_ext_show_path(inode, path)
+#define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
+#endif
+
+void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+	int depth, i;
+
+	if (!path)
+		return;
+	depth = path->p_depth;
+	for (i = 0; i <= depth; i++, path++)
+		if (path->p_bh) {
+			brelse(path->p_bh);
+			path->p_bh = NULL;
+		}
+}
+
+/*
+ * ext4_ext_binsearch_idx:
+ * binary search for the closest index of the given block
+ * the header must be checked before calling this
+ */
+static void
+ext4_ext_binsearch_idx(struct inode *inode,
+			struct ext4_ext_path *path, ext4_lblk_t block)
+{
+	struct ext4_extent_header *eh = path->p_hdr;
+	struct ext4_extent_idx *r, *l, *m;
+
+
+	ext_debug("binsearch for %u(idx):  ", block);
+
+	l = EXT_FIRST_INDEX(eh) + 1;
+	r = EXT_LAST_INDEX(eh);
+	while (l <= r) {
+		m = l + (r - l) / 2;
+		if (block < le32_to_cpu(m->ei_block))
+			r = m - 1;
+		else
+			l = m + 1;
+		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
+				m, le32_to_cpu(m->ei_block),
+				r, le32_to_cpu(r->ei_block));
+	}
+
+	path->p_idx = l - 1;
+	ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
+		  ext4_idx_pblock(path->p_idx));
+
+#ifdef CHECK_BINSEARCH
+	{
+		struct ext4_extent_idx *chix, *ix;
+		int k;
+
+		chix = ix = EXT_FIRST_INDEX(eh);
+		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
+		  if (k != 0 &&
+		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
+				printk(KERN_DEBUG "k=%d, ix=0x%p, "
+				       "first=0x%p\n", k,
+				       ix, EXT_FIRST_INDEX(eh));
+				printk(KERN_DEBUG "%u <= %u\n",
+				       le32_to_cpu(ix->ei_block),
+				       le32_to_cpu(ix[-1].ei_block));
+			}
+			BUG_ON(k && le32_to_cpu(ix->ei_block)
+					   <= le32_to_cpu(ix[-1].ei_block));
+			if (block < le32_to_cpu(ix->ei_block))
+				break;
+			chix = ix;
+		}
+		BUG_ON(chix != path->p_idx);
+	}
+#endif
+
+}
+
+/*
+ * ext4_ext_binsearch:
+ * binary search for closest extent of the given block
+ * the header must be checked before calling this
+ */
+static void
+ext4_ext_binsearch(struct inode *inode,
+		struct ext4_ext_path *path, ext4_lblk_t block)
+{
+	struct ext4_extent_header *eh = path->p_hdr;
+	struct ext4_extent *r, *l, *m;
+
+	if (eh->eh_entries == 0) {
+		/*
+		 * this leaf is empty:
+		 * we get such a leaf in split/add case
+		 */
+		return;
+	}
+
+	ext_debug("binsearch for %u:  ", block);
+
+	l = EXT_FIRST_EXTENT(eh) + 1;
+	r = EXT_LAST_EXTENT(eh);
+
+	while (l <= r) {
+		m = l + (r - l) / 2;
+		if (block < le32_to_cpu(m->ee_block))
+			r = m - 1;
+		else
+			l = m + 1;
+		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
+				m, le32_to_cpu(m->ee_block),
+				r, le32_to_cpu(r->ee_block));
+	}
+
+	path->p_ext = l - 1;
+	ext_debug("  -> %d:%llu:[%d]%d ",
+			le32_to_cpu(path->p_ext->ee_block),
+			ext4_ext_pblock(path->p_ext),
+			ext4_ext_is_unwritten(path->p_ext),
+			ext4_ext_get_actual_len(path->p_ext));
+
+#ifdef CHECK_BINSEARCH
+	{
+		struct ext4_extent *chex, *ex;
+		int k;
+
+		chex = ex = EXT_FIRST_EXTENT(eh);
+		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
+			BUG_ON(k && le32_to_cpu(ex->ee_block)
+					  <= le32_to_cpu(ex[-1].ee_block));
+			if (block < le32_to_cpu(ex->ee_block))
+				break;
+			chex = ex;
+		}
+		BUG_ON(chex != path->p_ext);
+	}
+#endif
+
+}
+
+int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
+{
+	struct ext4_extent_header *eh;
+
+	eh = ext_inode_hdr(inode);
+	eh->eh_depth = 0;
+	eh->eh_entries = 0;
+	eh->eh_magic = EXT4_EXT_MAGIC;
+	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
+	ext4_mark_inode_dirty(handle, inode);
+	return 0;
+}
+
+struct ext4_ext_path *
+ext4_find_extent(struct inode *inode, ext4_lblk_t block,
+		 struct ext4_ext_path **orig_path, int flags)
+{
+	struct ext4_extent_header *eh;
+	struct buffer_head *bh;
+	struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
+	short int depth, i, ppos = 0;
+	int ret;
+
+	eh = ext_inode_hdr(inode);
+	depth = ext_depth(inode);
+
+	if (path) {
+		ext4_ext_drop_refs(path);
+		if (depth > path[0].p_maxdepth) {
+			kfree(path);
+			*orig_path = path = NULL;
+		}
+	}
+	if (!path) {
+		/* account possible depth increase */
+		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
+				GFP_NOFS);
+		if (unlikely(!path))
+			return ERR_PTR(-ENOMEM);
+		path[0].p_maxdepth = depth + 1;
+	}
+	path[0].p_hdr = eh;
+	path[0].p_bh = NULL;
+
+	i = depth;
+	/* walk through the tree */
+	while (i) {
+		ext_debug("depth %d: num %d, max %d\n",
+			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+
+		ext4_ext_binsearch_idx(inode, path + ppos, block);
+		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
+		path[ppos].p_depth = i;
+		path[ppos].p_ext = NULL;
+
+		bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
+					    flags);
+		if (unlikely(IS_ERR(bh))) {
+			ret = PTR_ERR(bh);
+			goto err;
+		}
+
+		eh = ext_block_hdr(bh);
+		ppos++;
+		if (unlikely(ppos > depth)) {
+			put_bh(bh);
+			EXT4_ERROR_INODE(inode,
+					 "ppos %d > depth %d", ppos, depth);
+			ret = -EIO;
+			goto err;
+		}
+		path[ppos].p_bh = bh;
+		path[ppos].p_hdr = eh;
+	}
+
+	path[ppos].p_depth = i;
+	path[ppos].p_ext = NULL;
+	path[ppos].p_idx = NULL;
+
+	/* find extent */
+	ext4_ext_binsearch(inode, path + ppos, block);
+	/* if not an empty leaf */
+	if (path[ppos].p_ext)
+		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
+
+	ext4_ext_show_path(inode, path);
+
+	return path;
+
+err:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	if (orig_path)
+		*orig_path = NULL;
+	return ERR_PTR(ret);
+}
+
+/*
+ * ext4_ext_insert_index:
+ * insert new index [@logical;@ptr] into the block at @curp;
+ * check where to insert: before @curp or after @curp
+ */
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+				 struct ext4_ext_path *curp,
+				 int logical, ext4_fsblk_t ptr)
+{
+	struct ext4_extent_idx *ix;
+	int len, err;
+
+	err = ext4_ext_get_access(handle, inode, curp);
+	if (err)
+		return err;
+
+	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d == ei_block %d!",
+				 logical, le32_to_cpu(curp->p_idx->ei_block));
+		return -EIO;
+	}
+
+	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
+		EXT4_ERROR_INODE(inode,
+				 "eh_entries %d >= eh_max %d!",
+				 le16_to_cpu(curp->p_hdr->eh_entries),
+				 le16_to_cpu(curp->p_hdr->eh_max));
+		return -EIO;
+	}
+
+	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
+		/* insert after */
+		ext_debug("insert new index %d after: %llu\n", logical, ptr);
+		ix = curp->p_idx + 1;
+	} else {
+		/* insert before */
+		ext_debug("insert new index %d before: %llu\n", logical, ptr);
+		ix = curp->p_idx;
+	}
+
+	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+	BUG_ON(len < 0);
+	if (len > 0) {
+		ext_debug("insert new index %d: "
+				"move %d indices from 0x%p to 0x%p\n",
+				logical, len, ix, ix + 1);
+		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+	}
+
+	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+		return -EIO;
+	}
+
+	ix->ei_block = cpu_to_le32(logical);
+	ext4_idx_store_pblock(ix, ptr);
+	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
+
+	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
+		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
+		return -EIO;
+	}
+
+	err = ext4_ext_dirty(handle, inode, curp);
+	ext4_std_error(inode->i_sb, err);
+
+	return err;
+}
+
+/*
+ * ext4_ext_split:
+ * inserts new subtree into the path, using free index entry
+ * at depth @at:
+ * - allocates all needed blocks (new leaf and all intermediate index blocks)
+ * - makes decision where to split
+ * - moves remaining extents and index entries (right to the split point)
+ *   into the newly allocated blocks
+ * - initializes subtree
+ */
+static int ext4_ext_split(handle_t *handle, struct inode *inode,
+			  unsigned int flags,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *newext, int at)
+{
+	struct buffer_head *bh = NULL;
+	int depth = ext_depth(inode);
+	struct ext4_extent_header *neh;
+	struct ext4_extent_idx *fidx;
+	int i = at, k, m, a;
+	ext4_fsblk_t newblock, oldblock;
+	__le32 border;
+	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
+	int err = 0;
+
+	/* make decision: where to split? */
+	/* FIXME: now decision is simplest: at current extent */
+
+	/* if current leaf will be split, then we should use
+	 * border from split point */
+	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
+		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
+		return -EIO;
+	}
+	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		border = path[depth].p_ext[1].ee_block;
+		ext_debug("leaf will be split."
+				" next leaf starts at %d\n",
+				  le32_to_cpu(border));
+	} else {
+		border = newext->ee_block;
+		ext_debug("leaf will be added."
+				" next leaf starts at %d\n",
+				le32_to_cpu(border));
+	}
+
+	/*
+	 * If error occurs, then we break processing
+	 * and mark filesystem read-only. index won't
+	 * be inserted and tree will be in consistent
+	 * state. Next mount will repair buffers too.
+	 */
+
+	/*
+	 * Get array to track all allocated blocks.
+	 * We need this to handle errors and free blocks
+	 * upon them.
+	 */
+	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
+	if (!ablocks)
+		return -ENOMEM;
+
+	/* allocate all needed blocks */
+	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
+	for (a = 0; a < depth - at; a++) {
+		newblock = ext4_ext_new_meta_block(handle, inode, path,
+						   newext, &err, flags);
+		if (newblock == 0)
+			goto cleanup;
+		ablocks[a] = newblock;
+	}
+
+	/* initialize new leaf */
+	newblock = ablocks[--a];
+	if (unlikely(newblock == 0)) {
+		EXT4_ERROR_INODE(inode, "newblock == 0!");
+		err = -EIO;
+		goto cleanup;
+	}
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (unlikely(!bh)) {
+		err = -ENOMEM;
+		goto cleanup;
+	}
+	lock_buffer(bh);
+
+	err = ext4_journal_get_create_access(handle, bh);
+	if (err)
+		goto cleanup;
+
+	neh = ext_block_hdr(bh);
+	neh->eh_entries = 0;
+	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
+	neh->eh_magic = EXT4_EXT_MAGIC;
+	neh->eh_depth = 0;
+
+	/* move remainder of path[depth] to the new leaf */
+	if (unlikely(path[depth].p_hdr->eh_entries !=
+		     path[depth].p_hdr->eh_max)) {
+		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
+				 path[depth].p_hdr->eh_entries,
+				 path[depth].p_hdr->eh_max);
+		err = -EIO;
+		goto cleanup;
+	}
+	/* start copy from next extent */
+	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+	ext4_ext_show_move(inode, path, newblock, depth);
+	if (m) {
+		struct ext4_extent *ex;
+		ex = EXT_FIRST_EXTENT(neh);
+		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
+		le16_add_cpu(&neh->eh_entries, m);
+	}
+
+	ext4_extent_block_csum_set(inode, neh);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	err = ext4_handle_dirty_metadata(handle, inode, bh);
+	if (err)
+		goto cleanup;
+	brelse(bh);
+	bh = NULL;
+
+	/* correct old leaf */
+	if (m) {
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto cleanup;
+		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto cleanup;
+
+	}
+
+	/* create intermediate indexes */
+	k = depth - at - 1;
+	if (unlikely(k < 0)) {
+		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
+		err = -EIO;
+		goto cleanup;
+	}
+	if (k)
+		ext_debug("create %d intermediate indices\n", k);
+	/* insert new index into current index block */
+	/* current depth stored in i var */
+	i = depth - 1;
+	while (k--) {
+		oldblock = newblock;
+		newblock = ablocks[--a];
+		bh = sb_getblk(inode->i_sb, newblock);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			goto cleanup;
+		}
+		lock_buffer(bh);
+
+		err = ext4_journal_get_create_access(handle, bh);
+		if (err)
+			goto cleanup;
+
+		neh = ext_block_hdr(bh);
+		neh->eh_entries = cpu_to_le16(1);
+		neh->eh_magic = EXT4_EXT_MAGIC;
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
+		neh->eh_depth = cpu_to_le16(depth - i);
+		fidx = EXT_FIRST_INDEX(neh);
+		fidx->ei_block = border;
+		ext4_idx_store_pblock(fidx, oldblock);
+
+		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
+				i, newblock, le32_to_cpu(border), oldblock);
+
+		/* move remainder of path[i] to the new index block */
+		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
+					EXT_LAST_INDEX(path[i].p_hdr))) {
+			EXT4_ERROR_INODE(inode,
+					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
+					 le32_to_cpu(path[i].p_ext->ee_block));
+			err = -EIO;
+			goto cleanup;
+		}
+		/* start copy indexes */
+		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		ext4_ext_show_move(inode, path, newblock, i);
+		if (m) {
+			memmove(++fidx, path[i].p_idx,
+				sizeof(struct ext4_extent_idx) * m);
+			le16_add_cpu(&neh->eh_entries, m);
+		}
+		ext4_extent_block_csum_set(inode, neh);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (err)
+			goto cleanup;
+		brelse(bh);
+		bh = NULL;
+
+		/* correct old index */
+		if (m) {
+			err = ext4_ext_get_access(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
+			err = ext4_ext_dirty(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+		}
+
+		i--;
+	}
+
+	/* insert new index */
+	err = ext4_ext_insert_index(handle, inode, path + at,
+				    le32_to_cpu(border), newblock);
+
+cleanup:
+	if (bh) {
+		if (buffer_locked(bh))
+			unlock_buffer(bh);
+		brelse(bh);
+	}
+
+	if (err) {
+		/* free all allocated blocks in error case */
+		for (i = 0; i < depth; i++) {
+			if (!ablocks[i])
+				continue;
+			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
+					 EXT4_FREE_BLOCKS_METADATA);
+		}
+	}
+	kfree(ablocks);
+
+	return err;
+}
+
+/*
+ * ext4_ext_grow_indepth:
+ * implements tree growing procedure:
+ * - allocates new block
+ * - moves top-level data (index block or leaf) into the new block
+ * - initializes new top-level, creating index that points to the
+ *   just created block
+ */
+static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
+				 unsigned int flags)
+{
+	struct ext4_extent_header *neh;
+	struct buffer_head *bh;
+	ext4_fsblk_t newblock, goal = 0;
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	int err = 0;
+
+	/* Try to prepend new index to old one */
+	if (ext_depth(inode))
+		goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
+	if (goal > le32_to_cpu(es->s_first_data_block)) {
+		flags |= EXT4_MB_HINT_TRY_GOAL;
+		goal--;
+	} else
+		goal = ext4_inode_to_goal_block(inode);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, &err);
+	if (newblock == 0)
+		return err;
+
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (unlikely(!bh))
+		return -ENOMEM;
+	lock_buffer(bh);
+
+	err = ext4_journal_get_create_access(handle, bh);
+	if (err) {
+		unlock_buffer(bh);
+		goto out;
+	}
+
+	/* move top-level index/leaf into new block */
+	memmove(bh->b_data, EXT4_I(inode)->i_data,
+		sizeof(EXT4_I(inode)->i_data));
+
+	/* set size of new block */
+	neh = ext_block_hdr(bh);
+	/* old root could have indexes or leaves
+	 * so calculate e_max right way */
+	if (ext_depth(inode))
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
+	else
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
+	neh->eh_magic = EXT4_EXT_MAGIC;
+	ext4_extent_block_csum_set(inode, neh);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	err = ext4_handle_dirty_metadata(handle, inode, bh);
+	if (err)
+		goto out;
+
+	/* Update top-level index: num,max,pointer */
+	neh = ext_inode_hdr(inode);
+	neh->eh_entries = cpu_to_le16(1);
+	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
+	if (neh->eh_depth == 0) {
+		/* Root extent block becomes index block */
+		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
+		EXT_FIRST_INDEX(neh)->ei_block =
+			EXT_FIRST_EXTENT(neh)->ee_block;
+	}
+	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
+		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
+		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
+		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
+
+	le16_add_cpu(&neh->eh_depth, 1);
+	ext4_mark_inode_dirty(handle, inode);
+out:
+	brelse(bh);
+
+	return err;
+}
+
+/*
+ * ext4_ext_create_new_leaf:
+ * finds empty index and adds new leaf.
+ * if no free index is found, then it requests in-depth growing.
+ */
+static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+				    unsigned int mb_flags,
+				    unsigned int gb_flags,
+				    struct ext4_ext_path **ppath,
+				    struct ext4_extent *newext)
+{
+	struct ext4_ext_path *path = *ppath;
+	struct ext4_ext_path *curp;
+	int depth, i, err = 0;
+
+repeat:
+	i = depth = ext_depth(inode);
+
+	/* walk up to the tree and look for free index entry */
+	curp = path + depth;
+	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
+		i--;
+		curp--;
+	}
+
+	/* we use already allocated block for index block,
+	 * so subsequent data blocks should be contiguous */
+	if (EXT_HAS_FREE_INDEX(curp)) {
+		/* if we found index with free entry, then use that
+		 * entry: create all needed subtree and add new leaf */
+		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
+		if (err)
+			goto out;
+
+		/* refill path */
+		path = ext4_find_extent(inode,
+				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
+				    ppath, gb_flags);
+		if (IS_ERR(path))
+			err = PTR_ERR(path);
+	} else {
+		/* tree is full, time to grow in depth */
+		err = ext4_ext_grow_indepth(handle, inode, mb_flags);
+		if (err)
+			goto out;
+
+		/* refill path */
+		path = ext4_find_extent(inode,
+				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
+				    ppath, gb_flags);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+
+		/*
+		 * only first (depth 0 -> 1) produces free space;
+		 * in all other cases we have to split the grown tree
+		 */
+		depth = ext_depth(inode);
+		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
+			/* now we need to split */
+			goto repeat;
+		}
+	}
+
+out:
+	return err;
+}
+
+/*
+ * search the closest allocated block to the left for *logical
+ * and returns it at @logical + it's physical address at @phys
+ * if *logical is the smallest allocated block, the function
+ * returns 0 at @phys
+ * return value contains 0 (success) or error code
+ */
+static int ext4_ext_search_left(struct inode *inode,
+				struct ext4_ext_path *path,
+				ext4_lblk_t *logical, ext4_fsblk_t *phys)
+{
+	struct ext4_extent_idx *ix;
+	struct ext4_extent *ex;
+	int depth, ee_len;
+
+	if (unlikely(path == NULL)) {
+		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+		return -EIO;
+	}
+	depth = path->p_depth;
+	*phys = 0;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return 0;
+
+	/* usually extent in the path covers blocks smaller
+	 * then *logical, but it can be that extent is the
+	 * first one in the file */
+
+	ex = path[depth].p_ext;
+	ee_len = ext4_ext_get_actual_len(ex);
+	if (*logical < le32_to_cpu(ex->ee_block)) {
+		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+			EXT4_ERROR_INODE(inode,
+					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
+					 *logical, le32_to_cpu(ex->ee_block));
+			return -EIO;
+		}
+		while (--depth >= 0) {
+			ix = path[depth].p_idx;
+			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+				EXT4_ERROR_INODE(inode,
+				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
+				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
+				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
+		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
+				  depth);
+				return -EIO;
+			}
+		}
+		return 0;
+	}
+
+	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d < ee_block %d + ee_len %d!",
+				 *logical, le32_to_cpu(ex->ee_block), ee_len);
+		return -EIO;
+	}
+
+	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
+	*phys = ext4_ext_pblock(ex) + ee_len - 1;
+	return 0;
+}
+
+/*
+ * search the closest allocated block to the right for *logical
+ * and returns it at @logical + it's physical address at @phys
+ * if *logical is the largest allocated block, the function
+ * returns 0 at @phys
+ * return value contains 0 (success) or error code
+ */
+static int ext4_ext_search_right(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
+				 struct ext4_extent **ret_ex)
+{
+	struct buffer_head *bh = NULL;
+	struct ext4_extent_header *eh;
+	struct ext4_extent_idx *ix;
+	struct ext4_extent *ex;
+	ext4_fsblk_t block;
+	int depth;	/* Note, NOT eh_depth; depth from top of tree */
+	int ee_len;
+
+	if (unlikely(path == NULL)) {
+		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+		return -EIO;
+	}
+	depth = path->p_depth;
+	*phys = 0;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return 0;
+
+	/* usually extent in the path covers blocks smaller
+	 * then *logical, but it can be that extent is the
+	 * first one in the file */
+
+	ex = path[depth].p_ext;
+	ee_len = ext4_ext_get_actual_len(ex);
+	if (*logical < le32_to_cpu(ex->ee_block)) {
+		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+			EXT4_ERROR_INODE(inode,
+					 "first_extent(path[%d].p_hdr) != ex",
+					 depth);
+			return -EIO;
+		}
+		while (--depth >= 0) {
+			ix = path[depth].p_idx;
+			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+				EXT4_ERROR_INODE(inode,
+						 "ix != EXT_FIRST_INDEX *logical %d!",
+						 *logical);
+				return -EIO;
+			}
+		}
+		goto found_extent;
+	}
+
+	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d < ee_block %d + ee_len %d!",
+				 *logical, le32_to_cpu(ex->ee_block), ee_len);
+		return -EIO;
+	}
+
+	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
+		/* next allocated block in this leaf */
+		ex++;
+		goto found_extent;
+	}
+
+	/* go up and search for index to the right */
+	while (--depth >= 0) {
+		ix = path[depth].p_idx;
+		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
+			goto got_index;
+	}
+
+	/* we've gone up to the root and found no index to the right */
+	return 0;
+
+got_index:
+	/* we've found index to the right, let's
+	 * follow it and find the closest allocated
+	 * block to the right */
+	ix++;
+	block = ext4_idx_pblock(ix);
+	while (++depth < path->p_depth) {
+		/* subtract from p_depth to get proper eh_depth */
+		bh = read_extent_tree_block(inode, block,
+					    path->p_depth - depth, 0);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
+		eh = ext_block_hdr(bh);
+		ix = EXT_FIRST_INDEX(eh);
+		block = ext4_idx_pblock(ix);
+		put_bh(bh);
+	}
+
+	bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+	eh = ext_block_hdr(bh);
+	ex = EXT_FIRST_EXTENT(eh);
+found_extent:
+	*logical = le32_to_cpu(ex->ee_block);
+	*phys = ext4_ext_pblock(ex);
+	*ret_ex = ex;
+	if (bh)
+		put_bh(bh);
+	return 0;
+}
+
+/*
+ * ext4_ext_next_allocated_block:
+ * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
+ * NOTE: it considers block number from index entry as
+ * allocated block. Thus, index entries have to be consistent
+ * with leaves.
+ */
+ext4_lblk_t
+ext4_ext_next_allocated_block(struct ext4_ext_path *path)
+{
+	int depth;
+
+	BUG_ON(path == NULL);
+	depth = path->p_depth;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return EXT_MAX_BLOCKS;
+
+	while (depth >= 0) {
+		if (depth == path->p_depth) {
+			/* leaf */
+			if (path[depth].p_ext &&
+				path[depth].p_ext !=
+					EXT_LAST_EXTENT(path[depth].p_hdr))
+			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
+		} else {
+			/* index */
+			if (path[depth].p_idx !=
+					EXT_LAST_INDEX(path[depth].p_hdr))
+			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
+		}
+		depth--;
+	}
+
+	return EXT_MAX_BLOCKS;
+}
+
+/*
+ * ext4_ext_next_leaf_block:
+ * returns first allocated block from next leaf or EXT_MAX_BLOCKS
+ */
+static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
+{
+	int depth;
+
+	BUG_ON(path == NULL);
+	depth = path->p_depth;
+
+	/* zero-tree has no leaf blocks at all */
+	if (depth == 0)
+		return EXT_MAX_BLOCKS;
+
+	/* go to index block */
+	depth--;
+
+	while (depth >= 0) {
+		if (path[depth].p_idx !=
+				EXT_LAST_INDEX(path[depth].p_hdr))
+			return (ext4_lblk_t)
+				le32_to_cpu(path[depth].p_idx[1].ei_block);
+		depth--;
+	}
+
+	return EXT_MAX_BLOCKS;
+}
+
+/*
+ * ext4_ext_correct_indexes:
+ * if leaf gets modified and modified extent is first in the leaf,
+ * then we have to correct all indexes above.
+ * TODO: do we need to correct tree in all cases?
+ */
+static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path *path)
+{
+	struct ext4_extent_header *eh;
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+	__le32 border;
+	int k, err = 0;
+
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	if (unlikely(ex == NULL || eh == NULL)) {
+		EXT4_ERROR_INODE(inode,
+				 "ex %p == NULL or eh %p == NULL", ex, eh);
+		return -EIO;
+	}
+
+	if (depth == 0) {
+		/* there is no tree at all */
+		return 0;
+	}
+
+	if (ex != EXT_FIRST_EXTENT(eh)) {
+		/* we correct tree if first leaf got modified only */
+		return 0;
+	}
+
+	/*
+	 * TODO: we need correction if border is smaller than current one
+	 */
+	k = depth - 1;
+	border = path[depth].p_ext->ee_block;
+	err = ext4_ext_get_access(handle, inode, path + k);
+	if (err)
+		return err;
+	path[k].p_idx->ei_block = border;
+	err = ext4_ext_dirty(handle, inode, path + k);
+	if (err)
+		return err;
+
+	while (k--) {
+		/* change all left-side indexes */
+		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
+			break;
+		err = ext4_ext_get_access(handle, inode, path + k);
+		if (err)
+			break;
+		path[k].p_idx->ei_block = border;
+		err = ext4_ext_dirty(handle, inode, path + k);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+int
+ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
+				struct ext4_extent *ex2)
+{
+	unsigned short ext1_ee_len, ext2_ee_len;
+
+	if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
+		return 0;
+
+	ext1_ee_len = ext4_ext_get_actual_len(ex1);
+	ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
+			le32_to_cpu(ex2->ee_block))
+		return 0;
+
+	/*
+	 * To allow future support for preallocated extents to be added
+	 * as an RO_COMPAT feature, refuse to merge to extents if
+	 * this can result in the top bit of ee_len being set.
+	 */
+	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
+		return 0;
+	if (ext4_ext_is_unwritten(ex1) &&
+	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
+	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+		return 0;
+#ifdef AGGRESSIVE_TEST
+	if (ext1_ee_len >= 4)
+		return 0;
+#endif
+
+	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
+		return 1;
+	return 0;
+}
+
+/*
+ * This function tries to merge the "ex" extent to the next extent in the tree.
+ * It always tries to merge towards right. If you want to merge towards
+ * left, pass "ex - 1" as argument instead of "ex".
+ * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
+ * 1 if they got merged.
+ */
+static int ext4_ext_try_to_merge_right(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *ex)
+{
+	struct ext4_extent_header *eh;
+	unsigned int depth, len;
+	int merge_done = 0, unwritten;
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;
+
+	while (ex < EXT_LAST_EXTENT(eh)) {
+		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
+			break;
+		/* merge with next extent! */
+		unwritten = ext4_ext_is_unwritten(ex);
+		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+				+ ext4_ext_get_actual_len(ex + 1));
+		if (unwritten)
+			ext4_ext_mark_unwritten(ex);
+
+		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
+			len = (EXT_LAST_EXTENT(eh) - ex - 1)
+				* sizeof(struct ext4_extent);
+			memmove(ex + 1, ex + 2, len);
+		}
+		le16_add_cpu(&eh->eh_entries, -1);
+		merge_done = 1;
+		WARN_ON(eh->eh_entries == 0);
+		if (!eh->eh_entries)
+			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
+	}
+
+	return merge_done;
+}
+
+/*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+				     struct inode *inode,
+				     struct ext4_ext_path *path)
+{
+	size_t s;
+	unsigned max_root = ext4_ext_space_root(inode, 0);
+	ext4_fsblk_t blk;
+
+	if ((path[0].p_depth != 1) ||
+	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+		return;
+
+	/*
+	 * We need to modify the block allocation bitmap and the block
+	 * group descriptor to release the extent tree block.  If we
+	 * can't get the journal credits, give up.
+	 */
+	if (ext4_journal_extend(handle, 2))
+		return;
+
+	/*
+	 * Copy the extent data up to the inode
+	 */
+	blk = ext4_idx_pblock(path[0].p_idx);
+	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+		sizeof(struct ext4_extent_idx);
+	s += sizeof(struct ext4_extent_header);
+
+	path[1].p_maxdepth = path[0].p_maxdepth;
+	memcpy(path[0].p_hdr, path[1].p_hdr, s);
+	path[0].p_depth = 0;
+	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+	path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+	brelse(path[1].p_bh);
+	ext4_free_blocks(handle, inode, NULL, blk, 1,
+			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+}
+
+/*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static void ext4_ext_try_to_merge(handle_t *handle,
+				  struct inode *inode,
+				  struct ext4_ext_path *path,
+				  struct ext4_extent *ex) {
+	struct ext4_extent_header *eh;
+	unsigned int depth;
+	int merge_done = 0;
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;
+
+	if (ex > EXT_FIRST_EXTENT(eh))
+		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+	if (!merge_done)
+		(void) ext4_ext_try_to_merge_right(inode, path, ex);
+
+	ext4_ext_try_to_merge_up(handle, inode, path);
+}
+
+/*
+ * check if a portion of the "newext" extent overlaps with an
+ * existing extent.
+ *
+ * If there is an overlap discovered, it updates the length of the newext
+ * such that there will be no overlap, and then returns 1.
+ * If there is no overlap found, it returns 0.
+ */
+static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
+					   struct inode *inode,
+					   struct ext4_extent *newext,
+					   struct ext4_ext_path *path)
+{
+	ext4_lblk_t b1, b2;
+	unsigned int depth, len1;
+	unsigned int ret = 0;
+
+	b1 = le32_to_cpu(newext->ee_block);
+	len1 = ext4_ext_get_actual_len(newext);
+	depth = ext_depth(inode);
+	if (!path[depth].p_ext)
+		goto out;
+	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
+
+	/*
+	 * get the next allocated block if the extent in the path
+	 * is before the requested block(s)
+	 */
+	if (b2 < b1) {
+		b2 = ext4_ext_next_allocated_block(path);
+		if (b2 == EXT_MAX_BLOCKS)
+			goto out;
+		b2 = EXT4_LBLK_CMASK(sbi, b2);
+	}
+
+	/* check for wrap through zero on extent logical start block*/
+	if (b1 + len1 < b1) {
+		len1 = EXT_MAX_BLOCKS - b1;
+		newext->ee_len = cpu_to_le16(len1);
+		ret = 1;
+	}
+
+	/* check for overlap */
+	if (b1 + len1 > b2) {
+		newext->ee_len = cpu_to_le16(b2 - b1);
+		ret = 1;
+	}
+out:
+	return ret;
+}
+
+/*
+ * ext4_ext_insert_extent:
+ * tries to merge requsted extent into the existing extent or
+ * inserts requested extent as new one into the tree,
+ * creating new leaf in the no-space case.
+ */
+int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path **ppath,
+				struct ext4_extent *newext, int gb_flags)
+{
+	struct ext4_ext_path *path = *ppath;
+	struct ext4_extent_header *eh;
+	struct ext4_extent *ex, *fex;
+	struct ext4_extent *nearex; /* nearest extent */
+	struct ext4_ext_path *npath = NULL;
+	int depth, len, err;
+	ext4_lblk_t next;
+	int mb_flags = 0, unwritten;
+
+	if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+		mb_flags |= EXT4_MB_DELALLOC_RESERVED;
+	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+		return -EIO;
+	}
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	eh = path[depth].p_hdr;
+	if (unlikely(path[depth].p_hdr == NULL)) {
+		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+		return -EIO;
+	}
+
+	/* try to insert block into found extent and return */
+	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
+
+		/*
+		 * Try to see whether we should rather test the extent on
+		 * right from ex, or from the left of ex. This is because
+		 * ext4_find_extent() can return either extent on the
+		 * left, or on the right from the searched position. This
+		 * will make merging more effective.
+		 */
+		if (ex < EXT_LAST_EXTENT(eh) &&
+		    (le32_to_cpu(ex->ee_block) +
+		    ext4_ext_get_actual_len(ex) <
+		    le32_to_cpu(newext->ee_block))) {
+			ex += 1;
+			goto prepend;
+		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+			   (le32_to_cpu(newext->ee_block) +
+			   ext4_ext_get_actual_len(newext) <
+			   le32_to_cpu(ex->ee_block)))
+			ex -= 1;
+
+		/* Try to append newex to the ex */
+		if (ext4_can_extents_be_merged(inode, ex, newext)) {
+			ext_debug("append [%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  ext4_ext_is_unwritten(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_unwritten(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+			unwritten = ext4_ext_is_unwritten(ex);
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+					+ ext4_ext_get_actual_len(newext));
+			if (unwritten)
+				ext4_ext_mark_unwritten(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
+
+prepend:
+		/* Try to prepend newex to the ex */
+		if (ext4_can_extents_be_merged(inode, newext, ex)) {
+			ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  le32_to_cpu(newext->ee_block),
+				  ext4_ext_is_unwritten(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_unwritten(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			unwritten = ext4_ext_is_unwritten(ex);
+			ex->ee_block = newext->ee_block;
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+					+ ext4_ext_get_actual_len(newext));
+			if (unwritten)
+				ext4_ext_mark_unwritten(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
+	}
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
+		goto has_space;
+
+	/* probably next leaf has space for us? */
+	fex = EXT_LAST_EXTENT(eh);
+	next = EXT_MAX_BLOCKS;
+	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
+		next = ext4_ext_next_leaf_block(path);
+	if (next != EXT_MAX_BLOCKS) {
+		ext_debug("next leaf block - %u\n", next);
+		BUG_ON(npath != NULL);
+		npath = ext4_find_extent(inode, next, NULL, 0);
+		if (IS_ERR(npath))
+			return PTR_ERR(npath);
+		BUG_ON(npath->p_depth != path->p_depth);
+		eh = npath[depth].p_hdr;
+		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
+			ext_debug("next leaf isn't full(%d)\n",
+				  le16_to_cpu(eh->eh_entries));
+			path = npath;
+			goto has_space;
+		}
+		ext_debug("next leaf has no free space(%d,%d)\n",
+			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+	}
+
+	/*
+	 * There is no free space in the found leaf.
+	 * We're gonna add a new leaf in the tree.
+	 */
+	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+		mb_flags |= EXT4_MB_USE_RESERVED;
+	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+				       ppath, newext);
+	if (err)
+		goto cleanup;
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+
+has_space:
+	nearex = path[depth].p_ext;
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto cleanup;
+
+	if (!nearex) {
+		/* there is no extent in this leaf, create first one */
+		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
+				le32_to_cpu(newext->ee_block),
+				ext4_ext_pblock(newext),
+				ext4_ext_is_unwritten(newext),
+				ext4_ext_get_actual_len(newext));
+		nearex = EXT_FIRST_EXTENT(eh);
+	} else {
+		if (le32_to_cpu(newext->ee_block)
+			   > le32_to_cpu(nearex->ee_block)) {
+			/* Insert after */
+			ext_debug("insert %u:%llu:[%d]%d before: "
+					"nearest %p\n",
+					le32_to_cpu(newext->ee_block),
+					ext4_ext_pblock(newext),
+					ext4_ext_is_unwritten(newext),
+					ext4_ext_get_actual_len(newext),
+					nearex);
+			nearex++;
+		} else {
+			/* Insert before */
+			BUG_ON(newext->ee_block == nearex->ee_block);
+			ext_debug("insert %u:%llu:[%d]%d after: "
+					"nearest %p\n",
+					le32_to_cpu(newext->ee_block),
+					ext4_ext_pblock(newext),
+					ext4_ext_is_unwritten(newext),
+					ext4_ext_get_actual_len(newext),
+					nearex);
+		}
+		len = EXT_LAST_EXTENT(eh) - nearex + 1;
+		if (len > 0) {
+			ext_debug("insert %u:%llu:[%d]%d: "
+					"move %d extents from 0x%p to 0x%p\n",
+					le32_to_cpu(newext->ee_block),
+					ext4_ext_pblock(newext),
+					ext4_ext_is_unwritten(newext),
+					ext4_ext_get_actual_len(newext),
+					len, nearex, nearex + 1);
+			memmove(nearex + 1, nearex,
+				len * sizeof(struct ext4_extent));
+		}
+	}
+
+	le16_add_cpu(&eh->eh_entries, 1);
+	path[depth].p_ext = nearex;
+	nearex->ee_block = newext->ee_block;
+	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
+	nearex->ee_len = newext->ee_len;
+
+merge:
+	/* try to merge extents */
+	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
+		ext4_ext_try_to_merge(handle, inode, path, nearex);
+
+
+	/* time to correct all indexes above */
+	err = ext4_ext_correct_indexes(handle, inode, path);
+	if (err)
+		goto cleanup;
+
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+
+cleanup:
+	ext4_ext_drop_refs(npath);
+	kfree(npath);
+	return err;
+}
+
+static int ext4_fill_fiemap_extents(struct inode *inode,
+				    ext4_lblk_t block, ext4_lblk_t num,
+				    struct fiemap_extent_info *fieinfo)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ex;
+	struct extent_status es;
+	ext4_lblk_t next, next_del, start = 0, end = 0;
+	ext4_lblk_t last = block + num;
+	int exists, depth = 0, err = 0;
+	unsigned int flags = 0;
+	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
+
+	while (block < last && block != EXT_MAX_BLOCKS) {
+		num = last - block;
+		/* find extent for this block */
+		down_read(&EXT4_I(inode)->i_data_sem);
+
+		path = ext4_find_extent(inode, block, &path, 0);
+		if (IS_ERR(path)) {
+			up_read(&EXT4_I(inode)->i_data_sem);
+			err = PTR_ERR(path);
+			path = NULL;
+			break;
+		}
+
+		depth = ext_depth(inode);
+		if (unlikely(path[depth].p_hdr == NULL)) {
+			up_read(&EXT4_I(inode)->i_data_sem);
+			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+			err = -EIO;
+			break;
+		}
+		ex = path[depth].p_ext;
+		next = ext4_ext_next_allocated_block(path);
+
+		flags = 0;
+		exists = 0;
+		if (!ex) {
+			/* there is no extent yet, so try to allocate
+			 * all requested space */
+			start = block;
+			end = block + num;
+		} else if (le32_to_cpu(ex->ee_block) > block) {
+			/* need to allocate space before found extent */
+			start = block;
+			end = le32_to_cpu(ex->ee_block);
+			if (block + num < end)
+				end = block + num;
+		} else if (block >= le32_to_cpu(ex->ee_block)
+					+ ext4_ext_get_actual_len(ex)) {
+			/* need to allocate space after found extent */
+			start = block;
+			end = block + num;
+			if (end >= next)
+				end = next;
+		} else if (block >= le32_to_cpu(ex->ee_block)) {
+			/*
+			 * some part of requested space is covered
+			 * by found extent
+			 */
+			start = block;
+			end = le32_to_cpu(ex->ee_block)
+				+ ext4_ext_get_actual_len(ex);
+			if (block + num < end)
+				end = block + num;
+			exists = 1;
+		} else {
+			BUG();
+		}
+		BUG_ON(end <= start);
+
+		if (!exists) {
+			es.es_lblk = start;
+			es.es_len = end - start;
+			es.es_pblk = 0;
+		} else {
+			es.es_lblk = le32_to_cpu(ex->ee_block);
+			es.es_len = ext4_ext_get_actual_len(ex);
+			es.es_pblk = ext4_ext_pblock(ex);
+			if (ext4_ext_is_unwritten(ex))
+				flags |= FIEMAP_EXTENT_UNWRITTEN;
+		}
+
+		/*
+		 * Find delayed extent and update es accordingly. We call
+		 * it even in !exists case to find out whether es is the
+		 * last existing extent or not.
+		 */
+		next_del = ext4_find_delayed_extent(inode, &es);
+		if (!exists && next_del) {
+			exists = 1;
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
+		}
+		up_read(&EXT4_I(inode)->i_data_sem);
+
+		if (unlikely(es.es_len == 0)) {
+			EXT4_ERROR_INODE(inode, "es.es_len == 0");
+			err = -EIO;
+			break;
+		}
+
+		/*
+		 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
+		 * we need to check next == EXT_MAX_BLOCKS because it is
+		 * possible that an extent is with unwritten and delayed
+		 * status due to when an extent is delayed allocated and
+		 * is allocated by fallocate status tree will track both of
+		 * them in a extent.
+		 *
+		 * So we could return a unwritten and delayed extent, and
+		 * its block is equal to 'next'.
+		 */
+		if (next == next_del && next == EXT_MAX_BLOCKS) {
+			flags |= FIEMAP_EXTENT_LAST;
+			if (unlikely(next_del != EXT_MAX_BLOCKS ||
+				     next != EXT_MAX_BLOCKS)) {
+				EXT4_ERROR_INODE(inode,
+						 "next extent == %u, next "
+						 "delalloc extent = %u",
+						 next, next_del);
+				err = -EIO;
+				break;
+			}
+		}
+
+		if (exists) {
+			err = fiemap_fill_next_extent(fieinfo,
+				(__u64)es.es_lblk << blksize_bits,
+				(__u64)es.es_pblk << blksize_bits,
+				(__u64)es.es_len << blksize_bits,
+				flags);
+			if (err < 0)
+				break;
+			if (err == 1) {
+				err = 0;
+				break;
+			}
+		}
+
+		block = es.es_lblk + es.es_len;
+	}
+
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	return err;
+}
+
+/*
+ * ext4_ext_put_gap_in_cache:
+ * calculate boundaries of the gap that the requested block fits into
+ * and cache this gap
+ */
+static void
+ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
+				ext4_lblk_t block)
+{
+	int depth = ext_depth(inode);
+	ext4_lblk_t len;
+	ext4_lblk_t lblock;
+	struct ext4_extent *ex;
+	struct extent_status es;
+
+	ex = path[depth].p_ext;
+	if (ex == NULL) {
+		/* there is no extent yet, so gap is [0;-] */
+		lblock = 0;
+		len = EXT_MAX_BLOCKS;
+		ext_debug("cache gap(whole file):");
+	} else if (block < le32_to_cpu(ex->ee_block)) {
+		lblock = block;
+		len = le32_to_cpu(ex->ee_block) - block;
+		ext_debug("cache gap(before): %u [%u:%u]",
+				block,
+				le32_to_cpu(ex->ee_block),
+				 ext4_ext_get_actual_len(ex));
+	} else if (block >= le32_to_cpu(ex->ee_block)
+			+ ext4_ext_get_actual_len(ex)) {
+		ext4_lblk_t next;
+		lblock = le32_to_cpu(ex->ee_block)
+			+ ext4_ext_get_actual_len(ex);
+
+		next = ext4_ext_next_allocated_block(path);
+		ext_debug("cache gap(after): [%u:%u] %u",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_get_actual_len(ex),
+				block);
+		BUG_ON(next == lblock);
+		len = next - lblock;
+	} else {
+		BUG();
+	}
+
+	ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
+	if (es.es_len) {
+		/* There's delayed extent containing lblock? */
+		if (es.es_lblk <= lblock)
+			return;
+		len = min(es.es_lblk - lblock, len);
+	}
+	ext_debug(" -> %u:%u\n", lblock, len);
+	ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
+}
+
+/*
+ * ext4_ext_rm_idx:
+ * removes index from the index block.
+ */
+static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
+			struct ext4_ext_path *path, int depth)
+{
+	int err;
+	ext4_fsblk_t leaf;
+
+	/* free index block */
+	depth--;
+	path = path + depth;
+	leaf = ext4_idx_pblock(path->p_idx);
+	if (unlikely(path->p_hdr->eh_entries == 0)) {
+		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+		return -EIO;
+	}
+	err = ext4_ext_get_access(handle, inode, path);
+	if (err)
+		return err;
+
+	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
+		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+		len *= sizeof(struct ext4_extent_idx);
+		memmove(path->p_idx, path->p_idx + 1, len);
+	}
+
+	le16_add_cpu(&path->p_hdr->eh_entries, -1);
+	err = ext4_ext_dirty(handle, inode, path);
+	if (err)
+		return err;
+	ext_debug("index is empty, remove it, free block %llu\n", leaf);
+	trace_ext4_ext_rm_idx(inode, leaf);
+
+	ext4_free_blocks(handle, inode, NULL, leaf, 1,
+			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+
+	while (--depth >= 0) {
+		if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+			break;
+		path--;
+		err = ext4_ext_get_access(handle, inode, path);
+		if (err)
+			break;
+		path->p_idx->ei_block = (path+1)->p_idx->ei_block;
+		err = ext4_ext_dirty(handle, inode, path);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/*
+ * ext4_ext_calc_credits_for_single_extent:
+ * This routine returns max. credits that needed to insert an extent
+ * to the extent tree.
+ * When pass the actual path, the caller should calculate credits
+ * under i_data_sem.
+ */
+int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
+						struct ext4_ext_path *path)
+{
+	if (path) {
+		int depth = ext_depth(inode);
+		int ret = 0;
+
+		/* probably there is space in leaf? */
+		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
+				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
+
+			/*
+			 *  There are some space in the leaf tree, no
+			 *  need to account for leaf block credit
+			 *
+			 *  bitmaps and block group descriptor blocks
+			 *  and other metadata blocks still need to be
+			 *  accounted.
+			 */
+			/* 1 bitmap, 1 block group descriptor */
+			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+			return ret;
+		}
+	}
+
+	return ext4_chunk_trans_blocks(inode, nrblocks);
+}
+
+/*
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
+ *
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
+ *
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
+ */
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
+{
+	int index;
+	int depth;
+
+	/* If we are converting the inline data, only one is needed here. */
+	if (ext4_has_inline_data(inode))
+		return 1;
+
+	depth = ext_depth(inode);
+
+	if (extents <= 1)
+		index = depth * 2;
+	else
+		index = depth * 3;
+
+	return index;
+}
+
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
+static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
+			      struct ext4_extent *ex,
+			      long long *partial_cluster,
+			      ext4_lblk_t from, ext4_lblk_t to)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned short ee_len = ext4_ext_get_actual_len(ex);
+	ext4_fsblk_t pblk;
+	int flags = get_default_free_blocks_flags(inode);
+
+	/*
+	 * For bigalloc file systems, we never free a partial cluster
+	 * at the beginning of the extent.  Instead, we make a note
+	 * that we tried freeing the cluster, and check to see if we
+	 * need to free it on a subsequent call to ext4_remove_blocks,
+	 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
+	 */
+	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+
+	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
+	/*
+	 * If we have a partial cluster, and it's different from the
+	 * cluster of the last block, we need to explicitly free the
+	 * partial cluster here.
+	 */
+	pblk = ext4_ext_pblock(ex) + ee_len - 1;
+	if (*partial_cluster > 0 &&
+	    *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+		ext4_free_blocks(handle, inode, NULL,
+				 EXT4_C2B(sbi, *partial_cluster),
+				 sbi->s_cluster_ratio, flags);
+		*partial_cluster = 0;
+	}
+
+#ifdef EXTENTS_STATS
+	{
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+		spin_lock(&sbi->s_ext_stats_lock);
+		sbi->s_ext_blocks += ee_len;
+		sbi->s_ext_extents++;
+		if (ee_len < sbi->s_ext_min)
+			sbi->s_ext_min = ee_len;
+		if (ee_len > sbi->s_ext_max)
+			sbi->s_ext_max = ee_len;
+		if (ext_depth(inode) > sbi->s_depth_max)
+			sbi->s_depth_max = ext_depth(inode);
+		spin_unlock(&sbi->s_ext_stats_lock);
+	}
+#endif
+	if (from >= le32_to_cpu(ex->ee_block)
+	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
+		/* tail removal */
+		ext4_lblk_t num;
+		long long first_cluster;
+
+		num = le32_to_cpu(ex->ee_block) + ee_len - from;
+		pblk = ext4_ext_pblock(ex) + ee_len - num;
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
+		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+		/*
+		 * If the block range to be freed didn't start at the
+		 * beginning of a cluster, and we removed the entire
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate or punch hole
+		 * operation has removed all of the blocks in the cluster.
+		 * If that cluster is used by another extent, preserve its
+		 * negative value so it isn't freed later on.
+		 *
+		 * If the whole extent wasn't freed, we've reached the
+		 * start of the truncated/punched region and have finished
+		 * removing blocks.  If there's a partial cluster here it's
+		 * shared with the remainder of the extent and is no longer
+		 * a candidate for removal.
+		 */
+		if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
+			first_cluster = (long long) EXT4_B2C(sbi, pblk);
+			if (first_cluster != -*partial_cluster)
+				*partial_cluster = first_cluster;
+		} else {
+			*partial_cluster = 0;
+		}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
+	return 0;
+}
+
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end".  Both "start"
+ * and "end" must appear in the same extent or EIO is returned.
+ *
+ * @handle: The journal handle
+ * @inode:  The files inode
+ * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   has been released from it.  However, if this value is
+ *                   negative, it's a cluster just to the right of the
+ *                   punched region and it must not be freed.
+ * @start:  The first block to remove
+ * @end:   The last block to remove
+ */
+static int
+ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
+		 struct ext4_ext_path *path,
+		 long long *partial_cluster,
+		 ext4_lblk_t start, ext4_lblk_t end)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int err = 0, correct_index = 0;
+	int depth = ext_depth(inode), credits;
+	struct ext4_extent_header *eh;
+	ext4_lblk_t a, b;
+	unsigned num;
+	ext4_lblk_t ex_ee_block;
+	unsigned short ex_ee_len;
+	unsigned unwritten = 0;
+	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
+
+	/* the header must be checked already in ext4_ext_remove_space() */
+	ext_debug("truncate since %u in leaf to %u\n", start, end);
+	if (!path[depth].p_hdr)
+		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
+	eh = path[depth].p_hdr;
+	if (unlikely(path[depth].p_hdr == NULL)) {
+		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+		return -EIO;
+	}
+	/* find where to start removing */
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
+
+	ex_ee_block = le32_to_cpu(ex->ee_block);
+	ex_ee_len = ext4_ext_get_actual_len(ex);
+
+	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+
+	while (ex >= EXT_FIRST_EXTENT(eh) &&
+			ex_ee_block + ex_ee_len > start) {
+
+		if (ext4_ext_is_unwritten(ex))
+			unwritten = 1;
+		else
+			unwritten = 0;
+
+		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+			  unwritten, ex_ee_len);
+		path[depth].p_ext = ex;
+
+		a = ex_ee_block > start ? ex_ee_block : start;
+		b = ex_ee_block+ex_ee_len - 1 < end ?
+			ex_ee_block+ex_ee_len - 1 : end;
+
+		ext_debug("  border %u:%u\n", a, b);
+
+		/* If this extent is beyond the end of the hole, skip it */
+		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so note that its first cluster is in use to avoid
+			 * freeing it when removing blocks.  Eventually, the
+			 * right edge of the truncated/punched region will
+			 * be just to the left.
+			 */
+			if (sbi->s_cluster_ratio > 1) {
+				pblk = ext4_ext_pblock(ex);
+				*partial_cluster =
+					-(long long) EXT4_B2C(sbi, pblk);
+			}
+			ex--;
+			ex_ee_block = le32_to_cpu(ex->ee_block);
+			ex_ee_len = ext4_ext_get_actual_len(ex);
+			continue;
+		} else if (b != ex_ee_block + ex_ee_len - 1) {
+			EXT4_ERROR_INODE(inode,
+					 "can not handle truncate %u:%u "
+					 "on extent %u:%u",
+					 start, end, ex_ee_block,
+					 ex_ee_block + ex_ee_len - 1);
+			err = -EIO;
+			goto out;
+		} else if (a != ex_ee_block) {
+			/* remove tail of the extent */
+			num = a - ex_ee_block;
+		} else {
+			/* remove whole extent: excellent! */
+			num = 0;
+		}
+		/*
+		 * 3 for leaf, sb, and inode plus 2 (bmap and group
+		 * descriptor) for each block group; assume two block
+		 * groups plus ex_ee_len/blocks_per_block_group for
+		 * the worst case
+		 */
+		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
+		if (ex == EXT_FIRST_EXTENT(eh)) {
+			correct_index = 1;
+			credits += (ext_depth(inode)) + 1;
+		}
+		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+
+		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+		if (err)
+			goto out;
+
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
+					 a, b);
+		if (err)
+			goto out;
+
+		if (num == 0)
+			/* this extent is removed; mark slot entirely unused */
+			ext4_ext_store_pblock(ex, 0);
+
+		ex->ee_len = cpu_to_le16(num);
+		/*
+		 * Do not mark unwritten if all the blocks in the
+		 * extent have been removed.
+		 */
+		if (unwritten && num)
+			ext4_ext_mark_unwritten(ex);
+		/*
+		 * If the extent was completely released,
+		 * we need to remove it from the leaf
+		 */
+		if (num == 0) {
+			if (end != EXT_MAX_BLOCKS - 1) {
+				/*
+				 * For hole punching, we need to scoot all the
+				 * extents up when an extent is removed so that
+				 * we dont have blank extents in the middle
+				 */
+				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+					sizeof(struct ext4_extent));
+
+				/* Now get rid of the one at the end */
+				memset(EXT_LAST_EXTENT(eh), 0,
+					sizeof(struct ext4_extent));
+			}
+			le16_add_cpu(&eh->eh_entries, -1);
+		}
+
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
+				ext4_ext_pblock(ex));
+		ex--;
+		ex_ee_block = le32_to_cpu(ex->ee_block);
+		ex_ee_len = ext4_ext_get_actual_len(ex);
+	}
+
+	if (correct_index && eh->eh_entries)
+		err = ext4_ext_correct_indexes(handle, inode, path);
+
+	/*
+	 * If there's a partial cluster and at least one extent remains in
+	 * the leaf, free the partial cluster if it isn't shared with the
+	 * current extent.  If it is shared with the current extent
+	 * we zero partial_cluster because we've reached the start of the
+	 * truncated/punched region and we're done removing blocks.
+	 */
+	if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
+		pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+		if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+			ext4_free_blocks(handle, inode, NULL,
+					 EXT4_C2B(sbi, *partial_cluster),
+					 sbi->s_cluster_ratio,
+					 get_default_free_blocks_flags(inode));
+		}
+		*partial_cluster = 0;
+	}
+
+	/* if this leaf is free, then we should
+	 * remove it from index block above */
+	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
+		err = ext4_ext_rm_idx(handle, inode, path, depth);
+
+out:
+	return err;
+}
+
+/*
+ * ext4_ext_more_to_rm:
+ * returns 1 if current index has to be freed (even partial)
+ */
+static int
+ext4_ext_more_to_rm(struct ext4_ext_path *path)
+{
+	BUG_ON(path->p_idx == NULL);
+
+	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
+		return 0;
+
+	/*
+	 * if truncate on deeper level happened, it wasn't partial,
+	 * so we have to consider current index for truncation
+	 */
+	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
+		return 0;
+	return 1;
+}
+
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+			  ext4_lblk_t end)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int depth = ext_depth(inode);
+	struct ext4_ext_path *path = NULL;
+	long long partial_cluster = 0;
+	handle_t *handle;
+	int i = 0, err = 0;
+
+	ext_debug("truncate since %u to %u\n", start, end);
+
+	/* probably first extent we're gonna free will be last in block */
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+again:
+	trace_ext4_ext_remove_space(inode, start, end, depth);
+
+	/*
+	 * Check if we are removing extents inside the extent tree. If that
+	 * is the case, we are going to punch a hole inside the extent tree
+	 * so we have to check whether we need to split the extent covering
+	 * the last block to remove so we can easily remove the part of it
+	 * in ext4_ext_rm_leaf().
+	 */
+	if (end < EXT_MAX_BLOCKS - 1) {
+		struct ext4_extent *ex;
+		ext4_lblk_t ee_block, ex_end, lblk;
+		ext4_fsblk_t pblk;
+
+		/* find extent for or closest extent to this block */
+		path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path)) {
+			ext4_journal_stop(handle);
+			return PTR_ERR(path);
+		}
+		depth = ext_depth(inode);
+		/* Leaf not may not exist only if inode has no blocks at all */
+		ex = path[depth].p_ext;
+		if (!ex) {
+			if (depth) {
+				EXT4_ERROR_INODE(inode,
+						 "path[%d].p_hdr == NULL",
+						 depth);
+				err = -EIO;
+			}
+			goto out;
+		}
+
+		ee_block = le32_to_cpu(ex->ee_block);
+		ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
+
+		/*
+		 * See if the last block is inside the extent, if so split
+		 * the extent at 'end' block so we can easily remove the
+		 * tail of the first part of the split extent in
+		 * ext4_ext_rm_leaf().
+		 */
+		if (end >= ee_block && end < ex_end) {
+
+			/*
+			 * If we're going to split the extent, note that
+			 * the cluster containing the block after 'end' is
+			 * in use to avoid freeing it when removing blocks.
+			 */
+			if (sbi->s_cluster_ratio > 1) {
+				pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+				partial_cluster =
+					-(long long) EXT4_B2C(sbi, pblk);
+			}
+
+			/*
+			 * Split the extent in two so that 'end' is the last
+			 * block in the first new extent. Also we should not
+			 * fail removing space due to ENOSPC so try to use
+			 * reserved block if that happens.
+			 */
+			err = ext4_force_split_extent_at(handle, inode, &path,
+							 end + 1, 1);
+			if (err < 0)
+				goto out;
+
+		} else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+			/*
+			 * If there's an extent to the right its first cluster
+			 * contains the immediate right boundary of the
+			 * truncated/punched region.  Set partial_cluster to
+			 * its negative value so it won't be freed if shared
+			 * with the current extent.  The end < ee_block case
+			 * is handled in ext4_ext_rm_leaf().
+			 */
+			lblk = ex_end + 1;
+			err = ext4_ext_search_right(inode, path, &lblk, &pblk,
+						    &ex);
+			if (err)
+				goto out;
+			if (pblk)
+				partial_cluster =
+					-(long long) EXT4_B2C(sbi, pblk);
+		}
+	}
+	/*
+	 * We start scanning from right side, freeing all the blocks
+	 * after i_size and walking into the tree depth-wise.
+	 */
+	depth = ext_depth(inode);
+	if (path) {
+		int k = i = depth;
+		while (--k > 0)
+			path[k].p_block =
+				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
+	} else {
+		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+			       GFP_NOFS);
+		if (path == NULL) {
+			ext4_journal_stop(handle);
+			return -ENOMEM;
+		}
+		path[0].p_maxdepth = path[0].p_depth = depth;
+		path[0].p_hdr = ext_inode_hdr(inode);
+		i = 0;
+
+		if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
+			err = -EIO;
+			goto out;
+		}
+	}
+	err = 0;
+
+	while (i >= 0 && err == 0) {
+		if (i == depth) {
+			/* this is leaf block */
+			err = ext4_ext_rm_leaf(handle, inode, path,
+					       &partial_cluster, start,
+					       end);
+			/* root level has p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			continue;
+		}
+
+		/* this is index block */
+		if (!path[i].p_hdr) {
+			ext_debug("initialize header\n");
+			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
+		}
+
+		if (!path[i].p_idx) {
+			/* this level hasn't been touched yet */
+			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
+			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
+			ext_debug("init index ptr: hdr 0x%p, num %d\n",
+				  path[i].p_hdr,
+				  le16_to_cpu(path[i].p_hdr->eh_entries));
+		} else {
+			/* we were already here, see at next index */
+			path[i].p_idx--;
+		}
+
+		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
+				i, EXT_FIRST_INDEX(path[i].p_hdr),
+				path[i].p_idx);
+		if (ext4_ext_more_to_rm(path + i)) {
+			struct buffer_head *bh;
+			/* go to the next level */
+			ext_debug("move to level %d (block %llu)\n",
+				  i + 1, ext4_idx_pblock(path[i].p_idx));
+			memset(path + i + 1, 0, sizeof(*path));
+			bh = read_extent_tree_block(inode,
+				ext4_idx_pblock(path[i].p_idx), depth - i - 1,
+				EXT4_EX_NOCACHE);
+			if (IS_ERR(bh)) {
+				/* should we reset i_size? */
+				err = PTR_ERR(bh);
+				break;
+			}
+			/* Yield here to deal with large extent trees.
+			 * Should be a no-op if we did IO above. */
+			cond_resched();
+			if (WARN_ON(i + 1 > depth)) {
+				err = -EIO;
+				break;
+			}
+			path[i + 1].p_bh = bh;
+
+			/* save actual number of indexes since this
+			 * number is changed at the next iteration */
+			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
+			i++;
+		} else {
+			/* we finished processing this index, go up */
+			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
+				/* index is empty, remove it;
+				 * handle must be already prepared by the
+				 * truncatei_leaf() */
+				err = ext4_ext_rm_idx(handle, inode, path, i);
+			}
+			/* root level has p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			ext_debug("return to level %d\n", i);
+		}
+	}
+
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
+
+	/*
+	 * If we still have something in the partial cluster and we have removed
+	 * even the first extent, then we should free the blocks in the partial
+	 * cluster as well.  (This code will only run when there are no leaves
+	 * to the immediate left of the truncated/punched region.)
+	 */
+	if (partial_cluster > 0 && err == 0) {
+		/* don't zero partial_cluster since it's not used afterwards */
+		ext4_free_blocks(handle, inode, NULL,
+				 EXT4_C2B(sbi, partial_cluster),
+				 sbi->s_cluster_ratio,
+				 get_default_free_blocks_flags(inode));
+	}
+
+	/* TODO: flexible tree reduction should be here */
+	if (path->p_hdr->eh_entries == 0) {
+		/*
+		 * truncate to zero freed all the tree,
+		 * so we need to correct eh_depth
+		 */
+		err = ext4_ext_get_access(handle, inode, path);
+		if (err == 0) {
+			ext_inode_hdr(inode)->eh_depth = 0;
+			ext_inode_hdr(inode)->eh_max =
+				cpu_to_le16(ext4_ext_space_root(inode, 0));
+			err = ext4_ext_dirty(handle, inode, path);
+		}
+	}
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	path = NULL;
+	if (err == -EAGAIN)
+		goto again;
+	ext4_journal_stop(handle);
+
+	return err;
+}
+
+/*
+ * called at mount time
+ */
+void ext4_ext_init(struct super_block *sb)
+{
+	/*
+	 * possible initialization would be here
+	 */
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
+		printk(KERN_INFO "EXT4-fs: file extents enabled"
+#ifdef AGGRESSIVE_TEST
+		       ", aggressive tests"
+#endif
+#ifdef CHECK_BINSEARCH
+		       ", check binsearch"
+#endif
+#ifdef EXTENTS_STATS
+		       ", stats"
+#endif
+		       "\n");
+#endif
+#ifdef EXTENTS_STATS
+		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
+		EXT4_SB(sb)->s_ext_min = 1 << 30;
+		EXT4_SB(sb)->s_ext_max = 0;
+#endif
+	}
+}
+
+/*
+ * called at umount time
+ */
+void ext4_ext_release(struct super_block *sb)
+{
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+		return;
+
+#ifdef EXTENTS_STATS
+	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
+		struct ext4_sb_info *sbi = EXT4_SB(sb);
+		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
+			sbi->s_ext_blocks, sbi->s_ext_extents,
+			sbi->s_ext_blocks / sbi->s_ext_extents);
+		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
+			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
+	}
+#endif
+}
+
+static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+{
+	ext4_lblk_t  ee_block;
+	ext4_fsblk_t ee_pblock;
+	unsigned int ee_len;
+
+	ee_block  = le32_to_cpu(ex->ee_block);
+	ee_len    = ext4_ext_get_actual_len(ex);
+	ee_pblock = ext4_ext_pblock(ex);
+
+	if (ee_len == 0)
+		return 0;
+
+	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+				     EXTENT_STATUS_WRITTEN);
+}
+
+/* FIXME!! we need to try to merge to left or right after zero-out  */
+static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+	ext4_fsblk_t ee_pblock;
+	unsigned int ee_len;
+	int ret;
+
+	ee_len    = ext4_ext_get_actual_len(ex);
+	ee_pblock = ext4_ext_pblock(ex);
+
+	if (ext4_encrypted_inode(inode))
+		return ext4_encrypted_zeroout(inode, ex);
+
+	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
+	if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * ext4_split_extent_at() splits an extent at given block.
+ *
+ * @handle: the journal handle
+ * @inode: the file inode
+ * @path: the path to the extent
+ * @split: the logical block where the extent is splitted.
+ * @split_flags: indicates if the extent could be zeroout if split fails, and
+ *		 the states(init or unwritten) of new extents.
+ * @flags: flags used to insert new extent to extent tree.
+ *
+ *
+ * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
+ * of which are deterimined by split_flag.
+ *
+ * There are two cases:
+ *  a> the extent are splitted into two extent.
+ *  b> split is not needed, and just mark the extent.
+ *
+ * return 0 on success.
+ */
+static int ext4_split_extent_at(handle_t *handle,
+			     struct inode *inode,
+			     struct ext4_ext_path **ppath,
+			     ext4_lblk_t split,
+			     int split_flag,
+			     int flags)
+{
+	struct ext4_ext_path *path = *ppath;
+	ext4_fsblk_t newblock;
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex, newex, orig_ex, zero_ex;
+	struct ext4_extent *ex2 = NULL;
+	unsigned int ee_len, depth;
+	int err = 0;
+
+	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
+	       (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
+
+	ext_debug("ext4_split_extents_at: inode %lu, logical"
+		"block %llu\n", inode->i_ino, (unsigned long long)split);
+
+	ext4_ext_show_leaf(inode, path);
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	newblock = split - ee_block + ext4_ext_pblock(ex);
+
+	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+	BUG_ON(!ext4_ext_is_unwritten(ex) &&
+	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
+			     EXT4_EXT_MARK_UNWRIT1 |
+			     EXT4_EXT_MARK_UNWRIT2));
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+
+	if (split == ee_block) {
+		/*
+		 * case b: block @split is the block that the extent begins with
+		 * then we just change the state of the extent, and splitting
+		 * is not needed.
+		 */
+		if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+			ext4_ext_mark_unwritten(ex);
+		else
+			ext4_ext_mark_initialized(ex);
+
+		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+			ext4_ext_try_to_merge(handle, inode, path, ex);
+
+		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+		goto out;
+	}
+
+	/* case a */
+	memcpy(&orig_ex, ex, sizeof(orig_ex));
+	ex->ee_len = cpu_to_le16(split - ee_block);
+	if (split_flag & EXT4_EXT_MARK_UNWRIT1)
+		ext4_ext_mark_unwritten(ex);
+
+	/*
+	 * path may lead to new leaf, not to original leaf any more
+	 * after ext4_ext_insert_extent() returns,
+	 */
+	err = ext4_ext_dirty(handle, inode, path + depth);
+	if (err)
+		goto fix_extent_len;
+
+	ex2 = &newex;
+	ex2->ee_block = cpu_to_le32(split);
+	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
+	ext4_ext_store_pblock(ex2, newblock);
+	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+		ext4_ext_mark_unwritten(ex2);
+
+	err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
+	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
+			if (split_flag & EXT4_EXT_DATA_VALID1) {
+				err = ext4_ext_zeroout(inode, ex2);
+				zero_ex.ee_block = ex2->ee_block;
+				zero_ex.ee_len = cpu_to_le16(
+						ext4_ext_get_actual_len(ex2));
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex2));
+			} else {
+				err = ext4_ext_zeroout(inode, ex);
+				zero_ex.ee_block = ex->ee_block;
+				zero_ex.ee_len = cpu_to_le16(
+						ext4_ext_get_actual_len(ex));
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex));
+			}
+		} else {
+			err = ext4_ext_zeroout(inode, &orig_ex);
+			zero_ex.ee_block = orig_ex.ee_block;
+			zero_ex.ee_len = cpu_to_le16(
+						ext4_ext_get_actual_len(&orig_ex));
+			ext4_ext_store_pblock(&zero_ex,
+					      ext4_ext_pblock(&orig_ex));
+		}
+
+		if (err)
+			goto fix_extent_len;
+		/* update the extent length and mark as initialized */
+		ex->ee_len = cpu_to_le16(ee_len);
+		ext4_ext_try_to_merge(handle, inode, path, ex);
+		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+		if (err)
+			goto fix_extent_len;
+
+		/* update extent status tree */
+		err = ext4_zeroout_es(inode, &zero_ex);
+
+		goto out;
+	} else if (err)
+		goto fix_extent_len;
+
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err;
+
+fix_extent_len:
+	ex->ee_len = orig_ex.ee_len;
+	ext4_ext_dirty(handle, inode, path + path->p_depth);
+	return err;
+}
+
+/*
+ * ext4_split_extents() splits an extent and mark extent which is covered
+ * by @map as split_flags indicates
+ *
+ * It may result in splitting the extent into multiple extents (up to three)
+ * There are three possibilities:
+ *   a> There is no split required
+ *   b> Splits in two extents: Split is happening at either end of the extent
+ *   c> Splits in three extents: Somone is splitting in middle of the extent
+ *
+ */
+static int ext4_split_extent(handle_t *handle,
+			      struct inode *inode,
+			      struct ext4_ext_path **ppath,
+			      struct ext4_map_blocks *map,
+			      int split_flag,
+			      int flags)
+{
+	struct ext4_ext_path *path = *ppath;
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex;
+	unsigned int ee_len, depth;
+	int err = 0;
+	int unwritten;
+	int split_flag1, flags1;
+	int allocated = map->m_len;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	unwritten = ext4_ext_is_unwritten(ex);
+
+	if (map->m_lblk + map->m_len < ee_block + ee_len) {
+		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
+		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+		if (unwritten)
+			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
+				       EXT4_EXT_MARK_UNWRIT2;
+		if (split_flag & EXT4_EXT_DATA_VALID2)
+			split_flag1 |= EXT4_EXT_DATA_VALID1;
+		err = ext4_split_extent_at(handle, inode, ppath,
+				map->m_lblk + map->m_len, split_flag1, flags1);
+		if (err)
+			goto out;
+	} else {
+		allocated = ee_len - (map->m_lblk - ee_block);
+	}
+	/*
+	 * Update path is required because previous ext4_split_extent_at() may
+	 * result in split of original leaf or extent zeroout.
+	 */
+	path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	if (!ex) {
+		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+				 (unsigned long) map->m_lblk);
+		return -EIO;
+	}
+	unwritten = ext4_ext_is_unwritten(ex);
+	split_flag1 = 0;
+
+	if (map->m_lblk >= ee_block) {
+		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+		if (unwritten) {
+			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
+			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
+						     EXT4_EXT_MARK_UNWRIT2);
+		}
+		err = ext4_split_extent_at(handle, inode, ppath,
+				map->m_lblk, split_flag1, flags);
+		if (err)
+			goto out;
+	}
+
+	ext4_ext_show_leaf(inode, path);
+out:
+	return err ? err : allocated;
+}
+
+/*
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
+ * to an unwritten extent. It may result in splitting the unwritten
+ * extent into multiple extents (up to three - one initialized and two
+ * unwritten).
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be initialized
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ *  - The extent pointed to by 'path' is unwritten.
+ *  - The extent pointed to by 'path' contains a superset
+ *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ *  - the returned value is the number of blocks beyond map->l_lblk
+ *    that are allocated and initialized.
+ *    It is guaranteed to be >= map->m_len.
+ */
+static int ext4_ext_convert_to_initialized(handle_t *handle,
+					   struct inode *inode,
+					   struct ext4_map_blocks *map,
+					   struct ext4_ext_path **ppath,
+					   int flags)
+{
+	struct ext4_ext_path *path = *ppath;
+	struct ext4_sb_info *sbi;
+	struct ext4_extent_header *eh;
+	struct ext4_map_blocks split_map;
+	struct ext4_extent zero_ex;
+	struct ext4_extent *ex, *abut_ex;
+	ext4_lblk_t ee_block, eof_block;
+	unsigned int ee_len, depth, map_len = map->m_len;
+	int allocated = 0, max_zeroout = 0;
+	int err = 0;
+	int split_flag = 0;
+
+	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)map->m_lblk, map_len);
+
+	sbi = EXT4_SB(inode->i_sb);
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map_len)
+		eof_block = map->m_lblk + map_len;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	zero_ex.ee_len = 0;
+
+	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+	/* Pre-conditions */
+	BUG_ON(!ext4_ext_is_unwritten(ex));
+	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+
+	/*
+	 * Attempt to transfer newly initialized blocks from the currently
+	 * unwritten extent to its neighbor. This is much cheaper
+	 * than an insertion followed by a merge as those involve costly
+	 * memmove() calls. Transferring to the left is the common case in
+	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+	 * followed by append writes.
+	 *
+	 * Limitations of the current logic:
+	 *  - L1: we do not deal with writes covering the whole extent.
+	 *    This would require removing the extent if the transfer
+	 *    is possible.
+	 *  - L2: we only attempt to merge with an extent stored in the
+	 *    same extent tree node.
+	 */
+	if ((map->m_lblk == ee_block) &&
+		/* See if we can merge left */
+		(map_len < ee_len) &&		/*L1*/
+		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
+		ext4_lblk_t prev_lblk;
+		ext4_fsblk_t prev_pblk, ee_pblk;
+		unsigned int prev_len;
+
+		abut_ex = ex - 1;
+		prev_lblk = le32_to_cpu(abut_ex->ee_block);
+		prev_len = ext4_ext_get_actual_len(abut_ex);
+		prev_pblk = ext4_ext_pblock(abut_ex);
+		ee_pblk = ext4_ext_pblock(ex);
+
+		/*
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+		 * upon those conditions:
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
+		 *   overflowing the (initialized) length limit.
+		 */
+		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
+			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
+			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
+			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
+			err = ext4_ext_get_access(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			trace_ext4_ext_convert_to_initialized_fastpath(inode,
+				map, ex, abut_ex);
+
+			/* Shift the start of ex by 'map_len' blocks */
+			ex->ee_block = cpu_to_le32(ee_block + map_len);
+			ext4_ext_store_pblock(ex, ee_pblk + map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
+			ext4_ext_mark_unwritten(ex); /* Restore the flag */
+
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
+
+			/* Result: number of initialized blocks past m_lblk */
+			allocated = map_len;
+		}
+	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+		   (map_len < ee_len) &&	/*L1*/
+		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
+		/* See if we can merge right */
+		ext4_lblk_t next_lblk;
+		ext4_fsblk_t next_pblk, ee_pblk;
+		unsigned int next_len;
+
+		abut_ex = ex + 1;
+		next_lblk = le32_to_cpu(abut_ex->ee_block);
+		next_len = ext4_ext_get_actual_len(abut_ex);
+		next_pblk = ext4_ext_pblock(abut_ex);
+		ee_pblk = ext4_ext_pblock(ex);
+
+		/*
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+		 * upon those conditions:
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
+		 *   overflowing the (initialized) length limit.
+		 */
+		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
+		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
+		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
+		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
+			err = ext4_ext_get_access(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			trace_ext4_ext_convert_to_initialized_fastpath(inode,
+				map, ex, abut_ex);
+
+			/* Shift the start of abut_ex by 'map_len' blocks */
+			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
+			ext4_ext_mark_unwritten(ex); /* Restore the flag */
+
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
+
+			/* Result: number of initialized blocks past m_lblk */
+			allocated = map_len;
+		}
+	}
+	if (allocated) {
+		/* Mark the block containing both extents as dirty */
+		ext4_ext_dirty(handle, inode, path + depth);
+
+		/* Update path to point to the right extent */
+		path[depth].p_ext = abut_ex;
+		goto out;
+	} else
+		allocated = ee_len - (map->m_lblk - ee_block);
+
+	WARN_ON(map->m_lblk < ee_block);
+	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully inside i_size or new_size.
+	 */
+	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+
+	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+		max_zeroout = sbi->s_extent_max_zeroout_kb >>
+			(inode->i_sb->s_blocksize_bits - 10);
+
+	/* If extent is less than s_max_zeroout_kb, zeroout directly */
+	if (max_zeroout && (ee_len <= max_zeroout)) {
+		err = ext4_ext_zeroout(inode, ex);
+		if (err)
+			goto out;
+		zero_ex.ee_block = ex->ee_block;
+		zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
+		ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
+
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
+		ext4_ext_mark_initialized(ex);
+		ext4_ext_try_to_merge(handle, inode, path, ex);
+		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+		goto out;
+	}
+
+	/*
+	 * four cases:
+	 * 1. split the extent into three extents.
+	 * 2. split the extent into two extents, zeroout the first half.
+	 * 3. split the extent into two extents, zeroout the second half.
+	 * 4. split the extent into two extents with out zeroout.
+	 */
+	split_map.m_lblk = map->m_lblk;
+	split_map.m_len = map->m_len;
+
+	if (max_zeroout && (allocated > map->m_len)) {
+		if (allocated <= max_zeroout) {
+			/* case 3 */
+			zero_ex.ee_block =
+					 cpu_to_le32(map->m_lblk);
+			zero_ex.ee_len = cpu_to_le16(allocated);
+			ext4_ext_store_pblock(&zero_ex,
+				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
+			err = ext4_ext_zeroout(inode, &zero_ex);
+			if (err)
+				goto out;
+			split_map.m_lblk = map->m_lblk;
+			split_map.m_len = allocated;
+		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
+			/* case 2 */
+			if (map->m_lblk != ee_block) {
+				zero_ex.ee_block = ex->ee_block;
+				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+							ee_block);
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex));
+				err = ext4_ext_zeroout(inode, &zero_ex);
+				if (err)
+					goto out;
+			}
+
+			split_map.m_lblk = ee_block;
+			split_map.m_len = map->m_lblk - ee_block + map->m_len;
+			allocated = map->m_len;
+		}
+	}
+
+	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
+				flags);
+	if (err > 0)
+		err = 0;
+out:
+	/* If we have gotten a failure, don't zero out status tree */
+	if (!err)
+		err = ext4_zeroout_es(inode, &zero_ex);
+	return err ? err : allocated;
+}
+
+/*
+ * This function is called by ext4_ext_map_blocks() from
+ * ext4_get_blocks_dio_write() when DIO to write
+ * to an unwritten extent.
+ *
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three)
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be unwritten
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
+ * One of more index blocks maybe needed if the extent tree grow after
+ * the unwritten extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the unwritten extent before DIO submit
+ * the IO. The unwritten extent called at this time will be split
+ * into three unwritten extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
+ *
+ * Returns the size of unwritten extent to be written on success.
+ */
+static int ext4_split_convert_extents(handle_t *handle,
+					struct inode *inode,
+					struct ext4_map_blocks *map,
+					struct ext4_ext_path **ppath,
+					int flags)
+{
+	struct ext4_ext_path *path = *ppath;
+	ext4_lblk_t eof_block;
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex;
+	unsigned int ee_len;
+	int split_flag = 0, depth;
+
+	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+		  __func__, inode->i_ino,
+		  (unsigned long long)map->m_lblk, map->m_len);
+
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
+	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully insde i_size or new_size.
+	 */
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	/* Convert to unwritten */
+	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+		split_flag |= EXT4_EXT_DATA_VALID1;
+	/* Convert to initialized */
+	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		split_flag |= ee_block + ee_len <= eof_block ?
+			      EXT4_EXT_MAY_ZEROOUT : 0;
+		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
+	}
+	flags |= EXT4_GET_BLOCKS_PRE_IO;
+	return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
+}
+
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
+						struct inode *inode,
+						struct ext4_map_blocks *map,
+						struct ext4_ext_path **ppath)
+{
+	struct ext4_ext_path *path = *ppath;
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	unsigned int ee_len;
+	int depth;
+	int err = 0;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		  (unsigned long long)ee_block, ee_len);
+
+	/* If extent is larger than requested it is a clear sign that we still
+	 * have some extent state machine issues left. So extent_split is still
+	 * required.
+	 * TODO: Once all related issues will be fixed this situation should be
+	 * illegal.
+	 */
+	if (ee_block != map->m_lblk || ee_len > map->m_len) {
+#ifdef EXT4_DEBUG
+		ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+			     " len %u; IO logical block %llu, len %u\n",
+			     inode->i_ino, (unsigned long long)ee_block, ee_len,
+			     (unsigned long long)map->m_lblk, map->m_len);
+#endif
+		err = ext4_split_convert_extents(handle, inode, map, ppath,
+						 EXT4_GET_BLOCKS_CONVERT);
+		if (err < 0)
+			return err;
+		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* first mark the extent as initialized */
+	ext4_ext_mark_initialized(ex);
+
+	/* note: ext4_ext_correct_indexes() isn't needed here because
+	 * borders are not changed
+	 */
+	ext4_ext_try_to_merge(handle, inode, path, ex);
+
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err;
+}
+
+static void unmap_underlying_metadata_blocks(struct block_device *bdev,
+			sector_t block, int count)
+{
+	int i;
+	for (i = 0; i < count; i++)
+                unmap_underlying_metadata(bdev, block + i);
+}
+
+/*
+ * Handle EOFBLOCKS_FL flag, clearing it if necessary
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+			      ext4_lblk_t lblk,
+			      struct ext4_ext_path *path,
+			      unsigned int len)
+{
+	int i, depth;
+	struct ext4_extent_header *eh;
+	struct ext4_extent *last_ex;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+		return 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+
+	/*
+	 * We're going to remove EOFBLOCKS_FL entirely in future so we
+	 * do not care for this case anymore. Simply remove the flag
+	 * if there are no extents.
+	 */
+	if (unlikely(!eh->eh_entries))
+		goto out;
+	last_ex = EXT_LAST_EXTENT(eh);
+	/*
+	 * We should clear the EOFBLOCKS_FL flag if we are writing the
+	 * last block in the last extent in the file.  We test this by
+	 * first checking to see if the caller to
+	 * ext4_ext_get_blocks() was interested in the last block (or
+	 * a block beyond the last block) in the current extent.  If
+	 * this turns out to be false, we can bail out from this
+	 * function immediately.
+	 */
+	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
+	    ext4_ext_get_actual_len(last_ex))
+		return 0;
+	/*
+	 * If the caller does appear to be planning to write at or
+	 * beyond the end of the current extent, we then test to see
+	 * if the current extent is the last extent in the file, by
+	 * checking to make sure it was reached via the rightmost node
+	 * at each level of the tree.
+	 */
+	for (i = depth-1; i >= 0; i--)
+		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+			return 0;
+out:
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+	return ext4_mark_inode_dirty(handle, inode);
+}
+
+/**
+ * ext4_find_delalloc_range: find delayed allocated block in the given range.
+ *
+ * Return 1 if there is a delalloc block in the range, otherwise 0.
+ */
+int ext4_find_delalloc_range(struct inode *inode,
+			     ext4_lblk_t lblk_start,
+			     ext4_lblk_t lblk_end)
+{
+	struct extent_status es;
+
+	ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
+	if (es.es_len == 0)
+		return 0; /* there is no delay extent in this tree */
+	else if (es.es_lblk <= lblk_start &&
+		 lblk_start < es.es_lblk + es.es_len)
+		return 1;
+	else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
+		return 1;
+	else
+		return 0;
+}
+
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t lblk_start, lblk_end;
+	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
+	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
+}
+
+/**
+ * Determines how many complete clusters (out of those specified by the 'map')
+ * are under delalloc and were reserved quota for.
+ * This function is called when we are writing out the blocks that were
+ * originally written with their allocation delayed, but then the space was
+ * allocated using fallocate() before the delayed allocation could be resolved.
+ * The cases to look for are:
+ * ('=' indicated delayed allocated blocks
+ *  '-' indicates non-delayed allocated blocks)
+ * (a) partial clusters towards beginning and/or end outside of allocated range
+ *     are not delalloc'ed.
+ *	Ex:
+ *	|----c---=|====c====|====c====|===-c----|
+ *	         |++++++ allocated ++++++|
+ *	==> 4 complete clusters in above example
+ *
+ * (b) partial cluster (outside of allocated range) towards either end is
+ *     marked for delayed allocation. In this case, we will exclude that
+ *     cluster.
+ *	Ex:
+ *	|----====c========|========c========|
+ *	     |++++++ allocated ++++++|
+ *	==> 1 complete clusters in above example
+ *
+ *	Ex:
+ *	|================c================|
+ *            |++++++ allocated ++++++|
+ *	==> 0 complete clusters in above example
+ *
+ * The ext4_da_update_reserve_space will be called only if we
+ * determine here that there were some "entire" clusters that span
+ * this 'allocated' range.
+ * In the non-bigalloc case, this function will just end up returning num_blks
+ * without ever calling ext4_find_delalloc_range.
+ */
+static unsigned int
+get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+			   unsigned int num_blks)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
+	ext4_lblk_t lblk_from, lblk_to, c_offset;
+	unsigned int allocated_clusters = 0;
+
+	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
+	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
+
+	/* max possible clusters for this allocation */
+	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
+
+	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
+
+	/* Check towards left side */
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
+	if (c_offset) {
+		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
+		lblk_to = lblk_from + c_offset - 1;
+
+		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
+			allocated_clusters--;
+	}
+
+	/* Now check towards right. */
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
+	if (allocated_clusters && c_offset) {
+		lblk_from = lblk_start + num_blks;
+		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
+
+		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
+			allocated_clusters--;
+	}
+
+	return allocated_clusters;
+}
+
+static int
+convert_initialized_extent(handle_t *handle, struct inode *inode,
+			   struct ext4_map_blocks *map,
+			   struct ext4_ext_path **ppath, int flags,
+			   unsigned int allocated, ext4_fsblk_t newblock)
+{
+	struct ext4_ext_path *path = *ppath;
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	unsigned int ee_len;
+	int depth;
+	int err = 0;
+
+	/*
+	 * Make sure that the extent is no bigger than we support with
+	 * unwritten extent
+	 */
+	if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
+		map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	ext_debug("%s: inode %lu, logical"
+		"block %llu, max_blocks %u\n", __func__, inode->i_ino,
+		  (unsigned long long)ee_block, ee_len);
+
+	if (ee_block != map->m_lblk || ee_len > map->m_len) {
+		err = ext4_split_convert_extents(handle, inode, map, ppath,
+				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+		if (err < 0)
+			return err;
+		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		if (!ex) {
+			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+					 (unsigned long) map->m_lblk);
+			return -EIO;
+		}
+	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		return err;
+	/* first mark the extent as unwritten */
+	ext4_ext_mark_unwritten(ex);
+
+	/* note: ext4_ext_correct_indexes() isn't needed here because
+	 * borders are not changed
+	 */
+	ext4_ext_try_to_merge(handle, inode, path, ex);
+
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+	if (err)
+		return err;
+	ext4_ext_show_leaf(inode, path);
+
+	ext4_update_inode_fsync_trans(handle, inode, 1);
+	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
+	if (err)
+		return err;
+	map->m_flags |= EXT4_MAP_UNWRITTEN;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	map->m_len = allocated;
+	return allocated;
+}
+
+static int
+ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
+			struct ext4_map_blocks *map,
+			struct ext4_ext_path **ppath, int flags,
+			unsigned int allocated, ext4_fsblk_t newblock)
+{
+	struct ext4_ext_path *path = *ppath;
+	int ret = 0;
+	int err = 0;
+	ext4_io_end_t *io = ext4_inode_aio(inode);
+
+	ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
+		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
+		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
+		  flags, allocated);
+	ext4_ext_show_leaf(inode, path);
+
+	/*
+	 * When writing into unwritten space, we should not fail to
+	 * allocate metadata blocks for the new extent block if needed.
+	 */
+	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
+	trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
+						    allocated, newblock);
+
+	/* get_block() before submit the IO, split the extent */
+	if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+		ret = ext4_split_convert_extents(handle, inode, map, ppath,
+					 flags | EXT4_GET_BLOCKS_CONVERT);
+		if (ret <= 0)
+			goto out;
+		/*
+		 * Flag the inode(non aio case) or end_io struct (aio case)
+		 * that this IO needs to conversion to written when IO is
+		 * completed
+		 */
+		if (io)
+			ext4_set_io_unwritten_flag(inode, io);
+		else
+			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
+		goto out;
+	}
+	/* IO end_io complete, convert the filled extent to written */
+	if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
+							   ppath);
+		if (ret >= 0) {
+			ext4_update_inode_fsync_trans(handle, inode, 1);
+			err = check_eofblocks_fl(handle, inode, map->m_lblk,
+						 path, map->m_len);
+		} else
+			err = ret;
+		map->m_flags |= EXT4_MAP_MAPPED;
+		map->m_pblk = newblock;
+		if (allocated > map->m_len)
+			allocated = map->m_len;
+		map->m_len = allocated;
+		goto out2;
+	}
+	/* buffered IO case */
+	/*
+	 * repeat fallocate creation request
+	 * we already have an unwritten extent
+	 */
+	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
+		goto map_out;
+	}
+
+	/* buffered READ or buffered write_begin() lookup */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+		/*
+		 * We have blocks reserved already.  We
+		 * return allocated blocks so that delalloc
+		 * won't do block reservation for us.  But
+		 * the buffer head will be unmapped so that
+		 * a read from the block returns 0s.
+		 */
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
+		goto out1;
+	}
+
+	/* buffered write, writepage time, convert*/
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
+	if (ret >= 0)
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+out:
+	if (ret <= 0) {
+		err = ret;
+		goto out2;
+	} else
+		allocated = ret;
+	map->m_flags |= EXT4_MAP_NEW;
+	/*
+	 * if we allocated more blocks than requested
+	 * we need to make sure we unmap the extra block
+	 * allocated. The actual needed block will get
+	 * unmapped later when we find the buffer_head marked
+	 * new.
+	 */
+	if (allocated > map->m_len) {
+		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+					newblock + map->m_len,
+					allocated - map->m_len);
+		allocated = map->m_len;
+	}
+	map->m_len = allocated;
+
+	/*
+	 * If we have done fallocate with the offset that is already
+	 * delayed allocated, we would have block reservation
+	 * and quota reservation done in the delayed write path.
+	 * But fallocate would have already updated quota and block
+	 * count for this offset. So cancel these reservation
+	 */
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+		unsigned int reserved_clusters;
+		reserved_clusters = get_reserved_cluster_alloc(inode,
+				map->m_lblk, map->m_len);
+		if (reserved_clusters)
+			ext4_da_update_reserve_space(inode,
+						     reserved_clusters,
+						     0);
+	}
+
+map_out:
+	map->m_flags |= EXT4_MAP_MAPPED;
+	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+					 map->m_len);
+		if (err < 0)
+			goto out2;
+	}
+out1:
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	ext4_ext_show_leaf(inode, path);
+	map->m_pblk = newblock;
+	map->m_len = allocated;
+out2:
+	return err ? err : allocated;
+}
+
+/*
+ * get_implied_cluster_alloc - check to see if the requested
+ * allocation (in the map structure) overlaps with a cluster already
+ * allocated in an extent.
+ *	@sb	The filesystem superblock structure
+ *	@map	The requested lblk->pblk mapping
+ *	@ex	The extent structure which might contain an implied
+ *			cluster allocation
+ *
+ * This function is called by ext4_ext_map_blocks() after we failed to
+ * find blocks that were already in the inode's extent tree.  Hence,
+ * we know that the beginning of the requested region cannot overlap
+ * the extent from the inode's extent tree.  There are three cases we
+ * want to catch.  The first is this case:
+ *
+ *		 |--- cluster # N--|
+ *    |--- extent ---|	|---- requested region ---|
+ *			|==========|
+ *
+ * The second case that we need to test for is this one:
+ *
+ *   |--------- cluster # N ----------------|
+ *	   |--- requested region --|   |------- extent ----|
+ *	   |=======================|
+ *
+ * The third case is when the requested region lies between two extents
+ * within the same cluster:
+ *          |------------- cluster # N-------------|
+ * |----- ex -----|                  |---- ex_right ----|
+ *                  |------ requested region ------|
+ *                  |================|
+ *
+ * In each of the above cases, we need to set the map->m_pblk and
+ * map->m_len so it corresponds to the return the extent labelled as
+ * "|====|" from cluster #N, since it is already in use for data in
+ * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
+ * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
+ * as a new "allocated" block region.  Otherwise, we will return 0 and
+ * ext4_ext_map_blocks() will then allocate one or more new clusters
+ * by calling ext4_mb_new_blocks().
+ */
+static int get_implied_cluster_alloc(struct super_block *sb,
+				     struct ext4_map_blocks *map,
+				     struct ext4_extent *ex,
+				     struct ext4_ext_path *path)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+	ext4_lblk_t ex_cluster_start, ex_cluster_end;
+	ext4_lblk_t rr_cluster_start;
+	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+	unsigned short ee_len = ext4_ext_get_actual_len(ex);
+
+	/* The extent passed in that we are trying to match */
+	ex_cluster_start = EXT4_B2C(sbi, ee_block);
+	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
+
+	/* The requested region passed into ext4_map_blocks() */
+	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
+
+	if ((rr_cluster_start == ex_cluster_end) ||
+	    (rr_cluster_start == ex_cluster_start)) {
+		if (rr_cluster_start == ex_cluster_end)
+			ee_start += ee_len - 1;
+		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
+		map->m_len = min(map->m_len,
+				 (unsigned) sbi->s_cluster_ratio - c_offset);
+		/*
+		 * Check for and handle this case:
+		 *
+		 *   |--------- cluster # N-------------|
+		 *		       |------- extent ----|
+		 *	   |--- requested region ---|
+		 *	   |===========|
+		 */
+
+		if (map->m_lblk < ee_block)
+			map->m_len = min(map->m_len, ee_block - map->m_lblk);
+
+		/*
+		 * Check for the case where there is already another allocated
+		 * block to the right of 'ex' but before the end of the cluster.
+		 *
+		 *          |------------- cluster # N-------------|
+		 * |----- ex -----|                  |---- ex_right ----|
+		 *                  |------ requested region ------|
+		 *                  |================|
+		 */
+		if (map->m_lblk > ee_block) {
+			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
+			map->m_len = min(map->m_len, next - map->m_lblk);
+		}
+
+		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
+		return 1;
+	}
+
+	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
+	return 0;
+}
+
+
+/*
+ * Block allocation/map/preallocation routine for extents based files
+ *
+ *
+ * Need to be called with
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
+ * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ *
+ * return > 0, number of of blocks already mapped/allocated
+ *          if create == 0 and these are pre-allocated blocks
+ *          	buffer head is unmapped
+ *          otherwise blocks are mapped
+ *
+ * return = 0, if plain look up failed (blocks have not been allocated)
+ *          buffer head is unmapped
+ *
+ * return < 0, error case.
+ */
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+			struct ext4_map_blocks *map, int flags)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent newex, *ex, *ex2;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_fsblk_t newblock = 0;
+	int free_on_err = 0, err = 0, depth, ret;
+	unsigned int allocated = 0, offset = 0;
+	unsigned int allocated_clusters = 0;
+	struct ext4_allocation_request ar;
+	ext4_io_end_t *io = ext4_inode_aio(inode);
+	ext4_lblk_t cluster_offset;
+	int set_unwritten = 0;
+	bool map_from_cluster = false;
+
+	ext_debug("blocks %u/%u requested for inode %lu\n",
+		  map->m_lblk, map->m_len, inode->i_ino);
+	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
+
+	/* find extent for this block */
+	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		path = NULL;
+		goto out2;
+	}
+
+	depth = ext_depth(inode);
+
+	/*
+	 * consistent leaf must not be empty;
+	 * this situation is possible, though, _during_ tree modification;
+	 * this is why assert can't be put in ext4_find_extent()
+	 */
+	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+		EXT4_ERROR_INODE(inode, "bad extent address "
+				 "lblock: %lu, depth: %d pblock %lld",
+				 (unsigned long) map->m_lblk, depth,
+				 path[depth].p_block);
+		err = -EIO;
+		goto out2;
+	}
+
+	ex = path[depth].p_ext;
+	if (ex) {
+		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+		unsigned short ee_len;
+
+
+		/*
+		 * unwritten extents are treated as holes, except that
+		 * we split out initialized portions during a write.
+		 */
+		ee_len = ext4_ext_get_actual_len(ex);
+
+		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
+
+		/* if found extent covers block, simply return it */
+		if (in_range(map->m_lblk, ee_block, ee_len)) {
+			newblock = map->m_lblk - ee_block + ee_start;
+			/* number of remaining blocks in the extent */
+			allocated = ee_len - (map->m_lblk - ee_block);
+			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
+				  ee_block, ee_len, newblock);
+
+			/*
+			 * If the extent is initialized check whether the
+			 * caller wants to convert it to unwritten.
+			 */
+			if ((!ext4_ext_is_unwritten(ex)) &&
+			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+				allocated = convert_initialized_extent(
+						handle, inode, map, &path,
+						flags, allocated, newblock);
+				goto out2;
+			} else if (!ext4_ext_is_unwritten(ex))
+				goto out;
+
+			ret = ext4_ext_handle_unwritten_extents(
+				handle, inode, map, &path, flags,
+				allocated, newblock);
+			if (ret < 0)
+				err = ret;
+			else
+				allocated = ret;
+			goto out2;
+		}
+	}
+
+	/*
+	 * requested block isn't allocated yet;
+	 * we couldn't try to create block if create flag is zero
+	 */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+		/*
+		 * put just found gap into cache to speed up
+		 * subsequent requests
+		 */
+		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+		goto out2;
+	}
+
+	/*
+	 * Okay, we need to do block allocation.
+	 */
+	newex.ee_block = cpu_to_le32(map->m_lblk);
+	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+
+	/*
+	 * If we are doing bigalloc, check to see if the extent returned
+	 * by ext4_find_extent() implies a cluster we can use.
+	 */
+	if (cluster_offset && ex &&
+	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
+		ar.len = allocated = map->m_len;
+		newblock = map->m_pblk;
+		map_from_cluster = true;
+		goto got_allocated_blocks;
+	}
+
+	/* find neighbour allocated blocks */
+	ar.lleft = map->m_lblk;
+	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
+	if (err)
+		goto out2;
+	ar.lright = map->m_lblk;
+	ex2 = NULL;
+	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
+	if (err)
+		goto out2;
+
+	/* Check if the extent after searching to the right implies a
+	 * cluster we can use. */
+	if ((sbi->s_cluster_ratio > 1) && ex2 &&
+	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+		ar.len = allocated = map->m_len;
+		newblock = map->m_pblk;
+		map_from_cluster = true;
+		goto got_allocated_blocks;
+	}
+
+	/*
+	 * See if request is beyond maximum number of blocks we can have in
+	 * a single extent. For an initialized extent this limit is
+	 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
+	 * EXT_UNWRITTEN_MAX_LEN.
+	 */
+	if (map->m_len > EXT_INIT_MAX_LEN &&
+	    !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+		map->m_len = EXT_INIT_MAX_LEN;
+	else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
+		 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+		map->m_len = EXT_UNWRITTEN_MAX_LEN;
+
+	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
+	newex.ee_len = cpu_to_le16(map->m_len);
+	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
+	if (err)
+		allocated = ext4_ext_get_actual_len(&newex);
+	else
+		allocated = map->m_len;
+
+	/* allocate new block */
+	ar.inode = inode;
+	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
+	ar.logical = map->m_lblk;
+	/*
+	 * We calculate the offset from the beginning of the cluster
+	 * for the logical block number, since when we allocate a
+	 * physical cluster, the physical block should start at the
+	 * same offset from the beginning of the cluster.  This is
+	 * needed so that future calls to get_implied_cluster_alloc()
+	 * work correctly.
+	 */
+	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
+	ar.goal -= offset;
+	ar.logical -= offset;
+	if (S_ISREG(inode->i_mode))
+		ar.flags = EXT4_MB_HINT_DATA;
+	else
+		/* disable in-core preallocation for non-regular files */
+		ar.flags = 0;
+	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+	newblock = ext4_mb_new_blocks(handle, &ar, &err);
+	if (!newblock)
+		goto out2;
+	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
+		  ar.goal, newblock, allocated);
+	free_on_err = 1;
+	allocated_clusters = ar.len;
+	ar.len = EXT4_C2B(sbi, ar.len) - offset;
+	if (ar.len > allocated)
+		ar.len = allocated;
+
+got_allocated_blocks:
+	/* try to insert new extent into found leaf and return */
+	ext4_ext_store_pblock(&newex, newblock + offset);
+	newex.ee_len = cpu_to_le16(ar.len);
+	/* Mark unwritten */
+	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+		ext4_ext_mark_unwritten(&newex);
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
+		/*
+		 * io_end structure was created for every IO write to an
+		 * unwritten extent. To avoid unnecessary conversion,
+		 * here we flag the IO that really needs the conversion.
+		 * For non asycn direct IO case, flag the inode state
+		 * that we need to perform conversion when IO is done.
+		 */
+		if (flags & EXT4_GET_BLOCKS_PRE_IO)
+			set_unwritten = 1;
+	}
+
+	err = 0;
+	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+		err = check_eofblocks_fl(handle, inode, map->m_lblk,
+					 path, ar.len);
+	if (!err)
+		err = ext4_ext_insert_extent(handle, inode, &path,
+					     &newex, flags);
+
+	if (!err && set_unwritten) {
+		if (io)
+			ext4_set_io_unwritten_flag(inode, io);
+		else
+			ext4_set_inode_state(inode,
+					     EXT4_STATE_DIO_UNWRITTEN);
+	}
+
+	if (err && free_on_err) {
+		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
+			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
+		/* free data blocks we just allocated */
+		/* not a good idea to call discard here directly,
+		 * but otherwise we'd need to call it every free() */
+		ext4_discard_preallocations(inode);
+		ext4_free_blocks(handle, inode, NULL, newblock,
+				 EXT4_C2B(sbi, allocated_clusters), fb_flags);
+		goto out2;
+	}
+
+	/* previous routine could use block we allocated */
+	newblock = ext4_ext_pblock(&newex);
+	allocated = ext4_ext_get_actual_len(&newex);
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	map->m_flags |= EXT4_MAP_NEW;
+
+	/*
+	 * Update reserved blocks/metadata blocks after successful
+	 * block allocation which had been deferred till now.
+	 */
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+		unsigned int reserved_clusters;
+		/*
+		 * Check how many clusters we had reserved this allocated range
+		 */
+		reserved_clusters = get_reserved_cluster_alloc(inode,
+						map->m_lblk, allocated);
+		if (!map_from_cluster) {
+			BUG_ON(allocated_clusters < reserved_clusters);
+			if (reserved_clusters < allocated_clusters) {
+				struct ext4_inode_info *ei = EXT4_I(inode);
+				int reservation = allocated_clusters -
+						  reserved_clusters;
+				/*
+				 * It seems we claimed few clusters outside of
+				 * the range of this allocation. We should give
+				 * it back to the reservation pool. This can
+				 * happen in the following case:
+				 *
+				 * * Suppose s_cluster_ratio is 4 (i.e., each
+				 *   cluster has 4 blocks. Thus, the clusters
+				 *   are [0-3],[4-7],[8-11]...
+				 * * First comes delayed allocation write for
+				 *   logical blocks 10 & 11. Since there were no
+				 *   previous delayed allocated blocks in the
+				 *   range [8-11], we would reserve 1 cluster
+				 *   for this write.
+				 * * Next comes write for logical blocks 3 to 8.
+				 *   In this case, we will reserve 2 clusters
+				 *   (for [0-3] and [4-7]; and not for [8-11] as
+				 *   that range has a delayed allocated blocks.
+				 *   Thus total reserved clusters now becomes 3.
+				 * * Now, during the delayed allocation writeout
+				 *   time, we will first write blocks [3-8] and
+				 *   allocate 3 clusters for writing these
+				 *   blocks. Also, we would claim all these
+				 *   three clusters above.
+				 * * Now when we come here to writeout the
+				 *   blocks [10-11], we would expect to claim
+				 *   the reservation of 1 cluster we had made
+				 *   (and we would claim it since there are no
+				 *   more delayed allocated blocks in the range
+				 *   [8-11]. But our reserved cluster count had
+				 *   already gone to 0.
+				 *
+				 *   Thus, at the step 4 above when we determine
+				 *   that there are still some unwritten delayed
+				 *   allocated blocks outside of our current
+				 *   block range, we should increment the
+				 *   reserved clusters count so that when the
+				 *   remaining blocks finally gets written, we
+				 *   could claim them.
+				 */
+				dquot_reserve_block(inode,
+						EXT4_C2B(sbi, reservation));
+				spin_lock(&ei->i_block_reservation_lock);
+				ei->i_reserved_data_blocks += reservation;
+				spin_unlock(&ei->i_block_reservation_lock);
+			}
+			/*
+			 * We will claim quota for all newly allocated blocks.
+			 * We're updating the reserved space *after* the
+			 * correction above so we do not accidentally free
+			 * all the metadata reservation because we might
+			 * actually need it later on.
+			 */
+			ext4_da_update_reserve_space(inode, allocated_clusters,
+							1);
+		}
+	}
+
+	/*
+	 * Cache the extent and update transaction to commit on fdatasync only
+	 * when it is _not_ an unwritten extent.
+	 */
+	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+	else
+		ext4_update_inode_fsync_trans(handle, inode, 0);
+out:
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	ext4_ext_show_leaf(inode, path);
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = newblock;
+	map->m_len = allocated;
+out2:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	trace_ext4_ext_map_blocks_exit(inode, flags, map,
+				       err ? err : allocated);
+	return err ? err : allocated;
+}
+
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t last_block;
+	int err = 0;
+
+	/*
+	 * TODO: optimization is possible here.
+	 * Probably we need not scan at all,
+	 * because page truncation is enough.
+	 */
+
+	/* we have to know where to truncate from in crash case */
+	EXT4_I(inode)->i_disksize = inode->i_size;
+	ext4_mark_inode_dirty(handle, inode);
+
+	last_block = (inode->i_size + sb->s_blocksize - 1)
+			>> EXT4_BLOCK_SIZE_BITS(sb);
+retry:
+	err = ext4_es_remove_extent(inode, last_block,
+				    EXT_MAX_BLOCKS - last_block);
+	if (err == -ENOMEM) {
+		cond_resched();
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		goto retry;
+	}
+	if (err) {
+		ext4_std_error(inode->i_sb, err);
+		return;
+	}
+	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+	ext4_std_error(inode->i_sb, err);
+}
+
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+				  ext4_lblk_t len, loff_t new_size,
+				  int flags, int mode)
+{
+	struct inode *inode = file_inode(file);
+	handle_t *handle;
+	int ret = 0;
+	int ret2 = 0;
+	int retries = 0;
+	struct ext4_map_blocks map;
+	unsigned int credits;
+	loff_t epos;
+
+	map.m_lblk = offset;
+	map.m_len = len;
+	/*
+	 * Don't normalize the request if it can fit in one extent so
+	 * that it doesn't get unnecessarily split into multiple
+	 * extents.
+	 */
+	if (len <= EXT_UNWRITTEN_MAX_LEN)
+		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+	/*
+	 * credits to insert 1 extent into extent tree
+	 */
+	credits = ext4_chunk_trans_blocks(inode, len);
+
+retry:
+	while (ret >= 0 && len) {
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			break;
+		}
+		ret = ext4_map_blocks(handle, inode, &map, flags);
+		if (ret <= 0) {
+			ext4_debug("inode #%lu: block %u: len %u: "
+				   "ext4_ext_map_blocks returned %d",
+				   inode->i_ino, map.m_lblk,
+				   map.m_len, ret);
+			ext4_mark_inode_dirty(handle, inode);
+			ret2 = ext4_journal_stop(handle);
+			break;
+		}
+		map.m_lblk += ret;
+		map.m_len = len = len - ret;
+		epos = (loff_t)map.m_lblk << inode->i_blkbits;
+		inode->i_ctime = ext4_current_time(inode);
+		if (new_size) {
+			if (epos > new_size)
+				epos = new_size;
+			if (ext4_update_inode_size(inode, epos) & 0x1)
+				inode->i_mtime = inode->i_ctime;
+		} else {
+			if (epos > inode->i_size)
+				ext4_set_inode_flag(inode,
+						    EXT4_INODE_EOFBLOCKS);
+		}
+		ext4_mark_inode_dirty(handle, inode);
+		ret2 = ext4_journal_stop(handle);
+		if (ret2)
+			break;
+	}
+	if (ret == -ENOSPC &&
+			ext4_should_retry_alloc(inode->i_sb, &retries)) {
+		ret = 0;
+		goto retry;
+	}
+
+	return ret > 0 ? ret2 : ret;
+}
+
+static long ext4_zero_range(struct file *file, loff_t offset,
+			    loff_t len, int mode)
+{
+	struct inode *inode = file_inode(file);
+	handle_t *handle = NULL;
+	unsigned int max_blocks;
+	loff_t new_size = 0;
+	int ret = 0;
+	int flags;
+	int credits;
+	int partial_begin, partial_end;
+	loff_t start, end;
+	ext4_lblk_t lblk;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned int blkbits = inode->i_blkbits;
+
+	trace_ext4_zero_range(inode, offset, len, mode);
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	/* Call ext4_force_commit to flush all data in case of data=journal. */
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Write out all dirty pages to avoid race conditions
+	 * Then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		ret = filemap_write_and_wait_range(mapping, offset,
+						   offset + len - 1);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Round up offset. This is not fallocate, we neet to zero out
+	 * blocks, so convert interior block aligned part of the range to
+	 * unwritten and possibly manually zero out unaligned parts of the
+	 * range.
+	 */
+	start = round_up(offset, 1 << blkbits);
+	end = round_down((offset + len), 1 << blkbits);
+
+	if (start < offset || end > offset + len)
+		return -EINVAL;
+	partial_begin = offset & ((1 << blkbits) - 1);
+	partial_end = (offset + len) & ((1 << blkbits) - 1);
+
+	lblk = start >> blkbits;
+	max_blocks = (end >> blkbits);
+	if (max_blocks < lblk)
+		max_blocks = 0;
+	else
+		max_blocks -= lblk;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * Indirect files do not support unwritten extnets
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	     offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out_mutex;
+	}
+
+	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+	if (mode & FALLOC_FL_KEEP_SIZE)
+		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+	/* Preallocate the range including the unaligned edges */
+	if (partial_begin || partial_end) {
+		ret = ext4_alloc_file_blocks(file,
+				round_down(offset, 1 << blkbits) >> blkbits,
+				(round_up((offset + len), 1 << blkbits) -
+				 round_down(offset, 1 << blkbits)) >> blkbits,
+				new_size, flags, mode);
+		if (ret)
+			goto out_mutex;
+
+	}
+
+	/* Zero range excluding the unaligned edges */
+	if (max_blocks > 0) {
+		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+			  EXT4_EX_NOCACHE);
+
+		/* Now release the pages and zero block aligned part of pages*/
+		truncate_pagecache_range(inode, start, end - 1);
+		inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+		/* Wait all existing dio workers, newcomers will block on i_mutex */
+		ext4_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+
+		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+					     flags, mode);
+		if (ret)
+			goto out_dio;
+	}
+	if (!partial_begin && !partial_end)
+		goto out_dio;
+
+	/*
+	 * In worst case we have to writeout two nonadjacent unwritten
+	 * blocks and update the inode
+	 */
+	credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
+	if (ext4_should_journal_data(inode))
+		credits += 2;
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		ext4_std_error(inode->i_sb, ret);
+		goto out_dio;
+	}
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	if (new_size) {
+		ext4_update_inode_size(inode, new_size);
+	} else {
+		/*
+		* Mark that we allocate beyond EOF so the subsequent truncate
+		* can proceed even if the new size is the same as i_size.
+		*/
+		if ((offset + len) > i_size_read(inode))
+			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+	}
+	ext4_mark_inode_dirty(handle, inode);
+
+	/* Zero out partial block at the edges of the range */
+	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+	if (file->f_flags & O_SYNC)
+		ext4_handle_sync(handle);
+
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/*
+ * preallocate space for a file. This implements ext4's fallocate file
+ * operation, which gets called from sys_fallocate system call.
+ * For block-mapped files, posix_fallocate should fall back to the method
+ * of writing zeroes to the required new blocks (the same behavior which is
+ * expected for file systems which do not support fallocate() system call).
+ */
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	loff_t new_size = 0;
+	unsigned int max_blocks;
+	int ret = 0;
+	int flags;
+	ext4_lblk_t lblk;
+	unsigned int blkbits = inode->i_blkbits;
+
+	/*
+	 * Encrypted inodes can't handle collapse range or insert
+	 * range since we would need to re-encrypt blocks with a
+	 * different IV or XTS tweak (which are based on the logical
+	 * block number).
+	 *
+	 * XXX It's not clear why zero range isn't working, but we'll
+	 * leave it disabled for encrypted inodes for now.  This is a
+	 * bug we should fix....
+	 */
+	if (ext4_encrypted_inode(inode) &&
+	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
+		return -EOPNOTSUPP;
+
+	/* Return error if mode is not supported */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+		return -EOPNOTSUPP;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return ext4_punch_hole(inode, offset, len);
+
+	ret = ext4_convert_inline_data(inode);
+	if (ret)
+		return ret;
+
+	if (mode & FALLOC_FL_COLLAPSE_RANGE)
+		return ext4_collapse_range(inode, offset, len);
+
+	if (mode & FALLOC_FL_ZERO_RANGE)
+		return ext4_zero_range(file, offset, len, mode);
+
+	trace_ext4_fallocate_enter(inode, offset, len, mode);
+	lblk = offset >> blkbits;
+	/*
+	 * We can't just convert len to max_blocks because
+	 * If blocksize = 4096 offset = 3072 and len = 2048
+	 */
+	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+		- lblk;
+
+	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+	if (mode & FALLOC_FL_KEEP_SIZE)
+		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * We only support preallocation for extent-based files only
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	     offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out;
+	}
+
+	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+				     flags, mode);
+	if (ret)
+		goto out;
+
+	if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+		ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+						EXT4_I(inode)->i_sync_tid);
+	}
+out:
+	mutex_unlock(&inode->i_mutex);
+	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+	return ret;
+}
+
+/*
+ * This function convert a range of blocks to written extents
+ * The caller of this function will pass the start offset and the size.
+ * all unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
+{
+	unsigned int max_blocks;
+	int ret = 0;
+	int ret2 = 0;
+	struct ext4_map_blocks map;
+	unsigned int credits, blkbits = inode->i_blkbits;
+
+	map.m_lblk = offset >> blkbits;
+	/*
+	 * We can't just convert len to max_blocks because
+	 * If blocksize = 4096 offset = 3072 and len = 2048
+	 */
+	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
+		      map.m_lblk);
+	/*
+	 * This is somewhat ugly but the idea is clear: When transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
+	 */
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
+	while (ret >= 0 && ret < max_blocks) {
+		map.m_lblk += ret;
+		map.m_len = (max_blocks -= ret);
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
+		}
+		ret = ext4_map_blocks(handle, inode, &map,
+				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+		if (ret <= 0)
+			ext4_warning(inode->i_sb,
+				     "inode #%lu: block %u: len %u: "
+				     "ext4_ext_map_blocks returned %d",
+				     inode->i_ino, map.m_lblk,
+				     map.m_len, ret);
+		ext4_mark_inode_dirty(handle, inode);
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
+			break;
+	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
+	return ret > 0 ? ret2 : ret;
+}
+
+/*
+ * If newes is not existing extent (newes->ec_pblk equals zero) find
+ * delayed extent at start of newes and update newes accordingly and
+ * return start of the next delayed extent.
+ *
+ * If newes is existing extent (newes->ec_pblk is not equal zero)
+ * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
+ * extent found. Leave newes unmodified.
+ */
+static int ext4_find_delayed_extent(struct inode *inode,
+				    struct extent_status *newes)
+{
+	struct extent_status es;
+	ext4_lblk_t block, next_del;
+
+	if (newes->es_pblk == 0) {
+		ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
+				newes->es_lblk + newes->es_len - 1, &es);
+
+		/*
+		 * No extent in extent-tree contains block @newes->es_pblk,
+		 * then the block may stay in 1)a hole or 2)delayed-extent.
+		 */
+		if (es.es_len == 0)
+			/* A hole found. */
+			return 0;
+
+		if (es.es_lblk > newes->es_lblk) {
+			/* A hole found. */
+			newes->es_len = min(es.es_lblk - newes->es_lblk,
+					    newes->es_len);
+			return 0;
+		}
+
+		newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
+	}
+
+	block = newes->es_lblk + newes->es_len;
+	ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
+	if (es.es_len == 0)
+		next_del = EXT_MAX_BLOCKS;
+	else
+		next_del = es.es_lblk;
+
+	return next_del;
+}
+/* fiemap flags we can handle specified here */
+#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+static int ext4_xattr_fiemap(struct inode *inode,
+				struct fiemap_extent_info *fieinfo)
+{
+	__u64 physical = 0;
+	__u64 length;
+	__u32 flags = FIEMAP_EXTENT_LAST;
+	int blockbits = inode->i_sb->s_blocksize_bits;
+	int error = 0;
+
+	/* in-inode? */
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+		struct ext4_iloc iloc;
+		int offset;	/* offset of xattr in inode */
+
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error)
+			return error;
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
+		offset = EXT4_GOOD_OLD_INODE_SIZE +
+				EXT4_I(inode)->i_extra_isize;
+		physical += offset;
+		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+		flags |= FIEMAP_EXTENT_DATA_INLINE;
+		brelse(iloc.bh);
+	} else { /* external block */
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
+		length = inode->i_sb->s_blocksize;
+	}
+
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, 0, physical,
+						length, flags);
+	return (error < 0 ? error : 0);
+}
+
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	ext4_lblk_t start_blk;
+	int error = 0;
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline = 1;
+
+		error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
+						start, len);
+
+		if (has_inline)
+			return error;
+	}
+
+	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+		error = ext4_ext_precache(inode);
+		if (error)
+			return error;
+	}
+
+	/* fallback to generic here if not in extents fmt */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return generic_block_fiemap(inode, fieinfo, start, len,
+			ext4_get_block);
+
+	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+		return -EBADR;
+
+	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+		error = ext4_xattr_fiemap(inode, fieinfo);
+	} else {
+		ext4_lblk_t len_blks;
+		__u64 last_blk;
+
+		start_blk = start >> inode->i_sb->s_blocksize_bits;
+		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+		if (last_blk >= EXT_MAX_BLOCKS)
+			last_blk = EXT_MAX_BLOCKS-1;
+		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
+
+		/*
+		 * Walk the extent tree gathering extent information
+		 * and pushing extents back to the user.
+		 */
+		error = ext4_fill_fiemap_extents(inode, start_blk,
+						 len_blks, fieinfo);
+	}
+	return error;
+}
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+		struct ext4_ext_path *path)
+{
+	int credits, err;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	/*
+	 * Check if need to extend journal credits
+	 * 3 for leaf, sb, and inode plus 2 (bmap and group
+	 * descriptor) for each block group; assume two block
+	 * groups
+	 */
+	if (handle->h_buffer_credits < 7) {
+		credits = ext4_writepage_trans_blocks(inode);
+		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+		/* EAGAIN is success */
+		if (err && err != -EAGAIN)
+			return err;
+	}
+
+	err = ext4_ext_get_access(handle, inode, path);
+	return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+			    struct inode *inode, handle_t *handle,
+			    ext4_lblk_t *start)
+{
+	int depth, err = 0;
+	struct ext4_extent *ex_start, *ex_last;
+	bool update = 0;
+	depth = path->p_depth;
+
+	while (depth >= 0) {
+		if (depth == path->p_depth) {
+			ex_start = path[depth].p_ext;
+			if (!ex_start)
+				return -EIO;
+
+			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+
+			err = ext4_access_path(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+				update = 1;
+
+			*start = le32_to_cpu(ex_last->ee_block) +
+				ext4_ext_get_actual_len(ex_last);
+
+			while (ex_start <= ex_last) {
+				le32_add_cpu(&ex_start->ee_block, -shift);
+				/* Try to merge to the left. */
+				if ((ex_start >
+				     EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
+				    ext4_ext_try_to_merge_right(inode,
+							path, ex_start - 1))
+					ex_last--;
+				else
+					ex_start++;
+			}
+			err = ext4_ext_dirty(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			if (--depth < 0 || !update)
+				break;
+		}
+
+		/* Update index too */
+		err = ext4_access_path(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		/* we are done if current index is not a starting index */
+		if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+			break;
+
+		depth--;
+	}
+
+out:
+	return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lies in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+		       ext4_lblk_t start, ext4_lblk_t shift)
+{
+	struct ext4_ext_path *path;
+	int ret = 0, depth;
+	struct ext4_extent *extent;
+	ext4_lblk_t stop_block;
+	ext4_lblk_t ex_start, ex_end;
+
+	/* Let path point to the last extent */
+	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	depth = path->p_depth;
+	extent = path[depth].p_ext;
+	if (!extent)
+		goto out;
+
+	stop_block = le32_to_cpu(extent->ee_block) +
+			ext4_ext_get_actual_len(extent);
+
+	/* Nothing to shift, if hole is at the end of file */
+	if (start >= stop_block)
+		goto out;
+
+	/*
+	 * Don't start shifting extents until we make sure the hole is big
+	 * enough to accomodate the shift.
+	 */
+	path = ext4_find_extent(inode, start - 1, &path, 0);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	depth = path->p_depth;
+	extent =  path[depth].p_ext;
+	if (extent) {
+		ex_start = le32_to_cpu(extent->ee_block);
+		ex_end = le32_to_cpu(extent->ee_block) +
+			ext4_ext_get_actual_len(extent);
+	} else {
+		ex_start = 0;
+		ex_end = 0;
+	}
+
+	if ((start == ex_start && shift > ex_start) ||
+	    (shift > start - ex_end))
+		return -EINVAL;
+
+	/* Its safe to start updating extents */
+	while (start < stop_block) {
+		path = ext4_find_extent(inode, start, &path, 0);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+		depth = path->p_depth;
+		extent = path[depth].p_ext;
+		if (!extent) {
+			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+					 (unsigned long) start);
+			return -EIO;
+		}
+		if (start > le32_to_cpu(extent->ee_block)) {
+			/* Hole, move to the next extent */
+			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
+				path[depth].p_ext++;
+			} else {
+				start = ext4_ext_next_allocated_block(path);
+				continue;
+			}
+		}
+		ret = ext4_ext_shift_path_extents(path, shift, inode,
+				handle, &start);
+		if (ret)
+			break;
+	}
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	return ret;
+}
+
+/*
+ * ext4_collapse_range:
+ * This implements the fallocate's collapse range functionality for ext4
+ * Returns: 0 and non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t punch_start, punch_stop;
+	handle_t *handle;
+	unsigned int credits;
+	loff_t new_size, ioffset;
+	int ret;
+
+	/*
+	 * We need to test this early because xfstests assumes that a
+	 * collapse range of (0, 1) will return EOPNOTSUPP if the file
+	 * system does not support collapse range.
+	 */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return -EOPNOTSUPP;
+
+	/* Collapse range works only on fs block size aligned offsets. */
+	if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
+	    len & (EXT4_CLUSTER_SIZE(sb) - 1))
+		return -EINVAL;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	trace_ext4_collapse_range(inode, offset, len);
+
+	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	/* Call ext4_force_commit to flush all data in case of data=journal. */
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Need to round down offset to be aligned with page size boundary
+	 * for page size > block size.
+	 */
+	ioffset = round_down(offset, PAGE_SIZE);
+
+	/* Write out all dirty pages */
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+					   LLONG_MAX);
+	if (ret)
+		return ret;
+
+	/* Take mutex lock */
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * There is no need to overlap collapse range with EOF, in which case
+	 * it is effectively a truncate operation
+	 */
+	if (offset + len >= i_size_read(inode)) {
+		ret = -EINVAL;
+		goto out_mutex;
+	}
+
+	/* Currently just for extent based files */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	truncate_pagecache(inode, ioffset);
+
+	/* Wait for existing dio to complete */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	credits = ext4_writepage_trans_blocks(inode);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_dio;
+	}
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	ret = ext4_es_remove_extent(inode, punch_start,
+				    EXT_MAX_BLOCKS - punch_start);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+	ext4_discard_preallocations(inode);
+
+	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+				     punch_stop - punch_start);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	new_size = i_size_read(inode) - len;
+	i_size_write(inode, new_size);
+	EXT4_I(inode)->i_disksize = new_size;
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/**
+ * ext4_swap_extents - Swap extents between two inodes
+ *
+ * @inode1:	First inode
+ * @inode2:	Second inode
+ * @lblk1:	Start block for first inode
+ * @lblk2:	Start block for second inode
+ * @count:	Number of blocks to swap
+ * @mark_unwritten: Mark second inode's extents as unwritten after swap
+ * @erp:	Pointer to save error value
+ *
+ * This helper routine does exactly what is promise "swap extents". All other
+ * stuff such as page-cache locking consistency, bh mapping consistency or
+ * extent's data copying must be performed by caller.
+ * Locking:
+ * 		i_mutex is held for both inodes
+ * 		i_data_sem is locked for write for both inodes
+ * Assumptions:
+ *		All pages from requested range are locked for both inodes
+ */
+int
+ext4_swap_extents(handle_t *handle, struct inode *inode1,
+		     struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, int unwritten, int *erp)
+{
+	struct ext4_ext_path *path1 = NULL;
+	struct ext4_ext_path *path2 = NULL;
+	int replaced_count = 0;
+
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+
+	*erp = ext4_es_remove_extent(inode1, lblk1, count);
+	if (unlikely(*erp))
+		return 0;
+	*erp = ext4_es_remove_extent(inode2, lblk2, count);
+	if (unlikely(*erp))
+		return 0;
+
+	while (count) {
+		struct ext4_extent *ex1, *ex2, tmp_ex;
+		ext4_lblk_t e1_blk, e2_blk;
+		int e1_len, e2_len, len;
+		int split = 0;
+
+		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+		if (unlikely(IS_ERR(path1))) {
+			*erp = PTR_ERR(path1);
+			path1 = NULL;
+		finish:
+			count = 0;
+			goto repeat;
+		}
+		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+		if (unlikely(IS_ERR(path2))) {
+			*erp = PTR_ERR(path2);
+			path2 = NULL;
+			goto finish;
+		}
+		ex1 = path1[path1->p_depth].p_ext;
+		ex2 = path2[path2->p_depth].p_ext;
+		/* Do we have somthing to swap ? */
+		if (unlikely(!ex2 || !ex1))
+			goto finish;
+
+		e1_blk = le32_to_cpu(ex1->ee_block);
+		e2_blk = le32_to_cpu(ex2->ee_block);
+		e1_len = ext4_ext_get_actual_len(ex1);
+		e2_len = ext4_ext_get_actual_len(ex2);
+
+		/* Hole handling */
+		if (!in_range(lblk1, e1_blk, e1_len) ||
+		    !in_range(lblk2, e2_blk, e2_len)) {
+			ext4_lblk_t next1, next2;
+
+			/* if hole after extent, then go to next extent */
+			next1 = ext4_ext_next_allocated_block(path1);
+			next2 = ext4_ext_next_allocated_block(path2);
+			/* If hole before extent, then shift to that extent */
+			if (e1_blk > lblk1)
+				next1 = e1_blk;
+			if (e2_blk > lblk2)
+				next2 = e1_blk;
+			/* Do we have something to swap */
+			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
+				goto finish;
+			/* Move to the rightest boundary */
+			len = next1 - lblk1;
+			if (len < next2 - lblk2)
+				len = next2 - lblk2;
+			if (len > count)
+				len = count;
+			lblk1 += len;
+			lblk2 += len;
+			count -= len;
+			goto repeat;
+		}
+
+		/* Prepare left boundary */
+		if (e1_blk < lblk1) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		if (e2_blk < lblk2) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2,  lblk2, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must to be revalidated. */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary */
+		len = count;
+		if (len > e1_blk + e1_len - lblk1)
+			len = e1_blk + e1_len - lblk1;
+		if (len > e2_blk + e2_len - lblk2)
+			len = e2_blk + e2_len - lblk2;
+
+		if (len != e1_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1 + len, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		if (len != e2_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2, lblk2 + len, 0);
+			if (*erp)
+				goto finish;
+		}
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must to be revalidated. */
+		if (split)
+			goto repeat;
+
+		BUG_ON(e2_len != e1_len);
+		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		/* Both extents are fully inside boundaries. Swap it now */
+		tmp_ex = *ex1;
+		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+		ex1->ee_len = cpu_to_le16(e2_len);
+		ex2->ee_len = cpu_to_le16(e1_len);
+		if (unwritten)
+			ext4_ext_mark_unwritten(ex2);
+		if (ext4_ext_is_unwritten(&tmp_ex))
+			ext4_ext_mark_unwritten(ex1);
+
+		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
+		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
+		*erp = ext4_ext_dirty(handle, inode2, path2 +
+				      path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_dirty(handle, inode1, path1 +
+				      path1->p_depth);
+		/*
+		 * Looks scarry ah..? second inode already points to new blocks,
+		 * and it was successfully dirtied. But luckily error may happen
+		 * only due to journal error, so full transaction will be
+		 * aborted anyway.
+		 */
+		if (unlikely(*erp))
+			goto finish;
+		lblk1 += len;
+		lblk2 += len;
+		replaced_count += len;
+		count -= len;
+
+	repeat:
+		ext4_ext_drop_refs(path1);
+		kfree(path1);
+		ext4_ext_drop_refs(path2);
+		kfree(path2);
+		path1 = path2 = NULL;
+	}
+	return replaced_count;
+}
diff --git a/kernel/fs/ext4/extents_status.c b/kernel/fs/ext4/extents_status.c
new file mode 100644
index 000000000..26724aeec
--- /dev/null
+++ b/kernel/fs/ext4/extents_status.c
@@ -0,0 +1,1310 @@
+/*
+ *  fs/ext4/extents_status.c
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ *	Allison Henderson <achender@linux.vnet.ibm.com>
+ *	Hugh Dickins <hughd@google.com>
+ *	Zheng Liu <wenqing.lz@taobao.com>
+ *
+ * Ext4 extents status tree core functions.
+ */
+#include <linux/list_sort.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "ext4.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * According to previous discussion in Ext4 Developer Workshop, we
+ * will introduce a new structure called io tree to track all extent
+ * status in order to solve some problems that we have met
+ * (e.g. Reservation space warning), and provide extent-level locking.
+ * Delay extent tree is the first step to achieve this goal.  It is
+ * original built by Yongqiang Yang.  At that time it is called delay
+ * extent tree, whose goal is only track delayed extents in memory to
+ * simplify the implementation of fiemap and bigalloc, and introduce
+ * lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still called
+ * delay extent tree at the first commit.  But for better understand
+ * what it does, it has been rename to extent status tree.
+ *
+ * Step1:
+ * Currently the first step has been done.  All delayed extents are
+ * tracked in the tree.  It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
+ * invalidated.  Therefore the implementation of fiemap and bigalloc
+ * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
+ *
+ * The following comment describes the implemenmtation of extent
+ * status tree and future works.
+ *
+ * Step2:
+ * In this step all extent status are tracked by extent status tree.
+ * Thus, we can first try to lookup a block mapping in this tree before
+ * finding it in extent tree.  Hence, single extent cache can be removed
+ * because extent status tree can do a better job.  Extents in status
+ * tree are loaded on-demand.  Therefore, the extent status tree may not
+ * contain all of the extents in a file.  Meanwhile we define a shrinker
+ * to reclaim memory from extent status tree because fragmented extent
+ * tree will make status tree cost too much memory.  written/unwritten/-
+ * hole extents in the tree will be reclaimed by this shrinker when we
+ * are under high memory pressure.  Delayed extents will not be
+ * reclimed because fiemap, bigalloc, and seek_data/hole need it.
+ */
+
+/*
+ * Extent status tree implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * Extent status tree tracks all extent status.
+ *
+ * 1. Why we need to implement extent status tree?
+ *
+ * Without extent status tree, ext4 identifies a delayed extent by looking
+ * up page cache, this has several deficiencies - complicated, buggy,
+ * and inefficient code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks are belonged to a delayed extent.
+ *
+ * Let us have a look at how they do without extent status tree.
+ *   --	FIEMAP
+ *	FIEMAP looks up page cache to identify delayed allocations from holes.
+ *
+ *   --	SEEK_HOLE/DATA
+ *	SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ *   --	bigalloc
+ *	bigalloc looks up page cache to figure out if a block is
+ *	already under delayed allocation or not to determine whether
+ *	quota reserving is needed for the cluster.
+ *
+ *   --	writeout
+ *	Writeout looks up whole page cache to see if a buffer is
+ *	mapped, If there are not very many delayed buffers, then it is
+ *	time comsuming.
+ *
+ * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
+ * bigalloc and writeout can figure out if a block or a range of
+ * blocks is under delayed allocation(belonged to a delayed extent) or
+ * not by searching the extent tree.
+ *
+ *
+ * ==========================================================================
+ * 2. Ext4 extent status tree impelmentation
+ *
+ *   --	extent
+ *	A extent is a range of blocks which are contiguous logically and
+ *	physically.  Unlike extent in extent tree, this extent in ext4 is
+ *	a in-memory struct, there is no corresponding on-disk data.  There
+ *	is no limit on length of extent, so an extent can contain as many
+ *	blocks as they are contiguous logically and physically.
+ *
+ *   --	extent status tree
+ *	Every inode has an extent status tree and all allocation blocks
+ *	are added to the tree with different status.  The extent in the
+ *	tree are ordered by logical block no.
+ *
+ *   --	operations on a extent status tree
+ *	There are three important operations on a delayed extent tree: find
+ *	next extent, adding a extent(a range of blocks) and removing a extent.
+ *
+ *   --	race on a extent status tree
+ *	Extent status tree is protected by inode->i_es_lock.
+ *
+ *   --	memory consumption
+ *      Fragmented extent tree will make extent status tree cost too much
+ *      memory.  Hence, we will reclaim written/unwritten/hole extents from
+ *      the tree under a heavy memory pressure.
+ *
+ *
+ * ==========================================================================
+ * 3. Performance analysis
+ *
+ *   --	overhead
+ *	1. There is a cache extent for write access, so if writes are
+ *	not very random, adding space operaions are in O(1) time.
+ *
+ *   --	gain
+ *	2. Code is much simpler, more readable, more maintainable and
+ *	more efficient.
+ *
+ *
+ * ==========================================================================
+ * 4. TODO list
+ *
+ *   -- Refactor delayed space reservation
+ *
+ *   -- Extent-level locking
+ */
+
+static struct kmem_cache *ext4_es_cachep;
+
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_lblk_t end);
+static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
+static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+		       struct ext4_inode_info *locked_ei);
+
+int __init ext4_init_es(void)
+{
+	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
+					   sizeof(struct extent_status),
+					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
+	if (ext4_es_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void ext4_exit_es(void)
+{
+	if (ext4_es_cachep)
+		kmem_cache_destroy(ext4_es_cachep);
+}
+
+void ext4_es_init_tree(struct ext4_es_tree *tree)
+{
+	tree->root = RB_ROOT;
+	tree->cache_es = NULL;
+}
+
+#ifdef ES_DEBUG__
+static void ext4_es_print_tree(struct inode *inode)
+{
+	struct ext4_es_tree *tree;
+	struct rb_node *node;
+
+	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
+	tree = &EXT4_I(inode)->i_es_tree;
+	node = rb_first(&tree->root);
+	while (node) {
+		struct extent_status *es;
+		es = rb_entry(node, struct extent_status, rb_node);
+		printk(KERN_DEBUG " [%u/%u) %llu %x",
+		       es->es_lblk, es->es_len,
+		       ext4_es_pblock(es), ext4_es_status(es));
+		node = rb_next(node);
+	}
+	printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_es_print_tree(inode)
+#endif
+
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
+{
+	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+	return es->es_lblk + es->es_len - 1;
+}
+
+/*
+ * search through the tree for an delayed extent with a given offset.  If
+ * it can't be found, try to find next extent.
+ */
+static struct extent_status *__es_tree_search(struct rb_root *root,
+					      ext4_lblk_t lblk)
+{
+	struct rb_node *node = root->rb_node;
+	struct extent_status *es = NULL;
+
+	while (node) {
+		es = rb_entry(node, struct extent_status, rb_node);
+		if (lblk < es->es_lblk)
+			node = node->rb_left;
+		else if (lblk > ext4_es_end(es))
+			node = node->rb_right;
+		else
+			return es;
+	}
+
+	if (es && lblk < es->es_lblk)
+		return es;
+
+	if (es && lblk > ext4_es_end(es)) {
+		node = rb_next(&es->rb_node);
+		return node ? rb_entry(node, struct extent_status, rb_node) :
+			      NULL;
+	}
+
+	return NULL;
+}
+
+/*
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
+ *
+ * @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
+ * @end: the offset where we stop to search
+ * @es: delayed extent that we found
+ */
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+				 ext4_lblk_t lblk, ext4_lblk_t end,
+				 struct extent_status *es)
+{
+	struct ext4_es_tree *tree = NULL;
+	struct extent_status *es1 = NULL;
+	struct rb_node *node;
+
+	BUG_ON(es == NULL);
+	BUG_ON(end < lblk);
+	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
+
+	read_lock(&EXT4_I(inode)->i_es_lock);
+	tree = &EXT4_I(inode)->i_es_tree;
+
+	/* find extent in cache firstly */
+	es->es_lblk = es->es_len = es->es_pblk = 0;
+	if (tree->cache_es) {
+		es1 = tree->cache_es;
+		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+			es_debug("%u cached by [%u/%u) %llu %x\n",
+				 lblk, es1->es_lblk, es1->es_len,
+				 ext4_es_pblock(es1), ext4_es_status(es1));
+			goto out;
+		}
+	}
+
+	es1 = __es_tree_search(&tree->root, lblk);
+
+out:
+	if (es1 && !ext4_es_is_delayed(es1)) {
+		while ((node = rb_next(&es1->rb_node)) != NULL) {
+			es1 = rb_entry(node, struct extent_status, rb_node);
+			if (es1->es_lblk > end) {
+				es1 = NULL;
+				break;
+			}
+			if (ext4_es_is_delayed(es1))
+				break;
+		}
+	}
+
+	if (es1 && ext4_es_is_delayed(es1)) {
+		tree->cache_es = es1;
+		es->es_lblk = es1->es_lblk;
+		es->es_len = es1->es_len;
+		es->es_pblk = es1->es_pblk;
+	}
+
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+}
+
+static void ext4_es_list_add(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	if (!list_empty(&ei->i_es_list))
+		return;
+
+	spin_lock(&sbi->s_es_lock);
+	if (list_empty(&ei->i_es_list)) {
+		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
+		sbi->s_es_nr_inode++;
+	}
+	spin_unlock(&sbi->s_es_lock);
+}
+
+static void ext4_es_list_del(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	spin_lock(&sbi->s_es_lock);
+	if (!list_empty(&ei->i_es_list)) {
+		list_del_init(&ei->i_es_list);
+		sbi->s_es_nr_inode--;
+		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
+	}
+	spin_unlock(&sbi->s_es_lock);
+}
+
+static struct extent_status *
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+		     ext4_fsblk_t pblk)
+{
+	struct extent_status *es;
+	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+	if (es == NULL)
+		return NULL;
+	es->es_lblk = lblk;
+	es->es_len = len;
+	es->es_pblk = pblk;
+
+	/*
+	 * We don't count delayed extent because we never try to reclaim them
+	 */
+	if (!ext4_es_is_delayed(es)) {
+		if (!EXT4_I(inode)->i_es_shk_nr++)
+			ext4_es_list_add(inode);
+		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+					s_es_stats.es_stats_shk_cnt);
+	}
+
+	EXT4_I(inode)->i_es_all_nr++;
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
+	return es;
+}
+
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+{
+	EXT4_I(inode)->i_es_all_nr--;
+	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
+	/* Decrease the shrink counter when this es is not delayed */
+	if (!ext4_es_is_delayed(es)) {
+		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
+		if (!--EXT4_I(inode)->i_es_shk_nr)
+			ext4_es_list_del(inode);
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+					s_es_stats.es_stats_shk_cnt);
+	}
+
+	kmem_cache_free(ext4_es_cachep, es);
+}
+
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ *  - logical block number is contiguous
+ *  - physical block number is contiguous
+ *  - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+				 struct extent_status *es2)
+{
+	if (ext4_es_type(es1) != ext4_es_type(es2))
+		return 0;
+
+	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+		pr_warn("ES assertion failed when merging extents. "
+			"The sum of lengths of es1 (%d) and es2 (%d) "
+			"is bigger than allowed file size (%d)\n",
+			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
+		WARN_ON(1);
+		return 0;
+	}
+
+	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
+		return 0;
+
+	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
+		return 1;
+
+	if (ext4_es_is_hole(es1))
+		return 1;
+
+	/* we need to check delayed extent is without unwritten status */
+	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+		return 1;
+
+	return 0;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct extent_status *es1;
+	struct rb_node *node;
+
+	node = rb_prev(&es->rb_node);
+	if (!node)
+		return es;
+
+	es1 = rb_entry(node, struct extent_status, rb_node);
+	if (ext4_es_can_be_merged(es1, es)) {
+		es1->es_len += es->es_len;
+		if (ext4_es_is_referenced(es))
+			ext4_es_set_referenced(es1);
+		rb_erase(&es->rb_node, &tree->root);
+		ext4_es_free_extent(inode, es);
+		es = es1;
+	}
+
+	return es;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct extent_status *es1;
+	struct rb_node *node;
+
+	node = rb_next(&es->rb_node);
+	if (!node)
+		return es;
+
+	es1 = rb_entry(node, struct extent_status, rb_node);
+	if (ext4_es_can_be_merged(es, es1)) {
+		es->es_len += es1->es_len;
+		if (ext4_es_is_referenced(es1))
+			ext4_es_set_referenced(es);
+		rb_erase(node, &tree->root);
+		ext4_es_free_extent(inode, es1);
+	}
+
+	return es;
+}
+
+#ifdef ES_AGGRESSIVE_TEST
+#include "ext4_extents.h"	/* Needed when ES_AGGRESSIVE_TEST is defined */
+
+static void ext4_es_insert_extent_ext_check(struct inode *inode,
+					    struct extent_status *es)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	ext4_fsblk_t ee_start;
+	unsigned short ee_len;
+	int depth, ee_status, es_status;
+
+	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
+	if (IS_ERR(path))
+		return;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+
+	if (ex) {
+
+		ee_block = le32_to_cpu(ex->ee_block);
+		ee_start = ext4_ext_pblock(ex);
+		ee_len = ext4_ext_get_actual_len(ex);
+
+		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
+		es_status = ext4_es_is_unwritten(es) ? 1 : 0;
+
+		/*
+		 * Make sure ex and es are not overlap when we try to insert
+		 * a delayed/hole extent.
+		 */
+		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
+			if (in_range(es->es_lblk, ee_block, ee_len)) {
+				pr_warn("ES insert assertion failed for "
+					"inode: %lu we can find an extent "
+					"at block [%d/%d/%llu/%c], but we "
+					"want to add a delayed/hole extent "
+					"[%d/%d/%llu/%x]\n",
+					inode->i_ino, ee_block, ee_len,
+					ee_start, ee_status ? 'u' : 'w',
+					es->es_lblk, es->es_len,
+					ext4_es_pblock(es), ext4_es_status(es));
+			}
+			goto out;
+		}
+
+		/*
+		 * We don't check ee_block == es->es_lblk, etc. because es
+		 * might be a part of whole extent, vice versa.
+		 */
+		if (es->es_lblk < ee_block ||
+		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
+			pr_warn("ES insert assertion failed for inode: %lu "
+				"ex_status [%d/%d/%llu/%c] != "
+				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+				ee_block, ee_len, ee_start,
+				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+				ext4_es_pblock(es), es_status ? 'u' : 'w');
+			goto out;
+		}
+
+		if (ee_status ^ es_status) {
+			pr_warn("ES insert assertion failed for inode: %lu "
+				"ex_status [%d/%d/%llu/%c] != "
+				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+				ee_block, ee_len, ee_start,
+				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+				ext4_es_pblock(es), es_status ? 'u' : 'w');
+		}
+	} else {
+		/*
+		 * We can't find an extent on disk.  So we need to make sure
+		 * that we don't want to add an written/unwritten extent.
+		 */
+		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
+			pr_warn("ES insert assertion failed for inode: %lu "
+				"can't find an extent at block %d but we want "
+				"to add a written/unwritten extent "
+				"[%d/%d/%llu/%x]\n", inode->i_ino,
+				es->es_lblk, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+		}
+	}
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+}
+
+static void ext4_es_insert_extent_ind_check(struct inode *inode,
+					    struct extent_status *es)
+{
+	struct ext4_map_blocks map;
+	int retval;
+
+	/*
+	 * Here we call ext4_ind_map_blocks to lookup a block mapping because
+	 * 'Indirect' structure is defined in indirect.c.  So we couldn't
+	 * access direct/indirect tree from outside.  It is too dirty to define
+	 * this function in indirect.c file.
+	 */
+
+	map.m_lblk = es->es_lblk;
+	map.m_len = es->es_len;
+
+	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
+	if (retval > 0) {
+		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
+			/*
+			 * We want to add a delayed/hole extent but this
+			 * block has been allocated.
+			 */
+			pr_warn("ES insert assertion failed for inode: %lu "
+				"We can find blocks but we want to add a "
+				"delayed/hole extent [%d/%d/%llu/%x]\n",
+				inode->i_ino, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+			return;
+		} else if (ext4_es_is_written(es)) {
+			if (retval != es->es_len) {
+				pr_warn("ES insert assertion failed for "
+					"inode: %lu retval %d != es_len %d\n",
+					inode->i_ino, retval, es->es_len);
+				return;
+			}
+			if (map.m_pblk != ext4_es_pblock(es)) {
+				pr_warn("ES insert assertion failed for "
+					"inode: %lu m_pblk %llu != "
+					"es_pblk %llu\n",
+					inode->i_ino, map.m_pblk,
+					ext4_es_pblock(es));
+				return;
+			}
+		} else {
+			/*
+			 * We don't need to check unwritten extent because
+			 * indirect-based file doesn't have it.
+			 */
+			BUG_ON(1);
+		}
+	} else if (retval == 0) {
+		if (ext4_es_is_written(es)) {
+			pr_warn("ES insert assertion failed for inode: %lu "
+				"We can't find the block but we want to add "
+				"a written extent [%d/%d/%llu/%x]\n",
+				inode->i_ino, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+			return;
+		}
+	}
+}
+
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+					       struct extent_status *es)
+{
+	/*
+	 * We don't need to worry about the race condition because
+	 * caller takes i_data_sem locking.
+	 */
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ext4_es_insert_extent_ext_check(inode, es);
+	else
+		ext4_es_insert_extent_ind_check(inode, es);
+}
+#else
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+					       struct extent_status *es)
+{
+}
+#endif
+
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct rb_node **p = &tree->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_status *es;
+
+	while (*p) {
+		parent = *p;
+		es = rb_entry(parent, struct extent_status, rb_node);
+
+		if (newes->es_lblk < es->es_lblk) {
+			if (ext4_es_can_be_merged(newes, es)) {
+				/*
+				 * Here we can modify es_lblk directly
+				 * because it isn't overlapped.
+				 */
+				es->es_lblk = newes->es_lblk;
+				es->es_len += newes->es_len;
+				if (ext4_es_is_written(es) ||
+				    ext4_es_is_unwritten(es))
+					ext4_es_store_pblock(es,
+							     newes->es_pblk);
+				es = ext4_es_try_to_merge_left(inode, es);
+				goto out;
+			}
+			p = &(*p)->rb_left;
+		} else if (newes->es_lblk > ext4_es_end(es)) {
+			if (ext4_es_can_be_merged(es, newes)) {
+				es->es_len += newes->es_len;
+				es = ext4_es_try_to_merge_right(inode, es);
+				goto out;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			BUG_ON(1);
+			return -EINVAL;
+		}
+	}
+
+	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
+				  newes->es_pblk);
+	if (!es)
+		return -ENOMEM;
+	rb_link_node(&es->rb_node, parent, p);
+	rb_insert_color(&es->rb_node, &tree->root);
+
+out:
+	tree->cache_es = es;
+	return 0;
+}
+
+/*
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+			  ext4_lblk_t len, ext4_fsblk_t pblk,
+			  unsigned int status)
+{
+	struct extent_status newes;
+	ext4_lblk_t end = lblk + len - 1;
+	int err = 0;
+
+	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
+		 lblk, len, pblk, status, inode->i_ino);
+
+	if (!len)
+		return 0;
+
+	BUG_ON(end < lblk);
+
+	if ((status & EXTENT_STATUS_DELAYED) &&
+	    (status & EXTENT_STATUS_WRITTEN)) {
+		ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
+				" delayed and written which can potentially "
+				" cause data loss.\n", lblk, len);
+		WARN_ON(1);
+	}
+
+	newes.es_lblk = lblk;
+	newes.es_len = len;
+	ext4_es_store_pblock_status(&newes, pblk, status);
+	trace_ext4_es_insert_extent(inode, &newes);
+
+	ext4_es_insert_extent_check(inode, &newes);
+
+	write_lock(&EXT4_I(inode)->i_es_lock);
+	err = __es_remove_extent(inode, lblk, end);
+	if (err != 0)
+		goto error;
+retry:
+	err = __es_insert_extent(inode, &newes);
+	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+					  128, EXT4_I(inode)))
+		goto retry;
+	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+		err = 0;
+
+error:
+	write_unlock(&EXT4_I(inode)->i_es_lock);
+
+	ext4_es_print_tree(inode);
+
+	return err;
+}
+
+/*
+ * ext4_es_cache_extent() inserts information into the extent status
+ * tree if and only if there isn't information about the range in
+ * question already.
+ */
+void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+			  ext4_lblk_t len, ext4_fsblk_t pblk,
+			  unsigned int status)
+{
+	struct extent_status *es;
+	struct extent_status newes;
+	ext4_lblk_t end = lblk + len - 1;
+
+	newes.es_lblk = lblk;
+	newes.es_len = len;
+	ext4_es_store_pblock_status(&newes, pblk, status);
+	trace_ext4_es_cache_extent(inode, &newes);
+
+	if (!len)
+		return;
+
+	BUG_ON(end < lblk);
+
+	write_lock(&EXT4_I(inode)->i_es_lock);
+
+	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+	if (!es || es->es_lblk > end)
+		__es_insert_extent(inode, &newes);
+	write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
+ *
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 on found, 0 on not
+ */
+int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+			  struct extent_status *es)
+{
+	struct ext4_es_tree *tree;
+	struct ext4_es_stats *stats;
+	struct extent_status *es1 = NULL;
+	struct rb_node *node;
+	int found = 0;
+
+	trace_ext4_es_lookup_extent_enter(inode, lblk);
+	es_debug("lookup extent in block %u\n", lblk);
+
+	tree = &EXT4_I(inode)->i_es_tree;
+	read_lock(&EXT4_I(inode)->i_es_lock);
+
+	/* find extent in cache firstly */
+	es->es_lblk = es->es_len = es->es_pblk = 0;
+	if (tree->cache_es) {
+		es1 = tree->cache_es;
+		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+			es_debug("%u cached by [%u/%u)\n",
+				 lblk, es1->es_lblk, es1->es_len);
+			found = 1;
+			goto out;
+		}
+	}
+
+	node = tree->root.rb_node;
+	while (node) {
+		es1 = rb_entry(node, struct extent_status, rb_node);
+		if (lblk < es1->es_lblk)
+			node = node->rb_left;
+		else if (lblk > ext4_es_end(es1))
+			node = node->rb_right;
+		else {
+			found = 1;
+			break;
+		}
+	}
+
+out:
+	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
+	if (found) {
+		BUG_ON(!es1);
+		es->es_lblk = es1->es_lblk;
+		es->es_len = es1->es_len;
+		es->es_pblk = es1->es_pblk;
+		if (!ext4_es_is_referenced(es))
+			ext4_es_set_referenced(es);
+		stats->es_stats_cache_hits++;
+	} else {
+		stats->es_stats_cache_misses++;
+	}
+
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+	trace_ext4_es_lookup_extent_exit(inode, es, found);
+	return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_lblk_t end)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct rb_node *node;
+	struct extent_status *es;
+	struct extent_status orig_es;
+	ext4_lblk_t len1, len2;
+	ext4_fsblk_t block;
+	int err;
+
+retry:
+	err = 0;
+	es = __es_tree_search(&tree->root, lblk);
+	if (!es)
+		goto out;
+	if (es->es_lblk > end)
+		goto out;
+
+	/* Simply invalidate cache_es. */
+	tree->cache_es = NULL;
+
+	orig_es.es_lblk = es->es_lblk;
+	orig_es.es_len = es->es_len;
+	orig_es.es_pblk = es->es_pblk;
+
+	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
+	if (len1 > 0)
+		es->es_len = len1;
+	if (len2 > 0) {
+		if (len1 > 0) {
+			struct extent_status newes;
+
+			newes.es_lblk = end + 1;
+			newes.es_len = len2;
+			block = 0x7FDEADBEEFULL;
+			if (ext4_es_is_written(&orig_es) ||
+			    ext4_es_is_unwritten(&orig_es))
+				block = ext4_es_pblock(&orig_es) +
+					orig_es.es_len - len2;
+			ext4_es_store_pblock_status(&newes, block,
+						    ext4_es_status(&orig_es));
+			err = __es_insert_extent(inode, &newes);
+			if (err) {
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
+				if ((err == -ENOMEM) &&
+				    __es_shrink(EXT4_SB(inode->i_sb),
+							128, EXT4_I(inode)))
+					goto retry;
+				goto out;
+			}
+		} else {
+			es->es_lblk = end + 1;
+			es->es_len = len2;
+			if (ext4_es_is_written(es) ||
+			    ext4_es_is_unwritten(es)) {
+				block = orig_es.es_pblk + orig_es.es_len - len2;
+				ext4_es_store_pblock(es, block);
+			}
+		}
+		goto out;
+	}
+
+	if (len1 > 0) {
+		node = rb_next(&es->rb_node);
+		if (node)
+			es = rb_entry(node, struct extent_status, rb_node);
+		else
+			es = NULL;
+	}
+
+	while (es && ext4_es_end(es) <= end) {
+		node = rb_next(&es->rb_node);
+		rb_erase(&es->rb_node, &tree->root);
+		ext4_es_free_extent(inode, es);
+		if (!node) {
+			es = NULL;
+			break;
+		}
+		es = rb_entry(node, struct extent_status, rb_node);
+	}
+
+	if (es && es->es_lblk < end + 1) {
+		ext4_lblk_t orig_len = es->es_len;
+
+		len1 = ext4_es_end(es) - end;
+		es->es_lblk = end + 1;
+		es->es_len = len1;
+		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
+			block = es->es_pblk + orig_len - len1;
+			ext4_es_store_pblock(es, block);
+		}
+	}
+
+out:
+	return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a extent status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			  ext4_lblk_t len)
+{
+	ext4_lblk_t end;
+	int err = 0;
+
+	trace_ext4_es_remove_extent(inode, lblk, len);
+	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+		 lblk, len, inode->i_ino);
+
+	if (!len)
+		return err;
+
+	end = lblk + len - 1;
+	BUG_ON(end < lblk);
+
+	/*
+	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
+	 * so that we are sure __es_shrink() is done with the inode before it
+	 * is reclaimed.
+	 */
+	write_lock(&EXT4_I(inode)->i_es_lock);
+	err = __es_remove_extent(inode, lblk, end);
+	write_unlock(&EXT4_I(inode)->i_es_lock);
+	ext4_es_print_tree(inode);
+	return err;
+}
+
+static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+		       struct ext4_inode_info *locked_ei)
+{
+	struct ext4_inode_info *ei;
+	struct ext4_es_stats *es_stats;
+	ktime_t start_time;
+	u64 scan_time;
+	int nr_to_walk;
+	int nr_shrunk = 0;
+	int retried = 0, nr_skipped = 0;
+
+	es_stats = &sbi->s_es_stats;
+	start_time = ktime_get();
+
+retry:
+	spin_lock(&sbi->s_es_lock);
+	nr_to_walk = sbi->s_es_nr_inode;
+	while (nr_to_walk-- > 0) {
+		if (list_empty(&sbi->s_es_list)) {
+			spin_unlock(&sbi->s_es_lock);
+			goto out;
+		}
+		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
+				      i_es_list);
+		/* Move the inode to the tail */
+		list_move_tail(&ei->i_es_list, &sbi->s_es_list);
+
+		/*
+		 * Normally we try hard to avoid shrinking precached inodes,
+		 * but we will as a last resort.
+		 */
+		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
+						EXT4_STATE_EXT_PRECACHED)) {
+			nr_skipped++;
+			continue;
+		}
+
+		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
+			nr_skipped++;
+			continue;
+		}
+		/*
+		 * Now we hold i_es_lock which protects us from inode reclaim
+		 * freeing inode under us
+		 */
+		spin_unlock(&sbi->s_es_lock);
+
+		nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
+		write_unlock(&ei->i_es_lock);
+
+		if (nr_to_scan <= 0)
+			goto out;
+		spin_lock(&sbi->s_es_lock);
+	}
+	spin_unlock(&sbi->s_es_lock);
+
+	/*
+	 * If we skipped any inodes, and we weren't able to make any
+	 * forward progress, try again to scan precached inodes.
+	 */
+	if ((nr_shrunk == 0) && nr_skipped && !retried) {
+		retried++;
+		goto retry;
+	}
+
+	if (locked_ei && nr_shrunk == 0)
+		nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
+
+out:
+	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+	if (likely(es_stats->es_stats_scan_time))
+		es_stats->es_stats_scan_time = (scan_time +
+				es_stats->es_stats_scan_time*3) / 4;
+	else
+		es_stats->es_stats_scan_time = scan_time;
+	if (scan_time > es_stats->es_stats_max_scan_time)
+		es_stats->es_stats_max_scan_time = scan_time;
+	if (likely(es_stats->es_stats_shrunk))
+		es_stats->es_stats_shrunk = (nr_shrunk +
+				es_stats->es_stats_shrunk*3) / 4;
+	else
+		es_stats->es_stats_shrunk = nr_shrunk;
+
+	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
+			     nr_skipped, retried);
+	return nr_shrunk;
+}
+
+static unsigned long ext4_es_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	unsigned long nr;
+	struct ext4_sb_info *sbi;
+
+	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
+	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
+	return nr;
+}
+
+static unsigned long ext4_es_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
+{
+	struct ext4_sb_info *sbi = container_of(shrink,
+					struct ext4_sb_info, s_es_shrinker);
+	int nr_to_scan = sc->nr_to_scan;
+	int ret, nr_shrunk;
+
+	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
+	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
+
+	if (!nr_to_scan)
+		return ret;
+
+	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
+
+	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
+	return nr_shrunk;
+}
+
+static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
+{
+	return *pos ? NULL : SEQ_START_TOKEN;
+}
+
+static void *
+ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return NULL;
+}
+
+static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+{
+	struct ext4_sb_info *sbi = seq->private;
+	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
+	struct ext4_inode_info *ei, *max = NULL;
+	unsigned int inode_cnt = 0;
+
+	if (v != SEQ_START_TOKEN)
+		return 0;
+
+	/* here we just find an inode that has the max nr. of objects */
+	spin_lock(&sbi->s_es_lock);
+	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
+		inode_cnt++;
+		if (max && max->i_es_all_nr < ei->i_es_all_nr)
+			max = ei;
+		else if (!max)
+			max = ei;
+	}
+	spin_unlock(&sbi->s_es_lock);
+
+	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
+		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
+		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
+	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
+		   es_stats->es_stats_cache_hits,
+		   es_stats->es_stats_cache_misses);
+	if (inode_cnt)
+		seq_printf(seq, "  %d inodes on list\n", inode_cnt);
+
+	seq_printf(seq, "average:\n  %llu us scan time\n",
+	    div_u64(es_stats->es_stats_scan_time, 1000));
+	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
+	if (inode_cnt)
+		seq_printf(seq,
+		    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
+		    "  %llu us max scan time\n",
+		    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
+		    div_u64(es_stats->es_stats_max_scan_time, 1000));
+
+	return 0;
+}
+
+static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
+	.start = ext4_es_seq_shrinker_info_start,
+	.next  = ext4_es_seq_shrinker_info_next,
+	.stop  = ext4_es_seq_shrinker_info_stop,
+	.show  = ext4_es_seq_shrinker_info_show,
+};
+
+static int
+ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = PDE_DATA(inode);
+	}
+
+	return ret;
+}
+
+static int
+ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+{
+	return seq_release(inode, file);
+}
+
+static const struct file_operations ext4_es_seq_shrinker_info_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ext4_es_seq_shrinker_info_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= ext4_es_seq_shrinker_info_release,
+};
+
+int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+{
+	int err;
+
+	/* Make sure we have enough bits for physical block number */
+	BUILD_BUG_ON(ES_SHIFT < 48);
+	INIT_LIST_HEAD(&sbi->s_es_list);
+	sbi->s_es_nr_inode = 0;
+	spin_lock_init(&sbi->s_es_lock);
+	sbi->s_es_stats.es_stats_shrunk = 0;
+	sbi->s_es_stats.es_stats_cache_hits = 0;
+	sbi->s_es_stats.es_stats_cache_misses = 0;
+	sbi->s_es_stats.es_stats_scan_time = 0;
+	sbi->s_es_stats.es_stats_max_scan_time = 0;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
+	if (err)
+		return err;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
+	if (err)
+		goto err1;
+
+	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
+	sbi->s_es_shrinker.count_objects = ext4_es_count;
+	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+	err = register_shrinker(&sbi->s_es_shrinker);
+	if (err)
+		goto err2;
+
+	if (sbi->s_proc)
+		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
+				 &ext4_es_seq_shrinker_info_fops, sbi);
+
+	return 0;
+
+err2:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
+err1:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+	return err;
+}
+
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+{
+	if (sbi->s_proc)
+		remove_proc_entry("es_shrinker_info", sbi->s_proc);
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
+	unregister_shrinker(&sbi->s_es_shrinker);
+}
+
+/*
+ * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
+ * most *nr_to_scan extents, update *nr_to_scan accordingly.
+ *
+ * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
+ * Increment *nr_shrunk by the number of reclaimed extents. Also update
+ * ei->i_es_shrink_lblk to where we should continue scanning.
+ */
+static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
+				 int *nr_to_scan, int *nr_shrunk)
+{
+	struct inode *inode = &ei->vfs_inode;
+	struct ext4_es_tree *tree = &ei->i_es_tree;
+	struct extent_status *es;
+	struct rb_node *node;
+
+	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
+	if (!es)
+		goto out_wrap;
+	node = &es->rb_node;
+	while (*nr_to_scan > 0) {
+		if (es->es_lblk > end) {
+			ei->i_es_shrink_lblk = end + 1;
+			return 0;
+		}
+
+		(*nr_to_scan)--;
+		node = rb_next(&es->rb_node);
+		/*
+		 * We can't reclaim delayed extent from status tree because
+		 * fiemap, bigallic, and seek_data/hole need to use it.
+		 */
+		if (ext4_es_is_delayed(es))
+			goto next;
+		if (ext4_es_is_referenced(es)) {
+			ext4_es_clear_referenced(es);
+			goto next;
+		}
+
+		rb_erase(&es->rb_node, &tree->root);
+		ext4_es_free_extent(inode, es);
+		(*nr_shrunk)++;
+next:
+		if (!node)
+			goto out_wrap;
+		es = rb_entry(node, struct extent_status, rb_node);
+	}
+	ei->i_es_shrink_lblk = es->es_lblk;
+	return 1;
+out_wrap:
+	ei->i_es_shrink_lblk = 0;
+	return 0;
+}
+
+static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
+{
+	struct inode *inode = &ei->vfs_inode;
+	int nr_shrunk = 0;
+	ext4_lblk_t start = ei->i_es_shrink_lblk;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	if (ei->i_es_shk_nr == 0)
+		return 0;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+	    __ratelimit(&_rs))
+		ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
+	if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
+	    start != 0)
+		es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
+
+	ei->i_es_tree.cache_es = NULL;
+	return nr_shrunk;
+}
diff --git a/kernel/fs/ext4/extents_status.h b/kernel/fs/ext4/extents_status.h
new file mode 100644
index 000000000..691b52613
--- /dev/null
+++ b/kernel/fs/ext4/extents_status.h
@@ -0,0 +1,175 @@
+/*
+ *  fs/ext4/extents_status.h
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ *	Allison Henderson <achender@linux.vnet.ibm.com>
+ *	Zheng Liu <wenqing.lz@taobao.com>
+ *
+ */
+
+#ifndef _EXT4_EXTENTS_STATUS_H
+#define _EXT4_EXTENTS_STATUS_H
+
+/*
+ * Turn on ES_DEBUG__ to get lots of info about extent status operations.
+ */
+#ifdef ES_DEBUG__
+#define es_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
+#else
+#define es_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/*
+ * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
+ * checked with old map_block's result.
+ */
+#define ES_AGGRESSIVE_TEST__
+
+/*
+ * These flags live in the high bits of extent_status.es_pblk
+ */
+enum {
+	ES_WRITTEN_B,
+	ES_UNWRITTEN_B,
+	ES_DELAYED_B,
+	ES_HOLE_B,
+	ES_REFERENCED_B,
+	ES_FLAGS
+};
+
+#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
+#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
+
+#define EXTENT_STATUS_WRITTEN	(1 << ES_WRITTEN_B)
+#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
+#define EXTENT_STATUS_DELAYED	(1 << ES_DELAYED_B)
+#define EXTENT_STATUS_HOLE	(1 << ES_HOLE_B)
+#define EXTENT_STATUS_REFERENCED	(1 << ES_REFERENCED_B)
+
+#define ES_TYPE_MASK	((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
+			  EXTENT_STATUS_UNWRITTEN | \
+			  EXTENT_STATUS_DELAYED | \
+			  EXTENT_STATUS_HOLE) << ES_SHIFT)
+
+struct ext4_sb_info;
+struct ext4_extent;
+
+struct extent_status {
+	struct rb_node rb_node;
+	ext4_lblk_t es_lblk;	/* first logical block extent covers */
+	ext4_lblk_t es_len;	/* length of extent in block */
+	ext4_fsblk_t es_pblk;	/* first physical block */
+};
+
+struct ext4_es_tree {
+	struct rb_root root;
+	struct extent_status *cache_es;	/* recently accessed extent */
+};
+
+struct ext4_es_stats {
+	unsigned long es_stats_shrunk;
+	unsigned long es_stats_cache_hits;
+	unsigned long es_stats_cache_misses;
+	u64 es_stats_scan_time;
+	u64 es_stats_max_scan_time;
+	struct percpu_counter es_stats_all_cnt;
+	struct percpu_counter es_stats_shk_cnt;
+};
+
+extern int __init ext4_init_es(void);
+extern void ext4_exit_es(void);
+extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t len, ext4_fsblk_t pblk,
+				 unsigned int status);
+extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t len, ext4_fsblk_t pblk,
+				 unsigned int status);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t len);
+extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+					ext4_lblk_t lblk, ext4_lblk_t end,
+					struct extent_status *es);
+extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+				 struct extent_status *es);
+
+static inline unsigned int ext4_es_status(struct extent_status *es)
+{
+	return es->es_pblk >> ES_SHIFT;
+}
+
+static inline unsigned int ext4_es_type(struct extent_status *es)
+{
+	return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
+}
+
+static inline int ext4_es_is_written(struct extent_status *es)
+{
+	return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0;
+}
+
+static inline int ext4_es_is_unwritten(struct extent_status *es)
+{
+	return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0;
+}
+
+static inline int ext4_es_is_delayed(struct extent_status *es)
+{
+	return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
+}
+
+static inline int ext4_es_is_hole(struct extent_status *es)
+{
+	return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
+}
+
+static inline void ext4_es_set_referenced(struct extent_status *es)
+{
+	es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
+}
+
+static inline void ext4_es_clear_referenced(struct extent_status *es)
+{
+	es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
+}
+
+static inline int ext4_es_is_referenced(struct extent_status *es)
+{
+	return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
+}
+
+static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+{
+	return es->es_pblk & ~ES_MASK;
+}
+
+static inline void ext4_es_store_pblock(struct extent_status *es,
+					ext4_fsblk_t pb)
+{
+	ext4_fsblk_t block;
+
+	block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
+	es->es_pblk = block;
+}
+
+static inline void ext4_es_store_status(struct extent_status *es,
+					unsigned int status)
+{
+	es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
+		      (es->es_pblk & ~ES_MASK);
+}
+
+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+					       ext4_fsblk_t pb,
+					       unsigned int status)
+{
+	es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
+		      (pb & ~ES_MASK);
+}
+
+extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+
+#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/kernel/fs/ext4/file.c b/kernel/fs/ext4/file.c
new file mode 100644
index 000000000..0613c256c
--- /dev/null
+++ b/kernel/fs/ext4/file.c
@@ -0,0 +1,651 @@
+/*
+ *  linux/fs/ext4/file.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/file.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4 fs regular file handling primitives
+ *
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ *	(jj@sunsite.ms.mff.cuni.cz)
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/quotaops.h>
+#include <linux/pagevec.h>
+#include <linux/uio.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Called when an inode is released. Note that this is different
+ * from ext4_file_open: open gets called at every open, but release
+ * gets called only when /all/ the files are closed.
+ */
+static int ext4_release_file(struct inode *inode, struct file *filp)
+{
+	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
+		ext4_alloc_da_blocks(inode);
+		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+	}
+	/* if we are the last writer on the inode, drop the block reservation */
+	if ((filp->f_mode & FMODE_WRITE) &&
+			(atomic_read(&inode->i_writecount) == 1) &&
+		        !EXT4_I(inode)->i_reserved_data_blocks)
+	{
+		down_write(&EXT4_I(inode)->i_data_sem);
+		ext4_discard_preallocations(inode);
+		up_write(&EXT4_I(inode)->i_data_sem);
+	}
+	if (is_dx(inode) && filp->private_data)
+		ext4_htree_free_dir_info(filp->private_data);
+
+	return 0;
+}
+
+static void ext4_unwritten_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = ext4_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
+}
+
+/*
+ * This tests whether the IO in question is block-aligned or not.
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
+ * are converted to written only after the IO is complete.  Until they are
+ * mapped, these blocks appear as holes, so dio_zero_block() will assume that
+ * it needs to zero out portions of the start and/or end block.  If 2 AIO
+ * threads are at work on the same unwritten block, they must be synchronized
+ * or one thread will zero the other's data, causing corruption.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+{
+	struct super_block *sb = inode->i_sb;
+	int blockmask = sb->s_blocksize - 1;
+
+	if (pos >= i_size_read(inode))
+		return 0;
+
+	if ((pos | iov_iter_alignment(from)) & blockmask)
+		return 1;
+
+	return 0;
+}
+
+static ssize_t
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct mutex *aio_mutex = NULL;
+	struct blk_plug plug;
+	int o_direct = iocb->ki_flags & IOCB_DIRECT;
+	int overwrite = 0;
+	ssize_t ret;
+
+	/*
+	 * Unaligned direct AIO must be serialized; see comment above
+	 * In the case of O_APPEND, assume that we must always serialize
+	 */
+	if (o_direct &&
+	    ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+	    !is_sync_kiocb(iocb) &&
+	    (iocb->ki_flags & IOCB_APPEND ||
+	     ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
+		aio_mutex = ext4_aio_mutex(inode);
+		mutex_lock(aio_mutex);
+		ext4_unwritten_wait(inode);
+	}
+
+	mutex_lock(&inode->i_mutex);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	/*
+	 * If we have encountered a bitmap-format file, the size limit
+	 * is smaller than s_maxbytes, which is for extent-mapped files.
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
+			ret = -EFBIG;
+			goto out;
+		}
+		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
+	}
+
+	iocb->private = &overwrite;
+	if (o_direct) {
+		size_t length = iov_iter_count(from);
+		loff_t pos = iocb->ki_pos;
+		blk_start_plug(&plug);
+
+		/* check whether we do a DIO overwrite or not */
+		if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+		    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+			struct ext4_map_blocks map;
+			unsigned int blkbits = inode->i_blkbits;
+			int err, len;
+
+			map.m_lblk = pos >> blkbits;
+			map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+				- map.m_lblk;
+			len = map.m_len;
+
+			err = ext4_map_blocks(NULL, inode, &map, 0);
+			/*
+			 * 'err==len' means that all of blocks has
+			 * been preallocated no matter they are
+			 * initialized or not.  For excluding
+			 * unwritten extents, we need to check
+			 * m_flags.  There are two conditions that
+			 * indicate for initialized extents.  1) If we
+			 * hit extent cache, EXT4_MAP_MAPPED flag is
+			 * returned; 2) If we do a real lookup,
+			 * non-flags are returned.  So we should check
+			 * these two conditions.
+			 */
+			if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+				overwrite = 1;
+		}
+	}
+
+	ret = __generic_file_write_iter(iocb, from);
+	mutex_unlock(&inode->i_mutex);
+
+	if (ret > 0) {
+		ssize_t err;
+
+		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	if (o_direct)
+		blk_finish_plug(&plug);
+
+	if (aio_mutex)
+		mutex_unlock(aio_mutex);
+	return ret;
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	if (aio_mutex)
+		mutex_unlock(aio_mutex);
+	return ret;
+}
+
+#ifdef CONFIG_FS_DAX
+static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_fault(vma, vmf, ext4_get_block);
+					/* Is this the right get_block? */
+}
+
+static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_mkwrite(vma, vmf, ext4_get_block);
+}
+
+static const struct vm_operations_struct ext4_dax_vm_ops = {
+	.fault		= ext4_dax_fault,
+	.page_mkwrite	= ext4_dax_mkwrite,
+	.pfn_mkwrite	= dax_pfn_mkwrite,
+};
+#else
+#define ext4_dax_vm_ops	ext4_file_vm_ops
+#endif
+
+static const struct vm_operations_struct ext4_file_vm_ops = {
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite   = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	if (ext4_encrypted_inode(inode)) {
+		int err = ext4_generate_encryption_key(inode);
+		if (err)
+			return 0;
+	}
+	file_accessed(file);
+	if (IS_DAX(file_inode(file))) {
+		vma->vm_ops = &ext4_dax_vm_ops;
+		vma->vm_flags |= VM_MIXEDMAP;
+	} else {
+		vma->vm_ops = &ext4_file_vm_ops;
+	}
+	return 0;
+}
+
+static int ext4_file_open(struct inode * inode, struct file * filp)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct vfsmount *mnt = filp->f_path.mnt;
+	struct path path;
+	char buf[64], *cp;
+	int ret;
+
+	if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
+		     !(sb->s_flags & MS_RDONLY))) {
+		sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+		/*
+		 * Sample where the filesystem has been mounted and
+		 * store it in the superblock for sysadmin convenience
+		 * when trying to sort through large numbers of block
+		 * devices or filesystem images.
+		 */
+		memset(buf, 0, sizeof(buf));
+		path.mnt = mnt;
+		path.dentry = mnt->mnt_root;
+		cp = d_path(&path, buf, sizeof(buf));
+		if (!IS_ERR(cp)) {
+			handle_t *handle;
+			int err;
+
+			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+			if (IS_ERR(handle))
+				return PTR_ERR(handle);
+			BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+			err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+			if (err) {
+				ext4_journal_stop(handle);
+				return err;
+			}
+			strlcpy(sbi->s_es->s_last_mounted, cp,
+				sizeof(sbi->s_es->s_last_mounted));
+			ext4_handle_dirty_super(handle, sb);
+			ext4_journal_stop(handle);
+		}
+	}
+	/*
+	 * Set up the jbd2_inode if we are opening the inode for
+	 * writing and the journal is present
+	 */
+	if (filp->f_mode & FMODE_WRITE) {
+		ret = ext4_inode_attach_jinode(inode);
+		if (ret < 0)
+			return ret;
+	}
+	ret = dquot_file_open(inode, filp);
+	if (!ret && ext4_encrypted_inode(inode)) {
+		ret = ext4_generate_encryption_key(inode);
+		if (ret)
+			ret = -EACCES;
+	}
+	return ret;
+}
+
+/*
+ * Here we use ext4_map_blocks() to get a block mapping for a extent-based
+ * file rather than ext4_ext_walk_space() because we can introduce
+ * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same
+ * function.  When extent status tree has been fully implemented, it will
+ * track all extent status for a file and we can directly use it to
+ * retrieve the offset for SEEK_DATA/SEEK_HOLE.
+ */
+
+/*
+ * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to
+ * lookup page cache to check whether or not there has some data between
+ * [startoff, endoff] because, if this range contains an unwritten extent,
+ * we determine this extent as a data or a hole according to whether the
+ * page cache has data or not.
+ */
+static int ext4_find_unwritten_pgoff(struct inode *inode,
+				     int whence,
+				     struct ext4_map_blocks *map,
+				     loff_t *offset)
+{
+	struct pagevec pvec;
+	unsigned int blkbits;
+	pgoff_t index;
+	pgoff_t end;
+	loff_t endoff;
+	loff_t startoff;
+	loff_t lastoff;
+	int found = 0;
+
+	blkbits = inode->i_sb->s_blocksize_bits;
+	startoff = *offset;
+	lastoff = startoff;
+	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+
+	index = startoff >> PAGE_CACHE_SHIFT;
+	end = endoff >> PAGE_CACHE_SHIFT;
+
+	pagevec_init(&pvec, 0);
+	do {
+		int i, num;
+		unsigned long nr_pages;
+
+		num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+					  (pgoff_t)num);
+		if (nr_pages == 0) {
+			if (whence == SEEK_DATA)
+				break;
+
+			BUG_ON(whence != SEEK_HOLE);
+			/*
+			 * If this is the first time to go into the loop and
+			 * offset is not beyond the end offset, it will be a
+			 * hole at this offset
+			 */
+			if (lastoff == startoff || lastoff < endoff)
+				found = 1;
+			break;
+		}
+
+		/*
+		 * If this is the first time to go into the loop and
+		 * offset is smaller than the first page offset, it will be a
+		 * hole at this offset.
+		 */
+		if (lastoff == startoff && whence == SEEK_HOLE &&
+		    lastoff < page_offset(pvec.pages[0])) {
+			found = 1;
+			break;
+		}
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			struct buffer_head *bh, *head;
+
+			/*
+			 * If the current offset is not beyond the end of given
+			 * range, it will be a hole.
+			 */
+			if (lastoff < endoff && whence == SEEK_HOLE &&
+			    page->index > end) {
+				found = 1;
+				*offset = lastoff;
+				goto out;
+			}
+
+			lock_page(page);
+
+			if (unlikely(page->mapping != inode->i_mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (!page_has_buffers(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (page_has_buffers(page)) {
+				lastoff = page_offset(page);
+				bh = head = page_buffers(page);
+				do {
+					if (buffer_uptodate(bh) ||
+					    buffer_unwritten(bh)) {
+						if (whence == SEEK_DATA)
+							found = 1;
+					} else {
+						if (whence == SEEK_HOLE)
+							found = 1;
+					}
+					if (found) {
+						*offset = max_t(loff_t,
+							startoff, lastoff);
+						unlock_page(page);
+						goto out;
+					}
+					lastoff += bh->b_size;
+					bh = bh->b_this_page;
+				} while (bh != head);
+			}
+
+			lastoff = page_offset(page) + PAGE_SIZE;
+			unlock_page(page);
+		}
+
+		/*
+		 * The no. of pages is less than our desired, that would be a
+		 * hole in there.
+		 */
+		if (nr_pages < num && whence == SEEK_HOLE) {
+			found = 1;
+			*offset = lastoff;
+			break;
+		}
+
+		index = pvec.pages[i - 1]->index + 1;
+		pagevec_release(&pvec);
+	} while (index <= end);
+
+out:
+	pagevec_release(&pvec);
+	return found;
+}
+
+/*
+ * ext4_seek_data() retrieves the offset for SEEK_DATA.
+ */
+static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ext4_map_blocks map;
+	struct extent_status es;
+	ext4_lblk_t start, last, end;
+	loff_t dataoff, isize;
+	int blkbits;
+	int ret = 0;
+
+	mutex_lock(&inode->i_mutex);
+
+	isize = i_size_read(inode);
+	if (offset >= isize) {
+		mutex_unlock(&inode->i_mutex);
+		return -ENXIO;
+	}
+
+	blkbits = inode->i_sb->s_blocksize_bits;
+	start = offset >> blkbits;
+	last = start;
+	end = isize >> blkbits;
+	dataoff = offset;
+
+	do {
+		map.m_lblk = last;
+		map.m_len = end - last + 1;
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+			if (last != start)
+				dataoff = (loff_t)last << blkbits;
+			break;
+		}
+
+		/*
+		 * If there is a delay extent at this offset,
+		 * it will be as a data.
+		 */
+		ext4_es_find_delayed_extent_range(inode, last, last, &es);
+		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+			if (last != start)
+				dataoff = (loff_t)last << blkbits;
+			break;
+		}
+
+		/*
+		 * If there is a unwritten extent at this offset,
+		 * it will be as a data or a hole according to page
+		 * cache that has data or not.
+		 */
+		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+			int unwritten;
+			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+							      &map, &dataoff);
+			if (unwritten)
+				break;
+		}
+
+		last++;
+		dataoff = (loff_t)last << blkbits;
+	} while (last <= end);
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (dataoff > isize)
+		return -ENXIO;
+
+	return vfs_setpos(file, dataoff, maxsize);
+}
+
+/*
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ */
+static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ext4_map_blocks map;
+	struct extent_status es;
+	ext4_lblk_t start, last, end;
+	loff_t holeoff, isize;
+	int blkbits;
+	int ret = 0;
+
+	mutex_lock(&inode->i_mutex);
+
+	isize = i_size_read(inode);
+	if (offset >= isize) {
+		mutex_unlock(&inode->i_mutex);
+		return -ENXIO;
+	}
+
+	blkbits = inode->i_sb->s_blocksize_bits;
+	start = offset >> blkbits;
+	last = start;
+	end = isize >> blkbits;
+	holeoff = offset;
+
+	do {
+		map.m_lblk = last;
+		map.m_len = end - last + 1;
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+			last += ret;
+			holeoff = (loff_t)last << blkbits;
+			continue;
+		}
+
+		/*
+		 * If there is a delay extent at this offset,
+		 * we will skip this extent.
+		 */
+		ext4_es_find_delayed_extent_range(inode, last, last, &es);
+		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+			last = es.es_lblk + es.es_len;
+			holeoff = (loff_t)last << blkbits;
+			continue;
+		}
+
+		/*
+		 * If there is a unwritten extent at this offset,
+		 * it will be as a data or a hole according to page
+		 * cache that has data or not.
+		 */
+		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+			int unwritten;
+			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+							      &map, &holeoff);
+			if (!unwritten) {
+				last += ret;
+				holeoff = (loff_t)last << blkbits;
+				continue;
+			}
+		}
+
+		/* find a hole */
+		break;
+	} while (last <= end);
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (holeoff > isize)
+		holeoff = isize;
+
+	return vfs_setpos(file, holeoff, maxsize);
+}
+
+/*
+ * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
+ * by calling generic_file_llseek_size() with the appropriate maxbytes
+ * value for each.
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	loff_t maxbytes;
+
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+	else
+		maxbytes = inode->i_sb->s_maxbytes;
+
+	switch (whence) {
+	case SEEK_SET:
+	case SEEK_CUR:
+	case SEEK_END:
+		return generic_file_llseek_size(file, offset, whence,
+						maxbytes, i_size_read(inode));
+	case SEEK_DATA:
+		return ext4_seek_data(file, offset, maxbytes);
+	case SEEK_HOLE:
+		return ext4_seek_hole(file, offset, maxbytes);
+	}
+
+	return -EINVAL;
+}
+
+const struct file_operations ext4_file_operations = {
+	.llseek		= ext4_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= ext4_file_write_iter,
+	.unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext4_compat_ioctl,
+#endif
+	.mmap		= ext4_file_mmap,
+	.open		= ext4_file_open,
+	.release	= ext4_release_file,
+	.fsync		= ext4_sync_file,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fallocate	= ext4_fallocate,
+};
+
+const struct inode_operations ext4_file_inode_operations = {
+	.setattr	= ext4_setattr,
+	.getattr	= ext4_getattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
+	.fiemap		= ext4_fiemap,
+};
+
diff --git a/kernel/fs/ext4/fsync.c b/kernel/fs/ext4/fsync.c
new file mode 100644
index 000000000..885025413
--- /dev/null
+++ b/kernel/fs/ext4/fsync.c
@@ -0,0 +1,150 @@
+/*
+ *  linux/fs/ext4/fsync.c
+ *
+ *  Copyright (C) 1993  Stephen Tweedie (sct@redhat.com)
+ *  from
+ *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
+ *                      Laboratoire MASI - Institut Blaise Pascal
+ *                      Universite Pierre et Marie Curie (Paris VI)
+ *  from
+ *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4fs fsync primitive
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ *  Removed unnecessary code duplication for little endian machines
+ *  and excessive __inline__s.
+ *        Andi Kleen, 1997
+ *
+ * Major simplications and cleanup - we only need to do the metadata, because
+ * we can depend on generic_block_fdatasync() to sync the data blocks.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file.  This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static int ext4_sync_parent(struct inode *inode)
+{
+	struct dentry *dentry = NULL;
+	struct inode *next;
+	int ret = 0;
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+		return 0;
+	inode = igrab(inode);
+	while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+		dentry = d_find_any_alias(inode);
+		if (!dentry)
+			break;
+		next = igrab(d_inode(dentry->d_parent));
+		dput(dentry);
+		if (!next)
+			break;
+		iput(inode);
+		inode = next;
+		ret = sync_mapping_buffers(inode->i_mapping);
+		if (ret)
+			break;
+		ret = sync_inode_metadata(inode, 1);
+		if (ret)
+			break;
+	}
+	iput(inode);
+	return ret;
+}
+
+/*
+ * akpm: A new design for ext4_sync_file().
+ *
+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+ * There cannot be a transaction open by this task.
+ * Another task could have dirtied this inode.  Its data can be in any
+ * state in the journalling system.
+ *
+ * What we do is just kick off a commit and wait on it.  This will snapshot the
+ * inode to disk.
+ */
+
+int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	int ret = 0, err;
+	tid_t commit_tid;
+	bool needs_barrier = false;
+
+	J_ASSERT(ext4_journal_current_handle() == NULL);
+
+	trace_ext4_sync_file_enter(file, datasync);
+
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			ret = -EROFS;
+		goto out;
+	}
+
+	if (!journal) {
+		ret = generic_file_fsync(file, start, end, datasync);
+		if (!ret && !hlist_empty(&inode->i_dentry))
+			ret = ext4_sync_parent(inode);
+		goto out;
+	}
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	/*
+	 * data=writeback,ordered:
+	 *  The caller's filemap_fdatawrite()/wait will sync the data.
+	 *  Metadata is in the journal, we wait for proper transaction to
+	 *  commit here.
+	 *
+	 * data=journal:
+	 *  filemap_fdatawrite won't do anything (the buffers are clean).
+	 *  ext4_force_commit will write the file data into the journal and
+	 *  will wait on that.
+	 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
+	 *  (they were dirtied by commit).  But that's OK - the blocks are
+	 *  safe in-journal, which is all fsync() needs to ensure.
+	 */
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = true;
+	ret = jbd2_complete_transaction(journal, commit_tid);
+	if (needs_barrier) {
+		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		if (!ret)
+			ret = err;
+	}
+out:
+	trace_ext4_sync_file_exit(inode, ret);
+	return ret;
+}
diff --git a/kernel/fs/ext4/hash.c b/kernel/fs/ext4/hash.c
new file mode 100644
index 000000000..e026aa941
--- /dev/null
+++ b/kernel/fs/ext4/hash.c
@@ -0,0 +1,207 @@
+/*
+ *  linux/fs/ext4/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This file is released under the GPL v2.
+ *
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ */
+
+#include <linux/fs.h>
+#include <linux/cryptohash.h>
+#include "ext4.h"
+
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+	__u32	sum = 0;
+	__u32	b0 = buf[0], b1 = buf[1];
+	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
+	int	n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while (--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+
+/* The old legacy hash */
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
+{
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const unsigned char *ucp = (const unsigned char *) name;
+
+	while (len--) {
+		hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const signed char *scp = (const signed char *) name;
+
+	while (len--) {
+		hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
+
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return hash0 << 1;
+}
+
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+	const signed char *scp = (const signed char *) msg;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = ((int) scp[i]) + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+	const unsigned char *ucp = (const unsigned char *) msg;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = ((int) ucp[i]) + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+/*
+ * Returns the hash of a filename.  If len is 0 and name is NULL, then
+ * this function can be used to test whether or not a hash version is
+ * supported.
+ *
+ * The seed is an 4 longword (32 bits) "secret" which can be used to
+ * uniquify a hash.  If the seed is all zero's, then some default seed
+ * may be used.
+ *
+ * A particular hash version specifies whether or not the seed is
+ * represented, and whether or not the returned hash is 32 bits or 64
+ * bits.  32 bit hashes will return 0 for the minor hash.
+ */
+int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
+{
+	__u32	hash;
+	__u32	minor_hash = 0;
+	const char	*p;
+	int		i;
+	__u32		in[8], buf[4];
+	void		(*str2hashbuf)(const char *, int, __u32 *, int) =
+				str2hashbuf_signed;
+
+	/* Initialize the default seed for the hash checksum functions */
+	buf[0] = 0x67452301;
+	buf[1] = 0xefcdab89;
+	buf[2] = 0x98badcfe;
+	buf[3] = 0x10325476;
+
+	/* Check to see if the seed is all zero's */
+	if (hinfo->seed) {
+		for (i = 0; i < 4; i++) {
+			if (hinfo->seed[i]) {
+				memcpy(buf, hinfo->seed, sizeof(buf));
+				break;
+			}
+		}
+	}
+
+	switch (hinfo->hash_version) {
+	case DX_HASH_LEGACY_UNSIGNED:
+		hash = dx_hack_hash_unsigned(name, len);
+		break;
+	case DX_HASH_LEGACY:
+		hash = dx_hack_hash_signed(name, len);
+		break;
+	case DX_HASH_HALF_MD4_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
+	case DX_HASH_HALF_MD4:
+		p = name;
+		while (len > 0) {
+			(*str2hashbuf)(p, len, in, 8);
+			half_md4_transform(buf, in);
+			len -= 32;
+			p += 32;
+		}
+		minor_hash = buf[2];
+		hash = buf[1];
+		break;
+	case DX_HASH_TEA_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
+	case DX_HASH_TEA:
+		p = name;
+		while (len > 0) {
+			(*str2hashbuf)(p, len, in, 4);
+			TEA_transform(buf, in);
+			len -= 16;
+			p += 16;
+		}
+		hash = buf[0];
+		minor_hash = buf[1];
+		break;
+	default:
+		hinfo->hash = 0;
+		return -1;
+	}
+	hash = hash & ~1;
+	if (hash == (EXT4_HTREE_EOF_32BIT << 1))
+		hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
+	hinfo->hash = hash;
+	hinfo->minor_hash = minor_hash;
+	return 0;
+}
diff --git a/kernel/fs/ext4/ialloc.c b/kernel/fs/ext4/ialloc.c
new file mode 100644
index 000000000..1eaa6cb96
--- /dev/null
+++ b/kernel/fs/ext4/ialloc.c
@@ -0,0 +1,1350 @@
+/*
+ *  linux/fs/ext4/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  BSD ufs-inspired inode and directory allocation by
+ *  Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <asm/byteorder.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * ialloc.c contains the inodes allocation and deallocation routines
+ */
+
+/*
+ * The free inodes are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.
+ */
+
+/*
+ * To avoid calling the atomic setbit hundreds or thousands of times, we only
+ * need to use it within a single byte (to ensure we get endianness right).
+ * We can use memset for the rest of the bitmap as there are no other users.
+ */
+void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+{
+	int i;
+
+	if (start_bit >= end_bit)
+		return;
+
+	ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
+	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
+		ext4_set_bit(i, bitmap);
+	if (i < end_bit)
+		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
+}
+
+/* Initializes an uninitialized inode bitmap */
+static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+				       struct buffer_head *bh,
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
+{
+	struct ext4_group_info *grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	J_ASSERT_BH(bh, buffer_locked(bh));
+
+	/* If checksum is bad mark all blocks and inodes use to prevent
+	 * allocation, essentially implementing a per-group read-only flag. */
+	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+		ext4_error(sb, "Checksum bad for group %u", block_group);
+		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
+		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, gdp);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
+		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+		return 0;
+	}
+
+	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
+	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+			bh->b_data);
+	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+				   EXT4_INODES_PER_GROUP(sb) / 8);
+	ext4_group_desc_csum_set(sb, block_group, gdp);
+
+	return EXT4_INODES_PER_GROUP(sb);
+}
+
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+		set_bitmap_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+/*
+ * Read the inode allocation bitmap for a given block_group, reading
+ * into the specified slot in the superblock's bitmap cache.
+ *
+ * Return buffer_head of bitmap on success or NULL.
+ */
+static struct buffer_head *
+ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh = NULL;
+	ext4_fsblk_t bitmap_blk;
+	struct ext4_group_info *grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return NULL;
+
+	bitmap_blk = ext4_inode_bitmap(sb, desc);
+	bh = sb_getblk(sb, bitmap_blk);
+	if (unlikely(!bh)) {
+		ext4_error(sb, "Cannot read inode bitmap - "
+			    "block_group = %u, inode_bitmap = %llu",
+			    block_group, bitmap_blk);
+		return NULL;
+	}
+	if (bitmap_uptodate(bh))
+		goto verify;
+
+	lock_buffer(bh);
+	if (bitmap_uptodate(bh)) {
+		unlock_buffer(bh);
+		goto verify;
+	}
+
+	ext4_lock_group(sb, block_group);
+	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+		ext4_init_inode_bitmap(sb, bh, block_group, desc);
+		set_bitmap_uptodate(bh);
+		set_buffer_uptodate(bh);
+		set_buffer_verified(bh);
+		ext4_unlock_group(sb, block_group);
+		unlock_buffer(bh);
+		return bh;
+	}
+	ext4_unlock_group(sb, block_group);
+
+	if (buffer_uptodate(bh)) {
+		/*
+		 * if not uninit if bh is uptodate,
+		 * bitmap is also uptodate
+		 */
+		set_bitmap_uptodate(bh);
+		unlock_buffer(bh);
+		goto verify;
+	}
+	/*
+	 * submit the buffer_head for reading
+	 */
+	trace_ext4_load_inode_bitmap(sb, block_group);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
+		put_bh(bh);
+		ext4_error(sb, "Cannot read inode bitmap - "
+			   "block_group = %u, inode_bitmap = %llu",
+			   block_group, bitmap_blk);
+		return NULL;
+	}
+
+verify:
+	ext4_lock_group(sb, block_group);
+	if (!buffer_verified(bh) &&
+	    !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+					   EXT4_INODES_PER_GROUP(sb) / 8)) {
+		ext4_unlock_group(sb, block_group);
+		put_bh(bh);
+		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+			   "inode_bitmap = %llu", block_group, bitmap_blk);
+		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, desc);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
+		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+		return NULL;
+	}
+	ext4_unlock_group(sb, block_group);
+	set_buffer_verified(bh);
+	return bh;
+}
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ *
+ * HOWEVER: we must make sure that we get no aliases,
+ * which means that we have to call "clear_inode()"
+ * _before_ we mark the inode not in use in the inode
+ * bitmaps. Otherwise a newly created file might use
+ * the same inode number (not actually the same pointer
+ * though), and then we'd have two inodes sharing the
+ * same inode number and space on the harddisk.
+ */
+void ext4_free_inode(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	int is_directory;
+	unsigned long ino;
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bh2;
+	ext4_group_t block_group;
+	unsigned long bit;
+	struct ext4_group_desc *gdp;
+	struct ext4_super_block *es;
+	struct ext4_sb_info *sbi;
+	int fatal = 0, err, count, cleared;
+	struct ext4_group_info *grp;
+
+	if (!sb) {
+		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+		       "nonexistent device\n", __func__, __LINE__);
+		return;
+	}
+	if (atomic_read(&inode->i_count) > 1) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+			 __func__, __LINE__, inode->i_ino,
+			 atomic_read(&inode->i_count));
+		return;
+	}
+	if (inode->i_nlink) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
+		return;
+	}
+	sbi = EXT4_SB(sb);
+
+	ino = inode->i_ino;
+	ext4_debug("freeing inode %lu\n", ino);
+	trace_ext4_free_inode(inode);
+
+	/*
+	 * Note: we must free any quota before locking the superblock,
+	 * as writing the quota to disk may need the lock as well.
+	 */
+	dquot_initialize(inode);
+	ext4_xattr_delete_inode(handle, inode);
+	dquot_free_inode(inode);
+	dquot_drop(inode);
+
+	is_directory = S_ISDIR(inode->i_mode);
+
+	/* Do this BEFORE marking the inode not in use or returning an error */
+	ext4_clear_inode(inode);
+
+	es = EXT4_SB(sb)->s_es;
+	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
+		goto error_return;
+	}
+	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
+	/* Don't bother if the inode bitmap is corrupt. */
+	grp = ext4_get_group_info(sb, block_group);
+	if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
+		goto error_return;
+
+	BUFFER_TRACE(bitmap_bh, "get_write_access");
+	fatal = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (fatal)
+		goto error_return;
+
+	fatal = -ESRCH;
+	gdp = ext4_get_group_desc(sb, block_group, &bh2);
+	if (gdp) {
+		BUFFER_TRACE(bh2, "get_write_access");
+		fatal = ext4_journal_get_write_access(handle, bh2);
+	}
+	ext4_lock_group(sb, block_group);
+	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
+	if (fatal || !cleared) {
+		ext4_unlock_group(sb, block_group);
+		goto out;
+	}
+
+	count = ext4_free_inodes_count(sb, gdp) + 1;
+	ext4_free_inodes_set(sb, gdp, count);
+	if (is_directory) {
+		count = ext4_used_dirs_count(sb, gdp) - 1;
+		ext4_used_dirs_set(sb, gdp, count);
+		percpu_counter_dec(&sbi->s_dirs_counter);
+	}
+	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+				   EXT4_INODES_PER_GROUP(sb) / 8);
+	ext4_group_desc_csum_set(sb, block_group, gdp);
+	ext4_unlock_group(sb, block_group);
+
+	percpu_counter_inc(&sbi->s_freeinodes_counter);
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		if (is_directory)
+			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+	}
+	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+	if (cleared) {
+		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+		if (!fatal)
+			fatal = err;
+	} else {
+		ext4_error(sb, "bit already cleared for inode %lu", ino);
+		if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, gdp);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
+		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+	}
+
+error_return:
+	brelse(bitmap_bh);
+	ext4_std_error(sb, fatal);
+}
+
+struct orlov_stats {
+	__u64 free_clusters;
+	__u32 free_inodes;
+	__u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg.  If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+			    int flex_size, struct orlov_stats *stats)
+{
+	struct ext4_group_desc *desc;
+	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
+
+	if (flex_size > 1) {
+		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
+		stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
+		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+		return;
+	}
+
+	desc = ext4_get_group_desc(sb, g, NULL);
+	if (desc) {
+		stats->free_inodes = ext4_free_inodes_count(sb, desc);
+		stats->free_clusters = ext4_free_group_clusters(sb, desc);
+		stats->used_dirs = ext4_used_dirs_count(sb, desc);
+	} else {
+		stats->free_inodes = 0;
+		stats->free_clusters = 0;
+		stats->used_dirs = 0;
+	}
+}
+
+/*
+ * Orlov's allocator for directories.
+ *
+ * We always try to spread first-level directories.
+ *
+ * If there are blockgroups with both free inodes and free blocks counts
+ * not worse than average we return one with smallest directory count.
+ * Otherwise we simply return a random group.
+ *
+ * For the rest rules look so:
+ *
+ * It's OK to put directory into a group unless
+ * it has too many directories already (max_dirs) or
+ * it has too few free inodes left (min_inodes) or
+ * it has too few free blocks left (min_blocks) or
+ * Parent's group is preferred, if it doesn't satisfy these
+ * conditions we search cyclically through the rest. If none
+ * of the groups look good we just look for a group with more
+ * free inodes than average (starting at parent's group).
+ */
+
+static int find_group_orlov(struct super_block *sb, struct inode *parent,
+			    ext4_group_t *group, umode_t mode,
+			    const struct qstr *qstr)
+{
+	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
+	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
+	unsigned int freei, avefreei, grp_free;
+	ext4_fsblk_t freeb, avefreec;
+	unsigned int ndirs;
+	int max_dirs, min_inodes;
+	ext4_grpblk_t min_clusters;
+	ext4_group_t i, grp, g, ngroups;
+	struct ext4_group_desc *desc;
+	struct orlov_stats stats;
+	int flex_size = ext4_flex_bg_size(sbi);
+	struct dx_hash_info hinfo;
+
+	ngroups = real_ngroups;
+	if (flex_size > 1) {
+		ngroups = (real_ngroups + flex_size - 1) >>
+			sbi->s_log_groups_per_flex;
+		parent_group >>= sbi->s_log_groups_per_flex;
+	}
+
+	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+	avefreei = freei / ngroups;
+	freeb = EXT4_C2B(sbi,
+		percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+	avefreec = freeb;
+	do_div(avefreec, ngroups);
+	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+
+	if (S_ISDIR(mode) &&
+	    ((parent == d_inode(sb->s_root)) ||
+	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
+		int best_ndir = inodes_per_group;
+		int ret = -1;
+
+		if (qstr) {
+			hinfo.hash_version = DX_HASH_HALF_MD4;
+			hinfo.seed = sbi->s_hash_seed;
+			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+			grp = hinfo.hash;
+		} else
+			grp = prandom_u32();
+		parent_group = (unsigned)grp % ngroups;
+		for (i = 0; i < ngroups; i++) {
+			g = (parent_group + i) % ngroups;
+			get_orlov_stats(sb, g, flex_size, &stats);
+			if (!stats.free_inodes)
+				continue;
+			if (stats.used_dirs >= best_ndir)
+				continue;
+			if (stats.free_inodes < avefreei)
+				continue;
+			if (stats.free_clusters < avefreec)
+				continue;
+			grp = g;
+			ret = 0;
+			best_ndir = stats.used_dirs;
+		}
+		if (ret)
+			goto fallback;
+	found_flex_bg:
+		if (flex_size == 1) {
+			*group = grp;
+			return 0;
+		}
+
+		/*
+		 * We pack inodes at the beginning of the flexgroup's
+		 * inode tables.  Block allocation decisions will do
+		 * something similar, although regular files will
+		 * start at 2nd block group of the flexgroup.  See
+		 * ext4_ext_find_goal() and ext4_find_near().
+		 */
+		grp *= flex_size;
+		for (i = 0; i < flex_size; i++) {
+			if (grp+i >= real_ngroups)
+				break;
+			desc = ext4_get_group_desc(sb, grp+i, NULL);
+			if (desc && ext4_free_inodes_count(sb, desc)) {
+				*group = grp+i;
+				return 0;
+			}
+		}
+		goto fallback;
+	}
+
+	max_dirs = ndirs / ngroups + inodes_per_group / 16;
+	min_inodes = avefreei - inodes_per_group*flex_size / 4;
+	if (min_inodes < 1)
+		min_inodes = 1;
+	min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
+
+	/*
+	 * Start looking in the flex group where we last allocated an
+	 * inode for this parent directory
+	 */
+	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+		parent_group = EXT4_I(parent)->i_last_alloc_group;
+		if (flex_size > 1)
+			parent_group >>= sbi->s_log_groups_per_flex;
+	}
+
+	for (i = 0; i < ngroups; i++) {
+		grp = (parent_group + i) % ngroups;
+		get_orlov_stats(sb, grp, flex_size, &stats);
+		if (stats.used_dirs >= max_dirs)
+			continue;
+		if (stats.free_inodes < min_inodes)
+			continue;
+		if (stats.free_clusters < min_clusters)
+			continue;
+		goto found_flex_bg;
+	}
+
+fallback:
+	ngroups = real_ngroups;
+	avefreei = freei / ngroups;
+fallback_retry:
+	parent_group = EXT4_I(parent)->i_block_group;
+	for (i = 0; i < ngroups; i++) {
+		grp = (parent_group + i) % ngroups;
+		desc = ext4_get_group_desc(sb, grp, NULL);
+		if (desc) {
+			grp_free = ext4_free_inodes_count(sb, desc);
+			if (grp_free && grp_free >= avefreei) {
+				*group = grp;
+				return 0;
+			}
+		}
+	}
+
+	if (avefreei) {
+		/*
+		 * The free-inodes counter is approximate, and for really small
+		 * filesystems the above test can fail to find any blockgroups
+		 */
+		avefreei = 0;
+		goto fallback_retry;
+	}
+
+	return -1;
+}
+
+static int find_group_other(struct super_block *sb, struct inode *parent,
+			    ext4_group_t *group, umode_t mode)
+{
+	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
+	struct ext4_group_desc *desc;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+	/*
+	 * Try to place the inode is the same flex group as its
+	 * parent.  If we can't find space, use the Orlov algorithm to
+	 * find another flex group, and store that information in the
+	 * parent directory's inode information so that use that flex
+	 * group for future allocations.
+	 */
+	if (flex_size > 1) {
+		int retry = 0;
+
+	try_again:
+		parent_group &= ~(flex_size-1);
+		last = parent_group + flex_size;
+		if (last > ngroups)
+			last = ngroups;
+		for  (i = parent_group; i < last; i++) {
+			desc = ext4_get_group_desc(sb, i, NULL);
+			if (desc && ext4_free_inodes_count(sb, desc)) {
+				*group = i;
+				return 0;
+			}
+		}
+		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+			retry = 1;
+			parent_group = EXT4_I(parent)->i_last_alloc_group;
+			goto try_again;
+		}
+		/*
+		 * If this didn't work, use the Orlov search algorithm
+		 * to find a new flex group; we pass in the mode to
+		 * avoid the topdir algorithms.
+		 */
+		*group = parent_group + flex_size;
+		if (*group > ngroups)
+			*group = 0;
+		return find_group_orlov(sb, parent, group, mode, NULL);
+	}
+
+	/*
+	 * Try to place the inode in its parent directory
+	 */
+	*group = parent_group;
+	desc = ext4_get_group_desc(sb, *group, NULL);
+	if (desc && ext4_free_inodes_count(sb, desc) &&
+	    ext4_free_group_clusters(sb, desc))
+		return 0;
+
+	/*
+	 * We're going to place this inode in a different blockgroup from its
+	 * parent.  We want to cause files in a common directory to all land in
+	 * the same blockgroup.  But we want files which are in a different
+	 * directory which shares a blockgroup with our parent to land in a
+	 * different blockgroup.
+	 *
+	 * So add our directory's i_ino into the starting point for the hash.
+	 */
+	*group = (*group + parent->i_ino) % ngroups;
+
+	/*
+	 * Use a quadratic hash to find a group with a free inode and some free
+	 * blocks.
+	 */
+	for (i = 1; i < ngroups; i <<= 1) {
+		*group += i;
+		if (*group >= ngroups)
+			*group -= ngroups;
+		desc = ext4_get_group_desc(sb, *group, NULL);
+		if (desc && ext4_free_inodes_count(sb, desc) &&
+		    ext4_free_group_clusters(sb, desc))
+			return 0;
+	}
+
+	/*
+	 * That failed: try linear search for a free inode, even if that group
+	 * has no free blocks.
+	 */
+	*group = parent_group;
+	for (i = 0; i < ngroups; i++) {
+		if (++*group >= ngroups)
+			*group = 0;
+		desc = ext4_get_group_desc(sb, *group, NULL);
+		if (desc && ext4_free_inodes_count(sb, desc))
+			return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * In no journal mode, if an inode has recently been deleted, we want
+ * to avoid reusing it until we're reasonably sure the inode table
+ * block has been written back to disk.  (Yes, these values are
+ * somewhat arbitrary...)
+ */
+#define RECENTCY_MIN	5
+#define RECENTCY_DIRTY	30
+
+static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
+{
+	struct ext4_group_desc	*gdp;
+	struct ext4_inode	*raw_inode;
+	struct buffer_head	*bh;
+	unsigned long		dtime, now;
+	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	int	offset, ret = 0, recentcy = RECENTCY_MIN;
+
+	gdp = ext4_get_group_desc(sb, group, NULL);
+	if (unlikely(!gdp))
+		return 0;
+
+	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+		       (ino / inodes_per_block));
+	if (unlikely(!bh) || !buffer_uptodate(bh))
+		/*
+		 * If the block is not in the buffer cache, then it
+		 * must have been written out.
+		 */
+		goto out;
+
+	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
+	raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+	dtime = le32_to_cpu(raw_inode->i_dtime);
+	now = get_seconds();
+	if (buffer_dirty(bh))
+		recentcy += RECENTCY_DIRTY;
+
+	if (dtime && (dtime < now) && (now < dtime + recentcy))
+		ret = 1;
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * There are two policies for allocating an inode.  If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+			       umode_t mode, const struct qstr *qstr,
+			       __u32 goal, uid_t *owner, int handle_type,
+			       unsigned int line_no, int nblocks)
+{
+	struct super_block *sb;
+	struct buffer_head *inode_bitmap_bh = NULL;
+	struct buffer_head *group_desc_bh;
+	ext4_group_t ngroups, group = 0;
+	unsigned long ino = 0;
+	struct inode *inode;
+	struct ext4_group_desc *gdp = NULL;
+	struct ext4_inode_info *ei;
+	struct ext4_sb_info *sbi;
+	int ret2, err = 0;
+	struct inode *ret;
+	ext4_group_t i;
+	ext4_group_t flex_group;
+	struct ext4_group_info *grp;
+
+	/* Cannot create files in a deleted directory */
+	if (!dir || !dir->i_nlink)
+		return ERR_PTR(-EPERM);
+
+	sb = dir->i_sb;
+	ngroups = ext4_get_groups_count(sb);
+	trace_ext4_request_inode(dir, mode);
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	ei = EXT4_I(inode);
+	sbi = EXT4_SB(sb);
+
+	/*
+	 * Initalize owners and quota early so that we don't have to account
+	 * for quota initialization worst case in standard inode creating
+	 * transaction
+	 */
+	if (owner) {
+		inode->i_mode = mode;
+		i_uid_write(inode, owner[0]);
+		i_gid_write(inode, owner[1]);
+	} else if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
+	} else
+		inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
+
+	if (!goal)
+		goal = sbi->s_inode_goal;
+
+	if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
+		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
+		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+		ret2 = 0;
+		goto got_group;
+	}
+
+	if (S_ISDIR(mode))
+		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+	else
+		ret2 = find_group_other(sb, dir, &group, mode);
+
+got_group:
+	EXT4_I(dir)->i_last_alloc_group = group;
+	err = -ENOSPC;
+	if (ret2 == -1)
+		goto out;
+
+	/*
+	 * Normally we will only go through one pass of this loop,
+	 * unless we get unlucky and it turns out the group we selected
+	 * had its last inode grabbed by someone else.
+	 */
+	for (i = 0; i < ngroups; i++, ino = 0) {
+		err = -EIO;
+
+		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+		if (!gdp)
+			goto out;
+
+		/*
+		 * Check free inodes count before loading bitmap.
+		 */
+		if (ext4_free_inodes_count(sb, gdp) == 0) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
+		}
+
+		grp = ext4_get_group_info(sb, group);
+		/* Skip groups with already-known suspicious inode tables */
+		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
+		}
+
+		brelse(inode_bitmap_bh);
+		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+		/* Skip groups with suspicious inode tables */
+		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
+		}
+
+repeat_in_this_group:
+		ino = ext4_find_next_zero_bit((unsigned long *)
+					      inode_bitmap_bh->b_data,
+					      EXT4_INODES_PER_GROUP(sb), ino);
+		if (ino >= EXT4_INODES_PER_GROUP(sb))
+			goto next_group;
+		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+			ext4_error(sb, "reserved inode found cleared - "
+				   "inode=%lu", ino + 1);
+			continue;
+		}
+		if ((EXT4_SB(sb)->s_journal == NULL) &&
+		    recently_deleted(sb, group, ino)) {
+			ino++;
+			goto next_inode;
+		}
+		if (!handle) {
+			BUG_ON(nblocks <= 0);
+			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
+							 handle_type, nblocks,
+							 0);
+			if (IS_ERR(handle)) {
+				err = PTR_ERR(handle);
+				ext4_std_error(sb, err);
+				goto out;
+			}
+		}
+		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
+		ext4_lock_group(sb, group);
+		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+		ext4_unlock_group(sb, group);
+		ino++;		/* the inode bitmap is zero-based */
+		if (!ret2)
+			goto got; /* we grabbed the inode! */
+next_inode:
+		if (ino < EXT4_INODES_PER_GROUP(sb))
+			goto repeat_in_this_group;
+next_group:
+		if (++group == ngroups)
+			group = 0;
+	}
+	err = -ENOSPC;
+	goto out;
+
+got:
+	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, group_desc_bh);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
+
+	/* We may have to initialize the block bitmap if it isn't already */
+	if (ext4_has_group_desc_csum(sb) &&
+	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		struct buffer_head *block_bitmap_bh;
+
+		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+		if (!block_bitmap_bh) {
+			err = -EIO;
+			goto out;
+		}
+		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
+		if (err) {
+			brelse(block_bitmap_bh);
+			ext4_std_error(sb, err);
+			goto out;
+		}
+
+		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+		err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
+
+		/* recheck and clear flag under lock if we still need to */
+		ext4_lock_group(sb, group);
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+			ext4_free_group_clusters_set(sb, gdp,
+				ext4_free_clusters_after_init(sb, group, gdp));
+			ext4_block_bitmap_csum_set(sb, group, gdp,
+						   block_bitmap_bh);
+			ext4_group_desc_csum_set(sb, group, gdp);
+		}
+		ext4_unlock_group(sb, group);
+		brelse(block_bitmap_bh);
+
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
+	}
+
+	/* Update the relevant bg descriptor fields */
+	if (ext4_has_group_desc_csum(sb)) {
+		int free;
+		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+		ext4_lock_group(sb, group); /* while we modify the bg desc */
+		free = EXT4_INODES_PER_GROUP(sb) -
+			ext4_itable_unused_count(sb, gdp);
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+			free = 0;
+		}
+		/*
+		 * Check the relative inode number against the last used
+		 * relative inode number in this group. if it is greater
+		 * we need to update the bg_itable_unused count
+		 */
+		if (ino > free)
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
+		up_read(&grp->alloc_sem);
+	} else {
+		ext4_lock_group(sb, group);
+	}
+
+	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+	if (S_ISDIR(mode)) {
+		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+		if (sbi->s_log_groups_per_flex) {
+			ext4_group_t f = ext4_flex_group(sbi, group);
+
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+		}
+	}
+	if (ext4_has_group_desc_csum(sb)) {
+		ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+					   EXT4_INODES_PER_GROUP(sb) / 8);
+		ext4_group_desc_csum_set(sb, group, gdp);
+	}
+	ext4_unlock_group(sb, group);
+
+	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
+
+	percpu_counter_dec(&sbi->s_freeinodes_counter);
+	if (S_ISDIR(mode))
+		percpu_counter_inc(&sbi->s_dirs_counter);
+
+	if (sbi->s_log_groups_per_flex) {
+		flex_group = ext4_flex_group(sbi, group);
+		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
+	}
+
+	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
+	/* This is the optimal IO size (for stat), not the fs block size */
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
+						       ext4_current_time(inode);
+
+	memset(ei->i_data, 0, sizeof(ei->i_data));
+	ei->i_dir_start_lookup = 0;
+	ei->i_disksize = 0;
+
+	/* Don't inherit extent flag from directory, amongst others. */
+	ei->i_flags =
+		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
+	ei->i_file_acl = 0;
+	ei->i_dtime = 0;
+	ei->i_block_group = group;
+	ei->i_last_alloc_group = ~0;
+
+	/* If the directory encrypted, then we should encrypt the inode. */
+	if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) &&
+	    (ext4_encrypted_inode(dir) ||
+	     DUMMY_ENCRYPTION_ENABLED(sbi)))
+		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+
+	ext4_set_inode_flags(inode);
+	if (IS_DIRSYNC(inode))
+		ext4_handle_sync(handle);
+	if (insert_inode_locked(inode) < 0) {
+		/*
+		 * Likely a bitmap corruption causing inode to be allocated
+		 * twice.
+		 */
+		err = -EIO;
+		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+			   inode->i_ino);
+		goto out;
+	}
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+
+	/* Precompute checksum seed for inode metadata */
+	if (ext4_has_metadata_csum(sb)) {
+		__u32 csum;
+		__le32 inum = cpu_to_le32(inode->i_ino);
+		__le32 gen = cpu_to_le32(inode->i_generation);
+		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+				   sizeof(inum));
+		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+					      sizeof(gen));
+	}
+
+	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+	ext4_set_inode_state(inode, EXT4_STATE_NEW);
+
+	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) &&
+	    (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) {
+		ei->i_inline_off = 0;
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+			EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+			ext4_set_inode_state(inode,
+			EXT4_STATE_MAY_INLINE_DATA);
+	} else {
+		/* Inline data and encryption are incompatible
+		 * We turn off inline data since encryption is enabled */
+		ei->i_inline_off = 1;
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+			EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+			ext4_clear_inode_state(inode,
+			EXT4_STATE_MAY_INLINE_DATA);
+	}
+#else
+	ei->i_inline_off = 0;
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+#endif
+	ret = inode;
+	err = dquot_alloc_inode(inode);
+	if (err)
+		goto fail_drop;
+
+	err = ext4_init_acl(handle, inode, dir);
+	if (err)
+		goto fail_free_drop;
+
+	err = ext4_init_security(handle, inode, dir, qstr);
+	if (err)
+		goto fail_free_drop;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+		/* set extent flag only for directory, file and normal symlink*/
+		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
+			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+			ext4_ext_tree_init(handle, inode);
+		}
+	}
+
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto fail_free_drop;
+	}
+
+	ext4_debug("allocating inode %lu\n", inode->i_ino);
+	trace_ext4_allocate_inode(inode, dir, mode);
+	brelse(inode_bitmap_bh);
+	return ret;
+
+fail_free_drop:
+	dquot_free_inode(inode);
+fail_drop:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+out:
+	dquot_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
+	iput(inode);
+	brelse(inode_bitmap_bh);
+	return ERR_PTR(err);
+}
+
+/* Verify that we are loading a valid orphan from disk */
+struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
+{
+	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+	ext4_group_t block_group;
+	int bit;
+	struct buffer_head *bitmap_bh;
+	struct inode *inode = NULL;
+	long err = -EIO;
+
+	/* Error cases - e2fsck has already cleaned up for us */
+	if (ino > max_ino) {
+		ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
+		goto error;
+	}
+
+	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
+	if (!bitmap_bh) {
+		ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
+		goto error;
+	}
+
+	/* Having the inode bit set should be a 100% indicator that this
+	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
+	 * inodes that were being truncated, so we can't check i_nlink==0.
+	 */
+	if (!ext4_test_bit(bit, bitmap_bh->b_data))
+		goto bad_orphan;
+
+	inode = ext4_iget(sb, ino);
+	if (IS_ERR(inode))
+		goto iget_failed;
+
+	/*
+	 * If the orphans has i_nlinks > 0 then it should be able to be
+	 * truncated, otherwise it won't be removed from the orphan list
+	 * during processing and an infinite loop will result.
+	 */
+	if (inode->i_nlink && !ext4_can_truncate(inode))
+		goto bad_orphan;
+
+	if (NEXT_ORPHAN(inode) > max_ino)
+		goto bad_orphan;
+	brelse(bitmap_bh);
+	return inode;
+
+iget_failed:
+	err = PTR_ERR(inode);
+	inode = NULL;
+bad_orphan:
+	ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
+	printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+	       bit, (unsigned long long)bitmap_bh->b_blocknr,
+	       ext4_test_bit(bit, bitmap_bh->b_data));
+	printk(KERN_WARNING "inode=%p\n", inode);
+	if (inode) {
+		printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
+		       is_bad_inode(inode));
+		printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
+		       NEXT_ORPHAN(inode));
+		printk(KERN_WARNING "max_ino=%lu\n", max_ino);
+		printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
+		/* Avoid freeing blocks if we got a bad deleted inode */
+		if (inode->i_nlink == 0)
+			inode->i_blocks = 0;
+		iput(inode);
+	}
+	brelse(bitmap_bh);
+error:
+	return ERR_PTR(err);
+}
+
+unsigned long ext4_count_free_inodes(struct super_block *sb)
+{
+	unsigned long desc_count;
+	struct ext4_group_desc *gdp;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+#ifdef EXT4FS_DEBUG
+	struct ext4_super_block *es;
+	unsigned long bitmap_count, x;
+	struct buffer_head *bitmap_bh = NULL;
+
+	es = EXT4_SB(sb)->s_es;
+	desc_count = 0;
+	bitmap_count = 0;
+	gdp = NULL;
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += ext4_free_inodes_count(sb, gdp);
+		brelse(bitmap_bh);
+		bitmap_bh = ext4_read_inode_bitmap(sb, i);
+		if (!bitmap_bh)
+			continue;
+
+		x = ext4_count_free(bitmap_bh->b_data,
+				    EXT4_INODES_PER_GROUP(sb) / 8);
+		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
+			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
+		bitmap_count += x;
+	}
+	brelse(bitmap_bh);
+	printk(KERN_DEBUG "ext4_count_free_inodes: "
+	       "stored = %u, computed = %lu, %lu\n",
+	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+	return desc_count;
+#else
+	desc_count = 0;
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += ext4_free_inodes_count(sb, gdp);
+		cond_resched();
+	}
+	return desc_count;
+#endif
+}
+
+/* Called at mount-time, super-block is locked */
+unsigned long ext4_count_dirs(struct super_block * sb)
+{
+	unsigned long count = 0;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+	for (i = 0; i < ngroups; i++) {
+		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		count += ext4_used_dirs_count(sb, gdp);
+	}
+	return count;
+}
+
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_new_inode() until we are finished.
+ */
+int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+				 int barrier)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp = NULL;
+	struct buffer_head *group_desc_bh;
+	handle_t *handle;
+	ext4_fsblk_t blk;
+	int num, ret = 0, used_blks = 0;
+
+	/* This should not happen, but just to be sure check this */
+	if (sb->s_flags & MS_RDONLY) {
+		ret = 1;
+		goto out;
+	}
+
+	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+	if (!gdp)
+		goto out;
+
+	/*
+	 * We do not need to lock this, because we are the only one
+	 * handling this flag.
+	 */
+	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+		goto out;
+
+	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	down_write(&grp->alloc_sem);
+	/*
+	 * If inode bitmap was already initialized there may be some
+	 * used inodes so we need to skip blocks with used inodes in
+	 * inode table.
+	 */
+	if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+		used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+			    ext4_itable_unused_count(sb, gdp)),
+			    sbi->s_inodes_per_block);
+
+	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+		ext4_error(sb, "Something is wrong with group %u: "
+			   "used itable blocks: %d; "
+			   "itable unused count: %u",
+			   group, used_blks,
+			   ext4_itable_unused_count(sb, gdp));
+		ret = 1;
+		goto err_out;
+	}
+
+	blk = ext4_inode_table(sb, gdp) + used_blks;
+	num = sbi->s_itb_per_group - used_blks;
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	ret = ext4_journal_get_write_access(handle,
+					    group_desc_bh);
+	if (ret)
+		goto err_out;
+
+	/*
+	 * Skip zeroout if the inode table is full. But we set the ZEROED
+	 * flag anyway, because obviously, when it is full it does not need
+	 * further zeroing.
+	 */
+	if (unlikely(num == 0))
+		goto skip_zeroout;
+
+	ext4_debug("going to zero out inode table in group %d\n",
+		   group);
+	ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+	if (ret < 0)
+		goto err_out;
+	if (barrier)
+		blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+
+skip_zeroout:
+	ext4_lock_group(sb, group);
+	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+	ext4_group_desc_csum_set(sb, group, gdp);
+	ext4_unlock_group(sb, group);
+
+	BUFFER_TRACE(group_desc_bh,
+		     "call ext4_handle_dirty_metadata");
+	ret = ext4_handle_dirty_metadata(handle, NULL,
+					 group_desc_bh);
+
+err_out:
+	up_write(&grp->alloc_sem);
+	ext4_journal_stop(handle);
+out:
+	return ret;
+}
diff --git a/kernel/fs/ext4/indirect.c b/kernel/fs/ext4/indirect.c
new file mode 100644
index 000000000..958824019
--- /dev/null
+++ b/kernel/fs/ext4/indirect.c
@@ -0,0 +1,1558 @@
+/*
+ *  linux/fs/ext4/indirect.c
+ *
+ *  from
+ *
+ *  linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Goal-directed block allocation by Stephen Tweedie
+ *	(sct@redhat.com), 1993, 1998
+ */
+
+#include "ext4_jbd2.h"
+#include "truncate.h"
+#include <linux/uio.h>
+
+#include <trace/events/ext4.h>
+
+typedef struct {
+	__le32	*p;
+	__le32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+	p->key = *(p->p = v);
+	p->bh = bh;
+}
+
+/**
+ *	ext4_block_to_path - parse the block number into array of offsets
+ *	@inode: inode in question (we are only interested in its superblock)
+ *	@i_block: block number to be parsed
+ *	@offsets: array to store the offsets in
+ *	@boundary: set this non-zero if the referred-to block is likely to be
+ *	       followed (on disk) by an indirect block.
+ *
+ *	To store the locations of file's data ext4 uses a data structure common
+ *	for UNIX filesystems - tree of pointers anchored in the inode, with
+ *	data blocks at leaves and indirect blocks in intermediate nodes.
+ *	This function translates the block number into path in that tree -
+ *	return value is the path length and @offsets[n] is the offset of
+ *	pointer to (n+1)th node in the nth one. If @block is out of range
+ *	(negative or too large) warning is printed and zero returned.
+ *
+ *	Note: function doesn't find node addresses, so no IO is needed. All
+ *	we need to know is the capacity of indirect blocks (taken from the
+ *	inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext4_block_to_path(struct inode *inode,
+			      ext4_lblk_t i_block,
+			      ext4_lblk_t offsets[4], int *boundary)
+{
+	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	const long direct_blocks = EXT4_NDIR_BLOCKS,
+		indirect_blocks = ptrs,
+		double_blocks = (1 << (ptrs_bits * 2));
+	int n = 0;
+	int final = 0;
+
+	if (i_block < direct_blocks) {
+		offsets[n++] = i_block;
+		final = direct_blocks;
+	} else if ((i_block -= direct_blocks) < indirect_blocks) {
+		offsets[n++] = EXT4_IND_BLOCK;
+		offsets[n++] = i_block;
+		final = ptrs;
+	} else if ((i_block -= indirect_blocks) < double_blocks) {
+		offsets[n++] = EXT4_DIND_BLOCK;
+		offsets[n++] = i_block >> ptrs_bits;
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+		offsets[n++] = EXT4_TIND_BLOCK;
+		offsets[n++] = i_block >> (ptrs_bits * 2);
+		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else {
+		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
+			     i_block + direct_blocks +
+			     indirect_blocks + double_blocks, inode->i_ino);
+	}
+	if (boundary)
+		*boundary = final - 1 - (i_block & (ptrs - 1));
+	return n;
+}
+
+/**
+ *	ext4_get_branch - read the chain of indirect blocks leading to data
+ *	@inode: inode in question
+ *	@depth: depth of the chain (1 - direct pointer, etc.)
+ *	@offsets: offsets of pointers in inode/indirect blocks
+ *	@chain: place to store the result
+ *	@err: here we store the error value
+ *
+ *	Function fills the array of triples <key, p, bh> and returns %NULL
+ *	if everything went OK or the pointer to the last filled triple
+ *	(incomplete one) otherwise. Upon the return chain[i].key contains
+ *	the number of (i+1)-th block in the chain (as it is stored in memory,
+ *	i.e. little-endian 32-bit), chain[i].p contains the address of that
+ *	number (it points into struct inode for i==0 and into the bh->b_data
+ *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ *	block for i>0 and NULL for i==0. In other words, it holds the block
+ *	numbers of the chain, addresses they were taken from (and where we can
+ *	verify that chain did not change) and buffer_heads hosting these
+ *	numbers.
+ *
+ *	Function stops when it stumbles upon zero pointer (absent block)
+ *		(pointer to last triple returned, *@err == 0)
+ *	or when it gets an IO error reading an indirect block
+ *		(ditto, *@err == -EIO)
+ *	or when it reads all @depth-1 indirect blocks successfully and finds
+ *	the whole chain, all way to the data (returns %NULL, *err == 0).
+ *
+ *      Need to be called with
+ *      down_read(&EXT4_I(inode)->i_data_sem)
+ */
+static Indirect *ext4_get_branch(struct inode *inode, int depth,
+				 ext4_lblk_t  *offsets,
+				 Indirect chain[4], int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	Indirect *p = chain;
+	struct buffer_head *bh;
+	int ret = -EIO;
+
+	*err = 0;
+	/* i_data is not going away, no lock needed */
+	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
+	if (!p->key)
+		goto no_block;
+	while (--depth) {
+		bh = sb_getblk(sb, le32_to_cpu(p->key));
+		if (unlikely(!bh)) {
+			ret = -ENOMEM;
+			goto failure;
+		}
+
+		if (!bh_uptodate_or_lock(bh)) {
+			if (bh_submit_read(bh) < 0) {
+				put_bh(bh);
+				goto failure;
+			}
+			/* validate block references */
+			if (ext4_check_indirect_blockref(inode, bh)) {
+				put_bh(bh);
+				goto failure;
+			}
+		}
+
+		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
+		/* Reader: end */
+		if (!p->key)
+			goto no_block;
+	}
+	return NULL;
+
+failure:
+	*err = ret;
+no_block:
+	return p;
+}
+
+/**
+ *	ext4_find_near - find a place for allocation with sufficient locality
+ *	@inode: owner
+ *	@ind: descriptor of indirect block.
+ *
+ *	This function returns the preferred place for block allocation.
+ *	It is used when heuristic for sequential allocation fails.
+ *	Rules are:
+ *	  + if there is a block to the left of our position - allocate near it.
+ *	  + if pointer will live in indirect block - allocate near that block.
+ *	  + if pointer will live in inode - allocate in the same
+ *	    cylinder group.
+ *
+ * In the latter case we colour the starting block by the callers PID to
+ * prevent it from clashing with concurrent allocations for a different inode
+ * in the same block group.   The PID is used here so that functionally related
+ * files will be close-by on-disk.
+ *
+ *	Caller must make sure that @ind is valid and will stay that way.
+ */
+static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
+	__le32 *p;
+
+	/* Try to find previous block */
+	for (p = ind->p - 1; p >= start; p--) {
+		if (*p)
+			return le32_to_cpu(*p);
+	}
+
+	/* No such thing, so let's try location of indirect block */
+	if (ind->bh)
+		return ind->bh->b_blocknr;
+
+	/*
+	 * It is going to be referred to from the inode itself? OK, just put it
+	 * into the same cylinder group then.
+	 */
+	return ext4_inode_to_goal_block(inode);
+}
+
+/**
+ *	ext4_find_goal - find a preferred place for allocation.
+ *	@inode: owner
+ *	@block:  block we want
+ *	@partial: pointer to the last triple within a chain
+ *
+ *	Normally this function find the preferred place for block allocation,
+ *	returns it.
+ *	Because this is only used for non-extent files, we limit the block nr
+ *	to 32 bits.
+ */
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
+				   Indirect *partial)
+{
+	ext4_fsblk_t goal;
+
+	/*
+	 * XXX need to get goal block from mballoc's data structures
+	 */
+
+	goal = ext4_find_near(inode, partial);
+	goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+	return goal;
+}
+
+/**
+ *	ext4_blks_to_allocate - Look up the block map and count the number
+ *	of direct blocks need to be allocated for the given branch.
+ *
+ *	@branch: chain of indirect blocks
+ *	@k: number of blocks need for indirect blocks
+ *	@blks: number of data blocks to be mapped.
+ *	@blocks_to_boundary:  the offset in the indirect block
+ *
+ *	return the total number of blocks to be allocate, including the
+ *	direct and indirect blocks.
+ */
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
+				 int blocks_to_boundary)
+{
+	unsigned int count = 0;
+
+	/*
+	 * Simple case, [t,d]Indirect block(s) has not allocated yet
+	 * then it's clear blocks on that path have not allocated
+	 */
+	if (k > 0) {
+		/* right now we don't handle cross boundary allocation */
+		if (blks < blocks_to_boundary + 1)
+			count += blks;
+		else
+			count += blocks_to_boundary + 1;
+		return count;
+	}
+
+	count++;
+	while (count < blks && count <= blocks_to_boundary &&
+		le32_to_cpu(*(branch[0].p + count)) == 0) {
+		count++;
+	}
+	return count;
+}
+
+/**
+ *	ext4_alloc_branch - allocate and set up a chain of blocks.
+ *	@handle: handle for this transaction
+ *	@inode: owner
+ *	@indirect_blks: number of allocated indirect blocks
+ *	@blks: number of allocated direct blocks
+ *	@goal: preferred place for allocation
+ *	@offsets: offsets (in the blocks) to store the pointers to next.
+ *	@branch: place to store the chain in.
+ *
+ *	This function allocates blocks, zeroes out all but the last one,
+ *	links them into chain and (if we are synchronous) writes them to disk.
+ *	In other words, it prepares a branch that can be spliced onto the
+ *	inode. It stores the information about that chain in the branch[], in
+ *	the same format as ext4_get_branch() would do. We are calling it after
+ *	we had read the existing part of chain and partial points to the last
+ *	triple of that (one with zero ->key). Upon the exit we have the same
+ *	picture as after the successful ext4_get_block(), except that in one
+ *	place chain is disconnected - *branch->p is still zero (we did not
+ *	set the last link), but branch->key contains the number that should
+ *	be placed into *branch->p to fill that gap.
+ *
+ *	If allocation fails we free all blocks we've allocated (and forget
+ *	their buffer_heads) and return the error value the from failed
+ *	ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ *	as described above and return 0.
+ */
+static int ext4_alloc_branch(handle_t *handle,
+			     struct ext4_allocation_request *ar,
+			     int indirect_blks, ext4_lblk_t *offsets,
+			     Indirect *branch)
+{
+	struct buffer_head *		bh;
+	ext4_fsblk_t			b, new_blocks[4];
+	__le32				*p;
+	int				i, j, err, len = 1;
+
+	for (i = 0; i <= indirect_blks; i++) {
+		if (i == indirect_blks) {
+			new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
+		} else
+			ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
+					ar->inode, ar->goal,
+					ar->flags & EXT4_MB_DELALLOC_RESERVED,
+					NULL, &err);
+		if (err) {
+			i--;
+			goto failed;
+		}
+		branch[i].key = cpu_to_le32(new_blocks[i]);
+		if (i == 0)
+			continue;
+
+		bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		err = ext4_journal_get_create_access(handle, bh);
+		if (err) {
+			unlock_buffer(bh);
+			goto failed;
+		}
+
+		memset(bh->b_data, 0, bh->b_size);
+		p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
+		b = new_blocks[i];
+
+		if (i == indirect_blks)
+			len = ar->len;
+		for (j = 0; j < len; j++)
+			*p++ = cpu_to_le32(b++);
+
+		BUFFER_TRACE(bh, "marking uptodate");
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, ar->inode, bh);
+		if (err)
+			goto failed;
+	}
+	return 0;
+failed:
+	for (; i >= 0; i--) {
+		/*
+		 * We want to ext4_forget() only freshly allocated indirect
+		 * blocks.  Buffer for new_blocks[i-1] is at branch[i].bh and
+		 * buffer at branch[0].bh is indirect block / inode already
+		 * existing before ext4_alloc_branch() was called.
+		 */
+		if (i > 0 && i != indirect_blks && branch[i].bh)
+			ext4_forget(handle, 1, ar->inode, branch[i].bh,
+				    branch[i].bh->b_blocknr);
+		ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+				 (i == indirect_blks) ? ar->len : 1, 0);
+	}
+	return err;
+}
+
+/**
+ * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @chain: chain of indirect blocks (with a missing link - see
+ *	ext4_alloc_branch)
+ * @where: location of missing link
+ * @num:   number of indirect blocks we are adding
+ * @blks:  number of direct blocks we are adding
+ *
+ * This function fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
+ */
+static int ext4_splice_branch(handle_t *handle,
+			      struct ext4_allocation_request *ar,
+			      Indirect *where, int num)
+{
+	int i;
+	int err = 0;
+	ext4_fsblk_t current_block;
+
+	/*
+	 * If we're splicing into a [td]indirect block (as opposed to the
+	 * inode) then we need to get write access to the [td]indirect block
+	 * before the splice.
+	 */
+	if (where->bh) {
+		BUFFER_TRACE(where->bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, where->bh);
+		if (err)
+			goto err_out;
+	}
+	/* That's it */
+
+	*where->p = where->key;
+
+	/*
+	 * Update the host buffer_head or inode to point to more just allocated
+	 * direct blocks blocks
+	 */
+	if (num == 0 && ar->len > 1) {
+		current_block = le32_to_cpu(where->key) + 1;
+		for (i = 1; i < ar->len; i++)
+			*(where->p + i) = cpu_to_le32(current_block++);
+	}
+
+	/* We are done with atomic stuff, now do the rest of housekeeping */
+	/* had we spliced it onto indirect block? */
+	if (where->bh) {
+		/*
+		 * If we spliced it onto an indirect block, we haven't
+		 * altered the inode.  Note however that if it is being spliced
+		 * onto an indirect block at the very end of the file (the
+		 * file is growing) then we *will* alter the inode to reflect
+		 * the new i_size.  But that is not done here - it is done in
+		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
+		 */
+		jbd_debug(5, "splicing indirect only\n");
+		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
+		if (err)
+			goto err_out;
+	} else {
+		/*
+		 * OK, we spliced it into the inode itself on a direct block.
+		 */
+		ext4_mark_inode_dirty(handle, ar->inode);
+		jbd_debug(5, "splicing direct\n");
+	}
+	return err;
+
+err_out:
+	for (i = 1; i <= num; i++) {
+		/*
+		 * branch[i].bh is newly allocated, so there is no
+		 * need to revoke the block, which is why we don't
+		 * need to set EXT4_FREE_BLOCKS_METADATA.
+		 */
+		ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1,
+				 EXT4_FREE_BLOCKS_FORGET);
+	}
+	ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key),
+			 ar->len, 0);
+
+	return err;
+}
+
+/*
+ * The ext4_ind_map_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_map_blocks().
+ *
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ *
+ * `handle' can be NULL if create == 0.
+ *
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ *
+ * The ext4_ind_get_blocks() function should be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
+ */
+int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+			struct ext4_map_blocks *map,
+			int flags)
+{
+	struct ext4_allocation_request ar;
+	int err = -EIO;
+	ext4_lblk_t offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	int indirect_blks;
+	int blocks_to_boundary = 0;
+	int depth;
+	int count = 0;
+	ext4_fsblk_t first_block = 0;
+
+	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
+	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
+				   &blocks_to_boundary);
+
+	if (depth == 0)
+		goto out;
+
+	partial = ext4_get_branch(inode, depth, offsets, chain, &err);
+
+	/* Simplest case - block found, no allocation needed */
+	if (!partial) {
+		first_block = le32_to_cpu(chain[depth - 1].key);
+		count++;
+		/*map more blocks*/
+		while (count < map->m_len && count <= blocks_to_boundary) {
+			ext4_fsblk_t blk;
+
+			blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+			if (blk == first_block + count)
+				count++;
+			else
+				break;
+		}
+		goto got_it;
+	}
+
+	/* Next simple case - plain lookup or failed read of indirect block */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
+		goto cleanup;
+
+	/*
+	 * Okay, we need to do block allocation.
+	*/
+	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+		EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
+				 "non-extent mapped inodes with bigalloc");
+		return -ENOSPC;
+	}
+
+	/* Set up for the direct block allocation */
+	memset(&ar, 0, sizeof(ar));
+	ar.inode = inode;
+	ar.logical = map->m_lblk;
+	if (S_ISREG(inode->i_mode))
+		ar.flags = EXT4_MB_HINT_DATA;
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+
+	ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
+
+	/* the number of blocks need to allocate for [d,t]indirect blocks */
+	indirect_blks = (chain + depth) - partial - 1;
+
+	/*
+	 * Next look up the indirect map to count the totoal number of
+	 * direct blocks to allocate for this branch.
+	 */
+	ar.len = ext4_blks_to_allocate(partial, indirect_blks,
+				       map->m_len, blocks_to_boundary);
+
+	/*
+	 * Block out ext4_truncate while we alter the tree
+	 */
+	err = ext4_alloc_branch(handle, &ar, indirect_blks,
+				offsets + (partial - chain), partial);
+
+	/*
+	 * The ext4_splice_branch call will free and forget any buffers
+	 * on the new chain if there is a failure, but that risks using
+	 * up transaction credits, especially for bitmaps where the
+	 * credits cannot be returned.  Can we handle this somehow?  We
+	 * may need to return -EAGAIN upwards in the worst case.  --sct
+	 */
+	if (!err)
+		err = ext4_splice_branch(handle, &ar, partial, indirect_blks);
+	if (err)
+		goto cleanup;
+
+	map->m_flags |= EXT4_MAP_NEW;
+
+	ext4_update_inode_fsync_trans(handle, inode, 1);
+	count = ar.len;
+got_it:
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = le32_to_cpu(chain[depth-1].key);
+	map->m_len = count;
+	if (count > blocks_to_boundary)
+		map->m_flags |= EXT4_MAP_BOUNDARY;
+	err = count;
+	/* Clean up and exit */
+	partial = chain + depth - 1;	/* the whole chain */
+cleanup:
+	while (partial > chain) {
+		BUFFER_TRACE(partial->bh, "call brelse");
+		brelse(partial->bh);
+		partial--;
+	}
+out:
+	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
+	return err;
+}
+
+/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ * If the O_DIRECT write is intantiating holes inside i_size and the machine
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
+ */
+ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			   loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	handle_t *handle;
+	ssize_t ret;
+	int orphan = 0;
+	size_t count = iov_iter_count(iter);
+	int retries = 0;
+
+	if (iov_iter_rw(iter) == WRITE) {
+		loff_t final_size = offset + count;
+
+		if (final_size > inode->i_size) {
+			/* Credits for sb + inode write */
+			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				goto out;
+			}
+			ret = ext4_orphan_add(handle, inode);
+			if (ret) {
+				ext4_journal_stop(handle);
+				goto out;
+			}
+			orphan = 1;
+			ei->i_disksize = inode->i_size;
+			ext4_journal_stop(handle);
+		}
+	}
+
+retry:
+	if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
+		/*
+		 * Nolock dioread optimization may be dynamically disabled
+		 * via ext4_inode_block_unlocked_dio(). Check inode's state
+		 * while holding extra i_dio_count ref.
+		 */
+		inode_dio_begin(inode);
+		smp_mb();
+		if (unlikely(ext4_test_inode_state(inode,
+						    EXT4_STATE_DIOREAD_LOCK))) {
+			inode_dio_end(inode);
+			goto locked;
+		}
+		if (IS_DAX(inode))
+			ret = dax_do_io(iocb, inode, iter, offset,
+					ext4_get_block, NULL, 0);
+		else
+			ret = __blockdev_direct_IO(iocb, inode,
+						   inode->i_sb->s_bdev, iter,
+						   offset, ext4_get_block, NULL,
+						   NULL, 0);
+		inode_dio_end(inode);
+	} else {
+locked:
+		if (IS_DAX(inode))
+			ret = dax_do_io(iocb, inode, iter, offset,
+					ext4_get_block, NULL, DIO_LOCKING);
+		else
+			ret = blockdev_direct_IO(iocb, inode, iter, offset,
+						 ext4_get_block);
+
+		if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
+			loff_t isize = i_size_read(inode);
+			loff_t end = offset + count;
+
+			if (end > isize)
+				ext4_truncate_failed_write(inode);
+		}
+	}
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	if (orphan) {
+		int err;
+
+		/* Credits for sb + inode write */
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			/* This is really bad luck. We've written the data
+			 * but cannot extend i_size. Bail out and pretend
+			 * the write failed... */
+			ret = PTR_ERR(handle);
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+
+			goto out;
+		}
+		if (inode->i_nlink)
+			ext4_orphan_del(handle, inode);
+		if (ret > 0) {
+			loff_t end = offset + ret;
+			if (end > inode->i_size) {
+				ei->i_disksize = end;
+				i_size_write(inode, end);
+				/*
+				 * We're going to return a positive `ret'
+				 * here due to non-zero-length I/O, so there's
+				 * no way of reporting error returns from
+				 * ext4_mark_inode_dirty() to userspace.  So
+				 * ignore it.
+				 */
+				ext4_mark_inode_dirty(handle, inode);
+			}
+		}
+		err = ext4_journal_stop(handle);
+		if (ret == 0)
+			ret = err;
+	}
+out:
+	return ret;
+}
+
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate a new block at @lblocks for non extent file based file
+ */
+int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+	int blk_bits;
+
+	if (lblock < EXT4_NDIR_BLOCKS)
+		return 0;
+
+	lblock -= EXT4_NDIR_BLOCKS;
+
+	if (ei->i_da_metadata_calc_len &&
+	    (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+		ei->i_da_metadata_calc_len++;
+		return 0;
+	}
+	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+	ei->i_da_metadata_calc_len = 1;
+	blk_bits = order_base_2(lblock);
+	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
+}
+
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
+{
+	/*
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block
+	 */
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+}
+
+/*
+ * Truncate transactions can be complex and absolutely huge.  So we need to
+ * be able to restart the transaction at a conventient checkpoint to make
+ * sure we don't overflow the journal.
+ *
+ * Try to extend this transaction for the purposes of truncation.  If
+ * extend fails, we need to propagate the failure up and restart the
+ * transaction in the top-level truncate loop. --sct
+ *
+ * Returns 0 if we managed to create more room.  If we can't create more
+ * room, and the transaction must be restarted we return 1.
+ */
+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+{
+	if (!ext4_handle_valid(handle))
+		return 0;
+	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
+		return 0;
+	if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
+		return 0;
+	return 1;
+}
+
+/*
+ * Probably it should be a library function... search for first non-zero word
+ * or memcmp with zero_page, whatever is better for particular architecture.
+ * Linus?
+ */
+static inline int all_zeroes(__le32 *p, __le32 *q)
+{
+	while (p < q)
+		if (*p++)
+			return 0;
+	return 1;
+}
+
+/**
+ *	ext4_find_shared - find the indirect blocks for partial truncation.
+ *	@inode:	  inode in question
+ *	@depth:	  depth of the affected branch
+ *	@offsets: offsets of pointers in that branch (see ext4_block_to_path)
+ *	@chain:	  place to store the pointers to partial indirect blocks
+ *	@top:	  place to the (detached) top of branch
+ *
+ *	This is a helper function used by ext4_truncate().
+ *
+ *	When we do truncate() we may have to clean the ends of several
+ *	indirect blocks but leave the blocks themselves alive. Block is
+ *	partially truncated if some data below the new i_size is referred
+ *	from it (and it is on the path to the first completely truncated
+ *	data block, indeed).  We have to free the top of that path along
+ *	with everything to the right of the path. Since no allocation
+ *	past the truncation point is possible until ext4_truncate()
+ *	finishes, we may safely do the latter, but top of branch may
+ *	require special attention - pageout below the truncation point
+ *	might try to populate it.
+ *
+ *	We atomically detach the top of branch from the tree, store the
+ *	block number of its root in *@top, pointers to buffer_heads of
+ *	partially truncated blocks - in @chain[].bh and pointers to
+ *	their last elements that should not be removed - in
+ *	@chain[].p. Return value is the pointer to last filled element
+ *	of @chain.
+ *
+ *	The work left to caller to do the actual freeing of subtrees:
+ *		a) free the subtree starting from *@top
+ *		b) free the subtrees whose roots are stored in
+ *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ *		c) free the subtrees growing from the inode past the @chain[0].
+ *			(no partially truncated stuff there).  */
+
+static Indirect *ext4_find_shared(struct inode *inode, int depth,
+				  ext4_lblk_t offsets[4], Indirect chain[4],
+				  __le32 *top)
+{
+	Indirect *partial, *p;
+	int k, err;
+
+	*top = 0;
+	/* Make k index the deepest non-null offset + 1 */
+	for (k = depth; k > 1 && !offsets[k-1]; k--)
+		;
+	partial = ext4_get_branch(inode, k, offsets, chain, &err);
+	/* Writer: pointers */
+	if (!partial)
+		partial = chain + k-1;
+	/*
+	 * If the branch acquired continuation since we've looked at it -
+	 * fine, it should all survive and (new) top doesn't belong to us.
+	 */
+	if (!partial->key && *partial->p)
+		/* Writer: end */
+		goto no_top;
+	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
+		;
+	/*
+	 * OK, we've found the last block that must survive. The rest of our
+	 * branch should be detached before unlocking. However, if that rest
+	 * of branch is all ours and does not grow immediately from the inode
+	 * it's easier to cheat and just decrement partial->p.
+	 */
+	if (p == chain + k - 1 && p > chain) {
+		p->p--;
+	} else {
+		*top = *p->p;
+		/* Nope, don't do this in ext4.  Must leave the tree intact */
+#if 0
+		*p->p = 0;
+#endif
+	}
+	/* Writer: end */
+
+	while (partial > p) {
+		brelse(partial->bh);
+		partial--;
+	}
+no_top:
+	return partial;
+}
+
+/*
+ * Zero a number of block pointers in either an inode or an indirect block.
+ * If we restart the transaction we must again get write access to the
+ * indirect block for further modification.
+ *
+ * We release `count' blocks on disk, but (last - first) may be greater
+ * than `count' because there can be holes in there.
+ *
+ * Return 0 on success, 1 on invalid block range
+ * and < 0 on fatal error.
+ */
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+			     struct buffer_head *bh,
+			     ext4_fsblk_t block_to_free,
+			     unsigned long count, __le32 *first,
+			     __le32 *last)
+{
+	__le32 *p;
+	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
+	int	err;
+
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
+
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+				   count)) {
+		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+				 "blocks %llu len %lu",
+				 (unsigned long long) block_to_free, count);
+		return 1;
+	}
+
+	if (try_to_extend_transaction(handle, inode)) {
+		if (bh) {
+			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+			err = ext4_handle_dirty_metadata(handle, inode, bh);
+			if (unlikely(err))
+				goto out_err;
+		}
+		err = ext4_mark_inode_dirty(handle, inode);
+		if (unlikely(err))
+			goto out_err;
+		err = ext4_truncate_restart_trans(handle, inode,
+					ext4_blocks_for_truncate(inode));
+		if (unlikely(err))
+			goto out_err;
+		if (bh) {
+			BUFFER_TRACE(bh, "retaking write access");
+			err = ext4_journal_get_write_access(handle, bh);
+			if (unlikely(err))
+				goto out_err;
+		}
+	}
+
+	for (p = first; p < last; p++)
+		*p = 0;
+
+	ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
+	return 0;
+out_err:
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/**
+ * ext4_free_data - free a list of data blocks
+ * @handle:	handle for this transaction
+ * @inode:	inode we are dealing with
+ * @this_bh:	indirect buffer_head which contains *@first and *@last
+ * @first:	array of block numbers
+ * @last:	points immediately past the end of array
+ *
+ * We are freeing all blocks referred from that array (numbers are stored as
+ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+ *
+ * We accumulate contiguous runs of blocks to free.  Conveniently, if these
+ * blocks are contiguous then releasing them at one time will only affect one
+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+ * actually use a lot of journal space.
+ *
+ * @this_bh will be %NULL if @first and @last point into the inode's direct
+ * block pointers.
+ */
+static void ext4_free_data(handle_t *handle, struct inode *inode,
+			   struct buffer_head *this_bh,
+			   __le32 *first, __le32 *last)
+{
+	ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
+	unsigned long count = 0;	    /* Number of blocks in the run */
+	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
+					       corresponding to
+					       block_to_free */
+	ext4_fsblk_t nr;		    /* Current block # */
+	__le32 *p;			    /* Pointer into inode/ind
+					       for current block */
+	int err = 0;
+
+	if (this_bh) {				/* For indirect block */
+		BUFFER_TRACE(this_bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, this_bh);
+		/* Important: if we can't update the indirect pointers
+		 * to the blocks, we can't free them. */
+		if (err)
+			return;
+	}
+
+	for (p = first; p < last; p++) {
+		nr = le32_to_cpu(*p);
+		if (nr) {
+			/* accumulate blocks to free if they're contiguous */
+			if (count == 0) {
+				block_to_free = nr;
+				block_to_free_p = p;
+				count = 1;
+			} else if (nr == block_to_free + count) {
+				count++;
+			} else {
+				err = ext4_clear_blocks(handle, inode, this_bh,
+						        block_to_free, count,
+						        block_to_free_p, p);
+				if (err)
+					break;
+				block_to_free = nr;
+				block_to_free_p = p;
+				count = 1;
+			}
+		}
+	}
+
+	if (!err && count > 0)
+		err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
+					count, block_to_free_p, p);
+	if (err < 0)
+		/* fatal error */
+		return;
+
+	if (this_bh) {
+		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
+
+		/*
+		 * The buffer head should have an attached journal head at this
+		 * point. However, if the data is corrupted and an indirect
+		 * block pointed to itself, it would have been detached when
+		 * the block was cleared. Check for this instead of OOPSing.
+		 */
+		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
+			ext4_handle_dirty_metadata(handle, inode, this_bh);
+		else
+			EXT4_ERROR_INODE(inode,
+					 "circular indirect block detected at "
+					 "block %llu",
+				(unsigned long long) this_bh->b_blocknr);
+	}
+}
+
+/**
+ *	ext4_free_branches - free an array of branches
+ *	@handle: JBD handle for this transaction
+ *	@inode:	inode we are dealing with
+ *	@parent_bh: the buffer_head which contains *@first and *@last
+ *	@first:	array of block numbers
+ *	@last:	pointer immediately past the end of array
+ *	@depth:	depth of the branches to free
+ *
+ *	We are freeing all blocks referred from these branches (numbers are
+ *	stored as little-endian 32-bit) and updating @inode->i_blocks
+ *	appropriately.
+ */
+static void ext4_free_branches(handle_t *handle, struct inode *inode,
+			       struct buffer_head *parent_bh,
+			       __le32 *first, __le32 *last, int depth)
+{
+	ext4_fsblk_t nr;
+	__le32 *p;
+
+	if (ext4_handle_is_aborted(handle))
+		return;
+
+	if (depth--) {
+		struct buffer_head *bh;
+		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+		p = last;
+		while (--p >= first) {
+			nr = le32_to_cpu(*p);
+			if (!nr)
+				continue;		/* A hole */
+
+			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+						   nr, 1)) {
+				EXT4_ERROR_INODE(inode,
+						 "invalid indirect mapped "
+						 "block %lu (level %d)",
+						 (unsigned long) nr, depth);
+				break;
+			}
+
+			/* Go read the buffer for the next level down */
+			bh = sb_bread(inode->i_sb, nr);
+
+			/*
+			 * A read failure? Report error and clear slot
+			 * (should be rare).
+			 */
+			if (!bh) {
+				EXT4_ERROR_INODE_BLOCK(inode, nr,
+						       "Read failure");
+				continue;
+			}
+
+			/* This zaps the entire block.  Bottom up. */
+			BUFFER_TRACE(bh, "free child branches");
+			ext4_free_branches(handle, inode, bh,
+					(__le32 *) bh->b_data,
+					(__le32 *) bh->b_data + addr_per_block,
+					depth);
+			brelse(bh);
+
+			/*
+			 * Everything below this this pointer has been
+			 * released.  Now let this top-of-subtree go.
+			 *
+			 * We want the freeing of this indirect block to be
+			 * atomic in the journal with the updating of the
+			 * bitmap block which owns it.  So make some room in
+			 * the journal.
+			 *
+			 * We zero the parent pointer *after* freeing its
+			 * pointee in the bitmaps, so if extend_transaction()
+			 * for some reason fails to put the bitmap changes and
+			 * the release into the same transaction, recovery
+			 * will merely complain about releasing a free block,
+			 * rather than leaking blocks.
+			 */
+			if (ext4_handle_is_aborted(handle))
+				return;
+			if (try_to_extend_transaction(handle, inode)) {
+				ext4_mark_inode_dirty(handle, inode);
+				ext4_truncate_restart_trans(handle, inode,
+					    ext4_blocks_for_truncate(inode));
+			}
+
+			/*
+			 * The forget flag here is critical because if
+			 * we are journaling (and not doing data
+			 * journaling), we have to make sure a revoke
+			 * record is written to prevent the journal
+			 * replay from overwriting the (former)
+			 * indirect block if it gets reallocated as a
+			 * data block.  This must happen in the same
+			 * transaction where the data blocks are
+			 * actually freed.
+			 */
+			ext4_free_blocks(handle, inode, NULL, nr, 1,
+					 EXT4_FREE_BLOCKS_METADATA|
+					 EXT4_FREE_BLOCKS_FORGET);
+
+			if (parent_bh) {
+				/*
+				 * The block which we have just freed is
+				 * pointed to by an indirect block: journal it
+				 */
+				BUFFER_TRACE(parent_bh, "get_write_access");
+				if (!ext4_journal_get_write_access(handle,
+								   parent_bh)){
+					*p = 0;
+					BUFFER_TRACE(parent_bh,
+					"call ext4_handle_dirty_metadata");
+					ext4_handle_dirty_metadata(handle,
+								   inode,
+								   parent_bh);
+				}
+			}
+		}
+	} else {
+		/* We have reached the bottom of the tree. */
+		BUFFER_TRACE(parent_bh, "free data blocks");
+		ext4_free_data(handle, inode, parent_bh, first, last);
+	}
+}
+
+void ext4_ind_truncate(handle_t *handle, struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__le32 *i_data = ei->i_data;
+	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	ext4_lblk_t offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	__le32 nr = 0;
+	int n = 0;
+	ext4_lblk_t last_block, max_block;
+	unsigned blocksize = inode->i_sb->s_blocksize;
+
+	last_block = (inode->i_size + blocksize-1)
+					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+
+	if (last_block != max_block) {
+		n = ext4_block_to_path(inode, last_block, offsets, NULL);
+		if (n == 0)
+			return;
+	}
+
+	ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
+
+	/*
+	 * The orphan list entry will now protect us from any crash which
+	 * occurs before the truncate completes, so it is now safe to propagate
+	 * the new, shorter inode size (held for now in i_size) into the
+	 * on-disk inode. We do this via i_disksize, which is the value which
+	 * ext4 *really* writes onto the disk inode.
+	 */
+	ei->i_disksize = inode->i_size;
+
+	if (last_block == max_block) {
+		/*
+		 * It is unnecessary to free any data blocks if last_block is
+		 * equal to the indirect block limit.
+		 */
+		return;
+	} else if (n == 1) {		/* direct blocks */
+		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
+			       i_data + EXT4_NDIR_BLOCKS);
+		goto do_indirects;
+	}
+
+	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+	/* Kill the top of shared branch (not detached) */
+	if (nr) {
+		if (partial == chain) {
+			/* Shared branch grows from the inode */
+			ext4_free_branches(handle, inode, NULL,
+					   &nr, &nr+1, (chain+n-1) - partial);
+			*partial->p = 0;
+			/*
+			 * We mark the inode dirty prior to restart,
+			 * and prior to stop.  No need for it here.
+			 */
+		} else {
+			/* Shared branch grows from an indirect block */
+			BUFFER_TRACE(partial->bh, "get_write_access");
+			ext4_free_branches(handle, inode, partial->bh,
+					partial->p,
+					partial->p+1, (chain+n-1) - partial);
+		}
+	}
+	/* Clear the ends of indirect blocks on the shared branch */
+	while (partial > chain) {
+		ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
+				   (__le32*)partial->bh->b_data+addr_per_block,
+				   (chain+n-1) - partial);
+		BUFFER_TRACE(partial->bh, "call brelse");
+		brelse(partial->bh);
+		partial--;
+	}
+do_indirects:
+	/* Kill the remaining (whole) subtrees */
+	switch (offsets[0]) {
+	default:
+		nr = i_data[EXT4_IND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+			i_data[EXT4_IND_BLOCK] = 0;
+		}
+	case EXT4_IND_BLOCK:
+		nr = i_data[EXT4_DIND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+			i_data[EXT4_DIND_BLOCK] = 0;
+		}
+	case EXT4_DIND_BLOCK:
+		nr = i_data[EXT4_TIND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+			i_data[EXT4_TIND_BLOCK] = 0;
+		}
+	case EXT4_TIND_BLOCK:
+		;
+	}
+}
+
+/**
+ *	ext4_ind_remove_space - remove space from the range
+ *	@handle: JBD handle for this transaction
+ *	@inode:	inode we are dealing with
+ *	@start:	First block to remove
+ *	@end:	One block after the last block to remove (exclusive)
+ *
+ *	Free the blocks in the defined range (end is exclusive endpoint of
+ *	range). This is used by ext4_punch_hole().
+ */
+int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
+			  ext4_lblk_t start, ext4_lblk_t end)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__le32 *i_data = ei->i_data;
+	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	ext4_lblk_t offsets[4], offsets2[4];
+	Indirect chain[4], chain2[4];
+	Indirect *partial, *partial2;
+	ext4_lblk_t max_block;
+	__le32 nr = 0, nr2 = 0;
+	int n = 0, n2 = 0;
+	unsigned blocksize = inode->i_sb->s_blocksize;
+
+	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+	if (end >= max_block)
+		end = max_block;
+	if ((start >= end) || (start > max_block))
+		return 0;
+
+	n = ext4_block_to_path(inode, start, offsets, NULL);
+	n2 = ext4_block_to_path(inode, end, offsets2, NULL);
+
+	BUG_ON(n > n2);
+
+	if ((n == 1) && (n == n2)) {
+		/* We're punching only within direct block range */
+		ext4_free_data(handle, inode, NULL, i_data + offsets[0],
+			       i_data + offsets2[0]);
+		return 0;
+	} else if (n2 > n) {
+		/*
+		 * Start and end are on a different levels so we're going to
+		 * free partial block at start, and partial block at end of
+		 * the range. If there are some levels in between then
+		 * do_indirects label will take care of that.
+		 */
+
+		if (n == 1) {
+			/*
+			 * Start is at the direct block level, free
+			 * everything to the end of the level.
+			 */
+			ext4_free_data(handle, inode, NULL, i_data + offsets[0],
+				       i_data + EXT4_NDIR_BLOCKS);
+			goto end_range;
+		}
+
+
+		partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+		if (nr) {
+			if (partial == chain) {
+				/* Shared branch grows from the inode */
+				ext4_free_branches(handle, inode, NULL,
+					   &nr, &nr+1, (chain+n-1) - partial);
+				*partial->p = 0;
+			} else {
+				/* Shared branch grows from an indirect block */
+				BUFFER_TRACE(partial->bh, "get_write_access");
+				ext4_free_branches(handle, inode, partial->bh,
+					partial->p,
+					partial->p+1, (chain+n-1) - partial);
+			}
+		}
+
+		/*
+		 * Clear the ends of indirect blocks on the shared branch
+		 * at the start of the range
+		 */
+		while (partial > chain) {
+			ext4_free_branches(handle, inode, partial->bh,
+				partial->p + 1,
+				(__le32 *)partial->bh->b_data+addr_per_block,
+				(chain+n-1) - partial);
+			BUFFER_TRACE(partial->bh, "call brelse");
+			brelse(partial->bh);
+			partial--;
+		}
+
+end_range:
+		partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+		if (nr2) {
+			if (partial2 == chain2) {
+				/*
+				 * Remember, end is exclusive so here we're at
+				 * the start of the next level we're not going
+				 * to free. Everything was covered by the start
+				 * of the range.
+				 */
+				goto do_indirects;
+			}
+		} else {
+			/*
+			 * ext4_find_shared returns Indirect structure which
+			 * points to the last element which should not be
+			 * removed by truncate. But this is end of the range
+			 * in punch_hole so we need to point to the next element
+			 */
+			partial2->p++;
+		}
+
+		/*
+		 * Clear the ends of indirect blocks on the shared branch
+		 * at the end of the range
+		 */
+		while (partial2 > chain2) {
+			ext4_free_branches(handle, inode, partial2->bh,
+					   (__le32 *)partial2->bh->b_data,
+					   partial2->p,
+					   (chain2+n2-1) - partial2);
+			BUFFER_TRACE(partial2->bh, "call brelse");
+			brelse(partial2->bh);
+			partial2--;
+		}
+		goto do_indirects;
+	}
+
+	/* Punch happened within the same level (n == n2) */
+	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+	partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+
+	/* Free top, but only if partial2 isn't its subtree. */
+	if (nr) {
+		int level = min(partial - chain, partial2 - chain2);
+		int i;
+		int subtree = 1;
+
+		for (i = 0; i <= level; i++) {
+			if (offsets[i] != offsets2[i]) {
+				subtree = 0;
+				break;
+			}
+		}
+
+		if (!subtree) {
+			if (partial == chain) {
+				/* Shared branch grows from the inode */
+				ext4_free_branches(handle, inode, NULL,
+						   &nr, &nr+1,
+						   (chain+n-1) - partial);
+				*partial->p = 0;
+			} else {
+				/* Shared branch grows from an indirect block */
+				BUFFER_TRACE(partial->bh, "get_write_access");
+				ext4_free_branches(handle, inode, partial->bh,
+						   partial->p,
+						   partial->p+1,
+						   (chain+n-1) - partial);
+			}
+		}
+	}
+
+	if (!nr2) {
+		/*
+		 * ext4_find_shared returns Indirect structure which
+		 * points to the last element which should not be
+		 * removed by truncate. But this is end of the range
+		 * in punch_hole so we need to point to the next element
+		 */
+		partial2->p++;
+	}
+
+	while (partial > chain || partial2 > chain2) {
+		int depth = (chain+n-1) - partial;
+		int depth2 = (chain2+n2-1) - partial2;
+
+		if (partial > chain && partial2 > chain2 &&
+		    partial->bh->b_blocknr == partial2->bh->b_blocknr) {
+			/*
+			 * We've converged on the same block. Clear the range,
+			 * then we're done.
+			 */
+			ext4_free_branches(handle, inode, partial->bh,
+					   partial->p + 1,
+					   partial2->p,
+					   (chain+n-1) - partial);
+			BUFFER_TRACE(partial->bh, "call brelse");
+			brelse(partial->bh);
+			BUFFER_TRACE(partial2->bh, "call brelse");
+			brelse(partial2->bh);
+			return 0;
+		}
+
+		/*
+		 * The start and end partial branches may not be at the same
+		 * level even though the punch happened within one level. So, we
+		 * give them a chance to arrive at the same level, then walk
+		 * them in step with each other until we converge on the same
+		 * block.
+		 */
+		if (partial > chain && depth <= depth2) {
+			ext4_free_branches(handle, inode, partial->bh,
+					   partial->p + 1,
+					   (__le32 *)partial->bh->b_data+addr_per_block,
+					   (chain+n-1) - partial);
+			BUFFER_TRACE(partial->bh, "call brelse");
+			brelse(partial->bh);
+			partial--;
+		}
+		if (partial2 > chain2 && depth2 <= depth) {
+			ext4_free_branches(handle, inode, partial2->bh,
+					   (__le32 *)partial2->bh->b_data,
+					   partial2->p,
+					   (chain2+n2-1) - partial2);
+			BUFFER_TRACE(partial2->bh, "call brelse");
+			brelse(partial2->bh);
+			partial2--;
+		}
+	}
+	return 0;
+
+do_indirects:
+	/* Kill the remaining (whole) subtrees */
+	switch (offsets[0]) {
+	default:
+		if (++n >= n2)
+			return 0;
+		nr = i_data[EXT4_IND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+			i_data[EXT4_IND_BLOCK] = 0;
+		}
+	case EXT4_IND_BLOCK:
+		if (++n >= n2)
+			return 0;
+		nr = i_data[EXT4_DIND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+			i_data[EXT4_DIND_BLOCK] = 0;
+		}
+	case EXT4_DIND_BLOCK:
+		if (++n >= n2)
+			return 0;
+		nr = i_data[EXT4_TIND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+			i_data[EXT4_TIND_BLOCK] = 0;
+		}
+	case EXT4_TIND_BLOCK:
+		;
+	}
+	return 0;
+}
diff --git a/kernel/fs/ext4/inline.c b/kernel/fs/ext4/inline.c
new file mode 100644
index 000000000..095c7a258
--- /dev/null
+++ b/kernel/fs/ext4/inline.c
@@ -0,0 +1,2024 @@
+/*
+ * Copyright (c) 2012 Taobao.
+ * Written by Tao Ma <boyu.mt@taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/fiemap.h>
+
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "truncate.h"
+
+#define EXT4_XATTR_SYSTEM_DATA	"data"
+#define EXT4_MIN_INLINE_DATA_SIZE	((sizeof(__le32) * EXT4_N_BLOCKS))
+#define EXT4_INLINE_DOTDOT_OFFSET	2
+#define EXT4_INLINE_DOTDOT_SIZE		4
+
+static int ext4_get_inline_size(struct inode *inode)
+{
+	if (EXT4_I(inode)->i_inline_off)
+		return EXT4_I(inode)->i_inline_size;
+
+	return 0;
+}
+
+static int get_max_inline_xattr_value_size(struct inode *inode,
+					   struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	int free, min_offs;
+
+	min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
+			EXT4_GOOD_OLD_INODE_SIZE -
+			EXT4_I(inode)->i_extra_isize -
+			sizeof(struct ext4_xattr_ibody_header);
+
+	/*
+	 * We need to subtract another sizeof(__u32) since an in-inode xattr
+	 * needs an empty 4 bytes to indicate the gap between the xattr entry
+	 * and the name/value pair.
+	 */
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		return EXT4_XATTR_SIZE(min_offs -
+			EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
+			EXT4_XATTR_ROUND - sizeof(__u32));
+
+	raw_inode = ext4_raw_inode(iloc);
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+
+	/* Compute min_offs. */
+	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_block && entry->e_value_size) {
+			size_t offs = le16_to_cpu(entry->e_value_offs);
+			if (offs < min_offs)
+				min_offs = offs;
+		}
+	}
+	free = min_offs -
+		((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
+
+	if (EXT4_I(inode)->i_inline_off) {
+		entry = (struct ext4_xattr_entry *)
+			((void *)raw_inode + EXT4_I(inode)->i_inline_off);
+
+		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
+		goto out;
+	}
+
+	free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
+
+	if (free > EXT4_XATTR_ROUND)
+		free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
+	else
+		free = 0;
+
+out:
+	return free;
+}
+
+/*
+ * Get the maximum size we now can store in an inode.
+ * If we can't find the space for a xattr entry, don't use the space
+ * of the extents since we have no space to indicate the inline data.
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+	int error, max_inline_size;
+	struct ext4_iloc iloc;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error) {
+		ext4_error_inode(inode, __func__, __LINE__, 0,
+				 "can't get inode location %lu",
+				 inode->i_ino);
+		return 0;
+	}
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+
+	brelse(iloc.bh);
+
+	if (!max_inline_size)
+		return 0;
+
+	return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
+}
+
+/*
+ * this function does not take xattr_sem, which is OK because it is
+ * currently only used in a code path coming form ext4_iget, before
+ * the new inode has been unlocked
+ */
+int ext4_find_inline_data_nolock(struct inode *inode)
+{
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+	int error;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	if (!is.s.not_found) {
+		EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+					(void *)ext4_raw_inode(&is.iloc));
+		EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+				le32_to_cpu(is.s.here->e_value_size);
+		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	}
+out:
+	brelse(is.iloc.bh);
+	return error;
+}
+
+static int ext4_read_inline_data(struct inode *inode, void *buffer,
+				 unsigned int len,
+				 struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+	int cp_len = 0;
+	struct ext4_inode *raw_inode;
+
+	if (!len)
+		return 0;
+
+	BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+	cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
+			len : EXT4_MIN_INLINE_DATA_SIZE;
+
+	raw_inode = ext4_raw_inode(iloc);
+	memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
+
+	len -= cp_len;
+	buffer += cp_len;
+
+	if (!len)
+		goto out;
+
+	header = IHDR(inode, raw_inode);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    EXT4_I(inode)->i_inline_off);
+	len = min_t(unsigned int, len,
+		    (unsigned int)le32_to_cpu(entry->e_value_size));
+
+	memcpy(buffer,
+	       (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+	cp_len += len;
+
+out:
+	return cp_len;
+}
+
+/*
+ * write the buffer to the inline inode.
+ * If 'create' is set, we don't need to do the extra copy in the xattr
+ * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * That saves us one memcpy.
+ */
+static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+				   void *buffer, loff_t pos, unsigned int len)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	int cp_len = 0;
+
+	BUG_ON(!EXT4_I(inode)->i_inline_off);
+	BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
+
+	raw_inode = ext4_raw_inode(iloc);
+	buffer += pos;
+
+	if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
+		cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
+			 EXT4_MIN_INLINE_DATA_SIZE - pos : len;
+		memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
+
+		len -= cp_len;
+		buffer += cp_len;
+		pos += cp_len;
+	}
+
+	if (!len)
+		return;
+
+	pos -= EXT4_MIN_INLINE_DATA_SIZE;
+	header = IHDR(inode, raw_inode);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    EXT4_I(inode)->i_inline_off);
+
+	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+	       buffer, len);
+}
+
+static int ext4_create_inline_data(handle_t *handle,
+				   struct inode *inode, unsigned len)
+{
+	int error;
+	void *value = NULL;
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	BUFFER_TRACE(is.iloc.bh, "get_write_access");
+	error = ext4_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto out;
+
+	if (len > EXT4_MIN_INLINE_DATA_SIZE) {
+		value = EXT4_ZERO_XATTR_VALUE;
+		len -= EXT4_MIN_INLINE_DATA_SIZE;
+	} else {
+		value = "";
+		len = 0;
+	}
+
+	/* Insert the the xttr entry. */
+	i.value = value;
+	i.value_len = len;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	BUG_ON(!is.s.not_found);
+
+	error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+	if (error) {
+		if (error == -ENOSPC)
+			ext4_clear_inode_state(inode,
+					       EXT4_STATE_MAY_INLINE_DATA);
+		goto out;
+	}
+
+	memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+		0, EXT4_MIN_INLINE_DATA_SIZE);
+
+	EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+				      (void *)ext4_raw_inode(&is.iloc));
+	EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+	ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+	get_bh(is.iloc.bh);
+	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+	brelse(is.iloc.bh);
+	return error;
+}
+
+static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
+				   unsigned int len)
+{
+	int error;
+	void *value = NULL;
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+
+	/* If the old space is ok, write the data directly. */
+	if (len <= EXT4_I(inode)->i_inline_size)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	BUG_ON(is.s.not_found);
+
+	len -= EXT4_MIN_INLINE_DATA_SIZE;
+	value = kzalloc(len, GFP_NOFS);
+	if (!value)
+		goto out;
+
+	error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
+				     value, len);
+	if (error == -ENODATA)
+		goto out;
+
+	BUFFER_TRACE(is.iloc.bh, "get_write_access");
+	error = ext4_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto out;
+
+	/* Update the xttr entry. */
+	i.value = value;
+	i.value_len = len;
+
+	error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+	if (error)
+		goto out;
+
+	EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+				      (void *)ext4_raw_inode(&is.iloc));
+	EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+				le32_to_cpu(is.s.here->e_value_size);
+	ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	get_bh(is.iloc.bh);
+	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+	kfree(value);
+	brelse(is.iloc.bh);
+	return error;
+}
+
+static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+				    unsigned int len)
+{
+	int ret, size;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+		return -ENOSPC;
+
+	size = ext4_get_max_inline_size(inode);
+	if (size < len)
+		return -ENOSPC;
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+
+	if (ei->i_inline_off)
+		ret = ext4_update_inline_data(handle, inode, len);
+	else
+		ret = ext4_create_inline_data(handle, inode, len);
+
+	up_write(&EXT4_I(inode)->xattr_sem);
+
+	return ret;
+}
+
+static int ext4_destroy_inline_data_nolock(handle_t *handle,
+					   struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = 0, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+		.value = NULL,
+		.value_len = 0,
+	};
+	int error;
+
+	if (!ei->i_inline_off)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	BUFFER_TRACE(is.iloc.bh, "get_write_access");
+	error = ext4_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto out;
+
+	error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+	if (error)
+		goto out;
+
+	memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+		0, EXT4_MIN_INLINE_DATA_SIZE);
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+				      EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+		if (S_ISDIR(inode->i_mode) ||
+		    S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+			ext4_ext_tree_init(handle, inode);
+		}
+	}
+	ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+
+	get_bh(is.iloc.bh);
+	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+	EXT4_I(inode)->i_inline_off = 0;
+	EXT4_I(inode)->i_inline_size = 0;
+	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+out:
+	brelse(is.iloc.bh);
+	if (error == -ENODATA)
+		error = 0;
+	return error;
+}
+
+static int ext4_read_inline_page(struct inode *inode, struct page *page)
+{
+	void *kaddr;
+	int ret = 0;
+	size_t len;
+	struct ext4_iloc iloc;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!ext4_has_inline_data(inode));
+	BUG_ON(page->index);
+
+	if (!EXT4_I(inode)->i_inline_off) {
+		ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
+			     inode->i_ino);
+		goto out;
+	}
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		goto out;
+
+	len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
+	kaddr = kmap_atomic(page);
+	ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+	zero_user_segment(page, len, PAGE_CACHE_SIZE);
+	SetPageUptodate(page);
+	brelse(iloc.bh);
+
+out:
+	return ret;
+}
+
+int ext4_readpage_inline(struct inode *inode, struct page *page)
+{
+	int ret = 0;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Current inline data can only exist in the 1st page,
+	 * So for all the other pages, just set them uptodate.
+	 */
+	if (!page->index)
+		ret = ext4_read_inline_page(inode, page);
+	else if (!PageUptodate(page)) {
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+	}
+
+	up_read(&EXT4_I(inode)->xattr_sem);
+
+	unlock_page(page);
+	return ret >= 0 ? 0 : ret;
+}
+
+static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
+					      struct inode *inode,
+					      unsigned flags)
+{
+	int ret, needed_blocks;
+	handle_t *handle = NULL;
+	int retries = 0, sem_held = 0;
+	struct page *page = NULL;
+	unsigned from, to;
+	struct ext4_iloc iloc;
+
+	if (!ext4_has_inline_data(inode)) {
+		/*
+		 * clear the flag so that no new write
+		 * will trap here again.
+		 */
+		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+		return 0;
+	}
+
+	needed_blocks = ext4_writepage_trans_blocks(inode);
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+retry:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
+
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	sem_held = 1;
+	/* If some one has already done this for us, just exit. */
+	if (!ext4_has_inline_data(inode)) {
+		ret = 0;
+		goto out;
+	}
+
+	from = 0;
+	to = ext4_get_inline_size(inode);
+	if (!PageUptodate(page)) {
+		ret = ext4_read_inline_page(inode, page);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = ext4_destroy_inline_data_nolock(handle, inode);
+	if (ret)
+		goto out;
+
+	if (ext4_should_dioread_nolock(inode))
+		ret = __block_write_begin(page, from, to, ext4_get_block_write);
+	else
+		ret = __block_write_begin(page, from, to, ext4_get_block);
+
+	if (!ret && ext4_should_journal_data(inode)) {
+		ret = ext4_walk_page_buffers(handle, page_buffers(page),
+					     from, to, NULL,
+					     do_journal_get_write_access);
+	}
+
+	if (ret) {
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		ext4_orphan_add(handle, inode);
+		up_write(&EXT4_I(inode)->xattr_sem);
+		sem_held = 0;
+		ext4_journal_stop(handle);
+		handle = NULL;
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might
+		 * still be on the orphan list; we need to
+		 * make sure the inode is removed from the
+		 * orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	if (page)
+		block_commit_write(page, from, to);
+out:
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	if (sem_held)
+		up_write(&EXT4_I(inode)->xattr_sem);
+	if (handle)
+		ext4_journal_stop(handle);
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can be
+ * in the inode also. If not, create the page the handle, move the data
+ * to the page make it update and let the later codes create extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+				  struct inode *inode,
+				  loff_t pos, unsigned len,
+				  unsigned flags,
+				  struct page **pagep)
+{
+	int ret;
+	handle_t *handle;
+	struct page *page;
+	struct ext4_iloc iloc;
+
+	if (pos + len > ext4_get_max_inline_size(inode))
+		goto convert;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	/*
+	 * The possible write could happen in the inode,
+	 * so try to reserve the space in inode first.
+	 */
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	ret = ext4_prepare_inline_data(handle, inode, pos + len);
+	if (ret && ret != -ENOSPC)
+		goto out;
+
+	/* We don't have space in inline inode, so convert it to extent. */
+	if (ret == -ENOSPC) {
+		ext4_journal_stop(handle);
+		brelse(iloc.bh);
+		goto convert;
+	}
+
+	flags |= AOP_FLAG_NOFS;
+
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	*pagep = page;
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		ret = 0;
+		unlock_page(page);
+		page_cache_release(page);
+		goto out_up_read;
+	}
+
+	if (!PageUptodate(page)) {
+		ret = ext4_read_inline_page(inode, page);
+		if (ret < 0)
+			goto out_up_read;
+	}
+
+	ret = 1;
+	handle = NULL;
+out_up_read:
+	up_read(&EXT4_I(inode)->xattr_sem);
+out:
+	if (handle)
+		ext4_journal_stop(handle);
+	brelse(iloc.bh);
+	return ret;
+convert:
+	return ext4_convert_inline_data_to_extent(mapping,
+						  inode, flags);
+}
+
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+			       unsigned copied, struct page *page)
+{
+	int ret;
+	void *kaddr;
+	struct ext4_iloc iloc;
+
+	if (unlikely(copied < len)) {
+		if (!PageUptodate(page)) {
+			copied = 0;
+			goto out;
+		}
+	}
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret) {
+		ext4_std_error(inode->i_sb, ret);
+		copied = 0;
+		goto out;
+	}
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	BUG_ON(!ext4_has_inline_data(inode));
+
+	kaddr = kmap_atomic(page);
+	ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+	kunmap_atomic(kaddr);
+	SetPageUptodate(page);
+	/* clear page dirty so that writepages wouldn't work for us. */
+	ClearPageDirty(page);
+
+	up_write(&EXT4_I(inode)->xattr_sem);
+	brelse(iloc.bh);
+out:
+	return copied;
+}
+
+struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+				  unsigned len,
+				  struct page *page)
+{
+	int ret;
+	void *kaddr;
+	struct ext4_iloc iloc;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret) {
+		ext4_std_error(inode->i_sb, ret);
+		return NULL;
+	}
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	kaddr = kmap_atomic(page);
+	ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
+	kunmap_atomic(kaddr);
+	up_write(&EXT4_I(inode)->xattr_sem);
+
+	return iloc.bh;
+}
+
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in 2 cases:
+ * 1. The inode is created and the first write exceeds inline size. We can
+ *    clear the inode state safely.
+ * 2. The inode has inline data, then we need to read the data, make it
+ *    update and dirty so that ext4_da_writepages can handle it. We don't
+ *    need to start the journal since the file's metatdata isn't changed now.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+						 struct inode *inode,
+						 unsigned flags,
+						 void **fsdata)
+{
+	int ret = 0, inline_size;
+	struct page *page;
+
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page)
+		return -ENOMEM;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+
+	if (!PageUptodate(page)) {
+		ret = ext4_read_inline_page(inode, page);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = __block_write_begin(page, 0, inline_size,
+				  ext4_da_get_block_prep);
+	if (ret) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		unlock_page(page);
+		page_cache_release(page);
+		ext4_truncate_failed_write(inode);
+		return ret;
+	}
+
+	SetPageDirty(page);
+	SetPageUptodate(page);
+	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	*fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+	up_read(&EXT4_I(inode)->xattr_sem);
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, makes it dirty so that it can be
+ * handle in writepages(the i_disksize update is left to the
+ * normal ext4_da_write_end).
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+				    struct inode *inode,
+				    loff_t pos, unsigned len,
+				    unsigned flags,
+				    struct page **pagep,
+				    void **fsdata)
+{
+	int ret, inline_size;
+	handle_t *handle;
+	struct page *page;
+	struct ext4_iloc iloc;
+	int retries;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	inline_size = ext4_get_max_inline_size(inode);
+
+	ret = -ENOSPC;
+	if (inline_size >= pos + len) {
+		ret = ext4_prepare_inline_data(handle, inode, pos + len);
+		if (ret && ret != -ENOSPC)
+			goto out_journal;
+	}
+
+	/*
+	 * We cannot recurse into the filesystem as the transaction
+	 * is already started.
+	 */
+	flags |= AOP_FLAG_NOFS;
+
+	if (ret == -ENOSPC) {
+		ret = ext4_da_convert_inline_data_to_extent(mapping,
+							    inode,
+							    flags,
+							    fsdata);
+		ext4_journal_stop(handle);
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+		goto out;
+	}
+
+
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out_journal;
+	}
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		ret = 0;
+		goto out_release_page;
+	}
+
+	if (!PageUptodate(page)) {
+		ret = ext4_read_inline_page(inode, page);
+		if (ret < 0)
+			goto out_release_page;
+	}
+
+	up_read(&EXT4_I(inode)->xattr_sem);
+	*pagep = page;
+	brelse(iloc.bh);
+	return 1;
+out_release_page:
+	up_read(&EXT4_I(inode)->xattr_sem);
+	unlock_page(page);
+	page_cache_release(page);
+out_journal:
+	ext4_journal_stop(handle);
+out:
+	brelse(iloc.bh);
+	return ret;
+}
+
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+				  unsigned len, unsigned copied,
+				  struct page *page)
+{
+	int i_size_changed = 0;
+
+	copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		i_size_changed = 1;
+	}
+	unlock_page(page);
+	page_cache_release(page);
+
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
+
+	return copied;
+}
+
+#ifdef INLINE_DIR_DEBUG
+void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
+			  void *inline_start, int inline_size)
+{
+	int offset;
+	unsigned short de_len;
+	struct ext4_dir_entry_2 *de = inline_start;
+	void *dlimit = inline_start + inline_size;
+
+	trace_printk("inode %lu\n", dir->i_ino);
+	offset = 0;
+	while ((void *)de < dlimit) {
+		de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
+		trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
+			     offset, de_len, de->name_len, de->name,
+			     de->name_len, le32_to_cpu(de->inode));
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+					 inline_start, inline_size, offset))
+			BUG();
+
+		offset += de_len;
+		de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+	}
+}
+#else
+#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
+#endif
+
+/*
+ * Add a new entry into a inline dir.
+ * It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+ */
+static int ext4_add_dirent_to_inline(handle_t *handle,
+				     struct dentry *dentry,
+				     struct inode *inode,
+				     struct ext4_iloc *iloc,
+				     void *inline_start, int inline_size)
+{
+	struct inode	*dir = d_inode(dentry->d_parent);
+	const char	*name = dentry->d_name.name;
+	int		namelen = dentry->d_name.len;
+	int		err;
+	struct ext4_dir_entry_2 *de;
+
+	err = ext4_find_dest_de(dir, inode, iloc->bh,
+				inline_start, inline_size,
+				name, namelen, &de);
+	if (err)
+		return err;
+
+	BUFFER_TRACE(iloc->bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, iloc->bh);
+	if (err)
+		return err;
+	ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name,
+			   name, namelen);
+
+	ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 *
+	 * XXX similarly, too many callers depend on
+	 * ext4_new_inode() setting the times, but error
+	 * recovery deletes the inode, so the worst that can
+	 * happen is that the times are slightly out of date
+	 * and/or different from the directory change time.
+	 */
+	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+	ext4_update_dx_flag(dir);
+	dir->i_version++;
+	ext4_mark_inode_dirty(handle, dir);
+	return 1;
+}
+
+static void *ext4_get_inline_xattr_pos(struct inode *inode,
+				       struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+
+	BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+	header = IHDR(inode, ext4_raw_inode(iloc));
+	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+					    EXT4_I(inode)->i_inline_off);
+
+	return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+}
+
+/* Set the final de to cover the whole block. */
+static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+{
+	struct ext4_dir_entry_2 *de, *prev_de;
+	void *limit;
+	int de_len;
+
+	de = (struct ext4_dir_entry_2 *)de_buf;
+	if (old_size) {
+		limit = de_buf + old_size;
+		do {
+			prev_de = de;
+			de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
+			de_buf += de_len;
+			de = (struct ext4_dir_entry_2 *)de_buf;
+		} while (de_buf < limit);
+
+		prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
+							old_size, new_size);
+	} else {
+		/* this is just created, so create an empty entry. */
+		de->inode = 0;
+		de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
+	}
+}
+
+static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+				  struct ext4_iloc *iloc)
+{
+	int ret;
+	int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+	int new_size = get_max_inline_xattr_value_size(dir, iloc);
+
+	if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+		return -ENOSPC;
+
+	ret = ext4_update_inline_data(handle, dir,
+				      new_size + EXT4_MIN_INLINE_DATA_SIZE);
+	if (ret)
+		return ret;
+
+	ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
+			     EXT4_I(dir)->i_inline_size -
+						EXT4_MIN_INLINE_DATA_SIZE);
+	dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
+	return 0;
+}
+
+static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
+				     struct ext4_iloc *iloc,
+				     void *buf, int inline_size)
+{
+	ext4_create_inline_data(handle, inode, inline_size);
+	ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
+	ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+}
+
+static int ext4_finish_convert_inline_dir(handle_t *handle,
+					  struct inode *inode,
+					  struct buffer_head *dir_block,
+					  void *buf,
+					  int inline_size)
+{
+	int err, csum_size = 0, header_size = 0;
+	struct ext4_dir_entry_2 *de;
+	struct ext4_dir_entry_tail *t;
+	void *target = dir_block->b_data;
+
+	/*
+	 * First create "." and ".." and then copy the dir information
+	 * back to the block.
+	 */
+	de = (struct ext4_dir_entry_2 *)target;
+	de = ext4_init_dot_dotdot(inode, de,
+		inode->i_sb->s_blocksize, csum_size,
+		le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
+	header_size = (void *)de - target;
+
+	memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
+		inline_size - EXT4_INLINE_DOTDOT_SIZE);
+
+	if (ext4_has_metadata_csum(inode->i_sb))
+		csum_size = sizeof(struct ext4_dir_entry_tail);
+
+	inode->i_size = inode->i_sb->s_blocksize;
+	i_size_write(inode, inode->i_sb->s_blocksize);
+	EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+	ext4_update_final_de(dir_block->b_data,
+			inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
+			inode->i_sb->s_blocksize - csum_size);
+
+	if (csum_size) {
+		t = EXT4_DIRENT_TAIL(dir_block->b_data,
+				     inode->i_sb->s_blocksize);
+		initialize_dirent_tail(t, inode->i_sb->s_blocksize);
+	}
+	set_buffer_uptodate(dir_block);
+	err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+	if (err)
+		goto out;
+	set_buffer_verified(dir_block);
+out:
+	return err;
+}
+
+static int ext4_convert_inline_data_nolock(handle_t *handle,
+					   struct inode *inode,
+					   struct ext4_iloc *iloc)
+{
+	int error;
+	void *buf = NULL;
+	struct buffer_head *data_bh = NULL;
+	struct ext4_map_blocks map;
+	int inline_size;
+
+	inline_size = ext4_get_inline_size(inode);
+	buf = kmalloc(inline_size, GFP_NOFS);
+	if (!buf) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	error = ext4_read_inline_data(inode, buf, inline_size, iloc);
+	if (error < 0)
+		goto out;
+
+	/*
+	 * Make sure the inline directory entries pass checks before we try to
+	 * convert them, so that we avoid touching stuff that needs fsck.
+	 */
+	if (S_ISDIR(inode->i_mode)) {
+		error = ext4_check_all_de(inode, iloc->bh,
+					buf + EXT4_INLINE_DOTDOT_SIZE,
+					inline_size - EXT4_INLINE_DOTDOT_SIZE);
+		if (error)
+			goto out;
+	}
+
+	error = ext4_destroy_inline_data_nolock(handle, inode);
+	if (error)
+		goto out;
+
+	map.m_lblk = 0;
+	map.m_len = 1;
+	map.m_flags = 0;
+	error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+	if (error < 0)
+		goto out_restore;
+	if (!(map.m_flags & EXT4_MAP_MAPPED)) {
+		error = -EIO;
+		goto out_restore;
+	}
+
+	data_bh = sb_getblk(inode->i_sb, map.m_pblk);
+	if (!data_bh) {
+		error = -ENOMEM;
+		goto out_restore;
+	}
+
+	lock_buffer(data_bh);
+	error = ext4_journal_get_create_access(handle, data_bh);
+	if (error) {
+		unlock_buffer(data_bh);
+		error = -EIO;
+		goto out_restore;
+	}
+	memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
+
+	if (!S_ISDIR(inode->i_mode)) {
+		memcpy(data_bh->b_data, buf, inline_size);
+		set_buffer_uptodate(data_bh);
+		error = ext4_handle_dirty_metadata(handle,
+						   inode, data_bh);
+	} else {
+		error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
+						       buf, inline_size);
+	}
+
+	unlock_buffer(data_bh);
+out_restore:
+	if (error)
+		ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
+
+out:
+	brelse(data_bh);
+	kfree(buf);
+	return error;
+}
+
+/*
+ * Try to add the new entry to the inline data.
+ * If succeeds, return 0. If not, extended the inline dir and copied data to
+ * the new created block.
+ */
+int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+			      struct inode *inode)
+{
+	int ret, inline_size;
+	void *inline_start;
+	struct ext4_iloc iloc;
+	struct inode *dir = d_inode(dentry->d_parent);
+
+	ret = ext4_get_inode_loc(dir, &iloc);
+	if (ret)
+		return ret;
+
+	down_write(&EXT4_I(dir)->xattr_sem);
+	if (!ext4_has_inline_data(dir))
+		goto out;
+
+	inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+						 EXT4_INLINE_DOTDOT_SIZE;
+	inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+
+	ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+					inline_start, inline_size);
+	if (ret != -ENOSPC)
+		goto out;
+
+	/* check whether it can be inserted to inline xattr space. */
+	inline_size = EXT4_I(dir)->i_inline_size -
+			EXT4_MIN_INLINE_DATA_SIZE;
+	if (!inline_size) {
+		/* Try to use the xattr space.*/
+		ret = ext4_update_inline_dir(handle, dir, &iloc);
+		if (ret && ret != -ENOSPC)
+			goto out;
+
+		inline_size = EXT4_I(dir)->i_inline_size -
+				EXT4_MIN_INLINE_DATA_SIZE;
+	}
+
+	if (inline_size) {
+		inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+
+		ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+						inline_start, inline_size);
+
+		if (ret != -ENOSPC)
+			goto out;
+	}
+
+	/*
+	 * The inline space is filled up, so create a new block for it.
+	 * As the extent tree will be created, we have to save the inline
+	 * dir first.
+	 */
+	ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
+
+out:
+	ext4_mark_inode_dirty(handle, dir);
+	up_write(&EXT4_I(dir)->xattr_sem);
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir.  It returns the number directory entries loaded
+ * into the tree.  If there is an error it is returned in err.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+			    struct inode *dir, ext4_lblk_t block,
+			    struct dx_hash_info *hinfo,
+			    __u32 start_hash, __u32 start_minor_hash,
+			    int *has_inline_data)
+{
+	int err = 0, count = 0;
+	unsigned int parent_ino;
+	int pos;
+	struct ext4_dir_entry_2 *de;
+	struct inode *inode = file_inode(dir_file);
+	int ret, inline_size = 0;
+	struct ext4_iloc iloc;
+	void *dir_buf = NULL;
+	struct ext4_dir_entry_2 fake;
+	struct ext4_str tmp_str;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+	dir_buf = kmalloc(inline_size, GFP_NOFS);
+	if (!dir_buf) {
+		ret = -ENOMEM;
+		up_read(&EXT4_I(inode)->xattr_sem);
+		goto out;
+	}
+
+	ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	if (ret < 0)
+		goto out;
+
+	pos = 0;
+	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+	while (pos < inline_size) {
+		/*
+		 * As inlined dir doesn't store any information about '.' and
+		 * only the inode number of '..' is stored, we have to handle
+		 * them differently.
+		 */
+		if (pos == 0) {
+			fake.inode = cpu_to_le32(inode->i_ino);
+			fake.name_len = 1;
+			strcpy(fake.name, ".");
+			fake.rec_len = ext4_rec_len_to_disk(
+						EXT4_DIR_REC_LEN(fake.name_len),
+						inline_size);
+			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+			de = &fake;
+			pos = EXT4_INLINE_DOTDOT_OFFSET;
+		} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+			fake.inode = cpu_to_le32(parent_ino);
+			fake.name_len = 2;
+			strcpy(fake.name, "..");
+			fake.rec_len = ext4_rec_len_to_disk(
+						EXT4_DIR_REC_LEN(fake.name_len),
+						inline_size);
+			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+			de = &fake;
+			pos = EXT4_INLINE_DOTDOT_SIZE;
+		} else {
+			de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+			pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+			if (ext4_check_dir_entry(inode, dir_file, de,
+					 iloc.bh, dir_buf,
+					 inline_size, pos)) {
+				ret = count;
+				goto out;
+			}
+		}
+
+		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		tmp_str.name = de->name;
+		tmp_str.len = de->name_len;
+		err = ext4_htree_store_dirent(dir_file, hinfo->hash,
+					      hinfo->minor_hash, de, &tmp_str);
+		if (err) {
+			count = err;
+			goto out;
+		}
+		count++;
+	}
+	ret = count;
+out:
+	kfree(dir_buf);
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * So this function is called when the volume is mkfsed with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a blocked based,
+ * we just pretend that we are a normal dir and return the
+ * offset as if '.' and '..' really take place.
+ *
+ */
+int ext4_read_inline_dir(struct file *file,
+			 struct dir_context *ctx,
+			 int *has_inline_data)
+{
+	unsigned int offset, parent_ino;
+	int i;
+	struct ext4_dir_entry_2 *de;
+	struct super_block *sb;
+	struct inode *inode = file_inode(file);
+	int ret, inline_size = 0;
+	struct ext4_iloc iloc;
+	void *dir_buf = NULL;
+	int dotdot_offset, dotdot_size, extra_offset, extra_size;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+	dir_buf = kmalloc(inline_size, GFP_NOFS);
+	if (!dir_buf) {
+		ret = -ENOMEM;
+		up_read(&EXT4_I(inode)->xattr_sem);
+		goto out;
+	}
+
+	ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+	sb = inode->i_sb;
+	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+	offset = ctx->pos;
+
+	/*
+	 * dotdot_offset and dotdot_size is the real offset and
+	 * size for ".." and "." if the dir is block based while
+	 * the real size for them are only EXT4_INLINE_DOTDOT_SIZE.
+	 * So we will use extra_offset and extra_size to indicate them
+	 * during the inline dir iteration.
+	 */
+	dotdot_offset = EXT4_DIR_REC_LEN(1);
+	dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+	extra_size = extra_offset + inline_size;
+
+	/*
+	 * If the version has changed since the last call to
+	 * readdir(2), then we might be pointing to an invalid
+	 * dirent right now.  Scan from the start of the inline
+	 * dir to make sure.
+	 */
+	if (file->f_version != inode->i_version) {
+		for (i = 0; i < extra_size && i < offset;) {
+			/*
+			 * "." is with offset 0 and
+			 * ".." is dotdot_offset.
+			 */
+			if (!i) {
+				i = dotdot_offset;
+				continue;
+			} else if (i == dotdot_offset) {
+				i = dotdot_size;
+				continue;
+			}
+			/* for other entry, the real offset in
+			 * the buf has to be tuned accordingly.
+			 */
+			de = (struct ext4_dir_entry_2 *)
+				(dir_buf + i - extra_offset);
+			/* It's too expensive to do a full
+			 * dirent test each time round this
+			 * loop, but we do have to test at
+			 * least that it is non-zero.  A
+			 * failure will be detected in the
+			 * dirent test below. */
+			if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+				< EXT4_DIR_REC_LEN(1))
+				break;
+			i += ext4_rec_len_from_disk(de->rec_len,
+						    extra_size);
+		}
+		offset = i;
+		ctx->pos = offset;
+		file->f_version = inode->i_version;
+	}
+
+	while (ctx->pos < extra_size) {
+		if (ctx->pos == 0) {
+			if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_offset;
+			continue;
+		}
+
+		if (ctx->pos == dotdot_offset) {
+			if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_size;
+			continue;
+		}
+
+		de = (struct ext4_dir_entry_2 *)
+			(dir_buf + ctx->pos - extra_offset);
+		if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+					 extra_size, ctx->pos))
+			goto out;
+		if (le32_to_cpu(de->inode)) {
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le32_to_cpu(de->inode),
+				      get_dtype(sb, de->file_type)))
+				goto out;
+		}
+		ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
+	}
+out:
+	kfree(dir_buf);
+	brelse(iloc.bh);
+	return ret;
+}
+
+struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+					struct ext4_dir_entry_2 **parent_de,
+					int *retval)
+{
+	struct ext4_iloc iloc;
+
+	*retval = ext4_get_inode_loc(inode, &iloc);
+	if (*retval)
+		return NULL;
+
+	*parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+
+	return iloc.bh;
+}
+
+/*
+ * Try to create the inline data for the new dir.
+ * If it succeeds, return 0, otherwise return the error.
+ * In case of ENOSPC, the caller should create the normal disk layout dir.
+ */
+int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
+			       struct inode *inode)
+{
+	int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+	struct ext4_iloc iloc;
+	struct ext4_dir_entry_2 *de;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	ret = ext4_prepare_inline_data(handle, inode, inline_size);
+	if (ret)
+		goto out;
+
+	/*
+	 * For inline dir, we only save the inode information for the ".."
+	 * and create a fake dentry to cover the left space.
+	 */
+	de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+	de->inode = cpu_to_le32(parent->i_ino);
+	de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
+	de->inode = 0;
+	de->rec_len = ext4_rec_len_to_disk(
+				inline_size - EXT4_INLINE_DOTDOT_SIZE,
+				inline_size);
+	set_nlink(inode, 2);
+	inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
+out:
+	brelse(iloc.bh);
+	return ret;
+}
+
+struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+					const struct qstr *d_name,
+					struct ext4_dir_entry_2 **res_dir,
+					int *has_inline_data)
+{
+	int ret;
+	struct ext4_iloc iloc;
+	void *inline_start;
+	int inline_size;
+
+	if (ext4_get_inode_loc(dir, &iloc))
+		return NULL;
+
+	down_read(&EXT4_I(dir)->xattr_sem);
+	if (!ext4_has_inline_data(dir)) {
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+						EXT4_INLINE_DOTDOT_SIZE;
+	inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+	ret = search_dir(iloc.bh, inline_start, inline_size,
+			 dir, d_name, 0, res_dir);
+	if (ret == 1)
+		goto out_find;
+	if (ret < 0)
+		goto out;
+
+	if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
+		goto out;
+
+	inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+	inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
+
+	ret = search_dir(iloc.bh, inline_start, inline_size,
+			 dir, d_name, 0, res_dir);
+	if (ret == 1)
+		goto out_find;
+
+out:
+	brelse(iloc.bh);
+	iloc.bh = NULL;
+out_find:
+	up_read(&EXT4_I(dir)->xattr_sem);
+	return iloc.bh;
+}
+
+int ext4_delete_inline_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh,
+			     int *has_inline_data)
+{
+	int err, inline_size;
+	struct ext4_iloc iloc;
+	void *inline_start;
+
+	err = ext4_get_inode_loc(dir, &iloc);
+	if (err)
+		return err;
+
+	down_write(&EXT4_I(dir)->xattr_sem);
+	if (!ext4_has_inline_data(dir)) {
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
+		EXT4_MIN_INLINE_DATA_SIZE) {
+		inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+					EXT4_INLINE_DOTDOT_SIZE;
+		inline_size = EXT4_MIN_INLINE_DATA_SIZE -
+				EXT4_INLINE_DOTDOT_SIZE;
+	} else {
+		inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+		inline_size = ext4_get_inline_size(dir) -
+				EXT4_MIN_INLINE_DATA_SIZE;
+	}
+
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err)
+		goto out;
+
+	err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+					inline_start, inline_size, 0);
+	if (err)
+		goto out;
+
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	err = ext4_mark_inode_dirty(handle, dir);
+	if (unlikely(err))
+		goto out;
+
+	ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
+out:
+	up_write(&EXT4_I(dir)->xattr_sem);
+	brelse(iloc.bh);
+	if (err != -ENOENT)
+		ext4_std_error(dir->i_sb, err);
+	return err;
+}
+
+/*
+ * Get the inline dentry at offset.
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_get_inline_entry(struct inode *inode,
+		      struct ext4_iloc *iloc,
+		      unsigned int offset,
+		      void **inline_start,
+		      int *inline_size)
+{
+	void *inline_pos;
+
+	BUG_ON(offset > ext4_get_inline_size(inode));
+
+	if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
+		inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
+		*inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+	} else {
+		inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
+		offset -= EXT4_MIN_INLINE_DATA_SIZE;
+		*inline_size = ext4_get_inline_size(inode) -
+				EXT4_MIN_INLINE_DATA_SIZE;
+	}
+
+	if (inline_start)
+		*inline_start = inline_pos;
+	return (struct ext4_dir_entry_2 *)(inline_pos + offset);
+}
+
+int empty_inline_dir(struct inode *dir, int *has_inline_data)
+{
+	int err, inline_size;
+	struct ext4_iloc iloc;
+	void *inline_pos;
+	unsigned int offset;
+	struct ext4_dir_entry_2 *de;
+	int ret = 1;
+
+	err = ext4_get_inode_loc(dir, &iloc);
+	if (err) {
+		EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
+				 err, dir->i_ino);
+		return 1;
+	}
+
+	down_read(&EXT4_I(dir)->xattr_sem);
+	if (!ext4_has_inline_data(dir)) {
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+	if (!le32_to_cpu(de->inode)) {
+		ext4_warning(dir->i_sb,
+			     "bad inline directory (dir #%lu) - no `..'",
+			     dir->i_ino);
+		ret = 1;
+		goto out;
+	}
+
+	offset = EXT4_INLINE_DOTDOT_SIZE;
+	while (offset < dir->i_size) {
+		de = ext4_get_inline_entry(dir, &iloc, offset,
+					   &inline_pos, &inline_size);
+		if (ext4_check_dir_entry(dir, NULL, de,
+					 iloc.bh, inline_pos,
+					 inline_size, offset)) {
+			ext4_warning(dir->i_sb,
+				     "bad inline directory (dir #%lu) - "
+				     "inode %u, rec_len %u, name_len %d"
+				     "inline size %d\n",
+				     dir->i_ino, le32_to_cpu(de->inode),
+				     le16_to_cpu(de->rec_len), de->name_len,
+				     inline_size);
+			ret = 1;
+			goto out;
+		}
+		if (le32_to_cpu(de->inode)) {
+			ret = 0;
+			goto out;
+		}
+		offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
+	}
+
+out:
+	up_read(&EXT4_I(dir)->xattr_sem);
+	brelse(iloc.bh);
+	return ret;
+}
+
+int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
+{
+	int ret;
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	ret = ext4_destroy_inline_data_nolock(handle, inode);
+	up_write(&EXT4_I(inode)->xattr_sem);
+
+	return ret;
+}
+
+int ext4_inline_data_fiemap(struct inode *inode,
+			    struct fiemap_extent_info *fieinfo,
+			    int *has_inline, __u64 start, __u64 len)
+{
+	__u64 physical = 0;
+	__u64 inline_len;
+	__u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+		FIEMAP_EXTENT_LAST;
+	int error = 0;
+	struct ext4_iloc iloc;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		*has_inline = 0;
+		goto out;
+	}
+	inline_len = min_t(size_t, ext4_get_inline_size(inode),
+			   i_size_read(inode));
+	if (start >= inline_len)
+		goto out;
+	if (start + len < inline_len)
+		inline_len = start + len;
+	inline_len -= start;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		goto out;
+
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
+	physical += offsetof(struct ext4_inode, i_block);
+
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, start, physical,
+						inline_len, flags);
+	brelse(iloc.bh);
+out:
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return (error < 0 ? error : 0);
+}
+
+/*
+ * Called during xattr set, and if we can sparse space 'needed',
+ * just create the extent tree evict the data to the outer block.
+ *
+ * We use jbd2 instead of page cache to move data to the 1st block
+ * so that the whole transaction can be committed as a whole and
+ * the data isn't lost because of the delayed page cache write.
+ */
+int ext4_try_to_evict_inline_data(handle_t *handle,
+				  struct inode *inode,
+				  int needed)
+{
+	int error;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+
+	raw_inode = ext4_raw_inode(&iloc);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    EXT4_I(inode)->i_inline_off);
+	if (EXT4_XATTR_LEN(entry->e_name_len) +
+	    EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+out:
+	brelse(iloc.bh);
+	return error;
+}
+
+void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+{
+	handle_t *handle;
+	int inline_size, value_len, needed_blocks;
+	size_t i_size;
+	void *value = NULL;
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+
+
+	needed_blocks = ext4_writepage_trans_blocks(inode);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
+	if (IS_ERR(handle))
+		return;
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		*has_inline = 0;
+		ext4_journal_stop(handle);
+		return;
+	}
+
+	if (ext4_orphan_add(handle, inode))
+		goto out;
+
+	if (ext4_get_inode_loc(inode, &is.iloc))
+		goto out;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	i_size = inode->i_size;
+	inline_size = ext4_get_inline_size(inode);
+	EXT4_I(inode)->i_disksize = i_size;
+
+	if (i_size < inline_size) {
+		/* Clear the content in the xattr space. */
+		if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
+			if (ext4_xattr_ibody_find(inode, &i, &is))
+				goto out_error;
+
+			BUG_ON(is.s.not_found);
+
+			value_len = le32_to_cpu(is.s.here->e_value_size);
+			value = kmalloc(value_len, GFP_NOFS);
+			if (!value)
+				goto out_error;
+
+			if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
+						value, value_len))
+				goto out_error;
+
+			i.value = value;
+			i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
+					i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
+			if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+				goto out_error;
+		}
+
+		/* Clear the content within i_blocks. */
+		if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+			void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
+			memset(p + i_size, 0,
+			       EXT4_MIN_INLINE_DATA_SIZE - i_size);
+		}
+
+		EXT4_I(inode)->i_inline_size = i_size <
+					EXT4_MIN_INLINE_DATA_SIZE ?
+					EXT4_MIN_INLINE_DATA_SIZE : i_size;
+	}
+
+out_error:
+	up_write(&EXT4_I(inode)->i_data_sem);
+out:
+	brelse(is.iloc.bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	kfree(value);
+	if (inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+	ext4_journal_stop(handle);
+	return;
+}
+
+int ext4_convert_inline_data(struct inode *inode)
+{
+	int error, needed_blocks;
+	handle_t *handle;
+	struct ext4_iloc iloc;
+
+	if (!ext4_has_inline_data(inode)) {
+		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+		return 0;
+	}
+
+	needed_blocks = ext4_writepage_trans_blocks(inode);
+
+	iloc.bh = NULL;
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+	if (IS_ERR(handle)) {
+		error = PTR_ERR(handle);
+		goto out_free;
+	}
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_write(&EXT4_I(inode)->xattr_sem);
+		goto out;
+	}
+
+	error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+	up_write(&EXT4_I(inode)->xattr_sem);
+out:
+	ext4_journal_stop(handle);
+out_free:
+	brelse(iloc.bh);
+	return error;
+}
diff --git a/kernel/fs/ext4/inode.c b/kernel/fs/ext4/inode.c
new file mode 100644
index 000000000..0554b0b59
--- /dev/null
+++ b/kernel/fs/ext4/inode.c
@@ -0,0 +1,5279 @@
+/*
+ *  linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ *	(jj@sunsite.ms.mff.cuni.cz)
+ *
+ *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "truncate.h"
+
+#include <trace/events/ext4.h>
+
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
+static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
+			      struct ext4_inode_info *ei)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	__u16 csum_lo;
+	__u16 csum_hi = 0;
+	__u32 csum;
+
+	csum_lo = le16_to_cpu(raw->i_checksum_lo);
+	raw->i_checksum_lo = 0;
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+		csum_hi = le16_to_cpu(raw->i_checksum_hi);
+		raw->i_checksum_hi = 0;
+	}
+
+	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
+			   EXT4_INODE_SIZE(inode->i_sb));
+
+	raw->i_checksum_lo = cpu_to_le16(csum_lo);
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+		raw->i_checksum_hi = cpu_to_le16(csum_hi);
+
+	return csum;
+}
+
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
+				  struct ext4_inode_info *ei)
+{
+	__u32 provided, calculated;
+
+	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+	    cpu_to_le32(EXT4_OS_LINUX) ||
+	    !ext4_has_metadata_csum(inode->i_sb))
+		return 1;
+
+	provided = le16_to_cpu(raw->i_checksum_lo);
+	calculated = ext4_inode_csum(inode, raw, ei);
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+		provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
+	else
+		calculated &= 0xFFFF;
+
+	return provided == calculated;
+}
+
+static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+				struct ext4_inode_info *ei)
+{
+	__u32 csum;
+
+	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+	    cpu_to_le32(EXT4_OS_LINUX) ||
+	    !ext4_has_metadata_csum(inode->i_sb))
+		return;
+
+	csum = ext4_inode_csum(inode, raw, ei);
+	raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+		raw->i_checksum_hi = cpu_to_le16(csum >> 16);
+}
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+					      loff_t new_size)
+{
+	trace_ext4_begin_ordered_truncate(inode, new_size);
+	/*
+	 * If jinode is zero, then we never opened the file for
+	 * writing, so there's no need to call
+	 * jbd2_journal_begin_ordered_truncate() since there's no
+	 * outstanding writes we need to flush.
+	 */
+	if (!EXT4_I(inode)->jinode)
+		return 0;
+	return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+						   EXT4_I(inode)->jinode,
+						   new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+int ext4_inode_is_fast_symlink(struct inode *inode)
+{
+        int ea_blocks = EXT4_I(inode)->i_file_acl ?
+		EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+	if (ext4_has_inline_data(inode))
+		return 0;
+
+	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+}
+
+/*
+ * Restart the transaction associated with *handle.  This does a commit,
+ * so before we call here everything must be consistently dirtied against
+ * this transaction.
+ */
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+				 int nblocks)
+{
+	int ret;
+
+	/*
+	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
+	 * moment, get_block can be called only for blocks inside i_size since
+	 * page cache has been already dropped and writes are blocked by
+	 * i_mutex. So we can safely drop the i_data_sem here.
+	 */
+	BUG_ON(EXT4_JOURNAL(inode) == NULL);
+	jbd_debug(2, "restarting handle %p\n", handle);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	ret = ext4_journal_restart(handle, nblocks);
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	return ret;
+}
+
+/*
+ * Called at the last iput() if i_nlink is zero.
+ */
+void ext4_evict_inode(struct inode *inode)
+{
+	handle_t *handle;
+	int err;
+
+	trace_ext4_evict_inode(inode);
+
+	if (inode->i_nlink) {
+		/*
+		 * When journalling data dirty buffers are tracked only in the
+		 * journal. So although mm thinks everything is clean and
+		 * ready for reaping the inode might still have some pages to
+		 * write in the running transaction or waiting to be
+		 * checkpointed. Thus calling jbd2_journal_invalidatepage()
+		 * (via truncate_inode_pages()) to discard these buffers can
+		 * cause data loss. Also even if we did not discard these
+		 * buffers, we would have no way to find them after the inode
+		 * is reaped and thus user could see stale data if he tries to
+		 * read them before the transaction is checkpointed. So be
+		 * careful and force everything to disk here... We use
+		 * ei->i_datasync_tid to store the newest transaction
+		 * containing inode's data.
+		 *
+		 * Note that directories do not have this problem because they
+		 * don't use page cache.
+		 */
+		if (ext4_should_journal_data(inode) &&
+		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+		    inode->i_ino != EXT4_JOURNAL_INO) {
+			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+
+			jbd2_complete_transaction(journal, commit_tid);
+			filemap_write_and_wait(&inode->i_data);
+		}
+		truncate_inode_pages_final(&inode->i_data);
+
+		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
+		goto no_delete;
+	}
+
+	if (is_bad_inode(inode))
+		goto no_delete;
+	dquot_initialize(inode);
+
+	if (ext4_should_order_data(inode))
+		ext4_begin_ordered_truncate(inode, 0);
+	truncate_inode_pages_final(&inode->i_data);
+
+	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
+
+	/*
+	 * Protect us against freezing - iput() caller didn't have to have any
+	 * protection against it
+	 */
+	sb_start_intwrite(inode->i_sb);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+				    ext4_blocks_for_truncate(inode)+3);
+	if (IS_ERR(handle)) {
+		ext4_std_error(inode->i_sb, PTR_ERR(handle));
+		/*
+		 * If we're going to skip the normal cleanup, we still need to
+		 * make sure that the in-core orphan linked list is properly
+		 * cleaned up.
+		 */
+		ext4_orphan_del(NULL, inode);
+		sb_end_intwrite(inode->i_sb);
+		goto no_delete;
+	}
+
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+	inode->i_size = 0;
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err) {
+		ext4_warning(inode->i_sb,
+			     "couldn't mark inode dirty (err %d)", err);
+		goto stop_handle;
+	}
+	if (inode->i_blocks)
+		ext4_truncate(inode);
+
+	/*
+	 * ext4_ext_truncate() doesn't reserve any slop when it
+	 * restarts journal transactions; therefore there may not be
+	 * enough credits left in the handle to remove the inode from
+	 * the orphan list and set the dtime field.
+	 */
+	if (!ext4_handle_has_enough_credits(handle, 3)) {
+		err = ext4_journal_extend(handle, 3);
+		if (err > 0)
+			err = ext4_journal_restart(handle, 3);
+		if (err != 0) {
+			ext4_warning(inode->i_sb,
+				     "couldn't extend journal (err %d)", err);
+		stop_handle:
+			ext4_journal_stop(handle);
+			ext4_orphan_del(NULL, inode);
+			sb_end_intwrite(inode->i_sb);
+			goto no_delete;
+		}
+	}
+
+	/*
+	 * Kill off the orphan record which ext4_truncate created.
+	 * AKPM: I think this can be inside the above `if'.
+	 * Note that ext4_orphan_del() has to be able to cope with the
+	 * deletion of a non-existent orphan - this is because we don't
+	 * know if ext4_truncate() actually created an orphan record.
+	 * (Well, we could do this if we need to, but heck - it works)
+	 */
+	ext4_orphan_del(handle, inode);
+	EXT4_I(inode)->i_dtime	= get_seconds();
+
+	/*
+	 * One subtle ordering requirement: if anything has gone wrong
+	 * (transaction abort, IO errors, whatever), then we can still
+	 * do these next steps (the fs will already have been marked as
+	 * having errors), but we can't free the inode if the mark_dirty
+	 * fails.
+	 */
+	if (ext4_mark_inode_dirty(handle, inode))
+		/* If that failed, just do the required in-core inode clear. */
+		ext4_clear_inode(inode);
+	else
+		ext4_free_inode(handle, inode);
+	ext4_journal_stop(handle);
+	sb_end_intwrite(inode->i_sb);
+	return;
+no_delete:
+	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
+}
+
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
+{
+	return &EXT4_I(inode)->i_reserved_quota;
+}
+#endif
+
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
+void ext4_da_update_reserve_space(struct inode *inode,
+					int used, int quota_claim)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	spin_lock(&ei->i_block_reservation_lock);
+	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
+	if (unlikely(used > ei->i_reserved_data_blocks)) {
+		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
+			 "with only %d reserved data blocks",
+			 __func__, inode->i_ino, used,
+			 ei->i_reserved_data_blocks);
+		WARN_ON(1);
+		used = ei->i_reserved_data_blocks;
+	}
+
+	/* Update per-inode reservations */
+	ei->i_reserved_data_blocks -= used;
+	percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
+
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	/* Update quota subsystem for data blocks */
+	if (quota_claim)
+		dquot_claim_block(inode, EXT4_C2B(sbi, used));
+	else {
+		/*
+		 * We did fallocate with an offset that is already delayed
+		 * allocated. So on delayed allocated writeback we should
+		 * not re-claim the quota for fallocated blocks.
+		 */
+		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
+	}
+
+	/*
+	 * If we have done all the pending block allocations and if
+	 * there aren't any writers on the inode, we can discard the
+	 * inode's preallocations.
+	 */
+	if ((ei->i_reserved_data_blocks == 0) &&
+	    (atomic_read(&inode->i_writecount) == 0))
+		ext4_discard_preallocations(inode);
+}
+
+static int __check_block_validity(struct inode *inode, const char *func,
+				unsigned int line,
+				struct ext4_map_blocks *map)
+{
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+				   map->m_len)) {
+		ext4_error_inode(inode, func, line, map->m_pblk,
+				 "lblock %lu mapped to illegal pblock "
+				 "(length %d)", (unsigned long) map->m_lblk,
+				 map->m_len);
+		return -EIO;
+	}
+	return 0;
+}
+
+#define check_block_validity(inode, map)	\
+	__check_block_validity((inode), __func__, __LINE__, (map))
+
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+				       struct inode *inode,
+				       struct ext4_map_blocks *es_map,
+				       struct ext4_map_blocks *map,
+				       int flags)
+{
+	int retval;
+
+	map->m_flags = 0;
+	/*
+	 * There is a race window that the result is not the same.
+	 * e.g. xfstests #223 when dioread_nolock enables.  The reason
+	 * is that we lookup a block mapping in extent status tree with
+	 * out taking i_data_sem.  So at the time the unwritten extent
+	 * could be converted.
+	 */
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		down_read(&EXT4_I(inode)->i_data_sem);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
+	} else {
+		retval = ext4_ind_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
+	}
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		up_read((&EXT4_I(inode)->i_data_sem));
+
+	/*
+	 * We don't check m_len because extent will be collpased in status
+	 * tree.  So the m_len might not equal.
+	 */
+	if (es_map->m_lblk != map->m_lblk ||
+	    es_map->m_flags != map->m_flags ||
+	    es_map->m_pblk != map->m_pblk) {
+		printk("ES cache assertion failed for inode: %lu "
+		       "es_cached ex [%d/%d/%llu/%x] != "
+		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+		       inode->i_ino, es_map->m_lblk, es_map->m_len,
+		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
+		       map->m_len, map->m_pblk, map->m_flags,
+		       retval, flags);
+	}
+}
+#endif /* ES_AGGRESSIVE_TEST */
+
+/*
+ * The ext4_map_blocks() function tries to look up the requested blocks,
+ * and returns if the blocks are already mapped.
+ *
+ * Otherwise it takes the write lock of the i_data_sem and allocate blocks
+ * and store the allocated blocks in the result buffer head and mark it
+ * mapped.
+ *
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
+ * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
+ * based files
+ *
+ * On success, it returns the number of blocks being mapped or allocated.
+ * if create==0 and the blocks are pre-allocated and unwritten block,
+ * the result buffer head is unmapped. If the create ==1, it will make sure
+ * the buffer head is mapped.
+ *
+ * It returns 0 if plain look up failed (blocks have not been allocated), in
+ * that case, buffer head is unmapped
+ *
+ * It returns the error in case of allocation failure.
+ */
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+		    struct ext4_map_blocks *map, int flags)
+{
+	struct extent_status es;
+	int retval;
+	int ret = 0;
+#ifdef ES_AGGRESSIVE_TEST
+	struct ext4_map_blocks orig_map;
+
+	memcpy(&orig_map, map, sizeof(*map));
+#endif
+
+	map->m_flags = 0;
+	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
+		  (unsigned long) map->m_lblk);
+
+	/*
+	 * ext4_map_blocks returns an int, and m_len is an unsigned int
+	 */
+	if (unlikely(map->m_len > INT_MAX))
+		map->m_len = INT_MAX;
+
+	/* We can handle the block number less than EXT_MAX_BLOCKS */
+	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+		return -EIO;
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+			map->m_pblk = ext4_es_pblock(&es) +
+					map->m_lblk - es.es_lblk;
+			map->m_flags |= ext4_es_is_written(&es) ?
+					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+			retval = es.es_len - (map->m_lblk - es.es_lblk);
+			if (retval > map->m_len)
+				retval = map->m_len;
+			map->m_len = retval;
+		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+			retval = 0;
+		} else {
+			BUG_ON(1);
+		}
+#ifdef ES_AGGRESSIVE_TEST
+		ext4_map_blocks_es_recheck(handle, inode, map,
+					   &orig_map, flags);
+#endif
+		goto found;
+	}
+
+	/*
+	 * Try to see if we can get the block without requesting a new
+	 * file system block.
+	 */
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		down_read(&EXT4_I(inode)->i_data_sem);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
+	} else {
+		retval = ext4_ind_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
+	}
+	if (retval > 0) {
+		unsigned int status;
+
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
+		}
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    !(status & EXTENT_STATUS_WRITTEN) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk,
+					    map->m_len, map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
+	}
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		up_read((&EXT4_I(inode)->i_data_sem));
+
+found:
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+		ret = check_block_validity(inode, map);
+		if (ret != 0)
+			return ret;
+	}
+
+	/* If it is only a block(s) look up */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
+		return retval;
+
+	/*
+	 * Returns if the blocks have already allocated
+	 *
+	 * Note that if blocks have been preallocated
+	 * ext4_ext_get_block() returns the create = 0
+	 * with buffer head unmapped.
+	 */
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+		/*
+		 * If we need to convert extent to unwritten
+		 * we continue and do the actual work in
+		 * ext4_ext_map_blocks()
+		 */
+		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+			return retval;
+
+	/*
+	 * Here we clear m_flags because after allocating an new extent,
+	 * it will be set again.
+	 */
+	map->m_flags &= ~EXT4_MAP_FLAGS;
+
+	/*
+	 * New blocks allocate and/or writing to unwritten extent
+	 * will possibly result in updating i_data, so we take
+	 * the write lock of i_data_sem, and call get_block()
+	 * with create == 1 flag.
+	 */
+	down_write(&EXT4_I(inode)->i_data_sem);
+
+	/*
+	 * We need to check for EXT4 here because migrate
+	 * could have changed the inode type in between
+	 */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, flags);
+	} else {
+		retval = ext4_ind_map_blocks(handle, inode, map, flags);
+
+		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
+			/*
+			 * We allocated new blocks which will result in
+			 * i_data's format changing.  Force the migrate
+			 * to fail by clearing migrate flags
+			 */
+			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+		}
+
+		/*
+		 * Update reserved blocks/metadata blocks after successful
+		 * block allocation which had been deferred till now. We don't
+		 * support fallocate for non extent files. So we can update
+		 * reserve space here.
+		 */
+		if ((retval > 0) &&
+			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+			ext4_da_update_reserve_space(inode, retval, 1);
+	}
+
+	if (retval > 0) {
+		unsigned int status;
+
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
+		}
+
+		/*
+		 * If the extent has been zeroed out, we don't need to update
+		 * extent status tree.
+		 */
+		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
+		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+			if (ext4_es_is_written(&es))
+				goto has_zeroout;
+		}
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    !(status & EXTENT_STATUS_WRITTEN) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
+	}
+
+has_zeroout:
+	up_write((&EXT4_I(inode)->i_data_sem));
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+		ret = check_block_validity(inode, map);
+		if (ret != 0)
+			return ret;
+	}
+	return retval;
+}
+
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+	struct inode *inode = bh->b_assoc_map->host;
+	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+	int err;
+	if (!uptodate)
+		return;
+	WARN_ON(!buffer_unwritten(bh));
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int flags)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct ext4_map_blocks map;
+	int ret = 0, started = 0;
+	int dio_credits;
+
+	if (ext4_has_inline_data(inode))
+		return -ERANGE;
+
+	map.m_lblk = iblock;
+	map.m_len = bh->b_size >> inode->i_blkbits;
+
+	if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
+		/* Direct IO write... */
+		if (map.m_len > DIO_MAX_BLOCKS)
+			map.m_len = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    dio_credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			return ret;
+		}
+		started = 1;
+	}
+
+	ret = ext4_map_blocks(handle, inode, &map, flags);
+	if (ret > 0) {
+		ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
+		map_bh(bh, inode->i_sb, map.m_pblk);
+		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+			bh->b_assoc_map = inode->i_mapping;
+			bh->b_private = (void *)(unsigned long)iblock;
+			bh->b_end_io = ext4_end_io_unwritten;
+		}
+		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+			set_buffer_defer_completion(bh);
+		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
+		ret = 0;
+	}
+	if (started)
+		ext4_journal_stop(handle);
+	return ret;
+}
+
+int ext4_get_block(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh, int create)
+{
+	return _ext4_get_block(inode, iblock, bh,
+			       create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
+
+/*
+ * `handle' can be NULL if create is zero
+ */
+struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
+				ext4_lblk_t block, int create)
+{
+	struct ext4_map_blocks map;
+	struct buffer_head *bh;
+	int err;
+
+	J_ASSERT(handle != NULL || create == 0);
+
+	map.m_lblk = block;
+	map.m_len = 1;
+	err = ext4_map_blocks(handle, inode, &map,
+			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+	if (err == 0)
+		return create ? ERR_PTR(-ENOSPC) : NULL;
+	if (err < 0)
+		return ERR_PTR(err);
+
+	bh = sb_getblk(inode->i_sb, map.m_pblk);
+	if (unlikely(!bh))
+		return ERR_PTR(-ENOMEM);
+	if (map.m_flags & EXT4_MAP_NEW) {
+		J_ASSERT(create != 0);
+		J_ASSERT(handle != NULL);
+
+		/*
+		 * Now that we do not always journal data, we should
+		 * keep in mind whether this should always journal the
+		 * new buffer as metadata.  For now, regular file
+		 * writes use ext4_get_block instead, so it's not a
+		 * problem.
+		 */
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		err = ext4_journal_get_create_access(handle, bh);
+		if (unlikely(err)) {
+			unlock_buffer(bh);
+			goto errout;
+		}
+		if (!buffer_uptodate(bh)) {
+			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+			set_buffer_uptodate(bh);
+		}
+		unlock_buffer(bh);
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (unlikely(err))
+			goto errout;
+	} else
+		BUFFER_TRACE(bh, "not a new buffer");
+	return bh;
+errout:
+	brelse(bh);
+	return ERR_PTR(err);
+}
+
+struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
+			       ext4_lblk_t block, int create)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_getblk(handle, inode, block, create);
+	if (IS_ERR(bh))
+		return bh;
+	if (!bh || buffer_uptodate(bh))
+		return bh;
+	ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return bh;
+	put_bh(bh);
+	return ERR_PTR(-EIO);
+}
+
+int ext4_walk_page_buffers(handle_t *handle,
+			   struct buffer_head *head,
+			   unsigned from,
+			   unsigned to,
+			   int *partial,
+			   int (*fn)(handle_t *handle,
+				     struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (bh = head, block_start = 0;
+	     ret == 0 && (bh != head || !block_start);
+	     block_start = block_end, bh = next) {
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/*
+ * To preserve ordering, it is essential that the hole instantiation and
+ * the data write be encapsulated in a single transaction.  We cannot
+ * close off a transaction and start a new one between the ext4_get_block()
+ * and the commit_write().  So doing the jbd2_journal_start at the start of
+ * prepare_write() is the right place.
+ *
+ * Also, this function can nest inside ext4_writepage().  In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page.  So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
+ *
+ * By accident, ext4 can be reentered when a transaction is open via
+ * quota file writes.  If we were to commit the transaction while thus
+ * reentered, there can be a deadlock - we would be holding a quota
+ * lock, and the commit would never complete if another thread had a
+ * transaction open and was blocking on the quota lock - a ranking
+ * violation.
+ *
+ * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
+ * will _not_ run commit under these circumstances because handle->h_ref
+ * is elevated.  We'll still have enough credits for the tiny quotafile
+ * write.
+ */
+int do_journal_get_write_access(handle_t *handle,
+				struct buffer_head *bh)
+{
+	int dirty = buffer_dirty(bh);
+	int ret;
+
+	if (!buffer_mapped(bh) || buffer_freed(bh))
+		return 0;
+	/*
+	 * __block_write_begin() could have dirtied some buffers. Clean
+	 * the dirty bit as jbd2_journal_get_write_access() could complain
+	 * otherwise about fs integrity issues. Setting of the dirty bit
+	 * by __block_write_begin() isn't a real problem here as we clear
+	 * the bit before releasing a page lock and thus writeback cannot
+	 * ever write the buffer.
+	 */
+	if (dirty)
+		clear_buffer_dirty(bh);
+	BUFFER_TRACE(bh, "get write access");
+	ret = ext4_journal_get_write_access(handle, bh);
+	if (!ret && dirty)
+		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+	return ret;
+}
+
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
+				  get_block_t *get_block)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	struct inode *inode = page->mapping->host;
+	unsigned block_start, block_end;
+	sector_t block;
+	int err = 0;
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	unsigned bbits;
+	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
+	bool decrypt = false;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(from > PAGE_CACHE_SIZE);
+	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > to);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+	head = page_buffers(page);
+	bbits = ilog2(blocksize);
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+
+	for (bh = head, block_start = 0; bh != head || !block_start;
+	    block++, block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (PageUptodate(page)) {
+				if (!buffer_uptodate(bh))
+					set_buffer_uptodate(bh);
+			}
+			continue;
+		}
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+		if (!buffer_mapped(bh)) {
+			WARN_ON(bh->b_size != blocksize);
+			err = get_block(inode, block, bh, 1);
+			if (err)
+				break;
+			if (buffer_new(bh)) {
+				unmap_underlying_metadata(bh->b_bdev,
+							  bh->b_blocknr);
+				if (PageUptodate(page)) {
+					clear_buffer_new(bh);
+					set_buffer_uptodate(bh);
+					mark_buffer_dirty(bh);
+					continue;
+				}
+				if (block_end > to || block_start < from)
+					zero_user_segments(page, to, block_end,
+							   block_start, from);
+				continue;
+			}
+		}
+		if (PageUptodate(page)) {
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+			continue;
+		}
+		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+		    !buffer_unwritten(bh) &&
+		    (block_start < from || block_end > to)) {
+			ll_rw_block(READ, 1, &bh);
+			*wait_bh++ = bh;
+			decrypt = ext4_encrypted_inode(inode) &&
+				S_ISREG(inode->i_mode);
+		}
+	}
+	/*
+	 * If we issued read requests, let them complete.
+	 */
+	while (wait_bh > wait) {
+		wait_on_buffer(*--wait_bh);
+		if (!buffer_uptodate(*wait_bh))
+			err = -EIO;
+	}
+	if (unlikely(err))
+		page_zero_new_buffers(page, from, to);
+	else if (decrypt)
+		err = ext4_decrypt_one(inode, page);
+	return err;
+}
+#endif
+
+static int ext4_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	int ret, needed_blocks;
+	handle_t *handle;
+	int retries = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned from, to;
+
+	trace_ext4_write_begin(inode, pos, len, flags);
+	/*
+	 * Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason
+	 */
+	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+						    flags, pagep);
+		if (ret < 0)
+			return ret;
+		if (ret == 1)
+			return 0;
+	}
+
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back.  So grab it first before we start
+	 * the transaction handle.  This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+	if (IS_ERR(handle)) {
+		page_cache_release(page);
+		return PTR_ERR(handle);
+	}
+
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
+		ext4_journal_stop(handle);
+		goto retry_grab;
+	}
+	/* In case writeback began while the page was unlocked */
+	wait_for_stable_page(page);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	if (ext4_should_dioread_nolock(inode))
+		ret = ext4_block_write_begin(page, pos, len,
+					     ext4_get_block_write);
+	else
+		ret = ext4_block_write_begin(page, pos, len,
+					     ext4_get_block);
+#else
+	if (ext4_should_dioread_nolock(inode))
+		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
+	else
+		ret = __block_write_begin(page, pos, len, ext4_get_block);
+#endif
+	if (!ret && ext4_should_journal_data(inode)) {
+		ret = ext4_walk_page_buffers(handle, page_buffers(page),
+					     from, to, NULL,
+					     do_journal_get_write_access);
+	}
+
+	if (ret) {
+		unlock_page(page);
+		/*
+		 * __block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before
+		 * truncate finishes
+		 */
+		if (pos + len > inode->i_size && ext4_can_truncate(inode))
+			ext4_orphan_add(handle, inode);
+
+		ext4_journal_stop(handle);
+		if (pos + len > inode->i_size) {
+			ext4_truncate_failed_write(inode);
+			/*
+			 * If truncate failed early the inode might
+			 * still be on the orphan list; we need to
+			 * make sure the inode is removed from the
+			 * orphan list in that case.
+			 */
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+		}
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+		page_cache_release(page);
+		return ret;
+	}
+	*pagep = page;
+	return ret;
+}
+
+/* For write_end() in data=journal mode */
+static int write_end_fn(handle_t *handle, struct buffer_head *bh)
+{
+	int ret;
+	if (!buffer_mapped(bh) || buffer_freed(bh))
+		return 0;
+	set_buffer_uptodate(bh);
+	ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+	clear_buffer_meta(bh);
+	clear_buffer_prio(bh);
+	return ret;
+}
+
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list.  metadata
+ * buffers are managed internally.
+ */
+static int ext4_write_end(struct file *file,
+			  struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
+	int ret = 0, ret2;
+	int i_size_changed = 0;
+
+	trace_ext4_write_end(inode, pos, len, copied);
+	if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+		ret = ext4_jbd2_file_inode(handle, inode);
+		if (ret) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto errout;
+		}
+	}
+
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0)
+			goto errout;
+		copied = ret;
+	} else
+		copied = block_write_end(file, mapping, pos,
+					 len, copied, page, fsdata);
+	/*
+	 * it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	i_size_changed = ext4_update_inode_size(inode, pos + copied);
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		ext4_mark_inode_dirty(handle, inode);
+
+	if (pos + len > inode->i_size && ext4_can_truncate(inode))
+		/* if we have allocated more blocks and copied
+		 * less. We will have blocks allocated outside
+		 * inode->i_size. So truncate them
+		 */
+		ext4_orphan_add(handle, inode);
+errout:
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+
+	if (pos + len > inode->i_size) {
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	return ret ? ret : copied;
+}
+
+static int ext4_journalled_write_end(struct file *file,
+				     struct address_space *mapping,
+				     loff_t pos, unsigned len, unsigned copied,
+				     struct page *page, void *fsdata)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
+	int ret = 0, ret2;
+	int partial = 0;
+	unsigned from, to;
+	int size_changed = 0;
+
+	trace_ext4_journalled_write_end(inode, pos, len, copied);
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+	BUG_ON(!ext4_handle_valid(handle));
+
+	if (ext4_has_inline_data(inode))
+		copied = ext4_write_inline_data_end(inode, pos, len,
+						    copied, page);
+	else {
+		if (copied < len) {
+			if (!PageUptodate(page))
+				copied = 0;
+			page_zero_new_buffers(page, from+copied, to);
+		}
+
+		ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+					     to, &partial, write_end_fn);
+		if (!partial)
+			SetPageUptodate(page);
+	}
+	size_changed = ext4_update_inode_size(inode, pos + copied);
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+
+	if (size_changed) {
+		ret2 = ext4_mark_inode_dirty(handle, inode);
+		if (!ret)
+			ret = ret2;
+	}
+
+	if (pos + len > inode->i_size && ext4_can_truncate(inode))
+		/* if we have allocated more blocks and copied
+		 * less. We will have blocks allocated outside
+		 * inode->i_size. So truncate them
+		 */
+		ext4_orphan_add(handle, inode);
+
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+	if (pos + len > inode->i_size) {
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	return ret ? ret : copied;
+}
+
+/*
+ * Reserve a single cluster located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int md_needed;
+	int ret;
+
+	/*
+	 * We will charge metadata quota at writeout time; this saves
+	 * us from metadata over-estimation, though we may go over by
+	 * a small amount in the end.  Here we just reserve for data.
+	 */
+	ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+	if (ret)
+		return ret;
+
+	/*
+	 * recalculate the amount of metadata blocks to reserve
+	 * in order to allocate nrblocks
+	 * worse case is one extent per block
+	 */
+	spin_lock(&ei->i_block_reservation_lock);
+	/*
+	 * ext4_calc_metadata_amount() has side effects, which we have
+	 * to be prepared undo if we fail to claim space.
+	 */
+	md_needed = 0;
+	trace_ext4_da_reserve_space(inode, 0);
+
+	if (ext4_claim_free_clusters(sbi, 1, 0)) {
+		spin_unlock(&ei->i_block_reservation_lock);
+		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+		return -ENOSPC;
+	}
+	ei->i_reserved_data_blocks++;
+	spin_unlock(&ei->i_block_reservation_lock);
+
+	return 0;       /* success */
+}
+
+static void ext4_da_release_space(struct inode *inode, int to_free)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (!to_free)
+		return;		/* Nothing to release, exit */
+
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	trace_ext4_da_release_space(inode, to_free);
+	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
+		/*
+		 * if there aren't enough reserved blocks, then the
+		 * counter is messed up somewhere.  Since this
+		 * function is called from invalidate page, it's
+		 * harmless to return without any action.
+		 */
+		ext4_warning(inode->i_sb, "ext4_da_release_space: "
+			 "ino %lu, to_free %d with only %d reserved "
+			 "data blocks", inode->i_ino, to_free,
+			 ei->i_reserved_data_blocks);
+		WARN_ON(1);
+		to_free = ei->i_reserved_data_blocks;
+	}
+	ei->i_reserved_data_blocks -= to_free;
+
+	/* update fs dirty data blocks counter */
+	percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
+
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
+}
+
+static void ext4_da_page_release_reservation(struct page *page,
+					     unsigned int offset,
+					     unsigned int length)
+{
+	int to_release = 0;
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+	struct inode *inode = page->mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned int stop = offset + length;
+	int num_clusters;
+	ext4_fsblk_t lblk;
+
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+
+		if (next_off > stop)
+			break;
+
+		if ((offset <= curr_off) && (buffer_delay(bh))) {
+			to_release++;
+			clear_buffer_delay(bh);
+		}
+		curr_off = next_off;
+	} while ((bh = bh->b_this_page) != head);
+
+	if (to_release) {
+		lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		ext4_es_remove_extent(inode, lblk, to_release);
+	}
+
+	/* If we have released all the blocks belonging to a cluster, then we
+	 * need to release the reserved space for that cluster. */
+	num_clusters = EXT4_NUM_B2C(sbi, to_release);
+	while (num_clusters > 0) {
+		lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+			((num_clusters - 1) << sbi->s_cluster_bits);
+		if (sbi->s_cluster_ratio == 1 ||
+		    !ext4_find_delalloc_cluster(inode, lblk))
+			ext4_da_release_space(inode, 1);
+
+		num_clusters--;
+	}
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
+
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
+	/*
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
+	 */
+	struct ext4_map_blocks map;
+	struct ext4_io_submit io_submit;	/* IO submission data */
+};
+
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+				       bool invalidate)
+{
+	int nr_pages, i;
+	pgoff_t index, end;
+	struct pagevec pvec;
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
+
+	/* This is necessary when next_page == 0. */
+	if (mpd->first_page >= mpd->next_page)
+		return;
+
+	index = mpd->first_page;
+	end   = mpd->next_page - 1;
+	if (invalidate) {
+		ext4_lblk_t start, last;
+		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		ext4_es_remove_extent(inode, start, last - start + 1);
+	}
+
+	pagevec_init(&pvec, 0);
+	while (index <= end) {
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			if (page->index > end)
+				break;
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+			if (invalidate) {
+				block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+				ClearPageUptodate(page);
+			}
+			unlock_page(page);
+		}
+		index = pvec.pages[nr_pages - 1]->index + 1;
+		pagevec_release(&pvec);
+	}
+}
+
+static void ext4_print_free_blocks(struct inode *inode)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct super_block *sb = inode->i_sb;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
+	       EXT4_C2B(EXT4_SB(inode->i_sb),
+			ext4_count_free_clusters(sb)));
+	ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
+	ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
+	       (long long) EXT4_C2B(EXT4_SB(sb),
+		percpu_counter_sum(&sbi->s_freeclusters_counter)));
+	ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
+	       (long long) EXT4_C2B(EXT4_SB(sb),
+		percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+	ext4_msg(sb, KERN_CRIT, "Block reservation details");
+	ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
+		 ei->i_reserved_data_blocks);
+	return;
+}
+
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+{
+	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
+/*
+ * This function is grabs code from the very beginning of
+ * ext4_map_blocks, but assumes that the caller is from delayed write
+ * time. This function looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+			      struct ext4_map_blocks *map,
+			      struct buffer_head *bh)
+{
+	struct extent_status es;
+	int retval;
+	sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+	struct ext4_map_blocks orig_map;
+
+	memcpy(&orig_map, map, sizeof(*map));
+#endif
+
+	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+		invalid_block = ~0;
+
+	map->m_flags = 0;
+	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, map->m_len,
+		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, iblock, &es)) {
+		if (ext4_es_is_hole(&es)) {
+			retval = 0;
+			down_read(&EXT4_I(inode)->i_data_sem);
+			goto add_delayed;
+		}
+
+		/*
+		 * Delayed extent could be allocated by fallocate.
+		 * So we need to check it.
+		 */
+		if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+			map_bh(bh, inode->i_sb, invalid_block);
+			set_buffer_new(bh);
+			set_buffer_delay(bh);
+			return 0;
+		}
+
+		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+		retval = es.es_len - (iblock - es.es_lblk);
+		if (retval > map->m_len)
+			retval = map->m_len;
+		map->m_len = retval;
+		if (ext4_es_is_written(&es))
+			map->m_flags |= EXT4_MAP_MAPPED;
+		else if (ext4_es_is_unwritten(&es))
+			map->m_flags |= EXT4_MAP_UNWRITTEN;
+		else
+			BUG_ON(1);
+
+#ifdef ES_AGGRESSIVE_TEST
+		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+		return retval;
+	}
+
+	/*
+	 * Try to see if we can get the block without requesting a new
+	 * file system block.
+	 */
+	down_read(&EXT4_I(inode)->i_data_sem);
+	if (ext4_has_inline_data(inode))
+		retval = 0;
+	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+	else
+		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+
+add_delayed:
+	if (retval == 0) {
+		int ret;
+		/*
+		 * XXX: __block_prepare_write() unmaps passed block,
+		 * is it OK?
+		 */
+		/*
+		 * If the block was allocated from previously allocated cluster,
+		 * then we don't need to reserve it again. However we still need
+		 * to reserve metadata for every block we're going to write.
+		 */
+		if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 ||
+		    !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
+			ret = ext4_da_reserve_space(inode, iblock);
+			if (ret) {
+				/* not enough space to reserve */
+				retval = ret;
+				goto out_unlock;
+			}
+		}
+
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    ~0, EXTENT_STATUS_DELAYED);
+		if (ret) {
+			retval = ret;
+			goto out_unlock;
+		}
+
+		map_bh(bh, inode->i_sb, invalid_block);
+		set_buffer_new(bh);
+		set_buffer_delay(bh);
+	} else if (retval > 0) {
+		int ret;
+		unsigned int status;
+
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
+		}
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret != 0)
+			retval = ret;
+	}
+
+out_unlock:
+	up_read((&EXT4_I(inode)->i_data_sem));
+
+	return retval;
+}
+
+/*
+ * This is a special get_block_t callback which is used by
+ * ext4_da_write_begin().  It will either return mapped block or
+ * reserve space for a single block.
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
+ */
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int create)
+{
+	struct ext4_map_blocks map;
+	int ret = 0;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+	map.m_lblk = iblock;
+	map.m_len = 1;
+
+	/*
+	 * first, we need to know whether the block is allocated already
+	 * preallocated blocks are unmapped but should treated
+	 * the same as allocated blocks.
+	 */
+	ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+	if (ret <= 0)
+		return ret;
+
+	map_bh(bh, inode->i_sb, map.m_pblk);
+	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+	if (buffer_unwritten(bh)) {
+		/* A delayed write to unwritten bh should be marked
+		 * new and mapped.  Mapped ensures that we don't do
+		 * get_block multiple times when we write to the same
+		 * offset and new ensures that we do proper zero out
+		 * for partial write.
+		 */
+		set_buffer_new(bh);
+		set_buffer_mapped(bh);
+	}
+	return 0;
+}
+
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+	get_bh(bh);
+	return 0;
+}
+
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+	put_bh(bh);
+	return 0;
+}
+
+static int __ext4_journalled_writepage(struct page *page,
+				       unsigned int len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head *page_bufs = NULL;
+	handle_t *handle = NULL;
+	int ret = 0, err = 0;
+	int inline_data = ext4_has_inline_data(inode);
+	struct buffer_head *inode_bh = NULL;
+
+	ClearPageChecked(page);
+
+	if (inline_data) {
+		BUG_ON(page->index != 0);
+		BUG_ON(len > ext4_get_max_inline_size(inode));
+		inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+		if (inode_bh == NULL)
+			goto out;
+	} else {
+		page_bufs = page_buffers(page);
+		if (!page_bufs) {
+			BUG();
+			goto out;
+		}
+		ext4_walk_page_buffers(handle, page_bufs, 0, len,
+				       NULL, bget_one);
+	}
+	/* As soon as we unlock the page, it can go away, but we have
+	 * references to buffers so we are safe */
+	unlock_page(page);
+
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	BUG_ON(!ext4_handle_valid(handle));
+
+	if (inline_data) {
+		BUFFER_TRACE(inode_bh, "get write access");
+		ret = ext4_journal_get_write_access(handle, inode_bh);
+
+		err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
+
+	} else {
+		ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+					     do_journal_get_write_access);
+
+		err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+					     write_end_fn);
+	}
+	if (ret == 0)
+		ret = err;
+	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+
+	if (!ext4_has_inline_data(inode))
+		ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+				       NULL, bput_one);
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+out:
+	brelse(inode_bh);
+	return ret;
+}
+
+/*
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
+ * This function can get called via...
+ *   - ext4_writepages after taking page lock (have journal handle)
+ *   - journal_submit_inode_data_buffers (no journal handle)
+ *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
+ *   - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmaped based write. So if we do with blocksize 1K
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * we have in the page first buffer_head mapped via page_mkwrite call back
+ * but other buffer_heads would be unmapped but dirty (dirty done via the
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if we have any buffer_heads that is either delay or
+ * unwritten in the page.
+ *
+ * We can get recursively called as show below.
+ *
+ *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *		ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * Page also have the dirty flag cleared so we don't get recurive page_lock.
+ */
+static int ext4_writepage(struct page *page,
+			  struct writeback_control *wbc)
+{
+	int ret = 0;
+	loff_t size;
+	unsigned int len;
+	struct buffer_head *page_bufs = NULL;
+	struct inode *inode = page->mapping->host;
+	struct ext4_io_submit io_submit;
+	bool keep_towrite = false;
+
+	trace_ext4_writepage(page);
+	size = i_size_read(inode);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	page_bufs = page_buffers(page);
+	/*
+	 * We cannot do block allocation or other extent handling in this
+	 * function. If there are buffers needing that, we have to redirty
+	 * the page. But we may reach here when we do a journal commit via
+	 * journal_submit_inode_data_buffers() and in that case we must write
+	 * allocated buffers to achieve data=ordered mode guarantees.
+	 */
+	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+				   ext4_bh_delay_or_unwritten)) {
+		redirty_page_for_writepage(wbc, page);
+		if (current->flags & PF_MEMALLOC) {
+			/*
+			 * For memory cleaning there's no point in writing only
+			 * some buffers. So just bail out. Warn if we came here
+			 * from direct reclaim.
+			 */
+			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+							== PF_MEMALLOC);
+			unlock_page(page);
+			return 0;
+		}
+		keep_towrite = true;
+	}
+
+	if (PageChecked(page) && ext4_should_journal_data(inode))
+		/*
+		 * It's mmapped pagecache.  Add buffers and journal it.  There
+		 * doesn't seem much point in redirtying the page here.
+		 */
+		return __ext4_journalled_writepage(page, len);
+
+	ext4_io_submit_init(&io_submit, wbc);
+	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+	if (!io_submit.io_end) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+	ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
+	ext4_io_submit(&io_submit);
+	/* Drop io_end reference we got from init */
+	ext4_put_io_end_defer(io_submit.io_end);
+	return ret;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+	int len;
+	loff_t size = i_size_read(mpd->inode);
+	int err;
+
+	BUG_ON(page->index != mpd->first_page);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+	clear_page_dirty_for_io(page);
+	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
+	if (!err)
+		mpd->wbc->nr_to_write--;
+	mpd->first_page++;
+
+	return err;
+}
+
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
+/*
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks up to full group size.
+ */
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
+
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @bh - buffer head we want to add to the extent
+ *
+ * The function is used to collect contig. blocks in the same state. If the
+ * buffer doesn't require mapping for writeback and we haven't started the
+ * extent of buffers to map yet, the function returns 'true' immediately - the
+ * caller can write the buffer right away. Otherwise the function returns true
+ * if the block has been added to the extent, false if the block couldn't be
+ * added.
+ */
+static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+				   struct buffer_head *bh)
+{
+	struct ext4_map_blocks *map = &mpd->map;
+
+	/* Buffer that doesn't need mapping for writeback? */
+	if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+	    (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+		/* So far no extent to map => we write the buffer right away */
+		if (map->m_len == 0)
+			return true;
+		return false;
+	}
+
+	/* First block in the extent? */
+	if (map->m_len == 0) {
+		map->m_lblk = lblk;
+		map->m_len = 1;
+		map->m_flags = bh->b_state & BH_FLAGS;
+		return true;
+	}
+
+	/* Don't go larger than mballoc is willing to allocate */
+	if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+		return false;
+
+	/* Can we merge the block to our big extent? */
+	if (lblk == map->m_lblk + map->m_len &&
+	    (bh->b_state & BH_FLAGS) == map->m_flags) {
+		map->m_len++;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
+ *
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
+ *
+ * Walk through page buffers from @bh upto @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return value
+ * < 0 in case of error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+				   struct buffer_head *head,
+				   struct buffer_head *bh,
+				   ext4_lblk_t lblk)
+{
+	struct inode *inode = mpd->inode;
+	int err;
+	ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+							>> inode->i_blkbits;
+
+	do {
+		BUG_ON(buffer_locked(bh));
+
+		if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
+			/* Found extent to map? */
+			if (mpd->map.m_len)
+				return 0;
+			/* Everything mapped so far and we hit EOF */
+			break;
+		}
+	} while (lblk++, (bh = bh->b_this_page) != head);
+	/* So far everything mapped? Submit the page for IO. */
+	if (mpd->map.m_len == 0) {
+		err = mpage_submit_page(mpd, head->b_page);
+		if (err < 0)
+			return err;
+	}
+	return lblk < blocks;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ *		       submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to unwritten extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+	struct pagevec pvec;
+	int nr_pages, i;
+	struct inode *inode = mpd->inode;
+	struct buffer_head *head, *bh;
+	int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+	pgoff_t start, end;
+	ext4_lblk_t lblk;
+	sector_t pblock;
+	int err;
+
+	start = mpd->map.m_lblk >> bpp_bits;
+	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+	lblk = start << bpp_bits;
+	pblock = mpd->map.m_pblk;
+
+	pagevec_init(&pvec, 0);
+	while (start <= end) {
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+					  PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (page->index > end)
+				break;
+			/* Up to 'end' pages must be contiguous */
+			BUG_ON(page->index != start);
+			bh = head = page_buffers(page);
+			do {
+				if (lblk < mpd->map.m_lblk)
+					continue;
+				if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+					/*
+					 * Buffer after end of mapped extent.
+					 * Find next buffer in the page to map.
+					 */
+					mpd->map.m_len = 0;
+					mpd->map.m_flags = 0;
+					/*
+					 * FIXME: If dioread_nolock supports
+					 * blocksize < pagesize, we need to make
+					 * sure we add size mapped so far to
+					 * io_end->size as the following call
+					 * can submit the page for IO.
+					 */
+					err = mpage_process_page_bufs(mpd, head,
+								      bh, lblk);
+					pagevec_release(&pvec);
+					if (err > 0)
+						err = 0;
+					return err;
+				}
+				if (buffer_delay(bh)) {
+					clear_buffer_delay(bh);
+					bh->b_blocknr = pblock++;
+				}
+				clear_buffer_unwritten(bh);
+			} while (lblk++, (bh = bh->b_this_page) != head);
+
+			/*
+			 * FIXME: This is going to break if dioread_nolock
+			 * supports blocksize < pagesize as we will try to
+			 * convert potentially unmapped parts of inode.
+			 */
+			mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+			/* Page fully mapped - let IO run! */
+			err = mpage_submit_page(mpd, page);
+			if (err < 0) {
+				pagevec_release(&pvec);
+				return err;
+			}
+			start++;
+		}
+		pagevec_release(&pvec);
+	}
+	/* Extent fully mapped and matches with page boundary. We are done. */
+	mpd->map.m_len = 0;
+	mpd->map.m_flags = 0;
+	return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int get_blocks_flags;
+	int err, dioread_nolock;
+
+	trace_ext4_da_write_pages_extent(inode, map);
+	/*
+	 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+	 * to convert an unwritten extent to be initialized (in the case
+	 * where we have written into one or more preallocated blocks).  It is
+	 * possible that we're going to need more metadata blocks than
+	 * previously reserved. However we must not fail because we're in
+	 * writeback and there is nothing we can do about it so it might result
+	 * in data loss.  So use reserved blocks to allocate metadata if
+	 * possible.
+	 *
+	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
+	 * the blocks in question are delalloc blocks.  This indicates
+	 * that the blocks and quotas has already been checked when
+	 * the data was copied into the page cache.
+	 */
+	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
+	dioread_nolock = ext4_should_dioread_nolock(inode);
+	if (dioread_nolock)
+		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+	if (map->m_flags & (1 << BH_Delay))
+		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+	if (err < 0)
+		return err;
+	if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
+		if (!mpd->io_submit.io_end->handle &&
+		    ext4_handle_valid(handle)) {
+			mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+			handle->h_rsv_handle = NULL;
+		}
+		ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+	}
+
+	BUG_ON(map->m_len == 0);
+	if (map->m_flags & EXT4_MAP_NEW) {
+		struct block_device *bdev = inode->i_sb->s_bdev;
+		int i;
+
+		for (i = 0; i < map->m_len; i++)
+			unmap_underlying_metadata(bdev, map->m_pblk + i);
+	}
+	return 0;
+}
+
+/*
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ *				 mpd->len and submit pages underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ *                     is no hope of writing the data. The caller should discard
+ *                     dirty pages to avoid infinite loops.
+ *
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
+ */
+static int mpage_map_and_submit_extent(handle_t *handle,
+				       struct mpage_da_data *mpd,
+				       bool *give_up_on_write)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int err;
+	loff_t disksize;
+	int progress = 0;
+
+	mpd->io_submit.io_end->offset =
+				((loff_t)map->m_lblk) << inode->i_blkbits;
+	do {
+		err = mpage_map_one_extent(handle, mpd);
+		if (err < 0) {
+			struct super_block *sb = inode->i_sb;
+
+			if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+				goto invalidate_dirty_pages;
+			/*
+			 * Let the uper layers retry transient errors.
+			 * In the case of ENOSPC, if ext4_count_free_blocks()
+			 * is non-zero, a commit should free up blocks.
+			 */
+			if ((err == -ENOMEM) ||
+			    (err == -ENOSPC && ext4_count_free_clusters(sb))) {
+				if (progress)
+					goto update_disksize;
+				return err;
+			}
+			ext4_msg(sb, KERN_CRIT,
+				 "Delayed block allocation failed for "
+				 "inode %lu at logical offset %llu with"
+				 " max blocks %u with error %d",
+				 inode->i_ino,
+				 (unsigned long long)map->m_lblk,
+				 (unsigned)map->m_len, -err);
+			ext4_msg(sb, KERN_CRIT,
+				 "This should not happen!! Data will "
+				 "be lost\n");
+			if (err == -ENOSPC)
+				ext4_print_free_blocks(inode);
+		invalidate_dirty_pages:
+			*give_up_on_write = true;
+			return err;
+		}
+		progress = 1;
+		/*
+		 * Update buffer state, submit mapped pages, and get us new
+		 * extent to map
+		 */
+		err = mpage_map_and_submit_buffers(mpd);
+		if (err < 0)
+			goto update_disksize;
+	} while (map->m_len);
+
+update_disksize:
+	/*
+	 * Update on-disk size after IO is submitted.  Races with
+	 * truncate are avoided by checking i_size under i_data_sem.
+	 */
+	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+	if (disksize > EXT4_I(inode)->i_disksize) {
+		int err2;
+		loff_t i_size;
+
+		down_write(&EXT4_I(inode)->i_data_sem);
+		i_size = i_size_read(inode);
+		if (disksize > i_size)
+			disksize = i_size;
+		if (disksize > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = disksize;
+		err2 = ext4_mark_inode_dirty(handle, inode);
+		up_write(&EXT4_I(inode)->i_data_sem);
+		if (err2)
+			ext4_error(inode->i_sb,
+				   "Failed to mark inode %lu dirty",
+				   inode->i_ino);
+		if (!err)
+			err = err2;
+	}
+	return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+	int bpp = ext4_journal_blocks_per_page(inode);
+
+	return ext4_meta_trans_blocks(inode,
+				MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * 				 and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+	struct address_space *mapping = mpd->inode->i_mapping;
+	struct pagevec pvec;
+	unsigned int nr_pages;
+	long left = mpd->wbc->nr_to_write;
+	pgoff_t index = mpd->first_page;
+	pgoff_t end = mpd->last_page;
+	int tag;
+	int i, err = 0;
+	int blkbits = mpd->inode->i_blkbits;
+	ext4_lblk_t lblk;
+	struct buffer_head *head;
+
+	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+
+	pagevec_init(&pvec, 0);
+	mpd->map.m_len = 0;
+	mpd->next_page = index;
+	while (index <= end) {
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			goto out;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
+			 * mapping. However, page->index will not change
+			 * because we have a reference on the page.
+			 */
+			if (page->index > end)
+				goto out;
+
+			/*
+			 * Accumulated enough dirty pages? This doesn't apply
+			 * to WB_SYNC_ALL mode. For integrity sync we have to
+			 * keep going because someone may be concurrently
+			 * dirtying pages, and we might have synced a lot of
+			 * newly appeared dirty pages, but have not synced all
+			 * of the old dirty pages.
+			 */
+			if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+				goto out;
+
+			/* If we can't merge this page, we are done. */
+			if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+				goto out;
+
+			lock_page(page);
+			/*
+			 * If the page is no longer dirty, or its mapping no
+			 * longer corresponds to inode we are writing (which
+			 * means it has been truncated or invalidated), or the
+			 * page is already under writeback and we are not doing
+			 * a data integrity writeback, skip the page
+			 */
+			if (!PageDirty(page) ||
+			    (PageWriteback(page) &&
+			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
+			    unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			wait_on_page_writeback(page);
+			BUG_ON(PageWriteback(page));
+
+			if (mpd->map.m_len == 0)
+				mpd->first_page = page->index;
+			mpd->next_page = page->index + 1;
+			/* Add all dirty buffers to mpd */
+			lblk = ((ext4_lblk_t)page->index) <<
+				(PAGE_CACHE_SHIFT - blkbits);
+			head = page_buffers(page);
+			err = mpage_process_page_bufs(mpd, head, head, lblk);
+			if (err <= 0)
+				goto out;
+			err = 0;
+			left--;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	return 0;
+out:
+	pagevec_release(&pvec);
+	return err;
+}
+
+static int __writepage(struct page *page, struct writeback_control *wbc,
+		       void *data)
+{
+	struct address_space *mapping = data;
+	int ret = ext4_writepage(page, wbc);
+	mapping_set_error(mapping, ret);
+	return ret;
+}
+
+static int ext4_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	pgoff_t	writeback_index = 0;
+	long nr_to_write = wbc->nr_to_write;
+	int range_whole = 0;
+	int cycled = 1;
+	handle_t *handle = NULL;
+	struct mpage_da_data mpd;
+	struct inode *inode = mapping->host;
+	int needed_blocks, rsv_blocks = 0, ret = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+	bool done;
+	struct blk_plug plug;
+	bool give_up_on_write = false;
+
+	trace_ext4_writepages(inode, wbc);
+
+	/*
+	 * No pages to write? This is mainly a kludge to avoid starting
+	 * a transaction for special inodes like journal inode on last iput()
+	 * because that could violate lock ordering on umount
+	 */
+	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		goto out_writepages;
+
+	if (ext4_should_journal_data(inode)) {
+		struct blk_plug plug;
+
+		blk_start_plug(&plug);
+		ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+		blk_finish_plug(&plug);
+		goto out_writepages;
+	}
+
+	/*
+	 * If the filesystem has aborted, it is read-only, so return
+	 * right away instead of dumping stack traces later on that
+	 * will obscure the real source of the problem.  We test
+	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
+	 * the latter could be true if the filesystem is mounted
+	 * read-only, and in that case, ext4_writepages should
+	 * *never* be called, so if that ever happens, we would want
+	 * the stack trace.
+	 */
+	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+		ret = -EROFS;
+		goto out_writepages;
+	}
+
+	if (ext4_should_dioread_nolock(inode)) {
+		/*
+		 * We may need to convert up to one extent per block in
+		 * the page and we may dirty the inode.
+		 */
+		rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+	}
+
+	/*
+	 * If we have inline data and arrive here, it means that
+	 * we will soon create the block for the 1st page, so
+	 * we'd better clear the inline data here.
+	 */
+	if (ext4_has_inline_data(inode)) {
+		/* Just inode will be modified... */
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_writepages;
+		}
+		BUG_ON(ext4_test_inode_state(inode,
+				EXT4_STATE_MAY_INLINE_DATA));
+		ext4_destroy_inline_data(handle, inode);
+		ext4_journal_stop(handle);
+	}
+
+	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+		range_whole = 1;
+
+	if (wbc->range_cyclic) {
+		writeback_index = mapping->writeback_index;
+		if (writeback_index)
+			cycled = 0;
+		mpd.first_page = writeback_index;
+		mpd.last_page = -1;
+	} else {
+		mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+	}
+
+	mpd.inode = inode;
+	mpd.wbc = wbc;
+	ext4_io_submit_init(&mpd.io_submit, wbc);
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+	done = false;
+	blk_start_plug(&plug);
+	while (!done && mpd.first_page <= mpd.last_page) {
+		/* For each extent of pages we use new io_end */
+		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+		if (!mpd.io_submit.io_end) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		/*
+		 * We have two constraints: We find one extent to map and we
+		 * must always write out whole page (makes a difference when
+		 * blocksize < pagesize) so that we don't block on IO when we
+		 * try to write out the rest of the page. Journalled mode is
+		 * not supported by delalloc.
+		 */
+		BUG_ON(ext4_should_journal_data(inode));
+		needed_blocks = ext4_da_writepages_trans_blocks(inode);
+
+		/* start a new transaction */
+		handle = ext4_journal_start_with_reserve(inode,
+				EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
+			       "%ld pages, ino %lu; err %d", __func__,
+				wbc->nr_to_write, inode->i_ino, ret);
+			/* Release allocated io_end */
+			ext4_put_io_end(mpd.io_submit.io_end);
+			break;
+		}
+
+		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+		ret = mpage_prepare_extent_to_map(&mpd);
+		if (!ret) {
+			if (mpd.map.m_len)
+				ret = mpage_map_and_submit_extent(handle, &mpd,
+					&give_up_on_write);
+			else {
+				/*
+				 * We scanned the whole range (or exhausted
+				 * nr_to_write), submitted what was mapped and
+				 * didn't find anything needing mapping. We are
+				 * done.
+				 */
+				done = true;
+			}
+		}
+		ext4_journal_stop(handle);
+		/* Submit prepared bio */
+		ext4_io_submit(&mpd.io_submit);
+		/* Unlock pages we didn't use */
+		mpage_release_unused_pages(&mpd, give_up_on_write);
+		/* Drop our io_end reference we got from init */
+		ext4_put_io_end(mpd.io_submit.io_end);
+
+		if (ret == -ENOSPC && sbi->s_journal) {
+			/*
+			 * Commit the transaction which would
+			 * free blocks released in the transaction
+			 * and try again
+			 */
+			jbd2_journal_force_commit_nested(sbi->s_journal);
+			ret = 0;
+			continue;
+		}
+		/* Fatal error - ENOMEM, EIO... */
+		if (ret)
+			break;
+	}
+	blk_finish_plug(&plug);
+	if (!ret && !cycled && wbc->nr_to_write > 0) {
+		cycled = 1;
+		mpd.last_page = writeback_index - 1;
+		mpd.first_page = 0;
+		goto retry;
+	}
+
+	/* Update index */
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		/*
+		 * Set the writeback_index so that range_cyclic
+		 * mode will write it back later
+		 */
+		mapping->writeback_index = mpd.first_page;
+
+out_writepages:
+	trace_ext4_writepages_result(inode, wbc, ret,
+				     nr_to_write - wbc->nr_to_write);
+	return ret;
+}
+
+static int ext4_nonda_switch(struct super_block *sb)
+{
+	s64 free_clusters, dirty_clusters;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/*
+	 * switch to non delalloc mode if we are running low
+	 * on free block. The free block accounting via percpu
+	 * counters can get slightly wrong with percpu_counter_batch getting
+	 * accumulated on each CPU without updating global counters
+	 * Delalloc need an accurate free block accounting. So switch
+	 * to non delalloc when we are near to error range.
+	 */
+	free_clusters =
+		percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+	dirty_clusters =
+		percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+	/*
+	 * Start pushing delalloc when 1/2 of free blocks are dirty.
+	 */
+	if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
+		try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+
+	if (2 * free_clusters < 3 * dirty_clusters ||
+	    free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
+		/*
+		 * free block count is less than 150% of dirty blocks
+		 * or free blocks is less than watermark
+		 */
+		return 1;
+	}
+	return 0;
+}
+
+/* We always reserve for an inode update; the superblock could be there too */
+static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
+{
+	if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+				EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+		return 1;
+
+	if (pos + len <= 0x7fffffffULL)
+		return 1;
+
+	/* We might need to update the superblock to set LARGE_FILE */
+	return 2;
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+			       loff_t pos, unsigned len, unsigned flags,
+			       struct page **pagep, void **fsdata)
+{
+	int ret, retries = 0;
+	struct page *page;
+	pgoff_t index;
+	struct inode *inode = mapping->host;
+	handle_t *handle;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+
+	if (ext4_nonda_switch(inode->i_sb)) {
+		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+		return ext4_write_begin(file, mapping, pos,
+					len, flags, pagep, fsdata);
+	}
+	*fsdata = (void *)0;
+	trace_ext4_da_write_begin(inode, pos, len, flags);
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+		ret = ext4_da_write_inline_data_begin(mapping, inode,
+						      pos, len, flags,
+						      pagep, fsdata);
+		if (ret < 0)
+			return ret;
+		if (ret == 1)
+			return 0;
+	}
+
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back.  So grab it first before we start
+	 * the transaction handle.  This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
+	/*
+	 * With delayed allocation, we don't log the i_disksize update
+	 * if there is delayed block allocation. But we still need
+	 * to journalling the i_disksize update if writes to the end
+	 * of file which has an already mapped buffer.
+	 */
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				ext4_da_write_credits(inode, pos, len));
+	if (IS_ERR(handle)) {
+		page_cache_release(page);
+		return PTR_ERR(handle);
+	}
+
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
+		ext4_journal_stop(handle);
+		goto retry_grab;
+	}
+	/* In case writeback began while the page was unlocked */
+	wait_for_stable_page(page);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	ret = ext4_block_write_begin(page, pos, len,
+				     ext4_da_get_block_prep);
+#else
+	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
+#endif
+	if (ret < 0) {
+		unlock_page(page);
+		ext4_journal_stop(handle);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+
+		page_cache_release(page);
+		return ret;
+	}
+
+	*pagep = page;
+	return ret;
+}
+
+/*
+ * Check if we should update i_disksize
+ * when write to the end of file but not require block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+					    unsigned long offset)
+{
+	struct buffer_head *bh;
+	struct inode *inode = page->mapping->host;
+	unsigned int idx;
+	int i;
+
+	bh = page_buffers(page);
+	idx = offset >> inode->i_blkbits;
+
+	for (i = 0; i < idx; i++)
+		bh = bh->b_this_page;
+
+	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
+		return 0;
+	return 1;
+}
+
+static int ext4_da_write_end(struct file *file,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned copied,
+			     struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int ret = 0, ret2;
+	handle_t *handle = ext4_journal_current_handle();
+	loff_t new_i_size;
+	unsigned long start, end;
+	int write_mode = (int)(unsigned long)fsdata;
+
+	if (write_mode == FALL_BACK_TO_NONDELALLOC)
+		return ext4_write_end(file, mapping, pos,
+				      len, copied, page, fsdata);
+
+	trace_ext4_da_write_end(inode, pos, len, copied);
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	end = start + copied - 1;
+
+	/*
+	 * generic_write_end() will run mark_inode_dirty() if i_size
+	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
+	 * into that.
+	 */
+	new_i_size = pos + copied;
+	if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+		if (ext4_has_inline_data(inode) ||
+		    ext4_da_should_update_i_disksize(page, end)) {
+			ext4_update_i_disksize(inode, new_i_size);
+			/* We need to mark inode dirty even if
+			 * new_i_size is less that inode->i_size
+			 * bu greater than i_disksize.(hint delalloc)
+			 */
+			ext4_mark_inode_dirty(handle, inode);
+		}
+	}
+
+	if (write_mode != CONVERT_INLINE_DATA &&
+	    ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+	    ext4_has_inline_data(inode))
+		ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+						     page);
+	else
+		ret2 = generic_write_end(file, mapping, pos, len, copied,
+							page, fsdata);
+
+	copied = ret2;
+	if (ret2 < 0)
+		ret = ret2;
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+
+	return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+				   unsigned int length)
+{
+	/*
+	 * Drop reserved blocks
+	 */
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		goto out;
+
+	ext4_da_page_release_reservation(page, offset, length);
+
+out:
+	ext4_invalidatepage(page, offset, length);
+
+	return;
+}
+
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+	trace_ext4_alloc_da_blocks(inode);
+
+	if (!EXT4_I(inode)->i_reserved_data_blocks)
+		return 0;
+
+	/*
+	 * We do something simple for now.  The filemap_flush() will
+	 * also start triggering a write of the data blocks, which is
+	 * not strictly speaking necessary (and for users of
+	 * laptop_mode, not even desirable).  However, to do otherwise
+	 * would require replicating code paths in:
+	 *
+	 * ext4_writepages() ->
+	 *    write_cache_pages() ---> (via passed in callback function)
+	 *        __mpage_da_writepage() -->
+	 *           mpage_add_bh_to_extent()
+	 *           mpage_da_map_blocks()
+	 *
+	 * The problem is that write_cache_pages(), located in
+	 * mm/page-writeback.c, marks pages clean in preparation for
+	 * doing I/O, which is not desirable if we're not planning on
+	 * doing I/O at all.
+	 *
+	 * We could call write_cache_pages(), and then redirty all of
+	 * the pages by calling redirty_page_for_writepage() but that
+	 * would be ugly in the extreme.  So instead we would need to
+	 * replicate parts of the code in the above functions,
+	 * simplifying them because we wouldn't actually intend to
+	 * write out the pages, but rather only collect contiguous
+	 * logical block extents, call the multi-block allocator, and
+	 * then update the buffer heads with the block allocations.
+	 *
+	 * For now, though, we'll cheat by calling filemap_flush(),
+	 * which will map the blocks, and start the I/O, but not
+	 * actually wait for the I/O to complete.
+	 */
+	return filemap_flush(inode->i_mapping);
+}
+
+/*
+ * bmap() is special.  It gets used by applications such as lilo and by
+ * the swapper to find the on-disk block of a specific piece of data.
+ *
+ * Naturally, this is dangerous if the block concerned is still in the
+ * journal.  If somebody makes a swapfile on an ext4 data-journaling
+ * filesystem and enables swap, then they may get a nasty shock when the
+ * data getting swapped to that swapfile suddenly gets overwritten by
+ * the original zero's written out previously to the journal and
+ * awaiting writeback in the kernel's buffer cache.
+ *
+ * So, if we see any bmap calls here on a modified, data-journaled file,
+ * take extra steps to flush any blocks which might be in the cache.
+ */
+static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+	journal_t *journal;
+	int err;
+
+	/*
+	 * We can get here for an inline file via the FIBMAP ioctl
+	 */
+	if (ext4_has_inline_data(inode))
+		return 0;
+
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+			test_opt(inode->i_sb, DELALLOC)) {
+		/*
+		 * With delalloc we want to sync the file
+		 * so that we can make sure we allocate
+		 * blocks for file
+		 */
+		filemap_write_and_wait(mapping);
+	}
+
+	if (EXT4_JOURNAL(inode) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
+		/*
+		 * This is a REALLY heavyweight approach, but the use of
+		 * bmap on dirty files is expected to be extremely rare:
+		 * only if we run lilo or swapon on a freshly made file
+		 * do we expect this to happen.
+		 *
+		 * (bmap requires CAP_SYS_RAWIO so this does not
+		 * represent an unprivileged user DOS attack --- we'd be
+		 * in trouble if mortal users could trigger this path at
+		 * will.)
+		 *
+		 * NB. EXT4_STATE_JDATA is not set on files other than
+		 * regular files.  If somebody wants to bmap a directory
+		 * or symlink and gets confused because the buffer
+		 * hasn't yet been flushed to disk, they deserve
+		 * everything they get.
+		 */
+
+		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
+		journal = EXT4_JOURNAL(inode);
+		jbd2_journal_lock_updates(journal);
+		err = jbd2_journal_flush(journal);
+		jbd2_journal_unlock_updates(journal);
+
+		if (err)
+			return 0;
+	}
+
+	return generic_block_bmap(mapping, block, ext4_get_block);
+}
+
+static int ext4_readpage(struct file *file, struct page *page)
+{
+	int ret = -EAGAIN;
+	struct inode *inode = page->mapping->host;
+
+	trace_ext4_readpage(page);
+
+	if (ext4_has_inline_data(inode))
+		ret = ext4_readpage_inline(inode, page);
+
+	if (ret == -EAGAIN)
+		return ext4_mpage_readpages(page->mapping, NULL, page, 1);
+
+	return ret;
+}
+
+static int
+ext4_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = mapping->host;
+
+	/* If the file has inline data, no need to do readpages. */
+	if (ext4_has_inline_data(inode))
+		return 0;
+
+	return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
+{
+	trace_ext4_invalidatepage(page, offset, length);
+
+	/* No journalling happens on data buffers when this function is used */
+	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+	block_invalidatepage(page, offset, length);
+}
+
+static int __ext4_journalled_invalidatepage(struct page *page,
+					    unsigned int offset,
+					    unsigned int length)
+{
+	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+	trace_ext4_journalled_invalidatepage(page, offset, length);
+
+	/*
+	 * If it's a full truncate we just forget about the pending dirtying
+	 */
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
+		ClearPageChecked(page);
+
+	return jbd2_journal_invalidatepage(journal, page, offset, length);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+					   unsigned int offset,
+					   unsigned int length)
+{
+	WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
+}
+
+static int ext4_releasepage(struct page *page, gfp_t wait)
+{
+	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+	trace_ext4_releasepage(page);
+
+	/* Page has dirty journalled data -> cannot release */
+	if (PageChecked(page))
+		return 0;
+	if (journal)
+		return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	else
+		return try_to_free_buffers(page);
+}
+
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
+{
+	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+		   inode->i_ino, create);
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
+{
+	ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+		   inode->i_ino, create);
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_NO_LOCK);
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+			    ssize_t size, void *private)
+{
+        ext4_io_end_t *io_end = iocb->private;
+
+	/* if not async direct IO just return */
+	if (!io_end)
+		return;
+
+	ext_debug("ext4_end_io_dio(): io_end 0x%p "
+		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
+ 		  iocb->private, io_end->inode->i_ino, iocb, offset,
+		  size);
+
+	iocb->private = NULL;
+	io_end->offset = offset;
+	io_end->size = size;
+	ext4_put_io_end(io_end);
+}
+
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unwritten
+ * If those blocks were preallocated, we mark sure they are split, but
+ * still keep the range to write as unwritten.
+ *
+ * The unwritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the conversion
+ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+				  loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+	size_t count = iov_iter_count(iter);
+	int overwrite = 0;
+	get_block_t *get_block_func = NULL;
+	int dio_flags = 0;
+	loff_t final_size = offset + count;
+	ext4_io_end_t *io_end = NULL;
+
+	/* Use the old path for reads and writes beyond i_size. */
+	if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
+		return ext4_ind_direct_IO(iocb, iter, offset);
+
+	BUG_ON(iocb->private == NULL);
+
+	/*
+	 * Make all waiters for direct IO properly wait also for extent
+	 * conversion. This also disallows race between truncate() and
+	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+	 */
+	if (iov_iter_rw(iter) == WRITE)
+		inode_dio_begin(inode);
+
+	/* If we do a overwrite dio, i_mutex locking can be released */
+	overwrite = *((int *)iocb->private);
+
+	if (overwrite) {
+		down_read(&EXT4_I(inode)->i_data_sem);
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	/*
+	 * We could direct write to holes and fallocate.
+	 *
+	 * Allocated blocks to fill the hole are marked as
+	 * unwritten to prevent parallel buffered read to expose
+	 * the stale data before DIO complete the data IO.
+	 *
+	 * As to previously fallocated extents, ext4 get_block will
+	 * just simply mark the buffer mapped but still keep the
+	 * extents unwritten.
+	 *
+	 * For non AIO case, we will convert those unwritten extents
+	 * to written after return back from blockdev_direct_IO.
+	 *
+	 * For async DIO, the conversion needs to be deferred when the
+	 * IO is completed. The ext4 end_io callback function will be
+	 * called to take care of the conversion work.  Here for async
+	 * case, we allocate an io_end structure to hook to the iocb.
+	 */
+	iocb->private = NULL;
+	ext4_inode_aio_set(inode, NULL);
+	if (!is_sync_kiocb(iocb)) {
+		io_end = ext4_init_io_end(inode, GFP_NOFS);
+		if (!io_end) {
+			ret = -ENOMEM;
+			goto retake_lock;
+		}
+		/*
+		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+		 */
+		iocb->private = ext4_get_io_end(io_end);
+		/*
+		 * we save the io structure for current async direct
+		 * IO, so that later ext4_map_blocks() could flag the
+		 * io structure whether there is a unwritten extents
+		 * needs to be converted when IO is completed.
+		 */
+		ext4_inode_aio_set(inode, io_end);
+	}
+
+	if (overwrite) {
+		get_block_func = ext4_get_block_write_nolock;
+	} else {
+		get_block_func = ext4_get_block_write;
+		dio_flags = DIO_LOCKING;
+	}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
+#endif
+	if (IS_DAX(inode))
+		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
+				ext4_end_io_dio, dio_flags);
+	else
+		ret = __blockdev_direct_IO(iocb, inode,
+					   inode->i_sb->s_bdev, iter, offset,
+					   get_block_func,
+					   ext4_end_io_dio, NULL, dio_flags);
+
+	/*
+	 * Put our reference to io_end. This can free the io_end structure e.g.
+	 * in sync IO case or in case of error. It can even perform extent
+	 * conversion if all bios we submitted finished before we got here.
+	 * Note that in that case iocb->private can be already set to NULL
+	 * here.
+	 */
+	if (io_end) {
+		ext4_inode_aio_set(inode, NULL);
+		ext4_put_io_end(io_end);
+		/*
+		 * When no IO was submitted ext4_end_io_dio() was not
+		 * called so we have to put iocb's reference.
+		 */
+		if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+			WARN_ON(iocb->private != io_end);
+			WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+			ext4_put_io_end(io_end);
+			iocb->private = NULL;
+		}
+	}
+	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+						EXT4_STATE_DIO_UNWRITTEN)) {
+		int err;
+		/*
+		 * for non AIO case, since the IO is already
+		 * completed, we could do the conversion right here
+		 */
+		err = ext4_convert_unwritten_extents(NULL, inode,
+						     offset, ret);
+		if (err < 0)
+			ret = err;
+		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+	}
+
+retake_lock:
+	if (iov_iter_rw(iter) == WRITE)
+		inode_dio_end(inode);
+	/* take i_mutex locking again if we do a ovewrite dio */
+	if (overwrite) {
+		up_read(&EXT4_I(inode)->i_data_sem);
+		mutex_lock(&inode->i_mutex);
+	}
+
+	return ret;
+}
+
+static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			      loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		return 0;
+#endif
+
+	/*
+	 * If we are doing data journalling we don't support O_DIRECT
+	 */
+	if (ext4_should_journal_data(inode))
+		return 0;
+
+	/* Let buffer I/O handle the inline data case. */
+	if (ext4_has_inline_data(inode))
+		return 0;
+
+	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ret = ext4_ext_direct_IO(iocb, iter, offset);
+	else
+		ret = ext4_ind_direct_IO(iocb, iter, offset);
+	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
+	return ret;
+}
+
+/*
+ * Pages can be marked dirty completely asynchronously from ext4's journalling
+ * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
+ * much here because ->set_page_dirty is called under VFS locks.  The page is
+ * not necessarily locked.
+ *
+ * We cannot just dirty the page and leave attached buffers clean, because the
+ * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
+ * or jbddirty because all the journalling code will explode.
+ *
+ * So what we do is to mark the page "pending dirty" and next time writepage
+ * is called, propagate that into the buffers appropriately.
+ */
+static int ext4_journalled_set_page_dirty(struct page *page)
+{
+	SetPageChecked(page);
+	return __set_page_dirty_nobuffers(page);
+}
+
+static const struct address_space_operations ext4_aops = {
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
+	.write_begin		= ext4_write_begin,
+	.write_end		= ext4_write_end,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
+
+static const struct address_space_operations ext4_journalled_aops = {
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
+	.write_begin		= ext4_write_begin,
+	.write_end		= ext4_journalled_write_end,
+	.set_page_dirty		= ext4_journalled_set_page_dirty,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_journalled_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
+
+static const struct address_space_operations ext4_da_aops = {
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
+	.write_begin		= ext4_da_write_begin,
+	.write_end		= ext4_da_write_end,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_da_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
+
+void ext4_set_aops(struct inode *inode)
+{
+	switch (ext4_inode_journal_mode(inode)) {
+	case EXT4_INODE_ORDERED_DATA_MODE:
+		ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+		break;
+	case EXT4_INODE_WRITEBACK_DATA_MODE:
+		ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+		break;
+	case EXT4_INODE_JOURNAL_DATA_MODE:
+		inode->i_mapping->a_ops = &ext4_journalled_aops;
+		return;
+	default:
+		BUG();
+	}
+	if (test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
+	else
+		inode->i_mapping->a_ops = &ext4_aops;
+}
+
+static int __ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length)
+{
+	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize, pos;
+	ext4_lblk_t iblock;
+	struct inode *inode = mapping->host;
+	struct buffer_head *bh;
+	struct page *page;
+	int err = 0;
+
+	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+				   mapping_gfp_mask(mapping) & ~__GFP_FS);
+	if (!page)
+		return -ENOMEM;
+
+	blocksize = inode->i_sb->s_blocksize;
+
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+	if (buffer_freed(bh)) {
+		BUFFER_TRACE(bh, "freed: skip");
+		goto unlock;
+	}
+	if (!buffer_mapped(bh)) {
+		BUFFER_TRACE(bh, "unmapped");
+		ext4_get_block(inode, iblock, bh, 0);
+		/* unmapped? It's a hole - nothing to do */
+		if (!buffer_mapped(bh)) {
+			BUFFER_TRACE(bh, "still unmapped");
+			goto unlock;
+		}
+	}
+
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh)) {
+		err = -EIO;
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (!buffer_uptodate(bh))
+			goto unlock;
+		if (S_ISREG(inode->i_mode) &&
+		    ext4_encrypted_inode(inode)) {
+			/* We expect the key to be set. */
+			BUG_ON(!ext4_has_encryption_key(inode));
+			BUG_ON(blocksize != PAGE_CACHE_SIZE);
+			WARN_ON_ONCE(ext4_decrypt_one(inode, page));
+		}
+	}
+	if (ext4_should_journal_data(inode)) {
+		BUFFER_TRACE(bh, "get write access");
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err)
+			goto unlock;
+	}
+	zero_user(page, offset, length);
+	BUFFER_TRACE(bh, "zeroed end of block");
+
+	if (ext4_should_journal_data(inode)) {
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+	} else {
+		err = 0;
+		mark_buffer_dirty(bh);
+		if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+			err = ext4_jbd2_file_inode(handle, inode);
+	}
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zero'd must
+ * be contained with in one block.  If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
+ */
+static int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length)
+{
+	struct inode *inode = mapping->host;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	unsigned max = blocksize - (offset & (blocksize - 1));
+
+	/*
+	 * correct length if it does not fall between
+	 * 'from' and the end of the block
+	 */
+	if (length > max || length < 0)
+		length = max;
+
+	if (IS_DAX(inode))
+		return dax_zero_page_range(inode, from, length, ext4_get_block);
+	return __ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+static int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from)
+{
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length;
+	unsigned blocksize;
+	struct inode *inode = mapping->host;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+
+	return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned partial_start, partial_end;
+	ext4_fsblk_t start, end;
+	loff_t byte_end = (lstart + length - 1);
+	int err = 0;
+
+	partial_start = lstart & (sb->s_blocksize - 1);
+	partial_end = byte_end & (sb->s_blocksize - 1);
+
+	start = lstart >> sb->s_blocksize_bits;
+	end = byte_end >> sb->s_blocksize_bits;
+
+	/* Handle partial zero within the single block */
+	if (start == end &&
+	    (partial_start || (partial_end != sb->s_blocksize - 1))) {
+		err = ext4_block_zero_page_range(handle, mapping,
+						 lstart, length);
+		return err;
+	}
+	/* Handle partial zero out on the start of the range */
+	if (partial_start) {
+		err = ext4_block_zero_page_range(handle, mapping,
+						 lstart, sb->s_blocksize);
+		if (err)
+			return err;
+	}
+	/* Handle partial zero out on the end of the range */
+	if (partial_end != sb->s_blocksize - 1)
+		err = ext4_block_zero_page_range(handle, mapping,
+						 byte_end - partial_end,
+						 partial_end + 1);
+	return err;
+}
+
+int ext4_can_truncate(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode))
+		return 1;
+	if (S_ISDIR(inode->i_mode))
+		return 1;
+	if (S_ISLNK(inode->i_mode))
+		return !ext4_inode_is_fast_symlink(inode);
+	return 0;
+}
+
+/*
+ * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * associated with the given offset and length
+ *
+ * @inode:  File inode
+ * @offset: The offset where the hole will begin
+ * @len:    The length of the hole
+ *
+ * Returns: 0 on success or negative on failure
+ */
+
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t first_block, stop_block;
+	struct address_space *mapping = inode->i_mapping;
+	loff_t first_block_offset, last_block_offset;
+	handle_t *handle;
+	unsigned int credits;
+	int ret = 0;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	trace_ext4_punch_hole(inode, offset, length, 0);
+
+	/*
+	 * Write out all dirty pages to avoid race conditions
+	 * Then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		ret = filemap_write_and_wait_range(mapping, offset,
+						   offset + length - 1);
+		if (ret)
+			return ret;
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	/* No need to punch hole beyond i_size */
+	if (offset >= inode->i_size)
+		goto out_mutex;
+
+	/*
+	 * If the hole extends beyond i_size, set the hole
+	 * to end after the page that contains i_size
+	 */
+	if (offset + length > inode->i_size) {
+		length = inode->i_size +
+		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+		   offset;
+	}
+
+	if (offset & (sb->s_blocksize - 1) ||
+	    (offset + length) & (sb->s_blocksize - 1)) {
+		/*
+		 * Attach jinode to inode for jbd2 if we do any zeroing of
+		 * partial block
+		 */
+		ret = ext4_inode_attach_jinode(inode);
+		if (ret < 0)
+			goto out_mutex;
+
+	}
+
+	first_block_offset = round_up(offset, sb->s_blocksize);
+	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+
+	/* Now release the pages and zero block aligned part of pages*/
+	if (last_block_offset > first_block_offset)
+		truncate_pagecache_range(inode, first_block_offset,
+					 last_block_offset);
+
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		credits = ext4_writepage_trans_blocks(inode);
+	else
+		credits = ext4_blocks_for_truncate(inode);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		ext4_std_error(sb, ret);
+		goto out_dio;
+	}
+
+	ret = ext4_zero_partial_blocks(handle, inode, offset,
+				       length);
+	if (ret)
+		goto out_stop;
+
+	first_block = (offset + sb->s_blocksize - 1) >>
+		EXT4_BLOCK_SIZE_BITS(sb);
+	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	/* If there are no blocks to remove, return now */
+	if (first_block >= stop_block)
+		goto out_stop;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	ret = ext4_es_remove_extent(inode, first_block,
+				    stop_block - first_block);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ret = ext4_ext_remove_space(inode, first_block,
+					    stop_block - 1);
+	else
+		ret = ext4_ind_remove_space(handle, inode, first_block,
+					    stop_block);
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+	/* Now release the pages again to reduce race window */
+	if (last_block_offset > first_block_offset)
+		truncate_pagecache_range(inode, first_block_offset,
+					 last_block_offset);
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+out_stop:
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+int ext4_inode_attach_jinode(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct jbd2_inode *jinode;
+
+	if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
+		return 0;
+
+	jinode = jbd2_alloc_inode(GFP_KERNEL);
+	spin_lock(&inode->i_lock);
+	if (!ei->jinode) {
+		if (!jinode) {
+			spin_unlock(&inode->i_lock);
+			return -ENOMEM;
+		}
+		ei->jinode = jinode;
+		jbd2_journal_init_jbd_inode(ei->jinode, inode);
+		jinode = NULL;
+	}
+	spin_unlock(&inode->i_lock);
+	if (unlikely(jinode != NULL))
+		jbd2_free_inode(jinode);
+	return 0;
+}
+
+/*
+ * ext4_truncate()
+ *
+ * We block out ext4_get_block() block instantiations across the entire
+ * transaction, and VFS/VM ensures that ext4_truncate() cannot run
+ * simultaneously on behalf of the same inode.
+ *
+ * As we work through the truncate and commit bits of it to the journal there
+ * is one core, guiding principle: the file's tree must always be consistent on
+ * disk.  We must be able to restart the truncate after a crash.
+ *
+ * The file's tree may be transiently inconsistent in memory (although it
+ * probably isn't), but whenever we close off and commit a journal transaction,
+ * the contents of (the filesystem + the journal) must be consistent and
+ * restartable.  It's pretty simple, really: bottom up, right to left (although
+ * left-to-right works OK too).
+ *
+ * Note that at recovery time, journal replay occurs *before* the restart of
+ * truncate against the orphan inode list.
+ *
+ * The committed inode has the new, desired i_size (which is the same as
+ * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
+ * that this inode's truncate did not complete and it will again call
+ * ext4_truncate() to have another go.  So there will be instantiated blocks
+ * to the right of the truncation point in a crashed ext4 filesystem.  But
+ * that's fine - as long as they are linked from the inode, the post-crash
+ * ext4_truncate() run will find them and release them.
+ */
+void ext4_truncate(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int credits;
+	handle_t *handle;
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * There is a possibility that we're either freeing the inode
+	 * or it's a completely new inode. In those cases we might not
+	 * have i_mutex locked because it's not necessary.
+	 */
+	if (!(inode->i_state & (I_NEW|I_FREEING)))
+		WARN_ON(!mutex_is_locked(&inode->i_mutex));
+	trace_ext4_truncate_enter(inode);
+
+	if (!ext4_can_truncate(inode))
+		return;
+
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+
+	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline = 1;
+
+		ext4_inline_data_truncate(inode, &has_inline);
+		if (has_inline)
+			return;
+	}
+
+	/* If we zero-out tail of the page, we have to create jinode for jbd2 */
+	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
+		if (ext4_inode_attach_jinode(inode) < 0)
+			return;
+	}
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		credits = ext4_writepage_trans_blocks(inode);
+	else
+		credits = ext4_blocks_for_truncate(inode);
+
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ext4_std_error(inode->i_sb, PTR_ERR(handle));
+		return;
+	}
+
+	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+		ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+	/*
+	 * We add the inode to the orphan list, so that if this
+	 * truncate spans multiple transactions, and we crash, we will
+	 * resume the truncate when the filesystem recovers.  It also
+	 * marks the inode dirty, to catch the new size.
+	 *
+	 * Implication: the file must always be in a sane, consistent
+	 * truncatable state while each transaction commits.
+	 */
+	if (ext4_orphan_add(handle, inode))
+		goto out_stop;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+
+	ext4_discard_preallocations(inode);
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ext4_ext_truncate(handle, inode);
+	else
+		ext4_ind_truncate(handle, inode);
+
+	up_write(&ei->i_data_sem);
+
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+out_stop:
+	/*
+	 * If this was a simple ftruncate() and the file will remain alive,
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext4_evict_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
+
+	trace_ext4_truncate_exit(inode);
+}
+
+/*
+ * ext4_get_inode_loc returns with an extra refcount against the inode's
+ * underlying buffer_head on success. If 'in_mem' is true, we have all
+ * data in memory that is needed to recreate the on-disk version of this
+ * inode.
+ */
+static int __ext4_get_inode_loc(struct inode *inode,
+				struct ext4_iloc *iloc, int in_mem)
+{
+	struct ext4_group_desc	*gdp;
+	struct buffer_head	*bh;
+	struct super_block	*sb = inode->i_sb;
+	ext4_fsblk_t		block;
+	int			inodes_per_block, inode_offset;
+
+	iloc->bh = NULL;
+	if (!ext4_valid_inum(sb, inode->i_ino))
+		return -EIO;
+
+	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
+		return -EIO;
+
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	inode_offset = ((inode->i_ino - 1) %
+			EXT4_INODES_PER_GROUP(sb));
+	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	bh = sb_getblk(sb, block);
+	if (unlikely(!bh))
+		return -ENOMEM;
+	if (!buffer_uptodate(bh)) {
+		lock_buffer(bh);
+
+		/*
+		 * If the buffer has the write error flag, we have failed
+		 * to write out another inode in the same block.  In this
+		 * case, we don't have to read the block because we may
+		 * read the old inode data successfully.
+		 */
+		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+			set_buffer_uptodate(bh);
+
+		if (buffer_uptodate(bh)) {
+			/* someone brought it uptodate while we waited */
+			unlock_buffer(bh);
+			goto has_buffer;
+		}
+
+		/*
+		 * If we have all information of the inode in memory and this
+		 * is the only valid inode in the block, we need not read the
+		 * block.
+		 */
+		if (in_mem) {
+			struct buffer_head *bitmap_bh;
+			int i, start;
+
+			start = inode_offset & ~(inodes_per_block - 1);
+
+			/* Is the inode bitmap in cache? */
+			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
+			if (unlikely(!bitmap_bh))
+				goto make_io;
+
+			/*
+			 * If the inode bitmap isn't in cache then the
+			 * optimisation may end up performing two reads instead
+			 * of one, so skip it.
+			 */
+			if (!buffer_uptodate(bitmap_bh)) {
+				brelse(bitmap_bh);
+				goto make_io;
+			}
+			for (i = start; i < start + inodes_per_block; i++) {
+				if (i == inode_offset)
+					continue;
+				if (ext4_test_bit(i, bitmap_bh->b_data))
+					break;
+			}
+			brelse(bitmap_bh);
+			if (i == start + inodes_per_block) {
+				/* all other inodes are free, so skip I/O */
+				memset(bh->b_data, 0, bh->b_size);
+				set_buffer_uptodate(bh);
+				unlock_buffer(bh);
+				goto has_buffer;
+			}
+		}
+
+make_io:
+		/*
+		 * If we need to do any I/O, try to pre-readahead extra
+		 * blocks from the inode table.
+		 */
+		if (EXT4_SB(sb)->s_inode_readahead_blks) {
+			ext4_fsblk_t b, end, table;
+			unsigned num;
+			__u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
+
+			table = ext4_inode_table(sb, gdp);
+			/* s_inode_readahead_blks is always a power of 2 */
+			b = block & ~((ext4_fsblk_t) ra_blks - 1);
+			if (table > b)
+				b = table;
+			end = b + ra_blks;
+			num = EXT4_INODES_PER_GROUP(sb);
+			if (ext4_has_group_desc_csum(sb))
+				num -= ext4_itable_unused_count(sb, gdp);
+			table += num / inodes_per_block;
+			if (end > table)
+				end = table;
+			while (b <= end)
+				sb_breadahead(sb, b++);
+		}
+
+		/*
+		 * There are other valid inodes in the buffer, this inode
+		 * has in-inode xattrs, or we don't have this inode in memory.
+		 * Read the block from disk.
+		 */
+		trace_ext4_load_inode(inode);
+		get_bh(bh);
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ | REQ_META | REQ_PRIO, bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			EXT4_ERROR_INODE_BLOCK(inode, block,
+					       "unable to read itable block");
+			brelse(bh);
+			return -EIO;
+		}
+	}
+has_buffer:
+	iloc->bh = bh;
+	return 0;
+}
+
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+{
+	/* We have all inode data except xattrs in memory here. */
+	return __ext4_get_inode_loc(inode, iloc,
+		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+}
+
+void ext4_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = EXT4_I(inode)->i_flags;
+	unsigned int new_fl = 0;
+
+	if (flags & EXT4_SYNC_FL)
+		new_fl |= S_SYNC;
+	if (flags & EXT4_APPEND_FL)
+		new_fl |= S_APPEND;
+	if (flags & EXT4_IMMUTABLE_FL)
+		new_fl |= S_IMMUTABLE;
+	if (flags & EXT4_NOATIME_FL)
+		new_fl |= S_NOATIME;
+	if (flags & EXT4_DIRSYNC_FL)
+		new_fl |= S_DIRSYNC;
+	if (test_opt(inode->i_sb, DAX))
+		new_fl |= S_DAX;
+	inode_set_flags(inode, new_fl,
+			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
+}
+
+/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
+void ext4_get_inode_flags(struct ext4_inode_info *ei)
+{
+	unsigned int vfs_fl;
+	unsigned long old_fl, new_fl;
+
+	do {
+		vfs_fl = ei->vfs_inode.i_flags;
+		old_fl = ei->i_flags;
+		new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+				EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+				EXT4_DIRSYNC_FL);
+		if (vfs_fl & S_SYNC)
+			new_fl |= EXT4_SYNC_FL;
+		if (vfs_fl & S_APPEND)
+			new_fl |= EXT4_APPEND_FL;
+		if (vfs_fl & S_IMMUTABLE)
+			new_fl |= EXT4_IMMUTABLE_FL;
+		if (vfs_fl & S_NOATIME)
+			new_fl |= EXT4_NOATIME_FL;
+		if (vfs_fl & S_DIRSYNC)
+			new_fl |= EXT4_DIRSYNC_FL;
+	} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
+}
+
+static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
+				  struct ext4_inode_info *ei)
+{
+	blkcnt_t i_blocks ;
+	struct inode *inode = &(ei->vfs_inode);
+	struct super_block *sb = inode->i_sb;
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		/* we are using combined 48 bit field */
+		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
+					le32_to_cpu(raw_inode->i_blocks_lo);
+		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
+			/* i_blocks represent file system block size */
+			return i_blocks  << (inode->i_blkbits - 9);
+		} else {
+			return i_blocks;
+		}
+	} else {
+		return le32_to_cpu(raw_inode->i_blocks_lo);
+	}
+}
+
+static inline void ext4_iget_extra_inode(struct inode *inode,
+					 struct ext4_inode *raw_inode,
+					 struct ext4_inode_info *ei)
+{
+	__le32 *magic = (void *)raw_inode +
+			EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+	if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+		ext4_find_inline_data_nolock(inode);
+	} else
+		EXT4_I(inode)->i_inline_off = 0;
+}
+
+struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+{
+	struct ext4_iloc iloc;
+	struct ext4_inode *raw_inode;
+	struct ext4_inode_info *ei;
+	struct inode *inode;
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+	long ret;
+	int block;
+	uid_t i_uid;
+	gid_t i_gid;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	ei = EXT4_I(inode);
+	iloc.bh = NULL;
+
+	ret = __ext4_get_inode_loc(inode, &iloc, 0);
+	if (ret < 0)
+		goto bad_inode;
+	raw_inode = ext4_raw_inode(&iloc);
+
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+		    EXT4_INODE_SIZE(inode->i_sb)) {
+			EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
+				EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
+				EXT4_INODE_SIZE(inode->i_sb));
+			ret = -EIO;
+			goto bad_inode;
+		}
+	} else
+		ei->i_extra_isize = 0;
+
+	/* Precompute checksum seed for inode metadata */
+	if (ext4_has_metadata_csum(sb)) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+		__u32 csum;
+		__le32 inum = cpu_to_le32(inode->i_ino);
+		__le32 gen = raw_inode->i_generation;
+		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+				   sizeof(inum));
+		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+					      sizeof(gen));
+	}
+
+	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+		EXT4_ERROR_INODE(inode, "checksum invalid");
+		ret = -EIO;
+		goto bad_inode;
+	}
+
+	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
+		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+	}
+	i_uid_write(inode, i_uid);
+	i_gid_write(inode, i_gid);
+	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
+
+	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */
+	ei->i_inline_off = 0;
+	ei->i_dir_start_lookup = 0;
+	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+	/* We now have enough fields to check if the inode was active or not.
+	 * This is needed because nfsd might try to access dead inodes
+	 * the test is that same one that e2fsck uses
+	 * NeilBrown 1999oct15
+	 */
+	if (inode->i_nlink == 0) {
+		if ((inode->i_mode == 0 ||
+		     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+		    ino != EXT4_BOOT_LOADER_INO) {
+			/* this inode is deleted */
+			ret = -ESTALE;
+			goto bad_inode;
+		}
+		/* The only unlinked inodes we let through here have
+		 * valid i_mode and are being read by the orphan
+		 * recovery code: that's fine, we're about to complete
+		 * the process of deleting those.
+		 * OR it is the EXT4_BOOT_LOADER_INO which is
+		 * not initialized on a new filesystem. */
+	}
+	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
+	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+		ei->i_file_acl |=
+			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+	inode->i_size = ext4_isize(raw_inode);
+	ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+	ei->i_reserved_quota = 0;
+#endif
+	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+	ei->i_block_group = iloc.block_group;
+	ei->i_last_alloc_group = ~0;
+	/*
+	 * NOTE! The in-memory inode i_data array is in little-endian order
+	 * even on big-endian machines: we do NOT byteswap the block numbers!
+	 */
+	for (block = 0; block < EXT4_N_BLOCKS; block++)
+		ei->i_data[block] = raw_inode->i_block[block];
+	INIT_LIST_HEAD(&ei->i_orphan);
+
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		transaction_t *transaction;
+		tid_t tid;
+
+		read_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		read_unlock(&journal->j_state_lock);
+		ei->i_sync_tid = tid;
+		ei->i_datasync_tid = tid;
+	}
+
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+		if (ei->i_extra_isize == 0) {
+			/* The extra space is currently unused. Use it. */
+			ei->i_extra_isize = sizeof(struct ext4_inode) -
+					    EXT4_GOOD_OLD_INODE_SIZE;
+		} else {
+			ext4_iget_extra_inode(inode, raw_inode, ei);
+		}
+	}
+
+	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
+	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+		inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+		if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+				inode->i_version |=
+		    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+		}
+	}
+
+	ret = 0;
+	if (ei->i_file_acl &&
+	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+				 ei->i_file_acl);
+		ret = -EIO;
+		goto bad_inode;
+	} else if (!ext4_has_inline_data(inode)) {
+		if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+			if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+			    (S_ISLNK(inode->i_mode) &&
+			     !ext4_inode_is_fast_symlink(inode))))
+				/* Validate extent which is part of inode */
+				ret = ext4_ext_check_inode(inode);
+		} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+			   (S_ISLNK(inode->i_mode) &&
+			    !ext4_inode_is_fast_symlink(inode))) {
+			/* Validate block references which are part of inode */
+			ret = ext4_ind_check_inode(inode);
+		}
+	}
+	if (ret)
+		goto bad_inode;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &ext4_dir_inode_operations;
+		inode->i_fop = &ext4_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (ext4_inode_is_fast_symlink(inode) &&
+		    !ext4_encrypted_inode(inode)) {
+			inode->i_op = &ext4_fast_symlink_inode_operations;
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
+			inode->i_op = &ext4_symlink_inode_operations;
+			ext4_set_aops(inode);
+		}
+	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+		inode->i_op = &ext4_special_inode_operations;
+		if (raw_inode->i_block[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
+		else
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	} else if (ino == EXT4_BOOT_LOADER_INO) {
+		make_bad_inode(inode);
+	} else {
+		ret = -EIO;
+		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+		goto bad_inode;
+	}
+	brelse(iloc.bh);
+	ext4_set_inode_flags(inode);
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	brelse(iloc.bh);
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+
+struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
+{
+	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
+		return ERR_PTR(-EIO);
+	return ext4_iget(sb, ino);
+}
+
+static int ext4_inode_blocks_set(handle_t *handle,
+				struct ext4_inode *raw_inode,
+				struct ext4_inode_info *ei)
+{
+	struct inode *inode = &(ei->vfs_inode);
+	u64 i_blocks = inode->i_blocks;
+	struct super_block *sb = inode->i_sb;
+
+	if (i_blocks <= ~0U) {
+		/*
+		 * i_blocks can be represented in a 32 bit variable
+		 * as multiple of 512 bytes
+		 */
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+		raw_inode->i_blocks_high = 0;
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+		return 0;
+	}
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+		return -EFBIG;
+
+	if (i_blocks <= 0xffffffffffffULL) {
+		/*
+		 * i_blocks can be represented in a 48 bit variable
+		 * as multiple of 512 bytes
+		 */
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+	} else {
+		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+		/* i_block is stored in file system block size */
+		i_blocks = i_blocks >> (inode->i_blkbits - 9);
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+	}
+	return 0;
+}
+
+struct other_inode {
+	unsigned long		orig_ino;
+	struct ext4_inode	*raw_inode;
+};
+
+static int other_inode_match(struct inode * inode, unsigned long ino,
+			     void *data)
+{
+	struct other_inode *oi = (struct other_inode *) data;
+
+	if ((inode->i_ino != ino) ||
+	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+			       I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+	    ((inode->i_state & I_DIRTY_TIME) == 0))
+		return 0;
+	spin_lock(&inode->i_lock);
+	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+				I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+	    (inode->i_state & I_DIRTY_TIME)) {
+		struct ext4_inode_info	*ei = EXT4_I(inode);
+
+		inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+		spin_unlock(&inode->i_lock);
+
+		spin_lock(&ei->i_raw_lock);
+		EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
+		EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
+		EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
+		ext4_inode_csum_set(inode, oi->raw_inode, ei);
+		spin_unlock(&ei->i_raw_lock);
+		trace_ext4_other_inode_update_time(inode, oi->orig_ino);
+		return -1;
+	}
+	spin_unlock(&inode->i_lock);
+	return -1;
+}
+
+/*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+					  unsigned long orig_ino, char *buf)
+{
+	struct other_inode oi;
+	unsigned long ino;
+	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	int inode_size = EXT4_INODE_SIZE(sb);
+
+	oi.orig_ino = orig_ino;
+	ino = (orig_ino & ~(inodes_per_block - 1)) + 1;
+	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+		if (ino == orig_ino)
+			continue;
+		oi.raw_inode = (struct ext4_inode *) buf;
+		(void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+	}
+}
+
+/*
+ * Post the struct inode info into an on-disk inode location in the
+ * buffer-cache.  This gobbles the caller's reference to the
+ * buffer_head in the inode location struct.
+ *
+ * The caller must have write access to iloc->bh.
+ */
+static int ext4_do_update_inode(handle_t *handle,
+				struct inode *inode,
+				struct ext4_iloc *iloc)
+{
+	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct buffer_head *bh = iloc->bh;
+	struct super_block *sb = inode->i_sb;
+	int err = 0, rc, block;
+	int need_datasync = 0, set_large_file = 0;
+	uid_t i_uid;
+	gid_t i_gid;
+
+	spin_lock(&ei->i_raw_lock);
+
+	/* For fields not tracked in the in-memory inode,
+	 * initialise them to zero for new inodes. */
+	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+
+	ext4_get_inode_flags(ei);
+	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+	i_uid = i_uid_read(inode);
+	i_gid = i_gid_read(inode);
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
+		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
+/*
+ * Fix up interoperability with old kernels. Otherwise, old inodes get
+ * re-used with the upper 16 bits of the uid/gid intact
+ */
+		if (!ei->i_dtime) {
+			raw_inode->i_uid_high =
+				cpu_to_le16(high_16_bits(i_uid));
+			raw_inode->i_gid_high =
+				cpu_to_le16(high_16_bits(i_gid));
+		} else {
+			raw_inode->i_uid_high = 0;
+			raw_inode->i_gid_high = 0;
+		}
+	} else {
+		raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+		raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
+		raw_inode->i_uid_high = 0;
+		raw_inode->i_gid_high = 0;
+	}
+	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
+
+	err = ext4_inode_blocks_set(handle, raw_inode, ei);
+	if (err) {
+		spin_unlock(&ei->i_raw_lock);
+		goto out_brelse;
+	}
+	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
+		raw_inode->i_file_acl_high =
+			cpu_to_le16(ei->i_file_acl >> 32);
+	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+	if (ei->i_disksize != ext4_isize(raw_inode)) {
+		ext4_isize_set(raw_inode, ei->i_disksize);
+		need_datasync = 1;
+	}
+	if (ei->i_disksize > 0x7fffffffULL) {
+		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+				EXT4_SB(sb)->s_es->s_rev_level ==
+		    cpu_to_le32(EXT4_GOOD_OLD_REV))
+			set_large_file = 1;
+	}
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			raw_inode->i_block[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			raw_inode->i_block[1] = 0;
+		} else {
+			raw_inode->i_block[0] = 0;
+			raw_inode->i_block[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			raw_inode->i_block[2] = 0;
+		}
+	} else if (!ext4_has_inline_data(inode)) {
+		for (block = 0; block < EXT4_N_BLOCKS; block++)
+			raw_inode->i_block[block] = ei->i_data[block];
+	}
+
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+		raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+		if (ei->i_extra_isize) {
+			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+				raw_inode->i_version_hi =
+					cpu_to_le32(inode->i_version >> 32);
+			raw_inode->i_extra_isize =
+				cpu_to_le16(ei->i_extra_isize);
+		}
+	}
+	ext4_inode_csum_set(inode, raw_inode, ei);
+	spin_unlock(&ei->i_raw_lock);
+	if (inode->i_sb->s_flags & MS_LAZYTIME)
+		ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
+					      bh->b_data);
+
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
+	if (!err)
+		err = rc;
+	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+	if (set_large_file) {
+		BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
+		err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+		if (err)
+			goto out_brelse;
+		ext4_update_dynamic_rev(sb);
+		EXT4_SET_RO_COMPAT_FEATURE(sb,
+					   EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+		ext4_handle_sync(handle);
+		err = ext4_handle_dirty_super(handle, sb);
+	}
+	ext4_update_inode_fsync_trans(handle, inode, need_datasync);
+out_brelse:
+	brelse(bh);
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * ext4_write_inode()
+ *
+ * We are called from a few places:
+ *
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
+ *   Here, there will be no transaction running. We wait for any running
+ *   transaction to commit.
+ *
+ * - Within flush work (sys_sync(), kupdate and such).
+ *   We wait on commit, if told to.
+ *
+ * - Within iput_final() -> write_inode_now()
+ *   We wait on commit, if told to.
+ *
+ * In all cases it is actually safe for us to return without doing anything,
+ * because the inode has been copied into a raw inode buffer in
+ * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
+ * writeback.
+ *
+ * Note that we are absolutely dependent upon all inode dirtiers doing the
+ * right thing: they *must* call mark_inode_dirty() after dirtying info in
+ * which we are interested.
+ *
+ * It would be a bug for them to not do this.  The code:
+ *
+ *	mark_inode_dirty(inode)
+ *	stuff();
+ *	inode->i_size = expr;
+ *
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost.  Plus the inode will no longer be on the
+ * superblock's dirty inode list.
+ */
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int err;
+
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
+		return 0;
+
+	if (EXT4_SB(inode->i_sb)->s_journal) {
+		if (ext4_journal_current_handle()) {
+			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+			dump_stack();
+			return -EIO;
+		}
+
+		/*
+		 * No need to force transaction in WB_SYNC_NONE mode. Also
+		 * ext4_sync_fs() will force the commit after everything is
+		 * written.
+		 */
+		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
+			return 0;
+
+		err = ext4_force_commit(inode->i_sb);
+	} else {
+		struct ext4_iloc iloc;
+
+		err = __ext4_get_inode_loc(inode, &iloc, 0);
+		if (err)
+			return err;
+		/*
+		 * sync(2) will flush the whole buffer cache. No need to do
+		 * it here separately for each inode.
+		 */
+		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+			sync_dirty_buffer(iloc.bh);
+		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
+					 "IO error syncing inode");
+			err = -EIO;
+		}
+		brelse(iloc.bh);
+	}
+	return err;
+}
+
+/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page stradding i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+	struct page *page;
+	unsigned offset;
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	tid_t commit_tid = 0;
+	int ret;
+
+	offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+	/*
+	 * All buffers in the last page remain valid? Then there's nothing to
+	 * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+	 * blocksize case
+	 */
+	if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+		return;
+	while (1) {
+		page = find_lock_page(inode->i_mapping,
+				      inode->i_size >> PAGE_CACHE_SHIFT);
+		if (!page)
+			return;
+		ret = __ext4_journalled_invalidatepage(page, offset,
+						PAGE_CACHE_SIZE - offset);
+		unlock_page(page);
+		page_cache_release(page);
+		if (ret != -EBUSY)
+			return;
+		commit_tid = 0;
+		read_lock(&journal->j_state_lock);
+		if (journal->j_committing_transaction)
+			commit_tid = journal->j_committing_transaction->t_tid;
+		read_unlock(&journal->j_state_lock);
+		if (commit_tid)
+			jbd2_log_wait_commit(journal, commit_tid);
+	}
+}
+
+/*
+ * ext4_setattr()
+ *
+ * Called from notify_change.
+ *
+ * We want to trap VFS attempts to truncate the file as soon as
+ * possible.  In particular, we want to make sure that when the VFS
+ * shrinks i_size, we put the inode on the orphan list and modify
+ * i_disksize immediately, so that during the subsequent flushing of
+ * dirty pages and freeing of disk blocks, we can guarantee that any
+ * commit will leave the blocks being flushed in an unused state on
+ * disk.  (On recovery, the inode will get truncated and the blocks will
+ * be freed, so we have a strong guarantee that no future commit will
+ * leave these blocks visible to the user.)
+ *
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * we start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
+ */
+int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error, rc = 0;
+	int orphan = 0;
+	const unsigned int ia_valid = attr->ia_valid;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if (is_quota_modification(inode, attr))
+		dquot_initialize(inode);
+	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+		handle_t *handle;
+
+		/* (user+group)*(old+new) structure, inode write (sb,
+		 * inode block, ? - but truncate inode update has it) */
+		handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+			(EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+			 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
+		if (IS_ERR(handle)) {
+			error = PTR_ERR(handle);
+			goto err_out;
+		}
+		error = dquot_transfer(inode, attr);
+		if (error) {
+			ext4_journal_stop(handle);
+			return error;
+		}
+		/* Update corresponding info in inode so that everything is in
+		 * one transaction */
+		if (attr->ia_valid & ATTR_UID)
+			inode->i_uid = attr->ia_uid;
+		if (attr->ia_valid & ATTR_GID)
+			inode->i_gid = attr->ia_gid;
+		error = ext4_mark_inode_dirty(handle, inode);
+		ext4_journal_stop(handle);
+	}
+
+	if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+		handle_t *handle;
+
+		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+			if (attr->ia_size > sbi->s_bitmap_maxbytes)
+				return -EFBIG;
+		}
+
+		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+			inode_inc_iversion(inode);
+
+		if (S_ISREG(inode->i_mode) &&
+		    (attr->ia_size < inode->i_size)) {
+			if (ext4_should_order_data(inode)) {
+				error = ext4_begin_ordered_truncate(inode,
+							    attr->ia_size);
+				if (error)
+					goto err_out;
+			}
+			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+			if (IS_ERR(handle)) {
+				error = PTR_ERR(handle);
+				goto err_out;
+			}
+			if (ext4_handle_valid(handle)) {
+				error = ext4_orphan_add(handle, inode);
+				orphan = 1;
+			}
+			down_write(&EXT4_I(inode)->i_data_sem);
+			EXT4_I(inode)->i_disksize = attr->ia_size;
+			rc = ext4_mark_inode_dirty(handle, inode);
+			if (!error)
+				error = rc;
+			/*
+			 * We have to update i_size under i_data_sem together
+			 * with i_disksize to avoid races with writeback code
+			 * running ext4_wb_update_i_disksize().
+			 */
+			if (!error)
+				i_size_write(inode, attr->ia_size);
+			up_write(&EXT4_I(inode)->i_data_sem);
+			ext4_journal_stop(handle);
+			if (error) {
+				ext4_orphan_del(NULL, inode);
+				goto err_out;
+			}
+		} else {
+			loff_t oldsize = inode->i_size;
+
+			i_size_write(inode, attr->ia_size);
+			pagecache_isize_extended(inode, oldsize, inode->i_size);
+		}
+
+		/*
+		 * Blocks are going to be removed from the inode. Wait
+		 * for dio in flight.  Temporarily disable
+		 * dioread_nolock to prevent livelock.
+		 */
+		if (orphan) {
+			if (!ext4_should_journal_data(inode)) {
+				ext4_inode_block_unlocked_dio(inode);
+				inode_dio_wait(inode);
+				ext4_inode_resume_unlocked_dio(inode);
+			} else
+				ext4_wait_for_tail_page_commit(inode);
+		}
+		/*
+		 * Truncate pagecache after we've waited for commit
+		 * in data=journal mode to make pages freeable.
+		 */
+		truncate_pagecache(inode, inode->i_size);
+	}
+	/*
+	 * We want to call ext4_truncate() even if attr->ia_size ==
+	 * inode->i_size for cases like truncation of fallocated space
+	 */
+	if (attr->ia_valid & ATTR_SIZE)
+		ext4_truncate(inode);
+
+	if (!rc) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+	}
+
+	/*
+	 * If the call to ext4_truncate failed to get a transaction handle at
+	 * all, we need to clean up the in-core orphan list manually.
+	 */
+	if (orphan && inode->i_nlink)
+		ext4_orphan_del(NULL, inode);
+
+	if (!rc && (ia_valid & ATTR_MODE))
+		rc = posix_acl_chmod(inode, inode->i_mode);
+
+err_out:
+	ext4_std_error(inode->i_sb, error);
+	if (!error)
+		error = rc;
+	return error;
+}
+
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct inode *inode;
+	unsigned long long delalloc_blocks;
+
+	inode = d_inode(dentry);
+	generic_fillattr(inode, stat);
+
+	/*
+	 * If there is inline data in the inode, the inode will normally not
+	 * have data blocks allocated (it may have an external xattr block).
+	 * Report at least one sector for such files, so tools like tar, rsync,
+	 * others doen't incorrectly think the file is completely sparse.
+	 */
+	if (unlikely(ext4_has_inline_data(inode)))
+		stat->blocks += (stat->size + 511) >> 9;
+
+	/*
+	 * We can't update i_blocks if the block allocation is delayed
+	 * otherwise in the case of system crash before the real block
+	 * allocation is done, we will have i_blocks inconsistent with
+	 * on-disk file blocks.
+	 * We always keep i_blocks updated together with real
+	 * allocation. But to not confuse with user, stat
+	 * will return the blocks that include the delayed allocation
+	 * blocks for this file.
+	 */
+	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
+				   EXT4_I(inode)->i_reserved_data_blocks);
+	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
+	return 0;
+}
+
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+				   int pextents)
+{
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return ext4_ind_trans_blocks(inode, lblocks);
+	return ext4_ext_index_trans_blocks(inode, pextents);
+}
+
+/*
+ * Account for index blocks, block groups bitmaps and block group
+ * descriptor blocks if modify datablocks and index blocks
+ * worse case, the indexs blocks spread over different block groups
+ *
+ * If datablocks are discontiguous, they are possible to spread over
+ * different block groups too. If they are contiguous, with flexbg,
+ * they could still across block group boundary.
+ *
+ * Also account for superblock, inode, quota and xattr blocks
+ */
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents)
+{
+	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+	int gdpblocks;
+	int idxblocks;
+	int ret = 0;
+
+	/*
+	 * How many index blocks need to touch to map @lblocks logical blocks
+	 * to @pextents physical extents?
+	 */
+	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
+
+	ret = idxblocks;
+
+	/*
+	 * Now let's see how many group bitmaps and group descriptors need
+	 * to account
+	 */
+	groups = idxblocks + pextents;
+	gdpblocks = groups;
+	if (groups > ngroups)
+		groups = ngroups;
+	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
+		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
+
+	/* bitmaps and block group descriptor blocks */
+	ret += groups + gdpblocks;
+
+	/* Blocks for super block, inode, quota and xattr blocks */
+	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
+
+	return ret;
+}
+
+/*
+ * Calculate the total number of credits to reserve to fit
+ * the modification of a single pages into a single transaction,
+ * which may include multiple chunks of block allocations.
+ *
+ * This could be called via ext4_write_begin()
+ *
+ * We need to consider the worse case, when
+ * one new block per extent.
+ */
+int ext4_writepage_trans_blocks(struct inode *inode)
+{
+	int bpp = ext4_journal_blocks_per_page(inode);
+	int ret;
+
+	ret = ext4_meta_trans_blocks(inode, bpp, bpp);
+
+	/* Account for data blocks for journalled mode */
+	if (ext4_should_journal_data(inode))
+		ret += bpp;
+	return ret;
+}
+
+/*
+ * Calculate the journal credits for a chunk of data modification.
+ *
+ * This is called from DIO, fallocate or whoever calling
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
+ *
+ * journal buffers for data blocks are not included here, as DIO
+ * and fallocate do no need to journal data buffers.
+ */
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+	return ext4_meta_trans_blocks(inode, nrblocks, 1);
+}
+
+/*
+ * The caller must have previously called ext4_reserve_inode_write().
+ * Give this, we know that the caller already has write access to iloc->bh.
+ */
+int ext4_mark_iloc_dirty(handle_t *handle,
+			 struct inode *inode, struct ext4_iloc *iloc)
+{
+	int err = 0;
+
+	if (IS_I_VERSION(inode))
+		inode_inc_iversion(inode);
+
+	/* the do_update_inode consumes one bh->b_count */
+	get_bh(iloc->bh);
+
+	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
+	err = ext4_do_update_inode(handle, inode, iloc);
+	put_bh(iloc->bh);
+	return err;
+}
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh.  This _must_ be cleaned up later.
+ */
+
+int
+ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
+			 struct ext4_iloc *iloc)
+{
+	int err;
+
+	err = ext4_get_inode_loc(inode, iloc);
+	if (!err) {
+		BUFFER_TRACE(iloc->bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, iloc->bh);
+		if (err) {
+			brelse(iloc->bh);
+			iloc->bh = NULL;
+		}
+	}
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * Expand an inode by new_extra_isize bytes.
+ * Returns 0 on success or negative error number on failure.
+ */
+static int ext4_expand_extra_isize(struct inode *inode,
+				   unsigned int new_extra_isize,
+				   struct ext4_iloc iloc,
+				   handle_t *handle)
+{
+	struct ext4_inode *raw_inode;
+	struct ext4_xattr_ibody_header *header;
+
+	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
+		return 0;
+
+	raw_inode = ext4_raw_inode(&iloc);
+
+	header = IHDR(inode, raw_inode);
+
+	/* No extended attributes present */
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+			new_extra_isize);
+		EXT4_I(inode)->i_extra_isize = new_extra_isize;
+		return 0;
+	}
+
+	/* try to expand with EAs present */
+	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
+					  raw_inode, handle);
+}
+
+/*
+ * What we do here is to mark the in-core inode as clean with respect to inode
+ * dirtiness (it may still be data-dirty).
+ * This means that the in-core inode may be reaped by prune_icache
+ * without having to perform any I/O.  This is a very good thing,
+ * because *any* task may call prune_icache - even ones which
+ * have a transaction open against a different journal.
+ *
+ * Is this cheating?  Not really.  Sure, we haven't written the
+ * inode out, but prune_icache isn't a user-visible syncing function.
+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
+ * we start and wait on commits.
+ */
+int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+{
+	struct ext4_iloc iloc;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	static unsigned int mnt_count;
+	int err, ret;
+
+	might_sleep();
+	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (ext4_handle_valid(handle) &&
+	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
+		/*
+		 * We need extra buffer credits since we may write into EA block
+		 * with this same handle. If journal_extend fails, then it will
+		 * only result in a minor loss of functionality for that inode.
+		 * If this is felt to be critical, then e2fsck should be run to
+		 * force a large enough s_min_extra_isize.
+		 */
+		if ((jbd2_journal_extend(handle,
+			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+			ret = ext4_expand_extra_isize(inode,
+						      sbi->s_want_extra_isize,
+						      iloc, handle);
+			if (ret) {
+				ext4_set_inode_state(inode,
+						     EXT4_STATE_NO_EXPAND);
+				if (mnt_count !=
+					le16_to_cpu(sbi->s_es->s_mnt_count)) {
+					ext4_warning(inode->i_sb,
+					"Unable to expand inode %lu. Delete"
+					" some EAs or run e2fsck.",
+					inode->i_ino);
+					mnt_count =
+					  le16_to_cpu(sbi->s_es->s_mnt_count);
+				}
+			}
+		}
+	}
+	if (!err)
+		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	return err;
+}
+
+/*
+ * ext4_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We're really interested in the case where a file is being extended.
+ * i_size has been changed by generic_commit_write() and we thus need
+ * to include the updated inode in the current transaction.
+ *
+ * Also, dquot_alloc_block() will always dirty the inode when blocks
+ * are allocated to the file.
+ *
+ * If the inode is marked synchronous, we don't honour that here - doing
+ * so would cause a commit on atime updates, which we don't bother doing.
+ * We handle synchronous inodes at the highest possible level.
+ *
+ * If only the I_DIRTY_TIME flag is set, we can skip everything.  If
+ * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
+ * to copy into the on-disk inode structure are the timestamp files.
+ */
+void ext4_dirty_inode(struct inode *inode, int flags)
+{
+	handle_t *handle;
+
+	if (flags == I_DIRTY_TIME)
+		return;
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle))
+		goto out;
+
+	ext4_mark_inode_dirty(handle, inode);
+
+	ext4_journal_stop(handle);
+out:
+	return;
+}
+
+#if 0
+/*
+ * Bind an inode's backing buffer_head into this transaction, to prevent
+ * it from being flushed to disk early.  Unlike
+ * ext4_reserve_inode_write, this leaves behind no bh reference and
+ * returns no iloc structure, so the caller needs to repeat the iloc
+ * lookup to mark the inode dirty later.
+ */
+static int ext4_pin_inode(handle_t *handle, struct inode *inode)
+{
+	struct ext4_iloc iloc;
+
+	int err = 0;
+	if (handle) {
+		err = ext4_get_inode_loc(inode, &iloc);
+		if (!err) {
+			BUFFER_TRACE(iloc.bh, "get_write_access");
+			err = jbd2_journal_get_write_access(handle, iloc.bh);
+			if (!err)
+				err = ext4_handle_dirty_metadata(handle,
+								 NULL,
+								 iloc.bh);
+			brelse(iloc.bh);
+		}
+	}
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+#endif
+
+int ext4_change_inode_journal_flag(struct inode *inode, int val)
+{
+	journal_t *journal;
+	handle_t *handle;
+	int err;
+
+	/*
+	 * We have to be very careful here: changing a data block's
+	 * journaling status dynamically is dangerous.  If we write a
+	 * data block to the journal, change the status and then delete
+	 * that block, we risk forgetting to revoke the old log record
+	 * from the journal and so a subsequent replay can corrupt data.
+	 * So, first we make sure that the journal is empty and that
+	 * nobody is changing anything.
+	 */
+
+	journal = EXT4_JOURNAL(inode);
+	if (!journal)
+		return 0;
+	if (is_journal_aborted(journal))
+		return -EROFS;
+	/* We have to allocate physical blocks for delalloc blocks
+	 * before flushing journal. otherwise delalloc blocks can not
+	 * be allocated any more. even more truncate on delalloc blocks
+	 * could trigger BUG by flushing delalloc blocks in journal.
+	 * There is no delalloc block in non-journal data mode.
+	 */
+	if (val && test_opt(inode->i_sb, DELALLOC)) {
+		err = ext4_alloc_da_blocks(inode);
+		if (err < 0)
+			return err;
+	}
+
+	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	jbd2_journal_lock_updates(journal);
+
+	/*
+	 * OK, there are no updates running now, and all cached data is
+	 * synced to disk.  We are now in a completely consistent state
+	 * which doesn't have anything in the journal, and we know that
+	 * no filesystem updates are running, so it is safe to modify
+	 * the inode's in-core data-journaling state flag now.
+	 */
+
+	if (val)
+		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+	else {
+		err = jbd2_journal_flush(journal);
+		if (err < 0) {
+			jbd2_journal_unlock_updates(journal);
+			ext4_inode_resume_unlocked_dio(inode);
+			return err;
+		}
+		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+	}
+	ext4_set_aops(inode);
+
+	jbd2_journal_unlock_updates(journal);
+	ext4_inode_resume_unlocked_dio(inode);
+
+	/* Finally we can mark the inode as dirty. */
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	ext4_handle_sync(handle);
+	ext4_journal_stop(handle);
+	ext4_std_error(inode->i_sb, err);
+
+	return err;
+}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+	return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	loff_t size;
+	unsigned long len;
+	int ret;
+	struct file *file = vma->vm_file;
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	handle_t *handle;
+	get_block_t *get_block;
+	int retries = 0;
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	/* Delalloc case is easy... */
+	if (test_opt(inode->i_sb, DELALLOC) &&
+	    !ext4_should_journal_data(inode) &&
+	    !ext4_nonda_switch(inode->i_sb)) {
+		do {
+			ret = __block_page_mkwrite(vma, vmf,
+						   ext4_da_get_block_prep);
+		} while (ret == -ENOSPC &&
+		       ext4_should_retry_alloc(inode->i_sb, &retries));
+		goto out_ret;
+	}
+
+	lock_page(page);
+	size = i_size_read(inode);
+	/* Page got truncated from under us? */
+	if (page->mapping != mapping || page_offset(page) > size) {
+		unlock_page(page);
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+	/*
+	 * Return if we have all the buffers mapped. This avoids the need to do
+	 * journal_start/journal_stop which can block and take a long time
+	 */
+	if (page_has_buffers(page)) {
+		if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+					    0, len, NULL,
+					    ext4_bh_unmapped)) {
+			/* Wait so that we don't change page under IO */
+			wait_for_stable_page(page);
+			ret = VM_FAULT_LOCKED;
+			goto out;
+		}
+	}
+	unlock_page(page);
+	/* OK, we need to fill the hole... */
+	if (ext4_should_dioread_nolock(inode))
+		get_block = ext4_get_block_write;
+	else
+		get_block = ext4_get_block;
+retry_alloc:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out;
+	}
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	if (!ret && ext4_should_journal_data(inode)) {
+		if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
+			  PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+			unlock_page(page);
+			ret = VM_FAULT_SIGBUS;
+			ext4_journal_stop(handle);
+			goto out;
+		}
+		ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+	}
+	ext4_journal_stop(handle);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry_alloc;
+out_ret:
+	ret = block_page_mkwrite_return(ret);
+out:
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+}
diff --git a/kernel/fs/ext4/ioctl.c b/kernel/fs/ext4/ioctl.c
new file mode 100644
index 000000000..2cb9e178d
--- /dev/null
+++ b/kernel/fs/ext4/ioctl.c
@@ -0,0 +1,773 @@
+/*
+ * linux/fs/ext4/ioctl.c
+ *
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/fs.h>
+#include <linux/capability.h>
+#include <linux/time.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/random.h>
+#include <asm/uaccess.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+
+#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+
+/**
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a:          pointer to first memory area
+ * @b:          pointer to second memory area
+ * @len:        number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+	unsigned char *ap, *bp;
+	unsigned char tmp;
+
+	ap = (unsigned char *)a;
+	bp = (unsigned char *)b;
+	while (len-- > 0) {
+		tmp = *ap;
+		*ap = *bp;
+		*bp = tmp;
+		ap++;
+		bp++;
+	}
+}
+
+/**
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure, that calling this method twice
+ * will revert all changes.
+ *
+ * @inode1:     pointer to first inode
+ * @inode2:     pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+	loff_t isize;
+	struct ext4_inode_info *ei1;
+	struct ext4_inode_info *ei2;
+
+	ei1 = EXT4_I(inode1);
+	ei2 = EXT4_I(inode2);
+
+	memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+	memswap(&inode1->i_version, &inode2->i_version,
+		  sizeof(inode1->i_version));
+	memswap(&inode1->i_blocks, &inode2->i_blocks,
+		  sizeof(inode1->i_blocks));
+	memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+	memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+	memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+	memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+	memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+	memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+	ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+	ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+
+	isize = i_size_read(inode1);
+	i_size_write(inode1, i_size_read(inode2));
+	i_size_write(inode2, isize);
+}
+
+/**
+ * Swap the information from the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb:         the super block of the filesystem
+ * @inode:      the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+				struct inode *inode)
+{
+	handle_t *handle;
+	int err;
+	struct inode *inode_bl;
+	struct ext4_inode_info *ei_bl;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+	if (IS_ERR(inode_bl))
+		return PTR_ERR(inode_bl);
+	ei_bl = EXT4_I(inode_bl);
+
+	filemap_flush(inode->i_mapping);
+	filemap_flush(inode_bl->i_mapping);
+
+	/* Protect orig inodes against a truncate and make sure,
+	 * that only 1 swap_inode_boot_loader is running. */
+	lock_two_nondirectories(inode, inode_bl);
+
+	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages(&inode_bl->i_data, 0);
+
+	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(inode);
+	ext4_inode_block_unlocked_dio(inode_bl);
+	inode_dio_wait(inode);
+	inode_dio_wait(inode_bl);
+
+	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+	if (IS_ERR(handle)) {
+		err = -EINVAL;
+		goto journal_err_out;
+	}
+
+	/* Protect extent tree against block allocations via delalloc */
+	ext4_double_down_write_data_sem(inode, inode_bl);
+
+	if (inode_bl->i_nlink == 0) {
+		/* this inode has never been used as a BOOT_LOADER */
+		set_nlink(inode_bl, 1);
+		i_uid_write(inode_bl, 0);
+		i_gid_write(inode_bl, 0);
+		inode_bl->i_flags = 0;
+		ei_bl->i_flags = 0;
+		inode_bl->i_version = 1;
+		i_size_write(inode_bl, 0);
+		inode_bl->i_mode = S_IFREG;
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+					      EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+			ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+			ext4_ext_tree_init(handle, inode_bl);
+		} else
+			memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+	}
+
+	swap_inode_data(inode, inode_bl);
+
+	inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	inode_bl->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+
+	ext4_discard_preallocations(inode);
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err < 0) {
+		ext4_warning(inode->i_sb,
+			"couldn't mark inode #%lu dirty (err %d)",
+			inode->i_ino, err);
+		/* Revert all changes: */
+		swap_inode_data(inode, inode_bl);
+	} else {
+		err = ext4_mark_inode_dirty(handle, inode_bl);
+		if (err < 0) {
+			ext4_warning(inode_bl->i_sb,
+				"couldn't mark inode #%lu dirty (err %d)",
+				inode_bl->i_ino, err);
+			/* Revert all changes: */
+			swap_inode_data(inode, inode_bl);
+			ext4_mark_inode_dirty(handle, inode);
+		}
+	}
+	ext4_journal_stop(handle);
+	ext4_double_up_write_data_sem(inode, inode_bl);
+
+journal_err_out:
+	ext4_inode_resume_unlocked_dio(inode);
+	ext4_inode_resume_unlocked_dio(inode_bl);
+	unlock_two_nondirectories(inode, inode_bl);
+	iput(inode_bl);
+	return err;
+}
+
+static int uuid_is_zero(__u8 u[16])
+{
+	int	i;
+
+	for (i = 0; i < 16; i++)
+		if (u[i])
+			return 0;
+	return 1;
+}
+
+long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct super_block *sb = inode->i_sb;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int flags;
+
+	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
+
+	switch (cmd) {
+	case EXT4_IOC_GETFLAGS:
+		ext4_get_inode_flags(ei);
+		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *) arg);
+	case EXT4_IOC_SETFLAGS: {
+		handle_t *handle = NULL;
+		int err, migrate = 0;
+		struct ext4_iloc iloc;
+		unsigned int oldflags, mask, i;
+		unsigned int jflag;
+
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+
+		flags = ext4_mask_flags(inode->i_mode, flags);
+
+		err = -EPERM;
+		mutex_lock(&inode->i_mutex);
+		/* Is it quota file? Do not allow user to mess with it */
+		if (IS_NOQUOTA(inode))
+			goto flags_out;
+
+		oldflags = ei->i_flags;
+
+		/* The JOURNAL_DATA flag is modifiable only by root */
+		jflag = flags & EXT4_JOURNAL_DATA_FL;
+
+		/*
+		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+		 * the relevant capability.
+		 *
+		 * This test looks nicer. Thanks to Pauline Middelink
+		 */
+		if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
+			if (!capable(CAP_LINUX_IMMUTABLE))
+				goto flags_out;
+		}
+
+		/*
+		 * The JOURNAL_DATA flag can only be changed by
+		 * the relevant capability.
+		 */
+		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+			if (!capable(CAP_SYS_RESOURCE))
+				goto flags_out;
+		}
+		if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
+			migrate = 1;
+
+		if (flags & EXT4_EOFBLOCKS_FL) {
+			/* we don't support adding EOFBLOCKS flag */
+			if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+				err = -EOPNOTSUPP;
+				goto flags_out;
+			}
+		} else if (oldflags & EXT4_EOFBLOCKS_FL)
+			ext4_truncate(inode);
+
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto flags_out;
+		}
+		if (IS_SYNC(inode))
+			ext4_handle_sync(handle);
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+		if (err)
+			goto flags_err;
+
+		for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+			if (!(mask & EXT4_FL_USER_MODIFIABLE))
+				continue;
+			if (mask & flags)
+				ext4_set_inode_flag(inode, i);
+			else
+				ext4_clear_inode_flag(inode, i);
+		}
+
+		ext4_set_inode_flags(inode);
+		inode->i_ctime = ext4_current_time(inode);
+
+		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+		ext4_journal_stop(handle);
+		if (err)
+			goto flags_out;
+
+		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+			err = ext4_change_inode_journal_flag(inode, jflag);
+		if (err)
+			goto flags_out;
+		if (migrate) {
+			if (flags & EXT4_EXTENTS_FL)
+				err = ext4_ext_migrate(inode);
+			else
+				err = ext4_ind_migrate(inode);
+		}
+
+flags_out:
+		mutex_unlock(&inode->i_mutex);
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case EXT4_IOC_GETVERSION:
+	case EXT4_IOC_GETVERSION_OLD:
+		return put_user(inode->i_generation, (int __user *) arg);
+	case EXT4_IOC_SETVERSION:
+	case EXT4_IOC_SETVERSION_OLD: {
+		handle_t *handle;
+		struct ext4_iloc iloc;
+		__u32 generation;
+		int err;
+
+		if (!inode_owner_or_capable(inode))
+			return -EPERM;
+
+		if (ext4_has_metadata_csum(inode->i_sb)) {
+			ext4_warning(sb, "Setting inode version is not "
+				     "supported with metadata_csum enabled.");
+			return -ENOTTY;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		if (get_user(generation, (int __user *) arg)) {
+			err = -EFAULT;
+			goto setversion_out;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto unlock_out;
+		}
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+		if (err == 0) {
+			inode->i_ctime = ext4_current_time(inode);
+			inode->i_generation = generation;
+			err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+		}
+		ext4_journal_stop(handle);
+
+unlock_out:
+		mutex_unlock(&inode->i_mutex);
+setversion_out:
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case EXT4_IOC_GROUP_EXTEND: {
+		ext4_fsblk_t n_blocks_count;
+		int err, err2=0;
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+			err = -EFAULT;
+			goto group_extend_out;
+		}
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not supported with bigalloc");
+			err = -EOPNOTSUPP;
+			goto group_extend_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto group_extend_out;
+
+		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+		mnt_drop_write_file(filp);
+group_extend_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
+	case EXT4_IOC_MOVE_EXT: {
+		struct move_extent me;
+		struct fd donor;
+		int err;
+
+		if (!(filp->f_mode & FMODE_READ) ||
+		    !(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(&me,
+			(struct move_extent __user *)arg, sizeof(me)))
+			return -EFAULT;
+		me.moved_len = 0;
+
+		donor = fdget(me.donor_fd);
+		if (!donor.file)
+			return -EBADF;
+
+		if (!(donor.file->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			goto mext_out;
+		}
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online defrag not supported with bigalloc");
+			err = -EOPNOTSUPP;
+			goto mext_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto mext_out;
+
+		err = ext4_move_extents(filp, donor.file, me.orig_start,
+					me.donor_start, me.len, &me.moved_len);
+		mnt_drop_write_file(filp);
+
+		if (copy_to_user((struct move_extent __user *)arg,
+				 &me, sizeof(me)))
+			err = -EFAULT;
+mext_out:
+		fdput(donor);
+		return err;
+	}
+
+	case EXT4_IOC_GROUP_ADD: {
+		struct ext4_new_group_data input;
+		int err, err2=0;
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
+				sizeof(input))) {
+			err = -EFAULT;
+			goto group_add_out;
+		}
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not supported with bigalloc");
+			err = -EOPNOTSUPP;
+			goto group_add_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto group_add_out;
+
+		err = ext4_group_add(sb, &input);
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+		mnt_drop_write_file(filp);
+		if (!err && ext4_has_group_desc_csum(sb) &&
+		    test_opt(sb, INIT_INODE_TABLE))
+			err = ext4_register_li_request(sb, input.group);
+group_add_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
+	case EXT4_IOC_MIGRATE:
+	{
+		int err;
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		/*
+		 * inode_mutex prevent write and truncate on the file.
+		 * Read still goes through. We take i_data_sem in
+		 * ext4_ext_swap_inode_data before we switch the
+		 * inode format to prevent read.
+		 */
+		mutex_lock(&(inode->i_mutex));
+		err = ext4_ext_migrate(inode);
+		mutex_unlock(&(inode->i_mutex));
+		mnt_drop_write_file(filp);
+		return err;
+	}
+
+	case EXT4_IOC_ALLOC_DA_BLKS:
+	{
+		int err;
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		err = ext4_alloc_da_blocks(inode);
+		mnt_drop_write_file(filp);
+		return err;
+	}
+
+	case EXT4_IOC_SWAP_BOOT:
+	{
+		int err;
+		if (!(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		err = swap_inode_boot_loader(sb, inode);
+		mnt_drop_write_file(filp);
+		return err;
+	}
+
+	case EXT4_IOC_RESIZE_FS: {
+		ext4_fsblk_t n_blocks_count;
+		int err = 0, err2 = 0;
+		ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not (yet) supported with bigalloc");
+			return -EOPNOTSUPP;
+		}
+
+		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+				   sizeof(__u64))) {
+			return -EFAULT;
+		}
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto resizefs_out;
+
+		err = ext4_resize_fs(sb, n_blocks_count);
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+		mnt_drop_write_file(filp);
+		if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+		    ext4_has_group_desc_csum(sb) &&
+		    test_opt(sb, INIT_INODE_TABLE))
+			err = ext4_register_li_request(sb, o_group);
+
+resizefs_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
+	case FITRIM:
+	{
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		struct fstrim_range range;
+		int ret = 0;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (!blk_queue_discard(q))
+			return -EOPNOTSUPP;
+
+		if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+		    sizeof(range)))
+			return -EFAULT;
+
+		range.minlen = max((unsigned int)range.minlen,
+				   q->limits.discard_granularity);
+		ret = ext4_trim_fs(sb, &range);
+		if (ret < 0)
+			return ret;
+
+		if (copy_to_user((struct fstrim_range __user *)arg, &range,
+		    sizeof(range)))
+			return -EFAULT;
+
+		return 0;
+	}
+	case EXT4_IOC_PRECACHE_EXTENTS:
+		return ext4_ext_precache(inode);
+	case EXT4_IOC_SET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+		struct ext4_encryption_policy policy;
+		int err = 0;
+
+		if (copy_from_user(&policy,
+				   (struct ext4_encryption_policy __user *)arg,
+				   sizeof(policy))) {
+			err = -EFAULT;
+			goto encryption_policy_out;
+		}
+
+		err = ext4_process_policy(&policy, inode);
+encryption_policy_out:
+		return err;
+#else
+		return -EOPNOTSUPP;
+#endif
+	}
+	case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+		int err, err2;
+		struct ext4_sb_info *sbi = EXT4_SB(sb);
+		handle_t *handle;
+
+		if (!ext4_sb_has_crypto(sb))
+			return -EOPNOTSUPP;
+		if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
+			err = mnt_want_write_file(filp);
+			if (err)
+				return err;
+			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+			if (IS_ERR(handle)) {
+				err = PTR_ERR(handle);
+				goto pwsalt_err_exit;
+			}
+			err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+			if (err)
+				goto pwsalt_err_journal;
+			generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
+			err = ext4_handle_dirty_metadata(handle, NULL,
+							 sbi->s_sbh);
+		pwsalt_err_journal:
+			err2 = ext4_journal_stop(handle);
+			if (err2 && !err)
+				err = err2;
+		pwsalt_err_exit:
+			mnt_drop_write_file(filp);
+			if (err)
+				return err;
+		}
+		if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt,
+				 16))
+			return -EFAULT;
+		return 0;
+	}
+	case EXT4_IOC_GET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+		struct ext4_encryption_policy policy;
+		int err = 0;
+
+		if (!ext4_encrypted_inode(inode))
+			return -ENOENT;
+		err = ext4_get_policy(inode, &policy);
+		if (err)
+			return err;
+		if (copy_to_user((void *)arg, &policy, sizeof(policy)))
+			return -EFAULT;
+		return 0;
+#else
+		return -EOPNOTSUPP;
+#endif
+	}
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case EXT4_IOC32_GETFLAGS:
+		cmd = EXT4_IOC_GETFLAGS;
+		break;
+	case EXT4_IOC32_SETFLAGS:
+		cmd = EXT4_IOC_SETFLAGS;
+		break;
+	case EXT4_IOC32_GETVERSION:
+		cmd = EXT4_IOC_GETVERSION;
+		break;
+	case EXT4_IOC32_SETVERSION:
+		cmd = EXT4_IOC_SETVERSION;
+		break;
+	case EXT4_IOC32_GROUP_EXTEND:
+		cmd = EXT4_IOC_GROUP_EXTEND;
+		break;
+	case EXT4_IOC32_GETVERSION_OLD:
+		cmd = EXT4_IOC_GETVERSION_OLD;
+		break;
+	case EXT4_IOC32_SETVERSION_OLD:
+		cmd = EXT4_IOC_SETVERSION_OLD;
+		break;
+	case EXT4_IOC32_GETRSVSZ:
+		cmd = EXT4_IOC_GETRSVSZ;
+		break;
+	case EXT4_IOC32_SETRSVSZ:
+		cmd = EXT4_IOC_SETRSVSZ;
+		break;
+	case EXT4_IOC32_GROUP_ADD: {
+		struct compat_ext4_new_group_input __user *uinput;
+		struct ext4_new_group_input input;
+		mm_segment_t old_fs;
+		int err;
+
+		uinput = compat_ptr(arg);
+		err = get_user(input.group, &uinput->group);
+		err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+		err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+		err |= get_user(input.inode_table, &uinput->inode_table);
+		err |= get_user(input.blocks_count, &uinput->blocks_count);
+		err |= get_user(input.reserved_blocks,
+				&uinput->reserved_blocks);
+		if (err)
+			return -EFAULT;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+				 (unsigned long) &input);
+		set_fs(old_fs);
+		return err;
+	}
+	case EXT4_IOC_MOVE_EXT:
+	case FITRIM:
+	case EXT4_IOC_RESIZE_FS:
+	case EXT4_IOC_PRECACHE_EXTENTS:
+	case EXT4_IOC_SET_ENCRYPTION_POLICY:
+	case EXT4_IOC_GET_ENCRYPTION_PWSALT:
+	case EXT4_IOC_GET_ENCRYPTION_POLICY:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
diff --git a/kernel/fs/ext4/mballoc.c b/kernel/fs/ext4/mballoc.c
new file mode 100644
index 000000000..8d1e60214
--- /dev/null
+++ b/kernel/fs/ext4/mballoc.c
@@ -0,0 +1,5238 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
+ */
+
+
+/*
+ * mballoc.c contains the multiblocks allocation routines
+ */
+
+#include "ext4_jbd2.h"
+#include "mballoc.h"
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/ext4.h>
+
+#ifdef CONFIG_EXT4_DEBUG
+ushort ext4_mballoc_debug __read_mostly;
+
+module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
+MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
+#endif
+
+/*
+ * MUSTDO:
+ *   - test ext4_ext_search_left() and ext4_ext_search_right()
+ *   - search for metadata in few groups
+ *
+ * TODO v4:
+ *   - normalization should take into account whether file is still open
+ *   - discard preallocations if no free space left (policy?)
+ *   - don't normalize tails
+ *   - quota
+ *   - reservation for superuser
+ *
+ * TODO v3:
+ *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ *   - track min/max extents in each group for better group selection
+ *   - mb_mark_used() may allocate chunk right after splitting buddy
+ *   - tree of groups sorted by number of free blocks
+ *   - error handling
+ */
+
+/*
+ * The allocation request involve request for multiple number of blocks
+ * near to the goal(block) value specified.
+ *
+ * During initialization phase of the allocator we decide to use the
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, which ever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select to use the group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
+ *
+ * The main motivation for having small file use group preallocation is to
+ * ensure that we have small files closer together on the disk.
+ *
+ * First stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains list of prealloc
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
+ *
+ * pa_lstart -> the logical start block for this prealloc space
+ * pa_pstart -> the physical start block for this prealloc space
+ * pa_len    -> length for this prealloc space (in clusters)
+ * pa_free   ->  free space available in this prealloc space (in clusters)
+ *
+ * The inode preallocation space is used looking at the _logical_ start
+ * block. If only the logical file block falls within the range of prealloc
+ * space we will consume the particular prealloc space. This makes sure that
+ * we have contiguous physical blocks representing the file blocks
+ *
+ * The important thing to be noted in case of inode prealloc space is that
+ * we don't modify the values associated to inode prealloc space except
+ * pa_free.
+ *
+ * If we are not able to find blocks in the inode prealloc space and if we
+ * have the group allocation flag set then we look at the locality group
+ * prealloc space. These are per CPU prealloc list represented as
+ *
+ * ext4_sb_info.s_locality_groups[smp_processor_id()]
+ *
+ * The reason for having a per cpu locality group is to reduce the contention
+ * between CPUs. It is possible to get scheduled at this point.
+ *
+ * The locality group prealloc space is used looking at whether we have
+ * enough free space (pa_free) within the prealloc space.
+ *
+ * If we can't allocate blocks via inode prealloc or/and locality group
+ * prealloc then we look at the buddy cache. The buddy cache is represented
+ * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
+ * mapped to the buddy and bitmap information regarding different
+ * groups. The buddy information is attached to buddy cache inode so that
+ * we can access them through the page cache. The information regarding
+ * each group is loaded via ext4_mb_load_buddy.  The information involve
+ * block bitmap and buddy information. The information are stored in the
+ * inode as:
+ *
+ *  {                        page                        }
+ *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information.  So for each group we
+ * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
+ * blocksize) blocks.  So it can have information regarding groups_per_page
+ * which is blocks_per_page/2
+ *
+ * The buddy cache inode is not stored on disk. The inode is thrown
+ * away when the filesystem is unmounted.
+ *
+ * We look for count number of blocks in the buddy cache. If we were able
+ * to locate that many free blocks we return with additional information
+ * regarding rest of the contiguous physical block available
+ *
+ * Before allocating blocks via buddy cache we normalize the request
+ * blocks. This ensure we ask for more blocks that we needed. The extra
+ * blocks that we get after allocation is added to the respective prealloc
+ * list. In case of inode preallocation we follow a list of heuristics
+ * based on file size. This can be found in ext4_mb_normalize_request. If
+ * we are doing a group prealloc we try to normalize the request to
+ * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
+ * 512 blocks. This can be tuned via
+ * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
+ * terms of number of blocks. If we have mounted the file system with -O
+ * stripe=<value> option the group prealloc request is normalized to the
+ * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * greater than the default mb_group_prealloc.
+ *
+ * The regular allocator (using the buddy cache) supports a few tunables.
+ *
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
+ *
+ * The regular allocator uses buddy scan only if the request len is power of
+ * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
+ * value of s_mb_order2_reqs can be tuned via
+ * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
+ * stripe size (sbi->s_stripe), we try to search for contiguous block in
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not, we search in the specific group using bitmap for best extents. The
+ * tunable min_to_scan and max_to_scan control the behaviour here.
+ * min_to_scan indicate how long the mballoc __must__ look for a best
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
+ * best extent in the found extents. Searching for the blocks starts with
+ * the group specified as the goal value in allocation context via
+ * ac_g_ex. Each group is first checked based on the criteria whether it
+ * can be used for allocation. ext4_mb_good_group explains how the groups are
+ * checked.
+ *
+ * Both the prealloc space are getting populated as above. So for the first
+ * request we will hit the buddy cache which will result in this prealloc
+ * space getting filled. The prealloc space is then later used for the
+ * subsequent request.
+ */
+
+/*
+ * mballoc operates on the following data:
+ *  - on-disk bitmap
+ *  - in-core buddy (actually includes buddy and bitmap)
+ *  - preallocation descriptors (PAs)
+ *
+ * there are two types of preallocations:
+ *  - inode
+ *    assiged to specific inode and can be used for this inode only.
+ *    it describes part of inode's space preallocated to specific
+ *    physical blocks. any block from that preallocated can be used
+ *    independent. the descriptor just tracks number of blocks left
+ *    unused. so, before taking some block from descriptor, one must
+ *    make sure corresponded logical block isn't allocated yet. this
+ *    also means that freeing any block within descriptor's range
+ *    must discard all preallocated blocks.
+ *  - locality group
+ *    assigned to specific locality group which does not translate to
+ *    permanent set of inodes: inode can join and leave group. space
+ *    from this type of preallocation can be used for any inode. thus
+ *    it's consumed from the beginning to the end.
+ *
+ * relation between them can be expressed as:
+ *    in-core buddy = on-disk bitmap + preallocation descriptors
+ *
+ * this mean blocks mballoc considers used are:
+ *  - allocated blocks (persistent)
+ *  - preallocated blocks (non-persistent)
+ *
+ * consistency in mballoc world means that at any time a block is either
+ * free or used in ALL structures. notice: "any time" should not be read
+ * literally -- time is discrete and delimited by locks.
+ *
+ *  to keep it simple, we don't use block numbers, instead we count number of
+ *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
+ *
+ * all operations can be expressed as:
+ *  - init buddy:			buddy = on-disk + PAs
+ *  - new PA:				buddy += N; PA = N
+ *  - use inode PA:			on-disk += N; PA -= N
+ *  - discard inode PA			buddy -= on-disk - PA; PA = 0
+ *  - use locality group PA		on-disk += N; PA -= N
+ *  - discard locality group PA		buddy -= PA; PA = 0
+ *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
+ *        is used in real operation because we can't know actual used
+ *        bits from PA, only from on-disk bitmap
+ *
+ * if we follow this strict logic, then all operations above should be atomic.
+ * given some of them can block, we'd have to use something like semaphores
+ * killing performance on high-end SMP hardware. let's try to relax it using
+ * the following knowledge:
+ *  1) if buddy is referenced, it's already initialized
+ *  2) while block is used in buddy and the buddy is referenced,
+ *     nobody can re-allocate that block
+ *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
+ *     bit set and PA claims same block, it's OK. IOW, one can set bit in
+ *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
+ *     block
+ *
+ * so, now we're building a concurrency table:
+ *  - init buddy vs.
+ *    - new PA
+ *      blocks for PA are allocated in the buddy, buddy must be referenced
+ *      until PA is linked to allocation group to avoid concurrent buddy init
+ *    - use inode PA
+ *      we need to make sure that either on-disk bitmap or PA has uptodate data
+ *      given (3) we care that PA-=N operation doesn't interfere with init
+ *    - discard inode PA
+ *      the simplest way would be to have buddy initialized by the discard
+ *    - use locality group PA
+ *      again PA-=N must be serialized with init
+ *    - discard locality group PA
+ *      the simplest way would be to have buddy initialized by the discard
+ *  - new PA vs.
+ *    - use inode PA
+ *      i_data_sem serializes them
+ *    - discard inode PA
+ *      discard process must wait until PA isn't used by another process
+ *    - use locality group PA
+ *      some mutex should serialize them
+ *    - discard locality group PA
+ *      discard process must wait until PA isn't used by another process
+ *  - use inode PA
+ *    - use inode PA
+ *      i_data_sem or another mutex should serializes them
+ *    - discard inode PA
+ *      discard process must wait until PA isn't used by another process
+ *    - use locality group PA
+ *      nothing wrong here -- they're different PAs covering different blocks
+ *    - discard locality group PA
+ *      discard process must wait until PA isn't used by another process
+ *
+ * now we're ready to make few consequences:
+ *  - PA is referenced and while it is no discard is possible
+ *  - PA is referenced until block isn't marked in on-disk bitmap
+ *  - PA changes only after on-disk bitmap
+ *  - discard must not compete with init. either init is done before
+ *    any discard or they're serialized somehow
+ *  - buddy init as sum of on-disk bitmap and PAs is done atomically
+ *
+ * a special case when we've used PA to emptiness. no need to modify buddy
+ * in this case, but we should care about concurrent init
+ *
+ */
+
+ /*
+ * Logic in few words:
+ *
+ *  - allocation:
+ *    load group
+ *    find blocks
+ *    mark bits in on-disk bitmap
+ *    release group
+ *
+ *  - use preallocation:
+ *    find proper PA (per-inode or group)
+ *    load group
+ *    mark bits in on-disk bitmap
+ *    release group
+ *    release PA
+ *
+ *  - free:
+ *    load group
+ *    mark bits in on-disk bitmap
+ *    release group
+ *
+ *  - discard preallocations in group:
+ *    mark PAs deleted
+ *    move them onto local list
+ *    load on-disk bitmap
+ *    load group
+ *    remove PA from object (inode or locality group)
+ *    mark free blocks in-core
+ *
+ *  - discard inode's preallocations:
+ */
+
+/*
+ * Locking rules
+ *
+ * Locks:
+ *  - bitlock on a group	(group)
+ *  - object (inode/locality)	(object)
+ *  - per-pa lock		(pa)
+ *
+ * Paths:
+ *  - new pa
+ *    object
+ *    group
+ *
+ *  - find and use pa:
+ *    pa
+ *
+ *  - release consumed pa:
+ *    pa
+ *    group
+ *    object
+ *
+ *  - generate in-core bitmap:
+ *    group
+ *        pa
+ *
+ *  - discard all for given object (inode, locality group):
+ *    object
+ *        pa
+ *    group
+ *
+ *  - discard all for given group:
+ *    group
+ *        pa
+ *    group
+ *        object
+ *
+ */
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_data_cachep;
+
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size.  There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+#define NR_GRPINFO_CACHES 8
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
+	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
+	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
+};
+
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+					ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+						ext4_group_t group);
+static void ext4_free_data_callback(struct super_block *sb,
+				struct ext4_journal_cb_entry *jce, int rc);
+
+static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+	*bit += ((unsigned long) addr & 7UL) << 3;
+	addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+	*bit += ((unsigned long) addr & 3UL) << 3;
+	addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+	return addr;
+}
+
+static inline int mb_test_bit(int bit, void *addr)
+{
+	/*
+	 * ext4_test_bit on architecture like powerpc
+	 * needs unsigned long aligned address
+	 */
+	addr = mb_correct_addr_and_bit(&bit, addr);
+	return ext4_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+	addr = mb_correct_addr_and_bit(&bit, addr);
+	ext4_set_bit(bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+	addr = mb_correct_addr_and_bit(&bit, addr);
+	ext4_clear_bit(bit, addr);
+}
+
+static inline int mb_test_and_clear_bit(int bit, void *addr)
+{
+	addr = mb_correct_addr_and_bit(&bit, addr);
+	return ext4_test_and_clear_bit(bit, addr);
+}
+
+static inline int mb_find_next_zero_bit(void *addr, int max, int start)
+{
+	int fix = 0, ret, tmpmax;
+	addr = mb_correct_addr_and_bit(&fix, addr);
+	tmpmax = max + fix;
+	start += fix;
+
+	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+	if (ret > max)
+		return max;
+	return ret;
+}
+
+static inline int mb_find_next_bit(void *addr, int max, int start)
+{
+	int fix = 0, ret, tmpmax;
+	addr = mb_correct_addr_and_bit(&fix, addr);
+	tmpmax = max + fix;
+	start += fix;
+
+	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+	if (ret > max)
+		return max;
+	return ret;
+}
+
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+{
+	char *bb;
+
+	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
+	BUG_ON(max == NULL);
+
+	if (order > e4b->bd_blkbits + 1) {
+		*max = 0;
+		return NULL;
+	}
+
+	/* at order 0 we see each particular block */
+	if (order == 0) {
+		*max = 1 << (e4b->bd_blkbits + 3);
+		return e4b->bd_bitmap;
+	}
+
+	bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
+
+	return bb;
+}
+
+#ifdef DOUBLE_CHECK
+static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
+			   int first, int count)
+{
+	int i;
+	struct super_block *sb = e4b->bd_sb;
+
+	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+		return;
+	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+	for (i = 0; i < count; i++) {
+		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
+			ext4_fsblk_t blocknr;
+
+			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+			blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
+			ext4_grp_locked_error(sb, e4b->bd_group,
+					      inode ? inode->i_ino : 0,
+					      blocknr,
+					      "freeing block already freed "
+					      "(bit %u)",
+					      first + i);
+		}
+		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
+	}
+}
+
+static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
+{
+	int i;
+
+	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+		return;
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
+	for (i = 0; i < count; i++) {
+		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
+		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
+	}
+}
+
+static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
+		unsigned char *b1, *b2;
+		int i;
+		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
+		b2 = (unsigned char *) bitmap;
+		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
+			if (b1[i] != b2[i]) {
+				ext4_msg(e4b->bd_sb, KERN_ERR,
+					 "corruption in group %u "
+					 "at byte %u(%u): %x in copy != %x "
+					 "on disk/prealloc",
+					 e4b->bd_group, i, i * 8, b1[i], b2[i]);
+				BUG();
+			}
+		}
+	}
+}
+
+#else
+static inline void mb_free_blocks_double(struct inode *inode,
+				struct ext4_buddy *e4b, int first, int count)
+{
+	return;
+}
+static inline void mb_mark_used_double(struct ext4_buddy *e4b,
+						int first, int count)
+{
+	return;
+}
+static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+	return;
+}
+#endif
+
+#ifdef AGGRESSIVE_CHECK
+
+#define MB_CHECK_ASSERT(assert)						\
+do {									\
+	if (!(assert)) {						\
+		printk(KERN_EMERG					\
+			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
+			function, file, line, # assert);		\
+		BUG();							\
+	}								\
+} while (0)
+
+static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+				const char *function, int line)
+{
+	struct super_block *sb = e4b->bd_sb;
+	int order = e4b->bd_blkbits + 1;
+	int max;
+	int max2;
+	int i;
+	int j;
+	int k;
+	int count;
+	struct ext4_group_info *grp;
+	int fragments = 0;
+	int fstart;
+	struct list_head *cur;
+	void *buddy;
+	void *buddy2;
+
+	{
+		static int mb_check_counter;
+		if (mb_check_counter++ % 100 != 0)
+			return 0;
+	}
+
+	while (order > 1) {
+		buddy = mb_find_buddy(e4b, order, &max);
+		MB_CHECK_ASSERT(buddy);
+		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
+		MB_CHECK_ASSERT(buddy2);
+		MB_CHECK_ASSERT(buddy != buddy2);
+		MB_CHECK_ASSERT(max * 2 == max2);
+
+		count = 0;
+		for (i = 0; i < max; i++) {
+
+			if (mb_test_bit(i, buddy)) {
+				/* only single bit in buddy2 may be 1 */
+				if (!mb_test_bit(i << 1, buddy2)) {
+					MB_CHECK_ASSERT(
+						mb_test_bit((i<<1)+1, buddy2));
+				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
+					MB_CHECK_ASSERT(
+						mb_test_bit(i << 1, buddy2));
+				}
+				continue;
+			}
+
+			/* both bits in buddy2 must be 1 */
+			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
+			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
+
+			for (j = 0; j < (1 << order); j++) {
+				k = (i * (1 << order)) + j;
+				MB_CHECK_ASSERT(
+					!mb_test_bit(k, e4b->bd_bitmap));
+			}
+			count++;
+		}
+		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
+		order--;
+	}
+
+	fstart = -1;
+	buddy = mb_find_buddy(e4b, 0, &max);
+	for (i = 0; i < max; i++) {
+		if (!mb_test_bit(i, buddy)) {
+			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
+			if (fstart == -1) {
+				fragments++;
+				fstart = i;
+			}
+			continue;
+		}
+		fstart = -1;
+		/* check used bits only */
+		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
+			buddy2 = mb_find_buddy(e4b, j, &max2);
+			k = i >> j;
+			MB_CHECK_ASSERT(k < max2);
+			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
+		}
+	}
+	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
+	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
+
+	grp = ext4_get_group_info(sb, e4b->bd_group);
+	list_for_each(cur, &grp->bb_prealloc_list) {
+		ext4_group_t groupnr;
+		struct ext4_prealloc_space *pa;
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
+		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
+		for (i = 0; i < pa->pa_len; i++)
+			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
+	}
+	return 0;
+}
+#undef MB_CHECK_ASSERT
+#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
+					__FILE__, __func__, __LINE__)
+#else
+#define mb_check_buddy(e4b)
+#endif
+
+/*
+ * Divide blocks started from @first with length @len into
+ * smaller chunks with power of 2 blocks.
+ * Clear the bits in bitmap which the blocks of the chunk(s) covered,
+ * then increase bb_counters[] for corresponded chunk size.
+ */
+static void ext4_mb_mark_free_simple(struct super_block *sb,
+				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
+					struct ext4_group_info *grp)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_grpblk_t min;
+	ext4_grpblk_t max;
+	ext4_grpblk_t chunk;
+	unsigned short border;
+
+	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
+
+	border = 2 << sb->s_blocksize_bits;
+
+	while (len > 0) {
+		/* find how many blocks can be covered since this position */
+		max = ffs(first | border) - 1;
+
+		/* find how many blocks of power 2 we need to mark */
+		min = fls(len) - 1;
+
+		if (max < min)
+			min = max;
+		chunk = 1 << min;
+
+		/* mark multiblock chunks only */
+		grp->bb_counters[min]++;
+		if (min > 0)
+			mb_clear_bit(first >> min,
+				     buddy + sbi->s_mb_offsets[min]);
+
+		len -= chunk;
+		first += chunk;
+	}
+}
+
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+	int i;
+	int bits;
+
+	grp->bb_largest_free_order = -1; /* uninit */
+
+	bits = sb->s_blocksize_bits + 1;
+	for (i = bits; i >= 0; i--) {
+		if (grp->bb_counters[i] > 0) {
+			grp->bb_largest_free_order = i;
+			break;
+		}
+	}
+}
+
+static noinline_for_stack
+void ext4_mb_generate_buddy(struct super_block *sb,
+				void *buddy, void *bitmap, ext4_group_t group)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+	ext4_grpblk_t i = 0;
+	ext4_grpblk_t first;
+	ext4_grpblk_t len;
+	unsigned free = 0;
+	unsigned fragments = 0;
+	unsigned long long period = get_cycles();
+
+	/* initialize buddy from bitmap which is aggregation
+	 * of on-disk bitmap and preallocations */
+	i = mb_find_next_zero_bit(bitmap, max, 0);
+	grp->bb_first_free = i;
+	while (i < max) {
+		fragments++;
+		first = i;
+		i = mb_find_next_bit(bitmap, max, i);
+		len = i - first;
+		free += len;
+		if (len > 1)
+			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
+		else
+			grp->bb_counters[0]++;
+		if (i < max)
+			i = mb_find_next_zero_bit(bitmap, max, i);
+	}
+	grp->bb_fragments = fragments;
+
+	if (free != grp->bb_free) {
+		ext4_grp_locked_error(sb, group, 0, 0,
+				      "block bitmap and bg descriptor "
+				      "inconsistent: %u vs %u free clusters",
+				      free, grp->bb_free);
+		/*
+		 * If we intend to continue, we consider group descriptor
+		 * corrupt and update bb_free using bitmap value
+		 */
+		grp->bb_free = free;
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
+		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+	}
+	mb_set_largest_free_order(sb, grp);
+
+	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+
+	period = get_cycles() - period;
+	spin_lock(&EXT4_SB(sb)->s_bal_lock);
+	EXT4_SB(sb)->s_mb_buddies_generated++;
+	EXT4_SB(sb)->s_mb_generation_time += period;
+	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
+}
+
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+	int count;
+	int order = 1;
+	void *buddy;
+
+	while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+		ext4_set_bits(buddy, 0, count);
+	}
+	e4b->bd_info->bb_fragments = 0;
+	memset(e4b->bd_info->bb_counters, 0,
+		sizeof(*e4b->bd_info->bb_counters) *
+		(e4b->bd_sb->s_blocksize_bits + 2));
+
+	ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+		e4b->bd_bitmap, e4b->bd_group);
+}
+
+/* The buddy information is attached the buddy cache inode
+ * for convenience. The information regarding each group
+ * is loaded via ext4_mb_load_buddy. The information involve
+ * block bitmap and buddy information. The information are
+ * stored in the inode as
+ *
+ * {                        page                        }
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information.
+ * So for each group we take up 2 blocks. A page can
+ * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
+ * So it can have information regarding groups_per_page which
+ * is blocks_per_page/2
+ *
+ * Locking note:  This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
+ */
+
+static int ext4_mb_init_cache(struct page *page, char *incore)
+{
+	ext4_group_t ngroups;
+	int blocksize;
+	int blocks_per_page;
+	int groups_per_page;
+	int err = 0;
+	int i;
+	ext4_group_t first_group, group;
+	int first_block;
+	struct super_block *sb;
+	struct buffer_head *bhs;
+	struct buffer_head **bh = NULL;
+	struct inode *inode;
+	char *data;
+	char *bitmap;
+	struct ext4_group_info *grinfo;
+
+	mb_debug(1, "init page %lu\n", page->index);
+
+	inode = page->mapping->host;
+	sb = inode->i_sb;
+	ngroups = ext4_get_groups_count(sb);
+	blocksize = 1 << inode->i_blkbits;
+	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+
+	groups_per_page = blocks_per_page >> 1;
+	if (groups_per_page == 0)
+		groups_per_page = 1;
+
+	/* allocate buffer_heads to read bitmaps */
+	if (groups_per_page > 1) {
+		i = sizeof(struct buffer_head *) * groups_per_page;
+		bh = kzalloc(i, GFP_NOFS);
+		if (bh == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
+	} else
+		bh = &bhs;
+
+	first_group = page->index * blocks_per_page / 2;
+
+	/* read all groups the page covers into the cache */
+	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+		if (group >= ngroups)
+			break;
+
+		grinfo = ext4_get_group_info(sb, group);
+		/*
+		 * If page is uptodate then we came here after online resize
+		 * which added some new uninitialized group info structs, so
+		 * we must skip all initialized uptodate buddies on the page,
+		 * which may be currently in use by an allocating task.
+		 */
+		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+			bh[i] = NULL;
+			continue;
+		}
+		if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
+			err = -ENOMEM;
+			goto out;
+		}
+		mb_debug(1, "read bitmap for group %u\n", group);
+	}
+
+	/* wait for I/O completion */
+	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+		if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	first_block = page->index * blocks_per_page;
+	for (i = 0; i < blocks_per_page; i++) {
+		group = (first_block + i) >> 1;
+		if (group >= ngroups)
+			break;
+
+		if (!bh[group - first_group])
+			/* skip initialized uptodate buddy */
+			continue;
+
+		/*
+		 * data carry information regarding this
+		 * particular group in the format specified
+		 * above
+		 *
+		 */
+		data = page_address(page) + (i * blocksize);
+		bitmap = bh[group - first_group]->b_data;
+
+		/*
+		 * We place the buddy block and bitmap block
+		 * close together
+		 */
+		if ((first_block + i) & 1) {
+			/* this is block of buddy */
+			BUG_ON(incore == NULL);
+			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
+				group, page->index, i * blocksize);
+			trace_ext4_mb_buddy_bitmap_load(sb, group);
+			grinfo = ext4_get_group_info(sb, group);
+			grinfo->bb_fragments = 0;
+			memset(grinfo->bb_counters, 0,
+			       sizeof(*grinfo->bb_counters) *
+				(sb->s_blocksize_bits+2));
+			/*
+			 * incore got set to the group block bitmap below
+			 */
+			ext4_lock_group(sb, group);
+			/* init the buddy */
+			memset(data, 0xff, blocksize);
+			ext4_mb_generate_buddy(sb, data, incore, group);
+			ext4_unlock_group(sb, group);
+			incore = NULL;
+		} else {
+			/* this is block of bitmap */
+			BUG_ON(incore != NULL);
+			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
+				group, page->index, i * blocksize);
+			trace_ext4_mb_bitmap_load(sb, group);
+
+			/* see comments in ext4_mb_put_pa() */
+			ext4_lock_group(sb, group);
+			memcpy(data, bitmap, blocksize);
+
+			/* mark all preallocated blks used in in-core bitmap */
+			ext4_mb_generate_from_pa(sb, data, group);
+			ext4_mb_generate_from_freelist(sb, data, group);
+			ext4_unlock_group(sb, group);
+
+			/* set incore so that the buddy information can be
+			 * generated using this
+			 */
+			incore = data;
+		}
+	}
+	SetPageUptodate(page);
+
+out:
+	if (bh) {
+		for (i = 0; i < groups_per_page; i++)
+			brelse(bh[i]);
+		if (bh != &bhs)
+			kfree(bh);
+	}
+	return err;
+}
+
+/*
+ * Lock the buddy and bitmap pages. This make sure other parallel init_group
+ * on the same buddy page doesn't happen whild holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
+ */
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+	int block, pnum, poff;
+	int blocks_per_page;
+	struct page *page;
+
+	e4b->bd_buddy_page = NULL;
+	e4b->bd_bitmap_page = NULL;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+	BUG_ON(page->mapping != inode->i_mapping);
+	e4b->bd_bitmap_page = page;
+	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+	if (blocks_per_page >= 2) {
+		/* buddy and bitmap are on the same page */
+		return 0;
+	}
+
+	block++;
+	pnum = block / blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+	BUG_ON(page->mapping != inode->i_mapping);
+	e4b->bd_buddy_page = page;
+	return 0;
+}
+
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
+{
+	if (e4b->bd_bitmap_page) {
+		unlock_page(e4b->bd_bitmap_page);
+		page_cache_release(e4b->bd_bitmap_page);
+	}
+	if (e4b->bd_buddy_page) {
+		unlock_page(e4b->bd_buddy_page);
+		page_cache_release(e4b->bd_buddy_page);
+	}
+}
+
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+	struct ext4_group_info *this_grp;
+	struct ext4_buddy e4b;
+	struct page *page;
+	int ret = 0;
+
+	might_sleep();
+	mb_debug(1, "init group %u\n", group);
+	this_grp = ext4_get_group_info(sb, group);
+	/*
+	 * This ensures that we don't reinit the buddy cache
+	 * page which map to the group from which we are already
+	 * allocating. If we are looking at the buddy cache we would
+	 * have taken a reference using ext4_mb_load_buddy and that
+	 * would have pinned buddy page to page cache.
+	 * The call to ext4_mb_get_buddy_page_lock will mark the
+	 * page accessed.
+	 */
+	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
+		/*
+		 * somebody initialized the group
+		 * return without doing anything
+		 */
+		goto err;
+	}
+
+	page = e4b.bd_bitmap_page;
+	ret = ext4_mb_init_cache(page, NULL);
+	if (ret)
+		goto err;
+	if (!PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+
+	if (e4b.bd_buddy_page == NULL) {
+		/*
+		 * If both the bitmap and buddy are in
+		 * the same page we don't need to force
+		 * init the buddy
+		 */
+		ret = 0;
+		goto err;
+	}
+	/* init buddy cache */
+	page = e4b.bd_buddy_page;
+	ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+	if (ret)
+		goto err;
+	if (!PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+err:
+	ext4_mb_put_buddy_page_lock(&e4b);
+	return ret;
+}
+
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
+static noinline_for_stack int
+ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+					struct ext4_buddy *e4b)
+{
+	int blocks_per_page;
+	int block;
+	int pnum;
+	int poff;
+	struct page *page;
+	int ret;
+	struct ext4_group_info *grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+
+	might_sleep();
+	mb_debug(1, "load group %u\n", group);
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	grp = ext4_get_group_info(sb, group);
+
+	e4b->bd_blkbits = sb->s_blocksize_bits;
+	e4b->bd_info = grp;
+	e4b->bd_sb = sb;
+	e4b->bd_group = group;
+	e4b->bd_buddy_page = NULL;
+	e4b->bd_bitmap_page = NULL;
+
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		/*
+		 * we need full data about the group
+		 * to make a good selection
+		 */
+		ret = ext4_mb_init_group(sb, group);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+
+	/* we could use find_or_create_page(), but it locks page
+	 * what we'd like to avoid in fast path ... */
+	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
+	if (page == NULL || !PageUptodate(page)) {
+		if (page)
+			/*
+			 * drop the page reference and try
+			 * to get the page with lock. If we
+			 * are not uptodate that implies
+			 * somebody just created the page but
+			 * is yet to initialize the same. So
+			 * wait for it to initialize.
+			 */
+			page_cache_release(page);
+		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+		if (page) {
+			BUG_ON(page->mapping != inode->i_mapping);
+			if (!PageUptodate(page)) {
+				ret = ext4_mb_init_cache(page, NULL);
+				if (ret) {
+					unlock_page(page);
+					goto err;
+				}
+				mb_cmp_bitmaps(e4b, page_address(page) +
+					       (poff * sb->s_blocksize));
+			}
+			unlock_page(page);
+		}
+	}
+	if (page == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	if (!PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+
+	/* Pages marked accessed already */
+	e4b->bd_bitmap_page = page;
+	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+
+	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
+	if (page == NULL || !PageUptodate(page)) {
+		if (page)
+			page_cache_release(page);
+		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+		if (page) {
+			BUG_ON(page->mapping != inode->i_mapping);
+			if (!PageUptodate(page)) {
+				ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+				if (ret) {
+					unlock_page(page);
+					goto err;
+				}
+			}
+			unlock_page(page);
+		}
+	}
+	if (page == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	if (!PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+
+	/* Pages marked accessed already */
+	e4b->bd_buddy_page = page;
+	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+
+	BUG_ON(e4b->bd_bitmap_page == NULL);
+	BUG_ON(e4b->bd_buddy_page == NULL);
+
+	return 0;
+
+err:
+	if (page)
+		page_cache_release(page);
+	if (e4b->bd_bitmap_page)
+		page_cache_release(e4b->bd_bitmap_page);
+	if (e4b->bd_buddy_page)
+		page_cache_release(e4b->bd_buddy_page);
+	e4b->bd_buddy = NULL;
+	e4b->bd_bitmap = NULL;
+	return ret;
+}
+
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
+{
+	if (e4b->bd_bitmap_page)
+		page_cache_release(e4b->bd_bitmap_page);
+	if (e4b->bd_buddy_page)
+		page_cache_release(e4b->bd_buddy_page);
+}
+
+
+static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
+{
+	int order = 1;
+	void *bb;
+
+	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
+	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
+
+	bb = e4b->bd_buddy;
+	while (order <= e4b->bd_blkbits + 1) {
+		block = block >> 1;
+		if (!mb_test_bit(block, bb)) {
+			/* this block is part of buddy of order 'order' */
+			return order;
+		}
+		bb += 1 << (e4b->bd_blkbits - order);
+		order++;
+	}
+	return 0;
+}
+
+static void mb_clear_bits(void *bm, int cur, int len)
+{
+	__u32 *addr;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: clear whole word at once */
+			addr = bm + (cur >> 3);
+			*addr = 0;
+			cur += 32;
+			continue;
+		}
+		mb_clear_bit(cur, bm);
+		cur++;
+	}
+}
+
+/* clear bits in given range
+ * will return first found zero bit if any, -1 otherwise
+ */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+	__u32 *addr;
+	int zero_bit = -1;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: clear whole word at once */
+			addr = bm + (cur >> 3);
+			if (*addr != (__u32)(-1) && zero_bit == -1)
+				zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+			*addr = 0;
+			cur += 32;
+			continue;
+		}
+		if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
+			zero_bit = cur;
+		cur++;
+	}
+
+	return zero_bit;
+}
+
+void ext4_set_bits(void *bm, int cur, int len)
+{
+	__u32 *addr;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: set whole word at once */
+			addr = bm + (cur >> 3);
+			*addr = 0xffffffff;
+			cur += 32;
+			continue;
+		}
+		mb_set_bit(cur, bm);
+		cur++;
+	}
+}
+
+/*
+ * _________________________________________________________________ */
+
+static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
+{
+	if (mb_test_bit(*bit + side, bitmap)) {
+		mb_clear_bit(*bit, bitmap);
+		(*bit) -= side;
+		return 1;
+	}
+	else {
+		(*bit) += side;
+		mb_set_bit(*bit, bitmap);
+		return -1;
+	}
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+	int max;
+	int order = 1;
+	void *buddy = mb_find_buddy(e4b, order, &max);
+
+	while (buddy) {
+		void *buddy2;
+
+		/* Bits in range [first; last] are known to be set since
+		 * corresponding blocks were allocated. Bits in range
+		 * (first; last) will stay set because they form buddies on
+		 * upper layer. We just deal with borders if they don't
+		 * align with upper layer and then go up.
+		 * Releasing entire group is all about clearing
+		 * single bit of highest order buddy.
+		 */
+
+		/* Example:
+		 * ---------------------------------
+		 * |   1   |   1   |   1   |   1   |
+		 * ---------------------------------
+		 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+		 * ---------------------------------
+		 *   0   1   2   3   4   5   6   7
+		 *      \_____________________/
+		 *
+		 * Neither [1] nor [6] is aligned to above layer.
+		 * Left neighbour [0] is free, so mark it busy,
+		 * decrease bb_counters and extend range to
+		 * [0; 6]
+		 * Right neighbour [7] is busy. It can't be coaleasced with [6], so
+		 * mark [6] free, increase bb_counters and shrink range to
+		 * [0; 5].
+		 * Then shift range to [0; 2], go up and do the same.
+		 */
+
+
+		if (first & 1)
+			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
+		if (!(last & 1))
+			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
+		if (first > last)
+			break;
+		order++;
+
+		if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+			mb_clear_bits(buddy, first, last - first + 1);
+			e4b->bd_info->bb_counters[order - 1] += last - first + 1;
+			break;
+		}
+		first >>= 1;
+		last >>= 1;
+		buddy = buddy2;
+	}
+}
+
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+			   int first, int count)
+{
+	int left_is_free = 0;
+	int right_is_free = 0;
+	int block;
+	int last = first + count - 1;
+	struct super_block *sb = e4b->bd_sb;
+
+	if (WARN_ON(count == 0))
+		return;
+	BUG_ON(last >= (sb->s_blocksize << 3));
+	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+	/* Don't bother if the block group is corrupt. */
+	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+		return;
+
+	mb_check_buddy(e4b);
+	mb_free_blocks_double(inode, e4b, first, count);
+
+	e4b->bd_info->bb_free += count;
+	if (first < e4b->bd_info->bb_first_free)
+		e4b->bd_info->bb_first_free = first;
+
+	/* access memory sequentially: check left neighbour,
+	 * clear range and then check right neighbour
+	 */
+	if (first != 0)
+		left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+	block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+	if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+		right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
+
+	if (unlikely(block != -1)) {
+		struct ext4_sb_info *sbi = EXT4_SB(sb);
+		ext4_fsblk_t blocknr;
+
+		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+		blocknr += EXT4_C2B(EXT4_SB(sb), block);
+		ext4_grp_locked_error(sb, e4b->bd_group,
+				      inode ? inode->i_ino : 0,
+				      blocknr,
+				      "freeing already freed block "
+				      "(bit %u); block bitmap corrupt.",
+				      block);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   e4b->bd_info->bb_free);
+		/* Mark the block group as corrupt. */
+		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+			&e4b->bd_info->bb_state);
+		mb_regenerate_buddy(e4b);
+		goto done;
+	}
+
+	/* let's maintain fragments counter */
+	if (left_is_free && right_is_free)
+		e4b->bd_info->bb_fragments--;
+	else if (!left_is_free && !right_is_free)
+		e4b->bd_info->bb_fragments++;
+
+	/* buddy[0] == bd_bitmap is a special case, so handle
+	 * it right away and let mb_buddy_mark_free stay free of
+	 * zero order checks.
+	 * Check if neighbours are to be coaleasced,
+	 * adjust bitmap bb_counters and borders appropriately.
+	 */
+	if (first & 1) {
+		first += !left_is_free;
+		e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+	}
+	if (!(last & 1)) {
+		last -= !right_is_free;
+		e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+	}
+
+	if (first <= last)
+		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
+
+done:
+	mb_set_largest_free_order(sb, e4b->bd_info);
+	mb_check_buddy(e4b);
+}
+
+static int mb_find_extent(struct ext4_buddy *e4b, int block,
+				int needed, struct ext4_free_extent *ex)
+{
+	int next = block;
+	int max, order;
+	void *buddy;
+
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
+	BUG_ON(ex == NULL);
+
+	buddy = mb_find_buddy(e4b, 0, &max);
+	BUG_ON(buddy == NULL);
+	BUG_ON(block >= max);
+	if (mb_test_bit(block, buddy)) {
+		ex->fe_len = 0;
+		ex->fe_start = 0;
+		ex->fe_group = 0;
+		return 0;
+	}
+
+	/* find actual order */
+	order = mb_find_order_for_block(e4b, block);
+	block = block >> order;
+
+	ex->fe_len = 1 << order;
+	ex->fe_start = block << order;
+	ex->fe_group = e4b->bd_group;
+
+	/* calc difference from given start */
+	next = next - ex->fe_start;
+	ex->fe_len -= next;
+	ex->fe_start += next;
+
+	while (needed > ex->fe_len &&
+	       mb_find_buddy(e4b, order, &max)) {
+
+		if (block + 1 >= max)
+			break;
+
+		next = (block + 1) * (1 << order);
+		if (mb_test_bit(next, e4b->bd_bitmap))
+			break;
+
+		order = mb_find_order_for_block(e4b, next);
+
+		block = next >> order;
+		ex->fe_len += 1 << order;
+	}
+
+	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
+	return ex->fe_len;
+}
+
+static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
+{
+	int ord;
+	int mlen = 0;
+	int max = 0;
+	int cur;
+	int start = ex->fe_start;
+	int len = ex->fe_len;
+	unsigned ret = 0;
+	int len0 = len;
+	void *buddy;
+
+	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
+	BUG_ON(e4b->bd_group != ex->fe_group);
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
+	mb_check_buddy(e4b);
+	mb_mark_used_double(e4b, start, len);
+
+	e4b->bd_info->bb_free -= len;
+	if (e4b->bd_info->bb_first_free == start)
+		e4b->bd_info->bb_first_free += len;
+
+	/* let's maintain fragments counter */
+	if (start != 0)
+		mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
+	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
+		max = !mb_test_bit(start + len, e4b->bd_bitmap);
+	if (mlen && max)
+		e4b->bd_info->bb_fragments++;
+	else if (!mlen && !max)
+		e4b->bd_info->bb_fragments--;
+
+	/* let's maintain buddy itself */
+	while (len) {
+		ord = mb_find_order_for_block(e4b, start);
+
+		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
+			/* the whole chunk may be allocated at once! */
+			mlen = 1 << ord;
+			buddy = mb_find_buddy(e4b, ord, &max);
+			BUG_ON((start >> ord) >= max);
+			mb_set_bit(start >> ord, buddy);
+			e4b->bd_info->bb_counters[ord]--;
+			start += mlen;
+			len -= mlen;
+			BUG_ON(len < 0);
+			continue;
+		}
+
+		/* store for history */
+		if (ret == 0)
+			ret = len | (ord << 16);
+
+		/* we have to split large buddy */
+		BUG_ON(ord <= 0);
+		buddy = mb_find_buddy(e4b, ord, &max);
+		mb_set_bit(start >> ord, buddy);
+		e4b->bd_info->bb_counters[ord]--;
+
+		ord--;
+		cur = (start >> ord) & ~1U;
+		buddy = mb_find_buddy(e4b, ord, &max);
+		mb_clear_bit(cur, buddy);
+		mb_clear_bit(cur + 1, buddy);
+		e4b->bd_info->bb_counters[ord]++;
+		e4b->bd_info->bb_counters[ord]++;
+	}
+	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
+
+	ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+	mb_check_buddy(e4b);
+
+	return ret;
+}
+
+/*
+ * Must be called under group lock!
+ */
+static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	int ret;
+
+	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
+	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
+	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
+	ret = mb_mark_used(e4b, &ac->ac_b_ex);
+
+	/* preallocation can change ac_b_ex, thus we store actually
+	 * allocated blocks for history */
+	ac->ac_f_ex = ac->ac_b_ex;
+
+	ac->ac_status = AC_STATUS_FOUND;
+	ac->ac_tail = ret & 0xffff;
+	ac->ac_buddy = ret >> 16;
+
+	/*
+	 * take the page reference. We want the page to be pinned
+	 * so that we don't get a ext4_mb_init_cache_call for this
+	 * group until we update the bitmap. That would mean we
+	 * double allocate blocks. The reference is dropped
+	 * in ext4_mb_release_context
+	 */
+	ac->ac_bitmap_page = e4b->bd_bitmap_page;
+	get_page(ac->ac_bitmap_page);
+	ac->ac_buddy_page = e4b->bd_buddy_page;
+	get_page(ac->ac_buddy_page);
+	/* store last allocated for subsequent stream allocation */
+	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
+		spin_lock(&sbi->s_md_lock);
+		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
+		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
+		spin_unlock(&sbi->s_md_lock);
+	}
+}
+
+/*
+ * regular allocator, for general purposes allocation
+ */
+
+static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b,
+					int finish_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_free_extent *bex = &ac->ac_b_ex;
+	struct ext4_free_extent *gex = &ac->ac_g_ex;
+	struct ext4_free_extent ex;
+	int max;
+
+	if (ac->ac_status == AC_STATUS_FOUND)
+		return;
+	/*
+	 * We don't want to scan for a whole year
+	 */
+	if (ac->ac_found > sbi->s_mb_max_to_scan &&
+			!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+		ac->ac_status = AC_STATUS_BREAK;
+		return;
+	}
+
+	/*
+	 * Haven't found good chunk so far, let's continue
+	 */
+	if (bex->fe_len < gex->fe_len)
+		return;
+
+	if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
+			&& bex->fe_group == e4b->bd_group) {
+		/* recheck chunk's availability - we don't know
+		 * when it was found (within this lock-unlock
+		 * period or not) */
+		max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
+		if (max >= gex->fe_len) {
+			ext4_mb_use_best_found(ac, e4b);
+			return;
+		}
+	}
+}
+
+/*
+ * The routine checks whether found extent is good enough. If it is,
+ * then the extent gets marked used and flag is set to the context
+ * to stop scanning. Otherwise, the extent is compared with the
+ * previous found extent and if new one is better, then it's stored
+ * in the context. Later, the best found extent will be used, if
+ * mballoc can't find good enough extent.
+ *
+ * FIXME: real allocation policy is to be designed yet!
+ */
+static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
+					struct ext4_free_extent *ex,
+					struct ext4_buddy *e4b)
+{
+	struct ext4_free_extent *bex = &ac->ac_b_ex;
+	struct ext4_free_extent *gex = &ac->ac_g_ex;
+
+	BUG_ON(ex->fe_len <= 0);
+	BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+	BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
+
+	ac->ac_found++;
+
+	/*
+	 * The special case - take what you catch first
+	 */
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+		*bex = *ex;
+		ext4_mb_use_best_found(ac, e4b);
+		return;
+	}
+
+	/*
+	 * Let's check whether the chuck is good enough
+	 */
+	if (ex->fe_len == gex->fe_len) {
+		*bex = *ex;
+		ext4_mb_use_best_found(ac, e4b);
+		return;
+	}
+
+	/*
+	 * If this is first found extent, just store it in the context
+	 */
+	if (bex->fe_len == 0) {
+		*bex = *ex;
+		return;
+	}
+
+	/*
+	 * If new found extent is better, store it in the context
+	 */
+	if (bex->fe_len < gex->fe_len) {
+		/* if the request isn't satisfied, any found extent
+		 * larger than previous best one is better */
+		if (ex->fe_len > bex->fe_len)
+			*bex = *ex;
+	} else if (ex->fe_len > gex->fe_len) {
+		/* if the request is satisfied, then we try to find
+		 * an extent that still satisfy the request, but is
+		 * smaller than previous one */
+		if (ex->fe_len < bex->fe_len)
+			*bex = *ex;
+	}
+
+	ext4_mb_check_limits(ac, e4b, 0);
+}
+
+static noinline_for_stack
+int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct ext4_free_extent ex = ac->ac_b_ex;
+	ext4_group_t group = ex.fe_group;
+	int max;
+	int err;
+
+	BUG_ON(ex.fe_len <= 0);
+	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
+	if (err)
+		return err;
+
+	ext4_lock_group(ac->ac_sb, group);
+	max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
+
+	if (max > 0) {
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+	}
+
+	ext4_unlock_group(ac->ac_sb, group);
+	ext4_mb_unload_buddy(e4b);
+
+	return 0;
+}
+
+static noinline_for_stack
+int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+				struct ext4_buddy *e4b)
+{
+	ext4_group_t group = ac->ac_g_ex.fe_group;
+	int max;
+	int err;
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+	struct ext4_free_extent ex;
+
+	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+		return 0;
+	if (grp->bb_free == 0)
+		return 0;
+
+	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
+	if (err)
+		return err;
+
+	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
+		ext4_mb_unload_buddy(e4b);
+		return 0;
+	}
+
+	ext4_lock_group(ac->ac_sb, group);
+	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
+			     ac->ac_g_ex.fe_len, &ex);
+	ex.fe_logical = 0xDEADFA11; /* debug value */
+
+	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+		ext4_fsblk_t start;
+
+		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
+			ex.fe_start;
+		/* use do_div to get remainder (would be 64-bit modulo) */
+		if (do_div(start, sbi->s_stripe) == 0) {
+			ac->ac_found++;
+			ac->ac_b_ex = ex;
+			ext4_mb_use_best_found(ac, e4b);
+		}
+	} else if (max >= ac->ac_g_ex.fe_len) {
+		BUG_ON(ex.fe_len <= 0);
+		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
+		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
+		ac->ac_found++;
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
+		/* Sometimes, caller may want to merge even small
+		 * number of blocks to an existing extent */
+		BUG_ON(ex.fe_len <= 0);
+		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
+		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
+		ac->ac_found++;
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+	}
+	ext4_unlock_group(ac->ac_sb, group);
+	ext4_mb_unload_buddy(e4b);
+
+	return 0;
+}
+
+/*
+ * The routine scans buddy structures (not bitmap!) from given order
+ * to max order and tries to find big enough chunk to satisfy the req
+ */
+static noinline_for_stack
+void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_group_info *grp = e4b->bd_info;
+	void *buddy;
+	int i;
+	int k;
+	int max;
+
+	BUG_ON(ac->ac_2order <= 0);
+	for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
+		if (grp->bb_counters[i] == 0)
+			continue;
+
+		buddy = mb_find_buddy(e4b, i, &max);
+		BUG_ON(buddy == NULL);
+
+		k = mb_find_next_zero_bit(buddy, max, 0);
+		BUG_ON(k >= max);
+
+		ac->ac_found++;
+
+		ac->ac_b_ex.fe_len = 1 << i;
+		ac->ac_b_ex.fe_start = k << i;
+		ac->ac_b_ex.fe_group = e4b->bd_group;
+
+		ext4_mb_use_best_found(ac, e4b);
+
+		BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+
+		if (EXT4_SB(sb)->s_mb_stats)
+			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
+
+		break;
+	}
+}
+
+/*
+ * The routine scans the group and measures all found extents.
+ * In order to optimize scanning, caller must pass number of
+ * free blocks in the group, so the routine can know upper limit.
+ */
+static noinline_for_stack
+void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	void *bitmap = e4b->bd_bitmap;
+	struct ext4_free_extent ex;
+	int i;
+	int free;
+
+	free = e4b->bd_info->bb_free;
+	BUG_ON(free <= 0);
+
+	i = e4b->bd_info->bb_first_free;
+
+	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
+		i = mb_find_next_zero_bit(bitmap,
+						EXT4_CLUSTERS_PER_GROUP(sb), i);
+		if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
+			/*
+			 * IF we have corrupt bitmap, we won't find any
+			 * free blocks even though group info says we
+			 * we have free blocks
+			 */
+			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+					"%d free clusters as per "
+					"group info. But bitmap says 0",
+					free);
+			break;
+		}
+
+		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
+		BUG_ON(ex.fe_len <= 0);
+		if (free < ex.fe_len) {
+			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+					"%d free clusters as per "
+					"group info. But got %d blocks",
+					free, ex.fe_len);
+			/*
+			 * The number of free blocks differs. This mostly
+			 * indicate that the bitmap is corrupt. So exit
+			 * without claiming the space.
+			 */
+			break;
+		}
+		ex.fe_logical = 0xDEADC0DE; /* debug value */
+		ext4_mb_measure_extent(ac, &ex, e4b);
+
+		i += ex.fe_len;
+		free -= ex.fe_len;
+	}
+
+	ext4_mb_check_limits(ac, e4b, 1);
+}
+
+/*
+ * This is a special case for storages like raid5
+ * we try to find stripe-aligned chunks for stripe-size-multiple requests
+ */
+static noinline_for_stack
+void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+				 struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	void *bitmap = e4b->bd_bitmap;
+	struct ext4_free_extent ex;
+	ext4_fsblk_t first_group_block;
+	ext4_fsblk_t a;
+	ext4_grpblk_t i;
+	int max;
+
+	BUG_ON(sbi->s_stripe == 0);
+
+	/* find first stripe-aligned block in group */
+	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
+
+	a = first_group_block + sbi->s_stripe - 1;
+	do_div(a, sbi->s_stripe);
+	i = (a * sbi->s_stripe) - first_group_block;
+
+	while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
+		if (!mb_test_bit(i, bitmap)) {
+			max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
+			if (max >= sbi->s_stripe) {
+				ac->ac_found++;
+				ex.fe_logical = 0xDEADF00D; /* debug value */
+				ac->ac_b_ex = ex;
+				ext4_mb_use_best_found(ac, e4b);
+				break;
+			}
+		}
+		i += sbi->s_stripe;
+	}
+}
+
+/* This is now called BEFORE we load the buddy bitmap. */
+static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+				ext4_group_t group, int cr)
+{
+	unsigned free, fragments;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
+	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+
+	BUG_ON(cr < 0 || cr >= 4);
+
+	free = grp->bb_free;
+	if (free == 0)
+		return 0;
+	if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+		return 0;
+
+	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+		return 0;
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		int ret = ext4_mb_init_group(ac->ac_sb, group);
+		if (ret)
+			return 0;
+	}
+
+	fragments = grp->bb_fragments;
+	if (fragments == 0)
+		return 0;
+
+	switch (cr) {
+	case 0:
+		BUG_ON(ac->ac_2order == 0);
+
+		/* Avoid using the first bg of a flexgroup for data files */
+		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+		    ((group % flex_size) == 0))
+			return 0;
+
+		if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+		    (free / fragments) >= ac->ac_g_ex.fe_len)
+			return 1;
+
+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
+		return 1;
+	case 1:
+		if ((free / fragments) >= ac->ac_g_ex.fe_len)
+			return 1;
+		break;
+	case 2:
+		if (free >= ac->ac_g_ex.fe_len)
+			return 1;
+		break;
+	case 3:
+		return 1;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static noinline_for_stack int
+ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+{
+	ext4_group_t ngroups, group, i;
+	int cr;
+	int err = 0;
+	struct ext4_sb_info *sbi;
+	struct super_block *sb;
+	struct ext4_buddy e4b;
+
+	sb = ac->ac_sb;
+	sbi = EXT4_SB(sb);
+	ngroups = ext4_get_groups_count(sb);
+	/* non-extent files are limited to low blocks/groups */
+	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+		ngroups = sbi->s_blockfile_groups;
+
+	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+	/* first, try the goal */
+	err = ext4_mb_find_by_goal(ac, &e4b);
+	if (err || ac->ac_status == AC_STATUS_FOUND)
+		goto out;
+
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		goto out;
+
+	/*
+	 * ac->ac2_order is set only if the fe_len is a power of 2
+	 * if ac2_order is set we also set criteria to 0 so that we
+	 * try exact allocation using buddy.
+	 */
+	i = fls(ac->ac_g_ex.fe_len);
+	ac->ac_2order = 0;
+	/*
+	 * We search using buddy data only if the order of the request
+	 * is greater than equal to the sbi_s_mb_order2_reqs
+	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
+	 */
+	if (i >= sbi->s_mb_order2_reqs) {
+		/*
+		 * This should tell if fe_len is exactly power of 2
+		 */
+		if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+			ac->ac_2order = i - 1;
+	}
+
+	/* if stream allocation is enabled, use global goal */
+	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
+		/* TBD: may be hot point */
+		spin_lock(&sbi->s_md_lock);
+		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
+		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
+		spin_unlock(&sbi->s_md_lock);
+	}
+
+	/* Let's just scan groups to find more-less suitable blocks */
+	cr = ac->ac_2order ? 0 : 1;
+	/*
+	 * cr == 0 try to get exact allocation,
+	 * cr == 3  try to get anything
+	 */
+repeat:
+	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
+		ac->ac_criteria = cr;
+		/*
+		 * searching for the right group start
+		 * from the goal value specified
+		 */
+		group = ac->ac_g_ex.fe_group;
+
+		for (i = 0; i < ngroups; group++, i++) {
+			cond_resched();
+			/*
+			 * Artificially restricted ngroups for non-extent
+			 * files makes group > ngroups possible on first loop.
+			 */
+			if (group >= ngroups)
+				group = 0;
+
+			/* This now checks without needing the buddy page */
+			if (!ext4_mb_good_group(ac, group, cr))
+				continue;
+
+			err = ext4_mb_load_buddy(sb, group, &e4b);
+			if (err)
+				goto out;
+
+			ext4_lock_group(sb, group);
+
+			/*
+			 * We need to check again after locking the
+			 * block group
+			 */
+			if (!ext4_mb_good_group(ac, group, cr)) {
+				ext4_unlock_group(sb, group);
+				ext4_mb_unload_buddy(&e4b);
+				continue;
+			}
+
+			ac->ac_groups_scanned++;
+			if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
+				ext4_mb_simple_scan_group(ac, &e4b);
+			else if (cr == 1 && sbi->s_stripe &&
+					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
+				ext4_mb_scan_aligned(ac, &e4b);
+			else
+				ext4_mb_complex_scan_group(ac, &e4b);
+
+			ext4_unlock_group(sb, group);
+			ext4_mb_unload_buddy(&e4b);
+
+			if (ac->ac_status != AC_STATUS_CONTINUE)
+				break;
+		}
+	}
+
+	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
+	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+		/*
+		 * We've been searching too long. Let's try to allocate
+		 * the best chunk we've found so far
+		 */
+
+		ext4_mb_try_best_found(ac, &e4b);
+		if (ac->ac_status != AC_STATUS_FOUND) {
+			/*
+			 * Someone more lucky has already allocated it.
+			 * The only thing we can do is just take first
+			 * found block(s)
+			printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
+			 */
+			ac->ac_b_ex.fe_group = 0;
+			ac->ac_b_ex.fe_start = 0;
+			ac->ac_b_ex.fe_len = 0;
+			ac->ac_status = AC_STATUS_CONTINUE;
+			ac->ac_flags |= EXT4_MB_HINT_FIRST;
+			cr = 3;
+			atomic_inc(&sbi->s_mb_lost_chunks);
+			goto repeat;
+		}
+	}
+out:
+	return err;
+}
+
+static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+{
+	struct super_block *sb = seq->private;
+	ext4_group_t group;
+
+	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
+		return NULL;
+	group = *pos + 1;
+	return (void *) ((unsigned long) group);
+}
+
+static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct super_block *sb = seq->private;
+	ext4_group_t group;
+
+	++*pos;
+	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
+		return NULL;
+	group = *pos + 1;
+	return (void *) ((unsigned long) group);
+}
+
+static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+{
+	struct super_block *sb = seq->private;
+	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
+	int i;
+	int err, buddy_loaded = 0;
+	struct ext4_buddy e4b;
+	struct ext4_group_info *grinfo;
+	struct sg {
+		struct ext4_group_info info;
+		ext4_grpblk_t counters[16];
+	} sg;
+
+	group--;
+	if (group == 0)
+		seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
+				"[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
+				  "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+			   "group", "free", "frags", "first",
+			   "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
+			   "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+		sizeof(struct ext4_group_info);
+	grinfo = ext4_get_group_info(sb, group);
+	/* Load the group info in memory only if not already loaded. */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
+		err = ext4_mb_load_buddy(sb, group, &e4b);
+		if (err) {
+			seq_printf(seq, "#%-5u: I/O error\n", group);
+			return 0;
+		}
+		buddy_loaded = 1;
+	}
+
+	memcpy(&sg, ext4_get_group_info(sb, group), i);
+
+	if (buddy_loaded)
+		ext4_mb_unload_buddy(&e4b);
+
+	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
+			sg.info.bb_fragments, sg.info.bb_first_free);
+	for (i = 0; i <= 13; i++)
+		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
+				sg.info.bb_counters[i] : 0);
+	seq_printf(seq, " ]\n");
+
+	return 0;
+}
+
+static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations ext4_mb_seq_groups_ops = {
+	.start  = ext4_mb_seq_groups_start,
+	.next   = ext4_mb_seq_groups_next,
+	.stop   = ext4_mb_seq_groups_stop,
+	.show   = ext4_mb_seq_groups_show,
+};
+
+static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
+{
+	struct super_block *sb = PDE_DATA(inode);
+	int rc;
+
+	rc = seq_open(file, &ext4_mb_seq_groups_ops);
+	if (rc == 0) {
+		struct seq_file *m = file->private_data;
+		m->private = sb;
+	}
+	return rc;
+
+}
+
+static const struct file_operations ext4_mb_seq_groups_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ext4_mb_seq_groups_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+
+	BUG_ON(!cachep);
+	return cachep;
+}
+
+/*
+ * Allocate the top-level s_group_info array for the specified number
+ * of groups
+ */
+int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned size;
+	struct ext4_group_info ***new_groupinfo;
+
+	size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+		EXT4_DESC_PER_BLOCK_BITS(sb);
+	if (size <= sbi->s_group_info_size)
+		return 0;
+
+	size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
+	new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+	if (!new_groupinfo) {
+		ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+		return -ENOMEM;
+	}
+	if (sbi->s_group_info) {
+		memcpy(new_groupinfo, sbi->s_group_info,
+		       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
+		kvfree(sbi->s_group_info);
+	}
+	sbi->s_group_info = new_groupinfo;
+	sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+	ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
+		   sbi->s_group_info_size);
+	return 0;
+}
+
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+			  struct ext4_group_desc *desc)
+{
+	int i;
+	int metalen = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_info **meta_group_info;
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+
+	/*
+	 * First check if this group is the first of a reserved block.
+	 * If it's true, we have to allocate a new table of pointers
+	 * to ext4_group_info structures
+	 */
+	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+		metalen = sizeof(*meta_group_info) <<
+			EXT4_DESC_PER_BLOCK_BITS(sb);
+		meta_group_info = kmalloc(metalen, GFP_NOFS);
+		if (meta_group_info == NULL) {
+			ext4_msg(sb, KERN_ERR, "can't allocate mem "
+				 "for a buddy group");
+			goto exit_meta_group_info;
+		}
+		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+			meta_group_info;
+	}
+
+	meta_group_info =
+		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+	meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
+	if (meta_group_info[i] == NULL) {
+		ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
+		goto exit_group_info;
+	}
+	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+		&(meta_group_info[i]->bb_state));
+
+	/*
+	 * initialize bb_free to be able to skip
+	 * empty groups without initialization
+	 */
+	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		meta_group_info[i]->bb_free =
+			ext4_free_clusters_after_init(sb, group, desc);
+	} else {
+		meta_group_info[i]->bb_free =
+			ext4_free_group_clusters(sb, desc);
+	}
+
+	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+	init_rwsem(&meta_group_info[i]->alloc_sem);
+	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
+
+#ifdef DOUBLE_CHECK
+	{
+		struct buffer_head *bh;
+		meta_group_info[i]->bb_bitmap =
+			kmalloc(sb->s_blocksize, GFP_NOFS);
+		BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+		bh = ext4_read_block_bitmap(sb, group);
+		BUG_ON(bh == NULL);
+		memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+			sb->s_blocksize);
+		put_bh(bh);
+	}
+#endif
+
+	return 0;
+
+exit_group_info:
+	/* If a meta_group_info table has been allocated, release it now */
+	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+		kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+	}
+exit_meta_group_info:
+	return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+static int ext4_mb_init_backend(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_group_t i;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int err;
+	struct ext4_group_desc *desc;
+	struct kmem_cache *cachep;
+
+	err = ext4_mb_alloc_groupinfo(sb, ngroups);
+	if (err)
+		return err;
+
+	sbi->s_buddy_cache = new_inode(sb);
+	if (sbi->s_buddy_cache == NULL) {
+		ext4_msg(sb, KERN_ERR, "can't get new inode");
+		goto err_freesgi;
+	}
+	/* To avoid potentially colliding with an valid on-disk inode number,
+	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
+	 * not in the inode hash, so it should never be found by iget(), but
+	 * this will avoid confusion if it ever shows up during debugging. */
+	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
+	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+	for (i = 0; i < ngroups; i++) {
+		desc = ext4_get_group_desc(sb, i, NULL);
+		if (desc == NULL) {
+			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
+			goto err_freebuddy;
+		}
+		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+			goto err_freebuddy;
+	}
+
+	return 0;
+
+err_freebuddy:
+	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+	while (i-- > 0)
+		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
+	i = sbi->s_group_info_size;
+	while (i-- > 0)
+		kfree(sbi->s_group_info[i]);
+	iput(sbi->s_buddy_cache);
+err_freesgi:
+	kvfree(sbi->s_group_info);
+	return -ENOMEM;
+}
+
+static void ext4_groupinfo_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+		if (ext4_groupinfo_caches[i])
+			kmem_cache_destroy(ext4_groupinfo_caches[i]);
+		ext4_groupinfo_caches[i] = NULL;
+	}
+}
+
+static int ext4_groupinfo_create_slab(size_t size)
+{
+	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+	int slab_size;
+	int blocksize_bits = order_base_2(size);
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep;
+
+	if (cache_index >= NR_GRPINFO_CACHES)
+		return -EINVAL;
+
+	if (unlikely(cache_index < 0))
+		cache_index = 0;
+
+	mutex_lock(&ext4_grpinfo_slab_create_mutex);
+	if (ext4_groupinfo_caches[cache_index]) {
+		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+		return 0;	/* Already created */
+	}
+
+	slab_size = offsetof(struct ext4_group_info,
+				bb_counters[blocksize_bits + 2]);
+
+	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
+					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
+					NULL);
+
+	ext4_groupinfo_caches[cache_index] = cachep;
+
+	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+	if (!cachep) {
+		printk(KERN_EMERG
+		       "EXT4-fs: no memory for groupinfo slab cache\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int ext4_mb_init(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned i, j;
+	unsigned offset;
+	unsigned max;
+	int ret;
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
+
+	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_offsets == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
+	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_maxs == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
+	if (ret < 0)
+		goto out;
+
+	/* order 0 is regular bitmap */
+	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
+	sbi->s_mb_offsets[0] = 0;
+
+	i = 1;
+	offset = 0;
+	max = sb->s_blocksize << 2;
+	do {
+		sbi->s_mb_offsets[i] = offset;
+		sbi->s_mb_maxs[i] = max;
+		offset += 1 << (sb->s_blocksize_bits - i);
+		max = max >> 1;
+		i++;
+	} while (i <= sb->s_blocksize_bits + 1);
+
+	spin_lock_init(&sbi->s_md_lock);
+	spin_lock_init(&sbi->s_bal_lock);
+
+	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+	sbi->s_mb_stats = MB_DEFAULT_STATS;
+	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+	/*
+	 * The default group preallocation is 512, which for 4k block
+	 * sizes translates to 2 megabytes.  However for bigalloc file
+	 * systems, this is probably too big (i.e, if the cluster size
+	 * is 1 megabyte, then group preallocation size becomes half a
+	 * gigabyte!).  As a default, we will keep a two megabyte
+	 * group pralloc size for cluster sizes up to 64k, and after
+	 * that, we will force a minimum group preallocation size of
+	 * 32 clusters.  This translates to 8 megs when the cluster
+	 * size is 256k, and 32 megs when the cluster size is 1 meg,
+	 * which seems reasonable as a default.
+	 */
+	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
+				       sbi->s_cluster_bits, 32);
+	/*
+	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
+	 * to the lowest multiple of s_stripe which is bigger than
+	 * the s_mb_group_prealloc as determined above. We want
+	 * the preallocation size to be an exact multiple of the
+	 * RAID stripe size so that preallocations don't fragment
+	 * the stripes.
+	 */
+	if (sbi->s_stripe > 1) {
+		sbi->s_mb_group_prealloc = roundup(
+			sbi->s_mb_group_prealloc, sbi->s_stripe);
+	}
+
+	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
+	if (sbi->s_locality_groups == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	for_each_possible_cpu(i) {
+		struct ext4_locality_group *lg;
+		lg = per_cpu_ptr(sbi->s_locality_groups, i);
+		mutex_init(&lg->lg_mutex);
+		for (j = 0; j < PREALLOC_TB_SIZE; j++)
+			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
+		spin_lock_init(&lg->lg_prealloc_lock);
+	}
+
+	/* init file for buddy data */
+	ret = ext4_mb_init_backend(sb);
+	if (ret != 0)
+		goto out_free_locality_groups;
+
+	if (sbi->s_proc)
+		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+				 &ext4_mb_seq_groups_fops, sb);
+
+	return 0;
+
+out_free_locality_groups:
+	free_percpu(sbi->s_locality_groups);
+	sbi->s_locality_groups = NULL;
+out:
+	kfree(sbi->s_mb_offsets);
+	sbi->s_mb_offsets = NULL;
+	kfree(sbi->s_mb_maxs);
+	sbi->s_mb_maxs = NULL;
+	return ret;
+}
+
+/* need to called with the ext4 group lock held */
+static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
+{
+	struct ext4_prealloc_space *pa;
+	struct list_head *cur, *tmp;
+	int count = 0;
+
+	list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+		list_del(&pa->pa_group_list);
+		count++;
+		kmem_cache_free(ext4_pspace_cachep, pa);
+	}
+	if (count)
+		mb_debug(1, "mballoc: %u PAs left\n", count);
+
+}
+
+int ext4_mb_release(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_group_t i;
+	int num_meta_group_infos;
+	struct ext4_group_info *grinfo;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+
+	if (sbi->s_proc)
+		remove_proc_entry("mb_groups", sbi->s_proc);
+
+	if (sbi->s_group_info) {
+		for (i = 0; i < ngroups; i++) {
+			grinfo = ext4_get_group_info(sb, i);
+#ifdef DOUBLE_CHECK
+			kfree(grinfo->bb_bitmap);
+#endif
+			ext4_lock_group(sb, i);
+			ext4_mb_cleanup_pa(grinfo);
+			ext4_unlock_group(sb, i);
+			kmem_cache_free(cachep, grinfo);
+		}
+		num_meta_group_infos = (ngroups +
+				EXT4_DESC_PER_BLOCK(sb) - 1) >>
+			EXT4_DESC_PER_BLOCK_BITS(sb);
+		for (i = 0; i < num_meta_group_infos; i++)
+			kfree(sbi->s_group_info[i]);
+		kvfree(sbi->s_group_info);
+	}
+	kfree(sbi->s_mb_offsets);
+	kfree(sbi->s_mb_maxs);
+	iput(sbi->s_buddy_cache);
+	if (sbi->s_mb_stats) {
+		ext4_msg(sb, KERN_INFO,
+		       "mballoc: %u blocks %u reqs (%u success)",
+				atomic_read(&sbi->s_bal_allocated),
+				atomic_read(&sbi->s_bal_reqs),
+				atomic_read(&sbi->s_bal_success));
+		ext4_msg(sb, KERN_INFO,
+		      "mballoc: %u extents scanned, %u goal hits, "
+				"%u 2^N hits, %u breaks, %u lost",
+				atomic_read(&sbi->s_bal_ex_scanned),
+				atomic_read(&sbi->s_bal_goals),
+				atomic_read(&sbi->s_bal_2orders),
+				atomic_read(&sbi->s_bal_breaks),
+				atomic_read(&sbi->s_mb_lost_chunks));
+		ext4_msg(sb, KERN_INFO,
+		       "mballoc: %lu generated and it took %Lu",
+				sbi->s_mb_buddies_generated,
+				sbi->s_mb_generation_time);
+		ext4_msg(sb, KERN_INFO,
+		       "mballoc: %u preallocated, %u discarded",
+				atomic_read(&sbi->s_mb_preallocated),
+				atomic_read(&sbi->s_mb_discarded));
+	}
+
+	free_percpu(sbi->s_locality_groups);
+
+	return 0;
+}
+
+static inline int ext4_issue_discard(struct super_block *sb,
+		ext4_group_t block_group, ext4_grpblk_t cluster, int count)
+{
+	ext4_fsblk_t discard_block;
+
+	discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
+			 ext4_group_first_block_no(sb, block_group));
+	count = EXT4_C2B(EXT4_SB(sb), count);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long) discard_block, count);
+	return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+}
+
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void ext4_free_data_callback(struct super_block *sb,
+				    struct ext4_journal_cb_entry *jce,
+				    int rc)
+{
+	struct ext4_free_data *entry = (struct ext4_free_data *)jce;
+	struct ext4_buddy e4b;
+	struct ext4_group_info *db;
+	int err, count = 0, count2 = 0;
+
+	mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+		 entry->efd_count, entry->efd_group, entry);
+
+	if (test_opt(sb, DISCARD)) {
+		err = ext4_issue_discard(sb, entry->efd_group,
+					 entry->efd_start_cluster,
+					 entry->efd_count);
+		if (err && err != -EOPNOTSUPP)
+			ext4_msg(sb, KERN_WARNING, "discard request in"
+				 " group:%d block:%d count:%d failed"
+				 " with %d", entry->efd_group,
+				 entry->efd_start_cluster,
+				 entry->efd_count, err);
+	}
+
+	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+	/* we expect to find existing buddy because it's pinned */
+	BUG_ON(err != 0);
+
+
+	db = e4b.bd_info;
+	/* there are blocks to put in buddy to make them really free */
+	count += entry->efd_count;
+	count2++;
+	ext4_lock_group(sb, entry->efd_group);
+	/* Take it out of per group rb tree */
+	rb_erase(&entry->efd_node, &(db->bb_free_root));
+	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
+
+	/*
+	 * Clear the trimmed flag for the group so that the next
+	 * ext4_trim_fs can trim it.
+	 * If the volume is mounted with -o discard, online discard
+	 * is supported and the free blocks will be trimmed online.
+	 */
+	if (!test_opt(sb, DISCARD))
+		EXT4_MB_GRP_CLEAR_TRIMMED(db);
+
+	if (!db->bb_free_root.rb_node) {
+		/* No more items in the per group rb tree
+		 * balance refcounts from ext4_mb_free_metadata()
+		 */
+		page_cache_release(e4b.bd_buddy_page);
+		page_cache_release(e4b.bd_bitmap_page);
+	}
+	ext4_unlock_group(sb, entry->efd_group);
+	kmem_cache_free(ext4_free_data_cachep, entry);
+	ext4_mb_unload_buddy(&e4b);
+
+	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+}
+
+int __init ext4_init_mballoc(void)
+{
+	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+					SLAB_RECLAIM_ACCOUNT);
+	if (ext4_pspace_cachep == NULL)
+		return -ENOMEM;
+
+	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+				    SLAB_RECLAIM_ACCOUNT);
+	if (ext4_ac_cachep == NULL) {
+		kmem_cache_destroy(ext4_pspace_cachep);
+		return -ENOMEM;
+	}
+
+	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+					   SLAB_RECLAIM_ACCOUNT);
+	if (ext4_free_data_cachep == NULL) {
+		kmem_cache_destroy(ext4_pspace_cachep);
+		kmem_cache_destroy(ext4_ac_cachep);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void ext4_exit_mballoc(void)
+{
+	/*
+	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
+	 * before destroying the slab cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ext4_pspace_cachep);
+	kmem_cache_destroy(ext4_ac_cachep);
+	kmem_cache_destroy(ext4_free_data_cachep);
+	ext4_groupinfo_destroy_slabs();
+}
+
+
+/*
+ * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
+ * Returns 0 if success or error code
+ */
+static noinline_for_stack int
+ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+				handle_t *handle, unsigned int reserv_clstrs)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *gdp_bh;
+	struct ext4_sb_info *sbi;
+	struct super_block *sb;
+	ext4_fsblk_t block;
+	int err, len;
+
+	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+	BUG_ON(ac->ac_b_ex.fe_len <= 0);
+
+	sb = ac->ac_sb;
+	sbi = EXT4_SB(sb);
+
+	err = -EIO;
+	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+	if (!bitmap_bh)
+		goto out_err;
+
+	BUFFER_TRACE(bitmap_bh, "getting write access");
+	err = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (err)
+		goto out_err;
+
+	err = -EIO;
+	gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+	if (!gdp)
+		goto out_err;
+
+	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
+			ext4_free_group_clusters(sb, gdp));
+
+	BUFFER_TRACE(gdp_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gdp_bh);
+	if (err)
+		goto out_err;
+
+	block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+
+	len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+	if (!ext4_data_block_valid(sbi, block, len)) {
+		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
+			   "fs metadata", block, block+len);
+		/* File system mounted not to panic on error
+		 * Fix the bitmap and repeat the block allocation
+		 * We leak some of the blocks here.
+		 */
+		ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+		ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+			      ac->ac_b_ex.fe_len);
+		ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+		if (!err)
+			err = -EAGAIN;
+		goto out_err;
+	}
+
+	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+#ifdef AGGRESSIVE_CHECK
+	{
+		int i;
+		for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
+			BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
+						bitmap_bh->b_data));
+		}
+	}
+#endif
+	ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+		      ac->ac_b_ex.fe_len);
+	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+		ext4_free_group_clusters_set(sb, gdp,
+					     ext4_free_clusters_after_init(sb,
+						ac->ac_b_ex.fe_group, gdp));
+	}
+	len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
+	ext4_free_group_clusters_set(sb, gdp, len);
+	ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
+	ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
+
+	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+	percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
+	/*
+	 * Now reduce the dirty block count also. Should not go negative
+	 */
+	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+		/* release all the reserved blocks if non delalloc */
+		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+				   reserv_clstrs);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi,
+							  ac->ac_b_ex.fe_group);
+		atomic64_sub(ac->ac_b_ex.fe_len,
+			     &sbi->s_flex_groups[flex_group].free_clusters);
+	}
+
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+	if (err)
+		goto out_err;
+	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+
+out_err:
+	brelse(bitmap_bh);
+	return err;
+}
+
+/*
+ * here we normalize request for locality group
+ * Group request are normalized to s_mb_group_prealloc, which goes to
+ * s_strip if we set the same via mount option.
+ * s_mb_group_prealloc can be configured via
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
+ *
+ * XXX: should we try to preallocate more than the group has now?
+ */
+static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg = ac->ac_lg;
+
+	BUG_ON(lg == NULL);
+	ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
+	mb_debug(1, "#%u: goal %u blocks for locality group\n",
+		current->pid, ac->ac_g_ex.fe_len);
+}
+
+/*
+ * Normalization means making request better in terms of
+ * size and alignment
+ */
+static noinline_for_stack void
+ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+				struct ext4_allocation_request *ar)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	int bsbits, max;
+	ext4_lblk_t end;
+	loff_t size, start_off;
+	loff_t orig_size __maybe_unused;
+	ext4_lblk_t start;
+	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+	struct ext4_prealloc_space *pa;
+
+	/* do normalize only data requests, metadata requests
+	   do not need preallocation */
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return;
+
+	/* sometime caller may want exact blocks */
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		return;
+
+	/* caller may indicate that preallocation isn't
+	 * required (it's a tail, for example) */
+	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
+		return;
+
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
+		ext4_mb_normalize_group_request(ac);
+		return ;
+	}
+
+	bsbits = ac->ac_sb->s_blocksize_bits;
+
+	/* first, let's learn actual file size
+	 * given current request is allocated */
+	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+	size = size << bsbits;
+	if (size < i_size_read(ac->ac_inode))
+		size = i_size_read(ac->ac_inode);
+	orig_size = size;
+
+	/* max size of free chunks */
+	max = 2 << bsbits;
+
+#define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
+		(req <= (size) || max <= (chunk_size))
+
+	/* first, try to predict filesize */
+	/* XXX: should this table be tunable? */
+	start_off = 0;
+	if (size <= 16 * 1024) {
+		size = 16 * 1024;
+	} else if (size <= 32 * 1024) {
+		size = 32 * 1024;
+	} else if (size <= 64 * 1024) {
+		size = 64 * 1024;
+	} else if (size <= 128 * 1024) {
+		size = 128 * 1024;
+	} else if (size <= 256 * 1024) {
+		size = 256 * 1024;
+	} else if (size <= 512 * 1024) {
+		size = 512 * 1024;
+	} else if (size <= 1024 * 1024) {
+		size = 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+						(21 - bsbits)) << 21;
+		size = 2 * 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+							(22 - bsbits)) << 22;
+		size = 4 * 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+					(8<<20)>>bsbits, max, 8 * 1024)) {
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+							(23 - bsbits)) << 23;
+		size = 8 * 1024 * 1024;
+	} else {
+		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
+		size	  = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
+					      ac->ac_o_ex.fe_len) << bsbits;
+	}
+	size = size >> bsbits;
+	start = start_off >> bsbits;
+
+	/* don't cover already allocated blocks in selected range */
+	if (ar->pleft && start <= ar->lleft) {
+		size -= ar->lleft + 1 - start;
+		start = ar->lleft + 1;
+	}
+	if (ar->pright && start + size - 1 >= ar->lright)
+		size -= start + size - ar->lright;
+
+	end = start + size;
+
+	/* check we don't cross already preallocated blocks */
+	rcu_read_lock();
+	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+		ext4_lblk_t pa_end;
+
+		if (pa->pa_deleted)
+			continue;
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+
+		pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+						  pa->pa_len);
+
+		/* PA must not overlap original request */
+		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
+			ac->ac_o_ex.fe_logical < pa->pa_lstart));
+
+		/* skip PAs this normalized request doesn't overlap with */
+		if (pa->pa_lstart >= end || pa_end <= start) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+
+		/* adjust start or end to be adjacent to this pa */
+		if (pa_end <= ac->ac_o_ex.fe_logical) {
+			BUG_ON(pa_end < start);
+			start = pa_end;
+		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+			BUG_ON(pa->pa_lstart > end);
+			end = pa->pa_lstart;
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+	size = end - start;
+
+	/* XXX: extra loop to check we really don't overlap preallocations */
+	rcu_read_lock();
+	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+		ext4_lblk_t pa_end;
+
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted == 0) {
+			pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+							  pa->pa_len);
+			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+
+	if (start + size <= ac->ac_o_ex.fe_logical &&
+			start > ac->ac_o_ex.fe_logical) {
+		ext4_msg(ac->ac_sb, KERN_ERR,
+			 "start %lu, size %lu, fe_logical %lu",
+			 (unsigned long) start, (unsigned long) size,
+			 (unsigned long) ac->ac_o_ex.fe_logical);
+		BUG();
+	}
+	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+
+	/* now prepare goal request */
+
+	/* XXX: is it better to align blocks WRT to logical
+	 * placement or satisfy big request as is */
+	ac->ac_g_ex.fe_logical = start;
+	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
+
+	/* define goal start in order to merge */
+	if (ar->pright && (ar->lright == (start + size))) {
+		/* merge to the right */
+		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
+						&ac->ac_f_ex.fe_group,
+						&ac->ac_f_ex.fe_start);
+		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+	}
+	if (ar->pleft && (ar->lleft + 1 == start)) {
+		/* merge to the left */
+		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
+						&ac->ac_f_ex.fe_group,
+						&ac->ac_f_ex.fe_start);
+		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+	}
+
+	mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
+		(unsigned) orig_size, (unsigned) start);
+}
+
+static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
+		atomic_inc(&sbi->s_bal_reqs);
+		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
+		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
+			atomic_inc(&sbi->s_bal_success);
+		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+				ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+			atomic_inc(&sbi->s_bal_goals);
+		if (ac->ac_found > sbi->s_mb_max_to_scan)
+			atomic_inc(&sbi->s_bal_breaks);
+	}
+
+	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
+		trace_ext4_mballoc_alloc(ac);
+	else
+		trace_ext4_mballoc_prealloc(ac);
+}
+
+/*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context.  We don't need this for MB_GROUP_PA because we only change
+ * pa_free in ext4_mb_release_context(), but on failure, we've already
+ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	struct ext4_buddy e4b;
+	int err;
+
+	if (pa == NULL) {
+		if (ac->ac_f_ex.fe_len == 0)
+			return;
+		err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
+		if (err) {
+			/*
+			 * This should never happen since we pin the
+			 * pages in the ext4_allocation_context so
+			 * ext4_mb_load_buddy() should never fail.
+			 */
+			WARN(1, "mb_load_buddy failed (%d)", err);
+			return;
+		}
+		ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
+		mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
+			       ac->ac_f_ex.fe_len);
+		ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
+		ext4_mb_unload_buddy(&e4b);
+		return;
+	}
+	if (pa->pa_type == MB_INODE_PA)
+		pa->pa_free += ac->ac_b_ex.fe_len;
+}
+
+/*
+ * use blocks preallocated to inode
+ */
+static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
+				struct ext4_prealloc_space *pa)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	ext4_fsblk_t start;
+	ext4_fsblk_t end;
+	int len;
+
+	/* found preallocated blocks, use them */
+	start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+	end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+		  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+	len = EXT4_NUM_B2C(sbi, end - start);
+	ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
+					&ac->ac_b_ex.fe_start);
+	ac->ac_b_ex.fe_len = len;
+	ac->ac_status = AC_STATUS_FOUND;
+	ac->ac_pa = pa;
+
+	BUG_ON(start < pa->pa_pstart);
+	BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
+	BUG_ON(pa->pa_free < len);
+	pa->pa_free -= len;
+
+	mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
+}
+
+/*
+ * use blocks preallocated to locality group
+ */
+static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
+				struct ext4_prealloc_space *pa)
+{
+	unsigned int len = ac->ac_o_ex.fe_len;
+
+	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
+					&ac->ac_b_ex.fe_group,
+					&ac->ac_b_ex.fe_start);
+	ac->ac_b_ex.fe_len = len;
+	ac->ac_status = AC_STATUS_FOUND;
+	ac->ac_pa = pa;
+
+	/* we don't correct pa_pstart or pa_plen here to avoid
+	 * possible race when the group is being loaded concurrently
+	 * instead we correct pa later, after blocks are marked
+	 * in on-disk bitmap -- see ext4_mb_release_context()
+	 * Other CPUs are prevented from allocating from this pa by lg_mutex
+	 */
+	mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+}
+
+/*
+ * Return the prealloc space that have minimal distance
+ * from the goal block. @cpa is the prealloc
+ * space that is having currently known minimal distance
+ * from the goal block.
+ */
+static struct ext4_prealloc_space *
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
+			struct ext4_prealloc_space *pa,
+			struct ext4_prealloc_space *cpa)
+{
+	ext4_fsblk_t cur_distance, new_distance;
+
+	if (cpa == NULL) {
+		atomic_inc(&pa->pa_count);
+		return pa;
+	}
+	cur_distance = abs(goal_block - cpa->pa_pstart);
+	new_distance = abs(goal_block - pa->pa_pstart);
+
+	if (cur_distance <= new_distance)
+		return cpa;
+
+	/* drop the previous reference */
+	atomic_dec(&cpa->pa_count);
+	atomic_inc(&pa->pa_count);
+	return pa;
+}
+
+/*
+ * search goal blocks in preallocated space
+ */
+static noinline_for_stack int
+ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	int order, i;
+	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+	struct ext4_locality_group *lg;
+	struct ext4_prealloc_space *pa, *cpa = NULL;
+	ext4_fsblk_t goal_block;
+
+	/* only data can be preallocated */
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return 0;
+
+	/* first, try per-file preallocation */
+	rcu_read_lock();
+	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+
+		/* all fields in this condition don't change,
+		 * so we can skip locking for them */
+		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
+		    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+					       EXT4_C2B(sbi, pa->pa_len)))
+			continue;
+
+		/* non-extent files can't have physical blocks past 2^32 */
+		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+		     EXT4_MAX_BLOCK_FILE_PHYS))
+			continue;
+
+		/* found preallocated blocks, use them */
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted == 0 && pa->pa_free) {
+			atomic_inc(&pa->pa_count);
+			ext4_mb_use_inode_pa(ac, pa);
+			spin_unlock(&pa->pa_lock);
+			ac->ac_criteria = 10;
+			rcu_read_unlock();
+			return 1;
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+
+	/* can we use group allocation? */
+	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
+		return 0;
+
+	/* inode may have no locality group for some reason */
+	lg = ac->ac_lg;
+	if (lg == NULL)
+		return 0;
+	order  = fls(ac->ac_o_ex.fe_len) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+
+	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
+	/*
+	 * search for the prealloc space that is having
+	 * minimal distance from the goal block.
+	 */
+	for (i = order; i < PREALLOC_TB_SIZE; i++) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+					pa_inode_list) {
+			spin_lock(&pa->pa_lock);
+			if (pa->pa_deleted == 0 &&
+					pa->pa_free >= ac->ac_o_ex.fe_len) {
+
+				cpa = ext4_mb_check_group_pa(goal_block,
+								pa, cpa);
+			}
+			spin_unlock(&pa->pa_lock);
+		}
+		rcu_read_unlock();
+	}
+	if (cpa) {
+		ext4_mb_use_group_pa(ac, cpa);
+		ac->ac_criteria = 20;
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with the ext4 group lock held
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+						ext4_group_t group)
+{
+	struct rb_node *n;
+	struct ext4_group_info *grp;
+	struct ext4_free_data *entry;
+
+	grp = ext4_get_group_info(sb, group);
+	n = rb_first(&(grp->bb_free_root));
+
+	while (n) {
+		entry = rb_entry(n, struct ext4_free_data, efd_node);
+		ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
+		n = rb_next(n);
+	}
+	return;
+}
+
+/*
+ * the function goes through all preallocation in this group and marks them
+ * used in in-core bitmap. buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock held
+ */
+static noinline_for_stack
+void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_prealloc_space *pa;
+	struct list_head *cur;
+	ext4_group_t groupnr;
+	ext4_grpblk_t start;
+	int preallocated = 0;
+	int len;
+
+	/* all form of preallocation discards first load group,
+	 * so the only competing code is preallocation use.
+	 * we don't need any locking here
+	 * notice we do NOT ignore preallocations with pa_deleted
+	 * otherwise we could leave used blocks available for
+	 * allocation in buddy when concurrent ext4_mb_put_pa()
+	 * is dropping preallocation
+	 */
+	list_for_each(cur, &grp->bb_prealloc_list) {
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+		spin_lock(&pa->pa_lock);
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+					     &groupnr, &start);
+		len = pa->pa_len;
+		spin_unlock(&pa->pa_lock);
+		if (unlikely(len == 0))
+			continue;
+		BUG_ON(groupnr != group);
+		ext4_set_bits(bitmap, start, len);
+		preallocated += len;
+	}
+	mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
+}
+
+static void ext4_mb_pa_callback(struct rcu_head *head)
+{
+	struct ext4_prealloc_space *pa;
+	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+	BUG_ON(atomic_read(&pa->pa_count));
+	BUG_ON(pa->pa_deleted == 0);
+	kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
+/*
+ * drops a reference to preallocated space descriptor
+ * if this was the last reference and the space is consumed
+ */
+static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
+			struct super_block *sb, struct ext4_prealloc_space *pa)
+{
+	ext4_group_t grp;
+	ext4_fsblk_t grp_blk;
+
+	/* in this short window concurrent discard can set pa_deleted */
+	spin_lock(&pa->pa_lock);
+	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+		spin_unlock(&pa->pa_lock);
+		return;
+	}
+
+	if (pa->pa_deleted == 1) {
+		spin_unlock(&pa->pa_lock);
+		return;
+	}
+
+	pa->pa_deleted = 1;
+	spin_unlock(&pa->pa_lock);
+
+	grp_blk = pa->pa_pstart;
+	/*
+	 * If doing group-based preallocation, pa_pstart may be in the
+	 * next group when pa is used up
+	 */
+	if (pa->pa_type == MB_GROUP_PA)
+		grp_blk--;
+
+	grp = ext4_get_group_number(sb, grp_blk);
+
+	/*
+	 * possible race:
+	 *
+	 *  P1 (buddy init)			P2 (regular allocation)
+	 *					find block B in PA
+	 *  copy on-disk bitmap to buddy
+	 *  					mark B in on-disk bitmap
+	 *					drop PA from group
+	 *  mark all PAs in buddy
+	 *
+	 * thus, P1 initializes buddy with B available. to prevent this
+	 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
+	 * against that pair
+	 */
+	ext4_lock_group(sb, grp);
+	list_del(&pa->pa_group_list);
+	ext4_unlock_group(sb, grp);
+
+	spin_lock(pa->pa_obj_lock);
+	list_del_rcu(&pa->pa_inode_list);
+	spin_unlock(pa->pa_obj_lock);
+
+	call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+}
+
+/*
+ * creates new preallocated space for given inode
+ */
+static noinline_for_stack int
+ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_prealloc_space *pa;
+	struct ext4_group_info *grp;
+	struct ext4_inode_info *ei;
+
+	/* preallocate only when found space is larger then requested */
+	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+
+	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+	if (pa == NULL)
+		return -ENOMEM;
+
+	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
+		int winl;
+		int wins;
+		int win;
+		int offs;
+
+		/* we can't allocate as much as normalizer wants.
+		 * so, found space must get proper lstart
+		 * to cover original request */
+		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
+		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
+
+		/* we're limited by original request in that
+		 * logical block must be covered any way
+		 * winl is window we can move our chunk within */
+		winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+
+		/* also, we should cover whole original request */
+		wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
+
+		/* the smallest one defines real window */
+		win = min(winl, wins);
+
+		offs = ac->ac_o_ex.fe_logical %
+			EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+		if (offs && offs < win)
+			win = offs;
+
+		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+			EXT4_NUM_B2C(sbi, win);
+		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
+		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+	}
+
+	/* preallocation can change ac_b_ex, thus we store actually
+	 * allocated blocks for history */
+	ac->ac_f_ex = ac->ac_b_ex;
+
+	pa->pa_lstart = ac->ac_b_ex.fe_logical;
+	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+	pa->pa_len = ac->ac_b_ex.fe_len;
+	pa->pa_free = pa->pa_len;
+	atomic_set(&pa->pa_count, 1);
+	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
+	INIT_LIST_HEAD(&pa->pa_group_list);
+	pa->pa_deleted = 0;
+	pa->pa_type = MB_INODE_PA;
+
+	mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
+			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+	trace_ext4_mb_new_inode_pa(ac, pa);
+
+	ext4_mb_use_inode_pa(ac, pa);
+	atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
+
+	ei = EXT4_I(ac->ac_inode);
+	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+
+	pa->pa_obj_lock = &ei->i_prealloc_lock;
+	pa->pa_inode = ac->ac_inode;
+
+	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+	spin_lock(pa->pa_obj_lock);
+	list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
+	spin_unlock(pa->pa_obj_lock);
+
+	return 0;
+}
+
+/*
+ * creates new preallocated space for locality group inodes belongs to
+ */
+static noinline_for_stack int
+ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg;
+	struct ext4_prealloc_space *pa;
+	struct ext4_group_info *grp;
+
+	/* preallocate only when found space is larger then requested */
+	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+
+	BUG_ON(ext4_pspace_cachep == NULL);
+	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+	if (pa == NULL)
+		return -ENOMEM;
+
+	/* preallocation can change ac_b_ex, thus we store actually
+	 * allocated blocks for history */
+	ac->ac_f_ex = ac->ac_b_ex;
+
+	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+	pa->pa_lstart = pa->pa_pstart;
+	pa->pa_len = ac->ac_b_ex.fe_len;
+	pa->pa_free = pa->pa_len;
+	atomic_set(&pa->pa_count, 1);
+	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
+	INIT_LIST_HEAD(&pa->pa_group_list);
+	pa->pa_deleted = 0;
+	pa->pa_type = MB_GROUP_PA;
+
+	mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
+			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+	trace_ext4_mb_new_group_pa(ac, pa);
+
+	ext4_mb_use_group_pa(ac, pa);
+	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+
+	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+	lg = ac->ac_lg;
+	BUG_ON(lg == NULL);
+
+	pa->pa_obj_lock = &lg->lg_prealloc_lock;
+	pa->pa_inode = NULL;
+
+	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+	/*
+	 * We will later add the new pa to the right bucket
+	 * after updating the pa_free in ext4_mb_release_context
+	 */
+	return 0;
+}
+
+static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+{
+	int err;
+
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+		err = ext4_mb_new_group_pa(ac);
+	else
+		err = ext4_mb_new_inode_pa(ac);
+	return err;
+}
+
+/*
+ * finds all unused blocks in on-disk bitmap, frees them in
+ * in-core bitmap and buddy.
+ * @pa must be unlinked from inode and group lists, so that
+ * nobody else can find/use it.
+ * the caller MUST hold group/inode locks.
+ * TODO: optimize the case when there are no in-core structures yet
+ */
+static noinline_for_stack int
+ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
+			struct ext4_prealloc_space *pa)
+{
+	struct super_block *sb = e4b->bd_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned int end;
+	unsigned int next;
+	ext4_group_t group;
+	ext4_grpblk_t bit;
+	unsigned long long grp_blk_start;
+	int err = 0;
+	int free = 0;
+
+	BUG_ON(pa->pa_deleted == 0);
+	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+	grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
+	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+	end = bit + pa->pa_len;
+
+	while (bit < end) {
+		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
+		if (bit >= end)
+			break;
+		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
+		mb_debug(1, "    free preallocated %u/%u in group %u\n",
+			 (unsigned) ext4_group_first_block_no(sb, group) + bit,
+			 (unsigned) next - bit, (unsigned) group);
+		free += next - bit;
+
+		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+		trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+						    EXT4_C2B(sbi, bit)),
+					       next - bit);
+		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
+		bit = next + 1;
+	}
+	if (free != pa->pa_free) {
+		ext4_msg(e4b->bd_sb, KERN_CRIT,
+			 "pa %p: logic %lu, phys. %lu, len %lu",
+			 pa, (unsigned long) pa->pa_lstart,
+			 (unsigned long) pa->pa_pstart,
+			 (unsigned long) pa->pa_len);
+		ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
+					free, pa->pa_free);
+		/*
+		 * pa is already deleted so we use the value obtained
+		 * from the bitmap and continue.
+		 */
+	}
+	atomic_add(free, &sbi->s_mb_discarded);
+
+	return err;
+}
+
+static noinline_for_stack int
+ext4_mb_release_group_pa(struct ext4_buddy *e4b,
+				struct ext4_prealloc_space *pa)
+{
+	struct super_block *sb = e4b->bd_sb;
+	ext4_group_t group;
+	ext4_grpblk_t bit;
+
+	trace_ext4_mb_release_group_pa(sb, pa);
+	BUG_ON(pa->pa_deleted == 0);
+	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
+	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
+	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
+
+	return 0;
+}
+
+/*
+ * releases all preallocations in given group
+ *
+ * first, we need to decide discard policy:
+ * - when do we discard
+ *   1) ENOSPC
+ * - how many do we discard
+ *   1) how many requested
+ */
+static noinline_for_stack int
+ext4_mb_discard_group_preallocations(struct super_block *sb,
+					ext4_group_t group, int needed)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext4_prealloc_space *pa, *tmp;
+	struct list_head list;
+	struct ext4_buddy e4b;
+	int err;
+	int busy = 0;
+	int free = 0;
+
+	mb_debug(1, "discard preallocation for group %u\n", group);
+
+	if (list_empty(&grp->bb_prealloc_list))
+		return 0;
+
+	bitmap_bh = ext4_read_block_bitmap(sb, group);
+	if (bitmap_bh == NULL) {
+		ext4_error(sb, "Error reading block bitmap for %u", group);
+		return 0;
+	}
+
+	err = ext4_mb_load_buddy(sb, group, &e4b);
+	if (err) {
+		ext4_error(sb, "Error loading buddy information for %u", group);
+		put_bh(bitmap_bh);
+		return 0;
+	}
+
+	if (needed == 0)
+		needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+
+	INIT_LIST_HEAD(&list);
+repeat:
+	ext4_lock_group(sb, group);
+	list_for_each_entry_safe(pa, tmp,
+				&grp->bb_prealloc_list, pa_group_list) {
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			spin_unlock(&pa->pa_lock);
+			busy = 1;
+			continue;
+		}
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+
+		/* seems this one can be freed ... */
+		pa->pa_deleted = 1;
+
+		/* we can trust pa_free ... */
+		free += pa->pa_free;
+
+		spin_unlock(&pa->pa_lock);
+
+		list_del(&pa->pa_group_list);
+		list_add(&pa->u.pa_tmp_list, &list);
+	}
+
+	/* if we still need more blocks and some PAs were used, try again */
+	if (free < needed && busy) {
+		busy = 0;
+		ext4_unlock_group(sb, group);
+		cond_resched();
+		goto repeat;
+	}
+
+	/* found anything to free? */
+	if (list_empty(&list)) {
+		BUG_ON(free != 0);
+		goto out;
+	}
+
+	/* now free all selected PAs */
+	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+
+		/* remove from object (inode or locality group) */
+		spin_lock(pa->pa_obj_lock);
+		list_del_rcu(&pa->pa_inode_list);
+		spin_unlock(pa->pa_obj_lock);
+
+		if (pa->pa_type == MB_GROUP_PA)
+			ext4_mb_release_group_pa(&e4b, pa);
+		else
+			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+
+out:
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(&e4b);
+	put_bh(bitmap_bh);
+	return free;
+}
+
+/*
+ * releases all non-used preallocated blocks for given inode
+ *
+ * It's important to discard preallocations under i_data_sem
+ * We don't want another block to be served from the prealloc
+ * space when we are discarding the inode prealloc space.
+ *
+ * FIXME!! Make sure it is valid at all the call sites
+ */
+void ext4_discard_preallocations(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext4_prealloc_space *pa, *tmp;
+	ext4_group_t group = 0;
+	struct list_head list;
+	struct ext4_buddy e4b;
+	int err;
+
+	if (!S_ISREG(inode->i_mode)) {
+		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
+		return;
+	}
+
+	mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
+	trace_ext4_discard_preallocations(inode);
+
+	INIT_LIST_HEAD(&list);
+
+repeat:
+	/* first, collect all pa's in the inode */
+	spin_lock(&ei->i_prealloc_lock);
+	while (!list_empty(&ei->i_prealloc_list)) {
+		pa = list_entry(ei->i_prealloc_list.next,
+				struct ext4_prealloc_space, pa_inode_list);
+		BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			/* this shouldn't happen often - nobody should
+			 * use preallocation while we're discarding it */
+			spin_unlock(&pa->pa_lock);
+			spin_unlock(&ei->i_prealloc_lock);
+			ext4_msg(sb, KERN_ERR,
+				 "uh-oh! used pa while discarding");
+			WARN_ON(1);
+			schedule_timeout_uninterruptible(HZ);
+			goto repeat;
+
+		}
+		if (pa->pa_deleted == 0) {
+			pa->pa_deleted = 1;
+			spin_unlock(&pa->pa_lock);
+			list_del_rcu(&pa->pa_inode_list);
+			list_add(&pa->u.pa_tmp_list, &list);
+			continue;
+		}
+
+		/* someone is deleting pa right now */
+		spin_unlock(&pa->pa_lock);
+		spin_unlock(&ei->i_prealloc_lock);
+
+		/* we have to wait here because pa_deleted
+		 * doesn't mean pa is already unlinked from
+		 * the list. as we might be called from
+		 * ->clear_inode() the inode will get freed
+		 * and concurrent thread which is unlinking
+		 * pa from inode's list may access already
+		 * freed memory, bad-bad-bad */
+
+		/* XXX: if this happens too often, we can
+		 * add a flag to force wait only in case
+		 * of ->clear_inode(), but not in case of
+		 * regular truncate */
+		schedule_timeout_uninterruptible(HZ);
+		goto repeat;
+	}
+	spin_unlock(&ei->i_prealloc_lock);
+
+	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+		BUG_ON(pa->pa_type != MB_INODE_PA);
+		group = ext4_get_group_number(sb, pa->pa_pstart);
+
+		err = ext4_mb_load_buddy(sb, group, &e4b);
+		if (err) {
+			ext4_error(sb, "Error loading buddy information for %u",
+					group);
+			continue;
+		}
+
+		bitmap_bh = ext4_read_block_bitmap(sb, group);
+		if (bitmap_bh == NULL) {
+			ext4_error(sb, "Error reading block bitmap for %u",
+					group);
+			ext4_mb_unload_buddy(&e4b);
+			continue;
+		}
+
+		ext4_lock_group(sb, group);
+		list_del(&pa->pa_group_list);
+		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+		ext4_unlock_group(sb, group);
+
+		ext4_mb_unload_buddy(&e4b);
+		put_bh(bitmap_bh);
+
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+}
+
+#ifdef CONFIG_EXT4_DEBUG
+static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+	struct super_block *sb = ac->ac_sb;
+	ext4_group_t ngroups, i;
+
+	if (!ext4_mballoc_debug ||
+	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
+		return;
+
+	ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
+			" Allocation context details:");
+	ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
+			ac->ac_status, ac->ac_flags);
+	ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
+		 	"goal %lu/%lu/%lu@%lu, "
+			"best %lu/%lu/%lu@%lu cr %d",
+			(unsigned long)ac->ac_o_ex.fe_group,
+			(unsigned long)ac->ac_o_ex.fe_start,
+			(unsigned long)ac->ac_o_ex.fe_len,
+			(unsigned long)ac->ac_o_ex.fe_logical,
+			(unsigned long)ac->ac_g_ex.fe_group,
+			(unsigned long)ac->ac_g_ex.fe_start,
+			(unsigned long)ac->ac_g_ex.fe_len,
+			(unsigned long)ac->ac_g_ex.fe_logical,
+			(unsigned long)ac->ac_b_ex.fe_group,
+			(unsigned long)ac->ac_b_ex.fe_start,
+			(unsigned long)ac->ac_b_ex.fe_len,
+			(unsigned long)ac->ac_b_ex.fe_logical,
+			(int)ac->ac_criteria);
+	ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
+	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
+	ngroups = ext4_get_groups_count(sb);
+	for (i = 0; i < ngroups; i++) {
+		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+		struct ext4_prealloc_space *pa;
+		ext4_grpblk_t start;
+		struct list_head *cur;
+		ext4_lock_group(sb, i);
+		list_for_each(cur, &grp->bb_prealloc_list) {
+			pa = list_entry(cur, struct ext4_prealloc_space,
+					pa_group_list);
+			spin_lock(&pa->pa_lock);
+			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+						     NULL, &start);
+			spin_unlock(&pa->pa_lock);
+			printk(KERN_ERR "PA:%u:%d:%u \n", i,
+			       start, pa->pa_len);
+		}
+		ext4_unlock_group(sb, i);
+
+		if (grp->bb_free == 0)
+			continue;
+		printk(KERN_ERR "%u: %d/%d \n",
+		       i, grp->bb_free, grp->bb_fragments);
+	}
+	printk(KERN_ERR "\n");
+}
+#else
+static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+	return;
+}
+#endif
+
+/*
+ * We use locality group preallocation for small size file. The size of the
+ * file is determined by the current size or the resulting size after
+ * allocation which ever is larger
+ *
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
+ */
+static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	int bsbits = ac->ac_sb->s_blocksize_bits;
+	loff_t size, isize;
+
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return;
+
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		return;
+
+	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+		>> bsbits;
+
+	if ((size == isize) &&
+	    !ext4_fs_is_busy(sbi) &&
+	    (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+		return;
+	}
+
+	if (sbi->s_mb_group_prealloc <= 0) {
+		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+		return;
+	}
+
+	/* don't use group allocation for large files */
+	size = max(size, isize);
+	if (size > sbi->s_mb_stream_request) {
+		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+		return;
+	}
+
+	BUG_ON(ac->ac_lg != NULL);
+	/*
+	 * locality group prealloc space are per cpu. The reason for having
+	 * per cpu locality group is to reduce the contention between block
+	 * request from multiple CPUs.
+	 */
+	ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
+
+	/* we're going to use group allocation */
+	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
+
+	/* serialize all allocations in the group */
+	mutex_lock(&ac->ac_lg->lg_mutex);
+}
+
+static noinline_for_stack int
+ext4_mb_initialize_context(struct ext4_allocation_context *ac,
+				struct ext4_allocation_request *ar)
+{
+	struct super_block *sb = ar->inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_group_t group;
+	unsigned int len;
+	ext4_fsblk_t goal;
+	ext4_grpblk_t block;
+
+	/* we can't allocate > group size */
+	len = ar->len;
+
+	/* just a dirty hack to filter too big requests  */
+	if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
+		len = EXT4_CLUSTERS_PER_GROUP(sb);
+
+	/* start searching from the goal */
+	goal = ar->goal;
+	if (goal < le32_to_cpu(es->s_first_data_block) ||
+			goal >= ext4_blocks_count(es))
+		goal = le32_to_cpu(es->s_first_data_block);
+	ext4_get_group_no_and_offset(sb, goal, &group, &block);
+
+	/* set up allocation goals */
+	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
+	ac->ac_status = AC_STATUS_CONTINUE;
+	ac->ac_sb = sb;
+	ac->ac_inode = ar->inode;
+	ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
+	ac->ac_o_ex.fe_group = group;
+	ac->ac_o_ex.fe_start = block;
+	ac->ac_o_ex.fe_len = len;
+	ac->ac_g_ex = ac->ac_o_ex;
+	ac->ac_flags = ar->flags;
+
+	/* we have to define context: we'll we work with a file or
+	 * locality group. this is a policy, actually */
+	ext4_mb_group_or_file(ac);
+
+	mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+			"left: %u/%u, right %u/%u to %swritable\n",
+			(unsigned) ar->len, (unsigned) ar->logical,
+			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
+			(unsigned) ar->lleft, (unsigned) ar->pleft,
+			(unsigned) ar->lright, (unsigned) ar->pright,
+			atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+	return 0;
+
+}
+
+static noinline_for_stack void
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
+					struct ext4_locality_group *lg,
+					int order, int total_entries)
+{
+	ext4_group_t group = 0;
+	struct ext4_buddy e4b;
+	struct list_head discard_list;
+	struct ext4_prealloc_space *pa, *tmp;
+
+	mb_debug(1, "discard locality group preallocation\n");
+
+	INIT_LIST_HEAD(&discard_list);
+
+	spin_lock(&lg->lg_prealloc_lock);
+	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			/*
+			 * This is the pa that we just used
+			 * for block allocation. So don't
+			 * free that
+			 */
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		/* only lg prealloc space */
+		BUG_ON(pa->pa_type != MB_GROUP_PA);
+
+		/* seems this one can be freed ... */
+		pa->pa_deleted = 1;
+		spin_unlock(&pa->pa_lock);
+
+		list_del_rcu(&pa->pa_inode_list);
+		list_add(&pa->u.pa_tmp_list, &discard_list);
+
+		total_entries--;
+		if (total_entries <= 5) {
+			/*
+			 * we want to keep only 5 entries
+			 * allowing it to grow to 8. This
+			 * mak sure we don't call discard
+			 * soon for this list.
+			 */
+			break;
+		}
+	}
+	spin_unlock(&lg->lg_prealloc_lock);
+
+	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+
+		group = ext4_get_group_number(sb, pa->pa_pstart);
+		if (ext4_mb_load_buddy(sb, group, &e4b)) {
+			ext4_error(sb, "Error loading buddy information for %u",
+					group);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		list_del(&pa->pa_group_list);
+		ext4_mb_release_group_pa(&e4b, pa);
+		ext4_unlock_group(sb, group);
+
+		ext4_mb_unload_buddy(&e4b);
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+}
+
+/*
+ * We have incremented pa_count. So it cannot be freed at this
+ * point. Also we hold lg_mutex. So no parallel allocation is
+ * possible from this lg. That means pa_free cannot be updated.
+ *
+ * A parallel ext4_mb_discard_group_preallocations is possible.
+ * which can cause the lg_prealloc_list to be updated.
+ */
+
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+{
+	int order, added = 0, lg_prealloc_count = 1;
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg = ac->ac_lg;
+	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
+
+	order = fls(pa->pa_free) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+	/* Add the prealloc space to lg */
+	spin_lock(&lg->lg_prealloc_lock);
+	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted) {
+			spin_unlock(&tmp_pa->pa_lock);
+			continue;
+		}
+		if (!added && pa->pa_free < tmp_pa->pa_free) {
+			/* Add to the tail of the previous entry */
+			list_add_tail_rcu(&pa->pa_inode_list,
+						&tmp_pa->pa_inode_list);
+			added = 1;
+			/*
+			 * we want to count the total
+			 * number of entries in the list
+			 */
+		}
+		spin_unlock(&tmp_pa->pa_lock);
+		lg_prealloc_count++;
+	}
+	if (!added)
+		list_add_tail_rcu(&pa->pa_inode_list,
+					&lg->lg_prealloc_list[order]);
+	spin_unlock(&lg->lg_prealloc_lock);
+
+	/* Now trim the list to be not more than 8 elements */
+	if (lg_prealloc_count > 8) {
+		ext4_mb_discard_lg_preallocations(sb, lg,
+						  order, lg_prealloc_count);
+		return;
+	}
+	return ;
+}
+
+/*
+ * release all resource we used in allocation
+ */
+static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	if (pa) {
+		if (pa->pa_type == MB_GROUP_PA) {
+			/* see comment in ext4_mb_use_group_pa() */
+			spin_lock(&pa->pa_lock);
+			pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+			pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+			pa->pa_free -= ac->ac_b_ex.fe_len;
+			pa->pa_len -= ac->ac_b_ex.fe_len;
+			spin_unlock(&pa->pa_lock);
+		}
+	}
+	if (pa) {
+		/*
+		 * We want to add the pa to the right bucket.
+		 * Remove it from the list and while adding
+		 * make sure the list to which we are adding
+		 * doesn't grow big.
+		 */
+		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+			spin_lock(pa->pa_obj_lock);
+			list_del_rcu(&pa->pa_inode_list);
+			spin_unlock(pa->pa_obj_lock);
+			ext4_mb_add_n_trim(ac);
+		}
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
+	}
+	if (ac->ac_bitmap_page)
+		page_cache_release(ac->ac_bitmap_page);
+	if (ac->ac_buddy_page)
+		page_cache_release(ac->ac_buddy_page);
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+		mutex_unlock(&ac->ac_lg->lg_mutex);
+	ext4_mb_collect_stats(ac);
+	return 0;
+}
+
+static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
+{
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+	int ret;
+	int freed = 0;
+
+	trace_ext4_mb_discard_preallocations(sb, needed);
+	for (i = 0; i < ngroups && needed > 0; i++) {
+		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
+		freed += ret;
+		needed -= ret;
+	}
+
+	return freed;
+}
+
+/*
+ * Main entry point into mballoc to allocate blocks
+ * it tries to use preallocation first, then falls back
+ * to usual allocation
+ */
+ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+				struct ext4_allocation_request *ar, int *errp)
+{
+	int freed;
+	struct ext4_allocation_context *ac = NULL;
+	struct ext4_sb_info *sbi;
+	struct super_block *sb;
+	ext4_fsblk_t block = 0;
+	unsigned int inquota = 0;
+	unsigned int reserv_clstrs = 0;
+
+	might_sleep();
+	sb = ar->inode->i_sb;
+	sbi = EXT4_SB(sb);
+
+	trace_ext4_request_blocks(ar);
+
+	/* Allow to use superuser reservation for quota file */
+	if (IS_NOQUOTA(ar->inode))
+		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
+
+	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
+		/* Without delayed allocation we need to verify
+		 * there is enough free blocks to do block allocation
+		 * and verify allocation doesn't exceed the quota limits.
+		 */
+		while (ar->len &&
+			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
+
+			/* let others to free the space */
+			cond_resched();
+			ar->len = ar->len >> 1;
+		}
+		if (!ar->len) {
+			*errp = -ENOSPC;
+			return 0;
+		}
+		reserv_clstrs = ar->len;
+		if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+			dquot_alloc_block_nofail(ar->inode,
+						 EXT4_C2B(sbi, ar->len));
+		} else {
+			while (ar->len &&
+				dquot_alloc_block(ar->inode,
+						  EXT4_C2B(sbi, ar->len))) {
+
+				ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+				ar->len--;
+			}
+		}
+		inquota = ar->len;
+		if (ar->len == 0) {
+			*errp = -EDQUOT;
+			goto out;
+		}
+	}
+
+	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
+	if (!ac) {
+		ar->len = 0;
+		*errp = -ENOMEM;
+		goto out;
+	}
+
+	*errp = ext4_mb_initialize_context(ac, ar);
+	if (*errp) {
+		ar->len = 0;
+		goto out;
+	}
+
+	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
+	if (!ext4_mb_use_preallocated(ac)) {
+		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
+		ext4_mb_normalize_request(ac, ar);
+repeat:
+		/* allocate space in core */
+		*errp = ext4_mb_regular_allocator(ac);
+		if (*errp)
+			goto discard_and_exit;
+
+		/* as we've just preallocated more space than
+		 * user requested originally, we store allocated
+		 * space in a special descriptor */
+		if (ac->ac_status == AC_STATUS_FOUND &&
+		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+			*errp = ext4_mb_new_preallocation(ac);
+		if (*errp) {
+		discard_and_exit:
+			ext4_discard_allocated_blocks(ac);
+			goto errout;
+		}
+	}
+	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
+		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
+		if (*errp == -EAGAIN) {
+			/*
+			 * drop the reference that we took
+			 * in ext4_mb_use_best_found
+			 */
+			ext4_mb_release_context(ac);
+			ac->ac_b_ex.fe_group = 0;
+			ac->ac_b_ex.fe_start = 0;
+			ac->ac_b_ex.fe_len = 0;
+			ac->ac_status = AC_STATUS_CONTINUE;
+			goto repeat;
+		} else if (*errp) {
+			ext4_discard_allocated_blocks(ac);
+			goto errout;
+		} else {
+			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+			ar->len = ac->ac_b_ex.fe_len;
+		}
+	} else {
+		freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
+		if (freed)
+			goto repeat;
+		*errp = -ENOSPC;
+	}
+
+errout:
+	if (*errp) {
+		ac->ac_b_ex.fe_len = 0;
+		ar->len = 0;
+		ext4_mb_show_ac(ac);
+	}
+	ext4_mb_release_context(ac);
+out:
+	if (ac)
+		kmem_cache_free(ext4_ac_cachep, ac);
+	if (inquota && ar->len < inquota)
+		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
+	if (!ar->len) {
+		if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
+			/* release all the reserved blocks if non delalloc */
+			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+						reserv_clstrs);
+	}
+
+	trace_ext4_allocate_blocks(ar, (unsigned long long)block);
+
+	return block;
+}
+
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+			struct ext4_free_data *entry2)
+{
+	if ((entry1->efd_tid == entry2->efd_tid) &&
+	    (entry1->efd_group == entry2->efd_group) &&
+	    ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
+		return 1;
+	return 0;
+}
+
+static noinline_for_stack int
+ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
+		      struct ext4_free_data *new_entry)
+{
+	ext4_group_t group = e4b->bd_group;
+	ext4_grpblk_t cluster;
+	struct ext4_free_data *entry;
+	struct ext4_group_info *db = e4b->bd_info;
+	struct super_block *sb = e4b->bd_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct rb_node **n = &db->bb_free_root.rb_node, *node;
+	struct rb_node *parent = NULL, *new_node;
+
+	BUG_ON(!ext4_handle_valid(handle));
+	BUG_ON(e4b->bd_bitmap_page == NULL);
+	BUG_ON(e4b->bd_buddy_page == NULL);
+
+	new_node = &new_entry->efd_node;
+	cluster = new_entry->efd_start_cluster;
+
+	if (!*n) {
+		/* first free block exent. We need to
+		   protect buddy cache from being freed,
+		 * otherwise we'll refresh it from
+		 * on-disk bitmap and lose not-yet-available
+		 * blocks */
+		page_cache_get(e4b->bd_buddy_page);
+		page_cache_get(e4b->bd_bitmap_page);
+	}
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct ext4_free_data, efd_node);
+		if (cluster < entry->efd_start_cluster)
+			n = &(*n)->rb_left;
+		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
+			n = &(*n)->rb_right;
+		else {
+			ext4_grp_locked_error(sb, group, 0,
+				ext4_group_first_block_no(sb, group) +
+				EXT4_C2B(sbi, cluster),
+				"Block already on to-be-freed list");
+			return 0;
+		}
+	}
+
+	rb_link_node(new_node, parent, n);
+	rb_insert_color(new_node, &db->bb_free_root);
+
+	/* Now try to see the extent can be merged to left and right */
+	node = rb_prev(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_free_data, efd_node);
+		if (can_merge(entry, new_entry) &&
+		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+			new_entry->efd_start_cluster = entry->efd_start_cluster;
+			new_entry->efd_count += entry->efd_count;
+			rb_erase(node, &(db->bb_free_root));
+			kmem_cache_free(ext4_free_data_cachep, entry);
+		}
+	}
+
+	node = rb_next(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_free_data, efd_node);
+		if (can_merge(new_entry, entry) &&
+		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+			new_entry->efd_count += entry->efd_count;
+			rb_erase(node, &(db->bb_free_root));
+			kmem_cache_free(ext4_free_data_cachep, entry);
+		}
+	}
+	/* Add the extent to transaction's private list */
+	ext4_journal_callback_add(handle, ext4_free_data_callback,
+				  &new_entry->efd_jce);
+	return 0;
+}
+
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle:		handle for this transaction
+ * @inode:		inode
+ * @block:		start physical block to free
+ * @count:		number of blocks to count
+ * @flags:		flags used by ext4_free_blocks
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+		      struct buffer_head *bh, ext4_fsblk_t block,
+		      unsigned long count, int flags)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct super_block *sb = inode->i_sb;
+	struct ext4_group_desc *gdp;
+	unsigned int overflow;
+	ext4_grpblk_t bit;
+	struct buffer_head *gd_bh;
+	ext4_group_t block_group;
+	struct ext4_sb_info *sbi;
+	struct ext4_buddy e4b;
+	unsigned int count_clusters;
+	int err = 0;
+	int ret;
+
+	might_sleep();
+	if (bh) {
+		if (block)
+			BUG_ON(block != bh->b_blocknr);
+		else
+			block = bh->b_blocknr;
+	}
+
+	sbi = EXT4_SB(sb);
+	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+	    !ext4_data_block_valid(sbi, block, count)) {
+		ext4_error(sb, "Freeing blocks not in datazone - "
+			   "block = %llu, count = %lu", block, count);
+		goto error_return;
+	}
+
+	ext4_debug("freeing block %llu\n", block);
+	trace_ext4_free_blocks(inode, block, count, flags);
+
+	if (flags & EXT4_FREE_BLOCKS_FORGET) {
+		struct buffer_head *tbh = bh;
+		int i;
+
+		BUG_ON(bh && (count > 1));
+
+		for (i = 0; i < count; i++) {
+			cond_resched();
+			if (!bh)
+				tbh = sb_find_get_block(inode->i_sb,
+							block + i);
+			if (!tbh)
+				continue;
+			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+				    inode, tbh, block + i);
+		}
+	}
+
+	/*
+	 * We need to make sure we don't reuse the freed block until
+	 * after the transaction is committed, which we can do by
+	 * treating the block as metadata, below.  We make an
+	 * exception if the inode is to be written in writeback mode
+	 * since writeback mode has weak data consistency guarantees.
+	 */
+	if (!ext4_should_writeback_data(inode))
+		flags |= EXT4_FREE_BLOCKS_METADATA;
+
+	/*
+	 * If the extent to be freed does not begin on a cluster
+	 * boundary, we need to deal with partial clusters at the
+	 * beginning and end of the extent.  Normally we will free
+	 * blocks at the beginning or the end unless we are explicitly
+	 * requested to avoid doing so.
+	 */
+	overflow = EXT4_PBLK_COFF(sbi, block);
+	if (overflow) {
+		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+			overflow = sbi->s_cluster_ratio - overflow;
+			block += overflow;
+			if (count > overflow)
+				count -= overflow;
+			else
+				return;
+		} else {
+			block -= overflow;
+			count += overflow;
+		}
+	}
+	overflow = EXT4_LBLK_COFF(sbi, count);
+	if (overflow) {
+		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+			if (count > overflow)
+				count -= overflow;
+			else
+				return;
+		} else
+			count += sbi->s_cluster_ratio - overflow;
+	}
+
+do_more:
+	overflow = 0;
+	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+
+	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
+			ext4_get_group_info(sb, block_group))))
+		return;
+
+	/*
+	 * Check to see if we are freeing blocks across a group
+	 * boundary.
+	 */
+	if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+		overflow = EXT4_C2B(sbi, bit) + count -
+			EXT4_BLOCKS_PER_GROUP(sb);
+		count -= overflow;
+	}
+	count_clusters = EXT4_NUM_B2C(sbi, count);
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+	if (!bitmap_bh) {
+		err = -EIO;
+		goto error_return;
+	}
+	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
+	if (!gdp) {
+		err = -EIO;
+		goto error_return;
+	}
+
+	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
+	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
+	    in_range(block, ext4_inode_table(sb, gdp),
+		     EXT4_SB(sb)->s_itb_per_group) ||
+	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
+		     EXT4_SB(sb)->s_itb_per_group)) {
+
+		ext4_error(sb, "Freeing blocks in system zone - "
+			   "Block = %llu, count = %lu", block, count);
+		/* err = 0. ext4_std_error should be a no op */
+		goto error_return;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "getting write access");
+	err = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (err)
+		goto error_return;
+
+	/*
+	 * We are about to modify some metadata.  Call the journal APIs
+	 * to unshare ->b_data if a currently-committing transaction is
+	 * using it
+	 */
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gd_bh);
+	if (err)
+		goto error_return;
+#ifdef AGGRESSIVE_CHECK
+	{
+		int i;
+		for (i = 0; i < count_clusters; i++)
+			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
+	}
+#endif
+	trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+
+	err = ext4_mb_load_buddy(sb, block_group, &e4b);
+	if (err)
+		goto error_return;
+
+	if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
+		struct ext4_free_data *new_entry;
+		/*
+		 * blocks being freed are metadata. these blocks shouldn't
+		 * be used until this transaction is committed
+		 */
+	retry:
+		new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
+		if (!new_entry) {
+			/*
+			 * We use a retry loop because
+			 * ext4_free_blocks() is not allowed to fail.
+			 */
+			cond_resched();
+			congestion_wait(BLK_RW_ASYNC, HZ/50);
+			goto retry;
+		}
+		new_entry->efd_start_cluster = bit;
+		new_entry->efd_group = block_group;
+		new_entry->efd_count = count_clusters;
+		new_entry->efd_tid = handle->h_transaction->t_tid;
+
+		ext4_lock_group(sb, block_group);
+		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+		ext4_mb_free_metadata(handle, &e4b, new_entry);
+	} else {
+		/* need to update group_info->bb_free and bitmap
+		 * with group lock held. generate_buddy look at
+		 * them with group lock_held
+		 */
+		if (test_opt(sb, DISCARD)) {
+			err = ext4_issue_discard(sb, block_group, bit, count);
+			if (err && err != -EOPNOTSUPP)
+				ext4_msg(sb, KERN_WARNING, "discard request in"
+					 " group:%d block:%d count:%lu failed"
+					 " with %d", block_group, bit, count,
+					 err);
+		} else
+			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+
+		ext4_lock_group(sb, block_group);
+		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+		mb_free_blocks(inode, &e4b, bit, count_clusters);
+	}
+
+	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
+	ext4_free_group_clusters_set(sb, gdp, ret);
+	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+	ext4_group_desc_csum_set(sb, block_group, gdp);
+	ext4_unlock_group(sb, block_group);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		atomic64_add(count_clusters,
+			     &sbi->s_flex_groups[flex_group].free_clusters);
+	}
+
+	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+	percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+
+	ext4_mb_unload_buddy(&e4b);
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+	if (!err)
+		err = ret;
+
+	if (overflow && !err) {
+		block += count;
+		count = overflow;
+		put_bh(bitmap_bh);
+		goto do_more;
+	}
+error_return:
+	brelse(bitmap_bh);
+	ext4_std_error(sb, err);
+	return;
+}
+
+/**
+ * ext4_group_add_blocks() -- Add given blocks to an existing group
+ * @handle:			handle to this transaction
+ * @sb:				super block
+ * @block:			start physical block to add to the block group
+ * @count:			number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+			 ext4_fsblk_t block, unsigned long count)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gd_bh;
+	ext4_group_t block_group;
+	ext4_grpblk_t bit;
+	unsigned int i;
+	struct ext4_group_desc *desc;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_buddy e4b;
+	int err = 0, ret, blk_free_count;
+	ext4_grpblk_t blocks_freed;
+
+	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+	if (count == 0)
+		return 0;
+
+	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+	/*
+	 * Check to see if we are freeing blocks across a group
+	 * boundary.
+	 */
+	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+		ext4_warning(sb, "too much blocks added to group %u\n",
+			     block_group);
+		err = -EINVAL;
+		goto error_return;
+	}
+
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+	if (!bitmap_bh) {
+		err = -EIO;
+		goto error_return;
+	}
+
+	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+	if (!desc) {
+		err = -EIO;
+		goto error_return;
+	}
+
+	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+	    in_range(block + count - 1, ext4_inode_table(sb, desc),
+		     sbi->s_itb_per_group)) {
+		ext4_error(sb, "Adding blocks in system zones - "
+			   "Block = %llu, count = %lu",
+			   block, count);
+		err = -EINVAL;
+		goto error_return;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "getting write access");
+	err = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (err)
+		goto error_return;
+
+	/*
+	 * We are about to modify some metadata.  Call the journal APIs
+	 * to unshare ->b_data if a currently-committing transaction is
+	 * using it
+	 */
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gd_bh);
+	if (err)
+		goto error_return;
+
+	for (i = 0, blocks_freed = 0; i < count; i++) {
+		BUFFER_TRACE(bitmap_bh, "clear bit");
+		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+			ext4_error(sb, "bit already cleared for block %llu",
+				   (ext4_fsblk_t)(block + i));
+			BUFFER_TRACE(bitmap_bh, "bit already cleared");
+		} else {
+			blocks_freed++;
+		}
+	}
+
+	err = ext4_mb_load_buddy(sb, block_group, &e4b);
+	if (err)
+		goto error_return;
+
+	/*
+	 * need to update group_info->bb_free and bitmap
+	 * with group lock held. generate_buddy look at
+	 * them with group lock_held
+	 */
+	ext4_lock_group(sb, block_group);
+	mb_clear_bits(bitmap_bh->b_data, bit, count);
+	mb_free_blocks(NULL, &e4b, bit, count);
+	blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
+	ext4_free_group_clusters_set(sb, desc, blk_free_count);
+	ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
+	ext4_group_desc_csum_set(sb, block_group, desc);
+	ext4_unlock_group(sb, block_group);
+	percpu_counter_add(&sbi->s_freeclusters_counter,
+			   EXT4_NUM_B2C(sbi, blocks_freed));
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
+	}
+
+	ext4_mb_unload_buddy(&e4b);
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+	if (!err)
+		err = ret;
+
+error_return:
+	brelse(bitmap_bh);
+	ext4_std_error(sb, err);
+	return err;
+}
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:		super block for the file system
+ * @start:	starting block of the free extent in the alloc. group
+ * @count:	number of blocks to TRIM
+ * @group:	alloc. group we are working with
+ * @e4b:	ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark it as used in buddy bitmap. This must
+ * be called with under the group lock.
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+			     ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
+{
+	struct ext4_free_extent ex;
+	int ret = 0;
+
+	trace_ext4_trim_extent(sb, group, start, count);
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	/*
+	 * Mark blocks used, so no one can reuse them while
+	 * being trimmed.
+	 */
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+	ret = ext4_issue_discard(sb, group, start, count);
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+	return ret;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @group:		group to be trimmed
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's buddy bitmap searching for free
+ * extents. When the free block is found, ext4_trim_extent is called to TRIM
+ * the extent.
+ *
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When the free extent is found, mark it as used in group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap. This is done until whole group is scanned.
+ */
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+		   ext4_grpblk_t start, ext4_grpblk_t max,
+		   ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0, free_count = 0;
+	struct ext4_buddy e4b;
+	int ret = 0;
+
+	trace_ext4_trim_all_free(sb, group, start, max);
+
+	ret = ext4_mb_load_buddy(sb, group, &e4b);
+	if (ret) {
+		ext4_error(sb, "Error in loading buddy "
+				"information for %u", group);
+		return ret;
+	}
+	bitmap = e4b.bd_bitmap;
+
+	ext4_lock_group(sb, group);
+	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
+		goto out;
+
+	start = (e4b.bd_info->bb_first_free > start) ?
+		e4b.bd_info->bb_first_free : start;
+
+	while (start <= max) {
+		start = mb_find_next_zero_bit(bitmap, max + 1, start);
+		if (start > max)
+			break;
+		next = mb_find_next_bit(bitmap, max + 1, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start,
+					       next - start, group, &e4b);
+			if (ret && ret != -EOPNOTSUPP)
+				break;
+			ret = 0;
+			count += next - start;
+		}
+		free_count += next - start;
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
+
+		if ((e4b.bd_info->bb_free - free_count) < minblocks)
+			break;
+	}
+
+	if (!ret) {
+		ret = count;
+		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+	}
+out:
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(&e4b);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	return ret;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:			superblock for filesystem
+ * @range:		fstrim_range structure
+ *
+ * start:	First Byte to trim
+ * len:		number of Bytes to trim from start
+ * minlen:	minimum extent length in Bytes
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext4_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ext4_group_info *grp;
+	ext4_group_t group, first_group, last_group;
+	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
+	uint64_t start, end, minlen, trimmed = 0;
+	ext4_fsblk_t first_data_blk =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
+	int ret = 0;
+
+	start = range->start >> sb->s_blocksize_bits;
+	end = start + (range->len >> sb->s_blocksize_bits) - 1;
+	minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+			      range->minlen >> sb->s_blocksize_bits);
+
+	if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
+	    start >= max_blks ||
+	    range->len < sb->s_blocksize)
+		return -EINVAL;
+	if (end >= max_blks)
+		end = max_blks - 1;
+	if (end <= first_data_blk)
+		goto out;
+	if (start < first_data_blk)
+		start = first_data_blk;
+
+	/* Determine first and last group to examine based on start and end */
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+				     &first_group, &first_cluster);
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
+				     &last_group, &last_cluster);
+
+	/* end now represents the last cluster to discard in this group */
+	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+
+	for (group = first_group; group <= last_group; group++) {
+		grp = ext4_get_group_info(sb, group);
+		/* We only do this if the grp has never been initialized */
+		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+			ret = ext4_mb_init_group(sb, group);
+			if (ret)
+				break;
+		}
+
+		/*
+		 * For all the groups except the last one, last cluster will
+		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+		 * change it for the last group, note that last_cluster is
+		 * already computed earlier by ext4_get_group_no_and_offset()
+		 */
+		if (group == last_group)
+			end = last_cluster;
+
+		if (grp->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, group, first_cluster,
+						end, minlen);
+			if (cnt < 0) {
+				ret = cnt;
+				break;
+			}
+			trimmed += cnt;
+		}
+
+		/*
+		 * For every group except the first one, we are sure
+		 * that the first cluster to discard will be cluster #0.
+		 */
+		first_cluster = 0;
+	}
+
+	if (!ret)
+		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+
+out:
+	range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
+	return ret;
+}
diff --git a/kernel/fs/ext4/mballoc.h b/kernel/fs/ext4/mballoc.h
new file mode 100644
index 000000000..d634e183b
--- /dev/null
+++ b/kernel/fs/ext4/mballoc.h
@@ -0,0 +1,215 @@
+/*
+ *  fs/ext4/mballoc.h
+ *
+ *  Written by: Alex Tomas <alex@clusterfs.com>
+ *
+ */
+#ifndef _EXT4_MBALLOC_H
+#define _EXT4_MBALLOC_H
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+
+/*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ */
+#define DOUBLE_CHECK__
+
+/*
+ */
+#ifdef CONFIG_EXT4_DEBUG
+extern ushort ext4_mballoc_debug;
+
+#define mb_debug(n, fmt, a...)	                                        \
+	do {								\
+		if ((n) <= ext4_mballoc_debug) {		        \
+			printk(KERN_DEBUG "(%s, %d): %s: ",		\
+			       __FILE__, __LINE__, __func__);		\
+			printk(fmt, ## a);				\
+		}							\
+	} while (0)
+#else
+#define mb_debug(n, fmt, a...)		no_printk(fmt, ## a)
+#endif
+
+#define EXT4_MB_HISTORY_ALLOC		1	/* allocation */
+#define EXT4_MB_HISTORY_PREALLOC	2	/* preallocated blocks used */
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define MB_DEFAULT_MAX_TO_SCAN		200
+
+/*
+ * How long mballoc must look for a best extent
+ */
+#define MB_DEFAULT_MIN_TO_SCAN		10
+
+/*
+ * with 'ext4_mb_stats' allocator will collect stats that will be
+ * shown at umount. The collecting costs though!
+ */
+#define MB_DEFAULT_STATS		0
+
+/*
+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
+ * by the stream allocator, which purpose is to pack requests
+ * as close each to other as possible to produce smooth I/O traffic
+ * We use locality group prealloc space for stream request.
+ * We can tune the same via /proc/fs/ext4/<parition>/stream_req
+ */
+#define MB_DEFAULT_STREAM_THRESHOLD	16	/* 64K */
+
+/*
+ * for which requests use 2^N search using buddies
+ */
+#define MB_DEFAULT_ORDER2_REQS		2
+
+/*
+ * default group prealloc size 512 blocks
+ */
+#define MB_DEFAULT_GROUP_PREALLOC	512
+
+
+struct ext4_free_data {
+	/* MUST be the first member */
+	struct ext4_journal_cb_entry	efd_jce;
+
+	/* ext4_free_data private data starts from here */
+
+	/* this links the free block information from group_info */
+	struct rb_node			efd_node;
+
+	/* group which free block extent belongs */
+	ext4_group_t			efd_group;
+
+	/* free block extent */
+	ext4_grpblk_t			efd_start_cluster;
+	ext4_grpblk_t			efd_count;
+
+	/* transaction which freed this extent */
+	tid_t				efd_tid;
+};
+
+struct ext4_prealloc_space {
+	struct list_head	pa_inode_list;
+	struct list_head	pa_group_list;
+	union {
+		struct list_head pa_tmp_list;
+		struct rcu_head	pa_rcu;
+	} u;
+	spinlock_t		pa_lock;
+	atomic_t		pa_count;
+	unsigned		pa_deleted;
+	ext4_fsblk_t		pa_pstart;	/* phys. block */
+	ext4_lblk_t		pa_lstart;	/* log. block */
+	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
+	ext4_grpblk_t		pa_free;	/* how many blocks are free */
+	unsigned short		pa_type;	/* pa type. inode or group */
+	spinlock_t		*pa_obj_lock;
+	struct inode		*pa_inode;	/* hack, for history only */
+};
+
+enum {
+	MB_INODE_PA = 0,
+	MB_GROUP_PA = 1
+};
+
+struct ext4_free_extent {
+	ext4_lblk_t fe_logical;
+	ext4_grpblk_t fe_start;	/* In cluster units */
+	ext4_group_t fe_group;
+	ext4_grpblk_t fe_len;	/* In cluster units */
+};
+
+/*
+ * Locality group:
+ *   we try to group all related changes together
+ *   so that writeback can flush/allocate them together as well
+ *   Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ *   (512). We store prealloc space into the hash based on the pa_free blocks
+ *   order value.ie, fls(pa_free)-1;
+ */
+#define PREALLOC_TB_SIZE 10
+struct ext4_locality_group {
+	/* for allocator */
+	/* to serialize allocates */
+	struct mutex		lg_mutex;
+	/* list of preallocations */
+	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
+	spinlock_t		lg_prealloc_lock;
+};
+
+struct ext4_allocation_context {
+	struct inode *ac_inode;
+	struct super_block *ac_sb;
+
+	/* original request */
+	struct ext4_free_extent ac_o_ex;
+
+	/* goal request (normalized ac_o_ex) */
+	struct ext4_free_extent ac_g_ex;
+
+	/* the best found extent */
+	struct ext4_free_extent ac_b_ex;
+
+	/* copy of the best found extent taken before preallocation efforts */
+	struct ext4_free_extent ac_f_ex;
+
+	__u16 ac_groups_scanned;
+	__u16 ac_found;
+	__u16 ac_tail;
+	__u16 ac_buddy;
+	__u16 ac_flags;		/* allocation hints */
+	__u8 ac_status;
+	__u8 ac_criteria;
+	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
+				 * N > 0, the field stores N, otherwise 0 */
+	__u8 ac_op;		/* operation, for history only */
+	struct page *ac_bitmap_page;
+	struct page *ac_buddy_page;
+	struct ext4_prealloc_space *ac_pa;
+	struct ext4_locality_group *ac_lg;
+};
+
+#define AC_STATUS_CONTINUE	1
+#define AC_STATUS_FOUND		2
+#define AC_STATUS_BREAK		3
+
+struct ext4_buddy {
+	struct page *bd_buddy_page;
+	void *bd_buddy;
+	struct page *bd_bitmap_page;
+	void *bd_bitmap;
+	struct ext4_group_info *bd_info;
+	struct super_block *bd_sb;
+	__u16 bd_blkbits;
+	ext4_group_t bd_group;
+};
+
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+					struct ext4_free_extent *fex)
+{
+	return ext4_group_first_block_no(sb, fex->fe_group) +
+		(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
+}
+#endif
diff --git a/kernel/fs/ext4/migrate.c b/kernel/fs/ext4/migrate.c
new file mode 100644
index 000000000..b52374e42
--- /dev/null
+++ b/kernel/fs/ext4/migrate.c
@@ -0,0 +1,672 @@
+/*
+ * Copyright IBM Corporation, 2007
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+
+/*
+ * The contiguous blocks details which can be
+ * represented by a single extent
+ */
+struct migrate_struct {
+	ext4_lblk_t first_block, last_block, curr_block;
+	ext4_fsblk_t first_pblock, last_pblock;
+};
+
+static int finish_range(handle_t *handle, struct inode *inode,
+				struct migrate_struct *lb)
+
+{
+	int retval = 0, needed;
+	struct ext4_extent newext;
+	struct ext4_ext_path *path;
+	if (lb->first_pblock == 0)
+		return 0;
+
+	/* Add the extent to temp inode*/
+	newext.ee_block = cpu_to_le32(lb->first_block);
+	newext.ee_len   = cpu_to_le16(lb->last_block - lb->first_block + 1);
+	ext4_ext_store_pblock(&newext, lb->first_pblock);
+	/* Locking only for convinience since we are operating on temp inode */
+	down_write(&EXT4_I(inode)->i_data_sem);
+	path = ext4_find_extent(inode, lb->first_block, NULL, 0);
+	if (IS_ERR(path)) {
+		retval = PTR_ERR(path);
+		path = NULL;
+		goto err_out;
+	}
+
+	/*
+	 * Calculate the credit needed to inserting this extent
+	 * Since we are doing this in loop we may accumalate extra
+	 * credit. But below we try to not accumalate too much
+	 * of them by restarting the journal.
+	 */
+	needed = ext4_ext_calc_credits_for_single_extent(inode,
+		    lb->last_block - lb->first_block + 1, path);
+
+	/*
+	 * Make sure the credit we accumalated is not really high
+	 */
+	if (needed && ext4_handle_has_enough_credits(handle,
+						EXT4_RESERVE_TRANS_BLOCKS)) {
+		up_write((&EXT4_I(inode)->i_data_sem));
+		retval = ext4_journal_restart(handle, needed);
+		down_write((&EXT4_I(inode)->i_data_sem));
+		if (retval)
+			goto err_out;
+	} else if (needed) {
+		retval = ext4_journal_extend(handle, needed);
+		if (retval) {
+			/*
+			 * IF not able to extend the journal restart the journal
+			 */
+			up_write((&EXT4_I(inode)->i_data_sem));
+			retval = ext4_journal_restart(handle, needed);
+			down_write((&EXT4_I(inode)->i_data_sem));
+			if (retval)
+				goto err_out;
+		}
+	}
+	retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
+err_out:
+	up_write((&EXT4_I(inode)->i_data_sem));
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	lb->first_pblock = 0;
+	return retval;
+}
+
+static int update_extent_range(handle_t *handle, struct inode *inode,
+			       ext4_fsblk_t pblock, struct migrate_struct *lb)
+{
+	int retval;
+	/*
+	 * See if we can add on to the existing range (if it exists)
+	 */
+	if (lb->first_pblock &&
+		(lb->last_pblock+1 == pblock) &&
+		(lb->last_block+1 == lb->curr_block)) {
+		lb->last_pblock = pblock;
+		lb->last_block = lb->curr_block;
+		lb->curr_block++;
+		return 0;
+	}
+	/*
+	 * Start a new range.
+	 */
+	retval = finish_range(handle, inode, lb);
+	lb->first_pblock = lb->last_pblock = pblock;
+	lb->first_block = lb->last_block = lb->curr_block;
+	lb->curr_block++;
+	return retval;
+}
+
+static int update_ind_extent_range(handle_t *handle, struct inode *inode,
+				   ext4_fsblk_t pblock,
+				   struct migrate_struct *lb)
+{
+	struct buffer_head *bh;
+	__le32 *i_data;
+	int i, retval = 0;
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+	bh = sb_bread(inode->i_sb, pblock);
+	if (!bh)
+		return -EIO;
+
+	i_data = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (i_data[i]) {
+			retval = update_extent_range(handle, inode,
+						le32_to_cpu(i_data[i]), lb);
+			if (retval)
+				break;
+		} else {
+			lb->curr_block++;
+		}
+	}
+	put_bh(bh);
+	return retval;
+
+}
+
+static int update_dind_extent_range(handle_t *handle, struct inode *inode,
+				    ext4_fsblk_t pblock,
+				    struct migrate_struct *lb)
+{
+	struct buffer_head *bh;
+	__le32 *i_data;
+	int i, retval = 0;
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+	bh = sb_bread(inode->i_sb, pblock);
+	if (!bh)
+		return -EIO;
+
+	i_data = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (i_data[i]) {
+			retval = update_ind_extent_range(handle, inode,
+						le32_to_cpu(i_data[i]), lb);
+			if (retval)
+				break;
+		} else {
+			/* Only update the file block number */
+			lb->curr_block += max_entries;
+		}
+	}
+	put_bh(bh);
+	return retval;
+
+}
+
+static int update_tind_extent_range(handle_t *handle, struct inode *inode,
+				    ext4_fsblk_t pblock,
+				    struct migrate_struct *lb)
+{
+	struct buffer_head *bh;
+	__le32 *i_data;
+	int i, retval = 0;
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+	bh = sb_bread(inode->i_sb, pblock);
+	if (!bh)
+		return -EIO;
+
+	i_data = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (i_data[i]) {
+			retval = update_dind_extent_range(handle, inode,
+						le32_to_cpu(i_data[i]), lb);
+			if (retval)
+				break;
+		} else {
+			/* Only update the file block number */
+			lb->curr_block += max_entries * max_entries;
+		}
+	}
+	put_bh(bh);
+	return retval;
+
+}
+
+static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
+{
+	int retval = 0, needed;
+
+	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
+		return 0;
+	/*
+	 * We are freeing a blocks. During this we touch
+	 * superblock, group descriptor and block bitmap.
+	 * So allocate a credit of 3. We may update
+	 * quota (user and group).
+	 */
+	needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+
+	if (ext4_journal_extend(handle, needed) != 0)
+		retval = ext4_journal_restart(handle, needed);
+
+	return retval;
+}
+
+static int free_dind_blocks(handle_t *handle,
+				struct inode *inode, __le32 i_data)
+{
+	int i;
+	__le32 *tmp_idata;
+	struct buffer_head *bh;
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+	if (!bh)
+		return -EIO;
+
+	tmp_idata = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (tmp_idata[i]) {
+			extend_credit_for_blkdel(handle, inode);
+			ext4_free_blocks(handle, inode, NULL,
+					 le32_to_cpu(tmp_idata[i]), 1,
+					 EXT4_FREE_BLOCKS_METADATA |
+					 EXT4_FREE_BLOCKS_FORGET);
+		}
+	}
+	put_bh(bh);
+	extend_credit_for_blkdel(handle, inode);
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+			 EXT4_FREE_BLOCKS_METADATA |
+			 EXT4_FREE_BLOCKS_FORGET);
+	return 0;
+}
+
+static int free_tind_blocks(handle_t *handle,
+				struct inode *inode, __le32 i_data)
+{
+	int i, retval = 0;
+	__le32 *tmp_idata;
+	struct buffer_head *bh;
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+	if (!bh)
+		return -EIO;
+
+	tmp_idata = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (tmp_idata[i]) {
+			retval = free_dind_blocks(handle,
+					inode, tmp_idata[i]);
+			if (retval) {
+				put_bh(bh);
+				return retval;
+			}
+		}
+	}
+	put_bh(bh);
+	extend_credit_for_blkdel(handle, inode);
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+			 EXT4_FREE_BLOCKS_METADATA |
+			 EXT4_FREE_BLOCKS_FORGET);
+	return 0;
+}
+
+static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
+{
+	int retval;
+
+	/* ei->i_data[EXT4_IND_BLOCK] */
+	if (i_data[0]) {
+		extend_credit_for_blkdel(handle, inode);
+		ext4_free_blocks(handle, inode, NULL,
+				le32_to_cpu(i_data[0]), 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
+	}
+
+	/* ei->i_data[EXT4_DIND_BLOCK] */
+	if (i_data[1]) {
+		retval = free_dind_blocks(handle, inode, i_data[1]);
+		if (retval)
+			return retval;
+	}
+
+	/* ei->i_data[EXT4_TIND_BLOCK] */
+	if (i_data[2]) {
+		retval = free_tind_blocks(handle, inode, i_data[2]);
+		if (retval)
+			return retval;
+	}
+	return 0;
+}
+
+static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
+						struct inode *tmp_inode)
+{
+	int retval;
+	__le32	i_data[3];
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
+
+	/*
+	 * One credit accounted for writing the
+	 * i_data field of the original inode
+	 */
+	retval = ext4_journal_extend(handle, 1);
+	if (retval) {
+		retval = ext4_journal_restart(handle, 1);
+		if (retval)
+			goto err_out;
+	}
+
+	i_data[0] = ei->i_data[EXT4_IND_BLOCK];
+	i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
+	i_data[2] = ei->i_data[EXT4_TIND_BLOCK];
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	/*
+	 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
+	 * happened after we started the migrate. We need to
+	 * fail the migrate
+	 */
+	if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
+		retval = -EAGAIN;
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto err_out;
+	} else
+		ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+	/*
+	 * We have the extent map build with the tmp inode.
+	 * Now copy the i_data across
+	 */
+	ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+	memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
+
+	/*
+	 * Update i_blocks with the new blocks that got
+	 * allocated while adding extents for extent index
+	 * blocks.
+	 *
+	 * While converting to extents we need not
+	 * update the orignal inode i_blocks for extent blocks
+	 * via quota APIs. The quota update happened via tmp_inode already.
+	 */
+	spin_lock(&inode->i_lock);
+	inode->i_blocks += tmp_inode->i_blocks;
+	spin_unlock(&inode->i_lock);
+	up_write(&EXT4_I(inode)->i_data_sem);
+
+	/*
+	 * We mark the inode dirty after, because we decrement the
+	 * i_blocks when freeing the indirect meta-data blocks
+	 */
+	retval = free_ind_block(handle, inode, i_data);
+	ext4_mark_inode_dirty(handle, inode);
+
+err_out:
+	return retval;
+}
+
+static int free_ext_idx(handle_t *handle, struct inode *inode,
+					struct ext4_extent_idx *ix)
+{
+	int i, retval = 0;
+	ext4_fsblk_t block;
+	struct buffer_head *bh;
+	struct ext4_extent_header *eh;
+
+	block = ext4_idx_pblock(ix);
+	bh = sb_bread(inode->i_sb, block);
+	if (!bh)
+		return -EIO;
+
+	eh = (struct ext4_extent_header *)bh->b_data;
+	if (eh->eh_depth != 0) {
+		ix = EXT_FIRST_INDEX(eh);
+		for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
+			retval = free_ext_idx(handle, inode, ix);
+			if (retval)
+				break;
+		}
+	}
+	put_bh(bh);
+	extend_credit_for_blkdel(handle, inode);
+	ext4_free_blocks(handle, inode, NULL, block, 1,
+			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+	return retval;
+}
+
+/*
+ * Free the extent meta data blocks only
+ */
+static int free_ext_block(handle_t *handle, struct inode *inode)
+{
+	int i, retval = 0;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data;
+	struct ext4_extent_idx *ix;
+	if (eh->eh_depth == 0)
+		/*
+		 * No extra blocks allocated for extent meta data
+		 */
+		return 0;
+	ix = EXT_FIRST_INDEX(eh);
+	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
+		retval = free_ext_idx(handle, inode, ix);
+		if (retval)
+			return retval;
+	}
+	return retval;
+}
+
+int ext4_ext_migrate(struct inode *inode)
+{
+	handle_t *handle;
+	int retval = 0, i;
+	__le32 *i_data;
+	struct ext4_inode_info *ei;
+	struct inode *tmp_inode = NULL;
+	struct migrate_struct lb;
+	unsigned long max_entries;
+	__u32 goal;
+	uid_t owner[2];
+
+	/*
+	 * If the filesystem does not support extents, or the inode
+	 * already is extent-based, error out.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	    (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return -EINVAL;
+
+	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
+		/*
+		 * don't migrate fast symlink
+		 */
+		return retval;
+
+	/*
+	 * Worst case we can touch the allocation bitmaps, a bgd
+	 * block, and a block to link in the orphan list.  We do need
+	 * need to worry about credits for modifying the quota inode.
+	 */
+	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
+		4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+
+	if (IS_ERR(handle)) {
+		retval = PTR_ERR(handle);
+		return retval;
+	}
+	goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
+		EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
+	owner[0] = i_uid_read(inode);
+	owner[1] = i_gid_read(inode);
+	tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root),
+				   S_IFREG, NULL, goal, owner);
+	if (IS_ERR(tmp_inode)) {
+		retval = PTR_ERR(tmp_inode);
+		ext4_journal_stop(handle);
+		return retval;
+	}
+	i_size_write(tmp_inode, i_size_read(inode));
+	/*
+	 * Set the i_nlink to zero so it will be deleted later
+	 * when we drop inode reference.
+	 */
+	clear_nlink(tmp_inode);
+
+	ext4_ext_tree_init(handle, tmp_inode);
+	ext4_orphan_add(handle, tmp_inode);
+	ext4_journal_stop(handle);
+
+	/*
+	 * start with one credit accounted for
+	 * superblock modification.
+	 *
+	 * For the tmp_inode we already have committed the
+	 * transaction that created the inode. Later as and
+	 * when we add extents we extent the journal
+	 */
+	/*
+	 * Even though we take i_mutex we can still cause block
+	 * allocation via mmap write to holes. If we have allocated
+	 * new blocks we fail migrate.  New block allocation will
+	 * clear EXT4_STATE_EXT_MIGRATE flag.  The flag is updated
+	 * with i_data_sem held to prevent racing with block
+	 * allocation.
+	 */
+	down_read(&EXT4_I(inode)->i_data_sem);
+	ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+	up_read((&EXT4_I(inode)->i_data_sem));
+
+	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+	if (IS_ERR(handle)) {
+		/*
+		 * It is impossible to update on-disk structures without
+		 * a handle, so just rollback in-core changes and live other
+		 * work to orphan_list_cleanup()
+		 */
+		ext4_orphan_del(NULL, tmp_inode);
+		retval = PTR_ERR(handle);
+		goto out;
+	}
+
+	ei = EXT4_I(inode);
+	i_data = ei->i_data;
+	memset(&lb, 0, sizeof(lb));
+
+	/* 32 bit block address 4 bytes */
+	max_entries = inode->i_sb->s_blocksize >> 2;
+	for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
+		if (i_data[i]) {
+			retval = update_extent_range(handle, tmp_inode,
+						le32_to_cpu(i_data[i]), &lb);
+			if (retval)
+				goto err_out;
+		} else
+			lb.curr_block++;
+	}
+	if (i_data[EXT4_IND_BLOCK]) {
+		retval = update_ind_extent_range(handle, tmp_inode,
+				le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
+			if (retval)
+				goto err_out;
+	} else
+		lb.curr_block += max_entries;
+	if (i_data[EXT4_DIND_BLOCK]) {
+		retval = update_dind_extent_range(handle, tmp_inode,
+				le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
+			if (retval)
+				goto err_out;
+	} else
+		lb.curr_block += max_entries * max_entries;
+	if (i_data[EXT4_TIND_BLOCK]) {
+		retval = update_tind_extent_range(handle, tmp_inode,
+				le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
+			if (retval)
+				goto err_out;
+	}
+	/*
+	 * Build the last extent
+	 */
+	retval = finish_range(handle, tmp_inode, &lb);
+err_out:
+	if (retval)
+		/*
+		 * Failure case delete the extent information with the
+		 * tmp_inode
+		 */
+		free_ext_block(handle, tmp_inode);
+	else {
+		retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode);
+		if (retval)
+			/*
+			 * if we fail to swap inode data free the extent
+			 * details of the tmp inode
+			 */
+			free_ext_block(handle, tmp_inode);
+	}
+
+	/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
+	if (ext4_journal_extend(handle, 1) != 0)
+		ext4_journal_restart(handle, 1);
+
+	/*
+	 * Mark the tmp_inode as of size zero
+	 */
+	i_size_write(tmp_inode, 0);
+
+	/*
+	 * set the  i_blocks count to zero
+	 * so that the ext4_evict_inode() does the
+	 * right job
+	 *
+	 * We don't need to take the i_lock because
+	 * the inode is not visible to user space.
+	 */
+	tmp_inode->i_blocks = 0;
+
+	/* Reset the extent details */
+	ext4_ext_tree_init(handle, tmp_inode);
+	ext4_journal_stop(handle);
+out:
+	unlock_new_inode(tmp_inode);
+	iput(tmp_inode);
+
+	return retval;
+}
+
+/*
+ * Migrate a simple extent-based inode to use the i_blocks[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+	struct ext4_extent_header	*eh;
+	struct ext4_super_block		*es = EXT4_SB(inode->i_sb)->s_es;
+	struct ext4_inode_info		*ei = EXT4_I(inode);
+	struct ext4_extent		*ex;
+	unsigned int			i, len;
+	ext4_fsblk_t			blk;
+	handle_t			*handle;
+	int				ret;
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	    (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return -EINVAL;
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+		return -EOPNOTSUPP;
+
+	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ret = ext4_ext_check_inode(inode);
+	if (ret)
+		goto errout;
+
+	eh = ext_inode_hdr(inode);
+	ex  = EXT_FIRST_EXTENT(eh);
+	if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+	    eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+		ret = -EOPNOTSUPP;
+		goto errout;
+	}
+	if (eh->eh_entries == 0)
+		blk = len = 0;
+	else {
+		len = le16_to_cpu(ex->ee_len);
+		blk = ext4_ext_pblock(ex);
+		if (len > EXT4_NDIR_BLOCKS) {
+			ret = -EOPNOTSUPP;
+			goto errout;
+		}
+	}
+
+	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+	memset(ei->i_data, 0, sizeof(ei->i_data));
+	for (i=0; i < len; i++)
+		ei->i_data[i] = cpu_to_le32(blk++);
+	ext4_mark_inode_dirty(handle, inode);
+errout:
+	ext4_journal_stop(handle);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	return ret;
+}
diff --git a/kernel/fs/ext4/mmp.c b/kernel/fs/ext4/mmp.c
new file mode 100644
index 000000000..8313ca332
--- /dev/null
+++ b/kernel/fs/ext4/mmp.c
@@ -0,0 +1,393 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/* Checksumming functions */
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int offset = offsetof(struct mmp_struct, mmp_checksum);
+	__u32 csum;
+
+	csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+
+	return cpu_to_le32(csum);
+}
+
+static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+{
+	if (!ext4_has_metadata_csum(sb))
+		return 1;
+
+	return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
+}
+
+static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+{
+	if (!ext4_has_metadata_csum(sb))
+		return;
+
+	mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
+}
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
+{
+	struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+
+	/*
+	 * We protect against freezing so that we don't create dirty buffers
+	 * on frozen filesystem.
+	 */
+	sb_start_write(sb);
+	ext4_mmp_csum_set(sb, mmp);
+	mark_buffer_dirty(bh);
+	lock_buffer(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
+	wait_on_buffer(bh);
+	sb_end_write(sb);
+	if (unlikely(!buffer_uptodate(bh)))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+			  ext4_fsblk_t mmp_block)
+{
+	struct mmp_struct *mmp;
+
+	if (*bh)
+		clear_buffer_uptodate(*bh);
+
+	/* This would be sb_bread(sb, mmp_block), except we need to be sure
+	 * that the MD RAID device cache has been bypassed, and that the read
+	 * is not blocked in the elevator. */
+	if (!*bh)
+		*bh = sb_getblk(sb, mmp_block);
+	if (!*bh)
+		return -ENOMEM;
+	if (*bh) {
+		get_bh(*bh);
+		lock_buffer(*bh);
+		(*bh)->b_end_io = end_buffer_read_sync;
+		submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
+		wait_on_buffer(*bh);
+		if (!buffer_uptodate(*bh)) {
+			brelse(*bh);
+			*bh = NULL;
+		}
+	}
+	if (unlikely(!*bh)) {
+		ext4_warning(sb, "Error while reading MMP block %llu",
+			     mmp_block);
+		return -EIO;
+	}
+
+	mmp = (struct mmp_struct *)((*bh)->b_data);
+	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
+	    !ext4_mmp_csum_verify(sb, mmp))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+		    const char *function, unsigned int line, const char *msg)
+{
+	__ext4_warning(sb, function, line, msg);
+	__ext4_warning(sb, function, line,
+		       "MMP failure info: last update time: %llu, last update "
+		       "node: %s, last update device: %s\n",
+		       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+		       mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+	struct super_block *sb = ((struct mmpd_data *) data)->sb;
+	struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct mmp_struct *mmp;
+	ext4_fsblk_t mmp_block;
+	u32 seq = 0;
+	unsigned long failed_writes = 0;
+	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned mmp_check_interval;
+	unsigned long last_update_time;
+	unsigned long diff;
+	int retval;
+
+	mmp_block = le64_to_cpu(es->s_mmp_block);
+	mmp = (struct mmp_struct *)(bh->b_data);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+	/*
+	 * Start with the higher mmp_check_interval and reduce it if
+	 * the MMP block is being updated on time.
+	 */
+	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+				 EXT4_MMP_MIN_CHECK_INTERVAL);
+	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
+	       sizeof(mmp->mmp_nodename));
+
+	while (!kthread_should_stop()) {
+		if (++seq > EXT4_MMP_SEQ_MAX)
+			seq = 1;
+
+		mmp->mmp_seq = cpu_to_le32(seq);
+		mmp->mmp_time = cpu_to_le64(get_seconds());
+		last_update_time = jiffies;
+
+		retval = write_mmp_block(sb, bh);
+		/*
+		 * Don't spew too many error messages. Print one every
+		 * (s_mmp_update_interval * 60) seconds.
+		 */
+		if (retval) {
+			if ((failed_writes % 60) == 0)
+				ext4_error(sb, "Error writing to MMP block");
+			failed_writes++;
+		}
+
+		if (!(le32_to_cpu(es->s_feature_incompat) &
+		    EXT4_FEATURE_INCOMPAT_MMP)) {
+			ext4_warning(sb, "kmmpd being stopped since MMP feature"
+				     " has been disabled.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		if (sb->s_flags & MS_RDONLY) {
+			ext4_warning(sb, "kmmpd being stopped since filesystem "
+				     "has been remounted as readonly.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		diff = jiffies - last_update_time;
+		if (diff < mmp_update_interval * HZ)
+			schedule_timeout_interruptible(mmp_update_interval *
+						       HZ - diff);
+
+		/*
+		 * We need to make sure that more than mmp_check_interval
+		 * seconds have not passed since writing. If that has happened
+		 * we need to check if the MMP block is as we left it.
+		 */
+		diff = jiffies - last_update_time;
+		if (diff > mmp_check_interval * HZ) {
+			struct buffer_head *bh_check = NULL;
+			struct mmp_struct *mmp_check;
+
+			retval = read_mmp_block(sb, &bh_check, mmp_block);
+			if (retval) {
+				ext4_error(sb, "error reading MMP data: %d",
+					   retval);
+
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+
+			mmp_check = (struct mmp_struct *)(bh_check->b_data);
+			if (mmp->mmp_seq != mmp_check->mmp_seq ||
+			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+				   sizeof(mmp->mmp_nodename))) {
+				dump_mmp_msg(sb, mmp_check,
+					     "Error while updating MMP info. "
+					     "The filesystem seems to have been"
+					     " multiply mounted.");
+				ext4_error(sb, "abort");
+				goto failed;
+			}
+			put_bh(bh_check);
+		}
+
+		 /*
+		 * Adjust the mmp_check_interval depending on how much time
+		 * it took for the MMP block to be written.
+		 */
+		mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+					     EXT4_MMP_MAX_CHECK_INTERVAL),
+					 EXT4_MMP_MIN_CHECK_INTERVAL);
+		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	}
+
+	/*
+	 * Unmount seems to be clean.
+	 */
+	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+
+	retval = write_mmp_block(sb, bh);
+
+failed:
+	kfree(data);
+	brelse(bh);
+	return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+	u32 new_seq;
+
+	do {
+		new_seq = prandom_u32();
+	} while (new_seq > EXT4_MMP_SEQ_MAX);
+
+	return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+				    ext4_fsblk_t mmp_block)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct buffer_head *bh = NULL;
+	struct mmp_struct *mmp = NULL;
+	struct mmpd_data *mmpd_data;
+	u32 seq;
+	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned int wait_time = 0;
+	int retval;
+
+	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+	    mmp_block >= ext4_blocks_count(es)) {
+		ext4_warning(sb, "Invalid MMP block in superblock");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+
+	mmp = (struct mmp_struct *)(bh->b_data);
+
+	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+	/*
+	 * If check_interval in MMP block is larger, use that instead of
+	 * update_interval from the superblock.
+	 */
+	if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
+		mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
+
+	seq = le32_to_cpu(mmp->mmp_seq);
+	if (seq == EXT4_MMP_SEQ_CLEAN)
+		goto skip;
+
+	if (seq == EXT4_MMP_SEQ_FSCK) {
+		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+		goto failed;
+	}
+
+	wait_time = min(mmp_check_interval * 2 + 1,
+			mmp_check_interval + 60);
+
+	/* Print MMP interval if more than 20 secs. */
+	if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+		ext4_warning(sb, "MMP interval %u higher than expected, please"
+			     " wait.\n", wait_time * 2);
+
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+skip:
+	/*
+	 * write a new random sequence number.
+	 */
+	seq = mmp_new_seq();
+	mmp->mmp_seq = cpu_to_le32(seq);
+
+	retval = write_mmp_block(sb, bh);
+	if (retval)
+		goto failed;
+
+	/*
+	 * wait for MMP interval and check mmp_seq.
+	 */
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+	mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+	if (!mmpd_data) {
+		ext4_warning(sb, "not enough memory for mmpd_data");
+		goto failed;
+	}
+	mmpd_data->sb = sb;
+	mmpd_data->bh = bh;
+
+	/*
+	 * Start a kernel thread to update the MMP block periodically.
+	 */
+	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+					     bdevname(bh->b_bdev,
+						      mmp->mmp_bdevname));
+	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+		EXT4_SB(sb)->s_mmp_tsk = NULL;
+		kfree(mmpd_data);
+		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+			     sb->s_id);
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	brelse(bh);
+	return 1;
+}
+
+
diff --git a/kernel/fs/ext4/move_extent.c b/kernel/fs/ext4/move_extent.c
new file mode 100644
index 000000000..370420bfa
--- /dev/null
+++ b/kernel/fs/ext4/move_extent.c
@@ -0,0 +1,684 @@
+/*
+ * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ *            Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "ext4_extents.h"
+
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode:	an inode which is searched
+ * @lblock:	logical block number to find an extent path
+ * @path:	pointer to an extent path pointer (for output)
+ *
+ * ext4_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+		struct ext4_ext_path **ppath)
+{
+	struct ext4_ext_path *path;
+
+	path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	if (path[ext_depth(inode)].p_ext == NULL) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		*ppath = NULL;
+		return -ENODATA;
+	}
+	*ppath = path;
+	return 0;
+}
+
+/**
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ *                                   of i_data_sem
+ *
+ * Acquire write lock of i_data_sem of the two inodes
+ */
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
+{
+	if (first < second) {
+		down_write(&EXT4_I(first)->i_data_sem);
+		down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+	} else {
+		down_write(&EXT4_I(second)->i_data_sem);
+		down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
+
+	}
+}
+
+/**
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ *
+ * @orig_inode:		original inode structure to be released its lock first
+ * @donor_inode:	donor inode structure to be released its lock second
+ * Release write lock of i_data_sem of two inodes (orig and donor).
+ */
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+			      struct inode *donor_inode)
+{
+	up_write(&EXT4_I(orig_inode)->i_data_sem);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * mext_check_coverage - Check that all extents in range has the same type
+ *
+ * @inode:		inode in question
+ * @from:		block offset of inode
+ * @count:		block count to be checked
+ * @unwritten:		extents expected to be unwritten
+ * @err:		pointer to save error value
+ *
+ * Return 1 if all extents in range has expected type, and zero otherwise.
+ */
+static int
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+		    int unwritten, int *err)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ext;
+	int ret = 0;
+	ext4_lblk_t last = from + count;
+	while (from < last) {
+		*err = get_ext_path(inode, from, &path);
+		if (*err)
+			goto out;
+		ext = path[ext_depth(inode)].p_ext;
+		if (unwritten != ext4_ext_is_unwritten(ext))
+			goto out;
+		from += ext4_ext_get_actual_len(ext);
+		ext4_ext_drop_refs(path);
+	}
+	ret = 1;
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	return ret;
+}
+
+/**
+ * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ *
+ * @inode1:	the inode structure
+ * @inode2:	the inode structure
+ * @index1:	page index
+ * @index2:	page index
+ * @page:	result page vector
+ *
+ * Grab two locked pages for inode's by inode order
+ */
+static int
+mext_page_double_lock(struct inode *inode1, struct inode *inode2,
+		      pgoff_t index1, pgoff_t index2, struct page *page[2])
+{
+	struct address_space *mapping[2];
+	unsigned fl = AOP_FLAG_NOFS;
+
+	BUG_ON(!inode1 || !inode2);
+	if (inode1 < inode2) {
+		mapping[0] = inode1->i_mapping;
+		mapping[1] = inode2->i_mapping;
+	} else {
+		pgoff_t tmp = index1;
+		index1 = index2;
+		index2 = tmp;
+		mapping[0] = inode2->i_mapping;
+		mapping[1] = inode1->i_mapping;
+	}
+
+	page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
+	if (!page[0])
+		return -ENOMEM;
+
+	page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
+	if (!page[1]) {
+		unlock_page(page[0]);
+		page_cache_release(page[0]);
+		return -ENOMEM;
+	}
+	/*
+	 * grab_cache_page_write_begin() may not wait on page's writeback if
+	 * BDI not demand that. But it is reasonable to be very conservative
+	 * here and explicitly wait on page's writeback
+	 */
+	wait_on_page_writeback(page[0]);
+	wait_on_page_writeback(page[1]);
+	if (inode1 > inode2) {
+		struct page *tmp;
+		tmp = page[0];
+		page[0] = page[1];
+		page[1] = tmp;
+	}
+	return 0;
+}
+
+/* Force page buffers uptodate w/o dropping page's lock */
+static int
+mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	sector_t block;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocksize, block_start, block_end;
+	int i, err,  nr = 0, partial = 0;
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageWriteback(page));
+
+	if (PageUptodate(page))
+		return 0;
+
+	blocksize = 1 << inode->i_blkbits;
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	head = page_buffers(page);
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	for (bh = head, block_start = 0; bh != head || !block_start;
+	     block++, block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+			continue;
+		}
+		if (buffer_uptodate(bh))
+			continue;
+		if (!buffer_mapped(bh)) {
+			err = ext4_get_block(inode, block, bh, 0);
+			if (err) {
+				SetPageError(page);
+				return err;
+			}
+			if (!buffer_mapped(bh)) {
+				zero_user(page, block_start, blocksize);
+				set_buffer_uptodate(bh);
+				continue;
+			}
+		}
+		BUG_ON(nr >= MAX_BUF_PER_PAGE);
+		arr[nr++] = bh;
+	}
+	/* No io required */
+	if (!nr)
+		goto out;
+
+	for (i = 0; i < nr; i++) {
+		bh = arr[i];
+		if (!bh_uptodate_or_lock(bh)) {
+			err = bh_submit_read(bh);
+			if (err)
+				return err;
+		}
+	}
+out:
+	if (!partial)
+		SetPageUptodate(page);
+	return 0;
+}
+
+/**
+ * move_extent_per_page - Move extent data per page
+ *
+ * @o_filp:			file structure of original file
+ * @donor_inode:		donor inode
+ * @orig_page_offset:		page index on original file
+ * @donor_page_offset:		page index on donor file
+ * @data_offset_in_page:	block index where data swapping starts
+ * @block_len_in_page:		the number of blocks to be swapped
+ * @unwritten:			orig extent is unwritten or not
+ * @err:			pointer to save return value
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling ext4_swap_extents().
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
+ */
+static int
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+		     pgoff_t orig_page_offset, pgoff_t donor_page_offset,
+		     int data_offset_in_page,
+		     int block_len_in_page, int unwritten, int *err)
+{
+	struct inode *orig_inode = file_inode(o_filp);
+	struct page *pagep[2] = {NULL, NULL};
+	handle_t *handle;
+	ext4_lblk_t orig_blk_offset, donor_blk_offset;
+	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+	unsigned int tmp_data_size, data_size, replaced_size;
+	int err2, jblocks, retries = 0;
+	int replaced_count = 0;
+	int from = data_offset_in_page << orig_inode->i_blkbits;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+	struct super_block *sb = orig_inode->i_sb;
+
+	/*
+	 * It needs twice the amount of ordinary journal buffers because
+	 * inode and donor_inode may change each different metadata blocks.
+	 */
+again:
+	*err = 0;
+	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+	if (IS_ERR(handle)) {
+		*err = PTR_ERR(handle);
+		return 0;
+	}
+
+	orig_blk_offset = orig_page_offset * blocks_per_page +
+		data_offset_in_page;
+
+	donor_blk_offset = donor_page_offset * blocks_per_page +
+		data_offset_in_page;
+
+	/* Calculate data_size */
+	if ((orig_blk_offset + block_len_in_page - 1) ==
+	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+		/* Replace the last block */
+		tmp_data_size = orig_inode->i_size & (blocksize - 1);
+		/*
+		 * If data_size equal zero, it shows data_size is multiples of
+		 * blocksize. So we set appropriate value.
+		 */
+		if (tmp_data_size == 0)
+			tmp_data_size = blocksize;
+
+		data_size = tmp_data_size +
+			((block_len_in_page - 1) << orig_inode->i_blkbits);
+	} else
+		data_size = block_len_in_page << orig_inode->i_blkbits;
+
+	replaced_size = data_size;
+
+	*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
+				     donor_page_offset, pagep);
+	if (unlikely(*err < 0))
+		goto stop_journal;
+	/*
+	 * If orig extent was unwritten it can become initialized
+	 * at any time after i_data_sem was dropped, in order to
+	 * serialize with delalloc we have recheck extent while we
+	 * hold page's lock, if it is still the case data copy is not
+	 * necessary, just swap data blocks between orig and donor.
+	 */
+	if (unwritten) {
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
+		/* If any of extents in range became initialized we have to
+		 * fallback to data copying */
+		unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
+						block_len_in_page, 1, err);
+		if (*err)
+			goto drop_data_sem;
+
+		unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
+						 block_len_in_page, 1, err);
+		if (*err)
+			goto drop_data_sem;
+
+		if (!unwritten) {
+			ext4_double_up_write_data_sem(orig_inode, donor_inode);
+			goto data_copy;
+		}
+		if ((page_has_private(pagep[0]) &&
+		     !try_to_release_page(pagep[0], 0)) ||
+		    (page_has_private(pagep[1]) &&
+		     !try_to_release_page(pagep[1], 0))) {
+			*err = -EBUSY;
+			goto drop_data_sem;
+		}
+		replaced_count = ext4_swap_extents(handle, orig_inode,
+						   donor_inode, orig_blk_offset,
+						   donor_blk_offset,
+						   block_len_in_page, 1, err);
+	drop_data_sem:
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
+		goto unlock_pages;
+	}
+data_copy:
+	*err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+	if (*err)
+		goto unlock_pages;
+
+	/* At this point all buffers in range are uptodate, old mapping layout
+	 * is no longer required, try to drop it now. */
+	if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
+	    (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+		*err = -EBUSY;
+		goto unlock_pages;
+	}
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
+	replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
+					       orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 1, err);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
+	if (*err) {
+		if (replaced_count) {
+			block_len_in_page = replaced_count;
+			replaced_size =
+				block_len_in_page << orig_inode->i_blkbits;
+		} else
+			goto unlock_pages;
+	}
+	/* Perform all necessary steps similar write_begin()/write_end()
+	 * but keeping in mind that i_size will not change */
+	*err = __block_write_begin(pagep[0], from, replaced_size,
+				   ext4_get_block);
+	if (!*err)
+		*err = block_commit_write(pagep[0], from, from + replaced_size);
+
+	if (unlikely(*err < 0))
+		goto repair_branches;
+
+	/* Even in case of data=writeback it is reasonable to pin
+	 * inode to transaction, to prevent unexpected data loss */
+	*err = ext4_jbd2_file_inode(handle, orig_inode);
+
+unlock_pages:
+	unlock_page(pagep[0]);
+	page_cache_release(pagep[0]);
+	unlock_page(pagep[1]);
+	page_cache_release(pagep[1]);
+stop_journal:
+	ext4_journal_stop(handle);
+	if (*err == -ENOSPC &&
+	    ext4_should_retry_alloc(sb, &retries))
+		goto again;
+	/* Buffer was busy because probably is pinned to journal transaction,
+	 * force transaction commit may help to free it. */
+	if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
+	    jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
+		goto again;
+	return replaced_count;
+
+repair_branches:
+	/*
+	 * This should never ever happen!
+	 * Extents are swapped already, but we are not able to copy data.
+	 * Try to swap extents to it's original places
+	 */
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
+	replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
+					       orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 0, &err2);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
+	if (replaced_count != block_len_in_page) {
+		EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
+				       "Unable to copy data block,"
+				       " data will be lost.");
+		*err = -EIO;
+	}
+	replaced_count = 0;
+	goto unlock_pages;
+}
+
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode:		original inode
+ * @donor_inode:	donor inode
+ * @orig_start:		logical start offset in block for orig
+ * @donor_start:	logical start offset in block for donor
+ * @len:		the number of blocks to be moved
+ *
+ * Check the arguments of ext4_move_extents() whether the files can be
+ * exchanged with each other.
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+		     struct inode *donor_inode, __u64 orig_start,
+		     __u64 donor_start, __u64 *len)
+{
+	__u64 orig_eof, donor_eof;
+	unsigned int blkbits = orig_inode->i_blkbits;
+	unsigned int blocksize = 1 << blkbits;
+
+	orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
+	donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+
+
+	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+		ext4_debug("ext4 move extent: suid or sgid is set"
+			   " to donor file [ino:orig %lu, donor %lu]\n",
+			   orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+		return -EPERM;
+
+	/* Ext4 move extent does not support swapfile */
+	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+		ext4_debug("ext4 move extent: The argument files should "
+			"not be swapfile [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EBUSY;
+	}
+
+	/* Ext4 move extent supports only extent based file */
+	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
+		ext4_debug("ext4 move extent: orig file is not extents "
+			"based file [ino:orig %lu]\n", orig_inode->i_ino);
+		return -EOPNOTSUPP;
+	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+		ext4_debug("ext4 move extent: donor file is not extents "
+			"based file [ino:donor %lu]\n", donor_inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+		ext4_debug("ext4 move extent: File size is 0 byte\n");
+		return -EINVAL;
+	}
+
+	/* Start offset should be same */
+	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
+	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
+		ext4_debug("ext4 move extent: orig and donor's start "
+			"offset are not alligned [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if ((orig_start >= EXT_MAX_BLOCKS) ||
+	    (donor_start >= EXT_MAX_BLOCKS) ||
+	    (*len > EXT_MAX_BLOCKS) ||
+	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
+	    (orig_start + *len >= EXT_MAX_BLOCKS))  {
+		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+	if (orig_eof < orig_start + *len - 1)
+		*len = orig_eof - orig_start;
+	if (donor_eof < donor_start + *len - 1)
+		*len = donor_eof - donor_start;
+	if (!*len) {
+		ext4_debug("ext4 move extent: len should not be 0 "
+			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+			donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp:		file structure of the original file
+ * @d_filp:		file structure of the donor file
+ * @orig_blk:		start offset in block for orig
+ * @donor_blk:		start offset in block for donor
+ * @len:		the number of blocks to be moved
+ * @moved_len:		moved block length
+ *
+ * This function returns 0 and moved block length is set in moved_len
+ * if succeed, otherwise returns error value.
+ *
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+		  __u64 donor_blk, __u64 len, __u64 *moved_len)
+{
+	struct inode *orig_inode = file_inode(o_filp);
+	struct inode *donor_inode = file_inode(d_filp);
+	struct ext4_ext_path *path = NULL;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+	ext4_lblk_t o_end, o_start = orig_blk;
+	ext4_lblk_t d_start = donor_blk;
+	int ret;
+
+	if (orig_inode->i_sb != donor_inode->i_sb) {
+		ext4_debug("ext4 move extent: The argument files "
+			"should be in same FS [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* orig and donor should be different inodes */
+	if (orig_inode == donor_inode) {
+		ext4_debug("ext4 move extent: The argument files should not "
+			"be same inode [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+		ext4_debug("ext4 move extent: The argument files should be "
+			"regular file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+	/* TODO: This is non obvious task to swap blocks for inodes with full
+	   jornaling enabled */
+	if (ext4_should_journal_data(orig_inode) ||
+	    ext4_should_journal_data(donor_inode)) {
+		return -EINVAL;
+	}
+	/* Protect orig and donor inodes against a truncate */
+	lock_two_nondirectories(orig_inode, donor_inode);
+
+	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(orig_inode);
+	ext4_inode_block_unlocked_dio(donor_inode);
+	inode_dio_wait(orig_inode);
+	inode_dio_wait(donor_inode);
+
+	/* Protect extent tree against block allocations via delalloc */
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
+	/* Check the filesystem environment whether move_extent can be done */
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+				    donor_blk, &len);
+	if (ret)
+		goto out;
+	o_end = o_start + len;
+
+	while (o_start < o_end) {
+		struct ext4_extent *ex;
+		ext4_lblk_t cur_blk, next_blk;
+		pgoff_t orig_page_index, donor_page_index;
+		int offset_in_page;
+		int unwritten, cur_len;
+
+		ret = get_ext_path(orig_inode, o_start, &path);
+		if (ret)
+			goto out;
+		ex = path[path->p_depth].p_ext;
+		next_blk = ext4_ext_next_allocated_block(path);
+		cur_blk = le32_to_cpu(ex->ee_block);
+		cur_len = ext4_ext_get_actual_len(ex);
+		/* Check hole before the start pos */
+		if (cur_blk + cur_len - 1 < o_start) {
+			if (next_blk == EXT_MAX_BLOCKS) {
+				o_start = o_end;
+				ret = -ENODATA;
+				goto out;
+			}
+			d_start += next_blk - o_start;
+			o_start = next_blk;
+			continue;
+		/* Check hole after the start pos */
+		} else if (cur_blk > o_start) {
+			/* Skip hole */
+			d_start += cur_blk - o_start;
+			o_start = cur_blk;
+			/* Extent inside requested range ?*/
+			if (cur_blk >= o_end)
+				goto out;
+		} else { /* in_range(o_start, o_blk, o_len) */
+			cur_len += cur_blk - o_start;
+		}
+		unwritten = ext4_ext_is_unwritten(ex);
+		if (o_end - o_start < cur_len)
+			cur_len = o_end - o_start;
+
+		orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+					       orig_inode->i_blkbits);
+		donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+					       donor_inode->i_blkbits);
+		offset_in_page = o_start % blocks_per_page;
+		if (cur_len > blocks_per_page- offset_in_page)
+			cur_len = blocks_per_page - offset_in_page;
+		/*
+		 * Up semaphore to avoid following problems:
+		 * a. transaction deadlock among ext4_journal_start,
+		 *    ->write_begin via pagefault, and jbd2_journal_commit
+		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
+		 *    in move_extent_per_page
+		 */
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
+		/* Swap original branches with new branches */
+		move_extent_per_page(o_filp, donor_inode,
+				     orig_page_index, donor_page_index,
+				     offset_in_page, cur_len,
+				     unwritten, &ret);
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
+		if (ret < 0)
+			break;
+		o_start += cur_len;
+		d_start += cur_len;
+	}
+	*moved_len = o_start - orig_blk;
+	if (*moved_len > len)
+		*moved_len = len;
+
+out:
+	if (*moved_len) {
+		ext4_discard_preallocations(orig_inode);
+		ext4_discard_preallocations(donor_inode);
+	}
+
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
+	ext4_inode_resume_unlocked_dio(orig_inode);
+	ext4_inode_resume_unlocked_dio(donor_inode);
+	unlock_two_nondirectories(orig_inode, donor_inode);
+
+	return ret;
+}
diff --git a/kernel/fs/ext4/namei.c b/kernel/fs/ext4/namei.c
new file mode 100644
index 000000000..814f3beb4
--- /dev/null
+++ b/kernel/fs/ext4/namei.c
@@ -0,0 +1,3918 @@
+/*
+ *  linux/fs/ext4/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  Directory entry file type support and forward compatibility hooks
+ *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+ *  Hash Tree Directory indexing (c)
+ *	Daniel Phillips, 2001
+ *  Hash Tree Directory indexing porting
+ *	Christopher Li, 2002
+ *  Hash Tree Directory indexing cleanup
+ *	Theodore Ts'o, 2002
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/time.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+#include <trace/events/ext4.h>
+/*
+ * define how far ahead to read directories while searching them.
+ */
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+
+static struct buffer_head *ext4_append(handle_t *handle,
+					struct inode *inode,
+					ext4_lblk_t *block)
+{
+	struct buffer_head *bh;
+	int err;
+
+	if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+		     ((inode->i_size >> 10) >=
+		      EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+		return ERR_PTR(-ENOSPC);
+
+	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+	bh = ext4_bread(handle, inode, *block, 1);
+	if (IS_ERR(bh))
+		return bh;
+	inode->i_size += inode->i_sb->s_blocksize;
+	EXT4_I(inode)->i_disksize = inode->i_size;
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err) {
+		brelse(bh);
+		ext4_std_error(inode->i_sb, err);
+		return ERR_PTR(err);
+	}
+	return bh;
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+			       struct ext4_dir_entry *dirent);
+
+typedef enum {
+	EITHER, INDEX, DIRENT
+} dirblock_type_t;
+
+#define ext4_read_dirblock(inode, block, type) \
+	__ext4_read_dirblock((inode), (block), (type), __LINE__)
+
+static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+					      ext4_lblk_t block,
+					      dirblock_type_t type,
+					      unsigned int line)
+{
+	struct buffer_head *bh;
+	struct ext4_dir_entry *dirent;
+	int is_dx_block = 0;
+
+	bh = ext4_bread(NULL, inode, block, 0);
+	if (IS_ERR(bh)) {
+		__ext4_warning(inode->i_sb, __func__, line,
+			       "error %ld reading directory block "
+			       "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino,
+			       (unsigned long) block);
+
+		return bh;
+	}
+	if (!bh) {
+		ext4_error_inode(inode, __func__, line, block, "Directory hole found");
+		return ERR_PTR(-EIO);
+	}
+	dirent = (struct ext4_dir_entry *) bh->b_data;
+	/* Determine whether or not we have an index block */
+	if (is_dx(inode)) {
+		if (block == 0)
+			is_dx_block = 1;
+		else if (ext4_rec_len_from_disk(dirent->rec_len,
+						inode->i_sb->s_blocksize) ==
+			 inode->i_sb->s_blocksize)
+			is_dx_block = 1;
+	}
+	if (!is_dx_block && type == INDEX) {
+		ext4_error_inode(inode, __func__, line, block,
+		       "directory leaf block found instead of index block");
+		return ERR_PTR(-EIO);
+	}
+	if (!ext4_has_metadata_csum(inode->i_sb) ||
+	    buffer_verified(bh))
+		return bh;
+
+	/*
+	 * An empty leaf block can get mistaken for a index block; for
+	 * this reason, we can only check the index checksum when the
+	 * caller is sure it should be an index block.
+	 */
+	if (is_dx_block && type == INDEX) {
+		if (ext4_dx_csum_verify(inode, dirent))
+			set_buffer_verified(bh);
+		else {
+			ext4_error_inode(inode, __func__, line, block,
+				"Directory index failed checksum");
+			brelse(bh);
+			return ERR_PTR(-EIO);
+		}
+	}
+	if (!is_dx_block) {
+		if (ext4_dirent_csum_verify(inode, dirent))
+			set_buffer_verified(bh);
+		else {
+			ext4_error_inode(inode, __func__, line, block,
+				"Directory block failed checksum");
+			brelse(bh);
+			return ERR_PTR(-EIO);
+		}
+	}
+	return bh;
+}
+
+#ifndef assert
+#define assert(test) J_ASSERT(test)
+#endif
+
+#ifdef DX_DEBUG
+#define dxtrace(command) command
+#else
+#define dxtrace(command)
+#endif
+
+struct fake_dirent
+{
+	__le32 inode;
+	__le16 rec_len;
+	u8 name_len;
+	u8 file_type;
+};
+
+struct dx_countlimit
+{
+	__le16 limit;
+	__le16 count;
+};
+
+struct dx_entry
+{
+	__le32 hash;
+	__le32 block;
+};
+
+/*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero.  Therefore, the
+ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+ */
+
+struct dx_root
+{
+	struct fake_dirent dot;
+	char dot_name[4];
+	struct fake_dirent dotdot;
+	char dotdot_name[4];
+	struct dx_root_info
+	{
+		__le32 reserved_zero;
+		u8 hash_version;
+		u8 info_length; /* 8 */
+		u8 indirect_levels;
+		u8 unused_flags;
+	}
+	info;
+	struct dx_entry	entries[0];
+};
+
+struct dx_node
+{
+	struct fake_dirent fake;
+	struct dx_entry	entries[0];
+};
+
+
+struct dx_frame
+{
+	struct buffer_head *bh;
+	struct dx_entry *entries;
+	struct dx_entry *at;
+};
+
+struct dx_map_entry
+{
+	u32 hash;
+	u16 offs;
+	u16 size;
+};
+
+/*
+ * This goes at the end of each htree block.
+ */
+struct dx_tail {
+	u32 dt_reserved;
+	__le32 dt_checksum;	/* crc32c(uuid+inum+dirblock) */
+};
+
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+static inline unsigned dx_get_hash(struct dx_entry *entry);
+static void dx_set_hash(struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count(struct dx_entry *entries);
+static unsigned dx_get_limit(struct dx_entry *entries);
+static void dx_set_count(struct dx_entry *entries, unsigned value);
+static void dx_set_limit(struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit(struct inode *dir);
+static struct dx_frame *dx_probe(const struct qstr *d_name,
+				 struct inode *dir,
+				 struct dx_hash_info *hinfo,
+				 struct dx_frame *frame);
+static void dx_release(struct dx_frame *frames);
+static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
+		       unsigned blocksize, struct dx_hash_info *hinfo,
+		       struct dx_map_entry map[]);
+static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
+		struct dx_map_entry *offsets, int count, unsigned blocksize);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
+static void dx_insert_block(struct dx_frame *frame,
+					u32 hash, ext4_lblk_t block);
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+				 struct dx_frame *frame,
+				 struct dx_frame *frames,
+				 __u32 *start_hash);
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+		const struct qstr *d_name,
+		struct ext4_dir_entry_2 **res_dir);
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode);
+
+/* checksumming functions */
+void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+			    unsigned int blocksize)
+{
+	memset(t, 0, sizeof(struct ext4_dir_entry_tail));
+	t->det_rec_len = ext4_rec_len_to_disk(
+			sizeof(struct ext4_dir_entry_tail), blocksize);
+	t->det_reserved_ft = EXT4_FT_DIR_CSUM;
+}
+
+/* Walk through a dirent block to find a checksum "dirent" at the tail */
+static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
+						   struct ext4_dir_entry *de)
+{
+	struct ext4_dir_entry_tail *t;
+
+#ifdef PARANOID
+	struct ext4_dir_entry *d, *top;
+
+	d = de;
+	top = (struct ext4_dir_entry *)(((void *)de) +
+		(EXT4_BLOCK_SIZE(inode->i_sb) -
+		sizeof(struct ext4_dir_entry_tail)));
+	while (d < top && d->rec_len)
+		d = (struct ext4_dir_entry *)(((void *)d) +
+		    le16_to_cpu(d->rec_len));
+
+	if (d != top)
+		return NULL;
+
+	t = (struct ext4_dir_entry_tail *)d;
+#else
+	t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
+#endif
+
+	if (t->det_reserved_zero1 ||
+	    le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+	    t->det_reserved_zero2 ||
+	    t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+		return NULL;
+
+	return t;
+}
+
+static __le32 ext4_dirent_csum(struct inode *inode,
+			       struct ext4_dir_entry *dirent, int size)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__u32 csum;
+
+	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+	return cpu_to_le32(csum);
+}
+
+static void warn_no_space_for_csum(struct inode *inode)
+{
+	ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
+		     "checksum.  Please run e2fsck -D.", inode->i_ino);
+}
+
+int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+	struct ext4_dir_entry_tail *t;
+
+	if (!ext4_has_metadata_csum(inode->i_sb))
+		return 1;
+
+	t = get_dirent_tail(inode, dirent);
+	if (!t) {
+		warn_no_space_for_csum(inode);
+		return 0;
+	}
+
+	if (t->det_checksum != ext4_dirent_csum(inode, dirent,
+						(void *)t - (void *)dirent))
+		return 0;
+
+	return 1;
+}
+
+static void ext4_dirent_csum_set(struct inode *inode,
+				 struct ext4_dir_entry *dirent)
+{
+	struct ext4_dir_entry_tail *t;
+
+	if (!ext4_has_metadata_csum(inode->i_sb))
+		return;
+
+	t = get_dirent_tail(inode, dirent);
+	if (!t) {
+		warn_no_space_for_csum(inode);
+		return;
+	}
+
+	t->det_checksum = ext4_dirent_csum(inode, dirent,
+					   (void *)t - (void *)dirent);
+}
+
+int ext4_handle_dirty_dirent_node(handle_t *handle,
+				  struct inode *inode,
+				  struct buffer_head *bh)
+{
+	ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+	return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
+					       struct ext4_dir_entry *dirent,
+					       int *offset)
+{
+	struct ext4_dir_entry *dp;
+	struct dx_root_info *root;
+	int count_offset;
+
+	if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+		count_offset = 8;
+	else if (le16_to_cpu(dirent->rec_len) == 12) {
+		dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
+		if (le16_to_cpu(dp->rec_len) !=
+		    EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+			return NULL;
+		root = (struct dx_root_info *)(((void *)dp + 12));
+		if (root->reserved_zero ||
+		    root->info_length != sizeof(struct dx_root_info))
+			return NULL;
+		count_offset = 32;
+	} else
+		return NULL;
+
+	if (offset)
+		*offset = count_offset;
+	return (struct dx_countlimit *)(((void *)dirent) + count_offset);
+}
+
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+			   int count_offset, int count, struct dx_tail *t)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__u32 csum;
+	__le32 save_csum;
+	int size;
+
+	size = count_offset + (count * sizeof(struct dx_entry));
+	save_csum = t->dt_checksum;
+	t->dt_checksum = 0;
+	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+	csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
+	t->dt_checksum = save_csum;
+
+	return cpu_to_le32(csum);
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+			       struct ext4_dir_entry *dirent)
+{
+	struct dx_countlimit *c;
+	struct dx_tail *t;
+	int count_offset, limit, count;
+
+	if (!ext4_has_metadata_csum(inode->i_sb))
+		return 1;
+
+	c = get_dx_countlimit(inode, dirent, &count_offset);
+	if (!c) {
+		EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
+		return 1;
+	}
+	limit = le16_to_cpu(c->limit);
+	count = le16_to_cpu(c->count);
+	if (count_offset + (limit * sizeof(struct dx_entry)) >
+	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+		warn_no_space_for_csum(inode);
+		return 1;
+	}
+	t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+	if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
+					    count, t))
+		return 0;
+	return 1;
+}
+
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+	struct dx_countlimit *c;
+	struct dx_tail *t;
+	int count_offset, limit, count;
+
+	if (!ext4_has_metadata_csum(inode->i_sb))
+		return;
+
+	c = get_dx_countlimit(inode, dirent, &count_offset);
+	if (!c) {
+		EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
+		return;
+	}
+	limit = le16_to_cpu(c->limit);
+	count = le16_to_cpu(c->count);
+	if (count_offset + (limit * sizeof(struct dx_entry)) >
+	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+		warn_no_space_for_csum(inode);
+		return;
+	}
+	t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+	t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
+}
+
+static inline int ext4_handle_dirty_dx_node(handle_t *handle,
+					    struct inode *inode,
+					    struct buffer_head *bh)
+{
+	ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+	return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
+{
+	return (struct ext4_dir_entry_2 *)((char *)p +
+		ext4_rec_len_from_disk(p->rec_len, blocksize));
+}
+
+/*
+ * Future: use high four bits of block for coalesce-on-delete flags
+ * Mask them off for now.
+ */
+
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+{
+	return le32_to_cpu(entry->block) & 0x00ffffff;
+}
+
+static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+{
+	entry->block = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_hash(struct dx_entry *entry)
+{
+	return le32_to_cpu(entry->hash);
+}
+
+static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
+{
+	entry->hash = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_count(struct dx_entry *entries)
+{
+	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+}
+
+static inline unsigned dx_get_limit(struct dx_entry *entries)
+{
+	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+}
+
+static inline void dx_set_count(struct dx_entry *entries, unsigned value)
+{
+	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+}
+
+static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
+{
+	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+}
+
+static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
+{
+	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
+		EXT4_DIR_REC_LEN(2) - infosize;
+
+	if (ext4_has_metadata_csum(dir->i_sb))
+		entry_space -= sizeof(struct dx_tail);
+	return entry_space / sizeof(struct dx_entry);
+}
+
+static inline unsigned dx_node_limit(struct inode *dir)
+{
+	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+
+	if (ext4_has_metadata_csum(dir->i_sb))
+		entry_space -= sizeof(struct dx_tail);
+	return entry_space / sizeof(struct dx_entry);
+}
+
+/*
+ * Debug
+ */
+#ifdef DX_DEBUG
+static void dx_show_index(char * label, struct dx_entry *entries)
+{
+	int i, n = dx_get_count (entries);
+	printk(KERN_DEBUG "%s index ", label);
+	for (i = 0; i < n; i++) {
+		printk("%x->%lu ", i ? dx_get_hash(entries + i) :
+				0, (unsigned long)dx_get_block(entries + i));
+	}
+	printk("\n");
+}
+
+struct stats
+{
+	unsigned names;
+	unsigned space;
+	unsigned bcount;
+};
+
+static struct stats dx_show_leaf(struct inode *dir,
+				struct dx_hash_info *hinfo,
+				struct ext4_dir_entry_2 *de,
+				int size, int show_names)
+{
+	unsigned names = 0, space = 0;
+	char *base = (char *) de;
+	struct dx_hash_info h = *hinfo;
+
+	printk("names: ");
+	while ((char *) de < base + size)
+	{
+		if (de->inode)
+		{
+			if (show_names)
+			{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+				int len;
+				char *name;
+				struct ext4_str fname_crypto_str
+					= {.name = NULL, .len = 0};
+				struct ext4_fname_crypto_ctx *ctx = NULL;
+				int res;
+
+				name  = de->name;
+				len = de->name_len;
+				ctx = ext4_get_fname_crypto_ctx(dir,
+								EXT4_NAME_LEN);
+				if (IS_ERR(ctx)) {
+					printk(KERN_WARNING "Error acquiring"
+					" crypto ctxt--skipping crypto\n");
+					ctx = NULL;
+				}
+				if (ctx == NULL) {
+					/* Directory is not encrypted */
+					ext4fs_dirhash(de->name,
+						de->name_len, &h);
+					printk("%*.s:(U)%x.%u ", len,
+					       name, h.hash,
+					       (unsigned) ((char *) de
+							   - base));
+				} else {
+					/* Directory is encrypted */
+					res = ext4_fname_crypto_alloc_buffer(
+						ctx, de->name_len,
+						&fname_crypto_str);
+					if (res < 0) {
+						printk(KERN_WARNING "Error "
+							"allocating crypto "
+							"buffer--skipping "
+							"crypto\n");
+						ext4_put_fname_crypto_ctx(&ctx);
+						ctx = NULL;
+					}
+					res = ext4_fname_disk_to_usr(ctx, NULL, de,
+							&fname_crypto_str);
+					if (res < 0) {
+						printk(KERN_WARNING "Error "
+							"converting filename "
+							"from disk to usr"
+							"\n");
+						name = "??";
+						len = 2;
+					} else {
+						name = fname_crypto_str.name;
+						len = fname_crypto_str.len;
+					}
+					ext4fs_dirhash(de->name, de->name_len,
+						       &h);
+					printk("%*.s:(E)%x.%u ", len, name,
+					       h.hash, (unsigned) ((char *) de
+								   - base));
+					ext4_put_fname_crypto_ctx(&ctx);
+					ext4_fname_crypto_free_buffer(
+						&fname_crypto_str);
+				}
+#else
+				int len = de->name_len;
+				char *name = de->name;
+				ext4fs_dirhash(de->name, de->name_len, &h);
+				printk("%*.s:%x.%u ", len, name, h.hash,
+				       (unsigned) ((char *) de - base));
+#endif
+			}
+			space += EXT4_DIR_REC_LEN(de->name_len);
+			names++;
+		}
+		de = ext4_next_entry(de, size);
+	}
+	printk("(%i)\n", names);
+	return (struct stats) { names, space, 1 };
+}
+
+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+			     struct dx_entry *entries, int levels)
+{
+	unsigned blocksize = dir->i_sb->s_blocksize;
+	unsigned count = dx_get_count(entries), names = 0, space = 0, i;
+	unsigned bcount = 0;
+	struct buffer_head *bh;
+	printk("%i indexed blocks...\n", count);
+	for (i = 0; i < count; i++, entries++)
+	{
+		ext4_lblk_t block = dx_get_block(entries);
+		ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
+		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
+		struct stats stats;
+		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
+		bh = ext4_bread(NULL,dir, block, 0);
+		if (!bh || IS_ERR(bh))
+			continue;
+		stats = levels?
+		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
+		   dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *)
+			bh->b_data, blocksize, 0);
+		names += stats.names;
+		space += stats.space;
+		bcount += stats.bcount;
+		brelse(bh);
+	}
+	if (bcount)
+		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
+		       levels ? "" : "   ", names, space/bcount,
+		       (space/bcount)*100/blocksize);
+	return (struct stats) { names, space, bcount};
+}
+#endif /* DX_DEBUG */
+
+/*
+ * Probe for a directory leaf block to search.
+ *
+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * error in the directory index, and the caller should fall back to
+ * searching the directory normally.  The callers of dx_probe **MUST**
+ * check for this error code, and make sure it never gets reflected
+ * back to userspace.
+ */
+static struct dx_frame *
+dx_probe(const struct qstr *d_name, struct inode *dir,
+	 struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+{
+	unsigned count, indirect;
+	struct dx_entry *at, *entries, *p, *q, *m;
+	struct dx_root *root;
+	struct dx_frame *frame = frame_in;
+	struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
+	u32 hash;
+
+	frame->bh = ext4_read_dirblock(dir, 0, INDEX);
+	if (IS_ERR(frame->bh))
+		return (struct dx_frame *) frame->bh;
+
+	root = (struct dx_root *) frame->bh->b_data;
+	if (root->info.hash_version != DX_HASH_TEA &&
+	    root->info.hash_version != DX_HASH_HALF_MD4 &&
+	    root->info.hash_version != DX_HASH_LEGACY) {
+		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
+			     root->info.hash_version);
+		goto fail;
+	}
+	hinfo->hash_version = root->info.hash_version;
+	if (hinfo->hash_version <= DX_HASH_TEA)
+		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	if (d_name) {
+		struct ext4_fname_crypto_ctx *ctx = NULL;
+		int res;
+
+		/* Check if the directory is encrypted */
+		ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
+		if (IS_ERR(ctx)) {
+			ret_err = ERR_PTR(PTR_ERR(ctx));
+			goto fail;
+		}
+		res = ext4_fname_usr_to_hash(ctx, d_name, hinfo);
+		if (res < 0) {
+			ret_err = ERR_PTR(res);
+			goto fail;
+		}
+		ext4_put_fname_crypto_ctx(&ctx);
+	}
+#else
+	if (d_name)
+		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
+#endif
+	hash = hinfo->hash;
+
+	if (root->info.unused_flags & 1) {
+		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
+			     root->info.unused_flags);
+		goto fail;
+	}
+
+	if ((indirect = root->info.indirect_levels) > 1) {
+		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+			     root->info.indirect_levels);
+		goto fail;
+	}
+
+	entries = (struct dx_entry *) (((char *)&root->info) +
+				       root->info.info_length);
+
+	if (dx_get_limit(entries) != dx_root_limit(dir,
+						   root->info.info_length)) {
+		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
+		goto fail;
+	}
+
+	dxtrace(printk("Look up %x", hash));
+	while (1) {
+		count = dx_get_count(entries);
+		if (!count || count > dx_get_limit(entries)) {
+			ext4_warning(dir->i_sb,
+				     "dx entry: no count or count > limit");
+			goto fail;
+		}
+
+		p = entries + 1;
+		q = entries + count - 1;
+		while (p <= q) {
+			m = p + (q - p)/2;
+			dxtrace(printk("."));
+			if (dx_get_hash(m) > hash)
+				q = m - 1;
+			else
+				p = m + 1;
+		}
+
+		if (0) { // linear search cross check
+			unsigned n = count - 1;
+			at = entries;
+			while (n--)
+			{
+				dxtrace(printk(","));
+				if (dx_get_hash(++at) > hash)
+				{
+					at--;
+					break;
+				}
+			}
+			assert (at == p - 1);
+		}
+
+		at = p - 1;
+		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+		frame->entries = entries;
+		frame->at = at;
+		if (!indirect--)
+			return frame;
+		frame++;
+		frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+		if (IS_ERR(frame->bh)) {
+			ret_err = (struct dx_frame *) frame->bh;
+			frame->bh = NULL;
+			goto fail;
+		}
+		entries = ((struct dx_node *) frame->bh->b_data)->entries;
+
+		if (dx_get_limit(entries) != dx_node_limit (dir)) {
+			ext4_warning(dir->i_sb,
+				     "dx entry: limit != node limit");
+			goto fail;
+		}
+	}
+fail:
+	while (frame >= frame_in) {
+		brelse(frame->bh);
+		frame--;
+	}
+
+	if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
+		ext4_warning(dir->i_sb,
+			     "Corrupt dir inode %lu, running e2fsck is "
+			     "recommended.", dir->i_ino);
+	return ret_err;
+}
+
+static void dx_release (struct dx_frame *frames)
+{
+	if (frames[0].bh == NULL)
+		return;
+
+	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+		brelse(frames[1].bh);
+	brelse(frames[0].bh);
+}
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+ * should be necessary.  Whether or not the search is necessary is
+ * controlled by the hash parameter.  If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value.  This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not.  If there is an error reading one of the
+ * index blocks, it will a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+				 struct dx_frame *frame,
+				 struct dx_frame *frames,
+				 __u32 *start_hash)
+{
+	struct dx_frame *p;
+	struct buffer_head *bh;
+	int num_frames = 0;
+	__u32 bhash;
+
+	p = frame;
+	/*
+	 * Find the next leaf page by incrementing the frame pointer.
+	 * If we run out of entries in the interior node, loop around and
+	 * increment pointer in the parent node.  When we break out of
+	 * this loop, num_frames indicates the number of interior
+	 * nodes need to be read.
+	 */
+	while (1) {
+		if (++(p->at) < p->entries + dx_get_count(p->entries))
+			break;
+		if (p == frames)
+			return 0;
+		num_frames++;
+		p--;
+	}
+
+	/*
+	 * If the hash is 1, then continue only if the next page has a
+	 * continuation hash of any value.  This is used for readdir
+	 * handling.  Otherwise, check to see if the hash matches the
+	 * desired contiuation hash.  If it doesn't, return since
+	 * there's no point to read in the successive index pages.
+	 */
+	bhash = dx_get_hash(p->at);
+	if (start_hash)
+		*start_hash = bhash;
+	if ((hash & 1) == 0) {
+		if ((bhash & ~1) != hash)
+			return 0;
+	}
+	/*
+	 * If the hash is HASH_NB_ALWAYS, we always go to the next
+	 * block so no check is necessary
+	 */
+	while (num_frames--) {
+		bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
+		p++;
+		brelse(p->bh);
+		p->bh = bh;
+		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+	}
+	return 1;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory block.  It returns the number directory entries loaded
+ * into the tree.  If there is an error it is returned in err.
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+				  struct inode *dir, ext4_lblk_t block,
+				  struct dx_hash_info *hinfo,
+				  __u32 start_hash, __u32 start_minor_hash)
+{
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *top;
+	int err = 0, count = 0;
+	struct ext4_fname_crypto_ctx *ctx = NULL;
+	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str;
+
+	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
+							(unsigned long)block));
+	bh = ext4_read_dirblock(dir, block, DIRENT);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	top = (struct ext4_dir_entry_2 *) ((char *) de +
+					   dir->i_sb->s_blocksize -
+					   EXT4_DIR_REC_LEN(0));
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	/* Check if the directory is encrypted */
+	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		brelse(bh);
+		return err;
+	}
+	if (ctx != NULL) {
+		err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
+						     &fname_crypto_str);
+		if (err < 0) {
+			ext4_put_fname_crypto_ctx(&ctx);
+			brelse(bh);
+			return err;
+		}
+	}
+#endif
+	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+				bh->b_data, bh->b_size,
+				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+					 + ((char *)de - bh->b_data))) {
+			/* silently ignore the rest of the block */
+			break;
+		}
+		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		if (ctx == NULL) {
+			/* Directory is not encrypted */
+			tmp_str.name = de->name;
+			tmp_str.len = de->name_len;
+			err = ext4_htree_store_dirent(dir_file,
+				   hinfo->hash, hinfo->minor_hash, de,
+				   &tmp_str);
+		} else {
+			/* Directory is encrypted */
+			err = ext4_fname_disk_to_usr(ctx, hinfo, de,
+						     &fname_crypto_str);
+			if (err < 0) {
+				count = err;
+				goto errout;
+			}
+			err = ext4_htree_store_dirent(dir_file,
+				   hinfo->hash, hinfo->minor_hash, de,
+					&fname_crypto_str);
+		}
+		if (err != 0) {
+			count = err;
+			goto errout;
+		}
+		count++;
+	}
+errout:
+	brelse(bh);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	ext4_put_fname_crypto_ctx(&ctx);
+	ext4_fname_crypto_free_buffer(&fname_crypto_str);
+#endif
+	return count;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory.  We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ */
+int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+			 __u32 start_minor_hash, __u32 *next_hash)
+{
+	struct dx_hash_info hinfo;
+	struct ext4_dir_entry_2 *de;
+	struct dx_frame frames[2], *frame;
+	struct inode *dir;
+	ext4_lblk_t block;
+	int count = 0;
+	int ret, err;
+	__u32 hashval;
+	struct ext4_str tmp_str;
+
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+		       start_hash, start_minor_hash));
+	dir = file_inode(dir_file);
+	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
+		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT4_SB(dir->i_sb)->s_hash_unsigned;
+		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+		if (ext4_has_inline_data(dir)) {
+			int has_inline_data = 1;
+			count = htree_inlinedir_to_tree(dir_file, dir, 0,
+							&hinfo, start_hash,
+							start_minor_hash,
+							&has_inline_data);
+			if (has_inline_data) {
+				*next_hash = ~0;
+				return count;
+			}
+		}
+		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+					       start_hash, start_minor_hash);
+		*next_hash = ~0;
+		return count;
+	}
+	hinfo.hash = start_hash;
+	hinfo.minor_hash = 0;
+	frame = dx_probe(NULL, dir, &hinfo, frames);
+	if (IS_ERR(frame))
+		return PTR_ERR(frame);
+
+	/* Add '.' and '..' from the htree header */
+	if (!start_hash && !start_minor_hash) {
+		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+		tmp_str.name = de->name;
+		tmp_str.len = de->name_len;
+		err = ext4_htree_store_dirent(dir_file, 0, 0,
+					      de, &tmp_str);
+		if (err != 0)
+			goto errout;
+		count++;
+	}
+	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
+		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+		de = ext4_next_entry(de, dir->i_sb->s_blocksize);
+		tmp_str.name = de->name;
+		tmp_str.len = de->name_len;
+		err = ext4_htree_store_dirent(dir_file, 2, 0,
+					      de, &tmp_str);
+		if (err != 0)
+			goto errout;
+		count++;
+	}
+
+	while (1) {
+		block = dx_get_block(frame->at);
+		ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+					     start_hash, start_minor_hash);
+		if (ret < 0) {
+			err = ret;
+			goto errout;
+		}
+		count += ret;
+		hashval = ~0;
+		ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
+					    frame, frames, &hashval);
+		*next_hash = hashval;
+		if (ret < 0) {
+			err = ret;
+			goto errout;
+		}
+		/*
+		 * Stop if:  (a) there are no more entries, or
+		 * (b) we have inserted at least one entry and the
+		 * next hash value is not a continuation
+		 */
+		if ((ret == 0) ||
+		    (count && ((hashval & 1) == 0)))
+			break;
+	}
+	dx_release(frames);
+	dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
+		       "next hash: %x\n", count, *next_hash));
+	return count;
+errout:
+	dx_release(frames);
+	return (err);
+}
+
+static inline int search_dirblock(struct buffer_head *bh,
+				  struct inode *dir,
+				  const struct qstr *d_name,
+				  unsigned int offset,
+				  struct ext4_dir_entry_2 **res_dir)
+{
+	return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+			  d_name, offset, res_dir);
+}
+
+/*
+ * Directory block splitting, compacting
+ */
+
+/*
+ * Create map of hash values, offsets, and sizes, stored at end of block.
+ * Returns number of entries mapped.
+ */
+static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
+		       unsigned blocksize, struct dx_hash_info *hinfo,
+		       struct dx_map_entry *map_tail)
+{
+	int count = 0;
+	char *base = (char *) de;
+	struct dx_hash_info h = *hinfo;
+
+	while ((char *) de < base + blocksize) {
+		if (de->name_len && de->inode) {
+			ext4fs_dirhash(de->name, de->name_len, &h);
+			map_tail--;
+			map_tail->hash = h.hash;
+			map_tail->offs = ((char *) de - base)>>2;
+			map_tail->size = le16_to_cpu(de->rec_len);
+			count++;
+			cond_resched();
+		}
+		/* XXX: do we need to check rec_len == 0 case? -Chris */
+		de = ext4_next_entry(de, blocksize);
+	}
+	return count;
+}
+
+/* Sort map by hash value */
+static void dx_sort_map (struct dx_map_entry *map, unsigned count)
+{
+	struct dx_map_entry *p, *q, *top = map + count - 1;
+	int more;
+	/* Combsort until bubble sort doesn't suck */
+	while (count > 2) {
+		count = count*10/13;
+		if (count - 9 < 2) /* 9, 10 -> 11 */
+			count = 11;
+		for (p = top, q = p - count; q >= map; p--, q--)
+			if (p->hash < q->hash)
+				swap(*p, *q);
+	}
+	/* Garden variety bubble sort */
+	do {
+		more = 0;
+		q = top;
+		while (q-- > map) {
+			if (q[1].hash >= q[0].hash)
+				continue;
+			swap(*(q+1), *q);
+			more = 1;
+		}
+	} while(more);
+}
+
+static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
+{
+	struct dx_entry *entries = frame->entries;
+	struct dx_entry *old = frame->at, *new = old + 1;
+	int count = dx_get_count(entries);
+
+	assert(count < dx_get_limit(entries));
+	assert(old < entries + count);
+	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
+	dx_set_hash(new, hash);
+	dx_set_block(new, block);
+	dx_set_count(entries, count + 1);
+}
+
+/*
+ * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
+ *
+ * `len <= EXT4_NAME_LEN' is guaranteed by caller.
+ * `de != NULL' is guaranteed by caller.
+ */
+static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx,
+			     struct ext4_str *fname_crypto_str,
+			     int len, const char * const name,
+			     struct ext4_dir_entry_2 *de)
+{
+	int res;
+
+	if (!de->inode)
+		return 0;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	if (ctx)
+		return ext4_fname_match(ctx, fname_crypto_str, len, name, de);
+#endif
+	if (len != de->name_len)
+		return 0;
+	res = memcmp(name, de->name, len);
+	return (res == 0) ? 1 : 0;
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+int search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
+	       struct inode *dir, const struct qstr *d_name,
+	       unsigned int offset, struct ext4_dir_entry_2 **res_dir)
+{
+	struct ext4_dir_entry_2 * de;
+	char * dlimit;
+	int de_len;
+	const char *name = d_name->name;
+	int namelen = d_name->len;
+	struct ext4_fname_crypto_ctx *ctx = NULL;
+	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+	int res;
+
+	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
+	if (IS_ERR(ctx))
+		return -1;
+
+	de = (struct ext4_dir_entry_2 *)search_buf;
+	dlimit = search_buf + buf_size;
+	while ((char *) de < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+		if ((char *) de + de->name_len <= dlimit) {
+			res = ext4_match(ctx, &fname_crypto_str, namelen,
+					 name, de);
+			if (res < 0) {
+				res = -1;
+				goto return_result;
+			}
+			if (res > 0) {
+				/* found a match - just to be sure, do
+				 * a full check */
+				if (ext4_check_dir_entry(dir, NULL, de, bh,
+						bh->b_data,
+						 bh->b_size, offset)) {
+					res = -1;
+					goto return_result;
+				}
+				*res_dir = de;
+				res = 1;
+				goto return_result;
+			}
+
+		}
+		/* prevent looping on a bad block */
+		de_len = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
+		if (de_len <= 0) {
+			res = -1;
+			goto return_result;
+		}
+		offset += de_len;
+		de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+	}
+
+	res = 0;
+return_result:
+	ext4_put_fname_crypto_ctx(&ctx);
+	ext4_fname_crypto_free_buffer(&fname_crypto_str);
+	return res;
+}
+
+static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+			       struct ext4_dir_entry *de)
+{
+	struct super_block *sb = dir->i_sb;
+
+	if (!is_dx(dir))
+		return 0;
+	if (block == 0)
+		return 1;
+	if (de->inode == 0 &&
+	    ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
+			sb->s_blocksize)
+		return 1;
+	return 0;
+}
+
+/*
+ *	ext4_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the cache buffer in which the entry was found, and the entry
+ * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * entry - you'll have to do that yourself if you want to.
+ *
+ * The returned buffer_head has ->b_count elevated.  The caller is expected
+ * to brelse() it when appropriate.
+ */
+static struct buffer_head * ext4_find_entry (struct inode *dir,
+					const struct qstr *d_name,
+					struct ext4_dir_entry_2 **res_dir,
+					int *inlined)
+{
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
+	ext4_lblk_t start, block, b;
+	const u8 *name = d_name->name;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	ext4_lblk_t  nblocks;
+	int i, namelen;
+
+	*res_dir = NULL;
+	sb = dir->i_sb;
+	namelen = d_name->len;
+	if (namelen > EXT4_NAME_LEN)
+		return NULL;
+
+	if (ext4_has_inline_data(dir)) {
+		int has_inline_data = 1;
+		ret = ext4_find_inline_entry(dir, d_name, res_dir,
+					     &has_inline_data);
+		if (has_inline_data) {
+			if (inlined)
+				*inlined = 1;
+			return ret;
+		}
+	}
+
+	if ((namelen <= 2) && (name[0] == '.') &&
+	    (name[1] == '.' || name[1] == '\0')) {
+		/*
+		 * "." or ".." will only be in the first block
+		 * NFS may look up ".."; "." should be handled by the VFS
+		 */
+		block = start = 0;
+		nblocks = 1;
+		goto restart;
+	}
+	if (is_dx(dir)) {
+		bh = ext4_dx_find_entry(dir, d_name, res_dir);
+		/*
+		 * On success, or if the error was file not found,
+		 * return.  Otherwise, fall back to doing a search the
+		 * old fashioned way.
+		 */
+		if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR)
+			return bh;
+		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+			       "falling back\n"));
+	}
+	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+	start = EXT4_I(dir)->i_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+				bh = ext4_getblk(NULL, dir, b++, 0);
+				if (unlikely(IS_ERR(bh))) {
+					if (ra_max == 0)
+						return bh;
+					break;
+				}
+				bh_use[ra_max] = bh;
+				if (bh)
+					ll_rw_block(READ | REQ_META | REQ_PRIO,
+						    1, &bh);
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* read error, skip block & hope for the best */
+			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+					 (unsigned long) block);
+			brelse(bh);
+			goto next;
+		}
+		if (!buffer_verified(bh) &&
+		    !is_dx_internal_node(dir, block,
+					 (struct ext4_dir_entry *)bh->b_data) &&
+		    !ext4_dirent_csum_verify(dir,
+				(struct ext4_dir_entry *)bh->b_data)) {
+			EXT4_ERROR_INODE(dir, "checksumming directory "
+					 "block %lu", (unsigned long)block);
+			brelse(bh);
+			goto next;
+		}
+		set_buffer_verified(bh);
+		i = search_dirblock(bh, dir, d_name,
+			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
+		if (i == 1) {
+			EXT4_I(dir)->i_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse(bh_use[ra_ptr]);
+	return ret;
+}
+
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
+		       struct ext4_dir_entry_2 **res_dir)
+{
+	struct super_block * sb = dir->i_sb;
+	struct dx_hash_info	hinfo;
+	struct dx_frame frames[2], *frame;
+	struct buffer_head *bh;
+	ext4_lblk_t block;
+	int retval;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	*res_dir = NULL;
+#endif
+	frame = dx_probe(d_name, dir, &hinfo, frames);
+	if (IS_ERR(frame))
+		return (struct buffer_head *) frame;
+	do {
+		block = dx_get_block(frame->at);
+		bh = ext4_read_dirblock(dir, block, DIRENT);
+		if (IS_ERR(bh))
+			goto errout;
+
+		retval = search_dirblock(bh, dir, d_name,
+					 block << EXT4_BLOCK_SIZE_BITS(sb),
+					 res_dir);
+		if (retval == 1)
+			goto success;
+		brelse(bh);
+		if (retval == -1) {
+			bh = ERR_PTR(ERR_BAD_DX_DIR);
+			goto errout;
+		}
+
+		/* Check to see if we should continue to search */
+		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
+					       frames, NULL);
+		if (retval < 0) {
+			ext4_warning(sb,
+			     "error %d reading index page in directory #%lu",
+			     retval, dir->i_ino);
+			bh = ERR_PTR(retval);
+			goto errout;
+		}
+	} while (retval == 1);
+
+	bh = NULL;
+errout:
+	dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
+success:
+	dx_release(frames);
+	return bh;
+}
+
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode;
+	struct ext4_dir_entry_2 *de;
+	struct buffer_head *bh;
+
+	if (dentry->d_name.len > EXT4_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+	if (IS_ERR(bh))
+		return (struct dentry *) bh;
+	inode = NULL;
+	if (bh) {
+		__u32 ino = le32_to_cpu(de->inode);
+		brelse(bh);
+		if (!ext4_valid_inum(dir->i_sb, ino)) {
+			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
+			return ERR_PTR(-EIO);
+		}
+		if (unlikely(ino == dir->i_ino)) {
+			EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
+					 dentry);
+			return ERR_PTR(-EIO);
+		}
+		inode = ext4_iget_normal(dir->i_sb, ino);
+		if (inode == ERR_PTR(-ESTALE)) {
+			EXT4_ERROR_INODE(dir,
+					 "deleted inode referenced: %u",
+					 ino);
+			return ERR_PTR(-EIO);
+		}
+		if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
+		    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		     S_ISLNK(inode->i_mode)) &&
+		    !ext4_is_child_context_consistent_with_parent(dir,
+								  inode)) {
+			iput(inode);
+			ext4_warning(inode->i_sb,
+				     "Inconsistent encryption contexts: %lu/%lu\n",
+				     (unsigned long) dir->i_ino,
+				     (unsigned long) inode->i_ino);
+			return ERR_PTR(-EPERM);
+		}
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+
+struct dentry *ext4_get_parent(struct dentry *child)
+{
+	__u32 ino;
+	static const struct qstr dotdot = QSTR_INIT("..", 2);
+	struct ext4_dir_entry_2 * de;
+	struct buffer_head *bh;
+
+	bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
+	if (IS_ERR(bh))
+		return (struct dentry *) bh;
+	if (!bh)
+		return ERR_PTR(-ENOENT);
+	ino = le32_to_cpu(de->inode);
+	brelse(bh);
+
+	if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) {
+		EXT4_ERROR_INODE(d_inode(child),
+				 "bad parent inode number: %u", ino);
+		return ERR_PTR(-EIO);
+	}
+
+	return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino));
+}
+
+/*
+ * Move count entries from end of map between two memory locations.
+ * Returns pointer to last entry moved.
+ */
+static struct ext4_dir_entry_2 *
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+		unsigned blocksize)
+{
+	unsigned rec_len = 0;
+
+	while (count--) {
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+						(from + (map->offs<<2));
+		rec_len = EXT4_DIR_REC_LEN(de->name_len);
+		memcpy (to, de, rec_len);
+		((struct ext4_dir_entry_2 *) to)->rec_len =
+				ext4_rec_len_to_disk(rec_len, blocksize);
+		de->inode = 0;
+		map++;
+		to += rec_len;
+	}
+	return (struct ext4_dir_entry_2 *) (to - rec_len);
+}
+
+/*
+ * Compact each dir entry in the range to the minimal rec_len.
+ * Returns pointer to last entry in range.
+ */
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+{
+	struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
+	unsigned rec_len = 0;
+
+	prev = to = de;
+	while ((char*)de < base + blocksize) {
+		next = ext4_next_entry(de, blocksize);
+		if (de->inode && de->name_len) {
+			rec_len = EXT4_DIR_REC_LEN(de->name_len);
+			if (de > to)
+				memmove(to, de, rec_len);
+			to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+			prev = to;
+			to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
+		}
+		de = next;
+	}
+	return prev;
+}
+
+/*
+ * Split a full leaf block to make room for a new dir entry.
+ * Allocate a new block, and move entries so that they are approx. equally full.
+ * Returns pointer to de in block into which the new entry will be inserted.
+ */
+static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+			struct buffer_head **bh,struct dx_frame *frame,
+			struct dx_hash_info *hinfo)
+{
+	unsigned blocksize = dir->i_sb->s_blocksize;
+	unsigned count, continued;
+	struct buffer_head *bh2;
+	ext4_lblk_t newblock;
+	u32 hash2;
+	struct dx_map_entry *map;
+	char *data1 = (*bh)->b_data, *data2;
+	unsigned split, move, size;
+	struct ext4_dir_entry_2 *de = NULL, *de2;
+	struct ext4_dir_entry_tail *t;
+	int	csum_size = 0;
+	int	err = 0, i;
+
+	if (ext4_has_metadata_csum(dir->i_sb))
+		csum_size = sizeof(struct ext4_dir_entry_tail);
+
+	bh2 = ext4_append(handle, dir, &newblock);
+	if (IS_ERR(bh2)) {
+		brelse(*bh);
+		*bh = NULL;
+		return (struct ext4_dir_entry_2 *) bh2;
+	}
+
+	BUFFER_TRACE(*bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, *bh);
+	if (err)
+		goto journal_error;
+
+	BUFFER_TRACE(frame->bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, frame->bh);
+	if (err)
+		goto journal_error;
+
+	data2 = bh2->b_data;
+
+	/* create map in the end of data2 block */
+	map = (struct dx_map_entry *) (data2 + blocksize);
+	count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
+			     blocksize, hinfo, map);
+	map -= count;
+	dx_sort_map(map, count);
+	/* Split the existing block in the middle, size-wise */
+	size = 0;
+	move = 0;
+	for (i = count-1; i >= 0; i--) {
+		/* is more than half of this entry in 2nd half of the block? */
+		if (size + map[i].size/2 > blocksize/2)
+			break;
+		size += map[i].size;
+		move++;
+	}
+	/* map index at which we will split */
+	split = count - move;
+	hash2 = map[split].hash;
+	continued = hash2 == map[split - 1].hash;
+	dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
+			(unsigned long)dx_get_block(frame->at),
+					hash2, split, count-split));
+
+	/* Fancy dance to stay within two buffers */
+	de2 = dx_move_dirents(data1, data2, map + split, count - split,
+			      blocksize);
+	de = dx_pack_dirents(data1, blocksize);
+	de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+					   (char *) de,
+					   blocksize);
+	de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+					    (char *) de2,
+					    blocksize);
+	if (csum_size) {
+		t = EXT4_DIRENT_TAIL(data2, blocksize);
+		initialize_dirent_tail(t, blocksize);
+
+		t = EXT4_DIRENT_TAIL(data1, blocksize);
+		initialize_dirent_tail(t, blocksize);
+	}
+
+	dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
+			blocksize, 1));
+	dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
+			blocksize, 1));
+
+	/* Which block gets the new entry? */
+	if (hinfo->hash >= hash2) {
+		swap(*bh, bh2);
+		de = de2;
+	}
+	dx_insert_block(frame, hash2 + continued, newblock);
+	err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
+	if (err)
+		goto journal_error;
+	err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+	if (err)
+		goto journal_error;
+	brelse(bh2);
+	dxtrace(dx_show_index("frame", frame->entries));
+	return de;
+
+journal_error:
+	brelse(*bh);
+	brelse(bh2);
+	*bh = NULL;
+	ext4_std_error(dir->i_sb, err);
+	return ERR_PTR(err);
+}
+
+int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+		      struct buffer_head *bh,
+		      void *buf, int buf_size,
+		      const char *name, int namelen,
+		      struct ext4_dir_entry_2 **dest_de)
+{
+	struct ext4_dir_entry_2 *de;
+	unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
+	int nlen, rlen;
+	unsigned int offset = 0;
+	char *top;
+	struct ext4_fname_crypto_ctx *ctx = NULL;
+	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+	int res;
+
+	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
+	if (IS_ERR(ctx))
+		return -1;
+
+	if (ctx != NULL) {
+		/* Calculate record length needed to store the entry */
+		res = ext4_fname_crypto_namelen_on_disk(ctx, namelen);
+		if (res < 0) {
+			ext4_put_fname_crypto_ctx(&ctx);
+			return res;
+		}
+		reclen = EXT4_DIR_REC_LEN(res);
+	}
+
+	de = (struct ext4_dir_entry_2 *)buf;
+	top = buf + buf_size - reclen;
+	while ((char *) de <= top) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+					 buf, buf_size, offset)) {
+			res = -EIO;
+			goto return_result;
+		}
+		/* Provide crypto context and crypto buffer to ext4 match */
+		res = ext4_match(ctx, &fname_crypto_str, namelen, name, de);
+		if (res < 0)
+			goto return_result;
+		if (res > 0) {
+			res = -EEXIST;
+			goto return_result;
+		}
+		nlen = EXT4_DIR_REC_LEN(de->name_len);
+		rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+		if ((de->inode ? rlen - nlen : rlen) >= reclen)
+			break;
+		de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+		offset += rlen;
+	}
+
+	if ((char *) de > top)
+		res = -ENOSPC;
+	else {
+		*dest_de = de;
+		res = 0;
+	}
+return_result:
+	ext4_put_fname_crypto_ctx(&ctx);
+	ext4_fname_crypto_free_buffer(&fname_crypto_str);
+	return res;
+}
+
+int ext4_insert_dentry(struct inode *dir,
+		       struct inode *inode,
+		       struct ext4_dir_entry_2 *de,
+		       int buf_size,
+		       const struct qstr *iname,
+		       const char *name, int namelen)
+{
+
+	int nlen, rlen;
+	struct ext4_fname_crypto_ctx *ctx = NULL;
+	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+	struct ext4_str tmp_str;
+	int res;
+
+	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
+	if (IS_ERR(ctx))
+		return -EIO;
+	/* By default, the input name would be written to the disk */
+	tmp_str.name = (unsigned char *)name;
+	tmp_str.len = namelen;
+	if (ctx != NULL) {
+		/* Directory is encrypted */
+		res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
+						     &fname_crypto_str);
+		if (res < 0) {
+			ext4_put_fname_crypto_ctx(&ctx);
+			return -ENOMEM;
+		}
+		res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str);
+		if (res < 0) {
+			ext4_put_fname_crypto_ctx(&ctx);
+			ext4_fname_crypto_free_buffer(&fname_crypto_str);
+			return res;
+		}
+		tmp_str.name = fname_crypto_str.name;
+		tmp_str.len = fname_crypto_str.len;
+	}
+
+	nlen = EXT4_DIR_REC_LEN(de->name_len);
+	rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+	if (de->inode) {
+		struct ext4_dir_entry_2 *de1 =
+			(struct ext4_dir_entry_2 *)((char *)de + nlen);
+		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
+		de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
+		de = de1;
+	}
+	de->file_type = EXT4_FT_UNKNOWN;
+	de->inode = cpu_to_le32(inode->i_ino);
+	ext4_set_de_type(inode->i_sb, de, inode->i_mode);
+	de->name_len = tmp_str.len;
+
+	memcpy(de->name, tmp_str.name, tmp_str.len);
+	ext4_put_fname_crypto_ctx(&ctx);
+	ext4_fname_crypto_free_buffer(&fname_crypto_str);
+	return 0;
+}
+
+/*
+ * Add a new entry into a directory (leaf) block.  If de is non-NULL,
+ * it points to a directory entry which is guaranteed to be large
+ * enough for new directory entry.  If de is NULL, then
+ * add_dirent_to_buf will attempt search the directory block for
+ * space.  It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+ */
+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode, struct ext4_dir_entry_2 *de,
+			     struct buffer_head *bh)
+{
+	struct inode	*dir = d_inode(dentry->d_parent);
+	const char	*name = dentry->d_name.name;
+	int		namelen = dentry->d_name.len;
+	unsigned int	blocksize = dir->i_sb->s_blocksize;
+	int		csum_size = 0;
+	int		err;
+
+	if (ext4_has_metadata_csum(inode->i_sb))
+		csum_size = sizeof(struct ext4_dir_entry_tail);
+
+	if (!de) {
+		err = ext4_find_dest_de(dir, inode,
+					bh, bh->b_data, blocksize - csum_size,
+					name, namelen, &de);
+		if (err)
+			return err;
+	}
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err) {
+		ext4_std_error(dir->i_sb, err);
+		return err;
+	}
+
+	/* By now the buffer is marked for journaling. Due to crypto operations,
+	 * the following function call may fail */
+	err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name,
+				 name, namelen);
+	if (err < 0)
+		return err;
+
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 *
+	 * XXX similarly, too many callers depend on
+	 * ext4_new_inode() setting the times, but error
+	 * recovery deletes the inode, so the worst that can
+	 * happen is that the times are slightly out of date
+	 * and/or different from the directory change time.
+	 */
+	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+	ext4_update_dx_flag(dir);
+	dir->i_version++;
+	ext4_mark_inode_dirty(handle, dir);
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+	if (err)
+		ext4_std_error(dir->i_sb, err);
+	return 0;
+}
+
+/*
+ * This converts a one block unindexed directory to a 3 block indexed
+ * directory, and adds the dentry to the indexed directory.
+ */
+static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+			    struct inode *inode, struct buffer_head *bh)
+{
+	struct inode	*dir = d_inode(dentry->d_parent);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	struct ext4_fname_crypto_ctx *ctx = NULL;
+	int res;
+#else
+	const char	*name = dentry->d_name.name;
+	int		namelen = dentry->d_name.len;
+#endif
+	struct buffer_head *bh2;
+	struct dx_root	*root;
+	struct dx_frame	frames[2], *frame;
+	struct dx_entry *entries;
+	struct ext4_dir_entry_2	*de, *de2;
+	struct ext4_dir_entry_tail *t;
+	char		*data1, *top;
+	unsigned	len;
+	int		retval;
+	unsigned	blocksize;
+	struct dx_hash_info hinfo;
+	ext4_lblk_t  block;
+	struct fake_dirent *fde;
+	int csum_size = 0;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+#endif
+
+	if (ext4_has_metadata_csum(inode->i_sb))
+		csum_size = sizeof(struct ext4_dir_entry_tail);
+
+	blocksize =  dir->i_sb->s_blocksize;
+	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
+	BUFFER_TRACE(bh, "get_write_access");
+	retval = ext4_journal_get_write_access(handle, bh);
+	if (retval) {
+		ext4_std_error(dir->i_sb, retval);
+		brelse(bh);
+		return retval;
+	}
+	root = (struct dx_root *) bh->b_data;
+
+	/* The 0th block becomes the root, move the dirents out */
+	fde = &root->dotdot;
+	de = (struct ext4_dir_entry_2 *)((char *)fde +
+		ext4_rec_len_from_disk(fde->rec_len, blocksize));
+	if ((char *) de >= (((char *) root) + blocksize)) {
+		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
+		brelse(bh);
+		return -EIO;
+	}
+	len = ((char *) root) + (blocksize - csum_size) - (char *) de;
+
+	/* Allocate new block for the 0th block's dirents */
+	bh2 = ext4_append(handle, dir, &block);
+	if (IS_ERR(bh2)) {
+		brelse(bh);
+		return PTR_ERR(bh2);
+	}
+	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
+	data1 = bh2->b_data;
+
+	memcpy (data1, de, len);
+	de = (struct ext4_dir_entry_2 *) data1;
+	top = data1 + len;
+	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+		de = de2;
+	de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+					   (char *) de,
+					   blocksize);
+
+	if (csum_size) {
+		t = EXT4_DIRENT_TAIL(data1, blocksize);
+		initialize_dirent_tail(t, blocksize);
+	}
+
+	/* Initialize the root; the dot dirents already exist */
+	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
+	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+					   blocksize);
+	memset (&root->info, 0, sizeof(root->info));
+	root->info.info_length = sizeof(root->info);
+	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+	entries = root->entries;
+	dx_set_block(entries, 1);
+	dx_set_count(entries, 1);
+	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
+
+	/* Initialize as for dx_probe */
+	hinfo.hash_version = root->info.hash_version;
+	if (hinfo.hash_version <= DX_HASH_TEA)
+		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo);
+	if (res < 0) {
+		ext4_put_fname_crypto_ctx(&ctx);
+		ext4_mark_inode_dirty(handle, dir);
+		brelse(bh);
+		return res;
+	}
+	ext4_put_fname_crypto_ctx(&ctx);
+#else
+	ext4fs_dirhash(name, namelen, &hinfo);
+#endif
+	memset(frames, 0, sizeof(frames));
+	frame = frames;
+	frame->entries = entries;
+	frame->at = entries;
+	frame->bh = bh;
+	bh = bh2;
+
+	retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+	if (retval)
+		goto out_frames;	
+	retval = ext4_handle_dirty_dirent_node(handle, dir, bh);
+	if (retval)
+		goto out_frames;	
+
+	de = do_split(handle,dir, &bh, frame, &hinfo);
+	if (IS_ERR(de)) {
+		retval = PTR_ERR(de);
+		goto out_frames;
+	}
+	dx_release(frames);
+
+	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	brelse(bh);
+	return retval;
+out_frames:
+	/*
+	 * Even if the block split failed, we have to properly write
+	 * out all the changes we did so far. Otherwise we can end up
+	 * with corrupted filesystem.
+	 */
+	ext4_mark_inode_dirty(handle, dir);
+	dx_release(frames);
+	return retval;
+}
+
+/*
+ *	ext4_add_entry()
+ *
+ * adds a file entry to the specified directory, using the same
+ * semantics as ext4_find_entry(). It returns NULL if it failed.
+ *
+ * NOTE!! The inode part of 'de' is left at 0 - which means you
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+			  struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct buffer_head *bh = NULL;
+	struct ext4_dir_entry_2 *de;
+	struct ext4_dir_entry_tail *t;
+	struct super_block *sb;
+	int	retval;
+	int	dx_fallback=0;
+	unsigned blocksize;
+	ext4_lblk_t block, blocks;
+	int	csum_size = 0;
+
+	if (ext4_has_metadata_csum(inode->i_sb))
+		csum_size = sizeof(struct ext4_dir_entry_tail);
+
+	sb = dir->i_sb;
+	blocksize = sb->s_blocksize;
+	if (!dentry->d_name.len)
+		return -EINVAL;
+
+	if (ext4_has_inline_data(dir)) {
+		retval = ext4_try_add_inline_entry(handle, dentry, inode);
+		if (retval < 0)
+			return retval;
+		if (retval == 1) {
+			retval = 0;
+			goto out;
+		}
+	}
+
+	if (is_dx(dir)) {
+		retval = ext4_dx_add_entry(handle, dentry, inode);
+		if (!retval || (retval != ERR_BAD_DX_DIR))
+			goto out;
+		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
+		dx_fallback++;
+		ext4_mark_inode_dirty(handle, dir);
+	}
+	blocks = dir->i_size >> sb->s_blocksize_bits;
+	for (block = 0; block < blocks; block++) {
+		bh = ext4_read_dirblock(dir, block, DIRENT);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
+
+		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+		if (retval != -ENOSPC)
+			goto out;
+
+		if (blocks == 1 && !dx_fallback &&
+		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+			retval = make_indexed_dir(handle, dentry, inode, bh);
+			bh = NULL; /* make_indexed_dir releases bh */
+			goto out;
+		}
+		brelse(bh);
+	}
+	bh = ext4_append(handle, dir, &block);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	de->inode = 0;
+	de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
+
+	if (csum_size) {
+		t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
+		initialize_dirent_tail(t, blocksize);
+	}
+
+	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+out:
+	brelse(bh);
+	if (retval == 0)
+		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+	return retval;
+}
+
+/*
+ * Returns 0 for success, or a negative error value
+ */
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode)
+{
+	struct dx_frame frames[2], *frame;
+	struct dx_entry *entries, *at;
+	struct dx_hash_info hinfo;
+	struct buffer_head *bh;
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct super_block *sb = dir->i_sb;
+	struct ext4_dir_entry_2 *de;
+	int err;
+
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames);
+	if (IS_ERR(frame))
+		return PTR_ERR(frame);
+	entries = frame->entries;
+	at = frame->at;
+	bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+	if (IS_ERR(bh)) {
+		err = PTR_ERR(bh);
+		bh = NULL;
+		goto cleanup;
+	}
+
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err)
+		goto journal_error;
+
+	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+	if (err != -ENOSPC)
+		goto cleanup;
+
+	/* Block full, should compress but for now just split */
+	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+		       dx_get_count(entries), dx_get_limit(entries)));
+	/* Need to split index? */
+	if (dx_get_count(entries) == dx_get_limit(entries)) {
+		ext4_lblk_t newblock;
+		unsigned icount = dx_get_count(entries);
+		int levels = frame - frames;
+		struct dx_entry *entries2;
+		struct dx_node *node2;
+		struct buffer_head *bh2;
+
+		if (levels && (dx_get_count(frames->entries) ==
+			       dx_get_limit(frames->entries))) {
+			ext4_warning(sb, "Directory index full!");
+			err = -ENOSPC;
+			goto cleanup;
+		}
+		bh2 = ext4_append(handle, dir, &newblock);
+		if (IS_ERR(bh2)) {
+			err = PTR_ERR(bh2);
+			goto cleanup;
+		}
+		node2 = (struct dx_node *)(bh2->b_data);
+		entries2 = node2->entries;
+		memset(&node2->fake, 0, sizeof(struct fake_dirent));
+		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+							   sb->s_blocksize);
+		BUFFER_TRACE(frame->bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, frame->bh);
+		if (err)
+			goto journal_error;
+		if (levels) {
+			unsigned icount1 = icount/2, icount2 = icount - icount1;
+			unsigned hash2 = dx_get_hash(entries + icount1);
+			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+				       icount1, icount2));
+
+			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+			err = ext4_journal_get_write_access(handle,
+							     frames[0].bh);
+			if (err)
+				goto journal_error;
+
+			memcpy((char *) entries2, (char *) (entries + icount1),
+			       icount2 * sizeof(struct dx_entry));
+			dx_set_count(entries, icount1);
+			dx_set_count(entries2, icount2);
+			dx_set_limit(entries2, dx_node_limit(dir));
+
+			/* Which index block gets the new entry? */
+			if (at - entries >= icount1) {
+				frame->at = at = at - entries - icount1 + entries2;
+				frame->entries = entries = entries2;
+				swap(frame->bh, bh2);
+			}
+			dx_insert_block(frames + 0, hash2, newblock);
+			dxtrace(dx_show_index("node", frames[1].entries));
+			dxtrace(dx_show_index("node",
+			       ((struct dx_node *) bh2->b_data)->entries));
+			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+			if (err)
+				goto journal_error;
+			brelse (bh2);
+		} else {
+			dxtrace(printk(KERN_DEBUG
+				       "Creating second level index...\n"));
+			memcpy((char *) entries2, (char *) entries,
+			       icount * sizeof(struct dx_entry));
+			dx_set_limit(entries2, dx_node_limit(dir));
+
+			/* Set up root */
+			dx_set_count(entries, 1);
+			dx_set_block(entries + 0, newblock);
+			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+
+			/* Add new access path frame */
+			frame = frames + 1;
+			frame->at = at = at - entries + entries2;
+			frame->entries = entries = entries2;
+			frame->bh = bh2;
+			err = ext4_journal_get_write_access(handle,
+							     frame->bh);
+			if (err)
+				goto journal_error;
+		}
+		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+		if (err) {
+			ext4_std_error(inode->i_sb, err);
+			goto cleanup;
+		}
+	}
+	de = do_split(handle, dir, &bh, frame, &hinfo);
+	if (IS_ERR(de)) {
+		err = PTR_ERR(de);
+		goto cleanup;
+	}
+	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	goto cleanup;
+
+journal_error:
+	ext4_std_error(dir->i_sb, err);
+cleanup:
+	brelse(bh);
+	dx_release(frames);
+	return err;
+}
+
+/*
+ * ext4_generic_delete_entry deletes a directory entry by merging it
+ * with the previous entry
+ */
+int ext4_generic_delete_entry(handle_t *handle,
+			      struct inode *dir,
+			      struct ext4_dir_entry_2 *de_del,
+			      struct buffer_head *bh,
+			      void *entry_buf,
+			      int buf_size,
+			      int csum_size)
+{
+	struct ext4_dir_entry_2 *de, *pde;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
+	int i;
+
+	i = 0;
+	pde = NULL;
+	de = (struct ext4_dir_entry_2 *)entry_buf;
+	while (i < buf_size - csum_size) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+					 bh->b_data, bh->b_size, i))
+			return -EIO;
+		if (de == de_del)  {
+			if (pde)
+				pde->rec_len = ext4_rec_len_to_disk(
+					ext4_rec_len_from_disk(pde->rec_len,
+							       blocksize) +
+					ext4_rec_len_from_disk(de->rec_len,
+							       blocksize),
+					blocksize);
+			else
+				de->inode = 0;
+			dir->i_version++;
+			return 0;
+		}
+		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
+		pde = de;
+		de = ext4_next_entry(de, blocksize);
+	}
+	return -ENOENT;
+}
+
+static int ext4_delete_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh)
+{
+	int err, csum_size = 0;
+
+	if (ext4_has_inline_data(dir)) {
+		int has_inline_data = 1;
+		err = ext4_delete_inline_entry(handle, dir, de_del, bh,
+					       &has_inline_data);
+		if (has_inline_data)
+			return err;
+	}
+
+	if (ext4_has_metadata_csum(dir->i_sb))
+		csum_size = sizeof(struct ext4_dir_entry_tail);
+
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (unlikely(err))
+		goto out;
+
+	err = ext4_generic_delete_entry(handle, dir, de_del,
+					bh, bh->b_data,
+					dir->i_sb->s_blocksize, csum_size);
+	if (err)
+		goto out;
+
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+	if (unlikely(err))
+		goto out;
+
+	return 0;
+out:
+	if (err != -ENOENT)
+		ext4_std_error(dir->i_sb, err);
+	return err;
+}
+
+/*
+ * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
+ * since this indicates that nlinks count was previously 1.
+ */
+static void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+	inc_nlink(inode);
+	if (is_dx(inode) && inode->i_nlink > 1) {
+		/* limit is 16-bit i_links_count */
+		if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
+			set_nlink(inode, 1);
+			EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
+					      EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+		}
+	}
+}
+
+/*
+ * If a directory had nlink == 1, then we should let it be 1. This indicates
+ * directory has >EXT4_LINK_MAX subdirs.
+ */
+static void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+		drop_nlink(inode);
+}
+
+
+static int ext4_add_nondir(handle_t *handle,
+		struct dentry *dentry, struct inode *inode)
+{
+	int err = ext4_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext4_mark_inode_dirty(handle, inode);
+		unlock_new_inode(inode);
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	drop_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		       bool excl)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, credits, retries = 0;
+
+	dquot_initialize(dir);
+
+	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
+retry:
+	inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+					    NULL, EXT4_HT_DIR, credits);
+	handle = ext4_journal_current_handle();
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+		err = 0;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+		if (!err && (ext4_encrypted_inode(dir) ||
+			     DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) {
+			err = ext4_inherit_context(dir, inode);
+			if (err) {
+				clear_nlink(inode);
+				unlock_new_inode(inode);
+				iput(inode);
+			}
+		}
+#endif
+		if (!err)
+			err = ext4_add_nondir(handle, dentry, inode);
+		if (!err && IS_DIRSYNC(dir))
+			ext4_handle_sync(handle);
+	}
+	if (handle)
+		ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
+		      umode_t mode, dev_t rdev)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, credits, retries = 0;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	dquot_initialize(dir);
+
+	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
+retry:
+	inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+					    NULL, EXT4_HT_DIR, credits);
+	handle = ext4_journal_current_handle();
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+		inode->i_op = &ext4_special_inode_operations;
+		err = ext4_add_nondir(handle, dentry, inode);
+		if (!err && IS_DIRSYNC(dir))
+			ext4_handle_sync(handle);
+	}
+	if (handle)
+		ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	inode = ext4_new_inode_start_handle(dir, mode,
+					    NULL, 0, NULL,
+					    EXT4_HT_DIR,
+			EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  4 + EXT4_XATTR_TRANS_BLOCKS);
+	handle = ext4_journal_current_handle();
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+		d_tmpfile(dentry, inode);
+		err = ext4_orphan_add(handle, inode);
+		if (err)
+			goto err_unlock_inode;
+		mark_inode_dirty(inode);
+		unlock_new_inode(inode);
+	}
+	if (handle)
+		ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+err_unlock_inode:
+	ext4_journal_stop(handle);
+	unlock_new_inode(inode);
+	return err;
+}
+
+struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+			  struct ext4_dir_entry_2 *de,
+			  int blocksize, int csum_size,
+			  unsigned int parent_ino, int dotdot_real_len)
+{
+	de->inode = cpu_to_le32(inode->i_ino);
+	de->name_len = 1;
+	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+					   blocksize);
+	strcpy(de->name, ".");
+	ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+	de = ext4_next_entry(de, blocksize);
+	de->inode = cpu_to_le32(parent_ino);
+	de->name_len = 2;
+	if (!dotdot_real_len)
+		de->rec_len = ext4_rec_len_to_disk(blocksize -
+					(csum_size + EXT4_DIR_REC_LEN(1)),
+					blocksize);
+	else
+		de->rec_len = ext4_rec_len_to_disk(
+				EXT4_DIR_REC_LEN(de->name_len), blocksize);
+	strcpy(de->name, "..");
+	ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+	return ext4_next_entry(de, blocksize);
+}
+
+static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+			     struct inode *inode)
+{
+	struct buffer_head *dir_block = NULL;
+	struct ext4_dir_entry_2 *de;
+	struct ext4_dir_entry_tail *t;
+	ext4_lblk_t block = 0;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
+	int csum_size = 0;
+	int err;
+
+	if (ext4_has_metadata_csum(dir->i_sb))
+		csum_size = sizeof(struct ext4_dir_entry_tail);
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+		err = ext4_try_create_inline_dir(handle, dir, inode);
+		if (err < 0 && err != -ENOSPC)
+			goto out;
+		if (!err)
+			goto out;
+	}
+
+	inode->i_size = 0;
+	dir_block = ext4_append(handle, inode, &block);
+	if (IS_ERR(dir_block))
+		return PTR_ERR(dir_block);
+	de = (struct ext4_dir_entry_2 *)dir_block->b_data;
+	ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
+	set_nlink(inode, 2);
+	if (csum_size) {
+		t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+		initialize_dirent_tail(t, blocksize);
+	}
+
+	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+	if (err)
+		goto out;
+	set_buffer_verified(dir_block);
+out:
+	brelse(dir_block);
+	return err;
+}
+
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, credits, retries = 0;
+
+	if (EXT4_DIR_LINK_MAX(dir))
+		return -EMLINK;
+
+	dquot_initialize(dir);
+
+	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
+retry:
+	inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
+					    &dentry->d_name,
+					    0, NULL, EXT4_HT_DIR, credits);
+	handle = ext4_journal_current_handle();
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_stop;
+
+	inode->i_op = &ext4_dir_inode_operations;
+	inode->i_fop = &ext4_dir_operations;
+	err = ext4_init_new_dir(handle, dir, inode);
+	if (err)
+		goto out_clear_inode;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	if (ext4_encrypted_inode(dir) ||
+	    DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) {
+		err = ext4_inherit_context(dir, inode);
+		if (err)
+			goto out_clear_inode;
+	}
+#endif
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (!err)
+		err = ext4_add_entry(handle, dentry, inode);
+	if (err) {
+out_clear_inode:
+		clear_nlink(inode);
+		unlock_new_inode(inode);
+		ext4_mark_inode_dirty(handle, inode);
+		iput(inode);
+		goto out_stop;
+	}
+	ext4_inc_count(handle, dir);
+	ext4_update_dx_flag(dir);
+	err = ext4_mark_inode_dirty(handle, dir);
+	if (err)
+		goto out_clear_inode;
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+out_stop:
+	if (handle)
+		ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int ext4_empty_dir(struct inode *inode)
+{
+	unsigned int offset;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *de1;
+	struct super_block *sb;
+	int err = 0;
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+
+		err = empty_inline_dir(inode, &has_inline_data);
+		if (has_inline_data)
+			return err;
+	}
+
+	sb = inode->i_sb;
+	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+		EXT4_ERROR_INODE(inode, "invalid size");
+		return 1;
+	}
+	bh = ext4_read_dirblock(inode, 0, EITHER);
+	if (IS_ERR(bh))
+		return 1;
+
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	de1 = ext4_next_entry(de, sb->s_blocksize);
+	if (le32_to_cpu(de->inode) != inode->i_ino ||
+			!le32_to_cpu(de1->inode) ||
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+		ext4_warning(inode->i_sb,
+			     "bad directory (dir #%lu) - no `.' or `..'",
+			     inode->i_ino);
+		brelse(bh);
+		return 1;
+	}
+	offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+		 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+	de = ext4_next_entry(de1, sb->s_blocksize);
+	while (offset < inode->i_size) {
+		if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			unsigned int lblock;
+			err = 0;
+			brelse(bh);
+			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+			bh = ext4_read_dirblock(inode, lblock, EITHER);
+			if (IS_ERR(bh))
+				return 1;
+			de = (struct ext4_dir_entry_2 *) bh->b_data;
+		}
+		if (ext4_check_dir_entry(inode, NULL, de, bh,
+					 bh->b_data, bh->b_size, offset)) {
+			de = (struct ext4_dir_entry_2 *)(bh->b_data +
+							 sb->s_blocksize);
+			offset = (offset | (sb->s_blocksize - 1)) + 1;
+			continue;
+		}
+		if (le32_to_cpu(de->inode)) {
+			brelse(bh);
+			return 0;
+		}
+		offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+		de = ext4_next_entry(de, sb->s_blocksize);
+	}
+	brelse(bh);
+	return 1;
+}
+
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_mutex unless
+ * we are just creating the inode or deleting it.
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_iloc iloc;
+	int err = 0, rc;
+	bool dirty = false;
+
+	if (!sbi->s_journal || is_bad_inode(inode))
+		return 0;
+
+	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+		     !mutex_is_locked(&inode->i_mutex));
+	/*
+	 * Exit early if inode already is on orphan list. This is a big speedup
+	 * since we don't have to contend on the global s_orphan_lock.
+	 */
+	if (!list_empty(&EXT4_I(inode)->i_orphan))
+		return 0;
+
+	/*
+	 * Orphan handling is only valid for files with data blocks
+	 * being truncated, or files being unlinked. Note that we either
+	 * hold i_mutex, or the inode can not be referenced from outside,
+	 * so i_nlink should not be bumped due to race
+	 */
+	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+	if (err)
+		goto out;
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out;
+
+	mutex_lock(&sbi->s_orphan_lock);
+	/*
+	 * Due to previous errors inode may be already a part of on-disk
+	 * orphan list. If so skip on-disk list modification.
+	 */
+	if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+	    (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+		/* Insert this inode at the head of the on-disk orphan list */
+		NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+		sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+		dirty = true;
+	}
+	list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+	mutex_unlock(&sbi->s_orphan_lock);
+
+	if (dirty) {
+		err = ext4_handle_dirty_super(handle, sb);
+		rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+		if (!err)
+			err = rc;
+		if (err) {
+			/*
+			 * We have to remove inode from in-memory list if
+			 * addition to on disk orphan list failed. Stray orphan
+			 * list entries can cause panics at unmount time.
+			 */
+			mutex_lock(&sbi->s_orphan_lock);
+			list_del(&EXT4_I(inode)->i_orphan);
+			mutex_unlock(&sbi->s_orphan_lock);
+		}
+	}
+	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+	jbd_debug(4, "orphan inode %lu will point to %d\n",
+			inode->i_ino, NEXT_ORPHAN(inode));
+out:
+	ext4_std_error(sb, err);
+	return err;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+	struct list_head *prev;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	__u32 ino_next;
+	struct ext4_iloc iloc;
+	int err = 0;
+
+	if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
+		return 0;
+
+	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+		     !mutex_is_locked(&inode->i_mutex));
+	/* Do this quick check before taking global s_orphan_lock. */
+	if (list_empty(&ei->i_orphan))
+		return 0;
+
+	if (handle) {
+		/* Grab inode buffer early before taking global s_orphan_lock */
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+	}
+
+	mutex_lock(&sbi->s_orphan_lock);
+	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+	prev = ei->i_orphan.prev;
+	list_del_init(&ei->i_orphan);
+
+	/* If we're on an error path, we may not have a valid
+	 * transaction handle with which to update the orphan list on
+	 * disk, but we still need to remove the inode from the linked
+	 * list in memory. */
+	if (!handle || err) {
+		mutex_unlock(&sbi->s_orphan_lock);
+		goto out_err;
+	}
+
+	ino_next = NEXT_ORPHAN(inode);
+	if (prev == &sbi->s_orphan) {
+		jbd_debug(4, "superblock will point to %u\n", ino_next);
+		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+		if (err) {
+			mutex_unlock(&sbi->s_orphan_lock);
+			goto out_brelse;
+		}
+		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+		mutex_unlock(&sbi->s_orphan_lock);
+		err = ext4_handle_dirty_super(handle, inode->i_sb);
+	} else {
+		struct ext4_iloc iloc2;
+		struct inode *i_prev =
+			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+		jbd_debug(4, "orphan inode %lu will point to %u\n",
+			  i_prev->i_ino, ino_next);
+		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+		if (err) {
+			mutex_unlock(&sbi->s_orphan_lock);
+			goto out_brelse;
+		}
+		NEXT_ORPHAN(i_prev) = ino_next;
+		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+		mutex_unlock(&sbi->s_orphan_lock);
+	}
+	if (err)
+		goto out_brelse;
+	NEXT_ORPHAN(inode) = 0;
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out_err:
+	ext4_std_error(inode->i_sb, err);
+	return err;
+
+out_brelse:
+	brelse(iloc.bh);
+	goto out_err;
+}
+
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+	handle_t *handle = NULL;
+
+	/* Initialize quotas before so that eventual writes go in
+	 * separate transaction */
+	dquot_initialize(dir);
+	dquot_initialize(d_inode(dentry));
+
+	retval = -ENOENT;
+	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+	if (!bh)
+		goto end_rmdir;
+
+	inode = d_inode(dentry);
+
+	retval = -EIO;
+	if (le32_to_cpu(de->inode) != inode->i_ino)
+		goto end_rmdir;
+
+	retval = -ENOTEMPTY;
+	if (!ext4_empty_dir(inode))
+		goto end_rmdir;
+
+	handle = ext4_journal_start(dir, EXT4_HT_DIR,
+				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle)) {
+		retval = PTR_ERR(handle);
+		handle = NULL;
+		goto end_rmdir;
+	}
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	retval = ext4_delete_entry(handle, dir, de, bh);
+	if (retval)
+		goto end_rmdir;
+	if (!EXT4_DIR_LINK_EMPTY(inode))
+		ext4_warning(inode->i_sb,
+			     "empty directory has too many links (%d)",
+			     inode->i_nlink);
+	inode->i_version++;
+	clear_nlink(inode);
+	/* There's no need to set i_disksize: the fact that i_nlink is
+	 * zero will ensure that the right thing happens during any
+	 * recovery. */
+	inode->i_size = 0;
+	ext4_orphan_add(handle, inode);
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_dec_count(handle, dir);
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+
+end_rmdir:
+	brelse(bh);
+	if (handle)
+		ext4_journal_stop(handle);
+	return retval;
+}
+
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+	handle_t *handle = NULL;
+
+	trace_ext4_unlink_enter(dir, dentry);
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	dquot_initialize(dir);
+	dquot_initialize(d_inode(dentry));
+
+	retval = -ENOENT;
+	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+	if (!bh)
+		goto end_unlink;
+
+	inode = d_inode(dentry);
+
+	retval = -EIO;
+	if (le32_to_cpu(de->inode) != inode->i_ino)
+		goto end_unlink;
+
+	handle = ext4_journal_start(dir, EXT4_HT_DIR,
+				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle)) {
+		retval = PTR_ERR(handle);
+		handle = NULL;
+		goto end_unlink;
+	}
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	if (!inode->i_nlink) {
+		ext4_warning(inode->i_sb,
+			     "Deleting nonexistent file (%lu), %d",
+			     inode->i_ino, inode->i_nlink);
+		set_nlink(inode, 1);
+	}
+	retval = ext4_delete_entry(handle, dir, de, bh);
+	if (retval)
+		goto end_unlink;
+	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+	drop_nlink(inode);
+	if (!inode->i_nlink)
+		ext4_orphan_add(handle, inode);
+	inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+
+end_unlink:
+	brelse(bh);
+	if (handle)
+		ext4_journal_stop(handle);
+	trace_ext4_unlink_exit(dentry, retval);
+	return retval;
+}
+
+static int ext4_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, len = strlen(symname);
+	int credits;
+	bool encryption_required;
+	struct ext4_str disk_link;
+	struct ext4_encrypted_symlink_data *sd = NULL;
+
+	disk_link.len = len + 1;
+	disk_link.name = (char *) symname;
+
+	encryption_required = (ext4_encrypted_inode(dir) ||
+			       DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
+	if (encryption_required)
+		disk_link.len = encrypted_symlink_data_len(len) + 1;
+	if (disk_link.len > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+	dquot_initialize(dir);
+
+	if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
+		/*
+		 * For non-fast symlinks, we just allocate inode and put it on
+		 * orphan list in the first transaction => we need bitmap,
+		 * group descriptor, sb, inode block, quota blocks, and
+		 * possibly selinux xattr blocks.
+		 */
+		credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  EXT4_XATTR_TRANS_BLOCKS;
+	} else {
+		/*
+		 * Fast symlink. We have to add entry to directory
+		 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+		 * allocate new inode (bitmap, group descriptor, inode block,
+		 * quota blocks, sb is already counted in previous macros).
+		 */
+		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
+	}
+
+	inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
+					    &dentry->d_name, 0, NULL,
+					    EXT4_HT_DIR, credits);
+	handle = ext4_journal_current_handle();
+	if (IS_ERR(inode)) {
+		if (handle)
+			ext4_journal_stop(handle);
+		return PTR_ERR(inode);
+	}
+
+	if (encryption_required) {
+		struct ext4_fname_crypto_ctx *ctx = NULL;
+		struct qstr istr;
+		struct ext4_str ostr;
+
+		sd = kzalloc(disk_link.len, GFP_NOFS);
+		if (!sd) {
+			err = -ENOMEM;
+			goto err_drop_inode;
+		}
+		err = ext4_inherit_context(dir, inode);
+		if (err)
+			goto err_drop_inode;
+		ctx = ext4_get_fname_crypto_ctx(inode,
+						inode->i_sb->s_blocksize);
+		if (IS_ERR_OR_NULL(ctx)) {
+			/* We just set the policy, so ctx should not be NULL */
+			err = (ctx == NULL) ? -EIO : PTR_ERR(ctx);
+			goto err_drop_inode;
+		}
+		istr.name = (const unsigned char *) symname;
+		istr.len = len;
+		ostr.name = sd->encrypted_path;
+		err = ext4_fname_usr_to_disk(ctx, &istr, &ostr);
+		ext4_put_fname_crypto_ctx(&ctx);
+		if (err < 0)
+			goto err_drop_inode;
+		sd->len = cpu_to_le16(ostr.len);
+		disk_link.name = (char *) sd;
+	}
+
+	if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
+		inode->i_op = &ext4_symlink_inode_operations;
+		ext4_set_aops(inode);
+		/*
+		 * We cannot call page_symlink() with transaction started
+		 * because it calls into ext4_write_begin() which can wait
+		 * for transaction commit if we are running out of space
+		 * and thus we deadlock. So we have to stop transaction now
+		 * and restart it when symlink contents is written.
+		 * 
+		 * To keep fs consistent in case of crash, we have to put inode
+		 * to orphan list in the mean time.
+		 */
+		drop_nlink(inode);
+		err = ext4_orphan_add(handle, inode);
+		ext4_journal_stop(handle);
+		handle = NULL;
+		if (err)
+			goto err_drop_inode;
+		err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
+		if (err)
+			goto err_drop_inode;
+		/*
+		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+		 */
+		handle = ext4_journal_start(dir, EXT4_HT_DIR,
+				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			handle = NULL;
+			goto err_drop_inode;
+		}
+		set_nlink(inode, 1);
+		err = ext4_orphan_del(handle, inode);
+		if (err)
+			goto err_drop_inode;
+	} else {
+		/* clear the extent format for fast symlink */
+		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+		inode->i_op = encryption_required ?
+			&ext4_symlink_inode_operations :
+			&ext4_fast_symlink_inode_operations;
+		memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
+		       disk_link.len);
+		inode->i_size = disk_link.len - 1;
+	}
+	EXT4_I(inode)->i_disksize = inode->i_size;
+	err = ext4_add_nondir(handle, dentry, inode);
+	if (!err && IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	if (handle)
+		ext4_journal_stop(handle);
+	kfree(sd);
+	return err;
+err_drop_inode:
+	if (handle)
+		ext4_journal_stop(handle);
+	kfree(sd);
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+static int ext4_link(struct dentry *old_dentry,
+		     struct inode *dir, struct dentry *dentry)
+{
+	handle_t *handle;
+	struct inode *inode = d_inode(old_dentry);
+	int err, retries = 0;
+
+	if (inode->i_nlink >= EXT4_LINK_MAX)
+		return -EMLINK;
+	if (ext4_encrypted_inode(dir) &&
+	    !ext4_is_child_context_consistent_with_parent(dir, inode))
+		return -EPERM;
+	dquot_initialize(dir);
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_HT_DIR,
+		(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		 EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode->i_ctime = ext4_current_time(inode);
+	ext4_inc_count(handle, inode);
+	ihold(inode);
+
+	err = ext4_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext4_mark_inode_dirty(handle, inode);
+		/* this can happen only for tmpfile being
+		 * linked the first time
+		 */
+		if (inode->i_nlink == 1)
+			ext4_orphan_del(handle, inode);
+		d_instantiate(dentry, inode);
+	} else {
+		drop_nlink(inode);
+		iput(inode);
+	}
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+
+/*
+ * Try to find buffer head where contains the parent block.
+ * It should be the inode block if it is inlined or the 1st block
+ * if it is a normal dir.
+ */
+static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
+					struct inode *inode,
+					int *retval,
+					struct ext4_dir_entry_2 **parent_de,
+					int *inlined)
+{
+	struct buffer_head *bh;
+
+	if (!ext4_has_inline_data(inode)) {
+		bh = ext4_read_dirblock(inode, 0, EITHER);
+		if (IS_ERR(bh)) {
+			*retval = PTR_ERR(bh);
+			return NULL;
+		}
+		*parent_de = ext4_next_entry(
+					(struct ext4_dir_entry_2 *)bh->b_data,
+					inode->i_sb->s_blocksize);
+		return bh;
+	}
+
+	*inlined = 1;
+	return ext4_get_first_inline_block(inode, parent_de, retval);
+}
+
+struct ext4_renament {
+	struct inode *dir;
+	struct dentry *dentry;
+	struct inode *inode;
+	bool is_dir;
+	int dir_nlink_delta;
+
+	/* entry for "dentry" */
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+	int inlined;
+
+	/* entry for ".." in inode if it's a directory */
+	struct buffer_head *dir_bh;
+	struct ext4_dir_entry_2 *parent_de;
+	int dir_inlined;
+};
+
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+{
+	int retval;
+
+	ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
+					      &retval, &ent->parent_de,
+					      &ent->dir_inlined);
+	if (!ent->dir_bh)
+		return retval;
+	if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
+		return -EIO;
+	BUFFER_TRACE(ent->dir_bh, "get_write_access");
+	return ext4_journal_get_write_access(handle, ent->dir_bh);
+}
+
+static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
+				  unsigned dir_ino)
+{
+	int retval;
+
+	ent->parent_de->inode = cpu_to_le32(dir_ino);
+	BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
+	if (!ent->dir_inlined) {
+		if (is_dx(ent->inode)) {
+			retval = ext4_handle_dirty_dx_node(handle,
+							   ent->inode,
+							   ent->dir_bh);
+		} else {
+			retval = ext4_handle_dirty_dirent_node(handle,
+							       ent->inode,
+							       ent->dir_bh);
+		}
+	} else {
+		retval = ext4_mark_inode_dirty(handle, ent->inode);
+	}
+	if (retval) {
+		ext4_std_error(ent->dir->i_sb, retval);
+		return retval;
+	}
+	return 0;
+}
+
+static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
+		       unsigned ino, unsigned file_type)
+{
+	int retval;
+
+	BUFFER_TRACE(ent->bh, "get write access");
+	retval = ext4_journal_get_write_access(handle, ent->bh);
+	if (retval)
+		return retval;
+	ent->de->inode = cpu_to_le32(ino);
+	if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
+				      EXT4_FEATURE_INCOMPAT_FILETYPE))
+		ent->de->file_type = file_type;
+	ent->dir->i_version++;
+	ent->dir->i_ctime = ent->dir->i_mtime =
+		ext4_current_time(ent->dir);
+	ext4_mark_inode_dirty(handle, ent->dir);
+	BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
+	if (!ent->inlined) {
+		retval = ext4_handle_dirty_dirent_node(handle,
+						       ent->dir, ent->bh);
+		if (unlikely(retval)) {
+			ext4_std_error(ent->dir->i_sb, retval);
+			return retval;
+		}
+	}
+	brelse(ent->bh);
+	ent->bh = NULL;
+
+	return 0;
+}
+
+static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
+				  const struct qstr *d_name)
+{
+	int retval = -ENOENT;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+
+	bh = ext4_find_entry(dir, d_name, &de, NULL);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+	if (bh) {
+		retval = ext4_delete_entry(handle, dir, de, bh);
+		brelse(bh);
+	}
+	return retval;
+}
+
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
+			       int force_reread)
+{
+	int retval;
+	/*
+	 * ent->de could have moved from under us during htree split, so make
+	 * sure that we are deleting the right entry.  We might also be pointing
+	 * to a stale entry in the unused part of ent->bh so just checking inum
+	 * and the name isn't enough.
+	 */
+	if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
+	    ent->de->name_len != ent->dentry->d_name.len ||
+	    strncmp(ent->de->name, ent->dentry->d_name.name,
+		    ent->de->name_len) ||
+	    force_reread) {
+		retval = ext4_find_delete_entry(handle, ent->dir,
+						&ent->dentry->d_name);
+	} else {
+		retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+		if (retval == -ENOENT) {
+			retval = ext4_find_delete_entry(handle, ent->dir,
+							&ent->dentry->d_name);
+		}
+	}
+
+	if (retval) {
+		ext4_warning(ent->dir->i_sb,
+				"Deleting old file (%lu), %d, error=%d",
+				ent->dir->i_ino, ent->dir->i_nlink, retval);
+	}
+}
+
+static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
+{
+	if (ent->dir_nlink_delta) {
+		if (ent->dir_nlink_delta == -1)
+			ext4_dec_count(handle, ent->dir);
+		else
+			ext4_inc_count(handle, ent->dir);
+		ext4_mark_inode_dirty(handle, ent->dir);
+	}
+}
+
+static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
+					      int credits, handle_t **h)
+{
+	struct inode *wh;
+	handle_t *handle;
+	int retries = 0;
+
+	/*
+	 * for inode block, sb block, group summaries,
+	 * and inode bitmap
+	 */
+	credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
+		    EXT4_XATTR_TRANS_BLOCKS + 4);
+retry:
+	wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE,
+					 &ent->dentry->d_name, 0, NULL,
+					 EXT4_HT_DIR, credits);
+
+	handle = ext4_journal_current_handle();
+	if (IS_ERR(wh)) {
+		if (handle)
+			ext4_journal_stop(handle);
+		if (PTR_ERR(wh) == -ENOSPC &&
+		    ext4_should_retry_alloc(ent->dir->i_sb, &retries))
+			goto retry;
+	} else {
+		*h = handle;
+		init_special_inode(wh, wh->i_mode, WHITEOUT_DEV);
+		wh->i_op = &ext4_special_inode_operations;
+	}
+	return wh;
+}
+
+/*
+ * Anybody can rename anything with this: the permission checks are left to the
+ * higher-level routines.
+ *
+ * n.b.  old_{dentry,inode) refers to the source dentry/inode
+ * while new_{dentry,inode) refers to the destination dentry/inode
+ * This comes from rename(const char *oldpath, const char *newpath)
+ */
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry,
+		       unsigned int flags)
+{
+	handle_t *handle = NULL;
+	struct ext4_renament old = {
+		.dir = old_dir,
+		.dentry = old_dentry,
+		.inode = d_inode(old_dentry),
+	};
+	struct ext4_renament new = {
+		.dir = new_dir,
+		.dentry = new_dentry,
+		.inode = d_inode(new_dentry),
+	};
+	int force_reread;
+	int retval;
+	struct inode *whiteout = NULL;
+	int credits;
+	u8 old_file_type;
+
+	dquot_initialize(old.dir);
+	dquot_initialize(new.dir);
+
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	if (new.inode)
+		dquot_initialize(new.inode);
+
+	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+	if (IS_ERR(old.bh))
+		return PTR_ERR(old.bh);
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	retval = -ENOENT;
+	if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
+		goto end_rename;
+
+	if ((old.dir != new.dir) &&
+	    ext4_encrypted_inode(new.dir) &&
+	    !ext4_is_child_context_consistent_with_parent(new.dir,
+							  old.inode)) {
+		retval = -EPERM;
+		goto end_rename;
+	}
+
+	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+				 &new.de, &new.inlined);
+	if (IS_ERR(new.bh)) {
+		retval = PTR_ERR(new.bh);
+		new.bh = NULL;
+		goto end_rename;
+	}
+	if (new.bh) {
+		if (!new.inode) {
+			brelse(new.bh);
+			new.bh = NULL;
+		}
+	}
+	if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
+		ext4_alloc_da_blocks(old.inode);
+
+	credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+	if (!(flags & RENAME_WHITEOUT)) {
+		handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
+		if (IS_ERR(handle)) {
+			retval = PTR_ERR(handle);
+			handle = NULL;
+			goto end_rename;
+		}
+	} else {
+		whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
+		if (IS_ERR(whiteout)) {
+			retval = PTR_ERR(whiteout);
+			whiteout = NULL;
+			goto end_rename;
+		}
+	}
+
+	if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+		ext4_handle_sync(handle);
+
+	if (S_ISDIR(old.inode->i_mode)) {
+		if (new.inode) {
+			retval = -ENOTEMPTY;
+			if (!ext4_empty_dir(new.inode))
+				goto end_rename;
+		} else {
+			retval = -EMLINK;
+			if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
+				goto end_rename;
+		}
+		retval = ext4_rename_dir_prepare(handle, &old);
+		if (retval)
+			goto end_rename;
+	}
+	/*
+	 * If we're renaming a file within an inline_data dir and adding or
+	 * setting the new dirent causes a conversion from inline_data to
+	 * extents/blockmap, we need to force the dirent delete code to
+	 * re-read the directory, or else we end up trying to delete a dirent
+	 * from what is now the extent tree root (or a block map).
+	 */
+	force_reread = (new.dir->i_ino == old.dir->i_ino &&
+			ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
+
+	old_file_type = old.de->file_type;
+	if (whiteout) {
+		/*
+		 * Do this before adding a new entry, so the old entry is sure
+		 * to be still pointing to the valid old entry.
+		 */
+		retval = ext4_setent(handle, &old, whiteout->i_ino,
+				     EXT4_FT_CHRDEV);
+		if (retval)
+			goto end_rename;
+		ext4_mark_inode_dirty(handle, whiteout);
+	}
+	if (!new.bh) {
+		retval = ext4_add_entry(handle, new.dentry, old.inode);
+		if (retval)
+			goto end_rename;
+	} else {
+		retval = ext4_setent(handle, &new,
+				     old.inode->i_ino, old_file_type);
+		if (retval)
+			goto end_rename;
+	}
+	if (force_reread)
+		force_reread = !ext4_test_inode_flag(new.dir,
+						     EXT4_INODE_INLINE_DATA);
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old.inode->i_ctime = ext4_current_time(old.inode);
+	ext4_mark_inode_dirty(handle, old.inode);
+
+	if (!whiteout) {
+		/*
+		 * ok, that's it
+		 */
+		ext4_rename_delete(handle, &old, force_reread);
+	}
+
+	if (new.inode) {
+		ext4_dec_count(handle, new.inode);
+		new.inode->i_ctime = ext4_current_time(new.inode);
+	}
+	old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
+	ext4_update_dx_flag(old.dir);
+	if (old.dir_bh) {
+		retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+		if (retval)
+			goto end_rename;
+
+		ext4_dec_count(handle, old.dir);
+		if (new.inode) {
+			/* checked ext4_empty_dir above, can't have another
+			 * parent, ext4_dec_count() won't work for many-linked
+			 * dirs */
+			clear_nlink(new.inode);
+		} else {
+			ext4_inc_count(handle, new.dir);
+			ext4_update_dx_flag(new.dir);
+			ext4_mark_inode_dirty(handle, new.dir);
+		}
+	}
+	ext4_mark_inode_dirty(handle, old.dir);
+	if (new.inode) {
+		ext4_mark_inode_dirty(handle, new.inode);
+		if (!new.inode->i_nlink)
+			ext4_orphan_add(handle, new.inode);
+	}
+	retval = 0;
+
+end_rename:
+	brelse(old.dir_bh);
+	brelse(old.bh);
+	brelse(new.bh);
+	if (whiteout) {
+		if (retval)
+			drop_nlink(whiteout);
+		unlock_new_inode(whiteout);
+		iput(whiteout);
+	}
+	if (handle)
+		ext4_journal_stop(handle);
+	return retval;
+}
+
+static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry)
+{
+	handle_t *handle = NULL;
+	struct ext4_renament old = {
+		.dir = old_dir,
+		.dentry = old_dentry,
+		.inode = d_inode(old_dentry),
+	};
+	struct ext4_renament new = {
+		.dir = new_dir,
+		.dentry = new_dentry,
+		.inode = d_inode(new_dentry),
+	};
+	u8 new_file_type;
+	int retval;
+
+	dquot_initialize(old.dir);
+	dquot_initialize(new.dir);
+
+	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
+				 &old.de, &old.inlined);
+	if (IS_ERR(old.bh))
+		return PTR_ERR(old.bh);
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	retval = -ENOENT;
+	if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
+		goto end_rename;
+
+	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+				 &new.de, &new.inlined);
+	if (IS_ERR(new.bh)) {
+		retval = PTR_ERR(new.bh);
+		new.bh = NULL;
+		goto end_rename;
+	}
+
+	/* RENAME_EXCHANGE case: old *and* new must both exist */
+	if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
+		goto end_rename;
+
+	handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+		(2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+		 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+	if (IS_ERR(handle)) {
+		retval = PTR_ERR(handle);
+		handle = NULL;
+		goto end_rename;
+	}
+
+	if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+		ext4_handle_sync(handle);
+
+	if (S_ISDIR(old.inode->i_mode)) {
+		old.is_dir = true;
+		retval = ext4_rename_dir_prepare(handle, &old);
+		if (retval)
+			goto end_rename;
+	}
+	if (S_ISDIR(new.inode->i_mode)) {
+		new.is_dir = true;
+		retval = ext4_rename_dir_prepare(handle, &new);
+		if (retval)
+			goto end_rename;
+	}
+
+	/*
+	 * Other than the special case of overwriting a directory, parents'
+	 * nlink only needs to be modified if this is a cross directory rename.
+	 */
+	if (old.dir != new.dir && old.is_dir != new.is_dir) {
+		old.dir_nlink_delta = old.is_dir ? -1 : 1;
+		new.dir_nlink_delta = -old.dir_nlink_delta;
+		retval = -EMLINK;
+		if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
+		    (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
+			goto end_rename;
+	}
+
+	new_file_type = new.de->file_type;
+	retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
+	if (retval)
+		goto end_rename;
+
+	retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
+	if (retval)
+		goto end_rename;
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old.inode->i_ctime = ext4_current_time(old.inode);
+	new.inode->i_ctime = ext4_current_time(new.inode);
+	ext4_mark_inode_dirty(handle, old.inode);
+	ext4_mark_inode_dirty(handle, new.inode);
+
+	if (old.dir_bh) {
+		retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+		if (retval)
+			goto end_rename;
+	}
+	if (new.dir_bh) {
+		retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
+		if (retval)
+			goto end_rename;
+	}
+	ext4_update_dir_count(handle, &old);
+	ext4_update_dir_count(handle, &new);
+	retval = 0;
+
+end_rename:
+	brelse(old.dir_bh);
+	brelse(new.dir_bh);
+	brelse(old.bh);
+	brelse(new.bh);
+	if (handle)
+		ext4_journal_stop(handle);
+	return retval;
+}
+
+static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags)
+{
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+		return -EINVAL;
+
+	if (flags & RENAME_EXCHANGE) {
+		return ext4_cross_rename(old_dir, old_dentry,
+					 new_dir, new_dentry);
+	}
+
+	return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+}
+
+/*
+ * directories can handle most operations...
+ */
+const struct inode_operations ext4_dir_inode_operations = {
+	.create		= ext4_create,
+	.lookup		= ext4_lookup,
+	.link		= ext4_link,
+	.unlink		= ext4_unlink,
+	.symlink	= ext4_symlink,
+	.mkdir		= ext4_mkdir,
+	.rmdir		= ext4_rmdir,
+	.mknod		= ext4_mknod,
+	.tmpfile	= ext4_tmpfile,
+	.rename2	= ext4_rename2,
+	.setattr	= ext4_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
+	.fiemap         = ext4_fiemap,
+};
+
+const struct inode_operations ext4_special_inode_operations = {
+	.setattr	= ext4_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
+};
diff --git a/kernel/fs/ext4/page-io.c b/kernel/fs/ext4/page-io.c
new file mode 100644
index 000000000..5765f88b3
--- /dev/null
+++ b/kernel/fs/ext4/page-io.c
@@ -0,0 +1,529 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+
+static struct kmem_cache *io_end_cachep;
+
+int __init ext4_init_pageio(void)
+{
+	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+	if (io_end_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void ext4_exit_pageio(void)
+{
+	kmem_cache_destroy(io_end_cachep);
+}
+
+/*
+ * Print an buffer I/O error compatible with the fs/buffer.c.  This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message.  We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+	printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+			bdevname(bh->b_bdev, b),
+			(unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_finish_bio(struct bio *bio)
+{
+	int i;
+	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+		struct page *data_page = NULL;
+		struct ext4_crypto_ctx *ctx = NULL;
+#endif
+		struct buffer_head *bh, *head;
+		unsigned bio_start = bvec->bv_offset;
+		unsigned bio_end = bio_start + bvec->bv_len;
+		unsigned under_io = 0;
+		unsigned long flags;
+
+		if (!page)
+			continue;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+		if (!page->mapping) {
+			/* The bounce data pages are unmapped. */
+			data_page = page;
+			ctx = (struct ext4_crypto_ctx *)page_private(data_page);
+			page = ctx->control_page;
+		}
+#endif
+
+		if (error) {
+			SetPageError(page);
+			set_bit(AS_EIO, &page->mapping->flags);
+		}
+		bh = head = page_buffers(page);
+		/*
+		 * We check all buffers in the page under BH_Uptodate_Lock
+		 * to avoid races with other end io clearing async_write flags
+		 */
+		local_irq_save(flags);
+		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+		do {
+			if (bh_offset(bh) < bio_start ||
+			    bh_offset(bh) + bh->b_size > bio_end) {
+				if (buffer_async_write(bh))
+					under_io++;
+				continue;
+			}
+			clear_buffer_async_write(bh);
+			if (error)
+				buffer_io_error(bh);
+		} while ((bh = bh->b_this_page) != head);
+		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+		local_irq_restore(flags);
+		if (!under_io) {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+			if (ctx)
+				ext4_restore_control_page(data_page);
+#endif
+			end_page_writeback(page);
+		}
+	}
+}
+
+static void ext4_release_io_end(ext4_io_end_t *io_end)
+{
+	struct bio *bio, *next_bio;
+
+	BUG_ON(!list_empty(&io_end->list));
+	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+	WARN_ON(io_end->handle);
+
+	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+		wake_up_all(ext4_ioend_wq(io_end->inode));
+
+	for (bio = io_end->bio; bio; bio = next_bio) {
+		next_bio = bio->bi_private;
+		ext4_finish_bio(bio);
+		bio_put(bio);
+	}
+	kmem_cache_free(io_end_cachep, io_end);
+}
+
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
+{
+	struct inode *inode = io_end->inode;
+
+	io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+	/* Wake up anyone waiting on unwritten extent conversion */
+	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+		wake_up_all(ext4_ioend_wq(inode));
+}
+
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
+ */
+static int ext4_end_io(ext4_io_end_t *io)
+{
+	struct inode *inode = io->inode;
+	loff_t offset = io->offset;
+	ssize_t size = io->size;
+	handle_t *handle = io->handle;
+	int ret = 0;
+
+	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+		   "list->prev 0x%p\n",
+		   io, inode->i_ino, io->list.next, io->list.prev);
+
+	io->handle = NULL;	/* Following call will use up the handle */
+	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+	if (ret < 0) {
+		ext4_msg(inode->i_sb, KERN_EMERG,
+			 "failed to convert unwritten extents to written "
+			 "extents -- potential data loss!  "
+			 "(inode %lu, offset %llu, size %zd, error %d)",
+			 inode->i_ino, offset, size, ret);
+	}
+	ext4_clear_io_unwritten_flag(io);
+	ext4_release_io_end(io);
+	return ret;
+}
+
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
+{
+#ifdef	EXT4FS_DEBUG
+	struct list_head *cur, *before, *after;
+	ext4_io_end_t *io, *io0, *io1;
+
+	if (list_empty(head))
+		return;
+
+	ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+	list_for_each_entry(io, head, list) {
+		cur = &io->list;
+		before = cur->prev;
+		io0 = container_of(before, ext4_io_end_t, list);
+		after = cur->next;
+		io1 = container_of(after, ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			    io, inode->i_ino, io0, io1);
+	}
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
+{
+	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+	struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
+	struct workqueue_struct *wq;
+	unsigned long flags;
+
+	/* Only reserved conversions from writeback should enter here */
+	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+	WARN_ON(!io_end->handle && sbi->s_journal);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	wq = sbi->rsv_conversion_wq;
+	if (list_empty(&ei->i_rsv_conversion_list))
+		queue_work(wq, &ei->i_rsv_conversion_work);
+	list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
+
+static int ext4_do_flush_completed_IO(struct inode *inode,
+				      struct list_head *head)
+{
+	ext4_io_end_t *io;
+	struct list_head unwritten;
+	unsigned long flags;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int err, ret = 0;
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	dump_completed_IO(inode, head);
+	list_replace_init(head, &unwritten);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	while (!list_empty(&unwritten)) {
+		io = list_entry(unwritten.next, ext4_io_end_t, list);
+		BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+		list_del_init(&io->list);
+
+		err = ext4_end_io(io);
+		if (unlikely(!ret && err))
+			ret = err;
+	}
+	return ret;
+}
+
+/*
+ * work on completed IO, to convert unwritten extents to extents
+ */
+void ext4_end_io_rsv_work(struct work_struct *work)
+{
+	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+						  i_rsv_conversion_work);
+	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
+}
+
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+	ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
+	if (io) {
+		atomic_inc(&EXT4_I(inode)->i_ioend_count);
+		io->inode = inode;
+		INIT_LIST_HEAD(&io->list);
+		atomic_set(&io->count, 1);
+	}
+	return io;
+}
+
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
+{
+	if (atomic_dec_and_test(&io_end->count)) {
+		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+			ext4_release_io_end(io_end);
+			return;
+		}
+		ext4_add_complete_io(io_end);
+	}
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+	int err = 0;
+
+	if (atomic_dec_and_test(&io_end->count)) {
+		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+			err = ext4_convert_unwritten_extents(io_end->handle,
+						io_end->inode, io_end->offset,
+						io_end->size);
+			io_end->handle = NULL;
+			ext4_clear_io_unwritten_flag(io_end);
+		}
+		ext4_release_io_end(io_end);
+	}
+	return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+	atomic_inc(&io_end->count);
+	return io_end;
+}
+
+/* BIO completion function for page writeback */
+static void ext4_end_bio(struct bio *bio, int error)
+{
+	ext4_io_end_t *io_end = bio->bi_private;
+	sector_t bi_sector = bio->bi_iter.bi_sector;
+
+	BUG_ON(!io_end);
+	bio->bi_end_io = NULL;
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = 0;
+
+	if (error) {
+		struct inode *inode = io_end->inode;
+
+		ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
+			     "(offset %llu size %ld starting block %llu)",
+			     error, inode->i_ino,
+			     (unsigned long long) io_end->offset,
+			     (long) io_end->size,
+			     (unsigned long long)
+			     bi_sector >> (inode->i_blkbits - 9));
+		mapping_set_error(inode->i_mapping, error);
+	}
+
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+		/*
+		 * Link bio into list hanging from io_end. We have to do it
+		 * atomically as bio completions can be racing against each
+		 * other.
+		 */
+		bio->bi_private = xchg(&io_end->bio, bio);
+		ext4_put_io_end_defer(io_end);
+	} else {
+		/*
+		 * Drop io_end reference early. Inode can get freed once
+		 * we finish the bio.
+		 */
+		ext4_put_io_end_defer(io_end);
+		ext4_finish_bio(bio);
+		bio_put(bio);
+	}
+}
+
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+	struct bio *bio = io->io_bio;
+
+	if (bio) {
+		bio_get(io->io_bio);
+		submit_bio(io->io_op, io->io_bio);
+		BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
+		bio_put(io->io_bio);
+	}
+	io->io_bio = NULL;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+			 struct writeback_control *wbc)
+{
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
+	io->io_bio = NULL;
+	io->io_end = NULL;
+}
+
+static int io_submit_init_bio(struct ext4_io_submit *io,
+			      struct buffer_head *bh)
+{
+	int nvecs = bio_get_nr_vecs(bh->b_bdev);
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+	if (!bio)
+		return -ENOMEM;
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	bio->bi_end_io = ext4_end_bio;
+	bio->bi_private = ext4_get_io_end(io->io_end);
+	io->io_bio = bio;
+	io->io_next_block = bh->b_blocknr;
+	return 0;
+}
+
+static int io_submit_add_bh(struct ext4_io_submit *io,
+			    struct inode *inode,
+			    struct page *page,
+			    struct buffer_head *bh)
+{
+	int ret;
+
+	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:
+		ext4_io_submit(io);
+	}
+	if (io->io_bio == NULL) {
+		ret = io_submit_init_bio(io, bh);
+		if (ret)
+			return ret;
+	}
+	ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
+	if (ret != bh->b_size)
+		goto submit_and_retry;
+	io->io_next_block++;
+	return 0;
+}
+
+int ext4_bio_write_page(struct ext4_io_submit *io,
+			struct page *page,
+			int len,
+			struct writeback_control *wbc,
+			bool keep_towrite)
+{
+	struct page *data_page = NULL;
+	struct inode *inode = page->mapping->host;
+	unsigned block_start, blocksize;
+	struct buffer_head *bh, *head;
+	int ret = 0;
+	int nr_submitted = 0;
+
+	blocksize = 1 << inode->i_blkbits;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageWriteback(page));
+
+	if (keep_towrite)
+		set_page_writeback_keepwrite(page);
+	else
+		set_page_writeback(page);
+	ClearPageError(page);
+
+	/*
+	 * Comments copied from block_write_full_page:
+	 *
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	if (len < PAGE_CACHE_SIZE)
+		zero_user_segment(page, len, PAGE_CACHE_SIZE);
+	/*
+	 * In the first loop we prepare and mark buffers to submit. We have to
+	 * mark all buffers in the page before submitting so that
+	 * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+	 * on the first buffer finishes and we are still working on submitting
+	 * the second buffer.
+	 */
+	bh = head = page_buffers(page);
+	do {
+		block_start = bh_offset(bh);
+		if (block_start >= len) {
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+			continue;
+		}
+		if (!buffer_dirty(bh) || buffer_delay(bh) ||
+		    !buffer_mapped(bh) || buffer_unwritten(bh)) {
+			/* A hole? We can safely clear the dirty bit */
+			if (!buffer_mapped(bh))
+				clear_buffer_dirty(bh);
+			if (io->io_bio)
+				ext4_io_submit(io);
+			continue;
+		}
+		if (buffer_new(bh)) {
+			clear_buffer_new(bh);
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		}
+		set_buffer_async_write(bh);
+	} while ((bh = bh->b_this_page) != head);
+
+	bh = head = page_buffers(page);
+
+	if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+		data_page = ext4_encrypt(inode, page);
+		if (IS_ERR(data_page)) {
+			ret = PTR_ERR(data_page);
+			data_page = NULL;
+			goto out;
+		}
+	}
+
+	/* Now submit buffers to write */
+	do {
+		if (!buffer_async_write(bh))
+			continue;
+		ret = io_submit_add_bh(io, inode,
+				       data_page ? data_page : page, bh);
+		if (ret) {
+			/*
+			 * We only get here on ENOMEM.  Not much else
+			 * we can do but mark the page as dirty, and
+			 * better luck next time.
+			 */
+			break;
+		}
+		nr_submitted++;
+		clear_buffer_dirty(bh);
+	} while ((bh = bh->b_this_page) != head);
+
+	/* Error stopped previous loop? Clean up buffers... */
+	if (ret) {
+	out:
+		if (data_page)
+			ext4_restore_control_page(data_page);
+		printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+		redirty_page_for_writepage(wbc, page);
+		do {
+			clear_buffer_async_write(bh);
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+	unlock_page(page);
+	/* Nothing submitted - we have to end page writeback */
+	if (!nr_submitted)
+		end_page_writeback(page);
+	return ret;
+}
diff --git a/kernel/fs/ext4/readpage.c b/kernel/fs/ext4/readpage.c
new file mode 100644
index 000000000..171b9ac4b
--- /dev/null
+++ b/kernel/fs/ext4/readpage.c
@@ -0,0 +1,328 @@
+/*
+ * linux/fs/ext4/readpage.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This was originally taken from fs/mpage.c
+ *
+ * The intent is the ext4_mpage_readpages() function here is intended
+ * to replace mpage_readpages() in the general case, not just for
+ * encrypted files.  It has some limitations (see below), where it
+ * will fall back to read_block_full_page(), but these limitations
+ * should only be hit when page_size != block_size.
+ *
+ * This will allow us to attach a callback function to support ext4
+ * encryption.
+ *
+ * If anything unusual happens, such as:
+ *
+ * - encountering a page which has buffers
+ * - encountering a page which has a non-hole after a hole
+ * - encountering a page with non-contiguous blocks
+ *
+ * then this code just gives up and calls the buffer_head-based read function.
+ * It does handle a page which has holes at the end - that is a common case:
+ * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/kdev_t.h>
+#include <linux/gfp.h>
+#include <linux/bio.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/highmem.h>
+#include <linux/prefetch.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/cleancache.h>
+
+#include "ext4.h"
+
+/*
+ * Call ext4_decrypt on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	struct ext4_crypto_ctx *ctx =
+		container_of(work, struct ext4_crypto_ctx, work);
+	struct bio	*bio	= ctx->bio;
+	struct bio_vec	*bv;
+	int		i;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		struct page *page = bv->bv_page;
+
+		int ret = ext4_decrypt(ctx, page);
+		if (ret) {
+			WARN_ON_ONCE(1);
+			SetPageError(page);
+		} else
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	ext4_release_crypto_ctx(ctx);
+	bio_put(bio);
+#else
+	BUG();
+#endif
+}
+
+static inline bool ext4_bio_encrypted(struct bio *bio)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	return unlikely(bio->bi_private != NULL);
+#else
+	return false;
+#endif
+}
+
+/*
+ * I/O completion handler for multipage BIOs.
+ *
+ * The mpage code never puts partial pages into a BIO (except for end-of-file).
+ * If a page does not map to a contiguous run of blocks then it simply falls
+ * back to block_read_full_page().
+ *
+ * Why is this?  If a page's completion depends on a number of different BIOs
+ * which can complete in any order (or at the same time) then determining the
+ * status of that page is hard.  See end_buffer_async_read() for the details.
+ * There is no point in duplicating all that complexity.
+ */
+static void mpage_end_io(struct bio *bio, int err)
+{
+	struct bio_vec *bv;
+	int i;
+
+	if (ext4_bio_encrypted(bio)) {
+		struct ext4_crypto_ctx *ctx = bio->bi_private;
+
+		if (err) {
+			ext4_release_crypto_ctx(ctx);
+		} else {
+			INIT_WORK(&ctx->work, completion_pages);
+			ctx->bio = bio;
+			queue_work(ext4_read_workqueue, &ctx->work);
+			return;
+		}
+	}
+	bio_for_each_segment_all(bv, bio, i) {
+		struct page *page = bv->bv_page;
+
+		if (!err) {
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		unlock_page(page);
+	}
+
+	bio_put(bio);
+}
+
+int ext4_mpage_readpages(struct address_space *mapping,
+			 struct list_head *pages, struct page *page,
+			 unsigned nr_pages)
+{
+	struct bio *bio = NULL;
+	unsigned page_idx;
+	sector_t last_block_in_bio = 0;
+
+	struct inode *inode = mapping->host;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	sector_t block_in_file;
+	sector_t last_block;
+	sector_t last_block_in_file;
+	sector_t blocks[MAX_BUF_PER_PAGE];
+	unsigned page_block;
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	int length;
+	unsigned relative_block = 0;
+	struct ext4_map_blocks map;
+
+	map.m_pblk = 0;
+	map.m_lblk = 0;
+	map.m_len = 0;
+	map.m_flags = 0;
+
+	for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
+		int fully_mapped = 1;
+		unsigned first_hole = blocks_per_page;
+
+		prefetchw(&page->flags);
+		if (pages) {
+			page = list_entry(pages->prev, struct page, lru);
+			list_del(&page->lru);
+			if (add_to_page_cache_lru(page, mapping,
+						  page->index, GFP_KERNEL))
+				goto next_page;
+		}
+
+		if (page_has_buffers(page))
+			goto confused;
+
+		block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+		last_block = block_in_file + nr_pages * blocks_per_page;
+		last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+		if (last_block > last_block_in_file)
+			last_block = last_block_in_file;
+		page_block = 0;
+
+		/*
+		 * Map blocks using the previous result first.
+		 */
+		if ((map.m_flags & EXT4_MAP_MAPPED) &&
+		    block_in_file > map.m_lblk &&
+		    block_in_file < (map.m_lblk + map.m_len)) {
+			unsigned map_offset = block_in_file - map.m_lblk;
+			unsigned last = map.m_len - map_offset;
+
+			for (relative_block = 0; ; relative_block++) {
+				if (relative_block == last) {
+					/* needed? */
+					map.m_flags &= ~EXT4_MAP_MAPPED;
+					break;
+				}
+				if (page_block == blocks_per_page)
+					break;
+				blocks[page_block] = map.m_pblk + map_offset +
+					relative_block;
+				page_block++;
+				block_in_file++;
+			}
+		}
+
+		/*
+		 * Then do more ext4_map_blocks() calls until we are
+		 * done with this page.
+		 */
+		while (page_block < blocks_per_page) {
+			if (block_in_file < last_block) {
+				map.m_lblk = block_in_file;
+				map.m_len = last_block - block_in_file;
+
+				if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
+				set_error_page:
+					SetPageError(page);
+					zero_user_segment(page, 0,
+							  PAGE_CACHE_SIZE);
+					unlock_page(page);
+					goto next_page;
+				}
+			}
+			if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
+				fully_mapped = 0;
+				if (first_hole == blocks_per_page)
+					first_hole = page_block;
+				page_block++;
+				block_in_file++;
+				continue;
+			}
+			if (first_hole != blocks_per_page)
+				goto confused;		/* hole -> non-hole */
+
+			/* Contiguous blocks? */
+			if (page_block && blocks[page_block-1] != map.m_pblk-1)
+				goto confused;
+			for (relative_block = 0; ; relative_block++) {
+				if (relative_block == map.m_len) {
+					/* needed? */
+					map.m_flags &= ~EXT4_MAP_MAPPED;
+					break;
+				} else if (page_block == blocks_per_page)
+					break;
+				blocks[page_block] = map.m_pblk+relative_block;
+				page_block++;
+				block_in_file++;
+			}
+		}
+		if (first_hole != blocks_per_page) {
+			zero_user_segment(page, first_hole << blkbits,
+					  PAGE_CACHE_SIZE);
+			if (first_hole == 0) {
+				SetPageUptodate(page);
+				unlock_page(page);
+				goto next_page;
+			}
+		} else if (fully_mapped) {
+			SetPageMappedToDisk(page);
+		}
+		if (fully_mapped && blocks_per_page == 1 &&
+		    !PageUptodate(page) && cleancache_get_page(page) == 0) {
+			SetPageUptodate(page);
+			goto confused;
+		}
+
+		/*
+		 * This page will go to BIO.  Do we need to send this
+		 * BIO off first?
+		 */
+		if (bio && (last_block_in_bio != blocks[0] - 1)) {
+		submit_and_realloc:
+			submit_bio(READ, bio);
+			bio = NULL;
+		}
+		if (bio == NULL) {
+			struct ext4_crypto_ctx *ctx = NULL;
+
+			if (ext4_encrypted_inode(inode) &&
+			    S_ISREG(inode->i_mode)) {
+				ctx = ext4_get_crypto_ctx(inode);
+				if (IS_ERR(ctx))
+					goto set_error_page;
+			}
+			bio = bio_alloc(GFP_KERNEL,
+				min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+			if (!bio) {
+				if (ctx)
+					ext4_release_crypto_ctx(ctx);
+				goto set_error_page;
+			}
+			bio->bi_bdev = bdev;
+			bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+			bio->bi_end_io = mpage_end_io;
+			bio->bi_private = ctx;
+		}
+
+		length = first_hole << blkbits;
+		if (bio_add_page(bio, page, length, 0) < length)
+			goto submit_and_realloc;
+
+		if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
+		     (relative_block == map.m_len)) ||
+		    (first_hole != blocks_per_page)) {
+			submit_bio(READ, bio);
+			bio = NULL;
+		} else
+			last_block_in_bio = blocks[blocks_per_page - 1];
+		goto next_page;
+	confused:
+		if (bio) {
+			submit_bio(READ, bio);
+			bio = NULL;
+		}
+		if (!PageUptodate(page))
+			block_read_full_page(page, ext4_get_block);
+		else
+			unlock_page(page);
+	next_page:
+		if (pages)
+			page_cache_release(page);
+	}
+	BUG_ON(pages && !list_empty(pages));
+	if (bio)
+		submit_bio(READ, bio);
+	return 0;
+}
diff --git a/kernel/fs/ext4/resize.c b/kernel/fs/ext4/resize.c
new file mode 100644
index 000000000..cf0c47204
--- /dev/null
+++ b/kernel/fs/ext4/resize.c
@@ -0,0 +1,2024 @@
+/*
+ *  linux/fs/ext4/resize.c
+ *
+ * Support for resizing an ext4 filesystem while it is mounted.
+ *
+ * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
+ *
+ * This could probably be made into a module, because it is not often in use.
+ */
+
+
+#define EXT4FS_DEBUG
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+#include "ext4_jbd2.h"
+
+int ext4_resize_begin(struct super_block *sb)
+{
+	int ret = 0;
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	/*
+	 * If we are not using the primary superblock/GDT copy don't resize,
+         * because the user tools have no way of handling this.  Probably a
+         * bad time to do it anyways.
+         */
+	if (EXT4_SB(sb)->s_sbh->b_blocknr !=
+	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+		ext4_warning(sb, "won't resize using backup superblock at %llu",
+			(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+		return -EPERM;
+	}
+
+	/*
+	 * We are not allowed to do online-resizing on a filesystem mounted
+	 * with error, because it can destroy the filesystem easily.
+	 */
+	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+		ext4_warning(sb, "There are errors in the filesystem, "
+			     "so online resizing is not allowed\n");
+		return -EPERM;
+	}
+
+	if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
+		ret = -EBUSY;
+
+	return ret;
+}
+
+void ext4_resize_end(struct super_block *sb)
+{
+	clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
+	smp_mb__after_atomic();
+}
+
+static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
+					     ext4_group_t group) {
+	return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
+	       EXT4_DESC_PER_BLOCK_BITS(sb);
+}
+
+static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
+					     ext4_group_t group) {
+	group = ext4_meta_bg_first_group(sb, group);
+	return ext4_group_first_block_no(sb, group);
+}
+
+static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
+						ext4_group_t group) {
+	ext4_grpblk_t overhead;
+	overhead = ext4_bg_num_gdb(sb, group);
+	if (ext4_bg_has_super(sb, group))
+		overhead += 1 +
+			  le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+	return overhead;
+}
+
+#define outside(b, first, last)	((b) < (first) || (b) >= (last))
+#define inside(b, first, last)	((b) >= (first) && (b) < (last))
+
+static int verify_group_input(struct super_block *sb,
+			      struct ext4_new_group_data *input)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t start = ext4_blocks_count(es);
+	ext4_fsblk_t end = start + input->blocks_count;
+	ext4_group_t group = input->group;
+	ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
+	unsigned overhead;
+	ext4_fsblk_t metaend;
+	struct buffer_head *bh = NULL;
+	ext4_grpblk_t free_blocks_count, offset;
+	int err = -EINVAL;
+
+	if (group != sbi->s_groups_count) {
+		ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+			     input->group, sbi->s_groups_count);
+		return -EINVAL;
+	}
+
+	overhead = ext4_group_overhead_blocks(sb, group);
+	metaend = start + overhead;
+	input->free_blocks_count = free_blocks_count =
+		input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
+		       "(%d free, %u reserved)\n",
+		       ext4_bg_has_super(sb, input->group) ? "normal" :
+		       "no-super", input->group, input->blocks_count,
+		       free_blocks_count, input->reserved_blocks);
+
+	ext4_get_group_no_and_offset(sb, start, NULL, &offset);
+	if (offset != 0)
+			ext4_warning(sb, "Last group not full");
+	else if (input->reserved_blocks > input->blocks_count / 5)
+		ext4_warning(sb, "Reserved blocks too high (%u)",
+			     input->reserved_blocks);
+	else if (free_blocks_count < 0)
+		ext4_warning(sb, "Bad blocks count %u",
+			     input->blocks_count);
+	else if (!(bh = sb_bread(sb, end - 1)))
+		ext4_warning(sb, "Cannot read last block (%llu)",
+			     end - 1);
+	else if (outside(input->block_bitmap, start, end))
+		ext4_warning(sb, "Block bitmap not in group (block %llu)",
+			     (unsigned long long)input->block_bitmap);
+	else if (outside(input->inode_bitmap, start, end))
+		ext4_warning(sb, "Inode bitmap not in group (block %llu)",
+			     (unsigned long long)input->inode_bitmap);
+	else if (outside(input->inode_table, start, end) ||
+		 outside(itend - 1, start, end))
+		ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
+			     (unsigned long long)input->inode_table, itend - 1);
+	else if (input->inode_bitmap == input->block_bitmap)
+		ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
+			     (unsigned long long)input->block_bitmap);
+	else if (inside(input->block_bitmap, input->inode_table, itend))
+		ext4_warning(sb, "Block bitmap (%llu) in inode table "
+			     "(%llu-%llu)",
+			     (unsigned long long)input->block_bitmap,
+			     (unsigned long long)input->inode_table, itend - 1);
+	else if (inside(input->inode_bitmap, input->inode_table, itend))
+		ext4_warning(sb, "Inode bitmap (%llu) in inode table "
+			     "(%llu-%llu)",
+			     (unsigned long long)input->inode_bitmap,
+			     (unsigned long long)input->inode_table, itend - 1);
+	else if (inside(input->block_bitmap, start, metaend))
+		ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
+			     (unsigned long long)input->block_bitmap,
+			     start, metaend - 1);
+	else if (inside(input->inode_bitmap, start, metaend))
+		ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
+			     (unsigned long long)input->inode_bitmap,
+			     start, metaend - 1);
+	else if (inside(input->inode_table, start, metaend) ||
+		 inside(itend - 1, start, metaend))
+		ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
+			     "(%llu-%llu)",
+			     (unsigned long long)input->inode_table,
+			     itend - 1, start, metaend - 1);
+	else
+		err = 0;
+	brelse(bh);
+
+	return err;
+}
+
+/*
+ * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex
+ * group each time.
+ */
+struct ext4_new_flex_group_data {
+	struct ext4_new_group_data *groups;	/* new_group_data for groups
+						   in the flex group */
+	__u16 *bg_flags;			/* block group flags of groups
+						   in @groups */
+	ext4_group_t count;			/* number of groups in @groups
+						 */
+};
+
+/*
+ * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
+ * @flexbg_size.
+ *
+ * Returns NULL on failure otherwise address of the allocated structure.
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+	struct ext4_new_flex_group_data *flex_gd;
+
+	flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+	if (flex_gd == NULL)
+		goto out3;
+
+	if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+		goto out2;
+	flex_gd->count = flexbg_size;
+
+	flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
+				  flexbg_size, GFP_NOFS);
+	if (flex_gd->groups == NULL)
+		goto out2;
+
+	flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
+	if (flex_gd->bg_flags == NULL)
+		goto out1;
+
+	return flex_gd;
+
+out1:
+	kfree(flex_gd->groups);
+out2:
+	kfree(flex_gd);
+out3:
+	return NULL;
+}
+
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+	kfree(flex_gd->bg_flags);
+	kfree(flex_gd->groups);
+	kfree(flex_gd);
+}
+
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize.  Note that this function allocates
+ * group tables from the 1st group of groups contained by @flexgd, which may
+ * be a partial of a flex group.
+ *
+ * @sb: super block of fs to which the groups belongs
+ *
+ * Returns 0 on a successful allocation of the metadata blocks in the
+ * block group.
+ */
+static int ext4_alloc_group_tables(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd,
+				int flexbg_size)
+{
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	ext4_fsblk_t start_blk;
+	ext4_fsblk_t last_blk;
+	ext4_group_t src_group;
+	ext4_group_t bb_index = 0;
+	ext4_group_t ib_index = 0;
+	ext4_group_t it_index = 0;
+	ext4_group_t group;
+	ext4_group_t last_group;
+	unsigned overhead;
+	__u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+	src_group = group_data[0].group;
+	last_group  = src_group + flex_gd->count - 1;
+
+	BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+	       (last_group & ~(flexbg_size - 1))));
+next_group:
+	group = group_data[0].group;
+	if (src_group >= group_data[0].group + flex_gd->count)
+		return -ENOSPC;
+	start_blk = ext4_group_first_block_no(sb, src_group);
+	last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+	overhead = ext4_group_overhead_blocks(sb, src_group);
+
+	start_blk += overhead;
+
+	/* We collect contiguous blocks as much as possible. */
+	src_group++;
+	for (; src_group <= last_group; src_group++) {
+		overhead = ext4_group_overhead_blocks(sb, src_group);
+		if (overhead == 0)
+			last_blk += group_data[src_group - group].blocks_count;
+		else
+			break;
+	}
+
+	/* Allocate block bitmaps */
+	for (; bb_index < flex_gd->count; bb_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[bb_index].block_bitmap = start_blk++;
+		group = ext4_get_group_number(sb, start_blk - 1);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count--;
+		flex_gd->bg_flags[group] &= uninit_mask;
+	}
+
+	/* Allocate inode bitmaps */
+	for (; ib_index < flex_gd->count; ib_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[ib_index].inode_bitmap = start_blk++;
+		group = ext4_get_group_number(sb, start_blk - 1);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count--;
+		flex_gd->bg_flags[group] &= uninit_mask;
+	}
+
+	/* Allocate inode tables */
+	for (; it_index < flex_gd->count; it_index++) {
+		unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+		ext4_fsblk_t next_group_start;
+
+		if (start_blk + itb > last_blk)
+			goto next_group;
+		group_data[it_index].inode_table = start_blk;
+		group = ext4_get_group_number(sb, start_blk);
+		next_group_start = ext4_group_first_block_no(sb, group + 1);
+		group -= group_data[0].group;
+
+		if (start_blk + itb > next_group_start) {
+			flex_gd->bg_flags[group + 1] &= uninit_mask;
+			overhead = start_blk + itb - next_group_start;
+			group_data[group + 1].free_blocks_count -= overhead;
+			itb -= overhead;
+		}
+
+		group_data[group].free_blocks_count -= itb;
+		flex_gd->bg_flags[group] &= uninit_mask;
+		start_blk += EXT4_SB(sb)->s_itb_per_group;
+	}
+
+	if (test_opt(sb, DEBUG)) {
+		int i;
+		group = group_data[0].group;
+
+		printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+		       "%d groups, flexbg size is %d:\n", flex_gd->count,
+		       flexbg_size);
+
+		for (i = 0; i < flex_gd->count; i++) {
+			printk(KERN_DEBUG "adding %s group %u: %u "
+			       "blocks (%d free)\n",
+			       ext4_bg_has_super(sb, group + i) ? "normal" :
+			       "no-super", group + i,
+			       group_data[i].blocks_count,
+			       group_data[i].free_blocks_count);
+		}
+	}
+	return 0;
+}
+
+static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
+				  ext4_fsblk_t blk)
+{
+	struct buffer_head *bh;
+	int err;
+
+	bh = sb_getblk(sb, blk);
+	if (unlikely(!bh))
+		return ERR_PTR(-ENOMEM);
+	BUFFER_TRACE(bh, "get_write_access");
+	if ((err = ext4_journal_get_write_access(handle, bh))) {
+		brelse(bh);
+		bh = ERR_PTR(err);
+	} else {
+		memset(bh->b_data, 0, sb->s_blocksize);
+		set_buffer_uptodate(bh);
+	}
+
+	return bh;
+}
+
+/*
+ * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for block_bitmap modifications.
+ */
+static int extend_or_restart_transaction(handle_t *handle, int thresh)
+{
+	int err;
+
+	if (ext4_handle_has_enough_credits(handle, thresh))
+		return 0;
+
+	err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
+	if (err < 0)
+		return err;
+	if (err) {
+		err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ *
+ * Helper function for ext4_setup_new_group_blocks() which set .
+ *
+ * @sb: super block
+ * @handle: journal handle
+ * @flex_gd: flex group data
+ */
+static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
+			struct ext4_new_flex_group_data *flex_gd,
+			ext4_fsblk_t block, ext4_group_t count)
+{
+	ext4_group_t count2;
+
+	ext4_debug("mark blocks [%llu/%u] used\n", block, count);
+	for (count2 = count; count > 0; count -= count2, block += count2) {
+		ext4_fsblk_t start;
+		struct buffer_head *bh;
+		ext4_group_t group;
+		int err;
+
+		group = ext4_get_group_number(sb, block);
+		start = ext4_group_first_block_no(sb, group);
+		group -= flex_gd->groups[0].group;
+
+		count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
+		if (count2 > count)
+			count2 = count;
+
+		if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
+			BUG_ON(flex_gd->count > 1);
+			continue;
+		}
+
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			return err;
+
+		bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
+		if (unlikely(!bh))
+			return -ENOMEM;
+
+		BUFFER_TRACE(bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err)
+			return err;
+		ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
+			   block - start, count2);
+		ext4_set_bits(bh->b_data, block - start, count2);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (unlikely(err))
+			return err;
+		brelse(bh);
+	}
+
+	return 0;
+}
+
+/*
+ * Set up the block and inode bitmaps, and the inode table for the new groups.
+ * This doesn't need to be part of the main transaction, since we are only
+ * changing blocks outside the actual filesystem.  We still do journaling to
+ * ensure the recovery is correct in case of a failure just after resize.
+ * If any part of this fails, we simply abort the resize.
+ *
+ * setup_new_flex_group_blocks handles a flex group as follow:
+ *  1. copy super block and GDT, and initialize group tables if necessary.
+ *     In this step, we only set bits in blocks bitmaps for blocks taken by
+ *     super block and GDT.
+ *  2. allocate group tables in block bitmaps, that is, set bits in block
+ *     bitmap for blocks taken by group tables.
+ */
+static int setup_new_flex_group_blocks(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
+	ext4_fsblk_t start;
+	ext4_fsblk_t block;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	__u16 *bg_flags = flex_gd->bg_flags;
+	handle_t *handle;
+	ext4_group_t group, count;
+	struct buffer_head *bh = NULL;
+	int reserved_gdb, i, j, err = 0, err2;
+	int meta_bg;
+
+	BUG_ON(!flex_gd->count || !group_data ||
+	       group_data[0].group != sbi->s_groups_count);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+	meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+
+	/* This transaction may be extended/restarted along the way */
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	group = group_data[0].group;
+	for (i = 0; i < flex_gd->count; i++, group++) {
+		unsigned long gdblocks;
+		ext4_grpblk_t overhead;
+
+		gdblocks = ext4_bg_num_gdb(sb, group);
+		start = ext4_group_first_block_no(sb, group);
+
+		if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
+			goto handle_itb;
+
+		if (meta_bg == 1) {
+			ext4_group_t first_group;
+			first_group = ext4_meta_bg_first_group(sb, group);
+			if (first_group != group + 1 &&
+			    first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
+				goto handle_itb;
+		}
+
+		block = start + ext4_bg_has_super(sb, group);
+		/* Copy all of the GDT blocks into the backup in this group */
+		for (j = 0; j < gdblocks; j++, block++) {
+			struct buffer_head *gdb;
+
+			ext4_debug("update backup group %#04llx\n", block);
+			err = extend_or_restart_transaction(handle, 1);
+			if (err)
+				goto out;
+
+			gdb = sb_getblk(sb, block);
+			if (unlikely(!gdb)) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			BUFFER_TRACE(gdb, "get_write_access");
+			err = ext4_journal_get_write_access(handle, gdb);
+			if (err) {
+				brelse(gdb);
+				goto out;
+			}
+			memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
+			       gdb->b_size);
+			set_buffer_uptodate(gdb);
+
+			err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+			if (unlikely(err)) {
+				brelse(gdb);
+				goto out;
+			}
+			brelse(gdb);
+		}
+
+		/* Zero out all of the reserved backup group descriptor
+		 * table blocks
+		 */
+		if (ext4_bg_has_super(sb, group)) {
+			err = sb_issue_zeroout(sb, gdblocks + start + 1,
+					reserved_gdb, GFP_NOFS);
+			if (err)
+				goto out;
+		}
+
+handle_itb:
+		/* Initialize group tables of the grop @group */
+		if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
+			goto handle_bb;
+
+		/* Zero out all of the inode table blocks */
+		block = group_data[i].inode_table;
+		ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
+			   block, sbi->s_itb_per_group);
+		err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+				       GFP_NOFS);
+		if (err)
+			goto out;
+
+handle_bb:
+		if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
+			goto handle_ib;
+
+		/* Initialize block bitmap of the @group */
+		block = group_data[i].block_bitmap;
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			goto out;
+
+		bh = bclean(handle, sb, block);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			bh = NULL;
+			goto out;
+		}
+		overhead = ext4_group_overhead_blocks(sb, group);
+		if (overhead != 0) {
+			ext4_debug("mark backup superblock %#04llx (+0)\n",
+				   start);
+			ext4_set_bits(bh->b_data, 0, overhead);
+		}
+		ext4_mark_bitmap_end(group_data[i].blocks_count,
+				     sb->s_blocksize * 8, bh->b_data);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out;
+		brelse(bh);
+
+handle_ib:
+		if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
+			continue;
+
+		/* Initialize inode bitmap of the @group */
+		block = group_data[i].inode_bitmap;
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			goto out;
+		/* Mark unused entries in inode bitmap used */
+		bh = bclean(handle, sb, block);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			bh = NULL;
+			goto out;
+		}
+
+		ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+				     sb->s_blocksize * 8, bh->b_data);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out;
+		brelse(bh);
+	}
+	bh = NULL;
+
+	/* Mark group tables in block bitmap */
+	for (j = 0; j < GROUP_TABLE_COUNT; j++) {
+		count = group_table_count[j];
+		start = (&group_data[0].block_bitmap)[j];
+		block = start;
+		for (i = 1; i < flex_gd->count; i++) {
+			block += group_table_count[j];
+			if (block == (&group_data[i].block_bitmap)[j]) {
+				count += group_table_count[j];
+				continue;
+			}
+			err = set_flexbg_block_bitmap(sb, handle,
+						flex_gd, start, count);
+			if (err)
+				goto out;
+			count = group_table_count[j];
+			start = (&group_data[i].block_bitmap)[j];
+			block = start;
+		}
+
+		if (count) {
+			err = set_flexbg_block_bitmap(sb, handle,
+						flex_gd, start, count);
+			if (err)
+				goto out;
+		}
+	}
+
+out:
+	brelse(bh);
+	err2 = ext4_journal_stop(handle);
+	if (err2 && !err)
+		err = err2;
+
+	return err;
+}
+
+/*
+ * Iterate through the groups which hold BACKUP superblock/GDT copies in an
+ * ext4 filesystem.  The counters should be initialized to 1, 5, and 7 before
+ * calling this for the first time.  In a sparse filesystem it will be the
+ * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
+ * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
+ */
+static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
+				  unsigned *five, unsigned *seven)
+{
+	unsigned *min = three;
+	int mult = 3;
+	unsigned ret;
+
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+		ret = *min;
+		*min += 1;
+		return ret;
+	}
+
+	if (*five < *min) {
+		min = five;
+		mult = 5;
+	}
+	if (*seven < *min) {
+		min = seven;
+		mult = 7;
+	}
+
+	ret = *min;
+	*min *= mult;
+
+	return ret;
+}
+
+/*
+ * Check that all of the backup GDT blocks are held in the primary GDT block.
+ * It is assumed that they are stored in group order.  Returns the number of
+ * groups in current filesystem that have BACKUPS, or -ve error code.
+ */
+static int verify_reserved_gdb(struct super_block *sb,
+			       ext4_group_t end,
+			       struct buffer_head *primary)
+{
+	const ext4_fsblk_t blk = primary->b_blocknr;
+	unsigned three = 1;
+	unsigned five = 5;
+	unsigned seven = 7;
+	unsigned grp;
+	__le32 *p = (__le32 *)primary->b_data;
+	int gdbackups = 0;
+
+	while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
+		if (le32_to_cpu(*p++) !=
+		    grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
+			ext4_warning(sb, "reserved GDT %llu"
+				     " missing grp %d (%llu)",
+				     blk, grp,
+				     grp *
+				     (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
+				     blk);
+			return -EINVAL;
+		}
+		if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb))
+			return -EFBIG;
+	}
+
+	return gdbackups;
+}
+
+/*
+ * Called when we need to bring a reserved group descriptor table block into
+ * use from the resize inode.  The primary copy of the new GDT block currently
+ * is an indirect block (under the double indirect block in the resize inode).
+ * The new backup GDT blocks will be stored as leaf blocks in this indirect
+ * block, in group order.  Even though we know all the block numbers we need,
+ * we check to ensure that the resize inode has actually reserved these blocks.
+ *
+ * Don't need to update the block bitmaps because the blocks are still in use.
+ *
+ * We get all of the error cases out of the way, so that we are sure to not
+ * fail once we start modifying the data on disk, because JBD has no rollback.
+ */
+static int add_new_gdb(handle_t *handle, struct inode *inode,
+		       ext4_group_t group)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+	ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	struct buffer_head **o_group_desc, **n_group_desc;
+	struct buffer_head *dind;
+	struct buffer_head *gdb_bh;
+	int gdbackups;
+	struct ext4_iloc iloc;
+	__le32 *data;
+	int err;
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG
+		       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
+		       gdb_num);
+
+	gdb_bh = sb_bread(sb, gdblock);
+	if (!gdb_bh)
+		return -EIO;
+
+	gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
+	if (gdbackups < 0) {
+		err = gdbackups;
+		goto exit_bh;
+	}
+
+	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
+	dind = sb_bread(sb, le32_to_cpu(*data));
+	if (!dind) {
+		err = -EIO;
+		goto exit_bh;
+	}
+
+	data = (__le32 *)dind->b_data;
+	if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
+		ext4_warning(sb, "new group %u GDT block %llu not reserved",
+			     group, gdblock);
+		err = -EINVAL;
+		goto exit_dind;
+	}
+
+	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (unlikely(err))
+		goto exit_dind;
+
+	BUFFER_TRACE(gdb_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gdb_bh);
+	if (unlikely(err))
+		goto exit_dind;
+
+	BUFFER_TRACE(dind, "get_write_access");
+	err = ext4_journal_get_write_access(handle, dind);
+	if (unlikely(err))
+		ext4_std_error(sb, err);
+
+	/* ext4_reserve_inode_write() gets a reference on the iloc */
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (unlikely(err))
+		goto exit_dind;
+
+	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+				     sizeof(struct buffer_head *),
+				     GFP_NOFS);
+	if (!n_group_desc) {
+		err = -ENOMEM;
+		ext4_warning(sb, "not enough memory for %lu groups",
+			     gdb_num + 1);
+		goto exit_inode;
+	}
+
+	/*
+	 * Finally, we have all of the possible failures behind us...
+	 *
+	 * Remove new GDT block from inode double-indirect block and clear out
+	 * the new GDT block for use (which also "frees" the backup GDT blocks
+	 * from the reserved inode).  We don't need to change the bitmaps for
+	 * these blocks, because they are marked as in-use from being in the
+	 * reserved inode, and will become GDT blocks (primary and backup).
+	 */
+	data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
+	err = ext4_handle_dirty_metadata(handle, NULL, dind);
+	if (unlikely(err)) {
+		ext4_std_error(sb, err);
+		goto exit_inode;
+	}
+	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
+	ext4_mark_iloc_dirty(handle, inode, &iloc);
+	memset(gdb_bh->b_data, 0, sb->s_blocksize);
+	err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+	if (unlikely(err)) {
+		ext4_std_error(sb, err);
+		goto exit_inode;
+	}
+	brelse(dind);
+
+	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	memcpy(n_group_desc, o_group_desc,
+	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	n_group_desc[gdb_num] = gdb_bh;
+	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	EXT4_SB(sb)->s_gdb_count++;
+	kvfree(o_group_desc);
+
+	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
+	err = ext4_handle_dirty_super(handle, sb);
+	if (err)
+		ext4_std_error(sb, err);
+
+	return err;
+
+exit_inode:
+	kvfree(n_group_desc);
+	brelse(iloc.bh);
+exit_dind:
+	brelse(dind);
+exit_bh:
+	brelse(gdb_bh);
+
+	ext4_debug("leaving with error %d\n", err);
+	return err;
+}
+
+/*
+ * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ */
+static int add_new_gdb_meta_bg(struct super_block *sb,
+			       handle_t *handle, ext4_group_t group) {
+	ext4_fsblk_t gdblock;
+	struct buffer_head *gdb_bh;
+	struct buffer_head **o_group_desc, **n_group_desc;
+	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+	int err;
+
+	gdblock = ext4_meta_bg_first_block_no(sb, group) +
+		   ext4_bg_has_super(sb, group);
+	gdb_bh = sb_bread(sb, gdblock);
+	if (!gdb_bh)
+		return -EIO;
+	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+				     sizeof(struct buffer_head *),
+				     GFP_NOFS);
+	if (!n_group_desc) {
+		err = -ENOMEM;
+		ext4_warning(sb, "not enough memory for %lu groups",
+			     gdb_num + 1);
+		return err;
+	}
+
+	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	memcpy(n_group_desc, o_group_desc,
+	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	n_group_desc[gdb_num] = gdb_bh;
+	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	EXT4_SB(sb)->s_gdb_count++;
+	kvfree(o_group_desc);
+	BUFFER_TRACE(gdb_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gdb_bh);
+	if (unlikely(err))
+		brelse(gdb_bh);
+	return err;
+}
+
+/*
+ * Called when we are adding a new group which has a backup copy of each of
+ * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
+ * We need to add these reserved backup GDT blocks to the resize inode, so
+ * that they are kept for future resizing and not allocated to files.
+ *
+ * Each reserved backup GDT block will go into a different indirect block.
+ * The indirect blocks are actually the primary reserved GDT blocks,
+ * so we know in advance what their block numbers are.  We only get the
+ * double-indirect block to verify it is pointing to the primary reserved
+ * GDT blocks so we don't overwrite a data block by accident.  The reserved
+ * backup GDT blocks are stored in their reserved primary GDT block.
+ */
+static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
+			      ext4_group_t group)
+{
+	struct super_block *sb = inode->i_sb;
+	int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+	struct buffer_head **primary;
+	struct buffer_head *dind;
+	struct ext4_iloc iloc;
+	ext4_fsblk_t blk;
+	__le32 *data, *end;
+	int gdbackups = 0;
+	int res, i;
+	int err;
+
+	primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
+	if (!primary)
+		return -ENOMEM;
+
+	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
+	dind = sb_bread(sb, le32_to_cpu(*data));
+	if (!dind) {
+		err = -EIO;
+		goto exit_free;
+	}
+
+	blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
+	data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count %
+					 EXT4_ADDR_PER_BLOCK(sb));
+	end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
+
+	/* Get each reserved primary GDT block and verify it holds backups */
+	for (res = 0; res < reserved_gdb; res++, blk++) {
+		if (le32_to_cpu(*data) != blk) {
+			ext4_warning(sb, "reserved block %llu"
+				     " not at offset %ld",
+				     blk,
+				     (long)(data - (__le32 *)dind->b_data));
+			err = -EINVAL;
+			goto exit_bh;
+		}
+		primary[res] = sb_bread(sb, blk);
+		if (!primary[res]) {
+			err = -EIO;
+			goto exit_bh;
+		}
+		gdbackups = verify_reserved_gdb(sb, group, primary[res]);
+		if (gdbackups < 0) {
+			brelse(primary[res]);
+			err = gdbackups;
+			goto exit_bh;
+		}
+		if (++data >= end)
+			data = (__le32 *)dind->b_data;
+	}
+
+	for (i = 0; i < reserved_gdb; i++) {
+		BUFFER_TRACE(primary[i], "get_write_access");
+		if ((err = ext4_journal_get_write_access(handle, primary[i])))
+			goto exit_bh;
+	}
+
+	if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+		goto exit_bh;
+
+	/*
+	 * Finally we can add each of the reserved backup GDT blocks from
+	 * the new group to its reserved primary GDT block.
+	 */
+	blk = group * EXT4_BLOCKS_PER_GROUP(sb);
+	for (i = 0; i < reserved_gdb; i++) {
+		int err2;
+		data = (__le32 *)primary[i]->b_data;
+		/* printk("reserving backup %lu[%u] = %lu\n",
+		       primary[i]->b_blocknr, gdbackups,
+		       blk + primary[i]->b_blocknr); */
+		data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
+		err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
+		if (!err)
+			err = err2;
+	}
+	inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
+	ext4_mark_iloc_dirty(handle, inode, &iloc);
+
+exit_bh:
+	while (--res >= 0)
+		brelse(primary[res]);
+	brelse(dind);
+
+exit_free:
+	kfree(primary);
+
+	return err;
+}
+
+/*
+ * Update the backup copies of the ext4 metadata.  These don't need to be part
+ * of the main resize transaction, because e2fsck will re-write them if there
+ * is a problem (basically only OOM will cause a problem).  However, we
+ * _should_ update the backups if possible, in case the primary gets trashed
+ * for some reason and we need to run e2fsck from a backup superblock.  The
+ * important part is that the new block and inode counts are in the backup
+ * superblocks, and the location of the new group metadata in the GDT backups.
+ *
+ * We do not need take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
+ */
+static void update_backups(struct super_block *sb, int blk_off, char *data,
+			   int size, int meta_bg)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t last;
+	const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
+	unsigned three = 1;
+	unsigned five = 5;
+	unsigned seven = 7;
+	ext4_group_t group = 0;
+	int rest = sb->s_blocksize - size;
+	handle_t *handle;
+	int err = 0, err2;
+
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
+	if (IS_ERR(handle)) {
+		group = 1;
+		err = PTR_ERR(handle);
+		goto exit_err;
+	}
+
+	if (meta_bg == 0) {
+		group = ext4_list_backups(sb, &three, &five, &seven);
+		last = sbi->s_groups_count;
+	} else {
+		group = ext4_meta_bg_first_group(sb, group) + 1;
+		last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
+	}
+
+	while (group < sbi->s_groups_count) {
+		struct buffer_head *bh;
+		ext4_fsblk_t backup_block;
+
+		/* Out of journal space, and can't get more - abort - so sad */
+		if (ext4_handle_valid(handle) &&
+		    handle->h_buffer_credits == 0 &&
+		    ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
+		    (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+			break;
+
+		if (meta_bg == 0)
+			backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
+		else
+			backup_block = (ext4_group_first_block_no(sb, group) +
+					ext4_bg_has_super(sb, group));
+
+		bh = sb_getblk(sb, backup_block);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			break;
+		}
+		ext4_debug("update metadata backup %llu(+%llu)\n",
+			   backup_block, backup_block -
+			   ext4_group_first_block_no(sb, group));
+		BUFFER_TRACE(bh, "get_write_access");
+		if ((err = ext4_journal_get_write_access(handle, bh)))
+			break;
+		lock_buffer(bh);
+		memcpy(bh->b_data, data, size);
+		if (rest)
+			memset(bh->b_data + size, 0, rest);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (unlikely(err))
+			ext4_std_error(sb, err);
+		brelse(bh);
+
+		if (meta_bg == 0)
+			group = ext4_list_backups(sb, &three, &five, &seven);
+		else if (group == last)
+			break;
+		else
+			group = last;
+	}
+	if ((err2 = ext4_journal_stop(handle)) && !err)
+		err = err2;
+
+	/*
+	 * Ugh! Need to have e2fsck write the backup copies.  It is too
+	 * late to revert the resize, we shouldn't fail just because of
+	 * the backup copies (they are only needed in case of corruption).
+	 *
+	 * However, if we got here we have a journal problem too, so we
+	 * can't really start a transaction to mark the superblock.
+	 * Chicken out and just set the flag on the hope it will be written
+	 * to disk, and if not - we will simply wait until next fsck.
+	 */
+exit_err:
+	if (err) {
+		ext4_warning(sb, "can't update backup for group %u (err %d), "
+			     "forcing fsck on next reboot", group, err);
+		sbi->s_mount_state &= ~EXT4_VALID_FS;
+		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
+		mark_buffer_dirty(sbi->s_sbh);
+	}
+}
+
+/*
+ * ext4_add_new_descs() adds @count group descriptor of groups
+ * starting at @group
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
+ */
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+			      ext4_group_t group, struct inode *resize_inode,
+			      ext4_group_t count)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head *gdb_bh;
+	int i, gdb_off, gdb_num, err = 0;
+	int meta_bg;
+
+	meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+	for (i = 0; i < count; i++, group++) {
+		int reserved_gdb = ext4_bg_has_super(sb, group) ?
+			le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * We will only either add reserved group blocks to a backup group
+		 * or remove reserved blocks for the first group in a new group block.
+		 * Doing both would be mean more complex code, and sane people don't
+		 * use non-sparse filesystems anymore.  This is already checked above.
+		 */
+		if (gdb_off) {
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			BUFFER_TRACE(gdb_bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle, gdb_bh);
+
+			if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
+				err = reserve_backup_gdb(handle, resize_inode, group);
+		} else if (meta_bg != 0) {
+			err = add_new_gdb_meta_bg(sb, handle, group);
+		} else {
+			err = add_new_gdb(handle, resize_inode, group);
+		}
+		if (err)
+			break;
+	}
+	return err;
+}
+
+static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
+{
+	struct buffer_head *bh = sb_getblk(sb, block);
+	if (unlikely(!bh))
+		return NULL;
+	if (!bh_uptodate_or_lock(bh)) {
+		if (bh_submit_read(bh) < 0) {
+			brelse(bh);
+			return NULL;
+		}
+	}
+
+	return bh;
+}
+
+static int ext4_set_bitmap_checksums(struct super_block *sb,
+				     ext4_group_t group,
+				     struct ext4_group_desc *gdp,
+				     struct ext4_new_group_data *group_data)
+{
+	struct buffer_head *bh;
+
+	if (!ext4_has_metadata_csum(sb))
+		return 0;
+
+	bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
+	if (!bh)
+		return -EIO;
+	ext4_inode_bitmap_csum_set(sb, group, gdp, bh,
+				   EXT4_INODES_PER_GROUP(sb) / 8);
+	brelse(bh);
+
+	bh = ext4_get_bitmap(sb, group_data->block_bitmap);
+	if (!bh)
+		return -EIO;
+	ext4_block_bitmap_csum_set(sb, group, gdp, bh);
+	brelse(bh);
+
+	return 0;
+}
+
+/*
+ * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_new_group_data	*group_data = flex_gd->groups;
+	struct ext4_group_desc		*gdp;
+	struct ext4_sb_info		*sbi = EXT4_SB(sb);
+	struct buffer_head		*gdb_bh;
+	ext4_group_t			group;
+	__u16				*bg_flags = flex_gd->bg_flags;
+	int				i, gdb_off, gdb_num, err = 0;
+	
+
+	for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
+		group = group_data->group;
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
+		 */
+		gdb_bh = sbi->s_group_desc[gdb_num];
+		/* Update group descriptor block for new group */
+		gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
+						 gdb_off * EXT4_DESC_SIZE(sb));
+
+		memset(gdp, 0, EXT4_DESC_SIZE(sb));
+		ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
+		ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+		err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
+		if (err) {
+			ext4_std_error(sb, err);
+			break;
+		}
+
+		ext4_inode_table_set(sb, gdp, group_data->inode_table);
+		ext4_free_group_clusters_set(sb, gdp,
+			EXT4_NUM_B2C(sbi, group_data->free_blocks_count));
+		ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+		if (ext4_has_group_desc_csum(sb))
+			ext4_itable_unused_set(sb, gdp,
+					       EXT4_INODES_PER_GROUP(sb));
+		gdp->bg_flags = cpu_to_le16(*bg_flags);
+		ext4_group_desc_csum_set(sb, group, gdp);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+		if (unlikely(err)) {
+			ext4_std_error(sb, err);
+			break;
+		}
+
+		/*
+		 * We can allocate memory for mb_alloc based on the new group
+		 * descriptor
+		 */
+		err = ext4_mb_add_groupinfo(sb, group, gdp);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/*
+ * ext4_update_super() updates the super block so that the newly added
+ * groups can be seen by the filesystem.
+ *
+ * @sb: super block
+ * @flex_gd: new added groups
+ */
+static void ext4_update_super(struct super_block *sb,
+			     struct ext4_new_flex_group_data *flex_gd)
+{
+	ext4_fsblk_t blocks_count = 0;
+	ext4_fsblk_t free_blocks = 0;
+	ext4_fsblk_t reserved_blocks = 0;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int i;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+	/*
+	 * Make the new blocks and inodes valid next.  We do this before
+	 * increasing the group count so that once the group is enabled,
+	 * all of its blocks and inodes are already valid.
+	 *
+	 * We always allocate group-by-group, then block-by-block or
+	 * inode-by-inode within a group, so enabling these
+	 * blocks/inodes before the group is live won't actually let us
+	 * allocate the new space yet.
+	 */
+	for (i = 0; i < flex_gd->count; i++) {
+		blocks_count += group_data[i].blocks_count;
+		free_blocks += group_data[i].free_blocks_count;
+	}
+
+	reserved_blocks = ext4_r_blocks_count(es) * 100;
+	reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
+	reserved_blocks *= blocks_count;
+	do_div(reserved_blocks, 100);
+
+	ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
+	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
+	le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
+
+	ext4_debug("free blocks count %llu", ext4_free_blocks_count(es));
+	/*
+	 * We need to protect s_groups_count against other CPUs seeing
+	 * inconsistent state in the superblock.
+	 *
+	 * The precise rules we use are:
+	 *
+	 * * Writers must perform a smp_wmb() after updating all
+	 *   dependent data and before modifying the groups count
+	 *
+	 * * Readers must perform an smp_rmb() after reading the groups
+	 *   count and before reading any dependent data.
+	 *
+	 * NB. These rules can be relaxed when checking the group count
+	 * while freeing data, as we can only allocate from a block
+	 * group after serialising against the group count, and we can
+	 * only then free after serialising in turn against that
+	 * allocation.
+	 */
+	smp_wmb();
+
+	/* Update the global fs size fields */
+	sbi->s_groups_count += flex_gd->count;
+	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+
+	/* Update the reserved block counts only once the new group is
+	 * active. */
+	ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+				reserved_blocks);
+
+	/* Update the free space counts */
+	percpu_counter_add(&sbi->s_freeclusters_counter,
+			   EXT4_NUM_B2C(sbi, free_blocks));
+	percpu_counter_add(&sbi->s_freeinodes_counter,
+			   EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+
+	ext4_debug("free blocks count %llu",
+		   percpu_counter_read(&sbi->s_freeclusters_counter));
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+				      EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, group_data[0].group);
+		atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+	}
+
+	/*
+	 * Update the fs overhead information
+	 */
+	ext4_calculate_overhead(sb);
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: added group %u:"
+		       "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
+		       blocks_count, free_blocks, reserved_blocks);
+}
+
+/* Add a flex group to an fs. Ensure we handle all possible error conditions
+ * _before_ we start modifying the filesystem, because we cannot abort the
+ * transaction and not have it write the data to disk.
+ */
+static int ext4_flex_group_add(struct super_block *sb,
+			       struct inode *resize_inode,
+			       struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t o_blocks_count;
+	ext4_grpblk_t last;
+	ext4_group_t group;
+	handle_t *handle;
+	unsigned reserved_gdb;
+	int err = 0, err2 = 0, credit;
+
+	BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+	o_blocks_count = ext4_blocks_count(es);
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	BUG_ON(last);
+
+	err = setup_new_flex_group_blocks(sb, flex_gd);
+	if (err)
+		goto exit;
+	/*
+	 * We will always be modifying at least the superblock and  GDT
+	 * blocks.  If we are adding a group past the last current GDT block,
+	 * we will also modify the inode and the dindirect block.  If we
+	 * are adding a group with superblock/GDT backups  we will also
+	 * modify each of the reserved GDT dindirect blocks.
+	 */
+	credit = 3;	/* sb, resize inode, resize inode dindirect */
+	/* GDT blocks */
+	credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb));
+	credit += reserved_gdb;	/* Reserved GDT dindirect blocks */
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit;
+	}
+
+	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+	if (err)
+		goto exit_journal;
+
+	group = flex_gd->groups[0].group;
+	BUG_ON(group != EXT4_SB(sb)->s_groups_count);
+	err = ext4_add_new_descs(handle, sb, group,
+				resize_inode, flex_gd->count);
+	if (err)
+		goto exit_journal;
+
+	err = ext4_setup_new_descs(handle, sb, flex_gd);
+	if (err)
+		goto exit_journal;
+
+	ext4_update_super(sb, flex_gd);
+
+	err = ext4_handle_dirty_super(handle, sb);
+
+exit_journal:
+	err2 = ext4_journal_stop(handle);
+	if (!err)
+		err = err2;
+
+	if (!err) {
+		int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+		int gdb_num_end = ((group + flex_gd->count - 1) /
+				   EXT4_DESC_PER_BLOCK(sb));
+		int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
+				EXT4_FEATURE_INCOMPAT_META_BG);
+		sector_t old_gdb = 0;
+
+		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext4_super_block), 0);
+		for (; gdb_num <= gdb_num_end; gdb_num++) {
+			struct buffer_head *gdb_bh;
+
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			if (old_gdb == gdb_bh->b_blocknr)
+				continue;
+			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
+				       gdb_bh->b_size, meta_bg);
+			old_gdb = gdb_bh->b_blocknr;
+		}
+	}
+exit:
+	return err;
+}
+
+static int ext4_setup_next_flex_gd(struct super_block *sb,
+				    struct ext4_new_flex_group_data *flex_gd,
+				    ext4_fsblk_t n_blocks_count,
+				    unsigned long flexbg_size)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	ext4_fsblk_t o_blocks_count;
+	ext4_group_t n_group;
+	ext4_group_t group;
+	ext4_group_t last_group;
+	ext4_grpblk_t last;
+	ext4_grpblk_t blocks_per_group;
+	unsigned long i;
+
+	blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+
+	o_blocks_count = ext4_blocks_count(es);
+
+	if (o_blocks_count == n_blocks_count)
+		return 0;
+
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	BUG_ON(last);
+	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
+
+	last_group = group | (flexbg_size - 1);
+	if (last_group > n_group)
+		last_group = n_group;
+
+	flex_gd->count = last_group - group + 1;
+
+	for (i = 0; i < flex_gd->count; i++) {
+		int overhead;
+
+		group_data[i].group = group + i;
+		group_data[i].blocks_count = blocks_per_group;
+		overhead = ext4_group_overhead_blocks(sb, group + i);
+		group_data[i].free_blocks_count = blocks_per_group - overhead;
+		if (ext4_has_group_desc_csum(sb)) {
+			flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
+					       EXT4_BG_INODE_UNINIT;
+			if (!test_opt(sb, INIT_INODE_TABLE))
+				flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED;
+		} else
+			flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
+	}
+
+	if (last_group == n_group && ext4_has_group_desc_csum(sb))
+		/* We need to initialize block bitmap of last group. */
+		flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
+
+	if ((last_group == n_group) && (last != blocks_per_group - 1)) {
+		group_data[i - 1].blocks_count = last + 1;
+		group_data[i - 1].free_blocks_count -= blocks_per_group-
+					last - 1;
+	}
+
+	return 1;
+}
+
+/* Add group descriptor data to an existing or new group descriptor block.
+ * Ensure we handle all possible error conditions _before_ we start modifying
+ * the filesystem, because we cannot abort the transaction and not have it
+ * write the data to disk.
+ *
+ * If we are on a GDT block boundary, we need to get the reserved GDT block.
+ * Otherwise, we may need to add backup GDT blocks for a sparse group.
+ *
+ * We only need to hold the superblock lock while we are actually adding
+ * in the new group's counts to the superblock.  Prior to that we have
+ * not really "added" the group at all.  We re-check that we are still
+ * adding in the last group in case things have changed since verifying.
+ */
+int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
+{
+	struct ext4_new_flex_group_data flex_gd;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
+		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+	struct inode *inode = NULL;
+	int gdb_off;
+	int err;
+	__u16 bg_flags = 0;
+
+	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
+
+	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+		ext4_warning(sb, "Can't resize non-sparse filesystem further");
+		return -EPERM;
+	}
+
+	if (ext4_blocks_count(es) + input->blocks_count <
+	    ext4_blocks_count(es)) {
+		ext4_warning(sb, "blocks_count overflow");
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
+	    le32_to_cpu(es->s_inodes_count)) {
+		ext4_warning(sb, "inodes_count overflow");
+		return -EINVAL;
+	}
+
+	if (reserved_gdb || gdb_off == 0) {
+		if (!EXT4_HAS_COMPAT_FEATURE(sb,
+					     EXT4_FEATURE_COMPAT_RESIZE_INODE)
+		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
+			ext4_warning(sb,
+				     "No reserved GDT blocks, can't resize");
+			return -EPERM;
+		}
+		inode = ext4_iget(sb, EXT4_RESIZE_INO);
+		if (IS_ERR(inode)) {
+			ext4_warning(sb, "Error opening resize inode");
+			return PTR_ERR(inode);
+		}
+	}
+
+
+	err = verify_group_input(sb, input);
+	if (err)
+		goto out;
+
+	err = ext4_alloc_flex_bg_array(sb, input->group + 1);
+	if (err)
+		goto out;
+
+	err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
+	if (err)
+		goto out;
+
+	flex_gd.count = 1;
+	flex_gd.groups = input;
+	flex_gd.bg_flags = &bg_flags;
+	err = ext4_flex_group_add(sb, inode, &flex_gd);
+out:
+	iput(inode);
+	return err;
+} /* ext4_group_add */
+
+/*
+ * extend a group without checking assuming that checking has been done.
+ */
+static int ext4_group_extend_no_check(struct super_block *sb,
+				      ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	handle_t *handle;
+	int err = 0, err2;
+
+	/* We will update the superblock, one block bitmap, and
+	 * one group descriptor via ext4_group_add_blocks().
+	 */
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		ext4_warning(sb, "error %d on journal start", err);
+		return err;
+	}
+
+	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err) {
+		ext4_warning(sb, "error %d on journal write access", err);
+		goto errout;
+	}
+
+	ext4_blocks_count_set(es, o_blocks_count + add);
+	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
+	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+	/* We add the blocks to the bitmap and set the group need init bit */
+	err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
+	if (err)
+		goto errout;
+	ext4_handle_dirty_super(handle, sb);
+	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+errout:
+	err2 = ext4_journal_stop(handle);
+	if (err2 && !err)
+		err = err2;
+
+	if (!err) {
+		if (test_opt(sb, DEBUG))
+			printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
+			       "blocks\n", ext4_blocks_count(es));
+		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
+			       (char *)es, sizeof(struct ext4_super_block), 0);
+	}
+	return err;
+}
+
+/*
+ * Extend the filesystem to the new number of blocks specified.  This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group.  It can be accessed via ioctl, or by "remount,resize=<size>"
+ * for emergencies (because it has no dependencies on reserved blocks).
+ *
+ * If we _really_ wanted, we could use default values to call ext4_group_add()
+ * allow the "remount" trick to work for arbitrary resizing, assuming enough
+ * GDT blocks are reserved to grow to the desired size.
+ */
+int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
+		      ext4_fsblk_t n_blocks_count)
+{
+	ext4_fsblk_t o_blocks_count;
+	ext4_grpblk_t last;
+	ext4_grpblk_t add;
+	struct buffer_head *bh;
+	int err;
+	ext4_group_t group;
+
+	o_blocks_count = ext4_blocks_count(es);
+
+	if (test_opt(sb, DEBUG))
+		ext4_msg(sb, KERN_DEBUG,
+			 "extending last group from %llu to %llu blocks",
+			 o_blocks_count, n_blocks_count);
+
+	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
+		return 0;
+
+	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		ext4_msg(sb, KERN_ERR,
+			 "filesystem too large to resize to %llu blocks safely",
+			 n_blocks_count);
+		if (sizeof(sector_t) < 8)
+			ext4_warning(sb, "CONFIG_LBDAF not enabled");
+		return -EINVAL;
+	}
+
+	if (n_blocks_count < o_blocks_count) {
+		ext4_warning(sb, "can't shrink FS - resize aborted");
+		return -EINVAL;
+	}
+
+	/* Handle the remaining blocks in the last group only. */
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+
+	if (last == 0) {
+		ext4_warning(sb, "need to use ext2online to resize further");
+		return -EPERM;
+	}
+
+	add = EXT4_BLOCKS_PER_GROUP(sb) - last;
+
+	if (o_blocks_count + add < o_blocks_count) {
+		ext4_warning(sb, "blocks_count overflow");
+		return -EINVAL;
+	}
+
+	if (o_blocks_count + add > n_blocks_count)
+		add = n_blocks_count - o_blocks_count;
+
+	if (o_blocks_count + add < n_blocks_count)
+		ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
+			     o_blocks_count + add, add);
+
+	/* See if the device is actually as big as what was requested */
+	bh = sb_bread(sb, o_blocks_count + add - 1);
+	if (!bh) {
+		ext4_warning(sb, "can't read last block, resize aborted");
+		return -ENOSPC;
+	}
+	brelse(bh);
+
+	err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+	return err;
+} /* ext4_group_extend */
+
+
+static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
+{
+	return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
+}
+
+/*
+ * Release the resize inode and drop the resize_inode feature if there
+ * are no more reserved gdt blocks, and then convert the file system
+ * to enable meta_bg
+ */
+static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
+{
+	handle_t *handle;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_fsblk_t nr;
+	int i, ret, err = 0;
+	int credits = 1;
+
+	ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg");
+	if (inode) {
+		if (es->s_reserved_gdt_blocks) {
+			ext4_error(sb, "Unexpected non-zero "
+				   "s_reserved_gdt_blocks");
+			return -EPERM;
+		}
+
+		/* Do a quick sanity check of the resize inode */
+		if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
+			goto invalid_resize_inode;
+		for (i = 0; i < EXT4_N_BLOCKS; i++) {
+			if (i == EXT4_DIND_BLOCK) {
+				if (ei->i_data[i])
+					continue;
+				else
+					goto invalid_resize_inode;
+			}
+			if (ei->i_data[i])
+				goto invalid_resize_inode;
+		}
+		credits += 3;	/* block bitmap, bg descriptor, resize inode */
+	}
+
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+	if (err)
+		goto errout;
+
+	EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
+	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+	sbi->s_es->s_first_meta_bg =
+		cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
+
+	err = ext4_handle_dirty_super(handle, sb);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto errout;
+	}
+
+	if (inode) {
+		nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]);
+		ext4_free_blocks(handle, inode, NULL, nr, 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
+		ei->i_data[EXT4_DIND_BLOCK] = 0;
+		inode->i_blocks = 0;
+
+		err = ext4_mark_inode_dirty(handle, inode);
+		if (err)
+			ext4_std_error(sb, err);
+	}
+
+errout:
+	ret = ext4_journal_stop(handle);
+	if (!err)
+		err = ret;
+	return ret;
+
+invalid_resize_inode:
+	ext4_error(sb, "corrupted/inconsistent resize inode");
+	return -EINVAL;
+}
+
+/*
+ * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
+ *
+ * @sb: super block of the fs to be resized
+ * @n_blocks_count: the number of blocks resides in the resized fs
+ */
+int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
+{
+	struct ext4_new_flex_group_data *flex_gd = NULL;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head *bh;
+	struct inode *resize_inode = NULL;
+	ext4_grpblk_t add, offset;
+	unsigned long n_desc_blocks;
+	unsigned long o_desc_blocks;
+	ext4_group_t o_group;
+	ext4_group_t n_group;
+	ext4_fsblk_t o_blocks_count;
+	ext4_fsblk_t n_blocks_count_retry = 0;
+	unsigned long last_update_time = 0;
+	int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+	int meta_bg;
+
+	/* See if the device is actually as big as what was requested */
+	bh = sb_bread(sb, n_blocks_count - 1);
+	if (!bh) {
+		ext4_warning(sb, "can't read last block, resize aborted");
+		return -ENOSPC;
+	}
+	brelse(bh);
+
+retry:
+	o_blocks_count = ext4_blocks_count(es);
+
+	ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu "
+		 "to %llu blocks", o_blocks_count, n_blocks_count);
+
+	if (n_blocks_count < o_blocks_count) {
+		/* On-line shrinking not supported */
+		ext4_warning(sb, "can't shrink FS - resize aborted");
+		return -EINVAL;
+	}
+
+	if (n_blocks_count == o_blocks_count)
+		/* Nothing need to do */
+		return 0;
+
+	n_group = ext4_get_group_number(sb, n_blocks_count - 1);
+	if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+		ext4_warning(sb, "resize would cause inodes_count overflow");
+		return -EINVAL;
+	}
+	ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
+
+	n_desc_blocks = num_desc_blocks(sb, n_group + 1);
+	o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
+
+	meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+		if (meta_bg) {
+			ext4_error(sb, "resize_inode and meta_bg enabled "
+				   "simultaneously");
+			return -EINVAL;
+		}
+		if (n_desc_blocks > o_desc_blocks +
+		    le16_to_cpu(es->s_reserved_gdt_blocks)) {
+			n_blocks_count_retry = n_blocks_count;
+			n_desc_blocks = o_desc_blocks +
+				le16_to_cpu(es->s_reserved_gdt_blocks);
+			n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
+			n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb);
+			n_group--; /* set to last group number */
+		}
+
+		if (!resize_inode)
+			resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+		if (IS_ERR(resize_inode)) {
+			ext4_warning(sb, "Error opening resize inode");
+			return PTR_ERR(resize_inode);
+		}
+	}
+
+	if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+		err = ext4_convert_meta_bg(sb, resize_inode);
+		if (err)
+			goto out;
+		if (resize_inode) {
+			iput(resize_inode);
+			resize_inode = NULL;
+		}
+		if (n_blocks_count_retry) {
+			n_blocks_count = n_blocks_count_retry;
+			n_blocks_count_retry = 0;
+			goto retry;
+		}
+	}
+
+	/* extend the last group */
+	if (n_group == o_group)
+		add = n_blocks_count - o_blocks_count;
+	else
+		add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+	if (add > 0) {
+		err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+		if (err)
+			goto out;
+	}
+
+	if (ext4_blocks_count(es) == n_blocks_count)
+		goto out;
+
+	err = ext4_alloc_flex_bg_array(sb, n_group + 1);
+	if (err)
+		return err;
+
+	err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
+	if (err)
+		goto out;
+
+	flex_gd = alloc_flex_gd(flexbg_size);
+	if (flex_gd == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Add flex groups. Note that a regular group is a
+	 * flex group with 1 group.
+	 */
+	while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
+					      flexbg_size)) {
+		if (jiffies - last_update_time > HZ * 10) {
+			if (last_update_time)
+				ext4_msg(sb, KERN_INFO,
+					 "resized to %llu blocks",
+					 ext4_blocks_count(es));
+			last_update_time = jiffies;
+		}
+		if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
+			break;
+		err = ext4_flex_group_add(sb, resize_inode, flex_gd);
+		if (unlikely(err))
+			break;
+	}
+
+	if (!err && n_blocks_count_retry) {
+		n_blocks_count = n_blocks_count_retry;
+		n_blocks_count_retry = 0;
+		free_flex_gd(flex_gd);
+		flex_gd = NULL;
+		goto retry;
+	}
+
+out:
+	if (flex_gd)
+		free_flex_gd(flex_gd);
+	if (resize_inode != NULL)
+		iput(resize_inode);
+	ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
+	return err;
+}
diff --git a/kernel/fs/ext4/super.c b/kernel/fs/ext4/super.c
new file mode 100644
index 000000000..ca9d4a2fe
--- /dev/null
+++ b/kernel/fs/ext4/super.c
@@ -0,0 +1,5668 @@
+/*
+ *  linux/fs/ext4/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/log2.h>
+#include <linux/crc16.h>
+#include <linux/cleancache.h>
+#include <asm/uaccess.h>
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include "ext4.h"
+#include "ext4_extents.h"	/* Needed for trace points definition */
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "mballoc.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
+
+static struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
+static struct ext4_lazy_init *ext4_li_info;
+static struct mutex ext4_li_mtx;
+static struct ext4_features *ext4_feat;
+static int ext4_mballoc_ready;
+
+static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
+			     unsigned long journal_devnum);
+static int ext4_show_options(struct seq_file *seq, struct dentry *root);
+static int ext4_commit_super(struct super_block *sb, int sync);
+static void ext4_mark_recovery_complete(struct super_block *sb,
+					struct ext4_super_block *es);
+static void ext4_clear_journal_err(struct super_block *sb,
+				   struct ext4_super_block *es);
+static int ext4_sync_fs(struct super_block *sb, int wait);
+static int ext4_remount(struct super_block *sb, int *flags, char *data);
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int ext4_unfreeze(struct super_block *sb);
+static int ext4_freeze(struct super_block *sb);
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
+static int ext4_feature_set_ok(struct super_block *sb, int readonly);
+static void ext4_destroy_lazyinit_thread(void);
+static void ext4_unregister_li_request(struct super_block *sb);
+static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
+
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext2",
+	.mount		= ext4_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ext2");
+MODULE_ALIAS("ext2");
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+#define IS_EXT2_SB(sb) (0)
+#endif
+
+
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext3",
+	.mount		= ext4_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ext3");
+MODULE_ALIAS("ext3");
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+#define IS_EXT3_SB(sb) (0)
+#endif
+
+static int ext4_verify_csum_type(struct super_block *sb,
+				 struct ext4_super_block *es)
+{
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+		return 1;
+
+	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
+}
+
+static __le32 ext4_superblock_csum(struct super_block *sb,
+				   struct ext4_super_block *es)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int offset = offsetof(struct ext4_super_block, s_checksum);
+	__u32 csum;
+
+	csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+
+	return cpu_to_le32(csum);
+}
+
+static int ext4_superblock_csum_verify(struct super_block *sb,
+				       struct ext4_super_block *es)
+{
+	if (!ext4_has_metadata_csum(sb))
+		return 1;
+
+	return es->s_checksum == ext4_superblock_csum(sb, es);
+}
+
+void ext4_superblock_csum_set(struct super_block *sb)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	if (!ext4_has_metadata_csum(sb))
+		return;
+
+	es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
+void *ext4_kvmalloc(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	ret = kmalloc(size, flags | __GFP_NOWARN);
+	if (!ret)
+		ret = __vmalloc(size, flags, PAGE_KERNEL);
+	return ret;
+}
+
+void *ext4_kvzalloc(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	ret = kzalloc(size, flags | __GFP_NOWARN);
+	if (!ret)
+		ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+	return ret;
+}
+
+ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	return le32_to_cpu(bg->bg_block_bitmap_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+}
+
+ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+}
+
+ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le32_to_cpu(bg->bg_inode_table_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+}
+
+__u32 ext4_free_group_clusters(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+}
+
+__u32 ext4_free_inodes_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+}
+
+__u32 ext4_used_dirs_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+}
+
+__u32 ext4_itable_unused_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_itable_unused_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+}
+
+void ext4_block_bitmap_set(struct super_block *sb,
+			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
+}
+
+void ext4_inode_bitmap_set(struct super_block *sb,
+			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
+}
+
+void ext4_inode_table_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
+}
+
+void ext4_free_group_clusters_set(struct super_block *sb,
+				  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_free_inodes_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_used_dirs_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_itable_unused_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+}
+
+
+static void __save_error_info(struct super_block *sb, const char *func,
+			    unsigned int line)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+	if (bdev_read_only(sb->s_bdev))
+		return;
+	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+	es->s_last_error_time = cpu_to_le32(get_seconds());
+	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
+	es->s_last_error_line = cpu_to_le32(line);
+	if (!es->s_first_error_time) {
+		es->s_first_error_time = es->s_last_error_time;
+		strncpy(es->s_first_error_func, func,
+			sizeof(es->s_first_error_func));
+		es->s_first_error_line = cpu_to_le32(line);
+		es->s_first_error_ino = es->s_last_error_ino;
+		es->s_first_error_block = es->s_last_error_block;
+	}
+	/*
+	 * Start the daily error reporting function if it hasn't been
+	 * started already
+	 */
+	if (!es->s_error_count)
+		mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
+	le32_add_cpu(&es->s_error_count, 1);
+}
+
+static void save_error_info(struct super_block *sb, const char *func,
+			    unsigned int line)
+{
+	__save_error_info(sb, func, line);
+	ext4_commit_super(sb, 1);
+}
+
+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
+{
+	struct super_block		*sb = journal->j_private;
+	struct ext4_sb_info		*sbi = EXT4_SB(sb);
+	int				error = is_journal_aborted(journal);
+	struct ext4_journal_cb_entry	*jce;
+
+	BUG_ON(txn->t_state == T_FINISHED);
+	spin_lock(&sbi->s_md_lock);
+	while (!list_empty(&txn->t_private_list)) {
+		jce = list_entry(txn->t_private_list.next,
+				 struct ext4_journal_cb_entry, jce_list);
+		list_del_init(&jce->jce_list);
+		spin_unlock(&sbi->s_md_lock);
+		jce->jce_func(sb, jce, error);
+		spin_lock(&sbi->s_md_lock);
+	}
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/* Deal with the reporting of failure conditions on a filesystem such as
+ * inconsistencies detected or read IO failures.
+ *
+ * On ext2, we can store the error state of the filesystem in the
+ * superblock.  That is not possible on ext4, because we may have other
+ * write ordering constraints on the superblock which prevent us from
+ * writing it out straight away; and given that the journal is about to
+ * be aborted, we can't rely on the current, or future, transactions to
+ * write out the superblock safely.
+ *
+ * We'll just use the jbd2_journal_abort() error code to record an error in
+ * the journal instead.  On recovery, the journal will complain about
+ * that error until we've noted it down and cleared it.
+ */
+
+static void ext4_handle_error(struct super_block *sb)
+{
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	if (!test_opt(sb, ERRORS_CONT)) {
+		journal_t *journal = EXT4_SB(sb)->s_journal;
+
+		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		if (journal)
+			jbd2_journal_abort(journal, -EIO);
+	}
+	if (test_opt(sb, ERRORS_RO)) {
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		/*
+		 * Make sure updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
+		sb->s_flags |= MS_RDONLY;
+	}
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("EXT4-fs (device %s): panic forced after error\n",
+			sb->s_id);
+}
+
+#define ext4_error_ratelimit(sb)					\
+		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\
+			     "EXT4-fs error")
+
+void __ext4_error(struct super_block *sb, const char *function,
+		  unsigned int line, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (ext4_error_ratelimit(sb)) {
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		printk(KERN_CRIT
+		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
+		       sb->s_id, function, line, current->comm, &vaf);
+		va_end(args);
+	}
+	save_error_info(sb, function, line);
+	ext4_handle_error(sb);
+}
+
+void __ext4_error_inode(struct inode *inode, const char *function,
+			unsigned int line, ext4_fsblk_t block,
+			const char *fmt, ...)
+{
+	va_list args;
+	struct va_format vaf;
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+	es->s_last_error_block = cpu_to_le64(block);
+	if (ext4_error_ratelimit(inode->i_sb)) {
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		if (block)
+			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+			       "inode #%lu: block %llu: comm %s: %pV\n",
+			       inode->i_sb->s_id, function, line, inode->i_ino,
+			       block, current->comm, &vaf);
+		else
+			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+			       "inode #%lu: comm %s: %pV\n",
+			       inode->i_sb->s_id, function, line, inode->i_ino,
+			       current->comm, &vaf);
+		va_end(args);
+	}
+	save_error_info(inode->i_sb, function, line);
+	ext4_handle_error(inode->i_sb);
+}
+
+void __ext4_error_file(struct file *file, const char *function,
+		       unsigned int line, ext4_fsblk_t block,
+		       const char *fmt, ...)
+{
+	va_list args;
+	struct va_format vaf;
+	struct ext4_super_block *es;
+	struct inode *inode = file_inode(file);
+	char pathname[80], *path;
+
+	es = EXT4_SB(inode->i_sb)->s_es;
+	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+	if (ext4_error_ratelimit(inode->i_sb)) {
+		path = d_path(&(file->f_path), pathname, sizeof(pathname));
+		if (IS_ERR(path))
+			path = "(unknown)";
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		if (block)
+			printk(KERN_CRIT
+			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+			       "block %llu: comm %s: path %s: %pV\n",
+			       inode->i_sb->s_id, function, line, inode->i_ino,
+			       block, current->comm, path, &vaf);
+		else
+			printk(KERN_CRIT
+			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+			       "comm %s: path %s: %pV\n",
+			       inode->i_sb->s_id, function, line, inode->i_ino,
+			       current->comm, path, &vaf);
+		va_end(args);
+	}
+	save_error_info(inode->i_sb, function, line);
+	ext4_handle_error(inode->i_sb);
+}
+
+const char *ext4_decode_error(struct super_block *sb, int errno,
+			      char nbuf[16])
+{
+	char *errstr = NULL;
+
+	switch (errno) {
+	case -EIO:
+		errstr = "IO failure";
+		break;
+	case -ENOMEM:
+		errstr = "Out of memory";
+		break;
+	case -EROFS:
+		if (!sb || (EXT4_SB(sb)->s_journal &&
+			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
+			errstr = "Journal has aborted";
+		else
+			errstr = "Readonly filesystem";
+		break;
+	default:
+		/* If the caller passed in an extra buffer for unknown
+		 * errors, textualise them now.  Else we just return
+		 * NULL. */
+		if (nbuf) {
+			/* Check for truncated error codes... */
+			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+				errstr = nbuf;
+		}
+		break;
+	}
+
+	return errstr;
+}
+
+/* __ext4_std_error decodes expected errors from journaling functions
+ * automatically and invokes the appropriate error response.  */
+
+void __ext4_std_error(struct super_block *sb, const char *function,
+		      unsigned int line, int errno)
+{
+	char nbuf[16];
+	const char *errstr;
+
+	/* Special case: if the error is EROFS, and we're not already
+	 * inside a transaction, then there's really no point in logging
+	 * an error. */
+	if (errno == -EROFS && journal_current_handle() == NULL &&
+	    (sb->s_flags & MS_RDONLY))
+		return;
+
+	if (ext4_error_ratelimit(sb)) {
+		errstr = ext4_decode_error(sb, errno, nbuf);
+		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+		       sb->s_id, function, line, errstr);
+	}
+
+	save_error_info(sb, function, line);
+	ext4_handle_error(sb);
+}
+
+/*
+ * ext4_abort is a much stronger failure handler than ext4_error.  The
+ * abort function may be used to deal with unrecoverable failures such
+ * as journal IO errors or ENOMEM at a critical moment in log management.
+ *
+ * We unconditionally force the filesystem into an ABORT|READONLY state,
+ * unless the error response on the fs has been set to panic in which
+ * case we take the easy way out and panic immediately.
+ */
+
+void __ext4_abort(struct super_block *sb, const char *function,
+		unsigned int line, const char *fmt, ...)
+{
+	va_list args;
+
+	save_error_info(sb, function, line);
+	va_start(args, fmt);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
+	       function, line);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	if ((sb->s_flags & MS_RDONLY) == 0) {
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		/*
+		 * Make sure updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
+		sb->s_flags |= MS_RDONLY;
+		if (EXT4_SB(sb)->s_journal)
+			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+		save_error_info(sb, function, line);
+	}
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("EXT4-fs panic from previous error\n");
+}
+
+void __ext4_msg(struct super_block *sb,
+		const char *prefix, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
+		return;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+	va_end(args);
+}
+
+void __ext4_warning(struct super_block *sb, const char *function,
+		    unsigned int line, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+			  "EXT4-fs warning"))
+		return;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
+	       sb->s_id, function, line, &vaf);
+	va_end(args);
+}
+
+void __ext4_grp_locked_error(const char *function, unsigned int line,
+			     struct super_block *sb, ext4_group_t grp,
+			     unsigned long ino, ext4_fsblk_t block,
+			     const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+	struct va_format vaf;
+	va_list args;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	es->s_last_error_ino = cpu_to_le32(ino);
+	es->s_last_error_block = cpu_to_le64(block);
+	__save_error_info(sb, function, line);
+
+	if (ext4_error_ratelimit(sb)) {
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
+		       sb->s_id, function, line, grp);
+		if (ino)
+			printk(KERN_CONT "inode %lu: ", ino);
+		if (block)
+			printk(KERN_CONT "block %llu:",
+			       (unsigned long long) block);
+		printk(KERN_CONT "%pV\n", &vaf);
+		va_end(args);
+	}
+
+	if (test_opt(sb, ERRORS_CONT)) {
+		ext4_commit_super(sb, 0);
+		return;
+	}
+
+	ext4_unlock_group(sb, grp);
+	ext4_handle_error(sb);
+	/*
+	 * We only get here in the ERRORS_RO case; relocking the group
+	 * may be dangerous, but nothing bad will happen since the
+	 * filesystem will have already been marked read/only and the
+	 * journal has been aborted.  We return 1 as a hint to callers
+	 * who might what to use the return value from
+	 * ext4_grp_locked_error() to distinguish between the
+	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
+	 * aggressively from the ext4 function in question, with a
+	 * more appropriate error code.
+	 */
+	ext4_lock_group(sb, grp);
+	return;
+}
+
+void ext4_update_dynamic_rev(struct super_block *sb)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
+		return;
+
+	ext4_warning(sb,
+		     "updating to rev %d because of new feature flag, "
+		     "running e2fsck is recommended",
+		     EXT4_DYNAMIC_REV);
+
+	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
+	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
+	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
+	/* leave es->s_feature_*compat flags alone */
+	/* es->s_uuid will be set by e2fsck if empty */
+
+	/*
+	 * The rest of the superblock fields should be zero, and if not it
+	 * means they are likely already in use, so leave them alone.  We
+	 * can leave it up to e2fsck to clean up any inconsistencies there.
+	 */
+}
+
+/*
+ * Open the external journal device
+ */
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
+{
+	struct block_device *bdev;
+	char b[BDEVNAME_SIZE];
+
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
+	if (IS_ERR(bdev))
+		goto fail;
+	return bdev;
+
+fail:
+	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
+			__bdevname(dev, b), PTR_ERR(bdev));
+	return NULL;
+}
+
+/*
+ * Release the journal device
+ */
+static void ext4_blkdev_put(struct block_device *bdev)
+{
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
+{
+	struct block_device *bdev;
+	bdev = sbi->journal_bdev;
+	if (bdev) {
+		ext4_blkdev_put(bdev);
+		sbi->journal_bdev = NULL;
+	}
+}
+
+static inline struct inode *orphan_list_entry(struct list_head *l)
+{
+	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
+}
+
+static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
+{
+	struct list_head *l;
+
+	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+		 le32_to_cpu(sbi->s_es->s_last_orphan));
+
+	printk(KERN_ERR "sb_info orphan list:\n");
+	list_for_each(l, &sbi->s_orphan) {
+		struct inode *inode = orphan_list_entry(l);
+		printk(KERN_ERR "  "
+		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
+		       inode->i_sb->s_id, inode->i_ino, inode,
+		       inode->i_mode, inode->i_nlink,
+		       NEXT_ORPHAN(inode));
+	}
+}
+
+static void ext4_put_super(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int i, err;
+
+	ext4_unregister_li_request(sb);
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
+	flush_workqueue(sbi->rsv_conversion_wq);
+	destroy_workqueue(sbi->rsv_conversion_wq);
+
+	if (sbi->s_journal) {
+		err = jbd2_journal_destroy(sbi->s_journal);
+		sbi->s_journal = NULL;
+		if (err < 0)
+			ext4_abort(sb, "Couldn't clean up the journal");
+	}
+
+	ext4_es_unregister_shrinker(sbi);
+	del_timer_sync(&sbi->s_err_report);
+	ext4_release_system_zone(sb);
+	ext4_mb_release(sb);
+	ext4_ext_release(sb);
+	ext4_xattr_put_super(sb);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		es->s_state = cpu_to_le16(sbi->s_mount_state);
+	}
+	if (!(sb->s_flags & MS_RDONLY))
+		ext4_commit_super(sb, 1);
+
+	if (sbi->s_proc) {
+		remove_proc_entry("options", sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
+	kobject_del(&sbi->s_kobj);
+
+	for (i = 0; i < sbi->s_gdb_count; i++)
+		brelse(sbi->s_group_desc[i]);
+	kvfree(sbi->s_group_desc);
+	kvfree(sbi->s_flex_groups);
+	percpu_counter_destroy(&sbi->s_freeclusters_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	brelse(sbi->s_sbh);
+#ifdef CONFIG_QUOTA
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
+		kfree(sbi->s_qf_names[i]);
+#endif
+
+	/* Debugging code just in case the in-memory inode orphan list
+	 * isn't empty.  The on-disk one can be non-empty if we've
+	 * detected an error and taken the fs readonly, but the
+	 * in-memory list had better be clean by this point. */
+	if (!list_empty(&sbi->s_orphan))
+		dump_orphan_list(sb, sbi);
+	J_ASSERT(list_empty(&sbi->s_orphan));
+
+	invalidate_bdev(sb->s_bdev);
+	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+		/*
+		 * Invalidate the journal device's buffers.  We don't want them
+		 * floating about in memory - the physical journal device may
+		 * hotswapped, and it breaks the `ro-after' testing code.
+		 */
+		sync_blockdev(sbi->journal_bdev);
+		invalidate_bdev(sbi->journal_bdev);
+		ext4_blkdev_remove(sbi);
+	}
+	if (sbi->s_mb_cache) {
+		ext4_xattr_destroy_cache(sbi->s_mb_cache);
+		sbi->s_mb_cache = NULL;
+	}
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
+	sb->s_fs_info = NULL;
+	/*
+	 * Now that we are completely done shutting down the
+	 * superblock, we need to actually destroy the kobject.
+	 */
+	kobject_put(&sbi->s_kobj);
+	wait_for_completion(&sbi->s_kobj_unregister);
+	if (sbi->s_chksum_driver)
+		crypto_free_shash(sbi->s_chksum_driver);
+	kfree(sbi->s_blockgroup_lock);
+	kfree(sbi);
+}
+
+static struct kmem_cache *ext4_inode_cachep;
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ */
+static struct inode *ext4_alloc_inode(struct super_block *sb)
+{
+	struct ext4_inode_info *ei;
+
+	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+
+	ei->vfs_inode.i_version = 1;
+	spin_lock_init(&ei->i_raw_lock);
+	INIT_LIST_HEAD(&ei->i_prealloc_list);
+	spin_lock_init(&ei->i_prealloc_lock);
+	ext4_es_init_tree(&ei->i_es_tree);
+	rwlock_init(&ei->i_es_lock);
+	INIT_LIST_HEAD(&ei->i_es_list);
+	ei->i_es_all_nr = 0;
+	ei->i_es_shk_nr = 0;
+	ei->i_es_shrink_lblk = 0;
+	ei->i_reserved_data_blocks = 0;
+	ei->i_reserved_meta_blocks = 0;
+	ei->i_allocated_meta_blocks = 0;
+	ei->i_da_metadata_calc_len = 0;
+	ei->i_da_metadata_calc_last_lblock = 0;
+	spin_lock_init(&(ei->i_block_reservation_lock));
+#ifdef CONFIG_QUOTA
+	ei->i_reserved_quota = 0;
+	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
+#endif
+	ei->jinode = NULL;
+	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+	spin_lock_init(&ei->i_completed_io_lock);
+	ei->i_sync_tid = 0;
+	ei->i_datasync_tid = 0;
+	atomic_set(&ei->i_ioend_count, 0);
+	atomic_set(&ei->i_unwritten, 0);
+	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
+#endif
+
+	return &ei->vfs_inode;
+}
+
+static int ext4_drop_inode(struct inode *inode)
+{
+	int drop = generic_drop_inode(inode);
+
+	trace_ext4_drop_inode(inode, drop);
+	return drop;
+}
+
+static void ext4_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
+
+static void ext4_destroy_inode(struct inode *inode)
+{
+	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
+		ext4_msg(inode->i_sb, KERN_ERR,
+			 "Inode %lu (%p): orphan list check failed!",
+			 inode->i_ino, EXT4_I(inode));
+		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
+				EXT4_I(inode), sizeof(struct ext4_inode_info),
+				true);
+		dump_stack();
+	}
+	call_rcu(&inode->i_rcu, ext4_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
+
+	INIT_LIST_HEAD(&ei->i_orphan);
+	init_rwsem(&ei->xattr_sem);
+	init_rwsem(&ei->i_data_sem);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
+					     sizeof(struct ext4_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (ext4_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ext4_inode_cachep);
+}
+
+void ext4_clear_inode(struct inode *inode)
+{
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+	dquot_drop(inode);
+	ext4_discard_preallocations(inode);
+	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+	if (EXT4_I(inode)->jinode) {
+		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+					       EXT4_I(inode)->jinode);
+		jbd2_free_inode(EXT4_I(inode)->jinode);
+		EXT4_I(inode)->jinode = NULL;
+	}
+}
+
+static struct inode *ext4_nfs_get_inode(struct super_block *sb,
+					u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+	if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+		return ERR_PTR(-ESTALE);
+
+	/* iget isn't really right if the inode is currently unallocated!!
+	 *
+	 * ext4_read_inode will return a bad_inode if the inode had been
+	 * deleted, so we should be safe.
+	 *
+	 * Currently we don't know the generation for parent directory, so
+	 * a generation of 0 means "accept any"
+	 */
+	inode = ext4_iget_normal(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    ext4_nfs_get_inode);
+}
+
+static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    ext4_nfs_get_inode);
+}
+
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd2 layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
+{
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+	if (journal)
+		return jbd2_journal_try_to_free_buffers(journal, page,
+							wait & ~__GFP_WAIT);
+	return try_to_free_buffers(page);
+}
+
+#ifdef CONFIG_QUOTA
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
+#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+
+static int ext4_write_dquot(struct dquot *dquot);
+static int ext4_acquire_dquot(struct dquot *dquot);
+static int ext4_release_dquot(struct dquot *dquot);
+static int ext4_mark_dquot_dirty(struct dquot *dquot);
+static int ext4_write_info(struct super_block *sb, int type);
+static int ext4_quota_on(struct super_block *sb, int type, int format_id,
+			 struct path *path);
+static int ext4_quota_off(struct super_block *sb, int type);
+static int ext4_quota_on_mount(struct super_block *sb, int type);
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off);
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off);
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+			     unsigned int flags);
+static int ext4_enable_quotas(struct super_block *sb);
+
+static struct dquot **ext4_get_dquots(struct inode *inode)
+{
+	return EXT4_I(inode)->i_dquot;
+}
+
+static const struct dquot_operations ext4_quota_operations = {
+	.get_reserved_space = ext4_get_reserved_space,
+	.write_dquot	= ext4_write_dquot,
+	.acquire_dquot	= ext4_acquire_dquot,
+	.release_dquot	= ext4_release_dquot,
+	.mark_dirty	= ext4_mark_dquot_dirty,
+	.write_info	= ext4_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
+};
+
+static const struct quotactl_ops ext4_qctl_operations = {
+	.quota_on	= ext4_quota_on,
+	.quota_off	= ext4_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_state	= dquot_get_state,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
+};
+#endif
+
+static const struct super_operations ext4_sops = {
+	.alloc_inode	= ext4_alloc_inode,
+	.destroy_inode	= ext4_destroy_inode,
+	.write_inode	= ext4_write_inode,
+	.dirty_inode	= ext4_dirty_inode,
+	.drop_inode	= ext4_drop_inode,
+	.evict_inode	= ext4_evict_inode,
+	.put_super	= ext4_put_super,
+	.sync_fs	= ext4_sync_fs,
+	.freeze_fs	= ext4_freeze,
+	.unfreeze_fs	= ext4_unfreeze,
+	.statfs		= ext4_statfs,
+	.remount_fs	= ext4_remount,
+	.show_options	= ext4_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read	= ext4_quota_read,
+	.quota_write	= ext4_quota_write,
+	.get_dquots	= ext4_get_dquots,
+#endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
+static const struct export_operations ext4_export_ops = {
+	.fh_to_dentry = ext4_fh_to_dentry,
+	.fh_to_parent = ext4_fh_to_parent,
+	.get_parent = ext4_get_parent,
+};
+
+enum {
+	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
+	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+	Opt_nouid32, Opt_debug, Opt_removed,
+	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
+	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
+	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
+	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
+	Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
+	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+	Opt_lazytime, Opt_nolazytime,
+	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
+	Opt_inode_readahead_blks, Opt_journal_ioprio,
+	Opt_dioread_nolock, Opt_dioread_lock,
+	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+	Opt_max_dir_size_kb, Opt_nojournal_checksum,
+};
+
+static const match_table_t tokens = {
+	{Opt_bsd_df, "bsddf"},
+	{Opt_minix_df, "minixdf"},
+	{Opt_grpid, "grpid"},
+	{Opt_grpid, "bsdgroups"},
+	{Opt_nogrpid, "nogrpid"},
+	{Opt_nogrpid, "sysvgroups"},
+	{Opt_resgid, "resgid=%u"},
+	{Opt_resuid, "resuid=%u"},
+	{Opt_sb, "sb=%u"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_nouid32, "nouid32"},
+	{Opt_debug, "debug"},
+	{Opt_removed, "oldalloc"},
+	{Opt_removed, "orlov"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_noload, "norecovery"},
+	{Opt_noload, "noload"},
+	{Opt_removed, "nobh"},
+	{Opt_removed, "bh"},
+	{Opt_commit, "commit=%u"},
+	{Opt_min_batch_time, "min_batch_time=%u"},
+	{Opt_max_batch_time, "max_batch_time=%u"},
+	{Opt_journal_dev, "journal_dev=%u"},
+	{Opt_journal_path, "journal_path=%s"},
+	{Opt_journal_checksum, "journal_checksum"},
+	{Opt_nojournal_checksum, "nojournal_checksum"},
+	{Opt_journal_async_commit, "journal_async_commit"},
+	{Opt_abort, "abort"},
+	{Opt_data_journal, "data=journal"},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
+	{Opt_offusrjquota, "usrjquota="},
+	{Opt_usrjquota, "usrjquota=%s"},
+	{Opt_offgrpjquota, "grpjquota="},
+	{Opt_grpjquota, "grpjquota=%s"},
+	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
+	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
+	{Opt_grpquota, "grpquota"},
+	{Opt_noquota, "noquota"},
+	{Opt_quota, "quota"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_barrier, "barrier=%u"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
+	{Opt_i_version, "i_version"},
+	{Opt_dax, "dax"},
+	{Opt_stripe, "stripe=%u"},
+	{Opt_delalloc, "delalloc"},
+	{Opt_lazytime, "lazytime"},
+	{Opt_nolazytime, "nolazytime"},
+	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_removed, "mblk_io_submit"},
+	{Opt_removed, "nomblk_io_submit"},
+	{Opt_block_validity, "block_validity"},
+	{Opt_noblock_validity, "noblock_validity"},
+	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+	{Opt_journal_ioprio, "journal_ioprio=%u"},
+	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
+	{Opt_auto_da_alloc, "auto_da_alloc"},
+	{Opt_noauto_da_alloc, "noauto_da_alloc"},
+	{Opt_dioread_nolock, "dioread_nolock"},
+	{Opt_dioread_lock, "dioread_lock"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
+	{Opt_init_itable, "init_itable=%u"},
+	{Opt_init_itable, "init_itable"},
+	{Opt_noinit_itable, "noinit_itable"},
+	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+	{Opt_test_dummy_encryption, "test_dummy_encryption"},
+	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
+	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
+	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
+	{Opt_removed, "noreservation"}, /* mount option from ext2/3 */
+	{Opt_removed, "journal=%u"},	/* mount option from ext2/3 */
+	{Opt_err, NULL},
+};
+
+static ext4_fsblk_t get_sb_block(void **data)
+{
+	ext4_fsblk_t	sb_block;
+	char		*options = (char *) *data;
+
+	if (!options || strncmp(options, "sb=", 3) != 0)
+		return 1;	/* Default location */
+
+	options += 3;
+	/* TODO: use simple_strtoll with >32bit ext4 */
+	sb_block = simple_strtoul(options, &options, 0);
+	if (*options && *options != ',') {
+		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
+		       (char *) *data);
+		return 1;
+	}
+	if (*options == ',')
+		options++;
+	*data = (void *) options;
+
+	return sb_block;
+}
+
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *qname;
+	int ret = -1;
+
+	if (sb_any_quota_loaded(sb) &&
+		!sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR,
+			"Cannot change journaled "
+			"quota options when quota turned on");
+		return -1;
+	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+		ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
+			 "when QUOTA feature is enabled");
+		return -1;
+	}
+	qname = match_strdup(args);
+	if (!qname) {
+		ext4_msg(sb, KERN_ERR,
+			"Not enough memory for storing quotafile name");
+		return -1;
+	}
+	if (sbi->s_qf_names[qtype]) {
+		if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+			ret = 1;
+		else
+			ext4_msg(sb, KERN_ERR,
+				 "%s quota file already specified",
+				 QTYPE2NAME(qtype));
+		goto errout;
+	}
+	if (strchr(qname, '/')) {
+		ext4_msg(sb, KERN_ERR,
+			"quotafile must be on filesystem root");
+		goto errout;
+	}
+	sbi->s_qf_names[qtype] = qname;
+	set_opt(sb, QUOTA);
+	return 1;
+errout:
+	kfree(qname);
+	return ret;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
+
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sb_any_quota_loaded(sb) &&
+		sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+			" when quota turned on");
+		return -1;
+	}
+	kfree(sbi->s_qf_names[qtype]);
+	sbi->s_qf_names[qtype] = NULL;
+	return 1;
+}
+#endif
+
+#define MOPT_SET	0x0001
+#define MOPT_CLEAR	0x0002
+#define MOPT_NOSUPPORT	0x0004
+#define MOPT_EXPLICIT	0x0008
+#define MOPT_CLEAR_ERR	0x0010
+#define MOPT_GTE0	0x0020
+#ifdef CONFIG_QUOTA
+#define MOPT_Q		0
+#define MOPT_QFMT	0x0040
+#else
+#define MOPT_Q		MOPT_NOSUPPORT
+#define MOPT_QFMT	MOPT_NOSUPPORT
+#endif
+#define MOPT_DATAJ	0x0080
+#define MOPT_NO_EXT2	0x0100
+#define MOPT_NO_EXT3	0x0200
+#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
+#define MOPT_STRING	0x0400
+
+static const struct mount_opts {
+	int	token;
+	int	mount_opt;
+	int	flags;
+} ext4_mount_opts[] = {
+	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
+	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
+	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
+	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
+	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
+	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
+	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
+	 MOPT_EXT4_ONLY | MOPT_SET},
+	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
+	 MOPT_EXT4_ONLY | MOPT_CLEAR},
+	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
+	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
+	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
+	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
+	 MOPT_EXT4_ONLY | MOPT_CLEAR},
+	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+	 MOPT_EXT4_ONLY | MOPT_CLEAR},
+	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+	 MOPT_EXT4_ONLY | MOPT_SET},
+	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
+				    EXT4_MOUNT_JOURNAL_CHECKSUM),
+	 MOPT_EXT4_ONLY | MOPT_SET},
+	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
+	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
+	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
+	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
+	 MOPT_NO_EXT2 | MOPT_SET},
+	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
+	 MOPT_NO_EXT2 | MOPT_CLEAR},
+	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
+	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
+	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+	{Opt_commit, 0, MOPT_GTE0},
+	{Opt_max_batch_time, 0, MOPT_GTE0},
+	{Opt_min_batch_time, 0, MOPT_GTE0},
+	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
+	{Opt_init_itable, 0, MOPT_GTE0},
+	{Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
+	{Opt_stripe, 0, MOPT_GTE0},
+	{Opt_resuid, 0, MOPT_GTE0},
+	{Opt_resgid, 0, MOPT_GTE0},
+	{Opt_journal_dev, 0, MOPT_GTE0},
+	{Opt_journal_path, 0, MOPT_STRING},
+	{Opt_journal_ioprio, 0, MOPT_GTE0},
+	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
+	 MOPT_NO_EXT2 | MOPT_DATAJ},
+	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
+	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
+	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
+#else
+	{Opt_acl, 0, MOPT_NOSUPPORT},
+	{Opt_noacl, 0, MOPT_NOSUPPORT},
+#endif
+	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
+	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
+	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
+							MOPT_SET | MOPT_Q},
+	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
+							MOPT_SET | MOPT_Q},
+	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+		       EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
+	{Opt_usrjquota, 0, MOPT_Q},
+	{Opt_grpjquota, 0, MOPT_Q},
+	{Opt_offusrjquota, 0, MOPT_Q},
+	{Opt_offgrpjquota, 0, MOPT_Q},
+	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
+	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
+	{Opt_err, 0, 0}
+};
+
+static int handle_mount_opt(struct super_block *sb, char *opt, int token,
+			    substring_t *args, unsigned long *journal_devnum,
+			    unsigned int *journal_ioprio, int is_remount)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	const struct mount_opts *m;
+	kuid_t uid;
+	kgid_t gid;
+	int arg = 0;
+
+#ifdef CONFIG_QUOTA
+	if (token == Opt_usrjquota)
+		return set_qf_name(sb, USRQUOTA, &args[0]);
+	else if (token == Opt_grpjquota)
+		return set_qf_name(sb, GRPQUOTA, &args[0]);
+	else if (token == Opt_offusrjquota)
+		return clear_qf_name(sb, USRQUOTA);
+	else if (token == Opt_offgrpjquota)
+		return clear_qf_name(sb, GRPQUOTA);
+#endif
+	switch (token) {
+	case Opt_noacl:
+	case Opt_nouser_xattr:
+		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
+		break;
+	case Opt_sb:
+		return 1;	/* handled by get_sb_block() */
+	case Opt_removed:
+		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
+		return 1;
+	case Opt_abort:
+		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		return 1;
+	case Opt_i_version:
+		sb->s_flags |= MS_I_VERSION;
+		return 1;
+	case Opt_lazytime:
+		sb->s_flags |= MS_LAZYTIME;
+		return 1;
+	case Opt_nolazytime:
+		sb->s_flags &= ~MS_LAZYTIME;
+		return 1;
+	}
+
+	for (m = ext4_mount_opts; m->token != Opt_err; m++)
+		if (token == m->token)
+			break;
+
+	if (m->token == Opt_err) {
+		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+			 "or missing value", opt);
+		return -1;
+	}
+
+	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Mount option \"%s\" incompatible with ext2", opt);
+		return -1;
+	}
+	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Mount option \"%s\" incompatible with ext3", opt);
+		return -1;
+	}
+
+	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
+		return -1;
+	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+		return -1;
+	if (m->flags & MOPT_EXPLICIT)
+		set_opt2(sb, EXPLICIT_DELALLOC);
+	if (m->flags & MOPT_CLEAR_ERR)
+		clear_opt(sb, ERRORS_MASK);
+	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+		ext4_msg(sb, KERN_ERR, "Cannot change quota "
+			 "options when quota turned on");
+		return -1;
+	}
+
+	if (m->flags & MOPT_NOSUPPORT) {
+		ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+	} else if (token == Opt_commit) {
+		if (arg == 0)
+			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+		sbi->s_commit_interval = HZ * arg;
+	} else if (token == Opt_max_batch_time) {
+		sbi->s_max_batch_time = arg;
+	} else if (token == Opt_min_batch_time) {
+		sbi->s_min_batch_time = arg;
+	} else if (token == Opt_inode_readahead_blks) {
+		if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
+			ext4_msg(sb, KERN_ERR,
+				 "EXT4-fs: inode_readahead_blks must be "
+				 "0 or a power of 2 smaller than 2^31");
+			return -1;
+		}
+		sbi->s_inode_readahead_blks = arg;
+	} else if (token == Opt_init_itable) {
+		set_opt(sb, INIT_INODE_TABLE);
+		if (!args->from)
+			arg = EXT4_DEF_LI_WAIT_MULT;
+		sbi->s_li_wait_mult = arg;
+	} else if (token == Opt_max_dir_size_kb) {
+		sbi->s_max_dir_size_kb = arg;
+	} else if (token == Opt_stripe) {
+		sbi->s_stripe = arg;
+	} else if (token == Opt_resuid) {
+		uid = make_kuid(current_user_ns(), arg);
+		if (!uid_valid(uid)) {
+			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
+			return -1;
+		}
+		sbi->s_resuid = uid;
+	} else if (token == Opt_resgid) {
+		gid = make_kgid(current_user_ns(), arg);
+		if (!gid_valid(gid)) {
+			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
+			return -1;
+		}
+		sbi->s_resgid = gid;
+	} else if (token == Opt_journal_dev) {
+		if (is_remount) {
+			ext4_msg(sb, KERN_ERR,
+				 "Cannot specify journal on remount");
+			return -1;
+		}
+		*journal_devnum = arg;
+	} else if (token == Opt_journal_path) {
+		char *journal_path;
+		struct inode *journal_inode;
+		struct path path;
+		int error;
+
+		if (is_remount) {
+			ext4_msg(sb, KERN_ERR,
+				 "Cannot specify journal on remount");
+			return -1;
+		}
+		journal_path = match_strdup(&args[0]);
+		if (!journal_path) {
+			ext4_msg(sb, KERN_ERR, "error: could not dup "
+				"journal device string");
+			return -1;
+		}
+
+		error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+		if (error) {
+			ext4_msg(sb, KERN_ERR, "error: could not find "
+				"journal device path: error %d", error);
+			kfree(journal_path);
+			return -1;
+		}
+
+		journal_inode = d_inode(path.dentry);
+		if (!S_ISBLK(journal_inode->i_mode)) {
+			ext4_msg(sb, KERN_ERR, "error: journal path %s "
+				"is not a block device", journal_path);
+			path_put(&path);
+			kfree(journal_path);
+			return -1;
+		}
+
+		*journal_devnum = new_encode_dev(journal_inode->i_rdev);
+		path_put(&path);
+		kfree(journal_path);
+	} else if (token == Opt_journal_ioprio) {
+		if (arg > 7) {
+			ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+				 " (must be 0-7)");
+			return -1;
+		}
+		*journal_ioprio =
+			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+	} else if (token == Opt_test_dummy_encryption) {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+		sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
+		ext4_msg(sb, KERN_WARNING,
+			 "Test dummy encryption mode enabled");
+#else
+		ext4_msg(sb, KERN_WARNING,
+			 "Test dummy encryption mount option ignored");
+#endif
+	} else if (m->flags & MOPT_DATAJ) {
+		if (is_remount) {
+			if (!sbi->s_journal)
+				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot change data mode on remount");
+				return -1;
+			}
+		} else {
+			clear_opt(sb, DATA_FLAGS);
+			sbi->s_mount_opt |= m->mount_opt;
+		}
+#ifdef CONFIG_QUOTA
+	} else if (m->flags & MOPT_QFMT) {
+		if (sb_any_quota_loaded(sb) &&
+		    sbi->s_jquota_fmt != m->mount_opt) {
+			ext4_msg(sb, KERN_ERR, "Cannot change journaled "
+				 "quota options when quota turned on");
+			return -1;
+		}
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					       EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Cannot set journaled quota options "
+				 "when QUOTA feature is enabled");
+			return -1;
+		}
+		sbi->s_jquota_fmt = m->mount_opt;
+#endif
+#ifndef CONFIG_FS_DAX
+	} else if (token == Opt_dax) {
+		ext4_msg(sb, KERN_INFO, "dax option not supported");
+		return -1;
+#endif
+	} else {
+		if (!args->from)
+			arg = 1;
+		if (m->flags & MOPT_CLEAR)
+			arg = !arg;
+		else if (unlikely(!(m->flags & MOPT_SET))) {
+			ext4_msg(sb, KERN_WARNING,
+				 "buggy handling of option %s", opt);
+			WARN_ON(1);
+			return -1;
+		}
+		if (arg != 0)
+			sbi->s_mount_opt |= m->mount_opt;
+		else
+			sbi->s_mount_opt &= ~m->mount_opt;
+	}
+	return 1;
+}
+
+static int parse_options(char *options, struct super_block *sb,
+			 unsigned long *journal_devnum,
+			 unsigned int *journal_ioprio,
+			 int is_remount)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int token;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		if (!*p)
+			continue;
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = NULL;
+		token = match_token(p, tokens, args);
+		if (handle_mount_opt(sb, p, token, args, journal_devnum,
+				     journal_ioprio, is_remount) < 0)
+			return 0;
+	}
+#ifdef CONFIG_QUOTA
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+	    (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
+		ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
+			 "feature is enabled");
+		return 0;
+	}
+	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
+			clear_opt(sb, USRQUOTA);
+
+		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
+			clear_opt(sb, GRPQUOTA);
+
+		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
+			ext4_msg(sb, KERN_ERR, "old and new quota "
+					"format mixing");
+			return 0;
+		}
+
+		if (!sbi->s_jquota_fmt) {
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
+					"not specified");
+			return 0;
+		}
+	}
+#endif
+	if (test_opt(sb, DIOREAD_NOLOCK)) {
+		int blocksize =
+			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+
+		if (blocksize < PAGE_CACHE_SIZE) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "dioread_nolock if block size != PAGE_SIZE");
+			return 0;
+		}
+	}
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+		ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
+			 "in data=ordered mode");
+		return 0;
+	}
+	return 1;
+}
+
+static inline void ext4_show_quota_options(struct seq_file *seq,
+					   struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sbi->s_jquota_fmt) {
+		char *fmtname = "";
+
+		switch (sbi->s_jquota_fmt) {
+		case QFMT_VFS_OLD:
+			fmtname = "vfsold";
+			break;
+		case QFMT_VFS_V0:
+			fmtname = "vfsv0";
+			break;
+		case QFMT_VFS_V1:
+			fmtname = "vfsv1";
+			break;
+		}
+		seq_printf(seq, ",jqfmt=%s", fmtname);
+	}
+
+	if (sbi->s_qf_names[USRQUOTA])
+		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+	if (sbi->s_qf_names[GRPQUOTA])
+		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+#endif
+}
+
+static const char *token2str(int token)
+{
+	const struct match_token *t;
+
+	for (t = tokens; t->token != Opt_err; t++)
+		if (t->token == token && !strchr(t->pattern, '='))
+			break;
+	return t->pattern;
+}
+
+/*
+ * Show an option if
+ *  - it's set to a non-default value OR
+ *  - if the per-sb default is different from the global default
+ */
+static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
+			      int nodefs)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
+	const struct mount_opts *m;
+	char sep = nodefs ? '\n' : ',';
+
+#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
+#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
+
+	if (sbi->s_sb_block != 1)
+		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
+
+	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+		int want_set = m->flags & MOPT_SET;
+		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
+		    (m->flags & MOPT_CLEAR_ERR))
+			continue;
+		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
+			continue; /* skip if same as the default */
+		if ((want_set &&
+		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
+		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
+			continue; /* select Opt_noFoo vs Opt_Foo */
+		SEQ_OPTS_PRINT("%s", token2str(m->token));
+	}
+
+	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
+	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+		SEQ_OPTS_PRINT("resuid=%u",
+				from_kuid_munged(&init_user_ns, sbi->s_resuid));
+	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
+	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+		SEQ_OPTS_PRINT("resgid=%u",
+				from_kgid_munged(&init_user_ns, sbi->s_resgid));
+	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
+	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
+		SEQ_OPTS_PUTS("errors=remount-ro");
+	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
+		SEQ_OPTS_PUTS("errors=continue");
+	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
+		SEQ_OPTS_PUTS("errors=panic");
+	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
+	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
+		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
+	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
+		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+	if (sb->s_flags & MS_I_VERSION)
+		SEQ_OPTS_PUTS("i_version");
+	if (nodefs || sbi->s_stripe)
+		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
+	if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+			SEQ_OPTS_PUTS("data=journal");
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+			SEQ_OPTS_PUTS("data=ordered");
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+			SEQ_OPTS_PUTS("data=writeback");
+	}
+	if (nodefs ||
+	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
+			       sbi->s_inode_readahead_blks);
+
+	if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
+		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
+		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+	if (nodefs || sbi->s_max_dir_size_kb)
+		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
+
+	ext4_show_quota_options(seq, sb);
+	return 0;
+}
+
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
+{
+	return _ext4_show_options(seq, root->d_sb, 0);
+}
+
+static int options_seq_show(struct seq_file *seq, void *offset)
+{
+	struct super_block *sb = seq->private;
+	int rc;
+
+	seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
+	rc = _ext4_show_options(seq, sb, 1);
+	seq_puts(seq, "\n");
+	return rc;
+}
+
+static int options_open_fs(struct inode *inode, struct file *file)
+{
+	return single_open(file, options_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations ext4_seq_options_fops = {
+	.owner = THIS_MODULE,
+	.open = options_open_fs,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
+			    int read_only)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int res = 0;
+
+	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
+		ext4_msg(sb, KERN_ERR, "revision level too high, "
+			 "forcing read-only mode");
+		res = MS_RDONLY;
+	}
+	if (read_only)
+		goto done;
+	if (!(sbi->s_mount_state & EXT4_VALID_FS))
+		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+			 "running e2fsck is recommended");
+	else if (sbi->s_mount_state & EXT4_ERROR_FS)
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: mounting fs with errors, "
+			 "running e2fsck is recommended");
+	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
+		 le16_to_cpu(es->s_mnt_count) >=
+		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: maximal mount count reached, "
+			 "running e2fsck is recommended");
+	else if (le32_to_cpu(es->s_checkinterval) &&
+		(le32_to_cpu(es->s_lastcheck) +
+			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: checktime reached, "
+			 "running e2fsck is recommended");
+	if (!sbi->s_journal)
+		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
+	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
+	le16_add_cpu(&es->s_mnt_count, 1);
+	es->s_mtime = cpu_to_le32(get_seconds());
+	ext4_update_dynamic_rev(sb);
+	if (sbi->s_journal)
+		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+
+	ext4_commit_super(sb, 1);
+done:
+	if (test_opt(sb, DEBUG))
+		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
+				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
+			sb->s_blocksize,
+			sbi->s_groups_count,
+			EXT4_BLOCKS_PER_GROUP(sb),
+			EXT4_INODES_PER_GROUP(sb),
+			sbi->s_mount_opt, sbi->s_mount_opt2);
+
+	cleancache_init_fs(sb);
+	return res;
+}
+
+int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct flex_groups *new_groups;
+	int size;
+
+	if (!sbi->s_log_groups_per_flex)
+		return 0;
+
+	size = ext4_flex_group(sbi, ngroup - 1) + 1;
+	if (size <= sbi->s_flex_groups_allocated)
+		return 0;
+
+	size = roundup_pow_of_two(size * sizeof(struct flex_groups));
+	new_groups = ext4_kvzalloc(size, GFP_KERNEL);
+	if (!new_groups) {
+		ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
+			 size / (int) sizeof(struct flex_groups));
+		return -ENOMEM;
+	}
+
+	if (sbi->s_flex_groups) {
+		memcpy(new_groups, sbi->s_flex_groups,
+		       (sbi->s_flex_groups_allocated *
+			sizeof(struct flex_groups)));
+		kvfree(sbi->s_flex_groups);
+	}
+	sbi->s_flex_groups = new_groups;
+	sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+	return 0;
+}
+
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp = NULL;
+	ext4_group_t flex_group;
+	int i, err;
+
+	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
+		sbi->s_log_groups_per_flex = 0;
+		return 1;
+	}
+
+	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
+	if (err)
+		goto failed;
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+
+		flex_group = ext4_flex_group(sbi, i);
+		atomic_add(ext4_free_inodes_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+		atomic64_add(ext4_free_group_clusters(sb, gdp),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic_add(ext4_used_dirs_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].used_dirs);
+	}
+
+	return 1;
+failed:
+	return 0;
+}
+
+static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+				   struct ext4_group_desc *gdp)
+{
+	int offset;
+	__u16 crc = 0;
+	__le32 le_group = cpu_to_le32(block_group);
+
+	if (ext4_has_metadata_csum(sbi->s_sb)) {
+		/* Use new metadata_csum algorithm */
+		__le16 save_csum;
+		__u32 csum32;
+
+		save_csum = gdp->bg_checksum;
+		gdp->bg_checksum = 0;
+		csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+				     sizeof(le_group));
+		csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
+				     sbi->s_desc_size);
+		gdp->bg_checksum = save_csum;
+
+		crc = csum32 & 0xFFFF;
+		goto out;
+	}
+
+	/* old crc16 code */
+	if (!(sbi->s_es->s_feature_ro_compat &
+	      cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+		return 0;
+
+	offset = offsetof(struct ext4_group_desc, bg_checksum);
+
+	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
+	crc = crc16(crc, (__u8 *)gdp, offset);
+	offset += sizeof(gdp->bg_checksum); /* skip checksum */
+	/* for checksum of struct ext4_group_desc do the rest...*/
+	if ((sbi->s_es->s_feature_incompat &
+	     cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+	    offset < le16_to_cpu(sbi->s_es->s_desc_size))
+		crc = crc16(crc, (__u8 *)gdp + offset,
+			    le16_to_cpu(sbi->s_es->s_desc_size) -
+				offset);
+
+out:
+	return cpu_to_le16(crc);
+}
+
+int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
+				struct ext4_group_desc *gdp)
+{
+	if (ext4_has_group_desc_csum(sb) &&
+	    (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
+						      block_group, gdp)))
+		return 0;
+
+	return 1;
+}
+
+void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
+			      struct ext4_group_desc *gdp)
+{
+	if (!ext4_has_group_desc_csum(sb))
+		return;
+	gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
+}
+
+/* Called at mount-time, super-block is locked */
+static int ext4_check_descriptors(struct super_block *sb,
+				  ext4_group_t *first_not_zeroed)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext4_fsblk_t last_block;
+	ext4_fsblk_t block_bitmap;
+	ext4_fsblk_t inode_bitmap;
+	ext4_fsblk_t inode_table;
+	int flexbg_flag = 0;
+	ext4_group_t i, grp = sbi->s_groups_count;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+		flexbg_flag = 1;
+
+	ext4_debug("Checking group descriptors");
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
+
+		if (i == sbi->s_groups_count - 1 || flexbg_flag)
+			last_block = ext4_blocks_count(sbi->s_es) - 1;
+		else
+			last_block = first_block +
+				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+		if ((grp == sbi->s_groups_count) &&
+		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			grp = i;
+
+		block_bitmap = ext4_block_bitmap(sb, gdp);
+		if (block_bitmap < first_block || block_bitmap > last_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+			       "Block bitmap for group %u not in group "
+			       "(block %llu)!", i, block_bitmap);
+			return 0;
+		}
+		inode_bitmap = ext4_inode_bitmap(sb, gdp);
+		if (inode_bitmap < first_block || inode_bitmap > last_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+			       "Inode bitmap for group %u not in group "
+			       "(block %llu)!", i, inode_bitmap);
+			return 0;
+		}
+		inode_table = ext4_inode_table(sb, gdp);
+		if (inode_table < first_block ||
+		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+			       "Inode table for group %u not in group "
+			       "(block %llu)!", i, inode_table);
+			return 0;
+		}
+		ext4_lock_group(sb, i);
+		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Checksum for group %u failed (%u!=%u)",
+				 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+				     gdp)), le16_to_cpu(gdp->bg_checksum));
+			if (!(sb->s_flags & MS_RDONLY)) {
+				ext4_unlock_group(sb, i);
+				return 0;
+			}
+		}
+		ext4_unlock_group(sb, i);
+		if (!flexbg_flag)
+			first_block += EXT4_BLOCKS_PER_GROUP(sb);
+	}
+	if (NULL != first_not_zeroed)
+		*first_not_zeroed = grp;
+	return 1;
+}
+
+/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash.  We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode.  The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext4_free_inode().  The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+static void ext4_orphan_cleanup(struct super_block *sb,
+				struct ext4_super_block *es)
+{
+	unsigned int s_flags = sb->s_flags;
+	int nr_orphans = 0, nr_truncates = 0;
+#ifdef CONFIG_QUOTA
+	int i;
+#endif
+	if (!es->s_last_orphan) {
+		jbd_debug(4, "no orphan inodes to clean up\n");
+		return;
+	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		ext4_msg(sb, KERN_ERR, "write access "
+			"unavailable, skipping orphan cleanup");
+		return;
+	}
+
+	/* Check if feature set would not allow a r/w mount */
+	if (!ext4_feature_set_ok(sb, 0)) {
+		ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+			 "unknown ROCOMPAT features");
+		return;
+	}
+
+	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+		/* don't clear list on RO mount w/ errors */
+		if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
+			ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
+				  "clearing orphan list.\n");
+			es->s_last_orphan = 0;
+		}
+		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+		return;
+	}
+
+	if (s_flags & MS_RDONLY) {
+		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
+		sb->s_flags &= ~MS_RDONLY;
+	}
+#ifdef CONFIG_QUOTA
+	/* Needed for iput() to work correctly and not trash data */
+	sb->s_flags |= MS_ACTIVE;
+	/* Turn on quotas so that they are updated correctly */
+	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+		if (EXT4_SB(sb)->s_qf_names[i]) {
+			int ret = ext4_quota_on_mount(sb, i);
+			if (ret < 0)
+				ext4_msg(sb, KERN_ERR,
+					"Cannot turn on journaled "
+					"quota: error %d", ret);
+		}
+	}
+#endif
+
+	while (es->s_last_orphan) {
+		struct inode *inode;
+
+		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+		if (IS_ERR(inode)) {
+			es->s_last_orphan = 0;
+			break;
+		}
+
+		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+		dquot_initialize(inode);
+		if (inode->i_nlink) {
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: truncating inode %lu to %lld bytes",
+					__func__, inode->i_ino, inode->i_size);
+			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
+				  inode->i_ino, inode->i_size);
+			mutex_lock(&inode->i_mutex);
+			truncate_inode_pages(inode->i_mapping, inode->i_size);
+			ext4_truncate(inode);
+			mutex_unlock(&inode->i_mutex);
+			nr_truncates++;
+		} else {
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: deleting unreferenced inode %lu",
+					__func__, inode->i_ino);
+			jbd_debug(2, "deleting unreferenced inode %lu\n",
+				  inode->i_ino);
+			nr_orphans++;
+		}
+		iput(inode);  /* The delete magic happens here! */
+	}
+
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
+
+	if (nr_orphans)
+		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+		       PLURAL(nr_orphans));
+	if (nr_truncates)
+		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+		       PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+	/* Turn quotas off */
+	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+		if (sb_dqopt(sb)->files[i])
+			dquot_quota_off(sb, i);
+	}
+#endif
+	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+}
+
+/*
+ * Maximal extent format file size.
+ * Resulting logical blkno at s_maxbytes must fit in our on-disk
+ * extent format containers, within a sector_t, and within i_blocks
+ * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
+ * so that won't be a limiting factor.
+ *
+ * However there is other limiting factor. We do store extents in the form
+ * of starting block and length, hence the resulting length of the extent
+ * covering maximum file size must fit into on-disk format containers as
+ * well. Given that length is always by 1 unit bigger than max unit (because
+ * we count 0 as well) we have to lower the s_maxbytes by one fs block.
+ *
+ * Note, this does *not* consider any metadata overhead for vfs i_blocks.
+ */
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
+{
+	loff_t res;
+	loff_t upper_limit = MAX_LFS_FILESIZE;
+
+	/* small i_blocks in vfs inode? */
+	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
+		/*
+		 * CONFIG_LBDAF is not enabled implies the inode
+		 * i_block represent total blocks in 512 bytes
+		 * 32 == size of vfs inode i_blocks * 8
+		 */
+		upper_limit = (1LL << 32) - 1;
+
+		/* total blocks in file system block size */
+		upper_limit >>= (blkbits - 9);
+		upper_limit <<= blkbits;
+	}
+
+	/*
+	 * 32-bit extent-start container, ee_block. We lower the maxbytes
+	 * by one fs block, so ee_len can cover the extent of maximum file
+	 * size
+	 */
+	res = (1LL << 32) - 1;
+	res <<= blkbits;
+
+	/* Sanity check against vm- & vfs- imposed limits */
+	if (res > upper_limit)
+		res = upper_limit;
+
+	return res;
+}
+
+/*
+ * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
+ * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
+ * We need to be 1 filesystem block less than the 2^48 sector limit.
+ */
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
+{
+	loff_t res = EXT4_NDIR_BLOCKS;
+	int meta_blocks;
+	loff_t upper_limit;
+	/* This is calculated to be the largest file size for a dense, block
+	 * mapped file such that the file's total number of 512-byte sectors,
+	 * including data and all indirect blocks, does not exceed (2^48 - 1).
+	 *
+	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
+	 * number of 512-byte sectors of the file.
+	 */
+
+	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
+		/*
+		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
+		 * the inode i_block field represents total file blocks in
+		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
+		 */
+		upper_limit = (1LL << 32) - 1;
+
+		/* total blocks in file system block size */
+		upper_limit >>= (bits - 9);
+
+	} else {
+		/*
+		 * We use 48 bit ext4_inode i_blocks
+		 * With EXT4_HUGE_FILE_FL set the i_blocks
+		 * represent total number of blocks in
+		 * file system block size
+		 */
+		upper_limit = (1LL << 48) - 1;
+
+	}
+
+	/* indirect blocks */
+	meta_blocks = 1;
+	/* double indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2));
+	/* tripple indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+
+	upper_limit -= meta_blocks;
+	upper_limit <<= bits;
+
+	res += 1LL << (bits-2);
+	res += 1LL << (2*(bits-2));
+	res += 1LL << (3*(bits-2));
+	res <<= bits;
+	if (res > upper_limit)
+		res = upper_limit;
+
+	if (res > MAX_LFS_FILESIZE)
+		res = MAX_LFS_FILESIZE;
+
+	return res;
+}
+
+static ext4_fsblk_t descriptor_loc(struct super_block *sb,
+				   ext4_fsblk_t logical_sb_block, int nr)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t bg, first_meta_bg;
+	int has_super = 0;
+
+	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+	    nr < first_meta_bg)
+		return logical_sb_block + nr + 1;
+	bg = sbi->s_desc_per_block * nr;
+	if (ext4_bg_has_super(sb, bg))
+		has_super = 1;
+
+	/*
+	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
+	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
+	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
+	 * compensate.
+	 */
+	if (sb->s_blocksize == 1024 && nr == 0 &&
+	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0)
+		has_super++;
+
+	return (has_super + ext4_group_first_block_no(sb, bg));
+}
+
+/**
+ * ext4_get_stripe_size: Get the stripe size.
+ * @sbi: In memory super block info
+ *
+ * If we have specified it via mount option, then
+ * use the mount option value. If the value specified at mount time is
+ * greater than the blocks per group use the super block value.
+ * If the super block value is greater than blocks per group return 0.
+ * Allocator needs it be less than blocks per group.
+ *
+ */
+static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
+{
+	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
+	unsigned long stripe_width =
+			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+	int ret;
+
+	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
+		ret = sbi->s_stripe;
+	else if (stripe_width <= sbi->s_blocks_per_group)
+		ret = stripe_width;
+	else if (stride <= sbi->s_blocks_per_group)
+		ret = stride;
+	else
+		ret = 0;
+
+	/*
+	 * If the stripe width is 1, this makes no sense and
+	 * we set it to 0 to turn off stripe handling code.
+	 */
+	if (ret <= 1)
+		ret = 0;
+
+	return ret;
+}
+
+/* sysfs supprt */
+
+struct ext4_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
+			 const char *, size_t);
+	union {
+		int offset;
+		int deprecated_val;
+	} u;
+};
+
+static int parse_strtoull(const char *buf,
+		unsigned long long max, unsigned long long *value)
+{
+	int ret;
+
+	ret = kstrtoull(skip_spaces(buf), 0, value);
+	if (!ret && *value > max)
+		ret = -EINVAL;
+	return ret;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(s64) EXT4_C2B(sbi,
+			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+					 struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
+	return snprintf(buf, PAGE_SIZE, "%lu\n",
+			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			 sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(unsigned long long)(sbi->s_kbytes_written +
+			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi,
+					  const char *buf, size_t count)
+{
+	unsigned long t;
+	int ret;
+
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
+
+	if (t && (!is_power_of_2(t) || t > 0x40000000))
+		return -EINVAL;
+
+	sbi->s_inode_readahead_blks = t;
+	return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+			   struct ext4_sb_info *sbi, char *buf)
+{
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+			    struct ext4_sb_info *sbi,
+			    const char *buf, size_t count)
+{
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
+	unsigned long t;
+	int ret;
+
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
+	*ui = t;
+	return count;
+}
+
+static ssize_t es_ui_show(struct ext4_attr *a,
+			   struct ext4_sb_info *sbi, char *buf)
+{
+
+	unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
+			   a->u.offset);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+				  struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi,
+				   const char *buf, size_t count)
+{
+	unsigned long long val;
+	int ret;
+
+	if (parse_strtoull(buf, -1ULL, &val))
+		return -EINVAL;
+	ret = ext4_reserve_clusters(sbi, val);
+
+	return ret ? ret : count;
+}
+
+static ssize_t trigger_test_error(struct ext4_attr *a,
+				  struct ext4_sb_info *sbi,
+				  const char *buf, size_t count)
+{
+	int len = count;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (len && buf[len-1] == '\n')
+		len--;
+
+	if (len)
+		ext4_error(sbi->s_sb, "%.*s", len, buf);
+	return count;
+}
+
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,					\
+	.store	= _store,					\
+	.u = {							\
+		.offset = offsetof(struct ext4_sb_info, _elname),\
+	},							\
+}
+
+#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)		\
+static struct ext4_attr ext4_attr_##_name = {				\
+	.attr = {.name = __stringify(_name), .mode = _mode },		\
+	.show	= _show,						\
+	.store	= _store,						\
+	.u = {								\
+		.offset = offsetof(struct ext4_super_block, _elname),	\
+	},								\
+}
+
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+
+#define EXT4_RO_ATTR_ES_UI(name, elname)	\
+	EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
+#define EXT4_RW_ATTR_SBI_UI(name, elname)	\
+	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val)	\
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = 0444 },	\
+	.show	= sbi_deprecated_show,				\
+	.u = {							\
+		.deprecated_val = _val,				\
+	},							\
+}
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+		 inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
+EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+
+static struct attribute *ext4_attrs[] = {
+	ATTR_LIST(delayed_allocation_blocks),
+	ATTR_LIST(session_write_kbytes),
+	ATTR_LIST(lifetime_write_kbytes),
+	ATTR_LIST(reserved_clusters),
+	ATTR_LIST(inode_readahead_blks),
+	ATTR_LIST(inode_goal),
+	ATTR_LIST(mb_stats),
+	ATTR_LIST(mb_max_to_scan),
+	ATTR_LIST(mb_min_to_scan),
+	ATTR_LIST(mb_order2_req),
+	ATTR_LIST(mb_stream_req),
+	ATTR_LIST(mb_group_prealloc),
+	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(extent_max_zeroout_kb),
+	ATTR_LIST(trigger_fs_error),
+	ATTR_LIST(err_ratelimit_interval_ms),
+	ATTR_LIST(err_ratelimit_burst),
+	ATTR_LIST(warning_ratelimit_interval_ms),
+	ATTR_LIST(warning_ratelimit_burst),
+	ATTR_LIST(msg_ratelimit_interval_ms),
+	ATTR_LIST(msg_ratelimit_burst),
+	ATTR_LIST(errors_count),
+	ATTR_LIST(first_error_time),
+	ATTR_LIST(last_error_time),
+	NULL,
+};
+
+/* Features this copy of ext4 supports */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+EXT4_INFO_ATTR(meta_bg_resize);
+EXT4_INFO_ATTR(encryption);
+
+static struct attribute *ext4_feat_attrs[] = {
+	ATTR_LIST(lazy_itable_init),
+	ATTR_LIST(batched_discard),
+	ATTR_LIST(meta_bg_resize),
+	ATTR_LIST(encryption),
+	NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops ext4_attr_ops = {
+	.show	= ext4_attr_show,
+	.store	= ext4_attr_store,
+};
+
+static struct kobj_type ext4_ktype = {
+	.default_attrs	= ext4_attrs,
+	.sysfs_ops	= &ext4_attr_ops,
+	.release	= ext4_sb_release,
+};
+
+static void ext4_feat_release(struct kobject *kobj)
+{
+	complete(&ext4_feat->f_kobj_unregister);
+}
+
+static ssize_t ext4_feat_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "supported\n");
+}
+
+/*
+ * We can not use ext4_attr_show/store because it relies on the kobject
+ * being embedded in the ext4_sb_info structure which is definitely not
+ * true in this case.
+ */
+static const struct sysfs_ops ext4_feat_ops = {
+	.show	= ext4_feat_show,
+	.store	= NULL,
+};
+
+static struct kobj_type ext4_feat_ktype = {
+	.default_attrs	= ext4_feat_attrs,
+	.sysfs_ops	= &ext4_feat_ops,
+	.release	= ext4_feat_release,
+};
+
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount because of "
+			"unsupported optional features (%x)",
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+			~EXT4_FEATURE_INCOMPAT_SUPP));
+		return 0;
+	}
+
+	if (readonly)
+		return 1;
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
+		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
+		sb->s_flags |= MS_RDONLY;
+		return 1;
+	}
+
+	/* Check that feature set is OK for a read-write mount */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+			 "unsupported optional features (%x)",
+			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+				~EXT4_FEATURE_RO_COMPAT_SUPP));
+		return 0;
+	}
+	/*
+	 * Large file size enabled file system can only be mounted
+	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+	 */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		if (sizeof(blkcnt_t) < sizeof(u64)) {
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+				 "cannot be mounted RDWR without "
+				 "CONFIG_LBDAF");
+			return 0;
+		}
+	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
+	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Can't support bigalloc feature without "
+			 "extents feature\n");
+		return 0;
+	}
+
+#ifndef CONFIG_QUOTA
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+	    !readonly) {
+		ext4_msg(sb, KERN_ERR,
+			 "Filesystem with quota feature cannot be mounted RDWR "
+			 "without CONFIG_QUOTA");
+		return 0;
+	}
+#endif  /* CONFIG_QUOTA */
+	return 1;
+}
+
+/*
+ * This function is called once a day if we have errors logged
+ * on the file system
+ */
+static void print_daily_error_info(unsigned long arg)
+{
+	struct super_block *sb = (struct super_block *) arg;
+	struct ext4_sb_info *sbi;
+	struct ext4_super_block *es;
+
+	sbi = EXT4_SB(sb);
+	es = sbi->s_es;
+
+	if (es->s_error_count)
+		/* fsck newer than v1.41.13 is needed to clean this condition. */
+		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
+			 le32_to_cpu(es->s_error_count));
+	if (es->s_first_error_time) {
+		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
+		       sb->s_id, le32_to_cpu(es->s_first_error_time),
+		       (int) sizeof(es->s_first_error_func),
+		       es->s_first_error_func,
+		       le32_to_cpu(es->s_first_error_line));
+		if (es->s_first_error_ino)
+			printk(": inode %u",
+			       le32_to_cpu(es->s_first_error_ino));
+		if (es->s_first_error_block)
+			printk(": block %llu", (unsigned long long)
+			       le64_to_cpu(es->s_first_error_block));
+		printk("\n");
+	}
+	if (es->s_last_error_time) {
+		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
+		       sb->s_id, le32_to_cpu(es->s_last_error_time),
+		       (int) sizeof(es->s_last_error_func),
+		       es->s_last_error_func,
+		       le32_to_cpu(es->s_last_error_line));
+		if (es->s_last_error_ino)
+			printk(": inode %u",
+			       le32_to_cpu(es->s_last_error_ino));
+		if (es->s_last_error_block)
+			printk(": block %llu", (unsigned long long)
+			       le64_to_cpu(es->s_last_error_block));
+		printk("\n");
+	}
+	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
+}
+
+/* Find next suitable group and run ext4_init_inode_table */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_group_desc *gdp = NULL;
+	ext4_group_t group, ngroups;
+	struct super_block *sb;
+	unsigned long timeout = 0;
+	int ret = 0;
+
+	sb = elr->lr_super;
+	ngroups = EXT4_SB(sb)->s_groups_count;
+
+	sb_start_write(sb);
+	for (group = elr->lr_next_group; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp) {
+			ret = 1;
+			break;
+		}
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	if (group >= ngroups)
+		ret = 1;
+
+	if (!ret) {
+		timeout = jiffies;
+		ret = ext4_init_inode_table(sb, group,
+					    elr->lr_timeout ? 0 : 1);
+		if (elr->lr_timeout == 0) {
+			timeout = (jiffies - timeout) *
+				  elr->lr_sbi->s_li_wait_mult;
+			elr->lr_timeout = timeout;
+		}
+		elr->lr_next_sched = jiffies + elr->lr_timeout;
+		elr->lr_next_group = group + 1;
+	}
+	sb_end_write(sb);
+
+	return ret;
+}
+
+/*
+ * Remove lr_request from the list_request and free the
+ * request structure. Should be called with li_list_mtx held
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_sb_info *sbi;
+
+	if (!elr)
+		return;
+
+	sbi = elr->lr_sbi;
+
+	list_del(&elr->lr_request);
+	sbi->s_li_request = NULL;
+	kfree(elr);
+}
+
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+	mutex_lock(&ext4_li_mtx);
+	if (!ext4_li_info) {
+		mutex_unlock(&ext4_li_mtx);
+		return;
+	}
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+	mutex_unlock(&ext4_li_mtx);
+}
+
+static struct task_struct *ext4_lazyinit_task;
+
+/*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_rn_li_request) and keep track of the time spend in this
+ * function. Based on that time we compute next schedule time of
+ * the request. When walking through the list is complete, compute
+ * next waking time and put itself into sleep.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+	unsigned long next_wakeup, cur;
+
+	BUG_ON(NULL == eli);
+
+cont_thread:
+	while (true) {
+		next_wakeup = MAX_JIFFY_OFFSET;
+
+		mutex_lock(&eli->li_list_mtx);
+		if (list_empty(&eli->li_request_list)) {
+			mutex_unlock(&eli->li_list_mtx);
+			goto exit_thread;
+		}
+
+		list_for_each_safe(pos, n, &eli->li_request_list) {
+			elr = list_entry(pos, struct ext4_li_request,
+					 lr_request);
+
+			if (time_after_eq(jiffies, elr->lr_next_sched)) {
+				if (ext4_run_li_request(elr) != 0) {
+					/* error, remove the lazy_init job */
+					ext4_remove_li_request(elr);
+					continue;
+				}
+			}
+
+			if (time_before(elr->lr_next_sched, next_wakeup))
+				next_wakeup = elr->lr_next_sched;
+		}
+		mutex_unlock(&eli->li_list_mtx);
+
+		try_to_freeze();
+
+		cur = jiffies;
+		if ((time_after_eq(cur, next_wakeup)) ||
+		    (MAX_JIFFY_OFFSET == next_wakeup)) {
+			cond_resched();
+			continue;
+		}
+
+		schedule_timeout_interruptible(next_wakeup - cur);
+
+		if (kthread_should_stop()) {
+			ext4_clear_request_list();
+			goto exit_thread;
+		}
+	}
+
+exit_thread:
+	/*
+	 * It looks like the request list is empty, but we need
+	 * to check it under the li_list_mtx lock, to prevent any
+	 * additions into it, and of course we should lock ext4_li_mtx
+	 * to atomically free the list and ext4_li_info, because at
+	 * this point another ext4 filesystem could be registering
+	 * new one.
+	 */
+	mutex_lock(&ext4_li_mtx);
+	mutex_lock(&eli->li_list_mtx);
+	if (!list_empty(&eli->li_request_list)) {
+		mutex_unlock(&eli->li_list_mtx);
+		mutex_unlock(&ext4_li_mtx);
+		goto cont_thread;
+	}
+	mutex_unlock(&eli->li_list_mtx);
+	kfree(ext4_li_info);
+	ext4_li_info = NULL;
+	mutex_unlock(&ext4_li_mtx);
+
+	return 0;
+}
+
+static void ext4_clear_request_list(void)
+{
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+		elr = list_entry(pos, struct ext4_li_request,
+				 lr_request);
+		ext4_remove_li_request(elr);
+	}
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+static int ext4_run_lazyinit_thread(void)
+{
+	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
+					 ext4_li_info, "ext4lazyinit");
+	if (IS_ERR(ext4_lazyinit_task)) {
+		int err = PTR_ERR(ext4_lazyinit_task);
+		ext4_clear_request_list();
+		kfree(ext4_li_info);
+		ext4_li_info = NULL;
+		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
+				 "initialization thread\n",
+				 err);
+		return err;
+	}
+	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+	return 0;
+}
+
+/*
+ * Check whether it make sense to run itable init. thread or not.
+ * If there is at least one uninitialized inode table, return
+ * corresponding group number, else the loop goes through all
+ * groups and return total number of groups.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
+	struct ext4_group_desc *gdp = NULL;
+
+	for (group = 0; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp)
+			continue;
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	return group;
+}
+
+static int ext4_li_info_new(void)
+{
+	struct ext4_lazy_init *eli = NULL;
+
+	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+	if (!eli)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&eli->li_request_list);
+	mutex_init(&eli->li_list_mtx);
+
+	eli->li_state |= EXT4_LAZYINIT_QUIT;
+
+	ext4_li_info = eli;
+
+	return 0;
+}
+
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+					    ext4_group_t start)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+
+	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+	if (!elr)
+		return NULL;
+
+	elr->lr_super = sb;
+	elr->lr_sbi = sbi;
+	elr->lr_next_group = start;
+
+	/*
+	 * Randomize first schedule time of the request to
+	 * spread the inode table initialization requests
+	 * better.
+	 */
+	elr->lr_next_sched = jiffies + (prandom_u32() %
+				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
+	return elr;
+}
+
+int ext4_register_li_request(struct super_block *sb,
+			     ext4_group_t first_not_zeroed)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr = NULL;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	int ret = 0;
+
+	mutex_lock(&ext4_li_mtx);
+	if (sbi->s_li_request != NULL) {
+		/*
+		 * Reset timeout so it can be computed again, because
+		 * s_li_wait_mult might have changed.
+		 */
+		sbi->s_li_request->lr_timeout = 0;
+		goto out;
+	}
+
+	if (first_not_zeroed == ngroups ||
+	    (sb->s_flags & MS_RDONLY) ||
+	    !test_opt(sb, INIT_INODE_TABLE))
+		goto out;
+
+	elr = ext4_li_request_new(sb, first_not_zeroed);
+	if (!elr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (NULL == ext4_li_info) {
+		ret = ext4_li_info_new();
+		if (ret)
+			goto out;
+	}
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+
+	sbi->s_li_request = elr;
+	/*
+	 * set elr to NULL here since it has been inserted to
+	 * the request_list and the removal and free of it is
+	 * handled by ext4_clear_request_list from now on.
+	 */
+	elr = NULL;
+
+	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+		ret = ext4_run_lazyinit_thread();
+		if (ret)
+			goto out;
+	}
+out:
+	mutex_unlock(&ext4_li_mtx);
+	if (ret)
+		kfree(elr);
+	return ret;
+}
+
+/*
+ * We do not need to lock anything since this is called on
+ * module unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+	/*
+	 * If thread exited earlier
+	 * there's nothing to be done.
+	 */
+	if (!ext4_li_info || !ext4_lazyinit_task)
+		return;
+
+	kthread_stop(ext4_lazyinit_task);
+}
+
+static int set_journal_csum_feature_set(struct super_block *sb)
+{
+	int ret = 1;
+	int compat, incompat;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (ext4_has_metadata_csum(sb)) {
+		/* journal checksum v3 */
+		compat = 0;
+		incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
+	} else {
+		/* journal checksum v1 */
+		compat = JBD2_FEATURE_COMPAT_CHECKSUM;
+		incompat = 0;
+	}
+
+	jbd2_journal_clear_features(sbi->s_journal,
+			JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+			JBD2_FEATURE_INCOMPAT_CSUM_V3 |
+			JBD2_FEATURE_INCOMPAT_CSUM_V2);
+	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+		ret = jbd2_journal_set_features(sbi->s_journal,
+				compat, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+				incompat);
+	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+		ret = jbd2_journal_set_features(sbi->s_journal,
+				compat, 0,
+				incompat);
+		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+	} else {
+		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+	}
+
+	return ret;
+}
+
+/*
+ * Note: calculating the overhead so we can be compatible with
+ * historical BSD practice is quite difficult in the face of
+ * clusters/bigalloc.  This is because multiple metadata blocks from
+ * different block group can end up in the same allocation cluster.
+ * Calculating the exact overhead in the face of clustered allocation
+ * requires either O(all block bitmaps) in memory or O(number of block
+ * groups**2) in time.  We will still calculate the superblock for
+ * older file systems --- and if we come across with a bigalloc file
+ * system with zero in s_overhead_clusters the estimate will be close to
+ * correct especially for very large cluster sizes --- but for newer
+ * file systems, it's better to calculate this figure once at mkfs
+ * time, and store it in the superblock.  If the superblock value is
+ * present (even for non-bigalloc file systems), we will use it.
+ */
+static int count_overhead(struct super_block *sb, ext4_group_t grp,
+			  char *buf)
+{
+	struct ext4_sb_info	*sbi = EXT4_SB(sb);
+	struct ext4_group_desc	*gdp;
+	ext4_fsblk_t		first_block, last_block, b;
+	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
+	int			s, j, count = 0;
+
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+		return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+			sbi->s_itb_per_group + 2);
+
+	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
+		(grp * EXT4_BLOCKS_PER_GROUP(sb));
+	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		b = ext4_block_bitmap(sb, gdp);
+		if (b >= first_block && b <= last_block) {
+			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
+			count++;
+		}
+		b = ext4_inode_bitmap(sb, gdp);
+		if (b >= first_block && b <= last_block) {
+			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
+			count++;
+		}
+		b = ext4_inode_table(sb, gdp);
+		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
+			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
+				int c = EXT4_B2C(sbi, b - first_block);
+				ext4_set_bit(c, buf);
+				count++;
+			}
+		if (i != grp)
+			continue;
+		s = 0;
+		if (ext4_bg_has_super(sb, grp)) {
+			ext4_set_bit(s++, buf);
+			count++;
+		}
+		for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
+			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
+			count++;
+		}
+	}
+	if (!count)
+		return 0;
+	return EXT4_CLUSTERS_PER_GROUP(sb) -
+		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
+}
+
+/*
+ * Compute the overhead and stash it in sbi->s_overhead
+ */
+int ext4_calculate_overhead(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+	ext4_fsblk_t overhead = 0;
+	char *buf = (char *) get_zeroed_page(GFP_NOFS);
+
+	if (!buf)
+		return -ENOMEM;
+
+	/*
+	 * Compute the overhead (FS structures).  This is constant
+	 * for a given filesystem unless the number of block groups
+	 * changes so we cache the previous value until it does.
+	 */
+
+	/*
+	 * All of the blocks before first_data_block are overhead
+	 */
+	overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
+
+	/*
+	 * Add the overhead found in each block group
+	 */
+	for (i = 0; i < ngroups; i++) {
+		int blks;
+
+		blks = count_overhead(sb, i, buf);
+		overhead += blks;
+		if (blks)
+			memset(buf, 0, PAGE_SIZE);
+		cond_resched();
+	}
+	/* Add the internal journal blocks as well */
+	if (sbi->s_journal && !sbi->journal_bdev)
+		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
+
+	sbi->s_overhead = overhead;
+	smp_wmb();
+	free_page((unsigned long) buf);
+	return 0;
+}
+
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
+{
+	ext4_fsblk_t resv_clusters;
+
+	/*
+	 * There's no need to reserve anything when we aren't using extents.
+	 * The space estimates are exact, there are no unwritten extents,
+	 * hole punching doesn't need new metadata... This is needed especially
+	 * to keep ext2/3 backward compatibility.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+		return 0;
+	/*
+	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
+	 * This should cover the situations where we can not afford to run
+	 * out of space like for example punch hole, or converting
+	 * unwritten extents in delalloc path. In most cases such
+	 * allocation would require 1, or 2 blocks, higher numbers are
+	 * very rare.
+	 */
+	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+			EXT4_SB(sb)->s_cluster_bits;
+
+	do_div(resv_clusters, 50);
+	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+	return resv_clusters;
+}
+
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+	ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+				sbi->s_cluster_bits;
+
+	if (count >= clusters)
+		return -EINVAL;
+
+	atomic64_set(&sbi->s_resv_clusters, count);
+	return 0;
+}
+
+static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+{
+	char *orig_data = kstrdup(data, GFP_KERNEL);
+	struct buffer_head *bh;
+	struct ext4_super_block *es = NULL;
+	struct ext4_sb_info *sbi;
+	ext4_fsblk_t block;
+	ext4_fsblk_t sb_block = get_sb_block(&data);
+	ext4_fsblk_t logical_sb_block;
+	unsigned long offset = 0;
+	unsigned long journal_devnum = 0;
+	unsigned long def_mount_opts;
+	struct inode *root;
+	char *cp;
+	const char *descr;
+	int ret = -ENOMEM;
+	int blocksize, clustersize;
+	unsigned int db_count;
+	unsigned int i;
+	int needs_recovery, has_huge_files, has_bigalloc;
+	__u64 blocks_count;
+	int err = 0;
+	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	ext4_group_t first_not_zeroed;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		goto out_free_orig;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		goto out_free_orig;
+	}
+	sb->s_fs_info = sbi;
+	sbi->s_sb = sb;
+	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+	sbi->s_sb_block = sb_block;
+	if (sb->s_bdev->bd_part)
+		sbi->s_sectors_written_start =
+			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	/* Modes of operations for file and directory encryption. */
+	sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
+	sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
+#endif
+
+	/* Cleanup superblock name */
+	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+		*cp = '!';
+
+	/* -EINVAL is default */
+	ret = -EINVAL;
+	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+	if (!blocksize) {
+		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
+		goto out_fail;
+	}
+
+	/*
+	 * The ext4 superblock will not be buffer aligned for other than 1kB
+	 * block sizes.  We need to calculate the offset from buffer start.
+	 */
+	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+		offset = do_div(logical_sb_block, blocksize);
+	} else {
+		logical_sb_block = sb_block;
+	}
+
+	if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
+		ext4_msg(sb, KERN_ERR, "unable to read superblock");
+		goto out_fail;
+	}
+	/*
+	 * Note: s_es must be initialized as soon as possible because
+	 *       some ext4 macro-instructions depend on its value
+	 */
+	es = (struct ext4_super_block *) (bh->b_data + offset);
+	sbi->s_es = es;
+	sb->s_magic = le16_to_cpu(es->s_magic);
+	if (sb->s_magic != EXT4_SUPER_MAGIC)
+		goto cantfind_ext4;
+	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+	/* Warn if metadata_csum and gdt_csum are both set. */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+	    EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+		ext4_warning(sb, "metadata_csum and uninit_bg are "
+			     "redundant flags; please run fsck.");
+
+	/* Check for a known checksum algorithm */
+	if (!ext4_verify_csum_type(sb, es)) {
+		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+			 "unknown checksum algorithm.");
+		silent = 1;
+		goto cantfind_ext4;
+	}
+
+	/* Load the checksum driver */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+		if (IS_ERR(sbi->s_chksum_driver)) {
+			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+			ret = PTR_ERR(sbi->s_chksum_driver);
+			sbi->s_chksum_driver = NULL;
+			goto failed_mount;
+		}
+	}
+
+	/* Check superblock checksum */
+	if (!ext4_superblock_csum_verify(sb, es)) {
+		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+			 "invalid superblock checksum.  Run e2fsck?");
+		silent = 1;
+		goto cantfind_ext4;
+	}
+
+	/* Precompute checksum seed for all metadata */
+	if (ext4_has_metadata_csum(sb))
+		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+					       sizeof(es->s_uuid));
+
+	/* Set defaults before we parse the mount options */
+	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+	set_opt(sb, INIT_INODE_TABLE);
+	if (def_mount_opts & EXT4_DEFM_DEBUG)
+		set_opt(sb, DEBUG);
+	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+		set_opt(sb, GRPID);
+	if (def_mount_opts & EXT4_DEFM_UID16)
+		set_opt(sb, NO_UID32);
+	/* xattr user namespace & acls are now defaulted on */
+	set_opt(sb, XATTR_USER);
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+	set_opt(sb, POSIX_ACL);
+#endif
+	/* don't forget to enable journal_csum when metadata_csum is enabled. */
+	if (ext4_has_metadata_csum(sb))
+		set_opt(sb, JOURNAL_CHECKSUM);
+
+	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
+		set_opt(sb, JOURNAL_DATA);
+	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
+		set_opt(sb, ORDERED_DATA);
+	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
+		set_opt(sb, WRITEBACK_DATA);
+
+	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+		set_opt(sb, ERRORS_PANIC);
+	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+		set_opt(sb, ERRORS_CONT);
+	else
+		set_opt(sb, ERRORS_RO);
+	/* block_validity enabled by default; disable with noblock_validity */
+	set_opt(sb, BLOCK_VALIDITY);
+	if (def_mount_opts & EXT4_DEFM_DISCARD)
+		set_opt(sb, DISCARD);
+
+	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+
+	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
+		set_opt(sb, BARRIER);
+
+	/*
+	 * enable delayed allocation by default
+	 * Use -o nodelalloc to turn it off
+	 */
+	if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
+	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
+		set_opt(sb, DELALLOC);
+
+	/*
+	 * set default s_li_wait_mult for lazyinit, for the case there is
+	 * no mount option specified.
+	 */
+	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
+	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
+			   &journal_devnum, &journal_ioprio, 0)) {
+		ext4_msg(sb, KERN_WARNING,
+			 "failed to parse options in superblock: %s",
+			 sbi->s_es->s_mount_opts);
+	}
+	sbi->s_def_mount_opt = sbi->s_mount_opt;
+	if (!parse_options((char *) data, sb, &journal_devnum,
+			   &journal_ioprio, 0))
+		goto failed_mount;
+
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+			    "with data=journal disables delayed "
+			    "allocation and O_DIRECT support!\n");
+		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and delalloc");
+			goto failed_mount;
+		}
+		if (test_opt(sb, DIOREAD_NOLOCK)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and dioread_nolock");
+			goto failed_mount;
+		}
+		if (test_opt(sb, DAX)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and dax");
+			goto failed_mount;
+		}
+		if (test_opt(sb, DELALLOC))
+			clear_opt(sb, DELALLOC);
+	}
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+
+	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
+	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
+	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+		ext4_msg(sb, KERN_WARNING,
+		       "feature flags set on rev 0 fs, "
+		       "running e2fsck is recommended");
+
+	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
+		set_opt2(sb, HURD_COMPAT);
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+					      EXT4_FEATURE_INCOMPAT_64BIT)) {
+			ext4_msg(sb, KERN_ERR,
+				 "The Hurd can't support 64-bit file systems");
+			goto failed_mount;
+		}
+	}
+
+	if (IS_EXT2_SB(sb)) {
+		if (ext2_feature_set_ok(sb))
+			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+				 "using the ext4 subsystem");
+		else {
+			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+				 "to feature incompatibilities");
+			goto failed_mount;
+		}
+	}
+
+	if (IS_EXT3_SB(sb)) {
+		if (ext3_feature_set_ok(sb))
+			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+				 "using the ext4 subsystem");
+		else {
+			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+				 "to feature incompatibilities");
+			goto failed_mount;
+		}
+	}
+
+	/*
+	 * Check feature flags regardless of the revision level, since we
+	 * previously didn't change the revision level when setting the flags,
+	 * so there is a chance incompat flags are set on a rev 0 filesystem.
+	 */
+	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
+		goto failed_mount;
+
+	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+	    blocksize > EXT4_MAX_BLOCK_SIZE) {
+		ext4_msg(sb, KERN_ERR,
+		       "Unsupported filesystem blocksize %d", blocksize);
+		goto failed_mount;
+	}
+
+	if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+		if (blocksize != PAGE_SIZE) {
+			ext4_msg(sb, KERN_ERR,
+					"error: unsupported blocksize for dax");
+			goto failed_mount;
+		}
+		if (!sb->s_bdev->bd_disk->fops->direct_access) {
+			ext4_msg(sb, KERN_ERR,
+					"error: device does not support dax");
+			goto failed_mount;
+		}
+	}
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
+	    es->s_encryption_level) {
+		ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
+			 es->s_encryption_level);
+		goto failed_mount;
+	}
+
+	if (sb->s_blocksize != blocksize) {
+		/* Validate the filesystem blocksize */
+		if (!sb_set_blocksize(sb, blocksize)) {
+			ext4_msg(sb, KERN_ERR, "bad block size %d",
+					blocksize);
+			goto failed_mount;
+		}
+
+		brelse(bh);
+		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+		offset = do_div(logical_sb_block, blocksize);
+		bh = sb_bread_unmovable(sb, logical_sb_block);
+		if (!bh) {
+			ext4_msg(sb, KERN_ERR,
+			       "Can't read superblock on 2nd try");
+			goto failed_mount;
+		}
+		es = (struct ext4_super_block *)(bh->b_data + offset);
+		sbi->s_es = es;
+		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+			ext4_msg(sb, KERN_ERR,
+			       "Magic mismatch, very weird!");
+			goto failed_mount;
+		}
+	}
+
+	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+						      has_huge_files);
+	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
+
+	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
+		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
+		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
+	} else {
+		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
+		    (!is_power_of_2(sbi->s_inode_size)) ||
+		    (sbi->s_inode_size > blocksize)) {
+			ext4_msg(sb, KERN_ERR,
+			       "unsupported inode size: %d",
+			       sbi->s_inode_size);
+			goto failed_mount;
+		}
+		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
+			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
+	}
+
+	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
+		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
+		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
+		    !is_power_of_2(sbi->s_desc_size)) {
+			ext4_msg(sb, KERN_ERR,
+			       "unsupported descriptor size %lu",
+			       sbi->s_desc_size);
+			goto failed_mount;
+		}
+	} else
+		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+
+	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
+		goto cantfind_ext4;
+
+	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
+	if (sbi->s_inodes_per_block == 0)
+		goto cantfind_ext4;
+	sbi->s_itb_per_group = sbi->s_inodes_per_group /
+					sbi->s_inodes_per_block;
+	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
+	sbi->s_sbh = bh;
+	sbi->s_mount_state = le16_to_cpu(es->s_state);
+	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
+	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+
+	for (i = 0; i < 4; i++)
+		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
+	sbi->s_def_hash_version = es->s_def_hash_version;
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+		i = le32_to_cpu(es->s_flags);
+		if (i & EXT2_FLAGS_UNSIGNED_HASH)
+			sbi->s_hash_unsigned = 3;
+		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+			if (!(sb->s_flags & MS_RDONLY))
+				es->s_flags |=
+					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+			sbi->s_hash_unsigned = 3;
+#else
+			if (!(sb->s_flags & MS_RDONLY))
+				es->s_flags |=
+					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+		}
+	}
+
+	/* Handle clustersize */
+	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+	has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+	if (has_bigalloc) {
+		if (clustersize < blocksize) {
+			ext4_msg(sb, KERN_ERR,
+				 "cluster size (%d) smaller than "
+				 "block size (%d)", clustersize, blocksize);
+			goto failed_mount;
+		}
+		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+			le32_to_cpu(es->s_log_block_size);
+		sbi->s_clusters_per_group =
+			le32_to_cpu(es->s_clusters_per_group);
+		if (sbi->s_clusters_per_group > blocksize * 8) {
+			ext4_msg(sb, KERN_ERR,
+				 "#clusters per group too big: %lu",
+				 sbi->s_clusters_per_group);
+			goto failed_mount;
+		}
+		if (sbi->s_blocks_per_group !=
+		    (sbi->s_clusters_per_group * (clustersize / blocksize))) {
+			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+				 "clusters per group (%lu) inconsistent",
+				 sbi->s_blocks_per_group,
+				 sbi->s_clusters_per_group);
+			goto failed_mount;
+		}
+	} else {
+		if (clustersize != blocksize) {
+			ext4_warning(sb, "fragment/cluster size (%d) != "
+				     "block size (%d)", clustersize,
+				     blocksize);
+			clustersize = blocksize;
+		}
+		if (sbi->s_blocks_per_group > blocksize * 8) {
+			ext4_msg(sb, KERN_ERR,
+				 "#blocks per group too big: %lu",
+				 sbi->s_blocks_per_group);
+			goto failed_mount;
+		}
+		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+		sbi->s_cluster_bits = 0;
+	}
+	sbi->s_cluster_ratio = clustersize / blocksize;
+
+	if (sbi->s_inodes_per_group > blocksize * 8) {
+		ext4_msg(sb, KERN_ERR,
+		       "#inodes per group too big: %lu",
+		       sbi->s_inodes_per_group);
+		goto failed_mount;
+	}
+
+	/* Do we have standard group size of clustersize * 8 blocks ? */
+	if (sbi->s_blocks_per_group == clustersize << 3)
+		set_opt2(sb, STD_GROUP_SIZE);
+
+	/*
+	 * Test whether we have more sectors than will fit in sector_t,
+	 * and whether the max offset is addressable by the page cache.
+	 */
+	err = generic_check_addressable(sb->s_blocksize_bits,
+					ext4_blocks_count(es));
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "filesystem"
+			 " too large to mount safely on this system");
+		if (sizeof(sector_t) < 8)
+			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+		goto failed_mount;
+	}
+
+	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
+		goto cantfind_ext4;
+
+	/* check blocks count against device size */
+	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+		       "exceeds size of device (%llu blocks)",
+		       ext4_blocks_count(es), blocks_count);
+		goto failed_mount;
+	}
+
+	/*
+	 * It makes no sense for the first data block to be beyond the end
+	 * of the filesystem.
+	 */
+	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+			 "block %u is beyond end of filesystem (%llu)",
+			 le32_to_cpu(es->s_first_data_block),
+			 ext4_blocks_count(es));
+		goto failed_mount;
+	}
+	blocks_count = (ext4_blocks_count(es) -
+			le32_to_cpu(es->s_first_data_block) +
+			EXT4_BLOCKS_PER_GROUP(sb) - 1);
+	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+		ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
+		       "(block count %llu, first data block %u, "
+		       "blocks per group %lu)", sbi->s_groups_count,
+		       ext4_blocks_count(es),
+		       le32_to_cpu(es->s_first_data_block),
+		       EXT4_BLOCKS_PER_GROUP(sb));
+		goto failed_mount;
+	}
+	sbi->s_groups_count = blocks_count;
+	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+		   EXT4_DESC_PER_BLOCK(sb);
+	sbi->s_group_desc = ext4_kvmalloc(db_count *
+					  sizeof(struct buffer_head *),
+					  GFP_KERNEL);
+	if (sbi->s_group_desc == NULL) {
+		ext4_msg(sb, KERN_ERR, "not enough memory");
+		ret = -ENOMEM;
+		goto failed_mount;
+	}
+
+	if (ext4_proc_root)
+		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+	if (sbi->s_proc)
+		proc_create_data("options", S_IRUGO, sbi->s_proc,
+				 &ext4_seq_options_fops, sb);
+
+	bgl_lock_init(sbi->s_blockgroup_lock);
+
+	for (i = 0; i < db_count; i++) {
+		block = descriptor_loc(sb, logical_sb_block, i);
+		sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
+		if (!sbi->s_group_desc[i]) {
+			ext4_msg(sb, KERN_ERR,
+			       "can't read group descriptor %d", i);
+			db_count = i;
+			goto failed_mount2;
+		}
+	}
+	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
+		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+		goto failed_mount2;
+	}
+
+	sbi->s_gdb_count = db_count;
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	setup_timer(&sbi->s_err_report, print_daily_error_info,
+		(unsigned long) sb);
+
+	/* Register extent status tree shrinker */
+	if (ext4_es_register_shrinker(sbi))
+		goto failed_mount3;
+
+	sbi->s_stripe = ext4_get_stripe_size(sbi);
+	sbi->s_extent_max_zeroout_kb = 32;
+
+	/*
+	 * set up enough so that it can read an inode
+	 */
+	sb->s_op = &ext4_sops;
+	sb->s_export_op = &ext4_export_ops;
+	sb->s_xattr = ext4_xattr_handlers;
+#ifdef CONFIG_QUOTA
+	sb->dq_op = &ext4_quota_operations;
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+		sb->s_qcop = &dquot_quotactl_sysfile_ops;
+	else
+		sb->s_qcop = &ext4_qctl_operations;
+	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+#endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
+	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+	mutex_init(&sbi->s_orphan_lock);
+
+	sb->s_root = NULL;
+
+	needs_recovery = (es->s_last_orphan != 0 ||
+			  EXT4_HAS_INCOMPAT_FEATURE(sb,
+				    EXT4_FEATURE_INCOMPAT_RECOVER));
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+	    !(sb->s_flags & MS_RDONLY))
+		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+			goto failed_mount3a;
+
+	/*
+	 * The first inode we look at is the journal inode.  Don't try
+	 * root first: it may be modified in the journal!
+	 */
+	if (!test_opt(sb, NOLOAD) &&
+	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+		if (ext4_load_journal(sb, es, journal_devnum))
+			goto failed_mount3a;
+	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+		ext4_msg(sb, KERN_ERR, "required journal recovery "
+		       "suppressed and not mounted read-only");
+		goto failed_mount_wq;
+	} else {
+		clear_opt(sb, DATA_FLAGS);
+		sbi->s_journal = NULL;
+		needs_recovery = 0;
+		goto no_journal;
+	}
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
+	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+				       JBD2_FEATURE_INCOMPAT_64BIT)) {
+		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+		goto failed_mount_wq;
+	}
+
+	if (!set_journal_csum_feature_set(sb)) {
+		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+			 "feature set");
+		goto failed_mount_wq;
+	}
+
+	/* We have now updated the journal if required, so we can
+	 * validate the data journaling mode. */
+	switch (test_opt(sb, DATA_FLAGS)) {
+	case 0:
+		/* No mode set, assume a default based on the journal
+		 * capabilities: ORDERED_DATA if the journal can
+		 * cope, else JOURNAL_DATA
+		 */
+		if (jbd2_journal_check_available_features
+		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
+			set_opt(sb, ORDERED_DATA);
+		else
+			set_opt(sb, JOURNAL_DATA);
+		break;
+
+	case EXT4_MOUNT_ORDERED_DATA:
+	case EXT4_MOUNT_WRITEBACK_DATA:
+		if (!jbd2_journal_check_available_features
+		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+			ext4_msg(sb, KERN_ERR, "Journal does not support "
+			       "requested data journaling mode");
+			goto failed_mount_wq;
+		}
+	default:
+		break;
+	}
+	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+
+	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
+no_journal:
+	if (ext4_mballoc_ready) {
+		sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
+		if (!sbi->s_mb_cache) {
+			ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+			goto failed_mount_wq;
+		}
+	}
+
+	if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
+	    !(sb->s_flags & MS_RDONLY) &&
+	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
+		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+		ext4_commit_super(sb, 1);
+	}
+
+	/*
+	 * Get the # of file system overhead blocks from the
+	 * superblock if present.
+	 */
+	if (es->s_overhead_clusters)
+		sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+	else {
+		err = ext4_calculate_overhead(sb);
+		if (err)
+			goto failed_mount_wq;
+	}
+
+	/*
+	 * The maximum number of concurrent works can be high and
+	 * concurrency isn't really necessary.  Limit it to 1.
+	 */
+	EXT4_SB(sb)->rsv_conversion_wq =
+		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->rsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+		ret = -ENOMEM;
+		goto failed_mount4;
+	}
+
+	/*
+	 * The jbd2_journal_load will have done any necessary log recovery,
+	 * so we can safely mount the rest of the filesystem now.
+	 */
+
+	root = ext4_iget(sb, EXT4_ROOT_INO);
+	if (IS_ERR(root)) {
+		ext4_msg(sb, KERN_ERR, "get root inode failed");
+		ret = PTR_ERR(root);
+		root = NULL;
+		goto failed_mount4;
+	}
+	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
+		iput(root);
+		goto failed_mount4;
+	}
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		ext4_msg(sb, KERN_ERR, "get root dentry failed");
+		ret = -ENOMEM;
+		goto failed_mount4;
+	}
+
+	if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+		sb->s_flags |= MS_RDONLY;
+
+	/* determine the minimum size of new large inodes, if present */
+	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+						     EXT4_GOOD_OLD_INODE_SIZE;
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_want_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_want_extra_isize);
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_min_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_min_extra_isize);
+		}
+	}
+	/* Check if enough inode space is available */
+	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+							sbi->s_inode_size) {
+		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+						       EXT4_GOOD_OLD_INODE_SIZE;
+		ext4_msg(sb, KERN_INFO, "required extra inode space not"
+			 "available");
+	}
+
+	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+			 "reserved pool", ext4_calculate_resv_clusters(sb));
+		goto failed_mount4a;
+	}
+
+	err = ext4_setup_system_zone(sb);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "failed to initialize system "
+			 "zone (%d)", err);
+		goto failed_mount4a;
+	}
+
+	ext4_ext_init(sb);
+	err = ext4_mb_init(sb);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
+			 err);
+		goto failed_mount5;
+	}
+
+	block = ext4_count_free_clusters(sb);
+	ext4_free_blocks_count_set(sbi->s_es, 
+				   EXT4_C2B(sbi, block));
+	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
+				  GFP_KERNEL);
+	if (!err) {
+		unsigned long freei = ext4_count_free_inodes(sb);
+		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
+		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
+					  GFP_KERNEL);
+	}
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+					  ext4_count_dirs(sb), GFP_KERNEL);
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
+					  GFP_KERNEL);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount6;
+	}
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+		if (!ext4_fill_flex_info(sb)) {
+			ext4_msg(sb, KERN_ERR,
+			       "unable to initialize "
+			       "flex_bg meta info!");
+			goto failed_mount6;
+		}
+
+	err = ext4_register_li_request(sb, first_not_zeroed);
+	if (err)
+		goto failed_mount6;
+
+	sbi->s_kobj.kset = ext4_kset;
+	init_completion(&sbi->s_kobj_unregister);
+	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+				   "%s", sb->s_id);
+	if (err)
+		goto failed_mount7;
+
+#ifdef CONFIG_QUOTA
+	/* Enable quota usage during mount. */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+	    !(sb->s_flags & MS_RDONLY)) {
+		err = ext4_enable_quotas(sb);
+		if (err)
+			goto failed_mount8;
+	}
+#endif  /* CONFIG_QUOTA */
+
+	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
+	ext4_orphan_cleanup(sb, es);
+	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+	if (needs_recovery) {
+		ext4_msg(sb, KERN_INFO, "recovery complete");
+		ext4_mark_recovery_complete(sb, es);
+	}
+	if (EXT4_SB(sb)->s_journal) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+			descr = " journalled data mode";
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+			descr = " ordered data mode";
+		else
+			descr = " writeback data mode";
+	} else
+		descr = "out journal";
+
+	if (test_opt(sb, DISCARD)) {
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		if (!blk_queue_discard(q))
+			ext4_msg(sb, KERN_WARNING,
+				 "mounting with \"discard\" option, but "
+				 "the device does not support discard");
+	}
+
+	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+
+	if (es->s_error_count)
+		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
+
+	/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
+	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
+	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
+	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+
+	kfree(orig_data);
+	return 0;
+
+cantfind_ext4:
+	if (!silent)
+		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+	goto failed_mount;
+
+#ifdef CONFIG_QUOTA
+failed_mount8:
+	kobject_del(&sbi->s_kobj);
+#endif
+failed_mount7:
+	ext4_unregister_li_request(sb);
+failed_mount6:
+	ext4_mb_release(sb);
+	if (sbi->s_flex_groups)
+		kvfree(sbi->s_flex_groups);
+	percpu_counter_destroy(&sbi->s_freeclusters_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+failed_mount5:
+	ext4_ext_release(sb);
+	ext4_release_system_zone(sb);
+failed_mount4a:
+	dput(sb->s_root);
+	sb->s_root = NULL;
+failed_mount4:
+	ext4_msg(sb, KERN_ERR, "mount failed");
+	if (EXT4_SB(sb)->rsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+failed_mount_wq:
+	if (sbi->s_journal) {
+		jbd2_journal_destroy(sbi->s_journal);
+		sbi->s_journal = NULL;
+	}
+failed_mount3a:
+	ext4_es_unregister_shrinker(sbi);
+failed_mount3:
+	del_timer_sync(&sbi->s_err_report);
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
+failed_mount2:
+	for (i = 0; i < db_count; i++)
+		brelse(sbi->s_group_desc[i]);
+	kvfree(sbi->s_group_desc);
+failed_mount:
+	if (sbi->s_chksum_driver)
+		crypto_free_shash(sbi->s_chksum_driver);
+	if (sbi->s_proc) {
+		remove_proc_entry("options", sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
+#ifdef CONFIG_QUOTA
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
+		kfree(sbi->s_qf_names[i]);
+#endif
+	ext4_blkdev_remove(sbi);
+	brelse(bh);
+out_fail:
+	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
+	kfree(sbi);
+out_free_orig:
+	kfree(orig_data);
+	return err ? err : ret;
+}
+
+/*
+ * Setup any per-fs journal parameters now.  We'll do this both on
+ * initial mount, once the journal has been initialised but before we've
+ * done any recovery; and again on any subsequent remount.
+ */
+static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	journal->j_commit_interval = sbi->s_commit_interval;
+	journal->j_min_batch_time = sbi->s_min_batch_time;
+	journal->j_max_batch_time = sbi->s_max_batch_time;
+
+	write_lock(&journal->j_state_lock);
+	if (test_opt(sb, BARRIER))
+		journal->j_flags |= JBD2_BARRIER;
+	else
+		journal->j_flags &= ~JBD2_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
+	write_unlock(&journal->j_state_lock);
+}
+
+static journal_t *ext4_get_journal(struct super_block *sb,
+				   unsigned int journal_inum)
+{
+	struct inode *journal_inode;
+	journal_t *journal;
+
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
+	/* First, test for the existence of a valid inode on disk.  Bad
+	 * things happen if we iget() an unused inode, as the subsequent
+	 * iput() will try to delete it. */
+
+	journal_inode = ext4_iget(sb, journal_inum);
+	if (IS_ERR(journal_inode)) {
+		ext4_msg(sb, KERN_ERR, "no journal found");
+		return NULL;
+	}
+	if (!journal_inode->i_nlink) {
+		make_bad_inode(journal_inode);
+		iput(journal_inode);
+		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
+		return NULL;
+	}
+
+	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
+		  journal_inode, journal_inode->i_size);
+	if (!S_ISREG(journal_inode->i_mode)) {
+		ext4_msg(sb, KERN_ERR, "invalid journal inode");
+		iput(journal_inode);
+		return NULL;
+	}
+
+	journal = jbd2_journal_init_inode(journal_inode);
+	if (!journal) {
+		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
+		iput(journal_inode);
+		return NULL;
+	}
+	journal->j_private = sb;
+	ext4_init_journal_params(sb, journal);
+	return journal;
+}
+
+static journal_t *ext4_get_dev_journal(struct super_block *sb,
+				       dev_t j_dev)
+{
+	struct buffer_head *bh;
+	journal_t *journal;
+	ext4_fsblk_t start;
+	ext4_fsblk_t len;
+	int hblock, blocksize;
+	ext4_fsblk_t sb_block;
+	unsigned long offset;
+	struct ext4_super_block *es;
+	struct block_device *bdev;
+
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
+	bdev = ext4_blkdev_get(j_dev, sb);
+	if (bdev == NULL)
+		return NULL;
+
+	blocksize = sb->s_blocksize;
+	hblock = bdev_logical_block_size(bdev);
+	if (blocksize < hblock) {
+		ext4_msg(sb, KERN_ERR,
+			"blocksize too small for journal device");
+		goto out_bdev;
+	}
+
+	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
+	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
+	set_blocksize(bdev, blocksize);
+	if (!(bh = __bread(bdev, sb_block, blocksize))) {
+		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
+		       "external journal");
+		goto out_bdev;
+	}
+
+	es = (struct ext4_super_block *) (bh->b_data + offset);
+	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
+	    !(le32_to_cpu(es->s_feature_incompat) &
+	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
+		ext4_msg(sb, KERN_ERR, "external journal has "
+					"bad superblock");
+		brelse(bh);
+		goto out_bdev;
+	}
+
+	if ((le32_to_cpu(es->s_feature_ro_compat) &
+	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+	    es->s_checksum != ext4_superblock_csum(sb, es)) {
+		ext4_msg(sb, KERN_ERR, "external journal has "
+				       "corrupt superblock");
+		brelse(bh);
+		goto out_bdev;
+	}
+
+	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
+		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
+		brelse(bh);
+		goto out_bdev;
+	}
+
+	len = ext4_blocks_count(es);
+	start = sb_block + 1;
+	brelse(bh);	/* we're done with the superblock */
+
+	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
+					start, len, blocksize);
+	if (!journal) {
+		ext4_msg(sb, KERN_ERR, "failed to create device journal");
+		goto out_bdev;
+	}
+	journal->j_private = sb;
+	ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
+	wait_on_buffer(journal->j_sb_buffer);
+	if (!buffer_uptodate(journal->j_sb_buffer)) {
+		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
+		goto out_journal;
+	}
+	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
+		ext4_msg(sb, KERN_ERR, "External journal has more than one "
+					"user (unsupported) - %d",
+			be32_to_cpu(journal->j_superblock->s_nr_users));
+		goto out_journal;
+	}
+	EXT4_SB(sb)->journal_bdev = bdev;
+	ext4_init_journal_params(sb, journal);
+	return journal;
+
+out_journal:
+	jbd2_journal_destroy(journal);
+out_bdev:
+	ext4_blkdev_put(bdev);
+	return NULL;
+}
+
+static int ext4_load_journal(struct super_block *sb,
+			     struct ext4_super_block *es,
+			     unsigned long journal_devnum)
+{
+	journal_t *journal;
+	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
+	dev_t journal_dev;
+	int err = 0;
+	int really_read_only;
+
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
+	if (journal_devnum &&
+	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
+			"numbers have changed");
+		journal_dev = new_decode_dev(journal_devnum);
+	} else
+		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
+
+	really_read_only = bdev_read_only(sb->s_bdev);
+
+	/*
+	 * Are we loading a blank journal or performing recovery after a
+	 * crash?  For recovery, we need to check in advance whether we
+	 * can get read-write access to the device.
+	 */
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+		if (sb->s_flags & MS_RDONLY) {
+			ext4_msg(sb, KERN_INFO, "INFO: recovery "
+					"required on readonly filesystem");
+			if (really_read_only) {
+				ext4_msg(sb, KERN_ERR, "write access "
+					"unavailable, cannot proceed");
+				return -EROFS;
+			}
+			ext4_msg(sb, KERN_INFO, "write access will "
+			       "be enabled during recovery");
+		}
+	}
+
+	if (journal_inum && journal_dev) {
+		ext4_msg(sb, KERN_ERR, "filesystem has both journal "
+		       "and inode journals!");
+		return -EINVAL;
+	}
+
+	if (journal_inum) {
+		if (!(journal = ext4_get_journal(sb, journal_inum)))
+			return -EINVAL;
+	} else {
+		if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
+			return -EINVAL;
+	}
+
+	if (!(journal->j_flags & JBD2_BARRIER))
+		ext4_msg(sb, KERN_INFO, "barriers disabled");
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
+		err = jbd2_journal_wipe(journal, !really_read_only);
+	if (!err) {
+		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+		if (save)
+			memcpy(save, ((char *) es) +
+			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
+		err = jbd2_journal_load(journal);
+		if (save)
+			memcpy(((char *) es) + EXT4_S_ERR_START,
+			       save, EXT4_S_ERR_LEN);
+		kfree(save);
+	}
+
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "error loading journal");
+		jbd2_journal_destroy(journal);
+		return err;
+	}
+
+	EXT4_SB(sb)->s_journal = journal;
+	ext4_clear_journal_err(sb, es);
+
+	if (!really_read_only && journal_devnum &&
+	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+		es->s_journal_dev = cpu_to_le32(journal_devnum);
+
+		/* Make sure we flush the recovery flag to disk. */
+		ext4_commit_super(sb, 1);
+	}
+
+	return 0;
+}
+
+static int ext4_commit_super(struct super_block *sb, int sync)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+	int error = 0;
+
+	if (!sbh)
+		return error;
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		ext4_msg(sb, KERN_ERR, "previous I/O error to "
+		       "superblock detected");
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
+	/*
+	 * If the file system is mounted read-only, don't update the
+	 * superblock write time.  This avoids updating the superblock
+	 * write time when we are mounting the root file system
+	 * read/only but we need to replay the journal; at that point,
+	 * for people who are east of GMT and who make their clock
+	 * tick in localtime for Windows bug-for-bug compatibility,
+	 * the clock is set in the future, and this will cause e2fsck
+	 * to complain and force a full file system check.
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
+		es->s_wtime = cpu_to_le32(get_seconds());
+	if (sb->s_bdev->bd_part)
+		es->s_kbytes_written =
+			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
+	else
+		es->s_kbytes_written =
+			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
+		ext4_free_blocks_count_set(es,
+			EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
+				&EXT4_SB(sb)->s_freeclusters_counter)));
+	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+		es->s_free_inodes_count =
+			cpu_to_le32(percpu_counter_sum_positive(
+				&EXT4_SB(sb)->s_freeinodes_counter));
+	BUFFER_TRACE(sbh, "marking dirty");
+	ext4_superblock_csum_set(sb);
+	mark_buffer_dirty(sbh);
+	if (sync) {
+		error = sync_dirty_buffer(sbh);
+		if (error)
+			return error;
+
+		error = buffer_write_io_error(sbh);
+		if (error) {
+			ext4_msg(sb, KERN_ERR, "I/O error while writing "
+			       "superblock");
+			clear_buffer_write_io_error(sbh);
+			set_buffer_uptodate(sbh);
+		}
+	}
+	return error;
+}
+
+/*
+ * Have we just finished recovery?  If so, and if we are mounting (or
+ * remounting) the filesystem readonly, then we will end up with a
+ * consistent fs on disk.  Record that fact.
+ */
+static void ext4_mark_recovery_complete(struct super_block *sb,
+					struct ext4_super_block *es)
+{
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+		BUG_ON(journal != NULL);
+		return;
+	}
+	jbd2_journal_lock_updates(journal);
+	if (jbd2_journal_flush(journal) < 0)
+		goto out;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+	    sb->s_flags & MS_RDONLY) {
+		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_commit_super(sb, 1);
+	}
+
+out:
+	jbd2_journal_unlock_updates(journal);
+}
+
+/*
+ * If we are mounting (or read-write remounting) a filesystem whose journal
+ * has recorded an error from a previous lifetime, move that error to the
+ * main filesystem now.
+ */
+static void ext4_clear_journal_err(struct super_block *sb,
+				   struct ext4_super_block *es)
+{
+	journal_t *journal;
+	int j_errno;
+	const char *errstr;
+
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
+	journal = EXT4_SB(sb)->s_journal;
+
+	/*
+	 * Now check for any error status which may have been recorded in the
+	 * journal by a prior ext4_error() or ext4_abort()
+	 */
+
+	j_errno = jbd2_journal_errno(journal);
+	if (j_errno) {
+		char nbuf[16];
+
+		errstr = ext4_decode_error(sb, j_errno, nbuf);
+		ext4_warning(sb, "Filesystem error recorded "
+			     "from previous mount: %s", errstr);
+		ext4_warning(sb, "Marking fs in need of filesystem check.");
+
+		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+		ext4_commit_super(sb, 1);
+
+		jbd2_journal_clear_err(journal);
+		jbd2_journal_update_sb_errno(journal);
+	}
+}
+
+/*
+ * Force the running and committing transactions to commit,
+ * and wait on the commit.
+ */
+int ext4_force_commit(struct super_block *sb)
+{
+	journal_t *journal;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	journal = EXT4_SB(sb)->s_journal;
+	return ext4_journal_force_commit(journal);
+}
+
+static int ext4_sync_fs(struct super_block *sb, int wait)
+{
+	int ret = 0;
+	tid_t target;
+	bool needs_barrier = false;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	trace_ext4_sync_fs(sb, wait);
+	flush_workqueue(sbi->rsv_conversion_wq);
+	/*
+	 * Writeback quota in non-journalled quota case - journalled quota has
+	 * no dirty dquots
+	 */
+	dquot_writeback_dquots(sb, -1);
+	/*
+	 * Data writeback is possible w/o journal transaction, so barrier must
+	 * being sent at the end of the function. But we can skip it if
+	 * transaction_commit will do it for us.
+	 */
+	if (sbi->s_journal) {
+		target = jbd2_get_latest_transaction(sbi->s_journal);
+		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+			needs_barrier = true;
+
+		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
+			if (wait)
+				ret = jbd2_log_wait_commit(sbi->s_journal,
+							   target);
+		}
+	} else if (wait && test_opt(sb, BARRIER))
+		needs_barrier = true;
+	if (needs_barrier) {
+		int err;
+		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
+/*
+ * LVM calls this function before a (read-only) snapshot is created.  This
+ * gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that only this function cannot bring a filesystem to be in a clean
+ * state independently. It relies on upper layer to stop all data & metadata
+ * modifications.
+ */
+static int ext4_freeze(struct super_block *sb)
+{
+	int error = 0;
+	journal_t *journal;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	journal = EXT4_SB(sb)->s_journal;
+
+	if (journal) {
+		/* Now we set up the journal barrier. */
+		jbd2_journal_lock_updates(journal);
+
+		/*
+		 * Don't clear the needs_recovery flag if we failed to
+		 * flush the journal.
+		 */
+		error = jbd2_journal_flush(journal);
+		if (error < 0)
+			goto out;
+	}
+
+	/* Journal blocked and flushed, clear needs_recovery flag. */
+	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	error = ext4_commit_super(sb, 1);
+out:
+	if (journal)
+		/* we rely on upper layer to stop further updates */
+		jbd2_journal_unlock_updates(journal);
+	return error;
+}
+
+/*
+ * Called by LVM after the snapshot is done.  We need to reset the RECOVER
+ * flag here, even though the filesystem is not technically dirty yet.
+ */
+static int ext4_unfreeze(struct super_block *sb)
+{
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	/* Reset the needs_recovery flag before the fs is unlocked. */
+	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	ext4_commit_super(sb, 1);
+	return 0;
+}
+
+/*
+ * Structure to save mount options for ext4_remount's benefit
+ */
+struct ext4_mount_options {
+	unsigned long s_mount_opt;
+	unsigned long s_mount_opt2;
+	kuid_t s_resuid;
+	kgid_t s_resgid;
+	unsigned long s_commit_interval;
+	u32 s_min_batch_time, s_max_batch_time;
+#ifdef CONFIG_QUOTA
+	int s_jquota_fmt;
+	char *s_qf_names[EXT4_MAXQUOTAS];
+#endif
+};
+
+static int ext4_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct ext4_super_block *es;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned long old_sb_flags;
+	struct ext4_mount_options old_opts;
+	int enable_quota = 0;
+	ext4_group_t g;
+	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	int err = 0;
+#ifdef CONFIG_QUOTA
+	int i, j;
+#endif
+	char *orig_data = kstrdup(data, GFP_KERNEL);
+
+	/* Store the original options */
+	old_sb_flags = sb->s_flags;
+	old_opts.s_mount_opt = sbi->s_mount_opt;
+	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
+	old_opts.s_resuid = sbi->s_resuid;
+	old_opts.s_resgid = sbi->s_resgid;
+	old_opts.s_commit_interval = sbi->s_commit_interval;
+	old_opts.s_min_batch_time = sbi->s_min_batch_time;
+	old_opts.s_max_batch_time = sbi->s_max_batch_time;
+#ifdef CONFIG_QUOTA
+	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
+		if (sbi->s_qf_names[i]) {
+			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+							 GFP_KERNEL);
+			if (!old_opts.s_qf_names[i]) {
+				for (j = 0; j < i; j++)
+					kfree(old_opts.s_qf_names[j]);
+				kfree(orig_data);
+				return -ENOMEM;
+			}
+		} else
+			old_opts.s_qf_names[i] = NULL;
+#endif
+	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+
+	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
+	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
+	    test_opt(sb, JOURNAL_CHECKSUM)) {
+		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
+			 "during remount not supported; ignoring");
+		sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
+	}
+
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and delalloc");
+			err = -EINVAL;
+			goto restore_opts;
+		}
+		if (test_opt(sb, DIOREAD_NOLOCK)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and dioread_nolock");
+			err = -EINVAL;
+			goto restore_opts;
+		}
+		if (test_opt(sb, DAX)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and dax");
+			err = -EINVAL;
+			goto restore_opts;
+		}
+	}
+
+	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
+		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
+			"dax flag with busy inodes while remounting");
+		sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
+	}
+
+	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
+		ext4_abort(sb, "Abort forced by user");
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+
+	es = sbi->s_es;
+
+	if (sbi->s_journal) {
+		ext4_init_journal_params(sb, sbi->s_journal);
+		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+	}
+
+	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
+			err = -EROFS;
+			goto restore_opts;
+		}
+
+		if (*flags & MS_RDONLY) {
+			err = sync_filesystem(sb);
+			if (err < 0)
+				goto restore_opts;
+			err = dquot_suspend(sb, -1);
+			if (err < 0)
+				goto restore_opts;
+
+			/*
+			 * First of all, the unconditional stuff we have to do
+			 * to disable replay of the journal when we next remount
+			 */
+			sb->s_flags |= MS_RDONLY;
+
+			/*
+			 * OK, test if we are remounting a valid rw partition
+			 * readonly, and if so set the rdonly flag and then
+			 * mark the partition as valid again.
+			 */
+			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
+			    (sbi->s_mount_state & EXT4_VALID_FS))
+				es->s_state = cpu_to_le16(sbi->s_mount_state);
+
+			if (sbi->s_journal)
+				ext4_mark_recovery_complete(sb, es);
+		} else {
+			/* Make sure we can mount this feature set readwrite */
+			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_READONLY) ||
+			    !ext4_feature_set_ok(sb, 0)) {
+				err = -EROFS;
+				goto restore_opts;
+			}
+			/*
+			 * Make sure the group descriptor checksums
+			 * are sane.  If they aren't, refuse to remount r/w.
+			 */
+			for (g = 0; g < sbi->s_groups_count; g++) {
+				struct ext4_group_desc *gdp =
+					ext4_get_group_desc(sb, g, NULL);
+
+				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
+					ext4_msg(sb, KERN_ERR,
+	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
+		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+					       le16_to_cpu(gdp->bg_checksum));
+					err = -EINVAL;
+					goto restore_opts;
+				}
+			}
+
+			/*
+			 * If we have an unprocessed orphan list hanging
+			 * around from a previously readonly bdev mount,
+			 * require a full umount/remount for now.
+			 */
+			if (es->s_last_orphan) {
+				ext4_msg(sb, KERN_WARNING, "Couldn't "
+				       "remount RDWR because of unprocessed "
+				       "orphan inode list.  Please "
+				       "umount/remount instead");
+				err = -EINVAL;
+				goto restore_opts;
+			}
+
+			/*
+			 * Mounting a RDONLY partition read-write, so reread
+			 * and store the current valid flag.  (It may have
+			 * been changed by e2fsck since we originally mounted
+			 * the partition.)
+			 */
+			if (sbi->s_journal)
+				ext4_clear_journal_err(sb, es);
+			sbi->s_mount_state = le16_to_cpu(es->s_state);
+			if (!ext4_setup_super(sb, es, 0))
+				sb->s_flags &= ~MS_RDONLY;
+			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+						     EXT4_FEATURE_INCOMPAT_MMP))
+				if (ext4_multi_mount_protect(sb,
+						le64_to_cpu(es->s_mmp_block))) {
+					err = -EROFS;
+					goto restore_opts;
+				}
+			enable_quota = 1;
+		}
+	}
+
+	/*
+	 * Reinitialize lazy itable initialization thread based on
+	 * current settings
+	 */
+	if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+		ext4_unregister_li_request(sb);
+	else {
+		ext4_group_t first_not_zeroed;
+		first_not_zeroed = ext4_has_uninit_itable(sb);
+		ext4_register_li_request(sb, first_not_zeroed);
+	}
+
+	ext4_setup_system_zone(sb);
+	if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
+		ext4_commit_super(sb, 1);
+
+#ifdef CONFIG_QUOTA
+	/* Release old quota file names */
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
+		kfree(old_opts.s_qf_names[i]);
+	if (enable_quota) {
+		if (sb_any_quota_suspended(sb))
+			dquot_resume(sb, -1);
+		else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+			err = ext4_enable_quotas(sb);
+			if (err)
+				goto restore_opts;
+		}
+	}
+#endif
+
+	*flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
+	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+	kfree(orig_data);
+	return 0;
+
+restore_opts:
+	sb->s_flags = old_sb_flags;
+	sbi->s_mount_opt = old_opts.s_mount_opt;
+	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
+	sbi->s_resuid = old_opts.s_resuid;
+	sbi->s_resgid = old_opts.s_resgid;
+	sbi->s_commit_interval = old_opts.s_commit_interval;
+	sbi->s_min_batch_time = old_opts.s_min_batch_time;
+	sbi->s_max_batch_time = old_opts.s_max_batch_time;
+#ifdef CONFIG_QUOTA
+	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
+	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+		kfree(sbi->s_qf_names[i]);
+		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
+	}
+#endif
+	kfree(orig_data);
+	return err;
+}
+
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t overhead = 0, resv_blocks;
+	u64 fsid;
+	s64 bfree;
+	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
+
+	if (!test_opt(sb, MINIX_DF))
+		overhead = sbi->s_overhead;
+
+	buf->f_type = EXT4_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
+	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+	/* prevent underflow in case that few free space is available */
+	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
+	buf->f_bavail = buf->f_bfree -
+			(ext4_r_blocks_count(es) + resv_blocks);
+	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
+		buf->f_bavail = 0;
+	buf->f_files = le32_to_cpu(es->s_inodes_count);
+	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
+	buf->f_namelen = EXT4_NAME_LEN;
+	fsid = le64_to_cpup((void *)es->s_uuid) ^
+	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+
+	return 0;
+}
+
+/* Helper function for writing quotas on sync - we need to start transaction
+ * before quota file is locked for write. Otherwise the are possible deadlocks:
+ * Process 1                         Process 2
+ * ext4_create()                     quota_sync()
+ *   jbd2_journal_start()                  write_dquot()
+ *   dquot_initialize()                         down(dqio_mutex)
+ *     down(dqio_mutex)                    jbd2_journal_start()
+ *
+ */
+
+#ifdef CONFIG_QUOTA
+
+static inline struct inode *dquot_to_inode(struct dquot *dquot)
+{
+	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
+}
+
+static int ext4_write_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+	struct inode *inode;
+
+	inode = dquot_to_inode(dquot);
+	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_commit(dquot);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_acquire_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+
+	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
+				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_acquire(dquot);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_release_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+
+	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
+				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle)) {
+		/* Release dquot anyway to avoid endless cycle in dqput() */
+		dquot_release(dquot);
+		return PTR_ERR(handle);
+	}
+	ret = dquot_release(dquot);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_mark_dquot_dirty(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* Are we journaling quotas? */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+	    sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+		dquot_mark_dquot_dirty(dquot);
+		return ext4_write_dquot(dquot);
+	} else {
+		return dquot_mark_dquot_dirty(dquot);
+	}
+}
+
+static int ext4_write_info(struct super_block *sb, int type)
+{
+	int ret, err;
+	handle_t *handle;
+
+	/* Data block + inode block */
+	handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_commit_info(sb, type);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+/*
+ * Turn on quotas during mount time - we need to find
+ * the quota file and such...
+ */
+static int ext4_quota_on_mount(struct super_block *sb, int type)
+{
+	return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+					EXT4_SB(sb)->s_jquota_fmt, type);
+}
+
+/*
+ * Standard function to be called on quota_on
+ */
+static int ext4_quota_on(struct super_block *sb, int type, int format_id,
+			 struct path *path)
+{
+	int err;
+
+	if (!test_opt(sb, QUOTA))
+		return -EINVAL;
+
+	/* Quotafile not on the same filesystem? */
+	if (path->dentry->d_sb != sb)
+		return -EXDEV;
+	/* Journaling quota? */
+	if (EXT4_SB(sb)->s_qf_names[type]) {
+		/* Quotafile not in fs root? */
+		if (path->dentry->d_parent != sb->s_root)
+			ext4_msg(sb, KERN_WARNING,
+				"Quota file not on filesystem root. "
+				"Journaled quota will not work");
+	}
+
+	/*
+	 * When we journal data on quota file, we have to flush journal to see
+	 * all updates to the file when we bypass pagecache...
+	 */
+	if (EXT4_SB(sb)->s_journal &&
+	    ext4_should_journal_data(d_inode(path->dentry))) {
+		/*
+		 * We don't need to lock updates but journal_flush() could
+		 * otherwise be livelocked...
+		 */
+		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (err)
+			return err;
+	}
+
+	return dquot_quota_on(sb, type, format_id, path);
+}
+
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+			     unsigned int flags)
+{
+	int err;
+	struct inode *qf_inode;
+	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+	};
+
+	BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
+
+	if (!qf_inums[type])
+		return -EPERM;
+
+	qf_inode = ext4_iget(sb, qf_inums[type]);
+	if (IS_ERR(qf_inode)) {
+		ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+		return PTR_ERR(qf_inode);
+	}
+
+	/* Don't account quota for quota files to avoid recursion */
+	qf_inode->i_flags |= S_NOQUOTA;
+	err = dquot_enable(qf_inode, type, format_id, flags);
+	iput(qf_inode);
+
+	return err;
+}
+
+/* Enable usage tracking for all quota types. */
+static int ext4_enable_quotas(struct super_block *sb)
+{
+	int type, err = 0;
+	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+	};
+
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
+		if (qf_inums[type]) {
+			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
+						DQUOT_USAGE_ENABLED);
+			if (err) {
+				ext4_warning(sb,
+					"Failed to enable quota tracking "
+					"(type=%d, err=%d). Please run "
+					"e2fsck to fix.", type, err);
+				return err;
+			}
+		}
+	}
+	return 0;
+}
+
+static int ext4_quota_off(struct super_block *sb, int type)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	handle_t *handle;
+
+	/* Force all delayed allocation blocks to be allocated.
+	 * Caller already holds s_umount sem */
+	if (test_opt(sb, DELALLOC))
+		sync_filesystem(sb);
+
+	if (!inode)
+		goto out;
+
+	/* Update modification times of quota files when userspace can
+	 * start looking at them */
+	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
+	if (IS_ERR(handle))
+		goto out;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
+
+out:
+	return dquot_quota_off(sb, type);
+}
+
+/* Read data from quotafile - avoid pagecache and such because we cannot afford
+ * acquiring the locks... As quota files are never truncated and quota code
+ * itself serializes the operations (and no one else should touch the files)
+ * we don't have to be afraid of races */
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+	int offset = off & (sb->s_blocksize - 1);
+	int tocopy;
+	size_t toread;
+	struct buffer_head *bh;
+	loff_t i_size = i_size_read(inode);
+
+	if (off > i_size)
+		return 0;
+	if (off+len > i_size)
+		len = i_size-off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = sb->s_blocksize - offset < toread ?
+				sb->s_blocksize - offset : toread;
+		bh = ext4_bread(NULL, inode, blk, 0);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
+		if (!bh)	/* A hole? */
+			memset(data, 0, tocopy);
+		else
+			memcpy(data, bh->b_data+offset, tocopy);
+		brelse(bh);
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+	int err, offset = off & (sb->s_blocksize - 1);
+	struct buffer_head *bh;
+	handle_t *handle = journal_current_handle();
+
+	if (EXT4_SB(sb)->s_journal && !handle) {
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because transaction is not started",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+
+	bh = ext4_bread(handle, inode, blk, 1);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
+	if (!bh)
+		goto out;
+	BUFFER_TRACE(bh, "get write access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err) {
+		brelse(bh);
+		return err;
+	}
+	lock_buffer(bh);
+	memcpy(bh->b_data+offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, bh);
+	brelse(bh);
+out:
+	if (inode->i_size < off + len) {
+		i_size_write(inode, off + len);
+		EXT4_I(inode)->i_disksize = inode->i_size;
+		ext4_mark_inode_dirty(handle, inode);
+	}
+	return len;
+}
+
+#endif
+
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
+}
+
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static inline void register_as_ext2(void)
+{
+	int err = register_filesystem(&ext2_fs_type);
+	if (err)
+		printk(KERN_WARNING
+		       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
+}
+
+static inline void unregister_as_ext2(void)
+{
+	unregister_filesystem(&ext2_fs_type);
+}
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+		return 0;
+	if (sb->s_flags & MS_RDONLY)
+		return 1;
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+		return 0;
+	return 1;
+}
+#else
+static inline void register_as_ext2(void) { }
+static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
+#endif
+
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static inline void register_as_ext3(void)
+{
+	int err = register_filesystem(&ext3_fs_type);
+	if (err)
+		printk(KERN_WARNING
+		       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
+}
+
+static inline void unregister_as_ext3(void)
+{
+	unregister_filesystem(&ext3_fs_type);
+}
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+		return 0;
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+		return 0;
+	if (sb->s_flags & MS_RDONLY)
+		return 1;
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+		return 0;
+	return 1;
+}
+#else
+static inline void register_as_ext3(void) { }
+static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
+#endif
+
+static struct file_system_type ext4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext4",
+	.mount		= ext4_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ext4");
+
+static int __init ext4_init_feat_adverts(void)
+{
+	struct ext4_features *ef;
+	int ret = -ENOMEM;
+
+	ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+	if (!ef)
+		goto out;
+
+	ef->f_kobj.kset = ext4_kset;
+	init_completion(&ef->f_kobj_unregister);
+	ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+				   "features");
+	if (ret) {
+		kfree(ef);
+		goto out;
+	}
+
+	ext4_feat = ef;
+	ret = 0;
+out:
+	return ret;
+}
+
+static void ext4_exit_feat_adverts(void)
+{
+	kobject_put(&ext4_feat->f_kobj);
+	wait_for_completion(&ext4_feat->f_kobj_unregister);
+	kfree(ext4_feat);
+}
+
+/* Shared across all ext4 file systems */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
+static int __init ext4_init_fs(void)
+{
+	int i, err;
+
+	ext4_li_info = NULL;
+	mutex_init(&ext4_li_mtx);
+
+	/* Build-time check for flags consistency */
+	ext4_check_flag_values();
+
+	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+		mutex_init(&ext4__aio_mutex[i]);
+		init_waitqueue_head(&ext4__ioend_wq[i]);
+	}
+
+	err = ext4_init_es();
+	if (err)
+		return err;
+
+	err = ext4_init_pageio();
+	if (err)
+		goto out7;
+
+	err = ext4_init_system_zone();
+	if (err)
+		goto out6;
+	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+	if (!ext4_kset) {
+		err = -ENOMEM;
+		goto out5;
+	}
+	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+
+	err = ext4_init_feat_adverts();
+	if (err)
+		goto out4;
+
+	err = ext4_init_mballoc();
+	if (err)
+		goto out2;
+	else
+		ext4_mballoc_ready = 1;
+	err = init_inodecache();
+	if (err)
+		goto out1;
+	register_as_ext3();
+	register_as_ext2();
+	err = register_filesystem(&ext4_fs_type);
+	if (err)
+		goto out;
+
+	return 0;
+out:
+	unregister_as_ext2();
+	unregister_as_ext3();
+	destroy_inodecache();
+out1:
+	ext4_mballoc_ready = 0;
+	ext4_exit_mballoc();
+out2:
+	ext4_exit_feat_adverts();
+out4:
+	if (ext4_proc_root)
+		remove_proc_entry("fs/ext4", NULL);
+	kset_unregister(ext4_kset);
+out5:
+	ext4_exit_system_zone();
+out6:
+	ext4_exit_pageio();
+out7:
+	ext4_exit_es();
+
+	return err;
+}
+
+static void __exit ext4_exit_fs(void)
+{
+	ext4_destroy_lazyinit_thread();
+	unregister_as_ext2();
+	unregister_as_ext3();
+	unregister_filesystem(&ext4_fs_type);
+	destroy_inodecache();
+	ext4_exit_mballoc();
+	ext4_exit_feat_adverts();
+	remove_proc_entry("fs/ext4", NULL);
+	kset_unregister(ext4_kset);
+	ext4_exit_system_zone();
+	ext4_exit_pageio();
+	ext4_exit_es();
+}
+
+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+MODULE_DESCRIPTION("Fourth Extended Filesystem");
+MODULE_LICENSE("GPL");
+module_init(ext4_init_fs)
+module_exit(ext4_exit_fs)
diff --git a/kernel/fs/ext4/symlink.c b/kernel/fs/ext4/symlink.c
new file mode 100644
index 000000000..187b78920
--- /dev/null
+++ b/kernel/fs/ext4/symlink.c
@@ -0,0 +1,145 @@
+/*
+ *  linux/fs/ext4/symlink.c
+ *
+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/symlink.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4 symlink handling code
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include "ext4.h"
+#include "xattr.h"
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *cpage = NULL;
+	char *caddr, *paddr = NULL;
+	struct ext4_str cstr, pstr;
+	struct inode *inode = d_inode(dentry);
+	struct ext4_fname_crypto_ctx *ctx = NULL;
+	struct ext4_encrypted_symlink_data *sd;
+	loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
+	int res;
+	u32 plen, max_size = inode->i_sb->s_blocksize;
+
+	if (!ext4_encrypted_inode(inode))
+		return page_follow_link_light(dentry, nd);
+
+	ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize);
+	if (IS_ERR(ctx))
+		return ctx;
+
+	if (ext4_inode_is_fast_symlink(inode)) {
+		caddr = (char *) EXT4_I(inode)->i_data;
+		max_size = sizeof(EXT4_I(inode)->i_data);
+	} else {
+		cpage = read_mapping_page(inode->i_mapping, 0, NULL);
+		if (IS_ERR(cpage)) {
+			ext4_put_fname_crypto_ctx(&ctx);
+			return cpage;
+		}
+		caddr = kmap(cpage);
+		caddr[size] = 0;
+	}
+
+	/* Symlink is encrypted */
+	sd = (struct ext4_encrypted_symlink_data *)caddr;
+	cstr.name = sd->encrypted_path;
+	cstr.len  = le32_to_cpu(sd->len);
+	if ((cstr.len +
+	     sizeof(struct ext4_encrypted_symlink_data) - 1) >
+	    max_size) {
+		/* Symlink data on the disk is corrupted */
+		res = -EIO;
+		goto errout;
+	}
+	plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
+		EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len;
+	paddr = kmalloc(plen + 1, GFP_NOFS);
+	if (!paddr) {
+		res = -ENOMEM;
+		goto errout;
+	}
+	pstr.name = paddr;
+	res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr);
+	if (res < 0)
+		goto errout;
+	/* Null-terminate the name */
+	if (res <= plen)
+		paddr[res] = '\0';
+	nd_set_link(nd, paddr);
+	ext4_put_fname_crypto_ctx(&ctx);
+	if (cpage) {
+		kunmap(cpage);
+		page_cache_release(cpage);
+	}
+	return NULL;
+errout:
+	ext4_put_fname_crypto_ctx(&ctx);
+	if (cpage) {
+		kunmap(cpage);
+		page_cache_release(cpage);
+	}
+	kfree(paddr);
+	return ERR_PTR(res);
+}
+
+static void ext4_put_link(struct dentry *dentry, struct nameidata *nd,
+			  void *cookie)
+{
+	struct page *page = cookie;
+
+	if (!page) {
+		kfree(nd_get_link(nd));
+	} else {
+		kunmap(page);
+		page_cache_release(page);
+	}
+}
+#endif
+
+static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ext4_inode_info *ei = EXT4_I(d_inode(dentry));
+	nd_set_link(nd, (char *) ei->i_data);
+	return NULL;
+}
+
+const struct inode_operations ext4_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	.follow_link    = ext4_follow_link,
+	.put_link       = ext4_put_link,
+#else
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+#endif
+	.setattr	= ext4_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+};
+
+const struct inode_operations ext4_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link    = ext4_follow_fast_link,
+	.setattr	= ext4_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+};
diff --git a/kernel/fs/ext4/truncate.h b/kernel/fs/ext4/truncate.h
new file mode 100644
index 000000000..011ba6670
--- /dev/null
+++ b/kernel/fs/ext4/truncate.h
@@ -0,0 +1,43 @@
+/*
+ * linux/fs/ext4/truncate.h
+ *
+ * Common inline functions needed for truncate support
+ */
+
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static inline void ext4_truncate_failed_write(struct inode *inode)
+{
+	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	ext4_truncate(inode);
+}
+
+/*
+ * Work out how many blocks we need to proceed with the next chunk of a
+ * truncate transaction.
+ */
+static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
+{
+	ext4_lblk_t needed;
+
+	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+
+	/* Give ourselves just enough room to cope with inodes in which
+	 * i_blocks is corrupt: we've seen disk corruptions in the past
+	 * which resulted in random data in an inode which looked enough
+	 * like a regular file for ext4 to try to delete it.  Things
+	 * will go a bit crazy if that happens, but at least we should
+	 * try not to panic the whole kernel. */
+	if (needed < 2)
+		needed = 2;
+
+	/* But we need to bound the transaction so we don't overflow the
+	 * journal. */
+	if (needed > EXT4_MAX_TRANS_DATA)
+		needed = EXT4_MAX_TRANS_DATA;
+
+	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+}
+
diff --git a/kernel/fs/ext4/xattr.c b/kernel/fs/ext4/xattr.c
new file mode 100644
index 000000000..16e28c08d
--- /dev/null
+++ b/kernel/fs/ext4/xattr.c
@@ -0,0 +1,1727 @@
+/*
+ * linux/fs/ext4/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
+ * Extended attributes for symlinks and special files added per
+ *  suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ *  Red Hat Inc.
+ * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
+ *  and Andreas Gruenbacher <agruen@suse.de>.
+ */
+
+/*
+ * Extended attributes are stored directly in inodes (on file systems with
+ * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
+ * field contains the block number if an inode uses an additional block. All
+ * attributes must fit in the inode and one additional block. Blocks that
+ * contain the identical set of attributes may be shared among several inodes.
+ * Identical blocks are detected by keeping a cache of blocks that have
+ * recently been accessed.
+ *
+ * The attributes in inodes and on blocks have a different header; the entries
+ * are stored in the same format:
+ *
+ *   +------------------+
+ *   | header           |
+ *   | entry 1          | |
+ *   | entry 2          | | growing downwards
+ *   | entry 3          | v
+ *   | four null bytes  |
+ *   | . . .            |
+ *   | value 1          | ^
+ *   | value 3          | | growing upwards
+ *   | value 2          | |
+ *   +------------------+
+ *
+ * The header is followed by multiple entry descriptors. In disk blocks, the
+ * entry descriptors are kept sorted. In inodes, they are unsorted. The
+ * attribute values are aligned to the end of the block in no specific order.
+ *
+ * Locking strategy
+ * ----------------
+ * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
+ * EA blocks are only changed if they are exclusive to an inode, so
+ * holding xattr_sem also means that nothing but the EA block's reference
+ * count can change. Multiple writers to the same block are synchronized
+ * by the buffer lock.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/mbcache.h>
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "acl.h"
+
+#ifdef EXT4_XATTR_DEBUG
+# define ea_idebug(inode, f...) do { \
+		printk(KERN_DEBUG "inode %s:%lu: ", \
+			inode->i_sb->s_id, inode->i_ino); \
+		printk(f); \
+		printk("\n"); \
+	} while (0)
+# define ea_bdebug(bh, f...) do { \
+		char b[BDEVNAME_SIZE]; \
+		printk(KERN_DEBUG "block %s:%lu: ", \
+			bdevname(bh->b_bdev, b), \
+			(unsigned long) bh->b_blocknr); \
+		printk(f); \
+		printk("\n"); \
+	} while (0)
+#else
+# define ea_idebug(inode, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+# define ea_bdebug(bh, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
+static struct buffer_head *ext4_xattr_cache_find(struct inode *,
+						 struct ext4_xattr_header *,
+						 struct mb_cache_entry **);
+static void ext4_xattr_rehash(struct ext4_xattr_header *,
+			      struct ext4_xattr_entry *);
+static int ext4_xattr_list(struct dentry *dentry, char *buffer,
+			   size_t buffer_size);
+
+static const struct xattr_handler *ext4_xattr_handler_map[] = {
+	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
+	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
+#endif
+	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,
+#ifdef CONFIG_EXT4_FS_SECURITY
+	[EXT4_XATTR_INDEX_SECURITY]	     = &ext4_xattr_security_handler,
+#endif
+};
+
+const struct xattr_handler *ext4_xattr_handlers[] = {
+	&ext4_xattr_user_handler,
+	&ext4_xattr_trusted_handler,
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+#ifdef CONFIG_EXT4_FS_SECURITY
+	&ext4_xattr_security_handler,
+#endif
+	NULL
+};
+
+#define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
+				inode->i_sb->s_fs_info)->s_mb_cache)
+
+static __le32 ext4_xattr_block_csum(struct inode *inode,
+				    sector_t block_nr,
+				    struct ext4_xattr_header *hdr)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	__u32 csum;
+	__le32 save_csum;
+	__le64 dsk_block_nr = cpu_to_le64(block_nr);
+
+	save_csum = hdr->h_checksum;
+	hdr->h_checksum = 0;
+	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+			   sizeof(dsk_block_nr));
+	csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
+			   EXT4_BLOCK_SIZE(inode->i_sb));
+
+	hdr->h_checksum = save_csum;
+	return cpu_to_le32(csum);
+}
+
+static int ext4_xattr_block_csum_verify(struct inode *inode,
+					sector_t block_nr,
+					struct ext4_xattr_header *hdr)
+{
+	if (ext4_has_metadata_csum(inode->i_sb) &&
+	    (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
+		return 0;
+	return 1;
+}
+
+static void ext4_xattr_block_csum_set(struct inode *inode,
+				      sector_t block_nr,
+				      struct ext4_xattr_header *hdr)
+{
+	if (!ext4_has_metadata_csum(inode->i_sb))
+		return;
+
+	hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
+}
+
+static inline int ext4_handle_dirty_xattr_block(handle_t *handle,
+						struct inode *inode,
+						struct buffer_head *bh)
+{
+	ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh));
+	return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+static inline const struct xattr_handler *
+ext4_xattr_handler(int name_index)
+{
+	const struct xattr_handler *handler = NULL;
+
+	if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
+		handler = ext4_xattr_handler_map[name_index];
+	return handler;
+}
+
+/*
+ * Inode operation listxattr()
+ *
+ * d_inode(dentry)->i_mutex: don't care
+ */
+ssize_t
+ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	return ext4_xattr_list(dentry, buffer, size);
+}
+
+static int
+ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
+		       void *value_start)
+{
+	struct ext4_xattr_entry *e = entry;
+
+	while (!IS_LAST_ENTRY(e)) {
+		struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
+		if ((void *)next >= end)
+			return -EIO;
+		e = next;
+	}
+
+	while (!IS_LAST_ENTRY(entry)) {
+		if (entry->e_value_size != 0 &&
+		    (value_start + le16_to_cpu(entry->e_value_offs) <
+		     (void *)e + sizeof(__u32) ||
+		     value_start + le16_to_cpu(entry->e_value_offs) +
+		    le32_to_cpu(entry->e_value_size) > end))
+			return -EIO;
+		entry = EXT4_XATTR_NEXT(entry);
+	}
+
+	return 0;
+}
+
+static inline int
+ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
+{
+	int error;
+
+	if (buffer_verified(bh))
+		return 0;
+
+	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
+	    BHDR(bh)->h_blocks != cpu_to_le32(1))
+		return -EIO;
+	if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
+		return -EIO;
+	error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
+				       bh->b_data);
+	if (!error)
+		set_buffer_verified(bh);
+	return error;
+}
+
+static inline int
+ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
+{
+	size_t value_size = le32_to_cpu(entry->e_value_size);
+
+	if (entry->e_value_block != 0 || value_size > size ||
+	    le16_to_cpu(entry->e_value_offs) + value_size > size)
+		return -EIO;
+	return 0;
+}
+
+static int
+ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
+		      const char *name, size_t size, int sorted)
+{
+	struct ext4_xattr_entry *entry;
+	size_t name_len;
+	int cmp = 1;
+
+	if (name == NULL)
+		return -EINVAL;
+	name_len = strlen(name);
+	entry = *pentry;
+	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+		cmp = name_index - entry->e_name_index;
+		if (!cmp)
+			cmp = name_len - entry->e_name_len;
+		if (!cmp)
+			cmp = memcmp(name, entry->e_name, name_len);
+		if (cmp <= 0 && (sorted || cmp == 0))
+			break;
+	}
+	*pentry = entry;
+	if (!cmp && ext4_xattr_check_entry(entry, size))
+			return -EIO;
+	return cmp ? -ENODATA : 0;
+}
+
+static int
+ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
+		     void *buffer, size_t buffer_size)
+{
+	struct buffer_head *bh = NULL;
+	struct ext4_xattr_entry *entry;
+	size_t size;
+	int error;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+
+	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
+		  name_index, name, buffer, (long)buffer_size);
+
+	error = -ENODATA;
+	if (!EXT4_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %llu",
+		  (unsigned long long)EXT4_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh)
+		goto cleanup;
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+	if (ext4_xattr_check_block(inode, bh)) {
+bad_block:
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		goto cleanup;
+	}
+	ext4_xattr_cache_insert(ext4_mb_cache, bh);
+	entry = BFIRST(bh);
+	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
+	if (error == -EIO)
+		goto bad_block;
+	if (error)
+		goto cleanup;
+	size = le32_to_cpu(entry->e_value_size);
+	if (buffer) {
+		error = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
+		       size);
+	}
+	error = size;
+
+cleanup:
+	brelse(bh);
+	return error;
+}
+
+int
+ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
+		     void *buffer, size_t buffer_size)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+	size_t size;
+	void *end;
+	int error;
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		return -ENODATA;
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+	raw_inode = ext4_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	error = ext4_xattr_check_names(entry, end, entry);
+	if (error)
+		goto cleanup;
+	error = ext4_xattr_find_entry(&entry, name_index, name,
+				      end - (void *)entry, 0);
+	if (error)
+		goto cleanup;
+	size = le32_to_cpu(entry->e_value_size);
+	if (buffer) {
+		error = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		memcpy(buffer, (void *)IFIRST(header) +
+		       le16_to_cpu(entry->e_value_offs), size);
+	}
+	error = size;
+
+cleanup:
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * ext4_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+int
+ext4_xattr_get(struct inode *inode, int name_index, const char *name,
+	       void *buffer, size_t buffer_size)
+{
+	int error;
+
+	if (strlen(name) > 255)
+		return -ERANGE;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
+				     buffer_size);
+	if (error == -ENODATA)
+		error = ext4_xattr_block_get(inode, name_index, name, buffer,
+					     buffer_size);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return error;
+}
+
+static int
+ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
+			char *buffer, size_t buffer_size)
+{
+	size_t rest = buffer_size;
+
+	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+		const struct xattr_handler *handler =
+			ext4_xattr_handler(entry->e_name_index);
+
+		if (handler) {
+			size_t size = handler->list(dentry, buffer, rest,
+						    entry->e_name,
+						    entry->e_name_len,
+						    handler->flags);
+			if (buffer) {
+				if (size > rest)
+					return -ERANGE;
+				buffer += size;
+			}
+			rest -= size;
+		}
+	}
+	return buffer_size - rest;
+}
+
+static int
+ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct buffer_head *bh = NULL;
+	int error;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+
+	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
+		  buffer, (long)buffer_size);
+
+	error = 0;
+	if (!EXT4_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %llu",
+		  (unsigned long long)EXT4_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	error = -EIO;
+	if (!bh)
+		goto cleanup;
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+	if (ext4_xattr_check_block(inode, bh)) {
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		goto cleanup;
+	}
+	ext4_xattr_cache_insert(ext4_mb_cache, bh);
+	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
+
+cleanup:
+	brelse(bh);
+
+	return error;
+}
+
+static int
+ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+	void *end;
+	int error;
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		return 0;
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+	raw_inode = ext4_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
+	if (error)
+		goto cleanup;
+	error = ext4_xattr_list_entries(dentry, IFIRST(header),
+					buffer, buffer_size);
+
+cleanup:
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * ext4_xattr_list()
+ *
+ * Copy a list of attribute names into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+static int
+ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	int ret, ret2;
+
+	down_read(&EXT4_I(d_inode(dentry))->xattr_sem);
+	ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
+	if (ret < 0)
+		goto errout;
+	if (buffer) {
+		buffer += ret;
+		buffer_size -= ret;
+	}
+	ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
+	if (ret < 0)
+		goto errout;
+	ret += ret2;
+errout:
+	up_read(&EXT4_I(d_inode(dentry))->xattr_sem);
+	return ret;
+}
+
+/*
+ * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is
+ * not set, set it.
+ */
+static void ext4_xattr_update_super_block(handle_t *handle,
+					  struct super_block *sb)
+{
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
+		return;
+
+	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+	if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
+		EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
+		ext4_handle_dirty_super(handle, sb);
+	}
+}
+
+/*
+ * Release the xattr block BH: If the reference count is > 1, decrement it;
+ * otherwise free the block.
+ */
+static void
+ext4_xattr_release_block(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh)
+{
+	struct mb_cache_entry *ce = NULL;
+	int error = 0;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+
+	ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
+	BUFFER_TRACE(bh, "get_write_access");
+	error = ext4_journal_get_write_access(handle, bh);
+	if (error)
+		goto out;
+
+	lock_buffer(bh);
+	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+		ea_bdebug(bh, "refcount now=0; freeing");
+		if (ce)
+			mb_cache_entry_free(ce);
+		get_bh(bh);
+		unlock_buffer(bh);
+		ext4_free_blocks(handle, inode, bh, 0, 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
+	} else {
+		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+		if (ce)
+			mb_cache_entry_release(ce);
+		/*
+		 * Beware of this ugliness: Releasing of xattr block references
+		 * from different inodes can race and so we have to protect
+		 * from a race where someone else frees the block (and releases
+		 * its journal_head) before we are done dirtying the buffer. In
+		 * nojournal mode this race is harmless and we actually cannot
+		 * call ext4_handle_dirty_xattr_block() with locked buffer as
+		 * that function can call sync_dirty_buffer() so for that case
+		 * we handle the dirtying after unlocking the buffer.
+		 */
+		if (ext4_handle_valid(handle))
+			error = ext4_handle_dirty_xattr_block(handle, inode,
+							      bh);
+		unlock_buffer(bh);
+		if (!ext4_handle_valid(handle))
+			error = ext4_handle_dirty_xattr_block(handle, inode,
+							      bh);
+		if (IS_SYNC(inode))
+			ext4_handle_sync(handle);
+		dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
+		ea_bdebug(bh, "refcount now=%d; releasing",
+			  le32_to_cpu(BHDR(bh)->h_refcount));
+	}
+out:
+	ext4_std_error(inode->i_sb, error);
+	return;
+}
+
+/*
+ * Find the available free space for EAs. This also returns the total number of
+ * bytes used by EA entries.
+ */
+static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
+				    size_t *min_offs, void *base, int *total)
+{
+	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+		if (!last->e_value_block && last->e_value_size) {
+			size_t offs = le16_to_cpu(last->e_value_offs);
+			if (offs < *min_offs)
+				*min_offs = offs;
+		}
+		if (total)
+			*total += EXT4_XATTR_LEN(last->e_name_len);
+	}
+	return (*min_offs - ((void *)last - base) - sizeof(__u32));
+}
+
+static int
+ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+{
+	struct ext4_xattr_entry *last;
+	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+
+	/* Compute min_offs and last. */
+	last = s->first;
+	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+		if (!last->e_value_block && last->e_value_size) {
+			size_t offs = le16_to_cpu(last->e_value_offs);
+			if (offs < min_offs)
+				min_offs = offs;
+		}
+	}
+	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+	if (!s->not_found) {
+		if (!s->here->e_value_block && s->here->e_value_size) {
+			size_t size = le32_to_cpu(s->here->e_value_size);
+			free += EXT4_XATTR_SIZE(size);
+		}
+		free += EXT4_XATTR_LEN(name_len);
+	}
+	if (i->value) {
+		if (free < EXT4_XATTR_LEN(name_len) +
+			   EXT4_XATTR_SIZE(i->value_len))
+			return -ENOSPC;
+	}
+
+	if (i->value && s->not_found) {
+		/* Insert the new name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
+		memmove((void *)s->here + size, s->here, rest);
+		memset(s->here, 0, size);
+		s->here->e_name_index = i->name_index;
+		s->here->e_name_len = name_len;
+		memcpy(s->here->e_name, i->name, name_len);
+	} else {
+		if (!s->here->e_value_block && s->here->e_value_size) {
+			void *first_val = s->base + min_offs;
+			size_t offs = le16_to_cpu(s->here->e_value_offs);
+			void *val = s->base + offs;
+			size_t size = EXT4_XATTR_SIZE(
+				le32_to_cpu(s->here->e_value_size));
+
+			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
+				/* The old and the new value have the same
+				   size. Just replace. */
+				s->here->e_value_size =
+					cpu_to_le32(i->value_len);
+				if (i->value == EXT4_ZERO_XATTR_VALUE) {
+					memset(val, 0, size);
+				} else {
+					/* Clear pad bytes first. */
+					memset(val + size - EXT4_XATTR_PAD, 0,
+					       EXT4_XATTR_PAD);
+					memcpy(val, i->value, i->value_len);
+				}
+				return 0;
+			}
+
+			/* Remove the old value. */
+			memmove(first_val + size, first_val, val - first_val);
+			memset(first_val, 0, size);
+			s->here->e_value_size = 0;
+			s->here->e_value_offs = 0;
+			min_offs += size;
+
+			/* Adjust all value offsets. */
+			last = s->first;
+			while (!IS_LAST_ENTRY(last)) {
+				size_t o = le16_to_cpu(last->e_value_offs);
+				if (!last->e_value_block &&
+				    last->e_value_size && o < offs)
+					last->e_value_offs =
+						cpu_to_le16(o + size);
+				last = EXT4_XATTR_NEXT(last);
+			}
+		}
+		if (!i->value) {
+			/* Remove the old name. */
+			size_t size = EXT4_XATTR_LEN(name_len);
+			last = ENTRY((void *)last - size);
+			memmove(s->here, (void *)s->here + size,
+				(void *)last - (void *)s->here + sizeof(__u32));
+			memset(last, 0, size);
+		}
+	}
+
+	if (i->value) {
+		/* Insert the new value. */
+		s->here->e_value_size = cpu_to_le32(i->value_len);
+		if (i->value_len) {
+			size_t size = EXT4_XATTR_SIZE(i->value_len);
+			void *val = s->base + min_offs - size;
+			s->here->e_value_offs = cpu_to_le16(min_offs - size);
+			if (i->value == EXT4_ZERO_XATTR_VALUE) {
+				memset(val, 0, size);
+			} else {
+				/* Clear the pad bytes first. */
+				memset(val + size - EXT4_XATTR_PAD, 0,
+				       EXT4_XATTR_PAD);
+				memcpy(val, i->value, i->value_len);
+			}
+		}
+	}
+	return 0;
+}
+
+struct ext4_xattr_block_find {
+	struct ext4_xattr_search s;
+	struct buffer_head *bh;
+};
+
+static int
+ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
+		      struct ext4_xattr_block_find *bs)
+{
+	struct super_block *sb = inode->i_sb;
+	int error;
+
+	ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
+		  i->name_index, i->name, i->value, (long)i->value_len);
+
+	if (EXT4_I(inode)->i_file_acl) {
+		/* The inode already has an extended attribute block. */
+		bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		if (!bs->bh)
+			goto cleanup;
+		ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
+			atomic_read(&(bs->bh->b_count)),
+			le32_to_cpu(BHDR(bs->bh)->h_refcount));
+		if (ext4_xattr_check_block(inode, bs->bh)) {
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		/* Find the named attribute. */
+		bs->s.base = BHDR(bs->bh);
+		bs->s.first = BFIRST(bs->bh);
+		bs->s.end = bs->bh->b_data + bs->bh->b_size;
+		bs->s.here = bs->s.first;
+		error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
+					      i->name, bs->bh->b_size, 1);
+		if (error && error != -ENODATA)
+			goto cleanup;
+		bs->s.not_found = error;
+	}
+	error = 0;
+
+cleanup:
+	return error;
+}
+
+static int
+ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+		     struct ext4_xattr_info *i,
+		     struct ext4_xattr_block_find *bs)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *new_bh = NULL;
+	struct ext4_xattr_search *s = &bs->s;
+	struct mb_cache_entry *ce = NULL;
+	int error = 0;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+
+#define header(x) ((struct ext4_xattr_header *)(x))
+
+	if (i->value && i->value_len > sb->s_blocksize)
+		return -ENOSPC;
+	if (s->base) {
+		ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
+					bs->bh->b_blocknr);
+		BUFFER_TRACE(bs->bh, "get_write_access");
+		error = ext4_journal_get_write_access(handle, bs->bh);
+		if (error)
+			goto cleanup;
+		lock_buffer(bs->bh);
+
+		if (header(s->base)->h_refcount == cpu_to_le32(1)) {
+			if (ce) {
+				mb_cache_entry_free(ce);
+				ce = NULL;
+			}
+			ea_bdebug(bs->bh, "modifying in-place");
+			error = ext4_xattr_set_entry(i, s);
+			if (!error) {
+				if (!IS_LAST_ENTRY(s->first))
+					ext4_xattr_rehash(header(s->base),
+							  s->here);
+				ext4_xattr_cache_insert(ext4_mb_cache,
+					bs->bh);
+			}
+			unlock_buffer(bs->bh);
+			if (error == -EIO)
+				goto bad_block;
+			if (!error)
+				error = ext4_handle_dirty_xattr_block(handle,
+								      inode,
+								      bs->bh);
+			if (error)
+				goto cleanup;
+			goto inserted;
+		} else {
+			int offset = (char *)s->here - bs->bh->b_data;
+
+			unlock_buffer(bs->bh);
+			if (ce) {
+				mb_cache_entry_release(ce);
+				ce = NULL;
+			}
+			ea_bdebug(bs->bh, "cloning");
+			s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
+			error = -ENOMEM;
+			if (s->base == NULL)
+				goto cleanup;
+			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
+			s->first = ENTRY(header(s->base)+1);
+			header(s->base)->h_refcount = cpu_to_le32(1);
+			s->here = ENTRY(s->base + offset);
+			s->end = s->base + bs->bh->b_size;
+		}
+	} else {
+		/* Allocate a buffer where we construct the new block. */
+		s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
+		/* assert(header == s->base) */
+		error = -ENOMEM;
+		if (s->base == NULL)
+			goto cleanup;
+		header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+		header(s->base)->h_blocks = cpu_to_le32(1);
+		header(s->base)->h_refcount = cpu_to_le32(1);
+		s->first = ENTRY(header(s->base)+1);
+		s->here = ENTRY(header(s->base)+1);
+		s->end = s->base + sb->s_blocksize;
+	}
+
+	error = ext4_xattr_set_entry(i, s);
+	if (error == -EIO)
+		goto bad_block;
+	if (error)
+		goto cleanup;
+	if (!IS_LAST_ENTRY(s->first))
+		ext4_xattr_rehash(header(s->base), s->here);
+
+inserted:
+	if (!IS_LAST_ENTRY(s->first)) {
+		new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
+		if (new_bh) {
+			/* We found an identical block in the cache. */
+			if (new_bh == bs->bh)
+				ea_bdebug(new_bh, "keeping");
+			else {
+				/* The old block is released after updating
+				   the inode. */
+				error = dquot_alloc_block(inode,
+						EXT4_C2B(EXT4_SB(sb), 1));
+				if (error)
+					goto cleanup;
+				BUFFER_TRACE(new_bh, "get_write_access");
+				error = ext4_journal_get_write_access(handle,
+								      new_bh);
+				if (error)
+					goto cleanup_dquot;
+				lock_buffer(new_bh);
+				le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
+				ea_bdebug(new_bh, "reusing; refcount now=%d",
+					le32_to_cpu(BHDR(new_bh)->h_refcount));
+				unlock_buffer(new_bh);
+				error = ext4_handle_dirty_xattr_block(handle,
+								      inode,
+								      new_bh);
+				if (error)
+					goto cleanup_dquot;
+			}
+			mb_cache_entry_release(ce);
+			ce = NULL;
+		} else if (bs->bh && s->base == bs->bh->b_data) {
+			/* We were modifying this block in-place. */
+			ea_bdebug(bs->bh, "keeping this block");
+			new_bh = bs->bh;
+			get_bh(new_bh);
+		} else {
+			/* We need to allocate a new block */
+			ext4_fsblk_t goal, block;
+
+			goal = ext4_group_first_block_no(sb,
+						EXT4_I(inode)->i_block_group);
+
+			/* non-extent files can't have physical blocks past 2^32 */
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+			block = ext4_new_meta_blocks(handle, inode, goal, 0,
+						     NULL, &error);
+			if (error)
+				goto cleanup;
+
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
+			ea_idebug(inode, "creating block %llu",
+				  (unsigned long long)block);
+
+			new_bh = sb_getblk(sb, block);
+			if (unlikely(!new_bh)) {
+				error = -ENOMEM;
+getblk_failed:
+				ext4_free_blocks(handle, inode, NULL, block, 1,
+						 EXT4_FREE_BLOCKS_METADATA);
+				goto cleanup;
+			}
+			lock_buffer(new_bh);
+			error = ext4_journal_get_create_access(handle, new_bh);
+			if (error) {
+				unlock_buffer(new_bh);
+				error = -EIO;
+				goto getblk_failed;
+			}
+			memcpy(new_bh->b_data, s->base, new_bh->b_size);
+			set_buffer_uptodate(new_bh);
+			unlock_buffer(new_bh);
+			ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
+			error = ext4_handle_dirty_xattr_block(handle,
+							      inode, new_bh);
+			if (error)
+				goto cleanup;
+		}
+	}
+
+	/* Update the inode. */
+	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
+
+	/* Drop the previous xattr block. */
+	if (bs->bh && bs->bh != new_bh)
+		ext4_xattr_release_block(handle, inode, bs->bh);
+	error = 0;
+
+cleanup:
+	if (ce)
+		mb_cache_entry_release(ce);
+	brelse(new_bh);
+	if (!(bs->bh && s->base == bs->bh->b_data))
+		kfree(s->base);
+
+	return error;
+
+cleanup_dquot:
+	dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));
+	goto cleanup;
+
+bad_block:
+	EXT4_ERROR_INODE(inode, "bad block %llu",
+			 EXT4_I(inode)->i_file_acl);
+	goto cleanup;
+
+#undef header
+}
+
+int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+			  struct ext4_xattr_ibody_find *is)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	int error;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return 0;
+	raw_inode = ext4_raw_inode(&is->iloc);
+	header = IHDR(inode, raw_inode);
+	is->s.base = is->s.first = IFIRST(header);
+	is->s.here = is->s.first;
+	is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+		error = ext4_xattr_check_names(IFIRST(header), is->s.end,
+					       IFIRST(header));
+		if (error)
+			return error;
+		/* Find the named attribute. */
+		error = ext4_xattr_find_entry(&is->s.here, i->name_index,
+					      i->name, is->s.end -
+					      (void *)is->s.base, 0);
+		if (error && error != -ENODATA)
+			return error;
+		is->s.not_found = error;
+	}
+	return 0;
+}
+
+int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+				struct ext4_xattr_info *i,
+				struct ext4_xattr_ibody_find *is)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_search *s = &is->s;
+	int error;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return -ENOSPC;
+	error = ext4_xattr_set_entry(i, s);
+	if (error) {
+		if (error == -ENOSPC &&
+		    ext4_has_inline_data(inode)) {
+			error = ext4_try_to_evict_inline_data(handle, inode,
+					EXT4_XATTR_LEN(strlen(i->name) +
+					EXT4_XATTR_SIZE(i->value_len)));
+			if (error)
+				return error;
+			error = ext4_xattr_ibody_find(inode, i, is);
+			if (error)
+				return error;
+			error = ext4_xattr_set_entry(i, s);
+		}
+		if (error)
+			return error;
+	}
+	header = IHDR(inode, ext4_raw_inode(&is->iloc));
+	if (!IS_LAST_ENTRY(s->first)) {
+		header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+	} else {
+		header->h_magic = cpu_to_le32(0);
+		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+	}
+	return 0;
+}
+
+static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+				struct ext4_xattr_info *i,
+				struct ext4_xattr_ibody_find *is)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_search *s = &is->s;
+	int error;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return -ENOSPC;
+	error = ext4_xattr_set_entry(i, s);
+	if (error)
+		return error;
+	header = IHDR(inode, ext4_raw_inode(&is->iloc));
+	if (!IS_LAST_ENTRY(s->first)) {
+		header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+	} else {
+		header->h_magic = cpu_to_le32(0);
+		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+	}
+	return 0;
+}
+
+/*
+ * ext4_xattr_set_handle()
+ *
+ * Create, replace or remove an extended attribute for this inode.  Value
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int
+ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+		      const char *name, const void *value, size_t value_len,
+		      int flags)
+{
+	struct ext4_xattr_info i = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_block_find bs = {
+		.s = { .not_found = -ENODATA, },
+	};
+	unsigned long no_expand;
+	int error;
+
+	if (!name)
+		return -EINVAL;
+	if (strlen(name) > 255)
+		return -ERANGE;
+	down_write(&EXT4_I(inode)->xattr_sem);
+	no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+
+	error = ext4_reserve_inode_write(handle, inode, &is.iloc);
+	if (error)
+		goto cleanup;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
+		struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
+		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+		ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+	}
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto cleanup;
+	if (is.s.not_found)
+		error = ext4_xattr_block_find(inode, &i, &bs);
+	if (error)
+		goto cleanup;
+	if (is.s.not_found && bs.s.not_found) {
+		error = -ENODATA;
+		if (flags & XATTR_REPLACE)
+			goto cleanup;
+		error = 0;
+		if (!value)
+			goto cleanup;
+	} else {
+		error = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto cleanup;
+	}
+	if (!value) {
+		if (!is.s.not_found)
+			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+		else if (!bs.s.not_found)
+			error = ext4_xattr_block_set(handle, inode, &i, &bs);
+	} else {
+		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+		if (!error && !bs.s.not_found) {
+			i.value = NULL;
+			error = ext4_xattr_block_set(handle, inode, &i, &bs);
+		} else if (error == -ENOSPC) {
+			if (EXT4_I(inode)->i_file_acl && !bs.s.base) {
+				error = ext4_xattr_block_find(inode, &i, &bs);
+				if (error)
+					goto cleanup;
+			}
+			error = ext4_xattr_block_set(handle, inode, &i, &bs);
+			if (error)
+				goto cleanup;
+			if (!is.s.not_found) {
+				i.value = NULL;
+				error = ext4_xattr_ibody_set(handle, inode, &i,
+							     &is);
+			}
+		}
+	}
+	if (!error) {
+		ext4_xattr_update_super_block(handle, inode->i_sb);
+		inode->i_ctime = ext4_current_time(inode);
+		if (!value)
+			ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+		/*
+		 * The bh is consumed by ext4_mark_iloc_dirty, even with
+		 * error != 0.
+		 */
+		is.iloc.bh = NULL;
+		if (IS_SYNC(inode))
+			ext4_handle_sync(handle);
+	}
+
+cleanup:
+	brelse(is.iloc.bh);
+	brelse(bs.bh);
+	if (no_expand == 0)
+		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	return error;
+}
+
+/*
+ * ext4_xattr_set()
+ *
+ * Like ext4_xattr_set_handle, but start from an inode. This extended
+ * attribute modification is a filesystem transaction by itself.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int
+ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+	       const void *value, size_t value_len, int flags)
+{
+	handle_t *handle;
+	int error, retries = 0;
+	int credits = ext4_jbd2_credits_xattr(inode);
+
+retry:
+	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
+	if (IS_ERR(handle)) {
+		error = PTR_ERR(handle);
+	} else {
+		int error2;
+
+		error = ext4_xattr_set_handle(handle, inode, name_index, name,
+					      value, value_len, flags);
+		error2 = ext4_journal_stop(handle);
+		if (error == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+		if (error == 0)
+			error = error2;
+	}
+
+	return error;
+}
+
+/*
+ * Shift the EA entries in the inode to create space for the increased
+ * i_extra_isize.
+ */
+static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
+				     int value_offs_shift, void *to,
+				     void *from, size_t n, int blocksize)
+{
+	struct ext4_xattr_entry *last = entry;
+	int new_offs;
+
+	/* Adjust the value offsets of the entries */
+	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+		if (!last->e_value_block && last->e_value_size) {
+			new_offs = le16_to_cpu(last->e_value_offs) +
+							value_offs_shift;
+			BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
+				 > blocksize);
+			last->e_value_offs = cpu_to_le16(new_offs);
+		}
+	}
+	/* Shift the entries by n bytes */
+	memmove(to, from, n);
+}
+
+/*
+ * Expand an inode by new_extra_isize bytes when EAs are present.
+ * Returns 0 on success or negative error number on failure.
+ */
+int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			       struct ext4_inode *raw_inode, handle_t *handle)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry, *last, *first;
+	struct buffer_head *bh = NULL;
+	struct ext4_xattr_ibody_find *is = NULL;
+	struct ext4_xattr_block_find *bs = NULL;
+	char *buffer = NULL, *b_entry_name = NULL;
+	size_t min_offs, free;
+	int total_ino;
+	void *base, *start, *end;
+	int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
+	int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+retry:
+	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
+		up_write(&EXT4_I(inode)->xattr_sem);
+		return 0;
+	}
+
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+
+	/*
+	 * Check if enough free space is available in the inode to shift the
+	 * entries ahead by new_extra_isize.
+	 */
+
+	base = start = entry;
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	min_offs = end - base;
+	last = entry;
+	total_ino = sizeof(struct ext4_xattr_ibody_header);
+
+	free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
+	if (free >= new_extra_isize) {
+		entry = IFIRST(header);
+		ext4_xattr_shift_entries(entry,	EXT4_I(inode)->i_extra_isize
+				- new_extra_isize, (void *)raw_inode +
+				EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+				(void *)header, total_ino,
+				inode->i_sb->s_blocksize);
+		EXT4_I(inode)->i_extra_isize = new_extra_isize;
+		error = 0;
+		goto cleanup;
+	}
+
+	/*
+	 * Enough free space isn't available in the inode, check if
+	 * EA block can hold new_extra_isize bytes.
+	 */
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		if (!bh)
+			goto cleanup;
+		if (ext4_xattr_check_block(inode, bh)) {
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		base = BHDR(bh);
+		first = BFIRST(bh);
+		end = bh->b_data + bh->b_size;
+		min_offs = end - base;
+		free = ext4_xattr_free_space(first, &min_offs, base, NULL);
+		if (free < new_extra_isize) {
+			if (!tried_min_extra_isize && s_min_extra_isize) {
+				tried_min_extra_isize++;
+				new_extra_isize = s_min_extra_isize;
+				brelse(bh);
+				goto retry;
+			}
+			error = -1;
+			goto cleanup;
+		}
+	} else {
+		free = inode->i_sb->s_blocksize;
+	}
+
+	while (new_extra_isize > 0) {
+		size_t offs, size, entry_size;
+		struct ext4_xattr_entry *small_entry = NULL;
+		struct ext4_xattr_info i = {
+			.value = NULL,
+			.value_len = 0,
+		};
+		unsigned int total_size;  /* EA entry size + value size */
+		unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
+		unsigned int min_total_size = ~0U;
+
+		is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+		bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
+		if (!is || !bs) {
+			error = -ENOMEM;
+			goto cleanup;
+		}
+
+		is->s.not_found = -ENODATA;
+		bs->s.not_found = -ENODATA;
+		is->iloc.bh = NULL;
+		bs->bh = NULL;
+
+		last = IFIRST(header);
+		/* Find the entry best suited to be pushed into EA block */
+		entry = NULL;
+		for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+			total_size =
+			EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
+					EXT4_XATTR_LEN(last->e_name_len);
+			if (total_size <= free && total_size < min_total_size) {
+				if (total_size < new_extra_isize) {
+					small_entry = last;
+				} else {
+					entry = last;
+					min_total_size = total_size;
+				}
+			}
+		}
+
+		if (entry == NULL) {
+			if (small_entry) {
+				entry = small_entry;
+			} else {
+				if (!tried_min_extra_isize &&
+				    s_min_extra_isize) {
+					tried_min_extra_isize++;
+					new_extra_isize = s_min_extra_isize;
+					kfree(is); is = NULL;
+					kfree(bs); bs = NULL;
+					brelse(bh);
+					goto retry;
+				}
+				error = -1;
+				goto cleanup;
+			}
+		}
+		offs = le16_to_cpu(entry->e_value_offs);
+		size = le32_to_cpu(entry->e_value_size);
+		entry_size = EXT4_XATTR_LEN(entry->e_name_len);
+		i.name_index = entry->e_name_index,
+		buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
+		b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
+		if (!buffer || !b_entry_name) {
+			error = -ENOMEM;
+			goto cleanup;
+		}
+		/* Save the entry name and the entry value */
+		memcpy(buffer, (void *)IFIRST(header) + offs,
+		       EXT4_XATTR_SIZE(size));
+		memcpy(b_entry_name, entry->e_name, entry->e_name_len);
+		b_entry_name[entry->e_name_len] = '\0';
+		i.name = b_entry_name;
+
+		error = ext4_get_inode_loc(inode, &is->iloc);
+		if (error)
+			goto cleanup;
+
+		error = ext4_xattr_ibody_find(inode, &i, is);
+		if (error)
+			goto cleanup;
+
+		/* Remove the chosen entry from the inode */
+		error = ext4_xattr_ibody_set(handle, inode, &i, is);
+		if (error)
+			goto cleanup;
+
+		entry = IFIRST(header);
+		if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
+			shift_bytes = new_extra_isize;
+		else
+			shift_bytes = entry_size + size;
+		/* Adjust the offsets and shift the remaining entries ahead */
+		ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
+			shift_bytes, (void *)raw_inode +
+			EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
+			(void *)header, total_ino - entry_size,
+			inode->i_sb->s_blocksize);
+
+		extra_isize += shift_bytes;
+		new_extra_isize -= shift_bytes;
+		EXT4_I(inode)->i_extra_isize = extra_isize;
+
+		i.name = b_entry_name;
+		i.value = buffer;
+		i.value_len = size;
+		error = ext4_xattr_block_find(inode, &i, bs);
+		if (error)
+			goto cleanup;
+
+		/* Add entry which was removed from the inode into the block */
+		error = ext4_xattr_block_set(handle, inode, &i, bs);
+		if (error)
+			goto cleanup;
+		kfree(b_entry_name);
+		kfree(buffer);
+		b_entry_name = NULL;
+		buffer = NULL;
+		brelse(is->iloc.bh);
+		kfree(is);
+		kfree(bs);
+	}
+	brelse(bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	return 0;
+
+cleanup:
+	kfree(b_entry_name);
+	kfree(buffer);
+	if (is)
+		brelse(is->iloc.bh);
+	kfree(is);
+	kfree(bs);
+	brelse(bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	return error;
+}
+
+
+
+/*
+ * ext4_xattr_delete_inode()
+ *
+ * Free extended attribute resources associated with this inode. This
+ * is called immediately before an inode is freed. We have exclusive
+ * access to the inode.
+ */
+void
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+	struct buffer_head *bh = NULL;
+
+	if (!EXT4_I(inode)->i_file_acl)
+		goto cleanup;
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh) {
+		EXT4_ERROR_INODE(inode, "block %llu read error",
+				 EXT4_I(inode)->i_file_acl);
+		goto cleanup;
+	}
+	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
+	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
+		goto cleanup;
+	}
+	ext4_xattr_release_block(handle, inode, bh);
+	EXT4_I(inode)->i_file_acl = 0;
+
+cleanup:
+	brelse(bh);
+}
+
+/*
+ * ext4_xattr_put_super()
+ *
+ * This is called when a file system is unmounted.
+ */
+void
+ext4_xattr_put_super(struct super_block *sb)
+{
+	mb_cache_shrink(sb->s_bdev);
+}
+
+/*
+ * ext4_xattr_cache_insert()
+ *
+ * Create a new entry in the extended attribute cache, and insert
+ * it unless such an entry is already in the cache.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+static void
+ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
+{
+	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+	struct mb_cache_entry *ce;
+	int error;
+
+	ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
+	if (!ce) {
+		ea_bdebug(bh, "out of memory");
+		return;
+	}
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+	if (error) {
+		mb_cache_entry_free(ce);
+		if (error == -EBUSY) {
+			ea_bdebug(bh, "already in cache");
+			error = 0;
+		}
+	} else {
+		ea_bdebug(bh, "inserting [%x]", (int)hash);
+		mb_cache_entry_release(ce);
+	}
+}
+
+/*
+ * ext4_xattr_cmp()
+ *
+ * Compare two extended attribute blocks for equality.
+ *
+ * Returns 0 if the blocks are equal, 1 if they differ, and
+ * a negative error number on errors.
+ */
+static int
+ext4_xattr_cmp(struct ext4_xattr_header *header1,
+	       struct ext4_xattr_header *header2)
+{
+	struct ext4_xattr_entry *entry1, *entry2;
+
+	entry1 = ENTRY(header1+1);
+	entry2 = ENTRY(header2+1);
+	while (!IS_LAST_ENTRY(entry1)) {
+		if (IS_LAST_ENTRY(entry2))
+			return 1;
+		if (entry1->e_hash != entry2->e_hash ||
+		    entry1->e_name_index != entry2->e_name_index ||
+		    entry1->e_name_len != entry2->e_name_len ||
+		    entry1->e_value_size != entry2->e_value_size ||
+		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
+			return 1;
+		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
+			return -EIO;
+		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
+			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
+			   le32_to_cpu(entry1->e_value_size)))
+			return 1;
+
+		entry1 = EXT4_XATTR_NEXT(entry1);
+		entry2 = EXT4_XATTR_NEXT(entry2);
+	}
+	if (!IS_LAST_ENTRY(entry2))
+		return 1;
+	return 0;
+}
+
+/*
+ * ext4_xattr_cache_find()
+ *
+ * Find an identical extended attribute block.
+ *
+ * Returns a pointer to the block found, or NULL if such a block was
+ * not found or an error occurred.
+ */
+static struct buffer_head *
+ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
+		      struct mb_cache_entry **pce)
+{
+	__u32 hash = le32_to_cpu(header->h_hash);
+	struct mb_cache_entry *ce;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+
+	if (!header->h_hash)
+		return NULL;  /* never share */
+	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
+again:
+	ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
+				       hash);
+	while (ce) {
+		struct buffer_head *bh;
+
+		if (IS_ERR(ce)) {
+			if (PTR_ERR(ce) == -EAGAIN)
+				goto again;
+			break;
+		}
+		bh = sb_bread(inode->i_sb, ce->e_block);
+		if (!bh) {
+			EXT4_ERROR_INODE(inode, "block %lu read error",
+					 (unsigned long) ce->e_block);
+		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
+				EXT4_XATTR_REFCOUNT_MAX) {
+			ea_idebug(inode, "block %lu refcount %d>=%d",
+				  (unsigned long) ce->e_block,
+				  le32_to_cpu(BHDR(bh)->h_refcount),
+					  EXT4_XATTR_REFCOUNT_MAX);
+		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
+			*pce = ce;
+			return bh;
+		}
+		brelse(bh);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+	}
+	return NULL;
+}
+
+#define NAME_HASH_SHIFT 5
+#define VALUE_HASH_SHIFT 16
+
+/*
+ * ext4_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
+					 struct ext4_xattr_entry *entry)
+{
+	__u32 hash = 0;
+	char *name = entry->e_name;
+	int n;
+
+	for (n = 0; n < entry->e_name_len; n++) {
+		hash = (hash << NAME_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
+		       *name++;
+	}
+
+	if (entry->e_value_block == 0 && entry->e_value_size != 0) {
+		__le32 *value = (__le32 *)((char *)header +
+			le16_to_cpu(entry->e_value_offs));
+		for (n = (le32_to_cpu(entry->e_value_size) +
+		     EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) {
+			hash = (hash << VALUE_HASH_SHIFT) ^
+			       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+			       le32_to_cpu(*value++);
+		}
+	}
+	entry->e_hash = cpu_to_le32(hash);
+}
+
+#undef NAME_HASH_SHIFT
+#undef VALUE_HASH_SHIFT
+
+#define BLOCK_HASH_SHIFT 16
+
+/*
+ * ext4_xattr_rehash()
+ *
+ * Re-compute the extended attribute hash value after an entry has changed.
+ */
+static void ext4_xattr_rehash(struct ext4_xattr_header *header,
+			      struct ext4_xattr_entry *entry)
+{
+	struct ext4_xattr_entry *here;
+	__u32 hash = 0;
+
+	ext4_xattr_hash_entry(header, entry);
+	here = ENTRY(header+1);
+	while (!IS_LAST_ENTRY(here)) {
+		if (!here->e_hash) {
+			/* Block is not shared if an entry's hash value == 0 */
+			hash = 0;
+			break;
+		}
+		hash = (hash << BLOCK_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
+		       le32_to_cpu(here->e_hash);
+		here = EXT4_XATTR_NEXT(here);
+	}
+	header->h_hash = cpu_to_le32(hash);
+}
+
+#undef BLOCK_HASH_SHIFT
+
+#define	HASH_BUCKET_BITS	10
+
+struct mb_cache *
+ext4_xattr_create_cache(char *name)
+{
+	return mb_cache_create(name, HASH_BUCKET_BITS);
+}
+
+void ext4_xattr_destroy_cache(struct mb_cache *cache)
+{
+	if (cache)
+		mb_cache_destroy(cache);
+}
+
diff --git a/kernel/fs/ext4/xattr.h b/kernel/fs/ext4/xattr.h
new file mode 100644
index 000000000..ddc095776
--- /dev/null
+++ b/kernel/fs/ext4/xattr.h
@@ -0,0 +1,139 @@
+/*
+  File: fs/ext4/xattr.h
+
+  On-disk format of extended attributes for the ext4 filesystem.
+
+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define EXT4_XATTR_MAGIC		0xEA020000
+
+/* Maximum number of references to one attribute block */
+#define EXT4_XATTR_REFCOUNT_MAX		1024
+
+/* Name indexes */
+#define EXT4_XATTR_INDEX_USER			1
+#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS	2
+#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT	3
+#define EXT4_XATTR_INDEX_TRUSTED		4
+#define	EXT4_XATTR_INDEX_LUSTRE			5
+#define EXT4_XATTR_INDEX_SECURITY	        6
+#define EXT4_XATTR_INDEX_SYSTEM			7
+#define EXT4_XATTR_INDEX_RICHACL		8
+#define EXT4_XATTR_INDEX_ENCRYPTION		9
+
+struct ext4_xattr_header {
+	__le32	h_magic;	/* magic number for identification */
+	__le32	h_refcount;	/* reference count */
+	__le32	h_blocks;	/* number of disk blocks used */
+	__le32	h_hash;		/* hash value of all attributes */
+	__le32	h_checksum;	/* crc32c(uuid+id+xattrblock) */
+				/* id = inum if refcount=1, blknum otherwise */
+	__u32	h_reserved[3];	/* zero right now */
+};
+
+struct ext4_xattr_ibody_header {
+	__le32	h_magic;	/* magic number for identification */
+};
+
+struct ext4_xattr_entry {
+	__u8	e_name_len;	/* length of name */
+	__u8	e_name_index;	/* attribute name index */
+	__le16	e_value_offs;	/* offset in disk block of value */
+	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
+	__le32	e_value_size;	/* size of attribute value */
+	__le32	e_hash;		/* hash value of name and value */
+	char	e_name[0];	/* attribute name */
+};
+
+#define EXT4_XATTR_PAD_BITS		2
+#define EXT4_XATTR_PAD		(1<<EXT4_XATTR_PAD_BITS)
+#define EXT4_XATTR_ROUND		(EXT4_XATTR_PAD-1)
+#define EXT4_XATTR_LEN(name_len) \
+	(((name_len) + EXT4_XATTR_ROUND + \
+	sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
+#define EXT4_XATTR_NEXT(entry) \
+	((struct ext4_xattr_entry *)( \
+	 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
+#define EXT4_XATTR_SIZE(size) \
+	(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
+
+#define IHDR(inode, raw_inode) \
+	((struct ext4_xattr_ibody_header *) \
+		((void *)raw_inode + \
+		EXT4_GOOD_OLD_INODE_SIZE + \
+		EXT4_I(inode)->i_extra_isize))
+#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
+
+#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+
+struct ext4_xattr_info {
+	int name_index;
+	const char *name;
+	const void *value;
+	size_t value_len;
+};
+
+struct ext4_xattr_search {
+	struct ext4_xattr_entry *first;
+	void *base;
+	void *end;
+	struct ext4_xattr_entry *here;
+	int not_found;
+};
+
+struct ext4_xattr_ibody_find {
+	struct ext4_xattr_search s;
+	struct ext4_iloc iloc;
+};
+
+extern const struct xattr_handler ext4_xattr_user_handler;
+extern const struct xattr_handler ext4_xattr_trusted_handler;
+extern const struct xattr_handler ext4_xattr_security_handler;
+
+#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
+
+extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
+
+extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
+extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+
+extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
+extern void ext4_xattr_put_super(struct super_block *);
+
+extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			    struct ext4_inode *raw_inode, handle_t *handle);
+
+extern const struct xattr_handler *ext4_xattr_handlers[];
+
+extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+				 struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
+				const char *name,
+				void *buffer, size_t buffer_size);
+extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+				       struct ext4_xattr_info *i,
+				       struct ext4_xattr_ibody_find *is);
+
+extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern void ext4_xattr_destroy_cache(struct mb_cache *);
+
+#ifdef CONFIG_EXT4_FS_SECURITY
+extern int ext4_init_security(handle_t *handle, struct inode *inode,
+			      struct inode *dir, const struct qstr *qstr);
+#else
+static inline int ext4_init_security(handle_t *handle, struct inode *inode,
+				     struct inode *dir, const struct qstr *qstr)
+{
+	return 0;
+}
+#endif
diff --git a/kernel/fs/ext4/xattr_security.c b/kernel/fs/ext4/xattr_security.c
new file mode 100644
index 000000000..95d90e056
--- /dev/null
+++ b/kernel/fs/ext4/xattr_security.c
@@ -0,0 +1,82 @@
+/*
+ * linux/fs/ext4/xattr_security.c
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+
+static size_t
+ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+	const size_t total_len = prefix_len + name_len + 1;
+
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext4_xattr_security_get(struct dentry *dentry, const char *name,
+		       void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+			      name, buffer, size);
+}
+
+static int
+ext4_xattr_security_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+			      name, value, size, flags);
+}
+
+static int
+ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		void *fs_info)
+{
+	const struct xattr *xattr;
+	handle_t *handle = fs_info;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ext4_xattr_set_handle(handle, inode,
+					    EXT4_XATTR_INDEX_SECURITY,
+					    xattr->name, xattr->value,
+					    xattr->value_len, 0);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &ext4_initxattrs, handle);
+}
+
+const struct xattr_handler ext4_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= ext4_xattr_security_list,
+	.get	= ext4_xattr_security_get,
+	.set	= ext4_xattr_security_set,
+};
diff --git a/kernel/fs/ext4/xattr_trusted.c b/kernel/fs/ext4/xattr_trusted.c
new file mode 100644
index 000000000..891ee2ddf
--- /dev/null
+++ b/kernel/fs/ext4/xattr_trusted.c
@@ -0,0 +1,58 @@
+/*
+ * linux/fs/ext4/xattr_trusted.c
+ * Handler for trusted extended attributes.
+ *
+ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/string.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+
+static size_t
+ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
+		size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+			      name, buffer, size);
+}
+
+static int
+ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+			      name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ext4_xattr_trusted_list,
+	.get	= ext4_xattr_trusted_get,
+	.set	= ext4_xattr_trusted_set,
+};
diff --git a/kernel/fs/ext4/xattr_user.c b/kernel/fs/ext4/xattr_user.c
new file mode 100644
index 000000000..6ed932b3c
--- /dev/null
+++ b/kernel/fs/ext4/xattr_user.c
@@ -0,0 +1,61 @@
+/*
+ * linux/fs/ext4/xattr_user.c
+ * Handler for extended user attributes.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/string.h>
+#include <linux/fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+
+static size_t
+ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+		     const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext4_xattr_user_get(struct dentry *dentry, const char *name,
+		    void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+			      name, buffer, size);
+}
+
+static int
+ext4_xattr_user_set(struct dentry *dentry, const char *name,
+		    const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+			      name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ext4_xattr_user_list,
+	.get	= ext4_xattr_user_get,
+	.set	= ext4_xattr_user_set,
+};
diff --git a/kernel/fs/f2fs/Kconfig b/kernel/fs/f2fs/Kconfig
new file mode 100644
index 000000000..05f0f663f
--- /dev/null
+++ b/kernel/fs/f2fs/Kconfig
@@ -0,0 +1,83 @@
+config F2FS_FS
+	tristate "F2FS filesystem support"
+	depends on BLOCK
+	help
+	  F2FS is based on Log-structured File System (LFS), which supports
+	  versatile "flash-friendly" features. The design has been focused on
+	  addressing the fundamental issues in LFS, which are snowball effect
+	  of wandering tree and high cleaning overhead.
+
+	  Since flash-based storages show different characteristics according to
+	  the internal geometry or flash memory management schemes aka FTL, F2FS
+	  and tools support various parameters not only for configuring on-disk
+	  layout, but also for selecting allocation and cleaning algorithms.
+
+	  If unsure, say N.
+
+config F2FS_STAT_FS
+	bool "F2FS Status Information"
+	depends on F2FS_FS && DEBUG_FS
+	default y
+	help
+	  /sys/kernel/debug/f2fs/ contains information about all the partitions
+	  mounted as f2fs. Each file shows the whole f2fs information.
+
+	  /sys/kernel/debug/f2fs/status includes:
+	    - major filesystem information managed by f2fs currently
+	    - average SIT information about whole segments
+	    - current memory footprint consumed by f2fs.
+
+config F2FS_FS_XATTR
+	bool "F2FS extended attributes"
+	depends on F2FS_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config F2FS_FS_POSIX_ACL
+	bool "F2FS Access Control Lists"
+	depends on F2FS_FS_XATTR
+	select FS_POSIX_ACL
+	default y
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  gourps beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config F2FS_FS_SECURITY
+	bool "F2FS Security Labels"
+	depends on F2FS_FS_XATTR
+	help
+	  Security labels provide an access control facility to support Linux
+	  Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+	  Linux. This option enables an extended attribute handler for file
+	  security labels in the f2fs filesystem, so that it requires enabling
+	  the extended attribute support in advance.
+
+	  If you are not using a security module, say N.
+
+config F2FS_CHECK_FS
+	bool "F2FS consistency checking feature"
+	depends on F2FS_FS
+	help
+	  Enables BUG_ONs which check the filesystem consistency in runtime.
+
+	  If you want to improve the performance, say N.
+
+config F2FS_IO_TRACE
+	bool "F2FS IO tracer"
+	depends on F2FS_FS
+	depends on FUNCTION_TRACER
+	help
+	  F2FS IO trace is based on a function trace, which gathers process
+	  information and block IO patterns in the filesystem level.
+
+	  If unsure, say N.
diff --git a/kernel/fs/f2fs/Makefile b/kernel/fs/f2fs/Makefile
new file mode 100644
index 000000000..d92397731
--- /dev/null
+++ b/kernel/fs/f2fs/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_F2FS_FS) += f2fs.o
+
+f2fs-y		:= dir.o file.o inode.o namei.o hash.o super.o inline.o
+f2fs-y		+= checkpoint.o gc.o data.o node.o segment.o recovery.o
+f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
+f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
+f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
+f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
diff --git a/kernel/fs/f2fs/acl.c b/kernel/fs/f2fs/acl.c
new file mode 100644
index 000000000..4320ffab3
--- /dev/null
+++ b/kernel/fs/f2fs/acl.c
@@ -0,0 +1,410 @@
+/*
+ * fs/f2fs/acl.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+static inline size_t f2fs_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(struct f2fs_acl_header) +
+			count * sizeof(struct f2fs_acl_entry_short);
+	} else {
+		return sizeof(struct f2fs_acl_header) +
+			4 * sizeof(struct f2fs_acl_entry_short) +
+			(count - 4) * sizeof(struct f2fs_acl_entry);
+	}
+}
+
+static inline int f2fs_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(struct f2fs_acl_header);
+	s = size - 4 * sizeof(struct f2fs_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(struct f2fs_acl_entry_short))
+			return -1;
+		return size / sizeof(struct f2fs_acl_entry_short);
+	} else {
+		if (s % sizeof(struct f2fs_acl_entry))
+			return -1;
+		return s / sizeof(struct f2fs_acl_entry) + 4;
+	}
+}
+
+static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
+{
+	int i, count;
+	struct posix_acl *acl;
+	struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value;
+	struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1);
+	const char *end = value + size;
+
+	if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
+		return ERR_PTR(-EINVAL);
+
+	count = f2fs_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < count; i++) {
+
+		if ((char *)entry > end)
+			goto fail;
+
+		acl->a_entries[i].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm);
+
+		switch (acl->a_entries[i].e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			entry = (struct f2fs_acl_entry *)((char *)entry +
+					sizeof(struct f2fs_acl_entry_short));
+			break;
+
+		case ACL_USER:
+			acl->a_entries[i].e_uid =
+				make_kuid(&init_user_ns,
+						le32_to_cpu(entry->e_id));
+			entry = (struct f2fs_acl_entry *)((char *)entry +
+					sizeof(struct f2fs_acl_entry));
+			break;
+		case ACL_GROUP:
+			acl->a_entries[i].e_gid =
+				make_kgid(&init_user_ns,
+						le32_to_cpu(entry->e_id));
+			entry = (struct f2fs_acl_entry *)((char *)entry +
+					sizeof(struct f2fs_acl_entry));
+			break;
+		default:
+			goto fail;
+		}
+	}
+	if ((char *)entry != end)
+		goto fail;
+	return acl;
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+	struct f2fs_acl_header *f2fs_acl;
+	struct f2fs_acl_entry *entry;
+	int i;
+
+	f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
+			sizeof(struct f2fs_acl_entry), GFP_NOFS);
+	if (!f2fs_acl)
+		return ERR_PTR(-ENOMEM);
+
+	f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION);
+	entry = (struct f2fs_acl_entry *)(f2fs_acl + 1);
+
+	for (i = 0; i < acl->a_count; i++) {
+
+		entry->e_tag  = cpu_to_le16(acl->a_entries[i].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm);
+
+		switch (acl->a_entries[i].e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+					from_kuid(&init_user_ns,
+						acl->a_entries[i].e_uid));
+			entry = (struct f2fs_acl_entry *)((char *)entry +
+					sizeof(struct f2fs_acl_entry));
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+					from_kgid(&init_user_ns,
+						acl->a_entries[i].e_gid));
+			entry = (struct f2fs_acl_entry *)((char *)entry +
+					sizeof(struct f2fs_acl_entry));
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			entry = (struct f2fs_acl_entry *)((char *)entry +
+					sizeof(struct f2fs_acl_entry_short));
+			break;
+		default:
+			goto fail;
+		}
+	}
+	*size = f2fs_acl_size(acl->a_count);
+	return (void *)f2fs_acl;
+
+fail:
+	kfree(f2fs_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
+						struct page *dpage)
+{
+	int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+	void *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	if (type == ACL_TYPE_ACCESS)
+		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
+
+	retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_F2FS_ZERO);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = f2fs_getxattr(inode, name_index, "", value,
+							retval, dpage);
+	}
+
+	if (retval > 0)
+		acl = f2fs_acl_from_disk(value, retval);
+	else if (retval == -ENODATA)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+{
+	return __f2fs_get_acl(inode, type, NULL);
+}
+
+static int __f2fs_set_acl(struct inode *inode, int type,
+			struct posix_acl *acl, struct page *ipage)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int error;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			error = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (error < 0)
+				return error;
+			set_acl_inode(fi, inode->i_mode);
+			if (error == 0)
+				acl = NULL;
+		}
+		break;
+
+	case ACL_TYPE_DEFAULT:
+		name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = f2fs_acl_to_disk(acl, &size);
+		if (IS_ERR(value)) {
+			clear_inode_flag(fi, FI_ACL_MODE);
+			return (int)PTR_ERR(value);
+		}
+	}
+
+	error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
+
+	kfree(value);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
+	clear_inode_flag(fi, FI_ACL_MODE);
+	return error;
+}
+
+int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	return __f2fs_set_acl(inode, type, acl, NULL);
+}
+
+/*
+ * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create
+ * are copied from posix_acl.c
+ */
+static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl,
+							gfp_t flags)
+{
+	struct posix_acl *clone = NULL;
+
+	if (acl) {
+		int size = sizeof(struct posix_acl) + acl->a_count *
+				sizeof(struct posix_acl_entry);
+		clone = kmemdup(acl, size, flags);
+		if (clone)
+			atomic_set(&clone->a_refcount, 1);
+	}
+	return clone;
+}
+
+static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
+{
+	struct posix_acl_entry *pa, *pe;
+	struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
+	umode_t mode = *mode_p;
+	int not_equiv = 0;
+
+	/* assert(atomic_read(acl->a_refcount) == 1); */
+
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+		switch(pa->e_tag) {
+		case ACL_USER_OBJ:
+			pa->e_perm &= (mode >> 6) | ~S_IRWXO;
+			mode &= (pa->e_perm << 6) | ~S_IRWXU;
+			break;
+
+		case ACL_USER:
+		case ACL_GROUP:
+			not_equiv = 1;
+			break;
+
+		case ACL_GROUP_OBJ:
+			group_obj = pa;
+			break;
+
+		case ACL_OTHER:
+			pa->e_perm &= mode | ~S_IRWXO;
+			mode &= pa->e_perm | ~S_IRWXO;
+			break;
+
+		case ACL_MASK:
+			mask_obj = pa;
+			not_equiv = 1;
+			break;
+
+		default:
+			return -EIO;
+		}
+	}
+
+	if (mask_obj) {
+		mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
+		mode &= (mask_obj->e_perm << 3) | ~S_IRWXG;
+	} else {
+		if (!group_obj)
+			return -EIO;
+		group_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
+		mode &= (group_obj->e_perm << 3) | ~S_IRWXG;
+	}
+
+	*mode_p = (*mode_p & ~S_IRWXUGO) | mode;
+        return not_equiv;
+}
+
+static int f2fs_acl_create(struct inode *dir, umode_t *mode,
+		struct posix_acl **default_acl, struct posix_acl **acl,
+		struct page *dpage)
+{
+	struct posix_acl *p;
+	int ret;
+
+	if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
+		goto no_acl;
+
+	p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage);
+	if (IS_ERR(p)) {
+		if (p == ERR_PTR(-EOPNOTSUPP))
+			goto apply_umask;
+		return PTR_ERR(p);
+	}
+
+	if (!p)
+		goto apply_umask;
+
+	*acl = f2fs_acl_clone(p, GFP_NOFS);
+	if (!*acl)
+		goto no_mem;
+
+	ret = f2fs_acl_create_masq(*acl, mode);
+	if (ret < 0)
+		goto no_mem_clone;
+
+	if (ret == 0) {
+		posix_acl_release(*acl);
+		*acl = NULL;
+	}
+
+	if (!S_ISDIR(*mode)) {
+		posix_acl_release(p);
+		*default_acl = NULL;
+	} else {
+		*default_acl = p;
+	}
+	return 0;
+
+apply_umask:
+	*mode &= ~current_umask();
+no_acl:
+	*default_acl = NULL;
+	*acl = NULL;
+	return 0;
+
+no_mem_clone:
+	posix_acl_release(*acl);
+no_mem:
+	posix_acl_release(p);
+	return -ENOMEM;
+}
+
+int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
+							struct page *dpage)
+{
+	struct posix_acl *default_acl = NULL, *acl = NULL;
+	int error = 0;
+
+	error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage);
+	if (error)
+		return error;
+
+	if (default_acl) {
+		error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
+				       ipage);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
+					       ipage);
+		posix_acl_release(acl);
+	}
+
+	return error;
+}
diff --git a/kernel/fs/f2fs/acl.h b/kernel/fs/f2fs/acl.h
new file mode 100644
index 000000000..997ca8edb
--- /dev/null
+++ b/kernel/fs/f2fs/acl.h
@@ -0,0 +1,54 @@
+/*
+ * fs/f2fs/acl.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.h
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_ACL_H__
+#define __F2FS_ACL_H__
+
+#include <linux/posix_acl_xattr.h>
+
+#define F2FS_ACL_VERSION	0x0001
+
+struct f2fs_acl_entry {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+};
+
+struct f2fs_acl_entry_short {
+	__le16 e_tag;
+	__le16 e_perm;
+};
+
+struct f2fs_acl_header {
+	__le32 a_version;
+};
+
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+
+extern struct posix_acl *f2fs_get_acl(struct inode *, int);
+extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
+							struct page *);
+#else
+#define f2fs_check_acl	NULL
+#define f2fs_get_acl	NULL
+#define f2fs_set_acl	NULL
+
+static inline int f2fs_init_acl(struct inode *inode, struct inode *dir,
+				struct page *ipage, struct page *dpage)
+{
+	return 0;
+}
+#endif
+#endif /* __F2FS_ACL_H__ */
diff --git a/kernel/fs/f2fs/checkpoint.c b/kernel/fs/f2fs/checkpoint.c
new file mode 100644
index 000000000..a5e17a2a0
--- /dev/null
+++ b/kernel/fs/f2fs/checkpoint.c
@@ -0,0 +1,1135 @@
+/*
+ * fs/f2fs/checkpoint.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "trace.h"
+#include <trace/events/f2fs.h>
+
+static struct kmem_cache *ino_entry_slab;
+struct kmem_cache *inode_entry_slab;
+
+/*
+ * We guarantee no failure on the returned page.
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	struct address_space *mapping = META_MAPPING(sbi);
+	struct page *page = NULL;
+repeat:
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		cond_resched();
+		goto repeat;
+	}
+	f2fs_wait_on_page_writeback(page, META);
+	SetPageUptodate(page);
+	return page;
+}
+
+/*
+ * We guarantee no failure on the returned page.
+ */
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	struct address_space *mapping = META_MAPPING(sbi);
+	struct page *page;
+	struct f2fs_io_info fio = {
+		.type = META,
+		.rw = READ_SYNC | REQ_META | REQ_PRIO,
+		.blk_addr = index,
+	};
+repeat:
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		cond_resched();
+		goto repeat;
+	}
+	if (PageUptodate(page))
+		goto out;
+
+	if (f2fs_submit_page_bio(sbi, page, &fio))
+		goto repeat;
+
+	lock_page(page);
+	if (unlikely(page->mapping != mapping)) {
+		f2fs_put_page(page, 1);
+		goto repeat;
+	}
+out:
+	return page;
+}
+
+static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi,
+						block_t blkaddr, int type)
+{
+	switch (type) {
+	case META_NAT:
+		break;
+	case META_SIT:
+		if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
+			return false;
+		break;
+	case META_SSA:
+		if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
+			blkaddr < SM_I(sbi)->ssa_blkaddr))
+			return false;
+		break;
+	case META_CP:
+		if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
+			blkaddr < __start_cp_addr(sbi)))
+			return false;
+		break;
+	case META_POR:
+		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
+			blkaddr < MAIN_BLKADDR(sbi)))
+			return false;
+		break;
+	default:
+		BUG();
+	}
+
+	return true;
+}
+
+/*
+ * Readahead CP/NAT/SIT/SSA pages
+ */
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
+{
+	block_t prev_blk_addr = 0;
+	struct page *page;
+	block_t blkno = start;
+	struct f2fs_io_info fio = {
+		.type = META,
+		.rw = READ_SYNC | REQ_META | REQ_PRIO
+	};
+
+	for (; nrpages-- > 0; blkno++) {
+
+		if (!is_valid_blkaddr(sbi, blkno, type))
+			goto out;
+
+		switch (type) {
+		case META_NAT:
+			if (unlikely(blkno >=
+					NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
+				blkno = 0;
+			/* get nat block addr */
+			fio.blk_addr = current_nat_addr(sbi,
+					blkno * NAT_ENTRY_PER_BLOCK);
+			break;
+		case META_SIT:
+			/* get sit block addr */
+			fio.blk_addr = current_sit_addr(sbi,
+					blkno * SIT_ENTRY_PER_BLOCK);
+			if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
+				goto out;
+			prev_blk_addr = fio.blk_addr;
+			break;
+		case META_SSA:
+		case META_CP:
+		case META_POR:
+			fio.blk_addr = blkno;
+			break;
+		default:
+			BUG();
+		}
+
+		page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
+		if (!page)
+			continue;
+		if (PageUptodate(page)) {
+			f2fs_put_page(page, 1);
+			continue;
+		}
+
+		f2fs_submit_page_mbio(sbi, page, &fio);
+		f2fs_put_page(page, 0);
+	}
+out:
+	f2fs_submit_merged_bio(sbi, META, READ);
+	return blkno - start;
+}
+
+void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	struct page *page;
+	bool readahead = false;
+
+	page = find_get_page(META_MAPPING(sbi), index);
+	if (!page || (page && !PageUptodate(page)))
+		readahead = true;
+	f2fs_put_page(page, 0);
+
+	if (readahead)
+		ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+}
+
+static int f2fs_write_meta_page(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+
+	trace_f2fs_writepage(page, META);
+
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto redirty_out;
+	if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
+		goto redirty_out;
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto redirty_out;
+
+	f2fs_wait_on_page_writeback(page, META);
+	write_meta_page(sbi, page);
+	dec_page_count(sbi, F2FS_DIRTY_META);
+	unlock_page(page);
+
+	if (wbc->for_reclaim)
+		f2fs_submit_merged_bio(sbi, META, WRITE);
+	return 0;
+
+redirty_out:
+	redirty_page_for_writepage(wbc, page);
+	return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int f2fs_write_meta_pages(struct address_space *mapping,
+				struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+	long diff, written;
+
+	trace_f2fs_writepages(mapping->host, wbc, META);
+
+	/* collect a number of dirty meta pages and write together */
+	if (wbc->for_kupdate ||
+		get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
+		goto skip_write;
+
+	/* if mounting is failed, skip writing node pages */
+	mutex_lock(&sbi->cp_mutex);
+	diff = nr_pages_to_write(sbi, META, wbc);
+	written = sync_meta_pages(sbi, META, wbc->nr_to_write);
+	mutex_unlock(&sbi->cp_mutex);
+	wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
+	return 0;
+
+skip_write:
+	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
+	return 0;
+}
+
+long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
+						long nr_to_write)
+{
+	struct address_space *mapping = META_MAPPING(sbi);
+	pgoff_t index = 0, end = LONG_MAX;
+	struct pagevec pvec;
+	long nwritten = 0;
+	struct writeback_control wbc = {
+		.for_reclaim = 0,
+	};
+
+	pagevec_init(&pvec, 0);
+
+	while (index <= end) {
+		int i, nr_pages;
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+				PAGECACHE_TAG_DIRTY,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (unlikely(nr_pages == 0))
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			lock_page(page);
+
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			if (mapping->a_ops->writepage(page, &wbc)) {
+				unlock_page(page);
+				break;
+			}
+			nwritten++;
+			if (unlikely(nwritten >= nr_to_write))
+				break;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (nwritten)
+		f2fs_submit_merged_bio(sbi, type, WRITE);
+
+	return nwritten;
+}
+
+static int f2fs_set_meta_page_dirty(struct page *page)
+{
+	trace_f2fs_set_page_dirty(page, META);
+
+	SetPageUptodate(page);
+	if (!PageDirty(page)) {
+		__set_page_dirty_nobuffers(page);
+		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
+		SetPagePrivate(page);
+		f2fs_trace_pid(page);
+		return 1;
+	}
+	return 0;
+}
+
+const struct address_space_operations f2fs_meta_aops = {
+	.writepage	= f2fs_write_meta_page,
+	.writepages	= f2fs_write_meta_pages,
+	.set_page_dirty	= f2fs_set_meta_page_dirty,
+	.invalidatepage = f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
+};
+
+static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
+{
+	struct inode_management *im = &sbi->im[type];
+	struct ino_entry *e;
+retry:
+	if (radix_tree_preload(GFP_NOFS)) {
+		cond_resched();
+		goto retry;
+	}
+
+	spin_lock(&im->ino_lock);
+
+	e = radix_tree_lookup(&im->ino_root, ino);
+	if (!e) {
+		e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
+		if (!e) {
+			spin_unlock(&im->ino_lock);
+			radix_tree_preload_end();
+			goto retry;
+		}
+		if (radix_tree_insert(&im->ino_root, ino, e)) {
+			spin_unlock(&im->ino_lock);
+			kmem_cache_free(ino_entry_slab, e);
+			radix_tree_preload_end();
+			goto retry;
+		}
+		memset(e, 0, sizeof(struct ino_entry));
+		e->ino = ino;
+
+		list_add_tail(&e->list, &im->ino_list);
+		if (type != ORPHAN_INO)
+			im->ino_num++;
+	}
+	spin_unlock(&im->ino_lock);
+	radix_tree_preload_end();
+}
+
+static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
+{
+	struct inode_management *im = &sbi->im[type];
+	struct ino_entry *e;
+
+	spin_lock(&im->ino_lock);
+	e = radix_tree_lookup(&im->ino_root, ino);
+	if (e) {
+		list_del(&e->list);
+		radix_tree_delete(&im->ino_root, ino);
+		im->ino_num--;
+		spin_unlock(&im->ino_lock);
+		kmem_cache_free(ino_entry_slab, e);
+		return;
+	}
+	spin_unlock(&im->ino_lock);
+}
+
+void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+{
+	/* add new dirty ino entry into list */
+	__add_ino_entry(sbi, ino, type);
+}
+
+void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+{
+	/* remove dirty ino entry from list */
+	__remove_ino_entry(sbi, ino, type);
+}
+
+/* mode should be APPEND_INO or UPDATE_INO */
+bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
+{
+	struct inode_management *im = &sbi->im[mode];
+	struct ino_entry *e;
+
+	spin_lock(&im->ino_lock);
+	e = radix_tree_lookup(&im->ino_root, ino);
+	spin_unlock(&im->ino_lock);
+	return e ? true : false;
+}
+
+void release_dirty_inode(struct f2fs_sb_info *sbi)
+{
+	struct ino_entry *e, *tmp;
+	int i;
+
+	for (i = APPEND_INO; i <= UPDATE_INO; i++) {
+		struct inode_management *im = &sbi->im[i];
+
+		spin_lock(&im->ino_lock);
+		list_for_each_entry_safe(e, tmp, &im->ino_list, list) {
+			list_del(&e->list);
+			radix_tree_delete(&im->ino_root, e->ino);
+			kmem_cache_free(ino_entry_slab, e);
+			im->ino_num--;
+		}
+		spin_unlock(&im->ino_lock);
+	}
+}
+
+int acquire_orphan_inode(struct f2fs_sb_info *sbi)
+{
+	struct inode_management *im = &sbi->im[ORPHAN_INO];
+	int err = 0;
+
+	spin_lock(&im->ino_lock);
+	if (unlikely(im->ino_num >= sbi->max_orphans))
+		err = -ENOSPC;
+	else
+		im->ino_num++;
+	spin_unlock(&im->ino_lock);
+
+	return err;
+}
+
+void release_orphan_inode(struct f2fs_sb_info *sbi)
+{
+	struct inode_management *im = &sbi->im[ORPHAN_INO];
+
+	spin_lock(&im->ino_lock);
+	f2fs_bug_on(sbi, im->ino_num == 0);
+	im->ino_num--;
+	spin_unlock(&im->ino_lock);
+}
+
+void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	/* add new orphan ino entry into list */
+	__add_ino_entry(sbi, ino, ORPHAN_INO);
+}
+
+void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	/* remove orphan entry from orphan list */
+	__remove_ino_entry(sbi, ino, ORPHAN_INO);
+}
+
+static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct inode *inode = f2fs_iget(sbi->sb, ino);
+	f2fs_bug_on(sbi, IS_ERR(inode));
+	clear_nlink(inode);
+
+	/* truncate all the data during iput */
+	iput(inode);
+}
+
+void recover_orphan_inodes(struct f2fs_sb_info *sbi)
+{
+	block_t start_blk, orphan_blocks, i, j;
+
+	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
+		return;
+
+	set_sbi_flag(sbi, SBI_POR_DOING);
+
+	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
+	orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
+
+	ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP);
+
+	for (i = 0; i < orphan_blocks; i++) {
+		struct page *page = get_meta_page(sbi, start_blk + i);
+		struct f2fs_orphan_block *orphan_blk;
+
+		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+		for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
+			nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
+			recover_orphan_inode(sbi, ino);
+		}
+		f2fs_put_page(page, 1);
+	}
+	/* clear Orphan Flag */
+	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
+	clear_sbi_flag(sbi, SBI_POR_DOING);
+	return;
+}
+
+static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+	struct list_head *head;
+	struct f2fs_orphan_block *orphan_blk = NULL;
+	unsigned int nentries = 0;
+	unsigned short index;
+	unsigned short orphan_blocks;
+	struct page *page = NULL;
+	struct ino_entry *orphan = NULL;
+	struct inode_management *im = &sbi->im[ORPHAN_INO];
+
+	orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);
+
+	for (index = 0; index < orphan_blocks; index++)
+		grab_meta_page(sbi, start_blk + index);
+
+	index = 1;
+	spin_lock(&im->ino_lock);
+	head = &im->ino_list;
+
+	/* loop for each orphan inode entry and write them in Jornal block */
+	list_for_each_entry(orphan, head, list) {
+		if (!page) {
+			page = find_get_page(META_MAPPING(sbi), start_blk++);
+			f2fs_bug_on(sbi, !page);
+			orphan_blk =
+				(struct f2fs_orphan_block *)page_address(page);
+			memset(orphan_blk, 0, sizeof(*orphan_blk));
+			f2fs_put_page(page, 0);
+		}
+
+		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+
+		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
+			/*
+			 * an orphan block is full of 1020 entries,
+			 * then we need to flush current orphan blocks
+			 * and bring another one in memory
+			 */
+			orphan_blk->blk_addr = cpu_to_le16(index);
+			orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+			orphan_blk->entry_count = cpu_to_le32(nentries);
+			set_page_dirty(page);
+			f2fs_put_page(page, 1);
+			index++;
+			nentries = 0;
+			page = NULL;
+		}
+	}
+
+	if (page) {
+		orphan_blk->blk_addr = cpu_to_le16(index);
+		orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+		orphan_blk->entry_count = cpu_to_le32(nentries);
+		set_page_dirty(page);
+		f2fs_put_page(page, 1);
+	}
+
+	spin_unlock(&im->ino_lock);
+}
+
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+				block_t cp_addr, unsigned long long *version)
+{
+	struct page *cp_page_1, *cp_page_2 = NULL;
+	unsigned long blk_size = sbi->blocksize;
+	struct f2fs_checkpoint *cp_block;
+	unsigned long long cur_version = 0, pre_version = 0;
+	size_t crc_offset;
+	__u32 crc = 0;
+
+	/* Read the 1st cp block in this CP pack */
+	cp_page_1 = get_meta_page(sbi, cp_addr);
+
+	/* get the version number */
+	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
+	crc_offset = le32_to_cpu(cp_block->checksum_offset);
+	if (crc_offset >= blk_size)
+		goto invalid_cp1;
+
+	crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
+	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+		goto invalid_cp1;
+
+	pre_version = cur_cp_version(cp_block);
+
+	/* Read the 2nd cp block in this CP pack */
+	cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+	cp_page_2 = get_meta_page(sbi, cp_addr);
+
+	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
+	crc_offset = le32_to_cpu(cp_block->checksum_offset);
+	if (crc_offset >= blk_size)
+		goto invalid_cp2;
+
+	crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
+	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+		goto invalid_cp2;
+
+	cur_version = cur_cp_version(cp_block);
+
+	if (cur_version == pre_version) {
+		*version = cur_version;
+		f2fs_put_page(cp_page_2, 1);
+		return cp_page_1;
+	}
+invalid_cp2:
+	f2fs_put_page(cp_page_2, 1);
+invalid_cp1:
+	f2fs_put_page(cp_page_1, 1);
+	return NULL;
+}
+
+int get_valid_checkpoint(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_checkpoint *cp_block;
+	struct f2fs_super_block *fsb = sbi->raw_super;
+	struct page *cp1, *cp2, *cur_page;
+	unsigned long blk_size = sbi->blocksize;
+	unsigned long long cp1_version = 0, cp2_version = 0;
+	unsigned long long cp_start_blk_no;
+	unsigned int cp_blks = 1 + __cp_payload(sbi);
+	block_t cp_blk_no;
+	int i;
+
+	sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL);
+	if (!sbi->ckpt)
+		return -ENOMEM;
+	/*
+	 * Finding out valid cp block involves read both
+	 * sets( cp pack1 and cp pack 2)
+	 */
+	cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
+
+	/* The second checkpoint pack should start at the next segment */
+	cp_start_blk_no += ((unsigned long long)1) <<
+				le32_to_cpu(fsb->log_blocks_per_seg);
+	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
+
+	if (cp1 && cp2) {
+		if (ver_after(cp2_version, cp1_version))
+			cur_page = cp2;
+		else
+			cur_page = cp1;
+	} else if (cp1) {
+		cur_page = cp1;
+	} else if (cp2) {
+		cur_page = cp2;
+	} else {
+		goto fail_no_cp;
+	}
+
+	cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
+	memcpy(sbi->ckpt, cp_block, blk_size);
+
+	if (cp_blks <= 1)
+		goto done;
+
+	cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+	if (cur_page == cp2)
+		cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+
+	for (i = 1; i < cp_blks; i++) {
+		void *sit_bitmap_ptr;
+		unsigned char *ckpt = (unsigned char *)sbi->ckpt;
+
+		cur_page = get_meta_page(sbi, cp_blk_no + i);
+		sit_bitmap_ptr = page_address(cur_page);
+		memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
+		f2fs_put_page(cur_page, 1);
+	}
+done:
+	f2fs_put_page(cp1, 1);
+	f2fs_put_page(cp2, 1);
+	return 0;
+
+fail_no_cp:
+	kfree(sbi->ckpt);
+	return -EINVAL;
+}
+
+static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+	if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
+		return -EEXIST;
+
+	set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
+	F2FS_I(inode)->dirty_dir = new;
+	list_add_tail(&new->list, &sbi->dir_inode_list);
+	stat_inc_dirty_dir(sbi);
+	return 0;
+}
+
+void update_dirty_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct inode_entry *new;
+	int ret = 0;
+
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
+		return;
+
+	if (!S_ISDIR(inode->i_mode)) {
+		inode_inc_dirty_pages(inode);
+		goto out;
+	}
+
+	new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	new->inode = inode;
+	INIT_LIST_HEAD(&new->list);
+
+	spin_lock(&sbi->dir_inode_lock);
+	ret = __add_dirty_inode(inode, new);
+	inode_inc_dirty_pages(inode);
+	spin_unlock(&sbi->dir_inode_lock);
+
+	if (ret)
+		kmem_cache_free(inode_entry_slab, new);
+out:
+	SetPagePrivate(page);
+	f2fs_trace_pid(page);
+}
+
+void add_dirty_dir_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct inode_entry *new =
+			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	int ret = 0;
+
+	new->inode = inode;
+	INIT_LIST_HEAD(&new->list);
+
+	spin_lock(&sbi->dir_inode_lock);
+	ret = __add_dirty_inode(inode, new);
+	spin_unlock(&sbi->dir_inode_lock);
+
+	if (ret)
+		kmem_cache_free(inode_entry_slab, new);
+}
+
+void remove_dirty_dir_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct inode_entry *entry;
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	spin_lock(&sbi->dir_inode_lock);
+	if (get_dirty_pages(inode) ||
+			!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
+		spin_unlock(&sbi->dir_inode_lock);
+		return;
+	}
+
+	entry = F2FS_I(inode)->dirty_dir;
+	list_del(&entry->list);
+	F2FS_I(inode)->dirty_dir = NULL;
+	clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
+	stat_dec_dirty_dir(sbi);
+	spin_unlock(&sbi->dir_inode_lock);
+	kmem_cache_free(inode_entry_slab, entry);
+
+	/* Only from the recovery routine */
+	if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
+		clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+		iput(inode);
+	}
+}
+
+void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+{
+	struct list_head *head;
+	struct inode_entry *entry;
+	struct inode *inode;
+retry:
+	if (unlikely(f2fs_cp_error(sbi)))
+		return;
+
+	spin_lock(&sbi->dir_inode_lock);
+
+	head = &sbi->dir_inode_list;
+	if (list_empty(head)) {
+		spin_unlock(&sbi->dir_inode_lock);
+		return;
+	}
+	entry = list_entry(head->next, struct inode_entry, list);
+	inode = igrab(entry->inode);
+	spin_unlock(&sbi->dir_inode_lock);
+	if (inode) {
+		filemap_fdatawrite(inode->i_mapping);
+		iput(inode);
+	} else {
+		/*
+		 * We should submit bio, since it exists several
+		 * wribacking dentry pages in the freeing inode.
+		 */
+		f2fs_submit_merged_bio(sbi, DATA, WRITE);
+		cond_resched();
+	}
+	goto retry;
+}
+
+/*
+ * Freeze all the FS-operations for checkpoint.
+ */
+static int block_operations(struct f2fs_sb_info *sbi)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.for_reclaim = 0,
+	};
+	struct blk_plug plug;
+	int err = 0;
+
+	blk_start_plug(&plug);
+
+retry_flush_dents:
+	f2fs_lock_all(sbi);
+	/* write all the dirty dentry pages */
+	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
+		f2fs_unlock_all(sbi);
+		sync_dirty_dir_inodes(sbi);
+		if (unlikely(f2fs_cp_error(sbi))) {
+			err = -EIO;
+			goto out;
+		}
+		goto retry_flush_dents;
+	}
+
+	/*
+	 * POR: we should ensure that there are no dirty node pages
+	 * until finishing nat/sit flush.
+	 */
+retry_flush_nodes:
+	down_write(&sbi->node_write);
+
+	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
+		up_write(&sbi->node_write);
+		sync_node_pages(sbi, 0, &wbc);
+		if (unlikely(f2fs_cp_error(sbi))) {
+			f2fs_unlock_all(sbi);
+			err = -EIO;
+			goto out;
+		}
+		goto retry_flush_nodes;
+	}
+out:
+	blk_finish_plug(&plug);
+	return err;
+}
+
+static void unblock_operations(struct f2fs_sb_info *sbi)
+{
+	up_write(&sbi->node_write);
+	f2fs_unlock_all(sbi);
+}
+
+static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+{
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+		if (!get_pages(sbi, F2FS_WRITEBACK))
+			break;
+
+		io_schedule();
+	}
+	finish_wait(&sbi->cp_wait, &wait);
+}
+
+static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
+	nid_t last_nid = nm_i->next_scan_nid;
+	block_t start_blk;
+	struct page *cp_page;
+	unsigned int data_sum_blocks, orphan_blocks;
+	__u32 crc32 = 0;
+	void *kaddr;
+	int i;
+	int cp_payload_blks = __cp_payload(sbi);
+
+	/*
+	 * This avoids to conduct wrong roll-forward operations and uses
+	 * metapages, so should be called prior to sync_meta_pages below.
+	 */
+	discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
+
+	/* Flush all the NAT/SIT pages */
+	while (get_pages(sbi, F2FS_DIRTY_META)) {
+		sync_meta_pages(sbi, META, LONG_MAX);
+		if (unlikely(f2fs_cp_error(sbi)))
+			return;
+	}
+
+	next_free_nid(sbi, &last_nid);
+
+	/*
+	 * modify checkpoint
+	 * version number is already updated
+	 */
+	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
+	ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
+	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
+		ckpt->cur_node_segno[i] =
+			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
+		ckpt->cur_node_blkoff[i] =
+			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
+		ckpt->alloc_type[i + CURSEG_HOT_NODE] =
+				curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
+	}
+	for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
+		ckpt->cur_data_segno[i] =
+			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
+		ckpt->cur_data_blkoff[i] =
+			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
+		ckpt->alloc_type[i + CURSEG_HOT_DATA] =
+				curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
+	}
+
+	ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+	ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+	ckpt->next_free_nid = cpu_to_le32(last_nid);
+
+	/* 2 cp  + n data seg summary + orphan inode blocks */
+	data_sum_blocks = npages_for_summary_flush(sbi, false);
+	if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
+		set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+
+	orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
+	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
+			orphan_blocks);
+
+	if (__remain_node_summaries(cpc->reason))
+		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
+				cp_payload_blks + data_sum_blocks +
+				orphan_blocks + NR_CURSEG_NODE_TYPE);
+	else
+		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
+				cp_payload_blks + data_sum_blocks +
+				orphan_blocks);
+
+	if (cpc->reason == CP_UMOUNT)
+		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+
+	if (cpc->reason == CP_FASTBOOT)
+		set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+
+	if (orphan_num)
+		set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+
+	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
+		set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
+	/* update SIT/NAT bitmap */
+	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
+	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
+
+	crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
+	*((__le32 *)((unsigned char *)ckpt +
+				le32_to_cpu(ckpt->checksum_offset)))
+				= cpu_to_le32(crc32);
+
+	start_blk = __start_cp_addr(sbi);
+
+	/* write out checkpoint buffer at block 0 */
+	cp_page = grab_meta_page(sbi, start_blk++);
+	kaddr = page_address(cp_page);
+	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
+	set_page_dirty(cp_page);
+	f2fs_put_page(cp_page, 1);
+
+	for (i = 1; i < 1 + cp_payload_blks; i++) {
+		cp_page = grab_meta_page(sbi, start_blk++);
+		kaddr = page_address(cp_page);
+		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
+		set_page_dirty(cp_page);
+		f2fs_put_page(cp_page, 1);
+	}
+
+	if (orphan_num) {
+		write_orphan_inodes(sbi, start_blk);
+		start_blk += orphan_blocks;
+	}
+
+	write_data_summaries(sbi, start_blk);
+	start_blk += data_sum_blocks;
+	if (__remain_node_summaries(cpc->reason)) {
+		write_node_summaries(sbi, start_blk);
+		start_blk += NR_CURSEG_NODE_TYPE;
+	}
+
+	/* writeout checkpoint block */
+	cp_page = grab_meta_page(sbi, start_blk);
+	kaddr = page_address(cp_page);
+	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
+	set_page_dirty(cp_page);
+	f2fs_put_page(cp_page, 1);
+
+	/* wait for previous submitted node/meta pages writeback */
+	wait_on_all_pages_writeback(sbi);
+
+	if (unlikely(f2fs_cp_error(sbi)))
+		return;
+
+	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
+	filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
+
+	/* update user_block_counts */
+	sbi->last_valid_block_count = sbi->total_valid_block_count;
+	sbi->alloc_valid_block_count = 0;
+
+	/* Here, we only have one bio having CP pack */
+	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+
+	/* wait for previous submitted meta pages writeback */
+	wait_on_all_pages_writeback(sbi);
+
+	release_dirty_inode(sbi);
+
+	if (unlikely(f2fs_cp_error(sbi)))
+		return;
+
+	clear_prefree_segments(sbi);
+	clear_sbi_flag(sbi, SBI_IS_DIRTY);
+}
+
+/*
+ * We guarantee that this checkpoint procedure will not fail.
+ */
+void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	unsigned long long ckpt_ver;
+
+	mutex_lock(&sbi->cp_mutex);
+
+	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
+		(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC))
+		goto out;
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto out;
+	if (f2fs_readonly(sbi->sb))
+		goto out;
+
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
+
+	if (block_operations(sbi))
+		goto out;
+
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
+
+	f2fs_submit_merged_bio(sbi, DATA, WRITE);
+	f2fs_submit_merged_bio(sbi, NODE, WRITE);
+	f2fs_submit_merged_bio(sbi, META, WRITE);
+
+	/*
+	 * update checkpoint pack index
+	 * Increase the version number so that
+	 * SIT entries and seg summaries are written at correct place
+	 */
+	ckpt_ver = cur_cp_version(ckpt);
+	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
+
+	/* write cached NAT/SIT entries to NAT/SIT area */
+	flush_nat_entries(sbi);
+	flush_sit_entries(sbi, cpc);
+
+	/* unlock all the fs_lock[] in do_checkpoint() */
+	do_checkpoint(sbi, cpc);
+
+	unblock_operations(sbi);
+	stat_inc_cp_count(sbi->stat_info);
+
+	if (cpc->reason == CP_RECOVERY)
+		f2fs_msg(sbi->sb, KERN_NOTICE,
+			"checkpoint: version = %llx", ckpt_ver);
+out:
+	mutex_unlock(&sbi->cp_mutex);
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
+}
+
+void init_ino_entry_info(struct f2fs_sb_info *sbi)
+{
+	int i;
+
+	for (i = 0; i < MAX_INO_ENTRY; i++) {
+		struct inode_management *im = &sbi->im[i];
+
+		INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC);
+		spin_lock_init(&im->ino_lock);
+		INIT_LIST_HEAD(&im->ino_list);
+		im->ino_num = 0;
+	}
+
+	sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
+			NR_CURSEG_TYPE - __cp_payload(sbi)) *
+				F2FS_ORPHANS_PER_BLOCK;
+}
+
+int __init create_checkpoint_caches(void)
+{
+	ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
+			sizeof(struct ino_entry));
+	if (!ino_entry_slab)
+		return -ENOMEM;
+	inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
+			sizeof(struct inode_entry));
+	if (!inode_entry_slab) {
+		kmem_cache_destroy(ino_entry_slab);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void destroy_checkpoint_caches(void)
+{
+	kmem_cache_destroy(ino_entry_slab);
+	kmem_cache_destroy(inode_entry_slab);
+}
diff --git a/kernel/fs/f2fs/data.c b/kernel/fs/f2fs/data.c
new file mode 100644
index 000000000..1e1aae669
--- /dev/null
+++ b/kernel/fs/f2fs/data.c
@@ -0,0 +1,1864 @@
+/*
+ * fs/f2fs/data.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/prefetch.h>
+#include <linux/uio.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "trace.h"
+#include <trace/events/f2fs.h>
+
+static struct kmem_cache *extent_tree_slab;
+static struct kmem_cache *extent_node_slab;
+
+static void f2fs_read_end_io(struct bio *bio, int err)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		if (!err) {
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		unlock_page(page);
+	}
+	bio_put(bio);
+}
+
+static void f2fs_write_end_io(struct bio *bio, int err)
+{
+	struct f2fs_sb_info *sbi = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		if (unlikely(err)) {
+			set_page_dirty(page);
+			set_bit(AS_EIO, &page->mapping->flags);
+			f2fs_stop_checkpoint(sbi);
+		}
+		end_page_writeback(page);
+		dec_page_count(sbi, F2FS_WRITEBACK);
+	}
+
+	if (!get_pages(sbi, F2FS_WRITEBACK) &&
+			!list_empty(&sbi->cp_wait.task_list))
+		wake_up(&sbi->cp_wait);
+
+	bio_put(bio);
+}
+
+/*
+ * Low-level block read/write IO operations.
+ */
+static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
+				int npages, bool is_read)
+{
+	struct bio *bio;
+
+	/* No failure on bio allocation */
+	bio = bio_alloc(GFP_NOIO, npages);
+
+	bio->bi_bdev = sbi->sb->s_bdev;
+	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
+	bio->bi_private = sbi;
+
+	return bio;
+}
+
+static void __submit_merged_bio(struct f2fs_bio_info *io)
+{
+	struct f2fs_io_info *fio = &io->fio;
+
+	if (!io->bio)
+		return;
+
+	if (is_read_io(fio->rw))
+		trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio);
+	else
+		trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
+
+	submit_bio(fio->rw, io->bio);
+	io->bio = NULL;
+}
+
+void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
+				enum page_type type, int rw)
+{
+	enum page_type btype = PAGE_TYPE_OF_BIO(type);
+	struct f2fs_bio_info *io;
+
+	io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
+
+	down_write(&io->io_rwsem);
+
+	/* change META to META_FLUSH in the checkpoint procedure */
+	if (type >= META_FLUSH) {
+		io->fio.type = META_FLUSH;
+		if (test_opt(sbi, NOBARRIER))
+			io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO;
+		else
+			io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
+	}
+	__submit_merged_bio(io);
+	up_write(&io->io_rwsem);
+}
+
+/*
+ * Fill the locked page with data located in the block address.
+ * Return unlocked page.
+ */
+int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
+					struct f2fs_io_info *fio)
+{
+	struct bio *bio;
+
+	trace_f2fs_submit_page_bio(page, fio);
+	f2fs_trace_ios(page, fio, 0);
+
+	/* Allocate a new bio */
+	bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw));
+
+	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+		bio_put(bio);
+		f2fs_put_page(page, 1);
+		return -EFAULT;
+	}
+
+	submit_bio(fio->rw, bio);
+	return 0;
+}
+
+void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
+					struct f2fs_io_info *fio)
+{
+	enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
+	struct f2fs_bio_info *io;
+	bool is_read = is_read_io(fio->rw);
+
+	io = is_read ? &sbi->read_io : &sbi->write_io[btype];
+
+	verify_block_addr(sbi, fio->blk_addr);
+
+	down_write(&io->io_rwsem);
+
+	if (!is_read)
+		inc_page_count(sbi, F2FS_WRITEBACK);
+
+	if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 ||
+						io->fio.rw != fio->rw))
+		__submit_merged_bio(io);
+alloc_new:
+	if (io->bio == NULL) {
+		int bio_blocks = MAX_BIO_BLOCKS(sbi);
+
+		io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read);
+		io->fio = *fio;
+	}
+
+	if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
+							PAGE_CACHE_SIZE) {
+		__submit_merged_bio(io);
+		goto alloc_new;
+	}
+
+	io->last_block_in_bio = fio->blk_addr;
+	f2fs_trace_ios(page, fio, 0);
+
+	up_write(&io->io_rwsem);
+	trace_f2fs_submit_page_mbio(page, fio);
+}
+
+/*
+ * Lock ordering for the change of data block address:
+ * ->data_page
+ *  ->node_page
+ *    update block addresses in the node page
+ */
+void set_data_blkaddr(struct dnode_of_data *dn)
+{
+	struct f2fs_node *rn;
+	__le32 *addr_array;
+	struct page *node_page = dn->node_page;
+	unsigned int ofs_in_node = dn->ofs_in_node;
+
+	f2fs_wait_on_page_writeback(node_page, NODE);
+
+	rn = F2FS_NODE(node_page);
+
+	/* Get physical address of data block */
+	addr_array = blkaddr_in_node(rn);
+	addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
+	set_page_dirty(node_page);
+}
+
+int reserve_new_block(struct dnode_of_data *dn)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+
+	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+		return -EPERM;
+	if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+		return -ENOSPC;
+
+	trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
+
+	dn->data_blkaddr = NEW_ADDR;
+	set_data_blkaddr(dn);
+	mark_inode_dirty(dn->inode);
+	sync_inode_page(dn);
+	return 0;
+}
+
+int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
+{
+	bool need_put = dn->inode_page ? false : true;
+	int err;
+
+	err = get_dnode_of_data(dn, index, ALLOC_NODE);
+	if (err)
+		return err;
+
+	if (dn->data_blkaddr == NULL_ADDR)
+		err = reserve_new_block(dn);
+	if (err || need_put)
+		f2fs_put_dnode(dn);
+	return err;
+}
+
+static void f2fs_map_bh(struct super_block *sb, pgoff_t pgofs,
+			struct extent_info *ei, struct buffer_head *bh_result)
+{
+	unsigned int blkbits = sb->s_blocksize_bits;
+	size_t max_size = bh_result->b_size;
+	size_t mapped_size;
+
+	clear_buffer_new(bh_result);
+	map_bh(bh_result, sb, ei->blk + pgofs - ei->fofs);
+	mapped_size = (ei->fofs + ei->len - pgofs) << blkbits;
+	bh_result->b_size = min(max_size, mapped_size);
+}
+
+static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	pgoff_t start_fofs, end_fofs;
+	block_t start_blkaddr;
+
+	read_lock(&fi->ext_lock);
+	if (fi->ext.len == 0) {
+		read_unlock(&fi->ext_lock);
+		return false;
+	}
+
+	stat_inc_total_hit(inode->i_sb);
+
+	start_fofs = fi->ext.fofs;
+	end_fofs = fi->ext.fofs + fi->ext.len - 1;
+	start_blkaddr = fi->ext.blk;
+
+	if (pgofs >= start_fofs && pgofs <= end_fofs) {
+		*ei = fi->ext;
+		stat_inc_read_hit(inode->i_sb);
+		read_unlock(&fi->ext_lock);
+		return true;
+	}
+	read_unlock(&fi->ext_lock);
+	return false;
+}
+
+static bool update_extent_info(struct inode *inode, pgoff_t fofs,
+								block_t blkaddr)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	pgoff_t start_fofs, end_fofs;
+	block_t start_blkaddr, end_blkaddr;
+	int need_update = true;
+
+	write_lock(&fi->ext_lock);
+
+	start_fofs = fi->ext.fofs;
+	end_fofs = fi->ext.fofs + fi->ext.len - 1;
+	start_blkaddr = fi->ext.blk;
+	end_blkaddr = fi->ext.blk + fi->ext.len - 1;
+
+	/* Drop and initialize the matched extent */
+	if (fi->ext.len == 1 && fofs == start_fofs)
+		fi->ext.len = 0;
+
+	/* Initial extent */
+	if (fi->ext.len == 0) {
+		if (blkaddr != NULL_ADDR) {
+			fi->ext.fofs = fofs;
+			fi->ext.blk = blkaddr;
+			fi->ext.len = 1;
+		}
+		goto end_update;
+	}
+
+	/* Front merge */
+	if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) {
+		fi->ext.fofs--;
+		fi->ext.blk--;
+		fi->ext.len++;
+		goto end_update;
+	}
+
+	/* Back merge */
+	if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) {
+		fi->ext.len++;
+		goto end_update;
+	}
+
+	/* Split the existing extent */
+	if (fi->ext.len > 1 &&
+		fofs >= start_fofs && fofs <= end_fofs) {
+		if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
+			fi->ext.len = fofs - start_fofs;
+		} else {
+			fi->ext.fofs = fofs + 1;
+			fi->ext.blk = start_blkaddr + fofs - start_fofs + 1;
+			fi->ext.len -= fofs - start_fofs + 1;
+		}
+	} else {
+		need_update = false;
+	}
+
+	/* Finally, if the extent is very fragmented, let's drop the cache. */
+	if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
+		fi->ext.len = 0;
+		set_inode_flag(fi, FI_NO_EXTENT);
+		need_update = true;
+	}
+end_update:
+	write_unlock(&fi->ext_lock);
+	return need_update;
+}
+
+static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct rb_node *parent, struct rb_node **p)
+{
+	struct extent_node *en;
+
+	en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
+	if (!en)
+		return NULL;
+
+	en->ei = *ei;
+	INIT_LIST_HEAD(&en->list);
+
+	rb_link_node(&en->rb_node, parent, p);
+	rb_insert_color(&en->rb_node, &et->root);
+	et->count++;
+	atomic_inc(&sbi->total_ext_node);
+	return en;
+}
+
+static void __detach_extent_node(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	rb_erase(&en->rb_node, &et->root);
+	et->count--;
+	atomic_dec(&sbi->total_ext_node);
+
+	if (et->cached_en == en)
+		et->cached_en = NULL;
+}
+
+static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi,
+							nid_t ino)
+{
+	struct extent_tree *et;
+
+	down_read(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+	if (!et) {
+		up_read(&sbi->extent_tree_lock);
+		return NULL;
+	}
+	atomic_inc(&et->refcount);
+	up_read(&sbi->extent_tree_lock);
+
+	return et;
+}
+
+static struct extent_tree *__grab_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	nid_t ino = inode->i_ino;
+
+	down_write(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+	if (!et) {
+		et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
+		f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+		memset(et, 0, sizeof(struct extent_tree));
+		et->ino = ino;
+		et->root = RB_ROOT;
+		et->cached_en = NULL;
+		rwlock_init(&et->lock);
+		atomic_set(&et->refcount, 0);
+		et->count = 0;
+		sbi->total_ext_tree++;
+	}
+	atomic_inc(&et->refcount);
+	up_write(&sbi->extent_tree_lock);
+
+	return et;
+}
+
+static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
+							unsigned int fofs)
+{
+	struct rb_node *node = et->root.rb_node;
+	struct extent_node *en;
+
+	if (et->cached_en) {
+		struct extent_info *cei = &et->cached_en->ei;
+
+		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
+			return et->cached_en;
+	}
+
+	while (node) {
+		en = rb_entry(node, struct extent_node, rb_node);
+
+		if (fofs < en->ei.fofs) {
+			node = node->rb_left;
+		} else if (fofs >= en->ei.fofs + en->ei.len) {
+			node = node->rb_right;
+		} else {
+			et->cached_en = en;
+			return en;
+		}
+	}
+	return NULL;
+}
+
+static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	struct extent_node *prev;
+	struct rb_node *node;
+
+	node = rb_prev(&en->rb_node);
+	if (!node)
+		return NULL;
+
+	prev = rb_entry(node, struct extent_node, rb_node);
+	if (__is_back_mergeable(&en->ei, &prev->ei)) {
+		en->ei.fofs = prev->ei.fofs;
+		en->ei.blk = prev->ei.blk;
+		en->ei.len += prev->ei.len;
+		__detach_extent_node(sbi, et, prev);
+		return prev;
+	}
+	return NULL;
+}
+
+static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	struct extent_node *next;
+	struct rb_node *node;
+
+	node = rb_next(&en->rb_node);
+	if (!node)
+		return NULL;
+
+	next = rb_entry(node, struct extent_node, rb_node);
+	if (__is_front_mergeable(&en->ei, &next->ei)) {
+		en->ei.len += next->ei.len;
+		__detach_extent_node(sbi, et, next);
+		return next;
+	}
+	return NULL;
+}
+
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct extent_node **den)
+{
+	struct rb_node **p = &et->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_node *en;
+
+	while (*p) {
+		parent = *p;
+		en = rb_entry(parent, struct extent_node, rb_node);
+
+		if (ei->fofs < en->ei.fofs) {
+			if (__is_front_mergeable(ei, &en->ei)) {
+				f2fs_bug_on(sbi, !den);
+				en->ei.fofs = ei->fofs;
+				en->ei.blk = ei->blk;
+				en->ei.len += ei->len;
+				*den = __try_back_merge(sbi, et, en);
+				return en;
+			}
+			p = &(*p)->rb_left;
+		} else if (ei->fofs >= en->ei.fofs + en->ei.len) {
+			if (__is_back_mergeable(ei, &en->ei)) {
+				f2fs_bug_on(sbi, !den);
+				en->ei.len += ei->len;
+				*den = __try_front_merge(sbi, et, en);
+				return en;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			f2fs_bug_on(sbi, 1);
+		}
+	}
+
+	return __attach_extent_node(sbi, et, ei, parent, p);
+}
+
+static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
+					struct extent_tree *et, bool free_all)
+{
+	struct rb_node *node, *next;
+	struct extent_node *en;
+	unsigned int count = et->count;
+
+	node = rb_first(&et->root);
+	while (node) {
+		next = rb_next(node);
+		en = rb_entry(node, struct extent_node, rb_node);
+
+		if (free_all) {
+			spin_lock(&sbi->extent_lock);
+			if (!list_empty(&en->list))
+				list_del_init(&en->list);
+			spin_unlock(&sbi->extent_lock);
+		}
+
+		if (free_all || list_empty(&en->list)) {
+			__detach_extent_node(sbi, et, en);
+			kmem_cache_free(extent_node_slab, en);
+		}
+		node = next;
+	}
+
+	return count - et->count;
+}
+
+static void f2fs_init_extent_tree(struct inode *inode,
+						struct f2fs_extent *i_ext)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en;
+	struct extent_info ei;
+
+	if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
+		return;
+
+	et = __grab_extent_tree(inode);
+
+	write_lock(&et->lock);
+	if (et->count)
+		goto out;
+
+	set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
+		le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
+
+	en = __insert_extent_tree(sbi, et, &ei, NULL);
+	if (en) {
+		et->cached_en = en;
+
+		spin_lock(&sbi->extent_lock);
+		list_add_tail(&en->list, &sbi->extent_list);
+		spin_unlock(&sbi->extent_lock);
+	}
+out:
+	write_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+}
+
+static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en;
+
+	trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+
+	et = __find_extent_tree(sbi, inode->i_ino);
+	if (!et)
+		return false;
+
+	read_lock(&et->lock);
+	en = __lookup_extent_tree(et, pgofs);
+	if (en) {
+		*ei = en->ei;
+		spin_lock(&sbi->extent_lock);
+		if (!list_empty(&en->list))
+			list_move_tail(&en->list, &sbi->extent_list);
+		spin_unlock(&sbi->extent_lock);
+		stat_inc_read_hit(sbi->sb);
+	}
+	stat_inc_total_hit(sbi->sb);
+	read_unlock(&et->lock);
+
+	trace_f2fs_lookup_extent_tree_end(inode, pgofs, en);
+
+	atomic_dec(&et->refcount);
+	return en ? true : false;
+}
+
+static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
+							block_t blkaddr)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
+	struct extent_node *den = NULL;
+	struct extent_info ei, dei;
+	unsigned int endofs;
+
+	trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
+
+	et = __grab_extent_tree(inode);
+
+	write_lock(&et->lock);
+
+	/* 1. lookup and remove existing extent info in cache */
+	en = __lookup_extent_tree(et, fofs);
+	if (!en)
+		goto update_extent;
+
+	dei = en->ei;
+	__detach_extent_node(sbi, et, en);
+
+	/* 2. if extent can be split more, split and insert the left part */
+	if (dei.len > 1) {
+		/*  insert left part of split extent into cache */
+		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+			set_extent_info(&ei, dei.fofs, dei.blk,
+							fofs - dei.fofs);
+			en1 = __insert_extent_tree(sbi, et, &ei, NULL);
+		}
+
+		/* insert right part of split extent into cache */
+		endofs = dei.fofs + dei.len - 1;
+		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
+			set_extent_info(&ei, fofs + 1,
+				fofs - dei.fofs + dei.blk, endofs - fofs);
+			en2 = __insert_extent_tree(sbi, et, &ei, NULL);
+		}
+	}
+
+update_extent:
+	/* 3. update extent in extent cache */
+	if (blkaddr) {
+		set_extent_info(&ei, fofs, blkaddr, 1);
+		en3 = __insert_extent_tree(sbi, et, &ei, &den);
+	}
+
+	/* 4. update in global extent list */
+	spin_lock(&sbi->extent_lock);
+	if (en && !list_empty(&en->list))
+		list_del(&en->list);
+	/*
+	 * en1 and en2 split from en, they will become more and more smaller
+	 * fragments after splitting several times. So if the length is smaller
+	 * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree.
+	 */
+	if (en1)
+		list_add_tail(&en1->list, &sbi->extent_list);
+	if (en2)
+		list_add_tail(&en2->list, &sbi->extent_list);
+	if (en3) {
+		if (list_empty(&en3->list))
+			list_add_tail(&en3->list, &sbi->extent_list);
+		else
+			list_move_tail(&en3->list, &sbi->extent_list);
+	}
+	if (den && !list_empty(&den->list))
+		list_del(&den->list);
+	spin_unlock(&sbi->extent_lock);
+
+	/* 5. release extent node */
+	if (en)
+		kmem_cache_free(extent_node_slab, en);
+	if (den)
+		kmem_cache_free(extent_node_slab, den);
+
+	write_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+}
+
+void f2fs_preserve_extent_tree(struct inode *inode)
+{
+	struct extent_tree *et;
+	struct extent_info *ext = &F2FS_I(inode)->ext;
+	bool sync = false;
+
+	if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		return;
+
+	et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino);
+	if (!et) {
+		if (ext->len) {
+			ext->len = 0;
+			update_inode_page(inode);
+		}
+		return;
+	}
+
+	read_lock(&et->lock);
+	if (et->count) {
+		struct extent_node *en;
+
+		if (et->cached_en) {
+			en = et->cached_en;
+		} else {
+			struct rb_node *node = rb_first(&et->root);
+
+			if (!node)
+				node = rb_last(&et->root);
+			en = rb_entry(node, struct extent_node, rb_node);
+		}
+
+		if (__is_extent_same(ext, &en->ei))
+			goto out;
+
+		*ext = en->ei;
+		sync = true;
+	} else if (ext->len) {
+		ext->len = 0;
+		sync = true;
+	}
+out:
+	read_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+
+	if (sync)
+		update_inode_page(inode);
+}
+
+void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+	struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
+	struct extent_node *en, *tmp;
+	unsigned long ino = F2FS_ROOT_INO(sbi);
+	struct radix_tree_iter iter;
+	void **slot;
+	unsigned int found;
+	unsigned int node_cnt = 0, tree_cnt = 0;
+
+	if (!test_opt(sbi, EXTENT_CACHE))
+		return;
+
+	if (available_free_memory(sbi, EXTENT_CACHE))
+		return;
+
+	spin_lock(&sbi->extent_lock);
+	list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
+		if (!nr_shrink--)
+			break;
+		list_del_init(&en->list);
+	}
+	spin_unlock(&sbi->extent_lock);
+
+	down_read(&sbi->extent_tree_lock);
+	while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
+				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
+		unsigned i;
+
+		ino = treevec[found - 1]->ino + 1;
+		for (i = 0; i < found; i++) {
+			struct extent_tree *et = treevec[i];
+
+			atomic_inc(&et->refcount);
+			write_lock(&et->lock);
+			node_cnt += __free_extent_tree(sbi, et, false);
+			write_unlock(&et->lock);
+			atomic_dec(&et->refcount);
+		}
+	}
+	up_read(&sbi->extent_tree_lock);
+
+	down_write(&sbi->extent_tree_lock);
+	radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter,
+							F2FS_ROOT_INO(sbi)) {
+		struct extent_tree *et = (struct extent_tree *)*slot;
+
+		if (!atomic_read(&et->refcount) && !et->count) {
+			radix_tree_delete(&sbi->extent_tree_root, et->ino);
+			kmem_cache_free(extent_tree_slab, et);
+			sbi->total_ext_tree--;
+			tree_cnt++;
+		}
+	}
+	up_write(&sbi->extent_tree_lock);
+
+	trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+}
+
+void f2fs_destroy_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	unsigned int node_cnt = 0;
+
+	if (!test_opt(sbi, EXTENT_CACHE))
+		return;
+
+	et = __find_extent_tree(sbi, inode->i_ino);
+	if (!et)
+		goto out;
+
+	/* free all extent info belong to this extent tree */
+	write_lock(&et->lock);
+	node_cnt = __free_extent_tree(sbi, et, true);
+	write_unlock(&et->lock);
+
+	atomic_dec(&et->refcount);
+
+	/* try to find and delete extent tree entry in radix tree */
+	down_write(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino);
+	if (!et) {
+		up_write(&sbi->extent_tree_lock);
+		goto out;
+	}
+	f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
+	radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+	kmem_cache_free(extent_tree_slab, et);
+	sbi->total_ext_tree--;
+	up_write(&sbi->extent_tree_lock);
+out:
+	trace_f2fs_destroy_extent_tree(inode, node_cnt);
+	return;
+}
+
+void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext)
+{
+	if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		f2fs_init_extent_tree(inode, i_ext);
+
+	write_lock(&F2FS_I(inode)->ext_lock);
+	get_extent_info(&F2FS_I(inode)->ext, *i_ext);
+	write_unlock(&F2FS_I(inode)->ext_lock);
+}
+
+static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+		return false;
+
+	if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		return f2fs_lookup_extent_tree(inode, pgofs, ei);
+
+	return lookup_extent_info(inode, pgofs, ei);
+}
+
+void f2fs_update_extent_cache(struct dnode_of_data *dn)
+{
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+	pgoff_t fofs;
+
+	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
+
+	if (is_inode_flag_set(fi, FI_NO_EXTENT))
+		return;
+
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+
+	if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE))
+		return f2fs_update_extent_tree(dn->inode, fofs,
+							dn->data_blkaddr);
+
+	if (update_extent_info(dn->inode, fofs, dn->data_blkaddr))
+		sync_inode_page(dn);
+}
+
+struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct dnode_of_data dn;
+	struct page *page;
+	struct extent_info ei;
+	int err;
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = sync ? READ_SYNC : READA,
+	};
+
+	/*
+	 * If sync is false, it needs to check its block allocation.
+	 * This is need and triggered by two flows:
+	 *   gc and truncate_partial_data_page.
+	 */
+	if (!sync)
+		goto search;
+
+	page = find_get_page(mapping, index);
+	if (page && PageUptodate(page))
+		return page;
+	f2fs_put_page(page, 0);
+search:
+	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+		dn.data_blkaddr = ei.blk + index - ei.fofs;
+		goto got_it;
+	}
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+	if (err)
+		return ERR_PTR(err);
+	f2fs_put_dnode(&dn);
+
+	if (dn.data_blkaddr == NULL_ADDR)
+		return ERR_PTR(-ENOENT);
+
+	/* By fallocate(), there is no cached page, but with NEW_ADDR */
+	if (unlikely(dn.data_blkaddr == NEW_ADDR))
+		return ERR_PTR(-EINVAL);
+
+got_it:
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	if (PageUptodate(page)) {
+		unlock_page(page);
+		return page;
+	}
+
+	fio.blk_addr = dn.data_blkaddr;
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
+	if (err)
+		return ERR_PTR(err);
+
+	if (sync) {
+		wait_on_page_locked(page);
+		if (unlikely(!PageUptodate(page))) {
+			f2fs_put_page(page, 0);
+			return ERR_PTR(-EIO);
+		}
+	}
+	return page;
+}
+
+/*
+ * If it tries to access a hole, return an error.
+ * Because, the callers, functions in dir.c and GC, should be able to know
+ * whether this page exists or not.
+ */
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct dnode_of_data dn;
+	struct page *page;
+	struct extent_info ei;
+	int err;
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = READ_SYNC,
+	};
+repeat:
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+		dn.data_blkaddr = ei.blk + index - ei.fofs;
+		goto got_it;
+	}
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+	if (err) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(err);
+	}
+	f2fs_put_dnode(&dn);
+
+	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-ENOENT);
+	}
+
+got_it:
+	if (PageUptodate(page))
+		return page;
+
+	/*
+	 * A new dentry page is allocated but not able to be written, since its
+	 * new inode page couldn't be allocated due to -ENOSPC.
+	 * In such the case, its blkaddr can be remained as NEW_ADDR.
+	 * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
+	 */
+	if (dn.data_blkaddr == NEW_ADDR) {
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+		return page;
+	}
+
+	fio.blk_addr = dn.data_blkaddr;
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
+	if (err)
+		return ERR_PTR(err);
+
+	lock_page(page);
+	if (unlikely(!PageUptodate(page))) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
+	if (unlikely(page->mapping != mapping)) {
+		f2fs_put_page(page, 1);
+		goto repeat;
+	}
+	return page;
+}
+
+/*
+ * Caller ensures that this data page is never allocated.
+ * A new zero-filled data page is allocated in the page cache.
+ *
+ * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ * Note that, ipage is set only by make_empty_dir.
+ */
+struct page *get_new_data_page(struct inode *inode,
+		struct page *ipage, pgoff_t index, bool new_i_size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	struct dnode_of_data dn;
+	int err;
+
+	set_new_dnode(&dn, inode, ipage, NULL, 0);
+	err = f2fs_reserve_block(&dn, index);
+	if (err)
+		return ERR_PTR(err);
+repeat:
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		err = -ENOMEM;
+		goto put_err;
+	}
+
+	if (PageUptodate(page))
+		return page;
+
+	if (dn.data_blkaddr == NEW_ADDR) {
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+	} else {
+		struct f2fs_io_info fio = {
+			.type = DATA,
+			.rw = READ_SYNC,
+			.blk_addr = dn.data_blkaddr,
+		};
+		err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
+		if (err)
+			goto put_err;
+
+		lock_page(page);
+		if (unlikely(!PageUptodate(page))) {
+			f2fs_put_page(page, 1);
+			err = -EIO;
+			goto put_err;
+		}
+		if (unlikely(page->mapping != mapping)) {
+			f2fs_put_page(page, 1);
+			goto repeat;
+		}
+	}
+
+	if (new_i_size &&
+		i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
+		i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+		/* Only the directory inode sets new_i_size */
+		set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
+	}
+	return page;
+
+put_err:
+	f2fs_put_dnode(&dn);
+	return ERR_PTR(err);
+}
+
+static int __allocate_data_block(struct dnode_of_data *dn)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+	struct f2fs_summary sum;
+	struct node_info ni;
+	int seg = CURSEG_WARM_DATA;
+	pgoff_t fofs;
+
+	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+		return -EPERM;
+
+	dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+	if (dn->data_blkaddr == NEW_ADDR)
+		goto alloc;
+
+	if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+		return -ENOSPC;
+
+alloc:
+	get_node_info(sbi, dn->nid, &ni);
+	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
+
+	if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
+		seg = CURSEG_DIRECT_IO;
+
+	allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
+								&sum, seg);
+
+	/* direct IO doesn't use extent cache to maximize the performance */
+	set_data_blkaddr(dn);
+
+	/* update i_size */
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
+		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
+
+	return 0;
+}
+
+static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+							size_t count)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct dnode_of_data dn;
+	u64 start = F2FS_BYTES_TO_BLK(offset);
+	u64 len = F2FS_BYTES_TO_BLK(count);
+	bool allocated;
+	u64 end_offset;
+
+	while (len) {
+		f2fs_balance_fs(sbi);
+		f2fs_lock_op(sbi);
+
+		/* When reading holes, we need its node page */
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+			goto out;
+
+		allocated = false;
+		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+
+		while (dn.ofs_in_node < end_offset && len) {
+			block_t blkaddr;
+
+			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+			if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
+				if (__allocate_data_block(&dn))
+					goto sync_out;
+				allocated = true;
+			}
+			len--;
+			start++;
+			dn.ofs_in_node++;
+		}
+
+		if (allocated)
+			sync_inode_page(&dn);
+
+		f2fs_put_dnode(&dn);
+		f2fs_unlock_op(sbi);
+	}
+	return;
+
+sync_out:
+	if (allocated)
+		sync_inode_page(&dn);
+	f2fs_put_dnode(&dn);
+out:
+	f2fs_unlock_op(sbi);
+	return;
+}
+
+/*
+ * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
+ * If original data blocks are allocated, then give them to blockdev.
+ * Otherwise,
+ *     a. preallocate requested block addresses
+ *     b. do not use extent cache for better performance
+ *     c. give the block addresses to blockdev
+ */
+static int __get_data_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create, bool fiemap)
+{
+	unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+	unsigned maxblocks = bh_result->b_size >> blkbits;
+	struct dnode_of_data dn;
+	int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
+	pgoff_t pgofs, end_offset;
+	int err = 0, ofs = 1;
+	struct extent_info ei;
+	bool allocated = false;
+
+	/* Get the page offset from the block offset(iblock) */
+	pgofs =	(pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
+
+	if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+		f2fs_map_bh(inode->i_sb, pgofs, &ei, bh_result);
+		goto out;
+	}
+
+	if (create)
+		f2fs_lock_op(F2FS_I_SB(inode));
+
+	/* When reading holes, we need its node page */
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, pgofs, mode);
+	if (err) {
+		if (err == -ENOENT)
+			err = 0;
+		goto unlock_out;
+	}
+	if (dn.data_blkaddr == NEW_ADDR && !fiemap)
+		goto put_out;
+
+	if (dn.data_blkaddr != NULL_ADDR) {
+		clear_buffer_new(bh_result);
+		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+	} else if (create) {
+		err = __allocate_data_block(&dn);
+		if (err)
+			goto put_out;
+		allocated = true;
+		set_buffer_new(bh_result);
+		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+	} else {
+		goto put_out;
+	}
+
+	end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+	bh_result->b_size = (((size_t)1) << blkbits);
+	dn.ofs_in_node++;
+	pgofs++;
+
+get_next:
+	if (dn.ofs_in_node >= end_offset) {
+		if (allocated)
+			sync_inode_page(&dn);
+		allocated = false;
+		f2fs_put_dnode(&dn);
+
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		err = get_dnode_of_data(&dn, pgofs, mode);
+		if (err) {
+			if (err == -ENOENT)
+				err = 0;
+			goto unlock_out;
+		}
+		if (dn.data_blkaddr == NEW_ADDR && !fiemap)
+			goto put_out;
+
+		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+	}
+
+	if (maxblocks > (bh_result->b_size >> blkbits)) {
+		block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+		if (blkaddr == NULL_ADDR && create) {
+			err = __allocate_data_block(&dn);
+			if (err)
+				goto sync_out;
+			allocated = true;
+			set_buffer_new(bh_result);
+			blkaddr = dn.data_blkaddr;
+		}
+		/* Give more consecutive addresses for the readahead */
+		if (blkaddr == (bh_result->b_blocknr + ofs)) {
+			ofs++;
+			dn.ofs_in_node++;
+			pgofs++;
+			bh_result->b_size += (((size_t)1) << blkbits);
+			goto get_next;
+		}
+	}
+sync_out:
+	if (allocated)
+		sync_inode_page(&dn);
+put_out:
+	f2fs_put_dnode(&dn);
+unlock_out:
+	if (create)
+		f2fs_unlock_op(F2FS_I_SB(inode));
+out:
+	trace_f2fs_get_data_block(inode, iblock, bh_result, err);
+	return err;
+}
+
+static int get_data_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	return __get_data_block(inode, iblock, bh_result, create, false);
+}
+
+static int get_data_block_fiemap(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	return __get_data_block(inode, iblock, bh_result, create, true);
+}
+
+int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo,
+				start, len, get_data_block_fiemap);
+}
+
+static int f2fs_read_data_page(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int ret = -EAGAIN;
+
+	trace_f2fs_readpage(page, DATA);
+
+	/* If the file has inline data, try to read it directly */
+	if (f2fs_has_inline_data(inode))
+		ret = f2fs_read_inline_data(inode, page);
+	if (ret == -EAGAIN)
+		ret = mpage_readpage(page, get_data_block);
+
+	return ret;
+}
+
+static int f2fs_read_data_pages(struct file *file,
+			struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	/* If the file has inline data, skip readpages */
+	if (f2fs_has_inline_data(inode))
+		return 0;
+
+	return mpage_readpages(mapping, pages, nr_pages, get_data_block);
+}
+
+int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
+{
+	struct inode *inode = page->mapping->host;
+	struct dnode_of_data dn;
+	int err = 0;
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+	if (err)
+		return err;
+
+	fio->blk_addr = dn.data_blkaddr;
+
+	/* This page is already truncated */
+	if (fio->blk_addr == NULL_ADDR) {
+		ClearPageUptodate(page);
+		goto out_writepage;
+	}
+
+	set_page_writeback(page);
+
+	/*
+	 * If current allocation needs SSR,
+	 * it had better in-place writes for updated data.
+	 */
+	if (unlikely(fio->blk_addr != NEW_ADDR &&
+			!is_cold_data(page) &&
+			need_inplace_update(inode))) {
+		rewrite_data_page(page, fio);
+		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
+		trace_f2fs_do_write_data_page(page, IPU);
+	} else {
+		write_data_page(page, &dn, fio);
+		set_data_blkaddr(&dn);
+		f2fs_update_extent_cache(&dn);
+		trace_f2fs_do_write_data_page(page, OPU);
+		set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+		if (page->index == 0)
+			set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+	}
+out_writepage:
+	f2fs_put_dnode(&dn);
+	return err;
+}
+
+static int f2fs_write_data_page(struct page *page,
+					struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	loff_t i_size = i_size_read(inode);
+	const pgoff_t end_index = ((unsigned long long) i_size)
+							>> PAGE_CACHE_SHIFT;
+	unsigned offset = 0;
+	bool need_balance_fs = false;
+	int err = 0;
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+	};
+
+	trace_f2fs_writepage(page, DATA);
+
+	if (page->index < end_index)
+		goto write;
+
+	/*
+	 * If the offset is out-of-range of file size,
+	 * this page does not have to be written to disk.
+	 */
+	offset = i_size & (PAGE_CACHE_SIZE - 1);
+	if ((page->index >= end_index + 1) || !offset)
+		goto out;
+
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+write:
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto redirty_out;
+	if (f2fs_is_drop_cache(inode))
+		goto out;
+	if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
+			available_free_memory(sbi, BASE_CHECK))
+		goto redirty_out;
+
+	/* Dentry blocks are controlled by checkpoint */
+	if (S_ISDIR(inode->i_mode)) {
+		if (unlikely(f2fs_cp_error(sbi)))
+			goto redirty_out;
+		err = do_write_data_page(page, &fio);
+		goto done;
+	}
+
+	/* we should bypass data pages to proceed the kworkder jobs */
+	if (unlikely(f2fs_cp_error(sbi))) {
+		SetPageError(page);
+		goto out;
+	}
+
+	if (!wbc->for_reclaim)
+		need_balance_fs = true;
+	else if (has_not_enough_free_secs(sbi, 0))
+		goto redirty_out;
+
+	err = -EAGAIN;
+	f2fs_lock_op(sbi);
+	if (f2fs_has_inline_data(inode))
+		err = f2fs_write_inline_data(inode, page);
+	if (err == -EAGAIN)
+		err = do_write_data_page(page, &fio);
+	f2fs_unlock_op(sbi);
+done:
+	if (err && err != -ENOENT)
+		goto redirty_out;
+
+	clear_cold_data(page);
+out:
+	inode_dec_dirty_pages(inode);
+	if (err)
+		ClearPageUptodate(page);
+	unlock_page(page);
+	if (need_balance_fs)
+		f2fs_balance_fs(sbi);
+	if (wbc->for_reclaim)
+		f2fs_submit_merged_bio(sbi, DATA, WRITE);
+	return 0;
+
+redirty_out:
+	redirty_page_for_writepage(wbc, page);
+	return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
+			void *data)
+{
+	struct address_space *mapping = data;
+	int ret = mapping->a_ops->writepage(page, wbc);
+	mapping_set_error(mapping, ret);
+	return ret;
+}
+
+static int f2fs_write_data_pages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	bool locked = false;
+	int ret;
+	long diff;
+
+	trace_f2fs_writepages(mapping->host, wbc, DATA);
+
+	/* deal with chardevs and other special file */
+	if (!mapping->a_ops->writepage)
+		return 0;
+
+	if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+			get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
+			available_free_memory(sbi, DIRTY_DENTS))
+		goto skip_write;
+
+	/* during POR, we don't need to trigger writepage at all. */
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto skip_write;
+
+	diff = nr_pages_to_write(sbi, DATA, wbc);
+
+	if (!S_ISDIR(inode->i_mode)) {
+		mutex_lock(&sbi->writepages);
+		locked = true;
+	}
+	ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
+	if (locked)
+		mutex_unlock(&sbi->writepages);
+
+	f2fs_submit_merged_bio(sbi, DATA, WRITE);
+
+	remove_dirty_dir_inode(inode);
+
+	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
+	return ret;
+
+skip_write:
+	wbc->pages_skipped += get_dirty_pages(inode);
+	return 0;
+}
+
+static void f2fs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		truncate_blocks(inode, inode->i_size, true);
+	}
+}
+
+static int f2fs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct page *page, *ipage;
+	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
+	struct dnode_of_data dn;
+	int err = 0;
+
+	trace_f2fs_write_begin(inode, pos, len, flags);
+
+	f2fs_balance_fs(sbi);
+
+	/*
+	 * We should check this at this moment to avoid deadlock on inode page
+	 * and #0 page. The locking rule for inline_data conversion should be:
+	 * lock_page(page #0) -> lock_page(inode_page)
+	 */
+	if (index != 0) {
+		err = f2fs_convert_inline_inode(inode);
+		if (err)
+			goto fail;
+	}
+repeat:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	*pagep = page;
+
+	f2fs_lock_op(sbi);
+
+	/* check inline_data */
+	ipage = get_node_page(sbi, inode->i_ino);
+	if (IS_ERR(ipage)) {
+		err = PTR_ERR(ipage);
+		goto unlock_fail;
+	}
+
+	set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+	if (f2fs_has_inline_data(inode)) {
+		if (pos + len <= MAX_INLINE_DATA) {
+			read_inline_data(page, ipage);
+			set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+			sync_inode_page(&dn);
+			goto put_next;
+		}
+		err = f2fs_convert_inline_page(&dn, page);
+		if (err)
+			goto put_fail;
+	}
+	err = f2fs_reserve_block(&dn, index);
+	if (err)
+		goto put_fail;
+put_next:
+	f2fs_put_dnode(&dn);
+	f2fs_unlock_op(sbi);
+
+	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+		return 0;
+
+	f2fs_wait_on_page_writeback(page, DATA);
+
+	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+		unsigned end = start + len;
+
+		/* Reading beyond i_size is simple: memset to zero */
+		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+		goto out;
+	}
+
+	if (dn.data_blkaddr == NEW_ADDR) {
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	} else {
+		struct f2fs_io_info fio = {
+			.type = DATA,
+			.rw = READ_SYNC,
+			.blk_addr = dn.data_blkaddr,
+		};
+		err = f2fs_submit_page_bio(sbi, page, &fio);
+		if (err)
+			goto fail;
+
+		lock_page(page);
+		if (unlikely(!PageUptodate(page))) {
+			f2fs_put_page(page, 1);
+			err = -EIO;
+			goto fail;
+		}
+		if (unlikely(page->mapping != mapping)) {
+			f2fs_put_page(page, 1);
+			goto repeat;
+		}
+	}
+out:
+	SetPageUptodate(page);
+	clear_cold_data(page);
+	return 0;
+
+put_fail:
+	f2fs_put_dnode(&dn);
+unlock_fail:
+	f2fs_unlock_op(sbi);
+	f2fs_put_page(page, 1);
+fail:
+	f2fs_write_failed(mapping, pos + len);
+	return err;
+}
+
+static int f2fs_write_end(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+
+	trace_f2fs_write_end(inode, pos, len, copied);
+
+	set_page_dirty(page);
+
+	if (pos + copied > i_size_read(inode)) {
+		i_size_write(inode, pos + copied);
+		mark_inode_dirty(inode);
+		update_inode_page(inode);
+	}
+
+	f2fs_put_page(page, 1);
+	return copied;
+}
+
+static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
+			   loff_t offset)
+{
+	unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
+
+	if (iov_iter_rw(iter) == READ)
+		return 0;
+
+	if (offset & blocksize_mask)
+		return -EINVAL;
+
+	if (iov_iter_alignment(iter) & blocksize_mask)
+		return -EINVAL;
+
+	return 0;
+}
+
+static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			      loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	size_t count = iov_iter_count(iter);
+	int err;
+
+	/* we don't need to use inline_data strictly */
+	if (f2fs_has_inline_data(inode)) {
+		err = f2fs_convert_inline_inode(inode);
+		if (err)
+			return err;
+	}
+
+	if (check_direct_IO(inode, iter, offset))
+		return 0;
+
+	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+
+	if (iov_iter_rw(iter) == WRITE)
+		__allocate_data_blocks(inode, offset, count);
+
+	err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block);
+	if (err < 0 && iov_iter_rw(iter) == WRITE)
+		f2fs_write_failed(mapping, offset + count);
+
+	trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err);
+
+	return err;
+}
+
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+							unsigned int length)
+{
+	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+	if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
+		(offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
+		return;
+
+	if (PageDirty(page)) {
+		if (inode->i_ino == F2FS_META_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_META);
+		else if (inode->i_ino == F2FS_NODE_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_NODES);
+		else
+			inode_dec_dirty_pages(inode);
+	}
+	ClearPagePrivate(page);
+}
+
+int f2fs_release_page(struct page *page, gfp_t wait)
+{
+	/* If this is dirty page, keep PagePrivate */
+	if (PageDirty(page))
+		return 0;
+
+	ClearPagePrivate(page);
+	return 1;
+}
+
+static int f2fs_set_data_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+
+	trace_f2fs_set_page_dirty(page, DATA);
+
+	SetPageUptodate(page);
+
+	if (f2fs_is_atomic_file(inode)) {
+		register_inmem_page(inode, page);
+		return 1;
+	}
+
+	mark_inode_dirty(inode);
+
+	if (!PageDirty(page)) {
+		__set_page_dirty_nobuffers(page);
+		update_dirty_page(inode, page);
+		return 1;
+	}
+	return 0;
+}
+
+static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+
+	/* we don't need to use inline_data strictly */
+	if (f2fs_has_inline_data(inode)) {
+		int err = f2fs_convert_inline_inode(inode);
+		if (err)
+			return err;
+	}
+	return generic_block_bmap(mapping, block, get_data_block);
+}
+
+void init_extent_cache_info(struct f2fs_sb_info *sbi)
+{
+	INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
+	init_rwsem(&sbi->extent_tree_lock);
+	INIT_LIST_HEAD(&sbi->extent_list);
+	spin_lock_init(&sbi->extent_lock);
+	sbi->total_ext_tree = 0;
+	atomic_set(&sbi->total_ext_node, 0);
+}
+
+int __init create_extent_cache(void)
+{
+	extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
+			sizeof(struct extent_tree));
+	if (!extent_tree_slab)
+		return -ENOMEM;
+	extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
+			sizeof(struct extent_node));
+	if (!extent_node_slab) {
+		kmem_cache_destroy(extent_tree_slab);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void destroy_extent_cache(void)
+{
+	kmem_cache_destroy(extent_node_slab);
+	kmem_cache_destroy(extent_tree_slab);
+}
+
+const struct address_space_operations f2fs_dblock_aops = {
+	.readpage	= f2fs_read_data_page,
+	.readpages	= f2fs_read_data_pages,
+	.writepage	= f2fs_write_data_page,
+	.writepages	= f2fs_write_data_pages,
+	.write_begin	= f2fs_write_begin,
+	.write_end	= f2fs_write_end,
+	.set_page_dirty	= f2fs_set_data_page_dirty,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
+	.direct_IO	= f2fs_direct_IO,
+	.bmap		= f2fs_bmap,
+};
diff --git a/kernel/fs/f2fs/debug.c b/kernel/fs/f2fs/debug.c
new file mode 100644
index 000000000..f5388f372
--- /dev/null
+++ b/kernel/fs/f2fs/debug.c
@@ -0,0 +1,413 @@
+/*
+ * f2fs debugging statistics
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ * Copyright (c) 2012 Linux Foundation
+ * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+#include <linux/f2fs_fs.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+
+static LIST_HEAD(f2fs_stat_list);
+static struct dentry *f2fs_debugfs_root;
+static DEFINE_MUTEX(f2fs_stat_mutex);
+
+static void update_general_status(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *si = F2FS_STAT(sbi);
+	int i;
+
+	/* validation check of the segment numbers */
+	si->hit_ext = sbi->read_hit_ext;
+	si->total_ext = sbi->total_hit_ext;
+	si->ext_tree = sbi->total_ext_tree;
+	si->ext_node = atomic_read(&sbi->total_ext_node);
+	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
+	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
+	si->ndirty_dirs = sbi->n_dirty_dirs;
+	si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+	si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
+	si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
+	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
+	si->rsvd_segs = reserved_segments(sbi);
+	si->overp_segs = overprovision_segments(sbi);
+	si->valid_count = valid_user_blocks(sbi);
+	si->valid_node_count = valid_node_count(sbi);
+	si->valid_inode_count = valid_inode_count(sbi);
+	si->inline_inode = atomic_read(&sbi->inline_inode);
+	si->inline_dir = atomic_read(&sbi->inline_dir);
+	si->utilization = utilization(sbi);
+
+	si->free_segs = free_segments(sbi);
+	si->free_secs = free_sections(sbi);
+	si->prefree_count = prefree_segments(sbi);
+	si->dirty_count = dirty_segments(sbi);
+	si->node_pages = NODE_MAPPING(sbi)->nrpages;
+	si->meta_pages = META_MAPPING(sbi)->nrpages;
+	si->nats = NM_I(sbi)->nat_cnt;
+	si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
+	si->sits = MAIN_SEGS(sbi);
+	si->dirty_sits = SIT_I(sbi)->dirty_sentries;
+	si->fnids = NM_I(sbi)->fcnt;
+	si->bg_gc = sbi->bg_gc;
+	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
+		* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+		/ 2;
+	si->util_valid = (int)(written_block_count(sbi) >>
+						sbi->log_blocks_per_seg)
+		* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+		/ 2;
+	si->util_invalid = 50 - si->util_free - si->util_valid;
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
+		struct curseg_info *curseg = CURSEG_I(sbi, i);
+		si->curseg[i] = curseg->segno;
+		si->cursec[i] = curseg->segno / sbi->segs_per_sec;
+		si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+	}
+
+	for (i = 0; i < 2; i++) {
+		si->segment_count[i] = sbi->segment_count[i];
+		si->block_count[i] = sbi->block_count[i];
+	}
+
+	si->inplace_count = atomic_read(&sbi->inplace_count);
+}
+
+/*
+ * This function calculates BDF of every segments
+ */
+static void update_sit_info(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *si = F2FS_STAT(sbi);
+	unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
+	unsigned int segno, vblocks;
+	int ndirty = 0;
+
+	bimodal = 0;
+	total_vblocks = 0;
+	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+	hblks_per_sec = blks_per_sec / 2;
+	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+		dist = abs(vblocks - hblks_per_sec);
+		bimodal += dist * dist;
+
+		if (vblocks > 0 && vblocks < blks_per_sec) {
+			total_vblocks += vblocks;
+			ndirty++;
+		}
+	}
+	dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
+	si->bimodal = bimodal / dist;
+	if (si->dirty_count)
+		si->avg_vblocks = total_vblocks / ndirty;
+	else
+		si->avg_vblocks = 0;
+}
+
+/*
+ * This function calculates memory footprint.
+ */
+static void update_mem_info(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *si = F2FS_STAT(sbi);
+	unsigned npages;
+	int i;
+
+	if (si->base_mem)
+		goto get_cache;
+
+	si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+	si->base_mem += 2 * sizeof(struct f2fs_inode_info);
+	si->base_mem += sizeof(*sbi->ckpt);
+
+	/* build sm */
+	si->base_mem += sizeof(struct f2fs_sm_info);
+
+	/* build sit */
+	si->base_mem += sizeof(struct sit_info);
+	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
+	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+	si->base_mem += SIT_VBLOCK_MAP_SIZE;
+	if (sbi->segs_per_sec > 1)
+		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
+	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
+
+	/* build free segmap */
+	si->base_mem += sizeof(struct free_segmap_info);
+	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
+
+	/* build curseg */
+	si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
+	si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+
+	/* build dirty segmap */
+	si->base_mem += sizeof(struct dirty_seglist_info);
+	si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
+
+	/* build nm */
+	si->base_mem += sizeof(struct f2fs_nm_info);
+	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+
+get_cache:
+	si->cache_mem = 0;
+
+	/* build gc */
+	if (sbi->gc_thread)
+		si->cache_mem += sizeof(struct f2fs_gc_kthread);
+
+	/* build merge flush thread */
+	if (SM_I(sbi)->cmd_control_info)
+		si->cache_mem += sizeof(struct flush_cmd_control);
+
+	/* free nids */
+	si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+	si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
+	si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
+					sizeof(struct nat_entry_set);
+	si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
+	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
+	for (i = 0; i <= UPDATE_INO; i++)
+		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+	si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree);
+	si->cache_mem += atomic_read(&sbi->total_ext_node) *
+						sizeof(struct extent_node);
+
+	si->page_mem = 0;
+	npages = NODE_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
+	npages = META_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
+}
+
+static int stat_show(struct seq_file *s, void *v)
+{
+	struct f2fs_stat_info *si;
+	int i = 0;
+	int j;
+
+	mutex_lock(&f2fs_stat_mutex);
+	list_for_each_entry(si, &f2fs_stat_list, stat_list) {
+		char devname[BDEVNAME_SIZE];
+
+		update_general_status(si->sbi);
+
+		seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
+			bdevname(si->sbi->sb->s_bdev, devname), i++);
+		seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
+			   si->sit_area_segs, si->nat_area_segs);
+		seq_printf(s, "[SSA: %d] [MAIN: %d",
+			   si->ssa_area_segs, si->main_area_segs);
+		seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
+			   si->overp_segs, si->rsvd_segs);
+		seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
+			   si->utilization, si->valid_count);
+		seq_printf(s, "  - Node: %u (Inode: %u, ",
+			   si->valid_node_count, si->valid_inode_count);
+		seq_printf(s, "Other: %u)\n  - Data: %u\n",
+			   si->valid_node_count - si->valid_inode_count,
+			   si->valid_count - si->valid_node_count);
+		seq_printf(s, "  - Inline_data Inode: %u\n",
+			   si->inline_inode);
+		seq_printf(s, "  - Inline_dentry Inode: %u\n",
+			   si->inline_dir);
+		seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
+			   si->main_area_segs, si->main_area_sections,
+			   si->main_area_zones);
+		seq_printf(s, "  - COLD  data: %d, %d, %d\n",
+			   si->curseg[CURSEG_COLD_DATA],
+			   si->cursec[CURSEG_COLD_DATA],
+			   si->curzone[CURSEG_COLD_DATA]);
+		seq_printf(s, "  - WARM  data: %d, %d, %d\n",
+			   si->curseg[CURSEG_WARM_DATA],
+			   si->cursec[CURSEG_WARM_DATA],
+			   si->curzone[CURSEG_WARM_DATA]);
+		seq_printf(s, "  - HOT   data: %d, %d, %d\n",
+			   si->curseg[CURSEG_HOT_DATA],
+			   si->cursec[CURSEG_HOT_DATA],
+			   si->curzone[CURSEG_HOT_DATA]);
+		seq_printf(s, "  - Dir   dnode: %d, %d, %d\n",
+			   si->curseg[CURSEG_HOT_NODE],
+			   si->cursec[CURSEG_HOT_NODE],
+			   si->curzone[CURSEG_HOT_NODE]);
+		seq_printf(s, "  - File   dnode: %d, %d, %d\n",
+			   si->curseg[CURSEG_WARM_NODE],
+			   si->cursec[CURSEG_WARM_NODE],
+			   si->curzone[CURSEG_WARM_NODE]);
+		seq_printf(s, "  - Indir nodes: %d, %d, %d\n",
+			   si->curseg[CURSEG_COLD_NODE],
+			   si->cursec[CURSEG_COLD_NODE],
+			   si->curzone[CURSEG_COLD_NODE]);
+		seq_printf(s, "\n  - Valid: %d\n  - Dirty: %d\n",
+			   si->main_area_segs - si->dirty_count -
+			   si->prefree_count - si->free_segs,
+			   si->dirty_count);
+		seq_printf(s, "  - Prefree: %d\n  - Free: %d (%d)\n\n",
+			   si->prefree_count, si->free_segs, si->free_secs);
+		seq_printf(s, "CP calls: %d\n", si->cp_count);
+		seq_printf(s, "GC calls: %d (BG: %d)\n",
+			   si->call_count, si->bg_gc);
+		seq_printf(s, "  - data segments : %d (%d)\n",
+				si->data_segs, si->bg_data_segs);
+		seq_printf(s, "  - node segments : %d (%d)\n",
+				si->node_segs, si->bg_node_segs);
+		seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
+				si->bg_data_blks + si->bg_node_blks);
+		seq_printf(s, "  - data blocks : %d (%d)\n", si->data_blks,
+				si->bg_data_blks);
+		seq_printf(s, "  - node blocks : %d (%d)\n", si->node_blks,
+				si->bg_node_blks);
+		seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
+			   si->hit_ext, si->total_ext);
+		seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree);
+		seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node);
+		seq_puts(s, "\nBalancing F2FS Async:\n");
+		seq_printf(s, "  - inmem: %4d, wb: %4d\n",
+			   si->inmem_pages, si->wb_pages);
+		seq_printf(s, "  - nodes: %4d in %4d\n",
+			   si->ndirty_node, si->node_pages);
+		seq_printf(s, "  - dents: %4d in dirs:%4d\n",
+			   si->ndirty_dent, si->ndirty_dirs);
+		seq_printf(s, "  - meta: %4d in %4d\n",
+			   si->ndirty_meta, si->meta_pages);
+		seq_printf(s, "  - NATs: %9d/%9d\n  - SITs: %9d/%9d\n",
+			   si->dirty_nats, si->nats, si->dirty_sits, si->sits);
+		seq_printf(s, "  - free_nids: %9d\n",
+			   si->fnids);
+		seq_puts(s, "\nDistribution of User Blocks:");
+		seq_puts(s, " [ valid | invalid | free ]\n");
+		seq_puts(s, "  [");
+
+		for (j = 0; j < si->util_valid; j++)
+			seq_putc(s, '-');
+		seq_putc(s, '|');
+
+		for (j = 0; j < si->util_invalid; j++)
+			seq_putc(s, '-');
+		seq_putc(s, '|');
+
+		for (j = 0; j < si->util_free; j++)
+			seq_putc(s, '-');
+		seq_puts(s, "]\n\n");
+		seq_printf(s, "IPU: %u blocks\n", si->inplace_count);
+		seq_printf(s, "SSR: %u blocks in %u segments\n",
+			   si->block_count[SSR], si->segment_count[SSR]);
+		seq_printf(s, "LFS: %u blocks in %u segments\n",
+			   si->block_count[LFS], si->segment_count[LFS]);
+
+		/* segment usage info */
+		update_sit_info(si->sbi);
+		seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n",
+			   si->bimodal, si->avg_vblocks);
+
+		/* memory footprint */
+		update_mem_info(si->sbi);
+		seq_printf(s, "\nMemory: %u KB\n",
+			(si->base_mem + si->cache_mem + si->page_mem) >> 10);
+		seq_printf(s, "  - static: %u KB\n",
+				si->base_mem >> 10);
+		seq_printf(s, "  - cached: %u KB\n",
+				si->cache_mem >> 10);
+		seq_printf(s, "  - paged : %u KB\n",
+				si->page_mem >> 10);
+	}
+	mutex_unlock(&f2fs_stat_mutex);
+	return 0;
+}
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, stat_show, inode->i_private);
+}
+
+static const struct file_operations stat_fops = {
+	.open = stat_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+int f2fs_build_stats(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+	struct f2fs_stat_info *si;
+
+	si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
+	if (!si)
+		return -ENOMEM;
+
+	si->all_area_segs = le32_to_cpu(raw_super->segment_count);
+	si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
+	si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
+	si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa);
+	si->main_area_segs = le32_to_cpu(raw_super->segment_count_main);
+	si->main_area_sections = le32_to_cpu(raw_super->section_count);
+	si->main_area_zones = si->main_area_sections /
+				le32_to_cpu(raw_super->secs_per_zone);
+	si->sbi = sbi;
+	sbi->stat_info = si;
+
+	atomic_set(&sbi->inline_inode, 0);
+	atomic_set(&sbi->inline_dir, 0);
+	atomic_set(&sbi->inplace_count, 0);
+
+	mutex_lock(&f2fs_stat_mutex);
+	list_add_tail(&si->stat_list, &f2fs_stat_list);
+	mutex_unlock(&f2fs_stat_mutex);
+
+	return 0;
+}
+
+void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *si = F2FS_STAT(sbi);
+
+	mutex_lock(&f2fs_stat_mutex);
+	list_del(&si->stat_list);
+	mutex_unlock(&f2fs_stat_mutex);
+
+	kfree(si);
+}
+
+void __init f2fs_create_root_stats(void)
+{
+	struct dentry *file;
+
+	f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
+	if (!f2fs_debugfs_root)
+		return;
+
+	file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
+			NULL, &stat_fops);
+	if (!file) {
+		debugfs_remove(f2fs_debugfs_root);
+		f2fs_debugfs_root = NULL;
+	}
+}
+
+void f2fs_destroy_root_stats(void)
+{
+	if (!f2fs_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(f2fs_debugfs_root);
+	f2fs_debugfs_root = NULL;
+}
diff --git a/kernel/fs/f2fs/dir.c b/kernel/fs/f2fs/dir.c
new file mode 100644
index 000000000..3a3302ab7
--- /dev/null
+++ b/kernel/fs/f2fs/dir.c
@@ -0,0 +1,811 @@
+/*
+ * fs/f2fs/dir.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "node.h"
+#include "acl.h"
+#include "xattr.h"
+
+static unsigned long dir_blocks(struct inode *inode)
+{
+	return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
+							>> PAGE_CACHE_SHIFT;
+}
+
+static unsigned int dir_buckets(unsigned int level, int dir_level)
+{
+	if (level + dir_level < MAX_DIR_HASH_DEPTH / 2)
+		return 1 << (level + dir_level);
+	else
+		return MAX_DIR_BUCKETS;
+}
+
+static unsigned int bucket_blocks(unsigned int level)
+{
+	if (level < MAX_DIR_HASH_DEPTH / 2)
+		return 2;
+	else
+		return 4;
+}
+
+unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
+	[F2FS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[F2FS_FT_REG_FILE]	= DT_REG,
+	[F2FS_FT_DIR]		= DT_DIR,
+	[F2FS_FT_CHRDEV]	= DT_CHR,
+	[F2FS_FT_BLKDEV]	= DT_BLK,
+	[F2FS_FT_FIFO]		= DT_FIFO,
+	[F2FS_FT_SOCK]		= DT_SOCK,
+	[F2FS_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= F2FS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= F2FS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= F2FS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= F2FS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= F2FS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= F2FS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= F2FS_FT_SYMLINK,
+};
+
+void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
+{
+	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static unsigned long dir_block_index(unsigned int level,
+				int dir_level, unsigned int idx)
+{
+	unsigned long i;
+	unsigned long bidx = 0;
+
+	for (i = 0; i < level; i++)
+		bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
+	bidx += idx * bucket_blocks(level);
+	return bidx;
+}
+
+static bool early_match_name(size_t namelen, f2fs_hash_t namehash,
+				struct f2fs_dir_entry *de)
+{
+	if (le16_to_cpu(de->name_len) != namelen)
+		return false;
+
+	if (de->hash_code != namehash)
+		return false;
+
+	return true;
+}
+
+static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
+				struct qstr *name, int *max_slots,
+				struct page **res_page)
+{
+	struct f2fs_dentry_block *dentry_blk;
+	struct f2fs_dir_entry *de;
+	struct f2fs_dentry_ptr d;
+
+	dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
+
+	make_dentry_ptr(&d, (void *)dentry_blk, 1);
+	de = find_target_dentry(name, max_slots, &d);
+
+	if (de)
+		*res_page = dentry_page;
+	else
+		kunmap(dentry_page);
+
+	/*
+	 * For the most part, it should be a bug when name_len is zero.
+	 * We stop here for figuring out where the bugs has occurred.
+	 */
+	f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0);
+	return de;
+}
+
+struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots,
+						struct f2fs_dentry_ptr *d)
+{
+	struct f2fs_dir_entry *de;
+	unsigned long bit_pos = 0;
+	f2fs_hash_t namehash = f2fs_dentry_hash(name);
+	int max_len = 0;
+
+	if (max_slots)
+		*max_slots = 0;
+	while (bit_pos < d->max) {
+		if (!test_bit_le(bit_pos, d->bitmap)) {
+			bit_pos++;
+			max_len++;
+			continue;
+		}
+
+		de = &d->dentry[bit_pos];
+		if (early_match_name(name->len, namehash, de) &&
+			!memcmp(d->filename[bit_pos], name->name, name->len))
+			goto found;
+
+		if (max_slots && max_len > *max_slots)
+			*max_slots = max_len;
+		max_len = 0;
+
+		/* remain bug on condition */
+		if (unlikely(!de->name_len))
+			d->max = -1;
+
+		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+	}
+
+	de = NULL;
+found:
+	if (max_slots && max_len > *max_slots)
+		*max_slots = max_len;
+	return de;
+}
+
+static struct f2fs_dir_entry *find_in_level(struct inode *dir,
+			unsigned int level, struct qstr *name,
+			f2fs_hash_t namehash, struct page **res_page)
+{
+	int s = GET_DENTRY_SLOTS(name->len);
+	unsigned int nbucket, nblock;
+	unsigned int bidx, end_block;
+	struct page *dentry_page;
+	struct f2fs_dir_entry *de = NULL;
+	bool room = false;
+	int max_slots;
+
+	f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
+
+	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
+	nblock = bucket_blocks(level);
+
+	bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+					le32_to_cpu(namehash) % nbucket);
+	end_block = bidx + nblock;
+
+	for (; bidx < end_block; bidx++) {
+		/* no need to allocate new dentry pages to all the indices */
+		dentry_page = find_data_page(dir, bidx, true);
+		if (IS_ERR(dentry_page)) {
+			room = true;
+			continue;
+		}
+
+		de = find_in_block(dentry_page, name, &max_slots, res_page);
+		if (de)
+			break;
+
+		if (max_slots >= s)
+			room = true;
+		f2fs_put_page(dentry_page, 0);
+	}
+
+	if (!de && room && F2FS_I(dir)->chash != namehash) {
+		F2FS_I(dir)->chash = namehash;
+		F2FS_I(dir)->clevel = level;
+	}
+
+	return de;
+}
+
+/*
+ * Find an entry in the specified directory with the wanted name.
+ * It returns the page where the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+			struct qstr *child, struct page **res_page)
+{
+	unsigned long npages = dir_blocks(dir);
+	struct f2fs_dir_entry *de = NULL;
+	f2fs_hash_t name_hash;
+	unsigned int max_depth;
+	unsigned int level;
+
+	*res_page = NULL;
+
+	if (f2fs_has_inline_dentry(dir))
+		return find_in_inline_dir(dir, child, res_page);
+
+	if (npages == 0)
+		return NULL;
+
+	name_hash = f2fs_dentry_hash(child);
+	max_depth = F2FS_I(dir)->i_current_depth;
+
+	for (level = 0; level < max_depth; level++) {
+		de = find_in_level(dir, level, child, name_hash, res_page);
+		if (de)
+			break;
+	}
+	if (!de && F2FS_I(dir)->chash != name_hash) {
+		F2FS_I(dir)->chash = name_hash;
+		F2FS_I(dir)->clevel = level - 1;
+	}
+	return de;
+}
+
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
+{
+	struct page *page;
+	struct f2fs_dir_entry *de;
+	struct f2fs_dentry_block *dentry_blk;
+
+	if (f2fs_has_inline_dentry(dir))
+		return f2fs_parent_inline_dir(dir, p);
+
+	page = get_lock_data_page(dir, 0);
+	if (IS_ERR(page))
+		return NULL;
+
+	dentry_blk = kmap(page);
+	de = &dentry_blk->dentry[1];
+	*p = page;
+	unlock_page(page);
+	return de;
+}
+
+ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
+{
+	ino_t res = 0;
+	struct f2fs_dir_entry *de;
+	struct page *page;
+
+	de = f2fs_find_entry(dir, qstr, &page);
+	if (de) {
+		res = le32_to_cpu(de->ino);
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+	}
+
+	return res;
+}
+
+void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
+		struct page *page, struct inode *inode)
+{
+	enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
+	lock_page(page);
+	f2fs_wait_on_page_writeback(page, type);
+	de->ino = cpu_to_le32(inode->i_ino);
+	set_de_type(de, inode->i_mode);
+	f2fs_dentry_kunmap(dir, page);
+	set_page_dirty(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+
+	f2fs_put_page(page, 1);
+}
+
+static void init_dent_inode(const struct qstr *name, struct page *ipage)
+{
+	struct f2fs_inode *ri;
+
+	f2fs_wait_on_page_writeback(ipage, NODE);
+
+	/* copy name info. to this inode page */
+	ri = F2FS_INODE(ipage);
+	ri->i_namelen = cpu_to_le32(name->len);
+	memcpy(ri->i_name, name->name, name->len);
+	set_page_dirty(ipage);
+}
+
+int update_dent_inode(struct inode *inode, const struct qstr *name)
+{
+	struct page *page;
+
+	page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	init_dent_inode(name, page);
+	f2fs_put_page(page, 1);
+
+	return 0;
+}
+
+void do_make_empty_dir(struct inode *inode, struct inode *parent,
+					struct f2fs_dentry_ptr *d)
+{
+	struct f2fs_dir_entry *de;
+
+	de = &d->dentry[0];
+	de->name_len = cpu_to_le16(1);
+	de->hash_code = 0;
+	de->ino = cpu_to_le32(inode->i_ino);
+	memcpy(d->filename[0], ".", 1);
+	set_de_type(de, inode->i_mode);
+
+	de = &d->dentry[1];
+	de->hash_code = 0;
+	de->name_len = cpu_to_le16(2);
+	de->ino = cpu_to_le32(parent->i_ino);
+	memcpy(d->filename[1], "..", 2);
+	set_de_type(de, parent->i_mode);
+
+	test_and_set_bit_le(0, (void *)d->bitmap);
+	test_and_set_bit_le(1, (void *)d->bitmap);
+}
+
+static int make_empty_dir(struct inode *inode,
+		struct inode *parent, struct page *page)
+{
+	struct page *dentry_page;
+	struct f2fs_dentry_block *dentry_blk;
+	struct f2fs_dentry_ptr d;
+
+	if (f2fs_has_inline_dentry(inode))
+		return make_empty_inline_dir(inode, parent, page);
+
+	dentry_page = get_new_data_page(inode, page, 0, true);
+	if (IS_ERR(dentry_page))
+		return PTR_ERR(dentry_page);
+
+	dentry_blk = kmap_atomic(dentry_page);
+
+	make_dentry_ptr(&d, (void *)dentry_blk, 1);
+	do_make_empty_dir(inode, parent, &d);
+
+	kunmap_atomic(dentry_blk);
+
+	set_page_dirty(dentry_page);
+	f2fs_put_page(dentry_page, 1);
+	return 0;
+}
+
+struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
+			const struct qstr *name, struct page *dpage)
+{
+	struct page *page;
+	int err;
+
+	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+		page = new_inode_page(inode);
+		if (IS_ERR(page))
+			return page;
+
+		if (S_ISDIR(inode->i_mode)) {
+			err = make_empty_dir(inode, dir, page);
+			if (err)
+				goto error;
+		}
+
+		err = f2fs_init_acl(inode, dir, page, dpage);
+		if (err)
+			goto put_error;
+
+		err = f2fs_init_security(inode, dir, name, page);
+		if (err)
+			goto put_error;
+	} else {
+		page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
+		if (IS_ERR(page))
+			return page;
+
+		set_cold_node(inode, page);
+	}
+
+	if (name)
+		init_dent_inode(name, page);
+
+	/*
+	 * This file should be checkpointed during fsync.
+	 * We lost i_pino from now on.
+	 */
+	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+		file_lost_pino(inode);
+		/*
+		 * If link the tmpfile to alias through linkat path,
+		 * we should remove this inode from orphan list.
+		 */
+		if (inode->i_nlink == 0)
+			remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
+		inc_nlink(inode);
+	}
+	return page;
+
+put_error:
+	f2fs_put_page(page, 1);
+error:
+	/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
+	truncate_inode_pages(&inode->i_data, 0);
+	truncate_blocks(inode, 0, false);
+	remove_dirty_dir_inode(inode);
+	remove_inode_page(inode);
+	return ERR_PTR(err);
+}
+
+void update_parent_metadata(struct inode *dir, struct inode *inode,
+						unsigned int current_depth)
+{
+	if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+		if (S_ISDIR(inode->i_mode)) {
+			inc_nlink(dir);
+			set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+		}
+		clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+	}
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+
+	if (F2FS_I(dir)->i_current_depth != current_depth) {
+		F2FS_I(dir)->i_current_depth = current_depth;
+		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	}
+
+	if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
+		clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+}
+
+int room_for_filename(const void *bitmap, int slots, int max_slots)
+{
+	int bit_start = 0;
+	int zero_start, zero_end;
+next:
+	zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start);
+	if (zero_start >= max_slots)
+		return max_slots;
+
+	zero_end = find_next_bit_le(bitmap, max_slots, zero_start);
+	if (zero_end - zero_start >= slots)
+		return zero_start;
+
+	bit_start = zero_end + 1;
+
+	if (zero_end + 1 >= max_slots)
+		return max_slots;
+	goto next;
+}
+
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
+				const struct qstr *name, f2fs_hash_t name_hash,
+				unsigned int bit_pos)
+{
+	struct f2fs_dir_entry *de;
+	int slots = GET_DENTRY_SLOTS(name->len);
+	int i;
+
+	de = &d->dentry[bit_pos];
+	de->hash_code = name_hash;
+	de->name_len = cpu_to_le16(name->len);
+	memcpy(d->filename[bit_pos], name->name, name->len);
+	de->ino = cpu_to_le32(ino);
+	set_de_type(de, mode);
+	for (i = 0; i < slots; i++)
+		test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
+}
+
+/*
+ * Caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ */
+int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+				struct inode *inode, nid_t ino, umode_t mode)
+{
+	unsigned int bit_pos;
+	unsigned int level;
+	unsigned int current_depth;
+	unsigned long bidx, block;
+	f2fs_hash_t dentry_hash;
+	unsigned int nbucket, nblock;
+	size_t namelen = name->len;
+	struct page *dentry_page = NULL;
+	struct f2fs_dentry_block *dentry_blk = NULL;
+	struct f2fs_dentry_ptr d;
+	int slots = GET_DENTRY_SLOTS(namelen);
+	struct page *page = NULL;
+	int err = 0;
+
+	if (f2fs_has_inline_dentry(dir)) {
+		err = f2fs_add_inline_entry(dir, name, inode, ino, mode);
+		if (!err || err != -EAGAIN)
+			return err;
+		else
+			err = 0;
+	}
+
+	dentry_hash = f2fs_dentry_hash(name);
+	level = 0;
+	current_depth = F2FS_I(dir)->i_current_depth;
+	if (F2FS_I(dir)->chash == dentry_hash) {
+		level = F2FS_I(dir)->clevel;
+		F2FS_I(dir)->chash = 0;
+	}
+
+start:
+	if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
+		return -ENOSPC;
+
+	/* Increase the depth, if required */
+	if (level == current_depth)
+		++current_depth;
+
+	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
+	nblock = bucket_blocks(level);
+
+	bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+				(le32_to_cpu(dentry_hash) % nbucket));
+
+	for (block = bidx; block <= (bidx + nblock - 1); block++) {
+		dentry_page = get_new_data_page(dir, NULL, block, true);
+		if (IS_ERR(dentry_page))
+			return PTR_ERR(dentry_page);
+
+		dentry_blk = kmap(dentry_page);
+		bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
+						slots, NR_DENTRY_IN_BLOCK);
+		if (bit_pos < NR_DENTRY_IN_BLOCK)
+			goto add_dentry;
+
+		kunmap(dentry_page);
+		f2fs_put_page(dentry_page, 1);
+	}
+
+	/* Move to next level to find the empty slot for new dentry */
+	++level;
+	goto start;
+add_dentry:
+	f2fs_wait_on_page_writeback(dentry_page, DATA);
+
+	if (inode) {
+		down_write(&F2FS_I(inode)->i_sem);
+		page = init_inode_metadata(inode, dir, name, NULL);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto fail;
+		}
+	}
+
+	make_dentry_ptr(&d, (void *)dentry_blk, 1);
+	f2fs_update_dentry(ino, mode, &d, name, dentry_hash, bit_pos);
+
+	set_page_dirty(dentry_page);
+
+	if (inode) {
+		/* we don't need to mark_inode_dirty now */
+		F2FS_I(inode)->i_pino = dir->i_ino;
+		update_inode(inode, page);
+		f2fs_put_page(page, 1);
+	}
+
+	update_parent_metadata(dir, inode, current_depth);
+fail:
+	if (inode)
+		up_write(&F2FS_I(inode)->i_sem);
+
+	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+		update_inode_page(dir);
+		clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	}
+	kunmap(dentry_page);
+	f2fs_put_page(dentry_page, 1);
+	return err;
+}
+
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
+{
+	struct page *page;
+	int err = 0;
+
+	down_write(&F2FS_I(inode)->i_sem);
+	page = init_inode_metadata(inode, dir, NULL, NULL);
+	if (IS_ERR(page)) {
+		err = PTR_ERR(page);
+		goto fail;
+	}
+	/* we don't need to mark_inode_dirty now */
+	update_inode(inode, page);
+	f2fs_put_page(page, 1);
+
+	clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+fail:
+	up_write(&F2FS_I(inode)->i_sem);
+	return err;
+}
+
+void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+
+	down_write(&F2FS_I(inode)->i_sem);
+
+	if (S_ISDIR(inode->i_mode)) {
+		drop_nlink(dir);
+		if (page)
+			update_inode(dir, page);
+		else
+			update_inode_page(dir);
+	}
+	inode->i_ctime = CURRENT_TIME;
+
+	drop_nlink(inode);
+	if (S_ISDIR(inode->i_mode)) {
+		drop_nlink(inode);
+		i_size_write(inode, 0);
+	}
+	up_write(&F2FS_I(inode)->i_sem);
+	update_inode_page(inode);
+
+	if (inode->i_nlink == 0)
+		add_orphan_inode(sbi, inode->i_ino);
+	else
+		release_orphan_inode(sbi);
+}
+
+/*
+ * It only removes the dentry from the dentry page, corresponding name
+ * entry in name page does not need to be touched during deletion.
+ */
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+					struct inode *dir, struct inode *inode)
+{
+	struct	f2fs_dentry_block *dentry_blk;
+	unsigned int bit_pos;
+	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+	int i;
+
+	if (f2fs_has_inline_dentry(dir))
+		return f2fs_delete_inline_entry(dentry, page, dir, inode);
+
+	lock_page(page);
+	f2fs_wait_on_page_writeback(page, DATA);
+
+	dentry_blk = page_address(page);
+	bit_pos = dentry - dentry_blk->dentry;
+	for (i = 0; i < slots; i++)
+		clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+	/* Let's check and deallocate this dentry page */
+	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+			NR_DENTRY_IN_BLOCK,
+			0);
+	kunmap(page); /* kunmap - pair of f2fs_find_entry */
+	set_page_dirty(page);
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+	if (inode)
+		f2fs_drop_nlink(dir, inode, NULL);
+
+	if (bit_pos == NR_DENTRY_IN_BLOCK) {
+		truncate_hole(dir, page->index, page->index + 1);
+		clear_page_dirty_for_io(page);
+		ClearPagePrivate(page);
+		ClearPageUptodate(page);
+		inode_dec_dirty_pages(dir);
+	}
+	f2fs_put_page(page, 1);
+}
+
+bool f2fs_empty_dir(struct inode *dir)
+{
+	unsigned long bidx;
+	struct page *dentry_page;
+	unsigned int bit_pos;
+	struct f2fs_dentry_block *dentry_blk;
+	unsigned long nblock = dir_blocks(dir);
+
+	if (f2fs_has_inline_dentry(dir))
+		return f2fs_empty_inline_dir(dir);
+
+	for (bidx = 0; bidx < nblock; bidx++) {
+		dentry_page = get_lock_data_page(dir, bidx);
+		if (IS_ERR(dentry_page)) {
+			if (PTR_ERR(dentry_page) == -ENOENT)
+				continue;
+			else
+				return false;
+		}
+
+		dentry_blk = kmap_atomic(dentry_page);
+		if (bidx == 0)
+			bit_pos = 2;
+		else
+			bit_pos = 0;
+		bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+						NR_DENTRY_IN_BLOCK,
+						bit_pos);
+		kunmap_atomic(dentry_blk);
+
+		f2fs_put_page(dentry_page, 1);
+
+		if (bit_pos < NR_DENTRY_IN_BLOCK)
+			return false;
+	}
+	return true;
+}
+
+bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
+						unsigned int start_pos)
+{
+	unsigned char d_type = DT_UNKNOWN;
+	unsigned int bit_pos;
+	struct f2fs_dir_entry *de = NULL;
+
+	bit_pos = ((unsigned long)ctx->pos % d->max);
+
+	while (bit_pos < d->max) {
+		bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos);
+		if (bit_pos >= d->max)
+			break;
+
+		de = &d->dentry[bit_pos];
+		if (de->file_type < F2FS_FT_MAX)
+			d_type = f2fs_filetype_table[de->file_type];
+		else
+			d_type = DT_UNKNOWN;
+		if (!dir_emit(ctx, d->filename[bit_pos],
+					le16_to_cpu(de->name_len),
+					le32_to_cpu(de->ino), d_type))
+			return true;
+
+		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+		ctx->pos = start_pos + bit_pos;
+	}
+	return false;
+}
+
+static int f2fs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	unsigned long npages = dir_blocks(inode);
+	struct f2fs_dentry_block *dentry_blk = NULL;
+	struct page *dentry_page = NULL;
+	struct file_ra_state *ra = &file->f_ra;
+	unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
+	struct f2fs_dentry_ptr d;
+
+	if (f2fs_has_inline_dentry(inode))
+		return f2fs_read_inline_dir(file, ctx);
+
+	/* readahead for multi pages of dir */
+	if (npages - n > 1 && !ra_has_index(ra, n))
+		page_cache_sync_readahead(inode->i_mapping, ra, file, n,
+				min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
+
+	for (; n < npages; n++) {
+		dentry_page = get_lock_data_page(inode, n);
+		if (IS_ERR(dentry_page))
+			continue;
+
+		dentry_blk = kmap(dentry_page);
+
+		make_dentry_ptr(&d, (void *)dentry_blk, 1);
+
+		if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK))
+			goto stop;
+
+		ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+		kunmap(dentry_page);
+		f2fs_put_page(dentry_page, 1);
+		dentry_page = NULL;
+	}
+stop:
+	if (dentry_page && !IS_ERR(dentry_page)) {
+		kunmap(dentry_page);
+		f2fs_put_page(dentry_page, 1);
+	}
+
+	return 0;
+}
+
+const struct file_operations f2fs_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= f2fs_readdir,
+	.fsync		= f2fs_sync_file,
+	.unlocked_ioctl	= f2fs_ioctl,
+};
diff --git a/kernel/fs/f2fs/f2fs.h b/kernel/fs/f2fs/f2fs.h
new file mode 100644
index 000000000..8de34ab6d
--- /dev/null
+++ b/kernel/fs/f2fs/f2fs.h
@@ -0,0 +1,1814 @@
+/*
+ * fs/f2fs/f2fs.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _LINUX_F2FS_H
+#define _LINUX_F2FS_H
+
+#include <linux/types.h>
+#include <linux/page-flags.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/magic.h>
+#include <linux/kobject.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_F2FS_CHECK_FS
+#define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
+#define f2fs_down_write(x, y)	down_write_nest_lock(x, y)
+#else
+#define f2fs_bug_on(sbi, condition)					\
+	do {								\
+		if (unlikely(condition)) {				\
+			WARN_ON(1);					\
+			set_sbi_flag(sbi, SBI_NEED_FSCK);		\
+		}							\
+	} while (0)
+#define f2fs_down_write(x, y)	down_write(x)
+#endif
+
+/*
+ * For mount options
+ */
+#define F2FS_MOUNT_BG_GC		0x00000001
+#define F2FS_MOUNT_DISABLE_ROLL_FORWARD	0x00000002
+#define F2FS_MOUNT_DISCARD		0x00000004
+#define F2FS_MOUNT_NOHEAP		0x00000008
+#define F2FS_MOUNT_XATTR_USER		0x00000010
+#define F2FS_MOUNT_POSIX_ACL		0x00000020
+#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY	0x00000040
+#define F2FS_MOUNT_INLINE_XATTR		0x00000080
+#define F2FS_MOUNT_INLINE_DATA		0x00000100
+#define F2FS_MOUNT_INLINE_DENTRY	0x00000200
+#define F2FS_MOUNT_FLUSH_MERGE		0x00000400
+#define F2FS_MOUNT_NOBARRIER		0x00000800
+#define F2FS_MOUNT_FASTBOOT		0x00001000
+#define F2FS_MOUNT_EXTENT_CACHE		0x00002000
+
+#define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option)	(sbi->mount_opt.opt & F2FS_MOUNT_##option)
+
+#define ver_after(a, b)	(typecheck(unsigned long long, a) &&		\
+		typecheck(unsigned long long, b) &&			\
+		((long long)((a) - (b)) > 0))
+
+typedef u32 block_t;	/*
+			 * should not change u32, since it is the on-disk block
+			 * address format, __le32.
+			 */
+typedef u32 nid_t;
+
+struct f2fs_mount_info {
+	unsigned int	opt;
+};
+
+#define CRCPOLY_LE 0xedb88320
+
+static inline __u32 f2fs_crc32(void *buf, size_t len)
+{
+	unsigned char *p = (unsigned char *)buf;
+	__u32 crc = F2FS_SUPER_MAGIC;
+	int i;
+
+	while (len--) {
+		crc ^= *p++;
+		for (i = 0; i < 8; i++)
+			crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
+	}
+	return crc;
+}
+
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
+{
+	return f2fs_crc32(buf, buf_size) == blk_crc;
+}
+
+/*
+ * For checkpoint manager
+ */
+enum {
+	NAT_BITMAP,
+	SIT_BITMAP
+};
+
+enum {
+	CP_UMOUNT,
+	CP_FASTBOOT,
+	CP_SYNC,
+	CP_RECOVERY,
+	CP_DISCARD,
+};
+
+#define DEF_BATCHED_TRIM_SECTIONS	32
+#define BATCHED_TRIM_SEGMENTS(sbi)	\
+		(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+
+struct cp_control {
+	int reason;
+	__u64 trim_start;
+	__u64 trim_end;
+	__u64 trim_minlen;
+	__u64 trimmed;
+};
+
+/*
+ * For CP/NAT/SIT/SSA readahead
+ */
+enum {
+	META_CP,
+	META_NAT,
+	META_SIT,
+	META_SSA,
+	META_POR,
+};
+
+/* for the list of ino */
+enum {
+	ORPHAN_INO,		/* for orphan ino list */
+	APPEND_INO,		/* for append ino list */
+	UPDATE_INO,		/* for update ino list */
+	MAX_INO_ENTRY,		/* max. list */
+};
+
+struct ino_entry {
+	struct list_head list;	/* list head */
+	nid_t ino;		/* inode number */
+};
+
+/*
+ * for the list of directory inodes or gc inodes.
+ * NOTE: there are two slab users for this structure, if we add/modify/delete
+ * fields in structure for one of slab users, it may affect fields or size of
+ * other one, in this condition, it's better to split both of slab and related
+ * data structure.
+ */
+struct inode_entry {
+	struct list_head list;	/* list head */
+	struct inode *inode;	/* vfs inode pointer */
+};
+
+/* for the list of blockaddresses to be discarded */
+struct discard_entry {
+	struct list_head list;	/* list head */
+	block_t blkaddr;	/* block address to be discarded */
+	int len;		/* # of consecutive blocks of the discard */
+};
+
+/* for the list of fsync inodes, used only during recovery */
+struct fsync_inode_entry {
+	struct list_head list;	/* list head */
+	struct inode *inode;	/* vfs inode pointer */
+	block_t blkaddr;	/* block address locating the last fsync */
+	block_t last_dentry;	/* block address locating the last dentry */
+	block_t last_inode;	/* block address locating the last inode */
+};
+
+#define nats_in_cursum(sum)		(le16_to_cpu(sum->n_nats))
+#define sits_in_cursum(sum)		(le16_to_cpu(sum->n_sits))
+
+#define nat_in_journal(sum, i)		(sum->nat_j.entries[i].ne)
+#define nid_in_journal(sum, i)		(sum->nat_j.entries[i].nid)
+#define sit_in_journal(sum, i)		(sum->sit_j.entries[i].se)
+#define segno_in_journal(sum, i)	(sum->sit_j.entries[i].segno)
+
+#define MAX_NAT_JENTRIES(sum)	(NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
+#define MAX_SIT_JENTRIES(sum)	(SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+
+static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
+{
+	int before = nats_in_cursum(rs);
+	rs->n_nats = cpu_to_le16(before + i);
+	return before;
+}
+
+static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
+{
+	int before = sits_in_cursum(rs);
+	rs->n_sits = cpu_to_le16(before + i);
+	return before;
+}
+
+static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
+								int type)
+{
+	if (type == NAT_JOURNAL)
+		return size <= MAX_NAT_JENTRIES(sum);
+	return size <= MAX_SIT_JENTRIES(sum);
+}
+
+/*
+ * ioctl commands
+ */
+#define F2FS_IOC_GETFLAGS		FS_IOC_GETFLAGS
+#define F2FS_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define F2FS_IOC_GETVERSION		FS_IOC_GETVERSION
+
+#define F2FS_IOCTL_MAGIC		0xf5
+#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
+#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
+#define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
+#define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
+#define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
+
+/*
+ * should be same as XFS_IOC_GOINGDOWN.
+ * Flags for going down operation used by FS_IOC_GOINGDOWN
+ */
+#define F2FS_IOC_SHUTDOWN	_IOR('X', 125, __u32)	/* Shutdown */
+#define F2FS_GOING_DOWN_FULLSYNC	0x0	/* going down with full sync */
+#define F2FS_GOING_DOWN_METASYNC	0x1	/* going down with metadata */
+#define F2FS_GOING_DOWN_NOSYNC		0x2	/* going down */
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define F2FS_IOC32_GETFLAGS             FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
+#endif
+
+/*
+ * For INODE and NODE manager
+ */
+/* for directory operations */
+struct f2fs_dentry_ptr {
+	const void *bitmap;
+	struct f2fs_dir_entry *dentry;
+	__u8 (*filename)[F2FS_SLOT_LEN];
+	int max;
+};
+
+static inline void make_dentry_ptr(struct f2fs_dentry_ptr *d,
+					void *src, int type)
+{
+	if (type == 1) {
+		struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src;
+		d->max = NR_DENTRY_IN_BLOCK;
+		d->bitmap = &t->dentry_bitmap;
+		d->dentry = t->dentry;
+		d->filename = t->filename;
+	} else {
+		struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src;
+		d->max = NR_INLINE_DENTRY;
+		d->bitmap = &t->dentry_bitmap;
+		d->dentry = t->dentry;
+		d->filename = t->filename;
+	}
+}
+
+/*
+ * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1
+ * as its node offset to distinguish from index node blocks.
+ * But some bits are used to mark the node block.
+ */
+#define XATTR_NODE_OFFSET	((((unsigned int)-1) << OFFSET_BIT_SHIFT) \
+				>> OFFSET_BIT_SHIFT)
+enum {
+	ALLOC_NODE,			/* allocate a new node page if needed */
+	LOOKUP_NODE,			/* look up a node without readahead */
+	LOOKUP_NODE_RA,			/*
+					 * look up a node with readahead called
+					 * by get_data_block.
+					 */
+};
+
+#define F2FS_LINK_MAX		32000	/* maximum link count per file */
+
+#define MAX_DIR_RA_PAGES	4	/* maximum ra pages of dir */
+
+/* vector size for gang look-up from extent cache that consists of radix tree */
+#define EXT_TREE_VEC_SIZE	64
+
+/* for in-memory extent cache entry */
+#define F2FS_MIN_EXTENT_LEN	64	/* minimum extent length */
+
+/* number of extent info in extent cache we try to shrink */
+#define EXTENT_CACHE_SHRINK_NUMBER	128
+
+struct extent_info {
+	unsigned int fofs;		/* start offset in a file */
+	u32 blk;			/* start block address of the extent */
+	unsigned int len;		/* length of the extent */
+};
+
+struct extent_node {
+	struct rb_node rb_node;		/* rb node located in rb-tree */
+	struct list_head list;		/* node in global extent list of sbi */
+	struct extent_info ei;		/* extent info */
+};
+
+struct extent_tree {
+	nid_t ino;			/* inode number */
+	struct rb_root root;		/* root of extent info rb-tree */
+	struct extent_node *cached_en;	/* recently accessed extent node */
+	rwlock_t lock;			/* protect extent info rb-tree */
+	atomic_t refcount;		/* reference count of rb-tree */
+	unsigned int count;		/* # of extent node in rb-tree*/
+};
+
+/*
+ * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
+ */
+#define FADVISE_COLD_BIT	0x01
+#define FADVISE_LOST_PINO_BIT	0x02
+
+#define DEF_DIR_LEVEL		0
+
+struct f2fs_inode_info {
+	struct inode vfs_inode;		/* serve a vfs inode */
+	unsigned long i_flags;		/* keep an inode flags for ioctl */
+	unsigned char i_advise;		/* use to give file attribute hints */
+	unsigned char i_dir_level;	/* use for dentry level for large dir */
+	unsigned int i_current_depth;	/* use only in directory structure */
+	unsigned int i_pino;		/* parent inode number */
+	umode_t i_acl_mode;		/* keep file acl mode temporarily */
+
+	/* Use below internally in f2fs*/
+	unsigned long flags;		/* use to pass per-file flags */
+	struct rw_semaphore i_sem;	/* protect fi info */
+	atomic_t dirty_pages;		/* # of dirty pages */
+	f2fs_hash_t chash;		/* hash value of given file name */
+	unsigned int clevel;		/* maximum level of given file name */
+	nid_t i_xattr_nid;		/* node id that contains xattrs */
+	unsigned long long xattr_ver;	/* cp version of xattr modification */
+	struct extent_info ext;		/* in-memory extent cache entry */
+	rwlock_t ext_lock;		/* rwlock for single extent cache */
+	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
+
+	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
+	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
+	struct mutex inmem_lock;	/* lock for inmemory pages */
+};
+
+static inline void get_extent_info(struct extent_info *ext,
+					struct f2fs_extent i_ext)
+{
+	ext->fofs = le32_to_cpu(i_ext.fofs);
+	ext->blk = le32_to_cpu(i_ext.blk);
+	ext->len = le32_to_cpu(i_ext.len);
+}
+
+static inline void set_raw_extent(struct extent_info *ext,
+					struct f2fs_extent *i_ext)
+{
+	i_ext->fofs = cpu_to_le32(ext->fofs);
+	i_ext->blk = cpu_to_le32(ext->blk);
+	i_ext->len = cpu_to_le32(ext->len);
+}
+
+static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
+						u32 blk, unsigned int len)
+{
+	ei->fofs = fofs;
+	ei->blk = blk;
+	ei->len = len;
+}
+
+static inline bool __is_extent_same(struct extent_info *ei1,
+						struct extent_info *ei2)
+{
+	return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk &&
+						ei1->len == ei2->len);
+}
+
+static inline bool __is_extent_mergeable(struct extent_info *back,
+						struct extent_info *front)
+{
+	return (back->fofs + back->len == front->fofs &&
+			back->blk + back->len == front->blk);
+}
+
+static inline bool __is_back_mergeable(struct extent_info *cur,
+						struct extent_info *back)
+{
+	return __is_extent_mergeable(back, cur);
+}
+
+static inline bool __is_front_mergeable(struct extent_info *cur,
+						struct extent_info *front)
+{
+	return __is_extent_mergeable(cur, front);
+}
+
+struct f2fs_nm_info {
+	block_t nat_blkaddr;		/* base disk address of NAT */
+	nid_t max_nid;			/* maximum possible node ids */
+	nid_t available_nids;		/* maximum available node ids */
+	nid_t next_scan_nid;		/* the next nid to be scanned */
+	unsigned int ram_thresh;	/* control the memory footprint */
+
+	/* NAT cache management */
+	struct radix_tree_root nat_root;/* root of the nat entry cache */
+	struct radix_tree_root nat_set_root;/* root of the nat set cache */
+	struct rw_semaphore nat_tree_lock;	/* protect nat_tree_lock */
+	struct list_head nat_entries;	/* cached nat entry list (clean) */
+	unsigned int nat_cnt;		/* the # of cached nat entries */
+	unsigned int dirty_nat_cnt;	/* total num of nat entries in set */
+
+	/* free node ids management */
+	struct radix_tree_root free_nid_root;/* root of the free_nid cache */
+	struct list_head free_nid_list;	/* a list for free nids */
+	spinlock_t free_nid_list_lock;	/* protect free nid list */
+	unsigned int fcnt;		/* the number of free node id */
+	struct mutex build_lock;	/* lock for build free nids */
+
+	/* for checkpoint */
+	char *nat_bitmap;		/* NAT bitmap pointer */
+	int bitmap_size;		/* bitmap size */
+};
+
+/*
+ * this structure is used as one of function parameters.
+ * all the information are dedicated to a given direct node block determined
+ * by the data offset in a file.
+ */
+struct dnode_of_data {
+	struct inode *inode;		/* vfs inode pointer */
+	struct page *inode_page;	/* its inode page, NULL is possible */
+	struct page *node_page;		/* cached direct node page */
+	nid_t nid;			/* node id of the direct node block */
+	unsigned int ofs_in_node;	/* data offset in the node page */
+	bool inode_page_locked;		/* inode page is locked or not */
+	block_t	data_blkaddr;		/* block address of the node block */
+};
+
+static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
+		struct page *ipage, struct page *npage, nid_t nid)
+{
+	memset(dn, 0, sizeof(*dn));
+	dn->inode = inode;
+	dn->inode_page = ipage;
+	dn->node_page = npage;
+	dn->nid = nid;
+}
+
+/*
+ * For SIT manager
+ *
+ * By default, there are 6 active log areas across the whole main area.
+ * When considering hot and cold data separation to reduce cleaning overhead,
+ * we split 3 for data logs and 3 for node logs as hot, warm, and cold types,
+ * respectively.
+ * In the current design, you should not change the numbers intentionally.
+ * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6
+ * logs individually according to the underlying devices. (default: 6)
+ * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for
+ * data and 8 for node logs.
+ */
+#define	NR_CURSEG_DATA_TYPE	(3)
+#define NR_CURSEG_NODE_TYPE	(3)
+#define NR_CURSEG_TYPE	(NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
+
+enum {
+	CURSEG_HOT_DATA	= 0,	/* directory entry blocks */
+	CURSEG_WARM_DATA,	/* data blocks */
+	CURSEG_COLD_DATA,	/* multimedia or GCed data blocks */
+	CURSEG_HOT_NODE,	/* direct node blocks of directory files */
+	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
+	CURSEG_COLD_NODE,	/* indirect node blocks */
+	NO_CHECK_TYPE,
+	CURSEG_DIRECT_IO,	/* to use for the direct IO path */
+};
+
+struct flush_cmd {
+	struct completion wait;
+	struct llist_node llnode;
+	int ret;
+};
+
+struct flush_cmd_control {
+	struct task_struct *f2fs_issue_flush;	/* flush thread */
+	wait_queue_head_t flush_wait_queue;	/* waiting queue for wake-up */
+	struct llist_head issue_list;		/* list for command issue */
+	struct llist_node *dispatch_list;	/* list for command dispatch */
+};
+
+struct f2fs_sm_info {
+	struct sit_info *sit_info;		/* whole segment information */
+	struct free_segmap_info *free_info;	/* free segment information */
+	struct dirty_seglist_info *dirty_info;	/* dirty segment information */
+	struct curseg_info *curseg_array;	/* active segment information */
+
+	block_t seg0_blkaddr;		/* block address of 0'th segment */
+	block_t main_blkaddr;		/* start block address of main area */
+	block_t ssa_blkaddr;		/* start block address of SSA area */
+
+	unsigned int segment_count;	/* total # of segments */
+	unsigned int main_segments;	/* # of segments in main area */
+	unsigned int reserved_segments;	/* # of reserved segments */
+	unsigned int ovp_segments;	/* # of overprovision segments */
+
+	/* a threshold to reclaim prefree segments */
+	unsigned int rec_prefree_segments;
+
+	/* for small discard management */
+	struct list_head discard_list;		/* 4KB discard list */
+	int nr_discards;			/* # of discards in the list */
+	int max_discards;			/* max. discards to be issued */
+
+	/* for batched trimming */
+	unsigned int trim_sections;		/* # of sections to trim */
+
+	struct list_head sit_entry_set;	/* sit entry set list */
+
+	unsigned int ipu_policy;	/* in-place-update policy */
+	unsigned int min_ipu_util;	/* in-place-update threshold */
+	unsigned int min_fsync_blocks;	/* threshold for fsync */
+
+	/* for flush command control */
+	struct flush_cmd_control *cmd_control_info;
+
+};
+
+/*
+ * For superblock
+ */
+/*
+ * COUNT_TYPE for monitoring
+ *
+ * f2fs monitors the number of several block types such as on-writeback,
+ * dirty dentry blocks, dirty node blocks, and dirty meta blocks.
+ */
+enum count_type {
+	F2FS_WRITEBACK,
+	F2FS_DIRTY_DENTS,
+	F2FS_DIRTY_NODES,
+	F2FS_DIRTY_META,
+	F2FS_INMEM_PAGES,
+	NR_COUNT_TYPE,
+};
+
+/*
+ * The below are the page types of bios used in submit_bio().
+ * The available types are:
+ * DATA			User data pages. It operates as async mode.
+ * NODE			Node pages. It operates as async mode.
+ * META			FS metadata pages such as SIT, NAT, CP.
+ * NR_PAGE_TYPE		The number of page types.
+ * META_FLUSH		Make sure the previous pages are written
+ *			with waiting the bio's completion
+ * ...			Only can be used with META.
+ */
+#define PAGE_TYPE_OF_BIO(type)	((type) > META ? META : (type))
+enum page_type {
+	DATA,
+	NODE,
+	META,
+	NR_PAGE_TYPE,
+	META_FLUSH,
+	INMEM,		/* the below types are used by tracepoints only. */
+	INMEM_DROP,
+	IPU,
+	OPU,
+};
+
+struct f2fs_io_info {
+	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
+	int rw;			/* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+	block_t blk_addr;	/* block address to be written */
+};
+
+#define is_read_io(rw)	(((rw) & 1) == READ)
+struct f2fs_bio_info {
+	struct f2fs_sb_info *sbi;	/* f2fs superblock */
+	struct bio *bio;		/* bios to merge */
+	sector_t last_block_in_bio;	/* last block number */
+	struct f2fs_io_info fio;	/* store buffered io info. */
+	struct rw_semaphore io_rwsem;	/* blocking op for bio */
+};
+
+/* for inner inode cache management */
+struct inode_management {
+	struct radix_tree_root ino_root;	/* ino entry array */
+	spinlock_t ino_lock;			/* for ino entry lock */
+	struct list_head ino_list;		/* inode list head */
+	unsigned long ino_num;			/* number of entries */
+};
+
+/* For s_flag in struct f2fs_sb_info */
+enum {
+	SBI_IS_DIRTY,				/* dirty flag for checkpoint */
+	SBI_IS_CLOSE,				/* specify unmounting */
+	SBI_NEED_FSCK,				/* need fsck.f2fs to fix */
+	SBI_POR_DOING,				/* recovery is doing or not */
+};
+
+struct f2fs_sb_info {
+	struct super_block *sb;			/* pointer to VFS super block */
+	struct proc_dir_entry *s_proc;		/* proc entry */
+	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
+	struct f2fs_super_block *raw_super;	/* raw super block pointer */
+	int s_flag;				/* flags for sbi */
+
+	/* for node-related operations */
+	struct f2fs_nm_info *nm_info;		/* node manager */
+	struct inode *node_inode;		/* cache node blocks */
+
+	/* for segment-related operations */
+	struct f2fs_sm_info *sm_info;		/* segment manager */
+
+	/* for bio operations */
+	struct f2fs_bio_info read_io;			/* for read bios */
+	struct f2fs_bio_info write_io[NR_PAGE_TYPE];	/* for write bios */
+
+	/* for checkpoint */
+	struct f2fs_checkpoint *ckpt;		/* raw checkpoint pointer */
+	struct inode *meta_inode;		/* cache meta blocks */
+	struct mutex cp_mutex;			/* checkpoint procedure lock */
+	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
+	struct rw_semaphore node_write;		/* locking node writes */
+	struct mutex writepages;		/* mutex for writepages() */
+	wait_queue_head_t cp_wait;
+
+	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
+
+	/* for orphan inode, use 0'th array */
+	unsigned int max_orphans;		/* max orphan inodes */
+
+	/* for directory inode management */
+	struct list_head dir_inode_list;	/* dir inode list */
+	spinlock_t dir_inode_lock;		/* for dir inode list lock */
+
+	/* for extent tree cache */
+	struct radix_tree_root extent_tree_root;/* cache extent cache entries */
+	struct rw_semaphore extent_tree_lock;	/* locking extent radix tree */
+	struct list_head extent_list;		/* lru list for shrinker */
+	spinlock_t extent_lock;			/* locking extent lru list */
+	int total_ext_tree;			/* extent tree count */
+	atomic_t total_ext_node;		/* extent info count */
+
+	/* basic filesystem units */
+	unsigned int log_sectors_per_block;	/* log2 sectors per block */
+	unsigned int log_blocksize;		/* log2 block size */
+	unsigned int blocksize;			/* block size */
+	unsigned int root_ino_num;		/* root inode number*/
+	unsigned int node_ino_num;		/* node inode number*/
+	unsigned int meta_ino_num;		/* meta inode number*/
+	unsigned int log_blocks_per_seg;	/* log2 blocks per segment */
+	unsigned int blocks_per_seg;		/* blocks per segment */
+	unsigned int segs_per_sec;		/* segments per section */
+	unsigned int secs_per_zone;		/* sections per zone */
+	unsigned int total_sections;		/* total section count */
+	unsigned int total_node_count;		/* total node block count */
+	unsigned int total_valid_node_count;	/* valid node block count */
+	unsigned int total_valid_inode_count;	/* valid inode count */
+	int active_logs;			/* # of active logs */
+	int dir_level;				/* directory level */
+
+	block_t user_block_count;		/* # of user blocks */
+	block_t total_valid_block_count;	/* # of valid blocks */
+	block_t alloc_valid_block_count;	/* # of allocated blocks */
+	block_t last_valid_block_count;		/* for recovery */
+	u32 s_next_generation;			/* for NFS support */
+	atomic_t nr_pages[NR_COUNT_TYPE];	/* # of pages, see count_type */
+
+	struct f2fs_mount_info mount_opt;	/* mount options */
+
+	/* for cleaning operations */
+	struct mutex gc_mutex;			/* mutex for GC */
+	struct f2fs_gc_kthread	*gc_thread;	/* GC thread */
+	unsigned int cur_victim_sec;		/* current victim section num */
+
+	/* maximum # of trials to find a victim segment for SSR and GC */
+	unsigned int max_victim_search;
+
+	/*
+	 * for stat information.
+	 * one is for the LFS mode, and the other is for the SSR mode.
+	 */
+#ifdef CONFIG_F2FS_STAT_FS
+	struct f2fs_stat_info *stat_info;	/* FS status information */
+	unsigned int segment_count[2];		/* # of allocated segments */
+	unsigned int block_count[2];		/* # of allocated blocks */
+	atomic_t inplace_count;		/* # of inplace update */
+	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
+	atomic_t inline_inode;			/* # of inline_data inodes */
+	atomic_t inline_dir;			/* # of inline_dentry inodes */
+	int bg_gc;				/* background gc calls */
+	unsigned int n_dirty_dirs;		/* # of dir inodes */
+#endif
+	unsigned int last_victim[2];		/* last victim segment # */
+	spinlock_t stat_lock;			/* lock for stat operations */
+
+	/* For sysfs suppport */
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
+};
+
+/*
+ * Inline functions
+ */
+static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
+{
+	return container_of(inode, struct f2fs_inode_info, vfs_inode);
+}
+
+static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
+{
+	return F2FS_SB(inode->i_sb);
+}
+
+static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
+{
+	return F2FS_I_SB(mapping->host);
+}
+
+static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
+{
+	return F2FS_M_SB(page->mapping);
+}
+
+static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
+{
+	return (struct f2fs_super_block *)(sbi->raw_super);
+}
+
+static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
+{
+	return (struct f2fs_checkpoint *)(sbi->ckpt);
+}
+
+static inline struct f2fs_node *F2FS_NODE(struct page *page)
+{
+	return (struct f2fs_node *)page_address(page);
+}
+
+static inline struct f2fs_inode *F2FS_INODE(struct page *page)
+{
+	return &((struct f2fs_node *)page_address(page))->i;
+}
+
+static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
+{
+	return (struct f2fs_nm_info *)(sbi->nm_info);
+}
+
+static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi)
+{
+	return (struct f2fs_sm_info *)(sbi->sm_info);
+}
+
+static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi)
+{
+	return (struct sit_info *)(SM_I(sbi)->sit_info);
+}
+
+static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi)
+{
+	return (struct free_segmap_info *)(SM_I(sbi)->free_info);
+}
+
+static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
+{
+	return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
+}
+
+static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi)
+{
+	return sbi->meta_inode->i_mapping;
+}
+
+static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
+{
+	return sbi->node_inode->i_mapping;
+}
+
+static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
+{
+	return sbi->s_flag & (0x01 << type);
+}
+
+static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
+{
+	sbi->s_flag |= (0x01 << type);
+}
+
+static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
+{
+	sbi->s_flag &= ~(0x01 << type);
+}
+
+static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
+{
+	return le64_to_cpu(cp->checkpoint_ver);
+}
+
+static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+	unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+	return ckpt_flags & f;
+}
+
+static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+	unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+	ckpt_flags |= f;
+	cp->ckpt_flags = cpu_to_le32(ckpt_flags);
+}
+
+static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+	unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+	ckpt_flags &= (~f);
+	cp->ckpt_flags = cpu_to_le32(ckpt_flags);
+}
+
+static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
+{
+	down_read(&sbi->cp_rwsem);
+}
+
+static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
+{
+	up_read(&sbi->cp_rwsem);
+}
+
+static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
+{
+	f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
+}
+
+static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
+{
+	up_write(&sbi->cp_rwsem);
+}
+
+static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
+{
+	int reason = CP_SYNC;
+
+	if (test_opt(sbi, FASTBOOT))
+		reason = CP_FASTBOOT;
+	if (is_sbi_flag_set(sbi, SBI_IS_CLOSE))
+		reason = CP_UMOUNT;
+	return reason;
+}
+
+static inline bool __remain_node_summaries(int reason)
+{
+	return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+}
+
+static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
+{
+	return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+}
+
+/*
+ * Check whether the given nid is within node id range.
+ */
+static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	if (unlikely(nid < F2FS_ROOT_INO(sbi)))
+		return -EINVAL;
+	if (unlikely(nid >= NM_I(sbi)->max_nid))
+		return -EINVAL;
+	return 0;
+}
+
+#define F2FS_DEFAULT_ALLOCATED_BLOCKS	1
+
+/*
+ * Check whether the inode has blocks or not
+ */
+static inline int F2FS_HAS_BLOCKS(struct inode *inode)
+{
+	if (F2FS_I(inode)->i_xattr_nid)
+		return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1;
+	else
+		return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
+}
+
+static inline bool f2fs_has_xattr_block(unsigned int ofs)
+{
+	return ofs == XATTR_NODE_OFFSET;
+}
+
+static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
+				 struct inode *inode, blkcnt_t count)
+{
+	block_t	valid_block_count;
+
+	spin_lock(&sbi->stat_lock);
+	valid_block_count =
+		sbi->total_valid_block_count + (block_t)count;
+	if (unlikely(valid_block_count > sbi->user_block_count)) {
+		spin_unlock(&sbi->stat_lock);
+		return false;
+	}
+	inode->i_blocks += count;
+	sbi->total_valid_block_count = valid_block_count;
+	sbi->alloc_valid_block_count += (block_t)count;
+	spin_unlock(&sbi->stat_lock);
+	return true;
+}
+
+static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
+						struct inode *inode,
+						blkcnt_t count)
+{
+	spin_lock(&sbi->stat_lock);
+	f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
+	f2fs_bug_on(sbi, inode->i_blocks < count);
+	inode->i_blocks -= count;
+	sbi->total_valid_block_count -= (block_t)count;
+	spin_unlock(&sbi->stat_lock);
+}
+
+static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
+{
+	atomic_inc(&sbi->nr_pages[count_type]);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
+}
+
+static inline void inode_inc_dirty_pages(struct inode *inode)
+{
+	atomic_inc(&F2FS_I(inode)->dirty_pages);
+	if (S_ISDIR(inode->i_mode))
+		inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+}
+
+static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
+{
+	atomic_dec(&sbi->nr_pages[count_type]);
+}
+
+static inline void inode_dec_dirty_pages(struct inode *inode)
+{
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
+		return;
+
+	atomic_dec(&F2FS_I(inode)->dirty_pages);
+
+	if (S_ISDIR(inode->i_mode))
+		dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+}
+
+static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
+{
+	return atomic_read(&sbi->nr_pages[count_type]);
+}
+
+static inline int get_dirty_pages(struct inode *inode)
+{
+	return atomic_read(&F2FS_I(inode)->dirty_pages);
+}
+
+static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
+{
+	unsigned int pages_per_sec = sbi->segs_per_sec *
+					(1 << sbi->log_blocks_per_seg);
+	return ((get_pages(sbi, block_type) + pages_per_sec - 1)
+			>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+}
+
+static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
+{
+	return sbi->total_valid_block_count;
+}
+
+static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+
+	/* return NAT or SIT bitmap */
+	if (flag == NAT_BITMAP)
+		return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize);
+	else if (flag == SIT_BITMAP)
+		return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
+
+	return 0;
+}
+
+static inline block_t __cp_payload(struct f2fs_sb_info *sbi)
+{
+	return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+}
+
+static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	int offset;
+
+	if (__cp_payload(sbi) > 0) {
+		if (flag == NAT_BITMAP)
+			return &ckpt->sit_nat_version_bitmap;
+		else
+			return (unsigned char *)ckpt + F2FS_BLKSIZE;
+	} else {
+		offset = (flag == NAT_BITMAP) ?
+			le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
+		return &ckpt->sit_nat_version_bitmap + offset;
+	}
+}
+
+static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
+{
+	block_t start_addr;
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	unsigned long long ckpt_version = cur_cp_version(ckpt);
+
+	start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+
+	/*
+	 * odd numbered checkpoint should at cp segment 0
+	 * and even segment must be at cp segment 1
+	 */
+	if (!(ckpt_version & 1))
+		start_addr += sbi->blocks_per_seg;
+
+	return start_addr;
+}
+
+static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
+{
+	return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+
+static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
+						struct inode *inode)
+{
+	block_t	valid_block_count;
+	unsigned int valid_node_count;
+
+	spin_lock(&sbi->stat_lock);
+
+	valid_block_count = sbi->total_valid_block_count + 1;
+	if (unlikely(valid_block_count > sbi->user_block_count)) {
+		spin_unlock(&sbi->stat_lock);
+		return false;
+	}
+
+	valid_node_count = sbi->total_valid_node_count + 1;
+	if (unlikely(valid_node_count > sbi->total_node_count)) {
+		spin_unlock(&sbi->stat_lock);
+		return false;
+	}
+
+	if (inode)
+		inode->i_blocks++;
+
+	sbi->alloc_valid_block_count++;
+	sbi->total_valid_node_count++;
+	sbi->total_valid_block_count++;
+	spin_unlock(&sbi->stat_lock);
+
+	return true;
+}
+
+static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
+						struct inode *inode)
+{
+	spin_lock(&sbi->stat_lock);
+
+	f2fs_bug_on(sbi, !sbi->total_valid_block_count);
+	f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+	f2fs_bug_on(sbi, !inode->i_blocks);
+
+	inode->i_blocks--;
+	sbi->total_valid_node_count--;
+	sbi->total_valid_block_count--;
+
+	spin_unlock(&sbi->stat_lock);
+}
+
+static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
+{
+	return sbi->total_valid_node_count;
+}
+
+static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	spin_lock(&sbi->stat_lock);
+	f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
+	sbi->total_valid_inode_count++;
+	spin_unlock(&sbi->stat_lock);
+}
+
+static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	spin_lock(&sbi->stat_lock);
+	f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
+	sbi->total_valid_inode_count--;
+	spin_unlock(&sbi->stat_lock);
+}
+
+static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	return sbi->total_valid_inode_count;
+}
+
+static inline void f2fs_put_page(struct page *page, int unlock)
+{
+	if (!page)
+		return;
+
+	if (unlock) {
+		f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
+		unlock_page(page);
+	}
+	page_cache_release(page);
+}
+
+static inline void f2fs_put_dnode(struct dnode_of_data *dn)
+{
+	if (dn->node_page)
+		f2fs_put_page(dn->node_page, 1);
+	if (dn->inode_page && dn->node_page != dn->inode_page)
+		f2fs_put_page(dn->inode_page, 0);
+	dn->node_page = NULL;
+	dn->inode_page = NULL;
+}
+
+static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
+					size_t size)
+{
+	return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL);
+}
+
+static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
+						gfp_t flags)
+{
+	void *entry;
+retry:
+	entry = kmem_cache_alloc(cachep, flags);
+	if (!entry) {
+		cond_resched();
+		goto retry;
+	}
+
+	return entry;
+}
+
+static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
+				unsigned long index, void *item)
+{
+	while (radix_tree_insert(root, index, item))
+		cond_resched();
+}
+
+#define RAW_IS_INODE(p)	((p)->footer.nid == (p)->footer.ino)
+
+static inline bool IS_INODE(struct page *page)
+{
+	struct f2fs_node *p = F2FS_NODE(page);
+	return RAW_IS_INODE(p);
+}
+
+static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
+{
+	return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr;
+}
+
+static inline block_t datablock_addr(struct page *node_page,
+		unsigned int offset)
+{
+	struct f2fs_node *raw_node;
+	__le32 *addr_array;
+	raw_node = F2FS_NODE(node_page);
+	addr_array = blkaddr_in_node(raw_node);
+	return le32_to_cpu(addr_array[offset]);
+}
+
+static inline int f2fs_test_bit(unsigned int nr, char *addr)
+{
+	int mask;
+
+	addr += (nr >> 3);
+	mask = 1 << (7 - (nr & 0x07));
+	return mask & *addr;
+}
+
+static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr)
+{
+	int mask;
+	int ret;
+
+	addr += (nr >> 3);
+	mask = 1 << (7 - (nr & 0x07));
+	ret = mask & *addr;
+	*addr |= mask;
+	return ret;
+}
+
+static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr)
+{
+	int mask;
+	int ret;
+
+	addr += (nr >> 3);
+	mask = 1 << (7 - (nr & 0x07));
+	ret = mask & *addr;
+	*addr &= ~mask;
+	return ret;
+}
+
+static inline void f2fs_change_bit(unsigned int nr, char *addr)
+{
+	int mask;
+
+	addr += (nr >> 3);
+	mask = 1 << (7 - (nr & 0x07));
+	*addr ^= mask;
+}
+
+/* used for f2fs_inode_info->flags */
+enum {
+	FI_NEW_INODE,		/* indicate newly allocated inode */
+	FI_DIRTY_INODE,		/* indicate inode is dirty or not */
+	FI_DIRTY_DIR,		/* indicate directory has dirty pages */
+	FI_INC_LINK,		/* need to increment i_nlink */
+	FI_ACL_MODE,		/* indicate acl mode */
+	FI_NO_ALLOC,		/* should not allocate any blocks */
+	FI_UPDATE_DIR,		/* should update inode block for consistency */
+	FI_DELAY_IPUT,		/* used for the recovery */
+	FI_NO_EXTENT,		/* not to use the extent cache */
+	FI_INLINE_XATTR,	/* used for inline xattr */
+	FI_INLINE_DATA,		/* used for inline data*/
+	FI_INLINE_DENTRY,	/* used for inline dentry */
+	FI_APPEND_WRITE,	/* inode has appended data */
+	FI_UPDATE_WRITE,	/* inode has in-place-update data */
+	FI_NEED_IPU,		/* used for ipu per file */
+	FI_ATOMIC_FILE,		/* indicate atomic file */
+	FI_VOLATILE_FILE,	/* indicate volatile file */
+	FI_FIRST_BLOCK_WRITTEN,	/* indicate #0 data block was written */
+	FI_DROP_CACHE,		/* drop dirty page cache */
+	FI_DATA_EXIST,		/* indicate data exists */
+	FI_INLINE_DOTS,		/* indicate inline dot dentries */
+};
+
+static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+	if (!test_bit(flag, &fi->flags))
+		set_bit(flag, &fi->flags);
+}
+
+static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
+{
+	return test_bit(flag, &fi->flags);
+}
+
+static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+	if (test_bit(flag, &fi->flags))
+		clear_bit(flag, &fi->flags);
+}
+
+static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
+{
+	fi->i_acl_mode = mode;
+	set_inode_flag(fi, FI_ACL_MODE);
+}
+
+static inline void get_inline_info(struct f2fs_inode_info *fi,
+					struct f2fs_inode *ri)
+{
+	if (ri->i_inline & F2FS_INLINE_XATTR)
+		set_inode_flag(fi, FI_INLINE_XATTR);
+	if (ri->i_inline & F2FS_INLINE_DATA)
+		set_inode_flag(fi, FI_INLINE_DATA);
+	if (ri->i_inline & F2FS_INLINE_DENTRY)
+		set_inode_flag(fi, FI_INLINE_DENTRY);
+	if (ri->i_inline & F2FS_DATA_EXIST)
+		set_inode_flag(fi, FI_DATA_EXIST);
+	if (ri->i_inline & F2FS_INLINE_DOTS)
+		set_inode_flag(fi, FI_INLINE_DOTS);
+}
+
+static inline void set_raw_inline(struct f2fs_inode_info *fi,
+					struct f2fs_inode *ri)
+{
+	ri->i_inline = 0;
+
+	if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+		ri->i_inline |= F2FS_INLINE_XATTR;
+	if (is_inode_flag_set(fi, FI_INLINE_DATA))
+		ri->i_inline |= F2FS_INLINE_DATA;
+	if (is_inode_flag_set(fi, FI_INLINE_DENTRY))
+		ri->i_inline |= F2FS_INLINE_DENTRY;
+	if (is_inode_flag_set(fi, FI_DATA_EXIST))
+		ri->i_inline |= F2FS_DATA_EXIST;
+	if (is_inode_flag_set(fi, FI_INLINE_DOTS))
+		ri->i_inline |= F2FS_INLINE_DOTS;
+}
+
+static inline int f2fs_has_inline_xattr(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
+}
+
+static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
+{
+	if (f2fs_has_inline_xattr(&fi->vfs_inode))
+		return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
+	return DEF_ADDRS_PER_INODE;
+}
+
+static inline void *inline_xattr_addr(struct page *page)
+{
+	struct f2fs_inode *ri = F2FS_INODE(page);
+	return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
+					F2FS_INLINE_XATTR_ADDRS]);
+}
+
+static inline int inline_xattr_size(struct inode *inode)
+{
+	if (f2fs_has_inline_xattr(inode))
+		return F2FS_INLINE_XATTR_ADDRS << 2;
+	else
+		return 0;
+}
+
+static inline int f2fs_has_inline_data(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
+}
+
+static inline void f2fs_clear_inline_inode(struct inode *inode)
+{
+	clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+	clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+}
+
+static inline int f2fs_exist_data(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST);
+}
+
+static inline int f2fs_has_inline_dots(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS);
+}
+
+static inline bool f2fs_is_atomic_file(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+}
+
+static inline bool f2fs_is_volatile_file(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+}
+
+static inline bool f2fs_is_first_block_written(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+}
+
+static inline bool f2fs_is_drop_cache(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
+}
+
+static inline void *inline_data_addr(struct page *page)
+{
+	struct f2fs_inode *ri = F2FS_INODE(page);
+	return (void *)&(ri->i_addr[1]);
+}
+
+static inline int f2fs_has_inline_dentry(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY);
+}
+
+static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
+{
+	if (!f2fs_has_inline_dentry(dir))
+		kunmap(page);
+}
+
+static inline int f2fs_readonly(struct super_block *sb)
+{
+	return sb->s_flags & MS_RDONLY;
+}
+
+static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
+{
+	return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+}
+
+static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
+{
+	set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+	sbi->sb->s_flags |= MS_RDONLY;
+}
+
+#define get_inode_mode(i) \
+	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
+	 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
+
+/* get offset of first page in next direct node */
+#define PGOFS_OF_NEXT_DNODE(pgofs, fi)				\
+	((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) :	\
+	(pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) /	\
+	ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi))
+
+/*
+ * file.c
+ */
+int f2fs_sync_file(struct file *, loff_t, loff_t, int);
+void truncate_data_blocks(struct dnode_of_data *);
+int truncate_blocks(struct inode *, u64, bool);
+void f2fs_truncate(struct inode *);
+int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+int f2fs_setattr(struct dentry *, struct iattr *);
+int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+int truncate_data_blocks_range(struct dnode_of_data *, int);
+long f2fs_ioctl(struct file *, unsigned int, unsigned long);
+long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/*
+ * inode.c
+ */
+void f2fs_set_inode_flags(struct inode *);
+struct inode *f2fs_iget(struct super_block *, unsigned long);
+int try_to_free_nats(struct f2fs_sb_info *, int);
+void update_inode(struct inode *, struct page *);
+void update_inode_page(struct inode *);
+int f2fs_write_inode(struct inode *, struct writeback_control *);
+void f2fs_evict_inode(struct inode *);
+void handle_failed_inode(struct inode *);
+
+/*
+ * namei.c
+ */
+struct dentry *f2fs_get_parent(struct dentry *child);
+
+/*
+ * dir.c
+ */
+extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
+void set_de_type(struct f2fs_dir_entry *, umode_t);
+struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *,
+			struct f2fs_dentry_ptr *);
+bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
+			unsigned int);
+void do_make_empty_dir(struct inode *, struct inode *,
+			struct f2fs_dentry_ptr *);
+struct page *init_inode_metadata(struct inode *, struct inode *,
+			const struct qstr *, struct page *);
+void update_parent_metadata(struct inode *, struct inode *, unsigned int);
+int room_for_filename(const void *, int, int);
+void f2fs_drop_nlink(struct inode *, struct inode *, struct page *);
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
+							struct page **);
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
+ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
+void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
+				struct page *, struct inode *);
+int update_dent_inode(struct inode *, const struct qstr *);
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
+			const struct qstr *, f2fs_hash_t , unsigned int);
+int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
+			umode_t);
+void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
+							struct inode *);
+int f2fs_do_tmpfile(struct inode *, struct inode *);
+int f2fs_make_empty(struct inode *, struct inode *);
+bool f2fs_empty_dir(struct inode *);
+
+static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name,
+				inode, inode->i_ino, inode->i_mode);
+}
+
+/*
+ * super.c
+ */
+int f2fs_sync_fs(struct super_block *, int);
+extern __printf(3, 4)
+void f2fs_msg(struct super_block *, const char *, const char *, ...);
+
+/*
+ * hash.c
+ */
+f2fs_hash_t f2fs_dentry_hash(const struct qstr *);
+
+/*
+ * node.c
+ */
+struct dnode_of_data;
+struct node_info;
+
+bool available_free_memory(struct f2fs_sb_info *, int);
+bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
+bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
+void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
+int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
+int truncate_inode_blocks(struct inode *, pgoff_t);
+int truncate_xattr_node(struct inode *, struct page *);
+int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
+void remove_inode_page(struct inode *);
+struct page *new_inode_page(struct inode *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
+void ra_node_page(struct f2fs_sb_info *, nid_t);
+struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_node_page_ra(struct page *, int);
+void sync_inode_page(struct dnode_of_data *);
+int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
+bool alloc_nid(struct f2fs_sb_info *, nid_t *);
+void alloc_nid_done(struct f2fs_sb_info *, nid_t);
+void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
+void recover_inline_xattr(struct inode *, struct page *);
+void recover_xattr_data(struct inode *, struct page *, block_t);
+int recover_inode_page(struct f2fs_sb_info *, struct page *);
+int restore_node_summary(struct f2fs_sb_info *, unsigned int,
+				struct f2fs_summary_block *);
+void flush_nat_entries(struct f2fs_sb_info *);
+int build_node_manager(struct f2fs_sb_info *);
+void destroy_node_manager(struct f2fs_sb_info *);
+int __init create_node_manager_caches(void);
+void destroy_node_manager_caches(void);
+
+/*
+ * segment.c
+ */
+void register_inmem_page(struct inode *, struct page *);
+void commit_inmem_pages(struct inode *, bool);
+void f2fs_balance_fs(struct f2fs_sb_info *);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *);
+int f2fs_issue_flush(struct f2fs_sb_info *);
+int create_flush_cmd_control(struct f2fs_sb_info *);
+void destroy_flush_cmd_control(struct f2fs_sb_info *);
+void invalidate_blocks(struct f2fs_sb_info *, block_t);
+void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
+void clear_prefree_segments(struct f2fs_sb_info *);
+void release_discard_addrs(struct f2fs_sb_info *);
+void discard_next_dnode(struct f2fs_sb_info *, block_t);
+int npages_for_summary_flush(struct f2fs_sb_info *, bool);
+void allocate_new_segments(struct f2fs_sb_info *);
+int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
+struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
+void write_meta_page(struct f2fs_sb_info *, struct page *);
+void write_node_page(struct f2fs_sb_info *, struct page *,
+				unsigned int, struct f2fs_io_info *);
+void write_data_page(struct page *, struct dnode_of_data *,
+			struct f2fs_io_info *);
+void rewrite_data_page(struct page *, struct f2fs_io_info *);
+void recover_data_page(struct f2fs_sb_info *, struct page *,
+				struct f2fs_summary *, block_t, block_t);
+void allocate_data_block(struct f2fs_sb_info *, struct page *,
+		block_t, block_t *, struct f2fs_summary *, int);
+void f2fs_wait_on_page_writeback(struct page *, enum page_type);
+void write_data_summaries(struct f2fs_sb_info *, block_t);
+void write_node_summaries(struct f2fs_sb_info *, block_t);
+int lookup_journal_in_cursum(struct f2fs_summary_block *,
+					int, unsigned int, int);
+void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
+int build_segment_manager(struct f2fs_sb_info *);
+void destroy_segment_manager(struct f2fs_sb_info *);
+int __init create_segment_manager_caches(void);
+void destroy_segment_manager_caches(void);
+
+/*
+ * checkpoint.c
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
+void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
+long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
+void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
+void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
+void release_dirty_inode(struct f2fs_sb_info *);
+bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
+int acquire_orphan_inode(struct f2fs_sb_info *);
+void release_orphan_inode(struct f2fs_sb_info *);
+void add_orphan_inode(struct f2fs_sb_info *, nid_t);
+void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
+void recover_orphan_inodes(struct f2fs_sb_info *);
+int get_valid_checkpoint(struct f2fs_sb_info *);
+void update_dirty_page(struct inode *, struct page *);
+void add_dirty_dir_inode(struct inode *);
+void remove_dirty_dir_inode(struct inode *);
+void sync_dirty_dir_inodes(struct f2fs_sb_info *);
+void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
+void init_ino_entry_info(struct f2fs_sb_info *);
+int __init create_checkpoint_caches(void);
+void destroy_checkpoint_caches(void);
+
+/*
+ * data.c
+ */
+void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
+int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
+						struct f2fs_io_info *);
+void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
+						struct f2fs_io_info *);
+void set_data_blkaddr(struct dnode_of_data *);
+int reserve_new_block(struct dnode_of_data *);
+int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
+void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
+void f2fs_destroy_extent_tree(struct inode *);
+void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *);
+void f2fs_update_extent_cache(struct dnode_of_data *);
+void f2fs_preserve_extent_tree(struct inode *);
+struct page *find_data_page(struct inode *, pgoff_t, bool);
+struct page *get_lock_data_page(struct inode *, pgoff_t);
+struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
+int do_write_data_page(struct page *, struct f2fs_io_info *);
+int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void init_extent_cache_info(struct f2fs_sb_info *);
+int __init create_extent_cache(void);
+void destroy_extent_cache(void);
+void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
+int f2fs_release_page(struct page *, gfp_t);
+
+/*
+ * gc.c
+ */
+int start_gc_thread(struct f2fs_sb_info *);
+void stop_gc_thread(struct f2fs_sb_info *);
+block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
+int f2fs_gc(struct f2fs_sb_info *);
+void build_gc_manager(struct f2fs_sb_info *);
+
+/*
+ * recovery.c
+ */
+int recover_fsync_data(struct f2fs_sb_info *);
+bool space_for_roll_forward(struct f2fs_sb_info *);
+
+/*
+ * debug.c
+ */
+#ifdef CONFIG_F2FS_STAT_FS
+struct f2fs_stat_info {
+	struct list_head stat_list;
+	struct f2fs_sb_info *sbi;
+	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
+	int main_area_segs, main_area_sections, main_area_zones;
+	int hit_ext, total_ext, ext_tree, ext_node;
+	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
+	int nats, dirty_nats, sits, dirty_sits, fnids;
+	int total_count, utilization;
+	int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages;
+	unsigned int valid_count, valid_node_count, valid_inode_count;
+	unsigned int bimodal, avg_vblocks;
+	int util_free, util_valid, util_invalid;
+	int rsvd_segs, overp_segs;
+	int dirty_count, node_pages, meta_pages;
+	int prefree_count, call_count, cp_count;
+	int tot_segs, node_segs, data_segs, free_segs, free_secs;
+	int bg_node_segs, bg_data_segs;
+	int tot_blks, data_blks, node_blks;
+	int bg_data_blks, bg_node_blks;
+	int curseg[NR_CURSEG_TYPE];
+	int cursec[NR_CURSEG_TYPE];
+	int curzone[NR_CURSEG_TYPE];
+
+	unsigned int segment_count[2];
+	unsigned int block_count[2];
+	unsigned int inplace_count;
+	unsigned base_mem, cache_mem, page_mem;
+};
+
+static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
+{
+	return (struct f2fs_stat_info *)sbi->stat_info;
+}
+
+#define stat_inc_cp_count(si)		((si)->cp_count++)
+#define stat_inc_call_count(si)		((si)->call_count++)
+#define stat_inc_bggc_count(sbi)	((sbi)->bg_gc++)
+#define stat_inc_dirty_dir(sbi)		((sbi)->n_dirty_dirs++)
+#define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
+#define stat_inc_total_hit(sb)		((F2FS_SB(sb))->total_hit_ext++)
+#define stat_inc_read_hit(sb)		((F2FS_SB(sb))->read_hit_ext++)
+#define stat_inc_inline_inode(inode)					\
+	do {								\
+		if (f2fs_has_inline_data(inode))			\
+			(atomic_inc(&F2FS_I_SB(inode)->inline_inode));	\
+	} while (0)
+#define stat_dec_inline_inode(inode)					\
+	do {								\
+		if (f2fs_has_inline_data(inode))			\
+			(atomic_dec(&F2FS_I_SB(inode)->inline_inode));	\
+	} while (0)
+#define stat_inc_inline_dir(inode)					\
+	do {								\
+		if (f2fs_has_inline_dentry(inode))			\
+			(atomic_inc(&F2FS_I_SB(inode)->inline_dir));	\
+	} while (0)
+#define stat_dec_inline_dir(inode)					\
+	do {								\
+		if (f2fs_has_inline_dentry(inode))			\
+			(atomic_dec(&F2FS_I_SB(inode)->inline_dir));	\
+	} while (0)
+#define stat_inc_seg_type(sbi, curseg)					\
+		((sbi)->segment_count[(curseg)->alloc_type]++)
+#define stat_inc_block_count(sbi, curseg)				\
+		((sbi)->block_count[(curseg)->alloc_type]++)
+#define stat_inc_inplace_blocks(sbi)					\
+		(atomic_inc(&(sbi)->inplace_count))
+#define stat_inc_seg_count(sbi, type, gc_type)				\
+	do {								\
+		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
+		(si)->tot_segs++;					\
+		if (type == SUM_TYPE_DATA) {				\
+			si->data_segs++;				\
+			si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0;	\
+		} else {						\
+			si->node_segs++;				\
+			si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0;	\
+		}							\
+	} while (0)
+
+#define stat_inc_tot_blk_count(si, blks)				\
+	(si->tot_blks += (blks))
+
+#define stat_inc_data_blk_count(sbi, blks, gc_type)			\
+	do {								\
+		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
+		stat_inc_tot_blk_count(si, blks);			\
+		si->data_blks += (blks);				\
+		si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0;	\
+	} while (0)
+
+#define stat_inc_node_blk_count(sbi, blks, gc_type)			\
+	do {								\
+		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
+		stat_inc_tot_blk_count(si, blks);			\
+		si->node_blks += (blks);				\
+		si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0;	\
+	} while (0)
+
+int f2fs_build_stats(struct f2fs_sb_info *);
+void f2fs_destroy_stats(struct f2fs_sb_info *);
+void __init f2fs_create_root_stats(void);
+void f2fs_destroy_root_stats(void);
+#else
+#define stat_inc_cp_count(si)
+#define stat_inc_call_count(si)
+#define stat_inc_bggc_count(si)
+#define stat_inc_dirty_dir(sbi)
+#define stat_dec_dirty_dir(sbi)
+#define stat_inc_total_hit(sb)
+#define stat_inc_read_hit(sb)
+#define stat_inc_inline_inode(inode)
+#define stat_dec_inline_inode(inode)
+#define stat_inc_inline_dir(inode)
+#define stat_dec_inline_dir(inode)
+#define stat_inc_seg_type(sbi, curseg)
+#define stat_inc_block_count(sbi, curseg)
+#define stat_inc_inplace_blocks(sbi)
+#define stat_inc_seg_count(sbi, type, gc_type)
+#define stat_inc_tot_blk_count(si, blks)
+#define stat_inc_data_blk_count(sbi, blks, gc_type)
+#define stat_inc_node_blk_count(sbi, blks, gc_type)
+
+static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
+static inline void __init f2fs_create_root_stats(void) { }
+static inline void f2fs_destroy_root_stats(void) { }
+#endif
+
+extern const struct file_operations f2fs_dir_operations;
+extern const struct file_operations f2fs_file_operations;
+extern const struct inode_operations f2fs_file_inode_operations;
+extern const struct address_space_operations f2fs_dblock_aops;
+extern const struct address_space_operations f2fs_node_aops;
+extern const struct address_space_operations f2fs_meta_aops;
+extern const struct inode_operations f2fs_dir_inode_operations;
+extern const struct inode_operations f2fs_symlink_inode_operations;
+extern const struct inode_operations f2fs_special_inode_operations;
+extern struct kmem_cache *inode_entry_slab;
+
+/*
+ * inline.c
+ */
+bool f2fs_may_inline(struct inode *);
+void read_inline_data(struct page *, struct page *);
+bool truncate_inline_inode(struct page *, u64);
+int f2fs_read_inline_data(struct inode *, struct page *);
+int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
+int f2fs_convert_inline_inode(struct inode *);
+int f2fs_write_inline_data(struct inode *, struct page *);
+bool recover_inline_data(struct inode *, struct page *);
+struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
+							struct page **);
+struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
+int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
+int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
+						nid_t, umode_t);
+void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
+						struct inode *, struct inode *);
+bool f2fs_empty_inline_dir(struct inode *);
+int f2fs_read_inline_dir(struct file *, struct dir_context *);
+#endif
diff --git a/kernel/fs/f2fs/file.c b/kernel/fs/f2fs/file.c
new file mode 100644
index 000000000..2b52e48d7
--- /dev/null
+++ b/kernel/fs/f2fs/file.c
@@ -0,0 +1,1172 @@
+/*
+ * fs/f2fs/file.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/stat.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/falloc.h>
+#include <linux/types.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include <linux/mount.h>
+#include <linux/pagevec.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "xattr.h"
+#include "acl.h"
+#include "trace.h"
+#include <trace/events/f2fs.h>
+
+static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
+						struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct dnode_of_data dn;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	sb_start_pagefault(inode->i_sb);
+
+	f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
+
+	/* block allocation */
+	f2fs_lock_op(sbi);
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = f2fs_reserve_block(&dn, page->index);
+	if (err) {
+		f2fs_unlock_op(sbi);
+		goto out;
+	}
+	f2fs_put_dnode(&dn);
+	f2fs_unlock_op(sbi);
+
+	file_update_time(vma->vm_file);
+	lock_page(page);
+	if (unlikely(page->mapping != inode->i_mapping ||
+			page_offset(page) > i_size_read(inode) ||
+			!PageUptodate(page))) {
+		unlock_page(page);
+		err = -EFAULT;
+		goto out;
+	}
+
+	/*
+	 * check to see if the page is mapped already (no holes)
+	 */
+	if (PageMappedToDisk(page))
+		goto mapped;
+
+	/* page is wholly or partially inside EOF */
+	if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
+		unsigned offset;
+		offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
+		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	}
+	set_page_dirty(page);
+	SetPageUptodate(page);
+
+	trace_f2fs_vm_page_mkwrite(page, DATA);
+mapped:
+	/* fill the page */
+	f2fs_wait_on_page_writeback(page, DATA);
+out:
+	sb_end_pagefault(inode->i_sb);
+	return block_page_mkwrite_return(err);
+}
+
+static const struct vm_operations_struct f2fs_file_vm_ops = {
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= f2fs_vm_page_mkwrite,
+};
+
+static int get_parent_ino(struct inode *inode, nid_t *pino)
+{
+	struct dentry *dentry;
+
+	inode = igrab(inode);
+	dentry = d_find_any_alias(inode);
+	iput(inode);
+	if (!dentry)
+		return 0;
+
+	if (update_dent_inode(inode, &dentry->d_name)) {
+		dput(dentry);
+		return 0;
+	}
+
+	*pino = parent_ino(dentry);
+	dput(dentry);
+	return 1;
+}
+
+static inline bool need_do_checkpoint(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	bool need_cp = false;
+
+	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
+		need_cp = true;
+	else if (file_wrong_pino(inode))
+		need_cp = true;
+	else if (!space_for_roll_forward(sbi))
+		need_cp = true;
+	else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
+		need_cp = true;
+	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
+		need_cp = true;
+	else if (test_opt(sbi, FASTBOOT))
+		need_cp = true;
+	else if (sbi->active_logs == 2)
+		need_cp = true;
+
+	return need_cp;
+}
+
+static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
+	bool ret = false;
+	/* But we need to avoid that there are some inode updates */
+	if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino))
+		ret = true;
+	f2fs_put_page(i, 0);
+	return ret;
+}
+
+static void try_to_fix_pino(struct inode *inode)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	nid_t pino;
+
+	down_write(&fi->i_sem);
+	fi->xattr_ver = 0;
+	if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+			get_parent_ino(inode, &pino)) {
+		fi->i_pino = pino;
+		file_got_pino(inode);
+		up_write(&fi->i_sem);
+
+		mark_inode_dirty_sync(inode);
+		f2fs_write_inode(inode, NULL);
+	} else {
+		up_write(&fi->i_sem);
+	}
+}
+
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	nid_t ino = inode->i_ino;
+	int ret = 0;
+	bool need_cp = false;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.for_reclaim = 0,
+	};
+
+	if (unlikely(f2fs_readonly(inode->i_sb)))
+		return 0;
+
+	trace_f2fs_sync_file_enter(inode);
+
+	/* if fdatasync is triggered, let's do in-place-update */
+	if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
+		set_inode_flag(fi, FI_NEED_IPU);
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	clear_inode_flag(fi, FI_NEED_IPU);
+
+	if (ret) {
+		trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+		return ret;
+	}
+
+	/* if the inode is dirty, let's recover all the time */
+	if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) {
+		update_inode_page(inode);
+		goto go_write;
+	}
+
+	/*
+	 * if there is no written data, don't waste time to write recovery info.
+	 */
+	if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
+			!exist_written_data(sbi, ino, APPEND_INO)) {
+
+		/* it may call write_inode just prior to fsync */
+		if (need_inode_page_update(sbi, ino))
+			goto go_write;
+
+		if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
+				exist_written_data(sbi, ino, UPDATE_INO))
+			goto flush_out;
+		goto out;
+	}
+go_write:
+	/* guarantee free sections for fsync */
+	f2fs_balance_fs(sbi);
+
+	/*
+	 * Both of fdatasync() and fsync() are able to be recovered from
+	 * sudden-power-off.
+	 */
+	down_read(&fi->i_sem);
+	need_cp = need_do_checkpoint(inode);
+	up_read(&fi->i_sem);
+
+	if (need_cp) {
+		/* all the dirty node pages should be flushed for POR */
+		ret = f2fs_sync_fs(inode->i_sb, 1);
+
+		/*
+		 * We've secured consistency through sync_fs. Following pino
+		 * will be used only for fsynced inodes after checkpoint.
+		 */
+		try_to_fix_pino(inode);
+		clear_inode_flag(fi, FI_APPEND_WRITE);
+		clear_inode_flag(fi, FI_UPDATE_WRITE);
+		goto out;
+	}
+sync_nodes:
+	sync_node_pages(sbi, ino, &wbc);
+
+	/* if cp_error was enabled, we should avoid infinite loop */
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto out;
+
+	if (need_inode_block_update(sbi, ino)) {
+		mark_inode_dirty_sync(inode);
+		f2fs_write_inode(inode, NULL);
+		goto sync_nodes;
+	}
+
+	ret = wait_on_node_pages_writeback(sbi, ino);
+	if (ret)
+		goto out;
+
+	/* once recovery info is written, don't need to tack this */
+	remove_dirty_inode(sbi, ino, APPEND_INO);
+	clear_inode_flag(fi, FI_APPEND_WRITE);
+flush_out:
+	remove_dirty_inode(sbi, ino, UPDATE_INO);
+	clear_inode_flag(fi, FI_UPDATE_WRITE);
+	ret = f2fs_issue_flush(sbi);
+out:
+	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+	f2fs_trace_ios(NULL, NULL, 1);
+	return ret;
+}
+
+static pgoff_t __get_first_dirty_index(struct address_space *mapping,
+						pgoff_t pgofs, int whence)
+{
+	struct pagevec pvec;
+	int nr_pages;
+
+	if (whence != SEEK_DATA)
+		return 0;
+
+	/* find first dirty page index */
+	pagevec_init(&pvec, 0);
+	nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
+					PAGECACHE_TAG_DIRTY, 1);
+	pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
+	pagevec_release(&pvec);
+	return pgofs;
+}
+
+static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs,
+							int whence)
+{
+	switch (whence) {
+	case SEEK_DATA:
+		if ((blkaddr == NEW_ADDR && dirty == pgofs) ||
+			(blkaddr != NEW_ADDR && blkaddr != NULL_ADDR))
+			return true;
+		break;
+	case SEEK_HOLE:
+		if (blkaddr == NULL_ADDR)
+			return true;
+		break;
+	}
+	return false;
+}
+
+static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	loff_t maxbytes = inode->i_sb->s_maxbytes;
+	struct dnode_of_data dn;
+	pgoff_t pgofs, end_offset, dirty;
+	loff_t data_ofs = offset;
+	loff_t isize;
+	int err = 0;
+
+	mutex_lock(&inode->i_mutex);
+
+	isize = i_size_read(inode);
+	if (offset >= isize)
+		goto fail;
+
+	/* handle inline data case */
+	if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) {
+		if (whence == SEEK_HOLE)
+			data_ofs = isize;
+		goto found;
+	}
+
+	pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);
+
+	dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
+
+	for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+		if (err && err != -ENOENT) {
+			goto fail;
+		} else if (err == -ENOENT) {
+			/* direct node does not exists */
+			if (whence == SEEK_DATA) {
+				pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
+							F2FS_I(inode));
+				continue;
+			} else {
+				goto found;
+			}
+		}
+
+		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+
+		/* find data/hole in dnode block */
+		for (; dn.ofs_in_node < end_offset;
+				dn.ofs_in_node++, pgofs++,
+				data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
+			block_t blkaddr;
+			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+
+			if (__found_offset(blkaddr, dirty, pgofs, whence)) {
+				f2fs_put_dnode(&dn);
+				goto found;
+			}
+		}
+		f2fs_put_dnode(&dn);
+	}
+
+	if (whence == SEEK_DATA)
+		goto fail;
+found:
+	if (whence == SEEK_HOLE && data_ofs > isize)
+		data_ofs = isize;
+	mutex_unlock(&inode->i_mutex);
+	return vfs_setpos(file, data_ofs, maxbytes);
+fail:
+	mutex_unlock(&inode->i_mutex);
+	return -ENXIO;
+}
+
+static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	loff_t maxbytes = inode->i_sb->s_maxbytes;
+
+	switch (whence) {
+	case SEEK_SET:
+	case SEEK_CUR:
+	case SEEK_END:
+		return generic_file_llseek_size(file, offset, whence,
+						maxbytes, i_size_read(inode));
+	case SEEK_DATA:
+	case SEEK_HOLE:
+		if (offset < 0)
+			return -ENXIO;
+		return f2fs_seek_block(file, offset, whence);
+	}
+
+	return -EINVAL;
+}
+
+static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *inode = file_inode(file);
+
+	/* we don't need to use inline_data strictly */
+	if (f2fs_has_inline_data(inode)) {
+		int err = f2fs_convert_inline_inode(inode);
+		if (err)
+			return err;
+	}
+
+	file_accessed(file);
+	vma->vm_ops = &f2fs_file_vm_ops;
+	return 0;
+}
+
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+{
+	int nr_free = 0, ofs = dn->ofs_in_node;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct f2fs_node *raw_node;
+	__le32 *addr;
+
+	raw_node = F2FS_NODE(dn->node_page);
+	addr = blkaddr_in_node(raw_node) + ofs;
+
+	for (; count > 0; count--, addr++, dn->ofs_in_node++) {
+		block_t blkaddr = le32_to_cpu(*addr);
+		if (blkaddr == NULL_ADDR)
+			continue;
+
+		dn->data_blkaddr = NULL_ADDR;
+		set_data_blkaddr(dn);
+		f2fs_update_extent_cache(dn);
+		invalidate_blocks(sbi, blkaddr);
+		if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
+			clear_inode_flag(F2FS_I(dn->inode),
+						FI_FIRST_BLOCK_WRITTEN);
+		nr_free++;
+	}
+	if (nr_free) {
+		dec_valid_block_count(sbi, dn->inode, nr_free);
+		set_page_dirty(dn->node_page);
+		sync_inode_page(dn);
+	}
+	dn->ofs_in_node = ofs;
+
+	trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
+					 dn->ofs_in_node, nr_free);
+	return nr_free;
+}
+
+void truncate_data_blocks(struct dnode_of_data *dn)
+{
+	truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
+}
+
+static int truncate_partial_data_page(struct inode *inode, u64 from,
+								bool force)
+{
+	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+	struct page *page;
+
+	if (!offset && !force)
+		return 0;
+
+	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, force);
+	if (IS_ERR(page))
+		return 0;
+
+	lock_page(page);
+	if (unlikely(!PageUptodate(page) ||
+			page->mapping != inode->i_mapping))
+		goto out;
+
+	f2fs_wait_on_page_writeback(page, DATA);
+	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
+	if (!force)
+		set_page_dirty(page);
+out:
+	f2fs_put_page(page, 1);
+	return 0;
+}
+
+int truncate_blocks(struct inode *inode, u64 from, bool lock)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	unsigned int blocksize = inode->i_sb->s_blocksize;
+	struct dnode_of_data dn;
+	pgoff_t free_from;
+	int count = 0, err = 0;
+	struct page *ipage;
+	bool truncate_page = false;
+
+	trace_f2fs_truncate_blocks_enter(inode, from);
+
+	free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
+
+	if (lock)
+		f2fs_lock_op(sbi);
+
+	ipage = get_node_page(sbi, inode->i_ino);
+	if (IS_ERR(ipage)) {
+		err = PTR_ERR(ipage);
+		goto out;
+	}
+
+	if (f2fs_has_inline_data(inode)) {
+		if (truncate_inline_inode(ipage, from))
+			set_page_dirty(ipage);
+		f2fs_put_page(ipage, 1);
+		truncate_page = true;
+		goto out;
+	}
+
+	set_new_dnode(&dn, inode, ipage, NULL, 0);
+	err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
+	if (err) {
+		if (err == -ENOENT)
+			goto free_next;
+		goto out;
+	}
+
+	count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+
+	count -= dn.ofs_in_node;
+	f2fs_bug_on(sbi, count < 0);
+
+	if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
+		truncate_data_blocks_range(&dn, count);
+		free_from += count;
+	}
+
+	f2fs_put_dnode(&dn);
+free_next:
+	err = truncate_inode_blocks(inode, free_from);
+out:
+	if (lock)
+		f2fs_unlock_op(sbi);
+
+	/* lastly zero out the first data page */
+	if (!err)
+		err = truncate_partial_data_page(inode, from, truncate_page);
+
+	trace_f2fs_truncate_blocks_exit(inode, err);
+	return err;
+}
+
+void f2fs_truncate(struct inode *inode)
+{
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+				S_ISLNK(inode->i_mode)))
+		return;
+
+	trace_f2fs_truncate(inode);
+
+	/* we should check inline_data size */
+	if (f2fs_has_inline_data(inode) && !f2fs_may_inline(inode)) {
+		if (f2fs_convert_inline_inode(inode))
+			return;
+	}
+
+	if (!truncate_blocks(inode, i_size_read(inode), true)) {
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty(inode);
+	}
+}
+
+int f2fs_getattr(struct vfsmount *mnt,
+			 struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	generic_fillattr(inode, stat);
+	stat->blocks <<= 3;
+	return 0;
+}
+
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+static void __setattr_copy(struct inode *inode, const struct iattr *attr)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	unsigned int ia_valid = attr->ia_valid;
+
+	if (ia_valid & ATTR_UID)
+		inode->i_uid = attr->ia_uid;
+	if (ia_valid & ATTR_GID)
+		inode->i_gid = attr->ia_gid;
+	if (ia_valid & ATTR_ATIME)
+		inode->i_atime = timespec_trunc(attr->ia_atime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MTIME)
+		inode->i_mtime = timespec_trunc(attr->ia_mtime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_CTIME)
+		inode->i_ctime = timespec_trunc(attr->ia_ctime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = attr->ia_mode;
+
+		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+			mode &= ~S_ISGID;
+		set_acl_inode(fi, mode);
+	}
+}
+#else
+#define __setattr_copy setattr_copy
+#endif
+
+int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	int err;
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (attr->ia_size != i_size_read(inode)) {
+			truncate_setsize(inode, attr->ia_size);
+			f2fs_truncate(inode);
+			f2fs_balance_fs(F2FS_I_SB(inode));
+		} else {
+			/*
+			 * giving a chance to truncate blocks past EOF which
+			 * are fallocated with FALLOC_FL_KEEP_SIZE.
+			 */
+			f2fs_truncate(inode);
+		}
+	}
+
+	__setattr_copy(inode, attr);
+
+	if (attr->ia_valid & ATTR_MODE) {
+		err = posix_acl_chmod(inode, get_inode_mode(inode));
+		if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
+			inode->i_mode = fi->i_acl_mode;
+			clear_inode_flag(fi, FI_ACL_MODE);
+		}
+	}
+
+	mark_inode_dirty(inode);
+	return err;
+}
+
+const struct inode_operations f2fs_file_inode_operations = {
+	.getattr	= f2fs_getattr,
+	.setattr	= f2fs_setattr,
+	.get_acl	= f2fs_get_acl,
+	.set_acl	= f2fs_set_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= f2fs_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.fiemap		= f2fs_fiemap,
+};
+
+static void fill_zero(struct inode *inode, pgoff_t index,
+					loff_t start, loff_t len)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct page *page;
+
+	if (!len)
+		return;
+
+	f2fs_balance_fs(sbi);
+
+	f2fs_lock_op(sbi);
+	page = get_new_data_page(inode, NULL, index, false);
+	f2fs_unlock_op(sbi);
+
+	if (!IS_ERR(page)) {
+		f2fs_wait_on_page_writeback(page, DATA);
+		zero_user(page, start, len);
+		set_page_dirty(page);
+		f2fs_put_page(page, 1);
+	}
+}
+
+int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
+{
+	pgoff_t index;
+	int err;
+
+	for (index = pg_start; index < pg_end; index++) {
+		struct dnode_of_data dn;
+
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+		if (err) {
+			if (err == -ENOENT)
+				continue;
+			return err;
+		}
+
+		if (dn.data_blkaddr != NULL_ADDR)
+			truncate_data_blocks_range(&dn, 1);
+		f2fs_put_dnode(&dn);
+	}
+	return 0;
+}
+
+static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	pgoff_t pg_start, pg_end;
+	loff_t off_start, off_end;
+	int ret = 0;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	/* skip punching hole beyond i_size */
+	if (offset >= inode->i_size)
+		return ret;
+
+	if (f2fs_has_inline_data(inode)) {
+		ret = f2fs_convert_inline_inode(inode);
+		if (ret)
+			return ret;
+	}
+
+	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+
+	off_start = offset & (PAGE_CACHE_SIZE - 1);
+	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+
+	if (pg_start == pg_end) {
+		fill_zero(inode, pg_start, off_start,
+						off_end - off_start);
+	} else {
+		if (off_start)
+			fill_zero(inode, pg_start++, off_start,
+					PAGE_CACHE_SIZE - off_start);
+		if (off_end)
+			fill_zero(inode, pg_end, 0, off_end);
+
+		if (pg_start < pg_end) {
+			struct address_space *mapping = inode->i_mapping;
+			loff_t blk_start, blk_end;
+			struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+			f2fs_balance_fs(sbi);
+
+			blk_start = pg_start << PAGE_CACHE_SHIFT;
+			blk_end = pg_end << PAGE_CACHE_SHIFT;
+			truncate_inode_pages_range(mapping, blk_start,
+					blk_end - 1);
+
+			f2fs_lock_op(sbi);
+			ret = truncate_hole(inode, pg_start, pg_end);
+			f2fs_unlock_op(sbi);
+		}
+	}
+
+	return ret;
+}
+
+static int expand_inode_data(struct inode *inode, loff_t offset,
+					loff_t len, int mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	pgoff_t index, pg_start, pg_end;
+	loff_t new_size = i_size_read(inode);
+	loff_t off_start, off_end;
+	int ret = 0;
+
+	f2fs_balance_fs(sbi);
+
+	ret = inode_newsize_ok(inode, (len + offset));
+	if (ret)
+		return ret;
+
+	if (f2fs_has_inline_data(inode)) {
+		ret = f2fs_convert_inline_inode(inode);
+		if (ret)
+			return ret;
+	}
+
+	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+
+	off_start = offset & (PAGE_CACHE_SIZE - 1);
+	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+
+	f2fs_lock_op(sbi);
+
+	for (index = pg_start; index <= pg_end; index++) {
+		struct dnode_of_data dn;
+
+		if (index == pg_end && !off_end)
+			goto noalloc;
+
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		ret = f2fs_reserve_block(&dn, index);
+		if (ret)
+			break;
+noalloc:
+		if (pg_start == pg_end)
+			new_size = offset + len;
+		else if (index == pg_start && off_start)
+			new_size = (index + 1) << PAGE_CACHE_SHIFT;
+		else if (index == pg_end)
+			new_size = (index << PAGE_CACHE_SHIFT) + off_end;
+		else
+			new_size += PAGE_CACHE_SIZE;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		i_size_read(inode) < new_size) {
+		i_size_write(inode, new_size);
+		mark_inode_dirty(inode);
+		update_inode_page(inode);
+	}
+	f2fs_unlock_op(sbi);
+
+	return ret;
+}
+
+static long f2fs_fallocate(struct file *file, int mode,
+				loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	long ret;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		ret = punch_hole(inode, offset, len);
+	else
+		ret = expand_inode_data(inode, offset, len, mode);
+
+	if (!ret) {
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty(inode);
+	}
+
+	mutex_unlock(&inode->i_mutex);
+
+	trace_f2fs_fallocate(inode, mode, offset, len, ret);
+	return ret;
+}
+
+static int f2fs_release_file(struct inode *inode, struct file *filp)
+{
+	/* some remained atomic pages should discarded */
+	if (f2fs_is_atomic_file(inode))
+		commit_inmem_pages(inode, true);
+	if (f2fs_is_volatile_file(inode)) {
+		set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+		filemap_fdatawrite(inode->i_mapping);
+		clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+	}
+	return 0;
+}
+
+#define F2FS_REG_FLMASK		(~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+#define F2FS_OTHER_FLMASK	(FS_NODUMP_FL | FS_NOATIME_FL)
+
+static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & F2FS_REG_FLMASK;
+	else
+		return flags & F2FS_OTHER_FLMASK;
+}
+
+static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+	return put_user(flags, (int __user *)arg);
+}
+
+static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+	unsigned int oldflags;
+	int ret;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	if (!inode_owner_or_capable(inode)) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	if (get_user(flags, (int __user *)arg)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	flags = f2fs_mask_flags(inode->i_mode, flags);
+
+	mutex_lock(&inode->i_mutex);
+
+	oldflags = fi->i_flags;
+
+	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			mutex_unlock(&inode->i_mutex);
+			ret = -EPERM;
+			goto out;
+		}
+	}
+
+	flags = flags & FS_FL_USER_MODIFIABLE;
+	flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+	fi->i_flags = flags;
+	mutex_unlock(&inode->i_mutex);
+
+	f2fs_set_inode_flags(inode);
+	inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+out:
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
+static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+
+	return put_user(inode->i_generation, (int __user *)arg);
+}
+
+static int f2fs_ioc_start_atomic_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	f2fs_balance_fs(F2FS_I_SB(inode));
+
+	if (f2fs_is_atomic_file(inode))
+		return 0;
+
+	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+
+	return f2fs_convert_inline_inode(inode);
+}
+
+static int f2fs_ioc_commit_atomic_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	int ret;
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (f2fs_is_volatile_file(inode))
+		return 0;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	if (f2fs_is_atomic_file(inode))
+		commit_inmem_pages(inode, false);
+
+	ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
+	mnt_drop_write_file(filp);
+	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	return ret;
+}
+
+static int f2fs_ioc_start_volatile_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (f2fs_is_volatile_file(inode))
+		return 0;
+
+	set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+
+	return f2fs_convert_inline_inode(inode);
+}
+
+static int f2fs_ioc_release_volatile_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (!f2fs_is_volatile_file(inode))
+		return 0;
+
+	if (!f2fs_is_first_block_written(inode))
+		return truncate_partial_data_page(inode, 0, true);
+
+	punch_hole(inode, 0, F2FS_BLKSIZE);
+	return 0;
+}
+
+static int f2fs_ioc_abort_volatile_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	int ret;
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	f2fs_balance_fs(F2FS_I_SB(inode));
+
+	if (f2fs_is_atomic_file(inode)) {
+		commit_inmem_pages(inode, false);
+		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	}
+
+	if (f2fs_is_volatile_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+		filemap_fdatawrite(inode->i_mapping);
+		set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	}
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
+static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct super_block *sb = sbi->sb;
+	__u32 in;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(in, (__u32 __user *)arg))
+		return -EFAULT;
+
+	switch (in) {
+	case F2FS_GOING_DOWN_FULLSYNC:
+		sb = freeze_bdev(sb->s_bdev);
+		if (sb && !IS_ERR(sb)) {
+			f2fs_stop_checkpoint(sbi);
+			thaw_bdev(sb->s_bdev, sb);
+		}
+		break;
+	case F2FS_GOING_DOWN_METASYNC:
+		/* do checkpoint only */
+		f2fs_sync_fs(sb, 1);
+		f2fs_stop_checkpoint(sbi);
+		break;
+	case F2FS_GOING_DOWN_NOSYNC:
+		f2fs_stop_checkpoint(sbi);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct super_block *sb = inode->i_sb;
+	struct request_queue *q = bdev_get_queue(sb->s_bdev);
+	struct fstrim_range range;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+				sizeof(range)))
+		return -EFAULT;
+
+	range.minlen = max((unsigned int)range.minlen,
+				q->limits.discard_granularity);
+	ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user((struct fstrim_range __user *)arg, &range,
+				sizeof(range)))
+		return -EFAULT;
+	return 0;
+}
+
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case F2FS_IOC_GETFLAGS:
+		return f2fs_ioc_getflags(filp, arg);
+	case F2FS_IOC_SETFLAGS:
+		return f2fs_ioc_setflags(filp, arg);
+	case F2FS_IOC_GETVERSION:
+		return f2fs_ioc_getversion(filp, arg);
+	case F2FS_IOC_START_ATOMIC_WRITE:
+		return f2fs_ioc_start_atomic_write(filp);
+	case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+		return f2fs_ioc_commit_atomic_write(filp);
+	case F2FS_IOC_START_VOLATILE_WRITE:
+		return f2fs_ioc_start_volatile_write(filp);
+	case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+		return f2fs_ioc_release_volatile_write(filp);
+	case F2FS_IOC_ABORT_VOLATILE_WRITE:
+		return f2fs_ioc_abort_volatile_write(filp);
+	case F2FS_IOC_SHUTDOWN:
+		return f2fs_ioc_shutdown(filp, arg);
+	case FITRIM:
+		return f2fs_ioc_fitrim(filp, arg);
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case F2FS_IOC32_GETFLAGS:
+		cmd = F2FS_IOC_GETFLAGS;
+		break;
+	case F2FS_IOC32_SETFLAGS:
+		cmd = F2FS_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+const struct file_operations f2fs_file_operations = {
+	.llseek		= f2fs_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.open		= generic_file_open,
+	.release	= f2fs_release_file,
+	.mmap		= f2fs_file_mmap,
+	.fsync		= f2fs_sync_file,
+	.fallocate	= f2fs_fallocate,
+	.unlocked_ioctl	= f2fs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= f2fs_compat_ioctl,
+#endif
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+};
diff --git a/kernel/fs/f2fs/gc.c b/kernel/fs/f2fs/gc.c
new file mode 100644
index 000000000..ed58211fe
--- /dev/null
+++ b/kernel/fs/f2fs/gc.c
@@ -0,0 +1,746 @@
+/*
+ * fs/f2fs/gc.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/init.h>
+#include <linux/f2fs_fs.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/blkdev.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+#include <trace/events/f2fs.h>
+
+static int gc_thread_func(void *data)
+{
+	struct f2fs_sb_info *sbi = data;
+	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
+	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
+	long wait_ms;
+
+	wait_ms = gc_th->min_sleep_time;
+
+	do {
+		if (try_to_freeze())
+			continue;
+		else
+			wait_event_interruptible_timeout(*wq,
+						kthread_should_stop(),
+						msecs_to_jiffies(wait_ms));
+		if (kthread_should_stop())
+			break;
+
+		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
+			increase_sleep_time(gc_th, &wait_ms);
+			continue;
+		}
+
+		/*
+		 * [GC triggering condition]
+		 * 0. GC is not conducted currently.
+		 * 1. There are enough dirty segments.
+		 * 2. IO subsystem is idle by checking the # of writeback pages.
+		 * 3. IO subsystem is idle by checking the # of requests in
+		 *    bdev's request list.
+		 *
+		 * Note) We have to avoid triggering GCs frequently.
+		 * Because it is possible that some segments can be
+		 * invalidated soon after by user update or deletion.
+		 * So, I'd like to wait some time to collect dirty segments.
+		 */
+		if (!mutex_trylock(&sbi->gc_mutex))
+			continue;
+
+		if (!is_idle(sbi)) {
+			increase_sleep_time(gc_th, &wait_ms);
+			mutex_unlock(&sbi->gc_mutex);
+			continue;
+		}
+
+		if (has_enough_invalid_blocks(sbi))
+			decrease_sleep_time(gc_th, &wait_ms);
+		else
+			increase_sleep_time(gc_th, &wait_ms);
+
+		stat_inc_bggc_count(sbi);
+
+		/* if return value is not zero, no victim was selected */
+		if (f2fs_gc(sbi))
+			wait_ms = gc_th->no_gc_sleep_time;
+
+		/* balancing f2fs's metadata periodically */
+		f2fs_balance_fs_bg(sbi);
+
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+int start_gc_thread(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_gc_kthread *gc_th;
+	dev_t dev = sbi->sb->s_bdev->bd_dev;
+	int err = 0;
+
+	gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
+	if (!gc_th) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
+	gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
+	gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
+
+	gc_th->gc_idle = 0;
+
+	sbi->gc_thread = gc_th;
+	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
+	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
+			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
+	if (IS_ERR(gc_th->f2fs_gc_task)) {
+		err = PTR_ERR(gc_th->f2fs_gc_task);
+		kfree(gc_th);
+		sbi->gc_thread = NULL;
+	}
+out:
+	return err;
+}
+
+void stop_gc_thread(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
+	if (!gc_th)
+		return;
+	kthread_stop(gc_th->f2fs_gc_task);
+	kfree(gc_th);
+	sbi->gc_thread = NULL;
+}
+
+static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type)
+{
+	int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
+
+	if (gc_th && gc_th->gc_idle) {
+		if (gc_th->gc_idle == 1)
+			gc_mode = GC_CB;
+		else if (gc_th->gc_idle == 2)
+			gc_mode = GC_GREEDY;
+	}
+	return gc_mode;
+}
+
+static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
+			int type, struct victim_sel_policy *p)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	if (p->alloc_mode == SSR) {
+		p->gc_mode = GC_GREEDY;
+		p->dirty_segmap = dirty_i->dirty_segmap[type];
+		p->max_search = dirty_i->nr_dirty[type];
+		p->ofs_unit = 1;
+	} else {
+		p->gc_mode = select_gc_type(sbi->gc_thread, gc_type);
+		p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
+		p->max_search = dirty_i->nr_dirty[DIRTY];
+		p->ofs_unit = sbi->segs_per_sec;
+	}
+
+	if (p->max_search > sbi->max_victim_search)
+		p->max_search = sbi->max_victim_search;
+
+	p->offset = sbi->last_victim[p->gc_mode];
+}
+
+static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
+				struct victim_sel_policy *p)
+{
+	/* SSR allocates in a segment unit */
+	if (p->alloc_mode == SSR)
+		return 1 << sbi->log_blocks_per_seg;
+	if (p->gc_mode == GC_GREEDY)
+		return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+	else if (p->gc_mode == GC_CB)
+		return UINT_MAX;
+	else /* No other gc_mode */
+		return 0;
+}
+
+static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned int secno;
+
+	/*
+	 * If the gc_type is FG_GC, we can select victim segments
+	 * selected by background GC before.
+	 * Those segments guarantee they have small valid blocks.
+	 */
+	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
+		if (sec_usage_check(sbi, secno))
+			continue;
+		clear_bit(secno, dirty_i->victim_secmap);
+		return secno * sbi->segs_per_sec;
+	}
+	return NULL_SEGNO;
+}
+
+static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int secno = GET_SECNO(sbi, segno);
+	unsigned int start = secno * sbi->segs_per_sec;
+	unsigned long long mtime = 0;
+	unsigned int vblocks;
+	unsigned char age = 0;
+	unsigned char u;
+	unsigned int i;
+
+	for (i = 0; i < sbi->segs_per_sec; i++)
+		mtime += get_seg_entry(sbi, start + i)->mtime;
+	vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+
+	mtime = div_u64(mtime, sbi->segs_per_sec);
+	vblocks = div_u64(vblocks, sbi->segs_per_sec);
+
+	u = (vblocks * 100) >> sbi->log_blocks_per_seg;
+
+	/* Handle if the system time has changed by the user */
+	if (mtime < sit_i->min_mtime)
+		sit_i->min_mtime = mtime;
+	if (mtime > sit_i->max_mtime)
+		sit_i->max_mtime = mtime;
+	if (sit_i->max_mtime != sit_i->min_mtime)
+		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
+				sit_i->max_mtime - sit_i->min_mtime);
+
+	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
+}
+
+static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
+			unsigned int segno, struct victim_sel_policy *p)
+{
+	if (p->alloc_mode == SSR)
+		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+
+	/* alloc_mode == LFS */
+	if (p->gc_mode == GC_GREEDY)
+		return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+	else
+		return get_cb_cost(sbi, segno);
+}
+
+/*
+ * This function is called from two paths.
+ * One is garbage collection and the other is SSR segment selection.
+ * When it is called during GC, it just gets a victim segment
+ * and it does not remove it from dirty seglist.
+ * When it is called from SSR segment selection, it finds a segment
+ * which has minimum valid blocks and removes it from dirty seglist.
+ */
+static int get_victim_by_default(struct f2fs_sb_info *sbi,
+		unsigned int *result, int gc_type, int type, char alloc_mode)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	struct victim_sel_policy p;
+	unsigned int secno, max_cost;
+	int nsearched = 0;
+
+	mutex_lock(&dirty_i->seglist_lock);
+
+	p.alloc_mode = alloc_mode;
+	select_policy(sbi, gc_type, type, &p);
+
+	p.min_segno = NULL_SEGNO;
+	p.min_cost = max_cost = get_max_cost(sbi, &p);
+
+	if (p.alloc_mode == LFS && gc_type == FG_GC) {
+		p.min_segno = check_bg_victims(sbi);
+		if (p.min_segno != NULL_SEGNO)
+			goto got_it;
+	}
+
+	while (1) {
+		unsigned long cost;
+		unsigned int segno;
+
+		segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
+		if (segno >= MAIN_SEGS(sbi)) {
+			if (sbi->last_victim[p.gc_mode]) {
+				sbi->last_victim[p.gc_mode] = 0;
+				p.offset = 0;
+				continue;
+			}
+			break;
+		}
+
+		p.offset = segno + p.ofs_unit;
+		if (p.ofs_unit > 1)
+			p.offset -= segno % p.ofs_unit;
+
+		secno = GET_SECNO(sbi, segno);
+
+		if (sec_usage_check(sbi, secno))
+			continue;
+		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
+			continue;
+
+		cost = get_gc_cost(sbi, segno, &p);
+
+		if (p.min_cost > cost) {
+			p.min_segno = segno;
+			p.min_cost = cost;
+		} else if (unlikely(cost == max_cost)) {
+			continue;
+		}
+
+		if (nsearched++ >= p.max_search) {
+			sbi->last_victim[p.gc_mode] = segno;
+			break;
+		}
+	}
+	if (p.min_segno != NULL_SEGNO) {
+got_it:
+		if (p.alloc_mode == LFS) {
+			secno = GET_SECNO(sbi, p.min_segno);
+			if (gc_type == FG_GC)
+				sbi->cur_victim_sec = secno;
+			else
+				set_bit(secno, dirty_i->victim_secmap);
+		}
+		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+
+		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
+				sbi->cur_victim_sec,
+				prefree_segments(sbi), free_segments(sbi));
+	}
+	mutex_unlock(&dirty_i->seglist_lock);
+
+	return (p.min_segno == NULL_SEGNO) ? 0 : 1;
+}
+
+static const struct victim_selection default_v_ops = {
+	.get_victim = get_victim_by_default,
+};
+
+static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
+{
+	struct inode_entry *ie;
+
+	ie = radix_tree_lookup(&gc_list->iroot, ino);
+	if (ie)
+		return ie->inode;
+	return NULL;
+}
+
+static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
+{
+	struct inode_entry *new_ie;
+
+	if (inode == find_gc_inode(gc_list, inode->i_ino)) {
+		iput(inode);
+		return;
+	}
+	new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	new_ie->inode = inode;
+
+	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
+	list_add_tail(&new_ie->list, &gc_list->ilist);
+}
+
+static void put_gc_inode(struct gc_inode_list *gc_list)
+{
+	struct inode_entry *ie, *next_ie;
+	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
+		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
+		iput(ie->inode);
+		list_del(&ie->list);
+		kmem_cache_free(inode_entry_slab, ie);
+	}
+}
+
+static int check_valid_map(struct f2fs_sb_info *sbi,
+				unsigned int segno, int offset)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	struct seg_entry *sentry;
+	int ret;
+
+	mutex_lock(&sit_i->sentry_lock);
+	sentry = get_seg_entry(sbi, segno);
+	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
+	mutex_unlock(&sit_i->sentry_lock);
+	return ret;
+}
+
+/*
+ * This function compares node address got in summary with that in NAT.
+ * On validity, copy that node with cold status, otherwise (invalid node)
+ * ignore that.
+ */
+static void gc_node_segment(struct f2fs_sb_info *sbi,
+		struct f2fs_summary *sum, unsigned int segno, int gc_type)
+{
+	bool initial = true;
+	struct f2fs_summary *entry;
+	int off;
+
+next_step:
+	entry = sum;
+
+	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+		nid_t nid = le32_to_cpu(entry->nid);
+		struct page *node_page;
+
+		/* stop BG_GC if there is not enough free sections. */
+		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+			return;
+
+		if (check_valid_map(sbi, segno, off) == 0)
+			continue;
+
+		if (initial) {
+			ra_node_page(sbi, nid);
+			continue;
+		}
+		node_page = get_node_page(sbi, nid);
+		if (IS_ERR(node_page))
+			continue;
+
+		/* block may become invalid during get_node_page */
+		if (check_valid_map(sbi, segno, off) == 0) {
+			f2fs_put_page(node_page, 1);
+			continue;
+		}
+
+		/* set page dirty and write it */
+		if (gc_type == FG_GC) {
+			f2fs_wait_on_page_writeback(node_page, NODE);
+			set_page_dirty(node_page);
+		} else {
+			if (!PageWriteback(node_page))
+				set_page_dirty(node_page);
+		}
+		f2fs_put_page(node_page, 1);
+		stat_inc_node_blk_count(sbi, 1, gc_type);
+	}
+
+	if (initial) {
+		initial = false;
+		goto next_step;
+	}
+
+	if (gc_type == FG_GC) {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = LONG_MAX,
+			.for_reclaim = 0,
+		};
+		sync_node_pages(sbi, 0, &wbc);
+
+		/*
+		 * In the case of FG_GC, it'd be better to reclaim this victim
+		 * completely.
+		 */
+		if (get_valid_blocks(sbi, segno, 1) != 0)
+			goto next_step;
+	}
+}
+
+/*
+ * Calculate start block index indicating the given node offset.
+ * Be careful, caller should give this node offset only indicating direct node
+ * blocks. If any node offsets, which point the other types of node blocks such
+ * as indirect or double indirect node blocks, are given, it must be a caller's
+ * bug.
+ */
+block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
+{
+	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
+	unsigned int bidx;
+
+	if (node_ofs == 0)
+		return 0;
+
+	if (node_ofs <= 2) {
+		bidx = node_ofs - 1;
+	} else if (node_ofs <= indirect_blks) {
+		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
+		bidx = node_ofs - 2 - dec;
+	} else {
+		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
+		bidx = node_ofs - 5 - dec;
+	}
+	return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
+}
+
+static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
+{
+	struct page *node_page;
+	nid_t nid;
+	unsigned int ofs_in_node;
+	block_t source_blkaddr;
+
+	nid = le32_to_cpu(sum->nid);
+	ofs_in_node = le16_to_cpu(sum->ofs_in_node);
+
+	node_page = get_node_page(sbi, nid);
+	if (IS_ERR(node_page))
+		return 0;
+
+	get_node_info(sbi, nid, dni);
+
+	if (sum->version != dni->version) {
+		f2fs_put_page(node_page, 1);
+		return 0;
+	}
+
+	*nofs = ofs_of_node(node_page);
+	source_blkaddr = datablock_addr(node_page, ofs_in_node);
+	f2fs_put_page(node_page, 1);
+
+	if (source_blkaddr != blkaddr)
+		return 0;
+	return 1;
+}
+
+static void move_data_page(struct inode *inode, struct page *page, int gc_type)
+{
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = WRITE_SYNC,
+	};
+
+	if (gc_type == BG_GC) {
+		if (PageWriteback(page))
+			goto out;
+		set_page_dirty(page);
+		set_cold_data(page);
+	} else {
+		f2fs_wait_on_page_writeback(page, DATA);
+
+		if (clear_page_dirty_for_io(page))
+			inode_dec_dirty_pages(inode);
+		set_cold_data(page);
+		do_write_data_page(page, &fio);
+		clear_cold_data(page);
+	}
+out:
+	f2fs_put_page(page, 1);
+}
+
+/*
+ * This function tries to get parent node of victim data block, and identifies
+ * data block validity. If the block is valid, copy that with cold status and
+ * modify parent node.
+ * If the parent node is not valid or the data block address is different,
+ * the victim data block is ignored.
+ */
+static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
+{
+	struct super_block *sb = sbi->sb;
+	struct f2fs_summary *entry;
+	block_t start_addr;
+	int off;
+	int phase = 0;
+
+	start_addr = START_BLOCK(sbi, segno);
+
+next_step:
+	entry = sum;
+
+	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+		struct page *data_page;
+		struct inode *inode;
+		struct node_info dni; /* dnode info for the data */
+		unsigned int ofs_in_node, nofs;
+		block_t start_bidx;
+
+		/* stop BG_GC if there is not enough free sections. */
+		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+			return;
+
+		if (check_valid_map(sbi, segno, off) == 0)
+			continue;
+
+		if (phase == 0) {
+			ra_node_page(sbi, le32_to_cpu(entry->nid));
+			continue;
+		}
+
+		/* Get an inode by ino with checking validity */
+		if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0)
+			continue;
+
+		if (phase == 1) {
+			ra_node_page(sbi, dni.ino);
+			continue;
+		}
+
+		ofs_in_node = le16_to_cpu(entry->ofs_in_node);
+
+		if (phase == 2) {
+			inode = f2fs_iget(sb, dni.ino);
+			if (IS_ERR(inode) || is_bad_inode(inode))
+				continue;
+
+			start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+
+			data_page = find_data_page(inode,
+					start_bidx + ofs_in_node, false);
+			if (IS_ERR(data_page)) {
+				iput(inode);
+				continue;
+			}
+
+			f2fs_put_page(data_page, 0);
+			add_gc_inode(gc_list, inode);
+			continue;
+		}
+
+		/* phase 3 */
+		inode = find_gc_inode(gc_list, dni.ino);
+		if (inode) {
+			start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+			data_page = get_lock_data_page(inode,
+						start_bidx + ofs_in_node);
+			if (IS_ERR(data_page))
+				continue;
+			move_data_page(inode, data_page, gc_type);
+			stat_inc_data_blk_count(sbi, 1, gc_type);
+		}
+	}
+
+	if (++phase < 4)
+		goto next_step;
+
+	if (gc_type == FG_GC) {
+		f2fs_submit_merged_bio(sbi, DATA, WRITE);
+
+		/*
+		 * In the case of FG_GC, it'd be better to reclaim this victim
+		 * completely.
+		 */
+		if (get_valid_blocks(sbi, segno, 1) != 0) {
+			phase = 2;
+			goto next_step;
+		}
+	}
+}
+
+static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
+			int gc_type)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	int ret;
+
+	mutex_lock(&sit_i->sentry_lock);
+	ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
+					      NO_CHECK_TYPE, LFS);
+	mutex_unlock(&sit_i->sentry_lock);
+	return ret;
+}
+
+static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+				struct gc_inode_list *gc_list, int gc_type)
+{
+	struct page *sum_page;
+	struct f2fs_summary_block *sum;
+	struct blk_plug plug;
+
+	/* read segment summary of victim */
+	sum_page = get_sum_page(sbi, segno);
+
+	blk_start_plug(&plug);
+
+	sum = page_address(sum_page);
+
+	switch (GET_SUM_TYPE((&sum->footer))) {
+	case SUM_TYPE_NODE:
+		gc_node_segment(sbi, sum->entries, segno, gc_type);
+		break;
+	case SUM_TYPE_DATA:
+		gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type);
+		break;
+	}
+	blk_finish_plug(&plug);
+
+	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type);
+	stat_inc_call_count(sbi->stat_info);
+
+	f2fs_put_page(sum_page, 1);
+}
+
+int f2fs_gc(struct f2fs_sb_info *sbi)
+{
+	unsigned int segno, i;
+	int gc_type = BG_GC;
+	int nfree = 0;
+	int ret = -1;
+	struct cp_control cpc;
+	struct gc_inode_list gc_list = {
+		.ilist = LIST_HEAD_INIT(gc_list.ilist),
+		.iroot = RADIX_TREE_INIT(GFP_NOFS),
+	};
+
+	cpc.reason = __get_cp_reason(sbi);
+gc_more:
+	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
+		goto stop;
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto stop;
+
+	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
+		gc_type = FG_GC;
+		write_checkpoint(sbi, &cpc);
+	}
+
+	if (!__get_victim(sbi, &segno, gc_type))
+		goto stop;
+	ret = 0;
+
+	/* readahead multi ssa blocks those have contiguous address */
+	if (sbi->segs_per_sec > 1)
+		ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
+								META_SSA);
+
+	for (i = 0; i < sbi->segs_per_sec; i++)
+		do_garbage_collect(sbi, segno + i, &gc_list, gc_type);
+
+	if (gc_type == FG_GC) {
+		sbi->cur_victim_sec = NULL_SEGNO;
+		nfree++;
+		WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec));
+	}
+
+	if (has_not_enough_free_secs(sbi, nfree))
+		goto gc_more;
+
+	if (gc_type == FG_GC)
+		write_checkpoint(sbi, &cpc);
+stop:
+	mutex_unlock(&sbi->gc_mutex);
+
+	put_gc_inode(&gc_list);
+	return ret;
+}
+
+void build_gc_manager(struct f2fs_sb_info *sbi)
+{
+	DIRTY_I(sbi)->v_ops = &default_v_ops;
+}
diff --git a/kernel/fs/f2fs/gc.h b/kernel/fs/f2fs/gc.h
new file mode 100644
index 000000000..b4a65be9f
--- /dev/null
+++ b/kernel/fs/f2fs/gc.h
@@ -0,0 +1,110 @@
+/*
+ * fs/f2fs/gc.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define GC_THREAD_MIN_WB_PAGES		1	/*
+						 * a threshold to determine
+						 * whether IO subsystem is idle
+						 * or not
+						 */
+#define DEF_GC_THREAD_MIN_SLEEP_TIME	30000	/* milliseconds */
+#define DEF_GC_THREAD_MAX_SLEEP_TIME	60000
+#define DEF_GC_THREAD_NOGC_SLEEP_TIME	300000	/* wait 5 min */
+#define LIMIT_INVALID_BLOCK	40 /* percentage over total user space */
+#define LIMIT_FREE_BLOCK	40 /* percentage over invalid + free space */
+
+/* Search max. number of dirty segments to select a victim segment */
+#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
+
+struct f2fs_gc_kthread {
+	struct task_struct *f2fs_gc_task;
+	wait_queue_head_t gc_wait_queue_head;
+
+	/* for gc sleep time */
+	unsigned int min_sleep_time;
+	unsigned int max_sleep_time;
+	unsigned int no_gc_sleep_time;
+
+	/* for changing gc mode */
+	unsigned int gc_idle;
+};
+
+struct gc_inode_list {
+	struct list_head ilist;
+	struct radix_tree_root iroot;
+};
+
+/*
+ * inline functions
+ */
+static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
+{
+	if (free_segments(sbi) < overprovision_segments(sbi))
+		return 0;
+	else
+		return (free_segments(sbi) - overprovision_segments(sbi))
+			<< sbi->log_blocks_per_seg;
+}
+
+static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
+{
+	return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100;
+}
+
+static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
+{
+	block_t reclaimable_user_blocks = sbi->user_block_count -
+		written_block_count(sbi);
+	return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
+}
+
+static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
+								long *wait)
+{
+	if (*wait == gc_th->no_gc_sleep_time)
+		return;
+
+	*wait += gc_th->min_sleep_time;
+	if (*wait > gc_th->max_sleep_time)
+		*wait = gc_th->max_sleep_time;
+}
+
+static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
+								long *wait)
+{
+	if (*wait == gc_th->no_gc_sleep_time)
+		*wait = gc_th->max_sleep_time;
+
+	*wait -= gc_th->min_sleep_time;
+	if (*wait <= gc_th->min_sleep_time)
+		*wait = gc_th->min_sleep_time;
+}
+
+static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
+{
+	block_t invalid_user_blocks = sbi->user_block_count -
+					written_block_count(sbi);
+	/*
+	 * Background GC is triggered with the following conditions.
+	 * 1. There are a number of invalid blocks.
+	 * 2. There is not enough free space.
+	 */
+	if (invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
+			free_user_blocks(sbi) < limit_free_user_blocks(sbi))
+		return true;
+	return false;
+}
+
+static inline int is_idle(struct f2fs_sb_info *sbi)
+{
+	struct block_device *bdev = sbi->sb->s_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct request_list *rl = &q->root_rl;
+	return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
+}
diff --git a/kernel/fs/f2fs/hash.c b/kernel/fs/f2fs/hash.c
new file mode 100644
index 000000000..a844fcfb9
--- /dev/null
+++ b/kernel/fs/f2fs/hash.c
@@ -0,0 +1,104 @@
+/*
+ * fs/f2fs/hash.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext3/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/cryptohash.h>
+#include <linux/pagemap.h>
+
+#include "f2fs.h"
+
+/*
+ * Hashing code copied from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(unsigned int buf[4], unsigned int const in[])
+{
+	__u32 sum = 0;
+	__u32 b0 = buf[0], b1 = buf[1];
+	__u32 a = in[0], b = in[1], c = in[2], d = in[3];
+	int n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while (--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+static void str2hashbuf(const unsigned char *msg, size_t len,
+				unsigned int *buf, int num)
+{
+	unsigned pad, val;
+	int i;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num * 4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = msg[i] + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
+{
+	__u32 hash;
+	f2fs_hash_t f2fs_hash;
+	const unsigned char *p;
+	__u32 in[8], buf[4];
+	const unsigned char *name = name_info->name;
+	size_t len = name_info->len;
+
+	if ((len <= 2) && (name[0] == '.') &&
+		(name[1] == '.' || name[1] == '\0'))
+		return 0;
+
+	/* Initialize the default seed for the hash checksum functions */
+	buf[0] = 0x67452301;
+	buf[1] = 0xefcdab89;
+	buf[2] = 0x98badcfe;
+	buf[3] = 0x10325476;
+
+	p = name;
+	while (1) {
+		str2hashbuf(p, len, in, 4);
+		TEA_transform(buf, in);
+		p += 16;
+		if (len <= 16)
+			break;
+		len -= 16;
+	}
+	hash = buf[0];
+	f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT);
+	return f2fs_hash;
+}
diff --git a/kernel/fs/f2fs/inline.c b/kernel/fs/f2fs/inline.c
new file mode 100644
index 000000000..8140e4f0e
--- /dev/null
+++ b/kernel/fs/f2fs/inline.c
@@ -0,0 +1,532 @@
+/*
+ * fs/f2fs/inline.c
+ * Copyright (c) 2013, Intel Corporation
+ * Authors: Huajun Li <huajun.li@intel.com>
+ *          Haicheng Li <haicheng.li@intel.com>
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+
+bool f2fs_may_inline(struct inode *inode)
+{
+	if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
+		return false;
+
+	if (f2fs_is_atomic_file(inode))
+		return false;
+
+	if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
+		return false;
+
+	if (i_size_read(inode) > MAX_INLINE_DATA)
+		return false;
+
+	return true;
+}
+
+void read_inline_data(struct page *page, struct page *ipage)
+{
+	void *src_addr, *dst_addr;
+
+	if (PageUptodate(page))
+		return;
+
+	f2fs_bug_on(F2FS_P_SB(page), page->index);
+
+	zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+
+	/* Copy the whole inline data block */
+	src_addr = inline_data_addr(ipage);
+	dst_addr = kmap_atomic(page);
+	memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+	flush_dcache_page(page);
+	kunmap_atomic(dst_addr);
+	SetPageUptodate(page);
+}
+
+bool truncate_inline_inode(struct page *ipage, u64 from)
+{
+	void *addr;
+
+	if (from >= MAX_INLINE_DATA)
+		return false;
+
+	addr = inline_data_addr(ipage);
+
+	f2fs_wait_on_page_writeback(ipage, NODE);
+	memset(addr + from, 0, MAX_INLINE_DATA - from);
+
+	return true;
+}
+
+int f2fs_read_inline_data(struct inode *inode, struct page *page)
+{
+	struct page *ipage;
+
+	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+	if (IS_ERR(ipage)) {
+		unlock_page(page);
+		return PTR_ERR(ipage);
+	}
+
+	if (!f2fs_has_inline_data(inode)) {
+		f2fs_put_page(ipage, 1);
+		return -EAGAIN;
+	}
+
+	if (page->index)
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	else
+		read_inline_data(page, ipage);
+
+	SetPageUptodate(page);
+	f2fs_put_page(ipage, 1);
+	unlock_page(page);
+	return 0;
+}
+
+int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
+{
+	void *src_addr, *dst_addr;
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = WRITE_SYNC | REQ_PRIO,
+	};
+	int dirty, err;
+
+	f2fs_bug_on(F2FS_I_SB(dn->inode), page->index);
+
+	if (!f2fs_exist_data(dn->inode))
+		goto clear_out;
+
+	err = f2fs_reserve_block(dn, 0);
+	if (err)
+		return err;
+
+	f2fs_wait_on_page_writeback(page, DATA);
+
+	if (PageUptodate(page))
+		goto no_update;
+
+	zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+
+	/* Copy the whole inline data block */
+	src_addr = inline_data_addr(dn->inode_page);
+	dst_addr = kmap_atomic(page);
+	memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+	flush_dcache_page(page);
+	kunmap_atomic(dst_addr);
+	SetPageUptodate(page);
+no_update:
+	/* clear dirty state */
+	dirty = clear_page_dirty_for_io(page);
+
+	/* write data page to try to make data consistent */
+	set_page_writeback(page);
+	fio.blk_addr = dn->data_blkaddr;
+	write_data_page(page, dn, &fio);
+	set_data_blkaddr(dn);
+	f2fs_update_extent_cache(dn);
+	f2fs_wait_on_page_writeback(page, DATA);
+	if (dirty)
+		inode_dec_dirty_pages(dn->inode);
+
+	/* this converted inline_data should be recovered. */
+	set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
+
+	/* clear inline data and flag after data writeback */
+	truncate_inline_inode(dn->inode_page, 0);
+clear_out:
+	stat_dec_inline_inode(dn->inode);
+	f2fs_clear_inline_inode(dn->inode);
+	sync_inode_page(dn);
+	f2fs_put_dnode(dn);
+	return 0;
+}
+
+int f2fs_convert_inline_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct dnode_of_data dn;
+	struct page *ipage, *page;
+	int err = 0;
+
+	page = grab_cache_page(inode->i_mapping, 0);
+	if (!page)
+		return -ENOMEM;
+
+	f2fs_lock_op(sbi);
+
+	ipage = get_node_page(sbi, inode->i_ino);
+	if (IS_ERR(ipage)) {
+		err = PTR_ERR(ipage);
+		goto out;
+	}
+
+	set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+	if (f2fs_has_inline_data(inode))
+		err = f2fs_convert_inline_page(&dn, page);
+
+	f2fs_put_dnode(&dn);
+out:
+	f2fs_unlock_op(sbi);
+
+	f2fs_put_page(page, 1);
+	return err;
+}
+
+int f2fs_write_inline_data(struct inode *inode, struct page *page)
+{
+	void *src_addr, *dst_addr;
+	struct dnode_of_data dn;
+	int err;
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
+	if (err)
+		return err;
+
+	if (!f2fs_has_inline_data(inode)) {
+		f2fs_put_dnode(&dn);
+		return -EAGAIN;
+	}
+
+	f2fs_bug_on(F2FS_I_SB(inode), page->index);
+
+	f2fs_wait_on_page_writeback(dn.inode_page, NODE);
+	src_addr = kmap_atomic(page);
+	dst_addr = inline_data_addr(dn.inode_page);
+	memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+	kunmap_atomic(src_addr);
+
+	set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+	set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+
+	sync_inode_page(&dn);
+	f2fs_put_dnode(&dn);
+	return 0;
+}
+
+bool recover_inline_data(struct inode *inode, struct page *npage)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode *ri = NULL;
+	void *src_addr, *dst_addr;
+	struct page *ipage;
+
+	/*
+	 * The inline_data recovery policy is as follows.
+	 * [prev.] [next] of inline_data flag
+	 *    o       o  -> recover inline_data
+	 *    o       x  -> remove inline_data, and then recover data blocks
+	 *    x       o  -> remove inline_data, and then recover inline_data
+	 *    x       x  -> recover data blocks
+	 */
+	if (IS_INODE(npage))
+		ri = F2FS_INODE(npage);
+
+	if (f2fs_has_inline_data(inode) &&
+			ri && (ri->i_inline & F2FS_INLINE_DATA)) {
+process_inline:
+		ipage = get_node_page(sbi, inode->i_ino);
+		f2fs_bug_on(sbi, IS_ERR(ipage));
+
+		f2fs_wait_on_page_writeback(ipage, NODE);
+
+		src_addr = inline_data_addr(npage);
+		dst_addr = inline_data_addr(ipage);
+		memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+
+		set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+		set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+
+		update_inode(inode, ipage);
+		f2fs_put_page(ipage, 1);
+		return true;
+	}
+
+	if (f2fs_has_inline_data(inode)) {
+		ipage = get_node_page(sbi, inode->i_ino);
+		f2fs_bug_on(sbi, IS_ERR(ipage));
+		truncate_inline_inode(ipage, 0);
+		f2fs_clear_inline_inode(inode);
+		update_inode(inode, ipage);
+		f2fs_put_page(ipage, 1);
+	} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
+		truncate_blocks(inode, 0, false);
+		goto process_inline;
+	}
+	return false;
+}
+
+struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
+				struct qstr *name, struct page **res_page)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_inline_dentry *inline_dentry;
+	struct f2fs_dir_entry *de;
+	struct f2fs_dentry_ptr d;
+	struct page *ipage;
+
+	ipage = get_node_page(sbi, dir->i_ino);
+	if (IS_ERR(ipage))
+		return NULL;
+
+	inline_dentry = inline_data_addr(ipage);
+
+	make_dentry_ptr(&d, (void *)inline_dentry, 2);
+	de = find_target_dentry(name, NULL, &d);
+
+	unlock_page(ipage);
+	if (de)
+		*res_page = ipage;
+	else
+		f2fs_put_page(ipage, 0);
+
+	/*
+	 * For the most part, it should be a bug when name_len is zero.
+	 * We stop here for figuring out where the bugs has occurred.
+	 */
+	f2fs_bug_on(sbi, d.max < 0);
+	return de;
+}
+
+struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir,
+							struct page **p)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct page *ipage;
+	struct f2fs_dir_entry *de;
+	struct f2fs_inline_dentry *dentry_blk;
+
+	ipage = get_node_page(sbi, dir->i_ino);
+	if (IS_ERR(ipage))
+		return NULL;
+
+	dentry_blk = inline_data_addr(ipage);
+	de = &dentry_blk->dentry[1];
+	*p = ipage;
+	unlock_page(ipage);
+	return de;
+}
+
+int make_empty_inline_dir(struct inode *inode, struct inode *parent,
+							struct page *ipage)
+{
+	struct f2fs_inline_dentry *dentry_blk;
+	struct f2fs_dentry_ptr d;
+
+	dentry_blk = inline_data_addr(ipage);
+
+	make_dentry_ptr(&d, (void *)dentry_blk, 2);
+	do_make_empty_dir(inode, parent, &d);
+
+	set_page_dirty(ipage);
+
+	/* update i_size to MAX_INLINE_DATA */
+	if (i_size_read(inode) < MAX_INLINE_DATA) {
+		i_size_write(inode, MAX_INLINE_DATA);
+		set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
+	}
+	return 0;
+}
+
+static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+				struct f2fs_inline_dentry *inline_dentry)
+{
+	struct page *page;
+	struct dnode_of_data dn;
+	struct f2fs_dentry_block *dentry_blk;
+	int err;
+
+	page = grab_cache_page(dir->i_mapping, 0);
+	if (!page)
+		return -ENOMEM;
+
+	set_new_dnode(&dn, dir, ipage, NULL, 0);
+	err = f2fs_reserve_block(&dn, 0);
+	if (err)
+		goto out;
+
+	f2fs_wait_on_page_writeback(page, DATA);
+	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+
+	dentry_blk = kmap_atomic(page);
+
+	/* copy data from inline dentry block to new dentry block */
+	memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap,
+					INLINE_DENTRY_BITMAP_SIZE);
+	memcpy(dentry_blk->dentry, inline_dentry->dentry,
+			sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY);
+	memcpy(dentry_blk->filename, inline_dentry->filename,
+					NR_INLINE_DENTRY * F2FS_SLOT_LEN);
+
+	kunmap_atomic(dentry_blk);
+	SetPageUptodate(page);
+	set_page_dirty(page);
+
+	/* clear inline dir and flag after data writeback */
+	truncate_inline_inode(ipage, 0);
+
+	stat_dec_inline_dir(dir);
+	clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
+
+	if (i_size_read(dir) < PAGE_CACHE_SIZE) {
+		i_size_write(dir, PAGE_CACHE_SIZE);
+		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	}
+
+	sync_inode_page(&dn);
+out:
+	f2fs_put_page(page, 1);
+	return err;
+}
+
+int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
+			struct inode *inode, nid_t ino, umode_t mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct page *ipage;
+	unsigned int bit_pos;
+	f2fs_hash_t name_hash;
+	size_t namelen = name->len;
+	struct f2fs_inline_dentry *dentry_blk = NULL;
+	struct f2fs_dentry_ptr d;
+	int slots = GET_DENTRY_SLOTS(namelen);
+	struct page *page = NULL;
+	int err = 0;
+
+	ipage = get_node_page(sbi, dir->i_ino);
+	if (IS_ERR(ipage))
+		return PTR_ERR(ipage);
+
+	dentry_blk = inline_data_addr(ipage);
+	bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
+						slots, NR_INLINE_DENTRY);
+	if (bit_pos >= NR_INLINE_DENTRY) {
+		err = f2fs_convert_inline_dir(dir, ipage, dentry_blk);
+		if (!err)
+			err = -EAGAIN;
+		goto out;
+	}
+
+	if (inode) {
+		down_write(&F2FS_I(inode)->i_sem);
+		page = init_inode_metadata(inode, dir, name, ipage);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto fail;
+		}
+	}
+
+	f2fs_wait_on_page_writeback(ipage, NODE);
+
+	name_hash = f2fs_dentry_hash(name);
+	make_dentry_ptr(&d, (void *)dentry_blk, 2);
+	f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos);
+
+	set_page_dirty(ipage);
+
+	/* we don't need to mark_inode_dirty now */
+	if (inode) {
+		F2FS_I(inode)->i_pino = dir->i_ino;
+		update_inode(inode, page);
+		f2fs_put_page(page, 1);
+	}
+
+	update_parent_metadata(dir, inode, 0);
+fail:
+	if (inode)
+		up_write(&F2FS_I(inode)->i_sem);
+
+	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+		update_inode(dir, ipage);
+		clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	}
+out:
+	f2fs_put_page(ipage, 1);
+	return err;
+}
+
+void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
+					struct inode *dir, struct inode *inode)
+{
+	struct f2fs_inline_dentry *inline_dentry;
+	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+	unsigned int bit_pos;
+	int i;
+
+	lock_page(page);
+	f2fs_wait_on_page_writeback(page, NODE);
+
+	inline_dentry = inline_data_addr(page);
+	bit_pos = dentry - inline_dentry->dentry;
+	for (i = 0; i < slots; i++)
+		test_and_clear_bit_le(bit_pos + i,
+				&inline_dentry->dentry_bitmap);
+
+	set_page_dirty(page);
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+	if (inode)
+		f2fs_drop_nlink(dir, inode, page);
+
+	f2fs_put_page(page, 1);
+}
+
+bool f2fs_empty_inline_dir(struct inode *dir)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct page *ipage;
+	unsigned int bit_pos = 2;
+	struct f2fs_inline_dentry *dentry_blk;
+
+	ipage = get_node_page(sbi, dir->i_ino);
+	if (IS_ERR(ipage))
+		return false;
+
+	dentry_blk = inline_data_addr(ipage);
+	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+					NR_INLINE_DENTRY,
+					bit_pos);
+
+	f2fs_put_page(ipage, 1);
+
+	if (bit_pos < NR_INLINE_DENTRY)
+		return false;
+
+	return true;
+}
+
+int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct f2fs_inline_dentry *inline_dentry = NULL;
+	struct page *ipage = NULL;
+	struct f2fs_dentry_ptr d;
+
+	if (ctx->pos == NR_INLINE_DENTRY)
+		return 0;
+
+	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+	if (IS_ERR(ipage))
+		return PTR_ERR(ipage);
+
+	inline_dentry = inline_data_addr(ipage);
+
+	make_dentry_ptr(&d, (void *)inline_dentry, 2);
+
+	if (!f2fs_fill_dentries(ctx, &d, 0))
+		ctx->pos = NR_INLINE_DENTRY;
+
+	f2fs_put_page(ipage, 1);
+	return 0;
+}
diff --git a/kernel/fs/f2fs/inode.c b/kernel/fs/f2fs/inode.c
new file mode 100644
index 000000000..e622ec954
--- /dev/null
+++ b/kernel/fs/f2fs/inode.c
@@ -0,0 +1,387 @@
+/*
+ * fs/f2fs/inode.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/bitops.h>
+
+#include "f2fs.h"
+#include "node.h"
+
+#include <trace/events/f2fs.h>
+
+void f2fs_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = F2FS_I(inode)->i_flags;
+	unsigned int new_fl = 0;
+
+	if (flags & FS_SYNC_FL)
+		new_fl |= S_SYNC;
+	if (flags & FS_APPEND_FL)
+		new_fl |= S_APPEND;
+	if (flags & FS_IMMUTABLE_FL)
+		new_fl |= S_IMMUTABLE;
+	if (flags & FS_NOATIME_FL)
+		new_fl |= S_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		new_fl |= S_DIRSYNC;
+	set_mask_bits(&inode->i_flags,
+			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
+}
+
+static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
+{
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+			S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+		if (ri->i_addr[0])
+			inode->i_rdev =
+				old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+		else
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(ri->i_addr[1]));
+	}
+}
+
+static bool __written_first_block(struct f2fs_inode *ri)
+{
+	block_t addr = le32_to_cpu(ri->i_addr[0]);
+
+	if (addr != NEW_ADDR && addr != NULL_ADDR)
+		return true;
+	return false;
+}
+
+static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
+{
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			ri->i_addr[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			ri->i_addr[1] = 0;
+		} else {
+			ri->i_addr[0] = 0;
+			ri->i_addr[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			ri->i_addr[2] = 0;
+		}
+	}
+}
+
+static void __recover_inline_status(struct inode *inode, struct page *ipage)
+{
+	void *inline_data = inline_data_addr(ipage);
+	__le32 *start = inline_data;
+	__le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
+
+	while (start < end) {
+		if (*start++) {
+			f2fs_wait_on_page_writeback(ipage, NODE);
+
+			set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+			set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
+			set_page_dirty(ipage);
+			return;
+		}
+	}
+	return;
+}
+
+static int do_read_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct page *node_page;
+	struct f2fs_inode *ri;
+
+	/* Check if ino is within scope */
+	if (check_nid_range(sbi, inode->i_ino)) {
+		f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu",
+			 (unsigned long) inode->i_ino);
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	node_page = get_node_page(sbi, inode->i_ino);
+	if (IS_ERR(node_page))
+		return PTR_ERR(node_page);
+
+	ri = F2FS_INODE(node_page);
+
+	inode->i_mode = le16_to_cpu(ri->i_mode);
+	i_uid_write(inode, le32_to_cpu(ri->i_uid));
+	i_gid_write(inode, le32_to_cpu(ri->i_gid));
+	set_nlink(inode, le32_to_cpu(ri->i_links));
+	inode->i_size = le64_to_cpu(ri->i_size);
+	inode->i_blocks = le64_to_cpu(ri->i_blocks);
+
+	inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
+	inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
+	inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
+	inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
+	inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
+	inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+	inode->i_generation = le32_to_cpu(ri->i_generation);
+
+	fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
+	fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
+	fi->i_flags = le32_to_cpu(ri->i_flags);
+	fi->flags = 0;
+	fi->i_advise = ri->i_advise;
+	fi->i_pino = le32_to_cpu(ri->i_pino);
+	fi->i_dir_level = ri->i_dir_level;
+
+	f2fs_init_extent_cache(inode, &ri->i_ext);
+
+	get_inline_info(fi, ri);
+
+	/* check data exist */
+	if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
+		__recover_inline_status(inode, node_page);
+
+	/* get rdev by using inline_info */
+	__get_inode_rdev(inode, ri);
+
+	if (__written_first_block(ri))
+		set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+
+	f2fs_put_page(node_page, 1);
+
+	stat_inc_inline_inode(inode);
+	stat_inc_inline_dir(inode);
+
+	return 0;
+}
+
+struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct inode *inode;
+	int ret = 0;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(inode->i_state & I_NEW)) {
+		trace_f2fs_iget(inode);
+		return inode;
+	}
+	if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
+		goto make_now;
+
+	ret = do_read_inode(inode);
+	if (ret)
+		goto bad_inode;
+make_now:
+	if (ino == F2FS_NODE_INO(sbi)) {
+		inode->i_mapping->a_ops = &f2fs_node_aops;
+		mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+	} else if (ino == F2FS_META_INO(sbi)) {
+		inode->i_mapping->a_ops = &f2fs_meta_aops;
+		mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+	} else if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &f2fs_file_inode_operations;
+		inode->i_fop = &f2fs_file_operations;
+		inode->i_mapping->a_ops = &f2fs_dblock_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &f2fs_dir_inode_operations;
+		inode->i_fop = &f2fs_dir_operations;
+		inode->i_mapping->a_ops = &f2fs_dblock_aops;
+		mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &f2fs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &f2fs_dblock_aops;
+	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+			S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+		inode->i_op = &f2fs_special_inode_operations;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+	} else {
+		ret = -EIO;
+		goto bad_inode;
+	}
+	unlock_new_inode(inode);
+	trace_f2fs_iget(inode);
+	return inode;
+
+bad_inode:
+	iget_failed(inode);
+	trace_f2fs_iget_exit(inode, ret);
+	return ERR_PTR(ret);
+}
+
+void update_inode(struct inode *inode, struct page *node_page)
+{
+	struct f2fs_inode *ri;
+
+	f2fs_wait_on_page_writeback(node_page, NODE);
+
+	ri = F2FS_INODE(node_page);
+
+	ri->i_mode = cpu_to_le16(inode->i_mode);
+	ri->i_advise = F2FS_I(inode)->i_advise;
+	ri->i_uid = cpu_to_le32(i_uid_read(inode));
+	ri->i_gid = cpu_to_le32(i_gid_read(inode));
+	ri->i_links = cpu_to_le32(inode->i_nlink);
+	ri->i_size = cpu_to_le64(i_size_read(inode));
+	ri->i_blocks = cpu_to_le64(inode->i_blocks);
+
+	read_lock(&F2FS_I(inode)->ext_lock);
+	set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
+	read_unlock(&F2FS_I(inode)->ext_lock);
+
+	set_raw_inline(F2FS_I(inode), ri);
+
+	ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth);
+	ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
+	ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
+	ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
+	ri->i_generation = cpu_to_le32(inode->i_generation);
+	ri->i_dir_level = F2FS_I(inode)->i_dir_level;
+
+	__set_inode_rdev(inode, ri);
+	set_cold_node(inode, node_page);
+	set_page_dirty(node_page);
+
+	clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+}
+
+void update_inode_page(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct page *node_page;
+retry:
+	node_page = get_node_page(sbi, inode->i_ino);
+	if (IS_ERR(node_page)) {
+		int err = PTR_ERR(node_page);
+		if (err == -ENOMEM) {
+			cond_resched();
+			goto retry;
+		} else if (err != -ENOENT) {
+			f2fs_stop_checkpoint(sbi);
+		}
+		return;
+	}
+	update_inode(inode, node_page);
+	f2fs_put_page(node_page, 1);
+}
+
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+			inode->i_ino == F2FS_META_INO(sbi))
+		return 0;
+
+	if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+		return 0;
+
+	/*
+	 * We need to lock here to prevent from producing dirty node pages
+	 * during the urgent cleaning time when runing out of free sections.
+	 */
+	f2fs_lock_op(sbi);
+	update_inode_page(inode);
+	f2fs_unlock_op(sbi);
+
+	if (wbc)
+		f2fs_balance_fs(sbi);
+
+	return 0;
+}
+
+/*
+ * Called at the last iput() if i_nlink is zero
+ */
+void f2fs_evict_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+
+	/* some remained atomic pages should discarded */
+	if (f2fs_is_atomic_file(inode))
+		commit_inmem_pages(inode, true);
+
+	trace_f2fs_evict_inode(inode);
+	truncate_inode_pages_final(&inode->i_data);
+
+	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+			inode->i_ino == F2FS_META_INO(sbi))
+		goto out_clear;
+
+	f2fs_bug_on(sbi, get_dirty_pages(inode));
+	remove_dirty_dir_inode(inode);
+
+	if (inode->i_nlink || is_bad_inode(inode))
+		goto no_delete;
+
+	sb_start_intwrite(inode->i_sb);
+	set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
+	i_size_write(inode, 0);
+
+	if (F2FS_HAS_BLOCKS(inode))
+		f2fs_truncate(inode);
+
+	f2fs_lock_op(sbi);
+	remove_inode_page(inode);
+	f2fs_unlock_op(sbi);
+
+	sb_end_intwrite(inode->i_sb);
+no_delete:
+	stat_dec_inline_dir(inode);
+	stat_dec_inline_inode(inode);
+
+	/* update extent info in inode */
+	if (inode->i_nlink)
+		f2fs_preserve_extent_tree(inode);
+	f2fs_destroy_extent_tree(inode);
+
+	invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
+	if (xnid)
+		invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
+	if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE))
+		add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
+	if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE))
+		add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+out_clear:
+	clear_inode(inode);
+}
+
+/* caller should call f2fs_lock_op() */
+void handle_failed_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+	clear_nlink(inode);
+	make_bad_inode(inode);
+	unlock_new_inode(inode);
+
+	i_size_write(inode, 0);
+	if (F2FS_HAS_BLOCKS(inode))
+		f2fs_truncate(inode);
+
+	remove_inode_page(inode);
+
+	clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+	clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
+	alloc_nid_failed(sbi, inode->i_ino);
+	f2fs_unlock_op(sbi);
+
+	/* iput will drop the inode object */
+	iput(inode);
+}
diff --git a/kernel/fs/f2fs/namei.c b/kernel/fs/f2fs/namei.c
new file mode 100644
index 000000000..658e8079a
--- /dev/null
+++ b/kernel/fs/f2fs/namei.c
@@ -0,0 +1,832 @@
+/*
+ * fs/f2fs/namei.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "xattr.h"
+#include "acl.h"
+#include <trace/events/f2fs.h>
+
+static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	nid_t ino;
+	struct inode *inode;
+	bool nid_free = false;
+	int err;
+
+	inode = new_inode(dir->i_sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	f2fs_lock_op(sbi);
+	if (!alloc_nid(sbi, &ino)) {
+		f2fs_unlock_op(sbi);
+		err = -ENOSPC;
+		goto fail;
+	}
+	f2fs_unlock_op(sbi);
+
+	inode_init_owner(inode, dir, mode);
+
+	inode->i_ino = ino;
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_generation = sbi->s_next_generation++;
+
+	err = insert_inode_locked(inode);
+	if (err) {
+		err = -EINVAL;
+		nid_free = true;
+		goto out;
+	}
+
+	if (f2fs_may_inline(inode))
+		set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+	if (test_opt(sbi, INLINE_DENTRY) && S_ISDIR(inode->i_mode))
+		set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
+
+	trace_f2fs_new_inode(inode, 0);
+	mark_inode_dirty(inode);
+	return inode;
+
+out:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+fail:
+	trace_f2fs_new_inode(inode, err);
+	make_bad_inode(inode);
+	iput(inode);
+	if (nid_free)
+		alloc_nid_failed(sbi, ino);
+	return ERR_PTR(err);
+}
+
+static int is_multimedia_file(const unsigned char *s, const char *sub)
+{
+	size_t slen = strlen(s);
+	size_t sublen = strlen(sub);
+
+	if (sublen > slen)
+		return 0;
+
+	return !strncasecmp(s + slen - sublen, sub, sublen);
+}
+
+/*
+ * Set multimedia files as cold files for hot/cold data separation
+ */
+static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
+		const unsigned char *name)
+{
+	int i;
+	__u8 (*extlist)[8] = sbi->raw_super->extension_list;
+
+	int count = le32_to_cpu(sbi->raw_super->extension_count);
+	for (i = 0; i < count; i++) {
+		if (is_multimedia_file(name, extlist[i])) {
+			file_set_cold(inode);
+			break;
+		}
+	}
+}
+
+static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+						bool excl)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct inode *inode;
+	nid_t ino = 0;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	inode = f2fs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
+		set_cold_files(sbi, inode, dentry->d_name.name);
+
+	inode->i_op = &f2fs_file_inode_operations;
+	inode->i_fop = &f2fs_file_operations;
+	inode->i_mapping->a_ops = &f2fs_dblock_aops;
+	ino = inode->i_ino;
+
+	f2fs_lock_op(sbi);
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out;
+	f2fs_unlock_op(sbi);
+
+	alloc_nid_done(sbi, ino);
+
+	stat_inc_inline_inode(inode);
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	if (IS_DIRSYNC(dir))
+		f2fs_sync_fs(sbi->sb, 1);
+	return 0;
+out:
+	handle_failed_inode(inode);
+	return err;
+}
+
+static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	inode->i_ctime = CURRENT_TIME;
+	ihold(inode);
+
+	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	f2fs_lock_op(sbi);
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out;
+	f2fs_unlock_op(sbi);
+
+	d_instantiate(dentry, inode);
+
+	if (IS_DIRSYNC(dir))
+		f2fs_sync_fs(sbi->sb, 1);
+	return 0;
+out:
+	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	iput(inode);
+	f2fs_unlock_op(sbi);
+	return err;
+}
+
+struct dentry *f2fs_get_parent(struct dentry *child)
+{
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+	return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino));
+}
+
+static int __recover_dot_dentries(struct inode *dir, nid_t pino)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct qstr dot = QSTR_INIT(".", 1);
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	struct f2fs_dir_entry *de;
+	struct page *page;
+	int err = 0;
+
+	f2fs_lock_op(sbi);
+
+	de = f2fs_find_entry(dir, &dot, &page);
+	if (de) {
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+	} else {
+		err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
+		if (err)
+			goto out;
+	}
+
+	de = f2fs_find_entry(dir, &dotdot, &page);
+	if (de) {
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+	} else {
+		err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
+	}
+out:
+	if (!err) {
+		clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS);
+		mark_inode_dirty(dir);
+	}
+
+	f2fs_unlock_op(sbi);
+	return err;
+}
+
+static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
+		unsigned int flags)
+{
+	struct inode *inode = NULL;
+	struct f2fs_dir_entry *de;
+	struct page *page;
+
+	if (dentry->d_name.len > F2FS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	de = f2fs_find_entry(dir, &dentry->d_name, &page);
+	if (de) {
+		nid_t ino = le32_to_cpu(de->ino);
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+
+		inode = f2fs_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+
+		if (f2fs_has_inline_dots(inode)) {
+			int err;
+
+			err = __recover_dot_dentries(inode, dir->i_ino);
+			if (err) {
+				iget_failed(inode);
+				return ERR_PTR(err);
+			}
+		}
+	}
+
+	return d_splice_alias(inode, dentry);
+}
+
+static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct inode *inode = d_inode(dentry);
+	struct f2fs_dir_entry *de;
+	struct page *page;
+	int err = -ENOENT;
+
+	trace_f2fs_unlink_enter(dir, dentry);
+	f2fs_balance_fs(sbi);
+
+	de = f2fs_find_entry(dir, &dentry->d_name, &page);
+	if (!de)
+		goto fail;
+
+	f2fs_lock_op(sbi);
+	err = acquire_orphan_inode(sbi);
+	if (err) {
+		f2fs_unlock_op(sbi);
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+		goto fail;
+	}
+	f2fs_delete_entry(de, page, dir, inode);
+	f2fs_unlock_op(sbi);
+
+	/* In order to evict this inode, we set it dirty */
+	mark_inode_dirty(inode);
+
+	if (IS_DIRSYNC(dir))
+		f2fs_sync_fs(sbi->sb, 1);
+fail:
+	trace_f2fs_unlink_exit(inode, err);
+	return err;
+}
+
+static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *page = page_follow_link_light(dentry, nd);
+
+	if (IS_ERR_OR_NULL(page))
+		return page;
+
+	/* this is broken symlink case */
+	if (*nd_get_link(nd) == 0) {
+		page_put_link(dentry, nd, page);
+		return ERR_PTR(-ENOENT);
+	}
+	return page;
+}
+
+static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
+					const char *symname)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct inode *inode;
+	size_t symlen = strlen(symname) + 1;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &f2fs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &f2fs_dblock_aops;
+
+	f2fs_lock_op(sbi);
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out;
+	f2fs_unlock_op(sbi);
+
+	err = page_symlink(inode, symname, symlen);
+	alloc_nid_done(sbi, inode->i_ino);
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	/*
+	 * Let's flush symlink data in order to avoid broken symlink as much as
+	 * possible. Nevertheless, fsyncing is the best way, but there is no
+	 * way to get a file descriptor in order to flush that.
+	 *
+	 * Note that, it needs to do dir->fsync to make this recoverable.
+	 * If the symlink path is stored into inline_data, there is no
+	 * performance regression.
+	 */
+	filemap_write_and_wait_range(inode->i_mapping, 0, symlen - 1);
+
+	if (IS_DIRSYNC(dir))
+		f2fs_sync_fs(sbi->sb, 1);
+	return err;
+out:
+	handle_failed_inode(inode);
+	return err;
+}
+
+static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct inode *inode;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	inode = f2fs_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &f2fs_dir_inode_operations;
+	inode->i_fop = &f2fs_dir_operations;
+	inode->i_mapping->a_ops = &f2fs_dblock_aops;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
+
+	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	f2fs_lock_op(sbi);
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+	f2fs_unlock_op(sbi);
+
+	stat_inc_inline_dir(inode);
+	alloc_nid_done(sbi, inode->i_ino);
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	if (IS_DIRSYNC(dir))
+		f2fs_sync_fs(sbi->sb, 1);
+	return 0;
+
+out_fail:
+	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	handle_failed_inode(inode);
+	return err;
+}
+
+static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	if (f2fs_empty_dir(inode))
+		return f2fs_unlink(dir, dentry);
+	return -ENOTEMPTY;
+}
+
+static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
+				umode_t mode, dev_t rdev)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct inode *inode;
+	int err = 0;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	f2fs_balance_fs(sbi);
+
+	inode = f2fs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	init_special_inode(inode, inode->i_mode, rdev);
+	inode->i_op = &f2fs_special_inode_operations;
+
+	f2fs_lock_op(sbi);
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out;
+	f2fs_unlock_op(sbi);
+
+	alloc_nid_done(sbi, inode->i_ino);
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	if (IS_DIRSYNC(dir))
+		f2fs_sync_fs(sbi->sb, 1);
+	return 0;
+out:
+	handle_failed_inode(inode);
+	return err;
+}
+
+static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct page *old_dir_page;
+	struct page *old_page, *new_page;
+	struct f2fs_dir_entry *old_dir_entry = NULL;
+	struct f2fs_dir_entry *old_entry;
+	struct f2fs_dir_entry *new_entry;
+	int err = -ENOENT;
+
+	f2fs_balance_fs(sbi);
+
+	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+	if (!old_entry)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
+		if (!old_dir_entry)
+			goto out_old;
+	}
+
+	if (new_inode) {
+
+		err = -ENOTEMPTY;
+		if (old_dir_entry && !f2fs_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
+						&new_page);
+		if (!new_entry)
+			goto out_dir;
+
+		f2fs_lock_op(sbi);
+
+		err = acquire_orphan_inode(sbi);
+		if (err)
+			goto put_out_dir;
+
+		if (update_dent_inode(old_inode, &new_dentry->d_name)) {
+			release_orphan_inode(sbi);
+			goto put_out_dir;
+		}
+
+		f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+
+		new_inode->i_ctime = CURRENT_TIME;
+		down_write(&F2FS_I(new_inode)->i_sem);
+		if (old_dir_entry)
+			drop_nlink(new_inode);
+		drop_nlink(new_inode);
+		up_write(&F2FS_I(new_inode)->i_sem);
+
+		mark_inode_dirty(new_inode);
+
+		if (!new_inode->i_nlink)
+			add_orphan_inode(sbi, new_inode->i_ino);
+		else
+			release_orphan_inode(sbi);
+
+		update_inode_page(old_inode);
+		update_inode_page(new_inode);
+	} else {
+		f2fs_lock_op(sbi);
+
+		err = f2fs_add_link(new_dentry, old_inode);
+		if (err) {
+			f2fs_unlock_op(sbi);
+			goto out_dir;
+		}
+
+		if (old_dir_entry) {
+			inc_nlink(new_dir);
+			update_inode_page(new_dir);
+		}
+	}
+
+	down_write(&F2FS_I(old_inode)->i_sem);
+	file_lost_pino(old_inode);
+	up_write(&F2FS_I(old_inode)->i_sem);
+
+	old_inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(old_inode);
+
+	f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
+
+	if (old_dir_entry) {
+		if (old_dir != new_dir) {
+			f2fs_set_link(old_inode, old_dir_entry,
+						old_dir_page, new_dir);
+			update_inode_page(old_inode);
+		} else {
+			f2fs_dentry_kunmap(old_inode, old_dir_page);
+			f2fs_put_page(old_dir_page, 0);
+		}
+		drop_nlink(old_dir);
+		mark_inode_dirty(old_dir);
+		update_inode_page(old_dir);
+	}
+
+	f2fs_unlock_op(sbi);
+
+	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+		f2fs_sync_fs(sbi->sb, 1);
+	return 0;
+
+put_out_dir:
+	f2fs_unlock_op(sbi);
+	f2fs_dentry_kunmap(new_dir, new_page);
+	f2fs_put_page(new_page, 0);
+out_dir:
+	if (old_dir_entry) {
+		f2fs_dentry_kunmap(old_inode, old_dir_page);
+		f2fs_put_page(old_dir_page, 0);
+	}
+out_old:
+	f2fs_dentry_kunmap(old_dir, old_page);
+	f2fs_put_page(old_page, 0);
+out:
+	return err;
+}
+
+static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct page *old_dir_page, *new_dir_page;
+	struct page *old_page, *new_page;
+	struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
+	struct f2fs_dir_entry *old_entry, *new_entry;
+	int old_nlink = 0, new_nlink = 0;
+	int err = -ENOENT;
+
+	f2fs_balance_fs(sbi);
+
+	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+	if (!old_entry)
+		goto out;
+
+	new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page);
+	if (!new_entry)
+		goto out_old;
+
+	/* prepare for updating ".." directory entry info later */
+	if (old_dir != new_dir) {
+		if (S_ISDIR(old_inode->i_mode)) {
+			err = -EIO;
+			old_dir_entry = f2fs_parent_dir(old_inode,
+							&old_dir_page);
+			if (!old_dir_entry)
+				goto out_new;
+		}
+
+		if (S_ISDIR(new_inode->i_mode)) {
+			err = -EIO;
+			new_dir_entry = f2fs_parent_dir(new_inode,
+							&new_dir_page);
+			if (!new_dir_entry)
+				goto out_old_dir;
+		}
+	}
+
+	/*
+	 * If cross rename between file and directory those are not
+	 * in the same directory, we will inc nlink of file's parent
+	 * later, so we should check upper boundary of its nlink.
+	 */
+	if ((!old_dir_entry || !new_dir_entry) &&
+				old_dir_entry != new_dir_entry) {
+		old_nlink = old_dir_entry ? -1 : 1;
+		new_nlink = -old_nlink;
+		err = -EMLINK;
+		if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) ||
+			(new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX))
+			goto out_new_dir;
+	}
+
+	f2fs_lock_op(sbi);
+
+	err = update_dent_inode(old_inode, &new_dentry->d_name);
+	if (err)
+		goto out_unlock;
+
+	err = update_dent_inode(new_inode, &old_dentry->d_name);
+	if (err)
+		goto out_undo;
+
+	/* update ".." directory entry info of old dentry */
+	if (old_dir_entry)
+		f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
+
+	/* update ".." directory entry info of new dentry */
+	if (new_dir_entry)
+		f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir);
+
+	/* update directory entry info of old dir inode */
+	f2fs_set_link(old_dir, old_entry, old_page, new_inode);
+
+	down_write(&F2FS_I(old_inode)->i_sem);
+	file_lost_pino(old_inode);
+	up_write(&F2FS_I(old_inode)->i_sem);
+
+	update_inode_page(old_inode);
+
+	old_dir->i_ctime = CURRENT_TIME;
+	if (old_nlink) {
+		down_write(&F2FS_I(old_dir)->i_sem);
+		if (old_nlink < 0)
+			drop_nlink(old_dir);
+		else
+			inc_nlink(old_dir);
+		up_write(&F2FS_I(old_dir)->i_sem);
+	}
+	mark_inode_dirty(old_dir);
+	update_inode_page(old_dir);
+
+	/* update directory entry info of new dir inode */
+	f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+
+	down_write(&F2FS_I(new_inode)->i_sem);
+	file_lost_pino(new_inode);
+	up_write(&F2FS_I(new_inode)->i_sem);
+
+	update_inode_page(new_inode);
+
+	new_dir->i_ctime = CURRENT_TIME;
+	if (new_nlink) {
+		down_write(&F2FS_I(new_dir)->i_sem);
+		if (new_nlink < 0)
+			drop_nlink(new_dir);
+		else
+			inc_nlink(new_dir);
+		up_write(&F2FS_I(new_dir)->i_sem);
+	}
+	mark_inode_dirty(new_dir);
+	update_inode_page(new_dir);
+
+	f2fs_unlock_op(sbi);
+
+	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+		f2fs_sync_fs(sbi->sb, 1);
+	return 0;
+out_undo:
+	/* Still we may fail to recover name info of f2fs_inode here */
+	update_dent_inode(old_inode, &old_dentry->d_name);
+out_unlock:
+	f2fs_unlock_op(sbi);
+out_new_dir:
+	if (new_dir_entry) {
+		f2fs_dentry_kunmap(new_inode, new_dir_page);
+		f2fs_put_page(new_dir_page, 0);
+	}
+out_old_dir:
+	if (old_dir_entry) {
+		f2fs_dentry_kunmap(old_inode, old_dir_page);
+		f2fs_put_page(old_dir_page, 0);
+	}
+out_new:
+	f2fs_dentry_kunmap(new_dir, new_page);
+	f2fs_put_page(new_page, 0);
+out_old:
+	f2fs_dentry_kunmap(old_dir, old_page);
+	f2fs_put_page(old_page, 0);
+out:
+	return err;
+}
+
+static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags)
+{
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if (flags & RENAME_EXCHANGE) {
+		return f2fs_cross_rename(old_dir, old_dentry,
+					 new_dir, new_dentry);
+	}
+	/*
+	 * VFS has already handled the new dentry existence case,
+	 * here, we just deal with "RENAME_NOREPLACE" as regular rename.
+	 */
+	return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct inode *inode;
+	int err;
+
+	inode = f2fs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &f2fs_file_inode_operations;
+	inode->i_fop = &f2fs_file_operations;
+	inode->i_mapping->a_ops = &f2fs_dblock_aops;
+
+	f2fs_lock_op(sbi);
+	err = acquire_orphan_inode(sbi);
+	if (err)
+		goto out;
+
+	err = f2fs_do_tmpfile(inode, dir);
+	if (err)
+		goto release_out;
+
+	/*
+	 * add this non-linked tmpfile to orphan list, in this way we could
+	 * remove all unused data of tmpfile after abnormal power-off.
+	 */
+	add_orphan_inode(sbi, inode->i_ino);
+	f2fs_unlock_op(sbi);
+
+	alloc_nid_done(sbi, inode->i_ino);
+
+	stat_inc_inline_inode(inode);
+	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+
+release_out:
+	release_orphan_inode(sbi);
+out:
+	handle_failed_inode(inode);
+	return err;
+}
+
+const struct inode_operations f2fs_dir_inode_operations = {
+	.create		= f2fs_create,
+	.lookup		= f2fs_lookup,
+	.link		= f2fs_link,
+	.unlink		= f2fs_unlink,
+	.symlink	= f2fs_symlink,
+	.mkdir		= f2fs_mkdir,
+	.rmdir		= f2fs_rmdir,
+	.mknod		= f2fs_mknod,
+	.rename2	= f2fs_rename2,
+	.tmpfile	= f2fs_tmpfile,
+	.getattr	= f2fs_getattr,
+	.setattr	= f2fs_setattr,
+	.get_acl	= f2fs_get_acl,
+	.set_acl	= f2fs_set_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= f2fs_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
+
+const struct inode_operations f2fs_symlink_inode_operations = {
+	.readlink       = generic_readlink,
+	.follow_link    = f2fs_follow_link,
+	.put_link       = page_put_link,
+	.getattr	= f2fs_getattr,
+	.setattr	= f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= f2fs_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
+
+const struct inode_operations f2fs_special_inode_operations = {
+	.getattr	= f2fs_getattr,
+	.setattr        = f2fs_setattr,
+	.get_acl	= f2fs_get_acl,
+	.set_acl	= f2fs_set_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+	.setxattr       = generic_setxattr,
+	.getxattr       = generic_getxattr,
+	.listxattr	= f2fs_listxattr,
+	.removexattr    = generic_removexattr,
+#endif
+};
diff --git a/kernel/fs/f2fs/node.c b/kernel/fs/f2fs/node.c
new file mode 100644
index 000000000..8ab0cf193
--- /dev/null
+++ b/kernel/fs/f2fs/node.c
@@ -0,0 +1,2084 @@
+/*
+ * fs/f2fs/node.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "trace.h"
+#include <trace/events/f2fs.h>
+
+#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+
+static struct kmem_cache *nat_entry_slab;
+static struct kmem_cache *free_nid_slab;
+static struct kmem_cache *nat_entry_set_slab;
+
+bool available_free_memory(struct f2fs_sb_info *sbi, int type)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct sysinfo val;
+	unsigned long avail_ram;
+	unsigned long mem_size = 0;
+	bool res = false;
+
+	si_meminfo(&val);
+
+	/* only uses low memory */
+	avail_ram = val.totalram - val.totalhigh;
+
+	/*
+	 * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+	 */
+	if (type == FREE_NIDS) {
+		mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
+							PAGE_CACHE_SHIFT;
+		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
+	} else if (type == NAT_ENTRIES) {
+		mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
+							PAGE_CACHE_SHIFT;
+		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
+	} else if (type == DIRTY_DENTS) {
+		if (sbi->sb->s_bdi->dirty_exceeded)
+			return false;
+		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
+		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+	} else if (type == INO_ENTRIES) {
+		int i;
+
+		for (i = 0; i <= UPDATE_INO; i++)
+			mem_size += (sbi->im[i].ino_num *
+				sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
+		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+	} else if (type == EXTENT_CACHE) {
+		mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) +
+				atomic_read(&sbi->total_ext_node) *
+				sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
+		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+	} else {
+		if (sbi->sb->s_bdi->dirty_exceeded)
+			return false;
+	}
+	return res;
+}
+
+static void clear_node_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	unsigned int long flags;
+
+	if (PageDirty(page)) {
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		radix_tree_tag_clear(&mapping->page_tree,
+				page_index(page),
+				PAGECACHE_TAG_DIRTY);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+		clear_page_dirty_for_io(page);
+		dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
+	}
+	ClearPageUptodate(page);
+}
+
+static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	pgoff_t index = current_nat_addr(sbi, nid);
+	return get_meta_page(sbi, index);
+}
+
+static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct page *src_page;
+	struct page *dst_page;
+	pgoff_t src_off;
+	pgoff_t dst_off;
+	void *src_addr;
+	void *dst_addr;
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	src_off = current_nat_addr(sbi, nid);
+	dst_off = next_nat_addr(sbi, src_off);
+
+	/* get current nat block page with lock */
+	src_page = get_meta_page(sbi, src_off);
+	dst_page = grab_meta_page(sbi, dst_off);
+	f2fs_bug_on(sbi, PageDirty(src_page));
+
+	src_addr = page_address(src_page);
+	dst_addr = page_address(dst_page);
+	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+	set_page_dirty(dst_page);
+	f2fs_put_page(src_page, 1);
+
+	set_to_next_nat(nm_i, nid);
+
+	return dst_page;
+}
+
+static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
+{
+	return radix_tree_lookup(&nm_i->nat_root, n);
+}
+
+static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
+		nid_t start, unsigned int nr, struct nat_entry **ep)
+{
+	return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
+}
+
+static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
+{
+	list_del(&e->list);
+	radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
+	nm_i->nat_cnt--;
+	kmem_cache_free(nat_entry_slab, e);
+}
+
+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+						struct nat_entry *ne)
+{
+	nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
+	struct nat_entry_set *head;
+
+	if (get_nat_flag(ne, IS_DIRTY))
+		return;
+
+	head = radix_tree_lookup(&nm_i->nat_set_root, set);
+	if (!head) {
+		head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+
+		INIT_LIST_HEAD(&head->entry_list);
+		INIT_LIST_HEAD(&head->set_list);
+		head->set = set;
+		head->entry_cnt = 0;
+		f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
+	}
+	list_move_tail(&ne->list, &head->entry_list);
+	nm_i->dirty_nat_cnt++;
+	head->entry_cnt++;
+	set_nat_flag(ne, IS_DIRTY, true);
+}
+
+static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+						struct nat_entry *ne)
+{
+	nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
+	struct nat_entry_set *head;
+
+	head = radix_tree_lookup(&nm_i->nat_set_root, set);
+	if (head) {
+		list_move_tail(&ne->list, &nm_i->nat_entries);
+		set_nat_flag(ne, IS_DIRTY, false);
+		head->entry_cnt--;
+		nm_i->dirty_nat_cnt--;
+	}
+}
+
+static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
+		nid_t start, unsigned int nr, struct nat_entry_set **ep)
+{
+	return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
+							start, nr);
+}
+
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct nat_entry *e;
+	bool is_cp = true;
+
+	down_read(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, nid);
+	if (e && !get_nat_flag(e, IS_CHECKPOINTED))
+		is_cp = false;
+	up_read(&nm_i->nat_tree_lock);
+	return is_cp;
+}
+
+bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct nat_entry *e;
+	bool fsynced = false;
+
+	down_read(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, ino);
+	if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
+		fsynced = true;
+	up_read(&nm_i->nat_tree_lock);
+	return fsynced;
+}
+
+bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct nat_entry *e;
+	bool need_update = true;
+
+	down_read(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, ino);
+	if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
+			(get_nat_flag(e, IS_CHECKPOINTED) ||
+			 get_nat_flag(e, HAS_FSYNCED_INODE)))
+		need_update = false;
+	up_read(&nm_i->nat_tree_lock);
+	return need_update;
+}
+
+static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+	struct nat_entry *new;
+
+	new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
+	f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
+	memset(new, 0, sizeof(struct nat_entry));
+	nat_set_nid(new, nid);
+	nat_reset_flag(new);
+	list_add_tail(&new->list, &nm_i->nat_entries);
+	nm_i->nat_cnt++;
+	return new;
+}
+
+static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+						struct f2fs_nat_entry *ne)
+{
+	struct nat_entry *e;
+
+	down_write(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, nid);
+	if (!e) {
+		e = grab_nat_entry(nm_i, nid);
+		node_info_from_raw_nat(&e->ni, ne);
+	}
+	up_write(&nm_i->nat_tree_lock);
+}
+
+static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
+			block_t new_blkaddr, bool fsync_done)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct nat_entry *e;
+
+	down_write(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, ni->nid);
+	if (!e) {
+		e = grab_nat_entry(nm_i, ni->nid);
+		copy_node_info(&e->ni, ni);
+		f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
+	} else if (new_blkaddr == NEW_ADDR) {
+		/*
+		 * when nid is reallocated,
+		 * previous nat entry can be remained in nat cache.
+		 * So, reinitialize it with new information.
+		 */
+		copy_node_info(&e->ni, ni);
+		f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
+	}
+
+	/* sanity check */
+	f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
+	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
+			new_blkaddr == NULL_ADDR);
+	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
+			new_blkaddr == NEW_ADDR);
+	f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
+			nat_get_blkaddr(e) != NULL_ADDR &&
+			new_blkaddr == NEW_ADDR);
+
+	/* increment version no as node is removed */
+	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
+		unsigned char version = nat_get_version(e);
+		nat_set_version(e, inc_node_version(version));
+	}
+
+	/* change address */
+	nat_set_blkaddr(e, new_blkaddr);
+	if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
+		set_nat_flag(e, IS_CHECKPOINTED, false);
+	__set_nat_cache_dirty(nm_i, e);
+
+	/* update fsync_mark if its inode nat entry is still alive */
+	e = __lookup_nat_cache(nm_i, ni->ino);
+	if (e) {
+		if (fsync_done && ni->nid == ni->ino)
+			set_nat_flag(e, HAS_FSYNCED_INODE, true);
+		set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
+	}
+	up_write(&nm_i->nat_tree_lock);
+}
+
+int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	if (available_free_memory(sbi, NAT_ENTRIES))
+		return 0;
+
+	down_write(&nm_i->nat_tree_lock);
+	while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
+		struct nat_entry *ne;
+		ne = list_first_entry(&nm_i->nat_entries,
+					struct nat_entry, list);
+		__del_from_nat_cache(nm_i, ne);
+		nr_shrink--;
+	}
+	up_write(&nm_i->nat_tree_lock);
+	return nr_shrink;
+}
+
+/*
+ * This function always returns success
+ */
+void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	nid_t start_nid = START_NID(nid);
+	struct f2fs_nat_block *nat_blk;
+	struct page *page = NULL;
+	struct f2fs_nat_entry ne;
+	struct nat_entry *e;
+	int i;
+
+	ni->nid = nid;
+
+	/* Check nat cache */
+	down_read(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, nid);
+	if (e) {
+		ni->ino = nat_get_ino(e);
+		ni->blk_addr = nat_get_blkaddr(e);
+		ni->version = nat_get_version(e);
+	}
+	up_read(&nm_i->nat_tree_lock);
+	if (e)
+		return;
+
+	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+
+	/* Check current segment summary */
+	mutex_lock(&curseg->curseg_mutex);
+	i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
+	if (i >= 0) {
+		ne = nat_in_journal(sum, i);
+		node_info_from_raw_nat(ni, &ne);
+	}
+	mutex_unlock(&curseg->curseg_mutex);
+	if (i >= 0)
+		goto cache;
+
+	/* Fill node_info from nat page */
+	page = get_current_nat_page(sbi, start_nid);
+	nat_blk = (struct f2fs_nat_block *)page_address(page);
+	ne = nat_blk->entries[nid - start_nid];
+	node_info_from_raw_nat(ni, &ne);
+	f2fs_put_page(page, 1);
+cache:
+	/* cache nat entry */
+	cache_nat_entry(NM_I(sbi), nid, &ne);
+}
+
+/*
+ * The maximum depth is four.
+ * Offset[0] will have raw inode offset.
+ */
+static int get_node_path(struct f2fs_inode_info *fi, long block,
+				int offset[4], unsigned int noffset[4])
+{
+	const long direct_index = ADDRS_PER_INODE(fi);
+	const long direct_blks = ADDRS_PER_BLOCK;
+	const long dptrs_per_blk = NIDS_PER_BLOCK;
+	const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
+	const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
+	int n = 0;
+	int level = 0;
+
+	noffset[0] = 0;
+
+	if (block < direct_index) {
+		offset[n] = block;
+		goto got;
+	}
+	block -= direct_index;
+	if (block < direct_blks) {
+		offset[n++] = NODE_DIR1_BLOCK;
+		noffset[n] = 1;
+		offset[n] = block;
+		level = 1;
+		goto got;
+	}
+	block -= direct_blks;
+	if (block < direct_blks) {
+		offset[n++] = NODE_DIR2_BLOCK;
+		noffset[n] = 2;
+		offset[n] = block;
+		level = 1;
+		goto got;
+	}
+	block -= direct_blks;
+	if (block < indirect_blks) {
+		offset[n++] = NODE_IND1_BLOCK;
+		noffset[n] = 3;
+		offset[n++] = block / direct_blks;
+		noffset[n] = 4 + offset[n - 1];
+		offset[n] = block % direct_blks;
+		level = 2;
+		goto got;
+	}
+	block -= indirect_blks;
+	if (block < indirect_blks) {
+		offset[n++] = NODE_IND2_BLOCK;
+		noffset[n] = 4 + dptrs_per_blk;
+		offset[n++] = block / direct_blks;
+		noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
+		offset[n] = block % direct_blks;
+		level = 2;
+		goto got;
+	}
+	block -= indirect_blks;
+	if (block < dindirect_blks) {
+		offset[n++] = NODE_DIND_BLOCK;
+		noffset[n] = 5 + (dptrs_per_blk * 2);
+		offset[n++] = block / indirect_blks;
+		noffset[n] = 6 + (dptrs_per_blk * 2) +
+			      offset[n - 1] * (dptrs_per_blk + 1);
+		offset[n++] = (block / direct_blks) % dptrs_per_blk;
+		noffset[n] = 7 + (dptrs_per_blk * 2) +
+			      offset[n - 2] * (dptrs_per_blk + 1) +
+			      offset[n - 1];
+		offset[n] = block % direct_blks;
+		level = 3;
+		goto got;
+	} else {
+		BUG();
+	}
+got:
+	return level;
+}
+
+/*
+ * Caller should call f2fs_put_dnode(dn).
+ * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op() only if ro is not set RDONLY_NODE.
+ * In the case of RDONLY_NODE, we don't need to care about mutex.
+ */
+int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct page *npage[4];
+	struct page *parent = NULL;
+	int offset[4];
+	unsigned int noffset[4];
+	nid_t nids[4];
+	int level, i;
+	int err = 0;
+
+	level = get_node_path(F2FS_I(dn->inode), index, offset, noffset);
+
+	nids[0] = dn->inode->i_ino;
+	npage[0] = dn->inode_page;
+
+	if (!npage[0]) {
+		npage[0] = get_node_page(sbi, nids[0]);
+		if (IS_ERR(npage[0]))
+			return PTR_ERR(npage[0]);
+	}
+
+	/* if inline_data is set, should not report any block indices */
+	if (f2fs_has_inline_data(dn->inode) && index) {
+		err = -ENOENT;
+		f2fs_put_page(npage[0], 1);
+		goto release_out;
+	}
+
+	parent = npage[0];
+	if (level != 0)
+		nids[1] = get_nid(parent, offset[0], true);
+	dn->inode_page = npage[0];
+	dn->inode_page_locked = true;
+
+	/* get indirect or direct nodes */
+	for (i = 1; i <= level; i++) {
+		bool done = false;
+
+		if (!nids[i] && mode == ALLOC_NODE) {
+			/* alloc new node */
+			if (!alloc_nid(sbi, &(nids[i]))) {
+				err = -ENOSPC;
+				goto release_pages;
+			}
+
+			dn->nid = nids[i];
+			npage[i] = new_node_page(dn, noffset[i], NULL);
+			if (IS_ERR(npage[i])) {
+				alloc_nid_failed(sbi, nids[i]);
+				err = PTR_ERR(npage[i]);
+				goto release_pages;
+			}
+
+			set_nid(parent, offset[i - 1], nids[i], i == 1);
+			alloc_nid_done(sbi, nids[i]);
+			done = true;
+		} else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
+			npage[i] = get_node_page_ra(parent, offset[i - 1]);
+			if (IS_ERR(npage[i])) {
+				err = PTR_ERR(npage[i]);
+				goto release_pages;
+			}
+			done = true;
+		}
+		if (i == 1) {
+			dn->inode_page_locked = false;
+			unlock_page(parent);
+		} else {
+			f2fs_put_page(parent, 1);
+		}
+
+		if (!done) {
+			npage[i] = get_node_page(sbi, nids[i]);
+			if (IS_ERR(npage[i])) {
+				err = PTR_ERR(npage[i]);
+				f2fs_put_page(npage[0], 0);
+				goto release_out;
+			}
+		}
+		if (i < level) {
+			parent = npage[i];
+			nids[i + 1] = get_nid(parent, offset[i], false);
+		}
+	}
+	dn->nid = nids[level];
+	dn->ofs_in_node = offset[level];
+	dn->node_page = npage[level];
+	dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+	return 0;
+
+release_pages:
+	f2fs_put_page(parent, 1);
+	if (i > 1)
+		f2fs_put_page(npage[0], 0);
+release_out:
+	dn->inode_page = NULL;
+	dn->node_page = NULL;
+	return err;
+}
+
+static void truncate_node(struct dnode_of_data *dn)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct node_info ni;
+
+	get_node_info(sbi, dn->nid, &ni);
+	if (dn->inode->i_blocks == 0) {
+		f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
+		goto invalidate;
+	}
+	f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
+
+	/* Deallocate node address */
+	invalidate_blocks(sbi, ni.blk_addr);
+	dec_valid_node_count(sbi, dn->inode);
+	set_node_addr(sbi, &ni, NULL_ADDR, false);
+
+	if (dn->nid == dn->inode->i_ino) {
+		remove_orphan_inode(sbi, dn->nid);
+		dec_valid_inode_count(sbi);
+	} else {
+		sync_inode_page(dn);
+	}
+invalidate:
+	clear_node_page_dirty(dn->node_page);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
+
+	f2fs_put_page(dn->node_page, 1);
+
+	invalidate_mapping_pages(NODE_MAPPING(sbi),
+			dn->node_page->index, dn->node_page->index);
+
+	dn->node_page = NULL;
+	trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
+}
+
+static int truncate_dnode(struct dnode_of_data *dn)
+{
+	struct page *page;
+
+	if (dn->nid == 0)
+		return 1;
+
+	/* get direct node */
+	page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
+	if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
+		return 1;
+	else if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	/* Make dnode_of_data for parameter */
+	dn->node_page = page;
+	dn->ofs_in_node = 0;
+	truncate_data_blocks(dn);
+	truncate_node(dn);
+	return 1;
+}
+
+static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
+						int ofs, int depth)
+{
+	struct dnode_of_data rdn = *dn;
+	struct page *page;
+	struct f2fs_node *rn;
+	nid_t child_nid;
+	unsigned int child_nofs;
+	int freed = 0;
+	int i, ret;
+
+	if (dn->nid == 0)
+		return NIDS_PER_BLOCK + 1;
+
+	trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
+
+	page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
+	if (IS_ERR(page)) {
+		trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
+		return PTR_ERR(page);
+	}
+
+	rn = F2FS_NODE(page);
+	if (depth < 3) {
+		for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
+			child_nid = le32_to_cpu(rn->in.nid[i]);
+			if (child_nid == 0)
+				continue;
+			rdn.nid = child_nid;
+			ret = truncate_dnode(&rdn);
+			if (ret < 0)
+				goto out_err;
+			set_nid(page, i, 0, false);
+		}
+	} else {
+		child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
+		for (i = ofs; i < NIDS_PER_BLOCK; i++) {
+			child_nid = le32_to_cpu(rn->in.nid[i]);
+			if (child_nid == 0) {
+				child_nofs += NIDS_PER_BLOCK + 1;
+				continue;
+			}
+			rdn.nid = child_nid;
+			ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
+			if (ret == (NIDS_PER_BLOCK + 1)) {
+				set_nid(page, i, 0, false);
+				child_nofs += ret;
+			} else if (ret < 0 && ret != -ENOENT) {
+				goto out_err;
+			}
+		}
+		freed = child_nofs;
+	}
+
+	if (!ofs) {
+		/* remove current indirect node */
+		dn->node_page = page;
+		truncate_node(dn);
+		freed++;
+	} else {
+		f2fs_put_page(page, 1);
+	}
+	trace_f2fs_truncate_nodes_exit(dn->inode, freed);
+	return freed;
+
+out_err:
+	f2fs_put_page(page, 1);
+	trace_f2fs_truncate_nodes_exit(dn->inode, ret);
+	return ret;
+}
+
+static int truncate_partial_nodes(struct dnode_of_data *dn,
+			struct f2fs_inode *ri, int *offset, int depth)
+{
+	struct page *pages[2];
+	nid_t nid[3];
+	nid_t child_nid;
+	int err = 0;
+	int i;
+	int idx = depth - 2;
+
+	nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
+	if (!nid[0])
+		return 0;
+
+	/* get indirect nodes in the path */
+	for (i = 0; i < idx + 1; i++) {
+		/* reference count'll be increased */
+		pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
+		if (IS_ERR(pages[i])) {
+			err = PTR_ERR(pages[i]);
+			idx = i - 1;
+			goto fail;
+		}
+		nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
+	}
+
+	/* free direct nodes linked to a partial indirect node */
+	for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
+		child_nid = get_nid(pages[idx], i, false);
+		if (!child_nid)
+			continue;
+		dn->nid = child_nid;
+		err = truncate_dnode(dn);
+		if (err < 0)
+			goto fail;
+		set_nid(pages[idx], i, 0, false);
+	}
+
+	if (offset[idx + 1] == 0) {
+		dn->node_page = pages[idx];
+		dn->nid = nid[idx];
+		truncate_node(dn);
+	} else {
+		f2fs_put_page(pages[idx], 1);
+	}
+	offset[idx]++;
+	offset[idx + 1] = 0;
+	idx--;
+fail:
+	for (i = idx; i >= 0; i--)
+		f2fs_put_page(pages[i], 1);
+
+	trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
+
+	return err;
+}
+
+/*
+ * All the block addresses of data and nodes should be nullified.
+ */
+int truncate_inode_blocks(struct inode *inode, pgoff_t from)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	int err = 0, cont = 1;
+	int level, offset[4], noffset[4];
+	unsigned int nofs = 0;
+	struct f2fs_inode *ri;
+	struct dnode_of_data dn;
+	struct page *page;
+
+	trace_f2fs_truncate_inode_blocks_enter(inode, from);
+
+	level = get_node_path(F2FS_I(inode), from, offset, noffset);
+restart:
+	page = get_node_page(sbi, inode->i_ino);
+	if (IS_ERR(page)) {
+		trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
+		return PTR_ERR(page);
+	}
+
+	set_new_dnode(&dn, inode, page, NULL, 0);
+	unlock_page(page);
+
+	ri = F2FS_INODE(page);
+	switch (level) {
+	case 0:
+	case 1:
+		nofs = noffset[1];
+		break;
+	case 2:
+		nofs = noffset[1];
+		if (!offset[level - 1])
+			goto skip_partial;
+		err = truncate_partial_nodes(&dn, ri, offset, level);
+		if (err < 0 && err != -ENOENT)
+			goto fail;
+		nofs += 1 + NIDS_PER_BLOCK;
+		break;
+	case 3:
+		nofs = 5 + 2 * NIDS_PER_BLOCK;
+		if (!offset[level - 1])
+			goto skip_partial;
+		err = truncate_partial_nodes(&dn, ri, offset, level);
+		if (err < 0 && err != -ENOENT)
+			goto fail;
+		break;
+	default:
+		BUG();
+	}
+
+skip_partial:
+	while (cont) {
+		dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
+		switch (offset[0]) {
+		case NODE_DIR1_BLOCK:
+		case NODE_DIR2_BLOCK:
+			err = truncate_dnode(&dn);
+			break;
+
+		case NODE_IND1_BLOCK:
+		case NODE_IND2_BLOCK:
+			err = truncate_nodes(&dn, nofs, offset[1], 2);
+			break;
+
+		case NODE_DIND_BLOCK:
+			err = truncate_nodes(&dn, nofs, offset[1], 3);
+			cont = 0;
+			break;
+
+		default:
+			BUG();
+		}
+		if (err < 0 && err != -ENOENT)
+			goto fail;
+		if (offset[1] == 0 &&
+				ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
+			lock_page(page);
+			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+				f2fs_put_page(page, 1);
+				goto restart;
+			}
+			f2fs_wait_on_page_writeback(page, NODE);
+			ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
+			set_page_dirty(page);
+			unlock_page(page);
+		}
+		offset[1] = 0;
+		offset[0]++;
+		nofs += err;
+	}
+fail:
+	f2fs_put_page(page, 0);
+	trace_f2fs_truncate_inode_blocks_exit(inode, err);
+	return err > 0 ? 0 : err;
+}
+
+int truncate_xattr_node(struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	nid_t nid = F2FS_I(inode)->i_xattr_nid;
+	struct dnode_of_data dn;
+	struct page *npage;
+
+	if (!nid)
+		return 0;
+
+	npage = get_node_page(sbi, nid);
+	if (IS_ERR(npage))
+		return PTR_ERR(npage);
+
+	F2FS_I(inode)->i_xattr_nid = 0;
+
+	/* need to do checkpoint during fsync */
+	F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
+
+	set_new_dnode(&dn, inode, page, npage, nid);
+
+	if (page)
+		dn.inode_page_locked = true;
+	truncate_node(&dn);
+	return 0;
+}
+
+/*
+ * Caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ */
+void remove_inode_page(struct inode *inode)
+{
+	struct dnode_of_data dn;
+
+	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+	if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
+		return;
+
+	if (truncate_xattr_node(inode, dn.inode_page)) {
+		f2fs_put_dnode(&dn);
+		return;
+	}
+
+	/* remove potential inline_data blocks */
+	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+				S_ISLNK(inode->i_mode))
+		truncate_data_blocks_range(&dn, 1);
+
+	/* 0 is possible, after f2fs_new_inode() has failed */
+	f2fs_bug_on(F2FS_I_SB(inode),
+			inode->i_blocks != 0 && inode->i_blocks != 1);
+
+	/* will put inode & node pages */
+	truncate_node(&dn);
+}
+
+struct page *new_inode_page(struct inode *inode)
+{
+	struct dnode_of_data dn;
+
+	/* allocate inode page for new inode */
+	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+
+	/* caller should f2fs_put_page(page, 1); */
+	return new_node_page(&dn, 0, NULL);
+}
+
+struct page *new_node_page(struct dnode_of_data *dn,
+				unsigned int ofs, struct page *ipage)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct node_info old_ni, new_ni;
+	struct page *page;
+	int err;
+
+	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+		return ERR_PTR(-EPERM);
+
+	page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
+		err = -ENOSPC;
+		goto fail;
+	}
+
+	get_node_info(sbi, dn->nid, &old_ni);
+
+	/* Reinitialize old_ni with new node page */
+	f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
+	new_ni = old_ni;
+	new_ni.ino = dn->inode->i_ino;
+	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
+
+	f2fs_wait_on_page_writeback(page, NODE);
+	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
+	set_cold_node(dn->inode, page);
+	SetPageUptodate(page);
+	set_page_dirty(page);
+
+	if (f2fs_has_xattr_block(ofs))
+		F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
+
+	dn->node_page = page;
+	if (ipage)
+		update_inode(dn->inode, ipage);
+	else
+		sync_inode_page(dn);
+	if (ofs == 0)
+		inc_valid_inode_count(sbi);
+
+	return page;
+
+fail:
+	clear_node_page_dirty(page);
+	f2fs_put_page(page, 1);
+	return ERR_PTR(err);
+}
+
+/*
+ * Caller should do after getting the following values.
+ * 0: f2fs_put_page(page, 0)
+ * LOCKED_PAGE: f2fs_put_page(page, 1)
+ * error: nothing
+ */
+static int read_node_page(struct page *page, int rw)
+{
+	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+	struct node_info ni;
+	struct f2fs_io_info fio = {
+		.type = NODE,
+		.rw = rw,
+	};
+
+	get_node_info(sbi, page->index, &ni);
+
+	if (unlikely(ni.blk_addr == NULL_ADDR)) {
+		ClearPageUptodate(page);
+		f2fs_put_page(page, 1);
+		return -ENOENT;
+	}
+
+	if (PageUptodate(page))
+		return LOCKED_PAGE;
+
+	fio.blk_addr = ni.blk_addr;
+	return f2fs_submit_page_bio(sbi, page, &fio);
+}
+
+/*
+ * Readahead a node page
+ */
+void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct page *apage;
+	int err;
+
+	apage = find_get_page(NODE_MAPPING(sbi), nid);
+	if (apage && PageUptodate(apage)) {
+		f2fs_put_page(apage, 0);
+		return;
+	}
+	f2fs_put_page(apage, 0);
+
+	apage = grab_cache_page(NODE_MAPPING(sbi), nid);
+	if (!apage)
+		return;
+
+	err = read_node_page(apage, READA);
+	if (err == 0)
+		f2fs_put_page(apage, 0);
+	else if (err == LOCKED_PAGE)
+		f2fs_put_page(apage, 1);
+}
+
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+	struct page *page;
+	int err;
+repeat:
+	page = grab_cache_page(NODE_MAPPING(sbi), nid);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	err = read_node_page(page, READ_SYNC);
+	if (err < 0)
+		return ERR_PTR(err);
+	else if (err != LOCKED_PAGE)
+		lock_page(page);
+
+	if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
+		ClearPageUptodate(page);
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
+	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+		f2fs_put_page(page, 1);
+		goto repeat;
+	}
+	return page;
+}
+
+/*
+ * Return a locked page for the desired node page.
+ * And, readahead MAX_RA_NODE number of node pages.
+ */
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+	struct blk_plug plug;
+	struct page *page;
+	int err, i, end;
+	nid_t nid;
+
+	/* First, try getting the desired direct node. */
+	nid = get_nid(parent, start, false);
+	if (!nid)
+		return ERR_PTR(-ENOENT);
+repeat:
+	page = grab_cache_page(NODE_MAPPING(sbi), nid);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	err = read_node_page(page, READ_SYNC);
+	if (err < 0)
+		return ERR_PTR(err);
+	else if (err == LOCKED_PAGE)
+		goto page_hit;
+
+	blk_start_plug(&plug);
+
+	/* Then, try readahead for siblings of the desired node */
+	end = start + MAX_RA_NODE;
+	end = min(end, NIDS_PER_BLOCK);
+	for (i = start + 1; i < end; i++) {
+		nid = get_nid(parent, i, false);
+		if (!nid)
+			continue;
+		ra_node_page(sbi, nid);
+	}
+
+	blk_finish_plug(&plug);
+
+	lock_page(page);
+	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+		f2fs_put_page(page, 1);
+		goto repeat;
+	}
+page_hit:
+	if (unlikely(!PageUptodate(page))) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
+	return page;
+}
+
+void sync_inode_page(struct dnode_of_data *dn)
+{
+	if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
+		update_inode(dn->inode, dn->node_page);
+	} else if (dn->inode_page) {
+		if (!dn->inode_page_locked)
+			lock_page(dn->inode_page);
+		update_inode(dn->inode, dn->inode_page);
+		if (!dn->inode_page_locked)
+			unlock_page(dn->inode_page);
+	} else {
+		update_inode_page(dn->inode);
+	}
+}
+
+int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
+					struct writeback_control *wbc)
+{
+	pgoff_t index, end;
+	struct pagevec pvec;
+	int step = ino ? 2 : 0;
+	int nwritten = 0, wrote = 0;
+
+	pagevec_init(&pvec, 0);
+
+next_step:
+	index = 0;
+	end = LONG_MAX;
+
+	while (index <= end) {
+		int i, nr_pages;
+		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+				PAGECACHE_TAG_DIRTY,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * flushing sequence with step:
+			 * 0. indirect nodes
+			 * 1. dentry dnodes
+			 * 2. file dnodes
+			 */
+			if (step == 0 && IS_DNODE(page))
+				continue;
+			if (step == 1 && (!IS_DNODE(page) ||
+						is_cold_node(page)))
+				continue;
+			if (step == 2 && (!IS_DNODE(page) ||
+						!is_cold_node(page)))
+				continue;
+
+			/*
+			 * If an fsync mode,
+			 * we should not skip writing node pages.
+			 */
+			if (ino && ino_of_node(page) == ino)
+				lock_page(page);
+			else if (!trylock_page(page))
+				continue;
+
+			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+			if (ino && ino_of_node(page) != ino)
+				goto continue_unlock;
+
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			/* called by fsync() */
+			if (ino && IS_DNODE(page)) {
+				set_fsync_mark(page, 1);
+				if (IS_INODE(page)) {
+					if (!is_checkpointed_node(sbi, ino) &&
+						!has_fsynced_inode(sbi, ino))
+						set_dentry_mark(page, 1);
+					else
+						set_dentry_mark(page, 0);
+				}
+				nwritten++;
+			} else {
+				set_fsync_mark(page, 0);
+				set_dentry_mark(page, 0);
+			}
+
+			if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
+				unlock_page(page);
+			else
+				wrote++;
+
+			if (--wbc->nr_to_write == 0)
+				break;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+
+		if (wbc->nr_to_write == 0) {
+			step = 2;
+			break;
+		}
+	}
+
+	if (step < 2) {
+		step++;
+		goto next_step;
+	}
+
+	if (wrote)
+		f2fs_submit_merged_bio(sbi, NODE, WRITE);
+	return nwritten;
+}
+
+int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	pgoff_t index = 0, end = LONG_MAX;
+	struct pagevec pvec;
+	int ret2 = 0, ret = 0;
+
+	pagevec_init(&pvec, 0);
+
+	while (index <= end) {
+		int i, nr_pages;
+		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+				PAGECACHE_TAG_WRITEBACK,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/* until radix tree lookup accepts end_index */
+			if (unlikely(page->index > end))
+				continue;
+
+			if (ino && ino_of_node(page) == ino) {
+				f2fs_wait_on_page_writeback(page, NODE);
+				if (TestClearPageError(page))
+					ret = -EIO;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
+		ret2 = -ENOSPC;
+	if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
+		ret2 = -EIO;
+	if (!ret)
+		ret = ret2;
+	return ret;
+}
+
+static int f2fs_write_node_page(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+	nid_t nid;
+	struct node_info ni;
+	struct f2fs_io_info fio = {
+		.type = NODE,
+		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+	};
+
+	trace_f2fs_writepage(page, NODE);
+
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto redirty_out;
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto redirty_out;
+
+	f2fs_wait_on_page_writeback(page, NODE);
+
+	/* get old block addr of this node page */
+	nid = nid_of_node(page);
+	f2fs_bug_on(sbi, page->index != nid);
+
+	get_node_info(sbi, nid, &ni);
+
+	/* This page is already truncated */
+	if (unlikely(ni.blk_addr == NULL_ADDR)) {
+		ClearPageUptodate(page);
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (wbc->for_reclaim) {
+		if (!down_read_trylock(&sbi->node_write))
+			goto redirty_out;
+	} else {
+		down_read(&sbi->node_write);
+	}
+
+	set_page_writeback(page);
+	fio.blk_addr = ni.blk_addr;
+	write_node_page(sbi, page, nid, &fio);
+	set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
+	dec_page_count(sbi, F2FS_DIRTY_NODES);
+	up_read(&sbi->node_write);
+	unlock_page(page);
+
+	if (wbc->for_reclaim)
+		f2fs_submit_merged_bio(sbi, NODE, WRITE);
+
+	return 0;
+
+redirty_out:
+	redirty_page_for_writepage(wbc, page);
+	return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int f2fs_write_node_pages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+	long diff;
+
+	trace_f2fs_writepages(mapping->host, wbc, NODE);
+
+	/* balancing f2fs's metadata in background */
+	f2fs_balance_fs_bg(sbi);
+
+	/* collect a number of dirty node pages and write together */
+	if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
+		goto skip_write;
+
+	diff = nr_pages_to_write(sbi, NODE, wbc);
+	wbc->sync_mode = WB_SYNC_NONE;
+	sync_node_pages(sbi, 0, wbc);
+	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
+	return 0;
+
+skip_write:
+	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
+	return 0;
+}
+
+static int f2fs_set_node_page_dirty(struct page *page)
+{
+	trace_f2fs_set_page_dirty(page, NODE);
+
+	SetPageUptodate(page);
+	if (!PageDirty(page)) {
+		__set_page_dirty_nobuffers(page);
+		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
+		SetPagePrivate(page);
+		f2fs_trace_pid(page);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Structure of the f2fs node operations
+ */
+const struct address_space_operations f2fs_node_aops = {
+	.writepage	= f2fs_write_node_page,
+	.writepages	= f2fs_write_node_pages,
+	.set_page_dirty	= f2fs_set_node_page_dirty,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
+};
+
+static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
+						nid_t n)
+{
+	return radix_tree_lookup(&nm_i->free_nid_root, n);
+}
+
+static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
+						struct free_nid *i)
+{
+	list_del(&i->list);
+	radix_tree_delete(&nm_i->free_nid_root, i->nid);
+}
+
+static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i;
+	struct nat_entry *ne;
+	bool allocated = false;
+
+	if (!available_free_memory(sbi, FREE_NIDS))
+		return -1;
+
+	/* 0 nid should not be used */
+	if (unlikely(nid == 0))
+		return 0;
+
+	if (build) {
+		/* do not add allocated nids */
+		down_read(&nm_i->nat_tree_lock);
+		ne = __lookup_nat_cache(nm_i, nid);
+		if (ne &&
+			(!get_nat_flag(ne, IS_CHECKPOINTED) ||
+				nat_get_blkaddr(ne) != NULL_ADDR))
+			allocated = true;
+		up_read(&nm_i->nat_tree_lock);
+		if (allocated)
+			return 0;
+	}
+
+	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
+	i->nid = nid;
+	i->state = NID_NEW;
+
+	if (radix_tree_preload(GFP_NOFS)) {
+		kmem_cache_free(free_nid_slab, i);
+		return 0;
+	}
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
+		spin_unlock(&nm_i->free_nid_list_lock);
+		radix_tree_preload_end();
+		kmem_cache_free(free_nid_slab, i);
+		return 0;
+	}
+	list_add_tail(&i->list, &nm_i->free_nid_list);
+	nm_i->fcnt++;
+	spin_unlock(&nm_i->free_nid_list_lock);
+	radix_tree_preload_end();
+	return 1;
+}
+
+static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+	struct free_nid *i;
+	bool need_free = false;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	i = __lookup_free_nid_list(nm_i, nid);
+	if (i && i->state == NID_NEW) {
+		__del_from_free_nid_list(nm_i, i);
+		nm_i->fcnt--;
+		need_free = true;
+	}
+	spin_unlock(&nm_i->free_nid_list_lock);
+
+	if (need_free)
+		kmem_cache_free(free_nid_slab, i);
+}
+
+static void scan_nat_page(struct f2fs_sb_info *sbi,
+			struct page *nat_page, nid_t start_nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct f2fs_nat_block *nat_blk = page_address(nat_page);
+	block_t blk_addr;
+	int i;
+
+	i = start_nid % NAT_ENTRY_PER_BLOCK;
+
+	for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
+
+		if (unlikely(start_nid >= nm_i->max_nid))
+			break;
+
+		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
+		f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
+		if (blk_addr == NULL_ADDR) {
+			if (add_free_nid(sbi, start_nid, true) < 0)
+				break;
+		}
+	}
+}
+
+static void build_free_nids(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	int i = 0;
+	nid_t nid = nm_i->next_scan_nid;
+
+	/* Enough entries */
+	if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
+		return;
+
+	/* readahead nat pages to be scanned */
+	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
+
+	while (1) {
+		struct page *page = get_current_nat_page(sbi, nid);
+
+		scan_nat_page(sbi, page, nid);
+		f2fs_put_page(page, 1);
+
+		nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
+		if (unlikely(nid >= nm_i->max_nid))
+			nid = 0;
+
+		if (i++ == FREE_NID_PAGES)
+			break;
+	}
+
+	/* go to the next free nat pages to find free nids abundantly */
+	nm_i->next_scan_nid = nid;
+
+	/* find free nids from current sum_pages */
+	mutex_lock(&curseg->curseg_mutex);
+	for (i = 0; i < nats_in_cursum(sum); i++) {
+		block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
+		nid = le32_to_cpu(nid_in_journal(sum, i));
+		if (addr == NULL_ADDR)
+			add_free_nid(sbi, nid, true);
+		else
+			remove_free_nid(nm_i, nid);
+	}
+	mutex_unlock(&curseg->curseg_mutex);
+}
+
+/*
+ * If this function returns success, caller can obtain a new nid
+ * from second parameter of this function.
+ * The returned nid could be used ino as well as nid when inode is created.
+ */
+bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i = NULL;
+retry:
+	if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
+		return false;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+
+	/* We should not use stale free nids created by build_free_nids */
+	if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
+		f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
+		list_for_each_entry(i, &nm_i->free_nid_list, list)
+			if (i->state == NID_NEW)
+				break;
+
+		f2fs_bug_on(sbi, i->state != NID_NEW);
+		*nid = i->nid;
+		i->state = NID_ALLOC;
+		nm_i->fcnt--;
+		spin_unlock(&nm_i->free_nid_list_lock);
+		return true;
+	}
+	spin_unlock(&nm_i->free_nid_list_lock);
+
+	/* Let's scan nat pages and its caches to get free nids */
+	mutex_lock(&nm_i->build_lock);
+	build_free_nids(sbi);
+	mutex_unlock(&nm_i->build_lock);
+	goto retry;
+}
+
+/*
+ * alloc_nid() should be called prior to this function.
+ */
+void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	i = __lookup_free_nid_list(nm_i, nid);
+	f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
+	__del_from_free_nid_list(nm_i, i);
+	spin_unlock(&nm_i->free_nid_list_lock);
+
+	kmem_cache_free(free_nid_slab, i);
+}
+
+/*
+ * alloc_nid() should be called prior to this function.
+ */
+void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i;
+	bool need_free = false;
+
+	if (!nid)
+		return;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	i = __lookup_free_nid_list(nm_i, nid);
+	f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
+	if (!available_free_memory(sbi, FREE_NIDS)) {
+		__del_from_free_nid_list(nm_i, i);
+		need_free = true;
+	} else {
+		i->state = NID_NEW;
+		nm_i->fcnt++;
+	}
+	spin_unlock(&nm_i->free_nid_list_lock);
+
+	if (need_free)
+		kmem_cache_free(free_nid_slab, i);
+}
+
+void recover_inline_xattr(struct inode *inode, struct page *page)
+{
+	void *src_addr, *dst_addr;
+	size_t inline_size;
+	struct page *ipage;
+	struct f2fs_inode *ri;
+
+	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+	f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
+
+	ri = F2FS_INODE(page);
+	if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
+		clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
+		goto update_inode;
+	}
+
+	dst_addr = inline_xattr_addr(ipage);
+	src_addr = inline_xattr_addr(page);
+	inline_size = inline_xattr_size(inode);
+
+	f2fs_wait_on_page_writeback(ipage, NODE);
+	memcpy(dst_addr, src_addr, inline_size);
+update_inode:
+	update_inode(inode, ipage);
+	f2fs_put_page(ipage, 1);
+}
+
+void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
+	nid_t new_xnid = nid_of_node(page);
+	struct node_info ni;
+
+	/* 1: invalidate the previous xattr nid */
+	if (!prev_xnid)
+		goto recover_xnid;
+
+	/* Deallocate node address */
+	get_node_info(sbi, prev_xnid, &ni);
+	f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
+	invalidate_blocks(sbi, ni.blk_addr);
+	dec_valid_node_count(sbi, inode);
+	set_node_addr(sbi, &ni, NULL_ADDR, false);
+
+recover_xnid:
+	/* 2: allocate new xattr nid */
+	if (unlikely(!inc_valid_node_count(sbi, inode)))
+		f2fs_bug_on(sbi, 1);
+
+	remove_free_nid(NM_I(sbi), new_xnid);
+	get_node_info(sbi, new_xnid, &ni);
+	ni.ino = inode->i_ino;
+	set_node_addr(sbi, &ni, NEW_ADDR, false);
+	F2FS_I(inode)->i_xattr_nid = new_xnid;
+
+	/* 3: update xattr blkaddr */
+	refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
+	set_node_addr(sbi, &ni, blkaddr, false);
+
+	update_inode_page(inode);
+}
+
+int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
+{
+	struct f2fs_inode *src, *dst;
+	nid_t ino = ino_of_node(page);
+	struct node_info old_ni, new_ni;
+	struct page *ipage;
+
+	get_node_info(sbi, ino, &old_ni);
+
+	if (unlikely(old_ni.blk_addr != NULL_ADDR))
+		return -EINVAL;
+
+	ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
+	if (!ipage)
+		return -ENOMEM;
+
+	/* Should not use this inode from free nid list */
+	remove_free_nid(NM_I(sbi), ino);
+
+	SetPageUptodate(ipage);
+	fill_node_footer(ipage, ino, ino, 0, true);
+
+	src = F2FS_INODE(page);
+	dst = F2FS_INODE(ipage);
+
+	memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
+	dst->i_size = 0;
+	dst->i_blocks = cpu_to_le64(1);
+	dst->i_links = cpu_to_le32(1);
+	dst->i_xattr_nid = 0;
+	dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
+
+	new_ni = old_ni;
+	new_ni.ino = ino;
+
+	if (unlikely(!inc_valid_node_count(sbi, NULL)))
+		WARN_ON(1);
+	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
+	inc_valid_inode_count(sbi);
+	set_page_dirty(ipage);
+	f2fs_put_page(ipage, 1);
+	return 0;
+}
+
+int restore_node_summary(struct f2fs_sb_info *sbi,
+			unsigned int segno, struct f2fs_summary_block *sum)
+{
+	struct f2fs_node *rn;
+	struct f2fs_summary *sum_entry;
+	block_t addr;
+	int bio_blocks = MAX_BIO_BLOCKS(sbi);
+	int i, idx, last_offset, nrpages;
+
+	/* scan the node segment */
+	last_offset = sbi->blocks_per_seg;
+	addr = START_BLOCK(sbi, segno);
+	sum_entry = &sum->entries[0];
+
+	for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
+		nrpages = min(last_offset - i, bio_blocks);
+
+		/* readahead node pages */
+		ra_meta_pages(sbi, addr, nrpages, META_POR);
+
+		for (idx = addr; idx < addr + nrpages; idx++) {
+			struct page *page = get_meta_page(sbi, idx);
+
+			rn = F2FS_NODE(page);
+			sum_entry->nid = rn->footer.nid;
+			sum_entry->version = 0;
+			sum_entry->ofs_in_node = 0;
+			sum_entry++;
+			f2fs_put_page(page, 1);
+		}
+
+		invalidate_mapping_pages(META_MAPPING(sbi), addr,
+							addr + nrpages);
+	}
+	return 0;
+}
+
+static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	int i;
+
+	mutex_lock(&curseg->curseg_mutex);
+	for (i = 0; i < nats_in_cursum(sum); i++) {
+		struct nat_entry *ne;
+		struct f2fs_nat_entry raw_ne;
+		nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
+
+		raw_ne = nat_in_journal(sum, i);
+
+		down_write(&nm_i->nat_tree_lock);
+		ne = __lookup_nat_cache(nm_i, nid);
+		if (!ne) {
+			ne = grab_nat_entry(nm_i, nid);
+			node_info_from_raw_nat(&ne->ni, &raw_ne);
+		}
+		__set_nat_cache_dirty(nm_i, ne);
+		up_write(&nm_i->nat_tree_lock);
+	}
+	update_nats_in_cursum(sum, -i);
+	mutex_unlock(&curseg->curseg_mutex);
+}
+
+static void __adjust_nat_entry_set(struct nat_entry_set *nes,
+						struct list_head *head, int max)
+{
+	struct nat_entry_set *cur;
+
+	if (nes->entry_cnt >= max)
+		goto add_out;
+
+	list_for_each_entry(cur, head, set_list) {
+		if (cur->entry_cnt >= nes->entry_cnt) {
+			list_add(&nes->set_list, cur->set_list.prev);
+			return;
+		}
+	}
+add_out:
+	list_add_tail(&nes->set_list, head);
+}
+
+static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+					struct nat_entry_set *set)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
+	bool to_journal = true;
+	struct f2fs_nat_block *nat_blk;
+	struct nat_entry *ne, *cur;
+	struct page *page = NULL;
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	/*
+	 * there are two steps to flush nat entries:
+	 * #1, flush nat entries to journal in current hot data summary block.
+	 * #2, flush nat entries to nat page.
+	 */
+	if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+		to_journal = false;
+
+	if (to_journal) {
+		mutex_lock(&curseg->curseg_mutex);
+	} else {
+		page = get_next_nat_page(sbi, start_nid);
+		nat_blk = page_address(page);
+		f2fs_bug_on(sbi, !nat_blk);
+	}
+
+	/* flush dirty nats in nat entry set */
+	list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
+		struct f2fs_nat_entry *raw_ne;
+		nid_t nid = nat_get_nid(ne);
+		int offset;
+
+		if (nat_get_blkaddr(ne) == NEW_ADDR)
+			continue;
+
+		if (to_journal) {
+			offset = lookup_journal_in_cursum(sum,
+							NAT_JOURNAL, nid, 1);
+			f2fs_bug_on(sbi, offset < 0);
+			raw_ne = &nat_in_journal(sum, offset);
+			nid_in_journal(sum, offset) = cpu_to_le32(nid);
+		} else {
+			raw_ne = &nat_blk->entries[nid - start_nid];
+		}
+		raw_nat_from_node_info(raw_ne, &ne->ni);
+
+		down_write(&NM_I(sbi)->nat_tree_lock);
+		nat_reset_flag(ne);
+		__clear_nat_cache_dirty(NM_I(sbi), ne);
+		up_write(&NM_I(sbi)->nat_tree_lock);
+
+		if (nat_get_blkaddr(ne) == NULL_ADDR)
+			add_free_nid(sbi, nid, false);
+	}
+
+	if (to_journal)
+		mutex_unlock(&curseg->curseg_mutex);
+	else
+		f2fs_put_page(page, 1);
+
+	f2fs_bug_on(sbi, set->entry_cnt);
+
+	down_write(&nm_i->nat_tree_lock);
+	radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+	up_write(&nm_i->nat_tree_lock);
+	kmem_cache_free(nat_entry_set_slab, set);
+}
+
+/*
+ * This function is called during the checkpointing process.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	struct nat_entry_set *setvec[SETVEC_SIZE];
+	struct nat_entry_set *set, *tmp;
+	unsigned int found;
+	nid_t set_idx = 0;
+	LIST_HEAD(sets);
+
+	if (!nm_i->dirty_nat_cnt)
+		return;
+	/*
+	 * if there are no enough space in journal to store dirty nat
+	 * entries, remove all entries from journal and merge them
+	 * into nat entry set.
+	 */
+	if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+		remove_nats_in_journal(sbi);
+
+	down_write(&nm_i->nat_tree_lock);
+	while ((found = __gang_lookup_nat_set(nm_i,
+					set_idx, SETVEC_SIZE, setvec))) {
+		unsigned idx;
+		set_idx = setvec[found - 1]->set + 1;
+		for (idx = 0; idx < found; idx++)
+			__adjust_nat_entry_set(setvec[idx], &sets,
+							MAX_NAT_JENTRIES(sum));
+	}
+	up_write(&nm_i->nat_tree_lock);
+
+	/* flush dirty nats in nat entry set */
+	list_for_each_entry_safe(set, tmp, &sets, set_list)
+		__flush_nat_entry_set(sbi, set);
+
+	f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
+}
+
+static int init_node_manager(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	unsigned char *version_bitmap;
+	unsigned int nat_segs, nat_blocks;
+
+	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
+
+	/* segment_count_nat includes pair segment so divide to 2. */
+	nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
+	nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
+
+	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+
+	/* not used nids: 0, node, meta, (and root counted as valid node) */
+	nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
+	nm_i->fcnt = 0;
+	nm_i->nat_cnt = 0;
+	nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+
+	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
+	INIT_LIST_HEAD(&nm_i->free_nid_list);
+	INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
+	INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
+	INIT_LIST_HEAD(&nm_i->nat_entries);
+
+	mutex_init(&nm_i->build_lock);
+	spin_lock_init(&nm_i->free_nid_list_lock);
+	init_rwsem(&nm_i->nat_tree_lock);
+
+	nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
+	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
+	version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
+	if (!version_bitmap)
+		return -EFAULT;
+
+	nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
+					GFP_KERNEL);
+	if (!nm_i->nat_bitmap)
+		return -ENOMEM;
+	return 0;
+}
+
+int build_node_manager(struct f2fs_sb_info *sbi)
+{
+	int err;
+
+	sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
+	if (!sbi->nm_info)
+		return -ENOMEM;
+
+	err = init_node_manager(sbi);
+	if (err)
+		return err;
+
+	build_free_nids(sbi);
+	return 0;
+}
+
+void destroy_node_manager(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i, *next_i;
+	struct nat_entry *natvec[NATVEC_SIZE];
+	struct nat_entry_set *setvec[SETVEC_SIZE];
+	nid_t nid = 0;
+	unsigned int found;
+
+	if (!nm_i)
+		return;
+
+	/* destroy free nid list */
+	spin_lock(&nm_i->free_nid_list_lock);
+	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
+		f2fs_bug_on(sbi, i->state == NID_ALLOC);
+		__del_from_free_nid_list(nm_i, i);
+		nm_i->fcnt--;
+		spin_unlock(&nm_i->free_nid_list_lock);
+		kmem_cache_free(free_nid_slab, i);
+		spin_lock(&nm_i->free_nid_list_lock);
+	}
+	f2fs_bug_on(sbi, nm_i->fcnt);
+	spin_unlock(&nm_i->free_nid_list_lock);
+
+	/* destroy nat cache */
+	down_write(&nm_i->nat_tree_lock);
+	while ((found = __gang_lookup_nat_cache(nm_i,
+					nid, NATVEC_SIZE, natvec))) {
+		unsigned idx;
+
+		nid = nat_get_nid(natvec[found - 1]) + 1;
+		for (idx = 0; idx < found; idx++)
+			__del_from_nat_cache(nm_i, natvec[idx]);
+	}
+	f2fs_bug_on(sbi, nm_i->nat_cnt);
+
+	/* destroy nat set cache */
+	nid = 0;
+	while ((found = __gang_lookup_nat_set(nm_i,
+					nid, SETVEC_SIZE, setvec))) {
+		unsigned idx;
+
+		nid = setvec[found - 1]->set + 1;
+		for (idx = 0; idx < found; idx++) {
+			/* entry_cnt is not zero, when cp_error was occurred */
+			f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
+			radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
+			kmem_cache_free(nat_entry_set_slab, setvec[idx]);
+		}
+	}
+	up_write(&nm_i->nat_tree_lock);
+
+	kfree(nm_i->nat_bitmap);
+	sbi->nm_info = NULL;
+	kfree(nm_i);
+}
+
+int __init create_node_manager_caches(void)
+{
+	nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
+			sizeof(struct nat_entry));
+	if (!nat_entry_slab)
+		goto fail;
+
+	free_nid_slab = f2fs_kmem_cache_create("free_nid",
+			sizeof(struct free_nid));
+	if (!free_nid_slab)
+		goto destroy_nat_entry;
+
+	nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
+			sizeof(struct nat_entry_set));
+	if (!nat_entry_set_slab)
+		goto destroy_free_nid;
+	return 0;
+
+destroy_free_nid:
+	kmem_cache_destroy(free_nid_slab);
+destroy_nat_entry:
+	kmem_cache_destroy(nat_entry_slab);
+fail:
+	return -ENOMEM;
+}
+
+void destroy_node_manager_caches(void)
+{
+	kmem_cache_destroy(nat_entry_set_slab);
+	kmem_cache_destroy(free_nid_slab);
+	kmem_cache_destroy(nat_entry_slab);
+}
diff --git a/kernel/fs/f2fs/node.h b/kernel/fs/f2fs/node.h
new file mode 100644
index 000000000..c56026f17
--- /dev/null
+++ b/kernel/fs/f2fs/node.h
@@ -0,0 +1,416 @@
+/*
+ * fs/f2fs/node.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+/* start node id of a node block dedicated to the given node id */
+#define	START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+
+/* node block offset on the NAT area dedicated to the given start node id */
+#define	NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
+
+/* # of pages to perform readahead before building free nids */
+#define FREE_NID_PAGES 4
+
+/* maximum readahead size for node during getting data blocks */
+#define MAX_RA_NODE		128
+
+/* control the memory footprint threshold (10MB per 1GB ram) */
+#define DEF_RAM_THRESHOLD	10
+
+/* vector size for gang look-up from nat cache that consists of radix tree */
+#define NATVEC_SIZE	64
+#define SETVEC_SIZE	32
+
+/* return value for read_node_page */
+#define LOCKED_PAGE	1
+
+/* For flag in struct node_info */
+enum {
+	IS_CHECKPOINTED,	/* is it checkpointed before? */
+	HAS_FSYNCED_INODE,	/* is the inode fsynced before? */
+	HAS_LAST_FSYNC,		/* has the latest node fsync mark? */
+	IS_DIRTY,		/* this nat entry is dirty? */
+};
+
+/*
+ * For node information
+ */
+struct node_info {
+	nid_t nid;		/* node id */
+	nid_t ino;		/* inode number of the node's owner */
+	block_t	blk_addr;	/* block address of the node */
+	unsigned char version;	/* version of the node */
+	unsigned char flag;	/* for node information bits */
+};
+
+struct nat_entry {
+	struct list_head list;	/* for clean or dirty nat list */
+	struct node_info ni;	/* in-memory node information */
+};
+
+#define nat_get_nid(nat)		(nat->ni.nid)
+#define nat_set_nid(nat, n)		(nat->ni.nid = n)
+#define nat_get_blkaddr(nat)		(nat->ni.blk_addr)
+#define nat_set_blkaddr(nat, b)		(nat->ni.blk_addr = b)
+#define nat_get_ino(nat)		(nat->ni.ino)
+#define nat_set_ino(nat, i)		(nat->ni.ino = i)
+#define nat_get_version(nat)		(nat->ni.version)
+#define nat_set_version(nat, v)		(nat->ni.version = v)
+
+#define inc_node_version(version)	(++version)
+
+static inline void copy_node_info(struct node_info *dst,
+						struct node_info *src)
+{
+	dst->nid = src->nid;
+	dst->ino = src->ino;
+	dst->blk_addr = src->blk_addr;
+	dst->version = src->version;
+	/* should not copy flag here */
+}
+
+static inline void set_nat_flag(struct nat_entry *ne,
+				unsigned int type, bool set)
+{
+	unsigned char mask = 0x01 << type;
+	if (set)
+		ne->ni.flag |= mask;
+	else
+		ne->ni.flag &= ~mask;
+}
+
+static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
+{
+	unsigned char mask = 0x01 << type;
+	return ne->ni.flag & mask;
+}
+
+static inline void nat_reset_flag(struct nat_entry *ne)
+{
+	/* these states can be set only after checkpoint was done */
+	set_nat_flag(ne, IS_CHECKPOINTED, true);
+	set_nat_flag(ne, HAS_FSYNCED_INODE, false);
+	set_nat_flag(ne, HAS_LAST_FSYNC, true);
+}
+
+static inline void node_info_from_raw_nat(struct node_info *ni,
+						struct f2fs_nat_entry *raw_ne)
+{
+	ni->ino = le32_to_cpu(raw_ne->ino);
+	ni->blk_addr = le32_to_cpu(raw_ne->block_addr);
+	ni->version = raw_ne->version;
+}
+
+static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
+						struct node_info *ni)
+{
+	raw_ne->ino = cpu_to_le32(ni->ino);
+	raw_ne->block_addr = cpu_to_le32(ni->blk_addr);
+	raw_ne->version = ni->version;
+}
+
+enum mem_type {
+	FREE_NIDS,	/* indicates the free nid list */
+	NAT_ENTRIES,	/* indicates the cached nat entry */
+	DIRTY_DENTS,	/* indicates dirty dentry pages */
+	INO_ENTRIES,	/* indicates inode entries */
+	EXTENT_CACHE,	/* indicates extent cache */
+	BASE_CHECK,	/* check kernel status */
+};
+
+struct nat_entry_set {
+	struct list_head set_list;	/* link with other nat sets */
+	struct list_head entry_list;	/* link with dirty nat entries */
+	nid_t set;			/* set number*/
+	unsigned int entry_cnt;		/* the # of nat entries in set */
+};
+
+/*
+ * For free nid mangement
+ */
+enum nid_state {
+	NID_NEW,	/* newly added to free nid list */
+	NID_ALLOC	/* it is allocated */
+};
+
+struct free_nid {
+	struct list_head list;	/* for free node id list */
+	nid_t nid;		/* node id */
+	int state;		/* in use or not: NID_NEW or NID_ALLOC */
+};
+
+static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *fnid;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	if (nm_i->fcnt <= 0) {
+		spin_unlock(&nm_i->free_nid_list_lock);
+		return;
+	}
+	fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
+	*nid = fnid->nid;
+	spin_unlock(&nm_i->free_nid_list_lock);
+}
+
+/*
+ * inline functions
+ */
+static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size);
+}
+
+static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	pgoff_t block_off;
+	pgoff_t block_addr;
+	int seg_off;
+
+	block_off = NAT_BLOCK_OFFSET(start);
+	seg_off = block_off >> sbi->log_blocks_per_seg;
+
+	block_addr = (pgoff_t)(nm_i->nat_blkaddr +
+		(seg_off << sbi->log_blocks_per_seg << 1) +
+		(block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+
+	if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
+		block_addr += sbi->blocks_per_seg;
+
+	return block_addr;
+}
+
+static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
+						pgoff_t block_addr)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	block_addr -= nm_i->nat_blkaddr;
+	if ((block_addr >> sbi->log_blocks_per_seg) % 2)
+		block_addr -= sbi->blocks_per_seg;
+	else
+		block_addr += sbi->blocks_per_seg;
+
+	return block_addr + nm_i->nat_blkaddr;
+}
+
+static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
+{
+	unsigned int block_off = NAT_BLOCK_OFFSET(start_nid);
+
+	f2fs_change_bit(block_off, nm_i->nat_bitmap);
+}
+
+static inline void fill_node_footer(struct page *page, nid_t nid,
+				nid_t ino, unsigned int ofs, bool reset)
+{
+	struct f2fs_node *rn = F2FS_NODE(page);
+	unsigned int old_flag = 0;
+
+	if (reset)
+		memset(rn, 0, sizeof(*rn));
+	else
+		old_flag = le32_to_cpu(rn->footer.flag);
+
+	rn->footer.nid = cpu_to_le32(nid);
+	rn->footer.ino = cpu_to_le32(ino);
+
+	/* should remain old flag bits such as COLD_BIT_SHIFT */
+	rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) |
+					(old_flag & OFFSET_BIT_MASK));
+}
+
+static inline void copy_node_footer(struct page *dst, struct page *src)
+{
+	struct f2fs_node *src_rn = F2FS_NODE(src);
+	struct f2fs_node *dst_rn = F2FS_NODE(dst);
+	memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
+}
+
+static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
+	struct f2fs_node *rn = F2FS_NODE(page);
+
+	rn->footer.cp_ver = ckpt->checkpoint_ver;
+	rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
+}
+
+static inline nid_t ino_of_node(struct page *node_page)
+{
+	struct f2fs_node *rn = F2FS_NODE(node_page);
+	return le32_to_cpu(rn->footer.ino);
+}
+
+static inline nid_t nid_of_node(struct page *node_page)
+{
+	struct f2fs_node *rn = F2FS_NODE(node_page);
+	return le32_to_cpu(rn->footer.nid);
+}
+
+static inline unsigned int ofs_of_node(struct page *node_page)
+{
+	struct f2fs_node *rn = F2FS_NODE(node_page);
+	unsigned flag = le32_to_cpu(rn->footer.flag);
+	return flag >> OFFSET_BIT_SHIFT;
+}
+
+static inline unsigned long long cpver_of_node(struct page *node_page)
+{
+	struct f2fs_node *rn = F2FS_NODE(node_page);
+	return le64_to_cpu(rn->footer.cp_ver);
+}
+
+static inline block_t next_blkaddr_of_node(struct page *node_page)
+{
+	struct f2fs_node *rn = F2FS_NODE(node_page);
+	return le32_to_cpu(rn->footer.next_blkaddr);
+}
+
+/*
+ * f2fs assigns the following node offsets described as (num).
+ * N = NIDS_PER_BLOCK
+ *
+ *  Inode block (0)
+ *    |- direct node (1)
+ *    |- direct node (2)
+ *    |- indirect node (3)
+ *    |            `- direct node (4 => 4 + N - 1)
+ *    |- indirect node (4 + N)
+ *    |            `- direct node (5 + N => 5 + 2N - 1)
+ *    `- double indirect node (5 + 2N)
+ *                 `- indirect node (6 + 2N)
+ *                       `- direct node
+ *                 ......
+ *                 `- indirect node ((6 + 2N) + x(N + 1))
+ *                       `- direct node
+ *                 ......
+ *                 `- indirect node ((6 + 2N) + (N - 1)(N + 1))
+ *                       `- direct node
+ */
+static inline bool IS_DNODE(struct page *node_page)
+{
+	unsigned int ofs = ofs_of_node(node_page);
+
+	if (f2fs_has_xattr_block(ofs))
+		return false;
+
+	if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
+			ofs == 5 + 2 * NIDS_PER_BLOCK)
+		return false;
+	if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
+		ofs -= 6 + 2 * NIDS_PER_BLOCK;
+		if (!((long int)ofs % (NIDS_PER_BLOCK + 1)))
+			return false;
+	}
+	return true;
+}
+
+static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+{
+	struct f2fs_node *rn = F2FS_NODE(p);
+
+	f2fs_wait_on_page_writeback(p, NODE);
+
+	if (i)
+		rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
+	else
+		rn->in.nid[off] = cpu_to_le32(nid);
+	set_page_dirty(p);
+}
+
+static inline nid_t get_nid(struct page *p, int off, bool i)
+{
+	struct f2fs_node *rn = F2FS_NODE(p);
+
+	if (i)
+		return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
+	return le32_to_cpu(rn->in.nid[off]);
+}
+
+/*
+ * Coldness identification:
+ *  - Mark cold files in f2fs_inode_info
+ *  - Mark cold node blocks in their node footer
+ *  - Mark cold data pages in page cache
+ */
+static inline int is_file(struct inode *inode, int type)
+{
+	return F2FS_I(inode)->i_advise & type;
+}
+
+static inline void set_file(struct inode *inode, int type)
+{
+	F2FS_I(inode)->i_advise |= type;
+}
+
+static inline void clear_file(struct inode *inode, int type)
+{
+	F2FS_I(inode)->i_advise &= ~type;
+}
+
+#define file_is_cold(inode)	is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode)	is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode)	set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode)	set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode)	clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode)	clear_file(inode, FADVISE_LOST_PINO_BIT)
+
+static inline int is_cold_data(struct page *page)
+{
+	return PageChecked(page);
+}
+
+static inline void set_cold_data(struct page *page)
+{
+	SetPageChecked(page);
+}
+
+static inline void clear_cold_data(struct page *page)
+{
+	ClearPageChecked(page);
+}
+
+static inline int is_node(struct page *page, int type)
+{
+	struct f2fs_node *rn = F2FS_NODE(page);
+	return le32_to_cpu(rn->footer.flag) & (1 << type);
+}
+
+#define is_cold_node(page)	is_node(page, COLD_BIT_SHIFT)
+#define is_fsync_dnode(page)	is_node(page, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(page)	is_node(page, DENT_BIT_SHIFT)
+
+static inline void set_cold_node(struct inode *inode, struct page *page)
+{
+	struct f2fs_node *rn = F2FS_NODE(page);
+	unsigned int flag = le32_to_cpu(rn->footer.flag);
+
+	if (S_ISDIR(inode->i_mode))
+		flag &= ~(0x1 << COLD_BIT_SHIFT);
+	else
+		flag |= (0x1 << COLD_BIT_SHIFT);
+	rn->footer.flag = cpu_to_le32(flag);
+}
+
+static inline void set_mark(struct page *page, int mark, int type)
+{
+	struct f2fs_node *rn = F2FS_NODE(page);
+	unsigned int flag = le32_to_cpu(rn->footer.flag);
+	if (mark)
+		flag |= (0x1 << type);
+	else
+		flag &= ~(0x1 << type);
+	rn->footer.flag = cpu_to_le32(flag);
+}
+#define set_dentry_mark(page, mark)	set_mark(page, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(page, mark)	set_mark(page, mark, FSYNC_BIT_SHIFT)
diff --git a/kernel/fs/f2fs/recovery.c b/kernel/fs/f2fs/recovery.c
new file mode 100644
index 000000000..8d8ea99f2
--- /dev/null
+++ b/kernel/fs/f2fs/recovery.c
@@ -0,0 +1,575 @@
+/*
+ * fs/f2fs/recovery.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+/*
+ * Roll forward recovery scenarios.
+ *
+ * [Term] F: fsync_mark, D: dentry_mark
+ *
+ * 1. inode(x) | CP | inode(x) | dnode(F)
+ * -> Update the latest inode(x).
+ *
+ * 2. inode(x) | CP | inode(F) | dnode(F)
+ * -> No problem.
+ *
+ * 3. inode(x) | CP | dnode(F) | inode(x)
+ * -> Recover to the latest dnode(F), and drop the last inode(x)
+ *
+ * 4. inode(x) | CP | dnode(F) | inode(F)
+ * -> No problem.
+ *
+ * 5. CP | inode(x) | dnode(F)
+ * -> The inode(DF) was missing. Should drop this dnode(F).
+ *
+ * 6. CP | inode(DF) | dnode(F)
+ * -> No problem.
+ *
+ * 7. CP | dnode(F) | inode(DF)
+ * -> If f2fs_iget fails, then goto next to find inode(DF).
+ *
+ * 8. CP | dnode(F) | inode(x)
+ * -> If f2fs_iget fails, then goto next to find inode(DF).
+ *    But it will fail due to no inode(DF).
+ */
+
+static struct kmem_cache *fsync_entry_slab;
+
+bool space_for_roll_forward(struct f2fs_sb_info *sbi)
+{
+	if (sbi->last_valid_block_count + sbi->alloc_valid_block_count
+			> sbi->user_block_count)
+		return false;
+	return true;
+}
+
+static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
+								nid_t ino)
+{
+	struct fsync_inode_entry *entry;
+
+	list_for_each_entry(entry, head, list)
+		if (entry->inode->i_ino == ino)
+			return entry;
+
+	return NULL;
+}
+
+static int recover_dentry(struct inode *inode, struct page *ipage)
+{
+	struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
+	nid_t pino = le32_to_cpu(raw_inode->i_pino);
+	struct f2fs_dir_entry *de;
+	struct qstr name;
+	struct page *page;
+	struct inode *dir, *einode;
+	int err = 0;
+
+	dir = f2fs_iget(inode->i_sb, pino);
+	if (IS_ERR(dir)) {
+		err = PTR_ERR(dir);
+		goto out;
+	}
+
+	name.len = le32_to_cpu(raw_inode->i_namelen);
+	name.name = raw_inode->i_name;
+
+	if (unlikely(name.len > F2FS_NAME_LEN)) {
+		WARN_ON(1);
+		err = -ENAMETOOLONG;
+		goto out_err;
+	}
+retry:
+	de = f2fs_find_entry(dir, &name, &page);
+	if (de && inode->i_ino == le32_to_cpu(de->ino))
+		goto out_unmap_put;
+
+	if (de) {
+		einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+		if (IS_ERR(einode)) {
+			WARN_ON(1);
+			err = PTR_ERR(einode);
+			if (err == -ENOENT)
+				err = -EEXIST;
+			goto out_unmap_put;
+		}
+		err = acquire_orphan_inode(F2FS_I_SB(inode));
+		if (err) {
+			iput(einode);
+			goto out_unmap_put;
+		}
+		f2fs_delete_entry(de, page, dir, einode);
+		iput(einode);
+		goto retry;
+	}
+	err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode);
+	if (err)
+		goto out_err;
+
+	if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) {
+		iput(dir);
+	} else {
+		add_dirty_dir_inode(dir);
+		set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+	}
+
+	goto out;
+
+out_unmap_put:
+	f2fs_dentry_kunmap(dir, page);
+	f2fs_put_page(page, 0);
+out_err:
+	iput(dir);
+out:
+	f2fs_msg(inode->i_sb, KERN_NOTICE,
+			"%s: ino = %x, name = %s, dir = %lx, err = %d",
+			__func__, ino_of_node(ipage), raw_inode->i_name,
+			IS_ERR(dir) ? 0 : dir->i_ino, err);
+	return err;
+}
+
+static void recover_inode(struct inode *inode, struct page *page)
+{
+	struct f2fs_inode *raw = F2FS_INODE(page);
+
+	inode->i_mode = le16_to_cpu(raw->i_mode);
+	i_size_write(inode, le64_to_cpu(raw->i_size));
+	inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
+	inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
+	inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
+	inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+	inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
+	inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+
+	f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
+			ino_of_node(page), F2FS_INODE(page)->i_name);
+}
+
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+{
+	unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
+	struct curseg_info *curseg;
+	struct page *page = NULL;
+	block_t blkaddr;
+	int err = 0;
+
+	/* get node pages in the current segment */
+	curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
+	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+
+	ra_meta_pages(sbi, blkaddr, 1, META_POR);
+
+	while (1) {
+		struct fsync_inode_entry *entry;
+
+		if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+			return 0;
+
+		page = get_meta_page(sbi, blkaddr);
+
+		if (cp_ver != cpver_of_node(page))
+			break;
+
+		if (!is_fsync_dnode(page))
+			goto next;
+
+		entry = get_fsync_inode(head, ino_of_node(page));
+		if (!entry) {
+			if (IS_INODE(page) && is_dent_dnode(page)) {
+				err = recover_inode_page(sbi, page);
+				if (err)
+					break;
+			}
+
+			/* add this fsync inode to the list */
+			entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
+			if (!entry) {
+				err = -ENOMEM;
+				break;
+			}
+			/*
+			 * CP | dnode(F) | inode(DF)
+			 * For this case, we should not give up now.
+			 */
+			entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
+			if (IS_ERR(entry->inode)) {
+				err = PTR_ERR(entry->inode);
+				kmem_cache_free(fsync_entry_slab, entry);
+				if (err == -ENOENT) {
+					err = 0;
+					goto next;
+				}
+				break;
+			}
+			list_add_tail(&entry->list, head);
+		}
+		entry->blkaddr = blkaddr;
+
+		if (IS_INODE(page)) {
+			entry->last_inode = blkaddr;
+			if (is_dent_dnode(page))
+				entry->last_dentry = blkaddr;
+		}
+next:
+		/* check next segment */
+		blkaddr = next_blkaddr_of_node(page);
+		f2fs_put_page(page, 1);
+
+		ra_meta_pages_cond(sbi, blkaddr);
+	}
+	f2fs_put_page(page, 1);
+	return err;
+}
+
+static void destroy_fsync_dnodes(struct list_head *head)
+{
+	struct fsync_inode_entry *entry, *tmp;
+
+	list_for_each_entry_safe(entry, tmp, head, list) {
+		iput(entry->inode);
+		list_del(&entry->list);
+		kmem_cache_free(fsync_entry_slab, entry);
+	}
+}
+
+static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+			block_t blkaddr, struct dnode_of_data *dn)
+{
+	struct seg_entry *sentry;
+	unsigned int segno = GET_SEGNO(sbi, blkaddr);
+	unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+	struct f2fs_summary_block *sum_node;
+	struct f2fs_summary sum;
+	struct page *sum_page, *node_page;
+	struct dnode_of_data tdn = *dn;
+	nid_t ino, nid;
+	struct inode *inode;
+	unsigned int offset;
+	block_t bidx;
+	int i;
+
+	sentry = get_seg_entry(sbi, segno);
+	if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
+		return 0;
+
+	/* Get the previous summary */
+	for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
+		struct curseg_info *curseg = CURSEG_I(sbi, i);
+		if (curseg->segno == segno) {
+			sum = curseg->sum_blk->entries[blkoff];
+			goto got_it;
+		}
+	}
+
+	sum_page = get_sum_page(sbi, segno);
+	sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+	sum = sum_node->entries[blkoff];
+	f2fs_put_page(sum_page, 1);
+got_it:
+	/* Use the locked dnode page and inode */
+	nid = le32_to_cpu(sum.nid);
+	if (dn->inode->i_ino == nid) {
+		tdn.nid = nid;
+		if (!dn->inode_page_locked)
+			lock_page(dn->inode_page);
+		tdn.node_page = dn->inode_page;
+		tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+		goto truncate_out;
+	} else if (dn->nid == nid) {
+		tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+		goto truncate_out;
+	}
+
+	/* Get the node page */
+	node_page = get_node_page(sbi, nid);
+	if (IS_ERR(node_page))
+		return PTR_ERR(node_page);
+
+	offset = ofs_of_node(node_page);
+	ino = ino_of_node(node_page);
+	f2fs_put_page(node_page, 1);
+
+	if (ino != dn->inode->i_ino) {
+		/* Deallocate previous index in the node page */
+		inode = f2fs_iget(sbi->sb, ino);
+		if (IS_ERR(inode))
+			return PTR_ERR(inode);
+	} else {
+		inode = dn->inode;
+	}
+
+	bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
+			le16_to_cpu(sum.ofs_in_node);
+
+	/*
+	 * if inode page is locked, unlock temporarily, but its reference
+	 * count keeps alive.
+	 */
+	if (ino == dn->inode->i_ino && dn->inode_page_locked)
+		unlock_page(dn->inode_page);
+
+	set_new_dnode(&tdn, inode, NULL, NULL, 0);
+	if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
+		goto out;
+
+	if (tdn.data_blkaddr == blkaddr)
+		truncate_data_blocks_range(&tdn, 1);
+
+	f2fs_put_dnode(&tdn);
+out:
+	if (ino != dn->inode->i_ino)
+		iput(inode);
+	else if (dn->inode_page_locked)
+		lock_page(dn->inode_page);
+	return 0;
+
+truncate_out:
+	if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr)
+		truncate_data_blocks_range(&tdn, 1);
+	if (dn->inode->i_ino == nid && !dn->inode_page_locked)
+		unlock_page(dn->inode_page);
+	return 0;
+}
+
+static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
+					struct page *page, block_t blkaddr)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	unsigned int start, end;
+	struct dnode_of_data dn;
+	struct f2fs_summary sum;
+	struct node_info ni;
+	int err = 0, recovered = 0;
+
+	/* step 1: recover xattr */
+	if (IS_INODE(page)) {
+		recover_inline_xattr(inode, page);
+	} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
+		/*
+		 * Deprecated; xattr blocks should be found from cold log.
+		 * But, we should remain this for backward compatibility.
+		 */
+		recover_xattr_data(inode, page, blkaddr);
+		goto out;
+	}
+
+	/* step 2: recover inline data */
+	if (recover_inline_data(inode, page))
+		goto out;
+
+	/* step 3: recover data indices */
+	start = start_bidx_of_node(ofs_of_node(page), fi);
+	end = start + ADDRS_PER_PAGE(page, fi);
+
+	f2fs_lock_op(sbi);
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+
+	err = get_dnode_of_data(&dn, start, ALLOC_NODE);
+	if (err) {
+		f2fs_unlock_op(sbi);
+		goto out;
+	}
+
+	f2fs_wait_on_page_writeback(dn.node_page, NODE);
+
+	get_node_info(sbi, dn.nid, &ni);
+	f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
+	f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
+
+	for (; start < end; start++) {
+		block_t src, dest;
+
+		src = datablock_addr(dn.node_page, dn.ofs_in_node);
+		dest = datablock_addr(page, dn.ofs_in_node);
+
+		if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR &&
+			dest >= MAIN_BLKADDR(sbi) && dest < MAX_BLKADDR(sbi)) {
+
+			if (src == NULL_ADDR) {
+				err = reserve_new_block(&dn);
+				/* We should not get -ENOSPC */
+				f2fs_bug_on(sbi, err);
+			}
+
+			/* Check the previous node page having this index */
+			err = check_index_in_prev_nodes(sbi, dest, &dn);
+			if (err)
+				goto err;
+
+			set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+
+			/* write dummy data page */
+			recover_data_page(sbi, NULL, &sum, src, dest);
+			dn.data_blkaddr = dest;
+			set_data_blkaddr(&dn);
+			f2fs_update_extent_cache(&dn);
+			recovered++;
+		}
+		dn.ofs_in_node++;
+	}
+
+	if (IS_INODE(dn.node_page))
+		sync_inode_page(&dn);
+
+	copy_node_footer(dn.node_page, page);
+	fill_node_footer(dn.node_page, dn.nid, ni.ino,
+					ofs_of_node(page), false);
+	set_page_dirty(dn.node_page);
+err:
+	f2fs_put_dnode(&dn);
+	f2fs_unlock_op(sbi);
+out:
+	f2fs_msg(sbi->sb, KERN_NOTICE,
+		"recover_data: ino = %lx, recovered = %d blocks, err = %d",
+		inode->i_ino, recovered, err);
+	return err;
+}
+
+static int recover_data(struct f2fs_sb_info *sbi,
+				struct list_head *head, int type)
+{
+	unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
+	struct curseg_info *curseg;
+	struct page *page = NULL;
+	int err = 0;
+	block_t blkaddr;
+
+	/* get node pages in the current segment */
+	curseg = CURSEG_I(sbi, type);
+	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+
+	while (1) {
+		struct fsync_inode_entry *entry;
+
+		if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+			break;
+
+		ra_meta_pages_cond(sbi, blkaddr);
+
+		page = get_meta_page(sbi, blkaddr);
+
+		if (cp_ver != cpver_of_node(page)) {
+			f2fs_put_page(page, 1);
+			break;
+		}
+
+		entry = get_fsync_inode(head, ino_of_node(page));
+		if (!entry)
+			goto next;
+		/*
+		 * inode(x) | CP | inode(x) | dnode(F)
+		 * In this case, we can lose the latest inode(x).
+		 * So, call recover_inode for the inode update.
+		 */
+		if (entry->last_inode == blkaddr)
+			recover_inode(entry->inode, page);
+		if (entry->last_dentry == blkaddr) {
+			err = recover_dentry(entry->inode, page);
+			if (err) {
+				f2fs_put_page(page, 1);
+				break;
+			}
+		}
+		err = do_recover_data(sbi, entry->inode, page, blkaddr);
+		if (err) {
+			f2fs_put_page(page, 1);
+			break;
+		}
+
+		if (entry->blkaddr == blkaddr) {
+			iput(entry->inode);
+			list_del(&entry->list);
+			kmem_cache_free(fsync_entry_slab, entry);
+		}
+next:
+		/* check next segment */
+		blkaddr = next_blkaddr_of_node(page);
+		f2fs_put_page(page, 1);
+	}
+	if (!err)
+		allocate_new_segments(sbi);
+	return err;
+}
+
+int recover_fsync_data(struct f2fs_sb_info *sbi)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
+	struct list_head inode_list;
+	block_t blkaddr;
+	int err;
+	bool need_writecp = false;
+
+	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
+			sizeof(struct fsync_inode_entry));
+	if (!fsync_entry_slab)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&inode_list);
+
+	/* step #1: find fsynced inode numbers */
+	set_sbi_flag(sbi, SBI_POR_DOING);
+
+	/* prevent checkpoint */
+	mutex_lock(&sbi->cp_mutex);
+
+	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+
+	err = find_fsync_dnodes(sbi, &inode_list);
+	if (err)
+		goto out;
+
+	if (list_empty(&inode_list))
+		goto out;
+
+	need_writecp = true;
+
+	/* step #2: recover data */
+	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+	if (!err)
+		f2fs_bug_on(sbi, !list_empty(&inode_list));
+out:
+	destroy_fsync_dnodes(&inode_list);
+	kmem_cache_destroy(fsync_entry_slab);
+
+	/* truncate meta pages to be used by the recovery */
+	truncate_inode_pages_range(META_MAPPING(sbi),
+			MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+
+	if (err) {
+		truncate_inode_pages_final(NODE_MAPPING(sbi));
+		truncate_inode_pages_final(META_MAPPING(sbi));
+	}
+
+	clear_sbi_flag(sbi, SBI_POR_DOING);
+	if (err) {
+		discard_next_dnode(sbi, blkaddr);
+
+		/* Flush all the NAT/SIT pages */
+		while (get_pages(sbi, F2FS_DIRTY_META))
+			sync_meta_pages(sbi, META, LONG_MAX);
+		set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+		mutex_unlock(&sbi->cp_mutex);
+	} else if (need_writecp) {
+		struct cp_control cpc = {
+			.reason = CP_RECOVERY,
+		};
+		mutex_unlock(&sbi->cp_mutex);
+		write_checkpoint(sbi, &cpc);
+	} else {
+		mutex_unlock(&sbi->cp_mutex);
+	}
+	return err;
+}
diff --git a/kernel/fs/f2fs/segment.c b/kernel/fs/f2fs/segment.c
new file mode 100644
index 000000000..f93966094
--- /dev/null
+++ b/kernel/fs/f2fs/segment.c
@@ -0,0 +1,2307 @@
+/*
+ * fs/f2fs/segment.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/prefetch.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+
+#include "f2fs.h"
+#include "segment.h"
+#include "node.h"
+#include "trace.h"
+#include <trace/events/f2fs.h>
+
+#define __reverse_ffz(x) __reverse_ffs(~(x))
+
+static struct kmem_cache *discard_entry_slab;
+static struct kmem_cache *sit_entry_set_slab;
+static struct kmem_cache *inmem_entry_slab;
+
+/*
+ * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
+ * MSB and LSB are reversed in a byte by f2fs_set_bit.
+ */
+static inline unsigned long __reverse_ffs(unsigned long word)
+{
+	int num = 0;
+
+#if BITS_PER_LONG == 64
+	if ((word & 0xffffffff) == 0) {
+		num += 32;
+		word >>= 32;
+	}
+#endif
+	if ((word & 0xffff) == 0) {
+		num += 16;
+		word >>= 16;
+	}
+	if ((word & 0xff) == 0) {
+		num += 8;
+		word >>= 8;
+	}
+	if ((word & 0xf0) == 0)
+		num += 4;
+	else
+		word >>= 4;
+	if ((word & 0xc) == 0)
+		num += 2;
+	else
+		word >>= 2;
+	if ((word & 0x2) == 0)
+		num += 1;
+	return num;
+}
+
+/*
+ * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
+ * f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * Example:
+ *                             LSB <--> MSB
+ *   f2fs_set_bit(0, bitmap) => 0000 0001
+ *   f2fs_set_bit(7, bitmap) => 1000 0000
+ */
+static unsigned long __find_rev_next_bit(const unsigned long *addr,
+			unsigned long size, unsigned long offset)
+{
+	const unsigned long *p = addr + BIT_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG - 1);
+	unsigned long tmp;
+	unsigned long mask, submask;
+	unsigned long quot, rest;
+
+	if (offset >= size)
+		return size;
+
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (!offset)
+		goto aligned;
+
+	tmp = *(p++);
+	quot = (offset >> 3) << 3;
+	rest = offset & 0x7;
+	mask = ~0UL << quot;
+	submask = (unsigned char)(0xff << rest) >> rest;
+	submask <<= quot;
+	mask &= submask;
+	tmp &= mask;
+	if (size < BITS_PER_LONG)
+		goto found_first;
+	if (tmp)
+		goto found_middle;
+
+	size -= BITS_PER_LONG;
+	result += BITS_PER_LONG;
+aligned:
+	while (size & ~(BITS_PER_LONG-1)) {
+		tmp = *(p++);
+		if (tmp)
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+found_first:
+	tmp &= (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;   /* Nope. */
+found_middle:
+	return result + __reverse_ffs(tmp);
+}
+
+static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
+			unsigned long size, unsigned long offset)
+{
+	const unsigned long *p = addr + BIT_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG - 1);
+	unsigned long tmp;
+	unsigned long mask, submask;
+	unsigned long quot, rest;
+
+	if (offset >= size)
+		return size;
+
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (!offset)
+		goto aligned;
+
+	tmp = *(p++);
+	quot = (offset >> 3) << 3;
+	rest = offset & 0x7;
+	mask = ~(~0UL << quot);
+	submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
+	submask <<= quot;
+	mask += submask;
+	tmp |= mask;
+	if (size < BITS_PER_LONG)
+		goto found_first;
+	if (~tmp)
+		goto found_middle;
+
+	size -= BITS_PER_LONG;
+	result += BITS_PER_LONG;
+aligned:
+	while (size & ~(BITS_PER_LONG - 1)) {
+		tmp = *(p++);
+		if (~tmp)
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp |= ~0UL << size;
+	if (tmp == ~0UL)        /* Are any bits zero? */
+		return result + size;   /* Nope. */
+found_middle:
+	return result + __reverse_ffz(tmp);
+}
+
+void register_inmem_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct inmem_pages *new;
+	int err;
+
+	SetPagePrivate(page);
+	f2fs_trace_pid(page);
+
+	new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
+
+	/* add atomic page indices to the list */
+	new->page = page;
+	INIT_LIST_HEAD(&new->list);
+retry:
+	/* increase reference count with clean state */
+	mutex_lock(&fi->inmem_lock);
+	err = radix_tree_insert(&fi->inmem_root, page->index, new);
+	if (err == -EEXIST) {
+		mutex_unlock(&fi->inmem_lock);
+		kmem_cache_free(inmem_entry_slab, new);
+		return;
+	} else if (err) {
+		mutex_unlock(&fi->inmem_lock);
+		goto retry;
+	}
+	get_page(page);
+	list_add_tail(&new->list, &fi->inmem_pages);
+	inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+	mutex_unlock(&fi->inmem_lock);
+
+	trace_f2fs_register_inmem_page(page, INMEM);
+}
+
+void commit_inmem_pages(struct inode *inode, bool abort)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct inmem_pages *cur, *tmp;
+	bool submit_bio = false;
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = WRITE_SYNC | REQ_PRIO,
+	};
+
+	/*
+	 * The abort is true only when f2fs_evict_inode is called.
+	 * Basically, the f2fs_evict_inode doesn't produce any data writes, so
+	 * that we don't need to call f2fs_balance_fs.
+	 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
+	 * inode becomes free by iget_locked in f2fs_iget.
+	 */
+	if (!abort) {
+		f2fs_balance_fs(sbi);
+		f2fs_lock_op(sbi);
+	}
+
+	mutex_lock(&fi->inmem_lock);
+	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
+		if (!abort) {
+			lock_page(cur->page);
+			if (cur->page->mapping == inode->i_mapping) {
+				f2fs_wait_on_page_writeback(cur->page, DATA);
+				if (clear_page_dirty_for_io(cur->page))
+					inode_dec_dirty_pages(inode);
+				trace_f2fs_commit_inmem_page(cur->page, INMEM);
+				do_write_data_page(cur->page, &fio);
+				submit_bio = true;
+			}
+			f2fs_put_page(cur->page, 1);
+		} else {
+			trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
+			put_page(cur->page);
+		}
+		radix_tree_delete(&fi->inmem_root, cur->page->index);
+		list_del(&cur->list);
+		kmem_cache_free(inmem_entry_slab, cur);
+		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+	}
+	mutex_unlock(&fi->inmem_lock);
+
+	if (!abort) {
+		f2fs_unlock_op(sbi);
+		if (submit_bio)
+			f2fs_submit_merged_bio(sbi, DATA, WRITE);
+	}
+}
+
+/*
+ * This function balances dirty node and dentry pages.
+ * In addition, it controls garbage collection.
+ */
+void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+{
+	/*
+	 * We should do GC or end up with checkpoint, if there are so many dirty
+	 * dir/node pages without enough free segments.
+	 */
+	if (has_not_enough_free_secs(sbi, 0)) {
+		mutex_lock(&sbi->gc_mutex);
+		f2fs_gc(sbi);
+	}
+}
+
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
+{
+	/* try to shrink extent cache when there is no enough memory */
+	f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+
+	/* check the # of cached NAT entries and prefree segments */
+	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
+			excess_prefree_segs(sbi) ||
+			!available_free_memory(sbi, INO_ENTRIES))
+		f2fs_sync_fs(sbi->sb, true);
+}
+
+static int issue_flush_thread(void *data)
+{
+	struct f2fs_sb_info *sbi = data;
+	struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+	wait_queue_head_t *q = &fcc->flush_wait_queue;
+repeat:
+	if (kthread_should_stop())
+		return 0;
+
+	if (!llist_empty(&fcc->issue_list)) {
+		struct bio *bio = bio_alloc(GFP_NOIO, 0);
+		struct flush_cmd *cmd, *next;
+		int ret;
+
+		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
+		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
+
+		bio->bi_bdev = sbi->sb->s_bdev;
+		ret = submit_bio_wait(WRITE_FLUSH, bio);
+
+		llist_for_each_entry_safe(cmd, next,
+					  fcc->dispatch_list, llnode) {
+			cmd->ret = ret;
+			complete(&cmd->wait);
+		}
+		bio_put(bio);
+		fcc->dispatch_list = NULL;
+	}
+
+	wait_event_interruptible(*q,
+		kthread_should_stop() || !llist_empty(&fcc->issue_list));
+	goto repeat;
+}
+
+int f2fs_issue_flush(struct f2fs_sb_info *sbi)
+{
+	struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+	struct flush_cmd cmd;
+
+	trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER),
+					test_opt(sbi, FLUSH_MERGE));
+
+	if (test_opt(sbi, NOBARRIER))
+		return 0;
+
+	if (!test_opt(sbi, FLUSH_MERGE))
+		return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
+
+	init_completion(&cmd.wait);
+
+	llist_add(&cmd.llnode, &fcc->issue_list);
+
+	if (!fcc->dispatch_list)
+		wake_up(&fcc->flush_wait_queue);
+
+	wait_for_completion(&cmd.wait);
+
+	return cmd.ret;
+}
+
+int create_flush_cmd_control(struct f2fs_sb_info *sbi)
+{
+	dev_t dev = sbi->sb->s_bdev->bd_dev;
+	struct flush_cmd_control *fcc;
+	int err = 0;
+
+	fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
+	if (!fcc)
+		return -ENOMEM;
+	init_waitqueue_head(&fcc->flush_wait_queue);
+	init_llist_head(&fcc->issue_list);
+	SM_I(sbi)->cmd_control_info = fcc;
+	fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
+				"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
+	if (IS_ERR(fcc->f2fs_issue_flush)) {
+		err = PTR_ERR(fcc->f2fs_issue_flush);
+		kfree(fcc);
+		SM_I(sbi)->cmd_control_info = NULL;
+		return err;
+	}
+
+	return err;
+}
+
+void destroy_flush_cmd_control(struct f2fs_sb_info *sbi)
+{
+	struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+
+	if (fcc && fcc->f2fs_issue_flush)
+		kthread_stop(fcc->f2fs_issue_flush);
+	kfree(fcc);
+	SM_I(sbi)->cmd_control_info = NULL;
+}
+
+static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+		enum dirty_type dirty_type)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	/* need not be added */
+	if (IS_CURSEG(sbi, segno))
+		return;
+
+	if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+		dirty_i->nr_dirty[dirty_type]++;
+
+	if (dirty_type == DIRTY) {
+		struct seg_entry *sentry = get_seg_entry(sbi, segno);
+		enum dirty_type t = sentry->type;
+
+		if (unlikely(t >= DIRTY)) {
+			f2fs_bug_on(sbi, 1);
+			return;
+		}
+		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
+			dirty_i->nr_dirty[t]++;
+	}
+}
+
+static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+		enum dirty_type dirty_type)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+		dirty_i->nr_dirty[dirty_type]--;
+
+	if (dirty_type == DIRTY) {
+		struct seg_entry *sentry = get_seg_entry(sbi, segno);
+		enum dirty_type t = sentry->type;
+
+		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+			dirty_i->nr_dirty[t]--;
+
+		if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
+			clear_bit(GET_SECNO(sbi, segno),
+						dirty_i->victim_secmap);
+	}
+}
+
+/*
+ * Should not occur error such as -ENOMEM.
+ * Adding dirty entry into seglist is not critical operation.
+ * If a given segment is one of current working segments, it won't be added.
+ */
+static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned short valid_blocks;
+
+	if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
+		return;
+
+	mutex_lock(&dirty_i->seglist_lock);
+
+	valid_blocks = get_valid_blocks(sbi, segno, 0);
+
+	if (valid_blocks == 0) {
+		__locate_dirty_segment(sbi, segno, PRE);
+		__remove_dirty_segment(sbi, segno, DIRTY);
+	} else if (valid_blocks < sbi->blocks_per_seg) {
+		__locate_dirty_segment(sbi, segno, DIRTY);
+	} else {
+		/* Recovery routine with SSR needs this */
+		__remove_dirty_segment(sbi, segno, DIRTY);
+	}
+
+	mutex_unlock(&dirty_i->seglist_lock);
+}
+
+static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
+				block_t blkstart, block_t blklen)
+{
+	sector_t start = SECTOR_FROM_BLOCK(blkstart);
+	sector_t len = SECTOR_FROM_BLOCK(blklen);
+	trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
+	return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
+}
+
+void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+	if (f2fs_issue_discard(sbi, blkaddr, 1)) {
+		struct page *page = grab_meta_page(sbi, blkaddr);
+		/* zero-filled page */
+		set_page_dirty(page);
+		f2fs_put_page(page, 1);
+	}
+}
+
+static void __add_discard_entry(struct f2fs_sb_info *sbi,
+		struct cp_control *cpc, unsigned int start, unsigned int end)
+{
+	struct list_head *head = &SM_I(sbi)->discard_list;
+	struct discard_entry *new, *last;
+
+	if (!list_empty(head)) {
+		last = list_last_entry(head, struct discard_entry, list);
+		if (START_BLOCK(sbi, cpc->trim_start) + start ==
+						last->blkaddr + last->len) {
+			last->len += end - start;
+			goto done;
+		}
+	}
+
+	new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
+	INIT_LIST_HEAD(&new->list);
+	new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
+	new->len = end - start;
+	list_add_tail(&new->list, head);
+done:
+	SM_I(sbi)->nr_discards += end - start;
+	cpc->trimmed += end - start;
+}
+
+static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
+	int max_blocks = sbi->blocks_per_seg;
+	struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
+	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
+	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
+	unsigned long *dmap = SIT_I(sbi)->tmp_map;
+	unsigned int start = 0, end = -1;
+	bool force = (cpc->reason == CP_DISCARD);
+	int i;
+
+	if (!force && (!test_opt(sbi, DISCARD) ||
+			SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards))
+		return;
+
+	if (force && !se->valid_blocks) {
+		struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+		/*
+		 * if this segment is registered in the prefree list, then
+		 * we should skip adding a discard candidate, and let the
+		 * checkpoint do that later.
+		 */
+		mutex_lock(&dirty_i->seglist_lock);
+		if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
+			mutex_unlock(&dirty_i->seglist_lock);
+			cpc->trimmed += sbi->blocks_per_seg;
+			return;
+		}
+		mutex_unlock(&dirty_i->seglist_lock);
+
+		__add_discard_entry(sbi, cpc, 0, sbi->blocks_per_seg);
+		return;
+	}
+
+	/* zero block will be discarded through the prefree list */
+	if (!se->valid_blocks || se->valid_blocks == max_blocks)
+		return;
+
+	/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
+	for (i = 0; i < entries; i++)
+		dmap[i] = force ? ~ckpt_map[i] :
+				(cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
+
+	while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
+		start = __find_rev_next_bit(dmap, max_blocks, end + 1);
+		if (start >= max_blocks)
+			break;
+
+		end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
+
+		if (force && end - start < cpc->trim_minlen)
+			continue;
+
+		__add_discard_entry(sbi, cpc, start, end);
+	}
+}
+
+void release_discard_addrs(struct f2fs_sb_info *sbi)
+{
+	struct list_head *head = &(SM_I(sbi)->discard_list);
+	struct discard_entry *entry, *this;
+
+	/* drop caches */
+	list_for_each_entry_safe(entry, this, head, list) {
+		list_del(&entry->list);
+		kmem_cache_free(discard_entry_slab, entry);
+	}
+}
+
+/*
+ * Should call clear_prefree_segments after checkpoint is done.
+ */
+static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned int segno;
+
+	mutex_lock(&dirty_i->seglist_lock);
+	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
+		__set_test_and_free(sbi, segno);
+	mutex_unlock(&dirty_i->seglist_lock);
+}
+
+void clear_prefree_segments(struct f2fs_sb_info *sbi)
+{
+	struct list_head *head = &(SM_I(sbi)->discard_list);
+	struct discard_entry *entry, *this;
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
+	unsigned int start = 0, end = -1;
+
+	mutex_lock(&dirty_i->seglist_lock);
+
+	while (1) {
+		int i;
+		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
+		if (start >= MAIN_SEGS(sbi))
+			break;
+		end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
+								start + 1);
+
+		for (i = start; i < end; i++)
+			clear_bit(i, prefree_map);
+
+		dirty_i->nr_dirty[PRE] -= end - start;
+
+		if (!test_opt(sbi, DISCARD))
+			continue;
+
+		f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
+				(end - start) << sbi->log_blocks_per_seg);
+	}
+	mutex_unlock(&dirty_i->seglist_lock);
+
+	/* send small discards */
+	list_for_each_entry_safe(entry, this, head, list) {
+		f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
+		list_del(&entry->list);
+		SM_I(sbi)->nr_discards -= entry->len;
+		kmem_cache_free(discard_entry_slab, entry);
+	}
+}
+
+static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+
+	if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
+		sit_i->dirty_sentries++;
+		return false;
+	}
+
+	return true;
+}
+
+static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
+					unsigned int segno, int modified)
+{
+	struct seg_entry *se = get_seg_entry(sbi, segno);
+	se->type = type;
+	if (modified)
+		__mark_sit_entry_dirty(sbi, segno);
+}
+
+static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
+{
+	struct seg_entry *se;
+	unsigned int segno, offset;
+	long int new_vblocks;
+
+	segno = GET_SEGNO(sbi, blkaddr);
+
+	se = get_seg_entry(sbi, segno);
+	new_vblocks = se->valid_blocks + del;
+	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+	f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
+				(new_vblocks > sbi->blocks_per_seg)));
+
+	se->valid_blocks = new_vblocks;
+	se->mtime = get_mtime(sbi);
+	SIT_I(sbi)->max_mtime = se->mtime;
+
+	/* Update valid block bitmap */
+	if (del > 0) {
+		if (f2fs_test_and_set_bit(offset, se->cur_valid_map))
+			f2fs_bug_on(sbi, 1);
+	} else {
+		if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map))
+			f2fs_bug_on(sbi, 1);
+	}
+	if (!f2fs_test_bit(offset, se->ckpt_valid_map))
+		se->ckpt_valid_blocks += del;
+
+	__mark_sit_entry_dirty(sbi, segno);
+
+	/* update total number of valid blocks to be written in ckpt area */
+	SIT_I(sbi)->written_valid_blocks += del;
+
+	if (sbi->segs_per_sec > 1)
+		get_sec_entry(sbi, segno)->valid_blocks += del;
+}
+
+void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
+{
+	update_sit_entry(sbi, new, 1);
+	if (GET_SEGNO(sbi, old) != NULL_SEGNO)
+		update_sit_entry(sbi, old, -1);
+
+	locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
+	locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
+}
+
+void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
+{
+	unsigned int segno = GET_SEGNO(sbi, addr);
+	struct sit_info *sit_i = SIT_I(sbi);
+
+	f2fs_bug_on(sbi, addr == NULL_ADDR);
+	if (addr == NEW_ADDR)
+		return;
+
+	/* add it into sit main buffer */
+	mutex_lock(&sit_i->sentry_lock);
+
+	update_sit_entry(sbi, addr, -1);
+
+	/* add it into dirty seglist */
+	locate_dirty_segment(sbi, segno);
+
+	mutex_unlock(&sit_i->sentry_lock);
+}
+
+/*
+ * This function should be resided under the curseg_mutex lock
+ */
+static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
+					struct f2fs_summary *sum)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	void *addr = curseg->sum_blk;
+	addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
+	memcpy(addr, sum, sizeof(struct f2fs_summary));
+}
+
+/*
+ * Calculate the number of current summary pages for writing
+ */
+int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
+{
+	int valid_sum_count = 0;
+	int i, sum_in_page;
+
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+		if (sbi->ckpt->alloc_type[i] == SSR)
+			valid_sum_count += sbi->blocks_per_seg;
+		else {
+			if (for_ra)
+				valid_sum_count += le16_to_cpu(
+					F2FS_CKPT(sbi)->cur_data_blkoff[i]);
+			else
+				valid_sum_count += curseg_blkoff(sbi, i);
+		}
+	}
+
+	sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
+			SUM_FOOTER_SIZE) / SUMMARY_SIZE;
+	if (valid_sum_count <= sum_in_page)
+		return 1;
+	else if ((valid_sum_count - sum_in_page) <=
+		(PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
+		return 2;
+	return 3;
+}
+
+/*
+ * Caller should put this summary page
+ */
+struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
+}
+
+static void write_sum_page(struct f2fs_sb_info *sbi,
+			struct f2fs_summary_block *sum_blk, block_t blk_addr)
+{
+	struct page *page = grab_meta_page(sbi, blk_addr);
+	void *kaddr = page_address(page);
+	memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE);
+	set_page_dirty(page);
+	f2fs_put_page(page, 1);
+}
+
+static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int segno = curseg->segno + 1;
+	struct free_segmap_info *free_i = FREE_I(sbi);
+
+	if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
+		return !test_bit(segno, free_i->free_segmap);
+	return 0;
+}
+
+/*
+ * Find a new segment from the free segments bitmap to right order
+ * This function should be returned with success, otherwise BUG
+ */
+static void get_new_segment(struct f2fs_sb_info *sbi,
+			unsigned int *newseg, bool new_sec, int dir)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int segno, secno, zoneno;
+	unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
+	unsigned int hint = *newseg / sbi->segs_per_sec;
+	unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+	unsigned int left_start = hint;
+	bool init = true;
+	int go_left = 0;
+	int i;
+
+	spin_lock(&free_i->segmap_lock);
+
+	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
+		segno = find_next_zero_bit(free_i->free_segmap,
+					MAIN_SEGS(sbi), *newseg + 1);
+		if (segno - *newseg < sbi->segs_per_sec -
+					(*newseg % sbi->segs_per_sec))
+			goto got_it;
+	}
+find_other_zone:
+	secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
+	if (secno >= MAIN_SECS(sbi)) {
+		if (dir == ALLOC_RIGHT) {
+			secno = find_next_zero_bit(free_i->free_secmap,
+							MAIN_SECS(sbi), 0);
+			f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
+		} else {
+			go_left = 1;
+			left_start = hint - 1;
+		}
+	}
+	if (go_left == 0)
+		goto skip_left;
+
+	while (test_bit(left_start, free_i->free_secmap)) {
+		if (left_start > 0) {
+			left_start--;
+			continue;
+		}
+		left_start = find_next_zero_bit(free_i->free_secmap,
+							MAIN_SECS(sbi), 0);
+		f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
+		break;
+	}
+	secno = left_start;
+skip_left:
+	hint = secno;
+	segno = secno * sbi->segs_per_sec;
+	zoneno = secno / sbi->secs_per_zone;
+
+	/* give up on finding another zone */
+	if (!init)
+		goto got_it;
+	if (sbi->secs_per_zone == 1)
+		goto got_it;
+	if (zoneno == old_zoneno)
+		goto got_it;
+	if (dir == ALLOC_LEFT) {
+		if (!go_left && zoneno + 1 >= total_zones)
+			goto got_it;
+		if (go_left && zoneno == 0)
+			goto got_it;
+	}
+	for (i = 0; i < NR_CURSEG_TYPE; i++)
+		if (CURSEG_I(sbi, i)->zone == zoneno)
+			break;
+
+	if (i < NR_CURSEG_TYPE) {
+		/* zone is in user, try another */
+		if (go_left)
+			hint = zoneno * sbi->secs_per_zone - 1;
+		else if (zoneno + 1 >= total_zones)
+			hint = 0;
+		else
+			hint = (zoneno + 1) * sbi->secs_per_zone;
+		init = false;
+		goto find_other_zone;
+	}
+got_it:
+	/* set it as dirty segment in free segmap */
+	f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
+	__set_inuse(sbi, segno);
+	*newseg = segno;
+	spin_unlock(&free_i->segmap_lock);
+}
+
+static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	struct summary_footer *sum_footer;
+
+	curseg->segno = curseg->next_segno;
+	curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+	curseg->next_blkoff = 0;
+	curseg->next_segno = NULL_SEGNO;
+
+	sum_footer = &(curseg->sum_blk->footer);
+	memset(sum_footer, 0, sizeof(struct summary_footer));
+	if (IS_DATASEG(type))
+		SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
+	if (IS_NODESEG(type))
+		SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
+	__set_sit_entry_type(sbi, type, curseg->segno, modified);
+}
+
+/*
+ * Allocate a current working segment.
+ * This function always allocates a free segment in LFS manner.
+ */
+static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int segno = curseg->segno;
+	int dir = ALLOC_LEFT;
+
+	write_sum_page(sbi, curseg->sum_blk,
+				GET_SUM_BLOCK(sbi, segno));
+	if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
+		dir = ALLOC_RIGHT;
+
+	if (test_opt(sbi, NOHEAP))
+		dir = ALLOC_RIGHT;
+
+	get_new_segment(sbi, &segno, new_sec, dir);
+	curseg->next_segno = segno;
+	reset_curseg(sbi, type, 1);
+	curseg->alloc_type = LFS;
+}
+
+static void __next_free_blkoff(struct f2fs_sb_info *sbi,
+			struct curseg_info *seg, block_t start)
+{
+	struct seg_entry *se = get_seg_entry(sbi, seg->segno);
+	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
+	unsigned long *target_map = SIT_I(sbi)->tmp_map;
+	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
+	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
+	int i, pos;
+
+	for (i = 0; i < entries; i++)
+		target_map[i] = ckpt_map[i] | cur_map[i];
+
+	pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
+
+	seg->next_blkoff = pos;
+}
+
+/*
+ * If a segment is written by LFS manner, next block offset is just obtained
+ * by increasing the current block offset. However, if a segment is written by
+ * SSR manner, next block offset obtained by calling __next_free_blkoff
+ */
+static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
+				struct curseg_info *seg)
+{
+	if (seg->alloc_type == SSR)
+		__next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
+	else
+		seg->next_blkoff++;
+}
+
+/*
+ * This function always allocates a used segment(from dirty seglist) by SSR
+ * manner, so it should recover the existing segment information of valid blocks
+ */
+static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int new_segno = curseg->next_segno;
+	struct f2fs_summary_block *sum_node;
+	struct page *sum_page;
+
+	write_sum_page(sbi, curseg->sum_blk,
+				GET_SUM_BLOCK(sbi, curseg->segno));
+	__set_test_and_inuse(sbi, new_segno);
+
+	mutex_lock(&dirty_i->seglist_lock);
+	__remove_dirty_segment(sbi, new_segno, PRE);
+	__remove_dirty_segment(sbi, new_segno, DIRTY);
+	mutex_unlock(&dirty_i->seglist_lock);
+
+	reset_curseg(sbi, type, 1);
+	curseg->alloc_type = SSR;
+	__next_free_blkoff(sbi, curseg, 0);
+
+	if (reuse) {
+		sum_page = get_sum_page(sbi, new_segno);
+		sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+		memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
+		f2fs_put_page(sum_page, 1);
+	}
+}
+
+static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+
+	if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0))
+		return v_ops->get_victim(sbi,
+				&(curseg)->next_segno, BG_GC, type, SSR);
+
+	/* For data segments, let's do SSR more intensively */
+	for (; type >= CURSEG_HOT_DATA; type--)
+		if (v_ops->get_victim(sbi, &(curseg)->next_segno,
+						BG_GC, type, SSR))
+			return 1;
+	return 0;
+}
+
+/*
+ * flush out current segment and replace it with new segment
+ * This function should be returned with success, otherwise BUG
+ */
+static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
+						int type, bool force)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+
+	if (force)
+		new_curseg(sbi, type, true);
+	else if (type == CURSEG_WARM_NODE)
+		new_curseg(sbi, type, false);
+	else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
+		new_curseg(sbi, type, false);
+	else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
+		change_curseg(sbi, type, true);
+	else
+		new_curseg(sbi, type, false);
+
+	stat_inc_seg_type(sbi, curseg);
+}
+
+static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int old_segno;
+
+	old_segno = curseg->segno;
+	SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+	locate_dirty_segment(sbi, old_segno);
+}
+
+void allocate_new_segments(struct f2fs_sb_info *sbi)
+{
+	int i;
+
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
+		__allocate_new_segments(sbi, i);
+}
+
+static const struct segment_allocation default_salloc_ops = {
+	.allocate_segment = allocate_segment_by_default,
+};
+
+int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
+{
+	__u64 start = F2FS_BYTES_TO_BLK(range->start);
+	__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
+	unsigned int start_segno, end_segno;
+	struct cp_control cpc;
+
+	if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
+						range->len < sbi->blocksize)
+		return -EINVAL;
+
+	cpc.trimmed = 0;
+	if (end <= MAIN_BLKADDR(sbi))
+		goto out;
+
+	/* start/end segment number in main_area */
+	start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
+	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
+						GET_SEGNO(sbi, end);
+	cpc.reason = CP_DISCARD;
+	cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
+
+	/* do checkpoint to issue discard commands safely */
+	for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
+		cpc.trim_start = start_segno;
+		cpc.trim_end = min_t(unsigned int, rounddown(start_segno +
+				BATCHED_TRIM_SEGMENTS(sbi),
+				sbi->segs_per_sec) - 1, end_segno);
+
+		mutex_lock(&sbi->gc_mutex);
+		write_checkpoint(sbi, &cpc);
+		mutex_unlock(&sbi->gc_mutex);
+	}
+out:
+	range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
+	return 0;
+}
+
+static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	if (curseg->next_blkoff < sbi->blocks_per_seg)
+		return true;
+	return false;
+}
+
+static int __get_segment_type_2(struct page *page, enum page_type p_type)
+{
+	if (p_type == DATA)
+		return CURSEG_HOT_DATA;
+	else
+		return CURSEG_HOT_NODE;
+}
+
+static int __get_segment_type_4(struct page *page, enum page_type p_type)
+{
+	if (p_type == DATA) {
+		struct inode *inode = page->mapping->host;
+
+		if (S_ISDIR(inode->i_mode))
+			return CURSEG_HOT_DATA;
+		else
+			return CURSEG_COLD_DATA;
+	} else {
+		if (IS_DNODE(page) && is_cold_node(page))
+			return CURSEG_WARM_NODE;
+		else
+			return CURSEG_COLD_NODE;
+	}
+}
+
+static int __get_segment_type_6(struct page *page, enum page_type p_type)
+{
+	if (p_type == DATA) {
+		struct inode *inode = page->mapping->host;
+
+		if (S_ISDIR(inode->i_mode))
+			return CURSEG_HOT_DATA;
+		else if (is_cold_data(page) || file_is_cold(inode))
+			return CURSEG_COLD_DATA;
+		else
+			return CURSEG_WARM_DATA;
+	} else {
+		if (IS_DNODE(page))
+			return is_cold_node(page) ? CURSEG_WARM_NODE :
+						CURSEG_HOT_NODE;
+		else
+			return CURSEG_COLD_NODE;
+	}
+}
+
+static int __get_segment_type(struct page *page, enum page_type p_type)
+{
+	switch (F2FS_P_SB(page)->active_logs) {
+	case 2:
+		return __get_segment_type_2(page, p_type);
+	case 4:
+		return __get_segment_type_4(page, p_type);
+	}
+	/* NR_CURSEG_TYPE(6) logs by default */
+	f2fs_bug_on(F2FS_P_SB(page),
+		F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
+	return __get_segment_type_6(page, p_type);
+}
+
+void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+		block_t old_blkaddr, block_t *new_blkaddr,
+		struct f2fs_summary *sum, int type)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	struct curseg_info *curseg;
+	bool direct_io = (type == CURSEG_DIRECT_IO);
+
+	type = direct_io ? CURSEG_WARM_DATA : type;
+
+	curseg = CURSEG_I(sbi, type);
+
+	mutex_lock(&curseg->curseg_mutex);
+	mutex_lock(&sit_i->sentry_lock);
+
+	/* direct_io'ed data is aligned to the segment for better performance */
+	if (direct_io && curseg->next_blkoff)
+		__allocate_new_segments(sbi, type);
+
+	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+
+	/*
+	 * __add_sum_entry should be resided under the curseg_mutex
+	 * because, this function updates a summary entry in the
+	 * current summary block.
+	 */
+	__add_sum_entry(sbi, type, sum);
+
+	__refresh_next_blkoff(sbi, curseg);
+
+	stat_inc_block_count(sbi, curseg);
+
+	if (!__has_curseg_space(sbi, type))
+		sit_i->s_ops->allocate_segment(sbi, type, false);
+	/*
+	 * SIT information should be updated before segment allocation,
+	 * since SSR needs latest valid block information.
+	 */
+	refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
+
+	mutex_unlock(&sit_i->sentry_lock);
+
+	if (page && IS_NODESEG(type))
+		fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
+
+	mutex_unlock(&curseg->curseg_mutex);
+}
+
+static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
+			struct f2fs_summary *sum,
+			struct f2fs_io_info *fio)
+{
+	int type = __get_segment_type(page, fio->type);
+
+	allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type);
+
+	/* writeout dirty page into bdev */
+	f2fs_submit_page_mbio(sbi, page, fio);
+}
+
+void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
+{
+	struct f2fs_io_info fio = {
+		.type = META,
+		.rw = WRITE_SYNC | REQ_META | REQ_PRIO,
+		.blk_addr = page->index,
+	};
+
+	set_page_writeback(page);
+	f2fs_submit_page_mbio(sbi, page, &fio);
+}
+
+void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
+			unsigned int nid, struct f2fs_io_info *fio)
+{
+	struct f2fs_summary sum;
+	set_summary(&sum, nid, 0, 0);
+	do_write_page(sbi, page, &sum, fio);
+}
+
+void write_data_page(struct page *page, struct dnode_of_data *dn,
+				struct f2fs_io_info *fio)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct f2fs_summary sum;
+	struct node_info ni;
+
+	f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
+	get_node_info(sbi, dn->nid, &ni);
+	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
+	do_write_page(sbi, page, &sum, fio);
+	dn->data_blkaddr = fio->blk_addr;
+}
+
+void rewrite_data_page(struct page *page, struct f2fs_io_info *fio)
+{
+	stat_inc_inplace_blocks(F2FS_P_SB(page));
+	f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio);
+}
+
+void recover_data_page(struct f2fs_sb_info *sbi,
+			struct page *page, struct f2fs_summary *sum,
+			block_t old_blkaddr, block_t new_blkaddr)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	struct curseg_info *curseg;
+	unsigned int segno, old_cursegno;
+	struct seg_entry *se;
+	int type;
+
+	segno = GET_SEGNO(sbi, new_blkaddr);
+	se = get_seg_entry(sbi, segno);
+	type = se->type;
+
+	if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+		if (old_blkaddr == NULL_ADDR)
+			type = CURSEG_COLD_DATA;
+		else
+			type = CURSEG_WARM_DATA;
+	}
+	curseg = CURSEG_I(sbi, type);
+
+	mutex_lock(&curseg->curseg_mutex);
+	mutex_lock(&sit_i->sentry_lock);
+
+	old_cursegno = curseg->segno;
+
+	/* change the current segment */
+	if (segno != curseg->segno) {
+		curseg->next_segno = segno;
+		change_curseg(sbi, type, true);
+	}
+
+	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
+	__add_sum_entry(sbi, type, sum);
+
+	refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+	locate_dirty_segment(sbi, old_cursegno);
+
+	mutex_unlock(&sit_i->sentry_lock);
+	mutex_unlock(&curseg->curseg_mutex);
+}
+
+static inline bool is_merged_page(struct f2fs_sb_info *sbi,
+					struct page *page, enum page_type type)
+{
+	enum page_type btype = PAGE_TYPE_OF_BIO(type);
+	struct f2fs_bio_info *io = &sbi->write_io[btype];
+	struct bio_vec *bvec;
+	int i;
+
+	down_read(&io->io_rwsem);
+	if (!io->bio)
+		goto out;
+
+	bio_for_each_segment_all(bvec, io->bio, i) {
+		if (page == bvec->bv_page) {
+			up_read(&io->io_rwsem);
+			return true;
+		}
+	}
+
+out:
+	up_read(&io->io_rwsem);
+	return false;
+}
+
+void f2fs_wait_on_page_writeback(struct page *page,
+				enum page_type type)
+{
+	if (PageWriteback(page)) {
+		struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+
+		if (is_merged_page(sbi, page, type))
+			f2fs_submit_merged_bio(sbi, type, WRITE);
+		wait_on_page_writeback(page);
+	}
+}
+
+static int read_compacted_summaries(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	struct curseg_info *seg_i;
+	unsigned char *kaddr;
+	struct page *page;
+	block_t start;
+	int i, j, offset;
+
+	start = start_sum_block(sbi);
+
+	page = get_meta_page(sbi, start++);
+	kaddr = (unsigned char *)page_address(page);
+
+	/* Step 1: restore nat cache */
+	seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
+
+	/* Step 2: restore sit cache */
+	seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
+	memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
+						SUM_JOURNAL_SIZE);
+	offset = 2 * SUM_JOURNAL_SIZE;
+
+	/* Step 3: restore summary entries */
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+		unsigned short blk_off;
+		unsigned int segno;
+
+		seg_i = CURSEG_I(sbi, i);
+		segno = le32_to_cpu(ckpt->cur_data_segno[i]);
+		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]);
+		seg_i->next_segno = segno;
+		reset_curseg(sbi, i, 0);
+		seg_i->alloc_type = ckpt->alloc_type[i];
+		seg_i->next_blkoff = blk_off;
+
+		if (seg_i->alloc_type == SSR)
+			blk_off = sbi->blocks_per_seg;
+
+		for (j = 0; j < blk_off; j++) {
+			struct f2fs_summary *s;
+			s = (struct f2fs_summary *)(kaddr + offset);
+			seg_i->sum_blk->entries[j] = *s;
+			offset += SUMMARY_SIZE;
+			if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+						SUM_FOOTER_SIZE)
+				continue;
+
+			f2fs_put_page(page, 1);
+			page = NULL;
+
+			page = get_meta_page(sbi, start++);
+			kaddr = (unsigned char *)page_address(page);
+			offset = 0;
+		}
+	}
+	f2fs_put_page(page, 1);
+	return 0;
+}
+
+static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	struct f2fs_summary_block *sum;
+	struct curseg_info *curseg;
+	struct page *new;
+	unsigned short blk_off;
+	unsigned int segno = 0;
+	block_t blk_addr = 0;
+
+	/* get segment number and block addr */
+	if (IS_DATASEG(type)) {
+		segno = le32_to_cpu(ckpt->cur_data_segno[type]);
+		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
+							CURSEG_HOT_DATA]);
+		if (__exist_node_summaries(sbi))
+			blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
+		else
+			blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
+	} else {
+		segno = le32_to_cpu(ckpt->cur_node_segno[type -
+							CURSEG_HOT_NODE]);
+		blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
+							CURSEG_HOT_NODE]);
+		if (__exist_node_summaries(sbi))
+			blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
+							type - CURSEG_HOT_NODE);
+		else
+			blk_addr = GET_SUM_BLOCK(sbi, segno);
+	}
+
+	new = get_meta_page(sbi, blk_addr);
+	sum = (struct f2fs_summary_block *)page_address(new);
+
+	if (IS_NODESEG(type)) {
+		if (__exist_node_summaries(sbi)) {
+			struct f2fs_summary *ns = &sum->entries[0];
+			int i;
+			for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
+				ns->version = 0;
+				ns->ofs_in_node = 0;
+			}
+		} else {
+			int err;
+
+			err = restore_node_summary(sbi, segno, sum);
+			if (err) {
+				f2fs_put_page(new, 1);
+				return err;
+			}
+		}
+	}
+
+	/* set uncompleted segment to curseg */
+	curseg = CURSEG_I(sbi, type);
+	mutex_lock(&curseg->curseg_mutex);
+	memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
+	curseg->next_segno = segno;
+	reset_curseg(sbi, type, 0);
+	curseg->alloc_type = ckpt->alloc_type[type];
+	curseg->next_blkoff = blk_off;
+	mutex_unlock(&curseg->curseg_mutex);
+	f2fs_put_page(new, 1);
+	return 0;
+}
+
+static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
+{
+	int type = CURSEG_HOT_DATA;
+	int err;
+
+	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+		int npages = npages_for_summary_flush(sbi, true);
+
+		if (npages >= 2)
+			ra_meta_pages(sbi, start_sum_block(sbi), npages,
+								META_CP);
+
+		/* restore for compacted data summary */
+		if (read_compacted_summaries(sbi))
+			return -EINVAL;
+		type = CURSEG_HOT_NODE;
+	}
+
+	if (__exist_node_summaries(sbi))
+		ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
+					NR_CURSEG_TYPE - type, META_CP);
+
+	for (; type <= CURSEG_COLD_NODE; type++) {
+		err = read_normal_summaries(sbi, type);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+	struct page *page;
+	unsigned char *kaddr;
+	struct f2fs_summary *summary;
+	struct curseg_info *seg_i;
+	int written_size = 0;
+	int i, j;
+
+	page = grab_meta_page(sbi, blkaddr++);
+	kaddr = (unsigned char *)page_address(page);
+
+	/* Step 1: write nat cache */
+	seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
+	written_size += SUM_JOURNAL_SIZE;
+
+	/* Step 2: write sit cache */
+	seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
+	memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
+						SUM_JOURNAL_SIZE);
+	written_size += SUM_JOURNAL_SIZE;
+
+	/* Step 3: write summary entries */
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+		unsigned short blkoff;
+		seg_i = CURSEG_I(sbi, i);
+		if (sbi->ckpt->alloc_type[i] == SSR)
+			blkoff = sbi->blocks_per_seg;
+		else
+			blkoff = curseg_blkoff(sbi, i);
+
+		for (j = 0; j < blkoff; j++) {
+			if (!page) {
+				page = grab_meta_page(sbi, blkaddr++);
+				kaddr = (unsigned char *)page_address(page);
+				written_size = 0;
+			}
+			summary = (struct f2fs_summary *)(kaddr + written_size);
+			*summary = seg_i->sum_blk->entries[j];
+			written_size += SUMMARY_SIZE;
+
+			if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+							SUM_FOOTER_SIZE)
+				continue;
+
+			set_page_dirty(page);
+			f2fs_put_page(page, 1);
+			page = NULL;
+		}
+	}
+	if (page) {
+		set_page_dirty(page);
+		f2fs_put_page(page, 1);
+	}
+}
+
+static void write_normal_summaries(struct f2fs_sb_info *sbi,
+					block_t blkaddr, int type)
+{
+	int i, end;
+	if (IS_DATASEG(type))
+		end = type + NR_CURSEG_DATA_TYPE;
+	else
+		end = type + NR_CURSEG_NODE_TYPE;
+
+	for (i = type; i < end; i++) {
+		struct curseg_info *sum = CURSEG_I(sbi, i);
+		mutex_lock(&sum->curseg_mutex);
+		write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
+		mutex_unlock(&sum->curseg_mutex);
+	}
+}
+
+void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG))
+		write_compacted_summaries(sbi, start_blk);
+	else
+		write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
+}
+
+void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+	write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+}
+
+int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
+					unsigned int val, int alloc)
+{
+	int i;
+
+	if (type == NAT_JOURNAL) {
+		for (i = 0; i < nats_in_cursum(sum); i++) {
+			if (le32_to_cpu(nid_in_journal(sum, i)) == val)
+				return i;
+		}
+		if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
+			return update_nats_in_cursum(sum, 1);
+	} else if (type == SIT_JOURNAL) {
+		for (i = 0; i < sits_in_cursum(sum); i++)
+			if (le32_to_cpu(segno_in_journal(sum, i)) == val)
+				return i;
+		if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
+			return update_sits_in_cursum(sum, 1);
+	}
+	return -1;
+}
+
+static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
+					unsigned int segno)
+{
+	return get_meta_page(sbi, current_sit_addr(sbi, segno));
+}
+
+static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
+					unsigned int start)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	struct page *src_page, *dst_page;
+	pgoff_t src_off, dst_off;
+	void *src_addr, *dst_addr;
+
+	src_off = current_sit_addr(sbi, start);
+	dst_off = next_sit_addr(sbi, src_off);
+
+	/* get current sit block page without lock */
+	src_page = get_meta_page(sbi, src_off);
+	dst_page = grab_meta_page(sbi, dst_off);
+	f2fs_bug_on(sbi, PageDirty(src_page));
+
+	src_addr = page_address(src_page);
+	dst_addr = page_address(dst_page);
+	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+
+	set_page_dirty(dst_page);
+	f2fs_put_page(src_page, 1);
+
+	set_to_next_sit(sit_i, start);
+
+	return dst_page;
+}
+
+static struct sit_entry_set *grab_sit_entry_set(void)
+{
+	struct sit_entry_set *ses =
+			f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
+
+	ses->entry_cnt = 0;
+	INIT_LIST_HEAD(&ses->set_list);
+	return ses;
+}
+
+static void release_sit_entry_set(struct sit_entry_set *ses)
+{
+	list_del(&ses->set_list);
+	kmem_cache_free(sit_entry_set_slab, ses);
+}
+
+static void adjust_sit_entry_set(struct sit_entry_set *ses,
+						struct list_head *head)
+{
+	struct sit_entry_set *next = ses;
+
+	if (list_is_last(&ses->set_list, head))
+		return;
+
+	list_for_each_entry_continue(next, head, set_list)
+		if (ses->entry_cnt <= next->entry_cnt)
+			break;
+
+	list_move_tail(&ses->set_list, &next->set_list);
+}
+
+static void add_sit_entry(unsigned int segno, struct list_head *head)
+{
+	struct sit_entry_set *ses;
+	unsigned int start_segno = START_SEGNO(segno);
+
+	list_for_each_entry(ses, head, set_list) {
+		if (ses->start_segno == start_segno) {
+			ses->entry_cnt++;
+			adjust_sit_entry_set(ses, head);
+			return;
+		}
+	}
+
+	ses = grab_sit_entry_set();
+
+	ses->start_segno = start_segno;
+	ses->entry_cnt++;
+	list_add(&ses->set_list, head);
+}
+
+static void add_sits_in_set(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_sm_info *sm_info = SM_I(sbi);
+	struct list_head *set_list = &sm_info->sit_entry_set;
+	unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
+	unsigned int segno;
+
+	for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
+		add_sit_entry(segno, set_list);
+}
+
+static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	int i;
+
+	for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
+		unsigned int segno;
+		bool dirtied;
+
+		segno = le32_to_cpu(segno_in_journal(sum, i));
+		dirtied = __mark_sit_entry_dirty(sbi, segno);
+
+		if (!dirtied)
+			add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
+	}
+	update_sits_in_cursum(sum, -sits_in_cursum(sum));
+}
+
+/*
+ * CP calls this function, which flushes SIT entries including sit_journal,
+ * and moves prefree segs to free segs.
+ */
+void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	struct sit_entry_set *ses, *tmp;
+	struct list_head *head = &SM_I(sbi)->sit_entry_set;
+	bool to_journal = true;
+	struct seg_entry *se;
+
+	mutex_lock(&curseg->curseg_mutex);
+	mutex_lock(&sit_i->sentry_lock);
+
+	if (!sit_i->dirty_sentries)
+		goto out;
+
+	/*
+	 * add and account sit entries of dirty bitmap in sit entry
+	 * set temporarily
+	 */
+	add_sits_in_set(sbi);
+
+	/*
+	 * if there are no enough space in journal to store dirty sit
+	 * entries, remove all entries from journal and add and account
+	 * them in sit entry set.
+	 */
+	if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
+		remove_sits_in_journal(sbi);
+
+	/*
+	 * there are two steps to flush sit entries:
+	 * #1, flush sit entries to journal in current cold data summary block.
+	 * #2, flush sit entries to sit page.
+	 */
+	list_for_each_entry_safe(ses, tmp, head, set_list) {
+		struct page *page = NULL;
+		struct f2fs_sit_block *raw_sit = NULL;
+		unsigned int start_segno = ses->start_segno;
+		unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
+						(unsigned long)MAIN_SEGS(sbi));
+		unsigned int segno = start_segno;
+
+		if (to_journal &&
+			!__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
+			to_journal = false;
+
+		if (!to_journal) {
+			page = get_next_sit_page(sbi, start_segno);
+			raw_sit = page_address(page);
+		}
+
+		/* flush dirty sit entries in region of current sit set */
+		for_each_set_bit_from(segno, bitmap, end) {
+			int offset, sit_offset;
+
+			se = get_seg_entry(sbi, segno);
+
+			/* add discard candidates */
+			if (cpc->reason != CP_DISCARD) {
+				cpc->trim_start = segno;
+				add_discard_addrs(sbi, cpc);
+			}
+
+			if (to_journal) {
+				offset = lookup_journal_in_cursum(sum,
+							SIT_JOURNAL, segno, 1);
+				f2fs_bug_on(sbi, offset < 0);
+				segno_in_journal(sum, offset) =
+							cpu_to_le32(segno);
+				seg_info_to_raw_sit(se,
+						&sit_in_journal(sum, offset));
+			} else {
+				sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+				seg_info_to_raw_sit(se,
+						&raw_sit->entries[sit_offset]);
+			}
+
+			__clear_bit(segno, bitmap);
+			sit_i->dirty_sentries--;
+			ses->entry_cnt--;
+		}
+
+		if (!to_journal)
+			f2fs_put_page(page, 1);
+
+		f2fs_bug_on(sbi, ses->entry_cnt);
+		release_sit_entry_set(ses);
+	}
+
+	f2fs_bug_on(sbi, !list_empty(head));
+	f2fs_bug_on(sbi, sit_i->dirty_sentries);
+out:
+	if (cpc->reason == CP_DISCARD) {
+		for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
+			add_discard_addrs(sbi, cpc);
+	}
+	mutex_unlock(&sit_i->sentry_lock);
+	mutex_unlock(&curseg->curseg_mutex);
+
+	set_prefree_as_free_segments(sbi);
+}
+
+static int build_sit_info(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	struct sit_info *sit_i;
+	unsigned int sit_segs, start;
+	char *src_bitmap, *dst_bitmap;
+	unsigned int bitmap_size;
+
+	/* allocate memory for SIT information */
+	sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL);
+	if (!sit_i)
+		return -ENOMEM;
+
+	SM_I(sbi)->sit_info = sit_i;
+
+	sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
+	if (!sit_i->sentries)
+		return -ENOMEM;
+
+	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
+	sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+	if (!sit_i->dirty_sentries_bitmap)
+		return -ENOMEM;
+
+	for (start = 0; start < MAIN_SEGS(sbi); start++) {
+		sit_i->sentries[start].cur_valid_map
+			= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+		sit_i->sentries[start].ckpt_valid_map
+			= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+		if (!sit_i->sentries[start].cur_valid_map
+				|| !sit_i->sentries[start].ckpt_valid_map)
+			return -ENOMEM;
+	}
+
+	sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+	if (!sit_i->tmp_map)
+		return -ENOMEM;
+
+	if (sbi->segs_per_sec > 1) {
+		sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
+					sizeof(struct sec_entry));
+		if (!sit_i->sec_entries)
+			return -ENOMEM;
+	}
+
+	/* get information related with SIT */
+	sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
+
+	/* setup SIT bitmap from ckeckpoint pack */
+	bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
+	src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
+
+	dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
+	if (!dst_bitmap)
+		return -ENOMEM;
+
+	/* init SIT information */
+	sit_i->s_ops = &default_salloc_ops;
+
+	sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
+	sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
+	sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
+	sit_i->sit_bitmap = dst_bitmap;
+	sit_i->bitmap_size = bitmap_size;
+	sit_i->dirty_sentries = 0;
+	sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
+	sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
+	sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
+	mutex_init(&sit_i->sentry_lock);
+	return 0;
+}
+
+static int build_free_segmap(struct f2fs_sb_info *sbi)
+{
+	struct free_segmap_info *free_i;
+	unsigned int bitmap_size, sec_bitmap_size;
+
+	/* allocate memory for free segmap information */
+	free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL);
+	if (!free_i)
+		return -ENOMEM;
+
+	SM_I(sbi)->free_info = free_i;
+
+	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
+	free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
+	if (!free_i->free_segmap)
+		return -ENOMEM;
+
+	sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
+	free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
+	if (!free_i->free_secmap)
+		return -ENOMEM;
+
+	/* set all segments as dirty temporarily */
+	memset(free_i->free_segmap, 0xff, bitmap_size);
+	memset(free_i->free_secmap, 0xff, sec_bitmap_size);
+
+	/* init free segmap information */
+	free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
+	free_i->free_segments = 0;
+	free_i->free_sections = 0;
+	spin_lock_init(&free_i->segmap_lock);
+	return 0;
+}
+
+static int build_curseg(struct f2fs_sb_info *sbi)
+{
+	struct curseg_info *array;
+	int i;
+
+	array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL);
+	if (!array)
+		return -ENOMEM;
+
+	SM_I(sbi)->curseg_array = array;
+
+	for (i = 0; i < NR_CURSEG_TYPE; i++) {
+		mutex_init(&array[i].curseg_mutex);
+		array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+		if (!array[i].sum_blk)
+			return -ENOMEM;
+		array[i].segno = NULL_SEGNO;
+		array[i].next_blkoff = 0;
+	}
+	return restore_curseg_summaries(sbi);
+}
+
+static void build_sit_entries(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	int sit_blk_cnt = SIT_BLK_CNT(sbi);
+	unsigned int i, start, end;
+	unsigned int readed, start_blk = 0;
+	int nrpages = MAX_BIO_BLOCKS(sbi);
+
+	do {
+		readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
+
+		start = start_blk * sit_i->sents_per_block;
+		end = (start_blk + readed) * sit_i->sents_per_block;
+
+		for (; start < end && start < MAIN_SEGS(sbi); start++) {
+			struct seg_entry *se = &sit_i->sentries[start];
+			struct f2fs_sit_block *sit_blk;
+			struct f2fs_sit_entry sit;
+			struct page *page;
+
+			mutex_lock(&curseg->curseg_mutex);
+			for (i = 0; i < sits_in_cursum(sum); i++) {
+				if (le32_to_cpu(segno_in_journal(sum, i))
+								== start) {
+					sit = sit_in_journal(sum, i);
+					mutex_unlock(&curseg->curseg_mutex);
+					goto got_it;
+				}
+			}
+			mutex_unlock(&curseg->curseg_mutex);
+
+			page = get_current_sit_page(sbi, start);
+			sit_blk = (struct f2fs_sit_block *)page_address(page);
+			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
+			f2fs_put_page(page, 1);
+got_it:
+			check_block_count(sbi, start, &sit);
+			seg_info_from_raw_sit(se, &sit);
+			if (sbi->segs_per_sec > 1) {
+				struct sec_entry *e = get_sec_entry(sbi, start);
+				e->valid_blocks += se->valid_blocks;
+			}
+		}
+		start_blk += readed;
+	} while (start_blk < sit_blk_cnt);
+}
+
+static void init_free_segmap(struct f2fs_sb_info *sbi)
+{
+	unsigned int start;
+	int type;
+
+	for (start = 0; start < MAIN_SEGS(sbi); start++) {
+		struct seg_entry *sentry = get_seg_entry(sbi, start);
+		if (!sentry->valid_blocks)
+			__set_free(sbi, start);
+	}
+
+	/* set use the current segments */
+	for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
+		struct curseg_info *curseg_t = CURSEG_I(sbi, type);
+		__set_test_and_inuse(sbi, curseg_t->segno);
+	}
+}
+
+static void init_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int segno = 0, offset = 0;
+	unsigned short valid_blocks;
+
+	while (1) {
+		/* find dirty segment based on free segmap */
+		segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
+		if (segno >= MAIN_SEGS(sbi))
+			break;
+		offset = segno + 1;
+		valid_blocks = get_valid_blocks(sbi, segno, 0);
+		if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
+			continue;
+		if (valid_blocks > sbi->blocks_per_seg) {
+			f2fs_bug_on(sbi, 1);
+			continue;
+		}
+		mutex_lock(&dirty_i->seglist_lock);
+		__locate_dirty_segment(sbi, segno, DIRTY);
+		mutex_unlock(&dirty_i->seglist_lock);
+	}
+}
+
+static int init_victim_secmap(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
+
+	dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+	if (!dirty_i->victim_secmap)
+		return -ENOMEM;
+	return 0;
+}
+
+static int build_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i;
+	unsigned int bitmap_size, i;
+
+	/* allocate memory for dirty segments list information */
+	dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL);
+	if (!dirty_i)
+		return -ENOMEM;
+
+	SM_I(sbi)->dirty_info = dirty_i;
+	mutex_init(&dirty_i->seglist_lock);
+
+	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
+
+	for (i = 0; i < NR_DIRTY_TYPE; i++) {
+		dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+		if (!dirty_i->dirty_segmap[i])
+			return -ENOMEM;
+	}
+
+	init_dirty_segmap(sbi);
+	return init_victim_secmap(sbi);
+}
+
+/*
+ * Update min, max modified time for cost-benefit GC algorithm
+ */
+static void init_min_max_mtime(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int segno;
+
+	mutex_lock(&sit_i->sentry_lock);
+
+	sit_i->min_mtime = LLONG_MAX;
+
+	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+		unsigned int i;
+		unsigned long long mtime = 0;
+
+		for (i = 0; i < sbi->segs_per_sec; i++)
+			mtime += get_seg_entry(sbi, segno + i)->mtime;
+
+		mtime = div_u64(mtime, sbi->segs_per_sec);
+
+		if (sit_i->min_mtime > mtime)
+			sit_i->min_mtime = mtime;
+	}
+	sit_i->max_mtime = get_mtime(sbi);
+	mutex_unlock(&sit_i->sentry_lock);
+}
+
+int build_segment_manager(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	struct f2fs_sm_info *sm_info;
+	int err;
+
+	sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL);
+	if (!sm_info)
+		return -ENOMEM;
+
+	/* init sm info */
+	sbi->sm_info = sm_info;
+	sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
+	sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
+	sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
+	sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
+	sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
+	sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
+	sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+	sm_info->rec_prefree_segments = sm_info->main_segments *
+					DEF_RECLAIM_PREFREE_SEGMENTS / 100;
+	sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
+	sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
+	sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
+
+	INIT_LIST_HEAD(&sm_info->discard_list);
+	sm_info->nr_discards = 0;
+	sm_info->max_discards = 0;
+
+	sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
+
+	INIT_LIST_HEAD(&sm_info->sit_entry_set);
+
+	if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
+		err = create_flush_cmd_control(sbi);
+		if (err)
+			return err;
+	}
+
+	err = build_sit_info(sbi);
+	if (err)
+		return err;
+	err = build_free_segmap(sbi);
+	if (err)
+		return err;
+	err = build_curseg(sbi);
+	if (err)
+		return err;
+
+	/* reinit free segmap based on SIT */
+	build_sit_entries(sbi);
+
+	init_free_segmap(sbi);
+	err = build_dirty_segmap(sbi);
+	if (err)
+		return err;
+
+	init_min_max_mtime(sbi);
+	return 0;
+}
+
+static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
+		enum dirty_type dirty_type)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	mutex_lock(&dirty_i->seglist_lock);
+	kfree(dirty_i->dirty_segmap[dirty_type]);
+	dirty_i->nr_dirty[dirty_type] = 0;
+	mutex_unlock(&dirty_i->seglist_lock);
+}
+
+static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	kfree(dirty_i->victim_secmap);
+}
+
+static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	int i;
+
+	if (!dirty_i)
+		return;
+
+	/* discard pre-free/dirty segments list */
+	for (i = 0; i < NR_DIRTY_TYPE; i++)
+		discard_dirty_segmap(sbi, i);
+
+	destroy_victim_secmap(sbi);
+	SM_I(sbi)->dirty_info = NULL;
+	kfree(dirty_i);
+}
+
+static void destroy_curseg(struct f2fs_sb_info *sbi)
+{
+	struct curseg_info *array = SM_I(sbi)->curseg_array;
+	int i;
+
+	if (!array)
+		return;
+	SM_I(sbi)->curseg_array = NULL;
+	for (i = 0; i < NR_CURSEG_TYPE; i++)
+		kfree(array[i].sum_blk);
+	kfree(array);
+}
+
+static void destroy_free_segmap(struct f2fs_sb_info *sbi)
+{
+	struct free_segmap_info *free_i = SM_I(sbi)->free_info;
+	if (!free_i)
+		return;
+	SM_I(sbi)->free_info = NULL;
+	kfree(free_i->free_segmap);
+	kfree(free_i->free_secmap);
+	kfree(free_i);
+}
+
+static void destroy_sit_info(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int start;
+
+	if (!sit_i)
+		return;
+
+	if (sit_i->sentries) {
+		for (start = 0; start < MAIN_SEGS(sbi); start++) {
+			kfree(sit_i->sentries[start].cur_valid_map);
+			kfree(sit_i->sentries[start].ckpt_valid_map);
+		}
+	}
+	kfree(sit_i->tmp_map);
+
+	vfree(sit_i->sentries);
+	vfree(sit_i->sec_entries);
+	kfree(sit_i->dirty_sentries_bitmap);
+
+	SM_I(sbi)->sit_info = NULL;
+	kfree(sit_i->sit_bitmap);
+	kfree(sit_i);
+}
+
+void destroy_segment_manager(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_sm_info *sm_info = SM_I(sbi);
+
+	if (!sm_info)
+		return;
+	destroy_flush_cmd_control(sbi);
+	destroy_dirty_segmap(sbi);
+	destroy_curseg(sbi);
+	destroy_free_segmap(sbi);
+	destroy_sit_info(sbi);
+	sbi->sm_info = NULL;
+	kfree(sm_info);
+}
+
+int __init create_segment_manager_caches(void)
+{
+	discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
+			sizeof(struct discard_entry));
+	if (!discard_entry_slab)
+		goto fail;
+
+	sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
+			sizeof(struct sit_entry_set));
+	if (!sit_entry_set_slab)
+		goto destory_discard_entry;
+
+	inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
+			sizeof(struct inmem_pages));
+	if (!inmem_entry_slab)
+		goto destroy_sit_entry_set;
+	return 0;
+
+destroy_sit_entry_set:
+	kmem_cache_destroy(sit_entry_set_slab);
+destory_discard_entry:
+	kmem_cache_destroy(discard_entry_slab);
+fail:
+	return -ENOMEM;
+}
+
+void destroy_segment_manager_caches(void)
+{
+	kmem_cache_destroy(sit_entry_set_slab);
+	kmem_cache_destroy(discard_entry_slab);
+	kmem_cache_destroy(inmem_entry_slab);
+}
diff --git a/kernel/fs/f2fs/segment.h b/kernel/fs/f2fs/segment.h
new file mode 100644
index 000000000..85d7fa751
--- /dev/null
+++ b/kernel/fs/f2fs/segment.h
@@ -0,0 +1,751 @@
+/*
+ * fs/f2fs/segment.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/blkdev.h>
+
+/* constant macro */
+#define NULL_SEGNO			((unsigned int)(~0))
+#define NULL_SECNO			((unsigned int)(~0))
+
+#define DEF_RECLAIM_PREFREE_SEGMENTS	5	/* 5% over total segments */
+
+/* L: Logical segment # in volume, R: Relative segment # in main area */
+#define GET_L2R_SEGNO(free_i, segno)	(segno - free_i->start_segno)
+#define GET_R2L_SEGNO(free_i, segno)	(segno + free_i->start_segno)
+
+#define IS_DATASEG(t)	(t <= CURSEG_COLD_DATA)
+#define IS_NODESEG(t)	(t >= CURSEG_HOT_NODE)
+
+#define IS_CURSEG(sbi, seg)						\
+	((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+
+#define IS_CURSEC(sbi, secno)						\
+	((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /		\
+	  sbi->segs_per_sec) ||	\
+	 (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno /		\
+	  sbi->segs_per_sec) ||	\
+	 (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno /		\
+	  sbi->segs_per_sec) ||	\
+	 (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno /		\
+	  sbi->segs_per_sec) ||	\
+	 (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno /		\
+	  sbi->segs_per_sec) ||	\
+	 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno /		\
+	  sbi->segs_per_sec))	\
+
+#define MAIN_BLKADDR(sbi)	(SM_I(sbi)->main_blkaddr)
+#define SEG0_BLKADDR(sbi)	(SM_I(sbi)->seg0_blkaddr)
+
+#define MAIN_SEGS(sbi)	(SM_I(sbi)->main_segments)
+#define MAIN_SECS(sbi)	(sbi->total_sections)
+
+#define TOTAL_SEGS(sbi)	(SM_I(sbi)->segment_count)
+#define TOTAL_BLKS(sbi)	(TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+
+#define MAX_BLKADDR(sbi)	(SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
+#define SEGMENT_SIZE(sbi)	(1ULL << (sbi->log_blocksize +		\
+					sbi->log_blocks_per_seg))
+
+#define START_BLOCK(sbi, segno)	(SEG0_BLKADDR(sbi) +			\
+	 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+
+#define NEXT_FREE_BLKADDR(sbi, curseg)					\
+	(START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
+
+#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr)	((blk_addr) - SEG0_BLKADDR(sbi))
+#define GET_SEGNO_FROM_SEG0(sbi, blk_addr)				\
+	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr)				\
+	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+
+#define GET_SEGNO(sbi, blk_addr)					\
+	(((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ?		\
+	NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi),			\
+		GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
+#define GET_SECNO(sbi, segno)					\
+	((segno) / sbi->segs_per_sec)
+#define GET_ZONENO_FROM_SEGNO(sbi, segno)				\
+	((segno / sbi->segs_per_sec) / sbi->secs_per_zone)
+
+#define GET_SUM_BLOCK(sbi, segno)				\
+	((sbi->sm_info->ssa_blkaddr) + segno)
+
+#define GET_SUM_TYPE(footer) ((footer)->entry_type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type)
+
+#define SIT_ENTRY_OFFSET(sit_i, segno)					\
+	(segno % sit_i->sents_per_block)
+#define SIT_BLOCK_OFFSET(segno)					\
+	(segno / SIT_ENTRY_PER_BLOCK)
+#define	START_SEGNO(segno)		\
+	(SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
+#define SIT_BLK_CNT(sbi)			\
+	((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
+#define f2fs_bitmap_size(nr)			\
+	(BITS_TO_LONGS(nr) * sizeof(unsigned long))
+
+#define SECTOR_FROM_BLOCK(blk_addr)					\
+	(((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
+#define SECTOR_TO_BLOCK(sectors)					\
+	(sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
+#define MAX_BIO_BLOCKS(sbi)						\
+	((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
+
+/*
+ * indicate a block allocation direction: RIGHT and LEFT.
+ * RIGHT means allocating new sections towards the end of volume.
+ * LEFT means the opposite direction.
+ */
+enum {
+	ALLOC_RIGHT = 0,
+	ALLOC_LEFT
+};
+
+/*
+ * In the victim_sel_policy->alloc_mode, there are two block allocation modes.
+ * LFS writes data sequentially with cleaning operations.
+ * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
+ */
+enum {
+	LFS = 0,
+	SSR
+};
+
+/*
+ * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
+ * GC_CB is based on cost-benefit algorithm.
+ * GC_GREEDY is based on greedy algorithm.
+ */
+enum {
+	GC_CB = 0,
+	GC_GREEDY
+};
+
+/*
+ * BG_GC means the background cleaning job.
+ * FG_GC means the on-demand cleaning job.
+ */
+enum {
+	BG_GC = 0,
+	FG_GC
+};
+
+/* for a function parameter to select a victim segment */
+struct victim_sel_policy {
+	int alloc_mode;			/* LFS or SSR */
+	int gc_mode;			/* GC_CB or GC_GREEDY */
+	unsigned long *dirty_segmap;	/* dirty segment bitmap */
+	unsigned int max_search;	/* maximum # of segments to search */
+	unsigned int offset;		/* last scanned bitmap offset */
+	unsigned int ofs_unit;		/* bitmap search unit */
+	unsigned int min_cost;		/* minimum cost */
+	unsigned int min_segno;		/* segment # having min. cost */
+};
+
+struct seg_entry {
+	unsigned short valid_blocks;	/* # of valid blocks */
+	unsigned char *cur_valid_map;	/* validity bitmap of blocks */
+	/*
+	 * # of valid blocks and the validity bitmap stored in the the last
+	 * checkpoint pack. This information is used by the SSR mode.
+	 */
+	unsigned short ckpt_valid_blocks;
+	unsigned char *ckpt_valid_map;
+	unsigned char type;		/* segment type like CURSEG_XXX_TYPE */
+	unsigned long long mtime;	/* modification time of the segment */
+};
+
+struct sec_entry {
+	unsigned int valid_blocks;	/* # of valid blocks in a section */
+};
+
+struct segment_allocation {
+	void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
+};
+
+struct inmem_pages {
+	struct list_head list;
+	struct page *page;
+};
+
+struct sit_info {
+	const struct segment_allocation *s_ops;
+
+	block_t sit_base_addr;		/* start block address of SIT area */
+	block_t sit_blocks;		/* # of blocks used by SIT area */
+	block_t written_valid_blocks;	/* # of valid blocks in main area */
+	char *sit_bitmap;		/* SIT bitmap pointer */
+	unsigned int bitmap_size;	/* SIT bitmap size */
+
+	unsigned long *tmp_map;			/* bitmap for temporal use */
+	unsigned long *dirty_sentries_bitmap;	/* bitmap for dirty sentries */
+	unsigned int dirty_sentries;		/* # of dirty sentries */
+	unsigned int sents_per_block;		/* # of SIT entries per block */
+	struct mutex sentry_lock;		/* to protect SIT cache */
+	struct seg_entry *sentries;		/* SIT segment-level cache */
+	struct sec_entry *sec_entries;		/* SIT section-level cache */
+
+	/* for cost-benefit algorithm in cleaning procedure */
+	unsigned long long elapsed_time;	/* elapsed time after mount */
+	unsigned long long mounted_time;	/* mount time */
+	unsigned long long min_mtime;		/* min. modification time */
+	unsigned long long max_mtime;		/* max. modification time */
+};
+
+struct free_segmap_info {
+	unsigned int start_segno;	/* start segment number logically */
+	unsigned int free_segments;	/* # of free segments */
+	unsigned int free_sections;	/* # of free sections */
+	spinlock_t segmap_lock;		/* free segmap lock */
+	unsigned long *free_segmap;	/* free segment bitmap */
+	unsigned long *free_secmap;	/* free section bitmap */
+};
+
+/* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */
+enum dirty_type {
+	DIRTY_HOT_DATA,		/* dirty segments assigned as hot data logs */
+	DIRTY_WARM_DATA,	/* dirty segments assigned as warm data logs */
+	DIRTY_COLD_DATA,	/* dirty segments assigned as cold data logs */
+	DIRTY_HOT_NODE,		/* dirty segments assigned as hot node logs */
+	DIRTY_WARM_NODE,	/* dirty segments assigned as warm node logs */
+	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */
+	DIRTY,			/* to count # of dirty segments */
+	PRE,			/* to count # of entirely obsolete segments */
+	NR_DIRTY_TYPE
+};
+
+struct dirty_seglist_info {
+	const struct victim_selection *v_ops;	/* victim selction operation */
+	unsigned long *dirty_segmap[NR_DIRTY_TYPE];
+	struct mutex seglist_lock;		/* lock for segment bitmaps */
+	int nr_dirty[NR_DIRTY_TYPE];		/* # of dirty segments */
+	unsigned long *victim_secmap;		/* background GC victims */
+};
+
+/* victim selection function for cleaning and SSR */
+struct victim_selection {
+	int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
+							int, int, char);
+};
+
+/* for active log information */
+struct curseg_info {
+	struct mutex curseg_mutex;		/* lock for consistency */
+	struct f2fs_summary_block *sum_blk;	/* cached summary block */
+	unsigned char alloc_type;		/* current allocation type */
+	unsigned int segno;			/* current segment number */
+	unsigned short next_blkoff;		/* next block offset to write */
+	unsigned int zone;			/* current zone number */
+	unsigned int next_segno;		/* preallocated segment */
+};
+
+struct sit_entry_set {
+	struct list_head set_list;	/* link with all sit sets */
+	unsigned int start_segno;	/* start segno of sits in set */
+	unsigned int entry_cnt;		/* the # of sit entries in set */
+};
+
+/*
+ * inline functions
+ */
+static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
+{
+	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
+}
+
+static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
+						unsigned int segno)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	return &sit_i->sentries[segno];
+}
+
+static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
+						unsigned int segno)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
+}
+
+static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
+				unsigned int segno, int section)
+{
+	/*
+	 * In order to get # of valid blocks in a section instantly from many
+	 * segments, f2fs manages two counting structures separately.
+	 */
+	if (section > 1)
+		return get_sec_entry(sbi, segno)->valid_blocks;
+	else
+		return get_seg_entry(sbi, segno)->valid_blocks;
+}
+
+static inline void seg_info_from_raw_sit(struct seg_entry *se,
+					struct f2fs_sit_entry *rs)
+{
+	se->valid_blocks = GET_SIT_VBLOCKS(rs);
+	se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs);
+	memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+	memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+	se->type = GET_SIT_TYPE(rs);
+	se->mtime = le64_to_cpu(rs->mtime);
+}
+
+static inline void seg_info_to_raw_sit(struct seg_entry *se,
+					struct f2fs_sit_entry *rs)
+{
+	unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) |
+					se->valid_blocks;
+	rs->vblocks = cpu_to_le16(raw_vblocks);
+	memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
+	memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+	se->ckpt_valid_blocks = se->valid_blocks;
+	rs->mtime = cpu_to_le64(se->mtime);
+}
+
+static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
+		unsigned int max, unsigned int segno)
+{
+	unsigned int ret;
+	spin_lock(&free_i->segmap_lock);
+	ret = find_next_bit(free_i->free_segmap, max, segno);
+	spin_unlock(&free_i->segmap_lock);
+	return ret;
+}
+
+static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	unsigned int start_segno = secno * sbi->segs_per_sec;
+	unsigned int next;
+
+	spin_lock(&free_i->segmap_lock);
+	clear_bit(segno, free_i->free_segmap);
+	free_i->free_segments++;
+
+	next = find_next_bit(free_i->free_segmap,
+			start_segno + sbi->segs_per_sec, start_segno);
+	if (next >= start_segno + sbi->segs_per_sec) {
+		clear_bit(secno, free_i->free_secmap);
+		free_i->free_sections++;
+	}
+	spin_unlock(&free_i->segmap_lock);
+}
+
+static inline void __set_inuse(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	set_bit(segno, free_i->free_segmap);
+	free_i->free_segments--;
+	if (!test_and_set_bit(secno, free_i->free_secmap))
+		free_i->free_sections--;
+}
+
+static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	unsigned int start_segno = secno * sbi->segs_per_sec;
+	unsigned int next;
+
+	spin_lock(&free_i->segmap_lock);
+	if (test_and_clear_bit(segno, free_i->free_segmap)) {
+		free_i->free_segments++;
+
+		next = find_next_bit(free_i->free_segmap,
+				start_segno + sbi->segs_per_sec, start_segno);
+		if (next >= start_segno + sbi->segs_per_sec) {
+			if (test_and_clear_bit(secno, free_i->free_secmap))
+				free_i->free_sections++;
+		}
+	}
+	spin_unlock(&free_i->segmap_lock);
+}
+
+static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	spin_lock(&free_i->segmap_lock);
+	if (!test_and_set_bit(segno, free_i->free_segmap)) {
+		free_i->free_segments--;
+		if (!test_and_set_bit(secno, free_i->free_secmap))
+			free_i->free_sections--;
+	}
+	spin_unlock(&free_i->segmap_lock);
+}
+
+static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
+		void *dst_addr)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size);
+}
+
+static inline block_t written_block_count(struct f2fs_sb_info *sbi)
+{
+	return SIT_I(sbi)->written_valid_blocks;
+}
+
+static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
+{
+	return FREE_I(sbi)->free_segments;
+}
+
+static inline int reserved_segments(struct f2fs_sb_info *sbi)
+{
+	return SM_I(sbi)->reserved_segments;
+}
+
+static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
+{
+	return FREE_I(sbi)->free_sections;
+}
+
+static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
+{
+	return DIRTY_I(sbi)->nr_dirty[PRE];
+}
+
+static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
+{
+	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE];
+}
+
+static inline int overprovision_segments(struct f2fs_sb_info *sbi)
+{
+	return SM_I(sbi)->ovp_segments;
+}
+
+static inline int overprovision_sections(struct f2fs_sb_info *sbi)
+{
+	return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
+}
+
+static inline int reserved_sections(struct f2fs_sb_info *sbi)
+{
+	return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+}
+
+static inline bool need_SSR(struct f2fs_sb_info *sbi)
+{
+	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+	return free_sections(sbi) <= (node_secs + 2 * dent_secs +
+						reserved_sections(sbi) + 1);
+}
+
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
+{
+	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		return false;
+
+	return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
+						reserved_sections(sbi));
+}
+
+static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
+{
+	return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments;
+}
+
+static inline int utilization(struct f2fs_sb_info *sbi)
+{
+	return div_u64((u64)valid_user_blocks(sbi) * 100,
+					sbi->user_block_count);
+}
+
+/*
+ * Sometimes f2fs may be better to drop out-of-place update policy.
+ * And, users can control the policy through sysfs entries.
+ * There are five policies with triggering conditions as follows.
+ * F2FS_IPU_FORCE - all the time,
+ * F2FS_IPU_SSR - if SSR mode is activated,
+ * F2FS_IPU_UTIL - if FS utilization is over threashold,
+ * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
+ *                     threashold,
+ * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
+ *                     storages. IPU will be triggered only if the # of dirty
+ *                     pages over min_fsync_blocks.
+ * F2FS_IPUT_DISABLE - disable IPU. (=default option)
+ */
+#define DEF_MIN_IPU_UTIL	70
+#define DEF_MIN_FSYNC_BLOCKS	8
+
+enum {
+	F2FS_IPU_FORCE,
+	F2FS_IPU_SSR,
+	F2FS_IPU_UTIL,
+	F2FS_IPU_SSR_UTIL,
+	F2FS_IPU_FSYNC,
+};
+
+static inline bool need_inplace_update(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	unsigned int policy = SM_I(sbi)->ipu_policy;
+
+	/* IPU can be done only for the user data */
+	if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
+		return false;
+
+	if (policy & (0x1 << F2FS_IPU_FORCE))
+		return true;
+	if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
+		return true;
+	if (policy & (0x1 << F2FS_IPU_UTIL) &&
+			utilization(sbi) > SM_I(sbi)->min_ipu_util)
+		return true;
+	if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
+			utilization(sbi) > SM_I(sbi)->min_ipu_util)
+		return true;
+
+	/* this is only set during fdatasync */
+	if (policy & (0x1 << F2FS_IPU_FSYNC) &&
+			is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
+		return true;
+
+	return false;
+}
+
+static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
+		int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	return curseg->segno;
+}
+
+static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi,
+		int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	return curseg->alloc_type;
+}
+
+static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	return curseg->next_blkoff;
+}
+
+#ifdef CONFIG_F2FS_CHECK_FS
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
+}
+
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+	BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
+	BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
+}
+
+/*
+ * Summary block is always treated as an invalid block
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+		int segno, struct f2fs_sit_entry *raw_sit)
+{
+	bool is_valid  = test_bit_le(0, raw_sit->valid_map) ? true : false;
+	int valid_blocks = 0;
+	int cur_pos = 0, next_pos;
+
+	/* check segment usage */
+	BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
+
+	/* check boundary of a given segment number */
+	BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
+
+	/* check bitmap with valid block count */
+	do {
+		if (is_valid) {
+			next_pos = find_next_zero_bit_le(&raw_sit->valid_map,
+					sbi->blocks_per_seg,
+					cur_pos);
+			valid_blocks += next_pos - cur_pos;
+		} else
+			next_pos = find_next_bit_le(&raw_sit->valid_map,
+					sbi->blocks_per_seg,
+					cur_pos);
+		cur_pos = next_pos;
+		is_valid = !is_valid;
+	} while (cur_pos < sbi->blocks_per_seg);
+	BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
+}
+#else
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	if (segno > TOTAL_SEGS(sbi) - 1)
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+}
+
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+	if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+}
+
+/*
+ * Summary block is always treated as an invalid block
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+		int segno, struct f2fs_sit_entry *raw_sit)
+{
+	/* check segment usage */
+	if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+
+	/* check boundary of a given segment number */
+	if (segno > TOTAL_SEGS(sbi) - 1)
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+}
+#endif
+
+static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
+						unsigned int start)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int offset = SIT_BLOCK_OFFSET(start);
+	block_t blk_addr = sit_i->sit_base_addr + offset;
+
+	check_seg_range(sbi, start);
+
+	/* calculate sit block address */
+	if (f2fs_test_bit(offset, sit_i->sit_bitmap))
+		blk_addr += sit_i->sit_blocks;
+
+	return blk_addr;
+}
+
+static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
+						pgoff_t block_addr)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	block_addr -= sit_i->sit_base_addr;
+	if (block_addr < sit_i->sit_blocks)
+		block_addr += sit_i->sit_blocks;
+	else
+		block_addr -= sit_i->sit_blocks;
+
+	return block_addr + sit_i->sit_base_addr;
+}
+
+static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
+{
+	unsigned int block_off = SIT_BLOCK_OFFSET(start);
+
+	f2fs_change_bit(block_off, sit_i->sit_bitmap);
+}
+
+static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
+						sit_i->mounted_time;
+}
+
+static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
+			unsigned int ofs_in_node, unsigned char version)
+{
+	sum->nid = cpu_to_le32(nid);
+	sum->ofs_in_node = cpu_to_le16(ofs_in_node);
+	sum->version = version;
+}
+
+static inline block_t start_sum_block(struct f2fs_sb_info *sbi)
+{
+	return __start_cp_addr(sbi) +
+		le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+
+static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
+{
+	return __start_cp_addr(sbi) +
+		le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
+				- (base + 1) + type;
+}
+
+static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
+{
+	if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
+		return true;
+	return false;
+}
+
+static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
+{
+	struct block_device *bdev = sbi->sb->s_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	return SECTOR_TO_BLOCK(queue_max_sectors(q));
+}
+
+/*
+ * It is very important to gather dirty pages and write at once, so that we can
+ * submit a big bio without interfering other data writes.
+ * By default, 512 pages for directory data,
+ * 512 pages (2MB) * 3 for three types of nodes, and
+ * max_bio_blocks for meta are set.
+ */
+static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
+{
+	if (sbi->sb->s_bdi->dirty_exceeded)
+		return 0;
+
+	if (type == DATA)
+		return sbi->blocks_per_seg;
+	else if (type == NODE)
+		return 3 * sbi->blocks_per_seg;
+	else if (type == META)
+		return MAX_BIO_BLOCKS(sbi);
+	else
+		return 0;
+}
+
+/*
+ * When writing pages, it'd better align nr_to_write for segment size.
+ */
+static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
+					struct writeback_control *wbc)
+{
+	long nr_to_write, desired;
+
+	if (wbc->sync_mode != WB_SYNC_NONE)
+		return 0;
+
+	nr_to_write = wbc->nr_to_write;
+
+	if (type == DATA)
+		desired = 4096;
+	else if (type == NODE)
+		desired = 3 * max_hw_blocks(sbi);
+	else
+		desired = MAX_BIO_BLOCKS(sbi);
+
+	wbc->nr_to_write = desired;
+	return desired - nr_to_write;
+}
diff --git a/kernel/fs/f2fs/super.c b/kernel/fs/f2fs/super.c
new file mode 100644
index 000000000..b2dd1b01f
--- /dev/null
+++ b/kernel/fs/f2fs/super.c
@@ -0,0 +1,1350 @@
+/*
+ * fs/f2fs/super.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/statfs.h>
+#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+#include <linux/blkdev.h>
+#include <linux/f2fs_fs.h>
+#include <linux/sysfs.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "xattr.h"
+#include "gc.h"
+#include "trace.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/f2fs.h>
+
+static struct proc_dir_entry *f2fs_proc_root;
+static struct kmem_cache *f2fs_inode_cachep;
+static struct kset *f2fs_kset;
+
+enum {
+	Opt_gc_background,
+	Opt_disable_roll_forward,
+	Opt_norecovery,
+	Opt_discard,
+	Opt_noheap,
+	Opt_user_xattr,
+	Opt_nouser_xattr,
+	Opt_acl,
+	Opt_noacl,
+	Opt_active_logs,
+	Opt_disable_ext_identify,
+	Opt_inline_xattr,
+	Opt_inline_data,
+	Opt_inline_dentry,
+	Opt_flush_merge,
+	Opt_nobarrier,
+	Opt_fastboot,
+	Opt_extent_cache,
+	Opt_noinline_data,
+	Opt_err,
+};
+
+static match_table_t f2fs_tokens = {
+	{Opt_gc_background, "background_gc=%s"},
+	{Opt_disable_roll_forward, "disable_roll_forward"},
+	{Opt_norecovery, "norecovery"},
+	{Opt_discard, "discard"},
+	{Opt_noheap, "no_heap"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_active_logs, "active_logs=%u"},
+	{Opt_disable_ext_identify, "disable_ext_identify"},
+	{Opt_inline_xattr, "inline_xattr"},
+	{Opt_inline_data, "inline_data"},
+	{Opt_inline_dentry, "inline_dentry"},
+	{Opt_flush_merge, "flush_merge"},
+	{Opt_nobarrier, "nobarrier"},
+	{Opt_fastboot, "fastboot"},
+	{Opt_extent_cache, "extent_cache"},
+	{Opt_noinline_data, "noinline_data"},
+	{Opt_err, NULL},
+};
+
+/* Sysfs support for f2fs */
+enum {
+	GC_THREAD,	/* struct f2fs_gc_thread */
+	SM_INFO,	/* struct f2fs_sm_info */
+	NM_INFO,	/* struct f2fs_nm_info */
+	F2FS_SBI,	/* struct f2fs_sb_info */
+};
+
+struct f2fs_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
+	ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
+			 const char *, size_t);
+	int struct_type;
+	int offset;
+};
+
+static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
+{
+	if (struct_type == GC_THREAD)
+		return (unsigned char *)sbi->gc_thread;
+	else if (struct_type == SM_INFO)
+		return (unsigned char *)SM_I(sbi);
+	else if (struct_type == NM_INFO)
+		return (unsigned char *)NM_I(sbi);
+	else if (struct_type == F2FS_SBI)
+		return (unsigned char *)sbi;
+	return NULL;
+}
+
+static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
+			struct f2fs_sb_info *sbi, char *buf)
+{
+	unsigned char *ptr = NULL;
+	unsigned int *ui;
+
+	ptr = __struct_ptr(sbi, a->struct_type);
+	if (!ptr)
+		return -EINVAL;
+
+	ui = (unsigned int *)(ptr + a->offset);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
+			struct f2fs_sb_info *sbi,
+			const char *buf, size_t count)
+{
+	unsigned char *ptr;
+	unsigned long t;
+	unsigned int *ui;
+	ssize_t ret;
+
+	ptr = __struct_ptr(sbi, a->struct_type);
+	if (!ptr)
+		return -EINVAL;
+
+	ui = (unsigned int *)(ptr + a->offset);
+
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret < 0)
+		return ret;
+	*ui = t;
+	return count;
+}
+
+static ssize_t f2fs_attr_show(struct kobject *kobj,
+				struct attribute *attr, char *buf)
+{
+	struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+								s_kobj);
+	struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+	return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr,
+						const char *buf, size_t len)
+{
+	struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+									s_kobj);
+	struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+	return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void f2fs_sb_release(struct kobject *kobj)
+{
+	struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+								s_kobj);
+	complete(&sbi->s_kobj_unregister);
+}
+
+#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
+static struct f2fs_attr f2fs_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,					\
+	.store	= _store,					\
+	.struct_type = _struct_type,				\
+	.offset = _offset					\
+}
+
+#define F2FS_RW_ATTR(struct_type, struct_name, name, elname)	\
+	F2FS_ATTR_OFFSET(struct_type, name, 0644,		\
+		f2fs_sbi_show, f2fs_sbi_store,			\
+		offsetof(struct struct_name, elname))
+
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
+
+#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
+static struct attribute *f2fs_attrs[] = {
+	ATTR_LIST(gc_min_sleep_time),
+	ATTR_LIST(gc_max_sleep_time),
+	ATTR_LIST(gc_no_gc_sleep_time),
+	ATTR_LIST(gc_idle),
+	ATTR_LIST(reclaim_segments),
+	ATTR_LIST(max_small_discards),
+	ATTR_LIST(batched_trim_sections),
+	ATTR_LIST(ipu_policy),
+	ATTR_LIST(min_ipu_util),
+	ATTR_LIST(min_fsync_blocks),
+	ATTR_LIST(max_victim_search),
+	ATTR_LIST(dir_level),
+	ATTR_LIST(ram_thresh),
+	NULL,
+};
+
+static const struct sysfs_ops f2fs_attr_ops = {
+	.show	= f2fs_attr_show,
+	.store	= f2fs_attr_store,
+};
+
+static struct kobj_type f2fs_ktype = {
+	.default_attrs	= f2fs_attrs,
+	.sysfs_ops	= &f2fs_attr_ops,
+	.release	= f2fs_sb_release,
+};
+
+void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
+	va_end(args);
+}
+
+static void init_once(void *foo)
+{
+	struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
+
+	inode_init_once(&fi->vfs_inode);
+}
+
+static int parse_options(struct super_block *sb, char *options)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	substring_t args[MAX_OPT_ARGS];
+	char *p, *name;
+	int arg = 0;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = NULL;
+		token = match_token(p, f2fs_tokens, args);
+
+		switch (token) {
+		case Opt_gc_background:
+			name = match_strdup(&args[0]);
+
+			if (!name)
+				return -ENOMEM;
+			if (strlen(name) == 2 && !strncmp(name, "on", 2))
+				set_opt(sbi, BG_GC);
+			else if (strlen(name) == 3 && !strncmp(name, "off", 3))
+				clear_opt(sbi, BG_GC);
+			else {
+				kfree(name);
+				return -EINVAL;
+			}
+			kfree(name);
+			break;
+		case Opt_disable_roll_forward:
+			set_opt(sbi, DISABLE_ROLL_FORWARD);
+			break;
+		case Opt_norecovery:
+			/* this option mounts f2fs with ro */
+			set_opt(sbi, DISABLE_ROLL_FORWARD);
+			if (!f2fs_readonly(sb))
+				return -EINVAL;
+			break;
+		case Opt_discard:
+			set_opt(sbi, DISCARD);
+			break;
+		case Opt_noheap:
+			set_opt(sbi, NOHEAP);
+			break;
+#ifdef CONFIG_F2FS_FS_XATTR
+		case Opt_user_xattr:
+			set_opt(sbi, XATTR_USER);
+			break;
+		case Opt_nouser_xattr:
+			clear_opt(sbi, XATTR_USER);
+			break;
+		case Opt_inline_xattr:
+			set_opt(sbi, INLINE_XATTR);
+			break;
+#else
+		case Opt_user_xattr:
+			f2fs_msg(sb, KERN_INFO,
+				"user_xattr options not supported");
+			break;
+		case Opt_nouser_xattr:
+			f2fs_msg(sb, KERN_INFO,
+				"nouser_xattr options not supported");
+			break;
+		case Opt_inline_xattr:
+			f2fs_msg(sb, KERN_INFO,
+				"inline_xattr options not supported");
+			break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+		case Opt_acl:
+			set_opt(sbi, POSIX_ACL);
+			break;
+		case Opt_noacl:
+			clear_opt(sbi, POSIX_ACL);
+			break;
+#else
+		case Opt_acl:
+			f2fs_msg(sb, KERN_INFO, "acl options not supported");
+			break;
+		case Opt_noacl:
+			f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+			break;
+#endif
+		case Opt_active_logs:
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+			if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+				return -EINVAL;
+			sbi->active_logs = arg;
+			break;
+		case Opt_disable_ext_identify:
+			set_opt(sbi, DISABLE_EXT_IDENTIFY);
+			break;
+		case Opt_inline_data:
+			set_opt(sbi, INLINE_DATA);
+			break;
+		case Opt_inline_dentry:
+			set_opt(sbi, INLINE_DENTRY);
+			break;
+		case Opt_flush_merge:
+			set_opt(sbi, FLUSH_MERGE);
+			break;
+		case Opt_nobarrier:
+			set_opt(sbi, NOBARRIER);
+			break;
+		case Opt_fastboot:
+			set_opt(sbi, FASTBOOT);
+			break;
+		case Opt_extent_cache:
+			set_opt(sbi, EXTENT_CACHE);
+			break;
+		case Opt_noinline_data:
+			clear_opt(sbi, INLINE_DATA);
+			break;
+		default:
+			f2fs_msg(sb, KERN_ERR,
+				"Unrecognized mount option \"%s\" or missing value",
+				p);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static struct inode *f2fs_alloc_inode(struct super_block *sb)
+{
+	struct f2fs_inode_info *fi;
+
+	fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO);
+	if (!fi)
+		return NULL;
+
+	init_once((void *) fi);
+
+	/* Initialize f2fs-specific inode info */
+	fi->vfs_inode.i_version = 1;
+	atomic_set(&fi->dirty_pages, 0);
+	fi->i_current_depth = 1;
+	fi->i_advise = 0;
+	rwlock_init(&fi->ext_lock);
+	init_rwsem(&fi->i_sem);
+	INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
+	INIT_LIST_HEAD(&fi->inmem_pages);
+	mutex_init(&fi->inmem_lock);
+
+	set_inode_flag(fi, FI_NEW_INODE);
+
+	if (test_opt(F2FS_SB(sb), INLINE_XATTR))
+		set_inode_flag(fi, FI_INLINE_XATTR);
+
+	/* Will be used by directory only */
+	fi->i_dir_level = F2FS_SB(sb)->dir_level;
+
+	return &fi->vfs_inode;
+}
+
+static int f2fs_drop_inode(struct inode *inode)
+{
+	/*
+	 * This is to avoid a deadlock condition like below.
+	 * writeback_single_inode(inode)
+	 *  - f2fs_write_data_page
+	 *    - f2fs_gc -> iput -> evict
+	 *       - inode_wait_for_writeback(inode)
+	 */
+	if (!inode_unhashed(inode) && inode->i_state & I_SYNC)
+		return 0;
+	return generic_drop_inode(inode);
+}
+
+/*
+ * f2fs_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We should call set_dirty_inode to write the dirty inode through write_inode.
+ */
+static void f2fs_dirty_inode(struct inode *inode, int flags)
+{
+	set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+}
+
+static void f2fs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode));
+}
+
+static void f2fs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, f2fs_i_callback);
+}
+
+static void f2fs_put_super(struct super_block *sb)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+	if (sbi->s_proc) {
+		remove_proc_entry("segment_info", sbi->s_proc);
+		remove_proc_entry(sb->s_id, f2fs_proc_root);
+	}
+	kobject_del(&sbi->s_kobj);
+
+	f2fs_destroy_stats(sbi);
+	stop_gc_thread(sbi);
+
+	/*
+	 * We don't need to do checkpoint when superblock is clean.
+	 * But, the previous checkpoint was not done by umount, it needs to do
+	 * clean checkpoint again.
+	 */
+	if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
+			!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) {
+		struct cp_control cpc = {
+			.reason = CP_UMOUNT,
+		};
+		write_checkpoint(sbi, &cpc);
+	}
+
+	/*
+	 * normally superblock is clean, so we need to release this.
+	 * In addition, EIO will skip do checkpoint, we need this as well.
+	 */
+	release_dirty_inode(sbi);
+	release_discard_addrs(sbi);
+
+	iput(sbi->node_inode);
+	iput(sbi->meta_inode);
+
+	/* destroy f2fs internal modules */
+	destroy_node_manager(sbi);
+	destroy_segment_manager(sbi);
+
+	kfree(sbi->ckpt);
+	kobject_put(&sbi->s_kobj);
+	wait_for_completion(&sbi->s_kobj_unregister);
+
+	sb->s_fs_info = NULL;
+	brelse(sbi->raw_super_buf);
+	kfree(sbi);
+}
+
+int f2fs_sync_fs(struct super_block *sb, int sync)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+	trace_f2fs_sync_fs(sb, sync);
+
+	if (sync) {
+		struct cp_control cpc;
+
+		cpc.reason = __get_cp_reason(sbi);
+
+		mutex_lock(&sbi->gc_mutex);
+		write_checkpoint(sbi, &cpc);
+		mutex_unlock(&sbi->gc_mutex);
+	} else {
+		f2fs_balance_fs(sbi);
+	}
+	f2fs_trace_ios(NULL, NULL, 1);
+
+	return 0;
+}
+
+static int f2fs_freeze(struct super_block *sb)
+{
+	int err;
+
+	if (f2fs_readonly(sb))
+		return 0;
+
+	err = f2fs_sync_fs(sb, 1);
+	return err;
+}
+
+static int f2fs_unfreeze(struct super_block *sb)
+{
+	return 0;
+}
+
+static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	block_t total_count, user_block_count, start_count, ovp_count;
+
+	total_count = le64_to_cpu(sbi->raw_super->block_count);
+	user_block_count = sbi->user_block_count;
+	start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
+	ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
+	buf->f_type = F2FS_SUPER_MAGIC;
+	buf->f_bsize = sbi->blocksize;
+
+	buf->f_blocks = total_count - start_count;
+	buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
+	buf->f_bavail = user_block_count - valid_user_blocks(sbi);
+
+	buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
+	buf->f_ffree = buf->f_files - valid_inode_count(sbi);
+
+	buf->f_namelen = F2FS_NAME_LEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
+
+	if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC))
+		seq_printf(seq, ",background_gc=%s", "on");
+	else
+		seq_printf(seq, ",background_gc=%s", "off");
+	if (test_opt(sbi, DISABLE_ROLL_FORWARD))
+		seq_puts(seq, ",disable_roll_forward");
+	if (test_opt(sbi, DISCARD))
+		seq_puts(seq, ",discard");
+	if (test_opt(sbi, NOHEAP))
+		seq_puts(seq, ",no_heap_alloc");
+#ifdef CONFIG_F2FS_FS_XATTR
+	if (test_opt(sbi, XATTR_USER))
+		seq_puts(seq, ",user_xattr");
+	else
+		seq_puts(seq, ",nouser_xattr");
+	if (test_opt(sbi, INLINE_XATTR))
+		seq_puts(seq, ",inline_xattr");
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+	if (test_opt(sbi, POSIX_ACL))
+		seq_puts(seq, ",acl");
+	else
+		seq_puts(seq, ",noacl");
+#endif
+	if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
+		seq_puts(seq, ",disable_ext_identify");
+	if (test_opt(sbi, INLINE_DATA))
+		seq_puts(seq, ",inline_data");
+	else
+		seq_puts(seq, ",noinline_data");
+	if (test_opt(sbi, INLINE_DENTRY))
+		seq_puts(seq, ",inline_dentry");
+	if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
+		seq_puts(seq, ",flush_merge");
+	if (test_opt(sbi, NOBARRIER))
+		seq_puts(seq, ",nobarrier");
+	if (test_opt(sbi, FASTBOOT))
+		seq_puts(seq, ",fastboot");
+	if (test_opt(sbi, EXTENT_CACHE))
+		seq_puts(seq, ",extent_cache");
+	seq_printf(seq, ",active_logs=%u", sbi->active_logs);
+
+	return 0;
+}
+
+static int segment_info_seq_show(struct seq_file *seq, void *offset)
+{
+	struct super_block *sb = seq->private;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	unsigned int total_segs =
+			le32_to_cpu(sbi->raw_super->segment_count_main);
+	int i;
+
+	seq_puts(seq, "format: segment_type|valid_blocks\n"
+		"segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+
+	for (i = 0; i < total_segs; i++) {
+		struct seg_entry *se = get_seg_entry(sbi, i);
+
+		if ((i % 10) == 0)
+			seq_printf(seq, "%-5d", i);
+		seq_printf(seq, "%d|%-3u", se->type,
+					get_valid_blocks(sbi, i, 1));
+		if ((i % 10) == 9 || i == (total_segs - 1))
+			seq_putc(seq, '\n');
+		else
+			seq_putc(seq, ' ');
+	}
+
+	return 0;
+}
+
+static int segment_info_open_fs(struct inode *inode, struct file *file)
+{
+	return single_open(file, segment_info_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations f2fs_seq_segment_info_fops = {
+	.owner = THIS_MODULE,
+	.open = segment_info_open_fs,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct f2fs_mount_info org_mount_opt;
+	int err, active_logs;
+	bool need_restart_gc = false;
+	bool need_stop_gc = false;
+
+	sync_filesystem(sb);
+
+	/*
+	 * Save the old mount options in case we
+	 * need to restore them.
+	 */
+	org_mount_opt = sbi->mount_opt;
+	active_logs = sbi->active_logs;
+
+	sbi->mount_opt.opt = 0;
+	sbi->active_logs = NR_CURSEG_TYPE;
+
+	/* parse mount options */
+	err = parse_options(sb, data);
+	if (err)
+		goto restore_opts;
+
+	/*
+	 * Previous and new state of filesystem is RO,
+	 * so skip checking GC and FLUSH_MERGE conditions.
+	 */
+	if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
+		goto skip;
+
+	/*
+	 * We stop the GC thread if FS is mounted as RO
+	 * or if background_gc = off is passed in mount
+	 * option. Also sync the filesystem.
+	 */
+	if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+		if (sbi->gc_thread) {
+			stop_gc_thread(sbi);
+			f2fs_sync_fs(sb, 1);
+			need_restart_gc = true;
+		}
+	} else if (!sbi->gc_thread) {
+		err = start_gc_thread(sbi);
+		if (err)
+			goto restore_opts;
+		need_stop_gc = true;
+	}
+
+	/*
+	 * We stop issue flush thread if FS is mounted as RO
+	 * or if flush_merge is not passed in mount option.
+	 */
+	if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
+		destroy_flush_cmd_control(sbi);
+	} else if (!SM_I(sbi)->cmd_control_info) {
+		err = create_flush_cmd_control(sbi);
+		if (err)
+			goto restore_gc;
+	}
+skip:
+	/* Update the POSIXACL Flag */
+	 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+	return 0;
+restore_gc:
+	if (need_restart_gc) {
+		if (start_gc_thread(sbi))
+			f2fs_msg(sbi->sb, KERN_WARNING,
+				"background gc thread has stopped");
+	} else if (need_stop_gc) {
+		stop_gc_thread(sbi);
+	}
+restore_opts:
+	sbi->mount_opt = org_mount_opt;
+	sbi->active_logs = active_logs;
+	return err;
+}
+
+static struct super_operations f2fs_sops = {
+	.alloc_inode	= f2fs_alloc_inode,
+	.drop_inode	= f2fs_drop_inode,
+	.destroy_inode	= f2fs_destroy_inode,
+	.write_inode	= f2fs_write_inode,
+	.dirty_inode	= f2fs_dirty_inode,
+	.show_options	= f2fs_show_options,
+	.evict_inode	= f2fs_evict_inode,
+	.put_super	= f2fs_put_super,
+	.sync_fs	= f2fs_sync_fs,
+	.freeze_fs	= f2fs_freeze,
+	.unfreeze_fs	= f2fs_unfreeze,
+	.statfs		= f2fs_statfs,
+	.remount_fs	= f2fs_remount,
+};
+
+static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct inode *inode;
+
+	if (check_nid_range(sbi, ino))
+		return ERR_PTR(-ESTALE);
+
+	/*
+	 * f2fs_iget isn't quite right if the inode is currently unallocated!
+	 * However f2fs_iget currently does appropriate checks to handle stale
+	 * inodes so everything is OK.
+	 */
+	inode = f2fs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (unlikely(generation && inode->i_generation != generation)) {
+		/* we didn't find the right inode.. */
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    f2fs_nfs_get_inode);
+}
+
+static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    f2fs_nfs_get_inode);
+}
+
+static const struct export_operations f2fs_export_ops = {
+	.fh_to_dentry = f2fs_fh_to_dentry,
+	.fh_to_parent = f2fs_fh_to_parent,
+	.get_parent = f2fs_get_parent,
+};
+
+static loff_t max_file_size(unsigned bits)
+{
+	loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
+	loff_t leaf_count = ADDRS_PER_BLOCK;
+
+	/* two direct node blocks */
+	result += (leaf_count * 2);
+
+	/* two indirect node blocks */
+	leaf_count *= NIDS_PER_BLOCK;
+	result += (leaf_count * 2);
+
+	/* one double indirect node block */
+	leaf_count *= NIDS_PER_BLOCK;
+	result += leaf_count;
+
+	result <<= bits;
+	return result;
+}
+
+static int sanity_check_raw_super(struct super_block *sb,
+			struct f2fs_super_block *raw_super)
+{
+	unsigned int blocksize;
+
+	if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
+		f2fs_msg(sb, KERN_INFO,
+			"Magic Mismatch, valid(0x%x) - read(0x%x)",
+			F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
+		return 1;
+	}
+
+	/* Currently, support only 4KB page cache size */
+	if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) {
+		f2fs_msg(sb, KERN_INFO,
+			"Invalid page_cache_size (%lu), supports only 4KB\n",
+			PAGE_CACHE_SIZE);
+		return 1;
+	}
+
+	/* Currently, support only 4KB block size */
+	blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
+	if (blocksize != F2FS_BLKSIZE) {
+		f2fs_msg(sb, KERN_INFO,
+			"Invalid blocksize (%u), supports only 4KB\n",
+			blocksize);
+		return 1;
+	}
+
+	/* Currently, support 512/1024/2048/4096 bytes sector size */
+	if (le32_to_cpu(raw_super->log_sectorsize) >
+				F2FS_MAX_LOG_SECTOR_SIZE ||
+		le32_to_cpu(raw_super->log_sectorsize) <
+				F2FS_MIN_LOG_SECTOR_SIZE) {
+		f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
+			le32_to_cpu(raw_super->log_sectorsize));
+		return 1;
+	}
+	if (le32_to_cpu(raw_super->log_sectors_per_block) +
+		le32_to_cpu(raw_super->log_sectorsize) !=
+			F2FS_MAX_LOG_SECTOR_SIZE) {
+		f2fs_msg(sb, KERN_INFO,
+			"Invalid log sectors per block(%u) log sectorsize(%u)",
+			le32_to_cpu(raw_super->log_sectors_per_block),
+			le32_to_cpu(raw_super->log_sectorsize));
+		return 1;
+	}
+	return 0;
+}
+
+static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
+{
+	unsigned int total, fsmeta;
+	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+
+	total = le32_to_cpu(raw_super->segment_count);
+	fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
+	fsmeta += le32_to_cpu(raw_super->segment_count_sit);
+	fsmeta += le32_to_cpu(raw_super->segment_count_nat);
+	fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
+	fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
+
+	if (unlikely(fsmeta >= total))
+		return 1;
+
+	if (unlikely(f2fs_cp_error(sbi))) {
+		f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
+		return 1;
+	}
+	return 0;
+}
+
+static void init_sb_info(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *raw_super = sbi->raw_super;
+	int i;
+
+	sbi->log_sectors_per_block =
+		le32_to_cpu(raw_super->log_sectors_per_block);
+	sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
+	sbi->blocksize = 1 << sbi->log_blocksize;
+	sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+	sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
+	sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
+	sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
+	sbi->total_sections = le32_to_cpu(raw_super->section_count);
+	sbi->total_node_count =
+		(le32_to_cpu(raw_super->segment_count_nat) / 2)
+			* sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
+	sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
+	sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
+	sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
+	sbi->cur_victim_sec = NULL_SECNO;
+	sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
+
+	for (i = 0; i < NR_COUNT_TYPE; i++)
+		atomic_set(&sbi->nr_pages[i], 0);
+
+	sbi->dir_level = DEF_DIR_LEVEL;
+	clear_sbi_flag(sbi, SBI_NEED_FSCK);
+}
+
+/*
+ * Read f2fs raw super block.
+ * Because we have two copies of super block, so read the first one at first,
+ * if the first one is invalid, move to read the second one.
+ */
+static int read_raw_super_block(struct super_block *sb,
+			struct f2fs_super_block **raw_super,
+			struct buffer_head **raw_super_buf)
+{
+	int block = 0;
+
+retry:
+	*raw_super_buf = sb_bread(sb, block);
+	if (!*raw_super_buf) {
+		f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
+				block + 1);
+		if (block == 0) {
+			block++;
+			goto retry;
+		} else {
+			return -EIO;
+		}
+	}
+
+	*raw_super = (struct f2fs_super_block *)
+		((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET);
+
+	/* sanity checking of raw super */
+	if (sanity_check_raw_super(sb, *raw_super)) {
+		brelse(*raw_super_buf);
+		f2fs_msg(sb, KERN_ERR,
+			"Can't find valid F2FS filesystem in %dth superblock",
+								block + 1);
+		if (block == 0) {
+			block++;
+			goto retry;
+		} else {
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct f2fs_sb_info *sbi;
+	struct f2fs_super_block *raw_super = NULL;
+	struct buffer_head *raw_super_buf;
+	struct inode *root;
+	long err = -EINVAL;
+	bool retry = true, need_fsck = false;
+	char *options = NULL;
+	int i;
+
+try_onemore:
+	/* allocate memory for f2fs-specific super block info */
+	sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	/* set a block size */
+	if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
+		f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
+		goto free_sbi;
+	}
+
+	err = read_raw_super_block(sb, &raw_super, &raw_super_buf);
+	if (err)
+		goto free_sbi;
+
+	sb->s_fs_info = sbi;
+	/* init some FS parameters */
+	sbi->active_logs = NR_CURSEG_TYPE;
+
+	set_opt(sbi, BG_GC);
+	set_opt(sbi, INLINE_DATA);
+
+#ifdef CONFIG_F2FS_FS_XATTR
+	set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+	set_opt(sbi, POSIX_ACL);
+#endif
+	/* parse mount options */
+	options = kstrdup((const char *)data, GFP_KERNEL);
+	if (data && !options) {
+		err = -ENOMEM;
+		goto free_sb_buf;
+	}
+
+	err = parse_options(sb, options);
+	if (err)
+		goto free_options;
+
+	sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+	sb->s_max_links = F2FS_LINK_MAX;
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+
+	sb->s_op = &f2fs_sops;
+	sb->s_xattr = f2fs_xattr_handlers;
+	sb->s_export_op = &f2fs_export_ops;
+	sb->s_magic = F2FS_SUPER_MAGIC;
+	sb->s_time_gran = 1;
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+	memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+
+	/* init f2fs-specific super block info */
+	sbi->sb = sb;
+	sbi->raw_super = raw_super;
+	sbi->raw_super_buf = raw_super_buf;
+	mutex_init(&sbi->gc_mutex);
+	mutex_init(&sbi->writepages);
+	mutex_init(&sbi->cp_mutex);
+	init_rwsem(&sbi->node_write);
+	clear_sbi_flag(sbi, SBI_POR_DOING);
+	spin_lock_init(&sbi->stat_lock);
+
+	init_rwsem(&sbi->read_io.io_rwsem);
+	sbi->read_io.sbi = sbi;
+	sbi->read_io.bio = NULL;
+	for (i = 0; i < NR_PAGE_TYPE; i++) {
+		init_rwsem(&sbi->write_io[i].io_rwsem);
+		sbi->write_io[i].sbi = sbi;
+		sbi->write_io[i].bio = NULL;
+	}
+
+	init_rwsem(&sbi->cp_rwsem);
+	init_waitqueue_head(&sbi->cp_wait);
+	init_sb_info(sbi);
+
+	/* get an inode for meta space */
+	sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
+	if (IS_ERR(sbi->meta_inode)) {
+		f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
+		err = PTR_ERR(sbi->meta_inode);
+		goto free_options;
+	}
+
+	err = get_valid_checkpoint(sbi);
+	if (err) {
+		f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint");
+		goto free_meta_inode;
+	}
+
+	/* sanity checking of checkpoint */
+	err = -EINVAL;
+	if (sanity_check_ckpt(sbi)) {
+		f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
+		goto free_cp;
+	}
+
+	sbi->total_valid_node_count =
+				le32_to_cpu(sbi->ckpt->valid_node_count);
+	sbi->total_valid_inode_count =
+				le32_to_cpu(sbi->ckpt->valid_inode_count);
+	sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
+	sbi->total_valid_block_count =
+				le64_to_cpu(sbi->ckpt->valid_block_count);
+	sbi->last_valid_block_count = sbi->total_valid_block_count;
+	sbi->alloc_valid_block_count = 0;
+	INIT_LIST_HEAD(&sbi->dir_inode_list);
+	spin_lock_init(&sbi->dir_inode_lock);
+
+	init_extent_cache_info(sbi);
+
+	init_ino_entry_info(sbi);
+
+	/* setup f2fs internal modules */
+	err = build_segment_manager(sbi);
+	if (err) {
+		f2fs_msg(sb, KERN_ERR,
+			"Failed to initialize F2FS segment manager");
+		goto free_sm;
+	}
+	err = build_node_manager(sbi);
+	if (err) {
+		f2fs_msg(sb, KERN_ERR,
+			"Failed to initialize F2FS node manager");
+		goto free_nm;
+	}
+
+	build_gc_manager(sbi);
+
+	/* get an inode for node space */
+	sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
+	if (IS_ERR(sbi->node_inode)) {
+		f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
+		err = PTR_ERR(sbi->node_inode);
+		goto free_nm;
+	}
+
+	/* if there are nt orphan nodes free them */
+	recover_orphan_inodes(sbi);
+
+	/* read root inode and dentry */
+	root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
+	if (IS_ERR(root)) {
+		f2fs_msg(sb, KERN_ERR, "Failed to read root inode");
+		err = PTR_ERR(root);
+		goto free_node_inode;
+	}
+	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+		iput(root);
+		err = -EINVAL;
+		goto free_node_inode;
+	}
+
+	sb->s_root = d_make_root(root); /* allocate root dentry */
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto free_root_inode;
+	}
+
+	err = f2fs_build_stats(sbi);
+	if (err)
+		goto free_root_inode;
+
+	if (f2fs_proc_root)
+		sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
+
+	if (sbi->s_proc)
+		proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
+				 &f2fs_seq_segment_info_fops, sb);
+
+	if (test_opt(sbi, DISCARD)) {
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		if (!blk_queue_discard(q))
+			f2fs_msg(sb, KERN_WARNING,
+					"mounting with \"discard\" option, but "
+					"the device does not support discard");
+	}
+
+	sbi->s_kobj.kset = f2fs_kset;
+	init_completion(&sbi->s_kobj_unregister);
+	err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
+							"%s", sb->s_id);
+	if (err)
+		goto free_proc;
+
+	/* recover fsynced data */
+	if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+		/*
+		 * mount should be failed, when device has readonly mode, and
+		 * previous checkpoint was not done by clean system shutdown.
+		 */
+		if (bdev_read_only(sb->s_bdev) &&
+				!is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) {
+			err = -EROFS;
+			goto free_kobj;
+		}
+
+		if (need_fsck)
+			set_sbi_flag(sbi, SBI_NEED_FSCK);
+
+		err = recover_fsync_data(sbi);
+		if (err) {
+			need_fsck = true;
+			f2fs_msg(sb, KERN_ERR,
+				"Cannot recover all fsync data errno=%ld", err);
+			goto free_kobj;
+		}
+	}
+
+	/*
+	 * If filesystem is not mounted as read-only then
+	 * do start the gc_thread.
+	 */
+	if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) {
+		/* After POR, we can run background GC thread.*/
+		err = start_gc_thread(sbi);
+		if (err)
+			goto free_kobj;
+	}
+	kfree(options);
+	return 0;
+
+free_kobj:
+	kobject_del(&sbi->s_kobj);
+free_proc:
+	if (sbi->s_proc) {
+		remove_proc_entry("segment_info", sbi->s_proc);
+		remove_proc_entry(sb->s_id, f2fs_proc_root);
+	}
+	f2fs_destroy_stats(sbi);
+free_root_inode:
+	dput(sb->s_root);
+	sb->s_root = NULL;
+free_node_inode:
+	iput(sbi->node_inode);
+free_nm:
+	destroy_node_manager(sbi);
+free_sm:
+	destroy_segment_manager(sbi);
+free_cp:
+	kfree(sbi->ckpt);
+free_meta_inode:
+	make_bad_inode(sbi->meta_inode);
+	iput(sbi->meta_inode);
+free_options:
+	kfree(options);
+free_sb_buf:
+	brelse(raw_super_buf);
+free_sbi:
+	kfree(sbi);
+
+	/* give only one another chance */
+	if (retry) {
+		retry = false;
+		shrink_dcache_sb(sb);
+		goto try_onemore;
+	}
+	return err;
+}
+
+static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
+}
+
+static void kill_f2fs_super(struct super_block *sb)
+{
+	if (sb->s_root)
+		set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE);
+	kill_block_super(sb);
+}
+
+static struct file_system_type f2fs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "f2fs",
+	.mount		= f2fs_mount,
+	.kill_sb	= kill_f2fs_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("f2fs");
+
+static int __init init_inodecache(void)
+{
+	f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
+			sizeof(struct f2fs_inode_info));
+	if (!f2fs_inode_cachep)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(f2fs_inode_cachep);
+}
+
+static int __init init_f2fs_fs(void)
+{
+	int err;
+
+	f2fs_build_trace_ios();
+
+	err = init_inodecache();
+	if (err)
+		goto fail;
+	err = create_node_manager_caches();
+	if (err)
+		goto free_inodecache;
+	err = create_segment_manager_caches();
+	if (err)
+		goto free_node_manager_caches;
+	err = create_checkpoint_caches();
+	if (err)
+		goto free_segment_manager_caches;
+	err = create_extent_cache();
+	if (err)
+		goto free_checkpoint_caches;
+	f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
+	if (!f2fs_kset) {
+		err = -ENOMEM;
+		goto free_extent_cache;
+	}
+	err = register_filesystem(&f2fs_fs_type);
+	if (err)
+		goto free_kset;
+	f2fs_create_root_stats();
+	f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+	return 0;
+
+free_kset:
+	kset_unregister(f2fs_kset);
+free_extent_cache:
+	destroy_extent_cache();
+free_checkpoint_caches:
+	destroy_checkpoint_caches();
+free_segment_manager_caches:
+	destroy_segment_manager_caches();
+free_node_manager_caches:
+	destroy_node_manager_caches();
+free_inodecache:
+	destroy_inodecache();
+fail:
+	return err;
+}
+
+static void __exit exit_f2fs_fs(void)
+{
+	remove_proc_entry("fs/f2fs", NULL);
+	f2fs_destroy_root_stats();
+	unregister_filesystem(&f2fs_fs_type);
+	destroy_extent_cache();
+	destroy_checkpoint_caches();
+	destroy_segment_manager_caches();
+	destroy_node_manager_caches();
+	destroy_inodecache();
+	kset_unregister(f2fs_kset);
+	f2fs_destroy_trace_ios();
+}
+
+module_init(init_f2fs_fs)
+module_exit(exit_f2fs_fs)
+
+MODULE_AUTHOR("Samsung Electronics's Praesto Team");
+MODULE_DESCRIPTION("Flash Friendly File System");
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/f2fs/trace.c b/kernel/fs/f2fs/trace.c
new file mode 100644
index 000000000..875aa8179
--- /dev/null
+++ b/kernel/fs/f2fs/trace.c
@@ -0,0 +1,159 @@
+/*
+ * f2fs IO tracer
+ *
+ * Copyright (c) 2014 Motorola Mobility
+ * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/sched.h>
+#include <linux/radix-tree.h>
+
+#include "f2fs.h"
+#include "trace.h"
+
+static RADIX_TREE(pids, GFP_ATOMIC);
+static spinlock_t pids_lock;
+static struct last_io_info last_io;
+
+static inline void __print_last_io(void)
+{
+	if (!last_io.len)
+		return;
+
+	trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n",
+			last_io.major, last_io.minor,
+			last_io.pid, "----------------",
+			last_io.type,
+			last_io.fio.rw, last_io.fio.blk_addr,
+			last_io.len);
+	memset(&last_io, 0, sizeof(last_io));
+}
+
+static int __file_type(struct inode *inode, pid_t pid)
+{
+	if (f2fs_is_atomic_file(inode))
+		return __ATOMIC_FILE;
+	else if (f2fs_is_volatile_file(inode))
+		return __VOLATILE_FILE;
+	else if (S_ISDIR(inode->i_mode))
+		return __DIR_FILE;
+	else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
+		return __NODE_FILE;
+	else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
+		return __META_FILE;
+	else if (pid)
+		return __NORMAL_FILE;
+	else
+		return __MISC_FILE;
+}
+
+void f2fs_trace_pid(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	pid_t pid = task_pid_nr(current);
+	void *p;
+
+	page->private = pid;
+
+	if (radix_tree_preload(GFP_NOFS))
+		return;
+
+	spin_lock(&pids_lock);
+	p = radix_tree_lookup(&pids, pid);
+	if (p == current)
+		goto out;
+	if (p)
+		radix_tree_delete(&pids, pid);
+
+	f2fs_radix_tree_insert(&pids, pid, current);
+
+	trace_printk("%3x:%3x %4x %-16s\n",
+			MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
+			pid, current->comm);
+out:
+	spin_unlock(&pids_lock);
+	radix_tree_preload_end();
+}
+
+void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
+{
+	struct inode *inode;
+	pid_t pid;
+	int major, minor;
+
+	if (flush) {
+		__print_last_io();
+		return;
+	}
+
+	inode = page->mapping->host;
+	pid = page_private(page);
+
+	major = MAJOR(inode->i_sb->s_dev);
+	minor = MINOR(inode->i_sb->s_dev);
+
+	if (last_io.major == major && last_io.minor == minor &&
+			last_io.pid == pid &&
+			last_io.type == __file_type(inode, pid) &&
+			last_io.fio.rw == fio->rw &&
+			last_io.fio.blk_addr + last_io.len == fio->blk_addr) {
+		last_io.len++;
+		return;
+	}
+
+	__print_last_io();
+
+	last_io.major = major;
+	last_io.minor = minor;
+	last_io.pid = pid;
+	last_io.type = __file_type(inode, pid);
+	last_io.fio = *fio;
+	last_io.len = 1;
+	return;
+}
+
+void f2fs_build_trace_ios(void)
+{
+	spin_lock_init(&pids_lock);
+}
+
+#define PIDVEC_SIZE	128
+static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
+							unsigned int max_items)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	unsigned int ret = 0;
+
+	if (unlikely(!max_items))
+		return 0;
+
+	radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
+		results[ret] = iter.index;
+		if (++ret == PIDVEC_SIZE)
+			break;
+	}
+	return ret;
+}
+
+void f2fs_destroy_trace_ios(void)
+{
+	pid_t pid[PIDVEC_SIZE];
+	pid_t next_pid = 0;
+	unsigned int found;
+
+	spin_lock(&pids_lock);
+	while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
+		unsigned idx;
+
+		next_pid = pid[found - 1] + 1;
+		for (idx = 0; idx < found; idx++)
+			radix_tree_delete(&pids, pid[idx]);
+	}
+	spin_unlock(&pids_lock);
+}
diff --git a/kernel/fs/f2fs/trace.h b/kernel/fs/f2fs/trace.h
new file mode 100644
index 000000000..1041dbeb5
--- /dev/null
+++ b/kernel/fs/f2fs/trace.h
@@ -0,0 +1,46 @@
+/*
+ * f2fs IO tracer
+ *
+ * Copyright (c) 2014 Motorola Mobility
+ * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_TRACE_H__
+#define __F2FS_TRACE_H__
+
+#ifdef CONFIG_F2FS_IO_TRACE
+#include <trace/events/f2fs.h>
+
+enum file_type {
+	__NORMAL_FILE,
+	__DIR_FILE,
+	__NODE_FILE,
+	__META_FILE,
+	__ATOMIC_FILE,
+	__VOLATILE_FILE,
+	__MISC_FILE,
+};
+
+struct last_io_info {
+	int major, minor;
+	pid_t pid;
+	enum file_type type;
+	struct f2fs_io_info fio;
+	block_t len;
+};
+
+extern void f2fs_trace_pid(struct page *);
+extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int);
+extern void f2fs_build_trace_ios(void);
+extern void f2fs_destroy_trace_ios(void);
+#else
+#define f2fs_trace_pid(p)
+#define f2fs_trace_ios(p, i, n)
+#define f2fs_build_trace_ios()
+#define f2fs_destroy_trace_ios()
+
+#endif
+#endif /* __F2FS_TRACE_H__ */
diff --git a/kernel/fs/f2fs/xattr.c b/kernel/fs/f2fs/xattr.c
new file mode 100644
index 000000000..9757f65a0
--- /dev/null
+++ b/kernel/fs/f2fs/xattr.c
@@ -0,0 +1,618 @@
+/*
+ * fs/f2fs/xattr.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Extended attributes for symlinks and special files added per
+ *  suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ *  Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/rwsem.h>
+#include <linux/f2fs_fs.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include "f2fs.h"
+#include "xattr.h"
+
+static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t len, int type)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+	int total_len, prefix_len = 0;
+	const char *prefix = NULL;
+
+	switch (type) {
+	case F2FS_XATTR_INDEX_USER:
+		if (!test_opt(sbi, XATTR_USER))
+			return -EOPNOTSUPP;
+		prefix = XATTR_USER_PREFIX;
+		prefix_len = XATTR_USER_PREFIX_LEN;
+		break;
+	case F2FS_XATTR_INDEX_TRUSTED:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		prefix = XATTR_TRUSTED_PREFIX;
+		prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		prefix = XATTR_SECURITY_PREFIX;
+		prefix_len = XATTR_SECURITY_PREFIX_LEN;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	total_len = prefix_len + len + 1;
+	if (list && total_len <= list_size) {
+		memcpy(list, prefix, prefix_len);
+		memcpy(list + prefix_len, name, len);
+		list[prefix_len + len] = '\0';
+	}
+	return total_len;
+}
+
+static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+
+	switch (type) {
+	case F2FS_XATTR_INDEX_USER:
+		if (!test_opt(sbi, XATTR_USER))
+			return -EOPNOTSUPP;
+		break;
+	case F2FS_XATTR_INDEX_TRUSTED:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL);
+}
+
+static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+
+	switch (type) {
+	case F2FS_XATTR_INDEX_USER:
+		if (!test_opt(sbi, XATTR_USER))
+			return -EOPNOTSUPP;
+		break;
+	case F2FS_XATTR_INDEX_TRUSTED:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	return f2fs_setxattr(d_inode(dentry), type, name,
+					value, size, NULL, flags);
+}
+
+static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t len, int type)
+{
+	const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
+	size_t size;
+
+	if (type != F2FS_XATTR_INDEX_ADVISE)
+		return 0;
+
+	size = strlen(xname) + 1;
+	if (list && size <= list_size)
+		memcpy(list, xname, size);
+	return size;
+}
+
+static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	if (buffer)
+		*((char *)buffer) = F2FS_I(inode)->i_advise;
+	return sizeof(char);
+}
+
+static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+	if (value == NULL)
+		return -EINVAL;
+
+	F2FS_I(inode)->i_advise |= *(char *)value;
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+#ifdef CONFIG_F2FS_FS_SECURITY
+static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		void *page)
+{
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
+				xattr->name, xattr->value,
+				xattr->value_len, (struct page *)page, 0);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+int f2fs_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr, struct page *ipage)
+{
+	return security_inode_init_security(inode, dir, qstr,
+				&f2fs_initxattrs, ipage);
+}
+#endif
+
+const struct xattr_handler f2fs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.flags	= F2FS_XATTR_INDEX_USER,
+	.list	= f2fs_xattr_generic_list,
+	.get	= f2fs_xattr_generic_get,
+	.set	= f2fs_xattr_generic_set,
+};
+
+const struct xattr_handler f2fs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.flags	= F2FS_XATTR_INDEX_TRUSTED,
+	.list	= f2fs_xattr_generic_list,
+	.get	= f2fs_xattr_generic_get,
+	.set	= f2fs_xattr_generic_set,
+};
+
+const struct xattr_handler f2fs_xattr_advise_handler = {
+	.prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+	.flags	= F2FS_XATTR_INDEX_ADVISE,
+	.list   = f2fs_xattr_advise_list,
+	.get    = f2fs_xattr_advise_get,
+	.set    = f2fs_xattr_advise_set,
+};
+
+const struct xattr_handler f2fs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.flags	= F2FS_XATTR_INDEX_SECURITY,
+	.list	= f2fs_xattr_generic_list,
+	.get	= f2fs_xattr_generic_get,
+	.set	= f2fs_xattr_generic_set,
+};
+
+static const struct xattr_handler *f2fs_xattr_handler_map[] = {
+	[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+	[F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+	[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
+#endif
+	[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+	[F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
+#endif
+	[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
+};
+
+const struct xattr_handler *f2fs_xattr_handlers[] = {
+	&f2fs_xattr_user_handler,
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	&f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+	&f2fs_xattr_security_handler,
+#endif
+	&f2fs_xattr_advise_handler,
+	NULL,
+};
+
+static inline const struct xattr_handler *f2fs_xattr_handler(int index)
+{
+	const struct xattr_handler *handler = NULL;
+
+	if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map))
+		handler = f2fs_xattr_handler_map[index];
+	return handler;
+}
+
+static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
+					size_t len, const char *name)
+{
+	struct f2fs_xattr_entry *entry;
+
+	list_for_each_xattr(entry, base_addr) {
+		if (entry->e_name_index != index)
+			continue;
+		if (entry->e_name_len != len)
+			continue;
+		if (!memcmp(entry->e_name, name, len))
+			break;
+	}
+	return entry;
+}
+
+static void *read_all_xattrs(struct inode *inode, struct page *ipage)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_xattr_header *header;
+	size_t size = PAGE_SIZE, inline_size = 0;
+	void *txattr_addr;
+
+	inline_size = inline_xattr_size(inode);
+
+	txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
+	if (!txattr_addr)
+		return NULL;
+
+	/* read from inline xattr */
+	if (inline_size) {
+		struct page *page = NULL;
+		void *inline_addr;
+
+		if (ipage) {
+			inline_addr = inline_xattr_addr(ipage);
+		} else {
+			page = get_node_page(sbi, inode->i_ino);
+			if (IS_ERR(page))
+				goto fail;
+			inline_addr = inline_xattr_addr(page);
+		}
+		memcpy(txattr_addr, inline_addr, inline_size);
+		f2fs_put_page(page, 1);
+	}
+
+	/* read from xattr node block */
+	if (F2FS_I(inode)->i_xattr_nid) {
+		struct page *xpage;
+		void *xattr_addr;
+
+		/* The inode already has an extended attribute block. */
+		xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+		if (IS_ERR(xpage))
+			goto fail;
+
+		xattr_addr = page_address(xpage);
+		memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
+		f2fs_put_page(xpage, 1);
+	}
+
+	header = XATTR_HDR(txattr_addr);
+
+	/* never been allocated xattrs */
+	if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
+		header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
+		header->h_refcount = cpu_to_le32(1);
+	}
+	return txattr_addr;
+fail:
+	kzfree(txattr_addr);
+	return NULL;
+}
+
+static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
+				void *txattr_addr, struct page *ipage)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	size_t inline_size = 0;
+	void *xattr_addr;
+	struct page *xpage;
+	nid_t new_nid = 0;
+	int err;
+
+	inline_size = inline_xattr_size(inode);
+
+	if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
+		if (!alloc_nid(sbi, &new_nid))
+			return -ENOSPC;
+
+	/* write to inline xattr */
+	if (inline_size) {
+		struct page *page = NULL;
+		void *inline_addr;
+
+		if (ipage) {
+			inline_addr = inline_xattr_addr(ipage);
+			f2fs_wait_on_page_writeback(ipage, NODE);
+		} else {
+			page = get_node_page(sbi, inode->i_ino);
+			if (IS_ERR(page)) {
+				alloc_nid_failed(sbi, new_nid);
+				return PTR_ERR(page);
+			}
+			inline_addr = inline_xattr_addr(page);
+			f2fs_wait_on_page_writeback(page, NODE);
+		}
+		memcpy(inline_addr, txattr_addr, inline_size);
+		f2fs_put_page(page, 1);
+
+		/* no need to use xattr node block */
+		if (hsize <= inline_size) {
+			err = truncate_xattr_node(inode, ipage);
+			alloc_nid_failed(sbi, new_nid);
+			return err;
+		}
+	}
+
+	/* write to xattr node block */
+	if (F2FS_I(inode)->i_xattr_nid) {
+		xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+		if (IS_ERR(xpage)) {
+			alloc_nid_failed(sbi, new_nid);
+			return PTR_ERR(xpage);
+		}
+		f2fs_bug_on(sbi, new_nid);
+		f2fs_wait_on_page_writeback(xpage, NODE);
+	} else {
+		struct dnode_of_data dn;
+		set_new_dnode(&dn, inode, NULL, NULL, new_nid);
+		xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
+		if (IS_ERR(xpage)) {
+			alloc_nid_failed(sbi, new_nid);
+			return PTR_ERR(xpage);
+		}
+		alloc_nid_done(sbi, new_nid);
+	}
+
+	xattr_addr = page_address(xpage);
+	memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE -
+						sizeof(struct node_footer));
+	set_page_dirty(xpage);
+	f2fs_put_page(xpage, 1);
+
+	/* need to checkpoint during fsync */
+	F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
+	return 0;
+}
+
+int f2fs_getxattr(struct inode *inode, int index, const char *name,
+		void *buffer, size_t buffer_size, struct page *ipage)
+{
+	struct f2fs_xattr_entry *entry;
+	void *base_addr;
+	int error = 0;
+	size_t size, len;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	len = strlen(name);
+	if (len > F2FS_NAME_LEN)
+		return -ERANGE;
+
+	base_addr = read_all_xattrs(inode, ipage);
+	if (!base_addr)
+		return -ENOMEM;
+
+	entry = __find_xattr(base_addr, index, len, name);
+	if (IS_XATTR_LAST_ENTRY(entry)) {
+		error = -ENODATA;
+		goto cleanup;
+	}
+
+	size = le16_to_cpu(entry->e_value_size);
+
+	if (buffer && size > buffer_size) {
+		error = -ERANGE;
+		goto cleanup;
+	}
+
+	if (buffer) {
+		char *pval = entry->e_name + entry->e_name_len;
+		memcpy(buffer, pval, size);
+	}
+	error = size;
+
+cleanup:
+	kzfree(base_addr);
+	return error;
+}
+
+ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct f2fs_xattr_entry *entry;
+	void *base_addr;
+	int error = 0;
+	size_t rest = buffer_size;
+
+	base_addr = read_all_xattrs(inode, NULL);
+	if (!base_addr)
+		return -ENOMEM;
+
+	list_for_each_xattr(entry, base_addr) {
+		const struct xattr_handler *handler =
+			f2fs_xattr_handler(entry->e_name_index);
+		size_t size;
+
+		if (!handler)
+			continue;
+
+		size = handler->list(dentry, buffer, rest, entry->e_name,
+				entry->e_name_len, handler->flags);
+		if (buffer && size > rest) {
+			error = -ERANGE;
+			goto cleanup;
+		}
+
+		if (buffer)
+			buffer += size;
+		rest -= size;
+	}
+	error = buffer_size - rest;
+cleanup:
+	kzfree(base_addr);
+	return error;
+}
+
+static int __f2fs_setxattr(struct inode *inode, int index,
+			const char *name, const void *value, size_t size,
+			struct page *ipage, int flags)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct f2fs_xattr_entry *here, *last;
+	void *base_addr;
+	int found, newsize;
+	size_t len;
+	__u32 new_hsize;
+	int error = -ENOMEM;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (value == NULL)
+		size = 0;
+
+	len = strlen(name);
+
+	if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode))
+		return -ERANGE;
+
+	base_addr = read_all_xattrs(inode, ipage);
+	if (!base_addr)
+		goto exit;
+
+	/* find entry with wanted name. */
+	here = __find_xattr(base_addr, index, len, name);
+
+	found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1;
+
+	if ((flags & XATTR_REPLACE) && !found) {
+		error = -ENODATA;
+		goto exit;
+	} else if ((flags & XATTR_CREATE) && found) {
+		error = -EEXIST;
+		goto exit;
+	}
+
+	last = here;
+	while (!IS_XATTR_LAST_ENTRY(last))
+		last = XATTR_NEXT_ENTRY(last);
+
+	newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size);
+
+	/* 1. Check space */
+	if (value) {
+		int free;
+		/*
+		 * If value is NULL, it is remove operation.
+		 * In case of update operation, we calculate free.
+		 */
+		free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
+		if (found)
+			free = free + ENTRY_SIZE(here);
+
+		if (unlikely(free < newsize)) {
+			error = -ENOSPC;
+			goto exit;
+		}
+	}
+
+	/* 2. Remove old entry */
+	if (found) {
+		/*
+		 * If entry is found, remove old entry.
+		 * If not found, remove operation is not needed.
+		 */
+		struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here);
+		int oldsize = ENTRY_SIZE(here);
+
+		memmove(here, next, (char *)last - (char *)next);
+		last = (struct f2fs_xattr_entry *)((char *)last - oldsize);
+		memset(last, 0, oldsize);
+	}
+
+	new_hsize = (char *)last - (char *)base_addr;
+
+	/* 3. Write new entry */
+	if (value) {
+		char *pval;
+		/*
+		 * Before we come here, old entry is removed.
+		 * We just write new entry.
+		 */
+		memset(last, 0, newsize);
+		last->e_name_index = index;
+		last->e_name_len = len;
+		memcpy(last->e_name, name, len);
+		pval = last->e_name + len;
+		memcpy(pval, value, size);
+		last->e_value_size = cpu_to_le16(size);
+		new_hsize += newsize;
+	}
+
+	error = write_all_xattrs(inode, new_hsize, base_addr, ipage);
+	if (error)
+		goto exit;
+
+	if (is_inode_flag_set(fi, FI_ACL_MODE)) {
+		inode->i_mode = fi->i_acl_mode;
+		inode->i_ctime = CURRENT_TIME;
+		clear_inode_flag(fi, FI_ACL_MODE);
+	}
+
+	if (ipage)
+		update_inode(inode, ipage);
+	else
+		update_inode_page(inode);
+exit:
+	kzfree(base_addr);
+	return error;
+}
+
+int f2fs_setxattr(struct inode *inode, int index, const char *name,
+				const void *value, size_t size,
+				struct page *ipage, int flags)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	int err;
+
+	/* this case is only from init_inode_metadata */
+	if (ipage)
+		return __f2fs_setxattr(inode, index, name, value,
+						size, ipage, flags);
+	f2fs_balance_fs(sbi);
+
+	f2fs_lock_op(sbi);
+	/* protect xattr_ver */
+	down_write(&F2FS_I(inode)->i_sem);
+	err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
+	up_write(&F2FS_I(inode)->i_sem);
+	f2fs_unlock_op(sbi);
+
+	return err;
+}
diff --git a/kernel/fs/f2fs/xattr.h b/kernel/fs/f2fs/xattr.h
new file mode 100644
index 000000000..969d792ca
--- /dev/null
+++ b/kernel/fs/f2fs/xattr.h
@@ -0,0 +1,152 @@
+/*
+ * fs/f2fs/xattr.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/xattr.h
+ *
+ * On-disk format of extended attributes for the ext2 filesystem.
+ *
+ * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_XATTR_H__
+#define __F2FS_XATTR_H__
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define F2FS_XATTR_MAGIC                0xF2F52011
+
+/* Maximum number of references to one attribute block */
+#define F2FS_XATTR_REFCOUNT_MAX         1024
+
+/* Name indexes */
+#define F2FS_SYSTEM_ADVISE_PREFIX		"system.advise"
+#define F2FS_XATTR_INDEX_USER			1
+#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS	2
+#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT	3
+#define F2FS_XATTR_INDEX_TRUSTED		4
+#define F2FS_XATTR_INDEX_LUSTRE			5
+#define F2FS_XATTR_INDEX_SECURITY		6
+#define F2FS_XATTR_INDEX_ADVISE			7
+
+struct f2fs_xattr_header {
+	__le32  h_magic;        /* magic number for identification */
+	__le32  h_refcount;     /* reference count */
+	__u32   h_reserved[4];  /* zero right now */
+};
+
+struct f2fs_xattr_entry {
+	__u8    e_name_index;
+	__u8    e_name_len;
+	__le16  e_value_size;   /* size of attribute value */
+	char    e_name[0];      /* attribute name */
+};
+
+#define XATTR_HDR(ptr)		((struct f2fs_xattr_header *)(ptr))
+#define XATTR_ENTRY(ptr)	((struct f2fs_xattr_entry *)(ptr))
+#define XATTR_FIRST_ENTRY(ptr)	(XATTR_ENTRY(XATTR_HDR(ptr) + 1))
+#define XATTR_ROUND		(3)
+
+#define XATTR_ALIGN(size)	((size + XATTR_ROUND) & ~XATTR_ROUND)
+
+#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
+			entry->e_name_len + le16_to_cpu(entry->e_value_size)))
+
+#define XATTR_NEXT_ENTRY(entry)	((struct f2fs_xattr_entry *)((char *)(entry) +\
+			ENTRY_SIZE(entry)))
+
+#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define list_for_each_xattr(entry, addr) \
+		for (entry = XATTR_FIRST_ENTRY(addr);\
+				!IS_XATTR_LAST_ENTRY(entry);\
+				entry = XATTR_NEXT_ENTRY(entry))
+
+#define MIN_OFFSET(i)	XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE -	\
+				sizeof(struct node_footer) - sizeof(__u32))
+
+#define MAX_VALUE_LEN(i)	(MIN_OFFSET(i) -			\
+				sizeof(struct f2fs_xattr_header) -	\
+				sizeof(struct f2fs_xattr_entry))
+
+/*
+ * On-disk structure of f2fs_xattr
+ * We use inline xattrs space + 1 block for xattr.
+ *
+ * +--------------------+
+ * | f2fs_xattr_header  |
+ * |                    |
+ * +--------------------+
+ * | f2fs_xattr_entry   |
+ * | .e_name_index = 1  |
+ * | .e_name_len = 3    |
+ * | .e_value_size = 14 |
+ * | .e_name = "foo"    |
+ * | "value_of_xattr"   |<- value_offs = e_name + e_name_len
+ * +--------------------+
+ * | f2fs_xattr_entry   |
+ * | .e_name_index = 4  |
+ * | .e_name = "bar"    |
+ * +--------------------+
+ * |                    |
+ * |        Free        |
+ * |                    |
+ * +--------------------+<- MIN_OFFSET
+ * |   node_footer      |
+ * | (nid, ino, offset) |
+ * +--------------------+
+ *
+ **/
+
+#ifdef CONFIG_F2FS_FS_XATTR
+extern const struct xattr_handler f2fs_xattr_user_handler;
+extern const struct xattr_handler f2fs_xattr_trusted_handler;
+extern const struct xattr_handler f2fs_xattr_advise_handler;
+extern const struct xattr_handler f2fs_xattr_security_handler;
+
+extern const struct xattr_handler *f2fs_xattr_handlers[];
+
+extern int f2fs_setxattr(struct inode *, int, const char *,
+				const void *, size_t, struct page *, int);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *,
+						size_t, struct page *);
+extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
+#else
+
+#define f2fs_xattr_handlers	NULL
+static inline int f2fs_setxattr(struct inode *inode, int index,
+		const char *name, const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+static inline int f2fs_getxattr(struct inode *inode, int index,
+			const char *name, void *buffer,
+			size_t buffer_size, struct page *dpage)
+{
+	return -EOPNOTSUPP;
+}
+static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
+		size_t buffer_size)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+#ifdef CONFIG_F2FS_FS_SECURITY
+extern int f2fs_init_security(struct inode *, struct inode *,
+				const struct qstr *, struct page *);
+#else
+static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr, struct page *ipage)
+{
+	return 0;
+}
+#endif
+#endif /* __F2FS_XATTR_H__ */
diff --git a/kernel/fs/fat/Kconfig b/kernel/fs/fat/Kconfig
new file mode 100644
index 000000000..182f9ffe2
--- /dev/null
+++ b/kernel/fs/fat/Kconfig
@@ -0,0 +1,100 @@
+config FAT_FS
+	tristate
+	select NLS
+	help
+	  If you want to use one of the FAT-based file systems (the MS-DOS and
+	  VFAT (Windows 95) file systems), then you must say Y or M here
+	  to include FAT support. You will then be able to mount partitions or
+	  diskettes with FAT-based file systems and transparently access the
+	  files on them, i.e. MSDOS files will look and behave just like all
+	  other Unix files.
+
+	  This FAT support is not a file system in itself, it only provides
+	  the foundation for the other file systems. You will have to say Y or
+	  M to at least one of "MSDOS fs support" or "VFAT fs support" in
+	  order to make use of it.
+
+	  Another way to read and write MSDOS floppies and hard drive
+	  partitions from within Linux (but not transparently) is with the
+	  mtools ("man mtools") program suite. You don't need to say Y here in
+	  order to do that.
+
+	  If you need to move large files on floppies between a DOS and a
+	  Linux box, say Y here, mount the floppy under Linux with an MSDOS
+	  file system and use GNU tar's M option. GNU tar is a program
+	  available for Unix and DOS ("man tar" or "info tar").
+
+	  The FAT support will enlarge your kernel by about 37 KB. If unsure,
+	  say Y.
+
+	  To compile this as a module, choose M here: the module will be called
+	  fat.  Note that if you compile the FAT support as a module, you
+	  cannot compile any of the FAT-based file systems into the kernel
+	  -- they will have to be modules as well.
+
+config MSDOS_FS
+	tristate "MSDOS fs support"
+	select FAT_FS
+	help
+	  This allows you to mount MSDOS partitions of your hard drive (unless
+	  they are compressed; to access compressed MSDOS partitions under
+	  Linux, you can either use the DOS emulator DOSEMU, described in the
+	  DOSEMU-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
+	  <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
+	  intend to use dosemu with a non-compressed MSDOS partition, say Y
+	  here) and MSDOS floppies. This means that file access becomes
+	  transparent, i.e. the MSDOS files look and behave just like all
+	  other Unix files.
+
+	  If you have Windows 95 or Windows NT installed on your MSDOS
+	  partitions, you should use the VFAT file system (say Y to "VFAT fs
+	  support" below), or you will not be able to see the long filenames
+	  generated by Windows 95 / Windows NT.
+
+	  This option will enlarge your kernel by about 7 KB. If unsure,
+	  answer Y. This will only work if you said Y to "DOS FAT fs support"
+	  as well. To compile this as a module, choose M here: the module will
+	  be called msdos.
+
+config VFAT_FS
+	tristate "VFAT (Windows-95) fs support"
+	select FAT_FS
+	help
+	  This option provides support for normal Windows file systems with
+	  long filenames.  That includes non-compressed FAT-based file systems
+	  used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
+	  programs from the mtools package.
+
+	  The VFAT support enlarges your kernel by about 10 KB and it only
+	  works if you said Y to the "DOS FAT fs support" above.  Please read
+	  the file <file:Documentation/filesystems/vfat.txt> for details.  If
+	  unsure, say Y.
+
+	  To compile this as a module, choose M here: the module will be called
+	  vfat.
+
+config FAT_DEFAULT_CODEPAGE
+	int "Default codepage for FAT"
+	depends on MSDOS_FS || VFAT_FS
+	default 437
+	help
+	  This option should be set to the codepage of your FAT filesystems.
+	  It can be overridden with the "codepage" mount option.
+	  See <file:Documentation/filesystems/vfat.txt> for more information.
+
+config FAT_DEFAULT_IOCHARSET
+	string "Default iocharset for FAT"
+	depends on VFAT_FS
+	default "iso8859-1"
+	help
+	  Set this to the default input/output character set you'd
+	  like FAT to use. It should probably match the character set
+	  that most of your FAT filesystems use, and can be overridden
+	  with the "iocharset" mount option for FAT filesystems.
+	  Note that "utf8" is not recommended for FAT filesystems.
+	  If unsure, you shouldn't set "utf8" here.
+	  See <file:Documentation/filesystems/vfat.txt> for more information.
+
+	  Enable any character sets you need in File Systems/Native Language
+	  Support.
diff --git a/kernel/fs/fat/Makefile b/kernel/fs/fat/Makefile
new file mode 100644
index 000000000..964b634f6
--- /dev/null
+++ b/kernel/fs/fat/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the Linux fat filesystem support.
+#
+
+obj-$(CONFIG_FAT_FS) += fat.o
+obj-$(CONFIG_VFAT_FS) += vfat.o
+obj-$(CONFIG_MSDOS_FS) += msdos.o
+
+fat-y := cache.o dir.o fatent.o file.o inode.o misc.o nfs.o
+vfat-y := namei_vfat.o
+msdos-y := namei_msdos.o
diff --git a/kernel/fs/fat/cache.c b/kernel/fs/fat/cache.c
new file mode 100644
index 000000000..93fc62232
--- /dev/null
+++ b/kernel/fs/fat/cache.c
@@ -0,0 +1,351 @@
+/*
+ *  linux/fs/fat/cache.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  Mar 1999. AV. Changed cache, so that it uses the starting cluster instead
+ *	of inode number.
+ *  May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers.
+ */
+
+#include <linux/slab.h>
+#include "fat.h"
+
+/* this must be > 0. */
+#define FAT_MAX_CACHE	8
+
+struct fat_cache {
+	struct list_head cache_list;
+	int nr_contig;	/* number of contiguous clusters */
+	int fcluster;	/* cluster number in the file. */
+	int dcluster;	/* cluster number on disk. */
+};
+
+struct fat_cache_id {
+	unsigned int id;
+	int nr_contig;
+	int fcluster;
+	int dcluster;
+};
+
+static inline int fat_max_cache(struct inode *inode)
+{
+	return FAT_MAX_CACHE;
+}
+
+static struct kmem_cache *fat_cache_cachep;
+
+static void init_once(void *foo)
+{
+	struct fat_cache *cache = (struct fat_cache *)foo;
+
+	INIT_LIST_HEAD(&cache->cache_list);
+}
+
+int __init fat_cache_init(void)
+{
+	fat_cache_cachep = kmem_cache_create("fat_cache",
+				sizeof(struct fat_cache),
+				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+				init_once);
+	if (fat_cache_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void fat_cache_destroy(void)
+{
+	kmem_cache_destroy(fat_cache_cachep);
+}
+
+static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
+{
+	return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
+}
+
+static inline void fat_cache_free(struct fat_cache *cache)
+{
+	BUG_ON(!list_empty(&cache->cache_list));
+	kmem_cache_free(fat_cache_cachep, cache);
+}
+
+static inline void fat_cache_update_lru(struct inode *inode,
+					struct fat_cache *cache)
+{
+	if (MSDOS_I(inode)->cache_lru.next != &cache->cache_list)
+		list_move(&cache->cache_list, &MSDOS_I(inode)->cache_lru);
+}
+
+static int fat_cache_lookup(struct inode *inode, int fclus,
+			    struct fat_cache_id *cid,
+			    int *cached_fclus, int *cached_dclus)
+{
+	static struct fat_cache nohit = { .fcluster = 0, };
+
+	struct fat_cache *hit = &nohit, *p;
+	int offset = -1;
+
+	spin_lock(&MSDOS_I(inode)->cache_lru_lock);
+	list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) {
+		/* Find the cache of "fclus" or nearest cache. */
+		if (p->fcluster <= fclus && hit->fcluster < p->fcluster) {
+			hit = p;
+			if ((hit->fcluster + hit->nr_contig) < fclus) {
+				offset = hit->nr_contig;
+			} else {
+				offset = fclus - hit->fcluster;
+				break;
+			}
+		}
+	}
+	if (hit != &nohit) {
+		fat_cache_update_lru(inode, hit);
+
+		cid->id = MSDOS_I(inode)->cache_valid_id;
+		cid->nr_contig = hit->nr_contig;
+		cid->fcluster = hit->fcluster;
+		cid->dcluster = hit->dcluster;
+		*cached_fclus = cid->fcluster + offset;
+		*cached_dclus = cid->dcluster + offset;
+	}
+	spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
+
+	return offset;
+}
+
+static struct fat_cache *fat_cache_merge(struct inode *inode,
+					 struct fat_cache_id *new)
+{
+	struct fat_cache *p;
+
+	list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) {
+		/* Find the same part as "new" in cluster-chain. */
+		if (p->fcluster == new->fcluster) {
+			BUG_ON(p->dcluster != new->dcluster);
+			if (new->nr_contig > p->nr_contig)
+				p->nr_contig = new->nr_contig;
+			return p;
+		}
+	}
+	return NULL;
+}
+
+static void fat_cache_add(struct inode *inode, struct fat_cache_id *new)
+{
+	struct fat_cache *cache, *tmp;
+
+	if (new->fcluster == -1) /* dummy cache */
+		return;
+
+	spin_lock(&MSDOS_I(inode)->cache_lru_lock);
+	if (new->id != FAT_CACHE_VALID &&
+	    new->id != MSDOS_I(inode)->cache_valid_id)
+		goto out;	/* this cache was invalidated */
+
+	cache = fat_cache_merge(inode, new);
+	if (cache == NULL) {
+		if (MSDOS_I(inode)->nr_caches < fat_max_cache(inode)) {
+			MSDOS_I(inode)->nr_caches++;
+			spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
+
+			tmp = fat_cache_alloc(inode);
+			if (!tmp) {
+				spin_lock(&MSDOS_I(inode)->cache_lru_lock);
+				MSDOS_I(inode)->nr_caches--;
+				spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
+				return;
+			}
+
+			spin_lock(&MSDOS_I(inode)->cache_lru_lock);
+			cache = fat_cache_merge(inode, new);
+			if (cache != NULL) {
+				MSDOS_I(inode)->nr_caches--;
+				fat_cache_free(tmp);
+				goto out_update_lru;
+			}
+			cache = tmp;
+		} else {
+			struct list_head *p = MSDOS_I(inode)->cache_lru.prev;
+			cache = list_entry(p, struct fat_cache, cache_list);
+		}
+		cache->fcluster = new->fcluster;
+		cache->dcluster = new->dcluster;
+		cache->nr_contig = new->nr_contig;
+	}
+out_update_lru:
+	fat_cache_update_lru(inode, cache);
+out:
+	spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
+}
+
+/*
+ * Cache invalidation occurs rarely, thus the LRU chain is not updated. It
+ * fixes itself after a while.
+ */
+static void __fat_cache_inval_inode(struct inode *inode)
+{
+	struct msdos_inode_info *i = MSDOS_I(inode);
+	struct fat_cache *cache;
+
+	while (!list_empty(&i->cache_lru)) {
+		cache = list_entry(i->cache_lru.next,
+				   struct fat_cache, cache_list);
+		list_del_init(&cache->cache_list);
+		i->nr_caches--;
+		fat_cache_free(cache);
+	}
+	/* Update. The copy of caches before this id is discarded. */
+	i->cache_valid_id++;
+	if (i->cache_valid_id == FAT_CACHE_VALID)
+		i->cache_valid_id++;
+}
+
+void fat_cache_inval_inode(struct inode *inode)
+{
+	spin_lock(&MSDOS_I(inode)->cache_lru_lock);
+	__fat_cache_inval_inode(inode);
+	spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
+}
+
+static inline int cache_contiguous(struct fat_cache_id *cid, int dclus)
+{
+	cid->nr_contig++;
+	return ((cid->dcluster + cid->nr_contig) == dclus);
+}
+
+static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus)
+{
+	cid->id = FAT_CACHE_VALID;
+	cid->fcluster = fclus;
+	cid->dcluster = dclus;
+	cid->nr_contig = 0;
+}
+
+int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
+{
+	struct super_block *sb = inode->i_sb;
+	const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits;
+	struct fat_entry fatent;
+	struct fat_cache_id cid;
+	int nr;
+
+	BUG_ON(MSDOS_I(inode)->i_start == 0);
+
+	*fclus = 0;
+	*dclus = MSDOS_I(inode)->i_start;
+	if (cluster == 0)
+		return 0;
+
+	if (fat_cache_lookup(inode, cluster, &cid, fclus, dclus) < 0) {
+		/*
+		 * dummy, always not contiguous
+		 * This is reinitialized by cache_init(), later.
+		 */
+		cache_init(&cid, -1, -1);
+	}
+
+	fatent_init(&fatent);
+	while (*fclus < cluster) {
+		/* prevent the infinite loop of cluster chain */
+		if (*fclus > limit) {
+			fat_fs_error_ratelimit(sb,
+					"%s: detected the cluster chain loop"
+					" (i_pos %lld)", __func__,
+					MSDOS_I(inode)->i_pos);
+			nr = -EIO;
+			goto out;
+		}
+
+		nr = fat_ent_read(inode, &fatent, *dclus);
+		if (nr < 0)
+			goto out;
+		else if (nr == FAT_ENT_FREE) {
+			fat_fs_error_ratelimit(sb,
+				       "%s: invalid cluster chain (i_pos %lld)",
+				       __func__,
+				       MSDOS_I(inode)->i_pos);
+			nr = -EIO;
+			goto out;
+		} else if (nr == FAT_ENT_EOF) {
+			fat_cache_add(inode, &cid);
+			goto out;
+		}
+		(*fclus)++;
+		*dclus = nr;
+		if (!cache_contiguous(&cid, *dclus))
+			cache_init(&cid, *fclus, *dclus);
+	}
+	nr = 0;
+	fat_cache_add(inode, &cid);
+out:
+	fatent_brelse(&fatent);
+	return nr;
+}
+
+static int fat_bmap_cluster(struct inode *inode, int cluster)
+{
+	struct super_block *sb = inode->i_sb;
+	int ret, fclus, dclus;
+
+	if (MSDOS_I(inode)->i_start == 0)
+		return 0;
+
+	ret = fat_get_cluster(inode, cluster, &fclus, &dclus);
+	if (ret < 0)
+		return ret;
+	else if (ret == FAT_ENT_EOF) {
+		fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)",
+			     __func__, MSDOS_I(inode)->i_pos);
+		return -EIO;
+	}
+	return dclus;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+	     unsigned long *mapped_blocks, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	const unsigned long blocksize = sb->s_blocksize;
+	const unsigned char blocksize_bits = sb->s_blocksize_bits;
+	sector_t last_block;
+	int cluster, offset;
+
+	*phys = 0;
+	*mapped_blocks = 0;
+	if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) {
+		if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) {
+			*phys = sector + sbi->dir_start;
+			*mapped_blocks = 1;
+		}
+		return 0;
+	}
+
+	last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+	if (sector >= last_block) {
+		if (!create)
+			return 0;
+
+		/*
+		 * ->mmu_private can access on only allocation path.
+		 * (caller must hold ->i_mutex)
+		 */
+		last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+			>> blocksize_bits;
+		if (sector >= last_block)
+			return 0;
+	}
+
+	cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+	offset  = sector & (sbi->sec_per_clus - 1);
+	cluster = fat_bmap_cluster(inode, cluster);
+	if (cluster < 0)
+		return cluster;
+	else if (cluster) {
+		*phys = fat_clus_to_blknr(sbi, cluster) + offset;
+		*mapped_blocks = sbi->sec_per_clus - offset;
+		if (*mapped_blocks > last_block - sector)
+			*mapped_blocks = last_block - sector;
+	}
+	return 0;
+}
diff --git a/kernel/fs/fat/dir.c b/kernel/fs/fat/dir.c
new file mode 100644
index 000000000..4afc4d9d2
--- /dev/null
+++ b/kernel/fs/fat/dir.c
@@ -0,0 +1,1402 @@
+/*
+ *  linux/fs/fat/dir.c
+ *
+ *  directory handling functions for fat-based filesystems
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
+ *
+ *  VFAT extensions by Gordon Chaffee <chaffee@plateau.cs.berkeley.edu>
+ *  Merged with msdos fs by Henrik Storner <storner@osiris.ping.dk>
+ *  Rewritten for constant inumbers. Plugged buffer overrun in readdir(). AV
+ *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
+ */
+
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include "fat.h"
+
+/*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE	((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS	((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE	(FAT_MAX_UNI_CHARS * sizeof(wchar_t))
+
+static inline unsigned char fat_tolower(unsigned char c)
+{
+	return ((c >= 'A') && (c <= 'Z')) ? c+32 : c;
+}
+
+static inline loff_t fat_make_i_pos(struct super_block *sb,
+				    struct buffer_head *bh,
+				    struct msdos_dir_entry *de)
+{
+	return ((loff_t)bh->b_blocknr << MSDOS_SB(sb)->dir_per_block_bits)
+		| (de - (struct msdos_dir_entry *)bh->b_data);
+}
+
+static inline void fat_dir_readahead(struct inode *dir, sector_t iblock,
+				     sector_t phys)
+{
+	struct super_block *sb = dir->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bh;
+	int sec;
+
+	/* This is not a first sector of cluster, or sec_per_clus == 1 */
+	if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1)
+		return;
+	/* root dir of FAT12/FAT16 */
+	if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO))
+		return;
+
+	bh = sb_find_get_block(sb, phys);
+	if (bh == NULL || !buffer_uptodate(bh)) {
+		for (sec = 0; sec < sbi->sec_per_clus; sec++)
+			sb_breadahead(sb, phys + sec);
+	}
+	brelse(bh);
+}
+
+/* Returns the inode number of the directory entry at offset pos. If bh is
+   non-NULL, it is brelse'd before. Pos is incremented. The buffer header is
+   returned in bh.
+   AV. Most often we do it item-by-item. Makes sense to optimize.
+   AV. OK, there we go: if both bh and de are non-NULL we assume that we just
+   AV. want the next entry (took one explicit de=NULL in vfat/namei.c).
+   AV. It's done in fat_get_entry() (inlined), here the slow case lives.
+   AV. Additionally, when we return -1 (i.e. reached the end of directory)
+   AV. we make bh NULL.
+ */
+static int fat__get_entry(struct inode *dir, loff_t *pos,
+			  struct buffer_head **bh, struct msdos_dir_entry **de)
+{
+	struct super_block *sb = dir->i_sb;
+	sector_t phys, iblock;
+	unsigned long mapped_blocks;
+	int err, offset;
+
+next:
+	if (*bh)
+		brelse(*bh);
+
+	*bh = NULL;
+	iblock = *pos >> sb->s_blocksize_bits;
+	err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+	if (err || !phys)
+		return -1;	/* beyond EOF or error */
+
+	fat_dir_readahead(dir, iblock, phys);
+
+	*bh = sb_bread(sb, phys);
+	if (*bh == NULL) {
+		fat_msg_ratelimit(sb, KERN_ERR,
+			"Directory bread(block %llu) failed", (llu)phys);
+		/* skip this block */
+		*pos = (iblock + 1) << sb->s_blocksize_bits;
+		goto next;
+	}
+
+	offset = *pos & (sb->s_blocksize - 1);
+	*pos += sizeof(struct msdos_dir_entry);
+	*de = (struct msdos_dir_entry *)((*bh)->b_data + offset);
+
+	return 0;
+}
+
+static inline int fat_get_entry(struct inode *dir, loff_t *pos,
+				struct buffer_head **bh,
+				struct msdos_dir_entry **de)
+{
+	/* Fast stuff first */
+	if (*bh && *de &&
+	   (*de - (struct msdos_dir_entry *)(*bh)->b_data) <
+				MSDOS_SB(dir->i_sb)->dir_per_block - 1) {
+		*pos += sizeof(struct msdos_dir_entry);
+		(*de)++;
+		return 0;
+	}
+	return fat__get_entry(dir, pos, bh, de);
+}
+
+/*
+ * Convert Unicode 16 to UTF-8, translated Unicode, or ASCII.
+ * If uni_xlate is enabled and we can't get a 1:1 conversion, use a
+ * colon as an escape character since it is normally invalid on the vfat
+ * filesystem. The following four characters are the hexadecimal digits
+ * of Unicode value. This lets us do a full dump and restore of Unicode
+ * filenames. We could get into some trouble with long Unicode names,
+ * but ignore that right now.
+ * Ahem... Stack smashing in ring 0 isn't fun. Fixed.
+ */
+static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
+		       const wchar_t *uni, int len, struct nls_table *nls)
+{
+	int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate;
+	const wchar_t *ip;
+	wchar_t ec;
+	unsigned char *op;
+	int charlen;
+
+	ip = uni;
+	op = ascii;
+
+	while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
+		ec = *ip++;
+		charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE);
+		if (charlen > 0) {
+			op += charlen;
+			len -= charlen;
+		} else {
+			if (uni_xlate == 1) {
+				*op++ = ':';
+				op = hex_byte_pack(op, ec >> 8);
+				op = hex_byte_pack(op, ec);
+				len -= 5;
+			} else {
+				*op++ = '?';
+				len--;
+			}
+		}
+	}
+
+	if (unlikely(*ip)) {
+		fat_msg(sb, KERN_WARNING,
+			"filename was truncated while converting.");
+	}
+
+	*op = 0;
+	return op - ascii;
+}
+
+static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
+				unsigned char *buf, int size)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	if (sbi->options.utf8)
+		return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
+				UTF16_HOST_ENDIAN, buf, size);
+	else
+		return uni16_to_x8(sb, buf, uni, size, sbi->nls_io);
+}
+
+static inline int
+fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
+{
+	int charlen;
+
+	charlen = t->char2uni(c, clen, uni);
+	if (charlen < 0) {
+		*uni = 0x003f;	/* a question mark */
+		charlen = 1;
+	}
+	return charlen;
+}
+
+static inline int
+fat_short2lower_uni(struct nls_table *t, unsigned char *c,
+		    int clen, wchar_t *uni)
+{
+	int charlen;
+	wchar_t wc;
+
+	charlen = t->char2uni(c, clen, &wc);
+	if (charlen < 0) {
+		*uni = 0x003f;	/* a question mark */
+		charlen = 1;
+	} else if (charlen <= 1) {
+		unsigned char nc = t->charset2lower[*c];
+
+		if (!nc)
+			nc = *c;
+
+		charlen = t->char2uni(&nc, 1, uni);
+		if (charlen < 0) {
+			*uni = 0x003f;	/* a question mark */
+			charlen = 1;
+		}
+	} else
+		*uni = wc;
+
+	return charlen;
+}
+
+static inline int
+fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size,
+		  wchar_t *uni_buf, unsigned short opt, int lower)
+{
+	int len = 0;
+
+	if (opt & VFAT_SFN_DISPLAY_LOWER)
+		len =  fat_short2lower_uni(nls, buf, buf_size, uni_buf);
+	else if (opt & VFAT_SFN_DISPLAY_WIN95)
+		len = fat_short2uni(nls, buf, buf_size, uni_buf);
+	else if (opt & VFAT_SFN_DISPLAY_WINNT) {
+		if (lower)
+			len = fat_short2lower_uni(nls, buf, buf_size, uni_buf);
+		else
+			len = fat_short2uni(nls, buf, buf_size, uni_buf);
+	} else
+		len = fat_short2uni(nls, buf, buf_size, uni_buf);
+
+	return len;
+}
+
+static inline int fat_name_match(struct msdos_sb_info *sbi,
+				 const unsigned char *a, int a_len,
+				 const unsigned char *b, int b_len)
+{
+	if (a_len != b_len)
+		return 0;
+
+	if (sbi->options.name_check != 's')
+		return !nls_strnicmp(sbi->nls_io, a, b, a_len);
+	else
+		return !memcmp(a, b, a_len);
+}
+
+enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, };
+
+/**
+ * fat_parse_long - Parse extended directory entry.
+ *
+ * This function returns zero on success, negative value on error, or one of
+ * the following:
+ *
+ * %PARSE_INVALID - Directory entry is invalid.
+ * %PARSE_NOT_LONGNAME - Directory entry does not contain longname.
+ * %PARSE_EOF - Directory has no more entries.
+ */
+static int fat_parse_long(struct inode *dir, loff_t *pos,
+			  struct buffer_head **bh, struct msdos_dir_entry **de,
+			  wchar_t **unicode, unsigned char *nr_slots)
+{
+	struct msdos_dir_slot *ds;
+	unsigned char id, slot, slots, alias_checksum;
+
+	if (!*unicode) {
+		*unicode = __getname();
+		if (!*unicode) {
+			brelse(*bh);
+			return -ENOMEM;
+		}
+	}
+parse_long:
+	slots = 0;
+	ds = (struct msdos_dir_slot *)*de;
+	id = ds->id;
+	if (!(id & 0x40))
+		return PARSE_INVALID;
+	slots = id & ~0x40;
+	if (slots > 20 || !slots)	/* ceil(256 * 2 / 26) */
+		return PARSE_INVALID;
+	*nr_slots = slots;
+	alias_checksum = ds->alias_checksum;
+
+	slot = slots;
+	while (1) {
+		int offset;
+
+		slot--;
+		offset = slot * 13;
+		fat16_towchar(*unicode + offset, ds->name0_4, 5);
+		fat16_towchar(*unicode + offset + 5, ds->name5_10, 6);
+		fat16_towchar(*unicode + offset + 11, ds->name11_12, 2);
+
+		if (ds->id & 0x40)
+			(*unicode)[offset + 13] = 0;
+		if (fat_get_entry(dir, pos, bh, de) < 0)
+			return PARSE_EOF;
+		if (slot == 0)
+			break;
+		ds = (struct msdos_dir_slot *)*de;
+		if (ds->attr != ATTR_EXT)
+			return PARSE_NOT_LONGNAME;
+		if ((ds->id & ~0x40) != slot)
+			goto parse_long;
+		if (ds->alias_checksum != alias_checksum)
+			goto parse_long;
+	}
+	if ((*de)->name[0] == DELETED_FLAG)
+		return PARSE_INVALID;
+	if ((*de)->attr == ATTR_EXT)
+		goto parse_long;
+	if (IS_FREE((*de)->name) || ((*de)->attr & ATTR_VOLUME))
+		return PARSE_INVALID;
+	if (fat_checksum((*de)->name) != alias_checksum)
+		*nr_slots = 0;
+
+	return 0;
+}
+
+/**
+ * fat_parse_short - Parse MS-DOS (short) directory entry.
+ * @sb:		superblock
+ * @de:		directory entry to parse
+ * @name:	FAT_MAX_SHORT_SIZE array in which to place extracted name
+ * @dot_hidden:	Nonzero == prepend '.' to names with ATTR_HIDDEN
+ *
+ * Returns the number of characters extracted into 'name'.
+ */
+static int fat_parse_short(struct super_block *sb,
+			   const struct msdos_dir_entry *de,
+			   unsigned char *name, int dot_hidden)
+{
+	const struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int isvfat = sbi->options.isvfat;
+	int nocase = sbi->options.nocase;
+	unsigned short opt_shortname = sbi->options.shortname;
+	struct nls_table *nls_disk = sbi->nls_disk;
+	wchar_t uni_name[14];
+	unsigned char c, work[MSDOS_NAME];
+	unsigned char *ptname = name;
+	int chi, chl, i, j, k;
+	int dotoffset = 0;
+	int name_len = 0, uni_len = 0;
+
+	if (!isvfat && dot_hidden && (de->attr & ATTR_HIDDEN)) {
+		*ptname++ = '.';
+		dotoffset = 1;
+	}
+
+	memcpy(work, de->name, sizeof(work));
+	/* see namei.c, msdos_format_name */
+	if (work[0] == 0x05)
+		work[0] = 0xE5;
+
+	/* Filename */
+	for (i = 0, j = 0; i < 8;) {
+		c = work[i];
+		if (!c)
+			break;
+		chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
+					&uni_name[j++], opt_shortname,
+					de->lcase & CASE_LOWER_BASE);
+		if (chl <= 1) {
+			if (!isvfat)
+				ptname[i] = nocase ? c : fat_tolower(c);
+			i++;
+			if (c != ' ') {
+				name_len = i;
+				uni_len  = j;
+			}
+		} else {
+			uni_len = j;
+			if (isvfat)
+				i += min(chl, 8-i);
+			else {
+				for (chi = 0; chi < chl && i < 8; chi++, i++)
+					ptname[i] = work[i];
+			}
+			if (chl)
+				name_len = i;
+		}
+	}
+
+	i = name_len;
+	j = uni_len;
+	fat_short2uni(nls_disk, ".", 1, &uni_name[j++]);
+	if (!isvfat)
+		ptname[i] = '.';
+	i++;
+
+	/* Extension */
+	for (k = 8; k < MSDOS_NAME;) {
+		c = work[k];
+		if (!c)
+			break;
+		chl = fat_shortname2uni(nls_disk, &work[k], MSDOS_NAME - k,
+					&uni_name[j++], opt_shortname,
+					de->lcase & CASE_LOWER_EXT);
+		if (chl <= 1) {
+			k++;
+			if (!isvfat)
+				ptname[i] = nocase ? c : fat_tolower(c);
+			i++;
+			if (c != ' ') {
+				name_len = i;
+				uni_len  = j;
+			}
+		} else {
+			uni_len = j;
+			if (isvfat) {
+				int offset = min(chl, MSDOS_NAME-k);
+				k += offset;
+				i += offset;
+			} else {
+				for (chi = 0; chi < chl && k < MSDOS_NAME;
+				     chi++, i++, k++) {
+						ptname[i] = work[k];
+				}
+			}
+			if (chl)
+				name_len = i;
+		}
+	}
+
+	if (name_len > 0) {
+		name_len += dotoffset;
+
+		if (sbi->options.isvfat) {
+			uni_name[uni_len] = 0x0000;
+			name_len = fat_uni_to_x8(sb, uni_name, name,
+						 FAT_MAX_SHORT_SIZE);
+		}
+	}
+
+	return name_len;
+}
+
+/*
+ * Return values: negative -> error/not found, 0 -> found.
+ */
+int fat_search_long(struct inode *inode, const unsigned char *name,
+		    int name_len, struct fat_slot_info *sinfo)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bh = NULL;
+	struct msdos_dir_entry *de;
+	unsigned char nr_slots;
+	wchar_t *unicode = NULL;
+	unsigned char bufname[FAT_MAX_SHORT_SIZE];
+	loff_t cpos = 0;
+	int err, len;
+
+	err = -ENOENT;
+	while (1) {
+		if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
+			goto end_of_dir;
+parse_record:
+		nr_slots = 0;
+		if (de->name[0] == DELETED_FLAG)
+			continue;
+		if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME))
+			continue;
+		if (de->attr != ATTR_EXT && IS_FREE(de->name))
+			continue;
+		if (de->attr == ATTR_EXT) {
+			int status = fat_parse_long(inode, &cpos, &bh, &de,
+						    &unicode, &nr_slots);
+			if (status < 0) {
+				err = status;
+				goto end_of_dir;
+			} else if (status == PARSE_INVALID)
+				continue;
+			else if (status == PARSE_NOT_LONGNAME)
+				goto parse_record;
+			else if (status == PARSE_EOF)
+				goto end_of_dir;
+		}
+
+		/* Never prepend '.' to hidden files here.
+		 * That is done only for msdos mounts (and only when
+		 * 'dotsOK=yes'); if we are executing here, it is in the
+		 * context of a vfat mount.
+		 */
+		len = fat_parse_short(sb, de, bufname, 0);
+		if (len == 0)
+			continue;
+
+		/* Compare shortname */
+		if (fat_name_match(sbi, name, name_len, bufname, len))
+			goto found;
+
+		if (nr_slots) {
+			void *longname = unicode + FAT_MAX_UNI_CHARS;
+			int size = PATH_MAX - FAT_MAX_UNI_SIZE;
+
+			/* Compare longname */
+			len = fat_uni_to_x8(sb, unicode, longname, size);
+			if (fat_name_match(sbi, name, name_len, longname, len))
+				goto found;
+		}
+	}
+
+found:
+	nr_slots++;	/* include the de */
+	sinfo->slot_off = cpos - nr_slots * sizeof(*de);
+	sinfo->nr_slots = nr_slots;
+	sinfo->de = de;
+	sinfo->bh = bh;
+	sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
+	err = 0;
+end_of_dir:
+	if (unicode)
+		__putname(unicode);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(fat_search_long);
+
+struct fat_ioctl_filldir_callback {
+	struct dir_context ctx;
+	void __user *dirent;
+	int result;
+	/* for dir ioctl */
+	const char *longname;
+	int long_len;
+	const char *shortname;
+	int short_len;
+};
+
+static int __fat_readdir(struct inode *inode, struct file *file,
+			 struct dir_context *ctx, int short_only,
+			 struct fat_ioctl_filldir_callback *both)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bh;
+	struct msdos_dir_entry *de;
+	unsigned char nr_slots;
+	wchar_t *unicode = NULL;
+	unsigned char bufname[FAT_MAX_SHORT_SIZE];
+	int isvfat = sbi->options.isvfat;
+	const char *fill_name = NULL;
+	int fake_offset = 0;
+	loff_t cpos;
+	int short_len = 0, fill_len = 0;
+	int ret = 0;
+
+	mutex_lock(&sbi->s_lock);
+
+	cpos = ctx->pos;
+	/* Fake . and .. for the root directory. */
+	if (inode->i_ino == MSDOS_ROOT_INO) {
+		if (!dir_emit_dots(file, ctx))
+			goto out;
+		if (ctx->pos == 2) {
+			fake_offset = 1;
+			cpos = 0;
+		}
+	}
+	if (cpos & (sizeof(struct msdos_dir_entry) - 1)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	bh = NULL;
+get_new:
+	if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
+		goto end_of_dir;
+parse_record:
+	nr_slots = 0;
+	/*
+	 * Check for long filename entry, but if short_only, we don't
+	 * need to parse long filename.
+	 */
+	if (isvfat && !short_only) {
+		if (de->name[0] == DELETED_FLAG)
+			goto record_end;
+		if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME))
+			goto record_end;
+		if (de->attr != ATTR_EXT && IS_FREE(de->name))
+			goto record_end;
+	} else {
+		if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name))
+			goto record_end;
+	}
+
+	if (isvfat && de->attr == ATTR_EXT) {
+		int status = fat_parse_long(inode, &cpos, &bh, &de,
+					    &unicode, &nr_slots);
+		if (status < 0) {
+			ctx->pos = cpos;
+			ret = status;
+			goto out;
+		} else if (status == PARSE_INVALID)
+			goto record_end;
+		else if (status == PARSE_NOT_LONGNAME)
+			goto parse_record;
+		else if (status == PARSE_EOF)
+			goto end_of_dir;
+
+		if (nr_slots) {
+			void *longname = unicode + FAT_MAX_UNI_CHARS;
+			int size = PATH_MAX - FAT_MAX_UNI_SIZE;
+			int len = fat_uni_to_x8(sb, unicode, longname, size);
+
+			fill_name = longname;
+			fill_len = len;
+			/* !both && !short_only, so we don't need shortname. */
+			if (!both)
+				goto start_filldir;
+
+			short_len = fat_parse_short(sb, de, bufname,
+						    sbi->options.dotsOK);
+			if (short_len == 0)
+				goto record_end;
+			/* hack for fat_ioctl_filldir() */
+			both->longname = fill_name;
+			both->long_len = fill_len;
+			both->shortname = bufname;
+			both->short_len = short_len;
+			fill_name = NULL;
+			fill_len = 0;
+			goto start_filldir;
+		}
+	}
+
+	short_len = fat_parse_short(sb, de, bufname, sbi->options.dotsOK);
+	if (short_len == 0)
+		goto record_end;
+
+	fill_name = bufname;
+	fill_len = short_len;
+
+start_filldir:
+	if (!fake_offset)
+		ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+
+	if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
+		if (!dir_emit_dot(file, ctx))
+			goto fill_failed;
+	} else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
+		if (!dir_emit_dotdot(file, ctx))
+			goto fill_failed;
+	} else {
+		unsigned long inum;
+		loff_t i_pos = fat_make_i_pos(sb, bh, de);
+		struct inode *tmp = fat_iget(sb, i_pos);
+		if (tmp) {
+			inum = tmp->i_ino;
+			iput(tmp);
+		} else
+			inum = iunique(sb, MSDOS_ROOT_INO);
+		if (!dir_emit(ctx, fill_name, fill_len, inum,
+			    (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
+			goto fill_failed;
+	}
+
+record_end:
+	fake_offset = 0;
+	ctx->pos = cpos;
+	goto get_new;
+end_of_dir:
+	ctx->pos = cpos;
+fill_failed:
+	brelse(bh);
+	if (unicode)
+		__putname(unicode);
+out:
+	mutex_unlock(&sbi->s_lock);
+	return ret;
+}
+
+static int fat_readdir(struct file *file, struct dir_context *ctx)
+{
+	return __fat_readdir(file_inode(file), file, ctx, 0, NULL);
+}
+
+#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type)			   \
+static int func(struct dir_context *ctx, const char *name, int name_len,   \
+			     loff_t offset, u64 ino, unsigned int d_type)  \
+{									   \
+	struct fat_ioctl_filldir_callback *buf =			   \
+		container_of(ctx, struct fat_ioctl_filldir_callback, ctx); \
+	struct dirent_type __user *d1 = buf->dirent;			   \
+	struct dirent_type __user *d2 = d1 + 1;				   \
+									   \
+	if (buf->result)						   \
+		return -EINVAL;						   \
+	buf->result++;							   \
+									   \
+	if (name != NULL) {						   \
+		/* dirent has only short name */			   \
+		if (name_len >= sizeof(d1->d_name))			   \
+			name_len = sizeof(d1->d_name) - 1;		   \
+									   \
+		if (put_user(0, d2->d_name)			||	   \
+		    put_user(0, &d2->d_reclen)			||	   \
+		    copy_to_user(d1->d_name, name, name_len)	||	   \
+		    put_user(0, d1->d_name + name_len)		||	   \
+		    put_user(name_len, &d1->d_reclen))			   \
+			goto efault;					   \
+	} else {							   \
+		/* dirent has short and long name */			   \
+		const char *longname = buf->longname;			   \
+		int long_len = buf->long_len;				   \
+		const char *shortname = buf->shortname;			   \
+		int short_len = buf->short_len;				   \
+									   \
+		if (long_len >= sizeof(d1->d_name))			   \
+			long_len = sizeof(d1->d_name) - 1;		   \
+		if (short_len >= sizeof(d1->d_name))			   \
+			short_len = sizeof(d1->d_name) - 1;		   \
+									   \
+		if (copy_to_user(d2->d_name, longname, long_len)	|| \
+		    put_user(0, d2->d_name + long_len)			|| \
+		    put_user(long_len, &d2->d_reclen)			|| \
+		    put_user(ino, &d2->d_ino)				|| \
+		    put_user(offset, &d2->d_off)			|| \
+		    copy_to_user(d1->d_name, shortname, short_len)	|| \
+		    put_user(0, d1->d_name + short_len)			|| \
+		    put_user(short_len, &d1->d_reclen))			   \
+			goto efault;					   \
+	}								   \
+	return 0;							   \
+efault:									   \
+	buf->result = -EFAULT;						   \
+	return -EFAULT;							   \
+}
+
+FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
+
+static int fat_ioctl_readdir(struct inode *inode, struct file *file,
+			     void __user *dirent, filldir_t filldir,
+			     int short_only, int both)
+{
+	struct fat_ioctl_filldir_callback buf = {
+		.ctx.actor = filldir,
+		.dirent = dirent
+	};
+	int ret;
+
+	buf.dirent = dirent;
+	buf.result = 0;
+	mutex_lock(&inode->i_mutex);
+	buf.ctx.pos = file->f_pos;
+	ret = -ENOENT;
+	if (!IS_DEADDIR(inode)) {
+		ret = __fat_readdir(inode, file, &buf.ctx,
+				    short_only, both ? &buf : NULL);
+		file->f_pos = buf.ctx.pos;
+	}
+	mutex_unlock(&inode->i_mutex);
+	if (ret >= 0)
+		ret = buf.result;
+	return ret;
+}
+
+static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
+	int short_only, both;
+
+	switch (cmd) {
+	case VFAT_IOCTL_READDIR_SHORT:
+		short_only = 1;
+		both = 0;
+		break;
+	case VFAT_IOCTL_READDIR_BOTH:
+		short_only = 0;
+		both = 1;
+		break;
+	default:
+		return fat_generic_ioctl(filp, cmd, arg);
+	}
+
+	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
+		return -EFAULT;
+	/*
+	 * Yes, we don't need this put_user() absolutely. However old
+	 * code didn't return the right value. So, app use this value,
+	 * in order to check whether it is EOF.
+	 */
+	if (put_user(0, &d1->d_reclen))
+		return -EFAULT;
+
+	return fat_ioctl_readdir(inode, filp, d1, fat_ioctl_filldir,
+				 short_only, both);
+}
+
+#ifdef CONFIG_COMPAT
+#define	VFAT_IOCTL_READDIR_BOTH32	_IOR('r', 1, struct compat_dirent[2])
+#define	VFAT_IOCTL_READDIR_SHORT32	_IOR('r', 2, struct compat_dirent[2])
+
+FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent)
+
+static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
+				 unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct compat_dirent __user *d1 = compat_ptr(arg);
+	int short_only, both;
+
+	switch (cmd) {
+	case VFAT_IOCTL_READDIR_SHORT32:
+		short_only = 1;
+		both = 0;
+		break;
+	case VFAT_IOCTL_READDIR_BOTH32:
+		short_only = 0;
+		both = 1;
+		break;
+	default:
+		return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
+	}
+
+	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
+		return -EFAULT;
+	/*
+	 * Yes, we don't need this put_user() absolutely. However old
+	 * code didn't return the right value. So, app use this value,
+	 * in order to check whether it is EOF.
+	 */
+	if (put_user(0, &d1->d_reclen))
+		return -EFAULT;
+
+	return fat_ioctl_readdir(inode, filp, d1, fat_compat_ioctl_filldir,
+				 short_only, both);
+}
+#endif /* CONFIG_COMPAT */
+
+const struct file_operations fat_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= fat_readdir,
+	.unlocked_ioctl	= fat_dir_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= fat_compat_dir_ioctl,
+#endif
+	.fsync		= fat_file_fsync,
+};
+
+static int fat_get_short_entry(struct inode *dir, loff_t *pos,
+			       struct buffer_head **bh,
+			       struct msdos_dir_entry **de)
+{
+	while (fat_get_entry(dir, pos, bh, de) >= 0) {
+		/* free entry or long name entry or volume label */
+		if (!IS_FREE((*de)->name) && !((*de)->attr & ATTR_VOLUME))
+			return 0;
+	}
+	return -ENOENT;
+}
+
+/*
+ * The ".." entry can not provide the "struct fat_slot_info" information
+ * for inode, nor a usable i_pos. So, this function provides some information
+ * only.
+ *
+ * Since this function walks through the on-disk inodes within a directory,
+ * callers are responsible for taking any locks necessary to prevent the
+ * directory from changing.
+ */
+int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
+			 struct msdos_dir_entry **de)
+{
+	loff_t offset = 0;
+
+	*de = NULL;
+	while (fat_get_short_entry(dir, &offset, bh, de) >= 0) {
+		if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME))
+			return 0;
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(fat_get_dotdot_entry);
+
+/* See if directory is empty */
+int fat_dir_empty(struct inode *dir)
+{
+	struct buffer_head *bh;
+	struct msdos_dir_entry *de;
+	loff_t cpos;
+	int result = 0;
+
+	bh = NULL;
+	cpos = 0;
+	while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) {
+		if (strncmp(de->name, MSDOS_DOT   , MSDOS_NAME) &&
+		    strncmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
+			result = -ENOTEMPTY;
+			break;
+		}
+	}
+	brelse(bh);
+	return result;
+}
+EXPORT_SYMBOL_GPL(fat_dir_empty);
+
+/*
+ * fat_subdirs counts the number of sub-directories of dir. It can be run
+ * on directories being created.
+ */
+int fat_subdirs(struct inode *dir)
+{
+	struct buffer_head *bh;
+	struct msdos_dir_entry *de;
+	loff_t cpos;
+	int count = 0;
+
+	bh = NULL;
+	cpos = 0;
+	while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) {
+		if (de->attr & ATTR_DIR)
+			count++;
+	}
+	brelse(bh);
+	return count;
+}
+
+/*
+ * Scans a directory for a given file (name points to its formatted name).
+ * Returns an error code or zero.
+ */
+int fat_scan(struct inode *dir, const unsigned char *name,
+	     struct fat_slot_info *sinfo)
+{
+	struct super_block *sb = dir->i_sb;
+
+	sinfo->slot_off = 0;
+	sinfo->bh = NULL;
+	while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh,
+				   &sinfo->de) >= 0) {
+		if (!strncmp(sinfo->de->name, name, MSDOS_NAME)) {
+			sinfo->slot_off -= sizeof(*sinfo->de);
+			sinfo->nr_slots = 1;
+			sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(fat_scan);
+
+/*
+ * Scans a directory for a given logstart.
+ * Returns an error code or zero.
+ */
+int fat_scan_logstart(struct inode *dir, int i_logstart,
+		      struct fat_slot_info *sinfo)
+{
+	struct super_block *sb = dir->i_sb;
+
+	sinfo->slot_off = 0;
+	sinfo->bh = NULL;
+	while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh,
+				   &sinfo->de) >= 0) {
+		if (fat_get_start(MSDOS_SB(sb), sinfo->de) == i_logstart) {
+			sinfo->slot_off -= sizeof(*sinfo->de);
+			sinfo->nr_slots = 1;
+			sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
+{
+	struct super_block *sb = dir->i_sb;
+	struct buffer_head *bh;
+	struct msdos_dir_entry *de, *endp;
+	int err = 0, orig_slots;
+
+	while (nr_slots) {
+		bh = NULL;
+		if (fat_get_entry(dir, &pos, &bh, &de) < 0) {
+			err = -EIO;
+			break;
+		}
+
+		orig_slots = nr_slots;
+		endp = (struct msdos_dir_entry *)(bh->b_data + sb->s_blocksize);
+		while (nr_slots && de < endp) {
+			de->name[0] = DELETED_FLAG;
+			de++;
+			nr_slots--;
+		}
+		mark_buffer_dirty_inode(bh, dir);
+		if (IS_DIRSYNC(dir))
+			err = sync_dirty_buffer(bh);
+		brelse(bh);
+		if (err)
+			break;
+
+		/* pos is *next* de's position, so this does `- sizeof(de)' */
+		pos += ((orig_slots - nr_slots) * sizeof(*de)) - sizeof(*de);
+	}
+
+	return err;
+}
+
+int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
+{
+	struct super_block *sb = dir->i_sb;
+	struct msdos_dir_entry *de;
+	struct buffer_head *bh;
+	int err = 0, nr_slots;
+
+	/*
+	 * First stage: Remove the shortname. By this, the directory
+	 * entry is removed.
+	 */
+	nr_slots = sinfo->nr_slots;
+	de = sinfo->de;
+	sinfo->de = NULL;
+	bh = sinfo->bh;
+	sinfo->bh = NULL;
+	while (nr_slots && de >= (struct msdos_dir_entry *)bh->b_data) {
+		de->name[0] = DELETED_FLAG;
+		de--;
+		nr_slots--;
+	}
+	mark_buffer_dirty_inode(bh, dir);
+	if (IS_DIRSYNC(dir))
+		err = sync_dirty_buffer(bh);
+	brelse(bh);
+	if (err)
+		return err;
+	dir->i_version++;
+
+	if (nr_slots) {
+		/*
+		 * Second stage: remove the remaining longname slots.
+		 * (This directory entry is already removed, and so return
+		 * the success)
+		 */
+		err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots);
+		if (err) {
+			fat_msg(sb, KERN_WARNING,
+			       "Couldn't remove the long name slots");
+		}
+	}
+
+	dir->i_mtime = dir->i_atime = CURRENT_TIME_SEC;
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fat_remove_entries);
+
+static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
+			      struct buffer_head **bhs, int nr_bhs)
+{
+	struct super_block *sb = dir->i_sb;
+	sector_t last_blknr = blknr + MSDOS_SB(sb)->sec_per_clus;
+	int err, i, n;
+
+	/* Zeroing the unused blocks on this cluster */
+	blknr += nr_used;
+	n = nr_used;
+	while (blknr < last_blknr) {
+		bhs[n] = sb_getblk(sb, blknr);
+		if (!bhs[n]) {
+			err = -ENOMEM;
+			goto error;
+		}
+		memset(bhs[n]->b_data, 0, sb->s_blocksize);
+		set_buffer_uptodate(bhs[n]);
+		mark_buffer_dirty_inode(bhs[n], dir);
+
+		n++;
+		blknr++;
+		if (n == nr_bhs) {
+			if (IS_DIRSYNC(dir)) {
+				err = fat_sync_bhs(bhs, n);
+				if (err)
+					goto error;
+			}
+			for (i = 0; i < n; i++)
+				brelse(bhs[i]);
+			n = 0;
+		}
+	}
+	if (IS_DIRSYNC(dir)) {
+		err = fat_sync_bhs(bhs, n);
+		if (err)
+			goto error;
+	}
+	for (i = 0; i < n; i++)
+		brelse(bhs[i]);
+
+	return 0;
+
+error:
+	for (i = 0; i < n; i++)
+		bforget(bhs[i]);
+	return err;
+}
+
+int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
+{
+	struct super_block *sb = dir->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
+	struct msdos_dir_entry *de;
+	sector_t blknr;
+	__le16 date, time;
+	u8 time_cs;
+	int err, cluster;
+
+	err = fat_alloc_clusters(dir, &cluster, 1);
+	if (err)
+		goto error;
+
+	blknr = fat_clus_to_blknr(sbi, cluster);
+	bhs[0] = sb_getblk(sb, blknr);
+	if (!bhs[0]) {
+		err = -ENOMEM;
+		goto error_free;
+	}
+
+	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
+
+	de = (struct msdos_dir_entry *)bhs[0]->b_data;
+	/* filling the new directory slots ("." and ".." entries) */
+	memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
+	memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
+	de->attr = de[1].attr = ATTR_DIR;
+	de[0].lcase = de[1].lcase = 0;
+	de[0].time = de[1].time = time;
+	de[0].date = de[1].date = date;
+	if (sbi->options.isvfat) {
+		/* extra timestamps */
+		de[0].ctime = de[1].ctime = time;
+		de[0].ctime_cs = de[1].ctime_cs = time_cs;
+		de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date;
+	} else {
+		de[0].ctime = de[1].ctime = 0;
+		de[0].ctime_cs = de[1].ctime_cs = 0;
+		de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
+	}
+	fat_set_start(&de[0], cluster);
+	fat_set_start(&de[1], MSDOS_I(dir)->i_logstart);
+	de[0].size = de[1].size = 0;
+	memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
+	set_buffer_uptodate(bhs[0]);
+	mark_buffer_dirty_inode(bhs[0], dir);
+
+	err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
+	if (err)
+		goto error_free;
+
+	return cluster;
+
+error_free:
+	fat_free_clusters(dir, cluster);
+error:
+	return err;
+}
+EXPORT_SYMBOL_GPL(fat_alloc_new_dir);
+
+static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
+			       int *nr_cluster, struct msdos_dir_entry **de,
+			       struct buffer_head **bh, loff_t *i_pos)
+{
+	struct super_block *sb = dir->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
+	sector_t blknr, start_blknr, last_blknr;
+	unsigned long size, copy;
+	int err, i, n, offset, cluster[2];
+
+	/*
+	 * The minimum cluster size is 512bytes, and maximum entry
+	 * size is 32*slots (672bytes).  So, iff the cluster size is
+	 * 512bytes, we may need two clusters.
+	 */
+	size = nr_slots * sizeof(struct msdos_dir_entry);
+	*nr_cluster = (size + (sbi->cluster_size - 1)) >> sbi->cluster_bits;
+	BUG_ON(*nr_cluster > 2);
+
+	err = fat_alloc_clusters(dir, cluster, *nr_cluster);
+	if (err)
+		goto error;
+
+	/*
+	 * First stage: Fill the directory entry.  NOTE: This cluster
+	 * is not referenced from any inode yet, so updates order is
+	 * not important.
+	 */
+	i = n = copy = 0;
+	do {
+		start_blknr = blknr = fat_clus_to_blknr(sbi, cluster[i]);
+		last_blknr = start_blknr + sbi->sec_per_clus;
+		while (blknr < last_blknr) {
+			bhs[n] = sb_getblk(sb, blknr);
+			if (!bhs[n]) {
+				err = -ENOMEM;
+				goto error_nomem;
+			}
+
+			/* fill the directory entry */
+			copy = min(size, sb->s_blocksize);
+			memcpy(bhs[n]->b_data, slots, copy);
+			slots += copy;
+			size -= copy;
+			set_buffer_uptodate(bhs[n]);
+			mark_buffer_dirty_inode(bhs[n], dir);
+			if (!size)
+				break;
+			n++;
+			blknr++;
+		}
+	} while (++i < *nr_cluster);
+
+	memset(bhs[n]->b_data + copy, 0, sb->s_blocksize - copy);
+	offset = copy - sizeof(struct msdos_dir_entry);
+	get_bh(bhs[n]);
+	*bh = bhs[n];
+	*de = (struct msdos_dir_entry *)((*bh)->b_data + offset);
+	*i_pos = fat_make_i_pos(sb, *bh, *de);
+
+	/* Second stage: clear the rest of cluster, and write outs */
+	err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE);
+	if (err)
+		goto error_free;
+
+	return cluster[0];
+
+error_free:
+	brelse(*bh);
+	*bh = NULL;
+	n = 0;
+error_nomem:
+	for (i = 0; i < n; i++)
+		bforget(bhs[i]);
+	fat_free_clusters(dir, cluster[0]);
+error:
+	return err;
+}
+
+int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
+		    struct fat_slot_info *sinfo)
+{
+	struct super_block *sb = dir->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
+	struct msdos_dir_entry *uninitialized_var(de);
+	int err, free_slots, i, nr_bhs;
+	loff_t pos, i_pos;
+
+	sinfo->nr_slots = nr_slots;
+
+	/* First stage: search free directory entries */
+	free_slots = nr_bhs = 0;
+	bh = prev = NULL;
+	pos = 0;
+	err = -ENOSPC;
+	while (fat_get_entry(dir, &pos, &bh, &de) > -1) {
+		/* check the maximum size of directory */
+		if (pos >= FAT_MAX_DIR_SIZE)
+			goto error;
+
+		if (IS_FREE(de->name)) {
+			if (prev != bh) {
+				get_bh(bh);
+				bhs[nr_bhs] = prev = bh;
+				nr_bhs++;
+			}
+			free_slots++;
+			if (free_slots == nr_slots)
+				goto found;
+		} else {
+			for (i = 0; i < nr_bhs; i++)
+				brelse(bhs[i]);
+			prev = NULL;
+			free_slots = nr_bhs = 0;
+		}
+	}
+	if (dir->i_ino == MSDOS_ROOT_INO) {
+		if (sbi->fat_bits != 32)
+			goto error;
+	} else if (MSDOS_I(dir)->i_start == 0) {
+		fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
+		       MSDOS_I(dir)->i_pos);
+		err = -EIO;
+		goto error;
+	}
+
+found:
+	err = 0;
+	pos -= free_slots * sizeof(*de);
+	nr_slots -= free_slots;
+	if (free_slots) {
+		/*
+		 * Second stage: filling the free entries with new entries.
+		 * NOTE: If this slots has shortname, first, we write
+		 * the long name slots, then write the short name.
+		 */
+		int size = free_slots * sizeof(*de);
+		int offset = pos & (sb->s_blocksize - 1);
+		int long_bhs = nr_bhs - (nr_slots == 0);
+
+		/* Fill the long name slots. */
+		for (i = 0; i < long_bhs; i++) {
+			int copy = min_t(int, sb->s_blocksize - offset, size);
+			memcpy(bhs[i]->b_data + offset, slots, copy);
+			mark_buffer_dirty_inode(bhs[i], dir);
+			offset = 0;
+			slots += copy;
+			size -= copy;
+		}
+		if (long_bhs && IS_DIRSYNC(dir))
+			err = fat_sync_bhs(bhs, long_bhs);
+		if (!err && i < nr_bhs) {
+			/* Fill the short name slot. */
+			int copy = min_t(int, sb->s_blocksize - offset, size);
+			memcpy(bhs[i]->b_data + offset, slots, copy);
+			mark_buffer_dirty_inode(bhs[i], dir);
+			if (IS_DIRSYNC(dir))
+				err = sync_dirty_buffer(bhs[i]);
+		}
+		for (i = 0; i < nr_bhs; i++)
+			brelse(bhs[i]);
+		if (err)
+			goto error_remove;
+	}
+
+	if (nr_slots) {
+		int cluster, nr_cluster;
+
+		/*
+		 * Third stage: allocate the cluster for new entries.
+		 * And initialize the cluster with new entries, then
+		 * add the cluster to dir.
+		 */
+		cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster,
+					      &de, &bh, &i_pos);
+		if (cluster < 0) {
+			err = cluster;
+			goto error_remove;
+		}
+		err = fat_chain_add(dir, cluster, nr_cluster);
+		if (err) {
+			fat_free_clusters(dir, cluster);
+			goto error_remove;
+		}
+		if (dir->i_size & (sbi->cluster_size - 1)) {
+			fat_fs_error(sb, "Odd directory size");
+			dir->i_size = (dir->i_size + sbi->cluster_size - 1)
+				& ~((loff_t)sbi->cluster_size - 1);
+		}
+		dir->i_size += nr_cluster << sbi->cluster_bits;
+		MSDOS_I(dir)->mmu_private += nr_cluster << sbi->cluster_bits;
+	}
+	sinfo->slot_off = pos;
+	sinfo->de = de;
+	sinfo->bh = bh;
+	sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
+
+	return 0;
+
+error:
+	brelse(bh);
+	for (i = 0; i < nr_bhs; i++)
+		brelse(bhs[i]);
+	return err;
+
+error_remove:
+	brelse(bh);
+	if (free_slots)
+		__fat_remove_entries(dir, pos, free_slots);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fat_add_entries);
diff --git a/kernel/fs/fat/fat.h b/kernel/fs/fat/fat.h
new file mode 100644
index 000000000..be5e15323
--- /dev/null
+++ b/kernel/fs/fat/fat.h
@@ -0,0 +1,420 @@
+#ifndef _FAT_H
+#define _FAT_H
+
+#include <linux/buffer_head.h>
+#include <linux/nls.h>
+#include <linux/hash.h>
+#include <linux/ratelimit.h>
+#include <linux/msdos_fs.h>
+
+/*
+ * vfat shortname flags
+ */
+#define VFAT_SFN_DISPLAY_LOWER	0x0001 /* convert to lowercase for display */
+#define VFAT_SFN_DISPLAY_WIN95	0x0002 /* emulate win95 rule for display */
+#define VFAT_SFN_DISPLAY_WINNT	0x0004 /* emulate winnt rule for display */
+#define VFAT_SFN_CREATE_WIN95	0x0100 /* emulate win95 rule for create */
+#define VFAT_SFN_CREATE_WINNT	0x0200 /* emulate winnt rule for create */
+
+#define FAT_ERRORS_CONT		1      /* ignore error and continue */
+#define FAT_ERRORS_PANIC	2      /* panic on error */
+#define FAT_ERRORS_RO		3      /* remount r/o on error */
+
+#define FAT_NFS_STALE_RW	1      /* NFS RW support, can cause ESTALE */
+#define FAT_NFS_NOSTALE_RO	2      /* NFS RO support, no ESTALE issue */
+
+struct fat_mount_options {
+	kuid_t fs_uid;
+	kgid_t fs_gid;
+	unsigned short fs_fmask;
+	unsigned short fs_dmask;
+	unsigned short codepage;   /* Codepage for shortname conversions */
+	int time_offset;	   /* Offset of timestamps from UTC (in minutes) */
+	char *iocharset;           /* Charset used for filename input/display */
+	unsigned short shortname;  /* flags for shortname display/create rule */
+	unsigned char name_check;  /* r = relaxed, n = normal, s = strict */
+	unsigned char errors;	   /* On error: continue, panic, remount-ro */
+	unsigned char nfs;	  /* NFS support: nostale_ro, stale_rw */
+	unsigned short allow_utime;/* permission for setting the [am]time */
+	unsigned quiet:1,          /* set = fake successful chmods and chowns */
+		 showexec:1,       /* set = only set x bit for com/exe/bat */
+		 sys_immutable:1,  /* set = system files are immutable */
+		 dotsOK:1,         /* set = hidden and system files are named '.filename' */
+		 isvfat:1,         /* 0=no vfat long filename support, 1=vfat support */
+		 utf8:1,	   /* Use of UTF-8 character set (Default) */
+		 unicode_xlate:1,  /* create escape sequences for unhandled Unicode */
+		 numtail:1,        /* Does first alias have a numeric '~1' type tail? */
+		 flush:1,	   /* write things quickly */
+		 nocase:1,	   /* Does this need case conversion? 0=need case conversion*/
+		 usefree:1,	   /* Use free_clusters for FAT32 */
+		 tz_set:1,	   /* Filesystem timestamps' offset set */
+		 rodir:1,	   /* allow ATTR_RO for directory */
+		 discard:1,	   /* Issue discard requests on deletions */
+		 dos1xfloppy:1;	   /* Assume default BPB for DOS 1.x floppies */
+};
+
+#define FAT_HASH_BITS	8
+#define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
+
+/*
+ * MS-DOS file system in-core superblock data
+ */
+struct msdos_sb_info {
+	unsigned short sec_per_clus;  /* sectors/cluster */
+	unsigned short cluster_bits;  /* log2(cluster_size) */
+	unsigned int cluster_size;    /* cluster size */
+	unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */
+	unsigned short fat_start;
+	unsigned long fat_length;     /* FAT start & length (sec.) */
+	unsigned long dir_start;
+	unsigned short dir_entries;   /* root dir start & entries */
+	unsigned long data_start;     /* first data sector */
+	unsigned long max_cluster;    /* maximum cluster number */
+	unsigned long root_cluster;   /* first cluster of the root directory */
+	unsigned long fsinfo_sector;  /* sector number of FAT32 fsinfo */
+	struct mutex fat_lock;
+	struct mutex nfs_build_inode_lock;
+	struct mutex s_lock;
+	unsigned int prev_free;      /* previously allocated cluster number */
+	unsigned int free_clusters;  /* -1 if undefined */
+	unsigned int free_clus_valid; /* is free_clusters valid? */
+	struct fat_mount_options options;
+	struct nls_table *nls_disk;   /* Codepage used on disk */
+	struct nls_table *nls_io;     /* Charset used for input and display */
+	const void *dir_ops;	      /* Opaque; default directory operations */
+	int dir_per_block;	      /* dir entries per block */
+	int dir_per_block_bits;	      /* log2(dir_per_block) */
+	unsigned int vol_id;		/*volume ID*/
+
+	int fatent_shift;
+	struct fatent_operations *fatent_ops;
+	struct inode *fat_inode;
+	struct inode *fsinfo_inode;
+
+	struct ratelimit_state ratelimit;
+
+	spinlock_t inode_hash_lock;
+	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
+
+	spinlock_t dir_hash_lock;
+	struct hlist_head dir_hashtable[FAT_HASH_SIZE];
+
+	unsigned int dirty;           /* fs state before mount */
+	struct rcu_head rcu;
+};
+
+#define FAT_CACHE_VALID	0	/* special case for valid cache */
+
+/*
+ * MS-DOS file system inode data in memory
+ */
+struct msdos_inode_info {
+	spinlock_t cache_lru_lock;
+	struct list_head cache_lru;
+	int nr_caches;
+	/* for avoiding the race between fat_free() and fat_get_cluster() */
+	unsigned int cache_valid_id;
+
+	/* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
+	loff_t mmu_private;	/* physically allocated size */
+
+	int i_start;		/* first cluster or 0 */
+	int i_logstart;		/* logical first cluster */
+	int i_attrs;		/* unused attribute bits */
+	loff_t i_pos;		/* on-disk position of directory entry or 0 */
+	struct hlist_node i_fat_hash;	/* hash by i_location */
+	struct hlist_node i_dir_hash;	/* hash by i_logstart */
+	struct rw_semaphore truncate_lock; /* protect bmap against truncate */
+	struct inode vfs_inode;
+};
+
+struct fat_slot_info {
+	loff_t i_pos;		/* on-disk position of directory entry */
+	loff_t slot_off;	/* offset for slot or de start */
+	int nr_slots;		/* number of slots + 1(de) in filename */
+	struct msdos_dir_entry *de;
+	struct buffer_head *bh;
+};
+
+static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
+{
+	return container_of(inode, struct msdos_inode_info, vfs_inode);
+}
+
+/*
+ * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
+ * save ATTR_RO instead of ->i_mode.
+ *
+ * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only
+ * bit, it's just used as flag for app.
+ */
+static inline int fat_mode_can_hold_ro(struct inode *inode)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	umode_t mask;
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (!sbi->options.rodir)
+			return 0;
+		mask = ~sbi->options.fs_dmask;
+	} else
+		mask = ~sbi->options.fs_fmask;
+
+	if (!(mask & S_IWUGO))
+		return 0;
+	return 1;
+}
+
+/* Convert attribute bits and a mask to the UNIX mode. */
+static inline umode_t fat_make_mode(struct msdos_sb_info *sbi,
+				   u8 attrs, umode_t mode)
+{
+	if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir))
+		mode &= ~S_IWUGO;
+
+	if (attrs & ATTR_DIR)
+		return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+	else
+		return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+
+/* Return the FAT attribute byte for this inode */
+static inline u8 fat_make_attrs(struct inode *inode)
+{
+	u8 attrs = MSDOS_I(inode)->i_attrs;
+	if (S_ISDIR(inode->i_mode))
+		attrs |= ATTR_DIR;
+	if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO))
+		attrs |= ATTR_RO;
+	return attrs;
+}
+
+static inline void fat_save_attrs(struct inode *inode, u8 attrs)
+{
+	if (fat_mode_can_hold_ro(inode))
+		MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+	else
+		MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO);
+}
+
+static inline unsigned char fat_checksum(const __u8 *name)
+{
+	unsigned char s = name[0];
+	s = (s<<7) + (s>>1) + name[1];	s = (s<<7) + (s>>1) + name[2];
+	s = (s<<7) + (s>>1) + name[3];	s = (s<<7) + (s>>1) + name[4];
+	s = (s<<7) + (s>>1) + name[5];	s = (s<<7) + (s>>1) + name[6];
+	s = (s<<7) + (s>>1) + name[7];	s = (s<<7) + (s>>1) + name[8];
+	s = (s<<7) + (s>>1) + name[9];	s = (s<<7) + (s>>1) + name[10];
+	return s;
+}
+
+static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
+{
+	return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
+		+ sbi->data_start;
+}
+
+static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi,
+				loff_t i_pos, sector_t *blknr, int *offset)
+{
+	*blknr = i_pos >> sbi->dir_per_block_bits;
+	*offset = i_pos & (sbi->dir_per_block - 1);
+}
+
+static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
+					struct inode *inode)
+{
+	loff_t i_pos;
+#if BITS_PER_LONG == 32
+	spin_lock(&sbi->inode_hash_lock);
+#endif
+	i_pos = MSDOS_I(inode)->i_pos;
+#if BITS_PER_LONG == 32
+	spin_unlock(&sbi->inode_hash_lock);
+#endif
+	return i_pos;
+}
+
+static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		*dst++ = src[0] | (src[1] << 8);
+		src += 2;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+static inline int fat_get_start(const struct msdos_sb_info *sbi,
+				const struct msdos_dir_entry *de)
+{
+	int cluster = le16_to_cpu(de->start);
+	if (sbi->fat_bits == 32)
+		cluster |= (le16_to_cpu(de->starthi) << 16);
+	return cluster;
+}
+
+static inline void fat_set_start(struct msdos_dir_entry *de, int cluster)
+{
+	de->start   = cpu_to_le16(cluster);
+	de->starthi = cpu_to_le16(cluster >> 16);
+}
+
+static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		dst[0] = *src & 0x00FF;
+		dst[1] = (*src & 0xFF00) >> 8;
+		dst += 2;
+		src++;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+/* fat/cache.c */
+extern void fat_cache_inval_inode(struct inode *inode);
+extern int fat_get_cluster(struct inode *inode, int cluster,
+			   int *fclus, int *dclus);
+extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+		    unsigned long *mapped_blocks, int create);
+
+/* fat/dir.c */
+extern const struct file_operations fat_dir_operations;
+extern int fat_search_long(struct inode *inode, const unsigned char *name,
+			   int name_len, struct fat_slot_info *sinfo);
+extern int fat_dir_empty(struct inode *dir);
+extern int fat_subdirs(struct inode *dir);
+extern int fat_scan(struct inode *dir, const unsigned char *name,
+		    struct fat_slot_info *sinfo);
+extern int fat_scan_logstart(struct inode *dir, int i_logstart,
+			     struct fat_slot_info *sinfo);
+extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
+				struct msdos_dir_entry **de);
+extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
+extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
+			   struct fat_slot_info *sinfo);
+extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
+
+/* fat/fatent.c */
+struct fat_entry {
+	int entry;
+	union {
+		u8 *ent12_p[2];
+		__le16 *ent16_p;
+		__le32 *ent32_p;
+	} u;
+	int nr_bhs;
+	struct buffer_head *bhs[2];
+	struct inode *fat_inode;
+};
+
+static inline void fatent_init(struct fat_entry *fatent)
+{
+	fatent->nr_bhs = 0;
+	fatent->entry = 0;
+	fatent->u.ent32_p = NULL;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+	fatent->fat_inode = NULL;
+}
+
+static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
+{
+	fatent->entry = entry;
+	fatent->u.ent32_p = NULL;
+}
+
+static inline void fatent_brelse(struct fat_entry *fatent)
+{
+	int i;
+	fatent->u.ent32_p = NULL;
+	for (i = 0; i < fatent->nr_bhs; i++)
+		brelse(fatent->bhs[i]);
+	fatent->nr_bhs = 0;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+	fatent->fat_inode = NULL;
+}
+
+extern void fat_ent_access_init(struct super_block *sb);
+extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
+			int entry);
+extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
+			 int new, int wait);
+extern int fat_alloc_clusters(struct inode *inode, int *cluster,
+			      int nr_cluster);
+extern int fat_free_clusters(struct inode *inode, int cluster);
+extern int fat_count_free_clusters(struct super_block *sb);
+
+/* fat/file.c */
+extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
+			      unsigned long arg);
+extern const struct file_operations fat_file_operations;
+extern const struct inode_operations fat_file_inode_operations;
+extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
+extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
+extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat);
+extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync);
+
+/* fat/inode.c */
+extern int fat_block_truncate_page(struct inode *inode, loff_t from);
+extern void fat_attach(struct inode *inode, loff_t i_pos);
+extern void fat_detach(struct inode *inode);
+extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
+extern struct inode *fat_build_inode(struct super_block *sb,
+			struct msdos_dir_entry *de, loff_t i_pos);
+extern int fat_sync_inode(struct inode *inode);
+extern int fat_fill_super(struct super_block *sb, void *data, int silent,
+			  int isvfat, void (*setup)(struct super_block *));
+extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de);
+
+extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
+			    struct inode *i2);
+static inline unsigned long fat_dir_hash(int logstart)
+{
+	return hash_32(logstart, FAT_HASH_BITS);
+}
+
+/* fat/misc.c */
+extern __printf(3, 4) __cold
+void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
+#define fat_fs_error(sb, fmt, args...)		\
+	__fat_fs_error(sb, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(sb, fmt, args...) \
+	__fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
+__printf(3, 4) __cold
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+#define fat_msg_ratelimit(sb, level, fmt, args...)	\
+	do {	\
+			if (__ratelimit(&MSDOS_SB(sb)->ratelimit))	\
+				fat_msg(sb, level, fmt, ## args);	\
+	 } while (0)
+extern int fat_clusters_flush(struct super_block *sb);
+extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
+extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+			      __le16 __time, __le16 __date, u8 time_cs);
+extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+			      __le16 *time, __le16 *date, u8 *time_cs);
+extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
+
+int fat_cache_init(void);
+void fat_cache_destroy(void);
+
+/* fat/nfs.c */
+extern const struct export_operations fat_export_ops;
+extern const struct export_operations fat_export_ops_nostale;
+
+/* helper for printk */
+typedef unsigned long long	llu;
+
+#endif /* !_FAT_H */
diff --git a/kernel/fs/fat/fatent.c b/kernel/fs/fat/fatent.c
new file mode 100644
index 000000000..822655713
--- /dev/null
+++ b/kernel/fs/fat/fatent.c
@@ -0,0 +1,692 @@
+/*
+ * Copyright (C) 2004, OGAWA Hirofumi
+ * Released under GPL v2.
+ */
+
+#include <linux/blkdev.h>
+#include "fat.h"
+
+struct fatent_operations {
+	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
+	void (*ent_set_ptr)(struct fat_entry *, int);
+	int (*ent_bread)(struct super_block *, struct fat_entry *,
+			 int, sector_t);
+	int (*ent_get)(struct fat_entry *);
+	void (*ent_put)(struct fat_entry *, int);
+	int (*ent_next)(struct fat_entry *);
+};
+
+static DEFINE_SPINLOCK(fat12_entry_lock);
+
+static void fat12_ent_blocknr(struct super_block *sb, int entry,
+			      int *offset, sector_t *blocknr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int bytes = entry + (entry >> 1);
+	WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry);
+	*offset = bytes & (sb->s_blocksize - 1);
+	*blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits);
+}
+
+static void fat_ent_blocknr(struct super_block *sb, int entry,
+			    int *offset, sector_t *blocknr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int bytes = (entry << sbi->fatent_shift);
+	WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry);
+	*offset = bytes & (sb->s_blocksize - 1);
+	*blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits);
+}
+
+static void fat12_ent_set_ptr(struct fat_entry *fatent, int offset)
+{
+	struct buffer_head **bhs = fatent->bhs;
+	if (fatent->nr_bhs == 1) {
+		WARN_ON(offset >= (bhs[0]->b_size - 1));
+		fatent->u.ent12_p[0] = bhs[0]->b_data + offset;
+		fatent->u.ent12_p[1] = bhs[0]->b_data + (offset + 1);
+	} else {
+		WARN_ON(offset != (bhs[0]->b_size - 1));
+		fatent->u.ent12_p[0] = bhs[0]->b_data + offset;
+		fatent->u.ent12_p[1] = bhs[1]->b_data;
+	}
+}
+
+static void fat16_ent_set_ptr(struct fat_entry *fatent, int offset)
+{
+	WARN_ON(offset & (2 - 1));
+	fatent->u.ent16_p = (__le16 *)(fatent->bhs[0]->b_data + offset);
+}
+
+static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset)
+{
+	WARN_ON(offset & (4 - 1));
+	fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset);
+}
+
+static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
+			   int offset, sector_t blocknr)
+{
+	struct buffer_head **bhs = fatent->bhs;
+
+	WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
+	fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
+
+	bhs[0] = sb_bread(sb, blocknr);
+	if (!bhs[0])
+		goto err;
+
+	if ((offset + 1) < sb->s_blocksize)
+		fatent->nr_bhs = 1;
+	else {
+		/* This entry is block boundary, it needs the next block */
+		blocknr++;
+		bhs[1] = sb_bread(sb, blocknr);
+		if (!bhs[1])
+			goto err_brelse;
+		fatent->nr_bhs = 2;
+	}
+	fat12_ent_set_ptr(fatent, offset);
+	return 0;
+
+err_brelse:
+	brelse(bhs[0]);
+err:
+	fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
+	return -EIO;
+}
+
+static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
+			 int offset, sector_t blocknr)
+{
+	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+
+	WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
+	fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
+	fatent->bhs[0] = sb_bread(sb, blocknr);
+	if (!fatent->bhs[0]) {
+		fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
+		       (llu)blocknr);
+		return -EIO;
+	}
+	fatent->nr_bhs = 1;
+	ops->ent_set_ptr(fatent, offset);
+	return 0;
+}
+
+static int fat12_ent_get(struct fat_entry *fatent)
+{
+	u8 **ent12_p = fatent->u.ent12_p;
+	int next;
+
+	spin_lock(&fat12_entry_lock);
+	if (fatent->entry & 1)
+		next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4);
+	else
+		next = (*ent12_p[1] << 8) | *ent12_p[0];
+	spin_unlock(&fat12_entry_lock);
+
+	next &= 0x0fff;
+	if (next >= BAD_FAT12)
+		next = FAT_ENT_EOF;
+	return next;
+}
+
+static int fat16_ent_get(struct fat_entry *fatent)
+{
+	int next = le16_to_cpu(*fatent->u.ent16_p);
+	WARN_ON((unsigned long)fatent->u.ent16_p & (2 - 1));
+	if (next >= BAD_FAT16)
+		next = FAT_ENT_EOF;
+	return next;
+}
+
+static int fat32_ent_get(struct fat_entry *fatent)
+{
+	int next = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff;
+	WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1));
+	if (next >= BAD_FAT32)
+		next = FAT_ENT_EOF;
+	return next;
+}
+
+static void fat12_ent_put(struct fat_entry *fatent, int new)
+{
+	u8 **ent12_p = fatent->u.ent12_p;
+
+	if (new == FAT_ENT_EOF)
+		new = EOF_FAT12;
+
+	spin_lock(&fat12_entry_lock);
+	if (fatent->entry & 1) {
+		*ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f);
+		*ent12_p[1] = new >> 4;
+	} else {
+		*ent12_p[0] = new & 0xff;
+		*ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8);
+	}
+	spin_unlock(&fat12_entry_lock);
+
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
+	if (fatent->nr_bhs == 2)
+		mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode);
+}
+
+static void fat16_ent_put(struct fat_entry *fatent, int new)
+{
+	if (new == FAT_ENT_EOF)
+		new = EOF_FAT16;
+
+	*fatent->u.ent16_p = cpu_to_le16(new);
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
+}
+
+static void fat32_ent_put(struct fat_entry *fatent, int new)
+{
+	WARN_ON(new & 0xf0000000);
+	new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
+	*fatent->u.ent32_p = cpu_to_le32(new);
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
+}
+
+static int fat12_ent_next(struct fat_entry *fatent)
+{
+	u8 **ent12_p = fatent->u.ent12_p;
+	struct buffer_head **bhs = fatent->bhs;
+	u8 *nextp = ent12_p[1] + 1 + (fatent->entry & 1);
+
+	fatent->entry++;
+	if (fatent->nr_bhs == 1) {
+		WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data +
+							(bhs[0]->b_size - 2)));
+		WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data +
+							(bhs[0]->b_size - 1)));
+		if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) {
+			ent12_p[0] = nextp - 1;
+			ent12_p[1] = nextp;
+			return 1;
+		}
+	} else {
+		WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data +
+							(bhs[0]->b_size - 1)));
+		WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data);
+		ent12_p[0] = nextp - 1;
+		ent12_p[1] = nextp;
+		brelse(bhs[0]);
+		bhs[0] = bhs[1];
+		fatent->nr_bhs = 1;
+		return 1;
+	}
+	ent12_p[0] = NULL;
+	ent12_p[1] = NULL;
+	return 0;
+}
+
+static int fat16_ent_next(struct fat_entry *fatent)
+{
+	const struct buffer_head *bh = fatent->bhs[0];
+	fatent->entry++;
+	if (fatent->u.ent16_p < (__le16 *)(bh->b_data + (bh->b_size - 2))) {
+		fatent->u.ent16_p++;
+		return 1;
+	}
+	fatent->u.ent16_p = NULL;
+	return 0;
+}
+
+static int fat32_ent_next(struct fat_entry *fatent)
+{
+	const struct buffer_head *bh = fatent->bhs[0];
+	fatent->entry++;
+	if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) {
+		fatent->u.ent32_p++;
+		return 1;
+	}
+	fatent->u.ent32_p = NULL;
+	return 0;
+}
+
+static struct fatent_operations fat12_ops = {
+	.ent_blocknr	= fat12_ent_blocknr,
+	.ent_set_ptr	= fat12_ent_set_ptr,
+	.ent_bread	= fat12_ent_bread,
+	.ent_get	= fat12_ent_get,
+	.ent_put	= fat12_ent_put,
+	.ent_next	= fat12_ent_next,
+};
+
+static struct fatent_operations fat16_ops = {
+	.ent_blocknr	= fat_ent_blocknr,
+	.ent_set_ptr	= fat16_ent_set_ptr,
+	.ent_bread	= fat_ent_bread,
+	.ent_get	= fat16_ent_get,
+	.ent_put	= fat16_ent_put,
+	.ent_next	= fat16_ent_next,
+};
+
+static struct fatent_operations fat32_ops = {
+	.ent_blocknr	= fat_ent_blocknr,
+	.ent_set_ptr	= fat32_ent_set_ptr,
+	.ent_bread	= fat_ent_bread,
+	.ent_get	= fat32_ent_get,
+	.ent_put	= fat32_ent_put,
+	.ent_next	= fat32_ent_next,
+};
+
+static inline void lock_fat(struct msdos_sb_info *sbi)
+{
+	mutex_lock(&sbi->fat_lock);
+}
+
+static inline void unlock_fat(struct msdos_sb_info *sbi)
+{
+	mutex_unlock(&sbi->fat_lock);
+}
+
+void fat_ent_access_init(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	mutex_init(&sbi->fat_lock);
+
+	switch (sbi->fat_bits) {
+	case 32:
+		sbi->fatent_shift = 2;
+		sbi->fatent_ops = &fat32_ops;
+		break;
+	case 16:
+		sbi->fatent_shift = 1;
+		sbi->fatent_ops = &fat16_ops;
+		break;
+	case 12:
+		sbi->fatent_shift = -1;
+		sbi->fatent_ops = &fat12_ops;
+		break;
+	}
+}
+
+static void mark_fsinfo_dirty(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	if (sb->s_flags & MS_RDONLY || sbi->fat_bits != 32)
+		return;
+
+	__mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC);
+}
+
+static inline int fat_ent_update_ptr(struct super_block *sb,
+				     struct fat_entry *fatent,
+				     int offset, sector_t blocknr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct fatent_operations *ops = sbi->fatent_ops;
+	struct buffer_head **bhs = fatent->bhs;
+
+	/* Is this fatent's blocks including this entry? */
+	if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr)
+		return 0;
+	if (sbi->fat_bits == 12) {
+		if ((offset + 1) < sb->s_blocksize) {
+			/* This entry is on bhs[0]. */
+			if (fatent->nr_bhs == 2) {
+				brelse(bhs[1]);
+				fatent->nr_bhs = 1;
+			}
+		} else {
+			/* This entry needs the next block. */
+			if (fatent->nr_bhs != 2)
+				return 0;
+			if (bhs[1]->b_blocknr != (blocknr + 1))
+				return 0;
+		}
+	}
+	ops->ent_set_ptr(fatent, offset);
+	return 1;
+}
+
+int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	struct fatent_operations *ops = sbi->fatent_ops;
+	int err, offset;
+	sector_t blocknr;
+
+	if (entry < FAT_START_ENT || sbi->max_cluster <= entry) {
+		fatent_brelse(fatent);
+		fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
+		return -EIO;
+	}
+
+	fatent_set_entry(fatent, entry);
+	ops->ent_blocknr(sb, entry, &offset, &blocknr);
+
+	if (!fat_ent_update_ptr(sb, fatent, offset, blocknr)) {
+		fatent_brelse(fatent);
+		err = ops->ent_bread(sb, fatent, offset, blocknr);
+		if (err)
+			return err;
+	}
+	return ops->ent_get(fatent);
+}
+
+/* FIXME: We can write the blocks as more big chunk. */
+static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
+			  int nr_bhs)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *c_bh;
+	int err, n, copy;
+
+	err = 0;
+	for (copy = 1; copy < sbi->fats; copy++) {
+		sector_t backup_fat = sbi->fat_length * copy;
+
+		for (n = 0; n < nr_bhs; n++) {
+			c_bh = sb_getblk(sb, backup_fat + bhs[n]->b_blocknr);
+			if (!c_bh) {
+				err = -ENOMEM;
+				goto error;
+			}
+			memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
+			set_buffer_uptodate(c_bh);
+			mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
+			if (sb->s_flags & MS_SYNCHRONOUS)
+				err = sync_dirty_buffer(c_bh);
+			brelse(c_bh);
+			if (err)
+				goto error;
+		}
+	}
+error:
+	return err;
+}
+
+int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
+		  int new, int wait)
+{
+	struct super_block *sb = inode->i_sb;
+	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+	int err;
+
+	ops->ent_put(fatent, new);
+	if (wait) {
+		err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs);
+		if (err)
+			return err;
+	}
+	return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs);
+}
+
+static inline int fat_ent_next(struct msdos_sb_info *sbi,
+			       struct fat_entry *fatent)
+{
+	if (sbi->fatent_ops->ent_next(fatent)) {
+		if (fatent->entry < sbi->max_cluster)
+			return 1;
+	}
+	return 0;
+}
+
+static inline int fat_ent_read_block(struct super_block *sb,
+				     struct fat_entry *fatent)
+{
+	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+	sector_t blocknr;
+	int offset;
+
+	fatent_brelse(fatent);
+	ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
+	return ops->ent_bread(sb, fatent, offset, blocknr);
+}
+
+static void fat_collect_bhs(struct buffer_head **bhs, int *nr_bhs,
+			    struct fat_entry *fatent)
+{
+	int n, i;
+
+	for (n = 0; n < fatent->nr_bhs; n++) {
+		for (i = 0; i < *nr_bhs; i++) {
+			if (fatent->bhs[n] == bhs[i])
+				break;
+		}
+		if (i == *nr_bhs) {
+			get_bh(fatent->bhs[n]);
+			bhs[i] = fatent->bhs[n];
+			(*nr_bhs)++;
+		}
+	}
+}
+
+int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct fatent_operations *ops = sbi->fatent_ops;
+	struct fat_entry fatent, prev_ent;
+	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
+	int i, count, err, nr_bhs, idx_clus;
+
+	BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2));	/* fixed limit */
+
+	lock_fat(sbi);
+	if (sbi->free_clusters != -1 && sbi->free_clus_valid &&
+	    sbi->free_clusters < nr_cluster) {
+		unlock_fat(sbi);
+		return -ENOSPC;
+	}
+
+	err = nr_bhs = idx_clus = 0;
+	count = FAT_START_ENT;
+	fatent_init(&prev_ent);
+	fatent_init(&fatent);
+	fatent_set_entry(&fatent, sbi->prev_free + 1);
+	while (count < sbi->max_cluster) {
+		if (fatent.entry >= sbi->max_cluster)
+			fatent.entry = FAT_START_ENT;
+		fatent_set_entry(&fatent, fatent.entry);
+		err = fat_ent_read_block(sb, &fatent);
+		if (err)
+			goto out;
+
+		/* Find the free entries in a block */
+		do {
+			if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
+				int entry = fatent.entry;
+
+				/* make the cluster chain */
+				ops->ent_put(&fatent, FAT_ENT_EOF);
+				if (prev_ent.nr_bhs)
+					ops->ent_put(&prev_ent, entry);
+
+				fat_collect_bhs(bhs, &nr_bhs, &fatent);
+
+				sbi->prev_free = entry;
+				if (sbi->free_clusters != -1)
+					sbi->free_clusters--;
+
+				cluster[idx_clus] = entry;
+				idx_clus++;
+				if (idx_clus == nr_cluster)
+					goto out;
+
+				/*
+				 * fat_collect_bhs() gets ref-count of bhs,
+				 * so we can still use the prev_ent.
+				 */
+				prev_ent = fatent;
+			}
+			count++;
+			if (count == sbi->max_cluster)
+				break;
+		} while (fat_ent_next(sbi, &fatent));
+	}
+
+	/* Couldn't allocate the free entries */
+	sbi->free_clusters = 0;
+	sbi->free_clus_valid = 1;
+	err = -ENOSPC;
+
+out:
+	unlock_fat(sbi);
+	mark_fsinfo_dirty(sb);
+	fatent_brelse(&fatent);
+	if (!err) {
+		if (inode_needs_sync(inode))
+			err = fat_sync_bhs(bhs, nr_bhs);
+		if (!err)
+			err = fat_mirror_bhs(sb, bhs, nr_bhs);
+	}
+	for (i = 0; i < nr_bhs; i++)
+		brelse(bhs[i]);
+
+	if (err && idx_clus)
+		fat_free_clusters(inode, cluster[0]);
+
+	return err;
+}
+
+int fat_free_clusters(struct inode *inode, int cluster)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct fatent_operations *ops = sbi->fatent_ops;
+	struct fat_entry fatent;
+	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
+	int i, err, nr_bhs;
+	int first_cl = cluster, dirty_fsinfo = 0;
+
+	nr_bhs = 0;
+	fatent_init(&fatent);
+	lock_fat(sbi);
+	do {
+		cluster = fat_ent_read(inode, &fatent, cluster);
+		if (cluster < 0) {
+			err = cluster;
+			goto error;
+		} else if (cluster == FAT_ENT_FREE) {
+			fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
+				     __func__);
+			err = -EIO;
+			goto error;
+		}
+
+		if (sbi->options.discard) {
+			/*
+			 * Issue discard for the sectors we no longer
+			 * care about, batching contiguous clusters
+			 * into one request
+			 */
+			if (cluster != fatent.entry + 1) {
+				int nr_clus = fatent.entry - first_cl + 1;
+
+				sb_issue_discard(sb,
+					fat_clus_to_blknr(sbi, first_cl),
+					nr_clus * sbi->sec_per_clus,
+					GFP_NOFS, 0);
+
+				first_cl = cluster;
+			}
+		}
+
+		ops->ent_put(&fatent, FAT_ENT_FREE);
+		if (sbi->free_clusters != -1) {
+			sbi->free_clusters++;
+			dirty_fsinfo = 1;
+		}
+
+		if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) {
+			if (sb->s_flags & MS_SYNCHRONOUS) {
+				err = fat_sync_bhs(bhs, nr_bhs);
+				if (err)
+					goto error;
+			}
+			err = fat_mirror_bhs(sb, bhs, nr_bhs);
+			if (err)
+				goto error;
+			for (i = 0; i < nr_bhs; i++)
+				brelse(bhs[i]);
+			nr_bhs = 0;
+		}
+		fat_collect_bhs(bhs, &nr_bhs, &fatent);
+	} while (cluster != FAT_ENT_EOF);
+
+	if (sb->s_flags & MS_SYNCHRONOUS) {
+		err = fat_sync_bhs(bhs, nr_bhs);
+		if (err)
+			goto error;
+	}
+	err = fat_mirror_bhs(sb, bhs, nr_bhs);
+error:
+	fatent_brelse(&fatent);
+	for (i = 0; i < nr_bhs; i++)
+		brelse(bhs[i]);
+	unlock_fat(sbi);
+	if (dirty_fsinfo)
+		mark_fsinfo_dirty(sb);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(fat_free_clusters);
+
+/* 128kb is the whole sectors for FAT12 and FAT16 */
+#define FAT_READA_SIZE		(128 * 1024)
+
+static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
+			  unsigned long reada_blocks)
+{
+	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+	sector_t blocknr;
+	int i, offset;
+
+	ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
+
+	for (i = 0; i < reada_blocks; i++)
+		sb_breadahead(sb, blocknr + i);
+}
+
+int fat_count_free_clusters(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct fatent_operations *ops = sbi->fatent_ops;
+	struct fat_entry fatent;
+	unsigned long reada_blocks, reada_mask, cur_block;
+	int err = 0, free;
+
+	lock_fat(sbi);
+	if (sbi->free_clusters != -1 && sbi->free_clus_valid)
+		goto out;
+
+	reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
+	reada_mask = reada_blocks - 1;
+	cur_block = 0;
+
+	free = 0;
+	fatent_init(&fatent);
+	fatent_set_entry(&fatent, FAT_START_ENT);
+	while (fatent.entry < sbi->max_cluster) {
+		/* readahead of fat blocks */
+		if ((cur_block & reada_mask) == 0) {
+			unsigned long rest = sbi->fat_length - cur_block;
+			fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
+		}
+		cur_block++;
+
+		err = fat_ent_read_block(sb, &fatent);
+		if (err)
+			goto out;
+
+		do {
+			if (ops->ent_get(&fatent) == FAT_ENT_FREE)
+				free++;
+		} while (fat_ent_next(sbi, &fatent));
+	}
+	sbi->free_clusters = free;
+	sbi->free_clus_valid = 1;
+	mark_fsinfo_dirty(sb);
+	fatent_brelse(&fatent);
+out:
+	unlock_fat(sbi);
+	return err;
+}
diff --git a/kernel/fs/fat/file.c b/kernel/fs/fat/file.c
new file mode 100644
index 000000000..442d50a0e
--- /dev/null
+++ b/kernel/fs/fat/file.c
@@ -0,0 +1,459 @@
+/*
+ *  linux/fs/fat/file.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  regular file handling primitives for fat-based filesystems
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/blkdev.h>
+#include <linux/fsnotify.h>
+#include <linux/security.h>
+#include "fat.h"
+
+static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
+{
+	u32 attr;
+
+	mutex_lock(&inode->i_mutex);
+	attr = fat_make_attrs(inode);
+	mutex_unlock(&inode->i_mutex);
+
+	return put_user(attr, user_attr);
+}
+
+static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
+{
+	struct inode *inode = file_inode(file);
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	int is_dir = S_ISDIR(inode->i_mode);
+	u32 attr, oldattr;
+	struct iattr ia;
+	int err;
+
+	err = get_user(attr, user_attr);
+	if (err)
+		goto out;
+
+	err = mnt_want_write_file(file);
+	if (err)
+		goto out;
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
+	 * prevents the user from turning us into a VFAT
+	 * longname entry.  Also, we obviously can't set
+	 * any of the NTFS attributes in the high 24 bits.
+	 */
+	attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
+	/* Merge in ATTR_VOLUME and ATTR_DIR */
+	attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
+		(is_dir ? ATTR_DIR : 0);
+	oldattr = fat_make_attrs(inode);
+
+	/* Equivalent to a chmod() */
+	ia.ia_valid = ATTR_MODE | ATTR_CTIME;
+	ia.ia_ctime = current_fs_time(inode->i_sb);
+	if (is_dir)
+		ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
+	else {
+		ia.ia_mode = fat_make_mode(sbi, attr,
+			S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
+	}
+
+	/* The root directory has no attributes */
+	if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
+		err = -EINVAL;
+		goto out_unlock_inode;
+	}
+
+	if (sbi->options.sys_immutable &&
+	    ((attr | oldattr) & ATTR_SYS) &&
+	    !capable(CAP_LINUX_IMMUTABLE)) {
+		err = -EPERM;
+		goto out_unlock_inode;
+	}
+
+	/*
+	 * The security check is questionable...  We single
+	 * out the RO attribute for checking by the security
+	 * module, just because it maps to a file mode.
+	 */
+	err = security_inode_setattr(file->f_path.dentry, &ia);
+	if (err)
+		goto out_unlock_inode;
+
+	/* This MUST be done before doing anything irreversible... */
+	err = fat_setattr(file->f_path.dentry, &ia);
+	if (err)
+		goto out_unlock_inode;
+
+	fsnotify_change(file->f_path.dentry, ia.ia_valid);
+	if (sbi->options.sys_immutable) {
+		if (attr & ATTR_SYS)
+			inode->i_flags |= S_IMMUTABLE;
+		else
+			inode->i_flags &= ~S_IMMUTABLE;
+	}
+
+	fat_save_attrs(inode, attr);
+	mark_inode_dirty(inode);
+out_unlock_inode:
+	mutex_unlock(&inode->i_mutex);
+	mnt_drop_write_file(file);
+out:
+	return err;
+}
+
+static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	return put_user(sbi->vol_id, user_attr);
+}
+
+long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	u32 __user *user_attr = (u32 __user *)arg;
+
+	switch (cmd) {
+	case FAT_IOCTL_GET_ATTRIBUTES:
+		return fat_ioctl_get_attributes(inode, user_attr);
+	case FAT_IOCTL_SET_ATTRIBUTES:
+		return fat_ioctl_set_attributes(filp, user_attr);
+	case FAT_IOCTL_GET_VOLUME_ID:
+		return fat_ioctl_get_volume_id(inode, user_attr);
+	default:
+		return -ENOTTY;	/* Inappropriate ioctl for device */
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
+				      unsigned long arg)
+
+{
+	return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static int fat_file_release(struct inode *inode, struct file *filp)
+{
+	if ((filp->f_mode & FMODE_WRITE) &&
+	     MSDOS_SB(inode->i_sb)->options.flush) {
+		fat_flush_inodes(inode->i_sb, inode, NULL);
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+	}
+	return 0;
+}
+
+int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int res, err;
+
+	res = generic_file_fsync(filp, start, end, datasync);
+	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+
+	return res ? res : err;
+}
+
+
+const struct file_operations fat_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.release	= fat_file_release,
+	.unlocked_ioctl	= fat_generic_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= fat_generic_compat_ioctl,
+#endif
+	.fsync		= fat_file_fsync,
+	.splice_read	= generic_file_splice_read,
+};
+
+static int fat_cont_expand(struct inode *inode, loff_t size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	loff_t start = inode->i_size, count = size - inode->i_size;
+	int err;
+
+	err = generic_cont_expand_simple(inode, size);
+	if (err)
+		goto out;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+	if (IS_SYNC(inode)) {
+		int err2;
+
+		/*
+		 * Opencode syncing since we don't have a file open to use
+		 * standard fsync path.
+		 */
+		err = filemap_fdatawrite_range(mapping, start,
+					       start + count - 1);
+		err2 = sync_mapping_buffers(mapping);
+		if (!err)
+			err = err2;
+		err2 = write_inode_now(inode, 1);
+		if (!err)
+			err = err2;
+		if (!err) {
+			err =  filemap_fdatawait_range(mapping, start,
+						       start + count - 1);
+		}
+	}
+out:
+	return err;
+}
+
+/* Free all clusters after the skip'th cluster. */
+static int fat_free(struct inode *inode, int skip)
+{
+	struct super_block *sb = inode->i_sb;
+	int err, wait, free_start, i_start, i_logstart;
+
+	if (MSDOS_I(inode)->i_start == 0)
+		return 0;
+
+	fat_cache_inval_inode(inode);
+
+	wait = IS_DIRSYNC(inode);
+	i_start = free_start = MSDOS_I(inode)->i_start;
+	i_logstart = MSDOS_I(inode)->i_logstart;
+
+	/* First, we write the new file size. */
+	if (!skip) {
+		MSDOS_I(inode)->i_start = 0;
+		MSDOS_I(inode)->i_logstart = 0;
+	}
+	MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	if (wait) {
+		err = fat_sync_inode(inode);
+		if (err) {
+			MSDOS_I(inode)->i_start = i_start;
+			MSDOS_I(inode)->i_logstart = i_logstart;
+			return err;
+		}
+	} else
+		mark_inode_dirty(inode);
+
+	/* Write a new EOF, and get the remaining cluster chain for freeing. */
+	if (skip) {
+		struct fat_entry fatent;
+		int ret, fclus, dclus;
+
+		ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus);
+		if (ret < 0)
+			return ret;
+		else if (ret == FAT_ENT_EOF)
+			return 0;
+
+		fatent_init(&fatent);
+		ret = fat_ent_read(inode, &fatent, dclus);
+		if (ret == FAT_ENT_EOF) {
+			fatent_brelse(&fatent);
+			return 0;
+		} else if (ret == FAT_ENT_FREE) {
+			fat_fs_error(sb,
+				     "%s: invalid cluster chain (i_pos %lld)",
+				     __func__, MSDOS_I(inode)->i_pos);
+			ret = -EIO;
+		} else if (ret > 0) {
+			err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait);
+			if (err)
+				ret = err;
+		}
+		fatent_brelse(&fatent);
+		if (ret < 0)
+			return ret;
+
+		free_start = ret;
+	}
+	inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9);
+
+	/* Freeing the remained cluster chain */
+	return fat_free_clusters(inode, free_start);
+}
+
+void fat_truncate_blocks(struct inode *inode, loff_t offset)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	const unsigned int cluster_size = sbi->cluster_size;
+	int nr_clusters;
+
+	/*
+	 * This protects against truncating a file bigger than it was then
+	 * trying to write into the hole.
+	 */
+	if (MSDOS_I(inode)->mmu_private > offset)
+		MSDOS_I(inode)->mmu_private = offset;
+
+	nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
+
+	fat_free(inode, nr_clusters);
+	fat_flush_inodes(inode->i_sb, inode, NULL);
+}
+
+int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	generic_fillattr(inode, stat);
+	stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
+
+	if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
+		/* Use i_pos for ino. This is used as fileid of nfs. */
+		stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fat_getattr);
+
+static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
+			     struct inode *inode, umode_t *mode_ptr)
+{
+	umode_t mask, perm;
+
+	/*
+	 * Note, the basic check is already done by a caller of
+	 * (attr->ia_mode & ~FAT_VALID_MODE)
+	 */
+
+	if (S_ISREG(inode->i_mode))
+		mask = sbi->options.fs_fmask;
+	else
+		mask = sbi->options.fs_dmask;
+
+	perm = *mode_ptr & ~(S_IFMT | mask);
+
+	/*
+	 * Of the r and x bits, all (subject to umask) must be present. Of the
+	 * w bits, either all (subject to umask) or none must be present.
+	 *
+	 * If fat_mode_can_hold_ro(inode) is false, can't change w bits.
+	 */
+	if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
+		return -EPERM;
+	if (fat_mode_can_hold_ro(inode)) {
+		if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
+			return -EPERM;
+	} else {
+		if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
+			return -EPERM;
+	}
+
+	*mode_ptr &= S_IFMT | perm;
+
+	return 0;
+}
+
+static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
+{
+	umode_t allow_utime = sbi->options.allow_utime;
+
+	if (!uid_eq(current_fsuid(), inode->i_uid)) {
+		if (in_group_p(inode->i_gid))
+			allow_utime >>= 3;
+		if (allow_utime & MAY_WRITE)
+			return 1;
+	}
+
+	/* use a default check */
+	return 0;
+}
+
+#define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+/* valid file mode bits */
+#define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
+
+int fat_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
+	struct inode *inode = d_inode(dentry);
+	unsigned int ia_valid;
+	int error;
+
+	/* Check for setting the inode time. */
+	ia_valid = attr->ia_valid;
+	if (ia_valid & TIMES_SET_FLAGS) {
+		if (fat_allow_set_time(sbi, inode))
+			attr->ia_valid &= ~TIMES_SET_FLAGS;
+	}
+
+	error = inode_change_ok(inode, attr);
+	attr->ia_valid = ia_valid;
+	if (error) {
+		if (sbi->options.quiet)
+			error = 0;
+		goto out;
+	}
+
+	/*
+	 * Expand the file. Since inode_setattr() updates ->i_size
+	 * before calling the ->truncate(), but FAT needs to fill the
+	 * hole before it. XXX: this is no longer true with new truncate
+	 * sequence.
+	 */
+	if (attr->ia_valid & ATTR_SIZE) {
+		inode_dio_wait(inode);
+
+		if (attr->ia_size > inode->i_size) {
+			error = fat_cont_expand(inode, attr->ia_size);
+			if (error || attr->ia_valid == ATTR_SIZE)
+				goto out;
+			attr->ia_valid &= ~ATTR_SIZE;
+		}
+	}
+
+	if (((attr->ia_valid & ATTR_UID) &&
+	     (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) ||
+	    ((attr->ia_valid & ATTR_GID) &&
+	     (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) ||
+	    ((attr->ia_valid & ATTR_MODE) &&
+	     (attr->ia_mode & ~FAT_VALID_MODE)))
+		error = -EPERM;
+
+	if (error) {
+		if (sbi->options.quiet)
+			error = 0;
+		goto out;
+	}
+
+	/*
+	 * We don't return -EPERM here. Yes, strange, but this is too
+	 * old behavior.
+	 */
+	if (attr->ia_valid & ATTR_MODE) {
+		if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
+			attr->ia_valid &= ~ATTR_MODE;
+	}
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		error = fat_block_truncate_page(inode, attr->ia_size);
+		if (error)
+			goto out;
+		down_write(&MSDOS_I(inode)->truncate_lock);
+		truncate_setsize(inode, attr->ia_size);
+		fat_truncate_blocks(inode, attr->ia_size);
+		up_write(&MSDOS_I(inode)->truncate_lock);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+out:
+	return error;
+}
+EXPORT_SYMBOL_GPL(fat_setattr);
+
+const struct inode_operations fat_file_inode_operations = {
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
diff --git a/kernel/fs/fat/inode.c b/kernel/fs/fat/inode.c
new file mode 100644
index 000000000..c06774658
--- /dev/null
+++ b/kernel/fs/fat/inode.c
@@ -0,0 +1,1850 @@
+/*
+ *  linux/fs/fat/inode.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner
+ *  Rewritten for the constant inumbers support by Al Viro
+ *
+ *  Fixes:
+ *
+ *	Max Cohan: Fixed invalid FSINFO offset when info_sector is 0
+ */
+
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/mpage.h>
+#include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <asm/unaligned.h>
+#include "fat.h"
+
+#ifndef CONFIG_FAT_DEFAULT_IOCHARSET
+/* if user don't select VFAT, this is undefined. */
+#define CONFIG_FAT_DEFAULT_IOCHARSET	""
+#endif
+
+#define KB_IN_SECTORS 2
+
+/*
+ * A deserialized copy of the on-disk structure laid out in struct
+ * fat_boot_sector.
+ */
+struct fat_bios_param_block {
+	u16	fat_sector_size;
+	u8	fat_sec_per_clus;
+	u16	fat_reserved;
+	u8	fat_fats;
+	u16	fat_dir_entries;
+	u16	fat_sectors;
+	u16	fat_fat_length;
+	u32	fat_total_sect;
+
+	u8	fat16_state;
+	u32	fat16_vol_id;
+
+	u32	fat32_length;
+	u32	fat32_root_cluster;
+	u16	fat32_info_sector;
+	u8	fat32_state;
+	u32	fat32_vol_id;
+};
+
+static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE;
+static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET;
+
+static struct fat_floppy_defaults {
+	unsigned nr_sectors;
+	unsigned sec_per_clus;
+	unsigned dir_entries;
+	unsigned media;
+	unsigned fat_length;
+} floppy_defaults[] = {
+{
+	.nr_sectors = 160 * KB_IN_SECTORS,
+	.sec_per_clus = 1,
+	.dir_entries = 64,
+	.media = 0xFE,
+	.fat_length = 1,
+},
+{
+	.nr_sectors = 180 * KB_IN_SECTORS,
+	.sec_per_clus = 1,
+	.dir_entries = 64,
+	.media = 0xFC,
+	.fat_length = 2,
+},
+{
+	.nr_sectors = 320 * KB_IN_SECTORS,
+	.sec_per_clus = 2,
+	.dir_entries = 112,
+	.media = 0xFF,
+	.fat_length = 1,
+},
+{
+	.nr_sectors = 360 * KB_IN_SECTORS,
+	.sec_per_clus = 2,
+	.dir_entries = 112,
+	.media = 0xFD,
+	.fat_length = 2,
+},
+};
+
+static int fat_add_cluster(struct inode *inode)
+{
+	int err, cluster;
+
+	err = fat_alloc_clusters(inode, &cluster, 1);
+	if (err)
+		return err;
+	/* FIXME: this cluster should be added after data of this
+	 * cluster is writed */
+	err = fat_chain_add(inode, cluster, 1);
+	if (err)
+		fat_free_clusters(inode, cluster);
+	return err;
+}
+
+static inline int __fat_get_block(struct inode *inode, sector_t iblock,
+				  unsigned long *max_blocks,
+				  struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	unsigned long mapped_blocks;
+	sector_t phys;
+	int err, offset;
+
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+	if (err)
+		return err;
+	if (phys) {
+		map_bh(bh_result, sb, phys);
+		*max_blocks = min(mapped_blocks, *max_blocks);
+		return 0;
+	}
+	if (!create)
+		return 0;
+
+	if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
+		fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)",
+			MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
+		return -EIO;
+	}
+
+	offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
+	if (!offset) {
+		/* TODO: multiple cluster allocation would be desirable. */
+		err = fat_add_cluster(inode);
+		if (err)
+			return err;
+	}
+	/* available blocks on this cluster */
+	mapped_blocks = sbi->sec_per_clus - offset;
+
+	*max_blocks = min(mapped_blocks, *max_blocks);
+	MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
+
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+	if (err)
+		return err;
+
+	BUG_ON(!phys);
+	BUG_ON(*max_blocks != mapped_blocks);
+	set_buffer_new(bh_result);
+	map_bh(bh_result, sb, phys);
+
+	return 0;
+}
+
+static int fat_get_block(struct inode *inode, sector_t iblock,
+			 struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int err;
+
+	err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create);
+	if (err)
+		return err;
+	bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+	return 0;
+}
+
+static int fat_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, fat_get_block, wbc);
+}
+
+static int fat_writepages(struct address_space *mapping,
+			  struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, fat_get_block);
+}
+
+static int fat_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, fat_get_block);
+}
+
+static int fat_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
+}
+
+static void fat_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		fat_truncate_blocks(inode, inode->i_size);
+	}
+}
+
+static int fat_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int err;
+
+	*pagep = NULL;
+	err = cont_write_begin(file, mapping, pos, len, flags,
+				pagep, fsdata, fat_get_block,
+				&MSDOS_I(mapping->host)->mmu_private);
+	if (err < 0)
+		fat_write_failed(mapping, pos + len);
+	return err;
+}
+
+static int fat_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *pagep, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int err;
+	err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+	if (err < len)
+		fat_write_failed(mapping, pos + len);
+	if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+		MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
+		mark_inode_dirty(inode);
+	}
+	return err;
+}
+
+static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			     loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	if (iov_iter_rw(iter) == WRITE) {
+		/*
+		 * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
+		 * so we need to update the ->mmu_private to block boundary.
+		 *
+		 * But we must fill the remaining area or hole by nul for
+		 * updating ->mmu_private.
+		 *
+		 * Return 0, and fallback to normal buffered write.
+		 */
+		loff_t size = offset + count;
+		if (MSDOS_I(inode)->mmu_private < size)
+			return 0;
+	}
+
+	/*
+	 * FAT need to use the DIO_LOCKING for avoiding the race
+	 * condition of fat_get_block() and ->truncate().
+	 */
+	ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block);
+	if (ret < 0 && iov_iter_rw(iter) == WRITE)
+		fat_write_failed(mapping, offset + count);
+
+	return ret;
+}
+
+static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t blocknr;
+
+	/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
+	down_read(&MSDOS_I(mapping->host)->truncate_lock);
+	blocknr = generic_block_bmap(mapping, block, fat_get_block);
+	up_read(&MSDOS_I(mapping->host)->truncate_lock);
+
+	return blocknr;
+}
+
+/*
+ * fat_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate to physically zeroout the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ * Also, avoid causing failure from fsx for cases of "data past EOF"
+ */
+int fat_block_truncate_page(struct inode *inode, loff_t from)
+{
+	return block_truncate_page(inode->i_mapping, from, fat_get_block);
+}
+
+static const struct address_space_operations fat_aops = {
+	.readpage	= fat_readpage,
+	.readpages	= fat_readpages,
+	.writepage	= fat_writepage,
+	.writepages	= fat_writepages,
+	.write_begin	= fat_write_begin,
+	.write_end	= fat_write_end,
+	.direct_IO	= fat_direct_IO,
+	.bmap		= _fat_bmap
+};
+
+/*
+ * New FAT inode stuff. We do the following:
+ *	a) i_ino is constant and has nothing with on-disk location.
+ *	b) FAT manages its own cache of directory entries.
+ *	c) *This* cache is indexed by on-disk location.
+ *	d) inode has an associated directory entry, all right, but
+ *		it may be unhashed.
+ *	e) currently entries are stored within struct inode. That should
+ *		change.
+ *	f) we deal with races in the following way:
+ *		1. readdir() and lookup() do FAT-dir-cache lookup.
+ *		2. rename() unhashes the F-d-c entry and rehashes it in
+ *			a new place.
+ *		3. unlink() and rmdir() unhash F-d-c entry.
+ *		4. fat_write_inode() checks whether the thing is unhashed.
+ *			If it is we silently return. If it isn't we do bread(),
+ *			check if the location is still valid and retry if it
+ *			isn't. Otherwise we do changes.
+ *		5. Spinlock is used to protect hash/unhash/location check/lookup
+ *		6. fat_evict_inode() unhashes the F-d-c entry.
+ *		7. lookup() and readdir() do igrab() if they find a F-d-c entry
+ *			and consider negative result as cache miss.
+ */
+
+static void fat_hash_init(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int i;
+
+	spin_lock_init(&sbi->inode_hash_lock);
+	for (i = 0; i < FAT_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
+}
+
+static inline unsigned long fat_hash(loff_t i_pos)
+{
+	return hash_32(i_pos, FAT_HASH_BITS);
+}
+
+static void dir_hash_init(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int i;
+
+	spin_lock_init(&sbi->dir_hash_lock);
+	for (i = 0; i < FAT_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&sbi->dir_hashtable[i]);
+}
+
+void fat_attach(struct inode *inode, loff_t i_pos)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+
+	if (inode->i_ino != MSDOS_ROOT_INO) {
+		struct hlist_head *head =   sbi->inode_hashtable
+					  + fat_hash(i_pos);
+
+		spin_lock(&sbi->inode_hash_lock);
+		MSDOS_I(inode)->i_pos = i_pos;
+		hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
+		spin_unlock(&sbi->inode_hash_lock);
+	}
+
+	/* If NFS support is enabled, cache the mapping of start cluster
+	 * to directory inode. This is used during reconnection of
+	 * dentries to the filesystem root.
+	 */
+	if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
+		struct hlist_head *d_head = sbi->dir_hashtable;
+		d_head += fat_dir_hash(MSDOS_I(inode)->i_logstart);
+
+		spin_lock(&sbi->dir_hash_lock);
+		hlist_add_head(&MSDOS_I(inode)->i_dir_hash, d_head);
+		spin_unlock(&sbi->dir_hash_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(fat_attach);
+
+void fat_detach(struct inode *inode)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	spin_lock(&sbi->inode_hash_lock);
+	MSDOS_I(inode)->i_pos = 0;
+	hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
+	spin_unlock(&sbi->inode_hash_lock);
+
+	if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
+		spin_lock(&sbi->dir_hash_lock);
+		hlist_del_init(&MSDOS_I(inode)->i_dir_hash);
+		spin_unlock(&sbi->dir_hash_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(fat_detach);
+
+struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
+	struct msdos_inode_info *i;
+	struct inode *inode = NULL;
+
+	spin_lock(&sbi->inode_hash_lock);
+	hlist_for_each_entry(i, head, i_fat_hash) {
+		BUG_ON(i->vfs_inode.i_sb != sb);
+		if (i->i_pos != i_pos)
+			continue;
+		inode = igrab(&i->vfs_inode);
+		if (inode)
+			break;
+	}
+	spin_unlock(&sbi->inode_hash_lock);
+	return inode;
+}
+
+static int is_exec(unsigned char *extension)
+{
+	unsigned char exe_extensions[] = "EXECOMBAT", *walk;
+
+	for (walk = exe_extensions; *walk; walk += 3)
+		if (!strncmp(extension, walk, 3))
+			return 1;
+	return 0;
+}
+
+static int fat_calc_dir_size(struct inode *inode)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	int ret, fclus, dclus;
+
+	inode->i_size = 0;
+	if (MSDOS_I(inode)->i_start == 0)
+		return 0;
+
+	ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus);
+	if (ret < 0)
+		return ret;
+	inode->i_size = (fclus + 1) << sbi->cluster_bits;
+
+	return 0;
+}
+
+/* doesn't deal with root inode */
+int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	int error;
+
+	MSDOS_I(inode)->i_pos = 0;
+	inode->i_uid = sbi->options.fs_uid;
+	inode->i_gid = sbi->options.fs_gid;
+	inode->i_version++;
+	inode->i_generation = get_seconds();
+
+	if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
+		inode->i_generation &= ~1;
+		inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO);
+		inode->i_op = sbi->dir_ops;
+		inode->i_fop = &fat_dir_operations;
+
+		MSDOS_I(inode)->i_start = fat_get_start(sbi, de);
+		MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start;
+		error = fat_calc_dir_size(inode);
+		if (error < 0)
+			return error;
+		MSDOS_I(inode)->mmu_private = inode->i_size;
+
+		set_nlink(inode, fat_subdirs(inode));
+	} else { /* not a directory */
+		inode->i_generation |= 1;
+		inode->i_mode = fat_make_mode(sbi, de->attr,
+			((sbi->options.showexec && !is_exec(de->name + 8))
+			 ? S_IRUGO|S_IWUGO : S_IRWXUGO));
+		MSDOS_I(inode)->i_start = fat_get_start(sbi, de);
+
+		MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start;
+		inode->i_size = le32_to_cpu(de->size);
+		inode->i_op = &fat_file_inode_operations;
+		inode->i_fop = &fat_file_operations;
+		inode->i_mapping->a_ops = &fat_aops;
+		MSDOS_I(inode)->mmu_private = inode->i_size;
+	}
+	if (de->attr & ATTR_SYS) {
+		if (sbi->options.sys_immutable)
+			inode->i_flags |= S_IMMUTABLE;
+	}
+	fat_save_attrs(inode, de->attr);
+
+	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
+			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
+
+	fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
+	if (sbi->options.isvfat) {
+		fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime,
+				  de->cdate, de->ctime_cs);
+		fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
+	} else
+		inode->i_ctime = inode->i_atime = inode->i_mtime;
+
+	return 0;
+}
+
+static inline void fat_lock_build_inode(struct msdos_sb_info *sbi)
+{
+	if (sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+		mutex_lock(&sbi->nfs_build_inode_lock);
+}
+
+static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi)
+{
+	if (sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+		mutex_unlock(&sbi->nfs_build_inode_lock);
+}
+
+struct inode *fat_build_inode(struct super_block *sb,
+			struct msdos_dir_entry *de, loff_t i_pos)
+{
+	struct inode *inode;
+	int err;
+
+	fat_lock_build_inode(MSDOS_SB(sb));
+	inode = fat_iget(sb, i_pos);
+	if (inode)
+		goto out;
+	inode = new_inode(sb);
+	if (!inode) {
+		inode = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	inode->i_ino = iunique(sb, MSDOS_ROOT_INO);
+	inode->i_version = 1;
+	err = fat_fill_inode(inode, de);
+	if (err) {
+		iput(inode);
+		inode = ERR_PTR(err);
+		goto out;
+	}
+	fat_attach(inode, i_pos);
+	insert_inode_hash(inode);
+out:
+	fat_unlock_build_inode(MSDOS_SB(sb));
+	return inode;
+}
+
+EXPORT_SYMBOL_GPL(fat_build_inode);
+
+static void fat_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		fat_truncate_blocks(inode, 0);
+	}
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+	fat_cache_inval_inode(inode);
+	fat_detach(inode);
+}
+
+static void fat_set_state(struct super_block *sb,
+			unsigned int set, unsigned int force)
+{
+	struct buffer_head *bh;
+	struct fat_boot_sector *b;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	/* do not change any thing if mounted read only */
+	if ((sb->s_flags & MS_RDONLY) && !force)
+		return;
+
+	/* do not change state if fs was dirty */
+	if (sbi->dirty) {
+		/* warn only on set (mount). */
+		if (set)
+			fat_msg(sb, KERN_WARNING, "Volume was not properly "
+				"unmounted. Some data may be corrupt. "
+				"Please run fsck.");
+		return;
+	}
+
+	bh = sb_bread(sb, 0);
+	if (bh == NULL) {
+		fat_msg(sb, KERN_ERR, "unable to read boot sector "
+			"to mark fs as dirty");
+		return;
+	}
+
+	b = (struct fat_boot_sector *) bh->b_data;
+
+	if (sbi->fat_bits == 32) {
+		if (set)
+			b->fat32.state |= FAT_STATE_DIRTY;
+		else
+			b->fat32.state &= ~FAT_STATE_DIRTY;
+	} else /* fat 16 and 12 */ {
+		if (set)
+			b->fat16.state |= FAT_STATE_DIRTY;
+		else
+			b->fat16.state &= ~FAT_STATE_DIRTY;
+	}
+
+	mark_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+}
+
+static void delayed_free(struct rcu_head *p)
+{
+	struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu);
+	unload_nls(sbi->nls_disk);
+	unload_nls(sbi->nls_io);
+	if (sbi->options.iocharset != fat_default_iocharset)
+		kfree(sbi->options.iocharset);
+	kfree(sbi);
+}
+
+static void fat_put_super(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	fat_set_state(sb, 0, 0);
+
+	iput(sbi->fsinfo_inode);
+	iput(sbi->fat_inode);
+
+	call_rcu(&sbi->rcu, delayed_free);
+}
+
+static struct kmem_cache *fat_inode_cachep;
+
+static struct inode *fat_alloc_inode(struct super_block *sb)
+{
+	struct msdos_inode_info *ei;
+	ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+
+	init_rwsem(&ei->truncate_lock);
+	return &ei->vfs_inode;
+}
+
+static void fat_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
+}
+
+static void fat_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, fat_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
+
+	spin_lock_init(&ei->cache_lru_lock);
+	ei->nr_caches = 0;
+	ei->cache_valid_id = FAT_CACHE_VALID + 1;
+	INIT_LIST_HEAD(&ei->cache_lru);
+	INIT_HLIST_NODE(&ei->i_fat_hash);
+	INIT_HLIST_NODE(&ei->i_dir_hash);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init fat_init_inodecache(void)
+{
+	fat_inode_cachep = kmem_cache_create("fat_inode_cache",
+					     sizeof(struct msdos_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (fat_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void __exit fat_destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(fat_inode_cachep);
+}
+
+static int fat_remount(struct super_block *sb, int *flags, char *data)
+{
+	int new_rdonly;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	*flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
+
+	sync_filesystem(sb);
+
+	/* make sure we update state on remount. */
+	new_rdonly = *flags & MS_RDONLY;
+	if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
+		if (new_rdonly)
+			fat_set_state(sb, 0, 0);
+		else
+			fat_set_state(sb, 1, 1);
+	}
+	return 0;
+}
+
+static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	/* If the count of free cluster is still unknown, counts it here. */
+	if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
+		int err = fat_count_free_clusters(dentry->d_sb);
+		if (err)
+			return err;
+	}
+
+	buf->f_type = dentry->d_sb->s_magic;
+	buf->f_bsize = sbi->cluster_size;
+	buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
+	buf->f_bfree = sbi->free_clusters;
+	buf->f_bavail = sbi->free_clusters;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen =
+		(sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE;
+
+	return 0;
+}
+
+static int __fat_write_inode(struct inode *inode, int wait)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bh;
+	struct msdos_dir_entry *raw_entry;
+	loff_t i_pos;
+	sector_t blocknr;
+	int err, offset;
+
+	if (inode->i_ino == MSDOS_ROOT_INO)
+		return 0;
+
+retry:
+	i_pos = fat_i_pos_read(sbi, inode);
+	if (!i_pos)
+		return 0;
+
+	fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset);
+	bh = sb_bread(sb, blocknr);
+	if (!bh) {
+		fat_msg(sb, KERN_ERR, "unable to read inode block "
+		       "for updating (i_pos %lld)", i_pos);
+		return -EIO;
+	}
+	spin_lock(&sbi->inode_hash_lock);
+	if (i_pos != MSDOS_I(inode)->i_pos) {
+		spin_unlock(&sbi->inode_hash_lock);
+		brelse(bh);
+		goto retry;
+	}
+
+	raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset];
+	if (S_ISDIR(inode->i_mode))
+		raw_entry->size = 0;
+	else
+		raw_entry->size = cpu_to_le32(inode->i_size);
+	raw_entry->attr = fat_make_attrs(inode);
+	fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart);
+	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
+			  &raw_entry->date, NULL);
+	if (sbi->options.isvfat) {
+		__le16 atime;
+		fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
+				  &raw_entry->cdate, &raw_entry->ctime_cs);
+		fat_time_unix2fat(sbi, &inode->i_atime, &atime,
+				  &raw_entry->adate, NULL);
+	}
+	spin_unlock(&sbi->inode_hash_lock);
+	mark_buffer_dirty(bh);
+	err = 0;
+	if (wait)
+		err = sync_dirty_buffer(bh);
+	brelse(bh);
+	return err;
+}
+
+static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int err;
+
+	if (inode->i_ino == MSDOS_FSINFO_INO) {
+		struct super_block *sb = inode->i_sb;
+
+		mutex_lock(&MSDOS_SB(sb)->s_lock);
+		err = fat_clusters_flush(sb);
+		mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	} else
+		err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+
+	return err;
+}
+
+int fat_sync_inode(struct inode *inode)
+{
+	return __fat_write_inode(inode, 1);
+}
+
+EXPORT_SYMBOL_GPL(fat_sync_inode);
+
+static int fat_show_options(struct seq_file *m, struct dentry *root);
+static const struct super_operations fat_sops = {
+	.alloc_inode	= fat_alloc_inode,
+	.destroy_inode	= fat_destroy_inode,
+	.write_inode	= fat_write_inode,
+	.evict_inode	= fat_evict_inode,
+	.put_super	= fat_put_super,
+	.statfs		= fat_statfs,
+	.remount_fs	= fat_remount,
+
+	.show_options	= fat_show_options,
+};
+
+static int fat_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb);
+	struct fat_mount_options *opts = &sbi->options;
+	int isvfat = opts->isvfat;
+
+	if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+				from_kuid_munged(&init_user_ns, opts->fs_uid));
+	if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+				from_kgid_munged(&init_user_ns, opts->fs_gid));
+	seq_printf(m, ",fmask=%04o", opts->fs_fmask);
+	seq_printf(m, ",dmask=%04o", opts->fs_dmask);
+	if (opts->allow_utime)
+		seq_printf(m, ",allow_utime=%04o", opts->allow_utime);
+	if (sbi->nls_disk)
+		/* strip "cp" prefix from displayed option */
+		seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]);
+	if (isvfat) {
+		if (sbi->nls_io)
+			seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
+
+		switch (opts->shortname) {
+		case VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95:
+			seq_puts(m, ",shortname=win95");
+			break;
+		case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT:
+			seq_puts(m, ",shortname=winnt");
+			break;
+		case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95:
+			seq_puts(m, ",shortname=mixed");
+			break;
+		case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
+			seq_puts(m, ",shortname=lower");
+			break;
+		default:
+			seq_puts(m, ",shortname=unknown");
+			break;
+		}
+	}
+	if (opts->name_check != 'n')
+		seq_printf(m, ",check=%c", opts->name_check);
+	if (opts->usefree)
+		seq_puts(m, ",usefree");
+	if (opts->quiet)
+		seq_puts(m, ",quiet");
+	if (opts->showexec)
+		seq_puts(m, ",showexec");
+	if (opts->sys_immutable)
+		seq_puts(m, ",sys_immutable");
+	if (!isvfat) {
+		if (opts->dotsOK)
+			seq_puts(m, ",dotsOK=yes");
+		if (opts->nocase)
+			seq_puts(m, ",nocase");
+	} else {
+		if (opts->utf8)
+			seq_puts(m, ",utf8");
+		if (opts->unicode_xlate)
+			seq_puts(m, ",uni_xlate");
+		if (!opts->numtail)
+			seq_puts(m, ",nonumtail");
+		if (opts->rodir)
+			seq_puts(m, ",rodir");
+	}
+	if (opts->flush)
+		seq_puts(m, ",flush");
+	if (opts->tz_set) {
+		if (opts->time_offset)
+			seq_printf(m, ",time_offset=%d", opts->time_offset);
+		else
+			seq_puts(m, ",tz=UTC");
+	}
+	if (opts->errors == FAT_ERRORS_CONT)
+		seq_puts(m, ",errors=continue");
+	else if (opts->errors == FAT_ERRORS_PANIC)
+		seq_puts(m, ",errors=panic");
+	else
+		seq_puts(m, ",errors=remount-ro");
+	if (opts->nfs == FAT_NFS_NOSTALE_RO)
+		seq_puts(m, ",nfs=nostale_ro");
+	else if (opts->nfs)
+		seq_puts(m, ",nfs=stale_rw");
+	if (opts->discard)
+		seq_puts(m, ",discard");
+	if (opts->dos1xfloppy)
+		seq_puts(m, ",dos1xfloppy");
+
+	return 0;
+}
+
+enum {
+	Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid,
+	Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage,
+	Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug,
+	Opt_immutable, Opt_dots, Opt_nodots,
+	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
+	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
+	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
+	Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
+	Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
+	Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
+};
+
+static const match_table_t fat_tokens = {
+	{Opt_check_r, "check=relaxed"},
+	{Opt_check_s, "check=strict"},
+	{Opt_check_n, "check=normal"},
+	{Opt_check_r, "check=r"},
+	{Opt_check_s, "check=s"},
+	{Opt_check_n, "check=n"},
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_umask, "umask=%o"},
+	{Opt_dmask, "dmask=%o"},
+	{Opt_fmask, "fmask=%o"},
+	{Opt_allow_utime, "allow_utime=%o"},
+	{Opt_codepage, "codepage=%u"},
+	{Opt_usefree, "usefree"},
+	{Opt_nocase, "nocase"},
+	{Opt_quiet, "quiet"},
+	{Opt_showexec, "showexec"},
+	{Opt_debug, "debug"},
+	{Opt_immutable, "sys_immutable"},
+	{Opt_flush, "flush"},
+	{Opt_tz_utc, "tz=UTC"},
+	{Opt_time_offset, "time_offset=%d"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_discard, "discard"},
+	{Opt_nfs_stale_rw, "nfs"},
+	{Opt_nfs_stale_rw, "nfs=stale_rw"},
+	{Opt_nfs_nostale_ro, "nfs=nostale_ro"},
+	{Opt_dos1xfloppy, "dos1xfloppy"},
+	{Opt_obsolete, "conv=binary"},
+	{Opt_obsolete, "conv=text"},
+	{Opt_obsolete, "conv=auto"},
+	{Opt_obsolete, "conv=b"},
+	{Opt_obsolete, "conv=t"},
+	{Opt_obsolete, "conv=a"},
+	{Opt_obsolete, "fat=%u"},
+	{Opt_obsolete, "blocksize=%u"},
+	{Opt_obsolete, "cvf_format=%20s"},
+	{Opt_obsolete, "cvf_options=%100s"},
+	{Opt_obsolete, "posix"},
+	{Opt_err, NULL},
+};
+static const match_table_t msdos_tokens = {
+	{Opt_nodots, "nodots"},
+	{Opt_nodots, "dotsOK=no"},
+	{Opt_dots, "dots"},
+	{Opt_dots, "dotsOK=yes"},
+	{Opt_err, NULL}
+};
+static const match_table_t vfat_tokens = {
+	{Opt_charset, "iocharset=%s"},
+	{Opt_shortname_lower, "shortname=lower"},
+	{Opt_shortname_win95, "shortname=win95"},
+	{Opt_shortname_winnt, "shortname=winnt"},
+	{Opt_shortname_mixed, "shortname=mixed"},
+	{Opt_utf8_no, "utf8=0"},		/* 0 or no or false */
+	{Opt_utf8_no, "utf8=no"},
+	{Opt_utf8_no, "utf8=false"},
+	{Opt_utf8_yes, "utf8=1"},		/* empty or 1 or yes or true */
+	{Opt_utf8_yes, "utf8=yes"},
+	{Opt_utf8_yes, "utf8=true"},
+	{Opt_utf8_yes, "utf8"},
+	{Opt_uni_xl_no, "uni_xlate=0"},		/* 0 or no or false */
+	{Opt_uni_xl_no, "uni_xlate=no"},
+	{Opt_uni_xl_no, "uni_xlate=false"},
+	{Opt_uni_xl_yes, "uni_xlate=1"},	/* empty or 1 or yes or true */
+	{Opt_uni_xl_yes, "uni_xlate=yes"},
+	{Opt_uni_xl_yes, "uni_xlate=true"},
+	{Opt_uni_xl_yes, "uni_xlate"},
+	{Opt_nonumtail_no, "nonumtail=0"},	/* 0 or no or false */
+	{Opt_nonumtail_no, "nonumtail=no"},
+	{Opt_nonumtail_no, "nonumtail=false"},
+	{Opt_nonumtail_yes, "nonumtail=1"},	/* empty or 1 or yes or true */
+	{Opt_nonumtail_yes, "nonumtail=yes"},
+	{Opt_nonumtail_yes, "nonumtail=true"},
+	{Opt_nonumtail_yes, "nonumtail"},
+	{Opt_rodir, "rodir"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(struct super_block *sb, char *options, int is_vfat,
+			 int silent, int *debug, struct fat_mount_options *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	char *iocharset;
+
+	opts->isvfat = is_vfat;
+
+	opts->fs_uid = current_uid();
+	opts->fs_gid = current_gid();
+	opts->fs_fmask = opts->fs_dmask = current_umask();
+	opts->allow_utime = -1;
+	opts->codepage = fat_default_codepage;
+	opts->iocharset = fat_default_iocharset;
+	if (is_vfat) {
+		opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
+		opts->rodir = 0;
+	} else {
+		opts->shortname = 0;
+		opts->rodir = 1;
+	}
+	opts->name_check = 'n';
+	opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK =  0;
+	opts->utf8 = opts->unicode_xlate = 0;
+	opts->numtail = 1;
+	opts->usefree = opts->nocase = 0;
+	opts->tz_set = 0;
+	opts->nfs = 0;
+	opts->errors = FAT_ERRORS_RO;
+	*debug = 0;
+
+	if (!options)
+		goto out;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, fat_tokens, args);
+		if (token == Opt_err) {
+			if (is_vfat)
+				token = match_token(p, vfat_tokens, args);
+			else
+				token = match_token(p, msdos_tokens, args);
+		}
+		switch (token) {
+		case Opt_check_s:
+			opts->name_check = 's';
+			break;
+		case Opt_check_r:
+			opts->name_check = 'r';
+			break;
+		case Opt_check_n:
+			opts->name_check = 'n';
+			break;
+		case Opt_usefree:
+			opts->usefree = 1;
+			break;
+		case Opt_nocase:
+			if (!is_vfat)
+				opts->nocase = 1;
+			else {
+				/* for backward compatibility */
+				opts->shortname = VFAT_SFN_DISPLAY_WIN95
+					| VFAT_SFN_CREATE_WIN95;
+			}
+			break;
+		case Opt_quiet:
+			opts->quiet = 1;
+			break;
+		case Opt_showexec:
+			opts->showexec = 1;
+			break;
+		case Opt_debug:
+			*debug = 1;
+			break;
+		case Opt_immutable:
+			opts->sys_immutable = 1;
+			break;
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			opts->fs_uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(opts->fs_uid))
+				return -EINVAL;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			opts->fs_gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(opts->fs_gid))
+				return -EINVAL;
+			break;
+		case Opt_umask:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->fs_fmask = opts->fs_dmask = option;
+			break;
+		case Opt_dmask:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->fs_dmask = option;
+			break;
+		case Opt_fmask:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->fs_fmask = option;
+			break;
+		case Opt_allow_utime:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->allow_utime = option & (S_IWGRP | S_IWOTH);
+			break;
+		case Opt_codepage:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			opts->codepage = option;
+			break;
+		case Opt_flush:
+			opts->flush = 1;
+			break;
+		case Opt_time_offset:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (option < -12 * 60 || option > 12 * 60)
+				return -EINVAL;
+			opts->tz_set = 1;
+			opts->time_offset = option;
+			break;
+		case Opt_tz_utc:
+			opts->tz_set = 1;
+			opts->time_offset = 0;
+			break;
+		case Opt_err_cont:
+			opts->errors = FAT_ERRORS_CONT;
+			break;
+		case Opt_err_panic:
+			opts->errors = FAT_ERRORS_PANIC;
+			break;
+		case Opt_err_ro:
+			opts->errors = FAT_ERRORS_RO;
+			break;
+		case Opt_nfs_stale_rw:
+			opts->nfs = FAT_NFS_STALE_RW;
+			break;
+		case Opt_nfs_nostale_ro:
+			opts->nfs = FAT_NFS_NOSTALE_RO;
+			break;
+		case Opt_dos1xfloppy:
+			opts->dos1xfloppy = 1;
+			break;
+
+		/* msdos specific */
+		case Opt_dots:
+			opts->dotsOK = 1;
+			break;
+		case Opt_nodots:
+			opts->dotsOK = 0;
+			break;
+
+		/* vfat specific */
+		case Opt_charset:
+			if (opts->iocharset != fat_default_iocharset)
+				kfree(opts->iocharset);
+			iocharset = match_strdup(&args[0]);
+			if (!iocharset)
+				return -ENOMEM;
+			opts->iocharset = iocharset;
+			break;
+		case Opt_shortname_lower:
+			opts->shortname = VFAT_SFN_DISPLAY_LOWER
+					| VFAT_SFN_CREATE_WIN95;
+			break;
+		case Opt_shortname_win95:
+			opts->shortname = VFAT_SFN_DISPLAY_WIN95
+					| VFAT_SFN_CREATE_WIN95;
+			break;
+		case Opt_shortname_winnt:
+			opts->shortname = VFAT_SFN_DISPLAY_WINNT
+					| VFAT_SFN_CREATE_WINNT;
+			break;
+		case Opt_shortname_mixed:
+			opts->shortname = VFAT_SFN_DISPLAY_WINNT
+					| VFAT_SFN_CREATE_WIN95;
+			break;
+		case Opt_utf8_no:		/* 0 or no or false */
+			opts->utf8 = 0;
+			break;
+		case Opt_utf8_yes:		/* empty or 1 or yes or true */
+			opts->utf8 = 1;
+			break;
+		case Opt_uni_xl_no:		/* 0 or no or false */
+			opts->unicode_xlate = 0;
+			break;
+		case Opt_uni_xl_yes:		/* empty or 1 or yes or true */
+			opts->unicode_xlate = 1;
+			break;
+		case Opt_nonumtail_no:		/* 0 or no or false */
+			opts->numtail = 1;	/* negated option */
+			break;
+		case Opt_nonumtail_yes:		/* empty or 1 or yes or true */
+			opts->numtail = 0;	/* negated option */
+			break;
+		case Opt_rodir:
+			opts->rodir = 1;
+			break;
+		case Opt_discard:
+			opts->discard = 1;
+			break;
+
+		/* obsolete mount options */
+		case Opt_obsolete:
+			fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
+			       "not supported now", p);
+			break;
+		/* unknown option */
+		default:
+			if (!silent) {
+				fat_msg(sb, KERN_ERR,
+				       "Unrecognized mount option \"%s\" "
+				       "or missing value", p);
+			}
+			return -EINVAL;
+		}
+	}
+
+out:
+	/* UTF-8 doesn't provide FAT semantics */
+	if (!strcmp(opts->iocharset, "utf8")) {
+		fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
+		       " for FAT filesystems, filesystem will be "
+		       "case sensitive!");
+	}
+
+	/* If user doesn't specify allow_utime, it's initialized from dmask. */
+	if (opts->allow_utime == (unsigned short)-1)
+		opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
+	if (opts->unicode_xlate)
+		opts->utf8 = 0;
+	if (opts->nfs == FAT_NFS_NOSTALE_RO) {
+		sb->s_flags |= MS_RDONLY;
+		sb->s_export_op = &fat_export_ops_nostale;
+	}
+
+	return 0;
+}
+
+static int fat_read_root(struct inode *inode)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	int error;
+
+	MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO;
+	inode->i_uid = sbi->options.fs_uid;
+	inode->i_gid = sbi->options.fs_gid;
+	inode->i_version++;
+	inode->i_generation = 0;
+	inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
+	inode->i_op = sbi->dir_ops;
+	inode->i_fop = &fat_dir_operations;
+	if (sbi->fat_bits == 32) {
+		MSDOS_I(inode)->i_start = sbi->root_cluster;
+		error = fat_calc_dir_size(inode);
+		if (error < 0)
+			return error;
+	} else {
+		MSDOS_I(inode)->i_start = 0;
+		inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry);
+	}
+	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
+			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
+	MSDOS_I(inode)->i_logstart = 0;
+	MSDOS_I(inode)->mmu_private = inode->i_size;
+
+	fat_save_attrs(inode, ATTR_DIR);
+	inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
+	inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+	set_nlink(inode, fat_subdirs(inode)+2);
+
+	return 0;
+}
+
+static unsigned long calc_fat_clusters(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	/* Divide first to avoid overflow */
+	if (sbi->fat_bits != 12) {
+		unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits;
+		return ent_per_sec * sbi->fat_length;
+	}
+
+	return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+}
+
+static bool fat_bpb_is_zero(struct fat_boot_sector *b)
+{
+	if (get_unaligned_le16(&b->sector_size))
+		return false;
+	if (b->sec_per_clus)
+		return false;
+	if (b->reserved)
+		return false;
+	if (b->fats)
+		return false;
+	if (get_unaligned_le16(&b->dir_entries))
+		return false;
+	if (get_unaligned_le16(&b->sectors))
+		return false;
+	if (b->media)
+		return false;
+	if (b->fat_length)
+		return false;
+	if (b->secs_track)
+		return false;
+	if (b->heads)
+		return false;
+	return true;
+}
+
+static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b,
+	int silent, struct fat_bios_param_block *bpb)
+{
+	int error = -EINVAL;
+
+	/* Read in BPB ... */
+	memset(bpb, 0, sizeof(*bpb));
+	bpb->fat_sector_size = get_unaligned_le16(&b->sector_size);
+	bpb->fat_sec_per_clus = b->sec_per_clus;
+	bpb->fat_reserved = le16_to_cpu(b->reserved);
+	bpb->fat_fats = b->fats;
+	bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries);
+	bpb->fat_sectors = get_unaligned_le16(&b->sectors);
+	bpb->fat_fat_length = le16_to_cpu(b->fat_length);
+	bpb->fat_total_sect = le32_to_cpu(b->total_sect);
+
+	bpb->fat16_state = b->fat16.state;
+	bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id);
+
+	bpb->fat32_length = le32_to_cpu(b->fat32.length);
+	bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster);
+	bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector);
+	bpb->fat32_state = b->fat32.state;
+	bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id);
+
+	/* Validate this looks like a FAT filesystem BPB */
+	if (!bpb->fat_reserved) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR,
+				"bogus number of reserved sectors");
+		goto out;
+	}
+	if (!bpb->fat_fats) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
+		goto out;
+	}
+
+	/*
+	 * Earlier we checked here that b->secs_track and b->head are nonzero,
+	 * but it turns out valid FAT filesystems can have zero there.
+	 */
+
+	if (!fat_valid_media(b->media)) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
+				(unsigned)b->media);
+		goto out;
+	}
+
+	if (!is_power_of_2(bpb->fat_sector_size)
+	    || (bpb->fat_sector_size < 512)
+	    || (bpb->fat_sector_size > 4096)) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
+			       (unsigned)bpb->fat_sector_size);
+		goto out;
+	}
+
+	if (!is_power_of_2(bpb->fat_sec_per_clus)) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
+				(unsigned)bpb->fat_sec_per_clus);
+		goto out;
+	}
+
+	error = 0;
+
+out:
+	return error;
+}
+
+static int fat_read_static_bpb(struct super_block *sb,
+	struct fat_boot_sector *b, int silent,
+	struct fat_bios_param_block *bpb)
+{
+	static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
+
+	struct fat_floppy_defaults *fdefaults = NULL;
+	int error = -EINVAL;
+	sector_t bd_sects;
+	unsigned i;
+
+	bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
+
+	/* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
+	if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR,
+				"%s; no bootstrapping code", notdos1x);
+		goto out;
+	}
+
+	/*
+	 * If any value in this region is non-zero, it isn't archaic
+	 * DOS.
+	 */
+	if (!fat_bpb_is_zero(b)) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR,
+				"%s; DOS 2.x BPB is non-zero", notdos1x);
+		goto out;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) {
+		if (floppy_defaults[i].nr_sectors == bd_sects) {
+			fdefaults = &floppy_defaults[i];
+			break;
+		}
+	}
+
+	if (fdefaults == NULL) {
+		if (!silent)
+			fat_msg(sb, KERN_WARNING,
+				"This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)",
+				(u64)bd_sects);
+		goto out;
+	}
+
+	if (!silent)
+		fat_msg(sb, KERN_INFO,
+			"This looks like a DOS 1.x volume; assuming default BPB values");
+
+	memset(bpb, 0, sizeof(*bpb));
+	bpb->fat_sector_size = SECTOR_SIZE;
+	bpb->fat_sec_per_clus = fdefaults->sec_per_clus;
+	bpb->fat_reserved = 1;
+	bpb->fat_fats = 2;
+	bpb->fat_dir_entries = fdefaults->dir_entries;
+	bpb->fat_sectors = fdefaults->nr_sectors;
+	bpb->fat_fat_length = fdefaults->fat_length;
+
+	error = 0;
+
+out:
+	return error;
+}
+
+/*
+ * Read the super block of an MS-DOS FS.
+ */
+int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
+		   void (*setup)(struct super_block *))
+{
+	struct inode *root_inode = NULL, *fat_inode = NULL;
+	struct inode *fsinfo_inode = NULL;
+	struct buffer_head *bh;
+	struct fat_bios_param_block bpb;
+	struct msdos_sb_info *sbi;
+	u16 logical_sector_size;
+	u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
+	int debug;
+	long error;
+	char buf[50];
+
+	/*
+	 * GFP_KERNEL is ok here, because while we do hold the
+	 * supeblock lock, memory pressure can't call back into
+	 * the filesystem, since we're only just about to mount
+	 * it and have no inodes etc active!
+	 */
+	sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	sb->s_flags |= MS_NODIRATIME;
+	sb->s_magic = MSDOS_SUPER_MAGIC;
+	sb->s_op = &fat_sops;
+	sb->s_export_op = &fat_export_ops;
+	mutex_init(&sbi->nfs_build_inode_lock);
+	ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+			     DEFAULT_RATELIMIT_BURST);
+
+	error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
+	if (error)
+		goto out_fail;
+
+	setup(sb); /* flavour-specific stuff that needs options */
+
+	error = -EIO;
+	sb_min_blocksize(sb, 512);
+	bh = sb_bread(sb, 0);
+	if (bh == NULL) {
+		fat_msg(sb, KERN_ERR, "unable to read boot sector");
+		goto out_fail;
+	}
+
+	error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent,
+		&bpb);
+	if (error == -EINVAL && sbi->options.dos1xfloppy)
+		error = fat_read_static_bpb(sb,
+			(struct fat_boot_sector *)bh->b_data, silent, &bpb);
+	brelse(bh);
+
+	if (error == -EINVAL)
+		goto out_invalid;
+	else if (error)
+		goto out_fail;
+
+	logical_sector_size = bpb.fat_sector_size;
+	sbi->sec_per_clus = bpb.fat_sec_per_clus;
+
+	error = -EIO;
+	if (logical_sector_size < sb->s_blocksize) {
+		fat_msg(sb, KERN_ERR, "logical sector size too small for device"
+		       " (logical sector size = %u)", logical_sector_size);
+		goto out_fail;
+	}
+
+	if (logical_sector_size > sb->s_blocksize) {
+		struct buffer_head *bh_resize;
+
+		if (!sb_set_blocksize(sb, logical_sector_size)) {
+			fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
+			       logical_sector_size);
+			goto out_fail;
+		}
+
+		/* Verify that the larger boot sector is fully readable */
+		bh_resize = sb_bread(sb, 0);
+		if (bh_resize == NULL) {
+			fat_msg(sb, KERN_ERR, "unable to read boot sector"
+			       " (logical sector size = %lu)",
+			       sb->s_blocksize);
+			goto out_fail;
+		}
+		brelse(bh_resize);
+	}
+
+	mutex_init(&sbi->s_lock);
+	sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
+	sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
+	sbi->fats = bpb.fat_fats;
+	sbi->fat_bits = 0;		/* Don't know yet */
+	sbi->fat_start = bpb.fat_reserved;
+	sbi->fat_length = bpb.fat_fat_length;
+	sbi->root_cluster = 0;
+	sbi->free_clusters = -1;	/* Don't know yet */
+	sbi->free_clus_valid = 0;
+	sbi->prev_free = FAT_START_ENT;
+	sb->s_maxbytes = 0xffffffff;
+
+	if (!sbi->fat_length && bpb.fat32_length) {
+		struct fat_boot_fsinfo *fsinfo;
+		struct buffer_head *fsinfo_bh;
+
+		/* Must be FAT32 */
+		sbi->fat_bits = 32;
+		sbi->fat_length = bpb.fat32_length;
+		sbi->root_cluster = bpb.fat32_root_cluster;
+
+		/* MC - if info_sector is 0, don't multiply by 0 */
+		sbi->fsinfo_sector = bpb.fat32_info_sector;
+		if (sbi->fsinfo_sector == 0)
+			sbi->fsinfo_sector = 1;
+
+		fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector);
+		if (fsinfo_bh == NULL) {
+			fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
+			       " (sector = %lu)", sbi->fsinfo_sector);
+			goto out_fail;
+		}
+
+		fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data;
+		if (!IS_FSINFO(fsinfo)) {
+			fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: "
+			       "0x%08x, 0x%08x (sector = %lu)",
+			       le32_to_cpu(fsinfo->signature1),
+			       le32_to_cpu(fsinfo->signature2),
+			       sbi->fsinfo_sector);
+		} else {
+			if (sbi->options.usefree)
+				sbi->free_clus_valid = 1;
+			sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters);
+			sbi->prev_free = le32_to_cpu(fsinfo->next_cluster);
+		}
+
+		brelse(fsinfo_bh);
+	}
+
+	/* interpret volume ID as a little endian 32 bit integer */
+	if (sbi->fat_bits == 32)
+		sbi->vol_id = bpb.fat32_vol_id;
+	else /* fat 16 or 12 */
+		sbi->vol_id = bpb.fat16_vol_id;
+
+	sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
+	sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
+
+	sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length;
+	sbi->dir_entries = bpb.fat_dir_entries;
+	if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
+			       " (%u)", sbi->dir_entries);
+		goto out_invalid;
+	}
+
+	rootdir_sectors = sbi->dir_entries
+		* sizeof(struct msdos_dir_entry) / sb->s_blocksize;
+	sbi->data_start = sbi->dir_start + rootdir_sectors;
+	total_sectors = bpb.fat_sectors;
+	if (total_sectors == 0)
+		total_sectors = bpb.fat_total_sect;
+
+	total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus;
+
+	if (sbi->fat_bits != 32)
+		sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12;
+
+	/* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
+	if (sbi->fat_bits == 32)
+		sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY;
+	else /* fat 16 or 12 */
+		sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY;
+
+	/* check that FAT table does not overflow */
+	fat_clusters = calc_fat_clusters(sb);
+	total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
+	if (total_clusters > MAX_FAT(sb)) {
+		if (!silent)
+			fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
+			       total_clusters);
+		goto out_invalid;
+	}
+
+	sbi->max_cluster = total_clusters + FAT_START_ENT;
+	/* check the free_clusters, it's not necessarily correct */
+	if (sbi->free_clusters != -1 && sbi->free_clusters > total_clusters)
+		sbi->free_clusters = -1;
+	/* check the prev_free, it's not necessarily correct */
+	sbi->prev_free %= sbi->max_cluster;
+	if (sbi->prev_free < FAT_START_ENT)
+		sbi->prev_free = FAT_START_ENT;
+
+	/* set up enough so that it can read an inode */
+	fat_hash_init(sb);
+	dir_hash_init(sb);
+	fat_ent_access_init(sb);
+
+	/*
+	 * The low byte of FAT's first entry must have same value with
+	 * media-field.  But in real world, too many devices is
+	 * writing wrong value.  So, removed that validity check.
+	 *
+	 * if (FAT_FIRST_ENT(sb, media) != first)
+	 */
+
+	error = -EINVAL;
+	sprintf(buf, "cp%d", sbi->options.codepage);
+	sbi->nls_disk = load_nls(buf);
+	if (!sbi->nls_disk) {
+		fat_msg(sb, KERN_ERR, "codepage %s not found", buf);
+		goto out_fail;
+	}
+
+	/* FIXME: utf8 is using iocharset for upper/lower conversion */
+	if (sbi->options.isvfat) {
+		sbi->nls_io = load_nls(sbi->options.iocharset);
+		if (!sbi->nls_io) {
+			fat_msg(sb, KERN_ERR, "IO charset %s not found",
+			       sbi->options.iocharset);
+			goto out_fail;
+		}
+	}
+
+	error = -ENOMEM;
+	fat_inode = new_inode(sb);
+	if (!fat_inode)
+		goto out_fail;
+	MSDOS_I(fat_inode)->i_pos = 0;
+	sbi->fat_inode = fat_inode;
+
+	fsinfo_inode = new_inode(sb);
+	if (!fsinfo_inode)
+		goto out_fail;
+	fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
+	sbi->fsinfo_inode = fsinfo_inode;
+	insert_inode_hash(fsinfo_inode);
+
+	root_inode = new_inode(sb);
+	if (!root_inode)
+		goto out_fail;
+	root_inode->i_ino = MSDOS_ROOT_INO;
+	root_inode->i_version = 1;
+	error = fat_read_root(root_inode);
+	if (error < 0) {
+		iput(root_inode);
+		goto out_fail;
+	}
+	error = -ENOMEM;
+	insert_inode_hash(root_inode);
+	fat_attach(root_inode, 0);
+	sb->s_root = d_make_root(root_inode);
+	if (!sb->s_root) {
+		fat_msg(sb, KERN_ERR, "get root inode failed");
+		goto out_fail;
+	}
+
+	if (sbi->options.discard) {
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		if (!blk_queue_discard(q))
+			fat_msg(sb, KERN_WARNING,
+					"mounting with \"discard\" option, but "
+					"the device does not support discard");
+	}
+
+	fat_set_state(sb, 1, 0);
+	return 0;
+
+out_invalid:
+	error = -EINVAL;
+	if (!silent)
+		fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");
+
+out_fail:
+	if (fsinfo_inode)
+		iput(fsinfo_inode);
+	if (fat_inode)
+		iput(fat_inode);
+	unload_nls(sbi->nls_io);
+	unload_nls(sbi->nls_disk);
+	if (sbi->options.iocharset != fat_default_iocharset)
+		kfree(sbi->options.iocharset);
+	sb->s_fs_info = NULL;
+	kfree(sbi);
+	return error;
+}
+
+EXPORT_SYMBOL_GPL(fat_fill_super);
+
+/*
+ * helper function for fat_flush_inodes.  This writes both the inode
+ * and the file data blocks, waiting for in flight data blocks before
+ * the start of the call.  It does not wait for any io started
+ * during the call
+ */
+static int writeback_inode(struct inode *inode)
+{
+
+	int ret;
+
+	/* if we used wait=1, sync_inode_metadata waits for the io for the
+	* inode to finish.  So wait=0 is sent down to sync_inode_metadata
+	* and filemap_fdatawrite is used for the data blocks
+	*/
+	ret = sync_inode_metadata(inode, 0);
+	if (!ret)
+		ret = filemap_fdatawrite(inode->i_mapping);
+	return ret;
+}
+
+/*
+ * write data and metadata corresponding to i1 and i2.  The io is
+ * started but we do not wait for any of it to finish.
+ *
+ * filemap_flush is used for the block device, so if there is a dirty
+ * page for a block already in flight, we will not wait and start the
+ * io over again
+ */
+int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
+{
+	int ret = 0;
+	if (!MSDOS_SB(sb)->options.flush)
+		return 0;
+	if (i1)
+		ret = writeback_inode(i1);
+	if (!ret && i2)
+		ret = writeback_inode(i2);
+	if (!ret) {
+		struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+		ret = filemap_flush(mapping);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(fat_flush_inodes);
+
+static int __init init_fat_fs(void)
+{
+	int err;
+
+	err = fat_cache_init();
+	if (err)
+		return err;
+
+	err = fat_init_inodecache();
+	if (err)
+		goto failed;
+
+	return 0;
+
+failed:
+	fat_cache_destroy();
+	return err;
+}
+
+static void __exit exit_fat_fs(void)
+{
+	fat_cache_destroy();
+	fat_destroy_inodecache();
+}
+
+module_init(init_fat_fs)
+module_exit(exit_fat_fs)
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/fat/misc.c b/kernel/fs/fat/misc.c
new file mode 100644
index 000000000..c4589e981
--- /dev/null
+++ b/kernel/fs/fat/misc.c
@@ -0,0 +1,278 @@
+/*
+ *  linux/fs/fat/misc.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
+ *		 and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
+ */
+
+#include "fat.h"
+
+/*
+ * fat_fs_error reports a file system problem that might indicate fa data
+ * corruption/inconsistency. Depending on 'errors' mount option the
+ * panic() is called, or error message is printed FAT and nothing is done,
+ * or filesystem is remounted read-only (default behavior).
+ * In case the file system is remounted read-only, it can be made writable
+ * again by remounting it.
+ */
+void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
+{
+	struct fat_mount_options *opts = &MSDOS_SB(sb)->options;
+	va_list args;
+	struct va_format vaf;
+
+	if (report) {
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
+		va_end(args);
+	}
+
+	if (opts->errors == FAT_ERRORS_PANIC)
+		panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
+	else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
+		sb->s_flags |= MS_RDONLY;
+		fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
+	}
+}
+EXPORT_SYMBOL_GPL(__fat_fs_error);
+
+/**
+ * fat_msg() - print preformated FAT specific messages. Every thing what is
+ * not fat_fs_error() should be fat_msg().
+ */
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
+	va_end(args);
+}
+
+/* Flushes the number of free clusters on FAT32 */
+/* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
+int fat_clusters_flush(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bh;
+	struct fat_boot_fsinfo *fsinfo;
+
+	if (sbi->fat_bits != 32)
+		return 0;
+
+	bh = sb_bread(sb, sbi->fsinfo_sector);
+	if (bh == NULL) {
+		fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush");
+		return -EIO;
+	}
+
+	fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
+	/* Sanity check */
+	if (!IS_FSINFO(fsinfo)) {
+		fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: "
+		       "0x%08x, 0x%08x (sector = %lu)",
+		       le32_to_cpu(fsinfo->signature1),
+		       le32_to_cpu(fsinfo->signature2),
+		       sbi->fsinfo_sector);
+	} else {
+		if (sbi->free_clusters != -1)
+			fsinfo->free_clusters = cpu_to_le32(sbi->free_clusters);
+		if (sbi->prev_free != -1)
+			fsinfo->next_cluster = cpu_to_le32(sbi->prev_free);
+		mark_buffer_dirty(bh);
+	}
+	brelse(bh);
+
+	return 0;
+}
+
+/*
+ * fat_chain_add() adds a new cluster to the chain of clusters represented
+ * by inode.
+ */
+int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int ret, new_fclus, last;
+
+	/*
+	 * We must locate the last cluster of the file to add this new
+	 * one (new_dclus) to the end of the link list (the FAT).
+	 */
+	last = new_fclus = 0;
+	if (MSDOS_I(inode)->i_start) {
+		int fclus, dclus;
+
+		ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus);
+		if (ret < 0)
+			return ret;
+		new_fclus = fclus + 1;
+		last = dclus;
+	}
+
+	/* add new one to the last of the cluster chain */
+	if (last) {
+		struct fat_entry fatent;
+
+		fatent_init(&fatent);
+		ret = fat_ent_read(inode, &fatent, last);
+		if (ret >= 0) {
+			int wait = inode_needs_sync(inode);
+			ret = fat_ent_write(inode, &fatent, new_dclus, wait);
+			fatent_brelse(&fatent);
+		}
+		if (ret < 0)
+			return ret;
+		/*
+		 * FIXME:Although we can add this cache, fat_cache_add() is
+		 * assuming to be called after linear search with fat_cache_id.
+		 */
+//		fat_cache_add(inode, new_fclus, new_dclus);
+	} else {
+		MSDOS_I(inode)->i_start = new_dclus;
+		MSDOS_I(inode)->i_logstart = new_dclus;
+		/*
+		 * Since generic_write_sync() synchronizes regular files later,
+		 * we sync here only directories.
+		 */
+		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
+			ret = fat_sync_inode(inode);
+			if (ret)
+				return ret;
+		} else
+			mark_inode_dirty(inode);
+	}
+	if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
+		fat_fs_error(sb, "clusters badly computed (%d != %llu)",
+			     new_fclus,
+			     (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
+		fat_cache_inval_inode(inode);
+	}
+	inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9);
+
+	return 0;
+}
+
+/*
+ * The epoch of FAT timestamp is 1980.
+ *     :  bits :     value
+ * date:  0 -  4: day	(1 -  31)
+ * date:  5 -  8: month	(1 -  12)
+ * date:  9 - 15: year	(0 - 127) from 1980
+ * time:  0 -  4: sec	(0 -  29) 2sec counts
+ * time:  5 - 10: min	(0 -  59)
+ * time: 11 - 15: hour	(0 -  23)
+ */
+#define SECS_PER_MIN	60
+#define SECS_PER_HOUR	(60 * 60)
+#define SECS_PER_DAY	(SECS_PER_HOUR * 24)
+/* days between 1.1.70 and 1.1.80 (2 leap days) */
+#define DAYS_DELTA	(365 * 10 + 2)
+/* 120 (2100 - 1980) isn't leap year */
+#define YEAR_2100	120
+#define IS_LEAP_YEAR(y)	(!((y) & 3) && (y) != YEAR_2100)
+
+/* Linear day numbers of the respective 1sts in non-leap years. */
+static time_t days_in_year[] = {
+	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
+};
+
+/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
+void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+		       __le16 __time, __le16 __date, u8 time_cs)
+{
+	u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date);
+	time_t second, day, leap_day, month, year;
+
+	year  = date >> 9;
+	month = max(1, (date >> 5) & 0xf);
+	day   = max(1, date & 0x1f) - 1;
+
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	if (IS_LEAP_YEAR(year) && month > 2)
+		leap_day++;
+
+	second =  (time & 0x1f) << 1;
+	second += ((time >> 5) & 0x3f) * SECS_PER_MIN;
+	second += (time >> 11) * SECS_PER_HOUR;
+	second += (year * 365 + leap_day
+		   + days_in_year[month] + day
+		   + DAYS_DELTA) * SECS_PER_DAY;
+
+	if (!sbi->options.tz_set)
+		second += sys_tz.tz_minuteswest * SECS_PER_MIN;
+	else
+		second -= sbi->options.time_offset * SECS_PER_MIN;
+
+	if (time_cs) {
+		ts->tv_sec = second + (time_cs / 100);
+		ts->tv_nsec = (time_cs % 100) * 10000000;
+	} else {
+		ts->tv_sec = second;
+		ts->tv_nsec = 0;
+	}
+}
+
+/* Convert linear UNIX date to a FAT time/date pair. */
+void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+		       __le16 *time, __le16 *date, u8 *time_cs)
+{
+	struct tm tm;
+	time_to_tm(ts->tv_sec,
+		   (sbi->options.tz_set ? sbi->options.time_offset :
+		   -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
+
+	/*  FAT can only support year between 1980 to 2107 */
+	if (tm.tm_year < 1980 - 1900) {
+		*time = 0;
+		*date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
+		if (time_cs)
+			*time_cs = 0;
+		return;
+	}
+	if (tm.tm_year > 2107 - 1900) {
+		*time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
+		*date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
+		if (time_cs)
+			*time_cs = 199;
+		return;
+	}
+
+	/* from 1900 -> from 1980 */
+	tm.tm_year -= 80;
+	/* 0~11 -> 1~12 */
+	tm.tm_mon++;
+	/* 0~59 -> 0~29(2sec counts) */
+	tm.tm_sec >>= 1;
+
+	*time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec);
+	*date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday);
+	if (time_cs)
+		*time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
+}
+EXPORT_SYMBOL_GPL(fat_time_unix2fat);
+
+int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
+{
+	int i, err = 0;
+
+	for (i = 0; i < nr_bhs; i++)
+		write_dirty_buffer(bhs[i], WRITE);
+
+	for (i = 0; i < nr_bhs; i++) {
+		wait_on_buffer(bhs[i]);
+		if (!err && !buffer_uptodate(bhs[i]))
+			err = -EIO;
+	}
+	return err;
+}
diff --git a/kernel/fs/fat/namei_msdos.c b/kernel/fs/fat/namei_msdos.c
new file mode 100644
index 000000000..b7e2b33aa
--- /dev/null
+++ b/kernel/fs/fat/namei_msdos.c
@@ -0,0 +1,684 @@
+/*
+ *  linux/fs/msdos/namei.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
+ *  Rewritten for constant inumbers 1999 by Al Viro
+ */
+
+#include <linux/module.h>
+#include "fat.h"
+
+/* Characters that are undesirable in an MS-DOS file name */
+static unsigned char bad_chars[] = "*?<>|\"";
+static unsigned char bad_if_strict[] = "+=,; ";
+
+/***** Formats an MS-DOS file name. Rejects invalid names. */
+static int msdos_format_name(const unsigned char *name, int len,
+			     unsigned char *res, struct fat_mount_options *opts)
+	/*
+	 * name is the proposed name, len is its length, res is
+	 * the resulting name, opts->name_check is either (r)elaxed,
+	 * (n)ormal or (s)trict, opts->dotsOK allows dots at the
+	 * beginning of name (for hidden files)
+	 */
+{
+	unsigned char *walk;
+	unsigned char c;
+	int space;
+
+	if (name[0] == '.') {	/* dotfile because . and .. already done */
+		if (opts->dotsOK) {
+			/* Get rid of dot - test for it elsewhere */
+			name++;
+			len--;
+		} else
+			return -EINVAL;
+	}
+	/*
+	 * disallow names that _really_ start with a dot
+	 */
+	space = 1;
+	c = 0;
+	for (walk = res; len && walk - res < 8; walk++) {
+		c = *name++;
+		len--;
+		if (opts->name_check != 'r' && strchr(bad_chars, c))
+			return -EINVAL;
+		if (opts->name_check == 's' && strchr(bad_if_strict, c))
+			return -EINVAL;
+		if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
+			return -EINVAL;
+		if (c < ' ' || c == ':' || c == '\\')
+			return -EINVAL;
+	/*
+	 * 0xE5 is legal as a first character, but we must substitute
+	 * 0x05 because 0xE5 marks deleted files.  Yes, DOS really
+	 * does this.
+	 * It seems that Microsoft hacked DOS to support non-US
+	 * characters after the 0xE5 character was already in use to
+	 * mark deleted files.
+	 */
+		if ((res == walk) && (c == 0xE5))
+			c = 0x05;
+		if (c == '.')
+			break;
+		space = (c == ' ');
+		*walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c;
+	}
+	if (space)
+		return -EINVAL;
+	if (opts->name_check == 's' && len && c != '.') {
+		c = *name++;
+		len--;
+		if (c != '.')
+			return -EINVAL;
+	}
+	while (c != '.' && len--)
+		c = *name++;
+	if (c == '.') {
+		while (walk - res < 8)
+			*walk++ = ' ';
+		while (len > 0 && walk - res < MSDOS_NAME) {
+			c = *name++;
+			len--;
+			if (opts->name_check != 'r' && strchr(bad_chars, c))
+				return -EINVAL;
+			if (opts->name_check == 's' &&
+			    strchr(bad_if_strict, c))
+				return -EINVAL;
+			if (c < ' ' || c == ':' || c == '\\')
+				return -EINVAL;
+			if (c == '.') {
+				if (opts->name_check == 's')
+					return -EINVAL;
+				break;
+			}
+			if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
+				return -EINVAL;
+			space = c == ' ';
+			if (!opts->nocase && c >= 'a' && c <= 'z')
+				*walk++ = c - 32;
+			else
+				*walk++ = c;
+		}
+		if (space)
+			return -EINVAL;
+		if (opts->name_check == 's' && len)
+			return -EINVAL;
+	}
+	while (walk - res < MSDOS_NAME)
+		*walk++ = ' ';
+
+	return 0;
+}
+
+/***** Locates a directory entry.  Uses unformatted name. */
+static int msdos_find(struct inode *dir, const unsigned char *name, int len,
+		      struct fat_slot_info *sinfo)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	unsigned char msdos_name[MSDOS_NAME];
+	int err;
+
+	err = msdos_format_name(name, len, msdos_name, &sbi->options);
+	if (err)
+		return -ENOENT;
+
+	err = fat_scan(dir, msdos_name, sinfo);
+	if (!err && sbi->options.dotsOK) {
+		if (name[0] == '.') {
+			if (!(sinfo->de->attr & ATTR_HIDDEN))
+				err = -ENOENT;
+		} else {
+			if (sinfo->de->attr & ATTR_HIDDEN)
+				err = -ENOENT;
+		}
+		if (err)
+			brelse(sinfo->bh);
+	}
+	return err;
+}
+
+/*
+ * Compute the hash for the msdos name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The msdos fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+	unsigned char msdos_name[MSDOS_NAME];
+	int error;
+
+	error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
+	if (!error)
+		qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
+	return 0;
+}
+
+/*
+ * Compare two msdos names. If either of the names are invalid,
+ * we fall back to doing the standard name comparison.
+ */
+static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
+	unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
+	int error;
+
+	error = msdos_format_name(name->name, name->len, a_msdos_name, options);
+	if (error)
+		goto old_compare;
+	error = msdos_format_name(str, len, b_msdos_name, options);
+	if (error)
+		goto old_compare;
+	error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
+out:
+	return error;
+
+old_compare:
+	error = 1;
+	if (name->len == len)
+		error = memcmp(name->name, str, len);
+	goto out;
+}
+
+static const struct dentry_operations msdos_dentry_operations = {
+	.d_hash		= msdos_hash,
+	.d_compare	= msdos_cmp,
+};
+
+/*
+ * AV. Wrappers for FAT sb operations. Is it wise?
+ */
+
+/***** Get inode using directory and name */
+static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode;
+	int err;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	switch (err) {
+	case -ENOENT:
+		inode = NULL;
+		break;
+	case 0:
+		inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+		brelse(sinfo.bh);
+		break;
+	default:
+		inode = ERR_PTR(err);
+	}
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	return d_splice_alias(inode, dentry);
+}
+
+/***** Creates a directory entry (name is already formatted). */
+static int msdos_add_entry(struct inode *dir, const unsigned char *name,
+			   int is_dir, int is_hid, int cluster,
+			   struct timespec *ts, struct fat_slot_info *sinfo)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	struct msdos_dir_entry de;
+	__le16 time, date;
+	int err;
+
+	memcpy(de.name, name, MSDOS_NAME);
+	de.attr = is_dir ? ATTR_DIR : ATTR_ARCH;
+	if (is_hid)
+		de.attr |= ATTR_HIDDEN;
+	de.lcase = 0;
+	fat_time_unix2fat(sbi, ts, &time, &date, NULL);
+	de.cdate = de.adate = 0;
+	de.ctime = 0;
+	de.ctime_cs = 0;
+	de.time = time;
+	de.date = date;
+	fat_set_start(&de, cluster);
+	de.size = 0;
+
+	err = fat_add_entries(dir, &de, 1, sinfo);
+	if (err)
+		return err;
+
+	dir->i_ctime = dir->i_mtime = *ts;
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	return 0;
+}
+
+/***** Create a file */
+static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			bool excl)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = NULL;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	unsigned char msdos_name[MSDOS_NAME];
+	int err, is_hid;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
+				msdos_name, &MSDOS_SB(sb)->options);
+	if (err)
+		goto out;
+	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
+	/* Have to do it due to foo vs. .foo conflicts */
+	if (!fat_scan(dir, msdos_name, &sinfo)) {
+		brelse(sinfo.bh);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ts = CURRENT_TIME_SEC;
+	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
+	if (err)
+		goto out;
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+	return err;
+}
+
+/***** Remove a directory */
+static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = d_inode(dentry);
+	struct fat_slot_info sinfo;
+	int err;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+	/*
+	 * Check whether the directory is not in use, then check
+	 * whether it is empty.
+	 */
+	err = fat_dir_empty(inode);
+	if (err)
+		goto out;
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+
+	return err;
+}
+
+/***** Make a directory */
+static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode;
+	unsigned char msdos_name[MSDOS_NAME];
+	struct timespec ts;
+	int err, is_hid, cluster;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
+				msdos_name, &MSDOS_SB(sb)->options);
+	if (err)
+		goto out;
+	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
+	/* foo vs .foo situation */
+	if (!fat_scan(dir, msdos_name, &sinfo)) {
+		brelse(sinfo.bh);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ts = CURRENT_TIME_SEC;
+	cluster = fat_alloc_new_dir(dir, &ts);
+	if (cluster < 0) {
+		err = cluster;
+		goto out;
+	}
+	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
+	if (err)
+		goto out_free;
+	inc_nlink(dir);
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		/* the directory was completed, just return a error */
+		goto out;
+	}
+	set_nlink(inode, 2);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	fat_flush_inodes(sb, dir, inode);
+	return 0;
+
+out_free:
+	fat_free_clusters(dir, cluster);
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	return err;
+}
+
+/***** Unlink a file */
+static int msdos_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct super_block *sb = inode->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+
+	return err;
+}
+
+static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
+			   struct dentry *old_dentry,
+			   struct inode *new_dir, unsigned char *new_name,
+			   struct dentry *new_dentry, int is_hid)
+{
+	struct buffer_head *dotdot_bh;
+	struct msdos_dir_entry *dotdot_de;
+	struct inode *old_inode, *new_inode;
+	struct fat_slot_info old_sinfo, sinfo;
+	struct timespec ts;
+	loff_t new_i_pos;
+	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
+
+	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
+	old_inode = d_inode(old_dentry);
+	new_inode = d_inode(new_dentry);
+
+	err = fat_scan(old_dir, old_name, &old_sinfo);
+	if (err) {
+		err = -EIO;
+		goto out;
+	}
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+	update_dotdot = (is_dir && old_dir != new_dir);
+	if (update_dotdot) {
+		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	old_attrs = MSDOS_I(old_inode)->i_attrs;
+	err = fat_scan(new_dir, new_name, &sinfo);
+	if (!err) {
+		if (!new_inode) {
+			/* "foo" -> ".foo" case. just change the ATTR_HIDDEN */
+			if (sinfo.de != old_sinfo.de) {
+				err = -EINVAL;
+				goto out;
+			}
+			if (is_hid)
+				MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
+			else
+				MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
+			if (IS_DIRSYNC(old_dir)) {
+				err = fat_sync_inode(old_inode);
+				if (err) {
+					MSDOS_I(old_inode)->i_attrs = old_attrs;
+					goto out;
+				}
+			} else
+				mark_inode_dirty(old_inode);
+
+			old_dir->i_version++;
+			old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+			if (IS_DIRSYNC(old_dir))
+				(void)fat_sync_inode(old_dir);
+			else
+				mark_inode_dirty(old_dir);
+			goto out;
+		}
+	}
+
+	ts = CURRENT_TIME_SEC;
+	if (new_inode) {
+		if (err)
+			goto out;
+		if (is_dir) {
+			err = fat_dir_empty(new_inode);
+			if (err)
+				goto out;
+		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
+		fat_detach(new_inode);
+	} else {
+		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
+				      &ts, &sinfo);
+		if (err)
+			goto out;
+		new_i_pos = sinfo.i_pos;
+	}
+	new_dir->i_version++;
+
+	fat_detach(old_inode);
+	fat_attach(old_inode, new_i_pos);
+	if (is_hid)
+		MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
+	else
+		MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
+	if (IS_DIRSYNC(new_dir)) {
+		err = fat_sync_inode(old_inode);
+		if (err)
+			goto error_inode;
+	} else
+		mark_inode_dirty(old_inode);
+
+	if (update_dotdot) {
+		fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
+		if (IS_DIRSYNC(new_dir)) {
+			err = sync_dirty_buffer(dotdot_bh);
+			if (err)
+				goto error_dotdot;
+		}
+		drop_nlink(old_dir);
+		if (!new_inode)
+			inc_nlink(new_dir);
+	}
+
+	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
+	old_sinfo.bh = NULL;
+	if (err)
+		goto error_dotdot;
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void)fat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		if (is_dir)
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+out:
+	brelse(sinfo.bh);
+	brelse(dotdot_bh);
+	brelse(old_sinfo.bh);
+	return err;
+
+error_dotdot:
+	/* data cluster is shared, serious corruption */
+	corrupt = 1;
+
+	if (update_dotdot) {
+		fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
+		corrupt |= sync_dirty_buffer(dotdot_bh);
+	}
+error_inode:
+	fat_detach(old_inode);
+	fat_attach(old_inode, old_sinfo.i_pos);
+	MSDOS_I(old_inode)->i_attrs = old_attrs;
+	if (new_inode) {
+		fat_attach(new_inode, new_i_pos);
+		if (corrupt)
+			corrupt |= fat_sync_inode(new_inode);
+	} else {
+		/*
+		 * If new entry was not sharing the data cluster, it
+		 * shouldn't be serious corruption.
+		 */
+		int err2 = fat_remove_entries(new_dir, &sinfo);
+		if (corrupt)
+			corrupt |= err2;
+		sinfo.bh = NULL;
+	}
+	if (corrupt < 0) {
+		fat_fs_error(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld)",
+			     __func__, sinfo.i_pos);
+	}
+	goto out;
+}
+
+/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
+static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct super_block *sb = old_dir->i_sb;
+	unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
+	int err, is_hid;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	err = msdos_format_name(old_dentry->d_name.name,
+				old_dentry->d_name.len, old_msdos_name,
+				&MSDOS_SB(old_dir->i_sb)->options);
+	if (err)
+		goto out;
+	err = msdos_format_name(new_dentry->d_name.name,
+				new_dentry->d_name.len, new_msdos_name,
+				&MSDOS_SB(new_dir->i_sb)->options);
+	if (err)
+		goto out;
+
+	is_hid =
+	     (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.');
+
+	err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
+			      new_dir, new_msdos_name, new_dentry, is_hid);
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	if (!err)
+		err = fat_flush_inodes(sb, old_dir, new_dir);
+	return err;
+}
+
+static const struct inode_operations msdos_dir_inode_operations = {
+	.create		= msdos_create,
+	.lookup		= msdos_lookup,
+	.unlink		= msdos_unlink,
+	.mkdir		= msdos_mkdir,
+	.rmdir		= msdos_rmdir,
+	.rename		= msdos_rename,
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
+
+static void setup(struct super_block *sb)
+{
+	MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
+	sb->s_d_op = &msdos_dentry_operations;
+	sb->s_flags |= MS_NOATIME;
+}
+
+static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+{
+	return fat_fill_super(sb, data, silent, 0, setup);
+}
+
+static struct dentry *msdos_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
+}
+
+static struct file_system_type msdos_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "msdos",
+	.mount		= msdos_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("msdos");
+
+static int __init init_msdos_fs(void)
+{
+	return register_filesystem(&msdos_fs_type);
+}
+
+static void __exit exit_msdos_fs(void)
+{
+	unregister_filesystem(&msdos_fs_type);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Werner Almesberger");
+MODULE_DESCRIPTION("MS-DOS filesystem support");
+
+module_init(init_msdos_fs)
+module_exit(exit_msdos_fs)
diff --git a/kernel/fs/fat/namei_vfat.c b/kernel/fs/fat/namei_vfat.c
new file mode 100644
index 000000000..7092584f4
--- /dev/null
+++ b/kernel/fs/fat/namei_vfat.c
@@ -0,0 +1,1089 @@
+/*
+ *  linux/fs/vfat/namei.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  Windows95/Windows NT compatible extended MSDOS filesystem
+ *    by Gordon Chaffee Copyright (C) 1995.  Send bug reports for the
+ *    VFAT filesystem to <chaffee@cs.berkeley.edu>.  Specify
+ *    what file operation caused you trouble and if you can duplicate
+ *    the problem, send a script that demonstrates it.
+ *
+ *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
+ *
+ *  Support Multibyte characters and cleanup by
+ *				OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include "fat.h"
+
+/*
+ * If new entry was created in the parent, it could create the 8.3
+ * alias (the shortname of logname).  So, the parent may have the
+ * negative-dentry which matches the created 8.3 alias.
+ *
+ * If it happened, the negative dentry isn't actually negative
+ * anymore.  So, drop it.
+ */
+static int vfat_revalidate_shortname(struct dentry *dentry)
+{
+	int ret = 1;
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_time != d_inode(dentry->d_parent)->i_version)
+		ret = 0;
+	spin_unlock(&dentry->d_lock);
+	return ret;
+}
+
+static int vfat_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	/* This is not negative dentry. Always valid. */
+	if (d_really_is_positive(dentry))
+		return 1;
+	return vfat_revalidate_shortname(dentry);
+}
+
+static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags)
+{
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	/*
+	 * This is not negative dentry. Always valid.
+	 *
+	 * Note, rename() to existing directory entry will have ->d_inode,
+	 * and will use existing name which isn't specified name by user.
+	 *
+	 * We may be able to drop this positive dentry here. But dropping
+	 * positive dentry isn't good idea. So it's unsupported like
+	 * rename("filename", "FILENAME") for now.
+	 */
+	if (d_really_is_positive(dentry))
+		return 1;
+
+	/*
+	 * This may be nfsd (or something), anyway, we can't see the
+	 * intent of this. So, since this can be for creation, drop it.
+	 */
+	if (!flags)
+		return 0;
+
+	/*
+	 * Drop the negative dentry, in order to make sure to use the
+	 * case sensitive name which is specified by user if this is
+	 * for creation.
+	 */
+	if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+		return 0;
+
+	return vfat_revalidate_shortname(dentry);
+}
+
+/* returns the length of a struct qstr, ignoring trailing dots */
+static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
+{
+	while (len && name[len - 1] == '.')
+		len--;
+	return len;
+}
+
+static unsigned int vfat_striptail_len(const struct qstr *qstr)
+{
+	return __vfat_striptail_len(qstr->len, qstr->name);
+}
+
+/*
+ * Compute the hash for the vfat name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The vfat fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
+	return 0;
+}
+
+/*
+ * Compute the hash for the vfat name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The vfat fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
+{
+	struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
+	const unsigned char *name;
+	unsigned int len;
+	unsigned long hash;
+
+	name = qstr->name;
+	len = vfat_striptail_len(qstr);
+
+	hash = init_name_hash();
+	while (len--)
+		hash = partial_name_hash(nls_tolower(t, *name++), hash);
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+/*
+ * Case insensitive compare of two vfat names.
+ */
+static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
+	unsigned int alen, blen;
+
+	/* A filename cannot end in '.' or we treat it like it has none */
+	alen = vfat_striptail_len(name);
+	blen = __vfat_striptail_len(len, str);
+	if (alen == blen) {
+		if (nls_strnicmp(t, name->name, str, alen) == 0)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Case sensitive compare of two vfat names.
+ */
+static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	unsigned int alen, blen;
+
+	/* A filename cannot end in '.' or we treat it like it has none */
+	alen = vfat_striptail_len(name);
+	blen = __vfat_striptail_len(len, str);
+	if (alen == blen) {
+		if (strncmp(name->name, str, alen) == 0)
+			return 0;
+	}
+	return 1;
+}
+
+static const struct dentry_operations vfat_ci_dentry_ops = {
+	.d_revalidate	= vfat_revalidate_ci,
+	.d_hash		= vfat_hashi,
+	.d_compare	= vfat_cmpi,
+};
+
+static const struct dentry_operations vfat_dentry_ops = {
+	.d_revalidate	= vfat_revalidate,
+	.d_hash		= vfat_hash,
+	.d_compare	= vfat_cmp,
+};
+
+/* Characters that are undesirable in an MS-DOS file name */
+
+static inline wchar_t vfat_bad_char(wchar_t w)
+{
+	return (w < 0x0020)
+	    || (w == '*') || (w == '?') || (w == '<') || (w == '>')
+	    || (w == '|') || (w == '"') || (w == ':') || (w == '/')
+	    || (w == '\\');
+}
+
+static inline wchar_t vfat_replace_char(wchar_t w)
+{
+	return (w == '[') || (w == ']') || (w == ';') || (w == ',')
+	    || (w == '+') || (w == '=');
+}
+
+static wchar_t vfat_skip_char(wchar_t w)
+{
+	return (w == '.') || (w == ' ');
+}
+
+static inline int vfat_is_used_badchars(const wchar_t *s, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		if (vfat_bad_char(s[i]))
+			return -EINVAL;
+
+	if (s[i - 1] == ' ') /* last character cannot be space */
+		return -EINVAL;
+
+	return 0;
+}
+
+static int vfat_find_form(struct inode *dir, unsigned char *name)
+{
+	struct fat_slot_info sinfo;
+	int err = fat_scan(dir, name, &sinfo);
+	if (err)
+		return -ENOENT;
+	brelse(sinfo.bh);
+	return 0;
+}
+
+/*
+ * 1) Valid characters for the 8.3 format alias are any combination of
+ * letters, uppercase alphabets, digits, any of the
+ * following special characters:
+ *     $ % ' ` - @ { } ~ ! # ( ) & _ ^
+ * In this case Longfilename is not stored in disk.
+ *
+ * WinNT's Extension:
+ * File name and extension name is contain uppercase/lowercase
+ * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT.
+ *
+ * 2) File name is 8.3 format, but it contain the uppercase and
+ * lowercase char, muliti bytes char, etc. In this case numtail is not
+ * added, but Longfilename is stored.
+ *
+ * 3) When the one except for the above, or the following special
+ * character are contained:
+ *        .   [ ] ; , + =
+ * numtail is added, and Longfilename must be stored in disk .
+ */
+struct shortname_info {
+	unsigned char lower:1,
+		      upper:1,
+		      valid:1;
+};
+#define INIT_SHORTNAME_INFO(x)	do {		\
+	(x)->lower = 1;				\
+	(x)->upper = 1;				\
+	(x)->valid = 1;				\
+} while (0)
+
+static inline int to_shortname_char(struct nls_table *nls,
+				    unsigned char *buf, int buf_size,
+				    wchar_t *src, struct shortname_info *info)
+{
+	int len;
+
+	if (vfat_skip_char(*src)) {
+		info->valid = 0;
+		return 0;
+	}
+	if (vfat_replace_char(*src)) {
+		info->valid = 0;
+		buf[0] = '_';
+		return 1;
+	}
+
+	len = nls->uni2char(*src, buf, buf_size);
+	if (len <= 0) {
+		info->valid = 0;
+		buf[0] = '_';
+		len = 1;
+	} else if (len == 1) {
+		unsigned char prev = buf[0];
+
+		if (buf[0] >= 0x7F) {
+			info->lower = 0;
+			info->upper = 0;
+		}
+
+		buf[0] = nls_toupper(nls, buf[0]);
+		if (isalpha(buf[0])) {
+			if (buf[0] == prev)
+				info->lower = 0;
+			else
+				info->upper = 0;
+		}
+	} else {
+		info->lower = 0;
+		info->upper = 0;
+	}
+
+	return len;
+}
+
+/*
+ * Given a valid longname, create a unique shortname.  Make sure the
+ * shortname does not exist
+ * Returns negative number on error, 0 for a normal
+ * return, and 1 for valid shortname
+ */
+static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
+				 wchar_t *uname, int ulen,
+				 unsigned char *name_res, unsigned char *lcase)
+{
+	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
+	wchar_t *ip, *ext_start, *end, *name_start;
+	unsigned char base[9], ext[4], buf[5], *p;
+	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
+	int chl, chi;
+	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
+	int is_shortname;
+	struct shortname_info base_info, ext_info;
+
+	is_shortname = 1;
+	INIT_SHORTNAME_INFO(&base_info);
+	INIT_SHORTNAME_INFO(&ext_info);
+
+	/* Now, we need to create a shortname from the long name */
+	ext_start = end = &uname[ulen];
+	while (--ext_start >= uname) {
+		if (*ext_start == 0x002E) {	/* is `.' */
+			if (ext_start == end - 1) {
+				sz = ulen;
+				ext_start = NULL;
+			}
+			break;
+		}
+	}
+
+	if (ext_start == uname - 1) {
+		sz = ulen;
+		ext_start = NULL;
+	} else if (ext_start) {
+		/*
+		 * Names which start with a dot could be just
+		 * an extension eg. "...test".  In this case Win95
+		 * uses the extension as the name and sets no extension.
+		 */
+		name_start = &uname[0];
+		while (name_start < ext_start) {
+			if (!vfat_skip_char(*name_start))
+				break;
+			name_start++;
+		}
+		if (name_start != ext_start) {
+			sz = ext_start - uname;
+			ext_start++;
+		} else {
+			sz = ulen;
+			ext_start = NULL;
+		}
+	}
+
+	numtail_baselen = 6;
+	numtail2_baselen = 2;
+	for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) {
+		chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
+					ip, &base_info);
+		if (chl == 0)
+			continue;
+
+		if (baselen < 2 && (baselen + chl) > 2)
+			numtail2_baselen = baselen;
+		if (baselen < 6 && (baselen + chl) > 6)
+			numtail_baselen = baselen;
+		for (chi = 0; chi < chl; chi++) {
+			*p++ = charbuf[chi];
+			baselen++;
+			if (baselen >= 8)
+				break;
+		}
+		if (baselen >= 8) {
+			if ((chi < chl - 1) || (ip + 1) - uname < sz)
+				is_shortname = 0;
+			break;
+		}
+	}
+	if (baselen == 0) {
+		return -EINVAL;
+	}
+
+	extlen = 0;
+	if (ext_start) {
+		for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) {
+			chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
+						ip, &ext_info);
+			if (chl == 0)
+				continue;
+
+			if ((extlen + chl) > 3) {
+				is_shortname = 0;
+				break;
+			}
+			for (chi = 0; chi < chl; chi++) {
+				*p++ = charbuf[chi];
+				extlen++;
+			}
+			if (extlen >= 3) {
+				if (ip + 1 != end)
+					is_shortname = 0;
+				break;
+			}
+		}
+	}
+	ext[extlen] = '\0';
+	base[baselen] = '\0';
+
+	/* Yes, it can happen. ".\xe5" would do it. */
+	if (base[0] == DELETED_FLAG)
+		base[0] = 0x05;
+
+	/* OK, at this point we know that base is not longer than 8 symbols,
+	 * ext is not longer than 3, base is nonempty, both don't contain
+	 * any bad symbols (lowercase transformed to uppercase).
+	 */
+
+	memset(name_res, ' ', MSDOS_NAME);
+	memcpy(name_res, base, baselen);
+	memcpy(name_res + 8, ext, extlen);
+	*lcase = 0;
+	if (is_shortname && base_info.valid && ext_info.valid) {
+		if (vfat_find_form(dir, name_res) == 0)
+			return -EEXIST;
+
+		if (opts->shortname & VFAT_SFN_CREATE_WIN95) {
+			return (base_info.upper && ext_info.upper);
+		} else if (opts->shortname & VFAT_SFN_CREATE_WINNT) {
+			if ((base_info.upper || base_info.lower) &&
+			    (ext_info.upper || ext_info.lower)) {
+				if (!base_info.upper && base_info.lower)
+					*lcase |= CASE_LOWER_BASE;
+				if (!ext_info.upper && ext_info.lower)
+					*lcase |= CASE_LOWER_EXT;
+				return 1;
+			}
+			return 0;
+		} else {
+			BUG();
+		}
+	}
+
+	if (opts->numtail == 0)
+		if (vfat_find_form(dir, name_res) < 0)
+			return 0;
+
+	/*
+	 * Try to find a unique extension.  This used to
+	 * iterate through all possibilities sequentially,
+	 * but that gave extremely bad performance.  Windows
+	 * only tries a few cases before using random
+	 * values for part of the base.
+	 */
+
+	if (baselen > 6) {
+		baselen = numtail_baselen;
+		name_res[7] = ' ';
+	}
+	name_res[baselen] = '~';
+	for (i = 1; i < 10; i++) {
+		name_res[baselen + 1] = i + '0';
+		if (vfat_find_form(dir, name_res) < 0)
+			return 0;
+	}
+
+	i = jiffies;
+	sz = (jiffies >> 16) & 0x7;
+	if (baselen > 2) {
+		baselen = numtail2_baselen;
+		name_res[7] = ' ';
+	}
+	name_res[baselen + 4] = '~';
+	name_res[baselen + 5] = '1' + sz;
+	while (1) {
+		snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
+		memcpy(&name_res[baselen], buf, 4);
+		if (vfat_find_form(dir, name_res) < 0)
+			break;
+		i -= 11;
+	}
+	return 0;
+}
+
+/* Translate a string, including coded sequences into Unicode */
+static int
+xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
+	     int *longlen, int *outlen, int escape, int utf8,
+	     struct nls_table *nls)
+{
+	const unsigned char *ip;
+	unsigned char nc;
+	unsigned char *op;
+	unsigned int ec;
+	int i, k, fill;
+	int charlen;
+
+	if (utf8) {
+		*outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN,
+				(wchar_t *) outname, FAT_LFN_LEN + 2);
+		if (*outlen < 0)
+			return *outlen;
+		else if (*outlen > FAT_LFN_LEN)
+			return -ENAMETOOLONG;
+
+		op = &outname[*outlen * sizeof(wchar_t)];
+	} else {
+		for (i = 0, ip = name, op = outname, *outlen = 0;
+			 i < len && *outlen < FAT_LFN_LEN;
+			 *outlen += 1) {
+			if (escape && (*ip == ':')) {
+				if (i > len - 5)
+					return -EINVAL;
+				ec = 0;
+				for (k = 1; k < 5; k++) {
+					nc = ip[k];
+					ec <<= 4;
+					if (nc >= '0' && nc <= '9') {
+						ec |= nc - '0';
+						continue;
+					}
+					if (nc >= 'a' && nc <= 'f') {
+						ec |= nc - ('a' - 10);
+						continue;
+					}
+					if (nc >= 'A' && nc <= 'F') {
+						ec |= nc - ('A' - 10);
+						continue;
+					}
+					return -EINVAL;
+				}
+				*op++ = ec & 0xFF;
+				*op++ = ec >> 8;
+				ip += 5;
+				i += 5;
+			} else {
+				charlen = nls->char2uni(ip, len - i,
+									(wchar_t *)op);
+				if (charlen < 0)
+					return -EINVAL;
+				ip += charlen;
+				i += charlen;
+				op += 2;
+			}
+		}
+		if (i < len)
+			return -ENAMETOOLONG;
+	}
+
+	*longlen = *outlen;
+	if (*outlen % 13) {
+		*op++ = 0;
+		*op++ = 0;
+		*outlen += 1;
+		if (*outlen % 13) {
+			fill = 13 - (*outlen % 13);
+			for (i = 0; i < fill; i++) {
+				*op++ = 0xff;
+				*op++ = 0xff;
+			}
+			*outlen += fill;
+		}
+	}
+
+	return 0;
+}
+
+static int vfat_build_slots(struct inode *dir, const unsigned char *name,
+			    int len, int is_dir, int cluster,
+			    struct timespec *ts,
+			    struct msdos_dir_slot *slots, int *nr_slots)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	struct fat_mount_options *opts = &sbi->options;
+	struct msdos_dir_slot *ps;
+	struct msdos_dir_entry *de;
+	unsigned char cksum, lcase;
+	unsigned char msdos_name[MSDOS_NAME];
+	wchar_t *uname;
+	__le16 time, date;
+	u8 time_cs;
+	int err, ulen, usize, i;
+	loff_t offset;
+
+	*nr_slots = 0;
+
+	uname = __getname();
+	if (!uname)
+		return -ENOMEM;
+
+	err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize,
+			   opts->unicode_xlate, opts->utf8, sbi->nls_io);
+	if (err)
+		goto out_free;
+
+	err = vfat_is_used_badchars(uname, ulen);
+	if (err)
+		goto out_free;
+
+	err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen,
+				    msdos_name, &lcase);
+	if (err < 0)
+		goto out_free;
+	else if (err == 1) {
+		de = (struct msdos_dir_entry *)slots;
+		err = 0;
+		goto shortname;
+	}
+
+	/* build the entry of long file name */
+	cksum = fat_checksum(msdos_name);
+
+	*nr_slots = usize / 13;
+	for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
+		ps->id = i;
+		ps->attr = ATTR_EXT;
+		ps->reserved = 0;
+		ps->alias_checksum = cksum;
+		ps->start = 0;
+		offset = (i - 1) * 13;
+		fatwchar_to16(ps->name0_4, uname + offset, 5);
+		fatwchar_to16(ps->name5_10, uname + offset + 5, 6);
+		fatwchar_to16(ps->name11_12, uname + offset + 11, 2);
+	}
+	slots[0].id |= 0x40;
+	de = (struct msdos_dir_entry *)ps;
+
+shortname:
+	/* build the entry of 8.3 alias name */
+	(*nr_slots)++;
+	memcpy(de->name, msdos_name, MSDOS_NAME);
+	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
+	de->lcase = lcase;
+	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
+	de->time = de->ctime = time;
+	de->date = de->cdate = de->adate = date;
+	de->ctime_cs = time_cs;
+	fat_set_start(de, cluster);
+	de->size = 0;
+out_free:
+	__putname(uname);
+	return err;
+}
+
+static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
+			  int cluster, struct timespec *ts,
+			  struct fat_slot_info *sinfo)
+{
+	struct msdos_dir_slot *slots;
+	unsigned int len;
+	int err, nr_slots;
+
+	len = vfat_striptail_len(qname);
+	if (len == 0)
+		return -ENOENT;
+
+	slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
+	if (slots == NULL)
+		return -ENOMEM;
+
+	err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts,
+			       slots, &nr_slots);
+	if (err)
+		goto cleanup;
+
+	err = fat_add_entries(dir, slots, nr_slots, sinfo);
+	if (err)
+		goto cleanup;
+
+	/* update timestamp */
+	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+cleanup:
+	kfree(slots);
+	return err;
+}
+
+static int vfat_find(struct inode *dir, struct qstr *qname,
+		     struct fat_slot_info *sinfo)
+{
+	unsigned int len = vfat_striptail_len(qname);
+	if (len == 0)
+		return -ENOENT;
+	return fat_search_long(dir, qname->name, len, sinfo);
+}
+
+/*
+ * (nfsd's) anonymous disconnected dentry?
+ * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
+ */
+static int vfat_d_anon_disconn(struct dentry *dentry)
+{
+	return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
+static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned int flags)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode;
+	struct dentry *alias;
+	int err;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err) {
+		if (err == -ENOENT) {
+			inode = NULL;
+			goto out;
+		}
+		goto error;
+	}
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto error;
+	}
+
+	alias = d_find_alias(inode);
+	/*
+	 * Checking "alias->d_parent == dentry->d_parent" to make sure
+	 * FS is not corrupted (especially double linked dir).
+	 */
+	if (alias && alias->d_parent == dentry->d_parent &&
+	    !vfat_d_anon_disconn(alias)) {
+		/*
+		 * This inode has non anonymous-DCACHE_DISCONNECTED
+		 * dentry. This means, the user did ->lookup() by an
+		 * another name (longname vs 8.3 alias of it) in past.
+		 *
+		 * Switch to new one for reason of locality if possible.
+		 */
+		BUG_ON(d_unhashed(alias));
+		if (!S_ISDIR(inode->i_mode))
+			d_move(alias, dentry);
+		iput(inode);
+		mutex_unlock(&MSDOS_SB(sb)->s_lock);
+		return alias;
+	} else
+		dput(alias);
+
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	if (!inode)
+		dentry->d_time = dir->i_version;
+	return d_splice_alias(inode, dentry);
+error:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	return ERR_PTR(err);
+}
+
+static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		       bool excl)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	int err;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	ts = CURRENT_TIME_SEC;
+	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
+	if (err)
+		goto out;
+	dir->i_version++;
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	return err;
+}
+
+static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	err = fat_dir_empty(inode);
+	if (err)
+		goto out;
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+	dentry->d_time = dir->i_version;
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+
+	return err;
+}
+
+static int vfat_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+	dentry->d_time = dir->i_version;
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+
+	return err;
+}
+
+static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	int err, cluster;
+
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	ts = CURRENT_TIME_SEC;
+	cluster = fat_alloc_new_dir(dir, &ts);
+	if (cluster < 0) {
+		err = cluster;
+		goto out;
+	}
+	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
+	if (err)
+		goto out_free;
+	dir->i_version++;
+	inc_nlink(dir);
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		/* the directory was completed, just return a error */
+		goto out;
+	}
+	inode->i_version++;
+	set_nlink(inode, 2);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	return 0;
+
+out_free:
+	fat_free_clusters(dir, cluster);
+out:
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+	return err;
+}
+
+static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct buffer_head *dotdot_bh;
+	struct msdos_dir_entry *dotdot_de;
+	struct inode *old_inode, *new_inode;
+	struct fat_slot_info old_sinfo, sinfo;
+	struct timespec ts;
+	loff_t new_i_pos;
+	int err, is_dir, update_dotdot, corrupt = 0;
+	struct super_block *sb = old_dir->i_sb;
+
+	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
+	old_inode = d_inode(old_dentry);
+	new_inode = d_inode(new_dentry);
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+	err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
+	if (err)
+		goto out;
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+	update_dotdot = (is_dir && old_dir != new_dir);
+	if (update_dotdot) {
+		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	ts = CURRENT_TIME_SEC;
+	if (new_inode) {
+		if (is_dir) {
+			err = fat_dir_empty(new_inode);
+			if (err)
+				goto out;
+		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
+		fat_detach(new_inode);
+	} else {
+		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
+				     &ts, &sinfo);
+		if (err)
+			goto out;
+		new_i_pos = sinfo.i_pos;
+	}
+	new_dir->i_version++;
+
+	fat_detach(old_inode);
+	fat_attach(old_inode, new_i_pos);
+	if (IS_DIRSYNC(new_dir)) {
+		err = fat_sync_inode(old_inode);
+		if (err)
+			goto error_inode;
+	} else
+		mark_inode_dirty(old_inode);
+
+	if (update_dotdot) {
+		fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
+		if (IS_DIRSYNC(new_dir)) {
+			err = sync_dirty_buffer(dotdot_bh);
+			if (err)
+				goto error_dotdot;
+		}
+		drop_nlink(old_dir);
+		if (!new_inode)
+ 			inc_nlink(new_dir);
+	}
+
+	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
+	old_sinfo.bh = NULL;
+	if (err)
+		goto error_dotdot;
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void)fat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		if (is_dir)
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+out:
+	brelse(sinfo.bh);
+	brelse(dotdot_bh);
+	brelse(old_sinfo.bh);
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+
+	return err;
+
+error_dotdot:
+	/* data cluster is shared, serious corruption */
+	corrupt = 1;
+
+	if (update_dotdot) {
+		fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
+		corrupt |= sync_dirty_buffer(dotdot_bh);
+	}
+error_inode:
+	fat_detach(old_inode);
+	fat_attach(old_inode, old_sinfo.i_pos);
+	if (new_inode) {
+		fat_attach(new_inode, new_i_pos);
+		if (corrupt)
+			corrupt |= fat_sync_inode(new_inode);
+	} else {
+		/*
+		 * If new entry was not sharing the data cluster, it
+		 * shouldn't be serious corruption.
+		 */
+		int err2 = fat_remove_entries(new_dir, &sinfo);
+		if (corrupt)
+			corrupt |= err2;
+		sinfo.bh = NULL;
+	}
+	if (corrupt < 0) {
+		fat_fs_error(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld)",
+			     __func__, sinfo.i_pos);
+	}
+	goto out;
+}
+
+static const struct inode_operations vfat_dir_inode_operations = {
+	.create		= vfat_create,
+	.lookup		= vfat_lookup,
+	.unlink		= vfat_unlink,
+	.mkdir		= vfat_mkdir,
+	.rmdir		= vfat_rmdir,
+	.rename		= vfat_rename,
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
+
+static void setup(struct super_block *sb)
+{
+	MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
+	if (MSDOS_SB(sb)->options.name_check != 's')
+		sb->s_d_op = &vfat_ci_dentry_ops;
+	else
+		sb->s_d_op = &vfat_dentry_ops;
+}
+
+static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+{
+	return fat_fill_super(sb, data, silent, 1, setup);
+}
+
+static struct dentry *vfat_mount(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
+}
+
+static struct file_system_type vfat_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "vfat",
+	.mount		= vfat_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("vfat");
+
+static int __init init_vfat_fs(void)
+{
+	return register_filesystem(&vfat_fs_type);
+}
+
+static void __exit exit_vfat_fs(void)
+{
+	unregister_filesystem(&vfat_fs_type);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("VFAT filesystem support");
+MODULE_AUTHOR("Gordon Chaffee");
+
+module_init(init_vfat_fs)
+module_exit(exit_vfat_fs)
diff --git a/kernel/fs/fat/nfs.c b/kernel/fs/fat/nfs.c
new file mode 100644
index 000000000..eb192656f
--- /dev/null
+++ b/kernel/fs/fat/nfs.c
@@ -0,0 +1,301 @@
+/* fs/fat/nfs.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/exportfs.h>
+#include "fat.h"
+
+struct fat_fid {
+	u32 i_gen;
+	u32 i_pos_low;
+	u16 i_pos_hi;
+	u16 parent_i_pos_hi;
+	u32 parent_i_pos_low;
+	u32 parent_i_gen;
+};
+
+#define FAT_FID_SIZE_WITHOUT_PARENT 3
+#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32))
+
+/**
+ * Look up a directory inode given its starting cluster.
+ */
+static struct inode *fat_dget(struct super_block *sb, int i_logstart)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct hlist_head *head;
+	struct msdos_inode_info *i;
+	struct inode *inode = NULL;
+
+	head = sbi->dir_hashtable + fat_dir_hash(i_logstart);
+	spin_lock(&sbi->dir_hash_lock);
+	hlist_for_each_entry(i, head, i_dir_hash) {
+		BUG_ON(i->vfs_inode.i_sb != sb);
+		if (i->i_logstart != i_logstart)
+			continue;
+		inode = igrab(&i->vfs_inode);
+		if (inode)
+			break;
+	}
+	spin_unlock(&sbi->dir_hash_lock);
+	return inode;
+}
+
+static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos)
+{
+	if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO)
+		return fat_iget(sb, i_pos);
+
+	else {
+		if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO))
+			return NULL;
+		return ilookup(sb, ino);
+	}
+}
+
+static struct inode *__fat_nfs_get_inode(struct super_block *sb,
+				       u64 ino, u32 generation, loff_t i_pos)
+{
+	struct inode *inode = fat_ilookup(sb, ino, i_pos);
+
+	if (inode && generation && (inode->i_generation != generation)) {
+		iput(inode);
+		inode = NULL;
+	}
+	if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
+		struct buffer_head *bh = NULL;
+		struct msdos_dir_entry *de ;
+		sector_t blocknr;
+		int offset;
+		fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset);
+		bh = sb_bread(sb, blocknr);
+		if (!bh) {
+			fat_msg(sb, KERN_ERR,
+				"unable to read block(%llu) for building NFS inode",
+				(llu)blocknr);
+			return inode;
+		}
+		de = (struct msdos_dir_entry *)bh->b_data;
+		/* If a file is deleted on server and client is not updated
+		 * yet, we must not build the inode upon a lookup call.
+		 */
+		if (IS_FREE(de[offset].name))
+			inode = NULL;
+		else
+			inode = fat_build_inode(sb, &de[offset], i_pos);
+		brelse(bh);
+	}
+
+	return inode;
+}
+
+static struct inode *fat_nfs_get_inode(struct super_block *sb,
+				       u64 ino, u32 generation)
+{
+
+	return __fat_nfs_get_inode(sb, ino, generation, 0);
+}
+
+static int
+fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
+		      struct inode *parent)
+{
+	int len = *lenp;
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	struct fat_fid *fid = (struct fat_fid *) fh;
+	loff_t i_pos;
+	int type = FILEID_FAT_WITHOUT_PARENT;
+
+	if (parent) {
+		if (len < FAT_FID_SIZE_WITH_PARENT) {
+			*lenp = FAT_FID_SIZE_WITH_PARENT;
+			return FILEID_INVALID;
+		}
+	} else {
+		if (len < FAT_FID_SIZE_WITHOUT_PARENT) {
+			*lenp = FAT_FID_SIZE_WITHOUT_PARENT;
+			return FILEID_INVALID;
+		}
+	}
+
+	i_pos = fat_i_pos_read(sbi, inode);
+	*lenp = FAT_FID_SIZE_WITHOUT_PARENT;
+	fid->i_gen = inode->i_generation;
+	fid->i_pos_low = i_pos & 0xFFFFFFFF;
+	fid->i_pos_hi = (i_pos >> 32) & 0xFFFF;
+	if (parent) {
+		i_pos = fat_i_pos_read(sbi, parent);
+		fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF;
+		fid->parent_i_pos_low = i_pos & 0xFFFFFFFF;
+		fid->parent_i_gen = parent->i_generation;
+		type = FILEID_FAT_WITH_PARENT;
+		*lenp = FAT_FID_SIZE_WITH_PARENT;
+	}
+
+	return type;
+}
+
+/**
+ * Map a NFS file handle to a corresponding dentry.
+ * The dentry may or may not be connected to the filesystem root.
+ */
+static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
+				int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    fat_nfs_get_inode);
+}
+
+static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb,
+					       struct fid *fh, int fh_len,
+					       int fh_type)
+{
+	struct inode *inode = NULL;
+	struct fat_fid *fid = (struct fat_fid *)fh;
+	loff_t i_pos;
+
+	switch (fh_type) {
+	case FILEID_FAT_WITHOUT_PARENT:
+		if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT)
+			return NULL;
+		break;
+	case FILEID_FAT_WITH_PARENT:
+		if (fh_len < FAT_FID_SIZE_WITH_PARENT)
+			return NULL;
+		break;
+	default:
+		return NULL;
+	}
+	i_pos = fid->i_pos_hi;
+	i_pos = (i_pos << 32) | (fid->i_pos_low);
+	inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos);
+
+	return d_obtain_alias(inode);
+}
+
+/*
+ * Find the parent for a file specified by NFS handle.
+ * This requires that the handle contain the i_ino of the parent.
+ */
+static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
+				int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    fat_nfs_get_inode);
+}
+
+static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb,
+					       struct fid *fh, int fh_len,
+					       int fh_type)
+{
+	struct inode *inode = NULL;
+	struct fat_fid *fid = (struct fat_fid *)fh;
+	loff_t i_pos;
+
+	if (fh_len < FAT_FID_SIZE_WITH_PARENT)
+		return NULL;
+
+	switch (fh_type) {
+	case FILEID_FAT_WITH_PARENT:
+		i_pos = fid->parent_i_pos_hi;
+		i_pos = (i_pos << 32) | (fid->parent_i_pos_low);
+		inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos);
+		break;
+	}
+
+	return d_obtain_alias(inode);
+}
+
+/*
+ * Rebuild the parent for a directory that is not connected
+ *  to the filesystem root
+ */
+static
+struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart)
+{
+	int search_clus, clus_to_match;
+	struct msdos_dir_entry *de;
+	struct inode *parent = NULL;
+	struct inode *dummy_grand_parent = NULL;
+	struct fat_slot_info sinfo;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart);
+	struct buffer_head *parent_bh = sb_bread(sb, blknr);
+	if (!parent_bh) {
+		fat_msg(sb, KERN_ERR,
+			"unable to read cluster of parent directory");
+		return NULL;
+	}
+
+	de = (struct msdos_dir_entry *) parent_bh->b_data;
+	clus_to_match = fat_get_start(sbi, &de[0]);
+	search_clus = fat_get_start(sbi, &de[1]);
+
+	dummy_grand_parent = fat_dget(sb, search_clus);
+	if (!dummy_grand_parent) {
+		dummy_grand_parent = new_inode(sb);
+		if (!dummy_grand_parent) {
+			brelse(parent_bh);
+			return parent;
+		}
+
+		dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO);
+		fat_fill_inode(dummy_grand_parent, &de[1]);
+		MSDOS_I(dummy_grand_parent)->i_pos = -1;
+	}
+
+	if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo))
+		parent = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+
+	brelse(parent_bh);
+	iput(dummy_grand_parent);
+
+	return parent;
+}
+
+/*
+ * Find the parent for a directory that is not currently connected to
+ * the filesystem root.
+ *
+ * On entry, the caller holds d_inode(child_dir)->i_mutex.
+ */
+static struct dentry *fat_get_parent(struct dentry *child_dir)
+{
+	struct super_block *sb = child_dir->d_sb;
+	struct buffer_head *bh = NULL;
+	struct msdos_dir_entry *de;
+	struct inode *parent_inode = NULL;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	if (!fat_get_dotdot_entry(d_inode(child_dir), &bh, &de)) {
+		int parent_logstart = fat_get_start(sbi, de);
+		parent_inode = fat_dget(sb, parent_logstart);
+		if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+			parent_inode = fat_rebuild_parent(sb, parent_logstart);
+	}
+	brelse(bh);
+
+	return d_obtain_alias(parent_inode);
+}
+
+const struct export_operations fat_export_ops = {
+	.fh_to_dentry   = fat_fh_to_dentry,
+	.fh_to_parent   = fat_fh_to_parent,
+	.get_parent     = fat_get_parent,
+};
+
+const struct export_operations fat_export_ops_nostale = {
+	.encode_fh      = fat_encode_fh_nostale,
+	.fh_to_dentry   = fat_fh_to_dentry_nostale,
+	.fh_to_parent   = fat_fh_to_parent_nostale,
+	.get_parent     = fat_get_parent,
+};
diff --git a/kernel/fs/fcntl.c b/kernel/fs/fcntl.c
new file mode 100644
index 000000000..ee85cd4e1
--- /dev/null
+++ b/kernel/fs/fcntl.c
@@ -0,0 +1,759 @@
+/*
+ *  linux/fs/fcntl.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/syscalls.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/capability.h>
+#include <linux/dnotify.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/security.h>
+#include <linux/ptrace.h>
+#include <linux/signal.h>
+#include <linux/rcupdate.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/shmem_fs.h>
+
+#include <asm/poll.h>
+#include <asm/siginfo.h>
+#include <asm/uaccess.h>
+
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
+
+static int setfl(int fd, struct file * filp, unsigned long arg)
+{
+	struct inode * inode = file_inode(filp);
+	int error = 0;
+
+	/*
+	 * O_APPEND cannot be cleared if the file is marked as append-only
+	 * and the file is open for write.
+	 */
+	if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode))
+		return -EPERM;
+
+	/* O_NOATIME can only be set by the owner or superuser */
+	if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
+		if (!inode_owner_or_capable(inode))
+			return -EPERM;
+
+	/* required for strict SunOS emulation */
+	if (O_NONBLOCK != O_NDELAY)
+	       if (arg & O_NDELAY)
+		   arg |= O_NONBLOCK;
+
+	if (arg & O_DIRECT) {
+		if (!filp->f_mapping || !filp->f_mapping->a_ops ||
+			!filp->f_mapping->a_ops->direct_IO)
+				return -EINVAL;
+	}
+
+	if (filp->f_op->check_flags)
+		error = filp->f_op->check_flags(arg);
+	if (error)
+		return error;
+
+	/*
+	 * ->fasync() is responsible for setting the FASYNC bit.
+	 */
+	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
+		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
+		if (error < 0)
+			goto out;
+		if (error > 0)
+			error = 0;
+	}
+	spin_lock(&filp->f_lock);
+	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
+	spin_unlock(&filp->f_lock);
+
+ out:
+	return error;
+}
+
+static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
+                     int force)
+{
+	write_lock_irq(&filp->f_owner.lock);
+	if (force || !filp->f_owner.pid) {
+		put_pid(filp->f_owner.pid);
+		filp->f_owner.pid = get_pid(pid);
+		filp->f_owner.pid_type = type;
+
+		if (pid) {
+			const struct cred *cred = current_cred();
+			filp->f_owner.uid = cred->uid;
+			filp->f_owner.euid = cred->euid;
+		}
+	}
+	write_unlock_irq(&filp->f_owner.lock);
+}
+
+void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
+		int force)
+{
+	security_file_set_fowner(filp);
+	f_modown(filp, pid, type, force);
+}
+EXPORT_SYMBOL(__f_setown);
+
+void f_setown(struct file *filp, unsigned long arg, int force)
+{
+	enum pid_type type;
+	struct pid *pid;
+	int who = arg;
+	type = PIDTYPE_PID;
+	if (who < 0) {
+		type = PIDTYPE_PGID;
+		who = -who;
+	}
+	rcu_read_lock();
+	pid = find_vpid(who);
+	__f_setown(filp, pid, type, force);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(f_setown);
+
+void f_delown(struct file *filp)
+{
+	f_modown(filp, NULL, PIDTYPE_PID, 1);
+}
+
+pid_t f_getown(struct file *filp)
+{
+	pid_t pid;
+	read_lock(&filp->f_owner.lock);
+	pid = pid_vnr(filp->f_owner.pid);
+	if (filp->f_owner.pid_type == PIDTYPE_PGID)
+		pid = -pid;
+	read_unlock(&filp->f_owner.lock);
+	return pid;
+}
+
+static int f_setown_ex(struct file *filp, unsigned long arg)
+{
+	struct f_owner_ex __user *owner_p = (void __user *)arg;
+	struct f_owner_ex owner;
+	struct pid *pid;
+	int type;
+	int ret;
+
+	ret = copy_from_user(&owner, owner_p, sizeof(owner));
+	if (ret)
+		return -EFAULT;
+
+	switch (owner.type) {
+	case F_OWNER_TID:
+		type = PIDTYPE_MAX;
+		break;
+
+	case F_OWNER_PID:
+		type = PIDTYPE_PID;
+		break;
+
+	case F_OWNER_PGRP:
+		type = PIDTYPE_PGID;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	rcu_read_lock();
+	pid = find_vpid(owner.pid);
+	if (owner.pid && !pid)
+		ret = -ESRCH;
+	else
+		 __f_setown(filp, pid, type, 1);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int f_getown_ex(struct file *filp, unsigned long arg)
+{
+	struct f_owner_ex __user *owner_p = (void __user *)arg;
+	struct f_owner_ex owner;
+	int ret = 0;
+
+	read_lock(&filp->f_owner.lock);
+	owner.pid = pid_vnr(filp->f_owner.pid);
+	switch (filp->f_owner.pid_type) {
+	case PIDTYPE_MAX:
+		owner.type = F_OWNER_TID;
+		break;
+
+	case PIDTYPE_PID:
+		owner.type = F_OWNER_PID;
+		break;
+
+	case PIDTYPE_PGID:
+		owner.type = F_OWNER_PGRP;
+		break;
+
+	default:
+		WARN_ON(1);
+		ret = -EINVAL;
+		break;
+	}
+	read_unlock(&filp->f_owner.lock);
+
+	if (!ret) {
+		ret = copy_to_user(owner_p, &owner, sizeof(owner));
+		if (ret)
+			ret = -EFAULT;
+	}
+	return ret;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int f_getowner_uids(struct file *filp, unsigned long arg)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	uid_t __user *dst = (void __user *)arg;
+	uid_t src[2];
+	int err;
+
+	read_lock(&filp->f_owner.lock);
+	src[0] = from_kuid(user_ns, filp->f_owner.uid);
+	src[1] = from_kuid(user_ns, filp->f_owner.euid);
+	read_unlock(&filp->f_owner.lock);
+
+	err  = put_user(src[0], &dst[0]);
+	err |= put_user(src[1], &dst[1]);
+
+	return err;
+}
+#else
+static int f_getowner_uids(struct file *filp, unsigned long arg)
+{
+	return -EINVAL;
+}
+#endif
+
+static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
+		struct file *filp)
+{
+	long err = -EINVAL;
+
+	switch (cmd) {
+	case F_DUPFD:
+		err = f_dupfd(arg, filp, 0);
+		break;
+	case F_DUPFD_CLOEXEC:
+		err = f_dupfd(arg, filp, O_CLOEXEC);
+		break;
+	case F_GETFD:
+		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
+		break;
+	case F_SETFD:
+		err = 0;
+		set_close_on_exec(fd, arg & FD_CLOEXEC);
+		break;
+	case F_GETFL:
+		err = filp->f_flags;
+		break;
+	case F_SETFL:
+		err = setfl(fd, filp, arg);
+		break;
+#if BITS_PER_LONG != 32
+	/* 32-bit arches must use fcntl64() */
+	case F_OFD_GETLK:
+#endif
+	case F_GETLK:
+		err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
+		break;
+#if BITS_PER_LONG != 32
+	/* 32-bit arches must use fcntl64() */
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
+#endif
+		/* Fallthrough */
+	case F_SETLK:
+	case F_SETLKW:
+		err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
+		break;
+	case F_GETOWN:
+		/*
+		 * XXX If f_owner is a process group, the
+		 * negative return value will get converted
+		 * into an error.  Oops.  If we keep the
+		 * current syscall conventions, the only way
+		 * to fix this will be in libc.
+		 */
+		err = f_getown(filp);
+		force_successful_syscall_return();
+		break;
+	case F_SETOWN:
+		f_setown(filp, arg, 1);
+		err = 0;
+		break;
+	case F_GETOWN_EX:
+		err = f_getown_ex(filp, arg);
+		break;
+	case F_SETOWN_EX:
+		err = f_setown_ex(filp, arg);
+		break;
+	case F_GETOWNER_UIDS:
+		err = f_getowner_uids(filp, arg);
+		break;
+	case F_GETSIG:
+		err = filp->f_owner.signum;
+		break;
+	case F_SETSIG:
+		/* arg == 0 restores default behaviour. */
+		if (!valid_signal(arg)) {
+			break;
+		}
+		err = 0;
+		filp->f_owner.signum = arg;
+		break;
+	case F_GETLEASE:
+		err = fcntl_getlease(filp);
+		break;
+	case F_SETLEASE:
+		err = fcntl_setlease(fd, filp, arg);
+		break;
+	case F_NOTIFY:
+		err = fcntl_dirnotify(fd, filp, arg);
+		break;
+	case F_SETPIPE_SZ:
+	case F_GETPIPE_SZ:
+		err = pipe_fcntl(filp, cmd, arg);
+		break;
+	case F_ADD_SEALS:
+	case F_GET_SEALS:
+		err = shmem_fcntl(filp, cmd, arg);
+		break;
+	default:
+		break;
+	}
+	return err;
+}
+
+static int check_fcntl_cmd(unsigned cmd)
+{
+	switch (cmd) {
+	case F_DUPFD:
+	case F_DUPFD_CLOEXEC:
+	case F_GETFD:
+	case F_SETFD:
+	case F_GETFL:
+		return 1;
+	}
+	return 0;
+}
+
+SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
+{	
+	struct fd f = fdget_raw(fd);
+	long err = -EBADF;
+
+	if (!f.file)
+		goto out;
+
+	if (unlikely(f.file->f_mode & FMODE_PATH)) {
+		if (!check_fcntl_cmd(cmd))
+			goto out1;
+	}
+
+	err = security_file_fcntl(f.file, cmd, arg);
+	if (!err)
+		err = do_fcntl(fd, cmd, arg, f.file);
+
+out1:
+ 	fdput(f);
+out:
+	return err;
+}
+
+#if BITS_PER_LONG == 32
+SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+		unsigned long, arg)
+{	
+	struct fd f = fdget_raw(fd);
+	long err = -EBADF;
+
+	if (!f.file)
+		goto out;
+
+	if (unlikely(f.file->f_mode & FMODE_PATH)) {
+		if (!check_fcntl_cmd(cmd))
+			goto out1;
+	}
+
+	err = security_file_fcntl(f.file, cmd, arg);
+	if (err)
+		goto out1;
+	
+	switch (cmd) {
+	case F_GETLK64:
+	case F_OFD_GETLK:
+		err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
+		break;
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
+		err = fcntl_setlk64(fd, f.file, cmd,
+				(struct flock64 __user *) arg);
+		break;
+	default:
+		err = do_fcntl(fd, cmd, arg, f.file);
+		break;
+	}
+out1:
+	fdput(f);
+out:
+	return err;
+}
+#endif
+
+/* Table to convert sigio signal codes into poll band bitmaps */
+
+static const long band_table[NSIGPOLL] = {
+	POLLIN | POLLRDNORM,			/* POLL_IN */
+	POLLOUT | POLLWRNORM | POLLWRBAND,	/* POLL_OUT */
+	POLLIN | POLLRDNORM | POLLMSG,		/* POLL_MSG */
+	POLLERR,				/* POLL_ERR */
+	POLLPRI | POLLRDBAND,			/* POLL_PRI */
+	POLLHUP | POLLERR			/* POLL_HUP */
+};
+
+static inline int sigio_perm(struct task_struct *p,
+                             struct fown_struct *fown, int sig)
+{
+	const struct cred *cred;
+	int ret;
+
+	rcu_read_lock();
+	cred = __task_cred(p);
+	ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) ||
+		uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) ||
+		uid_eq(fown->uid,  cred->suid) || uid_eq(fown->uid,  cred->uid)) &&
+	       !security_file_send_sigiotask(p, fown, sig));
+	rcu_read_unlock();
+	return ret;
+}
+
+static void send_sigio_to_task(struct task_struct *p,
+			       struct fown_struct *fown,
+			       int fd, int reason, int group)
+{
+	/*
+	 * F_SETSIG can change ->signum lockless in parallel, make
+	 * sure we read it once and use the same value throughout.
+	 */
+	int signum = ACCESS_ONCE(fown->signum);
+
+	if (!sigio_perm(p, fown, signum))
+		return;
+
+	switch (signum) {
+		siginfo_t si;
+		default:
+			/* Queue a rt signal with the appropriate fd as its
+			   value.  We use SI_SIGIO as the source, not 
+			   SI_KERNEL, since kernel signals always get 
+			   delivered even if we can't queue.  Failure to
+			   queue in this case _should_ be reported; we fall
+			   back to SIGIO in that case. --sct */
+			si.si_signo = signum;
+			si.si_errno = 0;
+		        si.si_code  = reason;
+			/* Make sure we are called with one of the POLL_*
+			   reasons, otherwise we could leak kernel stack into
+			   userspace.  */
+			BUG_ON((reason & __SI_MASK) != __SI_POLL);
+			if (reason - POLL_IN >= NSIGPOLL)
+				si.si_band  = ~0L;
+			else
+				si.si_band = band_table[reason - POLL_IN];
+			si.si_fd    = fd;
+			if (!do_send_sig_info(signum, &si, p, group))
+				break;
+		/* fall-through: fall back on the old plain SIGIO signal */
+		case 0:
+			do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
+	}
+}
+
+void send_sigio(struct fown_struct *fown, int fd, int band)
+{
+	struct task_struct *p;
+	enum pid_type type;
+	struct pid *pid;
+	int group = 1;
+	
+	read_lock(&fown->lock);
+
+	type = fown->pid_type;
+	if (type == PIDTYPE_MAX) {
+		group = 0;
+		type = PIDTYPE_PID;
+	}
+
+	pid = fown->pid;
+	if (!pid)
+		goto out_unlock_fown;
+	
+	read_lock(&tasklist_lock);
+	do_each_pid_task(pid, type, p) {
+		send_sigio_to_task(p, fown, fd, band, group);
+	} while_each_pid_task(pid, type, p);
+	read_unlock(&tasklist_lock);
+ out_unlock_fown:
+	read_unlock(&fown->lock);
+}
+
+static void send_sigurg_to_task(struct task_struct *p,
+				struct fown_struct *fown, int group)
+{
+	if (sigio_perm(p, fown, SIGURG))
+		do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
+}
+
+int send_sigurg(struct fown_struct *fown)
+{
+	struct task_struct *p;
+	enum pid_type type;
+	struct pid *pid;
+	int group = 1;
+	int ret = 0;
+	
+	read_lock(&fown->lock);
+
+	type = fown->pid_type;
+	if (type == PIDTYPE_MAX) {
+		group = 0;
+		type = PIDTYPE_PID;
+	}
+
+	pid = fown->pid;
+	if (!pid)
+		goto out_unlock_fown;
+
+	ret = 1;
+	
+	read_lock(&tasklist_lock);
+	do_each_pid_task(pid, type, p) {
+		send_sigurg_to_task(p, fown, group);
+	} while_each_pid_task(pid, type, p);
+	read_unlock(&tasklist_lock);
+ out_unlock_fown:
+	read_unlock(&fown->lock);
+	return ret;
+}
+
+static DEFINE_SPINLOCK(fasync_lock);
+static struct kmem_cache *fasync_cache __read_mostly;
+
+static void fasync_free_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(fasync_cache,
+			container_of(head, struct fasync_struct, fa_rcu));
+}
+
+/*
+ * Remove a fasync entry. If successfully removed, return
+ * positive and clear the FASYNC flag. If no entry exists,
+ * do nothing and return 0.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ *
+ */
+int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
+{
+	struct fasync_struct *fa, **fp;
+	int result = 0;
+
+	spin_lock(&filp->f_lock);
+	spin_lock(&fasync_lock);
+	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
+		if (fa->fa_file != filp)
+			continue;
+
+		spin_lock_irq(&fa->fa_lock);
+		fa->fa_file = NULL;
+		spin_unlock_irq(&fa->fa_lock);
+
+		*fp = fa->fa_next;
+		call_rcu(&fa->fa_rcu, fasync_free_rcu);
+		filp->f_flags &= ~FASYNC;
+		result = 1;
+		break;
+	}
+	spin_unlock(&fasync_lock);
+	spin_unlock(&filp->f_lock);
+	return result;
+}
+
+struct fasync_struct *fasync_alloc(void)
+{
+	return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+}
+
+/*
+ * NOTE! This can be used only for unused fasync entries:
+ * entries that actually got inserted on the fasync list
+ * need to be released by rcu - see fasync_remove_entry.
+ */
+void fasync_free(struct fasync_struct *new)
+{
+	kmem_cache_free(fasync_cache, new);
+}
+
+/*
+ * Insert a new entry into the fasync list.  Return the pointer to the
+ * old one if we didn't use the new one.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ */
+struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
+{
+        struct fasync_struct *fa, **fp;
+
+	spin_lock(&filp->f_lock);
+	spin_lock(&fasync_lock);
+	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
+		if (fa->fa_file != filp)
+			continue;
+
+		spin_lock_irq(&fa->fa_lock);
+		fa->fa_fd = fd;
+		spin_unlock_irq(&fa->fa_lock);
+		goto out;
+	}
+
+	spin_lock_init(&new->fa_lock);
+	new->magic = FASYNC_MAGIC;
+	new->fa_file = filp;
+	new->fa_fd = fd;
+	new->fa_next = *fapp;
+	rcu_assign_pointer(*fapp, new);
+	filp->f_flags |= FASYNC;
+
+out:
+	spin_unlock(&fasync_lock);
+	spin_unlock(&filp->f_lock);
+	return fa;
+}
+
+/*
+ * Add a fasync entry. Return negative on error, positive if
+ * added, and zero if did nothing but change an existing one.
+ */
+static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+{
+	struct fasync_struct *new;
+
+	new = fasync_alloc();
+	if (!new)
+		return -ENOMEM;
+
+	/*
+	 * fasync_insert_entry() returns the old (update) entry if
+	 * it existed.
+	 *
+	 * So free the (unused) new entry and return 0 to let the
+	 * caller know that we didn't add any new fasync entries.
+	 */
+	if (fasync_insert_entry(fd, filp, fapp, new)) {
+		fasync_free(new);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * fasync_helper() is used by almost all character device drivers
+ * to set up the fasync queue, and for regular files by the file
+ * lease code. It returns negative on error, 0 if it did no changes
+ * and positive if it added/deleted the entry.
+ */
+int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+{
+	if (!on)
+		return fasync_remove_entry(filp, fapp);
+	return fasync_add_entry(fd, filp, fapp);
+}
+
+EXPORT_SYMBOL(fasync_helper);
+
+/*
+ * rcu_read_lock() is held
+ */
+static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
+{
+	while (fa) {
+		struct fown_struct *fown;
+		unsigned long flags;
+
+		if (fa->magic != FASYNC_MAGIC) {
+			printk(KERN_ERR "kill_fasync: bad magic number in "
+			       "fasync_struct!\n");
+			return;
+		}
+		spin_lock_irqsave(&fa->fa_lock, flags);
+		if (fa->fa_file) {
+			fown = &fa->fa_file->f_owner;
+			/* Don't send SIGURG to processes which have not set a
+			   queued signum: SIGURG has its own default signalling
+			   mechanism. */
+			if (!(sig == SIGURG && fown->signum == 0))
+				send_sigio(fown, fa->fa_fd, band);
+		}
+		spin_unlock_irqrestore(&fa->fa_lock, flags);
+		fa = rcu_dereference(fa->fa_next);
+	}
+}
+
+void kill_fasync(struct fasync_struct **fp, int sig, int band)
+{
+	/* First a quick test without locking: usually
+	 * the list is empty.
+	 */
+	if (*fp) {
+		rcu_read_lock();
+		kill_fasync_rcu(rcu_dereference(*fp), sig, band);
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL(kill_fasync);
+
+static int __init fcntl_init(void)
+{
+	/*
+	 * Please add new bits here to ensure allocation uniqueness.
+	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
+	 * is defined as O_NONBLOCK on some platforms and not on others.
+	 */
+	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+		O_RDONLY	| O_WRONLY	| O_RDWR	|
+		O_CREAT		| O_EXCL	| O_NOCTTY	|
+		O_TRUNC		| O_APPEND	| /* O_NONBLOCK	| */
+		__O_SYNC	| O_DSYNC	| FASYNC	|
+		O_DIRECT	| O_LARGEFILE	| O_DIRECTORY	|
+		O_NOFOLLOW	| O_NOATIME	| O_CLOEXEC	|
+		__FMODE_EXEC	| O_PATH	| __O_TMPFILE	|
+		__FMODE_NONOTIFY
+		));
+
+	fasync_cache = kmem_cache_create("fasync_cache",
+		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+	return 0;
+}
+
+module_init(fcntl_init)
diff --git a/kernel/fs/fhandle.c b/kernel/fs/fhandle.c
new file mode 100644
index 000000000..d59712dfa
--- /dev/null
+++ b/kernel/fs/fhandle.c
@@ -0,0 +1,266 @@
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
+#include <linux/personality.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+#include "mount.h"
+
+static long do_sys_name_to_handle(struct path *path,
+				  struct file_handle __user *ufh,
+				  int __user *mnt_id)
+{
+	long retval;
+	struct file_handle f_handle;
+	int handle_dwords, handle_bytes;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * We need to make sure whether the file system
+	 * support decoding of the file handle
+	 */
+	if (!path->dentry->d_sb->s_export_op ||
+	    !path->dentry->d_sb->s_export_op->fh_to_dentry)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+		return -EFAULT;
+
+	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
+		return -EINVAL;
+
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	/* convert handle size to multiple of sizeof(u32) */
+	handle_dwords = f_handle.handle_bytes >> 2;
+
+	/* we ask for a non connected handle */
+	retval = exportfs_encode_fh(path->dentry,
+				    (struct fid *)handle->f_handle,
+				    &handle_dwords,  0);
+	handle->handle_type = retval;
+	/* convert handle size to bytes */
+	handle_bytes = handle_dwords * sizeof(u32);
+	handle->handle_bytes = handle_bytes;
+	if ((handle->handle_bytes > f_handle.handle_bytes) ||
+	    (retval == FILEID_INVALID) || (retval == -ENOSPC)) {
+		/* As per old exportfs_encode_fh documentation
+		 * we could return ENOSPC to indicate overflow
+		 * But file system returned 255 always. So handle
+		 * both the values
+		 */
+		/*
+		 * set the handle size to zero so we copy only
+		 * non variable part of the file_handle
+		 */
+		handle_bytes = 0;
+		retval = -EOVERFLOW;
+	} else
+		retval = 0;
+	/* copy the mount id */
+	if (copy_to_user(mnt_id, &real_mount(path->mnt)->mnt_id,
+			 sizeof(*mnt_id)) ||
+	    copy_to_user(ufh, handle,
+			 sizeof(struct file_handle) + handle_bytes))
+		retval = -EFAULT;
+	kfree(handle);
+	return retval;
+}
+
+/**
+ * sys_name_to_handle_at: convert name to handle
+ * @dfd: directory relative to which name is interpreted if not absolute
+ * @name: name that should be converted to handle.
+ * @handle: resulting file handle
+ * @mnt_id: mount id of the file system containing the file
+ * @flag: flag value to indicate whether to follow symlink or not
+ *
+ * @handle->handle_size indicate the space available to store the
+ * variable part of the file handle in bytes. If there is not
+ * enough space, the field is updated to return the minimum
+ * value required.
+ */
+SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
+		struct file_handle __user *, handle, int __user *, mnt_id,
+		int, flag)
+{
+	struct path path;
+	int lookup_flags;
+	int err;
+
+	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	err = user_path_at(dfd, name, lookup_flags, &path);
+	if (!err) {
+		err = do_sys_name_to_handle(&path, handle, mnt_id);
+		path_put(&path);
+	}
+	return err;
+}
+
+static struct vfsmount *get_vfsmount_from_fd(int fd)
+{
+	struct vfsmount *mnt;
+
+	if (fd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
+		spin_lock(&fs->lock);
+		mnt = mntget(fs->pwd.mnt);
+		spin_unlock(&fs->lock);
+	} else {
+		struct fd f = fdget(fd);
+		if (!f.file)
+			return ERR_PTR(-EBADF);
+		mnt = mntget(f.file->f_path.mnt);
+		fdput(f);
+	}
+	return mnt;
+}
+
+static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
+{
+	return 1;
+}
+
+static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
+			     struct path *path)
+{
+	int retval = 0;
+	int handle_dwords;
+
+	path->mnt = get_vfsmount_from_fd(mountdirfd);
+	if (IS_ERR(path->mnt)) {
+		retval = PTR_ERR(path->mnt);
+		goto out_err;
+	}
+	/* change the handle size to multiple of sizeof(u32) */
+	handle_dwords = handle->handle_bytes >> 2;
+	path->dentry = exportfs_decode_fh(path->mnt,
+					  (struct fid *)handle->f_handle,
+					  handle_dwords, handle->handle_type,
+					  vfs_dentry_acceptable, NULL);
+	if (IS_ERR(path->dentry)) {
+		retval = PTR_ERR(path->dentry);
+		goto out_mnt;
+	}
+	return 0;
+out_mnt:
+	mntput(path->mnt);
+out_err:
+	return retval;
+}
+
+static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
+		   struct path *path)
+{
+	int retval = 0;
+	struct file_handle f_handle;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * With handle we don't look at the execute bit on the
+	 * the directory. Ideally we would like CAP_DAC_SEARCH.
+	 * But we don't have that
+	 */
+	if (!capable(CAP_DAC_READ_SEARCH)) {
+		retval = -EPERM;
+		goto out_err;
+	}
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
+		retval = -EFAULT;
+		goto out_err;
+	}
+	if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+	    (f_handle.handle_bytes == 0)) {
+		retval = -EINVAL;
+		goto out_err;
+	}
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle) {
+		retval = -ENOMEM;
+		goto out_err;
+	}
+	/* copy the full handle */
+	*handle = f_handle;
+	if (copy_from_user(&handle->f_handle,
+			   &ufh->f_handle,
+			   f_handle.handle_bytes)) {
+		retval = -EFAULT;
+		goto out_handle;
+	}
+
+	retval = do_handle_to_path(mountdirfd, handle, path);
+
+out_handle:
+	kfree(handle);
+out_err:
+	return retval;
+}
+
+long do_handle_open(int mountdirfd,
+		    struct file_handle __user *ufh, int open_flag)
+{
+	long retval = 0;
+	struct path path;
+	struct file *file;
+	int fd;
+
+	retval = handle_to_path(mountdirfd, ufh, &path);
+	if (retval)
+		return retval;
+
+	fd = get_unused_fd_flags(open_flag);
+	if (fd < 0) {
+		path_put(&path);
+		return fd;
+	}
+	file = file_open_root(path.dentry, path.mnt, "", open_flag);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		retval =  PTR_ERR(file);
+	} else {
+		retval = fd;
+		fsnotify_open(file);
+		fd_install(fd, file);
+	}
+	path_put(&path);
+	return retval;
+}
+
+/**
+ * sys_open_by_handle_at: Open the file handle
+ * @mountdirfd: directory file descriptor
+ * @handle: file handle to be opened
+ * @flag: open flags.
+ *
+ * @mountdirfd indicate the directory file descriptor
+ * of the mount point. file handle is decoded relative
+ * to the vfsmount pointed by the @mountdirfd. @flags
+ * value is same as the open(2) flags.
+ */
+SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+		struct file_handle __user *, handle,
+		int, flags)
+{
+	long ret;
+
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	ret = do_handle_open(mountdirfd, handle, flags);
+	return ret;
+}
diff --git a/kernel/fs/file.c b/kernel/fs/file.c
new file mode 100644
index 000000000..93c5f89c2
--- /dev/null
+++ b/kernel/fs/file.c
@@ -0,0 +1,914 @@
+/*
+ *  linux/fs/file.c
+ *
+ *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
+ *
+ *  Manage the dynamic fd arrays in the process files_struct.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+
+int sysctl_nr_open __read_mostly = 1024*1024;
+int sysctl_nr_open_min = BITS_PER_LONG;
+/* our max() is unusable in constant expressions ;-/ */
+#define __const_max(x, y) ((x) < (y) ? (x) : (y))
+int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
+			 -BITS_PER_LONG;
+
+static void *alloc_fdmem(size_t size)
+{
+	/*
+	 * Very large allocations can stress page reclaim, so fall back to
+	 * vmalloc() if the allocation size will be considered "large" by the VM.
+	 */
+	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
+		if (data != NULL)
+			return data;
+	}
+	return vmalloc(size);
+}
+
+static void __free_fdtable(struct fdtable *fdt)
+{
+	kvfree(fdt->fd);
+	kvfree(fdt->open_fds);
+	kfree(fdt);
+}
+
+static void free_fdtable_rcu(struct rcu_head *rcu)
+{
+	__free_fdtable(container_of(rcu, struct fdtable, rcu));
+}
+
+/*
+ * Expand the fdset in the files_struct.  Called with the files spinlock
+ * held for write.
+ */
+static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
+{
+	unsigned int cpy, set;
+
+	BUG_ON(nfdt->max_fds < ofdt->max_fds);
+
+	cpy = ofdt->max_fds * sizeof(struct file *);
+	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
+	memcpy(nfdt->fd, ofdt->fd, cpy);
+	memset((char *)(nfdt->fd) + cpy, 0, set);
+
+	cpy = ofdt->max_fds / BITS_PER_BYTE;
+	set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
+	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
+	memset((char *)(nfdt->open_fds) + cpy, 0, set);
+	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
+	memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
+}
+
+static struct fdtable * alloc_fdtable(unsigned int nr)
+{
+	struct fdtable *fdt;
+	void *data;
+
+	/*
+	 * Figure out how many fds we actually want to support in this fdtable.
+	 * Allocation steps are keyed to the size of the fdarray, since it
+	 * grows far faster than any of the other dynamic data. We try to fit
+	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
+	 * and growing in powers of two from there on.
+	 */
+	nr /= (1024 / sizeof(struct file *));
+	nr = roundup_pow_of_two(nr + 1);
+	nr *= (1024 / sizeof(struct file *));
+	/*
+	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
+	 * had been set lower between the check in expand_files() and here.  Deal
+	 * with that in caller, it's cheaper that way.
+	 *
+	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
+	 * bitmaps handling below becomes unpleasant, to put it mildly...
+	 */
+	if (unlikely(nr > sysctl_nr_open))
+		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
+
+	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+	if (!fdt)
+		goto out;
+	fdt->max_fds = nr;
+	data = alloc_fdmem(nr * sizeof(struct file *));
+	if (!data)
+		goto out_fdt;
+	fdt->fd = data;
+
+	data = alloc_fdmem(max_t(size_t,
+				 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
+	if (!data)
+		goto out_arr;
+	fdt->open_fds = data;
+	data += nr / BITS_PER_BYTE;
+	fdt->close_on_exec = data;
+
+	return fdt;
+
+out_arr:
+	kvfree(fdt->fd);
+out_fdt:
+	kfree(fdt);
+out:
+	return NULL;
+}
+
+/*
+ * Expand the file descriptor table.
+ * This function will allocate a new fdtable and both fd array and fdset, of
+ * the given size.
+ * Return <0 error code on error; 1 on successful completion.
+ * The files->file_lock should be held on entry, and will be held on exit.
+ */
+static int expand_fdtable(struct files_struct *files, int nr)
+	__releases(files->file_lock)
+	__acquires(files->file_lock)
+{
+	struct fdtable *new_fdt, *cur_fdt;
+
+	spin_unlock(&files->file_lock);
+	new_fdt = alloc_fdtable(nr);
+	spin_lock(&files->file_lock);
+	if (!new_fdt)
+		return -ENOMEM;
+	/*
+	 * extremely unlikely race - sysctl_nr_open decreased between the check in
+	 * caller and alloc_fdtable().  Cheaper to catch it here...
+	 */
+	if (unlikely(new_fdt->max_fds <= nr)) {
+		__free_fdtable(new_fdt);
+		return -EMFILE;
+	}
+	/*
+	 * Check again since another task may have expanded the fd table while
+	 * we dropped the lock
+	 */
+	cur_fdt = files_fdtable(files);
+	if (nr >= cur_fdt->max_fds) {
+		/* Continue as planned */
+		copy_fdtable(new_fdt, cur_fdt);
+		rcu_assign_pointer(files->fdt, new_fdt);
+		if (cur_fdt != &files->fdtab)
+			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+	} else {
+		/* Somebody else expanded, so undo our attempt */
+		__free_fdtable(new_fdt);
+	}
+	return 1;
+}
+
+/*
+ * Expand files.
+ * This function will expand the file structures, if the requested size exceeds
+ * the current capacity and there is room for expansion.
+ * Return <0 error code on error; 0 when nothing done; 1 when files were
+ * expanded and execution may have blocked.
+ * The files->file_lock should be held on entry, and will be held on exit.
+ */
+static int expand_files(struct files_struct *files, int nr)
+{
+	struct fdtable *fdt;
+
+	fdt = files_fdtable(files);
+
+	/* Do we need to expand? */
+	if (nr < fdt->max_fds)
+		return 0;
+
+	/* Can we expand? */
+	if (nr >= sysctl_nr_open)
+		return -EMFILE;
+
+	/* All good, so we try */
+	return expand_fdtable(files, nr);
+}
+
+static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
+{
+	__set_bit(fd, fdt->close_on_exec);
+}
+
+static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
+{
+	__clear_bit(fd, fdt->close_on_exec);
+}
+
+static inline void __set_open_fd(int fd, struct fdtable *fdt)
+{
+	__set_bit(fd, fdt->open_fds);
+}
+
+static inline void __clear_open_fd(int fd, struct fdtable *fdt)
+{
+	__clear_bit(fd, fdt->open_fds);
+}
+
+static int count_open_files(struct fdtable *fdt)
+{
+	int size = fdt->max_fds;
+	int i;
+
+	/* Find the last open fd */
+	for (i = size / BITS_PER_LONG; i > 0; ) {
+		if (fdt->open_fds[--i])
+			break;
+	}
+	i = (i + 1) * BITS_PER_LONG;
+	return i;
+}
+
+/*
+ * Allocate a new files structure and copy contents from the
+ * passed in files structure.
+ * errorp will be valid only when the returned files_struct is NULL.
+ */
+struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
+{
+	struct files_struct *newf;
+	struct file **old_fds, **new_fds;
+	int open_files, size, i;
+	struct fdtable *old_fdt, *new_fdt;
+
+	*errorp = -ENOMEM;
+	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
+	if (!newf)
+		goto out;
+
+	atomic_set(&newf->count, 1);
+
+	spin_lock_init(&newf->file_lock);
+	newf->next_fd = 0;
+	new_fdt = &newf->fdtab;
+	new_fdt->max_fds = NR_OPEN_DEFAULT;
+	new_fdt->close_on_exec = newf->close_on_exec_init;
+	new_fdt->open_fds = newf->open_fds_init;
+	new_fdt->fd = &newf->fd_array[0];
+
+	spin_lock(&oldf->file_lock);
+	old_fdt = files_fdtable(oldf);
+	open_files = count_open_files(old_fdt);
+
+	/*
+	 * Check whether we need to allocate a larger fd array and fd set.
+	 */
+	while (unlikely(open_files > new_fdt->max_fds)) {
+		spin_unlock(&oldf->file_lock);
+
+		if (new_fdt != &newf->fdtab)
+			__free_fdtable(new_fdt);
+
+		new_fdt = alloc_fdtable(open_files - 1);
+		if (!new_fdt) {
+			*errorp = -ENOMEM;
+			goto out_release;
+		}
+
+		/* beyond sysctl_nr_open; nothing to do */
+		if (unlikely(new_fdt->max_fds < open_files)) {
+			__free_fdtable(new_fdt);
+			*errorp = -EMFILE;
+			goto out_release;
+		}
+
+		/*
+		 * Reacquire the oldf lock and a pointer to its fd table
+		 * who knows it may have a new bigger fd table. We need
+		 * the latest pointer.
+		 */
+		spin_lock(&oldf->file_lock);
+		old_fdt = files_fdtable(oldf);
+		open_files = count_open_files(old_fdt);
+	}
+
+	old_fds = old_fdt->fd;
+	new_fds = new_fdt->fd;
+
+	memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
+	memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
+
+	for (i = open_files; i != 0; i--) {
+		struct file *f = *old_fds++;
+		if (f) {
+			get_file(f);
+		} else {
+			/*
+			 * The fd may be claimed in the fd bitmap but not yet
+			 * instantiated in the files array if a sibling thread
+			 * is partway through open().  So make sure that this
+			 * fd is available to the new process.
+			 */
+			__clear_open_fd(open_files - i, new_fdt);
+		}
+		rcu_assign_pointer(*new_fds++, f);
+	}
+	spin_unlock(&oldf->file_lock);
+
+	/* compute the remainder to be cleared */
+	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
+
+	/* This is long word aligned thus could use a optimized version */
+	memset(new_fds, 0, size);
+
+	if (new_fdt->max_fds > open_files) {
+		int left = (new_fdt->max_fds - open_files) / 8;
+		int start = open_files / BITS_PER_LONG;
+
+		memset(&new_fdt->open_fds[start], 0, left);
+		memset(&new_fdt->close_on_exec[start], 0, left);
+	}
+
+	rcu_assign_pointer(newf->fdt, new_fdt);
+
+	return newf;
+
+out_release:
+	kmem_cache_free(files_cachep, newf);
+out:
+	return NULL;
+}
+
+static struct fdtable *close_files(struct files_struct * files)
+{
+	/*
+	 * It is safe to dereference the fd table without RCU or
+	 * ->file_lock because this is the last reference to the
+	 * files structure.
+	 */
+	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+	int i, j = 0;
+
+	for (;;) {
+		unsigned long set;
+		i = j * BITS_PER_LONG;
+		if (i >= fdt->max_fds)
+			break;
+		set = fdt->open_fds[j++];
+		while (set) {
+			if (set & 1) {
+				struct file * file = xchg(&fdt->fd[i], NULL);
+				if (file) {
+					filp_close(file, files);
+					cond_resched_rcu_qs();
+				}
+			}
+			i++;
+			set >>= 1;
+		}
+	}
+
+	return fdt;
+}
+
+struct files_struct *get_files_struct(struct task_struct *task)
+{
+	struct files_struct *files;
+
+	task_lock(task);
+	files = task->files;
+	if (files)
+		atomic_inc(&files->count);
+	task_unlock(task);
+
+	return files;
+}
+
+void put_files_struct(struct files_struct *files)
+{
+	if (atomic_dec_and_test(&files->count)) {
+		struct fdtable *fdt = close_files(files);
+
+		/* free the arrays if they are not embedded */
+		if (fdt != &files->fdtab)
+			__free_fdtable(fdt);
+		kmem_cache_free(files_cachep, files);
+	}
+}
+
+void reset_files_struct(struct files_struct *files)
+{
+	struct task_struct *tsk = current;
+	struct files_struct *old;
+
+	old = tsk->files;
+	task_lock(tsk);
+	tsk->files = files;
+	task_unlock(tsk);
+	put_files_struct(old);
+}
+
+void exit_files(struct task_struct *tsk)
+{
+	struct files_struct * files = tsk->files;
+
+	if (files) {
+		task_lock(tsk);
+		tsk->files = NULL;
+		task_unlock(tsk);
+		put_files_struct(files);
+	}
+}
+
+struct files_struct init_files = {
+	.count		= ATOMIC_INIT(1),
+	.fdt		= &init_files.fdtab,
+	.fdtab		= {
+		.max_fds	= NR_OPEN_DEFAULT,
+		.fd		= &init_files.fd_array[0],
+		.close_on_exec	= init_files.close_on_exec_init,
+		.open_fds	= init_files.open_fds_init,
+	},
+	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+};
+
+/*
+ * allocate a file descriptor, mark it busy.
+ */
+int __alloc_fd(struct files_struct *files,
+	       unsigned start, unsigned end, unsigned flags)
+{
+	unsigned int fd;
+	int error;
+	struct fdtable *fdt;
+
+	spin_lock(&files->file_lock);
+repeat:
+	fdt = files_fdtable(files);
+	fd = start;
+	if (fd < files->next_fd)
+		fd = files->next_fd;
+
+	if (fd < fdt->max_fds)
+		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
+
+	/*
+	 * N.B. For clone tasks sharing a files structure, this test
+	 * will limit the total number of files that can be opened.
+	 */
+	error = -EMFILE;
+	if (fd >= end)
+		goto out;
+
+	error = expand_files(files, fd);
+	if (error < 0)
+		goto out;
+
+	/*
+	 * If we needed to expand the fs array we
+	 * might have blocked - try again.
+	 */
+	if (error)
+		goto repeat;
+
+	if (start <= files->next_fd)
+		files->next_fd = fd + 1;
+
+	__set_open_fd(fd, fdt);
+	if (flags & O_CLOEXEC)
+		__set_close_on_exec(fd, fdt);
+	else
+		__clear_close_on_exec(fd, fdt);
+	error = fd;
+#if 1
+	/* Sanity check */
+	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
+		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
+		rcu_assign_pointer(fdt->fd[fd], NULL);
+	}
+#endif
+
+out:
+	spin_unlock(&files->file_lock);
+	return error;
+}
+
+static int alloc_fd(unsigned start, unsigned flags)
+{
+	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
+}
+
+int get_unused_fd_flags(unsigned flags)
+{
+	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
+}
+EXPORT_SYMBOL(get_unused_fd_flags);
+
+static void __put_unused_fd(struct files_struct *files, unsigned int fd)
+{
+	struct fdtable *fdt = files_fdtable(files);
+	__clear_open_fd(fd, fdt);
+	if (fd < files->next_fd)
+		files->next_fd = fd;
+}
+
+void put_unused_fd(unsigned int fd)
+{
+	struct files_struct *files = current->files;
+	spin_lock(&files->file_lock);
+	__put_unused_fd(files, fd);
+	spin_unlock(&files->file_lock);
+}
+
+EXPORT_SYMBOL(put_unused_fd);
+
+/*
+ * Install a file pointer in the fd array.
+ *
+ * The VFS is full of places where we drop the files lock between
+ * setting the open_fds bitmap and installing the file in the file
+ * array.  At any such point, we are vulnerable to a dup2() race
+ * installing a file in the array before us.  We need to detect this and
+ * fput() the struct file we are about to overwrite in this case.
+ *
+ * It should never happen - if we allow dup2() do it, _really_ bad things
+ * will follow.
+ *
+ * NOTE: __fd_install() variant is really, really low-level; don't
+ * use it unless you are forced to by truly lousy API shoved down
+ * your throat.  'files' *MUST* be either current->files or obtained
+ * by get_files_struct(current) done by whoever had given it to you,
+ * or really bad things will happen.  Normally you want to use
+ * fd_install() instead.
+ */
+
+void __fd_install(struct files_struct *files, unsigned int fd,
+		struct file *file)
+{
+	struct fdtable *fdt;
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	BUG_ON(fdt->fd[fd] != NULL);
+	rcu_assign_pointer(fdt->fd[fd], file);
+	spin_unlock(&files->file_lock);
+}
+
+void fd_install(unsigned int fd, struct file *file)
+{
+	__fd_install(current->files, fd, file);
+}
+
+EXPORT_SYMBOL(fd_install);
+
+/*
+ * The same warnings as for __alloc_fd()/__fd_install() apply here...
+ */
+int __close_fd(struct files_struct *files, unsigned fd)
+{
+	struct file *file;
+	struct fdtable *fdt;
+
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	if (fd >= fdt->max_fds)
+		goto out_unlock;
+	file = fdt->fd[fd];
+	if (!file)
+		goto out_unlock;
+	rcu_assign_pointer(fdt->fd[fd], NULL);
+	__clear_close_on_exec(fd, fdt);
+	__put_unused_fd(files, fd);
+	spin_unlock(&files->file_lock);
+	return filp_close(file, files);
+
+out_unlock:
+	spin_unlock(&files->file_lock);
+	return -EBADF;
+}
+
+void do_close_on_exec(struct files_struct *files)
+{
+	unsigned i;
+	struct fdtable *fdt;
+
+	/* exec unshares first */
+	spin_lock(&files->file_lock);
+	for (i = 0; ; i++) {
+		unsigned long set;
+		unsigned fd = i * BITS_PER_LONG;
+		fdt = files_fdtable(files);
+		if (fd >= fdt->max_fds)
+			break;
+		set = fdt->close_on_exec[i];
+		if (!set)
+			continue;
+		fdt->close_on_exec[i] = 0;
+		for ( ; set ; fd++, set >>= 1) {
+			struct file *file;
+			if (!(set & 1))
+				continue;
+			file = fdt->fd[fd];
+			if (!file)
+				continue;
+			rcu_assign_pointer(fdt->fd[fd], NULL);
+			__put_unused_fd(files, fd);
+			spin_unlock(&files->file_lock);
+			filp_close(file, files);
+			cond_resched();
+			spin_lock(&files->file_lock);
+		}
+
+	}
+	spin_unlock(&files->file_lock);
+}
+
+static struct file *__fget(unsigned int fd, fmode_t mask)
+{
+	struct files_struct *files = current->files;
+	struct file *file;
+
+	rcu_read_lock();
+	file = fcheck_files(files, fd);
+	if (file) {
+		/* File object ref couldn't be taken */
+		if ((file->f_mode & mask) || !get_file_rcu(file))
+			file = NULL;
+	}
+	rcu_read_unlock();
+
+	return file;
+}
+
+struct file *fget(unsigned int fd)
+{
+	return __fget(fd, FMODE_PATH);
+}
+EXPORT_SYMBOL(fget);
+
+struct file *fget_raw(unsigned int fd)
+{
+	return __fget(fd, 0);
+}
+EXPORT_SYMBOL(fget_raw);
+
+/*
+ * Lightweight file lookup - no refcnt increment if fd table isn't shared.
+ *
+ * You can use this instead of fget if you satisfy all of the following
+ * conditions:
+ * 1) You must call fput_light before exiting the syscall and returning control
+ *    to userspace (i.e. you cannot remember the returned struct file * after
+ *    returning to userspace).
+ * 2) You must not call filp_close on the returned struct file * in between
+ *    calls to fget_light and fput_light.
+ * 3) You must not clone the current task in between the calls to fget_light
+ *    and fput_light.
+ *
+ * The fput_needed flag returned by fget_light should be passed to the
+ * corresponding fput_light.
+ */
+static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+{
+	struct files_struct *files = current->files;
+	struct file *file;
+
+	if (atomic_read(&files->count) == 1) {
+		file = __fcheck_files(files, fd);
+		if (!file || unlikely(file->f_mode & mask))
+			return 0;
+		return (unsigned long)file;
+	} else {
+		file = __fget(fd, mask);
+		if (!file)
+			return 0;
+		return FDPUT_FPUT | (unsigned long)file;
+	}
+}
+unsigned long __fdget(unsigned int fd)
+{
+	return __fget_light(fd, FMODE_PATH);
+}
+EXPORT_SYMBOL(__fdget);
+
+unsigned long __fdget_raw(unsigned int fd)
+{
+	return __fget_light(fd, 0);
+}
+
+unsigned long __fdget_pos(unsigned int fd)
+{
+	unsigned long v = __fdget(fd);
+	struct file *file = (struct file *)(v & ~3);
+
+	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+		if (file_count(file) > 1) {
+			v |= FDPUT_POS_UNLOCK;
+			mutex_lock(&file->f_pos_lock);
+		}
+	}
+	return v;
+}
+
+/*
+ * We only lock f_pos if we have threads or if the file might be
+ * shared with another process. In both cases we'll have an elevated
+ * file count (done either by fdget() or by fork()).
+ */
+
+void set_close_on_exec(unsigned int fd, int flag)
+{
+	struct files_struct *files = current->files;
+	struct fdtable *fdt;
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	if (flag)
+		__set_close_on_exec(fd, fdt);
+	else
+		__clear_close_on_exec(fd, fdt);
+	spin_unlock(&files->file_lock);
+}
+
+bool get_close_on_exec(unsigned int fd)
+{
+	struct files_struct *files = current->files;
+	struct fdtable *fdt;
+	bool res;
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	res = close_on_exec(fd, fdt);
+	rcu_read_unlock();
+	return res;
+}
+
+static int do_dup2(struct files_struct *files,
+	struct file *file, unsigned fd, unsigned flags)
+__releases(&files->file_lock)
+{
+	struct file *tofree;
+	struct fdtable *fdt;
+
+	/*
+	 * We need to detect attempts to do dup2() over allocated but still
+	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
+	 * extra work in their equivalent of fget() - they insert struct
+	 * file immediately after grabbing descriptor, mark it larval if
+	 * more work (e.g. actual opening) is needed and make sure that
+	 * fget() treats larval files as absent.  Potentially interesting,
+	 * but while extra work in fget() is trivial, locking implications
+	 * and amount of surgery on open()-related paths in VFS are not.
+	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
+	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
+	 * scope of POSIX or SUS, since neither considers shared descriptor
+	 * tables and this condition does not arise without those.
+	 */
+	fdt = files_fdtable(files);
+	tofree = fdt->fd[fd];
+	if (!tofree && fd_is_open(fd, fdt))
+		goto Ebusy;
+	get_file(file);
+	rcu_assign_pointer(fdt->fd[fd], file);
+	__set_open_fd(fd, fdt);
+	if (flags & O_CLOEXEC)
+		__set_close_on_exec(fd, fdt);
+	else
+		__clear_close_on_exec(fd, fdt);
+	spin_unlock(&files->file_lock);
+
+	if (tofree)
+		filp_close(tofree, files);
+
+	return fd;
+
+Ebusy:
+	spin_unlock(&files->file_lock);
+	return -EBUSY;
+}
+
+int replace_fd(unsigned fd, struct file *file, unsigned flags)
+{
+	int err;
+	struct files_struct *files = current->files;
+
+	if (!file)
+		return __close_fd(files, fd);
+
+	if (fd >= rlimit(RLIMIT_NOFILE))
+		return -EBADF;
+
+	spin_lock(&files->file_lock);
+	err = expand_files(files, fd);
+	if (unlikely(err < 0))
+		goto out_unlock;
+	return do_dup2(files, file, fd, flags);
+
+out_unlock:
+	spin_unlock(&files->file_lock);
+	return err;
+}
+
+SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
+{
+	int err = -EBADF;
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	if ((flags & ~O_CLOEXEC) != 0)
+		return -EINVAL;
+
+	if (unlikely(oldfd == newfd))
+		return -EINVAL;
+
+	if (newfd >= rlimit(RLIMIT_NOFILE))
+		return -EBADF;
+
+	spin_lock(&files->file_lock);
+	err = expand_files(files, newfd);
+	file = fcheck(oldfd);
+	if (unlikely(!file))
+		goto Ebadf;
+	if (unlikely(err < 0)) {
+		if (err == -EMFILE)
+			goto Ebadf;
+		goto out_unlock;
+	}
+	return do_dup2(files, file, newfd, flags);
+
+Ebadf:
+	err = -EBADF;
+out_unlock:
+	spin_unlock(&files->file_lock);
+	return err;
+}
+
+SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
+{
+	if (unlikely(newfd == oldfd)) { /* corner case */
+		struct files_struct *files = current->files;
+		int retval = oldfd;
+
+		rcu_read_lock();
+		if (!fcheck_files(files, oldfd))
+			retval = -EBADF;
+		rcu_read_unlock();
+		return retval;
+	}
+	return sys_dup3(oldfd, newfd, 0);
+}
+
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
+{
+	int ret = -EBADF;
+	struct file *file = fget_raw(fildes);
+
+	if (file) {
+		ret = get_unused_fd_flags(0);
+		if (ret >= 0)
+			fd_install(ret, file);
+		else
+			fput(file);
+	}
+	return ret;
+}
+
+int f_dupfd(unsigned int from, struct file *file, unsigned flags)
+{
+	int err;
+	if (from >= rlimit(RLIMIT_NOFILE))
+		return -EINVAL;
+	err = alloc_fd(from, flags);
+	if (err >= 0) {
+		get_file(file);
+		fd_install(err, file);
+	}
+	return err;
+}
+
+int iterate_fd(struct files_struct *files, unsigned n,
+		int (*f)(const void *, struct file *, unsigned),
+		const void *p)
+{
+	struct fdtable *fdt;
+	int res = 0;
+	if (!files)
+		return 0;
+	spin_lock(&files->file_lock);
+	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
+		struct file *file;
+		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
+		if (!file)
+			continue;
+		res = f(p, file, n);
+		if (res)
+			break;
+	}
+	spin_unlock(&files->file_lock);
+	return res;
+}
+EXPORT_SYMBOL(iterate_fd);
diff --git a/kernel/fs/file_table.c b/kernel/fs/file_table.c
new file mode 100644
index 000000000..294174dcc
--- /dev/null
+++ b/kernel/fs/file_table.c
@@ -0,0 +1,327 @@
+/*
+ *  linux/fs/file_table.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/security.h>
+#include <linux/eventpoll.h>
+#include <linux/rcupdate.h>
+#include <linux/mount.h>
+#include <linux/capability.h>
+#include <linux/cdev.h>
+#include <linux/fsnotify.h>
+#include <linux/sysctl.h>
+#include <linux/lglock.h>
+#include <linux/percpu_counter.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/task_work.h>
+#include <linux/ima.h>
+
+#include <linux/atomic.h>
+
+#include "internal.h"
+
+/* sysctl tunables... */
+struct files_stat_struct files_stat = {
+	.max_files = NR_FILE
+};
+
+/* SLAB cache for file structures */
+static struct kmem_cache *filp_cachep __read_mostly;
+
+static struct percpu_counter nr_files __cacheline_aligned_in_smp;
+
+static void file_free_rcu(struct rcu_head *head)
+{
+	struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
+
+	put_cred(f->f_cred);
+	kmem_cache_free(filp_cachep, f);
+}
+
+static inline void file_free(struct file *f)
+{
+	percpu_counter_dec(&nr_files);
+	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+}
+
+/*
+ * Return the total number of open files in the system
+ */
+static long get_nr_files(void)
+{
+	return percpu_counter_read_positive(&nr_files);
+}
+
+/*
+ * Return the maximum number of open files in the system
+ */
+unsigned long get_max_files(void)
+{
+	return files_stat.max_files;
+}
+EXPORT_SYMBOL_GPL(get_max_files);
+
+/*
+ * Handle nr_files sysctl
+ */
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_files(struct ctl_table *table, int write,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	files_stat.nr_files = get_nr_files();
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+#else
+int proc_nr_files(struct ctl_table *table, int write,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+#endif
+
+/* Find an unused file structure and return a pointer to it.
+ * Returns an error pointer if some error happend e.g. we over file
+ * structures limit, run out of memory or operation is not permitted.
+ *
+ * Be very careful using this.  You are responsible for
+ * getting write access to any mount that you might assign
+ * to this filp, if it is opened for write.  If this is not
+ * done, you will imbalance int the mount's writer count
+ * and a warning at __fput() time.
+ */
+struct file *get_empty_filp(void)
+{
+	const struct cred *cred = current_cred();
+	static long old_max;
+	struct file *f;
+	int error;
+
+	/*
+	 * Privileged users can go above max_files
+	 */
+	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+		/*
+		 * percpu_counters are inaccurate.  Do an expensive check before
+		 * we go and fail.
+		 */
+		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
+			goto over;
+	}
+
+	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+	if (unlikely(!f))
+		return ERR_PTR(-ENOMEM);
+
+	percpu_counter_inc(&nr_files);
+	f->f_cred = get_cred(cred);
+	error = security_file_alloc(f);
+	if (unlikely(error)) {
+		file_free(f);
+		return ERR_PTR(error);
+	}
+
+	atomic_long_set(&f->f_count, 1);
+	rwlock_init(&f->f_owner.lock);
+	spin_lock_init(&f->f_lock);
+	mutex_init(&f->f_pos_lock);
+	eventpoll_init_file(f);
+	/* f->f_version: 0 */
+	return f;
+
+over:
+	/* Ran out of filps - report that */
+	if (get_nr_files() > old_max) {
+		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
+		old_max = get_nr_files();
+	}
+	return ERR_PTR(-ENFILE);
+}
+
+/**
+ * alloc_file - allocate and initialize a 'struct file'
+ *
+ * @path: the (dentry, vfsmount) pair for the new file
+ * @mode: the mode with which the new file will be opened
+ * @fop: the 'struct file_operations' for the new file
+ */
+struct file *alloc_file(struct path *path, fmode_t mode,
+		const struct file_operations *fop)
+{
+	struct file *file;
+
+	file = get_empty_filp();
+	if (IS_ERR(file))
+		return file;
+
+	file->f_path = *path;
+	file->f_inode = path->dentry->d_inode;
+	file->f_mapping = path->dentry->d_inode->i_mapping;
+	if ((mode & FMODE_READ) &&
+	     likely(fop->read || fop->read_iter))
+		mode |= FMODE_CAN_READ;
+	if ((mode & FMODE_WRITE) &&
+	     likely(fop->write || fop->write_iter))
+		mode |= FMODE_CAN_WRITE;
+	file->f_mode = mode;
+	file->f_op = fop;
+	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		i_readcount_inc(path->dentry->d_inode);
+	return file;
+}
+EXPORT_SYMBOL(alloc_file);
+
+/* the real guts of fput() - releasing the last reference to file
+ */
+static void __fput(struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct vfsmount *mnt = file->f_path.mnt;
+	struct inode *inode = file->f_inode;
+
+	might_sleep();
+
+	fsnotify_close(file);
+	/*
+	 * The function eventpoll_release() should be the first called
+	 * in the file cleanup chain.
+	 */
+	eventpoll_release(file);
+	locks_remove_file(file);
+
+	if (unlikely(file->f_flags & FASYNC)) {
+		if (file->f_op->fasync)
+			file->f_op->fasync(-1, file, 0);
+	}
+	ima_file_free(file);
+	if (file->f_op->release)
+		file->f_op->release(inode, file);
+	security_file_free(file);
+	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
+		     !(file->f_mode & FMODE_PATH))) {
+		cdev_put(inode->i_cdev);
+	}
+	fops_put(file->f_op);
+	put_pid(file->f_owner.pid);
+	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		i_readcount_dec(inode);
+	if (file->f_mode & FMODE_WRITER) {
+		put_write_access(inode);
+		__mnt_drop_write(mnt);
+	}
+	file->f_path.dentry = NULL;
+	file->f_path.mnt = NULL;
+	file->f_inode = NULL;
+	file_free(file);
+	dput(dentry);
+	mntput(mnt);
+}
+
+static LLIST_HEAD(delayed_fput_list);
+static void delayed_fput(struct work_struct *unused)
+{
+	struct llist_node *node = llist_del_all(&delayed_fput_list);
+	struct llist_node *next;
+
+	for (; node; node = next) {
+		next = llist_next(node);
+		__fput(llist_entry(node, struct file, f_u.fu_llist));
+	}
+}
+
+static void ____fput(struct callback_head *work)
+{
+	__fput(container_of(work, struct file, f_u.fu_rcuhead));
+}
+
+/*
+ * If kernel thread really needs to have the final fput() it has done
+ * to complete, call this.  The only user right now is the boot - we
+ * *do* need to make sure our writes to binaries on initramfs has
+ * not left us with opened struct file waiting for __fput() - execve()
+ * won't work without that.  Please, don't add more callers without
+ * very good reasons; in particular, never call that with locks
+ * held and never call that from a thread that might need to do
+ * some work on any kind of umount.
+ */
+void flush_delayed_fput(void)
+{
+	delayed_fput(NULL);
+}
+
+static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
+
+void fput(struct file *file)
+{
+	if (atomic_long_dec_and_test(&file->f_count)) {
+		struct task_struct *task = current;
+
+		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+			init_task_work(&file->f_u.fu_rcuhead, ____fput);
+			if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
+				return;
+			/*
+			 * After this task has run exit_task_work(),
+			 * task_work_add() will fail.  Fall through to delayed
+			 * fput to avoid leaking *file.
+			 */
+		}
+
+		if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
+			schedule_delayed_work(&delayed_fput_work, 1);
+	}
+}
+
+/*
+ * synchronous analog of fput(); for kernel threads that might be needed
+ * in some umount() (and thus can't use flush_delayed_fput() without
+ * risking deadlocks), need to wait for completion of __fput() and know
+ * for this specific struct file it won't involve anything that would
+ * need them.  Use only if you really need it - at the very least,
+ * don't blindly convert fput() by kernel thread to that.
+ */
+void __fput_sync(struct file *file)
+{
+	if (atomic_long_dec_and_test(&file->f_count)) {
+		struct task_struct *task = current;
+		BUG_ON(!(task->flags & PF_KTHREAD));
+		__fput(file);
+	}
+}
+
+EXPORT_SYMBOL(fput);
+
+void put_filp(struct file *file)
+{
+	if (atomic_long_dec_and_test(&file->f_count)) {
+		security_file_free(file);
+		file_free(file);
+	}
+}
+
+void __init files_init(unsigned long mempages)
+{ 
+	unsigned long n;
+
+	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+	/*
+	 * One file with associated inode and dcache is very roughly 1K.
+	 * Per default don't use more than 10% of our memory for files. 
+	 */ 
+
+	n = (mempages * (PAGE_SIZE / 1024)) / 10;
+	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
+	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+} 
diff --git a/kernel/fs/filesystems.c b/kernel/fs/filesystems.c
new file mode 100644
index 000000000..5797d45a7
--- /dev/null
+++ b/kernel/fs/filesystems.c
@@ -0,0 +1,288 @@
+/*
+ *  linux/fs/filesystems.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  table of configured filesystems
+ */
+
+#include <linux/syscalls.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kmod.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+/*
+ * Handling of filesystem drivers list.
+ * Rules:
+ *	Inclusion to/removals from/scanning of list are protected by spinlock.
+ *	During the unload module must call unregister_filesystem().
+ *	We can access the fields of list element if:
+ *		1) spinlock is held or
+ *		2) we hold the reference to the module.
+ *	The latter can be guaranteed by call of try_module_get(); if it
+ *	returned 0 we must skip the element, otherwise we got the reference.
+ *	Once the reference is obtained we can drop the spinlock.
+ */
+
+static struct file_system_type *file_systems;
+static DEFINE_RWLOCK(file_systems_lock);
+
+/* WARNING: This can be used only if we _already_ own a reference */
+void get_filesystem(struct file_system_type *fs)
+{
+	__module_get(fs->owner);
+}
+
+void put_filesystem(struct file_system_type *fs)
+{
+	module_put(fs->owner);
+}
+
+static struct file_system_type **find_filesystem(const char *name, unsigned len)
+{
+	struct file_system_type **p;
+	for (p=&file_systems; *p; p=&(*p)->next)
+		if (strlen((*p)->name) == len &&
+		    strncmp((*p)->name, name, len) == 0)
+			break;
+	return p;
+}
+
+/**
+ *	register_filesystem - register a new filesystem
+ *	@fs: the file system structure
+ *
+ *	Adds the file system passed to the list of file systems the kernel
+ *	is aware of for mount and other syscalls. Returns 0 on success,
+ *	or a negative errno code on an error.
+ *
+ *	The &struct file_system_type that is passed is linked into the kernel 
+ *	structures and must not be freed until the file system has been
+ *	unregistered.
+ */
+ 
+int register_filesystem(struct file_system_type * fs)
+{
+	int res = 0;
+	struct file_system_type ** p;
+
+	BUG_ON(strchr(fs->name, '.'));
+	if (fs->next)
+		return -EBUSY;
+	write_lock(&file_systems_lock);
+	p = find_filesystem(fs->name, strlen(fs->name));
+	if (*p)
+		res = -EBUSY;
+	else
+		*p = fs;
+	write_unlock(&file_systems_lock);
+	return res;
+}
+
+EXPORT_SYMBOL(register_filesystem);
+
+/**
+ *	unregister_filesystem - unregister a file system
+ *	@fs: filesystem to unregister
+ *
+ *	Remove a file system that was previously successfully registered
+ *	with the kernel. An error is returned if the file system is not found.
+ *	Zero is returned on a success.
+ *	
+ *	Once this function has returned the &struct file_system_type structure
+ *	may be freed or reused.
+ */
+ 
+int unregister_filesystem(struct file_system_type * fs)
+{
+	struct file_system_type ** tmp;
+
+	write_lock(&file_systems_lock);
+	tmp = &file_systems;
+	while (*tmp) {
+		if (fs == *tmp) {
+			*tmp = fs->next;
+			fs->next = NULL;
+			write_unlock(&file_systems_lock);
+			synchronize_rcu();
+			return 0;
+		}
+		tmp = &(*tmp)->next;
+	}
+	write_unlock(&file_systems_lock);
+
+	return -EINVAL;
+}
+
+EXPORT_SYMBOL(unregister_filesystem);
+
+#ifdef CONFIG_SYSFS_SYSCALL
+static int fs_index(const char __user * __name)
+{
+	struct file_system_type * tmp;
+	struct filename *name;
+	int err, index;
+
+	name = getname(__name);
+	err = PTR_ERR(name);
+	if (IS_ERR(name))
+		return err;
+
+	err = -EINVAL;
+	read_lock(&file_systems_lock);
+	for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
+		if (strcmp(tmp->name, name->name) == 0) {
+			err = index;
+			break;
+		}
+	}
+	read_unlock(&file_systems_lock);
+	putname(name);
+	return err;
+}
+
+static int fs_name(unsigned int index, char __user * buf)
+{
+	struct file_system_type * tmp;
+	int len, res;
+
+	read_lock(&file_systems_lock);
+	for (tmp = file_systems; tmp; tmp = tmp->next, index--)
+		if (index <= 0 && try_module_get(tmp->owner))
+			break;
+	read_unlock(&file_systems_lock);
+	if (!tmp)
+		return -EINVAL;
+
+	/* OK, we got the reference, so we can safely block */
+	len = strlen(tmp->name) + 1;
+	res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
+	put_filesystem(tmp);
+	return res;
+}
+
+static int fs_maxindex(void)
+{
+	struct file_system_type * tmp;
+	int index;
+
+	read_lock(&file_systems_lock);
+	for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
+		;
+	read_unlock(&file_systems_lock);
+	return index;
+}
+
+/*
+ * Whee.. Weird sysv syscall. 
+ */
+SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
+{
+	int retval = -EINVAL;
+
+	switch (option) {
+		case 1:
+			retval = fs_index((const char __user *) arg1);
+			break;
+
+		case 2:
+			retval = fs_name(arg1, (char __user *) arg2);
+			break;
+
+		case 3:
+			retval = fs_maxindex();
+			break;
+	}
+	return retval;
+}
+#endif
+
+int __init get_filesystem_list(char *buf)
+{
+	int len = 0;
+	struct file_system_type * tmp;
+
+	read_lock(&file_systems_lock);
+	tmp = file_systems;
+	while (tmp && len < PAGE_SIZE - 80) {
+		len += sprintf(buf+len, "%s\t%s\n",
+			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+			tmp->name);
+		tmp = tmp->next;
+	}
+	read_unlock(&file_systems_lock);
+	return len;
+}
+
+#ifdef CONFIG_PROC_FS
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+	struct file_system_type * tmp;
+
+	read_lock(&file_systems_lock);
+	tmp = file_systems;
+	while (tmp) {
+		seq_printf(m, "%s\t%s\n",
+			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+			tmp->name);
+		tmp = tmp->next;
+	}
+	read_unlock(&file_systems_lock);
+	return 0;
+}
+
+static int filesystems_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, filesystems_proc_show, NULL);
+}
+
+static const struct file_operations filesystems_proc_fops = {
+	.open		= filesystems_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_filesystems_init(void)
+{
+	proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+	return 0;
+}
+module_init(proc_filesystems_init);
+#endif
+
+static struct file_system_type *__get_fs_type(const char *name, int len)
+{
+	struct file_system_type *fs;
+
+	read_lock(&file_systems_lock);
+	fs = *(find_filesystem(name, len));
+	if (fs && !try_module_get(fs->owner))
+		fs = NULL;
+	read_unlock(&file_systems_lock);
+	return fs;
+}
+
+struct file_system_type *get_fs_type(const char *name)
+{
+	struct file_system_type *fs;
+	const char *dot = strchr(name, '.');
+	int len = dot ? dot - name : strlen(name);
+
+	fs = __get_fs_type(name, len);
+	if (!fs && (request_module("fs-%.*s", len, name) == 0))
+		fs = __get_fs_type(name, len);
+
+	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
+		put_filesystem(fs);
+		fs = NULL;
+	}
+	return fs;
+}
+
+EXPORT_SYMBOL(get_fs_type);
diff --git a/kernel/fs/freevxfs/Kconfig b/kernel/fs/freevxfs/Kconfig
new file mode 100644
index 000000000..8dc1cd5c1
--- /dev/null
+++ b/kernel/fs/freevxfs/Kconfig
@@ -0,0 +1,16 @@
+config VXFS_FS
+	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
+	depends on BLOCK
+	help
+	  FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
+	  file system format.  VERITAS VxFS(TM) is the standard file system
+	  of SCO UnixWare (and possibly others) and optionally available
+	  for Sunsoft Solaris, HP-UX and many other operating systems.
+	  Currently only readonly access is supported.
+
+	  NOTE: the file system type as used by mount(1), mount(2) and
+	  fstab(5) is 'vxfs' as it describes the file system format, not
+	  the actual driver.
+
+	  To compile this as a module, choose M here: the module will be
+	  called freevxfs.  If unsure, say N.
diff --git a/kernel/fs/freevxfs/Makefile b/kernel/fs/freevxfs/Makefile
new file mode 100644
index 000000000..87ad09744
--- /dev/null
+++ b/kernel/fs/freevxfs/Makefile
@@ -0,0 +1,8 @@
+#
+# VxFS Makefile
+#
+
+obj-$(CONFIG_VXFS_FS) += freevxfs.o
+
+freevxfs-objs := vxfs_bmap.o vxfs_fshead.o vxfs_immed.o vxfs_inode.o \
+		 vxfs_lookup.o vxfs_olt.o vxfs_subr.o vxfs_super.o
diff --git a/kernel/fs/freevxfs/vxfs.h b/kernel/fs/freevxfs/vxfs.h
new file mode 100644
index 000000000..c8a926526
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs.h
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#ifndef _VXFS_SUPER_H_
+#define _VXFS_SUPER_H_
+
+/*
+ * Veritas filesystem driver - superblock structure.
+ *
+ * This file contains the definition of the disk and core
+ * superblocks of the Veritas Filesystem.
+ */
+#include <linux/types.h>
+
+
+/*
+ * Data types for use with the VxFS ondisk format.
+ */
+typedef	int32_t		vx_daddr_t;
+typedef int32_t		vx_ino_t;
+
+/*
+ * Superblock magic number (vxfs_super->vs_magic).
+ */
+#define VXFS_SUPER_MAGIC	0xa501FCF5
+
+/*
+ * The root inode.
+ */
+#define VXFS_ROOT_INO		2
+
+/*
+ * Num of entries in free extent array
+ */
+#define VXFS_NEFREE		32
+
+
+/*
+ * VxFS superblock (disk).
+ */
+struct vxfs_sb {
+	/*
+	 * Readonly fields for the version 1 superblock.
+	 *
+	 * Lots of this fields are no more used by version 2
+	 * and never filesystems.
+	 */
+	u_int32_t	vs_magic;		/* Magic number */
+	int32_t		vs_version;		/* VxFS version */
+	u_int32_t	vs_ctime;		/* create time - secs */
+	u_int32_t	vs_cutime;		/* create time - usecs */
+	int32_t		__unused1;		/* unused */
+	int32_t		__unused2;		/* unused */
+	vx_daddr_t	vs_old_logstart;	/* obsolete */
+	vx_daddr_t	vs_old_logend;		/* obsolete */
+	int32_t		vs_bsize;		/* block size */
+	int32_t		vs_size;		/* number of blocks */
+	int32_t		vs_dsize;		/* number of data blocks */
+	u_int32_t	vs_old_ninode;		/* obsolete */
+	int32_t		vs_old_nau;		/* obsolete */
+	int32_t		__unused3;		/* unused */
+	int32_t		vs_old_defiextsize;	/* obsolete */
+	int32_t		vs_old_ilbsize;		/* obsolete */
+	int32_t		vs_immedlen;		/* size of immediate data area */
+	int32_t		vs_ndaddr;		/* number of direct extentes */
+	vx_daddr_t	vs_firstau;		/* address of first AU */
+	vx_daddr_t	vs_emap;		/* offset of extent map in AU */
+	vx_daddr_t	vs_imap;		/* offset of inode map in AU */
+	vx_daddr_t	vs_iextop;		/* offset of ExtOp. map in AU */
+	vx_daddr_t	vs_istart;		/* offset of inode list in AU */
+	vx_daddr_t	vs_bstart;		/* offset of fdblock in AU */
+	vx_daddr_t	vs_femap;		/* aufirst + emap */
+	vx_daddr_t	vs_fimap;		/* aufirst + imap */
+	vx_daddr_t	vs_fiextop;		/* aufirst + iextop */
+	vx_daddr_t	vs_fistart;		/* aufirst + istart */
+	vx_daddr_t	vs_fbstart;		/* aufirst + bstart */
+	int32_t		vs_nindir;		/* number of entries in indir */
+	int32_t		vs_aulen;		/* length of AU in blocks */
+	int32_t		vs_auimlen;		/* length of imap in blocks */
+	int32_t		vs_auemlen;		/* length of emap in blocks */
+	int32_t		vs_auilen;		/* length of ilist in blocks */
+	int32_t		vs_aupad;		/* length of pad in blocks */
+	int32_t		vs_aublocks;		/* data blocks in AU */
+	int32_t		vs_maxtier;		/* log base 2 of aublocks */
+	int32_t		vs_inopb;		/* number of inodes per blk */
+	int32_t		vs_old_inopau;		/* obsolete */
+	int32_t		vs_old_inopilb;		/* obsolete */
+	int32_t		vs_old_ndiripau;	/* obsolete */
+	int32_t		vs_iaddrlen;		/* size of indirect addr ext. */
+	int32_t		vs_bshift;		/* log base 2 of bsize */
+	int32_t		vs_inoshift;		/* log base 2 of inobp */
+	int32_t		vs_bmask;		/* ~( bsize - 1 ) */
+	int32_t		vs_boffmask;		/* bsize - 1 */
+	int32_t		vs_old_inomask;		/* old_inopilb - 1 */
+	int32_t		vs_checksum;		/* checksum of V1 data */
+	
+	/*
+	 * Version 1, writable
+	 */
+	int32_t		vs_free;		/* number of free blocks */
+	int32_t		vs_ifree;		/* number of free inodes */
+	int32_t		vs_efree[VXFS_NEFREE];	/* number of free extents by size */
+	int32_t		vs_flags;		/* flags ?!? */
+	u_int8_t	vs_mod;			/* filesystem has been changed */
+	u_int8_t	vs_clean;		/* clean FS */
+	u_int16_t	__unused4;		/* unused */
+	u_int32_t	vs_firstlogid;		/* mount time log ID */
+	u_int32_t	vs_wtime;		/* last time written - sec */
+	u_int32_t	vs_wutime;		/* last time written - usec */
+	u_int8_t	vs_fname[6];		/* FS name */
+	u_int8_t	vs_fpack[6];		/* FS pack name */
+	int32_t		vs_logversion;		/* log format version */
+	int32_t		__unused5;		/* unused */
+	
+	/*
+	 * Version 2, Read-only
+	 */
+	vx_daddr_t	vs_oltext[2];		/* OLT extent and replica */
+	int32_t		vs_oltsize;		/* OLT extent size */
+	int32_t		vs_iauimlen;		/* size of inode map */
+	int32_t		vs_iausize;		/* size of IAU in blocks */
+	int32_t		vs_dinosize;		/* size of inode in bytes */
+	int32_t		vs_old_dniaddr;		/* indir levels per inode */
+	int32_t		vs_checksum2;		/* checksum of V2 RO */
+
+	/*
+	 * Actually much more...
+	 */
+};
+
+
+/*
+ * In core superblock filesystem private data for VxFS.
+ */
+struct vxfs_sb_info {
+	struct vxfs_sb		*vsi_raw;	/* raw (on disk) superblock */
+	struct buffer_head	*vsi_bp;	/* buffer for raw superblock*/
+	struct inode		*vsi_fship;	/* fileset header inode */
+	struct inode		*vsi_ilist;	/* inode list inode */
+	struct inode		*vsi_stilist;	/* structural inode list inode */
+	u_long			vsi_iext;	/* initial inode list */
+	ino_t			vsi_fshino;	/* fileset header inode */
+	daddr_t			vsi_oltext;	/* OLT extent */
+	daddr_t			vsi_oltsize;	/* OLT size */
+};
+
+
+/*
+ * File modes.  File types above 0xf000 are vxfs internal only, they should
+ * not be passed back to higher levels of the system.  vxfs file types must
+ * never have one of the regular file type bits set.
+ */
+enum vxfs_mode {
+	VXFS_ISUID = 0x00000800,	/* setuid */
+	VXFS_ISGID = 0x00000400,	/* setgid */
+	VXFS_ISVTX = 0x00000200,	/* sticky bit */
+	VXFS_IREAD = 0x00000100,	/* read */
+	VXFS_IWRITE = 0x00000080,	/* write */
+	VXFS_IEXEC = 0x00000040,	/* exec */
+
+	VXFS_IFIFO = 0x00001000,	/* Named pipe */
+	VXFS_IFCHR = 0x00002000,	/* Character device */
+	VXFS_IFDIR = 0x00004000,	/* Directory */
+	VXFS_IFNAM = 0x00005000,	/* Xenix device ?? */
+	VXFS_IFBLK = 0x00006000,	/* Block device */
+	VXFS_IFREG = 0x00008000,	/* Regular file */
+	VXFS_IFCMP = 0x00009000,	/* Compressed file ?!? */
+	VXFS_IFLNK = 0x0000a000,	/* Symlink */
+	VXFS_IFSOC = 0x0000c000,	/* Socket */
+
+	/* VxFS internal */
+	VXFS_IFFSH = 0x10000000,	/* Fileset header */
+	VXFS_IFILT = 0x20000000,	/* Inode list */
+	VXFS_IFIAU = 0x30000000,	/* Inode allocation unit */
+	VXFS_IFCUT = 0x40000000,	/* Current usage table */
+	VXFS_IFATT = 0x50000000,	/* Attr. inode */
+	VXFS_IFLCT = 0x60000000,	/* Link count table */
+	VXFS_IFIAT = 0x70000000,	/* Indirect attribute file */
+	VXFS_IFEMR = 0x80000000,	/* Extent map reorg file */
+	VXFS_IFQUO = 0x90000000,	/* BSD quota file */
+	VXFS_IFPTI = 0xa0000000,	/* "Pass through" inode */
+	VXFS_IFLAB = 0x11000000,	/* Device label file */
+	VXFS_IFOLT = 0x12000000,	/* OLT file */
+	VXFS_IFLOG = 0x13000000,	/* Log file */
+	VXFS_IFEMP = 0x14000000,	/* Extent map file */
+	VXFS_IFEAU = 0x15000000,	/* Extent AU file */
+	VXFS_IFAUS = 0x16000000,	/* Extent AU summary file */
+	VXFS_IFDEV = 0x17000000,	/* Device config file */
+
+};
+
+#define	VXFS_TYPE_MASK		0xfffff000
+
+#define VXFS_IS_TYPE(ip,type)	(((ip)->vii_mode & VXFS_TYPE_MASK) == (type))
+#define VXFS_ISFIFO(x)		VXFS_IS_TYPE((x),VXFS_IFIFO)
+#define VXFS_ISCHR(x)		VXFS_IS_TYPE((x),VXFS_IFCHR)
+#define VXFS_ISDIR(x)		VXFS_IS_TYPE((x),VXFS_IFDIR)
+#define VXFS_ISNAM(x)		VXFS_IS_TYPE((x),VXFS_IFNAM)
+#define VXFS_ISBLK(x)		VXFS_IS_TYPE((x),VXFS_IFBLK)
+#define VXFS_ISLNK(x)		VXFS_IS_TYPE((x),VXFS_IFLNK)
+#define VXFS_ISREG(x)		VXFS_IS_TYPE((x),VXFS_IFREG)
+#define VXFS_ISCMP(x)		VXFS_IS_TYPE((x),VXFS_IFCMP)
+#define VXFS_ISSOC(x)		VXFS_IS_TYPE((x),VXFS_IFSOC)
+
+#define VXFS_ISFSH(x)		VXFS_IS_TYPE((x),VXFS_IFFSH)
+#define VXFS_ISILT(x)		VXFS_IS_TYPE((x),VXFS_IFILT)
+
+/*
+ * Inmode organisation types.
+ */
+enum {
+	VXFS_ORG_NONE	= 0,	/* Inode has *no* format ?!? */
+	VXFS_ORG_EXT4	= 1,	/* Ext4 */
+	VXFS_ORG_IMMED	= 2,	/* All data stored in inode */
+	VXFS_ORG_TYPED	= 3,	/* Typed extents */
+};
+
+#define VXFS_IS_ORG(ip,org)	((ip)->vii_orgtype == (org))
+#define VXFS_ISNONE(ip)		VXFS_IS_ORG((ip), VXFS_ORG_NONE)
+#define VXFS_ISEXT4(ip)		VXFS_IS_ORG((ip), VXFS_ORG_EXT4)
+#define VXFS_ISIMMED(ip)	VXFS_IS_ORG((ip), VXFS_ORG_IMMED)
+#define VXFS_ISTYPED(ip)	VXFS_IS_ORG((ip), VXFS_ORG_TYPED)
+
+
+/*
+ * Get filesystem private data from VFS inode.
+ */
+#define VXFS_INO(ip) \
+	((struct vxfs_inode_info *)(ip)->i_private)
+
+/*
+ * Get filesystem private data from VFS superblock.
+ */
+#define VXFS_SBI(sbp) \
+	((struct vxfs_sb_info *)(sbp)->s_fs_info)
+
+#endif /* _VXFS_SUPER_H_ */
diff --git a/kernel/fs/freevxfs/vxfs_bmap.c b/kernel/fs/freevxfs/vxfs_bmap.c
new file mode 100644
index 000000000..f86fd3cac
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_bmap.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Veritas filesystem driver - filesystem to disk block mapping.
+ */
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/kernel.h>
+
+#include "vxfs.h"
+#include "vxfs_inode.h"
+#include "vxfs_extern.h"
+
+
+#ifdef DIAGNOSTIC
+static void
+vxfs_typdump(struct vxfs_typed *typ)
+{
+	printk(KERN_DEBUG "type=%Lu ", typ->vt_hdr >> VXFS_TYPED_TYPESHIFT);
+	printk("offset=%Lx ", typ->vt_hdr & VXFS_TYPED_OFFSETMASK);
+	printk("block=%x ", typ->vt_block);
+	printk("size=%x\n", typ->vt_size);
+}
+#endif
+
+/**
+ * vxfs_bmap_ext4 - do bmap for ext4 extents
+ * @ip:		pointer to the inode we do bmap for
+ * @iblock:	logical block.
+ *
+ * Description:
+ *   vxfs_bmap_ext4 performs the bmap operation for inodes with
+ *   ext4-style extents (which are much like the traditional UNIX
+ *   inode organisation).
+ *
+ * Returns:
+ *   The physical block number on success, else Zero.
+ */
+static daddr_t
+vxfs_bmap_ext4(struct inode *ip, long bn)
+{
+	struct super_block *sb = ip->i_sb;
+	struct vxfs_inode_info *vip = VXFS_INO(ip);
+	unsigned long bsize = sb->s_blocksize;
+	u32 indsize = vip->vii_ext4.ve4_indsize;
+	int i;
+
+	if (indsize > sb->s_blocksize)
+		goto fail_size;
+
+	for (i = 0; i < VXFS_NDADDR; i++) {
+		struct direct *d = vip->vii_ext4.ve4_direct + i;
+		if (bn >= 0 && bn < d->size)
+			return (bn + d->extent);
+		bn -= d->size;
+	}
+
+	if ((bn / (indsize * indsize * bsize / 4)) == 0) {
+		struct buffer_head *buf;
+		daddr_t	bno;
+		u32 *indir;
+
+		buf = sb_bread(sb, vip->vii_ext4.ve4_indir[0]);
+		if (!buf || !buffer_mapped(buf))
+			goto fail_buf;
+
+		indir = (u32 *)buf->b_data;
+		bno = indir[(bn/indsize) % (indsize*bn)] + (bn%indsize);
+
+		brelse(buf);
+		return bno;
+	} else
+		printk(KERN_WARNING "no matching indir?");
+
+	return 0;
+
+fail_size:
+	printk("vxfs: indirect extent too big!\n");
+fail_buf:
+	return 0;
+}
+
+/**
+ * vxfs_bmap_indir - recursion for vxfs_bmap_typed
+ * @ip:		pointer to the inode we do bmap for
+ * @indir:	indirect block we start reading at
+ * @size:	size of the typed area to search
+ * @block:	partially result from further searches
+ *
+ * Description:
+ *   vxfs_bmap_indir reads a &struct vxfs_typed at @indir
+ *   and performs the type-defined action.
+ *
+ * Return Value:
+ *   The physical block number on success, else Zero.
+ *
+ * Note:
+ *   Kernelstack is rare.  Unrecurse?
+ */
+static daddr_t
+vxfs_bmap_indir(struct inode *ip, long indir, int size, long block)
+{
+	struct buffer_head		*bp = NULL;
+	daddr_t				pblock = 0;
+	int				i;
+
+	for (i = 0; i < size * VXFS_TYPED_PER_BLOCK(ip->i_sb); i++) {
+		struct vxfs_typed	*typ;
+		int64_t			off;
+
+		bp = sb_bread(ip->i_sb,
+				indir + (i / VXFS_TYPED_PER_BLOCK(ip->i_sb)));
+		if (!bp || !buffer_mapped(bp))
+			return 0;
+
+		typ = ((struct vxfs_typed *)bp->b_data) +
+			(i % VXFS_TYPED_PER_BLOCK(ip->i_sb));
+		off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK);
+
+		if (block < off) {
+			brelse(bp);
+			continue;
+		}
+
+		switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) {
+		case VXFS_TYPED_INDIRECT:
+			pblock = vxfs_bmap_indir(ip, typ->vt_block,
+					typ->vt_size, block - off);
+			if (pblock == -2)
+				break;
+			goto out;
+		case VXFS_TYPED_DATA:
+			if ((block - off) >= typ->vt_size)
+				break;
+			pblock = (typ->vt_block + block - off);
+			goto out;
+		case VXFS_TYPED_INDIRECT_DEV4:
+		case VXFS_TYPED_DATA_DEV4: {
+			struct vxfs_typed_dev4	*typ4 =
+				(struct vxfs_typed_dev4 *)typ;
+
+			printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n");
+			printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n",
+			       (unsigned long long) typ4->vd4_block,
+			       (unsigned long long) typ4->vd4_size,
+			       typ4->vd4_dev);
+			goto fail;
+		}
+		default:
+			BUG();
+		}
+		brelse(bp);
+	}
+
+fail:
+	pblock = 0;
+out:
+	brelse(bp);
+	return (pblock);
+}
+
+/**
+ * vxfs_bmap_typed - bmap for typed extents
+ * @ip:		pointer to the inode we do bmap for
+ * @iblock:	logical block
+ *
+ * Description:
+ *   Performs the bmap operation for typed extents.
+ *
+ * Return Value:
+ *   The physical block number on success, else Zero.
+ */
+static daddr_t
+vxfs_bmap_typed(struct inode *ip, long iblock)
+{
+	struct vxfs_inode_info		*vip = VXFS_INO(ip);
+	daddr_t				pblock = 0;
+	int				i;
+
+	for (i = 0; i < VXFS_NTYPED; i++) {
+		struct vxfs_typed	*typ = vip->vii_org.typed + i;
+		int64_t			off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK);
+
+#ifdef DIAGNOSTIC
+		vxfs_typdump(typ);
+#endif
+		if (iblock < off)
+			continue;
+		switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) {
+		case VXFS_TYPED_INDIRECT:
+			pblock = vxfs_bmap_indir(ip, typ->vt_block,
+					typ->vt_size, iblock - off);
+			if (pblock == -2)
+				break;
+			return (pblock);
+		case VXFS_TYPED_DATA:
+			if ((iblock - off) < typ->vt_size)
+				return (typ->vt_block + iblock - off);
+			break;
+		case VXFS_TYPED_INDIRECT_DEV4:
+		case VXFS_TYPED_DATA_DEV4: {
+			struct vxfs_typed_dev4	*typ4 =
+				(struct vxfs_typed_dev4 *)typ;
+
+			printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n");
+			printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n",
+			       (unsigned long long) typ4->vd4_block,
+			       (unsigned long long) typ4->vd4_size,
+			       typ4->vd4_dev);
+			return 0;
+		}
+		default:
+			BUG();
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * vxfs_bmap1 - vxfs-internal bmap operation
+ * @ip:			pointer to the inode we do bmap for
+ * @iblock:		logical block
+ *
+ * Description:
+ *   vxfs_bmap1 perfoms a logical to physical block mapping
+ *   for vxfs-internal purposes.
+ *
+ * Return Value:
+ *   The physical block number on success, else Zero.
+ */
+daddr_t
+vxfs_bmap1(struct inode *ip, long iblock)
+{
+	struct vxfs_inode_info		*vip = VXFS_INO(ip);
+
+	if (VXFS_ISEXT4(vip))
+		return vxfs_bmap_ext4(ip, iblock);
+	if (VXFS_ISTYPED(vip))
+		return vxfs_bmap_typed(ip, iblock);
+	if (VXFS_ISNONE(vip))
+		goto unsupp;
+	if (VXFS_ISIMMED(vip))
+		goto unsupp;
+
+	printk(KERN_WARNING "vxfs: inode %ld has no valid orgtype (%x)\n",
+			ip->i_ino, vip->vii_orgtype);
+	BUG();
+
+unsupp:
+	printk(KERN_WARNING "vxfs: inode %ld has an unsupported orgtype (%x)\n",
+			ip->i_ino, vip->vii_orgtype);
+	return 0;
+}
diff --git a/kernel/fs/freevxfs/vxfs_dir.h b/kernel/fs/freevxfs/vxfs_dir.h
new file mode 100644
index 000000000..aaf1fb098
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_dir.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#ifndef _VXFS_DIR_H_
+#define _VXFS_DIR_H_
+
+/*
+ * Veritas filesystem driver - directory structure.
+ *
+ * This file contains the definition of the vxfs directory format.
+ */
+
+
+/*
+ * VxFS directory block header.
+ *
+ * This entry is the head of every filesystem block in a directory.
+ * It is used for free space management and additionally includes
+ * a hash for speeding up directory search (lookup).
+ *
+ * The hash may be empty and in fact we do not use it all in the
+ * Linux driver for now.
+ */
+struct vxfs_dirblk {
+	u_int16_t	d_free;		/* free space in dirblock */
+	u_int16_t	d_nhash;	/* no of hash chains */
+	u_int16_t	d_hash[1];	/* hash chain */
+};
+
+/*
+ * VXFS_NAMELEN is the maximum length of the d_name field
+ *	of an VxFS directory entry.
+ */
+#define VXFS_NAMELEN	256
+
+/*
+ * VxFS directory entry.
+ */
+struct vxfs_direct {
+	vx_ino_t	d_ino;			/* inode number */
+	u_int16_t	d_reclen;		/* record length */
+	u_int16_t	d_namelen;		/* d_name length */
+	u_int16_t	d_hashnext;		/* next hash entry */
+	char		d_name[VXFS_NAMELEN];	/* name */
+};
+
+/*
+ * VXFS_DIRPAD defines the directory entry boundaries, is _must_ be
+ *	a multiple of four.
+ * VXFS_NAMEMIN is the length of a directory entry with a NULL d_name.
+ * VXFS_DIRROUND is an internal macros that rounds a length to a value
+ *	usable for directory sizes.
+ * VXFS_DIRLEN calculates the directory entry size for an entry with
+ *	a d_name with size len.
+ */
+#define VXFS_DIRPAD		4
+#define VXFS_NAMEMIN		offsetof(struct vxfs_direct, d_name)
+#define VXFS_DIRROUND(len)	((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1))
+#define VXFS_DIRLEN(len)	(VXFS_DIRROUND(VXFS_NAMEMIN + (len)))
+
+/*
+ * VXFS_DIRBLKOV is the overhead of a specific dirblock.
+ */
+#define VXFS_DIRBLKOV(dbp)	((sizeof(short) * dbp->d_nhash) + 4)
+
+#endif /* _VXFS_DIR_H_ */
diff --git a/kernel/fs/freevxfs/vxfs_extern.h b/kernel/fs/freevxfs/vxfs_extern.h
new file mode 100644
index 000000000..881aa3d21
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_extern.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#ifndef _VXFS_EXTERN_H_
+#define _VXFS_EXTERN_H_
+
+/*
+ * Veritas filesystem driver - external prototypes.
+ *
+ * This file contains prototypes for all vxfs functions used
+ * outside their respective source files.
+ */
+
+
+struct kmem_cache;
+struct super_block;
+struct vxfs_inode_info;
+struct inode;
+
+
+/* vxfs_bmap.c */
+extern daddr_t			vxfs_bmap1(struct inode *, long);
+
+/* vxfs_fshead.c */
+extern int			vxfs_read_fshead(struct super_block *);
+
+/* vxfs_immed.c */
+extern const struct inode_operations vxfs_immed_symlink_iops;
+
+/* vxfs_inode.c */
+extern const struct address_space_operations vxfs_immed_aops;
+extern struct kmem_cache	*vxfs_inode_cachep;
+extern void			vxfs_dumpi(struct vxfs_inode_info *, ino_t);
+extern struct inode *		vxfs_get_fake_inode(struct super_block *,
+					struct vxfs_inode_info *);
+extern void			vxfs_put_fake_inode(struct inode *);
+extern struct vxfs_inode_info *	vxfs_blkiget(struct super_block *, u_long, ino_t);
+extern struct vxfs_inode_info *	vxfs_stiget(struct super_block *, ino_t);
+extern struct inode *		vxfs_iget(struct super_block *, ino_t);
+extern void			vxfs_evict_inode(struct inode *);
+
+/* vxfs_lookup.c */
+extern const struct inode_operations	vxfs_dir_inode_ops;
+extern const struct file_operations	vxfs_dir_operations;
+
+/* vxfs_olt.c */
+extern int			vxfs_read_olt(struct super_block *, u_long);
+
+/* vxfs_subr.c */
+extern const struct address_space_operations vxfs_aops;
+extern struct page *		vxfs_get_page(struct address_space *, u_long);
+extern void			vxfs_put_page(struct page *);
+extern struct buffer_head *	vxfs_bread(struct inode *, int);
+
+#endif /* _VXFS_EXTERN_H_ */
diff --git a/kernel/fs/freevxfs/vxfs_fshead.c b/kernel/fs/freevxfs/vxfs_fshead.c
new file mode 100644
index 000000000..c9a6a94e5
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_fshead.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Veritas filesystem driver - fileset header routines.
+ */
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "vxfs.h"
+#include "vxfs_inode.h"
+#include "vxfs_extern.h"
+#include "vxfs_fshead.h"
+
+
+#ifdef DIAGNOSTIC
+static void
+vxfs_dumpfsh(struct vxfs_fsh *fhp)
+{
+	printk("\n\ndumping fileset header:\n");
+	printk("----------------------------\n");
+	printk("version: %u\n", fhp->fsh_version);
+	printk("fsindex: %u\n", fhp->fsh_fsindex);
+	printk("iauino: %u\tninodes:%u\n",
+			fhp->fsh_iauino, fhp->fsh_ninodes);
+	printk("maxinode: %u\tlctino: %u\n",
+			fhp->fsh_maxinode, fhp->fsh_lctino);
+	printk("nau: %u\n", fhp->fsh_nau);
+	printk("ilistino[0]: %u\tilistino[1]: %u\n",
+			fhp->fsh_ilistino[0], fhp->fsh_ilistino[1]);
+}
+#endif
+
+/**
+ * vxfs_getfsh - read fileset header into memory
+ * @ip:		the (fake) fileset header inode
+ * @which:	0 for the structural, 1 for the primary fsh.
+ *
+ * Description:
+ *   vxfs_getfsh reads either the structural or primary fileset header
+ *   described by @ip into memory.
+ *
+ * Returns:
+ *   The fileset header structure on success, else Zero.
+ */
+static struct vxfs_fsh *
+vxfs_getfsh(struct inode *ip, int which)
+{
+	struct buffer_head		*bp;
+
+	bp = vxfs_bread(ip, which);
+	if (bp) {
+		struct vxfs_fsh		*fhp;
+
+		if (!(fhp = kmalloc(sizeof(*fhp), GFP_KERNEL)))
+			goto out;
+		memcpy(fhp, bp->b_data, sizeof(*fhp));
+
+		put_bh(bp);
+		return (fhp);
+	}
+out:
+	brelse(bp);
+	return NULL;
+}
+
+/**
+ * vxfs_read_fshead - read the fileset headers
+ * @sbp:	superblock to which the fileset belongs
+ *
+ * Description:
+ *   vxfs_read_fshead will fill the inode and structural inode list in @sb.
+ *
+ * Returns:
+ *   Zero on success, else a negative error code (-EINVAL).
+ */
+int
+vxfs_read_fshead(struct super_block *sbp)
+{
+	struct vxfs_sb_info		*infp = VXFS_SBI(sbp);
+	struct vxfs_fsh			*pfp, *sfp;
+	struct vxfs_inode_info		*vip, *tip;
+
+	vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
+	if (!vip) {
+		printk(KERN_ERR "vxfs: unable to read fsh inode\n");
+		return -EINVAL;
+	}
+	if (!VXFS_ISFSH(vip)) {
+		printk(KERN_ERR "vxfs: fsh list inode is of wrong type (%x)\n",
+				vip->vii_mode & VXFS_TYPE_MASK); 
+		goto out_free_fship;
+	}
+
+
+#ifdef DIAGNOSTIC
+	printk("vxfs: fsh inode dump:\n");
+	vxfs_dumpi(vip, infp->vsi_fshino);
+#endif
+
+	infp->vsi_fship = vxfs_get_fake_inode(sbp, vip);
+	if (!infp->vsi_fship) {
+		printk(KERN_ERR "vxfs: unable to get fsh inode\n");
+		goto out_free_fship;
+	}
+
+	sfp = vxfs_getfsh(infp->vsi_fship, 0);
+	if (!sfp) {
+		printk(KERN_ERR "vxfs: unable to get structural fsh\n");
+		goto out_iput_fship;
+	} 
+
+#ifdef DIAGNOSTIC
+	vxfs_dumpfsh(sfp);
+#endif
+
+	pfp = vxfs_getfsh(infp->vsi_fship, 1);
+	if (!pfp) {
+		printk(KERN_ERR "vxfs: unable to get primary fsh\n");
+		goto out_free_sfp;
+	}
+
+#ifdef DIAGNOSTIC
+	vxfs_dumpfsh(pfp);
+#endif
+
+	tip = vxfs_blkiget(sbp, infp->vsi_iext, sfp->fsh_ilistino[0]);
+	if (!tip)
+		goto out_free_pfp;
+
+	infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip);
+	if (!infp->vsi_stilist) {
+		printk(KERN_ERR "vxfs: unable to get structural list inode\n");
+		kfree(tip);
+		goto out_free_pfp;
+	}
+	if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) {
+		printk(KERN_ERR "vxfs: structural list inode is of wrong type (%x)\n",
+				VXFS_INO(infp->vsi_stilist)->vii_mode & VXFS_TYPE_MASK); 
+		goto out_iput_stilist;
+	}
+
+	tip = vxfs_stiget(sbp, pfp->fsh_ilistino[0]);
+	if (!tip)
+		goto out_iput_stilist;
+	infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip);
+	if (!infp->vsi_ilist) {
+		printk(KERN_ERR "vxfs: unable to get inode list inode\n");
+		kfree(tip);
+		goto out_iput_stilist;
+	}
+	if (!VXFS_ISILT(VXFS_INO(infp->vsi_ilist))) {
+		printk(KERN_ERR "vxfs: inode list inode is of wrong type (%x)\n",
+				VXFS_INO(infp->vsi_ilist)->vii_mode & VXFS_TYPE_MASK);
+		goto out_iput_ilist;
+	}
+
+	return 0;
+
+ out_iput_ilist:
+ 	iput(infp->vsi_ilist);
+ out_iput_stilist:
+ 	iput(infp->vsi_stilist);
+ out_free_pfp:
+	kfree(pfp);
+ out_free_sfp:
+ 	kfree(sfp);
+ out_iput_fship:
+	iput(infp->vsi_fship);
+	return -EINVAL;
+ out_free_fship:
+ 	kfree(vip);
+	return -EINVAL;
+}
diff --git a/kernel/fs/freevxfs/vxfs_fshead.h b/kernel/fs/freevxfs/vxfs_fshead.h
new file mode 100644
index 000000000..ead0d640c
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_fshead.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#ifndef _VXFS_FSHEAD_H_
+#define _VXFS_FSHEAD_H_
+
+/*
+ * Veritas filesystem driver - fileset header structures.
+ *
+ * This file contains the physical structure of the VxFS
+ * fileset header.
+ */
+
+
+/*
+ * Fileset header 
+ */
+struct vxfs_fsh {
+	u_int32_t	fsh_version;		/* fileset header version */
+	u_int32_t	fsh_fsindex;		/* fileset index */
+	u_int32_t	fsh_time;		/* modification time - sec */
+	u_int32_t	fsh_utime;		/* modification time - usec */
+	u_int32_t	fsh_extop;		/* extop flags */
+	vx_ino_t	fsh_ninodes;		/* allocated inodes */
+	u_int32_t	fsh_nau;		/* number of IAUs */
+	u_int32_t	fsh_old_ilesize;	/* old size of ilist */
+	u_int32_t	fsh_dflags;		/* flags */
+	u_int32_t	fsh_quota;		/* quota limit */
+	vx_ino_t	fsh_maxinode;		/* maximum inode number */
+	vx_ino_t	fsh_iauino;		/* IAU inode */
+	vx_ino_t	fsh_ilistino[2];	/* ilist inodes */
+	vx_ino_t	fsh_lctino;		/* link count table inode */
+
+	/*
+	 * Slightly more fields follow, but they
+	 *  a) are not of any interest for us, and
+	 *  b) differ a lot in different vxfs versions/ports
+	 */
+};
+
+#endif /* _VXFS_FSHEAD_H_ */
diff --git a/kernel/fs/freevxfs/vxfs_immed.c b/kernel/fs/freevxfs/vxfs_immed.c
new file mode 100644
index 000000000..8b9229e2c
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_immed.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Veritas filesystem driver - support for 'immed' inodes.
+ */
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+
+#include "vxfs.h"
+#include "vxfs_extern.h"
+#include "vxfs_inode.h"
+
+
+static void *	vxfs_immed_follow_link(struct dentry *, struct nameidata *);
+
+static int	vxfs_immed_readpage(struct file *, struct page *);
+
+/*
+ * Inode operations for immed symlinks.
+ *
+ * Unliked all other operations we do not go through the pagecache,
+ * but do all work directly on the inode.
+ */
+const struct inode_operations vxfs_immed_symlink_iops = {
+	.readlink =		generic_readlink,
+	.follow_link =		vxfs_immed_follow_link,
+};
+
+/*
+ * Address space operations for immed files and directories.
+ */
+const struct address_space_operations vxfs_immed_aops = {
+	.readpage =		vxfs_immed_readpage,
+};
+
+/**
+ * vxfs_immed_follow_link - follow immed symlink
+ * @dp:		dentry for the link
+ * @np:		pathname lookup data for the current path walk
+ *
+ * Description:
+ *   vxfs_immed_follow_link restarts the pathname lookup with
+ *   the data obtained from @dp.
+ *
+ * Returns:
+ *   Zero on success, else a negative error code.
+ */
+static void *
+vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np)
+{
+	struct vxfs_inode_info		*vip = VXFS_INO(d_inode(dp));
+	nd_set_link(np, vip->vii_immed.vi_immed);
+	return NULL;
+}
+
+/**
+ * vxfs_immed_readpage - read part of an immed inode into pagecache
+ * @file:	file context (unused)
+ * @page:	page frame to fill in.
+ *
+ * Description:
+ *   vxfs_immed_readpage reads a part of the immed area of the
+ *   file that hosts @pp into the pagecache.
+ *
+ * Returns:
+ *   Zero on success, else a negative error code.
+ *
+ * Locking status:
+ *   @page is locked and will be unlocked.
+ */
+static int
+vxfs_immed_readpage(struct file *fp, struct page *pp)
+{
+	struct vxfs_inode_info	*vip = VXFS_INO(pp->mapping->host);
+	u_int64_t	offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT;
+	caddr_t		kaddr;
+
+	kaddr = kmap(pp);
+	memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE);
+	kunmap(pp);
+	
+	flush_dcache_page(pp);
+	SetPageUptodate(pp);
+        unlock_page(pp);
+
+	return 0;
+}
diff --git a/kernel/fs/freevxfs/vxfs_inode.c b/kernel/fs/freevxfs/vxfs_inode.c
new file mode 100644
index 000000000..363e3ae25
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_inode.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Veritas filesystem driver - inode routines.
+ */
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "vxfs.h"
+#include "vxfs_inode.h"
+#include "vxfs_extern.h"
+
+
+struct kmem_cache		*vxfs_inode_cachep;
+
+
+#ifdef DIAGNOSTIC
+/*
+ * Dump inode contents (partially).
+ */
+void
+vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino)
+{
+	printk(KERN_DEBUG "\n\n");
+	if (ino)
+		printk(KERN_DEBUG "dumping vxfs inode %ld\n", ino);
+	else
+		printk(KERN_DEBUG "dumping unknown vxfs inode\n");
+
+	printk(KERN_DEBUG "---------------------------\n");
+	printk(KERN_DEBUG "mode is %x\n", vip->vii_mode);
+	printk(KERN_DEBUG "nlink:%u, uid:%u, gid:%u\n",
+			vip->vii_nlink, vip->vii_uid, vip->vii_gid);
+	printk(KERN_DEBUG "size:%Lx, blocks:%u\n",
+			vip->vii_size, vip->vii_blocks);
+	printk(KERN_DEBUG "orgtype:%u\n", vip->vii_orgtype);
+}
+#endif
+
+
+/**
+ * vxfs_blkiget - find inode based on extent #
+ * @sbp:	superblock of the filesystem we search in
+ * @extent:	number of the extent to search
+ * @ino:	inode number to search
+ *
+ * Description:
+ *  vxfs_blkiget searches inode @ino in the filesystem described by
+ *  @sbp in the extent @extent.
+ *  Returns the matching VxFS inode on success, else a NULL pointer.
+ *
+ * NOTE:
+ *  While __vxfs_iget uses the pagecache vxfs_blkiget uses the
+ *  buffercache.  This function should not be used outside the
+ *  read_super() method, otherwise the data may be incoherent.
+ */
+struct vxfs_inode_info *
+vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino)
+{
+	struct buffer_head		*bp;
+	u_long				block, offset;
+
+	block = extent + ((ino * VXFS_ISIZE) / sbp->s_blocksize);
+	offset = ((ino % (sbp->s_blocksize / VXFS_ISIZE)) * VXFS_ISIZE);
+	bp = sb_bread(sbp, block);
+
+	if (bp && buffer_mapped(bp)) {
+		struct vxfs_inode_info	*vip;
+		struct vxfs_dinode	*dip;
+
+		if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL)))
+			goto fail;
+		dip = (struct vxfs_dinode *)(bp->b_data + offset);
+		memcpy(vip, dip, sizeof(*vip));
+#ifdef DIAGNOSTIC
+		vxfs_dumpi(vip, ino);
+#endif
+		brelse(bp);
+		return (vip);
+	}
+
+fail:
+	printk(KERN_WARNING "vxfs: unable to read block %ld\n", block);
+	brelse(bp);
+	return NULL;
+}
+
+/**
+ * __vxfs_iget - generic find inode facility
+ * @sbp:		VFS superblock
+ * @ino:		inode number
+ * @ilistp:		inode list
+ *
+ * Description:
+ *  Search the for inode number @ino in the filesystem
+ *  described by @sbp.  Use the specified inode table (@ilistp).
+ *  Returns the matching VxFS inode on success, else an error code.
+ */
+static struct vxfs_inode_info *
+__vxfs_iget(ino_t ino, struct inode *ilistp)
+{
+	struct page			*pp;
+	u_long				offset;
+
+	offset = (ino % (PAGE_SIZE / VXFS_ISIZE)) * VXFS_ISIZE;
+	pp = vxfs_get_page(ilistp->i_mapping, ino * VXFS_ISIZE / PAGE_SIZE);
+
+	if (!IS_ERR(pp)) {
+		struct vxfs_inode_info	*vip;
+		struct vxfs_dinode	*dip;
+		caddr_t			kaddr = (char *)page_address(pp);
+
+		if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL)))
+			goto fail;
+		dip = (struct vxfs_dinode *)(kaddr + offset);
+		memcpy(vip, dip, sizeof(*vip));
+#ifdef DIAGNOSTIC
+		vxfs_dumpi(vip, ino);
+#endif
+		vxfs_put_page(pp);
+		return (vip);
+	}
+
+	printk(KERN_WARNING "vxfs: error on page %p\n", pp);
+	return ERR_CAST(pp);
+
+fail:
+	printk(KERN_WARNING "vxfs: unable to read inode %ld\n", (unsigned long)ino);
+	vxfs_put_page(pp);
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * vxfs_stiget - find inode using the structural inode list
+ * @sbp:	VFS superblock
+ * @ino:	inode #
+ *
+ * Description:
+ *  Find inode @ino in the filesystem described by @sbp using
+ *  the structural inode list.
+ *  Returns the matching VxFS inode on success, else a NULL pointer.
+ */
+struct vxfs_inode_info *
+vxfs_stiget(struct super_block *sbp, ino_t ino)
+{
+	struct vxfs_inode_info *vip;
+
+	vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_stilist);
+	return IS_ERR(vip) ? NULL : vip;
+}
+
+/**
+ * vxfs_transmod - mode for a VxFS inode
+ * @vip:	VxFS inode
+ *
+ * Description:
+ *  vxfs_transmod returns a Linux mode_t for a given
+ *  VxFS inode structure.
+ */
+static __inline__ umode_t
+vxfs_transmod(struct vxfs_inode_info *vip)
+{
+	umode_t			ret = vip->vii_mode & ~VXFS_TYPE_MASK;
+
+	if (VXFS_ISFIFO(vip))
+		ret |= S_IFIFO;
+	if (VXFS_ISCHR(vip))
+		ret |= S_IFCHR;
+	if (VXFS_ISDIR(vip))
+		ret |= S_IFDIR;
+	if (VXFS_ISBLK(vip))
+		ret |= S_IFBLK;
+	if (VXFS_ISLNK(vip))
+		ret |= S_IFLNK;
+	if (VXFS_ISREG(vip))
+		ret |= S_IFREG;
+	if (VXFS_ISSOC(vip))
+		ret |= S_IFSOCK;
+
+	return (ret);
+}
+
+/**
+ * vxfs_iinit- helper to fill inode fields
+ * @ip:		VFS inode
+ * @vip:	VxFS inode
+ *
+ * Description:
+ *  vxfs_instino is a helper function to fill in all relevant
+ *  fields in @ip from @vip.
+ */
+static void
+vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
+{
+
+	ip->i_mode = vxfs_transmod(vip);
+	i_uid_write(ip, (uid_t)vip->vii_uid);
+	i_gid_write(ip, (gid_t)vip->vii_gid);
+
+	set_nlink(ip, vip->vii_nlink);
+	ip->i_size = vip->vii_size;
+
+	ip->i_atime.tv_sec = vip->vii_atime;
+	ip->i_ctime.tv_sec = vip->vii_ctime;
+	ip->i_mtime.tv_sec = vip->vii_mtime;
+	ip->i_atime.tv_nsec = 0;
+	ip->i_ctime.tv_nsec = 0;
+	ip->i_mtime.tv_nsec = 0;
+
+	ip->i_blocks = vip->vii_blocks;
+	ip->i_generation = vip->vii_gen;
+
+	ip->i_private = vip;
+	
+}
+
+/**
+ * vxfs_get_fake_inode - get fake inode structure
+ * @sbp:		filesystem superblock
+ * @vip:		fspriv inode
+ *
+ * Description:
+ *  vxfs_fake_inode gets a fake inode (not in the inode hash) for a
+ *  superblock, vxfs_inode pair.
+ *  Returns the filled VFS inode.
+ */
+struct inode *
+vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
+{
+	struct inode			*ip = NULL;
+
+	if ((ip = new_inode(sbp))) {
+		ip->i_ino = get_next_ino();
+		vxfs_iinit(ip, vip);
+		ip->i_mapping->a_ops = &vxfs_aops;
+	}
+	return (ip);
+}
+
+/**
+ * vxfs_put_fake_inode - free faked inode
+ * *ip:			VFS inode
+ *
+ * Description:
+ *  vxfs_put_fake_inode frees all data associated with @ip.
+ */
+void
+vxfs_put_fake_inode(struct inode *ip)
+{
+	iput(ip);
+}
+
+/**
+ * vxfs_iget - get an inode
+ * @sbp:	the superblock to get the inode for
+ * @ino:	the number of the inode to get
+ *
+ * Description:
+ *  vxfs_read_inode creates an inode, reads the disk inode for @ino and fills
+ *  in all relevant fields in the new inode.
+ */
+struct inode *
+vxfs_iget(struct super_block *sbp, ino_t ino)
+{
+	struct vxfs_inode_info		*vip;
+	const struct address_space_operations	*aops;
+	struct inode *ip;
+
+	ip = iget_locked(sbp, ino);
+	if (!ip)
+		return ERR_PTR(-ENOMEM);
+	if (!(ip->i_state & I_NEW))
+		return ip;
+
+	vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist);
+	if (IS_ERR(vip)) {
+		iget_failed(ip);
+		return ERR_CAST(vip);
+	}
+
+	vxfs_iinit(ip, vip);
+
+	if (VXFS_ISIMMED(vip))
+		aops = &vxfs_immed_aops;
+	else
+		aops = &vxfs_aops;
+
+	if (S_ISREG(ip->i_mode)) {
+		ip->i_fop = &generic_ro_fops;
+		ip->i_mapping->a_ops = aops;
+	} else if (S_ISDIR(ip->i_mode)) {
+		ip->i_op = &vxfs_dir_inode_ops;
+		ip->i_fop = &vxfs_dir_operations;
+		ip->i_mapping->a_ops = aops;
+	} else if (S_ISLNK(ip->i_mode)) {
+		if (!VXFS_ISIMMED(vip)) {
+			ip->i_op = &page_symlink_inode_operations;
+			ip->i_mapping->a_ops = &vxfs_aops;
+		} else {
+			ip->i_op = &vxfs_immed_symlink_iops;
+			vip->vii_immed.vi_immed[ip->i_size] = '\0';
+		}
+	} else
+		init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
+
+	unlock_new_inode(ip);
+	return ip;
+}
+
+static void vxfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(vxfs_inode_cachep, inode->i_private);
+}
+
+/**
+ * vxfs_evict_inode - remove inode from main memory
+ * @ip:		inode to discard.
+ *
+ * Description:
+ *  vxfs_evict_inode() is called on the final iput and frees the private
+ *  inode area.
+ */
+void
+vxfs_evict_inode(struct inode *ip)
+{
+	truncate_inode_pages_final(&ip->i_data);
+	clear_inode(ip);
+	call_rcu(&ip->i_rcu, vxfs_i_callback);
+}
diff --git a/kernel/fs/freevxfs/vxfs_inode.h b/kernel/fs/freevxfs/vxfs_inode.h
new file mode 100644
index 000000000..240aeb112
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_inode.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#ifndef _VXFS_INODE_H_
+#define _VXFS_INODE_H_
+
+/*
+ * Veritas filesystem driver - inode structure.
+ *
+ * This file contains the definition of the disk and core
+ * inodes of the Veritas Filesystem.
+ */
+
+
+#define VXFS_ISIZE		0x100		/* Inode size */
+
+#define VXFS_NDADDR		10		/* Number of direct addrs in inode */
+#define VXFS_NIADDR		2		/* Number of indirect addrs in inode */
+#define VXFS_NIMMED		96		/* Size of immediate data in inode */
+#define VXFS_NTYPED		6		/* Num of typed extents */
+
+#define VXFS_TYPED_OFFSETMASK	(0x00FFFFFFFFFFFFFFULL)
+#define VXFS_TYPED_TYPEMASK	(0xFF00000000000000ULL)
+#define VXFS_TYPED_TYPESHIFT	56
+
+#define VXFS_TYPED_PER_BLOCK(sbp) \
+	((sbp)->s_blocksize / sizeof(struct vxfs_typed))
+
+/*
+ * Possible extent descriptor types for %VXFS_ORG_TYPED extents.
+ */
+enum {
+	VXFS_TYPED_INDIRECT		= 1,
+	VXFS_TYPED_DATA			= 2,
+	VXFS_TYPED_INDIRECT_DEV4	= 3,
+	VXFS_TYPED_DATA_DEV4		= 4,
+};
+
+/*
+ * Data stored immediately in the inode.
+ */
+struct vxfs_immed {
+	u_int8_t	vi_immed[VXFS_NIMMED];
+};
+
+struct vxfs_ext4 {
+	u_int32_t		ve4_spare;		/* ?? */
+	u_int32_t		ve4_indsize;		/* Indirect extent size */
+	vx_daddr_t		ve4_indir[VXFS_NIADDR];	/* Indirect extents */
+	struct direct {					/* Direct extents */
+		vx_daddr_t	extent;			/* Extent number */
+		int32_t		size;			/* Size of extent */
+	} ve4_direct[VXFS_NDADDR];
+};
+
+struct vxfs_typed {
+	u_int64_t	vt_hdr;		/* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
+	vx_daddr_t	vt_block;	/* Extent block */
+	int32_t		vt_size;	/* Size in blocks */
+};
+
+struct vxfs_typed_dev4 {
+	u_int64_t	vd4_hdr;	/* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
+	u_int64_t	vd4_block;	/* Extent block */
+	u_int64_t	vd4_size;	/* Size in blocks */
+	int32_t		vd4_dev;	/* Device ID */
+	u_int32_t	__pad1;
+};
+
+/*
+ * The inode as contained on the physical device.
+ */
+struct vxfs_dinode {
+	int32_t		vdi_mode;
+	u_int32_t	vdi_nlink;	/* Link count */
+	u_int32_t	vdi_uid;	/* UID */
+	u_int32_t	vdi_gid;	/* GID */
+	u_int64_t	vdi_size;	/* Inode size in bytes */
+	u_int32_t	vdi_atime;	/* Last time accessed - sec */
+	u_int32_t	vdi_autime;	/* Last time accessed - usec */
+	u_int32_t	vdi_mtime;	/* Last modify time - sec */
+	u_int32_t	vdi_mutime;	/* Last modify time - usec */
+	u_int32_t	vdi_ctime;	/* Create time - sec */
+	u_int32_t	vdi_cutime;	/* Create time - usec */
+	u_int8_t	vdi_aflags;	/* Allocation flags */
+	u_int8_t	vdi_orgtype;	/* Organisation type */
+	u_int16_t	vdi_eopflags;
+	u_int32_t	vdi_eopdata;
+	union {
+		u_int32_t		rdev;
+		u_int32_t		dotdot;
+		struct {
+			u_int32_t	reserved;
+			u_int32_t	fixextsize;
+		} i_regular;
+		struct {
+			u_int32_t	matchino;
+			u_int32_t	fsetindex;
+		} i_vxspec;
+		u_int64_t		align;
+	} vdi_ftarea;
+	u_int32_t	vdi_blocks;	/* How much blocks does inode occupy */
+	u_int32_t	vdi_gen;	/* Inode generation */
+	u_int64_t	vdi_version;	/* Version */
+	union {
+		struct vxfs_immed	immed;
+		struct vxfs_ext4	ext4;
+		struct vxfs_typed	typed[VXFS_NTYPED];
+	} vdi_org;
+	u_int32_t	vdi_iattrino;
+};
+
+#define vdi_rdev	vdi_ftarea.rdev
+#define vdi_dotdot	vdi_ftarea.dotdot
+#define vdi_fixextsize	vdi_ftarea.regular.fixextsize
+#define vdi_matchino	vdi_ftarea.vxspec.matchino
+#define vdi_fsetindex	vdi_ftarea.vxspec.fsetindex
+
+#define vdi_immed	vdi_org.immed
+#define vdi_ext4	vdi_org.ext4
+#define vdi_typed	vdi_org.typed
+
+
+/*
+ * The inode as represented in the main memory.
+ *
+ * TBD: This should become a separate structure...
+ */
+#define vxfs_inode_info	vxfs_dinode
+
+#define vii_mode	vdi_mode
+#define vii_uid		vdi_uid
+#define vii_gid		vdi_gid
+#define vii_nlink	vdi_nlink
+#define vii_size	vdi_size
+#define vii_atime	vdi_atime
+#define vii_ctime	vdi_ctime
+#define vii_mtime	vdi_mtime
+#define vii_blocks	vdi_blocks
+#define vii_org		vdi_org
+#define vii_orgtype	vdi_orgtype
+#define vii_gen		vdi_gen
+
+#define vii_rdev	vdi_ftarea.rdev
+#define vii_dotdot	vdi_ftarea.dotdot
+#define vii_fixextsize	vdi_ftarea.regular.fixextsize
+#define vii_matchino	vdi_ftarea.vxspec.matchino
+#define vii_fsetindex	vdi_ftarea.vxspec.fsetindex
+
+#define vii_immed	vdi_org.immed
+#define vii_ext4	vdi_org.ext4
+#define vii_typed	vdi_org.typed
+
+#endif /* _VXFS_INODE_H_ */
diff --git a/kernel/fs/freevxfs/vxfs_lookup.c b/kernel/fs/freevxfs/vxfs_lookup.c
new file mode 100644
index 000000000..99c7f0a37
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_lookup.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Veritas filesystem driver - lookup and other directory related code.
+ */
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/pagemap.h>
+
+#include "vxfs.h"
+#include "vxfs_dir.h"
+#include "vxfs_inode.h"
+#include "vxfs_extern.h"
+
+/*
+ * Number of VxFS blocks per page.
+ */
+#define VXFS_BLOCK_PER_PAGE(sbp)  ((PAGE_CACHE_SIZE / (sbp)->s_blocksize))
+
+
+static struct dentry *	vxfs_lookup(struct inode *, struct dentry *, unsigned int);
+static int		vxfs_readdir(struct file *, struct dir_context *);
+
+const struct inode_operations vxfs_dir_inode_ops = {
+	.lookup =		vxfs_lookup,
+};
+
+const struct file_operations vxfs_dir_operations = {
+	.llseek =		generic_file_llseek,
+	.read =			generic_read_dir,
+	.iterate =		vxfs_readdir,
+};
+
+ 
+static inline u_long
+dir_pages(struct inode *inode)
+{
+	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+ 
+static inline u_long
+dir_blocks(struct inode *ip)
+{
+	u_long			bsize = ip->i_sb->s_blocksize;
+	return (ip->i_size + bsize - 1) & ~(bsize - 1);
+}
+
+/*
+ * NOTE! unlike strncmp, vxfs_match returns 1 for success, 0 for failure.
+ *
+ * len <= VXFS_NAMELEN and de != NULL are guaranteed by caller.
+ */
+static inline int
+vxfs_match(int len, const char * const name, struct vxfs_direct *de)
+{
+	if (len != de->d_namelen)
+		return 0;
+	if (!de->d_ino)
+		return 0;
+	return !memcmp(name, de->d_name, len);
+}
+
+static inline struct vxfs_direct *
+vxfs_next_entry(struct vxfs_direct *de)
+{
+	return ((struct vxfs_direct *)((char*)de + de->d_reclen));
+}
+
+/**
+ * vxfs_find_entry - find a mathing directory entry for a dentry
+ * @ip:		directory inode
+ * @dp:		dentry for which we want to find a direct
+ * @ppp:	gets filled with the page the return value sits in
+ *
+ * Description:
+ *   vxfs_find_entry finds a &struct vxfs_direct for the VFS directory
+ *   cache entry @dp.  @ppp will be filled with the page the return
+ *   value resides in.
+ *
+ * Returns:
+ *   The wanted direct on success, else a NULL pointer.
+ */
+static struct vxfs_direct *
+vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp)
+{
+	u_long				npages, page, nblocks, pblocks, block;
+	u_long				bsize = ip->i_sb->s_blocksize;
+	const char			*name = dp->d_name.name;
+	int				namelen = dp->d_name.len;
+
+	npages = dir_pages(ip);
+	nblocks = dir_blocks(ip);
+	pblocks = VXFS_BLOCK_PER_PAGE(ip->i_sb);
+	
+	for (page = 0; page < npages; page++) {
+		caddr_t			kaddr;
+		struct page		*pp;
+
+		pp = vxfs_get_page(ip->i_mapping, page);
+		if (IS_ERR(pp))
+			continue;
+		kaddr = (caddr_t)page_address(pp);
+
+		for (block = 0; block <= nblocks && block <= pblocks; block++) {
+			caddr_t			baddr, limit;
+			struct vxfs_dirblk	*dbp;
+			struct vxfs_direct	*de;
+
+			baddr = kaddr + (block * bsize);
+			limit = baddr + bsize - VXFS_DIRLEN(1);
+			
+			dbp = (struct vxfs_dirblk *)baddr;
+			de = (struct vxfs_direct *)(baddr + VXFS_DIRBLKOV(dbp));
+
+			for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) {
+				if (!de->d_reclen)
+					break;
+				if (!de->d_ino)
+					continue;
+				if (vxfs_match(namelen, name, de)) {
+					*ppp = pp;
+					return (de);
+				}
+			}
+		}
+		vxfs_put_page(pp);
+	}
+
+	return NULL;
+}
+
+/**
+ * vxfs_inode_by_name - find inode number for dentry
+ * @dip:	directory to search in
+ * @dp:		dentry we search for
+ *
+ * Description:
+ *   vxfs_inode_by_name finds out the inode number of
+ *   the path component described by @dp in @dip.
+ *
+ * Returns:
+ *   The wanted inode number on success, else Zero.
+ */
+static ino_t
+vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
+{
+	struct vxfs_direct		*de;
+	struct page			*pp;
+	ino_t				ino = 0;
+
+	de = vxfs_find_entry(dip, dp, &pp);
+	if (de) {
+		ino = de->d_ino;
+		kunmap(pp);
+		page_cache_release(pp);
+	}
+	
+	return (ino);
+}
+
+/**
+ * vxfs_lookup - lookup pathname component
+ * @dip:	dir in which we lookup
+ * @dp:		dentry we lookup
+ * @flags:	lookup flags
+ *
+ * Description:
+ *   vxfs_lookup tries to lookup the pathname component described
+ *   by @dp in @dip.
+ *
+ * Returns:
+ *   A NULL-pointer on success, else an negative error code encoded
+ *   in the return pointer.
+ */
+static struct dentry *
+vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
+{
+	struct inode		*ip = NULL;
+	ino_t			ino;
+			 
+	if (dp->d_name.len > VXFS_NAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+				 
+	ino = vxfs_inode_by_name(dip, dp);
+	if (ino) {
+		ip = vxfs_iget(dip->i_sb, ino);
+		if (IS_ERR(ip))
+			return ERR_CAST(ip);
+	}
+	d_add(dp, ip);
+	return NULL;
+}
+
+/**
+ * vxfs_readdir - read a directory
+ * @fp:		the directory to read
+ * @retp:	return buffer
+ * @filler:	filldir callback
+ *
+ * Description:
+ *   vxfs_readdir fills @retp with directory entries from @fp
+ *   using the VFS supplied callback @filler.
+ *
+ * Returns:
+ *   Zero.
+ */
+static int
+vxfs_readdir(struct file *fp, struct dir_context *ctx)
+{
+	struct inode		*ip = file_inode(fp);
+	struct super_block	*sbp = ip->i_sb;
+	u_long			bsize = sbp->s_blocksize;
+	u_long			page, npages, block, pblocks, nblocks, offset;
+	loff_t			pos;
+
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(fp, ctx))
+			return 0;
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
+			return 0;
+		ctx->pos = 2;
+	}
+	pos = ctx->pos - 2;
+	
+	if (pos > VXFS_DIRROUND(ip->i_size))
+		return 0;
+
+	npages = dir_pages(ip);
+	nblocks = dir_blocks(ip);
+	pblocks = VXFS_BLOCK_PER_PAGE(sbp);
+
+	page = pos >> PAGE_CACHE_SHIFT;
+	offset = pos & ~PAGE_CACHE_MASK;
+	block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
+
+	for (; page < npages; page++, block = 0) {
+		char			*kaddr;
+		struct page		*pp;
+
+		pp = vxfs_get_page(ip->i_mapping, page);
+		if (IS_ERR(pp))
+			continue;
+		kaddr = (char *)page_address(pp);
+
+		for (; block <= nblocks && block <= pblocks; block++) {
+			char			*baddr, *limit;
+			struct vxfs_dirblk	*dbp;
+			struct vxfs_direct	*de;
+
+			baddr = kaddr + (block * bsize);
+			limit = baddr + bsize - VXFS_DIRLEN(1);
+	
+			dbp = (struct vxfs_dirblk *)baddr;
+			de = (struct vxfs_direct *)
+				(offset ?
+				 (kaddr + offset) :
+				 (baddr + VXFS_DIRBLKOV(dbp)));
+
+			for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
+				if (!de->d_reclen)
+					break;
+				if (!de->d_ino)
+					continue;
+
+				offset = (char *)de - kaddr;
+				ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+				if (!dir_emit(ctx, de->d_name, de->d_namelen,
+					de->d_ino, DT_UNKNOWN)) {
+					vxfs_put_page(pp);
+					return 0;
+				}
+			}
+			offset = 0;
+		}
+		vxfs_put_page(pp);
+		offset = 0;
+	}
+	ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+	return 0;
+}
diff --git a/kernel/fs/freevxfs/vxfs_olt.c b/kernel/fs/freevxfs/vxfs_olt.c
new file mode 100644
index 000000000..049500847
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_olt.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* 
+ * Veritas filesystem driver - object location table support.
+ */
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/kernel.h>
+
+#include "vxfs.h"
+#include "vxfs_olt.h"
+#include "vxfs_extern.h"
+
+
+static inline void
+vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp)
+{
+	BUG_ON(infp->vsi_fshino);
+	infp->vsi_fshino = fshp->olt_fsino[0];
+}
+
+static inline void
+vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp)
+{
+	BUG_ON(infp->vsi_iext);
+	infp->vsi_iext = ilistp->olt_iext[0]; 
+}
+
+static inline u_long
+vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize)
+{
+	BUG_ON(sbp->s_blocksize % bsize);
+	return (block * (sbp->s_blocksize / bsize));
+}
+
+
+/**
+ * vxfs_read_olt - read olt
+ * @sbp:	superblock of the filesystem
+ * @bsize:	blocksize of the filesystem
+ *
+ * Description:
+ *   vxfs_read_olt reads the olt of the filesystem described by @sbp
+ *   into main memory and does some basic setup.
+ *
+ * Returns:
+ *   Zero on success, else a negative error code.
+ */
+int
+vxfs_read_olt(struct super_block *sbp, u_long bsize)
+{
+	struct vxfs_sb_info	*infp = VXFS_SBI(sbp);
+	struct buffer_head	*bp;
+	struct vxfs_olt		*op;
+	char			*oaddr, *eaddr;
+
+
+	bp = sb_bread(sbp, vxfs_oblock(sbp, infp->vsi_oltext, bsize));
+	if (!bp || !bp->b_data)
+		goto fail;
+
+	op = (struct vxfs_olt *)bp->b_data;
+	if (op->olt_magic != VXFS_OLT_MAGIC) {
+		printk(KERN_NOTICE "vxfs: ivalid olt magic number\n");
+		goto fail;
+	}
+
+	/*
+	 * It is in theory possible that vsi_oltsize is > 1.
+	 * I've not seen any such filesystem yet and I'm lazy..  --hch
+	 */
+	if (infp->vsi_oltsize > 1) {
+		printk(KERN_NOTICE "vxfs: oltsize > 1 detected.\n");
+		printk(KERN_NOTICE "vxfs: please notify hch@infradead.org\n");
+		goto fail;
+	}
+
+	oaddr = bp->b_data + op->olt_size;
+	eaddr = bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize);
+
+	while (oaddr < eaddr) {
+		struct vxfs_oltcommon	*ocp =
+			(struct vxfs_oltcommon *)oaddr;
+		
+		switch (ocp->olt_type) {
+		case VXFS_OLT_FSHEAD:
+			vxfs_get_fshead((struct vxfs_oltfshead *)oaddr, infp);
+			break;
+		case VXFS_OLT_ILIST:
+			vxfs_get_ilist((struct vxfs_oltilist *)oaddr, infp);
+			break;
+		}
+
+		oaddr += ocp->olt_size;
+	}
+
+	brelse(bp);
+	return 0;
+
+fail:
+	brelse(bp);
+	return -EINVAL;
+}
diff --git a/kernel/fs/freevxfs/vxfs_olt.h b/kernel/fs/freevxfs/vxfs_olt.h
new file mode 100644
index 000000000..b7b3af502
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_olt.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#ifndef _VXFS_OLT_H_
+#define _VXFS_OLT_H_
+
+/*
+ * Veritas filesystem driver - Object Location Table data structures.
+ *
+ * This file contains definitions for the Object Location Table used
+ * by the Veritas Filesystem version 2 and newer.
+ */
+
+
+/*
+ * OLT magic number (vxfs_olt->olt_magic).
+ */
+#define VXFS_OLT_MAGIC		0xa504FCF5
+
+/*
+ * VxFS OLT entry types.
+ */
+enum {
+	VXFS_OLT_FREE	= 1,
+	VXFS_OLT_FSHEAD	= 2,
+	VXFS_OLT_CUT	= 3,
+	VXFS_OLT_ILIST	= 4,
+	VXFS_OLT_DEV	= 5,
+	VXFS_OLT_SB	= 6
+};
+
+/*
+ * VxFS OLT header.
+ *
+ * The Object Location Table header is placed at the beginning of each
+ * OLT extent.  It is used to fing certain filesystem-wide metadata, e.g.
+ * the initial inode list, the fileset header or the device configuration.
+ */
+struct vxfs_olt {
+	u_int32_t	olt_magic;	/* magic number			*/
+	u_int32_t	olt_size;	/* size of this entry		*/
+	u_int32_t	olt_checksum;	/* checksum of extent		*/
+	u_int32_t	__unused1;	/* ???				*/
+	u_int32_t	olt_mtime;	/* time of last mod. (sec)	*/
+	u_int32_t	olt_mutime;	/* time of last mod. (usec)	*/
+	u_int32_t	olt_totfree;	/* free space in OLT extent	*/
+	vx_daddr_t	olt_extents[2];	/* addr of this extent, replica	*/
+	u_int32_t	olt_esize;	/* size of this extent		*/
+	vx_daddr_t	olt_next[2];    /* addr of next extent, replica	*/
+	u_int32_t	olt_nsize;	/* size of next extent		*/
+	u_int32_t	__unused2;	/* align to 8 byte boundary	*/
+};
+
+/*
+ * VxFS common OLT entry (on disk).
+ */
+struct vxfs_oltcommon {
+	u_int32_t	olt_type;	/* type of this record		*/
+	u_int32_t	olt_size;	/* size of this record		*/
+};
+
+/*
+ * VxFS free OLT entry (on disk).
+ */
+struct vxfs_oltfree {
+	u_int32_t	olt_type;	/* type of this record		*/
+	u_int32_t	olt_fsize;	/* size of this free record	*/
+};
+
+/*
+ * VxFS initial-inode list (on disk).
+ */
+struct vxfs_oltilist {
+	u_int32_t	olt_type;	/* type of this record		*/
+	u_int32_t	olt_size;	/* size of this record		*/
+	vx_ino_t	olt_iext[2];	/* initial inode list, replica	*/
+};
+
+/*
+ * Current Usage Table 
+ */
+struct vxfs_oltcut {
+	u_int32_t	olt_type;	/* type of this record		*/
+	u_int32_t	olt_size;	/* size of this record		*/
+	vx_ino_t	olt_cutino;	/* inode of current usage table	*/
+	u_int32_t	__pad;		/* unused, 8 byte align		*/
+};
+
+/*
+ * Inodes containing Superblock, Intent log and OLTs 
+ */
+struct vxfs_oltsb {
+	u_int32_t	olt_type;	/* type of this record		*/
+	u_int32_t	olt_size;	/* size of this record		*/
+	vx_ino_t	olt_sbino;	/* inode of superblock file	*/
+	u_int32_t	__unused1;	/* ???				*/
+	vx_ino_t	olt_logino[2];	/* inode of log file,replica	*/
+	vx_ino_t	olt_oltino[2];	/* inode of OLT, replica	*/
+};
+
+/*
+ * Inode containing device configuration + it's replica 
+ */
+struct vxfs_oltdev {
+	u_int32_t	olt_type;	/* type of this record		*/
+	u_int32_t	olt_size;	/* size of this record		*/
+	vx_ino_t	olt_devino[2];	/* inode of device config files	*/
+};
+
+/*
+ * Fileset header 
+ */
+struct vxfs_oltfshead {
+	u_int32_t	olt_type;	/* type number			*/
+	u_int32_t	olt_size;	/* size of this record		*/
+	vx_ino_t	olt_fsino[2];   /* inodes of fileset header	*/
+};
+
+#endif /* _VXFS_OLT_H_ */
diff --git a/kernel/fs/freevxfs/vxfs_subr.c b/kernel/fs/freevxfs/vxfs_subr.c
new file mode 100644
index 000000000..5d318c44f
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_subr.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Veritas filesystem driver - shared subroutines.
+ */
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/kernel.h>
+#include <linux/pagemap.h>
+
+#include "vxfs_extern.h"
+
+
+static int		vxfs_readpage(struct file *, struct page *);
+static sector_t		vxfs_bmap(struct address_space *, sector_t);
+
+const struct address_space_operations vxfs_aops = {
+	.readpage =		vxfs_readpage,
+	.bmap =			vxfs_bmap,
+};
+
+inline void
+vxfs_put_page(struct page *pp)
+{
+	kunmap(pp);
+	page_cache_release(pp);
+}
+
+/**
+ * vxfs_get_page - read a page into memory.
+ * @ip:		inode to read from
+ * @n:		page number
+ *
+ * Description:
+ *   vxfs_get_page reads the @n th page of @ip into the pagecache.
+ *
+ * Returns:
+ *   The wanted page on success, else a NULL pointer.
+ */
+struct page *
+vxfs_get_page(struct address_space *mapping, u_long n)
+{
+	struct page *			pp;
+
+	pp = read_mapping_page(mapping, n, NULL);
+
+	if (!IS_ERR(pp)) {
+		kmap(pp);
+		/** if (!PageChecked(pp)) **/
+			/** vxfs_check_page(pp); **/
+		if (PageError(pp))
+			goto fail;
+	}
+	
+	return (pp);
+		 
+fail:
+	vxfs_put_page(pp);
+	return ERR_PTR(-EIO);
+}
+
+/**
+ * vxfs_bread - read buffer for a give inode,block tuple
+ * @ip:		inode
+ * @block:	logical block
+ *
+ * Description:
+ *   The vxfs_bread function reads block no @block  of
+ *   @ip into the buffercache.
+ *
+ * Returns:
+ *   The resulting &struct buffer_head.
+ */
+struct buffer_head *
+vxfs_bread(struct inode *ip, int block)
+{
+	struct buffer_head	*bp;
+	daddr_t			pblock;
+
+	pblock = vxfs_bmap1(ip, block);
+	bp = sb_bread(ip->i_sb, pblock);
+
+	return (bp);
+}
+
+/**
+ * vxfs_get_block - locate buffer for given inode,block tuple 
+ * @ip:		inode
+ * @iblock:	logical block
+ * @bp:		buffer skeleton
+ * @create:	%TRUE if blocks may be newly allocated.
+ *
+ * Description:
+ *   The vxfs_get_block function fills @bp with the right physical
+ *   block and device number to perform a lowlevel read/write on
+ *   it.
+ *
+ * Returns:
+ *   Zero on success, else a negativ error code (-EIO).
+ */
+static int
+vxfs_getblk(struct inode *ip, sector_t iblock,
+	    struct buffer_head *bp, int create)
+{
+	daddr_t			pblock;
+
+	pblock = vxfs_bmap1(ip, iblock);
+	if (pblock != 0) {
+		map_bh(bp, ip->i_sb, pblock);
+		return 0;
+	}
+
+	return -EIO;
+}
+
+/**
+ * vxfs_readpage - read one page synchronously into the pagecache
+ * @file:	file context (unused)
+ * @page:	page frame to fill in.
+ *
+ * Description:
+ *   The vxfs_readpage routine reads @page synchronously into the
+ *   pagecache.
+ *
+ * Returns:
+ *   Zero on success, else a negative error code.
+ *
+ * Locking status:
+ *   @page is locked and will be unlocked.
+ */
+static int
+vxfs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, vxfs_getblk);
+}
+ 
+/**
+ * vxfs_bmap - perform logical to physical block mapping
+ * @mapping:	logical to physical mapping to use
+ * @block:	logical block (relative to @mapping).
+ *
+ * Description:
+ *   Vxfs_bmap find out the corresponding phsical block to the
+ *   @mapping, @block pair.
+ *
+ * Returns:
+ *   Physical block number on success, else Zero.
+ *
+ * Locking status:
+ *   We are under the bkl.
+ */
+static sector_t
+vxfs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, vxfs_getblk);
+}
diff --git a/kernel/fs/freevxfs/vxfs_super.c b/kernel/fs/freevxfs/vxfs_super.c
new file mode 100644
index 000000000..7ca8c75d5
--- /dev/null
+++ b/kernel/fs/freevxfs/vxfs_super.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2000-2001 Christoph Hellwig.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Veritas filesystem driver - superblock related routines.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+
+#include "vxfs.h"
+#include "vxfs_extern.h"
+#include "vxfs_dir.h"
+#include "vxfs_inode.h"
+
+
+MODULE_AUTHOR("Christoph Hellwig");
+MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+
+
+static void		vxfs_put_super(struct super_block *);
+static int		vxfs_statfs(struct dentry *, struct kstatfs *);
+static int		vxfs_remount(struct super_block *, int *, char *);
+
+static const struct super_operations vxfs_super_ops = {
+	.evict_inode =		vxfs_evict_inode,
+	.put_super =		vxfs_put_super,
+	.statfs =		vxfs_statfs,
+	.remount_fs =		vxfs_remount,
+};
+
+/**
+ * vxfs_put_super - free superblock resources
+ * @sbp:	VFS superblock.
+ *
+ * Description:
+ *   vxfs_put_super frees all resources allocated for @sbp
+ *   after the last instance of the filesystem is unmounted.
+ */
+
+static void
+vxfs_put_super(struct super_block *sbp)
+{
+	struct vxfs_sb_info	*infp = VXFS_SBI(sbp);
+
+	vxfs_put_fake_inode(infp->vsi_fship);
+	vxfs_put_fake_inode(infp->vsi_ilist);
+	vxfs_put_fake_inode(infp->vsi_stilist);
+
+	brelse(infp->vsi_bp);
+	kfree(infp);
+}
+
+/**
+ * vxfs_statfs - get filesystem information
+ * @dentry:	VFS dentry to locate superblock
+ * @bufp:	output buffer
+ *
+ * Description:
+ *   vxfs_statfs fills the statfs buffer @bufp with information
+ *   about the filesystem described by @dentry.
+ *
+ * Returns:
+ *   Zero.
+ *
+ * Locking:
+ *   No locks held.
+ *
+ * Notes:
+ *   This is everything but complete...
+ */
+static int
+vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
+{
+	struct vxfs_sb_info		*infp = VXFS_SBI(dentry->d_sb);
+
+	bufp->f_type = VXFS_SUPER_MAGIC;
+	bufp->f_bsize = dentry->d_sb->s_blocksize;
+	bufp->f_blocks = infp->vsi_raw->vs_dsize;
+	bufp->f_bfree = infp->vsi_raw->vs_free;
+	bufp->f_bavail = 0;
+	bufp->f_files = 0;
+	bufp->f_ffree = infp->vsi_raw->vs_ifree;
+	bufp->f_namelen = VXFS_NAMELEN;
+
+	return 0;
+}
+
+static int vxfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+/**
+ * vxfs_read_super - read superblock into memory and initialize filesystem
+ * @sbp:		VFS superblock (to fill)
+ * @dp:			fs private mount data
+ * @silent:		do not complain loudly when sth is wrong
+ *
+ * Description:
+ *   We are called on the first mount of a filesystem to read the
+ *   superblock into memory and do some basic setup.
+ *
+ * Returns:
+ *   The superblock on success, else %NULL.
+ *
+ * Locking:
+ *   We are under @sbp->s_lock.
+ */
+static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
+{
+	struct vxfs_sb_info	*infp;
+	struct vxfs_sb		*rsbp;
+	struct buffer_head	*bp = NULL;
+	u_long			bsize;
+	struct inode *root;
+	int ret = -EINVAL;
+
+	sbp->s_flags |= MS_RDONLY;
+
+	infp = kzalloc(sizeof(*infp), GFP_KERNEL);
+	if (!infp) {
+		printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n");
+		return -ENOMEM;
+	}
+
+	bsize = sb_min_blocksize(sbp, BLOCK_SIZE);
+	if (!bsize) {
+		printk(KERN_WARNING "vxfs: unable to set blocksize\n");
+		goto out;
+	}
+
+	bp = sb_bread(sbp, 1);
+	if (!bp || !buffer_mapped(bp)) {
+		if (!silent) {
+			printk(KERN_WARNING
+				"vxfs: unable to read disk superblock\n");
+		}
+		goto out;
+	}
+
+	rsbp = (struct vxfs_sb *)bp->b_data;
+	if (rsbp->vs_magic != VXFS_SUPER_MAGIC) {
+		if (!silent)
+			printk(KERN_NOTICE "vxfs: WRONG superblock magic\n");
+		goto out;
+	}
+
+	if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) {
+		printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n",
+		       rsbp->vs_version);
+		goto out;
+	}
+
+#ifdef DIAGNOSTIC
+	printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version);
+	printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize);
+#endif
+
+	sbp->s_magic = rsbp->vs_magic;
+	sbp->s_fs_info = infp;
+
+	infp->vsi_raw = rsbp;
+	infp->vsi_bp = bp;
+	infp->vsi_oltext = rsbp->vs_oltext[0];
+	infp->vsi_oltsize = rsbp->vs_oltsize;
+
+	if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) {
+		printk(KERN_WARNING "vxfs: unable to set final block size\n");
+		goto out;
+	}
+
+	if (vxfs_read_olt(sbp, bsize)) {
+		printk(KERN_WARNING "vxfs: unable to read olt\n");
+		goto out;
+	}
+
+	if (vxfs_read_fshead(sbp)) {
+		printk(KERN_WARNING "vxfs: unable to read fshead\n");
+		goto out;
+	}
+
+	sbp->s_op = &vxfs_super_ops;
+	root = vxfs_iget(sbp, VXFS_ROOT_INO);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+	sbp->s_root = d_make_root(root);
+	if (!sbp->s_root) {
+		printk(KERN_WARNING "vxfs: unable to get root dentry.\n");
+		goto out_free_ilist;
+	}
+
+	return 0;
+	
+out_free_ilist:
+	vxfs_put_fake_inode(infp->vsi_fship);
+	vxfs_put_fake_inode(infp->vsi_ilist);
+	vxfs_put_fake_inode(infp->vsi_stilist);
+out:
+	brelse(bp);
+	kfree(infp);
+	return ret;
+}
+
+/*
+ * The usual module blurb.
+ */
+static struct dentry *vxfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
+}
+
+static struct file_system_type vxfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "vxfs",
+	.mount		= vxfs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */
+MODULE_ALIAS("vxfs");
+
+static int __init
+vxfs_init(void)
+{
+	int rv;
+
+	vxfs_inode_cachep = kmem_cache_create("vxfs_inode",
+			sizeof(struct vxfs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+	if (!vxfs_inode_cachep)
+		return -ENOMEM;
+	rv = register_filesystem(&vxfs_fs_type);
+	if (rv < 0)
+		kmem_cache_destroy(vxfs_inode_cachep);
+	return rv;
+}
+
+static void __exit
+vxfs_cleanup(void)
+{
+	unregister_filesystem(&vxfs_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(vxfs_inode_cachep);
+}
+
+module_init(vxfs_init);
+module_exit(vxfs_cleanup);
diff --git a/kernel/fs/fs-writeback.c b/kernel/fs/fs-writeback.c
new file mode 100644
index 000000000..32a8bbd7a
--- /dev/null
+++ b/kernel/fs/fs-writeback.c
@@ -0,0 +1,1595 @@
+/*
+ * fs/fs-writeback.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * Contains all the functions related to writing back and waiting
+ * upon dirty inodes against superblocks, and writing back dirty
+ * pages against inodes.  ie: data writeback.  Writeout of the
+ * inode itself is not handled here.
+ *
+ * 10Apr2002	Andrew Morton
+ *		Split out of fs/inode.c
+ *		Additions for address_space-based writeback
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/tracepoint.h>
+#include <linux/device.h>
+#include "internal.h"
+
+/*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
+
+/*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_work {
+	long nr_pages;
+	struct super_block *sb;
+	unsigned long *older_than_this;
+	enum writeback_sync_modes sync_mode;
+	unsigned int tagged_writepages:1;
+	unsigned int for_kupdate:1;
+	unsigned int range_cyclic:1;
+	unsigned int for_background:1;
+	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	enum wb_reason reason;		/* why was writeback initiated? */
+
+	struct list_head list;		/* pending work list */
+	struct completion *done;	/* set if the caller waits */
+};
+
+/*
+ * If an inode is constantly having its pages dirtied, but then the
+ * updates stop dirtytime_expire_interval seconds in the past, it's
+ * possible for the worst case time between when an inode has its
+ * timestamps updated and when they finally get written out to be two
+ * dirtytime_expire_intervals.  We set the default to 12 hours (in
+ * seconds), which means most of the time inodes will have their
+ * timestamps written to disk after 12 hours, but in the worst case a
+ * few inodes might not their timestamps updated for 24 hours.
+ */
+unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+
+/**
+ * writeback_in_progress - determine whether there is writeback in progress
+ * @bdi: the device's backing_dev_info structure.
+ *
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
+ */
+int writeback_in_progress(struct backing_dev_info *bdi)
+{
+	return test_bit(BDI_writeback_running, &bdi->state);
+}
+EXPORT_SYMBOL(writeback_in_progress);
+
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+	struct super_block *sb;
+
+	if (!inode)
+		return &noop_backing_dev_info;
+
+	sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+	if (sb_is_blkdev_sb(sb))
+		return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
+	return sb->s_bdi;
+}
+EXPORT_SYMBOL_GPL(inode_to_bdi);
+
+static inline struct inode *wb_inode(struct list_head *head)
+{
+	return list_entry(head, struct inode, i_wb_list);
+}
+
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure and inline functions so that the definition
+ * remains local to this file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
+
+static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+{
+	spin_lock_bh(&bdi->wb_lock);
+	if (test_bit(BDI_registered, &bdi->state))
+		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	spin_unlock_bh(&bdi->wb_lock);
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+			   struct wb_writeback_work *work)
+{
+	trace_writeback_queue(bdi, work);
+
+	spin_lock_bh(&bdi->wb_lock);
+	if (!test_bit(BDI_registered, &bdi->state)) {
+		if (work->done)
+			complete(work->done);
+		goto out_unlock;
+	}
+	list_add_tail(&work->list, &bdi->work_list);
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+out_unlock:
+	spin_unlock_bh(&bdi->wb_lock);
+}
+
+static void
+__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+		      bool range_cyclic, enum wb_reason reason)
+{
+	struct wb_writeback_work *work;
+
+	/*
+	 * This is WB_SYNC_NONE writeback, so if allocation fails just
+	 * wakeup the thread for old dirty data writeback
+	 */
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work) {
+		trace_writeback_nowork(bdi);
+		bdi_wakeup_thread(bdi);
+		return;
+	}
+
+	work->sync_mode	= WB_SYNC_NONE;
+	work->nr_pages	= nr_pages;
+	work->range_cyclic = range_cyclic;
+	work->reason	= reason;
+
+	bdi_queue_work(bdi, work);
+}
+
+/**
+ * bdi_start_writeback - start writeback
+ * @bdi: the backing device to write from
+ * @nr_pages: the number of pages to write
+ * @reason: reason why some writeback work was initiated
+ *
+ * Description:
+ *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ *   started when this function returns, we make no guarantees on
+ *   completion. Caller need not hold sb s_umount semaphore.
+ *
+ */
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+			enum wb_reason reason)
+{
+	__bdi_start_writeback(bdi, nr_pages, true, reason);
+}
+
+/**
+ * bdi_start_background_writeback - start background writeback
+ * @bdi: the backing device to write from
+ *
+ * Description:
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
+ *   this function returns, it is only guaranteed that for given BDI
+ *   some IO is happening if we are over background dirty threshold.
+ *   Caller need not hold sb s_umount semaphore.
+ */
+void bdi_start_background_writeback(struct backing_dev_info *bdi)
+{
+	/*
+	 * We just wake up the flusher thread. It will perform background
+	 * writeback as soon as there is no other work to do.
+	 */
+	trace_writeback_wake_background(bdi);
+	bdi_wakeup_thread(bdi);
+}
+
+/*
+ * Remove the inode from the writeback list it is on.
+ */
+void inode_wb_list_del(struct inode *inode)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+	spin_lock(&bdi->wb.list_lock);
+	list_del_init(&inode->i_wb_list);
+	spin_unlock(&bdi->wb.list_lock);
+}
+
+/*
+ * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
+ * furthest end of its superblock's dirty-inode list.
+ *
+ * Before stamping the inode's ->dirtied_when, we check to see whether it is
+ * already the most-recently-dirtied inode on the b_dirty list.  If that is
+ * the case then the inode must have been redirtied while it was being written
+ * out and we don't reset its dirtied_when.
+ */
+static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+	if (!list_empty(&wb->b_dirty)) {
+		struct inode *tail;
+
+		tail = wb_inode(wb->b_dirty.next);
+		if (time_before(inode->dirtied_when, tail->dirtied_when))
+			inode->dirtied_when = jiffies;
+	}
+	list_move(&inode->i_wb_list, &wb->b_dirty);
+}
+
+/*
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
+ */
+static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+	list_move(&inode->i_wb_list, &wb->b_more_io);
+}
+
+static void inode_sync_complete(struct inode *inode)
+{
+	inode->i_state &= ~I_SYNC;
+	/* If inode is clean an unused, put it into LRU now... */
+	inode_add_lru(inode);
+	/* Waiters must see I_SYNC cleared before being woken up */
+	smp_mb();
+	wake_up_bit(&inode->i_state, __I_SYNC);
+}
+
+static bool inode_dirtied_after(struct inode *inode, unsigned long t)
+{
+	bool ret = time_after(inode->dirtied_when, t);
+#ifndef CONFIG_64BIT
+	/*
+	 * For inodes being constantly redirtied, dirtied_when can get stuck.
+	 * It _appears_ to be in the future, but is actually in distant past.
+	 * This test is necessary to prevent such wrapped-around relative times
+	 * from permanently stopping the whole bdi writeback.
+	 */
+	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
+#endif
+	return ret;
+}
+
+#define EXPIRE_DIRTY_ATIME 0x0001
+
+/*
+ * Move expired (dirtied before work->older_than_this) dirty inodes from
+ * @delaying_queue to @dispatch_queue.
+ */
+static int move_expired_inodes(struct list_head *delaying_queue,
+			       struct list_head *dispatch_queue,
+			       int flags,
+			       struct wb_writeback_work *work)
+{
+	unsigned long *older_than_this = NULL;
+	unsigned long expire_time;
+	LIST_HEAD(tmp);
+	struct list_head *pos, *node;
+	struct super_block *sb = NULL;
+	struct inode *inode;
+	int do_sb_sort = 0;
+	int moved = 0;
+
+	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
+		older_than_this = work->older_than_this;
+	else if (!work->for_sync) {
+		expire_time = jiffies - (dirtytime_expire_interval * HZ);
+		older_than_this = &expire_time;
+	}
+	while (!list_empty(delaying_queue)) {
+		inode = wb_inode(delaying_queue->prev);
+		if (older_than_this &&
+		    inode_dirtied_after(inode, *older_than_this))
+			break;
+		list_move(&inode->i_wb_list, &tmp);
+		moved++;
+		if (flags & EXPIRE_DIRTY_ATIME)
+			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
+		if (sb_is_blkdev_sb(inode->i_sb))
+			continue;
+		if (sb && sb != inode->i_sb)
+			do_sb_sort = 1;
+		sb = inode->i_sb;
+	}
+
+	/* just one sb in list, splice to dispatch_queue and we're done */
+	if (!do_sb_sort) {
+		list_splice(&tmp, dispatch_queue);
+		goto out;
+	}
+
+	/* Move inodes from one superblock together */
+	while (!list_empty(&tmp)) {
+		sb = wb_inode(tmp.prev)->i_sb;
+		list_for_each_prev_safe(pos, node, &tmp) {
+			inode = wb_inode(pos);
+			if (inode->i_sb == sb)
+				list_move(&inode->i_wb_list, dispatch_queue);
+		}
+	}
+out:
+	return moved;
+}
+
+/*
+ * Queue all expired dirty inodes for io, eldest first.
+ * Before
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    gf         edc     BA
+ * After
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    g          fBAedc
+ *                                           |
+ *                                           +--> dequeue for IO
+ */
+static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
+{
+	int moved;
+
+	assert_spin_locked(&wb->list_lock);
+	list_splice_init(&wb->b_more_io, &wb->b_io);
+	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+				     EXPIRE_DIRTY_ATIME, work);
+	trace_writeback_queue_io(wb, work, moved);
+}
+
+static int write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret;
+
+	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
+		trace_writeback_write_inode_start(inode, wbc);
+		ret = inode->i_sb->s_op->write_inode(inode, wbc);
+		trace_writeback_write_inode(inode, wbc);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Wait for writeback on an inode to complete. Called with i_lock held.
+ * Caller must make sure inode cannot go away when we drop i_lock.
+ */
+static void __inode_wait_for_writeback(struct inode *inode)
+	__releases(inode->i_lock)
+	__acquires(inode->i_lock)
+{
+	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
+	wait_queue_head_t *wqh;
+
+	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
+	while (inode->i_state & I_SYNC) {
+		spin_unlock(&inode->i_lock);
+		__wait_on_bit(wqh, &wq, bit_wait,
+			      TASK_UNINTERRUPTIBLE);
+		spin_lock(&inode->i_lock);
+	}
+}
+
+/*
+ * Wait for writeback on an inode to complete. Caller must have inode pinned.
+ */
+void inode_wait_for_writeback(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	__inode_wait_for_writeback(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Sleep until I_SYNC is cleared. This function must be called with i_lock
+ * held and drops it. It is aimed for callers not holding any inode reference
+ * so once i_lock is dropped, inode can go away.
+ */
+static void inode_sleep_on_writeback(struct inode *inode)
+	__releases(inode->i_lock)
+{
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
+	int sleep;
+
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	sleep = inode->i_state & I_SYNC;
+	spin_unlock(&inode->i_lock);
+	if (sleep)
+		schedule();
+	finish_wait(wqh, &wait);
+}
+
+/*
+ * Find proper writeback list for the inode depending on its current state and
+ * possibly also change of its state while we were doing writeback.  Here we
+ * handle things such as livelock prevention or fairness of writeback among
+ * inodes. This function can be called only by flusher thread - noone else
+ * processes all inodes in writeback lists and requeueing inodes behind flusher
+ * thread's back can have unexpected consequences.
+ */
+static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
+			  struct writeback_control *wbc)
+{
+	if (inode->i_state & I_FREEING)
+		return;
+
+	/*
+	 * Sync livelock prevention. Each inode is tagged and synced in one
+	 * shot. If still dirty, it will be redirty_tail()'ed below.  Update
+	 * the dirty time to prevent enqueue and sync it again.
+	 */
+	if ((inode->i_state & I_DIRTY) &&
+	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+		inode->dirtied_when = jiffies;
+
+	if (wbc->pages_skipped) {
+		/*
+		 * writeback is not making progress due to locked
+		 * buffers. Skip this inode for now.
+		 */
+		redirty_tail(inode, wb);
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+		/*
+		 * We didn't write back all the pages.  nfs_writepages()
+		 * sometimes bales out without doing anything.
+		 */
+		if (wbc->nr_to_write <= 0) {
+			/* Slice used up. Queue for next turn. */
+			requeue_io(inode, wb);
+		} else {
+			/*
+			 * Writeback blocked by something other than
+			 * congestion. Delay the inode for some time to
+			 * avoid spinning on the CPU (100% iowait)
+			 * retrying writeback of the dirty page/inode
+			 * that cannot be performed immediately.
+			 */
+			redirty_tail(inode, wb);
+		}
+	} else if (inode->i_state & I_DIRTY) {
+		/*
+		 * Filesystems can dirty the inode during writeback operations,
+		 * such as delayed allocation during submission or metadata
+		 * updates after data IO completion.
+		 */
+		redirty_tail(inode, wb);
+	} else if (inode->i_state & I_DIRTY_TIME) {
+		inode->dirtied_when = jiffies;
+		list_move(&inode->i_wb_list, &wb->b_dirty_time);
+	} else {
+		/* The inode is clean. Remove from writeback lists. */
+		list_del_init(&inode->i_wb_list);
+	}
+}
+
+/*
+ * Write out an inode and its dirty pages. Do not update the writeback list
+ * linkage. That is left to the caller. The caller is also responsible for
+ * setting I_SYNC flag and calling inode_sync_complete() to clear it.
+ */
+static int
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct address_space *mapping = inode->i_mapping;
+	long nr_to_write = wbc->nr_to_write;
+	unsigned dirty;
+	int ret;
+
+	WARN_ON(!(inode->i_state & I_SYNC));
+
+	trace_writeback_single_inode_start(inode, wbc, nr_to_write);
+
+	ret = do_writepages(mapping, wbc);
+
+	/*
+	 * Make sure to wait on the data before writing out the metadata.
+	 * This is important for filesystems that modify metadata on data
+	 * I/O completion. We don't do it for sync(2) writeback because it has a
+	 * separate, external IO completion path and ->sync_fs for guaranteeing
+	 * inode metadata is written back correctly.
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
+		int err = filemap_fdatawait(mapping);
+		if (ret == 0)
+			ret = err;
+	}
+
+	/*
+	 * Some filesystems may redirty the inode during the writeback
+	 * due to delalloc, clear dirty metadata flags right before
+	 * write_inode()
+	 */
+	spin_lock(&inode->i_lock);
+
+	dirty = inode->i_state & I_DIRTY;
+	if (inode->i_state & I_DIRTY_TIME) {
+		if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+		    unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
+		    unlikely(time_after(jiffies,
+					(inode->dirtied_time_when +
+					 dirtytime_expire_interval * HZ)))) {
+			dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+			trace_writeback_lazytime(inode);
+		}
+	} else
+		inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
+	inode->i_state &= ~dirty;
+
+	/*
+	 * Paired with smp_mb() in __mark_inode_dirty().  This allows
+	 * __mark_inode_dirty() to test i_state without grabbing i_lock -
+	 * either they see the I_DIRTY bits cleared or we see the dirtied
+	 * inode.
+	 *
+	 * I_DIRTY_PAGES is always cleared together above even if @mapping
+	 * still has dirty pages.  The flag is reinstated after smp_mb() if
+	 * necessary.  This guarantees that either __mark_inode_dirty()
+	 * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
+	 */
+	smp_mb();
+
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		inode->i_state |= I_DIRTY_PAGES;
+
+	spin_unlock(&inode->i_lock);
+
+	if (dirty & I_DIRTY_TIME)
+		mark_inode_dirty_sync(inode);
+	/* Don't write the inode if only I_DIRTY_PAGES was set */
+	if (dirty & ~I_DIRTY_PAGES) {
+		int err = write_inode(inode, wbc);
+		if (ret == 0)
+			ret = err;
+	}
+	trace_writeback_single_inode(inode, wbc, nr_to_write);
+	return ret;
+}
+
+/*
+ * Write out an inode's dirty pages. Either the caller has an active reference
+ * on the inode or the inode has I_WILL_FREE set.
+ *
+ * This function is designed to be called for writing back one inode which
+ * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
+ * and does more profound writeback list handling in writeback_sb_inodes().
+ */
+static int
+writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+		       struct writeback_control *wbc)
+{
+	int ret = 0;
+
+	spin_lock(&inode->i_lock);
+	if (!atomic_read(&inode->i_count))
+		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+	else
+		WARN_ON(inode->i_state & I_WILL_FREE);
+
+	if (inode->i_state & I_SYNC) {
+		if (wbc->sync_mode != WB_SYNC_ALL)
+			goto out;
+		/*
+		 * It's a data-integrity sync. We must wait. Since callers hold
+		 * inode reference or inode has I_WILL_FREE set, it cannot go
+		 * away under us.
+		 */
+		__inode_wait_for_writeback(inode);
+	}
+	WARN_ON(inode->i_state & I_SYNC);
+	/*
+	 * Skip inode if it is clean and we have no outstanding writeback in
+	 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
+	 * function since flusher thread may be doing for example sync in
+	 * parallel and if we move the inode, it could get skipped. So here we
+	 * make sure inode is on some writeback list and leave it there unless
+	 * we have completely cleaned the inode.
+	 */
+	if (!(inode->i_state & I_DIRTY_ALL) &&
+	    (wbc->sync_mode != WB_SYNC_ALL ||
+	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
+		goto out;
+	inode->i_state |= I_SYNC;
+	spin_unlock(&inode->i_lock);
+
+	ret = __writeback_single_inode(inode, wbc);
+
+	spin_lock(&wb->list_lock);
+	spin_lock(&inode->i_lock);
+	/*
+	 * If inode is clean, remove it from writeback lists. Otherwise don't
+	 * touch it. See comment above for explanation.
+	 */
+	if (!(inode->i_state & I_DIRTY_ALL))
+		list_del_init(&inode->i_wb_list);
+	spin_unlock(&wb->list_lock);
+	inode_sync_complete(inode);
+out:
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+static long writeback_chunk_size(struct backing_dev_info *bdi,
+				 struct wb_writeback_work *work)
+{
+	long pages;
+
+	/*
+	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+	 * here avoids calling into writeback_inodes_wb() more than once.
+	 *
+	 * The intended call sequence for WB_SYNC_ALL writeback is:
+	 *
+	 *      wb_writeback()
+	 *          writeback_sb_inodes()       <== called only once
+	 *              write_cache_pages()     <== called once for each inode
+	 *                   (quickly) tag currently dirty pages
+	 *                   (maybe slowly) sync all tagged pages
+	 */
+	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
+		pages = LONG_MAX;
+	else {
+		pages = min(bdi->avg_write_bandwidth / 2,
+			    global_dirty_limit / DIRTY_SCOPE);
+		pages = min(pages, work->nr_pages);
+		pages = round_down(pages + MIN_WRITEBACK_PAGES,
+				   MIN_WRITEBACK_PAGES);
+	}
+
+	return pages;
+}
+
+/*
+ * Write a portion of b_io inodes which belong to @sb.
+ *
+ * Return the number of pages and/or inodes written.
+ */
+static long writeback_sb_inodes(struct super_block *sb,
+				struct bdi_writeback *wb,
+				struct wb_writeback_work *work)
+{
+	struct writeback_control wbc = {
+		.sync_mode		= work->sync_mode,
+		.tagged_writepages	= work->tagged_writepages,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.for_sync		= work->for_sync,
+		.range_cyclic		= work->range_cyclic,
+		.range_start		= 0,
+		.range_end		= LLONG_MAX,
+	};
+	unsigned long start_time = jiffies;
+	long write_chunk;
+	long wrote = 0;  /* count both pages and inodes */
+
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = wb_inode(wb->b_io.prev);
+
+		if (inode->i_sb != sb) {
+			if (work->sb) {
+				/*
+				 * We only want to write back data for this
+				 * superblock, move all inodes not belonging
+				 * to it back onto the dirty list.
+				 */
+				redirty_tail(inode, wb);
+				continue;
+			}
+
+			/*
+			 * The inode belongs to a different superblock.
+			 * Bounce back to the caller to unpin this and
+			 * pin the next superblock.
+			 */
+			break;
+		}
+
+		/*
+		 * Don't bother with new inodes or inodes being freed, first
+		 * kind does not need periodic writeout yet, and for the latter
+		 * kind writeout is handled by the freer.
+		 */
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
+			redirty_tail(inode, wb);
+			continue;
+		}
+		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+			/*
+			 * If this inode is locked for writeback and we are not
+			 * doing writeback-for-data-integrity, move it to
+			 * b_more_io so that writeback can proceed with the
+			 * other inodes on s_io.
+			 *
+			 * We'll have another go at writing back this inode
+			 * when we completed a full scan of b_io.
+			 */
+			spin_unlock(&inode->i_lock);
+			requeue_io(inode, wb);
+			trace_writeback_sb_inodes_requeue(inode);
+			continue;
+		}
+		spin_unlock(&wb->list_lock);
+
+		/*
+		 * We already requeued the inode if it had I_SYNC set and we
+		 * are doing WB_SYNC_NONE writeback. So this catches only the
+		 * WB_SYNC_ALL case.
+		 */
+		if (inode->i_state & I_SYNC) {
+			/* Wait for I_SYNC. This function drops i_lock... */
+			inode_sleep_on_writeback(inode);
+			/* Inode may be gone, start again */
+			spin_lock(&wb->list_lock);
+			continue;
+		}
+		inode->i_state |= I_SYNC;
+		spin_unlock(&inode->i_lock);
+
+		write_chunk = writeback_chunk_size(wb->bdi, work);
+		wbc.nr_to_write = write_chunk;
+		wbc.pages_skipped = 0;
+
+		/*
+		 * We use I_SYNC to pin the inode in memory. While it is set
+		 * evict_inode() will wait so the inode cannot be freed.
+		 */
+		__writeback_single_inode(inode, &wbc);
+
+		work->nr_pages -= write_chunk - wbc.nr_to_write;
+		wrote += write_chunk - wbc.nr_to_write;
+		spin_lock(&wb->list_lock);
+		spin_lock(&inode->i_lock);
+		if (!(inode->i_state & I_DIRTY_ALL))
+			wrote++;
+		requeue_inode(inode, wb, &wbc);
+		inode_sync_complete(inode);
+		spin_unlock(&inode->i_lock);
+		cond_resched_lock(&wb->list_lock);
+		/*
+		 * bail out to wb_writeback() often enough to check
+		 * background threshold and other termination conditions.
+		 */
+		if (wrote) {
+			if (time_is_before_jiffies(start_time + HZ / 10UL))
+				break;
+			if (work->nr_pages <= 0)
+				break;
+		}
+	}
+	return wrote;
+}
+
+static long __writeback_inodes_wb(struct bdi_writeback *wb,
+				  struct wb_writeback_work *work)
+{
+	unsigned long start_time = jiffies;
+	long wrote = 0;
+
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = wb_inode(wb->b_io.prev);
+		struct super_block *sb = inode->i_sb;
+
+		if (!trylock_super(sb)) {
+			/*
+			 * trylock_super() may fail consistently due to
+			 * s_umount being grabbed by someone else. Don't use
+			 * requeue_io() to avoid busy retrying the inode/sb.
+			 */
+			redirty_tail(inode, wb);
+			continue;
+		}
+		wrote += writeback_sb_inodes(sb, wb, work);
+		up_read(&sb->s_umount);
+
+		/* refer to the same tests at the end of writeback_sb_inodes */
+		if (wrote) {
+			if (time_is_before_jiffies(start_time + HZ / 10UL))
+				break;
+			if (work->nr_pages <= 0)
+				break;
+		}
+	}
+	/* Leave any unwritten inodes on b_io */
+	return wrote;
+}
+
+static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+				enum wb_reason reason)
+{
+	struct wb_writeback_work work = {
+		.nr_pages	= nr_pages,
+		.sync_mode	= WB_SYNC_NONE,
+		.range_cyclic	= 1,
+		.reason		= reason,
+	};
+
+	spin_lock(&wb->list_lock);
+	if (list_empty(&wb->b_io))
+		queue_io(wb, &work);
+	__writeback_inodes_wb(wb, &work);
+	spin_unlock(&wb->list_lock);
+
+	return nr_pages - work.nr_pages;
+}
+
+static bool over_bground_thresh(struct backing_dev_info *bdi)
+{
+	unsigned long background_thresh, dirty_thresh;
+
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+
+	if (global_page_state(NR_FILE_DIRTY) +
+	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
+		return true;
+
+	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
+				bdi_dirty_limit(bdi, background_thresh))
+		return true;
+
+	return false;
+}
+
+/*
+ * Called under wb->list_lock. If there are multiple wb per bdi,
+ * only the flusher working on the first wb should do it.
+ */
+static void wb_update_bandwidth(struct bdi_writeback *wb,
+				unsigned long start_time)
+{
+	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space.  So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_interval.  But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write.  So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ */
+static long wb_writeback(struct bdi_writeback *wb,
+			 struct wb_writeback_work *work)
+{
+	unsigned long wb_start = jiffies;
+	long nr_pages = work->nr_pages;
+	unsigned long oldest_jif;
+	struct inode *inode;
+	long progress;
+
+	oldest_jif = jiffies;
+	work->older_than_this = &oldest_jif;
+
+	spin_lock(&wb->list_lock);
+	for (;;) {
+		/*
+		 * Stop writeback when nr_pages has been consumed
+		 */
+		if (work->nr_pages <= 0)
+			break;
+
+		/*
+		 * Background writeout and kupdate-style writeback may
+		 * run forever. Stop them if there is other work to do
+		 * so that e.g. sync can proceed. They'll be restarted
+		 * after the other works are all done.
+		 */
+		if ((work->for_background || work->for_kupdate) &&
+		    !list_empty(&wb->bdi->work_list))
+			break;
+
+		/*
+		 * For background writeout, stop when we are below the
+		 * background dirty threshold
+		 */
+		if (work->for_background && !over_bground_thresh(wb->bdi))
+			break;
+
+		/*
+		 * Kupdate and background works are special and we want to
+		 * include all inodes that need writing. Livelock avoidance is
+		 * handled by these works yielding to any other work so we are
+		 * safe.
+		 */
+		if (work->for_kupdate) {
+			oldest_jif = jiffies -
+				msecs_to_jiffies(dirty_expire_interval * 10);
+		} else if (work->for_background)
+			oldest_jif = jiffies;
+
+		trace_writeback_start(wb->bdi, work);
+		if (list_empty(&wb->b_io))
+			queue_io(wb, work);
+		if (work->sb)
+			progress = writeback_sb_inodes(work->sb, wb, work);
+		else
+			progress = __writeback_inodes_wb(wb, work);
+		trace_writeback_written(wb->bdi, work);
+
+		wb_update_bandwidth(wb, wb_start);
+
+		/*
+		 * Did we write something? Try for more
+		 *
+		 * Dirty inodes are moved to b_io for writeback in batches.
+		 * The completion of the current batch does not necessarily
+		 * mean the overall work is done. So we keep looping as long
+		 * as made some progress on cleaning pages or inodes.
+		 */
+		if (progress)
+			continue;
+		/*
+		 * No more inodes for IO, bail
+		 */
+		if (list_empty(&wb->b_more_io))
+			break;
+		/*
+		 * Nothing written. Wait for some inode to
+		 * become available for writeback. Otherwise
+		 * we'll just busyloop.
+		 */
+		if (!list_empty(&wb->b_more_io))  {
+			trace_writeback_wait(wb->bdi, work);
+			inode = wb_inode(wb->b_more_io.prev);
+			spin_lock(&inode->i_lock);
+			spin_unlock(&wb->list_lock);
+			/* This function drops i_lock... */
+			inode_sleep_on_writeback(inode);
+			spin_lock(&wb->list_lock);
+		}
+	}
+	spin_unlock(&wb->list_lock);
+
+	return nr_pages - work->nr_pages;
+}
+
+/*
+ * Return the next wb_writeback_work struct that hasn't been processed yet.
+ */
+static struct wb_writeback_work *
+get_next_work_item(struct backing_dev_info *bdi)
+{
+	struct wb_writeback_work *work = NULL;
+
+	spin_lock_bh(&bdi->wb_lock);
+	if (!list_empty(&bdi->work_list)) {
+		work = list_entry(bdi->work_list.next,
+				  struct wb_writeback_work, list);
+		list_del_init(&work->list);
+	}
+	spin_unlock_bh(&bdi->wb_lock);
+	return work;
+}
+
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
+{
+	return global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS) +
+		get_nr_dirty_inodes();
+}
+
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+	if (over_bground_thresh(wb->bdi)) {
+
+		struct wb_writeback_work work = {
+			.nr_pages	= LONG_MAX,
+			.sync_mode	= WB_SYNC_NONE,
+			.for_background	= 1,
+			.range_cyclic	= 1,
+			.reason		= WB_REASON_BACKGROUND,
+		};
+
+		return wb_writeback(wb, &work);
+	}
+
+	return 0;
+}
+
+static long wb_check_old_data_flush(struct bdi_writeback *wb)
+{
+	unsigned long expired;
+	long nr_pages;
+
+	/*
+	 * When set to zero, disable periodic writeback
+	 */
+	if (!dirty_writeback_interval)
+		return 0;
+
+	expired = wb->last_old_flush +
+			msecs_to_jiffies(dirty_writeback_interval * 10);
+	if (time_before(jiffies, expired))
+		return 0;
+
+	wb->last_old_flush = jiffies;
+	nr_pages = get_nr_dirty_pages();
+
+	if (nr_pages) {
+		struct wb_writeback_work work = {
+			.nr_pages	= nr_pages,
+			.sync_mode	= WB_SYNC_NONE,
+			.for_kupdate	= 1,
+			.range_cyclic	= 1,
+			.reason		= WB_REASON_PERIODIC,
+		};
+
+		return wb_writeback(wb, &work);
+	}
+
+	return 0;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+static long wb_do_writeback(struct bdi_writeback *wb)
+{
+	struct backing_dev_info *bdi = wb->bdi;
+	struct wb_writeback_work *work;
+	long wrote = 0;
+
+	set_bit(BDI_writeback_running, &wb->bdi->state);
+	while ((work = get_next_work_item(bdi)) != NULL) {
+
+		trace_writeback_exec(bdi, work);
+
+		wrote += wb_writeback(wb, work);
+
+		/*
+		 * Notify the caller of completion if this is a synchronous
+		 * work item, otherwise just free it.
+		 */
+		if (work->done)
+			complete(work->done);
+		else
+			kfree(work);
+	}
+
+	/*
+	 * Check for periodic writeback, kupdated() style
+	 */
+	wrote += wb_check_old_data_flush(wb);
+	wrote += wb_check_background_flush(wb);
+	clear_bit(BDI_writeback_running, &wb->bdi->state);
+
+	return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * reschedules periodically and does kupdated style flushing.
+ */
+void bdi_writeback_workfn(struct work_struct *work)
+{
+	struct bdi_writeback *wb = container_of(to_delayed_work(work),
+						struct bdi_writeback, dwork);
+	struct backing_dev_info *bdi = wb->bdi;
+	long pages_written;
+
+	set_worker_desc("flush-%s", dev_name(bdi->dev));
+	current->flags |= PF_SWAPWRITE;
+
+	if (likely(!current_is_workqueue_rescuer() ||
+		   !test_bit(BDI_registered, &bdi->state))) {
+		/*
+		 * The normal path.  Keep writing back @bdi until its
+		 * work_list is empty.  Note that this path is also taken
+		 * if @bdi is shutting down even when we're running off the
+		 * rescuer as work_list needs to be drained.
+		 */
+		do {
+			pages_written = wb_do_writeback(wb);
+			trace_writeback_pages_written(pages_written);
+		} while (!list_empty(&bdi->work_list));
+	} else {
+		/*
+		 * bdi_wq can't get enough workers and we're running off
+		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+		 * enough for efficient IO.
+		 */
+		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+						    WB_REASON_FORKER_THREAD);
+		trace_writeback_pages_written(pages_written);
+	}
+
+	if (!list_empty(&bdi->work_list))
+		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+		bdi_wakeup_thread_delayed(bdi);
+
+	current->flags &= ~PF_SWAPWRITE;
+}
+
+/*
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+{
+	struct backing_dev_info *bdi;
+
+	if (!nr_pages)
+		nr_pages = get_nr_dirty_pages();
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		if (!bdi_has_dirty_io(bdi))
+			continue;
+		__bdi_start_writeback(bdi, nr_pages, false, reason);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Wake up bdi's periodically to make sure dirtytime inodes gets
+ * written back periodically.  We deliberately do *not* check the
+ * b_dirtytime list in wb_has_dirty_io(), since this would cause the
+ * kernel to be constantly waking up once there are any dirtytime
+ * inodes on the system.  So instead we define a separate delayed work
+ * function which gets called much more rarely.  (By default, only
+ * once every 12 hours.)
+ *
+ * If there is any other write activity going on in the file system,
+ * this function won't be necessary.  But if the only thing that has
+ * happened on the file system is a dirtytime inode caused by an atime
+ * update, we need this infrastructure below to make sure that inode
+ * eventually gets pushed out to disk.
+ */
+static void wakeup_dirtytime_writeback(struct work_struct *w);
+static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
+
+static void wakeup_dirtytime_writeback(struct work_struct *w)
+{
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		if (list_empty(&bdi->wb.b_dirty_time))
+			continue;
+		bdi_wakeup_thread(bdi);
+	}
+	rcu_read_unlock();
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+}
+
+static int __init start_dirtytime_writeback(void)
+{
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+	return 0;
+}
+__initcall(start_dirtytime_writeback);
+
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		mod_delayed_work(system_wq, &dirtytime_work, 0);
+	return ret;
+}
+
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
+/**
+ *	__mark_inode_dirty -	internal function
+ *	@inode: inode to mark
+ *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ *	Mark an inode as dirty. Callers should use mark_inode_dirty or
+ *  	mark_inode_dirty_sync.
+ *
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
+ *
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
+ *
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
+ */
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
+void __mark_inode_dirty(struct inode *inode, int flags)
+{
+	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = NULL;
+	int dirtytime;
+
+	trace_writeback_mark_inode_dirty(inode, flags);
+
+	/*
+	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
+	 * dirty the inode itself
+	 */
+	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
+		trace_writeback_dirty_inode_start(inode, flags);
+
+		if (sb->s_op->dirty_inode)
+			sb->s_op->dirty_inode(inode, flags);
+
+		trace_writeback_dirty_inode(inode, flags);
+	}
+	if (flags & I_DIRTY_INODE)
+		flags &= ~I_DIRTY_TIME;
+	dirtytime = flags & I_DIRTY_TIME;
+
+	/*
+	 * Paired with smp_mb() in __writeback_single_inode() for the
+	 * following lockless i_state test.  See there for details.
+	 */
+	smp_mb();
+
+	if (((inode->i_state & flags) == flags) ||
+	    (dirtytime && (inode->i_state & I_DIRTY_INODE)))
+		return;
+
+	if (unlikely(block_dump))
+		block_dump___mark_inode_dirty(inode);
+
+	spin_lock(&inode->i_lock);
+	if (dirtytime && (inode->i_state & I_DIRTY_INODE))
+		goto out_unlock_inode;
+	if ((inode->i_state & flags) != flags) {
+		const int was_dirty = inode->i_state & I_DIRTY;
+
+		if (flags & I_DIRTY_INODE)
+			inode->i_state &= ~I_DIRTY_TIME;
+		inode->i_state |= flags;
+
+		/*
+		 * If the inode is being synced, just update its dirty state.
+		 * The unlocker will place the inode on the appropriate
+		 * superblock list, based upon its state.
+		 */
+		if (inode->i_state & I_SYNC)
+			goto out_unlock_inode;
+
+		/*
+		 * Only add valid (hashed) inodes to the superblock's
+		 * dirty list.  Add blockdev inodes as well.
+		 */
+		if (!S_ISBLK(inode->i_mode)) {
+			if (inode_unhashed(inode))
+				goto out_unlock_inode;
+		}
+		if (inode->i_state & I_FREEING)
+			goto out_unlock_inode;
+
+		/*
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
+		 */
+		if (!was_dirty) {
+			bool wakeup_bdi = false;
+			bdi = inode_to_bdi(inode);
+
+			spin_unlock(&inode->i_lock);
+			spin_lock(&bdi->wb.list_lock);
+			if (bdi_cap_writeback_dirty(bdi)) {
+				WARN(!test_bit(BDI_registered, &bdi->state),
+				     "bdi-%s not registered\n", bdi->name);
+
+				/*
+				 * If this is the first dirty inode for this
+				 * bdi, we have to wake-up the corresponding
+				 * bdi thread to make sure background
+				 * write-back happens later.
+				 */
+				if (!wb_has_dirty_io(&bdi->wb))
+					wakeup_bdi = true;
+			}
+
+			inode->dirtied_when = jiffies;
+			if (dirtytime)
+				inode->dirtied_time_when = jiffies;
+			if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
+				list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			else
+				list_move(&inode->i_wb_list,
+					  &bdi->wb.b_dirty_time);
+			spin_unlock(&bdi->wb.list_lock);
+			trace_writeback_dirty_inode_enqueue(inode);
+
+			if (wakeup_bdi)
+				bdi_wakeup_thread_delayed(bdi);
+			return;
+		}
+	}
+out_unlock_inode:
+	spin_unlock(&inode->i_lock);
+
+}
+EXPORT_SYMBOL(__mark_inode_dirty);
+
+static void wait_sb_inodes(struct super_block *sb)
+{
+	struct inode *inode, *old_inode = NULL;
+
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	spin_lock(&inode_sb_list_lock);
+
+	/*
+	 * Data integrity sync. Must wait for all pages under writeback,
+	 * because there may have been pages dirtied before our sync
+	 * call, but which had writeout started before we write it out.
+	 * In which case, the inode may not be on the dirty list, but
+	 * we still have to wait for that writeout.
+	 */
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		struct address_space *mapping = inode->i_mapping;
+
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
+
+		/*
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
+		 */
+		iput(old_inode);
+		old_inode = inode;
+
+		filemap_fdatawait(mapping);
+
+		cond_resched();
+
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+	iput(old_inode);
+}
+
+/**
+ * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ * @reason: reason why some writeback work initiated
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb_nr(struct super_block *sb,
+			    unsigned long nr,
+			    enum wb_reason reason)
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb			= sb,
+		.sync_mode		= WB_SYNC_NONE,
+		.tagged_writepages	= 1,
+		.done			= &done,
+		.nr_pages		= nr,
+		.reason			= reason,
+	};
+
+	if (sb->s_bdi == &noop_backing_dev_info)
+		return;
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+}
+EXPORT_SYMBOL(writeback_inodes_sb_nr);
+
+/**
+ * writeback_inodes_sb	-	writeback dirty inodes from given super_block
+ * @sb: the superblock
+ * @reason: reason why some writeback work was initiated
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+{
+	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ * @reason: the reason of writeback
+ *
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int try_to_writeback_inodes_sb_nr(struct super_block *sb,
+				  unsigned long nr,
+				  enum wb_reason reason)
+{
+	if (writeback_in_progress(sb->s_bdi))
+		return 1;
+
+	if (!down_read_trylock(&sb->s_umount))
+		return 0;
+
+	writeback_inodes_sb_nr(sb, nr, reason);
+	up_read(&sb->s_umount);
+	return 1;
+}
+EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
+
+/**
+ * try_to_writeback_inodes_sb - try to start writeback if none underway
+ * @sb: the superblock
+ * @reason: reason why some writeback work was initiated
+ *
+ * Implement by try_to_writeback_inodes_sb_nr()
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+{
+	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
+}
+EXPORT_SYMBOL(try_to_writeback_inodes_sb);
+
+/**
+ * sync_inodes_sb	-	sync sb inode pages
+ * @sb: the superblock
+ *
+ * This function writes and waits on any dirty inode belonging to this
+ * super_block.
+ */
+void sync_inodes_sb(struct super_block *sb)
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+		.done		= &done,
+		.reason		= WB_REASON_SYNC,
+		.for_sync	= 1,
+	};
+
+	/* Nothing to do? */
+	if (sb->s_bdi == &noop_backing_dev_info)
+		return;
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+
+	wait_sb_inodes(sb);
+}
+EXPORT_SYMBOL(sync_inodes_sb);
+
+/**
+ * write_inode_now	-	write an inode to disk
+ * @inode: inode to write to disk
+ * @sync: whether the write should be synchronous or not
+ *
+ * This function commits an inode to disk immediately if it is dirty. This is
+ * primarily needed by knfsd.
+ *
+ * The caller must either have a ref on the inode or must have set I_WILL_FREE.
+ */
+int write_inode_now(struct inode *inode, int sync)
+{
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+	struct writeback_control wbc = {
+		.nr_to_write = LONG_MAX,
+		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
+
+	if (!mapping_cap_writeback_dirty(inode->i_mapping))
+		wbc.nr_to_write = 0;
+
+	might_sleep();
+	return writeback_single_inode(inode, wb, &wbc);
+}
+EXPORT_SYMBOL(write_inode_now);
+
+/**
+ * sync_inode - write an inode and its pages to disk.
+ * @inode: the inode to sync
+ * @wbc: controls the writeback mode
+ *
+ * sync_inode() will write an inode and its pages to disk.  It will also
+ * correctly update the inode on its superblock's dirty inode lists and will
+ * update inode->i_state.
+ *
+ * The caller must have a ref on the inode.
+ */
+int sync_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
+}
+EXPORT_SYMBOL(sync_inode);
+
+/**
+ * sync_inode_metadata - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust its dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+	struct writeback_control wbc = {
+		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.nr_to_write = 0, /* metadata-only */
+	};
+
+	return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/kernel/fs/fs_pin.c b/kernel/fs/fs_pin.c
new file mode 100644
index 000000000..611b5408f
--- /dev/null
+++ b/kernel/fs/fs_pin.c
@@ -0,0 +1,102 @@
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "internal.h"
+#include "mount.h"
+
+static DEFINE_SPINLOCK(pin_lock);
+
+void pin_remove(struct fs_pin *pin)
+{
+	spin_lock(&pin_lock);
+	hlist_del_init(&pin->m_list);
+	hlist_del_init(&pin->s_list);
+	spin_unlock(&pin_lock);
+	spin_lock_irq(&pin->wait.lock);
+	pin->done = 1;
+	wake_up_locked(&pin->wait);
+	spin_unlock_irq(&pin->wait.lock);
+}
+
+void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
+{
+	spin_lock(&pin_lock);
+	if (p)
+		hlist_add_head(&pin->s_list, p);
+	hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
+	spin_unlock(&pin_lock);
+}
+
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+{
+	pin_insert_group(pin, m, &m->mnt_sb->s_pins);
+}
+
+void pin_kill(struct fs_pin *p)
+{
+	wait_queue_t wait;
+
+	if (!p) {
+		rcu_read_unlock();
+		return;
+	}
+	init_wait(&wait);
+	spin_lock_irq(&p->wait.lock);
+	if (likely(!p->done)) {
+		p->done = -1;
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		p->kill(p);
+		return;
+	}
+	if (p->done > 0) {
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		return;
+	}
+	__add_wait_queue(&p->wait, &wait);
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		schedule();
+		rcu_read_lock();
+		if (likely(list_empty(&wait.task_list)))
+			break;
+		/* OK, we know p couldn't have been freed yet */
+		spin_lock_irq(&p->wait.lock);
+		if (p->done > 0) {
+			spin_unlock_irq(&p->wait.lock);
+			break;
+		}
+	}
+	rcu_read_unlock();
+}
+
+void mnt_pin_kill(struct mount *m)
+{
+	while (1) {
+		struct hlist_node *p;
+		rcu_read_lock();
+		p = ACCESS_ONCE(m->mnt_pins.first);
+		if (!p) {
+			rcu_read_unlock();
+			break;
+		}
+		pin_kill(hlist_entry(p, struct fs_pin, m_list));
+	}
+}
+
+void group_pin_kill(struct hlist_head *p)
+{
+	while (1) {
+		struct hlist_node *q;
+		rcu_read_lock();
+		q = ACCESS_ONCE(p->first);
+		if (!q) {
+			rcu_read_unlock();
+			break;
+		}
+		pin_kill(hlist_entry(q, struct fs_pin, s_list));
+	}
+}
diff --git a/kernel/fs/fs_struct.c b/kernel/fs/fs_struct.c
new file mode 100644
index 000000000..7dca743b2
--- /dev/null
+++ b/kernel/fs/fs_struct.c
@@ -0,0 +1,166 @@
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+#include <linux/fs_struct.h>
+#include "internal.h"
+
+/*
+ * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
+ * It can block.
+ */
+void set_fs_root(struct fs_struct *fs, const struct path *path)
+{
+	struct path old_root;
+
+	path_get(path);
+	spin_lock(&fs->lock);
+	write_seqcount_begin(&fs->seq);
+	old_root = fs->root;
+	fs->root = *path;
+	write_seqcount_end(&fs->seq);
+	spin_unlock(&fs->lock);
+	if (old_root.dentry)
+		path_put(&old_root);
+}
+
+/*
+ * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
+ * It can block.
+ */
+void set_fs_pwd(struct fs_struct *fs, const struct path *path)
+{
+	struct path old_pwd;
+
+	path_get(path);
+	spin_lock(&fs->lock);
+	write_seqcount_begin(&fs->seq);
+	old_pwd = fs->pwd;
+	fs->pwd = *path;
+	write_seqcount_end(&fs->seq);
+	spin_unlock(&fs->lock);
+
+	if (old_pwd.dentry)
+		path_put(&old_pwd);
+}
+
+static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
+{
+	if (likely(p->dentry != old->dentry || p->mnt != old->mnt))
+		return 0;
+	*p = *new;
+	return 1;
+}
+
+void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
+{
+	struct task_struct *g, *p;
+	struct fs_struct *fs;
+	int count = 0;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		task_lock(p);
+		fs = p->fs;
+		if (fs) {
+			int hits = 0;
+			spin_lock(&fs->lock);
+			write_seqcount_begin(&fs->seq);
+			hits += replace_path(&fs->root, old_root, new_root);
+			hits += replace_path(&fs->pwd, old_root, new_root);
+			write_seqcount_end(&fs->seq);
+			while (hits--) {
+				count++;
+				path_get(new_root);
+			}
+			spin_unlock(&fs->lock);
+		}
+		task_unlock(p);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+	while (count--)
+		path_put(old_root);
+}
+
+void free_fs_struct(struct fs_struct *fs)
+{
+	path_put(&fs->root);
+	path_put(&fs->pwd);
+	kmem_cache_free(fs_cachep, fs);
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+	struct fs_struct *fs = tsk->fs;
+
+	if (fs) {
+		int kill;
+		task_lock(tsk);
+		spin_lock(&fs->lock);
+		tsk->fs = NULL;
+		kill = !--fs->users;
+		spin_unlock(&fs->lock);
+		task_unlock(tsk);
+		if (kill)
+			free_fs_struct(fs);
+	}
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+	/* We don't need to lock fs - think why ;-) */
+	if (fs) {
+		fs->users = 1;
+		fs->in_exec = 0;
+		spin_lock_init(&fs->lock);
+		seqcount_init(&fs->seq);
+		fs->umask = old->umask;
+
+		spin_lock(&old->lock);
+		fs->root = old->root;
+		path_get(&fs->root);
+		fs->pwd = old->pwd;
+		path_get(&fs->pwd);
+		spin_unlock(&old->lock);
+	}
+	return fs;
+}
+
+int unshare_fs_struct(void)
+{
+	struct fs_struct *fs = current->fs;
+	struct fs_struct *new_fs = copy_fs_struct(fs);
+	int kill;
+
+	if (!new_fs)
+		return -ENOMEM;
+
+	task_lock(current);
+	spin_lock(&fs->lock);
+	kill = !--fs->users;
+	current->fs = new_fs;
+	spin_unlock(&fs->lock);
+	task_unlock(current);
+
+	if (kill)
+		free_fs_struct(fs);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unshare_fs_struct);
+
+int current_umask(void)
+{
+	return current->fs->umask;
+}
+EXPORT_SYMBOL(current_umask);
+
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+	.users		= 1,
+	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
+	.seq		= SEQCNT_ZERO(init_fs.seq),
+	.umask		= 0022,
+};
diff --git a/kernel/fs/fscache/Kconfig b/kernel/fs/fscache/Kconfig
new file mode 100644
index 000000000..3f6dfa989
--- /dev/null
+++ b/kernel/fs/fscache/Kconfig
@@ -0,0 +1,61 @@
+
+config FSCACHE
+	tristate "General filesystem local caching manager"
+	help
+	  This option enables a generic filesystem caching manager that can be
+	  used by various network and other filesystems to cache data locally.
+	  Different sorts of caches can be plugged in, depending on the
+	  resources available.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_STATS
+	bool "Gather statistical information on local caching"
+	depends on FSCACHE && PROC_FS
+	help
+	  This option causes statistical information to be gathered on local
+	  caching and exported through file:
+
+		/proc/fs/fscache/stats
+
+	  The gathering of statistics adds a certain amount of overhead to
+	  execution as there are a quite a few stats gathered, and on a
+	  multi-CPU system these may be on cachelines that keep bouncing
+	  between CPUs.  On the other hand, the stats are very useful for
+	  debugging purposes.  Saying 'Y' here is recommended.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_HISTOGRAM
+	bool "Gather latency information on local caching"
+	depends on FSCACHE && PROC_FS
+	help
+	  This option causes latency information to be gathered on local
+	  caching and exported through file:
+
+		/proc/fs/fscache/histogram
+
+	  The generation of this histogram adds a certain amount of overhead to
+	  execution as there are a number of points at which data is gathered,
+	  and on a multi-CPU system these may be on cachelines that keep
+	  bouncing between CPUs.  On the other hand, the histogram may be
+	  useful for debugging purposes.  Saying 'N' here is recommended.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_DEBUG
+	bool "Debug FS-Cache"
+	depends on FSCACHE
+	help
+	  This permits debugging to be dynamically enabled in the local caching
+	  management module.  If this is set, the debugging output may be
+	  enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_OBJECT_LIST
+	bool "Maintain global object list for debugging purposes"
+	depends on FSCACHE && PROC_FS
+	help
+	  Maintain a global list of active fscache objects that can be
+	  retrieved through /proc/fs/fscache/objects for debugging purposes
diff --git a/kernel/fs/fscache/Makefile b/kernel/fs/fscache/Makefile
new file mode 100644
index 000000000..6d561531c
--- /dev/null
+++ b/kernel/fs/fscache/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for general filesystem caching code
+#
+
+fscache-y := \
+	cache.o \
+	cookie.o \
+	fsdef.o \
+	main.o \
+	netfs.o \
+	object.o \
+	operation.o \
+	page.o
+
+fscache-$(CONFIG_PROC_FS) += proc.o
+fscache-$(CONFIG_FSCACHE_STATS) += stats.o
+fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
+
+obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/kernel/fs/fscache/cache.c b/kernel/fs/fscache/cache.c
new file mode 100644
index 000000000..56cce7fdd
--- /dev/null
+++ b/kernel/fs/fscache/cache.c
@@ -0,0 +1,421 @@
+/* FS-Cache cache handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+LIST_HEAD(fscache_cache_list);
+DECLARE_RWSEM(fscache_addremove_sem);
+DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
+EXPORT_SYMBOL(fscache_cache_cleared_wq);
+
+static LIST_HEAD(fscache_cache_tag_list);
+
+/*
+ * look up a cache tag
+ */
+struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
+{
+	struct fscache_cache_tag *tag, *xtag;
+
+	/* firstly check for the existence of the tag under read lock */
+	down_read(&fscache_addremove_sem);
+
+	list_for_each_entry(tag, &fscache_cache_tag_list, link) {
+		if (strcmp(tag->name, name) == 0) {
+			atomic_inc(&tag->usage);
+			up_read(&fscache_addremove_sem);
+			return tag;
+		}
+	}
+
+	up_read(&fscache_addremove_sem);
+
+	/* the tag does not exist - create a candidate */
+	xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
+	if (!xtag)
+		/* return a dummy tag if out of memory */
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&xtag->usage, 1);
+	strcpy(xtag->name, name);
+
+	/* write lock, search again and add if still not present */
+	down_write(&fscache_addremove_sem);
+
+	list_for_each_entry(tag, &fscache_cache_tag_list, link) {
+		if (strcmp(tag->name, name) == 0) {
+			atomic_inc(&tag->usage);
+			up_write(&fscache_addremove_sem);
+			kfree(xtag);
+			return tag;
+		}
+	}
+
+	list_add_tail(&xtag->link, &fscache_cache_tag_list);
+	up_write(&fscache_addremove_sem);
+	return xtag;
+}
+
+/*
+ * release a reference to a cache tag
+ */
+void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
+{
+	if (tag != ERR_PTR(-ENOMEM)) {
+		down_write(&fscache_addremove_sem);
+
+		if (atomic_dec_and_test(&tag->usage))
+			list_del_init(&tag->link);
+		else
+			tag = NULL;
+
+		up_write(&fscache_addremove_sem);
+
+		kfree(tag);
+	}
+}
+
+/*
+ * select a cache in which to store an object
+ * - the cache addremove semaphore must be at least read-locked by the caller
+ * - the object will never be an index
+ */
+struct fscache_cache *fscache_select_cache_for_object(
+	struct fscache_cookie *cookie)
+{
+	struct fscache_cache_tag *tag;
+	struct fscache_object *object;
+	struct fscache_cache *cache;
+
+	_enter("");
+
+	if (list_empty(&fscache_cache_list)) {
+		_leave(" = NULL [no cache]");
+		return NULL;
+	}
+
+	/* we check the parent to determine the cache to use */
+	spin_lock(&cookie->lock);
+
+	/* the first in the parent's backing list should be the preferred
+	 * cache */
+	if (!hlist_empty(&cookie->backing_objects)) {
+		object = hlist_entry(cookie->backing_objects.first,
+				     struct fscache_object, cookie_link);
+
+		cache = object->cache;
+		if (fscache_object_is_dying(object) ||
+		    test_bit(FSCACHE_IOERROR, &cache->flags))
+			cache = NULL;
+
+		spin_unlock(&cookie->lock);
+		_leave(" = %p [parent]", cache);
+		return cache;
+	}
+
+	/* the parent is unbacked */
+	if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		/* cookie not an index and is unbacked */
+		spin_unlock(&cookie->lock);
+		_leave(" = NULL [cookie ub,ni]");
+		return NULL;
+	}
+
+	spin_unlock(&cookie->lock);
+
+	if (!cookie->def->select_cache)
+		goto no_preference;
+
+	/* ask the netfs for its preference */
+	tag = cookie->def->select_cache(cookie->parent->netfs_data,
+					cookie->netfs_data);
+	if (!tag)
+		goto no_preference;
+
+	if (tag == ERR_PTR(-ENOMEM)) {
+		_leave(" = NULL [nomem tag]");
+		return NULL;
+	}
+
+	if (!tag->cache) {
+		_leave(" = NULL [unbacked tag]");
+		return NULL;
+	}
+
+	if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
+		return NULL;
+
+	_leave(" = %p [specific]", tag->cache);
+	return tag->cache;
+
+no_preference:
+	/* netfs has no preference - just select first cache */
+	cache = list_entry(fscache_cache_list.next,
+			   struct fscache_cache, link);
+	_leave(" = %p [first]", cache);
+	return cache;
+}
+
+/**
+ * fscache_init_cache - Initialise a cache record
+ * @cache: The cache record to be initialised
+ * @ops: The cache operations to be installed in that record
+ * @idfmt: Format string to define identifier
+ * @...: sprintf-style arguments
+ *
+ * Initialise a record of a cache and fill in the name.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_init_cache(struct fscache_cache *cache,
+			const struct fscache_cache_ops *ops,
+			const char *idfmt,
+			...)
+{
+	va_list va;
+
+	memset(cache, 0, sizeof(*cache));
+
+	cache->ops = ops;
+
+	va_start(va, idfmt);
+	vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
+	va_end(va);
+
+	INIT_WORK(&cache->op_gc, fscache_operation_gc);
+	INIT_LIST_HEAD(&cache->link);
+	INIT_LIST_HEAD(&cache->object_list);
+	INIT_LIST_HEAD(&cache->op_gc_list);
+	spin_lock_init(&cache->object_list_lock);
+	spin_lock_init(&cache->op_gc_list_lock);
+}
+EXPORT_SYMBOL(fscache_init_cache);
+
+/**
+ * fscache_add_cache - Declare a cache as being open for business
+ * @cache: The record describing the cache
+ * @ifsdef: The record of the cache object describing the top-level index
+ * @tagname: The tag describing this cache
+ *
+ * Add a cache to the system, making it available for netfs's to use.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+int fscache_add_cache(struct fscache_cache *cache,
+		      struct fscache_object *ifsdef,
+		      const char *tagname)
+{
+	struct fscache_cache_tag *tag;
+
+	BUG_ON(!cache->ops);
+	BUG_ON(!ifsdef);
+
+	cache->flags = 0;
+	ifsdef->event_mask =
+		((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
+		~(1 << FSCACHE_OBJECT_EV_CLEARED);
+	__set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
+
+	if (!tagname)
+		tagname = cache->identifier;
+
+	BUG_ON(!tagname[0]);
+
+	_enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
+
+	/* we use the cache tag to uniquely identify caches */
+	tag = __fscache_lookup_cache_tag(tagname);
+	if (IS_ERR(tag))
+		goto nomem;
+
+	if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
+		goto tag_in_use;
+
+	cache->kobj = kobject_create_and_add(tagname, fscache_root);
+	if (!cache->kobj)
+		goto error;
+
+	ifsdef->cookie = &fscache_fsdef_index;
+	ifsdef->cache = cache;
+	cache->fsdef = ifsdef;
+
+	down_write(&fscache_addremove_sem);
+
+	tag->cache = cache;
+	cache->tag = tag;
+
+	/* add the cache to the list */
+	list_add(&cache->link, &fscache_cache_list);
+
+	/* add the cache's netfs definition index object to the cache's
+	 * list */
+	spin_lock(&cache->object_list_lock);
+	list_add_tail(&ifsdef->cache_link, &cache->object_list);
+	spin_unlock(&cache->object_list_lock);
+	fscache_objlist_add(ifsdef);
+
+	/* add the cache's netfs definition index object to the top level index
+	 * cookie as a known backing object */
+	spin_lock(&fscache_fsdef_index.lock);
+
+	hlist_add_head(&ifsdef->cookie_link,
+		       &fscache_fsdef_index.backing_objects);
+
+	atomic_inc(&fscache_fsdef_index.usage);
+
+	/* done */
+	spin_unlock(&fscache_fsdef_index.lock);
+	up_write(&fscache_addremove_sem);
+
+	pr_notice("Cache \"%s\" added (type %s)\n",
+		  cache->tag->name, cache->ops->name);
+	kobject_uevent(cache->kobj, KOBJ_ADD);
+
+	_leave(" = 0 [%s]", cache->identifier);
+	return 0;
+
+tag_in_use:
+	pr_err("Cache tag '%s' already in use\n", tagname);
+	__fscache_release_cache_tag(tag);
+	_leave(" = -EXIST");
+	return -EEXIST;
+
+error:
+	__fscache_release_cache_tag(tag);
+	_leave(" = -EINVAL");
+	return -EINVAL;
+
+nomem:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(fscache_add_cache);
+
+/**
+ * fscache_io_error - Note a cache I/O error
+ * @cache: The record describing the cache
+ *
+ * Note that an I/O error occurred in a cache and that it should no longer be
+ * used for anything.  This also reports the error into the kernel log.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_io_error(struct fscache_cache *cache)
+{
+	if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags))
+		pr_err("Cache '%s' stopped due to I/O error\n",
+		       cache->ops->name);
+}
+EXPORT_SYMBOL(fscache_io_error);
+
+/*
+ * request withdrawal of all the objects in a cache
+ * - all the objects being withdrawn are moved onto the supplied list
+ */
+static void fscache_withdraw_all_objects(struct fscache_cache *cache,
+					 struct list_head *dying_objects)
+{
+	struct fscache_object *object;
+
+	while (!list_empty(&cache->object_list)) {
+		spin_lock(&cache->object_list_lock);
+
+		if (!list_empty(&cache->object_list)) {
+			object = list_entry(cache->object_list.next,
+					    struct fscache_object, cache_link);
+			list_move_tail(&object->cache_link, dying_objects);
+
+			_debug("withdraw %p", object->cookie);
+
+			/* This must be done under object_list_lock to prevent
+			 * a race with fscache_drop_object().
+			 */
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+		}
+
+		spin_unlock(&cache->object_list_lock);
+		cond_resched();
+	}
+}
+
+/**
+ * fscache_withdraw_cache - Withdraw a cache from the active service
+ * @cache: The record describing the cache
+ *
+ * Withdraw a cache from service, unbinding all its cache objects from the
+ * netfs cookies they're currently representing.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_withdraw_cache(struct fscache_cache *cache)
+{
+	LIST_HEAD(dying_objects);
+
+	_enter("");
+
+	pr_notice("Withdrawing cache \"%s\"\n",
+		  cache->tag->name);
+
+	/* make the cache unavailable for cookie acquisition */
+	if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
+		BUG();
+
+	down_write(&fscache_addremove_sem);
+	list_del_init(&cache->link);
+	cache->tag->cache = NULL;
+	up_write(&fscache_addremove_sem);
+
+	/* make sure all pages pinned by operations on behalf of the netfs are
+	 * written to disk */
+	fscache_stat(&fscache_n_cop_sync_cache);
+	cache->ops->sync_cache(cache);
+	fscache_stat_d(&fscache_n_cop_sync_cache);
+
+	/* dissociate all the netfs pages backed by this cache from the block
+	 * mappings in the cache */
+	fscache_stat(&fscache_n_cop_dissociate_pages);
+	cache->ops->dissociate_pages(cache);
+	fscache_stat_d(&fscache_n_cop_dissociate_pages);
+
+	/* we now have to destroy all the active objects pertaining to this
+	 * cache - which we do by passing them off to thread pool to be
+	 * disposed of */
+	_debug("destroy");
+
+	fscache_withdraw_all_objects(cache, &dying_objects);
+
+	/* wait for all extant objects to finish their outstanding operations
+	 * and go away */
+	_debug("wait for finish");
+	wait_event(fscache_cache_cleared_wq,
+		   atomic_read(&cache->object_count) == 0);
+	_debug("wait for clearance");
+	wait_event(fscache_cache_cleared_wq,
+		   list_empty(&cache->object_list));
+	_debug("cleared");
+	ASSERT(list_empty(&dying_objects));
+
+	kobject_put(cache->kobj);
+
+	clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
+	fscache_release_cache_tag(cache->tag);
+	cache->tag = NULL;
+
+	_leave("");
+}
+EXPORT_SYMBOL(fscache_withdraw_cache);
diff --git a/kernel/fs/fscache/cookie.c b/kernel/fs/fscache/cookie.c
new file mode 100644
index 000000000..89acec742
--- /dev/null
+++ b/kernel/fs/fscache/cookie.c
@@ -0,0 +1,722 @@
+/* netfs cookie management
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for more information on
+ * the netfs API.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct kmem_cache *fscache_cookie_jar;
+
+static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
+
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
+static int fscache_alloc_object(struct fscache_cache *cache,
+				struct fscache_cookie *cookie);
+static int fscache_attach_object(struct fscache_cookie *cookie,
+				 struct fscache_object *object);
+
+/*
+ * initialise an cookie jar slab element prior to any use
+ */
+void fscache_cookie_init_once(void *_cookie)
+{
+	struct fscache_cookie *cookie = _cookie;
+
+	memset(cookie, 0, sizeof(*cookie));
+	spin_lock_init(&cookie->lock);
+	spin_lock_init(&cookie->stores_lock);
+	INIT_HLIST_HEAD(&cookie->backing_objects);
+}
+
+/*
+ * request a cookie to represent an object (index, datafile, xattr, etc)
+ * - parent specifies the parent object
+ *   - the top level index cookie for each netfs is stored in the fscache_netfs
+ *     struct upon registration
+ * - def points to the definition
+ * - the netfs_data will be passed to the functions pointed to in *def
+ * - all attached caches will be searched to see if they contain this object
+ * - index objects aren't stored on disk until there's a dependent file that
+ *   needs storing
+ * - other objects are stored in a selected cache immediately, and all the
+ *   indices forming the path to it are instantiated if necessary
+ * - we never let on to the netfs about errors
+ *   - we may set a negative cookie pointer, but that's okay
+ */
+struct fscache_cookie *__fscache_acquire_cookie(
+	struct fscache_cookie *parent,
+	const struct fscache_cookie_def *def,
+	void *netfs_data,
+	bool enable)
+{
+	struct fscache_cookie *cookie;
+
+	BUG_ON(!def);
+
+	_enter("{%s},{%s},%p,%u",
+	       parent ? (char *) parent->def->name : "<no-parent>",
+	       def->name, netfs_data, enable);
+
+	fscache_stat(&fscache_n_acquires);
+
+	/* if there's no parent cookie, then we don't create one here either */
+	if (!parent) {
+		fscache_stat(&fscache_n_acquires_null);
+		_leave(" [no parent]");
+		return NULL;
+	}
+
+	/* validate the definition */
+	BUG_ON(!def->get_key);
+	BUG_ON(!def->name[0]);
+
+	BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
+	       parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
+
+	/* allocate and initialise a cookie */
+	cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
+	if (!cookie) {
+		fscache_stat(&fscache_n_acquires_oom);
+		_leave(" [ENOMEM]");
+		return NULL;
+	}
+
+	atomic_set(&cookie->usage, 1);
+	atomic_set(&cookie->n_children, 0);
+
+	/* We keep the active count elevated until relinquishment to prevent an
+	 * attempt to wake up every time the object operations queue quiesces.
+	 */
+	atomic_set(&cookie->n_active, 1);
+
+	atomic_inc(&parent->usage);
+	atomic_inc(&parent->n_children);
+
+	cookie->def		= def;
+	cookie->parent		= parent;
+	cookie->netfs_data	= netfs_data;
+	cookie->flags		= (1 << FSCACHE_COOKIE_NO_DATA_YET);
+
+	/* radix tree insertion won't use the preallocation pool unless it's
+	 * told it may not wait */
+	INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
+
+	switch (cookie->def->type) {
+	case FSCACHE_COOKIE_TYPE_INDEX:
+		fscache_stat(&fscache_n_cookie_index);
+		break;
+	case FSCACHE_COOKIE_TYPE_DATAFILE:
+		fscache_stat(&fscache_n_cookie_data);
+		break;
+	default:
+		fscache_stat(&fscache_n_cookie_special);
+		break;
+	}
+
+	if (enable) {
+		/* if the object is an index then we need do nothing more here
+		 * - we create indices on disk when we need them as an index
+		 * may exist in multiple caches */
+		if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+			if (fscache_acquire_non_index_cookie(cookie) == 0) {
+				set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+			} else {
+				atomic_dec(&parent->n_children);
+				__fscache_cookie_put(cookie);
+				fscache_stat(&fscache_n_acquires_nobufs);
+				_leave(" = NULL");
+				return NULL;
+			}
+		} else {
+			set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+		}
+	}
+
+	fscache_stat(&fscache_n_acquires_ok);
+	_leave(" = %p", cookie);
+	return cookie;
+}
+EXPORT_SYMBOL(__fscache_acquire_cookie);
+
+/*
+ * Enable a cookie to permit it to accept new operations.
+ */
+void __fscache_enable_cookie(struct fscache_cookie *cookie,
+			     bool (*can_enable)(void *data),
+			     void *data)
+{
+	_enter("%p", cookie);
+
+	wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+			 TASK_UNINTERRUPTIBLE);
+
+	if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+		goto out_unlock;
+
+	if (can_enable && !can_enable(data)) {
+		/* The netfs decided it didn't want to enable after all */
+	} else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		/* Wait for outstanding disablement to complete */
+		__fscache_wait_on_invalidate(cookie);
+
+		if (fscache_acquire_non_index_cookie(cookie) == 0)
+			set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	} else {
+		set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	}
+
+out_unlock:
+	clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+}
+EXPORT_SYMBOL(__fscache_enable_cookie);
+
+/*
+ * acquire a non-index cookie
+ * - this must make sure the index chain is instantiated and instantiate the
+ *   object representation too
+ */
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
+{
+	struct fscache_object *object;
+	struct fscache_cache *cache;
+	uint64_t i_size;
+	int ret;
+
+	_enter("");
+
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+
+	/* now we need to see whether the backing objects for this cookie yet
+	 * exist, if not there'll be nothing to search */
+	down_read(&fscache_addremove_sem);
+
+	if (list_empty(&fscache_cache_list)) {
+		up_read(&fscache_addremove_sem);
+		_leave(" = 0 [no caches]");
+		return 0;
+	}
+
+	/* select a cache in which to store the object */
+	cache = fscache_select_cache_for_object(cookie->parent);
+	if (!cache) {
+		up_read(&fscache_addremove_sem);
+		fscache_stat(&fscache_n_acquires_no_cache);
+		_leave(" = -ENOMEDIUM [no cache]");
+		return -ENOMEDIUM;
+	}
+
+	_debug("cache %s", cache->tag->name);
+
+	set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+
+	/* ask the cache to allocate objects for this cookie and its parent
+	 * chain */
+	ret = fscache_alloc_object(cache, cookie);
+	if (ret < 0) {
+		up_read(&fscache_addremove_sem);
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* pass on how big the object we're caching is supposed to be */
+	cookie->def->get_attr(cookie->netfs_data, &i_size);
+
+	spin_lock(&cookie->lock);
+	if (hlist_empty(&cookie->backing_objects)) {
+		spin_unlock(&cookie->lock);
+		goto unavailable;
+	}
+
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	fscache_set_store_limit(object, i_size);
+
+	/* initiate the process of looking up all the objects in the chain
+	 * (done by fscache_initialise_object()) */
+	fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
+
+	spin_unlock(&cookie->lock);
+
+	/* we may be required to wait for lookup to complete at this point */
+	if (!fscache_defer_lookup) {
+		_debug("non-deferred lookup %p", &cookie->flags);
+		wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
+			    TASK_UNINTERRUPTIBLE);
+		_debug("complete");
+		if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
+			goto unavailable;
+	}
+
+	up_read(&fscache_addremove_sem);
+	_leave(" = 0 [deferred]");
+	return 0;
+
+unavailable:
+	up_read(&fscache_addremove_sem);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+
+/*
+ * recursively allocate cache object records for a cookie/cache combination
+ * - caller must be holding the addremove sem
+ */
+static int fscache_alloc_object(struct fscache_cache *cache,
+				struct fscache_cookie *cookie)
+{
+	struct fscache_object *object;
+	int ret;
+
+	_enter("%p,%p{%s}", cache, cookie, cookie->def->name);
+
+	spin_lock(&cookie->lock);
+	hlist_for_each_entry(object, &cookie->backing_objects,
+			     cookie_link) {
+		if (object->cache == cache)
+			goto object_already_extant;
+	}
+	spin_unlock(&cookie->lock);
+
+	/* ask the cache to allocate an object (we may end up with duplicate
+	 * objects at this stage, but we sort that out later) */
+	fscache_stat(&fscache_n_cop_alloc_object);
+	object = cache->ops->alloc_object(cache, cookie);
+	fscache_stat_d(&fscache_n_cop_alloc_object);
+	if (IS_ERR(object)) {
+		fscache_stat(&fscache_n_object_no_alloc);
+		ret = PTR_ERR(object);
+		goto error;
+	}
+
+	fscache_stat(&fscache_n_object_alloc);
+
+	object->debug_id = atomic_inc_return(&fscache_object_debug_id);
+
+	_debug("ALLOC OBJ%x: %s {%lx}",
+	       object->debug_id, cookie->def->name, object->events);
+
+	ret = fscache_alloc_object(cache, cookie->parent);
+	if (ret < 0)
+		goto error_put;
+
+	/* only attach if we managed to allocate all we needed, otherwise
+	 * discard the object we just allocated and instead use the one
+	 * attached to the cookie */
+	if (fscache_attach_object(cookie, object) < 0) {
+		fscache_stat(&fscache_n_cop_put_object);
+		cache->ops->put_object(object);
+		fscache_stat_d(&fscache_n_cop_put_object);
+	}
+
+	_leave(" = 0");
+	return 0;
+
+object_already_extant:
+	ret = -ENOBUFS;
+	if (fscache_object_is_dead(object)) {
+		spin_unlock(&cookie->lock);
+		goto error;
+	}
+	spin_unlock(&cookie->lock);
+	_leave(" = 0 [found]");
+	return 0;
+
+error_put:
+	fscache_stat(&fscache_n_cop_put_object);
+	cache->ops->put_object(object);
+	fscache_stat_d(&fscache_n_cop_put_object);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * attach a cache object to a cookie
+ */
+static int fscache_attach_object(struct fscache_cookie *cookie,
+				 struct fscache_object *object)
+{
+	struct fscache_object *p;
+	struct fscache_cache *cache = object->cache;
+	int ret;
+
+	_enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
+
+	spin_lock(&cookie->lock);
+
+	/* there may be multiple initial creations of this object, but we only
+	 * want one */
+	ret = -EEXIST;
+	hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
+		if (p->cache == object->cache) {
+			if (fscache_object_is_dying(p))
+				ret = -ENOBUFS;
+			goto cant_attach_object;
+		}
+	}
+
+	/* pin the parent object */
+	spin_lock_nested(&cookie->parent->lock, 1);
+	hlist_for_each_entry(p, &cookie->parent->backing_objects,
+			     cookie_link) {
+		if (p->cache == object->cache) {
+			if (fscache_object_is_dying(p)) {
+				ret = -ENOBUFS;
+				spin_unlock(&cookie->parent->lock);
+				goto cant_attach_object;
+			}
+			object->parent = p;
+			spin_lock(&p->lock);
+			p->n_children++;
+			spin_unlock(&p->lock);
+			break;
+		}
+	}
+	spin_unlock(&cookie->parent->lock);
+
+	/* attach to the cache's object list */
+	if (list_empty(&object->cache_link)) {
+		spin_lock(&cache->object_list_lock);
+		list_add(&object->cache_link, &cache->object_list);
+		spin_unlock(&cache->object_list_lock);
+	}
+
+	/* attach to the cookie */
+	object->cookie = cookie;
+	atomic_inc(&cookie->usage);
+	hlist_add_head(&object->cookie_link, &cookie->backing_objects);
+
+	fscache_objlist_add(object);
+	ret = 0;
+
+cant_attach_object:
+	spin_unlock(&cookie->lock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Invalidate an object.  Callable with spinlocks held.
+ */
+void __fscache_invalidate(struct fscache_cookie *cookie)
+{
+	struct fscache_object *object;
+
+	_enter("{%s}", cookie->def->name);
+
+	fscache_stat(&fscache_n_invalidates);
+
+	/* Only permit invalidation of data files.  Invalidating an index will
+	 * require the caller to release all its attachments to the tree rooted
+	 * there, and if it's doing that, it may as well just retire the
+	 * cookie.
+	 */
+	ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+
+	/* We will be updating the cookie too. */
+	BUG_ON(!cookie->def->get_aux);
+
+	/* If there's an object, we tell the object state machine to handle the
+	 * invalidation on our behalf, otherwise there's nothing to do.
+	 */
+	if (!hlist_empty(&cookie->backing_objects)) {
+		spin_lock(&cookie->lock);
+
+		if (fscache_cookie_enabled(cookie) &&
+		    !hlist_empty(&cookie->backing_objects) &&
+		    !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
+				      &cookie->flags)) {
+			object = hlist_entry(cookie->backing_objects.first,
+					     struct fscache_object,
+					     cookie_link);
+			if (fscache_object_is_live(object))
+				fscache_raise_event(
+					object, FSCACHE_OBJECT_EV_INVALIDATE);
+		}
+
+		spin_unlock(&cookie->lock);
+	}
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_invalidate);
+
+/*
+ * Wait for object invalidation to complete.
+ */
+void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
+{
+	_enter("%p", cookie);
+
+	wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
+		    TASK_UNINTERRUPTIBLE);
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_wait_on_invalidate);
+
+/*
+ * update the index entries backing a cookie
+ */
+void __fscache_update_cookie(struct fscache_cookie *cookie)
+{
+	struct fscache_object *object;
+
+	fscache_stat(&fscache_n_updates);
+
+	if (!cookie) {
+		fscache_stat(&fscache_n_updates_null);
+		_leave(" [no cookie]");
+		return;
+	}
+
+	_enter("{%s}", cookie->def->name);
+
+	BUG_ON(!cookie->def->get_aux);
+
+	spin_lock(&cookie->lock);
+
+	if (fscache_cookie_enabled(cookie)) {
+		/* update the index entry on disk in each cache backing this
+		 * cookie.
+		 */
+		hlist_for_each_entry(object,
+				     &cookie->backing_objects, cookie_link) {
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+		}
+	}
+
+	spin_unlock(&cookie->lock);
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_update_cookie);
+
+/*
+ * Disable a cookie to stop it from accepting new requests from the netfs.
+ */
+void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
+{
+	struct fscache_object *object;
+	bool awaken = false;
+
+	_enter("%p,%u", cookie, invalidate);
+
+	ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
+
+	if (atomic_read(&cookie->n_children) != 0) {
+		pr_err("Cookie '%s' still has children\n",
+		       cookie->def->name);
+		BUG();
+	}
+
+	wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+			 TASK_UNINTERRUPTIBLE);
+	if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+		goto out_unlock_enable;
+
+	/* If the cookie is being invalidated, wait for that to complete first
+	 * so that we can reuse the flag.
+	 */
+	__fscache_wait_on_invalidate(cookie);
+
+	/* Dispose of the backing objects */
+	set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags);
+
+	spin_lock(&cookie->lock);
+	if (!hlist_empty(&cookie->backing_objects)) {
+		hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+			if (invalidate)
+				set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+		}
+	} else {
+		if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+			awaken = true;
+	}
+	spin_unlock(&cookie->lock);
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+
+	/* Wait for cessation of activity requiring access to the netfs (when
+	 * n_active reaches 0).  This makes sure outstanding reads and writes
+	 * have completed.
+	 */
+	if (!atomic_dec_and_test(&cookie->n_active))
+		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+				 TASK_UNINTERRUPTIBLE);
+
+	/* Reset the cookie state if it wasn't relinquished */
+	if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) {
+		atomic_inc(&cookie->n_active);
+		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+	}
+
+out_unlock_enable:
+	clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_disable_cookie);
+
+/*
+ * release a cookie back to the cache
+ * - the object will be marked as recyclable on disk if retire is true
+ * - all dependents of this cookie must have already been unregistered
+ *   (indices/files/pages)
+ */
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
+{
+	fscache_stat(&fscache_n_relinquishes);
+	if (retire)
+		fscache_stat(&fscache_n_relinquishes_retire);
+
+	if (!cookie) {
+		fscache_stat(&fscache_n_relinquishes_null);
+		_leave(" [no cookie]");
+		return;
+	}
+
+	_enter("%p{%s,%p,%d},%d",
+	       cookie, cookie->def->name, cookie->netfs_data,
+	       atomic_read(&cookie->n_active), retire);
+
+	/* No further netfs-accessing operations on this cookie permitted */
+	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+
+	__fscache_disable_cookie(cookie, retire);
+
+	/* Clear pointers back to the netfs */
+	cookie->netfs_data	= NULL;
+	cookie->def		= NULL;
+	BUG_ON(cookie->stores.rnode);
+
+	if (cookie->parent) {
+		ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
+		ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
+		atomic_dec(&cookie->parent->n_children);
+	}
+
+	/* Dispose of the netfs's link to the cookie */
+	ASSERTCMP(atomic_read(&cookie->usage), >, 0);
+	fscache_cookie_put(cookie);
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_relinquish_cookie);
+
+/*
+ * destroy a cookie
+ */
+void __fscache_cookie_put(struct fscache_cookie *cookie)
+{
+	struct fscache_cookie *parent;
+
+	_enter("%p", cookie);
+
+	for (;;) {
+		_debug("FREE COOKIE %p", cookie);
+		parent = cookie->parent;
+		BUG_ON(!hlist_empty(&cookie->backing_objects));
+		kmem_cache_free(fscache_cookie_jar, cookie);
+
+		if (!parent)
+			break;
+
+		cookie = parent;
+		BUG_ON(atomic_read(&cookie->usage) <= 0);
+		if (!atomic_dec_and_test(&cookie->usage))
+			break;
+	}
+
+	_leave("");
+}
+
+/*
+ * check the consistency between the netfs inode and the backing cache
+ *
+ * NOTE: it only serves no-index type
+ */
+int __fscache_check_consistency(struct fscache_cookie *cookie)
+{
+	struct fscache_operation *op;
+	struct fscache_object *object;
+	bool wake_cookie = false;
+	int ret;
+
+	_enter("%p,", cookie);
+
+	ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+
+	if (fscache_wait_for_deferred_lookup(cookie) < 0)
+		return -ERESTARTSYS;
+
+	if (hlist_empty(&cookie->backing_objects))
+		return 0;
+
+	op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
+	if (!op)
+		return -ENOMEM;
+
+	fscache_operation_init(op, NULL, NULL);
+	op->flags = FSCACHE_OP_MYTHREAD |
+		(1 << FSCACHE_OP_WAITING) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
+
+	spin_lock(&cookie->lock);
+
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
+		goto inconsistent;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+	if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
+		goto inconsistent;
+
+	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+
+	__fscache_use_cookie(cookie);
+	if (fscache_submit_op(object, op) < 0)
+		goto submit_failed;
+
+	/* the work queue now carries its own ref on the object */
+	spin_unlock(&cookie->lock);
+
+	ret = fscache_wait_for_operation_activation(object, op,
+						    NULL, NULL, NULL);
+	if (ret == 0) {
+		/* ask the cache to honour the operation */
+		ret = object->cache->ops->check_consistency(op);
+		fscache_op_complete(op, false);
+	} else if (ret == -ENOBUFS) {
+		ret = 0;
+	}
+
+	fscache_put_operation(op);
+	_leave(" = %d", ret);
+	return ret;
+
+submit_failed:
+	wake_cookie = __fscache_unuse_cookie(cookie);
+inconsistent:
+	spin_unlock(&cookie->lock);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
+	kfree(op);
+	_leave(" = -ESTALE");
+	return -ESTALE;
+}
+EXPORT_SYMBOL(__fscache_check_consistency);
diff --git a/kernel/fs/fscache/fsdef.c b/kernel/fs/fscache/fsdef.c
new file mode 100644
index 000000000..5a117df2a
--- /dev/null
+++ b/kernel/fs/fscache/fsdef.c
@@ -0,0 +1,146 @@
+/* Filesystem index definition
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include "internal.h"
+
+static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax);
+
+static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax);
+
+static
+enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
+						    const void *data,
+						    uint16_t datalen);
+
+/*
+ * The root index is owned by FS-Cache itself.
+ *
+ * When a netfs requests caching facilities, FS-Cache will, if one doesn't
+ * already exist, create an entry in the root index with the key being the name
+ * of the netfs ("AFS" for example), and the auxiliary data holding the index
+ * structure version supplied by the netfs:
+ *
+ *				     FSDEF
+ *				       |
+ *				 +-----------+
+ *				 |           |
+ *				NFS         AFS
+ *			       [v=1]       [v=1]
+ *
+ * If an entry with the appropriate name does already exist, the version is
+ * compared.  If the version is different, the entire subtree from that entry
+ * will be discarded and a new entry created.
+ *
+ * The new entry will be an index, and a cookie referring to it will be passed
+ * to the netfs.  This is then the root handle by which the netfs accesses the
+ * cache.  It can create whatever objects it likes in that index, including
+ * further indices.
+ */
+static struct fscache_cookie_def fscache_fsdef_index_def = {
+	.name		= ".FS-Cache",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+};
+
+struct fscache_cookie fscache_fsdef_index = {
+	.usage		= ATOMIC_INIT(1),
+	.n_active	= ATOMIC_INIT(1),
+	.lock		= __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
+	.backing_objects = HLIST_HEAD_INIT,
+	.def		= &fscache_fsdef_index_def,
+	.flags		= 1 << FSCACHE_COOKIE_ENABLED,
+};
+EXPORT_SYMBOL(fscache_fsdef_index);
+
+/*
+ * Definition of an entry in the root index.  Each entry is an index, keyed to
+ * a specific netfs and only applicable to a particular version of the index
+ * structure used by that netfs.
+ */
+struct fscache_cookie_def fscache_fsdef_netfs_def = {
+	.name		= "FSDEF.netfs",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= fscache_fsdef_netfs_get_key,
+	.get_aux	= fscache_fsdef_netfs_get_aux,
+	.check_aux	= fscache_fsdef_netfs_check_aux,
+};
+
+/*
+ * get the key data for an FSDEF index record - this is the name of the netfs
+ * for which this entry is created
+ */
+static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct fscache_netfs *netfs = cookie_netfs_data;
+	unsigned klen;
+
+	_enter("{%s.%u},", netfs->name, netfs->version);
+
+	klen = strlen(netfs->name);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, netfs->name, klen);
+	return klen;
+}
+
+/*
+ * get the auxiliary data for an FSDEF index record - this is the index
+ * structure version number of the netfs for which this version is created
+ */
+static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct fscache_netfs *netfs = cookie_netfs_data;
+	unsigned dlen;
+
+	_enter("{%s.%u},", netfs->name, netfs->version);
+
+	dlen = sizeof(uint32_t);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &netfs->version, dlen);
+	return dlen;
+}
+
+/*
+ * check that the index structure version number stored in the auxiliary data
+ * matches the one the netfs gave us
+ */
+static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
+	void *cookie_netfs_data,
+	const void *data,
+	uint16_t datalen)
+{
+	struct fscache_netfs *netfs = cookie_netfs_data;
+	uint32_t version;
+
+	_enter("{%s},,%hu", netfs->name, datalen);
+
+	if (datalen != sizeof(version)) {
+		_leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	memcpy(&version, data, sizeof(version));
+	if (version != netfs->version) {
+		_leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
+}
diff --git a/kernel/fs/fscache/histogram.c b/kernel/fs/fscache/histogram.c
new file mode 100644
index 000000000..7d637e233
--- /dev/null
+++ b/kernel/fs/fscache/histogram.c
@@ -0,0 +1,107 @@
+/* FS-Cache latency histogram
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL THREAD
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+atomic_t fscache_obj_instantiate_histogram[HZ];
+atomic_t fscache_objs_histogram[HZ];
+atomic_t fscache_ops_histogram[HZ];
+atomic_t fscache_retrieval_delay_histogram[HZ];
+atomic_t fscache_retrieval_histogram[HZ];
+
+/*
+ * display the time-taken histogram
+ */
+static int fscache_histogram_show(struct seq_file *m, void *v)
+{
+	unsigned long index;
+	unsigned n[5], t;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "JIFS  SECS  OBJ INST  OP RUNS   OBJ RUNS  RETRV DLY RETRIEVLS\n");
+		return 0;
+	case 2:
+		seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n");
+		return 0;
+	default:
+		index = (unsigned long) v - 3;
+		n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
+		n[1] = atomic_read(&fscache_ops_histogram[index]);
+		n[2] = atomic_read(&fscache_objs_histogram[index]);
+		n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
+		n[4] = atomic_read(&fscache_retrieval_histogram[index]);
+		if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
+			return 0;
+
+		t = (index * 1000) / HZ;
+
+		seq_printf(m, "%4lu  0.%03u %9u %9u %9u %9u %9u\n",
+			   index, t, n[0], n[1], n[2], n[3], n[4]);
+		return 0;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
+{
+	if ((unsigned long long)*_pos >= HZ + 2)
+		return NULL;
+	if (*_pos == 0)
+		*_pos = 1;
+	return (void *)(unsigned long) *_pos;
+}
+
+/*
+ * move to the next line
+ */
+static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return (unsigned long long)*pos > HZ + 2 ?
+		NULL : (void *)(unsigned long) *pos;
+}
+
+/*
+ * clean up after reading
+ */
+static void fscache_histogram_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations fscache_histogram_ops = {
+	.start		= fscache_histogram_start,
+	.stop		= fscache_histogram_stop,
+	.next		= fscache_histogram_next,
+	.show		= fscache_histogram_show,
+};
+
+/*
+ * open "/proc/fs/fscache/histogram" to provide latency data
+ */
+static int fscache_histogram_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &fscache_histogram_ops);
+}
+
+const struct file_operations fscache_histogram_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fscache_histogram_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/kernel/fs/fscache/internal.h b/kernel/fs/fscache/internal.h
new file mode 100644
index 000000000..7872a62ef
--- /dev/null
+++ b/kernel/fs/fscache/internal.h
@@ -0,0 +1,464 @@
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Lock order, in the order in which multiple locks should be obtained:
+ * - fscache_addremove_sem
+ * - cookie->lock
+ * - cookie->parent->lock
+ * - cache->object_list_lock
+ * - object->lock
+ * - object->parent->lock
+ * - cookie->stores_lock
+ * - fscache_thread_lock
+ *
+ */
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "FS-Cache: " fmt
+
+#include <linux/fscache-cache.h>
+#include <linux/sched.h>
+
+#define FSCACHE_MIN_THREADS	4
+#define FSCACHE_MAX_THREADS	32
+
+/*
+ * cache.c
+ */
+extern struct list_head fscache_cache_list;
+extern struct rw_semaphore fscache_addremove_sem;
+
+extern struct fscache_cache *fscache_select_cache_for_object(
+	struct fscache_cookie *);
+
+/*
+ * cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+
+extern void fscache_cookie_init_once(void *);
+extern void __fscache_cookie_put(struct fscache_cookie *);
+
+/*
+ * fsdef.c
+ */
+extern struct fscache_cookie fscache_fsdef_index;
+extern struct fscache_cookie_def fscache_fsdef_netfs_def;
+
+/*
+ * histogram.c
+ */
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+extern atomic_t fscache_obj_instantiate_histogram[HZ];
+extern atomic_t fscache_objs_histogram[HZ];
+extern atomic_t fscache_ops_histogram[HZ];
+extern atomic_t fscache_retrieval_delay_histogram[HZ];
+extern atomic_t fscache_retrieval_histogram[HZ];
+
+static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
+{
+	unsigned long jif = jiffies - start_jif;
+	if (jif >= HZ)
+		jif = HZ - 1;
+	atomic_inc(&histogram[jif]);
+}
+
+extern const struct file_operations fscache_histogram_fops;
+
+#else
+#define fscache_hist(hist, start_jif) do {} while (0)
+#endif
+
+/*
+ * main.c
+ */
+extern unsigned fscache_defer_lookup;
+extern unsigned fscache_defer_create;
+extern unsigned fscache_debug;
+extern struct kobject *fscache_root;
+extern struct workqueue_struct *fscache_object_wq;
+extern struct workqueue_struct *fscache_op_wq;
+DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+static inline bool fscache_object_congested(void)
+{
+	return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
+}
+
+extern int fscache_wait_atomic_t(atomic_t *);
+
+/*
+ * object.c
+ */
+extern void fscache_enqueue_object(struct fscache_object *);
+
+/*
+ * object-list.c
+ */
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+extern const struct file_operations fscache_objlist_fops;
+
+extern void fscache_objlist_add(struct fscache_object *);
+extern void fscache_objlist_remove(struct fscache_object *);
+#else
+#define fscache_objlist_add(object) do {} while(0)
+#define fscache_objlist_remove(object) do {} while(0)
+#endif
+
+/*
+ * operation.c
+ */
+extern int fscache_submit_exclusive_op(struct fscache_object *,
+				       struct fscache_operation *);
+extern int fscache_submit_op(struct fscache_object *,
+			     struct fscache_operation *);
+extern int fscache_cancel_op(struct fscache_operation *,
+			     void (*)(struct fscache_operation *));
+extern void fscache_cancel_all_ops(struct fscache_object *);
+extern void fscache_abort_object(struct fscache_object *);
+extern void fscache_start_operations(struct fscache_object *);
+extern void fscache_operation_gc(struct work_struct *);
+
+/*
+ * page.c
+ */
+extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *);
+extern int fscache_wait_for_operation_activation(struct fscache_object *,
+						 struct fscache_operation *,
+						 atomic_t *,
+						 atomic_t *,
+						 void (*)(struct fscache_operation *));
+extern void fscache_invalidate_writes(struct fscache_cookie *);
+
+/*
+ * proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init()	(0)
+#define fscache_proc_cleanup()	do {} while (0)
+#endif
+
+/*
+ * stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
+extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
+
+extern atomic_t fscache_n_op_pend;
+extern atomic_t fscache_n_op_run;
+extern atomic_t fscache_n_op_enqueue;
+extern atomic_t fscache_n_op_deferred_release;
+extern atomic_t fscache_n_op_release;
+extern atomic_t fscache_n_op_gc;
+extern atomic_t fscache_n_op_cancelled;
+extern atomic_t fscache_n_op_rejected;
+
+extern atomic_t fscache_n_attr_changed;
+extern atomic_t fscache_n_attr_changed_ok;
+extern atomic_t fscache_n_attr_changed_nobufs;
+extern atomic_t fscache_n_attr_changed_nomem;
+extern atomic_t fscache_n_attr_changed_calls;
+
+extern atomic_t fscache_n_allocs;
+extern atomic_t fscache_n_allocs_ok;
+extern atomic_t fscache_n_allocs_wait;
+extern atomic_t fscache_n_allocs_nobufs;
+extern atomic_t fscache_n_allocs_intr;
+extern atomic_t fscache_n_allocs_object_dead;
+extern atomic_t fscache_n_alloc_ops;
+extern atomic_t fscache_n_alloc_op_waits;
+
+extern atomic_t fscache_n_retrievals;
+extern atomic_t fscache_n_retrievals_ok;
+extern atomic_t fscache_n_retrievals_wait;
+extern atomic_t fscache_n_retrievals_nodata;
+extern atomic_t fscache_n_retrievals_nobufs;
+extern atomic_t fscache_n_retrievals_intr;
+extern atomic_t fscache_n_retrievals_nomem;
+extern atomic_t fscache_n_retrievals_object_dead;
+extern atomic_t fscache_n_retrieval_ops;
+extern atomic_t fscache_n_retrieval_op_waits;
+
+extern atomic_t fscache_n_stores;
+extern atomic_t fscache_n_stores_ok;
+extern atomic_t fscache_n_stores_again;
+extern atomic_t fscache_n_stores_nobufs;
+extern atomic_t fscache_n_stores_oom;
+extern atomic_t fscache_n_store_ops;
+extern atomic_t fscache_n_store_calls;
+extern atomic_t fscache_n_store_pages;
+extern atomic_t fscache_n_store_radix_deletes;
+extern atomic_t fscache_n_store_pages_over_limit;
+
+extern atomic_t fscache_n_store_vmscan_not_storing;
+extern atomic_t fscache_n_store_vmscan_gone;
+extern atomic_t fscache_n_store_vmscan_busy;
+extern atomic_t fscache_n_store_vmscan_cancelled;
+extern atomic_t fscache_n_store_vmscan_wait;
+
+extern atomic_t fscache_n_marks;
+extern atomic_t fscache_n_uncaches;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_null;
+extern atomic_t fscache_n_acquires_no_cache;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_nobufs;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_invalidates;
+extern atomic_t fscache_n_invalidates_run;
+
+extern atomic_t fscache_n_updates;
+extern atomic_t fscache_n_updates_null;
+extern atomic_t fscache_n_updates_run;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_null;
+extern atomic_t fscache_n_relinquishes_waitcrt;
+extern atomic_t fscache_n_relinquishes_retire;
+
+extern atomic_t fscache_n_cookie_index;
+extern atomic_t fscache_n_cookie_data;
+extern atomic_t fscache_n_cookie_special;
+
+extern atomic_t fscache_n_object_alloc;
+extern atomic_t fscache_n_object_no_alloc;
+extern atomic_t fscache_n_object_lookups;
+extern atomic_t fscache_n_object_lookups_negative;
+extern atomic_t fscache_n_object_lookups_positive;
+extern atomic_t fscache_n_object_lookups_timed_out;
+extern atomic_t fscache_n_object_created;
+extern atomic_t fscache_n_object_avail;
+extern atomic_t fscache_n_object_dead;
+
+extern atomic_t fscache_n_checkaux_none;
+extern atomic_t fscache_n_checkaux_okay;
+extern atomic_t fscache_n_checkaux_update;
+extern atomic_t fscache_n_checkaux_obsolete;
+
+extern atomic_t fscache_n_cop_alloc_object;
+extern atomic_t fscache_n_cop_lookup_object;
+extern atomic_t fscache_n_cop_lookup_complete;
+extern atomic_t fscache_n_cop_grab_object;
+extern atomic_t fscache_n_cop_invalidate_object;
+extern atomic_t fscache_n_cop_update_object;
+extern atomic_t fscache_n_cop_drop_object;
+extern atomic_t fscache_n_cop_put_object;
+extern atomic_t fscache_n_cop_sync_cache;
+extern atomic_t fscache_n_cop_attr_changed;
+extern atomic_t fscache_n_cop_read_or_alloc_page;
+extern atomic_t fscache_n_cop_read_or_alloc_pages;
+extern atomic_t fscache_n_cop_allocate_page;
+extern atomic_t fscache_n_cop_allocate_pages;
+extern atomic_t fscache_n_cop_write_page;
+extern atomic_t fscache_n_cop_uncache_page;
+extern atomic_t fscache_n_cop_dissociate_pages;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+	atomic_inc(stat);
+}
+
+static inline void fscache_stat_d(atomic_t *stat)
+{
+	atomic_dec(stat);
+}
+
+#define __fscache_stat(stat) (stat)
+
+extern const struct file_operations fscache_stats_fops;
+#else
+
+#define __fscache_stat(stat) (NULL)
+#define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
+#endif
+
+/*
+ * raise an event on an object
+ * - if the event is not masked for that object, then the object is
+ *   queued for attention by the thread pool.
+ */
+static inline void fscache_raise_event(struct fscache_object *object,
+				       unsigned event)
+{
+	BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
+#if 0
+	printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
+	       object->debug_id, object->event_mask, (1 << event));
+#endif
+	if (!test_and_set_bit(event, &object->events) &&
+	    test_bit(event, &object->event_mask))
+		fscache_enqueue_object(object);
+}
+
+/*
+ * drop a reference to a cookie
+ */
+static inline void fscache_cookie_put(struct fscache_cookie *cookie)
+{
+	BUG_ON(atomic_read(&cookie->usage) <= 0);
+	if (atomic_dec_and_test(&cookie->usage))
+		__fscache_cookie_put(cookie);
+}
+
+/*
+ * get an extra reference to a netfs retrieval context
+ */
+static inline
+void *fscache_get_context(struct fscache_cookie *cookie, void *context)
+{
+	if (cookie->def->get_context)
+		cookie->def->get_context(cookie->netfs_data, context);
+	return context;
+}
+
+/*
+ * release a reference to a netfs retrieval context
+ */
+static inline
+void fscache_put_context(struct fscache_cookie *cookie, void *context)
+{
+	if (cookie->def->put_context)
+		cookie->def->put_context(cookie->netfs_data, context);
+}
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
+
+#ifdef __KDEBUG
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_FSCACHE_DEBUG)
+#define _enter(FMT, ...)			\
+do {						\
+	if (__do_kdebug(ENTER))			\
+		kenter(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#define _leave(FMT, ...)			\
+do {						\
+	if (__do_kdebug(LEAVE))			\
+		kleave(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#define _debug(FMT, ...)			\
+do {						\
+	if (__do_kdebug(DEBUG))			\
+		kdebug(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#else
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
+#endif
+
+/*
+ * determine whether a particular optional debugging point should be logged
+ * - we need to go through three steps to persuade cpp to correctly join the
+ *   shorthand in FSCACHE_DEBUG_LEVEL with its prefix
+ */
+#define ____do_kdebug(LEVEL, POINT) \
+	unlikely((fscache_debug & \
+		  (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
+#define ___do_kdebug(LEVEL, POINT) \
+	____do_kdebug(LEVEL, POINT)
+#define __do_kdebug(POINT) \
+	___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
+
+#define FSCACHE_DEBUG_CACHE	0
+#define FSCACHE_DEBUG_COOKIE	1
+#define FSCACHE_DEBUG_PAGE	2
+#define FSCACHE_DEBUG_OPERATION	3
+
+#define FSCACHE_POINT_ENTER	1
+#define FSCACHE_POINT_LEAVE	2
+#define FSCACHE_POINT_DEBUG	4
+
+#ifndef FSCACHE_DEBUG_LEVEL
+#define FSCACHE_DEBUG_LEVEL CACHE
+#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X)							\
+do {									\
+	if (unlikely(!(X))) {						\
+		pr_err("\n");					\
+		pr_err("Assertion failed\n");	\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTCMP(X, OP, Y)						\
+do {									\
+	if (unlikely(!((X) OP (Y)))) {					\
+		pr_err("\n");					\
+		pr_err("Assertion failed\n");	\
+		pr_err("%lx " #OP " %lx is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTIF(C, X)							\
+do {									\
+	if (unlikely((C) && !(X))) {					\
+		pr_err("\n");					\
+		pr_err("Assertion failed\n");	\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y)					\
+do {									\
+	if (unlikely((C) && !((X) OP (Y)))) {				\
+		pr_err("\n");					\
+		pr_err("Assertion failed\n");	\
+		pr_err("%lx " #OP " %lx is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while (0)
+
+#else
+
+#define ASSERT(X)			do {} while (0)
+#define ASSERTCMP(X, OP, Y)		do {} while (0)
+#define ASSERTIF(C, X)			do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y)	do {} while (0)
+
+#endif /* assert or not */
diff --git a/kernel/fs/fscache/main.c b/kernel/fs/fscache/main.c
new file mode 100644
index 000000000..b39d487cc
--- /dev/null
+++ b/kernel/fs/fscache/main.c
@@ -0,0 +1,206 @@
+/* General filesystem local caching manager
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("FS Cache Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+unsigned fscache_defer_lookup = 1;
+module_param_named(defer_lookup, fscache_defer_lookup, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_defer_lookup,
+		 "Defer cookie lookup to background thread");
+
+unsigned fscache_defer_create = 1;
+module_param_named(defer_create, fscache_defer_create, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_defer_create,
+		 "Defer cookie creation to background thread");
+
+unsigned fscache_debug;
+module_param_named(debug, fscache_debug, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_debug,
+		 "FS-Cache debugging mask");
+
+struct kobject *fscache_root;
+struct workqueue_struct *fscache_object_wq;
+struct workqueue_struct *fscache_op_wq;
+
+DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+/* these values serve as lower bounds, will be adjusted in fscache_init() */
+static unsigned fscache_object_max_active = 4;
+static unsigned fscache_op_max_active = 2;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fscache_sysctl_header;
+
+static int fscache_max_active_sysctl(struct ctl_table *table, int write,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos)
+{
+	struct workqueue_struct **wqp = table->extra1;
+	unsigned int *datap = table->data;
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret == 0)
+		workqueue_set_max_active(*wqp, *datap);
+	return ret;
+}
+
+static struct ctl_table fscache_sysctls[] = {
+	{
+		.procname	= "object_max_active",
+		.data		= &fscache_object_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_object_wq,
+	},
+	{
+		.procname	= "operation_max_active",
+		.data		= &fscache_op_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_op_wq,
+	},
+	{}
+};
+
+static struct ctl_table fscache_sysctls_root[] = {
+	{
+		.procname	= "fscache",
+		.mode		= 0555,
+		.child		= fscache_sysctls,
+	},
+	{}
+};
+#endif
+
+/*
+ * initialise the fs caching module
+ */
+static int __init fscache_init(void)
+{
+	unsigned int nr_cpus = num_possible_cpus();
+	unsigned int cpu;
+	int ret;
+
+	fscache_object_max_active =
+		clamp_val(nr_cpus,
+			  fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
+					    fscache_object_max_active);
+	if (!fscache_object_wq)
+		goto error_object_wq;
+
+	fscache_op_max_active =
+		clamp_val(fscache_object_max_active / 2,
+			  fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
+					fscache_op_max_active);
+	if (!fscache_op_wq)
+		goto error_op_wq;
+
+	for_each_possible_cpu(cpu)
+		init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
+
+	ret = fscache_proc_init();
+	if (ret < 0)
+		goto error_proc;
+
+#ifdef CONFIG_SYSCTL
+	ret = -ENOMEM;
+	fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
+	if (!fscache_sysctl_header)
+		goto error_sysctl;
+#endif
+
+	fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
+					       sizeof(struct fscache_cookie),
+					       0,
+					       0,
+					       fscache_cookie_init_once);
+	if (!fscache_cookie_jar) {
+		pr_notice("Failed to allocate a cookie jar\n");
+		ret = -ENOMEM;
+		goto error_cookie_jar;
+	}
+
+	fscache_root = kobject_create_and_add("fscache", kernel_kobj);
+	if (!fscache_root)
+		goto error_kobj;
+
+	pr_notice("Loaded\n");
+	return 0;
+
+error_kobj:
+	kmem_cache_destroy(fscache_cookie_jar);
+error_cookie_jar:
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+error_sysctl:
+#endif
+	fscache_proc_cleanup();
+error_proc:
+	destroy_workqueue(fscache_op_wq);
+error_op_wq:
+	destroy_workqueue(fscache_object_wq);
+error_object_wq:
+	return ret;
+}
+
+fs_initcall(fscache_init);
+
+/*
+ * clean up on module removal
+ */
+static void __exit fscache_exit(void)
+{
+	_enter("");
+
+	kobject_put(fscache_root);
+	kmem_cache_destroy(fscache_cookie_jar);
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+#endif
+	fscache_proc_cleanup();
+	destroy_workqueue(fscache_op_wq);
+	destroy_workqueue(fscache_object_wq);
+	pr_notice("Unloaded\n");
+}
+
+module_exit(fscache_exit);
+
+/*
+ * wait_on_atomic_t() sleep function for uninterruptible waiting
+ */
+int fscache_wait_atomic_t(atomic_t *p)
+{
+	schedule();
+	return 0;
+}
diff --git a/kernel/fs/fscache/netfs.c b/kernel/fs/fscache/netfs.c
new file mode 100644
index 000000000..6d941f56f
--- /dev/null
+++ b/kernel/fs/fscache/netfs.c
@@ -0,0 +1,104 @@
+/* FS-Cache netfs (client) registration
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static LIST_HEAD(fscache_netfs_list);
+
+/*
+ * register a network filesystem for caching
+ */
+int __fscache_register_netfs(struct fscache_netfs *netfs)
+{
+	struct fscache_netfs *ptr;
+	int ret;
+
+	_enter("{%s}", netfs->name);
+
+	INIT_LIST_HEAD(&netfs->link);
+
+	/* allocate a cookie for the primary index */
+	netfs->primary_index =
+		kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+
+	if (!netfs->primary_index) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	/* initialise the primary index cookie */
+	atomic_set(&netfs->primary_index->usage, 1);
+	atomic_set(&netfs->primary_index->n_children, 0);
+	atomic_set(&netfs->primary_index->n_active, 1);
+
+	netfs->primary_index->def		= &fscache_fsdef_netfs_def;
+	netfs->primary_index->parent		= &fscache_fsdef_index;
+	netfs->primary_index->netfs_data	= netfs;
+	netfs->primary_index->flags		= 1 << FSCACHE_COOKIE_ENABLED;
+
+	atomic_inc(&netfs->primary_index->parent->usage);
+	atomic_inc(&netfs->primary_index->parent->n_children);
+
+	spin_lock_init(&netfs->primary_index->lock);
+	INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+
+	/* check the netfs type is not already present */
+	down_write(&fscache_addremove_sem);
+
+	ret = -EEXIST;
+	list_for_each_entry(ptr, &fscache_netfs_list, link) {
+		if (strcmp(ptr->name, netfs->name) == 0)
+			goto already_registered;
+	}
+
+	list_add(&netfs->link, &fscache_netfs_list);
+	ret = 0;
+
+	pr_notice("Netfs '%s' registered for caching\n", netfs->name);
+
+already_registered:
+	up_write(&fscache_addremove_sem);
+
+	if (ret < 0) {
+		netfs->primary_index->parent = NULL;
+		__fscache_cookie_put(netfs->primary_index);
+		netfs->primary_index = NULL;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(__fscache_register_netfs);
+
+/*
+ * unregister a network filesystem from the cache
+ * - all cookies must have been released first
+ */
+void __fscache_unregister_netfs(struct fscache_netfs *netfs)
+{
+	_enter("{%s.%u}", netfs->name, netfs->version);
+
+	down_write(&fscache_addremove_sem);
+
+	list_del(&netfs->link);
+	fscache_relinquish_cookie(netfs->primary_index, 0);
+
+	up_write(&fscache_addremove_sem);
+
+	pr_notice("Netfs '%s' unregistered from caching\n",
+		  netfs->name);
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_unregister_netfs);
diff --git a/kernel/fs/fscache/object-list.c b/kernel/fs/fscache/object-list.c
new file mode 100644
index 000000000..51dde817e
--- /dev/null
+++ b/kernel/fs/fscache/object-list.c
@@ -0,0 +1,412 @@
+/* Global fscache object list maintainer and viewer
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/key.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+static struct rb_root fscache_object_list;
+static DEFINE_RWLOCK(fscache_object_list_lock);
+
+struct fscache_objlist_data {
+	unsigned long	config;		/* display configuration */
+#define FSCACHE_OBJLIST_CONFIG_KEY	0x00000001	/* show object keys */
+#define FSCACHE_OBJLIST_CONFIG_AUX	0x00000002	/* show object auxdata */
+#define FSCACHE_OBJLIST_CONFIG_COOKIE	0x00000004	/* show objects with cookies */
+#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE	0x00000008	/* show objects without cookies */
+#define FSCACHE_OBJLIST_CONFIG_BUSY	0x00000010	/* show busy objects */
+#define FSCACHE_OBJLIST_CONFIG_IDLE	0x00000020	/* show idle objects */
+#define FSCACHE_OBJLIST_CONFIG_PENDWR	0x00000040	/* show objects with pending writes */
+#define FSCACHE_OBJLIST_CONFIG_NOPENDWR	0x00000080	/* show objects without pending writes */
+#define FSCACHE_OBJLIST_CONFIG_READS	0x00000100	/* show objects with active reads */
+#define FSCACHE_OBJLIST_CONFIG_NOREADS	0x00000200	/* show objects without active reads */
+#define FSCACHE_OBJLIST_CONFIG_EVENTS	0x00000400	/* show objects with events */
+#define FSCACHE_OBJLIST_CONFIG_NOEVENTS	0x00000800	/* show objects without no events */
+#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without work */
+
+	u8		buf[512];	/* key and aux data buffer */
+};
+
+/*
+ * Add an object to the object list
+ * - we use the address of the fscache_object structure as the key into the
+ *   tree
+ */
+void fscache_objlist_add(struct fscache_object *obj)
+{
+	struct fscache_object *xobj;
+	struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
+
+	ASSERT(RB_EMPTY_NODE(&obj->objlist_link));
+
+	write_lock(&fscache_object_list_lock);
+
+	while (*p) {
+		parent = *p;
+		xobj = rb_entry(parent, struct fscache_object, objlist_link);
+
+		if (obj < xobj)
+			p = &(*p)->rb_left;
+		else if (obj > xobj)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&obj->objlist_link, parent, p);
+	rb_insert_color(&obj->objlist_link, &fscache_object_list);
+
+	write_unlock(&fscache_object_list_lock);
+}
+
+/*
+ * Remove an object from the object list.
+ */
+void fscache_objlist_remove(struct fscache_object *obj)
+{
+	if (RB_EMPTY_NODE(&obj->objlist_link))
+		return;
+
+	write_lock(&fscache_object_list_lock);
+
+	BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
+	rb_erase(&obj->objlist_link, &fscache_object_list);
+
+	write_unlock(&fscache_object_list_lock);
+}
+
+/*
+ * find the object in the tree on or after the specified index
+ */
+static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
+{
+	struct fscache_object *pobj, *obj = NULL, *minobj = NULL;
+	struct rb_node *p;
+	unsigned long pos;
+
+	if (*_pos >= (unsigned long) ERR_PTR(-ENOENT))
+		return NULL;
+	pos = *_pos;
+
+	/* banners (can't represent line 0 by pos 0 as that would involve
+	 * returning a NULL pointer) */
+	if (pos == 0)
+		return (struct fscache_object *)(long)++(*_pos);
+	if (pos < 3)
+		return (struct fscache_object *)pos;
+
+	pobj = (struct fscache_object *)pos;
+	p = fscache_object_list.rb_node;
+	while (p) {
+		obj = rb_entry(p, struct fscache_object, objlist_link);
+		if (pobj < obj) {
+			if (!minobj || minobj > obj)
+				minobj = obj;
+			p = p->rb_left;
+		} else if (pobj > obj) {
+			p = p->rb_right;
+		} else {
+			minobj = obj;
+			break;
+		}
+		obj = NULL;
+	}
+
+	if (!minobj)
+		*_pos = (unsigned long) ERR_PTR(-ENOENT);
+	else if (minobj != obj)
+		*_pos = (unsigned long) minobj;
+	return minobj;
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos)
+	__acquires(&fscache_object_list_lock)
+{
+	read_lock(&fscache_object_list_lock);
+	return fscache_objlist_lookup(_pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	(*_pos)++;
+	return fscache_objlist_lookup(_pos);
+}
+
+/*
+ * clean up after reading
+ */
+static void fscache_objlist_stop(struct seq_file *m, void *v)
+	__releases(&fscache_object_list_lock)
+{
+	read_unlock(&fscache_object_list_lock);
+}
+
+/*
+ * display an object
+ */
+static int fscache_objlist_show(struct seq_file *m, void *v)
+{
+	struct fscache_objlist_data *data = m->private;
+	struct fscache_object *obj = v;
+	struct fscache_cookie *cookie;
+	unsigned long config = data->config;
+	char _type[3], *type;
+	u8 *buf = data->buf, *p;
+
+	if ((unsigned long) v == 1) {
+		seq_puts(m, "OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS"
+			 " EM EV FL S"
+			 " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
+		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			      FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, "       ");
+		if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+			seq_puts(m, "OBJECT_KEY");
+		if ((config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			       FSCACHE_OBJLIST_CONFIG_AUX)) ==
+		    (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, ", ");
+		if (config & FSCACHE_OBJLIST_CONFIG_AUX)
+			seq_puts(m, "AUX_DATA");
+		seq_puts(m, "\n");
+		return 0;
+	}
+
+	if ((unsigned long) v == 2) {
+		seq_puts(m, "======== ======== ==== ===== === === === == ====="
+			 " == == == ="
+			 " | ================ == == ================");
+		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			      FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, " ================");
+		seq_puts(m, "\n");
+		return 0;
+	}
+
+	/* filter out any unwanted objects */
+#define FILTER(criterion, _yes, _no)					\
+	do {								\
+		unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes;	\
+		unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no;	\
+		if (criterion) {					\
+			if (!(config & yes))				\
+				return 0;				\
+		} else {						\
+			if (!(config & no))				\
+				return 0;				\
+		}							\
+	} while(0)
+
+	cookie = obj->cookie;
+	if (~config) {
+		FILTER(cookie->def,
+		       COOKIE, NOCOOKIE);
+		FILTER(fscache_object_is_active(obj) ||
+		       obj->n_ops != 0 ||
+		       obj->n_obj_ops != 0 ||
+		       obj->flags ||
+		       !list_empty(&obj->dependents),
+		       BUSY, IDLE);
+		FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags),
+		       PENDWR, NOPENDWR);
+		FILTER(atomic_read(&obj->n_reads),
+		       READS, NOREADS);
+		FILTER(obj->events & obj->event_mask,
+		       EVENTS, NOEVENTS);
+		FILTER(work_busy(&obj->work), WORK, NOWORK);
+	}
+
+	seq_printf(m,
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
+		   obj->debug_id,
+		   obj->parent ? obj->parent->debug_id : -1,
+		   obj->state->short_name,
+		   obj->n_children,
+		   obj->n_ops,
+		   obj->n_obj_ops,
+		   obj->n_in_progress,
+		   obj->n_exclusive,
+		   atomic_read(&obj->n_reads),
+		   obj->event_mask,
+		   obj->events,
+		   obj->flags,
+		   work_busy(&obj->work));
+
+	if (fscache_use_cookie(obj)) {
+		uint16_t keylen = 0, auxlen = 0;
+
+		switch (cookie->def->type) {
+		case 0:
+			type = "IX";
+			break;
+		case 1:
+			type = "DT";
+			break;
+		default:
+			sprintf(_type, "%02u", cookie->def->type);
+			type = _type;
+			break;
+		}
+
+		seq_printf(m, "%-16s %s %2lx %16p",
+			   cookie->def->name,
+			   type,
+			   cookie->flags,
+			   cookie->netfs_data);
+
+		if (cookie->def->get_key &&
+		    config & FSCACHE_OBJLIST_CONFIG_KEY)
+			keylen = cookie->def->get_key(cookie->netfs_data,
+						      buf, 400);
+
+		if (cookie->def->get_aux &&
+		    config & FSCACHE_OBJLIST_CONFIG_AUX)
+			auxlen = cookie->def->get_aux(cookie->netfs_data,
+						      buf + keylen, 512 - keylen);
+		fscache_unuse_cookie(obj);
+
+		if (keylen > 0 || auxlen > 0) {
+			seq_puts(m, " ");
+			for (p = buf; keylen > 0; keylen--)
+				seq_printf(m, "%02x", *p++);
+			if (auxlen > 0) {
+				if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+					seq_puts(m, ", ");
+				for (; auxlen > 0; auxlen--)
+					seq_printf(m, "%02x", *p++);
+			}
+		}
+
+		seq_puts(m, "\n");
+	} else {
+		seq_puts(m, "<no_netfs>\n");
+	}
+	return 0;
+}
+
+static const struct seq_operations fscache_objlist_ops = {
+	.start		= fscache_objlist_start,
+	.stop		= fscache_objlist_stop,
+	.next		= fscache_objlist_next,
+	.show		= fscache_objlist_show,
+};
+
+/*
+ * get the configuration for filtering the list
+ */
+static void fscache_objlist_config(struct fscache_objlist_data *data)
+{
+#ifdef CONFIG_KEYS
+	struct user_key_payload *confkey;
+	unsigned long config;
+	struct key *key;
+	const char *buf;
+	int len;
+
+	key = request_key(&key_type_user, "fscache:objlist", NULL);
+	if (IS_ERR(key))
+		goto no_config;
+
+	config = 0;
+	rcu_read_lock();
+
+	confkey = key->payload.data;
+	buf = confkey->data;
+
+	for (len = confkey->datalen - 1; len >= 0; len--) {
+		switch (buf[len]) {
+		case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY;		break;
+		case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX;		break;
+		case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE;	break;
+		case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE;	break;
+		case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY;	break;
+		case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE;	break;
+		case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR;	break;
+		case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR;	break;
+		case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS;	break;
+		case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS;	break;
+		case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK;	break;
+		case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK;	break;
+		}
+	}
+
+	rcu_read_unlock();
+	key_put(key);
+
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK;
+
+	data->config = config;
+	return;
+
+no_config:
+#endif
+	data->config = ULONG_MAX;
+}
+
+/*
+ * open "/proc/fs/fscache/objects" to provide a list of active objects
+ * - can be configured by a user-defined key added to the caller's keyrings
+ */
+static int fscache_objlist_open(struct inode *inode, struct file *file)
+{
+	struct fscache_objlist_data *data;
+
+	data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data));
+	if (!data)
+		return -ENOMEM;
+
+	/* get the configuration key */
+	fscache_objlist_config(data);
+
+	return 0;
+}
+
+/*
+ * clean up on close
+ */
+static int fscache_objlist_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	kfree(m->private);
+	m->private = NULL;
+	return seq_release(inode, file);
+}
+
+const struct file_operations fscache_objlist_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fscache_objlist_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= fscache_objlist_release,
+};
diff --git a/kernel/fs/fscache/object.c b/kernel/fs/fscache/object.c
new file mode 100644
index 000000000..da032daf0
--- /dev/null
+++ b/kernel/fs/fscache/object.c
@@ -0,0 +1,1018 @@
+/* FS-Cache object state machine handler
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/object.txt for a description of the
+ * object state machine and the in-kernel representations.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+#include "internal.h"
+
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state.  Work states are execution states.  No event processing
+ * is performed by them.  The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition.  Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+	const struct fscache_state __STATE_NAME(n) = {			\
+		.name = #n,						\
+		.short_name = sn,					\
+		.work = f						\
+	}
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state.  Wait states are event processing states.  No execution
+ * is performed by them.  Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y".  The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+	const struct fscache_state __STATE_NAME(n) = {			\
+		.name = #n,						\
+		.short_name = sn,					\
+		.work = NULL,						\
+		.transitions = { __VA_ARGS__, { 0, NULL } }		\
+	}
+
+#define TRANSIT_TO(state, emask) \
+	{ .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT,		"INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY,		"PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT,		"ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT,	"LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT,	"CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE,	"AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS,	"JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT,	"INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT,	"UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE,	"LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT,		"KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS,	"KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT,		"DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD,		"DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT,	"?INI",
+		  TRANSIT_TO(INIT_OBJECT,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT,	"?PRN",
+		  TRANSIT_TO(PARENT_READY,	1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD,		"?CMD",
+		  TRANSIT_TO(INVALIDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_INVALIDATE),
+		  TRANSIT_TO(UPDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_UPDATE),
+		  TRANSIT_TO(JUMPSTART_DEPS,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE,	"?CLR",
+		  TRANSIT_TO(KILL_OBJECT,	1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables.  These are for handling unexpected
+ * events, such as an I/O error.  If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (acurrently executing work states will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+	   TRANSIT_TO(ABORT_INIT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+	   TRANSIT_TO(LOOKUP_FAILURE,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+	   TRANSIT_TO(KILL_OBJECT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static int  fscache_get_object(struct fscache_object *);
+static void fscache_put_object(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
+static void fscache_dequeue_object(struct fscache_object *);
+
+/*
+ * we need to notify the parent when an op completes that we had outstanding
+ * upon it
+ */
+static inline void fscache_done_parent_op(struct fscache_object *object)
+{
+	struct fscache_object *parent = object->parent;
+
+	_enter("OBJ%x {OBJ%x,%x}",
+	       object->debug_id, parent->debug_id, parent->n_ops);
+
+	spin_lock_nested(&parent->lock, 1);
+	parent->n_obj_ops--;
+	parent->n_ops--;
+	if (parent->n_ops == 0)
+		fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+	spin_unlock(&parent->lock);
+}
+
+/*
+ * Object state machine dispatcher.
+ */
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
+{
+	const struct fscache_transition *t;
+	const struct fscache_state *state, *new_state;
+	unsigned long events, event_mask;
+	int event = -1;
+
+	ASSERT(object != NULL);
+
+	_enter("{OBJ%x,%s,%lx}",
+	       object->debug_id, object->state->name, object->events);
+
+	event_mask = object->event_mask;
+restart:
+	object->event_mask = 0; /* Mask normal event handling */
+	state = object->state;
+restart_masked:
+	events = object->events;
+
+	/* Handle any out-of-band events (typically an error) */
+	if (events & object->oob_event_mask) {
+		_debug("{OBJ%x} oob %lx",
+		       object->debug_id, events & object->oob_event_mask);
+		for (t = object->oob_table; t->events; t++) {
+			if (events & t->events) {
+				state = t->transit_to;
+				ASSERT(state->work != NULL);
+				event = fls(events & t->events) - 1;
+				__clear_bit(event, &object->oob_event_mask);
+				clear_bit(event, &object->events);
+				goto execute_work_state;
+			}
+		}
+	}
+
+	/* Wait states are just transition tables */
+	if (!state->work) {
+		if (events & event_mask) {
+			for (t = state->transitions; t->events; t++) {
+				if (events & t->events) {
+					new_state = t->transit_to;
+					event = fls(events & t->events) - 1;
+					clear_bit(event, &object->events);
+					_debug("{OBJ%x} ev %d: %s -> %s",
+					       object->debug_id, event,
+					       state->name, new_state->name);
+					object->state = state = new_state;
+					goto execute_work_state;
+				}
+			}
+
+			/* The event mask didn't include all the tabled bits */
+			BUG();
+		}
+		/* Randomly woke up */
+		goto unmask_events;
+	}
+
+execute_work_state:
+	_debug("{OBJ%x} exec %s", object->debug_id, state->name);
+
+	new_state = state->work(object, event);
+	event = -1;
+	if (new_state == NO_TRANSIT) {
+		_debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+		fscache_enqueue_object(object);
+		event_mask = object->oob_event_mask;
+		goto unmask_events;
+	}
+
+	_debug("{OBJ%x} %s -> %s",
+	       object->debug_id, state->name, new_state->name);
+	object->state = state = new_state;
+
+	if (state->work) {
+		if (unlikely(state->work == ((void *)2UL))) {
+			_leave(" [dead]");
+			return;
+		}
+		goto restart_masked;
+	}
+
+	/* Transited to wait state */
+	event_mask = object->oob_event_mask;
+	for (t = state->transitions; t->events; t++)
+		event_mask |= t->events;
+
+unmask_events:
+	object->event_mask = event_mask;
+	smp_mb();
+	events = object->events;
+	if (events & event_mask)
+		goto restart;
+	_leave(" [msk %lx]", event_mask);
+}
+
+/*
+ * execute an object
+ */
+static void fscache_object_work_func(struct work_struct *work)
+{
+	struct fscache_object *object =
+		container_of(work, struct fscache_object, work);
+	unsigned long start;
+
+	_enter("{OBJ%x}", object->debug_id);
+
+	start = jiffies;
+	fscache_object_sm_dispatcher(object);
+	fscache_hist(fscache_objs_histogram, start);
+	fscache_put_object(object);
+}
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+			 struct fscache_cookie *cookie,
+			 struct fscache_cache *cache)
+{
+	const struct fscache_transition *t;
+
+	atomic_inc(&cache->object_count);
+
+	object->state = STATE(WAIT_FOR_INIT);
+	object->oob_table = fscache_osm_init_oob;
+	object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+	spin_lock_init(&object->lock);
+	INIT_LIST_HEAD(&object->cache_link);
+	INIT_HLIST_NODE(&object->cookie_link);
+	INIT_WORK(&object->work, fscache_object_work_func);
+	INIT_LIST_HEAD(&object->dependents);
+	INIT_LIST_HEAD(&object->dep_link);
+	INIT_LIST_HEAD(&object->pending_ops);
+	object->n_children = 0;
+	object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+	object->events = 0;
+	object->store_limit = 0;
+	object->store_limit_l = 0;
+	object->cache = cache;
+	object->cookie = cookie;
+	object->parent = NULL;
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+	RB_CLEAR_NODE(&object->objlist_link);
+#endif
+
+	object->oob_event_mask = 0;
+	for (t = object->oob_table; t->events; t++)
+		object->oob_event_mask |= t->events;
+	object->event_mask = object->oob_event_mask;
+	for (t = object->state->transitions; t->events; t++)
+		object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+								int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_event_mask = 0;
+	fscache_dequeue_object(object);
+	return transit_to(KILL_OBJECT);
+}
+
+/*
+ * initialise an object
+ * - check the specified object's parent to see if we can make use of it
+ *   immediately to do a creation
+ * - we may need to start the process of creating a parent and we need to wait
+ *   for the parent's lookup and creation to complete if it's not there yet
+ */
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+							     int event)
+{
+	struct fscache_object *parent;
+	bool success;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	ASSERT(list_empty(&object->dep_link));
+
+	parent = object->parent;
+	if (!parent) {
+		_leave(" [no parent]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	_debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+	if (fscache_object_is_dying(parent)) {
+		_leave(" [bad parent]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	if (fscache_object_is_available(parent)) {
+		_leave(" [ready]");
+		return transit_to(PARENT_READY);
+	}
+
+	_debug("wait");
+
+	spin_lock(&parent->lock);
+	fscache_stat(&fscache_n_cop_grab_object);
+	success = false;
+	if (fscache_object_is_live(parent) &&
+	    object->cache->ops->grab_object(object)) {
+		list_add(&object->dep_link, &parent->dependents);
+		success = true;
+	}
+	fscache_stat_d(&fscache_n_cop_grab_object);
+	spin_unlock(&parent->lock);
+	if (!success) {
+		_leave(" [grab failed]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	/* fscache_acquire_non_index_cookie() uses this
+	 * to wake the chain up */
+	fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+	_leave(" [wait]");
+	return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+							int event)
+{
+	struct fscache_object *parent = object->parent;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	ASSERT(parent != NULL);
+
+	spin_lock(&parent->lock);
+	parent->n_ops++;
+	parent->n_obj_ops++;
+	object->lookup_jif = jiffies;
+	spin_unlock(&parent->lock);
+
+	_leave("");
+	return transit_to(LOOK_UP_OBJECT);
+}
+
+/*
+ * look an object up in the cache from which it was allocated
+ * - we hold an "access lock" on the parent object, so the parent object cannot
+ *   be withdrawn by either party till we've finished
+ */
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+							  int event)
+{
+	struct fscache_cookie *cookie = object->cookie;
+	struct fscache_object *parent = object->parent;
+	int ret;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_table = fscache_osm_lookup_oob;
+
+	ASSERT(parent != NULL);
+	ASSERTCMP(parent->n_ops, >, 0);
+	ASSERTCMP(parent->n_obj_ops, >, 0);
+
+	/* make sure the parent is still available */
+	ASSERT(fscache_object_is_available(parent));
+
+	if (fscache_object_is_dying(parent) ||
+	    test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+	    !fscache_use_cookie(object)) {
+		_leave(" [unavailable]");
+		return transit_to(LOOKUP_FAILURE);
+	}
+
+	_debug("LOOKUP \"%s\" in \"%s\"",
+	       cookie->def->name, object->cache->tag->name);
+
+	fscache_stat(&fscache_n_object_lookups);
+	fscache_stat(&fscache_n_cop_lookup_object);
+	ret = object->cache->ops->lookup_object(object);
+	fscache_stat_d(&fscache_n_cop_lookup_object);
+
+	fscache_unuse_cookie(object);
+
+	if (ret == -ETIMEDOUT) {
+		/* probably stuck behind another object, so move this one to
+		 * the back of the queue */
+		fscache_stat(&fscache_n_object_lookups_timed_out);
+		_leave(" [timeout]");
+		return NO_TRANSIT;
+	}
+
+	if (ret < 0) {
+		_leave(" [error]");
+		return transit_to(LOOKUP_FAILURE);
+	}
+
+	_leave(" [ok]");
+	return transit_to(OBJECT_AVAILABLE);
+}
+
+/**
+ * fscache_object_lookup_negative - Note negative cookie lookup
+ * @object: Object pointing to cookie to mark
+ *
+ * Note negative lookup, permitting those waiting to read data from an already
+ * existing backing object to continue as there's no data for them to read.
+ */
+void fscache_object_lookup_negative(struct fscache_object *object)
+{
+	struct fscache_cookie *cookie = object->cookie;
+
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
+
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+		fscache_stat(&fscache_n_object_lookups_negative);
+
+		/* Allow write requests to begin stacking up and read requests to begin
+		 * returning ENODATA.
+		 */
+		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+
+		_debug("wake up lookup %p", &cookie->flags);
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+	}
+	_leave("");
+}
+EXPORT_SYMBOL(fscache_object_lookup_negative);
+
+/**
+ * fscache_obtained_object - Note successful object lookup or creation
+ * @object: Object pointing to cookie to mark
+ *
+ * Note successful lookup and/or creation, permitting those waiting to write
+ * data to a backing object to continue.
+ *
+ * Note that after calling this, an object's cookie may be relinquished by the
+ * netfs, and so must be accessed with object lock held.
+ */
+void fscache_obtained_object(struct fscache_object *object)
+{
+	struct fscache_cookie *cookie = object->cookie;
+
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
+
+	/* if we were still looking up, then we must have a positive lookup
+	 * result, in which case there may be data available */
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+		fscache_stat(&fscache_n_object_lookups_positive);
+
+		/* We do (presumably) have data */
+		clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+
+		/* Allow write requests to begin stacking up and read requests
+		 * to begin shovelling data.
+		 */
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+	} else {
+		fscache_stat(&fscache_n_object_created);
+	}
+
+	set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
+	_leave("");
+}
+EXPORT_SYMBOL(fscache_obtained_object);
+
+/*
+ * handle an object that has just become available
+ */
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+							    int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_table = fscache_osm_run_oob;
+
+	spin_lock(&object->lock);
+
+	fscache_done_parent_op(object);
+	if (object->n_in_progress == 0) {
+		if (object->n_ops > 0) {
+			ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
+			fscache_start_operations(object);
+		} else {
+			ASSERT(list_empty(&object->pending_ops));
+		}
+	}
+	spin_unlock(&object->lock);
+
+	fscache_stat(&fscache_n_cop_lookup_complete);
+	object->cache->ops->lookup_complete(object);
+	fscache_stat_d(&fscache_n_cop_lookup_complete);
+
+	fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
+	fscache_stat(&fscache_n_object_avail);
+
+	_leave("");
+	return transit_to(JUMPSTART_DEPS);
+}
+
+/*
+ * Wake up this object's dependent objects now that we've become available.
+ */
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+								int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+		return NO_TRANSIT; /* Not finished; requeue */
+	return transit_to(WAIT_FOR_CMD);
+}
+
+/*
+ * Handle lookup or creation failute.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+							  int event)
+{
+	struct fscache_cookie *cookie;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_event_mask = 0;
+
+	fscache_stat(&fscache_n_cop_lookup_complete);
+	object->cache->ops->lookup_complete(object);
+	fscache_stat_d(&fscache_n_cop_lookup_complete);
+
+	cookie = object->cookie;
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+	fscache_done_parent_op(object);
+	return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+						       int event)
+{
+	_enter("{OBJ%x,%d,%d},%d",
+	       object->debug_id, object->n_ops, object->n_children, event);
+
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	object->oob_event_mask = 0;
+
+	if (list_empty(&object->dependents) &&
+	    object->n_ops == 0 &&
+	    object->n_children == 0)
+		return transit_to(DROP_OBJECT);
+
+	if (object->n_in_progress == 0) {
+		spin_lock(&object->lock);
+		if (object->n_ops > 0 && object->n_in_progress == 0)
+			fscache_start_operations(object);
+		spin_unlock(&object->lock);
+	}
+
+	if (!list_empty(&object->dependents))
+		return transit_to(KILL_DEPENDENTS);
+
+	return transit_to(WAIT_FOR_CLEARANCE);
+}
+
+/*
+ * Kill dependent objects.
+ */
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+							   int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+		return NO_TRANSIT; /* Not finished */
+	return transit_to(WAIT_FOR_CLEARANCE);
+}
+
+/*
+ * Drop an object's attachments
+ */
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+						       int event)
+{
+	struct fscache_object *parent = object->parent;
+	struct fscache_cookie *cookie = object->cookie;
+	struct fscache_cache *cache = object->cache;
+	bool awaken = false;
+
+	_enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
+
+	ASSERT(cookie != NULL);
+	ASSERT(!hlist_unhashed(&object->cookie_link));
+
+	/* Make sure the cookie no longer points here and that the netfs isn't
+	 * waiting for us.
+	 */
+	spin_lock(&cookie->lock);
+	hlist_del_init(&object->cookie_link);
+	if (hlist_empty(&cookie->backing_objects) &&
+	    test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		awaken = true;
+	spin_unlock(&cookie->lock);
+
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+
+	/* Prevent a race with our last child, which has to signal EV_CLEARED
+	 * before dropping our spinlock.
+	 */
+	spin_lock(&object->lock);
+	spin_unlock(&object->lock);
+
+	/* Discard from the cache's collection of objects */
+	spin_lock(&cache->object_list_lock);
+	list_del_init(&object->cache_link);
+	spin_unlock(&cache->object_list_lock);
+
+	fscache_stat(&fscache_n_cop_drop_object);
+	cache->ops->drop_object(object);
+	fscache_stat_d(&fscache_n_cop_drop_object);
+
+	/* The parent object wants to know when all it dependents have gone */
+	if (parent) {
+		_debug("release parent OBJ%x {%d}",
+		       parent->debug_id, parent->n_children);
+
+		spin_lock(&parent->lock);
+		parent->n_children--;
+		if (parent->n_children == 0)
+			fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+		spin_unlock(&parent->lock);
+		object->parent = NULL;
+	}
+
+	/* this just shifts the object release to the work processor */
+	fscache_put_object(object);
+	fscache_stat(&fscache_n_object_dead);
+
+	_leave("");
+	return transit_to(OBJECT_DEAD);
+}
+
+/*
+ * get a ref on an object
+ */
+static int fscache_get_object(struct fscache_object *object)
+{
+	int ret;
+
+	fscache_stat(&fscache_n_cop_grab_object);
+	ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+	fscache_stat_d(&fscache_n_cop_grab_object);
+	return ret;
+}
+
+/*
+ * Discard a ref on an object
+ */
+static void fscache_put_object(struct fscache_object *object)
+{
+	fscache_stat(&fscache_n_cop_put_object);
+	object->cache->ops->put_object(object);
+	fscache_stat_d(&fscache_n_cop_put_object);
+}
+
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+	fscache_objlist_remove(object);
+
+	/* We can get rid of the cookie now */
+	fscache_cookie_put(object->cookie);
+	object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
+/*
+ * enqueue an object for metadata-type processing
+ */
+void fscache_enqueue_object(struct fscache_object *object)
+{
+	_enter("{OBJ%x}", object->debug_id);
+
+	if (fscache_get_object(object) >= 0) {
+		wait_queue_head_t *cong_wq =
+			&get_cpu_var(fscache_object_cong_wait);
+
+		if (queue_work(fscache_object_wq, &object->work)) {
+			if (fscache_object_congested())
+				wake_up(cong_wq);
+		} else
+			fscache_put_object(object);
+
+		put_cpu_var(fscache_object_cong_wait);
+	}
+}
+
+/**
+ * fscache_object_sleep_till_congested - Sleep until object wq is congested
+ * @timeoutp: Scheduler sleep timeout
+ *
+ * Allow an object handler to sleep until the object workqueue is congested.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * %true is returned if the object wq is congested, %false otherwise.
+ */
+bool fscache_object_sleep_till_congested(signed long *timeoutp)
+{
+	wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait);
+	DEFINE_WAIT(wait);
+
+	if (fscache_object_congested())
+		return true;
+
+	add_wait_queue_exclusive(cong_wq, &wait);
+	if (!fscache_object_congested())
+		*timeoutp = schedule_timeout(*timeoutp);
+	finish_wait(cong_wq, &wait);
+
+	return fscache_object_congested();
+}
+EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
+
+/*
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately.  We return true if the list was
+ * cleared.
+ */
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
+{
+	struct fscache_object *dep;
+	bool ret = true;
+
+	_enter("{OBJ%x}", object->debug_id);
+
+	if (list_empty(&object->dependents))
+		return true;
+
+	spin_lock(&object->lock);
+
+	while (!list_empty(&object->dependents)) {
+		dep = list_entry(object->dependents.next,
+				 struct fscache_object, dep_link);
+		list_del_init(&dep->dep_link);
+
+		fscache_raise_event(dep, event);
+		fscache_put_object(dep);
+
+		if (!list_empty(&object->dependents) && need_resched()) {
+			ret = false;
+			break;
+		}
+	}
+
+	spin_unlock(&object->lock);
+	return ret;
+}
+
+/*
+ * remove an object from whatever queue it's waiting on
+ */
+static void fscache_dequeue_object(struct fscache_object *object)
+{
+	_enter("{OBJ%x}", object->debug_id);
+
+	if (!list_empty(&object->dep_link)) {
+		spin_lock(&object->parent->lock);
+		list_del_init(&object->dep_link);
+		spin_unlock(&object->parent->lock);
+	}
+
+	_leave("");
+}
+
+/**
+ * fscache_check_aux - Ask the netfs whether an object on disk is still valid
+ * @object: The object to ask about
+ * @data: The auxiliary data for the object
+ * @datalen: The size of the auxiliary data
+ *
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
+ */
+enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
+					const void *data, uint16_t datalen)
+{
+	enum fscache_checkaux result;
+
+	if (!object->cookie->def->check_aux) {
+		fscache_stat(&fscache_n_checkaux_none);
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	result = object->cookie->def->check_aux(object->cookie->netfs_data,
+						data, datalen);
+	switch (result) {
+		/* entry okay as is */
+	case FSCACHE_CHECKAUX_OKAY:
+		fscache_stat(&fscache_n_checkaux_okay);
+		break;
+
+		/* entry requires update */
+	case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+		fscache_stat(&fscache_n_checkaux_update);
+		break;
+
+		/* entry requires deletion */
+	case FSCACHE_CHECKAUX_OBSOLETE:
+		fscache_stat(&fscache_n_checkaux_obsolete);
+		break;
+
+	default:
+		BUG();
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(fscache_check_aux);
+
+/*
+ * Asynchronously invalidate an object.
+ */
+static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
+							      int event)
+{
+	struct fscache_operation *op;
+	struct fscache_cookie *cookie = object->cookie;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	/* We're going to need the cookie.  If the cookie is not available then
+	 * retire the object instead.
+	 */
+	if (!fscache_use_cookie(object)) {
+		ASSERT(object->cookie->stores.rnode == NULL);
+		set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
+		_leave(" [no cookie]");
+		return transit_to(KILL_OBJECT);
+	}
+
+	/* Reject any new read/write ops and abort any that are pending. */
+	fscache_invalidate_writes(cookie);
+	clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+	fscache_cancel_all_ops(object);
+
+	/* Now we have to wait for in-progress reads and writes */
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op)
+		goto nomem;
+
+	fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
+	op->flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_EXCLUSIVE) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
+
+	spin_lock(&cookie->lock);
+	if (fscache_submit_exclusive_op(object, op) < 0)
+		goto submit_op_failed;
+	spin_unlock(&cookie->lock);
+	fscache_put_operation(op);
+
+	/* Once we've completed the invalidation, we know there will be no data
+	 * stored in the cache and thus we can reinstate the data-check-skip
+	 * optimisation.
+	 */
+	set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+	/* We can allow read and write requests to come in once again.  They'll
+	 * queue up behind our exclusive invalidation operation.
+	 */
+	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+	_leave(" [ok]");
+	return transit_to(UPDATE_OBJECT);
+
+nomem:
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	fscache_unuse_cookie(object);
+	_leave(" [ENOMEM]");
+	return transit_to(KILL_OBJECT);
+
+submit_op_failed:
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	spin_unlock(&cookie->lock);
+	fscache_unuse_cookie(object);
+	kfree(op);
+	_leave(" [EIO]");
+	return transit_to(KILL_OBJECT);
+}
+
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
+							     int event)
+{
+	const struct fscache_state *s;
+
+	fscache_stat(&fscache_n_invalidates_run);
+	fscache_stat(&fscache_n_cop_invalidate_object);
+	s = _fscache_invalidate_object(object, event);
+	fscache_stat_d(&fscache_n_cop_invalidate_object);
+	return s;
+}
+
+/*
+ * Asynchronously update an object.
+ */
+static const struct fscache_state *fscache_update_object(struct fscache_object *object,
+							 int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	fscache_stat(&fscache_n_updates_run);
+	fscache_stat(&fscache_n_cop_update_object);
+	object->cache->ops->update_object(object);
+	fscache_stat_d(&fscache_n_cop_update_object);
+
+	_leave("");
+	return transit_to(WAIT_FOR_CMD);
+}
diff --git a/kernel/fs/fscache/operation.c b/kernel/fs/fscache/operation.c
new file mode 100644
index 000000000..e7b87a0e5
--- /dev/null
+++ b/kernel/fs/fscache/operation.c
@@ -0,0 +1,529 @@
+/* FS-Cache worker operation management routines
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/operations.txt
+ */
+
+#define FSCACHE_DEBUG_LEVEL OPERATION
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+atomic_t fscache_op_debug_id;
+EXPORT_SYMBOL(fscache_op_debug_id);
+
+/**
+ * fscache_enqueue_operation - Enqueue an operation for processing
+ * @op: The operation to enqueue
+ *
+ * Enqueue an operation for processing by the FS-Cache thread pool.
+ *
+ * This will get its own ref on the object.
+ */
+void fscache_enqueue_operation(struct fscache_operation *op)
+{
+	_enter("{OBJ%x OP%x,%u}",
+	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+	ASSERT(list_empty(&op->pend_link));
+	ASSERT(op->processor != NULL);
+	ASSERT(fscache_object_is_available(op->object));
+	ASSERTCMP(atomic_read(&op->usage), >, 0);
+	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
+
+	fscache_stat(&fscache_n_op_enqueue);
+	switch (op->flags & FSCACHE_OP_TYPE) {
+	case FSCACHE_OP_ASYNC:
+		_debug("queue async");
+		atomic_inc(&op->usage);
+		if (!queue_work(fscache_op_wq, &op->work))
+			fscache_put_operation(op);
+		break;
+	case FSCACHE_OP_MYTHREAD:
+		_debug("queue for caller's attention");
+		break;
+	default:
+		pr_err("Unexpected op type %lx", op->flags);
+		BUG();
+		break;
+	}
+}
+EXPORT_SYMBOL(fscache_enqueue_operation);
+
+/*
+ * start an op running
+ */
+static void fscache_run_op(struct fscache_object *object,
+			   struct fscache_operation *op)
+{
+	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+
+	op->state = FSCACHE_OP_ST_IN_PROGRESS;
+	object->n_in_progress++;
+	if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+		wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+	if (op->processor)
+		fscache_enqueue_operation(op);
+	fscache_stat(&fscache_n_op_run);
+}
+
+/*
+ * submit an exclusive operation for an object
+ * - other ops are excluded from running simultaneously with this one
+ * - this gets any extra refs it needs on an op
+ */
+int fscache_submit_exclusive_op(struct fscache_object *object,
+				struct fscache_operation *op)
+{
+	int ret;
+
+	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+
+	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
+	ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+	spin_lock(&object->lock);
+	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
+	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+	ASSERT(list_empty(&op->pend_link));
+
+	op->state = FSCACHE_OP_ST_PENDING;
+	if (fscache_object_is_active(object)) {
+		op->object = object;
+		object->n_ops++;
+		object->n_exclusive++;	/* reads and writes must wait */
+
+		if (object->n_in_progress > 0) {
+			atomic_inc(&op->usage);
+			list_add_tail(&op->pend_link, &object->pending_ops);
+			fscache_stat(&fscache_n_op_pend);
+		} else if (!list_empty(&object->pending_ops)) {
+			atomic_inc(&op->usage);
+			list_add_tail(&op->pend_link, &object->pending_ops);
+			fscache_stat(&fscache_n_op_pend);
+			fscache_start_operations(object);
+		} else {
+			ASSERTCMP(object->n_in_progress, ==, 0);
+			fscache_run_op(object, op);
+		}
+
+		/* need to issue a new write op after this */
+		clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+		ret = 0;
+	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+		op->object = object;
+		object->n_ops++;
+		object->n_exclusive++;	/* reads and writes must wait */
+		atomic_inc(&op->usage);
+		list_add_tail(&op->pend_link, &object->pending_ops);
+		fscache_stat(&fscache_n_op_pend);
+		ret = 0;
+	} else {
+		/* If we're in any other state, there must have been an I/O
+		 * error of some nature.
+		 */
+		ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
+		ret = -EIO;
+	}
+
+	spin_unlock(&object->lock);
+	return ret;
+}
+
+/*
+ * report an unexpected submission
+ */
+static void fscache_report_unexpected_submission(struct fscache_object *object,
+						 struct fscache_operation *op,
+						 const struct fscache_state *ostate)
+{
+	static bool once_only;
+	struct fscache_operation *p;
+	unsigned n;
+
+	if (once_only)
+		return;
+	once_only = true;
+
+	kdebug("unexpected submission OP%x [OBJ%x %s]",
+	       op->debug_id, object->debug_id, object->state->name);
+	kdebug("objstate=%s [%s]", object->state->name, ostate->name);
+	kdebug("objflags=%lx", object->flags);
+	kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
+	kdebug("ops=%u inp=%u exc=%u",
+	       object->n_ops, object->n_in_progress, object->n_exclusive);
+
+	if (!list_empty(&object->pending_ops)) {
+		n = 0;
+		list_for_each_entry(p, &object->pending_ops, pend_link) {
+			ASSERTCMP(p->object, ==, object);
+			kdebug("%p %p", op->processor, op->release);
+			n++;
+		}
+
+		kdebug("n=%u", n);
+	}
+
+	dump_stack();
+}
+
+/*
+ * submit an operation for an object
+ * - objects may be submitted only in the following states:
+ *   - during object creation (write ops may be submitted)
+ *   - whilst the object is active
+ *   - after an I/O error incurred in one of the two above states (op rejected)
+ * - this gets any extra refs it needs on an op
+ */
+int fscache_submit_op(struct fscache_object *object,
+		      struct fscache_operation *op)
+{
+	const struct fscache_state *ostate;
+	int ret;
+
+	_enter("{OBJ%x OP%x},{%u}",
+	       object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
+	ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+	spin_lock(&object->lock);
+	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
+	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+	ASSERT(list_empty(&op->pend_link));
+
+	ostate = object->state;
+	smp_rmb();
+
+	op->state = FSCACHE_OP_ST_PENDING;
+	if (fscache_object_is_active(object)) {
+		op->object = object;
+		object->n_ops++;
+
+		if (object->n_exclusive > 0) {
+			atomic_inc(&op->usage);
+			list_add_tail(&op->pend_link, &object->pending_ops);
+			fscache_stat(&fscache_n_op_pend);
+		} else if (!list_empty(&object->pending_ops)) {
+			atomic_inc(&op->usage);
+			list_add_tail(&op->pend_link, &object->pending_ops);
+			fscache_stat(&fscache_n_op_pend);
+			fscache_start_operations(object);
+		} else {
+			ASSERTCMP(object->n_exclusive, ==, 0);
+			fscache_run_op(object, op);
+		}
+		ret = 0;
+	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+		op->object = object;
+		object->n_ops++;
+		atomic_inc(&op->usage);
+		list_add_tail(&op->pend_link, &object->pending_ops);
+		fscache_stat(&fscache_n_op_pend);
+		ret = 0;
+	} else if (fscache_object_is_dying(object)) {
+		fscache_stat(&fscache_n_op_rejected);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+		fscache_report_unexpected_submission(object, op, ostate);
+		ASSERT(!fscache_object_is_active(object));
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	} else {
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	}
+
+	spin_unlock(&object->lock);
+	return ret;
+}
+
+/*
+ * queue an object for withdrawal on error, aborting all following asynchronous
+ * operations
+ */
+void fscache_abort_object(struct fscache_object *object)
+{
+	_enter("{OBJ%x}", object->debug_id);
+
+	fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+}
+
+/*
+ * Jump start the operation processing on an object.  The caller must hold
+ * object->lock.
+ */
+void fscache_start_operations(struct fscache_object *object)
+{
+	struct fscache_operation *op;
+	bool stop = false;
+
+	while (!list_empty(&object->pending_ops) && !stop) {
+		op = list_entry(object->pending_ops.next,
+				struct fscache_operation, pend_link);
+
+		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+			if (object->n_in_progress > 0)
+				break;
+			stop = true;
+		}
+		list_del_init(&op->pend_link);
+		fscache_run_op(object, op);
+
+		/* the pending queue was holding a ref on the object */
+		fscache_put_operation(op);
+	}
+
+	ASSERTCMP(object->n_in_progress, <=, object->n_ops);
+
+	_debug("woke %d ops on OBJ%x",
+	       object->n_in_progress, object->debug_id);
+}
+
+/*
+ * cancel an operation that's pending on an object
+ */
+int fscache_cancel_op(struct fscache_operation *op,
+		      void (*do_cancel)(struct fscache_operation *))
+{
+	struct fscache_object *object = op->object;
+	int ret;
+
+	_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
+
+	ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
+	ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
+	ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+	spin_lock(&object->lock);
+
+	ret = -EBUSY;
+	if (op->state == FSCACHE_OP_ST_PENDING) {
+		ASSERT(!list_empty(&op->pend_link));
+		fscache_stat(&fscache_n_op_cancelled);
+		list_del_init(&op->pend_link);
+		if (do_cancel)
+			do_cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+			object->n_exclusive--;
+		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+			wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+		fscache_put_operation(op);
+		ret = 0;
+	}
+
+	spin_unlock(&object->lock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Cancel all pending operations on an object
+ */
+void fscache_cancel_all_ops(struct fscache_object *object)
+{
+	struct fscache_operation *op;
+
+	_enter("OBJ%x", object->debug_id);
+
+	spin_lock(&object->lock);
+
+	while (!list_empty(&object->pending_ops)) {
+		op = list_entry(object->pending_ops.next,
+				struct fscache_operation, pend_link);
+		fscache_stat(&fscache_n_op_cancelled);
+		list_del_init(&op->pend_link);
+
+		ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+
+		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+			object->n_exclusive--;
+		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+			wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+		fscache_put_operation(op);
+		cond_resched_lock(&object->lock);
+	}
+
+	spin_unlock(&object->lock);
+	_leave("");
+}
+
+/*
+ * Record the completion or cancellation of an in-progress operation.
+ */
+void fscache_op_complete(struct fscache_operation *op, bool cancelled)
+{
+	struct fscache_object *object = op->object;
+
+	_enter("OBJ%x", object->debug_id);
+
+	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
+	ASSERTCMP(object->n_in_progress, >, 0);
+	ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
+		    object->n_exclusive, >, 0);
+	ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
+		    object->n_in_progress, ==, 1);
+
+	spin_lock(&object->lock);
+
+	op->state = cancelled ?
+		FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
+
+	if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+		object->n_exclusive--;
+	object->n_in_progress--;
+	if (object->n_in_progress == 0)
+		fscache_start_operations(object);
+
+	spin_unlock(&object->lock);
+	_leave("");
+}
+EXPORT_SYMBOL(fscache_op_complete);
+
+/*
+ * release an operation
+ * - queues pending ops if this is the last in-progress op
+ */
+void fscache_put_operation(struct fscache_operation *op)
+{
+	struct fscache_object *object;
+	struct fscache_cache *cache;
+
+	_enter("{OBJ%x OP%x,%d}",
+	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+	ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+	if (!atomic_dec_and_test(&op->usage))
+		return;
+
+	_debug("PUT OP");
+	ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE,
+		    op->state, ==, FSCACHE_OP_ST_CANCELLED);
+	op->state = FSCACHE_OP_ST_DEAD;
+
+	fscache_stat(&fscache_n_op_release);
+
+	if (op->release) {
+		op->release(op);
+		op->release = NULL;
+	}
+
+	object = op->object;
+
+	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+		atomic_dec(&object->n_reads);
+	if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+		fscache_unuse_cookie(object);
+
+	/* now... we may get called with the object spinlock held, so we
+	 * complete the cleanup here only if we can immediately acquire the
+	 * lock, and defer it otherwise */
+	if (!spin_trylock(&object->lock)) {
+		_debug("defer put");
+		fscache_stat(&fscache_n_op_deferred_release);
+
+		cache = object->cache;
+		spin_lock(&cache->op_gc_list_lock);
+		list_add_tail(&op->pend_link, &cache->op_gc_list);
+		spin_unlock(&cache->op_gc_list_lock);
+		schedule_work(&cache->op_gc);
+		_leave(" [defer]");
+		return;
+	}
+
+	ASSERTCMP(object->n_ops, >, 0);
+	object->n_ops--;
+	if (object->n_ops == 0)
+		fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
+
+	spin_unlock(&object->lock);
+
+	kfree(op);
+	_leave(" [done]");
+}
+EXPORT_SYMBOL(fscache_put_operation);
+
+/*
+ * garbage collect operations that have had their release deferred
+ */
+void fscache_operation_gc(struct work_struct *work)
+{
+	struct fscache_operation *op;
+	struct fscache_object *object;
+	struct fscache_cache *cache =
+		container_of(work, struct fscache_cache, op_gc);
+	int count = 0;
+
+	_enter("");
+
+	do {
+		spin_lock(&cache->op_gc_list_lock);
+		if (list_empty(&cache->op_gc_list)) {
+			spin_unlock(&cache->op_gc_list_lock);
+			break;
+		}
+
+		op = list_entry(cache->op_gc_list.next,
+				struct fscache_operation, pend_link);
+		list_del(&op->pend_link);
+		spin_unlock(&cache->op_gc_list_lock);
+
+		object = op->object;
+		spin_lock(&object->lock);
+
+		_debug("GC DEFERRED REL OBJ%x OP%x",
+		       object->debug_id, op->debug_id);
+		fscache_stat(&fscache_n_op_gc);
+
+		ASSERTCMP(atomic_read(&op->usage), ==, 0);
+		ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD);
+
+		ASSERTCMP(object->n_ops, >, 0);
+		object->n_ops--;
+		if (object->n_ops == 0)
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
+
+		spin_unlock(&object->lock);
+		kfree(op);
+
+	} while (count++ < 20);
+
+	if (!list_empty(&cache->op_gc_list))
+		schedule_work(&cache->op_gc);
+
+	_leave("");
+}
+
+/*
+ * execute an operation using fs_op_wq to provide processing context -
+ * the caller holds a ref to this object, so we don't need to hold one
+ */
+void fscache_op_work_func(struct work_struct *work)
+{
+	struct fscache_operation *op =
+		container_of(work, struct fscache_operation, work);
+	unsigned long start;
+
+	_enter("{OBJ%x OP%x,%d}",
+	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+	ASSERT(op->processor != NULL);
+	start = jiffies;
+	op->processor(op);
+	fscache_hist(fscache_ops_histogram, start);
+	fscache_put_operation(op);
+
+	_leave("");
+}
diff --git a/kernel/fs/fscache/page.c b/kernel/fs/fscache/page.c
new file mode 100644
index 000000000..de33b3fcc
--- /dev/null
+++ b/kernel/fs/fscache/page.c
@@ -0,0 +1,1195 @@
+/* Cache page management and data I/O routines
+ *
+ * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL PAGE
+#include <linux/module.h>
+#include <linux/fscache-cache.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * check to see if a page is being written to the cache
+ */
+bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+	void *val;
+
+	rcu_read_lock();
+	val = radix_tree_lookup(&cookie->stores, page->index);
+	rcu_read_unlock();
+
+	return val != NULL;
+}
+EXPORT_SYMBOL(__fscache_check_page_write);
+
+/*
+ * wait for a page to finish being written to the cache
+ */
+void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+
+	wait_event(*wq, !__fscache_check_page_write(cookie, page));
+}
+EXPORT_SYMBOL(__fscache_wait_on_page_write);
+
+/*
+ * wait for a page to finish being written to the cache. Put a timeout here
+ * since we might be called recursively via parent fs.
+ */
+static
+bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+
+	return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page),
+				  HZ);
+}
+
+/*
+ * decide whether a page can be released, possibly by cancelling a store to it
+ * - we're allowed to sleep if __GFP_WAIT is flagged
+ */
+bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
+				  struct page *page,
+				  gfp_t gfp)
+{
+	struct page *xpage;
+	void *val;
+
+	_enter("%p,%p,%x", cookie, page, gfp);
+
+try_again:
+	rcu_read_lock();
+	val = radix_tree_lookup(&cookie->stores, page->index);
+	if (!val) {
+		rcu_read_unlock();
+		fscache_stat(&fscache_n_store_vmscan_not_storing);
+		__fscache_uncache_page(cookie, page);
+		return true;
+	}
+
+	/* see if the page is actually undergoing storage - if so we can't get
+	 * rid of it till the cache has finished with it */
+	if (radix_tree_tag_get(&cookie->stores, page->index,
+			       FSCACHE_COOKIE_STORING_TAG)) {
+		rcu_read_unlock();
+		goto page_busy;
+	}
+
+	/* the page is pending storage, so we attempt to cancel the store and
+	 * discard the store request so that the page can be reclaimed */
+	spin_lock(&cookie->stores_lock);
+	rcu_read_unlock();
+
+	if (radix_tree_tag_get(&cookie->stores, page->index,
+			       FSCACHE_COOKIE_STORING_TAG)) {
+		/* the page started to undergo storage whilst we were looking,
+		 * so now we can only wait or return */
+		spin_unlock(&cookie->stores_lock);
+		goto page_busy;
+	}
+
+	xpage = radix_tree_delete(&cookie->stores, page->index);
+	spin_unlock(&cookie->stores_lock);
+
+	if (xpage) {
+		fscache_stat(&fscache_n_store_vmscan_cancelled);
+		fscache_stat(&fscache_n_store_radix_deletes);
+		ASSERTCMP(xpage, ==, page);
+	} else {
+		fscache_stat(&fscache_n_store_vmscan_gone);
+	}
+
+	wake_up_bit(&cookie->flags, 0);
+	if (xpage)
+		page_cache_release(xpage);
+	__fscache_uncache_page(cookie, page);
+	return true;
+
+page_busy:
+	/* We will wait here if we're allowed to, but that could deadlock the
+	 * allocator as the work threads writing to the cache may all end up
+	 * sleeping on memory allocation, so we may need to impose a timeout
+	 * too. */
+	if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
+		fscache_stat(&fscache_n_store_vmscan_busy);
+		return false;
+	}
+
+	fscache_stat(&fscache_n_store_vmscan_wait);
+	if (!release_page_wait_timeout(cookie, page))
+		_debug("fscache writeout timeout page: %p{%lx}",
+			page, page->index);
+
+	gfp &= ~__GFP_WAIT;
+	goto try_again;
+}
+EXPORT_SYMBOL(__fscache_maybe_release_page);
+
+/*
+ * note that a page has finished being written to the cache
+ */
+static void fscache_end_page_write(struct fscache_object *object,
+				   struct page *page)
+{
+	struct fscache_cookie *cookie;
+	struct page *xpage = NULL;
+
+	spin_lock(&object->lock);
+	cookie = object->cookie;
+	if (cookie) {
+		/* delete the page from the tree if it is now no longer
+		 * pending */
+		spin_lock(&cookie->stores_lock);
+		radix_tree_tag_clear(&cookie->stores, page->index,
+				     FSCACHE_COOKIE_STORING_TAG);
+		if (!radix_tree_tag_get(&cookie->stores, page->index,
+					FSCACHE_COOKIE_PENDING_TAG)) {
+			fscache_stat(&fscache_n_store_radix_deletes);
+			xpage = radix_tree_delete(&cookie->stores, page->index);
+		}
+		spin_unlock(&cookie->stores_lock);
+		wake_up_bit(&cookie->flags, 0);
+	}
+	spin_unlock(&object->lock);
+	if (xpage)
+		page_cache_release(xpage);
+}
+
+/*
+ * actually apply the changed attributes to a cache object
+ */
+static void fscache_attr_changed_op(struct fscache_operation *op)
+{
+	struct fscache_object *object = op->object;
+	int ret;
+
+	_enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
+
+	fscache_stat(&fscache_n_attr_changed_calls);
+
+	if (fscache_object_is_active(object)) {
+		fscache_stat(&fscache_n_cop_attr_changed);
+		ret = object->cache->ops->attr_changed(object);
+		fscache_stat_d(&fscache_n_cop_attr_changed);
+		if (ret < 0)
+			fscache_abort_object(object);
+	}
+
+	fscache_op_complete(op, true);
+	_leave("");
+}
+
+/*
+ * notification that the attributes on an object have changed
+ */
+int __fscache_attr_changed(struct fscache_cookie *cookie)
+{
+	struct fscache_operation *op;
+	struct fscache_object *object;
+	bool wake_cookie = false;
+
+	_enter("%p", cookie);
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+
+	fscache_stat(&fscache_n_attr_changed);
+
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op) {
+		fscache_stat(&fscache_n_attr_changed_nomem);
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	fscache_operation_init(op, fscache_attr_changed_op, NULL);
+	op->flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_EXCLUSIVE) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
+
+	spin_lock(&cookie->lock);
+
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	__fscache_use_cookie(cookie);
+	if (fscache_submit_exclusive_op(object, op) < 0)
+		goto nobufs_dec;
+	spin_unlock(&cookie->lock);
+	fscache_stat(&fscache_n_attr_changed_ok);
+	fscache_put_operation(op);
+	_leave(" = 0");
+	return 0;
+
+nobufs_dec:
+	wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs:
+	spin_unlock(&cookie->lock);
+	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
+	fscache_stat(&fscache_n_attr_changed_nobufs);
+	_leave(" = %d", -ENOBUFS);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_attr_changed);
+
+/*
+ * release a retrieval op reference
+ */
+static void fscache_release_retrieval_op(struct fscache_operation *_op)
+{
+	struct fscache_retrieval *op =
+		container_of(_op, struct fscache_retrieval, op);
+
+	_enter("{OP%x}", op->op.debug_id);
+
+	ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
+
+	fscache_hist(fscache_retrieval_histogram, op->start_time);
+	if (op->context)
+		fscache_put_context(op->op.object->cookie, op->context);
+
+	_leave("");
+}
+
+/*
+ * allocate a retrieval op
+ */
+static struct fscache_retrieval *fscache_alloc_retrieval(
+	struct fscache_cookie *cookie,
+	struct address_space *mapping,
+	fscache_rw_complete_t end_io_func,
+	void *context)
+{
+	struct fscache_retrieval *op;
+
+	/* allocate a retrieval operation and attempt to submit it */
+	op = kzalloc(sizeof(*op), GFP_NOIO);
+	if (!op) {
+		fscache_stat(&fscache_n_retrievals_nomem);
+		return NULL;
+	}
+
+	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
+	op->op.flags	= FSCACHE_OP_MYTHREAD |
+		(1UL << FSCACHE_OP_WAITING) |
+		(1UL << FSCACHE_OP_UNUSE_COOKIE);
+	op->mapping	= mapping;
+	op->end_io_func	= end_io_func;
+	op->context	= context;
+	op->start_time	= jiffies;
+	INIT_LIST_HEAD(&op->to_do);
+	return op;
+}
+
+/*
+ * wait for a deferred lookup to complete
+ */
+int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
+{
+	unsigned long jif;
+
+	_enter("");
+
+	if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
+		_leave(" = 0 [imm]");
+		return 0;
+	}
+
+	fscache_stat(&fscache_n_retrievals_wait);
+
+	jif = jiffies;
+	if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
+			TASK_INTERRUPTIBLE) != 0) {
+		fscache_stat(&fscache_n_retrievals_intr);
+		_leave(" = -ERESTARTSYS");
+		return -ERESTARTSYS;
+	}
+
+	ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
+
+	smp_rmb();
+	fscache_hist(fscache_retrieval_delay_histogram, jif);
+	_leave(" = 0 [dly]");
+	return 0;
+}
+
+/*
+ * Handle cancellation of a pending retrieval op
+ */
+static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
+{
+	struct fscache_retrieval *op =
+		container_of(_op, struct fscache_retrieval, op);
+
+	atomic_set(&op->n_pages, 0);
+}
+
+/*
+ * wait for an object to become active (or dead)
+ */
+int fscache_wait_for_operation_activation(struct fscache_object *object,
+					  struct fscache_operation *op,
+					  atomic_t *stat_op_waits,
+					  atomic_t *stat_object_dead,
+					  void (*do_cancel)(struct fscache_operation *))
+{
+	int ret;
+
+	if (!test_bit(FSCACHE_OP_WAITING, &op->flags))
+		goto check_if_dead;
+
+	_debug(">>> WT");
+	if (stat_op_waits)
+		fscache_stat(stat_op_waits);
+	if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
+			TASK_INTERRUPTIBLE) != 0) {
+		ret = fscache_cancel_op(op, do_cancel);
+		if (ret == 0)
+			return -ERESTARTSYS;
+
+		/* it's been removed from the pending queue by another party,
+		 * so we should get to run shortly */
+		wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
+			    TASK_UNINTERRUPTIBLE);
+	}
+	_debug("<<< GO");
+
+check_if_dead:
+	if (op->state == FSCACHE_OP_ST_CANCELLED) {
+		if (stat_object_dead)
+			fscache_stat(stat_object_dead);
+		_leave(" = -ENOBUFS [cancelled]");
+		return -ENOBUFS;
+	}
+	if (unlikely(fscache_object_is_dead(object))) {
+		pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state);
+		fscache_cancel_op(op, do_cancel);
+		if (stat_object_dead)
+			fscache_stat(stat_object_dead);
+		return -ENOBUFS;
+	}
+	return 0;
+}
+
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - we return:
+ *   -ENOMEM	- out of memory, nothing done
+ *   -ERESTARTSYS - interrupted
+ *   -ENOBUFS	- no backing object available in which to cache the block
+ *   -ENODATA	- no data available in the backing object for this block
+ *   0		- dispatched a read - it'll call end_io_func() when finished
+ */
+int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
+				 struct page *page,
+				 fscache_rw_complete_t end_io_func,
+				 void *context,
+				 gfp_t gfp)
+{
+	struct fscache_retrieval *op;
+	struct fscache_object *object;
+	bool wake_cookie = false;
+	int ret;
+
+	_enter("%p,%p,,,", cookie, page);
+
+	fscache_stat(&fscache_n_retrievals);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+
+	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+		_leave(" = -ENOBUFS [invalidating]");
+		return -ENOBUFS;
+	}
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERTCMP(page, !=, NULL);
+
+	if (fscache_wait_for_deferred_lookup(cookie) < 0)
+		return -ERESTARTSYS;
+
+	op = fscache_alloc_retrieval(cookie, page->mapping,
+				     end_io_func, context);
+	if (!op) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+	atomic_set(&op->n_pages, 1);
+
+	spin_lock(&cookie->lock);
+
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
+		goto nobufs_unlock;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
+
+	__fscache_use_cookie(cookie);
+	atomic_inc(&object->n_reads);
+	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto nobufs_unlock_dec;
+	spin_unlock(&cookie->lock);
+
+	fscache_stat(&fscache_n_retrieval_ops);
+
+	/* pin the netfs read context in case we need to do the actual netfs
+	 * read because we've encountered a cache read failure */
+	fscache_get_context(object->cookie, op->context);
+
+	/* we wait for the operation to become active, and then process it
+	 * *here*, in this thread, and not in the thread pool */
+	ret = fscache_wait_for_operation_activation(
+		object, &op->op,
+		__fscache_stat(&fscache_n_retrieval_op_waits),
+		__fscache_stat(&fscache_n_retrievals_object_dead),
+		fscache_do_cancel_retrieval);
+	if (ret < 0)
+		goto error;
+
+	/* ask the cache to honour the operation */
+	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+		fscache_stat(&fscache_n_cop_allocate_page);
+		ret = object->cache->ops->allocate_page(op, page, gfp);
+		fscache_stat_d(&fscache_n_cop_allocate_page);
+		if (ret == 0)
+			ret = -ENODATA;
+	} else {
+		fscache_stat(&fscache_n_cop_read_or_alloc_page);
+		ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
+		fscache_stat_d(&fscache_n_cop_read_or_alloc_page);
+	}
+
+error:
+	if (ret == -ENOMEM)
+		fscache_stat(&fscache_n_retrievals_nomem);
+	else if (ret == -ERESTARTSYS)
+		fscache_stat(&fscache_n_retrievals_intr);
+	else if (ret == -ENODATA)
+		fscache_stat(&fscache_n_retrievals_nodata);
+	else if (ret < 0)
+		fscache_stat(&fscache_n_retrievals_nobufs);
+	else
+		fscache_stat(&fscache_n_retrievals_ok);
+
+	fscache_put_retrieval(op);
+	_leave(" = %d", ret);
+	return ret;
+
+nobufs_unlock_dec:
+	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs_unlock:
+	spin_unlock(&cookie->lock);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
+	kfree(op);
+nobufs:
+	fscache_stat(&fscache_n_retrievals_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_read_or_alloc_page);
+
+/*
+ * read a list of page from the cache or allocate a block in which to store
+ * them
+ * - we return:
+ *   -ENOMEM	- out of memory, some pages may be being read
+ *   -ERESTARTSYS - interrupted, some pages may be being read
+ *   -ENOBUFS	- no backing object or space available in which to cache any
+ *                pages not being read
+ *   -ENODATA	- no data available in the backing object for some or all of
+ *                the pages
+ *   0		- dispatched a read on all pages
+ *
+ * end_io_func() will be called for each page read from the cache as it is
+ * finishes being read
+ *
+ * any pages for which a read is dispatched will be removed from pages and
+ * nr_pages
+ */
+int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned *nr_pages,
+				  fscache_rw_complete_t end_io_func,
+				  void *context,
+				  gfp_t gfp)
+{
+	struct fscache_retrieval *op;
+	struct fscache_object *object;
+	bool wake_cookie = false;
+	int ret;
+
+	_enter("%p,,%d,,,", cookie, *nr_pages);
+
+	fscache_stat(&fscache_n_retrievals);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+
+	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+		_leave(" = -ENOBUFS [invalidating]");
+		return -ENOBUFS;
+	}
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERTCMP(*nr_pages, >, 0);
+	ASSERT(!list_empty(pages));
+
+	if (fscache_wait_for_deferred_lookup(cookie) < 0)
+		return -ERESTARTSYS;
+
+	op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
+	if (!op)
+		return -ENOMEM;
+	atomic_set(&op->n_pages, *nr_pages);
+
+	spin_lock(&cookie->lock);
+
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
+		goto nobufs_unlock;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	__fscache_use_cookie(cookie);
+	atomic_inc(&object->n_reads);
+	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto nobufs_unlock_dec;
+	spin_unlock(&cookie->lock);
+
+	fscache_stat(&fscache_n_retrieval_ops);
+
+	/* pin the netfs read context in case we need to do the actual netfs
+	 * read because we've encountered a cache read failure */
+	fscache_get_context(object->cookie, op->context);
+
+	/* we wait for the operation to become active, and then process it
+	 * *here*, in this thread, and not in the thread pool */
+	ret = fscache_wait_for_operation_activation(
+		object, &op->op,
+		__fscache_stat(&fscache_n_retrieval_op_waits),
+		__fscache_stat(&fscache_n_retrievals_object_dead),
+		fscache_do_cancel_retrieval);
+	if (ret < 0)
+		goto error;
+
+	/* ask the cache to honour the operation */
+	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+		fscache_stat(&fscache_n_cop_allocate_pages);
+		ret = object->cache->ops->allocate_pages(
+			op, pages, nr_pages, gfp);
+		fscache_stat_d(&fscache_n_cop_allocate_pages);
+	} else {
+		fscache_stat(&fscache_n_cop_read_or_alloc_pages);
+		ret = object->cache->ops->read_or_alloc_pages(
+			op, pages, nr_pages, gfp);
+		fscache_stat_d(&fscache_n_cop_read_or_alloc_pages);
+	}
+
+error:
+	if (ret == -ENOMEM)
+		fscache_stat(&fscache_n_retrievals_nomem);
+	else if (ret == -ERESTARTSYS)
+		fscache_stat(&fscache_n_retrievals_intr);
+	else if (ret == -ENODATA)
+		fscache_stat(&fscache_n_retrievals_nodata);
+	else if (ret < 0)
+		fscache_stat(&fscache_n_retrievals_nobufs);
+	else
+		fscache_stat(&fscache_n_retrievals_ok);
+
+	fscache_put_retrieval(op);
+	_leave(" = %d", ret);
+	return ret;
+
+nobufs_unlock_dec:
+	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs_unlock:
+	spin_unlock(&cookie->lock);
+	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
+nobufs:
+	fscache_stat(&fscache_n_retrievals_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
+
+/*
+ * allocate a block in the cache on which to store a page
+ * - we return:
+ *   -ENOMEM	- out of memory, nothing done
+ *   -ERESTARTSYS - interrupted
+ *   -ENOBUFS	- no backing object available in which to cache the block
+ *   0		- block allocated
+ */
+int __fscache_alloc_page(struct fscache_cookie *cookie,
+			 struct page *page,
+			 gfp_t gfp)
+{
+	struct fscache_retrieval *op;
+	struct fscache_object *object;
+	bool wake_cookie = false;
+	int ret;
+
+	_enter("%p,%p,,,", cookie, page);
+
+	fscache_stat(&fscache_n_allocs);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERTCMP(page, !=, NULL);
+
+	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+		_leave(" = -ENOBUFS [invalidating]");
+		return -ENOBUFS;
+	}
+
+	if (fscache_wait_for_deferred_lookup(cookie) < 0)
+		return -ERESTARTSYS;
+
+	op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
+	if (!op)
+		return -ENOMEM;
+	atomic_set(&op->n_pages, 1);
+
+	spin_lock(&cookie->lock);
+
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
+		goto nobufs_unlock;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	__fscache_use_cookie(cookie);
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto nobufs_unlock_dec;
+	spin_unlock(&cookie->lock);
+
+	fscache_stat(&fscache_n_alloc_ops);
+
+	ret = fscache_wait_for_operation_activation(
+		object, &op->op,
+		__fscache_stat(&fscache_n_alloc_op_waits),
+		__fscache_stat(&fscache_n_allocs_object_dead),
+		fscache_do_cancel_retrieval);
+	if (ret < 0)
+		goto error;
+
+	/* ask the cache to honour the operation */
+	fscache_stat(&fscache_n_cop_allocate_page);
+	ret = object->cache->ops->allocate_page(op, page, gfp);
+	fscache_stat_d(&fscache_n_cop_allocate_page);
+
+error:
+	if (ret == -ERESTARTSYS)
+		fscache_stat(&fscache_n_allocs_intr);
+	else if (ret < 0)
+		fscache_stat(&fscache_n_allocs_nobufs);
+	else
+		fscache_stat(&fscache_n_allocs_ok);
+
+	fscache_put_retrieval(op);
+	_leave(" = %d", ret);
+	return ret;
+
+nobufs_unlock_dec:
+	wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs_unlock:
+	spin_unlock(&cookie->lock);
+	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
+nobufs:
+	fscache_stat(&fscache_n_allocs_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_alloc_page);
+
+/*
+ * Unmark pages allocate in the readahead code path (via:
+ * fscache_readpages_or_alloc) after delegating to the base filesystem
+ */
+void __fscache_readpages_cancel(struct fscache_cookie *cookie,
+				struct list_head *pages)
+{
+	struct page *page;
+
+	list_for_each_entry(page, pages, lru) {
+		if (PageFsCache(page))
+			__fscache_uncache_page(cookie, page);
+	}
+}
+EXPORT_SYMBOL(__fscache_readpages_cancel);
+
+/*
+ * release a write op reference
+ */
+static void fscache_release_write_op(struct fscache_operation *_op)
+{
+	_enter("{OP%x}", _op->debug_id);
+}
+
+/*
+ * perform the background storage of a page into the cache
+ */
+static void fscache_write_op(struct fscache_operation *_op)
+{
+	struct fscache_storage *op =
+		container_of(_op, struct fscache_storage, op);
+	struct fscache_object *object = op->op.object;
+	struct fscache_cookie *cookie;
+	struct page *page;
+	unsigned n;
+	void *results[1];
+	int ret;
+
+	_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
+
+	spin_lock(&object->lock);
+	cookie = object->cookie;
+
+	if (!fscache_object_is_active(object)) {
+		/* If we get here, then the on-disk cache object likely longer
+		 * exists, so we should just cancel this write operation.
+		 */
+		spin_unlock(&object->lock);
+		fscache_op_complete(&op->op, false);
+		_leave(" [inactive]");
+		return;
+	}
+
+	if (!cookie) {
+		/* If we get here, then the cookie belonging to the object was
+		 * detached, probably by the cookie being withdrawn due to
+		 * memory pressure, which means that the pages we might write
+		 * to the cache from no longer exist - therefore, we can just
+		 * cancel this write operation.
+		 */
+		spin_unlock(&object->lock);
+		fscache_op_complete(&op->op, false);
+		_leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
+		       _op->flags, _op->state, object->state->short_name,
+		       object->flags);
+		return;
+	}
+
+	spin_lock(&cookie->stores_lock);
+
+	fscache_stat(&fscache_n_store_calls);
+
+	/* find a page to store */
+	page = NULL;
+	n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
+				       FSCACHE_COOKIE_PENDING_TAG);
+	if (n != 1)
+		goto superseded;
+	page = results[0];
+	_debug("gang %d [%lx]", n, page->index);
+	if (page->index > op->store_limit) {
+		fscache_stat(&fscache_n_store_pages_over_limit);
+		goto superseded;
+	}
+
+	radix_tree_tag_set(&cookie->stores, page->index,
+			   FSCACHE_COOKIE_STORING_TAG);
+	radix_tree_tag_clear(&cookie->stores, page->index,
+			     FSCACHE_COOKIE_PENDING_TAG);
+
+	spin_unlock(&cookie->stores_lock);
+	spin_unlock(&object->lock);
+
+	fscache_stat(&fscache_n_store_pages);
+	fscache_stat(&fscache_n_cop_write_page);
+	ret = object->cache->ops->write_page(op, page);
+	fscache_stat_d(&fscache_n_cop_write_page);
+	fscache_end_page_write(object, page);
+	if (ret < 0) {
+		fscache_abort_object(object);
+		fscache_op_complete(&op->op, true);
+	} else {
+		fscache_enqueue_operation(&op->op);
+	}
+
+	_leave("");
+	return;
+
+superseded:
+	/* this writer is going away and there aren't any more things to
+	 * write */
+	_debug("cease");
+	spin_unlock(&cookie->stores_lock);
+	clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+	spin_unlock(&object->lock);
+	fscache_op_complete(&op->op, true);
+	_leave("");
+}
+
+/*
+ * Clear the pages pending writing for invalidation
+ */
+void fscache_invalidate_writes(struct fscache_cookie *cookie)
+{
+	struct page *page;
+	void *results[16];
+	int n, i;
+
+	_enter("");
+
+	for (;;) {
+		spin_lock(&cookie->stores_lock);
+		n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+					       ARRAY_SIZE(results),
+					       FSCACHE_COOKIE_PENDING_TAG);
+		if (n == 0) {
+			spin_unlock(&cookie->stores_lock);
+			break;
+		}
+
+		for (i = n - 1; i >= 0; i--) {
+			page = results[i];
+			radix_tree_delete(&cookie->stores, page->index);
+		}
+
+		spin_unlock(&cookie->stores_lock);
+
+		for (i = n - 1; i >= 0; i--)
+			page_cache_release(results[i]);
+	}
+
+	_leave("");
+}
+
+/*
+ * request a page be stored in the cache
+ * - returns:
+ *   -ENOMEM	- out of memory, nothing done
+ *   -ENOBUFS	- no backing object available in which to cache the page
+ *   0		- dispatched a write - it'll call end_io_func() when finished
+ *
+ * if the cookie still has a backing object at this point, that object can be
+ * in one of a few states with respect to storage processing:
+ *
+ *  (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
+ *      set)
+ *
+ *	(a) no writes yet
+ *
+ *	(b) writes deferred till post-creation (mark page for writing and
+ *	    return immediately)
+ *
+ *  (2) negative lookup, object created, initial fill being made from netfs
+ *
+ *	(a) fill point not yet reached this page (mark page for writing and
+ *          return)
+ *
+ *	(b) fill point passed this page (queue op to store this page)
+ *
+ *  (3) object extant (queue op to store this page)
+ *
+ * any other state is invalid
+ */
+int __fscache_write_page(struct fscache_cookie *cookie,
+			 struct page *page,
+			 gfp_t gfp)
+{
+	struct fscache_storage *op;
+	struct fscache_object *object;
+	bool wake_cookie = false;
+	int ret;
+
+	_enter("%p,%x,", cookie, (u32) page->flags);
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERT(PageFsCache(page));
+
+	fscache_stat(&fscache_n_stores);
+
+	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+		_leave(" = -ENOBUFS [invalidating]");
+		return -ENOBUFS;
+	}
+
+	op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
+	if (!op)
+		goto nomem;
+
+	fscache_operation_init(&op->op, fscache_write_op,
+			       fscache_release_write_op);
+	op->op.flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_WAITING) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
+
+	ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
+	if (ret < 0)
+		goto nomem_free;
+
+	ret = -ENOBUFS;
+	spin_lock(&cookie->lock);
+
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+	if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
+		goto nobufs;
+
+	/* add the page to the pending-storage radix tree on the backing
+	 * object */
+	spin_lock(&object->lock);
+	spin_lock(&cookie->stores_lock);
+
+	_debug("store limit %llx", (unsigned long long) object->store_limit);
+
+	ret = radix_tree_insert(&cookie->stores, page->index, page);
+	if (ret < 0) {
+		if (ret == -EEXIST)
+			goto already_queued;
+		_debug("insert failed %d", ret);
+		goto nobufs_unlock_obj;
+	}
+
+	radix_tree_tag_set(&cookie->stores, page->index,
+			   FSCACHE_COOKIE_PENDING_TAG);
+	page_cache_get(page);
+
+	/* we only want one writer at a time, but we do need to queue new
+	 * writers after exclusive ops */
+	if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
+		goto already_pending;
+
+	spin_unlock(&cookie->stores_lock);
+	spin_unlock(&object->lock);
+
+	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
+	op->store_limit = object->store_limit;
+
+	__fscache_use_cookie(cookie);
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto submit_failed;
+
+	spin_unlock(&cookie->lock);
+	radix_tree_preload_end();
+	fscache_stat(&fscache_n_store_ops);
+	fscache_stat(&fscache_n_stores_ok);
+
+	/* the work queue now carries its own ref on the object */
+	fscache_put_operation(&op->op);
+	_leave(" = 0");
+	return 0;
+
+already_queued:
+	fscache_stat(&fscache_n_stores_again);
+already_pending:
+	spin_unlock(&cookie->stores_lock);
+	spin_unlock(&object->lock);
+	spin_unlock(&cookie->lock);
+	radix_tree_preload_end();
+	kfree(op);
+	fscache_stat(&fscache_n_stores_ok);
+	_leave(" = 0");
+	return 0;
+
+submit_failed:
+	spin_lock(&cookie->stores_lock);
+	radix_tree_delete(&cookie->stores, page->index);
+	spin_unlock(&cookie->stores_lock);
+	wake_cookie = __fscache_unuse_cookie(cookie);
+	page_cache_release(page);
+	ret = -ENOBUFS;
+	goto nobufs;
+
+nobufs_unlock_obj:
+	spin_unlock(&cookie->stores_lock);
+	spin_unlock(&object->lock);
+nobufs:
+	spin_unlock(&cookie->lock);
+	radix_tree_preload_end();
+	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
+	fscache_stat(&fscache_n_stores_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+
+nomem_free:
+	kfree(op);
+nomem:
+	fscache_stat(&fscache_n_stores_oom);
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(__fscache_write_page);
+
+/*
+ * remove a page from the cache
+ */
+void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
+{
+	struct fscache_object *object;
+
+	_enter(",%p", page);
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERTCMP(page, !=, NULL);
+
+	fscache_stat(&fscache_n_uncaches);
+
+	/* cache withdrawal may beat us to it */
+	if (!PageFsCache(page))
+		goto done;
+
+	/* get the object */
+	spin_lock(&cookie->lock);
+
+	if (hlist_empty(&cookie->backing_objects)) {
+		ClearPageFsCache(page);
+		goto done_unlock;
+	}
+
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	/* there might now be stuff on disk we could read */
+	clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+	/* only invoke the cache backend if we managed to mark the page
+	 * uncached here; this deals with synchronisation vs withdrawal */
+	if (TestClearPageFsCache(page) &&
+	    object->cache->ops->uncache_page) {
+		/* the cache backend releases the cookie lock */
+		fscache_stat(&fscache_n_cop_uncache_page);
+		object->cache->ops->uncache_page(object, page);
+		fscache_stat_d(&fscache_n_cop_uncache_page);
+		goto done;
+	}
+
+done_unlock:
+	spin_unlock(&cookie->lock);
+done:
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_uncache_page);
+
+/**
+ * fscache_mark_page_cached - Mark a page as being cached
+ * @op: The retrieval op pages are being marked for
+ * @page: The page to be marked
+ *
+ * Mark a netfs page as being cached.  After this is called, the netfs
+ * must call fscache_uncache_page() to remove the mark.
+ */
+void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
+{
+	struct fscache_cookie *cookie = op->op.object->cookie;
+
+#ifdef CONFIG_FSCACHE_STATS
+	atomic_inc(&fscache_n_marks);
+#endif
+
+	_debug("- mark %p{%lx}", page, page->index);
+	if (TestSetPageFsCache(page)) {
+		static bool once_only;
+		if (!once_only) {
+			once_only = true;
+			pr_warn("Cookie type %s marked page %lx multiple times\n",
+				cookie->def->name, page->index);
+		}
+	}
+
+	if (cookie->def->mark_page_cached)
+		cookie->def->mark_page_cached(cookie->netfs_data,
+					      op->mapping, page);
+}
+EXPORT_SYMBOL(fscache_mark_page_cached);
+
+/**
+ * fscache_mark_pages_cached - Mark pages as being cached
+ * @op: The retrieval op pages are being marked for
+ * @pagevec: The pages to be marked
+ *
+ * Mark a bunch of netfs pages as being cached.  After this is called,
+ * the netfs must call fscache_uncache_page() to remove the mark.
+ */
+void fscache_mark_pages_cached(struct fscache_retrieval *op,
+			       struct pagevec *pagevec)
+{
+	unsigned long loop;
+
+	for (loop = 0; loop < pagevec->nr; loop++)
+		fscache_mark_page_cached(op, pagevec->pages[loop]);
+
+	pagevec_reinit(pagevec);
+}
+EXPORT_SYMBOL(fscache_mark_pages_cached);
+
+/*
+ * Uncache all the pages in an inode that are marked PG_fscache, assuming them
+ * to be associated with the given cookie.
+ */
+void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
+				       struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	pgoff_t next;
+	int i;
+
+	_enter("%p,%p", cookie, inode);
+
+	if (!mapping || mapping->nrpages == 0) {
+		_leave(" [no pages]");
+		return;
+	}
+
+	pagevec_init(&pvec, 0);
+	next = 0;
+	do {
+		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+			break;
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			next = page->index;
+			if (PageFsCache(page)) {
+				__fscache_wait_on_page_write(cookie, page);
+				__fscache_uncache_page(cookie, page);
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	} while (++next);
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_uncache_all_inode_pages);
diff --git a/kernel/fs/fscache/proc.c b/kernel/fs/fscache/proc.c
new file mode 100644
index 000000000..1d9e4951a
--- /dev/null
+++ b/kernel/fs/fscache/proc.c
@@ -0,0 +1,81 @@
+/* FS-Cache statistics viewing interface
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL OPERATION
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * initialise the /proc/fs/fscache/ directory
+ */
+int __init fscache_proc_init(void)
+{
+	_enter("");
+
+	if (!proc_mkdir("fs/fscache", NULL))
+		goto error_dir;
+
+#ifdef CONFIG_FSCACHE_STATS
+	if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
+			 &fscache_stats_fops))
+		goto error_stats;
+#endif
+
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+	if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
+			 &fscache_histogram_fops))
+		goto error_histogram;
+#endif
+
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+	if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
+			 &fscache_objlist_fops))
+		goto error_objects;
+#endif
+
+	_leave(" = 0");
+	return 0;
+
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+error_objects:
+#endif
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+	remove_proc_entry("fs/fscache/histogram", NULL);
+error_histogram:
+#endif
+#ifdef CONFIG_FSCACHE_STATS
+	remove_proc_entry("fs/fscache/stats", NULL);
+error_stats:
+#endif
+	remove_proc_entry("fs/fscache", NULL);
+error_dir:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/fscache/ directory
+ */
+void fscache_proc_cleanup(void)
+{
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+	remove_proc_entry("fs/fscache/objects", NULL);
+#endif
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+	remove_proc_entry("fs/fscache/histogram", NULL);
+#endif
+#ifdef CONFIG_FSCACHE_STATS
+	remove_proc_entry("fs/fscache/stats", NULL);
+#endif
+	remove_proc_entry("fs/fscache", NULL);
+}
diff --git a/kernel/fs/fscache/stats.c b/kernel/fs/fscache/stats.c
new file mode 100644
index 000000000..40d13c70e
--- /dev/null
+++ b/kernel/fs/fscache/stats.c
@@ -0,0 +1,291 @@
+/* FS-Cache statistics
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL THREAD
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * operation counters
+ */
+atomic_t fscache_n_op_pend;
+atomic_t fscache_n_op_run;
+atomic_t fscache_n_op_enqueue;
+atomic_t fscache_n_op_requeue;
+atomic_t fscache_n_op_deferred_release;
+atomic_t fscache_n_op_release;
+atomic_t fscache_n_op_gc;
+atomic_t fscache_n_op_cancelled;
+atomic_t fscache_n_op_rejected;
+
+atomic_t fscache_n_attr_changed;
+atomic_t fscache_n_attr_changed_ok;
+atomic_t fscache_n_attr_changed_nobufs;
+atomic_t fscache_n_attr_changed_nomem;
+atomic_t fscache_n_attr_changed_calls;
+
+atomic_t fscache_n_allocs;
+atomic_t fscache_n_allocs_ok;
+atomic_t fscache_n_allocs_wait;
+atomic_t fscache_n_allocs_nobufs;
+atomic_t fscache_n_allocs_intr;
+atomic_t fscache_n_allocs_object_dead;
+atomic_t fscache_n_alloc_ops;
+atomic_t fscache_n_alloc_op_waits;
+
+atomic_t fscache_n_retrievals;
+atomic_t fscache_n_retrievals_ok;
+atomic_t fscache_n_retrievals_wait;
+atomic_t fscache_n_retrievals_nodata;
+atomic_t fscache_n_retrievals_nobufs;
+atomic_t fscache_n_retrievals_intr;
+atomic_t fscache_n_retrievals_nomem;
+atomic_t fscache_n_retrievals_object_dead;
+atomic_t fscache_n_retrieval_ops;
+atomic_t fscache_n_retrieval_op_waits;
+
+atomic_t fscache_n_stores;
+atomic_t fscache_n_stores_ok;
+atomic_t fscache_n_stores_again;
+atomic_t fscache_n_stores_nobufs;
+atomic_t fscache_n_stores_oom;
+atomic_t fscache_n_store_ops;
+atomic_t fscache_n_store_calls;
+atomic_t fscache_n_store_pages;
+atomic_t fscache_n_store_radix_deletes;
+atomic_t fscache_n_store_pages_over_limit;
+
+atomic_t fscache_n_store_vmscan_not_storing;
+atomic_t fscache_n_store_vmscan_gone;
+atomic_t fscache_n_store_vmscan_busy;
+atomic_t fscache_n_store_vmscan_cancelled;
+atomic_t fscache_n_store_vmscan_wait;
+
+atomic_t fscache_n_marks;
+atomic_t fscache_n_uncaches;
+
+atomic_t fscache_n_acquires;
+atomic_t fscache_n_acquires_null;
+atomic_t fscache_n_acquires_no_cache;
+atomic_t fscache_n_acquires_ok;
+atomic_t fscache_n_acquires_nobufs;
+atomic_t fscache_n_acquires_oom;
+
+atomic_t fscache_n_invalidates;
+atomic_t fscache_n_invalidates_run;
+
+atomic_t fscache_n_updates;
+atomic_t fscache_n_updates_null;
+atomic_t fscache_n_updates_run;
+
+atomic_t fscache_n_relinquishes;
+atomic_t fscache_n_relinquishes_null;
+atomic_t fscache_n_relinquishes_waitcrt;
+atomic_t fscache_n_relinquishes_retire;
+
+atomic_t fscache_n_cookie_index;
+atomic_t fscache_n_cookie_data;
+atomic_t fscache_n_cookie_special;
+
+atomic_t fscache_n_object_alloc;
+atomic_t fscache_n_object_no_alloc;
+atomic_t fscache_n_object_lookups;
+atomic_t fscache_n_object_lookups_negative;
+atomic_t fscache_n_object_lookups_positive;
+atomic_t fscache_n_object_lookups_timed_out;
+atomic_t fscache_n_object_created;
+atomic_t fscache_n_object_avail;
+atomic_t fscache_n_object_dead;
+
+atomic_t fscache_n_checkaux_none;
+atomic_t fscache_n_checkaux_okay;
+atomic_t fscache_n_checkaux_update;
+atomic_t fscache_n_checkaux_obsolete;
+
+atomic_t fscache_n_cop_alloc_object;
+atomic_t fscache_n_cop_lookup_object;
+atomic_t fscache_n_cop_lookup_complete;
+atomic_t fscache_n_cop_grab_object;
+atomic_t fscache_n_cop_invalidate_object;
+atomic_t fscache_n_cop_update_object;
+atomic_t fscache_n_cop_drop_object;
+atomic_t fscache_n_cop_put_object;
+atomic_t fscache_n_cop_sync_cache;
+atomic_t fscache_n_cop_attr_changed;
+atomic_t fscache_n_cop_read_or_alloc_page;
+atomic_t fscache_n_cop_read_or_alloc_pages;
+atomic_t fscache_n_cop_allocate_page;
+atomic_t fscache_n_cop_allocate_pages;
+atomic_t fscache_n_cop_write_page;
+atomic_t fscache_n_cop_uncache_page;
+atomic_t fscache_n_cop_dissociate_pages;
+
+/*
+ * display the general statistics
+ */
+static int fscache_stats_show(struct seq_file *m, void *v)
+{
+	seq_puts(m, "FS-Cache statistics\n");
+
+	seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
+		   atomic_read(&fscache_n_cookie_index),
+		   atomic_read(&fscache_n_cookie_data),
+		   atomic_read(&fscache_n_cookie_special));
+
+	seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
+		   atomic_read(&fscache_n_object_alloc),
+		   atomic_read(&fscache_n_object_no_alloc),
+		   atomic_read(&fscache_n_object_avail),
+		   atomic_read(&fscache_n_object_dead));
+	seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
+		   atomic_read(&fscache_n_checkaux_none),
+		   atomic_read(&fscache_n_checkaux_okay),
+		   atomic_read(&fscache_n_checkaux_update),
+		   atomic_read(&fscache_n_checkaux_obsolete));
+
+	seq_printf(m, "Pages  : mrk=%u unc=%u\n",
+		   atomic_read(&fscache_n_marks),
+		   atomic_read(&fscache_n_uncaches));
+
+	seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
+		   " oom=%u\n",
+		   atomic_read(&fscache_n_acquires),
+		   atomic_read(&fscache_n_acquires_null),
+		   atomic_read(&fscache_n_acquires_no_cache),
+		   atomic_read(&fscache_n_acquires_ok),
+		   atomic_read(&fscache_n_acquires_nobufs),
+		   atomic_read(&fscache_n_acquires_oom));
+
+	seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n",
+		   atomic_read(&fscache_n_object_lookups),
+		   atomic_read(&fscache_n_object_lookups_negative),
+		   atomic_read(&fscache_n_object_lookups_positive),
+		   atomic_read(&fscache_n_object_created),
+		   atomic_read(&fscache_n_object_lookups_timed_out));
+
+	seq_printf(m, "Invals : n=%u run=%u\n",
+		   atomic_read(&fscache_n_invalidates),
+		   atomic_read(&fscache_n_invalidates_run));
+
+	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
+		   atomic_read(&fscache_n_updates),
+		   atomic_read(&fscache_n_updates_null),
+		   atomic_read(&fscache_n_updates_run));
+
+	seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n",
+		   atomic_read(&fscache_n_relinquishes),
+		   atomic_read(&fscache_n_relinquishes_null),
+		   atomic_read(&fscache_n_relinquishes_waitcrt),
+		   atomic_read(&fscache_n_relinquishes_retire));
+
+	seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
+		   atomic_read(&fscache_n_attr_changed),
+		   atomic_read(&fscache_n_attr_changed_ok),
+		   atomic_read(&fscache_n_attr_changed_nobufs),
+		   atomic_read(&fscache_n_attr_changed_nomem),
+		   atomic_read(&fscache_n_attr_changed_calls));
+
+	seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n",
+		   atomic_read(&fscache_n_allocs),
+		   atomic_read(&fscache_n_allocs_ok),
+		   atomic_read(&fscache_n_allocs_wait),
+		   atomic_read(&fscache_n_allocs_nobufs),
+		   atomic_read(&fscache_n_allocs_intr));
+	seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n",
+		   atomic_read(&fscache_n_alloc_ops),
+		   atomic_read(&fscache_n_alloc_op_waits),
+		   atomic_read(&fscache_n_allocs_object_dead));
+
+	seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
+		   " int=%u oom=%u\n",
+		   atomic_read(&fscache_n_retrievals),
+		   atomic_read(&fscache_n_retrievals_ok),
+		   atomic_read(&fscache_n_retrievals_wait),
+		   atomic_read(&fscache_n_retrievals_nodata),
+		   atomic_read(&fscache_n_retrievals_nobufs),
+		   atomic_read(&fscache_n_retrievals_intr),
+		   atomic_read(&fscache_n_retrievals_nomem));
+	seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n",
+		   atomic_read(&fscache_n_retrieval_ops),
+		   atomic_read(&fscache_n_retrieval_op_waits),
+		   atomic_read(&fscache_n_retrievals_object_dead));
+
+	seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
+		   atomic_read(&fscache_n_stores),
+		   atomic_read(&fscache_n_stores_ok),
+		   atomic_read(&fscache_n_stores_again),
+		   atomic_read(&fscache_n_stores_nobufs),
+		   atomic_read(&fscache_n_stores_oom));
+	seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n",
+		   atomic_read(&fscache_n_store_ops),
+		   atomic_read(&fscache_n_store_calls),
+		   atomic_read(&fscache_n_store_pages),
+		   atomic_read(&fscache_n_store_radix_deletes),
+		   atomic_read(&fscache_n_store_pages_over_limit));
+
+	seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n",
+		   atomic_read(&fscache_n_store_vmscan_not_storing),
+		   atomic_read(&fscache_n_store_vmscan_gone),
+		   atomic_read(&fscache_n_store_vmscan_busy),
+		   atomic_read(&fscache_n_store_vmscan_cancelled),
+		   atomic_read(&fscache_n_store_vmscan_wait));
+
+	seq_printf(m, "Ops    : pend=%u run=%u enq=%u can=%u rej=%u\n",
+		   atomic_read(&fscache_n_op_pend),
+		   atomic_read(&fscache_n_op_run),
+		   atomic_read(&fscache_n_op_enqueue),
+		   atomic_read(&fscache_n_op_cancelled),
+		   atomic_read(&fscache_n_op_rejected));
+	seq_printf(m, "Ops    : dfr=%u rel=%u gc=%u\n",
+		   atomic_read(&fscache_n_op_deferred_release),
+		   atomic_read(&fscache_n_op_release),
+		   atomic_read(&fscache_n_op_gc));
+
+	seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n",
+		   atomic_read(&fscache_n_cop_alloc_object),
+		   atomic_read(&fscache_n_cop_lookup_object),
+		   atomic_read(&fscache_n_cop_lookup_complete),
+		   atomic_read(&fscache_n_cop_grab_object));
+	seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+		   atomic_read(&fscache_n_cop_invalidate_object),
+		   atomic_read(&fscache_n_cop_update_object),
+		   atomic_read(&fscache_n_cop_drop_object),
+		   atomic_read(&fscache_n_cop_put_object),
+		   atomic_read(&fscache_n_cop_attr_changed),
+		   atomic_read(&fscache_n_cop_sync_cache));
+	seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n",
+		   atomic_read(&fscache_n_cop_read_or_alloc_page),
+		   atomic_read(&fscache_n_cop_read_or_alloc_pages),
+		   atomic_read(&fscache_n_cop_allocate_page),
+		   atomic_read(&fscache_n_cop_allocate_pages),
+		   atomic_read(&fscache_n_cop_write_page),
+		   atomic_read(&fscache_n_cop_uncache_page),
+		   atomic_read(&fscache_n_cop_dissociate_pages));
+	return 0;
+}
+
+/*
+ * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
+ */
+static int fscache_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, fscache_stats_show, NULL);
+}
+
+const struct file_operations fscache_stats_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fscache_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release        = single_release,
+};
diff --git a/kernel/fs/fuse/Kconfig b/kernel/fs/fuse/Kconfig
new file mode 100644
index 000000000..1b2f6c2c3
--- /dev/null
+++ b/kernel/fs/fuse/Kconfig
@@ -0,0 +1,27 @@
+config FUSE_FS
+	tristate "FUSE (Filesystem in Userspace) support"
+	help
+	  With FUSE it is possible to implement a fully functional filesystem
+	  in a userspace program.
+
+	  There's also a companion library: libfuse2.  This library is available
+	  from the FUSE homepage:
+	  <http://fuse.sourceforge.net/>
+	  although chances are your distribution already has that library
+	  installed if you've installed the "fuse" package itself.
+
+	  See <file:Documentation/filesystems/fuse.txt> for more information.
+	  See <file:Documentation/Changes> for needed library/utility version.
+
+	  If you want to develop a userspace FS, or if you want to use
+	  a filesystem based on FUSE, answer Y or M.
+
+config CUSE
+	tristate "Character device in Userspace support"
+	depends on FUSE_FS
+	help
+	  This FUSE extension allows character devices to be
+	  implemented in userspace.
+
+	  If you want to develop or use a userspace character device
+	  based on CUSE, answer Y or M.
diff --git a/kernel/fs/fuse/Makefile b/kernel/fs/fuse/Makefile
new file mode 100644
index 000000000..e95eeb445
--- /dev/null
+++ b/kernel/fs/fuse/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the FUSE filesystem.
+#
+
+obj-$(CONFIG_FUSE_FS) += fuse.o
+obj-$(CONFIG_CUSE) += cuse.o
+
+fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/kernel/fs/fuse/control.c b/kernel/fs/fuse/control.c
new file mode 100644
index 000000000..f863ac664
--- /dev/null
+++ b/kernel/fs/fuse/control.c
@@ -0,0 +1,354 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#define FUSE_CTL_SUPER_MAGIC 0x65735543
+
+/*
+ * This is non-NULL when the single instance of the control filesystem
+ * exists.  Protected by fuse_mutex
+ */
+static struct super_block *fuse_control_sb;
+
+static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
+{
+	struct fuse_conn *fc;
+	mutex_lock(&fuse_mutex);
+	fc = file_inode(file)->i_private;
+	if (fc)
+		fc = fuse_conn_get(fc);
+	mutex_unlock(&fuse_mutex);
+	return fc;
+}
+
+static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+	if (fc) {
+		fuse_abort_conn(fc);
+		fuse_conn_put(fc);
+	}
+	return count;
+}
+
+static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
+				      size_t len, loff_t *ppos)
+{
+	char tmp[32];
+	size_t size;
+
+	if (!*ppos) {
+		long value;
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (!fc)
+			return 0;
+
+		value = atomic_read(&fc->num_waiting);
+		file->private_data = (void *)value;
+		fuse_conn_put(fc);
+	}
+	size = sprintf(tmp, "%ld\n", (long)file->private_data);
+	return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+				    size_t len, loff_t *ppos, unsigned val)
+{
+	char tmp[32];
+	size_t size = sprintf(tmp, "%u\n", val);
+
+	return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+				     size_t count, loff_t *ppos, unsigned *val,
+				     unsigned global_limit)
+{
+	unsigned long t;
+	unsigned limit = (1 << 16) - 1;
+	int err;
+
+	if (*ppos)
+		return -EINVAL;
+
+	err = kstrtoul_from_user(buf, count, 0, &t);
+	if (err)
+		return err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		limit = min(limit, global_limit);
+
+	if (t > limit)
+		return -EINVAL;
+
+	*val = t;
+
+	return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+					     char __user *buf, size_t len,
+					     loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = fc->max_background;
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+					      const char __user *buf,
+					      size_t count, loff_t *ppos)
+{
+	unsigned uninitialized_var(val);
+	ssize_t ret;
+
+	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+				    max_user_bgreq);
+	if (ret > 0) {
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (fc) {
+			fc->max_background = val;
+			fuse_conn_put(fc);
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+						   char __user *buf, size_t len,
+						   loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = fc->congestion_threshold;
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+						    const char __user *buf,
+						    size_t count, loff_t *ppos)
+{
+	unsigned uninitialized_var(val);
+	ssize_t ret;
+
+	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+				    max_user_congthresh);
+	if (ret > 0) {
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (fc) {
+			fc->congestion_threshold = val;
+			fuse_conn_put(fc);
+		}
+	}
+
+	return ret;
+}
+
+static const struct file_operations fuse_ctl_abort_ops = {
+	.open = nonseekable_open,
+	.write = fuse_conn_abort_write,
+	.llseek = no_llseek,
+};
+
+static const struct file_operations fuse_ctl_waiting_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_waiting_read,
+	.llseek = no_llseek,
+};
+
+static const struct file_operations fuse_conn_max_background_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_max_background_read,
+	.write = fuse_conn_max_background_write,
+	.llseek = no_llseek,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_congestion_threshold_read,
+	.write = fuse_conn_congestion_threshold_write,
+	.llseek = no_llseek,
+};
+
+static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
+					  struct fuse_conn *fc,
+					  const char *name,
+					  int mode, int nlink,
+					  const struct inode_operations *iop,
+					  const struct file_operations *fop)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
+	dentry = d_alloc_name(parent, name);
+	if (!dentry)
+		return NULL;
+
+	fc->ctl_dentry[fc->ctl_ndents++] = dentry;
+	inode = new_inode(fuse_control_sb);
+	if (!inode)
+		return NULL;
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_uid = fc->user_id;
+	inode->i_gid = fc->group_id;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	/* setting ->i_op to NULL is not allowed */
+	if (iop)
+		inode->i_op = iop;
+	inode->i_fop = fop;
+	set_nlink(inode, nlink);
+	inode->i_private = fc;
+	d_add(dentry, inode);
+	return dentry;
+}
+
+/*
+ * Add a connection to the control filesystem (if it exists).  Caller
+ * must hold fuse_mutex
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc)
+{
+	struct dentry *parent;
+	char name[32];
+
+	if (!fuse_control_sb)
+		return 0;
+
+	parent = fuse_control_sb->s_root;
+	inc_nlink(d_inode(parent));
+	sprintf(name, "%u", fc->dev);
+	parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
+				     &simple_dir_inode_operations,
+				     &simple_dir_operations);
+	if (!parent)
+		goto err;
+
+	if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
+				 NULL, &fuse_ctl_waiting_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
+				 NULL, &fuse_ctl_abort_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+				 1, NULL, &fuse_conn_max_background_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_congestion_threshold_ops))
+		goto err;
+
+	return 0;
+
+ err:
+	fuse_ctl_remove_conn(fc);
+	return -ENOMEM;
+}
+
+/*
+ * Remove a connection from the control filesystem (if it exists).
+ * Caller must hold fuse_mutex
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc)
+{
+	int i;
+
+	if (!fuse_control_sb)
+		return;
+
+	for (i = fc->ctl_ndents - 1; i >= 0; i--) {
+		struct dentry *dentry = fc->ctl_dentry[i];
+		d_inode(dentry)->i_private = NULL;
+		d_drop(dentry);
+		dput(dentry);
+	}
+	drop_nlink(d_inode(fuse_control_sb->s_root));
+}
+
+static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct tree_descr empty_descr = {""};
+	struct fuse_conn *fc;
+	int err;
+
+	err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr);
+	if (err)
+		return err;
+
+	mutex_lock(&fuse_mutex);
+	BUG_ON(fuse_control_sb);
+	fuse_control_sb = sb;
+	list_for_each_entry(fc, &fuse_conn_list, entry) {
+		err = fuse_ctl_add_conn(fc);
+		if (err) {
+			fuse_control_sb = NULL;
+			mutex_unlock(&fuse_mutex);
+			return err;
+		}
+	}
+	mutex_unlock(&fuse_mutex);
+
+	return 0;
+}
+
+static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *raw_data)
+{
+	return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
+}
+
+static void fuse_ctl_kill_sb(struct super_block *sb)
+{
+	struct fuse_conn *fc;
+
+	mutex_lock(&fuse_mutex);
+	fuse_control_sb = NULL;
+	list_for_each_entry(fc, &fuse_conn_list, entry)
+		fc->ctl_ndents = 0;
+	mutex_unlock(&fuse_mutex);
+
+	kill_litter_super(sb);
+}
+
+static struct file_system_type fuse_ctl_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fusectl",
+	.mount		= fuse_ctl_mount,
+	.kill_sb	= fuse_ctl_kill_sb,
+};
+MODULE_ALIAS_FS("fusectl");
+
+int __init fuse_ctl_init(void)
+{
+	return register_filesystem(&fuse_ctl_fs_type);
+}
+
+void __exit fuse_ctl_cleanup(void)
+{
+	unregister_filesystem(&fuse_ctl_fs_type);
+}
diff --git a/kernel/fs/fuse/cuse.c b/kernel/fs/fuse/cuse.c
new file mode 100644
index 000000000..e5bbf748b
--- /dev/null
+++ b/kernel/fs/fuse/cuse.c
@@ -0,0 +1,636 @@
+/*
+ * CUSE: Character device in Userspace
+ *
+ * Copyright (C) 2008-2009  SUSE Linux Products GmbH
+ * Copyright (C) 2008-2009  Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * CUSE enables character devices to be implemented from userland much
+ * like FUSE allows filesystems.  On initialization /dev/cuse is
+ * created.  By opening the file and replying to the CUSE_INIT request
+ * userland CUSE server can create a character device.  After that the
+ * operation is very similar to FUSE.
+ *
+ * A CUSE instance involves the following objects.
+ *
+ * cuse_conn	: contains fuse_conn and serves as bonding structure
+ * channel	: file handle connected to the userland CUSE server
+ * cdev		: the implemented character device
+ * dev		: generic device for cdev
+ *
+ * Note that 'channel' is what 'dev' is in FUSE.  As CUSE deals with
+ * devices, it's called 'channel' to reduce confusion.
+ *
+ * channel determines when the character device dies.  When channel is
+ * closed, everything begins to destruct.  The cuse_conn is taken off
+ * the lookup table preventing further access from cdev, cdev and
+ * generic device are removed and the base reference of cuse_conn is
+ * put.
+ *
+ * On each open, the matching cuse_conn is looked up and if found an
+ * additional reference is taken which is released when the file is
+ * closed.
+ */
+
+#include <linux/fuse.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/module.h>
+#include <linux/uio.h>
+
+#include "fuse_i.h"
+
+#define CUSE_CONNTBL_LEN	64
+
+struct cuse_conn {
+	struct list_head	list;	/* linked on cuse_conntbl */
+	struct fuse_conn	fc;	/* fuse connection */
+	struct cdev		*cdev;	/* associated character device */
+	struct device		*dev;	/* device representing @cdev */
+
+	/* init parameters, set once during initialization */
+	bool			unrestricted_ioctl;
+};
+
+static DEFINE_MUTEX(cuse_lock);		/* protects registration */
+static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
+static struct class *cuse_class;
+
+static struct cuse_conn *fc_to_cc(struct fuse_conn *fc)
+{
+	return container_of(fc, struct cuse_conn, fc);
+}
+
+static struct list_head *cuse_conntbl_head(dev_t devt)
+{
+	return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN];
+}
+
+
+/**************************************************************************
+ * CUSE frontend operations
+ *
+ * These are file operations for the character device.
+ *
+ * On open, CUSE opens a file from the FUSE mnt and stores it to
+ * private_data of the open file.  All other ops call FUSE ops on the
+ * FUSE file.
+ */
+
+static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
+{
+	struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+	loff_t pos = 0;
+
+	return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
+}
+
+static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+	struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+	loff_t pos = 0;
+	/*
+	 * No locking or generic_write_checks(), the server is
+	 * responsible for locking and sanity checks.
+	 */
+	return fuse_direct_io(&io, from, &pos,
+			      FUSE_DIO_WRITE | FUSE_DIO_CUSE);
+}
+
+static int cuse_open(struct inode *inode, struct file *file)
+{
+	dev_t devt = inode->i_cdev->dev;
+	struct cuse_conn *cc = NULL, *pos;
+	int rc;
+
+	/* look up and get the connection */
+	mutex_lock(&cuse_lock);
+	list_for_each_entry(pos, cuse_conntbl_head(devt), list)
+		if (pos->dev->devt == devt) {
+			fuse_conn_get(&pos->fc);
+			cc = pos;
+			break;
+		}
+	mutex_unlock(&cuse_lock);
+
+	/* dead? */
+	if (!cc)
+		return -ENODEV;
+
+	/*
+	 * Generic permission check is already done against the chrdev
+	 * file, proceed to open.
+	 */
+	rc = fuse_do_open(&cc->fc, 0, file, 0);
+	if (rc)
+		fuse_conn_put(&cc->fc);
+	return rc;
+}
+
+static int cuse_release(struct inode *inode, struct file *file)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+
+	fuse_sync_release(ff, file->f_flags);
+	fuse_conn_put(fc);
+
+	return 0;
+}
+
+static long cuse_file_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	struct fuse_file *ff = file->private_data;
+	struct cuse_conn *cc = fc_to_cc(ff->fc);
+	unsigned int flags = 0;
+
+	if (cc->unrestricted_ioctl)
+		flags |= FUSE_IOCTL_UNRESTRICTED;
+
+	return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	struct fuse_file *ff = file->private_data;
+	struct cuse_conn *cc = fc_to_cc(ff->fc);
+	unsigned int flags = FUSE_IOCTL_COMPAT;
+
+	if (cc->unrestricted_ioctl)
+		flags |= FUSE_IOCTL_UNRESTRICTED;
+
+	return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static const struct file_operations cuse_frontend_fops = {
+	.owner			= THIS_MODULE,
+	.read_iter		= cuse_read_iter,
+	.write_iter		= cuse_write_iter,
+	.open			= cuse_open,
+	.release		= cuse_release,
+	.unlocked_ioctl		= cuse_file_ioctl,
+	.compat_ioctl		= cuse_file_compat_ioctl,
+	.poll			= fuse_file_poll,
+	.llseek		= noop_llseek,
+};
+
+
+/**************************************************************************
+ * CUSE channel initialization and destruction
+ */
+
+struct cuse_devinfo {
+	const char		*name;
+};
+
+/**
+ * cuse_parse_one - parse one key=value pair
+ * @pp: i/o parameter for the current position
+ * @end: points to one past the end of the packed string
+ * @keyp: out parameter for key
+ * @valp: out parameter for value
+ *
+ * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends
+ * at @end - 1.  This function parses one pair and set *@keyp to the
+ * start of the key and *@valp to the start of the value.  Note that
+ * the original string is modified such that the key string is
+ * terminated with '\0'.  *@pp is updated to point to the next string.
+ *
+ * RETURNS:
+ * 1 on successful parse, 0 on EOF, -errno on failure.
+ */
+static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
+{
+	char *p = *pp;
+	char *key, *val;
+
+	while (p < end && *p == '\0')
+		p++;
+	if (p == end)
+		return 0;
+
+	if (end[-1] != '\0') {
+		printk(KERN_ERR "CUSE: info not properly terminated\n");
+		return -EINVAL;
+	}
+
+	key = val = p;
+	p += strlen(p);
+
+	if (valp) {
+		strsep(&val, "=");
+		if (!val)
+			val = key + strlen(key);
+		key = strstrip(key);
+		val = strstrip(val);
+	} else
+		key = strstrip(key);
+
+	if (!strlen(key)) {
+		printk(KERN_ERR "CUSE: zero length info key specified\n");
+		return -EINVAL;
+	}
+
+	*pp = p;
+	*keyp = key;
+	if (valp)
+		*valp = val;
+
+	return 1;
+}
+
+/**
+ * cuse_parse_dev_info - parse device info
+ * @p: device info string
+ * @len: length of device info string
+ * @devinfo: out parameter for parsed device info
+ *
+ * Parse @p to extract device info and store it into @devinfo.  String
+ * pointed to by @p is modified by parsing and @devinfo points into
+ * them, so @p shouldn't be freed while @devinfo is in use.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
+{
+	char *end = p + len;
+	char *uninitialized_var(key), *uninitialized_var(val);
+	int rc;
+
+	while (true) {
+		rc = cuse_parse_one(&p, end, &key, &val);
+		if (rc < 0)
+			return rc;
+		if (!rc)
+			break;
+		if (strcmp(key, "DEVNAME") == 0)
+			devinfo->name = val;
+		else
+			printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
+			       key);
+	}
+
+	if (!devinfo->name || !strlen(devinfo->name)) {
+		printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void cuse_gendev_release(struct device *dev)
+{
+	kfree(dev);
+}
+
+/**
+ * cuse_process_init_reply - finish initializing CUSE channel
+ *
+ * This function creates the character device and sets up all the
+ * required data structures for it.  Please read the comment at the
+ * top of this file for high level overview.
+ */
+static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct cuse_conn *cc = fc_to_cc(fc), *pos;
+	struct cuse_init_out *arg = req->out.args[0].value;
+	struct page *page = req->pages[0];
+	struct cuse_devinfo devinfo = { };
+	struct device *dev;
+	struct cdev *cdev;
+	dev_t devt;
+	int rc, i;
+
+	if (req->out.h.error ||
+	    arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
+		goto err;
+	}
+
+	fc->minor = arg->minor;
+	fc->max_read = max_t(unsigned, arg->max_read, 4096);
+	fc->max_write = max_t(unsigned, arg->max_write, 4096);
+
+	/* parse init reply */
+	cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
+
+	rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
+				&devinfo);
+	if (rc)
+		goto err;
+
+	/* determine and reserve devt */
+	devt = MKDEV(arg->dev_major, arg->dev_minor);
+	if (!MAJOR(devt))
+		rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
+	else
+		rc = register_chrdev_region(devt, 1, devinfo.name);
+	if (rc) {
+		printk(KERN_ERR "CUSE: failed to register chrdev region\n");
+		goto err;
+	}
+
+	/* devt determined, create device */
+	rc = -ENOMEM;
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		goto err_region;
+
+	device_initialize(dev);
+	dev_set_uevent_suppress(dev, 1);
+	dev->class = cuse_class;
+	dev->devt = devt;
+	dev->release = cuse_gendev_release;
+	dev_set_drvdata(dev, cc);
+	dev_set_name(dev, "%s", devinfo.name);
+
+	mutex_lock(&cuse_lock);
+
+	/* make sure the device-name is unique */
+	for (i = 0; i < CUSE_CONNTBL_LEN; ++i) {
+		list_for_each_entry(pos, &cuse_conntbl[i], list)
+			if (!strcmp(dev_name(pos->dev), dev_name(dev)))
+				goto err_unlock;
+	}
+
+	rc = device_add(dev);
+	if (rc)
+		goto err_unlock;
+
+	/* register cdev */
+	rc = -ENOMEM;
+	cdev = cdev_alloc();
+	if (!cdev)
+		goto err_unlock;
+
+	cdev->owner = THIS_MODULE;
+	cdev->ops = &cuse_frontend_fops;
+
+	rc = cdev_add(cdev, devt, 1);
+	if (rc)
+		goto err_cdev;
+
+	cc->dev = dev;
+	cc->cdev = cdev;
+
+	/* make the device available */
+	list_add(&cc->list, cuse_conntbl_head(devt));
+	mutex_unlock(&cuse_lock);
+
+	/* announce device availability */
+	dev_set_uevent_suppress(dev, 0);
+	kobject_uevent(&dev->kobj, KOBJ_ADD);
+out:
+	kfree(arg);
+	__free_page(page);
+	return;
+
+err_cdev:
+	cdev_del(cdev);
+err_unlock:
+	mutex_unlock(&cuse_lock);
+	put_device(dev);
+err_region:
+	unregister_chrdev_region(devt, 1);
+err:
+	fuse_abort_conn(fc);
+	goto out;
+}
+
+static int cuse_send_init(struct cuse_conn *cc)
+{
+	int rc;
+	struct fuse_req *req;
+	struct page *page;
+	struct fuse_conn *fc = &cc->fc;
+	struct cuse_init_in *arg;
+	void *outarg;
+
+	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
+
+	req = fuse_get_req_for_background(fc, 1);
+	if (IS_ERR(req)) {
+		rc = PTR_ERR(req);
+		goto err;
+	}
+
+	rc = -ENOMEM;
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		goto err_put_req;
+
+	outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
+	if (!outarg)
+		goto err_free_page;
+
+	arg = &req->misc.cuse_init_in;
+	arg->major = FUSE_KERNEL_VERSION;
+	arg->minor = FUSE_KERNEL_MINOR_VERSION;
+	arg->flags |= CUSE_UNRESTRICTED_IOCTL;
+	req->in.h.opcode = CUSE_INIT;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct cuse_init_in);
+	req->in.args[0].value = arg;
+	req->out.numargs = 2;
+	req->out.args[0].size = sizeof(struct cuse_init_out);
+	req->out.args[0].value = outarg;
+	req->out.args[1].size = CUSE_INIT_INFO_MAX;
+	req->out.argvar = 1;
+	req->out.argpages = 1;
+	req->pages[0] = page;
+	req->page_descs[0].length = req->out.args[1].size;
+	req->num_pages = 1;
+	req->end = cuse_process_init_reply;
+	fuse_request_send_background(fc, req);
+
+	return 0;
+
+err_free_page:
+	__free_page(page);
+err_put_req:
+	fuse_put_request(fc, req);
+err:
+	return rc;
+}
+
+static void cuse_fc_release(struct fuse_conn *fc)
+{
+	struct cuse_conn *cc = fc_to_cc(fc);
+	kfree_rcu(cc, fc.rcu);
+}
+
+/**
+ * cuse_channel_open - open method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being opened
+ *
+ * Userland CUSE server can create a CUSE device by opening /dev/cuse
+ * and replying to the initialization request kernel sends.  This
+ * function is responsible for handling CUSE device initialization.
+ * Because the fd opened by this function is used during
+ * initialization, this function only creates cuse_conn and sends
+ * init.  The rest is delegated to a kthread.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_open(struct inode *inode, struct file *file)
+{
+	struct cuse_conn *cc;
+	int rc;
+
+	/* set up cuse_conn */
+	cc = kzalloc(sizeof(*cc), GFP_KERNEL);
+	if (!cc)
+		return -ENOMEM;
+
+	fuse_conn_init(&cc->fc);
+
+	INIT_LIST_HEAD(&cc->list);
+	cc->fc.release = cuse_fc_release;
+
+	cc->fc.connected = 1;
+	cc->fc.initialized = 1;
+	rc = cuse_send_init(cc);
+	if (rc) {
+		fuse_conn_put(&cc->fc);
+		return rc;
+	}
+	file->private_data = &cc->fc;	/* channel owns base reference to cc */
+
+	return 0;
+}
+
+/**
+ * cuse_channel_release - release method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being closed
+ *
+ * Disconnect the channel, deregister CUSE device and initiate
+ * destruction by putting the default reference.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_release(struct inode *inode, struct file *file)
+{
+	struct cuse_conn *cc = fc_to_cc(file->private_data);
+	int rc;
+
+	/* remove from the conntbl, no more access from this point on */
+	mutex_lock(&cuse_lock);
+	list_del_init(&cc->list);
+	mutex_unlock(&cuse_lock);
+
+	/* remove device */
+	if (cc->dev)
+		device_unregister(cc->dev);
+	if (cc->cdev) {
+		unregister_chrdev_region(cc->cdev->dev, 1);
+		cdev_del(cc->cdev);
+	}
+
+	rc = fuse_dev_release(inode, file);	/* puts the base reference */
+
+	return rc;
+}
+
+static struct file_operations cuse_channel_fops; /* initialized during init */
+
+
+/**************************************************************************
+ * Misc stuff and module initializatiion
+ *
+ * CUSE exports the same set of attributes to sysfs as fusectl.
+ */
+
+static ssize_t cuse_class_waiting_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct cuse_conn *cc = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+}
+static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);
+
+static ssize_t cuse_class_abort_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct cuse_conn *cc = dev_get_drvdata(dev);
+
+	fuse_abort_conn(&cc->fc);
+	return count;
+}
+static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
+
+static struct attribute *cuse_class_dev_attrs[] = {
+	&dev_attr_waiting.attr,
+	&dev_attr_abort.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(cuse_class_dev);
+
+static struct miscdevice cuse_miscdev = {
+	.minor		= CUSE_MINOR,
+	.name		= "cuse",
+	.fops		= &cuse_channel_fops,
+};
+
+MODULE_ALIAS_MISCDEV(CUSE_MINOR);
+MODULE_ALIAS("devname:cuse");
+
+static int __init cuse_init(void)
+{
+	int i, rc;
+
+	/* init conntbl */
+	for (i = 0; i < CUSE_CONNTBL_LEN; i++)
+		INIT_LIST_HEAD(&cuse_conntbl[i]);
+
+	/* inherit and extend fuse_dev_operations */
+	cuse_channel_fops		= fuse_dev_operations;
+	cuse_channel_fops.owner		= THIS_MODULE;
+	cuse_channel_fops.open		= cuse_channel_open;
+	cuse_channel_fops.release	= cuse_channel_release;
+
+	cuse_class = class_create(THIS_MODULE, "cuse");
+	if (IS_ERR(cuse_class))
+		return PTR_ERR(cuse_class);
+
+	cuse_class->dev_groups = cuse_class_dev_groups;
+
+	rc = misc_register(&cuse_miscdev);
+	if (rc) {
+		class_destroy(cuse_class);
+		return rc;
+	}
+
+	return 0;
+}
+
+static void __exit cuse_exit(void)
+{
+	misc_deregister(&cuse_miscdev);
+	class_destroy(cuse_class);
+}
+
+module_init(cuse_init);
+module_exit(cuse_exit);
+
+MODULE_AUTHOR("Tejun Heo <tj@kernel.org>");
+MODULE_DESCRIPTION("Character device in Userspace");
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/fuse/dev.c b/kernel/fs/fuse/dev.c
new file mode 100644
index 000000000..c8b68ab2e
--- /dev/null
+++ b/kernel/fs/fuse/dev.c
@@ -0,0 +1,2273 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/uio.h>
+#include <linux/miscdevice.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/swap.h>
+#include <linux/splice.h>
+
+MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+MODULE_ALIAS("devname:fuse");
+
+static struct kmem_cache *fuse_req_cachep;
+
+static struct fuse_conn *fuse_get_conn(struct file *file)
+{
+	/*
+	 * Lockless access is OK, because file->private data is set
+	 * once during mount and is valid until the file is released.
+	 */
+	return file->private_data;
+}
+
+static void fuse_request_init(struct fuse_req *req, struct page **pages,
+			      struct fuse_page_desc *page_descs,
+			      unsigned npages)
+{
+	memset(req, 0, sizeof(*req));
+	memset(pages, 0, sizeof(*pages) * npages);
+	memset(page_descs, 0, sizeof(*page_descs) * npages);
+	INIT_LIST_HEAD(&req->list);
+	INIT_LIST_HEAD(&req->intr_entry);
+	init_waitqueue_head(&req->waitq);
+	atomic_set(&req->count, 1);
+	req->pages = pages;
+	req->page_descs = page_descs;
+	req->max_pages = npages;
+}
+
+static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
+{
+	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
+	if (req) {
+		struct page **pages;
+		struct fuse_page_desc *page_descs;
+
+		if (npages <= FUSE_REQ_INLINE_PAGES) {
+			pages = req->inline_pages;
+			page_descs = req->inline_page_descs;
+		} else {
+			pages = kmalloc(sizeof(struct page *) * npages, flags);
+			page_descs = kmalloc(sizeof(struct fuse_page_desc) *
+					     npages, flags);
+		}
+
+		if (!pages || !page_descs) {
+			kfree(pages);
+			kfree(page_descs);
+			kmem_cache_free(fuse_req_cachep, req);
+			return NULL;
+		}
+
+		fuse_request_init(req, pages, page_descs, npages);
+	}
+	return req;
+}
+
+struct fuse_req *fuse_request_alloc(unsigned npages)
+{
+	return __fuse_request_alloc(npages, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(fuse_request_alloc);
+
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
+{
+	return __fuse_request_alloc(npages, GFP_NOFS);
+}
+
+void fuse_request_free(struct fuse_req *req)
+{
+	if (req->pages != req->inline_pages) {
+		kfree(req->pages);
+		kfree(req->page_descs);
+	}
+	kmem_cache_free(fuse_req_cachep, req);
+}
+
+static void block_sigs(sigset_t *oldset)
+{
+	sigset_t mask;
+
+	siginitsetinv(&mask, sigmask(SIGKILL));
+	sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void restore_sigs(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
+void __fuse_get_request(struct fuse_req *req)
+{
+	atomic_inc(&req->count);
+}
+
+/* Must be called with > 1 refcount */
+static void __fuse_put_request(struct fuse_req *req)
+{
+	BUG_ON(atomic_read(&req->count) < 2);
+	atomic_dec(&req->count);
+}
+
+static void fuse_req_init_context(struct fuse_req *req)
+{
+	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
+	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
+	req->in.h.pid = current->pid;
+}
+
+void fuse_set_initialized(struct fuse_conn *fc)
+{
+	/* Make sure stores before this are seen on another CPU */
+	smp_wmb();
+	fc->initialized = 1;
+}
+
+static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
+{
+	return !fc->initialized || (for_background && fc->blocked);
+}
+
+static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
+				       bool for_background)
+{
+	struct fuse_req *req;
+	int err;
+	atomic_inc(&fc->num_waiting);
+
+	if (fuse_block_alloc(fc, for_background)) {
+		sigset_t oldset;
+		int intr;
+
+		block_sigs(&oldset);
+		intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
+				!fuse_block_alloc(fc, for_background));
+		restore_sigs(&oldset);
+		err = -EINTR;
+		if (intr)
+			goto out;
+	}
+	/* Matches smp_wmb() in fuse_set_initialized() */
+	smp_rmb();
+
+	err = -ENOTCONN;
+	if (!fc->connected)
+		goto out;
+
+	req = fuse_request_alloc(npages);
+	err = -ENOMEM;
+	if (!req) {
+		if (for_background)
+			wake_up(&fc->blocked_waitq);
+		goto out;
+	}
+
+	fuse_req_init_context(req);
+	req->waiting = 1;
+	req->background = for_background;
+	return req;
+
+ out:
+	atomic_dec(&fc->num_waiting);
+	return ERR_PTR(err);
+}
+
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
+{
+	return __fuse_get_req(fc, npages, false);
+}
+EXPORT_SYMBOL_GPL(fuse_get_req);
+
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+					     unsigned npages)
+{
+	return __fuse_get_req(fc, npages, true);
+}
+EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
+
+/*
+ * Return request in fuse_file->reserved_req.  However that may
+ * currently be in use.  If that is the case, wait for it to become
+ * available.
+ */
+static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
+					 struct file *file)
+{
+	struct fuse_req *req = NULL;
+	struct fuse_file *ff = file->private_data;
+
+	do {
+		wait_event(fc->reserved_req_waitq, ff->reserved_req);
+		spin_lock(&fc->lock);
+		if (ff->reserved_req) {
+			req = ff->reserved_req;
+			ff->reserved_req = NULL;
+			req->stolen_file = get_file(file);
+		}
+		spin_unlock(&fc->lock);
+	} while (!req);
+
+	return req;
+}
+
+/*
+ * Put stolen request back into fuse_file->reserved_req
+ */
+static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct file *file = req->stolen_file;
+	struct fuse_file *ff = file->private_data;
+
+	spin_lock(&fc->lock);
+	fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
+	BUG_ON(ff->reserved_req);
+	ff->reserved_req = req;
+	wake_up_all(&fc->reserved_req_waitq);
+	spin_unlock(&fc->lock);
+	fput(file);
+}
+
+/*
+ * Gets a requests for a file operation, always succeeds
+ *
+ * This is used for sending the FLUSH request, which must get to
+ * userspace, due to POSIX locks which may need to be unlocked.
+ *
+ * If allocation fails due to OOM, use the reserved request in
+ * fuse_file.
+ *
+ * This is very unlikely to deadlock accidentally, since the
+ * filesystem should not have it's own file open.  If deadlock is
+ * intentional, it can still be broken by "aborting" the filesystem.
+ */
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+					     struct file *file)
+{
+	struct fuse_req *req;
+
+	atomic_inc(&fc->num_waiting);
+	wait_event(fc->blocked_waitq, fc->initialized);
+	/* Matches smp_wmb() in fuse_set_initialized() */
+	smp_rmb();
+	req = fuse_request_alloc(0);
+	if (!req)
+		req = get_reserved_req(fc, file);
+
+	fuse_req_init_context(req);
+	req->waiting = 1;
+	req->background = 0;
+	return req;
+}
+
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (atomic_dec_and_test(&req->count)) {
+		if (unlikely(req->background)) {
+			/*
+			 * We get here in the unlikely case that a background
+			 * request was allocated but not sent
+			 */
+			spin_lock(&fc->lock);
+			if (!fc->blocked)
+				wake_up(&fc->blocked_waitq);
+			spin_unlock(&fc->lock);
+		}
+
+		if (req->waiting)
+			atomic_dec(&fc->num_waiting);
+
+		if (req->stolen_file)
+			put_reserved_req(fc, req);
+		else
+			fuse_request_free(req);
+	}
+}
+EXPORT_SYMBOL_GPL(fuse_put_request);
+
+static unsigned len_args(unsigned numargs, struct fuse_arg *args)
+{
+	unsigned nbytes = 0;
+	unsigned i;
+
+	for (i = 0; i < numargs; i++)
+		nbytes += args[i].size;
+
+	return nbytes;
+}
+
+static u64 fuse_get_unique(struct fuse_conn *fc)
+{
+	fc->reqctr++;
+	/* zero is special */
+	if (fc->reqctr == 0)
+		fc->reqctr = 1;
+
+	return fc->reqctr;
+}
+
+static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->in.h.len = sizeof(struct fuse_in_header) +
+		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
+	list_add_tail(&req->list, &fc->pending);
+	req->state = FUSE_REQ_PENDING;
+	if (!req->waiting) {
+		req->waiting = 1;
+		atomic_inc(&fc->num_waiting);
+	}
+	wake_up(&fc->waitq);
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+}
+
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
+		       u64 nodeid, u64 nlookup)
+{
+	forget->forget_one.nodeid = nodeid;
+	forget->forget_one.nlookup = nlookup;
+
+	spin_lock(&fc->lock);
+	if (fc->connected) {
+		fc->forget_list_tail->next = forget;
+		fc->forget_list_tail = forget;
+		wake_up(&fc->waitq);
+		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+	} else {
+		kfree(forget);
+	}
+	spin_unlock(&fc->lock);
+}
+
+static void flush_bg_queue(struct fuse_conn *fc)
+{
+	while (fc->active_background < fc->max_background &&
+	       !list_empty(&fc->bg_queue)) {
+		struct fuse_req *req;
+
+		req = list_entry(fc->bg_queue.next, struct fuse_req, list);
+		list_del(&req->list);
+		fc->active_background++;
+		req->in.h.unique = fuse_get_unique(fc);
+		queue_request(fc, req);
+	}
+}
+
+/*
+ * This function is called when a request is finished.  Either a reply
+ * has arrived or it was aborted (and not yet sent) or some error
+ * occurred during communication with userspace, or the device file
+ * was closed.  The requester thread is woken up (if still waiting),
+ * the 'end' callback is called if given, else the reference to the
+ * request is released
+ *
+ * Called with fc->lock, unlocks it
+ */
+static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+__releases(fc->lock)
+{
+	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
+	req->end = NULL;
+	list_del(&req->list);
+	list_del(&req->intr_entry);
+	req->state = FUSE_REQ_FINISHED;
+	if (req->background) {
+		req->background = 0;
+
+		if (fc->num_background == fc->max_background)
+			fc->blocked = 0;
+
+		/* Wake up next waiter, if any */
+		if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
+			wake_up(&fc->blocked_waitq);
+
+		if (fc->num_background == fc->congestion_threshold &&
+		    fc->connected && fc->bdi_initialized) {
+			clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+			clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+		}
+		fc->num_background--;
+		fc->active_background--;
+		flush_bg_queue(fc);
+	}
+	spin_unlock(&fc->lock);
+	wake_up(&req->waitq);
+	if (end)
+		end(fc, req);
+	fuse_put_request(fc, req);
+}
+
+static void wait_answer_interruptible(struct fuse_conn *fc,
+				      struct fuse_req *req)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	if (signal_pending(current))
+		return;
+
+	spin_unlock(&fc->lock);
+	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
+	spin_lock(&fc->lock);
+}
+
+static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+{
+	list_add_tail(&req->intr_entry, &fc->interrupts);
+	wake_up(&fc->waitq);
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+}
+
+static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	if (!fc->no_interrupt) {
+		/* Any signal may interrupt this */
+		wait_answer_interruptible(fc, req);
+
+		if (req->aborted)
+			goto aborted;
+		if (req->state == FUSE_REQ_FINISHED)
+			return;
+
+		req->interrupted = 1;
+		if (req->state == FUSE_REQ_SENT)
+			queue_interrupt(fc, req);
+	}
+
+	if (!req->force) {
+		sigset_t oldset;
+
+		/* Only fatal signals may interrupt this */
+		block_sigs(&oldset);
+		wait_answer_interruptible(fc, req);
+		restore_sigs(&oldset);
+
+		if (req->aborted)
+			goto aborted;
+		if (req->state == FUSE_REQ_FINISHED)
+			return;
+
+		/* Request is not yet in userspace, bail out */
+		if (req->state == FUSE_REQ_PENDING) {
+			list_del(&req->list);
+			__fuse_put_request(req);
+			req->out.h.error = -EINTR;
+			return;
+		}
+	}
+
+	/*
+	 * Either request is already in userspace, or it was forced.
+	 * Wait it out.
+	 */
+	spin_unlock(&fc->lock);
+	wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+	spin_lock(&fc->lock);
+
+	if (!req->aborted)
+		return;
+
+ aborted:
+	BUG_ON(req->state != FUSE_REQ_FINISHED);
+	if (req->locked) {
+		/* This is uninterruptible sleep, because data is
+		   being copied to/from the buffers of req.  During
+		   locked state, there mustn't be any filesystem
+		   operation (e.g. page fault), since that could lead
+		   to deadlock */
+		spin_unlock(&fc->lock);
+		wait_event(req->waitq, !req->locked);
+		spin_lock(&fc->lock);
+	}
+}
+
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+	BUG_ON(req->background);
+	spin_lock(&fc->lock);
+	if (!fc->connected)
+		req->out.h.error = -ENOTCONN;
+	else if (fc->conn_error)
+		req->out.h.error = -ECONNREFUSED;
+	else {
+		req->in.h.unique = fuse_get_unique(fc);
+		queue_request(fc, req);
+		/* acquire extra reference, since request is still needed
+		   after request_end() */
+		__fuse_get_request(req);
+
+		request_wait_answer(fc, req);
+	}
+	spin_unlock(&fc->lock);
+}
+
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 1;
+	__fuse_request_send(fc, req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_send);
+
+static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
+{
+	if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS)
+		args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE;
+
+	if (fc->minor < 9) {
+		switch (args->in.h.opcode) {
+		case FUSE_LOOKUP:
+		case FUSE_CREATE:
+		case FUSE_MKNOD:
+		case FUSE_MKDIR:
+		case FUSE_SYMLINK:
+		case FUSE_LINK:
+			args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+			break;
+		case FUSE_GETATTR:
+		case FUSE_SETATTR:
+			args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+			break;
+		}
+	}
+	if (fc->minor < 12) {
+		switch (args->in.h.opcode) {
+		case FUSE_CREATE:
+			args->in.args[0].size = sizeof(struct fuse_open_in);
+			break;
+		case FUSE_MKNOD:
+			args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
+			break;
+		}
+	}
+}
+
+ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
+{
+	struct fuse_req *req;
+	ssize_t ret;
+
+	req = fuse_get_req(fc, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	/* Needs to be done after fuse_get_req() so that fc->minor is valid */
+	fuse_adjust_compat(fc, args);
+
+	req->in.h.opcode = args->in.h.opcode;
+	req->in.h.nodeid = args->in.h.nodeid;
+	req->in.numargs = args->in.numargs;
+	memcpy(req->in.args, args->in.args,
+	       args->in.numargs * sizeof(struct fuse_in_arg));
+	req->out.argvar = args->out.argvar;
+	req->out.numargs = args->out.numargs;
+	memcpy(req->out.args, args->out.args,
+	       args->out.numargs * sizeof(struct fuse_arg));
+	fuse_request_send(fc, req);
+	ret = req->out.h.error;
+	if (!ret && args->out.argvar) {
+		BUG_ON(args->out.numargs != 1);
+		ret = req->out.args[0].size;
+	}
+	fuse_put_request(fc, req);
+
+	return ret;
+}
+
+static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
+					    struct fuse_req *req)
+{
+	BUG_ON(!req->background);
+	fc->num_background++;
+	if (fc->num_background == fc->max_background)
+		fc->blocked = 1;
+	if (fc->num_background == fc->congestion_threshold &&
+	    fc->bdi_initialized) {
+		set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+		set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+	}
+	list_add_tail(&req->list, &fc->bg_queue);
+	flush_bg_queue(fc);
+}
+
+static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+{
+	spin_lock(&fc->lock);
+	if (fc->connected) {
+		fuse_request_send_nowait_locked(fc, req);
+		spin_unlock(&fc->lock);
+	} else {
+		req->out.h.error = -ENOTCONN;
+		request_end(fc, req);
+	}
+}
+
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 1;
+	fuse_request_send_nowait(fc, req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_send_background);
+
+static int fuse_request_send_notify_reply(struct fuse_conn *fc,
+					  struct fuse_req *req, u64 unique)
+{
+	int err = -ENODEV;
+
+	req->isreply = 0;
+	req->in.h.unique = unique;
+	spin_lock(&fc->lock);
+	if (fc->connected) {
+		queue_request(fc, req);
+		err = 0;
+	}
+	spin_unlock(&fc->lock);
+
+	return err;
+}
+
+/*
+ * Called under fc->lock
+ *
+ * fc->connected must have been checked previously
+ */
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+					 struct fuse_req *req)
+{
+	req->isreply = 1;
+	fuse_request_send_nowait_locked(fc, req);
+}
+
+void fuse_force_forget(struct file *file, u64 nodeid)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_forget_in inarg;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.nlookup = 1;
+	req = fuse_get_req_nofail_nopages(fc, file);
+	req->in.h.opcode = FUSE_FORGET;
+	req->in.h.nodeid = nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->isreply = 0;
+	__fuse_request_send(fc, req);
+	/* ignore errors */
+	fuse_put_request(fc, req);
+}
+
+/*
+ * Lock the request.  Up to the next unlock_request() there mustn't be
+ * anything that could cause a page-fault.  If the request was already
+ * aborted bail out.
+ */
+static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int err = 0;
+	if (req) {
+		spin_lock(&fc->lock);
+		if (req->aborted)
+			err = -ENOENT;
+		else
+			req->locked = 1;
+		spin_unlock(&fc->lock);
+	}
+	return err;
+}
+
+/*
+ * Unlock request.  If it was aborted during being locked, the
+ * requester thread is currently waiting for it to be unlocked, so
+ * wake it up.
+ */
+static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (req) {
+		spin_lock(&fc->lock);
+		req->locked = 0;
+		if (req->aborted)
+			wake_up(&req->waitq);
+		spin_unlock(&fc->lock);
+	}
+}
+
+struct fuse_copy_state {
+	struct fuse_conn *fc;
+	int write;
+	struct fuse_req *req;
+	struct iov_iter *iter;
+	struct pipe_buffer *pipebufs;
+	struct pipe_buffer *currbuf;
+	struct pipe_inode_info *pipe;
+	unsigned long nr_segs;
+	struct page *pg;
+	unsigned len;
+	unsigned offset;
+	unsigned move_pages:1;
+};
+
+static void fuse_copy_init(struct fuse_copy_state *cs,
+			   struct fuse_conn *fc,
+			   int write,
+			   struct iov_iter *iter)
+{
+	memset(cs, 0, sizeof(*cs));
+	cs->fc = fc;
+	cs->write = write;
+	cs->iter = iter;
+}
+
+/* Unmap and put previous page of userspace buffer */
+static void fuse_copy_finish(struct fuse_copy_state *cs)
+{
+	if (cs->currbuf) {
+		struct pipe_buffer *buf = cs->currbuf;
+
+		if (cs->write)
+			buf->len = PAGE_SIZE - cs->len;
+		cs->currbuf = NULL;
+	} else if (cs->pg) {
+		if (cs->write) {
+			flush_dcache_page(cs->pg);
+			set_page_dirty_lock(cs->pg);
+		}
+		put_page(cs->pg);
+	}
+	cs->pg = NULL;
+}
+
+/*
+ * Get another pagefull of userspace buffer, and map it to kernel
+ * address space, and lock request
+ */
+static int fuse_copy_fill(struct fuse_copy_state *cs)
+{
+	struct page *page;
+	int err;
+
+	unlock_request(cs->fc, cs->req);
+	fuse_copy_finish(cs);
+	if (cs->pipebufs) {
+		struct pipe_buffer *buf = cs->pipebufs;
+
+		if (!cs->write) {
+			err = buf->ops->confirm(cs->pipe, buf);
+			if (err)
+				return err;
+
+			BUG_ON(!cs->nr_segs);
+			cs->currbuf = buf;
+			cs->pg = buf->page;
+			cs->offset = buf->offset;
+			cs->len = buf->len;
+			cs->pipebufs++;
+			cs->nr_segs--;
+		} else {
+			if (cs->nr_segs == cs->pipe->buffers)
+				return -EIO;
+
+			page = alloc_page(GFP_HIGHUSER);
+			if (!page)
+				return -ENOMEM;
+
+			buf->page = page;
+			buf->offset = 0;
+			buf->len = 0;
+
+			cs->currbuf = buf;
+			cs->pg = page;
+			cs->offset = 0;
+			cs->len = PAGE_SIZE;
+			cs->pipebufs++;
+			cs->nr_segs++;
+		}
+	} else {
+		size_t off;
+		err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
+		if (err < 0)
+			return err;
+		BUG_ON(!err);
+		cs->len = err;
+		cs->offset = off;
+		cs->pg = page;
+		cs->offset = off;
+		iov_iter_advance(cs->iter, err);
+	}
+
+	return lock_request(cs->fc, cs->req);
+}
+
+/* Do as much copy to/from userspace buffer as we can */
+static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
+{
+	unsigned ncpy = min(*size, cs->len);
+	if (val) {
+		void *pgaddr = kmap_atomic(cs->pg);
+		void *buf = pgaddr + cs->offset;
+
+		if (cs->write)
+			memcpy(buf, *val, ncpy);
+		else
+			memcpy(*val, buf, ncpy);
+
+		kunmap_atomic(pgaddr);
+		*val += ncpy;
+	}
+	*size -= ncpy;
+	cs->len -= ncpy;
+	cs->offset += ncpy;
+	return ncpy;
+}
+
+static int fuse_check_page(struct page *page)
+{
+	if (page_mapcount(page) ||
+	    page->mapping != NULL ||
+	    page_count(page) != 1 ||
+	    (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+	     ~(1 << PG_locked |
+	       1 << PG_referenced |
+	       1 << PG_uptodate |
+	       1 << PG_lru |
+	       1 << PG_active |
+	       1 << PG_reclaim))) {
+		printk(KERN_WARNING "fuse: trying to steal weird page\n");
+		printk(KERN_WARNING "  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+		return 1;
+	}
+	return 0;
+}
+
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+	int err;
+	struct page *oldpage = *pagep;
+	struct page *newpage;
+	struct pipe_buffer *buf = cs->pipebufs;
+
+	unlock_request(cs->fc, cs->req);
+	fuse_copy_finish(cs);
+
+	err = buf->ops->confirm(cs->pipe, buf);
+	if (err)
+		return err;
+
+	BUG_ON(!cs->nr_segs);
+	cs->currbuf = buf;
+	cs->len = buf->len;
+	cs->pipebufs++;
+	cs->nr_segs--;
+
+	if (cs->len != PAGE_SIZE)
+		goto out_fallback;
+
+	if (buf->ops->steal(cs->pipe, buf) != 0)
+		goto out_fallback;
+
+	newpage = buf->page;
+
+	if (!PageUptodate(newpage))
+		SetPageUptodate(newpage);
+
+	ClearPageMappedToDisk(newpage);
+
+	if (fuse_check_page(newpage) != 0)
+		goto out_fallback_unlock;
+
+	/*
+	 * This is a new and locked page, it shouldn't be mapped or
+	 * have any special flags on it
+	 */
+	if (WARN_ON(page_mapped(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(page_has_private(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(PageMlocked(oldpage)))
+		goto out_fallback_unlock;
+
+	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
+	if (err) {
+		unlock_page(newpage);
+		return err;
+	}
+
+	page_cache_get(newpage);
+
+	if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+		lru_cache_add_file(newpage);
+
+	err = 0;
+	spin_lock(&cs->fc->lock);
+	if (cs->req->aborted)
+		err = -ENOENT;
+	else
+		*pagep = newpage;
+	spin_unlock(&cs->fc->lock);
+
+	if (err) {
+		unlock_page(newpage);
+		page_cache_release(newpage);
+		return err;
+	}
+
+	unlock_page(oldpage);
+	page_cache_release(oldpage);
+	cs->len = 0;
+
+	return 0;
+
+out_fallback_unlock:
+	unlock_page(newpage);
+out_fallback:
+	cs->pg = buf->page;
+	cs->offset = buf->offset;
+
+	err = lock_request(cs->fc, cs->req);
+	if (err)
+		return err;
+
+	return 1;
+}
+
+static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
+			 unsigned offset, unsigned count)
+{
+	struct pipe_buffer *buf;
+
+	if (cs->nr_segs == cs->pipe->buffers)
+		return -EIO;
+
+	unlock_request(cs->fc, cs->req);
+	fuse_copy_finish(cs);
+
+	buf = cs->pipebufs;
+	page_cache_get(page);
+	buf->page = page;
+	buf->offset = offset;
+	buf->len = count;
+
+	cs->pipebufs++;
+	cs->nr_segs++;
+	cs->len = 0;
+
+	return 0;
+}
+
+/*
+ * Copy a page in the request to/from the userspace buffer.  Must be
+ * done atomically
+ */
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
+			  unsigned offset, unsigned count, int zeroing)
+{
+	int err;
+	struct page *page = *pagep;
+
+	if (page && zeroing && count < PAGE_SIZE)
+		clear_highpage(page);
+
+	while (count) {
+		if (cs->write && cs->pipebufs && page) {
+			return fuse_ref_page(cs, page, offset, count);
+		} else if (!cs->len) {
+			if (cs->move_pages && page &&
+			    offset == 0 && count == PAGE_SIZE) {
+				err = fuse_try_move_page(cs, pagep);
+				if (err <= 0)
+					return err;
+			} else {
+				err = fuse_copy_fill(cs);
+				if (err)
+					return err;
+			}
+		}
+		if (page) {
+			void *mapaddr = kmap_atomic(page);
+			void *buf = mapaddr + offset;
+			offset += fuse_copy_do(cs, &buf, &count);
+			kunmap_atomic(mapaddr);
+		} else
+			offset += fuse_copy_do(cs, NULL, &count);
+	}
+	if (page && !cs->write)
+		flush_dcache_page(page);
+	return 0;
+}
+
+/* Copy pages in the request to/from userspace buffer */
+static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
+			   int zeroing)
+{
+	unsigned i;
+	struct fuse_req *req = cs->req;
+
+	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
+		int err;
+		unsigned offset = req->page_descs[i].offset;
+		unsigned count = min(nbytes, req->page_descs[i].length);
+
+		err = fuse_copy_page(cs, &req->pages[i], offset, count,
+				     zeroing);
+		if (err)
+			return err;
+
+		nbytes -= count;
+	}
+	return 0;
+}
+
+/* Copy a single argument in the request to/from userspace buffer */
+static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
+{
+	while (size) {
+		if (!cs->len) {
+			int err = fuse_copy_fill(cs);
+			if (err)
+				return err;
+		}
+		fuse_copy_do(cs, &val, &size);
+	}
+	return 0;
+}
+
+/* Copy request arguments to/from userspace buffer */
+static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+			  unsigned argpages, struct fuse_arg *args,
+			  int zeroing)
+{
+	int err = 0;
+	unsigned i;
+
+	for (i = 0; !err && i < numargs; i++)  {
+		struct fuse_arg *arg = &args[i];
+		if (i == numargs - 1 && argpages)
+			err = fuse_copy_pages(cs, arg->size, zeroing);
+		else
+			err = fuse_copy_one(cs, arg->value, arg->size);
+	}
+	return err;
+}
+
+static int forget_pending(struct fuse_conn *fc)
+{
+	return fc->forget_list_head.next != NULL;
+}
+
+static int request_pending(struct fuse_conn *fc)
+{
+	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
+		forget_pending(fc);
+}
+
+/* Wait until a request is available on the pending list */
+static void request_wait(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue_exclusive(&fc->waitq, &wait);
+	while (fc->connected && !request_pending(fc)) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (signal_pending(current))
+			break;
+
+		spin_unlock(&fc->lock);
+		schedule();
+		spin_lock(&fc->lock);
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&fc->waitq, &wait);
+}
+
+/*
+ * Transfer an interrupt request to userspace
+ *
+ * Unlike other requests this is assembled on demand, without a need
+ * to allocate a separate fuse_req structure.
+ *
+ * Called with fc->lock held, releases it
+ */
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+			       size_t nbytes, struct fuse_req *req)
+__releases(fc->lock)
+{
+	struct fuse_in_header ih;
+	struct fuse_interrupt_in arg;
+	unsigned reqsize = sizeof(ih) + sizeof(arg);
+	int err;
+
+	list_del_init(&req->intr_entry);
+	req->intr_unique = fuse_get_unique(fc);
+	memset(&ih, 0, sizeof(ih));
+	memset(&arg, 0, sizeof(arg));
+	ih.len = reqsize;
+	ih.opcode = FUSE_INTERRUPT;
+	ih.unique = req->intr_unique;
+	arg.unique = req->in.h.unique;
+
+	spin_unlock(&fc->lock);
+	if (nbytes < reqsize)
+		return -EINVAL;
+
+	err = fuse_copy_one(cs, &ih, sizeof(ih));
+	if (!err)
+		err = fuse_copy_one(cs, &arg, sizeof(arg));
+	fuse_copy_finish(cs);
+
+	return err ? err : reqsize;
+}
+
+static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
+					       unsigned max,
+					       unsigned *countp)
+{
+	struct fuse_forget_link *head = fc->forget_list_head.next;
+	struct fuse_forget_link **newhead = &head;
+	unsigned count;
+
+	for (count = 0; *newhead != NULL && count < max; count++)
+		newhead = &(*newhead)->next;
+
+	fc->forget_list_head.next = *newhead;
+	*newhead = NULL;
+	if (fc->forget_list_head.next == NULL)
+		fc->forget_list_tail = &fc->forget_list_head;
+
+	if (countp != NULL)
+		*countp = count;
+
+	return head;
+}
+
+static int fuse_read_single_forget(struct fuse_conn *fc,
+				   struct fuse_copy_state *cs,
+				   size_t nbytes)
+__releases(fc->lock)
+{
+	int err;
+	struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
+	struct fuse_forget_in arg = {
+		.nlookup = forget->forget_one.nlookup,
+	};
+	struct fuse_in_header ih = {
+		.opcode = FUSE_FORGET,
+		.nodeid = forget->forget_one.nodeid,
+		.unique = fuse_get_unique(fc),
+		.len = sizeof(ih) + sizeof(arg),
+	};
+
+	spin_unlock(&fc->lock);
+	kfree(forget);
+	if (nbytes < ih.len)
+		return -EINVAL;
+
+	err = fuse_copy_one(cs, &ih, sizeof(ih));
+	if (!err)
+		err = fuse_copy_one(cs, &arg, sizeof(arg));
+	fuse_copy_finish(cs);
+
+	if (err)
+		return err;
+
+	return ih.len;
+}
+
+static int fuse_read_batch_forget(struct fuse_conn *fc,
+				   struct fuse_copy_state *cs, size_t nbytes)
+__releases(fc->lock)
+{
+	int err;
+	unsigned max_forgets;
+	unsigned count;
+	struct fuse_forget_link *head;
+	struct fuse_batch_forget_in arg = { .count = 0 };
+	struct fuse_in_header ih = {
+		.opcode = FUSE_BATCH_FORGET,
+		.unique = fuse_get_unique(fc),
+		.len = sizeof(ih) + sizeof(arg),
+	};
+
+	if (nbytes < ih.len) {
+		spin_unlock(&fc->lock);
+		return -EINVAL;
+	}
+
+	max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
+	head = dequeue_forget(fc, max_forgets, &count);
+	spin_unlock(&fc->lock);
+
+	arg.count = count;
+	ih.len += count * sizeof(struct fuse_forget_one);
+	err = fuse_copy_one(cs, &ih, sizeof(ih));
+	if (!err)
+		err = fuse_copy_one(cs, &arg, sizeof(arg));
+
+	while (head) {
+		struct fuse_forget_link *forget = head;
+
+		if (!err) {
+			err = fuse_copy_one(cs, &forget->forget_one,
+					    sizeof(forget->forget_one));
+		}
+		head = forget->next;
+		kfree(forget);
+	}
+
+	fuse_copy_finish(cs);
+
+	if (err)
+		return err;
+
+	return ih.len;
+}
+
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
+			    size_t nbytes)
+__releases(fc->lock)
+{
+	if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
+		return fuse_read_single_forget(fc, cs, nbytes);
+	else
+		return fuse_read_batch_forget(fc, cs, nbytes);
+}
+
+/*
+ * Read a single request into the userspace filesystem's buffer.  This
+ * function waits until a request is available, then removes it from
+ * the pending list and copies request data to userspace buffer.  If
+ * no reply is needed (FORGET) or request has been aborted or there
+ * was an error during the copying then it's finished by calling
+ * request_end().  Otherwise add it to the processing list, and set
+ * the 'sent' flag.
+ */
+static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+				struct fuse_copy_state *cs, size_t nbytes)
+{
+	int err;
+	struct fuse_req *req;
+	struct fuse_in *in;
+	unsigned reqsize;
+
+ restart:
+	spin_lock(&fc->lock);
+	err = -EAGAIN;
+	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
+	    !request_pending(fc))
+		goto err_unlock;
+
+	request_wait(fc);
+	err = -ENODEV;
+	if (!fc->connected)
+		goto err_unlock;
+	err = -ERESTARTSYS;
+	if (!request_pending(fc))
+		goto err_unlock;
+
+	if (!list_empty(&fc->interrupts)) {
+		req = list_entry(fc->interrupts.next, struct fuse_req,
+				 intr_entry);
+		return fuse_read_interrupt(fc, cs, nbytes, req);
+	}
+
+	if (forget_pending(fc)) {
+		if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
+			return fuse_read_forget(fc, cs, nbytes);
+
+		if (fc->forget_batch <= -8)
+			fc->forget_batch = 16;
+	}
+
+	req = list_entry(fc->pending.next, struct fuse_req, list);
+	req->state = FUSE_REQ_READING;
+	list_move(&req->list, &fc->io);
+
+	in = &req->in;
+	reqsize = in->h.len;
+	/* If request is too large, reply with an error and restart the read */
+	if (nbytes < reqsize) {
+		req->out.h.error = -EIO;
+		/* SETXATTR is special, since it may contain too large data */
+		if (in->h.opcode == FUSE_SETXATTR)
+			req->out.h.error = -E2BIG;
+		request_end(fc, req);
+		goto restart;
+	}
+	spin_unlock(&fc->lock);
+	cs->req = req;
+	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
+	if (!err)
+		err = fuse_copy_args(cs, in->numargs, in->argpages,
+				     (struct fuse_arg *) in->args, 0);
+	fuse_copy_finish(cs);
+	spin_lock(&fc->lock);
+	req->locked = 0;
+	if (req->aborted) {
+		request_end(fc, req);
+		return -ENODEV;
+	}
+	if (err) {
+		req->out.h.error = -EIO;
+		request_end(fc, req);
+		return err;
+	}
+	if (!req->isreply)
+		request_end(fc, req);
+	else {
+		req->state = FUSE_REQ_SENT;
+		list_move_tail(&req->list, &fc->processing);
+		if (req->interrupted)
+			queue_interrupt(fc, req);
+		spin_unlock(&fc->lock);
+	}
+	return reqsize;
+
+ err_unlock:
+	spin_unlock(&fc->lock);
+	return err;
+}
+
+static int fuse_dev_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * The fuse device's file's private_data is used to hold
+	 * the fuse_conn(ection) when it is mounted, and is used to
+	 * keep track of whether the file has been mounted already.
+	 */
+	file->private_data = NULL;
+	return 0;
+}
+
+static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct fuse_copy_state cs;
+	struct file *file = iocb->ki_filp;
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -EPERM;
+
+	if (!iter_is_iovec(to))
+		return -EINVAL;
+
+	fuse_copy_init(&cs, fc, 1, to);
+
+	return fuse_dev_do_read(fc, file, &cs, iov_iter_count(to));
+}
+
+static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+				    struct pipe_inode_info *pipe,
+				    size_t len, unsigned int flags)
+{
+	int ret;
+	int page_nr = 0;
+	int do_wakeup = 0;
+	struct pipe_buffer *bufs;
+	struct fuse_copy_state cs;
+	struct fuse_conn *fc = fuse_get_conn(in);
+	if (!fc)
+		return -EPERM;
+
+	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
+	if (!bufs)
+		return -ENOMEM;
+
+	fuse_copy_init(&cs, fc, 1, NULL);
+	cs.pipebufs = bufs;
+	cs.pipe = pipe;
+	ret = fuse_dev_do_read(fc, in, &cs, len);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+	pipe_lock(pipe);
+
+	if (!pipe->readers) {
+		send_sig(SIGPIPE, current, 0);
+		if (!ret)
+			ret = -EPIPE;
+		goto out_unlock;
+	}
+
+	if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+		ret = -EIO;
+		goto out_unlock;
+	}
+
+	while (page_nr < cs.nr_segs) {
+		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+		struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+		buf->page = bufs[page_nr].page;
+		buf->offset = bufs[page_nr].offset;
+		buf->len = bufs[page_nr].len;
+		/*
+		 * Need to be careful about this.  Having buf->ops in module
+		 * code can Oops if the buffer persists after module unload.
+		 */
+		buf->ops = &nosteal_pipe_buf_ops;
+
+		pipe->nrbufs++;
+		page_nr++;
+		ret += buf->len;
+
+		if (pipe->files)
+			do_wakeup = 1;
+	}
+
+out_unlock:
+	pipe_unlock(pipe);
+
+	if (do_wakeup) {
+		smp_mb();
+		if (waitqueue_active(&pipe->wait))
+			wake_up_interruptible(&pipe->wait);
+		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+	}
+
+out:
+	for (; page_nr < cs.nr_segs; page_nr++)
+		page_cache_release(bufs[page_nr].page);
+
+	kfree(bufs);
+	return ret;
+}
+
+static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
+			    struct fuse_copy_state *cs)
+{
+	struct fuse_notify_poll_wakeup_out outarg;
+	int err = -EINVAL;
+
+	if (size != sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+
+	fuse_copy_finish(cs);
+	return fuse_notify_poll_wakeup(fc, &outarg);
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_inode_out outarg;
+	int err = -EINVAL;
+
+	if (size != sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (fc->sb) {
+		err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
+					       outarg.off, outarg.len);
+	}
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_entry_out outarg;
+	int err = -ENOMEM;
+	char *buf;
+	struct qstr name;
+
+	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	err = -EINVAL;
+	if (size < sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+
+	err = -ENAMETOOLONG;
+	if (outarg.namelen > FUSE_NAME_MAX)
+		goto err;
+
+	err = -EINVAL;
+	if (size != sizeof(outarg) + outarg.namelen + 1)
+		goto err;
+
+	name.name = buf;
+	name.len = outarg.namelen;
+	err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+	buf[outarg.namelen] = 0;
+	name.hash = full_name_hash(name.name, name.len);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (fc->sb)
+		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+	up_read(&fc->killsb);
+	kfree(buf);
+	return err;
+
+err:
+	kfree(buf);
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
+			      struct fuse_copy_state *cs)
+{
+	struct fuse_notify_delete_out outarg;
+	int err = -ENOMEM;
+	char *buf;
+	struct qstr name;
+
+	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	err = -EINVAL;
+	if (size < sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+
+	err = -ENAMETOOLONG;
+	if (outarg.namelen > FUSE_NAME_MAX)
+		goto err;
+
+	err = -EINVAL;
+	if (size != sizeof(outarg) + outarg.namelen + 1)
+		goto err;
+
+	name.name = buf;
+	name.len = outarg.namelen;
+	err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+	buf[outarg.namelen] = 0;
+	name.hash = full_name_hash(name.name, name.len);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (fc->sb)
+		err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
+					       outarg.child, &name);
+	up_read(&fc->killsb);
+	kfree(buf);
+	return err;
+
+err:
+	kfree(buf);
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
+			     struct fuse_copy_state *cs)
+{
+	struct fuse_notify_store_out outarg;
+	struct inode *inode;
+	struct address_space *mapping;
+	u64 nodeid;
+	int err;
+	pgoff_t index;
+	unsigned int offset;
+	unsigned int num;
+	loff_t file_size;
+	loff_t end;
+
+	err = -EINVAL;
+	if (size < sizeof(outarg))
+		goto out_finish;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto out_finish;
+
+	err = -EINVAL;
+	if (size - sizeof(outarg) != outarg.size)
+		goto out_finish;
+
+	nodeid = outarg.nodeid;
+
+	down_read(&fc->killsb);
+
+	err = -ENOENT;
+	if (!fc->sb)
+		goto out_up_killsb;
+
+	inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+	if (!inode)
+		goto out_up_killsb;
+
+	mapping = inode->i_mapping;
+	index = outarg.offset >> PAGE_CACHE_SHIFT;
+	offset = outarg.offset & ~PAGE_CACHE_MASK;
+	file_size = i_size_read(inode);
+	end = outarg.offset + outarg.size;
+	if (end > file_size) {
+		file_size = end;
+		fuse_write_update_size(inode, file_size);
+	}
+
+	num = outarg.size;
+	while (num) {
+		struct page *page;
+		unsigned int this_num;
+
+		err = -ENOMEM;
+		page = find_or_create_page(mapping, index,
+					   mapping_gfp_mask(mapping));
+		if (!page)
+			goto out_iput;
+
+		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+		err = fuse_copy_page(cs, &page, offset, this_num, 0);
+		if (!err && offset == 0 &&
+		    (this_num == PAGE_CACHE_SIZE || file_size == end))
+			SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		if (err)
+			goto out_iput;
+
+		num -= this_num;
+		offset = 0;
+		index++;
+	}
+
+	err = 0;
+
+out_iput:
+	iput(inode);
+out_up_killsb:
+	up_read(&fc->killsb);
+out_finish:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	release_pages(req->pages, req->num_pages, false);
+}
+
+static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+			 struct fuse_notify_retrieve_out *outarg)
+{
+	int err;
+	struct address_space *mapping = inode->i_mapping;
+	struct fuse_req *req;
+	pgoff_t index;
+	loff_t file_size;
+	unsigned int num;
+	unsigned int offset;
+	size_t total_len = 0;
+	int num_pages;
+
+	offset = outarg->offset & ~PAGE_CACHE_MASK;
+	file_size = i_size_read(inode);
+
+	num = outarg->size;
+	if (outarg->offset > file_size)
+		num = 0;
+	else if (outarg->offset + num > file_size)
+		num = file_size - outarg->offset;
+
+	num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+
+	req = fuse_get_req(fc, num_pages);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->in.h.opcode = FUSE_NOTIFY_REPLY;
+	req->in.h.nodeid = outarg->nodeid;
+	req->in.numargs = 2;
+	req->in.argpages = 1;
+	req->page_descs[0].offset = offset;
+	req->end = fuse_retrieve_end;
+
+	index = outarg->offset >> PAGE_CACHE_SHIFT;
+
+	while (num && req->num_pages < num_pages) {
+		struct page *page;
+		unsigned int this_num;
+
+		page = find_get_page(mapping, index);
+		if (!page)
+			break;
+
+		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+		req->pages[req->num_pages] = page;
+		req->page_descs[req->num_pages].length = this_num;
+		req->num_pages++;
+
+		offset = 0;
+		num -= this_num;
+		total_len += this_num;
+		index++;
+	}
+	req->misc.retrieve_in.offset = outarg->offset;
+	req->misc.retrieve_in.size = total_len;
+	req->in.args[0].size = sizeof(req->misc.retrieve_in);
+	req->in.args[0].value = &req->misc.retrieve_in;
+	req->in.args[1].size = total_len;
+
+	err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
+	if (err)
+		fuse_retrieve_end(fc, req);
+
+	return err;
+}
+
+static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
+				struct fuse_copy_state *cs)
+{
+	struct fuse_notify_retrieve_out outarg;
+	struct inode *inode;
+	int err;
+
+	err = -EINVAL;
+	if (size != sizeof(outarg))
+		goto copy_finish;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto copy_finish;
+
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (fc->sb) {
+		u64 nodeid = outarg.nodeid;
+
+		inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+		if (inode) {
+			err = fuse_retrieve(fc, inode, &outarg);
+			iput(inode);
+		}
+	}
+	up_read(&fc->killsb);
+
+	return err;
+
+copy_finish:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+		       unsigned int size, struct fuse_copy_state *cs)
+{
+	/* Don't try to move pages (yet) */
+	cs->move_pages = 0;
+
+	switch (code) {
+	case FUSE_NOTIFY_POLL:
+		return fuse_notify_poll(fc, size, cs);
+
+	case FUSE_NOTIFY_INVAL_INODE:
+		return fuse_notify_inval_inode(fc, size, cs);
+
+	case FUSE_NOTIFY_INVAL_ENTRY:
+		return fuse_notify_inval_entry(fc, size, cs);
+
+	case FUSE_NOTIFY_STORE:
+		return fuse_notify_store(fc, size, cs);
+
+	case FUSE_NOTIFY_RETRIEVE:
+		return fuse_notify_retrieve(fc, size, cs);
+
+	case FUSE_NOTIFY_DELETE:
+		return fuse_notify_delete(fc, size, cs);
+
+	default:
+		fuse_copy_finish(cs);
+		return -EINVAL;
+	}
+}
+
+/* Look up request on processing list by unique ID */
+static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+{
+	struct fuse_req *req;
+
+	list_for_each_entry(req, &fc->processing, list) {
+		if (req->in.h.unique == unique || req->intr_unique == unique)
+			return req;
+	}
+	return NULL;
+}
+
+static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
+			 unsigned nbytes)
+{
+	unsigned reqsize = sizeof(struct fuse_out_header);
+
+	if (out->h.error)
+		return nbytes != reqsize ? -EINVAL : 0;
+
+	reqsize += len_args(out->numargs, out->args);
+
+	if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
+		return -EINVAL;
+	else if (reqsize > nbytes) {
+		struct fuse_arg *lastarg = &out->args[out->numargs-1];
+		unsigned diffsize = reqsize - nbytes;
+		if (diffsize > lastarg->size)
+			return -EINVAL;
+		lastarg->size -= diffsize;
+	}
+	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
+			      out->page_zeroing);
+}
+
+/*
+ * Write a single reply to a request.  First the header is copied from
+ * the write buffer.  The request is then searched on the processing
+ * list by the unique ID found in the header.  If found, then remove
+ * it from the list and copy the rest of the buffer to the request.
+ * The request is finished by calling request_end()
+ */
+static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+				 struct fuse_copy_state *cs, size_t nbytes)
+{
+	int err;
+	struct fuse_req *req;
+	struct fuse_out_header oh;
+
+	if (nbytes < sizeof(struct fuse_out_header))
+		return -EINVAL;
+
+	err = fuse_copy_one(cs, &oh, sizeof(oh));
+	if (err)
+		goto err_finish;
+
+	err = -EINVAL;
+	if (oh.len != nbytes)
+		goto err_finish;
+
+	/*
+	 * Zero oh.unique indicates unsolicited notification message
+	 * and error contains notification code.
+	 */
+	if (!oh.unique) {
+		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
+		return err ? err : nbytes;
+	}
+
+	err = -EINVAL;
+	if (oh.error <= -1000 || oh.error > 0)
+		goto err_finish;
+
+	spin_lock(&fc->lock);
+	err = -ENOENT;
+	if (!fc->connected)
+		goto err_unlock;
+
+	req = request_find(fc, oh.unique);
+	if (!req)
+		goto err_unlock;
+
+	if (req->aborted) {
+		spin_unlock(&fc->lock);
+		fuse_copy_finish(cs);
+		spin_lock(&fc->lock);
+		request_end(fc, req);
+		return -ENOENT;
+	}
+	/* Is it an interrupt reply? */
+	if (req->intr_unique == oh.unique) {
+		err = -EINVAL;
+		if (nbytes != sizeof(struct fuse_out_header))
+			goto err_unlock;
+
+		if (oh.error == -ENOSYS)
+			fc->no_interrupt = 1;
+		else if (oh.error == -EAGAIN)
+			queue_interrupt(fc, req);
+
+		spin_unlock(&fc->lock);
+		fuse_copy_finish(cs);
+		return nbytes;
+	}
+
+	req->state = FUSE_REQ_WRITING;
+	list_move(&req->list, &fc->io);
+	req->out.h = oh;
+	req->locked = 1;
+	cs->req = req;
+	if (!req->out.page_replace)
+		cs->move_pages = 0;
+	spin_unlock(&fc->lock);
+
+	err = copy_out_args(cs, &req->out, nbytes);
+	fuse_copy_finish(cs);
+
+	spin_lock(&fc->lock);
+	req->locked = 0;
+	if (!err) {
+		if (req->aborted)
+			err = -ENOENT;
+	} else if (!req->aborted)
+		req->out.h.error = -EIO;
+	request_end(fc, req);
+
+	return err ? err : nbytes;
+
+ err_unlock:
+	spin_unlock(&fc->lock);
+ err_finish:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct fuse_copy_state cs;
+	struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
+	if (!fc)
+		return -EPERM;
+
+	if (!iter_is_iovec(from))
+		return -EINVAL;
+
+	fuse_copy_init(&cs, fc, 0, from);
+
+	return fuse_dev_do_write(fc, &cs, iov_iter_count(from));
+}
+
+static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
+				     struct file *out, loff_t *ppos,
+				     size_t len, unsigned int flags)
+{
+	unsigned nbuf;
+	unsigned idx;
+	struct pipe_buffer *bufs;
+	struct fuse_copy_state cs;
+	struct fuse_conn *fc;
+	size_t rem;
+	ssize_t ret;
+
+	fc = fuse_get_conn(out);
+	if (!fc)
+		return -EPERM;
+
+	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
+	if (!bufs)
+		return -ENOMEM;
+
+	pipe_lock(pipe);
+	nbuf = 0;
+	rem = 0;
+	for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
+		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+
+	ret = -EINVAL;
+	if (rem < len) {
+		pipe_unlock(pipe);
+		goto out;
+	}
+
+	rem = len;
+	while (rem) {
+		struct pipe_buffer *ibuf;
+		struct pipe_buffer *obuf;
+
+		BUG_ON(nbuf >= pipe->buffers);
+		BUG_ON(!pipe->nrbufs);
+		ibuf = &pipe->bufs[pipe->curbuf];
+		obuf = &bufs[nbuf];
+
+		if (rem >= ibuf->len) {
+			*obuf = *ibuf;
+			ibuf->ops = NULL;
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+			pipe->nrbufs--;
+		} else {
+			ibuf->ops->get(pipe, ibuf);
+			*obuf = *ibuf;
+			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+			obuf->len = rem;
+			ibuf->offset += obuf->len;
+			ibuf->len -= obuf->len;
+		}
+		nbuf++;
+		rem -= obuf->len;
+	}
+	pipe_unlock(pipe);
+
+	fuse_copy_init(&cs, fc, 0, NULL);
+	cs.pipebufs = bufs;
+	cs.nr_segs = nbuf;
+	cs.pipe = pipe;
+
+	if (flags & SPLICE_F_MOVE)
+		cs.move_pages = 1;
+
+	ret = fuse_dev_do_write(fc, &cs, len);
+
+	for (idx = 0; idx < nbuf; idx++) {
+		struct pipe_buffer *buf = &bufs[idx];
+		buf->ops->release(pipe, buf);
+	}
+out:
+	kfree(bufs);
+	return ret;
+}
+
+static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
+{
+	unsigned mask = POLLOUT | POLLWRNORM;
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return POLLERR;
+
+	poll_wait(file, &fc->waitq, wait);
+
+	spin_lock(&fc->lock);
+	if (!fc->connected)
+		mask = POLLERR;
+	else if (request_pending(fc))
+		mask |= POLLIN | POLLRDNORM;
+	spin_unlock(&fc->lock);
+
+	return mask;
+}
+
+/*
+ * Abort all requests on the given list (pending or processing)
+ *
+ * This function releases and reacquires fc->lock
+ */
+static void end_requests(struct fuse_conn *fc, struct list_head *head)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	while (!list_empty(head)) {
+		struct fuse_req *req;
+		req = list_entry(head->next, struct fuse_req, list);
+		req->out.h.error = -ECONNABORTED;
+		request_end(fc, req);
+		spin_lock(&fc->lock);
+	}
+}
+
+/*
+ * Abort requests under I/O
+ *
+ * The requests are set to aborted and finished, and the request
+ * waiter is woken up.  This will make request_wait_answer() wait
+ * until the request is unlocked and then return.
+ *
+ * If the request is asynchronous, then the end function needs to be
+ * called after waiting for the request to be unlocked (if it was
+ * locked).
+ */
+static void end_io_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	while (!list_empty(&fc->io)) {
+		struct fuse_req *req =
+			list_entry(fc->io.next, struct fuse_req, list);
+		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
+
+		req->aborted = 1;
+		req->out.h.error = -ECONNABORTED;
+		req->state = FUSE_REQ_FINISHED;
+		list_del_init(&req->list);
+		wake_up(&req->waitq);
+		if (end) {
+			req->end = NULL;
+			__fuse_get_request(req);
+			spin_unlock(&fc->lock);
+			wait_event(req->waitq, !req->locked);
+			end(fc, req);
+			fuse_put_request(fc, req);
+			spin_lock(&fc->lock);
+		}
+	}
+}
+
+static void end_queued_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	fc->max_background = UINT_MAX;
+	flush_bg_queue(fc);
+	end_requests(fc, &fc->pending);
+	end_requests(fc, &fc->processing);
+	while (forget_pending(fc))
+		kfree(dequeue_forget(fc, 1, NULL));
+}
+
+static void end_polls(struct fuse_conn *fc)
+{
+	struct rb_node *p;
+
+	p = rb_first(&fc->polled_files);
+
+	while (p) {
+		struct fuse_file *ff;
+		ff = rb_entry(p, struct fuse_file, polled_node);
+		wake_up_interruptible_all(&ff->poll_wait);
+
+		p = rb_next(p);
+	}
+}
+
+/*
+ * Abort all requests.
+ *
+ * Emergency exit in case of a malicious or accidental deadlock, or
+ * just a hung filesystem.
+ *
+ * The same effect is usually achievable through killing the
+ * filesystem daemon and all users of the filesystem.  The exception
+ * is the combination of an asynchronous request and the tricky
+ * deadlock (see Documentation/filesystems/fuse.txt).
+ *
+ * During the aborting, progression of requests from the pending and
+ * processing lists onto the io list, and progression of new requests
+ * onto the pending list is prevented by req->connected being false.
+ *
+ * Progression of requests under I/O to the processing list is
+ * prevented by the req->aborted flag being true for these requests.
+ * For this reason requests on the io list must be aborted first.
+ */
+void fuse_abort_conn(struct fuse_conn *fc)
+{
+	spin_lock(&fc->lock);
+	if (fc->connected) {
+		fc->connected = 0;
+		fc->blocked = 0;
+		fuse_set_initialized(fc);
+		end_io_requests(fc);
+		end_queued_requests(fc);
+		end_polls(fc);
+		wake_up_all(&fc->waitq);
+		wake_up_all(&fc->blocked_waitq);
+		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+	}
+	spin_unlock(&fc->lock);
+}
+EXPORT_SYMBOL_GPL(fuse_abort_conn);
+
+int fuse_dev_release(struct inode *inode, struct file *file)
+{
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (fc) {
+		spin_lock(&fc->lock);
+		fc->connected = 0;
+		fc->blocked = 0;
+		fuse_set_initialized(fc);
+		end_queued_requests(fc);
+		end_polls(fc);
+		wake_up_all(&fc->blocked_waitq);
+		spin_unlock(&fc->lock);
+		fuse_conn_put(fc);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_release);
+
+static int fuse_dev_fasync(int fd, struct file *file, int on)
+{
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -EPERM;
+
+	/* No locking - fasync_helper does its own locking */
+	return fasync_helper(fd, file, on, &fc->fasync);
+}
+
+const struct file_operations fuse_dev_operations = {
+	.owner		= THIS_MODULE,
+	.open		= fuse_dev_open,
+	.llseek		= no_llseek,
+	.read_iter	= fuse_dev_read,
+	.splice_read	= fuse_dev_splice_read,
+	.write_iter	= fuse_dev_write,
+	.splice_write	= fuse_dev_splice_write,
+	.poll		= fuse_dev_poll,
+	.release	= fuse_dev_release,
+	.fasync		= fuse_dev_fasync,
+};
+EXPORT_SYMBOL_GPL(fuse_dev_operations);
+
+static struct miscdevice fuse_miscdevice = {
+	.minor = FUSE_MINOR,
+	.name  = "fuse",
+	.fops = &fuse_dev_operations,
+};
+
+int __init fuse_dev_init(void)
+{
+	int err = -ENOMEM;
+	fuse_req_cachep = kmem_cache_create("fuse_request",
+					    sizeof(struct fuse_req),
+					    0, 0, NULL);
+	if (!fuse_req_cachep)
+		goto out;
+
+	err = misc_register(&fuse_miscdevice);
+	if (err)
+		goto out_cache_clean;
+
+	return 0;
+
+ out_cache_clean:
+	kmem_cache_destroy(fuse_req_cachep);
+ out:
+	return err;
+}
+
+void fuse_dev_cleanup(void)
+{
+	misc_deregister(&fuse_miscdevice);
+	kmem_cache_destroy(fuse_req_cachep);
+}
diff --git a/kernel/fs/fuse/dir.c b/kernel/fs/fuse/dir.c
new file mode 100644
index 000000000..0572bca49
--- /dev/null
+++ b/kernel/fs/fuse/dir.c
@@ -0,0 +1,1952 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_inode *fi = get_fuse_inode(dir);
+
+	if (!fc->do_readdirplus)
+		return false;
+	if (!fc->readdirplus_auto)
+		return true;
+	if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+		return true;
+	if (ctx->pos == 0)
+		return true;
+	return false;
+}
+
+static void fuse_advise_use_readdirplus(struct inode *dir)
+{
+	struct fuse_inode *fi = get_fuse_inode(dir);
+
+	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
+}
+
+#if BITS_PER_LONG >= 64
+static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+	entry->d_time = time;
+}
+
+static inline u64 fuse_dentry_time(struct dentry *entry)
+{
+	return entry->d_time;
+}
+#else
+/*
+ * On 32 bit archs store the high 32 bits of time in d_fsdata
+ */
+static void fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+	entry->d_time = time;
+	entry->d_fsdata = (void *) (unsigned long) (time >> 32);
+}
+
+static u64 fuse_dentry_time(struct dentry *entry)
+{
+	return (u64) entry->d_time +
+		((u64) (unsigned long) entry->d_fsdata << 32);
+}
+#endif
+
+/*
+ * FUSE caches dentries and attributes with separate timeout.  The
+ * time in jiffies until the dentry/attributes are valid is stored in
+ * dentry->d_time and fuse_inode->i_time respectively.
+ */
+
+/*
+ * Calculate the time in jiffies until a dentry/attributes are valid
+ */
+static u64 time_to_jiffies(unsigned long sec, unsigned long nsec)
+{
+	if (sec || nsec) {
+		struct timespec ts = {sec, nsec};
+		return get_jiffies_64() + timespec_to_jiffies(&ts);
+	} else
+		return 0;
+}
+
+/*
+ * Set dentry and possibly attribute timeouts from the lookup/mk*
+ * replies
+ */
+static void fuse_change_entry_timeout(struct dentry *entry,
+				      struct fuse_entry_out *o)
+{
+	fuse_dentry_settime(entry,
+		time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
+}
+
+static u64 attr_timeout(struct fuse_attr_out *o)
+{
+	return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+static u64 entry_attr_timeout(struct fuse_entry_out *o)
+{
+	return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+/*
+ * Mark the attributes as stale, so that at the next call to
+ * ->getattr() they will be fetched from userspace
+ */
+void fuse_invalidate_attr(struct inode *inode)
+{
+	get_fuse_inode(inode)->i_time = 0;
+}
+
+/**
+ * Mark the attributes as stale due to an atime change.  Avoid the invalidate if
+ * atime is not used.
+ */
+void fuse_invalidate_atime(struct inode *inode)
+{
+	if (!IS_RDONLY(inode))
+		fuse_invalidate_attr(inode);
+}
+
+/*
+ * Just mark the entry as stale, so that a next attempt to look it up
+ * will result in a new lookup call to userspace
+ *
+ * This is called when a dentry is about to become negative and the
+ * timeout is unknown (unlink, rmdir, rename and in some cases
+ * lookup)
+ */
+void fuse_invalidate_entry_cache(struct dentry *entry)
+{
+	fuse_dentry_settime(entry, 0);
+}
+
+/*
+ * Same as fuse_invalidate_entry_cache(), but also try to remove the
+ * dentry from the hash
+ */
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+	d_invalidate(entry);
+	fuse_invalidate_entry_cache(entry);
+}
+
+static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
+			     u64 nodeid, struct qstr *name,
+			     struct fuse_entry_out *outarg)
+{
+	memset(outarg, 0, sizeof(struct fuse_entry_out));
+	args->in.h.opcode = FUSE_LOOKUP;
+	args->in.h.nodeid = nodeid;
+	args->in.numargs = 1;
+	args->in.args[0].size = name->len + 1;
+	args->in.args[0].value = name->name;
+	args->out.numargs = 1;
+	args->out.args[0].size = sizeof(struct fuse_entry_out);
+	args->out.args[0].value = outarg;
+}
+
+u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+	u64 curr_version;
+
+	/*
+	 * The spin lock isn't actually needed on 64bit archs, but we
+	 * don't yet care too much about such optimizations.
+	 */
+	spin_lock(&fc->lock);
+	curr_version = fc->attr_version;
+	spin_unlock(&fc->lock);
+
+	return curr_version;
+}
+
+/*
+ * Check whether the dentry is still valid
+ *
+ * If the entry validity timeout has expired and the dentry is
+ * positive, try to redo the lookup.  If the lookup results in a
+ * different inode, then let the VFS invalidate the dentry and redo
+ * the lookup once more.  If the lookup results in the same inode,
+ * then refresh the attributes, timeouts and mark the dentry valid.
+ */
+static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
+{
+	struct inode *inode;
+	struct dentry *parent;
+	struct fuse_conn *fc;
+	struct fuse_inode *fi;
+	int ret;
+
+	inode = d_inode_rcu(entry);
+	if (inode && is_bad_inode(inode))
+		goto invalid;
+	else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
+		 (flags & LOOKUP_REVAL)) {
+		struct fuse_entry_out outarg;
+		FUSE_ARGS(args);
+		struct fuse_forget_link *forget;
+		u64 attr_version;
+
+		/* For negative dentries, always do a fresh lookup */
+		if (!inode)
+			goto invalid;
+
+		ret = -ECHILD;
+		if (flags & LOOKUP_RCU)
+			goto out;
+
+		fc = get_fuse_conn(inode);
+
+		forget = fuse_alloc_forget();
+		ret = -ENOMEM;
+		if (!forget)
+			goto out;
+
+		attr_version = fuse_get_attr_version(fc);
+
+		parent = dget_parent(entry);
+		fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)),
+				 &entry->d_name, &outarg);
+		ret = fuse_simple_request(fc, &args);
+		dput(parent);
+		/* Zero nodeid is same as -ENOENT */
+		if (!ret && !outarg.nodeid)
+			ret = -ENOENT;
+		if (!ret) {
+			fi = get_fuse_inode(inode);
+			if (outarg.nodeid != get_node_id(inode)) {
+				fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+				goto invalid;
+			}
+			spin_lock(&fc->lock);
+			fi->nlookup++;
+			spin_unlock(&fc->lock);
+		}
+		kfree(forget);
+		if (ret == -ENOMEM)
+			goto out;
+		if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+			goto invalid;
+
+		fuse_change_attributes(inode, &outarg.attr,
+				       entry_attr_timeout(&outarg),
+				       attr_version);
+		fuse_change_entry_timeout(entry, &outarg);
+	} else if (inode) {
+		fi = get_fuse_inode(inode);
+		if (flags & LOOKUP_RCU) {
+			if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state))
+				return -ECHILD;
+		} else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
+			parent = dget_parent(entry);
+			fuse_advise_use_readdirplus(d_inode(parent));
+			dput(parent);
+		}
+	}
+	ret = 1;
+out:
+	return ret;
+
+invalid:
+	ret = 0;
+	goto out;
+}
+
+static int invalid_nodeid(u64 nodeid)
+{
+	return !nodeid || nodeid == FUSE_ROOT_ID;
+}
+
+const struct dentry_operations fuse_dentry_operations = {
+	.d_revalidate	= fuse_dentry_revalidate,
+};
+
+int fuse_valid_type(int m)
+{
+	return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
+		S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
+}
+
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+		     struct fuse_entry_out *outarg, struct inode **inode)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	FUSE_ARGS(args);
+	struct fuse_forget_link *forget;
+	u64 attr_version;
+	int err;
+
+	*inode = NULL;
+	err = -ENAMETOOLONG;
+	if (name->len > FUSE_NAME_MAX)
+		goto out;
+
+
+	forget = fuse_alloc_forget();
+	err = -ENOMEM;
+	if (!forget)
+		goto out;
+
+	attr_version = fuse_get_attr_version(fc);
+
+	fuse_lookup_init(fc, &args, nodeid, name, outarg);
+	err = fuse_simple_request(fc, &args);
+	/* Zero nodeid is same as -ENOENT, but with valid timeout */
+	if (err || !outarg->nodeid)
+		goto out_put_forget;
+
+	err = -EIO;
+	if (!outarg->nodeid)
+		goto out_put_forget;
+	if (!fuse_valid_type(outarg->attr.mode))
+		goto out_put_forget;
+
+	*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
+			   &outarg->attr, entry_attr_timeout(outarg),
+			   attr_version);
+	err = -ENOMEM;
+	if (!*inode) {
+		fuse_queue_forget(fc, forget, outarg->nodeid, 1);
+		goto out;
+	}
+	err = 0;
+
+ out_put_forget:
+	kfree(forget);
+ out:
+	return err;
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+				  unsigned int flags)
+{
+	int err;
+	struct fuse_entry_out outarg;
+	struct inode *inode;
+	struct dentry *newent;
+	bool outarg_valid = true;
+
+	err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
+			       &outarg, &inode);
+	if (err == -ENOENT) {
+		outarg_valid = false;
+		err = 0;
+	}
+	if (err)
+		goto out_err;
+
+	err = -EIO;
+	if (inode && get_node_id(inode) == FUSE_ROOT_ID)
+		goto out_iput;
+
+	newent = d_splice_alias(inode, entry);
+	err = PTR_ERR(newent);
+	if (IS_ERR(newent))
+		goto out_err;
+
+	entry = newent ? newent : entry;
+	if (outarg_valid)
+		fuse_change_entry_timeout(entry, &outarg);
+	else
+		fuse_invalidate_entry_cache(entry);
+
+	fuse_advise_use_readdirplus(dir);
+	return newent;
+
+ out_iput:
+	iput(inode);
+ out_err:
+	return ERR_PTR(err);
+}
+
+/*
+ * Atomic create+open operation
+ *
+ * If the filesystem doesn't support this, then fall back to separate
+ * 'mknod' + 'open' requests.
+ */
+static int fuse_create_open(struct inode *dir, struct dentry *entry,
+			    struct file *file, unsigned flags,
+			    umode_t mode, int *opened)
+{
+	int err;
+	struct inode *inode;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	FUSE_ARGS(args);
+	struct fuse_forget_link *forget;
+	struct fuse_create_in inarg;
+	struct fuse_open_out outopen;
+	struct fuse_entry_out outentry;
+	struct fuse_file *ff;
+
+	/* Userspace expects S_IFREG in create mode */
+	BUG_ON((mode & S_IFMT) != S_IFREG);
+
+	forget = fuse_alloc_forget();
+	err = -ENOMEM;
+	if (!forget)
+		goto out_err;
+
+	err = -ENOMEM;
+	ff = fuse_file_alloc(fc);
+	if (!ff)
+		goto out_put_forget_req;
+
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
+	flags &= ~O_NOCTTY;
+	memset(&inarg, 0, sizeof(inarg));
+	memset(&outentry, 0, sizeof(outentry));
+	inarg.flags = flags;
+	inarg.mode = mode;
+	inarg.umask = current_umask();
+	args.in.h.opcode = FUSE_CREATE;
+	args.in.h.nodeid = get_node_id(dir);
+	args.in.numargs = 2;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = entry->d_name.len + 1;
+	args.in.args[1].value = entry->d_name.name;
+	args.out.numargs = 2;
+	args.out.args[0].size = sizeof(outentry);
+	args.out.args[0].value = &outentry;
+	args.out.args[1].size = sizeof(outopen);
+	args.out.args[1].value = &outopen;
+	err = fuse_simple_request(fc, &args);
+	if (err)
+		goto out_free_ff;
+
+	err = -EIO;
+	if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
+		goto out_free_ff;
+
+	ff->fh = outopen.fh;
+	ff->nodeid = outentry.nodeid;
+	ff->open_flags = outopen.open_flags;
+	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
+			  &outentry.attr, entry_attr_timeout(&outentry), 0);
+	if (!inode) {
+		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
+		fuse_sync_release(ff, flags);
+		fuse_queue_forget(fc, forget, outentry.nodeid, 1);
+		err = -ENOMEM;
+		goto out_err;
+	}
+	kfree(forget);
+	d_instantiate(entry, inode);
+	fuse_change_entry_timeout(entry, &outentry);
+	fuse_invalidate_attr(dir);
+	err = finish_open(file, entry, generic_file_open, opened);
+	if (err) {
+		fuse_sync_release(ff, flags);
+	} else {
+		file->private_data = fuse_file_get(ff);
+		fuse_finish_open(inode, file);
+	}
+	return err;
+
+out_free_ff:
+	fuse_file_free(ff);
+out_put_forget_req:
+	kfree(forget);
+out_err:
+	return err;
+}
+
+static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
+			    struct file *file, unsigned flags,
+			    umode_t mode, int *opened)
+{
+	int err;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct dentry *res = NULL;
+
+	if (d_unhashed(entry)) {
+		res = fuse_lookup(dir, entry, 0);
+		if (IS_ERR(res))
+			return PTR_ERR(res);
+
+		if (res)
+			entry = res;
+	}
+
+	if (!(flags & O_CREAT) || d_really_is_positive(entry))
+		goto no_open;
+
+	/* Only creates */
+	*opened |= FILE_CREATED;
+
+	if (fc->no_create)
+		goto mknod;
+
+	err = fuse_create_open(dir, entry, file, flags, mode, opened);
+	if (err == -ENOSYS) {
+		fc->no_create = 1;
+		goto mknod;
+	}
+out_dput:
+	dput(res);
+	return err;
+
+mknod:
+	err = fuse_mknod(dir, entry, mode, 0);
+	if (err)
+		goto out_dput;
+no_open:
+	return finish_no_open(file, res);
+}
+
+/*
+ * Code shared between mknod, mkdir, symlink and link
+ */
+static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
+			    struct inode *dir, struct dentry *entry,
+			    umode_t mode)
+{
+	struct fuse_entry_out outarg;
+	struct inode *inode;
+	int err;
+	struct fuse_forget_link *forget;
+
+	forget = fuse_alloc_forget();
+	if (!forget)
+		return -ENOMEM;
+
+	memset(&outarg, 0, sizeof(outarg));
+	args->in.h.nodeid = get_node_id(dir);
+	args->out.numargs = 1;
+	args->out.args[0].size = sizeof(outarg);
+	args->out.args[0].value = &outarg;
+	err = fuse_simple_request(fc, args);
+	if (err)
+		goto out_put_forget_req;
+
+	err = -EIO;
+	if (invalid_nodeid(outarg.nodeid))
+		goto out_put_forget_req;
+
+	if ((outarg.attr.mode ^ mode) & S_IFMT)
+		goto out_put_forget_req;
+
+	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+			  &outarg.attr, entry_attr_timeout(&outarg), 0);
+	if (!inode) {
+		fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+		return -ENOMEM;
+	}
+	kfree(forget);
+
+	err = d_instantiate_no_diralias(entry, inode);
+	if (err)
+		return err;
+
+	fuse_change_entry_timeout(entry, &outarg);
+	fuse_invalidate_attr(dir);
+	return 0;
+
+ out_put_forget_req:
+	kfree(forget);
+	return err;
+}
+
+static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
+		      dev_t rdev)
+{
+	struct fuse_mknod_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	FUSE_ARGS(args);
+
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.mode = mode;
+	inarg.rdev = new_encode_dev(rdev);
+	inarg.umask = current_umask();
+	args.in.h.opcode = FUSE_MKNOD;
+	args.in.numargs = 2;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = entry->d_name.len + 1;
+	args.in.args[1].value = entry->d_name.name;
+	return create_new_entry(fc, &args, dir, entry, mode);
+}
+
+static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
+		       bool excl)
+{
+	return fuse_mknod(dir, entry, mode, 0);
+}
+
+static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
+{
+	struct fuse_mkdir_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	FUSE_ARGS(args);
+
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.mode = mode;
+	inarg.umask = current_umask();
+	args.in.h.opcode = FUSE_MKDIR;
+	args.in.numargs = 2;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = entry->d_name.len + 1;
+	args.in.args[1].value = entry->d_name.name;
+	return create_new_entry(fc, &args, dir, entry, S_IFDIR);
+}
+
+static int fuse_symlink(struct inode *dir, struct dentry *entry,
+			const char *link)
+{
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	unsigned len = strlen(link) + 1;
+	FUSE_ARGS(args);
+
+	args.in.h.opcode = FUSE_SYMLINK;
+	args.in.numargs = 2;
+	args.in.args[0].size = entry->d_name.len + 1;
+	args.in.args[0].value = entry->d_name.name;
+	args.in.args[1].size = len;
+	args.in.args[1].value = link;
+	return create_new_entry(fc, &args, dir, entry, S_IFLNK);
+}
+
+static inline void fuse_update_ctime(struct inode *inode)
+{
+	if (!IS_NOCMTIME(inode)) {
+		inode->i_ctime = current_fs_time(inode->i_sb);
+		mark_inode_dirty_sync(inode);
+	}
+}
+
+static int fuse_unlink(struct inode *dir, struct dentry *entry)
+{
+	int err;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	FUSE_ARGS(args);
+
+	args.in.h.opcode = FUSE_UNLINK;
+	args.in.h.nodeid = get_node_id(dir);
+	args.in.numargs = 1;
+	args.in.args[0].size = entry->d_name.len + 1;
+	args.in.args[0].value = entry->d_name.name;
+	err = fuse_simple_request(fc, &args);
+	if (!err) {
+		struct inode *inode = d_inode(entry);
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		spin_lock(&fc->lock);
+		fi->attr_version = ++fc->attr_version;
+		/*
+		 * If i_nlink == 0 then unlink doesn't make sense, yet this can
+		 * happen if userspace filesystem is careless.  It would be
+		 * difficult to enforce correct nlink usage so just ignore this
+		 * condition here
+		 */
+		if (inode->i_nlink > 0)
+			drop_nlink(inode);
+		spin_unlock(&fc->lock);
+		fuse_invalidate_attr(inode);
+		fuse_invalidate_attr(dir);
+		fuse_invalidate_entry_cache(entry);
+		fuse_update_ctime(inode);
+	} else if (err == -EINTR)
+		fuse_invalidate_entry(entry);
+	return err;
+}
+
+static int fuse_rmdir(struct inode *dir, struct dentry *entry)
+{
+	int err;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	FUSE_ARGS(args);
+
+	args.in.h.opcode = FUSE_RMDIR;
+	args.in.h.nodeid = get_node_id(dir);
+	args.in.numargs = 1;
+	args.in.args[0].size = entry->d_name.len + 1;
+	args.in.args[0].value = entry->d_name.name;
+	err = fuse_simple_request(fc, &args);
+	if (!err) {
+		clear_nlink(d_inode(entry));
+		fuse_invalidate_attr(dir);
+		fuse_invalidate_entry_cache(entry);
+	} else if (err == -EINTR)
+		fuse_invalidate_entry(entry);
+	return err;
+}
+
+static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+			      struct inode *newdir, struct dentry *newent,
+			      unsigned int flags, int opcode, size_t argsize)
+{
+	int err;
+	struct fuse_rename2_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(olddir);
+	FUSE_ARGS(args);
+
+	memset(&inarg, 0, argsize);
+	inarg.newdir = get_node_id(newdir);
+	inarg.flags = flags;
+	args.in.h.opcode = opcode;
+	args.in.h.nodeid = get_node_id(olddir);
+	args.in.numargs = 3;
+	args.in.args[0].size = argsize;
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = oldent->d_name.len + 1;
+	args.in.args[1].value = oldent->d_name.name;
+	args.in.args[2].size = newent->d_name.len + 1;
+	args.in.args[2].value = newent->d_name.name;
+	err = fuse_simple_request(fc, &args);
+	if (!err) {
+		/* ctime changes */
+		fuse_invalidate_attr(d_inode(oldent));
+		fuse_update_ctime(d_inode(oldent));
+
+		if (flags & RENAME_EXCHANGE) {
+			fuse_invalidate_attr(d_inode(newent));
+			fuse_update_ctime(d_inode(newent));
+		}
+
+		fuse_invalidate_attr(olddir);
+		if (olddir != newdir)
+			fuse_invalidate_attr(newdir);
+
+		/* newent will end up negative */
+		if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
+			fuse_invalidate_attr(d_inode(newent));
+			fuse_invalidate_entry_cache(newent);
+			fuse_update_ctime(d_inode(newent));
+		}
+	} else if (err == -EINTR) {
+		/* If request was interrupted, DEITY only knows if the
+		   rename actually took place.  If the invalidation
+		   fails (e.g. some process has CWD under the renamed
+		   directory), then there can be inconsistency between
+		   the dcache and the real filesystem.  Tough luck. */
+		fuse_invalidate_entry(oldent);
+		if (d_really_is_positive(newent))
+			fuse_invalidate_entry(newent);
+	}
+
+	return err;
+}
+
+static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
+			struct inode *newdir, struct dentry *newent,
+			unsigned int flags)
+{
+	struct fuse_conn *fc = get_fuse_conn(olddir);
+	int err;
+
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if (flags) {
+		if (fc->no_rename2 || fc->minor < 23)
+			return -EINVAL;
+
+		err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+					 FUSE_RENAME2,
+					 sizeof(struct fuse_rename2_in));
+		if (err == -ENOSYS) {
+			fc->no_rename2 = 1;
+			err = -EINVAL;
+		}
+	} else {
+		err = fuse_rename_common(olddir, oldent, newdir, newent, 0,
+					 FUSE_RENAME,
+					 sizeof(struct fuse_rename_in));
+	}
+
+	return err;
+}
+
+static int fuse_link(struct dentry *entry, struct inode *newdir,
+		     struct dentry *newent)
+{
+	int err;
+	struct fuse_link_in inarg;
+	struct inode *inode = d_inode(entry);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.oldnodeid = get_node_id(inode);
+	args.in.h.opcode = FUSE_LINK;
+	args.in.numargs = 2;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = newent->d_name.len + 1;
+	args.in.args[1].value = newent->d_name.name;
+	err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
+	/* Contrary to "normal" filesystems it can happen that link
+	   makes two "logical" inodes point to the same "physical"
+	   inode.  We invalidate the attributes of the old one, so it
+	   will reflect changes in the backing inode (link count,
+	   etc.)
+	*/
+	if (!err) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		spin_lock(&fc->lock);
+		fi->attr_version = ++fc->attr_version;
+		inc_nlink(inode);
+		spin_unlock(&fc->lock);
+		fuse_invalidate_attr(inode);
+		fuse_update_ctime(inode);
+	} else if (err == -EINTR) {
+		fuse_invalidate_attr(inode);
+	}
+	return err;
+}
+
+static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
+			  struct kstat *stat)
+{
+	unsigned int blkbits;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	/* see the comment in fuse_change_attributes() */
+	if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
+		attr->size = i_size_read(inode);
+		attr->mtime = inode->i_mtime.tv_sec;
+		attr->mtimensec = inode->i_mtime.tv_nsec;
+		attr->ctime = inode->i_ctime.tv_sec;
+		attr->ctimensec = inode->i_ctime.tv_nsec;
+	}
+
+	stat->dev = inode->i_sb->s_dev;
+	stat->ino = attr->ino;
+	stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
+	stat->nlink = attr->nlink;
+	stat->uid = make_kuid(&init_user_ns, attr->uid);
+	stat->gid = make_kgid(&init_user_ns, attr->gid);
+	stat->rdev = inode->i_rdev;
+	stat->atime.tv_sec = attr->atime;
+	stat->atime.tv_nsec = attr->atimensec;
+	stat->mtime.tv_sec = attr->mtime;
+	stat->mtime.tv_nsec = attr->mtimensec;
+	stat->ctime.tv_sec = attr->ctime;
+	stat->ctime.tv_nsec = attr->ctimensec;
+	stat->size = attr->size;
+	stat->blocks = attr->blocks;
+
+	if (attr->blksize != 0)
+		blkbits = ilog2(attr->blksize);
+	else
+		blkbits = inode->i_sb->s_blocksize_bits;
+
+	stat->blksize = 1 << blkbits;
+}
+
+static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
+			   struct file *file)
+{
+	int err;
+	struct fuse_getattr_in inarg;
+	struct fuse_attr_out outarg;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	u64 attr_version;
+
+	attr_version = fuse_get_attr_version(fc);
+
+	memset(&inarg, 0, sizeof(inarg));
+	memset(&outarg, 0, sizeof(outarg));
+	/* Directories have separate file-handle space */
+	if (file && S_ISREG(inode->i_mode)) {
+		struct fuse_file *ff = file->private_data;
+
+		inarg.getattr_flags |= FUSE_GETATTR_FH;
+		inarg.fh = ff->fh;
+	}
+	args.in.h.opcode = FUSE_GETATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.out.numargs = 1;
+	args.out.args[0].size = sizeof(outarg);
+	args.out.args[0].value = &outarg;
+	err = fuse_simple_request(fc, &args);
+	if (!err) {
+		if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+			make_bad_inode(inode);
+			err = -EIO;
+		} else {
+			fuse_change_attributes(inode, &outarg.attr,
+					       attr_timeout(&outarg),
+					       attr_version);
+			if (stat)
+				fuse_fillattr(inode, &outarg.attr, stat);
+		}
+	}
+	return err;
+}
+
+int fuse_update_attributes(struct inode *inode, struct kstat *stat,
+			   struct file *file, bool *refreshed)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	int err;
+	bool r;
+
+	if (time_before64(fi->i_time, get_jiffies_64())) {
+		r = true;
+		err = fuse_do_getattr(inode, stat, file);
+	} else {
+		r = false;
+		err = 0;
+		if (stat) {
+			generic_fillattr(inode, stat);
+			stat->mode = fi->orig_i_mode;
+			stat->ino = fi->orig_ino;
+		}
+	}
+
+	if (refreshed != NULL)
+		*refreshed = r;
+
+	return err;
+}
+
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+			     u64 child_nodeid, struct qstr *name)
+{
+	int err = -ENOTDIR;
+	struct inode *parent;
+	struct dentry *dir;
+	struct dentry *entry;
+
+	parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+	if (!parent)
+		return -ENOENT;
+
+	mutex_lock(&parent->i_mutex);
+	if (!S_ISDIR(parent->i_mode))
+		goto unlock;
+
+	err = -ENOENT;
+	dir = d_find_alias(parent);
+	if (!dir)
+		goto unlock;
+
+	entry = d_lookup(dir, name);
+	dput(dir);
+	if (!entry)
+		goto unlock;
+
+	fuse_invalidate_attr(parent);
+	fuse_invalidate_entry(entry);
+
+	if (child_nodeid != 0 && d_really_is_positive(entry)) {
+		mutex_lock(&d_inode(entry)->i_mutex);
+		if (get_node_id(d_inode(entry)) != child_nodeid) {
+			err = -ENOENT;
+			goto badentry;
+		}
+		if (d_mountpoint(entry)) {
+			err = -EBUSY;
+			goto badentry;
+		}
+		if (d_is_dir(entry)) {
+			shrink_dcache_parent(entry);
+			if (!simple_empty(entry)) {
+				err = -ENOTEMPTY;
+				goto badentry;
+			}
+			d_inode(entry)->i_flags |= S_DEAD;
+		}
+		dont_mount(entry);
+		clear_nlink(d_inode(entry));
+		err = 0;
+ badentry:
+		mutex_unlock(&d_inode(entry)->i_mutex);
+		if (!err)
+			d_delete(entry);
+	} else {
+		err = 0;
+	}
+	dput(entry);
+
+ unlock:
+	mutex_unlock(&parent->i_mutex);
+	iput(parent);
+	return err;
+}
+
+/*
+ * Calling into a user-controlled filesystem gives the filesystem
+ * daemon ptrace-like capabilities over the current process.  This
+ * means, that the filesystem daemon is able to record the exact
+ * filesystem operations performed, and can also control the behavior
+ * of the requester process in otherwise impossible ways.  For example
+ * it can delay the operation for arbitrary length of time allowing
+ * DoS against the requester.
+ *
+ * For this reason only those processes can call into the filesystem,
+ * for which the owner of the mount has ptrace privilege.  This
+ * excludes processes started by other users, suid or sgid processes.
+ */
+int fuse_allow_current_process(struct fuse_conn *fc)
+{
+	const struct cred *cred;
+
+	if (fc->flags & FUSE_ALLOW_OTHER)
+		return 1;
+
+	cred = current_cred();
+	if (uid_eq(cred->euid, fc->user_id) &&
+	    uid_eq(cred->suid, fc->user_id) &&
+	    uid_eq(cred->uid,  fc->user_id) &&
+	    gid_eq(cred->egid, fc->group_id) &&
+	    gid_eq(cred->sgid, fc->group_id) &&
+	    gid_eq(cred->gid,  fc->group_id))
+		return 1;
+
+	return 0;
+}
+
+static int fuse_access(struct inode *inode, int mask)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_access_in inarg;
+	int err;
+
+	BUG_ON(mask & MAY_NOT_BLOCK);
+
+	if (fc->no_access)
+		return 0;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
+	args.in.h.opcode = FUSE_ACCESS;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {
+		fc->no_access = 1;
+		err = 0;
+	}
+	return err;
+}
+
+static int fuse_perm_getattr(struct inode *inode, int mask)
+{
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	return fuse_do_getattr(inode, NULL, NULL);
+}
+
+/*
+ * Check permission.  The two basic access models of FUSE are:
+ *
+ * 1) Local access checking ('default_permissions' mount option) based
+ * on file mode.  This is the plain old disk filesystem permission
+ * modell.
+ *
+ * 2) "Remote" access checking, where server is responsible for
+ * checking permission in each inode operation.  An exception to this
+ * is if ->permission() was invoked from sys_access() in which case an
+ * access request is sent.  Execute permission is still checked
+ * locally based on file mode.
+ */
+static int fuse_permission(struct inode *inode, int mask)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	bool refreshed = false;
+	int err = 0;
+
+	if (!fuse_allow_current_process(fc))
+		return -EACCES;
+
+	/*
+	 * If attributes are needed, refresh them before proceeding
+	 */
+	if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
+	    ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		if (time_before64(fi->i_time, get_jiffies_64())) {
+			refreshed = true;
+
+			err = fuse_perm_getattr(inode, mask);
+			if (err)
+				return err;
+		}
+	}
+
+	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+		err = generic_permission(inode, mask);
+
+		/* If permission is denied, try to refresh file
+		   attributes.  This is also needed, because the root
+		   node will at first have no permissions */
+		if (err == -EACCES && !refreshed) {
+			err = fuse_perm_getattr(inode, mask);
+			if (!err)
+				err = generic_permission(inode, mask);
+		}
+
+		/* Note: the opposite of the above test does not
+		   exist.  So if permissions are revoked this won't be
+		   noticed immediately, only after the attribute
+		   timeout has expired */
+	} else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
+		err = fuse_access(inode, mask);
+	} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
+		if (!(inode->i_mode & S_IXUGO)) {
+			if (refreshed)
+				return -EACCES;
+
+			err = fuse_perm_getattr(inode, mask);
+			if (!err && !(inode->i_mode & S_IXUGO))
+				return -EACCES;
+		}
+	}
+	return err;
+}
+
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+			 struct dir_context *ctx)
+{
+	while (nbytes >= FUSE_NAME_OFFSET) {
+		struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+		size_t reclen = FUSE_DIRENT_SIZE(dirent);
+		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+			return -EIO;
+		if (reclen > nbytes)
+			break;
+		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+			return -EIO;
+
+		if (!dir_emit(ctx, dirent->name, dirent->namelen,
+			       dirent->ino, dirent->type))
+			break;
+
+		buf += reclen;
+		nbytes -= reclen;
+		ctx->pos = dirent->off;
+	}
+
+	return 0;
+}
+
+static int fuse_direntplus_link(struct file *file,
+				struct fuse_direntplus *direntplus,
+				u64 attr_version)
+{
+	int err;
+	struct fuse_entry_out *o = &direntplus->entry_out;
+	struct fuse_dirent *dirent = &direntplus->dirent;
+	struct dentry *parent = file->f_path.dentry;
+	struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+	struct dentry *dentry;
+	struct dentry *alias;
+	struct inode *dir = d_inode(parent);
+	struct fuse_conn *fc;
+	struct inode *inode;
+
+	if (!o->nodeid) {
+		/*
+		 * Unlike in the case of fuse_lookup, zero nodeid does not mean
+		 * ENOENT. Instead, it only means the userspace filesystem did
+		 * not want to return attributes/handle for this entry.
+		 *
+		 * So do nothing.
+		 */
+		return 0;
+	}
+
+	if (name.name[0] == '.') {
+		/*
+		 * We could potentially refresh the attributes of the directory
+		 * and its parent?
+		 */
+		if (name.len == 1)
+			return 0;
+		if (name.name[1] == '.' && name.len == 2)
+			return 0;
+	}
+
+	if (invalid_nodeid(o->nodeid))
+		return -EIO;
+	if (!fuse_valid_type(o->attr.mode))
+		return -EIO;
+
+	fc = get_fuse_conn(dir);
+
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_lookup(parent, &name);
+	if (dentry) {
+		inode = d_inode(dentry);
+		if (!inode) {
+			d_drop(dentry);
+		} else if (get_node_id(inode) != o->nodeid ||
+			   ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+			d_invalidate(dentry);
+		} else if (is_bad_inode(inode)) {
+			err = -EIO;
+			goto out;
+		} else {
+			struct fuse_inode *fi;
+			fi = get_fuse_inode(inode);
+			spin_lock(&fc->lock);
+			fi->nlookup++;
+			spin_unlock(&fc->lock);
+
+			fuse_change_attributes(inode, &o->attr,
+					       entry_attr_timeout(o),
+					       attr_version);
+
+			/*
+			 * The other branch to 'found' comes via fuse_iget()
+			 * which bumps nlookup inside
+			 */
+			goto found;
+		}
+		dput(dentry);
+	}
+
+	dentry = d_alloc(parent, &name);
+	err = -ENOMEM;
+	if (!dentry)
+		goto out;
+
+	inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+			  &o->attr, entry_attr_timeout(o), attr_version);
+	if (!inode)
+		goto out;
+
+	alias = d_splice_alias(inode, dentry);
+	err = PTR_ERR(alias);
+	if (IS_ERR(alias))
+		goto out;
+
+	if (alias) {
+		dput(dentry);
+		dentry = alias;
+	}
+
+found:
+	if (fc->readdirplus_auto)
+		set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+	fuse_change_entry_timeout(dentry, o);
+
+	err = 0;
+out:
+	dput(dentry);
+	return err;
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+			     struct dir_context *ctx, u64 attr_version)
+{
+	struct fuse_direntplus *direntplus;
+	struct fuse_dirent *dirent;
+	size_t reclen;
+	int over = 0;
+	int ret;
+
+	while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+		direntplus = (struct fuse_direntplus *) buf;
+		dirent = &direntplus->dirent;
+		reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+			return -EIO;
+		if (reclen > nbytes)
+			break;
+		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+			return -EIO;
+
+		if (!over) {
+			/* We fill entries into dstbuf only as much as
+			   it can hold. But we still continue iterating
+			   over remaining entries to link them. If not,
+			   we need to send a FORGET for each of those
+			   which we did not link.
+			*/
+			over = !dir_emit(ctx, dirent->name, dirent->namelen,
+				       dirent->ino, dirent->type);
+			ctx->pos = dirent->off;
+		}
+
+		buf += reclen;
+		nbytes -= reclen;
+
+		ret = fuse_direntplus_link(file, direntplus, attr_version);
+		if (ret)
+			fuse_force_forget(file, direntplus->entry_out.nodeid);
+	}
+
+	return 0;
+}
+
+static int fuse_readdir(struct file *file, struct dir_context *ctx)
+{
+	int plus, err;
+	size_t nbytes;
+	struct page *page;
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	u64 attr_version = 0;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	req = fuse_get_req(fc, 1);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		fuse_put_request(fc, req);
+		return -ENOMEM;
+	}
+
+	plus = fuse_use_readdirplus(inode, ctx);
+	req->out.argpages = 1;
+	req->num_pages = 1;
+	req->pages[0] = page;
+	req->page_descs[0].length = PAGE_SIZE;
+	if (plus) {
+		attr_version = fuse_get_attr_version(fc);
+		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+			       FUSE_READDIRPLUS);
+	} else {
+		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+			       FUSE_READDIR);
+	}
+	fuse_request_send(fc, req);
+	nbytes = req->out.args[0].size;
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		if (plus) {
+			err = parse_dirplusfile(page_address(page), nbytes,
+						file, ctx,
+						attr_version);
+		} else {
+			err = parse_dirfile(page_address(page), nbytes, file,
+					    ctx);
+		}
+	}
+
+	__free_page(page);
+	fuse_invalidate_atime(inode);
+	return err;
+}
+
+static char *read_link(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	char *link;
+	ssize_t ret;
+
+	link = (char *) __get_free_page(GFP_KERNEL);
+	if (!link)
+		return ERR_PTR(-ENOMEM);
+
+	args.in.h.opcode = FUSE_READLINK;
+	args.in.h.nodeid = get_node_id(inode);
+	args.out.argvar = 1;
+	args.out.numargs = 1;
+	args.out.args[0].size = PAGE_SIZE - 1;
+	args.out.args[0].value = link;
+	ret = fuse_simple_request(fc, &args);
+	if (ret < 0) {
+		free_page((unsigned long) link);
+		link = ERR_PTR(ret);
+	} else {
+		link[ret] = '\0';
+	}
+	fuse_invalidate_atime(inode);
+	return link;
+}
+
+static void free_link(char *link)
+{
+	if (!IS_ERR(link))
+		free_page((unsigned long) link);
+}
+
+static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	nd_set_link(nd, read_link(dentry));
+	return NULL;
+}
+
+static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+	free_link(nd_get_link(nd));
+}
+
+static int fuse_dir_open(struct inode *inode, struct file *file)
+{
+	return fuse_open_common(inode, file, true);
+}
+
+static int fuse_dir_release(struct inode *inode, struct file *file)
+{
+	fuse_release_common(file, FUSE_RELEASEDIR);
+
+	return 0;
+}
+
+static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync)
+{
+	return fuse_fsync_common(file, start, end, datasync, 1);
+}
+
+static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+	/* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
+	if (fc->minor < 18)
+		return -ENOTTY;
+
+	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
+}
+
+static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+	if (fc->minor < 18)
+		return -ENOTTY;
+
+	return fuse_ioctl_common(file, cmd, arg,
+				 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
+}
+
+static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
+{
+	/* Always update if mtime is explicitly set  */
+	if (ivalid & ATTR_MTIME_SET)
+		return true;
+
+	/* Or if kernel i_mtime is the official one */
+	if (trust_local_mtime)
+		return true;
+
+	/* If it's an open(O_TRUNC) or an ftruncate(), don't update */
+	if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
+		return false;
+
+	/* In all other cases update */
+	return true;
+}
+
+static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
+			   bool trust_local_cmtime)
+{
+	unsigned ivalid = iattr->ia_valid;
+
+	if (ivalid & ATTR_MODE)
+		arg->valid |= FATTR_MODE,   arg->mode = iattr->ia_mode;
+	if (ivalid & ATTR_UID)
+		arg->valid |= FATTR_UID,    arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
+	if (ivalid & ATTR_GID)
+		arg->valid |= FATTR_GID,    arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
+	if (ivalid & ATTR_SIZE)
+		arg->valid |= FATTR_SIZE,   arg->size = iattr->ia_size;
+	if (ivalid & ATTR_ATIME) {
+		arg->valid |= FATTR_ATIME;
+		arg->atime = iattr->ia_atime.tv_sec;
+		arg->atimensec = iattr->ia_atime.tv_nsec;
+		if (!(ivalid & ATTR_ATIME_SET))
+			arg->valid |= FATTR_ATIME_NOW;
+	}
+	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
+		arg->valid |= FATTR_MTIME;
+		arg->mtime = iattr->ia_mtime.tv_sec;
+		arg->mtimensec = iattr->ia_mtime.tv_nsec;
+		if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
+			arg->valid |= FATTR_MTIME_NOW;
+	}
+	if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
+		arg->valid |= FATTR_CTIME;
+		arg->ctime = iattr->ia_ctime.tv_sec;
+		arg->ctimensec = iattr->ia_ctime.tv_nsec;
+	}
+}
+
+/*
+ * Prevent concurrent writepages on inode
+ *
+ * This is done by adding a negative bias to the inode write counter
+ * and waiting for all pending writes to finish.
+ */
+void fuse_set_nowrite(struct inode *inode)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	BUG_ON(!mutex_is_locked(&inode->i_mutex));
+
+	spin_lock(&fc->lock);
+	BUG_ON(fi->writectr < 0);
+	fi->writectr += FUSE_NOWRITE;
+	spin_unlock(&fc->lock);
+	wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
+}
+
+/*
+ * Allow writepages on inode
+ *
+ * Remove the bias from the writecounter and send any queued
+ * writepages.
+ */
+static void __fuse_release_nowrite(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	BUG_ON(fi->writectr != FUSE_NOWRITE);
+	fi->writectr = 0;
+	fuse_flush_writepages(inode);
+}
+
+void fuse_release_nowrite(struct inode *inode)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	spin_lock(&fc->lock);
+	__fuse_release_nowrite(inode);
+	spin_unlock(&fc->lock);
+}
+
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
+			      struct inode *inode,
+			      struct fuse_setattr_in *inarg_p,
+			      struct fuse_attr_out *outarg_p)
+{
+	args->in.h.opcode = FUSE_SETATTR;
+	args->in.h.nodeid = get_node_id(inode);
+	args->in.numargs = 1;
+	args->in.args[0].size = sizeof(*inarg_p);
+	args->in.args[0].value = inarg_p;
+	args->out.numargs = 1;
+	args->out.args[0].size = sizeof(*outarg_p);
+	args->out.args[0].value = outarg_p;
+}
+
+/*
+ * Flush inode->i_mtime to the server
+ */
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_setattr_in inarg;
+	struct fuse_attr_out outarg;
+
+	memset(&inarg, 0, sizeof(inarg));
+	memset(&outarg, 0, sizeof(outarg));
+
+	inarg.valid = FATTR_MTIME;
+	inarg.mtime = inode->i_mtime.tv_sec;
+	inarg.mtimensec = inode->i_mtime.tv_nsec;
+	if (fc->minor >= 23) {
+		inarg.valid |= FATTR_CTIME;
+		inarg.ctime = inode->i_ctime.tv_sec;
+		inarg.ctimensec = inode->i_ctime.tv_nsec;
+	}
+	if (ff) {
+		inarg.valid |= FATTR_FH;
+		inarg.fh = ff->fh;
+	}
+	fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+
+	return fuse_simple_request(fc, &args);
+}
+
+/*
+ * Set attributes, and at the same time refresh them.
+ *
+ * Truncation is slightly complicated, because the 'truncate' request
+ * may fail, in which case we don't want to touch the mapping.
+ * vmtruncate() doesn't allow for this case, so do the rlimit checking
+ * and the actual truncation by hand.
+ */
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+		    struct file *file)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	FUSE_ARGS(args);
+	struct fuse_setattr_in inarg;
+	struct fuse_attr_out outarg;
+	bool is_truncate = false;
+	bool is_wb = fc->writeback_cache;
+	loff_t oldsize;
+	int err;
+	bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+
+	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+		attr->ia_valid |= ATTR_FORCE;
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_OPEN) {
+		if (fc->atomic_o_trunc)
+			return 0;
+		file = NULL;
+	}
+
+	if (attr->ia_valid & ATTR_SIZE)
+		is_truncate = true;
+
+	if (is_truncate) {
+		fuse_set_nowrite(inode);
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+		if (trust_local_cmtime && attr->ia_size != inode->i_size)
+			attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
+	}
+
+	memset(&inarg, 0, sizeof(inarg));
+	memset(&outarg, 0, sizeof(outarg));
+	iattr_to_fattr(attr, &inarg, trust_local_cmtime);
+	if (file) {
+		struct fuse_file *ff = file->private_data;
+		inarg.valid |= FATTR_FH;
+		inarg.fh = ff->fh;
+	}
+	if (attr->ia_valid & ATTR_SIZE) {
+		/* For mandatory locking in truncate */
+		inarg.valid |= FATTR_LOCKOWNER;
+		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
+	}
+	fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+	err = fuse_simple_request(fc, &args);
+	if (err) {
+		if (err == -EINTR)
+			fuse_invalidate_attr(inode);
+		goto error;
+	}
+
+	if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+		make_bad_inode(inode);
+		err = -EIO;
+		goto error;
+	}
+
+	spin_lock(&fc->lock);
+	/* the kernel maintains i_mtime locally */
+	if (trust_local_cmtime) {
+		if (attr->ia_valid & ATTR_MTIME)
+			inode->i_mtime = attr->ia_mtime;
+		if (attr->ia_valid & ATTR_CTIME)
+			inode->i_ctime = attr->ia_ctime;
+		/* FIXME: clear I_DIRTY_SYNC? */
+	}
+
+	fuse_change_attributes_common(inode, &outarg.attr,
+				      attr_timeout(&outarg));
+	oldsize = inode->i_size;
+	/* see the comment in fuse_change_attributes() */
+	if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+		i_size_write(inode, outarg.attr.size);
+
+	if (is_truncate) {
+		/* NOTE: this may release/reacquire fc->lock */
+		__fuse_release_nowrite(inode);
+	}
+	spin_unlock(&fc->lock);
+
+	/*
+	 * Only call invalidate_inode_pages2() after removing
+	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
+	 */
+	if ((is_truncate || !is_wb) &&
+	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+		truncate_pagecache(inode, outarg.attr.size);
+		invalidate_inode_pages2(inode->i_mapping);
+	}
+
+	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+	return 0;
+
+error:
+	if (is_truncate)
+		fuse_release_nowrite(inode);
+
+	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+	return err;
+}
+
+static int fuse_setattr(struct dentry *entry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(entry);
+
+	if (!fuse_allow_current_process(get_fuse_conn(inode)))
+		return -EACCES;
+
+	if (attr->ia_valid & ATTR_FILE)
+		return fuse_do_setattr(inode, attr, attr->ia_file);
+	else
+		return fuse_do_setattr(inode, attr, NULL);
+}
+
+static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
+			struct kstat *stat)
+{
+	struct inode *inode = d_inode(entry);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (!fuse_allow_current_process(fc))
+		return -EACCES;
+
+	return fuse_update_attributes(inode, stat, NULL, NULL);
+}
+
+static int fuse_setxattr(struct dentry *entry, const char *name,
+			 const void *value, size_t size, int flags)
+{
+	struct inode *inode = d_inode(entry);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_setxattr_in inarg;
+	int err;
+
+	if (fc->no_setxattr)
+		return -EOPNOTSUPP;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	inarg.flags = flags;
+	args.in.h.opcode = FUSE_SETXATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 3;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = strlen(name) + 1;
+	args.in.args[1].value = name;
+	args.in.args[2].size = size;
+	args.in.args[2].value = value;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {
+		fc->no_setxattr = 1;
+		err = -EOPNOTSUPP;
+	}
+	if (!err) {
+		fuse_invalidate_attr(inode);
+		fuse_update_ctime(inode);
+	}
+	return err;
+}
+
+static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
+			     void *value, size_t size)
+{
+	struct inode *inode = d_inode(entry);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_getxattr_in inarg;
+	struct fuse_getxattr_out outarg;
+	ssize_t ret;
+
+	if (fc->no_getxattr)
+		return -EOPNOTSUPP;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	args.in.h.opcode = FUSE_GETXATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 2;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = strlen(name) + 1;
+	args.in.args[1].value = name;
+	/* This is really two different operations rolled into one */
+	args.out.numargs = 1;
+	if (size) {
+		args.out.argvar = 1;
+		args.out.args[0].size = size;
+		args.out.args[0].value = value;
+	} else {
+		args.out.args[0].size = sizeof(outarg);
+		args.out.args[0].value = &outarg;
+	}
+	ret = fuse_simple_request(fc, &args);
+	if (!ret && !size)
+		ret = outarg.size;
+	if (ret == -ENOSYS) {
+		fc->no_getxattr = 1;
+		ret = -EOPNOTSUPP;
+	}
+	return ret;
+}
+
+static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+{
+	struct inode *inode = d_inode(entry);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_getxattr_in inarg;
+	struct fuse_getxattr_out outarg;
+	ssize_t ret;
+
+	if (!fuse_allow_current_process(fc))
+		return -EACCES;
+
+	if (fc->no_listxattr)
+		return -EOPNOTSUPP;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	args.in.h.opcode = FUSE_LISTXATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	/* This is really two different operations rolled into one */
+	args.out.numargs = 1;
+	if (size) {
+		args.out.argvar = 1;
+		args.out.args[0].size = size;
+		args.out.args[0].value = list;
+	} else {
+		args.out.args[0].size = sizeof(outarg);
+		args.out.args[0].value = &outarg;
+	}
+	ret = fuse_simple_request(fc, &args);
+	if (!ret && !size)
+		ret = outarg.size;
+	if (ret == -ENOSYS) {
+		fc->no_listxattr = 1;
+		ret = -EOPNOTSUPP;
+	}
+	return ret;
+}
+
+static int fuse_removexattr(struct dentry *entry, const char *name)
+{
+	struct inode *inode = d_inode(entry);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	int err;
+
+	if (fc->no_removexattr)
+		return -EOPNOTSUPP;
+
+	args.in.h.opcode = FUSE_REMOVEXATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 1;
+	args.in.args[0].size = strlen(name) + 1;
+	args.in.args[0].value = name;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {
+		fc->no_removexattr = 1;
+		err = -EOPNOTSUPP;
+	}
+	if (!err) {
+		fuse_invalidate_attr(inode);
+		fuse_update_ctime(inode);
+	}
+	return err;
+}
+
+static const struct inode_operations fuse_dir_inode_operations = {
+	.lookup		= fuse_lookup,
+	.mkdir		= fuse_mkdir,
+	.symlink	= fuse_symlink,
+	.unlink		= fuse_unlink,
+	.rmdir		= fuse_rmdir,
+	.rename2	= fuse_rename2,
+	.link		= fuse_link,
+	.setattr	= fuse_setattr,
+	.create		= fuse_create,
+	.atomic_open	= fuse_atomic_open,
+	.mknod		= fuse_mknod,
+	.permission	= fuse_permission,
+	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
+};
+
+static const struct file_operations fuse_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= fuse_readdir,
+	.open		= fuse_dir_open,
+	.release	= fuse_dir_release,
+	.fsync		= fuse_dir_fsync,
+	.unlocked_ioctl	= fuse_dir_ioctl,
+	.compat_ioctl	= fuse_dir_compat_ioctl,
+};
+
+static const struct inode_operations fuse_common_inode_operations = {
+	.setattr	= fuse_setattr,
+	.permission	= fuse_permission,
+	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
+};
+
+static const struct inode_operations fuse_symlink_inode_operations = {
+	.setattr	= fuse_setattr,
+	.follow_link	= fuse_follow_link,
+	.put_link	= fuse_put_link,
+	.readlink	= generic_readlink,
+	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
+};
+
+void fuse_init_common(struct inode *inode)
+{
+	inode->i_op = &fuse_common_inode_operations;
+}
+
+void fuse_init_dir(struct inode *inode)
+{
+	inode->i_op = &fuse_dir_inode_operations;
+	inode->i_fop = &fuse_dir_operations;
+}
+
+void fuse_init_symlink(struct inode *inode)
+{
+	inode->i_op = &fuse_symlink_inode_operations;
+}
diff --git a/kernel/fs/fuse/file.c b/kernel/fs/fuse/file.c
new file mode 100644
index 000000000..5ef05b5c4
--- /dev/null
+++ b/kernel/fs/fuse/file.c
@@ -0,0 +1,2994 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
+#include <linux/uio.h>
+
+static const struct file_operations fuse_direct_io_file_operations;
+
+static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+			  int opcode, struct fuse_open_out *outargp)
+{
+	struct fuse_open_in inarg;
+	FUSE_ARGS(args);
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
+	if (!fc->atomic_o_trunc)
+		inarg.flags &= ~O_TRUNC;
+	args.in.h.opcode = opcode;
+	args.in.h.nodeid = nodeid;
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.out.numargs = 1;
+	args.out.args[0].size = sizeof(*outargp);
+	args.out.args[0].value = outargp;
+
+	return fuse_simple_request(fc, &args);
+}
+
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
+{
+	struct fuse_file *ff;
+
+	ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
+	if (unlikely(!ff))
+		return NULL;
+
+	ff->fc = fc;
+	ff->reserved_req = fuse_request_alloc(0);
+	if (unlikely(!ff->reserved_req)) {
+		kfree(ff);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&ff->write_entry);
+	atomic_set(&ff->count, 0);
+	RB_CLEAR_NODE(&ff->polled_node);
+	init_waitqueue_head(&ff->poll_wait);
+
+	spin_lock(&fc->lock);
+	ff->kh = ++fc->khctr;
+	spin_unlock(&fc->lock);
+
+	return ff;
+}
+
+void fuse_file_free(struct fuse_file *ff)
+{
+	fuse_request_free(ff->reserved_req);
+	kfree(ff);
+}
+
+struct fuse_file *fuse_file_get(struct fuse_file *ff)
+{
+	atomic_inc(&ff->count);
+	return ff;
+}
+
+static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	iput(req->misc.release.inode);
+}
+
+static void fuse_file_put(struct fuse_file *ff, bool sync)
+{
+	if (atomic_dec_and_test(&ff->count)) {
+		struct fuse_req *req = ff->reserved_req;
+
+		if (ff->fc->no_open) {
+			/*
+			 * Drop the release request when client does not
+			 * implement 'open'
+			 */
+			req->background = 0;
+			iput(req->misc.release.inode);
+			fuse_put_request(ff->fc, req);
+		} else if (sync) {
+			req->background = 0;
+			fuse_request_send(ff->fc, req);
+			iput(req->misc.release.inode);
+			fuse_put_request(ff->fc, req);
+		} else {
+			req->end = fuse_release_end;
+			req->background = 1;
+			fuse_request_send_background(ff->fc, req);
+		}
+		kfree(ff);
+	}
+}
+
+int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+		 bool isdir)
+{
+	struct fuse_file *ff;
+	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+
+	ff = fuse_file_alloc(fc);
+	if (!ff)
+		return -ENOMEM;
+
+	ff->fh = 0;
+	ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
+	if (!fc->no_open || isdir) {
+		struct fuse_open_out outarg;
+		int err;
+
+		err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+		if (!err) {
+			ff->fh = outarg.fh;
+			ff->open_flags = outarg.open_flags;
+
+		} else if (err != -ENOSYS || isdir) {
+			fuse_file_free(ff);
+			return err;
+		} else {
+			fc->no_open = 1;
+		}
+	}
+
+	if (isdir)
+		ff->open_flags &= ~FOPEN_DIRECT_IO;
+
+	ff->nodeid = nodeid;
+	file->private_data = fuse_file_get(ff);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_do_open);
+
+static void fuse_link_write_file(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff = file->private_data;
+	/*
+	 * file may be written through mmap, so chain it onto the
+	 * inodes's write_file list
+	 */
+	spin_lock(&fc->lock);
+	if (list_empty(&ff->write_entry))
+		list_add(&ff->write_entry, &fi->write_files);
+	spin_unlock(&fc->lock);
+}
+
+void fuse_finish_open(struct inode *inode, struct file *file)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (ff->open_flags & FOPEN_DIRECT_IO)
+		file->f_op = &fuse_direct_io_file_operations;
+	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+		invalidate_inode_pages2(inode->i_mapping);
+	if (ff->open_flags & FOPEN_NONSEEKABLE)
+		nonseekable_open(inode, file);
+	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		spin_lock(&fc->lock);
+		fi->attr_version = ++fc->attr_version;
+		i_size_write(inode, 0);
+		spin_unlock(&fc->lock);
+		fuse_invalidate_attr(inode);
+		if (fc->writeback_cache)
+			file_update_time(file);
+	}
+	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
+		fuse_link_write_file(file);
+}
+
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err;
+	bool lock_inode = (file->f_flags & O_TRUNC) &&
+			  fc->atomic_o_trunc &&
+			  fc->writeback_cache;
+
+	err = generic_file_open(inode, file);
+	if (err)
+		return err;
+
+	if (lock_inode)
+		mutex_lock(&inode->i_mutex);
+
+	err = fuse_do_open(fc, get_node_id(inode), file, isdir);
+
+	if (!err)
+		fuse_finish_open(inode, file);
+
+	if (lock_inode)
+		mutex_unlock(&inode->i_mutex);
+
+	return err;
+}
+
+static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
+{
+	struct fuse_conn *fc = ff->fc;
+	struct fuse_req *req = ff->reserved_req;
+	struct fuse_release_in *inarg = &req->misc.release.in;
+
+	spin_lock(&fc->lock);
+	list_del(&ff->write_entry);
+	if (!RB_EMPTY_NODE(&ff->polled_node))
+		rb_erase(&ff->polled_node, &fc->polled_files);
+	spin_unlock(&fc->lock);
+
+	wake_up_interruptible_all(&ff->poll_wait);
+
+	inarg->fh = ff->fh;
+	inarg->flags = flags;
+	req->in.h.opcode = opcode;
+	req->in.h.nodeid = ff->nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct fuse_release_in);
+	req->in.args[0].value = inarg;
+}
+
+void fuse_release_common(struct file *file, int opcode)
+{
+	struct fuse_file *ff;
+	struct fuse_req *req;
+
+	ff = file->private_data;
+	if (unlikely(!ff))
+		return;
+
+	req = ff->reserved_req;
+	fuse_prepare_release(ff, file->f_flags, opcode);
+
+	if (ff->flock) {
+		struct fuse_release_in *inarg = &req->misc.release.in;
+		inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
+		inarg->lock_owner = fuse_lock_owner_id(ff->fc,
+						       (fl_owner_t) file);
+	}
+	/* Hold inode until release is finished */
+	req->misc.release.inode = igrab(file_inode(file));
+
+	/*
+	 * Normally this will send the RELEASE request, however if
+	 * some asynchronous READ or WRITE requests are outstanding,
+	 * the sending will be delayed.
+	 *
+	 * Make the release synchronous if this is a fuseblk mount,
+	 * synchronous RELEASE is allowed (and desirable) in this case
+	 * because the server can be trusted not to screw up.
+	 */
+	fuse_file_put(ff, ff->fc->destroy_req != NULL);
+}
+
+static int fuse_open(struct inode *inode, struct file *file)
+{
+	return fuse_open_common(inode, file, false);
+}
+
+static int fuse_release(struct inode *inode, struct file *file)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	/* see fuse_vma_close() for !writeback_cache case */
+	if (fc->writeback_cache)
+		write_inode_now(inode, 1);
+
+	fuse_release_common(file, FUSE_RELEASE);
+
+	/* return value is ignored by VFS */
+	return 0;
+}
+
+void fuse_sync_release(struct fuse_file *ff, int flags)
+{
+	WARN_ON(atomic_read(&ff->count) > 1);
+	fuse_prepare_release(ff, flags, FUSE_RELEASE);
+	ff->reserved_req->force = 1;
+	ff->reserved_req->background = 0;
+	fuse_request_send(ff->fc, ff->reserved_req);
+	fuse_put_request(ff->fc, ff->reserved_req);
+	kfree(ff);
+}
+EXPORT_SYMBOL_GPL(fuse_sync_release);
+
+/*
+ * Scramble the ID space with XTEA, so that the value of the files_struct
+ * pointer is not exposed to userspace.
+ */
+u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
+{
+	u32 *k = fc->scramble_key;
+	u64 v = (unsigned long) id;
+	u32 v0 = v;
+	u32 v1 = v >> 32;
+	u32 sum = 0;
+	int i;
+
+	for (i = 0; i < 32; i++) {
+		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
+		sum += 0x9E3779B9;
+		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
+	}
+
+	return (u64) v0 + ((u64) v1 << 32);
+}
+
+/*
+ * Check if any page in a range is under writeback
+ *
+ * This is currently done by walking the list of writepage requests
+ * for the inode, which can be pretty inefficient.
+ */
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+				   pgoff_t idx_to)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	bool found = false;
+
+	spin_lock(&fc->lock);
+	list_for_each_entry(req, &fi->writepages, writepages_entry) {
+		pgoff_t curr_index;
+
+		BUG_ON(req->inode != inode);
+		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (idx_from < curr_index + req->num_pages &&
+		    curr_index <= idx_to) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&fc->lock);
+
+	return found;
+}
+
+static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
+{
+	return fuse_range_is_writeback(inode, index, index);
+}
+
+/*
+ * Wait for page writeback to be completed.
+ *
+ * Since fuse doesn't rely on the VM writeback tracking, this has to
+ * use some other means.
+ */
+static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
+	return 0;
+}
+
+/*
+ * Wait for all pending writepages on the inode to finish.
+ *
+ * This is currently done by blocking further writes with FUSE_NOWRITE
+ * and waiting for all sent writes to complete.
+ *
+ * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
+ * could conflict with truncation.
+ */
+static void fuse_sync_writes(struct inode *inode)
+{
+	fuse_set_nowrite(inode);
+	fuse_release_nowrite(inode);
+}
+
+static int fuse_flush(struct file *file, fl_owner_t id)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req;
+	struct fuse_flush_in inarg;
+	int err;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	if (fc->no_flush)
+		return 0;
+
+	err = write_inode_now(inode, 1);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
+	fuse_sync_writes(inode);
+	mutex_unlock(&inode->i_mutex);
+
+	req = fuse_get_req_nofail_nopages(fc, file);
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+	inarg.lock_owner = fuse_lock_owner_id(fc, id);
+	req->in.h.opcode = FUSE_FLUSH;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->force = 1;
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		fc->no_flush = 1;
+		err = 0;
+	}
+	return err;
+}
+
+int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
+		      int datasync, int isdir)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	FUSE_ARGS(args);
+	struct fuse_fsync_in inarg;
+	int err;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * Start writeback against all dirty pages of the inode, then
+	 * wait for all outstanding writes, before sending the FSYNC
+	 * request.
+	 */
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		goto out;
+
+	fuse_sync_writes(inode);
+	err = sync_inode_metadata(inode, 1);
+	if (err)
+		goto out;
+
+	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
+		goto out;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+	inarg.fsync_flags = datasync ? 1 : 0;
+	args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {
+		if (isdir)
+			fc->no_fsyncdir = 1;
+		else
+			fc->no_fsync = 1;
+		err = 0;
+	}
+out:
+	mutex_unlock(&inode->i_mutex);
+	return err;
+}
+
+static int fuse_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync)
+{
+	return fuse_fsync_common(file, start, end, datasync, 0);
+}
+
+void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
+		    size_t count, int opcode)
+{
+	struct fuse_read_in *inarg = &req->misc.read.in;
+	struct fuse_file *ff = file->private_data;
+
+	inarg->fh = ff->fh;
+	inarg->offset = pos;
+	inarg->size = count;
+	inarg->flags = file->f_flags;
+	req->in.h.opcode = opcode;
+	req->in.h.nodeid = ff->nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct fuse_read_in);
+	req->in.args[0].value = inarg;
+	req->out.argvar = 1;
+	req->out.numargs = 1;
+	req->out.args[0].size = count;
+}
+
+static void fuse_release_user_pages(struct fuse_req *req, int write)
+{
+	unsigned i;
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		if (write)
+			set_page_dirty_lock(page);
+		put_page(page);
+	}
+}
+
+static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
+{
+	if (io->err)
+		return io->err;
+
+	if (io->bytes >= 0 && io->write)
+		return -EIO;
+
+	return io->bytes < 0 ? io->size : io->bytes;
+}
+
+/**
+ * In case of short read, the caller sets 'pos' to the position of
+ * actual end of fuse request in IO request. Otherwise, if bytes_requested
+ * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
+ *
+ * An example:
+ * User requested DIO read of 64K. It was splitted into two 32K fuse requests,
+ * both submitted asynchronously. The first of them was ACKed by userspace as
+ * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
+ * second request was ACKed as short, e.g. only 1K was read, resulting in
+ * pos == 33K.
+ *
+ * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
+ * will be equal to the length of the longest contiguous fragment of
+ * transferred data starting from the beginning of IO request.
+ */
+static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
+{
+	bool is_sync = is_sync_kiocb(io->iocb);
+	int left;
+
+	spin_lock(&io->lock);
+	if (err)
+		io->err = io->err ? : err;
+	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
+		io->bytes = pos;
+
+	left = --io->reqs;
+	if (!left && is_sync)
+		complete(io->done);
+	spin_unlock(&io->lock);
+
+	if (!left && !is_sync) {
+		ssize_t res = fuse_get_res_by_io(io);
+
+		if (res >= 0) {
+			struct inode *inode = file_inode(io->iocb->ki_filp);
+			struct fuse_conn *fc = get_fuse_conn(inode);
+			struct fuse_inode *fi = get_fuse_inode(inode);
+
+			spin_lock(&fc->lock);
+			fi->attr_version = ++fc->attr_version;
+			spin_unlock(&fc->lock);
+		}
+
+		io->iocb->ki_complete(io->iocb, res, 0);
+		kfree(io);
+	}
+}
+
+static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct fuse_io_priv *io = req->io;
+	ssize_t pos = -1;
+
+	fuse_release_user_pages(req, !io->write);
+
+	if (io->write) {
+		if (req->misc.write.in.size != req->misc.write.out.size)
+			pos = req->misc.write.in.offset - io->offset +
+				req->misc.write.out.size;
+	} else {
+		if (req->misc.read.in.size != req->out.args[0].size)
+			pos = req->misc.read.in.offset - io->offset +
+				req->out.args[0].size;
+	}
+
+	fuse_aio_complete(io, req->out.h.error, pos);
+}
+
+static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
+		size_t num_bytes, struct fuse_io_priv *io)
+{
+	spin_lock(&io->lock);
+	io->size += num_bytes;
+	io->reqs++;
+	spin_unlock(&io->lock);
+
+	req->io = io;
+	req->end = fuse_aio_complete_req;
+
+	__fuse_get_request(req);
+	fuse_request_send_background(fc, req);
+
+	return num_bytes;
+}
+
+static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
+			     loff_t pos, size_t count, fl_owner_t owner)
+{
+	struct file *file = io->file;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+
+	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	if (owner != NULL) {
+		struct fuse_read_in *inarg = &req->misc.read.in;
+
+		inarg->read_flags |= FUSE_READ_LOCKOWNER;
+		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+	}
+
+	if (io->async)
+		return fuse_async_req_send(fc, req, count, io);
+
+	fuse_request_send(fc, req);
+	return req->out.args[0].size;
+}
+
+static void fuse_read_update_size(struct inode *inode, loff_t size,
+				  u64 attr_ver)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	spin_lock(&fc->lock);
+	if (attr_ver == fi->attr_version && size < inode->i_size &&
+	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
+		fi->attr_version = ++fc->attr_version;
+		i_size_write(inode, size);
+	}
+	spin_unlock(&fc->lock);
+}
+
+static void fuse_short_read(struct fuse_req *req, struct inode *inode,
+			    u64 attr_ver)
+{
+	size_t num_read = req->out.args[0].size;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (fc->writeback_cache) {
+		/*
+		 * A hole in a file. Some data after the hole are in page cache,
+		 * but have not reached the client fs yet. So, the hole is not
+		 * present there.
+		 */
+		int i;
+		int start_idx = num_read >> PAGE_CACHE_SHIFT;
+		size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+
+		for (i = start_idx; i < req->num_pages; i++) {
+			zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
+			off = 0;
+		}
+	} else {
+		loff_t pos = page_offset(req->pages[0]) + num_read;
+		fuse_read_update_size(inode, pos, attr_ver);
+	}
+}
+
+static int fuse_do_readpage(struct file *file, struct page *page)
+{
+	struct fuse_io_priv io = { .async = 0, .file = file };
+	struct inode *inode = page->mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	size_t num_read;
+	loff_t pos = page_offset(page);
+	size_t count = PAGE_CACHE_SIZE;
+	u64 attr_ver;
+	int err;
+
+	/*
+	 * Page writeback can extend beyond the lifetime of the
+	 * page-cache page, so make sure we read a properly synced
+	 * page.
+	 */
+	fuse_wait_on_page_writeback(inode, page->index);
+
+	req = fuse_get_req(fc, 1);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	attr_ver = fuse_get_attr_version(fc);
+
+	req->out.page_zeroing = 1;
+	req->out.argpages = 1;
+	req->num_pages = 1;
+	req->pages[0] = page;
+	req->page_descs[0].length = count;
+	num_read = fuse_send_read(req, &io, pos, count, NULL);
+	err = req->out.h.error;
+
+	if (!err) {
+		/*
+		 * Short read means EOF.  If file size is larger, truncate it
+		 */
+		if (num_read < count)
+			fuse_short_read(req, inode, attr_ver);
+
+		SetPageUptodate(page);
+	}
+
+	fuse_put_request(fc, req);
+
+	return err;
+}
+
+static int fuse_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int err;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	err = fuse_do_readpage(file, page);
+	fuse_invalidate_atime(inode);
+ out:
+	unlock_page(page);
+	return err;
+}
+
+static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int i;
+	size_t count = req->misc.read.in.size;
+	size_t num_read = req->out.args[0].size;
+	struct address_space *mapping = NULL;
+
+	for (i = 0; mapping == NULL && i < req->num_pages; i++)
+		mapping = req->pages[i]->mapping;
+
+	if (mapping) {
+		struct inode *inode = mapping->host;
+
+		/*
+		 * Short read means EOF. If file size is larger, truncate it
+		 */
+		if (!req->out.h.error && num_read < count)
+			fuse_short_read(req, inode, req->misc.read.attr_ver);
+
+		fuse_invalidate_atime(inode);
+	}
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		if (!req->out.h.error)
+			SetPageUptodate(page);
+		else
+			SetPageError(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	if (req->ff)
+		fuse_file_put(req->ff, false);
+}
+
+static void fuse_send_readpages(struct fuse_req *req, struct file *file)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	loff_t pos = page_offset(req->pages[0]);
+	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+
+	req->out.argpages = 1;
+	req->out.page_zeroing = 1;
+	req->out.page_replace = 1;
+	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	req->misc.read.attr_ver = fuse_get_attr_version(fc);
+	if (fc->async_read) {
+		req->ff = fuse_file_get(ff);
+		req->end = fuse_readpages_end;
+		fuse_request_send_background(fc, req);
+	} else {
+		fuse_request_send(fc, req);
+		fuse_readpages_end(fc, req);
+		fuse_put_request(fc, req);
+	}
+}
+
+struct fuse_fill_data {
+	struct fuse_req *req;
+	struct file *file;
+	struct inode *inode;
+	unsigned nr_pages;
+};
+
+static int fuse_readpages_fill(void *_data, struct page *page)
+{
+	struct fuse_fill_data *data = _data;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	fuse_wait_on_page_writeback(inode, page->index);
+
+	if (req->num_pages &&
+	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
+	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+		int nr_alloc = min_t(unsigned, data->nr_pages,
+				     FUSE_MAX_PAGES_PER_REQ);
+		fuse_send_readpages(req, data->file);
+		if (fc->async_read)
+			req = fuse_get_req_for_background(fc, nr_alloc);
+		else
+			req = fuse_get_req(fc, nr_alloc);
+
+		data->req = req;
+		if (IS_ERR(req)) {
+			unlock_page(page);
+			return PTR_ERR(req);
+		}
+	}
+
+	if (WARN_ON(req->num_pages >= req->max_pages)) {
+		fuse_put_request(fc, req);
+		return -EIO;
+	}
+
+	page_cache_get(page);
+	req->pages[req->num_pages] = page;
+	req->page_descs[req->num_pages].length = PAGE_SIZE;
+	req->num_pages++;
+	data->nr_pages--;
+	return 0;
+}
+
+static int fuse_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_fill_data data;
+	int err;
+	int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	data.file = file;
+	data.inode = inode;
+	if (fc->async_read)
+		data.req = fuse_get_req_for_background(fc, nr_alloc);
+	else
+		data.req = fuse_get_req(fc, nr_alloc);
+	data.nr_pages = nr_pages;
+	err = PTR_ERR(data.req);
+	if (IS_ERR(data.req))
+		goto out;
+
+	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
+	if (!err) {
+		if (data.req->num_pages)
+			fuse_send_readpages(data.req, file);
+		else
+			fuse_put_request(fc, data.req);
+	}
+out:
+	return err;
+}
+
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	/*
+	 * In auto invalidate mode, always update attributes on read.
+	 * Otherwise, only update if we attempt to read past EOF (to ensure
+	 * i_size is up to date).
+	 */
+	if (fc->auto_inval_data ||
+	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
+		int err;
+		err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
+		if (err)
+			return err;
+	}
+
+	return generic_file_read_iter(iocb, to);
+}
+
+static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
+			    loff_t pos, size_t count)
+{
+	struct fuse_write_in *inarg = &req->misc.write.in;
+	struct fuse_write_out *outarg = &req->misc.write.out;
+
+	inarg->fh = ff->fh;
+	inarg->offset = pos;
+	inarg->size = count;
+	req->in.h.opcode = FUSE_WRITE;
+	req->in.h.nodeid = ff->nodeid;
+	req->in.numargs = 2;
+	if (ff->fc->minor < 9)
+		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
+	else
+		req->in.args[0].size = sizeof(struct fuse_write_in);
+	req->in.args[0].value = inarg;
+	req->in.args[1].size = count;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(struct fuse_write_out);
+	req->out.args[0].value = outarg;
+}
+
+static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
+			      loff_t pos, size_t count, fl_owner_t owner)
+{
+	struct file *file = io->file;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	struct fuse_write_in *inarg = &req->misc.write.in;
+
+	fuse_write_fill(req, ff, pos, count);
+	inarg->flags = file->f_flags;
+	if (owner != NULL) {
+		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
+		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+	}
+
+	if (io->async)
+		return fuse_async_req_send(fc, req, count, io);
+
+	fuse_request_send(fc, req);
+	return req->misc.write.out.size;
+}
+
+bool fuse_write_update_size(struct inode *inode, loff_t pos)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool ret = false;
+
+	spin_lock(&fc->lock);
+	fi->attr_version = ++fc->attr_version;
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		ret = true;
+	}
+	spin_unlock(&fc->lock);
+
+	return ret;
+}
+
+static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
+				    struct inode *inode, loff_t pos,
+				    size_t count)
+{
+	size_t res;
+	unsigned offset;
+	unsigned i;
+	struct fuse_io_priv io = { .async = 0, .file = file };
+
+	for (i = 0; i < req->num_pages; i++)
+		fuse_wait_on_page_writeback(inode, req->pages[i]->index);
+
+	res = fuse_send_write(req, &io, pos, count, NULL);
+
+	offset = req->page_descs[0].offset;
+	count = res;
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+
+		if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
+			SetPageUptodate(page);
+
+		if (count > PAGE_CACHE_SIZE - offset)
+			count -= PAGE_CACHE_SIZE - offset;
+		else
+			count = 0;
+		offset = 0;
+
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	return res;
+}
+
+static ssize_t fuse_fill_write_pages(struct fuse_req *req,
+			       struct address_space *mapping,
+			       struct iov_iter *ii, loff_t pos)
+{
+	struct fuse_conn *fc = get_fuse_conn(mapping->host);
+	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+	size_t count = 0;
+	int err;
+
+	req->in.argpages = 1;
+	req->page_descs[0].offset = offset;
+
+	do {
+		size_t tmp;
+		struct page *page;
+		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+		size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
+				     iov_iter_count(ii));
+
+		bytes = min_t(size_t, bytes, fc->max_write - count);
+
+ again:
+		err = -EFAULT;
+		if (iov_iter_fault_in_readable(ii, bytes))
+			break;
+
+		err = -ENOMEM;
+		page = grab_cache_page_write_begin(mapping, index, 0);
+		if (!page)
+			break;
+
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
+		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+		flush_dcache_page(page);
+
+		if (!tmp) {
+			unlock_page(page);
+			page_cache_release(page);
+			bytes = min(bytes, iov_iter_single_seg_count(ii));
+			goto again;
+		}
+
+		err = 0;
+		req->pages[req->num_pages] = page;
+		req->page_descs[req->num_pages].length = tmp;
+		req->num_pages++;
+
+		iov_iter_advance(ii, tmp);
+		count += tmp;
+		pos += tmp;
+		offset += tmp;
+		if (offset == PAGE_CACHE_SIZE)
+			offset = 0;
+
+		if (!fc->big_writes)
+			break;
+	} while (iov_iter_count(ii) && count < fc->max_write &&
+		 req->num_pages < req->max_pages && offset == 0);
+
+	return count > 0 ? count : err;
+}
+
+static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+{
+	return min_t(unsigned,
+		     ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
+		     (pos >> PAGE_CACHE_SHIFT) + 1,
+		     FUSE_MAX_PAGES_PER_REQ);
+}
+
+static ssize_t fuse_perform_write(struct file *file,
+				  struct address_space *mapping,
+				  struct iov_iter *ii, loff_t pos)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	int err = 0;
+	ssize_t res = 0;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	if (inode->i_size < pos + iov_iter_count(ii))
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+	do {
+		struct fuse_req *req;
+		ssize_t count;
+		unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
+
+		req = fuse_get_req(fc, nr_pages);
+		if (IS_ERR(req)) {
+			err = PTR_ERR(req);
+			break;
+		}
+
+		count = fuse_fill_write_pages(req, mapping, ii, pos);
+		if (count <= 0) {
+			err = count;
+		} else {
+			size_t num_written;
+
+			num_written = fuse_send_write_pages(req, file, inode,
+							    pos, count);
+			err = req->out.h.error;
+			if (!err) {
+				res += num_written;
+				pos += num_written;
+
+				/* break out of the loop on short write */
+				if (num_written != count)
+					err = -EIO;
+			}
+		}
+		fuse_put_request(fc, req);
+	} while (!err && iov_iter_count(ii));
+
+	if (res > 0)
+		fuse_write_update_size(inode, pos);
+
+	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+	fuse_invalidate_attr(inode);
+
+	return res > 0 ? res : err;
+}
+
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	ssize_t written = 0;
+	ssize_t written_buffered = 0;
+	struct inode *inode = mapping->host;
+	ssize_t err;
+	loff_t endbyte = 0;
+
+	if (get_fuse_conn(inode)->writeback_cache) {
+		/* Update size (EOF optimization) and mode (SUID clearing) */
+		err = fuse_update_attributes(mapping->host, NULL, file, NULL);
+		if (err)
+			return err;
+
+		return generic_file_write_iter(iocb, from);
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = inode_to_bdi(inode);
+
+	err = generic_write_checks(iocb, from);
+	if (err <= 0)
+		goto out;
+
+	err = file_remove_suid(file);
+	if (err)
+		goto out;
+
+	err = file_update_time(file);
+	if (err)
+		goto out;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		loff_t pos = iocb->ki_pos;
+		written = generic_file_direct_write(iocb, from, pos);
+		if (written < 0 || !iov_iter_count(from))
+			goto out;
+
+		pos += written;
+
+		written_buffered = fuse_perform_write(file, mapping, from, pos);
+		if (written_buffered < 0) {
+			err = written_buffered;
+			goto out;
+		}
+		endbyte = pos + written_buffered - 1;
+
+		err = filemap_write_and_wait_range(file->f_mapping, pos,
+						   endbyte);
+		if (err)
+			goto out;
+
+		invalidate_mapping_pages(file->f_mapping,
+					 pos >> PAGE_CACHE_SHIFT,
+					 endbyte >> PAGE_CACHE_SHIFT);
+
+		written += written_buffered;
+		iocb->ki_pos = pos + written_buffered;
+	} else {
+		written = fuse_perform_write(file, mapping, from, iocb->ki_pos);
+		if (written >= 0)
+			iocb->ki_pos += written;
+	}
+out:
+	current->backing_dev_info = NULL;
+	mutex_unlock(&inode->i_mutex);
+
+	return written ? written : err;
+}
+
+static inline void fuse_page_descs_length_init(struct fuse_req *req,
+		unsigned index, unsigned nr_pages)
+{
+	int i;
+
+	for (i = index; i < index + nr_pages; i++)
+		req->page_descs[i].length = PAGE_SIZE -
+			req->page_descs[i].offset;
+}
+
+static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
+{
+	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+}
+
+static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
+					size_t max_size)
+{
+	return min(iov_iter_single_seg_count(ii), max_size);
+}
+
+static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
+			       size_t *nbytesp, int write)
+{
+	size_t nbytes = 0;  /* # bytes already packed in req */
+
+	/* Special case for kernel I/O: can copy directly into the buffer */
+	if (ii->type & ITER_KVEC) {
+		unsigned long user_addr = fuse_get_user_addr(ii);
+		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
+
+		if (write)
+			req->in.args[1].value = (void *) user_addr;
+		else
+			req->out.args[0].value = (void *) user_addr;
+
+		iov_iter_advance(ii, frag_size);
+		*nbytesp = frag_size;
+		return 0;
+	}
+
+	while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
+		unsigned npages;
+		size_t start;
+		ssize_t ret = iov_iter_get_pages(ii,
+					&req->pages[req->num_pages],
+					*nbytesp - nbytes,
+					req->max_pages - req->num_pages,
+					&start);
+		if (ret < 0)
+			return ret;
+
+		iov_iter_advance(ii, ret);
+		nbytes += ret;
+
+		ret += start;
+		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
+
+		req->page_descs[req->num_pages].offset = start;
+		fuse_page_descs_length_init(req, req->num_pages, npages);
+
+		req->num_pages += npages;
+		req->page_descs[req->num_pages - 1].length -=
+			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
+	}
+
+	if (write)
+		req->in.argpages = 1;
+	else
+		req->out.argpages = 1;
+
+	*nbytesp = nbytes;
+
+	return 0;
+}
+
+static inline int fuse_iter_npages(const struct iov_iter *ii_p)
+{
+	return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
+}
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+		       loff_t *ppos, int flags)
+{
+	int write = flags & FUSE_DIO_WRITE;
+	int cuse = flags & FUSE_DIO_CUSE;
+	struct file *file = io->file;
+	struct inode *inode = file->f_mapping->host;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	size_t nmax = write ? fc->max_write : fc->max_read;
+	loff_t pos = *ppos;
+	size_t count = iov_iter_count(iter);
+	pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+	ssize_t res = 0;
+	struct fuse_req *req;
+
+	if (io->async)
+		req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
+	else
+		req = fuse_get_req(fc, fuse_iter_npages(iter));
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
+		if (!write)
+			mutex_lock(&inode->i_mutex);
+		fuse_sync_writes(inode);
+		if (!write)
+			mutex_unlock(&inode->i_mutex);
+	}
+
+	while (count) {
+		size_t nres;
+		fl_owner_t owner = current->files;
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, iter, &nbytes, write);
+		if (err) {
+			res = err;
+			break;
+		}
+
+		if (write)
+			nres = fuse_send_write(req, io, pos, nbytes, owner);
+		else
+			nres = fuse_send_read(req, io, pos, nbytes, owner);
+
+		if (!io->async)
+			fuse_release_user_pages(req, !write);
+		if (req->out.h.error) {
+			if (!res)
+				res = req->out.h.error;
+			break;
+		} else if (nres > nbytes) {
+			res = -EIO;
+			break;
+		}
+		count -= nres;
+		res += nres;
+		pos += nres;
+		if (nres != nbytes)
+			break;
+		if (count) {
+			fuse_put_request(fc, req);
+			if (io->async)
+				req = fuse_get_req_for_background(fc,
+					fuse_iter_npages(iter));
+			else
+				req = fuse_get_req(fc, fuse_iter_npages(iter));
+			if (IS_ERR(req))
+				break;
+		}
+	}
+	if (!IS_ERR(req))
+		fuse_put_request(fc, req);
+	if (res > 0)
+		*ppos = pos;
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(fuse_direct_io);
+
+static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
+				  struct iov_iter *iter,
+				  loff_t *ppos)
+{
+	ssize_t res;
+	struct file *file = io->file;
+	struct inode *inode = file_inode(file);
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	res = fuse_direct_io(io, iter, ppos, 0);
+
+	fuse_invalidate_attr(inode);
+
+	return res;
+}
+
+static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp };
+	return __fuse_direct_read(&io, to, &iocb->ki_pos);
+}
+
+static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct fuse_io_priv io = { .async = 0, .file = file };
+	ssize_t res;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	/* Don't allow parallel writes to the same file */
+	mutex_lock(&inode->i_mutex);
+	res = generic_write_checks(iocb, from);
+	if (res > 0)
+		res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+	fuse_invalidate_attr(inode);
+	if (res > 0)
+		fuse_write_update_size(inode, iocb->ki_pos);
+	mutex_unlock(&inode->i_mutex);
+
+	return res;
+}
+
+static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int i;
+
+	for (i = 0; i < req->num_pages; i++)
+		__free_page(req->pages[i]);
+
+	if (req->ff)
+		fuse_file_put(req->ff, false);
+}
+
+static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct inode *inode = req->inode;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	int i;
+
+	list_del(&req->writepages_entry);
+	for (i = 0; i < req->num_pages; i++) {
+		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+		bdi_writeout_inc(bdi);
+	}
+	wake_up(&fi->page_waitq);
+}
+
+/* Called under fc->lock, may release and reacquire it */
+static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
+				loff_t size)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	struct fuse_inode *fi = get_fuse_inode(req->inode);
+	struct fuse_write_in *inarg = &req->misc.write.in;
+	__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
+
+	if (!fc->connected)
+		goto out_free;
+
+	if (inarg->offset + data_size <= size) {
+		inarg->size = data_size;
+	} else if (inarg->offset < size) {
+		inarg->size = size - inarg->offset;
+	} else {
+		/* Got truncated off completely */
+		goto out_free;
+	}
+
+	req->in.args[1].size = inarg->size;
+	fi->writectr++;
+	fuse_request_send_background_locked(fc, req);
+	return;
+
+ out_free:
+	fuse_writepage_finish(fc, req);
+	spin_unlock(&fc->lock);
+	fuse_writepage_free(fc, req);
+	fuse_put_request(fc, req);
+	spin_lock(&fc->lock);
+}
+
+/*
+ * If fi->writectr is positive (no truncate or fsync going on) send
+ * all queued writepage requests.
+ *
+ * Called with fc->lock
+ */
+void fuse_flush_writepages(struct inode *inode)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	size_t crop = i_size_read(inode);
+	struct fuse_req *req;
+
+	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
+		req = list_entry(fi->queued_writes.next, struct fuse_req, list);
+		list_del_init(&req->list);
+		fuse_send_writepage(fc, req, crop);
+	}
+}
+
+static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct inode *inode = req->inode;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	mapping_set_error(inode->i_mapping, req->out.h.error);
+	spin_lock(&fc->lock);
+	while (req->misc.write.next) {
+		struct fuse_conn *fc = get_fuse_conn(inode);
+		struct fuse_write_in *inarg = &req->misc.write.in;
+		struct fuse_req *next = req->misc.write.next;
+		req->misc.write.next = next->misc.write.next;
+		next->misc.write.next = NULL;
+		next->ff = fuse_file_get(req->ff);
+		list_add(&next->writepages_entry, &fi->writepages);
+
+		/*
+		 * Skip fuse_flush_writepages() to make it easy to crop requests
+		 * based on primary request size.
+		 *
+		 * 1st case (trivial): there are no concurrent activities using
+		 * fuse_set/release_nowrite.  Then we're on safe side because
+		 * fuse_flush_writepages() would call fuse_send_writepage()
+		 * anyway.
+		 *
+		 * 2nd case: someone called fuse_set_nowrite and it is waiting
+		 * now for completion of all in-flight requests.  This happens
+		 * rarely and no more than once per page, so this should be
+		 * okay.
+		 *
+		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
+		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
+		 * that fuse_set_nowrite returned implies that all in-flight
+		 * requests were completed along with all of their secondary
+		 * requests.  Further primary requests are blocked by negative
+		 * writectr.  Hence there cannot be any in-flight requests and
+		 * no invocations of fuse_writepage_end() while we're in
+		 * fuse_set_nowrite..fuse_release_nowrite section.
+		 */
+		fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+	}
+	fi->writectr--;
+	fuse_writepage_finish(fc, req);
+	spin_unlock(&fc->lock);
+	fuse_writepage_free(fc, req);
+}
+
+static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
+					       struct fuse_inode *fi)
+{
+	struct fuse_file *ff = NULL;
+
+	spin_lock(&fc->lock);
+	if (!list_empty(&fi->write_files)) {
+		ff = list_entry(fi->write_files.next, struct fuse_file,
+				write_entry);
+		fuse_file_get(ff);
+	}
+	spin_unlock(&fc->lock);
+
+	return ff;
+}
+
+static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
+					     struct fuse_inode *fi)
+{
+	struct fuse_file *ff = __fuse_write_file_get(fc, fi);
+	WARN_ON(!ff);
+	return ff;
+}
+
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff;
+	int err;
+
+	ff = __fuse_write_file_get(fc, fi);
+	err = fuse_flush_times(inode, ff);
+	if (ff)
+		fuse_file_put(ff, 0);
+
+	return err;
+}
+
+static int fuse_writepage_locked(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	struct page *tmp_page;
+	int error = -ENOMEM;
+
+	set_page_writeback(page);
+
+	req = fuse_request_alloc_nofs(1);
+	if (!req)
+		goto err;
+
+	req->background = 1; /* writeback always goes to bg_queue */
+	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (!tmp_page)
+		goto err_free;
+
+	error = -EIO;
+	req->ff = fuse_write_file_get(fc, fi);
+	if (!req->ff)
+		goto err_nofile;
+
+	fuse_write_fill(req, req->ff, page_offset(page), 0);
+
+	copy_highpage(tmp_page, page);
+	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+	req->misc.write.next = NULL;
+	req->in.argpages = 1;
+	req->num_pages = 1;
+	req->pages[0] = tmp_page;
+	req->page_descs[0].offset = 0;
+	req->page_descs[0].length = PAGE_SIZE;
+	req->end = fuse_writepage_end;
+	req->inode = inode;
+
+	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+	spin_lock(&fc->lock);
+	list_add(&req->writepages_entry, &fi->writepages);
+	list_add_tail(&req->list, &fi->queued_writes);
+	fuse_flush_writepages(inode);
+	spin_unlock(&fc->lock);
+
+	end_page_writeback(page);
+
+	return 0;
+
+err_nofile:
+	__free_page(tmp_page);
+err_free:
+	fuse_request_free(req);
+err:
+	end_page_writeback(page);
+	return error;
+}
+
+static int fuse_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int err;
+
+	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
+		/*
+		 * ->writepages() should be called for sync() and friends.  We
+		 * should only get here on direct reclaim and then we are
+		 * allowed to skip a page which is already in flight
+		 */
+		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
+		redirty_page_for_writepage(wbc, page);
+		return 0;
+	}
+
+	err = fuse_writepage_locked(page);
+	unlock_page(page);
+
+	return err;
+}
+
+struct fuse_fill_wb_data {
+	struct fuse_req *req;
+	struct fuse_file *ff;
+	struct inode *inode;
+	struct page **orig_pages;
+};
+
+static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+{
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	int num_pages = req->num_pages;
+	int i;
+
+	req->ff = fuse_file_get(data->ff);
+	spin_lock(&fc->lock);
+	list_add_tail(&req->list, &fi->queued_writes);
+	fuse_flush_writepages(inode);
+	spin_unlock(&fc->lock);
+
+	for (i = 0; i < num_pages; i++)
+		end_page_writeback(data->orig_pages[i]);
+}
+
+static bool fuse_writepage_in_flight(struct fuse_req *new_req,
+				     struct page *page)
+{
+	struct fuse_conn *fc = get_fuse_conn(new_req->inode);
+	struct fuse_inode *fi = get_fuse_inode(new_req->inode);
+	struct fuse_req *tmp;
+	struct fuse_req *old_req;
+	bool found = false;
+	pgoff_t curr_index;
+
+	BUG_ON(new_req->num_pages != 0);
+
+	spin_lock(&fc->lock);
+	list_del(&new_req->writepages_entry);
+	list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
+		BUG_ON(old_req->inode != new_req->inode);
+		curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (curr_index <= page->index &&
+		    page->index < curr_index + old_req->num_pages) {
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		list_add(&new_req->writepages_entry, &fi->writepages);
+		goto out_unlock;
+	}
+
+	new_req->num_pages = 1;
+	for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
+		BUG_ON(tmp->inode != new_req->inode);
+		curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (tmp->num_pages == 1 &&
+		    curr_index == page->index) {
+			old_req = tmp;
+		}
+	}
+
+	if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
+					old_req->state == FUSE_REQ_PENDING)) {
+		struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
+
+		copy_highpage(old_req->pages[0], page);
+		spin_unlock(&fc->lock);
+
+		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+		bdi_writeout_inc(bdi);
+		fuse_writepage_free(fc, new_req);
+		fuse_request_free(new_req);
+		goto out;
+	} else {
+		new_req->misc.write.next = old_req->misc.write.next;
+		old_req->misc.write.next = new_req;
+	}
+out_unlock:
+	spin_unlock(&fc->lock);
+out:
+	return found;
+}
+
+static int fuse_writepages_fill(struct page *page,
+		struct writeback_control *wbc, void *_data)
+{
+	struct fuse_fill_wb_data *data = _data;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct page *tmp_page;
+	bool is_writeback;
+	int err;
+
+	if (!data->ff) {
+		err = -EIO;
+		data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
+		if (!data->ff)
+			goto out_unlock;
+	}
+
+	/*
+	 * Being under writeback is unlikely but possible.  For example direct
+	 * read to an mmaped fuse file will set the page dirty twice; once when
+	 * the pages are faulted with get_user_pages(), and then after the read
+	 * completed.
+	 */
+	is_writeback = fuse_page_is_writeback(inode, page->index);
+
+	if (req && req->num_pages &&
+	    (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+	     data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
+		fuse_writepages_send(data);
+		data->req = NULL;
+	}
+	err = -ENOMEM;
+	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (!tmp_page)
+		goto out_unlock;
+
+	/*
+	 * The page must not be redirtied until the writeout is completed
+	 * (i.e. userspace has sent a reply to the write request).  Otherwise
+	 * there could be more than one temporary page instance for each real
+	 * page.
+	 *
+	 * This is ensured by holding the page lock in page_mkwrite() while
+	 * checking fuse_page_is_writeback().  We already hold the page lock
+	 * since clear_page_dirty_for_io() and keep it held until we add the
+	 * request to the fi->writepages list and increment req->num_pages.
+	 * After this fuse_page_is_writeback() will indicate that the page is
+	 * under writeback, so we can release the page lock.
+	 */
+	if (data->req == NULL) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		err = -ENOMEM;
+		req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+		if (!req) {
+			__free_page(tmp_page);
+			goto out_unlock;
+		}
+
+		fuse_write_fill(req, data->ff, page_offset(page), 0);
+		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+		req->misc.write.next = NULL;
+		req->in.argpages = 1;
+		req->background = 1;
+		req->num_pages = 0;
+		req->end = fuse_writepage_end;
+		req->inode = inode;
+
+		spin_lock(&fc->lock);
+		list_add(&req->writepages_entry, &fi->writepages);
+		spin_unlock(&fc->lock);
+
+		data->req = req;
+	}
+	set_page_writeback(page);
+
+	copy_highpage(tmp_page, page);
+	req->pages[req->num_pages] = tmp_page;
+	req->page_descs[req->num_pages].offset = 0;
+	req->page_descs[req->num_pages].length = PAGE_SIZE;
+
+	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+	err = 0;
+	if (is_writeback && fuse_writepage_in_flight(req, page)) {
+		end_page_writeback(page);
+		data->req = NULL;
+		goto out_unlock;
+	}
+	data->orig_pages[req->num_pages] = page;
+
+	/*
+	 * Protected by fc->lock against concurrent access by
+	 * fuse_page_is_writeback().
+	 */
+	spin_lock(&fc->lock);
+	req->num_pages++;
+	spin_unlock(&fc->lock);
+
+out_unlock:
+	unlock_page(page);
+
+	return err;
+}
+
+static int fuse_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_fill_wb_data data;
+	int err;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	data.inode = inode;
+	data.req = NULL;
+	data.ff = NULL;
+
+	err = -ENOMEM;
+	data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
+				  sizeof(struct page *),
+				  GFP_NOFS);
+	if (!data.orig_pages)
+		goto out;
+
+	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+	if (data.req) {
+		/* Ignore errors if we can write at least one page */
+		BUG_ON(!data.req->num_pages);
+		fuse_writepages_send(&data);
+		err = 0;
+	}
+	if (data.ff)
+		fuse_file_put(data.ff, false);
+
+	kfree(data.orig_pages);
+out:
+	return err;
+}
+
+/*
+ * It's worthy to make sure that space is reserved on disk for the write,
+ * but how to implement it without killing performance need more thinking.
+ */
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
+	struct page *page;
+	loff_t fsize;
+	int err = -ENOMEM;
+
+	WARN_ON(!fc->writeback_cache);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		goto error;
+
+	fuse_wait_on_page_writeback(mapping->host, page->index);
+
+	if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+		goto success;
+	/*
+	 * Check if the start this page comes after the end of file, in which
+	 * case the readpage can be optimized away.
+	 */
+	fsize = i_size_read(mapping->host);
+	if (fsize <= (pos & PAGE_CACHE_MASK)) {
+		size_t off = pos & ~PAGE_CACHE_MASK;
+		if (off)
+			zero_user_segment(page, 0, off);
+		goto success;
+	}
+	err = fuse_do_readpage(file, page);
+	if (err)
+		goto cleanup;
+success:
+	*pagep = page;
+	return 0;
+
+cleanup:
+	unlock_page(page);
+	page_cache_release(page);
+error:
+	return err;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+
+	if (!PageUptodate(page)) {
+		/* Zero any unwritten bytes at the end of the page */
+		size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
+		if (endoff)
+			zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+	}
+
+	fuse_write_update_size(inode, pos + copied);
+	set_page_dirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
+static int fuse_launder_page(struct page *page)
+{
+	int err = 0;
+	if (clear_page_dirty_for_io(page)) {
+		struct inode *inode = page->mapping->host;
+		err = fuse_writepage_locked(page);
+		if (!err)
+			fuse_wait_on_page_writeback(inode, page->index);
+	}
+	return err;
+}
+
+/*
+ * Write back dirty pages now, because there may not be any suitable
+ * open files later
+ */
+static void fuse_vma_close(struct vm_area_struct *vma)
+{
+	filemap_write_and_wait(vma->vm_file->f_mapping);
+}
+
+/*
+ * Wait for writeback against this page to complete before allowing it
+ * to be marked dirty again, and hence written back again, possibly
+ * before the previous writepage completed.
+ *
+ * Block here, instead of in ->writepage(), so that the userspace fs
+ * can only block processes actually operating on the filesystem.
+ *
+ * Otherwise unprivileged userspace fs would be able to block
+ * unrelated:
+ *
+ * - page migration
+ * - sync(2)
+ * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
+ */
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+
+	file_update_time(vma->vm_file);
+	lock_page(page);
+	if (page->mapping != inode->i_mapping) {
+		unlock_page(page);
+		return VM_FAULT_NOPAGE;
+	}
+
+	fuse_wait_on_page_writeback(inode, page->index);
+	return VM_FAULT_LOCKED;
+}
+
+static const struct vm_operations_struct fuse_file_vm_ops = {
+	.close		= fuse_vma_close,
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= fuse_page_mkwrite,
+};
+
+static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		fuse_link_write_file(file);
+
+	file_accessed(file);
+	vma->vm_ops = &fuse_file_vm_ops;
+	return 0;
+}
+
+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Can't provide the coherency needed for MAP_SHARED */
+	if (vma->vm_flags & VM_MAYSHARE)
+		return -ENODEV;
+
+	invalidate_inode_pages2(file->f_mapping);
+
+	return generic_file_mmap(file, vma);
+}
+
+static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+				  struct file_lock *fl)
+{
+	switch (ffl->type) {
+	case F_UNLCK:
+		break;
+
+	case F_RDLCK:
+	case F_WRLCK:
+		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
+		    ffl->end < ffl->start)
+			return -EIO;
+
+		fl->fl_start = ffl->start;
+		fl->fl_end = ffl->end;
+		fl->fl_pid = ffl->pid;
+		break;
+
+	default:
+		return -EIO;
+	}
+	fl->fl_type = ffl->type;
+	return 0;
+}
+
+static void fuse_lk_fill(struct fuse_args *args, struct file *file,
+			 const struct file_lock *fl, int opcode, pid_t pid,
+			 int flock, struct fuse_lk_in *inarg)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+
+	memset(inarg, 0, sizeof(*inarg));
+	inarg->fh = ff->fh;
+	inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+	inarg->lk.start = fl->fl_start;
+	inarg->lk.end = fl->fl_end;
+	inarg->lk.type = fl->fl_type;
+	inarg->lk.pid = pid;
+	if (flock)
+		inarg->lk_flags |= FUSE_LK_FLOCK;
+	args->in.h.opcode = opcode;
+	args->in.h.nodeid = get_node_id(inode);
+	args->in.numargs = 1;
+	args->in.args[0].size = sizeof(*inarg);
+	args->in.args[0].value = inarg;
+}
+
+static int fuse_getlk(struct file *file, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_lk_in inarg;
+	struct fuse_lk_out outarg;
+	int err;
+
+	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
+	args.out.numargs = 1;
+	args.out.args[0].size = sizeof(outarg);
+	args.out.args[0].value = &outarg;
+	err = fuse_simple_request(fc, &args);
+	if (!err)
+		err = convert_fuse_file_lock(&outarg.lk, fl);
+
+	return err;
+}
+
+static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_lk_in inarg;
+	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+	pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+	int err;
+
+	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
+		/* NLM needs asynchronous locks, which we don't support yet */
+		return -ENOLCK;
+	}
+
+	/* Unlock on close is handled by the flush method */
+	if (fl->fl_flags & FL_CLOSE)
+		return 0;
+
+	fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg);
+	err = fuse_simple_request(fc, &args);
+
+	/* locking is restartable */
+	if (err == -EINTR)
+		err = -ERESTARTSYS;
+
+	return err;
+}
+
+static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err;
+
+	if (cmd == F_CANCELLK) {
+		err = 0;
+	} else if (cmd == F_GETLK) {
+		if (fc->no_lock) {
+			posix_test_lock(file, fl);
+			err = 0;
+		} else
+			err = fuse_getlk(file, fl);
+	} else {
+		if (fc->no_lock)
+			err = posix_lock_file(file, fl, NULL);
+		else
+			err = fuse_setlk(file, fl, 0);
+	}
+	return err;
+}
+
+static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err;
+
+	if (fc->no_flock) {
+		err = flock_lock_file_wait(file, fl);
+	} else {
+		struct fuse_file *ff = file->private_data;
+
+		/* emulate flock with POSIX locks */
+		ff->flock = true;
+		err = fuse_setlk(file, fl, 1);
+	}
+
+	return err;
+}
+
+static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_bmap_in inarg;
+	struct fuse_bmap_out outarg;
+	int err;
+
+	if (!inode->i_sb->s_bdev || fc->no_bmap)
+		return 0;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.block = block;
+	inarg.blocksize = inode->i_sb->s_blocksize;
+	args.in.h.opcode = FUSE_BMAP;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.out.numargs = 1;
+	args.out.args[0].size = sizeof(outarg);
+	args.out.args[0].value = &outarg;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS)
+		fc->no_bmap = 1;
+
+	return err ? 0 : outarg.block;
+}
+
+static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
+{
+	loff_t retval;
+	struct inode *inode = file_inode(file);
+
+	/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
+	if (whence == SEEK_CUR || whence == SEEK_SET)
+		return generic_file_llseek(file, offset, whence);
+
+	mutex_lock(&inode->i_mutex);
+	retval = fuse_update_attributes(inode, NULL, file, NULL);
+	if (!retval)
+		retval = generic_file_llseek(file, offset, whence);
+	mutex_unlock(&inode->i_mutex);
+
+	return retval;
+}
+
+static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
+			unsigned int nr_segs, size_t bytes, bool to_user)
+{
+	struct iov_iter ii;
+	int page_idx = 0;
+
+	if (!bytes)
+		return 0;
+
+	iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
+
+	while (iov_iter_count(&ii)) {
+		struct page *page = pages[page_idx++];
+		size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
+		void *kaddr;
+
+		kaddr = kmap(page);
+
+		while (todo) {
+			char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
+			size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+			size_t copy = min(todo, iov_len);
+			size_t left;
+
+			if (!to_user)
+				left = copy_from_user(kaddr, uaddr, copy);
+			else
+				left = copy_to_user(uaddr, kaddr, copy);
+
+			if (unlikely(left))
+				return -EFAULT;
+
+			iov_iter_advance(&ii, copy);
+			todo -= copy;
+			kaddr += copy;
+		}
+
+		kunmap(page);
+	}
+
+	return 0;
+}
+
+/*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit.  Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ */
+static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
+				     size_t transferred, unsigned count,
+				     bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+	if (count * sizeof(struct compat_iovec) == transferred) {
+		struct compat_iovec *ciov = src;
+		unsigned i;
+
+		/*
+		 * With this interface a 32bit server cannot support
+		 * non-compat (i.e. ones coming from 64bit apps) ioctl
+		 * requests
+		 */
+		if (!is_compat)
+			return -EINVAL;
+
+		for (i = 0; i < count; i++) {
+			dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+			dst[i].iov_len = ciov[i].iov_len;
+		}
+		return 0;
+	}
+#endif
+
+	if (count * sizeof(struct iovec) != transferred)
+		return -EIO;
+
+	memcpy(dst, src, transferred);
+	return 0;
+}
+
+/* Make sure iov_length() won't overflow */
+static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+{
+	size_t n;
+	u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+
+	for (n = 0; n < count; n++, iov++) {
+		if (iov->iov_len > (size_t) max)
+			return -ENOMEM;
+		max -= iov->iov_len;
+	}
+	return 0;
+}
+
+static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
+				 void *src, size_t transferred, unsigned count,
+				 bool is_compat)
+{
+	unsigned i;
+	struct fuse_ioctl_iovec *fiov = src;
+
+	if (fc->minor < 16) {
+		return fuse_copy_ioctl_iovec_old(dst, src, transferred,
+						 count, is_compat);
+	}
+
+	if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
+		return -EIO;
+
+	for (i = 0; i < count; i++) {
+		/* Did the server supply an inappropriate value? */
+		if (fiov[i].base != (unsigned long) fiov[i].base ||
+		    fiov[i].len != (unsigned long) fiov[i].len)
+			return -EIO;
+
+		dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
+		dst[i].iov_len = (size_t) fiov[i].len;
+
+#ifdef CONFIG_COMPAT
+		if (is_compat &&
+		    (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
+		     (compat_size_t) dst[i].iov_len != fiov[i].len))
+			return -EIO;
+#endif
+	}
+
+	return 0;
+}
+
+
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written.  Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs.  Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ *	char	*buf;
+ *	size_t	buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be NULL; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
+ *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY.  This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+		   unsigned int flags)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	struct fuse_ioctl_in inarg = {
+		.fh = ff->fh,
+		.cmd = cmd,
+		.arg = arg,
+		.flags = flags
+	};
+	struct fuse_ioctl_out outarg;
+	struct fuse_req *req = NULL;
+	struct page **pages = NULL;
+	struct iovec *iov_page = NULL;
+	struct iovec *in_iov = NULL, *out_iov = NULL;
+	unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
+	size_t in_size, out_size, transferred;
+	int err;
+
+#if BITS_PER_LONG == 32
+	inarg.flags |= FUSE_IOCTL_32BIT;
+#else
+	if (flags & FUSE_IOCTL_COMPAT)
+		inarg.flags |= FUSE_IOCTL_32BIT;
+#endif
+
+	/* assume all the iovs returned by client always fits in a page */
+	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+
+	err = -ENOMEM;
+	pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
+	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
+	if (!pages || !iov_page)
+		goto out;
+
+	/*
+	 * If restricted, initialize IO parameters as encoded in @cmd.
+	 * RETRY from server is not allowed.
+	 */
+	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+		struct iovec *iov = iov_page;
+
+		iov->iov_base = (void __user *)arg;
+		iov->iov_len = _IOC_SIZE(cmd);
+
+		if (_IOC_DIR(cmd) & _IOC_WRITE) {
+			in_iov = iov;
+			in_iovs = 1;
+		}
+
+		if (_IOC_DIR(cmd) & _IOC_READ) {
+			out_iov = iov;
+			out_iovs = 1;
+		}
+	}
+
+ retry:
+	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+	inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+
+	/*
+	 * Out data can be used either for actual out data or iovs,
+	 * make sure there always is at least one page.
+	 */
+	out_size = max_t(size_t, out_size, PAGE_SIZE);
+	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+
+	/* make sure there are enough buffer pages and init request with them */
+	err = -ENOMEM;
+	if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+		goto out;
+	while (num_pages < max_pages) {
+		pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (!pages[num_pages])
+			goto out;
+		num_pages++;
+	}
+
+	req = fuse_get_req(fc, num_pages);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		req = NULL;
+		goto out;
+	}
+	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
+	req->num_pages = num_pages;
+	fuse_page_descs_length_init(req, 0, req->num_pages);
+
+	/* okay, let's send it to the client */
+	req->in.h.opcode = FUSE_IOCTL;
+	req->in.h.nodeid = ff->nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	if (in_size) {
+		req->in.numargs++;
+		req->in.args[1].size = in_size;
+		req->in.argpages = 1;
+
+		err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
+					   false);
+		if (err)
+			goto out;
+	}
+
+	req->out.numargs = 2;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	req->out.args[1].size = out_size;
+	req->out.argpages = 1;
+	req->out.argvar = 1;
+
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	transferred = req->out.args[1].size;
+	fuse_put_request(fc, req);
+	req = NULL;
+	if (err)
+		goto out;
+
+	/* did it ask for retry? */
+	if (outarg.flags & FUSE_IOCTL_RETRY) {
+		void *vaddr;
+
+		/* no retry if in restricted mode */
+		err = -EIO;
+		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+			goto out;
+
+		in_iovs = outarg.in_iovs;
+		out_iovs = outarg.out_iovs;
+
+		/*
+		 * Make sure things are in boundary, separate checks
+		 * are to protect against overflow.
+		 */
+		err = -ENOMEM;
+		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+		    out_iovs > FUSE_IOCTL_MAX_IOV ||
+		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+			goto out;
+
+		vaddr = kmap_atomic(pages[0]);
+		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
+					    transferred, in_iovs + out_iovs,
+					    (flags & FUSE_IOCTL_COMPAT) != 0);
+		kunmap_atomic(vaddr);
+		if (err)
+			goto out;
+
+		in_iov = iov_page;
+		out_iov = in_iov + in_iovs;
+
+		err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+		if (err)
+			goto out;
+
+		err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+		if (err)
+			goto out;
+
+		goto retry;
+	}
+
+	err = -EIO;
+	if (transferred > inarg.out_size)
+		goto out;
+
+	err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
+ out:
+	if (req)
+		fuse_put_request(fc, req);
+	free_page((unsigned long) iov_page);
+	while (num_pages)
+		__free_page(pages[--num_pages]);
+	kfree(pages);
+
+	return err ? err : outarg.result;
+}
+EXPORT_SYMBOL_GPL(fuse_do_ioctl);
+
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+		       unsigned long arg, unsigned int flags)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (!fuse_allow_current_process(fc))
+		return -EACCES;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static long fuse_file_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	return fuse_ioctl_common(file, cmd, arg, 0);
+}
+
+static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+}
+
+/*
+ * All files which have been polled are linked to RB tree
+ * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
+ * find the matching one.
+ */
+static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
+					      struct rb_node **parent_out)
+{
+	struct rb_node **link = &fc->polled_files.rb_node;
+	struct rb_node *last = NULL;
+
+	while (*link) {
+		struct fuse_file *ff;
+
+		last = *link;
+		ff = rb_entry(last, struct fuse_file, polled_node);
+
+		if (kh < ff->kh)
+			link = &last->rb_left;
+		else if (kh > ff->kh)
+			link = &last->rb_right;
+		else
+			return link;
+	}
+
+	if (parent_out)
+		*parent_out = last;
+	return link;
+}
+
+/*
+ * The file is about to be polled.  Make sure it's on the polled_files
+ * RB tree.  Note that files once added to the polled_files tree are
+ * not removed before the file is released.  This is because a file
+ * polled once is likely to be polled again.
+ */
+static void fuse_register_polled_file(struct fuse_conn *fc,
+				      struct fuse_file *ff)
+{
+	spin_lock(&fc->lock);
+	if (RB_EMPTY_NODE(&ff->polled_node)) {
+		struct rb_node **link, *uninitialized_var(parent);
+
+		link = fuse_find_polled_node(fc, ff->kh, &parent);
+		BUG_ON(*link);
+		rb_link_node(&ff->polled_node, parent, link);
+		rb_insert_color(&ff->polled_node, &fc->polled_files);
+	}
+	spin_unlock(&fc->lock);
+}
+
+unsigned fuse_file_poll(struct file *file, poll_table *wait)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
+	struct fuse_poll_out outarg;
+	FUSE_ARGS(args);
+	int err;
+
+	if (fc->no_poll)
+		return DEFAULT_POLLMASK;
+
+	poll_wait(file, &ff->poll_wait, wait);
+	inarg.events = (__u32)poll_requested_events(wait);
+
+	/*
+	 * Ask for notification iff there's someone waiting for it.
+	 * The client may ignore the flag and always notify.
+	 */
+	if (waitqueue_active(&ff->poll_wait)) {
+		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
+		fuse_register_polled_file(fc, ff);
+	}
+
+	args.in.h.opcode = FUSE_POLL;
+	args.in.h.nodeid = ff->nodeid;
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.out.numargs = 1;
+	args.out.args[0].size = sizeof(outarg);
+	args.out.args[0].value = &outarg;
+	err = fuse_simple_request(fc, &args);
+
+	if (!err)
+		return outarg.revents;
+	if (err == -ENOSYS) {
+		fc->no_poll = 1;
+		return DEFAULT_POLLMASK;
+	}
+	return POLLERR;
+}
+EXPORT_SYMBOL_GPL(fuse_file_poll);
+
+/*
+ * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
+ * wakes up the poll waiters.
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+			    struct fuse_notify_poll_wakeup_out *outarg)
+{
+	u64 kh = outarg->kh;
+	struct rb_node **link;
+
+	spin_lock(&fc->lock);
+
+	link = fuse_find_polled_node(fc, kh, NULL);
+	if (*link) {
+		struct fuse_file *ff;
+
+		ff = rb_entry(*link, struct fuse_file, polled_node);
+		wake_up_interruptible_sync(&ff->poll_wait);
+	}
+
+	spin_unlock(&fc->lock);
+	return 0;
+}
+
+static void fuse_do_truncate(struct file *file)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct iattr attr;
+
+	attr.ia_valid = ATTR_SIZE;
+	attr.ia_size = i_size_read(inode);
+
+	attr.ia_file = file;
+	attr.ia_valid |= ATTR_FILE;
+
+	fuse_do_setattr(inode, &attr, file);
+}
+
+static inline loff_t fuse_round_up(loff_t off)
+{
+	return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+}
+
+static ssize_t
+fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	ssize_t ret = 0;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	bool async_dio = ff->fc->async_dio;
+	loff_t pos = 0;
+	struct inode *inode;
+	loff_t i_size;
+	size_t count = iov_iter_count(iter);
+	struct fuse_io_priv *io;
+
+	pos = offset;
+	inode = file->f_mapping->host;
+	i_size = i_size_read(inode);
+
+	if ((iov_iter_rw(iter) == READ) && (offset > i_size))
+		return 0;
+
+	/* optimization for short read */
+	if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
+		if (offset >= i_size)
+			return 0;
+		iov_iter_truncate(iter, fuse_round_up(i_size - offset));
+		count = iov_iter_count(iter);
+	}
+
+	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+	if (!io)
+		return -ENOMEM;
+	spin_lock_init(&io->lock);
+	io->reqs = 1;
+	io->bytes = -1;
+	io->size = 0;
+	io->offset = offset;
+	io->write = (iov_iter_rw(iter) == WRITE);
+	io->err = 0;
+	io->file = file;
+	/*
+	 * By default, we want to optimize all I/Os with async request
+	 * submission to the client filesystem if supported.
+	 */
+	io->async = async_dio;
+	io->iocb = iocb;
+
+	/*
+	 * We cannot asynchronously extend the size of a file. We have no method
+	 * to wait on real async I/O requests, so we must submit this request
+	 * synchronously.
+	 */
+	if (!is_sync_kiocb(iocb) && (offset + count > i_size) &&
+	    iov_iter_rw(iter) == WRITE)
+		io->async = false;
+
+	if (io->async && is_sync_kiocb(iocb))
+		io->done = &wait;
+
+	if (iov_iter_rw(iter) == WRITE) {
+		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
+		fuse_invalidate_attr(inode);
+	} else {
+		ret = __fuse_direct_read(io, iter, &pos);
+	}
+
+	if (io->async) {
+		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
+
+		/* we have a non-extending, async request, so return */
+		if (!is_sync_kiocb(iocb))
+			return -EIOCBQUEUED;
+
+		wait_for_completion(&wait);
+		ret = fuse_get_res_by_io(io);
+	}
+
+	kfree(io);
+
+	if (iov_iter_rw(iter) == WRITE) {
+		if (ret > 0)
+			fuse_write_update_size(inode, pos);
+		else if (ret < 0 && offset + count > i_size)
+			fuse_do_truncate(file);
+	}
+
+	return ret;
+}
+
+static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
+				loff_t length)
+{
+	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = ff->fc;
+	FUSE_ARGS(args);
+	struct fuse_fallocate_in inarg = {
+		.fh = ff->fh,
+		.offset = offset,
+		.length = length,
+		.mode = mode
+	};
+	int err;
+	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+			   (mode & FALLOC_FL_PUNCH_HOLE);
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (fc->no_fallocate)
+		return -EOPNOTSUPP;
+
+	if (lock_inode) {
+		mutex_lock(&inode->i_mutex);
+		if (mode & FALLOC_FL_PUNCH_HOLE) {
+			loff_t endbyte = offset + length - 1;
+			err = filemap_write_and_wait_range(inode->i_mapping,
+							   offset, endbyte);
+			if (err)
+				goto out;
+
+			fuse_sync_writes(inode);
+		}
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+	args.in.h.opcode = FUSE_FALLOCATE;
+	args.in.h.nodeid = ff->nodeid;
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {
+		fc->no_fallocate = 1;
+		err = -EOPNOTSUPP;
+	}
+	if (err)
+		goto out;
+
+	/* we could have extended the file */
+	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+		bool changed = fuse_write_update_size(inode, offset + length);
+
+		if (changed && fc->writeback_cache)
+			file_update_time(file);
+	}
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		truncate_pagecache_range(inode, offset, offset + length - 1);
+
+	fuse_invalidate_attr(inode);
+
+out:
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+	if (lock_inode)
+		mutex_unlock(&inode->i_mutex);
+
+	return err;
+}
+
+static const struct file_operations fuse_file_operations = {
+	.llseek		= fuse_file_llseek,
+	.read_iter	= fuse_file_read_iter,
+	.write_iter	= fuse_file_write_iter,
+	.mmap		= fuse_file_mmap,
+	.open		= fuse_open,
+	.flush		= fuse_flush,
+	.release	= fuse_release,
+	.fsync		= fuse_fsync,
+	.lock		= fuse_file_lock,
+	.flock		= fuse_file_flock,
+	.splice_read	= generic_file_splice_read,
+	.unlocked_ioctl	= fuse_file_ioctl,
+	.compat_ioctl	= fuse_file_compat_ioctl,
+	.poll		= fuse_file_poll,
+	.fallocate	= fuse_file_fallocate,
+};
+
+static const struct file_operations fuse_direct_io_file_operations = {
+	.llseek		= fuse_file_llseek,
+	.read_iter	= fuse_direct_read_iter,
+	.write_iter	= fuse_direct_write_iter,
+	.mmap		= fuse_direct_mmap,
+	.open		= fuse_open,
+	.flush		= fuse_flush,
+	.release	= fuse_release,
+	.fsync		= fuse_fsync,
+	.lock		= fuse_file_lock,
+	.flock		= fuse_file_flock,
+	.unlocked_ioctl	= fuse_file_ioctl,
+	.compat_ioctl	= fuse_file_compat_ioctl,
+	.poll		= fuse_file_poll,
+	.fallocate	= fuse_file_fallocate,
+	/* no splice_read */
+};
+
+static const struct address_space_operations fuse_file_aops  = {
+	.readpage	= fuse_readpage,
+	.writepage	= fuse_writepage,
+	.writepages	= fuse_writepages,
+	.launder_page	= fuse_launder_page,
+	.readpages	= fuse_readpages,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.bmap		= fuse_bmap,
+	.direct_IO	= fuse_direct_IO,
+	.write_begin	= fuse_write_begin,
+	.write_end	= fuse_write_end,
+};
+
+void fuse_init_file_inode(struct inode *inode)
+{
+	inode->i_fop = &fuse_file_operations;
+	inode->i_data.a_ops = &fuse_file_aops;
+}
diff --git a/kernel/fs/fuse/fuse_i.h b/kernel/fs/fuse/fuse_i.h
new file mode 100644
index 000000000..7354dc142
--- /dev/null
+++ b/kernel/fs/fuse/fuse_i.h
@@ -0,0 +1,912 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#ifndef _FS_FUSE_I_H
+#define _FS_FUSE_I_H
+
+#include <linux/fuse.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/poll.h>
+#include <linux/workqueue.h>
+
+/** Max number of pages that can be used in a single read request */
+#define FUSE_MAX_PAGES_PER_REQ 32
+
+/** Bias for fi->writectr, meaning new writepages must not be sent */
+#define FUSE_NOWRITE INT_MIN
+
+/** It could be as large as PATH_MAX, but would that have any uses? */
+#define FUSE_NAME_MAX 1024
+
+/** Number of dentries for each connection in the control filesystem */
+#define FUSE_CTL_NUM_DENTRIES 5
+
+/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
+    module will check permissions based on the file mode.  Otherwise no
+    permission checking is done in the kernel */
+#define FUSE_DEFAULT_PERMISSIONS (1 << 0)
+
+/** If the FUSE_ALLOW_OTHER flag is given, then not only the user
+    doing the mount will be allowed to access the filesystem */
+#define FUSE_ALLOW_OTHER         (1 << 1)
+
+/** Number of page pointers embedded in fuse_req */
+#define FUSE_REQ_INLINE_PAGES 1
+
+/** List of active connections */
+extern struct list_head fuse_conn_list;
+
+/** Global mutex protecting fuse_conn_list and the control filesystem */
+extern struct mutex fuse_mutex;
+
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
+/* One forget request */
+struct fuse_forget_link {
+	struct fuse_forget_one forget_one;
+	struct fuse_forget_link *next;
+};
+
+/** FUSE inode */
+struct fuse_inode {
+	/** Inode data */
+	struct inode inode;
+
+	/** Unique ID, which identifies the inode between userspace
+	 * and kernel */
+	u64 nodeid;
+
+	/** Number of lookups on this inode */
+	u64 nlookup;
+
+	/** The request used for sending the FORGET message */
+	struct fuse_forget_link *forget;
+
+	/** Time in jiffies until the file attributes are valid */
+	u64 i_time;
+
+	/** The sticky bit in inode->i_mode may have been removed, so
+	    preserve the original mode */
+	umode_t orig_i_mode;
+
+	/** 64 bit inode number */
+	u64 orig_ino;
+
+	/** Version of last attribute change */
+	u64 attr_version;
+
+	/** Files usable in writepage.  Protected by fc->lock */
+	struct list_head write_files;
+
+	/** Writepages pending on truncate or fsync */
+	struct list_head queued_writes;
+
+	/** Number of sent writes, a negative bias (FUSE_NOWRITE)
+	 * means more writes are blocked */
+	int writectr;
+
+	/** Waitq for writepage completion */
+	wait_queue_head_t page_waitq;
+
+	/** List of writepage requestst (pending or sent) */
+	struct list_head writepages;
+
+	/** Miscellaneous bits describing inode state */
+	unsigned long state;
+};
+
+/** FUSE inode state bits */
+enum {
+	/** Advise readdirplus  */
+	FUSE_I_ADVISE_RDPLUS,
+	/** Initialized with readdirplus */
+	FUSE_I_INIT_RDPLUS,
+	/** An operation changing file size is in progress  */
+	FUSE_I_SIZE_UNSTABLE,
+};
+
+struct fuse_conn;
+
+/** FUSE specific file data */
+struct fuse_file {
+	/** Fuse connection for this file */
+	struct fuse_conn *fc;
+
+	/** Request reserved for flush and release */
+	struct fuse_req *reserved_req;
+
+	/** Kernel file handle guaranteed to be unique */
+	u64 kh;
+
+	/** File handle used by userspace */
+	u64 fh;
+
+	/** Node id of this file */
+	u64 nodeid;
+
+	/** Refcount */
+	atomic_t count;
+
+	/** FOPEN_* flags returned by open */
+	u32 open_flags;
+
+	/** Entry on inode's write_files list */
+	struct list_head write_entry;
+
+	/** RB node to be linked on fuse_conn->polled_files */
+	struct rb_node polled_node;
+
+	/** Wait queue head for poll */
+	wait_queue_head_t poll_wait;
+
+	/** Has flock been performed on this file? */
+	bool flock:1;
+};
+
+/** One input argument of a request */
+struct fuse_in_arg {
+	unsigned size;
+	const void *value;
+};
+
+/** The request input */
+struct fuse_in {
+	/** The request header */
+	struct fuse_in_header h;
+
+	/** True if the data for the last argument is in req->pages */
+	unsigned argpages:1;
+
+	/** Number of arguments */
+	unsigned numargs;
+
+	/** Array of arguments */
+	struct fuse_in_arg args[3];
+};
+
+/** One output argument of a request */
+struct fuse_arg {
+	unsigned size;
+	void *value;
+};
+
+/** The request output */
+struct fuse_out {
+	/** Header returned from userspace */
+	struct fuse_out_header h;
+
+	/*
+	 * The following bitfields are not changed during the request
+	 * processing
+	 */
+
+	/** Last argument is variable length (can be shorter than
+	    arg->size) */
+	unsigned argvar:1;
+
+	/** Last argument is a list of pages to copy data to */
+	unsigned argpages:1;
+
+	/** Zero partially or not copied pages */
+	unsigned page_zeroing:1;
+
+	/** Pages may be replaced with new ones */
+	unsigned page_replace:1;
+
+	/** Number or arguments */
+	unsigned numargs;
+
+	/** Array of arguments */
+	struct fuse_arg args[2];
+};
+
+/** FUSE page descriptor */
+struct fuse_page_desc {
+	unsigned int length;
+	unsigned int offset;
+};
+
+struct fuse_args {
+	struct {
+		struct {
+			uint32_t opcode;
+			uint64_t nodeid;
+		} h;
+		unsigned numargs;
+		struct fuse_in_arg args[3];
+
+	} in;
+	struct {
+		unsigned argvar:1;
+		unsigned numargs;
+		struct fuse_arg args[2];
+	} out;
+};
+
+#define FUSE_ARGS(args) struct fuse_args args = {}
+
+/** The request state */
+enum fuse_req_state {
+	FUSE_REQ_INIT = 0,
+	FUSE_REQ_PENDING,
+	FUSE_REQ_READING,
+	FUSE_REQ_SENT,
+	FUSE_REQ_WRITING,
+	FUSE_REQ_FINISHED
+};
+
+/** The request IO state (for asynchronous processing) */
+struct fuse_io_priv {
+	int async;
+	spinlock_t lock;
+	unsigned reqs;
+	ssize_t bytes;
+	size_t size;
+	__u64 offset;
+	bool write;
+	int err;
+	struct kiocb *iocb;
+	struct file *file;
+	struct completion *done;
+};
+
+/**
+ * A request to the client
+ */
+struct fuse_req {
+	/** This can be on either pending processing or io lists in
+	    fuse_conn */
+	struct list_head list;
+
+	/** Entry on the interrupts list  */
+	struct list_head intr_entry;
+
+	/** refcount */
+	atomic_t count;
+
+	/** Unique ID for the interrupt request */
+	u64 intr_unique;
+
+	/*
+	 * The following bitfields are either set once before the
+	 * request is queued or setting/clearing them is protected by
+	 * fuse_conn->lock
+	 */
+
+	/** True if the request has reply */
+	unsigned isreply:1;
+
+	/** Force sending of the request even if interrupted */
+	unsigned force:1;
+
+	/** The request was aborted */
+	unsigned aborted:1;
+
+	/** Request is sent in the background */
+	unsigned background:1;
+
+	/** The request has been interrupted */
+	unsigned interrupted:1;
+
+	/** Data is being copied to/from the request */
+	unsigned locked:1;
+
+	/** Request is counted as "waiting" */
+	unsigned waiting:1;
+
+	/** State of the request */
+	enum fuse_req_state state;
+
+	/** The request input */
+	struct fuse_in in;
+
+	/** The request output */
+	struct fuse_out out;
+
+	/** Used to wake up the task waiting for completion of request*/
+	wait_queue_head_t waitq;
+
+	/** Data for asynchronous requests */
+	union {
+		struct {
+			struct fuse_release_in in;
+			struct inode *inode;
+		} release;
+		struct fuse_init_in init_in;
+		struct fuse_init_out init_out;
+		struct cuse_init_in cuse_init_in;
+		struct {
+			struct fuse_read_in in;
+			u64 attr_ver;
+		} read;
+		struct {
+			struct fuse_write_in in;
+			struct fuse_write_out out;
+			struct fuse_req *next;
+		} write;
+		struct fuse_notify_retrieve_in retrieve_in;
+	} misc;
+
+	/** page vector */
+	struct page **pages;
+
+	/** page-descriptor vector */
+	struct fuse_page_desc *page_descs;
+
+	/** size of the 'pages' array */
+	unsigned max_pages;
+
+	/** inline page vector */
+	struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
+
+	/** inline page-descriptor vector */
+	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
+
+	/** number of pages in vector */
+	unsigned num_pages;
+
+	/** File used in the request (or NULL) */
+	struct fuse_file *ff;
+
+	/** Inode used in the request or NULL */
+	struct inode *inode;
+
+	/** AIO control block */
+	struct fuse_io_priv *io;
+
+	/** Link on fi->writepages */
+	struct list_head writepages_entry;
+
+	/** Request completion callback */
+	void (*end)(struct fuse_conn *, struct fuse_req *);
+
+	/** Request is stolen from fuse_file->reserved_req */
+	struct file *stolen_file;
+};
+
+/**
+ * A Fuse connection.
+ *
+ * This structure is created, when the filesystem is mounted, and is
+ * destroyed, when the client device is closed and the filesystem is
+ * unmounted.
+ */
+struct fuse_conn {
+	/** Lock protecting accessess to  members of this structure */
+	spinlock_t lock;
+
+	/** Refcount */
+	atomic_t count;
+
+	struct rcu_head rcu;
+
+	/** The user id for this mount */
+	kuid_t user_id;
+
+	/** The group id for this mount */
+	kgid_t group_id;
+
+	/** The fuse mount flags for this mount */
+	unsigned flags;
+
+	/** Maximum read size */
+	unsigned max_read;
+
+	/** Maximum write size */
+	unsigned max_write;
+
+	/** Readers of the connection are waiting on this */
+	wait_queue_head_t waitq;
+
+	/** The list of pending requests */
+	struct list_head pending;
+
+	/** The list of requests being processed */
+	struct list_head processing;
+
+	/** The list of requests under I/O */
+	struct list_head io;
+
+	/** The next unique kernel file handle */
+	u64 khctr;
+
+	/** rbtree of fuse_files waiting for poll events indexed by ph */
+	struct rb_root polled_files;
+
+	/** Maximum number of outstanding background requests */
+	unsigned max_background;
+
+	/** Number of background requests at which congestion starts */
+	unsigned congestion_threshold;
+
+	/** Number of requests currently in the background */
+	unsigned num_background;
+
+	/** Number of background requests currently queued for userspace */
+	unsigned active_background;
+
+	/** The list of background requests set aside for later queuing */
+	struct list_head bg_queue;
+
+	/** Pending interrupts */
+	struct list_head interrupts;
+
+	/** Queue of pending forgets */
+	struct fuse_forget_link forget_list_head;
+	struct fuse_forget_link *forget_list_tail;
+
+	/** Batching of FORGET requests (positive indicates FORGET batch) */
+	int forget_batch;
+
+	/** Flag indicating that INIT reply has been received. Allocating
+	 * any fuse request will be suspended until the flag is set */
+	int initialized;
+
+	/** Flag indicating if connection is blocked.  This will be
+	    the case before the INIT reply is received, and if there
+	    are too many outstading backgrounds requests */
+	int blocked;
+
+	/** waitq for blocked connection */
+	wait_queue_head_t blocked_waitq;
+
+	/** waitq for reserved requests */
+	wait_queue_head_t reserved_req_waitq;
+
+	/** The next unique request id */
+	u64 reqctr;
+
+	/** Connection established, cleared on umount, connection
+	    abort and device release */
+	unsigned connected;
+
+	/** Connection failed (version mismatch).  Cannot race with
+	    setting other bitfields since it is only set once in INIT
+	    reply, before any other request, and never cleared */
+	unsigned conn_error:1;
+
+	/** Connection successful.  Only set in INIT */
+	unsigned conn_init:1;
+
+	/** Do readpages asynchronously?  Only set in INIT */
+	unsigned async_read:1;
+
+	/** Do not send separate SETATTR request before open(O_TRUNC)  */
+	unsigned atomic_o_trunc:1;
+
+	/** Filesystem supports NFS exporting.  Only set in INIT */
+	unsigned export_support:1;
+
+	/** Set if bdi is valid */
+	unsigned bdi_initialized:1;
+
+	/** write-back cache policy (default is write-through) */
+	unsigned writeback_cache:1;
+
+	/*
+	 * The following bitfields are only for optimization purposes
+	 * and hence races in setting them will not cause malfunction
+	 */
+
+	/** Is open/release not implemented by fs? */
+	unsigned no_open:1;
+
+	/** Is fsync not implemented by fs? */
+	unsigned no_fsync:1;
+
+	/** Is fsyncdir not implemented by fs? */
+	unsigned no_fsyncdir:1;
+
+	/** Is flush not implemented by fs? */
+	unsigned no_flush:1;
+
+	/** Is setxattr not implemented by fs? */
+	unsigned no_setxattr:1;
+
+	/** Is getxattr not implemented by fs? */
+	unsigned no_getxattr:1;
+
+	/** Is listxattr not implemented by fs? */
+	unsigned no_listxattr:1;
+
+	/** Is removexattr not implemented by fs? */
+	unsigned no_removexattr:1;
+
+	/** Are posix file locking primitives not implemented by fs? */
+	unsigned no_lock:1;
+
+	/** Is access not implemented by fs? */
+	unsigned no_access:1;
+
+	/** Is create not implemented by fs? */
+	unsigned no_create:1;
+
+	/** Is interrupt not implemented by fs? */
+	unsigned no_interrupt:1;
+
+	/** Is bmap not implemented by fs? */
+	unsigned no_bmap:1;
+
+	/** Is poll not implemented by fs? */
+	unsigned no_poll:1;
+
+	/** Do multi-page cached writes */
+	unsigned big_writes:1;
+
+	/** Don't apply umask to creation modes */
+	unsigned dont_mask:1;
+
+	/** Are BSD file locking primitives not implemented by fs? */
+	unsigned no_flock:1;
+
+	/** Is fallocate not implemented by fs? */
+	unsigned no_fallocate:1;
+
+	/** Is rename with flags implemented by fs? */
+	unsigned no_rename2:1;
+
+	/** Use enhanced/automatic page cache invalidation. */
+	unsigned auto_inval_data:1;
+
+	/** Does the filesystem support readdirplus? */
+	unsigned do_readdirplus:1;
+
+	/** Does the filesystem want adaptive readdirplus? */
+	unsigned readdirplus_auto:1;
+
+	/** Does the filesystem support asynchronous direct-IO submission? */
+	unsigned async_dio:1;
+
+	/** The number of requests waiting for completion */
+	atomic_t num_waiting;
+
+	/** Negotiated minor version */
+	unsigned minor;
+
+	/** Backing dev info */
+	struct backing_dev_info bdi;
+
+	/** Entry on the fuse_conn_list */
+	struct list_head entry;
+
+	/** Device ID from super block */
+	dev_t dev;
+
+	/** Dentries in the control filesystem */
+	struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
+
+	/** number of dentries used in the above array */
+	int ctl_ndents;
+
+	/** O_ASYNC requests */
+	struct fasync_struct *fasync;
+
+	/** Key for lock owner ID scrambling */
+	u32 scramble_key[4];
+
+	/** Reserved request for the DESTROY message */
+	struct fuse_req *destroy_req;
+
+	/** Version counter for attribute changes */
+	u64 attr_version;
+
+	/** Called on final put */
+	void (*release)(struct fuse_conn *);
+
+	/** Super block for this connection. */
+	struct super_block *sb;
+
+	/** Read/write semaphore to hold when accessing sb. */
+	struct rw_semaphore killsb;
+};
+
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
+{
+	return get_fuse_conn_super(inode->i_sb);
+}
+
+static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
+{
+	return container_of(inode, struct fuse_inode, inode);
+}
+
+static inline u64 get_node_id(struct inode *inode)
+{
+	return get_fuse_inode(inode)->nodeid;
+}
+
+/** Device operations */
+extern const struct file_operations fuse_dev_operations;
+
+extern const struct dentry_operations fuse_dentry_operations;
+
+/**
+ * Inode to nodeid comparison.
+ */
+int fuse_inode_eq(struct inode *inode, void *_nodeidp);
+
+/**
+ * Get a filled in inode
+ */
+struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
+			int generation, struct fuse_attr *attr,
+			u64 attr_valid, u64 attr_version);
+
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+		     struct fuse_entry_out *outarg, struct inode **inode);
+
+/**
+ * Send FORGET command
+ */
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
+		       u64 nodeid, u64 nlookup);
+
+struct fuse_forget_link *fuse_alloc_forget(void);
+
+/* Used by READDIRPLUS */
+void fuse_force_forget(struct file *file, u64 nodeid);
+
+/**
+ * Initialize READ or READDIR request
+ */
+void fuse_read_fill(struct fuse_req *req, struct file *file,
+		    loff_t pos, size_t count, int opcode);
+
+/**
+ * Send OPEN or OPENDIR request
+ */
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
+
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_get(struct fuse_file *ff);
+void fuse_file_free(struct fuse_file *ff);
+void fuse_finish_open(struct inode *inode, struct file *file);
+
+void fuse_sync_release(struct fuse_file *ff, int flags);
+
+/**
+ * Send RELEASE or RELEASEDIR request
+ */
+void fuse_release_common(struct file *file, int opcode);
+
+/**
+ * Send FSYNC or FSYNCDIR request
+ */
+int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
+		      int datasync, int isdir);
+
+/**
+ * Notify poll wakeup
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+			    struct fuse_notify_poll_wakeup_out *outarg);
+
+/**
+ * Initialize file operations on a regular file
+ */
+void fuse_init_file_inode(struct inode *inode);
+
+/**
+ * Initialize inode operations on regular files and special files
+ */
+void fuse_init_common(struct inode *inode);
+
+/**
+ * Initialize inode and file operations on a directory
+ */
+void fuse_init_dir(struct inode *inode);
+
+/**
+ * Initialize inode operations on a symlink
+ */
+void fuse_init_symlink(struct inode *inode);
+
+/**
+ * Change attributes of an inode
+ */
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+			    u64 attr_valid, u64 attr_version);
+
+void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+				   u64 attr_valid);
+
+/**
+ * Initialize the client device
+ */
+int fuse_dev_init(void);
+
+/**
+ * Cleanup the client device
+ */
+void fuse_dev_cleanup(void);
+
+int fuse_ctl_init(void);
+void __exit fuse_ctl_cleanup(void);
+
+/**
+ * Allocate a request
+ */
+struct fuse_req *fuse_request_alloc(unsigned npages);
+
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
+
+/**
+ * Free a request
+ */
+void fuse_request_free(struct fuse_req *req);
+
+/**
+ * Get a request, may fail with -ENOMEM,
+ * caller should specify # elements in req->pages[] explicitly
+ */
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+					     unsigned npages);
+
+/*
+ * Increment reference count on request
+ */
+void __fuse_get_request(struct fuse_req *req);
+
+/**
+ * Gets a requests for a file operation, always succeeds
+ */
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+					     struct file *file);
+
+/**
+ * Decrement reference count of a request.  If count goes to zero free
+ * the request.
+ */
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request (synchronous)
+ */
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Simple request sending that does request allocation and freeing
+ */
+ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
+
+/**
+ * Send a request in the background
+ */
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+					 struct fuse_req *req);
+
+/* Abort all requests */
+void fuse_abort_conn(struct fuse_conn *fc);
+
+/**
+ * Invalidate inode attributes
+ */
+void fuse_invalidate_attr(struct inode *inode);
+
+void fuse_invalidate_entry_cache(struct dentry *entry);
+
+void fuse_invalidate_atime(struct inode *inode);
+
+/**
+ * Acquire reference to fuse_conn
+ */
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+
+/**
+ * Initialize fuse_conn
+ */
+void fuse_conn_init(struct fuse_conn *fc);
+
+/**
+ * Release reference to fuse_conn
+ */
+void fuse_conn_put(struct fuse_conn *fc);
+
+/**
+ * Add connection to control filesystem
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc);
+
+/**
+ * Remove connection from control filesystem
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc);
+
+/**
+ * Is file type valid?
+ */
+int fuse_valid_type(int m);
+
+/**
+ * Is current process allowed to perform filesystem operation?
+ */
+int fuse_allow_current_process(struct fuse_conn *fc);
+
+u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+
+int fuse_update_attributes(struct inode *inode, struct kstat *stat,
+			   struct file *file, bool *refreshed);
+
+void fuse_flush_writepages(struct inode *inode);
+
+void fuse_set_nowrite(struct inode *inode);
+void fuse_release_nowrite(struct inode *inode);
+
+u64 fuse_get_attr_version(struct fuse_conn *fc);
+
+/**
+ * File-system tells the kernel to invalidate cache for the given node id.
+ */
+int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+			     loff_t offset, loff_t len);
+
+/**
+ * File-system tells the kernel to invalidate parent attributes and
+ * the dentry matching parent/name.
+ *
+ * If the child_nodeid is non-zero and:
+ *    - matches the inode number for the dentry matching parent/name,
+ *    - is not a mount point
+ *    - is a file or oan empty directory
+ * then the dentry is unhashed (d_delete()).
+ */
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+			     u64 child_nodeid, struct qstr *name);
+
+int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+		 bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE  (1 << 1)
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+		       loff_t *ppos, int flags);
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+		   unsigned int flags);
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+		       unsigned long arg, unsigned int flags);
+unsigned fuse_file_poll(struct file *file, poll_table *wait);
+int fuse_dev_release(struct inode *inode, struct file *file);
+
+bool fuse_write_update_size(struct inode *inode, loff_t pos);
+
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
+
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+		    struct file *file);
+
+void fuse_set_initialized(struct fuse_conn *fc);
+
+#endif /* _FS_FUSE_I_H */
diff --git a/kernel/fs/fuse/inode.c b/kernel/fs/fuse/inode.c
new file mode 100644
index 000000000..18dacf9ed
--- /dev/null
+++ b/kernel/fs/fuse/inode.c
@@ -0,0 +1,1320 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/parser.h>
+#include <linux/statfs.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/exportfs.h>
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Filesystem in Userspace");
+MODULE_LICENSE("GPL");
+
+static struct kmem_cache *fuse_inode_cachep;
+struct list_head fuse_conn_list;
+DEFINE_MUTEX(fuse_mutex);
+
+static int set_global_limit(const char *val, struct kernel_param *kp);
+
+unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+		  &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+		  &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
+#define FUSE_SUPER_MAGIC 0x65735546
+
+#define FUSE_DEFAULT_BLKSIZE 512
+
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
+struct fuse_mount_data {
+	int fd;
+	unsigned rootmode;
+	kuid_t user_id;
+	kgid_t group_id;
+	unsigned fd_present:1;
+	unsigned rootmode_present:1;
+	unsigned user_id_present:1;
+	unsigned group_id_present:1;
+	unsigned flags;
+	unsigned max_read;
+	unsigned blksize;
+};
+
+struct fuse_forget_link *fuse_alloc_forget(void)
+{
+	return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+}
+
+static struct inode *fuse_alloc_inode(struct super_block *sb)
+{
+	struct inode *inode;
+	struct fuse_inode *fi;
+
+	inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
+	if (!inode)
+		return NULL;
+
+	fi = get_fuse_inode(inode);
+	fi->i_time = 0;
+	fi->nodeid = 0;
+	fi->nlookup = 0;
+	fi->attr_version = 0;
+	fi->writectr = 0;
+	fi->orig_ino = 0;
+	fi->state = 0;
+	INIT_LIST_HEAD(&fi->write_files);
+	INIT_LIST_HEAD(&fi->queued_writes);
+	INIT_LIST_HEAD(&fi->writepages);
+	init_waitqueue_head(&fi->page_waitq);
+	fi->forget = fuse_alloc_forget();
+	if (!fi->forget) {
+		kmem_cache_free(fuse_inode_cachep, inode);
+		return NULL;
+	}
+
+	return inode;
+}
+
+static void fuse_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(fuse_inode_cachep, inode);
+}
+
+static void fuse_destroy_inode(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	BUG_ON(!list_empty(&fi->write_files));
+	BUG_ON(!list_empty(&fi->queued_writes));
+	kfree(fi->forget);
+	call_rcu(&inode->i_rcu, fuse_i_callback);
+}
+
+static void fuse_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	if (inode->i_sb->s_flags & MS_ACTIVE) {
+		struct fuse_conn *fc = get_fuse_conn(inode);
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
+		fi->forget = NULL;
+	}
+}
+
+static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	if (*flags & MS_MANDLOCK)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
+ * so that it will fit.
+ */
+static ino_t fuse_squash_ino(u64 ino64)
+{
+	ino_t ino = (ino_t) ino64;
+	if (sizeof(ino_t) < sizeof(u64))
+		ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8;
+	return ino;
+}
+
+void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+				   u64 attr_valid)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	fi->attr_version = ++fc->attr_version;
+	fi->i_time = attr_valid;
+
+	inode->i_ino     = fuse_squash_ino(attr->ino);
+	inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
+	set_nlink(inode, attr->nlink);
+	inode->i_uid     = make_kuid(&init_user_ns, attr->uid);
+	inode->i_gid     = make_kgid(&init_user_ns, attr->gid);
+	inode->i_blocks  = attr->blocks;
+	inode->i_atime.tv_sec   = attr->atime;
+	inode->i_atime.tv_nsec  = attr->atimensec;
+	/* mtime from server may be stale due to local buffered write */
+	if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+		inode->i_mtime.tv_sec   = attr->mtime;
+		inode->i_mtime.tv_nsec  = attr->mtimensec;
+		inode->i_ctime.tv_sec   = attr->ctime;
+		inode->i_ctime.tv_nsec  = attr->ctimensec;
+	}
+
+	if (attr->blksize != 0)
+		inode->i_blkbits = ilog2(attr->blksize);
+	else
+		inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+
+	/*
+	 * Don't set the sticky bit in i_mode, unless we want the VFS
+	 * to check permissions.  This prevents failures due to the
+	 * check in may_delete().
+	 */
+	fi->orig_i_mode = inode->i_mode;
+	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+		inode->i_mode &= ~S_ISVTX;
+
+	fi->orig_ino = attr->ino;
+}
+
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+			    u64 attr_valid, u64 attr_version)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool is_wb = fc->writeback_cache;
+	loff_t oldsize;
+	struct timespec old_mtime;
+
+	spin_lock(&fc->lock);
+	if ((attr_version != 0 && fi->attr_version > attr_version) ||
+	    test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
+		spin_unlock(&fc->lock);
+		return;
+	}
+
+	old_mtime = inode->i_mtime;
+	fuse_change_attributes_common(inode, attr, attr_valid);
+
+	oldsize = inode->i_size;
+	/*
+	 * In case of writeback_cache enabled, the cached writes beyond EOF
+	 * extend local i_size without keeping userspace server in sync. So,
+	 * attr->size coming from server can be stale. We cannot trust it.
+	 */
+	if (!is_wb || !S_ISREG(inode->i_mode))
+		i_size_write(inode, attr->size);
+	spin_unlock(&fc->lock);
+
+	if (!is_wb && S_ISREG(inode->i_mode)) {
+		bool inval = false;
+
+		if (oldsize != attr->size) {
+			truncate_pagecache(inode, attr->size);
+			inval = true;
+		} else if (fc->auto_inval_data) {
+			struct timespec new_mtime = {
+				.tv_sec = attr->mtime,
+				.tv_nsec = attr->mtimensec,
+			};
+
+			/*
+			 * Auto inval mode also checks and invalidates if mtime
+			 * has changed.
+			 */
+			if (!timespec_equal(&old_mtime, &new_mtime))
+				inval = true;
+		}
+
+		if (inval)
+			invalidate_inode_pages2(inode->i_mapping);
+	}
+}
+
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+{
+	inode->i_mode = attr->mode & S_IFMT;
+	inode->i_size = attr->size;
+	inode->i_mtime.tv_sec  = attr->mtime;
+	inode->i_mtime.tv_nsec = attr->mtimensec;
+	inode->i_ctime.tv_sec  = attr->ctime;
+	inode->i_ctime.tv_nsec = attr->ctimensec;
+	if (S_ISREG(inode->i_mode)) {
+		fuse_init_common(inode);
+		fuse_init_file_inode(inode);
+	} else if (S_ISDIR(inode->i_mode))
+		fuse_init_dir(inode);
+	else if (S_ISLNK(inode->i_mode))
+		fuse_init_symlink(inode);
+	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+		 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+		fuse_init_common(inode);
+		init_special_inode(inode, inode->i_mode,
+				   new_decode_dev(attr->rdev));
+	} else
+		BUG();
+}
+
+int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+{
+	u64 nodeid = *(u64 *) _nodeidp;
+	if (get_node_id(inode) == nodeid)
+		return 1;
+	else
+		return 0;
+}
+
+static int fuse_inode_set(struct inode *inode, void *_nodeidp)
+{
+	u64 nodeid = *(u64 *) _nodeidp;
+	get_fuse_inode(inode)->nodeid = nodeid;
+	return 0;
+}
+
+struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
+			int generation, struct fuse_attr *attr,
+			u64 attr_valid, u64 attr_version)
+{
+	struct inode *inode;
+	struct fuse_inode *fi;
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ retry:
+	inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
+	if (!inode)
+		return NULL;
+
+	if ((inode->i_state & I_NEW)) {
+		inode->i_flags |= S_NOATIME;
+		if (!fc->writeback_cache || !S_ISREG(attr->mode))
+			inode->i_flags |= S_NOCMTIME;
+		inode->i_generation = generation;
+		fuse_init_inode(inode, attr);
+		unlock_new_inode(inode);
+	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
+		/* Inode has changed type, any I/O on the old should fail */
+		make_bad_inode(inode);
+		iput(inode);
+		goto retry;
+	}
+
+	fi = get_fuse_inode(inode);
+	spin_lock(&fc->lock);
+	fi->nlookup++;
+	spin_unlock(&fc->lock);
+	fuse_change_attributes(inode, attr, attr_valid, attr_version);
+
+	return inode;
+}
+
+int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+			     loff_t offset, loff_t len)
+{
+	struct inode *inode;
+	pgoff_t pg_start;
+	pgoff_t pg_end;
+
+	inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+	if (!inode)
+		return -ENOENT;
+
+	fuse_invalidate_attr(inode);
+	if (offset >= 0) {
+		pg_start = offset >> PAGE_CACHE_SHIFT;
+		if (len <= 0)
+			pg_end = -1;
+		else
+			pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+		invalidate_inode_pages2_range(inode->i_mapping,
+					      pg_start, pg_end);
+	}
+	iput(inode);
+	return 0;
+}
+
+static void fuse_umount_begin(struct super_block *sb)
+{
+	fuse_abort_conn(get_fuse_conn_super(sb));
+}
+
+static void fuse_send_destroy(struct fuse_conn *fc)
+{
+	struct fuse_req *req = fc->destroy_req;
+	if (req && fc->conn_init) {
+		fc->destroy_req = NULL;
+		req->in.h.opcode = FUSE_DESTROY;
+		req->force = 1;
+		req->background = 0;
+		fuse_request_send(fc, req);
+		fuse_put_request(fc, req);
+	}
+}
+
+static void fuse_bdi_destroy(struct fuse_conn *fc)
+{
+	if (fc->bdi_initialized)
+		bdi_destroy(&fc->bdi);
+}
+
+static void fuse_put_super(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	fuse_send_destroy(fc);
+
+	fuse_abort_conn(fc);
+	mutex_lock(&fuse_mutex);
+	list_del(&fc->entry);
+	fuse_ctl_remove_conn(fc);
+	mutex_unlock(&fuse_mutex);
+	fuse_bdi_destroy(fc);
+
+	fuse_conn_put(fc);
+}
+
+static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
+{
+	stbuf->f_type    = FUSE_SUPER_MAGIC;
+	stbuf->f_bsize   = attr->bsize;
+	stbuf->f_frsize  = attr->frsize;
+	stbuf->f_blocks  = attr->blocks;
+	stbuf->f_bfree   = attr->bfree;
+	stbuf->f_bavail  = attr->bavail;
+	stbuf->f_files   = attr->files;
+	stbuf->f_ffree   = attr->ffree;
+	stbuf->f_namelen = attr->namelen;
+	/* fsid is left zero */
+}
+
+static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	FUSE_ARGS(args);
+	struct fuse_statfs_out outarg;
+	int err;
+
+	if (!fuse_allow_current_process(fc)) {
+		buf->f_type = FUSE_SUPER_MAGIC;
+		return 0;
+	}
+
+	memset(&outarg, 0, sizeof(outarg));
+	args.in.numargs = 0;
+	args.in.h.opcode = FUSE_STATFS;
+	args.in.h.nodeid = get_node_id(d_inode(dentry));
+	args.out.numargs = 1;
+	args.out.args[0].size = sizeof(outarg);
+	args.out.args[0].value = &outarg;
+	err = fuse_simple_request(fc, &args);
+	if (!err)
+		convert_fuse_statfs(buf, &outarg.st);
+	return err;
+}
+
+enum {
+	OPT_FD,
+	OPT_ROOTMODE,
+	OPT_USER_ID,
+	OPT_GROUP_ID,
+	OPT_DEFAULT_PERMISSIONS,
+	OPT_ALLOW_OTHER,
+	OPT_MAX_READ,
+	OPT_BLKSIZE,
+	OPT_ERR
+};
+
+static const match_table_t tokens = {
+	{OPT_FD,			"fd=%u"},
+	{OPT_ROOTMODE,			"rootmode=%o"},
+	{OPT_USER_ID,			"user_id=%u"},
+	{OPT_GROUP_ID,			"group_id=%u"},
+	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
+	{OPT_ALLOW_OTHER,		"allow_other"},
+	{OPT_MAX_READ,			"max_read=%u"},
+	{OPT_BLKSIZE,			"blksize=%u"},
+	{OPT_ERR,			NULL}
+};
+
+static int fuse_match_uint(substring_t *s, unsigned int *res)
+{
+	int err = -ENOMEM;
+	char *buf = match_strdup(s);
+	if (buf) {
+		err = kstrtouint(buf, 10, res);
+		kfree(buf);
+	}
+	return err;
+}
+
+static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
+{
+	char *p;
+	memset(d, 0, sizeof(struct fuse_mount_data));
+	d->max_read = ~0;
+	d->blksize = FUSE_DEFAULT_BLKSIZE;
+
+	while ((p = strsep(&opt, ",")) != NULL) {
+		int token;
+		int value;
+		unsigned uv;
+		substring_t args[MAX_OPT_ARGS];
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case OPT_FD:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->fd = value;
+			d->fd_present = 1;
+			break;
+
+		case OPT_ROOTMODE:
+			if (match_octal(&args[0], &value))
+				return 0;
+			if (!fuse_valid_type(value))
+				return 0;
+			d->rootmode = value;
+			d->rootmode_present = 1;
+			break;
+
+		case OPT_USER_ID:
+			if (fuse_match_uint(&args[0], &uv))
+				return 0;
+			d->user_id = make_kuid(current_user_ns(), uv);
+			if (!uid_valid(d->user_id))
+				return 0;
+			d->user_id_present = 1;
+			break;
+
+		case OPT_GROUP_ID:
+			if (fuse_match_uint(&args[0], &uv))
+				return 0;
+			d->group_id = make_kgid(current_user_ns(), uv);
+			if (!gid_valid(d->group_id))
+				return 0;
+			d->group_id_present = 1;
+			break;
+
+		case OPT_DEFAULT_PERMISSIONS:
+			d->flags |= FUSE_DEFAULT_PERMISSIONS;
+			break;
+
+		case OPT_ALLOW_OTHER:
+			d->flags |= FUSE_ALLOW_OTHER;
+			break;
+
+		case OPT_MAX_READ:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->max_read = value;
+			break;
+
+		case OPT_BLKSIZE:
+			if (!is_bdev || match_int(&args[0], &value))
+				return 0;
+			d->blksize = value;
+			break;
+
+		default:
+			return 0;
+		}
+	}
+
+	if (!d->fd_present || !d->rootmode_present ||
+	    !d->user_id_present || !d->group_id_present)
+		return 0;
+
+	return 1;
+}
+
+static int fuse_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
+	seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
+	if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
+		seq_puts(m, ",default_permissions");
+	if (fc->flags & FUSE_ALLOW_OTHER)
+		seq_puts(m, ",allow_other");
+	if (fc->max_read != ~0)
+		seq_printf(m, ",max_read=%u", fc->max_read);
+	if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+		seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+	return 0;
+}
+
+void fuse_conn_init(struct fuse_conn *fc)
+{
+	memset(fc, 0, sizeof(*fc));
+	spin_lock_init(&fc->lock);
+	init_rwsem(&fc->killsb);
+	atomic_set(&fc->count, 1);
+	init_waitqueue_head(&fc->waitq);
+	init_waitqueue_head(&fc->blocked_waitq);
+	init_waitqueue_head(&fc->reserved_req_waitq);
+	INIT_LIST_HEAD(&fc->pending);
+	INIT_LIST_HEAD(&fc->processing);
+	INIT_LIST_HEAD(&fc->io);
+	INIT_LIST_HEAD(&fc->interrupts);
+	INIT_LIST_HEAD(&fc->bg_queue);
+	INIT_LIST_HEAD(&fc->entry);
+	fc->forget_list_tail = &fc->forget_list_head;
+	atomic_set(&fc->num_waiting, 0);
+	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
+	fc->khctr = 0;
+	fc->polled_files = RB_ROOT;
+	fc->reqctr = 0;
+	fc->blocked = 0;
+	fc->initialized = 0;
+	fc->attr_version = 1;
+	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+}
+EXPORT_SYMBOL_GPL(fuse_conn_init);
+
+void fuse_conn_put(struct fuse_conn *fc)
+{
+	if (atomic_dec_and_test(&fc->count)) {
+		if (fc->destroy_req)
+			fuse_request_free(fc->destroy_req);
+		fc->release(fc);
+	}
+}
+EXPORT_SYMBOL_GPL(fuse_conn_put);
+
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
+{
+	atomic_inc(&fc->count);
+	return fc;
+}
+EXPORT_SYMBOL_GPL(fuse_conn_get);
+
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
+{
+	struct fuse_attr attr;
+	memset(&attr, 0, sizeof(attr));
+
+	attr.mode = mode;
+	attr.ino = FUSE_ROOT_ID;
+	attr.nlink = 1;
+	return fuse_iget(sb, 1, 0, &attr, 0, 0);
+}
+
+struct fuse_inode_handle {
+	u64 nodeid;
+	u32 generation;
+};
+
+static struct dentry *fuse_get_dentry(struct super_block *sb,
+				      struct fuse_inode_handle *handle)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	struct inode *inode;
+	struct dentry *entry;
+	int err = -ESTALE;
+
+	if (handle->nodeid == 0)
+		goto out_err;
+
+	inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid);
+	if (!inode) {
+		struct fuse_entry_out outarg;
+		struct qstr name;
+
+		if (!fc->export_support)
+			goto out_err;
+
+		name.len = 1;
+		name.name = ".";
+		err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg,
+				       &inode);
+		if (err && err != -ENOENT)
+			goto out_err;
+		if (err || !inode) {
+			err = -ESTALE;
+			goto out_err;
+		}
+		err = -EIO;
+		if (get_node_id(inode) != handle->nodeid)
+			goto out_iput;
+	}
+	err = -ESTALE;
+	if (inode->i_generation != handle->generation)
+		goto out_iput;
+
+	entry = d_obtain_alias(inode);
+	if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
+		fuse_invalidate_entry_cache(entry);
+
+	return entry;
+
+ out_iput:
+	iput(inode);
+ out_err:
+	return ERR_PTR(err);
+}
+
+static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+			   struct inode *parent)
+{
+	int len = parent ? 6 : 3;
+	u64 nodeid;
+	u32 generation;
+
+	if (*max_len < len) {
+		*max_len = len;
+		return  FILEID_INVALID;
+	}
+
+	nodeid = get_fuse_inode(inode)->nodeid;
+	generation = inode->i_generation;
+
+	fh[0] = (u32)(nodeid >> 32);
+	fh[1] = (u32)(nodeid & 0xffffffff);
+	fh[2] = generation;
+
+	if (parent) {
+		nodeid = get_fuse_inode(parent)->nodeid;
+		generation = parent->i_generation;
+
+		fh[3] = (u32)(nodeid >> 32);
+		fh[4] = (u32)(nodeid & 0xffffffff);
+		fh[5] = generation;
+	}
+
+	*max_len = len;
+	return parent ? 0x82 : 0x81;
+}
+
+static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct fuse_inode_handle handle;
+
+	if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+		return NULL;
+
+	handle.nodeid = (u64) fid->raw[0] << 32;
+	handle.nodeid |= (u64) fid->raw[1];
+	handle.generation = fid->raw[2];
+	return fuse_get_dentry(sb, &handle);
+}
+
+static struct dentry *fuse_fh_to_parent(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct fuse_inode_handle parent;
+
+	if (fh_type != 0x82 || fh_len < 6)
+		return NULL;
+
+	parent.nodeid = (u64) fid->raw[3] << 32;
+	parent.nodeid |= (u64) fid->raw[4];
+	parent.generation = fid->raw[5];
+	return fuse_get_dentry(sb, &parent);
+}
+
+static struct dentry *fuse_get_parent(struct dentry *child)
+{
+	struct inode *child_inode = d_inode(child);
+	struct fuse_conn *fc = get_fuse_conn(child_inode);
+	struct inode *inode;
+	struct dentry *parent;
+	struct fuse_entry_out outarg;
+	struct qstr name;
+	int err;
+
+	if (!fc->export_support)
+		return ERR_PTR(-ESTALE);
+
+	name.len = 2;
+	name.name = "..";
+	err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
+			       &name, &outarg, &inode);
+	if (err) {
+		if (err == -ENOENT)
+			return ERR_PTR(-ESTALE);
+		return ERR_PTR(err);
+	}
+
+	parent = d_obtain_alias(inode);
+	if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
+		fuse_invalidate_entry_cache(parent);
+
+	return parent;
+}
+
+static const struct export_operations fuse_export_operations = {
+	.fh_to_dentry	= fuse_fh_to_dentry,
+	.fh_to_parent	= fuse_fh_to_parent,
+	.encode_fh	= fuse_encode_fh,
+	.get_parent	= fuse_get_parent,
+};
+
+static const struct super_operations fuse_super_operations = {
+	.alloc_inode    = fuse_alloc_inode,
+	.destroy_inode  = fuse_destroy_inode,
+	.evict_inode	= fuse_evict_inode,
+	.write_inode	= fuse_write_inode,
+	.drop_inode	= generic_delete_inode,
+	.remount_fs	= fuse_remount_fs,
+	.put_super	= fuse_put_super,
+	.umount_begin	= fuse_umount_begin,
+	.statfs		= fuse_statfs,
+	.show_options	= fuse_show_options,
+};
+
+static void sanitize_global_limit(unsigned *limit)
+{
+	if (*limit == 0)
+		*limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
+			 sizeof(struct fuse_req);
+
+	if (*limit >= 1 << 16)
+		*limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, struct kernel_param *kp)
+{
+	int rv;
+
+	rv = param_set_uint(val, kp);
+	if (rv)
+		return rv;
+
+	sanitize_global_limit((unsigned *)kp->arg);
+
+	return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+	int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+	if (arg->minor < 13)
+		return;
+
+	sanitize_global_limit(&max_user_bgreq);
+	sanitize_global_limit(&max_user_congthresh);
+
+	if (arg->max_background) {
+		fc->max_background = arg->max_background;
+
+		if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+			fc->max_background = max_user_bgreq;
+	}
+	if (arg->congestion_threshold) {
+		fc->congestion_threshold = arg->congestion_threshold;
+
+		if (!cap_sys_admin &&
+		    fc->congestion_threshold > max_user_congthresh)
+			fc->congestion_threshold = max_user_congthresh;
+	}
+}
+
+static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct fuse_init_out *arg = &req->misc.init_out;
+
+	if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
+		fc->conn_error = 1;
+	else {
+		unsigned long ra_pages;
+
+		process_init_limits(fc, arg);
+
+		if (arg->minor >= 6) {
+			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
+			if (arg->flags & FUSE_ASYNC_READ)
+				fc->async_read = 1;
+			if (!(arg->flags & FUSE_POSIX_LOCKS))
+				fc->no_lock = 1;
+			if (arg->minor >= 17) {
+				if (!(arg->flags & FUSE_FLOCK_LOCKS))
+					fc->no_flock = 1;
+			} else {
+				if (!(arg->flags & FUSE_POSIX_LOCKS))
+					fc->no_flock = 1;
+			}
+			if (arg->flags & FUSE_ATOMIC_O_TRUNC)
+				fc->atomic_o_trunc = 1;
+			if (arg->minor >= 9) {
+				/* LOOKUP has dependency on proto version */
+				if (arg->flags & FUSE_EXPORT_SUPPORT)
+					fc->export_support = 1;
+			}
+			if (arg->flags & FUSE_BIG_WRITES)
+				fc->big_writes = 1;
+			if (arg->flags & FUSE_DONT_MASK)
+				fc->dont_mask = 1;
+			if (arg->flags & FUSE_AUTO_INVAL_DATA)
+				fc->auto_inval_data = 1;
+			if (arg->flags & FUSE_DO_READDIRPLUS) {
+				fc->do_readdirplus = 1;
+				if (arg->flags & FUSE_READDIRPLUS_AUTO)
+					fc->readdirplus_auto = 1;
+			}
+			if (arg->flags & FUSE_ASYNC_DIO)
+				fc->async_dio = 1;
+			if (arg->flags & FUSE_WRITEBACK_CACHE)
+				fc->writeback_cache = 1;
+			if (arg->time_gran && arg->time_gran <= 1000000000)
+				fc->sb->s_time_gran = arg->time_gran;
+		} else {
+			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+			fc->no_lock = 1;
+			fc->no_flock = 1;
+		}
+
+		fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
+		fc->minor = arg->minor;
+		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
+		fc->max_write = max_t(unsigned, 4096, fc->max_write);
+		fc->conn_init = 1;
+	}
+	fuse_set_initialized(fc);
+	wake_up_all(&fc->blocked_waitq);
+}
+
+static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct fuse_init_in *arg = &req->misc.init_in;
+
+	arg->major = FUSE_KERNEL_VERSION;
+	arg->minor = FUSE_KERNEL_MINOR_VERSION;
+	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
+	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
+		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
+		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
+		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
+		FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT;
+	req->in.h.opcode = FUSE_INIT;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*arg);
+	req->in.args[0].value = arg;
+	req->out.numargs = 1;
+	/* Variable length argument used for backward compatibility
+	   with interface version < 7.5.  Rest of init_out is zeroed
+	   by do_get_request(), so a short reply is not a problem */
+	req->out.argvar = 1;
+	req->out.args[0].size = sizeof(struct fuse_init_out);
+	req->out.args[0].value = &req->misc.init_out;
+	req->end = process_init_reply;
+	fuse_request_send_background(fc, req);
+}
+
+static void fuse_free_conn(struct fuse_conn *fc)
+{
+	kfree_rcu(fc, rcu);
+}
+
+static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
+{
+	int err;
+
+	fc->bdi.name = "fuse";
+	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+	/* fuse does it's own writeback accounting */
+	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
+
+	err = bdi_init(&fc->bdi);
+	if (err)
+		return err;
+
+	fc->bdi_initialized = 1;
+
+	if (sb->s_bdev) {
+		err =  bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+				    MAJOR(fc->dev), MINOR(fc->dev));
+	} else {
+		err = bdi_register_dev(&fc->bdi, fc->dev);
+	}
+
+	if (err)
+		return err;
+
+	/*
+	 * For a single fuse filesystem use max 1% of dirty +
+	 * writeback threshold.
+	 *
+	 * This gives about 1M of write buffer for memory maps on a
+	 * machine with 1G and 10% dirty_ratio, which should be more
+	 * than enough.
+	 *
+	 * Privileged users can raise it by writing to
+	 *
+	 *    /sys/class/bdi/<bdi>/max_ratio
+	 */
+	bdi_set_max_ratio(&fc->bdi, 1);
+
+	return 0;
+}
+
+static int fuse_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct fuse_conn *fc;
+	struct inode *root;
+	struct fuse_mount_data d;
+	struct file *file;
+	struct dentry *root_dentry;
+	struct fuse_req *init_req;
+	int err;
+	int is_bdev = sb->s_bdev != NULL;
+
+	err = -EINVAL;
+	if (sb->s_flags & MS_MANDLOCK)
+		goto err;
+
+	sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
+
+	if (!parse_fuse_opt(data, &d, is_bdev))
+		goto err;
+
+	if (is_bdev) {
+#ifdef CONFIG_BLOCK
+		err = -EINVAL;
+		if (!sb_set_blocksize(sb, d.blksize))
+			goto err;
+#endif
+	} else {
+		sb->s_blocksize = PAGE_CACHE_SIZE;
+		sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	}
+	sb->s_magic = FUSE_SUPER_MAGIC;
+	sb->s_op = &fuse_super_operations;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_time_gran = 1;
+	sb->s_export_op = &fuse_export_operations;
+
+	file = fget(d.fd);
+	err = -EINVAL;
+	if (!file)
+		goto err;
+
+	if ((file->f_op != &fuse_dev_operations) ||
+	    (file->f_cred->user_ns != &init_user_ns))
+		goto err_fput;
+
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	err = -ENOMEM;
+	if (!fc)
+		goto err_fput;
+
+	fuse_conn_init(fc);
+
+	fc->dev = sb->s_dev;
+	fc->sb = sb;
+	err = fuse_bdi_init(fc, sb);
+	if (err)
+		goto err_put_conn;
+
+	sb->s_bdi = &fc->bdi;
+
+	/* Handle umasking inside the fuse code */
+	if (sb->s_flags & MS_POSIXACL)
+		fc->dont_mask = 1;
+	sb->s_flags |= MS_POSIXACL;
+
+	fc->release = fuse_free_conn;
+	fc->flags = d.flags;
+	fc->user_id = d.user_id;
+	fc->group_id = d.group_id;
+	fc->max_read = max_t(unsigned, 4096, d.max_read);
+
+	/* Used by get_root_inode() */
+	sb->s_fs_info = fc;
+
+	err = -ENOMEM;
+	root = fuse_get_root_inode(sb, d.rootmode);
+	root_dentry = d_make_root(root);
+	if (!root_dentry)
+		goto err_put_conn;
+	/* only now - we want root dentry with NULL ->d_op */
+	sb->s_d_op = &fuse_dentry_operations;
+
+	init_req = fuse_request_alloc(0);
+	if (!init_req)
+		goto err_put_root;
+	init_req->background = 1;
+
+	if (is_bdev) {
+		fc->destroy_req = fuse_request_alloc(0);
+		if (!fc->destroy_req)
+			goto err_free_init_req;
+	}
+
+	mutex_lock(&fuse_mutex);
+	err = -EINVAL;
+	if (file->private_data)
+		goto err_unlock;
+
+	err = fuse_ctl_add_conn(fc);
+	if (err)
+		goto err_unlock;
+
+	list_add_tail(&fc->entry, &fuse_conn_list);
+	sb->s_root = root_dentry;
+	fc->connected = 1;
+	file->private_data = fuse_conn_get(fc);
+	mutex_unlock(&fuse_mutex);
+	/*
+	 * atomic_dec_and_test() in fput() provides the necessary
+	 * memory barrier for file->private_data to be visible on all
+	 * CPUs after this
+	 */
+	fput(file);
+
+	fuse_send_init(fc, init_req);
+
+	return 0;
+
+ err_unlock:
+	mutex_unlock(&fuse_mutex);
+ err_free_init_req:
+	fuse_request_free(init_req);
+ err_put_root:
+	dput(root_dentry);
+ err_put_conn:
+	fuse_bdi_destroy(fc);
+	fuse_conn_put(fc);
+ err_fput:
+	fput(file);
+ err:
+	return err;
+}
+
+static struct dentry *fuse_mount(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *raw_data)
+{
+	return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+}
+
+static void fuse_kill_sb_anon(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	if (fc) {
+		down_write(&fc->killsb);
+		fc->sb = NULL;
+		up_write(&fc->killsb);
+	}
+
+	kill_anon_super(sb);
+}
+
+static struct file_system_type fuse_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fuse",
+	.fs_flags	= FS_HAS_SUBTYPE,
+	.mount		= fuse_mount,
+	.kill_sb	= fuse_kill_sb_anon,
+};
+MODULE_ALIAS_FS("fuse");
+
+#ifdef CONFIG_BLOCK
+static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
+			   int flags, const char *dev_name,
+			   void *raw_data)
+{
+	return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
+}
+
+static void fuse_kill_sb_blk(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	if (fc) {
+		down_write(&fc->killsb);
+		fc->sb = NULL;
+		up_write(&fc->killsb);
+	}
+
+	kill_block_super(sb);
+}
+
+static struct file_system_type fuseblk_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fuseblk",
+	.mount		= fuse_mount_blk,
+	.kill_sb	= fuse_kill_sb_blk,
+	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
+};
+MODULE_ALIAS_FS("fuseblk");
+
+static inline int register_fuseblk(void)
+{
+	return register_filesystem(&fuseblk_fs_type);
+}
+
+static inline void unregister_fuseblk(void)
+{
+	unregister_filesystem(&fuseblk_fs_type);
+}
+#else
+static inline int register_fuseblk(void)
+{
+	return 0;
+}
+
+static inline void unregister_fuseblk(void)
+{
+}
+#endif
+
+static void fuse_inode_init_once(void *foo)
+{
+	struct inode *inode = foo;
+
+	inode_init_once(inode);
+}
+
+static int __init fuse_fs_init(void)
+{
+	int err;
+
+	fuse_inode_cachep = kmem_cache_create("fuse_inode",
+					      sizeof(struct fuse_inode),
+					      0, SLAB_HWCACHE_ALIGN,
+					      fuse_inode_init_once);
+	err = -ENOMEM;
+	if (!fuse_inode_cachep)
+		goto out;
+
+	err = register_fuseblk();
+	if (err)
+		goto out2;
+
+	err = register_filesystem(&fuse_fs_type);
+	if (err)
+		goto out3;
+
+	return 0;
+
+ out3:
+	unregister_fuseblk();
+ out2:
+	kmem_cache_destroy(fuse_inode_cachep);
+ out:
+	return err;
+}
+
+static void fuse_fs_cleanup(void)
+{
+	unregister_filesystem(&fuse_fs_type);
+	unregister_fuseblk();
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(fuse_inode_cachep);
+}
+
+static struct kobject *fuse_kobj;
+
+static int fuse_sysfs_init(void)
+{
+	int err;
+
+	fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
+	if (!fuse_kobj) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+
+	err = sysfs_create_mount_point(fuse_kobj, "connections");
+	if (err)
+		goto out_fuse_unregister;
+
+	return 0;
+
+ out_fuse_unregister:
+	kobject_put(fuse_kobj);
+ out_err:
+	return err;
+}
+
+static void fuse_sysfs_cleanup(void)
+{
+	sysfs_remove_mount_point(fuse_kobj, "connections");
+	kobject_put(fuse_kobj);
+}
+
+static int __init fuse_init(void)
+{
+	int res;
+
+	printk(KERN_INFO "fuse init (API version %i.%i)\n",
+	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
+
+	INIT_LIST_HEAD(&fuse_conn_list);
+	res = fuse_fs_init();
+	if (res)
+		goto err;
+
+	res = fuse_dev_init();
+	if (res)
+		goto err_fs_cleanup;
+
+	res = fuse_sysfs_init();
+	if (res)
+		goto err_dev_cleanup;
+
+	res = fuse_ctl_init();
+	if (res)
+		goto err_sysfs_cleanup;
+
+	sanitize_global_limit(&max_user_bgreq);
+	sanitize_global_limit(&max_user_congthresh);
+
+	return 0;
+
+ err_sysfs_cleanup:
+	fuse_sysfs_cleanup();
+ err_dev_cleanup:
+	fuse_dev_cleanup();
+ err_fs_cleanup:
+	fuse_fs_cleanup();
+ err:
+	return res;
+}
+
+static void __exit fuse_exit(void)
+{
+	printk(KERN_DEBUG "fuse exit\n");
+
+	fuse_ctl_cleanup();
+	fuse_sysfs_cleanup();
+	fuse_fs_cleanup();
+	fuse_dev_cleanup();
+}
+
+module_init(fuse_init);
+module_exit(fuse_exit);
diff --git a/kernel/fs/gfs2/Kconfig b/kernel/fs/gfs2/Kconfig
new file mode 100644
index 000000000..90c6a8faa
--- /dev/null
+++ b/kernel/fs/gfs2/Kconfig
@@ -0,0 +1,34 @@
+config GFS2_FS
+	tristate "GFS2 file system support"
+	depends on (64BIT || LBDAF)
+	select FS_POSIX_ACL
+	select CRC32
+	select QUOTACTL
+	help
+	  A cluster filesystem.
+
+	  Allows a cluster of computers to simultaneously use a block device
+	  that is shared between them (with FC, iSCSI, NBD, etc...).  GFS reads
+	  and writes to the block device like a local filesystem, but also uses
+	  a lock module to allow the computers coordinate their I/O so
+	  filesystem consistency is maintained.  One of the nifty features of
+	  GFS is perfect consistency -- changes made to the filesystem on one
+	  machine show up immediately on all other machines in the cluster.
+
+	  To use the GFS2 filesystem in a cluster, you will need to enable
+	  the locking module below. Documentation and utilities for GFS2 can
+	  be found here: http://sources.redhat.com/cluster
+
+	  The "nolock" lock module is now built in to GFS2 by default. If
+	  you want to use the DLM, be sure to enable IPv4/6 networking.
+
+config GFS2_FS_LOCKING_DLM
+	bool "GFS2 DLM locking"
+	depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
+		CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
+	help
+	  Multiple node locking module for GFS2
+
+	  Most users of GFS2 will require this. It provides the locking
+	  interface between GFS2 and the DLM, which is required to use GFS2
+	  in a cluster environment.
diff --git a/kernel/fs/gfs2/Makefile b/kernel/fs/gfs2/Makefile
new file mode 100644
index 000000000..861282023
--- /dev/null
+++ b/kernel/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
+ccflags-y := -I$(src)
+obj-$(CONFIG_GFS2_FS) += gfs2.o
+gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
+	glops.o log.o lops.o main.o meta_io.o \
+	aops.o dentry.o export.o file.o \
+	ops_fstype.o inode.o quota.o \
+	recovery.o rgrp.o super.o sys.o trans.o util.o
+
+gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
+
diff --git a/kernel/fs/gfs2/acl.c b/kernel/fs/gfs2/acl.c
new file mode 100644
index 000000000..1be3b061c
--- /dev/null
+++ b/kernel/fs/gfs2/acl.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/gfs2_ondisk.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "xattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+
+static const char *gfs2_acl_name(int type)
+{
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		return GFS2_POSIX_ACL_ACCESS;
+	case ACL_TYPE_DEFAULT:
+		return GFS2_POSIX_ACL_DEFAULT;
+	}
+	return NULL;
+}
+
+struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct posix_acl *acl;
+	const char *name;
+	char *data;
+	int len;
+
+	if (!ip->i_eattr)
+		return NULL;
+
+	name = gfs2_acl_name(type);
+	if (name == NULL)
+		return ERR_PTR(-EINVAL);
+
+	len = gfs2_xattr_acl_get(ip, name, &data);
+	if (len < 0)
+		return ERR_PTR(len);
+	if (len == 0)
+		return NULL;
+
+	acl = posix_acl_from_xattr(&init_user_ns, data, len);
+	kfree(data);
+	return acl;
+}
+
+int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int error;
+	int len;
+	char *data;
+	const char *name = gfs2_acl_name(type);
+
+	BUG_ON(name == NULL);
+
+	if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
+		return -E2BIG;
+
+	if (type == ACL_TYPE_ACCESS) {
+		umode_t mode = inode->i_mode;
+
+		error = posix_acl_equiv_mode(acl, &mode);
+		if (error < 0)
+			return error;
+
+		if (error == 0)
+			acl = NULL;
+
+		if (mode != inode->i_mode) {
+			inode->i_mode = mode;
+			mark_inode_dirty(inode);
+		}
+	}
+
+	if (acl) {
+		len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
+		if (len == 0)
+			return 0;
+		data = kmalloc(len, GFP_NOFS);
+		if (data == NULL)
+			return -ENOMEM;
+		error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
+		if (error < 0)
+			goto out;
+	} else {
+		data = NULL;
+		len = 0;
+	}
+
+	error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
+	if (error)
+		goto out;
+	set_cached_acl(inode, type, acl);
+out:
+	kfree(data);
+	return error;
+}
diff --git a/kernel/fs/gfs2/acl.h b/kernel/fs/gfs2/acl.h
new file mode 100644
index 000000000..2d65ec4cd
--- /dev/null
+++ b/kernel/fs/gfs2/acl.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __ACL_DOT_H__
+#define __ACL_DOT_H__
+
+#include "incore.h"
+
+#define GFS2_POSIX_ACL_ACCESS		"posix_acl_access"
+#define GFS2_POSIX_ACL_DEFAULT		"posix_acl_default"
+#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
+
+extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
+extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+
+#endif /* __ACL_DOT_H__ */
diff --git a/kernel/fs/gfs2/aops.c b/kernel/fs/gfs2/aops.c
new file mode 100644
index 000000000..5551fea0a
--- /dev/null
+++ b/kernel/fs/gfs2/aops.c
@@ -0,0 +1,1229 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/backing-dev.h>
+#include <linux/uio.h>
+#include <trace/events/writeback.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "trans.h"
+#include "rgrp.h"
+#include "super.h"
+#include "util.h"
+#include "glops.h"
+
+
+static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int to)
+{
+	struct buffer_head *head = page_buffers(page);
+	unsigned int bsize = head->b_size;
+	struct buffer_head *bh;
+	unsigned int start, end;
+
+	for (bh = head, start = 0; bh != head || !start;
+	     bh = bh->b_this_page, start = end) {
+		end = start + bsize;
+		if (end <= from || start >= to)
+			continue;
+		if (gfs2_is_jdata(ip))
+			set_buffer_uptodate(bh);
+		gfs2_trans_add_data(ip->i_gl, bh);
+	}
+}
+
+/**
+ * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
+ * @inode: The inode
+ * @lblock: The block number to look up
+ * @bh_result: The buffer head to return the result in
+ * @create: Non-zero if we may add block to the file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
+				  struct buffer_head *bh_result, int create)
+{
+	int error;
+
+	error = gfs2_block_map(inode, lblock, bh_result, 0);
+	if (error)
+		return error;
+	if (!buffer_mapped(bh_result))
+		return -EIO;
+	return 0;
+}
+
+static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
+				 struct buffer_head *bh_result, int create)
+{
+	return gfs2_block_map(inode, lblock, bh_result, 0);
+}
+
+/**
+ * gfs2_writepage_common - Common bits of writepage
+ * @page: The page to be written
+ * @wbc: The writeback control
+ *
+ * Returns: 1 if writepage is ok, otherwise an error code or zero if no error.
+ */
+
+static int gfs2_writepage_common(struct page *page,
+				 struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+
+	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
+		goto out;
+	if (current->journal_info)
+		goto redirty;
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (page->index > end_index || (page->index == end_index && !offset)) {
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		goto out;
+	}
+	return 1;
+redirty:
+	redirty_page_for_writepage(wbc, page);
+out:
+	unlock_page(page);
+	return 0;
+}
+
+/**
+ * gfs2_writepage - Write page for writeback mappings
+ * @page: The page
+ * @wbc: The writeback control
+ *
+ */
+
+static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret <= 0)
+		return ret;
+
+	return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
+}
+
+/**
+ * __gfs2_jdata_writepage - The core of jdata writepage
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ * This is shared between writepage and writepages and implements the
+ * core of the writepage operation. If a transaction is required then
+ * PageChecked will have been set and the transaction will have
+ * already been started before this is called.
+ */
+
+static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+	if (PageChecked(page)) {
+		ClearPageChecked(page);
+		if (!page_has_buffers(page)) {
+			create_empty_buffers(page, inode->i_sb->s_blocksize,
+					     (1 << BH_Dirty)|(1 << BH_Uptodate));
+		}
+		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
+	}
+	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
+
+/**
+ * gfs2_jdata_writepage - Write complete page
+ * @page: Page to write
+ *
+ * Returns: errno
+ *
+ */
+
+static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	int ret;
+	int done_trans = 0;
+
+	if (PageChecked(page)) {
+		if (wbc->sync_mode != WB_SYNC_ALL)
+			goto out_ignore;
+		ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+		if (ret)
+			goto out_ignore;
+		done_trans = 1;
+	}
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret > 0)
+		ret = __gfs2_jdata_writepage(page, wbc);
+	if (done_trans)
+		gfs2_trans_end(sdp);
+	return ret;
+
+out_ignore:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return 0;
+}
+
+/**
+ * gfs2_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: Write-back control
+ *
+ * Used for both ordered and writeback modes.
+ */
+static int gfs2_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+}
+
+/**
+ * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
+ * @mapping: The mapping
+ * @wbc: The writeback control
+ * @writepage: The writepage function to call for each page
+ * @pvec: The vector of pages
+ * @nr_pages: The number of pages to write
+ *
+ * Returns: non-zero if loop should terminate, zero otherwise
+ */
+
+static int gfs2_write_jdata_pagevec(struct address_space *mapping,
+				    struct writeback_control *wbc,
+				    struct pagevec *pvec,
+				    int nr_pages, pgoff_t end,
+				    pgoff_t *done_index)
+{
+	struct inode *inode = mapping->host;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
+	int i;
+	int ret;
+
+	ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
+	if (ret < 0)
+		return ret;
+
+	for(i = 0; i < nr_pages; i++) {
+		struct page *page = pvec->pages[i];
+
+		/*
+		 * At this point, the page may be truncated or
+		 * invalidated (changing page->mapping to NULL), or
+		 * even swizzled back from swapper_space to tmpfs file
+		 * mapping. However, page->index will not change
+		 * because we have a reference on the page.
+		 */
+		if (page->index > end) {
+			/*
+			 * can't be range_cyclic (1st pass) because
+			 * end == -1 in that case.
+			 */
+			ret = 1;
+			break;
+		}
+
+		*done_index = page->index;
+
+		lock_page(page);
+
+		if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+			unlock_page(page);
+			continue;
+		}
+
+		if (!PageDirty(page)) {
+			/* someone wrote it for us */
+			goto continue_unlock;
+		}
+
+		if (PageWriteback(page)) {
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+			else
+				goto continue_unlock;
+		}
+
+		BUG_ON(PageWriteback(page));
+		if (!clear_page_dirty_for_io(page))
+			goto continue_unlock;
+
+		trace_wbc_writepage(wbc, inode_to_bdi(inode));
+
+		ret = __gfs2_jdata_writepage(page, wbc);
+		if (unlikely(ret)) {
+			if (ret == AOP_WRITEPAGE_ACTIVATE) {
+				unlock_page(page);
+				ret = 0;
+			} else {
+
+				/*
+				 * done_index is set past this page,
+				 * so media errors will not choke
+				 * background writeout for the entire
+				 * file. This has consequences for
+				 * range_cyclic semantics (ie. it may
+				 * not be suitable for data integrity
+				 * writeout).
+				 */
+				*done_index = page->index + 1;
+				ret = 1;
+				break;
+			}
+		}
+
+		/*
+		 * We stop writing back only if we are not doing
+		 * integrity sync. In case of integrity sync we have to
+		 * keep going until we have written all the pages
+		 * we tagged for writeback prior to entering this loop.
+		 */
+		if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
+			ret = 1;
+			break;
+		}
+
+	}
+	gfs2_trans_end(sdp);
+	return ret;
+}
+
+/**
+ * gfs2_write_cache_jdata - Like write_cache_pages but different
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ * @writepage: The writepage function to call
+ * @data: The data to pass to writepage
+ *
+ * The reason that we use our own function here is that we need to
+ * start transactions before we grab page locks. This allows us
+ * to get the ordering right.
+ */
+
+static int gfs2_write_cache_jdata(struct address_space *mapping,
+				  struct writeback_control *wbc)
+{
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t uninitialized_var(writeback_index);
+	pgoff_t index;
+	pgoff_t end;
+	pgoff_t done_index;
+	int cycled;
+	int range_whole = 0;
+	int tag;
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		writeback_index = mapping->writeback_index; /* prev offset */
+		index = writeback_index;
+		if (index == 0)
+			cycled = 1;
+		else
+			cycled = 0;
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		cycled = 1; /* ignore range_cyclic tests */
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, index, end);
+	done_index = index;
+	while (!done && (index <= end)) {
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index);
+		if (ret)
+			done = 1;
+		if (ret > 0)
+			ret = 0;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (!cycled && !done) {
+		/*
+		 * range_cyclic:
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		cycled = 1;
+		index = 0;
+		end = writeback_index - 1;
+		goto retry;
+	}
+
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = done_index;
+
+	return ret;
+}
+
+
+/**
+ * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ * 
+ */
+
+static int gfs2_jdata_writepages(struct address_space *mapping,
+				 struct writeback_control *wbc)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+	int ret;
+
+	ret = gfs2_write_cache_jdata(mapping, wbc);
+	if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
+		gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+		ret = gfs2_write_cache_jdata(mapping, wbc);
+	}
+	return ret;
+}
+
+/**
+ * stuffed_readpage - Fill in a Linux page with stuffed file data
+ * @ip: the inode
+ * @page: the page
+ *
+ * Returns: errno
+ */
+
+static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+{
+	struct buffer_head *dibh;
+	u64 dsize = i_size_read(&ip->i_inode);
+	void *kaddr;
+	int error;
+
+	/*
+	 * Due to the order of unstuffing files and ->fault(), we can be
+	 * asked for a zero page in the case of a stuffed file being extended,
+	 * so we need to supply one here. It doesn't happen often.
+	 */
+	if (unlikely(page->index)) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+		return 0;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	kaddr = kmap_atomic(page);
+	if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+		dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
+	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+	memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
+	kunmap_atomic(kaddr);
+	flush_dcache_page(page);
+	brelse(dibh);
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+
+/**
+ * __gfs2_readpage - readpage
+ * @file: The file to read a page for
+ * @page: The page to read
+ *
+ * This is the core of gfs2's readpage. Its used by the internal file
+ * reading code as in that case we already hold the glock. Also its
+ * called by gfs2_readpage() once the required lock has been granted.
+ *
+ */
+
+static int __gfs2_readpage(void *file, struct page *page)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	int error;
+
+	if (gfs2_is_stuffed(ip)) {
+		error = stuffed_readpage(ip, page);
+		unlock_page(page);
+	} else {
+		error = mpage_readpage(page, gfs2_block_map);
+	}
+
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	return error;
+}
+
+/**
+ * gfs2_readpage - read a page of a file
+ * @file: The file to read
+ * @page: The page of the file
+ *
+ * This deals with the locking required. We have to unlock and
+ * relock the page in order to get the locking in the right
+ * order.
+ */
+
+static int gfs2_readpage(struct file *file, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_holder gh;
+	int error;
+
+	unlock_page(page);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	error = gfs2_glock_nq(&gh);
+	if (unlikely(error))
+		goto out;
+	error = AOP_TRUNCATED_PAGE;
+	lock_page(page);
+	if (page->mapping == mapping && !PageUptodate(page))
+		error = __gfs2_readpage(file, page);
+	else
+		unlock_page(page);
+	gfs2_glock_dq(&gh);
+out:
+	gfs2_holder_uninit(&gh);
+	if (error && error != AOP_TRUNCATED_PAGE)
+		lock_page(page);
+	return error;
+}
+
+/**
+ * gfs2_internal_read - read an internal file
+ * @ip: The gfs2 inode
+ * @buf: The buffer to fill
+ * @pos: The file position
+ * @size: The amount to read
+ *
+ */
+
+int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
+                       unsigned size)
+{
+	struct address_space *mapping = ip->i_inode.i_mapping;
+	unsigned long index = *pos / PAGE_CACHE_SIZE;
+	unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
+	unsigned copied = 0;
+	unsigned amt;
+	struct page *page;
+	void *p;
+
+	do {
+		amt = size - copied;
+		if (offset + size > PAGE_CACHE_SIZE)
+			amt = PAGE_CACHE_SIZE - offset;
+		page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		p = kmap_atomic(page);
+		memcpy(buf + copied, p + offset, amt);
+		kunmap_atomic(p);
+		page_cache_release(page);
+		copied += amt;
+		index++;
+		offset = 0;
+	} while(copied < size);
+	(*pos) += size;
+	return size;
+}
+
+/**
+ * gfs2_readpages - Read a bunch of pages at once
+ *
+ * Some notes:
+ * 1. This is only for readahead, so we can simply ignore any things
+ *    which are slightly inconvenient (such as locking conflicts between
+ *    the page lock and the glock) and return having done no I/O. Its
+ *    obviously not something we'd want to do on too regular a basis.
+ *    Any I/O we ignore at this time will be done via readpage later.
+ * 2. We don't handle stuffed files here we let readpage do the honours.
+ * 3. mpage_readpages() does most of the heavy lifting in the common case.
+ * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
+ */
+
+static int gfs2_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (unlikely(ret))
+		goto out_uninit;
+	if (!gfs2_is_stuffed(ip))
+		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		ret = -EIO;
+	return ret;
+}
+
+/**
+ * gfs2_write_begin - Begin to write to a file
+ * @file: The file to write to
+ * @mapping: The mapping in which to write
+ * @pos: The file offset at which to start writing
+ * @len: Length of the write
+ * @flags: Various flags
+ * @pagep: Pointer to return the page
+ * @fsdata: Pointer to return fs data (unused by GFS2)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	unsigned requested = 0;
+	int alloc_required;
+	int error = 0;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	struct page *page;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (unlikely(error))
+		goto out_uninit;
+	if (&ip->i_inode == sdp->sd_rindex) {
+		error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, &m_ip->i_gh);
+		if (unlikely(error)) {
+			gfs2_glock_dq(&ip->i_gh);
+			goto out_uninit;
+		}
+	}
+
+	alloc_required = gfs2_write_alloc_required(ip, pos, len);
+
+	if (alloc_required || gfs2_is_jdata(ip))
+		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
+
+	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .aflags = 0, };
+		requested = data_blocks + ind_blocks;
+		ap.target = requested;
+		error = gfs2_quota_lock_check(ip, &ap);
+		if (error)
+			goto out_unlock;
+
+		error = gfs2_inplace_reserve(ip, &ap);
+		if (error)
+			goto out_qunlock;
+	}
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks ? data_blocks : 1;
+	if (ind_blocks || data_blocks)
+		rblocks += RES_STATFS + RES_QUOTA;
+	if (&ip->i_inode == sdp->sd_rindex)
+		rblocks += 2 * RES_STATFS;
+	if (alloc_required)
+		rblocks += gfs2_rg_blocks(ip, requested);
+
+	error = gfs2_trans_begin(sdp, rblocks,
+				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+	if (error)
+		goto out_trans_fail;
+
+	error = -ENOMEM;
+	flags |= AOP_FLAG_NOFS;
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	*pagep = page;
+	if (unlikely(!page))
+		goto out_endtrans;
+
+	if (gfs2_is_stuffed(ip)) {
+		error = 0;
+		if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+			error = gfs2_unstuff_dinode(ip, page);
+			if (error == 0)
+				goto prepare_write;
+		} else if (!PageUptodate(page)) {
+			error = stuffed_readpage(ip, page);
+		}
+		goto out;
+	}
+
+prepare_write:
+	error = __block_write_begin(page, from, len, gfs2_block_map);
+out:
+	if (error == 0)
+		return 0;
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	gfs2_trans_end(sdp);
+	if (pos + len > ip->i_inode.i_size)
+		gfs2_trim_blocks(&ip->i_inode);
+	goto out_trans_fail;
+
+out_endtrans:
+	gfs2_trans_end(sdp);
+out_trans_fail:
+	if (alloc_required) {
+		gfs2_inplace_release(ip);
+out_qunlock:
+		gfs2_quota_unlock(ip);
+	}
+out_unlock:
+	if (&ip->i_inode == sdp->sd_rindex) {
+		gfs2_glock_dq(&m_ip->i_gh);
+		gfs2_holder_uninit(&m_ip->i_gh);
+	}
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+/**
+ * adjust_fs_space - Adjusts the free space available due to gfs2_grow
+ * @inode: the rindex inode
+ */
+static void adjust_fs_space(struct inode *inode)
+{
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	struct buffer_head *m_bh, *l_bh;
+	u64 fs_total, new_free;
+
+	/* Total up the file system space, according to the latest rindex. */
+	fs_total = gfs2_ri_total(sdp);
+	if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
+		return;
+
+	spin_lock(&sdp->sd_statfs_spin);
+	gfs2_statfs_change_in(m_sc, m_bh->b_data +
+			      sizeof(struct gfs2_dinode));
+	if (fs_total > (m_sc->sc_total + l_sc->sc_total))
+		new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
+	else
+		new_free = 0;
+	spin_unlock(&sdp->sd_statfs_spin);
+	fs_warn(sdp, "File system extended by %llu blocks.\n",
+		(unsigned long long)new_free);
+	gfs2_statfs_change(sdp, new_free, new_free, 0);
+
+	if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
+		goto out;
+	update_statfs(sdp, m_bh, l_bh);
+	brelse(l_bh);
+out:
+	brelse(m_bh);
+}
+
+/**
+ * gfs2_stuffed_write_end - Write end for stuffed files
+ * @inode: The inode
+ * @dibh: The buffer_head containing the on-disk inode
+ * @pos: The file position
+ * @len: The length of the write
+ * @copied: How much was actually copied by the VFS
+ * @page: The page
+ *
+ * This copies the data from the page into the inode block after
+ * the inode data structure itself.
+ *
+ * Returns: errno
+ */
+static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
+				  loff_t pos, unsigned len, unsigned copied,
+				  struct page *page)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	u64 to = pos + copied;
+	void *kaddr;
+	unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
+
+	BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
+	kaddr = kmap_atomic(page);
+	memcpy(buf + pos, kaddr + pos, copied);
+	memset(kaddr + pos + copied, 0, len - copied);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (copied) {
+		if (inode->i_size < to)
+			i_size_write(inode, to);
+		mark_inode_dirty(inode);
+	}
+
+	if (inode == sdp->sd_rindex) {
+		adjust_fs_space(inode);
+		sdp->sd_rindex_uptodate = 0;
+	}
+
+	brelse(dibh);
+	gfs2_trans_end(sdp);
+	if (inode == sdp->sd_rindex) {
+		gfs2_glock_dq(&m_ip->i_gh);
+		gfs2_holder_uninit(&m_ip->i_gh);
+	}
+	gfs2_glock_dq(&ip->i_gh);
+	gfs2_holder_uninit(&ip->i_gh);
+	return copied;
+}
+
+/**
+ * gfs2_write_end
+ * @file: The file to write to
+ * @mapping: The address space to write to
+ * @pos: The file position
+ * @len: The length of the data
+ * @copied:
+ * @page: The page that has been written
+ * @fsdata: The fsdata (unused in GFS2)
+ *
+ * The main write_end function for GFS2. We have a separate one for
+ * stuffed files as they are slightly different, otherwise we just
+ * put our locking around the VFS provided functions.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_end(struct file *file, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct buffer_head *dibh;
+	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int to = from + len;
+	int ret;
+	struct gfs2_trans *tr = current->journal_info;
+	BUG_ON(!tr);
+
+	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
+
+	ret = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(ret)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto failed;
+	}
+
+	if (gfs2_is_stuffed(ip))
+		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_page_add_databufs(ip, page, from, to);
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (tr->tr_num_buf_new)
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	else
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+
+
+	if (inode == sdp->sd_rindex) {
+		adjust_fs_space(inode);
+		sdp->sd_rindex_uptodate = 0;
+	}
+
+	brelse(dibh);
+failed:
+	gfs2_trans_end(sdp);
+	gfs2_inplace_release(ip);
+	if (ip->i_res->rs_qa_qd_num)
+		gfs2_quota_unlock(ip);
+	if (inode == sdp->sd_rindex) {
+		gfs2_glock_dq(&m_ip->i_gh);
+		gfs2_holder_uninit(&m_ip->i_gh);
+	}
+	gfs2_glock_dq(&ip->i_gh);
+	gfs2_holder_uninit(&ip->i_gh);
+	return ret;
+}
+
+/**
+ * gfs2_set_page_dirty - Page dirtying function
+ * @page: The page to dirty
+ *
+ * Returns: 1 if it dirtyed the page, or 0 otherwise
+ */
+ 
+static int gfs2_set_page_dirty(struct page *page)
+{
+	SetPageChecked(page);
+	return __set_page_dirty_buffers(page);
+}
+
+/**
+ * gfs2_bmap - Block map function
+ * @mapping: Address space info
+ * @lblock: The block to map
+ *
+ * Returns: The disk address for the block or 0 on hole or error
+ */
+
+static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_holder i_gh;
+	sector_t dblock = 0;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return 0;
+
+	if (!gfs2_is_stuffed(ip))
+		dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return dblock;
+}
+
+static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	struct gfs2_bufdata *bd;
+
+	lock_buffer(bh);
+	gfs2_log_lock(sdp);
+	clear_buffer_dirty(bh);
+	bd = bh->b_private;
+	if (bd) {
+		if (!list_empty(&bd->bd_list) && !buffer_pinned(bh))
+			list_del_init(&bd->bd_list);
+		else
+			gfs2_remove_from_journal(bh, current->journal_info, 0);
+	}
+	bh->b_bdev = NULL;
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
+}
+
+static void gfs2_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
+	struct buffer_head *bh, *head;
+	unsigned long pos = 0;
+
+	BUG_ON(!PageLocked(page));
+	if (!partial_page)
+		ClearPageChecked(page);
+	if (!page_has_buffers(page))
+		goto out;
+
+	bh = head = page_buffers(page);
+	do {
+		if (pos + bh->b_size > stop)
+			return;
+
+		if (offset <= pos)
+			gfs2_discard(sdp, bh);
+		pos += bh->b_size;
+		bh = bh->b_this_page;
+	} while (bh != head);
+out:
+	if (!partial_page)
+		try_to_release_page(page, 0);
+}
+
+/**
+ * gfs2_ok_for_dio - check that dio is valid on this file
+ * @ip: The inode
+ * @offset: The offset at which we are reading or writing
+ *
+ * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
+ *          1 (to accept the i/o request)
+ */
+static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset)
+{
+	/*
+	 * Should we return an error here? I can't see that O_DIRECT for
+	 * a stuffed file makes any sense. For now we'll silently fall
+	 * back to buffered I/O
+	 */
+	if (gfs2_is_stuffed(ip))
+		return 0;
+
+	if (offset >= i_size_read(&ip->i_inode))
+		return 0;
+	return 1;
+}
+
+
+
+static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			      loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = inode->i_mapping;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int rv;
+
+	/*
+	 * Deferred lock, even if its a write, since we do no allocation
+	 * on this path. All we need change is atime, and this lock mode
+	 * ensures that other nodes have flushed their buffered read caches
+	 * (i.e. their page cache entries for this inode). We do not,
+	 * unfortunately have the option of only flushing a range like
+	 * the VFS does.
+	 */
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+	rv = gfs2_glock_nq(&gh);
+	if (rv)
+		return rv;
+	rv = gfs2_ok_for_dio(ip, offset);
+	if (rv != 1)
+		goto out; /* dio not valid, fall back to buffered i/o */
+
+	/*
+	 * Now since we are holding a deferred (CW) lock at this point, you
+	 * might be wondering why this is ever needed. There is a case however
+	 * where we've granted a deferred local lock against a cached exclusive
+	 * glock. That is ok provided all granted local locks are deferred, but
+	 * it also means that it is possible to encounter pages which are
+	 * cached and possibly also mapped. So here we check for that and sort
+	 * them out ahead of the dio. The glock state machine will take care of
+	 * everything else.
+	 *
+	 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
+	 * the first place, mapping->nr_pages will always be zero.
+	 */
+	if (mapping->nrpages) {
+		loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
+		loff_t len = iov_iter_count(iter);
+		loff_t end = PAGE_ALIGN(offset + len) - 1;
+
+		rv = 0;
+		if (len == 0)
+			goto out;
+		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+			unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
+		rv = filemap_write_and_wait_range(mapping, lstart, end);
+		if (rv)
+			goto out;
+		if (iov_iter_rw(iter) == WRITE)
+			truncate_inode_pages_range(mapping, lstart, end);
+	}
+
+	rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
+				  offset, gfs2_get_block_direct, NULL, NULL, 0);
+out:
+	gfs2_glock_dq(&gh);
+	gfs2_holder_uninit(&gh);
+	return rv;
+}
+
+/**
+ * gfs2_releasepage - free the metadata associated with a page
+ * @page: the page that's being released
+ * @gfp_mask: passed from Linux VFS, ignored by us
+ *
+ * Call try_to_free_buffers() if the buffers in this page can be
+ * released.
+ *
+ * Returns: 0
+ */
+
+int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	struct address_space *mapping = page->mapping;
+	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
+	struct buffer_head *bh, *head;
+	struct gfs2_bufdata *bd;
+
+	if (!page_has_buffers(page))
+		return 0;
+
+	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	head = bh = page_buffers(page);
+	do {
+		if (atomic_read(&bh->b_count))
+			goto cannot_release;
+		bd = bh->b_private;
+		if (bd && bd->bd_tr)
+			goto cannot_release;
+		if (buffer_pinned(bh) || buffer_dirty(bh))
+			goto not_possible;
+		bh = bh->b_this_page;
+	} while(bh != head);
+	spin_unlock(&sdp->sd_ail_lock);
+
+	head = bh = page_buffers(page);
+	do {
+		bd = bh->b_private;
+		if (bd) {
+			gfs2_assert_warn(sdp, bd->bd_bh == bh);
+			if (!list_empty(&bd->bd_list))
+				list_del_init(&bd->bd_list);
+			bd->bd_bh = NULL;
+			bh->b_private = NULL;
+			kmem_cache_free(gfs2_bufdata_cachep, bd);
+		}
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+	gfs2_log_unlock(sdp);
+
+	return try_to_free_buffers(page);
+
+not_possible: /* Should never happen */
+	WARN_ON(buffer_dirty(bh));
+	WARN_ON(buffer_pinned(bh));
+cannot_release:
+	spin_unlock(&sdp->sd_ail_lock);
+	gfs2_log_unlock(sdp);
+	return 0;
+}
+
+static const struct address_space_operations gfs2_writeback_aops = {
+	.writepage = gfs2_writepage,
+	.writepages = gfs2_writepages,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.write_begin = gfs2_write_begin,
+	.write_end = gfs2_write_end,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+	.direct_IO = gfs2_direct_IO,
+	.migratepage = buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
+};
+
+static const struct address_space_operations gfs2_ordered_aops = {
+	.writepage = gfs2_writepage,
+	.writepages = gfs2_writepages,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.write_begin = gfs2_write_begin,
+	.write_end = gfs2_write_end,
+	.set_page_dirty = gfs2_set_page_dirty,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+	.direct_IO = gfs2_direct_IO,
+	.migratepage = buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
+};
+
+static const struct address_space_operations gfs2_jdata_aops = {
+	.writepage = gfs2_jdata_writepage,
+	.writepages = gfs2_jdata_writepages,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.write_begin = gfs2_write_begin,
+	.write_end = gfs2_write_end,
+	.set_page_dirty = gfs2_set_page_dirty,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+	.is_partially_uptodate = block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
+};
+
+void gfs2_set_aops(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (gfs2_is_writeback(ip))
+		inode->i_mapping->a_ops = &gfs2_writeback_aops;
+	else if (gfs2_is_ordered(ip))
+		inode->i_mapping->a_ops = &gfs2_ordered_aops;
+	else if (gfs2_is_jdata(ip))
+		inode->i_mapping->a_ops = &gfs2_jdata_aops;
+	else
+		BUG();
+}
+
diff --git a/kernel/fs/gfs2/bmap.c b/kernel/fs/gfs2/bmap.c
new file mode 100644
index 000000000..61296ecbd
--- /dev/null
+++ b/kernel/fs/gfs2/bmap.c
@@ -0,0 +1,1495 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "log.h"
+#include "super.h"
+#include "trans.h"
+#include "dir.h"
+#include "util.h"
+#include "trace_gfs2.h"
+
+/* This doesn't need to be that large as max 64 bit pointers in a 4k
+ * block is 512, so __u16 is fine for that. It saves stack space to
+ * keep it small.
+ */
+struct metapath {
+	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
+	__u16 mp_list[GFS2_MAX_META_HEIGHT];
+};
+
+struct strip_mine {
+	int sm_first;
+	unsigned int sm_height;
+};
+
+/**
+ * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @block: the block number that was allocated
+ * @page: The (optional) page. This is looked up if @page is NULL
+ *
+ * Returns: errno
+ */
+
+static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
+			       u64 block, struct page *page)
+{
+	struct inode *inode = &ip->i_inode;
+	struct buffer_head *bh;
+	int release = 0;
+
+	if (!page || page->index) {
+		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+		if (!page)
+			return -ENOMEM;
+		release = 1;
+	}
+
+	if (!PageUptodate(page)) {
+		void *kaddr = kmap(page);
+		u64 dsize = i_size_read(inode);
+ 
+		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
+
+		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+		memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
+		kunmap(page);
+
+		SetPageUptodate(page);
+	}
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << inode->i_blkbits,
+				     (1 << BH_Uptodate));
+
+	bh = page_buffers(page);
+
+	if (!buffer_mapped(bh))
+		map_bh(bh, inode->i_sb, block);
+
+	set_buffer_uptodate(bh);
+	if (!gfs2_is_jdata(ip))
+		mark_buffer_dirty(bh);
+	if (!gfs2_is_writeback(ip))
+		gfs2_trans_add_data(ip->i_gl, bh);
+
+	if (release) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
+ * @ip: The GFS2 inode to unstuff
+ * @page: The (optional) page. This is looked up if the @page is NULL
+ *
+ * This routine unstuffs a dinode and returns it to a "normal" state such
+ * that the height can be grown in the traditional way.
+ *
+ * Returns: errno
+ */
+
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
+{
+	struct buffer_head *bh, *dibh;
+	struct gfs2_dinode *di;
+	u64 block = 0;
+	int isdir = gfs2_is_dir(ip);
+	int error;
+
+	down_write(&ip->i_rw_mutex);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	if (i_size_read(&ip->i_inode)) {
+		/* Get a free block, fill it with the stuffed data,
+		   and write it out to disk */
+
+		unsigned int n = 1;
+		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+		if (error)
+			goto out_brelse;
+		if (isdir) {
+			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
+			error = gfs2_dir_get_new_buffer(ip, block, &bh);
+			if (error)
+				goto out_brelse;
+			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
+					      dibh, sizeof(struct gfs2_dinode));
+			brelse(bh);
+		} else {
+			error = gfs2_unstuffer_page(ip, dibh, block, page);
+			if (error)
+				goto out_brelse;
+		}
+	}
+
+	/*  Set up the pointer to the new block  */
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	di = (struct gfs2_dinode *)dibh->b_data;
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+	if (i_size_read(&ip->i_inode)) {
+		*(__be64 *)(di + 1) = cpu_to_be64(block);
+		gfs2_add_inode_blocks(&ip->i_inode, 1);
+		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
+	}
+
+	ip->i_height = 1;
+	di->di_height = cpu_to_be16(1);
+
+out_brelse:
+	brelse(dibh);
+out:
+	up_write(&ip->i_rw_mutex);
+	return error;
+}
+
+
+/**
+ * find_metapath - Find path through the metadata tree
+ * @sdp: The superblock
+ * @mp: The metapath to return the result in
+ * @block: The disk block to look up
+ * @height: The pre-calculated height of the metadata tree
+ *
+ *   This routine returns a struct metapath structure that defines a path
+ *   through the metadata of inode "ip" to get to block "block".
+ *
+ *   Example:
+ *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
+ *   filesystem with a blocksize of 4096.
+ *
+ *   find_metapath() would return a struct metapath structure set to:
+ *   mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
+ *   and mp_list[2] = 165.
+ *
+ *   That means that in order to get to the block containing the byte at
+ *   offset 101342453, we would load the indirect block pointed to by pointer
+ *   0 in the dinode.  We would then load the indirect block pointed to by
+ *   pointer 48 in that indirect block.  We would then load the data block
+ *   pointed to by pointer 165 in that indirect block.
+ *
+ *             ----------------------------------------
+ *             | Dinode |                             |
+ *             |        |                            4|
+ *             |        |0 1 2 3 4 5                 9|
+ *             |        |                            6|
+ *             ----------------------------------------
+ *                       |
+ *                       |
+ *                       V
+ *             ----------------------------------------
+ *             | Indirect Block                       |
+ *             |                                     5|
+ *             |            4 4 4 4 4 5 5            1|
+ *             |0           5 6 7 8 9 0 1            2|
+ *             ----------------------------------------
+ *                                |
+ *                                |
+ *                                V
+ *             ----------------------------------------
+ *             | Indirect Block                       |
+ *             |                         1 1 1 1 1   5|
+ *             |                         6 6 6 6 6   1|
+ *             |0                        3 4 5 6 7   2|
+ *             ----------------------------------------
+ *                                           |
+ *                                           |
+ *                                           V
+ *             ----------------------------------------
+ *             | Data block containing offset         |
+ *             |            101342453                 |
+ *             |                                      |
+ *             |                                      |
+ *             ----------------------------------------
+ *
+ */
+
+static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
+			  struct metapath *mp, unsigned int height)
+{
+	unsigned int i;
+
+	for (i = height; i--;)
+		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
+
+}
+
+static inline unsigned int metapath_branch_start(const struct metapath *mp)
+{
+	if (mp->mp_list[0] == 0)
+		return 2;
+	return 1;
+}
+
+/**
+ * metapointer - Return pointer to start of metadata in a buffer
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ *
+ * Return a pointer to the block number of the next height of the metadata
+ * tree given a buffer containing the pointer to the current height of the
+ * metadata tree.
+ */
+
+static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
+{
+	struct buffer_head *bh = mp->mp_bh[height];
+	unsigned int head_size = (height > 0) ?
+		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
+	return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+}
+
+static void gfs2_metapath_ra(struct gfs2_glock *gl,
+			     const struct buffer_head *bh, const __be64 *pos)
+{
+	struct buffer_head *rabh;
+	const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
+	const __be64 *t;
+
+	for (t = pos; t < endp; t++) {
+		if (!*t)
+			continue;
+
+		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
+		if (trylock_buffer(rabh)) {
+			if (!buffer_uptodate(rabh)) {
+				rabh->b_end_io = end_buffer_read_sync;
+				submit_bh(READA | REQ_META, rabh);
+				continue;
+			}
+			unlock_buffer(rabh);
+		}
+		brelse(rabh);
+	}
+}
+
+/**
+ * lookup_metapath - Walk the metadata tree to a specific point
+ * @ip: The inode
+ * @mp: The metapath
+ *
+ * Assumes that the inode's buffer has already been looked up and
+ * hooked onto mp->mp_bh[0] and that the metapath has been initialised
+ * by find_metapath().
+ *
+ * If this function encounters part of the tree which has not been
+ * allocated, it returns the current height of the tree at the point
+ * at which it found the unallocated block. Blocks which are found are
+ * added to the mp->mp_bh[] list.
+ *
+ * Returns: error or height of metadata tree
+ */
+
+static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
+{
+	unsigned int end_of_metadata = ip->i_height - 1;
+	unsigned int x;
+	__be64 *ptr;
+	u64 dblock;
+	int ret;
+
+	for (x = 0; x < end_of_metadata; x++) {
+		ptr = metapointer(x, mp);
+		dblock = be64_to_cpu(*ptr);
+		if (!dblock)
+			return x + 1;
+
+		ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
+		if (ret)
+			return ret;
+	}
+
+	return ip->i_height;
+}
+
+static inline void release_metapath(struct metapath *mp)
+{
+	int i;
+
+	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
+		if (mp->mp_bh[i] == NULL)
+			break;
+		brelse(mp->mp_bh[i]);
+	}
+}
+
+/**
+ * gfs2_extent_length - Returns length of an extent of blocks
+ * @start: Start of the buffer
+ * @len: Length of the buffer in bytes
+ * @ptr: Current position in the buffer
+ * @limit: Max extent length to return (0 = unlimited)
+ * @eob: Set to 1 if we hit "end of block"
+ *
+ * If the first block is zero (unallocated) it will return the number of
+ * unallocated blocks in the extent, otherwise it will return the number
+ * of contiguous blocks in the extent.
+ *
+ * Returns: The length of the extent (minimum of one block)
+ */
+
+static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
+{
+	const __be64 *end = (start + len);
+	const __be64 *first = ptr;
+	u64 d = be64_to_cpu(*ptr);
+
+	*eob = 0;
+	do {
+		ptr++;
+		if (ptr >= end)
+			break;
+		if (limit && --limit == 0)
+			break;
+		if (d)
+			d++;
+	} while(be64_to_cpu(*ptr) == d);
+	if (ptr >= end)
+		*eob = 1;
+	return (ptr - first);
+}
+
+static inline void bmap_lock(struct gfs2_inode *ip, int create)
+{
+	if (create)
+		down_write(&ip->i_rw_mutex);
+	else
+		down_read(&ip->i_rw_mutex);
+}
+
+static inline void bmap_unlock(struct gfs2_inode *ip, int create)
+{
+	if (create)
+		up_write(&ip->i_rw_mutex);
+	else
+		up_read(&ip->i_rw_mutex);
+}
+
+static inline __be64 *gfs2_indirect_init(struct metapath *mp,
+					 struct gfs2_glock *gl, unsigned int i,
+					 unsigned offset, u64 bn)
+{
+	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
+		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
+				 sizeof(struct gfs2_dinode)));
+	BUG_ON(i < 1);
+	BUG_ON(mp->mp_bh[i] != NULL);
+	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
+	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
+	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
+	ptr += offset;
+	*ptr = cpu_to_be64(bn);
+	return ptr;
+}
+
+enum alloc_state {
+	ALLOC_DATA = 0,
+	ALLOC_GROW_DEPTH = 1,
+	ALLOC_GROW_HEIGHT = 2,
+	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
+};
+
+/**
+ * gfs2_bmap_alloc - Build a metadata tree of the requested height
+ * @inode: The GFS2 inode
+ * @lblock: The logical starting block of the extent
+ * @bh_map: This is used to return the mapping details
+ * @mp: The metapath
+ * @sheight: The starting height (i.e. whats already mapped)
+ * @height: The height to build to
+ * @maxlen: The max number of data blocks to alloc
+ *
+ * In this routine we may have to alloc:
+ *   i) Indirect blocks to grow the metadata tree height
+ *  ii) Indirect blocks to fill in lower part of the metadata tree
+ * iii) Data blocks
+ *
+ * The function is in two parts. The first part works out the total
+ * number of blocks which we need. The second part does the actual
+ * allocation asking for an extent at a time (if enough contiguous free
+ * blocks are available, there will only be one request per bmap call)
+ * and uses the state machine to initialise the blocks in order.
+ *
+ * Returns: errno on error
+ */
+
+static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
+			   struct buffer_head *bh_map, struct metapath *mp,
+			   const unsigned int sheight,
+			   const unsigned int height,
+			   const size_t maxlen)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct super_block *sb = sdp->sd_vfs;
+	struct buffer_head *dibh = mp->mp_bh[0];
+	u64 bn, dblock = 0;
+	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
+	unsigned dblks = 0;
+	unsigned ptrs_per_blk;
+	const unsigned end_of_metadata = height - 1;
+	int ret;
+	int eob = 0;
+	enum alloc_state state;
+	__be64 *ptr;
+	__be64 zero_bn = 0;
+
+	BUG_ON(sheight < 1);
+	BUG_ON(dibh == NULL);
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+
+	if (height == sheight) {
+		struct buffer_head *bh;
+		/* Bottom indirect block exists, find unalloced extent size */
+		ptr = metapointer(end_of_metadata, mp);
+		bh = mp->mp_bh[end_of_metadata];
+		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
+					   &eob);
+		BUG_ON(dblks < 1);
+		state = ALLOC_DATA;
+	} else {
+		/* Need to allocate indirect blocks */
+		ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
+		dblks = min(maxlen, (size_t)(ptrs_per_blk -
+					     mp->mp_list[end_of_metadata]));
+		if (height == ip->i_height) {
+			/* Writing into existing tree, extend tree down */
+			iblks = height - sheight;
+			state = ALLOC_GROW_DEPTH;
+		} else {
+			/* Building up tree height */
+			state = ALLOC_GROW_HEIGHT;
+			iblks = height - ip->i_height;
+			branch_start = metapath_branch_start(mp);
+			iblks += (height - branch_start);
+		}
+	}
+
+	/* start of the second part of the function (state machine) */
+
+	blks = dblks + iblks;
+	i = sheight;
+	do {
+		int error;
+		n = blks - alloced;
+		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+		if (error)
+			return error;
+		alloced += n;
+		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
+			gfs2_trans_add_unrevoke(sdp, bn, n);
+		switch (state) {
+		/* Growing height of tree */
+		case ALLOC_GROW_HEIGHT:
+			if (i == 1) {
+				ptr = (__be64 *)(dibh->b_data +
+						 sizeof(struct gfs2_dinode));
+				zero_bn = *ptr;
+			}
+			for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
+				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
+			if (i - 1 == height - ip->i_height) {
+				i--;
+				gfs2_buffer_copy_tail(mp->mp_bh[i],
+						sizeof(struct gfs2_meta_header),
+						dibh, sizeof(struct gfs2_dinode));
+				gfs2_buffer_clear_tail(dibh,
+						sizeof(struct gfs2_dinode) +
+						sizeof(__be64));
+				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
+					sizeof(struct gfs2_meta_header));
+				*ptr = zero_bn;
+				state = ALLOC_GROW_DEPTH;
+				for(i = branch_start; i < height; i++) {
+					if (mp->mp_bh[i] == NULL)
+						break;
+					brelse(mp->mp_bh[i]);
+					mp->mp_bh[i] = NULL;
+				}
+				i = branch_start;
+			}
+			if (n == 0)
+				break;
+		/* Branching from existing tree */
+		case ALLOC_GROW_DEPTH:
+			if (i > 1 && i < height)
+				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
+			for (; i < height && n > 0; i++, n--)
+				gfs2_indirect_init(mp, ip->i_gl, i,
+						   mp->mp_list[i-1], bn++);
+			if (i == height)
+				state = ALLOC_DATA;
+			if (n == 0)
+				break;
+		/* Tree complete, adding data blocks */
+		case ALLOC_DATA:
+			BUG_ON(n > dblks);
+			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
+			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
+			dblks = n;
+			ptr = metapointer(end_of_metadata, mp);
+			dblock = bn;
+			while (n-- > 0)
+				*ptr++ = cpu_to_be64(bn++);
+			if (buffer_zeronew(bh_map)) {
+				ret = sb_issue_zeroout(sb, dblock, dblks,
+						       GFP_NOFS);
+				if (ret) {
+					fs_err(sdp,
+					       "Failed to zero data buffers\n");
+					clear_buffer_zeronew(bh_map);
+				}
+			}
+			break;
+		}
+	} while ((state != ALLOC_DATA) || !dblock);
+
+	ip->i_height = height;
+	gfs2_add_inode_blocks(&ip->i_inode, alloced);
+	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
+	map_bh(bh_map, inode->i_sb, dblock);
+	bh_map->b_size = dblks << inode->i_blkbits;
+	set_buffer_new(bh_map);
+	return 0;
+}
+
+/**
+ * gfs2_block_map - Map a block from an inode to a disk block
+ * @inode: The inode
+ * @lblock: The logical block number
+ * @bh_map: The bh to be mapped
+ * @create: True if its ok to alloc blocks to satify the request
+ *
+ * Sets buffer_mapped() if successful, sets buffer_boundary() if a
+ * read of metadata will be required before the next block can be
+ * mapped. Sets buffer_new() if new blocks were allocated.
+ *
+ * Returns: errno
+ */
+
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+		   struct buffer_head *bh_map, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned int bsize = sdp->sd_sb.sb_bsize;
+	const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
+	const u64 *arr = sdp->sd_heightsize;
+	__be64 *ptr;
+	u64 size;
+	struct metapath mp;
+	int ret;
+	int eob;
+	unsigned int len;
+	struct buffer_head *bh;
+	u8 height;
+
+	BUG_ON(maxlen == 0);
+
+	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+	bmap_lock(ip, create);
+	clear_buffer_mapped(bh_map);
+	clear_buffer_new(bh_map);
+	clear_buffer_boundary(bh_map);
+	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
+	if (gfs2_is_dir(ip)) {
+		bsize = sdp->sd_jbsize;
+		arr = sdp->sd_jheightsize;
+	}
+
+	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+	if (ret)
+		goto out;
+
+	height = ip->i_height;
+	size = (lblock + 1) * bsize;
+	while (size > arr[height])
+		height++;
+	find_metapath(sdp, lblock, &mp, height);
+	ret = 1;
+	if (height > ip->i_height || gfs2_is_stuffed(ip))
+		goto do_alloc;
+	ret = lookup_metapath(ip, &mp);
+	if (ret < 0)
+		goto out;
+	if (ret != ip->i_height)
+		goto do_alloc;
+	ptr = metapointer(ip->i_height - 1, &mp);
+	if (*ptr == 0)
+		goto do_alloc;
+	map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
+	bh = mp.mp_bh[ip->i_height - 1];
+	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
+	bh_map->b_size = (len << inode->i_blkbits);
+	if (eob)
+		set_buffer_boundary(bh_map);
+	ret = 0;
+out:
+	release_metapath(&mp);
+	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
+	bmap_unlock(ip, create);
+	return ret;
+
+do_alloc:
+	/* All allocations are done here, firstly check create flag */
+	if (!create) {
+		BUG_ON(gfs2_is_stuffed(ip));
+		ret = 0;
+		goto out;
+	}
+
+	/* At this point ret is the tree depth of already allocated blocks */
+	ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
+	goto out;
+}
+
+/*
+ * Deprecated: do not use in new code
+ */
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
+{
+	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
+	int ret;
+	int create = *new;
+
+	BUG_ON(!extlen);
+	BUG_ON(!dblock);
+	BUG_ON(!new);
+
+	bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
+	ret = gfs2_block_map(inode, lblock, &bh, create);
+	*extlen = bh.b_size >> inode->i_blkbits;
+	*dblock = bh.b_blocknr;
+	if (buffer_new(&bh))
+		*new = 1;
+	else
+		*new = 0;
+	return ret;
+}
+
+/**
+ * do_strip - Look for a layer a particular layer of the file and strip it off
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @bh: A buffer of pointers
+ * @top: The first pointer in the buffer
+ * @bottom: One more than the last pointer
+ * @height: the height this buffer is at
+ * @sm: a pointer to a struct strip_mine
+ *
+ * Returns: errno
+ */
+
+static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
+		    struct buffer_head *bh, __be64 *top, __be64 *bottom,
+		    unsigned int height, struct strip_mine *sm)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrp_list rlist;
+	u64 bn, bstart;
+	u32 blen, btotal;
+	__be64 *p;
+	unsigned int rg_blocks = 0;
+	int metadata;
+	unsigned int revokes = 0;
+	int x;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	if (!*top)
+		sm->sm_first = 0;
+
+	if (height != sm->sm_height)
+		return 0;
+
+	if (sm->sm_first) {
+		top++;
+		sm->sm_first = 0;
+	}
+
+	metadata = (height != ip->i_height - 1);
+	if (metadata)
+		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
+	else if (ip->i_depth)
+		revokes = sdp->sd_inptrs;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+	bstart = 0;
+	blen = 0;
+
+	for (p = top; p < bottom; p++) {
+		if (!*p)
+			continue;
+
+		bn = be64_to_cpu(*p);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_rlist_add(ip, &rlist, bstart);
+
+			bstart = bn;
+			blen = 1;
+		}
+	}
+
+	if (bstart)
+		gfs2_rlist_add(ip, &rlist, bstart);
+	else
+		goto out; /* Nothing to do */
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		struct gfs2_rgrpd *rgd;
+		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+		rg_blocks += rgd->rd_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist;
+
+	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
+		gfs2_rs_deltree(ip->i_res);
+
+	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
+				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
+				 revokes);
+	if (error)
+		goto out_rg_gunlock;
+
+	down_write(&ip->i_rw_mutex);
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	gfs2_trans_add_meta(ip->i_gl, bh);
+
+	bstart = 0;
+	blen = 0;
+	btotal = 0;
+
+	for (p = top; p < bottom; p++) {
+		if (!*p)
+			continue;
+
+		bn = be64_to_cpu(*p);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart) {
+				__gfs2_free_blocks(ip, bstart, blen, metadata);
+				btotal += blen;
+			}
+
+			bstart = bn;
+			blen = 1;
+		}
+
+		*p = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart) {
+		__gfs2_free_blocks(ip, bstart, blen, metadata);
+		btotal += blen;
+	}
+
+	gfs2_statfs_change(sdp, 0, +btotal, 0);
+	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+			  ip->i_inode.i_gid);
+
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+
+	gfs2_dinode_out(ip, dibh->b_data);
+
+	up_write(&ip->i_rw_mutex);
+
+	gfs2_trans_end(sdp);
+
+out_rg_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+	gfs2_rlist_free(&rlist);
+out:
+	return error;
+}
+
+/**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @sm: data opaque to this function to pass to @bc
+ *
+ * When this is first called @height and @block should be zero and
+ * @first should be 1.
+ *
+ * Returns: errno
+ */
+
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+			  struct metapath *mp, unsigned int height,
+			  u64 block, int first, struct strip_mine *sm)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *bh = NULL;
+	__be64 *top, *bottom;
+	u64 bn;
+	int error;
+	int mh_size = sizeof(struct gfs2_meta_header);
+
+	if (!height) {
+		error = gfs2_meta_inode_buffer(ip, &bh);
+		if (error)
+			return error;
+		dibh = bh;
+
+		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
+		bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
+	} else {
+		error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
+		if (error)
+			return error;
+
+		top = (__be64 *)(bh->b_data + mh_size) +
+				  (first ? mp->mp_list[height] : 0);
+
+		bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+	}
+
+	error = do_strip(ip, dibh, bh, top, bottom, height, sm);
+	if (error)
+		goto out;
+
+	if (height < ip->i_height - 1) {
+
+		gfs2_metapath_ra(ip->i_gl, bh, top);
+
+		for (; top < bottom; top++, first = 0) {
+			if (!*top)
+				continue;
+
+			bn = be64_to_cpu(*top);
+
+			error = recursive_scan(ip, dibh, mp, height + 1, bn,
+					       first, sm);
+			if (error)
+				break;
+		}
+	}
+out:
+	brelse(bh);
+	return error;
+}
+
+
+/**
+ * gfs2_block_truncate_page - Deal with zeroing out data for truncate
+ *
+ * This is partly borrowed from ext3.
+ */
+static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
+{
+	struct inode *inode = mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned long index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize, iblock, length, pos;
+	struct buffer_head *bh;
+	struct page *page;
+	int err;
+
+	page = find_or_create_page(mapping, index, GFP_NOFS);
+	if (!page)
+		return 0;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	err = 0;
+
+	if (!buffer_mapped(bh)) {
+		gfs2_block_map(inode, iblock, bh, 0);
+		/* unmapped? It's a hole - nothing to do */
+		if (!buffer_mapped(bh))
+			goto unlock;
+	}
+
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh)) {
+		err = -EIO;
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (!buffer_uptodate(bh))
+			goto unlock;
+		err = 0;
+	}
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_trans_add_data(ip->i_gl, bh);
+
+	zero_user(page, offset, length);
+	mark_buffer_dirty(bh);
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+#define GFS2_JTRUNC_REVOKES 8192
+
+/**
+ * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
+ * @inode: The inode being truncated
+ * @oldsize: The original (larger) size
+ * @newsize: The new smaller size
+ *
+ * With jdata files, we have to journal a revoke for each block which is
+ * truncated. As a result, we need to split this into separate transactions
+ * if the number of pages being truncated gets too large.
+ */
+
+static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
+	u64 chunk;
+	int error;
+
+	while (oldsize != newsize) {
+		chunk = oldsize - newsize;
+		if (chunk > max_chunk)
+			chunk = max_chunk;
+		truncate_pagecache(inode, oldsize - chunk);
+		oldsize -= chunk;
+		gfs2_trans_end(sdp);
+		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct address_space *mapping = inode->i_mapping;
+	struct buffer_head *dibh;
+	int journaled = gfs2_is_jdata(ip);
+	int error;
+
+	if (journaled)
+		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
+	else
+		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+
+	if (gfs2_is_stuffed(ip)) {
+		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
+	} else {
+		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
+			error = gfs2_block_truncate_page(mapping, newsize);
+			if (error)
+				goto out_brelse;
+		}
+		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
+	}
+
+	i_size_write(inode, newsize);
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_dinode_out(ip, dibh->b_data);
+
+	if (journaled)
+		error = gfs2_journaled_truncate(inode, oldsize, newsize);
+	else
+		truncate_pagecache(inode, newsize);
+
+	if (error) {
+		brelse(dibh);
+		return error;
+	}
+
+out_brelse:
+	brelse(dibh);
+out:
+	gfs2_trans_end(sdp);
+	return error;
+}
+
+static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int height = ip->i_height;
+	u64 lblock;
+	struct metapath mp;
+	int error;
+
+	if (!size)
+		lblock = 0;
+	else
+		lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
+
+	find_metapath(sdp, lblock, &mp, ip->i_height);
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (error)
+		return error;
+
+	while (height--) {
+		struct strip_mine sm;
+		sm.sm_first = !!size;
+		sm.sm_height = height;
+
+		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
+		if (error)
+			break;
+	}
+
+	gfs2_quota_unhold(ip);
+
+	return error;
+}
+
+static int trunc_end(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+
+	down_write(&ip->i_rw_mutex);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	if (!i_size_read(&ip->i_inode)) {
+		ip->i_height = 0;
+		ip->i_goal = ip->i_no_addr;
+		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+		gfs2_ordered_del_inode(ip);
+	}
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+
+out:
+	up_write(&ip->i_rw_mutex);
+	gfs2_trans_end(sdp);
+	return error;
+}
+
+/**
+ * do_shrink - make a file smaller
+ * @inode: the inode
+ * @oldsize: the current inode size
+ * @newsize: the size to make the file
+ *
+ * Called with an exclusive lock on @inode. The @size must
+ * be equal to or smaller than the current inode size.
+ *
+ * Returns: errno
+ */
+
+static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int error;
+
+	error = trunc_start(inode, oldsize, newsize);
+	if (error < 0)
+		return error;
+	if (gfs2_is_stuffed(ip))
+		return 0;
+
+	error = trunc_dealloc(ip, newsize);
+	if (error == 0)
+		error = trunc_end(ip);
+
+	return error;
+}
+
+void gfs2_trim_blocks(struct inode *inode)
+{
+	u64 size = inode->i_size;
+	int ret;
+
+	ret = do_shrink(inode, size, size);
+	WARN_ON(ret != 0);
+}
+
+/**
+ * do_grow - Touch and update inode size
+ * @inode: The inode
+ * @size: The new size
+ *
+ * This function updates the timestamps on the inode and
+ * may also increase the size of the inode. This function
+ * must not be called with @size any smaller than the current
+ * inode size.
+ *
+ * Although it is not strictly required to unstuff files here,
+ * earlier versions of GFS2 have a bug in the stuffed file reading
+ * code which will result in a buffer overrun if the size is larger
+ * than the max stuffed file size. In order to prevent this from
+ * occurring, such files are unstuffed, but in other cases we can
+ * just update the inode size directly.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
+static int do_grow(struct inode *inode, u64 size)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_alloc_parms ap = { .target = 1, };
+	struct buffer_head *dibh;
+	int error;
+	int unstuff = 0;
+
+	if (gfs2_is_stuffed(ip) &&
+	    (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
+		error = gfs2_quota_lock_check(ip, &ap);
+		if (error)
+			return error;
+
+		error = gfs2_inplace_reserve(ip, &ap);
+		if (error)
+			goto do_grow_qunlock;
+		unstuff = 1;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
+				  0 : RES_QUOTA), 0);
+	if (error)
+		goto do_grow_release;
+
+	if (unstuff) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (error)
+			goto do_end_trans;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto do_end_trans;
+
+	i_size_write(inode, size);
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+
+do_end_trans:
+	gfs2_trans_end(sdp);
+do_grow_release:
+	if (unstuff) {
+		gfs2_inplace_release(ip);
+do_grow_qunlock:
+		gfs2_quota_unlock(ip);
+	}
+	return error;
+}
+
+/**
+ * gfs2_setattr_size - make a file a given size
+ * @inode: the inode
+ * @newsize: the size to make the file
+ *
+ * The file size can grow, shrink, or stay the same size. This
+ * is called holding i_mutex and an exclusive glock on the inode
+ * in question.
+ *
+ * Returns: errno
+ */
+
+int gfs2_setattr_size(struct inode *inode, u64 newsize)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int ret;
+	u64 oldsize;
+
+	BUG_ON(!S_ISREG(inode->i_mode));
+
+	ret = inode_newsize_ok(inode, newsize);
+	if (ret)
+		return ret;
+
+	ret = get_write_access(inode);
+	if (ret)
+		return ret;
+
+	inode_dio_wait(inode);
+
+	ret = gfs2_rs_alloc(ip);
+	if (ret)
+		goto out;
+
+	oldsize = inode->i_size;
+	if (newsize >= oldsize) {
+		ret = do_grow(inode, newsize);
+		goto out;
+	}
+
+	gfs2_rs_deltree(ip->i_res);
+	ret = do_shrink(inode, oldsize, newsize);
+out:
+	put_write_access(inode);
+	return ret;
+}
+
+int gfs2_truncatei_resume(struct gfs2_inode *ip)
+{
+	int error;
+	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
+	if (!error)
+		error = trunc_end(ip);
+	return error;
+}
+
+int gfs2_file_dealloc(struct gfs2_inode *ip)
+{
+	return trunc_dealloc(ip, 0);
+}
+
+/**
+ * gfs2_free_journal_extents - Free cached journal bmap info
+ * @jd: The journal
+ *
+ */
+
+void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
+{
+	struct gfs2_journal_extent *jext;
+
+	while(!list_empty(&jd->extent_list)) {
+		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
+		list_del(&jext->list);
+		kfree(jext);
+	}
+}
+
+/**
+ * gfs2_add_jextent - Add or merge a new extent to extent cache
+ * @jd: The journal descriptor
+ * @lblock: The logical block at start of new extent
+ * @dblock: The physical block at start of new extent
+ * @blocks: Size of extent in fs blocks
+ *
+ * Returns: 0 on success or -ENOMEM
+ */
+
+static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
+{
+	struct gfs2_journal_extent *jext;
+
+	if (!list_empty(&jd->extent_list)) {
+		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
+		if ((jext->dblock + jext->blocks) == dblock) {
+			jext->blocks += blocks;
+			return 0;
+		}
+	}
+
+	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
+	if (jext == NULL)
+		return -ENOMEM;
+	jext->dblock = dblock;
+	jext->lblock = lblock;
+	jext->blocks = blocks;
+	list_add_tail(&jext->list, &jd->extent_list);
+	jd->nr_extents++;
+	return 0;
+}
+
+/**
+ * gfs2_map_journal_extents - Cache journal bmap info
+ * @sdp: The super block
+ * @jd: The journal to map
+ *
+ * Create a reusable "extent" mapping from all logical
+ * blocks to all physical blocks for the given journal.  This will save
+ * us time when writing journal blocks.  Most journals will have only one
+ * extent that maps all their logical blocks.  That's because gfs2.mkfs
+ * arranges the journal blocks sequentially to maximize performance.
+ * So the extent would map the first block for the entire file length.
+ * However, gfs2_jadd can happen while file activity is happening, so
+ * those journals may not be sequential.  Less likely is the case where
+ * the users created their own journals by mounting the metafs and
+ * laying it out.  But it's still possible.  These journals might have
+ * several extents.
+ *
+ * Returns: 0 on success, or error on failure
+ */
+
+int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
+{
+	u64 lblock = 0;
+	u64 lblock_stop;
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct buffer_head bh;
+	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+	u64 size;
+	int rc;
+
+	lblock_stop = i_size_read(jd->jd_inode) >> shift;
+	size = (lblock_stop - lblock) << shift;
+	jd->nr_extents = 0;
+	WARN_ON(!list_empty(&jd->extent_list));
+
+	do {
+		bh.b_state = 0;
+		bh.b_blocknr = 0;
+		bh.b_size = size;
+		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
+		if (rc || !buffer_mapped(&bh))
+			goto fail;
+		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
+		if (rc)
+			goto fail;
+		size -= bh.b_size;
+		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
+	} while(size > 0);
+
+	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
+		jd->nr_extents);
+	return 0;
+
+fail:
+	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
+		rc, jd->jd_jid,
+		(unsigned long long)(i_size_read(jd->jd_inode) - size),
+		jd->nr_extents);
+	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
+		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
+		bh.b_state, (unsigned long long)bh.b_size);
+	gfs2_free_journal_extents(jd);
+	return rc;
+}
+
+/**
+ * gfs2_write_alloc_required - figure out if a write will require an allocation
+ * @ip: the file being written to
+ * @offset: the offset to write to
+ * @len: the number of bytes being written
+ *
+ * Returns: 1 if an alloc is required, 0 otherwise
+ */
+
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+			      unsigned int len)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head bh;
+	unsigned int shift;
+	u64 lblock, lblock_stop, size;
+	u64 end_of_file;
+
+	if (!len)
+		return 0;
+
+	if (gfs2_is_stuffed(ip)) {
+		if (offset + len >
+		    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+			return 1;
+		return 0;
+	}
+
+	shift = sdp->sd_sb.sb_bsize_shift;
+	BUG_ON(gfs2_is_dir(ip));
+	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
+	lblock = offset >> shift;
+	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+	if (lblock_stop > end_of_file)
+		return 1;
+
+	size = (lblock_stop - lblock) << shift;
+	do {
+		bh.b_state = 0;
+		bh.b_size = size;
+		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
+		if (!buffer_mapped(&bh))
+			return 1;
+		size -= bh.b_size;
+		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
+	} while(size > 0);
+
+	return 0;
+}
+
diff --git a/kernel/fs/gfs2/bmap.h b/kernel/fs/gfs2/bmap.h
new file mode 100644
index 000000000..81ded5e2a
--- /dev/null
+++ b/kernel/fs/gfs2/bmap.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __BMAP_DOT_H__
+#define __BMAP_DOT_H__
+
+#include "inode.h"
+
+struct inode;
+struct gfs2_inode;
+struct page;
+
+
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ */
+
+static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
+					  unsigned int len,
+					  unsigned int *data_blocks,
+					  unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int tmp;
+
+	BUG_ON(gfs2_is_dir(ip));
+	*data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+	*ind_blocks = 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		*ind_blocks += tmp;
+	}
+}
+
+extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_block_map(struct inode *inode, sector_t lblock,
+			  struct buffer_head *bh, int create);
+extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
+			   u64 *dblock, unsigned *extlen);
+extern int gfs2_setattr_size(struct inode *inode, u64 size);
+extern void gfs2_trim_blocks(struct inode *inode);
+extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
+extern int gfs2_file_dealloc(struct gfs2_inode *ip);
+extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+				     unsigned int len);
+extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
+extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
+
+#endif /* __BMAP_DOT_H__ */
diff --git a/kernel/fs/gfs2/dentry.c b/kernel/fs/gfs2/dentry.c
new file mode 100644
index 000000000..30822b148
--- /dev/null
+++ b/kernel/fs/gfs2/dentry.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/namei.h>
+#include <linux/crc32.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "super.h"
+#include "util.h"
+#include "inode.h"
+
+/**
+ * gfs2_drevalidate - Check directory lookup consistency
+ * @dentry: the mapping to check
+ * @flags: lookup flags
+ *
+ * Check to make sure the lookup necessary to arrive at this inode from its
+ * parent is still good.
+ *
+ * Returns: 1 if the dentry is ok, 0 if it isn't
+ */
+
+static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct dentry *parent;
+	struct gfs2_sbd *sdp;
+	struct gfs2_inode *dip;
+	struct inode *inode;
+	struct gfs2_holder d_gh;
+	struct gfs2_inode *ip = NULL;
+	int error;
+	int had_lock = 0;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	parent = dget_parent(dentry);
+	sdp = GFS2_SB(d_inode(parent));
+	dip = GFS2_I(d_inode(parent));
+	inode = d_inode(dentry);
+
+	if (inode) {
+		if (is_bad_inode(inode))
+			goto invalid;
+		ip = GFS2_I(inode);
+	}
+
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto valid;
+
+	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
+	if (!had_lock) {
+		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+		if (error)
+			goto fail;
+	} 
+
+	error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip);
+	switch (error) {
+	case 0:
+		if (!inode)
+			goto invalid_gunlock;
+		break;
+	case -ENOENT:
+		if (!inode)
+			goto valid_gunlock;
+		goto invalid_gunlock;
+	default:
+		goto fail_gunlock;
+	}
+
+valid_gunlock:
+	if (!had_lock)
+		gfs2_glock_dq_uninit(&d_gh);
+valid:
+	dput(parent);
+	return 1;
+
+invalid_gunlock:
+	if (!had_lock)
+		gfs2_glock_dq_uninit(&d_gh);
+invalid:
+	dput(parent);
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq_uninit(&d_gh);
+fail:
+	dput(parent);
+	return 0;
+}
+
+static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
+{
+	str->hash = gfs2_disk_hash(str->name, str->len);
+	return 0;
+}
+
+static int gfs2_dentry_delete(const struct dentry *dentry)
+{
+	struct gfs2_inode *ginode;
+
+	if (d_really_is_negative(dentry))
+		return 0;
+
+	ginode = GFS2_I(d_inode(dentry));
+	if (!ginode->i_iopen_gh.gh_gl)
+		return 0;
+
+	if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
+		return 1;
+
+	return 0;
+}
+
+const struct dentry_operations gfs2_dops = {
+	.d_revalidate = gfs2_drevalidate,
+	.d_hash = gfs2_dhash,
+	.d_delete = gfs2_dentry_delete,
+};
+
diff --git a/kernel/fs/gfs2/dir.c b/kernel/fs/gfs2/dir.c
new file mode 100644
index 000000000..487527b42
--- /dev/null
+++ b/kernel/fs/gfs2/dir.c
@@ -0,0 +1,2090 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+/*
+ * Implements Extendible Hashing as described in:
+ *   "Extendible Hashing" by Fagin, et al in
+ *     __ACM Trans. on Database Systems__, Sept 1979.
+ *
+ *
+ * Here's the layout of dirents which is essentially the same as that of ext2
+ * within a single block. The field de_name_len is the number of bytes
+ * actually required for the name (no null terminator). The field de_rec_len
+ * is the number of bytes allocated to the dirent. The offset of the next
+ * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
+ * deleted, the preceding dirent inherits its allocated space, ie
+ * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
+ * by adding de_rec_len to the current dirent, this essentially causes the
+ * deleted dirent to get jumped over when iterating through all the dirents.
+ *
+ * When deleting the first dirent in a block, there is no previous dirent so
+ * the field de_ino is set to zero to designate it as deleted. When allocating
+ * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
+ * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
+ * dirent is allocated. Otherwise it must go through all the 'used' dirents
+ * searching for one in which the amount of total space minus the amount of
+ * used space will provide enough space for the new dirent.
+ *
+ * There are two types of blocks in which dirents reside. In a stuffed dinode,
+ * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
+ * the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
+ * beginning of the leaf block. The dirents reside in leaves when
+ *
+ * dip->i_diskflags & GFS2_DIF_EXHASH is true
+ *
+ * Otherwise, the dirents are "linear", within a single stuffed dinode block.
+ *
+ * When the dirents are in leaves, the actual contents of the directory file are
+ * used as an array of 64-bit block pointers pointing to the leaf blocks. The
+ * dirents are NOT in the directory file itself. There can be more than one
+ * block pointer in the array that points to the same leaf. In fact, when a
+ * directory is first converted from linear to exhash, all of the pointers
+ * point to the same leaf.
+ *
+ * When a leaf is completely full, the size of the hash table can be
+ * doubled unless it is already at the maximum size which is hard coded into
+ * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
+ * but never before the maximum hash table size has been reached.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/vmalloc.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "bmap.h"
+#include "util.h"
+
+#define IS_LEAF     1 /* Hashed (leaf) directory */
+#define IS_DINODE   2 /* Linear (stuffed dinode block) directory */
+
+#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
+
+#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
+#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+
+struct qstr gfs2_qdot __read_mostly;
+struct qstr gfs2_qdotdot __read_mostly;
+
+typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
+			    const struct qstr *name, void *opaque);
+
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+			    struct buffer_head **bhp)
+{
+	struct buffer_head *bh;
+
+	bh = gfs2_meta_new(ip->i_gl, block);
+	gfs2_trans_add_meta(ip->i_gl, bh);
+	gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
+	gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+	*bhp = bh;
+	return 0;
+}
+
+static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
+					struct buffer_head **bhp)
+{
+	struct buffer_head *bh;
+	int error;
+
+	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
+	if (error)
+		return error;
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
+		brelse(bh);
+		return -EIO;
+	}
+	*bhp = bh;
+	return 0;
+}
+
+static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
+				  unsigned int offset, unsigned int size)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
+	if (ip->i_inode.i_size < offset + size)
+		i_size_write(&ip->i_inode, offset + size);
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_dinode_out(ip, dibh->b_data);
+
+	brelse(dibh);
+
+	return size;
+}
+
+
+
+/**
+ * gfs2_dir_write_data - Write directory information to the inode
+ * @ip: The GFS2 inode
+ * @buf: The buffer containing information to be written
+ * @offset: The file offset to start writing at
+ * @size: The amount of data to write
+ *
+ * Returns: The number of bytes correctly written or error code
+ */
+static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
+			       u64 offset, unsigned int size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	u64 lblock, dblock;
+	u32 extlen = 0;
+	unsigned int o;
+	int copied = 0;
+	int error = 0;
+	int new = 0;
+
+	if (!size)
+		return 0;
+
+	if (gfs2_is_stuffed(ip) &&
+	    offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+		return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
+					      size);
+
+	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+		return -EINVAL;
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (error)
+			return error;
+	}
+
+	lblock = offset;
+	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+
+	while (copied < size) {
+		unsigned int amount;
+		struct buffer_head *bh;
+
+		amount = size - copied;
+		if (amount > sdp->sd_sb.sb_bsize - o)
+			amount = sdp->sd_sb.sb_bsize - o;
+
+		if (!extlen) {
+			new = 1;
+			error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+						&dblock, &extlen);
+			if (error)
+				goto fail;
+			error = -EIO;
+			if (gfs2_assert_withdraw(sdp, dblock))
+				goto fail;
+		}
+
+		if (amount == sdp->sd_jbsize || new)
+			error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
+		else
+			error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
+
+		if (error)
+			goto fail;
+
+		gfs2_trans_add_meta(ip->i_gl, bh);
+		memcpy(bh->b_data + o, buf, amount);
+		brelse(bh);
+
+		buf += amount;
+		copied += amount;
+		lblock++;
+		dblock++;
+		extlen--;
+
+		o = sizeof(struct gfs2_meta_header);
+	}
+
+out:
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	if (ip->i_inode.i_size < offset + copied)
+		i_size_write(&ip->i_inode, offset + copied);
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+
+	return copied;
+fail:
+	if (copied)
+		goto out;
+	return error;
+}
+
+static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf,
+				 unsigned int size)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
+		brelse(dibh);
+	}
+
+	return (error) ? error : size;
+}
+
+
+/**
+ * gfs2_dir_read_data - Read a data from a directory inode
+ * @ip: The GFS2 Inode
+ * @buf: The buffer to place result into
+ * @size: Amount of data to transfer
+ *
+ * Returns: The amount of data actually copied or the error
+ */
+static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
+			      unsigned int size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 lblock, dblock;
+	u32 extlen = 0;
+	unsigned int o;
+	int copied = 0;
+	int error = 0;
+
+	if (gfs2_is_stuffed(ip))
+		return gfs2_dir_read_stuffed(ip, buf, size);
+
+	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+		return -EINVAL;
+
+	lblock = 0;
+	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+
+	while (copied < size) {
+		unsigned int amount;
+		struct buffer_head *bh;
+		int new;
+
+		amount = size - copied;
+		if (amount > sdp->sd_sb.sb_bsize - o)
+			amount = sdp->sd_sb.sb_bsize - o;
+
+		if (!extlen) {
+			new = 0;
+			error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+						&dblock, &extlen);
+			if (error || !dblock)
+				goto fail;
+			BUG_ON(extlen < 1);
+			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+		} else {
+			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+			if (error)
+				goto fail;
+		}
+		error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
+		if (error) {
+			brelse(bh);
+			goto fail;
+		}
+		dblock++;
+		extlen--;
+		memcpy(buf, bh->b_data + o, amount);
+		brelse(bh);
+		buf += (amount/sizeof(__be64));
+		copied += amount;
+		lblock++;
+		o = sizeof(struct gfs2_meta_header);
+	}
+
+	return copied;
+fail:
+	return (copied) ? copied : error;
+}
+
+/**
+ * gfs2_dir_get_hash_table - Get pointer to the dir hash table
+ * @ip: The inode in question
+ *
+ * Returns: The hash table or an error
+ */
+
+static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
+{
+	struct inode *inode = &ip->i_inode;
+	int ret;
+	u32 hsize;
+	__be64 *hc;
+
+	BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH));
+
+	hc = ip->i_hash_cache;
+	if (hc)
+		return hc;
+
+	hsize = 1 << ip->i_depth;
+	hsize *= sizeof(__be64);
+	if (hsize != i_size_read(&ip->i_inode)) {
+		gfs2_consist_inode(ip);
+		return ERR_PTR(-EIO);
+	}
+
+	hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN);
+	if (hc == NULL)
+		hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL);
+
+	if (hc == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ret = gfs2_dir_read_data(ip, hc, hsize);
+	if (ret < 0) {
+		kvfree(hc);
+		return ERR_PTR(ret);
+	}
+
+	spin_lock(&inode->i_lock);
+	if (likely(!ip->i_hash_cache)) {
+		ip->i_hash_cache = hc;
+		hc = NULL;
+	}
+	spin_unlock(&inode->i_lock);
+	kvfree(hc);
+
+	return ip->i_hash_cache;
+}
+
+/**
+ * gfs2_dir_hash_inval - Invalidate dir hash
+ * @ip: The directory inode
+ *
+ * Must be called with an exclusive glock, or during glock invalidation.
+ */
+void gfs2_dir_hash_inval(struct gfs2_inode *ip)
+{
+	__be64 *hc = ip->i_hash_cache;
+	ip->i_hash_cache = NULL;
+	kvfree(hc);
+}
+
+static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
+{
+	return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0;
+}
+
+static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
+				     const struct qstr *name, int ret)
+{
+	if (!gfs2_dirent_sentinel(dent) &&
+	    be32_to_cpu(dent->de_hash) == name->hash &&
+	    be16_to_cpu(dent->de_name_len) == name->len &&
+	    memcmp(dent+1, name->name, name->len) == 0)
+		return ret;
+	return 0;
+}
+
+static int gfs2_dirent_find(const struct gfs2_dirent *dent,
+			    const struct qstr *name,
+			    void *opaque)
+{
+	return __gfs2_dirent_find(dent, name, 1);
+}
+
+static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
+			    const struct qstr *name,
+			    void *opaque)
+{
+	return __gfs2_dirent_find(dent, name, 2);
+}
+
+/*
+ * name->name holds ptr to start of block.
+ * name->len holds size of block.
+ */
+static int gfs2_dirent_last(const struct gfs2_dirent *dent,
+			    const struct qstr *name,
+			    void *opaque)
+{
+	const char *start = name->name;
+	const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
+	if (name->len == (end - start))
+		return 1;
+	return 0;
+}
+
+static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
+				  const struct qstr *name,
+				  void *opaque)
+{
+	unsigned required = GFS2_DIRENT_SIZE(name->len);
+	unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+	unsigned totlen = be16_to_cpu(dent->de_rec_len);
+
+	if (gfs2_dirent_sentinel(dent))
+		actual = 0;
+	if (totlen - actual >= required)
+		return 1;
+	return 0;
+}
+
+struct dirent_gather {
+	const struct gfs2_dirent **pdent;
+	unsigned offset;
+};
+
+static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
+			      const struct qstr *name,
+			      void *opaque)
+{
+	struct dirent_gather *g = opaque;
+	if (!gfs2_dirent_sentinel(dent)) {
+		g->pdent[g->offset++] = dent;
+	}
+	return 0;
+}
+
+/*
+ * Other possible things to check:
+ * - Inode located within filesystem size (and on valid block)
+ * - Valid directory entry type
+ * Not sure how heavy-weight we want to make this... could also check
+ * hash is correct for example, but that would take a lot of extra time.
+ * For now the most important thing is to check that the various sizes
+ * are correct.
+ */
+static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
+			     unsigned int size, unsigned int len, int first)
+{
+	const char *msg = "gfs2_dirent too small";
+	if (unlikely(size < sizeof(struct gfs2_dirent)))
+		goto error;
+	msg = "gfs2_dirent misaligned";
+	if (unlikely(offset & 0x7))
+		goto error;
+	msg = "gfs2_dirent points beyond end of block";
+	if (unlikely(offset + size > len))
+		goto error;
+	msg = "zero inode number";
+	if (unlikely(!first && gfs2_dirent_sentinel(dent)))
+		goto error;
+	msg = "name length is greater than space in dirent";
+	if (!gfs2_dirent_sentinel(dent) &&
+	    unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
+		     size))
+		goto error;
+	return 0;
+error:
+	pr_warn("%s: %s (%s)\n",
+		__func__, msg, first ? "first in block" : "not first in block");
+	return -EIO;
+}
+
+static int gfs2_dirent_offset(const void *buf)
+{
+	const struct gfs2_meta_header *h = buf;
+	int offset;
+
+	BUG_ON(buf == NULL);
+
+	switch(be32_to_cpu(h->mh_type)) {
+	case GFS2_METATYPE_LF:
+		offset = sizeof(struct gfs2_leaf);
+		break;
+	case GFS2_METATYPE_DI:
+		offset = sizeof(struct gfs2_dinode);
+		break;
+	default:
+		goto wrong_type;
+	}
+	return offset;
+wrong_type:
+	pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type));
+	return -1;
+}
+
+static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
+					    unsigned int len, gfs2_dscan_t scan,
+					    const struct qstr *name,
+					    void *opaque)
+{
+	struct gfs2_dirent *dent, *prev;
+	unsigned offset;
+	unsigned size;
+	int ret = 0;
+
+	ret = gfs2_dirent_offset(buf);
+	if (ret < 0)
+		goto consist_inode;
+
+	offset = ret;
+	prev = NULL;
+	dent = buf + offset;
+	size = be16_to_cpu(dent->de_rec_len);
+	if (gfs2_check_dirent(dent, offset, size, len, 1))
+		goto consist_inode;
+	do {
+		ret = scan(dent, name, opaque);
+		if (ret)
+			break;
+		offset += size;
+		if (offset == len)
+			break;
+		prev = dent;
+		dent = buf + offset;
+		size = be16_to_cpu(dent->de_rec_len);
+		if (gfs2_check_dirent(dent, offset, size, len, 0))
+			goto consist_inode;
+	} while(1);
+
+	switch(ret) {
+	case 0:
+		return NULL;
+	case 1:
+		return dent;
+	case 2:
+		return prev ? prev : dent;
+	default:
+		BUG_ON(ret > 0);
+		return ERR_PTR(ret);
+	}
+
+consist_inode:
+	gfs2_consist_inode(GFS2_I(inode));
+	return ERR_PTR(-EIO);
+}
+
+static int dirent_check_reclen(struct gfs2_inode *dip,
+			       const struct gfs2_dirent *d, const void *end_p)
+{
+	const void *ptr = d;
+	u16 rec_len = be16_to_cpu(d->de_rec_len);
+
+	if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
+		goto broken;
+	ptr += rec_len;
+	if (ptr < end_p)
+		return rec_len;
+	if (ptr == end_p)
+		return -ENOENT;
+broken:
+	gfs2_consist_inode(dip);
+	return -EIO;
+}
+
+/**
+ * dirent_next - Next dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Pointer to list of dirents
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
+		       struct gfs2_dirent **dent)
+{
+	struct gfs2_dirent *cur = *dent, *tmp;
+	char *bh_end = bh->b_data + bh->b_size;
+	int ret;
+
+	ret = dirent_check_reclen(dip, cur, bh_end);
+	if (ret < 0)
+		return ret;
+
+	tmp = (void *)cur + ret;
+	ret = dirent_check_reclen(dip, tmp, bh_end);
+	if (ret == -EIO)
+		return ret;
+
+        /* Only the first dent could ever have de_inum.no_addr == 0 */
+	if (gfs2_dirent_sentinel(tmp)) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	*dent = tmp;
+	return 0;
+}
+
+/**
+ * dirent_del - Delete a dirent
+ * @dip: The GFS2 inode
+ * @bh: The buffer
+ * @prev: The previous dirent
+ * @cur: The current dirent
+ *
+ */
+
+static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
+		       struct gfs2_dirent *prev, struct gfs2_dirent *cur)
+{
+	u16 cur_rec_len, prev_rec_len;
+
+	if (gfs2_dirent_sentinel(cur)) {
+		gfs2_consist_inode(dip);
+		return;
+	}
+
+	gfs2_trans_add_meta(dip->i_gl, bh);
+
+	/* If there is no prev entry, this is the first entry in the block.
+	   The de_rec_len is already as big as it needs to be.  Just zero
+	   out the inode number and return.  */
+
+	if (!prev) {
+		cur->de_inum.no_addr = 0;
+		cur->de_inum.no_formal_ino = 0;
+		return;
+	}
+
+	/*  Combine this dentry with the previous one.  */
+
+	prev_rec_len = be16_to_cpu(prev->de_rec_len);
+	cur_rec_len = be16_to_cpu(cur->de_rec_len);
+
+	if ((char *)prev + prev_rec_len != (char *)cur)
+		gfs2_consist_inode(dip);
+	if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
+		gfs2_consist_inode(dip);
+
+	prev_rec_len += cur_rec_len;
+	prev->de_rec_len = cpu_to_be16(prev_rec_len);
+}
+
+/*
+ * Takes a dent from which to grab space as an argument. Returns the
+ * newly created dent.
+ */
+static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
+					    struct gfs2_dirent *dent,
+					    const struct qstr *name,
+					    struct buffer_head *bh)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_dirent *ndent;
+	unsigned offset = 0, totlen;
+
+	if (!gfs2_dirent_sentinel(dent))
+		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+	totlen = be16_to_cpu(dent->de_rec_len);
+	BUG_ON(offset + name->len > totlen);
+	gfs2_trans_add_meta(ip->i_gl, bh);
+	ndent = (struct gfs2_dirent *)((char *)dent + offset);
+	dent->de_rec_len = cpu_to_be16(offset);
+	gfs2_qstr2dirent(name, totlen - offset, ndent);
+	return ndent;
+}
+
+static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
+					     struct buffer_head *bh,
+					     const struct qstr *name)
+{
+	struct gfs2_dirent *dent;
+	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+				gfs2_dirent_find_space, name, NULL);
+	if (!dent || IS_ERR(dent))
+		return dent;
+	return gfs2_init_dirent(inode, dent, name, bh);
+}
+
+static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
+		    struct buffer_head **bhp)
+{
+	int error;
+
+	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
+		/* pr_info("block num=%llu\n", leaf_no); */
+		error = -EIO;
+	}
+
+	return error;
+}
+
+/**
+ * get_leaf_nr - Get a leaf number associated with the index
+ * @dip: The GFS2 inode
+ * @index:
+ * @leaf_out:
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
+		       u64 *leaf_out)
+{
+	__be64 *hash;
+
+	hash = gfs2_dir_get_hash_table(dip);
+	if (IS_ERR(hash))
+		return PTR_ERR(hash);
+	*leaf_out = be64_to_cpu(*(hash + index));
+	return 0;
+}
+
+static int get_first_leaf(struct gfs2_inode *dip, u32 index,
+			  struct buffer_head **bh_out)
+{
+	u64 leaf_no;
+	int error;
+
+	error = get_leaf_nr(dip, index, &leaf_no);
+	if (!error)
+		error = get_leaf(dip, leaf_no, bh_out);
+
+	return error;
+}
+
+static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
+					      const struct qstr *name,
+					      gfs2_dscan_t scan,
+					      struct buffer_head **pbh)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int error;
+
+	if (ip->i_diskflags & GFS2_DIF_EXHASH) {
+		struct gfs2_leaf *leaf;
+		unsigned hsize = 1 << ip->i_depth;
+		unsigned index;
+		u64 ln;
+		if (hsize * sizeof(u64) != i_size_read(inode)) {
+			gfs2_consist_inode(ip);
+			return ERR_PTR(-EIO);
+		}
+
+		index = name->hash >> (32 - ip->i_depth);
+		error = get_first_leaf(ip, index, &bh);
+		if (error)
+			return ERR_PTR(error);
+		do {
+			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+						scan, name, NULL);
+			if (dent)
+				goto got_dent;
+			leaf = (struct gfs2_leaf *)bh->b_data;
+			ln = be64_to_cpu(leaf->lf_next);
+			brelse(bh);
+			if (!ln)
+				break;
+
+			error = get_leaf(ip, ln, &bh);
+		} while(!error);
+
+		return error ? ERR_PTR(error) : NULL;
+	}
+
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		return ERR_PTR(error);
+	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
+got_dent:
+	if (unlikely(dent == NULL || IS_ERR(dent))) {
+		brelse(bh);
+		bh = NULL;
+	}
+	*pbh = bh;
+	return dent;
+}
+
+static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int n = 1;
+	u64 bn;
+	int error;
+	struct buffer_head *bh;
+	struct gfs2_leaf *leaf;
+	struct gfs2_dirent *dent;
+	struct qstr name = { .name = "" };
+	struct timespec tv = CURRENT_TIME;
+
+	error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+	if (error)
+		return NULL;
+	bh = gfs2_meta_new(ip->i_gl, bn);
+	if (!bh)
+		return NULL;
+
+	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
+	gfs2_trans_add_meta(ip->i_gl, bh);
+	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
+	leaf = (struct gfs2_leaf *)bh->b_data;
+	leaf->lf_depth = cpu_to_be16(depth);
+	leaf->lf_entries = 0;
+	leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+	leaf->lf_next = 0;
+	leaf->lf_inode = cpu_to_be64(ip->i_no_addr);
+	leaf->lf_dist = cpu_to_be32(1);
+	leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+	leaf->lf_sec = cpu_to_be64(tv.tv_sec);
+	memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2));
+	dent = (struct gfs2_dirent *)(leaf+1);
+	gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
+	*pbh = bh;
+	return leaf;
+}
+
+/**
+ * dir_make_exhash - Convert a stuffed directory into an ExHash directory
+ * @dip: The GFS2 inode
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dir_make_exhash(struct inode *inode)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_dirent *dent;
+	struct qstr args;
+	struct buffer_head *bh, *dibh;
+	struct gfs2_leaf *leaf;
+	int y;
+	u32 x;
+	__be64 *lp;
+	u64 bn;
+	int error;
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		return error;
+
+	/*  Turn over a new leaf  */
+
+	leaf = new_leaf(inode, &bh, 0);
+	if (!leaf)
+		return -ENOSPC;
+	bn = bh->b_blocknr;
+
+	gfs2_assert(sdp, dip->i_entries < (1 << 16));
+	leaf->lf_entries = cpu_to_be16(dip->i_entries);
+
+	/*  Copy dirents  */
+
+	gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
+			     sizeof(struct gfs2_dinode));
+
+	/*  Find last entry  */
+
+	x = 0;
+	args.len = bh->b_size - sizeof(struct gfs2_dinode) +
+		   sizeof(struct gfs2_leaf);
+	args.name = bh->b_data;
+	dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
+				gfs2_dirent_last, &args, NULL);
+	if (!dent) {
+		brelse(bh);
+		brelse(dibh);
+		return -EIO;
+	}
+	if (IS_ERR(dent)) {
+		brelse(bh);
+		brelse(dibh);
+		return PTR_ERR(dent);
+	}
+
+	/*  Adjust the last dirent's record length
+	   (Remember that dent still points to the last entry.)  */
+
+	dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
+		sizeof(struct gfs2_dinode) -
+		sizeof(struct gfs2_leaf));
+
+	brelse(bh);
+
+	/*  We're done with the new leaf block, now setup the new
+	    hash table.  */
+
+	gfs2_trans_add_meta(dip->i_gl, dibh);
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+	lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
+
+	for (x = sdp->sd_hash_ptrs; x--; lp++)
+		*lp = cpu_to_be64(bn);
+
+	i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
+	gfs2_add_inode_blocks(&dip->i_inode, 1);
+	dip->i_diskflags |= GFS2_DIF_EXHASH;
+
+	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
+	dip->i_depth = y;
+
+	gfs2_dinode_out(dip, dibh->b_data);
+
+	brelse(dibh);
+
+	return 0;
+}
+
+/**
+ * dir_split_leaf - Split a leaf block into two
+ * @dip: The GFS2 inode
+ * @index:
+ * @leaf_no:
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_split_leaf(struct inode *inode, const struct qstr *name)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	struct buffer_head *nbh, *obh, *dibh;
+	struct gfs2_leaf *nleaf, *oleaf;
+	struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
+	u32 start, len, half_len, divider;
+	u64 bn, leaf_no;
+	__be64 *lp;
+	u32 index;
+	int x, moved = 0;
+	int error;
+
+	index = name->hash >> (32 - dip->i_depth);
+	error = get_leaf_nr(dip, index, &leaf_no);
+	if (error)
+		return error;
+
+	/*  Get the old leaf block  */
+	error = get_leaf(dip, leaf_no, &obh);
+	if (error)
+		return error;
+
+	oleaf = (struct gfs2_leaf *)obh->b_data;
+	if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) {
+		brelse(obh);
+		return 1; /* can't split */
+	}
+
+	gfs2_trans_add_meta(dip->i_gl, obh);
+
+	nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
+	if (!nleaf) {
+		brelse(obh);
+		return -ENOSPC;
+	}
+	bn = nbh->b_blocknr;
+
+	/*  Compute the start and len of leaf pointers in the hash table.  */
+	len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
+	half_len = len >> 1;
+	if (!half_len) {
+		pr_warn("i_depth %u lf_depth %u index %u\n",
+			dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
+		gfs2_consist_inode(dip);
+		error = -EIO;
+		goto fail_brelse;
+	}
+
+	start = (index & ~(len - 1));
+
+	/* Change the pointers.
+	   Don't bother distinguishing stuffed from non-stuffed.
+	   This code is complicated enough already. */
+	lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS);
+	if (!lp) {
+		error = -ENOMEM;
+		goto fail_brelse;
+	}
+
+	/*  Change the pointers  */
+	for (x = 0; x < half_len; x++)
+		lp[x] = cpu_to_be64(bn);
+
+	gfs2_dir_hash_inval(dip);
+
+	error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
+				    half_len * sizeof(u64));
+	if (error != half_len * sizeof(u64)) {
+		if (error >= 0)
+			error = -EIO;
+		goto fail_lpfree;
+	}
+
+	kfree(lp);
+
+	/*  Compute the divider  */
+	divider = (start + half_len) << (32 - dip->i_depth);
+
+	/*  Copy the entries  */
+	dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
+
+	do {
+		next = dent;
+		if (dirent_next(dip, obh, &next))
+			next = NULL;
+
+		if (!gfs2_dirent_sentinel(dent) &&
+		    be32_to_cpu(dent->de_hash) < divider) {
+			struct qstr str;
+			str.name = (char*)(dent+1);
+			str.len = be16_to_cpu(dent->de_name_len);
+			str.hash = be32_to_cpu(dent->de_hash);
+			new = gfs2_dirent_alloc(inode, nbh, &str);
+			if (IS_ERR(new)) {
+				error = PTR_ERR(new);
+				break;
+			}
+
+			new->de_inum = dent->de_inum; /* No endian worries */
+			new->de_type = dent->de_type; /* No endian worries */
+			be16_add_cpu(&nleaf->lf_entries, 1);
+
+			dirent_del(dip, obh, prev, dent);
+
+			if (!oleaf->lf_entries)
+				gfs2_consist_inode(dip);
+			be16_add_cpu(&oleaf->lf_entries, -1);
+
+			if (!prev)
+				prev = dent;
+
+			moved = 1;
+		} else {
+			prev = dent;
+		}
+		dent = next;
+	} while (dent);
+
+	oleaf->lf_depth = nleaf->lf_depth;
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
+		gfs2_trans_add_meta(dip->i_gl, dibh);
+		gfs2_add_inode_blocks(&dip->i_inode, 1);
+		gfs2_dinode_out(dip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	brelse(obh);
+	brelse(nbh);
+
+	return error;
+
+fail_lpfree:
+	kfree(lp);
+
+fail_brelse:
+	brelse(obh);
+	brelse(nbh);
+	return error;
+}
+
+/**
+ * dir_double_exhash - Double size of ExHash table
+ * @dip: The GFS2 dinode
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_double_exhash(struct gfs2_inode *dip)
+{
+	struct buffer_head *dibh;
+	u32 hsize;
+	u32 hsize_bytes;
+	__be64 *hc;
+	__be64 *hc2, *h;
+	int x;
+	int error = 0;
+
+	hsize = 1 << dip->i_depth;
+	hsize_bytes = hsize * sizeof(__be64);
+
+	hc = gfs2_dir_get_hash_table(dip);
+	if (IS_ERR(hc))
+		return PTR_ERR(hc);
+
+	hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
+	if (hc2 == NULL)
+		hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
+
+	if (!hc2)
+		return -ENOMEM;
+
+	h = hc2;
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		goto out_kfree;
+
+	for (x = 0; x < hsize; x++) {
+		*h++ = *hc;
+		*h++ = *hc;
+		hc++;
+	}
+
+	error = gfs2_dir_write_data(dip, (char *)hc2, 0, hsize_bytes * 2);
+	if (error != (hsize_bytes * 2))
+		goto fail;
+
+	gfs2_dir_hash_inval(dip);
+	dip->i_hash_cache = hc2;
+	dip->i_depth++;
+	gfs2_dinode_out(dip, dibh->b_data);
+	brelse(dibh);
+	return 0;
+
+fail:
+	/* Replace original hash table & size */
+	gfs2_dir_write_data(dip, (char *)hc, 0, hsize_bytes);
+	i_size_write(&dip->i_inode, hsize_bytes);
+	gfs2_dinode_out(dip, dibh->b_data);
+	brelse(dibh);
+out_kfree:
+	kvfree(hc2);
+	return error;
+}
+
+/**
+ * compare_dents - compare directory entries by hash value
+ * @a: first dent
+ * @b: second dent
+ *
+ * When comparing the hash entries of @a to @b:
+ *   gt: returns 1
+ *   lt: returns -1
+ *   eq: returns 0
+ */
+
+static int compare_dents(const void *a, const void *b)
+{
+	const struct gfs2_dirent *dent_a, *dent_b;
+	u32 hash_a, hash_b;
+	int ret = 0;
+
+	dent_a = *(const struct gfs2_dirent **)a;
+	hash_a = be32_to_cpu(dent_a->de_hash);
+
+	dent_b = *(const struct gfs2_dirent **)b;
+	hash_b = be32_to_cpu(dent_b->de_hash);
+
+	if (hash_a > hash_b)
+		ret = 1;
+	else if (hash_a < hash_b)
+		ret = -1;
+	else {
+		unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
+		unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
+
+		if (len_a > len_b)
+			ret = 1;
+		else if (len_a < len_b)
+			ret = -1;
+		else
+			ret = memcmp(dent_a + 1, dent_b + 1, len_a);
+	}
+
+	return ret;
+}
+
+/**
+ * do_filldir_main - read out directory entries
+ * @dip: The GFS2 inode
+ * @ctx: what to feed the entries to
+ * @darr: an array of struct gfs2_dirent pointers to read
+ * @entries: the number of entries in darr
+ * @copied: pointer to int that's non-zero if a entry has been copied out
+ *
+ * Jump through some hoops to make sure that if there are hash collsions,
+ * they are read out at the beginning of a buffer.  We want to minimize
+ * the possibility that they will fall into different readdir buffers or
+ * that someone will want to seek to that location.
+ *
+ * Returns: errno, >0 if the actor tells you to stop
+ */
+
+static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
+			   const struct gfs2_dirent **darr, u32 entries,
+			   int *copied)
+{
+	const struct gfs2_dirent *dent, *dent_next;
+	u64 off, off_next;
+	unsigned int x, y;
+	int run = 0;
+
+	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+
+	dent_next = darr[0];
+	off_next = be32_to_cpu(dent_next->de_hash);
+	off_next = gfs2_disk_hash2offset(off_next);
+
+	for (x = 0, y = 1; x < entries; x++, y++) {
+		dent = dent_next;
+		off = off_next;
+
+		if (y < entries) {
+			dent_next = darr[y];
+			off_next = be32_to_cpu(dent_next->de_hash);
+			off_next = gfs2_disk_hash2offset(off_next);
+
+			if (off < ctx->pos)
+				continue;
+			ctx->pos = off;
+
+			if (off_next == off) {
+				if (*copied && !run)
+					return 1;
+				run = 1;
+			} else
+				run = 0;
+		} else {
+			if (off < ctx->pos)
+				continue;
+			ctx->pos = off;
+		}
+
+		if (!dir_emit(ctx, (const char *)(dent + 1),
+				be16_to_cpu(dent->de_name_len),
+				be64_to_cpu(dent->de_inum.no_addr),
+				be16_to_cpu(dent->de_type)))
+			return 1;
+
+		*copied = 1;
+	}
+
+	/* Increment the ctx->pos by one, so the next time we come into the
+	   do_filldir fxn, we get the next entry instead of the last one in the
+	   current leaf */
+
+	ctx->pos++;
+
+	return 0;
+}
+
+static void *gfs2_alloc_sort_buffer(unsigned size)
+{
+	void *ptr = NULL;
+
+	if (size < KMALLOC_MAX_SIZE)
+		ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
+	if (!ptr)
+		ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
+	return ptr;
+}
+
+static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
+			      int *copied, unsigned *depth,
+			      u64 leaf_no)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *bh;
+	struct gfs2_leaf *lf;
+	unsigned entries = 0, entries2 = 0;
+	unsigned leaves = 0;
+	const struct gfs2_dirent **darr, *dent;
+	struct dirent_gather g;
+	struct buffer_head **larr;
+	int leaf = 0;
+	int error, i;
+	u64 lfn = leaf_no;
+
+	do {
+		error = get_leaf(ip, lfn, &bh);
+		if (error)
+			goto out;
+		lf = (struct gfs2_leaf *)bh->b_data;
+		if (leaves == 0)
+			*depth = be16_to_cpu(lf->lf_depth);
+		entries += be16_to_cpu(lf->lf_entries);
+		leaves++;
+		lfn = be64_to_cpu(lf->lf_next);
+		brelse(bh);
+	} while(lfn);
+
+	if (!entries)
+		return 0;
+
+	error = -ENOMEM;
+	/*
+	 * The extra 99 entries are not normally used, but are a buffer
+	 * zone in case the number of entries in the leaf is corrupt.
+	 * 99 is the maximum number of entries that can fit in a single
+	 * leaf block.
+	 */
+	larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
+	if (!larr)
+		goto out;
+	darr = (const struct gfs2_dirent **)(larr + leaves);
+	g.pdent = darr;
+	g.offset = 0;
+	lfn = leaf_no;
+
+	do {
+		error = get_leaf(ip, lfn, &bh);
+		if (error)
+			goto out_free;
+		lf = (struct gfs2_leaf *)bh->b_data;
+		lfn = be64_to_cpu(lf->lf_next);
+		if (lf->lf_entries) {
+			entries2 += be16_to_cpu(lf->lf_entries);
+			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+						gfs2_dirent_gather, NULL, &g);
+			error = PTR_ERR(dent);
+			if (IS_ERR(dent))
+				goto out_free;
+			if (entries2 != g.offset) {
+				fs_warn(sdp, "Number of entries corrupt in dir "
+						"leaf %llu, entries2 (%u) != "
+						"g.offset (%u)\n",
+					(unsigned long long)bh->b_blocknr,
+					entries2, g.offset);
+					
+				error = -EIO;
+				goto out_free;
+			}
+			error = 0;
+			larr[leaf++] = bh;
+		} else {
+			brelse(bh);
+		}
+	} while(lfn);
+
+	BUG_ON(entries2 != entries);
+	error = do_filldir_main(ip, ctx, darr, entries, copied);
+out_free:
+	for(i = 0; i < leaf; i++)
+		brelse(larr[i]);
+	kvfree(larr);
+out:
+	return error;
+}
+
+/**
+ * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information.
+ */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
+			       struct file_ra_state *f_ra)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	struct buffer_head *bh;
+	u64 blocknr = 0, last;
+	unsigned count;
+
+	/* First check if we've already read-ahead for the whole range. */
+	if (index + MAX_RA_BLOCKS < f_ra->start)
+		return;
+
+	f_ra->start = max((pgoff_t)index, f_ra->start);
+	for (count = 0; count < MAX_RA_BLOCKS; count++) {
+		if (f_ra->start >= hsize) /* if exceeded the hash table */
+			break;
+
+		last = blocknr;
+		blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
+		f_ra->start++;
+		if (blocknr == last)
+			continue;
+
+		bh = gfs2_getbuf(gl, blocknr, 1);
+		if (trylock_buffer(bh)) {
+			if (buffer_uptodate(bh)) {
+				unlock_buffer(bh);
+				brelse(bh);
+				continue;
+			}
+			bh->b_end_io = end_buffer_read_sync;
+			submit_bh(READA | REQ_META, bh);
+			continue;
+		}
+		brelse(bh);
+	}
+}
+
+/**
+ * dir_e_read - Reads the entries from a directory into a filldir buffer
+ * @dip: dinode pointer
+ * @ctx: actor to feed the entries to
+ *
+ * Returns: errno
+ */
+
+static int dir_e_read(struct inode *inode, struct dir_context *ctx,
+		      struct file_ra_state *f_ra)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	u32 hsize, len = 0;
+	u32 hash, index;
+	__be64 *lp;
+	int copied = 0;
+	int error = 0;
+	unsigned depth = 0;
+
+	hsize = 1 << dip->i_depth;
+	hash = gfs2_dir_offset2hash(ctx->pos);
+	index = hash >> (32 - dip->i_depth);
+
+	if (dip->i_hash_cache == NULL)
+		f_ra->start = 0;
+	lp = gfs2_dir_get_hash_table(dip);
+	if (IS_ERR(lp))
+		return PTR_ERR(lp);
+
+	gfs2_dir_readahead(inode, hsize, index, f_ra);
+
+	while (index < hsize) {
+		error = gfs2_dir_read_leaf(inode, ctx,
+					   &copied, &depth,
+					   be64_to_cpu(lp[index]));
+		if (error)
+			break;
+
+		len = 1 << (dip->i_depth - depth);
+		index = (index & ~(len - 1)) + len;
+	}
+
+	if (error > 0)
+		error = 0;
+	return error;
+}
+
+int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+		  struct file_ra_state *f_ra)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct dirent_gather g;
+	const struct gfs2_dirent **darr, *dent;
+	struct buffer_head *dibh;
+	int copied = 0;
+	int error;
+
+	if (!dip->i_entries)
+		return 0;
+
+	if (dip->i_diskflags & GFS2_DIF_EXHASH)
+		return dir_e_read(inode, ctx, f_ra);
+
+	if (!gfs2_is_stuffed(dip)) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		return error;
+
+	error = -ENOMEM;
+	/* 96 is max number of dirents which can be stuffed into an inode */
+	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
+	if (darr) {
+		g.pdent = darr;
+		g.offset = 0;
+		dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
+					gfs2_dirent_gather, NULL, &g);
+		if (IS_ERR(dent)) {
+			error = PTR_ERR(dent);
+			goto out;
+		}
+		if (dip->i_entries != g.offset) {
+			fs_warn(sdp, "Number of entries corrupt in dir %llu, "
+				"ip->i_entries (%u) != g.offset (%u)\n",
+				(unsigned long long)dip->i_no_addr,
+				dip->i_entries,
+				g.offset);
+			error = -EIO;
+			goto out;
+		}
+		error = do_filldir_main(dip, ctx, darr,
+					dip->i_entries, &copied);
+out:
+		kfree(darr);
+	}
+
+	if (error > 0)
+		error = 0;
+
+	brelse(dibh);
+
+	return error;
+}
+
+/**
+ * gfs2_dir_search - Search a directory
+ * @dip: The GFS2 dir inode
+ * @name: The name we are looking up
+ * @fail_on_exist: Fail if the name exists rather than looking it up
+ *
+ * This routine searches a directory for a file or another directory.
+ * Assumes a glock is held on dip.
+ *
+ * Returns: errno
+ */
+
+struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
+			      bool fail_on_exist)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+	u64 addr, formal_ino;
+	u16 dtype;
+
+	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
+	if (dent) {
+		if (IS_ERR(dent))
+			return ERR_CAST(dent);
+		dtype = be16_to_cpu(dent->de_type);
+		addr = be64_to_cpu(dent->de_inum.no_addr);
+		formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
+		brelse(bh);
+		if (fail_on_exist)
+			return ERR_PTR(-EEXIST);
+		return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+	}
+	return ERR_PTR(-ENOENT);
+}
+
+int gfs2_dir_check(struct inode *dir, const struct qstr *name,
+		   const struct gfs2_inode *ip)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+	int ret = -ENOENT;
+
+	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
+	if (dent) {
+		if (IS_ERR(dent))
+			return PTR_ERR(dent);
+		if (ip) {
+			if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr)
+				goto out;
+			if (be64_to_cpu(dent->de_inum.no_formal_ino) !=
+			    ip->i_no_formal_ino)
+				goto out;
+			if (unlikely(IF2DT(ip->i_inode.i_mode) !=
+			    be16_to_cpu(dent->de_type))) {
+				gfs2_consist_inode(GFS2_I(dir));
+				ret = -EIO;
+				goto out;
+			}
+		}
+		ret = 0;
+out:
+		brelse(bh);
+	}
+	return ret;
+}
+
+/**
+ * dir_new_leaf - Add a new leaf onto hash chain
+ * @inode: The directory
+ * @name: The name we are adding
+ *
+ * This adds a new dir leaf onto an existing leaf when there is not
+ * enough space to add a new dir entry. This is a last resort after
+ * we've expanded the hash table to max size and also split existing
+ * leaf blocks, so it will only occur for very large directories.
+ *
+ * The dist parameter is set to 1 for leaf blocks directly attached
+ * to the hash table, 2 for one layer of indirection, 3 for two layers
+ * etc. We are thus able to tell the difference between an old leaf
+ * with dist set to zero (i.e. "don't know") and a new one where we
+ * set this information for debug/fsck purposes.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
+static int dir_new_leaf(struct inode *inode, const struct qstr *name)
+{
+	struct buffer_head *bh, *obh;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_leaf *leaf, *oleaf;
+	u32 dist = 1;
+	int error;
+	u32 index;
+	u64 bn;
+
+	index = name->hash >> (32 - ip->i_depth);
+	error = get_first_leaf(ip, index, &obh);
+	if (error)
+		return error;
+	do {
+		dist++;
+		oleaf = (struct gfs2_leaf *)obh->b_data;
+		bn = be64_to_cpu(oleaf->lf_next);
+		if (!bn)
+			break;
+		brelse(obh);
+		error = get_leaf(ip, bn, &obh);
+		if (error)
+			return error;
+	} while(1);
+
+	gfs2_trans_add_meta(ip->i_gl, obh);
+
+	leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
+	if (!leaf) {
+		brelse(obh);
+		return -ENOSPC;
+	}
+	leaf->lf_dist = cpu_to_be32(dist);
+	oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
+	brelse(bh);
+	brelse(obh);
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		return error;
+	gfs2_trans_add_meta(ip->i_gl, bh);
+	gfs2_add_inode_blocks(&ip->i_inode, 1);
+	gfs2_dinode_out(ip, bh->b_data);
+	brelse(bh);
+	return 0;
+}
+
+static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip)
+{
+	u64 where = ip->i_no_addr + 1;
+	if (ip->i_eattr == where)
+		return 1;
+	return 0;
+}
+
+/**
+ * gfs2_dir_add - Add new filename into directory
+ * @inode: The directory inode
+ * @name: The new name
+ * @nip: The GFS2 inode to be linked in to the directory
+ * @da: The directory addition info
+ *
+ * If the call to gfs2_diradd_alloc_required resulted in there being
+ * no need to allocate any new directory blocks, then it will contain
+ * a pointer to the directory entry and the bh in which it resides. We
+ * can use that without having to repeat the search. If there was no
+ * free space, then we must now create more space.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_add(struct inode *inode, const struct qstr *name,
+		 const struct gfs2_inode *nip, struct gfs2_diradd *da)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *bh = da->bh;
+	struct gfs2_dirent *dent = da->dent;
+	struct timespec tv;
+	struct gfs2_leaf *leaf;
+	int error;
+
+	while(1) {
+		if (da->bh == NULL) {
+			dent = gfs2_dirent_search(inode, name,
+						  gfs2_dirent_find_space, &bh);
+		}
+		if (dent) {
+			if (IS_ERR(dent))
+				return PTR_ERR(dent);
+			dent = gfs2_init_dirent(inode, dent, name, bh);
+			gfs2_inum_out(nip, dent);
+			dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
+			dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
+			tv = CURRENT_TIME;
+			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
+				leaf = (struct gfs2_leaf *)bh->b_data;
+				be16_add_cpu(&leaf->lf_entries, 1);
+				leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+				leaf->lf_sec = cpu_to_be64(tv.tv_sec);
+			}
+			da->dent = NULL;
+			da->bh = NULL;
+			brelse(bh);
+			ip->i_entries++;
+			ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
+			if (S_ISDIR(nip->i_inode.i_mode))
+				inc_nlink(&ip->i_inode);
+			mark_inode_dirty(inode);
+			error = 0;
+			break;
+		}
+		if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
+			error = dir_make_exhash(inode);
+			if (error)
+				break;
+			continue;
+		}
+		error = dir_split_leaf(inode, name);
+		if (error == 0)
+			continue;
+		if (error < 0)
+			break;
+		if (ip->i_depth < GFS2_DIR_MAX_DEPTH) {
+			error = dir_double_exhash(ip);
+			if (error)
+				break;
+			error = dir_split_leaf(inode, name);
+			if (error < 0)
+				break;
+			if (error == 0)
+				continue;
+		}
+		error = dir_new_leaf(inode, name);
+		if (!error)
+			continue;
+		error = -ENOSPC;
+		break;
+	}
+	return error;
+}
+
+
+/**
+ * gfs2_dir_del - Delete a directory entry
+ * @dip: The GFS2 inode
+ * @filename: The filename
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
+{
+	const struct qstr *name = &dentry->d_name;
+	struct gfs2_dirent *dent, *prev = NULL;
+	struct buffer_head *bh;
+	struct timespec tv = CURRENT_TIME;
+
+	/* Returns _either_ the entry (if its first in block) or the
+	   previous entry otherwise */
+	dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
+	if (!dent) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+	if (IS_ERR(dent)) {
+		gfs2_consist_inode(dip);
+		return PTR_ERR(dent);
+	}
+	/* If not first in block, adjust pointers accordingly */
+	if (gfs2_dirent_find(dent, name, NULL) == 0) {
+		prev = dent;
+		dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
+	}
+
+	dirent_del(dip, bh, prev, dent);
+	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
+		struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
+		u16 entries = be16_to_cpu(leaf->lf_entries);
+		if (!entries)
+			gfs2_consist_inode(dip);
+		leaf->lf_entries = cpu_to_be16(--entries);
+		leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+		leaf->lf_sec = cpu_to_be64(tv.tv_sec);
+	}
+	brelse(bh);
+
+	if (!dip->i_entries)
+		gfs2_consist_inode(dip);
+	dip->i_entries--;
+	dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
+	if (d_is_dir(dentry))
+		drop_nlink(&dip->i_inode);
+	mark_inode_dirty(&dip->i_inode);
+
+	return 0;
+}
+
+/**
+ * gfs2_dir_mvino - Change inode number of directory entry
+ * @dip: The GFS2 inode
+ * @filename:
+ * @new_inode:
+ *
+ * This routine changes the inode number of a directory entry.  It's used
+ * by rename to change ".." when a directory is moved.
+ * Assumes a glock is held on dvp.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+		   const struct gfs2_inode *nip, unsigned int new_type)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+	int error;
+
+	dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
+	if (!dent) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+	if (IS_ERR(dent))
+		return PTR_ERR(dent);
+
+	gfs2_trans_add_meta(dip->i_gl, bh);
+	gfs2_inum_out(nip, dent);
+	dent->de_type = cpu_to_be16(new_type);
+
+	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
+		brelse(bh);
+		error = gfs2_meta_inode_buffer(dip, &bh);
+		if (error)
+			return error;
+		gfs2_trans_add_meta(dip->i_gl, bh);
+	}
+
+	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_dinode_out(dip, bh->b_data);
+	brelse(bh);
+	return 0;
+}
+
+/**
+ * leaf_dealloc - Deallocate a directory leaf
+ * @dip: the directory
+ * @index: the hash table offset in the directory
+ * @len: the number of pointers to this leaf
+ * @leaf_no: the leaf number
+ * @leaf_bh: buffer_head for the starting leaf
+ * last_dealloc: 1 if this is the final dealloc for the leaf, else 0
+ *
+ * Returns: errno
+ */
+
+static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
+			u64 leaf_no, struct buffer_head *leaf_bh,
+			int last_dealloc)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_leaf *tmp_leaf;
+	struct gfs2_rgrp_list rlist;
+	struct buffer_head *bh, *dibh;
+	u64 blk, nblk;
+	unsigned int rg_blocks = 0, l_blocks = 0;
+	char *ht;
+	unsigned int x, size = len * sizeof(u64);
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+	ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
+	if (ht == NULL)
+		ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
+			       PAGE_KERNEL);
+	if (!ht)
+		return -ENOMEM;
+
+	error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (error)
+		goto out;
+
+	/*  Count the number of leaves  */
+	bh = leaf_bh;
+
+	for (blk = leaf_no; blk; blk = nblk) {
+		if (blk != leaf_no) {
+			error = get_leaf(dip, blk, &bh);
+			if (error)
+				goto out_rlist;
+		}
+		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+		nblk = be64_to_cpu(tmp_leaf->lf_next);
+		if (blk != leaf_no)
+			brelse(bh);
+
+		gfs2_rlist_add(dip, &rlist, blk);
+		l_blocks++;
+	}
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		struct gfs2_rgrpd *rgd;
+		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+		rg_blocks += rgd->rd_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist;
+
+	error = gfs2_trans_begin(sdp,
+			rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
+			RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
+	if (error)
+		goto out_rg_gunlock;
+
+	bh = leaf_bh;
+
+	for (blk = leaf_no; blk; blk = nblk) {
+		if (blk != leaf_no) {
+			error = get_leaf(dip, blk, &bh);
+			if (error)
+				goto out_end_trans;
+		}
+		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+		nblk = be64_to_cpu(tmp_leaf->lf_next);
+		if (blk != leaf_no)
+			brelse(bh);
+
+		gfs2_free_meta(dip, blk, 1);
+		gfs2_add_inode_blocks(&dip->i_inode, -1);
+	}
+
+	error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
+	if (error != size) {
+		if (error >= 0)
+			error = -EIO;
+		goto out_end_trans;
+	}
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		goto out_end_trans;
+
+	gfs2_trans_add_meta(dip->i_gl, dibh);
+	/* On the last dealloc, make this a regular file in case we crash.
+	   (We don't want to free these blocks a second time.)  */
+	if (last_dealloc)
+		dip->i_inode.i_mode = S_IFREG;
+	gfs2_dinode_out(dip, dibh->b_data);
+	brelse(dibh);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_rg_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+	gfs2_rlist_free(&rlist);
+	gfs2_quota_unhold(dip);
+out:
+	kvfree(ht);
+	return error;
+}
+
+/**
+ * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
+ * @dip: the directory
+ *
+ * Dealloc all on-disk directory leaves to FREEMETA state
+ * Change on-disk inode type to "regular file"
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
+{
+	struct buffer_head *bh;
+	struct gfs2_leaf *leaf;
+	u32 hsize, len;
+	u32 index = 0, next_index;
+	__be64 *lp;
+	u64 leaf_no;
+	int error = 0, last;
+
+	hsize = 1 << dip->i_depth;
+
+	lp = gfs2_dir_get_hash_table(dip);
+	if (IS_ERR(lp))
+		return PTR_ERR(lp);
+
+	while (index < hsize) {
+		leaf_no = be64_to_cpu(lp[index]);
+		if (leaf_no) {
+			error = get_leaf(dip, leaf_no, &bh);
+			if (error)
+				goto out;
+			leaf = (struct gfs2_leaf *)bh->b_data;
+			len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
+
+			next_index = (index & ~(len - 1)) + len;
+			last = ((next_index >= hsize) ? 1 : 0);
+			error = leaf_dealloc(dip, index, len, leaf_no, bh,
+					     last);
+			brelse(bh);
+			if (error)
+				goto out;
+			index = next_index;
+		} else
+			index++;
+	}
+
+	if (index != hsize) {
+		gfs2_consist_inode(dip);
+		error = -EIO;
+	}
+
+out:
+
+	return error;
+}
+
+/**
+ * gfs2_diradd_alloc_required - find if adding entry will require an allocation
+ * @ip: the file being written to
+ * @filname: the filename that's going to be added
+ * @da: The structure to return dir alloc info
+ *
+ * Returns: 0 if ok, -ve on error
+ */
+
+int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
+			       struct gfs2_diradd *da)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf);
+	struct gfs2_dirent *dent;
+	struct buffer_head *bh;
+
+	da->nr_blocks = 0;
+	da->bh = NULL;
+	da->dent = NULL;
+
+	dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
+	if (!dent) {
+		da->nr_blocks = sdp->sd_max_dirres;
+		if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&
+		    (GFS2_DIRENT_SIZE(name->len) < extra))
+			da->nr_blocks = 1;
+		return 0;
+	}
+	if (IS_ERR(dent))
+		return PTR_ERR(dent);
+
+	if (da->save_loc) {
+		da->bh = bh;
+		da->dent = dent;
+	} else {
+		brelse(bh);
+	}
+	return 0;
+}
+
diff --git a/kernel/fs/gfs2/dir.h b/kernel/fs/gfs2/dir.h
new file mode 100644
index 000000000..e1b309c24
--- /dev/null
+++ b/kernel/fs/gfs2/dir.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+#include <linux/dcache.h>
+#include <linux/crc32.h>
+
+struct inode;
+struct gfs2_inode;
+struct gfs2_inum;
+struct buffer_head;
+struct gfs2_dirent;
+
+struct gfs2_diradd {
+	unsigned nr_blocks;
+	struct gfs2_dirent *dent;
+	struct buffer_head *bh;
+	int save_loc;
+};
+
+extern struct inode *gfs2_dir_search(struct inode *dir,
+				     const struct qstr *filename,
+				     bool fail_on_exist);
+extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+			  const struct gfs2_inode *ip);
+extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+			const struct gfs2_inode *ip, struct gfs2_diradd *da);
+static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
+{
+	if (da->bh)
+		brelse(da->bh);
+	da->bh = NULL;
+}
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
+extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+			 struct file_ra_state *f_ra);
+extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+			  const struct gfs2_inode *nip, unsigned int new_type);
+
+extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+
+extern int gfs2_diradd_alloc_required(struct inode *dir,
+				      const struct qstr *filename,
+				      struct gfs2_diradd *da);
+extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+				   struct buffer_head **bhp);
+extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
+
+static inline u32 gfs2_disk_hash(const char *data, int len)
+{
+        return crc32_le((u32)~0, data, len) ^ (u32)~0;
+}
+
+
+static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
+{
+	name->name = fname;
+	name->len = strlen(fname);
+	name->hash = gfs2_disk_hash(name->name, name->len);
+}
+
+/* N.B. This probably ought to take inum & type as args as well */
+static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
+{
+	dent->de_inum.no_addr = cpu_to_be64(0);
+	dent->de_inum.no_formal_ino = cpu_to_be64(0);
+	dent->de_hash = cpu_to_be32(name->hash);
+	dent->de_rec_len = cpu_to_be16(reclen);
+	dent->de_name_len = cpu_to_be16(name->len);
+	dent->de_type = cpu_to_be16(0);
+	memset(dent->__pad, 0, sizeof(dent->__pad));
+	memcpy(dent + 1, name->name, name->len);
+}
+
+extern struct qstr gfs2_qdot;
+extern struct qstr gfs2_qdotdot;
+
+#endif /* __DIR_DOT_H__ */
diff --git a/kernel/fs/gfs2/export.c b/kernel/fs/gfs2/export.c
new file mode 100644
index 000000000..5d15e9498
--- /dev/null
+++ b/kernel/fs/gfs2/export.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "super.h"
+#include "rgrp.h"
+#include "util.h"
+
+#define GFS2_SMALL_FH_SIZE 4
+#define GFS2_LARGE_FH_SIZE 8
+#define GFS2_OLD_FH_SIZE 10
+
+static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
+			  struct inode *parent)
+{
+	__be32 *fh = (__force __be32 *)p;
+	struct super_block *sb = inode->i_sb;
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (parent && (*len < GFS2_LARGE_FH_SIZE)) {
+		*len = GFS2_LARGE_FH_SIZE;
+		return FILEID_INVALID;
+	} else if (*len < GFS2_SMALL_FH_SIZE) {
+		*len = GFS2_SMALL_FH_SIZE;
+		return FILEID_INVALID;
+	}
+
+	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
+	fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
+	fh[2] = cpu_to_be32(ip->i_no_addr >> 32);
+	fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
+	*len = GFS2_SMALL_FH_SIZE;
+
+	if (!parent || inode == d_inode(sb->s_root))
+		return *len;
+
+	ip = GFS2_I(parent);
+
+	fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32);
+	fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
+	fh[6] = cpu_to_be32(ip->i_no_addr >> 32);
+	fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
+	*len = GFS2_LARGE_FH_SIZE;
+
+	return *len;
+}
+
+struct get_name_filldir {
+	struct dir_context ctx;
+	struct gfs2_inum_host inum;
+	char *name;
+};
+
+static int get_name_filldir(struct dir_context *ctx, const char *name,
+			    int length, loff_t offset, u64 inum,
+			    unsigned int type)
+{
+	struct get_name_filldir *gnfd =
+		container_of(ctx, struct get_name_filldir, ctx);
+
+	if (inum != gnfd->inum.no_addr)
+		return 0;
+
+	memcpy(gnfd->name, name, length);
+	gnfd->name[length] = 0;
+
+	return 1;
+}
+
+static int gfs2_get_name(struct dentry *parent, char *name,
+			 struct dentry *child)
+{
+	struct inode *dir = d_inode(parent);
+	struct inode *inode = d_inode(child);
+	struct gfs2_inode *dip, *ip;
+	struct get_name_filldir gnfd = {
+		.ctx.actor = get_name_filldir,
+		.name = name
+	};
+	struct gfs2_holder gh;
+	int error;
+	struct file_ra_state f_ra = { .start = 0 };
+
+	if (!dir)
+		return -EINVAL;
+
+	if (!S_ISDIR(dir->i_mode) || !inode)
+		return -EINVAL;
+
+	dip = GFS2_I(dir);
+	ip = GFS2_I(inode);
+
+	*name = 0;
+	gnfd.inum.no_addr = ip->i_no_addr;
+	gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
+
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
+	if (error)
+		return error;
+
+	error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra);
+
+	gfs2_glock_dq_uninit(&gh);
+
+	if (!error && !*name)
+		error = -ENOENT;
+
+	return error;
+}
+
+static struct dentry *gfs2_get_parent(struct dentry *child)
+{
+	return d_obtain_alias(gfs2_lookupi(d_inode(child), &gfs2_qdotdot, 1));
+}
+
+static struct dentry *gfs2_get_dentry(struct super_block *sb,
+				      struct gfs2_inum_host *inum)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct inode *inode;
+
+	inode = gfs2_ilookup(sb, inum->no_addr, 0);
+	if (inode) {
+		if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
+			iput(inode);
+			return ERR_PTR(-ESTALE);
+		}
+		goto out_inode;
+	}
+
+	inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
+				    GFS2_BLKST_DINODE);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+out_inode:
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	struct gfs2_inum_host this;
+	__be32 *fh = (__force __be32 *)fid->raw;
+
+	switch (fh_type) {
+	case GFS2_SMALL_FH_SIZE:
+	case GFS2_LARGE_FH_SIZE:
+	case GFS2_OLD_FH_SIZE:
+		if (fh_len < GFS2_SMALL_FH_SIZE)
+			return NULL;
+		this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
+		this.no_formal_ino |= be32_to_cpu(fh[1]);
+		this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
+		this.no_addr |= be32_to_cpu(fh[3]);
+		return gfs2_get_dentry(sb, &this);
+	default:
+		return NULL;
+	}
+}
+
+static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	struct gfs2_inum_host parent;
+	__be32 *fh = (__force __be32 *)fid->raw;
+
+	switch (fh_type) {
+	case GFS2_LARGE_FH_SIZE:
+	case GFS2_OLD_FH_SIZE:
+		if (fh_len < GFS2_LARGE_FH_SIZE)
+			return NULL;
+		parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
+		parent.no_formal_ino |= be32_to_cpu(fh[5]);
+		parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
+		parent.no_addr |= be32_to_cpu(fh[7]);
+		return gfs2_get_dentry(sb, &parent);
+	default:
+		return NULL;
+	}
+}
+
+const struct export_operations gfs2_export_ops = {
+	.encode_fh = gfs2_encode_fh,
+	.fh_to_dentry = gfs2_fh_to_dentry,
+	.fh_to_parent = gfs2_fh_to_parent,
+	.get_name = gfs2_get_name,
+	.get_parent = gfs2_get_parent,
+};
+
diff --git a/kernel/fs/gfs2/file.c b/kernel/fs/gfs2/file.c
new file mode 100644
index 000000000..31892871e
--- /dev/null
+++ b/kernel/fs/gfs2/file.c
@@ -0,0 +1,1159 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/falloc.h>
+#include <linux/swap.h>
+#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <asm/uaccess.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
+#include <linux/delay.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * gfs2_llseek - seek to a location in a file
+ * @file: the file
+ * @offset: the offset
+ * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
+ *
+ * SEEK_END requires the glock for the file because it references the
+ * file's size.
+ *
+ * Returns: The new offset, or errno
+ */
+
+static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_holder i_gh;
+	loff_t error;
+
+	switch (whence) {
+	case SEEK_END: /* These reference inode->i_size */
+	case SEEK_DATA:
+	case SEEK_HOLE:
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
+		if (!error) {
+			error = generic_file_llseek(file, offset, whence);
+			gfs2_glock_dq_uninit(&i_gh);
+		}
+		break;
+	case SEEK_CUR:
+	case SEEK_SET:
+		error = generic_file_llseek(file, offset, whence);
+		break;
+	default:
+		error = -EINVAL;
+	}
+
+	return error;
+}
+
+/**
+ * gfs2_readdir - Iterator for a directory
+ * @file: The directory to read from
+ * @ctx: What to feed directory entries to
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *dir = file->f_mapping->host;
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_holder d_gh;
+	int error;
+
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+	if (error)
+		return error;
+
+	error = gfs2_dir_read(dir, ctx, &file->f_ra);
+
+	gfs2_glock_dq_uninit(&d_gh);
+
+	return error;
+}
+
+/**
+ * fsflags_cvt
+ * @table: A table of 32 u32 flags
+ * @val: a 32 bit value to convert
+ *
+ * This function can be used to convert between fsflags values and
+ * GFS2's own flags values.
+ *
+ * Returns: the converted flags
+ */
+static u32 fsflags_cvt(const u32 *table, u32 val)
+{
+	u32 res = 0;
+	while(val) {
+		if (val & 1)
+			res |= *table;
+		table++;
+		val >>= 1;
+	}
+	return res;
+}
+
+static const u32 fsflags_to_gfs2[32] = {
+	[3] = GFS2_DIF_SYNC,
+	[4] = GFS2_DIF_IMMUTABLE,
+	[5] = GFS2_DIF_APPENDONLY,
+	[7] = GFS2_DIF_NOATIME,
+	[12] = GFS2_DIF_EXHASH,
+	[14] = GFS2_DIF_INHERIT_JDATA,
+	[17] = GFS2_DIF_TOPDIR,
+};
+
+static const u32 gfs2_to_fsflags[32] = {
+	[gfs2fl_Sync] = FS_SYNC_FL,
+	[gfs2fl_Immutable] = FS_IMMUTABLE_FL,
+	[gfs2fl_AppendOnly] = FS_APPEND_FL,
+	[gfs2fl_NoAtime] = FS_NOATIME_FL,
+	[gfs2fl_ExHash] = FS_INDEX_FL,
+	[gfs2fl_TopLevel] = FS_TOPDIR_FL,
+	[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
+};
+
+static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
+{
+	struct inode *inode = file_inode(filp);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+	u32 fsflags;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	error = gfs2_glock_nq(&gh);
+	if (error)
+		return error;
+
+	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
+	if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
+		fsflags |= FS_JOURNAL_DATA_FL;
+	if (put_user(fsflags, ptr))
+		error = -EFAULT;
+
+	gfs2_glock_dq(&gh);
+	gfs2_holder_uninit(&gh);
+	return error;
+}
+
+void gfs2_set_inode_flags(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int flags = inode->i_flags;
+
+	flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC);
+	if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode))
+		inode->i_flags |= S_NOSEC;
+	if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
+		flags |= S_IMMUTABLE;
+	if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
+		flags |= S_APPEND;
+	if (ip->i_diskflags & GFS2_DIF_NOATIME)
+		flags |= S_NOATIME;
+	if (ip->i_diskflags & GFS2_DIF_SYNC)
+		flags |= S_SYNC;
+	inode->i_flags = flags;
+}
+
+/* Flags that can be set by user space */
+#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA|			\
+			     GFS2_DIF_IMMUTABLE|		\
+			     GFS2_DIF_APPENDONLY|		\
+			     GFS2_DIF_NOATIME|			\
+			     GFS2_DIF_SYNC|			\
+			     GFS2_DIF_SYSTEM|			\
+			     GFS2_DIF_TOPDIR|			\
+			     GFS2_DIF_INHERIT_JDATA)
+
+/**
+ * do_gfs2_set_flags - set flags on an inode
+ * @filp: file pointer
+ * @reqflags: The flags to set
+ * @mask: Indicates which flags are valid
+ *
+ */
+static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
+{
+	struct inode *inode = file_inode(filp);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *bh;
+	struct gfs2_holder gh;
+	int error;
+	u32 new_flags, flags;
+
+	error = mnt_want_write_file(filp);
+	if (error)
+		return error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (error)
+		goto out_drop_write;
+
+	error = -EACCES;
+	if (!inode_owner_or_capable(inode))
+		goto out;
+
+	error = 0;
+	flags = ip->i_diskflags;
+	new_flags = (flags & ~mask) | (reqflags & mask);
+	if ((new_flags ^ flags) == 0)
+		goto out;
+
+	error = -EINVAL;
+	if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
+		goto out;
+
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
+		goto out;
+	if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
+		goto out;
+	if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		goto out;
+	if (!IS_IMMUTABLE(inode)) {
+		error = gfs2_permission(inode, MAY_WRITE);
+		if (error)
+			goto out;
+	}
+	if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
+		if (flags & GFS2_DIF_JDATA)
+			gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+		error = filemap_fdatawrite(inode->i_mapping);
+		if (error)
+			goto out;
+		error = filemap_fdatawait(inode->i_mapping);
+		if (error)
+			goto out;
+	}
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		goto out;
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		goto out_trans_end;
+	gfs2_trans_add_meta(ip->i_gl, bh);
+	ip->i_diskflags = new_flags;
+	gfs2_dinode_out(ip, bh->b_data);
+	brelse(bh);
+	gfs2_set_inode_flags(inode);
+	gfs2_set_aops(inode);
+out_trans_end:
+	gfs2_trans_end(sdp);
+out:
+	gfs2_glock_dq_uninit(&gh);
+out_drop_write:
+	mnt_drop_write_file(filp);
+	return error;
+}
+
+static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
+{
+	struct inode *inode = file_inode(filp);
+	u32 fsflags, gfsflags;
+
+	if (get_user(fsflags, ptr))
+		return -EFAULT;
+
+	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
+	if (!S_ISDIR(inode->i_mode)) {
+		gfsflags &= ~GFS2_DIF_TOPDIR;
+		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
+			gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
+		return do_gfs2_set_flags(filp, gfsflags, ~0);
+	}
+	return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
+}
+
+static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch(cmd) {
+	case FS_IOC_GETFLAGS:
+		return gfs2_get_flags(filp, (u32 __user *)arg);
+	case FS_IOC_SETFLAGS:
+		return gfs2_set_flags(filp, (u32 __user *)arg);
+	case FITRIM:
+		return gfs2_fitrim(filp, (void __user *)arg);
+	}
+	return -ENOTTY;
+}
+
+/**
+ * gfs2_size_hint - Give a hint to the size of a write request
+ * @filep: The struct file
+ * @offset: The file offset of the write
+ * @size: The length of the write
+ *
+ * When we are about to do a write, this function records the total
+ * write size in order to provide a suitable hint to the lower layers
+ * about how many blocks will be required.
+ *
+ */
+
+static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
+{
+	struct inode *inode = file_inode(filep);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
+	int hint = min_t(size_t, INT_MAX, blks);
+
+	if (hint > atomic_read(&ip->i_res->rs_sizehint))
+		atomic_set(&ip->i_res->rs_sizehint, hint);
+}
+
+/**
+ * gfs2_allocate_page_backing - Use bmap to allocate blocks
+ * @page: The (locked) page to allocate backing for
+ *
+ * We try to allocate all the blocks required for the page in
+ * one go. This might fail for various reasons, so we keep
+ * trying until all the blocks to back this page are allocated.
+ * If some of the blocks are already allocated, thats ok too.
+ */
+
+static int gfs2_allocate_page_backing(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head bh;
+	unsigned long size = PAGE_CACHE_SIZE;
+	u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	do {
+		bh.b_state = 0;
+		bh.b_size = size;
+		gfs2_block_map(inode, lblock, &bh, 1);
+		if (!buffer_mapped(&bh))
+			return -EIO;
+		size -= bh.b_size;
+		lblock += (bh.b_size >> inode->i_blkbits);
+	} while(size > 0);
+	return 0;
+}
+
+/**
+ * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
+ * @vma: The virtual memory area
+ * @vmf: The virtual memory fault containing the page to become writable
+ *
+ * When the page becomes writable, we need to ensure that we have
+ * blocks allocated on disk to back that page.
+ */
+
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
+	unsigned long last_index;
+	u64 pos = page->index << PAGE_CACHE_SHIFT;
+	unsigned int data_blocks, ind_blocks, rblocks;
+	struct gfs2_holder gh;
+	loff_t size;
+	int ret;
+
+	sb_start_pagefault(inode->i_sb);
+
+	/* Update file times before taking page lock */
+	file_update_time(vma->vm_file);
+
+	ret = get_write_access(inode);
+	if (ret)
+		goto out;
+
+	ret = gfs2_rs_alloc(ip);
+	if (ret)
+		goto out_write_access;
+
+	gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+	set_bit(GIF_SW_PAGED, &ip->i_flags);
+
+	if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
+		lock_page(page);
+		if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+			ret = -EAGAIN;
+			unlock_page(page);
+		}
+		goto out_unlock;
+	}
+
+	ret = gfs2_rindex_update(sdp);
+	if (ret)
+		goto out_unlock;
+
+	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+	ap.target = data_blocks + ind_blocks;
+	ret = gfs2_quota_lock_check(ip, &ap);
+	if (ret)
+		goto out_unlock;
+	ret = gfs2_inplace_reserve(ip, &ap);
+	if (ret)
+		goto out_quota_unlock;
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks ? data_blocks : 1;
+	if (ind_blocks || data_blocks) {
+		rblocks += RES_STATFS + RES_QUOTA;
+		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
+	}
+	ret = gfs2_trans_begin(sdp, rblocks, 0);
+	if (ret)
+		goto out_trans_fail;
+
+	lock_page(page);
+	ret = -EINVAL;
+	size = i_size_read(inode);
+	last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+	/* Check page index against inode size */
+	if (size == 0 || (page->index > last_index))
+		goto out_trans_end;
+
+	ret = -EAGAIN;
+	/* If truncated, we must retry the operation, we may have raced
+	 * with the glock demotion code.
+	 */
+	if (!PageUptodate(page) || page->mapping != inode->i_mapping)
+		goto out_trans_end;
+
+	/* Unstuff, if required, and allocate backing blocks for page */
+	ret = 0;
+	if (gfs2_is_stuffed(ip))
+		ret = gfs2_unstuff_dinode(ip, page);
+	if (ret == 0)
+		ret = gfs2_allocate_page_backing(page);
+
+out_trans_end:
+	if (ret)
+		unlock_page(page);
+	gfs2_trans_end(sdp);
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_quota_unlock:
+	gfs2_quota_unlock(ip);
+out_unlock:
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	if (ret == 0) {
+		set_page_dirty(page);
+		wait_for_stable_page(page);
+	}
+out_write_access:
+	put_write_access(inode);
+out:
+	sb_end_pagefault(inode->i_sb);
+	return block_page_mkwrite_return(ret);
+}
+
+static const struct vm_operations_struct gfs2_vm_ops = {
+	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
+	.page_mkwrite = gfs2_page_mkwrite,
+};
+
+/**
+ * gfs2_mmap -
+ * @file: The file to map
+ * @vma: The VMA which described the mapping
+ *
+ * There is no need to get a lock here unless we should be updating
+ * atime. We ignore any locking errors since the only consequence is
+ * a missed atime update (which will just be deferred until later).
+ *
+ * Returns: 0
+ */
+
+static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+
+	if (!(file->f_flags & O_NOATIME) &&
+	    !IS_NOATIME(&ip->i_inode)) {
+		struct gfs2_holder i_gh;
+		int error;
+
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
+		if (error)
+			return error;
+		/* grab lock to update inode */
+		gfs2_glock_dq_uninit(&i_gh);
+		file_accessed(file);
+	}
+	vma->vm_ops = &gfs2_vm_ops;
+
+	return 0;
+}
+
+/**
+ * gfs2_open_common - This is common to open and atomic_open
+ * @inode: The inode being opened
+ * @file: The file being opened
+ *
+ * This maybe called under a glock or not depending upon how it has
+ * been called. We must always be called under a glock for regular
+ * files, however. For other file types, it does not matter whether
+ * we hold the glock or not.
+ *
+ * Returns: Error code or 0 for success
+ */
+
+int gfs2_open_common(struct inode *inode, struct file *file)
+{
+	struct gfs2_file *fp;
+	int ret;
+
+	if (S_ISREG(inode->i_mode)) {
+		ret = generic_file_open(inode, file);
+		if (ret)
+			return ret;
+	}
+
+	fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
+	if (!fp)
+		return -ENOMEM;
+
+	mutex_init(&fp->f_fl_mutex);
+
+	gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
+	file->private_data = fp;
+	return 0;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * After atomic_open, this function is only used for opening files
+ * which are already cached. We must still get the glock for regular
+ * files to ensure that we have the file size uptodate for the large
+ * file check which is in the common code. That is only an issue for
+ * regular files though.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int error;
+	bool need_unlock = false;
+
+	if (S_ISREG(ip->i_inode.i_mode)) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
+		if (error)
+			return error;
+		need_unlock = true;
+	}
+
+	error = gfs2_open_common(inode, file);
+
+	if (need_unlock)
+		gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/**
+ * gfs2_release - called to close a struct file
+ * @inode: the inode the struct file belongs to
+ * @file: the struct file being closed
+ *
+ * Returns: errno
+ */
+
+static int gfs2_release(struct inode *inode, struct file *file)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	kfree(file->private_data);
+	file->private_data = NULL;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		return 0;
+
+	gfs2_rs_delete(ip, &inode->i_writecount);
+	return 0;
+}
+
+/**
+ * gfs2_fsync - sync the dirty data for a file (across the cluster)
+ * @file: the file that points to the dentry
+ * @start: the start position in the file to sync
+ * @end: the end position in the file to sync
+ * @datasync: set if we can ignore timestamp changes
+ *
+ * We split the data flushing here so that we don't wait for the data
+ * until after we've also sent the metadata to disk. Note that for
+ * data=ordered, we will write & wait for the data at the log flush
+ * stage anyway, so this is unlikely to make much of a difference
+ * except in the data=writeback case.
+ *
+ * If the fdatawrite fails due to any reason except -EIO, we will
+ * continue the remainder of the fsync, although we'll still report
+ * the error at the end. This is to match filemap_write_and_wait_range()
+ * behaviour.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	int sync_state = inode->i_state & I_DIRTY_ALL;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int ret = 0, ret1 = 0;
+
+	if (mapping->nrpages) {
+		ret1 = filemap_fdatawrite_range(mapping, start, end);
+		if (ret1 == -EIO)
+			return ret1;
+	}
+
+	if (!gfs2_is_jdata(ip))
+		sync_state &= ~I_DIRTY_PAGES;
+	if (datasync)
+		sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
+
+	if (sync_state) {
+		ret = sync_inode_metadata(inode, 1);
+		if (ret)
+			return ret;
+		if (gfs2_is_jdata(ip))
+			filemap_write_and_wait(mapping);
+		gfs2_ail_flush(ip->i_gl, 1);
+	}
+
+	if (mapping->nrpages)
+		ret = filemap_fdatawait_range(mapping, start, end);
+
+	return ret ? ret : ret1;
+}
+
+/**
+ * gfs2_file_write_iter - Perform a write to a file
+ * @iocb: The io context
+ * @iov: The data to write
+ * @nr_segs: Number of @iov segments
+ * @pos: The file position
+ *
+ * We have to do a lock/unlock here to refresh the inode size for
+ * O_APPEND writes, otherwise we can land up writing at the wrong
+ * offset. There is still a race, but provided the app is using its
+ * own file locking, this will make O_APPEND work as expected.
+ *
+ */
+
+static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct gfs2_inode *ip = GFS2_I(file_inode(file));
+	int ret;
+
+	ret = gfs2_rs_alloc(ip);
+	if (ret)
+		return ret;
+
+	gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
+
+	if (iocb->ki_flags & IOCB_APPEND) {
+		struct gfs2_holder gh;
+
+		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+		if (ret)
+			return ret;
+		gfs2_glock_dq_uninit(&gh);
+	}
+
+	return generic_file_write_iter(iocb, from);
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+			   int mode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *dibh;
+	int error;
+	unsigned int nr_blks;
+	sector_t lblock = offset >> inode->i_blkbits;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(error))
+		return error;
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (unlikely(error))
+			goto out;
+	}
+
+	while (len) {
+		struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+		bh_map.b_size = len;
+		set_buffer_zeronew(&bh_map);
+
+		error = gfs2_block_map(inode, lblock, &bh_map, 1);
+		if (unlikely(error))
+			goto out;
+		len -= bh_map.b_size;
+		nr_blks = bh_map.b_size >> inode->i_blkbits;
+		lblock += nr_blks;
+		if (!buffer_new(&bh_map))
+			continue;
+		if (unlikely(!buffer_zeronew(&bh_map))) {
+			error = -EIO;
+			goto out;
+		}
+	}
+out:
+	brelse(dibh);
+	return error;
+}
+/**
+ * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of
+ *                     blocks, determine how many bytes can be written.
+ * @ip:          The inode in question.
+ * @len:         Max cap of bytes. What we return in *len must be <= this.
+ * @data_blocks: Compute and return the number of data blocks needed
+ * @ind_blocks:  Compute and return the number of indirect blocks needed
+ * @max_blocks:  The total blocks available to work with.
+ *
+ * Returns: void, but @len, @data_blocks and @ind_blocks are filled in.
+ */
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len,
+			    unsigned int *data_blocks, unsigned int *ind_blocks,
+			    unsigned int max_blocks)
+{
+	loff_t max = *len;
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		max_data -= tmp;
+	}
+
+	*data_blocks = max_data;
+	*ind_blocks = max_blocks - max_data;
+	*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+	if (*len > max) {
+		*len = max;
+		gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+	}
+}
+
+static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	loff_t bytes, max_bytes, max_blks = UINT_MAX;
+	int error;
+	const loff_t pos = offset;
+	const loff_t count = len;
+	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
+	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+	loff_t max_chunk_size = UINT_MAX & bsize_mask;
+
+	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+	offset &= bsize_mask;
+
+	len = next - offset;
+	bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+	if (!bytes)
+		bytes = UINT_MAX;
+	bytes &= bsize_mask;
+	if (bytes == 0)
+		bytes = sdp->sd_sb.sb_bsize;
+
+	gfs2_size_hint(file, offset, len);
+
+	gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks);
+	ap.min_target = data_blocks + ind_blocks;
+
+	while (len > 0) {
+		if (len < bytes)
+			bytes = len;
+		if (!gfs2_write_alloc_required(ip, offset, bytes)) {
+			len -= bytes;
+			offset += bytes;
+			continue;
+		}
+
+		/* We need to determine how many bytes we can actually
+		 * fallocate without exceeding quota or going over the
+		 * end of the fs. We start off optimistically by assuming
+		 * we can write max_bytes */
+		max_bytes = (len > max_chunk_size) ? max_chunk_size : len;
+
+		/* Since max_bytes is most likely a theoretical max, we
+		 * calculate a more realistic 'bytes' to serve as a good
+		 * starting point for the number of bytes we may be able
+		 * to write */
+		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+		ap.target = data_blocks + ind_blocks;
+
+		error = gfs2_quota_lock_check(ip, &ap);
+		if (error)
+			return error;
+		/* ap.allowed tells us how many blocks quota will allow
+		 * us to write. Check if this reduces max_blks */
+		if (ap.allowed && ap.allowed < max_blks)
+			max_blks = ap.allowed;
+
+		error = gfs2_inplace_reserve(ip, &ap);
+		if (error)
+			goto out_qunlock;
+
+		/* check if the selected rgrp limits our max_blks further */
+		if (ap.allowed && ap.allowed < max_blks)
+			max_blks = ap.allowed;
+
+		/* Almost done. Calculate bytes that can be written using
+		 * max_blks. We also recompute max_bytes, data_blocks and
+		 * ind_blocks */
+		calc_max_reserv(ip, &max_bytes, &data_blocks,
+				&ind_blocks, max_blks);
+
+		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+			  RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks);
+		if (gfs2_is_jdata(ip))
+			rblocks += data_blocks ? data_blocks : 1;
+
+		error = gfs2_trans_begin(sdp, rblocks,
+					 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+		if (error)
+			goto out_trans_fail;
+
+		error = fallocate_chunk(inode, offset, max_bytes, mode);
+		gfs2_trans_end(sdp);
+
+		if (error)
+			goto out_trans_fail;
+
+		len -= max_bytes;
+		offset += max_bytes;
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
+		i_size_write(inode, pos + count);
+		/* Marks the inode as dirty */
+		file_update_time(file);
+	}
+
+	return generic_write_sync(file, pos, count);
+
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_qunlock:
+	gfs2_quota_unlock(ip);
+	return error;
+}
+
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&inode->i_mutex);
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    (offset + len) > inode->i_size) {
+		ret = inode_newsize_ok(inode, offset + len);
+		if (ret)
+			goto out_unlock;
+	}
+
+	ret = get_write_access(inode);
+	if (ret)
+		goto out_unlock;
+
+	ret = gfs2_rs_alloc(ip);
+	if (ret)
+		goto out_putw;
+
+	ret = __gfs2_fallocate(file, mode, offset, len);
+	if (ret)
+		gfs2_rs_deltree(ip->i_res);
+out_putw:
+	put_write_access(inode);
+out_unlock:
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
+				      struct file *out, loff_t *ppos,
+				      size_t len, unsigned int flags)
+{
+	int error;
+	struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
+
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		return (ssize_t)error;
+
+	gfs2_size_hint(out, *ppos, len);
+
+	return iter_file_splice_write(pipe, out, ppos, len, flags);
+}
+
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+
+/**
+ * gfs2_lock - acquire/release a posix lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	if (!(fl->fl_flags & FL_POSIX))
+		return -ENOLCK;
+	if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	if (cmd == F_CANCELLK) {
+		/* Hack: */
+		cmd = F_SETLK;
+		fl->fl_type = F_UNLCK;
+	}
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+		if (fl->fl_type == F_UNLCK)
+			posix_lock_file_wait(file, fl);
+		return -EIO;
+	}
+	if (IS_GETLK(cmd))
+		return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
+	else if (fl->fl_type == F_UNLCK)
+		return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
+	else
+		return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
+}
+
+static int do_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gfs2_file *fp = file->private_data;
+	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+	struct gfs2_inode *ip = GFS2_I(file_inode(file));
+	struct gfs2_glock *gl;
+	unsigned int state;
+	int flags;
+	int error = 0;
+	int sleeptime;
+
+	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
+	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
+
+	mutex_lock(&fp->f_fl_mutex);
+
+	gl = fl_gh->gh_gl;
+	if (gl) {
+		if (fl_gh->gh_state == state)
+			goto out;
+		flock_lock_file_wait(file,
+				     &(struct file_lock){.fl_type = F_UNLCK});
+		gfs2_glock_dq(fl_gh);
+		gfs2_holder_reinit(state, flags, fl_gh);
+	} else {
+		error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
+				       &gfs2_flock_glops, CREATE, &gl);
+		if (error)
+			goto out;
+		gfs2_holder_init(gl, state, flags, fl_gh);
+		gfs2_glock_put(gl);
+	}
+	for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
+		error = gfs2_glock_nq(fl_gh);
+		if (error != GLR_TRYFAILED)
+			break;
+		fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
+		fl_gh->gh_error = 0;
+		msleep(sleeptime);
+	}
+	if (error) {
+		gfs2_holder_uninit(fl_gh);
+		if (error == GLR_TRYFAILED)
+			error = -EAGAIN;
+	} else {
+		error = flock_lock_file_wait(file, fl);
+		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+	}
+
+out:
+	mutex_unlock(&fp->f_fl_mutex);
+	return error;
+}
+
+static void do_unflock(struct file *file, struct file_lock *fl)
+{
+	struct gfs2_file *fp = file->private_data;
+	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+
+	mutex_lock(&fp->f_fl_mutex);
+	flock_lock_file_wait(file, fl);
+	if (fl_gh->gh_gl) {
+		gfs2_glock_dq(fl_gh);
+		gfs2_holder_uninit(fl_gh);
+	}
+	mutex_unlock(&fp->f_fl_mutex);
+}
+
+/**
+ * gfs2_flock - acquire/release a flock lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+	if (fl->fl_type & LOCK_MAND)
+		return -EOPNOTSUPP;
+
+	if (fl->fl_type == F_UNLCK) {
+		do_unflock(file, fl);
+		return 0;
+	} else {
+		return do_flock(file, cmd, fl);
+	}
+}
+
+const struct file_operations gfs2_file_fops = {
+	.llseek		= gfs2_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= gfs2_file_write_iter,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.mmap		= gfs2_mmap,
+	.open		= gfs2_open,
+	.release	= gfs2_release,
+	.fsync		= gfs2_fsync,
+	.lock		= gfs2_lock,
+	.flock		= gfs2_flock,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= gfs2_file_splice_write,
+	.setlease	= simple_nosetlease,
+	.fallocate	= gfs2_fallocate,
+};
+
+const struct file_operations gfs2_dir_fops = {
+	.iterate	= gfs2_readdir,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.open		= gfs2_open,
+	.release	= gfs2_release,
+	.fsync		= gfs2_fsync,
+	.lock		= gfs2_lock,
+	.flock		= gfs2_flock,
+	.llseek		= default_llseek,
+};
+
+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
+
+const struct file_operations gfs2_file_fops_nolock = {
+	.llseek		= gfs2_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= gfs2_file_write_iter,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.mmap		= gfs2_mmap,
+	.open		= gfs2_open,
+	.release	= gfs2_release,
+	.fsync		= gfs2_fsync,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= gfs2_file_splice_write,
+	.setlease	= generic_setlease,
+	.fallocate	= gfs2_fallocate,
+};
+
+const struct file_operations gfs2_dir_fops_nolock = {
+	.iterate	= gfs2_readdir,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.open		= gfs2_open,
+	.release	= gfs2_release,
+	.fsync		= gfs2_fsync,
+	.llseek		= default_llseek,
+};
+
diff --git a/kernel/fs/gfs2/gfs2.h b/kernel/fs/gfs2/gfs2.h
new file mode 100644
index 000000000..ef606e3a5
--- /dev/null
+++ b/kernel/fs/gfs2/gfs2.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GFS2_DOT_H__
+#define __GFS2_DOT_H__
+
+enum {
+	NO_CREATE = 0,
+	CREATE = 1,
+};
+
+enum {
+	NO_FORCE = 0,
+	FORCE = 1,
+};
+
+#define GFS2_FAST_NAME_SIZE 8
+
+#endif /* __GFS2_DOT_H__ */
+
diff --git a/kernel/fs/gfs2/glock.c b/kernel/fs/gfs2/glock.c
new file mode 100644
index 000000000..0fa8062f8
--- /dev/null
+++ b/kernel/fs/gfs2/glock.c
@@ -0,0 +1,2119 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/sort.h>
+#include <linux/jhash.h>
+#include <linux/kallsyms.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/bit_spinlock.h>
+#include <linux/percpu.h>
+#include <linux/list_sort.h>
+#include <linux/lockref.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "super.h"
+#include "util.h"
+#include "bmap.h"
+#define CREATE_TRACE_POINTS
+#include "trace_gfs2.h"
+
+struct gfs2_glock_iter {
+	int hash;			/* hash bucket index           */
+	unsigned nhash;			/* Index within current bucket */
+	struct gfs2_sbd *sdp;		/* incore superblock           */
+	struct gfs2_glock *gl;		/* current glock struct        */
+	loff_t last_pos;		/* last position               */
+};
+
+typedef void (*glock_examiner) (struct gfs2_glock * gl);
+
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
+
+static struct dentry *gfs2_root;
+static struct workqueue_struct *glock_workqueue;
+struct workqueue_struct *gfs2_delete_workqueue;
+static LIST_HEAD(lru_list);
+static atomic_t lru_count = ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(lru_lock);
+
+#define GFS2_GL_HASH_SHIFT      15
+#define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
+#define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
+
+static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
+static struct dentry *gfs2_root;
+
+/**
+ * gl_hash() - Turn glock number into hash bucket number
+ * @lock: The glock number
+ *
+ * Returns: The number of the corresponding hash bucket
+ */
+
+static unsigned int gl_hash(const struct gfs2_sbd *sdp,
+			    const struct lm_lockname *name)
+{
+	unsigned int h;
+
+	h = jhash(&name->ln_number, sizeof(u64), 0);
+	h = jhash(&name->ln_type, sizeof(unsigned int), h);
+	h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
+	h &= GFS2_GL_HASH_MASK;
+
+	return h;
+}
+
+static inline void spin_lock_bucket(unsigned int hash)
+{
+	hlist_bl_lock(&gl_hash_table[hash]);
+}
+
+static inline void spin_unlock_bucket(unsigned int hash)
+{
+	hlist_bl_unlock(&gl_hash_table[hash]);
+}
+
+static void gfs2_glock_dealloc(struct rcu_head *rcu)
+{
+	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
+
+	if (gl->gl_ops->go_flags & GLOF_ASPACE) {
+		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
+	} else {
+		kfree(gl->gl_lksb.sb_lvbptr);
+		kmem_cache_free(gfs2_glock_cachep, gl);
+	}
+}
+
+void gfs2_glock_free(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
+	call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
+	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
+		wake_up(&sdp->sd_glock_wait);
+}
+
+/**
+ * gfs2_glock_hold() - increment reference count on glock
+ * @gl: The glock to hold
+ *
+ */
+
+static void gfs2_glock_hold(struct gfs2_glock *gl)
+{
+	GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
+	lockref_get(&gl->gl_lockref);
+}
+
+/**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int demote_ok(const struct gfs2_glock *gl)
+{
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	if (gl->gl_state == LM_ST_UNLOCKED)
+		return 0;
+	if (!list_empty(&gl->gl_holders))
+		return 0;
+	if (glops->go_demote_ok)
+		return glops->go_demote_ok(gl);
+	return 1;
+}
+
+
+void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
+{
+	spin_lock(&lru_lock);
+
+	if (!list_empty(&gl->gl_lru))
+		list_del_init(&gl->gl_lru);
+	else
+		atomic_inc(&lru_count);
+
+	list_add_tail(&gl->gl_lru, &lru_list);
+	set_bit(GLF_LRU, &gl->gl_flags);
+	spin_unlock(&lru_lock);
+}
+
+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+{
+	spin_lock(&lru_lock);
+	if (!list_empty(&gl->gl_lru)) {
+		list_del_init(&gl->gl_lru);
+		atomic_dec(&lru_count);
+		clear_bit(GLF_LRU, &gl->gl_flags);
+	}
+	spin_unlock(&lru_lock);
+}
+
+/**
+ * gfs2_glock_put() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ */
+
+void gfs2_glock_put(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
+
+	if (lockref_put_or_lock(&gl->gl_lockref))
+		return;
+
+	lockref_mark_dead(&gl->gl_lockref);
+
+	gfs2_glock_remove_from_lru(gl);
+	spin_unlock(&gl->gl_lockref.lock);
+	spin_lock_bucket(gl->gl_hash);
+	hlist_bl_del_rcu(&gl->gl_list);
+	spin_unlock_bucket(gl->gl_hash);
+	GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
+	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
+	trace_gfs2_glock_put(gl);
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
+}
+
+/**
+ * search_bucket() - Find struct gfs2_glock by lock number
+ * @bucket: the bucket to search
+ * @name: The lock name
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested number
+ */
+
+static struct gfs2_glock *search_bucket(unsigned int hash,
+					const struct gfs2_sbd *sdp,
+					const struct lm_lockname *name)
+{
+	struct gfs2_glock *gl;
+	struct hlist_bl_node *h;
+
+	hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
+		if (!lm_name_equal(&gl->gl_name, name))
+			continue;
+		if (gl->gl_sbd != sdp)
+			continue;
+		if (lockref_get_not_dead(&gl->gl_lockref))
+			return gl;
+	}
+
+	return NULL;
+}
+
+/**
+ * may_grant - check if its ok to grant a new lock
+ * @gl: The glock
+ * @gh: The lock request which we wish to grant
+ *
+ * Returns: true if its ok to grant the lock
+ */
+
+static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+{
+	const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+	if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+	     gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
+		return 0;
+	if (gl->gl_state == gh->gh_state)
+		return 1;
+	if (gh->gh_flags & GL_EXACT)
+		return 0;
+	if (gl->gl_state == LM_ST_EXCLUSIVE) {
+		if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+			return 1;
+		if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+			return 1;
+	}
+	if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+		return 1;
+	return 0;
+}
+
+static void gfs2_holder_wake(struct gfs2_holder *gh)
+{
+	clear_bit(HIF_WAIT, &gh->gh_iflags);
+	smp_mb__after_atomic();
+	wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+}
+
+/**
+ * do_error - Something unexpected has happened during a lock request
+ *
+ */
+
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+	struct gfs2_holder *gh, *tmp;
+
+	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+			continue;
+		if (ret & LM_OUT_ERROR)
+			gh->gh_error = -EIO;
+		else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+			gh->gh_error = GLR_TRYFAILED;
+		else
+			continue;
+		list_del_init(&gh->gh_list);
+		trace_gfs2_glock_queue(gh, 0);
+		gfs2_holder_wake(gh);
+	}
+}
+
+/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ * 
+ * Returns: 1 if there is a blocked holder at the head of the list, or 2
+ *          if a type specific operation is underway.
+ */
+
+static int do_promote(struct gfs2_glock *gl)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
+{
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	struct gfs2_holder *gh, *tmp;
+	int ret;
+
+restart:
+	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+			continue;
+		if (may_grant(gl, gh)) {
+			if (gh->gh_list.prev == &gl->gl_holders &&
+			    glops->go_lock) {
+				spin_unlock(&gl->gl_spin);
+				/* FIXME: eliminate this eventually */
+				ret = glops->go_lock(gh);
+				spin_lock(&gl->gl_spin);
+				if (ret) {
+					if (ret == 1)
+						return 2;
+					gh->gh_error = ret;
+					list_del_init(&gh->gh_list);
+					trace_gfs2_glock_queue(gh, 0);
+					gfs2_holder_wake(gh);
+					goto restart;
+				}
+				set_bit(HIF_HOLDER, &gh->gh_iflags);
+				trace_gfs2_promote(gh, 1);
+				gfs2_holder_wake(gh);
+				goto restart;
+			}
+			set_bit(HIF_HOLDER, &gh->gh_iflags);
+			trace_gfs2_promote(gh, 0);
+			gfs2_holder_wake(gh);
+			continue;
+		}
+		if (gh->gh_list.prev == &gl->gl_holders)
+			return 1;
+		do_error(gl, 0);
+		break;
+	}
+	return 0;
+}
+
+/**
+ * find_first_waiter - find the first gh that's waiting for the glock
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh;
+
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+			return gh;
+	}
+	return NULL;
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state the new state
+ *
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+	int held1, held2;
+
+	held1 = (gl->gl_state != LM_ST_UNLOCKED);
+	held2 = (new_state != LM_ST_UNLOCKED);
+
+	if (held1 != held2) {
+		GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
+		if (held2)
+			gl->gl_lockref.count++;
+		else
+			gl->gl_lockref.count--;
+	}
+	if (held1 && held2 && list_empty(&gl->gl_holders))
+		clear_bit(GLF_QUEUED, &gl->gl_flags);
+
+	if (new_state != gl->gl_target)
+		/* shorten our minimum hold time */
+		gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR,
+				       GL_GLOCK_MIN_HOLD);
+	gl->gl_state = new_state;
+	gl->gl_tchange = jiffies;
+}
+
+static void gfs2_demote_wake(struct gfs2_glock *gl)
+{
+	gl->gl_demote_state = LM_ST_EXCLUSIVE;
+	clear_bit(GLF_DEMOTE, &gl->gl_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
+}
+
+/**
+ * finish_xmote - The DLM has replied to one of our lock requests
+ * @gl: The glock
+ * @ret: The status from the DLM
+ *
+ */
+
+static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
+{
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	struct gfs2_holder *gh;
+	unsigned state = ret & LM_OUT_ST_MASK;
+	int rv;
+
+	spin_lock(&gl->gl_spin);
+	trace_gfs2_glock_state_change(gl, state);
+	state_change(gl, state);
+	gh = find_first_waiter(gl);
+
+	/* Demote to UN request arrived during demote to SH or DF */
+	if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+	    state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+		gl->gl_target = LM_ST_UNLOCKED;
+
+	/* Check for state != intended state */
+	if (unlikely(state != gl->gl_target)) {
+		if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+			/* move to back of queue and try next entry */
+			if (ret & LM_OUT_CANCELED) {
+				if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
+					list_move_tail(&gh->gh_list, &gl->gl_holders);
+				gh = find_first_waiter(gl);
+				gl->gl_target = gh->gh_state;
+				goto retry;
+			}
+			/* Some error or failed "try lock" - report it */
+			if ((ret & LM_OUT_ERROR) ||
+			    (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+				gl->gl_target = gl->gl_state;
+				do_error(gl, ret);
+				goto out;
+			}
+		}
+		switch(state) {
+		/* Unlocked due to conversion deadlock, try again */
+		case LM_ST_UNLOCKED:
+retry:
+			do_xmote(gl, gh, gl->gl_target);
+			break;
+		/* Conversion fails, unlock and try again */
+		case LM_ST_SHARED:
+		case LM_ST_DEFERRED:
+			do_xmote(gl, gh, LM_ST_UNLOCKED);
+			break;
+		default: /* Everything else */
+			pr_err("wanted %u got %u\n", gl->gl_target, state);
+			GLOCK_BUG_ON(gl, 1);
+		}
+		spin_unlock(&gl->gl_spin);
+		return;
+	}
+
+	/* Fast path - we got what we asked for */
+	if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+		gfs2_demote_wake(gl);
+	if (state != LM_ST_UNLOCKED) {
+		if (glops->go_xmote_bh) {
+			spin_unlock(&gl->gl_spin);
+			rv = glops->go_xmote_bh(gl, gh);
+			spin_lock(&gl->gl_spin);
+			if (rv) {
+				do_error(gl, rv);
+				goto out;
+			}
+		}
+		rv = do_promote(gl);
+		if (rv == 2)
+			goto out_locked;
+	}
+out:
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+out_locked:
+	spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * do_xmote - Calls the DLM to change the state of a lock
+ * @gl: The lock state
+ * @gh: The holder (only for promotes)
+ * @target: The target lock state
+ *
+ */
+
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
+{
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	unsigned int lck_flags = gh ? gh->gh_flags : 0;
+	int ret;
+
+	lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
+		      LM_FLAG_PRIORITY);
+	GLOCK_BUG_ON(gl, gl->gl_state == target);
+	GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
+	if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
+	    glops->go_inval) {
+		set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+		do_error(gl, 0); /* Fail queued try locks */
+	}
+	gl->gl_req = target;
+	set_bit(GLF_BLOCKING, &gl->gl_flags);
+	if ((gl->gl_req == LM_ST_UNLOCKED) ||
+	    (gl->gl_state == LM_ST_EXCLUSIVE) ||
+	    (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
+		clear_bit(GLF_BLOCKING, &gl->gl_flags);
+	spin_unlock(&gl->gl_spin);
+	if (glops->go_sync)
+		glops->go_sync(gl);
+	if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+		glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
+	clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+
+	gfs2_glock_hold(gl);
+	if (sdp->sd_lockstruct.ls_ops->lm_lock)	{
+		/* lock_dlm */
+		ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+		if (ret) {
+			pr_err("lm_lock ret %d\n", ret);
+			GLOCK_BUG_ON(gl, 1);
+		}
+	} else { /* lock_nolock */
+		finish_xmote(gl, target);
+		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+			gfs2_glock_put(gl);
+	}
+
+	spin_lock(&gl->gl_spin);
+}
+
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh;
+
+	if (!list_empty(&gl->gl_holders)) {
+		gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+			return gh;
+	}
+	return NULL;
+}
+
+/**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+ * @nonblock: True if we must not block in run_queue
+ *
+ */
+
+static void run_queue(struct gfs2_glock *gl, const int nonblock)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
+{
+	struct gfs2_holder *gh = NULL;
+	int ret;
+
+	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
+		return;
+
+	GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
+
+	if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+	    gl->gl_demote_state != gl->gl_state) {
+		if (find_first_holder(gl))
+			goto out_unlock;
+		if (nonblock)
+			goto out_sched;
+		set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+		GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
+		gl->gl_target = gl->gl_demote_state;
+	} else {
+		if (test_bit(GLF_DEMOTE, &gl->gl_flags))
+			gfs2_demote_wake(gl);
+		ret = do_promote(gl);
+		if (ret == 0)
+			goto out_unlock;
+		if (ret == 2)
+			goto out;
+		gh = find_first_waiter(gl);
+		gl->gl_target = gh->gh_state;
+		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+			do_error(gl, 0); /* Fail queued try locks */
+	}
+	do_xmote(gl, gh, gl->gl_target);
+out:
+	return;
+
+out_sched:
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	smp_mb__after_atomic();
+	gl->gl_lockref.count++;
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gl->gl_lockref.count--;
+	return;
+
+out_unlock:
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	smp_mb__after_atomic();
+	return;
+}
+
+static void delete_work_func(struct work_struct *work)
+{
+	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_inode *ip;
+	struct inode *inode;
+	u64 no_addr = gl->gl_name.ln_number;
+
+	ip = gl->gl_object;
+	/* Note: Unsafe to dereference ip as we don't hold right refs/locks */
+
+	if (ip)
+		inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1);
+	else
+		inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
+	if (inode && !IS_ERR(inode)) {
+		d_prune_aliases(inode);
+		iput(inode);
+	}
+	gfs2_glock_put(gl);
+}
+
+static void glock_work_func(struct work_struct *work)
+{
+	unsigned long delay = 0;
+	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+	int drop_ref = 0;
+
+	if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
+		finish_xmote(gl, gl->gl_reply);
+		drop_ref = 1;
+	}
+	spin_lock(&gl->gl_spin);
+	if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+	    gl->gl_state != LM_ST_UNLOCKED &&
+	    gl->gl_demote_state != LM_ST_EXCLUSIVE) {
+		unsigned long holdtime, now = jiffies;
+
+		holdtime = gl->gl_tchange + gl->gl_hold_time;
+		if (time_before(now, holdtime))
+			delay = holdtime - now;
+
+		if (!delay) {
+			clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
+			set_bit(GLF_DEMOTE, &gl->gl_flags);
+		}
+	}
+	run_queue(gl, 0);
+	spin_unlock(&gl->gl_spin);
+	if (!delay)
+		gfs2_glock_put(gl);
+	else {
+		if (gl->gl_name.ln_type != LM_TYPE_INODE)
+			delay = 0;
+		if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+			gfs2_glock_put(gl);
+	}
+	if (drop_ref)
+		gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
+ * @sdp: The GFS2 superblock
+ * @number: the lock number
+ * @glops: The glock_operations to use
+ * @create: If 0, don't create the glock if it doesn't exist
+ * @glp: the glock is returned here
+ *
+ * This does not lock a glock, just finds/creates structures for one.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+		   const struct gfs2_glock_operations *glops, int create,
+		   struct gfs2_glock **glp)
+{
+	struct super_block *s = sdp->sd_vfs;
+	struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
+	struct gfs2_glock *gl, *tmp;
+	unsigned int hash = gl_hash(sdp, &name);
+	struct address_space *mapping;
+	struct kmem_cache *cachep;
+
+	rcu_read_lock();
+	gl = search_bucket(hash, sdp, &name);
+	rcu_read_unlock();
+
+	*glp = gl;
+	if (gl)
+		return 0;
+	if (!create)
+		return -ENOENT;
+
+	if (glops->go_flags & GLOF_ASPACE)
+		cachep = gfs2_glock_aspace_cachep;
+	else
+		cachep = gfs2_glock_cachep;
+	gl = kmem_cache_alloc(cachep, GFP_NOFS);
+	if (!gl)
+		return -ENOMEM;
+
+	memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
+
+	if (glops->go_flags & GLOF_LVB) {
+		gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS);
+		if (!gl->gl_lksb.sb_lvbptr) {
+			kmem_cache_free(cachep, gl);
+			return -ENOMEM;
+		}
+	}
+
+	atomic_inc(&sdp->sd_glock_disposal);
+	gl->gl_sbd = sdp;
+	gl->gl_flags = 0;
+	gl->gl_name = name;
+	gl->gl_lockref.count = 1;
+	gl->gl_state = LM_ST_UNLOCKED;
+	gl->gl_target = LM_ST_UNLOCKED;
+	gl->gl_demote_state = LM_ST_EXCLUSIVE;
+	gl->gl_hash = hash;
+	gl->gl_ops = glops;
+	gl->gl_dstamp = ktime_set(0, 0);
+	preempt_disable();
+	/* We use the global stats to estimate the initial per-glock stats */
+	gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type];
+	preempt_enable();
+	gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0;
+	gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0;
+	gl->gl_tchange = jiffies;
+	gl->gl_object = NULL;
+	gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
+	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
+	INIT_WORK(&gl->gl_delete, delete_work_func);
+
+	mapping = gfs2_glock2aspace(gl);
+	if (mapping) {
+                mapping->a_ops = &gfs2_meta_aops;
+		mapping->host = s->s_bdev->bd_inode;
+		mapping->flags = 0;
+		mapping_set_gfp_mask(mapping, GFP_NOFS);
+		mapping->private_data = NULL;
+		mapping->writeback_index = 0;
+	}
+
+	spin_lock_bucket(hash);
+	tmp = search_bucket(hash, sdp, &name);
+	if (tmp) {
+		spin_unlock_bucket(hash);
+		kfree(gl->gl_lksb.sb_lvbptr);
+		kmem_cache_free(cachep, gl);
+		atomic_dec(&sdp->sd_glock_disposal);
+		gl = tmp;
+	} else {
+		hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
+		spin_unlock_bucket(hash);
+	}
+
+	*glp = gl;
+
+	return 0;
+}
+
+/**
+ * gfs2_holder_init - initialize a struct gfs2_holder in the default way
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+		      struct gfs2_holder *gh)
+{
+	INIT_LIST_HEAD(&gh->gh_list);
+	gh->gh_gl = gl;
+	gh->gh_ip = _RET_IP_;
+	gh->gh_owner_pid = get_pid(task_pid(current));
+	gh->gh_state = state;
+	gh->gh_flags = flags;
+	gh->gh_error = 0;
+	gh->gh_iflags = 0;
+	gfs2_glock_hold(gl);
+}
+
+/**
+ * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Don't mess with the glock.
+ *
+ */
+
+void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+{
+	gh->gh_state = state;
+	gh->gh_flags = flags;
+	gh->gh_iflags = 0;
+	gh->gh_ip = _RET_IP_;
+	put_pid(gh->gh_owner_pid);
+	gh->gh_owner_pid = get_pid(task_pid(current));
+}
+
+/**
+ * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_holder_uninit(struct gfs2_holder *gh)
+{
+	put_pid(gh->gh_owner_pid);
+	gfs2_glock_put(gh->gh_gl);
+	gh->gh_gl = NULL;
+	gh->gh_ip = 0;
+}
+
+/**
+ * gfs2_glock_wait - wait on a glock acquisition
+ * @gh: the glock holder
+ *
+ * Returns: 0 on success
+ */
+
+int gfs2_glock_wait(struct gfs2_holder *gh)
+{
+	unsigned long time1 = jiffies;
+
+	might_sleep();
+	wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
+	if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */
+		/* Lengthen the minimum hold time. */
+		gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time +
+					      GL_GLOCK_HOLD_INCR,
+					      GL_GLOCK_MAX_HOLD);
+	return gh->gh_error;
+}
+
+/**
+ * handle_callback - process a demote request
+ * @gl: the glock
+ * @state: the state the caller wants us to change to
+ *
+ * There are only two requests that we are going to see in actual
+ * practise: LM_ST_SHARED and LM_ST_UNLOCKED
+ */
+
+static void handle_callback(struct gfs2_glock *gl, unsigned int state,
+			    unsigned long delay, bool remote)
+{
+	int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
+
+	set_bit(bit, &gl->gl_flags);
+	if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
+		gl->gl_demote_state = state;
+		gl->gl_demote_time = jiffies;
+	} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
+			gl->gl_demote_state != state) {
+		gl->gl_demote_state = LM_ST_UNLOCKED;
+	}
+	if (gl->gl_ops->go_callback)
+		gl->gl_ops->go_callback(gl, remote);
+	trace_gfs2_demote_rq(gl, remote);
+}
+
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	if (seq) {
+		seq_vprintf(seq, fmt, args);
+	} else {
+		vaf.fmt = fmt;
+		vaf.va = &args;
+
+		pr_err("%pV", &vaf);
+	}
+
+	va_end(args);
+}
+
+/**
+ * add_to_queue - Add a holder to the wait queue (but look for recursion)
+ * @gh: the holder structure to add
+ *
+ * Eventually we should move the recursive locking trap to a
+ * debugging option or something like that. This is the fast
+ * path and needs to have the minimum number of distractions.
+ * 
+ */
+
+static inline void add_to_queue(struct gfs2_holder *gh)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct list_head *insert_pt = NULL;
+	struct gfs2_holder *gh2;
+	int try_futile = 0;
+
+	BUG_ON(gh->gh_owner_pid == NULL);
+	if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
+		BUG();
+
+	if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+		if (test_bit(GLF_LOCK, &gl->gl_flags))
+			try_futile = !may_grant(gl, gh);
+		if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+			goto fail;
+	}
+
+	list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+		if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
+		    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+			goto trap_recursive;
+		if (try_futile &&
+		    !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+fail:
+			gh->gh_error = GLR_TRYFAILED;
+			gfs2_holder_wake(gh);
+			return;
+		}
+		if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+			continue;
+		if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
+			insert_pt = &gh2->gh_list;
+	}
+	set_bit(GLF_QUEUED, &gl->gl_flags);
+	trace_gfs2_glock_queue(gh, 1);
+	gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
+	gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
+	if (likely(insert_pt == NULL)) {
+		list_add_tail(&gh->gh_list, &gl->gl_holders);
+		if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
+			goto do_cancel;
+		return;
+	}
+	list_add_tail(&gh->gh_list, insert_pt);
+do_cancel:
+	gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+	if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
+		spin_unlock(&gl->gl_spin);
+		if (sdp->sd_lockstruct.ls_ops->lm_cancel)
+			sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
+		spin_lock(&gl->gl_spin);
+	}
+	return;
+
+trap_recursive:
+	pr_err("original: %pSR\n", (void *)gh2->gh_ip);
+	pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid));
+	pr_err("lock type: %d req lock state : %d\n",
+	       gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
+	pr_err("new: %pSR\n", (void *)gh->gh_ip);
+	pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid));
+	pr_err("lock type: %d req lock state : %d\n",
+	       gh->gh_gl->gl_name.ln_type, gh->gh_state);
+	gfs2_dump_glock(NULL, gl);
+	BUG();
+}
+
+/**
+ * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
+ * @gh: the holder structure
+ *
+ * if (gh->gh_flags & GL_ASYNC), this never returns an error
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_nq(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	int error = 0;
+
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	if (test_bit(GLF_LRU, &gl->gl_flags))
+		gfs2_glock_remove_from_lru(gl);
+
+	spin_lock(&gl->gl_spin);
+	add_to_queue(gh);
+	if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
+		     test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
+		set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+		gl->gl_lockref.count++;
+		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+			gl->gl_lockref.count--;
+	}
+	run_queue(gl, 1);
+	spin_unlock(&gl->gl_spin);
+
+	if (!(gh->gh_flags & GL_ASYNC))
+		error = gfs2_glock_wait(gh);
+
+	return error;
+}
+
+/**
+ * gfs2_glock_poll - poll to see if an async request has been completed
+ * @gh: the holder
+ *
+ * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
+ */
+
+int gfs2_glock_poll(struct gfs2_holder *gh)
+{
+	return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ */
+
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	unsigned delay = 0;
+	int fast_path = 0;
+
+	spin_lock(&gl->gl_spin);
+	if (gh->gh_flags & GL_NOCACHE)
+		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+
+	list_del_init(&gh->gh_list);
+	if (find_first_holder(gl) == NULL) {
+		if (glops->go_unlock) {
+			GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
+			spin_unlock(&gl->gl_spin);
+			glops->go_unlock(gh);
+			spin_lock(&gl->gl_spin);
+			clear_bit(GLF_LOCK, &gl->gl_flags);
+		}
+		if (list_empty(&gl->gl_holders) &&
+		    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
+			fast_path = 1;
+	}
+	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
+		gfs2_glock_add_to_lru(gl);
+
+	trace_gfs2_glock_queue(gh, 0);
+	spin_unlock(&gl->gl_spin);
+	if (likely(fast_path))
+		return;
+
+	gfs2_glock_hold(gl);
+	if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+	    !test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+	    gl->gl_name.ln_type == LM_TYPE_INODE)
+		delay = gl->gl_hold_time;
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+		gfs2_glock_put(gl);
+}
+
+void gfs2_glock_dq_wait(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	gfs2_glock_dq(gh);
+	might_sleep();
+	wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE);
+}
+
+/**
+ * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
+{
+	gfs2_glock_dq(gh);
+	gfs2_holder_uninit(gh);
+}
+
+/**
+ * gfs2_glock_nq_num - acquire a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ * @gh: the struct gfs2_holder
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+		      const struct gfs2_glock_operations *glops,
+		      unsigned int state, int flags, struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl;
+	int error;
+
+	error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+	if (!error) {
+		error = gfs2_glock_nq_init(gl, state, flags, gh);
+		gfs2_glock_put(gl);
+	}
+
+	return error;
+}
+
+/**
+ * glock_compare - Compare two struct gfs2_glock structures for sorting
+ * @arg_a: the first structure
+ * @arg_b: the second structure
+ *
+ */
+
+static int glock_compare(const void *arg_a, const void *arg_b)
+{
+	const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
+	const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
+	const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+	const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+
+	if (a->ln_number > b->ln_number)
+		return 1;
+	if (a->ln_number < b->ln_number)
+		return -1;
+	BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type);
+	return 0;
+}
+
+/**
+ * nq_m_sync - synchonously acquire more than one glock in deadlock free order
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+
+static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
+		     struct gfs2_holder **p)
+{
+	unsigned int x;
+	int error = 0;
+
+	for (x = 0; x < num_gh; x++)
+		p[x] = &ghs[x];
+
+	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
+
+	for (x = 0; x < num_gh; x++) {
+		p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+
+		error = gfs2_glock_nq(p[x]);
+		if (error) {
+			while (x--)
+				gfs2_glock_dq(p[x]);
+			break;
+		}
+	}
+
+	return error;
+}
+
+/**
+ * gfs2_glock_nq_m - acquire multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	struct gfs2_holder *tmp[4];
+	struct gfs2_holder **pph = tmp;
+	int error = 0;
+
+	switch(num_gh) {
+	case 0:
+		return 0;
+	case 1:
+		ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+		return gfs2_glock_nq(ghs);
+	default:
+		if (num_gh <= 4)
+			break;
+		pph = kmalloc(num_gh * sizeof(struct gfs2_holder *), GFP_NOFS);
+		if (!pph)
+			return -ENOMEM;
+	}
+
+	error = nq_m_sync(num_gh, ghs, pph);
+
+	if (pph != tmp)
+		kfree(pph);
+
+	return error;
+}
+
+/**
+ * gfs2_glock_dq_m - release multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ */
+
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	while (num_gh--)
+		gfs2_glock_dq(&ghs[num_gh]);
+}
+
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
+{
+	unsigned long delay = 0;
+	unsigned long holdtime;
+	unsigned long now = jiffies;
+
+	gfs2_glock_hold(gl);
+	holdtime = gl->gl_tchange + gl->gl_hold_time;
+	if (test_bit(GLF_QUEUED, &gl->gl_flags) &&
+	    gl->gl_name.ln_type == LM_TYPE_INODE) {
+		if (time_before(now, holdtime))
+			delay = holdtime - now;
+		if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+			delay = gl->gl_hold_time;
+	}
+
+	spin_lock(&gl->gl_spin);
+	handle_callback(gl, state, delay, true);
+	spin_unlock(&gl->gl_spin);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+		gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_should_freeze - Figure out if glock should be frozen
+ * @gl: The glock in question
+ *
+ * Glocks are not frozen if (a) the result of the dlm operation is
+ * an error, (b) the locking operation was an unlock operation or
+ * (c) if there is a "noexp" flagged request anywhere in the queue
+ *
+ * Returns: 1 if freezing should occur, 0 otherwise
+ */
+
+static int gfs2_should_freeze(const struct gfs2_glock *gl)
+{
+	const struct gfs2_holder *gh;
+
+	if (gl->gl_reply & ~LM_OUT_ST_MASK)
+		return 0;
+	if (gl->gl_target == LM_ST_UNLOCKED)
+		return 0;
+
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+			continue;
+		if (LM_FLAG_NOEXP & gh->gh_flags)
+			return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * gfs2_glock_complete - Callback used by locking
+ * @gl: Pointer to the glock
+ * @ret: The return value from the dlm
+ *
+ * The gl_reply field is under the gl_spin lock so that it is ok
+ * to use a bitfield shared with other glock state fields.
+ */
+
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
+{
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+
+	spin_lock(&gl->gl_spin);
+	gl->gl_reply = ret;
+
+	if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
+		if (gfs2_should_freeze(gl)) {
+			set_bit(GLF_FROZEN, &gl->gl_flags);
+			spin_unlock(&gl->gl_spin);
+			return;
+		}
+	}
+
+	gl->gl_lockref.count++;
+	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	spin_unlock(&gl->gl_spin);
+
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gfs2_glock_put(gl);
+}
+
+static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct gfs2_glock *gla, *glb;
+
+	gla = list_entry(a, struct gfs2_glock, gl_lru);
+	glb = list_entry(b, struct gfs2_glock, gl_lru);
+
+	if (gla->gl_name.ln_number > glb->gl_name.ln_number)
+		return 1;
+	if (gla->gl_name.ln_number < glb->gl_name.ln_number)
+		return -1;
+
+	return 0;
+}
+
+/**
+ * gfs2_dispose_glock_lru - Demote a list of glocks
+ * @list: The list to dispose of
+ *
+ * Disposing of glocks may involve disk accesses, so that here we sort
+ * the glocks by number (i.e. disk location of the inodes) so that if
+ * there are any such accesses, they'll be sent in order (mostly).
+ *
+ * Must be called under the lru_lock, but may drop and retake this
+ * lock. While the lru_lock is dropped, entries may vanish from the
+ * list, but no new entries will appear on the list (since it is
+ * private)
+ */
+
+static void gfs2_dispose_glock_lru(struct list_head *list)
+__releases(&lru_lock)
+__acquires(&lru_lock)
+{
+	struct gfs2_glock *gl;
+
+	list_sort(NULL, list, glock_cmp);
+
+	while(!list_empty(list)) {
+		gl = list_entry(list->next, struct gfs2_glock, gl_lru);
+		list_del_init(&gl->gl_lru);
+		if (!spin_trylock(&gl->gl_spin)) {
+add_back_to_lru:
+			list_add(&gl->gl_lru, &lru_list);
+			atomic_inc(&lru_count);
+			continue;
+		}
+		if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+			spin_unlock(&gl->gl_spin);
+			goto add_back_to_lru;
+		}
+		clear_bit(GLF_LRU, &gl->gl_flags);
+		gl->gl_lockref.count++;
+		if (demote_ok(gl))
+			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+		WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
+		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+			gl->gl_lockref.count--;
+		spin_unlock(&gl->gl_spin);
+		cond_resched_lock(&lru_lock);
+	}
+}
+
+/**
+ * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote
+ * @nr: The number of entries to scan
+ *
+ * This function selects the entries on the LRU which are able to
+ * be demoted, and then kicks off the process by calling
+ * gfs2_dispose_glock_lru() above.
+ */
+
+static long gfs2_scan_glock_lru(int nr)
+{
+	struct gfs2_glock *gl;
+	LIST_HEAD(skipped);
+	LIST_HEAD(dispose);
+	long freed = 0;
+
+	spin_lock(&lru_lock);
+	while ((nr-- >= 0) && !list_empty(&lru_list)) {
+		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
+
+		/* Test for being demotable */
+		if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
+			list_move(&gl->gl_lru, &dispose);
+			atomic_dec(&lru_count);
+			freed++;
+			continue;
+		}
+
+		list_move(&gl->gl_lru, &skipped);
+	}
+	list_splice(&skipped, &lru_list);
+	if (!list_empty(&dispose))
+		gfs2_dispose_glock_lru(&dispose);
+	spin_unlock(&lru_lock);
+
+	return freed;
+}
+
+static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink,
+					    struct shrink_control *sc)
+{
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+	return gfs2_scan_glock_lru(sc->nr_to_scan);
+}
+
+static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
+					     struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_read(&lru_count));
+}
+
+static struct shrinker glock_shrinker = {
+	.seeks = DEFAULT_SEEKS,
+	.count_objects = gfs2_glock_shrink_count,
+	.scan_objects = gfs2_glock_shrink_scan,
+};
+
+/**
+ * examine_bucket - Call a function for glock in a hash bucket
+ * @examiner: the function
+ * @sdp: the filesystem
+ * @bucket: the bucket
+ *
+ */
+
+static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
+			  unsigned int hash)
+{
+	struct gfs2_glock *gl;
+	struct hlist_bl_head *head = &gl_hash_table[hash];
+	struct hlist_bl_node *pos;
+
+	rcu_read_lock();
+	hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
+		if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
+			examiner(gl);
+	}
+	rcu_read_unlock();
+	cond_resched();
+}
+
+static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
+{
+	unsigned x;
+
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+		examine_bucket(examiner, sdp, x);
+}
+
+
+/**
+ * thaw_glock - thaw out a glock which has an unprocessed reply waiting
+ * @gl: The glock to thaw
+ *
+ */
+
+static void thaw_glock(struct gfs2_glock *gl)
+{
+	if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+		goto out;
+	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) {
+out:
+		gfs2_glock_put(gl);
+	}
+}
+
+/**
+ * clear_glock - look at a glock and see if we can free it from glock cache
+ * @gl: the glock to look at
+ *
+ */
+
+static void clear_glock(struct gfs2_glock *gl)
+{
+	gfs2_glock_remove_from_lru(gl);
+
+	spin_lock(&gl->gl_spin);
+	if (gl->gl_state != LM_ST_UNLOCKED)
+		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+	spin_unlock(&gl->gl_spin);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_glock_thaw - Thaw any frozen glocks
+ * @sdp: The super block
+ *
+ */
+
+void gfs2_glock_thaw(struct gfs2_sbd *sdp)
+{
+	glock_hash_walk(thaw_glock, sdp);
+}
+
+static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+{
+	spin_lock(&gl->gl_spin);
+	gfs2_dump_glock(seq, gl);
+	spin_unlock(&gl->gl_spin);
+}
+
+static void dump_glock_func(struct gfs2_glock *gl)
+{
+	dump_glock(NULL, gl);
+}
+
+/**
+ * gfs2_gl_hash_clear - Empty out the glock hash table
+ * @sdp: the filesystem
+ * @wait: wait until it's all gone
+ *
+ * Called when unmounting the filesystem.
+ */
+
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
+{
+	set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
+	flush_workqueue(glock_workqueue);
+	glock_hash_walk(clear_glock, sdp);
+	flush_workqueue(glock_workqueue);
+	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
+	glock_hash_walk(dump_glock_func, sdp);
+}
+
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
+{
+	struct gfs2_glock *gl = ip->i_gl;
+	int ret;
+
+	ret = gfs2_truncatei_resume(ip);
+	gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+
+	spin_lock(&gl->gl_spin);
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	run_queue(gl, 1);
+	spin_unlock(&gl->gl_spin);
+}
+
+static const char *state2str(unsigned state)
+{
+	switch(state) {
+	case LM_ST_UNLOCKED:
+		return "UN";
+	case LM_ST_SHARED:
+		return "SH";
+	case LM_ST_DEFERRED:
+		return "DF";
+	case LM_ST_EXCLUSIVE:
+		return "EX";
+	}
+	return "??";
+}
+
+static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+{
+	char *p = buf;
+	if (flags & LM_FLAG_TRY)
+		*p++ = 't';
+	if (flags & LM_FLAG_TRY_1CB)
+		*p++ = 'T';
+	if (flags & LM_FLAG_NOEXP)
+		*p++ = 'e';
+	if (flags & LM_FLAG_ANY)
+		*p++ = 'A';
+	if (flags & LM_FLAG_PRIORITY)
+		*p++ = 'p';
+	if (flags & GL_ASYNC)
+		*p++ = 'a';
+	if (flags & GL_EXACT)
+		*p++ = 'E';
+	if (flags & GL_NOCACHE)
+		*p++ = 'c';
+	if (test_bit(HIF_HOLDER, &iflags))
+		*p++ = 'H';
+	if (test_bit(HIF_WAIT, &iflags))
+		*p++ = 'W';
+	if (test_bit(HIF_FIRST, &iflags))
+		*p++ = 'F';
+	*p = 0;
+	return buf;
+}
+
+/**
+ * dump_holder - print information about a glock holder
+ * @seq: the seq_file struct
+ * @gh: the glock holder
+ *
+ */
+
+static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
+{
+	struct task_struct *gh_owner = NULL;
+	char flags_buf[32];
+
+	rcu_read_lock();
+	if (gh->gh_owner_pid)
+		gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
+	gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+		       state2str(gh->gh_state),
+		       hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+		       gh->gh_error,
+		       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+		       gh_owner ? gh_owner->comm : "(ended)",
+		       (void *)gh->gh_ip);
+	rcu_read_unlock();
+}
+
+static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
+{
+	const unsigned long *gflags = &gl->gl_flags;
+	char *p = buf;
+
+	if (test_bit(GLF_LOCK, gflags))
+		*p++ = 'l';
+	if (test_bit(GLF_DEMOTE, gflags))
+		*p++ = 'D';
+	if (test_bit(GLF_PENDING_DEMOTE, gflags))
+		*p++ = 'd';
+	if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
+		*p++ = 'p';
+	if (test_bit(GLF_DIRTY, gflags))
+		*p++ = 'y';
+	if (test_bit(GLF_LFLUSH, gflags))
+		*p++ = 'f';
+	if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
+		*p++ = 'i';
+	if (test_bit(GLF_REPLY_PENDING, gflags))
+		*p++ = 'r';
+	if (test_bit(GLF_INITIAL, gflags))
+		*p++ = 'I';
+	if (test_bit(GLF_FROZEN, gflags))
+		*p++ = 'F';
+	if (test_bit(GLF_QUEUED, gflags))
+		*p++ = 'q';
+	if (test_bit(GLF_LRU, gflags))
+		*p++ = 'L';
+	if (gl->gl_object)
+		*p++ = 'o';
+	if (test_bit(GLF_BLOCKING, gflags))
+		*p++ = 'b';
+	*p = 0;
+	return buf;
+}
+
+/**
+ * gfs2_dump_glock - print information about a glock
+ * @seq: The seq_file struct
+ * @gl: the glock
+ *
+ * The file format is as follows:
+ * One line per object, capital letters are used to indicate objects
+ * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
+ * other objects are indented by a single space and follow the glock to
+ * which they are related. Fields are indicated by lower case letters
+ * followed by a colon and the field value, except for strings which are in
+ * [] so that its possible to see if they are composed of spaces for
+ * example. The field's are n = number (id of the object), f = flags,
+ * t = type, s = state, r = refcount, e = error, p = pid.
+ *
+ */
+
+void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	unsigned long long dtime;
+	const struct gfs2_holder *gh;
+	char gflags_buf[32];
+
+	dtime = jiffies - gl->gl_demote_time;
+	dtime *= 1000000/HZ; /* demote time in uSec */
+	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
+		dtime = 0;
+	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n",
+		  state2str(gl->gl_state),
+		  gl->gl_name.ln_type,
+		  (unsigned long long)gl->gl_name.ln_number,
+		  gflags2str(gflags_buf, gl),
+		  state2str(gl->gl_target),
+		  state2str(gl->gl_demote_state), dtime,
+		  atomic_read(&gl->gl_ail_count),
+		  atomic_read(&gl->gl_revokes),
+		  (int)gl->gl_lockref.count, gl->gl_hold_time);
+
+	list_for_each_entry(gh, &gl->gl_holders, gh_list)
+		dump_holder(seq, gh);
+
+	if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
+		glops->go_dump(seq, gl);
+}
+
+static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+	struct gfs2_glock *gl = iter_ptr;
+
+	seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n",
+		   gl->gl_name.ln_type,
+		   (unsigned long long)gl->gl_name.ln_number,
+		   (long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
+		   (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
+		   (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
+		   (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
+		   (long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
+		   (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
+		   (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
+		   (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
+	return 0;
+}
+
+static const char *gfs2_gltype[] = {
+	"type",
+	"reserved",
+	"nondisk",
+	"inode",
+	"rgrp",
+	"meta",
+	"iopen",
+	"flock",
+	"plock",
+	"quota",
+	"journal",
+};
+
+static const char *gfs2_stype[] = {
+	[GFS2_LKS_SRTT]		= "srtt",
+	[GFS2_LKS_SRTTVAR]	= "srttvar",
+	[GFS2_LKS_SRTTB]	= "srttb",
+	[GFS2_LKS_SRTTVARB]	= "srttvarb",
+	[GFS2_LKS_SIRT]		= "sirt",
+	[GFS2_LKS_SIRTVAR]	= "sirtvar",
+	[GFS2_LKS_DCOUNT]	= "dlm",
+	[GFS2_LKS_QCOUNT]	= "queue",
+};
+
+#define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype))
+
+static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+	struct gfs2_glock_iter *gi = seq->private;
+	struct gfs2_sbd *sdp = gi->sdp;
+	unsigned index = gi->hash >> 3;
+	unsigned subindex = gi->hash & 0x07;
+	s64 value;
+	int i;
+
+	if (index == 0 && subindex != 0)
+		return 0;
+
+	seq_printf(seq, "%-10s %8s:", gfs2_gltype[index],
+		   (index == 0) ? "cpu": gfs2_stype[subindex]);
+
+	for_each_possible_cpu(i) {
+                const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i);
+		if (index == 0) {
+			value = i;
+		} else {
+			value = lkstats->lkstats[index - 1].stats[subindex];
+		}
+		seq_printf(seq, " %15lld", (long long)value);
+	}
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+int __init gfs2_glock_init(void)
+{
+	unsigned i;
+	for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
+		INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
+	}
+
+	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
+					  WQ_HIGHPRI | WQ_FREEZABLE, 0);
+	if (!glock_workqueue)
+		return -ENOMEM;
+	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
+						WQ_MEM_RECLAIM | WQ_FREEZABLE,
+						0);
+	if (!gfs2_delete_workqueue) {
+		destroy_workqueue(glock_workqueue);
+		return -ENOMEM;
+	}
+
+	register_shrinker(&glock_shrinker);
+
+	return 0;
+}
+
+void gfs2_glock_exit(void)
+{
+	unregister_shrinker(&glock_shrinker);
+	destroy_workqueue(glock_workqueue);
+	destroy_workqueue(gfs2_delete_workqueue);
+}
+
+static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
+{
+	return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
+			      struct gfs2_glock, gl_list);
+}
+
+static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
+{
+	return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
+			      struct gfs2_glock, gl_list);
+}
+
+static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
+{
+	struct gfs2_glock *gl;
+
+	do {
+		gl = gi->gl;
+		if (gl) {
+			gi->gl = glock_hash_next(gl);
+			gi->nhash++;
+		} else {
+			if (gi->hash >= GFS2_GL_HASH_SIZE) {
+				rcu_read_unlock();
+				return 1;
+			}
+			gi->gl = glock_hash_chain(gi->hash);
+			gi->nhash = 0;
+		}
+		while (gi->gl == NULL) {
+			gi->hash++;
+			if (gi->hash >= GFS2_GL_HASH_SIZE) {
+				rcu_read_unlock();
+				return 1;
+			}
+			gi->gl = glock_hash_chain(gi->hash);
+			gi->nhash = 0;
+		}
+	/* Skip entries for other sb and dead entries */
+	} while (gi->sdp != gi->gl->gl_sbd ||
+		 __lockref_is_dead(&gi->gl->gl_lockref));
+
+	return 0;
+}
+
+static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct gfs2_glock_iter *gi = seq->private;
+	loff_t n = *pos;
+
+	if (gi->last_pos <= *pos)
+		n = gi->nhash + (*pos - gi->last_pos);
+	else
+		gi->hash = 0;
+
+	gi->nhash = 0;
+	rcu_read_lock();
+
+	do {
+		if (gfs2_glock_iter_next(gi))
+			return NULL;
+	} while (n--);
+
+	gi->last_pos = *pos;
+	return gi->gl;
+}
+
+static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
+				 loff_t *pos)
+{
+	struct gfs2_glock_iter *gi = seq->private;
+
+	(*pos)++;
+	gi->last_pos = *pos;
+	if (gfs2_glock_iter_next(gi))
+		return NULL;
+
+	return gi->gl;
+}
+
+static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
+{
+	struct gfs2_glock_iter *gi = seq->private;
+
+	if (gi->gl)
+		rcu_read_unlock();
+	gi->gl = NULL;
+}
+
+static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+	dump_glock(seq, iter_ptr);
+	return 0;
+}
+
+static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct gfs2_glock_iter *gi = seq->private;
+
+	gi->hash = *pos;
+	if (*pos >= GFS2_NR_SBSTATS)
+		return NULL;
+	preempt_disable();
+	return SEQ_START_TOKEN;
+}
+
+static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct gfs2_glock_iter *gi = seq->private;
+	(*pos)++;
+	gi->hash++;
+	if (gi->hash >= GFS2_NR_SBSTATS) {
+		preempt_enable();
+		return NULL;
+	}
+	return SEQ_START_TOKEN;
+}
+
+static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr)
+{
+	preempt_enable();
+}
+
+static const struct seq_operations gfs2_glock_seq_ops = {
+	.start = gfs2_glock_seq_start,
+	.next  = gfs2_glock_seq_next,
+	.stop  = gfs2_glock_seq_stop,
+	.show  = gfs2_glock_seq_show,
+};
+
+static const struct seq_operations gfs2_glstats_seq_ops = {
+	.start = gfs2_glock_seq_start,
+	.next  = gfs2_glock_seq_next,
+	.stop  = gfs2_glock_seq_stop,
+	.show  = gfs2_glstats_seq_show,
+};
+
+static const struct seq_operations gfs2_sbstats_seq_ops = {
+	.start = gfs2_sbstats_seq_start,
+	.next  = gfs2_sbstats_seq_next,
+	.stop  = gfs2_sbstats_seq_stop,
+	.show  = gfs2_sbstats_seq_show,
+};
+
+#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL)
+
+static int gfs2_glocks_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open_private(file, &gfs2_glock_seq_ops,
+				   sizeof(struct gfs2_glock_iter));
+	if (ret == 0) {
+		struct seq_file *seq = file->private_data;
+		struct gfs2_glock_iter *gi = seq->private;
+		gi->sdp = inode->i_private;
+		seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
+		if (seq->buf)
+			seq->size = GFS2_SEQ_GOODSIZE;
+	}
+	return ret;
+}
+
+static int gfs2_glstats_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
+				   sizeof(struct gfs2_glock_iter));
+	if (ret == 0) {
+		struct seq_file *seq = file->private_data;
+		struct gfs2_glock_iter *gi = seq->private;
+		gi->sdp = inode->i_private;
+		seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
+		if (seq->buf)
+			seq->size = GFS2_SEQ_GOODSIZE;
+	}
+	return ret;
+}
+
+static int gfs2_sbstats_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open_private(file, &gfs2_sbstats_seq_ops,
+				   sizeof(struct gfs2_glock_iter));
+	if (ret == 0) {
+		struct seq_file *seq = file->private_data;
+		struct gfs2_glock_iter *gi = seq->private;
+		gi->sdp = inode->i_private;
+	}
+	return ret;
+}
+
+static const struct file_operations gfs2_glocks_fops = {
+	.owner   = THIS_MODULE,
+	.open    = gfs2_glocks_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+
+static const struct file_operations gfs2_glstats_fops = {
+	.owner   = THIS_MODULE,
+	.open    = gfs2_glstats_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+
+static const struct file_operations gfs2_sbstats_fops = {
+	.owner   = THIS_MODULE,
+	.open	 = gfs2_sbstats_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+
+int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
+{
+	struct dentry *dent;
+
+	dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
+	if (IS_ERR_OR_NULL(dent))
+		goto fail;
+	sdp->debugfs_dir = dent;
+
+	dent = debugfs_create_file("glocks",
+				   S_IFREG | S_IRUGO,
+				   sdp->debugfs_dir, sdp,
+				   &gfs2_glocks_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto fail;
+	sdp->debugfs_dentry_glocks = dent;
+
+	dent = debugfs_create_file("glstats",
+				   S_IFREG | S_IRUGO,
+				   sdp->debugfs_dir, sdp,
+				   &gfs2_glstats_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto fail;
+	sdp->debugfs_dentry_glstats = dent;
+
+	dent = debugfs_create_file("sbstats",
+				   S_IFREG | S_IRUGO,
+				   sdp->debugfs_dir, sdp,
+				   &gfs2_sbstats_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto fail;
+	sdp->debugfs_dentry_sbstats = dent;
+
+	return 0;
+fail:
+	gfs2_delete_debugfs_file(sdp);
+	return dent ? PTR_ERR(dent) : -ENOMEM;
+}
+
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
+{
+	if (sdp->debugfs_dir) {
+		if (sdp->debugfs_dentry_glocks) {
+			debugfs_remove(sdp->debugfs_dentry_glocks);
+			sdp->debugfs_dentry_glocks = NULL;
+		}
+		if (sdp->debugfs_dentry_glstats) {
+			debugfs_remove(sdp->debugfs_dentry_glstats);
+			sdp->debugfs_dentry_glstats = NULL;
+		}
+		if (sdp->debugfs_dentry_sbstats) {
+			debugfs_remove(sdp->debugfs_dentry_sbstats);
+			sdp->debugfs_dentry_sbstats = NULL;
+		}
+		debugfs_remove(sdp->debugfs_dir);
+		sdp->debugfs_dir = NULL;
+	}
+}
+
+int gfs2_register_debugfs(void)
+{
+	gfs2_root = debugfs_create_dir("gfs2", NULL);
+	if (IS_ERR(gfs2_root))
+		return PTR_ERR(gfs2_root);
+	return gfs2_root ? 0 : -ENOMEM;
+}
+
+void gfs2_unregister_debugfs(void)
+{
+	debugfs_remove(gfs2_root);
+	gfs2_root = NULL;
+}
diff --git a/kernel/fs/gfs2/glock.h b/kernel/fs/gfs2/glock.h
new file mode 100644
index 000000000..32572f71f
--- /dev/null
+++ b/kernel/fs/gfs2/glock.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GLOCK_DOT_H__
+#define __GLOCK_DOT_H__
+
+#include <linux/sched.h>
+#include <linux/parser.h>
+#include "incore.h"
+
+/* Options for hostdata parser */
+
+enum {
+	Opt_jid,
+	Opt_id,
+	Opt_first,
+	Opt_nodir,
+	Opt_err,
+};
+
+/*
+ * lm_lockname types
+ */
+
+#define LM_TYPE_RESERVED	0x00
+#define LM_TYPE_NONDISK		0x01
+#define LM_TYPE_INODE		0x02
+#define LM_TYPE_RGRP		0x03
+#define LM_TYPE_META		0x04
+#define LM_TYPE_IOPEN		0x05
+#define LM_TYPE_FLOCK		0x06
+#define LM_TYPE_PLOCK		0x07
+#define LM_TYPE_QUOTA		0x08
+#define LM_TYPE_JOURNAL		0x09
+
+/*
+ * lm_lock() states
+ *
+ * SHARED is compatible with SHARED, not with DEFERRED or EX.
+ * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
+ */
+
+#define LM_ST_UNLOCKED		0
+#define LM_ST_EXCLUSIVE		1
+#define LM_ST_DEFERRED		2
+#define LM_ST_SHARED		3
+
+/*
+ * lm_lock() flags
+ *
+ * LM_FLAG_TRY
+ * Don't wait to acquire the lock if it can't be granted immediately.
+ *
+ * LM_FLAG_TRY_1CB
+ * Send one blocking callback if TRY is set and the lock is not granted.
+ *
+ * LM_FLAG_NOEXP
+ * GFS sets this flag on lock requests it makes while doing journal recovery.
+ * These special requests should not be blocked due to the recovery like
+ * ordinary locks would be.
+ *
+ * LM_FLAG_ANY
+ * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
+ * also be granted in SHARED.  The preferred state is whichever is compatible
+ * with other granted locks, or the specified state if no other locks exist.
+ *
+ * LM_FLAG_PRIORITY
+ * Override fairness considerations.  Suppose a lock is held in a shared state
+ * and there is a pending request for the deferred state.  A shared lock
+ * request with the priority flag would be allowed to bypass the deferred
+ * request and directly join the other shared lock.  A shared lock request
+ * without the priority flag might be forced to wait until the deferred
+ * requested had acquired and released the lock.
+ */
+
+#define LM_FLAG_TRY		0x00000001
+#define LM_FLAG_TRY_1CB		0x00000002
+#define LM_FLAG_NOEXP		0x00000004
+#define LM_FLAG_ANY		0x00000008
+#define LM_FLAG_PRIORITY	0x00000010
+#define GL_ASYNC		0x00000040
+#define GL_EXACT		0x00000080
+#define GL_SKIP			0x00000100
+#define GL_NOCACHE		0x00000400
+  
+/*
+ * lm_async_cb return flags
+ *
+ * LM_OUT_ST_MASK
+ * Masks the lower two bits of lock state in the returned value.
+ *
+ * LM_OUT_CANCELED
+ * The lock request was canceled.
+ *
+ */
+
+#define LM_OUT_ST_MASK		0x00000003
+#define LM_OUT_CANCELED		0x00000008
+#define LM_OUT_ERROR		0x00000004
+
+/*
+ * lm_recovery_done() messages
+ */
+
+#define LM_RD_GAVEUP		308
+#define LM_RD_SUCCESS		309
+
+#define GLR_TRYFAILED		13
+
+#define GL_GLOCK_MAX_HOLD        (long)(HZ / 5)
+#define GL_GLOCK_DFT_HOLD        (long)(HZ / 5)
+#define GL_GLOCK_MIN_HOLD        (long)(10)
+#define GL_GLOCK_HOLD_INCR       (long)(HZ / 20)
+#define GL_GLOCK_HOLD_DECR       (long)(HZ / 40)
+
+struct lm_lockops {
+	const char *lm_proto_name;
+	int (*lm_mount) (struct gfs2_sbd *sdp, const char *table);
+	void (*lm_first_done) (struct gfs2_sbd *sdp);
+	void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid,
+				    unsigned int result);
+	void (*lm_unmount) (struct gfs2_sbd *sdp);
+	void (*lm_withdraw) (struct gfs2_sbd *sdp);
+	void (*lm_put_lock) (struct gfs2_glock *gl);
+	int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
+			unsigned int flags);
+	void (*lm_cancel) (struct gfs2_glock *gl);
+	const match_table_t *lm_tokens;
+};
+
+extern struct workqueue_struct *gfs2_delete_workqueue;
+static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh;
+	struct pid *pid;
+
+	/* Look in glock's list of holders for one with current task as owner */
+	spin_lock(&gl->gl_spin);
+	pid = task_pid(current);
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+			break;
+		if (gh->gh_owner_pid == pid)
+			goto out;
+	}
+	gh = NULL;
+out:
+	spin_unlock(&gl->gl_spin);
+
+	return gh;
+}
+
+static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
+{
+	return gl->gl_state == LM_ST_EXCLUSIVE;
+}
+
+static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
+{
+	return gl->gl_state == LM_ST_DEFERRED;
+}
+
+static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
+{
+	return gl->gl_state == LM_ST_SHARED;
+}
+
+static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
+{
+	if (gl->gl_ops->go_flags & GLOF_ASPACE)
+		return (struct address_space *)(gl + 1);
+	return NULL;
+}
+
+extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+			  const struct gfs2_glock_operations *glops,
+			  int create, struct gfs2_glock **glp);
+extern void gfs2_glock_put(struct gfs2_glock *gl);
+extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+			     unsigned flags, struct gfs2_holder *gh);
+extern void gfs2_holder_reinit(unsigned int state, unsigned flags,
+			       struct gfs2_holder *gh);
+extern void gfs2_holder_uninit(struct gfs2_holder *gh);
+extern int gfs2_glock_nq(struct gfs2_holder *gh);
+extern int gfs2_glock_poll(struct gfs2_holder *gh);
+extern int gfs2_glock_wait(struct gfs2_holder *gh);
+extern void gfs2_glock_dq(struct gfs2_holder *gh);
+extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
+extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+			     const struct gfs2_glock_operations *glops,
+			     unsigned int state, int flags,
+			     struct gfs2_holder *gh);
+extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
+extern __printf(2, 3)
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
+
+/**
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Returns: 0, GLR_*, or errno
+ */
+
+static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
+				     unsigned int state, int flags,
+				     struct gfs2_holder *gh)
+{
+	int error;
+
+	gfs2_holder_init(gl, state, flags, gh);
+
+	error = gfs2_glock_nq(gh);
+	if (error)
+		gfs2_holder_uninit(gh);
+
+	return error;
+}
+
+extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
+extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
+extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
+extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
+extern void gfs2_glock_free(struct gfs2_glock *gl);
+
+extern int __init gfs2_glock_init(void);
+extern void gfs2_glock_exit(void);
+
+extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+extern int gfs2_register_debugfs(void);
+extern void gfs2_unregister_debugfs(void);
+
+extern const struct lm_lockops gfs2_dlm_ops;
+
+#endif /* __GLOCK_DOT_H__ */
diff --git a/kernel/fs/gfs2/glops.c b/kernel/fs/gfs2/glops.c
new file mode 100644
index 000000000..fe91951c3
--- /dev/null
+++ b/kernel/fs/gfs2/glops.c
@@ -0,0 +1,616 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/bio.h>
+#include <linux/posix_acl.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "util.h"
+#include "trans.h"
+#include "dir.h"
+
+struct workqueue_struct *gfs2_freeze_wq;
+
+static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
+{
+	fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+	       bh, (unsigned long long)bh->b_blocknr, bh->b_state,
+	       bh->b_page->mapping, bh->b_page->flags);
+	fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+	       gl->gl_name.ln_type, gl->gl_name.ln_number,
+	       gfs2_glock2aspace(gl));
+	gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+}
+
+/**
+ * __gfs2_ail_flush - remove all buffers for a given lock from the AIL
+ * @gl: the glock
+ * @fsync: set when called from fsync (not all buffers will be clean)
+ *
+ * None of the buffers should be dirty, locked, or pinned.
+ */
+
+static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
+			     unsigned int nr_revokes)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct list_head *head = &gl->gl_ail_list;
+	struct gfs2_bufdata *bd, *tmp;
+	struct buffer_head *bh;
+	const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
+
+	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) {
+		if (nr_revokes == 0)
+			break;
+		bh = bd->bd_bh;
+		if (bh->b_state & b_state) {
+			if (fsync)
+				continue;
+			gfs2_ail_error(gl, bh);
+		}
+		gfs2_trans_add_revoke(sdp, bd);
+		nr_revokes--;
+	}
+	GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
+	spin_unlock(&sdp->sd_ail_lock);
+	gfs2_log_unlock(sdp);
+}
+
+
+static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_trans tr;
+
+	memset(&tr, 0, sizeof(tr));
+	INIT_LIST_HEAD(&tr.tr_buf);
+	INIT_LIST_HEAD(&tr.tr_databuf);
+	tr.tr_revokes = atomic_read(&gl->gl_ail_count);
+
+	if (!tr.tr_revokes)
+		return;
+
+	/* A shortened, inline version of gfs2_trans_begin()
+         * tr->alloced is not set since the transaction structure is
+         * on the stack */
+	tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
+	tr.tr_ip = _RET_IP_;
+	if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0)
+		return;
+	WARN_ON_ONCE(current->journal_info);
+	current->journal_info = &tr;
+
+	__gfs2_ail_flush(gl, 0, tr.tr_revokes);
+
+	gfs2_trans_end(sdp);
+	gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+}
+
+void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	unsigned int revokes = atomic_read(&gl->gl_ail_count);
+	unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+	int ret;
+
+	if (!revokes)
+		return;
+
+	while (revokes > max_revokes)
+		max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+
+	ret = gfs2_trans_begin(sdp, 0, max_revokes);
+	if (ret)
+		return;
+	__gfs2_ail_flush(gl, fsync, max_revokes);
+	gfs2_trans_end(sdp);
+	gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+}
+
+/**
+ * rgrp_go_sync - sync out the metadata for this glock
+ * @gl: the glock
+ *
+ * Called when demoting or unlocking an EX glock.  We must flush
+ * to disk all dirty buffers/pages relating to this glock, and must not
+ * not return to caller to demote/unlock the glock until I/O is complete.
+ */
+
+static void rgrp_go_sync(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct address_space *mapping = &sdp->sd_aspace;
+	struct gfs2_rgrpd *rgd;
+	int error;
+
+	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
+		return;
+	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
+
+	gfs2_log_flush(sdp, gl, NORMAL_FLUSH);
+	filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+	error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+	mapping_set_error(mapping, error);
+	gfs2_ail_empty_gl(gl);
+
+	spin_lock(&gl->gl_spin);
+	rgd = gl->gl_object;
+	if (rgd)
+		gfs2_free_clones(rgd);
+	spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * rgrp_go_inval - invalidate the metadata for this glock
+ * @gl: the glock
+ * @flags:
+ *
+ * We never used LM_ST_DEFERRED with resource groups, so that we
+ * should always see the metadata flag set here.
+ *
+ */
+
+static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct address_space *mapping = &sdp->sd_aspace;
+
+	WARN_ON_ONCE(!(flags & DIO_METADATA));
+	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+	truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+
+	if (gl->gl_object) {
+		struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+		rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+	}
+}
+
+/**
+ * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
+ * @gl: the glock protecting the inode
+ *
+ */
+
+static void inode_go_sync(struct gfs2_glock *gl)
+{
+	struct gfs2_inode *ip = gl->gl_object;
+	struct address_space *metamapping = gfs2_glock2aspace(gl);
+	int error;
+
+	if (ip && !S_ISREG(ip->i_inode.i_mode))
+		ip = NULL;
+	if (ip) {
+		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+			unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+		inode_dio_wait(&ip->i_inode);
+	}
+	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
+		return;
+
+	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
+
+	gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH);
+	filemap_fdatawrite(metamapping);
+	if (ip) {
+		struct address_space *mapping = ip->i_inode.i_mapping;
+		filemap_fdatawrite(mapping);
+		error = filemap_fdatawait(mapping);
+		mapping_set_error(mapping, error);
+	}
+	error = filemap_fdatawait(metamapping);
+	mapping_set_error(metamapping, error);
+	gfs2_ail_empty_gl(gl);
+	/*
+	 * Writeback of the data mapping may cause the dirty flag to be set
+	 * so we have to clear it again here.
+	 */
+	smp_mb__before_atomic();
+	clear_bit(GLF_DIRTY, &gl->gl_flags);
+}
+
+/**
+ * inode_go_inval - prepare a inode glock to be released
+ * @gl: the glock
+ * @flags:
+ *
+ * Normally we invalidate everything, but if we are moving into
+ * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we
+ * can keep hold of the metadata, since it won't have changed.
+ *
+ */
+
+static void inode_go_inval(struct gfs2_glock *gl, int flags)
+{
+	struct gfs2_inode *ip = gl->gl_object;
+
+	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+
+	if (flags & DIO_METADATA) {
+		struct address_space *mapping = gfs2_glock2aspace(gl);
+		truncate_inode_pages(mapping, 0);
+		if (ip) {
+			set_bit(GIF_INVALID, &ip->i_flags);
+			forget_all_cached_acls(&ip->i_inode);
+			gfs2_dir_hash_inval(ip);
+		}
+	}
+
+	if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) {
+		gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH);
+		gl->gl_sbd->sd_rindex_uptodate = 0;
+	}
+	if (ip && S_ISREG(ip->i_inode.i_mode))
+		truncate_inode_pages(ip->i_inode.i_mapping, 0);
+}
+
+/**
+ * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int inode_go_demote_ok(const struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_holder *gh;
+
+	if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
+		return 0;
+
+	if (!list_empty(&gl->gl_holders)) {
+		gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+		if (gh->gh_list.next != &gl->gl_holders)
+			return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * gfs2_set_nlink - Set the inode's link count based on on-disk info
+ * @inode: The inode in question
+ * @nlink: The link count
+ *
+ * If the link count has hit zero, it must never be raised, whatever the
+ * on-disk inode might say. When new struct inodes are created the link
+ * count is set to 1, so that we can safely use this test even when reading
+ * in on disk information for the first time.
+ */
+
+static void gfs2_set_nlink(struct inode *inode, u32 nlink)
+{
+	/*
+	 * We will need to review setting the nlink count here in the
+	 * light of the forthcoming ro bind mount work. This is a reminder
+	 * to do that.
+	 */
+	if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) {
+		if (nlink == 0)
+			clear_nlink(inode);
+		else
+			set_nlink(inode, nlink);
+	}
+}
+
+static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
+{
+	const struct gfs2_dinode *str = buf;
+	struct timespec atime;
+	u16 height, depth;
+
+	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
+		goto corrupt;
+	ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
+	ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
+	ip->i_inode.i_rdev = 0;
+	switch (ip->i_inode.i_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
+					   be32_to_cpu(str->di_minor));
+		break;
+	};
+
+	i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
+	i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
+	gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
+	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
+	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
+	atime.tv_sec = be64_to_cpu(str->di_atime);
+	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+	if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
+		ip->i_inode.i_atime = atime;
+	ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
+	ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+	ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
+	ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
+
+	ip->i_goal = be64_to_cpu(str->di_goal_meta);
+	ip->i_generation = be64_to_cpu(str->di_generation);
+
+	ip->i_diskflags = be32_to_cpu(str->di_flags);
+	ip->i_eattr = be64_to_cpu(str->di_eattr);
+	/* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */
+	gfs2_set_inode_flags(&ip->i_inode);
+	height = be16_to_cpu(str->di_height);
+	if (unlikely(height > GFS2_MAX_META_HEIGHT))
+		goto corrupt;
+	ip->i_height = (u8)height;
+
+	depth = be16_to_cpu(str->di_depth);
+	if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
+		goto corrupt;
+	ip->i_depth = (u8)depth;
+	ip->i_entries = be32_to_cpu(str->di_entries);
+
+	if (S_ISREG(ip->i_inode.i_mode))
+		gfs2_set_aops(&ip->i_inode);
+
+	return 0;
+corrupt:
+	gfs2_consist_inode(ip);
+	return -EIO;
+}
+
+/**
+ * gfs2_inode_refresh - Refresh the incore copy of the dinode
+ * @ip: The GFS2 inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_inode_refresh(struct gfs2_inode *ip)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	error = gfs2_dinode_in(ip, dibh->b_data);
+	brelse(dibh);
+	clear_bit(GIF_INVALID, &ip->i_flags);
+
+	return error;
+}
+
+/**
+ * inode_go_lock - operation done after an inode lock is locked by a process
+ * @gl: the glock
+ * @flags:
+ *
+ * Returns: errno
+ */
+
+static int inode_go_lock(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_inode *ip = gl->gl_object;
+	int error = 0;
+
+	if (!ip || (gh->gh_flags & GL_SKIP))
+		return 0;
+
+	if (test_bit(GIF_INVALID, &ip->i_flags)) {
+		error = gfs2_inode_refresh(ip);
+		if (error)
+			return error;
+	}
+
+	if (gh->gh_state != LM_ST_DEFERRED)
+		inode_dio_wait(&ip->i_inode);
+
+	if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
+	    (gl->gl_state == LM_ST_EXCLUSIVE) &&
+	    (gh->gh_state == LM_ST_EXCLUSIVE)) {
+		spin_lock(&sdp->sd_trunc_lock);
+		if (list_empty(&ip->i_trunc_list))
+			list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
+		spin_unlock(&sdp->sd_trunc_lock);
+		wake_up(&sdp->sd_quota_wait);
+		return 1;
+	}
+
+	return error;
+}
+
+/**
+ * inode_go_dump - print information about an inode
+ * @seq: The iterator
+ * @ip: the inode
+ *
+ */
+
+static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+	const struct gfs2_inode *ip = gl->gl_object;
+	if (ip == NULL)
+		return;
+	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
+		  (unsigned long long)ip->i_no_formal_ino,
+		  (unsigned long long)ip->i_no_addr,
+		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
+		  (unsigned int)ip->i_diskflags,
+		  (unsigned long long)i_size_read(&ip->i_inode));
+}
+
+/**
+ * freeze_go_sync - promote/demote the freeze glock
+ * @gl: the glock
+ * @state: the requested state
+ * @flags:
+ *
+ */
+
+static void freeze_go_sync(struct gfs2_glock *gl)
+{
+	int error = 0;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
+	if (gl->gl_state == LM_ST_SHARED &&
+	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+		atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
+		error = freeze_super(sdp->sd_vfs);
+		if (error) {
+			printk(KERN_INFO "GFS2: couldn't freeze filesystem: %d\n", error);
+			gfs2_assert_withdraw(sdp, 0);
+		}
+		queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
+		gfs2_log_flush(sdp, NULL, FREEZE_FLUSH);
+	}
+}
+
+/**
+ * freeze_go_xmote_bh - After promoting/demoting the freeze glock
+ * @gl: the glock
+ *
+ */
+
+static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+	struct gfs2_glock *j_gl = ip->i_gl;
+	struct gfs2_log_header_host head;
+	int error;
+
+	if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+		j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
+
+		error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+		if (error)
+			gfs2_consist(sdp);
+		if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
+			gfs2_consist(sdp);
+
+		/*  Initialize some head of the log stuff  */
+		if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+			sdp->sd_log_sequence = head.lh_sequence + 1;
+			gfs2_log_pointers_init(sdp, head.lh_blkno);
+		}
+	}
+	return 0;
+}
+
+/**
+ * trans_go_demote_ok
+ * @gl: the glock
+ *
+ * Always returns 0
+ */
+
+static int freeze_go_demote_ok(const struct gfs2_glock *gl)
+{
+	return 0;
+}
+
+/**
+ * iopen_go_callback - schedule the dcache entry for the inode to be deleted
+ * @gl: the glock
+ *
+ * gl_spin lock is held while calling this
+ */
+static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
+{
+	struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
+	if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
+		return;
+
+	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
+	    gl->gl_state == LM_ST_SHARED && ip) {
+		gl->gl_lockref.count++;
+		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+			gl->gl_lockref.count--;
+	}
+}
+
+const struct gfs2_glock_operations gfs2_meta_glops = {
+	.go_type = LM_TYPE_META,
+};
+
+const struct gfs2_glock_operations gfs2_inode_glops = {
+	.go_sync = inode_go_sync,
+	.go_inval = inode_go_inval,
+	.go_demote_ok = inode_go_demote_ok,
+	.go_lock = inode_go_lock,
+	.go_dump = inode_go_dump,
+	.go_type = LM_TYPE_INODE,
+	.go_flags = GLOF_ASPACE,
+};
+
+const struct gfs2_glock_operations gfs2_rgrp_glops = {
+	.go_sync = rgrp_go_sync,
+	.go_inval = rgrp_go_inval,
+	.go_lock = gfs2_rgrp_go_lock,
+	.go_unlock = gfs2_rgrp_go_unlock,
+	.go_dump = gfs2_rgrp_dump,
+	.go_type = LM_TYPE_RGRP,
+	.go_flags = GLOF_LVB,
+};
+
+const struct gfs2_glock_operations gfs2_freeze_glops = {
+	.go_sync = freeze_go_sync,
+	.go_xmote_bh = freeze_go_xmote_bh,
+	.go_demote_ok = freeze_go_demote_ok,
+	.go_type = LM_TYPE_NONDISK,
+};
+
+const struct gfs2_glock_operations gfs2_iopen_glops = {
+	.go_type = LM_TYPE_IOPEN,
+	.go_callback = iopen_go_callback,
+};
+
+const struct gfs2_glock_operations gfs2_flock_glops = {
+	.go_type = LM_TYPE_FLOCK,
+};
+
+const struct gfs2_glock_operations gfs2_nondisk_glops = {
+	.go_type = LM_TYPE_NONDISK,
+};
+
+const struct gfs2_glock_operations gfs2_quota_glops = {
+	.go_type = LM_TYPE_QUOTA,
+	.go_flags = GLOF_LVB,
+};
+
+const struct gfs2_glock_operations gfs2_journal_glops = {
+	.go_type = LM_TYPE_JOURNAL,
+};
+
+const struct gfs2_glock_operations *gfs2_glops_list[] = {
+	[LM_TYPE_META] = &gfs2_meta_glops,
+	[LM_TYPE_INODE] = &gfs2_inode_glops,
+	[LM_TYPE_RGRP] = &gfs2_rgrp_glops,
+	[LM_TYPE_IOPEN] = &gfs2_iopen_glops,
+	[LM_TYPE_FLOCK] = &gfs2_flock_glops,
+	[LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
+	[LM_TYPE_QUOTA] = &gfs2_quota_glops,
+	[LM_TYPE_JOURNAL] = &gfs2_journal_glops,
+};
+
diff --git a/kernel/fs/gfs2/glops.h b/kernel/fs/gfs2/glops.h
new file mode 100644
index 000000000..8ed1857c1
--- /dev/null
+++ b/kernel/fs/gfs2/glops.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GLOPS_DOT_H__
+#define __GLOPS_DOT_H__
+
+#include "incore.h"
+
+extern struct workqueue_struct *gfs2_freeze_wq;
+
+extern const struct gfs2_glock_operations gfs2_meta_glops;
+extern const struct gfs2_glock_operations gfs2_inode_glops;
+extern const struct gfs2_glock_operations gfs2_rgrp_glops;
+extern const struct gfs2_glock_operations gfs2_freeze_glops;
+extern const struct gfs2_glock_operations gfs2_iopen_glops;
+extern const struct gfs2_glock_operations gfs2_flock_glops;
+extern const struct gfs2_glock_operations gfs2_nondisk_glops;
+extern const struct gfs2_glock_operations gfs2_quota_glops;
+extern const struct gfs2_glock_operations gfs2_journal_glops;
+extern const struct gfs2_glock_operations *gfs2_glops_list[];
+
+extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
+
+#endif /* __GLOPS_DOT_H__ */
diff --git a/kernel/fs/gfs2/incore.h b/kernel/fs/gfs2/incore.h
new file mode 100644
index 000000000..58b75abf6
--- /dev/null
+++ b/kernel/fs/gfs2/incore.h
@@ -0,0 +1,843 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __INCORE_DOT_H__
+#define __INCORE_DOT_H__
+
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/workqueue.h>
+#include <linux/dlm.h>
+#include <linux/buffer_head.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+#include <linux/ktime.h>
+#include <linux/percpu.h>
+#include <linux/lockref.h>
+
+#define DIO_WAIT	0x00000010
+#define DIO_METADATA	0x00000020
+
+struct gfs2_log_operations;
+struct gfs2_bufdata;
+struct gfs2_holder;
+struct gfs2_glock;
+struct gfs2_quota_data;
+struct gfs2_trans;
+struct gfs2_jdesc;
+struct gfs2_sbd;
+struct lm_lockops;
+
+typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
+
+struct gfs2_log_header_host {
+	u64 lh_sequence;	/* Sequence number of this transaction */
+	u32 lh_flags;		/* GFS2_LOG_HEAD_... */
+	u32 lh_tail;		/* Block number of log tail */
+	u32 lh_blkno;
+	u32 lh_hash;
+};
+
+/*
+ * Structure of operations that are associated with each
+ * type of element in the log.
+ */
+
+struct gfs2_log_operations {
+	void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
+	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
+	void (*lo_before_scan) (struct gfs2_jdesc *jd,
+				struct gfs2_log_header_host *head, int pass);
+	int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
+				 struct gfs2_log_descriptor *ld, __be64 *ptr,
+				 int pass);
+	void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
+	const char *lo_name;
+};
+
+#define GBF_FULL 1
+
+struct gfs2_bitmap {
+	struct buffer_head *bi_bh;
+	char *bi_clone;
+	unsigned long bi_flags;
+	u32 bi_offset;
+	u32 bi_start;
+	u32 bi_len;
+	u32 bi_blocks;
+};
+
+struct gfs2_rgrpd {
+	struct rb_node rd_node;		/* Link with superblock */
+	struct gfs2_glock *rd_gl;	/* Glock for this rgrp */
+	u64 rd_addr;			/* grp block disk address */
+	u64 rd_data0;			/* first data location */
+	u32 rd_length;			/* length of rgrp header in fs blocks */
+	u32 rd_data;			/* num of data blocks in rgrp */
+	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
+	u32 rd_free;
+	u32 rd_reserved;                /* number of blocks reserved */
+	u32 rd_free_clone;
+	u32 rd_dinodes;
+	u64 rd_igeneration;
+	struct gfs2_bitmap *rd_bits;
+	struct gfs2_sbd *rd_sbd;
+	struct gfs2_rgrp_lvb *rd_rgl;
+	u32 rd_last_alloc;
+	u32 rd_flags;
+	u32 rd_extfail_pt;		/* extent failure point */
+#define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
+#define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
+#define GFS2_RDF_ERROR		0x40000000 /* error in rg */
+#define GFS2_RDF_PREFERRED	0x80000000 /* This rgrp is preferred */
+#define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
+	spinlock_t rd_rsspin;           /* protects reservation related vars */
+	struct rb_root rd_rstree;       /* multi-block reservation tree */
+};
+
+struct gfs2_rbm {
+	struct gfs2_rgrpd *rgd;
+	u32 offset;		/* The offset is bitmap relative */
+	int bii;		/* Bitmap index */
+};
+
+static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
+{
+	return rbm->rgd->rd_bits + rbm->bii;
+}
+
+static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
+{
+	return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
+		rbm->offset;
+}
+
+static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1,
+			       const struct gfs2_rbm *rbm2)
+{
+	return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) &&
+	       (rbm1->offset == rbm2->offset);
+}
+
+enum gfs2_state_bits {
+	BH_Pinned = BH_PrivateStart,
+	BH_Escaped = BH_PrivateStart + 1,
+	BH_Zeronew = BH_PrivateStart + 2,
+};
+
+BUFFER_FNS(Pinned, pinned)
+TAS_BUFFER_FNS(Pinned, pinned)
+BUFFER_FNS(Escaped, escaped)
+TAS_BUFFER_FNS(Escaped, escaped)
+BUFFER_FNS(Zeronew, zeronew)
+TAS_BUFFER_FNS(Zeronew, zeronew)
+
+struct gfs2_bufdata {
+	struct buffer_head *bd_bh;
+	struct gfs2_glock *bd_gl;
+	u64 bd_blkno;
+
+	struct list_head bd_list;
+	const struct gfs2_log_operations *bd_ops;
+
+	struct gfs2_trans *bd_tr;
+	struct list_head bd_ail_st_list;
+	struct list_head bd_ail_gl_list;
+};
+
+/*
+ * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
+ * prefix of lock_dlm_ gets awkward.
+ */
+
+#define GDLM_STRNAME_BYTES	25
+#define GDLM_LVB_SIZE		32
+
+/*
+ * ls_recover_flags:
+ *
+ * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been
+ * held by failed nodes whose journals need recovery.  Those locks should
+ * only be used for journal recovery until the journal recovery is done.
+ * This is set by the dlm recover_prep callback and cleared by the
+ * gfs2_control thread when journal recovery is complete.  To avoid
+ * races between recover_prep setting and gfs2_control clearing, recover_spin
+ * is held while changing this bit and reading/writing recover_block
+ * and recover_start.
+ *
+ * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used.
+ *
+ * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing
+ * recovery of all journals before allowing other nodes to mount the fs.
+ * This is cleared when FIRST_MOUNT_DONE is set.
+ *
+ * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished
+ * recovery of all journals, and now allows other nodes to mount the fs.
+ *
+ * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared
+ * BLOCK_LOCKS for the first time.  The gfs2_control thread should now
+ * control clearing BLOCK_LOCKS for further recoveries.
+ *
+ * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq.
+ *
+ * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep()
+ * and recover_done(), i.e. set while recover_block == recover_start.
+ */
+
+enum {
+	DFL_BLOCK_LOCKS		= 0,
+	DFL_NO_DLM_OPS		= 1,
+	DFL_FIRST_MOUNT		= 2,
+	DFL_FIRST_MOUNT_DONE	= 3,
+	DFL_MOUNT_DONE		= 4,
+	DFL_UNMOUNT		= 5,
+	DFL_DLM_RECOVERY	= 6,
+};
+
+struct lm_lockname {
+	u64 ln_number;
+	unsigned int ln_type;
+};
+
+#define lm_name_equal(name1, name2) \
+        (((name1)->ln_number == (name2)->ln_number) && \
+         ((name1)->ln_type == (name2)->ln_type))
+
+
+struct gfs2_glock_operations {
+	void (*go_sync) (struct gfs2_glock *gl);
+	int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
+	void (*go_inval) (struct gfs2_glock *gl, int flags);
+	int (*go_demote_ok) (const struct gfs2_glock *gl);
+	int (*go_lock) (struct gfs2_holder *gh);
+	void (*go_unlock) (struct gfs2_holder *gh);
+	void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+	void (*go_callback)(struct gfs2_glock *gl, bool remote);
+	const int go_type;
+	const unsigned long go_flags;
+#define GLOF_ASPACE 1
+#define GLOF_LVB    2
+};
+
+enum {
+	GFS2_LKS_SRTT = 0,	/* Non blocking smoothed round trip time */
+	GFS2_LKS_SRTTVAR = 1,	/* Non blocking smoothed variance */
+	GFS2_LKS_SRTTB = 2,	/* Blocking smoothed round trip time */
+	GFS2_LKS_SRTTVARB = 3,	/* Blocking smoothed variance */
+	GFS2_LKS_SIRT = 4,	/* Smoothed Inter-request time */
+	GFS2_LKS_SIRTVAR = 5,	/* Smoothed Inter-request variance */
+	GFS2_LKS_DCOUNT = 6,	/* Count of dlm requests */
+	GFS2_LKS_QCOUNT = 7,	/* Count of gfs2_holder queues */
+	GFS2_NR_LKSTATS
+};
+
+struct gfs2_lkstats {
+	s64 stats[GFS2_NR_LKSTATS];
+};
+
+enum {
+	/* States */
+	HIF_HOLDER		= 6,  /* Set for gh that "holds" the glock */
+	HIF_FIRST		= 7,
+	HIF_WAIT		= 10,
+};
+
+struct gfs2_holder {
+	struct list_head gh_list;
+
+	struct gfs2_glock *gh_gl;
+	struct pid *gh_owner_pid;
+	unsigned int gh_state;
+	unsigned gh_flags;
+
+	int gh_error;
+	unsigned long gh_iflags; /* HIF_... */
+	unsigned long gh_ip;
+};
+
+/* Number of quota types we support */
+#define GFS2_MAXQUOTAS 2
+
+/* Resource group multi-block reservation, in order of appearance:
+
+   Step 1. Function prepares to write, allocates a mb, sets the size hint.
+   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
+   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
+   Step 4. Bits are assigned from the rgrp based on either the reservation
+           or wherever it can.
+*/
+
+struct gfs2_blkreserv {
+	/* components used during write (step 1): */
+	atomic_t rs_sizehint;         /* hint of the write size */
+
+	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
+	struct rb_node rs_node;       /* link to other block reservations */
+	struct gfs2_rbm rs_rbm;       /* Start of reservation */
+	u32 rs_free;                  /* how many blocks are still free */
+	u64 rs_inum;                  /* Inode number for reservation */
+
+	/* ancillary quota stuff */
+	struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
+	struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
+	unsigned int rs_qa_qd_num;
+};
+
+/*
+ * Allocation parameters
+ * @target: The number of blocks we'd ideally like to allocate
+ * @aflags: The flags (e.g. Orlov flag)
+ *
+ * The intent is to gradually expand this structure over time in
+ * order to give more information, e.g. alignment, min extent size
+ * to the allocation code.
+ */
+struct gfs2_alloc_parms {
+	u64 target;
+	u32 min_target;
+	u32 aflags;
+	u64 allowed;
+};
+
+enum {
+	GLF_LOCK			= 1,
+	GLF_DEMOTE			= 3,
+	GLF_PENDING_DEMOTE		= 4,
+	GLF_DEMOTE_IN_PROGRESS		= 5,
+	GLF_DIRTY			= 6,
+	GLF_LFLUSH			= 7,
+	GLF_INVALIDATE_IN_PROGRESS	= 8,
+	GLF_REPLY_PENDING		= 9,
+	GLF_INITIAL			= 10,
+	GLF_FROZEN			= 11,
+	GLF_QUEUED			= 12,
+	GLF_LRU				= 13,
+	GLF_OBJECT			= 14, /* Used only for tracing */
+	GLF_BLOCKING			= 15,
+};
+
+struct gfs2_glock {
+	struct hlist_bl_node gl_list;
+	struct gfs2_sbd *gl_sbd;
+	unsigned long gl_flags;		/* GLF_... */
+	struct lm_lockname gl_name;
+
+	struct lockref gl_lockref;
+#define gl_spin gl_lockref.lock
+
+	/* State fields protected by gl_spin */
+	unsigned int gl_state:2,	/* Current state */
+		     gl_target:2,	/* Target state */
+		     gl_demote_state:2,	/* State requested by remote node */
+		     gl_req:2,		/* State in last dlm request */
+		     gl_reply:8;	/* Last reply from the dlm */
+
+	unsigned int gl_hash;
+	unsigned long gl_demote_time; /* time of first demote request */
+	long gl_hold_time;
+	struct list_head gl_holders;
+
+	const struct gfs2_glock_operations *gl_ops;
+	ktime_t gl_dstamp;
+	struct gfs2_lkstats gl_stats;
+	struct dlm_lksb gl_lksb;
+	unsigned long gl_tchange;
+	void *gl_object;
+
+	struct list_head gl_lru;
+	struct list_head gl_ail_list;
+	atomic_t gl_ail_count;
+	atomic_t gl_revokes;
+	struct delayed_work gl_work;
+	union {
+		/* For inode and iopen glocks only */
+		struct work_struct gl_delete;
+		/* For rgrp glocks only */
+		struct {
+			loff_t start;
+			loff_t end;
+		} gl_vm;
+	};
+	struct rcu_head gl_rcu;
+};
+
+#define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
+
+enum {
+	GIF_INVALID		= 0,
+	GIF_QD_LOCKED		= 1,
+	GIF_ALLOC_FAILED	= 2,
+	GIF_SW_PAGED		= 3,
+	GIF_ORDERED		= 4,
+	GIF_FREE_VFS_INODE      = 5,
+};
+
+struct gfs2_inode {
+	struct inode i_inode;
+	u64 i_no_addr;
+	u64 i_no_formal_ino;
+	u64 i_generation;
+	u64 i_eattr;
+	unsigned long i_flags;		/* GIF_... */
+	struct gfs2_glock *i_gl; /* Move into i_gh? */
+	struct gfs2_holder i_iopen_gh;
+	struct gfs2_holder i_gh; /* for prepare/commit_write only */
+	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
+	struct gfs2_rgrpd *i_rgd;
+	u64 i_goal;	/* goal block for allocations */
+	struct rw_semaphore i_rw_mutex;
+	struct list_head i_ordered;
+	struct list_head i_trunc_list;
+	__be64 *i_hash_cache;
+	u32 i_entries;
+	u32 i_diskflags;
+	u8 i_height;
+	u8 i_depth;
+};
+
+/*
+ * Since i_inode is the first element of struct gfs2_inode,
+ * this is effectively a cast.
+ */
+static inline struct gfs2_inode *GFS2_I(struct inode *inode)
+{
+	return container_of(inode, struct gfs2_inode, i_inode);
+}
+
+static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode)
+{
+	return inode->i_sb->s_fs_info;
+}
+
+struct gfs2_file {
+	struct mutex f_fl_mutex;
+	struct gfs2_holder f_fl_gh;
+};
+
+struct gfs2_revoke_replay {
+	struct list_head rr_list;
+	u64 rr_blkno;
+	unsigned int rr_where;
+};
+
+enum {
+	QDF_CHANGE		= 1,
+	QDF_LOCKED		= 2,
+	QDF_REFRESH		= 3,
+};
+
+struct gfs2_quota_data {
+	struct hlist_bl_node qd_hlist;
+	struct list_head qd_list;
+	struct kqid qd_id;
+	struct gfs2_sbd *qd_sbd;
+	struct lockref qd_lockref;
+	struct list_head qd_lru;
+	unsigned qd_hash;
+
+	unsigned long qd_flags;		/* QDF_... */
+
+	s64 qd_change;
+	s64 qd_change_sync;
+
+	unsigned int qd_slot;
+	unsigned int qd_slot_count;
+
+	struct buffer_head *qd_bh;
+	struct gfs2_quota_change *qd_bh_qc;
+	unsigned int qd_bh_count;
+
+	struct gfs2_glock *qd_gl;
+	struct gfs2_quota_lvb qd_qb;
+
+	u64 qd_sync_gen;
+	unsigned long qd_last_warn;
+	struct rcu_head qd_rcu;
+};
+
+struct gfs2_trans {
+	unsigned long tr_ip;
+
+	unsigned int tr_blocks;
+	unsigned int tr_revokes;
+	unsigned int tr_reserved;
+	unsigned int tr_touched:1;
+	unsigned int tr_attached:1;
+	unsigned int tr_alloced:1;
+
+	unsigned int tr_num_buf_new;
+	unsigned int tr_num_databuf_new;
+	unsigned int tr_num_buf_rm;
+	unsigned int tr_num_databuf_rm;
+	unsigned int tr_num_revoke;
+	unsigned int tr_num_revoke_rm;
+
+	struct list_head tr_list;
+	struct list_head tr_databuf;
+	struct list_head tr_buf;
+
+	unsigned int tr_first;
+	struct list_head tr_ail1_list;
+	struct list_head tr_ail2_list;
+};
+
+struct gfs2_journal_extent {
+	struct list_head list;
+
+	unsigned int lblock; /* First logical block */
+	u64 dblock; /* First disk block */
+	u64 blocks;
+};
+
+struct gfs2_jdesc {
+	struct list_head jd_list;
+	struct list_head extent_list;
+	unsigned int nr_extents;
+	struct work_struct jd_work;
+	struct inode *jd_inode;
+	unsigned long jd_flags;
+#define JDF_RECOVERY 1
+	unsigned int jd_jid;
+	unsigned int jd_blocks;
+	int jd_recover_error;
+	/* Replay stuff */
+
+	unsigned int jd_found_blocks;
+	unsigned int jd_found_revokes;
+	unsigned int jd_replayed_blocks;
+
+	struct list_head jd_revoke_list;
+	unsigned int jd_replay_tail;
+
+};
+
+struct gfs2_statfs_change_host {
+	s64 sc_total;
+	s64 sc_free;
+	s64 sc_dinodes;
+};
+
+#define GFS2_QUOTA_DEFAULT	GFS2_QUOTA_OFF
+#define GFS2_QUOTA_OFF		0
+#define GFS2_QUOTA_ACCOUNT	1
+#define GFS2_QUOTA_ON		2
+
+#define GFS2_DATA_DEFAULT	GFS2_DATA_ORDERED
+#define GFS2_DATA_WRITEBACK	1
+#define GFS2_DATA_ORDERED	2
+
+#define GFS2_ERRORS_DEFAULT     GFS2_ERRORS_WITHDRAW
+#define GFS2_ERRORS_WITHDRAW    0
+#define GFS2_ERRORS_CONTINUE    1 /* place holder for future feature */
+#define GFS2_ERRORS_RO          2 /* place holder for future feature */
+#define GFS2_ERRORS_PANIC       3
+
+struct gfs2_args {
+	char ar_lockproto[GFS2_LOCKNAME_LEN];	/* Name of the Lock Protocol */
+	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
+	char ar_hostdata[GFS2_LOCKNAME_LEN];	/* Host specific data */
+	unsigned int ar_spectator:1;		/* Don't get a journal */
+	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
+	unsigned int ar_debug:1;		/* Oops on errors */
+	unsigned int ar_posix_acl:1;		/* Enable posix acls */
+	unsigned int ar_quota:2;		/* off/account/on */
+	unsigned int ar_suiddir:1;		/* suiddir support */
+	unsigned int ar_data:2;			/* ordered/writeback */
+	unsigned int ar_meta:1;			/* mount metafs */
+	unsigned int ar_discard:1;		/* discard requests */
+	unsigned int ar_errors:2;               /* errors=withdraw | panic */
+	unsigned int ar_nobarrier:1;            /* do not send barriers */
+	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
+	int ar_commit;				/* Commit interval */
+	int ar_statfs_quantum;			/* The fast statfs interval */
+	int ar_quota_quantum;			/* The quota interval */
+	int ar_statfs_percent;			/* The % change to force sync */
+};
+
+struct gfs2_tune {
+	spinlock_t gt_spin;
+
+	unsigned int gt_logd_secs;
+
+	unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
+	unsigned int gt_quota_scale_num; /* Numerator */
+	unsigned int gt_quota_scale_den; /* Denominator */
+	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
+	unsigned int gt_new_files_jdata;
+	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
+	unsigned int gt_complain_secs;
+	unsigned int gt_statfs_quantum;
+	unsigned int gt_statfs_slow;
+};
+
+enum {
+	SDF_JOURNAL_CHECKED	= 0,
+	SDF_JOURNAL_LIVE	= 1,
+	SDF_SHUTDOWN		= 2,
+	SDF_NOBARRIERS		= 3,
+	SDF_NORECOVERY		= 4,
+	SDF_DEMOTE		= 5,
+	SDF_NOJOURNALID		= 6,
+	SDF_RORECOVERY		= 7, /* read only recovery */
+	SDF_SKIP_DLM_UNLOCK	= 8,
+};
+
+enum gfs2_freeze_state {
+	SFS_UNFROZEN		= 0,
+	SFS_STARTING_FREEZE	= 1,
+	SFS_FROZEN		= 2,
+};
+
+#define GFS2_FSNAME_LEN		256
+
+struct gfs2_inum_host {
+	u64 no_formal_ino;
+	u64 no_addr;
+};
+
+struct gfs2_sb_host {
+	u32 sb_magic;
+	u32 sb_type;
+	u32 sb_format;
+
+	u32 sb_fs_format;
+	u32 sb_multihost_format;
+	u32 sb_bsize;
+	u32 sb_bsize_shift;
+
+	struct gfs2_inum_host sb_master_dir;
+	struct gfs2_inum_host sb_root_dir;
+
+	char sb_lockproto[GFS2_LOCKNAME_LEN];
+	char sb_locktable[GFS2_LOCKNAME_LEN];
+};
+
+/*
+ * lm_mount() return values
+ *
+ * ls_jid - the journal ID this node should use
+ * ls_first - this node is the first to mount the file system
+ * ls_lockspace - lock module's context for this file system
+ * ls_ops - lock module's functions
+ */
+
+struct lm_lockstruct {
+	int ls_jid;
+	unsigned int ls_first;
+	const struct lm_lockops *ls_ops;
+	dlm_lockspace_t *ls_dlm;
+
+	int ls_recover_jid_done;   /* These two are deprecated, */
+	int ls_recover_jid_status; /* used previously by gfs_controld */
+
+	struct dlm_lksb ls_mounted_lksb; /* mounted_lock */
+	struct dlm_lksb ls_control_lksb; /* control_lock */
+	char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
+	struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
+	char *ls_lvb_bits;
+
+	spinlock_t ls_recover_spin; /* protects following fields */
+	unsigned long ls_recover_flags; /* DFL_ */
+	uint32_t ls_recover_mount; /* gen in first recover_done cb */
+	uint32_t ls_recover_start; /* gen in last recover_done cb */
+	uint32_t ls_recover_block; /* copy recover_start in last recover_prep */
+	uint32_t ls_recover_size; /* size of recover_submit, recover_result */
+	uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */
+	uint32_t *ls_recover_result; /* result of last jid recovery */
+};
+
+struct gfs2_pcpu_lkstats {
+	/* One struct for each glock type */
+	struct gfs2_lkstats lkstats[10];
+};
+
+struct gfs2_sbd {
+	struct super_block *sd_vfs;
+	struct gfs2_pcpu_lkstats __percpu *sd_lkstats;
+	struct kobject sd_kobj;
+	unsigned long sd_flags;	/* SDF_... */
+	struct gfs2_sb_host sd_sb;
+
+	/* Constants computed on mount */
+
+	u32 sd_fsb2bb;
+	u32 sd_fsb2bb_shift;
+	u32 sd_diptrs;	/* Number of pointers in a dinode */
+	u32 sd_inptrs;	/* Number of pointers in a indirect block */
+	u32 sd_jbsize;	/* Size of a journaled data block */
+	u32 sd_hash_bsize;	/* sizeof(exhash block) */
+	u32 sd_hash_bsize_shift;
+	u32 sd_hash_ptrs;	/* Number of pointers in a hash block */
+	u32 sd_qc_per_block;
+	u32 sd_blocks_per_bitmap;
+	u32 sd_max_dirres;	/* Max blocks needed to add a directory entry */
+	u32 sd_max_height;	/* Max height of a file's metadata tree */
+	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
+	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
+	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
+
+	struct gfs2_args sd_args;	/* Mount arguments */
+	struct gfs2_tune sd_tune;	/* Filesystem tuning structure */
+
+	/* Lock Stuff */
+
+	struct lm_lockstruct sd_lockstruct;
+	struct gfs2_holder sd_live_gh;
+	struct gfs2_glock *sd_rename_gl;
+	struct gfs2_glock *sd_freeze_gl;
+	struct work_struct sd_freeze_work;
+	wait_queue_head_t sd_glock_wait;
+	atomic_t sd_glock_disposal;
+	struct completion sd_locking_init;
+	struct completion sd_wdack;
+	struct delayed_work sd_control_work;
+
+	/* Inode Stuff */
+
+	struct dentry *sd_master_dir;
+	struct dentry *sd_root_dir;
+
+	struct inode *sd_jindex;
+	struct inode *sd_statfs_inode;
+	struct inode *sd_sc_inode;
+	struct inode *sd_qc_inode;
+	struct inode *sd_rindex;
+	struct inode *sd_quota_inode;
+
+	/* StatFS stuff */
+
+	spinlock_t sd_statfs_spin;
+	struct gfs2_statfs_change_host sd_statfs_master;
+	struct gfs2_statfs_change_host sd_statfs_local;
+	int sd_statfs_force_sync;
+
+	/* Resource group stuff */
+
+	int sd_rindex_uptodate;
+	spinlock_t sd_rindex_spin;
+	struct rb_root sd_rindex_tree;
+	unsigned int sd_rgrps;
+	unsigned int sd_max_rg_data;
+
+	/* Journal index stuff */
+
+	struct list_head sd_jindex_list;
+	spinlock_t sd_jindex_spin;
+	struct mutex sd_jindex_mutex;
+	unsigned int sd_journals;
+
+	struct gfs2_jdesc *sd_jdesc;
+	struct gfs2_holder sd_journal_gh;
+	struct gfs2_holder sd_jinode_gh;
+
+	struct gfs2_holder sd_sc_gh;
+	struct gfs2_holder sd_qc_gh;
+
+	struct completion sd_journal_ready;
+
+	/* Daemon stuff */
+
+	struct task_struct *sd_logd_process;
+	struct task_struct *sd_quotad_process;
+
+	/* Quota stuff */
+
+	struct list_head sd_quota_list;
+	atomic_t sd_quota_count;
+	struct mutex sd_quota_mutex;
+	struct mutex sd_quota_sync_mutex;
+	wait_queue_head_t sd_quota_wait;
+	struct list_head sd_trunc_list;
+	spinlock_t sd_trunc_lock;
+
+	unsigned int sd_quota_slots;
+	unsigned long *sd_quota_bitmap;
+	spinlock_t sd_bitmap_lock;
+
+	u64 sd_quota_sync_gen;
+
+	/* Log stuff */
+
+	struct address_space sd_aspace;
+
+	spinlock_t sd_log_lock;
+
+	struct gfs2_trans *sd_log_tr;
+	unsigned int sd_log_blks_reserved;
+	int sd_log_commited_revoke;
+
+	atomic_t sd_log_pinned;
+	unsigned int sd_log_num_revoke;
+
+	struct list_head sd_log_le_revoke;
+	struct list_head sd_log_le_ordered;
+	spinlock_t sd_ordered_lock;
+
+	atomic_t sd_log_thresh1;
+	atomic_t sd_log_thresh2;
+	atomic_t sd_log_blks_free;
+	wait_queue_head_t sd_log_waitq;
+	wait_queue_head_t sd_logd_waitq;
+
+	u64 sd_log_sequence;
+	unsigned int sd_log_head;
+	unsigned int sd_log_tail;
+	int sd_log_idle;
+
+	struct rw_semaphore sd_log_flush_lock;
+	atomic_t sd_log_in_flight;
+	struct bio *sd_log_bio;
+	wait_queue_head_t sd_log_flush_wait;
+	int sd_log_error;
+
+	atomic_t sd_reserving_log;
+	wait_queue_head_t sd_reserving_log_wait;
+
+	unsigned int sd_log_flush_head;
+	u64 sd_log_flush_wrapped;
+
+	spinlock_t sd_ail_lock;
+	struct list_head sd_ail1_list;
+	struct list_head sd_ail2_list;
+
+	/* For quiescing the filesystem */
+	struct gfs2_holder sd_freeze_gh;
+	atomic_t sd_freeze_state;
+	struct mutex sd_freeze_mutex;
+
+	char sd_fsname[GFS2_FSNAME_LEN];
+	char sd_table_name[GFS2_FSNAME_LEN];
+	char sd_proto_name[GFS2_FSNAME_LEN];
+
+	/* Debugging crud */
+
+	unsigned long sd_last_warning;
+	struct dentry *debugfs_dir;    /* debugfs directory */
+	struct dentry *debugfs_dentry_glocks;
+	struct dentry *debugfs_dentry_glstats;
+	struct dentry *debugfs_dentry_sbstats;
+};
+
+static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
+{
+	gl->gl_stats.stats[which]++;
+}
+
+static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
+{
+	const struct gfs2_sbd *sdp = gl->gl_sbd;
+	preempt_disable();
+	this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++;
+	preempt_enable();
+}
+
+#endif /* __INCORE_DOT_H__ */
+
diff --git a/kernel/fs/gfs2/inode.c b/kernel/fs/gfs2/inode.c
new file mode 100644
index 000000000..1b3ca7a2e
--- /dev/null
+++ b/kernel/fs/gfs2/inode.c
@@ -0,0 +1,1973 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/fiemap.h>
+#include <linux/security.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "bmap.h"
+#include "dir.h"
+#include "xattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "super.h"
+#include "glops.h"
+
+struct gfs2_skip_data {
+	u64 no_addr;
+	int skipped;
+	int non_block;
+};
+
+static int iget_test(struct inode *inode, void *opaque)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_skip_data *data = opaque;
+
+	if (ip->i_no_addr == data->no_addr) {
+		if (data->non_block &&
+		    inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
+			data->skipped = 1;
+			return 0;
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int iget_set(struct inode *inode, void *opaque)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_skip_data *data = opaque;
+
+	if (data->skipped)
+		return -ENOENT;
+	inode->i_ino = (unsigned long)(data->no_addr);
+	ip->i_no_addr = data->no_addr;
+	return 0;
+}
+
+struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block)
+{
+	unsigned long hash = (unsigned long)no_addr;
+	struct gfs2_skip_data data;
+
+	data.no_addr = no_addr;
+	data.skipped = 0;
+	data.non_block = non_block;
+	return ilookup5(sb, hash, iget_test, &data);
+}
+
+static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr,
+			       int non_block)
+{
+	struct gfs2_skip_data data;
+	unsigned long hash = (unsigned long)no_addr;
+
+	data.no_addr = no_addr;
+	data.skipped = 0;
+	data.non_block = non_block;
+	return iget5_locked(sb, hash, iget_test, iget_set, &data);
+}
+
+/**
+ * gfs2_set_iop - Sets inode operations
+ * @inode: The inode with correct i_mode filled in
+ *
+ * GFS2 lookup code fills in vfs inode contents based on info obtained
+ * from directory entry inside gfs2_inode_lookup().
+ */
+
+static void gfs2_set_iop(struct inode *inode)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	umode_t mode = inode->i_mode;
+
+	if (S_ISREG(mode)) {
+		inode->i_op = &gfs2_file_iops;
+		if (gfs2_localflocks(sdp))
+			inode->i_fop = &gfs2_file_fops_nolock;
+		else
+			inode->i_fop = &gfs2_file_fops;
+	} else if (S_ISDIR(mode)) {
+		inode->i_op = &gfs2_dir_iops;
+		if (gfs2_localflocks(sdp))
+			inode->i_fop = &gfs2_dir_fops_nolock;
+		else
+			inode->i_fop = &gfs2_dir_fops;
+	} else if (S_ISLNK(mode)) {
+		inode->i_op = &gfs2_symlink_iops;
+	} else {
+		inode->i_op = &gfs2_file_iops;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+	}
+}
+
+/**
+ * gfs2_inode_lookup - Lookup an inode
+ * @sb: The super block
+ * @no_addr: The inode number
+ * @type: The type of the inode
+ * non_block: Can we block on inodes that are being freed?
+ *
+ * Returns: A VFS inode, or an error
+ */
+
+struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
+				u64 no_addr, u64 no_formal_ino, int non_block)
+{
+	struct inode *inode;
+	struct gfs2_inode *ip;
+	struct gfs2_glock *io_gl = NULL;
+	int error;
+
+	inode = gfs2_iget(sb, no_addr, non_block);
+	ip = GFS2_I(inode);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		struct gfs2_sbd *sdp = GFS2_SB(inode);
+		ip->i_no_formal_ino = no_formal_ino;
+
+		error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+		if (unlikely(error))
+			goto fail;
+		ip->i_gl->gl_object = ip;
+
+		error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+		if (unlikely(error))
+			goto fail_put;
+
+		set_bit(GIF_INVALID, &ip->i_flags);
+		error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
+		if (unlikely(error))
+			goto fail_iopen;
+
+		ip->i_iopen_gh.gh_gl->gl_object = ip;
+		gfs2_glock_put(io_gl);
+		io_gl = NULL;
+
+		if (type == DT_UNKNOWN) {
+			/* Inode glock must be locked already */
+			error = gfs2_inode_refresh(GFS2_I(inode));
+			if (error)
+				goto fail_refresh;
+		} else {
+			inode->i_mode = DT2IF(type);
+		}
+
+		gfs2_set_iop(inode);
+		unlock_new_inode(inode);
+	}
+
+	return inode;
+
+fail_refresh:
+	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+	ip->i_iopen_gh.gh_gl->gl_object = NULL;
+	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+fail_iopen:
+	if (io_gl)
+		gfs2_glock_put(io_gl);
+fail_put:
+	ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+fail:
+	iget_failed(inode);
+	return ERR_PTR(error);
+}
+
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+				  u64 *no_formal_ino, unsigned int blktype)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	struct gfs2_holder i_gh;
+	struct inode *inode = NULL;
+	int error;
+
+	/* Must not read in block until block type is verified */
+	error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
+				  LM_ST_EXCLUSIVE, GL_SKIP, &i_gh);
+	if (error)
+		return ERR_PTR(error);
+
+	error = gfs2_check_blk_type(sdp, no_addr, blktype);
+	if (error)
+		goto fail;
+
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1);
+	if (IS_ERR(inode))
+		goto fail;
+
+	/* Two extra checks for NFS only */
+	if (no_formal_ino) {
+		error = -ESTALE;
+		if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
+			goto fail_iput;
+
+		error = -EIO;
+		if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
+			goto fail_iput;
+
+		error = 0;
+	}
+
+fail:
+	gfs2_glock_dq_uninit(&i_gh);
+	return error ? ERR_PTR(error) : inode;
+fail_iput:
+	iput(inode);
+	goto fail;
+}
+
+
+struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
+{
+	struct qstr qstr;
+	struct inode *inode;
+	gfs2_str2qstr(&qstr, name);
+	inode = gfs2_lookupi(dip, &qstr, 1);
+	/* gfs2_lookupi has inconsistent callers: vfs
+	 * related routines expect NULL for no entry found,
+	 * gfs2_lookup_simple callers expect ENOENT
+	 * and do not check for NULL.
+	 */
+	if (inode == NULL)
+		return ERR_PTR(-ENOENT);
+	else
+		return inode;
+}
+
+
+/**
+ * gfs2_lookupi - Look up a filename in a directory and return its inode
+ * @d_gh: An initialized holder for the directory glock
+ * @name: The name of the inode to look for
+ * @is_root: If 1, ignore the caller's permissions
+ * @i_gh: An uninitialized holder for the new inode glock
+ *
+ * This can be called via the VFS filldir function when NFS is doing
+ * a readdirplus and the inode which its intending to stat isn't
+ * already in cache. In this case we must not take the directory glock
+ * again, since the readdir call will have already taken that lock.
+ *
+ * Returns: errno
+ */
+
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+			   int is_root)
+{
+	struct super_block *sb = dir->i_sb;
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_holder d_gh;
+	int error = 0;
+	struct inode *inode = NULL;
+	int unlock = 0;
+
+	if (!name->len || name->len > GFS2_FNAMESIZE)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
+	    (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
+	     dir == d_inode(sb->s_root))) {
+		igrab(dir);
+		return dir;
+	}
+
+	if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) {
+		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+		if (error)
+			return ERR_PTR(error);
+		unlock = 1;
+	}
+
+	if (!is_root) {
+		error = gfs2_permission(dir, MAY_EXEC);
+		if (error)
+			goto out;
+	}
+
+	inode = gfs2_dir_search(dir, name, false);
+	if (IS_ERR(inode))
+		error = PTR_ERR(inode);
+out:
+	if (unlock)
+		gfs2_glock_dq_uninit(&d_gh);
+	if (error == -ENOENT)
+		return NULL;
+	return inode ? inode : ERR_PTR(error);
+}
+
+/**
+ * create_ok - OK to create a new on-disk inode here?
+ * @dip:  Directory in which dinode is to be created
+ * @name:  Name of new dinode
+ * @mode:
+ *
+ * Returns: errno
+ */
+
+static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
+		     umode_t mode)
+{
+	int error;
+
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+
+	/*  Don't create entries in an unlinked directory  */
+	if (!dip->i_inode.i_nlink)
+		return -ENOENT;
+
+	if (dip->i_entries == (u32)-1)
+		return -EFBIG;
+	if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
+		return -EMLINK;
+
+	return 0;
+}
+
+static void munge_mode_uid_gid(const struct gfs2_inode *dip,
+			       struct inode *inode)
+{
+	if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
+	    (dip->i_inode.i_mode & S_ISUID) &&
+	    !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) {
+		if (S_ISDIR(inode->i_mode))
+			inode->i_mode |= S_ISUID;
+		else if (!uid_eq(dip->i_inode.i_uid, current_fsuid()))
+			inode->i_mode &= ~07111;
+		inode->i_uid = dip->i_inode.i_uid;
+	} else
+		inode->i_uid = current_fsuid();
+
+	if (dip->i_inode.i_mode & S_ISGID) {
+		if (S_ISDIR(inode->i_mode))
+			inode->i_mode |= S_ISGID;
+		inode->i_gid = dip->i_inode.i_gid;
+	} else
+		inode->i_gid = current_fsgid();
+}
+
+static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
+	int error;
+
+	error = gfs2_quota_lock_check(ip, &ap);
+	if (error)
+		goto out;
+
+	error = gfs2_inplace_reserve(ip, &ap);
+	if (error)
+		goto out_quota;
+
+	error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0);
+	if (error)
+		goto out_ipreserv;
+
+	error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
+	ip->i_no_formal_ino = ip->i_generation;
+	ip->i_inode.i_ino = ip->i_no_addr;
+	ip->i_goal = ip->i_no_addr;
+
+	gfs2_trans_end(sdp);
+
+out_ipreserv:
+	gfs2_inplace_release(ip);
+out_quota:
+	gfs2_quota_unlock(ip);
+out:
+	return error;
+}
+
+static void gfs2_init_dir(struct buffer_head *dibh,
+			  const struct gfs2_inode *parent)
+{
+	struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+	struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
+
+	gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
+	dent->de_inum = di->di_num; /* already GFS2 endian */
+	dent->de_type = cpu_to_be16(DT_DIR);
+
+	dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
+	gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+	gfs2_inum_out(parent, dent);
+	dent->de_type = cpu_to_be16(DT_DIR);
+	
+}
+
+/**
+ * gfs2_init_xattr - Initialise an xattr block for a new inode
+ * @ip: The inode in question
+ *
+ * This sets up an empty xattr block for a new inode, ready to
+ * take any ACLs, LSM xattrs, etc.
+ */
+
+static void gfs2_init_xattr(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *bh;
+	struct gfs2_ea_header *ea;
+
+	bh = gfs2_meta_new(ip->i_gl, ip->i_eattr);
+	gfs2_trans_add_meta(ip->i_gl, bh);
+	gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+	gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+
+	ea = GFS2_EA_BH2FIRST(bh);
+	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+	ea->ea_type = GFS2_EATYPE_UNUSED;
+	ea->ea_flags = GFS2_EAFLAG_LAST;
+
+	brelse(bh);
+}
+
+/**
+ * init_dinode - Fill in a new dinode structure
+ * @dip: The directory this inode is being created in
+ * @ip: The inode
+ * @symname: The symlink destination (if a symlink)
+ * @bhp: The buffer head (returned to caller)
+ *
+ */
+
+static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
+			const char *symname)
+{
+	struct gfs2_dinode *di;
+	struct buffer_head *dibh;
+
+	dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	di = (struct gfs2_dinode *)dibh->b_data;
+	gfs2_dinode_out(ip, di);
+
+	di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev));
+	di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev));
+	di->__pad1 = 0;
+	di->__pad2 = 0;
+	di->__pad3 = 0;
+	memset(&di->__pad4, 0, sizeof(di->__pad4));
+	memset(&di->di_reserved, 0, sizeof(di->di_reserved));
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+	switch(ip->i_inode.i_mode & S_IFMT) {
+	case S_IFDIR:
+		gfs2_init_dir(dibh, dip);
+		break;
+	case S_IFLNK:
+		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size);
+		break;
+	}
+
+	set_buffer_uptodate(dibh);
+	brelse(dibh);
+}
+
+/**
+ * gfs2_trans_da_blocks - Calculate number of blocks to link inode
+ * @dip: The directory we are linking into
+ * @da: The dir add information
+ * @nr_inodes: The number of inodes involved
+ *
+ * This calculate the number of blocks we need to reserve in a
+ * transaction to link @nr_inodes into a directory. In most cases
+ * @nr_inodes will be 2 (the directory plus the inode being linked in)
+ * but in case of rename, 4 may be required.
+ *
+ * Returns: Number of blocks
+ */
+
+static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip,
+				   const struct gfs2_diradd *da,
+				   unsigned nr_inodes)
+{
+	return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) +
+	       (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS;
+}
+
+static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
+		       struct gfs2_inode *ip, struct gfs2_diradd *da)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_alloc_parms ap = { .target = da->nr_blocks, };
+	int error;
+
+	if (da->nr_blocks) {
+		error = gfs2_quota_lock_check(dip, &ap);
+		if (error)
+			goto fail_quota_locks;
+
+		error = gfs2_inplace_reserve(dip, &ap);
+		if (error)
+			goto fail_quota_locks;
+
+		error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0);
+		if (error)
+			goto fail_ipreserv;
+	} else {
+		error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
+		if (error)
+			goto fail_quota_locks;
+	}
+
+	error = gfs2_dir_add(&dip->i_inode, name, ip, da);
+
+	gfs2_trans_end(sdp);
+fail_ipreserv:
+	gfs2_inplace_release(dip);
+fail_quota_locks:
+	gfs2_quota_unlock(dip);
+	return error;
+}
+
+static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		    void *fs_info)
+{
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = __gfs2_xattr_set(inode, xattr->name, xattr->value,
+				       xattr->value_len, 0,
+				       GFS2_EATYPE_SECURITY);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+/**
+ * gfs2_create_inode - Create a new inode
+ * @dir: The parent directory
+ * @dentry: The new dentry
+ * @file: If non-NULL, the file which is being opened
+ * @mode: The permissions on the new inode
+ * @dev: For device nodes, this is the device number
+ * @symname: For symlinks, this is the link destination
+ * @size: The initial size of the inode (ignored for directories)
+ *
+ * Returns: 0 on success, or error code
+ */
+
+static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+			     struct file *file,
+			     umode_t mode, dev_t dev, const char *symname,
+			     unsigned int size, int excl, int *opened)
+{
+	const struct qstr *name = &dentry->d_name;
+	struct posix_acl *default_acl, *acl;
+	struct gfs2_holder ghs[2];
+	struct inode *inode = NULL;
+	struct gfs2_inode *dip = GFS2_I(dir), *ip;
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_glock *io_gl;
+	int error, free_vfs_inode = 0;
+	u32 aflags = 0;
+	unsigned blocks = 1;
+	struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
+
+	if (!name->len || name->len > GFS2_FNAMESIZE)
+		return -ENAMETOOLONG;
+
+	error = gfs2_rs_alloc(dip);
+	if (error)
+		return error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	if (error)
+		goto fail;
+
+	error = create_ok(dip, name, mode);
+	if (error)
+		goto fail_gunlock;
+
+	inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
+	error = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		if (S_ISDIR(inode->i_mode)) {
+			iput(inode);
+			inode = ERR_PTR(-EISDIR);
+			goto fail_gunlock;
+		}
+		d_instantiate(dentry, inode);
+		error = 0;
+		if (file) {
+			if (S_ISREG(inode->i_mode))
+				error = finish_open(file, dentry, gfs2_open_common, opened);
+			else
+				error = finish_no_open(file, NULL);
+		}
+		gfs2_glock_dq_uninit(ghs);
+		return error;
+	} else if (error != -ENOENT) {
+		goto fail_gunlock;
+	}
+
+	error = gfs2_diradd_alloc_required(dir, name, &da);
+	if (error < 0)
+		goto fail_gunlock;
+
+	inode = new_inode(sdp->sd_vfs);
+	error = -ENOMEM;
+	if (!inode)
+		goto fail_gunlock;
+
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error)
+		goto fail_free_vfs_inode;
+
+	ip = GFS2_I(inode);
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		goto fail_free_acls;
+
+	inode->i_mode = mode;
+	set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
+	inode->i_rdev = dev;
+	inode->i_size = size;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	gfs2_set_inode_blocks(inode, 1);
+	munge_mode_uid_gid(dip, inode);
+	check_and_update_goal(dip);
+	ip->i_goal = dip->i_goal;
+	ip->i_diskflags = 0;
+	ip->i_eattr = 0;
+	ip->i_height = 0;
+	ip->i_depth = 0;
+	ip->i_entries = 0;
+
+	switch(mode & S_IFMT) {
+	case S_IFREG:
+		if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
+		    gfs2_tune_get(sdp, gt_new_files_jdata))
+			ip->i_diskflags |= GFS2_DIF_JDATA;
+		gfs2_set_aops(inode);
+		break;
+	case S_IFDIR:
+		ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA);
+		ip->i_diskflags |= GFS2_DIF_JDATA;
+		ip->i_entries = 2;
+		break;
+	}
+	gfs2_set_inode_flags(inode);
+
+	if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) ||
+	    (dip->i_diskflags & GFS2_DIF_TOPDIR))
+		aflags |= GFS2_AF_ORLOV;
+
+	if (default_acl || acl)
+		blocks++;
+
+	error = alloc_dinode(ip, aflags, &blocks);
+	if (error)
+		goto fail_free_inode;
+
+	gfs2_set_inode_blocks(inode, blocks);
+
+	error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+	if (error)
+		goto fail_free_inode;
+
+	ip->i_gl->gl_object = ip;
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+	if (error)
+		goto fail_free_inode;
+
+	error = gfs2_trans_begin(sdp, blocks, 0);
+	if (error)
+		goto fail_gunlock2;
+
+	if (blocks > 1) {
+		ip->i_eattr = ip->i_no_addr + 1;
+		gfs2_init_xattr(ip);
+	}
+	init_dinode(dip, ip, symname);
+	gfs2_trans_end(sdp);
+
+	error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+	if (error)
+		goto fail_gunlock2;
+
+	error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
+	if (error)
+		goto fail_gunlock2;
+
+	ip->i_iopen_gh.gh_gl->gl_object = ip;
+	gfs2_glock_put(io_gl);
+	gfs2_set_iop(inode);
+	insert_inode_hash(inode);
+
+	if (default_acl) {
+		error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		posix_acl_release(acl);
+	}
+
+	if (error)
+		goto fail_gunlock3;
+
+	error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
+					     &gfs2_initxattrs, NULL);
+	if (error)
+		goto fail_gunlock3;
+
+	error = link_dinode(dip, name, ip, &da);
+	if (error)
+		goto fail_gunlock3;
+
+	mark_inode_dirty(inode);
+	d_instantiate(dentry, inode);
+	if (file) {
+		*opened |= FILE_CREATED;
+		error = finish_open(file, dentry, gfs2_open_common, opened);
+	}
+	gfs2_glock_dq_uninit(ghs);
+	gfs2_glock_dq_uninit(ghs + 1);
+	return error;
+
+fail_gunlock3:
+	gfs2_glock_dq_uninit(ghs + 1);
+	if (ip->i_gl)
+		gfs2_glock_put(ip->i_gl);
+	goto fail_gunlock;
+
+fail_gunlock2:
+	gfs2_glock_dq_uninit(ghs + 1);
+fail_free_inode:
+	if (ip->i_gl)
+		gfs2_glock_put(ip->i_gl);
+	gfs2_rs_delete(ip, NULL);
+fail_free_acls:
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
+fail_free_vfs_inode:
+	free_vfs_inode = 1;
+fail_gunlock:
+	gfs2_dir_no_add(&da);
+	gfs2_glock_dq_uninit(ghs);
+	if (inode && !IS_ERR(inode)) {
+		clear_nlink(inode);
+		if (!free_vfs_inode)
+			mark_inode_dirty(inode);
+		set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
+			&GFS2_I(inode)->i_flags);
+		iput(inode);
+	}
+fail:
+	return error;
+}
+
+/**
+ * gfs2_create - Create a file
+ * @dir: The directory in which to create the file
+ * @dentry: The dentry of the new file
+ * @mode: The mode of the new file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_create(struct inode *dir, struct dentry *dentry,
+		       umode_t mode, bool excl)
+{
+	return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
+}
+
+/**
+ * __gfs2_lookup - Look up a filename in a directory and return its inode
+ * @dir: The directory inode
+ * @dentry: The dentry of the new inode
+ * @file: File to be opened
+ * @opened: atomic_open flags
+ *
+ *
+ * Returns: errno
+ */
+
+static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				    struct file *file, int *opened)
+{
+	struct inode *inode;
+	struct dentry *d;
+	struct gfs2_holder gh;
+	struct gfs2_glock *gl;
+	int error;
+
+	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+	if (inode == NULL) {
+		d_add(dentry, NULL);
+		return NULL;
+	}
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	gl = GFS2_I(inode)->i_gl;
+	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	if (error) {
+		iput(inode);
+		return ERR_PTR(error);
+	}
+
+	d = d_splice_alias(inode, dentry);
+	if (IS_ERR(d)) {
+		gfs2_glock_dq_uninit(&gh);
+		return d;
+	}
+	if (file && S_ISREG(inode->i_mode))
+		error = finish_open(file, dentry, gfs2_open_common, opened);
+
+	gfs2_glock_dq_uninit(&gh);
+	if (error) {
+		dput(d);
+		return ERR_PTR(error);
+	}
+	return d;
+}
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned flags)
+{
+	return __gfs2_lookup(dir, dentry, NULL, NULL);
+}
+
+/**
+ * gfs2_link - Link to a file
+ * @old_dentry: The inode to link
+ * @dir: Add link to this directory
+ * @dentry: The name of the link
+ *
+ * Link the inode in "old_dentry" into the directory "dir" with the
+ * name in "dentry".
+ *
+ * Returns: errno
+ */
+
+static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
+		     struct dentry *dentry)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct inode *inode = d_inode(old_dentry);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder ghs[2];
+	struct buffer_head *dibh;
+	struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
+	int error;
+
+	if (S_ISDIR(inode->i_mode))
+		return -EPERM;
+
+	error = gfs2_rs_alloc(dip);
+	if (error)
+		return error;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+	error = gfs2_glock_nq(ghs); /* parent */
+	if (error)
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
+
+	error = -ENOENT;
+	if (inode->i_nlink == 0)
+		goto out_gunlock;
+
+	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_dir_check(dir, &dentry->d_name, NULL);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+	default:
+		goto out_gunlock;
+	}
+
+	error = -EINVAL;
+	if (!dip->i_inode.i_nlink)
+		goto out_gunlock;
+	error = -EFBIG;
+	if (dip->i_entries == (u32)-1)
+		goto out_gunlock;
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		goto out_gunlock;
+	error = -EINVAL;
+	if (!ip->i_inode.i_nlink)
+		goto out_gunlock;
+	error = -EMLINK;
+	if (ip->i_inode.i_nlink == (u32)-1)
+		goto out_gunlock;
+
+	error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
+	if (error < 0)
+		goto out_gunlock;
+
+	if (da.nr_blocks) {
+		struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
+		error = gfs2_quota_lock_check(dip, &ap);
+		if (error)
+			goto out_gunlock;
+
+		error = gfs2_inplace_reserve(dip, &ap);
+		if (error)
+			goto out_gunlock_q;
+
+		error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0);
+		if (error)
+			goto out_ipres;
+	} else {
+		error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
+		if (error)
+			goto out_ipres;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_dir_add(dir, &dentry->d_name, ip, &da);
+	if (error)
+		goto out_brelse;
+
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	inc_nlink(&ip->i_inode);
+	ip->i_inode.i_ctime = CURRENT_TIME;
+	ihold(inode);
+	d_instantiate(dentry, inode);
+	mark_inode_dirty(inode);
+
+out_brelse:
+	brelse(dibh);
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipres:
+	if (da.nr_blocks)
+		gfs2_inplace_release(dip);
+out_gunlock_q:
+	if (da.nr_blocks)
+		gfs2_quota_unlock(dip);
+out_gunlock:
+	gfs2_dir_no_add(&da);
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_glock_dq(ghs);
+out_parent:
+	gfs2_holder_uninit(ghs);
+	gfs2_holder_uninit(ghs + 1);
+	return error;
+}
+
+/*
+ * gfs2_unlink_ok - check to see that a inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+
+static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+			  const struct gfs2_inode *ip)
+{
+	int error;
+
+	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+		return -EPERM;
+
+	if ((dip->i_inode.i_mode & S_ISVTX) &&
+	    !uid_eq(dip->i_inode.i_uid, current_fsuid()) &&
+	    !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (IS_APPEND(&dip->i_inode))
+		return -EPERM;
+
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+
+	return gfs2_dir_check(&dip->i_inode, name, ip);
+}
+
+/**
+ * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it
+ * @dip: The parent directory
+ * @name: The name of the entry in the parent directory
+ * @inode: The inode to be removed
+ *
+ * Called with all the locks and in a transaction. This will only be
+ * called for a directory after it has been checked to ensure it is empty.
+ *
+ * Returns: 0 on success, or an error
+ */
+
+static int gfs2_unlink_inode(struct gfs2_inode *dip,
+			     const struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int error;
+
+	error = gfs2_dir_del(dip, dentry);
+	if (error)
+		return error;
+
+	ip->i_entries = 0;
+	inode->i_ctime = CURRENT_TIME;
+	if (S_ISDIR(inode->i_mode))
+		clear_nlink(inode);
+	else
+		drop_nlink(inode);
+	mark_inode_dirty(inode);
+	if (inode->i_nlink == 0)
+		gfs2_unlink_di(inode);
+	return 0;
+}
+
+
+/**
+ * gfs2_unlink - Unlink an inode (this does rmdir as well)
+ * @dir: The inode of the directory containing the inode to unlink
+ * @dentry: The file itself
+ *
+ * This routine uses the type of the inode as a flag to figure out
+ * whether this is an unlink or an rmdir.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct inode *inode = d_inode(dentry);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder ghs[3];
+	struct gfs2_rgrpd *rgd;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	error = -EROFS;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(ip->i_gl,  LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1);
+	if (!rgd)
+		goto out_inodes;
+
+	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
+
+
+	error = gfs2_glock_nq(ghs); /* parent */
+	if (error)
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
+
+	error = -ENOENT;
+	if (inode->i_nlink == 0)
+		goto out_rgrp;
+
+	if (S_ISDIR(inode->i_mode)) {
+		error = -ENOTEMPTY;
+		if (ip->i_entries > 2 || inode->i_nlink > 2)
+			goto out_rgrp;
+	}
+
+	error = gfs2_glock_nq(ghs + 2); /* rgrp */
+	if (error)
+		goto out_rgrp;
+
+	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_unlink_inode(dip, dentry);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock:
+	gfs2_glock_dq(ghs + 2);
+out_rgrp:
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_glock_dq(ghs);
+out_parent:
+	gfs2_holder_uninit(ghs + 2);
+out_inodes:
+	gfs2_holder_uninit(ghs + 1);
+	gfs2_holder_uninit(ghs);
+	return error;
+}
+
+/**
+ * gfs2_symlink - Create a symlink
+ * @dir: The directory to create the symlink in
+ * @dentry: The dentry to put the symlink in
+ * @symname: The thing which the link points to
+ *
+ * Returns: errno
+ */
+
+static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
+			const char *symname)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	unsigned int size;
+
+	size = strlen(symname);
+	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
+		return -ENAMETOOLONG;
+
+	return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
+}
+
+/**
+ * gfs2_mkdir - Make a directory
+ * @dir: The parent directory of the new one
+ * @dentry: The dentry of the new directory
+ * @mode: The mode of the new directory
+ *
+ * Returns: errno
+ */
+
+static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
+}
+
+/**
+ * gfs2_mknod - Make a special file
+ * @dir: The directory in which the special file will reside
+ * @dentry: The dentry of the special file
+ * @mode: The mode of the special file
+ * @dev: The device specification of the special file
+ *
+ */
+
+static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      dev_t dev)
+{
+	return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
+}
+
+/**
+ * gfs2_atomic_open - Atomically open a file
+ * @dir: The directory
+ * @dentry: The proposed new entry
+ * @file: The proposed new struct file
+ * @flags: open flags
+ * @mode: File mode
+ * @opened: Flag to say whether the file has been opened or not
+ *
+ * Returns: error code or 0 for success
+ */
+
+static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
+                            struct file *file, unsigned flags,
+                            umode_t mode, int *opened)
+{
+	struct dentry *d;
+	bool excl = !!(flags & O_EXCL);
+
+	if (!d_unhashed(dentry))
+		goto skip_lookup;
+
+	d = __gfs2_lookup(dir, dentry, file, opened);
+	if (IS_ERR(d))
+		return PTR_ERR(d);
+	if (d != NULL)
+		dentry = d;
+	if (d_really_is_positive(dentry)) {
+		if (!(*opened & FILE_OPENED))
+			return finish_no_open(file, d);
+		dput(d);
+		return 0;
+	}
+
+	BUG_ON(d != NULL);
+
+skip_lookup:
+	if (!(flags & O_CREAT))
+		return -ENOENT;
+
+	return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
+}
+
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root and make sure we don't encounter @this
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+	struct inode *dir = &to->i_inode;
+	struct super_block *sb = dir->i_sb;
+	struct inode *tmp;
+	int error = 0;
+
+	igrab(dir);
+
+	for (;;) {
+		if (dir == &this->i_inode) {
+			error = -EINVAL;
+			break;
+		}
+		if (dir == d_inode(sb->s_root)) {
+			error = 0;
+			break;
+		}
+
+		tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
+		if (!tmp) {
+			error = -ENOENT;
+			break;
+		}
+		if (IS_ERR(tmp)) {
+			error = PTR_ERR(tmp);
+			break;
+		}
+
+		iput(dir);
+		dir = tmp;
+	}
+
+	iput(dir);
+
+	return error;
+}
+
+/**
+ * gfs2_rename - Rename a file
+ * @odir: Parent directory of old file name
+ * @odentry: The old dentry of the file
+ * @ndir: Parent directory of new file name
+ * @ndentry: The new dentry of the file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rename(struct inode *odir, struct dentry *odentry,
+		       struct inode *ndir, struct dentry *ndentry)
+{
+	struct gfs2_inode *odip = GFS2_I(odir);
+	struct gfs2_inode *ndip = GFS2_I(ndir);
+	struct gfs2_inode *ip = GFS2_I(d_inode(odentry));
+	struct gfs2_inode *nip = NULL;
+	struct gfs2_sbd *sdp = GFS2_SB(odir);
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+	struct gfs2_rgrpd *nrgd;
+	unsigned int num_gh;
+	int dir_rename = 0;
+	struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, };
+	unsigned int x;
+	int error;
+
+	if (d_really_is_positive(ndentry)) {
+		nip = GFS2_I(d_inode(ndentry));
+		if (ip == nip)
+			return 0;
+	}
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	error = gfs2_rs_alloc(ndip);
+	if (error)
+		return error;
+
+	if (odip != ndip) {
+		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+					   0, &r_gh);
+		if (error)
+			goto out;
+
+		if (S_ISDIR(ip->i_inode.i_mode)) {
+			dir_rename = 1;
+			/* don't move a dirctory into it's subdir */
+			error = gfs2_ok_to_move(ip, ndip);
+			if (error)
+				goto out_gunlock_r;
+		}
+	}
+
+	num_gh = 1;
+	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	if (odip != ndip) {
+		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+	}
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	num_gh++;
+
+	if (nip) {
+		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+		/* grab the resource lock for unlink flag twiddling 
+		 * this is the case of the target file already existing
+		 * so we unlink before doing the rename
+		 */
+		nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1);
+		if (nrgd)
+			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
+	}
+
+	for (x = 0; x < num_gh; x++) {
+		error = gfs2_glock_nq(ghs + x);
+		if (error)
+			goto out_gunlock;
+	}
+
+	error = -ENOENT;
+	if (ip->i_inode.i_nlink == 0)
+		goto out_gunlock;
+
+	/* Check out the old directory */
+
+	error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	/* Check out the new directory */
+
+	if (nip) {
+		error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
+		if (error)
+			goto out_gunlock;
+
+		if (nip->i_inode.i_nlink == 0) {
+			error = -EAGAIN;
+			goto out_gunlock;
+		}
+
+		if (S_ISDIR(nip->i_inode.i_mode)) {
+			if (nip->i_entries < 2) {
+				gfs2_consist_inode(nip);
+				error = -EIO;
+				goto out_gunlock;
+			}
+			if (nip->i_entries > 2) {
+				error = -ENOTEMPTY;
+				goto out_gunlock;
+			}
+		}
+	} else {
+		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
+		if (error)
+			goto out_gunlock;
+
+		error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
+		switch (error) {
+		case -ENOENT:
+			error = 0;
+			break;
+		case 0:
+			error = -EEXIST;
+		default:
+			goto out_gunlock;
+		};
+
+		if (odip != ndip) {
+			if (!ndip->i_inode.i_nlink) {
+				error = -ENOENT;
+				goto out_gunlock;
+			}
+			if (ndip->i_entries == (u32)-1) {
+				error = -EFBIG;
+				goto out_gunlock;
+			}
+			if (S_ISDIR(ip->i_inode.i_mode) &&
+			    ndip->i_inode.i_nlink == (u32)-1) {
+				error = -EMLINK;
+				goto out_gunlock;
+			}
+		}
+	}
+
+	/* Check out the dir to be renamed */
+
+	if (dir_rename) {
+		error = gfs2_permission(d_inode(odentry), MAY_WRITE);
+		if (error)
+			goto out_gunlock;
+	}
+
+	if (nip == NULL) {
+		error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da);
+		if (error)
+			goto out_gunlock;
+	}
+
+	if (da.nr_blocks) {
+		struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
+		error = gfs2_quota_lock_check(ndip, &ap);
+		if (error)
+			goto out_gunlock;
+
+		error = gfs2_inplace_reserve(ndip, &ap);
+		if (error)
+			goto out_gunlock_q;
+
+		error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) +
+					 4 * RES_LEAF + 4, 0);
+		if (error)
+			goto out_ipreserv;
+	} else {
+		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
+					 5 * RES_LEAF + 4, 0);
+		if (error)
+			goto out_gunlock;
+	}
+
+	/* Remove the target file, if it exists */
+
+	if (nip)
+		error = gfs2_unlink_inode(ndip, ndentry);
+
+	if (dir_rename) {
+		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
+		if (error)
+			goto out_end_trans;
+	} else {
+		struct buffer_head *dibh;
+		error = gfs2_meta_inode_buffer(ip, &dibh);
+		if (error)
+			goto out_end_trans;
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	error = gfs2_dir_del(odip, odentry);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da);
+	if (error)
+		goto out_end_trans;
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipreserv:
+	if (da.nr_blocks)
+		gfs2_inplace_release(ndip);
+out_gunlock_q:
+	if (da.nr_blocks)
+		gfs2_quota_unlock(ndip);
+out_gunlock:
+	gfs2_dir_no_add(&da);
+	while (x--) {
+		gfs2_glock_dq(ghs + x);
+		gfs2_holder_uninit(ghs + x);
+	}
+out_gunlock_r:
+	if (r_gh.gh_gl)
+		gfs2_glock_dq_uninit(&r_gh);
+out:
+	return error;
+}
+
+/**
+ * gfs2_follow_link - Follow a symbolic link
+ * @dentry: The dentry of the link
+ * @nd: Data that we pass to vfs_follow_link()
+ *
+ * This can handle symlinks of any size.
+ *
+ * Returns: 0 on success or error code
+ */
+
+static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+	struct gfs2_holder i_gh;
+	struct buffer_head *dibh;
+	unsigned int size;
+	char *buf;
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
+	if (error) {
+		gfs2_holder_uninit(&i_gh);
+		nd_set_link(nd, ERR_PTR(error));
+		return NULL;
+	}
+
+	size = (unsigned int)i_size_read(&ip->i_inode);
+	if (size == 0) {
+		gfs2_consist_inode(ip);
+		buf = ERR_PTR(-EIO);
+		goto out;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error) {
+		buf = ERR_PTR(error);
+		goto out;
+	}
+
+	buf = kzalloc(size + 1, GFP_NOFS);
+	if (!buf)
+		buf = ERR_PTR(-ENOMEM);
+	else
+		memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
+	brelse(dibh);
+out:
+	gfs2_glock_dq_uninit(&i_gh);
+	nd_set_link(nd, buf);
+	return NULL;
+}
+
+/**
+ * gfs2_permission -
+ * @inode: The inode
+ * @mask: The mask to be tested
+ * @flags: Indicates whether this is an RCU path walk or not
+ *
+ * This may be called from the VFS directly, or from within GFS2 with the
+ * inode locked, so we look to see if the glock is already locked and only
+ * lock the glock if its not already been done.
+ *
+ * Returns: errno
+ */
+
+int gfs2_permission(struct inode *inode, int mask)
+{
+	struct gfs2_inode *ip;
+	struct gfs2_holder i_gh;
+	int error;
+	int unlock = 0;
+
+
+	ip = GFS2_I(inode);
+	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+		if (mask & MAY_NOT_BLOCK)
+			return -ECHILD;
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+		if (error)
+			return error;
+		unlock = 1;
+	}
+
+	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+		error = -EACCES;
+	else
+		error = generic_permission(inode, mask);
+	if (unlock)
+		gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
+{
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+/**
+ * gfs2_setattr_simple -
+ * @ip:
+ * @attr:
+ *
+ * Returns: errno
+ */
+
+int gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
+{
+	int error;
+
+	if (current->journal_info)
+		return __gfs2_setattr_simple(inode, attr);
+
+	error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0);
+	if (error)
+		return error;
+
+	error = __gfs2_setattr_simple(inode, attr);
+	gfs2_trans_end(GFS2_SB(inode));
+	return error;
+}
+
+static int setattr_chown(struct inode *inode, struct iattr *attr)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	kuid_t ouid, nuid;
+	kgid_t ogid, ngid;
+	int error;
+	struct gfs2_alloc_parms ap;
+
+	ouid = inode->i_uid;
+	ogid = inode->i_gid;
+	nuid = attr->ia_uid;
+	ngid = attr->ia_gid;
+
+	if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid))
+		ouid = nuid = NO_UID_QUOTA_CHANGE;
+	if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
+		ogid = ngid = NO_GID_QUOTA_CHANGE;
+
+	error = get_write_access(inode);
+	if (error)
+		return error;
+
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		goto out;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		goto out;
+
+	error = gfs2_quota_lock(ip, nuid, ngid);
+	if (error)
+		goto out;
+
+	ap.target = gfs2_get_inode_blocks(&ip->i_inode);
+
+	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
+	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
+		error = gfs2_quota_check(ip, nuid, ngid, &ap);
+		if (error)
+			goto out_gunlock_q;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_setattr_simple(inode, attr);
+	if (error)
+		goto out_end_trans;
+
+	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
+	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
+		gfs2_quota_change(ip, -ap.target, ouid, ogid);
+		gfs2_quota_change(ip, ap.target, nuid, ngid);
+	}
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out:
+	put_write_access(inode);
+	return error;
+}
+
+/**
+ * gfs2_setattr - Change attributes on an inode
+ * @dentry: The dentry which is changing
+ * @attr: The structure describing the change
+ *
+ * The VFS layer wants to change one or more of an inodes attributes.  Write
+ * that change out to disk.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int error;
+
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		return error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		return error;
+
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		goto out;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		goto out;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		error = gfs2_setattr_size(inode, attr->ia_size);
+	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
+		error = setattr_chown(inode, attr);
+	else {
+		error = gfs2_setattr_simple(inode, attr);
+		if (!error && attr->ia_valid & ATTR_MODE)
+			error = posix_acl_chmod(inode, inode->i_mode);
+	}
+
+out:
+	if (!error)
+		mark_inode_dirty(inode);
+	gfs2_glock_dq_uninit(&i_gh);
+	return error;
+}
+
+/**
+ * gfs2_getattr - Read out an inode's attributes
+ * @mnt: The vfsmount the inode is being accessed from
+ * @dentry: The dentry to stat
+ * @stat: The inode's stats
+ *
+ * This may be called from the VFS directly, or from within GFS2 with the
+ * inode locked, so we look to see if the glock is already locked and only
+ * lock the glock if its not already been done. Note that its the NFS
+ * readdirplus operation which causes this to be called (from filldir)
+ * with the glock already held.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+	int unlock = 0;
+
+	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+		if (error)
+			return error;
+		unlock = 1;
+	}
+
+	generic_fillattr(inode, stat);
+	if (unlock)
+		gfs2_glock_dq_uninit(&gh);
+
+	return 0;
+}
+
+static int gfs2_setxattr(struct dentry *dentry, const char *name,
+			 const void *data, size_t size, int flags)
+{
+	struct inode *inode = d_inode(dentry);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = gfs2_rs_alloc(ip);
+		if (ret == 0)
+			ret = generic_setxattr(dentry, name, data, size, flags);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
+			     void *data, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	/* For selinux during lookup */
+	if (gfs2_glock_is_locked_by_me(ip->i_gl))
+		return generic_getxattr(dentry, name, data, size);
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_getxattr(dentry, name, data, size);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static int gfs2_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = d_inode(dentry);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = gfs2_rs_alloc(ip);
+		if (ret == 0)
+			ret = generic_removexattr(dentry, name);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	if (ret)
+		goto out;
+
+	if (gfs2_is_stuffed(ip)) {
+		u64 phys = ip->i_no_addr << inode->i_blkbits;
+		u64 size = i_size_read(inode);
+		u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
+			    FIEMAP_EXTENT_DATA_INLINE;
+		phys += sizeof(struct gfs2_dinode);
+		phys += start;
+		if (start + len > size)
+			len = size - start;
+		if (start < size)
+			ret = fiemap_fill_next_extent(fieinfo, start, phys,
+						      len, flags);
+		if (ret == 1)
+			ret = 0;
+	} else {
+		ret = __generic_block_fiemap(inode, fieinfo, start, len,
+					     gfs2_block_map);
+	}
+
+	gfs2_glock_dq_uninit(&gh);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+const struct inode_operations gfs2_file_iops = {
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
+	.get_acl = gfs2_get_acl,
+	.set_acl = gfs2_set_acl,
+};
+
+const struct inode_operations gfs2_dir_iops = {
+	.create = gfs2_create,
+	.lookup = gfs2_lookup,
+	.link = gfs2_link,
+	.unlink = gfs2_unlink,
+	.symlink = gfs2_symlink,
+	.mkdir = gfs2_mkdir,
+	.rmdir = gfs2_unlink,
+	.mknod = gfs2_mknod,
+	.rename = gfs2_rename,
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
+	.get_acl = gfs2_get_acl,
+	.set_acl = gfs2_set_acl,
+	.atomic_open = gfs2_atomic_open,
+};
+
+const struct inode_operations gfs2_symlink_iops = {
+	.readlink = generic_readlink,
+	.follow_link = gfs2_follow_link,
+	.put_link = kfree_put_link,
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
+};
+
diff --git a/kernel/fs/gfs2/inode.h b/kernel/fs/gfs2/inode.h
new file mode 100644
index 000000000..ba4d9492d
--- /dev/null
+++ b/kernel/fs/gfs2/inode.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __INODE_DOT_H__
+#define __INODE_DOT_H__
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include "util.h"
+
+extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+			      char *buf, loff_t *pos, unsigned size);
+extern void gfs2_set_aops(struct inode *inode);
+
+static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
+{
+	return !ip->i_height;
+}
+
+static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
+{
+	return ip->i_diskflags & GFS2_DIF_JDATA;
+}
+
+static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
+}
+
+static inline int gfs2_is_ordered(const struct gfs2_inode *ip)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip);
+}
+
+static inline int gfs2_is_dir(const struct gfs2_inode *ip)
+{
+	return S_ISDIR(ip->i_inode.i_mode);
+}
+
+static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
+{
+	inode->i_blocks = blocks <<
+		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+}
+
+static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
+{
+	return inode->i_blocks >>
+		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+}
+
+static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
+{
+	gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
+	change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
+	inode->i_blocks += change;
+}
+
+static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr,
+				  u64 no_formal_ino)
+{
+	return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino;
+}
+
+static inline void gfs2_inum_out(const struct gfs2_inode *ip,
+				 struct gfs2_dirent *dent)
+{
+	dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
+	dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
+}
+
+static inline int gfs2_check_internal_file_size(struct inode *inode,
+						u64 minsize, u64 maxsize)
+{
+	u64 size = i_size_read(inode);
+	if (size < minsize || size > maxsize)
+		goto err;
+	if (size & ((1 << inode->i_blkbits) - 1))
+		goto err;
+	return 0;
+err:
+	gfs2_consist_inode(GFS2_I(inode));
+	return -EIO;
+}
+
+extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
+				       u64 no_addr, u64 no_formal_ino,
+				       int non_block);
+extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+					 u64 *no_formal_ino,
+					 unsigned int blktype);
+extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock);
+
+extern int gfs2_inode_refresh(struct gfs2_inode *ip);
+
+extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+				  int is_root);
+extern int gfs2_permission(struct inode *inode, int mask);
+extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
+extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
+extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern int gfs2_open_common(struct inode *inode, struct file *file);
+
+extern const struct inode_operations gfs2_file_iops;
+extern const struct inode_operations gfs2_dir_iops;
+extern const struct inode_operations gfs2_symlink_iops;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
+
+extern void gfs2_set_inode_flags(struct inode *inode);
+ 
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+
+static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
+{
+	return sdp->sd_args.ar_localflocks;
+}
+#else /* Single node only */
+#define gfs2_file_fops gfs2_file_fops_nolock
+#define gfs2_dir_fops gfs2_dir_fops_nolock
+
+static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
+{
+	return 1;
+}
+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
+
+#endif /* __INODE_DOT_H__ */
+
diff --git a/kernel/fs/gfs2/lock_dlm.c b/kernel/fs/gfs2/lock_dlm.c
new file mode 100644
index 000000000..641383a9c
--- /dev/null
+++ b/kernel/fs/gfs2/lock_dlm.c
@@ -0,0 +1,1336 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright 2004-2011 Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/fs.h>
+#include <linux/dlm.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+
+#include "incore.h"
+#include "glock.h"
+#include "util.h"
+#include "sys.h"
+#include "trace_gfs2.h"
+
+extern struct workqueue_struct *gfs2_control_wq;
+
+/**
+ * gfs2_update_stats - Update time based stats
+ * @mv: Pointer to mean/variance structure to update
+ * @sample: New data to include
+ *
+ * @delta is the difference between the current rtt sample and the
+ * running average srtt. We add 1/8 of that to the srtt in order to
+ * update the current srtt estimate. The varience estimate is a bit
+ * more complicated. We subtract the abs value of the @delta from
+ * the current variance estimate and add 1/4 of that to the running
+ * total.
+ *
+ * Note that the index points at the array entry containing the smoothed
+ * mean value, and the variance is always in the following entry
+ *
+ * Reference: TCP/IP Illustrated, vol 2, p. 831,832
+ * All times are in units of integer nanoseconds. Unlike the TCP/IP case,
+ * they are not scaled fixed point.
+ */
+
+static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
+				     s64 sample)
+{
+	s64 delta = sample - s->stats[index];
+	s->stats[index] += (delta >> 3);
+	index++;
+	s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2);
+}
+
+/**
+ * gfs2_update_reply_times - Update locking statistics
+ * @gl: The glock to update
+ *
+ * This assumes that gl->gl_dstamp has been set earlier.
+ *
+ * The rtt (lock round trip time) is an estimate of the time
+ * taken to perform a dlm lock request. We update it on each
+ * reply from the dlm.
+ *
+ * The blocking flag is set on the glock for all dlm requests
+ * which may potentially block due to lock requests from other nodes.
+ * DLM requests where the current lock state is exclusive, the
+ * requested state is null (or unlocked) or where the TRY or
+ * TRY_1CB flags are set are classified as non-blocking. All
+ * other DLM requests are counted as (potentially) blocking.
+ */
+static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
+{
+	struct gfs2_pcpu_lkstats *lks;
+	const unsigned gltype = gl->gl_name.ln_type;
+	unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ?
+			 GFS2_LKS_SRTTB : GFS2_LKS_SRTT;
+	s64 rtt;
+
+	preempt_disable();
+	rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp));
+	lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+	gfs2_update_stats(&gl->gl_stats, index, rtt);		/* Local */
+	gfs2_update_stats(&lks->lkstats[gltype], index, rtt);	/* Global */
+	preempt_enable();
+
+	trace_gfs2_glock_lock_time(gl, rtt);
+}
+
+/**
+ * gfs2_update_request_times - Update locking statistics
+ * @gl: The glock to update
+ *
+ * The irt (lock inter-request times) measures the average time
+ * between requests to the dlm. It is updated immediately before
+ * each dlm call.
+ */
+
+static inline void gfs2_update_request_times(struct gfs2_glock *gl)
+{
+	struct gfs2_pcpu_lkstats *lks;
+	const unsigned gltype = gl->gl_name.ln_type;
+	ktime_t dstamp;
+	s64 irt;
+
+	preempt_disable();
+	dstamp = gl->gl_dstamp;
+	gl->gl_dstamp = ktime_get_real();
+	irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp));
+	lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+	gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt);		/* Local */
+	gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt);	/* Global */
+	preempt_enable();
+}
+ 
+static void gdlm_ast(void *arg)
+{
+	struct gfs2_glock *gl = arg;
+	unsigned ret = gl->gl_state;
+
+	gfs2_update_reply_times(gl);
+	BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
+
+	if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr)
+		memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+
+	switch (gl->gl_lksb.sb_status) {
+	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
+		gfs2_glock_free(gl);
+		return;
+	case -DLM_ECANCEL: /* Cancel while getting lock */
+		ret |= LM_OUT_CANCELED;
+		goto out;
+	case -EAGAIN: /* Try lock fails */
+	case -EDEADLK: /* Deadlock detected */
+		goto out;
+	case -ETIMEDOUT: /* Canceled due to timeout */
+		ret |= LM_OUT_ERROR;
+		goto out;
+	case 0: /* Success */
+		break;
+	default: /* Something unexpected */
+		BUG();
+	}
+
+	ret = gl->gl_req;
+	if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) {
+		if (gl->gl_req == LM_ST_SHARED)
+			ret = LM_ST_DEFERRED;
+		else if (gl->gl_req == LM_ST_DEFERRED)
+			ret = LM_ST_SHARED;
+		else
+			BUG();
+	}
+
+	set_bit(GLF_INITIAL, &gl->gl_flags);
+	gfs2_glock_complete(gl, ret);
+	return;
+out:
+	if (!test_bit(GLF_INITIAL, &gl->gl_flags))
+		gl->gl_lksb.sb_lkid = 0;
+	gfs2_glock_complete(gl, ret);
+}
+
+static void gdlm_bast(void *arg, int mode)
+{
+	struct gfs2_glock *gl = arg;
+
+	switch (mode) {
+	case DLM_LOCK_EX:
+		gfs2_glock_cb(gl, LM_ST_UNLOCKED);
+		break;
+	case DLM_LOCK_CW:
+		gfs2_glock_cb(gl, LM_ST_DEFERRED);
+		break;
+	case DLM_LOCK_PR:
+		gfs2_glock_cb(gl, LM_ST_SHARED);
+		break;
+	default:
+		pr_err("unknown bast mode %d\n", mode);
+		BUG();
+	}
+}
+
+/* convert gfs lock-state to dlm lock-mode */
+
+static int make_mode(const unsigned int lmstate)
+{
+	switch (lmstate) {
+	case LM_ST_UNLOCKED:
+		return DLM_LOCK_NL;
+	case LM_ST_EXCLUSIVE:
+		return DLM_LOCK_EX;
+	case LM_ST_DEFERRED:
+		return DLM_LOCK_CW;
+	case LM_ST_SHARED:
+		return DLM_LOCK_PR;
+	}
+	pr_err("unknown LM state %d\n", lmstate);
+	BUG();
+	return -1;
+}
+
+static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
+		      const int req)
+{
+	u32 lkf = 0;
+
+	if (gl->gl_lksb.sb_lvbptr)
+		lkf |= DLM_LKF_VALBLK;
+
+	if (gfs_flags & LM_FLAG_TRY)
+		lkf |= DLM_LKF_NOQUEUE;
+
+	if (gfs_flags & LM_FLAG_TRY_1CB) {
+		lkf |= DLM_LKF_NOQUEUE;
+		lkf |= DLM_LKF_NOQUEUEBAST;
+	}
+
+	if (gfs_flags & LM_FLAG_PRIORITY) {
+		lkf |= DLM_LKF_NOORDER;
+		lkf |= DLM_LKF_HEADQUE;
+	}
+
+	if (gfs_flags & LM_FLAG_ANY) {
+		if (req == DLM_LOCK_PR)
+			lkf |= DLM_LKF_ALTCW;
+		else if (req == DLM_LOCK_CW)
+			lkf |= DLM_LKF_ALTPR;
+		else
+			BUG();
+	}
+
+	if (gl->gl_lksb.sb_lkid != 0) {
+		lkf |= DLM_LKF_CONVERT;
+		if (test_bit(GLF_BLOCKING, &gl->gl_flags))
+			lkf |= DLM_LKF_QUECVT;
+	}
+
+	return lkf;
+}
+
+static void gfs2_reverse_hex(char *c, u64 value)
+{
+	*c = '0';
+	while (value) {
+		*c-- = hex_asc[value & 0x0f];
+		value >>= 4;
+	}
+}
+
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
+		     unsigned int flags)
+{
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	int req;
+	u32 lkf;
+	char strname[GDLM_STRNAME_BYTES] = "";
+
+	req = make_mode(req_state);
+	lkf = make_flags(gl, flags, req);
+	gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
+	gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
+	if (gl->gl_lksb.sb_lkid) {
+		gfs2_update_request_times(gl);
+	} else {
+		memset(strname, ' ', GDLM_STRNAME_BYTES - 1);
+		strname[GDLM_STRNAME_BYTES - 1] = '\0';
+		gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type);
+		gfs2_reverse_hex(strname + 23, gl->gl_name.ln_number);
+		gl->gl_dstamp = ktime_get_real();
+	}
+	/*
+	 * Submit the actual lock request.
+	 */
+
+	return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
+			GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+}
+
+static void gdlm_put_lock(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int lvb_needs_unlock = 0;
+	int error;
+
+	if (gl->gl_lksb.sb_lkid == 0) {
+		gfs2_glock_free(gl);
+		return;
+	}
+
+	clear_bit(GLF_BLOCKING, &gl->gl_flags);
+	gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
+	gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
+	gfs2_update_request_times(gl);
+
+	/* don't want to skip dlm_unlock writing the lvb when lock is ex */
+
+	if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE))
+		lvb_needs_unlock = 1;
+
+	if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
+	    !lvb_needs_unlock) {
+		gfs2_glock_free(gl);
+		return;
+	}
+
+	error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
+			   NULL, gl);
+	if (error) {
+		pr_err("gdlm_unlock %x,%llx err=%d\n",
+		       gl->gl_name.ln_type,
+		       (unsigned long long)gl->gl_name.ln_number, error);
+		return;
+	}
+}
+
+static void gdlm_cancel(struct gfs2_glock *gl)
+{
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
+}
+
+/*
+ * dlm/gfs2 recovery coordination using dlm_recover callbacks
+ *
+ *  1. dlm_controld sees lockspace members change
+ *  2. dlm_controld blocks dlm-kernel locking activity
+ *  3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
+ *  4. dlm_controld starts and finishes its own user level recovery
+ *  5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery
+ *  6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot)
+ *  7. dlm_recoverd does its own lock recovery
+ *  8. dlm_recoverd unblocks dlm-kernel locking activity
+ *  9. dlm_recoverd notifies gfs2 when done (recover_done with new generation)
+ * 10. gfs2_control updates control_lock lvb with new generation and jid bits
+ * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none)
+ * 12. gfs2_recover dequeues and recovers journals of failed nodes
+ * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result)
+ * 14. gfs2_control updates control_lock lvb jid bits for recovered journals
+ * 15. gfs2_control unblocks normal locking when all journals are recovered
+ *
+ * - failures during recovery
+ *
+ * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control
+ * clears BLOCK_LOCKS (step 15), e.g. another node fails while still
+ * recovering for a prior failure.  gfs2_control needs a way to detect
+ * this so it can leave BLOCK_LOCKS set in step 15.  This is managed using
+ * the recover_block and recover_start values.
+ *
+ * recover_done() provides a new lockspace generation number each time it
+ * is called (step 9).  This generation number is saved as recover_start.
+ * When recover_prep() is called, it sets BLOCK_LOCKS and sets
+ * recover_block = recover_start.  So, while recover_block is equal to
+ * recover_start, BLOCK_LOCKS should remain set.  (recover_spin must
+ * be held around the BLOCK_LOCKS/recover_block/recover_start logic.)
+ *
+ * - more specific gfs2 steps in sequence above
+ *
+ *  3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start
+ *  6. recover_slot records any failed jids (maybe none)
+ *  9. recover_done sets recover_start = new generation number
+ * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids
+ * 12. gfs2_recover does journal recoveries for failed jids identified above
+ * 14. gfs2_control clears control_lock lvb bits for recovered jids
+ * 15. gfs2_control checks if recover_block == recover_start (step 3 occured
+ *     again) then do nothing, otherwise if recover_start > recover_block
+ *     then clear BLOCK_LOCKS.
+ *
+ * - parallel recovery steps across all nodes
+ *
+ * All nodes attempt to update the control_lock lvb with the new generation
+ * number and jid bits, but only the first to get the control_lock EX will
+ * do so; others will see that it's already done (lvb already contains new
+ * generation number.)
+ *
+ * . All nodes get the same recover_prep/recover_slot/recover_done callbacks
+ * . All nodes attempt to set control_lock lvb gen + bits for the new gen
+ * . One node gets control_lock first and writes the lvb, others see it's done
+ * . All nodes attempt to recover jids for which they see control_lock bits set
+ * . One node succeeds for a jid, and that one clears the jid bit in the lvb
+ * . All nodes will eventually see all lvb bits clear and unblock locks
+ *
+ * - is there a problem with clearing an lvb bit that should be set
+ *   and missing a journal recovery?
+ *
+ * 1. jid fails
+ * 2. lvb bit set for step 1
+ * 3. jid recovered for step 1
+ * 4. jid taken again (new mount)
+ * 5. jid fails (for step 4)
+ * 6. lvb bit set for step 5 (will already be set)
+ * 7. lvb bit cleared for step 3
+ *
+ * This is not a problem because the failure in step 5 does not
+ * require recovery, because the mount in step 4 could not have
+ * progressed far enough to unblock locks and access the fs.  The
+ * control_mount() function waits for all recoveries to be complete
+ * for the latest lockspace generation before ever unblocking locks
+ * and returning.  The mount in step 4 waits until the recovery in
+ * step 1 is done.
+ *
+ * - special case of first mounter: first node to mount the fs
+ *
+ * The first node to mount a gfs2 fs needs to check all the journals
+ * and recover any that need recovery before other nodes are allowed
+ * to mount the fs.  (Others may begin mounting, but they must wait
+ * for the first mounter to be done before taking locks on the fs
+ * or accessing the fs.)  This has two parts:
+ *
+ * 1. The mounted_lock tells a node it's the first to mount the fs.
+ * Each node holds the mounted_lock in PR while it's mounted.
+ * Each node tries to acquire the mounted_lock in EX when it mounts.
+ * If a node is granted the mounted_lock EX it means there are no
+ * other mounted nodes (no PR locks exist), and it is the first mounter.
+ * The mounted_lock is demoted to PR when first recovery is done, so
+ * others will fail to get an EX lock, but will get a PR lock.
+ *
+ * 2. The control_lock blocks others in control_mount() while the first
+ * mounter is doing first mount recovery of all journals.
+ * A mounting node needs to acquire control_lock in EX mode before
+ * it can proceed.  The first mounter holds control_lock in EX while doing
+ * the first mount recovery, blocking mounts from other nodes, then demotes
+ * control_lock to NL when it's done (others_may_mount/first_done),
+ * allowing other nodes to continue mounting.
+ *
+ * first mounter:
+ * control_lock EX/NOQUEUE success
+ * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters)
+ * set first=1
+ * do first mounter recovery
+ * mounted_lock EX->PR
+ * control_lock EX->NL, write lvb generation
+ *
+ * other mounter:
+ * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry)
+ * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR)
+ * mounted_lock PR/NOQUEUE success
+ * read lvb generation
+ * control_lock EX->NL
+ * set first=0
+ *
+ * - mount during recovery
+ *
+ * If a node mounts while others are doing recovery (not first mounter),
+ * the mounting node will get its initial recover_done() callback without
+ * having seen any previous failures/callbacks.
+ *
+ * It must wait for all recoveries preceding its mount to be finished
+ * before it unblocks locks.  It does this by repeating the "other mounter"
+ * steps above until the lvb generation number is >= its mount generation
+ * number (from initial recover_done) and all lvb bits are clear.
+ *
+ * - control_lock lvb format
+ *
+ * 4 bytes generation number: the latest dlm lockspace generation number
+ * from recover_done callback.  Indicates the jid bitmap has been updated
+ * to reflect all slot failures through that generation.
+ * 4 bytes unused.
+ * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates
+ * that jid N needs recovery.
+ */
+
+#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
+
+static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
+			     char *lvb_bits)
+{
+	__le32 gen;
+	memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
+	memcpy(&gen, lvb_bits, sizeof(__le32));
+	*lvb_gen = le32_to_cpu(gen);
+}
+
+static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
+			      char *lvb_bits)
+{
+	__le32 gen;
+	memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
+	gen = cpu_to_le32(lvb_gen);
+	memcpy(ls->ls_control_lvb, &gen, sizeof(__le32));
+}
+
+static int all_jid_bits_clear(char *lvb)
+{
+	return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0,
+			GDLM_LVB_SIZE - JID_BITMAP_OFFSET);
+}
+
+static void sync_wait_cb(void *arg)
+{
+	struct lm_lockstruct *ls = arg;
+	complete(&ls->ls_sync_wait);
+}
+
+static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int error;
+
+	error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+	if (error) {
+		fs_err(sdp, "%s lkid %x error %d\n",
+		       name, lksb->sb_lkid, error);
+		return error;
+	}
+
+	wait_for_completion(&ls->ls_sync_wait);
+
+	if (lksb->sb_status != -DLM_EUNLOCK) {
+		fs_err(sdp, "%s lkid %x status %d\n",
+		       name, lksb->sb_lkid, lksb->sb_status);
+		return -1;
+	}
+	return 0;
+}
+
+static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
+		     unsigned int num, struct dlm_lksb *lksb, char *name)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	char strname[GDLM_STRNAME_BYTES];
+	int error, status;
+
+	memset(strname, 0, GDLM_STRNAME_BYTES);
+	snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
+
+	error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
+			 strname, GDLM_STRNAME_BYTES - 1,
+			 0, sync_wait_cb, ls, NULL);
+	if (error) {
+		fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
+		       name, lksb->sb_lkid, flags, mode, error);
+		return error;
+	}
+
+	wait_for_completion(&ls->ls_sync_wait);
+
+	status = lksb->sb_status;
+
+	if (status && status != -EAGAIN) {
+		fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
+		       name, lksb->sb_lkid, flags, mode, status);
+	}
+
+	return status;
+}
+
+static int mounted_unlock(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock");
+}
+
+static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK,
+			 &ls->ls_mounted_lksb, "mounted_lock");
+}
+
+static int control_unlock(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock");
+}
+
+static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK,
+			 &ls->ls_control_lksb, "control_lock");
+}
+
+static void gfs2_control_func(struct work_struct *work)
+{
+	struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	uint32_t block_gen, start_gen, lvb_gen, flags;
+	int recover_set = 0;
+	int write_lvb = 0;
+	int recover_size;
+	int i, error;
+
+	spin_lock(&ls->ls_recover_spin);
+	/*
+	 * No MOUNT_DONE means we're still mounting; control_mount()
+	 * will set this flag, after which this thread will take over
+	 * all further clearing of BLOCK_LOCKS.
+	 *
+	 * FIRST_MOUNT means this node is doing first mounter recovery,
+	 * for which recovery control is handled by
+	 * control_mount()/control_first_done(), not this thread.
+	 */
+	if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+	     test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+		spin_unlock(&ls->ls_recover_spin);
+		return;
+	}
+	block_gen = ls->ls_recover_block;
+	start_gen = ls->ls_recover_start;
+	spin_unlock(&ls->ls_recover_spin);
+
+	/*
+	 * Equal block_gen and start_gen implies we are between
+	 * recover_prep and recover_done callbacks, which means
+	 * dlm recovery is in progress and dlm locking is blocked.
+	 * There's no point trying to do any work until recover_done.
+	 */
+
+	if (block_gen == start_gen)
+		return;
+
+	/*
+	 * Propagate recover_submit[] and recover_result[] to lvb:
+	 * dlm_recoverd adds to recover_submit[] jids needing recovery
+	 * gfs2_recover adds to recover_result[] journal recovery results
+	 *
+	 * set lvb bit for jids in recover_submit[] if the lvb has not
+	 * yet been updated for the generation of the failure
+	 *
+	 * clear lvb bit for jids in recover_result[] if the result of
+	 * the journal recovery is SUCCESS
+	 */
+
+	error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+	if (error) {
+		fs_err(sdp, "control lock EX error %d\n", error);
+		return;
+	}
+
+	control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits);
+
+	spin_lock(&ls->ls_recover_spin);
+	if (block_gen != ls->ls_recover_block ||
+	    start_gen != ls->ls_recover_start) {
+		fs_info(sdp, "recover generation %u block1 %u %u\n",
+			start_gen, block_gen, ls->ls_recover_block);
+		spin_unlock(&ls->ls_recover_spin);
+		control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+		return;
+	}
+
+	recover_size = ls->ls_recover_size;
+
+	if (lvb_gen <= start_gen) {
+		/*
+		 * Clear lvb bits for jids we've successfully recovered.
+		 * Because all nodes attempt to recover failed journals,
+		 * a journal can be recovered multiple times successfully
+		 * in succession.  Only the first will really do recovery,
+		 * the others find it clean, but still report a successful
+		 * recovery.  So, another node may have already recovered
+		 * the jid and cleared the lvb bit for it.
+		 */
+		for (i = 0; i < recover_size; i++) {
+			if (ls->ls_recover_result[i] != LM_RD_SUCCESS)
+				continue;
+
+			ls->ls_recover_result[i] = 0;
+
+			if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET))
+				continue;
+
+			__clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);
+			write_lvb = 1;
+		}
+	}
+
+	if (lvb_gen == start_gen) {
+		/*
+		 * Failed slots before start_gen are already set in lvb.
+		 */
+		for (i = 0; i < recover_size; i++) {
+			if (!ls->ls_recover_submit[i])
+				continue;
+			if (ls->ls_recover_submit[i] < lvb_gen)
+				ls->ls_recover_submit[i] = 0;
+		}
+	} else if (lvb_gen < start_gen) {
+		/*
+		 * Failed slots before start_gen are not yet set in lvb.
+		 */
+		for (i = 0; i < recover_size; i++) {
+			if (!ls->ls_recover_submit[i])
+				continue;
+			if (ls->ls_recover_submit[i] < start_gen) {
+				ls->ls_recover_submit[i] = 0;
+				__set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);
+			}
+		}
+		/* even if there are no bits to set, we need to write the
+		   latest generation to the lvb */
+		write_lvb = 1;
+	} else {
+		/*
+		 * we should be getting a recover_done() for lvb_gen soon
+		 */
+	}
+	spin_unlock(&ls->ls_recover_spin);
+
+	if (write_lvb) {
+		control_lvb_write(ls, start_gen, ls->ls_lvb_bits);
+		flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
+	} else {
+		flags = DLM_LKF_CONVERT;
+	}
+
+	error = control_lock(sdp, DLM_LOCK_NL, flags);
+	if (error) {
+		fs_err(sdp, "control lock NL error %d\n", error);
+		return;
+	}
+
+	/*
+	 * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
+	 * and clear a jid bit in the lvb if the recovery is a success.
+	 * Eventually all journals will be recovered, all jid bits will
+	 * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
+	 */
+
+	for (i = 0; i < recover_size; i++) {
+		if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) {
+			fs_info(sdp, "recover generation %u jid %d\n",
+				start_gen, i);
+			gfs2_recover_set(sdp, i);
+			recover_set++;
+		}
+	}
+	if (recover_set)
+		return;
+
+	/*
+	 * No more jid bits set in lvb, all recovery is done, unblock locks
+	 * (unless a new recover_prep callback has occured blocking locks
+	 * again while working above)
+	 */
+
+	spin_lock(&ls->ls_recover_spin);
+	if (ls->ls_recover_block == block_gen &&
+	    ls->ls_recover_start == start_gen) {
+		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+		spin_unlock(&ls->ls_recover_spin);
+		fs_info(sdp, "recover generation %u done\n", start_gen);
+		gfs2_glock_thaw(sdp);
+	} else {
+		fs_info(sdp, "recover generation %u block2 %u %u\n",
+			start_gen, block_gen, ls->ls_recover_block);
+		spin_unlock(&ls->ls_recover_spin);
+	}
+}
+
+static int control_mount(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	uint32_t start_gen, block_gen, mount_gen, lvb_gen;
+	int mounted_mode;
+	int retries = 0;
+	int error;
+
+	memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
+	memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
+	memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
+	ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
+	init_completion(&ls->ls_sync_wait);
+
+	set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+
+	error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
+	if (error) {
+		fs_err(sdp, "control_mount control_lock NL error %d\n", error);
+		return error;
+	}
+
+	error = mounted_lock(sdp, DLM_LOCK_NL, 0);
+	if (error) {
+		fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
+		control_unlock(sdp);
+		return error;
+	}
+	mounted_mode = DLM_LOCK_NL;
+
+restart:
+	if (retries++ && signal_pending(current)) {
+		error = -EINTR;
+		goto fail;
+	}
+
+	/*
+	 * We always start with both locks in NL. control_lock is
+	 * demoted to NL below so we don't need to do it here.
+	 */
+
+	if (mounted_mode != DLM_LOCK_NL) {
+		error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+		if (error)
+			goto fail;
+		mounted_mode = DLM_LOCK_NL;
+	}
+
+	/*
+	 * Other nodes need to do some work in dlm recovery and gfs2_control
+	 * before the recover_done and control_lock will be ready for us below.
+	 * A delay here is not required but often avoids having to retry.
+	 */
+
+	msleep_interruptible(500);
+
+	/*
+	 * Acquire control_lock in EX and mounted_lock in either EX or PR.
+	 * control_lock lvb keeps track of any pending journal recoveries.
+	 * mounted_lock indicates if any other nodes have the fs mounted.
+	 */
+
+	error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK);
+	if (error == -EAGAIN) {
+		goto restart;
+	} else if (error) {
+		fs_err(sdp, "control_mount control_lock EX error %d\n", error);
+		goto fail;
+	}
+
+	error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
+	if (!error) {
+		mounted_mode = DLM_LOCK_EX;
+		goto locks_done;
+	} else if (error != -EAGAIN) {
+		fs_err(sdp, "control_mount mounted_lock EX error %d\n", error);
+		goto fail;
+	}
+
+	error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
+	if (!error) {
+		mounted_mode = DLM_LOCK_PR;
+		goto locks_done;
+	} else {
+		/* not even -EAGAIN should happen here */
+		fs_err(sdp, "control_mount mounted_lock PR error %d\n", error);
+		goto fail;
+	}
+
+locks_done:
+	/*
+	 * If we got both locks above in EX, then we're the first mounter.
+	 * If not, then we need to wait for the control_lock lvb to be
+	 * updated by other mounted nodes to reflect our mount generation.
+	 *
+	 * In simple first mounter cases, first mounter will see zero lvb_gen,
+	 * but in cases where all existing nodes leave/fail before mounting
+	 * nodes finish control_mount, then all nodes will be mounting and
+	 * lvb_gen will be non-zero.
+	 */
+
+	control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits);
+
+	if (lvb_gen == 0xFFFFFFFF) {
+		/* special value to force mount attempts to fail */
+		fs_err(sdp, "control_mount control_lock disabled\n");
+		error = -EINVAL;
+		goto fail;
+	}
+
+	if (mounted_mode == DLM_LOCK_EX) {
+		/* first mounter, keep both EX while doing first recovery */
+		spin_lock(&ls->ls_recover_spin);
+		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+		set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
+		set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+		spin_unlock(&ls->ls_recover_spin);
+		fs_info(sdp, "first mounter control generation %u\n", lvb_gen);
+		return 0;
+	}
+
+	error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+	if (error)
+		goto fail;
+
+	/*
+	 * We are not first mounter, now we need to wait for the control_lock
+	 * lvb generation to be >= the generation from our first recover_done
+	 * and all lvb bits to be clear (no pending journal recoveries.)
+	 */
+
+	if (!all_jid_bits_clear(ls->ls_lvb_bits)) {
+		/* journals need recovery, wait until all are clear */
+		fs_info(sdp, "control_mount wait for journal recovery\n");
+		goto restart;
+	}
+
+	spin_lock(&ls->ls_recover_spin);
+	block_gen = ls->ls_recover_block;
+	start_gen = ls->ls_recover_start;
+	mount_gen = ls->ls_recover_mount;
+
+	if (lvb_gen < mount_gen) {
+		/* wait for mounted nodes to update control_lock lvb to our
+		   generation, which might include new recovery bits set */
+		fs_info(sdp, "control_mount wait1 block %u start %u mount %u "
+			"lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+			lvb_gen, ls->ls_recover_flags);
+		spin_unlock(&ls->ls_recover_spin);
+		goto restart;
+	}
+
+	if (lvb_gen != start_gen) {
+		/* wait for mounted nodes to update control_lock lvb to the
+		   latest recovery generation */
+		fs_info(sdp, "control_mount wait2 block %u start %u mount %u "
+			"lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+			lvb_gen, ls->ls_recover_flags);
+		spin_unlock(&ls->ls_recover_spin);
+		goto restart;
+	}
+
+	if (block_gen == start_gen) {
+		/* dlm recovery in progress, wait for it to finish */
+		fs_info(sdp, "control_mount wait3 block %u start %u mount %u "
+			"lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+			lvb_gen, ls->ls_recover_flags);
+		spin_unlock(&ls->ls_recover_spin);
+		goto restart;
+	}
+
+	clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+	set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
+	memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
+	memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
+	spin_unlock(&ls->ls_recover_spin);
+	return 0;
+
+fail:
+	mounted_unlock(sdp);
+	control_unlock(sdp);
+	return error;
+}
+
+static int control_first_done(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	uint32_t start_gen, block_gen;
+	int error;
+
+restart:
+	spin_lock(&ls->ls_recover_spin);
+	start_gen = ls->ls_recover_start;
+	block_gen = ls->ls_recover_block;
+
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) ||
+	    !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+	    !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+		/* sanity check, should not happen */
+		fs_err(sdp, "control_first_done start %u block %u flags %lx\n",
+		       start_gen, block_gen, ls->ls_recover_flags);
+		spin_unlock(&ls->ls_recover_spin);
+		control_unlock(sdp);
+		return -1;
+	}
+
+	if (start_gen == block_gen) {
+		/*
+		 * Wait for the end of a dlm recovery cycle to switch from
+		 * first mounter recovery.  We can ignore any recover_slot
+		 * callbacks between the recover_prep and next recover_done
+		 * because we are still the first mounter and any failed nodes
+		 * have not fully mounted, so they don't need recovery.
+		 */
+		spin_unlock(&ls->ls_recover_spin);
+		fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
+
+		wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
+			    TASK_UNINTERRUPTIBLE);
+		goto restart;
+	}
+
+	clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+	set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags);
+	memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
+	memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
+	spin_unlock(&ls->ls_recover_spin);
+
+	memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE);
+	control_lvb_write(ls, start_gen, ls->ls_lvb_bits);
+
+	error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
+	if (error)
+		fs_err(sdp, "control_first_done mounted PR error %d\n", error);
+
+	error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+	if (error)
+		fs_err(sdp, "control_first_done control NL error %d\n", error);
+
+	return error;
+}
+
+/*
+ * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
+ * to accomodate the largest slot number.  (NB dlm slot numbers start at 1,
+ * gfs2 jids start at 0, so jid = slot - 1)
+ */
+
+#define RECOVER_SIZE_INC 16
+
+static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
+			    int num_slots)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	uint32_t *submit = NULL;
+	uint32_t *result = NULL;
+	uint32_t old_size, new_size;
+	int i, max_jid;
+
+	if (!ls->ls_lvb_bits) {
+		ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
+		if (!ls->ls_lvb_bits)
+			return -ENOMEM;
+	}
+
+	max_jid = 0;
+	for (i = 0; i < num_slots; i++) {
+		if (max_jid < slots[i].slot - 1)
+			max_jid = slots[i].slot - 1;
+	}
+
+	old_size = ls->ls_recover_size;
+
+	if (old_size >= max_jid + 1)
+		return 0;
+
+	new_size = old_size + RECOVER_SIZE_INC;
+
+	submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
+	result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
+	if (!submit || !result) {
+		kfree(submit);
+		kfree(result);
+		return -ENOMEM;
+	}
+
+	spin_lock(&ls->ls_recover_spin);
+	memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t));
+	memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t));
+	kfree(ls->ls_recover_submit);
+	kfree(ls->ls_recover_result);
+	ls->ls_recover_submit = submit;
+	ls->ls_recover_result = result;
+	ls->ls_recover_size = new_size;
+	spin_unlock(&ls->ls_recover_spin);
+	return 0;
+}
+
+static void free_recover_size(struct lm_lockstruct *ls)
+{
+	kfree(ls->ls_lvb_bits);
+	kfree(ls->ls_recover_submit);
+	kfree(ls->ls_recover_result);
+	ls->ls_recover_submit = NULL;
+	ls->ls_recover_result = NULL;
+	ls->ls_recover_size = 0;
+}
+
+/* dlm calls before it does lock recovery */
+
+static void gdlm_recover_prep(void *arg)
+{
+	struct gfs2_sbd *sdp = arg;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	spin_lock(&ls->ls_recover_spin);
+	ls->ls_recover_block = ls->ls_recover_start;
+	set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
+
+	if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+	     test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+		spin_unlock(&ls->ls_recover_spin);
+		return;
+	}
+	set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+	spin_unlock(&ls->ls_recover_spin);
+}
+
+/* dlm calls after recover_prep has been completed on all lockspace members;
+   identifies slot/jid of failed member */
+
+static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
+{
+	struct gfs2_sbd *sdp = arg;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int jid = slot->slot - 1;
+
+	spin_lock(&ls->ls_recover_spin);
+	if (ls->ls_recover_size < jid + 1) {
+		fs_err(sdp, "recover_slot jid %d gen %u short size %d",
+		       jid, ls->ls_recover_block, ls->ls_recover_size);
+		spin_unlock(&ls->ls_recover_spin);
+		return;
+	}
+
+	if (ls->ls_recover_submit[jid]) {
+		fs_info(sdp, "recover_slot jid %d gen %u prev %u\n",
+			jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
+	}
+	ls->ls_recover_submit[jid] = ls->ls_recover_block;
+	spin_unlock(&ls->ls_recover_spin);
+}
+
+/* dlm calls after recover_slot and after it completes lock recovery */
+
+static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
+			      int our_slot, uint32_t generation)
+{
+	struct gfs2_sbd *sdp = arg;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	/* ensure the ls jid arrays are large enough */
+	set_recover_size(sdp, slots, num_slots);
+
+	spin_lock(&ls->ls_recover_spin);
+	ls->ls_recover_start = generation;
+
+	if (!ls->ls_recover_mount) {
+		ls->ls_recover_mount = generation;
+		ls->ls_jid = our_slot - 1;
+	}
+
+	if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
+		queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
+
+	clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
+	spin_unlock(&ls->ls_recover_spin);
+}
+
+/* gfs2_recover thread has a journal recovery result */
+
+static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
+				 unsigned int result)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+		return;
+
+	/* don't care about the recovery of own journal during mount */
+	if (jid == ls->ls_jid)
+		return;
+
+	spin_lock(&ls->ls_recover_spin);
+	if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+		spin_unlock(&ls->ls_recover_spin);
+		return;
+	}
+	if (ls->ls_recover_size < jid + 1) {
+		fs_err(sdp, "recovery_result jid %d short size %d",
+		       jid, ls->ls_recover_size);
+		spin_unlock(&ls->ls_recover_spin);
+		return;
+	}
+
+	fs_info(sdp, "recover jid %d result %s\n", jid,
+		result == LM_RD_GAVEUP ? "busy" : "success");
+
+	ls->ls_recover_result[jid] = result;
+
+	/* GAVEUP means another node is recovering the journal; delay our
+	   next attempt to recover it, to give the other node a chance to
+	   finish before trying again */
+
+	if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
+		queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work,
+				   result == LM_RD_GAVEUP ? HZ : 0);
+	spin_unlock(&ls->ls_recover_spin);
+}
+
+const struct dlm_lockspace_ops gdlm_lockspace_ops = {
+	.recover_prep = gdlm_recover_prep,
+	.recover_slot = gdlm_recover_slot,
+	.recover_done = gdlm_recover_done,
+};
+
+static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	char cluster[GFS2_LOCKNAME_LEN];
+	const char *fsname;
+	uint32_t flags;
+	int error, ops_result;
+
+	/*
+	 * initialize everything
+	 */
+
+	INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
+	spin_lock_init(&ls->ls_recover_spin);
+	ls->ls_recover_flags = 0;
+	ls->ls_recover_mount = 0;
+	ls->ls_recover_start = 0;
+	ls->ls_recover_block = 0;
+	ls->ls_recover_size = 0;
+	ls->ls_recover_submit = NULL;
+	ls->ls_recover_result = NULL;
+	ls->ls_lvb_bits = NULL;
+
+	error = set_recover_size(sdp, NULL, 0);
+	if (error)
+		goto fail;
+
+	/*
+	 * prepare dlm_new_lockspace args
+	 */
+
+	fsname = strchr(table, ':');
+	if (!fsname) {
+		fs_info(sdp, "no fsname found\n");
+		error = -EINVAL;
+		goto fail_free;
+	}
+	memset(cluster, 0, sizeof(cluster));
+	memcpy(cluster, table, strlen(table) - strlen(fsname));
+	fsname++;
+
+	flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
+
+	/*
+	 * create/join lockspace
+	 */
+
+	error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
+				  &gdlm_lockspace_ops, sdp, &ops_result,
+				  &ls->ls_dlm);
+	if (error) {
+		fs_err(sdp, "dlm_new_lockspace error %d\n", error);
+		goto fail_free;
+	}
+
+	if (ops_result < 0) {
+		/*
+		 * dlm does not support ops callbacks,
+		 * old dlm_controld/gfs_controld are used, try without ops.
+		 */
+		fs_info(sdp, "dlm lockspace ops not used\n");
+		free_recover_size(ls);
+		set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags);
+		return 0;
+	}
+
+	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
+		fs_err(sdp, "dlm lockspace ops disallow jid preset\n");
+		error = -EINVAL;
+		goto fail_release;
+	}
+
+	/*
+	 * control_mount() uses control_lock to determine first mounter,
+	 * and for later mounts, waits for any recoveries to be cleared.
+	 */
+
+	error = control_mount(sdp);
+	if (error) {
+		fs_err(sdp, "mount control error %d\n", error);
+		goto fail_release;
+	}
+
+	ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+	return 0;
+
+fail_release:
+	dlm_release_lockspace(ls->ls_dlm, 2);
+fail_free:
+	free_recover_size(ls);
+fail:
+	return error;
+}
+
+static void gdlm_first_done(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int error;
+
+	if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+		return;
+
+	error = control_first_done(sdp);
+	if (error)
+		fs_err(sdp, "mount first_done error %d\n", error);
+}
+
+static void gdlm_unmount(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+		goto release;
+
+	/* wait for gfs2_control_wq to be done with this mount */
+
+	spin_lock(&ls->ls_recover_spin);
+	set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
+	spin_unlock(&ls->ls_recover_spin);
+	flush_delayed_work(&sdp->sd_control_work);
+
+	/* mounted_lock and control_lock will be purged in dlm recovery */
+release:
+	if (ls->ls_dlm) {
+		dlm_release_lockspace(ls->ls_dlm, 2);
+		ls->ls_dlm = NULL;
+	}
+
+	free_recover_size(ls);
+}
+
+static const match_table_t dlm_tokens = {
+	{ Opt_jid, "jid=%d"},
+	{ Opt_id, "id=%d"},
+	{ Opt_first, "first=%d"},
+	{ Opt_nodir, "nodir=%d"},
+	{ Opt_err, NULL },
+};
+
+const struct lm_lockops gfs2_dlm_ops = {
+	.lm_proto_name = "lock_dlm",
+	.lm_mount = gdlm_mount,
+	.lm_first_done = gdlm_first_done,
+	.lm_recovery_result = gdlm_recovery_result,
+	.lm_unmount = gdlm_unmount,
+	.lm_put_lock = gdlm_put_lock,
+	.lm_lock = gdlm_lock,
+	.lm_cancel = gdlm_cancel,
+	.lm_tokens = &dlm_tokens,
+};
+
diff --git a/kernel/fs/gfs2/log.c b/kernel/fs/gfs2/log.c
new file mode 100644
index 000000000..536e7a625
--- /dev/null
+++ b/kernel/fs/gfs2/log.c
@@ -0,0 +1,950 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/list_sort.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "util.h"
+#include "dir.h"
+#include "trace_gfs2.h"
+
+/**
+ * gfs2_struct2blk - compute stuff
+ * @sdp: the filesystem
+ * @nstruct: the number of structures
+ * @ssize: the size of the structures
+ *
+ * Compute the number of log descriptor blocks needed to hold a certain number
+ * of structures of a certain size.
+ *
+ * Returns: the number of blocks needed (minimum is always 1)
+ */
+
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+			     unsigned int ssize)
+{
+	unsigned int blks;
+	unsigned int first, second;
+
+	blks = 1;
+	first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize;
+
+	if (nstruct > first) {
+		second = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_meta_header)) / ssize;
+		blks += DIV_ROUND_UP(nstruct - first, second);
+	}
+
+	return blks;
+}
+
+/**
+ * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters
+ * @mapping: The associated mapping (maybe NULL)
+ * @bd: The gfs2_bufdata to remove
+ *
+ * The ail lock _must_ be held when calling this function
+ *
+ */
+
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
+{
+	bd->bd_tr = NULL;
+	list_del_init(&bd->bd_ail_st_list);
+	list_del_init(&bd->bd_ail_gl_list);
+	atomic_dec(&bd->bd_gl->gl_ail_count);
+	brelse(bd->bd_bh);
+}
+
+/**
+ * gfs2_ail1_start_one - Start I/O on a part of the AIL
+ * @sdp: the filesystem
+ * @wbc: The writeback control structure
+ * @ai: The ail structure
+ *
+ */
+
+static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
+			       struct writeback_control *wbc,
+			       struct gfs2_trans *tr)
+__releases(&sdp->sd_ail_lock)
+__acquires(&sdp->sd_ail_lock)
+{
+	struct gfs2_glock *gl = NULL;
+	struct address_space *mapping;
+	struct gfs2_bufdata *bd, *s;
+	struct buffer_head *bh;
+
+	list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) {
+		bh = bd->bd_bh;
+
+		gfs2_assert(sdp, bd->bd_tr == tr);
+
+		if (!buffer_busy(bh)) {
+			if (!buffer_uptodate(bh))
+				gfs2_io_error_bh(sdp, bh);
+			list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
+			continue;
+		}
+
+		if (!buffer_dirty(bh))
+			continue;
+		if (gl == bd->bd_gl)
+			continue;
+		gl = bd->bd_gl;
+		list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list);
+		mapping = bh->b_page->mapping;
+		if (!mapping)
+			continue;
+		spin_unlock(&sdp->sd_ail_lock);
+		generic_writepages(mapping, wbc);
+		spin_lock(&sdp->sd_ail_lock);
+		if (wbc->nr_to_write <= 0)
+			break;
+		return 1;
+	}
+
+	return 0;
+}
+
+
+/**
+ * gfs2_ail1_flush - start writeback of some ail1 entries 
+ * @sdp: The super block
+ * @wbc: The writeback control structure
+ *
+ * Writes back some ail1 entries, according to the limits in the
+ * writeback control structure
+ */
+
+void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
+{
+	struct list_head *head = &sdp->sd_ail1_list;
+	struct gfs2_trans *tr;
+	struct blk_plug plug;
+
+	trace_gfs2_ail_flush(sdp, wbc, 1);
+	blk_start_plug(&plug);
+	spin_lock(&sdp->sd_ail_lock);
+restart:
+	list_for_each_entry_reverse(tr, head, tr_list) {
+		if (wbc->nr_to_write <= 0)
+			break;
+		if (gfs2_ail1_start_one(sdp, wbc, tr))
+			goto restart;
+	}
+	spin_unlock(&sdp->sd_ail_lock);
+	blk_finish_plug(&plug);
+	trace_gfs2_ail_flush(sdp, wbc, 0);
+}
+
+/**
+ * gfs2_ail1_start - start writeback of all ail1 entries
+ * @sdp: The superblock
+ */
+
+static void gfs2_ail1_start(struct gfs2_sbd *sdp)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = LONG_MAX,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
+
+	return gfs2_ail1_flush(sdp, &wbc);
+}
+
+/**
+ * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
+ * @sdp: the filesystem
+ * @ai: the AIL entry
+ *
+ */
+
+static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	struct gfs2_bufdata *bd, *s;
+	struct buffer_head *bh;
+
+	list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list,
+					 bd_ail_st_list) {
+		bh = bd->bd_bh;
+		gfs2_assert(sdp, bd->bd_tr == tr);
+		if (buffer_busy(bh))
+			continue;
+		if (!buffer_uptodate(bh))
+			gfs2_io_error_bh(sdp, bh);
+		list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
+	}
+
+}
+
+/**
+ * gfs2_ail1_empty - Try to empty the ail1 lists
+ * @sdp: The superblock
+ *
+ * Tries to empty the ail1 lists, starting with the oldest first
+ */
+
+static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr, *s;
+	int oldest_tr = 1;
+	int ret;
+
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
+		gfs2_ail1_empty_one(sdp, tr);
+		if (list_empty(&tr->tr_ail1_list) && oldest_tr)
+			list_move(&tr->tr_list, &sdp->sd_ail2_list);
+		else
+			oldest_tr = 0;
+	}
+	ret = list_empty(&sdp->sd_ail1_list);
+	spin_unlock(&sdp->sd_ail_lock);
+
+	return ret;
+}
+
+static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr;
+	struct gfs2_bufdata *bd;
+	struct buffer_head *bh;
+
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) {
+			bh = bd->bd_bh;
+			if (!buffer_locked(bh))
+				continue;
+			get_bh(bh);
+			spin_unlock(&sdp->sd_ail_lock);
+			wait_on_buffer(bh);
+			brelse(bh);
+			return;
+		}
+	}
+	spin_unlock(&sdp->sd_ail_lock);
+}
+
+/**
+ * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
+ * @sdp: the filesystem
+ * @ai: the AIL entry
+ *
+ */
+
+static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	struct list_head *head = &tr->tr_ail2_list;
+	struct gfs2_bufdata *bd;
+
+	while (!list_empty(head)) {
+		bd = list_entry(head->prev, struct gfs2_bufdata,
+				bd_ail_st_list);
+		gfs2_assert(sdp, bd->bd_tr == tr);
+		gfs2_remove_from_ail(bd);
+	}
+}
+
+static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
+{
+	struct gfs2_trans *tr, *safe;
+	unsigned int old_tail = sdp->sd_log_tail;
+	int wrap = (new_tail < old_tail);
+	int a, b, rm;
+
+	spin_lock(&sdp->sd_ail_lock);
+
+	list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) {
+		a = (old_tail <= tr->tr_first);
+		b = (tr->tr_first < new_tail);
+		rm = (wrap) ? (a || b) : (a && b);
+		if (!rm)
+			continue;
+
+		gfs2_ail2_empty_one(sdp, tr);
+		list_del(&tr->tr_list);
+		gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list));
+		gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list));
+		kfree(tr);
+	}
+
+	spin_unlock(&sdp->sd_ail_lock);
+}
+
+/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+	atomic_add(blks, &sdp->sd_log_blks_free);
+	trace_gfs2_log_blocks(sdp, blks);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+				  sdp->sd_jdesc->jd_blocks);
+	up_read(&sdp->sd_log_flush_lock);
+}
+
+/**
+ * gfs2_log_reserve - Make a log reservation
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks to reserve
+ *
+ * Note that we never give out the last few blocks of the journal. Thats
+ * due to the fact that there is a small number of header blocks
+ * associated with each log flush. The exact number can't be known until
+ * flush time, so we ensure that we have just enough free blocks at all
+ * times to avoid running out during a log flush.
+ *
+ * We no longer flush the log here, instead we wake up logd to do that
+ * for us. To avoid the thundering herd and to ensure that we deal fairly
+ * with queued waiters, we use an exclusive wait. This means that when we
+ * get woken with enough journal space to get our reservation, we need to
+ * wake the next waiter on the list.
+ *
+ * Returns: errno
+ */
+
+int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
+{
+	int ret = 0;
+	unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
+	unsigned wanted = blks + reserved_blks;
+	DEFINE_WAIT(wait);
+	int did_wait = 0;
+	unsigned int free_blocks;
+
+	if (gfs2_assert_warn(sdp, blks) ||
+	    gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
+		return -EINVAL;
+retry:
+	free_blocks = atomic_read(&sdp->sd_log_blks_free);
+	if (unlikely(free_blocks <= wanted)) {
+		do {
+			prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			wake_up(&sdp->sd_logd_waitq);
+			did_wait = 1;
+			if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
+				io_schedule();
+			free_blocks = atomic_read(&sdp->sd_log_blks_free);
+		} while(free_blocks <= wanted);
+		finish_wait(&sdp->sd_log_waitq, &wait);
+	}
+	atomic_inc(&sdp->sd_reserving_log);
+	if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
+				free_blocks - blks) != free_blocks) {
+		if (atomic_dec_and_test(&sdp->sd_reserving_log))
+			wake_up(&sdp->sd_reserving_log_wait);
+		goto retry;
+	}
+	trace_gfs2_log_blocks(sdp, -blks);
+
+	/*
+	 * If we waited, then so might others, wake them up _after_ we get
+	 * our share of the log.
+	 */
+	if (unlikely(did_wait))
+		wake_up(&sdp->sd_log_waitq);
+
+	down_read(&sdp->sd_log_flush_lock);
+	if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) {
+		gfs2_log_release(sdp, blks);
+		ret = -EROFS;
+	}
+	if (atomic_dec_and_test(&sdp->sd_reserving_log))
+		wake_up(&sdp->sd_reserving_log_wait);
+	return ret;
+}
+
+/**
+ * log_distance - Compute distance between two journal blocks
+ * @sdp: The GFS2 superblock
+ * @newer: The most recent journal block of the pair
+ * @older: The older journal block of the pair
+ *
+ *   Compute the distance (in the journal direction) between two
+ *   blocks in the journal
+ *
+ * Returns: the distance in blocks
+ */
+
+static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
+					unsigned int older)
+{
+	int dist;
+
+	dist = newer - older;
+	if (dist < 0)
+		dist += sdp->sd_jdesc->jd_blocks;
+
+	return dist;
+}
+
+/**
+ * calc_reserved - Calculate the number of blocks to reserve when
+ *                 refunding a transaction's unused buffers.
+ * @sdp: The GFS2 superblock
+ *
+ * This is complex.  We need to reserve room for all our currently used
+ * metadata buffers (e.g. normal file I/O rewriting file time stamps) and 
+ * all our journaled data buffers for journaled files (e.g. files in the 
+ * meta_fs like rindex, or files for which chattr +j was done.)
+ * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush
+ * will count it as free space (sd_log_blks_free) and corruption will follow.
+ *
+ * We can have metadata bufs and jdata bufs in the same journal.  So each
+ * type gets its own log header, for which we need to reserve a block.
+ * In fact, each type has the potential for needing more than one header 
+ * in cases where we have more buffers than will fit on a journal page.
+ * Metadata journal entries take up half the space of journaled buffer entries.
+ * Thus, metadata entries have buf_limit (502) and journaled buffers have
+ * databuf_limit (251) before they cause a wrap around.
+ *
+ * Also, we need to reserve blocks for revoke journal entries and one for an
+ * overall header for the lot.
+ *
+ * Returns: the number of blocks reserved
+ */
+static unsigned int calc_reserved(struct gfs2_sbd *sdp)
+{
+	unsigned int reserved = 0;
+	unsigned int mbuf;
+	unsigned int dbuf;
+	struct gfs2_trans *tr = sdp->sd_log_tr;
+
+	if (tr) {
+		mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
+		dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
+		reserved = mbuf + dbuf;
+		/* Account for header blocks */
+		reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp));
+		reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp));
+	}
+
+	if (sdp->sd_log_commited_revoke > 0)
+		reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
+					  sizeof(u64));
+	/* One for the overall header */
+	if (reserved)
+		reserved++;
+	return reserved;
+}
+
+static unsigned int current_tail(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr;
+	unsigned int tail;
+
+	spin_lock(&sdp->sd_ail_lock);
+
+	if (list_empty(&sdp->sd_ail1_list)) {
+		tail = sdp->sd_log_head;
+	} else {
+		tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans,
+				tr_list);
+		tail = tr->tr_first;
+	}
+
+	spin_unlock(&sdp->sd_ail_lock);
+
+	return tail;
+}
+
+static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
+{
+	unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
+
+	ail2_empty(sdp, new_tail);
+
+	atomic_add(dist, &sdp->sd_log_blks_free);
+	trace_gfs2_log_blocks(sdp, dist);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+			     sdp->sd_jdesc->jd_blocks);
+
+	sdp->sd_log_tail = new_tail;
+}
+
+
+static void log_flush_wait(struct gfs2_sbd *sdp)
+{
+	DEFINE_WAIT(wait);
+
+	if (atomic_read(&sdp->sd_log_in_flight)) {
+		do {
+			prepare_to_wait(&sdp->sd_log_flush_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&sdp->sd_log_in_flight))
+				io_schedule();
+		} while(atomic_read(&sdp->sd_log_in_flight));
+		finish_wait(&sdp->sd_log_flush_wait, &wait);
+	}
+}
+
+static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct gfs2_inode *ipa, *ipb;
+
+	ipa = list_entry(a, struct gfs2_inode, i_ordered);
+	ipb = list_entry(b, struct gfs2_inode, i_ordered);
+
+	if (ipa->i_no_addr < ipb->i_no_addr)
+		return -1;
+	if (ipa->i_no_addr > ipb->i_no_addr)
+		return 1;
+	return 0;
+}
+
+static void gfs2_ordered_write(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip;
+	LIST_HEAD(written);
+
+	spin_lock(&sdp->sd_ordered_lock);
+	list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp);
+	while (!list_empty(&sdp->sd_log_le_ordered)) {
+		ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+		list_move(&ip->i_ordered, &written);
+		if (ip->i_inode.i_mapping->nrpages == 0)
+			continue;
+		spin_unlock(&sdp->sd_ordered_lock);
+		filemap_fdatawrite(ip->i_inode.i_mapping);
+		spin_lock(&sdp->sd_ordered_lock);
+	}
+	list_splice(&written, &sdp->sd_log_le_ordered);
+	spin_unlock(&sdp->sd_ordered_lock);
+}
+
+static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip;
+
+	spin_lock(&sdp->sd_ordered_lock);
+	while (!list_empty(&sdp->sd_log_le_ordered)) {
+		ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+		list_del(&ip->i_ordered);
+		WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
+		if (ip->i_inode.i_mapping->nrpages == 0)
+			continue;
+		spin_unlock(&sdp->sd_ordered_lock);
+		filemap_fdatawait(ip->i_inode.i_mapping);
+		spin_lock(&sdp->sd_ordered_lock);
+	}
+	spin_unlock(&sdp->sd_ordered_lock);
+}
+
+void gfs2_ordered_del_inode(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	spin_lock(&sdp->sd_ordered_lock);
+	if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags))
+		list_del(&ip->i_ordered);
+	spin_unlock(&sdp->sd_ordered_lock);
+}
+
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+	struct buffer_head *bh = bd->bd_bh;
+	struct gfs2_glock *gl = bd->bd_gl;
+
+	bh->b_private = NULL;
+	bd->bd_blkno = bh->b_blocknr;
+	gfs2_remove_from_ail(bd); /* drops ref on bh */
+	bd->bd_bh = NULL;
+	bd->bd_ops = &gfs2_revoke_lops;
+	sdp->sd_log_num_revoke++;
+	atomic_inc(&gl->gl_revokes);
+	set_bit(GLF_LFLUSH, &gl->gl_flags);
+	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+}
+
+void gfs2_write_revokes(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr;
+	struct gfs2_bufdata *bd, *tmp;
+	int have_revokes = 0;
+	int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+
+	gfs2_ail1_empty(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
+			if (list_empty(&bd->bd_list)) {
+				have_revokes = 1;
+				goto done;
+			}
+		}
+	}
+done:
+	spin_unlock(&sdp->sd_ail_lock);
+	if (have_revokes == 0)
+		return;
+	while (sdp->sd_log_num_revoke > max_revokes)
+		max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+	max_revokes -= sdp->sd_log_num_revoke;
+	if (!sdp->sd_log_num_revoke) {
+		atomic_dec(&sdp->sd_log_blks_free);
+		/* If no blocks have been reserved, we need to also
+		 * reserve a block for the header */
+		if (!sdp->sd_log_blks_reserved)
+			atomic_dec(&sdp->sd_log_blks_free);
+	}
+	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
+			if (max_revokes == 0)
+				goto out_of_blocks;
+			if (!list_empty(&bd->bd_list))
+				continue;
+			gfs2_add_revoke(sdp, bd);
+			max_revokes--;
+		}
+	}
+out_of_blocks:
+	spin_unlock(&sdp->sd_ail_lock);
+	gfs2_log_unlock(sdp);
+
+	if (!sdp->sd_log_num_revoke) {
+		atomic_inc(&sdp->sd_log_blks_free);
+		if (!sdp->sd_log_blks_reserved)
+			atomic_inc(&sdp->sd_log_blks_free);
+	}
+}
+
+/**
+ * log_write_header - Get and initialize a journal header buffer
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: the initialized log buffer descriptor
+ */
+
+static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
+{
+	struct gfs2_log_header *lh;
+	unsigned int tail;
+	u32 hash;
+	int rw = WRITE_FLUSH_FUA | REQ_META;
+	struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
+	lh = page_address(page);
+	clear_page(lh);
+
+	gfs2_assert_withdraw(sdp, (state != SFS_FROZEN));
+
+	tail = current_tail(sdp);
+
+	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+	lh->lh_header.__pad0 = cpu_to_be64(0);
+	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
+	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
+	lh->lh_flags = cpu_to_be32(flags);
+	lh->lh_tail = cpu_to_be32(tail);
+	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
+	hash = gfs2_disk_hash(page_address(page), sizeof(struct gfs2_log_header));
+	lh->lh_hash = cpu_to_be32(hash);
+
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
+		gfs2_ordered_wait(sdp);
+		log_flush_wait(sdp);
+		rw = WRITE_SYNC | REQ_META | REQ_PRIO;
+	}
+
+	sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
+	gfs2_log_write_page(sdp, page);
+	gfs2_log_flush_bio(sdp, rw);
+	log_flush_wait(sdp);
+
+	if (sdp->sd_log_tail != tail)
+		log_pull_tail(sdp, tail);
+}
+
+/**
+ * gfs2_log_flush - flush incore transaction(s)
+ * @sdp: the filesystem
+ * @gl: The glock structure to flush.  If NULL, flush the whole incore log
+ *
+ */
+
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+		    enum gfs2_flush_type type)
+{
+	struct gfs2_trans *tr;
+	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
+
+	down_write(&sdp->sd_log_flush_lock);
+
+	/* Log might have been flushed while we waited for the flush lock */
+	if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
+		up_write(&sdp->sd_log_flush_lock);
+		return;
+	}
+	trace_gfs2_log_flush(sdp, 1);
+
+	sdp->sd_log_flush_head = sdp->sd_log_head;
+	sdp->sd_log_flush_wrapped = 0;
+	tr = sdp->sd_log_tr;
+	if (tr) {
+		sdp->sd_log_tr = NULL;
+		INIT_LIST_HEAD(&tr->tr_ail1_list);
+		INIT_LIST_HEAD(&tr->tr_ail2_list);
+		tr->tr_first = sdp->sd_log_flush_head;
+		if (unlikely (state == SFS_FROZEN))
+			gfs2_assert_withdraw(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new);
+	}
+
+	if (unlikely(state == SFS_FROZEN))
+		gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+	gfs2_assert_withdraw(sdp,
+			sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
+
+	gfs2_ordered_write(sdp);
+	lops_before_commit(sdp, tr);
+	gfs2_log_flush_bio(sdp, WRITE);
+
+	if (sdp->sd_log_head != sdp->sd_log_flush_head) {
+		log_flush_wait(sdp);
+		log_write_header(sdp, 0);
+	} else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
+		atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
+		trace_gfs2_log_blocks(sdp, -1);
+		log_write_header(sdp, 0);
+	}
+	lops_after_commit(sdp, tr);
+
+	gfs2_log_lock(sdp);
+	sdp->sd_log_head = sdp->sd_log_flush_head;
+	sdp->sd_log_blks_reserved = 0;
+	sdp->sd_log_commited_revoke = 0;
+
+	spin_lock(&sdp->sd_ail_lock);
+	if (tr && !list_empty(&tr->tr_ail1_list)) {
+		list_add(&tr->tr_list, &sdp->sd_ail1_list);
+		tr = NULL;
+	}
+	spin_unlock(&sdp->sd_ail_lock);
+	gfs2_log_unlock(sdp);
+
+	if (type != NORMAL_FLUSH) {
+		if (!sdp->sd_log_idle) {
+			for (;;) {
+				gfs2_ail1_start(sdp);
+				gfs2_ail1_wait(sdp);
+				if (gfs2_ail1_empty(sdp))
+					break;
+			}
+			atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
+			trace_gfs2_log_blocks(sdp, -1);
+			sdp->sd_log_flush_wrapped = 0;
+			log_write_header(sdp, 0);
+			sdp->sd_log_head = sdp->sd_log_flush_head;
+		}
+		if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH)
+			gfs2_log_shutdown(sdp);
+		if (type == FREEZE_FLUSH)
+			atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
+	}
+
+	trace_gfs2_log_flush(sdp, 0);
+	up_write(&sdp->sd_log_flush_lock);
+
+	kfree(tr);
+}
+
+/**
+ * gfs2_merge_trans - Merge a new transaction into a cached transaction
+ * @old: Original transaction to be expanded
+ * @new: New transaction to be merged
+ */
+
+static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
+{
+	WARN_ON_ONCE(old->tr_attached != 1);
+
+	old->tr_num_buf_new	+= new->tr_num_buf_new;
+	old->tr_num_databuf_new	+= new->tr_num_databuf_new;
+	old->tr_num_buf_rm	+= new->tr_num_buf_rm;
+	old->tr_num_databuf_rm	+= new->tr_num_databuf_rm;
+	old->tr_num_revoke	+= new->tr_num_revoke;
+	old->tr_num_revoke_rm	+= new->tr_num_revoke_rm;
+
+	list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
+	list_splice_tail_init(&new->tr_buf, &old->tr_buf);
+}
+
+static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	unsigned int reserved;
+	unsigned int unused;
+	unsigned int maxres;
+
+	gfs2_log_lock(sdp);
+
+	if (sdp->sd_log_tr) {
+		gfs2_merge_trans(sdp->sd_log_tr, tr);
+	} else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
+		gfs2_assert_withdraw(sdp, tr->tr_alloced);
+		sdp->sd_log_tr = tr;
+		tr->tr_attached = 1;
+	}
+
+	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
+	reserved = calc_reserved(sdp);
+	maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
+	gfs2_assert_withdraw(sdp, maxres >= reserved);
+	unused = maxres - reserved;
+	atomic_add(unused, &sdp->sd_log_blks_free);
+	trace_gfs2_log_blocks(sdp, unused);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+			     sdp->sd_jdesc->jd_blocks);
+	sdp->sd_log_blks_reserved = reserved;
+
+	gfs2_log_unlock(sdp);
+}
+
+/**
+ * gfs2_log_commit - Commit a transaction to the log
+ * @sdp: the filesystem
+ * @tr: the transaction
+ *
+ * We wake up gfs2_logd if the number of pinned blocks exceed thresh1
+ * or the total number of used blocks (pinned blocks plus AIL blocks)
+ * is greater than thresh2.
+ *
+ * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
+ * journal size.
+ *
+ * Returns: errno
+ */
+
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	log_refund(sdp, tr);
+
+	if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
+	    ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
+	    atomic_read(&sdp->sd_log_thresh2)))
+		wake_up(&sdp->sd_logd_waitq);
+}
+
+/**
+ * gfs2_log_shutdown - write a shutdown header into a journal
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_log_shutdown(struct gfs2_sbd *sdp)
+{
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+	gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
+
+	sdp->sd_log_flush_head = sdp->sd_log_head;
+	sdp->sd_log_flush_wrapped = 0;
+
+	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT);
+
+	gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
+	gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
+
+	sdp->sd_log_head = sdp->sd_log_flush_head;
+	sdp->sd_log_tail = sdp->sd_log_head;
+}
+
+static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
+{
+	return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
+}
+
+static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
+{
+	unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
+	return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
+}
+
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ */
+
+int gfs2_logd(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	unsigned long t = 1;
+	DEFINE_WAIT(wait);
+
+	while (!kthread_should_stop()) {
+
+		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
+			gfs2_ail1_empty(sdp);
+			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+		}
+
+		if (gfs2_ail_flush_reqd(sdp)) {
+			gfs2_ail1_start(sdp);
+			gfs2_ail1_wait(sdp);
+			gfs2_ail1_empty(sdp);
+			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+		}
+
+		if (!gfs2_ail_flush_reqd(sdp))
+			wake_up(&sdp->sd_log_waitq);
+
+		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+
+		try_to_freeze();
+
+		do {
+			prepare_to_wait(&sdp->sd_logd_waitq, &wait,
+					TASK_INTERRUPTIBLE);
+			if (!gfs2_ail_flush_reqd(sdp) &&
+			    !gfs2_jrnl_flush_reqd(sdp) &&
+			    !kthread_should_stop())
+				t = schedule_timeout(t);
+		} while(t && !gfs2_ail_flush_reqd(sdp) &&
+			!gfs2_jrnl_flush_reqd(sdp) &&
+			!kthread_should_stop());
+		finish_wait(&sdp->sd_logd_waitq, &wait);
+	}
+
+	return 0;
+}
+
diff --git a/kernel/fs/gfs2/log.h b/kernel/fs/gfs2/log.h
new file mode 100644
index 000000000..9499a6049
--- /dev/null
+++ b/kernel/fs/gfs2/log.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __LOG_DOT_H__
+#define __LOG_DOT_H__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/writeback.h>
+#include "incore.h"
+
+/**
+ * gfs2_log_lock - acquire the right to mess with the log manager
+ * @sdp: the filesystem
+ *
+ */
+
+static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+__acquires(&sdp->sd_log_lock)
+{
+	spin_lock(&sdp->sd_log_lock);
+}
+
+/**
+ * gfs2_log_unlock - release the right to mess with the log manager
+ * @sdp: the filesystem
+ *
+ */
+
+static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+__releases(&sdp->sd_log_lock)
+{
+	spin_unlock(&sdp->sd_log_lock);
+}
+
+static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
+					  unsigned int value)
+{
+	if (++value == sdp->sd_jdesc->jd_blocks) {
+		value = 0;
+	}
+	sdp->sd_log_head = sdp->sd_log_tail = value;
+}
+
+static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
+		spin_lock(&sdp->sd_ordered_lock);
+		if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
+			list_add(&ip->i_ordered, &sdp->sd_log_le_ordered);
+		spin_unlock(&sdp->sd_ordered_lock);
+	}
+}
+extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
+extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+			    unsigned int ssize);
+
+extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+enum gfs2_flush_type {
+	NORMAL_FLUSH = 0,
+	SYNC_FLUSH,
+	SHUTDOWN_FLUSH,
+	FREEZE_FLUSH
+};
+extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+			   enum gfs2_flush_type type);
+extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
+extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
+
+extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+extern int gfs2_logd(void *data);
+extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
+
+#endif /* __LOG_DOT_H__ */
diff --git a/kernel/fs/gfs2/lops.c b/kernel/fs/gfs2/lops.c
new file mode 100644
index 000000000..2c1ae861d
--- /dev/null
+++ b/kernel/fs/gfs2/lops.c
@@ -0,0 +1,886 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/mempool.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/bio.h>
+#include <linux/fs.h>
+#include <linux/list_sort.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "inode.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "trace_gfs2.h"
+
+/**
+ * gfs2_pin - Pin a buffer in memory
+ * @sdp: The superblock
+ * @bh: The buffer to be pinned
+ *
+ * The log lock must be held when calling this function
+ */
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	struct gfs2_bufdata *bd;
+
+	BUG_ON(!current->journal_info);
+
+	clear_buffer_dirty(bh);
+	if (test_set_buffer_pinned(bh))
+		gfs2_assert_withdraw(sdp, 0);
+	if (!buffer_uptodate(bh))
+		gfs2_io_error_bh(sdp, bh);
+	bd = bh->b_private;
+	/* If this buffer is in the AIL and it has already been written
+	 * to in-place disk block, remove it from the AIL.
+	 */
+	spin_lock(&sdp->sd_ail_lock);
+	if (bd->bd_tr)
+		list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list);
+	spin_unlock(&sdp->sd_ail_lock);
+	get_bh(bh);
+	atomic_inc(&sdp->sd_log_pinned);
+	trace_gfs2_pin(bd, 1);
+}
+
+static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
+{
+	return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP;
+}
+
+static void maybe_release_space(struct gfs2_bufdata *bd)
+{
+	struct gfs2_glock *gl = bd->bd_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_rgrpd *rgd = gl->gl_object;
+	unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
+	struct gfs2_bitmap *bi = rgd->rd_bits + index;
+
+	if (bi->bi_clone == NULL)
+		return;
+	if (sdp->sd_args.ar_discard)
+		gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
+	memcpy(bi->bi_clone + bi->bi_offset,
+	       bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
+	clear_bit(GBF_FULL, &bi->bi_flags);
+	rgd->rd_free_clone = rgd->rd_free;
+	rgd->rd_extfail_pt = rgd->rd_free;
+}
+
+/**
+ * gfs2_unpin - Unpin a buffer
+ * @sdp: the filesystem the buffer belongs to
+ * @bh: The buffer to unpin
+ * @ai:
+ * @flags: The inode dirty flags
+ *
+ */
+
+static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       struct gfs2_trans *tr)
+{
+	struct gfs2_bufdata *bd = bh->b_private;
+
+	BUG_ON(!buffer_uptodate(bh));
+	BUG_ON(!buffer_pinned(bh));
+
+	lock_buffer(bh);
+	mark_buffer_dirty(bh);
+	clear_buffer_pinned(bh);
+
+	if (buffer_is_rgrp(bd))
+		maybe_release_space(bd);
+
+	spin_lock(&sdp->sd_ail_lock);
+	if (bd->bd_tr) {
+		list_del(&bd->bd_ail_st_list);
+		brelse(bh);
+	} else {
+		struct gfs2_glock *gl = bd->bd_gl;
+		list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
+		atomic_inc(&gl->gl_ail_count);
+	}
+	bd->bd_tr = tr;
+	list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list);
+	spin_unlock(&sdp->sd_ail_lock);
+
+	clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+	trace_gfs2_pin(bd, 0);
+	unlock_buffer(bh);
+	atomic_dec(&sdp->sd_log_pinned);
+}
+
+static void gfs2_log_incr_head(struct gfs2_sbd *sdp)
+{
+	BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) &&
+	       (sdp->sd_log_flush_head != sdp->sd_log_head));
+
+	if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
+		sdp->sd_log_flush_head = 0;
+		sdp->sd_log_flush_wrapped = 1;
+	}
+}
+
+static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
+{
+	unsigned int lbn = sdp->sd_log_flush_head;
+	struct gfs2_journal_extent *je;
+	u64 block;
+
+	list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) {
+		if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) {
+			block = je->dblock + lbn - je->lblock;
+			gfs2_log_incr_head(sdp);
+			return block;
+		}
+	}
+
+	return -1;
+}
+
+/**
+ * gfs2_end_log_write_bh - end log write of pagecache data with buffers
+ * @sdp: The superblock
+ * @bvec: The bio_vec
+ * @error: The i/o status
+ *
+ * This finds the relavent buffers and unlocks then and sets the
+ * error flag according to the status of the i/o request. This is
+ * used when the log is writing data which has an in-place version
+ * that is pinned in the pagecache.
+ */
+
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+				  int error)
+{
+	struct buffer_head *bh, *next;
+	struct page *page = bvec->bv_page;
+	unsigned size;
+
+	bh = page_buffers(page);
+	size = bvec->bv_len;
+	while (bh_offset(bh) < bvec->bv_offset)
+		bh = bh->b_this_page;
+	do {
+		if (error)
+			set_buffer_write_io_error(bh);
+		unlock_buffer(bh);
+		next = bh->b_this_page;
+		size -= bh->b_size;
+		brelse(bh);
+		bh = next;
+	} while(bh && size);
+}
+
+/**
+ * gfs2_end_log_write - end of i/o to the log
+ * @bio: The bio
+ * @error: Status of i/o request
+ *
+ * Each bio_vec contains either data from the pagecache or data
+ * relating to the log itself. Here we iterate over the bio_vec
+ * array, processing both kinds of data.
+ *
+ */
+
+static void gfs2_end_log_write(struct bio *bio, int error)
+{
+	struct gfs2_sbd *sdp = bio->bi_private;
+	struct bio_vec *bvec;
+	struct page *page;
+	int i;
+
+	if (error) {
+		sdp->sd_log_error = error;
+		fs_err(sdp, "Error %d writing to log\n", error);
+	}
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		page = bvec->bv_page;
+		if (page_has_buffers(page))
+			gfs2_end_log_write_bh(sdp, bvec, error);
+		else
+			mempool_free(page, gfs2_page_pool);
+	}
+
+	bio_put(bio);
+	if (atomic_dec_and_test(&sdp->sd_log_in_flight))
+		wake_up(&sdp->sd_log_flush_wait);
+}
+
+/**
+ * gfs2_log_flush_bio - Submit any pending log bio
+ * @sdp: The superblock
+ * @rw: The rw flags
+ *
+ * Submit any pending part-built or full bio to the block device. If
+ * there is no pending bio, then this is a no-op.
+ */
+
+void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw)
+{
+	if (sdp->sd_log_bio) {
+		atomic_inc(&sdp->sd_log_in_flight);
+		submit_bio(rw, sdp->sd_log_bio);
+		sdp->sd_log_bio = NULL;
+	}
+}
+
+/**
+ * gfs2_log_alloc_bio - Allocate a new bio for log writing
+ * @sdp: The superblock
+ * @blkno: The next device block number we want to write to
+ *
+ * This should never be called when there is a cached bio in the
+ * super block. When it returns, there will be a cached bio in the
+ * super block which will have as many bio_vecs as the device is
+ * happy to handle.
+ *
+ * Returns: Newly allocated bio
+ */
+
+static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev);
+	struct bio *bio;
+
+	BUG_ON(sdp->sd_log_bio);
+
+	while (1) {
+		bio = bio_alloc(GFP_NOIO, nrvecs);
+		if (likely(bio))
+			break;
+		nrvecs = max(nrvecs/2, 1U);
+	}
+
+	bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
+	bio->bi_bdev = sb->s_bdev;
+	bio->bi_end_io = gfs2_end_log_write;
+	bio->bi_private = sdp;
+
+	sdp->sd_log_bio = bio;
+
+	return bio;
+}
+
+/**
+ * gfs2_log_get_bio - Get cached log bio, or allocate a new one
+ * @sdp: The superblock
+ * @blkno: The device block number we want to write to
+ *
+ * If there is a cached bio, then if the next block number is sequential
+ * with the previous one, return it, otherwise flush the bio to the
+ * device. If there is not a cached bio, or we just flushed it, then
+ * allocate a new one.
+ *
+ * Returns: The bio to use for log writes
+ */
+
+static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
+{
+	struct bio *bio = sdp->sd_log_bio;
+	u64 nblk;
+
+	if (bio) {
+		nblk = bio_end_sector(bio);
+		nblk >>= sdp->sd_fsb2bb_shift;
+		if (blkno == nblk)
+			return bio;
+		gfs2_log_flush_bio(sdp, WRITE);
+	}
+
+	return gfs2_log_alloc_bio(sdp, blkno);
+}
+
+
+/**
+ * gfs2_log_write - write to log
+ * @sdp: the filesystem
+ * @page: the page to write
+ * @size: the size of the data to write
+ * @offset: the offset within the page 
+ *
+ * Try and add the page segment to the current bio. If that fails,
+ * submit the current bio to the device and create a new one, and
+ * then add the page segment to that.
+ */
+
+static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
+			   unsigned size, unsigned offset)
+{
+	u64 blkno = gfs2_log_bmap(sdp);
+	struct bio *bio;
+	int ret;
+
+	bio = gfs2_log_get_bio(sdp, blkno);
+	ret = bio_add_page(bio, page, size, offset);
+	if (ret == 0) {
+		gfs2_log_flush_bio(sdp, WRITE);
+		bio = gfs2_log_alloc_bio(sdp, blkno);
+		ret = bio_add_page(bio, page, size, offset);
+		WARN_ON(ret == 0);
+	}
+}
+
+/**
+ * gfs2_log_write_bh - write a buffer's content to the log
+ * @sdp: The super block
+ * @bh: The buffer pointing to the in-place location
+ * 
+ * This writes the content of the buffer to the next available location
+ * in the log. The buffer will be unlocked once the i/o to the log has
+ * completed.
+ */
+
+static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh));
+}
+
+/**
+ * gfs2_log_write_page - write one block stored in a page, into the log
+ * @sdp: The superblock
+ * @page: The struct page
+ *
+ * This writes the first block-sized part of the page into the log. Note
+ * that the page must have been allocated from the gfs2_page_pool mempool
+ * and that after this has been called, ownership has been transferred and
+ * the page may be freed at any time.
+ */
+
+void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	gfs2_log_write(sdp, page, sb->s_blocksize, 0);
+}
+
+static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
+				      u32 ld_length, u32 ld_data1)
+{
+	struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+	struct gfs2_log_descriptor *ld = page_address(page);
+	clear_page(ld);
+	ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+	ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+	ld->ld_type = cpu_to_be32(ld_type);
+	ld->ld_length = cpu_to_be32(ld_length);
+	ld->ld_data1 = cpu_to_be32(ld_data1);
+	ld->ld_data2 = 0;
+	return page;
+}
+
+static void gfs2_check_magic(struct buffer_head *bh)
+{
+	void *kaddr;
+	__be32 *ptr;
+
+	clear_buffer_escaped(bh);
+	kaddr = kmap_atomic(bh->b_page);
+	ptr = kaddr + bh_offset(bh);
+	if (*ptr == cpu_to_be32(GFS2_MAGIC))
+		set_buffer_escaped(bh);
+	kunmap_atomic(kaddr);
+}
+
+static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct gfs2_bufdata *bda, *bdb;
+
+	bda = list_entry(a, struct gfs2_bufdata, bd_list);
+	bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+
+	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+		return -1;
+	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+		return 1;
+	return 0;
+}
+
+static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
+				unsigned int total, struct list_head *blist,
+				bool is_databuf)
+{
+	struct gfs2_log_descriptor *ld;
+	struct gfs2_bufdata *bd1 = NULL, *bd2;
+	struct page *page;
+	unsigned int num;
+	unsigned n;
+	__be64 *ptr;
+
+	gfs2_log_lock(sdp);
+	list_sort(NULL, blist, blocknr_cmp);
+	bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
+	while(total) {
+		num = total;
+		if (total > limit)
+			num = limit;
+		gfs2_log_unlock(sdp);
+		page = gfs2_get_log_desc(sdp,
+					 is_databuf ? GFS2_LOG_DESC_JDATA :
+					 GFS2_LOG_DESC_METADATA, num + 1, num);
+		ld = page_address(page);
+		gfs2_log_lock(sdp);
+		ptr = (__be64 *)(ld + 1);
+
+		n = 0;
+		list_for_each_entry_continue(bd1, blist, bd_list) {
+			*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+			if (is_databuf) {
+				gfs2_check_magic(bd1->bd_bh);
+				*ptr++ = cpu_to_be64(buffer_escaped(bd1->bd_bh) ? 1 : 0);
+			}
+			if (++n >= num)
+				break;
+		}
+
+		gfs2_log_unlock(sdp);
+		gfs2_log_write_page(sdp, page);
+		gfs2_log_lock(sdp);
+
+		n = 0;
+		list_for_each_entry_continue(bd2, blist, bd_list) {
+			get_bh(bd2->bd_bh);
+			gfs2_log_unlock(sdp);
+			lock_buffer(bd2->bd_bh);
+
+			if (buffer_escaped(bd2->bd_bh)) {
+				void *kaddr;
+				page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+				ptr = page_address(page);
+				kaddr = kmap_atomic(bd2->bd_bh->b_page);
+				memcpy(ptr, kaddr + bh_offset(bd2->bd_bh),
+				       bd2->bd_bh->b_size);
+				kunmap_atomic(kaddr);
+				*(__be32 *)ptr = 0;
+				clear_buffer_escaped(bd2->bd_bh);
+				unlock_buffer(bd2->bd_bh);
+				brelse(bd2->bd_bh);
+				gfs2_log_write_page(sdp, page);
+			} else {
+				gfs2_log_write_bh(sdp, bd2->bd_bh);
+			}
+			gfs2_log_lock(sdp);
+			if (++n >= num)
+				break;
+		}
+
+		BUG_ON(total < num);
+		total -= num;
+	}
+	gfs2_log_unlock(sdp);
+}
+
+static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
+	unsigned int nbuf;
+	if (tr == NULL)
+		return;
+	nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
+	gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0);
+}
+
+static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	struct list_head *head;
+	struct gfs2_bufdata *bd;
+
+	if (tr == NULL)
+		return;
+
+	head = &tr->tr_buf;
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+		list_del_init(&bd->bd_list);
+		gfs2_unpin(sdp, bd->bd_bh, tr);
+	}
+}
+
+static void buf_lo_before_scan(struct gfs2_jdesc *jd,
+			       struct gfs2_log_header_host *head, int pass)
+{
+	if (pass != 0)
+		return;
+
+	jd->jd_found_blocks = 0;
+	jd->jd_replayed_blocks = 0;
+}
+
+static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+				struct gfs2_log_descriptor *ld, __be64 *ptr,
+				int pass)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	unsigned int blks = be32_to_cpu(ld->ld_data1);
+	struct buffer_head *bh_log, *bh_ip;
+	u64 blkno;
+	int error = 0;
+
+	if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
+		return 0;
+
+	gfs2_replay_incr_blk(sdp, &start);
+
+	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+		blkno = be64_to_cpu(*ptr++);
+
+		jd->jd_found_blocks++;
+
+		if (gfs2_revoke_check(jd, blkno, start))
+			continue;
+
+		error = gfs2_replay_read_block(jd, start, &bh_log);
+		if (error)
+			return error;
+
+		bh_ip = gfs2_meta_new(gl, blkno);
+		memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
+
+		if (gfs2_meta_check(sdp, bh_ip))
+			error = -EIO;
+		else
+			mark_buffer_dirty(bh_ip);
+
+		brelse(bh_log);
+		brelse(bh_ip);
+
+		if (error)
+			break;
+
+		jd->jd_replayed_blocks++;
+	}
+
+	return error;
+}
+
+/**
+ * gfs2_meta_sync - Sync all buffers associated with a glock
+ * @gl: The glock
+ *
+ */
+
+static void gfs2_meta_sync(struct gfs2_glock *gl)
+{
+	struct address_space *mapping = gfs2_glock2aspace(gl);
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	int error;
+
+	if (mapping == NULL)
+		mapping = &sdp->sd_aspace;
+
+	filemap_fdatawrite(mapping);
+	error = filemap_fdatawait(mapping);
+
+	if (error)
+		gfs2_io_error(gl->gl_sbd);
+}
+
+static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+	if (error) {
+		gfs2_meta_sync(ip->i_gl);
+		return;
+	}
+	if (pass != 1)
+		return;
+
+	gfs2_meta_sync(ip->i_gl);
+
+	fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
+	        jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
+}
+
+static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	struct gfs2_meta_header *mh;
+	unsigned int offset;
+	struct list_head *head = &sdp->sd_log_le_revoke;
+	struct gfs2_bufdata *bd;
+	struct page *page;
+	unsigned int length;
+
+	gfs2_write_revokes(sdp);
+	if (!sdp->sd_log_num_revoke)
+		return;
+
+	length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64));
+	page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
+	offset = sizeof(struct gfs2_log_descriptor);
+
+	list_for_each_entry(bd, head, bd_list) {
+		sdp->sd_log_num_revoke--;
+
+		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
+
+			gfs2_log_write_page(sdp, page);
+			page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+			mh = page_address(page);
+			clear_page(mh);
+			mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
+			mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
+			mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
+			offset = sizeof(struct gfs2_meta_header);
+		}
+
+		*(__be64 *)(page_address(page) + offset) = cpu_to_be64(bd->bd_blkno);
+		offset += sizeof(u64);
+	}
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+
+	gfs2_log_write_page(sdp, page);
+}
+
+static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	struct list_head *head = &sdp->sd_log_le_revoke;
+	struct gfs2_bufdata *bd;
+	struct gfs2_glock *gl;
+
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+		list_del_init(&bd->bd_list);
+		gl = bd->bd_gl;
+		atomic_dec(&gl->gl_revokes);
+		clear_bit(GLF_LFLUSH, &gl->gl_flags);
+		kmem_cache_free(gfs2_bufdata_cachep, bd);
+	}
+}
+
+static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
+				  struct gfs2_log_header_host *head, int pass)
+{
+	if (pass != 0)
+		return;
+
+	jd->jd_found_revokes = 0;
+	jd->jd_replay_tail = head->lh_tail;
+}
+
+static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+				   struct gfs2_log_descriptor *ld, __be64 *ptr,
+				   int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	unsigned int blks = be32_to_cpu(ld->ld_length);
+	unsigned int revokes = be32_to_cpu(ld->ld_data1);
+	struct buffer_head *bh;
+	unsigned int offset;
+	u64 blkno;
+	int first = 1;
+	int error;
+
+	if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
+		return 0;
+
+	offset = sizeof(struct gfs2_log_descriptor);
+
+	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+		error = gfs2_replay_read_block(jd, start, &bh);
+		if (error)
+			return error;
+
+		if (!first)
+			gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
+
+		while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
+			blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
+
+			error = gfs2_revoke_add(jd, blkno, start);
+			if (error < 0) {
+				brelse(bh);
+				return error;
+			}
+			else if (error)
+				jd->jd_found_revokes++;
+
+			if (!--revokes)
+				break;
+			offset += sizeof(u64);
+		}
+
+		brelse(bh);
+		offset = sizeof(struct gfs2_meta_header);
+		first = 0;
+	}
+
+	return 0;
+}
+
+static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+	if (error) {
+		gfs2_revoke_clean(jd);
+		return;
+	}
+	if (pass != 1)
+		return;
+
+	fs_info(sdp, "jid=%u: Found %u revoke tags\n",
+	        jd->jd_jid, jd->jd_found_revokes);
+
+	gfs2_revoke_clean(jd);
+}
+
+/**
+ * databuf_lo_before_commit - Scan the data buffers, writing as we go
+ *
+ */
+
+static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	unsigned int limit = databuf_limit(sdp);
+	unsigned int nbuf;
+	if (tr == NULL)
+		return;
+	nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
+	gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
+}
+
+static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+				    struct gfs2_log_descriptor *ld,
+				    __be64 *ptr, int pass)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	unsigned int blks = be32_to_cpu(ld->ld_data1);
+	struct buffer_head *bh_log, *bh_ip;
+	u64 blkno;
+	u64 esc;
+	int error = 0;
+
+	if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
+		return 0;
+
+	gfs2_replay_incr_blk(sdp, &start);
+	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+		blkno = be64_to_cpu(*ptr++);
+		esc = be64_to_cpu(*ptr++);
+
+		jd->jd_found_blocks++;
+
+		if (gfs2_revoke_check(jd, blkno, start))
+			continue;
+
+		error = gfs2_replay_read_block(jd, start, &bh_log);
+		if (error)
+			return error;
+
+		bh_ip = gfs2_meta_new(gl, blkno);
+		memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
+
+		/* Unescape */
+		if (esc) {
+			__be32 *eptr = (__be32 *)bh_ip->b_data;
+			*eptr = cpu_to_be32(GFS2_MAGIC);
+		}
+		mark_buffer_dirty(bh_ip);
+
+		brelse(bh_log);
+		brelse(bh_ip);
+
+		jd->jd_replayed_blocks++;
+	}
+
+	return error;
+}
+
+/* FIXME: sort out accounting for log blocks etc. */
+
+static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+	if (error) {
+		gfs2_meta_sync(ip->i_gl);
+		return;
+	}
+	if (pass != 1)
+		return;
+
+	/* data sync? */
+	gfs2_meta_sync(ip->i_gl);
+
+	fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
+		jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
+}
+
+static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	struct list_head *head;
+	struct gfs2_bufdata *bd;
+
+	if (tr == NULL)
+		return;
+
+	head = &tr->tr_databuf;
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+		list_del_init(&bd->bd_list);
+		gfs2_unpin(sdp, bd->bd_bh, tr);
+	}
+}
+
+
+const struct gfs2_log_operations gfs2_buf_lops = {
+	.lo_before_commit = buf_lo_before_commit,
+	.lo_after_commit = buf_lo_after_commit,
+	.lo_before_scan = buf_lo_before_scan,
+	.lo_scan_elements = buf_lo_scan_elements,
+	.lo_after_scan = buf_lo_after_scan,
+	.lo_name = "buf",
+};
+
+const struct gfs2_log_operations gfs2_revoke_lops = {
+	.lo_before_commit = revoke_lo_before_commit,
+	.lo_after_commit = revoke_lo_after_commit,
+	.lo_before_scan = revoke_lo_before_scan,
+	.lo_scan_elements = revoke_lo_scan_elements,
+	.lo_after_scan = revoke_lo_after_scan,
+	.lo_name = "revoke",
+};
+
+const struct gfs2_log_operations gfs2_databuf_lops = {
+	.lo_before_commit = databuf_lo_before_commit,
+	.lo_after_commit = databuf_lo_after_commit,
+	.lo_scan_elements = databuf_lo_scan_elements,
+	.lo_after_scan = databuf_lo_after_scan,
+	.lo_name = "databuf",
+};
+
+const struct gfs2_log_operations *gfs2_log_ops[] = {
+	&gfs2_databuf_lops,
+	&gfs2_buf_lops,
+	&gfs2_revoke_lops,
+	NULL,
+};
+
diff --git a/kernel/fs/gfs2/lops.h b/kernel/fs/gfs2/lops.h
new file mode 100644
index 000000000..a65a7ba32
--- /dev/null
+++ b/kernel/fs/gfs2/lops.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __LOPS_DOT_H__
+#define __LOPS_DOT_H__
+
+#include <linux/list.h>
+#include "incore.h"
+
+#define BUF_OFFSET \
+	((sizeof(struct gfs2_log_descriptor) + sizeof(__be64) - 1) & \
+	 ~(sizeof(__be64) - 1))
+#define DATABUF_OFFSET \
+	((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \
+	 ~(2 * sizeof(__be64) - 1))
+
+extern const struct gfs2_log_operations gfs2_glock_lops;
+extern const struct gfs2_log_operations gfs2_buf_lops;
+extern const struct gfs2_log_operations gfs2_revoke_lops;
+extern const struct gfs2_log_operations gfs2_databuf_lops;
+
+extern const struct gfs2_log_operations *gfs2_log_ops[];
+extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
+extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw);
+extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
+
+static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
+{
+	unsigned int limit;
+
+	limit = (sdp->sd_sb.sb_bsize - BUF_OFFSET) / sizeof(__be64);
+	return limit;
+}
+
+static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
+{
+	unsigned int limit;
+
+	limit = (sdp->sd_sb.sb_bsize - DATABUF_OFFSET) / (2 * sizeof(__be64));
+	return limit;
+}
+
+static inline void lops_before_commit(struct gfs2_sbd *sdp,
+				      struct gfs2_trans *tr)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_before_commit)
+			gfs2_log_ops[x]->lo_before_commit(sdp, tr);
+}
+
+static inline void lops_after_commit(struct gfs2_sbd *sdp,
+				     struct gfs2_trans *tr)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_after_commit)
+			gfs2_log_ops[x]->lo_after_commit(sdp, tr);
+}
+
+static inline void lops_before_scan(struct gfs2_jdesc *jd,
+				    struct gfs2_log_header_host *head,
+				    unsigned int pass)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_before_scan)
+			gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
+}
+
+static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+				     struct gfs2_log_descriptor *ld,
+				     __be64 *ptr,
+				     unsigned int pass)
+{
+	int x, error;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_scan_elements) {
+			error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
+								  ld, ptr, pass);
+			if (error)
+				return error;
+		}
+
+	return 0;
+}
+
+static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
+				   unsigned int pass)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_before_scan)
+			gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
+}
+
+#endif /* __LOPS_DOT_H__ */
+
diff --git a/kernel/fs/gfs2/main.c b/kernel/fs/gfs2/main.c
new file mode 100644
index 000000000..241a399bf
--- /dev/null
+++ b/kernel/fs/gfs2/main.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/atomic.h>
+#include <linux/mempool.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "super.h"
+#include "sys.h"
+#include "util.h"
+#include "glock.h"
+#include "quota.h"
+#include "recovery.h"
+#include "dir.h"
+#include "glops.h"
+
+struct workqueue_struct *gfs2_control_wq;
+
+static void gfs2_init_inode_once(void *foo)
+{
+	struct gfs2_inode *ip = foo;
+
+	inode_init_once(&ip->i_inode);
+	init_rwsem(&ip->i_rw_mutex);
+	INIT_LIST_HEAD(&ip->i_trunc_list);
+	ip->i_res = NULL;
+	ip->i_hash_cache = NULL;
+}
+
+static void gfs2_init_glock_once(void *foo)
+{
+	struct gfs2_glock *gl = foo;
+
+	INIT_HLIST_BL_NODE(&gl->gl_list);
+	spin_lock_init(&gl->gl_spin);
+	INIT_LIST_HEAD(&gl->gl_holders);
+	INIT_LIST_HEAD(&gl->gl_lru);
+	INIT_LIST_HEAD(&gl->gl_ail_list);
+	atomic_set(&gl->gl_ail_count, 0);
+	atomic_set(&gl->gl_revokes, 0);
+}
+
+static void gfs2_init_gl_aspace_once(void *foo)
+{
+	struct gfs2_glock *gl = foo;
+	struct address_space *mapping = (struct address_space *)(gl + 1);
+
+	gfs2_init_glock_once(gl);
+	address_space_init_once(mapping);
+}
+
+/**
+ * init_gfs2_fs - Register GFS2 as a filesystem
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int __init init_gfs2_fs(void)
+{
+	int error;
+
+	gfs2_str2qstr(&gfs2_qdot, ".");
+	gfs2_str2qstr(&gfs2_qdotdot, "..");
+	gfs2_quota_hash_init();
+
+	error = gfs2_sys_init();
+	if (error)
+		return error;
+
+	error = list_lru_init(&gfs2_qd_lru);
+	if (error)
+		goto fail_lru;
+
+	error = gfs2_glock_init();
+	if (error)
+		goto fail;
+
+	error = -ENOMEM;
+	gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
+					      sizeof(struct gfs2_glock),
+					      0, 0,
+					      gfs2_init_glock_once);
+	if (!gfs2_glock_cachep)
+		goto fail;
+
+	gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
+					sizeof(struct gfs2_glock) +
+					sizeof(struct address_space),
+					0, 0, gfs2_init_gl_aspace_once);
+
+	if (!gfs2_glock_aspace_cachep)
+		goto fail;
+
+	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
+					      sizeof(struct gfs2_inode),
+					      0,  SLAB_RECLAIM_ACCOUNT|
+					          SLAB_MEM_SPREAD,
+					      gfs2_init_inode_once);
+	if (!gfs2_inode_cachep)
+		goto fail;
+
+	gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
+						sizeof(struct gfs2_bufdata),
+					        0, 0, NULL);
+	if (!gfs2_bufdata_cachep)
+		goto fail;
+
+	gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd",
+					      sizeof(struct gfs2_rgrpd),
+					      0, 0, NULL);
+	if (!gfs2_rgrpd_cachep)
+		goto fail;
+
+	gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
+					       sizeof(struct gfs2_quota_data),
+					       0, 0, NULL);
+	if (!gfs2_quotad_cachep)
+		goto fail;
+
+	gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk",
+					     sizeof(struct gfs2_blkreserv),
+					       0, 0, NULL);
+	if (!gfs2_rsrv_cachep)
+		goto fail;
+
+	register_shrinker(&gfs2_qd_shrinker);
+
+	error = register_filesystem(&gfs2_fs_type);
+	if (error)
+		goto fail;
+
+	error = register_filesystem(&gfs2meta_fs_type);
+	if (error)
+		goto fail_unregister;
+
+	error = -ENOMEM;
+	gfs_recovery_wq = alloc_workqueue("gfs_recovery",
+					  WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
+	if (!gfs_recovery_wq)
+		goto fail_wq;
+
+	gfs2_control_wq = alloc_workqueue("gfs2_control",
+					  WQ_UNBOUND | WQ_FREEZABLE, 0);
+	if (!gfs2_control_wq)
+		goto fail_recovery;
+
+	gfs2_freeze_wq = alloc_workqueue("freeze_workqueue", 0, 0);
+
+	if (!gfs2_freeze_wq)
+		goto fail_control;
+
+	gfs2_page_pool = mempool_create_page_pool(64, 0);
+	if (!gfs2_page_pool)
+		goto fail_freeze;
+
+	gfs2_register_debugfs();
+
+	pr_info("GFS2 installed\n");
+
+	return 0;
+
+fail_freeze:
+	destroy_workqueue(gfs2_freeze_wq);
+fail_control:
+	destroy_workqueue(gfs2_control_wq);
+fail_recovery:
+	destroy_workqueue(gfs_recovery_wq);
+fail_wq:
+	unregister_filesystem(&gfs2meta_fs_type);
+fail_unregister:
+	unregister_filesystem(&gfs2_fs_type);
+fail:
+	list_lru_destroy(&gfs2_qd_lru);
+fail_lru:
+	unregister_shrinker(&gfs2_qd_shrinker);
+	gfs2_glock_exit();
+
+	if (gfs2_rsrv_cachep)
+		kmem_cache_destroy(gfs2_rsrv_cachep);
+
+	if (gfs2_quotad_cachep)
+		kmem_cache_destroy(gfs2_quotad_cachep);
+
+	if (gfs2_rgrpd_cachep)
+		kmem_cache_destroy(gfs2_rgrpd_cachep);
+
+	if (gfs2_bufdata_cachep)
+		kmem_cache_destroy(gfs2_bufdata_cachep);
+
+	if (gfs2_inode_cachep)
+		kmem_cache_destroy(gfs2_inode_cachep);
+
+	if (gfs2_glock_aspace_cachep)
+		kmem_cache_destroy(gfs2_glock_aspace_cachep);
+
+	if (gfs2_glock_cachep)
+		kmem_cache_destroy(gfs2_glock_cachep);
+
+	gfs2_sys_uninit();
+	return error;
+}
+
+/**
+ * exit_gfs2_fs - Unregister the file system
+ *
+ */
+
+static void __exit exit_gfs2_fs(void)
+{
+	unregister_shrinker(&gfs2_qd_shrinker);
+	gfs2_glock_exit();
+	gfs2_unregister_debugfs();
+	unregister_filesystem(&gfs2_fs_type);
+	unregister_filesystem(&gfs2meta_fs_type);
+	destroy_workqueue(gfs_recovery_wq);
+	destroy_workqueue(gfs2_control_wq);
+	destroy_workqueue(gfs2_freeze_wq);
+	list_lru_destroy(&gfs2_qd_lru);
+
+	rcu_barrier();
+
+	mempool_destroy(gfs2_page_pool);
+	kmem_cache_destroy(gfs2_rsrv_cachep);
+	kmem_cache_destroy(gfs2_quotad_cachep);
+	kmem_cache_destroy(gfs2_rgrpd_cachep);
+	kmem_cache_destroy(gfs2_bufdata_cachep);
+	kmem_cache_destroy(gfs2_inode_cachep);
+	kmem_cache_destroy(gfs2_glock_aspace_cachep);
+	kmem_cache_destroy(gfs2_glock_cachep);
+
+	gfs2_sys_uninit();
+}
+
+MODULE_DESCRIPTION("Global File System");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+module_init(init_gfs2_fs);
+module_exit(exit_gfs2_fs);
+
diff --git a/kernel/fs/gfs2/meta_io.c b/kernel/fs/gfs2/meta_io.c
new file mode 100644
index 000000000..b984a6e19
--- /dev/null
+++ b/kernel/fs/gfs2/meta_io.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/delay.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "trace_gfs2.h"
+
+static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct buffer_head *bh, *head;
+	int nr_underway = 0;
+	int write_op = REQ_META | REQ_PRIO |
+		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!page_has_buffers(page));
+
+	head = page_buffers(page);
+	bh = head;
+
+	do {
+		if (!buffer_mapped(bh))
+			continue;
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from flusher thread and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE) {
+			lock_buffer(bh);
+		} else if (!trylock_buffer(bh)) {
+			redirty_page_for_writepage(wbc, page);
+			continue;
+		}
+		if (test_clear_buffer_dirty(bh)) {
+			mark_buffer_async_write(bh);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	/*
+	 * The page and its buffers are protected by PageWriteback(), so we can
+	 * drop the bh refcounts early.
+	 */
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(write_op, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+
+	if (nr_underway == 0)
+		end_page_writeback(page);
+
+	return 0;
+}
+
+const struct address_space_operations gfs2_meta_aops = {
+	.writepage = gfs2_aspace_writepage,
+	.releasepage = gfs2_releasepage,
+};
+
+const struct address_space_operations gfs2_rgrp_aops = {
+	.writepage = gfs2_aspace_writepage,
+	.releasepage = gfs2_releasepage,
+};
+
+/**
+ * gfs2_getbuf - Get a buffer with a given address space
+ * @gl: the glock
+ * @blkno: the block number (filesystem scope)
+ * @create: 1 if the buffer should be created
+ *
+ * Returns: the buffer
+ */
+
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+{
+	struct address_space *mapping = gfs2_glock2aspace(gl);
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct page *page;
+	struct buffer_head *bh;
+	unsigned int shift;
+	unsigned long index;
+	unsigned int bufnum;
+
+	if (mapping == NULL)
+		mapping = &sdp->sd_aspace;
+
+	shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+	index = blkno >> shift;             /* convert block to page */
+	bufnum = blkno - (index << shift);  /* block buf index within page */
+
+	if (create) {
+		for (;;) {
+			page = grab_cache_page(mapping, index);
+			if (page)
+				break;
+			yield();
+		}
+	} else {
+		page = find_get_page_flags(mapping, index,
+						FGP_LOCK|FGP_ACCESSED);
+		if (!page)
+			return NULL;
+	}
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
+
+	/* Locate header for our buffer within our page */
+	for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
+		/* Do nothing */;
+	get_bh(bh);
+
+	if (!buffer_mapped(bh))
+		map_bh(bh, sdp->sd_vfs, blkno);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return bh;
+}
+
+static void meta_prep_new(struct buffer_head *bh)
+{
+	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+
+	lock_buffer(bh);
+	clear_buffer_dirty(bh);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
+}
+
+/**
+ * gfs2_meta_new - Get a block
+ * @gl: The glock associated with this block
+ * @blkno: The block number
+ *
+ * Returns: The buffer
+ */
+
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
+{
+	struct buffer_head *bh;
+	bh = gfs2_getbuf(gl, blkno, CREATE);
+	meta_prep_new(bh);
+	return bh;
+}
+
+/**
+ * gfs2_meta_read - Read a block from disk
+ * @gl: The glock covering the block
+ * @blkno: The block number
+ * @flags: flags
+ * @bhp: the place where the buffer is returned (NULL on failure)
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+		   struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct buffer_head *bh;
+
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+		*bhp = NULL;
+		return -EIO;
+	}
+
+	*bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
+
+	lock_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		return 0;
+	}
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);
+	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
+	if (!(flags & DIO_WAIT))
+		return 0;
+
+	wait_on_buffer(bh);
+	if (unlikely(!buffer_uptodate(bh))) {
+		struct gfs2_trans *tr = current->journal_info;
+		if (tr && tr->tr_touched)
+			gfs2_io_error_bh(sdp, bh);
+		brelse(bh);
+		*bhp = NULL;
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_meta_wait - Reread a block from disk
+ * @sdp: the filesystem
+ * @bh: The block to wait for
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	wait_on_buffer(bh);
+
+	if (!buffer_uptodate(bh)) {
+		struct gfs2_trans *tr = current->journal_info;
+		if (tr && tr->tr_touched)
+			gfs2_io_error_bh(sdp, bh);
+		return -EIO;
+	}
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	return 0;
+}
+
+void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
+{
+	struct address_space *mapping = bh->b_page->mapping;
+	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
+	struct gfs2_bufdata *bd = bh->b_private;
+	int was_pinned = 0;
+
+	if (test_clear_buffer_pinned(bh)) {
+		trace_gfs2_pin(bd, 0);
+		atomic_dec(&sdp->sd_log_pinned);
+		list_del_init(&bd->bd_list);
+		if (meta)
+			tr->tr_num_buf_rm++;
+		else
+			tr->tr_num_databuf_rm++;
+		tr->tr_touched = 1;
+		was_pinned = 1;
+		brelse(bh);
+	}
+	if (bd) {
+		spin_lock(&sdp->sd_ail_lock);
+		if (bd->bd_tr) {
+			gfs2_trans_add_revoke(sdp, bd);
+		} else if (was_pinned) {
+			bh->b_private = NULL;
+			kmem_cache_free(gfs2_bufdata_cachep, bd);
+		}
+		spin_unlock(&sdp->sd_ail_lock);
+	}
+	clear_buffer_dirty(bh);
+	clear_buffer_uptodate(bh);
+}
+
+/**
+ * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
+ * @ip: the inode who owns the buffers
+ * @bstart: the first buffer in the run
+ * @blen: the number of buffers in the run
+ *
+ */
+
+void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *bh;
+
+	while (blen) {
+		bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
+		if (bh) {
+			lock_buffer(bh);
+			gfs2_log_lock(sdp);
+			gfs2_remove_from_journal(bh, current->journal_info, 1);
+			gfs2_log_unlock(sdp);
+			unlock_buffer(bh);
+			brelse(bh);
+		}
+
+		bstart++;
+		blen--;
+	}
+}
+
+/**
+ * gfs2_meta_indirect_buffer - Get a metadata buffer
+ * @ip: The GFS2 inode
+ * @height: The level of this buf in the metadata (indir addr) tree (if any)
+ * @num: The block number (device relative) of the buffer
+ * @bhp: the buffer is returned here
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+			      struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	struct buffer_head *bh;
+	int ret = 0;
+	u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+
+	ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+	if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
+		brelse(bh);
+		ret = -EIO;
+	}
+	*bhp = bh;
+	return ret;
+}
+
+/**
+ * gfs2_meta_ra - start readahead on an extent of a file
+ * @gl: the glock the blocks belong to
+ * @dblock: the starting disk block
+ * @extlen: the number of blocks in the extent
+ *
+ * returns: the first buffer in the extent
+ */
+
+struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct buffer_head *first_bh, *bh;
+	u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
+			  sdp->sd_sb.sb_bsize_shift;
+
+	BUG_ON(!extlen);
+
+	if (max_ra < 1)
+		max_ra = 1;
+	if (extlen > max_ra)
+		extlen = max_ra;
+
+	first_bh = gfs2_getbuf(gl, dblock, CREATE);
+
+	if (buffer_uptodate(first_bh))
+		goto out;
+	if (!buffer_locked(first_bh))
+		ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
+
+	dblock++;
+	extlen--;
+
+	while (extlen) {
+		bh = gfs2_getbuf(gl, dblock, CREATE);
+
+		if (!buffer_uptodate(bh) && !buffer_locked(bh))
+			ll_rw_block(READA | REQ_META, 1, &bh);
+		brelse(bh);
+		dblock++;
+		extlen--;
+		if (!buffer_locked(first_bh) && buffer_uptodate(first_bh))
+			goto out;
+	}
+
+	wait_on_buffer(first_bh);
+out:
+	return first_bh;
+}
+
diff --git a/kernel/fs/gfs2/meta_io.h b/kernel/fs/gfs2/meta_io.h
new file mode 100644
index 000000000..ac5d8027d
--- /dev/null
+++ b/kernel/fs/gfs2/meta_io.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DIO_DOT_H__
+#define __DIO_DOT_H__
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include "incore.h"
+
+static inline void gfs2_buffer_clear(struct buffer_head *bh)
+{
+	memset(bh->b_data, 0, bh->b_size);
+}
+
+static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
+{
+	BUG_ON(head > bh->b_size);
+	memset(bh->b_data + head, 0, bh->b_size - head);
+}
+
+static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
+					 int to_head,
+					 struct buffer_head *from_bh,
+					 int from_head)
+{
+	BUG_ON(from_head < to_head);
+	memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head,
+	       from_bh->b_size - from_head);
+	memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
+	       0, from_head - to_head);
+}
+
+extern const struct address_space_operations gfs2_meta_aops;
+extern const struct address_space_operations gfs2_rgrp_aops;
+
+static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	if (mapping->a_ops == &gfs2_meta_aops)
+		return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+	else if (mapping->a_ops == &gfs2_rgrp_aops)
+		return container_of(mapping, struct gfs2_sbd, sd_aspace);
+	else
+		return inode->i_sb->s_fs_info;
+}
+
+extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+			  struct buffer_head **bhp);
+extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
+				       int create);
+extern void gfs2_remove_from_journal(struct buffer_head *bh,
+				     struct gfs2_trans *tr, int meta);
+extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+				     struct buffer_head **bhp);
+
+static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
+					 struct buffer_head **bhp)
+{
+	return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, bhp);
+}
+
+struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
+
+#define buffer_busy(bh) \
+((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
+
+#endif /* __DIO_DOT_H__ */
+
diff --git a/kernel/fs/gfs2/ops_fstype.c b/kernel/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000..35b49f44c
--- /dev/null
+++ b/kernel/fs/gfs2/ops_fstype.c
@@ -0,0 +1,1409 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/export.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/quotaops.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "sys.h"
+#include "util.h"
+#include "log.h"
+#include "quota.h"
+#include "dir.h"
+#include "meta_io.h"
+#include "trace_gfs2.h"
+
+#define DO 0
+#define UNDO 1
+
+/**
+ * gfs2_tune_init - Fill a gfs2_tune structure with default values
+ * @gt: tune
+ *
+ */
+
+static void gfs2_tune_init(struct gfs2_tune *gt)
+{
+	spin_lock_init(&gt->gt_spin);
+
+	gt->gt_quota_warn_period = 10;
+	gt->gt_quota_scale_num = 1;
+	gt->gt_quota_scale_den = 1;
+	gt->gt_new_files_jdata = 0;
+	gt->gt_max_readahead = 1 << 18;
+	gt->gt_complain_secs = 10;
+}
+
+static struct gfs2_sbd *init_sbd(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp;
+	struct address_space *mapping;
+
+	sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
+	if (!sdp)
+		return NULL;
+
+	sb->s_fs_info = sdp;
+	sdp->sd_vfs = sb;
+	sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats);
+	if (!sdp->sd_lkstats) {
+		kfree(sdp);
+		return NULL;
+	}
+
+	set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
+	gfs2_tune_init(&sdp->sd_tune);
+
+	init_waitqueue_head(&sdp->sd_glock_wait);
+	atomic_set(&sdp->sd_glock_disposal, 0);
+	init_completion(&sdp->sd_locking_init);
+	init_completion(&sdp->sd_wdack);
+	spin_lock_init(&sdp->sd_statfs_spin);
+
+	spin_lock_init(&sdp->sd_rindex_spin);
+	sdp->sd_rindex_tree.rb_node = NULL;
+
+	INIT_LIST_HEAD(&sdp->sd_jindex_list);
+	spin_lock_init(&sdp->sd_jindex_spin);
+	mutex_init(&sdp->sd_jindex_mutex);
+	init_completion(&sdp->sd_journal_ready);
+
+	INIT_LIST_HEAD(&sdp->sd_quota_list);
+	mutex_init(&sdp->sd_quota_mutex);
+	mutex_init(&sdp->sd_quota_sync_mutex);
+	init_waitqueue_head(&sdp->sd_quota_wait);
+	INIT_LIST_HEAD(&sdp->sd_trunc_list);
+	spin_lock_init(&sdp->sd_trunc_lock);
+	spin_lock_init(&sdp->sd_bitmap_lock);
+
+	mapping = &sdp->sd_aspace;
+
+	address_space_init_once(mapping);
+	mapping->a_ops = &gfs2_rgrp_aops;
+	mapping->host = sb->s_bdev->bd_inode;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	mapping->private_data = NULL;
+	mapping->writeback_index = 0;
+
+	spin_lock_init(&sdp->sd_log_lock);
+	atomic_set(&sdp->sd_log_pinned, 0);
+	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
+	INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
+	spin_lock_init(&sdp->sd_ordered_lock);
+
+	init_waitqueue_head(&sdp->sd_log_waitq);
+	init_waitqueue_head(&sdp->sd_logd_waitq);
+	spin_lock_init(&sdp->sd_ail_lock);
+	INIT_LIST_HEAD(&sdp->sd_ail1_list);
+	INIT_LIST_HEAD(&sdp->sd_ail2_list);
+
+	init_rwsem(&sdp->sd_log_flush_lock);
+	atomic_set(&sdp->sd_log_in_flight, 0);
+	atomic_set(&sdp->sd_reserving_log, 0);
+	init_waitqueue_head(&sdp->sd_reserving_log_wait);
+	init_waitqueue_head(&sdp->sd_log_flush_wait);
+	atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
+	mutex_init(&sdp->sd_freeze_mutex);
+
+	return sdp;
+}
+
+
+/**
+ * gfs2_check_sb - Check superblock
+ * @sdp: the filesystem
+ * @sb: The superblock
+ * @silent: Don't print a message if the check fails
+ *
+ * Checks the version code of the FS is one that we understand how to
+ * read and that the sizes of the various on-disk structures have not
+ * changed.
+ */
+
+static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
+{
+	struct gfs2_sb_host *sb = &sdp->sd_sb;
+
+	if (sb->sb_magic != GFS2_MAGIC ||
+	    sb->sb_type != GFS2_METATYPE_SB) {
+		if (!silent)
+			pr_warn("not a GFS2 filesystem\n");
+		return -EINVAL;
+	}
+
+	/*  If format numbers match exactly, we're done.  */
+
+	if (sb->sb_fs_format == GFS2_FORMAT_FS &&
+	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
+		return 0;
+
+	fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
+
+	return -EINVAL;
+}
+
+static void end_bio_io_page(struct bio *bio, int error)
+{
+	struct page *page = bio->bi_private;
+
+	if (!error)
+		SetPageUptodate(page);
+	else
+		pr_warn("error %d reading superblock\n", error);
+	unlock_page(page);
+}
+
+static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
+{
+	struct gfs2_sb_host *sb = &sdp->sd_sb;
+	struct super_block *s = sdp->sd_vfs;
+	const struct gfs2_sb *str = buf;
+
+	sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
+	sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
+	sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
+	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
+	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
+	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
+	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
+	sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
+	sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
+	sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
+	sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
+
+	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
+	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+	memcpy(s->s_uuid, str->sb_uuid, 16);
+}
+
+/**
+ * gfs2_read_super - Read the gfs2 super block from disk
+ * @sdp: The GFS2 super block
+ * @sector: The location of the super block
+ * @error: The error code to return
+ *
+ * This uses the bio functions to read the super block from disk
+ * because we want to be 100% sure that we never read cached data.
+ * A super block is read twice only during each GFS2 mount and is
+ * never written to by the filesystem. The first time its read no
+ * locks are held, and the only details which are looked at are those
+ * relating to the locking protocol. Once locking is up and working,
+ * the sb is read again under the lock to establish the location of
+ * the master directory (contains pointers to journals etc) and the
+ * root directory.
+ *
+ * Returns: 0 on success or error
+ */
+
+static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	struct gfs2_sb *p;
+	struct page *page;
+	struct bio *bio;
+
+	page = alloc_page(GFP_NOFS);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	ClearPageUptodate(page);
+	ClearPageDirty(page);
+	lock_page(page);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
+	bio->bi_bdev = sb->s_bdev;
+	bio_add_page(bio, page, PAGE_SIZE, 0);
+
+	bio->bi_end_io = end_bio_io_page;
+	bio->bi_private = page;
+	submit_bio(READ_SYNC | REQ_META, bio);
+	wait_on_page_locked(page);
+	bio_put(bio);
+	if (!PageUptodate(page)) {
+		__free_page(page);
+		return -EIO;
+	}
+	p = kmap(page);
+	gfs2_sb_in(sdp, p);
+	kunmap(page);
+	__free_page(page);
+	return gfs2_check_sb(sdp, silent);
+}
+
+/**
+ * gfs2_read_sb - Read super block
+ * @sdp: The GFS2 superblock
+ * @silent: Don't print message if mount fails
+ *
+ */
+
+static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
+{
+	u32 hash_blocks, ind_blocks, leaf_blocks;
+	u32 tmp_blocks;
+	unsigned int x;
+	int error;
+
+	error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
+	if (error) {
+		if (!silent)
+			fs_err(sdp, "can't read superblock\n");
+		return error;
+	}
+
+	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+			       GFS2_BASIC_BLOCK_SHIFT;
+	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+	sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_dinode)) / sizeof(u64);
+	sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_meta_header)) / sizeof(u64);
+	sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+	sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
+	sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
+	sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
+	sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_meta_header)) /
+			        sizeof(struct gfs2_quota_change);
+	sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize -
+				     sizeof(struct gfs2_meta_header))
+		* GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */
+
+	/* Compute maximum reservation required to add a entry to a directory */
+
+	hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+			     sdp->sd_jbsize);
+
+	ind_blocks = 0;
+	for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+		tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+		ind_blocks += tmp_blocks;
+	}
+
+	leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+
+	sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+
+	sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_dinode);
+	sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_heightsize[x - 1] || m)
+			break;
+		sdp->sd_heightsize[x] = space;
+	}
+	sdp->sd_max_height = x;
+	sdp->sd_heightsize[x] = ~0;
+	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+
+	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
+				 sizeof(struct gfs2_dinode);
+	sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_jheightsize[x - 1] || m)
+			break;
+		sdp->sd_jheightsize[x] = space;
+	}
+	sdp->sd_max_jheight = x;
+	sdp->sd_jheightsize[x] = ~0;
+	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+
+	return 0;
+}
+
+static int init_names(struct gfs2_sbd *sdp, int silent)
+{
+	char *proto, *table;
+	int error = 0;
+
+	proto = sdp->sd_args.ar_lockproto;
+	table = sdp->sd_args.ar_locktable;
+
+	/*  Try to autodetect  */
+
+	if (!proto[0] || !table[0]) {
+		error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
+		if (error)
+			return error;
+
+		if (!proto[0])
+			proto = sdp->sd_sb.sb_lockproto;
+		if (!table[0])
+			table = sdp->sd_sb.sb_locktable;
+	}
+
+	if (!table[0])
+		table = sdp->sd_vfs->s_id;
+
+	strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN);
+	strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN);
+
+	table = sdp->sd_table_name;
+	while ((table = strchr(table, '/')))
+		*table = '_';
+
+	return error;
+}
+
+static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
+			int undo)
+{
+	int error = 0;
+
+	if (undo)
+		goto fail_trans;
+
+	error = gfs2_glock_nq_num(sdp,
+				  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
+				  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
+				  mount_gh);
+	if (error) {
+		fs_err(sdp, "can't acquire mount glock: %d\n", error);
+		goto fail;
+	}
+
+	error = gfs2_glock_nq_num(sdp,
+				  GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
+				  LM_ST_SHARED,
+				  LM_FLAG_NOEXP | GL_EXACT,
+				  &sdp->sd_live_gh);
+	if (error) {
+		fs_err(sdp, "can't acquire live glock: %d\n", error);
+		goto fail_mount;
+	}
+
+	error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
+			       CREATE, &sdp->sd_rename_gl);
+	if (error) {
+		fs_err(sdp, "can't create rename glock: %d\n", error);
+		goto fail_live;
+	}
+
+	error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops,
+			       CREATE, &sdp->sd_freeze_gl);
+	if (error) {
+		fs_err(sdp, "can't create transaction glock: %d\n", error);
+		goto fail_rename;
+	}
+
+	return 0;
+
+fail_trans:
+	gfs2_glock_put(sdp->sd_freeze_gl);
+fail_rename:
+	gfs2_glock_put(sdp->sd_rename_gl);
+fail_live:
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+fail_mount:
+	gfs2_glock_dq_uninit(mount_gh);
+fail:
+	return error;
+}
+
+static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
+			    u64 no_addr, const char *name)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct dentry *dentry;
+	struct inode *inode;
+
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	if (IS_ERR(inode)) {
+		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
+		return PTR_ERR(inode);
+	}
+	dentry = d_make_root(inode);
+	if (!dentry) {
+		fs_err(sdp, "can't alloc %s dentry\n", name);
+		return -ENOMEM;
+	}
+	*dptr = dentry;
+	return 0;
+}
+
+static int init_sb(struct gfs2_sbd *sdp, int silent)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	struct gfs2_holder sb_gh;
+	u64 no_addr;
+	int ret;
+
+	ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
+				LM_ST_SHARED, 0, &sb_gh);
+	if (ret) {
+		fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
+		return ret;
+	}
+
+	ret = gfs2_read_sb(sdp, silent);
+	if (ret) {
+		fs_err(sdp, "can't read superblock: %d\n", ret);
+		goto out;
+	}
+
+	/* Set up the buffer cache and SB for real */
+	if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
+		ret = -EINVAL;
+		fs_err(sdp, "FS block size (%u) is too small for device "
+		       "block size (%u)\n",
+		       sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev));
+		goto out;
+	}
+	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
+		ret = -EINVAL;
+		fs_err(sdp, "FS block size (%u) is too big for machine "
+		       "page size (%u)\n",
+		       sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
+		goto out;
+	}
+	sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
+
+	/* Get the root inode */
+	no_addr = sdp->sd_sb.sb_root_dir.no_addr;
+	ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
+	if (ret)
+		goto out;
+
+	/* Get the master inode */
+	no_addr = sdp->sd_sb.sb_master_dir.no_addr;
+	ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
+	if (ret) {
+		dput(sdp->sd_root_dir);
+		goto out;
+	}
+	sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
+out:
+	gfs2_glock_dq_uninit(&sb_gh);
+	return ret;
+}
+
+static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
+{
+	char *message = "FIRSTMOUNT=Done";
+	char *envp[] = { message, NULL };
+
+	fs_info(sdp, "first mount done, others may mount\n");
+
+	if (sdp->sd_lockstruct.ls_ops->lm_first_done)
+		sdp->sd_lockstruct.ls_ops->lm_first_done(sdp);
+
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
+}
+
+/**
+ * gfs2_jindex_hold - Grab a lock on the jindex
+ * @sdp: The GFS2 superblock
+ * @ji_gh: the holder for the jindex glock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
+{
+	struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
+	struct qstr name;
+	char buf[20];
+	struct gfs2_jdesc *jd;
+	int error;
+
+	name.name = buf;
+
+	mutex_lock(&sdp->sd_jindex_mutex);
+
+	for (;;) {
+		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
+		if (error)
+			break;
+
+		name.len = sprintf(buf, "journal%u", sdp->sd_journals);
+		name.hash = gfs2_disk_hash(name.name, name.len);
+
+		error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
+		if (error == -ENOENT) {
+			error = 0;
+			break;
+		}
+
+		gfs2_glock_dq_uninit(ji_gh);
+
+		if (error)
+			break;
+
+		error = -ENOMEM;
+		jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
+		if (!jd)
+			break;
+
+		INIT_LIST_HEAD(&jd->extent_list);
+		INIT_LIST_HEAD(&jd->jd_revoke_list);
+
+		INIT_WORK(&jd->jd_work, gfs2_recover_func);
+		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
+		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+			if (!jd->jd_inode)
+				error = -ENOENT;
+			else
+				error = PTR_ERR(jd->jd_inode);
+			kfree(jd);
+			break;
+		}
+
+		spin_lock(&sdp->sd_jindex_spin);
+		jd->jd_jid = sdp->sd_journals++;
+		list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
+		spin_unlock(&sdp->sd_jindex_spin);
+	}
+
+	mutex_unlock(&sdp->sd_jindex_mutex);
+
+	return error;
+}
+
+/**
+ * check_journal_clean - Make sure a journal is clean for a spectator mount
+ * @sdp: The GFS2 superblock
+ * @jd: The journal descriptor
+ *
+ * Returns: 0 if the journal is clean or locked, else an error
+ */
+static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
+{
+	int error;
+	struct gfs2_holder j_gh;
+	struct gfs2_log_header_host head;
+	struct gfs2_inode *ip;
+
+	ip = GFS2_I(jd->jd_inode);
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP |
+				   GL_EXACT | GL_NOCACHE, &j_gh);
+	if (error) {
+		fs_err(sdp, "Error locking journal for spectator mount.\n");
+		return -EPERM;
+	}
+	error = gfs2_jdesc_check(jd);
+	if (error) {
+		fs_err(sdp, "Error checking journal for spectator mount.\n");
+		goto out_unlock;
+	}
+	error = gfs2_find_jhead(jd, &head);
+	if (error) {
+		fs_err(sdp, "Error parsing journal for spectator mount.\n");
+		goto out_unlock;
+	}
+	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+		error = -EPERM;
+		fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter "
+		       "must not be a spectator.\n", jd->jd_jid);
+	}
+
+out_unlock:
+	gfs2_glock_dq_uninit(&j_gh);
+	return error;
+}
+
+static int init_journal(struct gfs2_sbd *sdp, int undo)
+{
+	struct inode *master = d_inode(sdp->sd_master_dir);
+	struct gfs2_holder ji_gh;
+	struct gfs2_inode *ip;
+	int jindex = 1;
+	int error = 0;
+
+	if (undo) {
+		jindex = 0;
+		goto fail_jinode_gh;
+	}
+
+	sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
+	if (IS_ERR(sdp->sd_jindex)) {
+		fs_err(sdp, "can't lookup journal index: %d\n", error);
+		return PTR_ERR(sdp->sd_jindex);
+	}
+
+	/* Load in the journal index special file */
+
+	error = gfs2_jindex_hold(sdp, &ji_gh);
+	if (error) {
+		fs_err(sdp, "can't read journal index: %d\n", error);
+		goto fail;
+	}
+
+	error = -EUSERS;
+	if (!gfs2_jindex_size(sdp)) {
+		fs_err(sdp, "no journals!\n");
+		goto fail_jindex;
+	}
+
+	if (sdp->sd_args.ar_spectator) {
+		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
+		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
+	} else {
+		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
+			fs_err(sdp, "can't mount journal #%u\n",
+			       sdp->sd_lockstruct.ls_jid);
+			fs_err(sdp, "there are only %u journals (0 - %u)\n",
+			       gfs2_jindex_size(sdp),
+			       gfs2_jindex_size(sdp) - 1);
+			goto fail_jindex;
+		}
+		sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
+
+		error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
+					  &gfs2_journal_glops,
+					  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
+					  &sdp->sd_journal_gh);
+		if (error) {
+			fs_err(sdp, "can't acquire journal glock: %d\n", error);
+			goto fail_jindex;
+		}
+
+		ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+					   LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE,
+					   &sdp->sd_jinode_gh);
+		if (error) {
+			fs_err(sdp, "can't acquire journal inode glock: %d\n",
+			       error);
+			goto fail_journal_gh;
+		}
+
+		error = gfs2_jdesc_check(sdp->sd_jdesc);
+		if (error) {
+			fs_err(sdp, "my journal (%u) is bad: %d\n",
+			       sdp->sd_jdesc->jd_jid, error);
+			goto fail_jinode_gh;
+		}
+		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
+
+		/* Map the extents for this journal's blocks */
+		gfs2_map_journal_extents(sdp, sdp->sd_jdesc);
+	}
+	trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
+
+	if (sdp->sd_lockstruct.ls_first) {
+		unsigned int x;
+		for (x = 0; x < sdp->sd_journals; x++) {
+			struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x);
+
+			if (sdp->sd_args.ar_spectator) {
+				error = check_journal_clean(sdp, jd);
+				if (error)
+					goto fail_jinode_gh;
+				continue;
+			}
+			error = gfs2_recover_journal(jd, true);
+			if (error) {
+				fs_err(sdp, "error recovering journal %u: %d\n",
+				       x, error);
+				goto fail_jinode_gh;
+			}
+		}
+
+		gfs2_others_may_mount(sdp);
+	} else if (!sdp->sd_args.ar_spectator) {
+		error = gfs2_recover_journal(sdp->sd_jdesc, true);
+		if (error) {
+			fs_err(sdp, "error recovering my journal: %d\n", error);
+			goto fail_jinode_gh;
+		}
+	}
+
+	set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
+	gfs2_glock_dq_uninit(&ji_gh);
+	jindex = 0;
+	INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func);
+	return 0;
+
+fail_jinode_gh:
+	if (!sdp->sd_args.ar_spectator)
+		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+fail_journal_gh:
+	if (!sdp->sd_args.ar_spectator)
+		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+fail_jindex:
+	gfs2_jindex_free(sdp);
+	if (jindex)
+		gfs2_glock_dq_uninit(&ji_gh);
+fail:
+	iput(sdp->sd_jindex);
+	return error;
+}
+
+static struct lock_class_key gfs2_quota_imutex_key;
+
+static int init_inodes(struct gfs2_sbd *sdp, int undo)
+{
+	int error = 0;
+	struct inode *master = d_inode(sdp->sd_master_dir);
+
+	if (undo)
+		goto fail_qinode;
+
+	error = init_journal(sdp, undo);
+	complete_all(&sdp->sd_journal_ready);
+	if (error)
+		goto fail;
+
+	/* Read in the master statfs inode */
+	sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
+	if (IS_ERR(sdp->sd_statfs_inode)) {
+		error = PTR_ERR(sdp->sd_statfs_inode);
+		fs_err(sdp, "can't read in statfs inode: %d\n", error);
+		goto fail_journal;
+	}
+
+	/* Read in the resource index inode */
+	sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
+	if (IS_ERR(sdp->sd_rindex)) {
+		error = PTR_ERR(sdp->sd_rindex);
+		fs_err(sdp, "can't get resource index inode: %d\n", error);
+		goto fail_statfs;
+	}
+	sdp->sd_rindex_uptodate = 0;
+
+	/* Read in the quota inode */
+	sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
+	if (IS_ERR(sdp->sd_quota_inode)) {
+		error = PTR_ERR(sdp->sd_quota_inode);
+		fs_err(sdp, "can't get quota file inode: %d\n", error);
+		goto fail_rindex;
+	}
+	/*
+	 * i_mutex on quota files is special. Since this inode is hidden system
+	 * file, we are safe to define locking ourselves.
+	 */
+	lockdep_set_class(&sdp->sd_quota_inode->i_mutex,
+			  &gfs2_quota_imutex_key);
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		goto fail_qinode;
+
+	return 0;
+
+fail_qinode:
+	iput(sdp->sd_quota_inode);
+fail_rindex:
+	gfs2_clear_rgrpd(sdp);
+	iput(sdp->sd_rindex);
+fail_statfs:
+	iput(sdp->sd_statfs_inode);
+fail_journal:
+	init_journal(sdp, UNDO);
+fail:
+	return error;
+}
+
+static int init_per_node(struct gfs2_sbd *sdp, int undo)
+{
+	struct inode *pn = NULL;
+	char buf[30];
+	int error = 0;
+	struct gfs2_inode *ip;
+	struct inode *master = d_inode(sdp->sd_master_dir);
+
+	if (sdp->sd_args.ar_spectator)
+		return 0;
+
+	if (undo)
+		goto fail_qc_gh;
+
+	pn = gfs2_lookup_simple(master, "per_node");
+	if (IS_ERR(pn)) {
+		error = PTR_ERR(pn);
+		fs_err(sdp, "can't find per_node directory: %d\n", error);
+		return error;
+	}
+
+	sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
+	sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
+	if (IS_ERR(sdp->sd_sc_inode)) {
+		error = PTR_ERR(sdp->sd_sc_inode);
+		fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
+		goto fail;
+	}
+
+	sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
+	sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
+	if (IS_ERR(sdp->sd_qc_inode)) {
+		error = PTR_ERR(sdp->sd_qc_inode);
+		fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
+		goto fail_ut_i;
+	}
+
+	iput(pn);
+	pn = NULL;
+
+	ip = GFS2_I(sdp->sd_sc_inode);
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
+				   &sdp->sd_sc_gh);
+	if (error) {
+		fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
+		goto fail_qc_i;
+	}
+
+	ip = GFS2_I(sdp->sd_qc_inode);
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
+				   &sdp->sd_qc_gh);
+	if (error) {
+		fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
+		goto fail_ut_gh;
+	}
+
+	return 0;
+
+fail_qc_gh:
+	gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+fail_ut_gh:
+	gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+fail_qc_i:
+	iput(sdp->sd_qc_inode);
+fail_ut_i:
+	iput(sdp->sd_sc_inode);
+fail:
+	if (pn)
+		iput(pn);
+	return error;
+}
+
+static const match_table_t nolock_tokens = {
+	{ Opt_jid, "jid=%d\n", },
+	{ Opt_err, NULL },
+};
+
+static const struct lm_lockops nolock_ops = {
+	.lm_proto_name = "lock_nolock",
+	.lm_put_lock = gfs2_glock_free,
+	.lm_tokens = &nolock_tokens,
+};
+
+/**
+ * gfs2_lm_mount - mount a locking protocol
+ * @sdp: the filesystem
+ * @args: mount arguments
+ * @silent: if 1, don't complain if the FS isn't a GFS2 fs
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
+{
+	const struct lm_lockops *lm;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	struct gfs2_args *args = &sdp->sd_args;
+	const char *proto = sdp->sd_proto_name;
+	const char *table = sdp->sd_table_name;
+	char *o, *options;
+	int ret;
+
+	if (!strcmp("lock_nolock", proto)) {
+		lm = &nolock_ops;
+		sdp->sd_args.ar_localflocks = 1;
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+	} else if (!strcmp("lock_dlm", proto)) {
+		lm = &gfs2_dlm_ops;
+#endif
+	} else {
+		pr_info("can't find protocol %s\n", proto);
+		return -ENOENT;
+	}
+
+	fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
+
+	ls->ls_ops = lm;
+	ls->ls_first = 1;
+
+	for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
+		substring_t tmp[MAX_OPT_ARGS];
+		int token, option;
+
+		if (!o || !*o)
+			continue;
+
+		token = match_token(o, *lm->lm_tokens, tmp);
+		switch (token) {
+		case Opt_jid:
+			ret = match_int(&tmp[0], &option);
+			if (ret || option < 0) 
+				goto hostdata_error;
+			if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags))
+				ls->ls_jid = option;
+			break;
+		case Opt_id:
+		case Opt_nodir:
+			/* Obsolete, but left for backward compat purposes */
+			break;
+		case Opt_first:
+			ret = match_int(&tmp[0], &option);
+			if (ret || (option != 0 && option != 1))
+				goto hostdata_error;
+			ls->ls_first = option;
+			break;
+		case Opt_err:
+		default:
+hostdata_error:
+			fs_info(sdp, "unknown hostdata (%s)\n", o);
+			return -EINVAL;
+		}
+	}
+
+	if (lm->lm_mount == NULL) {
+		fs_info(sdp, "Now mounting FS...\n");
+		complete_all(&sdp->sd_locking_init);
+		return 0;
+	}
+	ret = lm->lm_mount(sdp, table);
+	if (ret == 0)
+		fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+	complete_all(&sdp->sd_locking_init);
+	return ret;
+}
+
+void gfs2_lm_unmount(struct gfs2_sbd *sdp)
+{
+	const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+	    lm->lm_unmount)
+		lm->lm_unmount(sdp);
+}
+
+static int wait_on_journal(struct gfs2_sbd *sdp)
+{
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		return 0;
+
+	return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE)
+		? -EINTR : 0;
+}
+
+void gfs2_online_uevent(struct gfs2_sbd *sdp)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	char ro[20];
+	char spectator[20];
+	char *envp[] = { ro, spectator, NULL };
+	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
+}
+
+/**
+ * fill_super - Read in superblock
+ * @sb: The VFS superblock
+ * @data: Mount options
+ * @silent: Don't complain if it's not a GFS2 filesystem
+ *
+ * Returns: errno
+ */
+
+static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
+{
+	struct gfs2_sbd *sdp;
+	struct gfs2_holder mount_gh;
+	int error;
+
+	sdp = init_sbd(sb);
+	if (!sdp) {
+		pr_warn("can't alloc struct gfs2_sbd\n");
+		return -ENOMEM;
+	}
+	sdp->sd_args = *args;
+
+	if (sdp->sd_args.ar_spectator) {
+                sb->s_flags |= MS_RDONLY;
+		set_bit(SDF_RORECOVERY, &sdp->sd_flags);
+	}
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= MS_POSIXACL;
+	if (sdp->sd_args.ar_nobarrier)
+		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+
+	sb->s_flags |= MS_NOSEC;
+	sb->s_magic = GFS2_MAGIC;
+	sb->s_op = &gfs2_super_ops;
+	sb->s_d_op = &gfs2_dops;
+	sb->s_export_op = &gfs2_export_ops;
+	sb->s_xattr = gfs2_xattr_handlers;
+	sb->s_qcop = &gfs2_quotactl_ops;
+	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+	sb->s_time_gran = 1;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	/* Set up the buffer cache and fill in some fake block size values
+	   to allow us to read-in the on-disk superblock. */
+	sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
+	sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
+	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+                               GFS2_BASIC_BLOCK_SHIFT;
+	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+
+	sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
+	sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
+	if (sdp->sd_args.ar_statfs_quantum) {
+		sdp->sd_tune.gt_statfs_slow = 0;
+		sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
+	} else {
+		sdp->sd_tune.gt_statfs_slow = 1;
+		sdp->sd_tune.gt_statfs_quantum = 30;
+	}
+
+	error = init_names(sdp, silent);
+	if (error) {
+		/* In this case, we haven't initialized sysfs, so we have to
+		   manually free the sdp. */
+		free_percpu(sdp->sd_lkstats);
+		kfree(sdp);
+		sb->s_fs_info = NULL;
+		return error;
+	}
+
+	snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+
+	error = gfs2_sys_fs_add(sdp);
+	/*
+	 * If we hit an error here, gfs2_sys_fs_add will have called function
+	 * kobject_put which causes the sysfs usage count to go to zero, which
+	 * causes sysfs to call function gfs2_sbd_release, which frees sdp.
+	 * Subsequent error paths here will call gfs2_sys_fs_del, which also
+	 * kobject_put to free sdp.
+	 */
+	if (error)
+		return error;
+
+	gfs2_create_debugfs_file(sdp);
+
+	error = gfs2_lm_mount(sdp, silent);
+	if (error)
+		goto fail_debug;
+
+	error = init_locking(sdp, &mount_gh, DO);
+	if (error)
+		goto fail_lm;
+
+	error = init_sb(sdp, silent);
+	if (error)
+		goto fail_locking;
+
+	error = wait_on_journal(sdp);
+	if (error)
+		goto fail_sb;
+
+	/*
+	 * If user space has failed to join the cluster or some similar
+	 * failure has occurred, then the journal id will contain a
+	 * negative (error) number. This will then be returned to the
+	 * caller (of the mount syscall). We do this even for spectator
+	 * mounts (which just write a jid of 0 to indicate "ok" even though
+	 * the jid is unused in the spectator case)
+	 */
+	if (sdp->sd_lockstruct.ls_jid < 0) {
+		error = sdp->sd_lockstruct.ls_jid;
+		sdp->sd_lockstruct.ls_jid = 0;
+		goto fail_sb;
+	}
+
+	if (sdp->sd_args.ar_spectator)
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+			 sdp->sd_table_name);
+	else
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+			 sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
+
+	error = init_inodes(sdp, DO);
+	if (error)
+		goto fail_sb;
+
+	error = init_per_node(sdp, DO);
+	if (error)
+		goto fail_inodes;
+
+	error = gfs2_statfs_init(sdp);
+	if (error) {
+		fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
+		goto fail_per_node;
+	}
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		error = gfs2_make_fs_rw(sdp);
+		if (error) {
+			fs_err(sdp, "can't make FS RW: %d\n", error);
+			goto fail_per_node;
+		}
+	}
+
+	gfs2_glock_dq_uninit(&mount_gh);
+	gfs2_online_uevent(sdp);
+	return 0;
+
+fail_per_node:
+	init_per_node(sdp, UNDO);
+fail_inodes:
+	init_inodes(sdp, UNDO);
+fail_sb:
+	if (sdp->sd_root_dir)
+		dput(sdp->sd_root_dir);
+	if (sdp->sd_master_dir)
+		dput(sdp->sd_master_dir);
+	if (sb->s_root)
+		dput(sb->s_root);
+	sb->s_root = NULL;
+fail_locking:
+	init_locking(sdp, &mount_gh, UNDO);
+fail_lm:
+	complete_all(&sdp->sd_journal_ready);
+	gfs2_gl_hash_clear(sdp);
+	gfs2_lm_unmount(sdp);
+fail_debug:
+	gfs2_delete_debugfs_file(sdp);
+	free_percpu(sdp->sd_lkstats);
+	/* gfs2_sys_fs_del must be the last thing we do, since it causes
+	 * sysfs to call function gfs2_sbd_release, which frees sdp. */
+	gfs2_sys_fs_del(sdp);
+	sb->s_fs_info = NULL;
+	return error;
+}
+
+static int set_gfs2_super(struct super_block *s, void *data)
+{
+	s->s_bdev = data;
+	s->s_dev = s->s_bdev->bd_dev;
+
+	/*
+	 * We set the bdi here to the queue backing, file systems can
+	 * overwrite this in ->fill_super()
+	 */
+	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+	return 0;
+}
+
+static int test_gfs2_super(struct super_block *s, void *ptr)
+{
+	struct block_device *bdev = ptr;
+	return (bdev == s->s_bdev);
+}
+
+/**
+ * gfs2_mount - Get the GFS2 superblock
+ * @fs_type: The GFS2 filesystem type
+ * @flags: Mount flags
+ * @dev_name: The name of the device
+ * @data: The mount arguments
+ *
+ * Q. Why not use get_sb_bdev() ?
+ * A. We need to select one of two root directories to mount, independent
+ *    of whether this is the initial, or subsequent, mount of this sb
+ *
+ * Returns: 0 or -ve on error
+ */
+
+static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
+{
+	struct block_device *bdev;
+	struct super_block *s;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
+	int error;
+	struct gfs2_args args;
+	struct gfs2_sbd *sdp;
+
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
+	if (IS_ERR(bdev))
+		return ERR_CAST(bdev);
+
+	/*
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
+	 */
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		error = -EBUSY;
+		goto error_bdev;
+	}
+	s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	error = PTR_ERR(s);
+	if (IS_ERR(s))
+		goto error_bdev;
+
+	if (s->s_root) {
+		/*
+		 * s_umount nests inside bd_mutex during
+		 * __invalidate_device().  blkdev_put() acquires
+		 * bd_mutex and can't be called under s_umount.  Drop
+		 * s_umount temporarily.  This is safe as we're
+		 * holding an active reference.
+		 */
+		up_write(&s->s_umount);
+		blkdev_put(bdev, mode);
+		down_write(&s->s_umount);
+	}
+
+	memset(&args, 0, sizeof(args));
+	args.ar_quota = GFS2_QUOTA_DEFAULT;
+	args.ar_data = GFS2_DATA_DEFAULT;
+	args.ar_commit = 30;
+	args.ar_statfs_quantum = 30;
+	args.ar_quota_quantum = 60;
+	args.ar_errors = GFS2_ERRORS_DEFAULT;
+
+	error = gfs2_mount_args(&args, data);
+	if (error) {
+		pr_warn("can't parse mount arguments\n");
+		goto error_super;
+	}
+
+	if (s->s_root) {
+		error = -EBUSY;
+		if ((flags ^ s->s_flags) & MS_RDONLY)
+			goto error_super;
+	} else {
+		char b[BDEVNAME_SIZE];
+
+		s->s_mode = mode;
+		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		sb_set_blocksize(s, block_size(bdev));
+		error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
+		if (error)
+			goto error_super;
+		s->s_flags |= MS_ACTIVE;
+		bdev->bd_super = s;
+	}
+
+	sdp = s->s_fs_info;
+	if (args.ar_meta)
+		return dget(sdp->sd_master_dir);
+	else
+		return dget(sdp->sd_root_dir);
+
+error_super:
+	deactivate_locked_super(s);
+	return ERR_PTR(error);
+error_bdev:
+	blkdev_put(bdev, mode);
+	return ERR_PTR(error);
+}
+
+static int set_meta_super(struct super_block *s, void *ptr)
+{
+	return -EINVAL;
+}
+
+static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
+{
+	struct super_block *s;
+	struct gfs2_sbd *sdp;
+	struct path path;
+	int error;
+
+	error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
+	if (error) {
+		pr_warn("path_lookup on %s returned error %d\n",
+			dev_name, error);
+		return ERR_PTR(error);
+	}
+	s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
+		 d_inode(path.dentry)->i_sb->s_bdev);
+	path_put(&path);
+	if (IS_ERR(s)) {
+		pr_warn("gfs2 mount does not exist\n");
+		return ERR_CAST(s);
+	}
+	if ((flags ^ s->s_flags) & MS_RDONLY) {
+		deactivate_locked_super(s);
+		return ERR_PTR(-EBUSY);
+	}
+	sdp = s->s_fs_info;
+	return dget(sdp->sd_master_dir);
+}
+
+static void gfs2_kill_sb(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	if (sdp == NULL) {
+		kill_block_super(sb);
+		return;
+	}
+
+	gfs2_log_flush(sdp, NULL, SYNC_FLUSH);
+	dput(sdp->sd_root_dir);
+	dput(sdp->sd_master_dir);
+	sdp->sd_root_dir = NULL;
+	sdp->sd_master_dir = NULL;
+	shrink_dcache_sb(sb);
+	gfs2_delete_debugfs_file(sdp);
+	free_percpu(sdp->sd_lkstats);
+	kill_block_super(sb);
+}
+
+struct file_system_type gfs2_fs_type = {
+	.name = "gfs2",
+	.fs_flags = FS_REQUIRES_DEV,
+	.mount = gfs2_mount,
+	.kill_sb = gfs2_kill_sb,
+	.owner = THIS_MODULE,
+};
+MODULE_ALIAS_FS("gfs2");
+
+struct file_system_type gfs2meta_fs_type = {
+	.name = "gfs2meta",
+	.fs_flags = FS_REQUIRES_DEV,
+	.mount = gfs2_mount_meta,
+	.owner = THIS_MODULE,
+};
+MODULE_ALIAS_FS("gfs2meta");
diff --git a/kernel/fs/gfs2/quota.c b/kernel/fs/gfs2/quota.c
new file mode 100644
index 000000000..e3065cb9a
--- /dev/null
+++ b/kernel/fs/gfs2/quota.c
@@ -0,0 +1,1680 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+/*
+ * Quota change tags are associated with each transaction that allocates or
+ * deallocates space.  Those changes are accumulated locally to each node (in a
+ * per-node file) and then are periodically synced to the quota file.  This
+ * avoids the bottleneck of constantly touching the quota file, but introduces
+ * fuzziness in the current usage value of IDs that are being used on different
+ * nodes in the cluster simultaneously.  So, it is possible for a user on
+ * multiple nodes to overrun their quota, but that overrun is controlable.
+ * Since quota tags are part of transactions, there is no need for a quota check
+ * program to be run on node crashes or anything like that.
+ *
+ * There are couple of knobs that let the administrator manage the quota
+ * fuzziness.  "quota_quantum" sets the maximum time a quota change can be
+ * sitting on one node before being synced to the quota file.  (The default is
+ * 60 seconds.)  Another knob, "quota_scale" controls how quickly the frequency
+ * of quota file syncs increases as the user moves closer to their limit.  The
+ * more frequent the syncs, the more accurate the quota enforcement, but that
+ * means that there is more contention between the nodes for the quota file.
+ * The default value is one.  This sets the maximum theoretical quota overrun
+ * (with infinite node with infinite bandwidth) to twice the user's limit.  (In
+ * practice, the maximum overrun you see should be much less.)  A "quota_scale"
+ * number greater than one makes quota syncs more frequent and reduces the
+ * maximum overrun.  Numbers less than one (but greater than zero) make quota
+ * syncs less frequent.
+ *
+ * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
+ * the quota file, so it is not being constantly read.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/quota.h>
+#include <linux/dqblk_xfs.h>
+#include <linux/lockref.h>
+#include <linux/list_lru.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/bit_spinlock.h>
+#include <linux/jhash.h>
+#include <linux/vmalloc.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "inode.h"
+#include "util.h"
+
+#define GFS2_QD_HASH_SHIFT      12
+#define GFS2_QD_HASH_SIZE       (1 << GFS2_QD_HASH_SHIFT)
+#define GFS2_QD_HASH_MASK       (GFS2_QD_HASH_SIZE - 1)
+
+/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
+/*                     -> sd_bitmap_lock                              */
+static DEFINE_SPINLOCK(qd_lock);
+struct list_lru gfs2_qd_lru;
+
+static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
+
+static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
+				 const struct kqid qid)
+{
+	unsigned int h;
+
+	h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
+	h = jhash(&qid, sizeof(struct kqid), h);
+
+	return h & GFS2_QD_HASH_MASK;
+}
+
+static inline void spin_lock_bucket(unsigned int hash)
+{
+        hlist_bl_lock(&qd_hash_table[hash]);
+}
+
+static inline void spin_unlock_bucket(unsigned int hash)
+{
+        hlist_bl_unlock(&qd_hash_table[hash]);
+}
+
+static void gfs2_qd_dealloc(struct rcu_head *rcu)
+{
+	struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
+	kmem_cache_free(gfs2_quotad_cachep, qd);
+}
+
+static void gfs2_qd_dispose(struct list_head *list)
+{
+	struct gfs2_quota_data *qd;
+	struct gfs2_sbd *sdp;
+
+	while (!list_empty(list)) {
+		qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
+		sdp = qd->qd_gl->gl_sbd;
+
+		list_del(&qd->qd_lru);
+
+		/* Free from the filesystem-specific list */
+		spin_lock(&qd_lock);
+		list_del(&qd->qd_list);
+		spin_unlock(&qd_lock);
+
+		spin_lock_bucket(qd->qd_hash);
+		hlist_bl_del_rcu(&qd->qd_hlist);
+		spin_unlock_bucket(qd->qd_hash);
+
+		gfs2_assert_warn(sdp, !qd->qd_change);
+		gfs2_assert_warn(sdp, !qd->qd_slot_count);
+		gfs2_assert_warn(sdp, !qd->qd_bh_count);
+
+		gfs2_glock_put(qd->qd_gl);
+		atomic_dec(&sdp->sd_quota_count);
+
+		/* Delete it from the common reclaim list */
+		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
+	}
+}
+
+
+static enum lru_status gfs2_qd_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *dispose = arg;
+	struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
+
+	if (!spin_trylock(&qd->qd_lockref.lock))
+		return LRU_SKIP;
+
+	if (qd->qd_lockref.count == 0) {
+		lockref_mark_dead(&qd->qd_lockref);
+		list_lru_isolate_move(lru, &qd->qd_lru, dispose);
+	}
+
+	spin_unlock(&qd->qd_lockref.lock);
+	return LRU_REMOVED;
+}
+
+static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	LIST_HEAD(dispose);
+	unsigned long freed;
+
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+
+	freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
+				     gfs2_qd_isolate, &dispose);
+
+	gfs2_qd_dispose(&dispose);
+
+	return freed;
+}
+
+static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
+}
+
+struct shrinker gfs2_qd_shrinker = {
+	.count_objects = gfs2_qd_shrink_count,
+	.scan_objects = gfs2_qd_shrink_scan,
+	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
+};
+
+
+static u64 qd2index(struct gfs2_quota_data *qd)
+{
+	struct kqid qid = qd->qd_id;
+	return (2 * (u64)from_kqid(&init_user_ns, qid)) +
+		((qid.type == USRQUOTA) ? 0 : 1);
+}
+
+static u64 qd2offset(struct gfs2_quota_data *qd)
+{
+	u64 offset;
+
+	offset = qd2index(qd);
+	offset *= sizeof(struct gfs2_quota);
+
+	return offset;
+}
+
+static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
+{
+	struct gfs2_quota_data *qd;
+	int error;
+
+	qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
+	if (!qd)
+		return NULL;
+
+	qd->qd_sbd = sdp;
+	qd->qd_lockref.count = 1;
+	spin_lock_init(&qd->qd_lockref.lock);
+	qd->qd_id = qid;
+	qd->qd_slot = -1;
+	INIT_LIST_HEAD(&qd->qd_lru);
+	qd->qd_hash = hash;
+
+	error = gfs2_glock_get(sdp, qd2index(qd),
+			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
+	if (error)
+		goto fail;
+
+	return qd;
+
+fail:
+	kmem_cache_free(gfs2_quotad_cachep, qd);
+	return NULL;
+}
+
+static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
+						     const struct gfs2_sbd *sdp,
+						     struct kqid qid)
+{
+	struct gfs2_quota_data *qd;
+	struct hlist_bl_node *h;
+
+	hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
+		if (!qid_eq(qd->qd_id, qid))
+			continue;
+		if (qd->qd_sbd != sdp)
+			continue;
+		if (lockref_get_not_dead(&qd->qd_lockref)) {
+			list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+			return qd;
+		}
+	}
+
+	return NULL;
+}
+
+
+static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
+		  struct gfs2_quota_data **qdp)
+{
+	struct gfs2_quota_data *qd, *new_qd;
+	unsigned int hash = gfs2_qd_hash(sdp, qid);
+
+	rcu_read_lock();
+	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
+	rcu_read_unlock();
+
+	if (qd)
+		return 0;
+
+	new_qd = qd_alloc(hash, sdp, qid);
+	if (!new_qd)
+		return -ENOMEM;
+
+	spin_lock(&qd_lock);
+	spin_lock_bucket(hash);
+	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
+	if (qd == NULL) {
+		*qdp = new_qd;
+		list_add(&new_qd->qd_list, &sdp->sd_quota_list);
+		hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
+		atomic_inc(&sdp->sd_quota_count);
+	}
+	spin_unlock_bucket(hash);
+	spin_unlock(&qd_lock);
+
+	if (qd) {
+		gfs2_glock_put(new_qd->qd_gl);
+		kmem_cache_free(gfs2_quotad_cachep, new_qd);
+	}
+
+	return 0;
+}
+
+
+static void qd_hold(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
+	lockref_get(&qd->qd_lockref);
+}
+
+static void qd_put(struct gfs2_quota_data *qd)
+{
+	if (lockref_put_or_lock(&qd->qd_lockref))
+		return;
+
+	qd->qd_lockref.count = 0;
+	list_lru_add(&gfs2_qd_lru, &qd->qd_lru);
+	spin_unlock(&qd->qd_lockref.lock);
+
+}
+
+static int slot_get(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_sbd;
+	unsigned int bit;
+	int error = 0;
+
+	spin_lock(&sdp->sd_bitmap_lock);
+	if (qd->qd_slot_count != 0)
+		goto out;
+
+	error = -ENOSPC;
+	bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
+	if (bit < sdp->sd_quota_slots) {
+		set_bit(bit, sdp->sd_quota_bitmap);
+		qd->qd_slot = bit;
+		error = 0;
+out:
+		qd->qd_slot_count++;
+	}
+	spin_unlock(&sdp->sd_bitmap_lock);
+
+	return error;
+}
+
+static void slot_hold(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_sbd;
+
+	spin_lock(&sdp->sd_bitmap_lock);
+	gfs2_assert(sdp, qd->qd_slot_count);
+	qd->qd_slot_count++;
+	spin_unlock(&sdp->sd_bitmap_lock);
+}
+
+static void slot_put(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_sbd;
+
+	spin_lock(&sdp->sd_bitmap_lock);
+	gfs2_assert(sdp, qd->qd_slot_count);
+	if (!--qd->qd_slot_count) {
+		BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
+		qd->qd_slot = -1;
+	}
+	spin_unlock(&sdp->sd_bitmap_lock);
+}
+
+static int bh_get(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+	unsigned int block, offset;
+	struct buffer_head *bh;
+	int error;
+	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+	mutex_lock(&sdp->sd_quota_mutex);
+
+	if (qd->qd_bh_count++) {
+		mutex_unlock(&sdp->sd_quota_mutex);
+		return 0;
+	}
+
+	block = qd->qd_slot / sdp->sd_qc_per_block;
+	offset = qd->qd_slot % sdp->sd_qc_per_block;
+
+	bh_map.b_size = 1 << ip->i_inode.i_blkbits;
+	error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
+	if (error)
+		goto fail;
+	error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+	if (error)
+		goto fail;
+	error = -EIO;
+	if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
+		goto fail_brelse;
+
+	qd->qd_bh = bh;
+	qd->qd_bh_qc = (struct gfs2_quota_change *)
+		(bh->b_data + sizeof(struct gfs2_meta_header) +
+		 offset * sizeof(struct gfs2_quota_change));
+
+	mutex_unlock(&sdp->sd_quota_mutex);
+
+	return 0;
+
+fail_brelse:
+	brelse(bh);
+fail:
+	qd->qd_bh_count--;
+	mutex_unlock(&sdp->sd_quota_mutex);
+	return error;
+}
+
+static void bh_put(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	mutex_lock(&sdp->sd_quota_mutex);
+	gfs2_assert(sdp, qd->qd_bh_count);
+	if (!--qd->qd_bh_count) {
+		brelse(qd->qd_bh);
+		qd->qd_bh = NULL;
+		qd->qd_bh_qc = NULL;
+	}
+	mutex_unlock(&sdp->sd_quota_mutex);
+}
+
+static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
+			 u64 *sync_gen)
+{
+	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+	    !test_bit(QDF_CHANGE, &qd->qd_flags) ||
+	    (sync_gen && (qd->qd_sync_gen >= *sync_gen)))
+		return 0;
+
+	if (!lockref_get_not_dead(&qd->qd_lockref))
+		return 0;
+
+	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+	set_bit(QDF_LOCKED, &qd->qd_flags);
+	qd->qd_change_sync = qd->qd_change;
+	slot_hold(qd);
+	return 1;
+}
+
+static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
+{
+	struct gfs2_quota_data *qd = NULL;
+	int error;
+	int found = 0;
+
+	*qdp = NULL;
+
+	if (sdp->sd_vfs->s_flags & MS_RDONLY)
+		return 0;
+
+	spin_lock(&qd_lock);
+
+	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+		found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen);
+		if (found)
+			break;
+	}
+
+	if (!found)
+		qd = NULL;
+
+	spin_unlock(&qd_lock);
+
+	if (qd) {
+		gfs2_assert_warn(sdp, qd->qd_change_sync);
+		error = bh_get(qd);
+		if (error) {
+			clear_bit(QDF_LOCKED, &qd->qd_flags);
+			slot_put(qd);
+			qd_put(qd);
+			return error;
+		}
+	}
+
+	*qdp = qd;
+
+	return 0;
+}
+
+static void qd_unlock(struct gfs2_quota_data *qd)
+{
+	gfs2_assert_warn(qd->qd_gl->gl_sbd,
+			 test_bit(QDF_LOCKED, &qd->qd_flags));
+	clear_bit(QDF_LOCKED, &qd->qd_flags);
+	bh_put(qd);
+	slot_put(qd);
+	qd_put(qd);
+}
+
+static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid,
+		    struct gfs2_quota_data **qdp)
+{
+	int error;
+
+	error = qd_get(sdp, qid, qdp);
+	if (error)
+		return error;
+
+	error = slot_get(*qdp);
+	if (error)
+		goto fail;
+
+	error = bh_get(*qdp);
+	if (error)
+		goto fail_slot;
+
+	return 0;
+
+fail_slot:
+	slot_put(*qdp);
+fail:
+	qd_put(*qdp);
+	return error;
+}
+
+static void qdsb_put(struct gfs2_quota_data *qd)
+{
+	bh_put(qd);
+	slot_put(qd);
+	qd_put(qd);
+}
+
+int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_quota_data **qd;
+	int error;
+
+	if (ip->i_res == NULL) {
+		error = gfs2_rs_alloc(ip);
+		if (error)
+			return error;
+	}
+
+	qd = ip->i_res->rs_qa_qd;
+
+	if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) ||
+	    gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
+		return -EIO;
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return 0;
+
+	error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
+	if (error)
+		goto out;
+	ip->i_res->rs_qa_qd_num++;
+	qd++;
+
+	error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
+	if (error)
+		goto out;
+	ip->i_res->rs_qa_qd_num++;
+	qd++;
+
+	if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) &&
+	    !uid_eq(uid, ip->i_inode.i_uid)) {
+		error = qdsb_get(sdp, make_kqid_uid(uid), qd);
+		if (error)
+			goto out;
+		ip->i_res->rs_qa_qd_num++;
+		qd++;
+	}
+
+	if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) &&
+	    !gid_eq(gid, ip->i_inode.i_gid)) {
+		error = qdsb_get(sdp, make_kqid_gid(gid), qd);
+		if (error)
+			goto out;
+		ip->i_res->rs_qa_qd_num++;
+		qd++;
+	}
+
+out:
+	if (error)
+		gfs2_quota_unhold(ip);
+	return error;
+}
+
+void gfs2_quota_unhold(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int x;
+
+	if (ip->i_res == NULL)
+		return;
+	gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
+
+	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+		qdsb_put(ip->i_res->rs_qa_qd[x]);
+		ip->i_res->rs_qa_qd[x] = NULL;
+	}
+	ip->i_res->rs_qa_qd_num = 0;
+}
+
+static int sort_qd(const void *a, const void *b)
+{
+	const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
+	const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
+
+	if (qid_lt(qd_a->qd_id, qd_b->qd_id))
+		return -1;
+	if (qid_lt(qd_b->qd_id, qd_a->qd_id))
+		return 1;
+	return 0;
+}
+
+static void do_qc(struct gfs2_quota_data *qd, s64 change)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+	struct gfs2_quota_change *qc = qd->qd_bh_qc;
+	s64 x;
+
+	mutex_lock(&sdp->sd_quota_mutex);
+	gfs2_trans_add_meta(ip->i_gl, qd->qd_bh);
+
+	if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
+		qc->qc_change = 0;
+		qc->qc_flags = 0;
+		if (qd->qd_id.type == USRQUOTA)
+			qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
+		qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id));
+	}
+
+	x = be64_to_cpu(qc->qc_change) + change;
+	qc->qc_change = cpu_to_be64(x);
+
+	spin_lock(&qd_lock);
+	qd->qd_change = x;
+	spin_unlock(&qd_lock);
+
+	if (!x) {
+		gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
+		clear_bit(QDF_CHANGE, &qd->qd_flags);
+		qc->qc_flags = 0;
+		qc->qc_id = 0;
+		slot_put(qd);
+		qd_put(qd);
+	} else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
+		qd_hold(qd);
+		slot_hold(qd);
+	}
+
+	mutex_unlock(&sdp->sd_quota_mutex);
+}
+
+/**
+ * gfs2_adjust_quota - adjust record of current block usage
+ * @ip: The quota inode
+ * @loc: Offset of the entry in the quota file
+ * @change: The amount of usage change to record
+ * @qd: The quota data
+ * @fdq: The updated limits to record
+ *
+ * This function was mostly borrowed from gfs2_block_truncate_page which was
+ * in turn mostly borrowed from ext3
+ *
+ * Returns: 0 or -ve on error
+ */
+
+static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
+			     s64 change, struct gfs2_quota_data *qd,
+			     struct qc_dqblk *fdq)
+{
+	struct inode *inode = &ip->i_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long index = loc >> PAGE_CACHE_SHIFT;
+	unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
+	unsigned blocksize, iblock, pos;
+	struct buffer_head *bh;
+	struct page *page;
+	void *kaddr, *ptr;
+	struct gfs2_quota q;
+	int err, nbytes;
+	u64 size;
+
+	if (gfs2_is_stuffed(ip)) {
+		err = gfs2_unstuff_dinode(ip, NULL);
+		if (err)
+			return err;
+	}
+
+	memset(&q, 0, sizeof(struct gfs2_quota));
+	err = gfs2_internal_read(ip, (char *)&q, &loc, sizeof(q));
+	if (err < 0)
+		return err;
+
+	err = -EIO;
+	be64_add_cpu(&q.qu_value, change);
+	qd->qd_qb.qb_value = q.qu_value;
+	if (fdq) {
+		if (fdq->d_fieldmask & QC_SPC_SOFT) {
+			q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift);
+			qd->qd_qb.qb_warn = q.qu_warn;
+		}
+		if (fdq->d_fieldmask & QC_SPC_HARD) {
+			q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift);
+			qd->qd_qb.qb_limit = q.qu_limit;
+		}
+		if (fdq->d_fieldmask & QC_SPACE) {
+			q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift);
+			qd->qd_qb.qb_value = q.qu_value;
+		}
+	}
+
+	/* Write the quota into the quota file on disk */
+	ptr = &q;
+	nbytes = sizeof(struct gfs2_quota);
+get_a_page:
+	page = find_or_create_page(mapping, index, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	blocksize = inode->i_sb->s_blocksize;
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	if (!buffer_mapped(bh)) {
+		gfs2_block_map(inode, iblock, bh, 1);
+		if (!buffer_mapped(bh))
+			goto unlock_out;
+		/* If it's a newly allocated disk block for quota, zero it */
+		if (buffer_new(bh))
+			zero_user(page, pos - blocksize, bh->b_size);
+	}
+
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh)) {
+		ll_rw_block(READ | REQ_META, 1, &bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			goto unlock_out;
+	}
+
+	gfs2_trans_add_data(ip->i_gl, bh);
+
+	kaddr = kmap_atomic(page);
+	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
+		nbytes = PAGE_CACHE_SIZE - offset;
+	memcpy(kaddr + offset, ptr, nbytes);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+	unlock_page(page);
+	page_cache_release(page);
+
+	/* If quota straddles page boundary, we need to update the rest of the
+	 * quota at the beginning of the next page */
+	if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
+		ptr = ptr + nbytes;
+		nbytes = sizeof(struct gfs2_quota) - nbytes;
+		offset = 0;
+		index++;
+		goto get_a_page;
+	}
+
+	size = loc + sizeof(struct gfs2_quota);
+	if (size > inode->i_size)
+		i_size_write(inode, size);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	set_bit(QDF_REFRESH, &qd->qd_flags);
+	return 0;
+
+unlock_out:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
+{
+	struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
+	unsigned int data_blocks, ind_blocks;
+	struct gfs2_holder *ghs, i_gh;
+	unsigned int qx, x;
+	struct gfs2_quota_data *qd;
+	unsigned reserved;
+	loff_t offset;
+	unsigned int nalloc = 0, blocks;
+	int error;
+
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		return error;
+
+	gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+			      &data_blocks, &ind_blocks);
+
+	ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
+	if (!ghs)
+		return -ENOMEM;
+
+	sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+	mutex_lock(&ip->i_inode.i_mutex);
+	for (qx = 0; qx < num_qd; qx++) {
+		error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, &ghs[qx]);
+		if (error)
+			goto out;
+	}
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		goto out;
+
+	for (x = 0; x < num_qd; x++) {
+		offset = qd2offset(qda[x]);
+		if (gfs2_write_alloc_required(ip, offset,
+					      sizeof(struct gfs2_quota)))
+			nalloc++;
+	}
+
+	/* 
+	 * 1 blk for unstuffing inode if stuffed. We add this extra
+	 * block to the reservation unconditionally. If the inode
+	 * doesn't need unstuffing, the block will be released to the 
+	 * rgrp since it won't be allocated during the transaction
+	 */
+	/* +3 in the end for unstuffing block, inode size update block
+	 * and another block in case quota straddles page boundary and 
+	 * two blocks need to be updated instead of 1 */
+	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
+
+	reserved = 1 + (nalloc * (data_blocks + ind_blocks));
+	ap.target = reserved;
+	error = gfs2_inplace_reserve(ip, &ap);
+	if (error)
+		goto out_alloc;
+
+	if (nalloc)
+		blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS;
+
+	error = gfs2_trans_begin(sdp, blocks, 0);
+	if (error)
+		goto out_ipres;
+
+	for (x = 0; x < num_qd; x++) {
+		qd = qda[x];
+		offset = qd2offset(qd);
+		error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
+		if (error)
+			goto out_end_trans;
+
+		do_qc(qd, -qd->qd_change_sync);
+		set_bit(QDF_REFRESH, &qd->qd_flags);
+	}
+
+	error = 0;
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipres:
+	gfs2_inplace_release(ip);
+out_alloc:
+	gfs2_glock_dq_uninit(&i_gh);
+out:
+	while (qx--)
+		gfs2_glock_dq_uninit(&ghs[qx]);
+	mutex_unlock(&ip->i_inode.i_mutex);
+	kfree(ghs);
+	gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH);
+	return error;
+}
+
+static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_quota q;
+	struct gfs2_quota_lvb *qlvb;
+	loff_t pos;
+	int error;
+
+	memset(&q, 0, sizeof(struct gfs2_quota));
+	pos = qd2offset(qd);
+	error = gfs2_internal_read(ip, (char *)&q, &pos, sizeof(q));
+	if (error < 0)
+		return error;
+
+	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
+	qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+	qlvb->__pad = 0;
+	qlvb->qb_limit = q.qu_limit;
+	qlvb->qb_warn = q.qu_warn;
+	qlvb->qb_value = q.qu_value;
+	qd->qd_qb = *qlvb;
+
+	return 0;
+}
+
+static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
+		    struct gfs2_holder *q_gh)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_holder i_gh;
+	int error;
+
+restart:
+	error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
+	if (error)
+		return error;
+
+	if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
+		force_refresh = FORCE;
+
+	qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
+
+	if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
+		gfs2_glock_dq_uninit(q_gh);
+		error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, q_gh);
+		if (error)
+			return error;
+
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+		if (error)
+			goto fail;
+
+		error = update_qd(sdp, qd);
+		if (error)
+			goto fail_gunlock;
+
+		gfs2_glock_dq_uninit(&i_gh);
+		gfs2_glock_dq_uninit(q_gh);
+		force_refresh = 0;
+		goto restart;
+	}
+
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq_uninit(&i_gh);
+fail:
+	gfs2_glock_dq_uninit(q_gh);
+	return error;
+}
+
+int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_quota_data *qd;
+	unsigned int x;
+	int error = 0;
+
+	error = gfs2_quota_hold(ip, uid, gid);
+	if (error)
+		return error;
+
+	if (capable(CAP_SYS_RESOURCE) ||
+	    sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+		return 0;
+
+	sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num,
+	     sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+
+	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+		qd = ip->i_res->rs_qa_qd[x];
+		error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
+		if (error)
+			break;
+	}
+
+	if (!error)
+		set_bit(GIF_QD_LOCKED, &ip->i_flags);
+	else {
+		while (x--)
+			gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+		gfs2_quota_unhold(ip);
+	}
+
+	return error;
+}
+
+static int need_sync(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	s64 value;
+	unsigned int num, den;
+	int do_sync = 1;
+
+	if (!qd->qd_qb.qb_limit)
+		return 0;
+
+	spin_lock(&qd_lock);
+	value = qd->qd_change;
+	spin_unlock(&qd_lock);
+
+	spin_lock(&gt->gt_spin);
+	num = gt->gt_quota_scale_num;
+	den = gt->gt_quota_scale_den;
+	spin_unlock(&gt->gt_spin);
+
+	if (value < 0)
+		do_sync = 0;
+	else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
+		 (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+		do_sync = 0;
+	else {
+		value *= gfs2_jindex_size(sdp) * num;
+		value = div_s64(value, den);
+		value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+			do_sync = 0;
+	}
+
+	return do_sync;
+}
+
+void gfs2_quota_unlock(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_quota_data *qda[4];
+	unsigned int count = 0;
+	unsigned int x;
+	int found;
+
+	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
+		goto out;
+
+	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+		struct gfs2_quota_data *qd;
+		int sync;
+
+		qd = ip->i_res->rs_qa_qd[x];
+		sync = need_sync(qd);
+
+		gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+		if (!sync)
+			continue;
+
+		spin_lock(&qd_lock);
+		found = qd_check_sync(sdp, qd, NULL);
+		spin_unlock(&qd_lock);
+
+		if (!found)
+			continue;
+
+		gfs2_assert_warn(sdp, qd->qd_change_sync);
+		if (bh_get(qd)) {
+			clear_bit(QDF_LOCKED, &qd->qd_flags);
+			slot_put(qd);
+			qd_put(qd);
+			continue;
+		}
+
+		qda[count++] = qd;
+	}
+
+	if (count) {
+		do_sync(count, qda);
+		for (x = 0; x < count; x++)
+			qd_unlock(qda[x]);
+	}
+
+out:
+	gfs2_quota_unhold(ip);
+}
+
+#define MAX_LINE 256
+
+static int print_message(struct gfs2_quota_data *qd, char *type)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	fs_info(sdp, "quota %s for %s %u\n",
+		type,
+		(qd->qd_id.type == USRQUOTA) ? "user" : "group",
+		from_kqid(&init_user_ns, qd->qd_id));
+
+	return 0;
+}
+
+/**
+ * gfs2_quota_check - check if allocating new blocks will exceed quota
+ * @ip:  The inode for which this check is being performed
+ * @uid: The uid to check against
+ * @gid: The gid to check against
+ * @ap:  The allocation parameters. ap->target contains the requested
+ *       blocks. ap->min_target, if set, contains the minimum blks
+ *       requested.
+ *
+ * Returns: 0 on success.
+ *                  min_req = ap->min_target ? ap->min_target : ap->target;
+ *                  quota must allow atleast min_req blks for success and
+ *                  ap->allowed is set to the number of blocks allowed
+ *
+ *          -EDQUOT otherwise, quota violation. ap->allowed is set to number
+ *                  of blocks available.
+ */
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+		     struct gfs2_alloc_parms *ap)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_quota_data *qd;
+	s64 value, warn, limit;
+	unsigned int x;
+	int error = 0;
+
+	ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
+	if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
+		return 0;
+
+        if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+                return 0;
+
+	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+		qd = ip->i_res->rs_qa_qd[x];
+
+		if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
+		      qid_eq(qd->qd_id, make_kqid_gid(gid))))
+			continue;
+
+		warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn);
+		limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit);
+		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		spin_lock(&qd_lock);
+		value += qd->qd_change;
+		spin_unlock(&qd_lock);
+
+		if (limit > 0 && (limit - value) < ap->allowed)
+			ap->allowed = limit - value;
+		/* If we can't meet the target */
+		if (limit && limit < (value + (s64)ap->target)) {
+			/* If no min_target specified or we don't meet
+			 * min_target, return -EDQUOT */
+			if (!ap->min_target || ap->min_target > ap->allowed) {
+				print_message(qd, "exceeded");
+				quota_send_warning(qd->qd_id,
+						   sdp->sd_vfs->s_dev,
+						   QUOTA_NL_BHARDWARN);
+				error = -EDQUOT;
+				break;
+			}
+		} else if (warn && warn < value &&
+			   time_after_eq(jiffies, qd->qd_last_warn +
+					 gfs2_tune_get(sdp, gt_quota_warn_period)
+					 * HZ)) {
+			quota_send_warning(qd->qd_id,
+					   sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
+			error = print_message(qd, "warning");
+			qd->qd_last_warn = jiffies;
+		}
+	}
+	return error;
+}
+
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+		       kuid_t uid, kgid_t gid)
+{
+	struct gfs2_quota_data *qd;
+	unsigned int x;
+
+	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
+		return;
+	if (ip->i_diskflags & GFS2_DIF_SYSTEM)
+		return;
+
+	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+		qd = ip->i_res->rs_qa_qd[x];
+
+		if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
+		    qid_eq(qd->qd_id, make_kqid_gid(gid))) {
+			do_qc(qd, change);
+		}
+	}
+}
+
+int gfs2_quota_sync(struct super_block *sb, int type)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_quota_data **qda;
+	unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder);
+	unsigned int num_qd;
+	unsigned int x;
+	int error = 0;
+
+	qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
+	if (!qda)
+		return -ENOMEM;
+
+	mutex_lock(&sdp->sd_quota_sync_mutex);
+	sdp->sd_quota_sync_gen++;
+
+	do {
+		num_qd = 0;
+
+		for (;;) {
+			error = qd_fish(sdp, qda + num_qd);
+			if (error || !qda[num_qd])
+				break;
+			if (++num_qd == max_qd)
+				break;
+		}
+
+		if (num_qd) {
+			if (!error)
+				error = do_sync(num_qd, qda);
+			if (!error)
+				for (x = 0; x < num_qd; x++)
+					qda[x]->qd_sync_gen =
+						sdp->sd_quota_sync_gen;
+
+			for (x = 0; x < num_qd; x++)
+				qd_unlock(qda[x]);
+		}
+	} while (!error && num_qd == max_qd);
+
+	mutex_unlock(&sdp->sd_quota_sync_mutex);
+	kfree(qda);
+
+	return error;
+}
+
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
+{
+	struct gfs2_quota_data *qd;
+	struct gfs2_holder q_gh;
+	int error;
+
+	error = qd_get(sdp, qid, &qd);
+	if (error)
+		return error;
+
+	error = do_glock(qd, FORCE, &q_gh);
+	if (!error)
+		gfs2_glock_dq_uninit(&q_gh);
+
+	qd_put(qd);
+	return error;
+}
+
+int gfs2_quota_init(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+	u64 size = i_size_read(sdp->sd_qc_inode);
+	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
+	unsigned int x, slot = 0;
+	unsigned int found = 0;
+	unsigned int hash;
+	unsigned int bm_size;
+	u64 dblock;
+	u32 extlen = 0;
+	int error;
+
+	if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
+		return -EIO;
+
+	sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
+	bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
+	bm_size *= sizeof(unsigned long);
+	error = -ENOMEM;
+	sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
+	if (sdp->sd_quota_bitmap == NULL)
+		sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS |
+						 __GFP_ZERO, PAGE_KERNEL);
+	if (!sdp->sd_quota_bitmap)
+		return error;
+
+	for (x = 0; x < blocks; x++) {
+		struct buffer_head *bh;
+		const struct gfs2_quota_change *qc;
+		unsigned int y;
+
+		if (!extlen) {
+			int new = 0;
+			error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
+			if (error)
+				goto fail;
+		}
+		error = -EIO;
+		bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+		if (!bh)
+			goto fail;
+		if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
+			brelse(bh);
+			goto fail;
+		}
+
+		qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
+		for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
+		     y++, slot++) {
+			struct gfs2_quota_data *qd;
+			s64 qc_change = be64_to_cpu(qc->qc_change);
+			u32 qc_flags = be32_to_cpu(qc->qc_flags);
+			enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
+						USRQUOTA : GRPQUOTA;
+			struct kqid qc_id = make_kqid(&init_user_ns, qtype,
+						      be32_to_cpu(qc->qc_id));
+			qc++;
+			if (!qc_change)
+				continue;
+
+			hash = gfs2_qd_hash(sdp, qc_id);
+			qd = qd_alloc(hash, sdp, qc_id);
+			if (qd == NULL) {
+				brelse(bh);
+				goto fail;
+			}
+
+			set_bit(QDF_CHANGE, &qd->qd_flags);
+			qd->qd_change = qc_change;
+			qd->qd_slot = slot;
+			qd->qd_slot_count = 1;
+
+			spin_lock(&qd_lock);
+			BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
+			list_add(&qd->qd_list, &sdp->sd_quota_list);
+			atomic_inc(&sdp->sd_quota_count);
+			spin_unlock(&qd_lock);
+
+			spin_lock_bucket(hash);
+			hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
+			spin_unlock_bucket(hash);
+
+			found++;
+		}
+
+		brelse(bh);
+		dblock++;
+		extlen--;
+	}
+
+	if (found)
+		fs_info(sdp, "found %u quota changes\n", found);
+
+	return 0;
+
+fail:
+	gfs2_quota_cleanup(sdp);
+	return error;
+}
+
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
+{
+	struct list_head *head = &sdp->sd_quota_list;
+	struct gfs2_quota_data *qd;
+
+	spin_lock(&qd_lock);
+	while (!list_empty(head)) {
+		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
+
+		list_del(&qd->qd_list);
+
+		/* Also remove if this qd exists in the reclaim list */
+		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+		atomic_dec(&sdp->sd_quota_count);
+		spin_unlock(&qd_lock);
+
+		spin_lock_bucket(qd->qd_hash);
+		hlist_bl_del_rcu(&qd->qd_hlist);
+		spin_unlock_bucket(qd->qd_hash);
+
+		gfs2_assert_warn(sdp, !qd->qd_change);
+		gfs2_assert_warn(sdp, !qd->qd_slot_count);
+		gfs2_assert_warn(sdp, !qd->qd_bh_count);
+
+		gfs2_glock_put(qd->qd_gl);
+		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
+
+		spin_lock(&qd_lock);
+	}
+	spin_unlock(&qd_lock);
+
+	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
+
+	kvfree(sdp->sd_quota_bitmap);
+	sdp->sd_quota_bitmap = NULL;
+}
+
+static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
+{
+	if (error == 0 || error == -EROFS)
+		return;
+	if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
+}
+
+static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
+			       int (*fxn)(struct super_block *sb, int type),
+			       unsigned long t, unsigned long *timeo,
+			       unsigned int *new_timeo)
+{
+	if (t >= *timeo) {
+		int error = fxn(sdp->sd_vfs, 0);
+		quotad_error(sdp, msg, error);
+		*timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
+	} else {
+		*timeo -= t;
+	}
+}
+
+static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip;
+
+	while(1) {
+		ip = NULL;
+		spin_lock(&sdp->sd_trunc_lock);
+		if (!list_empty(&sdp->sd_trunc_list)) {
+			ip = list_entry(sdp->sd_trunc_list.next,
+					struct gfs2_inode, i_trunc_list);
+			list_del_init(&ip->i_trunc_list);
+		}
+		spin_unlock(&sdp->sd_trunc_lock);
+		if (ip == NULL)
+			return;
+		gfs2_glock_finish_truncate(ip);
+	}
+}
+
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
+	if (!sdp->sd_statfs_force_sync) {
+		sdp->sd_statfs_force_sync = 1;
+		wake_up(&sdp->sd_quota_wait);
+	}
+}
+
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	struct gfs2_tune *tune = &sdp->sd_tune;
+	unsigned long statfs_timeo = 0;
+	unsigned long quotad_timeo = 0;
+	unsigned long t = 0;
+	DEFINE_WAIT(wait);
+	int empty;
+
+	while (!kthread_should_stop()) {
+
+		/* Update the master statfs file */
+		if (sdp->sd_statfs_force_sync) {
+			int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+			quotad_error(sdp, "statfs", error);
+			statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+		}
+		else
+			quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+				   	   &statfs_timeo,
+					   &tune->gt_statfs_quantum);
+
+		/* Update quota file */
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+				   &quotad_timeo, &tune->gt_quota_quantum);
+
+		/* Check for & recover partially truncated inodes */
+		quotad_check_trunc_list(sdp);
+
+		try_to_freeze();
+
+		t = min(quotad_timeo, statfs_timeo);
+
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
+		spin_lock(&sdp->sd_trunc_lock);
+		empty = list_empty(&sdp->sd_trunc_list);
+		spin_unlock(&sdp->sd_trunc_lock);
+		if (empty && !sdp->sd_statfs_force_sync)
+			t -= schedule_timeout(t);
+		else
+			t = 0;
+		finish_wait(&sdp->sd_quota_wait, &wait);
+	}
+
+	return 0;
+}
+
+static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	memset(state, 0, sizeof(*state));
+
+	switch (sdp->sd_args.ar_quota) {
+	case GFS2_QUOTA_ON:
+		state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
+		state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
+		/*FALLTHRU*/
+	case GFS2_QUOTA_ACCOUNT:
+		state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED |
+						  QCI_SYSFILE;
+		state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED |
+						  QCI_SYSFILE;
+		break;
+	case GFS2_QUOTA_OFF:
+		break;
+	}
+	if (sdp->sd_quota_inode) {
+		state->s_state[USRQUOTA].ino =
+					GFS2_I(sdp->sd_quota_inode)->i_no_addr;
+		state->s_state[USRQUOTA].blocks = sdp->sd_quota_inode->i_blocks;
+	}
+	state->s_state[USRQUOTA].nextents = 1;	/* unsupported */
+	state->s_state[GRPQUOTA] = state->s_state[USRQUOTA];
+	state->s_incoredqs = list_lru_count(&gfs2_qd_lru);
+	return 0;
+}
+
+static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
+			  struct qc_dqblk *fdq)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_quota_lvb *qlvb;
+	struct gfs2_quota_data *qd;
+	struct gfs2_holder q_gh;
+	int error;
+
+	memset(fdq, 0, sizeof(*fdq));
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return -ESRCH; /* Crazy XFS error code */
+
+	if ((qid.type != USRQUOTA) &&
+	    (qid.type != GRPQUOTA))
+		return -EINVAL;
+
+	error = qd_get(sdp, qid, &qd);
+	if (error)
+		return error;
+	error = do_glock(qd, FORCE, &q_gh);
+	if (error)
+		goto out;
+
+	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
+	fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift;
+	fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift;
+	fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift;
+
+	gfs2_glock_dq_uninit(&q_gh);
+out:
+	qd_put(qd);
+	return error;
+}
+
+/* GFS2 only supports a subset of the XFS fields */
+#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE)
+
+static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
+			  struct qc_dqblk *fdq)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_quota_data *qd;
+	struct gfs2_holder q_gh, i_gh;
+	unsigned int data_blocks, ind_blocks;
+	unsigned int blocks = 0;
+	int alloc_required;
+	loff_t offset;
+	int error;
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return -ESRCH; /* Crazy XFS error code */
+
+	if ((qid.type != USRQUOTA) &&
+	    (qid.type != GRPQUOTA))
+		return -EINVAL;
+
+	if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
+		return -EINVAL;
+
+	error = qd_get(sdp, qid, &qd);
+	if (error)
+		return error;
+
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		goto out_put;
+
+	mutex_lock(&ip->i_inode.i_mutex);
+	error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
+	if (error)
+		goto out_unlockput;
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		goto out_q;
+
+	/* Check for existing entry, if none then alloc new blocks */
+	error = update_qd(sdp, qd);
+	if (error)
+		goto out_i;
+
+	/* If nothing has changed, this is a no-op */
+	if ((fdq->d_fieldmask & QC_SPC_SOFT) &&
+	    ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
+		fdq->d_fieldmask ^= QC_SPC_SOFT;
+
+	if ((fdq->d_fieldmask & QC_SPC_HARD) &&
+	    ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
+		fdq->d_fieldmask ^= QC_SPC_HARD;
+
+	if ((fdq->d_fieldmask & QC_SPACE) &&
+	    ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+		fdq->d_fieldmask ^= QC_SPACE;
+
+	if (fdq->d_fieldmask == 0)
+		goto out_i;
+
+	offset = qd2offset(qd);
+	alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
+	if (gfs2_is_stuffed(ip))
+		alloc_required = 1;
+	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .aflags = 0, };
+		gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+				       &data_blocks, &ind_blocks);
+		blocks = 1 + data_blocks + ind_blocks;
+		ap.target = blocks;
+		error = gfs2_inplace_reserve(ip, &ap);
+		if (error)
+			goto out_i;
+		blocks += gfs2_rg_blocks(ip, blocks);
+	}
+
+	/* Some quotas span block boundaries and can update two blocks,
+	   adding an extra block to the transaction to handle such quotas */
+	error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0);
+	if (error)
+		goto out_release;
+
+	/* Apply changes */
+	error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+
+	gfs2_trans_end(sdp);
+out_release:
+	if (alloc_required)
+		gfs2_inplace_release(ip);
+out_i:
+	gfs2_glock_dq_uninit(&i_gh);
+out_q:
+	gfs2_glock_dq_uninit(&q_gh);
+out_unlockput:
+	mutex_unlock(&ip->i_inode.i_mutex);
+out_put:
+	qd_put(qd);
+	return error;
+}
+
+const struct quotactl_ops gfs2_quotactl_ops = {
+	.quota_sync     = gfs2_quota_sync,
+	.get_state	= gfs2_quota_get_state,
+	.get_dqblk	= gfs2_get_dqblk,
+	.set_dqblk	= gfs2_set_dqblk,
+};
+
+void __init gfs2_quota_hash_init(void)
+{
+	unsigned i;
+
+	for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
+		INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
+}
diff --git a/kernel/fs/gfs2/quota.h b/kernel/fs/gfs2/quota.h
new file mode 100644
index 000000000..ad04b3aca
--- /dev/null
+++ b/kernel/fs/gfs2/quota.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __QUOTA_DOT_H__
+#define __QUOTA_DOT_H__
+
+#include <linux/list_lru.h>
+
+struct gfs2_inode;
+struct gfs2_sbd;
+
+#define NO_UID_QUOTA_CHANGE INVALID_UID
+#define NO_GID_QUOTA_CHANGE INVALID_GID
+
+extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+extern void gfs2_quota_unhold(struct gfs2_inode *ip);
+
+extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+extern void gfs2_quota_unlock(struct gfs2_inode *ip);
+
+extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+			    struct gfs2_alloc_parms *ap);
+extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+			      kuid_t uid, kgid_t gid);
+
+extern int gfs2_quota_sync(struct super_block *sb, int type);
+extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
+
+extern int gfs2_quota_init(struct gfs2_sbd *sdp);
+extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+extern int gfs2_quotad(void *data);
+
+extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
+
+static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
+					struct gfs2_alloc_parms *ap)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int ret;
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return 0;
+	ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (ret)
+		return ret;
+	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+		return 0;
+	ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
+	if (ret)
+		gfs2_quota_unlock(ip);
+	return ret;
+}
+
+extern const struct quotactl_ops gfs2_quotactl_ops;
+extern struct shrinker gfs2_qd_shrinker;
+extern struct list_lru gfs2_qd_lru;
+extern void __init gfs2_quota_hash_init(void);
+
+#endif /* __QUOTA_DOT_H__ */
diff --git a/kernel/fs/gfs2/recovery.c b/kernel/fs/gfs2/recovery.c
new file mode 100644
index 000000000..1b645773c
--- /dev/null
+++ b/kernel/fs/gfs2/recovery.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "super.h"
+#include "util.h"
+#include "dir.h"
+
+struct workqueue_struct *gfs_recovery_wq;
+
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+			   struct buffer_head **bh)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	int new = 0;
+	u64 dblock;
+	u32 extlen;
+	int error;
+
+	error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
+	if (error)
+		return error;
+	if (!dblock) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	*bh = gfs2_meta_ra(gl, dblock, extlen);
+
+	return error;
+}
+
+int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
+{
+	struct list_head *head = &jd->jd_revoke_list;
+	struct gfs2_revoke_replay *rr;
+	int found = 0;
+
+	list_for_each_entry(rr, head, rr_list) {
+		if (rr->rr_blkno == blkno) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		rr->rr_where = where;
+		return 0;
+	}
+
+	rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS);
+	if (!rr)
+		return -ENOMEM;
+
+	rr->rr_blkno = blkno;
+	rr->rr_where = where;
+	list_add(&rr->rr_list, head);
+
+	return 1;
+}
+
+int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
+{
+	struct gfs2_revoke_replay *rr;
+	int wrap, a, b, revoke;
+	int found = 0;
+
+	list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) {
+		if (rr->rr_blkno == blkno) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		return 0;
+
+	wrap = (rr->rr_where < jd->jd_replay_tail);
+	a = (jd->jd_replay_tail < where);
+	b = (where < rr->rr_where);
+	revoke = (wrap) ? (a || b) : (a && b);
+
+	return revoke;
+}
+
+void gfs2_revoke_clean(struct gfs2_jdesc *jd)
+{
+	struct list_head *head = &jd->jd_revoke_list;
+	struct gfs2_revoke_replay *rr;
+
+	while (!list_empty(head)) {
+		rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
+		list_del(&rr->rr_list);
+		kfree(rr);
+	}
+}
+
+static int gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf)
+{
+	const struct gfs2_log_header *str = buf;
+
+	if (str->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
+	    str->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH))
+		return 1;
+
+	lh->lh_sequence = be64_to_cpu(str->lh_sequence);
+	lh->lh_flags = be32_to_cpu(str->lh_flags);
+	lh->lh_tail = be32_to_cpu(str->lh_tail);
+	lh->lh_blkno = be32_to_cpu(str->lh_blkno);
+	lh->lh_hash = be32_to_cpu(str->lh_hash);
+	return 0;
+}
+
+/**
+ * get_log_header - read the log header for a given segment
+ * @jd: the journal
+ * @blk: the block to look at
+ * @lh: the log header to return
+ *
+ * Read the log header for a given segement in a given journal.  Do a few
+ * sanity checks on it.
+ *
+ * Returns: 0 on success,
+ *          1 if the header was invalid or incomplete,
+ *          errno on error
+ */
+
+static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
+			  struct gfs2_log_header_host *head)
+{
+	struct buffer_head *bh;
+	struct gfs2_log_header_host uninitialized_var(lh);
+	const u32 nothing = 0;
+	u32 hash;
+	int error;
+
+	error = gfs2_replay_read_block(jd, blk, &bh);
+	if (error)
+		return error;
+
+	hash = crc32_le((u32)~0, bh->b_data, sizeof(struct gfs2_log_header) -
+					     sizeof(u32));
+	hash = crc32_le(hash, (unsigned char const *)&nothing, sizeof(nothing));
+	hash ^= (u32)~0;
+	error = gfs2_log_header_in(&lh, bh->b_data);
+	brelse(bh);
+
+	if (error || lh.lh_blkno != blk || lh.lh_hash != hash)
+		return 1;
+
+	*head = lh;
+
+	return 0;
+}
+
+/**
+ * find_good_lh - find a good log header
+ * @jd: the journal
+ * @blk: the segment to start searching from
+ * @lh: the log header to fill in
+ * @forward: if true search forward in the log, else search backward
+ *
+ * Call get_log_header() to get a log header for a segment, but if the
+ * segment is bad, either scan forward or backward until we find a good one.
+ *
+ * Returns: errno
+ */
+
+static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
+			struct gfs2_log_header_host *head)
+{
+	unsigned int orig_blk = *blk;
+	int error;
+
+	for (;;) {
+		error = get_log_header(jd, *blk, head);
+		if (error <= 0)
+			return error;
+
+		if (++*blk == jd->jd_blocks)
+			*blk = 0;
+
+		if (*blk == orig_blk) {
+			gfs2_consist_inode(GFS2_I(jd->jd_inode));
+			return -EIO;
+		}
+	}
+}
+
+/**
+ * jhead_scan - make sure we've found the head of the log
+ * @jd: the journal
+ * @head: this is filled in with the log descriptor of the head
+ *
+ * At this point, seg and lh should be either the head of the log or just
+ * before.  Scan forward until we find the head.
+ *
+ * Returns: errno
+ */
+
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+	unsigned int blk = head->lh_blkno;
+	struct gfs2_log_header_host lh;
+	int error;
+
+	for (;;) {
+		if (++blk == jd->jd_blocks)
+			blk = 0;
+
+		error = get_log_header(jd, blk, &lh);
+		if (error < 0)
+			return error;
+		if (error == 1)
+			continue;
+
+		if (lh.lh_sequence == head->lh_sequence) {
+			gfs2_consist_inode(GFS2_I(jd->jd_inode));
+			return -EIO;
+		}
+		if (lh.lh_sequence < head->lh_sequence)
+			break;
+
+		*head = lh;
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: the journal
+ * @head: the log descriptor for the head of the log is returned here
+ *
+ * Do a binary search of a journal and find the valid log entry with the
+ * highest sequence number.  (i.e. the log head)
+ *
+ * Returns: errno
+ */
+
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+	struct gfs2_log_header_host lh_1, lh_m;
+	u32 blk_1, blk_2, blk_m;
+	int error;
+
+	blk_1 = 0;
+	blk_2 = jd->jd_blocks - 1;
+
+	for (;;) {
+		blk_m = (blk_1 + blk_2) / 2;
+
+		error = find_good_lh(jd, &blk_1, &lh_1);
+		if (error)
+			return error;
+
+		error = find_good_lh(jd, &blk_m, &lh_m);
+		if (error)
+			return error;
+
+		if (blk_1 == blk_m || blk_m == blk_2)
+			break;
+
+		if (lh_1.lh_sequence <= lh_m.lh_sequence)
+			blk_1 = blk_m;
+		else
+			blk_2 = blk_m;
+	}
+
+	error = jhead_scan(jd, &lh_1);
+	if (error)
+		return error;
+
+	*head = lh_1;
+
+	return error;
+}
+
+/**
+ * foreach_descriptor - go through the active part of the log
+ * @jd: the journal
+ * @start: the first log header in the active region
+ * @end: the last log header (don't process the contents of this entry))
+ *
+ * Call a given function once for every log descriptor in the active
+ * portion of the log.
+ *
+ * Returns: errno
+ */
+
+static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
+			      unsigned int end, int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct buffer_head *bh;
+	struct gfs2_log_descriptor *ld;
+	int error = 0;
+	u32 length;
+	__be64 *ptr;
+	unsigned int offset = sizeof(struct gfs2_log_descriptor);
+	offset += sizeof(__be64) - 1;
+	offset &= ~(sizeof(__be64) - 1);
+
+	while (start != end) {
+		error = gfs2_replay_read_block(jd, start, &bh);
+		if (error)
+			return error;
+		if (gfs2_meta_check(sdp, bh)) {
+			brelse(bh);
+			return -EIO;
+		}
+		ld = (struct gfs2_log_descriptor *)bh->b_data;
+		length = be32_to_cpu(ld->ld_length);
+
+		if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
+			struct gfs2_log_header_host lh;
+			error = get_log_header(jd, start, &lh);
+			if (!error) {
+				gfs2_replay_incr_blk(sdp, &start);
+				brelse(bh);
+				continue;
+			}
+			if (error == 1) {
+				gfs2_consist_inode(GFS2_I(jd->jd_inode));
+				error = -EIO;
+			}
+			brelse(bh);
+			return error;
+		} else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
+			brelse(bh);
+			return -EIO;
+		}
+		ptr = (__be64 *)(bh->b_data + offset);
+		error = lops_scan_elements(jd, start, ld, ptr, pass);
+		if (error) {
+			brelse(bh);
+			return error;
+		}
+
+		while (length--)
+			gfs2_replay_incr_blk(sdp, &start);
+
+		brelse(bh);
+	}
+
+	return 0;
+}
+
+/**
+ * clean_journal - mark a dirty journal as being clean
+ * @sdp: the filesystem
+ * @jd: the journal
+ * @gl: the journal's glock
+ * @head: the head journal to start from
+ *
+ * Returns: errno
+ */
+
+static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	unsigned int lblock;
+	struct gfs2_log_header *lh;
+	u32 hash;
+	struct buffer_head *bh;
+	int error;
+	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+	lblock = head->lh_blkno;
+	gfs2_replay_incr_blk(sdp, &lblock);
+	bh_map.b_size = 1 << ip->i_inode.i_blkbits;
+	error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0);
+	if (error)
+		return error;
+	if (!bh_map.b_blocknr) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr);
+	lock_buffer(bh);
+	memset(bh->b_data, 0, bh->b_size);
+	set_buffer_uptodate(bh);
+	clear_buffer_dirty(bh);
+	unlock_buffer(bh);
+
+	lh = (struct gfs2_log_header *)bh->b_data;
+	memset(lh, 0, sizeof(struct gfs2_log_header));
+	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+	lh->lh_header.__pad0 = cpu_to_be64(0);
+	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
+	lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
+	lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
+	lh->lh_blkno = cpu_to_be32(lblock);
+	hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
+	lh->lh_hash = cpu_to_be32(hash);
+
+	set_buffer_dirty(bh);
+	if (sync_dirty_buffer(bh))
+		gfs2_io_error_bh(sdp, bh);
+	brelse(bh);
+
+	return error;
+}
+
+
+static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+                               unsigned int message)
+{
+	char env_jid[20];
+	char env_status[20];
+	char *envp[] = { env_jid, env_status, NULL };
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+        ls->ls_recover_jid_done = jid;
+        ls->ls_recover_jid_status = message;
+	sprintf(env_jid, "JID=%u", jid);
+	sprintf(env_status, "RECOVERY=%s",
+		message == LM_RD_SUCCESS ? "Done" : "Failed");
+        kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
+
+	if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
+		sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
+}
+
+void gfs2_recover_func(struct work_struct *work)
+{
+	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct gfs2_log_header_host head;
+	struct gfs2_holder j_gh, ji_gh, thaw_gh;
+	unsigned long t;
+	int ro = 0;
+	unsigned int pass;
+	int error;
+	int jlocked = 0;
+
+	if (sdp->sd_args.ar_spectator ||
+	    (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
+		fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
+			jd->jd_jid);
+		jlocked = 1;
+		/* Acquire the journal lock so we can do recovery */
+
+		error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
+					  LM_ST_EXCLUSIVE,
+					  LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
+					  &j_gh);
+		switch (error) {
+		case 0:
+			break;
+
+		case GLR_TRYFAILED:
+			fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
+			error = 0;
+
+		default:
+			goto fail;
+		};
+
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+					   LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh);
+		if (error)
+			goto fail_gunlock_j;
+	} else {
+		fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
+	}
+
+	fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
+
+	error = gfs2_jdesc_check(jd);
+	if (error)
+		goto fail_gunlock_ji;
+
+	error = gfs2_find_jhead(jd, &head);
+	if (error)
+		goto fail_gunlock_ji;
+
+	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+		fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
+			jd->jd_jid);
+
+		t = jiffies;
+
+		/* Acquire a shared hold on the freeze lock */
+
+		error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+					   LM_FLAG_NOEXP | LM_FLAG_PRIORITY,
+					   &thaw_gh);
+		if (error)
+			goto fail_gunlock_ji;
+
+		if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
+			ro = 1;
+		} else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+			if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+				ro = 1;
+		} else {
+			if (sdp->sd_vfs->s_flags & MS_RDONLY) {
+				/* check if device itself is read-only */
+				ro = bdev_read_only(sdp->sd_vfs->s_bdev);
+				if (!ro) {
+					fs_info(sdp, "recovery required on "
+						"read-only filesystem.\n");
+					fs_info(sdp, "write access will be "
+						"enabled during recovery.\n");
+				}
+			}
+		}
+
+		if (ro) {
+			fs_warn(sdp, "jid=%u: Can't replay: read-only block "
+				"device\n", jd->jd_jid);
+			error = -EROFS;
+			goto fail_gunlock_thaw;
+		}
+
+		fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
+
+		for (pass = 0; pass < 2; pass++) {
+			lops_before_scan(jd, &head, pass);
+			error = foreach_descriptor(jd, head.lh_tail,
+						   head.lh_blkno, pass);
+			lops_after_scan(jd, error, pass);
+			if (error)
+				goto fail_gunlock_thaw;
+		}
+
+		error = clean_journal(jd, &head);
+		if (error)
+			goto fail_gunlock_thaw;
+
+		gfs2_glock_dq_uninit(&thaw_gh);
+		t = DIV_ROUND_UP(jiffies - t, HZ);
+		fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
+			jd->jd_jid, t);
+	}
+
+	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
+
+	if (jlocked) {
+		gfs2_glock_dq_uninit(&ji_gh);
+		gfs2_glock_dq_uninit(&j_gh);
+	}
+
+	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
+	goto done;
+
+fail_gunlock_thaw:
+	gfs2_glock_dq_uninit(&thaw_gh);
+fail_gunlock_ji:
+	if (jlocked) {
+		gfs2_glock_dq_uninit(&ji_gh);
+fail_gunlock_j:
+		gfs2_glock_dq_uninit(&j_gh);
+	}
+
+	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
+fail:
+	jd->jd_recover_error = error;
+	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+done:
+	clear_bit(JDF_RECOVERY, &jd->jd_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
+}
+
+int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
+{
+	int rv;
+
+	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
+		return -EBUSY;
+
+	/* we have JDF_RECOVERY, queue should always succeed */
+	rv = queue_work(gfs_recovery_wq, &jd->jd_work);
+	BUG_ON(!rv);
+
+	if (wait)
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
+			    TASK_UNINTERRUPTIBLE);
+
+	return wait ? jd->jd_recover_error : 0;
+}
+
diff --git a/kernel/fs/gfs2/recovery.h b/kernel/fs/gfs2/recovery.h
new file mode 100644
index 000000000..6142836cc
--- /dev/null
+++ b/kernel/fs/gfs2/recovery.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __RECOVERY_DOT_H__
+#define __RECOVERY_DOT_H__
+
+#include "incore.h"
+
+extern struct workqueue_struct *gfs_recovery_wq;
+
+static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
+{
+	if (++*blk == sdp->sd_jdesc->jd_blocks)
+	        *blk = 0;
+}
+
+extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+			   struct buffer_head **bh);
+
+extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
+
+extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
+		    struct gfs2_log_header_host *head);
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
+extern void gfs2_recover_func(struct work_struct *work);
+
+#endif /* __RECOVERY_DOT_H__ */
+
diff --git a/kernel/fs/gfs2/rgrp.c b/kernel/fs/gfs2/rgrp.c
new file mode 100644
index 000000000..6af2396a3
--- /dev/null
+++ b/kernel/fs/gfs2/rgrp.c
@@ -0,0 +1,2623 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/prefetch.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/random.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "util.h"
+#include "log.h"
+#include "inode.h"
+#include "trace_gfs2.h"
+
+#define BFITNOENT ((u32)~0)
+#define NO_BLOCK ((u64)~0)
+
+#if BITS_PER_LONG == 32
+#define LBITMASK   (0x55555555UL)
+#define LBITSKIP55 (0x55555555UL)
+#define LBITSKIP00 (0x00000000UL)
+#else
+#define LBITMASK   (0x5555555555555555UL)
+#define LBITSKIP55 (0x5555555555555555UL)
+#define LBITSKIP00 (0x0000000000000000UL)
+#endif
+
+/*
+ * These routines are used by the resource group routines (rgrp.c)
+ * to keep track of block allocation.  Each block is represented by two
+ * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
+ *
+ * 0 = Free
+ * 1 = Used (not metadata)
+ * 2 = Unlinked (still in use) inode
+ * 3 = Used (metadata)
+ */
+
+struct gfs2_extent {
+	struct gfs2_rbm rbm;
+	u32 len;
+};
+
+static const char valid_change[16] = {
+	        /* current */
+	/* n */ 0, 1, 1, 1,
+	/* e */ 1, 0, 0, 0,
+	/* w */ 0, 0, 0, 1,
+	        1, 0, 0, 0
+};
+
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+			 const struct gfs2_inode *ip, bool nowrap,
+			 const struct gfs2_alloc_parms *ap);
+
+
+/**
+ * gfs2_setbit - Set a bit in the bitmaps
+ * @rbm: The position of the bit to set
+ * @do_clone: Also set the clone bitmap, if it exists
+ * @new_state: the new state of the block
+ *
+ */
+
+static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
+			       unsigned char new_state)
+{
+	unsigned char *byte1, *byte2, *end, cur_state;
+	struct gfs2_bitmap *bi = rbm_bi(rbm);
+	unsigned int buflen = bi->bi_len;
+	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
+
+	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
+	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
+
+	BUG_ON(byte1 >= end);
+
+	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
+
+	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
+		pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
+			rbm->offset, cur_state, new_state);
+		pr_warn("rgrp=0x%llx bi_start=0x%x\n",
+			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
+		pr_warn("bi_offset=0x%x bi_len=0x%x\n",
+			bi->bi_offset, bi->bi_len);
+		dump_stack();
+		gfs2_consist_rgrpd(rbm->rgd);
+		return;
+	}
+	*byte1 ^= (cur_state ^ new_state) << bit;
+
+	if (do_clone && bi->bi_clone) {
+		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
+		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
+		*byte2 ^= (cur_state ^ new_state) << bit;
+	}
+}
+
+/**
+ * gfs2_testbit - test a bit in the bitmaps
+ * @rbm: The bit to test
+ *
+ * Returns: The two bit block state of the requested bit
+ */
+
+static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
+{
+	struct gfs2_bitmap *bi = rbm_bi(rbm);
+	const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
+	const u8 *byte;
+	unsigned int bit;
+
+	byte = buffer + (rbm->offset / GFS2_NBBY);
+	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
+
+	return (*byte >> bit) & GFS2_BIT_MASK;
+}
+
+/**
+ * gfs2_bit_search
+ * @ptr: Pointer to bitmap data
+ * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
+ * @state: The state we are searching for
+ *
+ * We xor the bitmap data with a patter which is the bitwise opposite
+ * of what we are looking for, this gives rise to a pattern of ones
+ * wherever there is a match. Since we have two bits per entry, we
+ * take this pattern, shift it down by one place and then and it with
+ * the original. All the even bit positions (0,2,4, etc) then represent
+ * successful matches, so we mask with 0x55555..... to remove the unwanted
+ * odd bit positions.
+ *
+ * This allows searching of a whole u64 at once (32 blocks) with a
+ * single test (on 64 bit arches).
+ */
+
+static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
+{
+	u64 tmp;
+	static const u64 search[] = {
+		[0] = 0xffffffffffffffffULL,
+		[1] = 0xaaaaaaaaaaaaaaaaULL,
+		[2] = 0x5555555555555555ULL,
+		[3] = 0x0000000000000000ULL,
+	};
+	tmp = le64_to_cpu(*ptr) ^ search[state];
+	tmp &= (tmp >> 1);
+	tmp &= mask;
+	return tmp;
+}
+
+/**
+ * rs_cmp - multi-block reservation range compare
+ * @blk: absolute file system block number of the new reservation
+ * @len: number of blocks in the new reservation
+ * @rs: existing reservation to compare against
+ *
+ * returns: 1 if the block range is beyond the reach of the reservation
+ *         -1 if the block range is before the start of the reservation
+ *          0 if the block range overlaps with the reservation
+ */
+static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
+{
+	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
+
+	if (blk >= startblk + rs->rs_free)
+		return 1;
+	if (blk + len - 1 < startblk)
+		return -1;
+	return 0;
+}
+
+/**
+ * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
+ *       a block in a given allocation state.
+ * @buf: the buffer that holds the bitmaps
+ * @len: the length (in bytes) of the buffer
+ * @goal: start search at this block's bit-pair (within @buffer)
+ * @state: GFS2_BLKST_XXX the state of the block we're looking for.
+ *
+ * Scope of @goal and returned block number is only within this bitmap buffer,
+ * not entire rgrp or filesystem.  @buffer will be offset from the actual
+ * beginning of a bitmap block buffer, skipping any header structures, but
+ * headers are always a multiple of 64 bits long so that the buffer is
+ * always aligned to a 64 bit boundary.
+ *
+ * The size of the buffer is in bytes, but is it assumed that it is
+ * always ok to read a complete multiple of 64 bits at the end
+ * of the block in case the end is no aligned to a natural boundary.
+ *
+ * Return: the block number (bitmap buffer scope) that was found
+ */
+
+static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
+		       u32 goal, u8 state)
+{
+	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
+	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
+	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
+	u64 tmp;
+	u64 mask = 0x5555555555555555ULL;
+	u32 bit;
+
+	/* Mask off bits we don't care about at the start of the search */
+	mask <<= spoint;
+	tmp = gfs2_bit_search(ptr, mask, state);
+	ptr++;
+	while(tmp == 0 && ptr < end) {
+		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
+		ptr++;
+	}
+	/* Mask off any bits which are more than len bytes from the start */
+	if (ptr == end && (len & (sizeof(u64) - 1)))
+		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
+	/* Didn't find anything, so return */
+	if (tmp == 0)
+		return BFITNOENT;
+	ptr--;
+	bit = __ffs64(tmp);
+	bit /= 2;	/* two bits per entry in the bitmap */
+	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
+}
+
+/**
+ * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
+ * @rbm: The rbm with rgd already set correctly
+ * @block: The block number (filesystem relative)
+ *
+ * This sets the bi and offset members of an rbm based on a
+ * resource group and a filesystem relative block number. The
+ * resource group must be set in the rbm on entry, the bi and
+ * offset members will be set by this function.
+ *
+ * Returns: 0 on success, or an error code
+ */
+
+static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
+{
+	u64 rblock = block - rbm->rgd->rd_data0;
+
+	if (WARN_ON_ONCE(rblock > UINT_MAX))
+		return -EINVAL;
+	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
+		return -E2BIG;
+
+	rbm->bii = 0;
+	rbm->offset = (u32)(rblock);
+	/* Check if the block is within the first block */
+	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
+		return 0;
+
+	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
+	rbm->offset += (sizeof(struct gfs2_rgrp) -
+			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
+	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
+	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
+	return 0;
+}
+
+/**
+ * gfs2_rbm_incr - increment an rbm structure
+ * @rbm: The rbm with rgd already set correctly
+ *
+ * This function takes an existing rbm structure and increments it to the next
+ * viable block offset.
+ *
+ * Returns: If incrementing the offset would cause the rbm to go past the
+ *          end of the rgrp, true is returned, otherwise false.
+ *
+ */
+
+static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
+{
+	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
+		rbm->offset++;
+		return false;
+	}
+	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
+		return true;
+
+	rbm->offset = 0;
+	rbm->bii++;
+	return false;
+}
+
+/**
+ * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
+ * @rbm: Position to search (value/result)
+ * @n_unaligned: Number of unaligned blocks to check
+ * @len: Decremented for each block found (terminate on zero)
+ *
+ * Returns: true if a non-free block is encountered
+ */
+
+static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
+{
+	u32 n;
+	u8 res;
+
+	for (n = 0; n < n_unaligned; n++) {
+		res = gfs2_testbit(rbm);
+		if (res != GFS2_BLKST_FREE)
+			return true;
+		(*len)--;
+		if (*len == 0)
+			return true;
+		if (gfs2_rbm_incr(rbm))
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * gfs2_free_extlen - Return extent length of free blocks
+ * @rrbm: Starting position
+ * @len: Max length to check
+ *
+ * Starting at the block specified by the rbm, see how many free blocks
+ * there are, not reading more than len blocks ahead. This can be done
+ * using memchr_inv when the blocks are byte aligned, but has to be done
+ * on a block by block basis in case of unaligned blocks. Also this
+ * function can cope with bitmap boundaries (although it must stop on
+ * a resource group boundary)
+ *
+ * Returns: Number of free blocks in the extent
+ */
+
+static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
+{
+	struct gfs2_rbm rbm = *rrbm;
+	u32 n_unaligned = rbm.offset & 3;
+	u32 size = len;
+	u32 bytes;
+	u32 chunk_size;
+	u8 *ptr, *start, *end;
+	u64 block;
+	struct gfs2_bitmap *bi;
+
+	if (n_unaligned &&
+	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
+		goto out;
+
+	n_unaligned = len & 3;
+	/* Start is now byte aligned */
+	while (len > 3) {
+		bi = rbm_bi(&rbm);
+		start = bi->bi_bh->b_data;
+		if (bi->bi_clone)
+			start = bi->bi_clone;
+		end = start + bi->bi_bh->b_size;
+		start += bi->bi_offset;
+		BUG_ON(rbm.offset & 3);
+		start += (rbm.offset / GFS2_NBBY);
+		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
+		ptr = memchr_inv(start, 0, bytes);
+		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
+		chunk_size *= GFS2_NBBY;
+		BUG_ON(len < chunk_size);
+		len -= chunk_size;
+		block = gfs2_rbm_to_block(&rbm);
+		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
+			n_unaligned = 0;
+			break;
+		}
+		if (ptr) {
+			n_unaligned = 3;
+			break;
+		}
+		n_unaligned = len & 3;
+	}
+
+	/* Deal with any bits left over at the end */
+	if (n_unaligned)
+		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
+out:
+	return size - len;
+}
+
+/**
+ * gfs2_bitcount - count the number of bits in a certain state
+ * @rgd: the resource group descriptor
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @state: the state of the block we're looking for
+ *
+ * Returns: The number of bits
+ */
+
+static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
+			 unsigned int buflen, u8 state)
+{
+	const u8 *byte = buffer;
+	const u8 *end = buffer + buflen;
+	const u8 state1 = state << 2;
+	const u8 state2 = state << 4;
+	const u8 state3 = state << 6;
+	u32 count = 0;
+
+	for (; byte < end; byte++) {
+		if (((*byte) & 0x03) == state)
+			count++;
+		if (((*byte) & 0x0C) == state1)
+			count++;
+		if (((*byte) & 0x30) == state2)
+			count++;
+		if (((*byte) & 0xC0) == state3)
+			count++;
+	}
+
+	return count;
+}
+
+/**
+ * gfs2_rgrp_verify - Verify that a resource group is consistent
+ * @rgd: the rgrp
+ *
+ */
+
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_bitmap *bi = NULL;
+	u32 length = rgd->rd_length;
+	u32 count[4], tmp;
+	int buf, x;
+
+	memset(count, 0, 4 * sizeof(u32));
+
+	/* Count # blocks in each of 4 possible allocation states */
+	for (buf = 0; buf < length; buf++) {
+		bi = rgd->rd_bits + buf;
+		for (x = 0; x < 4; x++)
+			count[x] += gfs2_bitcount(rgd,
+						  bi->bi_bh->b_data +
+						  bi->bi_offset,
+						  bi->bi_len, x);
+	}
+
+	if (count[0] != rgd->rd_free) {
+		if (gfs2_consist_rgrpd(rgd))
+			fs_err(sdp, "free data mismatch:  %u != %u\n",
+			       count[0], rgd->rd_free);
+		return;
+	}
+
+	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
+	if (count[1] != tmp) {
+		if (gfs2_consist_rgrpd(rgd))
+			fs_err(sdp, "used data mismatch:  %u != %u\n",
+			       count[1], tmp);
+		return;
+	}
+
+	if (count[2] + count[3] != rgd->rd_dinodes) {
+		if (gfs2_consist_rgrpd(rgd))
+			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
+			       count[2] + count[3], rgd->rd_dinodes);
+		return;
+	}
+}
+
+static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
+{
+	u64 first = rgd->rd_data0;
+	u64 last = first + rgd->rd_data;
+	return first <= block && block < last;
+}
+
+/**
+ * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
+ * @sdp: The GFS2 superblock
+ * @blk: The data block number
+ * @exact: True if this needs to be an exact match
+ *
+ * Returns: The resource group, or NULL if not found
+ */
+
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
+{
+	struct rb_node *n, *next;
+	struct gfs2_rgrpd *cur;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	n = sdp->sd_rindex_tree.rb_node;
+	while (n) {
+		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
+		next = NULL;
+		if (blk < cur->rd_addr)
+			next = n->rb_left;
+		else if (blk >= cur->rd_data0 + cur->rd_data)
+			next = n->rb_right;
+		if (next == NULL) {
+			spin_unlock(&sdp->sd_rindex_spin);
+			if (exact) {
+				if (blk < cur->rd_addr)
+					return NULL;
+				if (blk >= cur->rd_data0 + cur->rd_data)
+					return NULL;
+			}
+			return cur;
+		}
+		n = next;
+	}
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return NULL;
+}
+
+/**
+ * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: The first rgrp in the filesystem
+ */
+
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
+{
+	const struct rb_node *n;
+	struct gfs2_rgrpd *rgd;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	n = rb_first(&sdp->sd_rindex_tree);
+	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return rgd;
+}
+
+/**
+ * gfs2_rgrpd_get_next - get the next RG
+ * @rgd: the resource group descriptor
+ *
+ * Returns: The next rgrp
+ */
+
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	const struct rb_node *n;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	n = rb_next(&rgd->rd_node);
+	if (n == NULL)
+		n = rb_first(&sdp->sd_rindex_tree);
+
+	if (unlikely(&rgd->rd_node == n)) {
+		spin_unlock(&sdp->sd_rindex_spin);
+		return NULL;
+	}
+	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+	spin_unlock(&sdp->sd_rindex_spin);
+	return rgd;
+}
+
+void check_and_update_goal(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
+		ip->i_goal = ip->i_no_addr;
+}
+
+void gfs2_free_clones(struct gfs2_rgrpd *rgd)
+{
+	int x;
+
+	for (x = 0; x < rgd->rd_length; x++) {
+		struct gfs2_bitmap *bi = rgd->rd_bits + x;
+		kfree(bi->bi_clone);
+		bi->bi_clone = NULL;
+	}
+}
+
+/**
+ * gfs2_rs_alloc - make sure we have a reservation assigned to the inode
+ * @ip: the inode for this reservation
+ */
+int gfs2_rs_alloc(struct gfs2_inode *ip)
+{
+	int error = 0;
+
+	down_write(&ip->i_rw_mutex);
+	if (ip->i_res)
+		goto out;
+
+	ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
+	if (!ip->i_res) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	RB_CLEAR_NODE(&ip->i_res->rs_node);
+out:
+	up_write(&ip->i_rw_mutex);
+	return error;
+}
+
+static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
+{
+	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
+		       (unsigned long long)rs->rs_inum,
+		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
+		       rs->rs_rbm.offset, rs->rs_free);
+}
+
+/**
+ * __rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+static void __rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rbm.rgd;
+	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
+	rb_erase(&rs->rs_node, &rgd->rd_rstree);
+	RB_CLEAR_NODE(&rs->rs_node);
+
+	if (rs->rs_free) {
+		struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
+
+		/* return reserved blocks to the rgrp */
+		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
+		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
+		/* The rgrp extent failure point is likely not to increase;
+		   it will only do so if the freed blocks are somehow
+		   contiguous with a span of free blocks that follows. Still,
+		   it will force the number to be recalculated later. */
+		rgd->rd_extfail_pt += rs->rs_free;
+		rs->rs_free = 0;
+		clear_bit(GBF_FULL, &bi->bi_flags);
+	}
+}
+
+/**
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	rgd = rs->rs_rbm.rgd;
+	if (rgd) {
+		spin_lock(&rgd->rd_rsspin);
+		__rs_deltree(rs);
+		spin_unlock(&rgd->rd_rsspin);
+	}
+}
+
+/**
+ * gfs2_rs_delete - delete a multi-block reservation
+ * @ip: The inode for this reservation
+ * @wcount: The inode's write count, or NULL
+ *
+ */
+void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+{
+	down_write(&ip->i_rw_mutex);
+	if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
+		gfs2_rs_deltree(ip->i_res);
+		BUG_ON(ip->i_res->rs_free);
+		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
+		ip->i_res = NULL;
+	}
+	up_write(&ip->i_rw_mutex);
+}
+
+/**
+ * return_all_reservations - return all reserved blocks back to the rgrp.
+ * @rgd: the rgrp that needs its space back
+ *
+ * We previously reserved a bunch of blocks for allocation. Now we need to
+ * give them back. This leave the reservation structures in tact, but removes
+ * all of their corresponding "no-fly zones".
+ */
+static void return_all_reservations(struct gfs2_rgrpd *rgd)
+{
+	struct rb_node *n;
+	struct gfs2_blkreserv *rs;
+
+	spin_lock(&rgd->rd_rsspin);
+	while ((n = rb_first(&rgd->rd_rstree))) {
+		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		__rs_deltree(rs);
+	}
+	spin_unlock(&rgd->rd_rsspin);
+}
+
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
+{
+	struct rb_node *n;
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_glock *gl;
+
+	while ((n = rb_first(&sdp->sd_rindex_tree))) {
+		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+		gl = rgd->rd_gl;
+
+		rb_erase(n, &sdp->sd_rindex_tree);
+
+		if (gl) {
+			spin_lock(&gl->gl_spin);
+			gl->gl_object = NULL;
+			spin_unlock(&gl->gl_spin);
+			gfs2_glock_add_to_lru(gl);
+			gfs2_glock_put(gl);
+		}
+
+		gfs2_free_clones(rgd);
+		kfree(rgd->rd_bits);
+		return_all_reservations(rgd);
+		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
+	}
+}
+
+static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
+{
+	pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
+	pr_info("ri_length = %u\n", rgd->rd_length);
+	pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
+	pr_info("ri_data = %u\n", rgd->rd_data);
+	pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
+}
+
+/**
+ * gfs2_compute_bitstructs - Compute the bitmap sizes
+ * @rgd: The resource group descriptor
+ *
+ * Calculates bitmap descriptors, one for each block that contains bitmap data
+ *
+ * Returns: errno
+ */
+
+static int compute_bitstructs(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_bitmap *bi;
+	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
+	u32 bytes_left, bytes;
+	int x;
+
+	if (!length)
+		return -EINVAL;
+
+	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
+	if (!rgd->rd_bits)
+		return -ENOMEM;
+
+	bytes_left = rgd->rd_bitbytes;
+
+	for (x = 0; x < length; x++) {
+		bi = rgd->rd_bits + x;
+
+		bi->bi_flags = 0;
+		/* small rgrp; bitmap stored completely in header block */
+		if (length == 1) {
+			bytes = bytes_left;
+			bi->bi_offset = sizeof(struct gfs2_rgrp);
+			bi->bi_start = 0;
+			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
+		/* header block */
+		} else if (x == 0) {
+			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
+			bi->bi_offset = sizeof(struct gfs2_rgrp);
+			bi->bi_start = 0;
+			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
+		/* last block */
+		} else if (x + 1 == length) {
+			bytes = bytes_left;
+			bi->bi_offset = sizeof(struct gfs2_meta_header);
+			bi->bi_start = rgd->rd_bitbytes - bytes_left;
+			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
+		/* other blocks */
+		} else {
+			bytes = sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_meta_header);
+			bi->bi_offset = sizeof(struct gfs2_meta_header);
+			bi->bi_start = rgd->rd_bitbytes - bytes_left;
+			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
+		}
+
+		bytes_left -= bytes;
+	}
+
+	if (bytes_left) {
+		gfs2_consist_rgrpd(rgd);
+		return -EIO;
+	}
+	bi = rgd->rd_bits + (length - 1);
+	if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
+		if (gfs2_consist_rgrpd(rgd)) {
+			gfs2_rindex_print(rgd);
+			fs_err(sdp, "start=%u len=%u offset=%u\n",
+			       bi->bi_start, bi->bi_len, bi->bi_offset);
+		}
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_ri_total - Total up the file system space, according to the rindex.
+ * @sdp: the filesystem
+ *
+ */
+u64 gfs2_ri_total(struct gfs2_sbd *sdp)
+{
+	u64 total_data = 0;	
+	struct inode *inode = sdp->sd_rindex;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	char buf[sizeof(struct gfs2_rindex)];
+	int error, rgrps;
+
+	for (rgrps = 0;; rgrps++) {
+		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
+
+		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
+			break;
+		error = gfs2_internal_read(ip, buf, &pos,
+					   sizeof(struct gfs2_rindex));
+		if (error != sizeof(struct gfs2_rindex))
+			break;
+		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
+	}
+	return total_data;
+}
+
+static int rgd_insert(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*newn) {
+		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
+						  rd_node);
+
+		parent = *newn;
+		if (rgd->rd_addr < cur->rd_addr)
+			newn = &((*newn)->rb_left);
+		else if (rgd->rd_addr > cur->rd_addr)
+			newn = &((*newn)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&rgd->rd_node, parent, newn);
+	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
+	sdp->sd_rgrps++;
+	return 0;
+}
+
+/**
+ * read_rindex_entry - Pull in a new resource index entry from the disk
+ * @ip: Pointer to the rindex inode
+ *
+ * Returns: 0 on success, > 0 on EOF, error code otherwise
+ */
+
+static int read_rindex_entry(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	const unsigned bsize = sdp->sd_sb.sb_bsize;
+	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
+	struct gfs2_rindex buf;
+	int error;
+	struct gfs2_rgrpd *rgd;
+
+	if (pos >= i_size_read(&ip->i_inode))
+		return 1;
+
+	error = gfs2_internal_read(ip, (char *)&buf, &pos,
+				   sizeof(struct gfs2_rindex));
+
+	if (error != sizeof(struct gfs2_rindex))
+		return (error == 0) ? 1 : error;
+
+	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
+	error = -ENOMEM;
+	if (!rgd)
+		return error;
+
+	rgd->rd_sbd = sdp;
+	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
+	rgd->rd_length = be32_to_cpu(buf.ri_length);
+	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
+	rgd->rd_data = be32_to_cpu(buf.ri_data);
+	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
+	spin_lock_init(&rgd->rd_rsspin);
+
+	error = compute_bitstructs(rgd);
+	if (error)
+		goto fail;
+
+	error = gfs2_glock_get(sdp, rgd->rd_addr,
+			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
+	if (error)
+		goto fail;
+
+	rgd->rd_gl->gl_object = rgd;
+	rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
+	rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
+	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
+	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
+	if (rgd->rd_data > sdp->sd_max_rg_data)
+		sdp->sd_max_rg_data = rgd->rd_data;
+	spin_lock(&sdp->sd_rindex_spin);
+	error = rgd_insert(rgd);
+	spin_unlock(&sdp->sd_rindex_spin);
+	if (!error)
+		return 0;
+
+	error = 0; /* someone else read in the rgrp; free it and ignore it */
+	gfs2_glock_put(rgd->rd_gl);
+
+fail:
+	kfree(rgd->rd_bits);
+	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
+	return error;
+}
+
+/**
+ * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
+ * @sdp: the GFS2 superblock
+ *
+ * The purpose of this function is to select a subset of the resource groups
+ * and mark them as PREFERRED. We do it in such a way that each node prefers
+ * to use a unique set of rgrps to minimize glock contention.
+ */
+static void set_rgrp_preferences(struct gfs2_sbd *sdp)
+{
+	struct gfs2_rgrpd *rgd, *first;
+	int i;
+
+	/* Skip an initial number of rgrps, based on this node's journal ID.
+	   That should start each node out on its own set. */
+	rgd = gfs2_rgrpd_get_first(sdp);
+	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
+		rgd = gfs2_rgrpd_get_next(rgd);
+	first = rgd;
+
+	do {
+		rgd->rd_flags |= GFS2_RDF_PREFERRED;
+		for (i = 0; i < sdp->sd_journals; i++) {
+			rgd = gfs2_rgrpd_get_next(rgd);
+			if (rgd == first)
+				break;
+		}
+	} while (rgd != first);
+}
+
+/**
+ * gfs2_ri_update - Pull in a new resource index from the disk
+ * @ip: pointer to the rindex inode
+ *
+ * Returns: 0 on successful update, error code otherwise
+ */
+
+static int gfs2_ri_update(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int error;
+
+	do {
+		error = read_rindex_entry(ip);
+	} while (error == 0);
+
+	if (error < 0)
+		return error;
+
+	set_rgrp_preferences(sdp);
+
+	sdp->sd_rindex_uptodate = 1;
+	return 0;
+}
+
+/**
+ * gfs2_rindex_update - Update the rindex if required
+ * @sdp: The GFS2 superblock
+ *
+ * We grab a lock on the rindex inode to make sure that it doesn't
+ * change whilst we are performing an operation. We keep this lock
+ * for quite long periods of time compared to other locks. This
+ * doesn't matter, since it is shared and it is very, very rarely
+ * accessed in the exclusive mode (i.e. only when expanding the filesystem).
+ *
+ * This makes sure that we're using the latest copy of the resource index
+ * special file, which might have been updated if someone expanded the
+ * filesystem (via gfs2_grow utility), which adds new resource groups.
+ *
+ * Returns: 0 on succeess, error code otherwise
+ */
+
+int gfs2_rindex_update(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+	struct gfs2_glock *gl = ip->i_gl;
+	struct gfs2_holder ri_gh;
+	int error = 0;
+	int unlock_required = 0;
+
+	/* Read new copy from disk if we don't have the latest */
+	if (!sdp->sd_rindex_uptodate) {
+		if (!gfs2_glock_is_locked_by_me(gl)) {
+			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
+			if (error)
+				return error;
+			unlock_required = 1;
+		}
+		if (!sdp->sd_rindex_uptodate)
+			error = gfs2_ri_update(ip);
+		if (unlock_required)
+			gfs2_glock_dq_uninit(&ri_gh);
+	}
+
+	return error;
+}
+
+static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
+{
+	const struct gfs2_rgrp *str = buf;
+	u32 rg_flags;
+
+	rg_flags = be32_to_cpu(str->rg_flags);
+	rg_flags &= ~GFS2_RDF_MASK;
+	rgd->rd_flags &= GFS2_RDF_MASK;
+	rgd->rd_flags |= rg_flags;
+	rgd->rd_free = be32_to_cpu(str->rg_free);
+	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
+	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
+}
+
+static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
+{
+	struct gfs2_rgrp *str = buf;
+
+	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
+	str->rg_free = cpu_to_be32(rgd->rd_free);
+	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
+	str->__pad = cpu_to_be32(0);
+	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
+	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
+}
+
+static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
+	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
+
+	if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free ||
+	    rgl->rl_dinodes != str->rg_dinodes ||
+	    rgl->rl_igeneration != str->rg_igeneration)
+		return 0;
+	return 1;
+}
+
+static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
+{
+	const struct gfs2_rgrp *str = buf;
+
+	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
+	rgl->rl_flags = str->rg_flags;
+	rgl->rl_free = str->rg_free;
+	rgl->rl_dinodes = str->rg_dinodes;
+	rgl->rl_igeneration = str->rg_igeneration;
+	rgl->__pad = 0UL;
+}
+
+static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change)
+{
+	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
+	u32 unlinked = be32_to_cpu(rgl->rl_unlinked) + change;
+	rgl->rl_unlinked = cpu_to_be32(unlinked);
+}
+
+static u32 count_unlinked(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_bitmap *bi;
+	const u32 length = rgd->rd_length;
+	const u8 *buffer = NULL;
+	u32 i, goal, count = 0;
+
+	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
+		goal = 0;
+		buffer = bi->bi_bh->b_data + bi->bi_offset;
+		WARN_ON(!buffer_uptodate(bi->bi_bh));
+		while (goal < bi->bi_len * GFS2_NBBY) {
+			goal = gfs2_bitfit(buffer, bi->bi_len, goal,
+					   GFS2_BLKST_UNLINKED);
+			if (goal == BFITNOENT)
+				break;
+			count++;
+			goal++;
+		}
+	}
+
+	return count;
+}
+
+
+/**
+ * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
+ * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ *
+ * Read in all of a Resource Group's header and bitmap blocks.
+ * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_glock *gl = rgd->rd_gl;
+	unsigned int length = rgd->rd_length;
+	struct gfs2_bitmap *bi;
+	unsigned int x, y;
+	int error;
+
+	if (rgd->rd_bits[0].bi_bh != NULL)
+		return 0;
+
+	for (x = 0; x < length; x++) {
+		bi = rgd->rd_bits + x;
+		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
+		if (error)
+			goto fail;
+	}
+
+	for (y = length; y--;) {
+		bi = rgd->rd_bits + y;
+		error = gfs2_meta_wait(sdp, bi->bi_bh);
+		if (error)
+			goto fail;
+		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
+					      GFS2_METATYPE_RG)) {
+			error = -EIO;
+			goto fail;
+		}
+	}
+
+	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
+		for (x = 0; x < length; x++)
+			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
+		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
+		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
+		rgd->rd_free_clone = rgd->rd_free;
+		/* max out the rgrp allocation failure point */
+		rgd->rd_extfail_pt = rgd->rd_free;
+	}
+	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
+		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
+		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
+				     rgd->rd_bits[0].bi_bh->b_data);
+	}
+	else if (sdp->sd_args.ar_rgrplvb) {
+		if (!gfs2_rgrp_lvb_valid(rgd)){
+			gfs2_consist_rgrpd(rgd);
+			error = -EIO;
+			goto fail;
+		}
+		if (rgd->rd_rgl->rl_unlinked == 0)
+			rgd->rd_flags &= ~GFS2_RDF_CHECK;
+	}
+	return 0;
+
+fail:
+	while (x--) {
+		bi = rgd->rd_bits + x;
+		brelse(bi->bi_bh);
+		bi->bi_bh = NULL;
+		gfs2_assert_warn(sdp, !bi->bi_clone);
+	}
+
+	return error;
+}
+
+static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
+{
+	u32 rl_flags;
+
+	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
+		return 0;
+
+	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
+		return gfs2_rgrp_bh_get(rgd);
+
+	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
+	rl_flags &= ~GFS2_RDF_MASK;
+	rgd->rd_flags &= GFS2_RDF_MASK;
+	rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
+	if (rgd->rd_rgl->rl_unlinked == 0)
+		rgd->rd_flags &= ~GFS2_RDF_CHECK;
+	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
+	rgd->rd_free_clone = rgd->rd_free;
+	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
+	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
+	return 0;
+}
+
+int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
+{
+	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+
+	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
+		return 0;
+	return gfs2_rgrp_bh_get(rgd);
+}
+
+/**
+ * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * @gh: The glock holder for the resource group
+ *
+ */
+
+void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
+{
+	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
+	int x, length = rgd->rd_length;
+
+	for (x = 0; x < length; x++) {
+		struct gfs2_bitmap *bi = rgd->rd_bits + x;
+		if (bi->bi_bh) {
+			brelse(bi->bi_bh);
+			bi->bi_bh = NULL;
+		}
+	}
+
+}
+
+int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+			     struct buffer_head *bh,
+			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	u64 blk;
+	sector_t start = 0;
+	sector_t nr_blks = 0;
+	int rv;
+	unsigned int x;
+	u32 trimmed = 0;
+	u8 diff;
+
+	for (x = 0; x < bi->bi_len; x++) {
+		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
+		clone += bi->bi_offset;
+		clone += x;
+		if (bh) {
+			const u8 *orig = bh->b_data + bi->bi_offset + x;
+			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
+		} else {
+			diff = ~(*clone | (*clone >> 1));
+		}
+		diff &= 0x55;
+		if (diff == 0)
+			continue;
+		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
+		while(diff) {
+			if (diff & 1) {
+				if (nr_blks == 0)
+					goto start_new_extent;
+				if ((start + nr_blks) != blk) {
+					if (nr_blks >= minlen) {
+						rv = sb_issue_discard(sb,
+							start, nr_blks,
+							GFP_NOFS, 0);
+						if (rv)
+							goto fail;
+						trimmed += nr_blks;
+					}
+					nr_blks = 0;
+start_new_extent:
+					start = blk;
+				}
+				nr_blks++;
+			}
+			diff >>= 2;
+			blk++;
+		}
+	}
+	if (nr_blks >= minlen) {
+		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
+		if (rv)
+			goto fail;
+		trimmed += nr_blks;
+	}
+	if (ptrimmed)
+		*ptrimmed = trimmed;
+	return 0;
+
+fail:
+	if (sdp->sd_args.ar_discard)
+		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
+	sdp->sd_args.ar_discard = 0;
+	return -EIO;
+}
+
+/**
+ * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
+ * @filp: Any file on the filesystem
+ * @argp: Pointer to the arguments (also used to pass result)
+ *
+ * Returns: 0 on success, otherwise error code
+ */
+
+int gfs2_fitrim(struct file *filp, void __user *argp)
+{
+	struct inode *inode = file_inode(filp);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
+	struct buffer_head *bh;
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_rgrpd *rgd_end;
+	struct gfs2_holder gh;
+	struct fstrim_range r;
+	int ret = 0;
+	u64 amt;
+	u64 trimmed = 0;
+	u64 start, end, minlen;
+	unsigned int x;
+	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&r, argp, sizeof(r)))
+		return -EFAULT;
+
+	ret = gfs2_rindex_update(sdp);
+	if (ret)
+		return ret;
+
+	start = r.start >> bs_shift;
+	end = start + (r.len >> bs_shift);
+	minlen = max_t(u64, r.minlen,
+		       q->limits.discard_granularity) >> bs_shift;
+
+	if (end <= start || minlen > sdp->sd_max_rg_data)
+		return -EINVAL;
+
+	rgd = gfs2_blk2rgrpd(sdp, start, 0);
+	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
+
+	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+		return -EINVAL; /* start is beyond the end of the fs */
+
+	while (1) {
+
+		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
+		if (ret)
+			goto out;
+
+		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
+			/* Trim each bitmap in the rgrp */
+			for (x = 0; x < rgd->rd_length; x++) {
+				struct gfs2_bitmap *bi = rgd->rd_bits + x;
+				ret = gfs2_rgrp_send_discards(sdp,
+						rgd->rd_data0, NULL, bi, minlen,
+						&amt);
+				if (ret) {
+					gfs2_glock_dq_uninit(&gh);
+					goto out;
+				}
+				trimmed += amt;
+			}
+
+			/* Mark rgrp as having been trimmed */
+			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
+			if (ret == 0) {
+				bh = rgd->rd_bits[0].bi_bh;
+				rgd->rd_flags |= GFS2_RGF_TRIMMED;
+				gfs2_trans_add_meta(rgd->rd_gl, bh);
+				gfs2_rgrp_out(rgd, bh->b_data);
+				gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
+				gfs2_trans_end(sdp);
+			}
+		}
+		gfs2_glock_dq_uninit(&gh);
+
+		if (rgd == rgd_end)
+			break;
+
+		rgd = gfs2_rgrpd_get_next(rgd);
+	}
+
+out:
+	r.len = trimmed << bs_shift;
+	if (copy_to_user(argp, &r, sizeof(r)))
+		return -EFAULT;
+
+	return ret;
+}
+
+/**
+ * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
+ * @ip: the inode structure
+ *
+ */
+static void rs_insert(struct gfs2_inode *ip)
+{
+	struct rb_node **newn, *parent = NULL;
+	int rc;
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
+	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
+
+	BUG_ON(gfs2_rs_active(rs));
+
+	spin_lock(&rgd->rd_rsspin);
+	newn = &rgd->rd_rstree.rb_node;
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+
+		parent = *newn;
+		rc = rs_cmp(fsblock, rs->rs_free, cur);
+		if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else {
+			spin_unlock(&rgd->rd_rsspin);
+			WARN_ON(1);
+			return;
+		}
+	}
+
+	rb_link_node(&rs->rs_node, parent, newn);
+	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
+
+	/* Do our rgrp accounting for the reservation */
+	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
+	spin_unlock(&rgd->rd_rsspin);
+	trace_gfs2_rs(rs, TRACE_RS_INSERT);
+}
+
+/**
+ * rg_mblk_search - find a group of multiple free blocks to form a reservation
+ * @rgd: the resource group descriptor
+ * @ip: pointer to the inode for which we're reserving blocks
+ * @ap: the allocation parameters
+ *
+ */
+
+static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
+			   const struct gfs2_alloc_parms *ap)
+{
+	struct gfs2_rbm rbm = { .rgd = rgd, };
+	u64 goal;
+	struct gfs2_blkreserv *rs = ip->i_res;
+	u32 extlen;
+	u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
+	int ret;
+	struct inode *inode = &ip->i_inode;
+
+	if (S_ISDIR(inode->i_mode))
+		extlen = 1;
+	else {
+		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
+		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+	}
+	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
+		return;
+
+	/* Find bitmap block that contains bits for goal block */
+	if (rgrp_contains_block(rgd, ip->i_goal))
+		goal = ip->i_goal;
+	else
+		goal = rgd->rd_last_alloc + rgd->rd_data0;
+
+	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
+		return;
+
+	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
+	if (ret == 0) {
+		rs->rs_rbm = rbm;
+		rs->rs_free = extlen;
+		rs->rs_inum = ip->i_no_addr;
+		rs_insert(ip);
+	} else {
+		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
+			rgd->rd_last_alloc = 0;
+	}
+}
+
+/**
+ * gfs2_next_unreserved_block - Return next block that is not reserved
+ * @rgd: The resource group
+ * @block: The starting block
+ * @length: The required length
+ * @ip: Ignore any reservations for this inode
+ *
+ * If the block does not appear in any reservation, then return the
+ * block number unchanged. If it does appear in the reservation, then
+ * keep looking through the tree of reservations in order to find the
+ * first block number which is not reserved.
+ */
+
+static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
+				      u32 length,
+				      const struct gfs2_inode *ip)
+{
+	struct gfs2_blkreserv *rs;
+	struct rb_node *n;
+	int rc;
+
+	spin_lock(&rgd->rd_rsspin);
+	n = rgd->rd_rstree.rb_node;
+	while (n) {
+		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		rc = rs_cmp(block, length, rs);
+		if (rc < 0)
+			n = n->rb_left;
+		else if (rc > 0)
+			n = n->rb_right;
+		else
+			break;
+	}
+
+	if (n) {
+		while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) {
+			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
+			n = n->rb_right;
+			if (n == NULL)
+				break;
+			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		}
+	}
+
+	spin_unlock(&rgd->rd_rsspin);
+	return block;
+}
+
+/**
+ * gfs2_reservation_check_and_update - Check for reservations during block alloc
+ * @rbm: The current position in the resource group
+ * @ip: The inode for which we are searching for blocks
+ * @minext: The minimum extent length
+ * @maxext: A pointer to the maximum extent structure
+ *
+ * This checks the current position in the rgrp to see whether there is
+ * a reservation covering this block. If not then this function is a
+ * no-op. If there is, then the position is moved to the end of the
+ * contiguous reservation(s) so that we are pointing at the first
+ * non-reserved block.
+ *
+ * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
+ */
+
+static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
+					     const struct gfs2_inode *ip,
+					     u32 minext,
+					     struct gfs2_extent *maxext)
+{
+	u64 block = gfs2_rbm_to_block(rbm);
+	u32 extlen = 1;
+	u64 nblock;
+	int ret;
+
+	/*
+	 * If we have a minimum extent length, then skip over any extent
+	 * which is less than the min extent length in size.
+	 */
+	if (minext) {
+		extlen = gfs2_free_extlen(rbm, minext);
+		if (extlen <= maxext->len)
+			goto fail;
+	}
+
+	/*
+	 * Check the extent which has been found against the reservations
+	 * and skip if parts of it are already reserved
+	 */
+	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
+	if (nblock == block) {
+		if (!minext || extlen >= minext)
+			return 0;
+
+		if (extlen > maxext->len) {
+			maxext->len = extlen;
+			maxext->rbm = *rbm;
+		}
+fail:
+		nblock = block + extlen;
+	}
+	ret = gfs2_rbm_from_block(rbm, nblock);
+	if (ret < 0)
+		return ret;
+	return 1;
+}
+
+/**
+ * gfs2_rbm_find - Look for blocks of a particular state
+ * @rbm: Value/result starting position and final position
+ * @state: The state which we want to find
+ * @minext: Pointer to the requested extent length (NULL for a single block)
+ *          This is updated to be the actual reservation size.
+ * @ip: If set, check for reservations
+ * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
+ *          around until we've reached the starting point.
+ * @ap: the allocation parameters
+ *
+ * Side effects:
+ * - If looking for free blocks, we set GBF_FULL on each bitmap which
+ *   has no free blocks in it.
+ * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
+ *   has come up short on a free block search.
+ *
+ * Returns: 0 on success, -ENOSPC if there is no block of the requested state
+ */
+
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+			 const struct gfs2_inode *ip, bool nowrap,
+			 const struct gfs2_alloc_parms *ap)
+{
+	struct buffer_head *bh;
+	int initial_bii;
+	u32 initial_offset;
+	int first_bii = rbm->bii;
+	u32 first_offset = rbm->offset;
+	u32 offset;
+	u8 *buffer;
+	int n = 0;
+	int iters = rbm->rgd->rd_length;
+	int ret;
+	struct gfs2_bitmap *bi;
+	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
+
+	/* If we are not starting at the beginning of a bitmap, then we
+	 * need to add one to the bitmap count to ensure that we search
+	 * the starting bitmap twice.
+	 */
+	if (rbm->offset != 0)
+		iters++;
+
+	while(1) {
+		bi = rbm_bi(rbm);
+		if (test_bit(GBF_FULL, &bi->bi_flags) &&
+		    (state == GFS2_BLKST_FREE))
+			goto next_bitmap;
+
+		bh = bi->bi_bh;
+		buffer = bh->b_data + bi->bi_offset;
+		WARN_ON(!buffer_uptodate(bh));
+		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
+			buffer = bi->bi_clone + bi->bi_offset;
+		initial_offset = rbm->offset;
+		offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
+		if (offset == BFITNOENT)
+			goto bitmap_full;
+		rbm->offset = offset;
+		if (ip == NULL)
+			return 0;
+
+		initial_bii = rbm->bii;
+		ret = gfs2_reservation_check_and_update(rbm, ip,
+							minext ? *minext : 0,
+							&maxext);
+		if (ret == 0)
+			return 0;
+		if (ret > 0) {
+			n += (rbm->bii - initial_bii);
+			goto next_iter;
+		}
+		if (ret == -E2BIG) {
+			rbm->bii = 0;
+			rbm->offset = 0;
+			n += (rbm->bii - initial_bii);
+			goto res_covered_end_of_rgrp;
+		}
+		return ret;
+
+bitmap_full:	/* Mark bitmap as full and fall through */
+		if ((state == GFS2_BLKST_FREE) && initial_offset == 0) {
+			struct gfs2_bitmap *bi = rbm_bi(rbm);
+			set_bit(GBF_FULL, &bi->bi_flags);
+		}
+
+next_bitmap:	/* Find next bitmap in the rgrp */
+		rbm->offset = 0;
+		rbm->bii++;
+		if (rbm->bii == rbm->rgd->rd_length)
+			rbm->bii = 0;
+res_covered_end_of_rgrp:
+		if ((rbm->bii == 0) && nowrap)
+			break;
+		n++;
+next_iter:
+		if (n >= iters)
+			break;
+	}
+
+	if (minext == NULL || state != GFS2_BLKST_FREE)
+		return -ENOSPC;
+
+	/* If the extent was too small, and it's smaller than the smallest
+	   to have failed before, remember for future reference that it's
+	   useless to search this rgrp again for this amount or more. */
+	if ((first_offset == 0) && (first_bii == 0) &&
+	    (*minext < rbm->rgd->rd_extfail_pt))
+		rbm->rgd->rd_extfail_pt = *minext;
+
+	/* If the maximum extent we found is big enough to fulfill the
+	   minimum requirements, use it anyway. */
+	if (maxext.len) {
+		*rbm = maxext.rbm;
+		*minext = maxext.len;
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+/**
+ * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
+ * @rgd: The rgrp
+ * @last_unlinked: block address of the last dinode we unlinked
+ * @skip: block address we should explicitly not unlink
+ *
+ * Returns: 0 if no error
+ *          The inode, if one has been found, in inode.
+ */
+
+static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
+{
+	u64 block;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_glock *gl;
+	struct gfs2_inode *ip;
+	int error;
+	int found = 0;
+	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
+
+	while (1) {
+		down_write(&sdp->sd_log_flush_lock);
+		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
+				      true, NULL);
+		up_write(&sdp->sd_log_flush_lock);
+		if (error == -ENOSPC)
+			break;
+		if (WARN_ON_ONCE(error))
+			break;
+
+		block = gfs2_rbm_to_block(&rbm);
+		if (gfs2_rbm_from_block(&rbm, block + 1))
+			break;
+		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
+			continue;
+		if (block == skip)
+			continue;
+		*last_unlinked = block;
+
+		error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl);
+		if (error)
+			continue;
+
+		/* If the inode is already in cache, we can ignore it here
+		 * because the existing inode disposal code will deal with
+		 * it when all refs have gone away. Accessing gl_object like
+		 * this is not safe in general. Here it is ok because we do
+		 * not dereference the pointer, and we only need an approx
+		 * answer to whether it is NULL or not.
+		 */
+		ip = gl->gl_object;
+
+		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+			gfs2_glock_put(gl);
+		else
+			found++;
+
+		/* Limit reclaim to sensible number of tasks */
+		if (found > NR_CPUS)
+			return;
+	}
+
+	rgd->rd_flags &= ~GFS2_RDF_CHECK;
+	return;
+}
+
+/**
+ * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
+ * @rgd: The rgrp in question
+ * @loops: An indication of how picky we can be (0=very, 1=less so)
+ *
+ * This function uses the recently added glock statistics in order to
+ * figure out whether a parciular resource group is suffering from
+ * contention from multiple nodes. This is done purely on the basis
+ * of timings, since this is the only data we have to work with and
+ * our aim here is to reject a resource group which is highly contended
+ * but (very important) not to do this too often in order to ensure that
+ * we do not land up introducing fragmentation by changing resource
+ * groups when not actually required.
+ *
+ * The calculation is fairly simple, we want to know whether the SRTTB
+ * (i.e. smoothed round trip time for blocking operations) to acquire
+ * the lock for this rgrp's glock is significantly greater than the
+ * time taken for resource groups on average. We introduce a margin in
+ * the form of the variable @var which is computed as the sum of the two
+ * respective variences, and multiplied by a factor depending on @loops
+ * and whether we have a lot of data to base the decision on. This is
+ * then tested against the square difference of the means in order to
+ * decide whether the result is statistically significant or not.
+ *
+ * Returns: A boolean verdict on the congestion status
+ */
+
+static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
+{
+	const struct gfs2_glock *gl = rgd->rd_gl;
+	const struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_lkstats *st;
+	s64 r_dcount, l_dcount;
+	s64 r_srttb, l_srttb;
+	s64 srttb_diff;
+	s64 sqr_diff;
+	s64 var;
+
+	preempt_disable();
+	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
+	r_srttb = st->stats[GFS2_LKS_SRTTB];
+	r_dcount = st->stats[GFS2_LKS_DCOUNT];
+	var = st->stats[GFS2_LKS_SRTTVARB] +
+	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
+	preempt_enable();
+
+	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
+	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
+
+	if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0))
+		return false;
+
+	srttb_diff = r_srttb - l_srttb;
+	sqr_diff = srttb_diff * srttb_diff;
+
+	var *= 2;
+	if (l_dcount < 8 || r_dcount < 8)
+		var *= 2;
+	if (loops == 1)
+		var *= 2;
+
+	return ((srttb_diff < 0) && (sqr_diff > var));
+}
+
+/**
+ * gfs2_rgrp_used_recently
+ * @rs: The block reservation with the rgrp to test
+ * @msecs: The time limit in milliseconds
+ *
+ * Returns: True if the rgrp glock has been used within the time limit
+ */
+static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
+				    u64 msecs)
+{
+	u64 tdiff;
+
+	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
+                            rs->rs_rbm.rgd->rd_gl->gl_dstamp));
+
+	return tdiff > (msecs * 1000 * 1000);
+}
+
+static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u32 skip;
+
+	get_random_bytes(&skip, sizeof(skip));
+	return skip % sdp->sd_rgrps;
+}
+
+static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
+{
+	struct gfs2_rgrpd *rgd = *pos;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+
+	rgd = gfs2_rgrpd_get_next(rgd);
+	if (rgd == NULL)
+		rgd = gfs2_rgrpd_get_first(sdp);
+	*pos = rgd;
+	if (rgd != begin) /* If we didn't wrap */
+		return true;
+	return false;
+}
+
+/**
+ * fast_to_acquire - determine if a resource group will be fast to acquire
+ *
+ * If this is one of our preferred rgrps, it should be quicker to acquire,
+ * because we tried to set ourselves up as dlm lock master.
+ */
+static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_glock *gl = rgd->rd_gl;
+
+	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
+	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
+		return 1;
+	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
+		return 1;
+	return 0;
+}
+
+/**
+ * gfs2_inplace_reserve - Reserve space in the filesystem
+ * @ip: the inode to reserve space for
+ * @ap: the allocation parameters
+ *
+ * We try our best to find an rgrp that has at least ap->target blocks
+ * available. After a couple of passes (loops == 2), the prospects of finding
+ * such an rgrp diminish. At this stage, we return the first rgrp that has
+ * atleast ap->min_target blocks available. Either way, we set ap->allowed to
+ * the number of blocks available in the chosen rgrp.
+ *
+ * Returns: 0 on success,
+ *          -ENOMEM if a suitable rgrp can't be found
+ *          errno otherwise
+ */
+
+int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *begin = NULL;
+	struct gfs2_blkreserv *rs = ip->i_res;
+	int error = 0, rg_locked, flags = 0;
+	u64 last_unlinked = NO_BLOCK;
+	int loops = 0;
+	u32 skip = 0;
+
+	if (sdp->sd_args.ar_rgrplvb)
+		flags |= GL_SKIP;
+	if (gfs2_assert_warn(sdp, ap->target))
+		return -EINVAL;
+	if (gfs2_rs_active(rs)) {
+		begin = rs->rs_rbm.rgd;
+	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
+		rs->rs_rbm.rgd = begin = ip->i_rgd;
+	} else {
+		check_and_update_goal(ip);
+		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
+	}
+	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
+		skip = gfs2_orlov_skip(ip);
+	if (rs->rs_rbm.rgd == NULL)
+		return -EBADSLT;
+
+	while (loops < 3) {
+		rg_locked = 1;
+
+		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
+			rg_locked = 0;
+			if (skip && skip--)
+				goto next_rgrp;
+			if (!gfs2_rs_active(rs)) {
+				if (loops == 0 &&
+				    !fast_to_acquire(rs->rs_rbm.rgd))
+					goto next_rgrp;
+				if ((loops < 2) &&
+				    gfs2_rgrp_used_recently(rs, 1000) &&
+				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
+					goto next_rgrp;
+			}
+			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
+						   LM_ST_EXCLUSIVE, flags,
+						   &rs->rs_rgd_gh);
+			if (unlikely(error))
+				return error;
+			if (!gfs2_rs_active(rs) && (loops < 2) &&
+			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
+				goto skip_rgrp;
+			if (sdp->sd_args.ar_rgrplvb) {
+				error = update_rgrp_lvb(rs->rs_rbm.rgd);
+				if (unlikely(error)) {
+					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+					return error;
+				}
+			}
+		}
+
+		/* Skip unuseable resource groups */
+		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
+						 GFS2_RDF_ERROR)) ||
+		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
+			goto skip_rgrp;
+
+		if (sdp->sd_args.ar_rgrplvb)
+			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
+
+		/* Get a reservation if we don't already have one */
+		if (!gfs2_rs_active(rs))
+			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
+
+		/* Skip rgrps when we can't get a reservation on first pass */
+		if (!gfs2_rs_active(rs) && (loops < 1))
+			goto check_rgrp;
+
+		/* If rgrp has enough free space, use it */
+		if (rs->rs_rbm.rgd->rd_free_clone >= ap->target ||
+		    (loops == 2 && ap->min_target &&
+		     rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) {
+			ip->i_rgd = rs->rs_rbm.rgd;
+			ap->allowed = ip->i_rgd->rd_free_clone;
+			return 0;
+		}
+check_rgrp:
+		/* Check for unlinked inodes which can be reclaimed */
+		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
+			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
+					ip->i_no_addr);
+skip_rgrp:
+		/* Drop reservation, if we couldn't use reserved rgrp */
+		if (gfs2_rs_active(rs))
+			gfs2_rs_deltree(rs);
+
+		/* Unlock rgrp if required */
+		if (!rg_locked)
+			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+next_rgrp:
+		/* Find the next rgrp, and continue looking */
+		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
+			continue;
+		if (skip)
+			continue;
+
+		/* If we've scanned all the rgrps, but found no free blocks
+		 * then this checks for some less likely conditions before
+		 * trying again.
+		 */
+		loops++;
+		/* Check that fs hasn't grown if writing to rindex */
+		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
+			error = gfs2_ri_update(ip);
+			if (error)
+				return error;
+		}
+		/* Flushing the log may release space */
+		if (loops == 2)
+			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+	}
+
+	return -ENOSPC;
+}
+
+/**
+ * gfs2_inplace_release - release an inplace reservation
+ * @ip: the inode the reservation was taken out on
+ *
+ * Release a reservation made by gfs2_inplace_reserve().
+ */
+
+void gfs2_inplace_release(struct gfs2_inode *ip)
+{
+	struct gfs2_blkreserv *rs = ip->i_res;
+
+	if (rs->rs_rgd_gh.gh_gl)
+		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+}
+
+/**
+ * gfs2_get_block_type - Check a block in a RG is of given type
+ * @rgd: the resource group holding the block
+ * @block: the block number
+ *
+ * Returns: The block type (GFS2_BLKST_*)
+ */
+
+static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+{
+	struct gfs2_rbm rbm = { .rgd = rgd, };
+	int ret;
+
+	ret = gfs2_rbm_from_block(&rbm, block);
+	WARN_ON_ONCE(ret != 0);
+
+	return gfs2_testbit(&rbm);
+}
+
+
+/**
+ * gfs2_alloc_extent - allocate an extent from a given bitmap
+ * @rbm: the resource group information
+ * @dinode: TRUE if the first block we allocate is for a dinode
+ * @n: The extent length (value/result)
+ *
+ * Add the bitmap buffer to the transaction.
+ * Set the found bits to @new_state to change block's allocation state.
+ */
+static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
+			     unsigned int *n)
+{
+	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
+	const unsigned int elen = *n;
+	u64 block;
+	int ret;
+
+	*n = 1;
+	block = gfs2_rbm_to_block(rbm);
+	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
+	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+	block++;
+	while (*n < elen) {
+		ret = gfs2_rbm_from_block(&pos, block);
+		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
+			break;
+		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
+		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
+		(*n)++;
+		block++;
+	}
+}
+
+/**
+ * rgblk_free - Change alloc state of given block(s)
+ * @sdp: the filesystem
+ * @bstart: the start of a run of blocks to free
+ * @blen: the length of the block run (all must lie within ONE RG!)
+ * @new_state: GFS2_BLKST_XXX the after-allocation block state
+ *
+ * Returns:  Resource group containing the block(s)
+ */
+
+static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
+				     u32 blen, unsigned char new_state)
+{
+	struct gfs2_rbm rbm;
+	struct gfs2_bitmap *bi, *bi_prev = NULL;
+
+	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
+	if (!rbm.rgd) {
+		if (gfs2_consist(sdp))
+			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
+		return NULL;
+	}
+
+	gfs2_rbm_from_block(&rbm, bstart);
+	while (blen--) {
+		bi = rbm_bi(&rbm);
+		if (bi != bi_prev) {
+			if (!bi->bi_clone) {
+				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+						      GFP_NOFS | __GFP_NOFAIL);
+				memcpy(bi->bi_clone + bi->bi_offset,
+				       bi->bi_bh->b_data + bi->bi_offset,
+				       bi->bi_len);
+			}
+			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
+			bi_prev = bi;
+		}
+		gfs2_setbit(&rbm, false, new_state);
+		gfs2_rbm_incr(&rbm);
+	}
+
+	return rbm.rgd;
+}
+
+/**
+ * gfs2_rgrp_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+
+void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+	struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_blkreserv *trs;
+	const struct rb_node *n;
+
+	if (rgd == NULL)
+		return;
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
+		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
+		       rgd->rd_reserved, rgd->rd_extfail_pt);
+	spin_lock(&rgd->rd_rsspin);
+	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
+		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		dump_rs(seq, trs);
+	}
+	spin_unlock(&rgd->rd_rsspin);
+}
+
+static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+		(unsigned long long)rgd->rd_addr);
+	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+	gfs2_rgrp_dump(NULL, rgd->rd_gl);
+	rgd->rd_flags |= GFS2_RDF_ERROR;
+}
+
+/**
+ * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
+ * @ip: The inode we have just allocated blocks for
+ * @rbm: The start of the allocated blocks
+ * @len: The extent length
+ *
+ * Adjusts a reservation after an allocation has taken place. If the
+ * reservation does not match the allocation, or if it is now empty
+ * then it is removed.
+ */
+
+static void gfs2_adjust_reservation(struct gfs2_inode *ip,
+				    const struct gfs2_rbm *rbm, unsigned len)
+{
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rbm->rgd;
+	unsigned rlen;
+	u64 block;
+	int ret;
+
+	spin_lock(&rgd->rd_rsspin);
+	if (gfs2_rs_active(rs)) {
+		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
+			block = gfs2_rbm_to_block(rbm);
+			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
+			rlen = min(rs->rs_free, len);
+			rs->rs_free -= rlen;
+			rgd->rd_reserved -= rlen;
+			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
+			if (rs->rs_free && !ret)
+				goto out;
+			/* We used up our block reservation, so we should
+			   reserve more blocks next time. */
+			atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
+		}
+		__rs_deltree(rs);
+	}
+out:
+	spin_unlock(&rgd->rd_rsspin);
+}
+
+/**
+ * gfs2_set_alloc_start - Set starting point for block allocation
+ * @rbm: The rbm which will be set to the required location
+ * @ip: The gfs2 inode
+ * @dinode: Flag to say if allocation includes a new inode
+ *
+ * This sets the starting point from the reservation if one is active
+ * otherwise it falls back to guessing a start point based on the
+ * inode's goal block or the last allocation point in the rgrp.
+ */
+
+static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
+				 const struct gfs2_inode *ip, bool dinode)
+{
+	u64 goal;
+
+	if (gfs2_rs_active(ip->i_res)) {
+		*rbm = ip->i_res->rs_rbm;
+		return;
+	}
+
+	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
+		goal = ip->i_goal;
+	else
+		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
+
+	gfs2_rbm_from_block(rbm, goal);
+}
+
+/**
+ * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
+ * @ip: the inode to allocate the block for
+ * @bn: Used to return the starting block number
+ * @nblocks: requested number of blocks/extent length (value/result)
+ * @dinode: 1 if we're allocating a dinode block, else 0
+ * @generation: the generation number of the inode
+ *
+ * Returns: 0 or error
+ */
+
+int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
+		      bool dinode, u64 *generation)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
+	unsigned int ndata;
+	u64 block; /* block, within the file system scope */
+	int error;
+
+	gfs2_set_alloc_start(&rbm, ip, dinode);
+	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
+
+	if (error == -ENOSPC) {
+		gfs2_set_alloc_start(&rbm, ip, dinode);
+		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
+				      NULL);
+	}
+
+	/* Since all blocks are reserved in advance, this shouldn't happen */
+	if (error) {
+		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
+			(unsigned long long)ip->i_no_addr, error, *nblocks,
+			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
+			rbm.rgd->rd_extfail_pt);
+		goto rgrp_error;
+	}
+
+	gfs2_alloc_extent(&rbm, dinode, nblocks);
+	block = gfs2_rbm_to_block(&rbm);
+	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
+	if (gfs2_rs_active(ip->i_res))
+		gfs2_adjust_reservation(ip, &rbm, *nblocks);
+	ndata = *nblocks;
+	if (dinode)
+		ndata--;
+
+	if (!dinode) {
+		ip->i_goal = block + ndata - 1;
+		error = gfs2_meta_inode_buffer(ip, &dibh);
+		if (error == 0) {
+			struct gfs2_dinode *di =
+				(struct gfs2_dinode *)dibh->b_data;
+			gfs2_trans_add_meta(ip->i_gl, dibh);
+			di->di_goal_meta = di->di_goal_data =
+				cpu_to_be64(ip->i_goal);
+			brelse(dibh);
+		}
+	}
+	if (rbm.rgd->rd_free < *nblocks) {
+		pr_warn("nblocks=%u\n", *nblocks);
+		goto rgrp_error;
+	}
+
+	rbm.rgd->rd_free -= *nblocks;
+	if (dinode) {
+		rbm.rgd->rd_dinodes++;
+		*generation = rbm.rgd->rd_igeneration++;
+		if (*generation == 0)
+			*generation = rbm.rgd->rd_igeneration++;
+	}
+
+	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
+	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);
+
+	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
+	if (dinode)
+		gfs2_trans_add_unrevoke(sdp, block, *nblocks);
+
+	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
+
+	rbm.rgd->rd_free_clone -= *nblocks;
+	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
+			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+	*bn = block;
+	return 0;
+
+rgrp_error:
+	gfs2_rgrp_error(rbm.rgd);
+	return -EIO;
+}
+
+/**
+ * __gfs2_free_blocks - free a contiguous run of block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ * @meta: 1 if the blocks represent metadata
+ *
+ */
+
+void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+
+	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
+	if (!rgd)
+		return;
+	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
+	rgd->rd_free += blen;
+	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
+	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
+
+	/* Directories keep their data in the metadata address space */
+	if (meta || ip->i_depth)
+		gfs2_meta_wipe(ip, bstart, blen);
+}
+
+/**
+ * gfs2_free_meta - free a contiguous run of data block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	__gfs2_free_blocks(ip, bstart, blen, 1);
+	gfs2_statfs_change(sdp, 0, +blen, 0);
+	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
+}
+
+void gfs2_unlink_di(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_rgrpd *rgd;
+	u64 blkno = ip->i_no_addr;
+
+	rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
+	if (!rgd)
+		return;
+	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
+	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
+	update_rgrp_lvb_unlinked(rgd, 1);
+}
+
+static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_rgrpd *tmp_rgd;
+
+	tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
+	if (!tmp_rgd)
+		return;
+	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
+
+	if (!rgd->rd_dinodes)
+		gfs2_consist_rgrpd(rgd);
+	rgd->rd_dinodes--;
+	rgd->rd_free++;
+
+	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
+	update_rgrp_lvb_unlinked(rgd, -1);
+
+	gfs2_statfs_change(sdp, 0, +1, -1);
+}
+
+
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+	gfs2_free_uninit_di(rgd, ip->i_no_addr);
+	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
+	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
+	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
+}
+
+/**
+ * gfs2_check_blk_type - Check the type of a block
+ * @sdp: The superblock
+ * @no_addr: The block number to check
+ * @type: The block type we are looking for
+ *
+ * Returns: 0 if the block type matches the expected type
+ *          -ESTALE if it doesn't match
+ *          or -ve errno if something went wrong while checking
+ */
+
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
+{
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder rgd_gh;
+	int error = -EINVAL;
+
+	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
+	if (!rgd)
+		goto fail;
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+	if (error)
+		goto fail;
+
+	if (gfs2_get_block_type(rgd, no_addr) != type)
+		error = -ESTALE;
+
+	gfs2_glock_dq_uninit(&rgd_gh);
+fail:
+	return error;
+}
+
+/**
+ * gfs2_rlist_add - add a RG to a list of RGs
+ * @ip: the inode
+ * @rlist: the list of resource groups
+ * @block: the block
+ *
+ * Figure out what RG a block belongs to and add that RG to the list
+ *
+ * FIXME: Don't use NOFAIL
+ *
+ */
+
+void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
+		    u64 block)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_rgrpd **tmp;
+	unsigned int new_space;
+	unsigned int x;
+
+	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
+		return;
+
+	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
+		rgd = ip->i_rgd;
+	else
+		rgd = gfs2_blk2rgrpd(sdp, block, 1);
+	if (!rgd) {
+		fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
+		return;
+	}
+	ip->i_rgd = rgd;
+
+	for (x = 0; x < rlist->rl_rgrps; x++)
+		if (rlist->rl_rgd[x] == rgd)
+			return;
+
+	if (rlist->rl_rgrps == rlist->rl_space) {
+		new_space = rlist->rl_space + 10;
+
+		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
+			      GFP_NOFS | __GFP_NOFAIL);
+
+		if (rlist->rl_rgd) {
+			memcpy(tmp, rlist->rl_rgd,
+			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
+			kfree(rlist->rl_rgd);
+		}
+
+		rlist->rl_space = new_space;
+		rlist->rl_rgd = tmp;
+	}
+
+	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
+}
+
+/**
+ * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
+ *      and initialize an array of glock holders for them
+ * @rlist: the list of resource groups
+ * @state: the lock state to acquire the RG lock in
+ *
+ * FIXME: Don't use NOFAIL
+ *
+ */
+
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
+{
+	unsigned int x;
+
+	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
+				GFP_NOFS | __GFP_NOFAIL);
+	for (x = 0; x < rlist->rl_rgrps; x++)
+		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
+				state, 0,
+				&rlist->rl_ghs[x]);
+}
+
+/**
+ * gfs2_rlist_free - free a resource group list
+ * @rlist: the list of resource groups
+ *
+ */
+
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
+{
+	unsigned int x;
+
+	kfree(rlist->rl_rgd);
+
+	if (rlist->rl_ghs) {
+		for (x = 0; x < rlist->rl_rgrps; x++)
+			gfs2_holder_uninit(&rlist->rl_ghs[x]);
+		kfree(rlist->rl_ghs);
+		rlist->rl_ghs = NULL;
+	}
+}
+
diff --git a/kernel/fs/gfs2/rgrp.h b/kernel/fs/gfs2/rgrp.h
new file mode 100644
index 000000000..68972ecfb
--- /dev/null
+++ b/kernel/fs/gfs2/rgrp.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __RGRP_DOT_H__
+#define __RGRP_DOT_H__
+
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+/* Since each block in the file system is represented by two bits in the
+ * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
+ * By reserving 32 blocks at a time, we can optimize / shortcut how we search
+ * through the bitmaps by looking a word at a time.
+ */
+#define RGRP_RSRV_MINBYTES 8
+#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+#define RGRP_RSRV_ADDBLKS 64
+
+struct gfs2_rgrpd;
+struct gfs2_sbd;
+struct gfs2_holder;
+
+extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+
+extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);
+extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+
+extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
+extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
+extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
+
+extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+
+#define GFS2_AF_ORLOV 1
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip,
+				struct gfs2_alloc_parms *ap);
+extern void gfs2_inplace_release(struct gfs2_inode *ip);
+
+extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
+			     bool dinode, u64 *generation);
+
+extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
+extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
+extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+extern void gfs2_unlink_di(struct inode *inode);
+extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+			       unsigned int type);
+
+struct gfs2_rgrp_list {
+	unsigned int rl_rgrps;
+	unsigned int rl_space;
+	struct gfs2_rgrpd **rl_rgd;
+	struct gfs2_holder *rl_ghs;
+};
+
+extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
+			   u64 block);
+extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
+extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
+extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+				   struct buffer_head *bh,
+				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
+extern int gfs2_fitrim(struct file *filp, void __user *argp);
+
+/* This is how to tell if a reservation is in the rgrp tree: */
+static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
+{
+	return rs && !RB_EMPTY_NODE(&rs->rs_node);
+}
+
+extern void check_and_update_goal(struct gfs2_inode *ip);
+#endif /* __RGRP_DOT_H__ */
diff --git a/kernel/fs/gfs2/super.c b/kernel/fs/gfs2/super.c
new file mode 100644
index 000000000..859c6edbf
--- /dev/null
+++ b/kernel/fs/gfs2/super.c
@@ -0,0 +1,1666 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bio.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/kernel.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "util.h"
+#include "sys.h"
+#include "xattr.h"
+
+#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
+
+enum {
+	Opt_lockproto,
+	Opt_locktable,
+	Opt_hostdata,
+	Opt_spectator,
+	Opt_ignore_local_fs,
+	Opt_localflocks,
+	Opt_localcaching,
+	Opt_debug,
+	Opt_nodebug,
+	Opt_upgrade,
+	Opt_acl,
+	Opt_noacl,
+	Opt_quota_off,
+	Opt_quota_account,
+	Opt_quota_on,
+	Opt_quota,
+	Opt_noquota,
+	Opt_suiddir,
+	Opt_nosuiddir,
+	Opt_data_writeback,
+	Opt_data_ordered,
+	Opt_meta,
+	Opt_discard,
+	Opt_nodiscard,
+	Opt_commit,
+	Opt_err_withdraw,
+	Opt_err_panic,
+	Opt_statfs_quantum,
+	Opt_statfs_percent,
+	Opt_quota_quantum,
+	Opt_barrier,
+	Opt_nobarrier,
+	Opt_rgrplvb,
+	Opt_norgrplvb,
+	Opt_error,
+};
+
+static const match_table_t tokens = {
+	{Opt_lockproto, "lockproto=%s"},
+	{Opt_locktable, "locktable=%s"},
+	{Opt_hostdata, "hostdata=%s"},
+	{Opt_spectator, "spectator"},
+	{Opt_spectator, "norecovery"},
+	{Opt_ignore_local_fs, "ignore_local_fs"},
+	{Opt_localflocks, "localflocks"},
+	{Opt_localcaching, "localcaching"},
+	{Opt_debug, "debug"},
+	{Opt_nodebug, "nodebug"},
+	{Opt_upgrade, "upgrade"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_quota_off, "quota=off"},
+	{Opt_quota_account, "quota=account"},
+	{Opt_quota_on, "quota=on"},
+	{Opt_quota, "quota"},
+	{Opt_noquota, "noquota"},
+	{Opt_suiddir, "suiddir"},
+	{Opt_nosuiddir, "nosuiddir"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_meta, "meta"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
+	{Opt_commit, "commit=%d"},
+	{Opt_err_withdraw, "errors=withdraw"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_statfs_quantum, "statfs_quantum=%d"},
+	{Opt_statfs_percent, "statfs_percent=%d"},
+	{Opt_quota_quantum, "quota_quantum=%d"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
+	{Opt_rgrplvb, "rgrplvb"},
+	{Opt_norgrplvb, "norgrplvb"},
+	{Opt_error, NULL}
+};
+
+/**
+ * gfs2_mount_args - Parse mount options
+ * @args: The structure into which the parsed options will be written
+ * @options: The options to parse
+ *
+ * Return: errno
+ */
+
+int gfs2_mount_args(struct gfs2_args *args, char *options)
+{
+	char *o;
+	int token;
+	substring_t tmp[MAX_OPT_ARGS];
+	int rv;
+
+	/* Split the options into tokens with the "," character and
+	   process them */
+
+	while (1) {
+		o = strsep(&options, ",");
+		if (o == NULL)
+			break;
+		if (*o == '\0')
+			continue;
+
+		token = match_token(o, tokens, tmp);
+		switch (token) {
+		case Opt_lockproto:
+			match_strlcpy(args->ar_lockproto, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_locktable:
+			match_strlcpy(args->ar_locktable, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_hostdata:
+			match_strlcpy(args->ar_hostdata, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_spectator:
+			args->ar_spectator = 1;
+			break;
+		case Opt_ignore_local_fs:
+			/* Retained for backwards compat only */
+			break;
+		case Opt_localflocks:
+			args->ar_localflocks = 1;
+			break;
+		case Opt_localcaching:
+			/* Retained for backwards compat only */
+			break;
+		case Opt_debug:
+			if (args->ar_errors == GFS2_ERRORS_PANIC) {
+				pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
+				return -EINVAL;
+			}
+			args->ar_debug = 1;
+			break;
+		case Opt_nodebug:
+			args->ar_debug = 0;
+			break;
+		case Opt_upgrade:
+			/* Retained for backwards compat only */
+			break;
+		case Opt_acl:
+			args->ar_posix_acl = 1;
+			break;
+		case Opt_noacl:
+			args->ar_posix_acl = 0;
+			break;
+		case Opt_quota_off:
+		case Opt_noquota:
+			args->ar_quota = GFS2_QUOTA_OFF;
+			break;
+		case Opt_quota_account:
+			args->ar_quota = GFS2_QUOTA_ACCOUNT;
+			break;
+		case Opt_quota_on:
+		case Opt_quota:
+			args->ar_quota = GFS2_QUOTA_ON;
+			break;
+		case Opt_suiddir:
+			args->ar_suiddir = 1;
+			break;
+		case Opt_nosuiddir:
+			args->ar_suiddir = 0;
+			break;
+		case Opt_data_writeback:
+			args->ar_data = GFS2_DATA_WRITEBACK;
+			break;
+		case Opt_data_ordered:
+			args->ar_data = GFS2_DATA_ORDERED;
+			break;
+		case Opt_meta:
+			args->ar_meta = 1;
+			break;
+		case Opt_discard:
+			args->ar_discard = 1;
+			break;
+		case Opt_nodiscard:
+			args->ar_discard = 0;
+			break;
+		case Opt_commit:
+			rv = match_int(&tmp[0], &args->ar_commit);
+			if (rv || args->ar_commit <= 0) {
+				pr_warn("commit mount option requires a positive numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_statfs_quantum:
+			rv = match_int(&tmp[0], &args->ar_statfs_quantum);
+			if (rv || args->ar_statfs_quantum < 0) {
+				pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_quota_quantum:
+			rv = match_int(&tmp[0], &args->ar_quota_quantum);
+			if (rv || args->ar_quota_quantum <= 0) {
+				pr_warn("quota_quantum mount option requires a positive numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_statfs_percent:
+			rv = match_int(&tmp[0], &args->ar_statfs_percent);
+			if (rv || args->ar_statfs_percent < 0 ||
+			    args->ar_statfs_percent > 100) {
+				pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_err_withdraw:
+			args->ar_errors = GFS2_ERRORS_WITHDRAW;
+			break;
+		case Opt_err_panic:
+			if (args->ar_debug) {
+				pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
+				return -EINVAL;
+			}
+			args->ar_errors = GFS2_ERRORS_PANIC;
+			break;
+		case Opt_barrier:
+			args->ar_nobarrier = 0;
+			break;
+		case Opt_nobarrier:
+			args->ar_nobarrier = 1;
+			break;
+		case Opt_rgrplvb:
+			args->ar_rgrplvb = 1;
+			break;
+		case Opt_norgrplvb:
+			args->ar_rgrplvb = 0;
+			break;
+		case Opt_error:
+		default:
+			pr_warn("invalid mount option: %s\n", o);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_jindex_free - Clear all the journal index information
+ * @sdp: The GFS2 superblock
+ *
+ */
+
+void gfs2_jindex_free(struct gfs2_sbd *sdp)
+{
+	struct list_head list;
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	list_add(&list, &sdp->sd_jindex_list);
+	list_del_init(&sdp->sd_jindex_list);
+	sdp->sd_journals = 0;
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	while (!list_empty(&list)) {
+		jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+		gfs2_free_journal_extents(jd);
+		list_del(&jd->jd_list);
+		iput(jd->jd_inode);
+		kfree(jd);
+	}
+}
+
+static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+	int found = 0;
+
+	list_for_each_entry(jd, head, jd_list) {
+		if (jd->jd_jid == jid) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		jd = NULL;
+
+	return jd;
+}
+
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	return jd;
+}
+
+int gfs2_jdesc_check(struct gfs2_jdesc *jd)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	u64 size = i_size_read(jd->jd_inode);
+
+	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
+		return -EIO;
+
+	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+	if (gfs2_write_alloc_required(ip, 0, size)) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int init_threads(struct gfs2_sbd *sdp)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
+		fs_err(sdp, "can't start logd thread: %d\n", error);
+		return error;
+	}
+	sdp->sd_logd_process = p;
+
+	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
+		fs_err(sdp, "can't start quotad thread: %d\n", error);
+		goto fail;
+	}
+	sdp->sd_quotad_process = p;
+	return 0;
+
+fail:
+	kthread_stop(sdp->sd_logd_process);
+	return error;
+}
+
+/**
+ * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+	struct gfs2_glock *j_gl = ip->i_gl;
+	struct gfs2_holder freeze_gh;
+	struct gfs2_log_header_host head;
+	int error;
+
+	error = init_threads(sdp);
+	if (error)
+		return error;
+
+	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+				   &freeze_gh);
+	if (error)
+		goto fail_threads;
+
+	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
+
+	error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+	if (error)
+		goto fail;
+
+	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+		gfs2_consist(sdp);
+		error = -EIO;
+		goto fail;
+	}
+
+	/*  Initialize some head of the log stuff  */
+	sdp->sd_log_sequence = head.lh_sequence + 1;
+	gfs2_log_pointers_init(sdp, head.lh_blkno);
+
+	error = gfs2_quota_init(sdp);
+	if (error)
+		goto fail;
+
+	set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+	gfs2_glock_dq_uninit(&freeze_gh);
+
+	return 0;
+
+fail:
+	freeze_gh.gh_flags |= GL_NOCACHE;
+	gfs2_glock_dq_uninit(&freeze_gh);
+fail_threads:
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
+	return error;
+}
+
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
+{
+	const struct gfs2_statfs_change *str = buf;
+
+	sc->sc_total = be64_to_cpu(str->sc_total);
+	sc->sc_free = be64_to_cpu(str->sc_free);
+	sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
+}
+
+static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf)
+{
+	struct gfs2_statfs_change *str = buf;
+
+	str->sc_total = cpu_to_be64(sc->sc_total);
+	str->sc_free = cpu_to_be64(sc->sc_free);
+	str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
+}
+
+int gfs2_statfs_init(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	struct buffer_head *m_bh, *l_bh;
+	struct gfs2_holder gh;
+	int error;
+
+	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
+				   &gh);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+	if (error)
+		goto out;
+
+	if (sdp->sd_args.ar_spectator) {
+		spin_lock(&sdp->sd_statfs_spin);
+		gfs2_statfs_change_in(m_sc, m_bh->b_data +
+				      sizeof(struct gfs2_dinode));
+		spin_unlock(&sdp->sd_statfs_spin);
+	} else {
+		error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+		if (error)
+			goto out_m_bh;
+
+		spin_lock(&sdp->sd_statfs_spin);
+		gfs2_statfs_change_in(m_sc, m_bh->b_data +
+				      sizeof(struct gfs2_dinode));
+		gfs2_statfs_change_in(l_sc, l_bh->b_data +
+				      sizeof(struct gfs2_dinode));
+		spin_unlock(&sdp->sd_statfs_spin);
+
+		brelse(l_bh);
+	}
+
+out_m_bh:
+	brelse(m_bh);
+out:
+	gfs2_glock_dq_uninit(&gh);
+	return 0;
+}
+
+void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+			s64 dinodes)
+{
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct buffer_head *l_bh;
+	s64 x, y;
+	int need_sync = 0;
+	int error;
+
+	error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+	if (error)
+		return;
+
+	gfs2_trans_add_meta(l_ip->i_gl, l_bh);
+
+	spin_lock(&sdp->sd_statfs_spin);
+	l_sc->sc_total += total;
+	l_sc->sc_free += free;
+	l_sc->sc_dinodes += dinodes;
+	gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+	if (sdp->sd_args.ar_statfs_percent) {
+		x = 100 * l_sc->sc_free;
+		y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
+		if (x >= y || x <= -y)
+			need_sync = 1;
+	}
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	brelse(l_bh);
+	if (need_sync)
+		gfs2_wake_up_statfs(sdp);
+}
+
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+		   struct buffer_head *l_bh)
+{
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+	gfs2_trans_add_meta(l_ip->i_gl, l_bh);
+
+	spin_lock(&sdp->sd_statfs_spin);
+	m_sc->sc_total += l_sc->sc_total;
+	m_sc->sc_free += l_sc->sc_free;
+	m_sc->sc_dinodes += l_sc->sc_dinodes;
+	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
+	memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+	       0, sizeof(struct gfs2_statfs_change));
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	gfs2_trans_add_meta(m_ip->i_gl, m_bh);
+	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+}
+
+int gfs2_statfs_sync(struct super_block *sb, int type)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	struct gfs2_holder gh;
+	struct buffer_head *m_bh, *l_bh;
+	int error;
+
+	sb_start_write(sb);
+	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
+				   &gh);
+	if (error)
+		goto out;
+
+	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+	if (error)
+		goto out_unlock;
+
+	spin_lock(&sdp->sd_statfs_spin);
+	gfs2_statfs_change_in(m_sc, m_bh->b_data +
+			      sizeof(struct gfs2_dinode));
+	if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
+		spin_unlock(&sdp->sd_statfs_spin);
+		goto out_bh;
+	}
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+	if (error)
+		goto out_bh;
+
+	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
+	if (error)
+		goto out_bh2;
+
+	update_statfs(sdp, m_bh, l_bh);
+	sdp->sd_statfs_force_sync = 0;
+
+	gfs2_trans_end(sdp);
+
+out_bh2:
+	brelse(l_bh);
+out_bh:
+	brelse(m_bh);
+out_unlock:
+	gfs2_glock_dq_uninit(&gh);
+out:
+	sb_end_write(sb);
+	return error;
+}
+
+struct lfcc {
+	struct list_head list;
+	struct gfs2_holder gh;
+};
+
+/**
+ * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
+ *                            journals are clean
+ * @sdp: the file system
+ * @state: the state to put the transaction lock into
+ * @t_gh: the hold on the transaction lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
+				    struct gfs2_holder *freeze_gh)
+{
+	struct gfs2_inode *ip;
+	struct gfs2_jdesc *jd;
+	struct lfcc *lfcc;
+	LIST_HEAD(list);
+	struct gfs2_log_header_host lh;
+	int error;
+
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
+		if (!lfcc) {
+			error = -ENOMEM;
+			goto out;
+		}
+		ip = GFS2_I(jd->jd_inode);
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
+		if (error) {
+			kfree(lfcc);
+			goto out;
+		}
+		list_add(&lfcc->list, &list);
+	}
+
+	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
+				   GL_NOCACHE, freeze_gh);
+
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		error = gfs2_jdesc_check(jd);
+		if (error)
+			break;
+		error = gfs2_find_jhead(jd, &lh);
+		if (error)
+			break;
+		if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+			error = -EBUSY;
+			break;
+		}
+	}
+
+	if (error)
+		gfs2_glock_dq_uninit(freeze_gh);
+
+out:
+	while (!list_empty(&list)) {
+		lfcc = list_entry(list.next, struct lfcc, list);
+		list_del(&lfcc->list);
+		gfs2_glock_dq_uninit(&lfcc->gh);
+		kfree(lfcc);
+	}
+	return error;
+}
+
+void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
+{
+	struct gfs2_dinode *str = buf;
+
+	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
+	str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
+	str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
+	str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
+	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
+	str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
+	str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
+	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
+	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
+	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
+	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
+	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
+	str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
+
+	str->di_goal_meta = cpu_to_be64(ip->i_goal);
+	str->di_goal_data = cpu_to_be64(ip->i_goal);
+	str->di_generation = cpu_to_be64(ip->i_generation);
+
+	str->di_flags = cpu_to_be32(ip->i_diskflags);
+	str->di_height = cpu_to_be16(ip->i_height);
+	str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
+					     !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
+					     GFS2_FORMAT_DE : 0);
+	str->di_depth = cpu_to_be16(ip->i_depth);
+	str->di_entries = cpu_to_be32(ip->i_entries);
+
+	str->di_eattr = cpu_to_be64(ip->i_eattr);
+	str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
+	str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
+	str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
+}
+
+/**
+ * gfs2_write_inode - Make sure the inode is stable on the disk
+ * @inode: The inode
+ * @wbc: The writeback control structure
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
+	struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
+	int ret = 0;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
+	if (bdi->dirty_exceeded)
+		gfs2_ail1_flush(sdp, wbc);
+	else
+		filemap_fdatawrite(metamapping);
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ret = filemap_fdatawait(metamapping);
+	if (ret)
+		mark_inode_dirty_sync(inode);
+	return ret;
+}
+
+/**
+ * gfs2_dirty_inode - check for atime updates
+ * @inode: The inode in question
+ * @flags: The type of dirty
+ *
+ * Unfortunately it can be called under any combination of inode
+ * glock and transaction lock, so we have to check carefully.
+ *
+ * At the moment this deals only with atime - it should be possible
+ * to expand that role in future, once a review of the locking has
+ * been carried out.
+ */
+
+static void gfs2_dirty_inode(struct inode *inode, int flags)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *bh;
+	struct gfs2_holder gh;
+	int need_unlock = 0;
+	int need_endtrans = 0;
+	int ret;
+
+	if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
+		return;
+
+	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+		if (ret) {
+			fs_err(sdp, "dirty_inode: glock %d\n", ret);
+			return;
+		}
+		need_unlock = 1;
+	} else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
+		return;
+
+	if (current->journal_info == NULL) {
+		ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+		if (ret) {
+			fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret);
+			goto out;
+		}
+		need_endtrans = 1;
+	}
+
+	ret = gfs2_meta_inode_buffer(ip, &bh);
+	if (ret == 0) {
+		gfs2_trans_add_meta(ip->i_gl, bh);
+		gfs2_dinode_out(ip, bh->b_data);
+		brelse(bh);
+	}
+
+	if (need_endtrans)
+		gfs2_trans_end(sdp);
+out:
+	if (need_unlock)
+		gfs2_glock_dq_uninit(&gh);
+}
+
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+	struct gfs2_holder freeze_gh;
+	int error;
+
+	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE,
+				   &freeze_gh);
+	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return error;
+
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
+
+	flush_workqueue(gfs2_delete_workqueue);
+	gfs2_quota_sync(sdp->sd_vfs, 0);
+	gfs2_statfs_sync(sdp->sd_vfs, 0);
+
+	down_write(&sdp->sd_log_flush_lock);
+	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+	up_write(&sdp->sd_log_flush_lock);
+
+	gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
+	wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
+	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
+
+	if (freeze_gh.gh_gl)
+		gfs2_glock_dq_uninit(&freeze_gh);
+
+	gfs2_quota_cleanup(sdp);
+
+	return error;
+}
+
+/**
+ * gfs2_put_super - Unmount the filesystem
+ * @sb: The VFS superblock
+ *
+ */
+
+static void gfs2_put_super(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+	struct gfs2_jdesc *jd;
+
+	/* No more recovery requests */
+	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+	smp_mb();
+
+	/* Wait on outstanding recovery */
+restart:
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
+			continue;
+		spin_unlock(&sdp->sd_jindex_spin);
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
+			    TASK_UNINTERRUPTIBLE);
+		goto restart;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		error = gfs2_make_fs_ro(sdp);
+		if (error)
+			gfs2_io_error(sdp);
+	}
+	/*  At this point, we're through modifying the disk  */
+
+	/*  Release stuff  */
+
+	iput(sdp->sd_jindex);
+	iput(sdp->sd_statfs_inode);
+	iput(sdp->sd_rindex);
+	iput(sdp->sd_quota_inode);
+
+	gfs2_glock_put(sdp->sd_rename_gl);
+	gfs2_glock_put(sdp->sd_freeze_gl);
+
+	if (!sdp->sd_args.ar_spectator) {
+		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+		iput(sdp->sd_sc_inode);
+		iput(sdp->sd_qc_inode);
+	}
+
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+	gfs2_clear_rgrpd(sdp);
+	gfs2_jindex_free(sdp);
+	/*  Take apart glock structures and buffer lists  */
+	gfs2_gl_hash_clear(sdp);
+	/*  Unmount the locking protocol  */
+	gfs2_lm_unmount(sdp);
+
+	/*  At this point, we're through participating in the lockspace  */
+	gfs2_sys_fs_del(sdp);
+}
+
+/**
+ * gfs2_sync_fs - sync the filesystem
+ * @sb: the superblock
+ *
+ * Flushes the log to disk.
+ */
+
+static int gfs2_sync_fs(struct super_block *sb, int wait)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	gfs2_quota_sync(sb, -1);
+	if (wait && sdp)
+		gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+	return 0;
+}
+
+void gfs2_freeze_func(struct work_struct *work)
+{
+	int error;
+	struct gfs2_holder freeze_gh;
+	struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work);
+	struct super_block *sb = sdp->sd_vfs;
+
+	atomic_inc(&sb->s_active);
+	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+				   &freeze_gh);
+	if (error) {
+		printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error);
+		gfs2_assert_withdraw(sdp, 0);
+	}
+	else {
+		atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
+		error = thaw_super(sb);
+		if (error) {
+			printk(KERN_INFO "GFS2: couldn't thaw filesystem: %d\n",
+			       error);
+			gfs2_assert_withdraw(sdp, 0);
+		}
+		if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+			freeze_gh.gh_flags |= GL_NOCACHE;
+		gfs2_glock_dq_uninit(&freeze_gh);
+	}
+	deactivate_super(sb);
+	return;
+}
+
+/**
+ * gfs2_freeze - prevent further writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static int gfs2_freeze(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error = 0;
+
+	mutex_lock(&sdp->sd_freeze_mutex);
+	if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
+		goto out;
+
+	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	for (;;) {
+		error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
+		if (!error)
+			break;
+
+		switch (error) {
+		case -EBUSY:
+			fs_err(sdp, "waiting for recovery before freeze\n");
+			break;
+
+		default:
+			fs_err(sdp, "error freezing FS: %d\n", error);
+			break;
+		}
+
+		fs_err(sdp, "retrying...\n");
+		msleep(1000);
+	}
+	error = 0;
+out:
+	mutex_unlock(&sdp->sd_freeze_mutex);
+	return error;
+}
+
+/**
+ * gfs2_unfreeze - reallow writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static int gfs2_unfreeze(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	mutex_lock(&sdp->sd_freeze_mutex);
+        if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
+	    sdp->sd_freeze_gh.gh_gl == NULL) {
+		mutex_unlock(&sdp->sd_freeze_mutex);
+                return 0;
+	}
+
+	gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+	mutex_unlock(&sdp->sd_freeze_mutex);
+	return 0;
+}
+
+/**
+ * statfs_fill - fill in the sg for a given RG
+ * @rgd: the RG
+ * @sc: the sc structure
+ *
+ * Returns: 0 on success, -ESTALE if the LVB is invalid
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+			    struct gfs2_statfs_change_host *sc)
+{
+	gfs2_rgrp_verify(rgd);
+	sc->sc_total += rgd->rd_data;
+	sc->sc_free += rgd->rd_free;
+	sc->sc_dinodes += rgd->rd_dinodes;
+	return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_rgrpd *rgd_next;
+	struct gfs2_holder *gha, *gh;
+	unsigned int slots = 64;
+	unsigned int x;
+	int done;
+	int error = 0, err;
+
+	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
+	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+	if (!gha)
+		return -ENOMEM;
+
+	rgd_next = gfs2_rgrpd_get_first(sdp);
+
+	for (;;) {
+		done = 1;
+
+		for (x = 0; x < slots; x++) {
+			gh = gha + x;
+
+			if (gh->gh_gl && gfs2_glock_poll(gh)) {
+				err = gfs2_glock_wait(gh);
+				if (err) {
+					gfs2_holder_uninit(gh);
+					error = err;
+				} else {
+					if (!error)
+						error = statfs_slow_fill(
+							gh->gh_gl->gl_object, sc);
+					gfs2_glock_dq_uninit(gh);
+				}
+			}
+
+			if (gh->gh_gl)
+				done = 0;
+			else if (rgd_next && !error) {
+				error = gfs2_glock_nq_init(rgd_next->rd_gl,
+							   LM_ST_SHARED,
+							   GL_ASYNC,
+							   gh);
+				rgd_next = gfs2_rgrpd_get_next(rgd_next);
+				done = 0;
+			}
+
+			if (signal_pending(current))
+				error = -ERESTARTSYS;
+		}
+
+		if (done)
+			break;
+
+		yield();
+	}
+
+	kfree(gha);
+	return error;
+}
+
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sg: the sg structure
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+	spin_lock(&sdp->sd_statfs_spin);
+
+	*sc = *m_sc;
+	sc->sc_total += l_sc->sc_total;
+	sc->sc_free += l_sc->sc_free;
+	sc->sc_dinodes += l_sc->sc_dinodes;
+
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	if (sc->sc_free < 0)
+		sc->sc_free = 0;
+	if (sc->sc_free > sc->sc_total)
+		sc->sc_free = sc->sc_total;
+	if (sc->sc_dinodes < 0)
+		sc->sc_dinodes = 0;
+
+	return 0;
+}
+
+/**
+ * gfs2_statfs - Gather and return stats about the filesystem
+ * @sb: The superblock
+ * @statfsbuf: The buffer
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = d_inode(dentry)->i_sb;
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_statfs_change_host sc;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	if (gfs2_tune_get(sdp, gt_statfs_slow))
+		error = gfs2_statfs_slow(sdp, &sc);
+	else
+		error = gfs2_statfs_i(sdp, &sc);
+
+	if (error)
+		return error;
+
+	buf->f_type = GFS2_MAGIC;
+	buf->f_bsize = sdp->sd_sb.sb_bsize;
+	buf->f_blocks = sc.sc_total;
+	buf->f_bfree = sc.sc_free;
+	buf->f_bavail = sc.sc_free;
+	buf->f_files = sc.sc_dinodes + sc.sc_free;
+	buf->f_ffree = sc.sc_free;
+	buf->f_namelen = GFS2_FNAMESIZE;
+
+	return 0;
+}
+
+/**
+ * gfs2_remount_fs - called when the FS is remounted
+ * @sb:  the filesystem
+ * @flags:  the remount flags
+ * @data:  extra data passed in (not used right now)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	int error;
+
+	sync_filesystem(sb);
+
+	spin_lock(&gt->gt_spin);
+	args.ar_commit = gt->gt_logd_secs;
+	args.ar_quota_quantum = gt->gt_quota_quantum;
+	if (gt->gt_statfs_slow)
+		args.ar_statfs_quantum = 0;
+	else
+		args.ar_statfs_quantum = gt->gt_statfs_quantum;
+	spin_unlock(&gt->gt_spin);
+	error = gfs2_mount_args(&args, data);
+	if (error)
+		return error;
+
+	/* Not allowed to change locking details */
+	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
+	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
+	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
+		return -EINVAL;
+
+	/* Some flags must not be changed */
+	if (args_neq(&args, &sdp->sd_args, spectator) ||
+	    args_neq(&args, &sdp->sd_args, localflocks) ||
+	    args_neq(&args, &sdp->sd_args, meta))
+		return -EINVAL;
+
+	if (sdp->sd_args.ar_spectator)
+		*flags |= MS_RDONLY;
+
+	if ((sb->s_flags ^ *flags) & MS_RDONLY) {
+		if (*flags & MS_RDONLY)
+			error = gfs2_make_fs_ro(sdp);
+		else
+			error = gfs2_make_fs_rw(sdp);
+		if (error)
+			return error;
+	}
+
+	sdp->sd_args = args;
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= MS_POSIXACL;
+	else
+		sb->s_flags &= ~MS_POSIXACL;
+	if (sdp->sd_args.ar_nobarrier)
+		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+	else
+		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+	spin_lock(&gt->gt_spin);
+	gt->gt_logd_secs = args.ar_commit;
+	gt->gt_quota_quantum = args.ar_quota_quantum;
+	if (args.ar_statfs_quantum) {
+		gt->gt_statfs_slow = 0;
+		gt->gt_statfs_quantum = args.ar_statfs_quantum;
+	}
+	else {
+		gt->gt_statfs_slow = 1;
+		gt->gt_statfs_quantum = 30;
+	}
+	spin_unlock(&gt->gt_spin);
+
+	gfs2_online_uevent(sdp);
+	return 0;
+}
+
+/**
+ * gfs2_drop_inode - Drop an inode (test for remote unlink)
+ * @inode: The inode to drop
+ *
+ * If we've received a callback on an iopen lock then its because a
+ * remote node tried to deallocate the inode but failed due to this node
+ * still having the inode open. Here we mark the link count zero
+ * since we know that it must have reached zero if the GLF_DEMOTE flag
+ * is set on the iopen glock. If we didn't do a disk read since the
+ * remote node removed the final link then we might otherwise miss
+ * this event. This check ensures that this node will deallocate the
+ * inode's blocks, or alternatively pass the baton on to another
+ * node for later deallocation.
+ */
+
+static int gfs2_drop_inode(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) {
+		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
+			clear_nlink(inode);
+	}
+	return generic_drop_inode(inode);
+}
+
+static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
+{
+	do {
+		if (d1 == d2)
+			return 1;
+		d1 = d1->d_parent;
+	} while (!IS_ROOT(d1));
+	return 0;
+}
+
+/**
+ * gfs2_show_options - Show mount options for /proc/mounts
+ * @s: seq_file structure
+ * @root: root of this (sub)tree
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_show_options(struct seq_file *s, struct dentry *root)
+{
+	struct gfs2_sbd *sdp = root->d_sb->s_fs_info;
+	struct gfs2_args *args = &sdp->sd_args;
+	int val;
+
+	if (is_ancestor(root, sdp->sd_master_dir))
+		seq_puts(s, ",meta");
+	if (args->ar_lockproto[0])
+		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+	if (args->ar_locktable[0])
+		seq_printf(s, ",locktable=%s", args->ar_locktable);
+	if (args->ar_hostdata[0])
+		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+	if (args->ar_spectator)
+		seq_puts(s, ",spectator");
+	if (args->ar_localflocks)
+		seq_puts(s, ",localflocks");
+	if (args->ar_debug)
+		seq_puts(s, ",debug");
+	if (args->ar_posix_acl)
+		seq_puts(s, ",acl");
+	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
+		char *state;
+		switch (args->ar_quota) {
+		case GFS2_QUOTA_OFF:
+			state = "off";
+			break;
+		case GFS2_QUOTA_ACCOUNT:
+			state = "account";
+			break;
+		case GFS2_QUOTA_ON:
+			state = "on";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",quota=%s", state);
+	}
+	if (args->ar_suiddir)
+		seq_puts(s, ",suiddir");
+	if (args->ar_data != GFS2_DATA_DEFAULT) {
+		char *state;
+		switch (args->ar_data) {
+		case GFS2_DATA_WRITEBACK:
+			state = "writeback";
+			break;
+		case GFS2_DATA_ORDERED:
+			state = "ordered";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",data=%s", state);
+	}
+	if (args->ar_discard)
+		seq_puts(s, ",discard");
+	val = sdp->sd_tune.gt_logd_secs;
+	if (val != 30)
+		seq_printf(s, ",commit=%d", val);
+	val = sdp->sd_tune.gt_statfs_quantum;
+	if (val != 30)
+		seq_printf(s, ",statfs_quantum=%d", val);
+	else if (sdp->sd_tune.gt_statfs_slow)
+		seq_puts(s, ",statfs_quantum=0");
+	val = sdp->sd_tune.gt_quota_quantum;
+	if (val != 60)
+		seq_printf(s, ",quota_quantum=%d", val);
+	if (args->ar_statfs_percent)
+		seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
+	if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
+		const char *state;
+
+		switch (args->ar_errors) {
+		case GFS2_ERRORS_WITHDRAW:
+			state = "withdraw";
+			break;
+		case GFS2_ERRORS_PANIC:
+			state = "panic";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",errors=%s", state);
+	}
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+		seq_puts(s, ",nobarrier");
+	if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
+		seq_puts(s, ",demote_interface_used");
+	if (args->ar_rgrplvb)
+		seq_puts(s, ",rgrplvb");
+	return 0;
+}
+
+static void gfs2_final_release_pages(struct gfs2_inode *ip)
+{
+	struct inode *inode = &ip->i_inode;
+	struct gfs2_glock *gl = ip->i_gl;
+
+	truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0);
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (atomic_read(&gl->gl_revokes) == 0) {
+		clear_bit(GLF_LFLUSH, &gl->gl_flags);
+		clear_bit(GLF_DIRTY, &gl->gl_flags);
+	}
+}
+
+static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder gh;
+	int error;
+
+	if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (error)
+		return error;
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		error = -EIO;
+		goto out_qs;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (error)
+		goto out_qs;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
+				 sdp->sd_jdesc->jd_blocks);
+	if (error)
+		goto out_rg_gunlock;
+
+	gfs2_free_di(rgd, ip);
+
+	gfs2_final_release_pages(ip);
+
+	gfs2_trans_end(sdp);
+
+out_rg_gunlock:
+	gfs2_glock_dq_uninit(&gh);
+out_qs:
+	gfs2_quota_unhold(ip);
+	return error;
+}
+
+/**
+ * gfs2_evict_inode - Remove an inode from cache
+ * @inode: The inode to evict
+ *
+ * There are three cases to consider:
+ * 1. i_nlink == 0, we are final opener (and must deallocate)
+ * 2. i_nlink == 0, we are not the final opener (and cannot deallocate)
+ * 3. i_nlink > 0
+ *
+ * If the fs is read only, then we have to treat all cases as per #3
+ * since we are unable to do any deallocation. The inode will be
+ * deallocated by the next read/write node to attempt an allocation
+ * in the same resource group
+ *
+ * We have to (at the moment) hold the inodes main lock to cover
+ * the gap between unlocking the shared lock on the iopen lock and
+ * taking the exclusive lock. I'd rather do a shared -> exclusive
+ * conversion on the iopen lock, but we can change that later. This
+ * is safe, just less efficient.
+ */
+
+static void gfs2_evict_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+
+	if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
+		clear_inode(inode);
+		return;
+	}
+
+	if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
+		goto out;
+
+	/* Must not read inode block until block type has been verified */
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
+	if (unlikely(error)) {
+		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		goto out;
+	}
+
+	if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
+		error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+		if (error)
+			goto out_truncate;
+	}
+
+	if (test_bit(GIF_INVALID, &ip->i_flags)) {
+		error = gfs2_inode_refresh(ip);
+		if (error)
+			goto out_truncate;
+	}
+
+	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+	gfs2_glock_dq_wait(&ip->i_iopen_gh);
+	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
+	error = gfs2_glock_nq(&ip->i_iopen_gh);
+	if (error)
+		goto out_truncate;
+
+	/* Case 1 starts here */
+
+	if (S_ISDIR(inode->i_mode) &&
+	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
+		error = gfs2_dir_exhash_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (ip->i_eattr) {
+		error = gfs2_ea_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (!gfs2_is_stuffed(ip)) {
+		error = gfs2_file_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	error = gfs2_dinode_dealloc(ip);
+	goto out_unlock;
+
+out_truncate:
+	gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+	if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
+		struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
+		filemap_fdatawrite(metamapping);
+		filemap_fdatawait(metamapping);
+	}
+	write_inode_now(inode, 1);
+	gfs2_ail_flush(ip->i_gl, 0);
+
+	/* Case 2 starts here */
+	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
+	if (error)
+		goto out_unlock;
+	/* Needs to be done before glock release & also in a transaction */
+	truncate_inode_pages(&inode->i_data, 0);
+	gfs2_trans_end(sdp);
+
+out_unlock:
+	/* Error path for case 1 */
+	if (gfs2_rs_active(ip->i_res))
+		gfs2_rs_deltree(ip->i_res);
+
+	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+		gfs2_glock_dq(&ip->i_iopen_gh);
+	}
+	gfs2_holder_uninit(&ip->i_iopen_gh);
+	gfs2_glock_dq_uninit(&gh);
+	if (error && error != GLR_TRYFAILED && error != -EROFS)
+		fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
+out:
+	/* Case 3 starts here */
+	truncate_inode_pages_final(&inode->i_data);
+	gfs2_rs_delete(ip, NULL);
+	gfs2_ordered_del_inode(ip);
+	clear_inode(inode);
+	gfs2_dir_hash_inval(ip);
+	ip->i_gl->gl_object = NULL;
+	flush_delayed_work(&ip->i_gl->gl_work);
+	gfs2_glock_add_to_lru(ip->i_gl);
+	gfs2_glock_put(ip->i_gl);
+	ip->i_gl = NULL;
+	if (ip->i_iopen_gh.gh_gl) {
+		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+	}
+}
+
+static struct inode *gfs2_alloc_inode(struct super_block *sb)
+{
+	struct gfs2_inode *ip;
+
+	ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+	if (ip) {
+		ip->i_flags = 0;
+		ip->i_gl = NULL;
+		ip->i_rgd = NULL;
+		ip->i_res = NULL;
+	}
+	return &ip->i_inode;
+}
+
+static void gfs2_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(gfs2_inode_cachep, inode);
+}
+
+static void gfs2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, gfs2_i_callback);
+}
+
+const struct super_operations gfs2_super_ops = {
+	.alloc_inode		= gfs2_alloc_inode,
+	.destroy_inode		= gfs2_destroy_inode,
+	.write_inode		= gfs2_write_inode,
+	.dirty_inode		= gfs2_dirty_inode,
+	.evict_inode		= gfs2_evict_inode,
+	.put_super		= gfs2_put_super,
+	.sync_fs		= gfs2_sync_fs,
+	.freeze_super		= gfs2_freeze,
+	.thaw_super		= gfs2_unfreeze,
+	.statfs			= gfs2_statfs,
+	.remount_fs		= gfs2_remount_fs,
+	.drop_inode		= gfs2_drop_inode,
+	.show_options		= gfs2_show_options,
+};
+
diff --git a/kernel/fs/gfs2/super.h b/kernel/fs/gfs2/super.h
new file mode 100644
index 000000000..73c97dcca
--- /dev/null
+++ b/kernel/fs/gfs2/super.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __SUPER_DOT_H__
+#define __SUPER_DOT_H__
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include "incore.h"
+
+extern void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+
+static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
+{
+	unsigned int x;
+	spin_lock(&sdp->sd_jindex_spin);
+	x = sdp->sd_journals;
+	spin_unlock(&sdp->sd_jindex_spin);
+	return x;
+}
+
+extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
+
+extern int gfs2_mount_args(struct gfs2_args *args, char *data);
+
+extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+
+extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+				     struct gfs2_inode **ipp);
+
+extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
+extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
+extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+			       s64 dinodes);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+				  const void *buf);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+			  struct buffer_head *l_bh);
+extern int gfs2_statfs_sync(struct super_block *sb, int type);
+extern void gfs2_freeze_func(struct work_struct *work);
+
+extern struct file_system_type gfs2_fs_type;
+extern struct file_system_type gfs2meta_fs_type;
+extern const struct export_operations gfs2_export_ops;
+extern const struct super_operations gfs2_super_ops;
+extern const struct dentry_operations gfs2_dops;
+extern const struct xattr_handler *gfs2_xattr_handlers[];
+
+#endif /* __SUPER_DOT_H__ */
+
diff --git a/kernel/fs/gfs2/sys.c b/kernel/fs/gfs2/sys.c
new file mode 100644
index 000000000..ae8e8811f
--- /dev/null
+++ b/kernel/fs/gfs2/sys.c
@@ -0,0 +1,711 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <asm/uaccess.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/genhd.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "sys.h"
+#include "super.h"
+#include "glock.h"
+#include "quota.h"
+#include "util.h"
+#include "glops.h"
+#include "recovery.h"
+
+struct gfs2_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->show ? a->show(sdp, buf) : 0;
+}
+
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->store ? a->store(sdp, buf, len) : len;
+}
+
+static const struct sysfs_ops gfs2_attr_ops = {
+	.show  = gfs2_attr_show,
+	.store = gfs2_attr_store,
+};
+
+
+static struct kset *gfs2_kset;
+
+static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
+			MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev));
+}
+
+static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
+}
+
+static int gfs2_uuid_valid(const u8 *uuid)
+{
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		if (uuid[i])
+			return 1;
+	}
+	return 0;
+}
+
+static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct super_block *s = sdp->sd_vfs;
+	const u8 *uuid = s->s_uuid;
+	buf[0] = '\0';
+	if (!gfs2_uuid_valid(uuid))
+		return 0;
+	return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid);
+}
+
+static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
+}
+
+static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	int error;
+	int n = simple_strtol(buf, NULL, 0);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (n) {
+	case 0:
+		error = thaw_super(sdp->sd_vfs);
+		break;
+	case 1:
+		error = freeze_super(sdp->sd_vfs);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (error) {
+		fs_warn(sdp, "freeze %d error %d", n, error);
+		return error;
+	}
+
+	return len;
+}
+
+static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
+{
+	unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
+	return snprintf(buf, PAGE_SIZE, "%u\n", b);
+}
+
+static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (simple_strtol(buf, NULL, 0) != 1)
+		return -EINVAL;
+
+	gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
+
+	return len;
+}
+
+static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
+				 size_t len)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (simple_strtol(buf, NULL, 0) != 1)
+		return -EINVAL;
+
+	gfs2_statfs_sync(sdp->sd_vfs, 0);
+	return len;
+}
+
+static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
+				size_t len)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (simple_strtol(buf, NULL, 0) != 1)
+		return -EINVAL;
+
+	gfs2_quota_sync(sdp->sd_vfs, 0);
+	return len;
+}
+
+static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
+					size_t len)
+{
+	struct kqid qid;
+	int error;
+	u32 id;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	id = simple_strtoul(buf, NULL, 0);
+
+	qid = make_kqid(current_user_ns(), USRQUOTA, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+
+	error = gfs2_quota_refresh(sdp, qid);
+	return error ? error : len;
+}
+
+static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
+					 size_t len)
+{
+	struct kqid qid;
+	int error;
+	u32 id;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	id = simple_strtoul(buf, NULL, 0);
+
+	qid = make_kqid(current_user_ns(), GRPQUOTA, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+
+	error = gfs2_quota_refresh(sdp, qid);
+	return error ? error : len;
+}
+
+static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	struct gfs2_glock *gl;
+	const struct gfs2_glock_operations *glops;
+	unsigned int glmode;
+	unsigned int gltype;
+	unsigned long long glnum;
+	char mode[16];
+	int rv;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum,
+		    mode);
+	if (rv != 3)
+		return -EINVAL;
+
+	if (strcmp(mode, "EX") == 0)
+		glmode = LM_ST_UNLOCKED;
+	else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0))
+		glmode = LM_ST_DEFERRED;
+	else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0))
+		glmode = LM_ST_SHARED;
+	else
+		return -EINVAL;
+
+	if (gltype > LM_TYPE_JOURNAL)
+		return -EINVAL;
+	if (gltype == LM_TYPE_NONDISK && glnum == GFS2_FREEZE_LOCK)
+		glops = &gfs2_freeze_glops;
+	else
+		glops = gfs2_glops_list[gltype];
+	if (glops == NULL)
+		return -EINVAL;
+	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
+		fs_info(sdp, "demote interface used\n");
+	rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
+	if (rv)
+		return rv;
+	gfs2_glock_cb(gl, glmode);
+	gfs2_glock_put(gl);
+	return len;
+}
+
+
+#define GFS2_ATTR(name, mode, show, store) \
+static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
+
+GFS2_ATTR(id,                  0444, id_show,       NULL);
+GFS2_ATTR(fsname,              0444, fsname_show,   NULL);
+GFS2_ATTR(uuid,                0444, uuid_show,     NULL);
+GFS2_ATTR(freeze,              0644, freeze_show,   freeze_store);
+GFS2_ATTR(withdraw,            0644, withdraw_show, withdraw_store);
+GFS2_ATTR(statfs_sync,         0200, NULL,          statfs_sync_store);
+GFS2_ATTR(quota_sync,          0200, NULL,          quota_sync_store);
+GFS2_ATTR(quota_refresh_user,  0200, NULL,          quota_refresh_user_store);
+GFS2_ATTR(quota_refresh_group, 0200, NULL,          quota_refresh_group_store);
+GFS2_ATTR(demote_rq,           0200, NULL,	    demote_rq_store);
+
+static struct attribute *gfs2_attrs[] = {
+	&gfs2_attr_id.attr,
+	&gfs2_attr_fsname.attr,
+	&gfs2_attr_uuid.attr,
+	&gfs2_attr_freeze.attr,
+	&gfs2_attr_withdraw.attr,
+	&gfs2_attr_statfs_sync.attr,
+	&gfs2_attr_quota_sync.attr,
+	&gfs2_attr_quota_refresh_user.attr,
+	&gfs2_attr_quota_refresh_group.attr,
+	&gfs2_attr_demote_rq.attr,
+	NULL,
+};
+
+static void gfs2_sbd_release(struct kobject *kobj)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+
+	kfree(sdp);
+}
+
+static struct kobj_type gfs2_ktype = {
+	.release = gfs2_sbd_release,
+	.default_attrs = gfs2_attrs,
+	.sysfs_ops     = &gfs2_attr_ops,
+};
+
+
+/*
+ * lock_module. Originally from lock_dlm
+ */
+
+static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf)
+{
+	const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops;
+	return sprintf(buf, "%s\n", ops->lm_proto_name);
+}
+
+static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ssize_t ret;
+	int val = 0;
+
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))
+		val = 1;
+	ret = sprintf(buf, "%d\n", val);
+	return ret;
+}
+
+static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ssize_t ret = len;
+	int val;
+
+	val = simple_strtol(buf, NULL, 0);
+
+	if (val == 1)
+		set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+	else if (val == 0) {
+		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+		smp_mb__after_atomic();
+		gfs2_glock_thaw(sdp);
+	} else {
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
+{
+	int val = completion_done(&sdp->sd_wdack) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", val);
+}
+
+static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	ssize_t ret = len;
+	int val;
+
+	val = simple_strtol(buf, NULL, 0);
+
+	if ((val == 1) &&
+	    !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+		complete(&sdp->sd_wdack);
+	else
+		ret = -EINVAL;
+	return ret;
+}
+
+static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_first);
+}
+
+static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned first;
+	int rv;
+
+	rv = sscanf(buf, "%u", &first);
+	if (rv != 1 || first > 1)
+		return -EINVAL;
+	rv = wait_for_completion_killable(&sdp->sd_locking_init);
+	if (rv)
+		return rv;
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EBUSY;
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+		goto out;
+	rv = -EINVAL;
+	if (sdp->sd_args.ar_spectator)
+		goto out;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto out;
+	sdp->sd_lockstruct.ls_first = first;
+	rv = 0;
+out:
+        spin_unlock(&sdp->sd_jindex_spin);
+        return rv ? rv : len;
+}
+
+static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags));
+}
+
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
+{
+	struct gfs2_jdesc *jd;
+	int rv;
+
+	/* Wait for our primary journal to be initialized */
+	wait_for_completion(&sdp->sd_journal_ready);
+
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EBUSY;
+	if (sdp->sd_jdesc->jd_jid == jid)
+		goto out;
+	rv = -ENOENT;
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (jd->jd_jid != jid)
+			continue;
+		rv = gfs2_recover_journal(jd, false);
+		break;
+	}
+out:
+	spin_unlock(&sdp->sd_jindex_spin);
+	return rv;
+}
+
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned jid;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
+
+	if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) {
+		rv = -ESHUTDOWN;
+		goto out;
+	}
+
+	rv = gfs2_recover_set(sdp, jid);
+out:
+	return rv ? rv : len;
+}
+
+static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_recover_jid_done);
+}
+
+static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
+}
+
+static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
+}
+
+static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+        int jid;
+	int rv;
+
+	rv = sscanf(buf, "%d", &jid);
+	if (rv != 1)
+		return -EINVAL;
+	rv = wait_for_completion_killable(&sdp->sd_locking_init);
+	if (rv)
+		return rv;
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EINVAL;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto out;
+	rv = -EBUSY;
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+		goto out;
+	rv = 0;
+	if (sdp->sd_args.ar_spectator && jid > 0)
+		rv = jid = -EINVAL;
+	sdp->sd_lockstruct.ls_jid = jid;
+	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+out:
+	spin_unlock(&sdp->sd_jindex_spin);
+	return rv ? rv : len;
+}
+
+#define GDLM_ATTR(_name,_mode,_show,_store) \
+static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+
+GDLM_ATTR(proto_name,		0444, proto_name_show,		NULL);
+GDLM_ATTR(block,		0644, block_show,		block_store);
+GDLM_ATTR(withdraw,		0644, wdack_show,		wdack_store);
+GDLM_ATTR(jid,			0644, jid_show,			jid_store);
+GDLM_ATTR(first,		0644, lkfirst_show,		lkfirst_store);
+GDLM_ATTR(first_done,		0444, first_done_show,		NULL);
+GDLM_ATTR(recover,		0600, NULL,			recover_store);
+GDLM_ATTR(recover_done,		0444, recover_done_show,	NULL);
+GDLM_ATTR(recover_status,	0444, recover_status_show,	NULL);
+
+static struct attribute *lock_module_attrs[] = {
+	&gdlm_attr_proto_name.attr,
+	&gdlm_attr_block.attr,
+	&gdlm_attr_withdraw.attr,
+	&gdlm_attr_jid.attr,
+	&gdlm_attr_first.attr,
+	&gdlm_attr_first_done.attr,
+	&gdlm_attr_recover.attr,
+	&gdlm_attr_recover_done.attr,
+	&gdlm_attr_recover_status.attr,
+	NULL,
+};
+
+/*
+ * get and set struct gfs2_tune fields
+ */
+
+static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u %u\n",
+			sdp->sd_tune.gt_quota_scale_num,
+			sdp->sd_tune.gt_quota_scale_den);
+}
+
+static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
+				 size_t len)
+{
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	unsigned int x, y;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
+		return -EINVAL;
+
+	spin_lock(&gt->gt_spin);
+	gt->gt_quota_scale_num = x;
+	gt->gt_quota_scale_den = y;
+	spin_unlock(&gt->gt_spin);
+	return len;
+}
+
+static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
+			int check_zero, const char *buf, size_t len)
+{
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	unsigned int x;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	x = simple_strtoul(buf, NULL, 0);
+
+	if (check_zero && !x)
+		return -EINVAL;
+
+	spin_lock(&gt->gt_spin);
+	*field = x;
+	spin_unlock(&gt->gt_spin);
+	return len;
+}
+
+#define TUNE_ATTR_3(name, show, store)                                        \
+static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+
+#define TUNE_ATTR_2(name, store)                                              \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                   \
+{                                                                             \
+	return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name);      \
+}                                                                             \
+TUNE_ATTR_3(name, name##_show, store)
+
+#define TUNE_ATTR(name, check_zero)                                           \
+static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
+{                                                                             \
+	return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len);  \
+}                                                                             \
+TUNE_ATTR_2(name, name##_store)
+
+TUNE_ATTR(quota_warn_period, 0);
+TUNE_ATTR(quota_quantum, 0);
+TUNE_ATTR(max_readahead, 0);
+TUNE_ATTR(complain_secs, 0);
+TUNE_ATTR(statfs_slow, 0);
+TUNE_ATTR(new_files_jdata, 0);
+TUNE_ATTR(statfs_quantum, 1);
+TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
+
+static struct attribute *tune_attrs[] = {
+	&tune_attr_quota_warn_period.attr,
+	&tune_attr_quota_quantum.attr,
+	&tune_attr_max_readahead.attr,
+	&tune_attr_complain_secs.attr,
+	&tune_attr_statfs_slow.attr,
+	&tune_attr_statfs_quantum.attr,
+	&tune_attr_quota_scale.attr,
+	&tune_attr_new_files_jdata.attr,
+	NULL,
+};
+
+static struct attribute_group tune_group = {
+	.name = "tune",
+	.attrs = tune_attrs,
+};
+
+static struct attribute_group lock_module_group = {
+	.name = "lock_module",
+	.attrs = lock_module_attrs,
+};
+
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	int error;
+	char ro[20];
+	char spectator[20];
+	char *envp[] = { ro, spectator, NULL };
+	int sysfs_frees_sdp = 0;
+
+	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+
+	sdp->sd_kobj.kset = gfs2_kset;
+	error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
+				     "%s", sdp->sd_table_name);
+	if (error)
+		goto fail_reg;
+
+	sysfs_frees_sdp = 1; /* Freeing sdp is now done by sysfs calling
+				function gfs2_sbd_release. */
+	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
+	if (error)
+		goto fail_reg;
+
+	error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
+	if (error)
+		goto fail_tune;
+
+	error = sysfs_create_link(&sdp->sd_kobj,
+				  &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
+				  "device");
+	if (error)
+		goto fail_lock_module;
+
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
+	return 0;
+
+fail_lock_module:
+	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
+fail_tune:
+	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
+fail_reg:
+	free_percpu(sdp->sd_lkstats);
+	fs_err(sdp, "error %d adding sysfs files", error);
+	if (sysfs_frees_sdp)
+		kobject_put(&sdp->sd_kobj);
+	else
+		kfree(sdp);
+	sb->s_fs_info = NULL;
+	return error;
+}
+
+void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
+{
+	sysfs_remove_link(&sdp->sd_kobj, "device");
+	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
+	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
+	kobject_put(&sdp->sd_kobj);
+}
+
+static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
+		       struct kobj_uevent_env *env)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct super_block *s = sdp->sd_vfs;
+	const u8 *uuid = s->s_uuid;
+
+	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
+	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
+		add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
+	if (gfs2_uuid_valid(uuid))
+		add_uevent_var(env, "UUID=%pUB", uuid);
+	return 0;
+}
+
+static const struct kset_uevent_ops gfs2_uevent_ops = {
+	.uevent = gfs2_uevent,
+};
+
+int gfs2_sys_init(void)
+{
+	gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
+	if (!gfs2_kset)
+		return -ENOMEM;
+	return 0;
+}
+
+void gfs2_sys_uninit(void)
+{
+	kset_unregister(gfs2_kset);
+}
+
diff --git a/kernel/fs/gfs2/sys.h b/kernel/fs/gfs2/sys.h
new file mode 100644
index 000000000..79182d6ad
--- /dev/null
+++ b/kernel/fs/gfs2/sys.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __SYS_DOT_H__
+#define __SYS_DOT_H__
+
+#include <linux/spinlock.h>
+struct gfs2_sbd;
+
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
+void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
+
+int gfs2_sys_init(void);
+void gfs2_sys_uninit(void);
+
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid);
+
+#endif /* __SYS_DOT_H__ */
+
diff --git a/kernel/fs/gfs2/trace_gfs2.h b/kernel/fs/gfs2/trace_gfs2.h
new file mode 100644
index 000000000..20c007d74
--- /dev/null
+++ b/kernel/fs/gfs2/trace_gfs2.h
@@ -0,0 +1,558 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gfs2
+
+#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_GFS2_H
+
+#include <linux/tracepoint.h>
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/dlmconstants.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/writeback.h>
+#include <linux/ktime.h>
+#include "incore.h"
+#include "glock.h"
+#include "rgrp.h"
+
+#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
+#define glock_trace_name(x) __print_symbolic(x,		\
+			    dlm_state_name(IV),		\
+			    dlm_state_name(NL),		\
+			    dlm_state_name(CR),		\
+			    dlm_state_name(CW),		\
+			    dlm_state_name(PR),		\
+			    dlm_state_name(PW),		\
+			    dlm_state_name(EX))
+
+#define block_state_name(x) __print_symbolic(x,			\
+			    { GFS2_BLKST_FREE, "free" },	\
+			    { GFS2_BLKST_USED, "used" },	\
+			    { GFS2_BLKST_DINODE, "dinode" },	\
+			    { GFS2_BLKST_UNLINKED, "unlinked" })
+
+#define TRACE_RS_DELETE  0
+#define TRACE_RS_TREEDEL 1
+#define TRACE_RS_INSERT  2
+#define TRACE_RS_CLAIM   3
+
+#define rs_func_name(x) __print_symbolic(x,	\
+					 { 0, "del " },	\
+					 { 1, "tdel" },	\
+					 { 2, "ins " },	\
+					 { 3, "clm " })
+
+#define show_glock_flags(flags) __print_flags(flags, "",	\
+	{(1UL << GLF_LOCK),			"l" },		\
+	{(1UL << GLF_DEMOTE),			"D" },		\
+	{(1UL << GLF_PENDING_DEMOTE),		"d" },		\
+	{(1UL << GLF_DEMOTE_IN_PROGRESS),	"p" },		\
+	{(1UL << GLF_DIRTY),			"y" },		\
+	{(1UL << GLF_LFLUSH),			"f" },		\
+	{(1UL << GLF_INVALIDATE_IN_PROGRESS),	"i" },		\
+	{(1UL << GLF_REPLY_PENDING),		"r" },		\
+	{(1UL << GLF_INITIAL),			"I" },		\
+	{(1UL << GLF_FROZEN),			"F" },		\
+	{(1UL << GLF_QUEUED),			"q" },		\
+	{(1UL << GLF_LRU),			"L" },		\
+	{(1UL << GLF_OBJECT),			"o" },		\
+	{(1UL << GLF_BLOCKING),			"b" })
+
+#ifndef NUMPTY
+#define NUMPTY
+static inline u8 glock_trace_state(unsigned int state)
+{
+	switch(state) {
+	case LM_ST_SHARED:
+		return DLM_LOCK_PR;
+	case LM_ST_DEFERRED:
+		return DLM_LOCK_CW;
+	case LM_ST_EXCLUSIVE:
+		return DLM_LOCK_EX;
+	}
+	return DLM_LOCK_NL;
+}
+#endif
+
+/* Section 1 - Locking
+ *
+ * Objectives:
+ * Latency: Remote demote request to state change
+ * Latency: Local lock request to state change
+ * Latency: State change to lock grant
+ * Correctness: Ordering of local lock state vs. I/O requests
+ * Correctness: Responses to remote demote requests
+ */
+
+/* General glock state change (DLM lock request completes) */
+TRACE_EVENT(gfs2_glock_state_change,
+
+	TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state),
+
+	TP_ARGS(gl, new_state),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	u64,	glnum			)
+		__field(	u32,	gltype			)
+		__field(	u8,	cur_state		)
+		__field(	u8,	new_state		)
+		__field(	u8,	dmt_state		)
+		__field(	u8,	tgt_state		)
+		__field(	unsigned long,	flags		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= gl->gl_sbd->sd_vfs->s_dev;
+		__entry->glnum		= gl->gl_name.ln_number;
+		__entry->gltype		= gl->gl_name.ln_type;
+		__entry->cur_state	= glock_trace_state(gl->gl_state);
+		__entry->new_state	= glock_trace_state(new_state);
+		__entry->tgt_state	= glock_trace_state(gl->gl_target);
+		__entry->dmt_state	= glock_trace_state(gl->gl_demote_state);
+		__entry->flags		= gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
+	),
+
+	TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+		 (unsigned long long)__entry->glnum,
+		  glock_trace_name(__entry->cur_state),
+		  glock_trace_name(__entry->new_state),
+		  glock_trace_name(__entry->tgt_state),
+		  glock_trace_name(__entry->dmt_state),
+		  show_glock_flags(__entry->flags))
+);
+
+/* State change -> unlocked, glock is being deallocated */
+TRACE_EVENT(gfs2_glock_put,
+
+	TP_PROTO(const struct gfs2_glock *gl),
+
+	TP_ARGS(gl),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	glnum			)
+		__field(	u32,	gltype			)
+		__field(	u8,	cur_state		)
+		__field(	unsigned long,	flags		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= gl->gl_sbd->sd_vfs->s_dev;
+		__entry->gltype		= gl->gl_name.ln_type;
+		__entry->glnum		= gl->gl_name.ln_number;
+		__entry->cur_state	= glock_trace_state(gl->gl_state);
+		__entry->flags		= gl->gl_flags  | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
+	),
+
+	TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->gltype, (unsigned long long)__entry->glnum,
+                  glock_trace_name(__entry->cur_state),
+		  glock_trace_name(DLM_LOCK_IV),
+		  show_glock_flags(__entry->flags))
+
+);
+
+/* Callback (local or remote) requesting lock demotion */
+TRACE_EVENT(gfs2_demote_rq,
+
+	TP_PROTO(const struct gfs2_glock *gl, bool remote),
+
+	TP_ARGS(gl, remote),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	glnum			)
+		__field(	u32,	gltype			)
+		__field(	u8,	cur_state		)
+		__field(	u8,	dmt_state		)
+		__field(	unsigned long,	flags		)
+		__field(	bool,	remote			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= gl->gl_sbd->sd_vfs->s_dev;
+		__entry->gltype		= gl->gl_name.ln_type;
+		__entry->glnum		= gl->gl_name.ln_number;
+		__entry->cur_state	= glock_trace_state(gl->gl_state);
+		__entry->dmt_state	= glock_trace_state(gl->gl_demote_state);
+		__entry->flags		= gl->gl_flags  | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
+		__entry->remote		= remote;
+	),
+
+	TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+		  (unsigned long long)__entry->glnum,
+                  glock_trace_name(__entry->cur_state),
+                  glock_trace_name(__entry->dmt_state),
+		  show_glock_flags(__entry->flags),
+		  __entry->remote ? "remote" : "local")
+
+);
+
+/* Promotion/grant of a glock */
+TRACE_EVENT(gfs2_promote,
+
+	TP_PROTO(const struct gfs2_holder *gh, int first),
+
+	TP_ARGS(gh, first),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	glnum			)
+		__field(	u32,	gltype			)
+		__field(	int,	first			)
+		__field(	u8,	state			)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+		__entry->glnum	= gh->gh_gl->gl_name.ln_number;
+		__entry->gltype	= gh->gh_gl->gl_name.ln_type;
+		__entry->first	= first;
+		__entry->state	= glock_trace_state(gh->gh_state);
+	),
+
+	TP_printk("%u,%u glock %u:%llu promote %s %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+		  (unsigned long long)__entry->glnum,
+		  __entry->first ? "first": "other",
+		  glock_trace_name(__entry->state))
+);
+
+/* Queue/dequeue a lock request */
+TRACE_EVENT(gfs2_glock_queue,
+
+	TP_PROTO(const struct gfs2_holder *gh, int queue),
+
+	TP_ARGS(gh, queue),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	glnum			)
+		__field(	u32,	gltype			)
+		__field(	int,	queue			)
+		__field(	u8,	state			)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+		__entry->glnum	= gh->gh_gl->gl_name.ln_number;
+		__entry->gltype	= gh->gh_gl->gl_name.ln_type;
+		__entry->queue	= queue;
+		__entry->state	= glock_trace_state(gh->gh_state);
+	),
+
+	TP_printk("%u,%u glock %u:%llu %squeue %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+		  (unsigned long long)__entry->glnum,
+		  __entry->queue ? "" : "de",
+		  glock_trace_name(__entry->state))
+);
+
+/* DLM sends a reply to GFS2 */
+TRACE_EVENT(gfs2_glock_lock_time,
+
+	TP_PROTO(const struct gfs2_glock *gl, s64 tdiff),
+
+	TP_ARGS(gl, tdiff),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev		)
+		__field(	u64,	glnum		)
+		__field(	u32,	gltype		)
+		__field(	int,	status		)
+		__field(	char,	flags		)
+		__field(	s64,	tdiff		)
+		__field(	s64,	srtt		)
+		__field(	s64,	srttvar		)
+		__field(	s64,	srttb		)
+		__field(	s64,	srttvarb	)
+		__field(	s64,	sirt		)
+		__field(	s64,	sirtvar		)
+		__field(	s64,	dcount		)
+		__field(	s64,	qcount		)
+	),
+
+	TP_fast_assign(
+		__entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+		__entry->glnum          = gl->gl_name.ln_number;
+		__entry->gltype         = gl->gl_name.ln_type;
+		__entry->status		= gl->gl_lksb.sb_status;
+		__entry->flags		= gl->gl_lksb.sb_flags;
+		__entry->tdiff		= tdiff;
+		__entry->srtt		= gl->gl_stats.stats[GFS2_LKS_SRTT];
+		__entry->srttvar	= gl->gl_stats.stats[GFS2_LKS_SRTTVAR];
+		__entry->srttb		= gl->gl_stats.stats[GFS2_LKS_SRTTB];
+		__entry->srttvarb	= gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
+		__entry->sirt		= gl->gl_stats.stats[GFS2_LKS_SIRT];
+		__entry->sirtvar	= gl->gl_stats.stats[GFS2_LKS_SIRTVAR];
+		__entry->dcount		= gl->gl_stats.stats[GFS2_LKS_DCOUNT];
+		__entry->qcount		= gl->gl_stats.stats[GFS2_LKS_QCOUNT];
+	),
+
+	TP_printk("%u,%u glock %d:%lld status:%d flags:%02x tdiff:%lld srtt:%lld/%lld srttb:%lld/%lld sirt:%lld/%lld dcnt:%lld qcnt:%lld",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+		  (unsigned long long)__entry->glnum,
+		  __entry->status, __entry->flags,
+		  (long long)__entry->tdiff,
+		  (long long)__entry->srtt,
+		  (long long)__entry->srttvar,
+		  (long long)__entry->srttb,
+		  (long long)__entry->srttvarb,
+		  (long long)__entry->sirt,
+		  (long long)__entry->sirtvar,
+		  (long long)__entry->dcount,
+		  (long long)__entry->qcount)
+);
+
+/* Section 2 - Log/journal
+ *
+ * Objectives:
+ * Latency: Log flush time
+ * Correctness: pin/unpin vs. disk I/O ordering
+ * Performance: Log usage stats
+ */
+
+/* Pin/unpin a block in the log */
+TRACE_EVENT(gfs2_pin,
+
+	TP_PROTO(const struct gfs2_bufdata *bd, int pin),
+
+	TP_ARGS(bd, pin),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	int,	pin			)
+		__field(	u32,	len			)
+		__field(	sector_t,	block		)
+		__field(	u64,	ino			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bd->bd_gl->gl_sbd->sd_vfs->s_dev;
+		__entry->pin		= pin;
+		__entry->len		= bd->bd_bh->b_size;
+		__entry->block		= bd->bd_bh->b_blocknr;
+		__entry->ino		= bd->bd_gl->gl_name.ln_number;
+	),
+
+	TP_printk("%u,%u log %s %llu/%lu inode %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->pin ? "pin" : "unpin",
+		  (unsigned long long)__entry->block,
+		  (unsigned long)__entry->len,
+		  (unsigned long long)__entry->ino)
+);
+
+/* Flushing the log */
+TRACE_EVENT(gfs2_log_flush,
+
+	TP_PROTO(const struct gfs2_sbd *sdp, int start),
+
+	TP_ARGS(sdp, start),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	int,	start			)
+		__field(	u64,	log_seq			)
+	),
+
+	TP_fast_assign(
+		__entry->dev            = sdp->sd_vfs->s_dev;
+		__entry->start		= start;
+		__entry->log_seq	= sdp->sd_log_sequence;
+	),
+
+	TP_printk("%u,%u log flush %s %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->start ? "start" : "end",
+		  (unsigned long long)__entry->log_seq)
+);
+
+/* Reserving/releasing blocks in the log */
+TRACE_EVENT(gfs2_log_blocks,
+
+	TP_PROTO(const struct gfs2_sbd *sdp, int blocks),
+
+	TP_ARGS(sdp, blocks),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	int,	blocks			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= sdp->sd_vfs->s_dev;
+		__entry->blocks		= blocks;
+	),
+
+	TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), __entry->blocks)
+);
+
+/* Writing back the AIL */
+TRACE_EVENT(gfs2_ail_flush,
+
+	TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start),
+
+	TP_ARGS(sdp, wbc, start),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	int, start			)
+		__field(	int, sync_mode			)
+		__field(	long, nr_to_write		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= sdp->sd_vfs->s_dev;
+		__entry->start		= start;
+		__entry->sync_mode	= wbc->sync_mode;
+		__entry->nr_to_write	= wbc->nr_to_write;
+	),
+
+	TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), __entry->start ? "start" : "end",
+		  __entry->sync_mode == WB_SYNC_ALL ? "all" : "none",
+		  __entry->nr_to_write)
+);
+
+/* Section 3 - bmap
+ *
+ * Objectives:
+ * Latency: Bmap request time
+ * Performance: Block allocator tracing
+ * Correctness: Test of disard generation vs. blocks allocated
+ */
+
+/* Map an extent of blocks, possibly a new allocation */
+TRACE_EVENT(gfs2_bmap,
+
+	TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh,
+		sector_t lblock, int create, int errno),
+
+	TP_ARGS(ip, bh, lblock, create, errno),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	sector_t, lblock		)
+		__field(	sector_t, pblock		)
+		__field(	u64,	inum			)
+		__field(	unsigned long, state		)
+		__field(	u32,	len			)
+		__field(	int,	create			)
+		__field(	int,	errno			)
+	),
+
+	TP_fast_assign(
+		__entry->dev            = ip->i_gl->gl_sbd->sd_vfs->s_dev;
+		__entry->lblock		= lblock;
+		__entry->pblock		= buffer_mapped(bh) ?  bh->b_blocknr : 0;
+		__entry->inum		= ip->i_no_addr;
+		__entry->state		= bh->b_state;
+		__entry->len		= bh->b_size;
+		__entry->create		= create;
+		__entry->errno		= errno;
+	),
+
+	TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->inum,
+		  (unsigned long long)__entry->lblock,
+		  (unsigned long)__entry->len,
+		  (unsigned long long)__entry->pblock,
+		  __entry->state, __entry->create ? "create " : "nocreate",
+		  __entry->errno)
+);
+
+/* Keep track of blocks as they are allocated/freed */
+TRACE_EVENT(gfs2_block_alloc,
+
+	TP_PROTO(const struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+		 u64 block, unsigned len, u8 block_state),
+
+	TP_ARGS(ip, rgd, block, len, block_state),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	start			)
+		__field(	u64,	inum			)
+		__field(	u32,	len			)
+		__field(	u8,	block_state		)
+		__field(        u64,	rd_addr			)
+		__field(        u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
+		__entry->start		= block;
+		__entry->inum		= ip->i_no_addr;
+		__entry->len		= len;
+		__entry->block_state	= block_state;
+		__entry->rd_addr	= rgd->rd_addr;
+		__entry->rd_free_clone	= rgd->rd_free_clone;
+		__entry->rd_reserved	= rgd->rd_reserved;
+	),
+
+	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->inum,
+		  (unsigned long long)__entry->start,
+		  (unsigned long)__entry->len,
+		  block_state_name(__entry->block_state),
+		  (unsigned long long)__entry->rd_addr,
+		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
+);
+
+/* Keep track of multi-block reservations as they are allocated/freed */
+TRACE_EVENT(gfs2_rs,
+
+	TP_PROTO(const struct gfs2_blkreserv *rs, u8 func),
+
+	TP_ARGS(rs, func),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	rd_addr			)
+		__field(	u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
+		__field(	u64,	inum			)
+		__field(	u64,	start			)
+		__field(	u32,	free			)
+		__field(	u8,	func			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= rs->rs_rbm.rgd->rd_sbd->sd_vfs->s_dev;
+		__entry->rd_addr	= rs->rs_rbm.rgd->rd_addr;
+		__entry->rd_free_clone	= rs->rs_rbm.rgd->rd_free_clone;
+		__entry->rd_reserved	= rs->rs_rbm.rgd->rd_reserved;
+		__entry->inum		= rs->rs_inum;
+		__entry->start		= gfs2_rbm_to_block(&rs->rs_rbm);
+		__entry->free		= rs->rs_free;
+		__entry->func		= func;
+	),
+
+	TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->inum,
+		  (unsigned long long)__entry->start,
+		  (unsigned long long)__entry->rd_addr,
+		  (unsigned long)__entry->rd_free_clone,
+		  (unsigned long)__entry->rd_reserved,
+		  rs_func_name(__entry->func), (unsigned long)__entry->free)
+);
+
+#endif /* _TRACE_GFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_gfs2
+#include <trace/define_trace.h>
+
diff --git a/kernel/fs/gfs2/trans.c b/kernel/fs/gfs2/trans.c
new file mode 100644
index 000000000..88bff2430
--- /dev/null
+++ b/kernel/fs/gfs2/trans.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/kallsyms.h>
+#include <linux/gfs2_ondisk.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "inode.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+#include "trace_gfs2.h"
+
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+		     unsigned int revokes)
+{
+	struct gfs2_trans *tr;
+	int error;
+
+	BUG_ON(current->journal_info);
+	BUG_ON(blocks == 0 && revokes == 0);
+
+	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+		return -EROFS;
+
+	tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
+	if (!tr)
+		return -ENOMEM;
+
+	tr->tr_ip = _RET_IP_;
+	tr->tr_blocks = blocks;
+	tr->tr_revokes = revokes;
+	tr->tr_reserved = 1;
+	tr->tr_alloced = 1;
+	if (blocks)
+		tr->tr_reserved += 6 + blocks;
+	if (revokes)
+		tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
+						   sizeof(u64));
+	INIT_LIST_HEAD(&tr->tr_databuf);
+	INIT_LIST_HEAD(&tr->tr_buf);
+
+	sb_start_intwrite(sdp->sd_vfs);
+
+	error = gfs2_log_reserve(sdp, tr->tr_reserved);
+	if (error)
+		goto fail;
+
+	current->journal_info = tr;
+
+	return 0;
+
+fail:
+	sb_end_intwrite(sdp->sd_vfs);
+	kfree(tr);
+
+	return error;
+}
+
+static void gfs2_print_trans(const struct gfs2_trans *tr)
+{
+	pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
+	pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
+		tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
+	pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
+		tr->tr_num_buf_new, tr->tr_num_buf_rm,
+		tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
+		tr->tr_num_revoke, tr->tr_num_revoke_rm);
+}
+
+void gfs2_trans_end(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr = current->journal_info;
+	s64 nbuf;
+	int alloced = tr->tr_alloced;
+
+	BUG_ON(!tr);
+	current->journal_info = NULL;
+
+	if (!tr->tr_touched) {
+		gfs2_log_release(sdp, tr->tr_reserved);
+		if (alloced) {
+			kfree(tr);
+			sb_end_intwrite(sdp->sd_vfs);
+		}
+		return;
+	}
+
+	nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new;
+	nbuf -= tr->tr_num_buf_rm;
+	nbuf -= tr->tr_num_databuf_rm;
+
+	if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) &&
+				       (tr->tr_num_revoke <= tr->tr_revokes)))
+		gfs2_print_trans(tr);
+
+	gfs2_log_commit(sdp, tr);
+	if (alloced && !tr->tr_attached)
+			kfree(tr);
+	up_read(&sdp->sd_log_flush_lock);
+
+	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
+		gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+	if (alloced)
+		sb_end_intwrite(sdp->sd_vfs);
+}
+
+static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
+					       struct buffer_head *bh,
+					       const struct gfs2_log_operations *lops)
+{
+	struct gfs2_bufdata *bd;
+
+	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
+	bd->bd_bh = bh;
+	bd->bd_gl = gl;
+	bd->bd_ops = lops;
+	INIT_LIST_HEAD(&bd->bd_list);
+	bh->b_private = bd;
+	return bd;
+}
+
+/**
+ * gfs2_trans_add_data - Add a databuf to the transaction.
+ * @gl: The inode glock associated with the buffer
+ * @bh: The buffer to add
+ *
+ * This is used in two distinct cases:
+ * i) In ordered write mode
+ *    We put the data buffer on a list so that we can ensure that its
+ *    synced to disk at the right time
+ * ii) In journaled data mode
+ *    We need to journal the data block in the same way as metadata in
+ *    the functions above. The difference is that here we have a tag
+ *    which is two __be64's being the block number (as per meta data)
+ *    and a flag which says whether the data block needs escaping or
+ *    not. This means we need a new log entry for each 251 or so data
+ *    blocks, which isn't an enormous overhead but twice as much as
+ *    for normal metadata blocks.
+ */
+void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
+{
+	struct gfs2_trans *tr = current->journal_info;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct address_space *mapping = bh->b_page->mapping;
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_bufdata *bd;
+
+	if (!gfs2_is_jdata(ip)) {
+		gfs2_ordered_add_inode(ip);
+		return;
+	}
+
+	lock_buffer(bh);
+	gfs2_log_lock(sdp);
+	bd = bh->b_private;
+	if (bd == NULL) {
+		gfs2_log_unlock(sdp);
+		unlock_buffer(bh);
+		if (bh->b_private == NULL)
+			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+		lock_buffer(bh);
+		gfs2_log_lock(sdp);
+	}
+	gfs2_assert(sdp, bd->bd_gl == gl);
+	tr->tr_touched = 1;
+	if (list_empty(&bd->bd_list)) {
+		set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+		set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+		gfs2_pin(sdp, bd->bd_bh);
+		tr->tr_num_databuf_new++;
+		list_add_tail(&bd->bd_list, &tr->tr_databuf);
+	}
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
+}
+
+static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+	struct gfs2_meta_header *mh;
+	struct gfs2_trans *tr;
+	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
+
+	tr = current->journal_info;
+	tr->tr_touched = 1;
+	if (!list_empty(&bd->bd_list))
+		return;
+	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+	if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
+		pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
+		       (unsigned long long)bd->bd_bh->b_blocknr);
+		BUG();
+	}
+	if (unlikely(state == SFS_FROZEN)) {
+		printk(KERN_INFO "GFS2:adding buf while frozen\n");
+		gfs2_assert_withdraw(sdp, 0);
+	}
+	gfs2_pin(sdp, bd->bd_bh);
+	mh->__pad0 = cpu_to_be64(0);
+	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
+	list_add(&bd->bd_list, &tr->tr_buf);
+	tr->tr_num_buf_new++;
+}
+
+void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
+{
+
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_bufdata *bd;
+
+	lock_buffer(bh);
+	gfs2_log_lock(sdp);
+	bd = bh->b_private;
+	if (bd == NULL) {
+		gfs2_log_unlock(sdp);
+		unlock_buffer(bh);
+		lock_page(bh->b_page);
+		if (bh->b_private == NULL)
+			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+		unlock_page(bh->b_page);
+		lock_buffer(bh);
+		gfs2_log_lock(sdp);
+	}
+	gfs2_assert(sdp, bd->bd_gl == gl);
+	meta_lo_add(sdp, bd);
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
+}
+
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+	struct gfs2_trans *tr = current->journal_info;
+
+	BUG_ON(!list_empty(&bd->bd_list));
+	gfs2_add_revoke(sdp, bd);
+	tr->tr_touched = 1;
+	tr->tr_num_revoke++;
+}
+
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
+{
+	struct gfs2_bufdata *bd, *tmp;
+	struct gfs2_trans *tr = current->journal_info;
+	unsigned int n = len;
+
+	gfs2_log_lock(sdp);
+	list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_list) {
+		if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) {
+			list_del_init(&bd->bd_list);
+			gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
+			sdp->sd_log_num_revoke--;
+			kmem_cache_free(gfs2_bufdata_cachep, bd);
+			tr->tr_num_revoke_rm++;
+			if (--n == 0)
+				break;
+		}
+	}
+	gfs2_log_unlock(sdp);
+}
+
diff --git a/kernel/fs/gfs2/trans.h b/kernel/fs/gfs2/trans.h
new file mode 100644
index 000000000..1e6e7da25
--- /dev/null
+++ b/kernel/fs/gfs2/trans.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __TRANS_DOT_H__
+#define __TRANS_DOT_H__
+
+#include <linux/buffer_head.h>
+struct gfs2_sbd;
+struct gfs2_rgrpd;
+struct gfs2_glock;
+
+#define RES_DINODE	1
+#define RES_INDIRECT	1
+#define RES_JDATA	1
+#define RES_DATA	1
+#define RES_LEAF	1
+#define RES_RG_HDR	1
+#define RES_RG_BIT	2
+#define RES_EATTR	1
+#define RES_STATFS	1
+#define RES_QUOTA	2
+
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested)
+{
+	if (requested < ip->i_rgd->rd_length)
+		return requested + 1;
+	return ip->i_rgd->rd_length;
+}
+
+extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+			    unsigned int revokes);
+
+extern void gfs2_trans_end(struct gfs2_sbd *sdp);
+extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
+extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
+extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
+
+#endif /* __TRANS_DOT_H__ */
diff --git a/kernel/fs/gfs2/util.c b/kernel/fs/gfs2/util.c
new file mode 100644
index 000000000..86d2035ac
--- /dev/null
+++ b/kernel/fs/gfs2/util.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/crc32.h>
+#include <linux/gfs2_ondisk.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "util.h"
+
+struct kmem_cache *gfs2_glock_cachep __read_mostly;
+struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
+struct kmem_cache *gfs2_inode_cachep __read_mostly;
+struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
+struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
+struct kmem_cache *gfs2_quotad_cachep __read_mostly;
+struct kmem_cache *gfs2_rsrv_cachep __read_mostly;
+mempool_t *gfs2_page_pool __read_mostly;
+
+void gfs2_assert_i(struct gfs2_sbd *sdp)
+{
+	fs_emerg(sdp, "fatal assertion failed\n");
+}
+
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	const struct lm_lockops *lm = ls->ls_ops;
+	va_list args;
+	struct va_format vaf;
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+	    test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return 0;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	fs_err(sdp, "%pV", &vaf);
+
+	va_end(args);
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+		fs_err(sdp, "about to withdraw this file system\n");
+		BUG_ON(sdp->sd_args.ar_debug);
+
+		kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+
+		if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+			wait_for_completion(&sdp->sd_wdack);
+
+		if (lm->lm_unmount) {
+			fs_err(sdp, "telling LM to unmount\n");
+			lm->lm_unmount(sdp);
+		}
+		fs_err(sdp, "withdrawn\n");
+		dump_stack();
+	}
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
+
+	return -1;
+}
+
+/**
+ * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+			   const char *function, char *file, unsigned int line)
+{
+	int me;
+	me = gfs2_lm_withdraw(sdp,
+			      "fatal: assertion \"%s\" failed\n"
+			      "   function = %s, file = %s, line = %u\n",
+			      assertion, function, file, line);
+	dump_stack();
+	return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_assert_warn_i - Print a message to the console if @assertion is false
+ * Returns: -1 if we printed something
+ *          -2 if we didn't
+ */
+
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+		       const char *function, char *file, unsigned int line)
+{
+	if (time_before(jiffies,
+			sdp->sd_last_warning +
+			gfs2_tune_get(sdp, gt_complain_secs) * HZ))
+		return -2;
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
+		fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
+			assertion, function, file, line);
+
+	if (sdp->sd_args.ar_debug)
+		BUG();
+	else
+		dump_stack();
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+		      "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		      sdp->sd_fsname, assertion,
+		      sdp->sd_fsname, function, file, line);
+
+	sdp->sd_last_warning = jiffies;
+
+	return -1;
+}
+
+/**
+ * gfs2_consist_i - Flag a filesystem consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
+		   char *file, unsigned int line)
+{
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+			      "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
+			      function, file, line);
+	return rv;
+}
+
+/**
+ * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
+			 const char *function, char *file, unsigned int line)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+			      "fatal: filesystem consistency error\n"
+			      "  inode = %llu %llu\n"
+			      "  function = %s, file = %s, line = %u\n",
+			      (unsigned long long)ip->i_no_formal_ino,
+			      (unsigned long long)ip->i_no_addr,
+			      function, file, line);
+	return rv;
+}
+
+/**
+ * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
+			 const char *function, char *file, unsigned int line)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+			      "fatal: filesystem consistency error\n"
+			      "  RG = %llu\n"
+			      "  function = %s, file = %s, line = %u\n",
+			      (unsigned long long)rgd->rd_addr,
+			      function, file, line);
+	return rv;
+}
+
+/**
+ * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *type, const char *function, char *file,
+		       unsigned int line)
+{
+	int me;
+	me = gfs2_lm_withdraw(sdp,
+			      "fatal: invalid metadata block\n"
+			      "  bh = %llu (%s)\n"
+			      "  function = %s, file = %s, line = %u\n",
+			      (unsigned long long)bh->b_blocknr, type,
+			      function, file, line);
+	return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			   u16 type, u16 t, const char *function,
+			   char *file, unsigned int line)
+{
+	int me;
+	me = gfs2_lm_withdraw(sdp,
+			      "fatal: invalid metadata block\n"
+			      "  bh = %llu (type: exp=%u, found=%u)\n"
+			      "  function = %s, file = %s, line = %u\n",
+			      (unsigned long long)bh->b_blocknr, type, t,
+			      function, file, line);
+	return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_io_error_i - Flag an I/O error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
+		    unsigned int line)
+{
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+			      "fatal: I/O error\n"
+			      "  function = %s, file = %s, line = %u\n",
+			      function, file, line);
+	return rv;
+}
+
+/**
+ * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *function, char *file, unsigned int line)
+{
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+			      "fatal: I/O error\n"
+			      "  block = %llu\n"
+			      "  function = %s, file = %s, line = %u\n",
+			      (unsigned long long)bh->b_blocknr,
+			      function, file, line);
+	return rv;
+}
+
diff --git a/kernel/fs/gfs2/util.h b/kernel/fs/gfs2/util.h
new file mode 100644
index 000000000..cbdcbdf39
--- /dev/null
+++ b/kernel/fs/gfs2/util.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+#include <linux/mempool.h>
+
+#include "incore.h"
+
+#define fs_emerg(fs, fmt, ...)						\
+	pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_warn(fs, fmt, ...)						\
+	pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_err(fs, fmt, ...)						\
+	pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_info(fs, fmt, ...)						\
+	pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+
+void gfs2_assert_i(struct gfs2_sbd *sdp);
+
+#define gfs2_assert(sdp, assertion) \
+do { \
+	if (unlikely(!(assertion))) { \
+		gfs2_assert_i(sdp); \
+		BUG(); \
+        } \
+} while (0)
+
+
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+			   const char *function, char *file, unsigned int line);
+
+#define gfs2_assert_withdraw(sdp, assertion) \
+((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
+					__func__, __FILE__, __LINE__))
+
+
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+		       const char *function, char *file, unsigned int line);
+
+#define gfs2_assert_warn(sdp, assertion) \
+((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
+					__func__, __FILE__, __LINE__))
+
+
+int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
+		   const char *function, char *file, unsigned int line);
+
+#define gfs2_consist(sdp) \
+gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__)
+
+
+int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
+			 const char *function, char *file, unsigned int line);
+
+#define gfs2_consist_inode(ip) \
+gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__)
+
+
+int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
+			 const char *function, char *file, unsigned int line);
+
+#define gfs2_consist_rgrpd(rgd) \
+gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__)
+
+
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *type, const char *function,
+		       char *file, unsigned int line);
+
+static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
+				    struct buffer_head *bh)
+{
+	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+	u32 magic = be32_to_cpu(mh->mh_magic);
+	if (unlikely(magic != GFS2_MAGIC)) {
+		pr_err("Magic number missing at %llu\n",
+		       (unsigned long long)bh->b_blocknr);
+		return -EIO;
+	}
+	return 0;
+}
+
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			   u16 type, u16 t,
+			   const char *function,
+			   char *file, unsigned int line);
+
+static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
+					struct buffer_head *bh,
+					u16 type,
+					const char *function,
+					char *file, unsigned int line)
+{
+	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+	u32 magic = be32_to_cpu(mh->mh_magic);
+	u16 t = be32_to_cpu(mh->mh_type);
+	if (unlikely(magic != GFS2_MAGIC))
+		return gfs2_meta_check_ii(sdp, bh, "magic number", function,
+					  file, line);
+        if (unlikely(t != type))
+		return gfs2_metatype_check_ii(sdp, bh, type, t, function,
+					      file, line);
+	return 0;
+}
+
+#define gfs2_metatype_check(sdp, bh, type) \
+gfs2_metatype_check_i((sdp), (bh), (type), __func__, __FILE__, __LINE__)
+
+static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
+				     u16 format)
+{
+	struct gfs2_meta_header *mh;
+	mh = (struct gfs2_meta_header *)bh->b_data;
+	mh->mh_type = cpu_to_be32(type);
+	mh->mh_format = cpu_to_be32(format);
+}
+
+
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
+		    char *file, unsigned int line);
+
+#define gfs2_io_error(sdp) \
+gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__);
+
+
+int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *function, char *file, unsigned int line);
+
+#define gfs2_io_error_bh(sdp, bh) \
+gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
+
+
+extern struct kmem_cache *gfs2_glock_cachep;
+extern struct kmem_cache *gfs2_glock_aspace_cachep;
+extern struct kmem_cache *gfs2_inode_cachep;
+extern struct kmem_cache *gfs2_bufdata_cachep;
+extern struct kmem_cache *gfs2_rgrpd_cachep;
+extern struct kmem_cache *gfs2_quotad_cachep;
+extern struct kmem_cache *gfs2_rsrv_cachep;
+extern mempool_t *gfs2_page_pool;
+
+static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
+					   unsigned int *p)
+{
+	unsigned int x;
+	spin_lock(&gt->gt_spin);
+	x = *p;
+	spin_unlock(&gt->gt_spin);
+	return x;
+}
+
+#define gfs2_tune_get(sdp, field) \
+gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
+
+__printf(2, 3)
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...);
+
+#endif /* __UTIL_DOT_H__ */
diff --git a/kernel/fs/gfs2/xattr.c b/kernel/fs/gfs2/xattr.c
new file mode 100644
index 000000000..4c096fa9e
--- /dev/null
+++ b/kernel/fs/gfs2/xattr.c
@@ -0,0 +1,1508 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/posix_acl_xattr.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "xattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * ea_calc_size - returns the acutal number of bytes the request will take up
+ *                (not counting any unstuffed data blocks)
+ * @sdp:
+ * @er:
+ * @size:
+ *
+ * Returns: 1 if the EA should be stuffed
+ */
+
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
+			unsigned int *size)
+{
+	unsigned int jbsize = sdp->sd_jbsize;
+
+	/* Stuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+	if (*size <= jbsize)
+		return 1;
+
+	/* Unstuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
+
+	return 0;
+}
+
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
+{
+	unsigned int size;
+
+	if (dsize > GFS2_EA_MAX_DATA_LEN)
+		return -ERANGE;
+
+	ea_calc_size(sdp, nsize, dsize, &size);
+
+	/* This can only happen with 512 byte blocks */
+	if (size > sdp->sd_jbsize)
+		return -ERANGE;
+
+	return 0;
+}
+
+typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
+			  struct gfs2_ea_header *ea,
+			  struct gfs2_ea_header *prev, void *private);
+
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+			ea_call_t ea_call, void *data)
+{
+	struct gfs2_ea_header *ea, *prev = NULL;
+	int error = 0;
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
+		return -EIO;
+
+	for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
+		if (!GFS2_EA_REC_LEN(ea))
+			goto fail;
+		if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
+						  bh->b_data + bh->b_size))
+			goto fail;
+		if (!GFS2_EATYPE_VALID(ea->ea_type))
+			goto fail;
+
+		error = ea_call(ip, bh, ea, prev, data);
+		if (error)
+			return error;
+
+		if (GFS2_EA_IS_LAST(ea)) {
+			if ((char *)GFS2_EA2NEXT(ea) !=
+			    bh->b_data + bh->b_size)
+				goto fail;
+			break;
+		}
+	}
+
+	return error;
+
+fail:
+	gfs2_consist_inode(ip);
+	return -EIO;
+}
+
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+	struct buffer_head *bh, *eabh;
+	__be64 *eablk, *end;
+	int error;
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
+	if (error)
+		return error;
+
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
+		error = ea_foreach_i(ip, bh, ea_call, data);
+		goto out;
+	}
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+		if (error)
+			break;
+		error = ea_foreach_i(ip, eabh, ea_call, data);
+		brelse(eabh);
+		if (error)
+			break;
+	}
+out:
+	brelse(bh);
+	return error;
+}
+
+struct ea_find {
+	int type;
+	const char *name;
+	size_t namel;
+	struct gfs2_ea_location *ef_el;
+};
+
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_find *ef = private;
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (ea->ea_type == ef->type) {
+		if (ea->ea_name_len == ef->namel &&
+		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
+			struct gfs2_ea_location *el = ef->ef_el;
+			get_bh(bh);
+			el->el_bh = bh;
+			el->el_ea = ea;
+			el->el_prev = prev;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+			struct gfs2_ea_location *el)
+{
+	struct ea_find ef;
+	int error;
+
+	ef.type = type;
+	ef.name = name;
+	ef.namel = strlen(name);
+	ef.ef_el = el;
+
+	memset(el, 0, sizeof(struct gfs2_ea_location));
+
+	error = ea_foreach(ip, ea_find_i, &ef);
+	if (error > 0)
+		return 0;
+
+	return error;
+}
+
+/**
+ * ea_dealloc_unstuffed -
+ * @ip:
+ * @bh:
+ * @ea:
+ * @prev:
+ * @private:
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG.  But watch, this may not always
+ * be true.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+				struct gfs2_ea_header *ea,
+				struct gfs2_ea_header *prev, void *private)
+{
+	int *leave = private;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder rg_gh;
+	struct buffer_head *dibh;
+	__be64 *dataptrs;
+	u64 bn = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	if (GFS2_EA_IS_STUFFED(ea))
+		return 0;
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (*dataptrs) {
+			blks++;
+			bn = be64_to_cpu(*dataptrs);
+		}
+	}
+	if (!blks)
+		return 0;
+
+	rgd = gfs2_blk2rgrpd(sdp, bn, 1);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE +
+				 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_meta(ip->i_gl, bh);
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (!*dataptrs)
+			break;
+		bn = be64_to_cpu(*dataptrs);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*dataptrs = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart)
+		gfs2_free_meta(ip, bstart, blen);
+
+	if (prev && !leave) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		ea->ea_num_ptrs = 0;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&rg_gh);
+	return error;
+}
+
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+			       struct gfs2_ea_header *ea,
+			       struct gfs2_ea_header *prev, int leave)
+{
+	int error;
+
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
+
+	gfs2_quota_unhold(ip);
+out_alloc:
+	return error;
+}
+
+struct ea_list {
+	struct gfs2_ea_request *ei_er;
+	unsigned int ei_size;
+};
+
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		return 5 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SYS:
+		return 7 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SECURITY:
+		return 9 + ea->ea_name_len + 1;
+	default:
+		return 0;
+	}
+}
+
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_list *ei = private;
+	struct gfs2_ea_request *er = ei->ei_er;
+	unsigned int ea_size = gfs2_ea_strlen(ea);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (er->er_data_len) {
+		char *prefix = NULL;
+		unsigned int l = 0;
+		char c = 0;
+
+		if (ei->ei_size + ea_size > er->er_data_len)
+			return -ERANGE;
+
+		switch (ea->ea_type) {
+		case GFS2_EATYPE_USR:
+			prefix = "user.";
+			l = 5;
+			break;
+		case GFS2_EATYPE_SYS:
+			prefix = "system.";
+			l = 7;
+			break;
+		case GFS2_EATYPE_SECURITY:
+			prefix = "security.";
+			l = 9;
+			break;
+		}
+
+		BUG_ON(l == 0);
+
+		memcpy(er->er_data + ei->ei_size, prefix, l);
+		memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
+		       ea->ea_name_len);
+		memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
+	}
+
+	ei->ei_size += ea_size;
+
+	return 0;
+}
+
+/**
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+	struct gfs2_ea_request er;
+	struct gfs2_holder i_gh;
+	int error;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	if (size) {
+		er.er_data = buffer;
+		er.er_data_len = size;
+	}
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return error;
+
+	if (ip->i_eattr) {
+		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
+
+		error = ea_foreach(ip, ea_list_i, &ei);
+		if (!error)
+			error = ei.ei_size;
+	}
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/**
+ * ea_iter_unstuffed - copies the unstuffed xattr data to/from the
+ *                     request buffer
+ * @ip: The GFS2 inode
+ * @ea: The extended attribute header structure
+ * @din: The data to be copied in
+ * @dout: The data to be copied out (one of din,dout will be NULL)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+			       const char *din, char *dout)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error = 0;
+	unsigned char *pos;
+	unsigned cp_size;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bh)
+		return -ENOMEM;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+				       bh + x);
+		if (error) {
+			while (x--)
+				brelse(bh[x]);
+			goto out;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (error) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto out;
+		}
+		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			error = -EIO;
+			goto out;
+		}
+
+		pos = bh[x]->b_data + sizeof(struct gfs2_meta_header);
+		cp_size = (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize;
+
+		if (dout) {
+			memcpy(dout, pos, cp_size);
+			dout += sdp->sd_jbsize;
+		}
+
+		if (din) {
+			gfs2_trans_add_meta(ip->i_gl, bh[x]);
+			memcpy(pos, din, cp_size);
+			din += sdp->sd_jbsize;
+		}
+
+		amount -= sdp->sd_jbsize;
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+}
+
+static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			    char *data, size_t size)
+{
+	int ret;
+	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+	if (len > size)
+		return -ERANGE;
+
+	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+		return len;
+	}
+	ret = gfs2_iter_unstuffed(ip, el->el_ea, NULL, data);
+	if (ret < 0)
+		return ret;
+	return len;
+}
+
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
+{
+	struct gfs2_ea_location el;
+	int error;
+	int len;
+	char *data;
+
+	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		goto out;
+	if (!GFS2_EA_DATA_LEN(el.el_ea))
+		goto out;
+
+	len = GFS2_EA_DATA_LEN(el.el_ea);
+	data = kmalloc(len, GFP_NOFS);
+	error = -ENOMEM;
+	if (data == NULL)
+		goto out;
+
+	error = gfs2_ea_get_copy(ip, &el, data, len);
+	if (error < 0)
+		kfree(data);
+	else
+		*ppdata = data;
+out:
+	brelse(el.el_bh);
+	return error;
+}
+
+/**
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
+ * @type: The type of extended attribute
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+static int gfs2_xattr_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_eattr)
+		return -ENODATA;
+	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+	if (size)
+		error = gfs2_ea_get_copy(ip, &el, buffer, size);
+	else
+		error = GFS2_EA_DATA_LEN(el.el_ea);
+	brelse(el.el_bh);
+
+	return error;
+}
+
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: Pointer to pointer to a struct buffer_head
+ *
+ * Returns: errno
+ */
+
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_ea_header *ea;
+	unsigned int n = 1;
+	u64 block;
+	int error;
+
+	error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+	if (error)
+		return error;
+	gfs2_trans_add_unrevoke(sdp, block, 1);
+	*bhp = gfs2_meta_new(ip->i_gl, block);
+	gfs2_trans_add_meta(ip->i_gl, *bhp);
+	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
+
+	ea = GFS2_EA_BH2FIRST(*bhp);
+	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+	ea->ea_type = GFS2_EATYPE_UNUSED;
+	ea->ea_flags = GFS2_EAFLAG_LAST;
+	ea->ea_num_ptrs = 0;
+
+	gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+	return 0;
+}
+
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ *            necessary
+ * @ip: inode that is being modified
+ * @ea: the location of the new ea in a block
+ * @er: the write request
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags
+ *
+ * returns : errno
+ */
+
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+		    struct gfs2_ea_request *er)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int error;
+
+	ea->ea_data_len = cpu_to_be32(er->er_data_len);
+	ea->ea_name_len = er->er_name_len;
+	ea->ea_type = er->er_type;
+	ea->__pad = 0;
+
+	memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+
+	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+		ea->ea_num_ptrs = 0;
+		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+	} else {
+		__be64 *dataptr = GFS2_EA2DATAPTRS(ea);
+		const char *data = er->er_data;
+		unsigned int data_len = er->er_data_len;
+		unsigned int copy;
+		unsigned int x;
+
+		ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+		for (x = 0; x < ea->ea_num_ptrs; x++) {
+			struct buffer_head *bh;
+			u64 block;
+			int mh_size = sizeof(struct gfs2_meta_header);
+			unsigned int n = 1;
+
+			error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+			if (error)
+				return error;
+			gfs2_trans_add_unrevoke(sdp, block, 1);
+			bh = gfs2_meta_new(ip->i_gl, block);
+			gfs2_trans_add_meta(ip->i_gl, bh);
+			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+
+			gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
+							   data_len;
+			memcpy(bh->b_data + mh_size, data, copy);
+			if (copy < sdp->sd_jbsize)
+				memset(bh->b_data + mh_size + copy, 0,
+				       sdp->sd_jbsize - copy);
+
+			*dataptr++ = cpu_to_be64(bh->b_blocknr);
+			data += copy;
+			data_len -= copy;
+
+			brelse(bh);
+		}
+
+		gfs2_assert_withdraw(sdp, !data_len);
+	}
+
+	return 0;
+}
+
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+				   struct gfs2_ea_request *er, void *private);
+
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			     unsigned int blks,
+			     ea_skeleton_call_t skeleton_call, void *private)
+{
+	struct gfs2_alloc_parms ap = { .target = blks };
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
+	error = gfs2_quota_lock_check(ip, &ap);
+	if (error)
+		return error;
+
+	error = gfs2_inplace_reserve(ip, &ap);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
+				 blks + gfs2_rg_blocks(ip, blks) +
+				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+	if (error)
+		goto out_ipres;
+
+	error = skeleton_call(ip, er, private);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+out_end_trans:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+out_ipres:
+	gfs2_inplace_release(ip);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+	return error;
+}
+
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+		     void *private)
+{
+	struct buffer_head *bh;
+	int error;
+
+	error = ea_alloc_blk(ip, &bh);
+	if (error)
+		return error;
+
+	ip->i_eattr = bh->b_blocknr;
+	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
+
+	brelse(bh);
+
+	return error;
+}
+
+/**
+ * ea_init - initializes a new eattr block
+ * @ip:
+ * @er:
+ *
+ * Returns: errno
+ */
+
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+		   const void *data, size_t size)
+{
+	struct gfs2_ea_request er;
+	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
+	unsigned int blks = 1;
+
+	er.er_type = type;
+	er.er_name = name;
+	er.er_name_len = strlen(name);
+	er.er_data = (void *)data;
+	er.er_data_len = size;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
+
+	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
+}
+
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{
+	u32 ea_size = GFS2_EA_SIZE(ea);
+	struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
+				     ea_size);
+	u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
+	int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+
+	ea->ea_rec_len = cpu_to_be32(ea_size);
+	ea->ea_flags ^= last;
+
+	new->ea_rec_len = cpu_to_be32(new_size);
+	new->ea_flags = last;
+
+	return new;
+}
+
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	u32 len;
+
+	gfs2_trans_add_meta(ip->i_gl, el->el_bh);
+
+	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		return;
+	} else if (GFS2_EA2NEXT(prev) != ea) {
+		prev = GFS2_EA2NEXT(prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
+	}
+
+	len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+	prev->ea_rec_len = cpu_to_be32(len);
+
+	if (GFS2_EA_IS_LAST(ea))
+		prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+
+struct ea_set {
+	int ea_split;
+
+	struct gfs2_ea_request *es_er;
+	struct gfs2_ea_location *es_el;
+
+	struct buffer_head *es_bh;
+	struct gfs2_ea_header *es_ea;
+};
+
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+				 struct gfs2_ea_header *ea, struct ea_set *es)
+{
+	struct gfs2_ea_request *er = es->es_er;
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_meta(ip->i_gl, bh);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	ea_write(ip, ea, er);
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+	ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+out:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+	return error;
+}
+
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+			       struct gfs2_ea_request *er, void *private)
+{
+	struct ea_set *es = private;
+	struct gfs2_ea_header *ea = es->es_ea;
+	int error;
+
+	gfs2_trans_add_meta(ip->i_gl, es->es_bh);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	error = ea_write(ip, ea, er);
+	if (error)
+		return error;
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	return 0;
+}
+
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+			 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+			 void *private)
+{
+	struct ea_set *es = private;
+	unsigned int size;
+	int stuffed;
+	int error;
+
+	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+			       es->es_er->er_data_len, &size);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+		if (GFS2_EA_REC_LEN(ea) < size)
+			return 0;
+		if (!GFS2_EA_IS_STUFFED(ea)) {
+			error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+			if (error)
+				return error;
+		}
+		es->ea_split = 0;
+	} else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
+		es->ea_split = 1;
+	else
+		return 0;
+
+	if (stuffed) {
+		error = ea_set_simple_noalloc(ip, bh, ea, es);
+		if (error)
+			return error;
+	} else {
+		unsigned int blks;
+
+		es->es_bh = bh;
+		es->es_ea = ea;
+		blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
+					GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+		error = ea_alloc_skeleton(ip, es->es_er, blks,
+					  ea_set_simple_alloc, es);
+		if (error)
+			return error;
+	}
+
+	return 1;
+}
+
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			void *private)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *indbh, *newbh;
+	__be64 *eablk;
+	int error;
+	int mh_size = sizeof(struct gfs2_meta_header);
+
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+		__be64 *end;
+
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
+				       &indbh);
+		if (error)
+			return error;
+
+		if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+			error = -EIO;
+			goto out;
+		}
+
+		eablk = (__be64 *)(indbh->b_data + mh_size);
+		end = eablk + sdp->sd_inptrs;
+
+		for (; eablk < end; eablk++)
+			if (!*eablk)
+				break;
+
+		if (eablk == end) {
+			error = -ENOSPC;
+			goto out;
+		}
+
+		gfs2_trans_add_meta(ip->i_gl, indbh);
+	} else {
+		u64 blk;
+		unsigned int n = 1;
+		error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
+		if (error)
+			return error;
+		gfs2_trans_add_unrevoke(sdp, blk, 1);
+		indbh = gfs2_meta_new(ip->i_gl, blk);
+		gfs2_trans_add_meta(ip->i_gl, indbh);
+		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+		gfs2_buffer_clear_tail(indbh, mh_size);
+
+		eablk = (__be64 *)(indbh->b_data + mh_size);
+		*eablk = cpu_to_be64(ip->i_eattr);
+		ip->i_eattr = blk;
+		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
+		gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+		eablk++;
+	}
+
+	error = ea_alloc_blk(ip, &newbh);
+	if (error)
+		goto out;
+
+	*eablk = cpu_to_be64((u64)newbh->b_blocknr);
+	error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+	brelse(newbh);
+	if (error)
+		goto out;
+
+	if (private)
+		ea_set_remove_stuffed(ip, private);
+
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+		    const void *value, size_t size, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_request er;
+	struct ea_set es;
+	unsigned int blks = 2;
+	int error;
+
+	er.er_type = type;
+	er.er_name = name;
+	er.er_data = (void *)value;
+	er.er_name_len = strlen(name);
+	er.er_data_len = size;
+
+	memset(&es, 0, sizeof(struct ea_set));
+	es.es_er = &er;
+	es.es_el = el;
+
+	error = ea_foreach(ip, ea_set_simple, &es);
+	if (error > 0)
+		return 0;
+	if (error)
+		return error;
+
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
+		blks++;
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
+}
+
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+				   struct gfs2_ea_location *el)
+{
+	if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
+		el->el_prev = GFS2_EA2NEXT(el->el_prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+	}
+
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
+}
+
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_meta(ip->i_gl, el->el_bh);
+
+	if (prev) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+	return error;
+}
+
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @ip: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name)
+{
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_eattr)
+		return -ENODATA;
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+
+	if (GFS2_EA_IS_STUFFED(el.el_ea))
+		error = ea_remove_stuffed(ip, &el);
+	else
+		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
+
+	brelse(el.el_bh);
+
+	return error;
+}
+
+/**
+ * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @ip: The inode
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
+ * @type: The type of the extended attribute
+ *
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
+ */
+
+int __gfs2_xattr_set(struct inode *inode, const char *name,
+		   const void *value, size_t size, int flags, int type)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_ea_location el;
+	unsigned int namel = strlen(name);
+	int error;
+
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	if (namel > GFS2_EA_MAX_NAME_LEN)
+		return -ERANGE;
+
+	if (value == NULL)
+		return gfs2_xattr_remove(ip, type, name);
+
+	if (ea_check_size(sdp, namel, size))
+		return -ERANGE;
+
+	if (!ip->i_eattr) {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+		return ea_init(ip, type, name, value, size);
+	}
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+
+	if (el.el_ea) {
+		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+			brelse(el.el_bh);
+			return -EPERM;
+		}
+
+		error = -EEXIST;
+		if (!(flags & XATTR_CREATE)) {
+			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+			error = ea_set_i(ip, type, name, value, size, &el);
+			if (!error && unstuffed)
+				ea_set_remove_unstuffed(ip, &el);
+		}
+
+		brelse(el.el_bh);
+		return error;
+	}
+
+	error = -ENODATA;
+	if (!(flags & XATTR_REPLACE))
+		error = ea_set_i(ip, type, name, value, size, NULL);
+
+	return error;
+}
+
+static int gfs2_xattr_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	return __gfs2_xattr_set(d_inode(dentry), name, value,
+				size, flags, type);
+}
+
+
+static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_header *ea, char *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	int ret;
+
+	ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+	if (ret)
+		return ret;
+
+	ret = gfs2_iter_unstuffed(ip, ea, data, NULL);
+	gfs2_trans_end(sdp);
+
+	return ret;
+}
+
+int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
+{
+	struct inode *inode = &ip->i_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_ea_location el;
+	int error;
+
+	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
+	if (error)
+		return error;
+
+	if (GFS2_EA_IS_STUFFED(el.el_ea)) {
+		error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
+		if (error == 0) {
+			gfs2_trans_add_meta(ip->i_gl, el.el_bh);
+			memcpy(GFS2_EA2DATA(el.el_ea), data,
+			       GFS2_EA_DATA_LEN(el.el_ea));
+		}
+	} else {
+		error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
+	}
+
+	brelse(el.el_bh);
+	if (error)
+		return error;
+
+	error = gfs2_setattr_simple(inode, attr);
+	gfs2_trans_end(sdp);
+	return error;
+}
+
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrp_list rlist;
+	struct buffer_head *indbh, *dibh;
+	__be64 *eablk, *end;
+	unsigned int rg_blocks = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
+	if (error)
+		return error;
+
+	if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + sdp->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_rlist_add(ip, &rlist, bstart);
+			bstart = bn;
+			blen = 1;
+		}
+		blks++;
+	}
+	if (bstart)
+		gfs2_rlist_add(ip, &rlist, bstart);
+	else
+		goto out;
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		struct gfs2_rgrpd *rgd;
+		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+		rg_blocks += rgd->rd_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist_free;
+
+	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
+				 RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_meta(ip->i_gl, indbh);
+
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	bstart = 0;
+	blen = 0;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*eablk = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart)
+		gfs2_free_meta(ip, bstart, blen);
+
+	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist_free:
+	gfs2_rlist_free(&rlist);
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct buffer_head *dibh;
+	struct gfs2_holder gh;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
+				 RES_QUOTA, 1);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_free_meta(ip, ip->i_eattr, 1);
+
+	ip->i_eattr = 0;
+	gfs2_add_inode_blocks(&ip->i_inode, -1);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&gh);
+	return error;
+}
+
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+	int error;
+
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (error)
+		return error;
+
+	error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
+	if (error)
+		goto out_quota;
+
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+		error = ea_dealloc_indirect(ip);
+		if (error)
+			goto out_quota;
+	}
+
+	error = ea_dealloc_block(ip);
+
+out_quota:
+	gfs2_quota_unhold(ip);
+	return error;
+}
+
+static const struct xattr_handler gfs2_xattr_user_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.flags  = GFS2_EATYPE_USR,
+	.get    = gfs2_xattr_get,
+	.set    = gfs2_xattr_set,
+};
+
+static const struct xattr_handler gfs2_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.flags  = GFS2_EATYPE_SECURITY,
+	.get    = gfs2_xattr_get,
+	.set    = gfs2_xattr_set,
+};
+
+const struct xattr_handler *gfs2_xattr_handlers[] = {
+	&gfs2_xattr_user_handler,
+	&gfs2_xattr_security_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+	NULL,
+};
+
diff --git a/kernel/fs/gfs2/xattr.h b/kernel/fs/gfs2/xattr.h
new file mode 100644
index 000000000..d392f8358
--- /dev/null
+++ b/kernel/fs/gfs2/xattr.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+
+struct gfs2_inode;
+struct iattr;
+
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+				  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+
+#define GFS2_EA2DATAPTRS(ea) \
+((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+
+struct gfs2_ea_request {
+	const char *er_name;
+	char *er_data;
+	unsigned int er_name_len;
+	unsigned int er_data_len;
+	unsigned int er_type; /* GFS2_EATYPE_... */
+};
+
+struct gfs2_ea_location {
+	struct buffer_head *el_bh;
+	struct gfs2_ea_header *el_ea;
+	struct gfs2_ea_header *el_prev;
+};
+
+extern int __gfs2_xattr_set(struct inode *inode, const char *name,
+			    const void *value, size_t size,
+			    int flags, int type);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
+
+/* Exported to acl.c */
+
+extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
+extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
+
+#endif /* __EATTR_DOT_H__ */
diff --git a/kernel/fs/hfs/Kconfig b/kernel/fs/hfs/Kconfig
new file mode 100644
index 000000000..998e3a6de
--- /dev/null
+++ b/kernel/fs/hfs/Kconfig
@@ -0,0 +1,12 @@
+config HFS_FS
+	tristate "Apple Macintosh file system support"
+	depends on BLOCK
+	select NLS
+	help
+	  If you say Y here, you will be able to mount Macintosh-formatted
+	  floppy disks and hard drive partitions with full read-write access.
+	  Please read <file:Documentation/filesystems/hfs.txt> to learn about
+	  the available mount options.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called hfs.
diff --git a/kernel/fs/hfs/Makefile b/kernel/fs/hfs/Makefile
new file mode 100644
index 000000000..c41f5a85f
--- /dev/null
+++ b/kernel/fs/hfs/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the Linux hfs filesystem routines.
+#
+
+obj-$(CONFIG_HFS_FS) += hfs.o
+
+hfs-objs := bitmap.o bfind.o bnode.o brec.o btree.o \
+	    catalog.o dir.o extent.o inode.o attr.o mdb.o \
+            part_tbl.o string.o super.o sysdep.o trans.o
+
diff --git a/kernel/fs/hfs/attr.c b/kernel/fs/hfs/attr.c
new file mode 100644
index 000000000..8d931b157
--- /dev/null
+++ b/kernel/fs/hfs/attr.c
@@ -0,0 +1,121 @@
+/*
+ *  linux/fs/hfs/attr.c
+ *
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Export hfs data via xattr
+ */
+
+
+#include <linux/fs.h>
+#include <linux/xattr.h>
+
+#include "hfs_fs.h"
+#include "btree.h"
+
+int hfs_setxattr(struct dentry *dentry, const char *name,
+		 const void *value, size_t size, int flags)
+{
+	struct inode *inode = d_inode(dentry);
+	struct hfs_find_data fd;
+	hfs_cat_rec rec;
+	struct hfs_cat_file *file;
+	int res;
+
+	if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	res = hfs_find_init(HFS_SB(inode->i_sb)->cat_tree, &fd);
+	if (res)
+		return res;
+	fd.search_key->cat = HFS_I(inode)->cat_key;
+	res = hfs_brec_find(&fd);
+	if (res)
+		goto out;
+	hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
+			sizeof(struct hfs_cat_file));
+	file = &rec.file;
+
+	if (!strcmp(name, "hfs.type")) {
+		if (size == 4)
+			memcpy(&file->UsrWds.fdType, value, 4);
+		else
+			res = -ERANGE;
+	} else if (!strcmp(name, "hfs.creator")) {
+		if (size == 4)
+			memcpy(&file->UsrWds.fdCreator, value, 4);
+		else
+			res = -ERANGE;
+	} else
+		res = -EOPNOTSUPP;
+	if (!res)
+		hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
+				sizeof(struct hfs_cat_file));
+out:
+	hfs_find_exit(&fd);
+	return res;
+}
+
+ssize_t hfs_getxattr(struct dentry *dentry, const char *name,
+			 void *value, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct hfs_find_data fd;
+	hfs_cat_rec rec;
+	struct hfs_cat_file *file;
+	ssize_t res = 0;
+
+	if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	if (size) {
+		res = hfs_find_init(HFS_SB(inode->i_sb)->cat_tree, &fd);
+		if (res)
+			return res;
+		fd.search_key->cat = HFS_I(inode)->cat_key;
+		res = hfs_brec_find(&fd);
+		if (res)
+			goto out;
+		hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
+				sizeof(struct hfs_cat_file));
+	}
+	file = &rec.file;
+
+	if (!strcmp(name, "hfs.type")) {
+		if (size >= 4) {
+			memcpy(value, &file->UsrWds.fdType, 4);
+			res = 4;
+		} else
+			res = size ? -ERANGE : 4;
+	} else if (!strcmp(name, "hfs.creator")) {
+		if (size >= 4) {
+			memcpy(value, &file->UsrWds.fdCreator, 4);
+			res = 4;
+		} else
+			res = size ? -ERANGE : 4;
+	} else
+		res = -ENODATA;
+out:
+	if (size)
+		hfs_find_exit(&fd);
+	return res;
+}
+
+#define HFS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type"))
+
+ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	if (!buffer || !size)
+		return HFS_ATTRLIST_SIZE;
+	if (size < HFS_ATTRLIST_SIZE)
+		return -ERANGE;
+	strcpy(buffer, "hfs.type");
+	strcpy(buffer + sizeof("hfs.type"), "hfs.creator");
+
+	return HFS_ATTRLIST_SIZE;
+}
diff --git a/kernel/fs/hfs/bfind.c b/kernel/fs/hfs/bfind.c
new file mode 100644
index 000000000..de69d8a24
--- /dev/null
+++ b/kernel/fs/hfs/bfind.c
@@ -0,0 +1,224 @@
+/*
+ *  linux/fs/hfs/bfind.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Search routines for btrees
+ */
+
+#include <linux/slab.h>
+#include "btree.h"
+
+int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
+{
+	void *ptr;
+
+	fd->tree = tree;
+	fd->bnode = NULL;
+	ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+	fd->search_key = ptr;
+	fd->key = ptr + tree->max_key_len + 2;
+	hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+		tree->cnid, __builtin_return_address(0));
+	mutex_lock(&tree->tree_lock);
+	return 0;
+}
+
+void hfs_find_exit(struct hfs_find_data *fd)
+{
+	hfs_bnode_put(fd->bnode);
+	kfree(fd->search_key);
+	hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
+		fd->tree->cnid, __builtin_return_address(0));
+	mutex_unlock(&fd->tree->tree_lock);
+	fd->tree = NULL;
+}
+
+/* Find the record in bnode that best matches key (not greater than...)*/
+int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
+{
+	int cmpval;
+	u16 off, len, keylen;
+	int rec;
+	int b, e;
+	int res;
+
+	b = 0;
+	e = bnode->num_recs - 1;
+	res = -ENOENT;
+	do {
+		rec = (e + b) / 2;
+		len = hfs_brec_lenoff(bnode, rec, &off);
+		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
+		hfs_bnode_read(bnode, fd->key, off, keylen);
+		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
+		if (!cmpval) {
+			e = rec;
+			res = 0;
+			goto done;
+		}
+		if (cmpval < 0)
+			b = rec + 1;
+		else
+			e = rec - 1;
+	} while (b <= e);
+	if (rec != e && e >= 0) {
+		len = hfs_brec_lenoff(bnode, e, &off);
+		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
+		hfs_bnode_read(bnode, fd->key, off, keylen);
+	}
+done:
+	fd->record = e;
+	fd->keyoffset = off;
+	fd->keylength = keylen;
+	fd->entryoffset = off + keylen;
+	fd->entrylength = len - keylen;
+fail:
+	return res;
+}
+
+/* Traverse a B*Tree from the root to a leaf finding best fit to key */
+/* Return allocated copy of node found, set recnum to best record */
+int hfs_brec_find(struct hfs_find_data *fd)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *bnode;
+	u32 nidx, parent;
+	__be32 data;
+	int height, res;
+
+	tree = fd->tree;
+	if (fd->bnode)
+		hfs_bnode_put(fd->bnode);
+	fd->bnode = NULL;
+	nidx = tree->root;
+	if (!nidx)
+		return -ENOENT;
+	height = tree->depth;
+	res = 0;
+	parent = 0;
+	for (;;) {
+		bnode = hfs_bnode_find(tree, nidx);
+		if (IS_ERR(bnode)) {
+			res = PTR_ERR(bnode);
+			bnode = NULL;
+			break;
+		}
+		if (bnode->height != height)
+			goto invalid;
+		if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF))
+			goto invalid;
+		bnode->parent = parent;
+
+		res = __hfs_brec_find(bnode, fd);
+		if (!height)
+			break;
+		if (fd->record < 0)
+			goto release;
+
+		parent = nidx;
+		hfs_bnode_read(bnode, &data, fd->entryoffset, 4);
+		nidx = be32_to_cpu(data);
+		hfs_bnode_put(bnode);
+	}
+	fd->bnode = bnode;
+	return res;
+
+invalid:
+	pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
+	       height, bnode->height, bnode->type, nidx, parent);
+	res = -EIO;
+release:
+	hfs_bnode_put(bnode);
+	return res;
+}
+
+int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len)
+{
+	int res;
+
+	res = hfs_brec_find(fd);
+	if (res)
+		return res;
+	if (fd->entrylength > rec_len)
+		return -EINVAL;
+	hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength);
+	return 0;
+}
+
+int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *bnode;
+	int idx, res = 0;
+	u16 off, len, keylen;
+
+	bnode = fd->bnode;
+	tree = bnode->tree;
+
+	if (cnt < 0) {
+		cnt = -cnt;
+		while (cnt > fd->record) {
+			cnt -= fd->record + 1;
+			fd->record = bnode->num_recs - 1;
+			idx = bnode->prev;
+			if (!idx) {
+				res = -ENOENT;
+				goto out;
+			}
+			hfs_bnode_put(bnode);
+			bnode = hfs_bnode_find(tree, idx);
+			if (IS_ERR(bnode)) {
+				res = PTR_ERR(bnode);
+				bnode = NULL;
+				goto out;
+			}
+		}
+		fd->record -= cnt;
+	} else {
+		while (cnt >= bnode->num_recs - fd->record) {
+			cnt -= bnode->num_recs - fd->record;
+			fd->record = 0;
+			idx = bnode->next;
+			if (!idx) {
+				res = -ENOENT;
+				goto out;
+			}
+			hfs_bnode_put(bnode);
+			bnode = hfs_bnode_find(tree, idx);
+			if (IS_ERR(bnode)) {
+				res = PTR_ERR(bnode);
+				bnode = NULL;
+				goto out;
+			}
+		}
+		fd->record += cnt;
+	}
+
+	len = hfs_brec_lenoff(bnode, fd->record, &off);
+	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == 0) {
+		res = -EINVAL;
+		goto out;
+	}
+	fd->keyoffset = off;
+	fd->keylength = keylen;
+	fd->entryoffset = off + keylen;
+	fd->entrylength = len - keylen;
+	hfs_bnode_read(bnode, fd->key, off, keylen);
+out:
+	fd->bnode = bnode;
+	return res;
+}
diff --git a/kernel/fs/hfs/bitmap.c b/kernel/fs/hfs/bitmap.c
new file mode 100644
index 000000000..28307bc9e
--- /dev/null
+++ b/kernel/fs/hfs/bitmap.c
@@ -0,0 +1,243 @@
+/*
+ *  linux/fs/hfs/bitmap.c
+ *
+ * Copyright (C) 1996-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * Based on GPLed code Copyright (C) 1995  Michael Dreher
+ *
+ * This file contains the code to modify the volume bitmap:
+ * search/set/clear bits.
+ */
+
+#include "hfs_fs.h"
+
+/*
+ * hfs_find_zero_bit()
+ *
+ * Description:
+ *  Given a block of memory, its length in bits, and a starting bit number,
+ *  determine the number of the first zero bits (in left-to-right ordering)
+ *  in that range.
+ *
+ *  Returns >= 'size' if no zero bits are found in the range.
+ *
+ *  Accesses memory in 32-bit aligned chunks of 32-bits and thus
+ *  may read beyond the 'size'th bit.
+ */
+static u32 hfs_find_set_zero_bits(__be32 *bitmap, u32 size, u32 offset, u32 *max)
+{
+	__be32 *curr, *end;
+	u32 mask, start, len, n;
+	__be32 val;
+	int i;
+
+	len = *max;
+	if (!len)
+		return size;
+
+	curr = bitmap + (offset / 32);
+	end = bitmap + ((size + 31) / 32);
+
+	/* scan the first partial u32 for zero bits */
+	val = *curr;
+	if (~val) {
+		n = be32_to_cpu(val);
+		i = offset % 32;
+		mask = (1U << 31) >> i;
+		for (; i < 32; mask >>= 1, i++) {
+			if (!(n & mask))
+				goto found;
+		}
+	}
+
+	/* scan complete u32s for the first zero bit */
+	while (++curr < end) {
+		val = *curr;
+		if (~val) {
+			n = be32_to_cpu(val);
+			mask = 1 << 31;
+			for (i = 0; i < 32; mask >>= 1, i++) {
+				if (!(n & mask))
+					goto found;
+			}
+		}
+	}
+	return size;
+
+found:
+	start = (curr - bitmap) * 32 + i;
+	if (start >= size)
+		return start;
+	/* do any partial u32 at the start */
+	len = min(size - start, len);
+	while (1) {
+		n |= mask;
+		if (++i >= 32)
+			break;
+		mask >>= 1;
+		if (!--len || n & mask)
+			goto done;
+	}
+	if (!--len)
+		goto done;
+	*curr++ = cpu_to_be32(n);
+	/* do full u32s */
+	while (1) {
+		n = be32_to_cpu(*curr);
+		if (len < 32)
+			break;
+		if (n) {
+			len = 32;
+			break;
+		}
+		*curr++ = cpu_to_be32(0xffffffff);
+		len -= 32;
+	}
+	/* do any partial u32 at end */
+	mask = 1U << 31;
+	for (i = 0; i < len; i++) {
+		if (n & mask)
+			break;
+		n |= mask;
+		mask >>= 1;
+	}
+done:
+	*curr = cpu_to_be32(n);
+	*max = (curr - bitmap) * 32 + i - start;
+	return start;
+}
+
+/*
+ * hfs_vbm_search_free()
+ *
+ * Description:
+ *   Search for 'num_bits' consecutive cleared bits in the bitmap blocks of
+ *   the hfs MDB. 'mdb' had better be locked or the returned range
+ *   may be no longer free, when this functions returns!
+ *   XXX Currently the search starts from bit 0, but it should start with
+ *   the bit number stored in 's_alloc_ptr' of the MDB.
+ * Input Variable(s):
+ *   struct hfs_mdb *mdb: Pointer to the hfs MDB
+ *   u16 *num_bits: Pointer to the number of cleared bits
+ *     to search for
+ * Output Variable(s):
+ *   u16 *num_bits: The number of consecutive clear bits of the
+ *     returned range. If the bitmap is fragmented, this will be less than
+ *     requested and it will be zero, when the disk is full.
+ * Returns:
+ *   The number of the first bit of the range of cleared bits which has been
+ *   found. When 'num_bits' is zero, this is invalid!
+ * Preconditions:
+ *   'mdb' points to a "valid" (struct hfs_mdb).
+ *   'num_bits' points to a variable of type (u16), which contains
+ *	the number of cleared bits to find.
+ * Postconditions:
+ *   'num_bits' is set to the length of the found sequence.
+ */
+u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
+{
+	void *bitmap;
+	u32 pos;
+
+	/* make sure we have actual work to perform */
+	if (!*num_bits)
+		return 0;
+
+	mutex_lock(&HFS_SB(sb)->bitmap_lock);
+	bitmap = HFS_SB(sb)->bitmap;
+
+	pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits);
+	if (pos >= HFS_SB(sb)->fs_ablocks) {
+		if (goal)
+			pos = hfs_find_set_zero_bits(bitmap, goal, 0, num_bits);
+		if (pos >= HFS_SB(sb)->fs_ablocks) {
+			*num_bits = pos = 0;
+			goto out;
+		}
+	}
+
+	hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits);
+	HFS_SB(sb)->free_ablocks -= *num_bits;
+	hfs_bitmap_dirty(sb);
+out:
+	mutex_unlock(&HFS_SB(sb)->bitmap_lock);
+	return pos;
+}
+
+
+/*
+ * hfs_clear_vbm_bits()
+ *
+ * Description:
+ *   Clear the requested bits in the volume bitmap of the hfs filesystem
+ * Input Variable(s):
+ *   struct hfs_mdb *mdb: Pointer to the hfs MDB
+ *   u16 start: The offset of the first bit
+ *   u16 count: The number of bits
+ * Output Variable(s):
+ *   None
+ * Returns:
+ *    0: no error
+ *   -1: One of the bits was already clear.  This is a strange
+ *	 error and when it happens, the filesystem must be repaired!
+ *   -2: One or more of the bits are out of range of the bitmap.
+ * Preconditions:
+ *   'mdb' points to a "valid" (struct hfs_mdb).
+ * Postconditions:
+ *   Starting with bit number 'start', 'count' bits in the volume bitmap
+ *   are cleared. The affected bitmap blocks are marked "dirty", the free
+ *   block count of the MDB is updated and the MDB is marked dirty.
+ */
+int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
+{
+	__be32 *curr;
+	u32 mask;
+	int i, len;
+
+	/* is there any actual work to be done? */
+	if (!count)
+		return 0;
+
+	hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count);
+	/* are all of the bits in range? */
+	if ((start + count) > HFS_SB(sb)->fs_ablocks)
+		return -2;
+
+	mutex_lock(&HFS_SB(sb)->bitmap_lock);
+	/* bitmap is always on a 32-bit boundary */
+	curr = HFS_SB(sb)->bitmap + (start / 32);
+	len = count;
+
+	/* do any partial u32 at the start */
+	i = start % 32;
+	if (i) {
+		int j = 32 - i;
+		mask = 0xffffffffU << j;
+		if (j > count) {
+			mask |= 0xffffffffU >> (i + count);
+			*curr &= cpu_to_be32(mask);
+			goto out;
+		}
+		*curr++ &= cpu_to_be32(mask);
+		count -= j;
+	}
+
+	/* do full u32s */
+	while (count >= 32) {
+		*curr++ = 0;
+		count -= 32;
+	}
+	/* do any partial u32 at end */
+	if (count) {
+		mask = 0xffffffffU >> count;
+		*curr &= cpu_to_be32(mask);
+	}
+out:
+	HFS_SB(sb)->free_ablocks += len;
+	mutex_unlock(&HFS_SB(sb)->bitmap_lock);
+	hfs_bitmap_dirty(sb);
+
+	return 0;
+}
diff --git a/kernel/fs/hfs/bnode.c b/kernel/fs/hfs/bnode.c
new file mode 100644
index 000000000..d3fa6bd95
--- /dev/null
+++ b/kernel/fs/hfs/bnode.c
@@ -0,0 +1,485 @@
+/*
+ *  linux/fs/hfs/bnode.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handle basic btree node operations
+ */
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+
+#include "btree.h"
+
+void hfs_bnode_read(struct hfs_bnode *node, void *buf,
+		int off, int len)
+{
+	struct page *page;
+
+	off += node->page_offset;
+	page = node->page[0];
+
+	memcpy(buf, kmap(page) + off, len);
+	kunmap(page);
+}
+
+u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
+{
+	__be16 data;
+	// optimize later...
+	hfs_bnode_read(node, &data, off, 2);
+	return be16_to_cpu(data);
+}
+
+u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
+{
+	u8 data;
+	// optimize later...
+	hfs_bnode_read(node, &data, off, 1);
+	return data;
+}
+
+void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
+{
+	struct hfs_btree *tree;
+	int key_len;
+
+	tree = node->tree;
+	if (node->type == HFS_NODE_LEAF ||
+	    tree->attributes & HFS_TREE_VARIDXKEYS)
+		key_len = hfs_bnode_read_u8(node, off) + 1;
+	else
+		key_len = tree->max_key_len + 1;
+
+	hfs_bnode_read(node, key, off, key_len);
+}
+
+void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
+{
+	struct page *page;
+
+	off += node->page_offset;
+	page = node->page[0];
+
+	memcpy(kmap(page) + off, buf, len);
+	kunmap(page);
+	set_page_dirty(page);
+}
+
+void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
+{
+	__be16 v = cpu_to_be16(data);
+	// optimize later...
+	hfs_bnode_write(node, &v, off, 2);
+}
+
+void hfs_bnode_write_u8(struct hfs_bnode *node, int off, u8 data)
+{
+	// optimize later...
+	hfs_bnode_write(node, &data, off, 1);
+}
+
+void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
+{
+	struct page *page;
+
+	off += node->page_offset;
+	page = node->page[0];
+
+	memset(kmap(page) + off, 0, len);
+	kunmap(page);
+	set_page_dirty(page);
+}
+
+void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
+		struct hfs_bnode *src_node, int src, int len)
+{
+	struct hfs_btree *tree;
+	struct page *src_page, *dst_page;
+
+	hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+	if (!len)
+		return;
+	tree = src_node->tree;
+	src += src_node->page_offset;
+	dst += dst_node->page_offset;
+	src_page = src_node->page[0];
+	dst_page = dst_node->page[0];
+
+	memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len);
+	kunmap(src_page);
+	kunmap(dst_page);
+	set_page_dirty(dst_page);
+}
+
+void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
+{
+	struct page *page;
+	void *ptr;
+
+	hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+	if (!len)
+		return;
+	src += node->page_offset;
+	dst += node->page_offset;
+	page = node->page[0];
+	ptr = kmap(page);
+	memmove(ptr + dst, ptr + src, len);
+	kunmap(page);
+	set_page_dirty(page);
+}
+
+void hfs_bnode_dump(struct hfs_bnode *node)
+{
+	struct hfs_bnode_desc desc;
+	__be32 cnid;
+	int i, off, key_off;
+
+	hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
+	hfs_bnode_read(node, &desc, 0, sizeof(desc));
+	hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
+		be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
+		desc.type, desc.height, be16_to_cpu(desc.num_recs));
+
+	off = node->tree->node_size - 2;
+	for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
+		key_off = hfs_bnode_read_u16(node, off);
+		hfs_dbg_cont(BNODE_MOD, " %d", key_off);
+		if (i && node->type == HFS_NODE_INDEX) {
+			int tmp;
+
+			if (node->tree->attributes & HFS_TREE_VARIDXKEYS)
+				tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1;
+			else
+				tmp = node->tree->max_key_len + 1;
+			hfs_dbg_cont(BNODE_MOD, " (%d,%d",
+				     tmp, hfs_bnode_read_u8(node, key_off));
+			hfs_bnode_read(node, &cnid, key_off + tmp, 4);
+			hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+		} else if (i && node->type == HFS_NODE_LEAF) {
+			int tmp;
+
+			tmp = hfs_bnode_read_u8(node, key_off);
+			hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
+		}
+	}
+	hfs_dbg_cont(BNODE_MOD, "\n");
+}
+
+void hfs_bnode_unlink(struct hfs_bnode *node)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *tmp;
+	__be32 cnid;
+
+	tree = node->tree;
+	if (node->prev) {
+		tmp = hfs_bnode_find(tree, node->prev);
+		if (IS_ERR(tmp))
+			return;
+		tmp->next = node->next;
+		cnid = cpu_to_be32(tmp->next);
+		hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
+		hfs_bnode_put(tmp);
+	} else if (node->type == HFS_NODE_LEAF)
+		tree->leaf_head = node->next;
+
+	if (node->next) {
+		tmp = hfs_bnode_find(tree, node->next);
+		if (IS_ERR(tmp))
+			return;
+		tmp->prev = node->prev;
+		cnid = cpu_to_be32(tmp->prev);
+		hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4);
+		hfs_bnode_put(tmp);
+	} else if (node->type == HFS_NODE_LEAF)
+		tree->leaf_tail = node->prev;
+
+	// move down?
+	if (!node->prev && !node->next) {
+		printk(KERN_DEBUG "hfs_btree_del_level\n");
+	}
+	if (!node->parent) {
+		tree->root = 0;
+		tree->depth = 0;
+	}
+	set_bit(HFS_BNODE_DELETED, &node->flags);
+}
+
+static inline int hfs_bnode_hash(u32 num)
+{
+	num = (num >> 16) + num;
+	num += num >> 8;
+	return num & (NODE_HASH_SIZE - 1);
+}
+
+struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
+{
+	struct hfs_bnode *node;
+
+	if (cnid >= tree->node_count) {
+		pr_err("request for non-existent node %d in B*Tree\n", cnid);
+		return NULL;
+	}
+
+	for (node = tree->node_hash[hfs_bnode_hash(cnid)];
+	     node; node = node->next_hash) {
+		if (node->this == cnid) {
+			return node;
+		}
+	}
+	return NULL;
+}
+
+static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
+{
+	struct super_block *sb;
+	struct hfs_bnode *node, *node2;
+	struct address_space *mapping;
+	struct page *page;
+	int size, block, i, hash;
+	loff_t off;
+
+	if (cnid >= tree->node_count) {
+		pr_err("request for non-existent node %d in B*Tree\n", cnid);
+		return NULL;
+	}
+
+	sb = tree->inode->i_sb;
+	size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
+		sizeof(struct page *);
+	node = kzalloc(size, GFP_KERNEL);
+	if (!node)
+		return NULL;
+	node->tree = tree;
+	node->this = cnid;
+	set_bit(HFS_BNODE_NEW, &node->flags);
+	atomic_set(&node->refcnt, 1);
+	hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+		node->tree->cnid, node->this);
+	init_waitqueue_head(&node->lock_wq);
+	spin_lock(&tree->hash_lock);
+	node2 = hfs_bnode_findhash(tree, cnid);
+	if (!node2) {
+		hash = hfs_bnode_hash(cnid);
+		node->next_hash = tree->node_hash[hash];
+		tree->node_hash[hash] = node;
+		tree->node_hash_cnt++;
+	} else {
+		spin_unlock(&tree->hash_lock);
+		kfree(node);
+		wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags));
+		return node2;
+	}
+	spin_unlock(&tree->hash_lock);
+
+	mapping = tree->inode->i_mapping;
+	off = (loff_t)cnid * tree->node_size;
+	block = off >> PAGE_CACHE_SHIFT;
+	node->page_offset = off & ~PAGE_CACHE_MASK;
+	for (i = 0; i < tree->pages_per_bnode; i++) {
+		page = read_mapping_page(mapping, block++, NULL);
+		if (IS_ERR(page))
+			goto fail;
+		if (PageError(page)) {
+			page_cache_release(page);
+			goto fail;
+		}
+		page_cache_release(page);
+		node->page[i] = page;
+	}
+
+	return node;
+fail:
+	set_bit(HFS_BNODE_ERROR, &node->flags);
+	return node;
+}
+
+void hfs_bnode_unhash(struct hfs_bnode *node)
+{
+	struct hfs_bnode **p;
+
+	hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
+		node->tree->cnid, node->this, atomic_read(&node->refcnt));
+	for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
+	     *p && *p != node; p = &(*p)->next_hash)
+		;
+	BUG_ON(!*p);
+	*p = node->next_hash;
+	node->tree->node_hash_cnt--;
+}
+
+/* Load a particular node out of a tree */
+struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
+{
+	struct hfs_bnode *node;
+	struct hfs_bnode_desc *desc;
+	int i, rec_off, off, next_off;
+	int entry_size, key_size;
+
+	spin_lock(&tree->hash_lock);
+	node = hfs_bnode_findhash(tree, num);
+	if (node) {
+		hfs_bnode_get(node);
+		spin_unlock(&tree->hash_lock);
+		wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags));
+		if (test_bit(HFS_BNODE_ERROR, &node->flags))
+			goto node_error;
+		return node;
+	}
+	spin_unlock(&tree->hash_lock);
+	node = __hfs_bnode_create(tree, num);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+	if (test_bit(HFS_BNODE_ERROR, &node->flags))
+		goto node_error;
+	if (!test_bit(HFS_BNODE_NEW, &node->flags))
+		return node;
+
+	desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset);
+	node->prev = be32_to_cpu(desc->prev);
+	node->next = be32_to_cpu(desc->next);
+	node->num_recs = be16_to_cpu(desc->num_recs);
+	node->type = desc->type;
+	node->height = desc->height;
+	kunmap(node->page[0]);
+
+	switch (node->type) {
+	case HFS_NODE_HEADER:
+	case HFS_NODE_MAP:
+		if (node->height != 0)
+			goto node_error;
+		break;
+	case HFS_NODE_LEAF:
+		if (node->height != 1)
+			goto node_error;
+		break;
+	case HFS_NODE_INDEX:
+		if (node->height <= 1 || node->height > tree->depth)
+			goto node_error;
+		break;
+	default:
+		goto node_error;
+	}
+
+	rec_off = tree->node_size - 2;
+	off = hfs_bnode_read_u16(node, rec_off);
+	if (off != sizeof(struct hfs_bnode_desc))
+		goto node_error;
+	for (i = 1; i <= node->num_recs; off = next_off, i++) {
+		rec_off -= 2;
+		next_off = hfs_bnode_read_u16(node, rec_off);
+		if (next_off <= off ||
+		    next_off > tree->node_size ||
+		    next_off & 1)
+			goto node_error;
+		entry_size = next_off - off;
+		if (node->type != HFS_NODE_INDEX &&
+		    node->type != HFS_NODE_LEAF)
+			continue;
+		key_size = hfs_bnode_read_u8(node, off) + 1;
+		if (key_size >= entry_size /*|| key_size & 1*/)
+			goto node_error;
+	}
+	clear_bit(HFS_BNODE_NEW, &node->flags);
+	wake_up(&node->lock_wq);
+	return node;
+
+node_error:
+	set_bit(HFS_BNODE_ERROR, &node->flags);
+	clear_bit(HFS_BNODE_NEW, &node->flags);
+	wake_up(&node->lock_wq);
+	hfs_bnode_put(node);
+	return ERR_PTR(-EIO);
+}
+
+void hfs_bnode_free(struct hfs_bnode *node)
+{
+	//int i;
+
+	//for (i = 0; i < node->tree->pages_per_bnode; i++)
+	//	if (node->page[i])
+	//		page_cache_release(node->page[i]);
+	kfree(node);
+}
+
+struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
+{
+	struct hfs_bnode *node;
+	struct page **pagep;
+	int i;
+
+	spin_lock(&tree->hash_lock);
+	node = hfs_bnode_findhash(tree, num);
+	spin_unlock(&tree->hash_lock);
+	if (node) {
+		pr_crit("new node %u already hashed?\n", num);
+		WARN_ON(1);
+		return node;
+	}
+	node = __hfs_bnode_create(tree, num);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+	if (test_bit(HFS_BNODE_ERROR, &node->flags)) {
+		hfs_bnode_put(node);
+		return ERR_PTR(-EIO);
+	}
+
+	pagep = node->page;
+	memset(kmap(*pagep) + node->page_offset, 0,
+	       min((int)PAGE_CACHE_SIZE, (int)tree->node_size));
+	set_page_dirty(*pagep);
+	kunmap(*pagep);
+	for (i = 1; i < tree->pages_per_bnode; i++) {
+		memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE);
+		set_page_dirty(*pagep);
+		kunmap(*pagep);
+	}
+	clear_bit(HFS_BNODE_NEW, &node->flags);
+	wake_up(&node->lock_wq);
+
+	return node;
+}
+
+void hfs_bnode_get(struct hfs_bnode *node)
+{
+	if (node) {
+		atomic_inc(&node->refcnt);
+		hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
+	}
+}
+
+/* Dispose of resources used by a node */
+void hfs_bnode_put(struct hfs_bnode *node)
+{
+	if (node) {
+		struct hfs_btree *tree = node->tree;
+		int i;
+
+		hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
+		BUG_ON(!atomic_read(&node->refcnt));
+		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
+			return;
+		for (i = 0; i < tree->pages_per_bnode; i++) {
+			if (!node->page[i])
+				continue;
+			mark_page_accessed(node->page[i]);
+		}
+
+		if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
+			hfs_bnode_unhash(node);
+			spin_unlock(&tree->hash_lock);
+			hfs_bmap_free(node);
+			hfs_bnode_free(node);
+			return;
+		}
+		spin_unlock(&tree->hash_lock);
+	}
+}
diff --git a/kernel/fs/hfs/brec.c b/kernel/fs/hfs/brec.c
new file mode 100644
index 000000000..9f4ee7f52
--- /dev/null
+++ b/kernel/fs/hfs/brec.c
@@ -0,0 +1,520 @@
+/*
+ *  linux/fs/hfs/brec.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handle individual btree records
+ */
+
+#include "btree.h"
+
+static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd);
+static int hfs_brec_update_parent(struct hfs_find_data *fd);
+static int hfs_btree_inc_height(struct hfs_btree *tree);
+
+/* Get the length and offset of the given record in the given node */
+u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off)
+{
+	__be16 retval[2];
+	u16 dataoff;
+
+	dataoff = node->tree->node_size - (rec + 2) * 2;
+	hfs_bnode_read(node, retval, dataoff, 4);
+	*off = be16_to_cpu(retval[1]);
+	return be16_to_cpu(retval[0]) - *off;
+}
+
+/* Get the length of the key from a keyed record */
+u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
+{
+	u16 retval, recoff;
+
+	if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF)
+		return 0;
+
+	if ((node->type == HFS_NODE_INDEX) &&
+	   !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
+		if (node->tree->attributes & HFS_TREE_BIGKEYS)
+			retval = node->tree->max_key_len + 2;
+		else
+			retval = node->tree->max_key_len + 1;
+	} else {
+		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
+		if (!recoff)
+			return 0;
+		if (node->tree->attributes & HFS_TREE_BIGKEYS) {
+			retval = hfs_bnode_read_u16(node, recoff) + 2;
+			if (retval > node->tree->max_key_len + 2) {
+				pr_err("keylen %d too large\n", retval);
+				retval = 0;
+			}
+		} else {
+			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+			if (retval > node->tree->max_key_len + 1) {
+				pr_err("keylen %d too large\n", retval);
+				retval = 0;
+			}
+		}
+	}
+	return retval;
+}
+
+int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *node, *new_node;
+	int size, key_len, rec;
+	int data_off, end_off;
+	int idx_rec_off, data_rec_off, end_rec_off;
+	__be32 cnid;
+
+	tree = fd->tree;
+	if (!fd->bnode) {
+		if (!tree->root)
+			hfs_btree_inc_height(tree);
+		fd->bnode = hfs_bnode_find(tree, tree->leaf_head);
+		if (IS_ERR(fd->bnode))
+			return PTR_ERR(fd->bnode);
+		fd->record = -1;
+	}
+	new_node = NULL;
+	key_len = (fd->search_key->key_len | 1) + 1;
+again:
+	/* new record idx and complete record size */
+	rec = fd->record + 1;
+	size = key_len + entry_len;
+
+	node = fd->bnode;
+	hfs_bnode_dump(node);
+	/* get last offset */
+	end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
+	end_off = hfs_bnode_read_u16(node, end_rec_off);
+	end_rec_off -= 2;
+	hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+		rec, size, end_off, end_rec_off);
+	if (size > end_rec_off - end_off) {
+		if (new_node)
+			panic("not enough room!\n");
+		new_node = hfs_bnode_split(fd);
+		if (IS_ERR(new_node))
+			return PTR_ERR(new_node);
+		goto again;
+	}
+	if (node->type == HFS_NODE_LEAF) {
+		tree->leaf_count++;
+		mark_inode_dirty(tree->inode);
+	}
+	node->num_recs++;
+	/* write new last offset */
+	hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+	hfs_bnode_write_u16(node, end_rec_off, end_off + size);
+	data_off = end_off;
+	data_rec_off = end_rec_off + 2;
+	idx_rec_off = tree->node_size - (rec + 1) * 2;
+	if (idx_rec_off == data_rec_off)
+		goto skip;
+	/* move all following entries */
+	do {
+		data_off = hfs_bnode_read_u16(node, data_rec_off + 2);
+		hfs_bnode_write_u16(node, data_rec_off, data_off + size);
+		data_rec_off += 2;
+	} while (data_rec_off < idx_rec_off);
+
+	/* move data away */
+	hfs_bnode_move(node, data_off + size, data_off,
+		       end_off - data_off);
+
+skip:
+	hfs_bnode_write(node, fd->search_key, data_off, key_len);
+	hfs_bnode_write(node, entry, data_off + key_len, entry_len);
+	hfs_bnode_dump(node);
+
+	if (new_node) {
+		/* update parent key if we inserted a key
+		 * at the start of the first node
+		 */
+		if (!rec && new_node != node)
+			hfs_brec_update_parent(fd);
+
+		hfs_bnode_put(fd->bnode);
+		if (!new_node->parent) {
+			hfs_btree_inc_height(tree);
+			new_node->parent = tree->root;
+		}
+		fd->bnode = hfs_bnode_find(tree, new_node->parent);
+
+		/* create index data entry */
+		cnid = cpu_to_be32(new_node->this);
+		entry = &cnid;
+		entry_len = sizeof(cnid);
+
+		/* get index key */
+		hfs_bnode_read_key(new_node, fd->search_key, 14);
+		__hfs_brec_find(fd->bnode, fd);
+
+		hfs_bnode_put(new_node);
+		new_node = NULL;
+
+		if (tree->attributes & HFS_TREE_VARIDXKEYS)
+			key_len = fd->search_key->key_len + 1;
+		else {
+			fd->search_key->key_len = tree->max_key_len;
+			key_len = tree->max_key_len + 1;
+		}
+		goto again;
+	}
+
+	if (!rec)
+		hfs_brec_update_parent(fd);
+
+	return 0;
+}
+
+int hfs_brec_remove(struct hfs_find_data *fd)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *node, *parent;
+	int end_off, rec_off, data_off, size;
+
+	tree = fd->tree;
+	node = fd->bnode;
+again:
+	rec_off = tree->node_size - (fd->record + 2) * 2;
+	end_off = tree->node_size - (node->num_recs + 1) * 2;
+
+	if (node->type == HFS_NODE_LEAF) {
+		tree->leaf_count--;
+		mark_inode_dirty(tree->inode);
+	}
+	hfs_bnode_dump(node);
+	hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
+		fd->record, fd->keylength + fd->entrylength);
+	if (!--node->num_recs) {
+		hfs_bnode_unlink(node);
+		if (!node->parent)
+			return 0;
+		parent = hfs_bnode_find(tree, node->parent);
+		if (IS_ERR(parent))
+			return PTR_ERR(parent);
+		hfs_bnode_put(node);
+		node = fd->bnode = parent;
+
+		__hfs_brec_find(node, fd);
+		goto again;
+	}
+	hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+
+	if (rec_off == end_off)
+		goto skip;
+	size = fd->keylength + fd->entrylength;
+
+	do {
+		data_off = hfs_bnode_read_u16(node, rec_off);
+		hfs_bnode_write_u16(node, rec_off + 2, data_off - size);
+		rec_off -= 2;
+	} while (rec_off >= end_off);
+
+	/* fill hole */
+	hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size,
+		       data_off - fd->keyoffset - size);
+skip:
+	hfs_bnode_dump(node);
+	if (!fd->record)
+		hfs_brec_update_parent(fd);
+	return 0;
+}
+
+static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *node, *new_node, *next_node;
+	struct hfs_bnode_desc node_desc;
+	int num_recs, new_rec_off, new_off, old_rec_off;
+	int data_start, data_end, size;
+
+	tree = fd->tree;
+	node = fd->bnode;
+	new_node = hfs_bmap_alloc(tree);
+	if (IS_ERR(new_node))
+		return new_node;
+	hfs_bnode_get(node);
+	hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
+		node->this, new_node->this, node->next);
+	new_node->next = node->next;
+	new_node->prev = node->this;
+	new_node->parent = node->parent;
+	new_node->type = node->type;
+	new_node->height = node->height;
+
+	if (node->next)
+		next_node = hfs_bnode_find(tree, node->next);
+	else
+		next_node = NULL;
+
+	if (IS_ERR(next_node)) {
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		return next_node;
+	}
+
+	size = tree->node_size / 2 - node->num_recs * 2 - 14;
+	old_rec_off = tree->node_size - 4;
+	num_recs = 1;
+	for (;;) {
+		data_start = hfs_bnode_read_u16(node, old_rec_off);
+		if (data_start > size)
+			break;
+		old_rec_off -= 2;
+		if (++num_recs < node->num_recs)
+			continue;
+		/* panic? */
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		if (next_node)
+			hfs_bnode_put(next_node);
+		return ERR_PTR(-ENOSPC);
+	}
+
+	if (fd->record + 1 < num_recs) {
+		/* new record is in the lower half,
+		 * so leave some more space there
+		 */
+		old_rec_off += 2;
+		num_recs--;
+		data_start = hfs_bnode_read_u16(node, old_rec_off);
+	} else {
+		hfs_bnode_put(node);
+		hfs_bnode_get(new_node);
+		fd->bnode = new_node;
+		fd->record -= num_recs;
+		fd->keyoffset -= data_start - 14;
+		fd->entryoffset -= data_start - 14;
+	}
+	new_node->num_recs = node->num_recs - num_recs;
+	node->num_recs = num_recs;
+
+	new_rec_off = tree->node_size - 2;
+	new_off = 14;
+	size = data_start - new_off;
+	num_recs = new_node->num_recs;
+	data_end = data_start;
+	while (num_recs) {
+		hfs_bnode_write_u16(new_node, new_rec_off, new_off);
+		old_rec_off -= 2;
+		new_rec_off -= 2;
+		data_end = hfs_bnode_read_u16(node, old_rec_off);
+		new_off = data_end - size;
+		num_recs--;
+	}
+	hfs_bnode_write_u16(new_node, new_rec_off, new_off);
+	hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start);
+
+	/* update new bnode header */
+	node_desc.next = cpu_to_be32(new_node->next);
+	node_desc.prev = cpu_to_be32(new_node->prev);
+	node_desc.type = new_node->type;
+	node_desc.height = new_node->height;
+	node_desc.num_recs = cpu_to_be16(new_node->num_recs);
+	node_desc.reserved = 0;
+	hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc));
+
+	/* update previous bnode header */
+	node->next = new_node->this;
+	hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc));
+	node_desc.next = cpu_to_be32(node->next);
+	node_desc.num_recs = cpu_to_be16(node->num_recs);
+	hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
+
+	/* update next bnode header */
+	if (next_node) {
+		next_node->prev = new_node->this;
+		hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
+		node_desc.prev = cpu_to_be32(next_node->prev);
+		hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc));
+		hfs_bnode_put(next_node);
+	} else if (node->this == tree->leaf_tail) {
+		/* if there is no next node, this might be the new tail */
+		tree->leaf_tail = new_node->this;
+		mark_inode_dirty(tree->inode);
+	}
+
+	hfs_bnode_dump(node);
+	hfs_bnode_dump(new_node);
+	hfs_bnode_put(node);
+
+	return new_node;
+}
+
+static int hfs_brec_update_parent(struct hfs_find_data *fd)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *node, *new_node, *parent;
+	int newkeylen, diff;
+	int rec, rec_off, end_rec_off;
+	int start_off, end_off;
+
+	tree = fd->tree;
+	node = fd->bnode;
+	new_node = NULL;
+	if (!node->parent)
+		return 0;
+
+again:
+	parent = hfs_bnode_find(tree, node->parent);
+	if (IS_ERR(parent))
+		return PTR_ERR(parent);
+	__hfs_brec_find(parent, fd);
+	hfs_bnode_dump(parent);
+	rec = fd->record;
+
+	/* size difference between old and new key */
+	if (tree->attributes & HFS_TREE_VARIDXKEYS)
+		newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1;
+	else
+		fd->keylength = newkeylen = tree->max_key_len + 1;
+	hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
+		rec, fd->keylength, newkeylen);
+
+	rec_off = tree->node_size - (rec + 2) * 2;
+	end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
+	diff = newkeylen - fd->keylength;
+	if (!diff)
+		goto skip;
+	if (diff > 0) {
+		end_off = hfs_bnode_read_u16(parent, end_rec_off);
+		if (end_rec_off - end_off < diff) {
+
+			printk(KERN_DEBUG "splitting index node...\n");
+			fd->bnode = parent;
+			new_node = hfs_bnode_split(fd);
+			if (IS_ERR(new_node))
+				return PTR_ERR(new_node);
+			parent = fd->bnode;
+			rec = fd->record;
+			rec_off = tree->node_size - (rec + 2) * 2;
+			end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
+		}
+	}
+
+	end_off = start_off = hfs_bnode_read_u16(parent, rec_off);
+	hfs_bnode_write_u16(parent, rec_off, start_off + diff);
+	start_off -= 4;	/* move previous cnid too */
+
+	while (rec_off > end_rec_off) {
+		rec_off -= 2;
+		end_off = hfs_bnode_read_u16(parent, rec_off);
+		hfs_bnode_write_u16(parent, rec_off, end_off + diff);
+	}
+	hfs_bnode_move(parent, start_off + diff, start_off,
+		       end_off - start_off);
+skip:
+	hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen);
+	if (!(tree->attributes & HFS_TREE_VARIDXKEYS))
+		hfs_bnode_write_u8(parent, fd->keyoffset, newkeylen - 1);
+	hfs_bnode_dump(parent);
+
+	hfs_bnode_put(node);
+	node = parent;
+
+	if (new_node) {
+		__be32 cnid;
+
+		fd->bnode = hfs_bnode_find(tree, new_node->parent);
+		/* create index key and entry */
+		hfs_bnode_read_key(new_node, fd->search_key, 14);
+		cnid = cpu_to_be32(new_node->this);
+
+		__hfs_brec_find(fd->bnode, fd);
+		hfs_brec_insert(fd, &cnid, sizeof(cnid));
+		hfs_bnode_put(fd->bnode);
+		hfs_bnode_put(new_node);
+
+		if (!rec) {
+			if (new_node == node)
+				goto out;
+			/* restore search_key */
+			hfs_bnode_read_key(node, fd->search_key, 14);
+		}
+	}
+
+	if (!rec && node->parent)
+		goto again;
+out:
+	fd->bnode = node;
+	return 0;
+}
+
+static int hfs_btree_inc_height(struct hfs_btree *tree)
+{
+	struct hfs_bnode *node, *new_node;
+	struct hfs_bnode_desc node_desc;
+	int key_size, rec;
+	__be32 cnid;
+
+	node = NULL;
+	if (tree->root) {
+		node = hfs_bnode_find(tree, tree->root);
+		if (IS_ERR(node))
+			return PTR_ERR(node);
+	}
+	new_node = hfs_bmap_alloc(tree);
+	if (IS_ERR(new_node)) {
+		hfs_bnode_put(node);
+		return PTR_ERR(new_node);
+	}
+
+	tree->root = new_node->this;
+	if (!tree->depth) {
+		tree->leaf_head = tree->leaf_tail = new_node->this;
+		new_node->type = HFS_NODE_LEAF;
+		new_node->num_recs = 0;
+	} else {
+		new_node->type = HFS_NODE_INDEX;
+		new_node->num_recs = 1;
+	}
+	new_node->parent = 0;
+	new_node->next = 0;
+	new_node->prev = 0;
+	new_node->height = ++tree->depth;
+
+	node_desc.next = cpu_to_be32(new_node->next);
+	node_desc.prev = cpu_to_be32(new_node->prev);
+	node_desc.type = new_node->type;
+	node_desc.height = new_node->height;
+	node_desc.num_recs = cpu_to_be16(new_node->num_recs);
+	node_desc.reserved = 0;
+	hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc));
+
+	rec = tree->node_size - 2;
+	hfs_bnode_write_u16(new_node, rec, 14);
+
+	if (node) {
+		/* insert old root idx into new root */
+		node->parent = tree->root;
+		if (node->type == HFS_NODE_LEAF ||
+		    tree->attributes & HFS_TREE_VARIDXKEYS)
+			key_size = hfs_bnode_read_u8(node, 14) + 1;
+		else
+			key_size = tree->max_key_len + 1;
+		hfs_bnode_copy(new_node, 14, node, 14, key_size);
+
+		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+			key_size = tree->max_key_len + 1;
+			hfs_bnode_write_u8(new_node, 14, tree->max_key_len);
+		}
+		key_size = (key_size + 1) & -2;
+		cnid = cpu_to_be32(node->this);
+		hfs_bnode_write(new_node, &cnid, 14 + key_size, 4);
+
+		rec -= 2;
+		hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4);
+
+		hfs_bnode_put(node);
+	}
+	hfs_bnode_put(new_node);
+	mark_inode_dirty(tree->inode);
+
+	return 0;
+}
diff --git a/kernel/fs/hfs/btree.c b/kernel/fs/hfs/btree.c
new file mode 100644
index 000000000..1ab19e660
--- /dev/null
+++ b/kernel/fs/hfs/btree.c
@@ -0,0 +1,369 @@
+/*
+ *  linux/fs/hfs/btree.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handle opening/closing btree
+ */
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+
+#include "btree.h"
+
+/* Get a reference to a B*Tree and do some initial checks */
+struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp keycmp)
+{
+	struct hfs_btree *tree;
+	struct hfs_btree_header_rec *head;
+	struct address_space *mapping;
+	struct page *page;
+	unsigned int size;
+
+	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
+	if (!tree)
+		return NULL;
+
+	mutex_init(&tree->tree_lock);
+	spin_lock_init(&tree->hash_lock);
+	/* Set the correct compare function */
+	tree->sb = sb;
+	tree->cnid = id;
+	tree->keycmp = keycmp;
+
+	tree->inode = iget_locked(sb, id);
+	if (!tree->inode)
+		goto free_tree;
+	BUG_ON(!(tree->inode->i_state & I_NEW));
+	{
+	struct hfs_mdb *mdb = HFS_SB(sb)->mdb;
+	HFS_I(tree->inode)->flags = 0;
+	mutex_init(&HFS_I(tree->inode)->extents_lock);
+	switch (id) {
+	case HFS_EXT_CNID:
+		hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize,
+				    mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz));
+		if (HFS_I(tree->inode)->alloc_blocks >
+					HFS_I(tree->inode)->first_blocks) {
+			pr_err("invalid btree extent records\n");
+			unlock_new_inode(tree->inode);
+			goto free_inode;
+		}
+
+		tree->inode->i_mapping->a_ops = &hfs_btree_aops;
+		break;
+	case HFS_CAT_CNID:
+		hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize,
+				    mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz));
+
+		if (!HFS_I(tree->inode)->first_blocks) {
+			pr_err("invalid btree extent records (0 size)\n");
+			unlock_new_inode(tree->inode);
+			goto free_inode;
+		}
+
+		tree->inode->i_mapping->a_ops = &hfs_btree_aops;
+		break;
+	default:
+		BUG();
+	}
+	}
+	unlock_new_inode(tree->inode);
+
+	mapping = tree->inode->i_mapping;
+	page = read_mapping_page(mapping, 0, NULL);
+	if (IS_ERR(page))
+		goto free_inode;
+
+	/* Load the header */
+	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+	tree->root = be32_to_cpu(head->root);
+	tree->leaf_count = be32_to_cpu(head->leaf_count);
+	tree->leaf_head = be32_to_cpu(head->leaf_head);
+	tree->leaf_tail = be32_to_cpu(head->leaf_tail);
+	tree->node_count = be32_to_cpu(head->node_count);
+	tree->free_nodes = be32_to_cpu(head->free_nodes);
+	tree->attributes = be32_to_cpu(head->attributes);
+	tree->node_size = be16_to_cpu(head->node_size);
+	tree->max_key_len = be16_to_cpu(head->max_key_len);
+	tree->depth = be16_to_cpu(head->depth);
+
+	size = tree->node_size;
+	if (!is_power_of_2(size))
+		goto fail_page;
+	if (!tree->node_count)
+		goto fail_page;
+	switch (id) {
+	case HFS_EXT_CNID:
+		if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) {
+			pr_err("invalid extent max_key_len %d\n",
+			       tree->max_key_len);
+			goto fail_page;
+		}
+		break;
+	case HFS_CAT_CNID:
+		if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) {
+			pr_err("invalid catalog max_key_len %d\n",
+			       tree->max_key_len);
+			goto fail_page;
+		}
+		break;
+	default:
+		BUG();
+	}
+
+	tree->node_size_shift = ffs(size) - 1;
+	tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	kunmap(page);
+	page_cache_release(page);
+	return tree;
+
+fail_page:
+	page_cache_release(page);
+free_inode:
+	tree->inode->i_mapping->a_ops = &hfs_aops;
+	iput(tree->inode);
+free_tree:
+	kfree(tree);
+	return NULL;
+}
+
+/* Release resources used by a btree */
+void hfs_btree_close(struct hfs_btree *tree)
+{
+	struct hfs_bnode *node;
+	int i;
+
+	if (!tree)
+		return;
+
+	for (i = 0; i < NODE_HASH_SIZE; i++) {
+		while ((node = tree->node_hash[i])) {
+			tree->node_hash[i] = node->next_hash;
+			if (atomic_read(&node->refcnt))
+				pr_err("node %d:%d still has %d user(s)!\n",
+				       node->tree->cnid, node->this,
+				       atomic_read(&node->refcnt));
+			hfs_bnode_free(node);
+			tree->node_hash_cnt--;
+		}
+	}
+	iput(tree->inode);
+	kfree(tree);
+}
+
+void hfs_btree_write(struct hfs_btree *tree)
+{
+	struct hfs_btree_header_rec *head;
+	struct hfs_bnode *node;
+	struct page *page;
+
+	node = hfs_bnode_find(tree, 0);
+	if (IS_ERR(node))
+		/* panic? */
+		return;
+	/* Load the header */
+	page = node->page[0];
+	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+
+	head->root = cpu_to_be32(tree->root);
+	head->leaf_count = cpu_to_be32(tree->leaf_count);
+	head->leaf_head = cpu_to_be32(tree->leaf_head);
+	head->leaf_tail = cpu_to_be32(tree->leaf_tail);
+	head->node_count = cpu_to_be32(tree->node_count);
+	head->free_nodes = cpu_to_be32(tree->free_nodes);
+	head->attributes = cpu_to_be32(tree->attributes);
+	head->depth = cpu_to_be16(tree->depth);
+
+	kunmap(page);
+	set_page_dirty(page);
+	hfs_bnode_put(node);
+}
+
+static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
+{
+	struct hfs_btree *tree = prev->tree;
+	struct hfs_bnode *node;
+	struct hfs_bnode_desc desc;
+	__be32 cnid;
+
+	node = hfs_bnode_create(tree, idx);
+	if (IS_ERR(node))
+		return node;
+
+	if (!tree->free_nodes)
+		panic("FIXME!!!");
+	tree->free_nodes--;
+	prev->next = idx;
+	cnid = cpu_to_be32(idx);
+	hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
+
+	node->type = HFS_NODE_MAP;
+	node->num_recs = 1;
+	hfs_bnode_clear(node, 0, tree->node_size);
+	desc.next = 0;
+	desc.prev = 0;
+	desc.type = HFS_NODE_MAP;
+	desc.height = 0;
+	desc.num_recs = cpu_to_be16(1);
+	desc.reserved = 0;
+	hfs_bnode_write(node, &desc, 0, sizeof(desc));
+	hfs_bnode_write_u16(node, 14, 0x8000);
+	hfs_bnode_write_u16(node, tree->node_size - 2, 14);
+	hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6);
+
+	return node;
+}
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+	struct hfs_bnode *node, *next_node;
+	struct page **pagep;
+	u32 nidx, idx;
+	unsigned off;
+	u16 off16;
+	u16 len;
+	u8 *data, byte, m;
+	int i;
+
+	while (!tree->free_nodes) {
+		struct inode *inode = tree->inode;
+		u32 count;
+		int res;
+
+		res = hfs_extend_file(inode);
+		if (res)
+			return ERR_PTR(res);
+		HFS_I(inode)->phys_size = inode->i_size =
+				(loff_t)HFS_I(inode)->alloc_blocks *
+				HFS_SB(tree->sb)->alloc_blksz;
+		HFS_I(inode)->fs_blocks = inode->i_size >>
+					  tree->sb->s_blocksize_bits;
+		inode_set_bytes(inode, inode->i_size);
+		count = inode->i_size >> tree->node_size_shift;
+		tree->free_nodes = count - tree->node_count;
+		tree->node_count = count;
+	}
+
+	nidx = 0;
+	node = hfs_bnode_find(tree, nidx);
+	if (IS_ERR(node))
+		return node;
+	len = hfs_brec_lenoff(node, 2, &off16);
+	off = off16;
+
+	off += node->page_offset;
+	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+	data = kmap(*pagep);
+	off &= ~PAGE_CACHE_MASK;
+	idx = 0;
+
+	for (;;) {
+		while (len) {
+			byte = data[off];
+			if (byte != 0xff) {
+				for (m = 0x80, i = 0; i < 8; m >>= 1, i++) {
+					if (!(byte & m)) {
+						idx += i;
+						data[off] |= m;
+						set_page_dirty(*pagep);
+						kunmap(*pagep);
+						tree->free_nodes--;
+						mark_inode_dirty(tree->inode);
+						hfs_bnode_put(node);
+						return hfs_bnode_create(tree, idx);
+					}
+				}
+			}
+			if (++off >= PAGE_CACHE_SIZE) {
+				kunmap(*pagep);
+				data = kmap(*++pagep);
+				off = 0;
+			}
+			idx += 8;
+			len--;
+		}
+		kunmap(*pagep);
+		nidx = node->next;
+		if (!nidx) {
+			printk(KERN_DEBUG "create new bmap node...\n");
+			next_node = hfs_bmap_new_bmap(node, idx);
+		} else
+			next_node = hfs_bnode_find(tree, nidx);
+		hfs_bnode_put(node);
+		if (IS_ERR(next_node))
+			return next_node;
+		node = next_node;
+
+		len = hfs_brec_lenoff(node, 0, &off16);
+		off = off16;
+		off += node->page_offset;
+		pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+		data = kmap(*pagep);
+		off &= ~PAGE_CACHE_MASK;
+	}
+}
+
+void hfs_bmap_free(struct hfs_bnode *node)
+{
+	struct hfs_btree *tree;
+	struct page *page;
+	u16 off, len;
+	u32 nidx;
+	u8 *data, byte, m;
+
+	hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
+	tree = node->tree;
+	nidx = node->this;
+	node = hfs_bnode_find(tree, 0);
+	if (IS_ERR(node))
+		return;
+	len = hfs_brec_lenoff(node, 2, &off);
+	while (nidx >= len * 8) {
+		u32 i;
+
+		nidx -= len * 8;
+		i = node->next;
+		hfs_bnode_put(node);
+		if (!i) {
+			/* panic */;
+			pr_crit("unable to free bnode %u. bmap not found!\n",
+				node->this);
+			return;
+		}
+		node = hfs_bnode_find(tree, i);
+		if (IS_ERR(node))
+			return;
+		if (node->type != HFS_NODE_MAP) {
+			/* panic */;
+			pr_crit("invalid bmap found! (%u,%d)\n",
+				node->this, node->type);
+			hfs_bnode_put(node);
+			return;
+		}
+		len = hfs_brec_lenoff(node, 0, &off);
+	}
+	off += node->page_offset + nidx / 8;
+	page = node->page[off >> PAGE_CACHE_SHIFT];
+	data = kmap(page);
+	off &= ~PAGE_CACHE_MASK;
+	m = 1 << (~nidx & 7);
+	byte = data[off];
+	if (!(byte & m)) {
+		pr_crit("trying to free free bnode %u(%d)\n",
+			node->this, node->type);
+		kunmap(page);
+		hfs_bnode_put(node);
+		return;
+	}
+	data[off] = byte & ~m;
+	set_page_dirty(page);
+	kunmap(page);
+	hfs_bnode_put(node);
+	tree->free_nodes++;
+	mark_inode_dirty(tree->inode);
+}
diff --git a/kernel/fs/hfs/btree.h b/kernel/fs/hfs/btree.h
new file mode 100644
index 000000000..f6bd266d7
--- /dev/null
+++ b/kernel/fs/hfs/btree.h
@@ -0,0 +1,163 @@
+/*
+ *  linux/fs/hfs/btree.h
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ */
+
+#include "hfs_fs.h"
+
+typedef int (*btree_keycmp)(const btree_key *, const btree_key *);
+
+#define NODE_HASH_SIZE  256
+
+/* A HFS BTree held in memory */
+struct hfs_btree {
+	struct super_block *sb;
+	struct inode *inode;
+	btree_keycmp keycmp;
+
+	u32 cnid;
+	u32 root;
+	u32 leaf_count;
+	u32 leaf_head;
+	u32 leaf_tail;
+	u32 node_count;
+	u32 free_nodes;
+	u32 attributes;
+
+	unsigned int node_size;
+	unsigned int node_size_shift;
+	unsigned int max_key_len;
+	unsigned int depth;
+
+	//unsigned int map1_size, map_size;
+	struct mutex tree_lock;
+
+	unsigned int pages_per_bnode;
+	spinlock_t hash_lock;
+	struct hfs_bnode *node_hash[NODE_HASH_SIZE];
+	int node_hash_cnt;
+};
+
+/* A HFS BTree node in memory */
+struct hfs_bnode {
+	struct hfs_btree *tree;
+
+	u32 prev;
+	u32 this;
+	u32 next;
+	u32 parent;
+
+	u16 num_recs;
+	u8 type;
+	u8 height;
+
+	struct hfs_bnode *next_hash;
+	unsigned long flags;
+	wait_queue_head_t lock_wq;
+	atomic_t refcnt;
+	unsigned int page_offset;
+	struct page *page[0];
+};
+
+#define HFS_BNODE_ERROR		0
+#define HFS_BNODE_NEW		1
+#define HFS_BNODE_DELETED	2
+
+struct hfs_find_data {
+	btree_key *key;
+	btree_key *search_key;
+	struct hfs_btree *tree;
+	struct hfs_bnode *bnode;
+	int record;
+	int keyoffset, keylength;
+	int entryoffset, entrylength;
+};
+
+
+/* btree.c */
+extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp);
+extern void hfs_btree_close(struct hfs_btree *);
+extern void hfs_btree_write(struct hfs_btree *);
+extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *);
+extern void hfs_bmap_free(struct hfs_bnode *node);
+
+/* bnode.c */
+extern void hfs_bnode_read(struct hfs_bnode *, void *, int, int);
+extern u16 hfs_bnode_read_u16(struct hfs_bnode *, int);
+extern u8 hfs_bnode_read_u8(struct hfs_bnode *, int);
+extern void hfs_bnode_read_key(struct hfs_bnode *, void *, int);
+extern void hfs_bnode_write(struct hfs_bnode *, void *, int, int);
+extern void hfs_bnode_write_u16(struct hfs_bnode *, int, u16);
+extern void hfs_bnode_write_u8(struct hfs_bnode *, int, u8);
+extern void hfs_bnode_clear(struct hfs_bnode *, int, int);
+extern void hfs_bnode_copy(struct hfs_bnode *, int,
+			   struct hfs_bnode *, int, int);
+extern void hfs_bnode_move(struct hfs_bnode *, int, int, int);
+extern void hfs_bnode_dump(struct hfs_bnode *);
+extern void hfs_bnode_unlink(struct hfs_bnode *);
+extern struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32);
+extern struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32);
+extern void hfs_bnode_unhash(struct hfs_bnode *);
+extern void hfs_bnode_free(struct hfs_bnode *);
+extern struct hfs_bnode *hfs_bnode_create(struct hfs_btree *, u32);
+extern void hfs_bnode_get(struct hfs_bnode *);
+extern void hfs_bnode_put(struct hfs_bnode *);
+
+/* brec.c */
+extern u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *);
+extern u16 hfs_brec_keylen(struct hfs_bnode *, u16);
+extern int hfs_brec_insert(struct hfs_find_data *, void *, int);
+extern int hfs_brec_remove(struct hfs_find_data *);
+
+/* bfind.c */
+extern int hfs_find_init(struct hfs_btree *, struct hfs_find_data *);
+extern void hfs_find_exit(struct hfs_find_data *);
+extern int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *);
+extern int hfs_brec_find(struct hfs_find_data *);
+extern int hfs_brec_read(struct hfs_find_data *, void *, int);
+extern int hfs_brec_goto(struct hfs_find_data *, int);
+
+
+struct hfs_bnode_desc {
+	__be32 next;		/* (V) Number of the next node at this level */
+	__be32 prev;		/* (V) Number of the prev node at this level */
+	u8 type;		/* (F) The type of node */
+	u8 height;		/* (F) The level of this node (leaves=1) */
+	__be16 num_recs;	/* (V) The number of records in this node */
+	u16 reserved;
+} __packed;
+
+#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
+#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
+#define HFS_NODE_MAP	0x02	/* Holds part of the bitmap of used nodes */
+#define HFS_NODE_LEAF	0xFF	/* A leaf (ndNHeight==1) node */
+
+struct hfs_btree_header_rec {
+	__be16 depth;		/* (V) The number of levels in this B-tree */
+	__be32 root;		/* (V) The node number of the root node */
+	__be32 leaf_count;	/* (V) The number of leaf records */
+	__be32 leaf_head;	/* (V) The number of the first leaf node */
+	__be32 leaf_tail;	/* (V) The number of the last leaf node */
+	__be16 node_size;	/* (F) The number of bytes in a node (=512) */
+	__be16 max_key_len;	/* (F) The length of a key in an index node */
+	__be32 node_count;	/* (V) The total number of nodes */
+	__be32 free_nodes;	/* (V) The number of unused nodes */
+	u16 reserved1;
+	__be32 clump_size;	/* (F) clump size. not usually used. */
+	u8 btree_type;		/* (F) BTree type */
+	u8 reserved2;
+	__be32 attributes;	/* (F) attributes */
+	u32 reserved3[16];
+} __packed;
+
+#define BTREE_ATTR_BADCLOSE	0x00000001	/* b-tree not closed properly. not
+						   used by hfsplus. */
+#define HFS_TREE_BIGKEYS	0x00000002	/* key length is u16 instead of u8.
+						   used by hfsplus. */
+#define HFS_TREE_VARIDXKEYS	0x00000004	/* variable key length instead of
+						   max key length. use din catalog
+						   b-tree but not in extents
+						   b-tree (hfsplus). */
diff --git a/kernel/fs/hfs/catalog.c b/kernel/fs/hfs/catalog.c
new file mode 100644
index 000000000..db458ee3a
--- /dev/null
+++ b/kernel/fs/hfs/catalog.c
@@ -0,0 +1,366 @@
+/*
+ *  linux/fs/hfs/catalog.c
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains the functions related to the catalog B-tree.
+ *
+ * Cache code shamelessly stolen from
+ *     linux/fs/inode.c Copyright (C) 1991, 1992  Linus Torvalds
+ *     re-shamelessly stolen Copyright (C) 1997 Linus Torvalds
+ */
+
+#include "hfs_fs.h"
+#include "btree.h"
+
+/*
+ * hfs_cat_build_key()
+ *
+ * Given the ID of the parent and the name build a search key.
+ */
+void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, struct qstr *name)
+{
+	key->cat.reserved = 0;
+	key->cat.ParID = cpu_to_be32(parent);
+	if (name) {
+		hfs_asc2mac(sb, &key->cat.CName, name);
+		key->key_len = 6 + key->cat.CName.len;
+	} else {
+		memset(&key->cat.CName, 0, sizeof(struct hfs_name));
+		key->key_len = 6;
+	}
+}
+
+static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode)
+{
+	__be32 mtime = hfs_mtime();
+
+	memset(rec, 0, sizeof(*rec));
+	if (S_ISDIR(inode->i_mode)) {
+		rec->type = HFS_CDR_DIR;
+		rec->dir.DirID = cpu_to_be32(cnid);
+		rec->dir.CrDat = mtime;
+		rec->dir.MdDat = mtime;
+		rec->dir.BkDat = 0;
+		rec->dir.UsrInfo.frView = cpu_to_be16(0xff);
+		return sizeof(struct hfs_cat_dir);
+	} else {
+		/* init some fields for the file record */
+		rec->type = HFS_CDR_FIL;
+		rec->file.Flags = HFS_FIL_USED | HFS_FIL_THD;
+		if (!(inode->i_mode & S_IWUSR))
+			rec->file.Flags |= HFS_FIL_LOCK;
+		rec->file.FlNum = cpu_to_be32(cnid);
+		rec->file.CrDat = mtime;
+		rec->file.MdDat = mtime;
+		rec->file.BkDat = 0;
+		rec->file.UsrWds.fdType = HFS_SB(inode->i_sb)->s_type;
+		rec->file.UsrWds.fdCreator = HFS_SB(inode->i_sb)->s_creator;
+		return sizeof(struct hfs_cat_file);
+	}
+}
+
+static int hfs_cat_build_thread(struct super_block *sb,
+				hfs_cat_rec *rec, int type,
+				u32 parentid, struct qstr *name)
+{
+	rec->type = type;
+	memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved));
+	rec->thread.ParID = cpu_to_be32(parentid);
+	hfs_asc2mac(sb, &rec->thread.CName, name);
+	return sizeof(struct hfs_cat_thread);
+}
+
+/*
+ * create_entry()
+ *
+ * Add a new file or directory to the catalog B-tree and
+ * return a (struct hfs_cat_entry) for it in '*result'.
+ */
+int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
+{
+	struct hfs_find_data fd;
+	struct super_block *sb;
+	union hfs_cat_rec entry;
+	int entry_size;
+	int err;
+
+	hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
+		str->name, cnid, inode->i_nlink);
+	if (dir->i_size >= HFS_MAX_VALENCE)
+		return -ENOSPC;
+
+	sb = dir->i_sb;
+	err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	if (err)
+		return err;
+
+	hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
+	entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
+			HFS_CDR_THD : HFS_CDR_FTH,
+			dir->i_ino, str);
+	err = hfs_brec_find(&fd);
+	if (err != -ENOENT) {
+		if (!err)
+			err = -EEXIST;
+		goto err2;
+	}
+	err = hfs_brec_insert(&fd, &entry, entry_size);
+	if (err)
+		goto err2;
+
+	hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+	entry_size = hfs_cat_build_record(&entry, cnid, inode);
+	err = hfs_brec_find(&fd);
+	if (err != -ENOENT) {
+		/* panic? */
+		if (!err)
+			err = -EEXIST;
+		goto err1;
+	}
+	err = hfs_brec_insert(&fd, &entry, entry_size);
+	if (err)
+		goto err1;
+
+	dir->i_size++;
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+	hfs_find_exit(&fd);
+	return 0;
+
+err1:
+	hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
+	if (!hfs_brec_find(&fd))
+		hfs_brec_remove(&fd);
+err2:
+	hfs_find_exit(&fd);
+	return err;
+}
+
+/*
+ * hfs_cat_compare()
+ *
+ * Description:
+ *   This is the comparison function used for the catalog B-tree.  In
+ *   comparing catalog B-tree entries, the parent id is the most
+ *   significant field (compared as unsigned ints).  The name field is
+ *   the least significant (compared in "Macintosh lexical order",
+ *   see hfs_strcmp() in string.c)
+ * Input Variable(s):
+ *   struct hfs_cat_key *key1: pointer to the first key to compare
+ *   struct hfs_cat_key *key2: pointer to the second key to compare
+ * Output Variable(s):
+ *   NONE
+ * Returns:
+ *   int: negative if key1<key2, positive if key1>key2, and 0 if key1==key2
+ * Preconditions:
+ *   key1 and key2 point to "valid" (struct hfs_cat_key)s.
+ * Postconditions:
+ *   This function has no side-effects
+ */
+int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2)
+{
+	__be32 k1p, k2p;
+
+	k1p = key1->cat.ParID;
+	k2p = key2->cat.ParID;
+
+	if (k1p != k2p)
+		return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1;
+
+	return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len,
+			  key2->cat.CName.name, key2->cat.CName.len);
+}
+
+/* Try to get a catalog entry for given catalog id */
+// move to read_super???
+int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
+		      struct hfs_find_data *fd)
+{
+	hfs_cat_rec rec;
+	int res, len, type;
+
+	hfs_cat_build_key(sb, fd->search_key, cnid, NULL);
+	res = hfs_brec_read(fd, &rec, sizeof(rec));
+	if (res)
+		return res;
+
+	type = rec.type;
+	if (type != HFS_CDR_THD && type != HFS_CDR_FTH) {
+		pr_err("found bad thread record in catalog\n");
+		return -EIO;
+	}
+
+	fd->search_key->cat.ParID = rec.thread.ParID;
+	len = fd->search_key->cat.CName.len = rec.thread.CName.len;
+	if (len > HFS_NAMELEN) {
+		pr_err("bad catalog namelength\n");
+		return -EIO;
+	}
+	memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len);
+	return hfs_brec_find(fd);
+}
+
+
+/*
+ * hfs_cat_delete()
+ *
+ * Delete the indicated file or directory.
+ * The associated thread is also removed unless ('with_thread'==0).
+ */
+int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
+{
+	struct super_block *sb;
+	struct hfs_find_data fd;
+	struct list_head *pos;
+	int res, type;
+
+	hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+	sb = dir->i_sb;
+	res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	if (res)
+		return res;
+
+	hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+	res = hfs_brec_find(&fd);
+	if (res)
+		goto out;
+
+	type = hfs_bnode_read_u8(fd.bnode, fd.entryoffset);
+	if (type == HFS_CDR_FIL) {
+		struct hfs_cat_file file;
+		hfs_bnode_read(fd.bnode, &file, fd.entryoffset, sizeof(file));
+		if (be32_to_cpu(file.FlNum) == cnid) {
+#if 0
+			hfs_free_fork(sb, &file, HFS_FK_DATA);
+#endif
+			hfs_free_fork(sb, &file, HFS_FK_RSRC);
+		}
+	}
+
+	list_for_each(pos, &HFS_I(dir)->open_dir_list) {
+		struct hfs_readdir_data *rd =
+			list_entry(pos, struct hfs_readdir_data, list);
+		if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
+			rd->file->f_pos--;
+	}
+
+	res = hfs_brec_remove(&fd);
+	if (res)
+		goto out;
+
+	hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
+	res = hfs_brec_find(&fd);
+	if (!res) {
+		res = hfs_brec_remove(&fd);
+		if (res)
+			goto out;
+	}
+
+	dir->i_size--;
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+	res = 0;
+out:
+	hfs_find_exit(&fd);
+
+	return res;
+}
+
+/*
+ * hfs_cat_move()
+ *
+ * Rename a file or directory, possibly to a new directory.
+ * If the destination exists it is removed and a
+ * (struct hfs_cat_entry) for it is returned in '*result'.
+ */
+int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
+		 struct inode *dst_dir, struct qstr *dst_name)
+{
+	struct super_block *sb;
+	struct hfs_find_data src_fd, dst_fd;
+	union hfs_cat_rec entry;
+	int entry_size, type;
+	int err;
+
+	hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+		cnid, src_dir->i_ino, src_name->name,
+		dst_dir->i_ino, dst_name->name);
+	sb = src_dir->i_sb;
+	err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd);
+	if (err)
+		return err;
+	dst_fd = src_fd;
+
+	/* find the old dir entry and read the data */
+	hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
+	err = hfs_brec_find(&src_fd);
+	if (err)
+		goto out;
+	if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
+		err = -EIO;
+		goto out;
+	}
+
+	hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
+			    src_fd.entrylength);
+
+	/* create new dir entry with the data from the old entry */
+	hfs_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
+	err = hfs_brec_find(&dst_fd);
+	if (err != -ENOENT) {
+		if (!err)
+			err = -EEXIST;
+		goto out;
+	}
+
+	err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength);
+	if (err)
+		goto out;
+	dst_dir->i_size++;
+	dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dst_dir);
+
+	/* finally remove the old entry */
+	hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
+	err = hfs_brec_find(&src_fd);
+	if (err)
+		goto out;
+	err = hfs_brec_remove(&src_fd);
+	if (err)
+		goto out;
+	src_dir->i_size--;
+	src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(src_dir);
+
+	type = entry.type;
+	if (type == HFS_CDR_FIL && !(entry.file.Flags & HFS_FIL_THD))
+		goto out;
+
+	/* remove old thread entry */
+	hfs_cat_build_key(sb, src_fd.search_key, cnid, NULL);
+	err = hfs_brec_find(&src_fd);
+	if (err)
+		goto out;
+	err = hfs_brec_remove(&src_fd);
+	if (err)
+		goto out;
+
+	/* create new thread entry */
+	hfs_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
+	entry_size = hfs_cat_build_thread(sb, &entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD,
+					dst_dir->i_ino, dst_name);
+	err = hfs_brec_find(&dst_fd);
+	if (err != -ENOENT) {
+		if (!err)
+			err = -EEXIST;
+		goto out;
+	}
+	err = hfs_brec_insert(&dst_fd, &entry, entry_size);
+out:
+	hfs_bnode_put(dst_fd.bnode);
+	hfs_find_exit(&src_fd);
+	return err;
+}
diff --git a/kernel/fs/hfs/dir.c b/kernel/fs/hfs/dir.c
new file mode 100644
index 000000000..70788e038
--- /dev/null
+++ b/kernel/fs/hfs/dir.c
@@ -0,0 +1,319 @@
+/*
+ *  linux/fs/hfs/dir.c
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains directory-related functions independent of which
+ * scheme is being used to represent forks.
+ *
+ * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
+ */
+
+#include "hfs_fs.h"
+#include "btree.h"
+
+/*
+ * hfs_lookup()
+ */
+static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
+				 unsigned int flags)
+{
+	hfs_cat_rec rec;
+	struct hfs_find_data fd;
+	struct inode *inode = NULL;
+	int res;
+
+	res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+	if (res)
+		return ERR_PTR(res);
+	hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
+	res = hfs_brec_read(&fd, &rec, sizeof(rec));
+	if (res) {
+		hfs_find_exit(&fd);
+		if (res == -ENOENT) {
+			/* No such entry */
+			inode = NULL;
+			goto done;
+		}
+		return ERR_PTR(res);
+	}
+	inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec);
+	hfs_find_exit(&fd);
+	if (!inode)
+		return ERR_PTR(-EACCES);
+done:
+	d_add(dentry, inode);
+	return NULL;
+}
+
+/*
+ * hfs_readdir
+ */
+static int hfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	int len, err;
+	char strbuf[HFS_MAX_NAMELEN];
+	union hfs_cat_rec entry;
+	struct hfs_find_data fd;
+	struct hfs_readdir_data *rd;
+	u16 type;
+
+	if (ctx->pos >= inode->i_size)
+		return 0;
+
+	err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	if (err)
+		return err;
+	hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
+	err = hfs_brec_find(&fd);
+	if (err)
+		goto out;
+
+	if (ctx->pos == 0) {
+		/* This is completely artificial... */
+		if (!dir_emit_dot(file, ctx))
+			goto out;
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
+			err = -EIO;
+			goto out;
+		}
+
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
+		if (entry.type != HFS_CDR_THD) {
+			pr_err("bad catalog folder thread\n");
+			err = -EIO;
+			goto out;
+		}
+		//if (fd.entrylength < HFS_MIN_THREAD_SZ) {
+		//	pr_err("truncated catalog thread\n");
+		//	err = -EIO;
+		//	goto out;
+		//}
+		if (!dir_emit(ctx, "..", 2,
+			    be32_to_cpu(entry.thread.ParID), DT_DIR))
+			goto out;
+		ctx->pos = 2;
+	}
+	if (ctx->pos >= inode->i_size)
+		goto out;
+	err = hfs_brec_goto(&fd, ctx->pos - 1);
+	if (err)
+		goto out;
+
+	for (;;) {
+		if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
+			pr_err("walked past end of dir\n");
+			err = -EIO;
+			goto out;
+		}
+
+		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
+			err = -EIO;
+			goto out;
+		}
+
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
+		type = entry.type;
+		len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
+		if (type == HFS_CDR_DIR) {
+			if (fd.entrylength < sizeof(struct hfs_cat_dir)) {
+				pr_err("small dir entry\n");
+				err = -EIO;
+				goto out;
+			}
+			if (!dir_emit(ctx, strbuf, len,
+				    be32_to_cpu(entry.dir.DirID), DT_DIR))
+				break;
+		} else if (type == HFS_CDR_FIL) {
+			if (fd.entrylength < sizeof(struct hfs_cat_file)) {
+				pr_err("small file entry\n");
+				err = -EIO;
+				goto out;
+			}
+			if (!dir_emit(ctx, strbuf, len,
+				    be32_to_cpu(entry.file.FlNum), DT_REG))
+				break;
+		} else {
+			pr_err("bad catalog entry type %d\n", type);
+			err = -EIO;
+			goto out;
+		}
+		ctx->pos++;
+		if (ctx->pos >= inode->i_size)
+			goto out;
+		err = hfs_brec_goto(&fd, 1);
+		if (err)
+			goto out;
+	}
+	rd = file->private_data;
+	if (!rd) {
+		rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
+		if (!rd) {
+			err = -ENOMEM;
+			goto out;
+		}
+		file->private_data = rd;
+		rd->file = file;
+		list_add(&rd->list, &HFS_I(inode)->open_dir_list);
+	}
+	memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
+out:
+	hfs_find_exit(&fd);
+	return err;
+}
+
+static int hfs_dir_release(struct inode *inode, struct file *file)
+{
+	struct hfs_readdir_data *rd = file->private_data;
+	if (rd) {
+		mutex_lock(&inode->i_mutex);
+		list_del(&rd->list);
+		mutex_unlock(&inode->i_mutex);
+		kfree(rd);
+	}
+	return 0;
+}
+
+/*
+ * hfs_create()
+ *
+ * This is the create() entry in the inode_operations structure for
+ * regular HFS directories.  The purpose is to create a new file in
+ * a directory and return a corresponding inode, given the inode for
+ * the directory and the name (and its length) of the new file.
+ */
+static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl)
+{
+	struct inode *inode;
+	int res;
+
+	inode = hfs_new_inode(dir, &dentry->d_name, mode);
+	if (!inode)
+		return -ENOMEM;
+
+	res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
+	if (res) {
+		clear_nlink(inode);
+		hfs_delete_inode(inode);
+		iput(inode);
+		return res;
+	}
+	d_instantiate(dentry, inode);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+/*
+ * hfs_mkdir()
+ *
+ * This is the mkdir() entry in the inode_operations structure for
+ * regular HFS directories.  The purpose is to create a new directory
+ * in a directory, given the inode for the parent directory and the
+ * name (and its length) of the new directory.
+ */
+static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	int res;
+
+	inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode);
+	if (!inode)
+		return -ENOMEM;
+
+	res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
+	if (res) {
+		clear_nlink(inode);
+		hfs_delete_inode(inode);
+		iput(inode);
+		return res;
+	}
+	d_instantiate(dentry, inode);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+/*
+ * hfs_remove()
+ *
+ * This serves as both unlink() and rmdir() in the inode_operations
+ * structure for regular HFS directories.  The purpose is to delete
+ * an existing child, given the inode for the parent directory and
+ * the name (and its length) of the existing directory.
+ *
+ * HFS does not have hardlinks, so both rmdir and unlink set the
+ * link count to 0.  The only difference is the emptiness check.
+ */
+static int hfs_remove(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	int res;
+
+	if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
+		return -ENOTEMPTY;
+	res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
+	if (res)
+		return res;
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	hfs_delete_inode(inode);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+/*
+ * hfs_rename()
+ *
+ * This is the rename() entry in the inode_operations structure for
+ * regular HFS directories.  The purpose is to rename an existing
+ * file or directory, given the inode for the current directory and
+ * the name (and its length) of the existing file/directory and the
+ * inode for the new directory and the name (and its length) of the
+ * new file/directory.
+ * XXX: how do you handle must_be dir?
+ */
+static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
+{
+	int res;
+
+	/* Unlink destination if it already exists */
+	if (d_really_is_positive(new_dentry)) {
+		res = hfs_remove(new_dir, new_dentry);
+		if (res)
+			return res;
+	}
+
+	res = hfs_cat_move(d_inode(old_dentry)->i_ino,
+			   old_dir, &old_dentry->d_name,
+			   new_dir, &new_dentry->d_name);
+	if (!res)
+		hfs_cat_build_key(old_dir->i_sb,
+				  (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key,
+				  new_dir->i_ino, &new_dentry->d_name);
+	return res;
+}
+
+const struct file_operations hfs_dir_operations = {
+	.read		= generic_read_dir,
+	.iterate	= hfs_readdir,
+	.llseek		= generic_file_llseek,
+	.release	= hfs_dir_release,
+};
+
+const struct inode_operations hfs_dir_inode_operations = {
+	.create		= hfs_create,
+	.lookup		= hfs_lookup,
+	.unlink		= hfs_remove,
+	.mkdir		= hfs_mkdir,
+	.rmdir		= hfs_remove,
+	.rename		= hfs_rename,
+	.setattr	= hfs_inode_setattr,
+};
diff --git a/kernel/fs/hfs/extent.c b/kernel/fs/hfs/extent.c
new file mode 100644
index 000000000..e33a0d36a
--- /dev/null
+++ b/kernel/fs/hfs/extent.c
@@ -0,0 +1,545 @@
+/*
+ *  linux/fs/hfs/extent.c
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains the functions related to the extents B-tree.
+ */
+
+#include <linux/pagemap.h>
+
+#include "hfs_fs.h"
+#include "btree.h"
+
+/*================ File-local functions ================*/
+
+/*
+ * build_key
+ */
+static void hfs_ext_build_key(hfs_btree_key *key, u32 cnid, u16 block, u8 type)
+{
+	key->key_len = 7;
+	key->ext.FkType = type;
+	key->ext.FNum = cpu_to_be32(cnid);
+	key->ext.FABN = cpu_to_be16(block);
+}
+
+/*
+ * hfs_ext_compare()
+ *
+ * Description:
+ *   This is the comparison function used for the extents B-tree.  In
+ *   comparing extent B-tree entries, the file id is the most
+ *   significant field (compared as unsigned ints); the fork type is
+ *   the second most significant field (compared as unsigned chars);
+ *   and the allocation block number field is the least significant
+ *   (compared as unsigned ints).
+ * Input Variable(s):
+ *   struct hfs_ext_key *key1: pointer to the first key to compare
+ *   struct hfs_ext_key *key2: pointer to the second key to compare
+ * Output Variable(s):
+ *   NONE
+ * Returns:
+ *   int: negative if key1<key2, positive if key1>key2, and 0 if key1==key2
+ * Preconditions:
+ *   key1 and key2 point to "valid" (struct hfs_ext_key)s.
+ * Postconditions:
+ *   This function has no side-effects */
+int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2)
+{
+	__be32 fnum1, fnum2;
+	__be16 block1, block2;
+
+	fnum1 = key1->ext.FNum;
+	fnum2 = key2->ext.FNum;
+	if (fnum1 != fnum2)
+		return be32_to_cpu(fnum1) < be32_to_cpu(fnum2) ? -1 : 1;
+	if (key1->ext.FkType != key2->ext.FkType)
+		return key1->ext.FkType < key2->ext.FkType ? -1 : 1;
+
+	block1 = key1->ext.FABN;
+	block2 = key2->ext.FABN;
+	if (block1 == block2)
+		return 0;
+	return be16_to_cpu(block1) < be16_to_cpu(block2) ? -1 : 1;
+}
+
+/*
+ * hfs_ext_find_block
+ *
+ * Find a block within an extent record
+ */
+static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off)
+{
+	int i;
+	u16 count;
+
+	for (i = 0; i < 3; ext++, i++) {
+		count = be16_to_cpu(ext->count);
+		if (off < count)
+			return be16_to_cpu(ext->block) + off;
+		off -= count;
+	}
+	/* panic? */
+	return 0;
+}
+
+static int hfs_ext_block_count(struct hfs_extent *ext)
+{
+	int i;
+	u16 count = 0;
+
+	for (i = 0; i < 3; ext++, i++)
+		count += be16_to_cpu(ext->count);
+	return count;
+}
+
+static u16 hfs_ext_lastblock(struct hfs_extent *ext)
+{
+	int i;
+
+	ext += 2;
+	for (i = 0; i < 2; ext--, i++)
+		if (ext->count)
+			break;
+	return be16_to_cpu(ext->block) + be16_to_cpu(ext->count);
+}
+
+static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
+{
+	int res;
+
+	hfs_ext_build_key(fd->search_key, inode->i_ino, HFS_I(inode)->cached_start,
+			  HFS_IS_RSRC(inode) ?  HFS_FK_RSRC : HFS_FK_DATA);
+	res = hfs_brec_find(fd);
+	if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) {
+		if (res != -ENOENT)
+			return res;
+		hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec));
+		HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
+	} else {
+		if (res)
+			return res;
+		hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength);
+		HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY;
+	}
+	return 0;
+}
+
+int hfs_ext_write_extent(struct inode *inode)
+{
+	struct hfs_find_data fd;
+	int res = 0;
+
+	if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) {
+		res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
+		if (res)
+			return res;
+		res = __hfs_ext_write_extent(inode, &fd);
+		hfs_find_exit(&fd);
+	}
+	return res;
+}
+
+static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent,
+					u32 cnid, u32 block, u8 type)
+{
+	int res;
+
+	hfs_ext_build_key(fd->search_key, cnid, block, type);
+	fd->key->ext.FNum = 0;
+	res = hfs_brec_find(fd);
+	if (res && res != -ENOENT)
+		return res;
+	if (fd->key->ext.FNum != fd->search_key->ext.FNum ||
+	    fd->key->ext.FkType != fd->search_key->ext.FkType)
+		return -ENOENT;
+	if (fd->entrylength != sizeof(hfs_extent_rec))
+		return -EIO;
+	hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfs_extent_rec));
+	return 0;
+}
+
+static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
+{
+	int res;
+
+	if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) {
+		res = __hfs_ext_write_extent(inode, fd);
+		if (res)
+			return res;
+	}
+
+	res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino,
+				    block, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA);
+	if (!res) {
+		HFS_I(inode)->cached_start = be16_to_cpu(fd->key->ext.FABN);
+		HFS_I(inode)->cached_blocks = hfs_ext_block_count(HFS_I(inode)->cached_extents);
+	} else {
+		HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0;
+		HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
+	}
+	return res;
+}
+
+static int hfs_ext_read_extent(struct inode *inode, u16 block)
+{
+	struct hfs_find_data fd;
+	int res;
+
+	if (block >= HFS_I(inode)->cached_start &&
+	    block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks)
+		return 0;
+
+	res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
+	if (!res) {
+		res = __hfs_ext_cache_extent(&fd, inode, block);
+		hfs_find_exit(&fd);
+	}
+	return res;
+}
+
+static void hfs_dump_extent(struct hfs_extent *extent)
+{
+	int i;
+
+	hfs_dbg(EXTENT, "   ");
+	for (i = 0; i < 3; i++)
+		hfs_dbg_cont(EXTENT, " %u:%u",
+			     be16_to_cpu(extent[i].block),
+			     be16_to_cpu(extent[i].count));
+	hfs_dbg_cont(EXTENT, "\n");
+}
+
+static int hfs_add_extent(struct hfs_extent *extent, u16 offset,
+			  u16 alloc_block, u16 block_count)
+{
+	u16 count, start;
+	int i;
+
+	hfs_dump_extent(extent);
+	for (i = 0; i < 3; extent++, i++) {
+		count = be16_to_cpu(extent->count);
+		if (offset == count) {
+			start = be16_to_cpu(extent->block);
+			if (alloc_block != start + count) {
+				if (++i >= 3)
+					return -ENOSPC;
+				extent++;
+				extent->block = cpu_to_be16(alloc_block);
+			} else
+				block_count += count;
+			extent->count = cpu_to_be16(block_count);
+			return 0;
+		} else if (offset < count)
+			break;
+		offset -= count;
+	}
+	/* panic? */
+	return -EIO;
+}
+
+static int hfs_free_extents(struct super_block *sb, struct hfs_extent *extent,
+			    u16 offset, u16 block_nr)
+{
+	u16 count, start;
+	int i;
+
+	hfs_dump_extent(extent);
+	for (i = 0; i < 3; extent++, i++) {
+		count = be16_to_cpu(extent->count);
+		if (offset == count)
+			goto found;
+		else if (offset < count)
+			break;
+		offset -= count;
+	}
+	/* panic? */
+	return -EIO;
+found:
+	for (;;) {
+		start = be16_to_cpu(extent->block);
+		if (count <= block_nr) {
+			hfs_clear_vbm_bits(sb, start, count);
+			extent->block = 0;
+			extent->count = 0;
+			block_nr -= count;
+		} else {
+			count -= block_nr;
+			hfs_clear_vbm_bits(sb, start + count, block_nr);
+			extent->count = cpu_to_be16(count);
+			block_nr = 0;
+		}
+		if (!block_nr || !i)
+			return 0;
+		i--;
+		extent--;
+		count = be16_to_cpu(extent->count);
+	}
+}
+
+int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type)
+{
+	struct hfs_find_data fd;
+	u32 total_blocks, blocks, start;
+	u32 cnid = be32_to_cpu(file->FlNum);
+	struct hfs_extent *extent;
+	int res, i;
+
+	if (type == HFS_FK_DATA) {
+		total_blocks = be32_to_cpu(file->PyLen);
+		extent = file->ExtRec;
+	} else {
+		total_blocks = be32_to_cpu(file->RPyLen);
+		extent = file->RExtRec;
+	}
+	total_blocks /= HFS_SB(sb)->alloc_blksz;
+	if (!total_blocks)
+		return 0;
+
+	blocks = 0;
+	for (i = 0; i < 3; extent++, i++)
+		blocks += be16_to_cpu(extent[i].count);
+
+	res = hfs_free_extents(sb, extent, blocks, blocks);
+	if (res)
+		return res;
+	if (total_blocks == blocks)
+		return 0;
+
+	res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+	if (res)
+		return res;
+	do {
+		res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type);
+		if (res)
+			break;
+		start = be16_to_cpu(fd.key->ext.FABN);
+		hfs_free_extents(sb, extent, total_blocks - start, total_blocks);
+		hfs_brec_remove(&fd);
+		total_blocks = start;
+	} while (total_blocks > blocks);
+	hfs_find_exit(&fd);
+
+	return res;
+}
+
+/*
+ * hfs_get_block
+ */
+int hfs_get_block(struct inode *inode, sector_t block,
+		  struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb;
+	u16 dblock, ablock;
+	int res;
+
+	sb = inode->i_sb;
+	/* Convert inode block to disk allocation block */
+	ablock = (u32)block / HFS_SB(sb)->fs_div;
+
+	if (block >= HFS_I(inode)->fs_blocks) {
+		if (block > HFS_I(inode)->fs_blocks || !create)
+			return -EIO;
+		if (ablock >= HFS_I(inode)->alloc_blocks) {
+			res = hfs_extend_file(inode);
+			if (res)
+				return res;
+		}
+	} else
+		create = 0;
+
+	if (ablock < HFS_I(inode)->first_blocks) {
+		dblock = hfs_ext_find_block(HFS_I(inode)->first_extents, ablock);
+		goto done;
+	}
+
+	mutex_lock(&HFS_I(inode)->extents_lock);
+	res = hfs_ext_read_extent(inode, ablock);
+	if (!res)
+		dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents,
+					    ablock - HFS_I(inode)->cached_start);
+	else {
+		mutex_unlock(&HFS_I(inode)->extents_lock);
+		return -EIO;
+	}
+	mutex_unlock(&HFS_I(inode)->extents_lock);
+
+done:
+	map_bh(bh_result, sb, HFS_SB(sb)->fs_start +
+	       dblock * HFS_SB(sb)->fs_div +
+	       (u32)block % HFS_SB(sb)->fs_div);
+
+	if (create) {
+		set_buffer_new(bh_result);
+		HFS_I(inode)->phys_size += sb->s_blocksize;
+		HFS_I(inode)->fs_blocks++;
+		inode_add_bytes(inode, sb->s_blocksize);
+		mark_inode_dirty(inode);
+	}
+	return 0;
+}
+
+int hfs_extend_file(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	u32 start, len, goal;
+	int res;
+
+	mutex_lock(&HFS_I(inode)->extents_lock);
+	if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks)
+		goal = hfs_ext_lastblock(HFS_I(inode)->first_extents);
+	else {
+		res = hfs_ext_read_extent(inode, HFS_I(inode)->alloc_blocks);
+		if (res)
+			goto out;
+		goal = hfs_ext_lastblock(HFS_I(inode)->cached_extents);
+	}
+
+	len = HFS_I(inode)->clump_blocks;
+	start = hfs_vbm_search_free(sb, goal, &len);
+	if (!len) {
+		res = -ENOSPC;
+		goto out;
+	}
+
+	hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+	if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) {
+		if (!HFS_I(inode)->first_blocks) {
+			hfs_dbg(EXTENT, "first extents\n");
+			/* no extents yet */
+			HFS_I(inode)->first_extents[0].block = cpu_to_be16(start);
+			HFS_I(inode)->first_extents[0].count = cpu_to_be16(len);
+			res = 0;
+		} else {
+			/* try to append to extents in inode */
+			res = hfs_add_extent(HFS_I(inode)->first_extents,
+					     HFS_I(inode)->alloc_blocks,
+					     start, len);
+			if (res == -ENOSPC)
+				goto insert_extent;
+		}
+		if (!res) {
+			hfs_dump_extent(HFS_I(inode)->first_extents);
+			HFS_I(inode)->first_blocks += len;
+		}
+	} else {
+		res = hfs_add_extent(HFS_I(inode)->cached_extents,
+				     HFS_I(inode)->alloc_blocks -
+				     HFS_I(inode)->cached_start,
+				     start, len);
+		if (!res) {
+			hfs_dump_extent(HFS_I(inode)->cached_extents);
+			HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY;
+			HFS_I(inode)->cached_blocks += len;
+		} else if (res == -ENOSPC)
+			goto insert_extent;
+	}
+out:
+	mutex_unlock(&HFS_I(inode)->extents_lock);
+	if (!res) {
+		HFS_I(inode)->alloc_blocks += len;
+		mark_inode_dirty(inode);
+		if (inode->i_ino < HFS_FIRSTUSER_CNID)
+			set_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags);
+		set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
+		hfs_mark_mdb_dirty(sb);
+	}
+	return res;
+
+insert_extent:
+	hfs_dbg(EXTENT, "insert new extent\n");
+	res = hfs_ext_write_extent(inode);
+	if (res)
+		goto out;
+
+	memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec));
+	HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start);
+	HFS_I(inode)->cached_extents[0].count = cpu_to_be16(len);
+	hfs_dump_extent(HFS_I(inode)->cached_extents);
+	HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW;
+	HFS_I(inode)->cached_start = HFS_I(inode)->alloc_blocks;
+	HFS_I(inode)->cached_blocks = len;
+
+	res = 0;
+	goto out;
+}
+
+void hfs_file_truncate(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hfs_find_data fd;
+	u16 blk_cnt, alloc_cnt, start;
+	u32 size;
+	int res;
+
+	hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n",
+		inode->i_ino, (long long)HFS_I(inode)->phys_size,
+		inode->i_size);
+	if (inode->i_size > HFS_I(inode)->phys_size) {
+		struct address_space *mapping = inode->i_mapping;
+		void *fsdata;
+		struct page *page;
+
+		/* XXX: Can use generic_cont_expand? */
+		size = inode->i_size - 1;
+		res = pagecache_write_begin(NULL, mapping, size+1, 0,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+		if (!res) {
+			res = pagecache_write_end(NULL, mapping, size+1, 0, 0,
+					page, fsdata);
+		}
+		if (res)
+			inode->i_size = HFS_I(inode)->phys_size;
+		return;
+	} else if (inode->i_size == HFS_I(inode)->phys_size)
+		return;
+	size = inode->i_size + HFS_SB(sb)->alloc_blksz - 1;
+	blk_cnt = size / HFS_SB(sb)->alloc_blksz;
+	alloc_cnt = HFS_I(inode)->alloc_blocks;
+	if (blk_cnt == alloc_cnt)
+		goto out;
+
+	mutex_lock(&HFS_I(inode)->extents_lock);
+	res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+	if (res) {
+		mutex_unlock(&HFS_I(inode)->extents_lock);
+		/* XXX: We lack error handling of hfs_file_truncate() */
+		return;
+	}
+	while (1) {
+		if (alloc_cnt == HFS_I(inode)->first_blocks) {
+			hfs_free_extents(sb, HFS_I(inode)->first_extents,
+					 alloc_cnt, alloc_cnt - blk_cnt);
+			hfs_dump_extent(HFS_I(inode)->first_extents);
+			HFS_I(inode)->first_blocks = blk_cnt;
+			break;
+		}
+		res = __hfs_ext_cache_extent(&fd, inode, alloc_cnt);
+		if (res)
+			break;
+		start = HFS_I(inode)->cached_start;
+		hfs_free_extents(sb, HFS_I(inode)->cached_extents,
+				 alloc_cnt - start, alloc_cnt - blk_cnt);
+		hfs_dump_extent(HFS_I(inode)->cached_extents);
+		if (blk_cnt > start) {
+			HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY;
+			break;
+		}
+		alloc_cnt = start;
+		HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0;
+		HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
+		hfs_brec_remove(&fd);
+	}
+	hfs_find_exit(&fd);
+	mutex_unlock(&HFS_I(inode)->extents_lock);
+
+	HFS_I(inode)->alloc_blocks = blk_cnt;
+out:
+	HFS_I(inode)->phys_size = inode->i_size;
+	HFS_I(inode)->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits);
+	mark_inode_dirty(inode);
+}
diff --git a/kernel/fs/hfs/hfs.h b/kernel/fs/hfs/hfs.h
new file mode 100644
index 000000000..6f194d076
--- /dev/null
+++ b/kernel/fs/hfs/hfs.h
@@ -0,0 +1,289 @@
+/*
+ *  linux/fs/hfs/hfs.h
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ */
+
+#ifndef _HFS_H
+#define _HFS_H
+
+/* offsets to various blocks */
+#define HFS_DD_BLK		0 /* Driver Descriptor block */
+#define HFS_PMAP_BLK		1 /* First block of partition map */
+#define HFS_MDB_BLK		2 /* Block (w/i partition) of MDB */
+
+/* magic numbers for various disk blocks */
+#define HFS_DRVR_DESC_MAGIC	0x4552 /* "ER": driver descriptor map */
+#define HFS_OLD_PMAP_MAGIC	0x5453 /* "TS": old-type partition map */
+#define HFS_NEW_PMAP_MAGIC	0x504D /* "PM": new-type partition map */
+#define HFS_SUPER_MAGIC		0x4244 /* "BD": HFS MDB (super block) */
+#define HFS_MFS_SUPER_MAGIC	0xD2D7 /* MFS MDB (super block) */
+
+/* various FIXED size parameters */
+#define HFS_SECTOR_SIZE		512    /* size of an HFS sector */
+#define HFS_SECTOR_SIZE_BITS	9      /* log_2(HFS_SECTOR_SIZE) */
+#define HFS_NAMELEN		31     /* maximum length of an HFS filename */
+#define HFS_MAX_NAMELEN		128
+#define HFS_MAX_VALENCE		32767U
+
+/* Meanings of the drAtrb field of the MDB,
+ * Reference: _Inside Macintosh: Files_ p. 2-61
+ */
+#define HFS_SB_ATTRIB_HLOCK	(1 << 7)
+#define HFS_SB_ATTRIB_UNMNT	(1 << 8)
+#define HFS_SB_ATTRIB_SPARED	(1 << 9)
+#define HFS_SB_ATTRIB_INCNSTNT	(1 << 11)
+#define HFS_SB_ATTRIB_SLOCK	(1 << 15)
+
+/* Some special File ID numbers */
+#define HFS_POR_CNID		1	/* Parent Of the Root */
+#define HFS_ROOT_CNID		2	/* ROOT directory */
+#define HFS_EXT_CNID		3	/* EXTents B-tree */
+#define HFS_CAT_CNID		4	/* CATalog B-tree */
+#define HFS_BAD_CNID		5	/* BAD blocks file */
+#define HFS_ALLOC_CNID		6	/* ALLOCation file (HFS+) */
+#define HFS_START_CNID		7	/* STARTup file (HFS+) */
+#define HFS_ATTR_CNID		8	/* ATTRibutes file (HFS+) */
+#define HFS_EXCH_CNID		15	/* ExchangeFiles temp id */
+#define HFS_FIRSTUSER_CNID	16
+
+/* values for hfs_cat_rec.cdrType */
+#define HFS_CDR_DIR    0x01    /* folder (directory) */
+#define HFS_CDR_FIL    0x02    /* file */
+#define HFS_CDR_THD    0x03    /* folder (directory) thread */
+#define HFS_CDR_FTH    0x04    /* file thread */
+
+/* legal values for hfs_ext_key.FkType and hfs_file.fork */
+#define HFS_FK_DATA	0x00
+#define HFS_FK_RSRC	0xFF
+
+/* bits in hfs_fil_entry.Flags */
+#define HFS_FIL_LOCK	0x01  /* locked */
+#define HFS_FIL_THD	0x02  /* file thread */
+#define HFS_FIL_DOPEN   0x04  /* data fork open */
+#define HFS_FIL_ROPEN   0x08  /* resource fork open */
+#define HFS_FIL_DIR     0x10  /* directory (always clear) */
+#define HFS_FIL_NOCOPY  0x40  /* copy-protected file */
+#define HFS_FIL_USED	0x80  /* open */
+
+/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */
+#define HFS_DIR_LOCK        0x01  /* locked */
+#define HFS_DIR_THD         0x02  /* directory thread */
+#define HFS_DIR_INEXPFOLDER 0x04  /* in a shared area */
+#define HFS_DIR_MOUNTED     0x08  /* mounted */
+#define HFS_DIR_DIR         0x10  /* directory (always set) */
+#define HFS_DIR_EXPFOLDER   0x20  /* share point */
+
+/* bits hfs_finfo.fdFlags */
+#define HFS_FLG_INITED		0x0100
+#define HFS_FLG_LOCKED		0x1000
+#define HFS_FLG_INVISIBLE	0x4000
+
+/*======== HFS structures as they appear on the disk ========*/
+
+/* Pascal-style string of up to 31 characters */
+struct hfs_name {
+	u8 len;
+	u8 name[HFS_NAMELEN];
+} __packed;
+
+struct hfs_point {
+	__be16 v;
+	__be16 h;
+} __packed;
+
+struct hfs_rect {
+	__be16 top;
+	__be16 left;
+	__be16 bottom;
+	__be16 right;
+} __packed;
+
+struct hfs_finfo {
+	__be32 fdType;
+	__be32 fdCreator;
+	__be16 fdFlags;
+	struct hfs_point fdLocation;
+	__be16 fdFldr;
+} __packed;
+
+struct hfs_fxinfo {
+	__be16 fdIconID;
+	u8 fdUnused[8];
+	__be16 fdComment;
+	__be32 fdPutAway;
+} __packed;
+
+struct hfs_dinfo {
+	struct hfs_rect frRect;
+	__be16 frFlags;
+	struct hfs_point frLocation;
+	__be16 frView;
+} __packed;
+
+struct hfs_dxinfo {
+	struct hfs_point frScroll;
+	__be32 frOpenChain;
+	__be16 frUnused;
+	__be16 frComment;
+	__be32 frPutAway;
+} __packed;
+
+union hfs_finder_info {
+	struct {
+		struct hfs_finfo finfo;
+		struct hfs_fxinfo fxinfo;
+	} file;
+	struct {
+		struct hfs_dinfo dinfo;
+		struct hfs_dxinfo dxinfo;
+	} dir;
+} __packed;
+
+/* Cast to a pointer to a generic bkey */
+#define	HFS_BKEY(X)	(((void)((X)->KeyLen)), ((struct hfs_bkey *)(X)))
+
+/* The key used in the catalog b-tree: */
+struct hfs_cat_key {
+	u8 key_len;		/* number of bytes in the key */
+	u8 reserved;		/* padding */
+	__be32 ParID;		/* CNID of the parent dir */
+	struct hfs_name	CName;	/* The filename of the entry */
+} __packed;
+
+/* The key used in the extents b-tree: */
+struct hfs_ext_key {
+	u8 key_len;		/* number of bytes in the key */
+	u8 FkType;		/* HFS_FK_{DATA,RSRC} */
+	__be32 FNum;		/* The File ID of the file */
+	__be16 FABN;		/* allocation blocks number*/
+} __packed;
+
+typedef union hfs_btree_key {
+	u8 key_len;			/* number of bytes in the key */
+	struct hfs_cat_key cat;
+	struct hfs_ext_key ext;
+} hfs_btree_key;
+
+#define HFS_MAX_CAT_KEYLEN	(sizeof(struct hfs_cat_key) - sizeof(u8))
+#define HFS_MAX_EXT_KEYLEN	(sizeof(struct hfs_ext_key) - sizeof(u8))
+
+typedef union hfs_btree_key btree_key;
+
+struct hfs_extent {
+	__be16 block;
+	__be16 count;
+};
+typedef struct hfs_extent hfs_extent_rec[3];
+
+/* The catalog record for a file */
+struct hfs_cat_file {
+	s8 type;			/* The type of entry */
+	u8 reserved;
+	u8 Flags;			/* Flags such as read-only */
+	s8 Typ;				/* file version number = 0 */
+	struct hfs_finfo UsrWds;	/* data used by the Finder */
+	__be32 FlNum;			/* The CNID */
+	__be16 StBlk;			/* obsolete */
+	__be32 LgLen;			/* The logical EOF of the data fork*/
+	__be32 PyLen;			/* The physical EOF of the data fork */
+	__be16 RStBlk;			/* obsolete */
+	__be32 RLgLen;			/* The logical EOF of the rsrc fork */
+	__be32 RPyLen;			/* The physical EOF of the rsrc fork */
+	__be32 CrDat;			/* The creation date */
+	__be32 MdDat;			/* The modified date */
+	__be32 BkDat;			/* The last backup date */
+	struct hfs_fxinfo FndrInfo;	/* more data for the Finder */
+	__be16 ClpSize;			/* number of bytes to allocate
+					   when extending files */
+	hfs_extent_rec ExtRec;		/* first extent record
+					   for the data fork */
+	hfs_extent_rec RExtRec;		/* first extent record
+					   for the resource fork */
+	u32 Resrv;			/* reserved by Apple */
+} __packed;
+
+/* the catalog record for a directory */
+struct hfs_cat_dir {
+	s8 type;			/* The type of entry */
+	u8 reserved;
+	__be16 Flags;			/* flags */
+	__be16 Val;			/* Valence: number of files and
+					   dirs in the directory */
+	__be32 DirID;			/* The CNID */
+	__be32 CrDat;			/* The creation date */
+	__be32 MdDat;			/* The modification date */
+	__be32 BkDat;			/* The last backup date */
+	struct hfs_dinfo UsrInfo;	/* data used by the Finder */
+	struct hfs_dxinfo FndrInfo;	/* more data used by Finder */
+	u8 Resrv[16];			/* reserved by Apple */
+} __packed;
+
+/* the catalog record for a thread */
+struct hfs_cat_thread {
+	s8 type;			/* The type of entry */
+	u8 reserved[9];			/* reserved by Apple */
+	__be32 ParID;			/* CNID of parent directory */
+	struct hfs_name CName;		/* The name of this entry */
+}  __packed;
+
+/* A catalog tree record */
+typedef union hfs_cat_rec {
+	s8 type;			/* The type of entry */
+	struct hfs_cat_file file;
+	struct hfs_cat_dir dir;
+	struct hfs_cat_thread thread;
+} hfs_cat_rec;
+
+struct hfs_mdb {
+	__be16 drSigWord;		/* Signature word indicating fs type */
+	__be32 drCrDate;		/* fs creation date/time */
+	__be32 drLsMod;			/* fs modification date/time */
+	__be16 drAtrb;			/* fs attributes */
+	__be16 drNmFls;			/* number of files in root directory */
+	__be16 drVBMSt;			/* location (in 512-byte blocks)
+					   of the volume bitmap */
+	__be16 drAllocPtr;		/* location (in allocation blocks)
+					   to begin next allocation search */
+	__be16 drNmAlBlks;		/* number of allocation blocks */
+	__be32 drAlBlkSiz;		/* bytes in an allocation block */
+	__be32 drClpSiz;		/* clumpsize, the number of bytes to
+					   allocate when extending a file */
+	__be16 drAlBlSt;		/* location (in 512-byte blocks)
+					   of the first allocation block */
+	__be32 drNxtCNID;		/* CNID to assign to the next
+					   file or directory created */
+	__be16 drFreeBks;		/* number of free allocation blocks */
+	u8 drVN[28];			/* the volume label */
+	__be32 drVolBkUp;		/* fs backup date/time */
+	__be16 drVSeqNum;		/* backup sequence number */
+	__be32 drWrCnt;			/* fs write count */
+	__be32 drXTClpSiz;		/* clumpsize for the extents B-tree */
+	__be32 drCTClpSiz;		/* clumpsize for the catalog B-tree */
+	__be16 drNmRtDirs;		/* number of directories in
+					   the root directory */
+	__be32 drFilCnt;		/* number of files in the fs */
+	__be32 drDirCnt;		/* number of directories in the fs */
+	u8 drFndrInfo[32];		/* data used by the Finder */
+	__be16 drEmbedSigWord;		/* embedded volume signature */
+	__be32 drEmbedExtent;		/* starting block number (xdrStABN)
+					   and number of allocation blocks
+					   (xdrNumABlks) occupied by embedded
+					   volume */
+	__be32 drXTFlSize;		/* bytes in the extents B-tree */
+	hfs_extent_rec drXTExtRec;	/* extents B-tree's first 3 extents */
+	__be32 drCTFlSize;		/* bytes in the catalog B-tree */
+	hfs_extent_rec drCTExtRec;	/* catalog B-tree's first 3 extents */
+} __packed;
+
+/*======== Data structures kept in memory ========*/
+
+struct hfs_readdir_data {
+	struct list_head list;
+	struct file *file;
+	struct hfs_cat_key key;
+};
+
+#endif
diff --git a/kernel/fs/hfs/hfs_fs.h b/kernel/fs/hfs/hfs_fs.h
new file mode 100644
index 000000000..95d255219
--- /dev/null
+++ b/kernel/fs/hfs/hfs_fs.h
@@ -0,0 +1,290 @@
+/*
+ *  linux/fs/hfs/hfs_fs.h
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ */
+
+#ifndef _LINUX_HFS_FS_H
+#define _LINUX_HFS_FS_H
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/workqueue.h>
+
+#include <asm/byteorder.h>
+#include <asm/uaccess.h>
+
+#include "hfs.h"
+
+#define DBG_BNODE_REFS	0x00000001
+#define DBG_BNODE_MOD	0x00000002
+#define DBG_CAT_MOD	0x00000004
+#define DBG_INODE	0x00000008
+#define DBG_SUPER	0x00000010
+#define DBG_EXTENT	0x00000020
+#define DBG_BITMAP	0x00000040
+
+//#define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP)
+//#define DBG_MASK	(DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
+//#define DBG_MASK	(DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
+#define DBG_MASK	(0)
+
+#define hfs_dbg(flg, fmt, ...)					\
+do {								\
+	if (DBG_##flg & DBG_MASK)				\
+		printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__);	\
+} while (0)
+
+#define hfs_dbg_cont(flg, fmt, ...)				\
+do {								\
+	if (DBG_##flg & DBG_MASK)				\
+		pr_cont(fmt, ##__VA_ARGS__);			\
+} while (0)
+
+
+/*
+ * struct hfs_inode_info
+ *
+ * The HFS-specific part of a Linux (struct inode)
+ */
+struct hfs_inode_info {
+	atomic_t opencnt;
+
+	unsigned int flags;
+
+	/* to deal with localtime ugliness */
+	int tz_secondswest;
+
+	struct hfs_cat_key cat_key;
+
+	struct list_head open_dir_list;
+	struct inode *rsrc_inode;
+
+	struct mutex extents_lock;
+
+	u16 alloc_blocks, clump_blocks;
+	sector_t fs_blocks;
+	/* Allocation extents from catlog record or volume header */
+	hfs_extent_rec first_extents;
+	u16 first_blocks;
+	hfs_extent_rec cached_extents;
+	u16 cached_start, cached_blocks;
+
+	loff_t phys_size;
+	struct inode vfs_inode;
+};
+
+#define HFS_FLG_RSRC		0x0001
+#define HFS_FLG_EXT_DIRTY	0x0002
+#define HFS_FLG_EXT_NEW		0x0004
+
+#define HFS_IS_RSRC(inode)	(HFS_I(inode)->flags & HFS_FLG_RSRC)
+
+/*
+ * struct hfs_sb_info
+ *
+ * The HFS-specific part of a Linux (struct super_block)
+ */
+struct hfs_sb_info {
+	struct buffer_head *mdb_bh;		/* The hfs_buffer
+						   holding the real
+						   superblock (aka VIB
+						   or MDB) */
+	struct hfs_mdb *mdb;
+	struct buffer_head *alt_mdb_bh;		/* The hfs_buffer holding
+						   the alternate superblock */
+	struct hfs_mdb *alt_mdb;
+	__be32 *bitmap;				/* The page holding the
+						   allocation bitmap */
+	struct hfs_btree *ext_tree;			/* Information about
+						   the extents b-tree */
+	struct hfs_btree *cat_tree;			/* Information about
+						   the catalog b-tree */
+	u32 file_count;				/* The number of
+						   regular files in
+						   the filesystem */
+	u32 folder_count;			/* The number of
+						   directories in the
+						   filesystem */
+	u32 next_id;				/* The next available
+						   file id number */
+	u32 clumpablks;				/* The number of allocation
+						   blocks to try to add when
+						   extending a file */
+	u32 fs_start;				/* The first 512-byte
+						   block represented
+						   in the bitmap */
+	u32 part_start;
+	u16 root_files;				/* The number of
+						   regular
+						   (non-directory)
+						   files in the root
+						   directory */
+	u16 root_dirs;				/* The number of
+						   directories in the
+						   root directory */
+	u16 fs_ablocks;				/* The number of
+						   allocation blocks
+						   in the filesystem */
+	u16 free_ablocks;			/* the number of unused
+						   allocation blocks
+						   in the filesystem */
+	u32 alloc_blksz;			/* The size of an
+						   "allocation block" */
+	int s_quiet;				/* Silent failure when
+						   changing owner or mode? */
+	__be32 s_type;				/* Type for new files */
+	__be32 s_creator;			/* Creator for new files */
+	umode_t s_file_umask;			/* The umask applied to the
+						   permissions on all files */
+	umode_t s_dir_umask;			/* The umask applied to the
+						   permissions on all dirs */
+	kuid_t s_uid;				/* The uid of all files */
+	kgid_t s_gid;				/* The gid of all files */
+
+	int session, part;
+	struct nls_table *nls_io, *nls_disk;
+	struct mutex bitmap_lock;
+	unsigned long flags;
+	u16 blockoffset;
+	int fs_div;
+	struct super_block *sb;
+	int work_queued;		/* non-zero delayed work is queued */
+	struct delayed_work mdb_work;	/* MDB flush delayed work */
+	spinlock_t work_lock;		/* protects mdb_work and work_queued */
+};
+
+#define HFS_FLG_BITMAP_DIRTY	0
+#define HFS_FLG_MDB_DIRTY	1
+#define HFS_FLG_ALT_MDB_DIRTY	2
+
+/* bitmap.c */
+extern u32 hfs_vbm_search_free(struct super_block *, u32, u32 *);
+extern int hfs_clear_vbm_bits(struct super_block *, u16, u16);
+
+/* catalog.c */
+extern int hfs_cat_keycmp(const btree_key *, const btree_key *);
+struct hfs_find_data;
+extern int hfs_cat_find_brec(struct super_block *, u32, struct hfs_find_data *);
+extern int hfs_cat_create(u32, struct inode *, struct qstr *, struct inode *);
+extern int hfs_cat_delete(u32, struct inode *, struct qstr *);
+extern int hfs_cat_move(u32, struct inode *, struct qstr *,
+			struct inode *, struct qstr *);
+extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *);
+
+/* dir.c */
+extern const struct file_operations hfs_dir_operations;
+extern const struct inode_operations hfs_dir_inode_operations;
+
+/* extent.c */
+extern int hfs_ext_keycmp(const btree_key *, const btree_key *);
+extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int);
+extern int hfs_ext_write_extent(struct inode *);
+extern int hfs_extend_file(struct inode *);
+extern void hfs_file_truncate(struct inode *);
+
+extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+
+/* inode.c */
+extern const struct address_space_operations hfs_aops;
+extern const struct address_space_operations hfs_btree_aops;
+
+extern struct inode *hfs_new_inode(struct inode *, struct qstr *, umode_t);
+extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
+extern int hfs_write_inode(struct inode *, struct writeback_control *);
+extern int hfs_inode_setattr(struct dentry *, struct iattr *);
+extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
+			__be32 log_size, __be32 phys_size, u32 clump_size);
+extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *);
+extern void hfs_evict_inode(struct inode *);
+extern void hfs_delete_inode(struct inode *);
+
+/* attr.c */
+extern int hfs_setxattr(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags);
+extern ssize_t hfs_getxattr(struct dentry *dentry, const char *name,
+			    void *value, size_t size);
+extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+/* mdb.c */
+extern int hfs_mdb_get(struct super_block *);
+extern void hfs_mdb_commit(struct super_block *);
+extern void hfs_mdb_close(struct super_block *);
+extern void hfs_mdb_put(struct super_block *);
+
+/* part_tbl.c */
+extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
+
+/* string.c */
+extern const struct dentry_operations hfs_dentry_operations;
+
+extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
+extern int hfs_strcmp(const unsigned char *, unsigned int,
+		      const unsigned char *, unsigned int);
+extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name);
+
+/* trans.c */
+extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
+extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *);
+
+/* super.c */
+extern void hfs_mark_mdb_dirty(struct super_block *sb);
+
+/*
+ * There are two time systems.  Both are based on seconds since
+ * a particular time/date.
+ *	Unix:	unsigned lil-endian since 00:00 GMT, Jan. 1, 1970
+ *	mac:	unsigned big-endian since 00:00 GMT, Jan. 1, 1904
+ *
+ */
+#define __hfs_u_to_mtime(sec)	cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60)
+#define __hfs_m_to_utime(sec)	(be32_to_cpu(sec) - 2082844800U  + sys_tz.tz_minuteswest * 60)
+
+#define HFS_I(inode)	(list_entry(inode, struct hfs_inode_info, vfs_inode))
+#define HFS_SB(sb)	((struct hfs_sb_info *)(sb)->s_fs_info)
+
+#define hfs_m_to_utime(time)	(struct timespec){ .tv_sec = __hfs_m_to_utime(time) }
+#define hfs_u_to_mtime(time)	__hfs_u_to_mtime((time).tv_sec)
+#define hfs_mtime()		__hfs_u_to_mtime(get_seconds())
+
+static inline const char *hfs_mdb_name(struct super_block *sb)
+{
+	return sb->s_id;
+}
+
+static inline void hfs_bitmap_dirty(struct super_block *sb)
+{
+	set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags);
+	hfs_mark_mdb_dirty(sb);
+}
+
+#define sb_bread512(sb, sec, data) ({			\
+	struct buffer_head *__bh;			\
+	sector_t __block;				\
+	loff_t __start;					\
+	int __offset;					\
+							\
+	__start = (loff_t)(sec) << HFS_SECTOR_SIZE_BITS;\
+	__block = __start >> (sb)->s_blocksize_bits;	\
+	__offset = __start & ((sb)->s_blocksize - 1);	\
+	__bh = sb_bread((sb), __block);			\
+	if (likely(__bh != NULL))			\
+		data = (void *)(__bh->b_data + __offset);\
+	else						\
+		data = NULL;				\
+	__bh;						\
+})
+
+#endif
diff --git a/kernel/fs/hfs/inode.c b/kernel/fs/hfs/inode.c
new file mode 100644
index 000000000..b99ebddb1
--- /dev/null
+++ b/kernel/fs/hfs/inode.c
@@ -0,0 +1,692 @@
+/*
+ *  linux/fs/hfs/inode.c
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains inode-related functions which do not depend on
+ * which scheme is being used to represent forks.
+ *
+ * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
+ */
+
+#include <linux/pagemap.h>
+#include <linux/mpage.h>
+#include <linux/sched.h>
+#include <linux/uio.h>
+
+#include "hfs_fs.h"
+#include "btree.h"
+
+static const struct file_operations hfs_file_operations;
+static const struct inode_operations hfs_file_inode_operations;
+
+/*================ Variable-like macros ================*/
+
+#define HFS_VALID_MODE_BITS  (S_IFREG | S_IFDIR | S_IRWXUGO)
+
+static int hfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, hfs_get_block, wbc);
+}
+
+static int hfs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, hfs_get_block);
+}
+
+static void hfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		hfs_file_truncate(inode);
+	}
+}
+
+static int hfs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	*pagep = NULL;
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				hfs_get_block,
+				&HFS_I(mapping->host)->phys_size);
+	if (unlikely(ret))
+		hfs_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, hfs_get_block);
+}
+
+static int hfs_releasepage(struct page *page, gfp_t mask)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct hfs_btree *tree;
+	struct hfs_bnode *node;
+	u32 nidx;
+	int i, res = 1;
+
+	switch (inode->i_ino) {
+	case HFS_EXT_CNID:
+		tree = HFS_SB(sb)->ext_tree;
+		break;
+	case HFS_CAT_CNID:
+		tree = HFS_SB(sb)->cat_tree;
+		break;
+	default:
+		BUG();
+		return 0;
+	}
+
+	if (!tree)
+		return 0;
+
+	if (tree->node_size >= PAGE_CACHE_SIZE) {
+		nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
+		spin_lock(&tree->hash_lock);
+		node = hfs_bnode_findhash(tree, nidx);
+		if (!node)
+			;
+		else if (atomic_read(&node->refcnt))
+			res = 0;
+		if (res && node) {
+			hfs_bnode_unhash(node);
+			hfs_bnode_free(node);
+		}
+		spin_unlock(&tree->hash_lock);
+	} else {
+		nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+		i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+		spin_lock(&tree->hash_lock);
+		do {
+			node = hfs_bnode_findhash(tree, nidx++);
+			if (!node)
+				continue;
+			if (atomic_read(&node->refcnt)) {
+				res = 0;
+				break;
+			}
+			hfs_bnode_unhash(node);
+			hfs_bnode_free(node);
+		} while (--i && nidx < tree->node_count);
+		spin_unlock(&tree->hash_lock);
+	}
+	return res ? try_to_free_buffers(page) : 0;
+}
+
+static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			     loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = file_inode(file)->i_mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + count;
+
+		if (end > isize)
+			hfs_write_failed(mapping, end);
+	}
+
+	return ret;
+}
+
+static int hfs_writepages(struct address_space *mapping,
+			  struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, hfs_get_block);
+}
+
+const struct address_space_operations hfs_btree_aops = {
+	.readpage	= hfs_readpage,
+	.writepage	= hfs_writepage,
+	.write_begin	= hfs_write_begin,
+	.write_end	= generic_write_end,
+	.bmap		= hfs_bmap,
+	.releasepage	= hfs_releasepage,
+};
+
+const struct address_space_operations hfs_aops = {
+	.readpage	= hfs_readpage,
+	.writepage	= hfs_writepage,
+	.write_begin	= hfs_write_begin,
+	.write_end	= generic_write_end,
+	.bmap		= hfs_bmap,
+	.direct_IO	= hfs_direct_IO,
+	.writepages	= hfs_writepages,
+};
+
+/*
+ * hfs_new_inode
+ */
+struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+
+	mutex_init(&HFS_I(inode)->extents_lock);
+	INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
+	hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
+	inode->i_ino = HFS_SB(sb)->next_id++;
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	set_nlink(inode, 1);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	HFS_I(inode)->flags = 0;
+	HFS_I(inode)->rsrc_inode = NULL;
+	HFS_I(inode)->fs_blocks = 0;
+	if (S_ISDIR(mode)) {
+		inode->i_size = 2;
+		HFS_SB(sb)->folder_count++;
+		if (dir->i_ino == HFS_ROOT_CNID)
+			HFS_SB(sb)->root_dirs++;
+		inode->i_op = &hfs_dir_inode_operations;
+		inode->i_fop = &hfs_dir_operations;
+		inode->i_mode |= S_IRWXUGO;
+		inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask;
+	} else if (S_ISREG(mode)) {
+		HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
+		HFS_SB(sb)->file_count++;
+		if (dir->i_ino == HFS_ROOT_CNID)
+			HFS_SB(sb)->root_files++;
+		inode->i_op = &hfs_file_inode_operations;
+		inode->i_fop = &hfs_file_operations;
+		inode->i_mapping->a_ops = &hfs_aops;
+		inode->i_mode |= S_IRUGO|S_IXUGO;
+		if (mode & S_IWUSR)
+			inode->i_mode |= S_IWUGO;
+		inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask;
+		HFS_I(inode)->phys_size = 0;
+		HFS_I(inode)->alloc_blocks = 0;
+		HFS_I(inode)->first_blocks = 0;
+		HFS_I(inode)->cached_start = 0;
+		HFS_I(inode)->cached_blocks = 0;
+		memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec));
+		memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec));
+	}
+	insert_inode_hash(inode);
+	mark_inode_dirty(inode);
+	set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
+	hfs_mark_mdb_dirty(sb);
+
+	return inode;
+}
+
+void hfs_delete_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino);
+	if (S_ISDIR(inode->i_mode)) {
+		HFS_SB(sb)->folder_count--;
+		if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
+			HFS_SB(sb)->root_dirs--;
+		set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
+		hfs_mark_mdb_dirty(sb);
+		return;
+	}
+	HFS_SB(sb)->file_count--;
+	if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
+		HFS_SB(sb)->root_files--;
+	if (S_ISREG(inode->i_mode)) {
+		if (!inode->i_nlink) {
+			inode->i_size = 0;
+			hfs_file_truncate(inode);
+		}
+	}
+	set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
+	hfs_mark_mdb_dirty(sb);
+}
+
+void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
+			 __be32 __log_size, __be32 phys_size, u32 clump_size)
+{
+	struct super_block *sb = inode->i_sb;
+	u32 log_size = be32_to_cpu(__log_size);
+	u16 count;
+	int i;
+
+	memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec));
+	for (count = 0, i = 0; i < 3; i++)
+		count += be16_to_cpu(ext[i].count);
+	HFS_I(inode)->first_blocks = count;
+
+	inode->i_size = HFS_I(inode)->phys_size = log_size;
+	HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits);
+	HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) /
+				     HFS_SB(sb)->alloc_blksz;
+	HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz;
+	if (!HFS_I(inode)->clump_blocks)
+		HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
+}
+
+struct hfs_iget_data {
+	struct hfs_cat_key *key;
+	hfs_cat_rec *rec;
+};
+
+static int hfs_test_inode(struct inode *inode, void *data)
+{
+	struct hfs_iget_data *idata = data;
+	hfs_cat_rec *rec;
+
+	rec = idata->rec;
+	switch (rec->type) {
+	case HFS_CDR_DIR:
+		return inode->i_ino == be32_to_cpu(rec->dir.DirID);
+	case HFS_CDR_FIL:
+		return inode->i_ino == be32_to_cpu(rec->file.FlNum);
+	default:
+		BUG();
+		return 1;
+	}
+}
+
+/*
+ * hfs_read_inode
+ */
+static int hfs_read_inode(struct inode *inode, void *data)
+{
+	struct hfs_iget_data *idata = data;
+	struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
+	hfs_cat_rec *rec;
+
+	HFS_I(inode)->flags = 0;
+	HFS_I(inode)->rsrc_inode = NULL;
+	mutex_init(&HFS_I(inode)->extents_lock);
+	INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
+
+	/* Initialize the inode */
+	inode->i_uid = hsb->s_uid;
+	inode->i_gid = hsb->s_gid;
+	set_nlink(inode, 1);
+
+	if (idata->key)
+		HFS_I(inode)->cat_key = *idata->key;
+	else
+		HFS_I(inode)->flags |= HFS_FLG_RSRC;
+	HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60;
+
+	rec = idata->rec;
+	switch (rec->type) {
+	case HFS_CDR_FIL:
+		if (!HFS_IS_RSRC(inode)) {
+			hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen,
+					    rec->file.PyLen, be16_to_cpu(rec->file.ClpSize));
+		} else {
+			hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen,
+					    rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize));
+		}
+
+		inode->i_ino = be32_to_cpu(rec->file.FlNum);
+		inode->i_mode = S_IRUGO | S_IXUGO;
+		if (!(rec->file.Flags & HFS_FIL_LOCK))
+			inode->i_mode |= S_IWUGO;
+		inode->i_mode &= ~hsb->s_file_umask;
+		inode->i_mode |= S_IFREG;
+		inode->i_ctime = inode->i_atime = inode->i_mtime =
+				hfs_m_to_utime(rec->file.MdDat);
+		inode->i_op = &hfs_file_inode_operations;
+		inode->i_fop = &hfs_file_operations;
+		inode->i_mapping->a_ops = &hfs_aops;
+		break;
+	case HFS_CDR_DIR:
+		inode->i_ino = be32_to_cpu(rec->dir.DirID);
+		inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
+		HFS_I(inode)->fs_blocks = 0;
+		inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
+		inode->i_ctime = inode->i_atime = inode->i_mtime =
+				hfs_m_to_utime(rec->dir.MdDat);
+		inode->i_op = &hfs_dir_inode_operations;
+		inode->i_fop = &hfs_dir_operations;
+		break;
+	default:
+		make_bad_inode(inode);
+	}
+	return 0;
+}
+
+/*
+ * __hfs_iget()
+ *
+ * Given the MDB for a HFS filesystem, a 'key' and an 'entry' in
+ * the catalog B-tree and the 'type' of the desired file return the
+ * inode for that file/directory or NULL.  Note that 'type' indicates
+ * whether we want the actual file or directory, or the corresponding
+ * metadata (AppleDouble header file or CAP metadata file).
+ */
+struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec)
+{
+	struct hfs_iget_data data = { key, rec };
+	struct inode *inode;
+	u32 cnid;
+
+	switch (rec->type) {
+	case HFS_CDR_DIR:
+		cnid = be32_to_cpu(rec->dir.DirID);
+		break;
+	case HFS_CDR_FIL:
+		cnid = be32_to_cpu(rec->file.FlNum);
+		break;
+	default:
+		return NULL;
+	}
+	inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data);
+	if (inode && (inode->i_state & I_NEW))
+		unlock_new_inode(inode);
+	return inode;
+}
+
+void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
+			  __be32 *log_size, __be32 *phys_size)
+{
+	memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec));
+
+	if (log_size)
+		*log_size = cpu_to_be32(inode->i_size);
+	if (phys_size)
+		*phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks *
+					 HFS_SB(inode->i_sb)->alloc_blksz);
+}
+
+int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct inode *main_inode = inode;
+	struct hfs_find_data fd;
+	hfs_cat_rec rec;
+	int res;
+
+	hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino);
+	res = hfs_ext_write_extent(inode);
+	if (res)
+		return res;
+
+	if (inode->i_ino < HFS_FIRSTUSER_CNID) {
+		switch (inode->i_ino) {
+		case HFS_ROOT_CNID:
+			break;
+		case HFS_EXT_CNID:
+			hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree);
+			return 0;
+		case HFS_CAT_CNID:
+			hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree);
+			return 0;
+		default:
+			BUG();
+			return -EIO;
+		}
+	}
+
+	if (HFS_IS_RSRC(inode))
+		main_inode = HFS_I(inode)->rsrc_inode;
+
+	if (!main_inode->i_nlink)
+		return 0;
+
+	if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd))
+		/* panic? */
+		return -EIO;
+
+	fd.search_key->cat = HFS_I(main_inode)->cat_key;
+	if (hfs_brec_find(&fd))
+		/* panic? */
+		goto out;
+
+	if (S_ISDIR(main_inode->i_mode)) {
+		if (fd.entrylength < sizeof(struct hfs_cat_dir))
+			/* panic? */;
+		hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
+			   sizeof(struct hfs_cat_dir));
+		if (rec.type != HFS_CDR_DIR ||
+		    be32_to_cpu(rec.dir.DirID) != inode->i_ino) {
+		}
+
+		rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime);
+		rec.dir.Val = cpu_to_be16(inode->i_size - 2);
+
+		hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
+			    sizeof(struct hfs_cat_dir));
+	} else if (HFS_IS_RSRC(inode)) {
+		hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
+			       sizeof(struct hfs_cat_file));
+		hfs_inode_write_fork(inode, rec.file.RExtRec,
+				     &rec.file.RLgLen, &rec.file.RPyLen);
+		hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
+				sizeof(struct hfs_cat_file));
+	} else {
+		if (fd.entrylength < sizeof(struct hfs_cat_file))
+			/* panic? */;
+		hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
+			   sizeof(struct hfs_cat_file));
+		if (rec.type != HFS_CDR_FIL ||
+		    be32_to_cpu(rec.file.FlNum) != inode->i_ino) {
+		}
+
+		if (inode->i_mode & S_IWUSR)
+			rec.file.Flags &= ~HFS_FIL_LOCK;
+		else
+			rec.file.Flags |= HFS_FIL_LOCK;
+		hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen);
+		rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime);
+
+		hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
+			    sizeof(struct hfs_cat_file));
+	}
+out:
+	hfs_find_exit(&fd);
+	return 0;
+}
+
+static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
+				      unsigned int flags)
+{
+	struct inode *inode = NULL;
+	hfs_cat_rec rec;
+	struct hfs_find_data fd;
+	int res;
+
+	if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
+		goto out;
+
+	inode = HFS_I(dir)->rsrc_inode;
+	if (inode)
+		goto out;
+
+	inode = new_inode(dir->i_sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+	if (res) {
+		iput(inode);
+		return ERR_PTR(res);
+	}
+	fd.search_key->cat = HFS_I(dir)->cat_key;
+	res = hfs_brec_read(&fd, &rec, sizeof(rec));
+	if (!res) {
+		struct hfs_iget_data idata = { NULL, &rec };
+		hfs_read_inode(inode, &idata);
+	}
+	hfs_find_exit(&fd);
+	if (res) {
+		iput(inode);
+		return ERR_PTR(res);
+	}
+	HFS_I(inode)->rsrc_inode = dir;
+	HFS_I(dir)->rsrc_inode = inode;
+	igrab(dir);
+	hlist_add_fake(&inode->i_hash);
+	mark_inode_dirty(inode);
+out:
+	d_add(dentry, inode);
+	return NULL;
+}
+
+void hfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
+		HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+		iput(HFS_I(inode)->rsrc_inode);
+	}
+}
+
+static int hfs_file_open(struct inode *inode, struct file *file)
+{
+	if (HFS_IS_RSRC(inode))
+		inode = HFS_I(inode)->rsrc_inode;
+	atomic_inc(&HFS_I(inode)->opencnt);
+	return 0;
+}
+
+static int hfs_file_release(struct inode *inode, struct file *file)
+{
+	//struct super_block *sb = inode->i_sb;
+
+	if (HFS_IS_RSRC(inode))
+		inode = HFS_I(inode)->rsrc_inode;
+	if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
+		mutex_lock(&inode->i_mutex);
+		hfs_file_truncate(inode);
+		//if (inode->i_flags & S_DEAD) {
+		//	hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
+		//	hfs_delete_inode(inode);
+		//}
+		mutex_unlock(&inode->i_mutex);
+	}
+	return 0;
+}
+
+/*
+ * hfs_notify_change()
+ *
+ * Based very closely on fs/msdos/inode.c by Werner Almesberger
+ *
+ * This is the notify_change() field in the super_operations structure
+ * for HFS file systems.  The purpose is to take that changes made to
+ * an inode and apply then in a filesystem-dependent manner.  In this
+ * case the process has a few of tasks to do:
+ *  1) prevent changes to the i_uid and i_gid fields.
+ *  2) map file permissions to the closest allowable permissions
+ *  3) Since multiple Linux files can share the same on-disk inode under
+ *     HFS (for instance the data and resource forks of a file) a change
+ *     to permissions must be applied to all other in-core inodes which
+ *     correspond to the same HFS file.
+ */
+
+int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
+	int error;
+
+	error = inode_change_ok(inode, attr); /* basic permission checks */
+	if (error)
+		return error;
+
+	/* no uig/gid changes and limit which mode bits can be set */
+	if (((attr->ia_valid & ATTR_UID) &&
+	     (!uid_eq(attr->ia_uid, hsb->s_uid))) ||
+	    ((attr->ia_valid & ATTR_GID) &&
+	     (!gid_eq(attr->ia_gid, hsb->s_gid))) ||
+	    ((attr->ia_valid & ATTR_MODE) &&
+	     ((S_ISDIR(inode->i_mode) &&
+	       (attr->ia_mode != inode->i_mode)) ||
+	      (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) {
+		return hsb->s_quiet ? 0 : error;
+	}
+
+	if (attr->ia_valid & ATTR_MODE) {
+		/* Only the 'w' bits can ever change and only all together. */
+		if (attr->ia_mode & S_IWUSR)
+			attr->ia_mode = inode->i_mode | S_IWUGO;
+		else
+			attr->ia_mode = inode->i_mode & ~S_IWUGO;
+		attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
+	}
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		inode_dio_wait(inode);
+
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+
+		truncate_setsize(inode, attr->ia_size);
+		hfs_file_truncate(inode);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			  int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block * sb;
+	int ret, err;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	flush_delayed_work(&HFS_SB(sb)->mdb_work);
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+static const struct file_operations hfs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.splice_read	= generic_file_splice_read,
+	.fsync		= hfs_file_fsync,
+	.open		= hfs_file_open,
+	.release	= hfs_file_release,
+};
+
+static const struct inode_operations hfs_file_inode_operations = {
+	.lookup		= hfs_file_lookup,
+	.setattr	= hfs_inode_setattr,
+	.setxattr	= hfs_setxattr,
+	.getxattr	= hfs_getxattr,
+	.listxattr	= hfs_listxattr,
+};
diff --git a/kernel/fs/hfs/mdb.c b/kernel/fs/hfs/mdb.c
new file mode 100644
index 000000000..aa3f0d6d0
--- /dev/null
+++ b/kernel/fs/hfs/mdb.c
@@ -0,0 +1,366 @@
+/*
+ *  linux/fs/hfs/mdb.c
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains functions for reading/writing the MDB.
+ */
+
+#include <linux/cdrom.h>
+#include <linux/genhd.h>
+#include <linux/nls.h>
+#include <linux/slab.h>
+
+#include "hfs_fs.h"
+#include "btree.h"
+
+/*================ File-local data types ================*/
+
+/*
+ * The HFS Master Directory Block (MDB).
+ *
+ * Also known as the Volume Information Block (VIB), this structure is
+ * the HFS equivalent of a superblock.
+ *
+ * Reference: _Inside Macintosh: Files_ pages 2-59 through 2-62
+ *
+ * modified for HFS Extended
+ */
+
+static int hfs_get_last_session(struct super_block *sb,
+				sector_t *start, sector_t *size)
+{
+	struct cdrom_multisession ms_info;
+	struct cdrom_tocentry te;
+	int res;
+
+	/* default values */
+	*start = 0;
+	*size = sb->s_bdev->bd_inode->i_size >> 9;
+
+	if (HFS_SB(sb)->session >= 0) {
+		te.cdte_track = HFS_SB(sb)->session;
+		te.cdte_format = CDROM_LBA;
+		res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
+		if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
+			*start = (sector_t)te.cdte_addr.lba << 2;
+			return 0;
+		}
+		pr_err("invalid session number or type of track\n");
+		return -EINVAL;
+	}
+	ms_info.addr_format = CDROM_LBA;
+	res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
+	if (!res && ms_info.xa_flag)
+		*start = (sector_t)ms_info.addr.lba << 2;
+	return 0;
+}
+
+/*
+ * hfs_mdb_get()
+ *
+ * Build the in-core MDB for a filesystem, including
+ * the B-trees and the volume bitmap.
+ */
+int hfs_mdb_get(struct super_block *sb)
+{
+	struct buffer_head *bh;
+	struct hfs_mdb *mdb, *mdb2;
+	unsigned int block;
+	char *ptr;
+	int off2, len, size, sect;
+	sector_t part_start, part_size;
+	loff_t off;
+	__be16 attrib;
+
+	/* set the device driver to 512-byte blocks */
+	size = sb_min_blocksize(sb, HFS_SECTOR_SIZE);
+	if (!size)
+		return -EINVAL;
+
+	if (hfs_get_last_session(sb, &part_start, &part_size))
+		return -EINVAL;
+	while (1) {
+		/* See if this is an HFS filesystem */
+		bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb);
+		if (!bh)
+			goto out;
+
+		if (mdb->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC))
+			break;
+		brelse(bh);
+
+		/* check for a partition block
+		 * (should do this only for cdrom/loop though)
+		 */
+		if (hfs_part_find(sb, &part_start, &part_size))
+			goto out;
+	}
+
+	HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz);
+	if (!size || (size & (HFS_SECTOR_SIZE - 1))) {
+		pr_err("bad allocation block size %d\n", size);
+		goto out_bh;
+	}
+
+	size = min(HFS_SB(sb)->alloc_blksz, (u32)PAGE_SIZE);
+	/* size must be a multiple of 512 */
+	while (size & (size - 1))
+		size -= HFS_SECTOR_SIZE;
+	sect = be16_to_cpu(mdb->drAlBlSt) + part_start;
+	/* align block size to first sector */
+	while (sect & ((size - 1) >> HFS_SECTOR_SIZE_BITS))
+		size >>= 1;
+	/* align block size to weird alloc size */
+	while (HFS_SB(sb)->alloc_blksz & (size - 1))
+		size >>= 1;
+	brelse(bh);
+	if (!sb_set_blocksize(sb, size)) {
+		pr_err("unable to set blocksize to %u\n", size);
+		goto out;
+	}
+
+	bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb);
+	if (!bh)
+		goto out;
+	if (mdb->drSigWord != cpu_to_be16(HFS_SUPER_MAGIC))
+		goto out_bh;
+
+	HFS_SB(sb)->mdb_bh = bh;
+	HFS_SB(sb)->mdb = mdb;
+
+	/* These parameters are read from the MDB, and never written */
+	HFS_SB(sb)->part_start = part_start;
+	HFS_SB(sb)->fs_ablocks = be16_to_cpu(mdb->drNmAlBlks);
+	HFS_SB(sb)->fs_div = HFS_SB(sb)->alloc_blksz >> sb->s_blocksize_bits;
+	HFS_SB(sb)->clumpablks = be32_to_cpu(mdb->drClpSiz) /
+				 HFS_SB(sb)->alloc_blksz;
+	if (!HFS_SB(sb)->clumpablks)
+		HFS_SB(sb)->clumpablks = 1;
+	HFS_SB(sb)->fs_start = (be16_to_cpu(mdb->drAlBlSt) + part_start) >>
+			       (sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS);
+
+	/* These parameters are read from and written to the MDB */
+	HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks);
+	HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID);
+	HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls);
+	HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs);
+	HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt);
+	HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt);
+
+	/* TRY to get the alternate (backup) MDB. */
+	sect = part_start + part_size - 2;
+	bh = sb_bread512(sb, sect, mdb2);
+	if (bh) {
+		if (mdb2->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) {
+			HFS_SB(sb)->alt_mdb_bh = bh;
+			HFS_SB(sb)->alt_mdb = mdb2;
+		} else
+			brelse(bh);
+	}
+
+	if (!HFS_SB(sb)->alt_mdb) {
+		pr_warn("unable to locate alternate MDB\n");
+		pr_warn("continuing without an alternate MDB\n");
+	}
+
+	HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0);
+	if (!HFS_SB(sb)->bitmap)
+		goto out;
+
+	/* read in the bitmap */
+	block = be16_to_cpu(mdb->drVBMSt) + part_start;
+	off = (loff_t)block << HFS_SECTOR_SIZE_BITS;
+	size = (HFS_SB(sb)->fs_ablocks + 8) / 8;
+	ptr = (u8 *)HFS_SB(sb)->bitmap;
+	while (size) {
+		bh = sb_bread(sb, off >> sb->s_blocksize_bits);
+		if (!bh) {
+			pr_err("unable to read volume bitmap\n");
+			goto out;
+		}
+		off2 = off & (sb->s_blocksize - 1);
+		len = min((int)sb->s_blocksize - off2, size);
+		memcpy(ptr, bh->b_data + off2, len);
+		brelse(bh);
+		ptr += len;
+		off += len;
+		size -= len;
+	}
+
+	HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp);
+	if (!HFS_SB(sb)->ext_tree) {
+		pr_err("unable to open extent tree\n");
+		goto out;
+	}
+	HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp);
+	if (!HFS_SB(sb)->cat_tree) {
+		pr_err("unable to open catalog tree\n");
+		goto out;
+	}
+
+	attrib = mdb->drAtrb;
+	if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
+		pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended.  mounting read-only.\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+	if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) {
+		pr_warn("filesystem is marked locked, mounting read-only.\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/* Mark the volume uncleanly unmounted in case we crash */
+		attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT);
+		attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT);
+		mdb->drAtrb = attrib;
+		be32_add_cpu(&mdb->drWrCnt, 1);
+		mdb->drLsMod = hfs_mtime();
+
+		mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
+	}
+
+	return 0;
+
+out_bh:
+	brelse(bh);
+out:
+	hfs_mdb_put(sb);
+	return -EIO;
+}
+
+/*
+ * hfs_mdb_commit()
+ *
+ * Description:
+ *   This updates the MDB on disk.
+ *   It does not check, if the superblock has been modified, or
+ *   if the filesystem has been mounted read-only. It is mainly
+ *   called by hfs_sync_fs() and flush_mdb().
+ * Input Variable(s):
+ *   struct hfs_mdb *mdb: Pointer to the hfs MDB
+ *   int backup;
+ * Output Variable(s):
+ *   NONE
+ * Returns:
+ *   void
+ * Preconditions:
+ *   'mdb' points to a "valid" (struct hfs_mdb).
+ * Postconditions:
+ *   The HFS MDB and on disk will be updated, by copying the possibly
+ *   modified fields from the in memory MDB (in native byte order) to
+ *   the disk block buffer.
+ *   If 'backup' is non-zero then the alternate MDB is also written
+ *   and the function doesn't return until it is actually on disk.
+ */
+void hfs_mdb_commit(struct super_block *sb)
+{
+	struct hfs_mdb *mdb = HFS_SB(sb)->mdb;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	lock_buffer(HFS_SB(sb)->mdb_bh);
+	if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) {
+		/* These parameters may have been modified, so write them back */
+		mdb->drLsMod = hfs_mtime();
+		mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks);
+		mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id);
+		mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files);
+		mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs);
+		mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count);
+		mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count);
+
+		/* write MDB to disk */
+		mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
+	}
+
+	/* write the backup MDB, not returning until it is written.
+	 * we only do this when either the catalog or extents overflow
+	 * files grow. */
+	if (test_and_clear_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags) &&
+	    HFS_SB(sb)->alt_mdb) {
+		hfs_inode_write_fork(HFS_SB(sb)->ext_tree->inode, mdb->drXTExtRec,
+				     &mdb->drXTFlSize, NULL);
+		hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec,
+				     &mdb->drCTFlSize, NULL);
+
+		lock_buffer(HFS_SB(sb)->alt_mdb_bh);
+		memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE);
+		HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
+		HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
+		unlock_buffer(HFS_SB(sb)->alt_mdb_bh);
+
+		mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
+	}
+
+	if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
+		struct buffer_head *bh;
+		sector_t block;
+		char *ptr;
+		int off, size, len;
+
+		block = be16_to_cpu(HFS_SB(sb)->mdb->drVBMSt) + HFS_SB(sb)->part_start;
+		off = (block << HFS_SECTOR_SIZE_BITS) & (sb->s_blocksize - 1);
+		block >>= sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS;
+		size = (HFS_SB(sb)->fs_ablocks + 7) / 8;
+		ptr = (u8 *)HFS_SB(sb)->bitmap;
+		while (size) {
+			bh = sb_bread(sb, block);
+			if (!bh) {
+				pr_err("unable to read volume bitmap\n");
+				break;
+			}
+			len = min((int)sb->s_blocksize - off, size);
+
+			lock_buffer(bh);
+			memcpy(bh->b_data + off, ptr, len);
+			unlock_buffer(bh);
+
+			mark_buffer_dirty(bh);
+			brelse(bh);
+			block++;
+			off = 0;
+			ptr += len;
+			size -= len;
+		}
+	}
+	unlock_buffer(HFS_SB(sb)->mdb_bh);
+}
+
+void hfs_mdb_close(struct super_block *sb)
+{
+	/* update volume attributes */
+	if (sb->s_flags & MS_RDONLY)
+		return;
+	HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
+	HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
+	mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
+}
+
+/*
+ * hfs_mdb_put()
+ *
+ * Release the resources associated with the in-core MDB.  */
+void hfs_mdb_put(struct super_block *sb)
+{
+	if (!HFS_SB(sb))
+		return;
+	/* free the B-trees */
+	hfs_btree_close(HFS_SB(sb)->ext_tree);
+	hfs_btree_close(HFS_SB(sb)->cat_tree);
+
+	/* free the buffers holding the primary and alternate MDBs */
+	brelse(HFS_SB(sb)->mdb_bh);
+	brelse(HFS_SB(sb)->alt_mdb_bh);
+
+	unload_nls(HFS_SB(sb)->nls_io);
+	unload_nls(HFS_SB(sb)->nls_disk);
+
+	free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
+	kfree(HFS_SB(sb));
+	sb->s_fs_info = NULL;
+}
diff --git a/kernel/fs/hfs/part_tbl.c b/kernel/fs/hfs/part_tbl.c
new file mode 100644
index 000000000..36add537d
--- /dev/null
+++ b/kernel/fs/hfs/part_tbl.c
@@ -0,0 +1,117 @@
+/*
+ *  linux/fs/hfs/part_tbl.c
+ *
+ * Copyright (C) 1996-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * Original code to handle the new style Mac partition table based on
+ * a patch contributed by Holger Schemel (aeglos@valinor.owl.de).
+ */
+
+#include "hfs_fs.h"
+
+/*
+ * The new style Mac partition map
+ *
+ * For each partition on the media there is a physical block (512-byte
+ * block) containing one of these structures.  These blocks are
+ * contiguous starting at block 1.
+ */
+struct new_pmap {
+	__be16	pmSig;		/* signature */
+	__be16	reSigPad;	/* padding */
+	__be32	pmMapBlkCnt;	/* partition blocks count */
+	__be32	pmPyPartStart;	/* physical block start of partition */
+	__be32	pmPartBlkCnt;	/* physical block count of partition */
+	u8	pmPartName[32];	/* (null terminated?) string
+				   giving the name of this
+				   partition */
+	u8	pmPartType[32];	/* (null terminated?) string
+				   giving the type of this
+				   partition */
+	/* a bunch more stuff we don't need */
+} __packed;
+
+/*
+ * The old style Mac partition map
+ *
+ * The partition map consists for a 2-byte signature followed by an
+ * array of these structures.  The map is terminated with an all-zero
+ * one of these.
+ */
+struct old_pmap {
+	__be16		pdSig;	/* Signature bytes */
+	struct 	old_pmap_entry {
+		__be32	pdStart;
+		__be32	pdSize;
+		__be32	pdFSID;
+	}	pdEntry[42];
+} __packed;
+
+/*
+ * hfs_part_find()
+ *
+ * Parse the partition map looking for the
+ * start and length of the 'part'th HFS partition.
+ */
+int hfs_part_find(struct super_block *sb,
+		  sector_t *part_start, sector_t *part_size)
+{
+	struct buffer_head *bh;
+	__be16 *data;
+	int i, size, res;
+
+	res = -ENOENT;
+	bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data);
+	if (!bh)
+		return -EIO;
+
+	switch (be16_to_cpu(*data)) {
+	case HFS_OLD_PMAP_MAGIC:
+	  {
+		struct old_pmap *pm;
+		struct old_pmap_entry *p;
+
+		pm = (struct old_pmap *)bh->b_data;
+		p = pm->pdEntry;
+		size = 42;
+		for (i = 0; i < size; p++, i++) {
+			if (p->pdStart && p->pdSize &&
+			    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
+			    (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) {
+				*part_start += be32_to_cpu(p->pdStart);
+				*part_size = be32_to_cpu(p->pdSize);
+				res = 0;
+			}
+		}
+		break;
+	  }
+	case HFS_NEW_PMAP_MAGIC:
+	  {
+		struct new_pmap *pm;
+
+		pm = (struct new_pmap *)bh->b_data;
+		size = be32_to_cpu(pm->pmMapBlkCnt);
+		for (i = 0; i < size;) {
+			if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
+			    (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) {
+				*part_start += be32_to_cpu(pm->pmPyPartStart);
+				*part_size = be32_to_cpu(pm->pmPartBlkCnt);
+				res = 0;
+				break;
+			}
+			brelse(bh);
+			bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm);
+			if (!bh)
+				return -EIO;
+			if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC))
+				break;
+		}
+		break;
+	  }
+	}
+	brelse(bh);
+
+	return res;
+}
diff --git a/kernel/fs/hfs/string.c b/kernel/fs/hfs/string.c
new file mode 100644
index 000000000..85b610c39
--- /dev/null
+++ b/kernel/fs/hfs/string.c
@@ -0,0 +1,114 @@
+/*
+ *  linux/fs/hfs/string.c
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains the string comparison function for the
+ * Macintosh character set.
+ *
+ * The code in this file is derived from code which is copyright
+ * 1986, 1989, 1990 by Abacus Research and Development, Inc. (ARDI)
+ * It is used here by the permission of ARDI's president Cliff Matthews.
+ */
+
+#include "hfs_fs.h"
+#include <linux/dcache.h>
+
+/*================ File-local variables ================*/
+
+/*
+ * unsigned char caseorder[]
+ *
+ * Defines the lexical ordering of characters on the Macintosh
+ *
+ * Composition of the 'casefold' and 'order' tables from ARDI's code
+ * with the entry for 0x20 changed to match that for 0xCA to remove
+ * special case for those two characters.
+ */
+static unsigned char caseorder[256] = {
+	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
+	0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
+	0x20,0x22,0x23,0x28,0x29,0x2A,0x2B,0x2C,0x2F,0x30,0x31,0x32,0x33,0x34,0x35,0x36,
+	0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,0x40,0x41,0x42,0x43,0x44,0x45,0x46,
+	0x47,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E,
+	0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xA9,0xAA,0xAB,0xAC,0xAD,
+	0x4E,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E,
+	0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xAF,0xB0,0xB1,0xB2,0xB3,
+	0x4A,0x4C,0x5A,0x60,0x7B,0x7F,0x98,0x4F,0x49,0x51,0x4A,0x4B,0x4C,0x5A,0x60,0x63,
+	0x64,0x65,0x6E,0x6F,0x70,0x71,0x7B,0x84,0x85,0x86,0x7F,0x80,0x9A,0x9B,0x9C,0x98,
+	0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0x94,0xBB,0xBC,0xBD,0xBE,0xBF,0xC0,0x4D,0x81,
+	0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0x55,0x8A,0xCC,0x4D,0x81,
+	0xCD,0xCE,0xCF,0xD0,0xD1,0xD2,0xD3,0x26,0x27,0xD4,0x20,0x49,0x4B,0x80,0x82,0x82,
+	0xD5,0xD6,0x24,0x25,0x2D,0x2E,0xD7,0xD8,0xA6,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
+	0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
+	0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
+};
+
+/*================ Global functions ================*/
+
+/*
+ * Hash a string to an integer in a case-independent way
+ */
+int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
+{
+	const unsigned char *name = this->name;
+	unsigned int hash, len = this->len;
+
+	if (len > HFS_NAMELEN)
+		len = HFS_NAMELEN;
+
+	hash = init_name_hash();
+	for (; len; len--)
+		hash = partial_name_hash(caseorder[*name++], hash);
+	this->hash = end_name_hash(hash);
+	return 0;
+}
+
+/*
+ * Compare two strings in the HFS filename character ordering
+ * Returns positive, negative, or zero, not just 0 or (+/-)1
+ *
+ * Equivalent to ARDI's call:
+ *	ROMlib_RelString(s1+1, s2+1, true, false, (s1[0]<<16) | s2[0])
+ */
+int hfs_strcmp(const unsigned char *s1, unsigned int len1,
+	       const unsigned char *s2, unsigned int len2)
+{
+	int len, tmp;
+
+	len = (len1 > len2) ? len2 : len1;
+
+	while (len--) {
+		tmp = (int)caseorder[*(s1++)] - (int)caseorder[*(s2++)];
+		if (tmp)
+			return tmp;
+	}
+	return len1 - len2;
+}
+
+/*
+ * Test for equality of two strings in the HFS filename character ordering.
+ * return 1 on failure and 0 on success
+ */
+int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	const unsigned char *n1, *n2;
+
+	if (len >= HFS_NAMELEN) {
+		if (name->len < HFS_NAMELEN)
+			return 1;
+		len = HFS_NAMELEN;
+	} else if (len != name->len)
+		return 1;
+
+	n1 = str;
+	n2 = name->name;
+	while (len--) {
+		if (caseorder[*n1++] != caseorder[*n2++])
+			return 1;
+	}
+	return 0;
+}
diff --git a/kernel/fs/hfs/super.c b/kernel/fs/hfs/super.c
new file mode 100644
index 000000000..eee7206c3
--- /dev/null
+++ b/kernel/fs/hfs/super.c
@@ -0,0 +1,508 @@
+/*
+ *  linux/fs/hfs/super.c
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains hfs_read_super(), some of the super_ops and
+ * init_hfs_fs() and exit_hfs_fs().  The remaining super_ops are in
+ * inode.c since they deal with inodes.
+ *
+ * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
+ */
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/nls.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/vfs.h>
+
+#include "hfs_fs.h"
+#include "btree.h"
+
+static struct kmem_cache *hfs_inode_cachep;
+
+MODULE_LICENSE("GPL");
+
+static int hfs_sync_fs(struct super_block *sb, int wait)
+{
+	hfs_mdb_commit(sb);
+	return 0;
+}
+
+/*
+ * hfs_put_super()
+ *
+ * This is the put_super() entry in the super_operations structure for
+ * HFS filesystems.  The purpose is to release the resources
+ * associated with the superblock sb.
+ */
+static void hfs_put_super(struct super_block *sb)
+{
+	cancel_delayed_work_sync(&HFS_SB(sb)->mdb_work);
+	hfs_mdb_close(sb);
+	/* release the MDB's resources */
+	hfs_mdb_put(sb);
+}
+
+static void flush_mdb(struct work_struct *work)
+{
+	struct hfs_sb_info *sbi;
+	struct super_block *sb;
+
+	sbi = container_of(work, struct hfs_sb_info, mdb_work.work);
+	sb = sbi->sb;
+
+	spin_lock(&sbi->work_lock);
+	sbi->work_queued = 0;
+	spin_unlock(&sbi->work_lock);
+
+	hfs_mdb_commit(sb);
+}
+
+void hfs_mark_mdb_dirty(struct super_block *sb)
+{
+	struct hfs_sb_info *sbi = HFS_SB(sb);
+	unsigned long delay;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	spin_lock(&sbi->work_lock);
+	if (!sbi->work_queued) {
+		delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+		queue_delayed_work(system_long_wq, &sbi->mdb_work, delay);
+		sbi->work_queued = 1;
+	}
+	spin_unlock(&sbi->work_lock);
+}
+
+/*
+ * hfs_statfs()
+ *
+ * This is the statfs() entry in the super_operations structure for
+ * HFS filesystems.  The purpose is to return various data about the
+ * filesystem.
+ *
+ * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks.
+ */
+static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type = HFS_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div;
+	buf->f_bfree = (u32)HFS_SB(sb)->free_ablocks * HFS_SB(sb)->fs_div;
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = HFS_SB(sb)->fs_ablocks;
+	buf->f_ffree = HFS_SB(sb)->free_ablocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = HFS_NAMELEN;
+
+	return 0;
+}
+
+static int hfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_NODIRATIME;
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		return 0;
+	if (!(*flags & MS_RDONLY)) {
+		if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
+			pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended.  leaving read-only.\n");
+			sb->s_flags |= MS_RDONLY;
+			*flags |= MS_RDONLY;
+		} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
+			pr_warn("filesystem is marked locked, leaving read-only.\n");
+			sb->s_flags |= MS_RDONLY;
+			*flags |= MS_RDONLY;
+		}
+	}
+	return 0;
+}
+
+static int hfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
+
+	if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
+		seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+	if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
+		seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+	seq_printf(seq, ",uid=%u,gid=%u",
+			from_kuid_munged(&init_user_ns, sbi->s_uid),
+			from_kgid_munged(&init_user_ns, sbi->s_gid));
+	if (sbi->s_file_umask != 0133)
+		seq_printf(seq, ",file_umask=%o", sbi->s_file_umask);
+	if (sbi->s_dir_umask != 0022)
+		seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask);
+	if (sbi->part >= 0)
+		seq_printf(seq, ",part=%u", sbi->part);
+	if (sbi->session >= 0)
+		seq_printf(seq, ",session=%u", sbi->session);
+	if (sbi->nls_disk)
+		seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset);
+	if (sbi->nls_io)
+		seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset);
+	if (sbi->s_quiet)
+		seq_printf(seq, ",quiet");
+	return 0;
+}
+
+static struct inode *hfs_alloc_inode(struct super_block *sb)
+{
+	struct hfs_inode_info *i;
+
+	i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL);
+	return i ? &i->vfs_inode : NULL;
+}
+
+static void hfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
+}
+
+static void hfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hfs_i_callback);
+}
+
+static const struct super_operations hfs_super_operations = {
+	.alloc_inode	= hfs_alloc_inode,
+	.destroy_inode	= hfs_destroy_inode,
+	.write_inode	= hfs_write_inode,
+	.evict_inode	= hfs_evict_inode,
+	.put_super	= hfs_put_super,
+	.sync_fs	= hfs_sync_fs,
+	.statfs		= hfs_statfs,
+	.remount_fs     = hfs_remount,
+	.show_options	= hfs_show_options,
+};
+
+enum {
+	opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask,
+	opt_part, opt_session, opt_type, opt_creator, opt_quiet,
+	opt_codepage, opt_iocharset,
+	opt_err
+};
+
+static const match_table_t tokens = {
+	{ opt_uid, "uid=%u" },
+	{ opt_gid, "gid=%u" },
+	{ opt_umask, "umask=%o" },
+	{ opt_file_umask, "file_umask=%o" },
+	{ opt_dir_umask, "dir_umask=%o" },
+	{ opt_part, "part=%u" },
+	{ opt_session, "session=%u" },
+	{ opt_type, "type=%s" },
+	{ opt_creator, "creator=%s" },
+	{ opt_quiet, "quiet" },
+	{ opt_codepage, "codepage=%s" },
+	{ opt_iocharset, "iocharset=%s" },
+	{ opt_err, NULL }
+};
+
+static inline int match_fourchar(substring_t *arg, u32 *result)
+{
+	if (arg->to - arg->from != 4)
+		return -EINVAL;
+	memcpy(result, arg->from, 4);
+	return 0;
+}
+
+/*
+ * parse_options()
+ *
+ * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger
+ * This function is called by hfs_read_super() to parse the mount options.
+ */
+static int parse_options(char *options, struct hfs_sb_info *hsb)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int tmp, token;
+
+	/* initialize the sb with defaults */
+	hsb->s_uid = current_uid();
+	hsb->s_gid = current_gid();
+	hsb->s_file_umask = 0133;
+	hsb->s_dir_umask = 0022;
+	hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f);	/* == '????' */
+	hsb->s_quiet = 0;
+	hsb->part = -1;
+	hsb->session = -1;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case opt_uid:
+			if (match_int(&args[0], &tmp)) {
+				pr_err("uid requires an argument\n");
+				return 0;
+			}
+			hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp);
+			if (!uid_valid(hsb->s_uid)) {
+				pr_err("invalid uid %d\n", tmp);
+				return 0;
+			}
+			break;
+		case opt_gid:
+			if (match_int(&args[0], &tmp)) {
+				pr_err("gid requires an argument\n");
+				return 0;
+			}
+			hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp);
+			if (!gid_valid(hsb->s_gid)) {
+				pr_err("invalid gid %d\n", tmp);
+				return 0;
+			}
+			break;
+		case opt_umask:
+			if (match_octal(&args[0], &tmp)) {
+				pr_err("umask requires a value\n");
+				return 0;
+			}
+			hsb->s_file_umask = (umode_t)tmp;
+			hsb->s_dir_umask = (umode_t)tmp;
+			break;
+		case opt_file_umask:
+			if (match_octal(&args[0], &tmp)) {
+				pr_err("file_umask requires a value\n");
+				return 0;
+			}
+			hsb->s_file_umask = (umode_t)tmp;
+			break;
+		case opt_dir_umask:
+			if (match_octal(&args[0], &tmp)) {
+				pr_err("dir_umask requires a value\n");
+				return 0;
+			}
+			hsb->s_dir_umask = (umode_t)tmp;
+			break;
+		case opt_part:
+			if (match_int(&args[0], &hsb->part)) {
+				pr_err("part requires an argument\n");
+				return 0;
+			}
+			break;
+		case opt_session:
+			if (match_int(&args[0], &hsb->session)) {
+				pr_err("session requires an argument\n");
+				return 0;
+			}
+			break;
+		case opt_type:
+			if (match_fourchar(&args[0], &hsb->s_type)) {
+				pr_err("type requires a 4 character value\n");
+				return 0;
+			}
+			break;
+		case opt_creator:
+			if (match_fourchar(&args[0], &hsb->s_creator)) {
+				pr_err("creator requires a 4 character value\n");
+				return 0;
+			}
+			break;
+		case opt_quiet:
+			hsb->s_quiet = 1;
+			break;
+		case opt_codepage:
+			if (hsb->nls_disk) {
+				pr_err("unable to change codepage\n");
+				return 0;
+			}
+			p = match_strdup(&args[0]);
+			if (p)
+				hsb->nls_disk = load_nls(p);
+			if (!hsb->nls_disk) {
+				pr_err("unable to load codepage \"%s\"\n", p);
+				kfree(p);
+				return 0;
+			}
+			kfree(p);
+			break;
+		case opt_iocharset:
+			if (hsb->nls_io) {
+				pr_err("unable to change iocharset\n");
+				return 0;
+			}
+			p = match_strdup(&args[0]);
+			if (p)
+				hsb->nls_io = load_nls(p);
+			if (!hsb->nls_io) {
+				pr_err("unable to load iocharset \"%s\"\n", p);
+				kfree(p);
+				return 0;
+			}
+			kfree(p);
+			break;
+		default:
+			return 0;
+		}
+	}
+
+	if (hsb->nls_disk && !hsb->nls_io) {
+		hsb->nls_io = load_nls_default();
+		if (!hsb->nls_io) {
+			pr_err("unable to load default iocharset\n");
+			return 0;
+		}
+	}
+	hsb->s_dir_umask &= 0777;
+	hsb->s_file_umask &= 0577;
+
+	return 1;
+}
+
+/*
+ * hfs_read_super()
+ *
+ * This is the function that is responsible for mounting an HFS
+ * filesystem.	It performs all the tasks necessary to get enough data
+ * from the disk to read the root inode.  This includes parsing the
+ * mount options, dealing with Macintosh partitions, reading the
+ * superblock and the allocation bitmap blocks, calling
+ * hfs_btree_init() to get the necessary data about the extents and
+ * catalog B-trees and, finally, reading the root inode into memory.
+ */
+static int hfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct hfs_sb_info *sbi;
+	struct hfs_find_data fd;
+	hfs_cat_rec rec;
+	struct inode *root_inode;
+	int res;
+
+	sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sbi->sb = sb;
+	sb->s_fs_info = sbi;
+	spin_lock_init(&sbi->work_lock);
+	INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb);
+
+	res = -EINVAL;
+	if (!parse_options((char *)data, sbi)) {
+		pr_err("unable to parse mount options\n");
+		goto bail;
+	}
+
+	sb->s_op = &hfs_super_operations;
+	sb->s_flags |= MS_NODIRATIME;
+	mutex_init(&sbi->bitmap_lock);
+
+	res = hfs_mdb_get(sb);
+	if (res) {
+		if (!silent)
+			pr_warn("can't find a HFS filesystem on dev %s\n",
+				hfs_mdb_name(sb));
+		res = -EINVAL;
+		goto bail;
+	}
+
+	/* try to get the root inode */
+	res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	if (res)
+		goto bail_no_root;
+	res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
+	if (!res) {
+		if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
+			res =  -EIO;
+			goto bail;
+		}
+		hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
+	}
+	if (res) {
+		hfs_find_exit(&fd);
+		goto bail_no_root;
+	}
+	res = -EINVAL;
+	root_inode = hfs_iget(sb, &fd.search_key->cat, &rec);
+	hfs_find_exit(&fd);
+	if (!root_inode)
+		goto bail_no_root;
+
+	sb->s_d_op = &hfs_dentry_operations;
+	res = -ENOMEM;
+	sb->s_root = d_make_root(root_inode);
+	if (!sb->s_root)
+		goto bail_no_root;
+
+	/* everything's okay */
+	return 0;
+
+bail_no_root:
+	pr_err("get root inode failed\n");
+bail:
+	hfs_mdb_put(sb);
+	return res;
+}
+
+static struct dentry *hfs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+}
+
+static struct file_system_type hfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "hfs",
+	.mount		= hfs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("hfs");
+
+static void hfs_init_once(void *p)
+{
+	struct hfs_inode_info *i = p;
+
+	inode_init_once(&i->vfs_inode);
+}
+
+static int __init init_hfs_fs(void)
+{
+	int err;
+
+	hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
+		sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
+		hfs_init_once);
+	if (!hfs_inode_cachep)
+		return -ENOMEM;
+	err = register_filesystem(&hfs_fs_type);
+	if (err)
+		kmem_cache_destroy(hfs_inode_cachep);
+	return err;
+}
+
+static void __exit exit_hfs_fs(void)
+{
+	unregister_filesystem(&hfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(hfs_inode_cachep);
+}
+
+module_init(init_hfs_fs)
+module_exit(exit_hfs_fs)
diff --git a/kernel/fs/hfs/sysdep.c b/kernel/fs/hfs/sysdep.c
new file mode 100644
index 000000000..2875961fd
--- /dev/null
+++ b/kernel/fs/hfs/sysdep.c
@@ -0,0 +1,45 @@
+/*
+ *  linux/fs/hfs/sysdep.c
+ *
+ * Copyright (C) 1996  Paul H. Hargrove
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains the code to do various system dependent things.
+ */
+
+#include <linux/namei.h>
+#include "hfs_fs.h"
+
+/* dentry case-handling: just lowercase everything */
+
+static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode;
+	int diff;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = d_inode(dentry);
+	if(!inode)
+		return 1;
+
+	/* fix up inode on a timezone change */
+	diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest;
+	if (diff) {
+		inode->i_ctime.tv_sec += diff;
+		inode->i_atime.tv_sec += diff;
+		inode->i_mtime.tv_sec += diff;
+		HFS_I(inode)->tz_secondswest += diff;
+	}
+	return 1;
+}
+
+const struct dentry_operations hfs_dentry_operations =
+{
+	.d_revalidate	= hfs_revalidate_dentry,
+	.d_hash		= hfs_hash_dentry,
+	.d_compare	= hfs_compare_dentry,
+};
+
diff --git a/kernel/fs/hfs/trans.c b/kernel/fs/hfs/trans.c
new file mode 100644
index 000000000..b1ce4c7ad
--- /dev/null
+++ b/kernel/fs/hfs/trans.c
@@ -0,0 +1,150 @@
+/*
+ *  linux/fs/hfs/trans.c
+ *
+ * Copyright (C) 1995-1997  Paul H. Hargrove
+ * This file may be distributed under the terms of the GNU General Public License.
+ *
+ * This file contains routines for converting between the Macintosh
+ * character set and various other encodings.  This includes dealing
+ * with ':' vs. '/' as the path-element separator.
+ */
+
+#include <linux/types.h>
+#include <linux/nls.h>
+
+#include "hfs_fs.h"
+
+/*================ Global functions ================*/
+
+/*
+ * hfs_mac2asc()
+ *
+ * Given a 'Pascal String' (a string preceded by a length byte) in
+ * the Macintosh character set produce the corresponding filename using
+ * the 'trivial' name-mangling scheme, returning the length of the
+ * mangled filename.  Note that the output string is not NULL
+ * terminated.
+ *
+ * The name-mangling works as follows:
+ * The character '/', which is illegal in Linux filenames is replaced
+ * by ':' which never appears in HFS filenames.	 All other characters
+ * are passed unchanged from input to output.
+ */
+int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
+{
+	struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
+	struct nls_table *nls_io = HFS_SB(sb)->nls_io;
+	const char *src;
+	char *dst;
+	int srclen, dstlen, size;
+
+	src = in->name;
+	srclen = in->len;
+	if (srclen > HFS_NAMELEN)
+		srclen = HFS_NAMELEN;
+	dst = out;
+	dstlen = HFS_MAX_NAMELEN;
+	if (nls_io) {
+		wchar_t ch;
+
+		while (srclen > 0) {
+			if (nls_disk) {
+				size = nls_disk->char2uni(src, srclen, &ch);
+				if (size <= 0) {
+					ch = '?';
+					size = 1;
+				}
+				src += size;
+				srclen -= size;
+			} else {
+				ch = *src++;
+				srclen--;
+			}
+			if (ch == '/')
+				ch = ':';
+			size = nls_io->uni2char(ch, dst, dstlen);
+			if (size < 0) {
+				if (size == -ENAMETOOLONG)
+					goto out;
+				*dst = '?';
+				size = 1;
+			}
+			dst += size;
+			dstlen -= size;
+		}
+	} else {
+		char ch;
+
+		while (--srclen >= 0)
+			*dst++ = (ch = *src++) == '/' ? ':' : ch;
+	}
+out:
+	return dst - out;
+}
+
+/*
+ * hfs_asc2mac()
+ *
+ * Given an ASCII string (not null-terminated) and its length,
+ * generate the corresponding filename in the Macintosh character set
+ * using the 'trivial' name-mangling scheme, returning the length of
+ * the mangled filename.  Note that the output string is not NULL
+ * terminated.
+ *
+ * This routine is a inverse to hfs_mac2triv().
+ * A ':' is replaced by a '/'.
+ */
+void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, struct qstr *in)
+{
+	struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
+	struct nls_table *nls_io = HFS_SB(sb)->nls_io;
+	const char *src;
+	char *dst;
+	int srclen, dstlen, size;
+
+	src = in->name;
+	srclen = in->len;
+	dst = out->name;
+	dstlen = HFS_NAMELEN;
+	if (nls_io) {
+		wchar_t ch;
+
+		while (srclen > 0) {
+			size = nls_io->char2uni(src, srclen, &ch);
+			if (size < 0) {
+				ch = '?';
+				size = 1;
+			}
+			src += size;
+			srclen -= size;
+			if (ch == ':')
+				ch = '/';
+			if (nls_disk) {
+				size = nls_disk->uni2char(ch, dst, dstlen);
+				if (size < 0) {
+					if (size == -ENAMETOOLONG)
+						goto out;
+					*dst = '?';
+					size = 1;
+				}
+				dst += size;
+				dstlen -= size;
+			} else {
+				*dst++ = ch > 0xff ? '?' : ch;
+				dstlen--;
+			}
+		}
+	} else {
+		char ch;
+
+		if (dstlen > srclen)
+			dstlen = srclen;
+		while (--dstlen >= 0)
+			*dst++ = (ch = *src++) == ':' ? '/' : ch;
+	}
+out:
+	out->len = dst - (char *)out->name;
+	dstlen = HFS_NAMELEN - out->len;
+	while (--dstlen >= 0)
+		*dst++ = 0;
+}
diff --git a/kernel/fs/hfsplus/Kconfig b/kernel/fs/hfsplus/Kconfig
new file mode 100644
index 000000000..24bc20fd4
--- /dev/null
+++ b/kernel/fs/hfsplus/Kconfig
@@ -0,0 +1,31 @@
+config HFSPLUS_FS
+	tristate "Apple Extended HFS file system support"
+	depends on BLOCK
+	select NLS
+	select NLS_UTF8
+	help
+	  If you say Y here, you will be able to mount extended format
+	  Macintosh-formatted hard drive partitions with full read-write access.
+
+	  This file system is often called HFS+ and was introduced with
+	  MacOS 8. It includes all Mac specific filesystem data such as
+	  data forks and creator codes, but it also has several UNIX
+	  style features such as file ownership and permissions.
+
+config HFSPLUS_FS_POSIX_ACL
+	bool "HFS+ POSIX Access Control Lists"
+	depends on HFSPLUS_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  It needs to understand that POSIX ACLs are treated only under
+	  Linux. POSIX ACLs doesn't mean something under Mac OS X.
+	  Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs,
+	  which are part of the NFSv4 standard.
+
+	  If you don't know what Access Control Lists are, say N
diff --git a/kernel/fs/hfsplus/Makefile b/kernel/fs/hfsplus/Makefile
new file mode 100644
index 000000000..683fca2e5
--- /dev/null
+++ b/kernel/fs/hfsplus/Makefile
@@ -0,0 +1,11 @@
+#
+## Makefile for the linux hfsplus filesystem routines.
+#
+
+obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o
+
+hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \
+		bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \
+		attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o
+
+hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL)	+= posix_acl.o
diff --git a/kernel/fs/hfsplus/acl.h b/kernel/fs/hfsplus/acl.h
new file mode 100644
index 000000000..95c8ed9ec
--- /dev/null
+++ b/kernel/fs/hfsplus/acl.h
@@ -0,0 +1,27 @@
+/*
+ * linux/fs/hfsplus/acl.h
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for Posix Access Control Lists (ACLs) support.
+ */
+
+#include <linux/posix_acl_xattr.h>
+
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+
+/* posix_acl.c */
+struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type);
+int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
+		int type);
+extern int hfsplus_init_posix_acl(struct inode *, struct inode *);
+
+#else  /* CONFIG_HFSPLUS_FS_POSIX_ACL */
+#define hfsplus_get_posix_acl NULL
+#define hfsplus_set_posix_acl NULL
+
+static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif  /* CONFIG_HFSPLUS_FS_POSIX_ACL */
diff --git a/kernel/fs/hfsplus/attributes.c b/kernel/fs/hfsplus/attributes.c
new file mode 100644
index 000000000..e5b221de7
--- /dev/null
+++ b/kernel/fs/hfsplus/attributes.c
@@ -0,0 +1,371 @@
+/*
+ * linux/fs/hfsplus/attributes.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handling of records in attributes tree
+ */
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+static struct kmem_cache *hfsplus_attr_tree_cachep;
+
+int __init hfsplus_create_attr_tree_cache(void)
+{
+	if (hfsplus_attr_tree_cachep)
+		return -EEXIST;
+
+	hfsplus_attr_tree_cachep =
+		kmem_cache_create("hfsplus_attr_cache",
+			sizeof(hfsplus_attr_entry), 0,
+			SLAB_HWCACHE_ALIGN, NULL);
+	if (!hfsplus_attr_tree_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void hfsplus_destroy_attr_tree_cache(void)
+{
+	kmem_cache_destroy(hfsplus_attr_tree_cachep);
+}
+
+int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1,
+				const hfsplus_btree_key *k2)
+{
+	__be32 k1_cnid, k2_cnid;
+
+	k1_cnid = k1->attr.cnid;
+	k2_cnid = k2->attr.cnid;
+	if (k1_cnid != k2_cnid)
+		return be32_to_cpu(k1_cnid) < be32_to_cpu(k2_cnid) ? -1 : 1;
+
+	return hfsplus_strcmp(
+			(const struct hfsplus_unistr *)&k1->attr.key_name,
+			(const struct hfsplus_unistr *)&k2->attr.key_name);
+}
+
+int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
+			u32 cnid, const char *name)
+{
+	int len;
+
+	memset(key, 0, sizeof(struct hfsplus_attr_key));
+	key->attr.cnid = cpu_to_be32(cnid);
+	if (name) {
+		int res = hfsplus_asc2uni(sb,
+				(struct hfsplus_unistr *)&key->attr.key_name,
+				HFSPLUS_ATTR_MAX_STRLEN, name, strlen(name));
+		if (res)
+			return res;
+		len = be16_to_cpu(key->attr.key_name.length);
+	} else {
+		key->attr.key_name.length = 0;
+		len = 0;
+	}
+
+	/* The length of the key, as stored in key_len field, does not include
+	 * the size of the key_len field itself.
+	 * So, offsetof(hfsplus_attr_key, key_name) is a trick because
+	 * it takes into consideration key_len field (__be16) of
+	 * hfsplus_attr_key structure instead of length field (__be16) of
+	 * hfsplus_attr_unistr structure.
+	 */
+	key->key_len =
+		cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) +
+				2 * len);
+
+	return 0;
+}
+
+hfsplus_attr_entry *hfsplus_alloc_attr_entry(void)
+{
+	return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL);
+}
+
+void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry)
+{
+	if (entry)
+		kmem_cache_free(hfsplus_attr_tree_cachep, entry);
+}
+
+#define HFSPLUS_INVALID_ATTR_RECORD -1
+
+static int hfsplus_attr_build_record(hfsplus_attr_entry *entry, int record_type,
+				u32 cnid, const void *value, size_t size)
+{
+	if (record_type == HFSPLUS_ATTR_FORK_DATA) {
+		/*
+		 * Mac OS X supports only inline data attributes.
+		 * Do nothing
+		 */
+		memset(entry, 0, sizeof(*entry));
+		return sizeof(struct hfsplus_attr_fork_data);
+	} else if (record_type == HFSPLUS_ATTR_EXTENTS) {
+		/*
+		 * Mac OS X supports only inline data attributes.
+		 * Do nothing.
+		 */
+		memset(entry, 0, sizeof(*entry));
+		return sizeof(struct hfsplus_attr_extents);
+	} else if (record_type == HFSPLUS_ATTR_INLINE_DATA) {
+		u16 len;
+
+		memset(entry, 0, sizeof(struct hfsplus_attr_inline_data));
+		entry->inline_data.record_type = cpu_to_be32(record_type);
+		if (size <= HFSPLUS_MAX_INLINE_DATA_SIZE)
+			len = size;
+		else
+			return HFSPLUS_INVALID_ATTR_RECORD;
+		entry->inline_data.length = cpu_to_be16(len);
+		memcpy(entry->inline_data.raw_bytes, value, len);
+		/*
+		 * Align len on two-byte boundary.
+		 * It needs to add pad byte if we have odd len.
+		 */
+		len = round_up(len, 2);
+		return offsetof(struct hfsplus_attr_inline_data, raw_bytes) +
+					len;
+	} else /* invalid input */
+		memset(entry, 0, sizeof(*entry));
+
+	return HFSPLUS_INVALID_ATTR_RECORD;
+}
+
+int hfsplus_find_attr(struct super_block *sb, u32 cnid,
+			const char *name, struct hfs_find_data *fd)
+{
+	int err = 0;
+
+	hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid);
+
+	if (!HFSPLUS_SB(sb)->attr_tree) {
+		pr_err("attributes file doesn't exist\n");
+		return -EINVAL;
+	}
+
+	if (name) {
+		err = hfsplus_attr_build_key(sb, fd->search_key, cnid, name);
+		if (err)
+			goto failed_find_attr;
+		err = hfs_brec_find(fd, hfs_find_rec_by_key);
+		if (err)
+			goto failed_find_attr;
+	} else {
+		err = hfsplus_attr_build_key(sb, fd->search_key, cnid, NULL);
+		if (err)
+			goto failed_find_attr;
+		err = hfs_brec_find(fd, hfs_find_1st_rec_by_cnid);
+		if (err)
+			goto failed_find_attr;
+	}
+
+failed_find_attr:
+	return err;
+}
+
+int hfsplus_attr_exists(struct inode *inode, const char *name)
+{
+	int err = 0;
+	struct super_block *sb = inode->i_sb;
+	struct hfs_find_data fd;
+
+	if (!HFSPLUS_SB(sb)->attr_tree)
+		return 0;
+
+	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+	if (err)
+		return 0;
+
+	err = hfsplus_find_attr(sb, inode->i_ino, name, &fd);
+	if (err)
+		goto attr_not_found;
+
+	hfs_find_exit(&fd);
+	return 1;
+
+attr_not_found:
+	hfs_find_exit(&fd);
+	return 0;
+}
+
+int hfsplus_create_attr(struct inode *inode,
+				const char *name,
+				const void *value, size_t size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hfs_find_data fd;
+	hfsplus_attr_entry *entry_ptr;
+	int entry_size;
+	int err;
+
+	hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n",
+		name ? name : NULL, inode->i_ino);
+
+	if (!HFSPLUS_SB(sb)->attr_tree) {
+		pr_err("attributes file doesn't exist\n");
+		return -EINVAL;
+	}
+
+	entry_ptr = hfsplus_alloc_attr_entry();
+	if (!entry_ptr)
+		return -ENOMEM;
+
+	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+	if (err)
+		goto failed_init_create_attr;
+
+	if (name) {
+		err = hfsplus_attr_build_key(sb, fd.search_key,
+						inode->i_ino, name);
+		if (err)
+			goto failed_create_attr;
+	} else {
+		err = -EINVAL;
+		goto failed_create_attr;
+	}
+
+	/* Mac OS X supports only inline data attributes. */
+	entry_size = hfsplus_attr_build_record(entry_ptr,
+					HFSPLUS_ATTR_INLINE_DATA,
+					inode->i_ino,
+					value, size);
+	if (entry_size == HFSPLUS_INVALID_ATTR_RECORD) {
+		err = -EINVAL;
+		goto failed_create_attr;
+	}
+
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err != -ENOENT) {
+		if (!err)
+			err = -EEXIST;
+		goto failed_create_attr;
+	}
+
+	err = hfs_brec_insert(&fd, entry_ptr, entry_size);
+	if (err)
+		goto failed_create_attr;
+
+	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY);
+
+failed_create_attr:
+	hfs_find_exit(&fd);
+
+failed_init_create_attr:
+	hfsplus_destroy_attr_entry(entry_ptr);
+	return err;
+}
+
+static int __hfsplus_delete_attr(struct inode *inode, u32 cnid,
+					struct hfs_find_data *fd)
+{
+	int err = 0;
+	__be32 found_cnid, record_type;
+
+	hfs_bnode_read(fd->bnode, &found_cnid,
+			fd->keyoffset +
+			offsetof(struct hfsplus_attr_key, cnid),
+			sizeof(__be32));
+	if (cnid != be32_to_cpu(found_cnid))
+		return -ENOENT;
+
+	hfs_bnode_read(fd->bnode, &record_type,
+			fd->entryoffset, sizeof(record_type));
+
+	switch (be32_to_cpu(record_type)) {
+	case HFSPLUS_ATTR_INLINE_DATA:
+		/* All is OK. Do nothing. */
+		break;
+	case HFSPLUS_ATTR_FORK_DATA:
+	case HFSPLUS_ATTR_EXTENTS:
+		pr_err("only inline data xattr are supported\n");
+		return -EOPNOTSUPP;
+	default:
+		pr_err("invalid extended attribute record\n");
+		return -ENOENT;
+	}
+
+	err = hfs_brec_remove(fd);
+	if (err)
+		return err;
+
+	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY);
+	return err;
+}
+
+int hfsplus_delete_attr(struct inode *inode, const char *name)
+{
+	int err = 0;
+	struct super_block *sb = inode->i_sb;
+	struct hfs_find_data fd;
+
+	hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n",
+		name ? name : NULL, inode->i_ino);
+
+	if (!HFSPLUS_SB(sb)->attr_tree) {
+		pr_err("attributes file doesn't exist\n");
+		return -EINVAL;
+	}
+
+	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+	if (err)
+		return err;
+
+	if (name) {
+		err = hfsplus_attr_build_key(sb, fd.search_key,
+						inode->i_ino, name);
+		if (err)
+			goto out;
+	} else {
+		pr_err("invalid extended attribute name\n");
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err)
+		goto out;
+
+	err = __hfsplus_delete_attr(inode, inode->i_ino, &fd);
+	if (err)
+		goto out;
+
+out:
+	hfs_find_exit(&fd);
+	return err;
+}
+
+int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid)
+{
+	int err = 0;
+	struct hfs_find_data fd;
+
+	hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid);
+
+	if (!HFSPLUS_SB(dir->i_sb)->attr_tree) {
+		pr_err("attributes file doesn't exist\n");
+		return -EINVAL;
+	}
+
+	err = hfs_find_init(HFSPLUS_SB(dir->i_sb)->attr_tree, &fd);
+	if (err)
+		return err;
+
+	for (;;) {
+		err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd);
+		if (err) {
+			if (err != -ENOENT)
+				pr_err("xattr search failed\n");
+			goto end_delete_all;
+		}
+
+		err = __hfsplus_delete_attr(dir, cnid, &fd);
+		if (err)
+			goto end_delete_all;
+	}
+
+end_delete_all:
+	hfs_find_exit(&fd);
+	return err;
+}
diff --git a/kernel/fs/hfsplus/bfind.c b/kernel/fs/hfsplus/bfind.c
new file mode 100644
index 000000000..528e38b5a
--- /dev/null
+++ b/kernel/fs/hfsplus/bfind.c
@@ -0,0 +1,293 @@
+/*
+ *  linux/fs/hfsplus/bfind.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Search routines for btrees
+ */
+
+#include <linux/slab.h>
+#include "hfsplus_fs.h"
+
+int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
+{
+	void *ptr;
+
+	fd->tree = tree;
+	fd->bnode = NULL;
+	ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+	fd->search_key = ptr;
+	fd->key = ptr + tree->max_key_len + 2;
+	hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+		tree->cnid, __builtin_return_address(0));
+	switch (tree->cnid) {
+	case HFSPLUS_CAT_CNID:
+		mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
+		break;
+	case HFSPLUS_EXT_CNID:
+		mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
+		break;
+	case HFSPLUS_ATTR_CNID:
+		mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
+		break;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+void hfs_find_exit(struct hfs_find_data *fd)
+{
+	hfs_bnode_put(fd->bnode);
+	kfree(fd->search_key);
+	hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
+		fd->tree->cnid, __builtin_return_address(0));
+	mutex_unlock(&fd->tree->tree_lock);
+	fd->tree = NULL;
+}
+
+int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode,
+				struct hfs_find_data *fd,
+				int *begin,
+				int *end,
+				int *cur_rec)
+{
+	__be32 cur_cnid;
+	__be32 search_cnid;
+
+	if (bnode->tree->cnid == HFSPLUS_EXT_CNID) {
+		cur_cnid = fd->key->ext.cnid;
+		search_cnid = fd->search_key->ext.cnid;
+	} else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) {
+		cur_cnid = fd->key->cat.parent;
+		search_cnid = fd->search_key->cat.parent;
+	} else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) {
+		cur_cnid = fd->key->attr.cnid;
+		search_cnid = fd->search_key->attr.cnid;
+	} else {
+		cur_cnid = 0;	/* used-uninitialized warning */
+		search_cnid = 0;
+		BUG();
+	}
+
+	if (cur_cnid == search_cnid) {
+		(*end) = (*cur_rec);
+		if ((*begin) == (*end))
+			return 1;
+	} else {
+		if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid))
+			(*begin) = (*cur_rec) + 1;
+		else
+			(*end) = (*cur_rec) - 1;
+	}
+
+	return 0;
+}
+
+int hfs_find_rec_by_key(struct hfs_bnode *bnode,
+				struct hfs_find_data *fd,
+				int *begin,
+				int *end,
+				int *cur_rec)
+{
+	int cmpval;
+
+	cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
+	if (!cmpval) {
+		(*end) = (*cur_rec);
+		return 1;
+	}
+	if (cmpval < 0)
+		(*begin) = (*cur_rec) + 1;
+	else
+		*(end) = (*cur_rec) - 1;
+
+	return 0;
+}
+
+/* Find the record in bnode that best matches key (not greater than...)*/
+int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+					search_strategy_t rec_found)
+{
+	u16 off, len, keylen;
+	int rec;
+	int b, e;
+	int res;
+
+	BUG_ON(!rec_found);
+	b = 0;
+	e = bnode->num_recs - 1;
+	res = -ENOENT;
+	do {
+		rec = (e + b) / 2;
+		len = hfs_brec_lenoff(bnode, rec, &off);
+		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
+		hfs_bnode_read(bnode, fd->key, off, keylen);
+		if (rec_found(bnode, fd, &b, &e, &rec)) {
+			res = 0;
+			goto done;
+		}
+	} while (b <= e);
+
+	if (rec != e && e >= 0) {
+		len = hfs_brec_lenoff(bnode, e, &off);
+		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
+		hfs_bnode_read(bnode, fd->key, off, keylen);
+	}
+
+done:
+	fd->record = e;
+	fd->keyoffset = off;
+	fd->keylength = keylen;
+	fd->entryoffset = off + keylen;
+	fd->entrylength = len - keylen;
+
+fail:
+	return res;
+}
+
+/* Traverse a B*Tree from the root to a leaf finding best fit to key */
+/* Return allocated copy of node found, set recnum to best record */
+int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *bnode;
+	u32 nidx, parent;
+	__be32 data;
+	int height, res;
+
+	tree = fd->tree;
+	if (fd->bnode)
+		hfs_bnode_put(fd->bnode);
+	fd->bnode = NULL;
+	nidx = tree->root;
+	if (!nidx)
+		return -ENOENT;
+	height = tree->depth;
+	res = 0;
+	parent = 0;
+	for (;;) {
+		bnode = hfs_bnode_find(tree, nidx);
+		if (IS_ERR(bnode)) {
+			res = PTR_ERR(bnode);
+			bnode = NULL;
+			break;
+		}
+		if (bnode->height != height)
+			goto invalid;
+		if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF))
+			goto invalid;
+		bnode->parent = parent;
+
+		res = __hfs_brec_find(bnode, fd, do_key_compare);
+		if (!height)
+			break;
+		if (fd->record < 0)
+			goto release;
+
+		parent = nidx;
+		hfs_bnode_read(bnode, &data, fd->entryoffset, 4);
+		nidx = be32_to_cpu(data);
+		hfs_bnode_put(bnode);
+	}
+	fd->bnode = bnode;
+	return res;
+
+invalid:
+	pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
+		height, bnode->height, bnode->type, nidx, parent);
+	res = -EIO;
+release:
+	hfs_bnode_put(bnode);
+	return res;
+}
+
+int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len)
+{
+	int res;
+
+	res = hfs_brec_find(fd, hfs_find_rec_by_key);
+	if (res)
+		return res;
+	if (fd->entrylength > rec_len)
+		return -EINVAL;
+	hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength);
+	return 0;
+}
+
+int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *bnode;
+	int idx, res = 0;
+	u16 off, len, keylen;
+
+	bnode = fd->bnode;
+	tree = bnode->tree;
+
+	if (cnt < 0) {
+		cnt = -cnt;
+		while (cnt > fd->record) {
+			cnt -= fd->record + 1;
+			fd->record = bnode->num_recs - 1;
+			idx = bnode->prev;
+			if (!idx) {
+				res = -ENOENT;
+				goto out;
+			}
+			hfs_bnode_put(bnode);
+			bnode = hfs_bnode_find(tree, idx);
+			if (IS_ERR(bnode)) {
+				res = PTR_ERR(bnode);
+				bnode = NULL;
+				goto out;
+			}
+		}
+		fd->record -= cnt;
+	} else {
+		while (cnt >= bnode->num_recs - fd->record) {
+			cnt -= bnode->num_recs - fd->record;
+			fd->record = 0;
+			idx = bnode->next;
+			if (!idx) {
+				res = -ENOENT;
+				goto out;
+			}
+			hfs_bnode_put(bnode);
+			bnode = hfs_bnode_find(tree, idx);
+			if (IS_ERR(bnode)) {
+				res = PTR_ERR(bnode);
+				bnode = NULL;
+				goto out;
+			}
+		}
+		fd->record += cnt;
+	}
+
+	len = hfs_brec_lenoff(bnode, fd->record, &off);
+	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == 0) {
+		res = -EINVAL;
+		goto out;
+	}
+	fd->keyoffset = off;
+	fd->keylength = keylen;
+	fd->entryoffset = off + keylen;
+	fd->entrylength = len - keylen;
+	hfs_bnode_read(bnode, fd->key, off, keylen);
+out:
+	fd->bnode = bnode;
+	return res;
+}
diff --git a/kernel/fs/hfsplus/bitmap.c b/kernel/fs/hfsplus/bitmap.c
new file mode 100644
index 000000000..d29544515
--- /dev/null
+++ b/kernel/fs/hfsplus/bitmap.c
@@ -0,0 +1,245 @@
+/*
+ *  linux/fs/hfsplus/bitmap.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handling of allocation file
+ */
+
+#include <linux/pagemap.h>
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+#define PAGE_CACHE_BITS	(PAGE_CACHE_SIZE * 8)
+
+int hfsplus_block_allocate(struct super_block *sb, u32 size,
+		u32 offset, u32 *max)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct page *page;
+	struct address_space *mapping;
+	__be32 *pptr, *curr, *end;
+	u32 mask, start, len, n;
+	__be32 val;
+	int i;
+
+	len = *max;
+	if (!len)
+		return size;
+
+	hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
+	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
+	if (IS_ERR(page)) {
+		start = size;
+		goto out;
+	}
+	pptr = kmap(page);
+	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
+	i = offset % 32;
+	offset &= ~(PAGE_CACHE_BITS - 1);
+	if ((size ^ offset) / PAGE_CACHE_BITS)
+		end = pptr + PAGE_CACHE_BITS / 32;
+	else
+		end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32;
+
+	/* scan the first partial u32 for zero bits */
+	val = *curr;
+	if (~val) {
+		n = be32_to_cpu(val);
+		mask = (1U << 31) >> i;
+		for (; i < 32; mask >>= 1, i++) {
+			if (!(n & mask))
+				goto found;
+		}
+	}
+	curr++;
+
+	/* scan complete u32s for the first zero bit */
+	while (1) {
+		while (curr < end) {
+			val = *curr;
+			if (~val) {
+				n = be32_to_cpu(val);
+				mask = 1 << 31;
+				for (i = 0; i < 32; mask >>= 1, i++) {
+					if (!(n & mask))
+						goto found;
+				}
+			}
+			curr++;
+		}
+		kunmap(page);
+		offset += PAGE_CACHE_BITS;
+		if (offset >= size)
+			break;
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
+		if (IS_ERR(page)) {
+			start = size;
+			goto out;
+		}
+		curr = pptr = kmap(page);
+		if ((size ^ offset) / PAGE_CACHE_BITS)
+			end = pptr + PAGE_CACHE_BITS / 32;
+		else
+			end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32;
+	}
+	hfs_dbg(BITMAP, "bitmap full\n");
+	start = size;
+	goto out;
+
+found:
+	start = offset + (curr - pptr) * 32 + i;
+	if (start >= size) {
+		hfs_dbg(BITMAP, "bitmap full\n");
+		goto out;
+	}
+	/* do any partial u32 at the start */
+	len = min(size - start, len);
+	while (1) {
+		n |= mask;
+		if (++i >= 32)
+			break;
+		mask >>= 1;
+		if (!--len || n & mask)
+			goto done;
+	}
+	if (!--len)
+		goto done;
+	*curr++ = cpu_to_be32(n);
+	/* do full u32s */
+	while (1) {
+		while (curr < end) {
+			n = be32_to_cpu(*curr);
+			if (len < 32)
+				goto last;
+			if (n) {
+				len = 32;
+				goto last;
+			}
+			*curr++ = cpu_to_be32(0xffffffff);
+			len -= 32;
+		}
+		set_page_dirty(page);
+		kunmap(page);
+		offset += PAGE_CACHE_BITS;
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
+		if (IS_ERR(page)) {
+			start = size;
+			goto out;
+		}
+		pptr = kmap(page);
+		curr = pptr;
+		end = pptr + PAGE_CACHE_BITS / 32;
+	}
+last:
+	/* do any partial u32 at end */
+	mask = 1U << 31;
+	for (i = 0; i < len; i++) {
+		if (n & mask)
+			break;
+		n |= mask;
+		mask >>= 1;
+	}
+done:
+	*curr = cpu_to_be32(n);
+	set_page_dirty(page);
+	kunmap(page);
+	*max = offset + (curr - pptr) * 32 + i - start;
+	sbi->free_blocks -= *max;
+	hfsplus_mark_mdb_dirty(sb);
+	hfs_dbg(BITMAP, "-> %u,%u\n", start, *max);
+out:
+	mutex_unlock(&sbi->alloc_mutex);
+	return start;
+}
+
+int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct page *page;
+	struct address_space *mapping;
+	__be32 *pptr, *curr, *end;
+	u32 mask, len, pnr;
+	int i;
+
+	/* is there any actual work to be done? */
+	if (!count)
+		return 0;
+
+	hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count);
+	/* are all of the bits in range? */
+	if ((offset + count) > sbi->total_blocks)
+		return -ENOENT;
+
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
+	pnr = offset / PAGE_CACHE_BITS;
+	page = read_mapping_page(mapping, pnr, NULL);
+	if (IS_ERR(page))
+		goto kaboom;
+	pptr = kmap(page);
+	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
+	end = pptr + PAGE_CACHE_BITS / 32;
+	len = count;
+
+	/* do any partial u32 at the start */
+	i = offset % 32;
+	if (i) {
+		int j = 32 - i;
+		mask = 0xffffffffU << j;
+		if (j > count) {
+			mask |= 0xffffffffU >> (i + count);
+			*curr++ &= cpu_to_be32(mask);
+			goto out;
+		}
+		*curr++ &= cpu_to_be32(mask);
+		count -= j;
+	}
+
+	/* do full u32s */
+	while (1) {
+		while (curr < end) {
+			if (count < 32)
+				goto done;
+			*curr++ = 0;
+			count -= 32;
+		}
+		if (!count)
+			break;
+		set_page_dirty(page);
+		kunmap(page);
+		page = read_mapping_page(mapping, ++pnr, NULL);
+		if (IS_ERR(page))
+			goto kaboom;
+		pptr = kmap(page);
+		curr = pptr;
+		end = pptr + PAGE_CACHE_BITS / 32;
+	}
+done:
+	/* do any partial u32 at end */
+	if (count) {
+		mask = 0xffffffffU >> count;
+		*curr &= cpu_to_be32(mask);
+	}
+out:
+	set_page_dirty(page);
+	kunmap(page);
+	sbi->free_blocks += len;
+	hfsplus_mark_mdb_dirty(sb);
+	mutex_unlock(&sbi->alloc_mutex);
+
+	return 0;
+
+kaboom:
+	pr_crit("unable to mark blocks free: error %ld\n", PTR_ERR(page));
+	mutex_unlock(&sbi->alloc_mutex);
+
+	return -EIO;
+}
diff --git a/kernel/fs/hfsplus/bnode.c b/kernel/fs/hfsplus/bnode.c
new file mode 100644
index 000000000..759708fd9
--- /dev/null
+++ b/kernel/fs/hfsplus/bnode.c
@@ -0,0 +1,671 @@
+/*
+ *  linux/fs/hfsplus/bnode.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handle basic btree node operations
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+/* Copy a specified range of bytes from the raw data of a node */
+void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
+{
+	struct page **pagep;
+	int l;
+
+	off += node->page_offset;
+	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+	off &= ~PAGE_CACHE_MASK;
+
+	l = min_t(int, len, PAGE_CACHE_SIZE - off);
+	memcpy(buf, kmap(*pagep) + off, l);
+	kunmap(*pagep);
+
+	while ((len -= l) != 0) {
+		buf += l;
+		l = min_t(int, len, PAGE_CACHE_SIZE);
+		memcpy(buf, kmap(*++pagep), l);
+		kunmap(*pagep);
+	}
+}
+
+u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
+{
+	__be16 data;
+	/* TODO: optimize later... */
+	hfs_bnode_read(node, &data, off, 2);
+	return be16_to_cpu(data);
+}
+
+u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
+{
+	u8 data;
+	/* TODO: optimize later... */
+	hfs_bnode_read(node, &data, off, 1);
+	return data;
+}
+
+void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
+{
+	struct hfs_btree *tree;
+	int key_len;
+
+	tree = node->tree;
+	if (node->type == HFS_NODE_LEAF ||
+	    tree->attributes & HFS_TREE_VARIDXKEYS ||
+	    node->tree->cnid == HFSPLUS_ATTR_CNID)
+		key_len = hfs_bnode_read_u16(node, off) + 2;
+	else
+		key_len = tree->max_key_len + 2;
+
+	hfs_bnode_read(node, key, off, key_len);
+}
+
+void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
+{
+	struct page **pagep;
+	int l;
+
+	off += node->page_offset;
+	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+	off &= ~PAGE_CACHE_MASK;
+
+	l = min_t(int, len, PAGE_CACHE_SIZE - off);
+	memcpy(kmap(*pagep) + off, buf, l);
+	set_page_dirty(*pagep);
+	kunmap(*pagep);
+
+	while ((len -= l) != 0) {
+		buf += l;
+		l = min_t(int, len, PAGE_CACHE_SIZE);
+		memcpy(kmap(*++pagep), buf, l);
+		set_page_dirty(*pagep);
+		kunmap(*pagep);
+	}
+}
+
+void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
+{
+	__be16 v = cpu_to_be16(data);
+	/* TODO: optimize later... */
+	hfs_bnode_write(node, &v, off, 2);
+}
+
+void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
+{
+	struct page **pagep;
+	int l;
+
+	off += node->page_offset;
+	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+	off &= ~PAGE_CACHE_MASK;
+
+	l = min_t(int, len, PAGE_CACHE_SIZE - off);
+	memset(kmap(*pagep) + off, 0, l);
+	set_page_dirty(*pagep);
+	kunmap(*pagep);
+
+	while ((len -= l) != 0) {
+		l = min_t(int, len, PAGE_CACHE_SIZE);
+		memset(kmap(*++pagep), 0, l);
+		set_page_dirty(*pagep);
+		kunmap(*pagep);
+	}
+}
+
+void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
+		    struct hfs_bnode *src_node, int src, int len)
+{
+	struct hfs_btree *tree;
+	struct page **src_page, **dst_page;
+	int l;
+
+	hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+	if (!len)
+		return;
+	tree = src_node->tree;
+	src += src_node->page_offset;
+	dst += dst_node->page_offset;
+	src_page = src_node->page + (src >> PAGE_CACHE_SHIFT);
+	src &= ~PAGE_CACHE_MASK;
+	dst_page = dst_node->page + (dst >> PAGE_CACHE_SHIFT);
+	dst &= ~PAGE_CACHE_MASK;
+
+	if (src == dst) {
+		l = min_t(int, len, PAGE_CACHE_SIZE - src);
+		memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l);
+		kunmap(*src_page);
+		set_page_dirty(*dst_page);
+		kunmap(*dst_page);
+
+		while ((len -= l) != 0) {
+			l = min_t(int, len, PAGE_CACHE_SIZE);
+			memcpy(kmap(*++dst_page), kmap(*++src_page), l);
+			kunmap(*src_page);
+			set_page_dirty(*dst_page);
+			kunmap(*dst_page);
+		}
+	} else {
+		void *src_ptr, *dst_ptr;
+
+		do {
+			src_ptr = kmap(*src_page) + src;
+			dst_ptr = kmap(*dst_page) + dst;
+			if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
+				l = PAGE_CACHE_SIZE - src;
+				src = 0;
+				dst += l;
+			} else {
+				l = PAGE_CACHE_SIZE - dst;
+				src += l;
+				dst = 0;
+			}
+			l = min(len, l);
+			memcpy(dst_ptr, src_ptr, l);
+			kunmap(*src_page);
+			set_page_dirty(*dst_page);
+			kunmap(*dst_page);
+			if (!dst)
+				dst_page++;
+			else
+				src_page++;
+		} while ((len -= l));
+	}
+}
+
+void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
+{
+	struct page **src_page, **dst_page;
+	int l;
+
+	hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+	if (!len)
+		return;
+	src += node->page_offset;
+	dst += node->page_offset;
+	if (dst > src) {
+		src += len - 1;
+		src_page = node->page + (src >> PAGE_CACHE_SHIFT);
+		src = (src & ~PAGE_CACHE_MASK) + 1;
+		dst += len - 1;
+		dst_page = node->page + (dst >> PAGE_CACHE_SHIFT);
+		dst = (dst & ~PAGE_CACHE_MASK) + 1;
+
+		if (src == dst) {
+			while (src < len) {
+				memmove(kmap(*dst_page), kmap(*src_page), src);
+				kunmap(*src_page);
+				set_page_dirty(*dst_page);
+				kunmap(*dst_page);
+				len -= src;
+				src = PAGE_CACHE_SIZE;
+				src_page--;
+				dst_page--;
+			}
+			src -= len;
+			memmove(kmap(*dst_page) + src,
+				kmap(*src_page) + src, len);
+			kunmap(*src_page);
+			set_page_dirty(*dst_page);
+			kunmap(*dst_page);
+		} else {
+			void *src_ptr, *dst_ptr;
+
+			do {
+				src_ptr = kmap(*src_page) + src;
+				dst_ptr = kmap(*dst_page) + dst;
+				if (src < dst) {
+					l = src;
+					src = PAGE_CACHE_SIZE;
+					dst -= l;
+				} else {
+					l = dst;
+					src -= l;
+					dst = PAGE_CACHE_SIZE;
+				}
+				l = min(len, l);
+				memmove(dst_ptr - l, src_ptr - l, l);
+				kunmap(*src_page);
+				set_page_dirty(*dst_page);
+				kunmap(*dst_page);
+				if (dst == PAGE_CACHE_SIZE)
+					dst_page--;
+				else
+					src_page--;
+			} while ((len -= l));
+		}
+	} else {
+		src_page = node->page + (src >> PAGE_CACHE_SHIFT);
+		src &= ~PAGE_CACHE_MASK;
+		dst_page = node->page + (dst >> PAGE_CACHE_SHIFT);
+		dst &= ~PAGE_CACHE_MASK;
+
+		if (src == dst) {
+			l = min_t(int, len, PAGE_CACHE_SIZE - src);
+			memmove(kmap(*dst_page) + src,
+				kmap(*src_page) + src, l);
+			kunmap(*src_page);
+			set_page_dirty(*dst_page);
+			kunmap(*dst_page);
+
+			while ((len -= l) != 0) {
+				l = min_t(int, len, PAGE_CACHE_SIZE);
+				memmove(kmap(*++dst_page),
+					kmap(*++src_page), l);
+				kunmap(*src_page);
+				set_page_dirty(*dst_page);
+				kunmap(*dst_page);
+			}
+		} else {
+			void *src_ptr, *dst_ptr;
+
+			do {
+				src_ptr = kmap(*src_page) + src;
+				dst_ptr = kmap(*dst_page) + dst;
+				if (PAGE_CACHE_SIZE - src <
+						PAGE_CACHE_SIZE - dst) {
+					l = PAGE_CACHE_SIZE - src;
+					src = 0;
+					dst += l;
+				} else {
+					l = PAGE_CACHE_SIZE - dst;
+					src += l;
+					dst = 0;
+				}
+				l = min(len, l);
+				memmove(dst_ptr, src_ptr, l);
+				kunmap(*src_page);
+				set_page_dirty(*dst_page);
+				kunmap(*dst_page);
+				if (!dst)
+					dst_page++;
+				else
+					src_page++;
+			} while ((len -= l));
+		}
+	}
+}
+
+void hfs_bnode_dump(struct hfs_bnode *node)
+{
+	struct hfs_bnode_desc desc;
+	__be32 cnid;
+	int i, off, key_off;
+
+	hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
+	hfs_bnode_read(node, &desc, 0, sizeof(desc));
+	hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
+		be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
+		desc.type, desc.height, be16_to_cpu(desc.num_recs));
+
+	off = node->tree->node_size - 2;
+	for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
+		key_off = hfs_bnode_read_u16(node, off);
+		hfs_dbg(BNODE_MOD, " %d", key_off);
+		if (i && node->type == HFS_NODE_INDEX) {
+			int tmp;
+
+			if (node->tree->attributes & HFS_TREE_VARIDXKEYS ||
+					node->tree->cnid == HFSPLUS_ATTR_CNID)
+				tmp = hfs_bnode_read_u16(node, key_off) + 2;
+			else
+				tmp = node->tree->max_key_len + 2;
+			hfs_dbg_cont(BNODE_MOD, " (%d", tmp);
+			hfs_bnode_read(node, &cnid, key_off + tmp, 4);
+			hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+		} else if (i && node->type == HFS_NODE_LEAF) {
+			int tmp;
+
+			tmp = hfs_bnode_read_u16(node, key_off);
+			hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
+		}
+	}
+	hfs_dbg_cont(BNODE_MOD, "\n");
+}
+
+void hfs_bnode_unlink(struct hfs_bnode *node)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *tmp;
+	__be32 cnid;
+
+	tree = node->tree;
+	if (node->prev) {
+		tmp = hfs_bnode_find(tree, node->prev);
+		if (IS_ERR(tmp))
+			return;
+		tmp->next = node->next;
+		cnid = cpu_to_be32(tmp->next);
+		hfs_bnode_write(tmp, &cnid,
+			offsetof(struct hfs_bnode_desc, next), 4);
+		hfs_bnode_put(tmp);
+	} else if (node->type == HFS_NODE_LEAF)
+		tree->leaf_head = node->next;
+
+	if (node->next) {
+		tmp = hfs_bnode_find(tree, node->next);
+		if (IS_ERR(tmp))
+			return;
+		tmp->prev = node->prev;
+		cnid = cpu_to_be32(tmp->prev);
+		hfs_bnode_write(tmp, &cnid,
+			offsetof(struct hfs_bnode_desc, prev), 4);
+		hfs_bnode_put(tmp);
+	} else if (node->type == HFS_NODE_LEAF)
+		tree->leaf_tail = node->prev;
+
+	/* move down? */
+	if (!node->prev && !node->next)
+		hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n");
+	if (!node->parent) {
+		tree->root = 0;
+		tree->depth = 0;
+	}
+	set_bit(HFS_BNODE_DELETED, &node->flags);
+}
+
+static inline int hfs_bnode_hash(u32 num)
+{
+	num = (num >> 16) + num;
+	num += num >> 8;
+	return num & (NODE_HASH_SIZE - 1);
+}
+
+struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
+{
+	struct hfs_bnode *node;
+
+	if (cnid >= tree->node_count) {
+		pr_err("request for non-existent node %d in B*Tree\n",
+		       cnid);
+		return NULL;
+	}
+
+	for (node = tree->node_hash[hfs_bnode_hash(cnid)];
+			node; node = node->next_hash)
+		if (node->this == cnid)
+			return node;
+	return NULL;
+}
+
+static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
+{
+	struct super_block *sb;
+	struct hfs_bnode *node, *node2;
+	struct address_space *mapping;
+	struct page *page;
+	int size, block, i, hash;
+	loff_t off;
+
+	if (cnid >= tree->node_count) {
+		pr_err("request for non-existent node %d in B*Tree\n",
+		       cnid);
+		return NULL;
+	}
+
+	sb = tree->inode->i_sb;
+	size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
+		sizeof(struct page *);
+	node = kzalloc(size, GFP_KERNEL);
+	if (!node)
+		return NULL;
+	node->tree = tree;
+	node->this = cnid;
+	set_bit(HFS_BNODE_NEW, &node->flags);
+	atomic_set(&node->refcnt, 1);
+	hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+		node->tree->cnid, node->this);
+	init_waitqueue_head(&node->lock_wq);
+	spin_lock(&tree->hash_lock);
+	node2 = hfs_bnode_findhash(tree, cnid);
+	if (!node2) {
+		hash = hfs_bnode_hash(cnid);
+		node->next_hash = tree->node_hash[hash];
+		tree->node_hash[hash] = node;
+		tree->node_hash_cnt++;
+	} else {
+		spin_unlock(&tree->hash_lock);
+		kfree(node);
+		wait_event(node2->lock_wq,
+			!test_bit(HFS_BNODE_NEW, &node2->flags));
+		return node2;
+	}
+	spin_unlock(&tree->hash_lock);
+
+	mapping = tree->inode->i_mapping;
+	off = (loff_t)cnid << tree->node_size_shift;
+	block = off >> PAGE_CACHE_SHIFT;
+	node->page_offset = off & ~PAGE_CACHE_MASK;
+	for (i = 0; i < tree->pages_per_bnode; block++, i++) {
+		page = read_mapping_page(mapping, block, NULL);
+		if (IS_ERR(page))
+			goto fail;
+		if (PageError(page)) {
+			page_cache_release(page);
+			goto fail;
+		}
+		page_cache_release(page);
+		node->page[i] = page;
+	}
+
+	return node;
+fail:
+	set_bit(HFS_BNODE_ERROR, &node->flags);
+	return node;
+}
+
+void hfs_bnode_unhash(struct hfs_bnode *node)
+{
+	struct hfs_bnode **p;
+
+	hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
+		node->tree->cnid, node->this, atomic_read(&node->refcnt));
+	for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
+	     *p && *p != node; p = &(*p)->next_hash)
+		;
+	BUG_ON(!*p);
+	*p = node->next_hash;
+	node->tree->node_hash_cnt--;
+}
+
+/* Load a particular node out of a tree */
+struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
+{
+	struct hfs_bnode *node;
+	struct hfs_bnode_desc *desc;
+	int i, rec_off, off, next_off;
+	int entry_size, key_size;
+
+	spin_lock(&tree->hash_lock);
+	node = hfs_bnode_findhash(tree, num);
+	if (node) {
+		hfs_bnode_get(node);
+		spin_unlock(&tree->hash_lock);
+		wait_event(node->lock_wq,
+			!test_bit(HFS_BNODE_NEW, &node->flags));
+		if (test_bit(HFS_BNODE_ERROR, &node->flags))
+			goto node_error;
+		return node;
+	}
+	spin_unlock(&tree->hash_lock);
+	node = __hfs_bnode_create(tree, num);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+	if (test_bit(HFS_BNODE_ERROR, &node->flags))
+		goto node_error;
+	if (!test_bit(HFS_BNODE_NEW, &node->flags))
+		return node;
+
+	desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
+			node->page_offset);
+	node->prev = be32_to_cpu(desc->prev);
+	node->next = be32_to_cpu(desc->next);
+	node->num_recs = be16_to_cpu(desc->num_recs);
+	node->type = desc->type;
+	node->height = desc->height;
+	kunmap(node->page[0]);
+
+	switch (node->type) {
+	case HFS_NODE_HEADER:
+	case HFS_NODE_MAP:
+		if (node->height != 0)
+			goto node_error;
+		break;
+	case HFS_NODE_LEAF:
+		if (node->height != 1)
+			goto node_error;
+		break;
+	case HFS_NODE_INDEX:
+		if (node->height <= 1 || node->height > tree->depth)
+			goto node_error;
+		break;
+	default:
+		goto node_error;
+	}
+
+	rec_off = tree->node_size - 2;
+	off = hfs_bnode_read_u16(node, rec_off);
+	if (off != sizeof(struct hfs_bnode_desc))
+		goto node_error;
+	for (i = 1; i <= node->num_recs; off = next_off, i++) {
+		rec_off -= 2;
+		next_off = hfs_bnode_read_u16(node, rec_off);
+		if (next_off <= off ||
+		    next_off > tree->node_size ||
+		    next_off & 1)
+			goto node_error;
+		entry_size = next_off - off;
+		if (node->type != HFS_NODE_INDEX &&
+		    node->type != HFS_NODE_LEAF)
+			continue;
+		key_size = hfs_bnode_read_u16(node, off) + 2;
+		if (key_size >= entry_size || key_size & 1)
+			goto node_error;
+	}
+	clear_bit(HFS_BNODE_NEW, &node->flags);
+	wake_up(&node->lock_wq);
+	return node;
+
+node_error:
+	set_bit(HFS_BNODE_ERROR, &node->flags);
+	clear_bit(HFS_BNODE_NEW, &node->flags);
+	wake_up(&node->lock_wq);
+	hfs_bnode_put(node);
+	return ERR_PTR(-EIO);
+}
+
+void hfs_bnode_free(struct hfs_bnode *node)
+{
+#if 0
+	int i;
+
+	for (i = 0; i < node->tree->pages_per_bnode; i++)
+		if (node->page[i])
+			page_cache_release(node->page[i]);
+#endif
+	kfree(node);
+}
+
+struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
+{
+	struct hfs_bnode *node;
+	struct page **pagep;
+	int i;
+
+	spin_lock(&tree->hash_lock);
+	node = hfs_bnode_findhash(tree, num);
+	spin_unlock(&tree->hash_lock);
+	if (node) {
+		pr_crit("new node %u already hashed?\n", num);
+		WARN_ON(1);
+		return node;
+	}
+	node = __hfs_bnode_create(tree, num);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+	if (test_bit(HFS_BNODE_ERROR, &node->flags)) {
+		hfs_bnode_put(node);
+		return ERR_PTR(-EIO);
+	}
+
+	pagep = node->page;
+	memset(kmap(*pagep) + node->page_offset, 0,
+	       min_t(int, PAGE_CACHE_SIZE, tree->node_size));
+	set_page_dirty(*pagep);
+	kunmap(*pagep);
+	for (i = 1; i < tree->pages_per_bnode; i++) {
+		memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE);
+		set_page_dirty(*pagep);
+		kunmap(*pagep);
+	}
+	clear_bit(HFS_BNODE_NEW, &node->flags);
+	wake_up(&node->lock_wq);
+
+	return node;
+}
+
+void hfs_bnode_get(struct hfs_bnode *node)
+{
+	if (node) {
+		atomic_inc(&node->refcnt);
+		hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
+	}
+}
+
+/* Dispose of resources used by a node */
+void hfs_bnode_put(struct hfs_bnode *node)
+{
+	if (node) {
+		struct hfs_btree *tree = node->tree;
+		int i;
+
+		hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
+		BUG_ON(!atomic_read(&node->refcnt));
+		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
+			return;
+		for (i = 0; i < tree->pages_per_bnode; i++) {
+			if (!node->page[i])
+				continue;
+			mark_page_accessed(node->page[i]);
+		}
+
+		if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
+			hfs_bnode_unhash(node);
+			spin_unlock(&tree->hash_lock);
+			if (hfs_bnode_need_zeroout(tree))
+				hfs_bnode_clear(node, 0, tree->node_size);
+			hfs_bmap_free(node);
+			hfs_bnode_free(node);
+			return;
+		}
+		spin_unlock(&tree->hash_lock);
+	}
+}
+
+/*
+ * Unused nodes have to be zeroed if this is the catalog tree and
+ * a corresponding flag in the volume header is set.
+ */
+bool hfs_bnode_need_zeroout(struct hfs_btree *tree)
+{
+	struct super_block *sb = tree->inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes);
+
+	return tree->cnid == HFSPLUS_CAT_CNID &&
+		volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX;
+}
diff --git a/kernel/fs/hfsplus/brec.c b/kernel/fs/hfsplus/brec.c
new file mode 100644
index 000000000..754fdf8c6
--- /dev/null
+++ b/kernel/fs/hfsplus/brec.c
@@ -0,0 +1,527 @@
+/*
+ *  linux/fs/hfsplus/brec.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handle individual btree records
+ */
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd);
+static int hfs_brec_update_parent(struct hfs_find_data *fd);
+static int hfs_btree_inc_height(struct hfs_btree *);
+
+/* Get the length and offset of the given record in the given node */
+u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off)
+{
+	__be16 retval[2];
+	u16 dataoff;
+
+	dataoff = node->tree->node_size - (rec + 2) * 2;
+	hfs_bnode_read(node, retval, dataoff, 4);
+	*off = be16_to_cpu(retval[1]);
+	return be16_to_cpu(retval[0]) - *off;
+}
+
+/* Get the length of the key from a keyed record */
+u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
+{
+	u16 retval, recoff;
+
+	if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF)
+		return 0;
+
+	if ((node->type == HFS_NODE_INDEX) &&
+	   !(node->tree->attributes & HFS_TREE_VARIDXKEYS) &&
+	   (node->tree->cnid != HFSPLUS_ATTR_CNID)) {
+		retval = node->tree->max_key_len + 2;
+	} else {
+		recoff = hfs_bnode_read_u16(node,
+			node->tree->node_size - (rec + 1) * 2);
+		if (!recoff)
+			return 0;
+		if (recoff > node->tree->node_size - 2) {
+			pr_err("recoff %d too large\n", recoff);
+			return 0;
+		}
+
+		retval = hfs_bnode_read_u16(node, recoff) + 2;
+		if (retval > node->tree->max_key_len + 2) {
+			pr_err("keylen %d too large\n",
+				retval);
+			retval = 0;
+		}
+	}
+	return retval;
+}
+
+int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *node, *new_node;
+	int size, key_len, rec;
+	int data_off, end_off;
+	int idx_rec_off, data_rec_off, end_rec_off;
+	__be32 cnid;
+
+	tree = fd->tree;
+	if (!fd->bnode) {
+		if (!tree->root)
+			hfs_btree_inc_height(tree);
+		fd->bnode = hfs_bnode_find(tree, tree->leaf_head);
+		if (IS_ERR(fd->bnode))
+			return PTR_ERR(fd->bnode);
+		fd->record = -1;
+	}
+	new_node = NULL;
+	key_len = be16_to_cpu(fd->search_key->key_len) + 2;
+again:
+	/* new record idx and complete record size */
+	rec = fd->record + 1;
+	size = key_len + entry_len;
+
+	node = fd->bnode;
+	hfs_bnode_dump(node);
+	/* get last offset */
+	end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
+	end_off = hfs_bnode_read_u16(node, end_rec_off);
+	end_rec_off -= 2;
+	hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+		rec, size, end_off, end_rec_off);
+	if (size > end_rec_off - end_off) {
+		if (new_node)
+			panic("not enough room!\n");
+		new_node = hfs_bnode_split(fd);
+		if (IS_ERR(new_node))
+			return PTR_ERR(new_node);
+		goto again;
+	}
+	if (node->type == HFS_NODE_LEAF) {
+		tree->leaf_count++;
+		mark_inode_dirty(tree->inode);
+	}
+	node->num_recs++;
+	/* write new last offset */
+	hfs_bnode_write_u16(node,
+		offsetof(struct hfs_bnode_desc, num_recs),
+		node->num_recs);
+	hfs_bnode_write_u16(node, end_rec_off, end_off + size);
+	data_off = end_off;
+	data_rec_off = end_rec_off + 2;
+	idx_rec_off = tree->node_size - (rec + 1) * 2;
+	if (idx_rec_off == data_rec_off)
+		goto skip;
+	/* move all following entries */
+	do {
+		data_off = hfs_bnode_read_u16(node, data_rec_off + 2);
+		hfs_bnode_write_u16(node, data_rec_off, data_off + size);
+		data_rec_off += 2;
+	} while (data_rec_off < idx_rec_off);
+
+	/* move data away */
+	hfs_bnode_move(node, data_off + size, data_off,
+		       end_off - data_off);
+
+skip:
+	hfs_bnode_write(node, fd->search_key, data_off, key_len);
+	hfs_bnode_write(node, entry, data_off + key_len, entry_len);
+	hfs_bnode_dump(node);
+
+	/*
+	 * update parent key if we inserted a key
+	 * at the start of the node and it is not the new node
+	 */
+	if (!rec && new_node != node) {
+		hfs_bnode_read_key(node, fd->search_key, data_off + size);
+		hfs_brec_update_parent(fd);
+	}
+
+	if (new_node) {
+		hfs_bnode_put(fd->bnode);
+		if (!new_node->parent) {
+			hfs_btree_inc_height(tree);
+			new_node->parent = tree->root;
+		}
+		fd->bnode = hfs_bnode_find(tree, new_node->parent);
+
+		/* create index data entry */
+		cnid = cpu_to_be32(new_node->this);
+		entry = &cnid;
+		entry_len = sizeof(cnid);
+
+		/* get index key */
+		hfs_bnode_read_key(new_node, fd->search_key, 14);
+		__hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key);
+
+		hfs_bnode_put(new_node);
+		new_node = NULL;
+
+		if ((tree->attributes & HFS_TREE_VARIDXKEYS) ||
+				(tree->cnid == HFSPLUS_ATTR_CNID))
+			key_len = be16_to_cpu(fd->search_key->key_len) + 2;
+		else {
+			fd->search_key->key_len =
+				cpu_to_be16(tree->max_key_len);
+			key_len = tree->max_key_len + 2;
+		}
+		goto again;
+	}
+
+	return 0;
+}
+
+int hfs_brec_remove(struct hfs_find_data *fd)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *node, *parent;
+	int end_off, rec_off, data_off, size;
+
+	tree = fd->tree;
+	node = fd->bnode;
+again:
+	rec_off = tree->node_size - (fd->record + 2) * 2;
+	end_off = tree->node_size - (node->num_recs + 1) * 2;
+
+	if (node->type == HFS_NODE_LEAF) {
+		tree->leaf_count--;
+		mark_inode_dirty(tree->inode);
+	}
+	hfs_bnode_dump(node);
+	hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
+		fd->record, fd->keylength + fd->entrylength);
+	if (!--node->num_recs) {
+		hfs_bnode_unlink(node);
+		if (!node->parent)
+			return 0;
+		parent = hfs_bnode_find(tree, node->parent);
+		if (IS_ERR(parent))
+			return PTR_ERR(parent);
+		hfs_bnode_put(node);
+		node = fd->bnode = parent;
+
+		__hfs_brec_find(node, fd, hfs_find_rec_by_key);
+		goto again;
+	}
+	hfs_bnode_write_u16(node,
+		offsetof(struct hfs_bnode_desc, num_recs),
+		node->num_recs);
+
+	if (rec_off == end_off)
+		goto skip;
+	size = fd->keylength + fd->entrylength;
+
+	do {
+		data_off = hfs_bnode_read_u16(node, rec_off);
+		hfs_bnode_write_u16(node, rec_off + 2, data_off - size);
+		rec_off -= 2;
+	} while (rec_off >= end_off);
+
+	/* fill hole */
+	hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size,
+		       data_off - fd->keyoffset - size);
+skip:
+	hfs_bnode_dump(node);
+	if (!fd->record)
+		hfs_brec_update_parent(fd);
+	return 0;
+}
+
+static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *node, *new_node, *next_node;
+	struct hfs_bnode_desc node_desc;
+	int num_recs, new_rec_off, new_off, old_rec_off;
+	int data_start, data_end, size;
+
+	tree = fd->tree;
+	node = fd->bnode;
+	new_node = hfs_bmap_alloc(tree);
+	if (IS_ERR(new_node))
+		return new_node;
+	hfs_bnode_get(node);
+	hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
+		node->this, new_node->this, node->next);
+	new_node->next = node->next;
+	new_node->prev = node->this;
+	new_node->parent = node->parent;
+	new_node->type = node->type;
+	new_node->height = node->height;
+
+	if (node->next)
+		next_node = hfs_bnode_find(tree, node->next);
+	else
+		next_node = NULL;
+
+	if (IS_ERR(next_node)) {
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		return next_node;
+	}
+
+	size = tree->node_size / 2 - node->num_recs * 2 - 14;
+	old_rec_off = tree->node_size - 4;
+	num_recs = 1;
+	for (;;) {
+		data_start = hfs_bnode_read_u16(node, old_rec_off);
+		if (data_start > size)
+			break;
+		old_rec_off -= 2;
+		if (++num_recs < node->num_recs)
+			continue;
+		/* panic? */
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		if (next_node)
+			hfs_bnode_put(next_node);
+		return ERR_PTR(-ENOSPC);
+	}
+
+	if (fd->record + 1 < num_recs) {
+		/* new record is in the lower half,
+		 * so leave some more space there
+		 */
+		old_rec_off += 2;
+		num_recs--;
+		data_start = hfs_bnode_read_u16(node, old_rec_off);
+	} else {
+		hfs_bnode_put(node);
+		hfs_bnode_get(new_node);
+		fd->bnode = new_node;
+		fd->record -= num_recs;
+		fd->keyoffset -= data_start - 14;
+		fd->entryoffset -= data_start - 14;
+	}
+	new_node->num_recs = node->num_recs - num_recs;
+	node->num_recs = num_recs;
+
+	new_rec_off = tree->node_size - 2;
+	new_off = 14;
+	size = data_start - new_off;
+	num_recs = new_node->num_recs;
+	data_end = data_start;
+	while (num_recs) {
+		hfs_bnode_write_u16(new_node, new_rec_off, new_off);
+		old_rec_off -= 2;
+		new_rec_off -= 2;
+		data_end = hfs_bnode_read_u16(node, old_rec_off);
+		new_off = data_end - size;
+		num_recs--;
+	}
+	hfs_bnode_write_u16(new_node, new_rec_off, new_off);
+	hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start);
+
+	/* update new bnode header */
+	node_desc.next = cpu_to_be32(new_node->next);
+	node_desc.prev = cpu_to_be32(new_node->prev);
+	node_desc.type = new_node->type;
+	node_desc.height = new_node->height;
+	node_desc.num_recs = cpu_to_be16(new_node->num_recs);
+	node_desc.reserved = 0;
+	hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc));
+
+	/* update previous bnode header */
+	node->next = new_node->this;
+	hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc));
+	node_desc.next = cpu_to_be32(node->next);
+	node_desc.num_recs = cpu_to_be16(node->num_recs);
+	hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
+
+	/* update next bnode header */
+	if (next_node) {
+		next_node->prev = new_node->this;
+		hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
+		node_desc.prev = cpu_to_be32(next_node->prev);
+		hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc));
+		hfs_bnode_put(next_node);
+	} else if (node->this == tree->leaf_tail) {
+		/* if there is no next node, this might be the new tail */
+		tree->leaf_tail = new_node->this;
+		mark_inode_dirty(tree->inode);
+	}
+
+	hfs_bnode_dump(node);
+	hfs_bnode_dump(new_node);
+	hfs_bnode_put(node);
+
+	return new_node;
+}
+
+static int hfs_brec_update_parent(struct hfs_find_data *fd)
+{
+	struct hfs_btree *tree;
+	struct hfs_bnode *node, *new_node, *parent;
+	int newkeylen, diff;
+	int rec, rec_off, end_rec_off;
+	int start_off, end_off;
+
+	tree = fd->tree;
+	node = fd->bnode;
+	new_node = NULL;
+	if (!node->parent)
+		return 0;
+
+again:
+	parent = hfs_bnode_find(tree, node->parent);
+	if (IS_ERR(parent))
+		return PTR_ERR(parent);
+	__hfs_brec_find(parent, fd, hfs_find_rec_by_key);
+	if (fd->record < 0)
+		return -ENOENT;
+	hfs_bnode_dump(parent);
+	rec = fd->record;
+
+	/* size difference between old and new key */
+	if ((tree->attributes & HFS_TREE_VARIDXKEYS) ||
+				(tree->cnid == HFSPLUS_ATTR_CNID))
+		newkeylen = hfs_bnode_read_u16(node, 14) + 2;
+	else
+		fd->keylength = newkeylen = tree->max_key_len + 2;
+	hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
+		rec, fd->keylength, newkeylen);
+
+	rec_off = tree->node_size - (rec + 2) * 2;
+	end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
+	diff = newkeylen - fd->keylength;
+	if (!diff)
+		goto skip;
+	if (diff > 0) {
+		end_off = hfs_bnode_read_u16(parent, end_rec_off);
+		if (end_rec_off - end_off < diff) {
+
+			hfs_dbg(BNODE_MOD, "splitting index node\n");
+			fd->bnode = parent;
+			new_node = hfs_bnode_split(fd);
+			if (IS_ERR(new_node))
+				return PTR_ERR(new_node);
+			parent = fd->bnode;
+			rec = fd->record;
+			rec_off = tree->node_size - (rec + 2) * 2;
+			end_rec_off = tree->node_size -
+				(parent->num_recs + 1) * 2;
+		}
+	}
+
+	end_off = start_off = hfs_bnode_read_u16(parent, rec_off);
+	hfs_bnode_write_u16(parent, rec_off, start_off + diff);
+	start_off -= 4;	/* move previous cnid too */
+
+	while (rec_off > end_rec_off) {
+		rec_off -= 2;
+		end_off = hfs_bnode_read_u16(parent, rec_off);
+		hfs_bnode_write_u16(parent, rec_off, end_off + diff);
+	}
+	hfs_bnode_move(parent, start_off + diff, start_off,
+		       end_off - start_off);
+skip:
+	hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen);
+	hfs_bnode_dump(parent);
+
+	hfs_bnode_put(node);
+	node = parent;
+
+	if (new_node) {
+		__be32 cnid;
+
+		fd->bnode = hfs_bnode_find(tree, new_node->parent);
+		/* create index key and entry */
+		hfs_bnode_read_key(new_node, fd->search_key, 14);
+		cnid = cpu_to_be32(new_node->this);
+
+		__hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key);
+		hfs_brec_insert(fd, &cnid, sizeof(cnid));
+		hfs_bnode_put(fd->bnode);
+		hfs_bnode_put(new_node);
+
+		if (!rec) {
+			if (new_node == node)
+				goto out;
+			/* restore search_key */
+			hfs_bnode_read_key(node, fd->search_key, 14);
+		}
+	}
+
+	if (!rec && node->parent)
+		goto again;
+out:
+	fd->bnode = node;
+	return 0;
+}
+
+static int hfs_btree_inc_height(struct hfs_btree *tree)
+{
+	struct hfs_bnode *node, *new_node;
+	struct hfs_bnode_desc node_desc;
+	int key_size, rec;
+	__be32 cnid;
+
+	node = NULL;
+	if (tree->root) {
+		node = hfs_bnode_find(tree, tree->root);
+		if (IS_ERR(node))
+			return PTR_ERR(node);
+	}
+	new_node = hfs_bmap_alloc(tree);
+	if (IS_ERR(new_node)) {
+		hfs_bnode_put(node);
+		return PTR_ERR(new_node);
+	}
+
+	tree->root = new_node->this;
+	if (!tree->depth) {
+		tree->leaf_head = tree->leaf_tail = new_node->this;
+		new_node->type = HFS_NODE_LEAF;
+		new_node->num_recs = 0;
+	} else {
+		new_node->type = HFS_NODE_INDEX;
+		new_node->num_recs = 1;
+	}
+	new_node->parent = 0;
+	new_node->next = 0;
+	new_node->prev = 0;
+	new_node->height = ++tree->depth;
+
+	node_desc.next = cpu_to_be32(new_node->next);
+	node_desc.prev = cpu_to_be32(new_node->prev);
+	node_desc.type = new_node->type;
+	node_desc.height = new_node->height;
+	node_desc.num_recs = cpu_to_be16(new_node->num_recs);
+	node_desc.reserved = 0;
+	hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc));
+
+	rec = tree->node_size - 2;
+	hfs_bnode_write_u16(new_node, rec, 14);
+
+	if (node) {
+		/* insert old root idx into new root */
+		node->parent = tree->root;
+		if (node->type == HFS_NODE_LEAF ||
+				tree->attributes & HFS_TREE_VARIDXKEYS ||
+				tree->cnid == HFSPLUS_ATTR_CNID)
+			key_size = hfs_bnode_read_u16(node, 14) + 2;
+		else
+			key_size = tree->max_key_len + 2;
+		hfs_bnode_copy(new_node, 14, node, 14, key_size);
+
+		if (!(tree->attributes & HFS_TREE_VARIDXKEYS) &&
+				(tree->cnid != HFSPLUS_ATTR_CNID)) {
+			key_size = tree->max_key_len + 2;
+			hfs_bnode_write_u16(new_node, 14, tree->max_key_len);
+		}
+		cnid = cpu_to_be32(node->this);
+		hfs_bnode_write(new_node, &cnid, 14 + key_size, 4);
+
+		rec -= 2;
+		hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4);
+
+		hfs_bnode_put(node);
+	}
+	hfs_bnode_put(new_node);
+	mark_inode_dirty(tree->inode);
+
+	return 0;
+}
diff --git a/kernel/fs/hfsplus/btree.c b/kernel/fs/hfsplus/btree.c
new file mode 100644
index 000000000..3345c7553
--- /dev/null
+++ b/kernel/fs/hfsplus/btree.c
@@ -0,0 +1,497 @@
+/*
+ *  linux/fs/hfsplus/btree.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handle opening/closing btree
+ */
+
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/log2.h>
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+/*
+ * Initial source code of clump size calculation is gotten
+ * from http://opensource.apple.com/tarballs/diskdev_cmds/
+ */
+#define CLUMP_ENTRIES	15
+
+static short clumptbl[CLUMP_ENTRIES * 3] = {
+/*
+ *	    Volume	Attributes	 Catalog	 Extents
+ *	     Size	Clump (MB)	Clump (MB)	Clump (MB)
+ */
+	/*   1GB */	  4,		  4,		 4,
+	/*   2GB */	  6,		  6,		 4,
+	/*   4GB */	  8,		  8,		 4,
+	/*   8GB */	 11,		 11,		 5,
+	/*
+	 * For volumes 16GB and larger, we want to make sure that a full OS
+	 * install won't require fragmentation of the Catalog or Attributes
+	 * B-trees.  We do this by making the clump sizes sufficiently large,
+	 * and by leaving a gap after the B-trees for them to grow into.
+	 *
+	 * For SnowLeopard 10A298, a FullNetInstall with all packages selected
+	 * results in:
+	 * Catalog B-tree Header
+	 *	nodeSize:          8192
+	 *	totalNodes:       31616
+	 *	freeNodes:         1978
+	 * (used = 231.55 MB)
+	 * Attributes B-tree Header
+	 *	nodeSize:          8192
+	 *	totalNodes:       63232
+	 *	freeNodes:          958
+	 * (used = 486.52 MB)
+	 *
+	 * We also want Time Machine backup volumes to have a sufficiently
+	 * large clump size to reduce fragmentation.
+	 *
+	 * The series of numbers for Catalog and Attribute form a geometric
+	 * series. For Catalog (16GB to 512GB), each term is 8**(1/5) times
+	 * the previous term.  For Attributes (16GB to 512GB), each term is
+	 * 4**(1/5) times the previous term.  For 1TB to 16TB, each term is
+	 * 2**(1/5) times the previous term.
+	 */
+	/*  16GB */	 64,		 32,		 5,
+	/*  32GB */	 84,		 49,		 6,
+	/*  64GB */	111,		 74,		 7,
+	/* 128GB */	147,		111,		 8,
+	/* 256GB */	194,		169,		 9,
+	/* 512GB */	256,		256,		11,
+	/*   1TB */	294,		294,		14,
+	/*   2TB */	338,		338,		16,
+	/*   4TB */	388,		388,		20,
+	/*   8TB */	446,		446,		25,
+	/*  16TB */	512,		512,		32
+};
+
+u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size,
+					u64 sectors, int file_id)
+{
+	u32 mod = max(node_size, block_size);
+	u32 clump_size;
+	int column;
+	int i;
+
+	/* Figure out which column of the above table to use for this file. */
+	switch (file_id) {
+	case HFSPLUS_ATTR_CNID:
+		column = 0;
+		break;
+	case HFSPLUS_CAT_CNID:
+		column = 1;
+		break;
+	default:
+		column = 2;
+		break;
+	}
+
+	/*
+	 * The default clump size is 0.8% of the volume size. And
+	 * it must also be a multiple of the node and block size.
+	 */
+	if (sectors < 0x200000) {
+		clump_size = sectors << 2;	/*  0.8 %  */
+		if (clump_size < (8 * node_size))
+			clump_size = 8 * node_size;
+	} else {
+		/* turn exponent into table index... */
+		for (i = 0, sectors = sectors >> 22;
+		     sectors && (i < CLUMP_ENTRIES - 1);
+		     ++i, sectors = sectors >> 1) {
+			/* empty body */
+		}
+
+		clump_size = clumptbl[column + (i) * 3] * 1024 * 1024;
+	}
+
+	/*
+	 * Round the clump size to a multiple of node and block size.
+	 * NOTE: This rounds down.
+	 */
+	clump_size /= mod;
+	clump_size *= mod;
+
+	/*
+	 * Rounding down could have rounded down to 0 if the block size was
+	 * greater than the clump size.  If so, just use one block or node.
+	 */
+	if (clump_size == 0)
+		clump_size = mod;
+
+	return clump_size;
+}
+
+/* Get a reference to a B*Tree and do some initial checks */
+struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
+{
+	struct hfs_btree *tree;
+	struct hfs_btree_header_rec *head;
+	struct address_space *mapping;
+	struct inode *inode;
+	struct page *page;
+	unsigned int size;
+
+	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
+	if (!tree)
+		return NULL;
+
+	mutex_init(&tree->tree_lock);
+	spin_lock_init(&tree->hash_lock);
+	tree->sb = sb;
+	tree->cnid = id;
+	inode = hfsplus_iget(sb, id);
+	if (IS_ERR(inode))
+		goto free_tree;
+	tree->inode = inode;
+
+	if (!HFSPLUS_I(tree->inode)->first_blocks) {
+		pr_err("invalid btree extent records (0 size)\n");
+		goto free_inode;
+	}
+
+	mapping = tree->inode->i_mapping;
+	page = read_mapping_page(mapping, 0, NULL);
+	if (IS_ERR(page))
+		goto free_inode;
+
+	/* Load the header */
+	head = (struct hfs_btree_header_rec *)(kmap(page) +
+		sizeof(struct hfs_bnode_desc));
+	tree->root = be32_to_cpu(head->root);
+	tree->leaf_count = be32_to_cpu(head->leaf_count);
+	tree->leaf_head = be32_to_cpu(head->leaf_head);
+	tree->leaf_tail = be32_to_cpu(head->leaf_tail);
+	tree->node_count = be32_to_cpu(head->node_count);
+	tree->free_nodes = be32_to_cpu(head->free_nodes);
+	tree->attributes = be32_to_cpu(head->attributes);
+	tree->node_size = be16_to_cpu(head->node_size);
+	tree->max_key_len = be16_to_cpu(head->max_key_len);
+	tree->depth = be16_to_cpu(head->depth);
+
+	/* Verify the tree and set the correct compare function */
+	switch (id) {
+	case HFSPLUS_EXT_CNID:
+		if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+			pr_err("invalid extent max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+		if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+			pr_err("invalid extent btree flag\n");
+			goto fail_page;
+		}
+
+		tree->keycmp = hfsplus_ext_cmp_key;
+		break;
+	case HFSPLUS_CAT_CNID:
+		if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+			pr_err("invalid catalog max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+			pr_err("invalid catalog btree flag\n");
+			goto fail_page;
+		}
+
+		if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
+		    (head->key_type == HFSPLUS_KEY_BINARY))
+			tree->keycmp = hfsplus_cat_bin_cmp_key;
+		else {
+			tree->keycmp = hfsplus_cat_case_cmp_key;
+			set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+		}
+		break;
+	case HFSPLUS_ATTR_CNID:
+		if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) {
+			pr_err("invalid attributes max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+		tree->keycmp = hfsplus_attr_bin_cmp_key;
+		break;
+	default:
+		pr_err("unknown B*Tree requested\n");
+		goto fail_page;
+	}
+
+	if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+		pr_err("invalid btree flag\n");
+		goto fail_page;
+	}
+
+	size = tree->node_size;
+	if (!is_power_of_2(size))
+		goto fail_page;
+	if (!tree->node_count)
+		goto fail_page;
+
+	tree->node_size_shift = ffs(size) - 1;
+
+	tree->pages_per_bnode =
+		(tree->node_size + PAGE_CACHE_SIZE - 1) >>
+		PAGE_CACHE_SHIFT;
+
+	kunmap(page);
+	page_cache_release(page);
+	return tree;
+
+ fail_page:
+	page_cache_release(page);
+ free_inode:
+	tree->inode->i_mapping->a_ops = &hfsplus_aops;
+	iput(tree->inode);
+ free_tree:
+	kfree(tree);
+	return NULL;
+}
+
+/* Release resources used by a btree */
+void hfs_btree_close(struct hfs_btree *tree)
+{
+	struct hfs_bnode *node;
+	int i;
+
+	if (!tree)
+		return;
+
+	for (i = 0; i < NODE_HASH_SIZE; i++) {
+		while ((node = tree->node_hash[i])) {
+			tree->node_hash[i] = node->next_hash;
+			if (atomic_read(&node->refcnt))
+				pr_crit("node %d:%d "
+						"still has %d user(s)!\n",
+					node->tree->cnid, node->this,
+					atomic_read(&node->refcnt));
+			hfs_bnode_free(node);
+			tree->node_hash_cnt--;
+		}
+	}
+	iput(tree->inode);
+	kfree(tree);
+}
+
+int hfs_btree_write(struct hfs_btree *tree)
+{
+	struct hfs_btree_header_rec *head;
+	struct hfs_bnode *node;
+	struct page *page;
+
+	node = hfs_bnode_find(tree, 0);
+	if (IS_ERR(node))
+		/* panic? */
+		return -EIO;
+	/* Load the header */
+	page = node->page[0];
+	head = (struct hfs_btree_header_rec *)(kmap(page) +
+		sizeof(struct hfs_bnode_desc));
+
+	head->root = cpu_to_be32(tree->root);
+	head->leaf_count = cpu_to_be32(tree->leaf_count);
+	head->leaf_head = cpu_to_be32(tree->leaf_head);
+	head->leaf_tail = cpu_to_be32(tree->leaf_tail);
+	head->node_count = cpu_to_be32(tree->node_count);
+	head->free_nodes = cpu_to_be32(tree->free_nodes);
+	head->attributes = cpu_to_be32(tree->attributes);
+	head->depth = cpu_to_be16(tree->depth);
+
+	kunmap(page);
+	set_page_dirty(page);
+	hfs_bnode_put(node);
+	return 0;
+}
+
+static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
+{
+	struct hfs_btree *tree = prev->tree;
+	struct hfs_bnode *node;
+	struct hfs_bnode_desc desc;
+	__be32 cnid;
+
+	node = hfs_bnode_create(tree, idx);
+	if (IS_ERR(node))
+		return node;
+
+	tree->free_nodes--;
+	prev->next = idx;
+	cnid = cpu_to_be32(idx);
+	hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
+
+	node->type = HFS_NODE_MAP;
+	node->num_recs = 1;
+	hfs_bnode_clear(node, 0, tree->node_size);
+	desc.next = 0;
+	desc.prev = 0;
+	desc.type = HFS_NODE_MAP;
+	desc.height = 0;
+	desc.num_recs = cpu_to_be16(1);
+	desc.reserved = 0;
+	hfs_bnode_write(node, &desc, 0, sizeof(desc));
+	hfs_bnode_write_u16(node, 14, 0x8000);
+	hfs_bnode_write_u16(node, tree->node_size - 2, 14);
+	hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6);
+
+	return node;
+}
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+	struct hfs_bnode *node, *next_node;
+	struct page **pagep;
+	u32 nidx, idx;
+	unsigned off;
+	u16 off16;
+	u16 len;
+	u8 *data, byte, m;
+	int i;
+
+	while (!tree->free_nodes) {
+		struct inode *inode = tree->inode;
+		struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+		u32 count;
+		int res;
+
+		res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree));
+		if (res)
+			return ERR_PTR(res);
+		hip->phys_size = inode->i_size =
+			(loff_t)hip->alloc_blocks <<
+				HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+		hip->fs_blocks =
+			hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
+		inode_set_bytes(inode, inode->i_size);
+		count = inode->i_size >> tree->node_size_shift;
+		tree->free_nodes = count - tree->node_count;
+		tree->node_count = count;
+	}
+
+	nidx = 0;
+	node = hfs_bnode_find(tree, nidx);
+	if (IS_ERR(node))
+		return node;
+	len = hfs_brec_lenoff(node, 2, &off16);
+	off = off16;
+
+	off += node->page_offset;
+	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+	data = kmap(*pagep);
+	off &= ~PAGE_CACHE_MASK;
+	idx = 0;
+
+	for (;;) {
+		while (len) {
+			byte = data[off];
+			if (byte != 0xff) {
+				for (m = 0x80, i = 0; i < 8; m >>= 1, i++) {
+					if (!(byte & m)) {
+						idx += i;
+						data[off] |= m;
+						set_page_dirty(*pagep);
+						kunmap(*pagep);
+						tree->free_nodes--;
+						mark_inode_dirty(tree->inode);
+						hfs_bnode_put(node);
+						return hfs_bnode_create(tree,
+							idx);
+					}
+				}
+			}
+			if (++off >= PAGE_CACHE_SIZE) {
+				kunmap(*pagep);
+				data = kmap(*++pagep);
+				off = 0;
+			}
+			idx += 8;
+			len--;
+		}
+		kunmap(*pagep);
+		nidx = node->next;
+		if (!nidx) {
+			hfs_dbg(BNODE_MOD, "create new bmap node\n");
+			next_node = hfs_bmap_new_bmap(node, idx);
+		} else
+			next_node = hfs_bnode_find(tree, nidx);
+		hfs_bnode_put(node);
+		if (IS_ERR(next_node))
+			return next_node;
+		node = next_node;
+
+		len = hfs_brec_lenoff(node, 0, &off16);
+		off = off16;
+		off += node->page_offset;
+		pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+		data = kmap(*pagep);
+		off &= ~PAGE_CACHE_MASK;
+	}
+}
+
+void hfs_bmap_free(struct hfs_bnode *node)
+{
+	struct hfs_btree *tree;
+	struct page *page;
+	u16 off, len;
+	u32 nidx;
+	u8 *data, byte, m;
+
+	hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
+	BUG_ON(!node->this);
+	tree = node->tree;
+	nidx = node->this;
+	node = hfs_bnode_find(tree, 0);
+	if (IS_ERR(node))
+		return;
+	len = hfs_brec_lenoff(node, 2, &off);
+	while (nidx >= len * 8) {
+		u32 i;
+
+		nidx -= len * 8;
+		i = node->next;
+		hfs_bnode_put(node);
+		if (!i) {
+			/* panic */;
+			pr_crit("unable to free bnode %u. "
+					"bmap not found!\n",
+				node->this);
+			return;
+		}
+		node = hfs_bnode_find(tree, i);
+		if (IS_ERR(node))
+			return;
+		if (node->type != HFS_NODE_MAP) {
+			/* panic */;
+			pr_crit("invalid bmap found! "
+					"(%u,%d)\n",
+				node->this, node->type);
+			hfs_bnode_put(node);
+			return;
+		}
+		len = hfs_brec_lenoff(node, 0, &off);
+	}
+	off += node->page_offset + nidx / 8;
+	page = node->page[off >> PAGE_CACHE_SHIFT];
+	data = kmap(page);
+	off &= ~PAGE_CACHE_MASK;
+	m = 1 << (~nidx & 7);
+	byte = data[off];
+	if (!(byte & m)) {
+		pr_crit("trying to free free bnode "
+				"%u(%d)\n",
+			node->this, node->type);
+		kunmap(page);
+		hfs_bnode_put(node);
+		return;
+	}
+	data[off] = byte & ~m;
+	set_page_dirty(page);
+	kunmap(page);
+	hfs_bnode_put(node);
+	tree->free_nodes++;
+	mark_inode_dirty(tree->inode);
+}
diff --git a/kernel/fs/hfsplus/catalog.c b/kernel/fs/hfsplus/catalog.c
new file mode 100644
index 000000000..022974ab6
--- /dev/null
+++ b/kernel/fs/hfsplus/catalog.c
@@ -0,0 +1,521 @@
+/*
+ *  linux/fs/hfsplus/catalog.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handling of catalog records
+ */
+
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1,
+			     const hfsplus_btree_key *k2)
+{
+	__be32 k1p, k2p;
+
+	k1p = k1->cat.parent;
+	k2p = k2->cat.parent;
+	if (k1p != k2p)
+		return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1;
+
+	return hfsplus_strcasecmp(&k1->cat.name, &k2->cat.name);
+}
+
+int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
+			    const hfsplus_btree_key *k2)
+{
+	__be32 k1p, k2p;
+
+	k1p = k1->cat.parent;
+	k2p = k2->cat.parent;
+	if (k1p != k2p)
+		return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1;
+
+	return hfsplus_strcmp(&k1->cat.name, &k2->cat.name);
+}
+
+/* Generates key for catalog file/folders record. */
+int hfsplus_cat_build_key(struct super_block *sb,
+		hfsplus_btree_key *key, u32 parent, struct qstr *str)
+{
+	int len, err;
+
+	key->cat.parent = cpu_to_be32(parent);
+	err = hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
+			str->name, str->len);
+	if (unlikely(err < 0))
+		return err;
+
+	len = be16_to_cpu(key->cat.name.length);
+	key->key_len = cpu_to_be16(6 + 2 * len);
+	return 0;
+}
+
+/* Generates key for catalog thread record. */
+void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
+			hfsplus_btree_key *key, u32 parent)
+{
+	key->cat.parent = cpu_to_be32(parent);
+	key->cat.name.length = 0;
+	key->key_len = cpu_to_be16(6);
+}
+
+static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
+				      struct hfsplus_unistr *name)
+{
+	int ustrlen;
+
+	ustrlen = be16_to_cpu(name->length);
+	key->cat.parent = cpu_to_be32(parent);
+	key->cat.name.length = cpu_to_be16(ustrlen);
+	ustrlen *= 2;
+	memcpy(key->cat.name.unicode, name->unicode, ustrlen);
+	key->key_len = cpu_to_be16(6 + ustrlen);
+}
+
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+{
+	if (inode->i_flags & S_IMMUTABLE)
+		perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
+	else
+		perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
+	if (inode->i_flags & S_APPEND)
+		perms->rootflags |= HFSPLUS_FLG_APPEND;
+	else
+		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
+
+	perms->userflags = HFSPLUS_I(inode)->userflags;
+	perms->mode = cpu_to_be16(inode->i_mode);
+	perms->owner = cpu_to_be32(i_uid_read(inode));
+	perms->group = cpu_to_be32(i_gid_read(inode));
+
+	if (S_ISREG(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_nlink);
+	else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_rdev);
+	else
+		perms->dev = 0;
+}
+
+static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
+		u32 cnid, struct inode *inode)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
+	if (S_ISDIR(inode->i_mode)) {
+		struct hfsplus_cat_folder *folder;
+
+		folder = &entry->folder;
+		memset(folder, 0, sizeof(*folder));
+		folder->type = cpu_to_be16(HFSPLUS_FOLDER);
+		if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags))
+			folder->flags |= cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT);
+		folder->id = cpu_to_be32(inode->i_ino);
+		HFSPLUS_I(inode)->create_date =
+			folder->create_date =
+			folder->content_mod_date =
+			folder->attribute_mod_date =
+			folder->access_date = hfsp_now2mt();
+		hfsplus_cat_set_perms(inode, &folder->permissions);
+		if (inode == sbi->hidden_dir)
+			/* invisible and namelocked */
+			folder->user_info.frFlags = cpu_to_be16(0x5000);
+		return sizeof(*folder);
+	} else {
+		struct hfsplus_cat_file *file;
+
+		file = &entry->file;
+		memset(file, 0, sizeof(*file));
+		file->type = cpu_to_be16(HFSPLUS_FILE);
+		file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
+		file->id = cpu_to_be32(cnid);
+		HFSPLUS_I(inode)->create_date =
+			file->create_date =
+			file->content_mod_date =
+			file->attribute_mod_date =
+			file->access_date = hfsp_now2mt();
+		if (cnid == inode->i_ino) {
+			hfsplus_cat_set_perms(inode, &file->permissions);
+			if (S_ISLNK(inode->i_mode)) {
+				file->user_info.fdType =
+					cpu_to_be32(HFSP_SYMLINK_TYPE);
+				file->user_info.fdCreator =
+					cpu_to_be32(HFSP_SYMLINK_CREATOR);
+			} else {
+				file->user_info.fdType =
+					cpu_to_be32(sbi->type);
+				file->user_info.fdCreator =
+					cpu_to_be32(sbi->creator);
+			}
+			if (HFSPLUS_FLG_IMMUTABLE &
+					(file->permissions.rootflags |
+					file->permissions.userflags))
+				file->flags |=
+					cpu_to_be16(HFSPLUS_FILE_LOCKED);
+		} else {
+			file->user_info.fdType =
+				cpu_to_be32(HFSP_HARDLINK_TYPE);
+			file->user_info.fdCreator =
+				cpu_to_be32(HFSP_HFSPLUS_CREATOR);
+			file->user_info.fdFlags =
+				cpu_to_be16(0x100);
+			file->create_date =
+				HFSPLUS_I(sbi->hidden_dir)->create_date;
+			file->permissions.dev =
+				cpu_to_be32(HFSPLUS_I(inode)->linkid);
+		}
+		return sizeof(*file);
+	}
+}
+
+static int hfsplus_fill_cat_thread(struct super_block *sb,
+				   hfsplus_cat_entry *entry, int type,
+				   u32 parentid, struct qstr *str)
+{
+	int err;
+
+	entry->type = cpu_to_be16(type);
+	entry->thread.reserved = 0;
+	entry->thread.parentID = cpu_to_be32(parentid);
+	err = hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
+				str->name, str->len);
+	if (unlikely(err < 0))
+		return err;
+
+	return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2;
+}
+
+/* Try to get a catalog entry for given catalog id */
+int hfsplus_find_cat(struct super_block *sb, u32 cnid,
+		     struct hfs_find_data *fd)
+{
+	hfsplus_cat_entry tmp;
+	int err;
+	u16 type;
+
+	hfsplus_cat_build_key_with_cnid(sb, fd->search_key, cnid);
+	err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry));
+	if (err)
+		return err;
+
+	type = be16_to_cpu(tmp.type);
+	if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) {
+		pr_err("found bad thread record in catalog\n");
+		return -EIO;
+	}
+
+	if (be16_to_cpu(tmp.thread.nodeName.length) > 255) {
+		pr_err("catalog name length corrupted\n");
+		return -EIO;
+	}
+
+	hfsplus_cat_build_key_uni(fd->search_key,
+		be32_to_cpu(tmp.thread.parentID),
+		&tmp.thread.nodeName);
+	return hfs_brec_find(fd, hfs_find_rec_by_key);
+}
+
+static void hfsplus_subfolders_inc(struct inode *dir)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+
+	if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+		/*
+		 * Increment subfolder count. Note, the value is only meaningful
+		 * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set.
+		 */
+		HFSPLUS_I(dir)->subfolders++;
+	}
+}
+
+static void hfsplus_subfolders_dec(struct inode *dir)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+
+	if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+		/*
+		 * Decrement subfolder count. Note, the value is only meaningful
+		 * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set.
+		 *
+		 * Check for zero. Some subfolders may have been created
+		 * by an implementation ignorant of this counter.
+		 */
+		if (HFSPLUS_I(dir)->subfolders)
+			HFSPLUS_I(dir)->subfolders--;
+	}
+}
+
+int hfsplus_create_cat(u32 cnid, struct inode *dir,
+		struct qstr *str, struct inode *inode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct hfs_find_data fd;
+	hfsplus_cat_entry entry;
+	int entry_size;
+	int err;
+
+	hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
+		str->name, cnid, inode->i_nlink);
+	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+	if (err)
+		return err;
+
+	hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
+	entry_size = hfsplus_fill_cat_thread(sb, &entry,
+		S_ISDIR(inode->i_mode) ?
+			HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
+		dir->i_ino, str);
+	if (unlikely(entry_size < 0)) {
+		err = entry_size;
+		goto err2;
+	}
+
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err != -ENOENT) {
+		if (!err)
+			err = -EEXIST;
+		goto err2;
+	}
+	err = hfs_brec_insert(&fd, &entry, entry_size);
+	if (err)
+		goto err2;
+
+	err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+	if (unlikely(err))
+		goto err1;
+
+	entry_size = hfsplus_cat_build_record(&entry, cnid, inode);
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err != -ENOENT) {
+		/* panic? */
+		if (!err)
+			err = -EEXIST;
+		goto err1;
+	}
+	err = hfs_brec_insert(&fd, &entry, entry_size);
+	if (err)
+		goto err1;
+
+	dir->i_size++;
+	if (S_ISDIR(inode->i_mode))
+		hfsplus_subfolders_inc(dir);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
+
+	hfs_find_exit(&fd);
+	return 0;
+
+err1:
+	hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
+	if (!hfs_brec_find(&fd, hfs_find_rec_by_key))
+		hfs_brec_remove(&fd);
+err2:
+	hfs_find_exit(&fd);
+	return err;
+}
+
+int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
+{
+	struct super_block *sb = dir->i_sb;
+	struct hfs_find_data fd;
+	struct hfsplus_fork_raw fork;
+	struct list_head *pos;
+	int err, off;
+	u16 type;
+
+	hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+	if (err)
+		return err;
+
+	if (!str) {
+		int len;
+
+		hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
+		err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+		if (err)
+			goto out;
+
+		off = fd.entryoffset +
+			offsetof(struct hfsplus_cat_thread, nodeName);
+		fd.search_key->cat.parent = cpu_to_be32(dir->i_ino);
+		hfs_bnode_read(fd.bnode,
+			&fd.search_key->cat.name.length, off, 2);
+		len = be16_to_cpu(fd.search_key->cat.name.length) * 2;
+		hfs_bnode_read(fd.bnode,
+			&fd.search_key->cat.name.unicode,
+			off + 2, len);
+		fd.search_key->key_len = cpu_to_be16(6 + len);
+	} else {
+		err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+		if (unlikely(err))
+			goto out;
+	}
+
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err)
+		goto out;
+
+	type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset);
+	if (type == HFSPLUS_FILE) {
+#if 0
+		off = fd.entryoffset + offsetof(hfsplus_cat_file, data_fork);
+		hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork));
+		hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA);
+#endif
+
+		off = fd.entryoffset +
+			offsetof(struct hfsplus_cat_file, rsrc_fork);
+		hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork));
+		hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
+	}
+
+	list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
+		struct hfsplus_readdir_data *rd =
+			list_entry(pos, struct hfsplus_readdir_data, list);
+		if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
+			rd->file->f_pos--;
+	}
+
+	err = hfs_brec_remove(&fd);
+	if (err)
+		goto out;
+
+	hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err)
+		goto out;
+
+	err = hfs_brec_remove(&fd);
+	if (err)
+		goto out;
+
+	dir->i_size--;
+	if (type == HFSPLUS_FOLDER)
+		hfsplus_subfolders_dec(dir);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
+
+	if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
+		if (HFSPLUS_SB(sb)->attr_tree)
+			hfsplus_delete_all_attrs(dir, cnid);
+	}
+
+out:
+	hfs_find_exit(&fd);
+
+	return err;
+}
+
+int hfsplus_rename_cat(u32 cnid,
+		       struct inode *src_dir, struct qstr *src_name,
+		       struct inode *dst_dir, struct qstr *dst_name)
+{
+	struct super_block *sb = src_dir->i_sb;
+	struct hfs_find_data src_fd, dst_fd;
+	hfsplus_cat_entry entry;
+	int entry_size, type;
+	int err;
+
+	hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+		cnid, src_dir->i_ino, src_name->name,
+		dst_dir->i_ino, dst_name->name);
+	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
+	if (err)
+		return err;
+	dst_fd = src_fd;
+
+	/* find the old dir entry and read the data */
+	err = hfsplus_cat_build_key(sb, src_fd.search_key,
+			src_dir->i_ino, src_name);
+	if (unlikely(err))
+		goto out;
+
+	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
+	if (err)
+		goto out;
+	if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
+		err = -EIO;
+		goto out;
+	}
+
+	hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
+				src_fd.entrylength);
+	type = be16_to_cpu(entry.type);
+
+	/* create new dir entry with the data from the old entry */
+	err = hfsplus_cat_build_key(sb, dst_fd.search_key,
+			dst_dir->i_ino, dst_name);
+	if (unlikely(err))
+		goto out;
+
+	err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
+	if (err != -ENOENT) {
+		if (!err)
+			err = -EEXIST;
+		goto out;
+	}
+
+	err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength);
+	if (err)
+		goto out;
+	dst_dir->i_size++;
+	if (type == HFSPLUS_FOLDER)
+		hfsplus_subfolders_inc(dst_dir);
+	dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
+
+	/* finally remove the old entry */
+	err = hfsplus_cat_build_key(sb, src_fd.search_key,
+			src_dir->i_ino, src_name);
+	if (unlikely(err))
+		goto out;
+
+	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
+	if (err)
+		goto out;
+	err = hfs_brec_remove(&src_fd);
+	if (err)
+		goto out;
+	src_dir->i_size--;
+	if (type == HFSPLUS_FOLDER)
+		hfsplus_subfolders_dec(src_dir);
+	src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
+
+	/* remove old thread entry */
+	hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
+	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
+	if (err)
+		goto out;
+	type = hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset);
+	err = hfs_brec_remove(&src_fd);
+	if (err)
+		goto out;
+
+	/* create new thread entry */
+	hfsplus_cat_build_key_with_cnid(sb, dst_fd.search_key, cnid);
+	entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
+		dst_dir->i_ino, dst_name);
+	if (unlikely(entry_size < 0)) {
+		err = entry_size;
+		goto out;
+	}
+
+	err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
+	if (err != -ENOENT) {
+		if (!err)
+			err = -EEXIST;
+		goto out;
+	}
+	err = hfs_brec_insert(&dst_fd, &entry, entry_size);
+
+	hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY);
+	hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY);
+out:
+	hfs_bnode_put(dst_fd.bnode);
+	hfs_find_exit(&src_fd);
+	return err;
+}
diff --git a/kernel/fs/hfsplus/dir.c b/kernel/fs/hfsplus/dir.c
new file mode 100644
index 000000000..d0f39dcbb
--- /dev/null
+++ b/kernel/fs/hfsplus/dir.c
@@ -0,0 +1,576 @@
+/*
+ *  linux/fs/hfsplus/dir.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handling of directories
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/nls.h>
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+#include "xattr.h"
+#include "acl.h"
+
+static inline void hfsplus_instantiate(struct dentry *dentry,
+				       struct inode *inode, u32 cnid)
+{
+	dentry->d_fsdata = (void *)(unsigned long)cnid;
+	d_instantiate(dentry, inode);
+}
+
+/* Find the entry inside dir named dentry->d_name */
+static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
+				     unsigned int flags)
+{
+	struct inode *inode = NULL;
+	struct hfs_find_data fd;
+	struct super_block *sb;
+	hfsplus_cat_entry entry;
+	int err;
+	u32 cnid, linkid = 0;
+	u16 type;
+
+	sb = dir->i_sb;
+
+	dentry->d_fsdata = NULL;
+	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+	if (err)
+		return ERR_PTR(err);
+	err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino,
+			&dentry->d_name);
+	if (unlikely(err < 0))
+		goto fail;
+again:
+	err = hfs_brec_read(&fd, &entry, sizeof(entry));
+	if (err) {
+		if (err == -ENOENT) {
+			hfs_find_exit(&fd);
+			/* No such entry */
+			inode = NULL;
+			goto out;
+		}
+		goto fail;
+	}
+	type = be16_to_cpu(entry.type);
+	if (type == HFSPLUS_FOLDER) {
+		if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) {
+			err = -EIO;
+			goto fail;
+		}
+		cnid = be32_to_cpu(entry.folder.id);
+		dentry->d_fsdata = (void *)(unsigned long)cnid;
+	} else if (type == HFSPLUS_FILE) {
+		if (fd.entrylength < sizeof(struct hfsplus_cat_file)) {
+			err = -EIO;
+			goto fail;
+		}
+		cnid = be32_to_cpu(entry.file.id);
+		if (entry.file.user_info.fdType ==
+				cpu_to_be32(HFSP_HARDLINK_TYPE) &&
+				entry.file.user_info.fdCreator ==
+				cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
+				(entry.file.create_date ==
+					HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
+						create_date ||
+				entry.file.create_date ==
+					HFSPLUS_I(d_inode(sb->s_root))->
+						create_date) &&
+				HFSPLUS_SB(sb)->hidden_dir) {
+			struct qstr str;
+			char name[32];
+
+			if (dentry->d_fsdata) {
+				/*
+				 * We found a link pointing to another link,
+				 * so ignore it and treat it as regular file.
+				 */
+				cnid = (unsigned long)dentry->d_fsdata;
+				linkid = 0;
+			} else {
+				dentry->d_fsdata = (void *)(unsigned long)cnid;
+				linkid =
+					be32_to_cpu(entry.file.permissions.dev);
+				str.len = sprintf(name, "iNode%d", linkid);
+				str.name = name;
+				err = hfsplus_cat_build_key(sb, fd.search_key,
+					HFSPLUS_SB(sb)->hidden_dir->i_ino,
+					&str);
+				if (unlikely(err < 0))
+					goto fail;
+				goto again;
+			}
+		} else if (!dentry->d_fsdata)
+			dentry->d_fsdata = (void *)(unsigned long)cnid;
+	} else {
+		pr_err("invalid catalog entry type in lookup\n");
+		err = -EIO;
+		goto fail;
+	}
+	hfs_find_exit(&fd);
+	inode = hfsplus_iget(dir->i_sb, cnid);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (S_ISREG(inode->i_mode))
+		HFSPLUS_I(inode)->linkid = linkid;
+out:
+	d_add(dentry, inode);
+	return NULL;
+fail:
+	hfs_find_exit(&fd);
+	return ERR_PTR(err);
+}
+
+static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	int len, err;
+	char *strbuf;
+	hfsplus_cat_entry entry;
+	struct hfs_find_data fd;
+	struct hfsplus_readdir_data *rd;
+	u16 type;
+
+	if (file->f_pos >= inode->i_size)
+		return 0;
+
+	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+	if (err)
+		return err;
+	strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN + 1, GFP_KERNEL);
+	if (!strbuf) {
+		err = -ENOMEM;
+		goto out;
+	}
+	hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino);
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err)
+		goto out;
+
+	if (ctx->pos == 0) {
+		/* This is completely artificial... */
+		if (!dir_emit_dot(file, ctx))
+			goto out;
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
+			err = -EIO;
+			goto out;
+		}
+
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+			fd.entrylength);
+		if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
+			pr_err("bad catalog folder thread\n");
+			err = -EIO;
+			goto out;
+		}
+		if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) {
+			pr_err("truncated catalog thread\n");
+			err = -EIO;
+			goto out;
+		}
+		if (!dir_emit(ctx, "..", 2,
+			    be32_to_cpu(entry.thread.parentID), DT_DIR))
+			goto out;
+		ctx->pos = 2;
+	}
+	if (ctx->pos >= inode->i_size)
+		goto out;
+	err = hfs_brec_goto(&fd, ctx->pos - 1);
+	if (err)
+		goto out;
+	for (;;) {
+		if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
+			pr_err("walked past end of dir\n");
+			err = -EIO;
+			goto out;
+		}
+
+		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
+			err = -EIO;
+			goto out;
+		}
+
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+			fd.entrylength);
+		type = be16_to_cpu(entry.type);
+		len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN;
+		err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
+		if (err)
+			goto out;
+		if (type == HFSPLUS_FOLDER) {
+			if (fd.entrylength <
+					sizeof(struct hfsplus_cat_folder)) {
+				pr_err("small dir entry\n");
+				err = -EIO;
+				goto out;
+			}
+			if (HFSPLUS_SB(sb)->hidden_dir &&
+			    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+					be32_to_cpu(entry.folder.id))
+				goto next;
+			if (!dir_emit(ctx, strbuf, len,
+				    be32_to_cpu(entry.folder.id), DT_DIR))
+				break;
+		} else if (type == HFSPLUS_FILE) {
+			u16 mode;
+			unsigned type = DT_UNKNOWN;
+
+			if (fd.entrylength < sizeof(struct hfsplus_cat_file)) {
+				pr_err("small file entry\n");
+				err = -EIO;
+				goto out;
+			}
+
+			mode = be16_to_cpu(entry.file.permissions.mode);
+			if (S_ISREG(mode))
+				type = DT_REG;
+			else if (S_ISLNK(mode))
+				type = DT_LNK;
+			else if (S_ISFIFO(mode))
+				type = DT_FIFO;
+			else if (S_ISCHR(mode))
+				type = DT_CHR;
+			else if (S_ISBLK(mode))
+				type = DT_BLK;
+			else if (S_ISSOCK(mode))
+				type = DT_SOCK;
+
+			if (!dir_emit(ctx, strbuf, len,
+				      be32_to_cpu(entry.file.id), type))
+				break;
+		} else {
+			pr_err("bad catalog entry type\n");
+			err = -EIO;
+			goto out;
+		}
+next:
+		ctx->pos++;
+		if (ctx->pos >= inode->i_size)
+			goto out;
+		err = hfs_brec_goto(&fd, 1);
+		if (err)
+			goto out;
+	}
+	rd = file->private_data;
+	if (!rd) {
+		rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL);
+		if (!rd) {
+			err = -ENOMEM;
+			goto out;
+		}
+		file->private_data = rd;
+		rd->file = file;
+		list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
+	}
+	memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
+out:
+	kfree(strbuf);
+	hfs_find_exit(&fd);
+	return err;
+}
+
+static int hfsplus_dir_release(struct inode *inode, struct file *file)
+{
+	struct hfsplus_readdir_data *rd = file->private_data;
+	if (rd) {
+		mutex_lock(&inode->i_mutex);
+		list_del(&rd->list);
+		mutex_unlock(&inode->i_mutex);
+		kfree(rd);
+	}
+	return 0;
+}
+
+static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
+			struct dentry *dst_dentry)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
+	struct inode *inode = d_inode(src_dentry);
+	struct inode *src_dir = d_inode(src_dentry->d_parent);
+	struct qstr str;
+	char name[32];
+	u32 cnid, id;
+	int res;
+
+	if (HFSPLUS_IS_RSRC(inode))
+		return -EPERM;
+	if (!S_ISREG(inode->i_mode))
+		return -EPERM;
+
+	mutex_lock(&sbi->vh_mutex);
+	if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
+		for (;;) {
+			get_random_bytes(&id, sizeof(cnid));
+			id &= 0x3fffffff;
+			str.name = name;
+			str.len = sprintf(name, "iNode%d", id);
+			res = hfsplus_rename_cat(inode->i_ino,
+						 src_dir, &src_dentry->d_name,
+						 sbi->hidden_dir, &str);
+			if (!res)
+				break;
+			if (res != -EEXIST)
+				goto out;
+		}
+		HFSPLUS_I(inode)->linkid = id;
+		cnid = sbi->next_cnid++;
+		src_dentry->d_fsdata = (void *)(unsigned long)cnid;
+		res = hfsplus_create_cat(cnid, src_dir,
+			&src_dentry->d_name, inode);
+		if (res)
+			/* panic? */
+			goto out;
+		sbi->file_count++;
+	}
+	cnid = sbi->next_cnid++;
+	res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
+	if (res)
+		goto out;
+
+	inc_nlink(inode);
+	hfsplus_instantiate(dst_dentry, inode, cnid);
+	ihold(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+	sbi->file_count++;
+	hfsplus_mark_mdb_dirty(dst_dir->i_sb);
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
+}
+
+static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+	struct inode *inode = d_inode(dentry);
+	struct qstr str;
+	char name[32];
+	u32 cnid;
+	int res;
+
+	if (HFSPLUS_IS_RSRC(inode))
+		return -EPERM;
+
+	mutex_lock(&sbi->vh_mutex);
+	cnid = (u32)(unsigned long)dentry->d_fsdata;
+	if (inode->i_ino == cnid &&
+	    atomic_read(&HFSPLUS_I(inode)->opencnt)) {
+		str.name = name;
+		str.len = sprintf(name, "temp%lu", inode->i_ino);
+		res = hfsplus_rename_cat(inode->i_ino,
+					 dir, &dentry->d_name,
+					 sbi->hidden_dir, &str);
+		if (!res) {
+			inode->i_flags |= S_DEAD;
+			drop_nlink(inode);
+		}
+		goto out;
+	}
+	res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
+	if (res)
+		goto out;
+
+	if (inode->i_nlink > 0)
+		drop_nlink(inode);
+	if (inode->i_ino == cnid)
+		clear_nlink(inode);
+	if (!inode->i_nlink) {
+		if (inode->i_ino != cnid) {
+			sbi->file_count--;
+			if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
+				res = hfsplus_delete_cat(inode->i_ino,
+							 sbi->hidden_dir,
+							 NULL);
+				if (!res)
+					hfsplus_delete_inode(inode);
+			} else
+				inode->i_flags |= S_DEAD;
+		} else
+			hfsplus_delete_inode(inode);
+	} else
+		sbi->file_count--;
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
+}
+
+static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+	struct inode *inode = d_inode(dentry);
+	int res;
+
+	if (inode->i_size != 2)
+		return -ENOTEMPTY;
+
+	mutex_lock(&sbi->vh_mutex);
+	res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
+	if (res)
+		goto out;
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	hfsplus_delete_inode(inode);
+	mark_inode_dirty(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
+}
+
+static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
+			   const char *symname)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+	struct inode *inode;
+	int res = -ENOMEM;
+
+	mutex_lock(&sbi->vh_mutex);
+	inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
+	if (!inode)
+		goto out;
+
+	res = page_symlink(inode, symname, strlen(symname) + 1);
+	if (res)
+		goto out_err;
+
+	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+	if (res)
+		goto out_err;
+
+	res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+	if (res == -EOPNOTSUPP)
+		res = 0; /* Operation is not supported. */
+	else if (res) {
+		/* Try to delete anyway without error analysis. */
+		hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
+		goto out_err;
+	}
+
+	hfsplus_instantiate(dentry, inode, inode->i_ino);
+	mark_inode_dirty(inode);
+	goto out;
+
+out_err:
+	clear_nlink(inode);
+	hfsplus_delete_inode(inode);
+	iput(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
+}
+
+static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, dev_t rdev)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+	struct inode *inode;
+	int res = -ENOMEM;
+
+	mutex_lock(&sbi->vh_mutex);
+	inode = hfsplus_new_inode(dir->i_sb, mode);
+	if (!inode)
+		goto out;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+		init_special_inode(inode, mode, rdev);
+
+	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+	if (res)
+		goto failed_mknod;
+
+	res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+	if (res == -EOPNOTSUPP)
+		res = 0; /* Operation is not supported. */
+	else if (res) {
+		/* Try to delete anyway without error analysis. */
+		hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
+		goto failed_mknod;
+	}
+
+	hfsplus_instantiate(dentry, inode, inode->i_ino);
+	mark_inode_dirty(inode);
+	goto out;
+
+failed_mknod:
+	clear_nlink(inode);
+	hfsplus_delete_inode(inode);
+	iput(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
+}
+
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			  bool excl)
+{
+	return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
+}
+
+static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
+			  struct inode *new_dir, struct dentry *new_dentry)
+{
+	int res;
+
+	/* Unlink destination if it already exists */
+	if (d_really_is_positive(new_dentry)) {
+		if (d_is_dir(new_dentry))
+			res = hfsplus_rmdir(new_dir, new_dentry);
+		else
+			res = hfsplus_unlink(new_dir, new_dentry);
+		if (res)
+			return res;
+	}
+
+	res = hfsplus_rename_cat((u32)(unsigned long)old_dentry->d_fsdata,
+				 old_dir, &old_dentry->d_name,
+				 new_dir, &new_dentry->d_name);
+	if (!res)
+		new_dentry->d_fsdata = old_dentry->d_fsdata;
+	return res;
+}
+
+const struct inode_operations hfsplus_dir_inode_operations = {
+	.lookup			= hfsplus_lookup,
+	.create			= hfsplus_create,
+	.link			= hfsplus_link,
+	.unlink			= hfsplus_unlink,
+	.mkdir			= hfsplus_mkdir,
+	.rmdir			= hfsplus_rmdir,
+	.symlink		= hfsplus_symlink,
+	.mknod			= hfsplus_mknod,
+	.rename			= hfsplus_rename,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.listxattr		= hfsplus_listxattr,
+	.removexattr		= generic_removexattr,
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+	.get_acl		= hfsplus_get_posix_acl,
+	.set_acl		= hfsplus_set_posix_acl,
+#endif
+};
+
+const struct file_operations hfsplus_dir_operations = {
+	.fsync		= hfsplus_file_fsync,
+	.read		= generic_read_dir,
+	.iterate	= hfsplus_readdir,
+	.unlocked_ioctl = hfsplus_ioctl,
+	.llseek		= generic_file_llseek,
+	.release	= hfsplus_dir_release,
+};
diff --git a/kernel/fs/hfsplus/extents.c b/kernel/fs/hfsplus/extents.c
new file mode 100644
index 000000000..feca524ce
--- /dev/null
+++ b/kernel/fs/hfsplus/extents.c
@@ -0,0 +1,611 @@
+/*
+ *  linux/fs/hfsplus/extents.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handling of Extents both in catalog and extents overflow trees
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+/* Compare two extents keys, returns 0 on same, pos/neg for difference */
+int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1,
+			const hfsplus_btree_key *k2)
+{
+	__be32 k1id, k2id;
+	__be32 k1s, k2s;
+
+	k1id = k1->ext.cnid;
+	k2id = k2->ext.cnid;
+	if (k1id != k2id)
+		return be32_to_cpu(k1id) < be32_to_cpu(k2id) ? -1 : 1;
+
+	if (k1->ext.fork_type != k2->ext.fork_type)
+		return k1->ext.fork_type < k2->ext.fork_type ? -1 : 1;
+
+	k1s = k1->ext.start_block;
+	k2s = k2->ext.start_block;
+	if (k1s == k2s)
+		return 0;
+	return be32_to_cpu(k1s) < be32_to_cpu(k2s) ? -1 : 1;
+}
+
+static void hfsplus_ext_build_key(hfsplus_btree_key *key, u32 cnid,
+				  u32 block, u8 type)
+{
+	key->key_len = cpu_to_be16(HFSPLUS_EXT_KEYLEN - 2);
+	key->ext.cnid = cpu_to_be32(cnid);
+	key->ext.start_block = cpu_to_be32(block);
+	key->ext.fork_type = type;
+	key->ext.pad = 0;
+}
+
+static u32 hfsplus_ext_find_block(struct hfsplus_extent *ext, u32 off)
+{
+	int i;
+	u32 count;
+
+	for (i = 0; i < 8; ext++, i++) {
+		count = be32_to_cpu(ext->block_count);
+		if (off < count)
+			return be32_to_cpu(ext->start_block) + off;
+		off -= count;
+	}
+	/* panic? */
+	return 0;
+}
+
+static int hfsplus_ext_block_count(struct hfsplus_extent *ext)
+{
+	int i;
+	u32 count = 0;
+
+	for (i = 0; i < 8; ext++, i++)
+		count += be32_to_cpu(ext->block_count);
+	return count;
+}
+
+static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
+{
+	int i;
+
+	ext += 7;
+	for (i = 0; i < 7; ext--, i++)
+		if (ext->block_count)
+			break;
+	return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
+}
+
+static int __hfsplus_ext_write_extent(struct inode *inode,
+		struct hfs_find_data *fd)
+{
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	int res;
+
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+	hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+			      HFSPLUS_IS_RSRC(inode) ?
+				HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
+	res = hfs_brec_find(fd, hfs_find_rec_by_key);
+	if (hip->extent_state & HFSPLUS_EXT_NEW) {
+		if (res != -ENOENT)
+			return res;
+		hfs_brec_insert(fd, hip->cached_extents,
+				sizeof(hfsplus_extent_rec));
+		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
+	} else {
+		if (res)
+			return res;
+		hfs_bnode_write(fd->bnode, hip->cached_extents,
+				fd->entryoffset, fd->entrylength);
+		hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
+	}
+
+	/*
+	 * We can't just use hfsplus_mark_inode_dirty here, because we
+	 * also get called from hfsplus_write_inode, which should not
+	 * redirty the inode.  Instead the callers have to be careful
+	 * to explicily mark the inode dirty, too.
+	 */
+	set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
+
+	return 0;
+}
+
+static int hfsplus_ext_write_extent_locked(struct inode *inode)
+{
+	int res = 0;
+
+	if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
+		struct hfs_find_data fd;
+
+		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
+		if (res)
+			return res;
+		res = __hfsplus_ext_write_extent(inode, &fd);
+		hfs_find_exit(&fd);
+	}
+	return res;
+}
+
+int hfsplus_ext_write_extent(struct inode *inode)
+{
+	int res;
+
+	mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+	res = hfsplus_ext_write_extent_locked(inode);
+	mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+
+	return res;
+}
+
+static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
+					    struct hfsplus_extent *extent,
+					    u32 cnid, u32 block, u8 type)
+{
+	int res;
+
+	hfsplus_ext_build_key(fd->search_key, cnid, block, type);
+	fd->key->ext.cnid = 0;
+	res = hfs_brec_find(fd, hfs_find_rec_by_key);
+	if (res && res != -ENOENT)
+		return res;
+	if (fd->key->ext.cnid != fd->search_key->ext.cnid ||
+	    fd->key->ext.fork_type != fd->search_key->ext.fork_type)
+		return -ENOENT;
+	if (fd->entrylength != sizeof(hfsplus_extent_rec))
+		return -EIO;
+	hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
+		sizeof(hfsplus_extent_rec));
+	return 0;
+}
+
+static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
+		struct inode *inode, u32 block)
+{
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	int res;
+
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+	if (hip->extent_state & HFSPLUS_EXT_DIRTY) {
+		res = __hfsplus_ext_write_extent(inode, fd);
+		if (res)
+			return res;
+	}
+
+	res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+					block, HFSPLUS_IS_RSRC(inode) ?
+						HFSPLUS_TYPE_RSRC :
+						HFSPLUS_TYPE_DATA);
+	if (!res) {
+		hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+		hip->cached_blocks =
+			hfsplus_ext_block_count(hip->cached_extents);
+	} else {
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
+	}
+	return res;
+}
+
+static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
+{
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	struct hfs_find_data fd;
+	int res;
+
+	if (block >= hip->cached_start &&
+	    block < hip->cached_start + hip->cached_blocks)
+		return 0;
+
+	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
+	if (!res) {
+		res = __hfsplus_ext_cache_extent(&fd, inode, block);
+		hfs_find_exit(&fd);
+	}
+	return res;
+}
+
+/* Get a block at iblock for inode, possibly allocating if create */
+int hfsplus_get_block(struct inode *inode, sector_t iblock,
+		      struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	int res = -EIO;
+	u32 ablock, dblock, mask;
+	sector_t sector;
+	int was_dirty = 0;
+
+	/* Convert inode block to disk allocation block */
+	ablock = iblock >> sbi->fs_shift;
+
+	if (iblock >= hip->fs_blocks) {
+		if (iblock > hip->fs_blocks || !create)
+			return -EIO;
+		if (ablock >= hip->alloc_blocks) {
+			res = hfsplus_file_extend(inode, false);
+			if (res)
+				return res;
+		}
+	} else
+		create = 0;
+
+	if (ablock < hip->first_blocks) {
+		dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
+		goto done;
+	}
+
+	if (inode->i_ino == HFSPLUS_EXT_CNID)
+		return -EIO;
+
+	mutex_lock(&hip->extents_lock);
+
+	/*
+	 * hfsplus_ext_read_extent will write out a cached extent into
+	 * the extents btree.  In that case we may have to mark the inode
+	 * dirty even for a pure read of an extent here.
+	 */
+	was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY);
+	res = hfsplus_ext_read_extent(inode, ablock);
+	if (res) {
+		mutex_unlock(&hip->extents_lock);
+		return -EIO;
+	}
+	dblock = hfsplus_ext_find_block(hip->cached_extents,
+					ablock - hip->cached_start);
+	mutex_unlock(&hip->extents_lock);
+
+done:
+	hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n",
+		inode->i_ino, (long long)iblock, dblock);
+
+	mask = (1 << sbi->fs_shift) - 1;
+	sector = ((sector_t)dblock << sbi->fs_shift) +
+		  sbi->blockoffset + (iblock & mask);
+	map_bh(bh_result, sb, sector);
+
+	if (create) {
+		set_buffer_new(bh_result);
+		hip->phys_size += sb->s_blocksize;
+		hip->fs_blocks++;
+		inode_add_bytes(inode, sb->s_blocksize);
+	}
+	if (create || was_dirty)
+		mark_inode_dirty(inode);
+	return 0;
+}
+
+static void hfsplus_dump_extent(struct hfsplus_extent *extent)
+{
+	int i;
+
+	hfs_dbg(EXTENT, "   ");
+	for (i = 0; i < 8; i++)
+		hfs_dbg_cont(EXTENT, " %u:%u",
+			     be32_to_cpu(extent[i].start_block),
+			     be32_to_cpu(extent[i].block_count));
+	hfs_dbg_cont(EXTENT, "\n");
+}
+
+static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset,
+			      u32 alloc_block, u32 block_count)
+{
+	u32 count, start;
+	int i;
+
+	hfsplus_dump_extent(extent);
+	for (i = 0; i < 8; extent++, i++) {
+		count = be32_to_cpu(extent->block_count);
+		if (offset == count) {
+			start = be32_to_cpu(extent->start_block);
+			if (alloc_block != start + count) {
+				if (++i >= 8)
+					return -ENOSPC;
+				extent++;
+				extent->start_block = cpu_to_be32(alloc_block);
+			} else
+				block_count += count;
+			extent->block_count = cpu_to_be32(block_count);
+			return 0;
+		} else if (offset < count)
+			break;
+		offset -= count;
+	}
+	/* panic? */
+	return -EIO;
+}
+
+static int hfsplus_free_extents(struct super_block *sb,
+				struct hfsplus_extent *extent,
+				u32 offset, u32 block_nr)
+{
+	u32 count, start;
+	int i;
+	int err = 0;
+
+	hfsplus_dump_extent(extent);
+	for (i = 0; i < 8; extent++, i++) {
+		count = be32_to_cpu(extent->block_count);
+		if (offset == count)
+			goto found;
+		else if (offset < count)
+			break;
+		offset -= count;
+	}
+	/* panic? */
+	return -EIO;
+found:
+	for (;;) {
+		start = be32_to_cpu(extent->start_block);
+		if (count <= block_nr) {
+			err = hfsplus_block_free(sb, start, count);
+			if (err) {
+				pr_err("can't free extent\n");
+				hfs_dbg(EXTENT, " start: %u count: %u\n",
+					start, count);
+			}
+			extent->block_count = 0;
+			extent->start_block = 0;
+			block_nr -= count;
+		} else {
+			count -= block_nr;
+			err = hfsplus_block_free(sb, start + count, block_nr);
+			if (err) {
+				pr_err("can't free extent\n");
+				hfs_dbg(EXTENT, " start: %u count: %u\n",
+					start, count);
+			}
+			extent->block_count = cpu_to_be32(count);
+			block_nr = 0;
+		}
+		if (!block_nr || !i) {
+			/*
+			 * Try to free all extents and
+			 * return only last error
+			 */
+			return err;
+		}
+		i--;
+		extent--;
+		count = be32_to_cpu(extent->block_count);
+	}
+}
+
+int hfsplus_free_fork(struct super_block *sb, u32 cnid,
+		struct hfsplus_fork_raw *fork, int type)
+{
+	struct hfs_find_data fd;
+	hfsplus_extent_rec ext_entry;
+	u32 total_blocks, blocks, start;
+	int res, i;
+
+	total_blocks = be32_to_cpu(fork->total_blocks);
+	if (!total_blocks)
+		return 0;
+
+	blocks = 0;
+	for (i = 0; i < 8; i++)
+		blocks += be32_to_cpu(fork->extents[i].block_count);
+
+	res = hfsplus_free_extents(sb, fork->extents, blocks, blocks);
+	if (res)
+		return res;
+	if (total_blocks == blocks)
+		return 0;
+
+	res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
+	if (res)
+		return res;
+	do {
+		res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
+						total_blocks, type);
+		if (res)
+			break;
+		start = be32_to_cpu(fd.key->ext.start_block);
+		hfsplus_free_extents(sb, ext_entry,
+				     total_blocks - start,
+				     total_blocks);
+		hfs_brec_remove(&fd);
+		total_blocks = start;
+	} while (total_blocks > blocks);
+	hfs_find_exit(&fd);
+
+	return res;
+}
+
+int hfsplus_file_extend(struct inode *inode, bool zeroout)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	u32 start, len, goal;
+	int res;
+
+	if (sbi->alloc_file->i_size * 8 <
+	    sbi->total_blocks - sbi->free_blocks + 8) {
+		/* extend alloc file */
+		pr_err("extend alloc file! (%llu,%u,%u)\n",
+		       sbi->alloc_file->i_size * 8,
+		       sbi->total_blocks, sbi->free_blocks);
+		return -ENOSPC;
+	}
+
+	mutex_lock(&hip->extents_lock);
+	if (hip->alloc_blocks == hip->first_blocks)
+		goal = hfsplus_ext_lastblock(hip->first_extents);
+	else {
+		res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
+		if (res)
+			goto out;
+		goal = hfsplus_ext_lastblock(hip->cached_extents);
+	}
+
+	len = hip->clump_blocks;
+	start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
+	if (start >= sbi->total_blocks) {
+		start = hfsplus_block_allocate(sb, goal, 0, &len);
+		if (start >= goal) {
+			res = -ENOSPC;
+			goto out;
+		}
+	}
+
+	if (zeroout) {
+		res = sb_issue_zeroout(sb, start, len, GFP_NOFS);
+		if (res)
+			goto out;
+	}
+
+	hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+
+	if (hip->alloc_blocks <= hip->first_blocks) {
+		if (!hip->first_blocks) {
+			hfs_dbg(EXTENT, "first extents\n");
+			/* no extents yet */
+			hip->first_extents[0].start_block = cpu_to_be32(start);
+			hip->first_extents[0].block_count = cpu_to_be32(len);
+			res = 0;
+		} else {
+			/* try to append to extents in inode */
+			res = hfsplus_add_extent(hip->first_extents,
+						 hip->alloc_blocks,
+						 start, len);
+			if (res == -ENOSPC)
+				goto insert_extent;
+		}
+		if (!res) {
+			hfsplus_dump_extent(hip->first_extents);
+			hip->first_blocks += len;
+		}
+	} else {
+		res = hfsplus_add_extent(hip->cached_extents,
+					 hip->alloc_blocks - hip->cached_start,
+					 start, len);
+		if (!res) {
+			hfsplus_dump_extent(hip->cached_extents);
+			hip->extent_state |= HFSPLUS_EXT_DIRTY;
+			hip->cached_blocks += len;
+		} else if (res == -ENOSPC)
+			goto insert_extent;
+	}
+out:
+	if (!res) {
+		hip->alloc_blocks += len;
+		mutex_unlock(&hip->extents_lock);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
+		return 0;
+	}
+	mutex_unlock(&hip->extents_lock);
+	return res;
+
+insert_extent:
+	hfs_dbg(EXTENT, "insert new extent\n");
+	res = hfsplus_ext_write_extent_locked(inode);
+	if (res)
+		goto out;
+
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->cached_extents[0].start_block = cpu_to_be32(start);
+	hip->cached_extents[0].block_count = cpu_to_be32(len);
+	hfsplus_dump_extent(hip->cached_extents);
+	hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW;
+	hip->cached_start = hip->alloc_blocks;
+	hip->cached_blocks = len;
+
+	res = 0;
+	goto out;
+}
+
+void hfsplus_file_truncate(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	struct hfs_find_data fd;
+	u32 alloc_cnt, blk_cnt, start;
+	int res;
+
+	hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n",
+		inode->i_ino, (long long)hip->phys_size, inode->i_size);
+
+	if (inode->i_size > hip->phys_size) {
+		struct address_space *mapping = inode->i_mapping;
+		struct page *page;
+		void *fsdata;
+		loff_t size = inode->i_size;
+
+		res = pagecache_write_begin(NULL, mapping, size, 0,
+						AOP_FLAG_UNINTERRUPTIBLE,
+						&page, &fsdata);
+		if (res)
+			return;
+		res = pagecache_write_end(NULL, mapping, size,
+			0, 0, page, fsdata);
+		if (res < 0)
+			return;
+		mark_inode_dirty(inode);
+		return;
+	} else if (inode->i_size == hip->phys_size)
+		return;
+
+	blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
+			HFSPLUS_SB(sb)->alloc_blksz_shift;
+
+	mutex_lock(&hip->extents_lock);
+
+	alloc_cnt = hip->alloc_blocks;
+	if (blk_cnt == alloc_cnt)
+		goto out_unlock;
+
+	res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
+	if (res) {
+		mutex_unlock(&hip->extents_lock);
+		/* XXX: We lack error handling of hfsplus_file_truncate() */
+		return;
+	}
+	while (1) {
+		if (alloc_cnt == hip->first_blocks) {
+			hfsplus_free_extents(sb, hip->first_extents,
+					     alloc_cnt, alloc_cnt - blk_cnt);
+			hfsplus_dump_extent(hip->first_extents);
+			hip->first_blocks = blk_cnt;
+			break;
+		}
+		res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
+		if (res)
+			break;
+		start = hip->cached_start;
+		hfsplus_free_extents(sb, hip->cached_extents,
+				     alloc_cnt - start, alloc_cnt - blk_cnt);
+		hfsplus_dump_extent(hip->cached_extents);
+		if (blk_cnt > start) {
+			hip->extent_state |= HFSPLUS_EXT_DIRTY;
+			break;
+		}
+		alloc_cnt = start;
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
+		hfs_brec_remove(&fd);
+	}
+	hfs_find_exit(&fd);
+
+	hip->alloc_blocks = blk_cnt;
+out_unlock:
+	mutex_unlock(&hip->extents_lock);
+	hip->phys_size = inode->i_size;
+	hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
+		sb->s_blocksize_bits;
+	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
+	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
+}
diff --git a/kernel/fs/hfsplus/hfsplus_fs.h b/kernel/fs/hfsplus/hfsplus_fs.h
new file mode 100644
index 000000000..b0441d65f
--- /dev/null
+++ b/kernel/fs/hfsplus/hfsplus_fs.h
@@ -0,0 +1,540 @@
+/*
+ *  linux/include/linux/hfsplus_fs.h
+ *
+ * Copyright (C) 1999
+ * Brad Boyer (flar@pants.nu)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ */
+
+#ifndef _LINUX_HFSPLUS_FS_H
+#define _LINUX_HFSPLUS_FS_H
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include "hfsplus_raw.h"
+
+#define DBG_BNODE_REFS	0x00000001
+#define DBG_BNODE_MOD	0x00000002
+#define DBG_CAT_MOD	0x00000004
+#define DBG_INODE	0x00000008
+#define DBG_SUPER	0x00000010
+#define DBG_EXTENT	0x00000020
+#define DBG_BITMAP	0x00000040
+#define DBG_ATTR_MOD	0x00000080
+#define DBG_ACL_MOD	0x00000100
+
+#if 0
+#define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
+#define DBG_MASK	(DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
+#define DBG_MASK	(DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
+#endif
+#define DBG_MASK	(0)
+
+#define hfs_dbg(flg, fmt, ...)					\
+do {								\
+	if (DBG_##flg & DBG_MASK)				\
+		printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__);	\
+} while (0)
+
+#define hfs_dbg_cont(flg, fmt, ...)				\
+do {								\
+	if (DBG_##flg & DBG_MASK)				\
+		pr_cont(fmt, ##__VA_ARGS__);			\
+} while (0)
+
+/* Runtime config options */
+#define HFSPLUS_DEF_CR_TYPE    0x3F3F3F3F  /* '????' */
+
+#define HFSPLUS_TYPE_DATA 0x00
+#define HFSPLUS_TYPE_RSRC 0xFF
+
+typedef int (*btree_keycmp)(const hfsplus_btree_key *,
+		const hfsplus_btree_key *);
+
+#define NODE_HASH_SIZE	256
+
+/* B-tree mutex nested subclasses */
+enum hfsplus_btree_mutex_classes {
+	CATALOG_BTREE_MUTEX,
+	EXTENTS_BTREE_MUTEX,
+	ATTR_BTREE_MUTEX,
+};
+
+/* An HFS+ BTree held in memory */
+struct hfs_btree {
+	struct super_block *sb;
+	struct inode *inode;
+	btree_keycmp keycmp;
+
+	u32 cnid;
+	u32 root;
+	u32 leaf_count;
+	u32 leaf_head;
+	u32 leaf_tail;
+	u32 node_count;
+	u32 free_nodes;
+	u32 attributes;
+
+	unsigned int node_size;
+	unsigned int node_size_shift;
+	unsigned int max_key_len;
+	unsigned int depth;
+
+	struct mutex tree_lock;
+
+	unsigned int pages_per_bnode;
+	spinlock_t hash_lock;
+	struct hfs_bnode *node_hash[NODE_HASH_SIZE];
+	int node_hash_cnt;
+};
+
+struct page;
+
+/* An HFS+ BTree node in memory */
+struct hfs_bnode {
+	struct hfs_btree *tree;
+
+	u32 prev;
+	u32 this;
+	u32 next;
+	u32 parent;
+
+	u16 num_recs;
+	u8 type;
+	u8 height;
+
+	struct hfs_bnode *next_hash;
+	unsigned long flags;
+	wait_queue_head_t lock_wq;
+	atomic_t refcnt;
+	unsigned int page_offset;
+	struct page *page[0];
+};
+
+#define HFS_BNODE_LOCK		0
+#define HFS_BNODE_ERROR		1
+#define HFS_BNODE_NEW		2
+#define HFS_BNODE_DIRTY		3
+#define HFS_BNODE_DELETED	4
+
+/*
+ * Attributes file states
+ */
+#define HFSPLUS_EMPTY_ATTR_TREE		0
+#define HFSPLUS_CREATING_ATTR_TREE	1
+#define HFSPLUS_VALID_ATTR_TREE		2
+#define HFSPLUS_FAILED_ATTR_TREE	3
+
+/*
+ * HFS+ superblock info (built from Volume Header on disk)
+ */
+
+struct hfsplus_vh;
+struct hfs_btree;
+
+struct hfsplus_sb_info {
+	void *s_vhdr_buf;
+	struct hfsplus_vh *s_vhdr;
+	void *s_backup_vhdr_buf;
+	struct hfsplus_vh *s_backup_vhdr;
+	struct hfs_btree *ext_tree;
+	struct hfs_btree *cat_tree;
+	struct hfs_btree *attr_tree;
+	atomic_t attr_tree_state;
+	struct inode *alloc_file;
+	struct inode *hidden_dir;
+	struct nls_table *nls;
+
+	/* Runtime variables */
+	u32 blockoffset;
+	sector_t part_start;
+	sector_t sect_count;
+	int fs_shift;
+
+	/* immutable data from the volume header */
+	u32 alloc_blksz;
+	int alloc_blksz_shift;
+	u32 total_blocks;
+	u32 data_clump_blocks, rsrc_clump_blocks;
+
+	/* mutable data from the volume header, protected by alloc_mutex */
+	u32 free_blocks;
+	struct mutex alloc_mutex;
+
+	/* mutable data from the volume header, protected by vh_mutex */
+	u32 next_cnid;
+	u32 file_count;
+	u32 folder_count;
+	struct mutex vh_mutex;
+
+	/* Config options */
+	u32 creator;
+	u32 type;
+
+	umode_t umask;
+	kuid_t uid;
+	kgid_t gid;
+
+	int part, session;
+	unsigned long flags;
+
+	int work_queued;               /* non-zero delayed work is queued */
+	struct delayed_work sync_work; /* FS sync delayed work */
+	spinlock_t work_lock;          /* protects sync_work and work_queued */
+};
+
+#define HFSPLUS_SB_WRITEBACKUP	0
+#define HFSPLUS_SB_NODECOMPOSE	1
+#define HFSPLUS_SB_FORCE	2
+#define HFSPLUS_SB_HFSX		3
+#define HFSPLUS_SB_CASEFOLD	4
+#define HFSPLUS_SB_NOBARRIER	5
+
+static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+
+struct hfsplus_inode_info {
+	atomic_t opencnt;
+
+	/*
+	 * Extent allocation information, protected by extents_lock.
+	 */
+	u32 first_blocks;
+	u32 clump_blocks;
+	u32 alloc_blocks;
+	u32 cached_start;
+	u32 cached_blocks;
+	hfsplus_extent_rec first_extents;
+	hfsplus_extent_rec cached_extents;
+	unsigned int extent_state;
+	struct mutex extents_lock;
+
+	/*
+	 * Immutable data.
+	 */
+	struct inode *rsrc_inode;
+	__be32 create_date;
+
+	/*
+	 * Protected by sbi->vh_mutex.
+	 */
+	u32 linkid;
+
+	/*
+	 * Accessed using atomic bitops.
+	 */
+	unsigned long flags;
+
+	/*
+	 * Protected by i_mutex.
+	 */
+	sector_t fs_blocks;
+	u8 userflags;		/* BSD user file flags */
+	u32 subfolders;		/* Subfolder count (HFSX only) */
+	struct list_head open_dir_list;
+	loff_t phys_size;
+
+	struct inode vfs_inode;
+};
+
+#define HFSPLUS_EXT_DIRTY	0x0001
+#define HFSPLUS_EXT_NEW		0x0002
+
+#define HFSPLUS_I_RSRC		0	/* represents a resource fork */
+#define HFSPLUS_I_CAT_DIRTY	1	/* has changes in the catalog tree */
+#define HFSPLUS_I_EXT_DIRTY	2	/* has changes in the extent tree */
+#define HFSPLUS_I_ALLOC_DIRTY	3	/* has changes in the allocation file */
+#define HFSPLUS_I_ATTR_DIRTY	4	/* has changes in the attributes tree */
+
+#define HFSPLUS_IS_RSRC(inode) \
+	test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
+
+static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
+{
+	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
+}
+
+/*
+ * Mark an inode dirty, and also mark the btree in which the
+ * specific type of metadata is stored.
+ * For data or metadata that gets written back by into the catalog btree
+ * by hfsplus_write_inode a plain mark_inode_dirty call is enough.
+ */
+static inline void hfsplus_mark_inode_dirty(struct inode *inode,
+		unsigned int flag)
+{
+	set_bit(flag, &HFSPLUS_I(inode)->flags);
+	mark_inode_dirty(inode);
+}
+
+struct hfs_find_data {
+	/* filled by caller */
+	hfsplus_btree_key *search_key;
+	hfsplus_btree_key *key;
+	/* filled by find */
+	struct hfs_btree *tree;
+	struct hfs_bnode *bnode;
+	/* filled by findrec */
+	int record;
+	int keyoffset, keylength;
+	int entryoffset, entrylength;
+};
+
+struct hfsplus_readdir_data {
+	struct list_head list;
+	struct file *file;
+	struct hfsplus_cat_key key;
+};
+
+/*
+ * Find minimum acceptible I/O size for an hfsplus sb.
+ */
+static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
+{
+	return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev),
+		     HFSPLUS_SECTOR_SIZE);
+}
+
+#define hfs_btree_open hfsplus_btree_open
+#define hfs_btree_close hfsplus_btree_close
+#define hfs_btree_write hfsplus_btree_write
+#define hfs_bmap_alloc hfsplus_bmap_alloc
+#define hfs_bmap_free hfsplus_bmap_free
+#define hfs_bnode_read hfsplus_bnode_read
+#define hfs_bnode_read_u16 hfsplus_bnode_read_u16
+#define hfs_bnode_read_u8 hfsplus_bnode_read_u8
+#define hfs_bnode_read_key hfsplus_bnode_read_key
+#define hfs_bnode_write hfsplus_bnode_write
+#define hfs_bnode_write_u16 hfsplus_bnode_write_u16
+#define hfs_bnode_clear hfsplus_bnode_clear
+#define hfs_bnode_copy hfsplus_bnode_copy
+#define hfs_bnode_move hfsplus_bnode_move
+#define hfs_bnode_dump hfsplus_bnode_dump
+#define hfs_bnode_unlink hfsplus_bnode_unlink
+#define hfs_bnode_findhash hfsplus_bnode_findhash
+#define hfs_bnode_find hfsplus_bnode_find
+#define hfs_bnode_unhash hfsplus_bnode_unhash
+#define hfs_bnode_free hfsplus_bnode_free
+#define hfs_bnode_create hfsplus_bnode_create
+#define hfs_bnode_get hfsplus_bnode_get
+#define hfs_bnode_put hfsplus_bnode_put
+#define hfs_brec_lenoff hfsplus_brec_lenoff
+#define hfs_brec_keylen hfsplus_brec_keylen
+#define hfs_brec_insert hfsplus_brec_insert
+#define hfs_brec_remove hfsplus_brec_remove
+#define hfs_find_init hfsplus_find_init
+#define hfs_find_exit hfsplus_find_exit
+#define __hfs_brec_find __hfsplus_brec_find
+#define hfs_brec_find hfsplus_brec_find
+#define hfs_brec_read hfsplus_brec_read
+#define hfs_brec_goto hfsplus_brec_goto
+#define hfs_part_find hfsplus_part_find
+
+/*
+ * definitions for ext2 flag ioctls (linux really needs a generic
+ * interface for this).
+ */
+
+/* ext2 ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) to support
+ * chattr/lsattr */
+#define HFSPLUS_IOC_EXT2_GETFLAGS	FS_IOC_GETFLAGS
+#define HFSPLUS_IOC_EXT2_SETFLAGS	FS_IOC_SETFLAGS
+
+
+/*
+ * hfs+-specific ioctl for making the filesystem bootable
+ */
+#define HFSPLUS_IOC_BLESS _IO('h', 0x80)
+
+typedef int (*search_strategy_t)(struct hfs_bnode *,
+				struct hfs_find_data *,
+				int *, int *, int *);
+
+/*
+ * Functions in any *.c used in other files
+ */
+
+/* attributes.c */
+int __init hfsplus_create_attr_tree_cache(void);
+void hfsplus_destroy_attr_tree_cache(void);
+int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1,
+			     const hfsplus_btree_key *k2);
+int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
+			   u32 cnid, const char *name);
+hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
+void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry);
+int hfsplus_find_attr(struct super_block *sb, u32 cnid, const char *name,
+		      struct hfs_find_data *fd);
+int hfsplus_attr_exists(struct inode *inode, const char *name);
+int hfsplus_create_attr(struct inode *inode, const char *name,
+			const void *value, size_t size);
+int hfsplus_delete_attr(struct inode *inode, const char *name);
+int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid);
+
+/* bitmap.c */
+int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset,
+			   u32 *max);
+int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count);
+
+/* btree.c */
+u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors,
+				  int file_id);
+struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id);
+void hfs_btree_close(struct hfs_btree *tree);
+int hfs_btree_write(struct hfs_btree *tree);
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree);
+void hfs_bmap_free(struct hfs_bnode *node);
+
+/* bnode.c */
+void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len);
+u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off);
+u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off);
+void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off);
+void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len);
+void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data);
+void hfs_bnode_clear(struct hfs_bnode *node, int off, int len);
+void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
+		    struct hfs_bnode *src_node, int src, int len);
+void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len);
+void hfs_bnode_dump(struct hfs_bnode *node);
+void hfs_bnode_unlink(struct hfs_bnode *node);
+struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid);
+void hfs_bnode_unhash(struct hfs_bnode *node);
+struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num);
+void hfs_bnode_free(struct hfs_bnode *node);
+struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num);
+void hfs_bnode_get(struct hfs_bnode *node);
+void hfs_bnode_put(struct hfs_bnode *node);
+bool hfs_bnode_need_zeroout(struct hfs_btree *tree);
+
+/* brec.c */
+u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off);
+u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec);
+int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len);
+int hfs_brec_remove(struct hfs_find_data *fd);
+
+/* bfind.c */
+int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd);
+void hfs_find_exit(struct hfs_find_data *fd);
+int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+			     int *begin, int *end, int *cur_rec);
+int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+			int *begin, int *end, int *cur_rec);
+int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+		    search_strategy_t rec_found);
+int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare);
+int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len);
+int hfs_brec_goto(struct hfs_find_data *fd, int cnt);
+
+/* catalog.c */
+int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1,
+			     const hfsplus_btree_key *k2);
+int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
+			    const hfsplus_btree_key *k2);
+int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
+			   u32 parent, struct qstr *str);
+void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
+				     hfsplus_btree_key *key, u32 parent);
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
+int hfsplus_find_cat(struct super_block *sb, u32 cnid,
+		     struct hfs_find_data *fd);
+int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str,
+		       struct inode *inode);
+int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str);
+int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name,
+		       struct inode *dst_dir, struct qstr *dst_name);
+
+/* dir.c */
+extern const struct inode_operations hfsplus_dir_inode_operations;
+extern const struct file_operations hfsplus_dir_operations;
+
+/* extents.c */
+int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1,
+			const hfsplus_btree_key *k2);
+int hfsplus_ext_write_extent(struct inode *inode);
+int hfsplus_get_block(struct inode *inode, sector_t iblock,
+		      struct buffer_head *bh_result, int create);
+int hfsplus_free_fork(struct super_block *sb, u32 cnid,
+		      struct hfsplus_fork_raw *fork, int type);
+int hfsplus_file_extend(struct inode *inode, bool zeroout);
+void hfsplus_file_truncate(struct inode *inode);
+
+/* inode.c */
+extern const struct address_space_operations hfsplus_aops;
+extern const struct address_space_operations hfsplus_btree_aops;
+extern const struct dentry_operations hfsplus_dentry_operations;
+
+struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode);
+void hfsplus_delete_inode(struct inode *inode);
+void hfsplus_inode_read_fork(struct inode *inode,
+			     struct hfsplus_fork_raw *fork);
+void hfsplus_inode_write_fork(struct inode *inode,
+			      struct hfsplus_fork_raw *fork);
+int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd);
+int hfsplus_cat_write_inode(struct inode *inode);
+int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
+		       int datasync);
+
+/* ioctl.c */
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+
+/* options.c */
+void hfsplus_fill_defaults(struct hfsplus_sb_info *opts);
+int hfsplus_parse_options_remount(char *input, int *force);
+int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi);
+int hfsplus_show_options(struct seq_file *seq, struct dentry *root);
+
+/* part_tbl.c */
+int hfs_part_find(struct super_block *sb, sector_t *part_start,
+		  sector_t *part_size);
+
+/* super.c */
+struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino);
+void hfsplus_mark_mdb_dirty(struct super_block *sb);
+
+/* tables.c */
+extern u16 hfsplus_case_fold_table[];
+extern u16 hfsplus_decompose_table[];
+extern u16 hfsplus_compose_table[];
+
+/* unicode.c */
+int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
+		       const struct hfsplus_unistr *s2);
+int hfsplus_strcmp(const struct hfsplus_unistr *s1,
+		   const struct hfsplus_unistr *s2);
+int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
+		    char *astr, int *len_p);
+int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
+		    int max_unistr_len, const char *astr, int len);
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
+int hfsplus_compare_dentry(const struct dentry *parent,
+			   const struct dentry *dentry, unsigned int len,
+			   const char *str, const struct qstr *name);
+
+/* wrapper.c */
+int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf,
+		       void **data, int rw);
+int hfsplus_read_wrapper(struct super_block *sb);
+
+/* time macros */
+#define __hfsp_mt2ut(t)		(be32_to_cpu(t) - 2082844800U)
+#define __hfsp_ut2mt(t)		(cpu_to_be32(t + 2082844800U))
+
+/* compatibility */
+#define hfsp_mt2ut(t)		(struct timespec){ .tv_sec = __hfsp_mt2ut(t) }
+#define hfsp_ut2mt(t)		__hfsp_ut2mt((t).tv_sec)
+#define hfsp_now2mt()		__hfsp_ut2mt(get_seconds())
+
+#endif
diff --git a/kernel/fs/hfsplus/hfsplus_raw.h b/kernel/fs/hfsplus/hfsplus_raw.h
new file mode 100644
index 000000000..8298d0985
--- /dev/null
+++ b/kernel/fs/hfsplus/hfsplus_raw.h
@@ -0,0 +1,407 @@
+/*
+ *  linux/include/linux/hfsplus_raw.h
+ *
+ * Copyright (C) 1999
+ * Brad Boyer (flar@pants.nu)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Format of structures on disk
+ * Information taken from Apple Technote #1150 (HFS Plus Volume Format)
+ *
+ */
+
+#ifndef _LINUX_HFSPLUS_RAW_H
+#define _LINUX_HFSPLUS_RAW_H
+
+#include <linux/types.h>
+
+/* Some constants */
+#define HFSPLUS_SECTOR_SIZE        512
+#define HFSPLUS_SECTOR_SHIFT         9
+#define HFSPLUS_VOLHEAD_SECTOR       2
+#define HFSPLUS_VOLHEAD_SIG     0x482b
+#define HFSPLUS_VOLHEAD_SIGX    0x4858
+#define HFSPLUS_SUPER_MAGIC     0x482b
+#define HFSPLUS_MIN_VERSION          4
+#define HFSPLUS_CURRENT_VERSION      5
+
+#define HFSP_WRAP_MAGIC         0x4244
+#define HFSP_WRAP_ATTRIB_SLOCK  0x8000
+#define HFSP_WRAP_ATTRIB_SPARED 0x0200
+
+#define HFSP_WRAPOFF_SIG          0x00
+#define HFSP_WRAPOFF_ATTRIB       0x0A
+#define HFSP_WRAPOFF_ABLKSIZE     0x14
+#define HFSP_WRAPOFF_ABLKSTART    0x1C
+#define HFSP_WRAPOFF_EMBEDSIG     0x7C
+#define HFSP_WRAPOFF_EMBEDEXT     0x7E
+
+#define HFSP_HIDDENDIR_NAME \
+	"\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
+
+#define HFSP_HARDLINK_TYPE	0x686c6e6b	/* 'hlnk' */
+#define HFSP_HFSPLUS_CREATOR	0x6866732b	/* 'hfs+' */
+
+#define HFSP_SYMLINK_TYPE	0x736c6e6b	/* 'slnk' */
+#define HFSP_SYMLINK_CREATOR	0x72686170	/* 'rhap' */
+
+#define HFSP_MOUNT_VERSION	0x482b4c78	/* 'H+Lx' */
+
+/* Structures used on disk */
+
+typedef __be32 hfsplus_cnid;
+typedef __be16 hfsplus_unichr;
+
+#define HFSPLUS_MAX_STRLEN 255
+#define HFSPLUS_ATTR_MAX_STRLEN 127
+
+/* A "string" as used in filenames, etc. */
+struct hfsplus_unistr {
+	__be16 length;
+	hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN];
+} __packed;
+
+/*
+ * A "string" is used in attributes file
+ * for name of extended attribute
+ */
+struct hfsplus_attr_unistr {
+	__be16 length;
+	hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN];
+} __packed;
+
+/* POSIX permissions */
+struct hfsplus_perm {
+	__be32 owner;
+	__be32 group;
+	u8  rootflags;
+	u8  userflags;
+	__be16 mode;
+	__be32 dev;
+} __packed;
+
+#define HFSPLUS_FLG_NODUMP	0x01
+#define HFSPLUS_FLG_IMMUTABLE	0x02
+#define HFSPLUS_FLG_APPEND	0x04
+
+/* A single contiguous area of a file */
+struct hfsplus_extent {
+	__be32 start_block;
+	__be32 block_count;
+} __packed;
+typedef struct hfsplus_extent hfsplus_extent_rec[8];
+
+/* Information for a "Fork" in a file */
+struct hfsplus_fork_raw {
+	__be64 total_size;
+	__be32 clump_size;
+	__be32 total_blocks;
+	hfsplus_extent_rec extents;
+} __packed;
+
+/* HFS+ Volume Header */
+struct hfsplus_vh {
+	__be16 signature;
+	__be16 version;
+	__be32 attributes;
+	__be32 last_mount_vers;
+	u32 reserved;
+
+	__be32 create_date;
+	__be32 modify_date;
+	__be32 backup_date;
+	__be32 checked_date;
+
+	__be32 file_count;
+	__be32 folder_count;
+
+	__be32 blocksize;
+	__be32 total_blocks;
+	__be32 free_blocks;
+
+	__be32 next_alloc;
+	__be32 rsrc_clump_sz;
+	__be32 data_clump_sz;
+	hfsplus_cnid next_cnid;
+
+	__be32 write_count;
+	__be64 encodings_bmp;
+
+	u32 finder_info[8];
+
+	struct hfsplus_fork_raw alloc_file;
+	struct hfsplus_fork_raw ext_file;
+	struct hfsplus_fork_raw cat_file;
+	struct hfsplus_fork_raw attr_file;
+	struct hfsplus_fork_raw start_file;
+} __packed;
+
+/* HFS+ volume attributes */
+#define HFSPLUS_VOL_UNMNT		(1 << 8)
+#define HFSPLUS_VOL_SPARE_BLK		(1 << 9)
+#define HFSPLUS_VOL_NOCACHE		(1 << 10)
+#define HFSPLUS_VOL_INCNSTNT		(1 << 11)
+#define HFSPLUS_VOL_NODEID_REUSED	(1 << 12)
+#define HFSPLUS_VOL_JOURNALED		(1 << 13)
+#define HFSPLUS_VOL_SOFTLOCK		(1 << 15)
+#define HFSPLUS_VOL_UNUSED_NODE_FIX	(1 << 31)
+
+/* HFS+ BTree node descriptor */
+struct hfs_bnode_desc {
+	__be32 next;
+	__be32 prev;
+	s8 type;
+	u8 height;
+	__be16 num_recs;
+	u16 reserved;
+} __packed;
+
+/* HFS+ BTree node types */
+#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
+#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
+#define HFS_NODE_MAP	0x02	/* Holds part of the bitmap of used nodes */
+#define HFS_NODE_LEAF	0xFF	/* A leaf (ndNHeight==1) node */
+
+/* HFS+ BTree header */
+struct hfs_btree_header_rec {
+	__be16 depth;
+	__be32 root;
+	__be32 leaf_count;
+	__be32 leaf_head;
+	__be32 leaf_tail;
+	__be16 node_size;
+	__be16 max_key_len;
+	__be32 node_count;
+	__be32 free_nodes;
+	u16 reserved1;
+	__be32 clump_size;
+	u8 btree_type;
+	u8 key_type;
+	__be32 attributes;
+	u32 reserved3[16];
+} __packed;
+
+/* BTree attributes */
+#define HFS_TREE_BIGKEYS	2
+#define HFS_TREE_VARIDXKEYS	4
+
+/* HFS+ BTree misc info */
+#define HFSPLUS_TREE_HEAD 0
+#define HFSPLUS_NODE_MXSZ 32768
+#define HFSPLUS_ATTR_TREE_NODE_SIZE		8192
+#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT	3
+#define HFSPLUS_BTREE_HDR_USER_BYTES		128
+
+/* Some special File ID numbers (stolen from hfs.h) */
+#define HFSPLUS_POR_CNID		1	/* Parent Of the Root */
+#define HFSPLUS_ROOT_CNID		2	/* ROOT directory */
+#define HFSPLUS_EXT_CNID		3	/* EXTents B-tree */
+#define HFSPLUS_CAT_CNID		4	/* CATalog B-tree */
+#define HFSPLUS_BAD_CNID		5	/* BAD blocks file */
+#define HFSPLUS_ALLOC_CNID		6	/* ALLOCation file */
+#define HFSPLUS_START_CNID		7	/* STARTup file */
+#define HFSPLUS_ATTR_CNID		8	/* ATTRibutes file */
+#define HFSPLUS_EXCH_CNID		15	/* ExchangeFiles temp id */
+#define HFSPLUS_FIRSTUSER_CNID		16	/* first available user id */
+
+/* btree key type */
+#define HFSPLUS_KEY_CASEFOLDING		0xCF	/* case-insensitive */
+#define HFSPLUS_KEY_BINARY		0xBC	/* case-sensitive */
+
+/* HFS+ catalog entry key */
+struct hfsplus_cat_key {
+	__be16 key_len;
+	hfsplus_cnid parent;
+	struct hfsplus_unistr name;
+} __packed;
+
+#define HFSPLUS_CAT_KEYLEN	(sizeof(struct hfsplus_cat_key))
+
+/* Structs from hfs.h */
+struct hfsp_point {
+	__be16 v;
+	__be16 h;
+} __packed;
+
+struct hfsp_rect {
+	__be16 top;
+	__be16 left;
+	__be16 bottom;
+	__be16 right;
+} __packed;
+
+
+/* HFS directory info (stolen from hfs.h */
+struct DInfo {
+	struct hfsp_rect frRect;
+	__be16 frFlags;
+	struct hfsp_point frLocation;
+	__be16 frView;
+} __packed;
+
+struct DXInfo {
+	struct hfsp_point frScroll;
+	__be32 frOpenChain;
+	__be16 frUnused;
+	__be16 frComment;
+	__be32 frPutAway;
+} __packed;
+
+/* HFS+ folder data (part of an hfsplus_cat_entry) */
+struct hfsplus_cat_folder {
+	__be16 type;
+	__be16 flags;
+	__be32 valence;
+	hfsplus_cnid id;
+	__be32 create_date;
+	__be32 content_mod_date;
+	__be32 attribute_mod_date;
+	__be32 access_date;
+	__be32 backup_date;
+	struct hfsplus_perm permissions;
+	struct DInfo user_info;
+	struct DXInfo finder_info;
+	__be32 text_encoding;
+	__be32 subfolders;	/* Subfolder count in HFSX. Reserved in HFS+. */
+} __packed;
+
+/* HFS file info (stolen from hfs.h) */
+struct FInfo {
+	__be32 fdType;
+	__be32 fdCreator;
+	__be16 fdFlags;
+	struct hfsp_point fdLocation;
+	__be16 fdFldr;
+} __packed;
+
+struct FXInfo {
+	__be16 fdIconID;
+	u8 fdUnused[8];
+	__be16 fdComment;
+	__be32 fdPutAway;
+} __packed;
+
+/* HFS+ file data (part of a cat_entry) */
+struct hfsplus_cat_file {
+	__be16 type;
+	__be16 flags;
+	u32 reserved1;
+	hfsplus_cnid id;
+	__be32 create_date;
+	__be32 content_mod_date;
+	__be32 attribute_mod_date;
+	__be32 access_date;
+	__be32 backup_date;
+	struct hfsplus_perm permissions;
+	struct FInfo user_info;
+	struct FXInfo finder_info;
+	__be32 text_encoding;
+	u32 reserved2;
+
+	struct hfsplus_fork_raw data_fork;
+	struct hfsplus_fork_raw rsrc_fork;
+} __packed;
+
+/* File and folder flag bits */
+#define HFSPLUS_FILE_LOCKED		0x0001
+#define HFSPLUS_FILE_THREAD_EXISTS	0x0002
+#define HFSPLUS_XATTR_EXISTS		0x0004
+#define HFSPLUS_ACL_EXISTS		0x0008
+#define HFSPLUS_HAS_FOLDER_COUNT	0x0010	/* Folder has subfolder count
+						 * (HFSX only) */
+
+/* HFS+ catalog thread (part of a cat_entry) */
+struct hfsplus_cat_thread {
+	__be16 type;
+	s16 reserved;
+	hfsplus_cnid parentID;
+	struct hfsplus_unistr nodeName;
+} __packed;
+
+#define HFSPLUS_MIN_THREAD_SZ 10
+
+/* A data record in the catalog tree */
+typedef union {
+	__be16 type;
+	struct hfsplus_cat_folder folder;
+	struct hfsplus_cat_file file;
+	struct hfsplus_cat_thread thread;
+} __packed hfsplus_cat_entry;
+
+/* HFS+ catalog entry type */
+#define HFSPLUS_FOLDER         0x0001
+#define HFSPLUS_FILE           0x0002
+#define HFSPLUS_FOLDER_THREAD  0x0003
+#define HFSPLUS_FILE_THREAD    0x0004
+
+/* HFS+ extents tree key */
+struct hfsplus_ext_key {
+	__be16 key_len;
+	u8 fork_type;
+	u8 pad;
+	hfsplus_cnid cnid;
+	__be32 start_block;
+} __packed;
+
+#define HFSPLUS_EXT_KEYLEN	sizeof(struct hfsplus_ext_key)
+
+#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo"
+#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security"
+
+#define HFSPLUS_ATTR_INLINE_DATA 0x10
+#define HFSPLUS_ATTR_FORK_DATA   0x20
+#define HFSPLUS_ATTR_EXTENTS     0x30
+
+/* HFS+ attributes tree key */
+struct hfsplus_attr_key {
+	__be16 key_len;
+	__be16 pad;
+	hfsplus_cnid cnid;
+	__be32 start_block;
+	struct hfsplus_attr_unistr key_name;
+} __packed;
+
+#define HFSPLUS_ATTR_KEYLEN	sizeof(struct hfsplus_attr_key)
+
+/* HFS+ fork data attribute */
+struct hfsplus_attr_fork_data {
+	__be32 record_type;
+	__be32 reserved;
+	struct hfsplus_fork_raw the_fork;
+} __packed;
+
+/* HFS+ extension attribute */
+struct hfsplus_attr_extents {
+	__be32 record_type;
+	__be32 reserved;
+	struct hfsplus_extent extents;
+} __packed;
+
+#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802
+
+/* HFS+ attribute inline data */
+struct hfsplus_attr_inline_data {
+	__be32 record_type;
+	__be32 reserved1;
+	u8 reserved2[6];
+	__be16 length;
+	u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE];
+} __packed;
+
+/* A data record in the attributes tree */
+typedef union {
+	__be32 record_type;
+	struct hfsplus_attr_fork_data fork_data;
+	struct hfsplus_attr_extents extents;
+	struct hfsplus_attr_inline_data inline_data;
+} __packed hfsplus_attr_entry;
+
+/* HFS+ generic BTree key */
+typedef union {
+	__be16 key_len;
+	struct hfsplus_cat_key cat;
+	struct hfsplus_ext_key ext;
+	struct hfsplus_attr_key attr;
+} __packed hfsplus_btree_key;
+
+#endif
diff --git a/kernel/fs/hfsplus/inode.c b/kernel/fs/hfsplus/inode.c
new file mode 100644
index 000000000..6dd107d74
--- /dev/null
+++ b/kernel/fs/hfsplus/inode.c
@@ -0,0 +1,617 @@
+/*
+ *  linux/fs/hfsplus/inode.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Inode handling routines
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mpage.h>
+#include <linux/sched.h>
+#include <linux/uio.h>
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+#include "xattr.h"
+#include "acl.h"
+
+static int hfsplus_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, hfsplus_get_block);
+}
+
+static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, hfsplus_get_block, wbc);
+}
+
+static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		hfsplus_file_truncate(inode);
+	}
+}
+
+static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	*pagep = NULL;
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				hfsplus_get_block,
+				&HFSPLUS_I(mapping->host)->phys_size);
+	if (unlikely(ret))
+		hfsplus_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, hfsplus_get_block);
+}
+
+static int hfsplus_releasepage(struct page *page, gfp_t mask)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct hfs_btree *tree;
+	struct hfs_bnode *node;
+	u32 nidx;
+	int i, res = 1;
+
+	switch (inode->i_ino) {
+	case HFSPLUS_EXT_CNID:
+		tree = HFSPLUS_SB(sb)->ext_tree;
+		break;
+	case HFSPLUS_CAT_CNID:
+		tree = HFSPLUS_SB(sb)->cat_tree;
+		break;
+	case HFSPLUS_ATTR_CNID:
+		tree = HFSPLUS_SB(sb)->attr_tree;
+		break;
+	default:
+		BUG();
+		return 0;
+	}
+	if (!tree)
+		return 0;
+	if (tree->node_size >= PAGE_CACHE_SIZE) {
+		nidx = page->index >>
+			(tree->node_size_shift - PAGE_CACHE_SHIFT);
+		spin_lock(&tree->hash_lock);
+		node = hfs_bnode_findhash(tree, nidx);
+		if (!node)
+			;
+		else if (atomic_read(&node->refcnt))
+			res = 0;
+		if (res && node) {
+			hfs_bnode_unhash(node);
+			hfs_bnode_free(node);
+		}
+		spin_unlock(&tree->hash_lock);
+	} else {
+		nidx = page->index <<
+			(PAGE_CACHE_SHIFT - tree->node_size_shift);
+		i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+		spin_lock(&tree->hash_lock);
+		do {
+			node = hfs_bnode_findhash(tree, nidx++);
+			if (!node)
+				continue;
+			if (atomic_read(&node->refcnt)) {
+				res = 0;
+				break;
+			}
+			hfs_bnode_unhash(node);
+			hfs_bnode_free(node);
+		} while (--i && nidx < tree->node_count);
+		spin_unlock(&tree->hash_lock);
+	}
+	return res ? try_to_free_buffers(page) : 0;
+}
+
+static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+				 loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = file_inode(file)->i_mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + count;
+
+		if (end > isize)
+			hfsplus_write_failed(mapping, end);
+	}
+
+	return ret;
+}
+
+static int hfsplus_writepages(struct address_space *mapping,
+			      struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, hfsplus_get_block);
+}
+
+const struct address_space_operations hfsplus_btree_aops = {
+	.readpage	= hfsplus_readpage,
+	.writepage	= hfsplus_writepage,
+	.write_begin	= hfsplus_write_begin,
+	.write_end	= generic_write_end,
+	.bmap		= hfsplus_bmap,
+	.releasepage	= hfsplus_releasepage,
+};
+
+const struct address_space_operations hfsplus_aops = {
+	.readpage	= hfsplus_readpage,
+	.writepage	= hfsplus_writepage,
+	.write_begin	= hfsplus_write_begin,
+	.write_end	= generic_write_end,
+	.bmap		= hfsplus_bmap,
+	.direct_IO	= hfsplus_direct_IO,
+	.writepages	= hfsplus_writepages,
+};
+
+const struct dentry_operations hfsplus_dentry_operations = {
+	.d_hash       = hfsplus_hash_dentry,
+	.d_compare    = hfsplus_compare_dentry,
+};
+
+static void hfsplus_get_perms(struct inode *inode,
+		struct hfsplus_perm *perms, int dir)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+	u16 mode;
+
+	mode = be16_to_cpu(perms->mode);
+
+	i_uid_write(inode, be32_to_cpu(perms->owner));
+	if (!i_uid_read(inode) && !mode)
+		inode->i_uid = sbi->uid;
+
+	i_gid_write(inode, be32_to_cpu(perms->group));
+	if (!i_gid_read(inode) && !mode)
+		inode->i_gid = sbi->gid;
+
+	if (dir) {
+		mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
+		mode |= S_IFDIR;
+	} else if (!mode)
+		mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
+	inode->i_mode = mode;
+
+	HFSPLUS_I(inode)->userflags = perms->userflags;
+	if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+	if (perms->rootflags & HFSPLUS_FLG_APPEND)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+}
+
+static int hfsplus_file_open(struct inode *inode, struct file *file)
+{
+	if (HFSPLUS_IS_RSRC(inode))
+		inode = HFSPLUS_I(inode)->rsrc_inode;
+	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+		return -EOVERFLOW;
+	atomic_inc(&HFSPLUS_I(inode)->opencnt);
+	return 0;
+}
+
+static int hfsplus_file_release(struct inode *inode, struct file *file)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (HFSPLUS_IS_RSRC(inode))
+		inode = HFSPLUS_I(inode)->rsrc_inode;
+	if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
+		mutex_lock(&inode->i_mutex);
+		hfsplus_file_truncate(inode);
+		if (inode->i_flags & S_DEAD) {
+			hfsplus_delete_cat(inode->i_ino,
+					   HFSPLUS_SB(sb)->hidden_dir, NULL);
+			hfsplus_delete_inode(inode);
+		}
+		mutex_unlock(&inode->i_mutex);
+	}
+	return 0;
+}
+
+static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		inode_dio_wait(inode);
+		if (attr->ia_size > inode->i_size) {
+			error = generic_cont_expand_simple(inode,
+							   attr->ia_size);
+			if (error)
+				return error;
+		}
+		truncate_setsize(inode, attr->ia_size);
+		hfsplus_file_truncate(inode);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	if (attr->ia_valid & ATTR_MODE) {
+		error = posix_acl_chmod(inode, inode->i_mode);
+		if (unlikely(error))
+			return error;
+	}
+
+	return 0;
+}
+
+int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
+		       int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+	int error = 0, error2;
+
+	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (error)
+		return error;
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * Sync inode metadata into the catalog and extent trees.
+	 */
+	sync_inode_metadata(inode, 1);
+
+	/*
+	 * And explicitly write out the btrees.
+	 */
+	if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
+		error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+
+	if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
+		error2 =
+			filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+		if (!error)
+			error = error2;
+	}
+
+	if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) {
+		if (sbi->attr_tree) {
+			error2 =
+				filemap_write_and_wait(
+					    sbi->attr_tree->inode->i_mapping);
+			if (!error)
+				error = error2;
+		} else {
+			pr_err("sync non-existent attributes tree\n");
+		}
+	}
+
+	if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
+		error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
+		if (!error)
+			error = error2;
+	}
+
+	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+
+	mutex_unlock(&inode->i_mutex);
+
+	return error;
+}
+
+static const struct inode_operations hfsplus_file_inode_operations = {
+	.setattr	= hfsplus_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= hfsplus_listxattr,
+	.removexattr	= generic_removexattr,
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+	.get_acl	= hfsplus_get_posix_acl,
+	.set_acl	= hfsplus_set_posix_acl,
+#endif
+};
+
+static const struct file_operations hfsplus_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.splice_read	= generic_file_splice_read,
+	.fsync		= hfsplus_file_fsync,
+	.open		= hfsplus_file_open,
+	.release	= hfsplus_file_release,
+	.unlocked_ioctl = hfsplus_ioctl,
+};
+
+struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct inode *inode = new_inode(sb);
+	struct hfsplus_inode_info *hip;
+
+	if (!inode)
+		return NULL;
+
+	inode->i_ino = sbi->next_cnid++;
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	set_nlink(inode, 1);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+
+	hip = HFSPLUS_I(inode);
+	INIT_LIST_HEAD(&hip->open_dir_list);
+	mutex_init(&hip->extents_lock);
+	atomic_set(&hip->opencnt, 0);
+	hip->extent_state = 0;
+	hip->flags = 0;
+	hip->userflags = 0;
+	hip->subfolders = 0;
+	memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->alloc_blocks = 0;
+	hip->first_blocks = 0;
+	hip->cached_start = 0;
+	hip->cached_blocks = 0;
+	hip->phys_size = 0;
+	hip->fs_blocks = 0;
+	hip->rsrc_inode = NULL;
+	if (S_ISDIR(inode->i_mode)) {
+		inode->i_size = 2;
+		sbi->folder_count++;
+		inode->i_op = &hfsplus_dir_inode_operations;
+		inode->i_fop = &hfsplus_dir_operations;
+	} else if (S_ISREG(inode->i_mode)) {
+		sbi->file_count++;
+		inode->i_op = &hfsplus_file_inode_operations;
+		inode->i_fop = &hfsplus_file_operations;
+		inode->i_mapping->a_ops = &hfsplus_aops;
+		hip->clump_blocks = sbi->data_clump_blocks;
+	} else if (S_ISLNK(inode->i_mode)) {
+		sbi->file_count++;
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_mapping->a_ops = &hfsplus_aops;
+		hip->clump_blocks = 1;
+	} else
+		sbi->file_count++;
+	insert_inode_hash(inode);
+	mark_inode_dirty(inode);
+	hfsplus_mark_mdb_dirty(sb);
+
+	return inode;
+}
+
+void hfsplus_delete_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (S_ISDIR(inode->i_mode)) {
+		HFSPLUS_SB(sb)->folder_count--;
+		hfsplus_mark_mdb_dirty(sb);
+		return;
+	}
+	HFSPLUS_SB(sb)->file_count--;
+	if (S_ISREG(inode->i_mode)) {
+		if (!inode->i_nlink) {
+			inode->i_size = 0;
+			hfsplus_file_truncate(inode);
+		}
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_size = 0;
+		hfsplus_file_truncate(inode);
+	}
+	hfsplus_mark_mdb_dirty(sb);
+}
+
+void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	u32 count;
+	int i;
+
+	memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
+	for (count = 0, i = 0; i < 8; i++)
+		count += be32_to_cpu(fork->extents[i].block_count);
+	hip->first_blocks = count;
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->cached_start = 0;
+	hip->cached_blocks = 0;
+
+	hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
+	hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
+	hip->fs_blocks =
+		(inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
+	hip->clump_blocks =
+		be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
+	if (!hip->clump_blocks) {
+		hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
+			sbi->rsrc_clump_blocks :
+			sbi->data_clump_blocks;
+	}
+}
+
+void hfsplus_inode_write_fork(struct inode *inode,
+		struct hfsplus_fork_raw *fork)
+{
+	memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
+	       sizeof(hfsplus_extent_rec));
+	fork->total_size = cpu_to_be64(inode->i_size);
+	fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
+}
+
+int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
+{
+	hfsplus_cat_entry entry;
+	int res = 0;
+	u16 type;
+
+	type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
+
+	HFSPLUS_I(inode)->linkid = 0;
+	if (type == HFSPLUS_FOLDER) {
+		struct hfsplus_cat_folder *folder = &entry.folder;
+
+		if (fd->entrylength < sizeof(struct hfsplus_cat_folder))
+			/* panic? */;
+		hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
+					sizeof(struct hfsplus_cat_folder));
+		hfsplus_get_perms(inode, &folder->permissions, 1);
+		set_nlink(inode, 1);
+		inode->i_size = 2 + be32_to_cpu(folder->valence);
+		inode->i_atime = hfsp_mt2ut(folder->access_date);
+		inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
+		inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
+		HFSPLUS_I(inode)->create_date = folder->create_date;
+		HFSPLUS_I(inode)->fs_blocks = 0;
+		if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
+			HFSPLUS_I(inode)->subfolders =
+				be32_to_cpu(folder->subfolders);
+		}
+		inode->i_op = &hfsplus_dir_inode_operations;
+		inode->i_fop = &hfsplus_dir_operations;
+	} else if (type == HFSPLUS_FILE) {
+		struct hfsplus_cat_file *file = &entry.file;
+
+		if (fd->entrylength < sizeof(struct hfsplus_cat_file))
+			/* panic? */;
+		hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
+					sizeof(struct hfsplus_cat_file));
+
+		hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
+					&file->rsrc_fork : &file->data_fork);
+		hfsplus_get_perms(inode, &file->permissions, 0);
+		set_nlink(inode, 1);
+		if (S_ISREG(inode->i_mode)) {
+			if (file->permissions.dev)
+				set_nlink(inode,
+					  be32_to_cpu(file->permissions.dev));
+			inode->i_op = &hfsplus_file_inode_operations;
+			inode->i_fop = &hfsplus_file_operations;
+			inode->i_mapping->a_ops = &hfsplus_aops;
+		} else if (S_ISLNK(inode->i_mode)) {
+			inode->i_op = &page_symlink_inode_operations;
+			inode->i_mapping->a_ops = &hfsplus_aops;
+		} else {
+			init_special_inode(inode, inode->i_mode,
+					   be32_to_cpu(file->permissions.dev));
+		}
+		inode->i_atime = hfsp_mt2ut(file->access_date);
+		inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
+		inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
+		HFSPLUS_I(inode)->create_date = file->create_date;
+	} else {
+		pr_err("bad catalog entry used to create inode\n");
+		res = -EIO;
+	}
+	return res;
+}
+
+int hfsplus_cat_write_inode(struct inode *inode)
+{
+	struct inode *main_inode = inode;
+	struct hfs_find_data fd;
+	hfsplus_cat_entry entry;
+
+	if (HFSPLUS_IS_RSRC(inode))
+		main_inode = HFSPLUS_I(inode)->rsrc_inode;
+
+	if (!main_inode->i_nlink)
+		return 0;
+
+	if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
+		/* panic? */
+		return -EIO;
+
+	if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd))
+		/* panic? */
+		goto out;
+
+	if (S_ISDIR(main_inode->i_mode)) {
+		struct hfsplus_cat_folder *folder = &entry.folder;
+
+		if (fd.entrylength < sizeof(struct hfsplus_cat_folder))
+			/* panic? */;
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+					sizeof(struct hfsplus_cat_folder));
+		/* simple node checks? */
+		hfsplus_cat_set_perms(inode, &folder->permissions);
+		folder->access_date = hfsp_ut2mt(inode->i_atime);
+		folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+		folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
+		folder->valence = cpu_to_be32(inode->i_size - 2);
+		if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
+			folder->subfolders =
+				cpu_to_be32(HFSPLUS_I(inode)->subfolders);
+		}
+		hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
+					 sizeof(struct hfsplus_cat_folder));
+	} else if (HFSPLUS_IS_RSRC(inode)) {
+		struct hfsplus_cat_file *file = &entry.file;
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+			       sizeof(struct hfsplus_cat_file));
+		hfsplus_inode_write_fork(inode, &file->rsrc_fork);
+		hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
+				sizeof(struct hfsplus_cat_file));
+	} else {
+		struct hfsplus_cat_file *file = &entry.file;
+
+		if (fd.entrylength < sizeof(struct hfsplus_cat_file))
+			/* panic? */;
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+					sizeof(struct hfsplus_cat_file));
+		hfsplus_inode_write_fork(inode, &file->data_fork);
+		hfsplus_cat_set_perms(inode, &file->permissions);
+		if (HFSPLUS_FLG_IMMUTABLE &
+				(file->permissions.rootflags |
+					file->permissions.userflags))
+			file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
+		else
+			file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
+		file->access_date = hfsp_ut2mt(inode->i_atime);
+		file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+		file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
+		hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
+					 sizeof(struct hfsplus_cat_file));
+	}
+
+	set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
+out:
+	hfs_find_exit(&fd);
+	return 0;
+}
diff --git a/kernel/fs/hfsplus/ioctl.c b/kernel/fs/hfsplus/ioctl.c
new file mode 100644
index 000000000..0624ce4e0
--- /dev/null
+++ b/kernel/fs/hfsplus/ioctl.c
@@ -0,0 +1,150 @@
+/*
+ *  linux/fs/hfsplus/ioctl.c
+ *
+ * Copyright (C) 2003
+ * Ethan Benson <erbenson@alaska.net>
+ * partially derived from linux/fs/ext2/ioctl.c
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * hfsplus ioctls
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+#include "hfsplus_fs.h"
+
+/*
+ * "Blessing" an HFS+ filesystem writes metadata to the superblock informing
+ * the platform firmware which file to boot from
+ */
+static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = d_inode(dentry);
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+	struct hfsplus_vh *vh = sbi->s_vhdr;
+	struct hfsplus_vh *bvh = sbi->s_backup_vhdr;
+	u32 cnid = (unsigned long)dentry->d_fsdata;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&sbi->vh_mutex);
+
+	/* Directory containing the bootable system */
+	vh->finder_info[0] = bvh->finder_info[0] =
+		cpu_to_be32(parent_ino(dentry));
+
+	/*
+	 * Bootloader. Just using the inode here breaks in the case of
+	 * hard links - the firmware wants the ID of the hard link file,
+	 * but the inode points at the indirect inode
+	 */
+	vh->finder_info[1] = bvh->finder_info[1] = cpu_to_be32(cnid);
+
+	/* Per spec, the OS X system folder - same as finder_info[0] here */
+	vh->finder_info[5] = bvh->finder_info[5] =
+		cpu_to_be32(parent_ino(dentry));
+
+	mutex_unlock(&sbi->vh_mutex);
+	return 0;
+}
+
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
+{
+	struct inode *inode = file_inode(file);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	unsigned int flags = 0;
+
+	if (inode->i_flags & S_IMMUTABLE)
+		flags |= FS_IMMUTABLE_FL;
+	if (inode->i_flags & S_APPEND)
+		flags |= FS_APPEND_FL;
+	if (hip->userflags & HFSPLUS_FLG_NODUMP)
+		flags |= FS_NODUMP_FL;
+
+	return put_user(flags, user_flags);
+}
+
+static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
+{
+	struct inode *inode = file_inode(file);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	unsigned int flags, new_fl = 0;
+	int err = 0;
+
+	err = mnt_want_write_file(file);
+	if (err)
+		goto out;
+
+	if (!inode_owner_or_capable(inode)) {
+		err = -EACCES;
+		goto out_drop_write;
+	}
+
+	if (get_user(flags, user_flags)) {
+		err = -EFAULT;
+		goto out_drop_write;
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
+	    inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			err = -EPERM;
+			goto out_unlock_inode;
+		}
+	}
+
+	/* don't silently ignore unsupported ext2 flags */
+	if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+		err = -EOPNOTSUPP;
+		goto out_unlock_inode;
+	}
+
+	if (flags & FS_IMMUTABLE_FL)
+		new_fl |= S_IMMUTABLE;
+
+	if (flags & FS_APPEND_FL)
+		new_fl |= S_APPEND;
+
+	inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND);
+
+	if (flags & FS_NODUMP_FL)
+		hip->userflags |= HFSPLUS_FLG_NODUMP;
+	else
+		hip->userflags &= ~HFSPLUS_FLG_NODUMP;
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+
+out_unlock_inode:
+	mutex_unlock(&inode->i_mutex);
+out_drop_write:
+	mnt_drop_write_file(file);
+out:
+	return err;
+}
+
+long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case HFSPLUS_IOC_EXT2_GETFLAGS:
+		return hfsplus_ioctl_getflags(file, argp);
+	case HFSPLUS_IOC_EXT2_SETFLAGS:
+		return hfsplus_ioctl_setflags(file, argp);
+	case HFSPLUS_IOC_BLESS:
+		return hfsplus_ioctl_bless(file, argp);
+	default:
+		return -ENOTTY;
+	}
+}
diff --git a/kernel/fs/hfsplus/options.c b/kernel/fs/hfsplus/options.c
new file mode 100644
index 000000000..c90b72ee6
--- /dev/null
+++ b/kernel/fs/hfsplus/options.c
@@ -0,0 +1,238 @@
+/*
+ *  linux/fs/hfsplus/options.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Option parsing
+ */
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/parser.h>
+#include <linux/nls.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "hfsplus_fs.h"
+
+enum {
+	opt_creator, opt_type,
+	opt_umask, opt_uid, opt_gid,
+	opt_part, opt_session, opt_nls,
+	opt_nodecompose, opt_decompose,
+	opt_barrier, opt_nobarrier,
+	opt_force, opt_err
+};
+
+static const match_table_t tokens = {
+	{ opt_creator, "creator=%s" },
+	{ opt_type, "type=%s" },
+	{ opt_umask, "umask=%o" },
+	{ opt_uid, "uid=%u" },
+	{ opt_gid, "gid=%u" },
+	{ opt_part, "part=%u" },
+	{ opt_session, "session=%u" },
+	{ opt_nls, "nls=%s" },
+	{ opt_decompose, "decompose" },
+	{ opt_nodecompose, "nodecompose" },
+	{ opt_barrier, "barrier" },
+	{ opt_nobarrier, "nobarrier" },
+	{ opt_force, "force" },
+	{ opt_err, NULL }
+};
+
+/* Initialize an options object to reasonable defaults */
+void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
+{
+	if (!opts)
+		return;
+
+	opts->creator = HFSPLUS_DEF_CR_TYPE;
+	opts->type = HFSPLUS_DEF_CR_TYPE;
+	opts->umask = current_umask();
+	opts->uid = current_uid();
+	opts->gid = current_gid();
+	opts->part = -1;
+	opts->session = -1;
+}
+
+/* convert a "four byte character" to a 32 bit int with error checks */
+static inline int match_fourchar(substring_t *arg, u32 *result)
+{
+	if (arg->to - arg->from != 4)
+		return -EINVAL;
+	memcpy(result, arg->from, 4);
+	return 0;
+}
+
+int hfsplus_parse_options_remount(char *input, int *force)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int token;
+
+	if (!input)
+		return 1;
+
+	while ((p = strsep(&input, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case opt_force:
+			*force = 1;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return 1;
+}
+
+/* Parse options from mount. Returns 0 on failure */
+/* input is the options passed to mount() as a string */
+int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int tmp, token;
+
+	if (!input)
+		goto done;
+
+	while ((p = strsep(&input, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case opt_creator:
+			if (match_fourchar(&args[0], &sbi->creator)) {
+				pr_err("creator requires a 4 character value\n");
+				return 0;
+			}
+			break;
+		case opt_type:
+			if (match_fourchar(&args[0], &sbi->type)) {
+				pr_err("type requires a 4 character value\n");
+				return 0;
+			}
+			break;
+		case opt_umask:
+			if (match_octal(&args[0], &tmp)) {
+				pr_err("umask requires a value\n");
+				return 0;
+			}
+			sbi->umask = (umode_t)tmp;
+			break;
+		case opt_uid:
+			if (match_int(&args[0], &tmp)) {
+				pr_err("uid requires an argument\n");
+				return 0;
+			}
+			sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp);
+			if (!uid_valid(sbi->uid)) {
+				pr_err("invalid uid specified\n");
+				return 0;
+			}
+			break;
+		case opt_gid:
+			if (match_int(&args[0], &tmp)) {
+				pr_err("gid requires an argument\n");
+				return 0;
+			}
+			sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp);
+			if (!gid_valid(sbi->gid)) {
+				pr_err("invalid gid specified\n");
+				return 0;
+			}
+			break;
+		case opt_part:
+			if (match_int(&args[0], &sbi->part)) {
+				pr_err("part requires an argument\n");
+				return 0;
+			}
+			break;
+		case opt_session:
+			if (match_int(&args[0], &sbi->session)) {
+				pr_err("session requires an argument\n");
+				return 0;
+			}
+			break;
+		case opt_nls:
+			if (sbi->nls) {
+				pr_err("unable to change nls mapping\n");
+				return 0;
+			}
+			p = match_strdup(&args[0]);
+			if (p)
+				sbi->nls = load_nls(p);
+			if (!sbi->nls) {
+				pr_err("unable to load nls mapping \"%s\"\n",
+				       p);
+				kfree(p);
+				return 0;
+			}
+			kfree(p);
+			break;
+		case opt_decompose:
+			clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
+			break;
+		case opt_nodecompose:
+			set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
+			break;
+		case opt_barrier:
+			clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+			break;
+		case opt_nobarrier:
+			set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+			break;
+		case opt_force:
+			set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
+			break;
+		default:
+			return 0;
+		}
+	}
+
+done:
+	if (!sbi->nls) {
+		/* try utf8 first, as this is the old default behaviour */
+		sbi->nls = load_nls("utf8");
+		if (!sbi->nls)
+			sbi->nls = load_nls_default();
+		if (!sbi->nls)
+			return 0;
+	}
+
+	return 1;
+}
+
+int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
+
+	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
+		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+	if (sbi->type != HFSPLUS_DEF_CR_TYPE)
+		seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+	seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
+			from_kuid_munged(&init_user_ns, sbi->uid),
+			from_kgid_munged(&init_user_ns, sbi->gid));
+	if (sbi->part >= 0)
+		seq_printf(seq, ",part=%u", sbi->part);
+	if (sbi->session >= 0)
+		seq_printf(seq, ",session=%u", sbi->session);
+	if (sbi->nls)
+		seq_printf(seq, ",nls=%s", sbi->nls->charset);
+	if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
+		seq_puts(seq, ",nodecompose");
+	if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+		seq_puts(seq, ",nobarrier");
+	return 0;
+}
diff --git a/kernel/fs/hfsplus/part_tbl.c b/kernel/fs/hfsplus/part_tbl.c
new file mode 100644
index 000000000..eb355d81e
--- /dev/null
+++ b/kernel/fs/hfsplus/part_tbl.c
@@ -0,0 +1,157 @@
+/*
+ * linux/fs/hfsplus/part_tbl.c
+ *
+ * Copyright (C) 1996-1997  Paul H. Hargrove
+ * This file may be distributed under the terms of
+ * the GNU General Public License.
+ *
+ * Original code to handle the new style Mac partition table based on
+ * a patch contributed by Holger Schemel (aeglos@valinor.owl.de).
+ *
+ * In function preconditions the term "valid" applied to a pointer to
+ * a structure means that the pointer is non-NULL and the structure it
+ * points to has all fields initialized to consistent values.
+ *
+ */
+
+#include <linux/slab.h>
+#include "hfsplus_fs.h"
+
+/* offsets to various blocks */
+#define HFS_DD_BLK		0 /* Driver Descriptor block */
+#define HFS_PMAP_BLK		1 /* First block of partition map */
+#define HFS_MDB_BLK		2 /* Block (w/i partition) of MDB */
+
+/* magic numbers for various disk blocks */
+#define HFS_DRVR_DESC_MAGIC	0x4552 /* "ER": driver descriptor map */
+#define HFS_OLD_PMAP_MAGIC	0x5453 /* "TS": old-type partition map */
+#define HFS_NEW_PMAP_MAGIC	0x504D /* "PM": new-type partition map */
+#define HFS_SUPER_MAGIC		0x4244 /* "BD": HFS MDB (super block) */
+#define HFS_MFS_SUPER_MAGIC	0xD2D7 /* MFS MDB (super block) */
+
+/*
+ * The new style Mac partition map
+ *
+ * For each partition on the media there is a physical block (512-byte
+ * block) containing one of these structures.  These blocks are
+ * contiguous starting at block 1.
+ */
+struct new_pmap {
+	__be16	pmSig;		/* signature */
+	__be16	reSigPad;	/* padding */
+	__be32	pmMapBlkCnt;	/* partition blocks count */
+	__be32	pmPyPartStart;	/* physical block start of partition */
+	__be32	pmPartBlkCnt;	/* physical block count of partition */
+	u8	pmPartName[32];	/* (null terminated?) string
+				   giving the name of this
+				   partition */
+	u8	pmPartType[32];	/* (null terminated?) string
+				   giving the type of this
+				   partition */
+	/* a bunch more stuff we don't need */
+} __packed;
+
+/*
+ * The old style Mac partition map
+ *
+ * The partition map consists for a 2-byte signature followed by an
+ * array of these structures.  The map is terminated with an all-zero
+ * one of these.
+ */
+struct old_pmap {
+	__be16		pdSig;	/* Signature bytes */
+	struct old_pmap_entry {
+		__be32	pdStart;
+		__be32	pdSize;
+		__be32	pdFSID;
+	}	pdEntry[42];
+} __packed;
+
+static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm,
+		sector_t *part_start, sector_t *part_size)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	int i;
+
+	for (i = 0; i < 42; i++) {
+		struct old_pmap_entry *p = &pm->pdEntry[i];
+
+		if (p->pdStart && p->pdSize &&
+		    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
+		    (sbi->part < 0 || sbi->part == i)) {
+			*part_start += be32_to_cpu(p->pdStart);
+			*part_size = be32_to_cpu(p->pdSize);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int hfs_parse_new_pmap(struct super_block *sb, void *buf,
+		struct new_pmap *pm, sector_t *part_start, sector_t *part_size)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	int size = be32_to_cpu(pm->pmMapBlkCnt);
+	int buf_size = hfsplus_min_io_size(sb);
+	int res;
+	int i = 0;
+
+	do {
+		if (!memcmp(pm->pmPartType, "Apple_HFS", 9) &&
+		    (sbi->part < 0 || sbi->part == i)) {
+			*part_start += be32_to_cpu(pm->pmPyPartStart);
+			*part_size = be32_to_cpu(pm->pmPartBlkCnt);
+			return 0;
+		}
+
+		if (++i >= size)
+			return -ENOENT;
+
+		pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE);
+		if ((u8 *)pm - (u8 *)buf >= buf_size) {
+			res = hfsplus_submit_bio(sb,
+						 *part_start + HFS_PMAP_BLK + i,
+						 buf, (void **)&pm, READ);
+			if (res)
+				return res;
+		}
+	} while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC));
+
+	return -ENOENT;
+}
+
+/*
+ * Parse the partition map looking for the start and length of a
+ * HFS/HFS+ partition.
+ */
+int hfs_part_find(struct super_block *sb,
+		sector_t *part_start, sector_t *part_size)
+{
+	void *buf, *data;
+	int res;
+
+	buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK,
+				 buf, &data, READ);
+	if (res)
+		goto out;
+
+	switch (be16_to_cpu(*((__be16 *)data))) {
+	case HFS_OLD_PMAP_MAGIC:
+		res = hfs_parse_old_pmap(sb, data, part_start, part_size);
+		break;
+	case HFS_NEW_PMAP_MAGIC:
+		res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size);
+		break;
+	default:
+		res = -ENOENT;
+		break;
+	}
+out:
+	kfree(buf);
+	return res;
+}
diff --git a/kernel/fs/hfsplus/posix_acl.c b/kernel/fs/hfsplus/posix_acl.c
new file mode 100644
index 000000000..df0c9af68
--- /dev/null
+++ b/kernel/fs/hfsplus/posix_acl.c
@@ -0,0 +1,140 @@
+/*
+ * linux/fs/hfsplus/posix_acl.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for Posix Access Control Lists (ACLs) support.
+ */
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	char *xattr_name;
+	char *value = NULL;
+	ssize_t size;
+
+	hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xattr_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		xattr_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	size = __hfsplus_getxattr(inode, xattr_name, NULL, 0);
+
+	if (size > 0) {
+		value = (char *)hfsplus_alloc_attr_entry();
+		if (unlikely(!value))
+			return ERR_PTR(-ENOMEM);
+		size = __hfsplus_getxattr(inode, xattr_name, value, size);
+	}
+
+	if (size > 0)
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+	else if (size == -ENODATA)
+		acl = NULL;
+	else
+		acl = ERR_PTR(size);
+
+	hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
+		int type)
+{
+	int err;
+	char *xattr_name;
+	size_t size = 0;
+	char *value = NULL;
+
+	hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xattr_name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			err = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (err < 0)
+				return err;
+		}
+		err = 0;
+		break;
+
+	case ACL_TYPE_DEFAULT:
+		xattr_name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE))
+			return -ENOMEM;
+		value = (char *)hfsplus_alloc_attr_entry();
+		if (unlikely(!value))
+			return -ENOMEM;
+		err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (unlikely(err < 0))
+			goto end_set_acl;
+	}
+
+	err = __hfsplus_setxattr(inode, xattr_name, value, size, 0);
+
+end_set_acl:
+	hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
+
+	if (!err)
+		set_cached_acl(inode, type, acl);
+
+	return err;
+}
+
+int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
+{
+	int err = 0;
+	struct posix_acl *default_acl, *acl;
+
+	hfs_dbg(ACL_MOD,
+		"[%s]: ino %lu, dir->ino %lu\n",
+		__func__, inode->i_ino, dir->i_ino);
+
+	if (S_ISLNK(inode->i_mode))
+		return 0;
+
+	err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (err)
+		return err;
+
+	if (default_acl) {
+		err = hfsplus_set_posix_acl(inode, default_acl,
+					    ACL_TYPE_DEFAULT);
+		posix_acl_release(default_acl);
+	}
+
+	if (acl) {
+		if (!err)
+			err = hfsplus_set_posix_acl(inode, acl,
+						    ACL_TYPE_ACCESS);
+		posix_acl_release(acl);
+	}
+	return err;
+}
diff --git a/kernel/fs/hfsplus/super.c b/kernel/fs/hfsplus/super.c
new file mode 100644
index 000000000..593af2fdc
--- /dev/null
+++ b/kernel/fs/hfsplus/super.c
@@ -0,0 +1,700 @@
+/*
+ *  linux/fs/hfsplus/super.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/vfs.h>
+#include <linux/nls.h>
+
+static struct inode *hfsplus_alloc_inode(struct super_block *sb);
+static void hfsplus_destroy_inode(struct inode *inode);
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+static int hfsplus_system_read_inode(struct inode *inode)
+{
+	struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
+
+	switch (inode->i_ino) {
+	case HFSPLUS_EXT_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->ext_file);
+		inode->i_mapping->a_ops = &hfsplus_btree_aops;
+		break;
+	case HFSPLUS_CAT_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->cat_file);
+		inode->i_mapping->a_ops = &hfsplus_btree_aops;
+		break;
+	case HFSPLUS_ALLOC_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->alloc_file);
+		inode->i_mapping->a_ops = &hfsplus_aops;
+		break;
+	case HFSPLUS_START_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->start_file);
+		break;
+	case HFSPLUS_ATTR_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->attr_file);
+		inode->i_mapping->a_ops = &hfsplus_btree_aops;
+		break;
+	default:
+		return -EIO;
+	}
+
+	return 0;
+}
+
+struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+{
+	struct hfs_find_data fd;
+	struct inode *inode;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+	mutex_init(&HFSPLUS_I(inode)->extents_lock);
+	HFSPLUS_I(inode)->flags = 0;
+	HFSPLUS_I(inode)->extent_state = 0;
+	HFSPLUS_I(inode)->rsrc_inode = NULL;
+	atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID) {
+		err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+		if (!err) {
+			err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+			if (!err)
+				err = hfsplus_cat_read_inode(inode, &fd);
+			hfs_find_exit(&fd);
+		}
+	} else {
+		err = hfsplus_system_read_inode(inode);
+	}
+
+	if (err) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+
+	unlock_new_inode(inode);
+	return inode;
+}
+
+static int hfsplus_system_write_inode(struct inode *inode)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
+	struct hfsplus_fork_raw *fork;
+	struct hfs_btree *tree = NULL;
+
+	switch (inode->i_ino) {
+	case HFSPLUS_EXT_CNID:
+		fork = &vhdr->ext_file;
+		tree = sbi->ext_tree;
+		break;
+	case HFSPLUS_CAT_CNID:
+		fork = &vhdr->cat_file;
+		tree = sbi->cat_tree;
+		break;
+	case HFSPLUS_ALLOC_CNID:
+		fork = &vhdr->alloc_file;
+		break;
+	case HFSPLUS_START_CNID:
+		fork = &vhdr->start_file;
+		break;
+	case HFSPLUS_ATTR_CNID:
+		fork = &vhdr->attr_file;
+		tree = sbi->attr_tree;
+		break;
+	default:
+		return -EIO;
+	}
+
+	if (fork->total_size != cpu_to_be64(inode->i_size)) {
+		set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
+		hfsplus_mark_mdb_dirty(inode->i_sb);
+	}
+	hfsplus_inode_write_fork(inode, fork);
+	if (tree) {
+		int err = hfs_btree_write(tree);
+
+		if (err) {
+			pr_err("b-tree write err: %d, ino %lu\n",
+			       err, inode->i_ino);
+			return err;
+		}
+	}
+	return 0;
+}
+
+static int hfsplus_write_inode(struct inode *inode,
+		struct writeback_control *wbc)
+{
+	int err;
+
+	hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+
+	err = hfsplus_ext_write_extent(inode);
+	if (err)
+		return err;
+
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID)
+		return hfsplus_cat_write_inode(inode);
+	else
+		return hfsplus_system_write_inode(inode);
+}
+
+static void hfsplus_evict_inode(struct inode *inode)
+{
+	hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	if (HFSPLUS_IS_RSRC(inode)) {
+		HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+		iput(HFSPLUS_I(inode)->rsrc_inode);
+	}
+}
+
+static int hfsplus_sync_fs(struct super_block *sb, int wait)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
+	int write_backup = 0;
+	int error, error2;
+
+	if (!wait)
+		return 0;
+
+	hfs_dbg(SUPER, "hfsplus_sync_fs\n");
+
+	/*
+	 * Explicitly write out the special metadata inodes.
+	 *
+	 * While these special inodes are marked as hashed and written
+	 * out peridocically by the flusher threads we redirty them
+	 * during writeout of normal inodes, and thus the life lock
+	 * prevents us from getting the latest state to disk.
+	 */
+	error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+	error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+	if (!error)
+		error = error2;
+	if (sbi->attr_tree) {
+		error2 =
+		    filemap_write_and_wait(sbi->attr_tree->inode->i_mapping);
+		if (!error)
+			error = error2;
+	}
+	error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
+	if (!error)
+		error = error2;
+
+	mutex_lock(&sbi->vh_mutex);
+	mutex_lock(&sbi->alloc_mutex);
+	vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+	vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+	vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+	vhdr->file_count = cpu_to_be32(sbi->file_count);
+
+	if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
+		memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
+		write_backup = 1;
+	}
+
+	error2 = hfsplus_submit_bio(sb,
+				   sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
+				   sbi->s_vhdr_buf, NULL, WRITE_SYNC);
+	if (!error)
+		error = error2;
+	if (!write_backup)
+		goto out;
+
+	error2 = hfsplus_submit_bio(sb,
+				  sbi->part_start + sbi->sect_count - 2,
+				  sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC);
+	if (!error)
+		error2 = error;
+out:
+	mutex_unlock(&sbi->alloc_mutex);
+	mutex_unlock(&sbi->vh_mutex);
+
+	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
+	return error;
+}
+
+static void delayed_sync_fs(struct work_struct *work)
+{
+	int err;
+	struct hfsplus_sb_info *sbi;
+
+	sbi = container_of(work, struct hfsplus_sb_info, sync_work.work);
+
+	spin_lock(&sbi->work_lock);
+	sbi->work_queued = 0;
+	spin_unlock(&sbi->work_lock);
+
+	err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1);
+	if (err)
+		pr_err("delayed sync fs err %d\n", err);
+}
+
+void hfsplus_mark_mdb_dirty(struct super_block *sb)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	unsigned long delay;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	spin_lock(&sbi->work_lock);
+	if (!sbi->work_queued) {
+		delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+		queue_delayed_work(system_long_wq, &sbi->sync_work, delay);
+		sbi->work_queued = 1;
+	}
+	spin_unlock(&sbi->work_lock);
+}
+
+static void hfsplus_put_super(struct super_block *sb)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
+	hfs_dbg(SUPER, "hfsplus_put_super\n");
+
+	cancel_delayed_work_sync(&sbi->sync_work);
+
+	if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+		struct hfsplus_vh *vhdr = sbi->s_vhdr;
+
+		vhdr->modify_date = hfsp_now2mt();
+		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
+		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
+
+		hfsplus_sync_fs(sb, 1);
+	}
+
+	hfs_btree_close(sbi->attr_tree);
+	hfs_btree_close(sbi->cat_tree);
+	hfs_btree_close(sbi->ext_tree);
+	iput(sbi->alloc_file);
+	iput(sbi->hidden_dir);
+	kfree(sbi->s_vhdr_buf);
+	kfree(sbi->s_backup_vhdr_buf);
+	unload_nls(sbi->nls);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+}
+
+static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type = HFSPLUS_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+	buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = 0xFFFFFFFF;
+	buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = HFSPLUS_MAX_STRLEN;
+
+	return 0;
+}
+
+static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		return 0;
+	if (!(*flags & MS_RDONLY)) {
+		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
+		int force = 0;
+
+		if (!hfsplus_parse_options_remount(data, &force))
+			return -EINVAL;
+
+		if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
+			pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended.  leaving read-only.\n");
+			sb->s_flags |= MS_RDONLY;
+			*flags |= MS_RDONLY;
+		} else if (force) {
+			/* nothing */
+		} else if (vhdr->attributes &
+				cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
+			pr_warn("filesystem is marked locked, leaving read-only.\n");
+			sb->s_flags |= MS_RDONLY;
+			*flags |= MS_RDONLY;
+		} else if (vhdr->attributes &
+				cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
+			pr_warn("filesystem is marked journaled, leaving read-only.\n");
+			sb->s_flags |= MS_RDONLY;
+			*flags |= MS_RDONLY;
+		}
+	}
+	return 0;
+}
+
+static const struct super_operations hfsplus_sops = {
+	.alloc_inode	= hfsplus_alloc_inode,
+	.destroy_inode	= hfsplus_destroy_inode,
+	.write_inode	= hfsplus_write_inode,
+	.evict_inode	= hfsplus_evict_inode,
+	.put_super	= hfsplus_put_super,
+	.sync_fs	= hfsplus_sync_fs,
+	.statfs		= hfsplus_statfs,
+	.remount_fs	= hfsplus_remount,
+	.show_options	= hfsplus_show_options,
+};
+
+static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct hfsplus_vh *vhdr;
+	struct hfsplus_sb_info *sbi;
+	hfsplus_cat_entry entry;
+	struct hfs_find_data fd;
+	struct inode *root, *inode;
+	struct qstr str;
+	struct nls_table *nls = NULL;
+	u64 last_fs_block, last_fs_page;
+	int err;
+
+	err = -ENOMEM;
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		goto out;
+
+	sb->s_fs_info = sbi;
+	mutex_init(&sbi->alloc_mutex);
+	mutex_init(&sbi->vh_mutex);
+	spin_lock_init(&sbi->work_lock);
+	INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
+	hfsplus_fill_defaults(sbi);
+
+	err = -EINVAL;
+	if (!hfsplus_parse_options(data, sbi)) {
+		pr_err("unable to parse mount options\n");
+		goto out_unload_nls;
+	}
+
+	/* temporarily use utf8 to correctly find the hidden dir below */
+	nls = sbi->nls;
+	sbi->nls = load_nls("utf8");
+	if (!sbi->nls) {
+		pr_err("unable to load nls for utf8\n");
+		goto out_unload_nls;
+	}
+
+	/* Grab the volume header */
+	if (hfsplus_read_wrapper(sb)) {
+		if (!silent)
+			pr_warn("unable to find HFS+ superblock\n");
+		goto out_unload_nls;
+	}
+	vhdr = sbi->s_vhdr;
+
+	/* Copy parts of the volume header into the superblock */
+	sb->s_magic = HFSPLUS_VOLHEAD_SIG;
+	if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
+	    be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
+		pr_err("wrong filesystem version\n");
+		goto out_free_vhdr;
+	}
+	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+	sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+	sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+	sbi->file_count = be32_to_cpu(vhdr->file_count);
+	sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+	sbi->data_clump_blocks =
+		be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->data_clump_blocks)
+		sbi->data_clump_blocks = 1;
+	sbi->rsrc_clump_blocks =
+		be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->rsrc_clump_blocks)
+		sbi->rsrc_clump_blocks = 1;
+
+	err = -EFBIG;
+	last_fs_block = sbi->total_blocks - 1;
+	last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
+			PAGE_CACHE_SHIFT;
+
+	if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
+	    (last_fs_page > (pgoff_t)(~0ULL))) {
+		pr_err("filesystem size too large\n");
+		goto out_free_vhdr;
+	}
+
+	/* Set up operations so we can load metadata */
+	sb->s_op = &hfsplus_sops;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
+		pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended.  mounting read-only.\n");
+		sb->s_flags |= MS_RDONLY;
+	} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
+		/* nothing */
+	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
+		pr_warn("Filesystem is marked locked, mounting read-only.\n");
+		sb->s_flags |= MS_RDONLY;
+	} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
+			!(sb->s_flags & MS_RDONLY)) {
+		pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+
+	err = -EINVAL;
+
+	/* Load metadata objects (B*Trees) */
+	sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
+	if (!sbi->ext_tree) {
+		pr_err("failed to load extents file\n");
+		goto out_free_vhdr;
+	}
+	sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
+	if (!sbi->cat_tree) {
+		pr_err("failed to load catalog file\n");
+		goto out_close_ext_tree;
+	}
+	atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE);
+	if (vhdr->attr_file.total_blocks != 0) {
+		sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
+		if (!sbi->attr_tree) {
+			pr_err("failed to load attributes file\n");
+			goto out_close_cat_tree;
+		}
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE);
+	}
+	sb->s_xattr = hfsplus_xattr_handlers;
+
+	inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
+	if (IS_ERR(inode)) {
+		pr_err("failed to load allocation file\n");
+		err = PTR_ERR(inode);
+		goto out_close_attr_tree;
+	}
+	sbi->alloc_file = inode;
+
+	/* Load the root directory */
+	root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
+	if (IS_ERR(root)) {
+		pr_err("failed to load root directory\n");
+		err = PTR_ERR(root);
+		goto out_put_alloc_file;
+	}
+
+	sb->s_d_op = &hfsplus_dentry_operations;
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto out_put_alloc_file;
+	}
+
+	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
+	str.name = HFSP_HIDDENDIR_NAME;
+	err = hfs_find_init(sbi->cat_tree, &fd);
+	if (err)
+		goto out_put_root;
+	err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
+	if (unlikely(err < 0))
+		goto out_put_root;
+	if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
+		hfs_find_exit(&fd);
+		if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
+			goto out_put_root;
+		inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto out_put_root;
+		}
+		sbi->hidden_dir = inode;
+	} else
+		hfs_find_exit(&fd);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/*
+		 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
+		 * all three are registered with Apple for our use
+		 */
+		vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
+		vhdr->modify_date = hfsp_now2mt();
+		be32_add_cpu(&vhdr->write_count, 1);
+		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
+		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
+		hfsplus_sync_fs(sb, 1);
+
+		if (!sbi->hidden_dir) {
+			mutex_lock(&sbi->vh_mutex);
+			sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+			if (!sbi->hidden_dir) {
+				mutex_unlock(&sbi->vh_mutex);
+				err = -ENOMEM;
+				goto out_put_root;
+			}
+			err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,
+						 &str, sbi->hidden_dir);
+			if (err) {
+				mutex_unlock(&sbi->vh_mutex);
+				goto out_put_hidden_dir;
+			}
+
+			err = hfsplus_init_inode_security(sbi->hidden_dir,
+								root, &str);
+			if (err == -EOPNOTSUPP)
+				err = 0; /* Operation is not supported. */
+			else if (err) {
+				/*
+				 * Try to delete anyway without
+				 * error analysis.
+				 */
+				hfsplus_delete_cat(sbi->hidden_dir->i_ino,
+							root, &str);
+				mutex_unlock(&sbi->vh_mutex);
+				goto out_put_hidden_dir;
+			}
+
+			mutex_unlock(&sbi->vh_mutex);
+			hfsplus_mark_inode_dirty(sbi->hidden_dir,
+						 HFSPLUS_I_CAT_DIRTY);
+		}
+	}
+
+	unload_nls(sbi->nls);
+	sbi->nls = nls;
+	return 0;
+
+out_put_hidden_dir:
+	iput(sbi->hidden_dir);
+out_put_root:
+	dput(sb->s_root);
+	sb->s_root = NULL;
+out_put_alloc_file:
+	iput(sbi->alloc_file);
+out_close_attr_tree:
+	hfs_btree_close(sbi->attr_tree);
+out_close_cat_tree:
+	hfs_btree_close(sbi->cat_tree);
+out_close_ext_tree:
+	hfs_btree_close(sbi->ext_tree);
+out_free_vhdr:
+	kfree(sbi->s_vhdr_buf);
+	kfree(sbi->s_backup_vhdr_buf);
+out_unload_nls:
+	unload_nls(sbi->nls);
+	unload_nls(nls);
+	kfree(sbi);
+out:
+	return err;
+}
+
+MODULE_AUTHOR("Brad Boyer");
+MODULE_DESCRIPTION("Extended Macintosh Filesystem");
+MODULE_LICENSE("GPL");
+
+static struct kmem_cache *hfsplus_inode_cachep;
+
+static struct inode *hfsplus_alloc_inode(struct super_block *sb)
+{
+	struct hfsplus_inode_info *i;
+
+	i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL);
+	return i ? &i->vfs_inode : NULL;
+}
+
+static void hfsplus_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+
+	kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
+}
+
+static void hfsplus_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hfsplus_i_callback);
+}
+
+#define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
+
+static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+}
+
+static struct file_system_type hfsplus_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "hfsplus",
+	.mount		= hfsplus_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("hfsplus");
+
+static void hfsplus_init_once(void *p)
+{
+	struct hfsplus_inode_info *i = p;
+
+	inode_init_once(&i->vfs_inode);
+}
+
+static int __init init_hfsplus_fs(void)
+{
+	int err;
+
+	hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
+		HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+		hfsplus_init_once);
+	if (!hfsplus_inode_cachep)
+		return -ENOMEM;
+	err = hfsplus_create_attr_tree_cache();
+	if (err)
+		goto destroy_inode_cache;
+	err = register_filesystem(&hfsplus_fs_type);
+	if (err)
+		goto destroy_attr_tree_cache;
+	return 0;
+
+destroy_attr_tree_cache:
+	hfsplus_destroy_attr_tree_cache();
+
+destroy_inode_cache:
+	kmem_cache_destroy(hfsplus_inode_cachep);
+
+	return err;
+}
+
+static void __exit exit_hfsplus_fs(void)
+{
+	unregister_filesystem(&hfsplus_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	hfsplus_destroy_attr_tree_cache();
+	kmem_cache_destroy(hfsplus_inode_cachep);
+}
+
+module_init(init_hfsplus_fs)
+module_exit(exit_hfsplus_fs)
diff --git a/kernel/fs/hfsplus/tables.c b/kernel/fs/hfsplus/tables.c
new file mode 100644
index 000000000..1b911730a
--- /dev/null
+++ b/kernel/fs/hfsplus/tables.c
@@ -0,0 +1,3245 @@
+/*
+ * linux/fs/hfsplus/tables.c
+ *
+ * Various data tables
+ */
+
+#include "hfsplus_fs.h"
+
+/*
+ *  Unicode case folding table taken from Apple Technote #1150
+ *  (HFS Plus Volume Format)
+ */
+
+u16 hfsplus_case_fold_table[] = {
+/*
+ *  The lower case table consists of a 256-entry high-byte table followed by
+ *  some number of 256-entry subtables. The high-byte table contains either an
+ *  offset to the subtable for characters with that high byte or zero, which
+ *  means that there are no case mappings or ignored characters in that block.
+ *  Ignored characters are mapped to zero.
+ */
+
+    // High-byte indices ( == 0 iff no case mapping and no ignorables )
+
+
+    /* 0 */ 0x0100, 0x0200, 0x0000, 0x0300, 0x0400, 0x0500, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 1 */ 0x0600, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 2 */ 0x0700, 0x0800, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 3 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 4 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 5 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 6 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 7 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 9 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* A */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* B */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* C */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* D */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* E */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* F */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0900, 0x0A00,
+
+    // Table 1 (for high byte 0x00)
+
+    /* 0 */ 0xFFFF, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+            0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
+    /* 1 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+            0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
+    /* 2 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+            0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
+    /* 3 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+            0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
+    /* 4 */ 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+            0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
+    /* 5 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+            0x0078, 0x0079, 0x007A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
+    /* 6 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+            0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
+    /* 7 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+            0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,
+    /* 8 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
+            0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
+    /* 9 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
+            0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
+    /* A */ 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+            0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+    /* B */ 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+            0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
+    /* C */ 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00E6, 0x00C7,
+            0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+    /* D */ 0x00F0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
+            0x00F8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00FE, 0x00DF,
+    /* E */ 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
+            0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+    /* F */ 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
+            0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF,
+
+    // Table 2 (for high byte 0x01)
+
+    /* 0 */ 0x0100, 0x0101, 0x0102, 0x0103, 0x0104, 0x0105, 0x0106, 0x0107,
+            0x0108, 0x0109, 0x010A, 0x010B, 0x010C, 0x010D, 0x010E, 0x010F,
+    /* 1 */ 0x0111, 0x0111, 0x0112, 0x0113, 0x0114, 0x0115, 0x0116, 0x0117,
+            0x0118, 0x0119, 0x011A, 0x011B, 0x011C, 0x011D, 0x011E, 0x011F,
+    /* 2 */ 0x0120, 0x0121, 0x0122, 0x0123, 0x0124, 0x0125, 0x0127, 0x0127,
+            0x0128, 0x0129, 0x012A, 0x012B, 0x012C, 0x012D, 0x012E, 0x012F,
+    /* 3 */ 0x0130, 0x0131, 0x0133, 0x0133, 0x0134, 0x0135, 0x0136, 0x0137,
+            0x0138, 0x0139, 0x013A, 0x013B, 0x013C, 0x013D, 0x013E, 0x0140,
+    /* 4 */ 0x0140, 0x0142, 0x0142, 0x0143, 0x0144, 0x0145, 0x0146, 0x0147,
+            0x0148, 0x0149, 0x014B, 0x014B, 0x014C, 0x014D, 0x014E, 0x014F,
+    /* 5 */ 0x0150, 0x0151, 0x0153, 0x0153, 0x0154, 0x0155, 0x0156, 0x0157,
+            0x0158, 0x0159, 0x015A, 0x015B, 0x015C, 0x015D, 0x015E, 0x015F,
+    /* 6 */ 0x0160, 0x0161, 0x0162, 0x0163, 0x0164, 0x0165, 0x0167, 0x0167,
+            0x0168, 0x0169, 0x016A, 0x016B, 0x016C, 0x016D, 0x016E, 0x016F,
+    /* 7 */ 0x0170, 0x0171, 0x0172, 0x0173, 0x0174, 0x0175, 0x0176, 0x0177,
+            0x0178, 0x0179, 0x017A, 0x017B, 0x017C, 0x017D, 0x017E, 0x017F,
+    /* 8 */ 0x0180, 0x0253, 0x0183, 0x0183, 0x0185, 0x0185, 0x0254, 0x0188,
+            0x0188, 0x0256, 0x0257, 0x018C, 0x018C, 0x018D, 0x01DD, 0x0259,
+    /* 9 */ 0x025B, 0x0192, 0x0192, 0x0260, 0x0263, 0x0195, 0x0269, 0x0268,
+            0x0199, 0x0199, 0x019A, 0x019B, 0x026F, 0x0272, 0x019E, 0x0275,
+    /* A */ 0x01A0, 0x01A1, 0x01A3, 0x01A3, 0x01A5, 0x01A5, 0x01A6, 0x01A8,
+            0x01A8, 0x0283, 0x01AA, 0x01AB, 0x01AD, 0x01AD, 0x0288, 0x01AF,
+    /* B */ 0x01B0, 0x028A, 0x028B, 0x01B4, 0x01B4, 0x01B6, 0x01B6, 0x0292,
+            0x01B9, 0x01B9, 0x01BA, 0x01BB, 0x01BD, 0x01BD, 0x01BE, 0x01BF,
+    /* C */ 0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x01C6, 0x01C6, 0x01C6, 0x01C9,
+            0x01C9, 0x01C9, 0x01CC, 0x01CC, 0x01CC, 0x01CD, 0x01CE, 0x01CF,
+    /* D */ 0x01D0, 0x01D1, 0x01D2, 0x01D3, 0x01D4, 0x01D5, 0x01D6, 0x01D7,
+            0x01D8, 0x01D9, 0x01DA, 0x01DB, 0x01DC, 0x01DD, 0x01DE, 0x01DF,
+    /* E */ 0x01E0, 0x01E1, 0x01E2, 0x01E3, 0x01E5, 0x01E5, 0x01E6, 0x01E7,
+            0x01E8, 0x01E9, 0x01EA, 0x01EB, 0x01EC, 0x01ED, 0x01EE, 0x01EF,
+    /* F */ 0x01F0, 0x01F3, 0x01F3, 0x01F3, 0x01F4, 0x01F5, 0x01F6, 0x01F7,
+            0x01F8, 0x01F9, 0x01FA, 0x01FB, 0x01FC, 0x01FD, 0x01FE, 0x01FF,
+
+    // Table 3 (for high byte 0x03)
+
+    /* 0 */ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307,
+            0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F,
+    /* 1 */ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317,
+            0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F,
+    /* 2 */ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327,
+            0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F,
+    /* 3 */ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337,
+            0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F,
+    /* 4 */ 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347,
+            0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F,
+    /* 5 */ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357,
+            0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F,
+    /* 6 */ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367,
+            0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F,
+    /* 7 */ 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377,
+            0x0378, 0x0379, 0x037A, 0x037B, 0x037C, 0x037D, 0x037E, 0x037F,
+    /* 8 */ 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387,
+            0x0388, 0x0389, 0x038A, 0x038B, 0x038C, 0x038D, 0x038E, 0x038F,
+    /* 9 */ 0x0390, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
+            0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
+    /* A */ 0x03C0, 0x03C1, 0x03A2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
+            0x03C8, 0x03C9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
+    /* B */ 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
+            0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
+    /* C */ 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
+            0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0x03CF,
+    /* D */ 0x03D0, 0x03D1, 0x03D2, 0x03D3, 0x03D4, 0x03D5, 0x03D6, 0x03D7,
+            0x03D8, 0x03D9, 0x03DA, 0x03DB, 0x03DC, 0x03DD, 0x03DE, 0x03DF,
+    /* E */ 0x03E0, 0x03E1, 0x03E3, 0x03E3, 0x03E5, 0x03E5, 0x03E7, 0x03E7,
+            0x03E9, 0x03E9, 0x03EB, 0x03EB, 0x03ED, 0x03ED, 0x03EF, 0x03EF,
+    /* F */ 0x03F0, 0x03F1, 0x03F2, 0x03F3, 0x03F4, 0x03F5, 0x03F6, 0x03F7,
+            0x03F8, 0x03F9, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF,
+
+    // Table 4 (for high byte 0x04)
+
+    /* 0 */ 0x0400, 0x0401, 0x0452, 0x0403, 0x0454, 0x0455, 0x0456, 0x0407,
+            0x0458, 0x0459, 0x045A, 0x045B, 0x040C, 0x040D, 0x040E, 0x045F,
+    /* 1 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
+            0x0438, 0x0419, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
+    /* 2 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
+            0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
+    /* 3 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
+            0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
+    /* 4 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
+            0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
+    /* 5 */ 0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
+            0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E, 0x045F,
+    /* 6 */ 0x0461, 0x0461, 0x0463, 0x0463, 0x0465, 0x0465, 0x0467, 0x0467,
+            0x0469, 0x0469, 0x046B, 0x046B, 0x046D, 0x046D, 0x046F, 0x046F,
+    /* 7 */ 0x0471, 0x0471, 0x0473, 0x0473, 0x0475, 0x0475, 0x0476, 0x0477,
+            0x0479, 0x0479, 0x047B, 0x047B, 0x047D, 0x047D, 0x047F, 0x047F,
+    /* 8 */ 0x0481, 0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487,
+            0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048D, 0x048E, 0x048F,
+    /* 9 */ 0x0491, 0x0491, 0x0493, 0x0493, 0x0495, 0x0495, 0x0497, 0x0497,
+            0x0499, 0x0499, 0x049B, 0x049B, 0x049D, 0x049D, 0x049F, 0x049F,
+    /* A */ 0x04A1, 0x04A1, 0x04A3, 0x04A3, 0x04A5, 0x04A5, 0x04A7, 0x04A7,
+            0x04A9, 0x04A9, 0x04AB, 0x04AB, 0x04AD, 0x04AD, 0x04AF, 0x04AF,
+    /* B */ 0x04B1, 0x04B1, 0x04B3, 0x04B3, 0x04B5, 0x04B5, 0x04B7, 0x04B7,
+            0x04B9, 0x04B9, 0x04BB, 0x04BB, 0x04BD, 0x04BD, 0x04BF, 0x04BF,
+    /* C */ 0x04C0, 0x04C1, 0x04C2, 0x04C4, 0x04C4, 0x04C5, 0x04C6, 0x04C8,
+            0x04C8, 0x04C9, 0x04CA, 0x04CC, 0x04CC, 0x04CD, 0x04CE, 0x04CF,
+    /* D */ 0x04D0, 0x04D1, 0x04D2, 0x04D3, 0x04D4, 0x04D5, 0x04D6, 0x04D7,
+            0x04D8, 0x04D9, 0x04DA, 0x04DB, 0x04DC, 0x04DD, 0x04DE, 0x04DF,
+    /* E */ 0x04E0, 0x04E1, 0x04E2, 0x04E3, 0x04E4, 0x04E5, 0x04E6, 0x04E7,
+            0x04E8, 0x04E9, 0x04EA, 0x04EB, 0x04EC, 0x04ED, 0x04EE, 0x04EF,
+    /* F */ 0x04F0, 0x04F1, 0x04F2, 0x04F3, 0x04F4, 0x04F5, 0x04F6, 0x04F7,
+            0x04F8, 0x04F9, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF,
+
+    // Table 5 (for high byte 0x05)
+
+    /* 0 */ 0x0500, 0x0501, 0x0502, 0x0503, 0x0504, 0x0505, 0x0506, 0x0507,
+            0x0508, 0x0509, 0x050A, 0x050B, 0x050C, 0x050D, 0x050E, 0x050F,
+    /* 1 */ 0x0510, 0x0511, 0x0512, 0x0513, 0x0514, 0x0515, 0x0516, 0x0517,
+            0x0518, 0x0519, 0x051A, 0x051B, 0x051C, 0x051D, 0x051E, 0x051F,
+    /* 2 */ 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527,
+            0x0528, 0x0529, 0x052A, 0x052B, 0x052C, 0x052D, 0x052E, 0x052F,
+    /* 3 */ 0x0530, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567,
+            0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F,
+    /* 4 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577,
+            0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F,
+    /* 5 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0557,
+            0x0558, 0x0559, 0x055A, 0x055B, 0x055C, 0x055D, 0x055E, 0x055F,
+    /* 6 */ 0x0560, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567,
+            0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F,
+    /* 7 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577,
+            0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F,
+    /* 8 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0587,
+            0x0588, 0x0589, 0x058A, 0x058B, 0x058C, 0x058D, 0x058E, 0x058F,
+    /* 9 */ 0x0590, 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597,
+            0x0598, 0x0599, 0x059A, 0x059B, 0x059C, 0x059D, 0x059E, 0x059F,
+    /* A */ 0x05A0, 0x05A1, 0x05A2, 0x05A3, 0x05A4, 0x05A5, 0x05A6, 0x05A7,
+            0x05A8, 0x05A9, 0x05AA, 0x05AB, 0x05AC, 0x05AD, 0x05AE, 0x05AF,
+    /* B */ 0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7,
+            0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF,
+    /* C */ 0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05C4, 0x05C5, 0x05C6, 0x05C7,
+            0x05C8, 0x05C9, 0x05CA, 0x05CB, 0x05CC, 0x05CD, 0x05CE, 0x05CF,
+    /* D */ 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
+            0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
+    /* E */ 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
+            0x05E8, 0x05E9, 0x05EA, 0x05EB, 0x05EC, 0x05ED, 0x05EE, 0x05EF,
+    /* F */ 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0x05F5, 0x05F6, 0x05F7,
+            0x05F8, 0x05F9, 0x05FA, 0x05FB, 0x05FC, 0x05FD, 0x05FE, 0x05FF,
+
+    // Table 6 (for high byte 0x10)
+
+    /* 0 */ 0x1000, 0x1001, 0x1002, 0x1003, 0x1004, 0x1005, 0x1006, 0x1007,
+            0x1008, 0x1009, 0x100A, 0x100B, 0x100C, 0x100D, 0x100E, 0x100F,
+    /* 1 */ 0x1010, 0x1011, 0x1012, 0x1013, 0x1014, 0x1015, 0x1016, 0x1017,
+            0x1018, 0x1019, 0x101A, 0x101B, 0x101C, 0x101D, 0x101E, 0x101F,
+    /* 2 */ 0x1020, 0x1021, 0x1022, 0x1023, 0x1024, 0x1025, 0x1026, 0x1027,
+            0x1028, 0x1029, 0x102A, 0x102B, 0x102C, 0x102D, 0x102E, 0x102F,
+    /* 3 */ 0x1030, 0x1031, 0x1032, 0x1033, 0x1034, 0x1035, 0x1036, 0x1037,
+            0x1038, 0x1039, 0x103A, 0x103B, 0x103C, 0x103D, 0x103E, 0x103F,
+    /* 4 */ 0x1040, 0x1041, 0x1042, 0x1043, 0x1044, 0x1045, 0x1046, 0x1047,
+            0x1048, 0x1049, 0x104A, 0x104B, 0x104C, 0x104D, 0x104E, 0x104F,
+    /* 5 */ 0x1050, 0x1051, 0x1052, 0x1053, 0x1054, 0x1055, 0x1056, 0x1057,
+            0x1058, 0x1059, 0x105A, 0x105B, 0x105C, 0x105D, 0x105E, 0x105F,
+    /* 6 */ 0x1060, 0x1061, 0x1062, 0x1063, 0x1064, 0x1065, 0x1066, 0x1067,
+            0x1068, 0x1069, 0x106A, 0x106B, 0x106C, 0x106D, 0x106E, 0x106F,
+    /* 7 */ 0x1070, 0x1071, 0x1072, 0x1073, 0x1074, 0x1075, 0x1076, 0x1077,
+            0x1078, 0x1079, 0x107A, 0x107B, 0x107C, 0x107D, 0x107E, 0x107F,
+    /* 8 */ 0x1080, 0x1081, 0x1082, 0x1083, 0x1084, 0x1085, 0x1086, 0x1087,
+            0x1088, 0x1089, 0x108A, 0x108B, 0x108C, 0x108D, 0x108E, 0x108F,
+    /* 9 */ 0x1090, 0x1091, 0x1092, 0x1093, 0x1094, 0x1095, 0x1096, 0x1097,
+            0x1098, 0x1099, 0x109A, 0x109B, 0x109C, 0x109D, 0x109E, 0x109F,
+    /* A */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7,
+            0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF,
+    /* B */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7,
+            0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF,
+    /* C */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10C6, 0x10C7,
+            0x10C8, 0x10C9, 0x10CA, 0x10CB, 0x10CC, 0x10CD, 0x10CE, 0x10CF,
+    /* D */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7,
+            0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF,
+    /* E */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7,
+            0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF,
+    /* F */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10F6, 0x10F7,
+            0x10F8, 0x10F9, 0x10FA, 0x10FB, 0x10FC, 0x10FD, 0x10FE, 0x10FF,
+
+    // Table 7 (for high byte 0x20)
+
+    /* 0 */ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
+            0x2008, 0x2009, 0x200A, 0x200B, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 1 */ 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x2017,
+            0x2018, 0x2019, 0x201A, 0x201B, 0x201C, 0x201D, 0x201E, 0x201F,
+    /* 2 */ 0x2020, 0x2021, 0x2022, 0x2023, 0x2024, 0x2025, 0x2026, 0x2027,
+            0x2028, 0x2029, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x202F,
+    /* 3 */ 0x2030, 0x2031, 0x2032, 0x2033, 0x2034, 0x2035, 0x2036, 0x2037,
+            0x2038, 0x2039, 0x203A, 0x203B, 0x203C, 0x203D, 0x203E, 0x203F,
+    /* 4 */ 0x2040, 0x2041, 0x2042, 0x2043, 0x2044, 0x2045, 0x2046, 0x2047,
+            0x2048, 0x2049, 0x204A, 0x204B, 0x204C, 0x204D, 0x204E, 0x204F,
+    /* 5 */ 0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2057,
+            0x2058, 0x2059, 0x205A, 0x205B, 0x205C, 0x205D, 0x205E, 0x205F,
+    /* 6 */ 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x2065, 0x2066, 0x2067,
+            0x2068, 0x2069, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    /* 7 */ 0x2070, 0x2071, 0x2072, 0x2073, 0x2074, 0x2075, 0x2076, 0x2077,
+            0x2078, 0x2079, 0x207A, 0x207B, 0x207C, 0x207D, 0x207E, 0x207F,
+    /* 8 */ 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087,
+            0x2088, 0x2089, 0x208A, 0x208B, 0x208C, 0x208D, 0x208E, 0x208F,
+    /* 9 */ 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097,
+            0x2098, 0x2099, 0x209A, 0x209B, 0x209C, 0x209D, 0x209E, 0x209F,
+    /* A */ 0x20A0, 0x20A1, 0x20A2, 0x20A3, 0x20A4, 0x20A5, 0x20A6, 0x20A7,
+            0x20A8, 0x20A9, 0x20AA, 0x20AB, 0x20AC, 0x20AD, 0x20AE, 0x20AF,
+    /* B */ 0x20B0, 0x20B1, 0x20B2, 0x20B3, 0x20B4, 0x20B5, 0x20B6, 0x20B7,
+            0x20B8, 0x20B9, 0x20BA, 0x20BB, 0x20BC, 0x20BD, 0x20BE, 0x20BF,
+    /* C */ 0x20C0, 0x20C1, 0x20C2, 0x20C3, 0x20C4, 0x20C5, 0x20C6, 0x20C7,
+            0x20C8, 0x20C9, 0x20CA, 0x20CB, 0x20CC, 0x20CD, 0x20CE, 0x20CF,
+    /* D */ 0x20D0, 0x20D1, 0x20D2, 0x20D3, 0x20D4, 0x20D5, 0x20D6, 0x20D7,
+            0x20D8, 0x20D9, 0x20DA, 0x20DB, 0x20DC, 0x20DD, 0x20DE, 0x20DF,
+    /* E */ 0x20E0, 0x20E1, 0x20E2, 0x20E3, 0x20E4, 0x20E5, 0x20E6, 0x20E7,
+            0x20E8, 0x20E9, 0x20EA, 0x20EB, 0x20EC, 0x20ED, 0x20EE, 0x20EF,
+    /* F */ 0x20F0, 0x20F1, 0x20F2, 0x20F3, 0x20F4, 0x20F5, 0x20F6, 0x20F7,
+            0x20F8, 0x20F9, 0x20FA, 0x20FB, 0x20FC, 0x20FD, 0x20FE, 0x20FF,
+
+    // Table 8 (for high byte 0x21)
+
+    /* 0 */ 0x2100, 0x2101, 0x2102, 0x2103, 0x2104, 0x2105, 0x2106, 0x2107,
+            0x2108, 0x2109, 0x210A, 0x210B, 0x210C, 0x210D, 0x210E, 0x210F,
+    /* 1 */ 0x2110, 0x2111, 0x2112, 0x2113, 0x2114, 0x2115, 0x2116, 0x2117,
+            0x2118, 0x2119, 0x211A, 0x211B, 0x211C, 0x211D, 0x211E, 0x211F,
+    /* 2 */ 0x2120, 0x2121, 0x2122, 0x2123, 0x2124, 0x2125, 0x2126, 0x2127,
+            0x2128, 0x2129, 0x212A, 0x212B, 0x212C, 0x212D, 0x212E, 0x212F,
+    /* 3 */ 0x2130, 0x2131, 0x2132, 0x2133, 0x2134, 0x2135, 0x2136, 0x2137,
+            0x2138, 0x2139, 0x213A, 0x213B, 0x213C, 0x213D, 0x213E, 0x213F,
+    /* 4 */ 0x2140, 0x2141, 0x2142, 0x2143, 0x2144, 0x2145, 0x2146, 0x2147,
+            0x2148, 0x2149, 0x214A, 0x214B, 0x214C, 0x214D, 0x214E, 0x214F,
+    /* 5 */ 0x2150, 0x2151, 0x2152, 0x2153, 0x2154, 0x2155, 0x2156, 0x2157,
+            0x2158, 0x2159, 0x215A, 0x215B, 0x215C, 0x215D, 0x215E, 0x215F,
+    /* 6 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177,
+            0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F,
+    /* 7 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177,
+            0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F,
+    /* 8 */ 0x2180, 0x2181, 0x2182, 0x2183, 0x2184, 0x2185, 0x2186, 0x2187,
+            0x2188, 0x2189, 0x218A, 0x218B, 0x218C, 0x218D, 0x218E, 0x218F,
+    /* 9 */ 0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x2196, 0x2197,
+            0x2198, 0x2199, 0x219A, 0x219B, 0x219C, 0x219D, 0x219E, 0x219F,
+    /* A */ 0x21A0, 0x21A1, 0x21A2, 0x21A3, 0x21A4, 0x21A5, 0x21A6, 0x21A7,
+            0x21A8, 0x21A9, 0x21AA, 0x21AB, 0x21AC, 0x21AD, 0x21AE, 0x21AF,
+    /* B */ 0x21B0, 0x21B1, 0x21B2, 0x21B3, 0x21B4, 0x21B5, 0x21B6, 0x21B7,
+            0x21B8, 0x21B9, 0x21BA, 0x21BB, 0x21BC, 0x21BD, 0x21BE, 0x21BF,
+    /* C */ 0x21C0, 0x21C1, 0x21C2, 0x21C3, 0x21C4, 0x21C5, 0x21C6, 0x21C7,
+            0x21C8, 0x21C9, 0x21CA, 0x21CB, 0x21CC, 0x21CD, 0x21CE, 0x21CF,
+    /* D */ 0x21D0, 0x21D1, 0x21D2, 0x21D3, 0x21D4, 0x21D5, 0x21D6, 0x21D7,
+            0x21D8, 0x21D9, 0x21DA, 0x21DB, 0x21DC, 0x21DD, 0x21DE, 0x21DF,
+    /* E */ 0x21E0, 0x21E1, 0x21E2, 0x21E3, 0x21E4, 0x21E5, 0x21E6, 0x21E7,
+            0x21E8, 0x21E9, 0x21EA, 0x21EB, 0x21EC, 0x21ED, 0x21EE, 0x21EF,
+    /* F */ 0x21F0, 0x21F1, 0x21F2, 0x21F3, 0x21F4, 0x21F5, 0x21F6, 0x21F7,
+            0x21F8, 0x21F9, 0x21FA, 0x21FB, 0x21FC, 0x21FD, 0x21FE, 0x21FF,
+
+    // Table 9 (for high byte 0xFE)
+
+    /* 0 */ 0xFE00, 0xFE01, 0xFE02, 0xFE03, 0xFE04, 0xFE05, 0xFE06, 0xFE07,
+            0xFE08, 0xFE09, 0xFE0A, 0xFE0B, 0xFE0C, 0xFE0D, 0xFE0E, 0xFE0F,
+    /* 1 */ 0xFE10, 0xFE11, 0xFE12, 0xFE13, 0xFE14, 0xFE15, 0xFE16, 0xFE17,
+            0xFE18, 0xFE19, 0xFE1A, 0xFE1B, 0xFE1C, 0xFE1D, 0xFE1E, 0xFE1F,
+    /* 2 */ 0xFE20, 0xFE21, 0xFE22, 0xFE23, 0xFE24, 0xFE25, 0xFE26, 0xFE27,
+            0xFE28, 0xFE29, 0xFE2A, 0xFE2B, 0xFE2C, 0xFE2D, 0xFE2E, 0xFE2F,
+    /* 3 */ 0xFE30, 0xFE31, 0xFE32, 0xFE33, 0xFE34, 0xFE35, 0xFE36, 0xFE37,
+            0xFE38, 0xFE39, 0xFE3A, 0xFE3B, 0xFE3C, 0xFE3D, 0xFE3E, 0xFE3F,
+    /* 4 */ 0xFE40, 0xFE41, 0xFE42, 0xFE43, 0xFE44, 0xFE45, 0xFE46, 0xFE47,
+            0xFE48, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C, 0xFE4D, 0xFE4E, 0xFE4F,
+    /* 5 */ 0xFE50, 0xFE51, 0xFE52, 0xFE53, 0xFE54, 0xFE55, 0xFE56, 0xFE57,
+            0xFE58, 0xFE59, 0xFE5A, 0xFE5B, 0xFE5C, 0xFE5D, 0xFE5E, 0xFE5F,
+    /* 6 */ 0xFE60, 0xFE61, 0xFE62, 0xFE63, 0xFE64, 0xFE65, 0xFE66, 0xFE67,
+            0xFE68, 0xFE69, 0xFE6A, 0xFE6B, 0xFE6C, 0xFE6D, 0xFE6E, 0xFE6F,
+    /* 7 */ 0xFE70, 0xFE71, 0xFE72, 0xFE73, 0xFE74, 0xFE75, 0xFE76, 0xFE77,
+            0xFE78, 0xFE79, 0xFE7A, 0xFE7B, 0xFE7C, 0xFE7D, 0xFE7E, 0xFE7F,
+    /* 8 */ 0xFE80, 0xFE81, 0xFE82, 0xFE83, 0xFE84, 0xFE85, 0xFE86, 0xFE87,
+            0xFE88, 0xFE89, 0xFE8A, 0xFE8B, 0xFE8C, 0xFE8D, 0xFE8E, 0xFE8F,
+    /* 9 */ 0xFE90, 0xFE91, 0xFE92, 0xFE93, 0xFE94, 0xFE95, 0xFE96, 0xFE97,
+            0xFE98, 0xFE99, 0xFE9A, 0xFE9B, 0xFE9C, 0xFE9D, 0xFE9E, 0xFE9F,
+    /* A */ 0xFEA0, 0xFEA1, 0xFEA2, 0xFEA3, 0xFEA4, 0xFEA5, 0xFEA6, 0xFEA7,
+            0xFEA8, 0xFEA9, 0xFEAA, 0xFEAB, 0xFEAC, 0xFEAD, 0xFEAE, 0xFEAF,
+    /* B */ 0xFEB0, 0xFEB1, 0xFEB2, 0xFEB3, 0xFEB4, 0xFEB5, 0xFEB6, 0xFEB7,
+            0xFEB8, 0xFEB9, 0xFEBA, 0xFEBB, 0xFEBC, 0xFEBD, 0xFEBE, 0xFEBF,
+    /* C */ 0xFEC0, 0xFEC1, 0xFEC2, 0xFEC3, 0xFEC4, 0xFEC5, 0xFEC6, 0xFEC7,
+            0xFEC8, 0xFEC9, 0xFECA, 0xFECB, 0xFECC, 0xFECD, 0xFECE, 0xFECF,
+    /* D */ 0xFED0, 0xFED1, 0xFED2, 0xFED3, 0xFED4, 0xFED5, 0xFED6, 0xFED7,
+            0xFED8, 0xFED9, 0xFEDA, 0xFEDB, 0xFEDC, 0xFEDD, 0xFEDE, 0xFEDF,
+    /* E */ 0xFEE0, 0xFEE1, 0xFEE2, 0xFEE3, 0xFEE4, 0xFEE5, 0xFEE6, 0xFEE7,
+            0xFEE8, 0xFEE9, 0xFEEA, 0xFEEB, 0xFEEC, 0xFEED, 0xFEEE, 0xFEEF,
+    /* F */ 0xFEF0, 0xFEF1, 0xFEF2, 0xFEF3, 0xFEF4, 0xFEF5, 0xFEF6, 0xFEF7,
+            0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC, 0xFEFD, 0xFEFE, 0x0000,
+
+    // Table 10 (for high byte 0xFF)
+
+    /* 0 */ 0xFF00, 0xFF01, 0xFF02, 0xFF03, 0xFF04, 0xFF05, 0xFF06, 0xFF07,
+            0xFF08, 0xFF09, 0xFF0A, 0xFF0B, 0xFF0C, 0xFF0D, 0xFF0E, 0xFF0F,
+    /* 1 */ 0xFF10, 0xFF11, 0xFF12, 0xFF13, 0xFF14, 0xFF15, 0xFF16, 0xFF17,
+            0xFF18, 0xFF19, 0xFF1A, 0xFF1B, 0xFF1C, 0xFF1D, 0xFF1E, 0xFF1F,
+    /* 2 */ 0xFF20, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47,
+            0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F,
+    /* 3 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57,
+            0xFF58, 0xFF59, 0xFF5A, 0xFF3B, 0xFF3C, 0xFF3D, 0xFF3E, 0xFF3F,
+    /* 4 */ 0xFF40, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47,
+            0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F,
+    /* 5 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57,
+            0xFF58, 0xFF59, 0xFF5A, 0xFF5B, 0xFF5C, 0xFF5D, 0xFF5E, 0xFF5F,
+    /* 6 */ 0xFF60, 0xFF61, 0xFF62, 0xFF63, 0xFF64, 0xFF65, 0xFF66, 0xFF67,
+            0xFF68, 0xFF69, 0xFF6A, 0xFF6B, 0xFF6C, 0xFF6D, 0xFF6E, 0xFF6F,
+    /* 7 */ 0xFF70, 0xFF71, 0xFF72, 0xFF73, 0xFF74, 0xFF75, 0xFF76, 0xFF77,
+            0xFF78, 0xFF79, 0xFF7A, 0xFF7B, 0xFF7C, 0xFF7D, 0xFF7E, 0xFF7F,
+    /* 8 */ 0xFF80, 0xFF81, 0xFF82, 0xFF83, 0xFF84, 0xFF85, 0xFF86, 0xFF87,
+            0xFF88, 0xFF89, 0xFF8A, 0xFF8B, 0xFF8C, 0xFF8D, 0xFF8E, 0xFF8F,
+    /* 9 */ 0xFF90, 0xFF91, 0xFF92, 0xFF93, 0xFF94, 0xFF95, 0xFF96, 0xFF97,
+            0xFF98, 0xFF99, 0xFF9A, 0xFF9B, 0xFF9C, 0xFF9D, 0xFF9E, 0xFF9F,
+    /* A */ 0xFFA0, 0xFFA1, 0xFFA2, 0xFFA3, 0xFFA4, 0xFFA5, 0xFFA6, 0xFFA7,
+            0xFFA8, 0xFFA9, 0xFFAA, 0xFFAB, 0xFFAC, 0xFFAD, 0xFFAE, 0xFFAF,
+    /* B */ 0xFFB0, 0xFFB1, 0xFFB2, 0xFFB3, 0xFFB4, 0xFFB5, 0xFFB6, 0xFFB7,
+            0xFFB8, 0xFFB9, 0xFFBA, 0xFFBB, 0xFFBC, 0xFFBD, 0xFFBE, 0xFFBF,
+    /* C */ 0xFFC0, 0xFFC1, 0xFFC2, 0xFFC3, 0xFFC4, 0xFFC5, 0xFFC6, 0xFFC7,
+            0xFFC8, 0xFFC9, 0xFFCA, 0xFFCB, 0xFFCC, 0xFFCD, 0xFFCE, 0xFFCF,
+    /* D */ 0xFFD0, 0xFFD1, 0xFFD2, 0xFFD3, 0xFFD4, 0xFFD5, 0xFFD6, 0xFFD7,
+            0xFFD8, 0xFFD9, 0xFFDA, 0xFFDB, 0xFFDC, 0xFFDD, 0xFFDE, 0xFFDF,
+    /* E */ 0xFFE0, 0xFFE1, 0xFFE2, 0xFFE3, 0xFFE4, 0xFFE5, 0xFFE6, 0xFFE7,
+            0xFFE8, 0xFFE9, 0xFFEA, 0xFFEB, 0xFFEC, 0xFFED, 0xFFEE, 0xFFEF,
+    /* F */ 0xFFF0, 0xFFF1, 0xFFF2, 0xFFF3, 0xFFF4, 0xFFF5, 0xFFF6, 0xFFF7,
+            0xFFF8, 0xFFF9, 0xFFFA, 0xFFFB, 0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF,
+};
+
+u16 hfsplus_decompose_table[] = {
+	/* base table */
+	0x0010, 0x04c0, 0x0000, 0x06f0, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x07b0,
+	/* char table 0x0___ */
+	0x0020, 0x0070, 0x0160, 0x0190, 0x0230, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x02d0, 0x0340, 0x0360, 0x03b0, 0x03e0, 0x0400, 0x0430,
+	/* char table 0x00__ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0030, 0x0040, 0x0050, 0x0060,
+	/* char values 0x00c_ */
+	0x2042, 0x204a, 0x2052, 0x205a, 0x2062, 0x206a, 0x0000, 0x2072,
+	0x207a, 0x2082, 0x208a, 0x2092, 0x209a, 0x20a2, 0x20aa, 0x20b2,
+	/* char values 0x00d_ */
+	0x0000, 0x20ba, 0x20c2, 0x20ca, 0x20d2, 0x20da, 0x20e2, 0x0000,
+	0x0000, 0x20ea, 0x20f2, 0x20fa, 0x2102, 0x210a, 0x0000, 0x0000,
+	/* char values 0x00e_ */
+	0x2112, 0x211a, 0x2122, 0x212a, 0x2132, 0x213a, 0x0000, 0x2142,
+	0x214a, 0x2152, 0x215a, 0x2162, 0x216a, 0x2172, 0x217a, 0x2182,
+	/* char values 0x00f_ */
+	0x0000, 0x218a, 0x2192, 0x219a, 0x21a2, 0x21aa, 0x21b2, 0x0000,
+	0x0000, 0x21ba, 0x21c2, 0x21ca, 0x21d2, 0x21da, 0x0000, 0x21e2,
+	/* char table 0x01__ */
+	0x0080, 0x0090, 0x00a0, 0x00b0, 0x00c0, 0x00d0, 0x00e0, 0x00f0,
+	0x0000, 0x0000, 0x0100, 0x0110, 0x0120, 0x0130, 0x0140, 0x0150,
+	/* char values 0x010_ */
+	0x21ea, 0x21f2, 0x21fa, 0x2202, 0x220a, 0x2212, 0x221a, 0x2222,
+	0x222a, 0x2232, 0x223a, 0x2242, 0x224a, 0x2252, 0x225a, 0x2262,
+	/* char values 0x011_ */
+	0x0000, 0x0000, 0x226a, 0x2272, 0x227a, 0x2282, 0x228a, 0x2292,
+	0x229a, 0x22a2, 0x22aa, 0x22b2, 0x22ba, 0x22c2, 0x22ca, 0x22d2,
+	/* char values 0x012_ */
+	0x22da, 0x22e2, 0x22ea, 0x22f2, 0x22fa, 0x2302, 0x0000, 0x0000,
+	0x230a, 0x2312, 0x231a, 0x2322, 0x232a, 0x2332, 0x233a, 0x2342,
+	/* char values 0x013_ */
+	0x234a, 0x0000, 0x0000, 0x0000, 0x2352, 0x235a, 0x2362, 0x236a,
+	0x0000, 0x2372, 0x237a, 0x2382, 0x238a, 0x2392, 0x239a, 0x0000,
+	/* char values 0x014_ */
+	0x0000, 0x0000, 0x0000, 0x23a2, 0x23aa, 0x23b2, 0x23ba, 0x23c2,
+	0x23ca, 0x0000, 0x0000, 0x0000, 0x23d2, 0x23da, 0x23e2, 0x23ea,
+	/* char values 0x015_ */
+	0x23f2, 0x23fa, 0x0000, 0x0000, 0x2402, 0x240a, 0x2412, 0x241a,
+	0x2422, 0x242a, 0x2432, 0x243a, 0x2442, 0x244a, 0x2452, 0x245a,
+	/* char values 0x016_ */
+	0x2462, 0x246a, 0x2472, 0x247a, 0x2482, 0x248a, 0x0000, 0x0000,
+	0x2492, 0x249a, 0x24a2, 0x24aa, 0x24b2, 0x24ba, 0x24c2, 0x24ca,
+	/* char values 0x017_ */
+	0x24d2, 0x24da, 0x24e2, 0x24ea, 0x24f2, 0x24fa, 0x2502, 0x250a,
+	0x2512, 0x251a, 0x2522, 0x252a, 0x2532, 0x253a, 0x2542, 0x0000,
+	/* char values 0x01a_ */
+	0x254a, 0x2552, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x255a,
+	/* char values 0x01b_ */
+	0x2562, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x01c_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x256a, 0x2572, 0x257a,
+	/* char values 0x01d_ */
+	0x2582, 0x258a, 0x2592, 0x259a, 0x25a2, 0x25ab, 0x25b7, 0x25c3,
+	0x25cf, 0x25db, 0x25e7, 0x25f3, 0x25ff, 0x0000, 0x260b, 0x2617,
+	/* char values 0x01e_ */
+	0x2623, 0x262f, 0x263a, 0x2642, 0x0000, 0x0000, 0x264a, 0x2652,
+	0x265a, 0x2662, 0x266a, 0x2672, 0x267b, 0x2687, 0x2692, 0x269a,
+	/* char values 0x01f_ */
+	0x26a2, 0x0000, 0x0000, 0x0000, 0x26aa, 0x26b2, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x26bb, 0x26c7, 0x26d2, 0x26da, 0x26e2, 0x26ea,
+	/* char table 0x02__ */
+	0x0170, 0x0180, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x020_ */
+	0x26f2, 0x26fa, 0x2702, 0x270a, 0x2712, 0x271a, 0x2722, 0x272a,
+	0x2732, 0x273a, 0x2742, 0x274a, 0x2752, 0x275a, 0x2762, 0x276a,
+	/* char values 0x021_ */
+	0x2772, 0x277a, 0x2782, 0x278a, 0x2792, 0x279a, 0x27a2, 0x27aa,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0x03__ */
+	0x0000, 0x01a0, 0x0000, 0x0000, 0x01b0, 0x0000, 0x0000, 0x01c0,
+	0x01d0, 0x01e0, 0x01f0, 0x0200, 0x0210, 0x0220, 0x0000, 0x0000,
+	/* char values 0x031_ */
+	0x27b2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x034_ */
+	0x27b9, 0x27bd, 0x0000, 0x27c1, 0x27c6, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x037_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x27cd, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x27d1, 0x0000,
+	/* char values 0x038_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x27d6, 0x27de, 0x27e5,
+	0x27ea, 0x27f2, 0x27fa, 0x0000, 0x2802, 0x0000, 0x280a, 0x2812,
+	/* char values 0x039_ */
+	0x281b, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x03a_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x2826, 0x282e, 0x2836, 0x283e, 0x2846, 0x284e,
+	/* char values 0x03b_ */
+	0x2857, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x03c_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x2862, 0x286a, 0x2872, 0x287a, 0x2882, 0x0000,
+	/* char values 0x03d_ */
+	0x0000, 0x0000, 0x0000, 0x288a, 0x2892, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0x04__ */
+	0x0240, 0x0250, 0x0000, 0x0260, 0x0000, 0x0270, 0x0000, 0x0280,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0290, 0x02a0, 0x02b0, 0x02c0,
+	/* char values 0x040_ */
+	0x0000, 0x289a, 0x0000, 0x28a2, 0x0000, 0x0000, 0x0000, 0x28aa,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x28b2, 0x0000, 0x28ba, 0x0000,
+	/* char values 0x041_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x28c2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x043_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x28ca, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x045_ */
+	0x0000, 0x28d2, 0x0000, 0x28da, 0x0000, 0x0000, 0x0000, 0x28e2,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x28ea, 0x0000, 0x28f2, 0x0000,
+	/* char values 0x047_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x28fa, 0x2902,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x04c_ */
+	0x0000, 0x290a, 0x2912, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x04d_ */
+	0x291a, 0x2922, 0x292a, 0x2932, 0x2939, 0x293d, 0x2942, 0x294a,
+	0x2951, 0x2955, 0x295a, 0x2962, 0x296a, 0x2972, 0x297a, 0x2982,
+	/* char values 0x04e_ */
+	0x2989, 0x298d, 0x2992, 0x299a, 0x29a2, 0x29aa, 0x29b2, 0x29ba,
+	0x29c1, 0x29c5, 0x29ca, 0x29d2, 0x0000, 0x0000, 0x29da, 0x29e2,
+	/* char values 0x04f_ */
+	0x29ea, 0x29f2, 0x29fa, 0x2a02, 0x2a0a, 0x2a12, 0x0000, 0x0000,
+	0x2a1a, 0x2a22, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0x09__ */
+	0x0000, 0x0000, 0x02e0, 0x02f0, 0x0000, 0x0300, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0310, 0x0320, 0x0330, 0x0000, 0x0000,
+	/* char values 0x092_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x2a2a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x093_ */
+	0x0000, 0x2a32, 0x0000, 0x0000, 0x2a3a, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x095_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x2a42, 0x2a4a, 0x2a52, 0x2a5a, 0x2a62, 0x2a6a, 0x2a72, 0x2a7a,
+	/* char values 0x09b_ */
+	0x2a82, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x09c_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x2a8a, 0x2a92, 0x0000, 0x0000, 0x0000,
+	/* char values 0x09d_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x2a9a, 0x2aa2, 0x0000, 0x2aaa,
+	/* char table 0x0a__ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0350, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0a5_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x2ab2, 0x2aba, 0x2ac2, 0x2aca, 0x0000, 0x2ad2, 0x0000,
+	/* char table 0x0b__ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0370, 0x0380, 0x0000, 0x0000,
+	0x0000, 0x0390, 0x0000, 0x0000, 0x03a0, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0b4_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x2ada, 0x0000, 0x0000, 0x2ae2, 0x2aea, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0b5_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x2af2, 0x2afa, 0x0000, 0x2b02,
+	/* char values 0x0b9_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x2b0a, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0bc_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x2b12, 0x2b1a, 0x2b22, 0x0000, 0x0000, 0x0000,
+	/* char table 0x0c__ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x03c0, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x03d0, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0c4_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x2b2a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0cc_ */
+	0x2b32, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b3a,
+	0x2b42, 0x0000, 0x2b4a, 0x2b53, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0x0d__ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x03f0, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0d4_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x2b5e, 0x2b66, 0x2b6e, 0x0000, 0x0000, 0x0000,
+	/* char table 0x0e__ */
+	0x0000, 0x0000, 0x0000, 0x0410, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0420, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0e3_ */
+	0x0000, 0x0000, 0x0000, 0x2b76, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0eb_ */
+	0x0000, 0x0000, 0x0000, 0x2b7e, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0x0f__ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0440, 0x0450, 0x0460, 0x0470,
+	0x0480, 0x0490, 0x04a0, 0x04b0, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0f4_ */
+	0x0000, 0x0000, 0x0000, 0x2b86, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b8e, 0x0000, 0x0000,
+	/* char values 0x0f5_ */
+	0x0000, 0x0000, 0x2b96, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b9e,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x2ba6, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0f6_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x2bae, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0f7_ */
+	0x0000, 0x0000, 0x0000, 0x2bb6, 0x0000, 0x2bbe, 0x2bc6, 0x2bcf,
+	0x2bda, 0x2be3, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0f8_ */
+	0x0000, 0x2bee, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0f9_ */
+	0x0000, 0x0000, 0x0000, 0x2bf6, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2bfe, 0x0000, 0x0000,
+	/* char values 0x0fa_ */
+	0x0000, 0x0000, 0x2c06, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c0e,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x2c16, 0x0000, 0x0000, 0x0000,
+	/* char values 0x0fb_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x2c1e, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0x1___ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x04d0, 0x05e0,
+	/* char table 0x1e__ */
+	0x04e0, 0x04f0, 0x0500, 0x0510, 0x0520, 0x0530, 0x0540, 0x0550,
+	0x0560, 0x0570, 0x0580, 0x0590, 0x05a0, 0x05b0, 0x05c0, 0x05d0,
+	/* char values 0x1e0_ */
+	0x2c26, 0x2c2e, 0x2c36, 0x2c3e, 0x2c46, 0x2c4e, 0x2c56, 0x2c5e,
+	0x2c67, 0x2c73, 0x2c7e, 0x2c86, 0x2c8e, 0x2c96, 0x2c9e, 0x2ca6,
+	/* char values 0x1e1_ */
+	0x2cae, 0x2cb6, 0x2cbe, 0x2cc6, 0x2ccf, 0x2cdb, 0x2ce7, 0x2cf3,
+	0x2cfe, 0x2d06, 0x2d0e, 0x2d16, 0x2d1f, 0x2d2b, 0x2d36, 0x2d3e,
+	/* char values 0x1e2_ */
+	0x2d46, 0x2d4e, 0x2d56, 0x2d5e, 0x2d66, 0x2d6e, 0x2d76, 0x2d7e,
+	0x2d86, 0x2d8e, 0x2d96, 0x2d9e, 0x2da6, 0x2dae, 0x2db7, 0x2dc3,
+	/* char values 0x1e3_ */
+	0x2dce, 0x2dd6, 0x2dde, 0x2de6, 0x2dee, 0x2df6, 0x2dfe, 0x2e06,
+	0x2e0f, 0x2e1b, 0x2e26, 0x2e2e, 0x2e36, 0x2e3e, 0x2e46, 0x2e4e,
+	/* char values 0x1e4_ */
+	0x2e56, 0x2e5e, 0x2e66, 0x2e6e, 0x2e76, 0x2e7e, 0x2e86, 0x2e8e,
+	0x2e96, 0x2e9e, 0x2ea6, 0x2eae, 0x2eb7, 0x2ec3, 0x2ecf, 0x2edb,
+	/* char values 0x1e5_ */
+	0x2ee7, 0x2ef3, 0x2eff, 0x2f0b, 0x2f16, 0x2f1e, 0x2f26, 0x2f2e,
+	0x2f36, 0x2f3e, 0x2f46, 0x2f4e, 0x2f57, 0x2f63, 0x2f6e, 0x2f76,
+	/* char values 0x1e6_ */
+	0x2f7e, 0x2f86, 0x2f8e, 0x2f96, 0x2f9f, 0x2fab, 0x2fb7, 0x2fc3,
+	0x2fcf, 0x2fdb, 0x2fe6, 0x2fee, 0x2ff6, 0x2ffe, 0x3006, 0x300e,
+	/* char values 0x1e7_ */
+	0x3016, 0x301e, 0x3026, 0x302e, 0x3036, 0x303e, 0x3046, 0x304e,
+	0x3057, 0x3063, 0x306f, 0x307b, 0x3086, 0x308e, 0x3096, 0x309e,
+	/* char values 0x1e8_ */
+	0x30a6, 0x30ae, 0x30b6, 0x30be, 0x30c6, 0x30ce, 0x30d6, 0x30de,
+	0x30e6, 0x30ee, 0x30f6, 0x30fe, 0x3106, 0x310e, 0x3116, 0x311e,
+	/* char values 0x1e9_ */
+	0x3126, 0x312e, 0x3136, 0x313e, 0x3146, 0x314e, 0x3156, 0x315e,
+	0x3166, 0x316e, 0x0000, 0x3176, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x1ea_ */
+	0x317e, 0x3186, 0x318e, 0x3196, 0x319f, 0x31ab, 0x31b7, 0x31c3,
+	0x31cf, 0x31db, 0x31e7, 0x31f3, 0x31ff, 0x320b, 0x3217, 0x3223,
+	/* char values 0x1eb_ */
+	0x322f, 0x323b, 0x3247, 0x3253, 0x325f, 0x326b, 0x3277, 0x3283,
+	0x328e, 0x3296, 0x329e, 0x32a6, 0x32ae, 0x32b6, 0x32bf, 0x32cb,
+	/* char values 0x1ec_ */
+	0x32d7, 0x32e3, 0x32ef, 0x32fb, 0x3307, 0x3313, 0x331f, 0x332b,
+	0x3336, 0x333e, 0x3346, 0x334e, 0x3356, 0x335e, 0x3366, 0x336e,
+	/* char values 0x1ed_ */
+	0x3377, 0x3383, 0x338f, 0x339b, 0x33a7, 0x33b3, 0x33bf, 0x33cb,
+	0x33d7, 0x33e3, 0x33ef, 0x33fb, 0x3407, 0x3413, 0x341f, 0x342b,
+	/* char values 0x1ee_ */
+	0x3437, 0x3443, 0x344f, 0x345b, 0x3466, 0x346e, 0x3476, 0x347e,
+	0x3487, 0x3493, 0x349f, 0x34ab, 0x34b7, 0x34c3, 0x34cf, 0x34db,
+	/* char values 0x1ef_ */
+	0x34e7, 0x34f3, 0x34fe, 0x3506, 0x350e, 0x3516, 0x351e, 0x3526,
+	0x352e, 0x3536, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0x1f__ */
+	0x05f0, 0x0600, 0x0610, 0x0620, 0x0630, 0x0640, 0x0650, 0x0660,
+	0x0670, 0x0680, 0x0690, 0x06a0, 0x06b0, 0x06c0, 0x06d0, 0x06e0,
+	/* char values 0x1f0_ */
+	0x353e, 0x3546, 0x354f, 0x355b, 0x3567, 0x3573, 0x357f, 0x358b,
+	0x3596, 0x359e, 0x35a7, 0x35b3, 0x35bf, 0x35cb, 0x35d7, 0x35e3,
+	/* char values 0x1f1_ */
+	0x35ee, 0x35f6, 0x35ff, 0x360b, 0x3617, 0x3623, 0x0000, 0x0000,
+	0x362e, 0x3636, 0x363f, 0x364b, 0x3657, 0x3663, 0x0000, 0x0000,
+	/* char values 0x1f2_ */
+	0x366e, 0x3676, 0x367f, 0x368b, 0x3697, 0x36a3, 0x36af, 0x36bb,
+	0x36c6, 0x36ce, 0x36d7, 0x36e3, 0x36ef, 0x36fb, 0x3707, 0x3713,
+	/* char values 0x1f3_ */
+	0x371e, 0x3726, 0x372f, 0x373b, 0x3747, 0x3753, 0x375f, 0x376b,
+	0x3776, 0x377e, 0x3787, 0x3793, 0x379f, 0x37ab, 0x37b7, 0x37c3,
+	/* char values 0x1f4_ */
+	0x37ce, 0x37d6, 0x37df, 0x37eb, 0x37f7, 0x3803, 0x0000, 0x0000,
+	0x380e, 0x3816, 0x381f, 0x382b, 0x3837, 0x3843, 0x0000, 0x0000,
+	/* char values 0x1f5_ */
+	0x384e, 0x3856, 0x385f, 0x386b, 0x3877, 0x3883, 0x388f, 0x389b,
+	0x0000, 0x38a6, 0x0000, 0x38af, 0x0000, 0x38bb, 0x0000, 0x38c7,
+	/* char values 0x1f6_ */
+	0x38d2, 0x38da, 0x38e3, 0x38ef, 0x38fb, 0x3907, 0x3913, 0x391f,
+	0x392a, 0x3932, 0x393b, 0x3947, 0x3953, 0x395f, 0x396b, 0x3977,
+	/* char values 0x1f7_ */
+	0x3982, 0x398a, 0x3992, 0x399a, 0x39a2, 0x39aa, 0x39b2, 0x39ba,
+	0x39c2, 0x39ca, 0x39d2, 0x39da, 0x39e2, 0x39ea, 0x0000, 0x0000,
+	/* char values 0x1f8_ */
+	0x39f3, 0x39ff, 0x3a0c, 0x3a1c, 0x3a2c, 0x3a3c, 0x3a4c, 0x3a5c,
+	0x3a6b, 0x3a77, 0x3a84, 0x3a94, 0x3aa4, 0x3ab4, 0x3ac4, 0x3ad4,
+	/* char values 0x1f9_ */
+	0x3ae3, 0x3aef, 0x3afc, 0x3b0c, 0x3b1c, 0x3b2c, 0x3b3c, 0x3b4c,
+	0x3b5b, 0x3b67, 0x3b74, 0x3b84, 0x3b94, 0x3ba4, 0x3bb4, 0x3bc4,
+	/* char values 0x1fa_ */
+	0x3bd3, 0x3bdf, 0x3bec, 0x3bfc, 0x3c0c, 0x3c1c, 0x3c2c, 0x3c3c,
+	0x3c4b, 0x3c57, 0x3c64, 0x3c74, 0x3c84, 0x3c94, 0x3ca4, 0x3cb4,
+	/* char values 0x1fb_ */
+	0x3cc2, 0x3cca, 0x3cd3, 0x3cde, 0x3ce7, 0x0000, 0x3cf2, 0x3cfb,
+	0x3d06, 0x3d0e, 0x3d16, 0x3d1e, 0x3d26, 0x0000, 0x3d2d, 0x0000,
+	/* char values 0x1fc_ */
+	0x0000, 0x3d32, 0x3d3b, 0x3d46, 0x3d4f, 0x0000, 0x3d5a, 0x3d63,
+	0x3d6e, 0x3d76, 0x3d7e, 0x3d86, 0x3d8e, 0x3d96, 0x3d9e, 0x3da6,
+	/* char values 0x1fd_ */
+	0x3dae, 0x3db6, 0x3dbf, 0x3dcb, 0x0000, 0x0000, 0x3dd6, 0x3ddf,
+	0x3dea, 0x3df2, 0x3dfa, 0x3e02, 0x0000, 0x3e0a, 0x3e12, 0x3e1a,
+	/* char values 0x1fe_ */
+	0x3e22, 0x3e2a, 0x3e33, 0x3e3f, 0x3e4a, 0x3e52, 0x3e5a, 0x3e63,
+	0x3e6e, 0x3e76, 0x3e7e, 0x3e86, 0x3e8e, 0x3e96, 0x3e9e, 0x3ea5,
+	/* char values 0x1ff_ */
+	0x0000, 0x0000, 0x3eab, 0x3eb6, 0x3ebf, 0x0000, 0x3eca, 0x3ed3,
+	0x3ede, 0x3ee6, 0x3eee, 0x3ef6, 0x3efe, 0x3f05, 0x0000, 0x0000,
+	/* char table 0x3___ */
+	0x0700, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0x30__ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0710, 0x0720, 0x0730, 0x0740,
+	0x0000, 0x0750, 0x0760, 0x0770, 0x0780, 0x0790, 0x0000, 0x07a0,
+	/* char values 0x304_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x3f0a, 0x0000, 0x3f12, 0x0000,
+	/* char values 0x305_ */
+	0x3f1a, 0x0000, 0x3f22, 0x0000, 0x3f2a, 0x0000, 0x3f32, 0x0000,
+	0x3f3a, 0x0000, 0x3f42, 0x0000, 0x3f4a, 0x0000, 0x3f52, 0x0000,
+	/* char values 0x306_ */
+	0x3f5a, 0x0000, 0x3f62, 0x0000, 0x0000, 0x3f6a, 0x0000, 0x3f72,
+	0x0000, 0x3f7a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x307_ */
+	0x3f82, 0x3f8a, 0x0000, 0x3f92, 0x3f9a, 0x0000, 0x3fa2, 0x3faa,
+	0x0000, 0x3fb2, 0x3fba, 0x0000, 0x3fc2, 0x3fca, 0x0000, 0x0000,
+	/* char values 0x309_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x3fd2, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x3fda, 0x0000,
+	/* char values 0x30a_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x3fe2, 0x0000, 0x3fea, 0x0000,
+	/* char values 0x30b_ */
+	0x3ff2, 0x0000, 0x3ffa, 0x0000, 0x4002, 0x0000, 0x400a, 0x0000,
+	0x4012, 0x0000, 0x401a, 0x0000, 0x4022, 0x0000, 0x402a, 0x0000,
+	/* char values 0x30c_ */
+	0x4032, 0x0000, 0x403a, 0x0000, 0x0000, 0x4042, 0x0000, 0x404a,
+	0x0000, 0x4052, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0x30d_ */
+	0x405a, 0x4062, 0x0000, 0x406a, 0x4072, 0x0000, 0x407a, 0x4082,
+	0x0000, 0x408a, 0x4092, 0x0000, 0x409a, 0x40a2, 0x0000, 0x0000,
+	/* char values 0x30f_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x40aa, 0x0000, 0x0000, 0x40b2,
+	0x40ba, 0x40c2, 0x40ca, 0x0000, 0x0000, 0x0000, 0x40d2, 0x0000,
+	/* char table 0xf___ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x07c0, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char table 0xfb__ */
+	0x0000, 0x07d0, 0x07e0, 0x07f0, 0x0800, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	/* char values 0xfb1_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x40da,
+	/* char values 0xfb2_ */
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x40e2, 0x40ea, 0x40f3, 0x40ff, 0x410a, 0x4112,
+	/* char values 0xfb3_ */
+	0x411a, 0x4122, 0x412a, 0x4132, 0x413a, 0x4142, 0x414a, 0x0000,
+	0x4152, 0x415a, 0x4162, 0x416a, 0x4172, 0x0000, 0x417a, 0x0000,
+	/* char values 0xfb4_ */
+	0x4182, 0x418a, 0x0000, 0x4192, 0x419a, 0x0000, 0x41a2, 0x41aa,
+	0x41b2, 0x41ba, 0x41c2, 0x41ca, 0x41d2, 0x41da, 0x41e2, 0x0000,
+	/* decomposed characters */
+	0x0041, 0x0300, 0x0041, 0x0301, 0x0041, 0x0302, 0x0041, 0x0303,
+	0x0041, 0x0308, 0x0041, 0x030a, 0x0043, 0x0327, 0x0045, 0x0300,
+	0x0045, 0x0301, 0x0045, 0x0302, 0x0045, 0x0308, 0x0049, 0x0300,
+	0x0049, 0x0301, 0x0049, 0x0302, 0x0049, 0x0308, 0x004e, 0x0303,
+	0x004f, 0x0300, 0x004f, 0x0301, 0x004f, 0x0302, 0x004f, 0x0303,
+	0x004f, 0x0308, 0x0055, 0x0300, 0x0055, 0x0301, 0x0055, 0x0302,
+	0x0055, 0x0308, 0x0059, 0x0301, 0x0061, 0x0300, 0x0061, 0x0301,
+	0x0061, 0x0302, 0x0061, 0x0303, 0x0061, 0x0308, 0x0061, 0x030a,
+	0x0063, 0x0327, 0x0065, 0x0300, 0x0065, 0x0301, 0x0065, 0x0302,
+	0x0065, 0x0308, 0x0069, 0x0300, 0x0069, 0x0301, 0x0069, 0x0302,
+	0x0069, 0x0308, 0x006e, 0x0303, 0x006f, 0x0300, 0x006f, 0x0301,
+	0x006f, 0x0302, 0x006f, 0x0303, 0x006f, 0x0308, 0x0075, 0x0300,
+	0x0075, 0x0301, 0x0075, 0x0302, 0x0075, 0x0308, 0x0079, 0x0301,
+	0x0079, 0x0308, 0x0041, 0x0304, 0x0061, 0x0304, 0x0041, 0x0306,
+	0x0061, 0x0306, 0x0041, 0x0328, 0x0061, 0x0328, 0x0043, 0x0301,
+	0x0063, 0x0301, 0x0043, 0x0302, 0x0063, 0x0302, 0x0043, 0x0307,
+	0x0063, 0x0307, 0x0043, 0x030c, 0x0063, 0x030c, 0x0044, 0x030c,
+	0x0064, 0x030c, 0x0045, 0x0304, 0x0065, 0x0304, 0x0045, 0x0306,
+	0x0065, 0x0306, 0x0045, 0x0307, 0x0065, 0x0307, 0x0045, 0x0328,
+	0x0065, 0x0328, 0x0045, 0x030c, 0x0065, 0x030c, 0x0047, 0x0302,
+	0x0067, 0x0302, 0x0047, 0x0306, 0x0067, 0x0306, 0x0047, 0x0307,
+	0x0067, 0x0307, 0x0047, 0x0327, 0x0067, 0x0327, 0x0048, 0x0302,
+	0x0068, 0x0302, 0x0049, 0x0303, 0x0069, 0x0303, 0x0049, 0x0304,
+	0x0069, 0x0304, 0x0049, 0x0306, 0x0069, 0x0306, 0x0049, 0x0328,
+	0x0069, 0x0328, 0x0049, 0x0307, 0x004a, 0x0302, 0x006a, 0x0302,
+	0x004b, 0x0327, 0x006b, 0x0327, 0x004c, 0x0301, 0x006c, 0x0301,
+	0x004c, 0x0327, 0x006c, 0x0327, 0x004c, 0x030c, 0x006c, 0x030c,
+	0x004e, 0x0301, 0x006e, 0x0301, 0x004e, 0x0327, 0x006e, 0x0327,
+	0x004e, 0x030c, 0x006e, 0x030c, 0x004f, 0x0304, 0x006f, 0x0304,
+	0x004f, 0x0306, 0x006f, 0x0306, 0x004f, 0x030b, 0x006f, 0x030b,
+	0x0052, 0x0301, 0x0072, 0x0301, 0x0052, 0x0327, 0x0072, 0x0327,
+	0x0052, 0x030c, 0x0072, 0x030c, 0x0053, 0x0301, 0x0073, 0x0301,
+	0x0053, 0x0302, 0x0073, 0x0302, 0x0053, 0x0327, 0x0073, 0x0327,
+	0x0053, 0x030c, 0x0073, 0x030c, 0x0054, 0x0327, 0x0074, 0x0327,
+	0x0054, 0x030c, 0x0074, 0x030c, 0x0055, 0x0303, 0x0075, 0x0303,
+	0x0055, 0x0304, 0x0075, 0x0304, 0x0055, 0x0306, 0x0075, 0x0306,
+	0x0055, 0x030a, 0x0075, 0x030a, 0x0055, 0x030b, 0x0075, 0x030b,
+	0x0055, 0x0328, 0x0075, 0x0328, 0x0057, 0x0302, 0x0077, 0x0302,
+	0x0059, 0x0302, 0x0079, 0x0302, 0x0059, 0x0308, 0x005a, 0x0301,
+	0x007a, 0x0301, 0x005a, 0x0307, 0x007a, 0x0307, 0x005a, 0x030c,
+	0x007a, 0x030c, 0x004f, 0x031b, 0x006f, 0x031b, 0x0055, 0x031b,
+	0x0075, 0x031b, 0x0041, 0x030c, 0x0061, 0x030c, 0x0049, 0x030c,
+	0x0069, 0x030c, 0x004f, 0x030c, 0x006f, 0x030c, 0x0055, 0x030c,
+	0x0075, 0x030c, 0x0055, 0x0308, 0x0304, 0x0075, 0x0308, 0x0304,
+	0x0055, 0x0308, 0x0301, 0x0075, 0x0308, 0x0301, 0x0055, 0x0308,
+	0x030c, 0x0075, 0x0308, 0x030c, 0x0055, 0x0308, 0x0300, 0x0075,
+	0x0308, 0x0300, 0x0041, 0x0308, 0x0304, 0x0061, 0x0308, 0x0304,
+	0x0041, 0x0307, 0x0304, 0x0061, 0x0307, 0x0304, 0x00c6, 0x0304,
+	0x00e6, 0x0304, 0x0047, 0x030c, 0x0067, 0x030c, 0x004b, 0x030c,
+	0x006b, 0x030c, 0x004f, 0x0328, 0x006f, 0x0328, 0x004f, 0x0328,
+	0x0304, 0x006f, 0x0328, 0x0304, 0x01b7, 0x030c, 0x0292, 0x030c,
+	0x006a, 0x030c, 0x0047, 0x0301, 0x0067, 0x0301, 0x0041, 0x030a,
+	0x0301, 0x0061, 0x030a, 0x0301, 0x00c6, 0x0301, 0x00e6, 0x0301,
+	0x00d8, 0x0301, 0x00f8, 0x0301, 0x0041, 0x030f, 0x0061, 0x030f,
+	0x0041, 0x0311, 0x0061, 0x0311, 0x0045, 0x030f, 0x0065, 0x030f,
+	0x0045, 0x0311, 0x0065, 0x0311, 0x0049, 0x030f, 0x0069, 0x030f,
+	0x0049, 0x0311, 0x0069, 0x0311, 0x004f, 0x030f, 0x006f, 0x030f,
+	0x004f, 0x0311, 0x006f, 0x0311, 0x0052, 0x030f, 0x0072, 0x030f,
+	0x0052, 0x0311, 0x0072, 0x0311, 0x0055, 0x030f, 0x0075, 0x030f,
+	0x0055, 0x0311, 0x0075, 0x0311, 0x0306, 0x0307, 0x0300, 0x0301,
+	0x0313, 0x0308, 0x030d, 0x02b9, 0x003b, 0x00a8, 0x030d, 0x0391,
+	0x030d, 0x00b7, 0x0395, 0x030d, 0x0397, 0x030d, 0x0399, 0x030d,
+	0x039f, 0x030d, 0x03a5, 0x030d, 0x03a9, 0x030d, 0x03b9, 0x0308,
+	0x030d, 0x0399, 0x0308, 0x03a5, 0x0308, 0x03b1, 0x030d, 0x03b5,
+	0x030d, 0x03b7, 0x030d, 0x03b9, 0x030d, 0x03c5, 0x0308, 0x030d,
+	0x03b9, 0x0308, 0x03c5, 0x0308, 0x03bf, 0x030d, 0x03c5, 0x030d,
+	0x03c9, 0x030d, 0x03d2, 0x030d, 0x03d2, 0x0308, 0x0415, 0x0308,
+	0x0413, 0x0301, 0x0406, 0x0308, 0x041a, 0x0301, 0x0423, 0x0306,
+	0x0418, 0x0306, 0x0438, 0x0306, 0x0435, 0x0308, 0x0433, 0x0301,
+	0x0456, 0x0308, 0x043a, 0x0301, 0x0443, 0x0306, 0x0474, 0x030f,
+	0x0475, 0x030f, 0x0416, 0x0306, 0x0436, 0x0306, 0x0410, 0x0306,
+	0x0430, 0x0306, 0x0410, 0x0308, 0x0430, 0x0308, 0x00c6, 0x00e6,
+	0x0415, 0x0306, 0x0435, 0x0306, 0x018f, 0x0259, 0x018f, 0x0308,
+	0x0259, 0x0308, 0x0416, 0x0308, 0x0436, 0x0308, 0x0417, 0x0308,
+	0x0437, 0x0308, 0x01b7, 0x0292, 0x0418, 0x0304, 0x0438, 0x0304,
+	0x0418, 0x0308, 0x0438, 0x0308, 0x041e, 0x0308, 0x043e, 0x0308,
+	0x019f, 0x0275, 0x019f, 0x0308, 0x0275, 0x0308, 0x0423, 0x0304,
+	0x0443, 0x0304, 0x0423, 0x0308, 0x0443, 0x0308, 0x0423, 0x030b,
+	0x0443, 0x030b, 0x0427, 0x0308, 0x0447, 0x0308, 0x042b, 0x0308,
+	0x044b, 0x0308, 0x0928, 0x093c, 0x0930, 0x093c, 0x0933, 0x093c,
+	0x0915, 0x093c, 0x0916, 0x093c, 0x0917, 0x093c, 0x091c, 0x093c,
+	0x0921, 0x093c, 0x0922, 0x093c, 0x092b, 0x093c, 0x092f, 0x093c,
+	0x09ac, 0x09bc, 0x09c7, 0x09be, 0x09c7, 0x09d7, 0x09a1, 0x09bc,
+	0x09a2, 0x09bc, 0x09af, 0x09bc, 0x0a16, 0x0a3c, 0x0a17, 0x0a3c,
+	0x0a1c, 0x0a3c, 0x0a21, 0x0a3c, 0x0a2b, 0x0a3c, 0x0b47, 0x0b56,
+	0x0b47, 0x0b3e, 0x0b47, 0x0b57, 0x0b21, 0x0b3c, 0x0b22, 0x0b3c,
+	0x0b2f, 0x0b3c, 0x0b92, 0x0bd7, 0x0bc6, 0x0bbe, 0x0bc7, 0x0bbe,
+	0x0bc6, 0x0bd7, 0x0c46, 0x0c56, 0x0cbf, 0x0cd5, 0x0cc6, 0x0cd5,
+	0x0cc6, 0x0cd6, 0x0cc6, 0x0cc2, 0x0cc6, 0x0cc2, 0x0cd5, 0x0d46,
+	0x0d3e, 0x0d47, 0x0d3e, 0x0d46, 0x0d57, 0x0e4d, 0x0e32, 0x0ecd,
+	0x0eb2, 0x0f42, 0x0fb7, 0x0f4c, 0x0fb7, 0x0f51, 0x0fb7, 0x0f56,
+	0x0fb7, 0x0f5b, 0x0fb7, 0x0f40, 0x0fb5, 0x0f72, 0x0f71, 0x0f74,
+	0x0f71, 0x0fb2, 0x0f80, 0x0fb2, 0x0f80, 0x0f71, 0x0fb3, 0x0f80,
+	0x0fb3, 0x0f80, 0x0f71, 0x0f80, 0x0f71, 0x0f92, 0x0fb7, 0x0f9c,
+	0x0fb7, 0x0fa1, 0x0fb7, 0x0fa6, 0x0fb7, 0x0fab, 0x0fb7, 0x0f90,
+	0x0fb5, 0x0041, 0x0325, 0x0061, 0x0325, 0x0042, 0x0307, 0x0062,
+	0x0307, 0x0042, 0x0323, 0x0062, 0x0323, 0x0042, 0x0331, 0x0062,
+	0x0331, 0x0043, 0x0327, 0x0301, 0x0063, 0x0327, 0x0301, 0x0044,
+	0x0307, 0x0064, 0x0307, 0x0044, 0x0323, 0x0064, 0x0323, 0x0044,
+	0x0331, 0x0064, 0x0331, 0x0044, 0x0327, 0x0064, 0x0327, 0x0044,
+	0x032d, 0x0064, 0x032d, 0x0045, 0x0304, 0x0300, 0x0065, 0x0304,
+	0x0300, 0x0045, 0x0304, 0x0301, 0x0065, 0x0304, 0x0301, 0x0045,
+	0x032d, 0x0065, 0x032d, 0x0045, 0x0330, 0x0065, 0x0330, 0x0045,
+	0x0327, 0x0306, 0x0065, 0x0327, 0x0306, 0x0046, 0x0307, 0x0066,
+	0x0307, 0x0047, 0x0304, 0x0067, 0x0304, 0x0048, 0x0307, 0x0068,
+	0x0307, 0x0048, 0x0323, 0x0068, 0x0323, 0x0048, 0x0308, 0x0068,
+	0x0308, 0x0048, 0x0327, 0x0068, 0x0327, 0x0048, 0x032e, 0x0068,
+	0x032e, 0x0049, 0x0330, 0x0069, 0x0330, 0x0049, 0x0308, 0x0301,
+	0x0069, 0x0308, 0x0301, 0x004b, 0x0301, 0x006b, 0x0301, 0x004b,
+	0x0323, 0x006b, 0x0323, 0x004b, 0x0331, 0x006b, 0x0331, 0x004c,
+	0x0323, 0x006c, 0x0323, 0x004c, 0x0323, 0x0304, 0x006c, 0x0323,
+	0x0304, 0x004c, 0x0331, 0x006c, 0x0331, 0x004c, 0x032d, 0x006c,
+	0x032d, 0x004d, 0x0301, 0x006d, 0x0301, 0x004d, 0x0307, 0x006d,
+	0x0307, 0x004d, 0x0323, 0x006d, 0x0323, 0x004e, 0x0307, 0x006e,
+	0x0307, 0x004e, 0x0323, 0x006e, 0x0323, 0x004e, 0x0331, 0x006e,
+	0x0331, 0x004e, 0x032d, 0x006e, 0x032d, 0x004f, 0x0303, 0x0301,
+	0x006f, 0x0303, 0x0301, 0x004f, 0x0303, 0x0308, 0x006f, 0x0303,
+	0x0308, 0x004f, 0x0304, 0x0300, 0x006f, 0x0304, 0x0300, 0x004f,
+	0x0304, 0x0301, 0x006f, 0x0304, 0x0301, 0x0050, 0x0301, 0x0070,
+	0x0301, 0x0050, 0x0307, 0x0070, 0x0307, 0x0052, 0x0307, 0x0072,
+	0x0307, 0x0052, 0x0323, 0x0072, 0x0323, 0x0052, 0x0323, 0x0304,
+	0x0072, 0x0323, 0x0304, 0x0052, 0x0331, 0x0072, 0x0331, 0x0053,
+	0x0307, 0x0073, 0x0307, 0x0053, 0x0323, 0x0073, 0x0323, 0x0053,
+	0x0301, 0x0307, 0x0073, 0x0301, 0x0307, 0x0053, 0x030c, 0x0307,
+	0x0073, 0x030c, 0x0307, 0x0053, 0x0323, 0x0307, 0x0073, 0x0323,
+	0x0307, 0x0054, 0x0307, 0x0074, 0x0307, 0x0054, 0x0323, 0x0074,
+	0x0323, 0x0054, 0x0331, 0x0074, 0x0331, 0x0054, 0x032d, 0x0074,
+	0x032d, 0x0055, 0x0324, 0x0075, 0x0324, 0x0055, 0x0330, 0x0075,
+	0x0330, 0x0055, 0x032d, 0x0075, 0x032d, 0x0055, 0x0303, 0x0301,
+	0x0075, 0x0303, 0x0301, 0x0055, 0x0304, 0x0308, 0x0075, 0x0304,
+	0x0308, 0x0056, 0x0303, 0x0076, 0x0303, 0x0056, 0x0323, 0x0076,
+	0x0323, 0x0057, 0x0300, 0x0077, 0x0300, 0x0057, 0x0301, 0x0077,
+	0x0301, 0x0057, 0x0308, 0x0077, 0x0308, 0x0057, 0x0307, 0x0077,
+	0x0307, 0x0057, 0x0323, 0x0077, 0x0323, 0x0058, 0x0307, 0x0078,
+	0x0307, 0x0058, 0x0308, 0x0078, 0x0308, 0x0059, 0x0307, 0x0079,
+	0x0307, 0x005a, 0x0302, 0x007a, 0x0302, 0x005a, 0x0323, 0x007a,
+	0x0323, 0x005a, 0x0331, 0x007a, 0x0331, 0x0068, 0x0331, 0x0074,
+	0x0308, 0x0077, 0x030a, 0x0079, 0x030a, 0x017f, 0x0307, 0x0041,
+	0x0323, 0x0061, 0x0323, 0x0041, 0x0309, 0x0061, 0x0309, 0x0041,
+	0x0302, 0x0301, 0x0061, 0x0302, 0x0301, 0x0041, 0x0302, 0x0300,
+	0x0061, 0x0302, 0x0300, 0x0041, 0x0302, 0x0309, 0x0061, 0x0302,
+	0x0309, 0x0041, 0x0302, 0x0303, 0x0061, 0x0302, 0x0303, 0x0041,
+	0x0323, 0x0302, 0x0061, 0x0323, 0x0302, 0x0041, 0x0306, 0x0301,
+	0x0061, 0x0306, 0x0301, 0x0041, 0x0306, 0x0300, 0x0061, 0x0306,
+	0x0300, 0x0041, 0x0306, 0x0309, 0x0061, 0x0306, 0x0309, 0x0041,
+	0x0306, 0x0303, 0x0061, 0x0306, 0x0303, 0x0041, 0x0323, 0x0306,
+	0x0061, 0x0323, 0x0306, 0x0045, 0x0323, 0x0065, 0x0323, 0x0045,
+	0x0309, 0x0065, 0x0309, 0x0045, 0x0303, 0x0065, 0x0303, 0x0045,
+	0x0302, 0x0301, 0x0065, 0x0302, 0x0301, 0x0045, 0x0302, 0x0300,
+	0x0065, 0x0302, 0x0300, 0x0045, 0x0302, 0x0309, 0x0065, 0x0302,
+	0x0309, 0x0045, 0x0302, 0x0303, 0x0065, 0x0302, 0x0303, 0x0045,
+	0x0323, 0x0302, 0x0065, 0x0323, 0x0302, 0x0049, 0x0309, 0x0069,
+	0x0309, 0x0049, 0x0323, 0x0069, 0x0323, 0x004f, 0x0323, 0x006f,
+	0x0323, 0x004f, 0x0309, 0x006f, 0x0309, 0x004f, 0x0302, 0x0301,
+	0x006f, 0x0302, 0x0301, 0x004f, 0x0302, 0x0300, 0x006f, 0x0302,
+	0x0300, 0x004f, 0x0302, 0x0309, 0x006f, 0x0302, 0x0309, 0x004f,
+	0x0302, 0x0303, 0x006f, 0x0302, 0x0303, 0x004f, 0x0323, 0x0302,
+	0x006f, 0x0323, 0x0302, 0x004f, 0x031b, 0x0301, 0x006f, 0x031b,
+	0x0301, 0x004f, 0x031b, 0x0300, 0x006f, 0x031b, 0x0300, 0x004f,
+	0x031b, 0x0309, 0x006f, 0x031b, 0x0309, 0x004f, 0x031b, 0x0303,
+	0x006f, 0x031b, 0x0303, 0x004f, 0x031b, 0x0323, 0x006f, 0x031b,
+	0x0323, 0x0055, 0x0323, 0x0075, 0x0323, 0x0055, 0x0309, 0x0075,
+	0x0309, 0x0055, 0x031b, 0x0301, 0x0075, 0x031b, 0x0301, 0x0055,
+	0x031b, 0x0300, 0x0075, 0x031b, 0x0300, 0x0055, 0x031b, 0x0309,
+	0x0075, 0x031b, 0x0309, 0x0055, 0x031b, 0x0303, 0x0075, 0x031b,
+	0x0303, 0x0055, 0x031b, 0x0323, 0x0075, 0x031b, 0x0323, 0x0059,
+	0x0300, 0x0079, 0x0300, 0x0059, 0x0323, 0x0079, 0x0323, 0x0059,
+	0x0309, 0x0079, 0x0309, 0x0059, 0x0303, 0x0079, 0x0303, 0x03b1,
+	0x0313, 0x03b1, 0x0314, 0x03b1, 0x0313, 0x0300, 0x03b1, 0x0314,
+	0x0300, 0x03b1, 0x0313, 0x0301, 0x03b1, 0x0314, 0x0301, 0x03b1,
+	0x0313, 0x0342, 0x03b1, 0x0314, 0x0342, 0x0391, 0x0313, 0x0391,
+	0x0314, 0x0391, 0x0313, 0x0300, 0x0391, 0x0314, 0x0300, 0x0391,
+	0x0313, 0x0301, 0x0391, 0x0314, 0x0301, 0x0391, 0x0313, 0x0342,
+	0x0391, 0x0314, 0x0342, 0x03b5, 0x0313, 0x03b5, 0x0314, 0x03b5,
+	0x0313, 0x0300, 0x03b5, 0x0314, 0x0300, 0x03b5, 0x0313, 0x0301,
+	0x03b5, 0x0314, 0x0301, 0x0395, 0x0313, 0x0395, 0x0314, 0x0395,
+	0x0313, 0x0300, 0x0395, 0x0314, 0x0300, 0x0395, 0x0313, 0x0301,
+	0x0395, 0x0314, 0x0301, 0x03b7, 0x0313, 0x03b7, 0x0314, 0x03b7,
+	0x0313, 0x0300, 0x03b7, 0x0314, 0x0300, 0x03b7, 0x0313, 0x0301,
+	0x03b7, 0x0314, 0x0301, 0x03b7, 0x0313, 0x0342, 0x03b7, 0x0314,
+	0x0342, 0x0397, 0x0313, 0x0397, 0x0314, 0x0397, 0x0313, 0x0300,
+	0x0397, 0x0314, 0x0300, 0x0397, 0x0313, 0x0301, 0x0397, 0x0314,
+	0x0301, 0x0397, 0x0313, 0x0342, 0x0397, 0x0314, 0x0342, 0x03b9,
+	0x0313, 0x03b9, 0x0314, 0x03b9, 0x0313, 0x0300, 0x03b9, 0x0314,
+	0x0300, 0x03b9, 0x0313, 0x0301, 0x03b9, 0x0314, 0x0301, 0x03b9,
+	0x0313, 0x0342, 0x03b9, 0x0314, 0x0342, 0x0399, 0x0313, 0x0399,
+	0x0314, 0x0399, 0x0313, 0x0300, 0x0399, 0x0314, 0x0300, 0x0399,
+	0x0313, 0x0301, 0x0399, 0x0314, 0x0301, 0x0399, 0x0313, 0x0342,
+	0x0399, 0x0314, 0x0342, 0x03bf, 0x0313, 0x03bf, 0x0314, 0x03bf,
+	0x0313, 0x0300, 0x03bf, 0x0314, 0x0300, 0x03bf, 0x0313, 0x0301,
+	0x03bf, 0x0314, 0x0301, 0x039f, 0x0313, 0x039f, 0x0314, 0x039f,
+	0x0313, 0x0300, 0x039f, 0x0314, 0x0300, 0x039f, 0x0313, 0x0301,
+	0x039f, 0x0314, 0x0301, 0x03c5, 0x0313, 0x03c5, 0x0314, 0x03c5,
+	0x0313, 0x0300, 0x03c5, 0x0314, 0x0300, 0x03c5, 0x0313, 0x0301,
+	0x03c5, 0x0314, 0x0301, 0x03c5, 0x0313, 0x0342, 0x03c5, 0x0314,
+	0x0342, 0x03a5, 0x0314, 0x03a5, 0x0314, 0x0300, 0x03a5, 0x0314,
+	0x0301, 0x03a5, 0x0314, 0x0342, 0x03c9, 0x0313, 0x03c9, 0x0314,
+	0x03c9, 0x0313, 0x0300, 0x03c9, 0x0314, 0x0300, 0x03c9, 0x0313,
+	0x0301, 0x03c9, 0x0314, 0x0301, 0x03c9, 0x0313, 0x0342, 0x03c9,
+	0x0314, 0x0342, 0x03a9, 0x0313, 0x03a9, 0x0314, 0x03a9, 0x0313,
+	0x0300, 0x03a9, 0x0314, 0x0300, 0x03a9, 0x0313, 0x0301, 0x03a9,
+	0x0314, 0x0301, 0x03a9, 0x0313, 0x0342, 0x03a9, 0x0314, 0x0342,
+	0x03b1, 0x0300, 0x03b1, 0x0301, 0x03b5, 0x0300, 0x03b5, 0x0301,
+	0x03b7, 0x0300, 0x03b7, 0x0301, 0x03b9, 0x0300, 0x03b9, 0x0301,
+	0x03bf, 0x0300, 0x03bf, 0x0301, 0x03c5, 0x0300, 0x03c5, 0x0301,
+	0x03c9, 0x0300, 0x03c9, 0x0301, 0x03b1, 0x0345, 0x0313, 0x03b1,
+	0x0345, 0x0314, 0x03b1, 0x0345, 0x0313, 0x0300, 0x03b1, 0x0345,
+	0x0314, 0x0300, 0x03b1, 0x0345, 0x0313, 0x0301, 0x03b1, 0x0345,
+	0x0314, 0x0301, 0x03b1, 0x0345, 0x0313, 0x0342, 0x03b1, 0x0345,
+	0x0314, 0x0342, 0x0391, 0x0345, 0x0313, 0x0391, 0x0345, 0x0314,
+	0x0391, 0x0345, 0x0313, 0x0300, 0x0391, 0x0345, 0x0314, 0x0300,
+	0x0391, 0x0345, 0x0313, 0x0301, 0x0391, 0x0345, 0x0314, 0x0301,
+	0x0391, 0x0345, 0x0313, 0x0342, 0x0391, 0x0345, 0x0314, 0x0342,
+	0x03b7, 0x0345, 0x0313, 0x03b7, 0x0345, 0x0314, 0x03b7, 0x0345,
+	0x0313, 0x0300, 0x03b7, 0x0345, 0x0314, 0x0300, 0x03b7, 0x0345,
+	0x0313, 0x0301, 0x03b7, 0x0345, 0x0314, 0x0301, 0x03b7, 0x0345,
+	0x0313, 0x0342, 0x03b7, 0x0345, 0x0314, 0x0342, 0x0397, 0x0345,
+	0x0313, 0x0397, 0x0345, 0x0314, 0x0397, 0x0345, 0x0313, 0x0300,
+	0x0397, 0x0345, 0x0314, 0x0300, 0x0397, 0x0345, 0x0313, 0x0301,
+	0x0397, 0x0345, 0x0314, 0x0301, 0x0397, 0x0345, 0x0313, 0x0342,
+	0x0397, 0x0345, 0x0314, 0x0342, 0x03c9, 0x0345, 0x0313, 0x03c9,
+	0x0345, 0x0314, 0x03c9, 0x0345, 0x0313, 0x0300, 0x03c9, 0x0345,
+	0x0314, 0x0300, 0x03c9, 0x0345, 0x0313, 0x0301, 0x03c9, 0x0345,
+	0x0314, 0x0301, 0x03c9, 0x0345, 0x0313, 0x0342, 0x03c9, 0x0345,
+	0x0314, 0x0342, 0x03a9, 0x0345, 0x0313, 0x03a9, 0x0345, 0x0314,
+	0x03a9, 0x0345, 0x0313, 0x0300, 0x03a9, 0x0345, 0x0314, 0x0300,
+	0x03a9, 0x0345, 0x0313, 0x0301, 0x03a9, 0x0345, 0x0314, 0x0301,
+	0x03a9, 0x0345, 0x0313, 0x0342, 0x03a9, 0x0345, 0x0314, 0x0342,
+	0x03b1, 0x0306, 0x03b1, 0x0304, 0x03b1, 0x0345, 0x0300, 0x03b1,
+	0x0345, 0x03b1, 0x0345, 0x0301, 0x03b1, 0x0342, 0x03b1, 0x0345,
+	0x0342, 0x0391, 0x0306, 0x0391, 0x0304, 0x0391, 0x0300, 0x0391,
+	0x0301, 0x0391, 0x0345, 0x03b9, 0x00a8, 0x0342, 0x03b7, 0x0345,
+	0x0300, 0x03b7, 0x0345, 0x03b7, 0x0345, 0x0301, 0x03b7, 0x0342,
+	0x03b7, 0x0345, 0x0342, 0x0395, 0x0300, 0x0395, 0x0301, 0x0397,
+	0x0300, 0x0397, 0x0301, 0x0397, 0x0345, 0x1fbf, 0x0300, 0x1fbf,
+	0x0301, 0x1fbf, 0x0342, 0x03b9, 0x0306, 0x03b9, 0x0304, 0x03b9,
+	0x0308, 0x0300, 0x03b9, 0x0308, 0x0301, 0x03b9, 0x0342, 0x03b9,
+	0x0308, 0x0342, 0x0399, 0x0306, 0x0399, 0x0304, 0x0399, 0x0300,
+	0x0399, 0x0301, 0x1ffe, 0x0300, 0x1ffe, 0x0301, 0x1ffe, 0x0342,
+	0x03c5, 0x0306, 0x03c5, 0x0304, 0x03c5, 0x0308, 0x0300, 0x03c5,
+	0x0308, 0x0301, 0x03c1, 0x0313, 0x03c1, 0x0314, 0x03c5, 0x0342,
+	0x03c5, 0x0308, 0x0342, 0x03a5, 0x0306, 0x03a5, 0x0304, 0x03a5,
+	0x0300, 0x03a5, 0x0301, 0x03a1, 0x0314, 0x00a8, 0x0300, 0x00a8,
+	0x0301, 0x0060, 0x03c9, 0x0345, 0x0300, 0x03c9, 0x0345, 0x03bf,
+	0x0345, 0x0301, 0x03c9, 0x0342, 0x03c9, 0x0345, 0x0342, 0x039f,
+	0x0300, 0x039f, 0x0301, 0x03a9, 0x0300, 0x03a9, 0x0301, 0x03a9,
+	0x0345, 0x00b4, 0x304b, 0x3099, 0x304d, 0x3099, 0x304f, 0x3099,
+	0x3051, 0x3099, 0x3053, 0x3099, 0x3055, 0x3099, 0x3057, 0x3099,
+	0x3059, 0x3099, 0x305b, 0x3099, 0x305d, 0x3099, 0x305f, 0x3099,
+	0x3061, 0x3099, 0x3064, 0x3099, 0x3066, 0x3099, 0x3068, 0x3099,
+	0x306f, 0x3099, 0x306f, 0x309a, 0x3072, 0x3099, 0x3072, 0x309a,
+	0x3075, 0x3099, 0x3075, 0x309a, 0x3078, 0x3099, 0x3078, 0x309a,
+	0x307b, 0x3099, 0x307b, 0x309a, 0x3046, 0x3099, 0x309d, 0x3099,
+	0x30ab, 0x3099, 0x30ad, 0x3099, 0x30af, 0x3099, 0x30b1, 0x3099,
+	0x30b3, 0x3099, 0x30b5, 0x3099, 0x30b7, 0x3099, 0x30b9, 0x3099,
+	0x30bb, 0x3099, 0x30bd, 0x3099, 0x30bf, 0x3099, 0x30c1, 0x3099,
+	0x30c4, 0x3099, 0x30c6, 0x3099, 0x30c8, 0x3099, 0x30cf, 0x3099,
+	0x30cf, 0x309a, 0x30d2, 0x3099, 0x30d2, 0x309a, 0x30d5, 0x3099,
+	0x30d5, 0x309a, 0x30d8, 0x3099, 0x30d8, 0x309a, 0x30db, 0x3099,
+	0x30db, 0x309a, 0x30a6, 0x3099, 0x30ef, 0x3099, 0x30f0, 0x3099,
+	0x30f1, 0x3099, 0x30f2, 0x3099, 0x30fd, 0x3099, 0x05f2, 0x05b7,
+	0x05e9, 0x05c1, 0x05e9, 0x05c2, 0x05e9, 0x05bc, 0x05c1, 0x05e9,
+	0x05bc, 0x05c2, 0x05d0, 0x05b7, 0x05d0, 0x05b8, 0x05d0, 0x05bc,
+	0x05d1, 0x05bc, 0x05d2, 0x05bc, 0x05d3, 0x05bc, 0x05d4, 0x05bc,
+	0x05d5, 0x05bc, 0x05d6, 0x05bc, 0x05d8, 0x05bc, 0x05d9, 0x05bc,
+	0x05da, 0x05bc, 0x05db, 0x05bc, 0x05dc, 0x05bc, 0x05de, 0x05bc,
+	0x05e0, 0x05bc, 0x05e1, 0x05bc, 0x05e3, 0x05bc, 0x05e4, 0x05bc,
+	0x05e6, 0x05bc, 0x05e7, 0x05bc, 0x05e8, 0x05bc, 0x05e9, 0x05bc,
+	0x05ea, 0x05bc, 0x05d5, 0x05b9, 0x05d1, 0x05bf, 0x05db, 0x05bf,
+	0x05e4, 0x05bf
+};
+
+u16 hfsplus_compose_table[] = {
+	/* base */
+	0x0000, 0x0050,  0x0300, 0x00a4,  0x0301, 0x00e4,  0x0302, 0x015c,
+	0x0303, 0x0192,  0x0304, 0x01b4,  0x0306, 0x01e6,  0x0307, 0x0220,
+	0x0308, 0x0270,  0x0309, 0x02d2,  0x030a, 0x02ec,  0x030b, 0x02fa,
+	0x030c, 0x0308,  0x030d, 0x034c,  0x030f, 0x0370,  0x0311, 0x038e,
+	0x0313, 0x03a8,  0x0314, 0x03c6,  0x031b, 0x03e8,  0x0323, 0x03f2,
+	0x0324, 0x0440,  0x0325, 0x0446,  0x0327, 0x044c,  0x0328, 0x047a,
+	0x032d, 0x0490,  0x032e, 0x04aa,  0x0330, 0x04b0,  0x0331, 0x04be,
+	0x0342, 0x04e2,  0x0345, 0x04f4,  0x05b7, 0x0504,  0x05b8, 0x050a,
+	0x05b9, 0x050e,  0x05bc, 0x0512,  0x05bf, 0x0540,  0x05c1, 0x0548,
+	0x05c2, 0x054c,  0x093c, 0x0550,  0x09bc, 0x0568,  0x09be, 0x0572,
+	0x09d7, 0x0576,  0x0a3c, 0x057a,  0x0b3c, 0x0586,  0x0b3e, 0x058e,
+	0x0b56, 0x0592,  0x0b57, 0x0596,  0x0bbe, 0x059a,  0x0bd7, 0x05a0,
+	0x0c56, 0x05a6,  0x0cc2, 0x05aa,  0x0cd5, 0x05ae,  0x0cd6, 0x05b4,
+	0x0d3e, 0x05b8,  0x0d57, 0x05be,  0x0e32, 0x05c2,  0x0eb2, 0x05c6,
+	0x0f71, 0x05ca,  0x0f80, 0x05d2,  0x0fb5, 0x05d8,  0x0fb7, 0x05de,
+	0x1100, 0x00a2,  0x1101, 0x00a2,  0x1102, 0x00a2,  0x1103, 0x00a2,
+	0x1104, 0x00a2,  0x1105, 0x00a2,  0x1106, 0x00a2,  0x1107, 0x00a2,
+	0x1108, 0x00a2,  0x1109, 0x00a2,  0x110a, 0x00a2,  0x110b, 0x00a2,
+	0x110c, 0x00a2,  0x110d, 0x00a2,  0x110e, 0x00a2,  0x110f, 0x00a2,
+	0x1110, 0x00a2,  0x1111, 0x00a2,  0x1112, 0x00a2,  0x3099, 0x05f4,
+	0x309a, 0x0656,
+	/* hangul marker */
+	0xffff, 0x0000,
+	/* 0x0300 */
+	0x0340, 0x001f,  0x0041, 0x066c,  0x0045, 0x066e,  0x0049, 0x0670,
+	0x004f, 0x0672,  0x0055, 0x0674,  0x0057, 0x0676,  0x0059, 0x0678,
+	0x0061, 0x067a,  0x0065, 0x067c,  0x0069, 0x067e,  0x006f, 0x0680,
+	0x0075, 0x0682,  0x0077, 0x0684,  0x0079, 0x0686,  0x00a8, 0x0688,
+	0x0391, 0x068a,  0x0395, 0x068c,  0x0397, 0x068e,  0x0399, 0x0690,
+	0x039f, 0x0692,  0x03a5, 0x0694,  0x03a9, 0x0696,  0x03b1, 0x0698,
+	0x03b5, 0x069a,  0x03b7, 0x069c,  0x03b9, 0x069e,  0x03bf, 0x06a0,
+	0x03c5, 0x06a2,  0x03c9, 0x06a4,  0x1fbf, 0x06a6,  0x1ffe, 0x06a8,
+	/* 0x0301 */
+	0x0341, 0x003b,  0x0041, 0x06aa,  0x0043, 0x06ac,  0x0045, 0x06ae,
+	0x0047, 0x06b0,  0x0049, 0x06b2,  0x004b, 0x06b4,  0x004c, 0x06b6,
+	0x004d, 0x06b8,  0x004e, 0x06ba,  0x004f, 0x06bc,  0x0050, 0x06be,
+	0x0052, 0x06c0,  0x0053, 0x06c2,  0x0055, 0x06c6,  0x0057, 0x06c8,
+	0x0059, 0x06ca,  0x005a, 0x06cc,  0x0061, 0x06ce,  0x0063, 0x06d0,
+	0x0065, 0x06d2,  0x0067, 0x06d4,  0x0069, 0x06d6,  0x006b, 0x06d8,
+	0x006c, 0x06da,  0x006d, 0x06dc,  0x006e, 0x06de,  0x006f, 0x06e0,
+	0x0070, 0x06e2,  0x0072, 0x06e4,  0x0073, 0x06e6,  0x0075, 0x06ea,
+	0x0077, 0x06ec,  0x0079, 0x06ee,  0x007a, 0x06f0,  0x00a8, 0x06f2,
+	0x00c6, 0x06f4,  0x00d8, 0x06f6,  0x00e6, 0x06f8,  0x00f8, 0x06fa,
+	0x0391, 0x06fc,  0x0395, 0x06fe,  0x0397, 0x0700,  0x0399, 0x0702,
+	0x039f, 0x0704,  0x03a5, 0x0706,  0x03a9, 0x0708,  0x03b1, 0x070a,
+	0x03b5, 0x070c,  0x03b7, 0x070e,  0x03b9, 0x0710,  0x03bf, 0x0712,
+	0x03c5, 0x0714,  0x03c9, 0x0716,  0x0413, 0x0718,  0x041a, 0x071a,
+	0x0433, 0x071c,  0x043a, 0x071e,  0x1fbf, 0x0720,  0x1ffe, 0x0722,
+	/* 0x0302 */
+	0x0000, 0x001a,  0x0041, 0x0724,  0x0043, 0x072e,  0x0045, 0x0730,
+	0x0047, 0x073a,  0x0048, 0x073c,  0x0049, 0x073e,  0x004a, 0x0740,
+	0x004f, 0x0742,  0x0053, 0x074c,  0x0055, 0x074e,  0x0057, 0x0750,
+	0x0059, 0x0752,  0x005a, 0x0754,  0x0061, 0x0756,  0x0063, 0x0760,
+	0x0065, 0x0762,  0x0067, 0x076c,  0x0068, 0x076e,  0x0069, 0x0770,
+	0x006a, 0x0772,  0x006f, 0x0774,  0x0073, 0x077e,  0x0075, 0x0780,
+	0x0077, 0x0782,  0x0079, 0x0784,  0x007a, 0x0786,
+	/* 0x0303 */
+	0x0000, 0x0010,  0x0041, 0x0788,  0x0045, 0x078a,  0x0049, 0x078c,
+	0x004e, 0x078e,  0x004f, 0x0790,  0x0055, 0x0796,  0x0056, 0x079a,
+	0x0059, 0x079c,  0x0061, 0x079e,  0x0065, 0x07a0,  0x0069, 0x07a2,
+	0x006e, 0x07a4,  0x006f, 0x07a6,  0x0075, 0x07ac,  0x0076, 0x07b0,
+	0x0079, 0x07b2,
+	/* 0x0304 */
+	0x0000, 0x0018,  0x0041, 0x07b4,  0x0045, 0x07b6,  0x0047, 0x07bc,
+	0x0049, 0x07be,  0x004f, 0x07c0,  0x0055, 0x07c6,  0x0061, 0x07ca,
+	0x0065, 0x07cc,  0x0067, 0x07d2,  0x0069, 0x07d4,  0x006f, 0x07d6,
+	0x0075, 0x07dc,  0x00c6, 0x07e0,  0x00e6, 0x07e2,  0x0391, 0x07e4,
+	0x0399, 0x07e6,  0x03a5, 0x07e8,  0x03b1, 0x07ea,  0x03b9, 0x07ec,
+	0x03c5, 0x07ee,  0x0418, 0x07f0,  0x0423, 0x07f2,  0x0438, 0x07f4,
+	0x0443, 0x07f6,
+	/* 0x0306 */
+	0x0000, 0x001c,  0x0041, 0x07f8,  0x0045, 0x0802,  0x0047, 0x0804,
+	0x0049, 0x0806,  0x004f, 0x0808,  0x0055, 0x080a,  0x0061, 0x080c,
+	0x0065, 0x0816,  0x0067, 0x0818,  0x0069, 0x081a,  0x006f, 0x081c,
+	0x0075, 0x081e,  0x0391, 0x0820,  0x0399, 0x0822,  0x03a5, 0x0824,
+	0x03b1, 0x0826,  0x03b9, 0x0828,  0x03c5, 0x082a,  0x0410, 0x082c,
+	0x0415, 0x082e,  0x0416, 0x0830,  0x0418, 0x0832,  0x0423, 0x0834,
+	0x0430, 0x0836,  0x0435, 0x0838,  0x0436, 0x083a,  0x0438, 0x083c,
+	0x0443, 0x083e,
+	/* 0x0307 */
+	0x0000, 0x0027,  0x0041, 0x0840,  0x0042, 0x0844,  0x0043, 0x0846,
+	0x0044, 0x0848,  0x0045, 0x084a,  0x0046, 0x084c,  0x0047, 0x084e,
+	0x0048, 0x0850,  0x0049, 0x0852,  0x004d, 0x0854,  0x004e, 0x0856,
+	0x0050, 0x0858,  0x0052, 0x085a,  0x0053, 0x085c,  0x0054, 0x085e,
+	0x0057, 0x0860,  0x0058, 0x0862,  0x0059, 0x0864,  0x005a, 0x0866,
+	0x0061, 0x0868,  0x0062, 0x086c,  0x0063, 0x086e,  0x0064, 0x0870,
+	0x0065, 0x0872,  0x0066, 0x0874,  0x0067, 0x0876,  0x0068, 0x0878,
+	0x006d, 0x087a,  0x006e, 0x087c,  0x0070, 0x087e,  0x0072, 0x0880,
+	0x0073, 0x0882,  0x0074, 0x0884,  0x0077, 0x0886,  0x0078, 0x0888,
+	0x0079, 0x088a,  0x007a, 0x088c,  0x017f, 0x088e,  0x0306, 0x0890,
+	/* 0x0308 */
+	0x0000, 0x0030,  0x0041, 0x0892,  0x0045, 0x0896,  0x0048, 0x0898,
+	0x0049, 0x089a,  0x004f, 0x089e,  0x0055, 0x08a0,  0x0057, 0x08aa,
+	0x0058, 0x08ac,  0x0059, 0x08ae,  0x0061, 0x08b0,  0x0065, 0x08b4,
+	0x0068, 0x08b6,  0x0069, 0x08b8,  0x006f, 0x08bc,  0x0074, 0x08be,
+	0x0075, 0x08c0,  0x0077, 0x08ca,  0x0078, 0x08cc,  0x0079, 0x08ce,
+	0x018f, 0x08d0,  0x019f, 0x08d2,  0x0259, 0x08d4,  0x0275, 0x08d6,
+	0x0399, 0x08d8,  0x03a5, 0x08da,  0x03b9, 0x08dc,  0x03c5, 0x08e6,
+	0x03d2, 0x08f0,  0x0406, 0x08f2,  0x0410, 0x08f4,  0x0415, 0x08f6,
+	0x0416, 0x08f8,  0x0417, 0x08fa,  0x0418, 0x08fc,  0x041e, 0x08fe,
+	0x0423, 0x0900,  0x0427, 0x0902,  0x042b, 0x0904,  0x0430, 0x0906,
+	0x0435, 0x0908,  0x0436, 0x090a,  0x0437, 0x090c,  0x0438, 0x090e,
+	0x043e, 0x0910,  0x0443, 0x0912,  0x0447, 0x0914,  0x044b, 0x0916,
+	0x0456, 0x0918,
+	/* 0x0309 */
+	0x0000, 0x000c,  0x0041, 0x091a,  0x0045, 0x091c,  0x0049, 0x091e,
+	0x004f, 0x0920,  0x0055, 0x0922,  0x0059, 0x0924,  0x0061, 0x0926,
+	0x0065, 0x0928,  0x0069, 0x092a,  0x006f, 0x092c,  0x0075, 0x092e,
+	0x0079, 0x0930,
+	/* 0x030a */
+	0x0000, 0x0006,  0x0041, 0x0932,  0x0055, 0x0936,  0x0061, 0x0938,
+	0x0075, 0x093c,  0x0077, 0x093e,  0x0079, 0x0940,
+	/* 0x030b */
+	0x0000, 0x0006,  0x004f, 0x0942,  0x0055, 0x0944,  0x006f, 0x0946,
+	0x0075, 0x0948,  0x0423, 0x094a,  0x0443, 0x094c,
+	/* 0x030c */
+	0x0000, 0x0021,  0x0041, 0x094e,  0x0043, 0x0950,  0x0044, 0x0952,
+	0x0045, 0x0954,  0x0047, 0x0956,  0x0049, 0x0958,  0x004b, 0x095a,
+	0x004c, 0x095c,  0x004e, 0x095e,  0x004f, 0x0960,  0x0052, 0x0962,
+	0x0053, 0x0964,  0x0054, 0x0968,  0x0055, 0x096a,  0x005a, 0x096c,
+	0x0061, 0x096e,  0x0063, 0x0970,  0x0064, 0x0972,  0x0065, 0x0974,
+	0x0067, 0x0976,  0x0069, 0x0978,  0x006a, 0x097a,  0x006b, 0x097c,
+	0x006c, 0x097e,  0x006e, 0x0980,  0x006f, 0x0982,  0x0072, 0x0984,
+	0x0073, 0x0986,  0x0074, 0x098a,  0x0075, 0x098c,  0x007a, 0x098e,
+	0x01b7, 0x0990,  0x0292, 0x0992,
+	/* 0x030d */
+	0x0000, 0x0011,  0x00a8, 0x0994,  0x0308, 0x0996,  0x0391, 0x0998,
+	0x0395, 0x099a,  0x0397, 0x099c,  0x0399, 0x099e,  0x039f, 0x09a0,
+	0x03a5, 0x09a2,  0x03a9, 0x09a4,  0x03b1, 0x09a6,  0x03b5, 0x09a8,
+	0x03b7, 0x09aa,  0x03b9, 0x09ac,  0x03bf, 0x09ae,  0x03c5, 0x09b0,
+	0x03c9, 0x09b2,  0x03d2, 0x09b4,
+	/* 0x030f */
+	0x0000, 0x000e,  0x0041, 0x09b6,  0x0045, 0x09b8,  0x0049, 0x09ba,
+	0x004f, 0x09bc,  0x0052, 0x09be,  0x0055, 0x09c0,  0x0061, 0x09c2,
+	0x0065, 0x09c4,  0x0069, 0x09c6,  0x006f, 0x09c8,  0x0072, 0x09ca,
+	0x0075, 0x09cc,  0x0474, 0x09ce,  0x0475, 0x09d0,
+	/* 0x0311 */
+	0x0000, 0x000c,  0x0041, 0x09d2,  0x0045, 0x09d4,  0x0049, 0x09d6,
+	0x004f, 0x09d8,  0x0052, 0x09da,  0x0055, 0x09dc,  0x0061, 0x09de,
+	0x0065, 0x09e0,  0x0069, 0x09e2,  0x006f, 0x09e4,  0x0072, 0x09e6,
+	0x0075, 0x09e8,
+	/* 0x0313 */
+	0x0343, 0x000e,  0x0391, 0x09ea,  0x0395, 0x09f2,  0x0397, 0x09f8,
+	0x0399, 0x0a00,  0x039f, 0x0a08,  0x03a9, 0x0a0e,  0x03b1, 0x0a16,
+	0x03b5, 0x0a1e,  0x03b7, 0x0a24,  0x03b9, 0x0a2c,  0x03bf, 0x0a34,
+	0x03c1, 0x0a3a,  0x03c5, 0x0a3c,  0x03c9, 0x0a44,
+	/* 0x0314 */
+	0x0000, 0x0010,  0x0391, 0x0a4c,  0x0395, 0x0a54,  0x0397, 0x0a5a,
+	0x0399, 0x0a62,  0x039f, 0x0a6a,  0x03a1, 0x0a70,  0x03a5, 0x0a72,
+	0x03a9, 0x0a7a,  0x03b1, 0x0a82,  0x03b5, 0x0a8a,  0x03b7, 0x0a90,
+	0x03b9, 0x0a98,  0x03bf, 0x0aa0,  0x03c1, 0x0aa6,  0x03c5, 0x0aa8,
+	0x03c9, 0x0ab0,
+	/* 0x031b */
+	0x0000, 0x0004,  0x004f, 0x0ab8,  0x0055, 0x0ac4,  0x006f, 0x0ad0,
+	0x0075, 0x0adc,
+	/* 0x0323 */
+	0x0000, 0x0026,  0x0041, 0x0ae8,  0x0042, 0x0aee,  0x0044, 0x0af0,
+	0x0045, 0x0af2,  0x0048, 0x0af6,  0x0049, 0x0af8,  0x004b, 0x0afa,
+	0x004c, 0x0afc,  0x004d, 0x0b00,  0x004e, 0x0b02,  0x004f, 0x0b04,
+	0x0052, 0x0b08,  0x0053, 0x0b0c,  0x0054, 0x0b10,  0x0055, 0x0b12,
+	0x0056, 0x0b14,  0x0057, 0x0b16,  0x0059, 0x0b18,  0x005a, 0x0b1a,
+	0x0061, 0x0b1c,  0x0062, 0x0b22,  0x0064, 0x0b24,  0x0065, 0x0b26,
+	0x0068, 0x0b2a,  0x0069, 0x0b2c,  0x006b, 0x0b2e,  0x006c, 0x0b30,
+	0x006d, 0x0b34,  0x006e, 0x0b36,  0x006f, 0x0b38,  0x0072, 0x0b3c,
+	0x0073, 0x0b40,  0x0074, 0x0b44,  0x0075, 0x0b46,  0x0076, 0x0b48,
+	0x0077, 0x0b4a,  0x0079, 0x0b4c,  0x007a, 0x0b4e,
+	/* 0x0324 */
+	0x0000, 0x0002,  0x0055, 0x0b50,  0x0075, 0x0b52,
+	/* 0x0325 */
+	0x0000, 0x0002,  0x0041, 0x0b54,  0x0061, 0x0b56,
+	/* 0x0327 */
+	0x0000, 0x0016,  0x0043, 0x0b58,  0x0044, 0x0b5c,  0x0045, 0x0b5e,
+	0x0047, 0x0b62,  0x0048, 0x0b64,  0x004b, 0x0b66,  0x004c, 0x0b68,
+	0x004e, 0x0b6a,  0x0052, 0x0b6c,  0x0053, 0x0b6e,  0x0054, 0x0b70,
+	0x0063, 0x0b72,  0x0064, 0x0b76,  0x0065, 0x0b78,  0x0067, 0x0b7c,
+	0x0068, 0x0b7e,  0x006b, 0x0b80,  0x006c, 0x0b82,  0x006e, 0x0b84,
+	0x0072, 0x0b86,  0x0073, 0x0b88,  0x0074, 0x0b8a,
+	/* 0x0328 */
+	0x0000, 0x000a,  0x0041, 0x0b8c,  0x0045, 0x0b8e,  0x0049, 0x0b90,
+	0x004f, 0x0b92,  0x0055, 0x0b96,  0x0061, 0x0b98,  0x0065, 0x0b9a,
+	0x0069, 0x0b9c,  0x006f, 0x0b9e,  0x0075, 0x0ba2,
+	/* 0x032d */
+	0x0000, 0x000c,  0x0044, 0x0ba4,  0x0045, 0x0ba6,  0x004c, 0x0ba8,
+	0x004e, 0x0baa,  0x0054, 0x0bac,  0x0055, 0x0bae,  0x0064, 0x0bb0,
+	0x0065, 0x0bb2,  0x006c, 0x0bb4,  0x006e, 0x0bb6,  0x0074, 0x0bb8,
+	0x0075, 0x0bba,
+	/* 0x032e */
+	0x0000, 0x0002,  0x0048, 0x0bbc,  0x0068, 0x0bbe,
+	/* 0x0330 */
+	0x0000, 0x0006,  0x0045, 0x0bc0,  0x0049, 0x0bc2,  0x0055, 0x0bc4,
+	0x0065, 0x0bc6,  0x0069, 0x0bc8,  0x0075, 0x0bca,
+	/* 0x0331 */
+	0x0000, 0x0011,  0x0042, 0x0bcc,  0x0044, 0x0bce,  0x004b, 0x0bd0,
+	0x004c, 0x0bd2,  0x004e, 0x0bd4,  0x0052, 0x0bd6,  0x0054, 0x0bd8,
+	0x005a, 0x0bda,  0x0062, 0x0bdc,  0x0064, 0x0bde,  0x0068, 0x0be0,
+	0x006b, 0x0be2,  0x006c, 0x0be4,  0x006e, 0x0be6,  0x0072, 0x0be8,
+	0x0074, 0x0bea,  0x007a, 0x0bec,
+	/* 0x0342 */
+	0x0000, 0x0008,  0x00a8, 0x0bee,  0x03b1, 0x0bf0,  0x03b7, 0x0bf2,
+	0x03b9, 0x0bf4,  0x03c5, 0x0bf6,  0x03c9, 0x0bf8,  0x1fbf, 0x0bfa,
+	0x1ffe, 0x0bfc,
+	/* 0x0345 */
+	0x0000, 0x0007,  0x0391, 0x0bfe,  0x0397, 0x0c04,  0x03a9, 0x0c0a,
+	0x03b1, 0x0c10,  0x03b7, 0x0c1c,  0x03bf, 0x0c28,  0x03c9, 0x0c2c,
+	/* 0x05b7 */
+	0x0000, 0x0002,  0x05d0, 0x0c36,  0x05f2, 0x0c38,
+	/* 0x05b8 */
+	0x0000, 0x0001,  0x05d0, 0x0c3a,
+	/* 0x05b9 */
+	0x0000, 0x0001,  0x05d5, 0x0c3c,
+	/* 0x05bc */
+	0x0000, 0x0016,  0x05d0, 0x0c3e,  0x05d1, 0x0c40,  0x05d2, 0x0c42,
+	0x05d3, 0x0c44,  0x05d4, 0x0c46,  0x05d5, 0x0c48,  0x05d6, 0x0c4a,
+	0x05d8, 0x0c4c,  0x05d9, 0x0c4e,  0x05da, 0x0c50,  0x05db, 0x0c52,
+	0x05dc, 0x0c54,  0x05de, 0x0c56,  0x05e0, 0x0c58,  0x05e1, 0x0c5a,
+	0x05e3, 0x0c5c,  0x05e4, 0x0c5e,  0x05e6, 0x0c60,  0x05e7, 0x0c62,
+	0x05e8, 0x0c64,  0x05e9, 0x0c66,  0x05ea, 0x0c6c,
+	/* 0x05bf */
+	0x0000, 0x0003,  0x05d1, 0x0c6e,  0x05db, 0x0c70,  0x05e4, 0x0c72,
+	/* 0x05c1 */
+	0x0000, 0x0001,  0x05e9, 0x0c74,
+	/* 0x05c2 */
+	0x0000, 0x0001,  0x05e9, 0x0c76,
+	/* 0x093c */
+	0x0000, 0x000b,  0x0915, 0x0c78,  0x0916, 0x0c7a,  0x0917, 0x0c7c,
+	0x091c, 0x0c7e,  0x0921, 0x0c80,  0x0922, 0x0c82,  0x0928, 0x0c84,
+	0x092b, 0x0c86,  0x092f, 0x0c88,  0x0930, 0x0c8a,  0x0933, 0x0c8c,
+	/* 0x09bc */
+	0x0000, 0x0004,  0x09a1, 0x0c8e,  0x09a2, 0x0c90,  0x09ac, 0x0c92,
+	0x09af, 0x0c94,
+	/* 0x09be */
+	0x0000, 0x0001,  0x09c7, 0x0c96,
+	/* 0x09d7 */
+	0x0000, 0x0001,  0x09c7, 0x0c98,
+	/* 0x0a3c */
+	0x0000, 0x0005,  0x0a16, 0x0c9a,  0x0a17, 0x0c9c,  0x0a1c, 0x0c9e,
+	0x0a21, 0x0ca0,  0x0a2b, 0x0ca2,
+	/* 0x0b3c */
+	0x0000, 0x0003,  0x0b21, 0x0ca4,  0x0b22, 0x0ca6,  0x0b2f, 0x0ca8,
+	/* 0x0b3e */
+	0x0000, 0x0001,  0x0b47, 0x0caa,
+	/* 0x0b56 */
+	0x0000, 0x0001,  0x0b47, 0x0cac,
+	/* 0x0b57 */
+	0x0000, 0x0001,  0x0b47, 0x0cae,
+	/* 0x0bbe */
+	0x0000, 0x0002,  0x0bc6, 0x0cb0,  0x0bc7, 0x0cb2,
+	/* 0x0bd7 */
+	0x0000, 0x0002,  0x0b92, 0x0cb4,  0x0bc6, 0x0cb6,
+	/* 0x0c56 */
+	0x0000, 0x0001,  0x0c46, 0x0cb8,
+	/* 0x0cc2 */
+	0x0000, 0x0001,  0x0cc6, 0x0cba,
+	/* 0x0cd5 */
+	0x0000, 0x0002,  0x0cbf, 0x0cbe,  0x0cc6, 0x0cc0,
+	/* 0x0cd6 */
+	0x0000, 0x0001,  0x0cc6, 0x0cc2,
+	/* 0x0d3e */
+	0x0000, 0x0002,  0x0d46, 0x0cc4,  0x0d47, 0x0cc6,
+	/* 0x0d57 */
+	0x0000, 0x0001,  0x0d46, 0x0cc8,
+	/* 0x0e32 */
+	0x0000, 0x0001,  0x0e4d, 0x0cca,
+	/* 0x0eb2 */
+	0x0000, 0x0001,  0x0ecd, 0x0ccc,
+	/* 0x0f71 */
+	0x0000, 0x0003,  0x0f72, 0x0cce,  0x0f74, 0x0cd0,  0x0f80, 0x0cd2,
+	/* 0x0f80 */
+	0x0000, 0x0002,  0x0fb2, 0x0cd4,  0x0fb3, 0x0cd8,
+	/* 0x0fb5 */
+	0x0000, 0x0002,  0x0f40, 0x0cdc,  0x0f90, 0x0cde,
+	/* 0x0fb7 */
+	0x0000, 0x000a,  0x0f42, 0x0ce0,  0x0f4c, 0x0ce2,  0x0f51, 0x0ce4,
+	0x0f56, 0x0ce6,  0x0f5b, 0x0ce8,  0x0f92, 0x0cea,  0x0f9c, 0x0cec,
+	0x0fa1, 0x0cee,  0x0fa6, 0x0cf0,  0x0fab, 0x0cf2,
+	/* 0x3099 */
+	0x0000, 0x0030,  0x3046, 0x0cf4,  0x304b, 0x0cf6,  0x304d, 0x0cf8,
+	0x304f, 0x0cfa,  0x3051, 0x0cfc,  0x3053, 0x0cfe,  0x3055, 0x0d00,
+	0x3057, 0x0d02,  0x3059, 0x0d04,  0x305b, 0x0d06,  0x305d, 0x0d08,
+	0x305f, 0x0d0a,  0x3061, 0x0d0c,  0x3064, 0x0d0e,  0x3066, 0x0d10,
+	0x3068, 0x0d12,  0x306f, 0x0d14,  0x3072, 0x0d16,  0x3075, 0x0d18,
+	0x3078, 0x0d1a,  0x307b, 0x0d1c,  0x309d, 0x0d1e,  0x30a6, 0x0d20,
+	0x30ab, 0x0d22,  0x30ad, 0x0d24,  0x30af, 0x0d26,  0x30b1, 0x0d28,
+	0x30b3, 0x0d2a,  0x30b5, 0x0d2c,  0x30b7, 0x0d2e,  0x30b9, 0x0d30,
+	0x30bb, 0x0d32,  0x30bd, 0x0d34,  0x30bf, 0x0d36,  0x30c1, 0x0d38,
+	0x30c4, 0x0d3a,  0x30c6, 0x0d3c,  0x30c8, 0x0d3e,  0x30cf, 0x0d40,
+	0x30d2, 0x0d42,  0x30d5, 0x0d44,  0x30d8, 0x0d46,  0x30db, 0x0d48,
+	0x30ef, 0x0d4a,  0x30f0, 0x0d4c,  0x30f1, 0x0d4e,  0x30f2, 0x0d50,
+	0x30fd, 0x0d52,
+	/* 0x309a */
+	0x0000, 0x000a,  0x306f, 0x0d54,  0x3072, 0x0d56,  0x3075, 0x0d58,
+	0x3078, 0x0d5a,  0x307b, 0x0d5c,  0x30cf, 0x0d5e,  0x30d2, 0x0d60,
+	0x30d5, 0x0d62,  0x30d8, 0x0d64,  0x30db, 0x0d66,
+	/* 0x0041 0x0300 */
+	0x00c0, 0x0000,
+	/* 0x0045 0x0300 */
+	0x00c8, 0x0000,
+	/* 0x0049 0x0300 */
+	0x00cc, 0x0000,
+	/* 0x004f 0x0300 */
+	0x00d2, 0x0000,
+	/* 0x0055 0x0300 */
+	0x00d9, 0x0000,
+	/* 0x0057 0x0300 */
+	0x1e80, 0x0000,
+	/* 0x0059 0x0300 */
+	0x1ef2, 0x0000,
+	/* 0x0061 0x0300 */
+	0x00e0, 0x0000,
+	/* 0x0065 0x0300 */
+	0x00e8, 0x0000,
+	/* 0x0069 0x0300 */
+	0x00ec, 0x0000,
+	/* 0x006f 0x0300 */
+	0x00f2, 0x0000,
+	/* 0x0075 0x0300 */
+	0x00f9, 0x0000,
+	/* 0x0077 0x0300 */
+	0x1e81, 0x0000,
+	/* 0x0079 0x0300 */
+	0x1ef3, 0x0000,
+	/* 0x00a8 0x0300 */
+	0x1fed, 0x0000,
+	/* 0x0391 0x0300 */
+	0x1fba, 0x0000,
+	/* 0x0395 0x0300 */
+	0x1fc8, 0x0000,
+	/* 0x0397 0x0300 */
+	0x1fca, 0x0000,
+	/* 0x0399 0x0300 */
+	0x1fda, 0x0000,
+	/* 0x039f 0x0300 */
+	0x1ff8, 0x0000,
+	/* 0x03a5 0x0300 */
+	0x1fea, 0x0000,
+	/* 0x03a9 0x0300 */
+	0x1ffa, 0x0000,
+	/* 0x03b1 0x0300 */
+	0x1f70, 0x0000,
+	/* 0x03b5 0x0300 */
+	0x1f72, 0x0000,
+	/* 0x03b7 0x0300 */
+	0x1f74, 0x0000,
+	/* 0x03b9 0x0300 */
+	0x1f76, 0x0000,
+	/* 0x03bf 0x0300 */
+	0x1f78, 0x0000,
+	/* 0x03c5 0x0300 */
+	0x1f7a, 0x0000,
+	/* 0x03c9 0x0300 */
+	0x1f7c, 0x0000,
+	/* 0x1fbf 0x0300 */
+	0x1fcd, 0x0000,
+	/* 0x1ffe 0x0300 */
+	0x1fdd, 0x0000,
+	/* 0x0041 0x0301 */
+	0x00c1, 0x0000,
+	/* 0x0043 0x0301 */
+	0x0106, 0x0000,
+	/* 0x0045 0x0301 */
+	0x00c9, 0x0000,
+	/* 0x0047 0x0301 */
+	0x01f4, 0x0000,
+	/* 0x0049 0x0301 */
+	0x00cd, 0x0000,
+	/* 0x004b 0x0301 */
+	0x1e30, 0x0000,
+	/* 0x004c 0x0301 */
+	0x0139, 0x0000,
+	/* 0x004d 0x0301 */
+	0x1e3e, 0x0000,
+	/* 0x004e 0x0301 */
+	0x0143, 0x0000,
+	/* 0x004f 0x0301 */
+	0x00d3, 0x0000,
+	/* 0x0050 0x0301 */
+	0x1e54, 0x0000,
+	/* 0x0052 0x0301 */
+	0x0154, 0x0000,
+	/* 0x0053 0x0301 */
+	0x015a, 0x0001,  0x0307, 0x0d68,
+	/* 0x0055 0x0301 */
+	0x00da, 0x0000,
+	/* 0x0057 0x0301 */
+	0x1e82, 0x0000,
+	/* 0x0059 0x0301 */
+	0x00dd, 0x0000,
+	/* 0x005a 0x0301 */
+	0x0179, 0x0000,
+	/* 0x0061 0x0301 */
+	0x00e1, 0x0000,
+	/* 0x0063 0x0301 */
+	0x0107, 0x0000,
+	/* 0x0065 0x0301 */
+	0x00e9, 0x0000,
+	/* 0x0067 0x0301 */
+	0x01f5, 0x0000,
+	/* 0x0069 0x0301 */
+	0x00ed, 0x0000,
+	/* 0x006b 0x0301 */
+	0x1e31, 0x0000,
+	/* 0x006c 0x0301 */
+	0x013a, 0x0000,
+	/* 0x006d 0x0301 */
+	0x1e3f, 0x0000,
+	/* 0x006e 0x0301 */
+	0x0144, 0x0000,
+	/* 0x006f 0x0301 */
+	0x00f3, 0x0000,
+	/* 0x0070 0x0301 */
+	0x1e55, 0x0000,
+	/* 0x0072 0x0301 */
+	0x0155, 0x0000,
+	/* 0x0073 0x0301 */
+	0x015b, 0x0001,  0x0307, 0x0d6a,
+	/* 0x0075 0x0301 */
+	0x00fa, 0x0000,
+	/* 0x0077 0x0301 */
+	0x1e83, 0x0000,
+	/* 0x0079 0x0301 */
+	0x00fd, 0x0000,
+	/* 0x007a 0x0301 */
+	0x017a, 0x0000,
+	/* 0x00a8 0x0301 */
+	0x1fee, 0x0000,
+	/* 0x00c6 0x0301 */
+	0x01fc, 0x0000,
+	/* 0x00d8 0x0301 */
+	0x01fe, 0x0000,
+	/* 0x00e6 0x0301 */
+	0x01fd, 0x0000,
+	/* 0x00f8 0x0301 */
+	0x01ff, 0x0000,
+	/* 0x0391 0x0301 */
+	0x1fbb, 0x0000,
+	/* 0x0395 0x0301 */
+	0x1fc9, 0x0000,
+	/* 0x0397 0x0301 */
+	0x1fcb, 0x0000,
+	/* 0x0399 0x0301 */
+	0x1fdb, 0x0000,
+	/* 0x039f 0x0301 */
+	0x1ff9, 0x0000,
+	/* 0x03a5 0x0301 */
+	0x1feb, 0x0000,
+	/* 0x03a9 0x0301 */
+	0x1ffb, 0x0000,
+	/* 0x03b1 0x0301 */
+	0x1f71, 0x0000,
+	/* 0x03b5 0x0301 */
+	0x1f73, 0x0000,
+	/* 0x03b7 0x0301 */
+	0x1f75, 0x0000,
+	/* 0x03b9 0x0301 */
+	0x1f77, 0x0000,
+	/* 0x03bf 0x0301 */
+	0x1f79, 0x0000,
+	/* 0x03c5 0x0301 */
+	0x1f7b, 0x0000,
+	/* 0x03c9 0x0301 */
+	0x1f7d, 0x0000,
+	/* 0x0413 0x0301 */
+	0x0403, 0x0000,
+	/* 0x041a 0x0301 */
+	0x040c, 0x0000,
+	/* 0x0433 0x0301 */
+	0x0453, 0x0000,
+	/* 0x043a 0x0301 */
+	0x045c, 0x0000,
+	/* 0x1fbf 0x0301 */
+	0x1fce, 0x0000,
+	/* 0x1ffe 0x0301 */
+	0x1fde, 0x0000,
+	/* 0x0041 0x0302 */
+	0x00c2, 0x0004,  0x0300, 0x0d6c,  0x0301, 0x0d6e,  0x0303, 0x0d70,
+	0x0309, 0x0d72,
+	/* 0x0043 0x0302 */
+	0x0108, 0x0000,
+	/* 0x0045 0x0302 */
+	0x00ca, 0x0004,  0x0300, 0x0d74,  0x0301, 0x0d76,  0x0303, 0x0d78,
+	0x0309, 0x0d7a,
+	/* 0x0047 0x0302 */
+	0x011c, 0x0000,
+	/* 0x0048 0x0302 */
+	0x0124, 0x0000,
+	/* 0x0049 0x0302 */
+	0x00ce, 0x0000,
+	/* 0x004a 0x0302 */
+	0x0134, 0x0000,
+	/* 0x004f 0x0302 */
+	0x00d4, 0x0004,  0x0300, 0x0d7c,  0x0301, 0x0d7e,  0x0303, 0x0d80,
+	0x0309, 0x0d82,
+	/* 0x0053 0x0302 */
+	0x015c, 0x0000,
+	/* 0x0055 0x0302 */
+	0x00db, 0x0000,
+	/* 0x0057 0x0302 */
+	0x0174, 0x0000,
+	/* 0x0059 0x0302 */
+	0x0176, 0x0000,
+	/* 0x005a 0x0302 */
+	0x1e90, 0x0000,
+	/* 0x0061 0x0302 */
+	0x00e2, 0x0004,  0x0300, 0x0d84,  0x0301, 0x0d86,  0x0303, 0x0d88,
+	0x0309, 0x0d8a,
+	/* 0x0063 0x0302 */
+	0x0109, 0x0000,
+	/* 0x0065 0x0302 */
+	0x00ea, 0x0004,  0x0300, 0x0d8c,  0x0301, 0x0d8e,  0x0303, 0x0d90,
+	0x0309, 0x0d92,
+	/* 0x0067 0x0302 */
+	0x011d, 0x0000,
+	/* 0x0068 0x0302 */
+	0x0125, 0x0000,
+	/* 0x0069 0x0302 */
+	0x00ee, 0x0000,
+	/* 0x006a 0x0302 */
+	0x0135, 0x0000,
+	/* 0x006f 0x0302 */
+	0x00f4, 0x0004,  0x0300, 0x0d94,  0x0301, 0x0d96,  0x0303, 0x0d98,
+	0x0309, 0x0d9a,
+	/* 0x0073 0x0302 */
+	0x015d, 0x0000,
+	/* 0x0075 0x0302 */
+	0x00fb, 0x0000,
+	/* 0x0077 0x0302 */
+	0x0175, 0x0000,
+	/* 0x0079 0x0302 */
+	0x0177, 0x0000,
+	/* 0x007a 0x0302 */
+	0x1e91, 0x0000,
+	/* 0x0041 0x0303 */
+	0x00c3, 0x0000,
+	/* 0x0045 0x0303 */
+	0x1ebc, 0x0000,
+	/* 0x0049 0x0303 */
+	0x0128, 0x0000,
+	/* 0x004e 0x0303 */
+	0x00d1, 0x0000,
+	/* 0x004f 0x0303 */
+	0x00d5, 0x0002,  0x0301, 0x0d9c,  0x0308, 0x0d9e,
+	/* 0x0055 0x0303 */
+	0x0168, 0x0001,  0x0301, 0x0da0,
+	/* 0x0056 0x0303 */
+	0x1e7c, 0x0000,
+	/* 0x0059 0x0303 */
+	0x1ef8, 0x0000,
+	/* 0x0061 0x0303 */
+	0x00e3, 0x0000,
+	/* 0x0065 0x0303 */
+	0x1ebd, 0x0000,
+	/* 0x0069 0x0303 */
+	0x0129, 0x0000,
+	/* 0x006e 0x0303 */
+	0x00f1, 0x0000,
+	/* 0x006f 0x0303 */
+	0x00f5, 0x0002,  0x0301, 0x0da2,  0x0308, 0x0da4,
+	/* 0x0075 0x0303 */
+	0x0169, 0x0001,  0x0301, 0x0da6,
+	/* 0x0076 0x0303 */
+	0x1e7d, 0x0000,
+	/* 0x0079 0x0303 */
+	0x1ef9, 0x0000,
+	/* 0x0041 0x0304 */
+	0x0100, 0x0000,
+	/* 0x0045 0x0304 */
+	0x0112, 0x0002,  0x0300, 0x0da8,  0x0301, 0x0daa,
+	/* 0x0047 0x0304 */
+	0x1e20, 0x0000,
+	/* 0x0049 0x0304 */
+	0x012a, 0x0000,
+	/* 0x004f 0x0304 */
+	0x014c, 0x0002,  0x0300, 0x0dac,  0x0301, 0x0dae,
+	/* 0x0055 0x0304 */
+	0x016a, 0x0001,  0x0308, 0x0db0,
+	/* 0x0061 0x0304 */
+	0x0101, 0x0000,
+	/* 0x0065 0x0304 */
+	0x0113, 0x0002,  0x0300, 0x0db2,  0x0301, 0x0db4,
+	/* 0x0067 0x0304 */
+	0x1e21, 0x0000,
+	/* 0x0069 0x0304 */
+	0x012b, 0x0000,
+	/* 0x006f 0x0304 */
+	0x014d, 0x0002,  0x0300, 0x0db6,  0x0301, 0x0db8,
+	/* 0x0075 0x0304 */
+	0x016b, 0x0001,  0x0308, 0x0dba,
+	/* 0x00c6 0x0304 */
+	0x01e2, 0x0000,
+	/* 0x00e6 0x0304 */
+	0x01e3, 0x0000,
+	/* 0x0391 0x0304 */
+	0x1fb9, 0x0000,
+	/* 0x0399 0x0304 */
+	0x1fd9, 0x0000,
+	/* 0x03a5 0x0304 */
+	0x1fe9, 0x0000,
+	/* 0x03b1 0x0304 */
+	0x1fb1, 0x0000,
+	/* 0x03b9 0x0304 */
+	0x1fd1, 0x0000,
+	/* 0x03c5 0x0304 */
+	0x1fe1, 0x0000,
+	/* 0x0418 0x0304 */
+	0x04e2, 0x0000,
+	/* 0x0423 0x0304 */
+	0x04ee, 0x0000,
+	/* 0x0438 0x0304 */
+	0x04e3, 0x0000,
+	/* 0x0443 0x0304 */
+	0x04ef, 0x0000,
+	/* 0x0041 0x0306 */
+	0x0102, 0x0004,  0x0300, 0x0dbc,  0x0301, 0x0dbe,  0x0303, 0x0dc0,
+	0x0309, 0x0dc2,
+	/* 0x0045 0x0306 */
+	0x0114, 0x0000,
+	/* 0x0047 0x0306 */
+	0x011e, 0x0000,
+	/* 0x0049 0x0306 */
+	0x012c, 0x0000,
+	/* 0x004f 0x0306 */
+	0x014e, 0x0000,
+	/* 0x0055 0x0306 */
+	0x016c, 0x0000,
+	/* 0x0061 0x0306 */
+	0x0103, 0x0004,  0x0300, 0x0dc4,  0x0301, 0x0dc6,  0x0303, 0x0dc8,
+	0x0309, 0x0dca,
+	/* 0x0065 0x0306 */
+	0x0115, 0x0000,
+	/* 0x0067 0x0306 */
+	0x011f, 0x0000,
+	/* 0x0069 0x0306 */
+	0x012d, 0x0000,
+	/* 0x006f 0x0306 */
+	0x014f, 0x0000,
+	/* 0x0075 0x0306 */
+	0x016d, 0x0000,
+	/* 0x0391 0x0306 */
+	0x1fb8, 0x0000,
+	/* 0x0399 0x0306 */
+	0x1fd8, 0x0000,
+	/* 0x03a5 0x0306 */
+	0x1fe8, 0x0000,
+	/* 0x03b1 0x0306 */
+	0x1fb0, 0x0000,
+	/* 0x03b9 0x0306 */
+	0x1fd0, 0x0000,
+	/* 0x03c5 0x0306 */
+	0x1fe0, 0x0000,
+	/* 0x0410 0x0306 */
+	0x04d0, 0x0000,
+	/* 0x0415 0x0306 */
+	0x04d6, 0x0000,
+	/* 0x0416 0x0306 */
+	0x04c1, 0x0000,
+	/* 0x0418 0x0306 */
+	0x0419, 0x0000,
+	/* 0x0423 0x0306 */
+	0x040e, 0x0000,
+	/* 0x0430 0x0306 */
+	0x04d1, 0x0000,
+	/* 0x0435 0x0306 */
+	0x04d7, 0x0000,
+	/* 0x0436 0x0306 */
+	0x04c2, 0x0000,
+	/* 0x0438 0x0306 */
+	0x0439, 0x0000,
+	/* 0x0443 0x0306 */
+	0x045e, 0x0000,
+	/* 0x0041 0x0307 */
+	0x0000, 0x0001,  0x0304, 0x0dcc,
+	/* 0x0042 0x0307 */
+	0x1e02, 0x0000,
+	/* 0x0043 0x0307 */
+	0x010a, 0x0000,
+	/* 0x0044 0x0307 */
+	0x1e0a, 0x0000,
+	/* 0x0045 0x0307 */
+	0x0116, 0x0000,
+	/* 0x0046 0x0307 */
+	0x1e1e, 0x0000,
+	/* 0x0047 0x0307 */
+	0x0120, 0x0000,
+	/* 0x0048 0x0307 */
+	0x1e22, 0x0000,
+	/* 0x0049 0x0307 */
+	0x0130, 0x0000,
+	/* 0x004d 0x0307 */
+	0x1e40, 0x0000,
+	/* 0x004e 0x0307 */
+	0x1e44, 0x0000,
+	/* 0x0050 0x0307 */
+	0x1e56, 0x0000,
+	/* 0x0052 0x0307 */
+	0x1e58, 0x0000,
+	/* 0x0053 0x0307 */
+	0x1e60, 0x0000,
+	/* 0x0054 0x0307 */
+	0x1e6a, 0x0000,
+	/* 0x0057 0x0307 */
+	0x1e86, 0x0000,
+	/* 0x0058 0x0307 */
+	0x1e8a, 0x0000,
+	/* 0x0059 0x0307 */
+	0x1e8e, 0x0000,
+	/* 0x005a 0x0307 */
+	0x017b, 0x0000,
+	/* 0x0061 0x0307 */
+	0x0000, 0x0001,  0x0304, 0x0dce,
+	/* 0x0062 0x0307 */
+	0x1e03, 0x0000,
+	/* 0x0063 0x0307 */
+	0x010b, 0x0000,
+	/* 0x0064 0x0307 */
+	0x1e0b, 0x0000,
+	/* 0x0065 0x0307 */
+	0x0117, 0x0000,
+	/* 0x0066 0x0307 */
+	0x1e1f, 0x0000,
+	/* 0x0067 0x0307 */
+	0x0121, 0x0000,
+	/* 0x0068 0x0307 */
+	0x1e23, 0x0000,
+	/* 0x006d 0x0307 */
+	0x1e41, 0x0000,
+	/* 0x006e 0x0307 */
+	0x1e45, 0x0000,
+	/* 0x0070 0x0307 */
+	0x1e57, 0x0000,
+	/* 0x0072 0x0307 */
+	0x1e59, 0x0000,
+	/* 0x0073 0x0307 */
+	0x1e61, 0x0000,
+	/* 0x0074 0x0307 */
+	0x1e6b, 0x0000,
+	/* 0x0077 0x0307 */
+	0x1e87, 0x0000,
+	/* 0x0078 0x0307 */
+	0x1e8b, 0x0000,
+	/* 0x0079 0x0307 */
+	0x1e8f, 0x0000,
+	/* 0x007a 0x0307 */
+	0x017c, 0x0000,
+	/* 0x017f 0x0307 */
+	0x1e9b, 0x0000,
+	/* 0x0306 0x0307 */
+	0x0310, 0x0000,
+	/* 0x0041 0x0308 */
+	0x00c4, 0x0001,  0x0304, 0x0dd0,
+	/* 0x0045 0x0308 */
+	0x00cb, 0x0000,
+	/* 0x0048 0x0308 */
+	0x1e26, 0x0000,
+	/* 0x0049 0x0308 */
+	0x00cf, 0x0001,  0x0301, 0x0dd2,
+	/* 0x004f 0x0308 */
+	0x00d6, 0x0000,
+	/* 0x0055 0x0308 */
+	0x00dc, 0x0004,  0x0300, 0x0dd4,  0x0301, 0x0dd6,  0x0304, 0x0dd8,
+	0x030c, 0x0dda,
+	/* 0x0057 0x0308 */
+	0x1e84, 0x0000,
+	/* 0x0058 0x0308 */
+	0x1e8c, 0x0000,
+	/* 0x0059 0x0308 */
+	0x0178, 0x0000,
+	/* 0x0061 0x0308 */
+	0x00e4, 0x0001,  0x0304, 0x0ddc,
+	/* 0x0065 0x0308 */
+	0x00eb, 0x0000,
+	/* 0x0068 0x0308 */
+	0x1e27, 0x0000,
+	/* 0x0069 0x0308 */
+	0x00ef, 0x0001,  0x0301, 0x0dde,
+	/* 0x006f 0x0308 */
+	0x00f6, 0x0000,
+	/* 0x0074 0x0308 */
+	0x1e97, 0x0000,
+	/* 0x0075 0x0308 */
+	0x00fc, 0x0004,  0x0300, 0x0de0,  0x0301, 0x0de2,  0x0304, 0x0de4,
+	0x030c, 0x0de6,
+	/* 0x0077 0x0308 */
+	0x1e85, 0x0000,
+	/* 0x0078 0x0308 */
+	0x1e8d, 0x0000,
+	/* 0x0079 0x0308 */
+	0x00ff, 0x0000,
+	/* 0x018f 0x0308 */
+	0x04da, 0x0000,
+	/* 0x019f 0x0308 */
+	0x04ea, 0x0000,
+	/* 0x0259 0x0308 */
+	0x04db, 0x0000,
+	/* 0x0275 0x0308 */
+	0x04eb, 0x0000,
+	/* 0x0399 0x0308 */
+	0x03aa, 0x0000,
+	/* 0x03a5 0x0308 */
+	0x03ab, 0x0000,
+	/* 0x03b9 0x0308 */
+	0x03ca, 0x0004,  0x0300, 0x0de8,  0x0301, 0x0dea,  0x030d, 0x0dec,
+	0x0342, 0x0dee,
+	/* 0x03c5 0x0308 */
+	0x03cb, 0x0004,  0x0300, 0x0df0,  0x0301, 0x0df2,  0x030d, 0x0df4,
+	0x0342, 0x0df6,
+	/* 0x03d2 0x0308 */
+	0x03d4, 0x0000,
+	/* 0x0406 0x0308 */
+	0x0407, 0x0000,
+	/* 0x0410 0x0308 */
+	0x04d2, 0x0000,
+	/* 0x0415 0x0308 */
+	0x0401, 0x0000,
+	/* 0x0416 0x0308 */
+	0x04dc, 0x0000,
+	/* 0x0417 0x0308 */
+	0x04de, 0x0000,
+	/* 0x0418 0x0308 */
+	0x04e4, 0x0000,
+	/* 0x041e 0x0308 */
+	0x04e6, 0x0000,
+	/* 0x0423 0x0308 */
+	0x04f0, 0x0000,
+	/* 0x0427 0x0308 */
+	0x04f4, 0x0000,
+	/* 0x042b 0x0308 */
+	0x04f8, 0x0000,
+	/* 0x0430 0x0308 */
+	0x04d3, 0x0000,
+	/* 0x0435 0x0308 */
+	0x0451, 0x0000,
+	/* 0x0436 0x0308 */
+	0x04dd, 0x0000,
+	/* 0x0437 0x0308 */
+	0x04df, 0x0000,
+	/* 0x0438 0x0308 */
+	0x04e5, 0x0000,
+	/* 0x043e 0x0308 */
+	0x04e7, 0x0000,
+	/* 0x0443 0x0308 */
+	0x04f1, 0x0000,
+	/* 0x0447 0x0308 */
+	0x04f5, 0x0000,
+	/* 0x044b 0x0308 */
+	0x04f9, 0x0000,
+	/* 0x0456 0x0308 */
+	0x0457, 0x0000,
+	/* 0x0041 0x0309 */
+	0x1ea2, 0x0000,
+	/* 0x0045 0x0309 */
+	0x1eba, 0x0000,
+	/* 0x0049 0x0309 */
+	0x1ec8, 0x0000,
+	/* 0x004f 0x0309 */
+	0x1ece, 0x0000,
+	/* 0x0055 0x0309 */
+	0x1ee6, 0x0000,
+	/* 0x0059 0x0309 */
+	0x1ef6, 0x0000,
+	/* 0x0061 0x0309 */
+	0x1ea3, 0x0000,
+	/* 0x0065 0x0309 */
+	0x1ebb, 0x0000,
+	/* 0x0069 0x0309 */
+	0x1ec9, 0x0000,
+	/* 0x006f 0x0309 */
+	0x1ecf, 0x0000,
+	/* 0x0075 0x0309 */
+	0x1ee7, 0x0000,
+	/* 0x0079 0x0309 */
+	0x1ef7, 0x0000,
+	/* 0x0041 0x030a */
+	0x00c5, 0x0001,  0x0301, 0x0df8,
+	/* 0x0055 0x030a */
+	0x016e, 0x0000,
+	/* 0x0061 0x030a */
+	0x00e5, 0x0001,  0x0301, 0x0dfa,
+	/* 0x0075 0x030a */
+	0x016f, 0x0000,
+	/* 0x0077 0x030a */
+	0x1e98, 0x0000,
+	/* 0x0079 0x030a */
+	0x1e99, 0x0000,
+	/* 0x004f 0x030b */
+	0x0150, 0x0000,
+	/* 0x0055 0x030b */
+	0x0170, 0x0000,
+	/* 0x006f 0x030b */
+	0x0151, 0x0000,
+	/* 0x0075 0x030b */
+	0x0171, 0x0000,
+	/* 0x0423 0x030b */
+	0x04f2, 0x0000,
+	/* 0x0443 0x030b */
+	0x04f3, 0x0000,
+	/* 0x0041 0x030c */
+	0x01cd, 0x0000,
+	/* 0x0043 0x030c */
+	0x010c, 0x0000,
+	/* 0x0044 0x030c */
+	0x010e, 0x0000,
+	/* 0x0045 0x030c */
+	0x011a, 0x0000,
+	/* 0x0047 0x030c */
+	0x01e6, 0x0000,
+	/* 0x0049 0x030c */
+	0x01cf, 0x0000,
+	/* 0x004b 0x030c */
+	0x01e8, 0x0000,
+	/* 0x004c 0x030c */
+	0x013d, 0x0000,
+	/* 0x004e 0x030c */
+	0x0147, 0x0000,
+	/* 0x004f 0x030c */
+	0x01d1, 0x0000,
+	/* 0x0052 0x030c */
+	0x0158, 0x0000,
+	/* 0x0053 0x030c */
+	0x0160, 0x0001,  0x0307, 0x0dfc,
+	/* 0x0054 0x030c */
+	0x0164, 0x0000,
+	/* 0x0055 0x030c */
+	0x01d3, 0x0000,
+	/* 0x005a 0x030c */
+	0x017d, 0x0000,
+	/* 0x0061 0x030c */
+	0x01ce, 0x0000,
+	/* 0x0063 0x030c */
+	0x010d, 0x0000,
+	/* 0x0064 0x030c */
+	0x010f, 0x0000,
+	/* 0x0065 0x030c */
+	0x011b, 0x0000,
+	/* 0x0067 0x030c */
+	0x01e7, 0x0000,
+	/* 0x0069 0x030c */
+	0x01d0, 0x0000,
+	/* 0x006a 0x030c */
+	0x01f0, 0x0000,
+	/* 0x006b 0x030c */
+	0x01e9, 0x0000,
+	/* 0x006c 0x030c */
+	0x013e, 0x0000,
+	/* 0x006e 0x030c */
+	0x0148, 0x0000,
+	/* 0x006f 0x030c */
+	0x01d2, 0x0000,
+	/* 0x0072 0x030c */
+	0x0159, 0x0000,
+	/* 0x0073 0x030c */
+	0x0161, 0x0001,  0x0307, 0x0dfe,
+	/* 0x0074 0x030c */
+	0x0165, 0x0000,
+	/* 0x0075 0x030c */
+	0x01d4, 0x0000,
+	/* 0x007a 0x030c */
+	0x017e, 0x0000,
+	/* 0x01b7 0x030c */
+	0x01ee, 0x0000,
+	/* 0x0292 0x030c */
+	0x01ef, 0x0000,
+	/* 0x00a8 0x030d */
+	0x0385, 0x0000,
+	/* 0x0308 0x030d */
+	0x0344, 0x0000,
+	/* 0x0391 0x030d */
+	0x0386, 0x0000,
+	/* 0x0395 0x030d */
+	0x0388, 0x0000,
+	/* 0x0397 0x030d */
+	0x0389, 0x0000,
+	/* 0x0399 0x030d */
+	0x038a, 0x0000,
+	/* 0x039f 0x030d */
+	0x038c, 0x0000,
+	/* 0x03a5 0x030d */
+	0x038e, 0x0000,
+	/* 0x03a9 0x030d */
+	0x038f, 0x0000,
+	/* 0x03b1 0x030d */
+	0x03ac, 0x0000,
+	/* 0x03b5 0x030d */
+	0x03ad, 0x0000,
+	/* 0x03b7 0x030d */
+	0x03ae, 0x0000,
+	/* 0x03b9 0x030d */
+	0x03af, 0x0000,
+	/* 0x03bf 0x030d */
+	0x03cc, 0x0000,
+	/* 0x03c5 0x030d */
+	0x03cd, 0x0000,
+	/* 0x03c9 0x030d */
+	0x03ce, 0x0000,
+	/* 0x03d2 0x030d */
+	0x03d3, 0x0000,
+	/* 0x0041 0x030f */
+	0x0200, 0x0000,
+	/* 0x0045 0x030f */
+	0x0204, 0x0000,
+	/* 0x0049 0x030f */
+	0x0208, 0x0000,
+	/* 0x004f 0x030f */
+	0x020c, 0x0000,
+	/* 0x0052 0x030f */
+	0x0210, 0x0000,
+	/* 0x0055 0x030f */
+	0x0214, 0x0000,
+	/* 0x0061 0x030f */
+	0x0201, 0x0000,
+	/* 0x0065 0x030f */
+	0x0205, 0x0000,
+	/* 0x0069 0x030f */
+	0x0209, 0x0000,
+	/* 0x006f 0x030f */
+	0x020d, 0x0000,
+	/* 0x0072 0x030f */
+	0x0211, 0x0000,
+	/* 0x0075 0x030f */
+	0x0215, 0x0000,
+	/* 0x0474 0x030f */
+	0x0476, 0x0000,
+	/* 0x0475 0x030f */
+	0x0477, 0x0000,
+	/* 0x0041 0x0311 */
+	0x0202, 0x0000,
+	/* 0x0045 0x0311 */
+	0x0206, 0x0000,
+	/* 0x0049 0x0311 */
+	0x020a, 0x0000,
+	/* 0x004f 0x0311 */
+	0x020e, 0x0000,
+	/* 0x0052 0x0311 */
+	0x0212, 0x0000,
+	/* 0x0055 0x0311 */
+	0x0216, 0x0000,
+	/* 0x0061 0x0311 */
+	0x0203, 0x0000,
+	/* 0x0065 0x0311 */
+	0x0207, 0x0000,
+	/* 0x0069 0x0311 */
+	0x020b, 0x0000,
+	/* 0x006f 0x0311 */
+	0x020f, 0x0000,
+	/* 0x0072 0x0311 */
+	0x0213, 0x0000,
+	/* 0x0075 0x0311 */
+	0x0217, 0x0000,
+	/* 0x0391 0x0313 */
+	0x1f08, 0x0003,  0x0300, 0x0e00,  0x0301, 0x0e02,  0x0342, 0x0e04,
+	/* 0x0395 0x0313 */
+	0x1f18, 0x0002,  0x0300, 0x0e06,  0x0301, 0x0e08,
+	/* 0x0397 0x0313 */
+	0x1f28, 0x0003,  0x0300, 0x0e0a,  0x0301, 0x0e0c,  0x0342, 0x0e0e,
+	/* 0x0399 0x0313 */
+	0x1f38, 0x0003,  0x0300, 0x0e10,  0x0301, 0x0e12,  0x0342, 0x0e14,
+	/* 0x039f 0x0313 */
+	0x1f48, 0x0002,  0x0300, 0x0e16,  0x0301, 0x0e18,
+	/* 0x03a9 0x0313 */
+	0x1f68, 0x0003,  0x0300, 0x0e1a,  0x0301, 0x0e1c,  0x0342, 0x0e1e,
+	/* 0x03b1 0x0313 */
+	0x1f00, 0x0003,  0x0300, 0x0e20,  0x0301, 0x0e22,  0x0342, 0x0e24,
+	/* 0x03b5 0x0313 */
+	0x1f10, 0x0002,  0x0300, 0x0e26,  0x0301, 0x0e28,
+	/* 0x03b7 0x0313 */
+	0x1f20, 0x0003,  0x0300, 0x0e2a,  0x0301, 0x0e2c,  0x0342, 0x0e2e,
+	/* 0x03b9 0x0313 */
+	0x1f30, 0x0003,  0x0300, 0x0e30,  0x0301, 0x0e32,  0x0342, 0x0e34,
+	/* 0x03bf 0x0313 */
+	0x1f40, 0x0002,  0x0300, 0x0e36,  0x0301, 0x0e38,
+	/* 0x03c1 0x0313 */
+	0x1fe4, 0x0000,
+	/* 0x03c5 0x0313 */
+	0x1f50, 0x0003,  0x0300, 0x0e3a,  0x0301, 0x0e3c,  0x0342, 0x0e3e,
+	/* 0x03c9 0x0313 */
+	0x1f60, 0x0003,  0x0300, 0x0e40,  0x0301, 0x0e42,  0x0342, 0x0e44,
+	/* 0x0391 0x0314 */
+	0x1f09, 0x0003,  0x0300, 0x0e46,  0x0301, 0x0e48,  0x0342, 0x0e4a,
+	/* 0x0395 0x0314 */
+	0x1f19, 0x0002,  0x0300, 0x0e4c,  0x0301, 0x0e4e,
+	/* 0x0397 0x0314 */
+	0x1f29, 0x0003,  0x0300, 0x0e50,  0x0301, 0x0e52,  0x0342, 0x0e54,
+	/* 0x0399 0x0314 */
+	0x1f39, 0x0003,  0x0300, 0x0e56,  0x0301, 0x0e58,  0x0342, 0x0e5a,
+	/* 0x039f 0x0314 */
+	0x1f49, 0x0002,  0x0300, 0x0e5c,  0x0301, 0x0e5e,
+	/* 0x03a1 0x0314 */
+	0x1fec, 0x0000,
+	/* 0x03a5 0x0314 */
+	0x1f59, 0x0003,  0x0300, 0x0e60,  0x0301, 0x0e62,  0x0342, 0x0e64,
+	/* 0x03a9 0x0314 */
+	0x1f69, 0x0003,  0x0300, 0x0e66,  0x0301, 0x0e68,  0x0342, 0x0e6a,
+	/* 0x03b1 0x0314 */
+	0x1f01, 0x0003,  0x0300, 0x0e6c,  0x0301, 0x0e6e,  0x0342, 0x0e70,
+	/* 0x03b5 0x0314 */
+	0x1f11, 0x0002,  0x0300, 0x0e72,  0x0301, 0x0e74,
+	/* 0x03b7 0x0314 */
+	0x1f21, 0x0003,  0x0300, 0x0e76,  0x0301, 0x0e78,  0x0342, 0x0e7a,
+	/* 0x03b9 0x0314 */
+	0x1f31, 0x0003,  0x0300, 0x0e7c,  0x0301, 0x0e7e,  0x0342, 0x0e80,
+	/* 0x03bf 0x0314 */
+	0x1f41, 0x0002,  0x0300, 0x0e82,  0x0301, 0x0e84,
+	/* 0x03c1 0x0314 */
+	0x1fe5, 0x0000,
+	/* 0x03c5 0x0314 */
+	0x1f51, 0x0003,  0x0300, 0x0e86,  0x0301, 0x0e88,  0x0342, 0x0e8a,
+	/* 0x03c9 0x0314 */
+	0x1f61, 0x0003,  0x0300, 0x0e8c,  0x0301, 0x0e8e,  0x0342, 0x0e90,
+	/* 0x004f 0x031b */
+	0x01a0, 0x0005,  0x0300, 0x0e92,  0x0301, 0x0e94,  0x0303, 0x0e96,
+	0x0309, 0x0e98,  0x0323, 0x0e9a,
+	/* 0x0055 0x031b */
+	0x01af, 0x0005,  0x0300, 0x0e9c,  0x0301, 0x0e9e,  0x0303, 0x0ea0,
+	0x0309, 0x0ea2,  0x0323, 0x0ea4,
+	/* 0x006f 0x031b */
+	0x01a1, 0x0005,  0x0300, 0x0ea6,  0x0301, 0x0ea8,  0x0303, 0x0eaa,
+	0x0309, 0x0eac,  0x0323, 0x0eae,
+	/* 0x0075 0x031b */
+	0x01b0, 0x0005,  0x0300, 0x0eb0,  0x0301, 0x0eb2,  0x0303, 0x0eb4,
+	0x0309, 0x0eb6,  0x0323, 0x0eb8,
+	/* 0x0041 0x0323 */
+	0x1ea0, 0x0002,  0x0302, 0x0eba,  0x0306, 0x0ebc,
+	/* 0x0042 0x0323 */
+	0x1e04, 0x0000,
+	/* 0x0044 0x0323 */
+	0x1e0c, 0x0000,
+	/* 0x0045 0x0323 */
+	0x1eb8, 0x0001,  0x0302, 0x0ebe,
+	/* 0x0048 0x0323 */
+	0x1e24, 0x0000,
+	/* 0x0049 0x0323 */
+	0x1eca, 0x0000,
+	/* 0x004b 0x0323 */
+	0x1e32, 0x0000,
+	/* 0x004c 0x0323 */
+	0x1e36, 0x0001,  0x0304, 0x0ec0,
+	/* 0x004d 0x0323 */
+	0x1e42, 0x0000,
+	/* 0x004e 0x0323 */
+	0x1e46, 0x0000,
+	/* 0x004f 0x0323 */
+	0x1ecc, 0x0001,  0x0302, 0x0ec2,
+	/* 0x0052 0x0323 */
+	0x1e5a, 0x0001,  0x0304, 0x0ec4,
+	/* 0x0053 0x0323 */
+	0x1e62, 0x0001,  0x0307, 0x0ec6,
+	/* 0x0054 0x0323 */
+	0x1e6c, 0x0000,
+	/* 0x0055 0x0323 */
+	0x1ee4, 0x0000,
+	/* 0x0056 0x0323 */
+	0x1e7e, 0x0000,
+	/* 0x0057 0x0323 */
+	0x1e88, 0x0000,
+	/* 0x0059 0x0323 */
+	0x1ef4, 0x0000,
+	/* 0x005a 0x0323 */
+	0x1e92, 0x0000,
+	/* 0x0061 0x0323 */
+	0x1ea1, 0x0002,  0x0302, 0x0ec8,  0x0306, 0x0eca,
+	/* 0x0062 0x0323 */
+	0x1e05, 0x0000,
+	/* 0x0064 0x0323 */
+	0x1e0d, 0x0000,
+	/* 0x0065 0x0323 */
+	0x1eb9, 0x0001,  0x0302, 0x0ecc,
+	/* 0x0068 0x0323 */
+	0x1e25, 0x0000,
+	/* 0x0069 0x0323 */
+	0x1ecb, 0x0000,
+	/* 0x006b 0x0323 */
+	0x1e33, 0x0000,
+	/* 0x006c 0x0323 */
+	0x1e37, 0x0001,  0x0304, 0x0ece,
+	/* 0x006d 0x0323 */
+	0x1e43, 0x0000,
+	/* 0x006e 0x0323 */
+	0x1e47, 0x0000,
+	/* 0x006f 0x0323 */
+	0x1ecd, 0x0001,  0x0302, 0x0ed0,
+	/* 0x0072 0x0323 */
+	0x1e5b, 0x0001,  0x0304, 0x0ed2,
+	/* 0x0073 0x0323 */
+	0x1e63, 0x0001,  0x0307, 0x0ed4,
+	/* 0x0074 0x0323 */
+	0x1e6d, 0x0000,
+	/* 0x0075 0x0323 */
+	0x1ee5, 0x0000,
+	/* 0x0076 0x0323 */
+	0x1e7f, 0x0000,
+	/* 0x0077 0x0323 */
+	0x1e89, 0x0000,
+	/* 0x0079 0x0323 */
+	0x1ef5, 0x0000,
+	/* 0x007a 0x0323 */
+	0x1e93, 0x0000,
+	/* 0x0055 0x0324 */
+	0x1e72, 0x0000,
+	/* 0x0075 0x0324 */
+	0x1e73, 0x0000,
+	/* 0x0041 0x0325 */
+	0x1e00, 0x0000,
+	/* 0x0061 0x0325 */
+	0x1e01, 0x0000,
+	/* 0x0043 0x0327 */
+	0x00c7, 0x0001,  0x0301, 0x0ed6,
+	/* 0x0044 0x0327 */
+	0x1e10, 0x0000,
+	/* 0x0045 0x0327 */
+	0x0000, 0x0001,  0x0306, 0x0ed8,
+	/* 0x0047 0x0327 */
+	0x0122, 0x0000,
+	/* 0x0048 0x0327 */
+	0x1e28, 0x0000,
+	/* 0x004b 0x0327 */
+	0x0136, 0x0000,
+	/* 0x004c 0x0327 */
+	0x013b, 0x0000,
+	/* 0x004e 0x0327 */
+	0x0145, 0x0000,
+	/* 0x0052 0x0327 */
+	0x0156, 0x0000,
+	/* 0x0053 0x0327 */
+	0x015e, 0x0000,
+	/* 0x0054 0x0327 */
+	0x0162, 0x0000,
+	/* 0x0063 0x0327 */
+	0x00e7, 0x0001,  0x0301, 0x0eda,
+	/* 0x0064 0x0327 */
+	0x1e11, 0x0000,
+	/* 0x0065 0x0327 */
+	0x0000, 0x0001,  0x0306, 0x0edc,
+	/* 0x0067 0x0327 */
+	0x0123, 0x0000,
+	/* 0x0068 0x0327 */
+	0x1e29, 0x0000,
+	/* 0x006b 0x0327 */
+	0x0137, 0x0000,
+	/* 0x006c 0x0327 */
+	0x013c, 0x0000,
+	/* 0x006e 0x0327 */
+	0x0146, 0x0000,
+	/* 0x0072 0x0327 */
+	0x0157, 0x0000,
+	/* 0x0073 0x0327 */
+	0x015f, 0x0000,
+	/* 0x0074 0x0327 */
+	0x0163, 0x0000,
+	/* 0x0041 0x0328 */
+	0x0104, 0x0000,
+	/* 0x0045 0x0328 */
+	0x0118, 0x0000,
+	/* 0x0049 0x0328 */
+	0x012e, 0x0000,
+	/* 0x004f 0x0328 */
+	0x01ea, 0x0001,  0x0304, 0x0ede,
+	/* 0x0055 0x0328 */
+	0x0172, 0x0000,
+	/* 0x0061 0x0328 */
+	0x0105, 0x0000,
+	/* 0x0065 0x0328 */
+	0x0119, 0x0000,
+	/* 0x0069 0x0328 */
+	0x012f, 0x0000,
+	/* 0x006f 0x0328 */
+	0x01eb, 0x0001,  0x0304, 0x0ee0,
+	/* 0x0075 0x0328 */
+	0x0173, 0x0000,
+	/* 0x0044 0x032d */
+	0x1e12, 0x0000,
+	/* 0x0045 0x032d */
+	0x1e18, 0x0000,
+	/* 0x004c 0x032d */
+	0x1e3c, 0x0000,
+	/* 0x004e 0x032d */
+	0x1e4a, 0x0000,
+	/* 0x0054 0x032d */
+	0x1e70, 0x0000,
+	/* 0x0055 0x032d */
+	0x1e76, 0x0000,
+	/* 0x0064 0x032d */
+	0x1e13, 0x0000,
+	/* 0x0065 0x032d */
+	0x1e19, 0x0000,
+	/* 0x006c 0x032d */
+	0x1e3d, 0x0000,
+	/* 0x006e 0x032d */
+	0x1e4b, 0x0000,
+	/* 0x0074 0x032d */
+	0x1e71, 0x0000,
+	/* 0x0075 0x032d */
+	0x1e77, 0x0000,
+	/* 0x0048 0x032e */
+	0x1e2a, 0x0000,
+	/* 0x0068 0x032e */
+	0x1e2b, 0x0000,
+	/* 0x0045 0x0330 */
+	0x1e1a, 0x0000,
+	/* 0x0049 0x0330 */
+	0x1e2c, 0x0000,
+	/* 0x0055 0x0330 */
+	0x1e74, 0x0000,
+	/* 0x0065 0x0330 */
+	0x1e1b, 0x0000,
+	/* 0x0069 0x0330 */
+	0x1e2d, 0x0000,
+	/* 0x0075 0x0330 */
+	0x1e75, 0x0000,
+	/* 0x0042 0x0331 */
+	0x1e06, 0x0000,
+	/* 0x0044 0x0331 */
+	0x1e0e, 0x0000,
+	/* 0x004b 0x0331 */
+	0x1e34, 0x0000,
+	/* 0x004c 0x0331 */
+	0x1e3a, 0x0000,
+	/* 0x004e 0x0331 */
+	0x1e48, 0x0000,
+	/* 0x0052 0x0331 */
+	0x1e5e, 0x0000,
+	/* 0x0054 0x0331 */
+	0x1e6e, 0x0000,
+	/* 0x005a 0x0331 */
+	0x1e94, 0x0000,
+	/* 0x0062 0x0331 */
+	0x1e07, 0x0000,
+	/* 0x0064 0x0331 */
+	0x1e0f, 0x0000,
+	/* 0x0068 0x0331 */
+	0x1e96, 0x0000,
+	/* 0x006b 0x0331 */
+	0x1e35, 0x0000,
+	/* 0x006c 0x0331 */
+	0x1e3b, 0x0000,
+	/* 0x006e 0x0331 */
+	0x1e49, 0x0000,
+	/* 0x0072 0x0331 */
+	0x1e5f, 0x0000,
+	/* 0x0074 0x0331 */
+	0x1e6f, 0x0000,
+	/* 0x007a 0x0331 */
+	0x1e95, 0x0000,
+	/* 0x00a8 0x0342 */
+	0x1fc1, 0x0000,
+	/* 0x03b1 0x0342 */
+	0x1fb6, 0x0000,
+	/* 0x03b7 0x0342 */
+	0x1fc6, 0x0000,
+	/* 0x03b9 0x0342 */
+	0x1fd6, 0x0000,
+	/* 0x03c5 0x0342 */
+	0x1fe6, 0x0000,
+	/* 0x03c9 0x0342 */
+	0x1ff6, 0x0000,
+	/* 0x1fbf 0x0342 */
+	0x1fcf, 0x0000,
+	/* 0x1ffe 0x0342 */
+	0x1fdf, 0x0000,
+	/* 0x0391 0x0345 */
+	0x1fbc, 0x0002,  0x0313, 0x0ee2,  0x0314, 0x0eea,
+	/* 0x0397 0x0345 */
+	0x1fcc, 0x0002,  0x0313, 0x0ef2,  0x0314, 0x0efa,
+	/* 0x03a9 0x0345 */
+	0x1ffc, 0x0002,  0x0313, 0x0f02,  0x0314, 0x0f0a,
+	/* 0x03b1 0x0345 */
+	0x1fb3, 0x0005,  0x0300, 0x0f12,  0x0301, 0x0f14,  0x0313, 0x0f16,
+	0x0314, 0x0f1e,  0x0342, 0x0f26,
+	/* 0x03b7 0x0345 */
+	0x1fc3, 0x0005,  0x0300, 0x0f28,  0x0301, 0x0f2a,  0x0313, 0x0f2c,
+	0x0314, 0x0f34,  0x0342, 0x0f3c,
+	/* 0x03bf 0x0345 */
+	0x0000, 0x0001,  0x0301, 0x0f3e,
+	/* 0x03c9 0x0345 */
+	0x1ff3, 0x0004,  0x0300, 0x0f40,  0x0313, 0x0f42,  0x0314, 0x0f4a,
+	0x0342, 0x0f52,
+	/* 0x05d0 0x05b7 */
+	0xfb2e, 0x0000,
+	/* 0x05f2 0x05b7 */
+	0xfb1f, 0x0000,
+	/* 0x05d0 0x05b8 */
+	0xfb2f, 0x0000,
+	/* 0x05d5 0x05b9 */
+	0xfb4b, 0x0000,
+	/* 0x05d0 0x05bc */
+	0xfb30, 0x0000,
+	/* 0x05d1 0x05bc */
+	0xfb31, 0x0000,
+	/* 0x05d2 0x05bc */
+	0xfb32, 0x0000,
+	/* 0x05d3 0x05bc */
+	0xfb33, 0x0000,
+	/* 0x05d4 0x05bc */
+	0xfb34, 0x0000,
+	/* 0x05d5 0x05bc */
+	0xfb35, 0x0000,
+	/* 0x05d6 0x05bc */
+	0xfb36, 0x0000,
+	/* 0x05d8 0x05bc */
+	0xfb38, 0x0000,
+	/* 0x05d9 0x05bc */
+	0xfb39, 0x0000,
+	/* 0x05da 0x05bc */
+	0xfb3a, 0x0000,
+	/* 0x05db 0x05bc */
+	0xfb3b, 0x0000,
+	/* 0x05dc 0x05bc */
+	0xfb3c, 0x0000,
+	/* 0x05de 0x05bc */
+	0xfb3e, 0x0000,
+	/* 0x05e0 0x05bc */
+	0xfb40, 0x0000,
+	/* 0x05e1 0x05bc */
+	0xfb41, 0x0000,
+	/* 0x05e3 0x05bc */
+	0xfb43, 0x0000,
+	/* 0x05e4 0x05bc */
+	0xfb44, 0x0000,
+	/* 0x05e6 0x05bc */
+	0xfb46, 0x0000,
+	/* 0x05e7 0x05bc */
+	0xfb47, 0x0000,
+	/* 0x05e8 0x05bc */
+	0xfb48, 0x0000,
+	/* 0x05e9 0x05bc */
+	0xfb49, 0x0002,  0x05c1, 0x0f54,  0x05c2, 0x0f56,
+	/* 0x05ea 0x05bc */
+	0xfb4a, 0x0000,
+	/* 0x05d1 0x05bf */
+	0xfb4c, 0x0000,
+	/* 0x05db 0x05bf */
+	0xfb4d, 0x0000,
+	/* 0x05e4 0x05bf */
+	0xfb4e, 0x0000,
+	/* 0x05e9 0x05c1 */
+	0xfb2a, 0x0000,
+	/* 0x05e9 0x05c2 */
+	0xfb2b, 0x0000,
+	/* 0x0915 0x093c */
+	0x0958, 0x0000,
+	/* 0x0916 0x093c */
+	0x0959, 0x0000,
+	/* 0x0917 0x093c */
+	0x095a, 0x0000,
+	/* 0x091c 0x093c */
+	0x095b, 0x0000,
+	/* 0x0921 0x093c */
+	0x095c, 0x0000,
+	/* 0x0922 0x093c */
+	0x095d, 0x0000,
+	/* 0x0928 0x093c */
+	0x0929, 0x0000,
+	/* 0x092b 0x093c */
+	0x095e, 0x0000,
+	/* 0x092f 0x093c */
+	0x095f, 0x0000,
+	/* 0x0930 0x093c */
+	0x0931, 0x0000,
+	/* 0x0933 0x093c */
+	0x0934, 0x0000,
+	/* 0x09a1 0x09bc */
+	0x09dc, 0x0000,
+	/* 0x09a2 0x09bc */
+	0x09dd, 0x0000,
+	/* 0x09ac 0x09bc */
+	0x09b0, 0x0000,
+	/* 0x09af 0x09bc */
+	0x09df, 0x0000,
+	/* 0x09c7 0x09be */
+	0x09cb, 0x0000,
+	/* 0x09c7 0x09d7 */
+	0x09cc, 0x0000,
+	/* 0x0a16 0x0a3c */
+	0x0a59, 0x0000,
+	/* 0x0a17 0x0a3c */
+	0x0a5a, 0x0000,
+	/* 0x0a1c 0x0a3c */
+	0x0a5b, 0x0000,
+	/* 0x0a21 0x0a3c */
+	0x0a5c, 0x0000,
+	/* 0x0a2b 0x0a3c */
+	0x0a5e, 0x0000,
+	/* 0x0b21 0x0b3c */
+	0x0b5c, 0x0000,
+	/* 0x0b22 0x0b3c */
+	0x0b5d, 0x0000,
+	/* 0x0b2f 0x0b3c */
+	0x0b5f, 0x0000,
+	/* 0x0b47 0x0b3e */
+	0x0b4b, 0x0000,
+	/* 0x0b47 0x0b56 */
+	0x0b48, 0x0000,
+	/* 0x0b47 0x0b57 */
+	0x0b4c, 0x0000,
+	/* 0x0bc6 0x0bbe */
+	0x0bca, 0x0000,
+	/* 0x0bc7 0x0bbe */
+	0x0bcb, 0x0000,
+	/* 0x0b92 0x0bd7 */
+	0x0b94, 0x0000,
+	/* 0x0bc6 0x0bd7 */
+	0x0bcc, 0x0000,
+	/* 0x0c46 0x0c56 */
+	0x0c48, 0x0000,
+	/* 0x0cc6 0x0cc2 */
+	0x0cca, 0x0001,  0x0cd5, 0x0f58,
+	/* 0x0cbf 0x0cd5 */
+	0x0cc0, 0x0000,
+	/* 0x0cc6 0x0cd5 */
+	0x0cc7, 0x0000,
+	/* 0x0cc6 0x0cd6 */
+	0x0cc8, 0x0000,
+	/* 0x0d46 0x0d3e */
+	0x0d4a, 0x0000,
+	/* 0x0d47 0x0d3e */
+	0x0d4b, 0x0000,
+	/* 0x0d46 0x0d57 */
+	0x0d4c, 0x0000,
+	/* 0x0e4d 0x0e32 */
+	0x0e33, 0x0000,
+	/* 0x0ecd 0x0eb2 */
+	0x0eb3, 0x0000,
+	/* 0x0f72 0x0f71 */
+	0x0f73, 0x0000,
+	/* 0x0f74 0x0f71 */
+	0x0f75, 0x0000,
+	/* 0x0f80 0x0f71 */
+	0x0f81, 0x0000,
+	/* 0x0fb2 0x0f80 */
+	0x0f76, 0x0001,  0x0f71, 0x0f5a,
+	/* 0x0fb3 0x0f80 */
+	0x0f78, 0x0001,  0x0f71, 0x0f5c,
+	/* 0x0f40 0x0fb5 */
+	0x0f69, 0x0000,
+	/* 0x0f90 0x0fb5 */
+	0x0fb9, 0x0000,
+	/* 0x0f42 0x0fb7 */
+	0x0f43, 0x0000,
+	/* 0x0f4c 0x0fb7 */
+	0x0f4d, 0x0000,
+	/* 0x0f51 0x0fb7 */
+	0x0f52, 0x0000,
+	/* 0x0f56 0x0fb7 */
+	0x0f57, 0x0000,
+	/* 0x0f5b 0x0fb7 */
+	0x0f5c, 0x0000,
+	/* 0x0f92 0x0fb7 */
+	0x0f93, 0x0000,
+	/* 0x0f9c 0x0fb7 */
+	0x0f9d, 0x0000,
+	/* 0x0fa1 0x0fb7 */
+	0x0fa2, 0x0000,
+	/* 0x0fa6 0x0fb7 */
+	0x0fa7, 0x0000,
+	/* 0x0fab 0x0fb7 */
+	0x0fac, 0x0000,
+	/* 0x3046 0x3099 */
+	0x3094, 0x0000,
+	/* 0x304b 0x3099 */
+	0x304c, 0x0000,
+	/* 0x304d 0x3099 */
+	0x304e, 0x0000,
+	/* 0x304f 0x3099 */
+	0x3050, 0x0000,
+	/* 0x3051 0x3099 */
+	0x3052, 0x0000,
+	/* 0x3053 0x3099 */
+	0x3054, 0x0000,
+	/* 0x3055 0x3099 */
+	0x3056, 0x0000,
+	/* 0x3057 0x3099 */
+	0x3058, 0x0000,
+	/* 0x3059 0x3099 */
+	0x305a, 0x0000,
+	/* 0x305b 0x3099 */
+	0x305c, 0x0000,
+	/* 0x305d 0x3099 */
+	0x305e, 0x0000,
+	/* 0x305f 0x3099 */
+	0x3060, 0x0000,
+	/* 0x3061 0x3099 */
+	0x3062, 0x0000,
+	/* 0x3064 0x3099 */
+	0x3065, 0x0000,
+	/* 0x3066 0x3099 */
+	0x3067, 0x0000,
+	/* 0x3068 0x3099 */
+	0x3069, 0x0000,
+	/* 0x306f 0x3099 */
+	0x3070, 0x0000,
+	/* 0x3072 0x3099 */
+	0x3073, 0x0000,
+	/* 0x3075 0x3099 */
+	0x3076, 0x0000,
+	/* 0x3078 0x3099 */
+	0x3079, 0x0000,
+	/* 0x307b 0x3099 */
+	0x307c, 0x0000,
+	/* 0x309d 0x3099 */
+	0x309e, 0x0000,
+	/* 0x30a6 0x3099 */
+	0x30f4, 0x0000,
+	/* 0x30ab 0x3099 */
+	0x30ac, 0x0000,
+	/* 0x30ad 0x3099 */
+	0x30ae, 0x0000,
+	/* 0x30af 0x3099 */
+	0x30b0, 0x0000,
+	/* 0x30b1 0x3099 */
+	0x30b2, 0x0000,
+	/* 0x30b3 0x3099 */
+	0x30b4, 0x0000,
+	/* 0x30b5 0x3099 */
+	0x30b6, 0x0000,
+	/* 0x30b7 0x3099 */
+	0x30b8, 0x0000,
+	/* 0x30b9 0x3099 */
+	0x30ba, 0x0000,
+	/* 0x30bb 0x3099 */
+	0x30bc, 0x0000,
+	/* 0x30bd 0x3099 */
+	0x30be, 0x0000,
+	/* 0x30bf 0x3099 */
+	0x30c0, 0x0000,
+	/* 0x30c1 0x3099 */
+	0x30c2, 0x0000,
+	/* 0x30c4 0x3099 */
+	0x30c5, 0x0000,
+	/* 0x30c6 0x3099 */
+	0x30c7, 0x0000,
+	/* 0x30c8 0x3099 */
+	0x30c9, 0x0000,
+	/* 0x30cf 0x3099 */
+	0x30d0, 0x0000,
+	/* 0x30d2 0x3099 */
+	0x30d3, 0x0000,
+	/* 0x30d5 0x3099 */
+	0x30d6, 0x0000,
+	/* 0x30d8 0x3099 */
+	0x30d9, 0x0000,
+	/* 0x30db 0x3099 */
+	0x30dc, 0x0000,
+	/* 0x30ef 0x3099 */
+	0x30f7, 0x0000,
+	/* 0x30f0 0x3099 */
+	0x30f8, 0x0000,
+	/* 0x30f1 0x3099 */
+	0x30f9, 0x0000,
+	/* 0x30f2 0x3099 */
+	0x30fa, 0x0000,
+	/* 0x30fd 0x3099 */
+	0x30fe, 0x0000,
+	/* 0x306f 0x309a */
+	0x3071, 0x0000,
+	/* 0x3072 0x309a */
+	0x3074, 0x0000,
+	/* 0x3075 0x309a */
+	0x3077, 0x0000,
+	/* 0x3078 0x309a */
+	0x307a, 0x0000,
+	/* 0x307b 0x309a */
+	0x307d, 0x0000,
+	/* 0x30cf 0x309a */
+	0x30d1, 0x0000,
+	/* 0x30d2 0x309a */
+	0x30d4, 0x0000,
+	/* 0x30d5 0x309a */
+	0x30d7, 0x0000,
+	/* 0x30d8 0x309a */
+	0x30da, 0x0000,
+	/* 0x30db 0x309a */
+	0x30dd, 0x0000,
+	/* 0x0307 0x0053 0x0301 */
+	0x1e64, 0x0000,
+	/* 0x0307 0x0073 0x0301 */
+	0x1e65, 0x0000,
+	/* 0x0300 0x0041 0x0302 */
+	0x1ea6, 0x0000,
+	/* 0x0301 0x0041 0x0302 */
+	0x1ea4, 0x0000,
+	/* 0x0303 0x0041 0x0302 */
+	0x1eaa, 0x0000,
+	/* 0x0309 0x0041 0x0302 */
+	0x1ea8, 0x0000,
+	/* 0x0300 0x0045 0x0302 */
+	0x1ec0, 0x0000,
+	/* 0x0301 0x0045 0x0302 */
+	0x1ebe, 0x0000,
+	/* 0x0303 0x0045 0x0302 */
+	0x1ec4, 0x0000,
+	/* 0x0309 0x0045 0x0302 */
+	0x1ec2, 0x0000,
+	/* 0x0300 0x004f 0x0302 */
+	0x1ed2, 0x0000,
+	/* 0x0301 0x004f 0x0302 */
+	0x1ed0, 0x0000,
+	/* 0x0303 0x004f 0x0302 */
+	0x1ed6, 0x0000,
+	/* 0x0309 0x004f 0x0302 */
+	0x1ed4, 0x0000,
+	/* 0x0300 0x0061 0x0302 */
+	0x1ea7, 0x0000,
+	/* 0x0301 0x0061 0x0302 */
+	0x1ea5, 0x0000,
+	/* 0x0303 0x0061 0x0302 */
+	0x1eab, 0x0000,
+	/* 0x0309 0x0061 0x0302 */
+	0x1ea9, 0x0000,
+	/* 0x0300 0x0065 0x0302 */
+	0x1ec1, 0x0000,
+	/* 0x0301 0x0065 0x0302 */
+	0x1ebf, 0x0000,
+	/* 0x0303 0x0065 0x0302 */
+	0x1ec5, 0x0000,
+	/* 0x0309 0x0065 0x0302 */
+	0x1ec3, 0x0000,
+	/* 0x0300 0x006f 0x0302 */
+	0x1ed3, 0x0000,
+	/* 0x0301 0x006f 0x0302 */
+	0x1ed1, 0x0000,
+	/* 0x0303 0x006f 0x0302 */
+	0x1ed7, 0x0000,
+	/* 0x0309 0x006f 0x0302 */
+	0x1ed5, 0x0000,
+	/* 0x0301 0x004f 0x0303 */
+	0x1e4c, 0x0000,
+	/* 0x0308 0x004f 0x0303 */
+	0x1e4e, 0x0000,
+	/* 0x0301 0x0055 0x0303 */
+	0x1e78, 0x0000,
+	/* 0x0301 0x006f 0x0303 */
+	0x1e4d, 0x0000,
+	/* 0x0308 0x006f 0x0303 */
+	0x1e4f, 0x0000,
+	/* 0x0301 0x0075 0x0303 */
+	0x1e79, 0x0000,
+	/* 0x0300 0x0045 0x0304 */
+	0x1e14, 0x0000,
+	/* 0x0301 0x0045 0x0304 */
+	0x1e16, 0x0000,
+	/* 0x0300 0x004f 0x0304 */
+	0x1e50, 0x0000,
+	/* 0x0301 0x004f 0x0304 */
+	0x1e52, 0x0000,
+	/* 0x0308 0x0055 0x0304 */
+	0x1e7a, 0x0000,
+	/* 0x0300 0x0065 0x0304 */
+	0x1e15, 0x0000,
+	/* 0x0301 0x0065 0x0304 */
+	0x1e17, 0x0000,
+	/* 0x0300 0x006f 0x0304 */
+	0x1e51, 0x0000,
+	/* 0x0301 0x006f 0x0304 */
+	0x1e53, 0x0000,
+	/* 0x0308 0x0075 0x0304 */
+	0x1e7b, 0x0000,
+	/* 0x0300 0x0041 0x0306 */
+	0x1eb0, 0x0000,
+	/* 0x0301 0x0041 0x0306 */
+	0x1eae, 0x0000,
+	/* 0x0303 0x0041 0x0306 */
+	0x1eb4, 0x0000,
+	/* 0x0309 0x0041 0x0306 */
+	0x1eb2, 0x0000,
+	/* 0x0300 0x0061 0x0306 */
+	0x1eb1, 0x0000,
+	/* 0x0301 0x0061 0x0306 */
+	0x1eaf, 0x0000,
+	/* 0x0303 0x0061 0x0306 */
+	0x1eb5, 0x0000,
+	/* 0x0309 0x0061 0x0306 */
+	0x1eb3, 0x0000,
+	/* 0x0304 0x0041 0x0307 */
+	0x01e0, 0x0000,
+	/* 0x0304 0x0061 0x0307 */
+	0x01e1, 0x0000,
+	/* 0x0304 0x0041 0x0308 */
+	0x01de, 0x0000,
+	/* 0x0301 0x0049 0x0308 */
+	0x1e2e, 0x0000,
+	/* 0x0300 0x0055 0x0308 */
+	0x01db, 0x0000,
+	/* 0x0301 0x0055 0x0308 */
+	0x01d7, 0x0000,
+	/* 0x0304 0x0055 0x0308 */
+	0x01d5, 0x0000,
+	/* 0x030c 0x0055 0x0308 */
+	0x01d9, 0x0000,
+	/* 0x0304 0x0061 0x0308 */
+	0x01df, 0x0000,
+	/* 0x0301 0x0069 0x0308 */
+	0x1e2f, 0x0000,
+	/* 0x0300 0x0075 0x0308 */
+	0x01dc, 0x0000,
+	/* 0x0301 0x0075 0x0308 */
+	0x01d8, 0x0000,
+	/* 0x0304 0x0075 0x0308 */
+	0x01d6, 0x0000,
+	/* 0x030c 0x0075 0x0308 */
+	0x01da, 0x0000,
+	/* 0x0300 0x03b9 0x0308 */
+	0x1fd2, 0x0000,
+	/* 0x0301 0x03b9 0x0308 */
+	0x1fd3, 0x0000,
+	/* 0x030d 0x03b9 0x0308 */
+	0x0390, 0x0000,
+	/* 0x0342 0x03b9 0x0308 */
+	0x1fd7, 0x0000,
+	/* 0x0300 0x03c5 0x0308 */
+	0x1fe2, 0x0000,
+	/* 0x0301 0x03c5 0x0308 */
+	0x1fe3, 0x0000,
+	/* 0x030d 0x03c5 0x0308 */
+	0x03b0, 0x0000,
+	/* 0x0342 0x03c5 0x0308 */
+	0x1fe7, 0x0000,
+	/* 0x0301 0x0041 0x030a */
+	0x01fa, 0x0000,
+	/* 0x0301 0x0061 0x030a */
+	0x01fb, 0x0000,
+	/* 0x0307 0x0053 0x030c */
+	0x1e66, 0x0000,
+	/* 0x0307 0x0073 0x030c */
+	0x1e67, 0x0000,
+	/* 0x0300 0x0391 0x0313 */
+	0x1f0a, 0x0000,
+	/* 0x0301 0x0391 0x0313 */
+	0x1f0c, 0x0000,
+	/* 0x0342 0x0391 0x0313 */
+	0x1f0e, 0x0000,
+	/* 0x0300 0x0395 0x0313 */
+	0x1f1a, 0x0000,
+	/* 0x0301 0x0395 0x0313 */
+	0x1f1c, 0x0000,
+	/* 0x0300 0x0397 0x0313 */
+	0x1f2a, 0x0000,
+	/* 0x0301 0x0397 0x0313 */
+	0x1f2c, 0x0000,
+	/* 0x0342 0x0397 0x0313 */
+	0x1f2e, 0x0000,
+	/* 0x0300 0x0399 0x0313 */
+	0x1f3a, 0x0000,
+	/* 0x0301 0x0399 0x0313 */
+	0x1f3c, 0x0000,
+	/* 0x0342 0x0399 0x0313 */
+	0x1f3e, 0x0000,
+	/* 0x0300 0x039f 0x0313 */
+	0x1f4a, 0x0000,
+	/* 0x0301 0x039f 0x0313 */
+	0x1f4c, 0x0000,
+	/* 0x0300 0x03a9 0x0313 */
+	0x1f6a, 0x0000,
+	/* 0x0301 0x03a9 0x0313 */
+	0x1f6c, 0x0000,
+	/* 0x0342 0x03a9 0x0313 */
+	0x1f6e, 0x0000,
+	/* 0x0300 0x03b1 0x0313 */
+	0x1f02, 0x0000,
+	/* 0x0301 0x03b1 0x0313 */
+	0x1f04, 0x0000,
+	/* 0x0342 0x03b1 0x0313 */
+	0x1f06, 0x0000,
+	/* 0x0300 0x03b5 0x0313 */
+	0x1f12, 0x0000,
+	/* 0x0301 0x03b5 0x0313 */
+	0x1f14, 0x0000,
+	/* 0x0300 0x03b7 0x0313 */
+	0x1f22, 0x0000,
+	/* 0x0301 0x03b7 0x0313 */
+	0x1f24, 0x0000,
+	/* 0x0342 0x03b7 0x0313 */
+	0x1f26, 0x0000,
+	/* 0x0300 0x03b9 0x0313 */
+	0x1f32, 0x0000,
+	/* 0x0301 0x03b9 0x0313 */
+	0x1f34, 0x0000,
+	/* 0x0342 0x03b9 0x0313 */
+	0x1f36, 0x0000,
+	/* 0x0300 0x03bf 0x0313 */
+	0x1f42, 0x0000,
+	/* 0x0301 0x03bf 0x0313 */
+	0x1f44, 0x0000,
+	/* 0x0300 0x03c5 0x0313 */
+	0x1f52, 0x0000,
+	/* 0x0301 0x03c5 0x0313 */
+	0x1f54, 0x0000,
+	/* 0x0342 0x03c5 0x0313 */
+	0x1f56, 0x0000,
+	/* 0x0300 0x03c9 0x0313 */
+	0x1f62, 0x0000,
+	/* 0x0301 0x03c9 0x0313 */
+	0x1f64, 0x0000,
+	/* 0x0342 0x03c9 0x0313 */
+	0x1f66, 0x0000,
+	/* 0x0300 0x0391 0x0314 */
+	0x1f0b, 0x0000,
+	/* 0x0301 0x0391 0x0314 */
+	0x1f0d, 0x0000,
+	/* 0x0342 0x0391 0x0314 */
+	0x1f0f, 0x0000,
+	/* 0x0300 0x0395 0x0314 */
+	0x1f1b, 0x0000,
+	/* 0x0301 0x0395 0x0314 */
+	0x1f1d, 0x0000,
+	/* 0x0300 0x0397 0x0314 */
+	0x1f2b, 0x0000,
+	/* 0x0301 0x0397 0x0314 */
+	0x1f2d, 0x0000,
+	/* 0x0342 0x0397 0x0314 */
+	0x1f2f, 0x0000,
+	/* 0x0300 0x0399 0x0314 */
+	0x1f3b, 0x0000,
+	/* 0x0301 0x0399 0x0314 */
+	0x1f3d, 0x0000,
+	/* 0x0342 0x0399 0x0314 */
+	0x1f3f, 0x0000,
+	/* 0x0300 0x039f 0x0314 */
+	0x1f4b, 0x0000,
+	/* 0x0301 0x039f 0x0314 */
+	0x1f4d, 0x0000,
+	/* 0x0300 0x03a5 0x0314 */
+	0x1f5b, 0x0000,
+	/* 0x0301 0x03a5 0x0314 */
+	0x1f5d, 0x0000,
+	/* 0x0342 0x03a5 0x0314 */
+	0x1f5f, 0x0000,
+	/* 0x0300 0x03a9 0x0314 */
+	0x1f6b, 0x0000,
+	/* 0x0301 0x03a9 0x0314 */
+	0x1f6d, 0x0000,
+	/* 0x0342 0x03a9 0x0314 */
+	0x1f6f, 0x0000,
+	/* 0x0300 0x03b1 0x0314 */
+	0x1f03, 0x0000,
+	/* 0x0301 0x03b1 0x0314 */
+	0x1f05, 0x0000,
+	/* 0x0342 0x03b1 0x0314 */
+	0x1f07, 0x0000,
+	/* 0x0300 0x03b5 0x0314 */
+	0x1f13, 0x0000,
+	/* 0x0301 0x03b5 0x0314 */
+	0x1f15, 0x0000,
+	/* 0x0300 0x03b7 0x0314 */
+	0x1f23, 0x0000,
+	/* 0x0301 0x03b7 0x0314 */
+	0x1f25, 0x0000,
+	/* 0x0342 0x03b7 0x0314 */
+	0x1f27, 0x0000,
+	/* 0x0300 0x03b9 0x0314 */
+	0x1f33, 0x0000,
+	/* 0x0301 0x03b9 0x0314 */
+	0x1f35, 0x0000,
+	/* 0x0342 0x03b9 0x0314 */
+	0x1f37, 0x0000,
+	/* 0x0300 0x03bf 0x0314 */
+	0x1f43, 0x0000,
+	/* 0x0301 0x03bf 0x0314 */
+	0x1f45, 0x0000,
+	/* 0x0300 0x03c5 0x0314 */
+	0x1f53, 0x0000,
+	/* 0x0301 0x03c5 0x0314 */
+	0x1f55, 0x0000,
+	/* 0x0342 0x03c5 0x0314 */
+	0x1f57, 0x0000,
+	/* 0x0300 0x03c9 0x0314 */
+	0x1f63, 0x0000,
+	/* 0x0301 0x03c9 0x0314 */
+	0x1f65, 0x0000,
+	/* 0x0342 0x03c9 0x0314 */
+	0x1f67, 0x0000,
+	/* 0x0300 0x004f 0x031b */
+	0x1edc, 0x0000,
+	/* 0x0301 0x004f 0x031b */
+	0x1eda, 0x0000,
+	/* 0x0303 0x004f 0x031b */
+	0x1ee0, 0x0000,
+	/* 0x0309 0x004f 0x031b */
+	0x1ede, 0x0000,
+	/* 0x0323 0x004f 0x031b */
+	0x1ee2, 0x0000,
+	/* 0x0300 0x0055 0x031b */
+	0x1eea, 0x0000,
+	/* 0x0301 0x0055 0x031b */
+	0x1ee8, 0x0000,
+	/* 0x0303 0x0055 0x031b */
+	0x1eee, 0x0000,
+	/* 0x0309 0x0055 0x031b */
+	0x1eec, 0x0000,
+	/* 0x0323 0x0055 0x031b */
+	0x1ef0, 0x0000,
+	/* 0x0300 0x006f 0x031b */
+	0x1edd, 0x0000,
+	/* 0x0301 0x006f 0x031b */
+	0x1edb, 0x0000,
+	/* 0x0303 0x006f 0x031b */
+	0x1ee1, 0x0000,
+	/* 0x0309 0x006f 0x031b */
+	0x1edf, 0x0000,
+	/* 0x0323 0x006f 0x031b */
+	0x1ee3, 0x0000,
+	/* 0x0300 0x0075 0x031b */
+	0x1eeb, 0x0000,
+	/* 0x0301 0x0075 0x031b */
+	0x1ee9, 0x0000,
+	/* 0x0303 0x0075 0x031b */
+	0x1eef, 0x0000,
+	/* 0x0309 0x0075 0x031b */
+	0x1eed, 0x0000,
+	/* 0x0323 0x0075 0x031b */
+	0x1ef1, 0x0000,
+	/* 0x0302 0x0041 0x0323 */
+	0x1eac, 0x0000,
+	/* 0x0306 0x0041 0x0323 */
+	0x1eb6, 0x0000,
+	/* 0x0302 0x0045 0x0323 */
+	0x1ec6, 0x0000,
+	/* 0x0304 0x004c 0x0323 */
+	0x1e38, 0x0000,
+	/* 0x0302 0x004f 0x0323 */
+	0x1ed8, 0x0000,
+	/* 0x0304 0x0052 0x0323 */
+	0x1e5c, 0x0000,
+	/* 0x0307 0x0053 0x0323 */
+	0x1e68, 0x0000,
+	/* 0x0302 0x0061 0x0323 */
+	0x1ead, 0x0000,
+	/* 0x0306 0x0061 0x0323 */
+	0x1eb7, 0x0000,
+	/* 0x0302 0x0065 0x0323 */
+	0x1ec7, 0x0000,
+	/* 0x0304 0x006c 0x0323 */
+	0x1e39, 0x0000,
+	/* 0x0302 0x006f 0x0323 */
+	0x1ed9, 0x0000,
+	/* 0x0304 0x0072 0x0323 */
+	0x1e5d, 0x0000,
+	/* 0x0307 0x0073 0x0323 */
+	0x1e69, 0x0000,
+	/* 0x0301 0x0043 0x0327 */
+	0x1e08, 0x0000,
+	/* 0x0306 0x0045 0x0327 */
+	0x1e1c, 0x0000,
+	/* 0x0301 0x0063 0x0327 */
+	0x1e09, 0x0000,
+	/* 0x0306 0x0065 0x0327 */
+	0x1e1d, 0x0000,
+	/* 0x0304 0x004f 0x0328 */
+	0x01ec, 0x0000,
+	/* 0x0304 0x006f 0x0328 */
+	0x01ed, 0x0000,
+	/* 0x0313 0x0391 0x0345 */
+	0x1f88, 0x0003,  0x0300, 0x0f5e,  0x0301, 0x0f60,  0x0342, 0x0f62,
+	/* 0x0314 0x0391 0x0345 */
+	0x1f89, 0x0003,  0x0300, 0x0f64,  0x0301, 0x0f66,  0x0342, 0x0f68,
+	/* 0x0313 0x0397 0x0345 */
+	0x1f98, 0x0003,  0x0300, 0x0f6a,  0x0301, 0x0f6c,  0x0342, 0x0f6e,
+	/* 0x0314 0x0397 0x0345 */
+	0x1f99, 0x0003,  0x0300, 0x0f70,  0x0301, 0x0f72,  0x0342, 0x0f74,
+	/* 0x0313 0x03a9 0x0345 */
+	0x1fa8, 0x0003,  0x0300, 0x0f76,  0x0301, 0x0f78,  0x0342, 0x0f7a,
+	/* 0x0314 0x03a9 0x0345 */
+	0x1fa9, 0x0003,  0x0300, 0x0f7c,  0x0301, 0x0f7e,  0x0342, 0x0f80,
+	/* 0x0300 0x03b1 0x0345 */
+	0x1fb2, 0x0000,
+	/* 0x0301 0x03b1 0x0345 */
+	0x1fb4, 0x0000,
+	/* 0x0313 0x03b1 0x0345 */
+	0x1f80, 0x0003,  0x0300, 0x0f82,  0x0301, 0x0f84,  0x0342, 0x0f86,
+	/* 0x0314 0x03b1 0x0345 */
+	0x1f81, 0x0003,  0x0300, 0x0f88,  0x0301, 0x0f8a,  0x0342, 0x0f8c,
+	/* 0x0342 0x03b1 0x0345 */
+	0x1fb7, 0x0000,
+	/* 0x0300 0x03b7 0x0345 */
+	0x1fc2, 0x0000,
+	/* 0x0301 0x03b7 0x0345 */
+	0x1fc4, 0x0000,
+	/* 0x0313 0x03b7 0x0345 */
+	0x1f90, 0x0003,  0x0300, 0x0f8e,  0x0301, 0x0f90,  0x0342, 0x0f92,
+	/* 0x0314 0x03b7 0x0345 */
+	0x1f91, 0x0003,  0x0300, 0x0f94,  0x0301, 0x0f96,  0x0342, 0x0f98,
+	/* 0x0342 0x03b7 0x0345 */
+	0x1fc7, 0x0000,
+	/* 0x0301 0x03bf 0x0345 */
+	0x1ff4, 0x0000,
+	/* 0x0300 0x03c9 0x0345 */
+	0x1ff2, 0x0000,
+	/* 0x0313 0x03c9 0x0345 */
+	0x1fa0, 0x0003,  0x0300, 0x0f9a,  0x0301, 0x0f9c,  0x0342, 0x0f9e,
+	/* 0x0314 0x03c9 0x0345 */
+	0x1fa1, 0x0003,  0x0300, 0x0fa0,  0x0301, 0x0fa2,  0x0342, 0x0fa4,
+	/* 0x0342 0x03c9 0x0345 */
+	0x1ff7, 0x0000,
+	/* 0x05c1 0x05e9 0x05bc */
+	0xfb2c, 0x0000,
+	/* 0x05c2 0x05e9 0x05bc */
+	0xfb2d, 0x0000,
+	/* 0x0cd5 0x0cc6 0x0cc2 */
+	0x0ccb, 0x0000,
+	/* 0x0f71 0x0fb2 0x0f80 */
+	0x0f77, 0x0000,
+	/* 0x0f71 0x0fb3 0x0f80 */
+	0x0f79, 0x0000,
+	/* 0x0300 0x0313 0x0391 0x0345 */
+	0x1f8a, 0x0000,
+	/* 0x0301 0x0313 0x0391 0x0345 */
+	0x1f8c, 0x0000,
+	/* 0x0342 0x0313 0x0391 0x0345 */
+	0x1f8e, 0x0000,
+	/* 0x0300 0x0314 0x0391 0x0345 */
+	0x1f8b, 0x0000,
+	/* 0x0301 0x0314 0x0391 0x0345 */
+	0x1f8d, 0x0000,
+	/* 0x0342 0x0314 0x0391 0x0345 */
+	0x1f8f, 0x0000,
+	/* 0x0300 0x0313 0x0397 0x0345 */
+	0x1f9a, 0x0000,
+	/* 0x0301 0x0313 0x0397 0x0345 */
+	0x1f9c, 0x0000,
+	/* 0x0342 0x0313 0x0397 0x0345 */
+	0x1f9e, 0x0000,
+	/* 0x0300 0x0314 0x0397 0x0345 */
+	0x1f9b, 0x0000,
+	/* 0x0301 0x0314 0x0397 0x0345 */
+	0x1f9d, 0x0000,
+	/* 0x0342 0x0314 0x0397 0x0345 */
+	0x1f9f, 0x0000,
+	/* 0x0300 0x0313 0x03a9 0x0345 */
+	0x1faa, 0x0000,
+	/* 0x0301 0x0313 0x03a9 0x0345 */
+	0x1fac, 0x0000,
+	/* 0x0342 0x0313 0x03a9 0x0345 */
+	0x1fae, 0x0000,
+	/* 0x0300 0x0314 0x03a9 0x0345 */
+	0x1fab, 0x0000,
+	/* 0x0301 0x0314 0x03a9 0x0345 */
+	0x1fad, 0x0000,
+	/* 0x0342 0x0314 0x03a9 0x0345 */
+	0x1faf, 0x0000,
+	/* 0x0300 0x0313 0x03b1 0x0345 */
+	0x1f82, 0x0000,
+	/* 0x0301 0x0313 0x03b1 0x0345 */
+	0x1f84, 0x0000,
+	/* 0x0342 0x0313 0x03b1 0x0345 */
+	0x1f86, 0x0000,
+	/* 0x0300 0x0314 0x03b1 0x0345 */
+	0x1f83, 0x0000,
+	/* 0x0301 0x0314 0x03b1 0x0345 */
+	0x1f85, 0x0000,
+	/* 0x0342 0x0314 0x03b1 0x0345 */
+	0x1f87, 0x0000,
+	/* 0x0300 0x0313 0x03b7 0x0345 */
+	0x1f92, 0x0000,
+	/* 0x0301 0x0313 0x03b7 0x0345 */
+	0x1f94, 0x0000,
+	/* 0x0342 0x0313 0x03b7 0x0345 */
+	0x1f96, 0x0000,
+	/* 0x0300 0x0314 0x03b7 0x0345 */
+	0x1f93, 0x0000,
+	/* 0x0301 0x0314 0x03b7 0x0345 */
+	0x1f95, 0x0000,
+	/* 0x0342 0x0314 0x03b7 0x0345 */
+	0x1f97, 0x0000,
+	/* 0x0300 0x0313 0x03c9 0x0345 */
+	0x1fa2, 0x0000,
+	/* 0x0301 0x0313 0x03c9 0x0345 */
+	0x1fa4, 0x0000,
+	/* 0x0342 0x0313 0x03c9 0x0345 */
+	0x1fa6, 0x0000,
+	/* 0x0300 0x0314 0x03c9 0x0345 */
+	0x1fa3, 0x0000,
+	/* 0x0301 0x0314 0x03c9 0x0345 */
+	0x1fa5, 0x0000,
+	/* 0x0342 0x0314 0x03c9 0x0345 */
+	0x1fa7, 0x0000,
+};
diff --git a/kernel/fs/hfsplus/unicode.c b/kernel/fs/hfsplus/unicode.c
new file mode 100644
index 000000000..e8ef121a4
--- /dev/null
+++ b/kernel/fs/hfsplus/unicode.c
@@ -0,0 +1,469 @@
+/*
+ *  linux/fs/hfsplus/unicode.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handler routines for unicode strings
+ */
+
+#include <linux/types.h>
+#include <linux/nls.h>
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+/* Fold the case of a unicode char, given the 16 bit value */
+/* Returns folded char, or 0 if ignorable */
+static inline u16 case_fold(u16 c)
+{
+	u16 tmp;
+
+	tmp = hfsplus_case_fold_table[c >> 8];
+	if (tmp)
+		tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
+	else
+		tmp = c;
+	return tmp;
+}
+
+/* Compare unicode strings, return values like normal strcmp */
+int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
+		       const struct hfsplus_unistr *s2)
+{
+	u16 len1, len2, c1, c2;
+	const hfsplus_unichr *p1, *p2;
+
+	len1 = be16_to_cpu(s1->length);
+	len2 = be16_to_cpu(s2->length);
+	p1 = s1->unicode;
+	p2 = s2->unicode;
+
+	while (1) {
+		c1 = c2 = 0;
+
+		while (len1 && !c1) {
+			c1 = case_fold(be16_to_cpu(*p1));
+			p1++;
+			len1--;
+		}
+		while (len2 && !c2) {
+			c2 = case_fold(be16_to_cpu(*p2));
+			p2++;
+			len2--;
+		}
+
+		if (c1 != c2)
+			return (c1 < c2) ? -1 : 1;
+		if (!c1 && !c2)
+			return 0;
+	}
+}
+
+/* Compare names as a sequence of 16-bit unsigned integers */
+int hfsplus_strcmp(const struct hfsplus_unistr *s1,
+		   const struct hfsplus_unistr *s2)
+{
+	u16 len1, len2, c1, c2;
+	const hfsplus_unichr *p1, *p2;
+	int len;
+
+	len1 = be16_to_cpu(s1->length);
+	len2 = be16_to_cpu(s2->length);
+	p1 = s1->unicode;
+	p2 = s2->unicode;
+
+	for (len = min(len1, len2); len > 0; len--) {
+		c1 = be16_to_cpu(*p1);
+		c2 = be16_to_cpu(*p2);
+		if (c1 != c2)
+			return c1 < c2 ? -1 : 1;
+		p1++;
+		p2++;
+	}
+
+	return len1 < len2 ? -1 :
+	       len1 > len2 ? 1 : 0;
+}
+
+
+#define Hangul_SBase	0xac00
+#define Hangul_LBase	0x1100
+#define Hangul_VBase	0x1161
+#define Hangul_TBase	0x11a7
+#define Hangul_SCount	11172
+#define Hangul_LCount	19
+#define Hangul_VCount	21
+#define Hangul_TCount	28
+#define Hangul_NCount	(Hangul_VCount * Hangul_TCount)
+
+
+static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
+{
+	int i, s, e;
+
+	s = 1;
+	e = p[1];
+	if (!e || cc < p[s * 2] || cc > p[e * 2])
+		return NULL;
+	do {
+		i = (s + e) / 2;
+		if (cc > p[i * 2])
+			s = i + 1;
+		else if (cc < p[i * 2])
+			e = i - 1;
+		else
+			return hfsplus_compose_table + p[i * 2 + 1];
+	} while (s <= e);
+	return NULL;
+}
+
+int hfsplus_uni2asc(struct super_block *sb,
+		const struct hfsplus_unistr *ustr,
+		char *astr, int *len_p)
+{
+	const hfsplus_unichr *ip;
+	struct nls_table *nls = HFSPLUS_SB(sb)->nls;
+	u8 *op;
+	u16 cc, c0, c1;
+	u16 *ce1, *ce2;
+	int i, len, ustrlen, res, compose;
+
+	op = astr;
+	ip = ustr->unicode;
+	ustrlen = be16_to_cpu(ustr->length);
+	len = *len_p;
+	ce1 = NULL;
+	compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
+
+	while (ustrlen > 0) {
+		c0 = be16_to_cpu(*ip++);
+		ustrlen--;
+		/* search for single decomposed char */
+		if (likely(compose))
+			ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0);
+		if (ce1)
+			cc = ce1[0];
+		else
+			cc = 0;
+		if (cc) {
+			/* start of a possibly decomposed Hangul char */
+			if (cc != 0xffff)
+				goto done;
+			if (!ustrlen)
+				goto same;
+			c1 = be16_to_cpu(*ip) - Hangul_VBase;
+			if (c1 < Hangul_VCount) {
+				/* compose the Hangul char */
+				cc = (c0 - Hangul_LBase) * Hangul_VCount;
+				cc = (cc + c1) * Hangul_TCount;
+				cc += Hangul_SBase;
+				ip++;
+				ustrlen--;
+				if (!ustrlen)
+					goto done;
+				c1 = be16_to_cpu(*ip) - Hangul_TBase;
+				if (c1 > 0 && c1 < Hangul_TCount) {
+					cc += c1;
+					ip++;
+					ustrlen--;
+				}
+				goto done;
+			}
+		}
+		while (1) {
+			/* main loop for common case of not composed chars */
+			if (!ustrlen)
+				goto same;
+			c1 = be16_to_cpu(*ip);
+			if (likely(compose))
+				ce1 = hfsplus_compose_lookup(
+					hfsplus_compose_table, c1);
+			if (ce1)
+				break;
+			switch (c0) {
+			case 0:
+				c0 = 0x2400;
+				break;
+			case '/':
+				c0 = ':';
+				break;
+			}
+			res = nls->uni2char(c0, op, len);
+			if (res < 0) {
+				if (res == -ENAMETOOLONG)
+					goto out;
+				*op = '?';
+				res = 1;
+			}
+			op += res;
+			len -= res;
+			c0 = c1;
+			ip++;
+			ustrlen--;
+		}
+		ce2 = hfsplus_compose_lookup(ce1, c0);
+		if (ce2) {
+			i = 1;
+			while (i < ustrlen) {
+				ce1 = hfsplus_compose_lookup(ce2,
+					be16_to_cpu(ip[i]));
+				if (!ce1)
+					break;
+				i++;
+				ce2 = ce1;
+			}
+			cc = ce2[0];
+			if (cc) {
+				ip += i;
+				ustrlen -= i;
+				goto done;
+			}
+		}
+same:
+		switch (c0) {
+		case 0:
+			cc = 0x2400;
+			break;
+		case '/':
+			cc = ':';
+			break;
+		default:
+			cc = c0;
+		}
+done:
+		res = nls->uni2char(cc, op, len);
+		if (res < 0) {
+			if (res == -ENAMETOOLONG)
+				goto out;
+			*op = '?';
+			res = 1;
+		}
+		op += res;
+		len -= res;
+	}
+	res = 0;
+out:
+	*len_p = (char *)op - astr;
+	return res;
+}
+
+/*
+ * Convert one or more ASCII characters into a single unicode character.
+ * Returns the number of ASCII characters corresponding to the unicode char.
+ */
+static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
+			      wchar_t *uc)
+{
+	int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
+	if (size <= 0) {
+		*uc = '?';
+		size = 1;
+	}
+	switch (*uc) {
+	case 0x2400:
+		*uc = 0;
+		break;
+	case ':':
+		*uc = '/';
+		break;
+	}
+	return size;
+}
+
+/* Decomposes a single unicode character. */
+static inline u16 *decompose_unichar(wchar_t uc, int *size)
+{
+	int off;
+
+	off = hfsplus_decompose_table[(uc >> 12) & 0xf];
+	if (off == 0 || off == 0xffff)
+		return NULL;
+
+	off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)];
+	if (!off)
+		return NULL;
+
+	off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)];
+	if (!off)
+		return NULL;
+
+	off = hfsplus_decompose_table[off + (uc & 0xf)];
+	*size = off & 3;
+	if (*size == 0)
+		return NULL;
+	return hfsplus_decompose_table + (off / 4);
+}
+
+int hfsplus_asc2uni(struct super_block *sb,
+		    struct hfsplus_unistr *ustr, int max_unistr_len,
+		    const char *astr, int len)
+{
+	int size, dsize, decompose;
+	u16 *dstr, outlen = 0;
+	wchar_t c;
+
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
+	while (outlen < max_unistr_len && len > 0) {
+		size = asc2unichar(sb, astr, len, &c);
+
+		if (decompose)
+			dstr = decompose_unichar(c, &dsize);
+		else
+			dstr = NULL;
+		if (dstr) {
+			if (outlen + dsize > max_unistr_len)
+				break;
+			do {
+				ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
+			} while (--dsize > 0);
+		} else
+			ustr->unicode[outlen++] = cpu_to_be16(c);
+
+		astr += size;
+		len -= size;
+	}
+	ustr->length = cpu_to_be16(outlen);
+	if (len > 0)
+		return -ENAMETOOLONG;
+	return 0;
+}
+
+/*
+ * Hash a string to an integer as appropriate for the HFS+ filesystem.
+ * Composed unicode characters are decomposed and case-folding is performed
+ * if the appropriate bits are (un)set on the superblock.
+ */
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
+{
+	struct super_block *sb = dentry->d_sb;
+	const char *astr;
+	const u16 *dstr;
+	int casefold, decompose, size, len;
+	unsigned long hash;
+	wchar_t c;
+	u16 c2;
+
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
+	hash = init_name_hash();
+	astr = str->name;
+	len = str->len;
+	while (len > 0) {
+		int uninitialized_var(dsize);
+		size = asc2unichar(sb, astr, len, &c);
+		astr += size;
+		len -= size;
+
+		if (decompose)
+			dstr = decompose_unichar(c, &dsize);
+		else
+			dstr = NULL;
+		if (dstr) {
+			do {
+				c2 = *dstr++;
+				if (casefold)
+					c2 = case_fold(c2);
+				if (!casefold || c2)
+					hash = partial_name_hash(c2, hash);
+			} while (--dsize > 0);
+		} else {
+			c2 = c;
+			if (casefold)
+				c2 = case_fold(c2);
+			if (!casefold || c2)
+				hash = partial_name_hash(c2, hash);
+		}
+	}
+	str->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+/*
+ * Compare strings with HFS+ filename ordering.
+ * Composed unicode characters are decomposed and case-folding is performed
+ * if the appropriate bits are (un)set on the superblock.
+ */
+int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	struct super_block *sb = parent->d_sb;
+	int casefold, decompose, size;
+	int dsize1, dsize2, len1, len2;
+	const u16 *dstr1, *dstr2;
+	const char *astr1, *astr2;
+	u16 c1, c2;
+	wchar_t c;
+
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
+	astr1 = str;
+	len1 = len;
+	astr2 = name->name;
+	len2 = name->len;
+	dsize1 = dsize2 = 0;
+	dstr1 = dstr2 = NULL;
+
+	while (len1 > 0 && len2 > 0) {
+		if (!dsize1) {
+			size = asc2unichar(sb, astr1, len1, &c);
+			astr1 += size;
+			len1 -= size;
+
+			if (decompose)
+				dstr1 = decompose_unichar(c, &dsize1);
+			if (!decompose || !dstr1) {
+				c1 = c;
+				dstr1 = &c1;
+				dsize1 = 1;
+			}
+		}
+
+		if (!dsize2) {
+			size = asc2unichar(sb, astr2, len2, &c);
+			astr2 += size;
+			len2 -= size;
+
+			if (decompose)
+				dstr2 = decompose_unichar(c, &dsize2);
+			if (!decompose || !dstr2) {
+				c2 = c;
+				dstr2 = &c2;
+				dsize2 = 1;
+			}
+		}
+
+		c1 = *dstr1;
+		c2 = *dstr2;
+		if (casefold) {
+			c1 = case_fold(c1);
+			if (!c1) {
+				dstr1++;
+				dsize1--;
+				continue;
+			}
+			c2 = case_fold(c2);
+			if (!c2) {
+				dstr2++;
+				dsize2--;
+				continue;
+			}
+		}
+		if (c1 < c2)
+			return -1;
+		else if (c1 > c2)
+			return 1;
+
+		dstr1++;
+		dsize1--;
+		dstr2++;
+		dsize2--;
+	}
+
+	if (len1 < len2)
+		return -1;
+	if (len1 > len2)
+		return 1;
+	return 0;
+}
diff --git a/kernel/fs/hfsplus/wrapper.c b/kernel/fs/hfsplus/wrapper.c
new file mode 100644
index 000000000..cc6235671
--- /dev/null
+++ b/kernel/fs/hfsplus/wrapper.c
@@ -0,0 +1,261 @@
+/*
+ *  linux/fs/hfsplus/wrapper.c
+ *
+ * Copyright (C) 2001
+ * Brad Boyer (flar@allandria.com)
+ * (C) 2003 Ardis Technologies <roman@ardistech.com>
+ *
+ * Handling of HFS wrappers around HFS+ volumes
+ */
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/genhd.h>
+#include <asm/unaligned.h>
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+struct hfsplus_wd {
+	u32 ablk_size;
+	u16 ablk_start;
+	u16 embed_start;
+	u16 embed_count;
+};
+
+/**
+ * hfsplus_submit_bio - Perform block I/O
+ * @sb: super block of volume for I/O
+ * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
+ * @buf: buffer for I/O
+ * @data: output pointer for location of requested data
+ * @rw: direction of I/O
+ *
+ * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
+ * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
+ * @data will return a pointer to the start of the requested sector,
+ * which may not be the same location as @buf.
+ *
+ * If @sector is not aligned to the bdev logical block size it will
+ * be rounded down. For writes this means that @buf should contain data
+ * that starts at the rounded-down address. As long as the data was
+ * read using hfsplus_submit_bio() and the same buffer is used things
+ * will work correctly.
+ */
+int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
+		void *buf, void **data, int rw)
+{
+	struct bio *bio;
+	int ret = 0;
+	u64 io_size;
+	loff_t start;
+	int offset;
+
+	/*
+	 * Align sector to hardware sector size and find offset. We
+	 * assume that io_size is a power of two, which _should_
+	 * be true.
+	 */
+	io_size = hfsplus_min_io_size(sb);
+	start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
+	offset = start & (io_size - 1);
+	sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_bdev = sb->s_bdev;
+
+	if (!(rw & WRITE) && data)
+		*data = (u8 *)buf + offset;
+
+	while (io_size > 0) {
+		unsigned int page_offset = offset_in_page(buf);
+		unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset,
+					 io_size);
+
+		ret = bio_add_page(bio, virt_to_page(buf), len, page_offset);
+		if (ret != len) {
+			ret = -EIO;
+			goto out;
+		}
+		io_size -= len;
+		buf = (u8 *)buf + len;
+	}
+
+	ret = submit_bio_wait(rw, bio);
+out:
+	bio_put(bio);
+	return ret < 0 ? ret : 0;
+}
+
+static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
+{
+	u32 extent;
+	u16 attrib;
+	__be16 sig;
+
+	sig = *(__be16 *)(bufptr + HFSP_WRAPOFF_EMBEDSIG);
+	if (sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIG) &&
+	    sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+		return 0;
+
+	attrib = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ATTRIB));
+	if (!(attrib & HFSP_WRAP_ATTRIB_SLOCK) ||
+	   !(attrib & HFSP_WRAP_ATTRIB_SPARED))
+		return 0;
+
+	wd->ablk_size =
+		be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
+	if (wd->ablk_size < HFSPLUS_SECTOR_SIZE)
+		return 0;
+	if (wd->ablk_size % HFSPLUS_SECTOR_SIZE)
+		return 0;
+	wd->ablk_start =
+		be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
+
+	extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT);
+	wd->embed_start = (extent >> 16) & 0xFFFF;
+	wd->embed_count = extent & 0xFFFF;
+
+	return 1;
+}
+
+static int hfsplus_get_last_session(struct super_block *sb,
+				    sector_t *start, sector_t *size)
+{
+	struct cdrom_multisession ms_info;
+	struct cdrom_tocentry te;
+	int res;
+
+	/* default values */
+	*start = 0;
+	*size = sb->s_bdev->bd_inode->i_size >> 9;
+
+	if (HFSPLUS_SB(sb)->session >= 0) {
+		te.cdte_track = HFSPLUS_SB(sb)->session;
+		te.cdte_format = CDROM_LBA;
+		res = ioctl_by_bdev(sb->s_bdev,
+			CDROMREADTOCENTRY, (unsigned long)&te);
+		if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
+			*start = (sector_t)te.cdte_addr.lba << 2;
+			return 0;
+		}
+		pr_err("invalid session number or type of track\n");
+		return -EINVAL;
+	}
+	ms_info.addr_format = CDROM_LBA;
+	res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION,
+		(unsigned long)&ms_info);
+	if (!res && ms_info.xa_flag)
+		*start = (sector_t)ms_info.addr.lba << 2;
+	return 0;
+}
+
+/* Find the volume header and fill in some minimum bits in superblock */
+/* Takes in super block, returns true if good data read */
+int hfsplus_read_wrapper(struct super_block *sb)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_wd wd;
+	sector_t part_start, part_size;
+	u32 blocksize;
+	int error = 0;
+
+	error = -EINVAL;
+	blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE);
+	if (!blocksize)
+		goto out;
+
+	if (hfsplus_get_last_session(sb, &part_start, &part_size))
+		goto out;
+
+	error = -ENOMEM;
+	sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL);
+	if (!sbi->s_vhdr_buf)
+		goto out;
+	sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL);
+	if (!sbi->s_backup_vhdr_buf)
+		goto out_free_vhdr;
+
+reread:
+	error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR,
+				   sbi->s_vhdr_buf, (void **)&sbi->s_vhdr,
+				   READ);
+	if (error)
+		goto out_free_backup_vhdr;
+
+	error = -EINVAL;
+	switch (sbi->s_vhdr->signature) {
+	case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
+		set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
+		/*FALLTHRU*/
+	case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
+		break;
+	case cpu_to_be16(HFSP_WRAP_MAGIC):
+		if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
+			goto out_free_backup_vhdr;
+		wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
+		part_start += (sector_t)wd.ablk_start +
+			       (sector_t)wd.embed_start * wd.ablk_size;
+		part_size = (sector_t)wd.embed_count * wd.ablk_size;
+		goto reread;
+	default:
+		/*
+		 * Check for a partition block.
+		 *
+		 * (should do this only for cdrom/loop though)
+		 */
+		if (hfs_part_find(sb, &part_start, &part_size))
+			goto out_free_backup_vhdr;
+		goto reread;
+	}
+
+	error = hfsplus_submit_bio(sb, part_start + part_size - 2,
+				   sbi->s_backup_vhdr_buf,
+				   (void **)&sbi->s_backup_vhdr, READ);
+	if (error)
+		goto out_free_backup_vhdr;
+
+	error = -EINVAL;
+	if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
+		pr_warn("invalid secondary volume header\n");
+		goto out_free_backup_vhdr;
+	}
+
+	blocksize = be32_to_cpu(sbi->s_vhdr->blocksize);
+
+	/*
+	 * Block size must be at least as large as a sector and a multiple of 2.
+	 */
+	if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
+		goto out_free_backup_vhdr;
+	sbi->alloc_blksz = blocksize;
+	sbi->alloc_blksz_shift = ilog2(blocksize);
+	blocksize = min_t(u32, sbi->alloc_blksz, PAGE_SIZE);
+
+	/*
+	 * Align block size to block offset.
+	 */
+	while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
+		blocksize >>= 1;
+
+	if (sb_set_blocksize(sb, blocksize) != blocksize) {
+		pr_err("unable to set blocksize to %u!\n", blocksize);
+		goto out_free_backup_vhdr;
+	}
+
+	sbi->blockoffset =
+		part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+	sbi->part_start = part_start;
+	sbi->sect_count = part_size;
+	sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
+	return 0;
+
+out_free_backup_vhdr:
+	kfree(sbi->s_backup_vhdr_buf);
+out_free_vhdr:
+	kfree(sbi->s_vhdr_buf);
+out:
+	return error;
+}
diff --git a/kernel/fs/hfsplus/xattr.c b/kernel/fs/hfsplus/xattr.c
new file mode 100644
index 000000000..416b1dbaf
--- /dev/null
+++ b/kernel/fs/hfsplus/xattr.c
@@ -0,0 +1,911 @@
+/*
+ * linux/fs/hfsplus/xattr.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Logic of processing extended attributes
+ */
+
+#include "hfsplus_fs.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/nls.h>
+#include "xattr.h"
+#include "acl.h"
+
+static int hfsplus_removexattr(struct inode *inode, const char *name);
+
+const struct xattr_handler *hfsplus_xattr_handlers[] = {
+	&hfsplus_xattr_osx_handler,
+	&hfsplus_xattr_user_handler,
+	&hfsplus_xattr_trusted_handler,
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	&hfsplus_xattr_security_handler,
+	NULL
+};
+
+static int strcmp_xattr_finder_info(const char *name)
+{
+	if (name) {
+		return strncmp(name, HFSPLUS_XATTR_FINDER_INFO_NAME,
+				sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME));
+	}
+	return -1;
+}
+
+static int strcmp_xattr_acl(const char *name)
+{
+	if (name) {
+		return strncmp(name, HFSPLUS_XATTR_ACL_NAME,
+				sizeof(HFSPLUS_XATTR_ACL_NAME));
+	}
+	return -1;
+}
+
+static bool is_known_namespace(const char *name)
+{
+	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		return false;
+
+	return true;
+}
+
+static void hfsplus_init_header_node(struct inode *attr_file,
+					u32 clump_size,
+					char *buf, u16 node_size)
+{
+	struct hfs_bnode_desc *desc;
+	struct hfs_btree_header_rec *head;
+	u16 offset;
+	__be16 *rec_offsets;
+	u32 hdr_node_map_rec_bits;
+	char *bmp;
+	u32 used_nodes;
+	u32 used_bmp_bytes;
+	u64 tmp;
+
+	hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n",
+		clump_size, node_size);
+
+	/* The end of the node contains list of record offsets */
+	rec_offsets = (__be16 *)(buf + node_size);
+
+	desc = (struct hfs_bnode_desc *)buf;
+	desc->type = HFS_NODE_HEADER;
+	desc->num_recs = cpu_to_be16(HFSPLUS_BTREE_HDR_NODE_RECS_COUNT);
+	offset = sizeof(struct hfs_bnode_desc);
+	*--rec_offsets = cpu_to_be16(offset);
+
+	head = (struct hfs_btree_header_rec *)(buf + offset);
+	head->node_size = cpu_to_be16(node_size);
+	tmp = i_size_read(attr_file);
+	do_div(tmp, node_size);
+	head->node_count = cpu_to_be32(tmp);
+	head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1);
+	head->clump_size = cpu_to_be32(clump_size);
+	head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS);
+	head->max_key_len = cpu_to_be16(HFSPLUS_ATTR_KEYLEN - sizeof(u16));
+	offset += sizeof(struct hfs_btree_header_rec);
+	*--rec_offsets = cpu_to_be16(offset);
+	offset += HFSPLUS_BTREE_HDR_USER_BYTES;
+	*--rec_offsets = cpu_to_be16(offset);
+
+	hdr_node_map_rec_bits = 8 * (node_size - offset - (4 * sizeof(u16)));
+	if (be32_to_cpu(head->node_count) > hdr_node_map_rec_bits) {
+		u32 map_node_bits;
+		u32 map_nodes;
+
+		desc->next = cpu_to_be32(be32_to_cpu(head->leaf_tail) + 1);
+		map_node_bits = 8 * (node_size - sizeof(struct hfs_bnode_desc) -
+					(2 * sizeof(u16)) - 2);
+		map_nodes = (be32_to_cpu(head->node_count) -
+				hdr_node_map_rec_bits +
+				(map_node_bits - 1)) / map_node_bits;
+		be32_add_cpu(&head->free_nodes, 0 - map_nodes);
+	}
+
+	bmp = buf + offset;
+	used_nodes =
+		be32_to_cpu(head->node_count) - be32_to_cpu(head->free_nodes);
+	used_bmp_bytes = used_nodes / 8;
+	if (used_bmp_bytes) {
+		memset(bmp, 0xFF, used_bmp_bytes);
+		bmp += used_bmp_bytes;
+		used_nodes %= 8;
+	}
+	*bmp = ~(0xFF >> used_nodes);
+	offset += hdr_node_map_rec_bits / 8;
+	*--rec_offsets = cpu_to_be16(offset);
+}
+
+static int hfsplus_create_attributes_file(struct super_block *sb)
+{
+	int err = 0;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct inode *attr_file;
+	struct hfsplus_inode_info *hip;
+	u32 clump_size;
+	u16 node_size = HFSPLUS_ATTR_TREE_NODE_SIZE;
+	char *buf;
+	int index, written;
+	struct address_space *mapping;
+	struct page *page;
+	int old_state = HFSPLUS_EMPTY_ATTR_TREE;
+
+	hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID);
+
+check_attr_tree_state_again:
+	switch (atomic_read(&sbi->attr_tree_state)) {
+	case HFSPLUS_EMPTY_ATTR_TREE:
+		if (old_state != atomic_cmpxchg(&sbi->attr_tree_state,
+						old_state,
+						HFSPLUS_CREATING_ATTR_TREE))
+			goto check_attr_tree_state_again;
+		break;
+	case HFSPLUS_CREATING_ATTR_TREE:
+		/*
+		 * This state means that another thread is in process
+		 * of AttributesFile creation. Theoretically, it is
+		 * possible to be here. But really __setxattr() method
+		 * first of all calls hfs_find_init() for lookup in
+		 * B-tree of CatalogFile. This method locks mutex of
+		 * CatalogFile's B-tree. As a result, if some thread
+		 * is inside AttributedFile creation operation then
+		 * another threads will be waiting unlocking of
+		 * CatalogFile's B-tree's mutex. However, if code will
+		 * change then we will return error code (-EAGAIN) from
+		 * here. Really, it means that first try to set of xattr
+		 * fails with error but second attempt will have success.
+		 */
+		return -EAGAIN;
+	case HFSPLUS_VALID_ATTR_TREE:
+		return 0;
+	case HFSPLUS_FAILED_ATTR_TREE:
+		return -EOPNOTSUPP;
+	default:
+		BUG();
+	}
+
+	attr_file = hfsplus_iget(sb, HFSPLUS_ATTR_CNID);
+	if (IS_ERR(attr_file)) {
+		pr_err("failed to load attributes file\n");
+		return PTR_ERR(attr_file);
+	}
+
+	BUG_ON(i_size_read(attr_file) != 0);
+
+	hip = HFSPLUS_I(attr_file);
+
+	clump_size = hfsplus_calc_btree_clump_size(sb->s_blocksize,
+						    node_size,
+						    sbi->sect_count,
+						    HFSPLUS_ATTR_CNID);
+
+	mutex_lock(&hip->extents_lock);
+	hip->clump_blocks = clump_size >> sbi->alloc_blksz_shift;
+	mutex_unlock(&hip->extents_lock);
+
+	if (sbi->free_blocks <= (hip->clump_blocks << 1)) {
+		err = -ENOSPC;
+		goto end_attr_file_creation;
+	}
+
+	while (hip->alloc_blocks < hip->clump_blocks) {
+		err = hfsplus_file_extend(attr_file, false);
+		if (unlikely(err)) {
+			pr_err("failed to extend attributes file\n");
+			goto end_attr_file_creation;
+		}
+		hip->phys_size = attr_file->i_size =
+			(loff_t)hip->alloc_blocks << sbi->alloc_blksz_shift;
+		hip->fs_blocks = hip->alloc_blocks << sbi->fs_shift;
+		inode_set_bytes(attr_file, attr_file->i_size);
+	}
+
+	buf = kzalloc(node_size, GFP_NOFS);
+	if (!buf) {
+		pr_err("failed to allocate memory for header node\n");
+		err = -ENOMEM;
+		goto end_attr_file_creation;
+	}
+
+	hfsplus_init_header_node(attr_file, clump_size, buf, node_size);
+
+	mapping = attr_file->i_mapping;
+
+	index = 0;
+	written = 0;
+	for (; written < node_size; index++, written += PAGE_CACHE_SIZE) {
+		void *kaddr;
+
+		page = read_mapping_page(mapping, index, NULL);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto failed_header_node_init;
+		}
+
+		kaddr = kmap_atomic(page);
+		memcpy(kaddr, buf + written,
+			min_t(size_t, PAGE_CACHE_SIZE, node_size - written));
+		kunmap_atomic(kaddr);
+
+		set_page_dirty(page);
+		page_cache_release(page);
+	}
+
+	hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY);
+
+	sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
+	if (!sbi->attr_tree)
+		pr_err("failed to load attributes file\n");
+
+failed_header_node_init:
+	kfree(buf);
+
+end_attr_file_creation:
+	iput(attr_file);
+
+	if (!err)
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE);
+	else if (err == -ENOSPC)
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE);
+	else
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_FAILED_ATTR_TREE);
+
+	return err;
+}
+
+int __hfsplus_setxattr(struct inode *inode, const char *name,
+			const void *value, size_t size, int flags)
+{
+	int err = 0;
+	struct hfs_find_data cat_fd;
+	hfsplus_cat_entry entry;
+	u16 cat_entry_flags, cat_entry_type;
+	u16 folder_finderinfo_len = sizeof(struct DInfo) +
+					sizeof(struct DXInfo);
+	u16 file_finderinfo_len = sizeof(struct FInfo) +
+					sizeof(struct FXInfo);
+
+	if ((!S_ISREG(inode->i_mode) &&
+			!S_ISDIR(inode->i_mode)) ||
+				HFSPLUS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	if (value == NULL)
+		return hfsplus_removexattr(inode, name);
+
+	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
+	if (err) {
+		pr_err("can't init xattr find struct\n");
+		return err;
+	}
+
+	err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd);
+	if (err) {
+		pr_err("catalog searching failed\n");
+		goto end_setxattr;
+	}
+
+	if (!strcmp_xattr_finder_info(name)) {
+		if (flags & XATTR_CREATE) {
+			pr_err("xattr exists yet\n");
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		hfs_bnode_read(cat_fd.bnode, &entry, cat_fd.entryoffset,
+					sizeof(hfsplus_cat_entry));
+		if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) {
+			if (size == folder_finderinfo_len) {
+				memcpy(&entry.folder.user_info, value,
+						folder_finderinfo_len);
+				hfs_bnode_write(cat_fd.bnode, &entry,
+					cat_fd.entryoffset,
+					sizeof(struct hfsplus_cat_folder));
+				hfsplus_mark_inode_dirty(inode,
+						HFSPLUS_I_CAT_DIRTY);
+			} else {
+				err = -ERANGE;
+				goto end_setxattr;
+			}
+		} else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) {
+			if (size == file_finderinfo_len) {
+				memcpy(&entry.file.user_info, value,
+						file_finderinfo_len);
+				hfs_bnode_write(cat_fd.bnode, &entry,
+					cat_fd.entryoffset,
+					sizeof(struct hfsplus_cat_file));
+				hfsplus_mark_inode_dirty(inode,
+						HFSPLUS_I_CAT_DIRTY);
+			} else {
+				err = -ERANGE;
+				goto end_setxattr;
+			}
+		} else {
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		goto end_setxattr;
+	}
+
+	if (!HFSPLUS_SB(inode->i_sb)->attr_tree) {
+		err = hfsplus_create_attributes_file(inode->i_sb);
+		if (unlikely(err))
+			goto end_setxattr;
+	}
+
+	if (hfsplus_attr_exists(inode, name)) {
+		if (flags & XATTR_CREATE) {
+			pr_err("xattr exists yet\n");
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		err = hfsplus_delete_attr(inode, name);
+		if (err)
+			goto end_setxattr;
+		err = hfsplus_create_attr(inode, name, value, size);
+		if (err)
+			goto end_setxattr;
+	} else {
+		if (flags & XATTR_REPLACE) {
+			pr_err("cannot replace xattr\n");
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		err = hfsplus_create_attr(inode, name, value, size);
+		if (err)
+			goto end_setxattr;
+	}
+
+	cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset);
+	if (cat_entry_type == HFSPLUS_FOLDER) {
+		cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode,
+				    cat_fd.entryoffset +
+				    offsetof(struct hfsplus_cat_folder, flags));
+		cat_entry_flags |= HFSPLUS_XATTR_EXISTS;
+		if (!strcmp_xattr_acl(name))
+			cat_entry_flags |= HFSPLUS_ACL_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, flags),
+				cat_entry_flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else if (cat_entry_type == HFSPLUS_FILE) {
+		cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode,
+				    cat_fd.entryoffset +
+				    offsetof(struct hfsplus_cat_file, flags));
+		cat_entry_flags |= HFSPLUS_XATTR_EXISTS;
+		if (!strcmp_xattr_acl(name))
+			cat_entry_flags |= HFSPLUS_ACL_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				    offsetof(struct hfsplus_cat_file, flags),
+				    cat_entry_flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else {
+		pr_err("invalid catalog entry type\n");
+		err = -EIO;
+		goto end_setxattr;
+	}
+
+end_setxattr:
+	hfs_find_exit(&cat_fd);
+	return err;
+}
+
+static int name_len(const char *xattr_name, int xattr_name_len)
+{
+	int len = xattr_name_len + 1;
+
+	if (!is_known_namespace(xattr_name))
+		len += XATTR_MAC_OSX_PREFIX_LEN;
+
+	return len;
+}
+
+static int copy_name(char *buffer, const char *xattr_name, int name_len)
+{
+	int len = name_len;
+	int offset = 0;
+
+	if (!is_known_namespace(xattr_name)) {
+		strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
+		offset += XATTR_MAC_OSX_PREFIX_LEN;
+		len += XATTR_MAC_OSX_PREFIX_LEN;
+	}
+
+	strncpy(buffer + offset, xattr_name, name_len);
+	memset(buffer + offset + name_len, 0, 1);
+	len += 1;
+
+	return len;
+}
+
+int hfsplus_setxattr(struct dentry *dentry, const char *name,
+		     const void *value, size_t size, int flags,
+		     const char *prefix, size_t prefixlen)
+{
+	char *xattr_name;
+	int res;
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+		GFP_KERNEL);
+	if (!xattr_name)
+		return -ENOMEM;
+	strcpy(xattr_name, prefix);
+	strcpy(xattr_name + prefixlen, name);
+	res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size,
+				 flags);
+	kfree(xattr_name);
+	return res;
+}
+
+static ssize_t hfsplus_getxattr_finder_info(struct inode *inode,
+						void *value, size_t size)
+{
+	ssize_t res = 0;
+	struct hfs_find_data fd;
+	u16 entry_type;
+	u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo);
+	u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo);
+	u16 record_len = max(folder_rec_len, file_rec_len);
+	u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
+	u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)];
+
+	if (size >= record_len) {
+		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+		if (res) {
+			pr_err("can't init xattr find struct\n");
+			return res;
+		}
+		res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+		if (res)
+			goto end_getxattr_finder_info;
+		entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset);
+
+		if (entry_type == HFSPLUS_FOLDER) {
+			hfs_bnode_read(fd.bnode, folder_finder_info,
+				fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, user_info),
+				folder_rec_len);
+			memcpy(value, folder_finder_info, folder_rec_len);
+			res = folder_rec_len;
+		} else if (entry_type == HFSPLUS_FILE) {
+			hfs_bnode_read(fd.bnode, file_finder_info,
+				fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, user_info),
+				file_rec_len);
+			memcpy(value, file_finder_info, file_rec_len);
+			res = file_rec_len;
+		} else {
+			res = -EOPNOTSUPP;
+			goto end_getxattr_finder_info;
+		}
+	} else
+		res = size ? -ERANGE : record_len;
+
+end_getxattr_finder_info:
+	if (size >= record_len)
+		hfs_find_exit(&fd);
+	return res;
+}
+
+ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
+			 void *value, size_t size)
+{
+	struct hfs_find_data fd;
+	hfsplus_attr_entry *entry;
+	__be32 xattr_record_type;
+	u32 record_type;
+	u16 record_length = 0;
+	ssize_t res = 0;
+
+	if ((!S_ISREG(inode->i_mode) &&
+			!S_ISDIR(inode->i_mode)) ||
+				HFSPLUS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	if (!strcmp_xattr_finder_info(name))
+		return hfsplus_getxattr_finder_info(inode, value, size);
+
+	if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+		return -EOPNOTSUPP;
+
+	entry = hfsplus_alloc_attr_entry();
+	if (!entry) {
+		pr_err("can't allocate xattr entry\n");
+		return -ENOMEM;
+	}
+
+	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd);
+	if (res) {
+		pr_err("can't init xattr find struct\n");
+		goto failed_getxattr_init;
+	}
+
+	res = hfsplus_find_attr(inode->i_sb, inode->i_ino, name, &fd);
+	if (res) {
+		if (res == -ENOENT)
+			res = -ENODATA;
+		else
+			pr_err("xattr searching failed\n");
+		goto out;
+	}
+
+	hfs_bnode_read(fd.bnode, &xattr_record_type,
+			fd.entryoffset, sizeof(xattr_record_type));
+	record_type = be32_to_cpu(xattr_record_type);
+	if (record_type == HFSPLUS_ATTR_INLINE_DATA) {
+		record_length = hfs_bnode_read_u16(fd.bnode,
+				fd.entryoffset +
+				offsetof(struct hfsplus_attr_inline_data,
+				length));
+		if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) {
+			pr_err("invalid xattr record size\n");
+			res = -EIO;
+			goto out;
+		}
+	} else if (record_type == HFSPLUS_ATTR_FORK_DATA ||
+			record_type == HFSPLUS_ATTR_EXTENTS) {
+		pr_err("only inline data xattr are supported\n");
+		res = -EOPNOTSUPP;
+		goto out;
+	} else {
+		pr_err("invalid xattr record\n");
+		res = -EIO;
+		goto out;
+	}
+
+	if (size) {
+		hfs_bnode_read(fd.bnode, entry, fd.entryoffset,
+				offsetof(struct hfsplus_attr_inline_data,
+					raw_bytes) + record_length);
+	}
+
+	if (size >= record_length) {
+		memcpy(value, entry->inline_data.raw_bytes, record_length);
+		res = record_length;
+	} else
+		res = size ? -ERANGE : record_length;
+
+out:
+	hfs_find_exit(&fd);
+
+failed_getxattr_init:
+	hfsplus_destroy_attr_entry(entry);
+	return res;
+}
+
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+			 void *value, size_t size,
+			 const char *prefix, size_t prefixlen)
+{
+	int res;
+	char *xattr_name;
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+			     GFP_KERNEL);
+	if (!xattr_name)
+		return -ENOMEM;
+
+	strcpy(xattr_name, prefix);
+	strcpy(xattr_name + prefixlen, name);
+
+	res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size);
+	kfree(xattr_name);
+	return res;
+
+}
+
+static inline int can_list(const char *xattr_name)
+{
+	if (!xattr_name)
+		return 0;
+
+	return strncmp(xattr_name, XATTR_TRUSTED_PREFIX,
+			XATTR_TRUSTED_PREFIX_LEN) ||
+				capable(CAP_SYS_ADMIN);
+}
+
+static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
+						char *buffer, size_t size)
+{
+	ssize_t res = 0;
+	struct inode *inode = d_inode(dentry);
+	struct hfs_find_data fd;
+	u16 entry_type;
+	u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
+	u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)];
+	unsigned long len, found_bit;
+	int xattr_name_len, symbols_count;
+
+	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+	if (res) {
+		pr_err("can't init xattr find struct\n");
+		return res;
+	}
+
+	res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+	if (res)
+		goto end_listxattr_finder_info;
+
+	entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset);
+	if (entry_type == HFSPLUS_FOLDER) {
+		len = sizeof(struct DInfo) + sizeof(struct DXInfo);
+		hfs_bnode_read(fd.bnode, folder_finder_info,
+				fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, user_info),
+				len);
+		found_bit = find_first_bit((void *)folder_finder_info, len*8);
+	} else if (entry_type == HFSPLUS_FILE) {
+		len = sizeof(struct FInfo) + sizeof(struct FXInfo);
+		hfs_bnode_read(fd.bnode, file_finder_info,
+				fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, user_info),
+				len);
+		found_bit = find_first_bit((void *)file_finder_info, len*8);
+	} else {
+		res = -EOPNOTSUPP;
+		goto end_listxattr_finder_info;
+	}
+
+	if (found_bit >= (len*8))
+		res = 0;
+	else {
+		symbols_count = sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME) - 1;
+		xattr_name_len =
+			name_len(HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count);
+		if (!buffer || !size) {
+			if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME))
+				res = xattr_name_len;
+		} else if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) {
+			if (size < xattr_name_len)
+				res = -ERANGE;
+			else {
+				res = copy_name(buffer,
+						HFSPLUS_XATTR_FINDER_INFO_NAME,
+						symbols_count);
+			}
+		}
+	}
+
+end_listxattr_finder_info:
+	hfs_find_exit(&fd);
+
+	return res;
+}
+
+ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	ssize_t err;
+	ssize_t res = 0;
+	struct inode *inode = d_inode(dentry);
+	struct hfs_find_data fd;
+	u16 key_len = 0;
+	struct hfsplus_attr_key attr_key;
+	char *strbuf;
+	int xattr_name_len;
+
+	if ((!S_ISREG(inode->i_mode) &&
+			!S_ISDIR(inode->i_mode)) ||
+				HFSPLUS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	res = hfsplus_listxattr_finder_info(dentry, buffer, size);
+	if (res < 0)
+		return res;
+	else if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+		return (res == 0) ? -EOPNOTSUPP : res;
+
+	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd);
+	if (err) {
+		pr_err("can't init xattr find struct\n");
+		return err;
+	}
+
+	strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN +
+			XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
+	if (!strbuf) {
+		res = -ENOMEM;
+		goto out;
+	}
+
+	err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd);
+	if (err) {
+		if (err == -ENOENT) {
+			if (res == 0)
+				res = -ENODATA;
+			goto end_listxattr;
+		} else {
+			res = err;
+			goto end_listxattr;
+		}
+	}
+
+	for (;;) {
+		key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset);
+		if (key_len == 0 || key_len > fd.tree->max_key_len) {
+			pr_err("invalid xattr key length: %d\n", key_len);
+			res = -EIO;
+			goto end_listxattr;
+		}
+
+		hfs_bnode_read(fd.bnode, &attr_key,
+				fd.keyoffset, key_len + sizeof(key_len));
+
+		if (be32_to_cpu(attr_key.cnid) != inode->i_ino)
+			goto end_listxattr;
+
+		xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN;
+		if (hfsplus_uni2asc(inode->i_sb,
+			(const struct hfsplus_unistr *)&fd.key->attr.key_name,
+					strbuf, &xattr_name_len)) {
+			pr_err("unicode conversion failed\n");
+			res = -EIO;
+			goto end_listxattr;
+		}
+
+		if (!buffer || !size) {
+			if (can_list(strbuf))
+				res += name_len(strbuf, xattr_name_len);
+		} else if (can_list(strbuf)) {
+			if (size < (res + name_len(strbuf, xattr_name_len))) {
+				res = -ERANGE;
+				goto end_listxattr;
+			} else
+				res += copy_name(buffer + res,
+						strbuf, xattr_name_len);
+		}
+
+		if (hfs_brec_goto(&fd, 1))
+			goto end_listxattr;
+	}
+
+end_listxattr:
+	kfree(strbuf);
+out:
+	hfs_find_exit(&fd);
+	return res;
+}
+
+static int hfsplus_removexattr(struct inode *inode, const char *name)
+{
+	int err = 0;
+	struct hfs_find_data cat_fd;
+	u16 flags;
+	u16 cat_entry_type;
+	int is_xattr_acl_deleted = 0;
+	int is_all_xattrs_deleted = 0;
+
+	if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+		return -EOPNOTSUPP;
+
+	if (!strcmp_xattr_finder_info(name))
+		return -EOPNOTSUPP;
+
+	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
+	if (err) {
+		pr_err("can't init xattr find struct\n");
+		return err;
+	}
+
+	err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd);
+	if (err) {
+		pr_err("catalog searching failed\n");
+		goto end_removexattr;
+	}
+
+	err = hfsplus_delete_attr(inode, name);
+	if (err)
+		goto end_removexattr;
+
+	is_xattr_acl_deleted = !strcmp_xattr_acl(name);
+	is_all_xattrs_deleted = !hfsplus_attr_exists(inode, NULL);
+
+	if (!is_xattr_acl_deleted && !is_all_xattrs_deleted)
+		goto end_removexattr;
+
+	cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset);
+
+	if (cat_entry_type == HFSPLUS_FOLDER) {
+		flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, flags));
+		if (is_xattr_acl_deleted)
+			flags &= ~HFSPLUS_ACL_EXISTS;
+		if (is_all_xattrs_deleted)
+			flags &= ~HFSPLUS_XATTR_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, flags),
+				flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else if (cat_entry_type == HFSPLUS_FILE) {
+		flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, flags));
+		if (is_xattr_acl_deleted)
+			flags &= ~HFSPLUS_ACL_EXISTS;
+		if (is_all_xattrs_deleted)
+			flags &= ~HFSPLUS_XATTR_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, flags),
+				flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else {
+		pr_err("invalid catalog entry type\n");
+		err = -EIO;
+		goto end_removexattr;
+	}
+
+end_removexattr:
+	hfs_find_exit(&cat_fd);
+	return err;
+}
+
+static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	/*
+	 * Don't allow retrieving properly prefixed attributes
+	 * by prepending them with "osx."
+	 */
+	if (is_known_namespace(name))
+		return -EOPNOTSUPP;
+
+	/*
+	 * osx is the namespace we use to indicate an unprefixed
+	 * attribute on the filesystem (like the ones that OS X
+	 * creates), so we pass the name through unmodified (after
+	 * ensuring it doesn't conflict with another namespace).
+	 */
+	return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
+}
+
+static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	/*
+	 * Don't allow setting properly prefixed attributes
+	 * by prepending them with "osx."
+	 */
+	if (is_known_namespace(name))
+		return -EOPNOTSUPP;
+
+	/*
+	 * osx is the namespace we use to indicate an unprefixed
+	 * attribute on the filesystem (like the ones that OS X
+	 * creates), so we pass the name through unmodified (after
+	 * ensuring it doesn't conflict with another namespace).
+	 */
+	return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
+}
+
+static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	/*
+	 * This method is not used.
+	 * It is used hfsplus_listxattr() instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_osx_handler = {
+	.prefix	= XATTR_MAC_OSX_PREFIX,
+	.list	= hfsplus_osx_listxattr,
+	.get	= hfsplus_osx_getxattr,
+	.set	= hfsplus_osx_setxattr,
+};
diff --git a/kernel/fs/hfsplus/xattr.h b/kernel/fs/hfsplus/xattr.h
new file mode 100644
index 000000000..f9b0955b3
--- /dev/null
+++ b/kernel/fs/hfsplus/xattr.h
@@ -0,0 +1,43 @@
+/*
+ * linux/fs/hfsplus/xattr.h
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Logic of processing extended attributes
+ */
+
+#ifndef _LINUX_HFSPLUS_XATTR_H
+#define _LINUX_HFSPLUS_XATTR_H
+
+#include <linux/xattr.h>
+
+extern const struct xattr_handler hfsplus_xattr_osx_handler;
+extern const struct xattr_handler hfsplus_xattr_user_handler;
+extern const struct xattr_handler hfsplus_xattr_trusted_handler;
+extern const struct xattr_handler hfsplus_xattr_security_handler;
+
+extern const struct xattr_handler *hfsplus_xattr_handlers[];
+
+int __hfsplus_setxattr(struct inode *inode, const char *name,
+			const void *value, size_t size, int flags);
+
+int hfsplus_setxattr(struct dentry *dentry, const char *name,
+				   const void *value, size_t size, int flags,
+				   const char *prefix, size_t prefixlen);
+
+ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
+			   void *value, size_t size);
+
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+			 void *value, size_t size,
+			 const char *prefix, size_t prefixlen);
+
+ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+int hfsplus_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr);
+
+int hfsplus_init_inode_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr);
+
+#endif
diff --git a/kernel/fs/hfsplus/xattr_security.c b/kernel/fs/hfsplus/xattr_security.c
new file mode 100644
index 000000000..aacff00a9
--- /dev/null
+++ b/kernel/fs/hfsplus/xattr_security.c
@@ -0,0 +1,98 @@
+/*
+ * linux/fs/hfsplus/xattr_trusted.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include <linux/security.h>
+#include <linux/nls.h>
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	return hfsplus_getxattr(dentry, name, buffer, size,
+				XATTR_SECURITY_PREFIX,
+				XATTR_SECURITY_PREFIX_LEN);
+}
+
+static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+				XATTR_SECURITY_PREFIX,
+				XATTR_SECURITY_PREFIX_LEN);
+}
+
+static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	/*
+	 * This method is not used.
+	 * It is used hfsplus_listxattr() instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+static int hfsplus_initxattrs(struct inode *inode,
+				const struct xattr *xattr_array,
+				void *fs_info)
+{
+	const struct xattr *xattr;
+	char *xattr_name;
+	int err = 0;
+
+	xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+		GFP_KERNEL);
+	if (!xattr_name)
+		return -ENOMEM;
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+
+		if (!strcmp(xattr->name, ""))
+			continue;
+
+		strcpy(xattr_name, XATTR_SECURITY_PREFIX);
+		strcpy(xattr_name +
+			XATTR_SECURITY_PREFIX_LEN, xattr->name);
+		memset(xattr_name +
+			XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name), 0, 1);
+
+		err = __hfsplus_setxattr(inode, xattr_name,
+					xattr->value, xattr->value_len, 0);
+		if (err)
+			break;
+	}
+	kfree(xattr_name);
+	return err;
+}
+
+int hfsplus_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					&hfsplus_initxattrs, NULL);
+}
+
+int hfsplus_init_inode_security(struct inode *inode,
+						struct inode *dir,
+						const struct qstr *qstr)
+{
+	int err;
+
+	err = hfsplus_init_posix_acl(inode, dir);
+	if (!err)
+		err = hfsplus_init_security(inode, dir, qstr);
+	return err;
+}
+
+const struct xattr_handler hfsplus_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= hfsplus_security_listxattr,
+	.get	= hfsplus_security_getxattr,
+	.set	= hfsplus_security_setxattr,
+};
diff --git a/kernel/fs/hfsplus/xattr_trusted.c b/kernel/fs/hfsplus/xattr_trusted.c
new file mode 100644
index 000000000..bcf65089b
--- /dev/null
+++ b/kernel/fs/hfsplus/xattr_trusted.c
@@ -0,0 +1,44 @@
+/*
+ * linux/fs/hfsplus/xattr_trusted.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for trusted extended attributes.
+ */
+
+#include <linux/nls.h>
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	return hfsplus_getxattr(dentry, name, buffer, size,
+				XATTR_TRUSTED_PREFIX,
+				XATTR_TRUSTED_PREFIX_LEN);
+}
+
+static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+				XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+}
+
+static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	/*
+	 * This method is not used.
+	 * It is used hfsplus_listxattr() instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= hfsplus_trusted_listxattr,
+	.get	= hfsplus_trusted_getxattr,
+	.set	= hfsplus_trusted_setxattr,
+};
diff --git a/kernel/fs/hfsplus/xattr_user.c b/kernel/fs/hfsplus/xattr_user.c
new file mode 100644
index 000000000..5aa0e6dc4
--- /dev/null
+++ b/kernel/fs/hfsplus/xattr_user.c
@@ -0,0 +1,44 @@
+/*
+ * linux/fs/hfsplus/xattr_user.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for user extended attributes.
+ */
+
+#include <linux/nls.h>
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+
+	return hfsplus_getxattr(dentry, name, buffer, size,
+				XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+				XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	/*
+	 * This method is not used.
+	 * It is used hfsplus_listxattr() instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= hfsplus_user_listxattr,
+	.get	= hfsplus_user_getxattr,
+	.set	= hfsplus_user_setxattr,
+};
diff --git a/kernel/fs/hostfs/Makefile b/kernel/fs/hostfs/Makefile
new file mode 100644
index 000000000..d5beaffad
--- /dev/null
+++ b/kernel/fs/hostfs/Makefile
@@ -0,0 +1,11 @@
+#
+# Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
+# Licensed under the GPL
+#
+
+hostfs-objs := hostfs_kern.o hostfs_user.o
+
+obj-y :=
+obj-$(CONFIG_HOSTFS) += hostfs.o
+
+include arch/um/scripts/Makefile.rules
diff --git a/kernel/fs/hostfs/hostfs.h b/kernel/fs/hostfs/hostfs.h
new file mode 100644
index 000000000..91e19f9df
--- /dev/null
+++ b/kernel/fs/hostfs/hostfs.h
@@ -0,0 +1,98 @@
+#ifndef __UM_FS_HOSTFS
+#define __UM_FS_HOSTFS
+
+#include <os.h>
+
+/*
+ * These are exactly the same definitions as in fs.h, but the names are
+ * changed so that this file can be included in both kernel and user files.
+ */
+
+#define HOSTFS_ATTR_MODE	1
+#define HOSTFS_ATTR_UID 	2
+#define HOSTFS_ATTR_GID 	4
+#define HOSTFS_ATTR_SIZE	8
+#define HOSTFS_ATTR_ATIME	16
+#define HOSTFS_ATTR_MTIME	32
+#define HOSTFS_ATTR_CTIME	64
+#define HOSTFS_ATTR_ATIME_SET	128
+#define HOSTFS_ATTR_MTIME_SET	256
+
+/* These two are unused by hostfs. */
+#define HOSTFS_ATTR_FORCE	512	/* Not a change, but a change it */
+#define HOSTFS_ATTR_ATTR_FLAG	1024
+
+/*
+ * If you are very careful, you'll notice that these two are missing:
+ *
+ * #define ATTR_KILL_SUID	2048
+ * #define ATTR_KILL_SGID	4096
+ *
+ * and this is because they were added in 2.5 development.
+ * Actually, they are not needed by most ->setattr() methods - they are set by
+ * callers of notify_change() to notify that the setuid/setgid bits must be
+ * dropped.
+ * notify_change() will delete those flags, make sure attr->ia_valid & ATTR_MODE
+ * is on, and remove the appropriate bits from attr->ia_mode (attr is a
+ * "struct iattr *"). -BlaisorBlade
+ */
+
+struct hostfs_iattr {
+	unsigned int	ia_valid;
+	unsigned short	ia_mode;
+	uid_t		ia_uid;
+	gid_t		ia_gid;
+	loff_t		ia_size;
+	struct timespec	ia_atime;
+	struct timespec	ia_mtime;
+	struct timespec	ia_ctime;
+};
+
+struct hostfs_stat {
+	unsigned long long ino;
+	unsigned int mode;
+	unsigned int nlink;
+	unsigned int uid;
+	unsigned int gid;
+	unsigned long long size;
+	struct timespec atime, mtime, ctime;
+	unsigned int blksize;
+	unsigned long long blocks;
+	unsigned int maj;
+	unsigned int min;
+};
+
+extern int stat_file(const char *path, struct hostfs_stat *p, int fd);
+extern int access_file(char *path, int r, int w, int x);
+extern int open_file(char *path, int r, int w, int append);
+extern void *open_dir(char *path, int *err_out);
+extern void seek_dir(void *stream, unsigned long long pos);
+extern char *read_dir(void *stream, unsigned long long *pos_out,
+		      unsigned long long *ino_out, int *len_out,
+		      unsigned int *type_out);
+extern void close_file(void *stream);
+extern int replace_file(int oldfd, int fd);
+extern void close_dir(void *stream);
+extern int read_file(int fd, unsigned long long *offset, char *buf, int len);
+extern int write_file(int fd, unsigned long long *offset, const char *buf,
+		      int len);
+extern int lseek_file(int fd, long long offset, int whence);
+extern int fsync_file(int fd, int datasync);
+extern int file_create(char *name, int mode);
+extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd);
+extern int make_symlink(const char *from, const char *to);
+extern int unlink_file(const char *file);
+extern int do_mkdir(const char *file, int mode);
+extern int do_rmdir(const char *file);
+extern int do_mknod(const char *file, int mode, unsigned int major,
+		    unsigned int minor);
+extern int link_file(const char *from, const char *to);
+extern int hostfs_do_readlink(char *file, char *buf, int size);
+extern int rename_file(char *from, char *to);
+extern int rename2_file(char *from, char *to, unsigned int flags);
+extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
+		     long long *bfree_out, long long *bavail_out,
+		     long long *files_out, long long *ffree_out,
+		     void *fsid_out, int fsid_size, long *namelen_out);
+
+#endif
diff --git a/kernel/fs/hostfs/hostfs_kern.c b/kernel/fs/hostfs/hostfs_kern.c
new file mode 100644
index 000000000..07d8d8f52
--- /dev/null
+++ b/kernel/fs/hostfs/hostfs_kern.c
@@ -0,0 +1,1023 @@
+/*
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ *
+ * Ported the filesystem routines to 2.5.
+ * 2003-02-10 Petr Baudis <pasky@ucw.cz>
+ */
+
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/statfs.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include "hostfs.h"
+#include <init.h>
+#include <kern.h>
+
+struct hostfs_inode_info {
+	int fd;
+	fmode_t mode;
+	struct inode vfs_inode;
+	struct mutex open_mutex;
+};
+
+static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
+{
+	return list_entry(inode, struct hostfs_inode_info, vfs_inode);
+}
+
+#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
+
+/* Changed in hostfs_args before the kernel starts running */
+static char *root_ino = "";
+static int append = 0;
+
+static const struct inode_operations hostfs_iops;
+static const struct inode_operations hostfs_dir_iops;
+static const struct inode_operations hostfs_link_iops;
+
+#ifndef MODULE
+static int __init hostfs_args(char *options, int *add)
+{
+	char *ptr;
+
+	ptr = strchr(options, ',');
+	if (ptr != NULL)
+		*ptr++ = '\0';
+	if (*options != '\0')
+		root_ino = options;
+
+	options = ptr;
+	while (options) {
+		ptr = strchr(options, ',');
+		if (ptr != NULL)
+			*ptr++ = '\0';
+		if (*options != '\0') {
+			if (!strcmp(options, "append"))
+				append = 1;
+			else printf("hostfs_args - unsupported option - %s\n",
+				    options);
+		}
+		options = ptr;
+	}
+	return 0;
+}
+
+__uml_setup("hostfs=", hostfs_args,
+"hostfs=<root dir>,<flags>,...\n"
+"    This is used to set hostfs parameters.  The root directory argument\n"
+"    is used to confine all hostfs mounts to within the specified directory\n"
+"    tree on the host.  If this isn't specified, then a user inside UML can\n"
+"    mount anything on the host that's accessible to the user that's running\n"
+"    it.\n"
+"    The only flag currently supported is 'append', which specifies that all\n"
+"    files opened by hostfs will be opened in append mode.\n\n"
+);
+#endif
+
+static char *__dentry_name(struct dentry *dentry, char *name)
+{
+	char *p = dentry_path_raw(dentry, name, PATH_MAX);
+	char *root;
+	size_t len;
+
+	root = dentry->d_sb->s_fs_info;
+	len = strlen(root);
+	if (IS_ERR(p)) {
+		__putname(name);
+		return NULL;
+	}
+
+	/*
+	 * This function relies on the fact that dentry_path_raw() will place
+	 * the path name at the end of the provided buffer.
+	 */
+	BUG_ON(p + strlen(p) + 1 != name + PATH_MAX);
+
+	strlcpy(name, root, PATH_MAX);
+	if (len > p - name) {
+		__putname(name);
+		return NULL;
+	}
+
+	if (p > name + len)
+		strcpy(name + len, p);
+
+	return name;
+}
+
+static char *dentry_name(struct dentry *dentry)
+{
+	char *name = __getname();
+	if (!name)
+		return NULL;
+
+	return __dentry_name(dentry, name);
+}
+
+static char *inode_name(struct inode *ino)
+{
+	struct dentry *dentry;
+	char *name;
+
+	dentry = d_find_alias(ino);
+	if (!dentry)
+		return NULL;
+
+	name = dentry_name(dentry);
+
+	dput(dentry);
+
+	return name;
+}
+
+static char *follow_link(char *link)
+{
+	int len, n;
+	char *name, *resolved, *end;
+
+	name = __getname();
+	if (!name) {
+		n = -ENOMEM;
+		goto out_free;
+	}
+
+	n = hostfs_do_readlink(link, name, PATH_MAX);
+	if (n < 0)
+		goto out_free;
+	else if (n == PATH_MAX) {
+		n = -E2BIG;
+		goto out_free;
+	}
+
+	if (*name == '/')
+		return name;
+
+	end = strrchr(link, '/');
+	if (end == NULL)
+		return name;
+
+	*(end + 1) = '\0';
+	len = strlen(link) + strlen(name) + 1;
+
+	resolved = kmalloc(len, GFP_KERNEL);
+	if (resolved == NULL) {
+		n = -ENOMEM;
+		goto out_free;
+	}
+
+	sprintf(resolved, "%s%s", link, name);
+	__putname(name);
+	kfree(link);
+	return resolved;
+
+ out_free:
+	__putname(name);
+	return ERR_PTR(n);
+}
+
+static struct inode *hostfs_iget(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	return inode;
+}
+
+static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
+{
+	/*
+	 * do_statfs uses struct statfs64 internally, but the linux kernel
+	 * struct statfs still has 32-bit versions for most of these fields,
+	 * so we convert them here
+	 */
+	int err;
+	long long f_blocks;
+	long long f_bfree;
+	long long f_bavail;
+	long long f_files;
+	long long f_ffree;
+
+	err = do_statfs(dentry->d_sb->s_fs_info,
+			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
+			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
+			&sf->f_namelen);
+	if (err)
+		return err;
+	sf->f_blocks = f_blocks;
+	sf->f_bfree = f_bfree;
+	sf->f_bavail = f_bavail;
+	sf->f_files = f_files;
+	sf->f_ffree = f_ffree;
+	sf->f_type = HOSTFS_SUPER_MAGIC;
+	return 0;
+}
+
+static struct inode *hostfs_alloc_inode(struct super_block *sb)
+{
+	struct hostfs_inode_info *hi;
+
+	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+	if (hi == NULL)
+		return NULL;
+	hi->fd = -1;
+	hi->mode = 0;
+	inode_init_once(&hi->vfs_inode);
+	mutex_init(&hi->open_mutex);
+	return &hi->vfs_inode;
+}
+
+static void hostfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	if (HOSTFS_I(inode)->fd != -1) {
+		close_file(&HOSTFS_I(inode)->fd);
+		HOSTFS_I(inode)->fd = -1;
+	}
+}
+
+static void hostfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kfree(HOSTFS_I(inode));
+}
+
+static void hostfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hostfs_i_callback);
+}
+
+static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	const char *root_path = root->d_sb->s_fs_info;
+	size_t offset = strlen(root_ino) + 1;
+
+	if (strlen(root_path) > offset)
+		seq_printf(seq, ",%s", root_path + offset);
+
+	if (append)
+		seq_puts(seq, ",append");
+
+	return 0;
+}
+
+static const struct super_operations hostfs_sbops = {
+	.alloc_inode	= hostfs_alloc_inode,
+	.destroy_inode	= hostfs_destroy_inode,
+	.evict_inode	= hostfs_evict_inode,
+	.statfs		= hostfs_statfs,
+	.show_options	= hostfs_show_options,
+};
+
+static int hostfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	void *dir;
+	char *name;
+	unsigned long long next, ino;
+	int error, len;
+	unsigned int type;
+
+	name = dentry_name(file->f_path.dentry);
+	if (name == NULL)
+		return -ENOMEM;
+	dir = open_dir(name, &error);
+	__putname(name);
+	if (dir == NULL)
+		return -error;
+	next = ctx->pos;
+	seek_dir(dir, next);
+	while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
+		if (!dir_emit(ctx, name, len, ino, type))
+			break;
+		ctx->pos = next;
+	}
+	close_dir(dir);
+	return 0;
+}
+
+static int hostfs_open(struct inode *ino, struct file *file)
+{
+	char *name;
+	fmode_t mode;
+	int err;
+	int r, w, fd;
+
+	mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
+	if ((mode & HOSTFS_I(ino)->mode) == mode)
+		return 0;
+
+	mode |= HOSTFS_I(ino)->mode;
+
+retry:
+	r = w = 0;
+
+	if (mode & FMODE_READ)
+		r = 1;
+	if (mode & FMODE_WRITE)
+		r = w = 1;
+
+	name = dentry_name(file->f_path.dentry);
+	if (name == NULL)
+		return -ENOMEM;
+
+	fd = open_file(name, r, w, append);
+	__putname(name);
+	if (fd < 0)
+		return fd;
+
+	mutex_lock(&HOSTFS_I(ino)->open_mutex);
+	/* somebody else had handled it first? */
+	if ((mode & HOSTFS_I(ino)->mode) == mode) {
+		mutex_unlock(&HOSTFS_I(ino)->open_mutex);
+		close_file(&fd);
+		return 0;
+	}
+	if ((mode | HOSTFS_I(ino)->mode) != mode) {
+		mode |= HOSTFS_I(ino)->mode;
+		mutex_unlock(&HOSTFS_I(ino)->open_mutex);
+		close_file(&fd);
+		goto retry;
+	}
+	if (HOSTFS_I(ino)->fd == -1) {
+		HOSTFS_I(ino)->fd = fd;
+	} else {
+		err = replace_file(fd, HOSTFS_I(ino)->fd);
+		close_file(&fd);
+		if (err < 0) {
+			mutex_unlock(&HOSTFS_I(ino)->open_mutex);
+			return err;
+		}
+	}
+	HOSTFS_I(ino)->mode = mode;
+	mutex_unlock(&HOSTFS_I(ino)->open_mutex);
+
+	return 0;
+}
+
+static int hostfs_file_release(struct inode *inode, struct file *file)
+{
+	filemap_write_and_wait(inode->i_mapping);
+
+	return 0;
+}
+
+static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
+			int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = fsync_file(HOSTFS_I(inode)->fd, datasync);
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
+
+static const struct file_operations hostfs_file_fops = {
+	.llseek		= generic_file_llseek,
+	.splice_read	= generic_file_splice_read,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.open		= hostfs_open,
+	.release	= hostfs_file_release,
+	.fsync		= hostfs_fsync,
+};
+
+static const struct file_operations hostfs_dir_fops = {
+	.llseek		= generic_file_llseek,
+	.iterate	= hostfs_readdir,
+	.read		= generic_read_dir,
+	.open		= hostfs_open,
+	.fsync		= hostfs_fsync,
+};
+
+static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	char *buffer;
+	loff_t base = page_offset(page);
+	int count = PAGE_CACHE_SIZE;
+	int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int err;
+
+	if (page->index >= end_index)
+		count = inode->i_size & (PAGE_CACHE_SIZE-1);
+
+	buffer = kmap(page);
+
+	err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
+	if (err != count) {
+		ClearPageUptodate(page);
+		goto out;
+	}
+
+	if (base > inode->i_size)
+		inode->i_size = base;
+
+	if (PageError(page))
+		ClearPageError(page);
+	err = 0;
+
+ out:
+	kunmap(page);
+
+	unlock_page(page);
+	return err;
+}
+
+static int hostfs_readpage(struct file *file, struct page *page)
+{
+	char *buffer;
+	loff_t start = page_offset(page);
+	int bytes_read, ret = 0;
+
+	buffer = kmap(page);
+	bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
+			PAGE_CACHE_SIZE);
+	if (bytes_read < 0) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+		ret = bytes_read;
+		goto out;
+	}
+
+	memset(buffer + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read);
+
+	ClearPageError(page);
+	SetPageUptodate(page);
+
+ out:
+	flush_dcache_page(page);
+	kunmap(page);
+	unlock_page(page);
+	return ret;
+}
+
+static int hostfs_write_begin(struct file *file, struct address_space *mapping,
+			      loff_t pos, unsigned len, unsigned flags,
+			      struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
+	if (!*pagep)
+		return -ENOMEM;
+	return 0;
+}
+
+static int hostfs_write_end(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned copied,
+			    struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	void *buffer;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	int err;
+
+	buffer = kmap(page);
+	err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied);
+	kunmap(page);
+
+	if (!PageUptodate(page) && err == PAGE_CACHE_SIZE)
+		SetPageUptodate(page);
+
+	/*
+	 * If err > 0, write_file has added err to pos, so we are comparing
+	 * i_size against the last byte written.
+	 */
+	if (err > 0 && (pos > inode->i_size))
+		inode->i_size = pos;
+	unlock_page(page);
+	page_cache_release(page);
+
+	return err;
+}
+
+static const struct address_space_operations hostfs_aops = {
+	.writepage 	= hostfs_writepage,
+	.readpage	= hostfs_readpage,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	.write_begin	= hostfs_write_begin,
+	.write_end	= hostfs_write_end,
+};
+
+static int read_name(struct inode *ino, char *name)
+{
+	dev_t rdev;
+	struct hostfs_stat st;
+	int err = stat_file(name, &st, -1);
+	if (err)
+		return err;
+
+	/* Reencode maj and min with the kernel encoding.*/
+	rdev = MKDEV(st.maj, st.min);
+
+	switch (st.mode & S_IFMT) {
+	case S_IFLNK:
+		ino->i_op = &hostfs_link_iops;
+		break;
+	case S_IFDIR:
+		ino->i_op = &hostfs_dir_iops;
+		ino->i_fop = &hostfs_dir_fops;
+		break;
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+		init_special_inode(ino, st.mode & S_IFMT, rdev);
+		ino->i_op = &hostfs_iops;
+		break;
+	case S_IFREG:
+		ino->i_op = &hostfs_iops;
+		ino->i_fop = &hostfs_file_fops;
+		ino->i_mapping->a_ops = &hostfs_aops;
+		break;
+	default:
+		return -EIO;
+	}
+
+	ino->i_ino = st.ino;
+	ino->i_mode = st.mode;
+	set_nlink(ino, st.nlink);
+	i_uid_write(ino, st.uid);
+	i_gid_write(ino, st.gid);
+	ino->i_atime = st.atime;
+	ino->i_mtime = st.mtime;
+	ino->i_ctime = st.ctime;
+	ino->i_size = st.size;
+	ino->i_blocks = st.blocks;
+	return 0;
+}
+
+static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			 bool excl)
+{
+	struct inode *inode;
+	char *name;
+	int error, fd;
+
+	inode = hostfs_iget(dir->i_sb);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto out;
+	}
+
+	error = -ENOMEM;
+	name = dentry_name(dentry);
+	if (name == NULL)
+		goto out_put;
+
+	fd = file_create(name, mode & 0777);
+	if (fd < 0)
+		error = fd;
+	else
+		error = read_name(inode, name);
+
+	__putname(name);
+	if (error)
+		goto out_put;
+
+	HOSTFS_I(inode)->fd = fd;
+	HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE;
+	d_instantiate(dentry, inode);
+	return 0;
+
+ out_put:
+	iput(inode);
+ out:
+	return error;
+}
+
+static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
+				    unsigned int flags)
+{
+	struct inode *inode;
+	char *name;
+	int err;
+
+	inode = hostfs_iget(ino->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+
+	err = -ENOMEM;
+	name = dentry_name(dentry);
+	if (name == NULL)
+		goto out_put;
+
+	err = read_name(inode, name);
+
+	__putname(name);
+	if (err == -ENOENT) {
+		iput(inode);
+		inode = NULL;
+	}
+	else if (err)
+		goto out_put;
+
+	d_add(dentry, inode);
+	return NULL;
+
+ out_put:
+	iput(inode);
+ out:
+	return ERR_PTR(err);
+}
+
+static int hostfs_link(struct dentry *to, struct inode *ino,
+		       struct dentry *from)
+{
+	char *from_name, *to_name;
+	int err;
+
+	if ((from_name = dentry_name(from)) == NULL)
+		return -ENOMEM;
+	to_name = dentry_name(to);
+	if (to_name == NULL) {
+		__putname(from_name);
+		return -ENOMEM;
+	}
+	err = link_file(to_name, from_name);
+	__putname(from_name);
+	__putname(to_name);
+	return err;
+}
+
+static int hostfs_unlink(struct inode *ino, struct dentry *dentry)
+{
+	char *file;
+	int err;
+
+	if (append)
+		return -EPERM;
+
+	if ((file = dentry_name(dentry)) == NULL)
+		return -ENOMEM;
+
+	err = unlink_file(file);
+	__putname(file);
+	return err;
+}
+
+static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
+			  const char *to)
+{
+	char *file;
+	int err;
+
+	if ((file = dentry_name(dentry)) == NULL)
+		return -ENOMEM;
+	err = make_symlink(file, to);
+	__putname(file);
+	return err;
+}
+
+static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
+{
+	char *file;
+	int err;
+
+	if ((file = dentry_name(dentry)) == NULL)
+		return -ENOMEM;
+	err = do_mkdir(file, mode);
+	__putname(file);
+	return err;
+}
+
+static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
+{
+	char *file;
+	int err;
+
+	if ((file = dentry_name(dentry)) == NULL)
+		return -ENOMEM;
+	err = do_rmdir(file);
+	__putname(file);
+	return err;
+}
+
+static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	struct inode *inode;
+	char *name;
+	int err;
+
+	inode = hostfs_iget(dir->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+
+	err = -ENOMEM;
+	name = dentry_name(dentry);
+	if (name == NULL)
+		goto out_put;
+
+	init_special_inode(inode, mode, dev);
+	err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
+	if (!err)
+		goto out_free;
+
+	err = read_name(inode, name);
+	__putname(name);
+	if (err)
+		goto out_put;
+	if (err)
+		goto out_put;
+
+	d_instantiate(dentry, inode);
+	return 0;
+
+ out_free:
+	__putname(name);
+ out_put:
+	iput(inode);
+ out:
+	return err;
+}
+
+static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			  struct inode *new_dir, struct dentry *new_dentry,
+			  unsigned int flags)
+{
+	char *old_name, *new_name;
+	int err;
+
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	old_name = dentry_name(old_dentry);
+	if (old_name == NULL)
+		return -ENOMEM;
+	new_name = dentry_name(new_dentry);
+	if (new_name == NULL) {
+		__putname(old_name);
+		return -ENOMEM;
+	}
+	if (!flags)
+		err = rename_file(old_name, new_name);
+	else
+		err = rename2_file(old_name, new_name, flags);
+
+	__putname(old_name);
+	__putname(new_name);
+	return err;
+}
+
+static int hostfs_permission(struct inode *ino, int desired)
+{
+	char *name;
+	int r = 0, w = 0, x = 0, err;
+
+	if (desired & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	if (desired & MAY_READ) r = 1;
+	if (desired & MAY_WRITE) w = 1;
+	if (desired & MAY_EXEC) x = 1;
+	name = inode_name(ino);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (S_ISCHR(ino->i_mode) || S_ISBLK(ino->i_mode) ||
+	    S_ISFIFO(ino->i_mode) || S_ISSOCK(ino->i_mode))
+		err = 0;
+	else
+		err = access_file(name, r, w, x);
+	__putname(name);
+	if (!err)
+		err = generic_permission(ino, desired);
+	return err;
+}
+
+static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct hostfs_iattr attrs;
+	char *name;
+	int err;
+
+	int fd = HOSTFS_I(inode)->fd;
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (append)
+		attr->ia_valid &= ~ATTR_SIZE;
+
+	attrs.ia_valid = 0;
+	if (attr->ia_valid & ATTR_MODE) {
+		attrs.ia_valid |= HOSTFS_ATTR_MODE;
+		attrs.ia_mode = attr->ia_mode;
+	}
+	if (attr->ia_valid & ATTR_UID) {
+		attrs.ia_valid |= HOSTFS_ATTR_UID;
+		attrs.ia_uid = from_kuid(&init_user_ns, attr->ia_uid);
+	}
+	if (attr->ia_valid & ATTR_GID) {
+		attrs.ia_valid |= HOSTFS_ATTR_GID;
+		attrs.ia_gid = from_kgid(&init_user_ns, attr->ia_gid);
+	}
+	if (attr->ia_valid & ATTR_SIZE) {
+		attrs.ia_valid |= HOSTFS_ATTR_SIZE;
+		attrs.ia_size = attr->ia_size;
+	}
+	if (attr->ia_valid & ATTR_ATIME) {
+		attrs.ia_valid |= HOSTFS_ATTR_ATIME;
+		attrs.ia_atime = attr->ia_atime;
+	}
+	if (attr->ia_valid & ATTR_MTIME) {
+		attrs.ia_valid |= HOSTFS_ATTR_MTIME;
+		attrs.ia_mtime = attr->ia_mtime;
+	}
+	if (attr->ia_valid & ATTR_CTIME) {
+		attrs.ia_valid |= HOSTFS_ATTR_CTIME;
+		attrs.ia_ctime = attr->ia_ctime;
+	}
+	if (attr->ia_valid & ATTR_ATIME_SET) {
+		attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET;
+	}
+	if (attr->ia_valid & ATTR_MTIME_SET) {
+		attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
+	}
+	name = dentry_name(dentry);
+	if (name == NULL)
+		return -ENOMEM;
+	err = set_attr(name, &attrs, fd);
+	__putname(name);
+	if (err)
+		return err;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode))
+		truncate_setsize(inode, attr->ia_size);
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static const struct inode_operations hostfs_iops = {
+	.permission	= hostfs_permission,
+	.setattr	= hostfs_setattr,
+};
+
+static const struct inode_operations hostfs_dir_iops = {
+	.create		= hostfs_create,
+	.lookup		= hostfs_lookup,
+	.link		= hostfs_link,
+	.unlink		= hostfs_unlink,
+	.symlink	= hostfs_symlink,
+	.mkdir		= hostfs_mkdir,
+	.rmdir		= hostfs_rmdir,
+	.mknod		= hostfs_mknod,
+	.rename2	= hostfs_rename2,
+	.permission	= hostfs_permission,
+	.setattr	= hostfs_setattr,
+};
+
+static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *link = __getname();
+	if (link) {
+		char *path = dentry_name(dentry);
+		int err = -ENOMEM;
+		if (path) {
+			err = hostfs_do_readlink(path, link, PATH_MAX);
+			if (err == PATH_MAX)
+				err = -E2BIG;
+			__putname(path);
+		}
+		if (err < 0) {
+			__putname(link);
+			link = ERR_PTR(err);
+		}
+	} else {
+		link = ERR_PTR(-ENOMEM);
+	}
+
+	nd_set_link(nd, link);
+	return NULL;
+}
+
+static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		__putname(s);
+}
+
+static const struct inode_operations hostfs_link_iops = {
+	.readlink	= generic_readlink,
+	.follow_link	= hostfs_follow_link,
+	.put_link	= hostfs_put_link,
+};
+
+static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
+{
+	struct inode *root_inode;
+	char *host_root_path, *req_root = d;
+	int err;
+
+	sb->s_blocksize = 1024;
+	sb->s_blocksize_bits = 10;
+	sb->s_magic = HOSTFS_SUPER_MAGIC;
+	sb->s_op = &hostfs_sbops;
+	sb->s_d_op = &simple_dentry_operations;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	/* NULL is printed as <NULL> by sprintf: avoid that. */
+	if (req_root == NULL)
+		req_root = "";
+
+	err = -ENOMEM;
+	sb->s_fs_info = host_root_path =
+		kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
+	if (host_root_path == NULL)
+		goto out;
+
+	sprintf(host_root_path, "%s/%s", root_ino, req_root);
+
+	root_inode = new_inode(sb);
+	if (!root_inode)
+		goto out;
+
+	err = read_name(root_inode, host_root_path);
+	if (err)
+		goto out_put;
+
+	if (S_ISLNK(root_inode->i_mode)) {
+		char *name = follow_link(host_root_path);
+		if (IS_ERR(name))
+			err = PTR_ERR(name);
+		else
+			err = read_name(root_inode, name);
+		kfree(name);
+		if (err)
+			goto out_put;
+	}
+
+	err = -ENOMEM;
+	sb->s_root = d_make_root(root_inode);
+	if (sb->s_root == NULL)
+		goto out;
+
+	return 0;
+
+out_put:
+	iput(root_inode);
+out:
+	return err;
+}
+
+static struct dentry *hostfs_read_sb(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data)
+{
+	return mount_nodev(type, flags, data, hostfs_fill_sb_common);
+}
+
+static void hostfs_kill_sb(struct super_block *s)
+{
+	kill_anon_super(s);
+	kfree(s->s_fs_info);
+}
+
+static struct file_system_type hostfs_type = {
+	.owner 		= THIS_MODULE,
+	.name 		= "hostfs",
+	.mount	 	= hostfs_read_sb,
+	.kill_sb	= hostfs_kill_sb,
+	.fs_flags 	= 0,
+};
+MODULE_ALIAS_FS("hostfs");
+
+static int __init init_hostfs(void)
+{
+	return register_filesystem(&hostfs_type);
+}
+
+static void __exit exit_hostfs(void)
+{
+	unregister_filesystem(&hostfs_type);
+}
+
+module_init(init_hostfs)
+module_exit(exit_hostfs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/hostfs/hostfs_user.c b/kernel/fs/hostfs/hostfs_user.c
new file mode 100644
index 000000000..9c1e0f019
--- /dev/null
+++ b/kernel/fs/hostfs/hostfs_user.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <sys/syscall.h>
+#include "hostfs.h"
+#include <utime.h>
+
+static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
+{
+	p->ino = buf->st_ino;
+	p->mode = buf->st_mode;
+	p->nlink = buf->st_nlink;
+	p->uid = buf->st_uid;
+	p->gid = buf->st_gid;
+	p->size = buf->st_size;
+	p->atime.tv_sec = buf->st_atime;
+	p->atime.tv_nsec = 0;
+	p->ctime.tv_sec = buf->st_ctime;
+	p->ctime.tv_nsec = 0;
+	p->mtime.tv_sec = buf->st_mtime;
+	p->mtime.tv_nsec = 0;
+	p->blksize = buf->st_blksize;
+	p->blocks = buf->st_blocks;
+	p->maj = os_major(buf->st_rdev);
+	p->min = os_minor(buf->st_rdev);
+}
+
+int stat_file(const char *path, struct hostfs_stat *p, int fd)
+{
+	struct stat64 buf;
+
+	if (fd >= 0) {
+		if (fstat64(fd, &buf) < 0)
+			return -errno;
+	} else if (lstat64(path, &buf) < 0) {
+		return -errno;
+	}
+	stat64_to_hostfs(&buf, p);
+	return 0;
+}
+
+int access_file(char *path, int r, int w, int x)
+{
+	int mode = 0;
+
+	if (r)
+		mode = R_OK;
+	if (w)
+		mode |= W_OK;
+	if (x)
+		mode |= X_OK;
+	if (access(path, mode) != 0)
+		return -errno;
+	else return 0;
+}
+
+int open_file(char *path, int r, int w, int append)
+{
+	int mode = 0, fd;
+
+	if (r && !w)
+		mode = O_RDONLY;
+	else if (!r && w)
+		mode = O_WRONLY;
+	else if (r && w)
+		mode = O_RDWR;
+	else panic("Impossible mode in open_file");
+
+	if (append)
+		mode |= O_APPEND;
+	fd = open64(path, mode);
+	if (fd < 0)
+		return -errno;
+	else return fd;
+}
+
+void *open_dir(char *path, int *err_out)
+{
+	DIR *dir;
+
+	dir = opendir(path);
+	*err_out = errno;
+
+	return dir;
+}
+
+void seek_dir(void *stream, unsigned long long pos)
+{
+	DIR *dir = stream;
+
+	seekdir(dir, pos);
+}
+
+char *read_dir(void *stream, unsigned long long *pos_out,
+	       unsigned long long *ino_out, int *len_out,
+	       unsigned int *type_out)
+{
+	DIR *dir = stream;
+	struct dirent *ent;
+
+	ent = readdir(dir);
+	if (ent == NULL)
+		return NULL;
+	*len_out = strlen(ent->d_name);
+	*ino_out = ent->d_ino;
+	*type_out = ent->d_type;
+	*pos_out = ent->d_off;
+	return ent->d_name;
+}
+
+int read_file(int fd, unsigned long long *offset, char *buf, int len)
+{
+	int n;
+
+	n = pread64(fd, buf, len, *offset);
+	if (n < 0)
+		return -errno;
+	*offset += n;
+	return n;
+}
+
+int write_file(int fd, unsigned long long *offset, const char *buf, int len)
+{
+	int n;
+
+	n = pwrite64(fd, buf, len, *offset);
+	if (n < 0)
+		return -errno;
+	*offset += n;
+	return n;
+}
+
+int lseek_file(int fd, long long offset, int whence)
+{
+	int ret;
+
+	ret = lseek64(fd, offset, whence);
+	if (ret < 0)
+		return -errno;
+	return 0;
+}
+
+int fsync_file(int fd, int datasync)
+{
+	int ret;
+	if (datasync)
+		ret = fdatasync(fd);
+	else
+		ret = fsync(fd);
+
+	if (ret < 0)
+		return -errno;
+	return 0;
+}
+
+int replace_file(int oldfd, int fd)
+{
+	return dup2(oldfd, fd);
+}
+
+void close_file(void *stream)
+{
+	close(*((int *) stream));
+}
+
+void close_dir(void *stream)
+{
+	closedir(stream);
+}
+
+int file_create(char *name, int mode)
+{
+	int fd;
+
+	fd = open64(name, O_CREAT | O_RDWR, mode);
+	if (fd < 0)
+		return -errno;
+	return fd;
+}
+
+int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
+{
+	struct hostfs_stat st;
+	struct timeval times[2];
+	int err, ma;
+
+	if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
+		if (fd >= 0) {
+			if (fchmod(fd, attrs->ia_mode) != 0)
+				return -errno;
+		} else if (chmod(file, attrs->ia_mode) != 0) {
+			return -errno;
+		}
+	}
+	if (attrs->ia_valid & HOSTFS_ATTR_UID) {
+		if (fd >= 0) {
+			if (fchown(fd, attrs->ia_uid, -1))
+				return -errno;
+		} else if (chown(file, attrs->ia_uid, -1)) {
+			return -errno;
+		}
+	}
+	if (attrs->ia_valid & HOSTFS_ATTR_GID) {
+		if (fd >= 0) {
+			if (fchown(fd, -1, attrs->ia_gid))
+				return -errno;
+		} else if (chown(file, -1, attrs->ia_gid)) {
+			return -errno;
+		}
+	}
+	if (attrs->ia_valid & HOSTFS_ATTR_SIZE) {
+		if (fd >= 0) {
+			if (ftruncate(fd, attrs->ia_size))
+				return -errno;
+		} else if (truncate(file, attrs->ia_size)) {
+			return -errno;
+		}
+	}
+
+	/*
+	 * Update accessed and/or modified time, in two parts: first set
+	 * times according to the changes to perform, and then call futimes()
+	 * or utimes() to apply them.
+	 */
+	ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET);
+	if (attrs->ia_valid & ma) {
+		err = stat_file(file, &st, fd);
+		if (err != 0)
+			return err;
+
+		times[0].tv_sec = st.atime.tv_sec;
+		times[0].tv_usec = st.atime.tv_nsec / 1000;
+		times[1].tv_sec = st.mtime.tv_sec;
+		times[1].tv_usec = st.mtime.tv_nsec / 1000;
+
+		if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) {
+			times[0].tv_sec = attrs->ia_atime.tv_sec;
+			times[0].tv_usec = attrs->ia_atime.tv_nsec / 1000;
+		}
+		if (attrs->ia_valid & HOSTFS_ATTR_MTIME_SET) {
+			times[1].tv_sec = attrs->ia_mtime.tv_sec;
+			times[1].tv_usec = attrs->ia_mtime.tv_nsec / 1000;
+		}
+
+		if (fd >= 0) {
+			if (futimes(fd, times) != 0)
+				return -errno;
+		} else if (utimes(file, times) != 0) {
+			return -errno;
+		}
+	}
+
+	/* Note: ctime is not handled */
+	if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) {
+		err = stat_file(file, &st, fd);
+		attrs->ia_atime = st.atime;
+		attrs->ia_mtime = st.mtime;
+		if (err != 0)
+			return err;
+	}
+	return 0;
+}
+
+int make_symlink(const char *from, const char *to)
+{
+	int err;
+
+	err = symlink(to, from);
+	if (err)
+		return -errno;
+	return 0;
+}
+
+int unlink_file(const char *file)
+{
+	int err;
+
+	err = unlink(file);
+	if (err)
+		return -errno;
+	return 0;
+}
+
+int do_mkdir(const char *file, int mode)
+{
+	int err;
+
+	err = mkdir(file, mode);
+	if (err)
+		return -errno;
+	return 0;
+}
+
+int do_rmdir(const char *file)
+{
+	int err;
+
+	err = rmdir(file);
+	if (err)
+		return -errno;
+	return 0;
+}
+
+int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor)
+{
+	int err;
+
+	err = mknod(file, mode, os_makedev(major, minor));
+	if (err)
+		return -errno;
+	return 0;
+}
+
+int link_file(const char *to, const char *from)
+{
+	int err;
+
+	err = link(to, from);
+	if (err)
+		return -errno;
+	return 0;
+}
+
+int hostfs_do_readlink(char *file, char *buf, int size)
+{
+	int n;
+
+	n = readlink(file, buf, size);
+	if (n < 0)
+		return -errno;
+	if (n < size)
+		buf[n] = '\0';
+	return n;
+}
+
+int rename_file(char *from, char *to)
+{
+	int err;
+
+	err = rename(from, to);
+	if (err < 0)
+		return -errno;
+	return 0;
+}
+
+int rename2_file(char *from, char *to, unsigned int flags)
+{
+	int err;
+
+#ifndef SYS_renameat2
+#  ifdef __x86_64__
+#    define SYS_renameat2 316
+#  endif
+#  ifdef __i386__
+#    define SYS_renameat2 353
+#  endif
+#endif
+
+#ifdef SYS_renameat2
+	err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags);
+	if (err < 0) {
+		if (errno != ENOSYS)
+			return -errno;
+		else
+			return -EINVAL;
+	}
+	return 0;
+#else
+	return -EINVAL;
+#endif
+}
+
+int do_statfs(char *root, long *bsize_out, long long *blocks_out,
+	      long long *bfree_out, long long *bavail_out,
+	      long long *files_out, long long *ffree_out,
+	      void *fsid_out, int fsid_size, long *namelen_out)
+{
+	struct statfs64 buf;
+	int err;
+
+	err = statfs64(root, &buf);
+	if (err < 0)
+		return -errno;
+
+	*bsize_out = buf.f_bsize;
+	*blocks_out = buf.f_blocks;
+	*bfree_out = buf.f_bfree;
+	*bavail_out = buf.f_bavail;
+	*files_out = buf.f_files;
+	*ffree_out = buf.f_ffree;
+	memcpy(fsid_out, &buf.f_fsid,
+	       sizeof(buf.f_fsid) > fsid_size ? fsid_size :
+	       sizeof(buf.f_fsid));
+	*namelen_out = buf.f_namelen;
+
+	return 0;
+}
diff --git a/kernel/fs/hpfs/Kconfig b/kernel/fs/hpfs/Kconfig
new file mode 100644
index 000000000..56bd15c5b
--- /dev/null
+++ b/kernel/fs/hpfs/Kconfig
@@ -0,0 +1,14 @@
+config HPFS_FS
+	tristate "OS/2 HPFS file system support"
+	depends on BLOCK
+	help
+	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
+	  is the file system used for organizing files on OS/2 hard disk
+	  partitions. Say Y if you want to be able to read files from and
+	  write files to an OS/2 HPFS partition on your hard drive. OS/2
+	  floppies however are in regular MSDOS format, so you don't need this
+	  option in order to be able to read them. Read
+	  <file:Documentation/filesystems/hpfs.txt>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called hpfs.  If unsure, say N.
diff --git a/kernel/fs/hpfs/Makefile b/kernel/fs/hpfs/Makefile
new file mode 100644
index 000000000..57b786fb9
--- /dev/null
+++ b/kernel/fs/hpfs/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux hpfs filesystem routines.
+#
+
+obj-$(CONFIG_HPFS_FS) += hpfs.o
+
+hpfs-objs := alloc.o anode.o buffer.o dentry.o dir.o dnode.o ea.o file.o \
+	     inode.o map.o name.o namei.o super.o
diff --git a/kernel/fs/hpfs/alloc.c b/kernel/fs/hpfs/alloc.c
new file mode 100644
index 000000000..f005046e1
--- /dev/null
+++ b/kernel/fs/hpfs/alloc.c
@@ -0,0 +1,486 @@
+/*
+ *  linux/fs/hpfs/alloc.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  HPFS bitmap operations
+ */
+
+#include "hpfs_fn.h"
+
+static void hpfs_claim_alloc(struct super_block *s, secno sec)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	if (sbi->sb_n_free != (unsigned)-1) {
+		if (unlikely(!sbi->sb_n_free)) {
+			hpfs_error(s, "free count underflow, allocating sector %08x", sec);
+			sbi->sb_n_free = -1;
+			return;
+		}
+		sbi->sb_n_free--;
+	}
+}
+
+static void hpfs_claim_free(struct super_block *s, secno sec)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	if (sbi->sb_n_free != (unsigned)-1) {
+		if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) {
+			hpfs_error(s, "free count overflow, freeing sector %08x", sec);
+			sbi->sb_n_free = -1;
+			return;
+		}
+		sbi->sb_n_free++;
+	}
+}
+
+static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	if (sbi->sb_n_free_dnodes != (unsigned)-1) {
+		if (unlikely(!sbi->sb_n_free_dnodes)) {
+			hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec);
+			sbi->sb_n_free_dnodes = -1;
+			return;
+		}
+		sbi->sb_n_free_dnodes--;
+	}
+}
+
+static void hpfs_claim_dirband_free(struct super_block *s, secno sec)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	if (sbi->sb_n_free_dnodes != (unsigned)-1) {
+		if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) {
+			hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec);
+			sbi->sb_n_free_dnodes = -1;
+			return;
+		}
+		sbi->sb_n_free_dnodes++;
+	}
+}
+
+/*
+ * Check if a sector is allocated in bitmap
+ * This is really slow. Turned on only if chk==2
+ */
+
+static int chk_if_allocated(struct super_block *s, secno sec, char *msg)
+{
+	struct quad_buffer_head qbh;
+	__le32 *bmp;
+	if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail;
+	if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) {
+		hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec);
+		goto fail1;
+	}
+	hpfs_brelse4(&qbh);
+	if (sec >= hpfs_sb(s)->sb_dirband_start && sec < hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) {
+		unsigned ssec = (sec - hpfs_sb(s)->sb_dirband_start) / 4;
+		if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto fail;
+		if ((le32_to_cpu(bmp[ssec >> 5]) >> (ssec & 0x1f)) & 1) {
+			hpfs_error(s, "sector '%s' - %08x not allocated in directory bitmap", msg, sec);
+			goto fail1;
+		}
+		hpfs_brelse4(&qbh);
+	}
+	return 0;
+	fail1:
+	hpfs_brelse4(&qbh);
+	fail:
+	return 1;
+}
+
+/*
+ * Check if sector(s) have proper number and additionally check if they're
+ * allocated in bitmap.
+ */
+	
+int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg)
+{
+	if (start + len < start || start < 0x12 ||
+	    start + len > hpfs_sb(s)->sb_fs_size) {
+	    	hpfs_error(s, "sector(s) '%s' badly placed at %08x", msg, start);
+		return 1;
+	}
+	if (hpfs_sb(s)->sb_chk>=2) {
+		int i;
+		for (i = 0; i < len; i++)
+			if (chk_if_allocated(s, start + i, msg)) return 1;
+	}
+	return 0;
+}
+
+static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward)
+{
+	struct quad_buffer_head qbh;
+	__le32 *bmp;
+	unsigned bs = near & ~0x3fff;
+	unsigned nr = (near & 0x3fff) & ~(n - 1);
+	/*unsigned mnr;*/
+	unsigned i, q;
+	int a, b;
+	secno ret = 0;
+	if (n != 1 && n != 4) {
+		hpfs_error(s, "Bad allocation size: %d", n);
+		return 0;
+	}
+	if (bs != ~0x3fff) {
+		if (!(bmp = hpfs_map_bitmap(s, near >> 14, &qbh, "aib"))) goto uls;
+	} else {
+		if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto uls;
+	}
+	if (!tstbits(bmp, nr, n + forward)) {
+		ret = bs + nr;
+		goto rt;
+	}
+	q = nr + n; b = 0;
+	while ((a = tstbits(bmp, q, n + forward)) != 0) {
+		q += a;
+		if (n != 1) q = ((q-1)&~(n-1))+n;
+		if (!b) {
+			if (q>>5 != nr>>5) {
+				b = 1;
+				q = nr & 0x1f;
+			}
+		} else if (q > nr) break;
+	}
+	if (!a) {
+		ret = bs + q;
+		goto rt;
+	}
+	nr >>= 5;
+	/*for (i = nr + 1; i != nr; i++, i &= 0x1ff) */
+	i = nr;
+	do {
+		if (!le32_to_cpu(bmp[i])) goto cont;
+		if (n + forward >= 0x3f && le32_to_cpu(bmp[i]) != 0xffffffff) goto cont;
+		q = i<<5;
+		if (i > 0) {
+			unsigned k = le32_to_cpu(bmp[i-1]);
+			while (k & 0x80000000) {
+				q--; k <<= 1;
+			}
+		}
+		if (n != 1) q = ((q-1)&~(n-1))+n;
+		while ((a = tstbits(bmp, q, n + forward)) != 0) {
+			q += a;
+			if (n != 1) q = ((q-1)&~(n-1))+n;
+			if (q>>5 > i) break;
+		}
+		if (!a) {
+			ret = bs + q;
+			goto rt;
+		}
+		cont:
+		i++, i &= 0x1ff;
+	} while (i != nr);
+	rt:
+	if (ret) {
+		if (hpfs_sb(s)->sb_chk && ((ret >> 14) != (bs >> 14) || (le32_to_cpu(bmp[(ret & 0x3fff) >> 5]) | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) {
+			hpfs_error(s, "Allocation doesn't work! Wanted %d, allocated at %08x", n, ret);
+			ret = 0;
+			goto b;
+		}
+		bmp[(ret & 0x3fff) >> 5] &= cpu_to_le32(~(((1 << n) - 1) << (ret & 0x1f)));
+		hpfs_mark_4buffers_dirty(&qbh);
+	}
+	b:
+	hpfs_brelse4(&qbh);
+	uls:
+	return ret;
+}
+
+/*
+ * Allocation strategy:	1) search place near the sector specified
+ *			2) search bitmap where free sectors last found
+ *			3) search all bitmaps
+ *			4) search all bitmaps ignoring number of pre-allocated
+ *				sectors
+ */
+
+secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forward)
+{
+	secno sec;
+	int i;
+	unsigned n_bmps;
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	int f_p = 0;
+	int near_bmp;
+	if (forward < 0) {
+		forward = -forward;
+		f_p = 1;
+	}
+	n_bmps = (sbi->sb_fs_size + 0x4000 - 1) >> 14;
+	if (near && near < sbi->sb_fs_size) {
+		if ((sec = alloc_in_bmp(s, near, n, f_p ? forward : forward/4))) goto ret;
+		near_bmp = near >> 14;
+	} else near_bmp = n_bmps / 2;
+	/*
+	if (b != -1) {
+		if ((sec = alloc_in_bmp(s, b<<14, n, f_p ? forward : forward/2))) {
+			b &= 0x0fffffff;
+			goto ret;
+		}
+		if (b > 0x10000000) if ((sec = alloc_in_bmp(s, (b&0xfffffff)<<14, n, f_p ? forward : 0))) goto ret;
+	*/
+	if (!f_p) if (forward > sbi->sb_max_fwd_alloc) forward = sbi->sb_max_fwd_alloc;
+	less_fwd:
+	for (i = 0; i < n_bmps; i++) {
+		if (near_bmp+i < n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i) << 14, n, forward)))) {
+			sbi->sb_c_bitmap = near_bmp+i;
+			goto ret;
+		}	
+		if (!forward) {
+			if (near_bmp-i-1 >= 0 && ((sec = alloc_in_bmp(s, (near_bmp-i-1) << 14, n, forward)))) {
+				sbi->sb_c_bitmap = near_bmp-i-1;
+				goto ret;
+			}
+		} else {
+			if (near_bmp+i >= n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i-n_bmps) << 14, n, forward)))) {
+				sbi->sb_c_bitmap = near_bmp+i-n_bmps;
+				goto ret;
+			}
+		}
+		if (i == 1 && sbi->sb_c_bitmap != -1 && ((sec = alloc_in_bmp(s, (sbi->sb_c_bitmap) << 14, n, forward)))) {
+			goto ret;
+		}
+	}
+	if (!f_p) {
+		if (forward) {
+			sbi->sb_max_fwd_alloc = forward * 3 / 4;
+			forward /= 2;
+			goto less_fwd;
+		}
+	}
+	sec = 0;
+	ret:
+	if (sec) {
+		i = 0;
+		do
+			hpfs_claim_alloc(s, sec + i);
+		while (unlikely(++i < n));
+	}
+	if (sec && f_p) {
+		for (i = 0; i < forward; i++) {
+			if (!hpfs_alloc_if_possible(s, sec + n + i)) {
+				hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i);
+				sec = 0;
+				break;
+			}
+		}
+	}
+	return sec;
+}
+
+static secno alloc_in_dirband(struct super_block *s, secno near)
+{
+	unsigned nr = near;
+	secno sec;
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	if (nr < sbi->sb_dirband_start)
+		nr = sbi->sb_dirband_start;
+	if (nr >= sbi->sb_dirband_start + sbi->sb_dirband_size)
+		nr = sbi->sb_dirband_start + sbi->sb_dirband_size - 4;
+	nr -= sbi->sb_dirband_start;
+	nr >>= 2;
+	sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0);
+	if (!sec) return 0;
+	hpfs_claim_dirband_alloc(s, sec);
+	return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start;
+}
+
+/* Alloc sector if it's free */
+
+int hpfs_alloc_if_possible(struct super_block *s, secno sec)
+{
+	struct quad_buffer_head qbh;
+	__le32 *bmp;
+	if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end;
+	if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) {
+		bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+		hpfs_claim_alloc(s, sec);
+		return 1;
+	}
+	hpfs_brelse4(&qbh);
+	end:
+	return 0;
+}
+
+/* Free sectors in bitmaps */
+
+void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
+{
+	struct quad_buffer_head qbh;
+	__le32 *bmp;
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	/*pr_info("2 - ");*/
+	if (!n) return;
+	if (sec < 0x12) {
+		hpfs_error(s, "Trying to free reserved sector %08x", sec);
+		return;
+	}
+	sbi->sb_max_fwd_alloc += n > 0xffff ? 0xffff : n;
+	if (sbi->sb_max_fwd_alloc > 0xffffff) sbi->sb_max_fwd_alloc = 0xffffff;
+	new_map:
+	if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "free"))) {
+		return;
+	}	
+	new_tst:
+	if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f) & 1)) {
+		hpfs_error(s, "sector %08x not allocated", sec);
+		hpfs_brelse4(&qbh);
+		return;
+	}
+	bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f));
+	hpfs_claim_free(s, sec);
+	if (!--n) {
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+		return;
+	}	
+	if (!(++sec & 0x3fff)) {
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+		goto new_map;
+	}
+	goto new_tst;
+}
+
+/*
+ * Check if there are at least n free dnodes on the filesystem.
+ * Called before adding to dnode. If we run out of space while
+ * splitting dnodes, it would corrupt dnode tree.
+ */
+
+int hpfs_check_free_dnodes(struct super_block *s, int n)
+{
+	int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14;
+	int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff;
+	int i, j;
+	__le32 *bmp;
+	struct quad_buffer_head qbh;
+	if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
+		for (j = 0; j < 512; j++) {
+			unsigned k;
+			if (!le32_to_cpu(bmp[j])) continue;
+			for (k = le32_to_cpu(bmp[j]); k; k >>= 1) if (k & 1) if (!--n) {
+				hpfs_brelse4(&qbh);
+				return 0;
+			}
+		}
+	}
+	hpfs_brelse4(&qbh);
+	i = 0;
+	if (hpfs_sb(s)->sb_c_bitmap != -1) {
+		bmp = hpfs_map_bitmap(s, b, &qbh, "chkdn1");
+		goto chk_bmp;
+	}
+	chk_next:
+	if (i == b) i++;
+	if (i >= n_bmps) return 1;
+	bmp = hpfs_map_bitmap(s, i, &qbh, "chkdn2");
+	chk_bmp:
+	if (bmp) {
+		for (j = 0; j < 512; j++) {
+			u32 k;
+			if (!le32_to_cpu(bmp[j])) continue;
+			for (k = 0xf; k; k <<= 4)
+				if ((le32_to_cpu(bmp[j]) & k) == k) {
+					if (!--n) {
+						hpfs_brelse4(&qbh);
+						return 0;
+					}
+				}
+		}
+		hpfs_brelse4(&qbh);
+	}
+	i++;
+	goto chk_next;
+}
+
+void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
+{
+	if (hpfs_sb(s)->sb_chk) if (dno & 3) {
+		hpfs_error(s, "hpfs_free_dnode: dnode %08x not aligned", dno);
+		return;
+	}
+	if (dno < hpfs_sb(s)->sb_dirband_start ||
+	    dno >= hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) {
+		hpfs_free_sectors(s, dno, 4);
+	} else {
+		struct quad_buffer_head qbh;
+		__le32 *bmp;
+		unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4;
+		if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
+			return;
+		}
+		bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f));
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+		hpfs_claim_dirband_free(s, dno);
+	}
+}
+
+struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near,
+			 dnode_secno *dno, struct quad_buffer_head *qbh)
+{
+	struct dnode *d;
+	if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) {
+		if (!(*dno = alloc_in_dirband(s, near)))
+			if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL;
+	} else {
+		if (!(*dno = hpfs_alloc_sector(s, near, 4, 0)))
+			if (!(*dno = alloc_in_dirband(s, near))) return NULL;
+	}
+	if (!(d = hpfs_get_4sectors(s, *dno, qbh))) {
+		hpfs_free_dnode(s, *dno);
+		return NULL;
+	}
+	memset(d, 0, 2048);
+	d->magic = cpu_to_le32(DNODE_MAGIC);
+	d->first_free = cpu_to_le32(52);
+	d->dirent[0] = 32;
+	d->dirent[2] = 8;
+	d->dirent[30] = 1;
+	d->dirent[31] = 255;
+	d->self = cpu_to_le32(*dno);
+	return d;
+}
+
+struct fnode *hpfs_alloc_fnode(struct super_block *s, secno near, fnode_secno *fno,
+			  struct buffer_head **bh)
+{
+	struct fnode *f;
+	if (!(*fno = hpfs_alloc_sector(s, near, 1, FNODE_ALLOC_FWD))) return NULL;
+	if (!(f = hpfs_get_sector(s, *fno, bh))) {
+		hpfs_free_sectors(s, *fno, 1);
+		return NULL;
+	}	
+	memset(f, 0, 512);
+	f->magic = cpu_to_le32(FNODE_MAGIC);
+	f->ea_offs = cpu_to_le16(0xc4);
+	f->btree.n_free_nodes = 8;
+	f->btree.first_free = cpu_to_le16(8);
+	return f;
+}
+
+struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *ano,
+			  struct buffer_head **bh)
+{
+	struct anode *a;
+	if (!(*ano = hpfs_alloc_sector(s, near, 1, ANODE_ALLOC_FWD))) return NULL;
+	if (!(a = hpfs_get_sector(s, *ano, bh))) {
+		hpfs_free_sectors(s, *ano, 1);
+		return NULL;
+	}
+	memset(a, 0, 512);
+	a->magic = cpu_to_le32(ANODE_MAGIC);
+	a->self = cpu_to_le32(*ano);
+	a->btree.n_free_nodes = 40;
+	a->btree.n_used_nodes = 0;
+	a->btree.first_free = cpu_to_le16(8);
+	return a;
+}
diff --git a/kernel/fs/hpfs/anode.c b/kernel/fs/hpfs/anode.c
new file mode 100644
index 000000000..2d5b254ad
--- /dev/null
+++ b/kernel/fs/hpfs/anode.c
@@ -0,0 +1,496 @@
+/*
+ *  linux/fs/hpfs/anode.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  handling HPFS anode tree that contains file allocation info
+ */
+
+#include "hpfs_fn.h"
+
+/* Find a sector in allocation tree */
+
+secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode,
+		   struct bplus_header *btree, unsigned sec,
+		   struct buffer_head *bh)
+{
+	anode_secno a = -1;
+	struct anode *anode;
+	int i;
+	int c1, c2 = 0;
+	go_down:
+	if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1;
+	if (bp_internal(btree)) {
+		for (i = 0; i < btree->n_used_nodes; i++)
+			if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) {
+				a = le32_to_cpu(btree->u.internal[i].down);
+				brelse(bh);
+				if (!(anode = hpfs_map_anode(s, a, &bh))) return -1;
+				btree = &anode->btree;
+				goto go_down;
+			}
+		hpfs_error(s, "sector %08x not found in internal anode %08x", sec, a);
+		brelse(bh);
+		return -1;
+	}
+	for (i = 0; i < btree->n_used_nodes; i++)
+		if (le32_to_cpu(btree->u.external[i].file_secno) <= sec &&
+		    le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > sec) {
+			a = le32_to_cpu(btree->u.external[i].disk_secno) + sec - le32_to_cpu(btree->u.external[i].file_secno);
+			if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, a, 1, "data")) {
+				brelse(bh);
+				return -1;
+			}
+			if (inode) {
+				struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
+				hpfs_inode->i_file_sec = le32_to_cpu(btree->u.external[i].file_secno);
+				hpfs_inode->i_disk_sec = le32_to_cpu(btree->u.external[i].disk_secno);
+				hpfs_inode->i_n_secs = le32_to_cpu(btree->u.external[i].length);
+			}
+			brelse(bh);
+			return a;
+		}
+	hpfs_error(s, "sector %08x not found in external anode %08x", sec, a);
+	brelse(bh);
+	return -1;
+}
+
+/* Add a sector to tree */
+
+secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsigned fsecno)
+{
+	struct bplus_header *btree;
+	struct anode *anode = NULL, *ranode = NULL;
+	struct fnode *fnode;
+	anode_secno a, na = -1, ra, up = -1;
+	secno se;
+	struct buffer_head *bh, *bh1, *bh2;
+	int n;
+	unsigned fs;
+	int c1, c2 = 0;
+	if (fnod) {
+		if (!(fnode = hpfs_map_fnode(s, node, &bh))) return -1;
+		btree = &fnode->btree;
+	} else {
+		if (!(anode = hpfs_map_anode(s, node, &bh))) return -1;
+		btree = &anode->btree;
+	}
+	a = node;
+	go_down:
+	if ((n = btree->n_used_nodes - 1) < -!!fnod) {
+		hpfs_error(s, "anode %08x has no entries", a);
+		brelse(bh);
+		return -1;
+	}
+	if (bp_internal(btree)) {
+		a = le32_to_cpu(btree->u.internal[n].down);
+		btree->u.internal[n].file_secno = cpu_to_le32(-1);
+		mark_buffer_dirty(bh);
+		brelse(bh);
+		if (hpfs_sb(s)->sb_chk)
+			if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_add_sector_to_btree #1")) return -1;
+		if (!(anode = hpfs_map_anode(s, a, &bh))) return -1;
+		btree = &anode->btree;
+		goto go_down;
+	}
+	if (n >= 0) {
+		if (le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length) != fsecno) {
+			hpfs_error(s, "allocated size %08x, trying to add sector %08x, %cnode %08x",
+				le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length), fsecno,
+				fnod?'f':'a', node);
+			brelse(bh);
+			return -1;
+		}
+		if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) {
+			le32_add_cpu(&btree->u.external[n].length, 1);
+			mark_buffer_dirty(bh);
+			brelse(bh);
+			return se;
+		}
+	} else {
+		if (fsecno) {
+			hpfs_error(s, "empty file %08x, trying to add sector %08x", node, fsecno);
+			brelse(bh);
+			return -1;
+		}
+		se = !fnod ? node : (node + 16384) & ~16383;
+	}	
+	if (!(se = hpfs_alloc_sector(s, se, 1, fsecno*ALLOC_M>ALLOC_FWD_MAX ? ALLOC_FWD_MAX : fsecno*ALLOC_M<ALLOC_FWD_MIN ? ALLOC_FWD_MIN : fsecno*ALLOC_M))) {
+		brelse(bh);
+		return -1;
+	}
+	fs = n < 0 ? 0 : le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length);
+	if (!btree->n_free_nodes) {
+		up = a != node ? le32_to_cpu(anode->up) : -1;
+		if (!(anode = hpfs_alloc_anode(s, a, &na, &bh1))) {
+			brelse(bh);
+			hpfs_free_sectors(s, se, 1);
+			return -1;
+		}
+		if (a == node && fnod) {
+			anode->up = cpu_to_le32(node);
+			anode->btree.flags |= BP_fnode_parent;
+			anode->btree.n_used_nodes = btree->n_used_nodes;
+			anode->btree.first_free = btree->first_free;
+			anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes;
+			memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12);
+			btree->flags |= BP_internal;
+			btree->n_free_nodes = 11;
+			btree->n_used_nodes = 1;
+			btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree);
+			btree->u.internal[0].file_secno = cpu_to_le32(-1);
+			btree->u.internal[0].down = cpu_to_le32(na);
+			mark_buffer_dirty(bh);
+		} else if (!(ranode = hpfs_alloc_anode(s, /*a*/0, &ra, &bh2))) {
+			brelse(bh);
+			brelse(bh1);
+			hpfs_free_sectors(s, se, 1);
+			hpfs_free_sectors(s, na, 1);
+			return -1;
+		}
+		brelse(bh);
+		bh = bh1;
+		btree = &anode->btree;
+	}
+	btree->n_free_nodes--; n = btree->n_used_nodes++;
+	le16_add_cpu(&btree->first_free, 12);
+	btree->u.external[n].disk_secno = cpu_to_le32(se);
+	btree->u.external[n].file_secno = cpu_to_le32(fs);
+	btree->u.external[n].length = cpu_to_le32(1);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+	if ((a == node && fnod) || na == -1) return se;
+	c2 = 0;
+	while (up != (anode_secno)-1) {
+		struct anode *new_anode;
+		if (hpfs_sb(s)->sb_chk)
+			if (hpfs_stop_cycles(s, up, &c1, &c2, "hpfs_add_sector_to_btree #2")) return -1;
+		if (up != node || !fnod) {
+			if (!(anode = hpfs_map_anode(s, up, &bh))) return -1;
+			btree = &anode->btree;
+		} else {
+			if (!(fnode = hpfs_map_fnode(s, up, &bh))) return -1;
+			btree = &fnode->btree;
+		}
+		if (btree->n_free_nodes) {
+			btree->n_free_nodes--; n = btree->n_used_nodes++;
+			le16_add_cpu(&btree->first_free, 8);
+			btree->u.internal[n].file_secno = cpu_to_le32(-1);
+			btree->u.internal[n].down = cpu_to_le32(na);
+			btree->u.internal[n-1].file_secno = cpu_to_le32(fs);
+			mark_buffer_dirty(bh);
+			brelse(bh);
+			brelse(bh2);
+			hpfs_free_sectors(s, ra, 1);
+			if ((anode = hpfs_map_anode(s, na, &bh))) {
+				anode->up = cpu_to_le32(up);
+				if (up == node && fnod)
+					anode->btree.flags |= BP_fnode_parent;
+				else
+					anode->btree.flags &= ~BP_fnode_parent;
+				mark_buffer_dirty(bh);
+				brelse(bh);
+			}
+			return se;
+		}
+		up = up != node ? le32_to_cpu(anode->up) : -1;
+		btree->u.internal[btree->n_used_nodes - 1].file_secno = cpu_to_le32(/*fs*/-1);
+		mark_buffer_dirty(bh);
+		brelse(bh);
+		a = na;
+		if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) {
+			anode = new_anode;
+			/*anode->up = cpu_to_le32(up != -1 ? up : ra);*/
+			anode->btree.flags |= BP_internal;
+			anode->btree.n_used_nodes = 1;
+			anode->btree.n_free_nodes = 59;
+			anode->btree.first_free = cpu_to_le16(16);
+			anode->btree.u.internal[0].down = cpu_to_le32(a);
+			anode->btree.u.internal[0].file_secno = cpu_to_le32(-1);
+			mark_buffer_dirty(bh);
+			brelse(bh);
+			if ((anode = hpfs_map_anode(s, a, &bh))) {
+				anode->up = cpu_to_le32(na);
+				mark_buffer_dirty(bh);
+				brelse(bh);
+			}
+		} else na = a;
+	}
+	if ((anode = hpfs_map_anode(s, na, &bh))) {
+		anode->up = cpu_to_le32(node);
+		if (fnod)
+			anode->btree.flags |= BP_fnode_parent;
+		mark_buffer_dirty(bh);
+		brelse(bh);
+	}
+	if (!fnod) {
+		if (!(anode = hpfs_map_anode(s, node, &bh))) {
+			brelse(bh2);
+			return -1;
+		}
+		btree = &anode->btree;
+	} else {
+		if (!(fnode = hpfs_map_fnode(s, node, &bh))) {
+			brelse(bh2);
+			return -1;
+		}
+		btree = &fnode->btree;
+	}
+	ranode->up = cpu_to_le32(node);
+	memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free));
+	if (fnod)
+		ranode->btree.flags |= BP_fnode_parent;
+	ranode->btree.n_free_nodes = (bp_internal(&ranode->btree) ? 60 : 40) - ranode->btree.n_used_nodes;
+	if (bp_internal(&ranode->btree)) for (n = 0; n < ranode->btree.n_used_nodes; n++) {
+		struct anode *unode;
+		if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) {
+			unode->up = cpu_to_le32(ra);
+			unode->btree.flags &= ~BP_fnode_parent;
+			mark_buffer_dirty(bh1);
+			brelse(bh1);
+		}
+	}
+	btree->flags |= BP_internal;
+	btree->n_free_nodes = fnod ? 10 : 58;
+	btree->n_used_nodes = 2;
+	btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree);
+	btree->u.internal[0].file_secno = cpu_to_le32(fs);
+	btree->u.internal[0].down = cpu_to_le32(ra);
+	btree->u.internal[1].file_secno = cpu_to_le32(-1);
+	btree->u.internal[1].down = cpu_to_le32(na);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+	mark_buffer_dirty(bh2);
+	brelse(bh2);
+	return se;
+}
+
+/*
+ * Remove allocation tree. Recursion would look much nicer but
+ * I want to avoid it because it can cause stack overflow.
+ */
+
+void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree)
+{
+	struct bplus_header *btree1 = btree;
+	struct anode *anode = NULL;
+	anode_secno ano = 0, oano;
+	struct buffer_head *bh;
+	int level = 0;
+	int pos = 0;
+	int i;
+	int c1, c2 = 0;
+	int d1, d2;
+	go_down:
+	d2 = 0;
+	while (bp_internal(btree1)) {
+		ano = le32_to_cpu(btree1->u.internal[pos].down);
+		if (level) brelse(bh);
+		if (hpfs_sb(s)->sb_chk)
+			if (hpfs_stop_cycles(s, ano, &d1, &d2, "hpfs_remove_btree #1"))
+				return;
+		if (!(anode = hpfs_map_anode(s, ano, &bh))) return;
+		btree1 = &anode->btree;
+		level++;
+		pos = 0;
+	}
+	for (i = 0; i < btree1->n_used_nodes; i++)
+		hpfs_free_sectors(s, le32_to_cpu(btree1->u.external[i].disk_secno), le32_to_cpu(btree1->u.external[i].length));
+	go_up:
+	if (!level) return;
+	brelse(bh);
+	if (hpfs_sb(s)->sb_chk)
+		if (hpfs_stop_cycles(s, ano, &c1, &c2, "hpfs_remove_btree #2")) return;
+	hpfs_free_sectors(s, ano, 1);
+	oano = ano;
+	ano = le32_to_cpu(anode->up);
+	if (--level) {
+		if (!(anode = hpfs_map_anode(s, ano, &bh))) return;
+		btree1 = &anode->btree;
+	} else btree1 = btree;
+	for (i = 0; i < btree1->n_used_nodes; i++) {
+		if (le32_to_cpu(btree1->u.internal[i].down) == oano) {
+			if ((pos = i + 1) < btree1->n_used_nodes)
+				goto go_down;
+			else
+				goto go_up;
+		}
+	}
+	hpfs_error(s,
+		   "reference to anode %08x not found in anode %08x "
+		   "(probably bad up pointer)",
+		   oano, level ? ano : -1);
+	if (level)
+		brelse(bh);
+}
+
+/* Just a wrapper around hpfs_bplus_lookup .. used for reading eas */
+
+static secno anode_lookup(struct super_block *s, anode_secno a, unsigned sec)
+{
+	struct anode *anode;
+	struct buffer_head *bh;
+	if (!(anode = hpfs_map_anode(s, a, &bh))) return -1;
+	return hpfs_bplus_lookup(s, NULL, &anode->btree, sec, bh);
+}
+
+int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos,
+	    unsigned len, char *buf)
+{
+	struct buffer_head *bh;
+	char *data;
+	secno sec;
+	unsigned l;
+	while (len) {
+		if (ano) {
+			if ((sec = anode_lookup(s, a, pos >> 9)) == -1)
+				return -1;
+		} else sec = a + (pos >> 9);
+		if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #1")) return -1;
+		if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9)))
+			return -1;
+		l = 0x200 - (pos & 0x1ff); if (l > len) l = len;
+		memcpy(buf, data + (pos & 0x1ff), l);
+		brelse(bh);
+		buf += l; pos += l; len -= l;
+	}
+	return 0;
+}
+
+int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos,
+	     unsigned len, const char *buf)
+{
+	struct buffer_head *bh;
+	char *data;
+	secno sec;
+	unsigned l;
+	while (len) {
+		if (ano) {
+			if ((sec = anode_lookup(s, a, pos >> 9)) == -1)
+				return -1;
+		} else sec = a + (pos >> 9);
+		if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #2")) return -1;
+		if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9)))
+			return -1;
+		l = 0x200 - (pos & 0x1ff); if (l > len) l = len;
+		memcpy(data + (pos & 0x1ff), buf, l);
+		mark_buffer_dirty(bh);
+		brelse(bh);
+		buf += l; pos += l; len -= l;
+	}
+	return 0;
+}
+
+void hpfs_ea_remove(struct super_block *s, secno a, int ano, unsigned len)
+{
+	struct anode *anode;
+	struct buffer_head *bh;
+	if (ano) {
+		if (!(anode = hpfs_map_anode(s, a, &bh))) return;
+		hpfs_remove_btree(s, &anode->btree);
+		brelse(bh);
+		hpfs_free_sectors(s, a, 1);
+	} else hpfs_free_sectors(s, a, (len + 511) >> 9);
+}
+
+/* Truncate allocation tree. Doesn't join anodes - I hope it doesn't matter */
+
+void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs)
+{
+	struct fnode *fnode;
+	struct anode *anode;
+	struct buffer_head *bh;
+	struct bplus_header *btree;
+	anode_secno node = f;
+	int i, j, nodes;
+	int c1, c2 = 0;
+	if (fno) {
+		if (!(fnode = hpfs_map_fnode(s, f, &bh))) return;
+		btree = &fnode->btree;
+	} else {
+		if (!(anode = hpfs_map_anode(s, f, &bh))) return;
+		btree = &anode->btree;
+	}
+	if (!secs) {
+		hpfs_remove_btree(s, btree);
+		if (fno) {
+			btree->n_free_nodes = 8;
+			btree->n_used_nodes = 0;
+			btree->first_free = cpu_to_le16(8);
+			btree->flags &= ~BP_internal;
+			mark_buffer_dirty(bh);
+		} else hpfs_free_sectors(s, f, 1);
+		brelse(bh);
+		return;
+	}
+	while (bp_internal(btree)) {
+		nodes = btree->n_used_nodes + btree->n_free_nodes;
+		for (i = 0; i < btree->n_used_nodes; i++)
+			if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f;
+		brelse(bh);
+		hpfs_error(s, "internal btree %08x doesn't end with -1", node);
+		return;
+		f:
+		for (j = i + 1; j < btree->n_used_nodes; j++)
+			hpfs_ea_remove(s, le32_to_cpu(btree->u.internal[j].down), 1, 0);
+		btree->n_used_nodes = i + 1;
+		btree->n_free_nodes = nodes - btree->n_used_nodes;
+		btree->first_free = cpu_to_le16(8 + 8 * btree->n_used_nodes);
+		mark_buffer_dirty(bh);
+		if (btree->u.internal[i].file_secno == cpu_to_le32(secs)) {
+			brelse(bh);
+			return;
+		}
+		node = le32_to_cpu(btree->u.internal[i].down);
+		brelse(bh);
+		if (hpfs_sb(s)->sb_chk)
+			if (hpfs_stop_cycles(s, node, &c1, &c2, "hpfs_truncate_btree"))
+				return;
+		if (!(anode = hpfs_map_anode(s, node, &bh))) return;
+		btree = &anode->btree;
+	}	
+	nodes = btree->n_used_nodes + btree->n_free_nodes;
+	for (i = 0; i < btree->n_used_nodes; i++)
+		if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) >= secs) goto ff;
+	brelse(bh);
+	return;
+	ff:
+	if (secs <= le32_to_cpu(btree->u.external[i].file_secno)) {
+		hpfs_error(s, "there is an allocation error in file %08x, sector %08x", f, secs);
+		if (i) i--;
+	}
+	else if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > secs) {
+		hpfs_free_sectors(s, le32_to_cpu(btree->u.external[i].disk_secno) + secs -
+			le32_to_cpu(btree->u.external[i].file_secno), le32_to_cpu(btree->u.external[i].length)
+			- secs + le32_to_cpu(btree->u.external[i].file_secno)); /* I hope gcc optimizes this :-) */
+		btree->u.external[i].length = cpu_to_le32(secs - le32_to_cpu(btree->u.external[i].file_secno));
+	}
+	for (j = i + 1; j < btree->n_used_nodes; j++)
+		hpfs_free_sectors(s, le32_to_cpu(btree->u.external[j].disk_secno), le32_to_cpu(btree->u.external[j].length));
+	btree->n_used_nodes = i + 1;
+	btree->n_free_nodes = nodes - btree->n_used_nodes;
+	btree->first_free = cpu_to_le16(8 + 12 * btree->n_used_nodes);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+}
+
+/* Remove file or directory and it's eas - note that directory must
+   be empty when this is called. */
+
+void hpfs_remove_fnode(struct super_block *s, fnode_secno fno)
+{
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	struct extended_attribute *ea;
+	struct extended_attribute *ea_end;
+	if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return;
+	if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, &fnode->btree);
+	else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno));
+	ea_end = fnode_end_ea(fnode);
+	for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
+		if (ea_indirect(ea))
+			hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
+	hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l));
+	brelse(bh);
+	hpfs_free_sectors(s, fno, 1);
+}
diff --git a/kernel/fs/hpfs/buffer.c b/kernel/fs/hpfs/buffer.c
new file mode 100644
index 000000000..8057fe4e6
--- /dev/null
+++ b/kernel/fs/hpfs/buffer.c
@@ -0,0 +1,204 @@
+/*
+ *  linux/fs/hpfs/buffer.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  general buffer i/o
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include "hpfs_fn.h"
+
+void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
+{
+	struct buffer_head *bh;
+	struct blk_plug plug;
+
+	if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size))
+		return;
+
+	bh = sb_find_get_block(s, secno);
+	if (bh) {
+		if (buffer_uptodate(bh)) {
+			brelse(bh);
+			return;
+		}
+		brelse(bh);
+	};
+
+	blk_start_plug(&plug);
+	while (n > 0) {
+		if (unlikely(secno >= hpfs_sb(s)->sb_fs_size))
+			break;
+		sb_breadahead(s, secno);
+		secno++;
+		n--;
+	}
+	blk_finish_plug(&plug);
+}
+
+/* Map a sector into a buffer and return pointers to it and to the buffer. */
+
+void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp,
+		 int ahead)
+{
+	struct buffer_head *bh;
+
+	hpfs_lock_assert(s);
+
+	hpfs_prefetch_sectors(s, secno, ahead);
+
+	cond_resched();
+
+	*bhp = bh = sb_bread(s, secno);
+	if (bh != NULL)
+		return bh->b_data;
+	else {
+		pr_err("%s(): read error\n", __func__);
+		return NULL;
+	}
+}
+
+/* Like hpfs_map_sector but don't read anything */
+
+void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp)
+{
+	struct buffer_head *bh;
+	/*return hpfs_map_sector(s, secno, bhp, 0);*/
+
+	hpfs_lock_assert(s);
+
+	cond_resched();
+
+	if ((*bhp = bh = sb_getblk(s, secno)) != NULL) {
+		if (!buffer_uptodate(bh)) wait_on_buffer(bh);
+		set_buffer_uptodate(bh);
+		return bh->b_data;
+	} else {
+		pr_err("%s(): getblk failed\n", __func__);
+		return NULL;
+	}
+}
+
+/* Map 4 sectors into a 4buffer and return pointers to it and to the buffer. */
+
+void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh,
+		   int ahead)
+{
+	char *data;
+
+	hpfs_lock_assert(s);
+
+	cond_resched();
+
+	if (secno & 3) {
+		pr_err("%s(): unaligned read\n", __func__);
+		return NULL;
+	}
+
+	hpfs_prefetch_sectors(s, secno, 4 + ahead);
+
+	if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0;
+	if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1;
+	if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2;
+	if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3;
+
+	if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
+	    likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
+	    likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) {
+		return qbh->data = qbh->bh[0]->b_data;
+	}
+
+	qbh->data = data = kmalloc(2048, GFP_NOFS);
+	if (!data) {
+		pr_err("%s(): out of memory\n", __func__);
+		goto bail4;
+	}
+
+	memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512);
+	memcpy(data + 1 * 512, qbh->bh[1]->b_data, 512);
+	memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512);
+	memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512);
+
+	return data;
+
+ bail4:
+	brelse(qbh->bh[3]);
+ bail3:
+	brelse(qbh->bh[2]);
+ bail2:
+	brelse(qbh->bh[1]);
+ bail1:
+	brelse(qbh->bh[0]);
+ bail0:
+	return NULL;
+}
+
+/* Don't read sectors */
+
+void *hpfs_get_4sectors(struct super_block *s, unsigned secno,
+                          struct quad_buffer_head *qbh)
+{
+	cond_resched();
+
+	hpfs_lock_assert(s);
+
+	if (secno & 3) {
+		pr_err("%s(): unaligned read\n", __func__);
+		return NULL;
+	}
+
+	if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0;
+	if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1;
+	if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2;
+	if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3;
+
+	if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
+	    likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
+	    likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) {
+		return qbh->data = qbh->bh[0]->b_data;
+	}
+
+	if (!(qbh->data = kmalloc(2048, GFP_NOFS))) {
+		pr_err("%s(): out of memory\n", __func__);
+		goto bail4;
+	}
+	return qbh->data;
+
+bail4:
+	brelse(qbh->bh[3]);
+bail3:
+	brelse(qbh->bh[2]);
+bail2:
+	brelse(qbh->bh[1]);
+bail1:
+	brelse(qbh->bh[0]);
+bail0:
+	return NULL;
+}
+	
+
+void hpfs_brelse4(struct quad_buffer_head *qbh)
+{
+	if (unlikely(qbh->data != qbh->bh[0]->b_data))
+		kfree(qbh->data);
+	brelse(qbh->bh[0]);
+	brelse(qbh->bh[1]);
+	brelse(qbh->bh[2]);
+	brelse(qbh->bh[3]);
+}	
+
+void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh)
+{
+	if (unlikely(qbh->data != qbh->bh[0]->b_data)) {
+		memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512);
+		memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512);
+		memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512);
+		memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512);
+	}
+	mark_buffer_dirty(qbh->bh[0]);
+	mark_buffer_dirty(qbh->bh[1]);
+	mark_buffer_dirty(qbh->bh[2]);
+	mark_buffer_dirty(qbh->bh[3]);
+}
diff --git a/kernel/fs/hpfs/dentry.c b/kernel/fs/hpfs/dentry.c
new file mode 100644
index 000000000..fa27980f2
--- /dev/null
+++ b/kernel/fs/hpfs/dentry.c
@@ -0,0 +1,61 @@
+/*
+ *  linux/fs/hpfs/dentry.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  dcache operations
+ */
+
+#include "hpfs_fn.h"
+
+/*
+ * Note: the dentry argument is the parent dentry.
+ */
+
+static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
+{
+	unsigned long	 hash;
+	int		 i;
+	unsigned l = qstr->len;
+
+	if (l == 1) if (qstr->name[0]=='.') goto x;
+	if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x;
+	hpfs_adjust_length(qstr->name, &l);
+	/*if (hpfs_chk_name(qstr->name,&l))*/
+		/*return -ENAMETOOLONG;*/
+		/*return -ENOENT;*/
+	x:
+
+	hash = init_name_hash();
+	for (i = 0; i < l; i++)
+		hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash);
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	unsigned al = len;
+	unsigned bl = name->len;
+
+	hpfs_adjust_length(str, &al);
+	/*hpfs_adjust_length(b->name, &bl);*/
+
+	/*
+	 * 'str' is the nane of an already existing dentry, so the name
+	 * must be valid. 'name' must be validated first.
+	 */
+
+	if (hpfs_chk_name(name->name, &bl))
+		return 1;
+	if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
+		return 1;
+	return 0;
+}
+
+const struct dentry_operations hpfs_dentry_operations = {
+	.d_hash		= hpfs_hash_dentry,
+	.d_compare	= hpfs_compare_dentry,
+};
diff --git a/kernel/fs/hpfs/dir.c b/kernel/fs/hpfs/dir.c
new file mode 100644
index 000000000..2a8e07425
--- /dev/null
+++ b/kernel/fs/hpfs/dir.c
@@ -0,0 +1,330 @@
+/*
+ *  linux/fs/hpfs/dir.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  directory VFS functions
+ */
+
+#include <linux/slab.h>
+#include "hpfs_fn.h"
+
+static int hpfs_dir_release(struct inode *inode, struct file *filp)
+{
+	hpfs_lock(inode->i_sb);
+	hpfs_del_pos(inode, &filp->f_pos);
+	/*hpfs_write_if_changed(inode);*/
+	hpfs_unlock(inode->i_sb);
+	return 0;
+}
+
+/* This is slow, but it's not used often */
+
+static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
+{
+	loff_t new_off = off + (whence == 1 ? filp->f_pos : 0);
+	loff_t pos;
+	struct quad_buffer_head qbh;
+	struct inode *i = file_inode(filp);
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
+	struct super_block *s = i->i_sb;
+
+	/* Somebody else will have to figure out what to do here */
+	if (whence == SEEK_DATA || whence == SEEK_HOLE)
+		return -EINVAL;
+
+	mutex_lock(&i->i_mutex);
+	hpfs_lock(s);
+
+	/*pr_info("dir lseek\n");*/
+	if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
+	pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
+	while (pos != new_off) {
+		if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
+		else goto fail;
+		if (pos == 12) goto fail;
+	}
+	hpfs_add_pos(i, &filp->f_pos);
+ok:
+	filp->f_pos = new_off;
+	hpfs_unlock(s);
+	mutex_unlock(&i->i_mutex);
+	return new_off;
+fail:
+	/*pr_warn("illegal lseek: %016llx\n", new_off);*/
+	hpfs_unlock(s);
+	mutex_unlock(&i->i_mutex);
+	return -ESPIPE;
+}
+
+static int hpfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
+	struct quad_buffer_head qbh;
+	struct hpfs_dirent *de;
+	int lc;
+	loff_t next_pos;
+	unsigned char *tempname;
+	int c1, c2 = 0;
+	int ret = 0;
+
+	hpfs_lock(inode->i_sb);
+
+	if (hpfs_sb(inode->i_sb)->sb_chk) {
+		if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) {
+			ret = -EFSERROR;
+			goto out;
+		}
+		if (hpfs_chk_sectors(inode->i_sb, hpfs_inode->i_dno, 4, "dir_dnode")) {
+			ret = -EFSERROR;
+			goto out;
+		}
+	}
+	if (hpfs_sb(inode->i_sb)->sb_chk >= 2) {
+		struct buffer_head *bh;
+		struct fnode *fno;
+		int e = 0;
+		if (!(fno = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) {
+			ret = -EIOERROR;
+			goto out;
+		}
+		if (!fnode_is_dir(fno)) {
+			e = 1;
+			hpfs_error(inode->i_sb, "not a directory, fnode %08lx",
+					(unsigned long)inode->i_ino);
+		}
+		if (hpfs_inode->i_dno != le32_to_cpu(fno->u.external[0].disk_secno)) {
+			e = 1;
+			hpfs_error(inode->i_sb, "corrupted inode: i_dno == %08x, fnode -> dnode == %08x", hpfs_inode->i_dno, le32_to_cpu(fno->u.external[0].disk_secno));
+		}
+		brelse(bh);
+		if (e) {
+			ret = -EFSERROR;
+			goto out;
+		}
+	}
+	lc = hpfs_sb(inode->i_sb)->sb_lowercase;
+	if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */
+		ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */
+		goto out;
+	}
+	if (ctx->pos == 13) {
+		ret = -ENOENT;
+		goto out;
+	}
+	
+	while (1) {
+		again:
+		/* This won't work when cycle is longer than number of dirents
+		   accepted by filldir, but what can I do?
+		   maybe killall -9 ls helps */
+		if (hpfs_sb(inode->i_sb)->sb_chk)
+			if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) {
+				ret = -EFSERROR;
+				goto out;
+			}
+		if (ctx->pos == 12)
+			goto out;
+		if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
+			pr_err("pos==%d\n", (int)ctx->pos);
+			goto out;
+		}
+		if (ctx->pos == 0) {
+			if (!dir_emit_dot(file, ctx))
+				goto out;
+			ctx->pos = 11;
+		}
+		if (ctx->pos == 11) {
+			if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR))
+				goto out;
+			ctx->pos = 1;
+		}
+		if (ctx->pos == 1) {
+			ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
+			hpfs_add_pos(inode, &file->f_pos);
+			file->f_version = inode->i_version;
+		}
+		next_pos = ctx->pos;
+		if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
+			ctx->pos = next_pos;
+			ret = -EIOERROR;
+			goto out;
+		}
+		if (de->first || de->last) {
+			if (hpfs_sb(inode->i_sb)->sb_chk) {
+				if (de->first && !de->last && (de->namelen != 2
+				    || de ->name[0] != 1 || de->name[1] != 1))
+					hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos);
+				if (de->last && (de->namelen != 1 || de ->name[0] != 255))
+					hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos);
+			}
+			hpfs_brelse4(&qbh);
+			ctx->pos = next_pos;
+			goto again;
+		}
+		tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
+		if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) {
+			if (tempname != de->name) kfree(tempname);
+			hpfs_brelse4(&qbh);
+			goto out;
+		}
+		ctx->pos = next_pos;
+		if (tempname != de->name) kfree(tempname);
+		hpfs_brelse4(&qbh);
+	}
+out:
+	hpfs_unlock(inode->i_sb);
+	return ret;
+}
+
+/*
+ * lookup.  Search the specified directory for the specified name, set
+ * *result to the corresponding inode.
+ *
+ * lookup uses the inode number to tell read_inode whether it is reading
+ * the inode of a directory or a file -- file ino's are odd, directory
+ * ino's are even.  read_inode avoids i/o for file inodes; everything
+ * needed is up here in the directory.  (And file fnodes are out in
+ * the boondocks.)
+ *
+ *    - M.P.: this is over, sometimes we've got to read file's fnode for eas
+ *	      inode numbers are just fnode sector numbers; iget lock is used
+ *	      to tell read_inode to read fnode or not.
+ */
+
+struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	const unsigned char *name = dentry->d_name.name;
+	unsigned len = dentry->d_name.len;
+	struct quad_buffer_head qbh;
+	struct hpfs_dirent *de;
+	ino_t ino;
+	int err;
+	struct inode *result = NULL;
+	struct hpfs_inode_info *hpfs_result;
+
+	hpfs_lock(dir->i_sb);
+	if ((err = hpfs_chk_name(name, &len))) {
+		if (err == -ENAMETOOLONG) {
+			hpfs_unlock(dir->i_sb);
+			return ERR_PTR(-ENAMETOOLONG);
+		}
+		goto end_add;
+	}
+
+	/*
+	 * '.' and '..' will never be passed here.
+	 */
+
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh);
+
+	/*
+	 * This is not really a bailout, just means file not found.
+	 */
+
+	if (!de) goto end;
+
+	/*
+	 * Get inode number, what we're after.
+	 */
+
+	ino = le32_to_cpu(de->fnode);
+
+	/*
+	 * Go find or make an inode.
+	 */
+
+	result = iget_locked(dir->i_sb, ino);
+	if (!result) {
+		hpfs_error(dir->i_sb, "hpfs_lookup: can't get inode");
+		goto bail1;
+	}
+	if (result->i_state & I_NEW) {
+		hpfs_init_inode(result);
+		if (de->directory)
+			hpfs_read_inode(result);
+		else if (le32_to_cpu(de->ea_size) && hpfs_sb(dir->i_sb)->sb_eas)
+			hpfs_read_inode(result);
+		else {
+			result->i_mode |= S_IFREG;
+			result->i_mode &= ~0111;
+			result->i_op = &hpfs_file_iops;
+			result->i_fop = &hpfs_file_ops;
+			set_nlink(result, 1);
+		}
+		unlock_new_inode(result);
+	}
+	hpfs_result = hpfs_i(result);
+	if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino;
+
+	if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) {
+		hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures");
+		goto bail1;
+	}
+
+	/*
+	 * Fill in the info from the directory if this is a newly created
+	 * inode.
+	 */
+
+	if (!result->i_ctime.tv_sec) {
+		if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date))))
+			result->i_ctime.tv_sec = 1;
+		result->i_ctime.tv_nsec = 0;
+		result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date));
+		result->i_mtime.tv_nsec = 0;
+		result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date));
+		result->i_atime.tv_nsec = 0;
+		hpfs_result->i_ea_size = le32_to_cpu(de->ea_size);
+		if (!hpfs_result->i_ea_mode && de->read_only)
+			result->i_mode &= ~0222;
+		if (!de->directory) {
+			if (result->i_size == -1) {
+				result->i_size = le32_to_cpu(de->file_size);
+				result->i_data.a_ops = &hpfs_aops;
+				hpfs_i(result)->mmu_private = result->i_size;
+			/*
+			 * i_blocks should count the fnode and any anodes.
+			 * We count 1 for the fnode and don't bother about
+			 * anodes -- the disk heads are on the directory band
+			 * and we want them to stay there.
+			 */
+				result->i_blocks = 1 + ((result->i_size + 511) >> 9);
+			}
+		}
+	}
+
+	hpfs_brelse4(&qbh);
+
+	/*
+	 * Made it.
+	 */
+
+	end:
+	end_add:
+	hpfs_unlock(dir->i_sb);
+	d_add(dentry, result);
+	return NULL;
+
+	/*
+	 * Didn't.
+	 */
+	bail1:
+	
+	hpfs_brelse4(&qbh);
+	
+	/*bail:*/
+
+	hpfs_unlock(dir->i_sb);
+	return ERR_PTR(-ENOENT);
+}
+
+const struct file_operations hpfs_dir_ops =
+{
+	.llseek		= hpfs_dir_lseek,
+	.read		= generic_read_dir,
+	.iterate	= hpfs_readdir,
+	.release	= hpfs_dir_release,
+	.fsync		= hpfs_file_fsync,
+};
diff --git a/kernel/fs/hpfs/dnode.c b/kernel/fs/hpfs/dnode.c
new file mode 100644
index 000000000..2923a7bd8
--- /dev/null
+++ b/kernel/fs/hpfs/dnode.c
@@ -0,0 +1,1093 @@
+/*
+ *  linux/fs/hpfs/dnode.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  handling directory dnode tree - adding, deleteing & searching for dirents
+ */
+
+#include "hpfs_fn.h"
+
+static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde)
+{
+	struct hpfs_dirent *de;
+	struct hpfs_dirent *de_end = dnode_end_de(d);
+	int i = 1;
+	for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) {
+		if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i;
+		i++;
+	}
+	pr_info("%s(): not_found\n", __func__);
+	return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1;
+}
+
+void hpfs_add_pos(struct inode *inode, loff_t *pos)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
+	int i = 0;
+	loff_t **ppos;
+
+	if (hpfs_inode->i_rddir_off)
+		for (; hpfs_inode->i_rddir_off[i]; i++)
+			if (hpfs_inode->i_rddir_off[i] == pos) return;
+	if (!(i&0x0f)) {
+		if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) {
+			pr_err("out of memory for position list\n");
+			return;
+		}
+		if (hpfs_inode->i_rddir_off) {
+			memcpy(ppos, hpfs_inode->i_rddir_off, i * sizeof(loff_t));
+			kfree(hpfs_inode->i_rddir_off);
+		}
+		hpfs_inode->i_rddir_off = ppos;
+	}
+	hpfs_inode->i_rddir_off[i] = pos;
+	hpfs_inode->i_rddir_off[i + 1] = NULL;
+}
+
+void hpfs_del_pos(struct inode *inode, loff_t *pos)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
+	loff_t **i, **j;
+
+	if (!hpfs_inode->i_rddir_off) goto not_f;
+	for (i = hpfs_inode->i_rddir_off; *i; i++) if (*i == pos) goto fnd;
+	goto not_f;
+	fnd:
+	for (j = i + 1; *j; j++) ;
+	*i = *(j - 1);
+	*(j - 1) = NULL;
+	if (j - 1 == hpfs_inode->i_rddir_off) {
+		kfree(hpfs_inode->i_rddir_off);
+		hpfs_inode->i_rddir_off = NULL;
+	}
+	return;
+	not_f:
+	/*pr_warn("position pointer %p->%08x not found\n",
+		  pos, (int)*pos);*/
+	return;
+}
+
+static void for_all_poss(struct inode *inode, void (*f)(loff_t *, loff_t, loff_t),
+			 loff_t p1, loff_t p2)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
+	loff_t **i;
+
+	if (!hpfs_inode->i_rddir_off) return;
+	for (i = hpfs_inode->i_rddir_off; *i; i++) (*f)(*i, p1, p2);
+	return;
+}
+
+static void hpfs_pos_subst(loff_t *p, loff_t f, loff_t t)
+{
+	if (*p == f) *p = t;
+}
+
+/*void hpfs_hpfs_pos_substd(loff_t *p, loff_t f, loff_t t)
+{
+	if ((*p & ~0x3f) == (f & ~0x3f)) *p = (t & ~0x3f) | (*p & 0x3f);
+}*/
+
+static void hpfs_pos_ins(loff_t *p, loff_t d, loff_t c)
+{
+	if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) {
+		int n = (*p & 0x3f) + c;
+		if (n > 0x3f)
+			pr_err("%s(): %08x + %d\n",
+				__func__, (int)*p, (int)c >> 8);
+		else
+			*p = (*p & ~0x3f) | n;
+	}
+}
+
+static void hpfs_pos_del(loff_t *p, loff_t d, loff_t c)
+{
+	if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) {
+		int n = (*p & 0x3f) - c;
+		if (n < 1)
+			pr_err("%s(): %08x - %d\n",
+				__func__, (int)*p, (int)c >> 8);
+		else
+			*p = (*p & ~0x3f) | n;
+	}
+}
+
+static struct hpfs_dirent *dnode_pre_last_de(struct dnode *d)
+{
+	struct hpfs_dirent *de, *de_end, *dee = NULL, *deee = NULL;
+	de_end = dnode_end_de(d);
+	for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) {
+		deee = dee; dee = de;
+	}	
+	return deee;
+}
+
+static struct hpfs_dirent *dnode_last_de(struct dnode *d)
+{
+	struct hpfs_dirent *de, *de_end, *dee = NULL;
+	de_end = dnode_end_de(d);
+	for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) {
+		dee = de;
+	}	
+	return dee;
+}
+
+static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno ptr)
+{
+	struct hpfs_dirent *de;
+	if (!(de = dnode_last_de(d))) {
+		hpfs_error(s, "set_last_pointer: empty dnode %08x", le32_to_cpu(d->self));
+		return;
+	}
+	if (hpfs_sb(s)->sb_chk) {
+		if (de->down) {
+			hpfs_error(s, "set_last_pointer: dnode %08x has already last pointer %08x",
+				le32_to_cpu(d->self), de_down_pointer(de));
+			return;
+		}
+		if (le16_to_cpu(de->length) != 32) {
+			hpfs_error(s, "set_last_pointer: bad last dirent in dnode %08x", le32_to_cpu(d->self));
+			return;
+		}
+	}
+	if (ptr) {
+		le32_add_cpu(&d->first_free, 4);
+		if (le32_to_cpu(d->first_free) > 2048) {
+			hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self));
+			le32_add_cpu(&d->first_free, -4);
+			return;
+		}
+		de->length = cpu_to_le16(36);
+		de->down = 1;
+		*(__le32 *)((char *)de + 32) = cpu_to_le32(ptr);
+	}
+}
+
+/* Add an entry to dnode and don't care if it grows over 2048 bytes */
+
+struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
+				const unsigned char *name,
+				unsigned namelen, secno down_ptr)
+{
+	struct hpfs_dirent *de;
+	struct hpfs_dirent *de_end = dnode_end_de(d);
+	unsigned d_size = de_size(namelen, down_ptr);
+	for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) {
+		int c = hpfs_compare_names(s, name, namelen, de->name, de->namelen, de->last);
+		if (!c) {
+			hpfs_error(s, "name (%c,%d) already exists in dnode %08x", *name, namelen, le32_to_cpu(d->self));
+			return NULL;
+		}
+		if (c < 0) break;
+	}
+	memmove((char *)de + d_size, de, (char *)de_end - (char *)de);
+	memset(de, 0, d_size);
+	if (down_ptr) {
+		*(__le32 *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr);
+		de->down = 1;
+	}
+	de->length = cpu_to_le16(d_size);
+	de->not_8x3 = hpfs_is_name_long(name, namelen);
+	de->namelen = namelen;
+	memcpy(de->name, name, namelen);
+	le32_add_cpu(&d->first_free, d_size);
+	return de;
+}
+
+/* Delete dirent and don't care about its subtree */
+
+static void hpfs_delete_de(struct super_block *s, struct dnode *d,
+			   struct hpfs_dirent *de)
+{
+	if (de->last) {
+		hpfs_error(s, "attempt to delete last dirent in dnode %08x", le32_to_cpu(d->self));
+		return;
+	}
+	d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - le16_to_cpu(de->length));
+	memmove(de, de_next_de(de), le32_to_cpu(d->first_free) + (char *)d - (char *)de);
+}
+
+static void fix_up_ptrs(struct super_block *s, struct dnode *d)
+{
+	struct hpfs_dirent *de;
+	struct hpfs_dirent *de_end = dnode_end_de(d);
+	dnode_secno dno = le32_to_cpu(d->self);
+	for (de = dnode_first_de(d); de < de_end; de = de_next_de(de))
+		if (de->down) {
+			struct quad_buffer_head qbh;
+			struct dnode *dd;
+			if ((dd = hpfs_map_dnode(s, de_down_pointer(de), &qbh))) {
+				if (le32_to_cpu(dd->up) != dno || dd->root_dnode) {
+					dd->up = cpu_to_le32(dno);
+					dd->root_dnode = 0;
+					hpfs_mark_4buffers_dirty(&qbh);
+				}
+				hpfs_brelse4(&qbh);
+			}
+		}
+}
+
+/* Add an entry to dnode and do dnode splitting if required */
+
+static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
+			     const unsigned char *name, unsigned namelen,
+			     struct hpfs_dirent *new_de, dnode_secno down_ptr)
+{
+	struct quad_buffer_head qbh, qbh1, qbh2;
+	struct dnode *d, *ad, *rd, *nd = NULL;
+	dnode_secno adno, rdno;
+	struct hpfs_dirent *de;
+	struct hpfs_dirent nde;
+	unsigned char *nname;
+	int h;
+	int pos;
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	int c1, c2 = 0;
+	if (!(nname = kmalloc(256, GFP_NOFS))) {
+		pr_err("out of memory, can't add to dnode\n");
+		return 1;
+	}
+	go_up:
+	if (namelen >= 256) {
+		hpfs_error(i->i_sb, "%s(): namelen == %d", __func__, namelen);
+		kfree(nd);
+		kfree(nname);
+		return 1;
+	}
+	if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) {
+		kfree(nd);
+		kfree(nname);
+		return 1;
+	}
+	go_up_a:
+	if (hpfs_sb(i->i_sb)->sb_chk)
+		if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_to_dnode")) {
+			hpfs_brelse4(&qbh);
+			kfree(nd);
+			kfree(nname);
+			return 1;
+		}
+	if (le32_to_cpu(d->first_free) + de_size(namelen, down_ptr) <= 2048) {
+		loff_t t;
+		copy_de(de=hpfs_add_de(i->i_sb, d, name, namelen, down_ptr), new_de);
+		t = get_pos(d, de);
+		for_all_poss(i, hpfs_pos_ins, t, 1);
+		for_all_poss(i, hpfs_pos_subst, 4, t);
+		for_all_poss(i, hpfs_pos_subst, 5, t + 1);
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+		kfree(nd);
+		kfree(nname);
+		return 0;
+	}
+	if (!nd) if (!(nd = kmalloc(0x924, GFP_NOFS))) {
+		/* 0x924 is a max size of dnode after adding a dirent with
+		   max name length. We alloc this only once. There must
+		   not be any error while splitting dnodes, otherwise the
+		   whole directory, not only file we're adding, would
+		   be lost. */
+		pr_err("out of memory for dnode splitting\n");
+		hpfs_brelse4(&qbh);
+		kfree(nname);
+		return 1;
+	}	
+	memcpy(nd, d, le32_to_cpu(d->first_free));
+	copy_de(de = hpfs_add_de(i->i_sb, nd, name, namelen, down_ptr), new_de);
+	for_all_poss(i, hpfs_pos_ins, get_pos(nd, de), 1);
+	h = ((char *)dnode_last_de(nd) - (char *)nd) / 2 + 10;
+	if (!(ad = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &adno, &qbh1))) {
+		hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted");
+		hpfs_brelse4(&qbh);
+		kfree(nd);
+		kfree(nname);
+		return 1;
+	}
+	i->i_size += 2048;
+	i->i_blocks += 4;
+	pos = 1;
+	for (de = dnode_first_de(nd); (char *)de_next_de(de) - (char *)nd < h; de = de_next_de(de)) {
+		copy_de(hpfs_add_de(i->i_sb, ad, de->name, de->namelen, de->down ? de_down_pointer(de) : 0), de);
+		for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, ((loff_t)adno << 4) | pos);
+		pos++;
+	}
+	copy_de(new_de = &nde, de);
+	memcpy(nname, de->name, de->namelen);
+	name = nname;
+	namelen = de->namelen;
+	for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4);
+	down_ptr = adno;
+	set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
+	de = de_next_de(de);
+	memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de);
+	le32_add_cpu(&nd->first_free, -((char *)de - (char *)nd - 20));
+	memcpy(d, nd, le32_to_cpu(nd->first_free));
+	for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos);
+	fix_up_ptrs(i->i_sb, ad);
+	if (!d->root_dnode) {
+		ad->up = d->up;
+		dno = le32_to_cpu(ad->up);
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+		hpfs_mark_4buffers_dirty(&qbh1);
+		hpfs_brelse4(&qbh1);
+		goto go_up;
+	}
+	if (!(rd = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &rdno, &qbh2))) {
+		hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted");
+		hpfs_brelse4(&qbh);
+		hpfs_brelse4(&qbh1);
+		kfree(nd);
+		kfree(nname);
+		return 1;
+	}
+	i->i_size += 2048;
+	i->i_blocks += 4;
+	rd->root_dnode = 1;
+	rd->up = d->up;
+	if (!(fnode = hpfs_map_fnode(i->i_sb, le32_to_cpu(d->up), &bh))) {
+		hpfs_free_dnode(i->i_sb, rdno);
+		hpfs_brelse4(&qbh);
+		hpfs_brelse4(&qbh1);
+		hpfs_brelse4(&qbh2);
+		kfree(nd);
+		kfree(nname);
+		return 1;
+	}
+	fnode->u.external[0].disk_secno = cpu_to_le32(rdno);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+	hpfs_i(i)->i_dno = rdno;
+	d->up = ad->up = cpu_to_le32(rdno);
+	d->root_dnode = ad->root_dnode = 0;
+	hpfs_mark_4buffers_dirty(&qbh);
+	hpfs_brelse4(&qbh);
+	hpfs_mark_4buffers_dirty(&qbh1);
+	hpfs_brelse4(&qbh1);
+	qbh = qbh2;
+	set_last_pointer(i->i_sb, rd, dno);
+	dno = rdno;
+	d = rd;
+	goto go_up_a;
+}
+
+/*
+ * Add an entry to directory btree.
+ * I hate such crazy directory structure.
+ * It's easy to read but terrible to write.
+ * I wrote this directory code 4 times.
+ * I hope, now it's finally bug-free.
+ */
+
+int hpfs_add_dirent(struct inode *i,
+		    const unsigned char *name, unsigned namelen,
+		    struct hpfs_dirent *new_de)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
+	struct dnode *d;
+	struct hpfs_dirent *de, *de_end;
+	struct quad_buffer_head qbh;
+	dnode_secno dno;
+	int c;
+	int c1, c2 = 0;
+	dno = hpfs_inode->i_dno;
+	down:
+	if (hpfs_sb(i->i_sb)->sb_chk)
+		if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_dirent")) return 1;
+	if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 1;
+	de_end = dnode_end_de(d);
+	for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) {
+		if (!(c = hpfs_compare_names(i->i_sb, name, namelen, de->name, de->namelen, de->last))) {
+			hpfs_brelse4(&qbh);
+			return -1;
+		}	
+		if (c < 0) {
+			if (de->down) {
+				dno = de_down_pointer(de);
+				hpfs_brelse4(&qbh);
+				goto down;
+			}
+			break;
+		}
+	}
+	hpfs_brelse4(&qbh);
+	if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_ADD)) {
+		c = 1;
+		goto ret;
+	}	
+	i->i_version++;
+	c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0);
+	ret:
+	return c;
+}
+
+/* 
+ * Find dirent with higher name in 'from' subtree and move it to 'to' dnode.
+ * Return the dnode we moved from (to be checked later if it's empty)
+ */
+
+static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to)
+{
+	dnode_secno dno, ddno;
+	dnode_secno chk_up = to;
+	struct dnode *dnode;
+	struct quad_buffer_head qbh;
+	struct hpfs_dirent *de, *nde;
+	int a;
+	loff_t t;
+	int c1, c2 = 0;
+	dno = from;
+	while (1) {
+		if (hpfs_sb(i->i_sb)->sb_chk)
+			if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "move_to_top"))
+				return 0;
+		if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 0;
+		if (hpfs_sb(i->i_sb)->sb_chk) {
+			if (le32_to_cpu(dnode->up) != chk_up) {
+				hpfs_error(i->i_sb, "move_to_top: up pointer from %08x should be %08x, is %08x",
+					dno, chk_up, le32_to_cpu(dnode->up));
+				hpfs_brelse4(&qbh);
+				return 0;
+			}
+			chk_up = dno;
+		}
+		if (!(de = dnode_last_de(dnode))) {
+			hpfs_error(i->i_sb, "move_to_top: dnode %08x has no last de", dno);
+			hpfs_brelse4(&qbh);
+			return 0;
+		}
+		if (!de->down) break;
+		dno = de_down_pointer(de);
+		hpfs_brelse4(&qbh);
+	}
+	while (!(de = dnode_pre_last_de(dnode))) {
+		dnode_secno up = le32_to_cpu(dnode->up);
+		hpfs_brelse4(&qbh);
+		hpfs_free_dnode(i->i_sb, dno);
+		i->i_size -= 2048;
+		i->i_blocks -= 4;
+		for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, 5);
+		if (up == to) return to;
+		if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return 0;
+		if (dnode->root_dnode) {
+			hpfs_error(i->i_sb, "move_to_top: got to root_dnode while moving from %08x to %08x", from, to);
+			hpfs_brelse4(&qbh);
+			return 0;
+		}
+		de = dnode_last_de(dnode);
+		if (!de || !de->down) {
+			hpfs_error(i->i_sb, "move_to_top: dnode %08x doesn't point down to %08x", up, dno);
+			hpfs_brelse4(&qbh);
+			return 0;
+		}
+		le32_add_cpu(&dnode->first_free, -4);
+		le16_add_cpu(&de->length, -4);
+		de->down = 0;
+		hpfs_mark_4buffers_dirty(&qbh);
+		dno = up;
+	}
+	t = get_pos(dnode, de);
+	for_all_poss(i, hpfs_pos_subst, t, 4);
+	for_all_poss(i, hpfs_pos_subst, t + 1, 5);
+	if (!(nde = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) {
+		hpfs_error(i->i_sb, "out of memory for dirent - directory will be corrupted");
+		hpfs_brelse4(&qbh);
+		return 0;
+	}
+	memcpy(nde, de, le16_to_cpu(de->length));
+	ddno = de->down ? de_down_pointer(de) : 0;
+	hpfs_delete_de(i->i_sb, dnode, de);
+	set_last_pointer(i->i_sb, dnode, ddno);
+	hpfs_mark_4buffers_dirty(&qbh);
+	hpfs_brelse4(&qbh);
+	a = hpfs_add_to_dnode(i, to, nde->name, nde->namelen, nde, from);
+	kfree(nde);
+	if (a) return 0;
+	return dno;
+}
+
+/* 
+ * Check if a dnode is empty and delete it from the tree
+ * (chkdsk doesn't like empty dnodes)
+ */
+
+static void delete_empty_dnode(struct inode *i, dnode_secno dno)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
+	struct quad_buffer_head qbh;
+	struct dnode *dnode;
+	dnode_secno down, up, ndown;
+	int p;
+	struct hpfs_dirent *de;
+	int c1, c2 = 0;
+	try_it_again:
+	if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "delete_empty_dnode")) return;
+	if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return;
+	if (le32_to_cpu(dnode->first_free) > 56) goto end;
+	if (le32_to_cpu(dnode->first_free) == 52 || le32_to_cpu(dnode->first_free) == 56) {
+		struct hpfs_dirent *de_end;
+		int root = dnode->root_dnode;
+		up = le32_to_cpu(dnode->up);
+		de = dnode_first_de(dnode);
+		down = de->down ? de_down_pointer(de) : 0;
+		if (hpfs_sb(i->i_sb)->sb_chk) if (root && !down) {
+			hpfs_error(i->i_sb, "delete_empty_dnode: root dnode %08x is empty", dno);
+			goto end;
+		}
+		hpfs_brelse4(&qbh);
+		hpfs_free_dnode(i->i_sb, dno);
+		i->i_size -= 2048;
+		i->i_blocks -= 4;
+		if (root) {
+			struct fnode *fnode;
+			struct buffer_head *bh;
+			struct dnode *d1;
+			struct quad_buffer_head qbh1;
+			if (hpfs_sb(i->i_sb)->sb_chk)
+				if (up != i->i_ino) {
+					hpfs_error(i->i_sb,
+						   "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx",
+						   dno, up,
+						   (unsigned long)i->i_ino);
+					return;
+				}
+			if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) {
+				d1->up = cpu_to_le32(up);
+				d1->root_dnode = 1;
+				hpfs_mark_4buffers_dirty(&qbh1);
+				hpfs_brelse4(&qbh1);
+			}
+			if ((fnode = hpfs_map_fnode(i->i_sb, up, &bh))) {
+				fnode->u.external[0].disk_secno = cpu_to_le32(down);
+				mark_buffer_dirty(bh);
+				brelse(bh);
+			}
+			hpfs_inode->i_dno = down;
+			for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, (loff_t) 12);
+			return;
+		}
+		if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return;
+		p = 1;
+		de_end = dnode_end_de(dnode);
+		for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de), p++)
+			if (de->down) if (de_down_pointer(de) == dno) goto fnd;
+		hpfs_error(i->i_sb, "delete_empty_dnode: pointer to dnode %08x not found in dnode %08x", dno, up);
+		goto end;
+		fnd:
+		for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p);
+		if (!down) {
+			de->down = 0;
+			le16_add_cpu(&de->length, -4);
+			le32_add_cpu(&dnode->first_free, -4);
+			memmove(de_next_de(de), (char *)de_next_de(de) + 4,
+				(char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de));
+		} else {
+			struct dnode *d1;
+			struct quad_buffer_head qbh1;
+			*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4) = down;
+			if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) {
+				d1->up = cpu_to_le32(up);
+				hpfs_mark_4buffers_dirty(&qbh1);
+				hpfs_brelse4(&qbh1);
+			}
+		}
+	} else {
+		hpfs_error(i->i_sb, "delete_empty_dnode: dnode %08x, first_free == %03x", dno, le32_to_cpu(dnode->first_free));
+		goto end;
+	}
+
+	if (!de->last) {
+		struct hpfs_dirent *de_next = de_next_de(de);
+		struct hpfs_dirent *de_cp;
+		struct dnode *d1;
+		struct quad_buffer_head qbh1;
+		if (!de_next->down) goto endm;
+		ndown = de_down_pointer(de_next);
+		if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) {
+			pr_err("out of memory for dtree balancing\n");
+			goto endm;
+		}
+		memcpy(de_cp, de, le16_to_cpu(de->length));
+		hpfs_delete_de(i->i_sb, dnode, de);
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+		for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, 4);
+		for_all_poss(i, hpfs_pos_del, ((loff_t)up << 4) | p, 1);
+		if (de_cp->down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de_cp), &qbh1))) {
+			d1->up = cpu_to_le32(ndown);
+			hpfs_mark_4buffers_dirty(&qbh1);
+			hpfs_brelse4(&qbh1);
+		}
+		hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, de_cp->down ? de_down_pointer(de_cp) : 0);
+		/*pr_info("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n",
+		  up, ndown, down, dno);*/
+		dno = up;
+		kfree(de_cp);
+		goto try_it_again;
+	} else {
+		struct hpfs_dirent *de_prev = dnode_pre_last_de(dnode);
+		struct hpfs_dirent *de_cp;
+		struct dnode *d1;
+		struct quad_buffer_head qbh1;
+		dnode_secno dlp;
+		if (!de_prev) {
+			hpfs_error(i->i_sb, "delete_empty_dnode: empty dnode %08x", up);
+			hpfs_mark_4buffers_dirty(&qbh);
+			hpfs_brelse4(&qbh);
+			dno = up;
+			goto try_it_again;
+		}
+		if (!de_prev->down) goto endm;
+		ndown = de_down_pointer(de_prev);
+		if ((d1 = hpfs_map_dnode(i->i_sb, ndown, &qbh1))) {
+			struct hpfs_dirent *del = dnode_last_de(d1);
+			dlp = del->down ? de_down_pointer(del) : 0;
+			if (!dlp && down) {
+				if (le32_to_cpu(d1->first_free) > 2044) {
+					if (hpfs_sb(i->i_sb)->sb_chk >= 2) {
+						pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n");
+						pr_err("terminating balancing operation\n");
+					}
+					hpfs_brelse4(&qbh1);
+					goto endm;
+				}
+				if (hpfs_sb(i->i_sb)->sb_chk >= 2) {
+					pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n");
+					pr_err("goin'on\n");
+				}
+				le16_add_cpu(&del->length, 4);
+				del->down = 1;
+				le32_add_cpu(&d1->first_free, 4);
+			}
+			if (dlp && !down) {
+				le16_add_cpu(&del->length, -4);
+				del->down = 0;
+				le32_add_cpu(&d1->first_free, -4);
+			} else if (down)
+				*(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
+		} else goto endm;
+		if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) {
+			pr_err("out of memory for dtree balancing\n");
+			hpfs_brelse4(&qbh1);
+			goto endm;
+		}
+		hpfs_mark_4buffers_dirty(&qbh1);
+		hpfs_brelse4(&qbh1);
+		memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length));
+		hpfs_delete_de(i->i_sb, dnode, de_prev);
+		if (!de_prev->down) {
+			le16_add_cpu(&de_prev->length, 4);
+			de_prev->down = 1;
+			le32_add_cpu(&dnode->first_free, 4);
+		}
+		*(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+		for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4);
+		for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, ((loff_t)up << 4) | (p - 1));
+		if (down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de), &qbh1))) {
+			d1->up = cpu_to_le32(ndown);
+			hpfs_mark_4buffers_dirty(&qbh1);
+			hpfs_brelse4(&qbh1);
+		}
+		hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, dlp);
+		dno = up;
+		kfree(de_cp);
+		goto try_it_again;
+	}
+	endm:
+	hpfs_mark_4buffers_dirty(&qbh);
+	end:
+	hpfs_brelse4(&qbh);
+}
+
+
+/* Delete dirent from directory */
+
+int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de,
+		       struct quad_buffer_head *qbh, int depth)
+{
+	struct dnode *dnode = qbh->data;
+	dnode_secno down = 0;
+	loff_t t;
+	if (de->first || de->last) {
+		hpfs_error(i->i_sb, "hpfs_remove_dirent: attempt to delete first or last dirent in dnode %08x", dno);
+		hpfs_brelse4(qbh);
+		return 1;
+	}
+	if (de->down) down = de_down_pointer(de);
+	if (depth && (de->down || (de == dnode_first_de(dnode) && de_next_de(de)->last))) {
+		if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_DEL)) {
+			hpfs_brelse4(qbh);
+			return 2;
+		}
+	}
+	i->i_version++;
+	for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1);
+	hpfs_delete_de(i->i_sb, dnode, de);
+	hpfs_mark_4buffers_dirty(qbh);
+	hpfs_brelse4(qbh);
+	if (down) {
+		dnode_secno a = move_to_top(i, down, dno);
+		for_all_poss(i, hpfs_pos_subst, 5, t);
+		if (a) delete_empty_dnode(i, a);
+		return !a;
+	}
+	delete_empty_dnode(i, dno);
+	return 0;
+}
+
+void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes,
+		       int *n_subdirs, int *n_items)
+{
+	struct dnode *dnode;
+	struct quad_buffer_head qbh;
+	struct hpfs_dirent *de;
+	dnode_secno ptr, odno = 0;
+	int c1, c2 = 0;
+	int d1, d2 = 0;
+	go_down:
+	if (n_dnodes) (*n_dnodes)++;
+	if (hpfs_sb(s)->sb_chk)
+		if (hpfs_stop_cycles(s, dno, &c1, &c2, "hpfs_count_dnodes #1")) return;
+	ptr = 0;
+	go_up:
+	if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return;
+	if (hpfs_sb(s)->sb_chk) if (odno && odno != -1 && le32_to_cpu(dnode->up) != odno)
+		hpfs_error(s, "hpfs_count_dnodes: bad up pointer; dnode %08x, down %08x points to %08x", odno, dno, le32_to_cpu(dnode->up));
+	de = dnode_first_de(dnode);
+	if (ptr) while(1) {
+		if (de->down) if (de_down_pointer(de) == ptr) goto process_de;
+		if (de->last) {
+			hpfs_brelse4(&qbh);
+			hpfs_error(s, "hpfs_count_dnodes: pointer to dnode %08x not found in dnode %08x, got here from %08x",
+				ptr, dno, odno);
+			return;
+		}
+		de = de_next_de(de);
+	}
+	next_de:
+	if (de->down) {
+		odno = dno;
+		dno = de_down_pointer(de);
+		hpfs_brelse4(&qbh);
+		goto go_down;
+	}
+	process_de:
+	if (!de->first && !de->last && de->directory && n_subdirs) (*n_subdirs)++;
+	if (!de->first && !de->last && n_items) (*n_items)++;
+	if ((de = de_next_de(de)) < dnode_end_de(dnode)) goto next_de;
+	ptr = dno;
+	dno = le32_to_cpu(dnode->up);
+	if (dnode->root_dnode) {
+		hpfs_brelse4(&qbh);
+		return;
+	}
+	hpfs_brelse4(&qbh);
+	if (hpfs_sb(s)->sb_chk)
+		if (hpfs_stop_cycles(s, ptr, &d1, &d2, "hpfs_count_dnodes #2")) return;
+	odno = -1;
+	goto go_up;
+}
+
+static struct hpfs_dirent *map_nth_dirent(struct super_block *s, dnode_secno dno, int n,
+					  struct quad_buffer_head *qbh, struct dnode **dn)
+{
+	int i;
+	struct hpfs_dirent *de, *de_end;
+	struct dnode *dnode;
+	dnode = hpfs_map_dnode(s, dno, qbh);
+	if (!dnode) return NULL;
+	if (dn) *dn=dnode;
+	de = dnode_first_de(dnode);
+	de_end = dnode_end_de(dnode);
+	for (i = 1; de < de_end; i++, de = de_next_de(de)) {
+		if (i == n) {
+			return de;
+		}	
+		if (de->last) break;
+	}
+	hpfs_brelse4(qbh);
+	hpfs_error(s, "map_nth_dirent: n too high; dnode = %08x, requested %08x", dno, n);
+	return NULL;
+}
+
+dnode_secno hpfs_de_as_down_as_possible(struct super_block *s, dnode_secno dno)
+{
+	struct quad_buffer_head qbh;
+	dnode_secno d = dno;
+	dnode_secno up = 0;
+	struct hpfs_dirent *de;
+	int c1, c2 = 0;
+
+	again:
+	if (hpfs_sb(s)->sb_chk)
+		if (hpfs_stop_cycles(s, d, &c1, &c2, "hpfs_de_as_down_as_possible"))
+			return d;
+	if (!(de = map_nth_dirent(s, d, 1, &qbh, NULL))) return dno;
+	if (hpfs_sb(s)->sb_chk)
+		if (up && le32_to_cpu(((struct dnode *)qbh.data)->up) != up)
+			hpfs_error(s, "hpfs_de_as_down_as_possible: bad up pointer; dnode %08x, down %08x points to %08x", up, d, le32_to_cpu(((struct dnode *)qbh.data)->up));
+	if (!de->down) {
+		hpfs_brelse4(&qbh);
+		return d;
+	}
+	up = d;
+	d = de_down_pointer(de);
+	hpfs_brelse4(&qbh);
+	goto again;
+}
+
+struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
+				   struct quad_buffer_head *qbh)
+{
+	loff_t pos;
+	unsigned c;
+	dnode_secno dno;
+	struct hpfs_dirent *de, *d;
+	struct hpfs_dirent *up_de;
+	struct hpfs_dirent *end_up_de;
+	struct dnode *dnode;
+	struct dnode *up_dnode;
+	struct quad_buffer_head qbh0;
+
+	pos = *posp;
+	dno = pos >> 6 << 2;
+	pos &= 077;
+	if (!(de = map_nth_dirent(inode->i_sb, dno, pos, qbh, &dnode)))
+		goto bail;
+
+	/* Going to the next dirent */
+	if ((d = de_next_de(de)) < dnode_end_de(dnode)) {
+		if (!(++*posp & 077)) {
+			hpfs_error(inode->i_sb,
+				"map_pos_dirent: pos crossed dnode boundary; pos = %08llx",
+				(unsigned long long)*posp);
+			goto bail;
+		}
+		/* We're going down the tree */
+		if (d->down) {
+			*posp = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, de_down_pointer(d)) << 4) + 1;
+		}
+	
+		return de;
+	}
+
+	/* Going up */
+	if (dnode->root_dnode) goto bail;
+
+	if (!(up_dnode = hpfs_map_dnode(inode->i_sb, le32_to_cpu(dnode->up), &qbh0)))
+		goto bail;
+
+	end_up_de = dnode_end_de(up_dnode);
+	c = 0;
+	for (up_de = dnode_first_de(up_dnode); up_de < end_up_de;
+	     up_de = de_next_de(up_de)) {
+		if (!(++c & 077)) hpfs_error(inode->i_sb,
+			"map_pos_dirent: pos crossed dnode boundary; dnode = %08x", le32_to_cpu(dnode->up));
+		if (up_de->down && de_down_pointer(up_de) == dno) {
+			*posp = ((loff_t) le32_to_cpu(dnode->up) << 4) + c;
+			hpfs_brelse4(&qbh0);
+			return de;
+		}
+	}
+	
+	hpfs_error(inode->i_sb, "map_pos_dirent: pointer to dnode %08x not found in parent dnode %08x",
+		dno, le32_to_cpu(dnode->up));
+	hpfs_brelse4(&qbh0);
+	
+	bail:
+	*posp = 12;
+	return de;
+}
+
+/* Find a dirent in tree */
+
+struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno,
+			       const unsigned char *name, unsigned len,
+			       dnode_secno *dd, struct quad_buffer_head *qbh)
+{
+	struct dnode *dnode;
+	struct hpfs_dirent *de;
+	struct hpfs_dirent *de_end;
+	int c1, c2 = 0;
+
+	if (!S_ISDIR(inode->i_mode)) hpfs_error(inode->i_sb, "map_dirent: not a directory\n");
+	again:
+	if (hpfs_sb(inode->i_sb)->sb_chk)
+		if (hpfs_stop_cycles(inode->i_sb, dno, &c1, &c2, "map_dirent")) return NULL;
+	if (!(dnode = hpfs_map_dnode(inode->i_sb, dno, qbh))) return NULL;
+	
+	de_end = dnode_end_de(dnode);
+	for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de)) {
+		int t = hpfs_compare_names(inode->i_sb, name, len, de->name, de->namelen, de->last);
+		if (!t) {
+			if (dd) *dd = dno;
+			return de;
+		}
+		if (t < 0) {
+			if (de->down) {
+				dno = de_down_pointer(de);
+				hpfs_brelse4(qbh);
+				goto again;
+			}
+		break;
+		}
+	}
+	hpfs_brelse4(qbh);
+	return NULL;
+}
+
+/*
+ * Remove empty directory. In normal cases it is only one dnode with two
+ * entries, but we must handle also such obscure cases when it's a tree
+ * of empty dnodes.
+ */
+
+void hpfs_remove_dtree(struct super_block *s, dnode_secno dno)
+{
+	struct quad_buffer_head qbh;
+	struct dnode *dnode;
+	struct hpfs_dirent *de;
+	dnode_secno d1, d2, rdno = dno;
+	while (1) {
+		if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return;
+		de = dnode_first_de(dnode);
+		if (de->last) {
+			if (de->down) d1 = de_down_pointer(de);
+			else goto error;
+			hpfs_brelse4(&qbh);
+			hpfs_free_dnode(s, dno);
+			dno = d1;
+		} else break;
+	}
+	if (!de->first) goto error;
+	d1 = de->down ? de_down_pointer(de) : 0;
+	de = de_next_de(de);
+	if (!de->last) goto error;
+	d2 = de->down ? de_down_pointer(de) : 0;
+	hpfs_brelse4(&qbh);
+	hpfs_free_dnode(s, dno);
+	do {
+		while (d1) {
+			if (!(dnode = hpfs_map_dnode(s, dno = d1, &qbh))) return;
+			de = dnode_first_de(dnode);
+			if (!de->last) goto error;
+			d1 = de->down ? de_down_pointer(de) : 0;
+			hpfs_brelse4(&qbh);
+			hpfs_free_dnode(s, dno);
+		}
+		d1 = d2;
+		d2 = 0;
+	} while (d1);
+	return;
+	error:
+	hpfs_brelse4(&qbh);
+	hpfs_free_dnode(s, dno);
+	hpfs_error(s, "directory %08x is corrupted or not empty", rdno);
+}
+
+/* 
+ * Find dirent for specified fnode. Use truncated 15-char name in fnode as
+ * a help for searching.
+ */
+
+struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
+				     struct fnode *f, struct quad_buffer_head *qbh)
+{
+	unsigned char *name1;
+	unsigned char *name2;
+	int name1len, name2len;
+	struct dnode *d;
+	dnode_secno dno, downd;
+	struct fnode *upf;
+	struct buffer_head *bh;
+	struct hpfs_dirent *de, *de_end;
+	int c;
+	int c1, c2 = 0;
+	int d1, d2 = 0;
+	name1 = f->name;
+	if (!(name2 = kmalloc(256, GFP_NOFS))) {
+		pr_err("out of memory, can't map dirent\n");
+		return NULL;
+	}
+	if (f->len <= 15)
+		memcpy(name2, name1, name1len = name2len = f->len);
+	else {
+		memcpy(name2, name1, 15);
+		memset(name2 + 15, 0xff, 256 - 15);
+		/*name2[15] = 0xff;*/
+		name1len = 15; name2len = 256;
+	}
+	if (!(upf = hpfs_map_fnode(s, le32_to_cpu(f->up), &bh))) {
+		kfree(name2);
+		return NULL;
+	}	
+	if (!fnode_is_dir(upf)) {
+		brelse(bh);
+		hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up));
+		kfree(name2);
+		return NULL;
+	}
+	dno = le32_to_cpu(upf->u.external[0].disk_secno);
+	brelse(bh);
+	go_down:
+	downd = 0;
+	go_up:
+	if (!(d = hpfs_map_dnode(s, dno, qbh))) {
+		kfree(name2);
+		return NULL;
+	}
+	de_end = dnode_end_de(d);
+	de = dnode_first_de(d);
+	if (downd) {
+		while (de < de_end) {
+			if (de->down) if (de_down_pointer(de) == downd) goto f;
+			de = de_next_de(de);
+		}
+		hpfs_error(s, "pointer to dnode %08x not found in dnode %08x", downd, dno);
+		hpfs_brelse4(qbh);
+		kfree(name2);
+		return NULL;
+	}
+	next_de:
+	if (le32_to_cpu(de->fnode) == fno) {
+		kfree(name2);
+		return de;
+	}
+	c = hpfs_compare_names(s, name1, name1len, de->name, de->namelen, de->last);
+	if (c < 0 && de->down) {
+		dno = de_down_pointer(de);
+		hpfs_brelse4(qbh);
+		if (hpfs_sb(s)->sb_chk)
+			if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) {
+				kfree(name2);
+				return NULL;
+		}
+		goto go_down;
+	}
+	f:
+	if (le32_to_cpu(de->fnode) == fno) {
+		kfree(name2);
+		return de;
+	}
+	c = hpfs_compare_names(s, name2, name2len, de->name, de->namelen, de->last);
+	if (c < 0 && !de->last) goto not_found;
+	if ((de = de_next_de(de)) < de_end) goto next_de;
+	if (d->root_dnode) goto not_found;
+	downd = dno;
+	dno = le32_to_cpu(d->up);
+	hpfs_brelse4(qbh);
+	if (hpfs_sb(s)->sb_chk)
+		if (hpfs_stop_cycles(s, downd, &d1, &d2, "map_fnode_dirent #2")) {
+			kfree(name2);
+			return NULL;
+		}
+	goto go_up;
+	not_found:
+	hpfs_brelse4(qbh);
+	hpfs_error(s, "dirent for fnode %08x not found", fno);
+	kfree(name2);
+	return NULL;
+}
diff --git a/kernel/fs/hpfs/ea.c b/kernel/fs/hpfs/ea.c
new file mode 100644
index 000000000..ce3f98ba9
--- /dev/null
+++ b/kernel/fs/hpfs/ea.c
@@ -0,0 +1,367 @@
+/*
+ *  linux/fs/hpfs/ea.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  handling extended attributes
+ */
+
+#include "hpfs_fn.h"
+
+/* Remove external extended attributes. ano specifies whether a is a 
+   direct sector where eas starts or an anode */
+
+void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len)
+{
+	unsigned pos = 0;
+	while (pos < len) {
+		char ex[4 + 255 + 1 + 8];
+		struct extended_attribute *ea = (struct extended_attribute *)ex;
+		if (pos + 4 > len) {
+			hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x",
+				ano ? "anode" : "sectors", a, len);
+			return;
+		}
+		if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
+		if (ea_indirect(ea)) {
+			if (ea_valuelen(ea) != 8) {
+				hpfs_error(s, "ea_indirect(ea) set while ea->valuelen!=8, %s %08x, pos %08x",
+					ano ? "anode" : "sectors", a, pos);
+				return;
+			}
+			if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4))
+				return;
+			hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
+		}
+		pos += ea->namelen + ea_valuelen(ea) + 5;
+	}
+	if (!ano) hpfs_free_sectors(s, a, (len+511) >> 9);
+	else {
+		struct buffer_head *bh;
+		struct anode *anode;
+		if ((anode = hpfs_map_anode(s, a, &bh))) {
+			hpfs_remove_btree(s, &anode->btree);
+			brelse(bh);
+			hpfs_free_sectors(s, a, 1);
+		}
+	}
+}
+
+static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
+{
+	char *ret;
+	if (!(ret = kmalloc(size + 1, GFP_NOFS))) {
+		pr_err("out of memory for EA\n");
+		return NULL;
+	}
+	if (hpfs_ea_read(s, a, ano, 0, size, ret)) {
+		kfree(ret);
+		return NULL;
+	}
+	ret[size] = 0;
+	return ret;
+}
+
+static void set_indirect_ea(struct super_block *s, int ano, secno a,
+			    const char *data, int size)
+{
+	hpfs_ea_write(s, a, ano, 0, size, data);
+}
+
+/* Read an extended attribute named 'key' into the provided buffer */
+
+int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
+		char *buf, int size)
+{
+	unsigned pos;
+	int ano, len;
+	secno a;
+	char ex[4 + 255 + 1 + 8];
+	struct extended_attribute *ea;
+	struct extended_attribute *ea_end = fnode_end_ea(fnode);
+	for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
+		if (!strcmp(ea->name, key)) {
+			if (ea_indirect(ea))
+				goto indirect;
+			if (ea_valuelen(ea) >= size)
+				return -EINVAL;
+			memcpy(buf, ea_data(ea), ea_valuelen(ea));
+			buf[ea_valuelen(ea)] = 0;
+			return 0;
+		}
+	a = le32_to_cpu(fnode->ea_secno);
+	len = le32_to_cpu(fnode->ea_size_l);
+	ano = fnode_in_anode(fnode);
+	pos = 0;
+	while (pos < len) {
+		ea = (struct extended_attribute *)ex;
+		if (pos + 4 > len) {
+			hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x",
+				ano ? "anode" : "sectors", a, len);
+			return -EIO;
+		}
+		if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO;
+		if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
+			return -EIO;
+		if (!strcmp(ea->name, key)) {
+			if (ea_indirect(ea))
+				goto indirect;
+			if (ea_valuelen(ea) >= size)
+				return -EINVAL;
+			if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), buf))
+				return -EIO;
+			buf[ea_valuelen(ea)] = 0;
+			return 0;
+		}
+		pos += ea->namelen + ea_valuelen(ea) + 5;
+	}
+	return -ENOENT;
+indirect:
+	if (ea_len(ea) >= size)
+		return -EINVAL;
+	if (hpfs_ea_read(s, ea_sec(ea), ea_in_anode(ea), 0, ea_len(ea), buf))
+		return -EIO;
+	buf[ea_len(ea)] = 0;
+	return 0;
+}
+
+/* Read an extended attribute named 'key' */
+char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *size)
+{
+	char *ret;
+	unsigned pos;
+	int ano, len;
+	secno a;
+	struct extended_attribute *ea;
+	struct extended_attribute *ea_end = fnode_end_ea(fnode);
+	for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
+		if (!strcmp(ea->name, key)) {
+			if (ea_indirect(ea))
+				return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
+			if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
+				pr_err("out of memory for EA\n");
+				return NULL;
+			}
+			memcpy(ret, ea_data(ea), ea_valuelen(ea));
+			ret[ea_valuelen(ea)] = 0;
+			return ret;
+		}
+	a = le32_to_cpu(fnode->ea_secno);
+	len = le32_to_cpu(fnode->ea_size_l);
+	ano = fnode_in_anode(fnode);
+	pos = 0;
+	while (pos < len) {
+		char ex[4 + 255 + 1 + 8];
+		ea = (struct extended_attribute *)ex;
+		if (pos + 4 > len) {
+			hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x",
+				ano ? "anode" : "sectors", a, len);
+			return NULL;
+		}
+		if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL;
+		if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
+			return NULL;
+		if (!strcmp(ea->name, key)) {
+			if (ea_indirect(ea))
+				return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
+			if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
+				pr_err("out of memory for EA\n");
+				return NULL;
+			}
+			if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), ret)) {
+				kfree(ret);
+				return NULL;
+			}
+			ret[ea_valuelen(ea)] = 0;
+			return ret;
+		}
+		pos += ea->namelen + ea_valuelen(ea) + 5;
+	}
+	return NULL;
+}
+
+/* 
+ * Update or create extended attribute 'key' with value 'data'. Note that
+ * when this ea exists, it MUST have the same size as size of data.
+ * This driver can't change sizes of eas ('cause I just don't need it).
+ */
+
+void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
+		 const char *data, int size)
+{
+	fnode_secno fno = inode->i_ino;
+	struct super_block *s = inode->i_sb;
+	unsigned pos;
+	int ano, len;
+	secno a;
+	unsigned char h[4];
+	struct extended_attribute *ea;
+	struct extended_attribute *ea_end = fnode_end_ea(fnode);
+	for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
+		if (!strcmp(ea->name, key)) {
+			if (ea_indirect(ea)) {
+				if (ea_len(ea) == size)
+					set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
+			} else if (ea_valuelen(ea) == size) {
+				memcpy(ea_data(ea), data, size);
+			}
+			return;
+		}
+	a = le32_to_cpu(fnode->ea_secno);
+	len = le32_to_cpu(fnode->ea_size_l);
+	ano = fnode_in_anode(fnode);
+	pos = 0;
+	while (pos < len) {
+		char ex[4 + 255 + 1 + 8];
+		ea = (struct extended_attribute *)ex;
+		if (pos + 4 > len) {
+			hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x",
+				ano ? "anode" : "sectors", a, len);
+			return;
+		}
+		if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
+		if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
+			return;
+		if (!strcmp(ea->name, key)) {
+			if (ea_indirect(ea)) {
+				if (ea_len(ea) == size)
+					set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
+			}
+			else {
+				if (ea_valuelen(ea) == size)
+					hpfs_ea_write(s, a, ano, pos + 4 + ea->namelen + 1, size, data);
+			}
+			return;
+		}
+		pos += ea->namelen + ea_valuelen(ea) + 5;
+	}
+	if (!le16_to_cpu(fnode->ea_offs)) {
+		/*if (le16_to_cpu(fnode->ea_size_s)) {
+			hpfs_error(s, "fnode %08x: ea_size_s == %03x, ea_offs == 0",
+				inode->i_ino, le16_to_cpu(fnode->ea_size_s));
+			return;
+		}*/
+		fnode->ea_offs = cpu_to_le16(0xc4);
+	}
+	if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) {
+		hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x",
+			(unsigned long)inode->i_ino,
+			le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
+		return;
+	}
+	if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) &&
+	     le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5 <= 0x200) {
+		ea = fnode_end_ea(fnode);
+		*(char *)ea = 0;
+		ea->namelen = strlen(key);
+		ea->valuelen_lo = size;
+		ea->valuelen_hi = size >> 8;
+		strcpy(ea->name, key);
+		memcpy(ea_data(ea), data, size);
+		fnode->ea_size_s = cpu_to_le16(le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5);
+		goto ret;
+	}
+	/* Most the code here is 99.9993422% unused. I hope there are no bugs.
+	   But what .. HPFS.IFS has also bugs in ea management. */
+	if (le16_to_cpu(fnode->ea_size_s) && !le32_to_cpu(fnode->ea_size_l)) {
+		secno n;
+		struct buffer_head *bh;
+		char *data;
+		if (!(n = hpfs_alloc_sector(s, fno, 1, 0))) return;
+		if (!(data = hpfs_get_sector(s, n, &bh))) {
+			hpfs_free_sectors(s, n, 1);
+			return;
+		}
+		memcpy(data, fnode_ea(fnode), le16_to_cpu(fnode->ea_size_s));
+		fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s));
+		fnode->ea_size_s = cpu_to_le16(0);
+		fnode->ea_secno = cpu_to_le32(n);
+		fnode->flags &= ~FNODE_anode;
+		mark_buffer_dirty(bh);
+		brelse(bh);
+	}
+	pos = le32_to_cpu(fnode->ea_size_l) + 5 + strlen(key) + size;
+	len = (le32_to_cpu(fnode->ea_size_l) + 511) >> 9;
+	if (pos >= 30000) goto bail;
+	while (((pos + 511) >> 9) > len) {
+		if (!len) {
+			secno q = hpfs_alloc_sector(s, fno, 1, 0);
+			if (!q) goto bail;
+			fnode->ea_secno = cpu_to_le32(q);
+			fnode->flags &= ~FNODE_anode;
+			len++;
+		} else if (!fnode_in_anode(fnode)) {
+			if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) {
+				len++;
+			} else {
+				/* Aargh... don't know how to create ea anodes :-( */
+				/*struct buffer_head *bh;
+				struct anode *anode;
+				anode_secno a_s;
+				if (!(anode = hpfs_alloc_anode(s, fno, &a_s, &bh)))
+					goto bail;
+				anode->up = cpu_to_le32(fno);
+				anode->btree.fnode_parent = 1;
+				anode->btree.n_free_nodes--;
+				anode->btree.n_used_nodes++;
+				anode->btree.first_free = cpu_to_le16(le16_to_cpu(anode->btree.first_free) + 12);
+				anode->u.external[0].disk_secno = cpu_to_le32(le32_to_cpu(fnode->ea_secno));
+				anode->u.external[0].file_secno = cpu_to_le32(0);
+				anode->u.external[0].length = cpu_to_le32(len);
+				mark_buffer_dirty(bh);
+				brelse(bh);
+				fnode->flags |= FNODE_anode;
+				fnode->ea_secno = cpu_to_le32(a_s);*/
+				secno new_sec;
+				int i;
+				if (!(new_sec = hpfs_alloc_sector(s, fno, 1, 1 - ((pos + 511) >> 9))))
+					goto bail;
+				for (i = 0; i < len; i++) {
+					struct buffer_head *bh1, *bh2;
+					void *b1, *b2;
+					if (!(b1 = hpfs_map_sector(s, le32_to_cpu(fnode->ea_secno) + i, &bh1, len - i - 1))) {
+						hpfs_free_sectors(s, new_sec, (pos + 511) >> 9);
+						goto bail;
+					}
+					if (!(b2 = hpfs_get_sector(s, new_sec + i, &bh2))) {
+						brelse(bh1);
+						hpfs_free_sectors(s, new_sec, (pos + 511) >> 9);
+						goto bail;
+					}
+					memcpy(b2, b1, 512);
+					brelse(bh1);
+					mark_buffer_dirty(bh2);
+					brelse(bh2);
+				}
+				hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno), len);
+				fnode->ea_secno = cpu_to_le32(new_sec);
+				len = (pos + 511) >> 9;
+			}
+		}
+		if (fnode_in_anode(fnode)) {
+			if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno),
+						     0, len) != -1) {
+				len++;
+			} else {
+				goto bail;
+			}
+		}
+	}
+	h[0] = 0;
+	h[1] = strlen(key);
+	h[2] = size & 0xff;
+	h[3] = size >> 8;
+	if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail;
+	if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail;
+	if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail;
+	fnode->ea_size_l = cpu_to_le32(pos);
+	ret:
+	hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size;
+	return;
+	bail:
+	if (le32_to_cpu(fnode->ea_secno))
+		if (fnode_in_anode(fnode)) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9);
+		else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9));
+	else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0);
+}
+	
diff --git a/kernel/fs/hpfs/file.c b/kernel/fs/hpfs/file.c
new file mode 100644
index 000000000..6d8cfe9b5
--- /dev/null
+++ b/kernel/fs/hpfs/file.c
@@ -0,0 +1,211 @@
+/*
+ *  linux/fs/hpfs/file.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  file VFS functions
+ */
+
+#include "hpfs_fn.h"
+#include <linux/mpage.h>
+
+#define BLOCKS(size) (((size) + 511) >> 9)
+
+static int hpfs_file_release(struct inode *inode, struct file *file)
+{
+	hpfs_lock(inode->i_sb);
+	hpfs_write_if_changed(inode);
+	hpfs_unlock(inode->i_sb);
+	return 0;
+}
+
+int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	ret = filemap_write_and_wait_range(file->f_mapping, start, end);
+	if (ret)
+		return ret;
+	return sync_blockdev(inode->i_sb->s_bdev);
+}
+
+/*
+ * generic_file_read often calls bmap with non-existing sector,
+ * so we must ignore such errors.
+ */
+
+static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
+	unsigned n, disk_secno;
+	struct fnode *fnode;
+	struct buffer_head *bh;
+	if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0;
+	n = file_secno - hpfs_inode->i_file_sec;
+	if (n < hpfs_inode->i_n_secs) {
+		*n_secs = hpfs_inode->i_n_secs - n;
+		return hpfs_inode->i_disk_sec + n;
+	}
+	if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0;
+	disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh);
+	if (disk_secno == -1) return 0;
+	if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0;
+	n = file_secno - hpfs_inode->i_file_sec;
+	if (n < hpfs_inode->i_n_secs) {
+		*n_secs = hpfs_inode->i_n_secs - n;
+		return hpfs_inode->i_disk_sec + n;
+	}
+	*n_secs = 1;
+	return disk_secno;
+}
+
+void hpfs_truncate(struct inode *i)
+{
+	if (IS_IMMUTABLE(i)) return /*-EPERM*/;
+	hpfs_lock_assert(i->i_sb);
+
+	hpfs_i(i)->i_n_secs = 0;
+	i->i_blocks = 1 + ((i->i_size + 511) >> 9);
+	hpfs_i(i)->mmu_private = i->i_size;
+	hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9));
+	hpfs_write_inode(i);
+	hpfs_i(i)->i_n_secs = 0;
+}
+
+static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+{
+	int r;
+	secno s;
+	unsigned n_secs;
+	hpfs_lock(inode->i_sb);
+	s = hpfs_bmap(inode, iblock, &n_secs);
+	if (s) {
+		if (bh_result->b_size >> 9 < n_secs)
+			n_secs = bh_result->b_size >> 9;
+		map_bh(bh_result, inode->i_sb, s);
+		bh_result->b_size = n_secs << 9;
+		goto ret_0;
+	}
+	if (!create) goto ret_0;
+	if (iblock<<9 != hpfs_i(inode)->mmu_private) {
+		BUG();
+		r = -EIO;
+		goto ret_r;
+	}
+	if ((s = hpfs_add_sector_to_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1)) == -1) {
+		hpfs_truncate_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1);
+		r = -ENOSPC;
+		goto ret_r;
+	}
+	inode->i_blocks++;
+	hpfs_i(inode)->mmu_private += 512;
+	set_buffer_new(bh_result);
+	map_bh(bh_result, inode->i_sb, s);
+	ret_0:
+	r = 0;
+	ret_r:
+	hpfs_unlock(inode->i_sb);
+	return r;
+}
+
+static int hpfs_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, hpfs_get_block);
+}
+
+static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, hpfs_get_block, wbc);
+}
+
+static int hpfs_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block);
+}
+
+static int hpfs_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, hpfs_get_block);
+}
+
+static void hpfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	hpfs_lock(inode->i_sb);
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		hpfs_truncate(inode);
+	}
+
+	hpfs_unlock(inode->i_sb);
+}
+
+static int hpfs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	*pagep = NULL;
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				hpfs_get_block,
+				&hpfs_i(mapping->host)->mmu_private);
+	if (unlikely(ret))
+		hpfs_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static int hpfs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *pagep, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int err;
+	err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+	if (err < len)
+		hpfs_write_failed(mapping, pos + len);
+	if (!(err < 0)) {
+		/* make sure we write it on close, if not earlier */
+		hpfs_lock(inode->i_sb);
+		hpfs_i(inode)->i_dirty = 1;
+		hpfs_unlock(inode->i_sb);
+	}
+	return err;
+}
+
+static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,hpfs_get_block);
+}
+
+const struct address_space_operations hpfs_aops = {
+	.readpage = hpfs_readpage,
+	.writepage = hpfs_writepage,
+	.readpages = hpfs_readpages,
+	.writepages = hpfs_writepages,
+	.write_begin = hpfs_write_begin,
+	.write_end = hpfs_write_end,
+	.bmap = _hpfs_bmap
+};
+
+const struct file_operations hpfs_file_ops =
+{
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.release	= hpfs_file_release,
+	.fsync		= hpfs_file_fsync,
+	.splice_read	= generic_file_splice_read,
+};
+
+const struct inode_operations hpfs_file_iops =
+{
+	.setattr	= hpfs_setattr,
+};
diff --git a/kernel/fs/hpfs/hpfs.h b/kernel/fs/hpfs/hpfs.h
new file mode 100644
index 000000000..cce025aff
--- /dev/null
+++ b/kernel/fs/hpfs/hpfs.h
@@ -0,0 +1,559 @@
+/*
+ *  linux/fs/hpfs/hpfs.h
+ *
+ *  HPFS structures by Chris Smith, 1993
+ *
+ *  a little bit modified by Mikulas Patocka, 1998-1999
+ */
+
+/* The paper
+
+     Duncan, Roy
+     Design goals and implementation of the new High Performance File System
+     Microsoft Systems Journal  Sept 1989  v4 n5 p1(13)
+
+   describes what HPFS looked like when it was new, and it is the source
+   of most of the information given here.  The rest is conjecture.
+
+   For definitive information on the Duncan paper, see it, not this file.
+   For definitive information on HPFS, ask somebody else -- this is guesswork.
+   There are certain to be many mistakes. */
+
+#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
+#error unknown endian
+#endif
+
+/* Notation */
+
+typedef u32 secno;			/* sector number, partition relative */
+
+typedef secno dnode_secno;		/* sector number of a dnode */
+typedef secno fnode_secno;		/* sector number of an fnode */
+typedef secno anode_secno;		/* sector number of an anode */
+
+typedef u32 time32_t;		/* 32-bit time_t type */
+
+/* sector 0 */
+
+/* The boot block is very like a FAT boot block, except that the
+   29h signature byte is 28h instead, and the ID string is "HPFS". */
+
+#define BB_MAGIC 0xaa55
+
+struct hpfs_boot_block
+{
+  u8 jmp[3];
+  u8 oem_id[8];
+  u8 bytes_per_sector[2];	/* 512 */
+  u8 sectors_per_cluster;
+  u8 n_reserved_sectors[2];
+  u8 n_fats;
+  u8 n_rootdir_entries[2];
+  u8 n_sectors_s[2];
+  u8 media_byte;
+  __le16 sectors_per_fat;
+  __le16 sectors_per_track;
+  __le16 heads_per_cyl;
+  __le32 n_hidden_sectors;
+  __le32 n_sectors_l;		/* size of partition */
+  u8 drive_number;
+  u8 mbz;
+  u8 sig_28h;			/* 28h */
+  u8 vol_serno[4];
+  u8 vol_label[11];
+  u8 sig_hpfs[8];		/* "HPFS    " */
+  u8 pad[448];
+  __le16 magic;			/* aa55 */
+};
+
+
+/* sector 16 */
+
+/* The super block has the pointer to the root directory. */
+
+#define SB_MAGIC 0xf995e849
+
+struct hpfs_super_block
+{
+  __le32 magic;				/* f995 e849 */
+  __le32 magic1;			/* fa53 e9c5, more magic? */
+  u8 version;				/* version of a filesystem  usually 2 */
+  u8 funcversion;			/* functional version - oldest version
+  					   of filesystem that can understand
+					   this disk */
+  __le16 zero;				/* 0 */
+  __le32 root;				/* fnode of root directory */
+  __le32 n_sectors;			/* size of filesystem */
+  __le32 n_badblocks;			/* number of bad blocks */
+  __le32 bitmaps;			/* pointers to free space bit maps */
+  __le32 zero1;				/* 0 */
+  __le32 badblocks;			/* bad block list */
+  __le32 zero3;				/* 0 */
+  __le32 last_chkdsk;			/* date last checked, 0 if never */
+  __le32 last_optimize;			/* date last optimized, 0 if never */
+  __le32 n_dir_band;			/* number of sectors in dir band */
+  __le32 dir_band_start;			/* first sector in dir band */
+  __le32 dir_band_end;			/* last sector in dir band */
+  __le32 dir_band_bitmap;		/* free space map, 1 dnode per bit */
+  u8 volume_name[32];			/* not used */
+  __le32 user_id_table;			/* 8 preallocated sectors - user id */
+  u32 zero6[103];			/* 0 */
+};
+
+
+/* sector 17 */
+
+/* The spare block has pointers to spare sectors.  */
+
+#define SP_MAGIC 0xf9911849
+
+struct hpfs_spare_block
+{
+  __le32 magic;				/* f991 1849 */
+  __le32 magic1;				/* fa52 29c5, more magic? */
+
+#ifdef __LITTLE_ENDIAN
+  u8 dirty: 1;				/* 0 clean, 1 "improperly stopped" */
+  u8 sparedir_used: 1;			/* spare dirblks used */
+  u8 hotfixes_used: 1;			/* hotfixes used */
+  u8 bad_sector: 1;			/* bad sector, corrupted disk (???) */
+  u8 bad_bitmap: 1;			/* bad bitmap */
+  u8 fast: 1;				/* partition was fast formatted */
+  u8 old_wrote: 1;			/* old version wrote to partion */
+  u8 old_wrote_1: 1;			/* old version wrote to partion (?) */
+#else
+  u8 old_wrote_1: 1;			/* old version wrote to partion (?) */
+  u8 old_wrote: 1;			/* old version wrote to partion */
+  u8 fast: 1;				/* partition was fast formatted */
+  u8 bad_bitmap: 1;			/* bad bitmap */
+  u8 bad_sector: 1;			/* bad sector, corrupted disk (???) */
+  u8 hotfixes_used: 1;			/* hotfixes used */
+  u8 sparedir_used: 1;			/* spare dirblks used */
+  u8 dirty: 1;				/* 0 clean, 1 "improperly stopped" */
+#endif
+
+#ifdef __LITTLE_ENDIAN
+  u8 install_dasd_limits: 1;		/* HPFS386 flags */
+  u8 resynch_dasd_limits: 1;
+  u8 dasd_limits_operational: 1;
+  u8 multimedia_active: 1;
+  u8 dce_acls_active: 1;
+  u8 dasd_limits_dirty: 1;
+  u8 flag67: 2;
+#else
+  u8 flag67: 2;
+  u8 dasd_limits_dirty: 1;
+  u8 dce_acls_active: 1;
+  u8 multimedia_active: 1;
+  u8 dasd_limits_operational: 1;
+  u8 resynch_dasd_limits: 1;
+  u8 install_dasd_limits: 1;		/* HPFS386 flags */
+#endif
+
+  u8 mm_contlgulty;
+  u8 unused;
+
+  __le32 hotfix_map;			/* info about remapped bad sectors */
+  __le32 n_spares_used;			/* number of hotfixes */
+  __le32 n_spares;			/* number of spares in hotfix map */
+  __le32 n_dnode_spares_free;		/* spare dnodes unused */
+  __le32 n_dnode_spares;		/* length of spare_dnodes[] list,
+					   follows in this block*/
+  __le32 code_page_dir;			/* code page directory block */
+  __le32 n_code_pages;			/* number of code pages */
+  __le32 super_crc;			/* on HPFS386 and LAN Server this is
+  					   checksum of superblock, on normal
+					   OS/2 unused */
+  __le32 spare_crc;			/* on HPFS386 checksum of spareblock */
+  __le32 zero1[15];			/* unused */
+  __le32 spare_dnodes[100];		/* emergency free dnode list */
+  __le32 zero2[1];			/* room for more? */
+};
+
+/* The bad block list is 4 sectors long.  The first word must be zero,
+   the remaining words give n_badblocks bad block numbers.
+   I bet you can see it coming... */
+
+#define BAD_MAGIC 0
+       
+/* The hotfix map is 4 sectors long.  It looks like
+
+       secno from[n_spares];
+       secno to[n_spares];
+
+   The to[] list is initialized to point to n_spares preallocated empty
+   sectors.  The from[] list contains the sector numbers of bad blocks
+   which have been remapped to corresponding sectors in the to[] list.
+   n_spares_used gives the length of the from[] list. */
+
+
+/* Sectors 18 and 19 are preallocated and unused.
+   Maybe they're spares for 16 and 17, but simple substitution fails. */
+
+
+/* The code page info pointed to by the spare block consists of an index
+   block and blocks containing uppercasing tables.  I don't know what
+   these are for (CHKDSK, maybe?) -- OS/2 does not seem to use them
+   itself.  Linux doesn't use them either. */
+
+/* block pointed to by spareblock->code_page_dir */
+
+#define CP_DIR_MAGIC 0x494521f7
+
+struct code_page_directory
+{
+  __le32 magic;				/* 4945 21f7 */
+  __le32 n_code_pages;			/* number of pointers following */
+  __le32 zero1[2];
+  struct {
+    __le16 ix;				/* index */
+    __le16 code_page_number;		/* code page number */
+    __le32 bounds;			/* matches corresponding word
+					   in data block */
+    __le32 code_page_data;		/* sector number of a code_page_data
+					   containing c.p. array */
+    __le16 index;			/* index in c.p. array in that sector*/
+    __le16 unknown;			/* some unknown value; usually 0;
+    					   2 in Japanese version */
+  } array[31];				/* unknown length */
+};
+
+/* blocks pointed to by code_page_directory */
+
+#define CP_DATA_MAGIC 0x894521f7
+
+struct code_page_data
+{
+  __le32 magic;				/* 8945 21f7 */
+  __le32 n_used;			/* # elements used in c_p_data[] */
+  __le32 bounds[3];			/* looks a bit like
+					     (beg1,end1), (beg2,end2)
+					   one byte each */
+  __le16 offs[3];			/* offsets from start of sector
+					   to start of c_p_data[ix] */
+  struct {
+    __le16 ix;				/* index */
+    __le16 code_page_number;		/* code page number */
+    __le16 unknown;			/* the same as in cp directory */
+    u8 map[128];			/* upcase table for chars 80..ff */
+    __le16 zero2;
+  } code_page[3];
+  u8 incognita[78];
+};
+
+
+/* Free space bitmaps are 4 sectors long, which is 16384 bits.
+   16384 sectors is 8 meg, and each 8 meg band has a 4-sector bitmap.
+   Bit order in the maps is little-endian.  0 means taken, 1 means free.
+
+   Bit map sectors are marked allocated in the bit maps, and so are sectors 
+   off the end of the partition.
+
+   Band 0 is sectors 0-3fff, its map is in sectors 18-1b.
+   Band 1 is 4000-7fff, its map is in 7ffc-7fff.
+   Band 2 is 8000-ffff, its map is in 8000-8003.
+   The remaining bands have maps in their first (even) or last (odd) 4 sectors
+     -- if the last, partial, band is odd its map is in its last 4 sectors.
+
+   The bitmap locations are given in a table pointed to by the super block.
+   No doubt they aren't constrained to be at 18, 7ffc, 8000, ...; that is
+   just where they usually are.
+
+   The "directory band" is a bunch of sectors preallocated for dnodes.
+   It has a 4-sector free space bitmap of its own.  Each bit in the map
+   corresponds to one 4-sector dnode, bit 0 of the map corresponding to
+   the first 4 sectors of the directory band.  The entire band is marked
+   allocated in the main bitmap.   The super block gives the locations
+   of the directory band and its bitmap.  ("band" doesn't mean it is
+   8 meg long; it isn't.)  */
+
+
+/* dnode: directory.  4 sectors long */
+
+/* A directory is a tree of dnodes.  The fnode for a directory
+   contains one pointer, to the root dnode of the tree.  The fnode
+   never moves, the dnodes do the B-tree thing, splitting and merging
+   as files are added and removed.  */
+
+#define DNODE_MAGIC   0x77e40aae
+
+struct dnode {
+  __le32 magic;				/* 77e4 0aae */
+  __le32 first_free;			/* offset from start of dnode to
+					   first free dir entry */
+#ifdef __LITTLE_ENDIAN
+  u8 root_dnode: 1;			/* Is it root dnode? */
+  u8 increment_me: 7;			/* some kind of activity counter? */
+					/* Neither HPFS.IFS nor CHKDSK cares
+					   if you change this word */
+#else
+  u8 increment_me: 7;			/* some kind of activity counter? */
+					/* Neither HPFS.IFS nor CHKDSK cares
+					   if you change this word */
+  u8 root_dnode: 1;			/* Is it root dnode? */
+#endif
+  u8 increment_me2[3];
+  __le32 up;				/* (root dnode) directory's fnode
+					   (nonroot) parent dnode */
+  __le32 self;			/* pointer to this dnode */
+  u8 dirent[2028];			/* one or more dirents */
+};
+
+struct hpfs_dirent {
+  __le16 length;			/* offset to next dirent */
+
+#ifdef __LITTLE_ENDIAN
+  u8 first: 1;				/* set on phony ^A^A (".") entry */
+  u8 has_acl: 1;
+  u8 down: 1;				/* down pointer present (after name) */
+  u8 last: 1;				/* set on phony \377 entry */
+  u8 has_ea: 1;				/* entry has EA */
+  u8 has_xtd_perm: 1;			/* has extended perm list (???) */
+  u8 has_explicit_acl: 1;
+  u8 has_needea: 1;			/* ?? some EA has NEEDEA set
+					   I have no idea why this is
+					   interesting in a dir entry */
+#else
+  u8 has_needea: 1;			/* ?? some EA has NEEDEA set
+					   I have no idea why this is
+					   interesting in a dir entry */
+  u8 has_explicit_acl: 1;
+  u8 has_xtd_perm: 1;			/* has extended perm list (???) */
+  u8 has_ea: 1;				/* entry has EA */
+  u8 last: 1;				/* set on phony \377 entry */
+  u8 down: 1;				/* down pointer present (after name) */
+  u8 has_acl: 1;
+  u8 first: 1;				/* set on phony ^A^A (".") entry */
+#endif
+
+#ifdef __LITTLE_ENDIAN
+  u8 read_only: 1;			/* dos attrib */
+  u8 hidden: 1;				/* dos attrib */
+  u8 system: 1;				/* dos attrib */
+  u8 flag11: 1;				/* would be volume label dos attrib */
+  u8 directory: 1;			/* dos attrib */
+  u8 archive: 1;			/* dos attrib */
+  u8 not_8x3: 1;			/* name is not 8.3 */
+  u8 flag15: 1;
+#else
+  u8 flag15: 1;
+  u8 not_8x3: 1;			/* name is not 8.3 */
+  u8 archive: 1;			/* dos attrib */
+  u8 directory: 1;			/* dos attrib */
+  u8 flag11: 1;				/* would be volume label dos attrib */
+  u8 system: 1;				/* dos attrib */
+  u8 hidden: 1;				/* dos attrib */
+  u8 read_only: 1;			/* dos attrib */
+#endif
+
+  __le32 fnode;				/* fnode giving allocation info */
+  __le32 write_date;			/* mtime */
+  __le32 file_size;			/* file length, bytes */
+  __le32 read_date;			/* atime */
+  __le32 creation_date;			/* ctime */
+  __le32 ea_size;			/* total EA length, bytes */
+  u8 no_of_acls;			/* number of ACL's (low 3 bits) */
+  u8 ix;				/* code page index (of filename), see
+					   struct code_page_data */
+  u8 namelen, name[1];			/* file name */
+  /* dnode_secno down;	  btree down pointer, if present,
+     			  follows name on next word boundary, or maybe it
+			  precedes next dirent, which is on a word boundary. */
+};
+
+
+/* B+ tree: allocation info in fnodes and anodes */
+
+/* dnodes point to fnodes which are responsible for listing the sectors
+   assigned to the file.  This is done with trees of (length,address)
+   pairs.  (Actually triples, of (length, file-address, disk-address)
+   which can represent holes.  Find out if HPFS does that.)
+   At any rate, fnodes contain a small tree; if subtrees are needed
+   they occupy essentially a full block in anodes.  A leaf-level tree node
+   has 3-word entries giving sector runs, a non-leaf node has 2-word
+   entries giving subtree pointers.  A flag in the header says which. */
+
+struct bplus_leaf_node
+{
+  __le32 file_secno;			/* first file sector in extent */
+  __le32 length;			/* length, sectors */
+  __le32 disk_secno;			/* first corresponding disk sector */
+};
+
+struct bplus_internal_node
+{
+  __le32 file_secno;			/* subtree maps sectors < this  */
+  __le32 down;				/* pointer to subtree */
+};
+
+enum {
+	BP_hbff = 1,
+	BP_fnode_parent = 0x20,
+	BP_binary_search = 0x40,
+	BP_internal = 0x80
+};
+struct bplus_header
+{
+  u8 flags;				/* bit 0 - high bit of first free entry offset
+					   bit 5 - we're pointed to by an fnode,
+					   the data btree or some ea or the
+					   main ea bootage pointer ea_secno
+					   bit 6 - suggest binary search (unused)
+					   bit 7 - 1 -> (internal) tree of anodes
+						   0 -> (leaf) list of extents */
+  u8 fill[3];
+  u8 n_free_nodes;			/* free nodes in following array */
+  u8 n_used_nodes;			/* used nodes in following array */
+  __le16 first_free;			/* offset from start of header to
+					   first free node in array */
+  union {
+    struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
+					       subtree pointers */
+    struct bplus_leaf_node external[0];	    /* (external) 3-word entries giving
+					       sector runs */
+  } u;
+};
+
+static inline bool bp_internal(struct bplus_header *bp)
+{
+	return bp->flags & BP_internal;
+}
+
+static inline bool bp_fnode_parent(struct bplus_header *bp)
+{
+	return bp->flags & BP_fnode_parent;
+}
+
+/* fnode: root of allocation b+ tree, and EA's */
+
+/* Every file and every directory has one fnode, pointed to by the directory
+   entry and pointing to the file's sectors or directory's root dnode.  EA's
+   are also stored here, and there are said to be ACL's somewhere here too. */
+
+#define FNODE_MAGIC 0xf7e40aae
+
+enum {FNODE_anode = cpu_to_le16(2), FNODE_dir = cpu_to_le16(256)};
+struct fnode
+{
+  __le32 magic;				/* f7e4 0aae */
+  __le32 zero1[2];			/* read history */
+  u8 len, name[15];			/* true length, truncated name */
+  __le32 up;				/* pointer to file's directory fnode */
+  __le32 acl_size_l;
+  __le32 acl_secno;
+  __le16 acl_size_s;
+  u8 acl_anode;
+  u8 zero2;				/* history bit count */
+  __le32 ea_size_l;			/* length of disk-resident ea's */
+  __le32 ea_secno;			/* first sector of disk-resident ea's*/
+  __le16 ea_size_s;			/* length of fnode-resident ea's */
+
+  __le16 flags;				/* bit 1 set -> ea_secno is an anode */
+					/* bit 8 set -> directory.  first & only extent
+					   points to dnode. */
+  struct bplus_header btree;		/* b+ tree, 8 extents or 12 subtrees */
+  union {
+    struct bplus_leaf_node external[8];
+    struct bplus_internal_node internal[12];
+  } u;
+
+  __le32 file_size;			/* file length, bytes */
+  __le32 n_needea;			/* number of EA's with NEEDEA set */
+  u8 user_id[16];			/* unused */
+  __le16 ea_offs;			/* offset from start of fnode
+					   to first fnode-resident ea */
+  u8 dasd_limit_treshhold;
+  u8 dasd_limit_delta;
+  __le32 dasd_limit;
+  __le32 dasd_usage;
+  u8 ea[316];				/* zero or more EA's, packed together
+					   with no alignment padding.
+					   (Do not use this name, get here
+					   via fnode + ea_offs. I think.) */
+};
+
+static inline bool fnode_in_anode(struct fnode *p)
+{
+	return (p->flags & FNODE_anode) != 0;
+}
+
+static inline bool fnode_is_dir(struct fnode *p)
+{
+	return (p->flags & FNODE_dir) != 0;
+}
+
+
+/* anode: 99.44% pure allocation tree */
+
+#define ANODE_MAGIC 0x37e40aae
+
+struct anode
+{
+  __le32 magic;				/* 37e4 0aae */
+  __le32 self;				/* pointer to this anode */
+  __le32 up;				/* parent anode or fnode */
+
+  struct bplus_header btree;		/* b+tree, 40 extents or 60 subtrees */
+  union {
+    struct bplus_leaf_node external[40];
+    struct bplus_internal_node internal[60];
+  } u;
+
+  __le32 fill[3];			/* unused */
+};
+
+
+/* extended attributes.
+
+   A file's EA info is stored as a list of (name,value) pairs.  It is
+   usually in the fnode, but (if it's large) it is moved to a single
+   sector run outside the fnode, or to multiple runs with an anode tree
+   that points to them.
+
+   The value of a single EA is stored along with the name, or (if large)
+   it is moved to a single sector run, or multiple runs pointed to by an
+   anode tree, pointed to by the value field of the (name,value) pair.
+
+   Flags in the EA tell whether the value is immediate, in a single sector
+   run, or in multiple runs.  Flags in the fnode tell whether the EA list
+   is immediate, in a single run, or in multiple runs. */
+
+enum {EA_indirect = 1, EA_anode = 2, EA_needea = 128 };
+struct extended_attribute
+{
+  u8 flags;				/* bit 0 set -> value gives sector number
+					   where real value starts */
+					/* bit 1 set -> sector is an anode
+					   that points to fragmented value */
+					/* bit 7 set -> required ea */
+  u8 namelen;				/* length of name, bytes */
+  u8 valuelen_lo;			/* length of value, bytes */
+  u8 valuelen_hi;			/* length of value, bytes */
+  u8 name[];
+  /*
+    u8 name[namelen];			ascii attrib name
+    u8 nul;				terminating '\0', not counted
+    u8 value[valuelen];			value, arbitrary
+      if this.flags & 1, valuelen is 8 and the value is
+        u32 length;			real length of value, bytes
+        secno secno;			sector address where it starts
+      if this.anode, the above sector number is the root of an anode tree
+        which points to the value.
+  */
+};
+
+static inline bool ea_indirect(struct extended_attribute *ea)
+{
+	return ea->flags & EA_indirect;
+}
+
+static inline bool ea_in_anode(struct extended_attribute *ea)
+{
+	return ea->flags & EA_anode;
+}
+
+/*
+   Local Variables:
+   comment-column: 40
+   End:
+*/
diff --git a/kernel/fs/hpfs/hpfs_fn.h b/kernel/fs/hpfs/hpfs_fn.h
new file mode 100644
index 000000000..b63b75fa0
--- /dev/null
+++ b/kernel/fs/hpfs/hpfs_fn.h
@@ -0,0 +1,363 @@
+/*
+ *  linux/fs/hpfs/hpfs_fn.h
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  function headers
+ */
+
+//#define DBG
+//#define DEBUG_LOCKS
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#include "hpfs.h"
+
+#define EIOERROR  EIO
+#define EFSERROR  EPERM
+#define EMEMERROR ENOMEM
+
+#define ANODE_ALLOC_FWD	512
+#define FNODE_ALLOC_FWD	0
+#define ALLOC_FWD_MIN	16
+#define ALLOC_FWD_MAX	128
+#define ALLOC_M		1
+#define FNODE_RD_AHEAD	16
+#define ANODE_RD_AHEAD	0
+#define DNODE_RD_AHEAD	72
+#define COUNT_RD_AHEAD	62
+
+#define FREE_DNODES_ADD	58
+#define FREE_DNODES_DEL	29
+
+#define CHKCOND(x,y) if (!(x)) printk y
+
+struct hpfs_inode_info {
+	loff_t mmu_private;
+	ino_t i_parent_dir;	/* (directories) gives fnode of parent dir */
+	unsigned i_dno;		/* (directories) root dnode */
+	unsigned i_dpos;	/* (directories) temp for readdir */
+	unsigned i_dsubdno;	/* (directories) temp for readdir */
+	unsigned i_file_sec;	/* (files) minimalist cache of alloc info */
+	unsigned i_disk_sec;	/* (files) minimalist cache of alloc info */
+	unsigned i_n_secs;	/* (files) minimalist cache of alloc info */
+	unsigned i_ea_size;	/* size of extended attributes */
+	unsigned i_ea_mode : 1;	/* file's permission is stored in ea */
+	unsigned i_ea_uid : 1;	/* file's uid is stored in ea */
+	unsigned i_ea_gid : 1;	/* file's gid is stored in ea */
+	unsigned i_dirty : 1;
+	loff_t **i_rddir_off;
+	struct inode vfs_inode;
+};
+
+struct hpfs_sb_info {
+	struct mutex hpfs_mutex;	/* global hpfs lock */
+	ino_t sb_root;			/* inode number of root dir */
+	unsigned sb_fs_size;		/* file system size, sectors */
+	unsigned sb_bitmaps;		/* sector number of bitmap list */
+	unsigned sb_dirband_start;	/* directory band start sector */
+	unsigned sb_dirband_size;	/* directory band size, dnodes */
+	unsigned sb_dmap;		/* sector number of dnode bit map */
+	unsigned sb_n_free;		/* free blocks for statfs, or -1 */
+	unsigned sb_n_free_dnodes;	/* free dnodes for statfs, or -1 */
+	kuid_t sb_uid;			/* uid from mount options */
+	kgid_t sb_gid;			/* gid from mount options */
+	umode_t sb_mode;		/* mode from mount options */
+	unsigned sb_eas : 2;		/* eas: 0-ignore, 1-ro, 2-rw */
+	unsigned sb_err : 2;		/* on errs: 0-cont, 1-ro, 2-panic */
+	unsigned sb_chk : 2;		/* checks: 0-no, 1-normal, 2-strict */
+	unsigned sb_lowercase : 1;	/* downcase filenames hackery */
+	unsigned sb_was_error : 1;	/* there was an error, set dirty flag */
+	unsigned sb_chkdsk : 2;		/* chkdsk: 0-no, 1-on errs, 2-allways */
+	unsigned char *sb_cp_table;	/* code page tables: */
+					/* 	128 bytes uppercasing table & */
+					/*	128 bytes lowercasing table */
+	__le32 *sb_bmp_dir;		/* main bitmap directory */
+	unsigned sb_c_bitmap;		/* current bitmap */
+	unsigned sb_max_fwd_alloc;	/* max forwad allocation */
+	int sb_timeshift;
+	struct rcu_head rcu;
+};
+
+/* Four 512-byte buffers and the 2k block obtained by concatenating them */
+
+struct quad_buffer_head {
+	struct buffer_head *bh[4];
+	void *data;
+};
+
+/* The b-tree down pointer from a dir entry */
+
+static inline dnode_secno de_down_pointer (struct hpfs_dirent *de)
+{
+  CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n"));
+  return le32_to_cpu(*(__le32 *) ((void *) de + le16_to_cpu(de->length) - 4));
+}
+
+/* The first dir entry in a dnode */
+
+static inline struct hpfs_dirent *dnode_first_de (struct dnode *dnode)
+{
+  return (void *) dnode->dirent;
+}
+
+/* The end+1 of the dir entries */
+
+static inline struct hpfs_dirent *dnode_end_de (struct dnode *dnode)
+{
+  CHKCOND(le32_to_cpu(dnode->first_free)>=0x14 && le32_to_cpu(dnode->first_free)<=0xa00,("HPFS: dnode_end_de: dnode->first_free = %x\n",(unsigned)le32_to_cpu(dnode->first_free)));
+  return (void *) dnode + le32_to_cpu(dnode->first_free);
+}
+
+/* The dir entry after dir entry de */
+
+static inline struct hpfs_dirent *de_next_de (struct hpfs_dirent *de)
+{
+  CHKCOND(le16_to_cpu(de->length)>=0x20 && le16_to_cpu(de->length)<0x800,("HPFS: de_next_de: de->length = %x\n",(unsigned)le16_to_cpu(de->length)));
+  return (void *) de + le16_to_cpu(de->length);
+}
+
+static inline struct extended_attribute *fnode_ea(struct fnode *fnode)
+{
+	return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s));
+}
+
+static inline struct extended_attribute *fnode_end_ea(struct fnode *fnode)
+{
+	return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s));
+}
+
+static unsigned ea_valuelen(struct extended_attribute *ea)
+{
+	return ea->valuelen_lo + 256 * ea->valuelen_hi;
+}
+
+static inline struct extended_attribute *next_ea(struct extended_attribute *ea)
+{
+	return (struct extended_attribute *)((char *)ea + 5 + ea->namelen + ea_valuelen(ea));
+}
+
+static inline secno ea_sec(struct extended_attribute *ea)
+{
+	return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 9 + ea->namelen)));
+}
+
+static inline secno ea_len(struct extended_attribute *ea)
+{
+	return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 5 + ea->namelen)));
+}
+
+static inline char *ea_data(struct extended_attribute *ea)
+{
+	return (char *)((char *)ea + 5 + ea->namelen);
+}
+
+static inline unsigned de_size(int namelen, secno down_ptr)
+{
+	return ((0x1f + namelen + 3) & ~3) + (down_ptr ? 4 : 0);
+}
+
+static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src)
+{
+	int a;
+	int n;
+	if (!dst || !src) return;
+	a = dst->down;
+	n = dst->not_8x3;
+	memcpy((char *)dst + 2, (char *)src + 2, 28);
+	dst->down = a;
+	dst->not_8x3 = n;
+}
+
+static inline unsigned tstbits(__le32 *bmp, unsigned b, unsigned n)
+{
+	int i;
+	if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n;
+	if (!((le32_to_cpu(bmp[(b & 0x3fff) >> 5]) >> (b & 0x1f)) & 1)) return 1;
+	for (i = 1; i < n; i++)
+		if (!((le32_to_cpu(bmp[((b+i) & 0x3fff) >> 5]) >> ((b+i) & 0x1f)) & 1))
+			return i + 1;
+	return 0;
+}
+
+/* alloc.c */
+
+int hpfs_chk_sectors(struct super_block *, secno, int, char *);
+secno hpfs_alloc_sector(struct super_block *, secno, unsigned, int);
+int hpfs_alloc_if_possible(struct super_block *, secno);
+void hpfs_free_sectors(struct super_block *, secno, unsigned);
+int hpfs_check_free_dnodes(struct super_block *, int);
+void hpfs_free_dnode(struct super_block *, secno);
+struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *);
+struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **);
+struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **);
+
+/* anode.c */
+
+secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_header *, unsigned, struct buffer_head *);
+secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned);
+void hpfs_remove_btree(struct super_block *, struct bplus_header *);
+int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *);
+int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *);
+void hpfs_ea_remove(struct super_block *, secno, int, unsigned);
+void hpfs_truncate_btree(struct super_block *, secno, int, unsigned);
+void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
+
+/* buffer.c */
+
+void hpfs_prefetch_sectors(struct super_block *, unsigned, int);
+void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int);
+void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **);
+void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int);
+void *hpfs_get_4sectors(struct super_block *, unsigned, struct quad_buffer_head *);
+void hpfs_brelse4(struct quad_buffer_head *);
+void hpfs_mark_4buffers_dirty(struct quad_buffer_head *);
+
+/* dentry.c */
+
+extern const struct dentry_operations hpfs_dentry_operations;
+
+/* dir.c */
+
+struct dentry *hpfs_lookup(struct inode *, struct dentry *, unsigned int);
+extern const struct file_operations hpfs_dir_ops;
+
+/* dnode.c */
+
+void hpfs_add_pos(struct inode *, loff_t *);
+void hpfs_del_pos(struct inode *, loff_t *);
+struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
+				const unsigned char *, unsigned, secno);
+int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned,
+		    struct hpfs_dirent *);
+int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int);
+void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *);
+dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno);
+struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *);
+struct hpfs_dirent *map_dirent(struct inode *, dnode_secno,
+			       const unsigned char *, unsigned, dnode_secno *,
+			       struct quad_buffer_head *);
+void hpfs_remove_dtree(struct super_block *, dnode_secno);
+struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *);
+
+/* ea.c */
+
+void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned);
+int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int);
+char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *);
+void hpfs_set_ea(struct inode *, struct fnode *, const char *,
+		 const char *, int);
+
+/* file.c */
+
+int hpfs_file_fsync(struct file *, loff_t, loff_t, int);
+void hpfs_truncate(struct inode *);
+extern const struct file_operations hpfs_file_ops;
+extern const struct inode_operations hpfs_file_iops;
+extern const struct address_space_operations hpfs_aops;
+
+/* inode.c */
+
+void hpfs_init_inode(struct inode *);
+void hpfs_read_inode(struct inode *);
+void hpfs_write_inode(struct inode *);
+void hpfs_write_inode_nolock(struct inode *);
+int hpfs_setattr(struct dentry *, struct iattr *);
+void hpfs_write_if_changed(struct inode *);
+void hpfs_evict_inode(struct inode *);
+
+/* map.c */
+
+__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
+__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
+void hpfs_prefetch_bitmap(struct super_block *, unsigned);
+unsigned char *hpfs_load_code_page(struct super_block *, secno);
+__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
+struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
+struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
+struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *);
+dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino);
+
+/* name.c */
+
+unsigned char hpfs_upcase(unsigned char *, unsigned char);
+int hpfs_chk_name(const unsigned char *, unsigned *);
+unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
+int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned,
+		       const unsigned char *, unsigned, int);
+int hpfs_is_name_long(const unsigned char *, unsigned);
+void hpfs_adjust_length(const unsigned char *, unsigned *);
+
+/* namei.c */
+
+extern const struct inode_operations hpfs_dir_iops;
+extern const struct address_space_operations hpfs_symlink_aops;
+
+static inline struct hpfs_inode_info *hpfs_i(struct inode *inode)
+{
+	return list_entry(inode, struct hpfs_inode_info, vfs_inode);
+}
+
+static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* super.c */
+
+__printf(2, 3)
+void hpfs_error(struct super_block *, const char *, ...);
+int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
+unsigned hpfs_get_free_dnodes(struct super_block *);
+
+/*
+ * local time (HPFS) to GMT (Unix)
+ */
+
+static inline time_t local_to_gmt(struct super_block *s, time32_t t)
+{
+	extern struct timezone sys_tz;
+	return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift;
+}
+
+static inline time32_t gmt_to_local(struct super_block *s, time_t t)
+{
+	extern struct timezone sys_tz;
+	return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
+}
+
+/*
+ * Locking:
+ *
+ * hpfs_lock() locks the whole filesystem. It must be taken
+ * on any method called by the VFS.
+ *
+ * We don't do any per-file locking anymore, it is hard to
+ * review and HPFS is not performance-sensitive anyway.
+ */
+static inline void hpfs_lock(struct super_block *s)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	mutex_lock(&sbi->hpfs_mutex);
+}
+
+static inline void hpfs_unlock(struct super_block *s)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	mutex_unlock(&sbi->hpfs_mutex);
+}
+
+static inline void hpfs_lock_assert(struct super_block *s)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	WARN_ON(!mutex_is_locked(&sbi->hpfs_mutex));
+}
diff --git a/kernel/fs/hpfs/inode.c b/kernel/fs/hpfs/inode.c
new file mode 100644
index 000000000..933c73780
--- /dev/null
+++ b/kernel/fs/hpfs/inode.c
@@ -0,0 +1,315 @@
+/*
+ *  linux/fs/hpfs/inode.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  inode VFS functions
+ */
+
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include "hpfs_fn.h"
+
+void hpfs_init_inode(struct inode *i)
+{
+	struct super_block *sb = i->i_sb;
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
+
+	i->i_uid = hpfs_sb(sb)->sb_uid;
+	i->i_gid = hpfs_sb(sb)->sb_gid;
+	i->i_mode = hpfs_sb(sb)->sb_mode;
+	i->i_size = -1;
+	i->i_blocks = -1;
+	
+	hpfs_inode->i_dno = 0;
+	hpfs_inode->i_n_secs = 0;
+	hpfs_inode->i_file_sec = 0;
+	hpfs_inode->i_disk_sec = 0;
+	hpfs_inode->i_dpos = 0;
+	hpfs_inode->i_dsubdno = 0;
+	hpfs_inode->i_ea_mode = 0;
+	hpfs_inode->i_ea_uid = 0;
+	hpfs_inode->i_ea_gid = 0;
+	hpfs_inode->i_ea_size = 0;
+
+	hpfs_inode->i_rddir_off = NULL;
+	hpfs_inode->i_dirty = 0;
+
+	i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0;
+	i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0;
+	i->i_atime.tv_sec = i->i_atime.tv_nsec = 0;
+}
+
+void hpfs_read_inode(struct inode *i)
+{
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	struct super_block *sb = i->i_sb;
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
+	void *ea;
+	int ea_size;
+
+	if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) {
+		/*i->i_mode |= S_IFREG;
+		i->i_mode &= ~0111;
+		i->i_op = &hpfs_file_iops;
+		i->i_fop = &hpfs_file_ops;
+		clear_nlink(i);*/
+		make_bad_inode(i);
+		return;
+	}
+	if (hpfs_sb(i->i_sb)->sb_eas) {
+		if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) {
+			if (ea_size == 2) {
+				i_uid_write(i, le16_to_cpu(*(__le16*)ea));
+				hpfs_inode->i_ea_uid = 1;
+			}
+			kfree(ea);
+		}
+		if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) {
+			if (ea_size == 2) {
+				i_gid_write(i, le16_to_cpu(*(__le16*)ea));
+				hpfs_inode->i_ea_gid = 1;
+			}
+			kfree(ea);
+		}
+		if ((ea = hpfs_get_ea(i->i_sb, fnode, "SYMLINK", &ea_size))) {
+			kfree(ea);
+			i->i_mode = S_IFLNK | 0777;
+			i->i_op = &page_symlink_inode_operations;
+			i->i_data.a_ops = &hpfs_symlink_aops;
+			set_nlink(i, 1);
+			i->i_size = ea_size;
+			i->i_blocks = 1;
+			brelse(bh);
+			return;
+		}
+		if ((ea = hpfs_get_ea(i->i_sb, fnode, "MODE", &ea_size))) {
+			int rdev = 0;
+			umode_t mode = hpfs_sb(sb)->sb_mode;
+			if (ea_size == 2) {
+				mode = le16_to_cpu(*(__le16*)ea);
+				hpfs_inode->i_ea_mode = 1;
+			}
+			kfree(ea);
+			i->i_mode = mode;
+			if (S_ISBLK(mode) || S_ISCHR(mode)) {
+				if ((ea = hpfs_get_ea(i->i_sb, fnode, "DEV", &ea_size))) {
+					if (ea_size == 4)
+						rdev = le32_to_cpu(*(__le32*)ea);
+					kfree(ea);
+				}
+			}
+			if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
+				brelse(bh);
+				set_nlink(i, 1);
+				i->i_size = 0;
+				i->i_blocks = 1;
+				init_special_inode(i, mode,
+					new_decode_dev(rdev));
+				return;
+			}
+		}
+	}
+	if (fnode_is_dir(fnode)) {
+		int n_dnodes, n_subdirs;
+		i->i_mode |= S_IFDIR;
+		i->i_op = &hpfs_dir_iops;
+		i->i_fop = &hpfs_dir_ops;
+		hpfs_inode->i_parent_dir = le32_to_cpu(fnode->up);
+		hpfs_inode->i_dno = le32_to_cpu(fnode->u.external[0].disk_secno);
+		if (hpfs_sb(sb)->sb_chk >= 2) {
+			struct buffer_head *bh0;
+			if (hpfs_map_fnode(sb, hpfs_inode->i_parent_dir, &bh0)) brelse(bh0);
+		}
+		n_dnodes = 0; n_subdirs = 0;
+		hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL);
+		i->i_blocks = 4 * n_dnodes;
+		i->i_size = 2048 * n_dnodes;
+		set_nlink(i, 2 + n_subdirs);
+	} else {
+		i->i_mode |= S_IFREG;
+		if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111;
+		i->i_op = &hpfs_file_iops;
+		i->i_fop = &hpfs_file_ops;
+		set_nlink(i, 1);
+		i->i_size = le32_to_cpu(fnode->file_size);
+		i->i_blocks = ((i->i_size + 511) >> 9) + 1;
+		i->i_data.a_ops = &hpfs_aops;
+		hpfs_i(i)->mmu_private = i->i_size;
+	}
+	brelse(bh);
+}
+
+static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
+	/*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) {
+		   Some unknown structures like ACL may be in fnode,
+		   we'd better not overwrite them
+		hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 structures", i->i_ino);
+	} else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {
+		__le32 ea;
+		if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) {
+			ea = cpu_to_le32(i_uid_read(i));
+			hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2);
+			hpfs_inode->i_ea_uid = 1;
+		}
+		if (!gid_eq(i->i_gid, hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) {
+			ea = cpu_to_le32(i_gid_read(i));
+			hpfs_set_ea(i, fnode, "GID", (char *)&ea, 2);
+			hpfs_inode->i_ea_gid = 1;
+		}
+		if (!S_ISLNK(i->i_mode))
+			if ((i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0 : 0111))
+			  | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))
+			  && i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333))
+			  | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) {
+				ea = cpu_to_le32(i->i_mode);
+				/* sick, but legal */
+				hpfs_set_ea(i, fnode, "MODE", (char *)&ea, 2);
+				hpfs_inode->i_ea_mode = 1;
+			}
+		if (S_ISBLK(i->i_mode) || S_ISCHR(i->i_mode)) {
+			ea = cpu_to_le32(new_encode_dev(i->i_rdev));
+			hpfs_set_ea(i, fnode, "DEV", (char *)&ea, 4);
+		}
+	}
+}
+
+void hpfs_write_inode(struct inode *i)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
+	struct inode *parent;
+	if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return;
+	if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) {
+		if (*hpfs_inode->i_rddir_off)
+			pr_err("write_inode: some position still there\n");
+		kfree(hpfs_inode->i_rddir_off);
+		hpfs_inode->i_rddir_off = NULL;
+	}
+	if (!i->i_nlink) {
+		return;
+	}
+	parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir);
+	if (parent) {
+		hpfs_inode->i_dirty = 0;
+		if (parent->i_state & I_NEW) {
+			hpfs_init_inode(parent);
+			hpfs_read_inode(parent);
+			unlock_new_inode(parent);
+		}
+		hpfs_write_inode_nolock(i);
+		iput(parent);
+	}
+}
+
+void hpfs_write_inode_nolock(struct inode *i)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	struct quad_buffer_head qbh;
+	struct hpfs_dirent *de;
+	if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return;
+	if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) return;
+	if (i->i_ino != hpfs_sb(i->i_sb)->sb_root && i->i_nlink) {
+		if (!(de = map_fnode_dirent(i->i_sb, i->i_ino, fnode, &qbh))) {
+			brelse(bh);
+			return;
+		}
+	} else de = NULL;
+	if (S_ISREG(i->i_mode)) {
+		fnode->file_size = cpu_to_le32(i->i_size);
+		if (de) de->file_size = cpu_to_le32(i->i_size);
+	} else if (S_ISDIR(i->i_mode)) {
+		fnode->file_size = cpu_to_le32(0);
+		if (de) de->file_size = cpu_to_le32(0);
+	}
+	hpfs_write_inode_ea(i, fnode);
+	if (de) {
+		de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
+		de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
+		de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec));
+		de->read_only = !(i->i_mode & 0222);
+		de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size);
+		hpfs_mark_4buffers_dirty(&qbh);
+		hpfs_brelse4(&qbh);
+	}
+	if (S_ISDIR(i->i_mode)) {
+		if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) {
+			de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
+			de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
+			de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec));
+			de->read_only = !(i->i_mode & 0222);
+			de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0);
+			de->file_size = cpu_to_le32(0);
+			hpfs_mark_4buffers_dirty(&qbh);
+			hpfs_brelse4(&qbh);
+		} else
+			hpfs_error(i->i_sb,
+				"directory %08lx doesn't have '.' entry",
+				(unsigned long)i->i_ino);
+	}
+	mark_buffer_dirty(bh);
+	brelse(bh);
+}
+
+int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error = -EINVAL;
+
+	hpfs_lock(inode->i_sb);
+	if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
+		goto out_unlock;
+	if ((attr->ia_valid & ATTR_UID) &&
+	    from_kuid(&init_user_ns, attr->ia_uid) >= 0x10000)
+		goto out_unlock;
+	if ((attr->ia_valid & ATTR_GID) &&
+	    from_kgid(&init_user_ns, attr->ia_gid) >= 0x10000)
+		goto out_unlock;
+	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
+		goto out_unlock;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		goto out_unlock;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			goto out_unlock;
+
+		truncate_setsize(inode, attr->ia_size);
+		hpfs_truncate(inode);
+	}
+
+	setattr_copy(inode, attr);
+
+	hpfs_write_inode(inode);
+
+ out_unlock:
+	hpfs_unlock(inode->i_sb);
+	return error;
+}
+
+void hpfs_write_if_changed(struct inode *inode)
+{
+	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
+
+	if (hpfs_inode->i_dirty)
+		hpfs_write_inode(inode);
+}
+
+void hpfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	if (!inode->i_nlink) {
+		hpfs_lock(inode->i_sb);
+		hpfs_remove_fnode(inode->i_sb, inode->i_ino);
+		hpfs_unlock(inode->i_sb);
+	}
+}
diff --git a/kernel/fs/hpfs/map.c b/kernel/fs/hpfs/map.c
new file mode 100644
index 000000000..442770edc
--- /dev/null
+++ b/kernel/fs/hpfs/map.c
@@ -0,0 +1,308 @@
+/*
+ *  linux/fs/hpfs/map.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  mapping structures to memory with some minimal checks
+ */
+
+#include "hpfs_fn.h"
+
+__le32 *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh)
+{
+	return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0);
+}
+
+__le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
+			 struct quad_buffer_head *qbh, char *id)
+{
+	secno sec;
+	__le32 *ret;
+	unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+	if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) {
+		hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id);
+		return NULL;
+	}
+	sec = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]);
+	if (!sec || sec > hpfs_sb(s)->sb_fs_size-4) {
+		hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id);
+		return NULL;
+	}
+	ret = hpfs_map_4sectors(s, sec, qbh, 4);
+	if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1);
+	return ret;
+}
+
+void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block)
+{
+	unsigned to_prefetch, next_prefetch;
+	unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+	if (unlikely(bmp_block >= n_bands))
+		return;
+	to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]);
+	if (unlikely(bmp_block + 1 >= n_bands))
+		next_prefetch = 0;
+	else
+		next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]);
+	hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch));
+}
+
+/*
+ * Load first code page into kernel memory, return pointer to 256-byte array,
+ * first 128 bytes are uppercasing table for chars 128-255, next 128 bytes are
+ * lowercasing table
+ */
+
+unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
+{
+	struct buffer_head *bh;
+	secno cpds;
+	unsigned cpi;
+	unsigned char *ptr;
+	unsigned char *cp_table;
+	int i;
+	struct code_page_data *cpd;
+	struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0);
+	if (!cp) return NULL;
+	if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) {
+		pr_err("Code page directory magic doesn't match (magic = %08x)\n",
+			le32_to_cpu(cp->magic));
+		brelse(bh);
+		return NULL;
+	}
+	if (!le32_to_cpu(cp->n_code_pages)) {
+		pr_err("n_code_pages == 0\n");
+		brelse(bh);
+		return NULL;
+	}
+	cpds = le32_to_cpu(cp->array[0].code_page_data);
+	cpi = le16_to_cpu(cp->array[0].index);
+	brelse(bh);
+
+	if (cpi >= 3) {
+		pr_err("Code page index out of array\n");
+		return NULL;
+	}
+	
+	if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL;
+	if (le16_to_cpu(cpd->offs[cpi]) > 0x178) {
+		pr_err("Code page index out of sector\n");
+		brelse(bh);
+		return NULL;
+	}
+	ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6;
+	if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
+		pr_err("out of memory for code page table\n");
+		brelse(bh);
+		return NULL;
+	}
+	memcpy(cp_table, ptr, 128);
+	brelse(bh);
+
+	/* Try to build lowercasing table from uppercasing one */
+
+	for (i=128; i<256; i++) cp_table[i]=i;
+	for (i=128; i<256; i++) if (cp_table[i-128]!=i && cp_table[i-128]>=128)
+		cp_table[cp_table[i-128]] = i;
+	
+	return cp_table;
+}
+
+__le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
+{
+	struct buffer_head *bh;
+	int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21;
+	int i;
+	__le32 *b;
+	if (!(b = kmalloc(n * 512, GFP_KERNEL))) {
+		pr_err("can't allocate memory for bitmap directory\n");
+		return NULL;
+	}	
+	for (i=0;i<n;i++) {
+		__le32 *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1);
+		if (!d) {
+			kfree(b);
+			return NULL;
+		}
+		memcpy((char *)b + 512 * i, d, 512);
+		brelse(bh);
+	}
+	return b;
+}
+
+/*
+ * Load fnode to memory
+ */
+
+struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_head **bhp)
+{
+	struct fnode *fnode;
+	if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ino, 1, "fnode")) {
+		return NULL;
+	}
+	if ((fnode = hpfs_map_sector(s, ino, bhp, FNODE_RD_AHEAD))) {
+		if (hpfs_sb(s)->sb_chk) {
+			struct extended_attribute *ea;
+			struct extended_attribute *ea_end;
+			if (le32_to_cpu(fnode->magic) != FNODE_MAGIC) {
+				hpfs_error(s, "bad magic on fnode %08lx",
+					(unsigned long)ino);
+				goto bail;
+			}
+			if (!fnode_is_dir(fnode)) {
+				if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes !=
+				    (bp_internal(&fnode->btree) ? 12 : 8)) {
+					hpfs_error(s,
+					   "bad number of nodes in fnode %08lx",
+					    (unsigned long)ino);
+					goto bail;
+				}
+				if (le16_to_cpu(fnode->btree.first_free) !=
+				    8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) {
+					hpfs_error(s,
+					    "bad first_free pointer in fnode %08lx",
+					    (unsigned long)ino);
+					goto bail;
+				}
+			}
+			if (le16_to_cpu(fnode->ea_size_s) && (le16_to_cpu(fnode->ea_offs) < 0xc4 ||
+			   le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200)) {
+				hpfs_error(s,
+					"bad EA info in fnode %08lx: ea_offs == %04x ea_size_s == %04x",
+					(unsigned long)ino,
+					le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
+				goto bail;
+			}
+			ea = fnode_ea(fnode);
+			ea_end = fnode_end_ea(fnode);
+			while (ea != ea_end) {
+				if (ea > ea_end) {
+					hpfs_error(s, "bad EA in fnode %08lx",
+						(unsigned long)ino);
+					goto bail;
+				}
+				ea = next_ea(ea);
+			}
+		}
+	}
+	return fnode;
+	bail:
+	brelse(*bhp);
+	return NULL;
+}
+
+struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buffer_head **bhp)
+{
+	struct anode *anode;
+	if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ano, 1, "anode")) return NULL;
+	if ((anode = hpfs_map_sector(s, ano, bhp, ANODE_RD_AHEAD)))
+		if (hpfs_sb(s)->sb_chk) {
+			if (le32_to_cpu(anode->magic) != ANODE_MAGIC) {
+				hpfs_error(s, "bad magic on anode %08x", ano);
+				goto bail;
+			}
+			if (le32_to_cpu(anode->self) != ano) {
+				hpfs_error(s, "self pointer invalid on anode %08x", ano);
+				goto bail;
+			}
+			if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes !=
+			    (bp_internal(&anode->btree) ? 60 : 40)) {
+				hpfs_error(s, "bad number of nodes in anode %08x", ano);
+				goto bail;
+			}
+			if (le16_to_cpu(anode->btree.first_free) !=
+			    8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) {
+				hpfs_error(s, "bad first_free pointer in anode %08x", ano);
+				goto bail;
+			}
+		}
+	return anode;
+	bail:
+	brelse(*bhp);
+	return NULL;
+}
+
+/*
+ * Load dnode to memory and do some checks
+ */
+
+struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
+			     struct quad_buffer_head *qbh)
+{
+	struct dnode *dnode;
+	if (hpfs_sb(s)->sb_chk) {
+		if (hpfs_chk_sectors(s, secno, 4, "dnode")) return NULL;
+		if (secno & 3) {
+			hpfs_error(s, "dnode %08x not byte-aligned", secno);
+			return NULL;
+		}	
+	}
+	if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD)))
+		if (hpfs_sb(s)->sb_chk) {
+			unsigned p, pp = 0;
+			unsigned char *d = (unsigned char *)dnode;
+			int b = 0;
+			if (le32_to_cpu(dnode->magic) != DNODE_MAGIC) {
+				hpfs_error(s, "bad magic on dnode %08x", secno);
+				goto bail;
+			}
+			if (le32_to_cpu(dnode->self) != secno)
+				hpfs_error(s, "bad self pointer on dnode %08x self = %08x", secno, le32_to_cpu(dnode->self));
+			/* Check dirents - bad dirents would cause infinite
+			   loops or shooting to memory */
+			if (le32_to_cpu(dnode->first_free) > 2048) {
+				hpfs_error(s, "dnode %08x has first_free == %08x", secno, le32_to_cpu(dnode->first_free));
+				goto bail;
+			}
+			for (p = 20; p < le32_to_cpu(dnode->first_free); p += d[p] + (d[p+1] << 8)) {
+				struct hpfs_dirent *de = (struct hpfs_dirent *)((char *)dnode + p);
+				if (le16_to_cpu(de->length) > 292 || (le16_to_cpu(de->length) < 32) || (le16_to_cpu(de->length) & 3) || p + le16_to_cpu(de->length) > 2048) {
+					hpfs_error(s, "bad dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp);
+					goto bail;
+				}
+				if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) {
+					if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok;
+					hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp);
+					goto bail;
+				}
+				ok:
+				if (hpfs_sb(s)->sb_chk >= 2) b |= 1 << de->down;
+				if (de->down) if (de_down_pointer(de) < 0x10) {
+					hpfs_error(s, "bad down pointer in dnode %08x, dirent %03x, last %03x", secno, p, pp);
+					goto bail;
+				}
+				pp = p;
+				
+			}
+			if (p != le32_to_cpu(dnode->first_free)) {
+				hpfs_error(s, "size on last dirent does not match first_free; dnode %08x", secno);
+				goto bail;
+			}
+			if (d[pp + 30] != 1 || d[pp + 31] != 255) {
+				hpfs_error(s, "dnode %08x does not end with \\377 entry", secno);
+				goto bail;
+			}
+			if (b == 3)
+				pr_err("unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n",
+					secno);
+		}
+	return dnode;
+	bail:
+	hpfs_brelse4(qbh);
+	return NULL;
+}
+
+dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino)
+{
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	dnode_secno dno;
+
+	fnode = hpfs_map_fnode(s, ino, &bh);
+	if (!fnode)
+		return 0;
+
+	dno = le32_to_cpu(fnode->u.external[0].disk_secno);
+	brelse(bh);
+	return dno;
+}
diff --git a/kernel/fs/hpfs/name.c b/kernel/fs/hpfs/name.c
new file mode 100644
index 000000000..b00d396d2
--- /dev/null
+++ b/kernel/fs/hpfs/name.c
@@ -0,0 +1,113 @@
+/*
+ *  linux/fs/hpfs/name.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  operations with filenames
+ */
+
+#include "hpfs_fn.h"
+
+static inline int not_allowed_char(unsigned char c)
+{
+	return c<' ' || c=='"' || c=='*' || c=='/' || c==':' || c=='<' ||
+	      c=='>' || c=='?' || c=='\\' || c=='|';
+}
+
+static inline int no_dos_char(unsigned char c)
+{	/* Characters that are allowed in HPFS but not in DOS */
+	return c=='+' || c==',' || c==';' || c=='=' || c=='[' || c==']';
+}
+
+static inline unsigned char upcase(unsigned char *dir, unsigned char a)
+{
+	if (a<128 || a==255) return a>='a' && a<='z' ? a - 0x20 : a;
+	if (!dir) return a;
+	return dir[a-128];
+}
+
+unsigned char hpfs_upcase(unsigned char *dir, unsigned char a)
+{
+	return upcase(dir, a);
+}
+
+static inline unsigned char locase(unsigned char *dir, unsigned char a)
+{
+	if (a<128 || a==255) return a>='A' && a<='Z' ? a + 0x20 : a;
+	if (!dir) return a;
+	return dir[a];
+}
+
+int hpfs_chk_name(const unsigned char *name, unsigned *len)
+{
+	int i;
+	if (*len > 254) return -ENAMETOOLONG;
+	hpfs_adjust_length(name, len);
+	if (!*len) return -EINVAL;
+	for (i = 0; i < *len; i++) if (not_allowed_char(name[i])) return -EINVAL;
+	if (*len == 1) if (name[0] == '.') return -EINVAL;
+	if (*len == 2) if (name[0] == '.' && name[1] == '.') return -EINVAL;
+	return 0;
+}
+
+unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
+			  unsigned len, int lc, int lng)
+{
+	unsigned char *to;
+	int i;
+	if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
+		pr_err("Long name flag mismatch - name ");
+		for (i = 0; i < len; i++)
+			pr_cont("%c", from[i]);
+		pr_cont(" misidentified as %s.\n", lng ? "short" : "long");
+		pr_err("It's nothing serious. It could happen because of bug in OS/2.\nSet checks=normal to disable this message.\n");
+	}
+	if (!lc) return from;
+	if (!(to = kmalloc(len, GFP_KERNEL))) {
+		pr_err("can't allocate memory for name conversion buffer\n");
+		return from;
+	}
+	for (i = 0; i < len; i++) to[i] = locase(hpfs_sb(s)->sb_cp_table,from[i]);
+	return to;
+}
+
+int hpfs_compare_names(struct super_block *s,
+		       const unsigned char *n1, unsigned l1,
+		       const unsigned char *n2, unsigned l2, int last)
+{
+	unsigned l = l1 < l2 ? l1 : l2;
+	unsigned i;
+	if (last) return -1;
+	for (i = 0; i < l; i++) {
+		unsigned char c1 = upcase(hpfs_sb(s)->sb_cp_table,n1[i]);
+		unsigned char c2 = upcase(hpfs_sb(s)->sb_cp_table,n2[i]);
+		if (c1 < c2) return -1;
+		if (c1 > c2) return 1;
+	}
+	if (l1 < l2) return -1;
+	if (l1 > l2) return 1;
+	return 0;
+}
+
+int hpfs_is_name_long(const unsigned char *name, unsigned len)
+{
+	int i,j;
+	for (i = 0; i < len && name[i] != '.'; i++)
+		if (no_dos_char(name[i])) return 1;
+	if (!i || i > 8) return 1;
+	if (i == len) return 0;
+	for (j = i + 1; j < len; j++)
+		if (name[j] == '.' || no_dos_char(name[i])) return 1;
+	return j - i > 4;
+}
+
+/* OS/2 clears dots and spaces at the end of file name, so we have to */
+
+void hpfs_adjust_length(const unsigned char *name, unsigned *len)
+{
+	if (!*len) return;
+	if (*len == 1 && name[0] == '.') return;
+	if (*len == 2 && name[0] == '.' && name[1] == '.') return;
+	while (*len && (name[*len - 1] == '.' || name[*len - 1] == ' '))
+		(*len)--;
+}
diff --git a/kernel/fs/hpfs/namei.c b/kernel/fs/hpfs/namei.c
new file mode 100644
index 000000000..a0872f239
--- /dev/null
+++ b/kernel/fs/hpfs/namei.c
@@ -0,0 +1,628 @@
+/*
+ *  linux/fs/hpfs/namei.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  adding & removing files & directories
+ */
+#include <linux/sched.h>
+#include "hpfs_fn.h"
+
+static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	const unsigned char *name = dentry->d_name.name;
+	unsigned len = dentry->d_name.len;
+	struct quad_buffer_head qbh0;
+	struct buffer_head *bh;
+	struct hpfs_dirent *de;
+	struct fnode *fnode;
+	struct dnode *dnode;
+	struct inode *result;
+	fnode_secno fno;
+	dnode_secno dno;
+	int r;
+	struct hpfs_dirent dee;
+	int err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
+	hpfs_lock(dir->i_sb);
+	err = -ENOSPC;
+	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
+	if (!fnode)
+		goto bail;
+	dnode = hpfs_alloc_dnode(dir->i_sb, fno, &dno, &qbh0);
+	if (!dnode)
+		goto bail1;
+	memset(&dee, 0, sizeof dee);
+	dee.directory = 1;
+	if (!(mode & 0222)) dee.read_only = 1;
+	/*dee.archive = 0;*/
+	dee.hidden = name[0] == '.';
+	dee.fnode = cpu_to_le32(fno);
+	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+	result = new_inode(dir->i_sb);
+	if (!result)
+		goto bail2;
+	hpfs_init_inode(result);
+	result->i_ino = fno;
+	hpfs_i(result)->i_parent_dir = dir->i_ino;
+	hpfs_i(result)->i_dno = dno;
+	result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
+	result->i_ctime.tv_nsec = 0; 
+	result->i_mtime.tv_nsec = 0; 
+	result->i_atime.tv_nsec = 0; 
+	hpfs_i(result)->i_ea_size = 0;
+	result->i_mode |= S_IFDIR;
+	result->i_op = &hpfs_dir_iops;
+	result->i_fop = &hpfs_dir_ops;
+	result->i_blocks = 4;
+	result->i_size = 2048;
+	set_nlink(result, 2);
+	if (dee.read_only)
+		result->i_mode &= ~0222;
+
+	r = hpfs_add_dirent(dir, name, len, &dee);
+	if (r == 1)
+		goto bail3;
+	if (r == -1) {
+		err = -EEXIST;
+		goto bail3;
+	}
+	fnode->len = len;
+	memcpy(fnode->name, name, len > 15 ? 15 : len);
+	fnode->up = cpu_to_le32(dir->i_ino);
+	fnode->flags |= FNODE_dir;
+	fnode->btree.n_free_nodes = 7;
+	fnode->btree.n_used_nodes = 1;
+	fnode->btree.first_free = cpu_to_le16(0x14);
+	fnode->u.external[0].disk_secno = cpu_to_le32(dno);
+	fnode->u.external[0].file_secno = cpu_to_le32(-1);
+	dnode->root_dnode = 1;
+	dnode->up = cpu_to_le32(fno);
+	de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0);
+	de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+	if (!(mode & 0222)) de->read_only = 1;
+	de->first = de->directory = 1;
+	/*de->hidden = de->system = 0;*/
+	de->fnode = cpu_to_le32(fno);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+	hpfs_mark_4buffers_dirty(&qbh0);
+	hpfs_brelse4(&qbh0);
+	inc_nlink(dir);
+	insert_inode_hash(result);
+
+	if (!uid_eq(result->i_uid, current_fsuid()) ||
+	    !gid_eq(result->i_gid, current_fsgid()) ||
+	    result->i_mode != (mode | S_IFDIR)) {
+		result->i_uid = current_fsuid();
+		result->i_gid = current_fsgid();
+		result->i_mode = mode | S_IFDIR;
+		hpfs_write_inode_nolock(result);
+	}
+	d_instantiate(dentry, result);
+	hpfs_unlock(dir->i_sb);
+	return 0;
+bail3:
+	iput(result);
+bail2:
+	hpfs_brelse4(&qbh0);
+	hpfs_free_dnode(dir->i_sb, dno);
+bail1:
+	brelse(bh);
+	hpfs_free_sectors(dir->i_sb, fno, 1);
+bail:
+	hpfs_unlock(dir->i_sb);
+	return err;
+}
+
+static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+{
+	const unsigned char *name = dentry->d_name.name;
+	unsigned len = dentry->d_name.len;
+	struct inode *result = NULL;
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	fnode_secno fno;
+	int r;
+	struct hpfs_dirent dee;
+	int err;
+	if ((err = hpfs_chk_name(name, &len)))
+		return err==-ENOENT ? -EINVAL : err;
+	hpfs_lock(dir->i_sb);
+	err = -ENOSPC;
+	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
+	if (!fnode)
+		goto bail;
+	memset(&dee, 0, sizeof dee);
+	if (!(mode & 0222)) dee.read_only = 1;
+	dee.archive = 1;
+	dee.hidden = name[0] == '.';
+	dee.fnode = cpu_to_le32(fno);
+	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+
+	result = new_inode(dir->i_sb);
+	if (!result)
+		goto bail1;
+	
+	hpfs_init_inode(result);
+	result->i_ino = fno;
+	result->i_mode |= S_IFREG;
+	result->i_mode &= ~0111;
+	result->i_op = &hpfs_file_iops;
+	result->i_fop = &hpfs_file_ops;
+	set_nlink(result, 1);
+	hpfs_i(result)->i_parent_dir = dir->i_ino;
+	result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
+	result->i_ctime.tv_nsec = 0;
+	result->i_mtime.tv_nsec = 0;
+	result->i_atime.tv_nsec = 0;
+	hpfs_i(result)->i_ea_size = 0;
+	if (dee.read_only)
+		result->i_mode &= ~0222;
+	result->i_blocks = 1;
+	result->i_size = 0;
+	result->i_data.a_ops = &hpfs_aops;
+	hpfs_i(result)->mmu_private = 0;
+
+	r = hpfs_add_dirent(dir, name, len, &dee);
+	if (r == 1)
+		goto bail2;
+	if (r == -1) {
+		err = -EEXIST;
+		goto bail2;
+	}
+	fnode->len = len;
+	memcpy(fnode->name, name, len > 15 ? 15 : len);
+	fnode->up = cpu_to_le32(dir->i_ino);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+
+	insert_inode_hash(result);
+
+	if (!uid_eq(result->i_uid, current_fsuid()) ||
+	    !gid_eq(result->i_gid, current_fsgid()) ||
+	    result->i_mode != (mode | S_IFREG)) {
+		result->i_uid = current_fsuid();
+		result->i_gid = current_fsgid();
+		result->i_mode = mode | S_IFREG;
+		hpfs_write_inode_nolock(result);
+	}
+	d_instantiate(dentry, result);
+	hpfs_unlock(dir->i_sb);
+	return 0;
+
+bail2:
+	iput(result);
+bail1:
+	brelse(bh);
+	hpfs_free_sectors(dir->i_sb, fno, 1);
+bail:
+	hpfs_unlock(dir->i_sb);
+	return err;
+}
+
+static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	const unsigned char *name = dentry->d_name.name;
+	unsigned len = dentry->d_name.len;
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	fnode_secno fno;
+	int r;
+	struct hpfs_dirent dee;
+	struct inode *result = NULL;
+	int err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+	hpfs_lock(dir->i_sb);
+	err = -ENOSPC;
+	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
+	if (!fnode)
+		goto bail;
+	memset(&dee, 0, sizeof dee);
+	if (!(mode & 0222)) dee.read_only = 1;
+	dee.archive = 1;
+	dee.hidden = name[0] == '.';
+	dee.fnode = cpu_to_le32(fno);
+	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+
+	result = new_inode(dir->i_sb);
+	if (!result)
+		goto bail1;
+
+	hpfs_init_inode(result);
+	result->i_ino = fno;
+	hpfs_i(result)->i_parent_dir = dir->i_ino;
+	result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
+	result->i_ctime.tv_nsec = 0;
+	result->i_mtime.tv_nsec = 0;
+	result->i_atime.tv_nsec = 0;
+	hpfs_i(result)->i_ea_size = 0;
+	result->i_uid = current_fsuid();
+	result->i_gid = current_fsgid();
+	set_nlink(result, 1);
+	result->i_size = 0;
+	result->i_blocks = 1;
+	init_special_inode(result, mode, rdev);
+
+	r = hpfs_add_dirent(dir, name, len, &dee);
+	if (r == 1)
+		goto bail2;
+	if (r == -1) {
+		err = -EEXIST;
+		goto bail2;
+	}
+	fnode->len = len;
+	memcpy(fnode->name, name, len > 15 ? 15 : len);
+	fnode->up = cpu_to_le32(dir->i_ino);
+	mark_buffer_dirty(bh);
+
+	insert_inode_hash(result);
+
+	hpfs_write_inode_nolock(result);
+	d_instantiate(dentry, result);
+	brelse(bh);
+	hpfs_unlock(dir->i_sb);
+	return 0;
+bail2:
+	iput(result);
+bail1:
+	brelse(bh);
+	hpfs_free_sectors(dir->i_sb, fno, 1);
+bail:
+	hpfs_unlock(dir->i_sb);
+	return err;
+}
+
+static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
+{
+	const unsigned char *name = dentry->d_name.name;
+	unsigned len = dentry->d_name.len;
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	fnode_secno fno;
+	int r;
+	struct hpfs_dirent dee;
+	struct inode *result;
+	int err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
+	hpfs_lock(dir->i_sb);
+	if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
+		hpfs_unlock(dir->i_sb);
+		return -EPERM;
+	}
+	err = -ENOSPC;
+	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
+	if (!fnode)
+		goto bail;
+	memset(&dee, 0, sizeof dee);
+	dee.archive = 1;
+	dee.hidden = name[0] == '.';
+	dee.fnode = cpu_to_le32(fno);
+	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+
+	result = new_inode(dir->i_sb);
+	if (!result)
+		goto bail1;
+	result->i_ino = fno;
+	hpfs_init_inode(result);
+	hpfs_i(result)->i_parent_dir = dir->i_ino;
+	result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
+	result->i_ctime.tv_nsec = 0;
+	result->i_mtime.tv_nsec = 0;
+	result->i_atime.tv_nsec = 0;
+	hpfs_i(result)->i_ea_size = 0;
+	result->i_mode = S_IFLNK | 0777;
+	result->i_uid = current_fsuid();
+	result->i_gid = current_fsgid();
+	result->i_blocks = 1;
+	set_nlink(result, 1);
+	result->i_size = strlen(symlink);
+	result->i_op = &page_symlink_inode_operations;
+	result->i_data.a_ops = &hpfs_symlink_aops;
+
+	r = hpfs_add_dirent(dir, name, len, &dee);
+	if (r == 1)
+		goto bail2;
+	if (r == -1) {
+		err = -EEXIST;
+		goto bail2;
+	}
+	fnode->len = len;
+	memcpy(fnode->name, name, len > 15 ? 15 : len);
+	fnode->up = cpu_to_le32(dir->i_ino);
+	hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink));
+	mark_buffer_dirty(bh);
+	brelse(bh);
+
+	insert_inode_hash(result);
+
+	hpfs_write_inode_nolock(result);
+	d_instantiate(dentry, result);
+	hpfs_unlock(dir->i_sb);
+	return 0;
+bail2:
+	iput(result);
+bail1:
+	brelse(bh);
+	hpfs_free_sectors(dir->i_sb, fno, 1);
+bail:
+	hpfs_unlock(dir->i_sb);
+	return err;
+}
+
+static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	const unsigned char *name = dentry->d_name.name;
+	unsigned len = dentry->d_name.len;
+	struct quad_buffer_head qbh;
+	struct hpfs_dirent *de;
+	struct inode *inode = d_inode(dentry);
+	dnode_secno dno;
+	int r;
+	int rep = 0;
+	int err;
+
+	hpfs_lock(dir->i_sb);
+	hpfs_adjust_length(name, &len);
+again:
+	err = -ENOENT;
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
+	if (!de)
+		goto out;
+
+	err = -EPERM;
+	if (de->first)
+		goto out1;
+
+	err = -EISDIR;
+	if (de->directory)
+		goto out1;
+
+	r = hpfs_remove_dirent(dir, dno, de, &qbh, 1);
+	switch (r) {
+	case 1:
+		hpfs_error(dir->i_sb, "there was error when removing dirent");
+		err = -EFSERROR;
+		break;
+	case 2:		/* no space for deleting, try to truncate file */
+
+		err = -ENOSPC;
+		if (rep++)
+			break;
+
+		dentry_unhash(dentry);
+		if (!d_unhashed(dentry)) {
+			hpfs_unlock(dir->i_sb);
+			return -ENOSPC;
+		}
+		if (generic_permission(inode, MAY_WRITE) ||
+		    !S_ISREG(inode->i_mode) ||
+		    get_write_access(inode)) {
+			d_rehash(dentry);
+		} else {
+			struct iattr newattrs;
+			/*pr_info("truncating file before delete.\n");*/
+			newattrs.ia_size = 0;
+			newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+			err = notify_change(dentry, &newattrs, NULL);
+			put_write_access(inode);
+			if (!err)
+				goto again;
+		}
+		hpfs_unlock(dir->i_sb);
+		return -ENOSPC;
+	default:
+		drop_nlink(inode);
+		err = 0;
+	}
+	goto out;
+
+out1:
+	hpfs_brelse4(&qbh);
+out:
+	hpfs_unlock(dir->i_sb);
+	return err;
+}
+
+static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	const unsigned char *name = dentry->d_name.name;
+	unsigned len = dentry->d_name.len;
+	struct quad_buffer_head qbh;
+	struct hpfs_dirent *de;
+	struct inode *inode = d_inode(dentry);
+	dnode_secno dno;
+	int n_items = 0;
+	int err;
+	int r;
+
+	hpfs_adjust_length(name, &len);
+	hpfs_lock(dir->i_sb);
+	err = -ENOENT;
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
+	if (!de)
+		goto out;
+
+	err = -EPERM;
+	if (de->first)
+		goto out1;
+
+	err = -ENOTDIR;
+	if (!de->directory)
+		goto out1;
+
+	hpfs_count_dnodes(dir->i_sb, hpfs_i(inode)->i_dno, NULL, NULL, &n_items);
+	err = -ENOTEMPTY;
+	if (n_items)
+		goto out1;
+
+	r = hpfs_remove_dirent(dir, dno, de, &qbh, 1);
+	switch (r) {
+	case 1:
+		hpfs_error(dir->i_sb, "there was error when removing dirent");
+		err = -EFSERROR;
+		break;
+	case 2:
+		err = -ENOSPC;
+		break;
+	default:
+		drop_nlink(dir);
+		clear_nlink(inode);
+		err = 0;
+	}
+	goto out;
+out1:
+	hpfs_brelse4(&qbh);
+out:
+	hpfs_unlock(dir->i_sb);
+	return err;
+}
+
+static int hpfs_symlink_readpage(struct file *file, struct page *page)
+{
+	char *link = kmap(page);
+	struct inode *i = page->mapping->host;
+	struct fnode *fnode;
+	struct buffer_head *bh;
+	int err;
+
+	err = -EIO;
+	hpfs_lock(i->i_sb);
+	if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh)))
+		goto fail;
+	err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
+	brelse(bh);
+	if (err)
+		goto fail;
+	hpfs_unlock(i->i_sb);
+	SetPageUptodate(page);
+	kunmap(page);
+	unlock_page(page);
+	return 0;
+
+fail:
+	hpfs_unlock(i->i_sb);
+	SetPageError(page);
+	kunmap(page);
+	unlock_page(page);
+	return err;
+}
+
+const struct address_space_operations hpfs_symlink_aops = {
+	.readpage	= hpfs_symlink_readpage
+};
+	
+static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	const unsigned char *old_name = old_dentry->d_name.name;
+	unsigned old_len = old_dentry->d_name.len;
+	const unsigned char *new_name = new_dentry->d_name.name;
+	unsigned new_len = new_dentry->d_name.len;
+	struct inode *i = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct quad_buffer_head qbh, qbh1;
+	struct hpfs_dirent *dep, *nde;
+	struct hpfs_dirent de;
+	dnode_secno dno;
+	int r;
+	struct buffer_head *bh;
+	struct fnode *fnode;
+	int err;
+
+	if ((err = hpfs_chk_name(new_name, &new_len))) return err;
+	err = 0;
+	hpfs_adjust_length(old_name, &old_len);
+
+	hpfs_lock(i->i_sb);
+	/* order doesn't matter, due to VFS exclusion */
+	
+	/* Erm? Moving over the empty non-busy directory is perfectly legal */
+	if (new_inode && S_ISDIR(new_inode->i_mode)) {
+		err = -EINVAL;
+		goto end1;
+	}
+
+	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
+		hpfs_error(i->i_sb, "lookup succeeded but map dirent failed");
+		err = -ENOENT;
+		goto end1;
+	}
+	copy_de(&de, dep);
+	de.hidden = new_name[0] == '.';
+
+	if (new_inode) {
+		int r;
+		if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
+			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) {
+				clear_nlink(new_inode);
+				copy_de(nde, &de);
+				memcpy(nde->name, new_name, new_len);
+				hpfs_mark_4buffers_dirty(&qbh1);
+				hpfs_brelse4(&qbh1);
+				goto end;
+			}
+			hpfs_error(new_dir->i_sb, "hpfs_rename: could not find dirent");
+			err = -EFSERROR;
+			goto end1;
+		}
+		err = r == 2 ? -ENOSPC : r == 1 ? -EFSERROR : 0;
+		goto end1;
+	}
+
+	if (new_dir == old_dir) hpfs_brelse4(&qbh);
+
+	if ((r = hpfs_add_dirent(new_dir, new_name, new_len, &de))) {
+		if (r == -1) hpfs_error(new_dir->i_sb, "hpfs_rename: dirent already exists!");
+		err = r == 1 ? -ENOSPC : -EFSERROR;
+		if (new_dir != old_dir) hpfs_brelse4(&qbh);
+		goto end1;
+	}
+	
+	if (new_dir == old_dir)
+		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
+			hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2");
+			err = -ENOENT;
+			goto end1;
+		}
+
+	if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 0))) {
+		hpfs_error(i->i_sb, "hpfs_rename: could not remove dirent");
+		err = r == 2 ? -ENOSPC : -EFSERROR;
+		goto end1;
+	}
+
+	end:
+	hpfs_i(i)->i_parent_dir = new_dir->i_ino;
+	if (S_ISDIR(i->i_mode)) {
+		inc_nlink(new_dir);
+		drop_nlink(old_dir);
+	}
+	if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) {
+		fnode->up = cpu_to_le32(new_dir->i_ino);
+		fnode->len = new_len;
+		memcpy(fnode->name, new_name, new_len>15?15:new_len);
+		if (new_len < 15) memset(&fnode->name[new_len], 0, 15 - new_len);
+		mark_buffer_dirty(bh);
+		brelse(bh);
+	}
+end1:
+	hpfs_unlock(i->i_sb);
+	return err;
+}
+
+const struct inode_operations hpfs_dir_iops =
+{
+	.create		= hpfs_create,
+	.lookup		= hpfs_lookup,
+	.unlink		= hpfs_unlink,
+	.symlink	= hpfs_symlink,
+	.mkdir		= hpfs_mkdir,
+	.rmdir		= hpfs_rmdir,
+	.mknod		= hpfs_mknod,
+	.rename		= hpfs_rename,
+	.setattr	= hpfs_setattr,
+};
diff --git a/kernel/fs/hpfs/super.c b/kernel/fs/hpfs/super.c
new file mode 100644
index 000000000..7cd00d3a7
--- /dev/null
+++ b/kernel/fs/hpfs/super.c
@@ -0,0 +1,753 @@
+/*
+ *  linux/fs/hpfs/super.c
+ *
+ *  Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999
+ *
+ *  mounting, unmounting, error handling
+ */
+
+#include "hpfs_fn.h"
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <linux/init.h>
+#include <linux/statfs.h>
+#include <linux/magic.h>
+#include <linux/sched.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+
+/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
+
+static void mark_dirty(struct super_block *s, int remount)
+{
+	if (hpfs_sb(s)->sb_chkdsk && (remount || !(s->s_flags & MS_RDONLY))) {
+		struct buffer_head *bh;
+		struct hpfs_spare_block *sb;
+		if ((sb = hpfs_map_sector(s, 17, &bh, 0))) {
+			sb->dirty = 1;
+			sb->old_wrote = 0;
+			mark_buffer_dirty(bh);
+			sync_dirty_buffer(bh);
+			brelse(bh);
+		}
+	}
+}
+
+/* Mark the filesystem clean (mark it dirty for chkdsk if chkdsk==2 or if there
+   were errors) */
+
+static void unmark_dirty(struct super_block *s)
+{
+	struct buffer_head *bh;
+	struct hpfs_spare_block *sb;
+	if (s->s_flags & MS_RDONLY) return;
+	sync_blockdev(s->s_bdev);
+	if ((sb = hpfs_map_sector(s, 17, &bh, 0))) {
+		sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error;
+		sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error;
+		mark_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
+		brelse(bh);
+	}
+}
+
+/* Filesystem error... */
+static char err_buf[1024];
+
+void hpfs_error(struct super_block *s, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(err_buf, sizeof(err_buf), fmt, args);
+	va_end(args);
+
+	pr_err("filesystem error: %s", err_buf);
+	if (!hpfs_sb(s)->sb_was_error) {
+		if (hpfs_sb(s)->sb_err == 2) {
+			pr_cont("; crashing the system because you wanted it\n");
+			mark_dirty(s, 0);
+			panic("HPFS panic");
+		} else if (hpfs_sb(s)->sb_err == 1) {
+			if (s->s_flags & MS_RDONLY)
+				pr_cont("; already mounted read-only\n");
+			else {
+				pr_cont("; remounting read-only\n");
+				mark_dirty(s, 0);
+				s->s_flags |= MS_RDONLY;
+			}
+		} else if (s->s_flags & MS_RDONLY)
+				pr_cont("; going on - but anything won't be destroyed because it's read-only\n");
+		else
+			pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n");
+	} else
+		pr_cont("\n");
+	hpfs_sb(s)->sb_was_error = 1;
+}
+
+/* 
+ * A little trick to detect cycles in many hpfs structures and don't let the
+ * kernel crash on corrupted filesystem. When first called, set c2 to 0.
+ *
+ * BTW. chkdsk doesn't detect cycles correctly. When I had 2 lost directories
+ * nested each in other, chkdsk locked up happilly.
+ */
+
+int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
+		char *msg)
+{
+	if (*c2 && *c1 == key) {
+		hpfs_error(s, "cycle detected on key %08x in %s", key, msg);
+		return 1;
+	}
+	(*c2)++;
+	if (!((*c2 - 1) & *c2)) *c1 = key;
+	return 0;
+}
+
+static void free_sbi(struct hpfs_sb_info *sbi)
+{
+	kfree(sbi->sb_cp_table);
+	kfree(sbi->sb_bmp_dir);
+	kfree(sbi);
+}
+
+static void lazy_free_sbi(struct rcu_head *rcu)
+{
+	free_sbi(container_of(rcu, struct hpfs_sb_info, rcu));
+}
+
+static void hpfs_put_super(struct super_block *s)
+{
+	hpfs_lock(s);
+	unmark_dirty(s);
+	hpfs_unlock(s);
+	call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi);
+}
+
+static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
+{
+	struct quad_buffer_head qbh;
+	unsigned long *bits;
+	unsigned count;
+
+	bits = hpfs_map_4sectors(s, secno, &qbh, 0);
+	if (!bits)
+		return (unsigned)-1;
+	count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
+	hpfs_brelse4(&qbh);
+	return count;
+}
+
+static unsigned count_bitmaps(struct super_block *s)
+{
+	unsigned n, count, n_bands;
+	n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+	count = 0;
+	for (n = 0; n < COUNT_RD_AHEAD; n++) {
+		hpfs_prefetch_bitmap(s, n);
+	}
+	for (n = 0; n < n_bands; n++) {
+		unsigned c;
+		hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
+		c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
+		if (c != (unsigned)-1)
+			count += c;
+	}
+	return count;
+}
+
+unsigned hpfs_get_free_dnodes(struct super_block *s)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	if (sbi->sb_n_free_dnodes == (unsigned)-1) {
+		unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap);
+		if (c == (unsigned)-1)
+			return 0;
+		sbi->sb_n_free_dnodes = c;
+	}
+	return sbi->sb_n_free_dnodes;
+}
+
+static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *s = dentry->d_sb;
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
+
+	hpfs_lock(s);
+
+	if (sbi->sb_n_free == (unsigned)-1)
+		sbi->sb_n_free = count_bitmaps(s);
+
+	buf->f_type = s->s_magic;
+	buf->f_bsize = 512;
+	buf->f_blocks = sbi->sb_fs_size;
+	buf->f_bfree = sbi->sb_n_free;
+	buf->f_bavail = sbi->sb_n_free;
+	buf->f_files = sbi->sb_dirband_size / 4;
+	buf->f_ffree = hpfs_get_free_dnodes(s);
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 254;
+
+	hpfs_unlock(s);
+
+	return 0;
+}
+
+static struct kmem_cache * hpfs_inode_cachep;
+
+static struct inode *hpfs_alloc_inode(struct super_block *sb)
+{
+	struct hpfs_inode_info *ei;
+	ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+	ei->vfs_inode.i_version = 1;
+	return &ei->vfs_inode;
+}
+
+static void hpfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
+}
+
+static void hpfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hpfs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int init_inodecache(void)
+{
+	hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
+					     sizeof(struct hpfs_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (hpfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(hpfs_inode_cachep);
+}
+
+/*
+ * A tiny parser for option strings, stolen from dosfs.
+ * Stolen again from read-only hpfs.
+ * And updated for table-driven option parsing.
+ */
+
+enum {
+	Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis,
+	Opt_check_none, Opt_check_normal, Opt_check_strict,
+	Opt_err_cont, Opt_err_ro, Opt_err_panic,
+	Opt_eas_no, Opt_eas_ro, Opt_eas_rw,
+	Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always,
+	Opt_timeshift, Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_help, "help"},
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_umask, "umask=%o"},
+	{Opt_case_lower, "case=lower"},
+	{Opt_case_asis, "case=asis"},
+	{Opt_check_none, "check=none"},
+	{Opt_check_normal, "check=normal"},
+	{Opt_check_strict, "check=strict"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_eas_no, "eas=no"},
+	{Opt_eas_ro, "eas=ro"},
+	{Opt_eas_rw, "eas=rw"},
+	{Opt_chkdsk_no, "chkdsk=no"},
+	{Opt_chkdsk_errors, "chkdsk=errors"},
+	{Opt_chkdsk_always, "chkdsk=always"},
+	{Opt_timeshift, "timeshift=%d"},
+	{Opt_err, NULL},
+};
+
+static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask,
+		      int *lowercase, int *eas, int *chk, int *errs,
+		      int *chkdsk, int *timeshift)
+{
+	char *p;
+	int option;
+
+	if (!opts)
+		return 1;
+
+	/*pr_info("Parsing opts: '%s'\n",opts);*/
+
+	while ((p = strsep(&opts, ",")) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_help:
+			return 2;
+		case Opt_uid:
+			if (match_int(args, &option))
+				return 0;
+			*uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(*uid))
+				return 0;
+			break;
+		case Opt_gid:
+			if (match_int(args, &option))
+				return 0;
+			*gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(*gid))
+				return 0;
+			break;
+		case Opt_umask:
+			if (match_octal(args, &option))
+				return 0;
+			*umask = option;
+			break;
+		case Opt_case_lower:
+			*lowercase = 1;
+			break;
+		case Opt_case_asis:
+			*lowercase = 0;
+			break;
+		case Opt_check_none:
+			*chk = 0;
+			break;
+		case Opt_check_normal:
+			*chk = 1;
+			break;
+		case Opt_check_strict:
+			*chk = 2;
+			break;
+		case Opt_err_cont:
+			*errs = 0;
+			break;
+		case Opt_err_ro:
+			*errs = 1;
+			break;
+		case Opt_err_panic:
+			*errs = 2;
+			break;
+		case Opt_eas_no:
+			*eas = 0;
+			break;
+		case Opt_eas_ro:
+			*eas = 1;
+			break;
+		case Opt_eas_rw:
+			*eas = 2;
+			break;
+		case Opt_chkdsk_no:
+			*chkdsk = 0;
+			break;
+		case Opt_chkdsk_errors:
+			*chkdsk = 1;
+			break;
+		case Opt_chkdsk_always:
+			*chkdsk = 2;
+			break;
+		case Opt_timeshift:
+		{
+			int m = 1;
+			char *rhs = args[0].from;
+			if (!rhs || !*rhs)
+				return 0;
+			if (*rhs == '-') m = -1;
+			if (*rhs == '+' || *rhs == '-') rhs++;
+			*timeshift = simple_strtoul(rhs, &rhs, 0) * m;
+			if (*rhs)
+				return 0;
+			break;
+		}
+		default:
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static inline void hpfs_help(void)
+{
+	pr_info("\n\
+HPFS filesystem options:\n\
+      help              do not mount and display this text\n\
+      uid=xxx           set uid of files that don't have uid specified in eas\n\
+      gid=xxx           set gid of files that don't have gid specified in eas\n\
+      umask=xxx         set mode of files that don't have mode specified in eas\n\
+      case=lower        lowercase all files\n\
+      case=asis         do not lowercase files (default)\n\
+      check=none        no fs checks - kernel may crash on corrupted filesystem\n\
+      check=normal      do some checks - it should not crash (default)\n\
+      check=strict      do extra time-consuming checks, used for debugging\n\
+      errors=continue   continue on errors\n\
+      errors=remount-ro remount read-only if errors found (default)\n\
+      errors=panic      panic on errors\n\
+      chkdsk=no         do not mark fs for chkdsking even if there were errors\n\
+      chkdsk=errors     mark fs dirty if errors found (default)\n\
+      chkdsk=always     always mark fs dirty - used for debugging\n\
+      eas=no            ignore extended attributes\n\
+      eas=ro            read but do not write extended attributes\n\
+      eas=rw            r/w eas => enables chmod, chown, mknod, ln -s (default)\n\
+      timeshift=nnn	add nnn seconds to file times\n\
+\n");
+}
+
+static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
+{
+	kuid_t uid;
+	kgid_t gid;
+	umode_t umask;
+	int lowercase, eas, chk, errs, chkdsk, timeshift;
+	int o;
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	char *new_opts = kstrdup(data, GFP_KERNEL);
+	
+	sync_filesystem(s);
+
+	*flags |= MS_NOATIME;
+	
+	hpfs_lock(s);
+	uid = sbi->sb_uid; gid = sbi->sb_gid;
+	umask = 0777 & ~sbi->sb_mode;
+	lowercase = sbi->sb_lowercase;
+	eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk;
+	errs = sbi->sb_err; timeshift = sbi->sb_timeshift;
+
+	if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase,
+	    &eas, &chk, &errs, &chkdsk, &timeshift))) {
+		pr_err("bad mount options.\n");
+		goto out_err;
+	}
+	if (o == 2) {
+		hpfs_help();
+		goto out_err;
+	}
+	if (timeshift != sbi->sb_timeshift) {
+		pr_err("timeshift can't be changed using remount.\n");
+		goto out_err;
+	}
+
+	unmark_dirty(s);
+
+	sbi->sb_uid = uid; sbi->sb_gid = gid;
+	sbi->sb_mode = 0777 & ~umask;
+	sbi->sb_lowercase = lowercase;
+	sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk;
+	sbi->sb_err = errs; sbi->sb_timeshift = timeshift;
+
+	if (!(*flags & MS_RDONLY)) mark_dirty(s, 1);
+
+	replace_mount_options(s, new_opts);
+
+	hpfs_unlock(s);
+	return 0;
+
+out_err:
+	hpfs_unlock(s);
+	kfree(new_opts);
+	return -EINVAL;
+}
+
+/* Super operations */
+
+static const struct super_operations hpfs_sops =
+{
+	.alloc_inode	= hpfs_alloc_inode,
+	.destroy_inode	= hpfs_destroy_inode,
+	.evict_inode	= hpfs_evict_inode,
+	.put_super	= hpfs_put_super,
+	.statfs		= hpfs_statfs,
+	.remount_fs	= hpfs_remount_fs,
+	.show_options	= generic_show_options,
+};
+
+static int hpfs_fill_super(struct super_block *s, void *options, int silent)
+{
+	struct buffer_head *bh0, *bh1, *bh2;
+	struct hpfs_boot_block *bootblock;
+	struct hpfs_super_block *superblock;
+	struct hpfs_spare_block *spareblock;
+	struct hpfs_sb_info *sbi;
+	struct inode *root;
+
+	kuid_t uid;
+	kgid_t gid;
+	umode_t umask;
+	int lowercase, eas, chk, errs, chkdsk, timeshift;
+
+	dnode_secno root_dno;
+	struct hpfs_dirent *de = NULL;
+	struct quad_buffer_head qbh;
+
+	int o;
+
+	save_mount_options(s, options);
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi) {
+		return -ENOMEM;
+	}
+	s->s_fs_info = sbi;
+
+	mutex_init(&sbi->hpfs_mutex);
+	hpfs_lock(s);
+
+	uid = current_uid();
+	gid = current_gid();
+	umask = current_umask();
+	lowercase = 0;
+	eas = 2;
+	chk = 1;
+	errs = 1;
+	chkdsk = 1;
+	timeshift = 0;
+
+	if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase,
+	    &eas, &chk, &errs, &chkdsk, &timeshift))) {
+		pr_err("bad mount options.\n");
+		goto bail0;
+	}
+	if (o==2) {
+		hpfs_help();
+		goto bail0;
+	}
+
+	/*sbi->sb_mounting = 1;*/
+	sb_set_blocksize(s, 512);
+	sbi->sb_fs_size = -1;
+	if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1;
+	if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2;
+	if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3;
+
+	/* Check magics */
+	if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC
+	    ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC
+	    || le32_to_cpu(spareblock->magic) != SP_MAGIC) {
+		if (!silent)
+			pr_err("Bad magic ... probably not HPFS\n");
+		goto bail4;
+	}
+
+	/* Check version */
+	if (!(s->s_flags & MS_RDONLY) &&
+	      superblock->funcversion != 2 && superblock->funcversion != 3) {
+		pr_err("Bad version %d,%d. Mount readonly to go around\n",
+			(int)superblock->version, (int)superblock->funcversion);
+		pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n");
+		goto bail4;
+	}
+
+	s->s_flags |= MS_NOATIME;
+
+	/* Fill superblock stuff */
+	s->s_magic = HPFS_SUPER_MAGIC;
+	s->s_op = &hpfs_sops;
+	s->s_d_op = &hpfs_dentry_operations;
+
+	sbi->sb_root = le32_to_cpu(superblock->root);
+	sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors);
+	sbi->sb_bitmaps = le32_to_cpu(superblock->bitmaps);
+	sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start);
+	sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band);
+	sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap);
+	sbi->sb_uid = uid;
+	sbi->sb_gid = gid;
+	sbi->sb_mode = 0777 & ~umask;
+	sbi->sb_n_free = -1;
+	sbi->sb_n_free_dnodes = -1;
+	sbi->sb_lowercase = lowercase;
+	sbi->sb_eas = eas;
+	sbi->sb_chk = chk;
+	sbi->sb_chkdsk = chkdsk;
+	sbi->sb_err = errs;
+	sbi->sb_timeshift = timeshift;
+	sbi->sb_was_error = 0;
+	sbi->sb_cp_table = NULL;
+	sbi->sb_c_bitmap = -1;
+	sbi->sb_max_fwd_alloc = 0xffffff;
+
+	if (sbi->sb_fs_size >= 0x80000000) {
+		hpfs_error(s, "invalid size in superblock: %08x",
+			(unsigned)sbi->sb_fs_size);
+		goto bail4;
+	}
+
+	/* Load bitmap directory */
+	if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
+		goto bail4;
+	
+	/* Check for general fs errors*/
+	if (spareblock->dirty && !spareblock->old_wrote) {
+		if (errs == 2) {
+			pr_err("Improperly stopped, not mounted\n");
+			goto bail4;
+		}
+		hpfs_error(s, "improperly stopped");
+	}
+
+	if (!(s->s_flags & MS_RDONLY)) {
+		spareblock->dirty = 1;
+		spareblock->old_wrote = 0;
+		mark_buffer_dirty(bh2);
+	}
+
+	if (spareblock->hotfixes_used || spareblock->n_spares_used) {
+		if (errs >= 2) {
+			pr_err("Hotfixes not supported here, try chkdsk\n");
+			mark_dirty(s, 0);
+			goto bail4;
+		}
+		hpfs_error(s, "hotfixes not supported here, try chkdsk");
+		if (errs == 0)
+			pr_err("Proceeding, but your filesystem will be probably corrupted by this driver...\n");
+		else
+			pr_err("This driver may read bad files or crash when operating on disk with hotfixes.\n");
+	}
+	if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) {
+		if (errs >= 2) {
+			pr_err("Spare dnodes used, try chkdsk\n");
+			mark_dirty(s, 0);
+			goto bail4;
+		}
+		hpfs_error(s, "warning: spare dnodes used, try chkdsk");
+		if (errs == 0)
+			pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n");
+	}
+	if (chk) {
+		unsigned a;
+		if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) ||
+		    le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) {
+			hpfs_error(s, "dir band size mismatch: dir_band_start==%08x, dir_band_end==%08x, n_dir_band==%08x",
+				le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->dir_band_end), le32_to_cpu(superblock->n_dir_band));
+			goto bail4;
+		}
+		a = sbi->sb_dirband_size;
+		sbi->sb_dirband_size = 0;
+		if (hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->n_dir_band), "dir_band") ||
+		    hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_bitmap), 4, "dir_band_bitmap") ||
+		    hpfs_chk_sectors(s, le32_to_cpu(superblock->bitmaps), 4, "bitmaps")) {
+			mark_dirty(s, 0);
+			goto bail4;
+		}
+		sbi->sb_dirband_size = a;
+	} else
+		pr_err("You really don't want any checks? You are crazy...\n");
+
+	/* Load code page table */
+	if (le32_to_cpu(spareblock->n_code_pages))
+		if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir))))
+			pr_err("code page support is disabled\n");
+
+	brelse(bh2);
+	brelse(bh1);
+	brelse(bh0);
+
+	root = iget_locked(s, sbi->sb_root);
+	if (!root)
+		goto bail0;
+	hpfs_init_inode(root);
+	hpfs_read_inode(root);
+	unlock_new_inode(root);
+	s->s_root = d_make_root(root);
+	if (!s->s_root)
+		goto bail0;
+
+	/*
+	 * find the root directory's . pointer & finish filling in the inode
+	 */
+
+	root_dno = hpfs_fnode_dno(s, sbi->sb_root);
+	if (root_dno)
+		de = map_dirent(root, root_dno, "\001\001", 2, NULL, &qbh);
+	if (!de)
+		hpfs_error(s, "unable to find root dir");
+	else {
+		root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date));
+		root->i_atime.tv_nsec = 0;
+		root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date));
+		root->i_mtime.tv_nsec = 0;
+		root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date));
+		root->i_ctime.tv_nsec = 0;
+		hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size);
+		hpfs_i(root)->i_parent_dir = root->i_ino;
+		if (root->i_size == -1)
+			root->i_size = 2048;
+		if (root->i_blocks == -1)
+			root->i_blocks = 5;
+		hpfs_brelse4(&qbh);
+	}
+	hpfs_unlock(s);
+	return 0;
+
+bail4:	brelse(bh2);
+bail3:	brelse(bh1);
+bail2:	brelse(bh0);
+bail1:
+bail0:
+	hpfs_unlock(s);
+	free_sbi(sbi);
+	return -EINVAL;
+}
+
+static struct dentry *hpfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+}
+
+static struct file_system_type hpfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "hpfs",
+	.mount		= hpfs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("hpfs");
+
+static int __init init_hpfs_fs(void)
+{
+	int err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&hpfs_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	return err;
+}
+
+static void __exit exit_hpfs_fs(void)
+{
+	unregister_filesystem(&hpfs_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_hpfs_fs)
+module_exit(exit_hpfs_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/hppfs/Makefile b/kernel/fs/hppfs/Makefile
new file mode 100644
index 000000000..3a982bd97
--- /dev/null
+++ b/kernel/fs/hppfs/Makefile
@@ -0,0 +1,6 @@
+#
+# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+# Licensed under the GPL
+#
+
+obj-$(CONFIG_HPPFS) += hppfs.o
diff --git a/kernel/fs/hppfs/hppfs.c b/kernel/fs/hppfs/hppfs.c
new file mode 100644
index 000000000..fa2bd5366
--- /dev/null
+++ b/kernel/fs/hppfs/hppfs.c
@@ -0,0 +1,766 @@
+/*
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <linux/ctype.h>
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/types.h>
+#include <linux/pid_namespace.h>
+#include <linux/namei.h>
+#include <asm/uaccess.h>
+#include <os.h>
+
+static struct inode *get_inode(struct super_block *, struct dentry *);
+
+struct hppfs_data {
+	struct list_head list;
+	char contents[PAGE_SIZE - sizeof(struct list_head)];
+};
+
+struct hppfs_private {
+	struct file *proc_file;
+	int host_fd;
+	loff_t len;
+	struct hppfs_data *contents;
+};
+
+struct hppfs_inode_info {
+	struct dentry *proc_dentry;
+	struct inode vfs_inode;
+};
+
+static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode)
+{
+	return container_of(inode, struct hppfs_inode_info, vfs_inode);
+}
+
+#define HPPFS_SUPER_MAGIC 0xb00000ee
+
+static const struct super_operations hppfs_sbops;
+
+static int is_pid(struct dentry *dentry)
+{
+	struct super_block *sb;
+	int i;
+
+	sb = dentry->d_sb;
+	if (dentry->d_parent != sb->s_root)
+		return 0;
+
+	for (i = 0; i < dentry->d_name.len; i++) {
+		if (!isdigit(dentry->d_name.name[i]))
+			return 0;
+	}
+	return 1;
+}
+
+static char *dentry_name(struct dentry *dentry, int extra)
+{
+	struct dentry *parent;
+	char *root, *name;
+	const char *seg_name;
+	int len, seg_len, root_len;
+
+	len = 0;
+	parent = dentry;
+	while (parent->d_parent != parent) {
+		if (is_pid(parent))
+			len += strlen("pid") + 1;
+		else len += parent->d_name.len + 1;
+		parent = parent->d_parent;
+	}
+
+	root = "proc";
+	root_len = strlen(root);
+	len += root_len;
+	name = kmalloc(len + extra + 1, GFP_KERNEL);
+	if (name == NULL)
+		return NULL;
+
+	name[len] = '\0';
+	parent = dentry;
+	while (parent->d_parent != parent) {
+		if (is_pid(parent)) {
+			seg_name = "pid";
+			seg_len = strlen(seg_name);
+		}
+		else {
+			seg_name = parent->d_name.name;
+			seg_len = parent->d_name.len;
+		}
+
+		len -= seg_len + 1;
+		name[len] = '/';
+		memcpy(&name[len + 1], seg_name, seg_len);
+		parent = parent->d_parent;
+	}
+	memcpy(name, root, root_len);
+	return name;
+}
+
+static int file_removed(struct dentry *dentry, const char *file)
+{
+	char *host_file;
+	int extra, fd;
+
+	extra = 0;
+	if (file != NULL)
+		extra += strlen(file) + 1;
+
+	host_file = dentry_name(dentry, extra + strlen("/remove"));
+	if (host_file == NULL) {
+		printk(KERN_ERR "file_removed : allocation failed\n");
+		return -ENOMEM;
+	}
+
+	if (file != NULL) {
+		strcat(host_file, "/");
+		strcat(host_file, file);
+	}
+	strcat(host_file, "/remove");
+
+	fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
+	kfree(host_file);
+	if (fd > 0) {
+		os_close_file(fd);
+		return 1;
+	}
+	return 0;
+}
+
+static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct dentry *proc_dentry, *parent;
+	struct qstr *name = &dentry->d_name;
+	struct inode *inode;
+	int err, deleted;
+
+	deleted = file_removed(dentry, NULL);
+	if (deleted < 0)
+		return ERR_PTR(deleted);
+	else if (deleted)
+		return ERR_PTR(-ENOENT);
+
+	parent = HPPFS_I(ino)->proc_dentry;
+	mutex_lock(&d_inode(parent)->i_mutex);
+	proc_dentry = lookup_one_len(name->name, parent, name->len);
+	mutex_unlock(&d_inode(parent)->i_mutex);
+
+	if (IS_ERR(proc_dentry))
+		return proc_dentry;
+
+	err = -ENOMEM;
+	inode = get_inode(ino->i_sb, proc_dentry);
+	if (!inode)
+		goto out;
+
+ 	d_add(dentry, inode);
+	return NULL;
+
+ out:
+	return ERR_PTR(err);
+}
+
+static const struct inode_operations hppfs_file_iops = {
+};
+
+static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count,
+			 loff_t *ppos, int is_user)
+{
+	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
+	ssize_t n;
+
+	read = file_inode(file)->i_fop->read;
+
+	if (!is_user)
+		set_fs(KERNEL_DS);
+
+	n = (*read)(file, buf, count, &file->f_pos);
+
+	if (!is_user)
+		set_fs(USER_DS);
+
+	if (ppos)
+		*ppos = file->f_pos;
+	return n;
+}
+
+static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count)
+{
+	ssize_t n;
+	int cur, err;
+	char *new_buf;
+
+	n = -ENOMEM;
+	new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (new_buf == NULL) {
+		printk(KERN_ERR "hppfs_read_file : kmalloc failed\n");
+		goto out;
+	}
+	n = 0;
+	while (count > 0) {
+		cur = min_t(ssize_t, count, PAGE_SIZE);
+		err = os_read_file(fd, new_buf, cur);
+		if (err < 0) {
+			printk(KERN_ERR "hppfs_read : read failed, "
+			       "errno = %d\n", err);
+			n = err;
+			goto out_free;
+		} else if (err == 0)
+			break;
+
+		if (copy_to_user(buf, new_buf, err)) {
+			n = -EFAULT;
+			goto out_free;
+		}
+		n += err;
+		count -= err;
+	}
+ out_free:
+	kfree(new_buf);
+ out:
+	return n;
+}
+
+static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
+			  loff_t *ppos)
+{
+	struct hppfs_private *hppfs = file->private_data;
+	struct hppfs_data *data;
+	loff_t off;
+	int err;
+
+	if (hppfs->contents != NULL) {
+		int rem;
+
+		if (*ppos >= hppfs->len)
+			return 0;
+
+		data = hppfs->contents;
+		off = *ppos;
+		while (off >= sizeof(data->contents)) {
+			data = list_entry(data->list.next, struct hppfs_data,
+					  list);
+			off -= sizeof(data->contents);
+		}
+
+		if (off + count > hppfs->len)
+			count = hppfs->len - off;
+		rem = copy_to_user(buf, &data->contents[off], count);
+		*ppos += count - rem;
+		if (rem > 0)
+			return -EFAULT;
+	} else if (hppfs->host_fd != -1) {
+		err = os_seek_file(hppfs->host_fd, *ppos);
+		if (err) {
+			printk(KERN_ERR "hppfs_read : seek failed, "
+			       "errno = %d\n", err);
+			return err;
+		}
+		err = hppfs_read_file(hppfs->host_fd, buf, count);
+		if (err < 0) {
+			printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
+			return err;
+		}
+		count = err;
+		if (count > 0)
+			*ppos += count;
+	}
+	else count = read_proc(hppfs->proc_file, buf, count, ppos, 1);
+
+	return count;
+}
+
+static ssize_t hppfs_write(struct file *file, const char __user *buf,
+			   size_t len, loff_t *ppos)
+{
+	struct hppfs_private *data = file->private_data;
+	struct file *proc_file = data->proc_file;
+	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
+
+	write = file_inode(proc_file)->i_fop->write;
+	return (*write)(proc_file, buf, len, ppos);
+}
+
+static int open_host_sock(char *host_file, int *filter_out)
+{
+	char *end;
+	int fd;
+
+	end = &host_file[strlen(host_file)];
+	strcpy(end, "/rw");
+	*filter_out = 1;
+	fd = os_connect_socket(host_file);
+	if (fd > 0)
+		return fd;
+
+	strcpy(end, "/r");
+	*filter_out = 0;
+	fd = os_connect_socket(host_file);
+	return fd;
+}
+
+static void free_contents(struct hppfs_data *head)
+{
+	struct hppfs_data *data;
+	struct list_head *ele, *next;
+
+	if (head == NULL)
+		return;
+
+	list_for_each_safe(ele, next, &head->list) {
+		data = list_entry(ele, struct hppfs_data, list);
+		kfree(data);
+	}
+	kfree(head);
+}
+
+static struct hppfs_data *hppfs_get_data(int fd, int filter,
+					 struct file *proc_file,
+					 struct file *hppfs_file,
+					 loff_t *size_out)
+{
+	struct hppfs_data *data, *new, *head;
+	int n, err;
+
+	err = -ENOMEM;
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL) {
+		printk(KERN_ERR "hppfs_get_data : head allocation failed\n");
+		goto failed;
+	}
+
+	INIT_LIST_HEAD(&data->list);
+
+	head = data;
+	*size_out = 0;
+
+	if (filter) {
+		while ((n = read_proc(proc_file, data->contents,
+				      sizeof(data->contents), NULL, 0)) > 0)
+			os_write_file(fd, data->contents, n);
+		err = os_shutdown_socket(fd, 0, 1);
+		if (err) {
+			printk(KERN_ERR "hppfs_get_data : failed to shut down "
+			       "socket\n");
+			goto failed_free;
+		}
+	}
+	while (1) {
+		n = os_read_file(fd, data->contents, sizeof(data->contents));
+		if (n < 0) {
+			err = n;
+			printk(KERN_ERR "hppfs_get_data : read failed, "
+			       "errno = %d\n", err);
+			goto failed_free;
+		} else if (n == 0)
+			break;
+
+		*size_out += n;
+
+		if (n < sizeof(data->contents))
+			break;
+
+		new = kmalloc(sizeof(*data), GFP_KERNEL);
+		if (new == 0) {
+			printk(KERN_ERR "hppfs_get_data : data allocation "
+			       "failed\n");
+			err = -ENOMEM;
+			goto failed_free;
+		}
+
+		INIT_LIST_HEAD(&new->list);
+		list_add(&new->list, &data->list);
+		data = new;
+	}
+	return head;
+
+ failed_free:
+	free_contents(head);
+ failed:
+	return ERR_PTR(err);
+}
+
+static struct hppfs_private *hppfs_data(void)
+{
+	struct hppfs_private *data;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return data;
+
+	*data = ((struct hppfs_private ) { .host_fd  		= -1,
+					   .len  		= -1,
+					   .contents 		= NULL } );
+	return data;
+}
+
+static int file_mode(int fmode)
+{
+	if (fmode == (FMODE_READ | FMODE_WRITE))
+		return O_RDWR;
+	if (fmode == FMODE_READ)
+		return O_RDONLY;
+	if (fmode == FMODE_WRITE)
+		return O_WRONLY;
+	return 0;
+}
+
+static int hppfs_open(struct inode *inode, struct file *file)
+{
+	const struct cred *cred = file->f_cred;
+	struct hppfs_private *data;
+	struct path path;
+	char *host_file;
+	int err, fd, type, filter;
+
+	err = -ENOMEM;
+	data = hppfs_data();
+	if (data == NULL)
+		goto out;
+
+	host_file = dentry_name(file->f_path.dentry, strlen("/rw"));
+	if (host_file == NULL)
+		goto out_free2;
+
+	path.mnt = inode->i_sb->s_fs_info;
+	path.dentry = HPPFS_I(inode)->proc_dentry;
+
+	data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
+	err = PTR_ERR(data->proc_file);
+	if (IS_ERR(data->proc_file))
+		goto out_free1;
+
+	type = os_file_type(host_file);
+	if (type == OS_TYPE_FILE) {
+		fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
+		if (fd >= 0)
+			data->host_fd = fd;
+		else
+			printk(KERN_ERR "hppfs_open : failed to open '%s', "
+			       "errno = %d\n", host_file, -fd);
+
+		data->contents = NULL;
+	} else if (type == OS_TYPE_DIR) {
+		fd = open_host_sock(host_file, &filter);
+		if (fd > 0) {
+			data->contents = hppfs_get_data(fd, filter,
+							data->proc_file,
+							file, &data->len);
+			if (!IS_ERR(data->contents))
+				data->host_fd = fd;
+		} else
+			printk(KERN_ERR "hppfs_open : failed to open a socket "
+			       "in '%s', errno = %d\n", host_file, -fd);
+	}
+	kfree(host_file);
+
+	file->private_data = data;
+	return 0;
+
+ out_free1:
+	kfree(host_file);
+ out_free2:
+	free_contents(data->contents);
+	kfree(data);
+ out:
+	return err;
+}
+
+static int hppfs_dir_open(struct inode *inode, struct file *file)
+{
+	const struct cred *cred = file->f_cred;
+	struct hppfs_private *data;
+	struct path path;
+	int err;
+
+	err = -ENOMEM;
+	data = hppfs_data();
+	if (data == NULL)
+		goto out;
+
+	path.mnt = inode->i_sb->s_fs_info;
+	path.dentry = HPPFS_I(inode)->proc_dentry;
+	data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
+	err = PTR_ERR(data->proc_file);
+	if (IS_ERR(data->proc_file))
+		goto out_free;
+
+	file->private_data = data;
+	return 0;
+
+ out_free:
+	kfree(data);
+ out:
+	return err;
+}
+
+static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
+{
+	struct hppfs_private *data = file->private_data;
+	struct file *proc_file = data->proc_file;
+	loff_t (*llseek)(struct file *, loff_t, int);
+	loff_t ret;
+
+	llseek = file_inode(proc_file)->i_fop->llseek;
+	if (llseek != NULL) {
+		ret = (*llseek)(proc_file, off, where);
+		if (ret < 0)
+			return ret;
+	}
+
+	return default_llseek(file, off, where);
+}
+
+static int hppfs_release(struct inode *inode, struct file *file)
+{
+	struct hppfs_private *data = file->private_data;
+	struct file *proc_file = data->proc_file;
+	if (proc_file)
+		fput(proc_file);
+	kfree(data);
+	return 0;
+}
+
+static const struct file_operations hppfs_file_fops = {
+	.owner		= NULL,
+	.llseek		= hppfs_llseek,
+	.read		= hppfs_read,
+	.write		= hppfs_write,
+	.open		= hppfs_open,
+	.release	= hppfs_release,
+};
+
+struct hppfs_dirent {
+	struct dir_context ctx;
+	struct dir_context *caller;
+	struct dentry *dentry;
+};
+
+static int hppfs_filldir(struct dir_context *ctx, const char *name, int size,
+			 loff_t offset, u64 inode, unsigned int type)
+{
+	struct hppfs_dirent *dirent =
+		container_of(ctx, struct hppfs_dirent, ctx);
+
+	if (file_removed(dirent->dentry, name))
+		return 0;
+
+	dirent->caller->pos = dirent->ctx.pos;
+	return !dir_emit(dirent->caller, name, size, inode, type);
+}
+
+static int hppfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct hppfs_private *data = file->private_data;
+	struct file *proc_file = data->proc_file;
+	struct hppfs_dirent d = {
+		.ctx.actor	= hppfs_filldir,
+		.caller		= ctx,
+		.dentry  	= file->f_path.dentry
+	};
+	int err;
+	proc_file->f_pos = ctx->pos;
+	err = iterate_dir(proc_file, &d.ctx);
+	ctx->pos = d.ctx.pos;
+	return err;
+}
+
+static const struct file_operations hppfs_dir_fops = {
+	.owner		= NULL,
+	.iterate	= hppfs_readdir,
+	.open		= hppfs_dir_open,
+	.llseek		= default_llseek,
+	.release	= hppfs_release,
+};
+
+static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
+{
+	sf->f_blocks = 0;
+	sf->f_bfree = 0;
+	sf->f_bavail = 0;
+	sf->f_files = 0;
+	sf->f_ffree = 0;
+	sf->f_type = HPPFS_SUPER_MAGIC;
+	return 0;
+}
+
+static struct inode *hppfs_alloc_inode(struct super_block *sb)
+{
+	struct hppfs_inode_info *hi;
+
+	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+	if (!hi)
+		return NULL;
+
+	hi->proc_dentry = NULL;
+	inode_init_once(&hi->vfs_inode);
+	return &hi->vfs_inode;
+}
+
+void hppfs_evict_inode(struct inode *ino)
+{
+	clear_inode(ino);
+	dput(HPPFS_I(ino)->proc_dentry);
+	mntput(ino->i_sb->s_fs_info);
+}
+
+static void hppfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kfree(HPPFS_I(inode));
+}
+
+static void hppfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hppfs_i_callback);
+}
+
+static const struct super_operations hppfs_sbops = {
+	.alloc_inode	= hppfs_alloc_inode,
+	.destroy_inode	= hppfs_destroy_inode,
+	.evict_inode	= hppfs_evict_inode,
+	.statfs		= hppfs_statfs,
+};
+
+static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
+			  int buflen)
+{
+	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
+	return d_inode(proc_dentry)->i_op->readlink(proc_dentry, buffer,
+						    buflen);
+}
+
+static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
+
+	return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd);
+}
+
+static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			   void *cookie)
+{
+	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
+
+	if (d_inode(proc_dentry)->i_op->put_link)
+		d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie);
+}
+
+static const struct inode_operations hppfs_dir_iops = {
+	.lookup		= hppfs_lookup,
+};
+
+static const struct inode_operations hppfs_link_iops = {
+	.readlink	= hppfs_readlink,
+	.follow_link	= hppfs_follow_link,
+	.put_link	= hppfs_put_link,
+};
+
+static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
+{
+	struct inode *proc_ino = d_inode(dentry);
+	struct inode *inode = new_inode(sb);
+
+	if (!inode) {
+		dput(dentry);
+		return NULL;
+	}
+
+	if (d_is_dir(dentry)) {
+		inode->i_op = &hppfs_dir_iops;
+		inode->i_fop = &hppfs_dir_fops;
+	} else if (d_is_symlink(dentry)) {
+		inode->i_op = &hppfs_link_iops;
+		inode->i_fop = &hppfs_file_fops;
+	} else {
+		inode->i_op = &hppfs_file_iops;
+		inode->i_fop = &hppfs_file_fops;
+	}
+
+	HPPFS_I(inode)->proc_dentry = dentry;
+
+	inode->i_uid = proc_ino->i_uid;
+	inode->i_gid = proc_ino->i_gid;
+	inode->i_atime = proc_ino->i_atime;
+	inode->i_mtime = proc_ino->i_mtime;
+	inode->i_ctime = proc_ino->i_ctime;
+	inode->i_ino = proc_ino->i_ino;
+	inode->i_mode = proc_ino->i_mode;
+	set_nlink(inode, proc_ino->i_nlink);
+	inode->i_size = proc_ino->i_size;
+	inode->i_blocks = proc_ino->i_blocks;
+
+	return inode;
+}
+
+static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
+{
+	struct inode *root_inode;
+	struct vfsmount *proc_mnt;
+	int err = -ENOENT;
+
+	proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
+	if (IS_ERR(proc_mnt))
+		goto out;
+
+	sb->s_blocksize = 1024;
+	sb->s_blocksize_bits = 10;
+	sb->s_magic = HPPFS_SUPER_MAGIC;
+	sb->s_op = &hppfs_sbops;
+	sb->s_fs_info = proc_mnt;
+
+	err = -ENOMEM;
+	root_inode = get_inode(sb, dget(proc_mnt->mnt_root));
+	sb->s_root = d_make_root(root_inode);
+	if (!sb->s_root)
+		goto out_mntput;
+
+	return 0;
+
+ out_mntput:
+	mntput(proc_mnt);
+ out:
+	return(err);
+}
+
+static struct dentry *hppfs_read_super(struct file_system_type *type,
+			    int flags, const char *dev_name,
+			    void *data)
+{
+	return mount_nodev(type, flags, data, hppfs_fill_super);
+}
+
+static struct file_system_type hppfs_type = {
+	.owner 		= THIS_MODULE,
+	.name 		= "hppfs",
+	.mount 		= hppfs_read_super,
+	.kill_sb	= kill_anon_super,
+	.fs_flags 	= 0,
+};
+MODULE_ALIAS_FS("hppfs");
+
+static int __init init_hppfs(void)
+{
+	return register_filesystem(&hppfs_type);
+}
+
+static void __exit exit_hppfs(void)
+{
+	unregister_filesystem(&hppfs_type);
+}
+
+module_init(init_hppfs)
+module_exit(exit_hppfs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/hugetlbfs/Makefile b/kernel/fs/hugetlbfs/Makefile
new file mode 100644
index 000000000..6adf870c6
--- /dev/null
+++ b/kernel/fs/hugetlbfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux ramfs routines.
+#
+
+obj-$(CONFIG_HUGETLBFS) += hugetlbfs.o
+
+hugetlbfs-objs := inode.o
diff --git a/kernel/fs/hugetlbfs/inode.c b/kernel/fs/hugetlbfs/inode.c
new file mode 100644
index 000000000..87724c1d7
--- /dev/null
+++ b/kernel/fs/hugetlbfs/inode.c
@@ -0,0 +1,1114 @@
+/*
+ * hugetlbpage-backed filesystem.  Based on ramfs.
+ *
+ * Nadia Yvette Chambers, 2002
+ *
+ * Copyright (C) 2002 Linus Torvalds.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/thread_info.h>
+#include <asm/current.h>
+#include <linux/sched.h>		/* remove ASAP */
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/kernel.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/capability.h>
+#include <linux/ctype.h>
+#include <linux/backing-dev.h>
+#include <linux/hugetlb.h>
+#include <linux/pagevec.h>
+#include <linux/parser.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/dnotify.h>
+#include <linux/statfs.h>
+#include <linux/security.h>
+#include <linux/magic.h>
+#include <linux/migrate.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+
+static const struct super_operations hugetlbfs_ops;
+static const struct address_space_operations hugetlbfs_aops;
+const struct file_operations hugetlbfs_file_operations;
+static const struct inode_operations hugetlbfs_dir_inode_operations;
+static const struct inode_operations hugetlbfs_inode_operations;
+
+struct hugetlbfs_config {
+	kuid_t   uid;
+	kgid_t   gid;
+	umode_t mode;
+	long	max_hpages;
+	long	nr_inodes;
+	struct hstate *hstate;
+	long    min_hpages;
+};
+
+struct hugetlbfs_inode_info {
+	struct shared_policy policy;
+	struct inode vfs_inode;
+};
+
+static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
+{
+	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
+}
+
+int sysctl_hugetlb_shm_group;
+
+enum {
+	Opt_size, Opt_nr_inodes,
+	Opt_mode, Opt_uid, Opt_gid,
+	Opt_pagesize, Opt_min_size,
+	Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_size,	"size=%s"},
+	{Opt_nr_inodes,	"nr_inodes=%s"},
+	{Opt_mode,	"mode=%o"},
+	{Opt_uid,	"uid=%u"},
+	{Opt_gid,	"gid=%u"},
+	{Opt_pagesize,	"pagesize=%s"},
+	{Opt_min_size,	"min_size=%s"},
+	{Opt_err,	NULL},
+};
+
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+	int i;
+
+	for (i = 0; i < pagevec_count(pvec); ++i)
+		put_page(pvec->pages[i]);
+
+	pagevec_reinit(pvec);
+}
+
+static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *inode = file_inode(file);
+	loff_t len, vma_len;
+	int ret;
+	struct hstate *h = hstate_file(file);
+
+	/*
+	 * vma address alignment (but not the pgoff alignment) has
+	 * already been checked by prepare_hugepage_range.  If you add
+	 * any error returns here, do so after setting VM_HUGETLB, so
+	 * is_vm_hugetlb_page tests below unmap_region go the right
+	 * way when do_mmap_pgoff unwinds (may be important on powerpc
+	 * and ia64).
+	 */
+	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+	vma->vm_ops = &hugetlb_vm_ops;
+
+	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
+		return -EINVAL;
+
+	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
+
+	mutex_lock(&inode->i_mutex);
+	file_accessed(file);
+
+	ret = -ENOMEM;
+	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+
+	if (hugetlb_reserve_pages(inode,
+				vma->vm_pgoff >> huge_page_order(h),
+				len >> huge_page_shift(h), vma,
+				vma->vm_flags))
+		goto out;
+
+	ret = 0;
+	hugetlb_prefault_arch_hook(vma->vm_mm);
+	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
+		inode->i_size = len;
+out:
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
+
+/*
+ * Called under down_write(mmap_sem).
+ */
+
+#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+static unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct hstate *h = hstate_file(file);
+	struct vm_unmapped_area_info info;
+
+	if (len & ~huge_page_mask(h))
+		return -EINVAL;
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (flags & MAP_FIXED) {
+		if (prepare_hugepage_range(file, addr, len))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (addr) {
+		addr = ALIGN(addr, huge_page_size(h));
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = TASK_UNMAPPED_BASE;
+	info.high_limit = TASK_SIZE;
+	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+	info.align_offset = 0;
+	return vm_unmapped_area(&info);
+}
+#endif
+
+static size_t
+hugetlbfs_read_actor(struct page *page, unsigned long offset,
+			struct iov_iter *to, unsigned long size)
+{
+	size_t copied = 0;
+	int i, chunksize;
+
+	/* Find which 4k chunk and offset with in that chunk */
+	i = offset >> PAGE_CACHE_SHIFT;
+	offset = offset & ~PAGE_CACHE_MASK;
+
+	while (size) {
+		size_t n;
+		chunksize = PAGE_CACHE_SIZE;
+		if (offset)
+			chunksize -= offset;
+		if (chunksize > size)
+			chunksize = size;
+		n = copy_page_to_iter(&page[i], offset, chunksize, to);
+		copied += n;
+		if (n != chunksize)
+			return copied;
+		offset = 0;
+		size -= chunksize;
+		i++;
+	}
+	return copied;
+}
+
+/*
+ * Support for read() - Find the page attached to f_mapping and copy out the
+ * data. Its *very* similar to do_generic_mapping_read(), we can't use that
+ * since it has PAGE_CACHE_SIZE assumptions.
+ */
+static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct hstate *h = hstate_file(file);
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
+	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
+	unsigned long end_index;
+	loff_t isize;
+	ssize_t retval = 0;
+
+	while (iov_iter_count(to)) {
+		struct page *page;
+		size_t nr, copied;
+
+		/* nr is the maximum number of bytes to copy from this page */
+		nr = huge_page_size(h);
+		isize = i_size_read(inode);
+		if (!isize)
+			break;
+		end_index = (isize - 1) >> huge_page_shift(h);
+		if (index > end_index)
+			break;
+		if (index == end_index) {
+			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
+			if (nr <= offset)
+				break;
+		}
+		nr = nr - offset;
+
+		/* Find the page */
+		page = find_lock_page(mapping, index);
+		if (unlikely(page == NULL)) {
+			/*
+			 * We have a HOLE, zero out the user-buffer for the
+			 * length of the hole or request.
+			 */
+			copied = iov_iter_zero(nr, to);
+		} else {
+			unlock_page(page);
+
+			/*
+			 * We have the page, copy it to user space buffer.
+			 */
+			copied = hugetlbfs_read_actor(page, offset, to, nr);
+			page_cache_release(page);
+		}
+		offset += copied;
+		retval += copied;
+		if (copied != nr && iov_iter_count(to)) {
+			if (!retval)
+				retval = -EFAULT;
+			break;
+		}
+		index += offset >> huge_page_shift(h);
+		offset &= ~huge_page_mask(h);
+	}
+	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
+	return retval;
+}
+
+static int hugetlbfs_write_begin(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	return -EINVAL;
+}
+
+static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	BUG();
+	return -EINVAL;
+}
+
+static void truncate_huge_page(struct page *page)
+{
+	ClearPageDirty(page);
+	ClearPageUptodate(page);
+	delete_from_page_cache(page);
+}
+
+static void truncate_hugepages(struct inode *inode, loff_t lstart)
+{
+	struct hstate *h = hstate_inode(inode);
+	struct address_space *mapping = &inode->i_data;
+	const pgoff_t start = lstart >> huge_page_shift(h);
+	struct pagevec pvec;
+	pgoff_t next;
+	int i, freed = 0;
+
+	pagevec_init(&pvec, 0);
+	next = start;
+	while (1) {
+		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+			if (next == start)
+				break;
+			next = start;
+			continue;
+		}
+
+		for (i = 0; i < pagevec_count(&pvec); ++i) {
+			struct page *page = pvec.pages[i];
+
+			lock_page(page);
+			if (page->index > next)
+				next = page->index;
+			++next;
+			truncate_huge_page(page);
+			unlock_page(page);
+			freed++;
+		}
+		huge_pagevec_release(&pvec);
+	}
+	BUG_ON(!lstart && mapping->nrpages);
+	hugetlb_unreserve_pages(inode, start, freed);
+}
+
+static void hugetlbfs_evict_inode(struct inode *inode)
+{
+	struct resv_map *resv_map;
+
+	truncate_hugepages(inode, 0);
+	resv_map = (struct resv_map *)inode->i_mapping->private_data;
+	/* root inode doesn't have the resv_map, so we should check it */
+	if (resv_map)
+		resv_map_release(&resv_map->refs);
+	clear_inode(inode);
+}
+
+static inline void
+hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+{
+	struct vm_area_struct *vma;
+
+	vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+		unsigned long v_offset;
+
+		/*
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the interval tree returns us only those vmas
+		 * which overlap the truncated area starting at pgoff,
+		 * and no vma on a 32-bit arch can span beyond the 4GB.
+		 */
+		if (vma->vm_pgoff < pgoff)
+			v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+		else
+			v_offset = 0;
+
+		unmap_hugepage_range(vma, vma->vm_start + v_offset,
+				     vma->vm_end, NULL);
+	}
+}
+
+static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
+{
+	pgoff_t pgoff;
+	struct address_space *mapping = inode->i_mapping;
+	struct hstate *h = hstate_inode(inode);
+
+	BUG_ON(offset & ~huge_page_mask(h));
+	pgoff = offset >> PAGE_SHIFT;
+
+	i_size_write(inode, offset);
+	i_mmap_lock_write(mapping);
+	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+	i_mmap_unlock_write(mapping);
+	truncate_hugepages(inode, offset);
+	return 0;
+}
+
+static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct hstate *h = hstate_inode(inode);
+	int error;
+	unsigned int ia_valid = attr->ia_valid;
+
+	BUG_ON(!inode);
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if (ia_valid & ATTR_SIZE) {
+		error = -EINVAL;
+		if (attr->ia_size & ~huge_page_mask(h))
+			return -EINVAL;
+		error = hugetlb_vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static struct inode *hugetlbfs_get_root(struct super_block *sb,
+					struct hugetlbfs_config *config)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (inode) {
+		struct hugetlbfs_inode_info *info;
+		inode->i_ino = get_next_ino();
+		inode->i_mode = S_IFDIR | config->mode;
+		inode->i_uid = config->uid;
+		inode->i_gid = config->gid;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		info = HUGETLBFS_I(inode);
+		mpol_shared_policy_init(&info->policy, NULL);
+		inode->i_op = &hugetlbfs_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inc_nlink(inode);
+		lockdep_annotate_inode_mutex_key(inode);
+	}
+	return inode;
+}
+
+/*
+ * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
+ * be taken from reclaim -- unlike regular filesystems. This needs an
+ * annotation because huge_pmd_share() does an allocation under
+ * i_mmap_rwsem.
+ */
+static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
+
+static struct inode *hugetlbfs_get_inode(struct super_block *sb,
+					struct inode *dir,
+					umode_t mode, dev_t dev)
+{
+	struct inode *inode;
+	struct resv_map *resv_map;
+
+	resv_map = resv_map_alloc();
+	if (!resv_map)
+		return NULL;
+
+	inode = new_inode(sb);
+	if (inode) {
+		struct hugetlbfs_inode_info *info;
+		inode->i_ino = get_next_ino();
+		inode_init_owner(inode, dir, mode);
+		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
+				&hugetlbfs_i_mmap_rwsem_key);
+		inode->i_mapping->a_ops = &hugetlbfs_aops;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_mapping->private_data = resv_map;
+		info = HUGETLBFS_I(inode);
+		/*
+		 * The policy is initialized here even if we are creating a
+		 * private inode because initialization simply creates an
+		 * an empty rb tree and calls spin_lock_init(), later when we
+		 * call mpol_free_shared_policy() it will just return because
+		 * the rb tree will still be empty.
+		 */
+		mpol_shared_policy_init(&info->policy, NULL);
+		switch (mode & S_IFMT) {
+		default:
+			init_special_inode(inode, mode, dev);
+			break;
+		case S_IFREG:
+			inode->i_op = &hugetlbfs_inode_operations;
+			inode->i_fop = &hugetlbfs_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_op = &hugetlbfs_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+
+			/* directory inodes start off with i_nlink == 2 (for "." entry) */
+			inc_nlink(inode);
+			break;
+		case S_IFLNK:
+			inode->i_op = &page_symlink_inode_operations;
+			break;
+		}
+		lockdep_annotate_inode_mutex_key(inode);
+	} else
+		kref_put(&resv_map->refs, resv_map_release);
+
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+static int hugetlbfs_mknod(struct inode *dir,
+			struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	struct inode *inode;
+	int error = -ENOSPC;
+
+	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
+	if (inode) {
+		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+		d_instantiate(dentry, inode);
+		dget(dentry);	/* Extra count - pin the dentry in core */
+		error = 0;
+	}
+	return error;
+}
+
+static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	if (!retval)
+		inc_nlink(dir);
+	return retval;
+}
+
+static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+{
+	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+static int hugetlbfs_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
+{
+	struct inode *inode;
+	int error = -ENOSPC;
+
+	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+	if (inode) {
+		int l = strlen(symname)+1;
+		error = page_symlink(inode, symname, l);
+		if (!error) {
+			d_instantiate(dentry, inode);
+			dget(dentry);
+		} else
+			iput(inode);
+	}
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+	return error;
+}
+
+/*
+ * mark the head page dirty
+ */
+static int hugetlbfs_set_page_dirty(struct page *page)
+{
+	struct page *head = compound_head(page);
+
+	SetPageDirty(head);
+	return 0;
+}
+
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+				struct page *newpage, struct page *page,
+				enum migrate_mode mode)
+{
+	int rc;
+
+	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+	if (rc != MIGRATEPAGE_SUCCESS)
+		return rc;
+	migrate_page_copy(newpage, page);
+
+	return MIGRATEPAGE_SUCCESS;
+}
+
+static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
+	struct hstate *h = hstate_inode(d_inode(dentry));
+
+	buf->f_type = HUGETLBFS_MAGIC;
+	buf->f_bsize = huge_page_size(h);
+	if (sbinfo) {
+		spin_lock(&sbinfo->stat_lock);
+		/* If no limits set, just report 0 for max/free/used
+		 * blocks, like simple_statfs() */
+		if (sbinfo->spool) {
+			long free_pages;
+
+			spin_lock(&sbinfo->spool->lock);
+			buf->f_blocks = sbinfo->spool->max_hpages;
+			free_pages = sbinfo->spool->max_hpages
+				- sbinfo->spool->used_hpages;
+			buf->f_bavail = buf->f_bfree = free_pages;
+			spin_unlock(&sbinfo->spool->lock);
+			buf->f_files = sbinfo->max_inodes;
+			buf->f_ffree = sbinfo->free_inodes;
+		}
+		spin_unlock(&sbinfo->stat_lock);
+	}
+	buf->f_namelen = NAME_MAX;
+	return 0;
+}
+
+static void hugetlbfs_put_super(struct super_block *sb)
+{
+	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
+
+	if (sbi) {
+		sb->s_fs_info = NULL;
+
+		if (sbi->spool)
+			hugepage_put_subpool(sbi->spool);
+
+		kfree(sbi);
+	}
+}
+
+static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		if (unlikely(!sbinfo->free_inodes)) {
+			spin_unlock(&sbinfo->stat_lock);
+			return 0;
+		}
+		sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+
+	return 1;
+}
+
+static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+}
+
+
+static struct kmem_cache *hugetlbfs_inode_cachep;
+
+static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
+{
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
+	struct hugetlbfs_inode_info *p;
+
+	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
+		return NULL;
+	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
+	if (unlikely(!p)) {
+		hugetlbfs_inc_free_inodes(sbinfo);
+		return NULL;
+	}
+	return &p->vfs_inode;
+}
+
+static void hugetlbfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
+}
+
+static void hugetlbfs_destroy_inode(struct inode *inode)
+{
+	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
+	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
+	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
+}
+
+static const struct address_space_operations hugetlbfs_aops = {
+	.write_begin	= hugetlbfs_write_begin,
+	.write_end	= hugetlbfs_write_end,
+	.set_page_dirty	= hugetlbfs_set_page_dirty,
+	.migratepage    = hugetlbfs_migrate_page,
+};
+
+
+static void init_once(void *foo)
+{
+	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+const struct file_operations hugetlbfs_file_operations = {
+	.read_iter		= hugetlbfs_read_iter,
+	.mmap			= hugetlbfs_file_mmap,
+	.fsync			= noop_fsync,
+	.get_unmapped_area	= hugetlb_get_unmapped_area,
+	.llseek		= default_llseek,
+};
+
+static const struct inode_operations hugetlbfs_dir_inode_operations = {
+	.create		= hugetlbfs_create,
+	.lookup		= simple_lookup,
+	.link		= simple_link,
+	.unlink		= simple_unlink,
+	.symlink	= hugetlbfs_symlink,
+	.mkdir		= hugetlbfs_mkdir,
+	.rmdir		= simple_rmdir,
+	.mknod		= hugetlbfs_mknod,
+	.rename		= simple_rename,
+	.setattr	= hugetlbfs_setattr,
+};
+
+static const struct inode_operations hugetlbfs_inode_operations = {
+	.setattr	= hugetlbfs_setattr,
+};
+
+static const struct super_operations hugetlbfs_ops = {
+	.alloc_inode    = hugetlbfs_alloc_inode,
+	.destroy_inode  = hugetlbfs_destroy_inode,
+	.evict_inode	= hugetlbfs_evict_inode,
+	.statfs		= hugetlbfs_statfs,
+	.put_super	= hugetlbfs_put_super,
+	.show_options	= generic_show_options,
+};
+
+enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+/*
+ * Convert size option passed from command line to number of huge pages
+ * in the pool specified by hstate.  Size option could be in bytes
+ * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
+ */
+static long long
+hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
+								int val_type)
+{
+	if (val_type == NO_SIZE)
+		return -1;
+
+	if (val_type == SIZE_PERCENT) {
+		size_opt <<= huge_page_shift(h);
+		size_opt *= h->max_huge_pages;
+		do_div(size_opt, 100);
+	}
+
+	size_opt >>= huge_page_shift(h);
+	return size_opt;
+}
+
+static int
+hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
+{
+	char *p, *rest;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	unsigned long long max_size_opt = 0, min_size_opt = 0;
+	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+ 				goto bad_val;
+			pconfig->uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(pconfig->uid))
+				goto bad_val;
+			break;
+
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+ 				goto bad_val;
+			pconfig->gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(pconfig->gid))
+				goto bad_val;
+			break;
+
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+ 				goto bad_val;
+			pconfig->mode = option & 01777U;
+			break;
+
+		case Opt_size: {
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			max_size_opt = memparse(args[0].from, &rest);
+			max_val_type = SIZE_STD;
+			if (*rest == '%')
+				max_val_type = SIZE_PERCENT;
+			break;
+		}
+
+		case Opt_nr_inodes:
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			pconfig->nr_inodes = memparse(args[0].from, &rest);
+			break;
+
+		case Opt_pagesize: {
+			unsigned long ps;
+			ps = memparse(args[0].from, &rest);
+			pconfig->hstate = size_to_hstate(ps);
+			if (!pconfig->hstate) {
+				pr_err("Unsupported page size %lu MB\n",
+					ps >> 20);
+				return -EINVAL;
+			}
+			break;
+		}
+
+		case Opt_min_size: {
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			min_size_opt = memparse(args[0].from, &rest);
+			min_val_type = SIZE_STD;
+			if (*rest == '%')
+				min_val_type = SIZE_PERCENT;
+			break;
+		}
+
+		default:
+			pr_err("Bad mount option: \"%s\"\n", p);
+			return -EINVAL;
+			break;
+		}
+	}
+
+	/*
+	 * Use huge page pool size (in hstate) to convert the size
+	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
+	 */
+	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+						max_size_opt, max_val_type);
+	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+						min_size_opt, min_val_type);
+
+	/*
+	 * If max_size was specified, then min_size must be smaller
+	 */
+	if (max_val_type > NO_SIZE &&
+	    pconfig->min_hpages > pconfig->max_hpages) {
+		pr_err("minimum size can not be greater than maximum size\n");
+		return -EINVAL;
+	}
+
+	return 0;
+
+bad_val:
+	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
+ 	return -EINVAL;
+}
+
+static int
+hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int ret;
+	struct hugetlbfs_config config;
+	struct hugetlbfs_sb_info *sbinfo;
+
+	save_mount_options(sb, data);
+
+	config.max_hpages = -1; /* No limit on size by default */
+	config.nr_inodes = -1; /* No limit on number of inodes by default */
+	config.uid = current_fsuid();
+	config.gid = current_fsgid();
+	config.mode = 0755;
+	config.hstate = &default_hstate;
+	config.min_hpages = -1; /* No default minimum size */
+	ret = hugetlbfs_parse_options(data, &config);
+	if (ret)
+		return ret;
+
+	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
+	if (!sbinfo)
+		return -ENOMEM;
+	sb->s_fs_info = sbinfo;
+	sbinfo->hstate = config.hstate;
+	spin_lock_init(&sbinfo->stat_lock);
+	sbinfo->max_inodes = config.nr_inodes;
+	sbinfo->free_inodes = config.nr_inodes;
+	sbinfo->spool = NULL;
+	/*
+	 * Allocate and initialize subpool if maximum or minimum size is
+	 * specified.  Any needed reservations (for minimim size) are taken
+	 * taken when the subpool is created.
+	 */
+	if (config.max_hpages != -1 || config.min_hpages != -1) {
+		sbinfo->spool = hugepage_new_subpool(config.hstate,
+							config.max_hpages,
+							config.min_hpages);
+		if (!sbinfo->spool)
+			goto out_free;
+	}
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = huge_page_size(config.hstate);
+	sb->s_blocksize_bits = huge_page_shift(config.hstate);
+	sb->s_magic = HUGETLBFS_MAGIC;
+	sb->s_op = &hugetlbfs_ops;
+	sb->s_time_gran = 1;
+	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
+	if (!sb->s_root)
+		goto out_free;
+	return 0;
+out_free:
+	kfree(sbinfo->spool);
+	kfree(sbinfo);
+	return -ENOMEM;
+}
+
+static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+}
+
+static struct file_system_type hugetlbfs_fs_type = {
+	.name		= "hugetlbfs",
+	.mount		= hugetlbfs_mount,
+	.kill_sb	= kill_litter_super,
+};
+MODULE_ALIAS_FS("hugetlbfs");
+
+static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
+
+static int can_do_hugetlb_shm(void)
+{
+	kgid_t shm_group;
+	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
+	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
+}
+
+static int get_hstate_idx(int page_size_log)
+{
+	struct hstate *h = hstate_sizelog(page_size_log);
+
+	if (!h)
+		return -1;
+	return h - hstates;
+}
+
+static const struct dentry_operations anon_ops = {
+	.d_dname = simple_dname
+};
+
+/*
+ * Note that size should be aligned to proper hugepage size in caller side,
+ * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
+ */
+struct file *hugetlb_file_setup(const char *name, size_t size,
+				vm_flags_t acctflag, struct user_struct **user,
+				int creat_flags, int page_size_log)
+{
+	struct file *file = ERR_PTR(-ENOMEM);
+	struct inode *inode;
+	struct path path;
+	struct super_block *sb;
+	struct qstr quick_string;
+	int hstate_idx;
+
+	hstate_idx = get_hstate_idx(page_size_log);
+	if (hstate_idx < 0)
+		return ERR_PTR(-ENODEV);
+
+	*user = NULL;
+	if (!hugetlbfs_vfsmount[hstate_idx])
+		return ERR_PTR(-ENOENT);
+
+	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
+		*user = current_user();
+		if (user_shm_lock(size, *user)) {
+			task_lock(current);
+			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+				current->comm, current->pid);
+			task_unlock(current);
+		} else {
+			*user = NULL;
+			return ERR_PTR(-EPERM);
+		}
+	}
+
+	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
+	quick_string.name = name;
+	quick_string.len = strlen(quick_string.name);
+	quick_string.hash = 0;
+	path.dentry = d_alloc_pseudo(sb, &quick_string);
+	if (!path.dentry)
+		goto out_shm_unlock;
+
+	d_set_d_op(path.dentry, &anon_ops);
+	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
+	file = ERR_PTR(-ENOSPC);
+	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
+	if (!inode)
+		goto out_dentry;
+
+	file = ERR_PTR(-ENOMEM);
+	if (hugetlb_reserve_pages(inode, 0,
+			size >> huge_page_shift(hstate_inode(inode)), NULL,
+			acctflag))
+		goto out_inode;
+
+	d_instantiate(path.dentry, inode);
+	inode->i_size = size;
+	clear_nlink(inode);
+
+	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
+			&hugetlbfs_file_operations);
+	if (IS_ERR(file))
+		goto out_dentry; /* inode is already attached */
+
+	return file;
+
+out_inode:
+	iput(inode);
+out_dentry:
+	path_put(&path);
+out_shm_unlock:
+	if (*user) {
+		user_shm_unlock(size, *user);
+		*user = NULL;
+	}
+	return file;
+}
+
+static int __init init_hugetlbfs_fs(void)
+{
+	struct hstate *h;
+	int error;
+	int i;
+
+	if (!hugepages_supported()) {
+		pr_info("disabling because there are no supported hugepage sizes\n");
+		return -ENOTSUPP;
+	}
+
+	error = -ENOMEM;
+	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
+					sizeof(struct hugetlbfs_inode_info),
+					0, 0, init_once);
+	if (hugetlbfs_inode_cachep == NULL)
+		goto out2;
+
+	error = register_filesystem(&hugetlbfs_fs_type);
+	if (error)
+		goto out;
+
+	i = 0;
+	for_each_hstate(h) {
+		char buf[50];
+		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
+
+		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
+		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
+							buf);
+
+		if (IS_ERR(hugetlbfs_vfsmount[i])) {
+			pr_err("Cannot mount internal hugetlbfs for "
+				"page size %uK", ps_kb);
+			error = PTR_ERR(hugetlbfs_vfsmount[i]);
+			hugetlbfs_vfsmount[i] = NULL;
+		}
+		i++;
+	}
+	/* Non default hstates are optional */
+	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
+		return 0;
+
+ out:
+	kmem_cache_destroy(hugetlbfs_inode_cachep);
+ out2:
+	return error;
+}
+
+static void __exit exit_hugetlbfs_fs(void)
+{
+	struct hstate *h;
+	int i;
+
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(hugetlbfs_inode_cachep);
+	i = 0;
+	for_each_hstate(h)
+		kern_unmount(hugetlbfs_vfsmount[i++]);
+	unregister_filesystem(&hugetlbfs_fs_type);
+}
+
+module_init(init_hugetlbfs_fs)
+module_exit(exit_hugetlbfs_fs)
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/inode.c b/kernel/fs/inode.c
new file mode 100644
index 000000000..6e342cade
--- /dev/null
+++ b/kernel/fs/inode.c
@@ -0,0 +1,1977 @@
+/*
+ * (C) 1997 Linus Torvalds
+ * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
+ */
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+#include <linux/cdev.h>
+#include <linux/bootmem.h>
+#include <linux/fsnotify.h>
+#include <linux/mount.h>
+#include <linux/posix_acl.h>
+#include <linux/prefetch.h>
+#include <linux/buffer_head.h> /* for inode_has_buffers */
+#include <linux/ratelimit.h>
+#include <linux/list_lru.h>
+#include <trace/events/writeback.h>
+#include "internal.h"
+
+/*
+ * Inode locking rules:
+ *
+ * inode->i_lock protects:
+ *   inode->i_state, inode->i_hash, __iget()
+ * Inode LRU list locks protect:
+ *   inode->i_sb->s_inode_lru, inode->i_lru
+ * inode_sb_list_lock protects:
+ *   sb->s_inodes, inode->i_sb_list
+ * bdi->wb.list_lock protects:
+ *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
+ * inode_hash_lock protects:
+ *   inode_hashtable, inode->i_hash
+ *
+ * Lock ordering:
+ *
+ * inode_sb_list_lock
+ *   inode->i_lock
+ *     Inode LRU list locks
+ *
+ * bdi->wb.list_lock
+ *   inode->i_lock
+ *
+ * inode_hash_lock
+ *   inode_sb_list_lock
+ *   inode->i_lock
+ *
+ * iunique_lock
+ *   inode_hash_lock
+ */
+
+static unsigned int i_hash_mask __read_mostly;
+static unsigned int i_hash_shift __read_mostly;
+static struct hlist_head *inode_hashtable __read_mostly;
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+
+/*
+ * Empty aops. Can be used for the cases where the user does not
+ * define any of the address_space operations.
+ */
+const struct address_space_operations empty_aops = {
+};
+EXPORT_SYMBOL(empty_aops);
+
+/*
+ * Statistics gathering..
+ */
+struct inodes_stat_t inodes_stat;
+
+static DEFINE_PER_CPU(unsigned long, nr_inodes);
+static DEFINE_PER_CPU(unsigned long, nr_unused);
+
+static struct kmem_cache *inode_cachep __read_mostly;
+
+static long get_nr_inodes(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static inline long get_nr_inodes_unused(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_unused, i);
+	return sum < 0 ? 0 : sum;
+}
+
+long get_nr_dirty_inodes(void)
+{
+	/* not actually dirty inodes, but a wild approximation */
+	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+	return nr_dirty > 0 ? nr_dirty : 0;
+}
+
+/*
+ * Handle nr_inode sysctl
+ */
+#ifdef CONFIG_SYSCTL
+int proc_nr_inodes(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	inodes_stat.nr_inodes = get_nr_inodes();
+	inodes_stat.nr_unused = get_nr_inodes_unused();
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
+static int no_open(struct inode *inode, struct file *file)
+{
+	return -ENXIO;
+}
+
+/**
+ * inode_init_always - perform inode structure intialisation
+ * @sb: superblock inode belongs to
+ * @inode: inode to initialise
+ *
+ * These are initializations that need to be done on every inode
+ * allocation as the fields are not initialised by slab allocation.
+ */
+int inode_init_always(struct super_block *sb, struct inode *inode)
+{
+	static const struct inode_operations empty_iops;
+	static const struct file_operations no_open_fops = {.open = no_open};
+	struct address_space *const mapping = &inode->i_data;
+
+	inode->i_sb = sb;
+	inode->i_blkbits = sb->s_blocksize_bits;
+	inode->i_flags = 0;
+	atomic_set(&inode->i_count, 1);
+	inode->i_op = &empty_iops;
+	inode->i_fop = &no_open_fops;
+	inode->__i_nlink = 1;
+	inode->i_opflags = 0;
+	i_uid_write(inode, 0);
+	i_gid_write(inode, 0);
+	atomic_set(&inode->i_writecount, 0);
+	inode->i_size = 0;
+	inode->i_blocks = 0;
+	inode->i_bytes = 0;
+	inode->i_generation = 0;
+	inode->i_pipe = NULL;
+	inode->i_bdev = NULL;
+	inode->i_cdev = NULL;
+	inode->i_rdev = 0;
+	inode->dirtied_when = 0;
+
+	if (security_inode_alloc(inode))
+		goto out;
+	spin_lock_init(&inode->i_lock);
+	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
+
+	mutex_init(&inode->i_mutex);
+	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+
+	atomic_set(&inode->i_dio_count, 0);
+
+	mapping->a_ops = &empty_aops;
+	mapping->host = inode;
+	mapping->flags = 0;
+	atomic_set(&mapping->i_mmap_writable, 0);
+	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
+	mapping->private_data = NULL;
+	mapping->writeback_index = 0;
+	inode->i_private = NULL;
+	inode->i_mapping = mapping;
+	INIT_HLIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
+#ifdef CONFIG_FS_POSIX_ACL
+	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+#endif
+
+#ifdef CONFIG_FSNOTIFY
+	inode->i_fsnotify_mask = 0;
+#endif
+	inode->i_flctx = NULL;
+	this_cpu_inc(nr_inodes);
+
+	return 0;
+out:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(inode_init_always);
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+	struct inode *inode;
+
+	if (sb->s_op->alloc_inode)
+		inode = sb->s_op->alloc_inode(sb);
+	else
+		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
+
+	if (!inode)
+		return NULL;
+
+	if (unlikely(inode_init_always(sb, inode))) {
+		if (inode->i_sb->s_op->destroy_inode)
+			inode->i_sb->s_op->destroy_inode(inode);
+		else
+			kmem_cache_free(inode_cachep, inode);
+		return NULL;
+	}
+
+	return inode;
+}
+
+void free_inode_nonrcu(struct inode *inode)
+{
+	kmem_cache_free(inode_cachep, inode);
+}
+EXPORT_SYMBOL(free_inode_nonrcu);
+
+void __destroy_inode(struct inode *inode)
+{
+	BUG_ON(inode_has_buffers(inode));
+	security_inode_free(inode);
+	fsnotify_inode_delete(inode);
+	locks_free_lock_context(inode->i_flctx);
+	if (!inode->i_nlink) {
+		WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
+		atomic_long_dec(&inode->i_sb->s_remove_count);
+	}
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
+		posix_acl_release(inode->i_acl);
+	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
+		posix_acl_release(inode->i_default_acl);
+#endif
+	this_cpu_dec(nr_inodes);
+}
+EXPORT_SYMBOL(__destroy_inode);
+
+static void i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(inode_cachep, inode);
+}
+
+static void destroy_inode(struct inode *inode)
+{
+	BUG_ON(!list_empty(&inode->i_lru));
+	__destroy_inode(inode);
+	if (inode->i_sb->s_op->destroy_inode)
+		inode->i_sb->s_op->destroy_inode(inode);
+	else
+		call_rcu(&inode->i_rcu, i_callback);
+}
+
+/**
+ * drop_nlink - directly drop an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  In cases
+ * where we are attempting to track writes to the
+ * filesystem, a decrement to zero means an imminent
+ * write when the file is truncated and actually unlinked
+ * on the filesystem.
+ */
+void drop_nlink(struct inode *inode)
+{
+	WARN_ON(inode->i_nlink == 0);
+	inode->__i_nlink--;
+	if (!inode->i_nlink)
+		atomic_long_inc(&inode->i_sb->s_remove_count);
+}
+EXPORT_SYMBOL(drop_nlink);
+
+/**
+ * clear_nlink - directly zero an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  See
+ * drop_nlink() for why we care about i_nlink hitting zero.
+ */
+void clear_nlink(struct inode *inode)
+{
+	if (inode->i_nlink) {
+		inode->__i_nlink = 0;
+		atomic_long_inc(&inode->i_sb->s_remove_count);
+	}
+}
+EXPORT_SYMBOL(clear_nlink);
+
+/**
+ * set_nlink - directly set an inode's link count
+ * @inode: inode
+ * @nlink: new nlink (should be non-zero)
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.
+ */
+void set_nlink(struct inode *inode, unsigned int nlink)
+{
+	if (!nlink) {
+		clear_nlink(inode);
+	} else {
+		/* Yes, some filesystems do change nlink from zero to one */
+		if (inode->i_nlink == 0)
+			atomic_long_dec(&inode->i_sb->s_remove_count);
+
+		inode->__i_nlink = nlink;
+	}
+}
+EXPORT_SYMBOL(set_nlink);
+
+/**
+ * inc_nlink - directly increment an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  Currently,
+ * it is only here for parity with dec_nlink().
+ */
+void inc_nlink(struct inode *inode)
+{
+	if (unlikely(inode->i_nlink == 0)) {
+		WARN_ON(!(inode->i_state & I_LINKABLE));
+		atomic_long_dec(&inode->i_sb->s_remove_count);
+	}
+
+	inode->__i_nlink++;
+}
+EXPORT_SYMBOL(inc_nlink);
+
+void address_space_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	init_rwsem(&mapping->i_mmap_rwsem);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	mapping->i_mmap = RB_ROOT;
+}
+EXPORT_SYMBOL(address_space_init_once);
+
+/*
+ * These are initializations that only need to be done
+ * once, because the fields are idempotent across use
+ * of the inode, so let the slab aware of that.
+ */
+void inode_init_once(struct inode *inode)
+{
+	memset(inode, 0, sizeof(*inode));
+	INIT_HLIST_NODE(&inode->i_hash);
+	INIT_LIST_HEAD(&inode->i_devices);
+	INIT_LIST_HEAD(&inode->i_wb_list);
+	INIT_LIST_HEAD(&inode->i_lru);
+	address_space_init_once(&inode->i_data);
+	i_size_ordered_init(inode);
+#ifdef CONFIG_FSNOTIFY
+	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
+#endif
+}
+EXPORT_SYMBOL(inode_init_once);
+
+static void init_once(void *foo)
+{
+	struct inode *inode = (struct inode *) foo;
+
+	inode_init_once(inode);
+}
+
+/*
+ * inode->i_lock must be held
+ */
+void __iget(struct inode *inode)
+{
+	atomic_inc(&inode->i_count);
+}
+
+/*
+ * get additional reference to inode; caller must already hold one.
+ */
+void ihold(struct inode *inode)
+{
+	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+
+static void inode_lru_list_add(struct inode *inode)
+{
+	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
+		this_cpu_inc(nr_unused);
+}
+
+/*
+ * Add inode to LRU if needed (inode is unused and clean).
+ *
+ * Needs inode->i_lock held.
+ */
+void inode_add_lru(struct inode *inode)
+{
+	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
+				I_FREEING | I_WILL_FREE)) &&
+	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
+		inode_lru_list_add(inode);
+}
+
+
+static void inode_lru_list_del(struct inode *inode)
+{
+
+	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
+		this_cpu_dec(nr_unused);
+}
+
+/**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ */
+void inode_sb_list_add(struct inode *inode)
+{
+	spin_lock(&inode_sb_list_lock);
+	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+	spin_unlock(&inode_sb_list_lock);
+}
+EXPORT_SYMBOL_GPL(inode_sb_list_add);
+
+static inline void inode_sb_list_del(struct inode *inode)
+{
+	if (!list_empty(&inode->i_sb_list)) {
+		spin_lock(&inode_sb_list_lock);
+		list_del_init(&inode->i_sb_list);
+		spin_unlock(&inode_sb_list_lock);
+	}
+}
+
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+	unsigned long tmp;
+
+	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+			L1_CACHE_BYTES;
+	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
+	return tmp & i_hash_mask;
+}
+
+/**
+ *	__insert_inode_hash - hash an inode
+ *	@inode: unhashed inode
+ *	@hashval: unsigned long value used to locate this object in the
+ *		inode_hashtable.
+ *
+ *	Add an inode to the inode hash for this superblock.
+ */
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+
+	spin_lock(&inode_hash_lock);
+	spin_lock(&inode->i_lock);
+	hlist_add_head(&inode->i_hash, b);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_hash_lock);
+}
+EXPORT_SYMBOL(__insert_inode_hash);
+
+/**
+ *	__remove_inode_hash - remove an inode from the hash
+ *	@inode: inode to unhash
+ *
+ *	Remove an inode from the superblock.
+ */
+void __remove_inode_hash(struct inode *inode)
+{
+	spin_lock(&inode_hash_lock);
+	spin_lock(&inode->i_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_hash_lock);
+}
+EXPORT_SYMBOL(__remove_inode_hash);
+
+void clear_inode(struct inode *inode)
+{
+	might_sleep();
+	/*
+	 * We have to cycle tree_lock here because reclaim can be still in the
+	 * process of removing the last page (in __delete_from_page_cache())
+	 * and we must not free mapping under it.
+	 */
+	spin_lock_irq(&inode->i_data.tree_lock);
+	BUG_ON(inode->i_data.nrpages);
+	BUG_ON(inode->i_data.nrshadows);
+	spin_unlock_irq(&inode->i_data.tree_lock);
+	BUG_ON(!list_empty(&inode->i_data.private_list));
+	BUG_ON(!(inode->i_state & I_FREEING));
+	BUG_ON(inode->i_state & I_CLEAR);
+	/* don't need i_lock here, no concurrent mods to i_state */
+	inode->i_state = I_FREEING | I_CLEAR;
+}
+EXPORT_SYMBOL(clear_inode);
+
+/*
+ * Free the inode passed in, removing it from the lists it is still connected
+ * to. We remove any pages still attached to the inode and wait for any IO that
+ * is still in progress before finally destroying the inode.
+ *
+ * An inode must already be marked I_FREEING so that we avoid the inode being
+ * moved back onto lists if we race with other code that manipulates the lists
+ * (e.g. writeback_single_inode). The caller is responsible for setting this.
+ *
+ * An inode must already be removed from the LRU list before being evicted from
+ * the cache. This should occur atomically with setting the I_FREEING state
+ * flag, so no inodes here should ever be on the LRU when being evicted.
+ */
+static void evict(struct inode *inode)
+{
+	const struct super_operations *op = inode->i_sb->s_op;
+
+	BUG_ON(!(inode->i_state & I_FREEING));
+	BUG_ON(!list_empty(&inode->i_lru));
+
+	if (!list_empty(&inode->i_wb_list))
+		inode_wb_list_del(inode);
+
+	inode_sb_list_del(inode);
+
+	/*
+	 * Wait for flusher thread to be done with the inode so that filesystem
+	 * does not start destroying it while writeback is still running. Since
+	 * the inode has I_FREEING set, flusher thread won't start new work on
+	 * the inode.  We just have to wait for running writeback to finish.
+	 */
+	inode_wait_for_writeback(inode);
+
+	if (op->evict_inode) {
+		op->evict_inode(inode);
+	} else {
+		truncate_inode_pages_final(&inode->i_data);
+		clear_inode(inode);
+	}
+	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
+		bd_forget(inode);
+	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
+		cd_forget(inode);
+
+	remove_inode_hash(inode);
+
+	spin_lock(&inode->i_lock);
+	wake_up_bit(&inode->i_state, __I_NEW);
+	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+	spin_unlock(&inode->i_lock);
+
+	destroy_inode(inode);
+}
+
+/*
+ * dispose_list - dispose of the contents of a local list
+ * @head: the head of the list to free
+ *
+ * Dispose-list gets a local list with local inodes in it, so it doesn't
+ * need to worry about list corruption and SMP locks.
+ */
+static void dispose_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct inode *inode;
+
+		inode = list_first_entry(head, struct inode, i_lru);
+		list_del_init(&inode->i_lru);
+
+		evict(inode);
+	}
+}
+
+/**
+ * evict_inodes	- evict all evictable inodes for a superblock
+ * @sb:		superblock to operate on
+ *
+ * Make sure that no inodes with zero refcount are retained.  This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
+ */
+void evict_inodes(struct super_block *sb)
+{
+	struct inode *inode, *next;
+	LIST_HEAD(dispose);
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+		if (atomic_read(&inode->i_count))
+			continue;
+
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
+		inode->i_state |= I_FREEING;
+		inode_lru_list_del(inode);
+		spin_unlock(&inode->i_lock);
+		list_add(&inode->i_lru, &dispose);
+	}
+	spin_unlock(&inode_sb_list_lock);
+
+	dispose_list(&dispose);
+}
+
+/**
+ * invalidate_inodes	- attempt to free all inodes on a superblock
+ * @sb:		superblock to operate on
+ * @kill_dirty: flag to guide handling of dirty inodes
+ *
+ * Attempts to free all inodes for a given superblock.  If there were any
+ * busy inodes return a non-zero value, else zero.
+ * If @kill_dirty is set, discard dirty inodes too, otherwise treat
+ * them as busy.
+ */
+int invalidate_inodes(struct super_block *sb, bool kill_dirty)
+{
+	int busy = 0;
+	struct inode *inode, *next;
+	LIST_HEAD(dispose);
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
+			spin_unlock(&inode->i_lock);
+			busy = 1;
+			continue;
+		}
+		if (atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
+			busy = 1;
+			continue;
+		}
+
+		inode->i_state |= I_FREEING;
+		inode_lru_list_del(inode);
+		spin_unlock(&inode->i_lock);
+		list_add(&inode->i_lru, &dispose);
+	}
+	spin_unlock(&inode_sb_list_lock);
+
+	dispose_list(&dispose);
+
+	return busy;
+}
+
+/*
+ * Isolate the inode from the LRU in preparation for freeing it.
+ *
+ * Any inodes which are pinned purely because of attached pagecache have their
+ * pagecache removed.  If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
+ *
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
+ */
+static enum lru_status inode_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct inode	*inode = container_of(item, struct inode, i_lru);
+
+	/*
+	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&inode->i_lock))
+		return LRU_SKIP;
+
+	/*
+	 * Referenced or dirty inodes are still in use. Give them another pass
+	 * through the LRU as we canot reclaim them now.
+	 */
+	if (atomic_read(&inode->i_count) ||
+	    (inode->i_state & ~I_REFERENCED)) {
+		list_lru_isolate(lru, &inode->i_lru);
+		spin_unlock(&inode->i_lock);
+		this_cpu_dec(nr_unused);
+		return LRU_REMOVED;
+	}
+
+	/* recently referenced inodes get one more pass */
+	if (inode->i_state & I_REFERENCED) {
+		inode->i_state &= ~I_REFERENCED;
+		spin_unlock(&inode->i_lock);
+		return LRU_ROTATE;
+	}
+
+	if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(lru_lock);
+		if (remove_inode_buffers(inode)) {
+			unsigned long reap;
+			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
+			if (current_is_kswapd())
+				__count_vm_events(KSWAPD_INODESTEAL, reap);
+			else
+				__count_vm_events(PGINODESTEAL, reap);
+			if (current->reclaim_state)
+				current->reclaim_state->reclaimed_slab += reap;
+		}
+		iput(inode);
+		spin_lock(lru_lock);
+		return LRU_RETRY;
+	}
+
+	WARN_ON(inode->i_state & I_NEW);
+	inode->i_state |= I_FREEING;
+	list_lru_isolate_move(lru, &inode->i_lru, freeable);
+	spin_unlock(&inode->i_lock);
+
+	this_cpu_dec(nr_unused);
+	return LRU_REMOVED;
+}
+
+/*
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
+ */
+long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
+{
+	LIST_HEAD(freeable);
+	long freed;
+
+	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
+				     inode_lru_isolate, &freeable);
+	dispose_list(&freeable);
+	return freed;
+}
+
+static void __wait_on_freeing_inode(struct inode *inode);
+/*
+ * Called with the inode lock held.
+ */
+static struct inode *find_inode(struct super_block *sb,
+				struct hlist_head *head,
+				int (*test)(struct inode *, void *),
+				void *data)
+{
+	struct inode *inode = NULL;
+
+repeat:
+	hlist_for_each_entry(inode, head, i_hash) {
+		if (inode->i_sb != sb)
+			continue;
+		if (!test(inode, data))
+			continue;
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		return inode;
+	}
+	return NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode *find_inode_fast(struct super_block *sb,
+				struct hlist_head *head, unsigned long ino)
+{
+	struct inode *inode = NULL;
+
+repeat:
+	hlist_for_each_entry(inode, head, i_hash) {
+		if (inode->i_ino != ino)
+			continue;
+		if (inode->i_sb != sb)
+			continue;
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		return inode;
+	}
+	return NULL;
+}
+
+/*
+ * Each cpu owns a range of LAST_INO_BATCH numbers.
+ * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
+ * to renew the exhausted range.
+ *
+ * This does not significantly increase overflow rate because every CPU can
+ * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
+ * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
+ * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
+ * overflow rate by 2x, which does not seem too significant.
+ *
+ * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+ * error if st_ino won't fit in target struct field. Use 32bit counter
+ * here to attempt to avoid that.
+ */
+#define LAST_INO_BATCH 1024
+static DEFINE_PER_CPU(unsigned int, last_ino);
+
+unsigned int get_next_ino(void)
+{
+	unsigned int *p = &get_cpu_var(last_ino);
+	unsigned int res = *p;
+
+#ifdef CONFIG_SMP
+	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
+		static atomic_t shared_last_ino;
+		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+
+		res = next - LAST_INO_BATCH;
+	}
+#endif
+
+	*p = ++res;
+	put_cpu_var(last_ino);
+	return res;
+}
+EXPORT_SYMBOL(get_next_ino);
+
+/**
+ *	new_inode_pseudo 	- obtain an inode
+ *	@sb: superblock
+ *
+ *	Allocates a new inode for given superblock.
+ *	Inode wont be chained in superblock s_inodes list
+ *	This means :
+ *	- fs can't be unmount
+ *	- quotas, fsnotify, writeback can't work
+ */
+struct inode *new_inode_pseudo(struct super_block *sb)
+{
+	struct inode *inode = alloc_inode(sb);
+
+	if (inode) {
+		spin_lock(&inode->i_lock);
+		inode->i_state = 0;
+		spin_unlock(&inode->i_lock);
+		INIT_LIST_HEAD(&inode->i_sb_list);
+	}
+	return inode;
+}
+
+/**
+ *	new_inode 	- obtain an inode
+ *	@sb: superblock
+ *
+ *	Allocates a new inode for given superblock. The default gfp_mask
+ *	for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
+ *	If HIGHMEM pages are unsuitable or it is known that pages allocated
+ *	for the page cache are not reclaimable or migratable,
+ *	mapping_set_gfp_mask() must be called with suitable flags on the
+ *	newly created inode's mapping
+ *
+ */
+struct inode *new_inode(struct super_block *sb)
+{
+	struct inode *inode;
+
+	spin_lock_prefetch(&inode_sb_list_lock);
+
+	inode = new_inode_pseudo(sb);
+	if (inode)
+		inode_sb_list_add(inode);
+	return inode;
+}
+EXPORT_SYMBOL(new_inode);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void lockdep_annotate_inode_mutex_key(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode)) {
+		struct file_system_type *type = inode->i_sb->s_type;
+
+		/* Set new key only if filesystem hasn't already changed it */
+		if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) {
+			/*
+			 * ensure nobody is actually holding i_mutex
+			 */
+			mutex_destroy(&inode->i_mutex);
+			mutex_init(&inode->i_mutex);
+			lockdep_set_class(&inode->i_mutex,
+					  &type->i_mutex_dir_key);
+		}
+	}
+}
+EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
+#endif
+
+/**
+ * unlock_new_inode - clear the I_NEW state and wake up any waiters
+ * @inode:	new inode to unlock
+ *
+ * Called when the inode is fully initialised to clear the new state of the
+ * inode and wake up anyone waiting for the inode to finish initialisation.
+ */
+void unlock_new_inode(struct inode *inode)
+{
+	lockdep_annotate_inode_mutex_key(inode);
+	spin_lock(&inode->i_lock);
+	WARN_ON(!(inode->i_state & I_NEW));
+	inode->i_state &= ~I_NEW;
+	smp_mb();
+	wake_up_bit(&inode->i_state, __I_NEW);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(unlock_new_inode);
+
+/**
+ * lock_two_nondirectories - take two i_mutexes on non-directory objects
+ *
+ * Lock any non-NULL argument that is not a directory.
+ * Zero, one or two objects may be locked by this function.
+ *
+ * @inode1: first inode to lock
+ * @inode2: second inode to lock
+ */
+void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
+{
+	if (inode1 > inode2)
+		swap(inode1, inode2);
+
+	if (inode1 && !S_ISDIR(inode1->i_mode))
+		mutex_lock(&inode1->i_mutex);
+	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
+}
+EXPORT_SYMBOL(lock_two_nondirectories);
+
+/**
+ * unlock_two_nondirectories - release locks from lock_two_nondirectories()
+ * @inode1: first inode to unlock
+ * @inode2: second inode to unlock
+ */
+void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
+{
+	if (inode1 && !S_ISDIR(inode1->i_mode))
+		mutex_unlock(&inode1->i_mutex);
+	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
+		mutex_unlock(&inode2->i_mutex);
+}
+EXPORT_SYMBOL(unlock_two_nondirectories);
+
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb:		super block of file system
+ * @hashval:	hash value (usually inode number) to get
+ * @test:	callback used for comparisons between inodes
+ * @set:	callback used to initialize a new struct inode
+ * @data:	opaque data pointer to pass to @test and @set
+ *
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if present it is return it with an increased reference count. This is
+ * a generalized version of iget_locked() for file systems where the inode
+ * number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is not in cache, allocate a new inode and return it locked,
+ * hashed, and with the I_NEW flag set. The file system gets to fill it in
+ * before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_hash_lock held, so can't
+ * sleep.
+ */
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+		int (*test)(struct inode *, void *),
+		int (*set)(struct inode *, void *), void *data)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode;
+
+	spin_lock(&inode_hash_lock);
+	inode = find_inode(sb, head, test, data);
+	spin_unlock(&inode_hash_lock);
+
+	if (inode) {
+		wait_on_inode(inode);
+		return inode;
+	}
+
+	inode = alloc_inode(sb);
+	if (inode) {
+		struct inode *old;
+
+		spin_lock(&inode_hash_lock);
+		/* We released the lock, so.. */
+		old = find_inode(sb, head, test, data);
+		if (!old) {
+			if (set(inode, data))
+				goto set_failed;
+
+			spin_lock(&inode->i_lock);
+			inode->i_state = I_NEW;
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
+			inode_sb_list_add(inode);
+			spin_unlock(&inode_hash_lock);
+
+			/* Return the locked inode with I_NEW set, the
+			 * caller is responsible for filling in the contents
+			 */
+			return inode;
+		}
+
+		/*
+		 * Uhhuh, somebody else created the same inode under
+		 * us. Use the old inode instead of the one we just
+		 * allocated.
+		 */
+		spin_unlock(&inode_hash_lock);
+		destroy_inode(inode);
+		inode = old;
+		wait_on_inode(inode);
+	}
+	return inode;
+
+set_failed:
+	spin_unlock(&inode_hash_lock);
+	destroy_inode(inode);
+	return NULL;
+}
+EXPORT_SYMBOL(iget5_locked);
+
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb:		super block of file system
+ * @ino:	inode number to get
+ *
+ * Search for the inode specified by @ino in the inode cache and if present
+ * return it with an increased reference count. This is for file systems
+ * where the inode number is sufficient for unique identification of an inode.
+ *
+ * If the inode is not in cache, allocate a new inode and return it locked,
+ * hashed, and with the I_NEW flag set.  The file system gets to fill it in
+ * before unlocking it via unlock_new_inode().
+ */
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, ino);
+	struct inode *inode;
+
+	spin_lock(&inode_hash_lock);
+	inode = find_inode_fast(sb, head, ino);
+	spin_unlock(&inode_hash_lock);
+	if (inode) {
+		wait_on_inode(inode);
+		return inode;
+	}
+
+	inode = alloc_inode(sb);
+	if (inode) {
+		struct inode *old;
+
+		spin_lock(&inode_hash_lock);
+		/* We released the lock, so.. */
+		old = find_inode_fast(sb, head, ino);
+		if (!old) {
+			inode->i_ino = ino;
+			spin_lock(&inode->i_lock);
+			inode->i_state = I_NEW;
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
+			inode_sb_list_add(inode);
+			spin_unlock(&inode_hash_lock);
+
+			/* Return the locked inode with I_NEW set, the
+			 * caller is responsible for filling in the contents
+			 */
+			return inode;
+		}
+
+		/*
+		 * Uhhuh, somebody else created the same inode under
+		 * us. Use the old inode instead of the one we just
+		 * allocated.
+		 */
+		spin_unlock(&inode_hash_lock);
+		destroy_inode(inode);
+		inode = old;
+		wait_on_inode(inode);
+	}
+	return inode;
+}
+EXPORT_SYMBOL(iget_locked);
+
+/*
+ * search the inode cache for a matching inode number.
+ * If we find one, then the inode number we are trying to
+ * allocate is not unique and so we should not use it.
+ *
+ * Returns 1 if the inode number is unique, 0 if it is not.
+ */
+static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+{
+	struct hlist_head *b = inode_hashtable + hash(sb, ino);
+	struct inode *inode;
+
+	spin_lock(&inode_hash_lock);
+	hlist_for_each_entry(inode, b, i_hash) {
+		if (inode->i_ino == ino && inode->i_sb == sb) {
+			spin_unlock(&inode_hash_lock);
+			return 0;
+		}
+	}
+	spin_unlock(&inode_hash_lock);
+
+	return 1;
+}
+
+/**
+ *	iunique - get a unique inode number
+ *	@sb: superblock
+ *	@max_reserved: highest reserved inode number
+ *
+ *	Obtain an inode number that is unique on the system for a given
+ *	superblock. This is used by file systems that have no natural
+ *	permanent inode numbering system. An inode number is returned that
+ *	is higher than the reserved limit but unique.
+ *
+ *	BUGS:
+ *	With a large number of inodes live on the file system this function
+ *	currently becomes quite slow.
+ */
+ino_t iunique(struct super_block *sb, ino_t max_reserved)
+{
+	/*
+	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+	 * error if st_ino won't fit in target struct field. Use 32bit counter
+	 * here to attempt to avoid that.
+	 */
+	static DEFINE_SPINLOCK(iunique_lock);
+	static unsigned int counter;
+	ino_t res;
+
+	spin_lock(&iunique_lock);
+	do {
+		if (counter <= max_reserved)
+			counter = max_reserved + 1;
+		res = counter++;
+	} while (!test_inode_iunique(sb, res));
+	spin_unlock(&iunique_lock);
+
+	return res;
+}
+EXPORT_SYMBOL(iunique);
+
+struct inode *igrab(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+	} else {
+		spin_unlock(&inode->i_lock);
+		/*
+		 * Handle the case where s_op->clear_inode is not been
+		 * called yet, and somebody is calling igrab
+		 * while the inode is getting freed.
+		 */
+		inode = NULL;
+	}
+	return inode;
+}
+EXPORT_SYMBOL(igrab);
+
+/**
+ * ilookup5_nowait - search for an inode in the inode cache
+ * @sb:		super block of file system to search
+ * @hashval:	hash value (usually inode number) to search for
+ * @test:	callback used for comparisons between inodes
+ * @data:	opaque data pointer to pass to @test
+ *
+ * Search for the inode specified by @hashval and @data in the inode cache.
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Note: I_NEW is not waited upon so you have to be very careful what you do
+ * with the returned inode.  You probably should be using ilookup5() instead.
+ *
+ * Note2: @test is called with the inode_hash_lock held, so can't sleep.
+ */
+struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
+		int (*test)(struct inode *, void *), void *data)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode;
+
+	spin_lock(&inode_hash_lock);
+	inode = find_inode(sb, head, test, data);
+	spin_unlock(&inode_hash_lock);
+
+	return inode;
+}
+EXPORT_SYMBOL(ilookup5_nowait);
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb:		super block of file system to search
+ * @hashval:	hash value (usually inode number) to search for
+ * @test:	callback used for comparisons between inodes
+ * @data:	opaque data pointer to pass to @test
+ *
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if the inode is in the cache, return the inode with an incremented
+ * reference count.  Waits on I_NEW before returning the inode.
+ * returned with an incremented reference count.
+ *
+ * This is a generalized version of ilookup() for file systems where the
+ * inode number is not sufficient for unique identification of an inode.
+ *
+ * Note: @test is called with the inode_hash_lock held, so can't sleep.
+ */
+struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+		int (*test)(struct inode *, void *), void *data)
+{
+	struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
+
+	if (inode)
+		wait_on_inode(inode);
+	return inode;
+}
+EXPORT_SYMBOL(ilookup5);
+
+/**
+ * ilookup - search for an inode in the inode cache
+ * @sb:		super block of file system to search
+ * @ino:	inode number to search for
+ *
+ * Search for the inode @ino in the inode cache, and if the inode is in the
+ * cache, the inode is returned with an incremented reference count.
+ */
+struct inode *ilookup(struct super_block *sb, unsigned long ino)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, ino);
+	struct inode *inode;
+
+	spin_lock(&inode_hash_lock);
+	inode = find_inode_fast(sb, head, ino);
+	spin_unlock(&inode_hash_lock);
+
+	if (inode)
+		wait_on_inode(inode);
+	return inode;
+}
+EXPORT_SYMBOL(ilookup);
+
+/**
+ * find_inode_nowait - find an inode in the inode cache
+ * @sb:		super block of file system to search
+ * @hashval:	hash value (usually inode number) to search for
+ * @match:	callback used for comparisons between inodes
+ * @data:	opaque data pointer to pass to @match
+ *
+ * Search for the inode specified by @hashval and @data in the inode
+ * cache, where the helper function @match will return 0 if the inode
+ * does not match, 1 if the inode does match, and -1 if the search
+ * should be stopped.  The @match function must be responsible for
+ * taking the i_lock spin_lock and checking i_state for an inode being
+ * freed or being initialized, and incrementing the reference count
+ * before returning 1.  It also must not sleep, since it is called with
+ * the inode_hash_lock spinlock held.
+ *
+ * This is a even more generalized version of ilookup5() when the
+ * function must never block --- find_inode() can block in
+ * __wait_on_freeing_inode() --- or when the caller can not increment
+ * the reference count because the resulting iput() might cause an
+ * inode eviction.  The tradeoff is that the @match funtion must be
+ * very carefully implemented.
+ */
+struct inode *find_inode_nowait(struct super_block *sb,
+				unsigned long hashval,
+				int (*match)(struct inode *, unsigned long,
+					     void *),
+				void *data)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode, *ret_inode = NULL;
+	int mval;
+
+	spin_lock(&inode_hash_lock);
+	hlist_for_each_entry(inode, head, i_hash) {
+		if (inode->i_sb != sb)
+			continue;
+		mval = match(inode, hashval, data);
+		if (mval == 0)
+			continue;
+		if (mval == 1)
+			ret_inode = inode;
+		goto out;
+	}
+out:
+	spin_unlock(&inode_hash_lock);
+	return ret_inode;
+}
+EXPORT_SYMBOL(find_inode_nowait);
+
+int insert_inode_locked(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	ino_t ino = inode->i_ino;
+	struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+	while (1) {
+		struct inode *old = NULL;
+		spin_lock(&inode_hash_lock);
+		hlist_for_each_entry(old, head, i_hash) {
+			if (old->i_ino != ino)
+				continue;
+			if (old->i_sb != sb)
+				continue;
+			spin_lock(&old->i_lock);
+			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&old->i_lock);
+				continue;
+			}
+			break;
+		}
+		if (likely(!old)) {
+			spin_lock(&inode->i_lock);
+			inode->i_state |= I_NEW;
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
+			spin_unlock(&inode_hash_lock);
+			return 0;
+		}
+		__iget(old);
+		spin_unlock(&old->i_lock);
+		spin_unlock(&inode_hash_lock);
+		wait_on_inode(old);
+		if (unlikely(!inode_unhashed(old))) {
+			iput(old);
+			return -EBUSY;
+		}
+		iput(old);
+	}
+}
+EXPORT_SYMBOL(insert_inode_locked);
+
+int insert_inode_locked4(struct inode *inode, unsigned long hashval,
+		int (*test)(struct inode *, void *), void *data)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+	while (1) {
+		struct inode *old = NULL;
+
+		spin_lock(&inode_hash_lock);
+		hlist_for_each_entry(old, head, i_hash) {
+			if (old->i_sb != sb)
+				continue;
+			if (!test(old, data))
+				continue;
+			spin_lock(&old->i_lock);
+			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&old->i_lock);
+				continue;
+			}
+			break;
+		}
+		if (likely(!old)) {
+			spin_lock(&inode->i_lock);
+			inode->i_state |= I_NEW;
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
+			spin_unlock(&inode_hash_lock);
+			return 0;
+		}
+		__iget(old);
+		spin_unlock(&old->i_lock);
+		spin_unlock(&inode_hash_lock);
+		wait_on_inode(old);
+		if (unlikely(!inode_unhashed(old))) {
+			iput(old);
+			return -EBUSY;
+		}
+		iput(old);
+	}
+}
+EXPORT_SYMBOL(insert_inode_locked4);
+
+
+int generic_delete_inode(struct inode *inode)
+{
+	return 1;
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
+ *
+ * Call the FS "drop_inode()" function, defaulting to
+ * the legacy UNIX filesystem behaviour.  If it tells
+ * us to evict inode, do so.  Otherwise, retain inode
+ * in cache if fs is alive, sync and evict if fs is
+ * shutting down.
+ */
+static void iput_final(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	const struct super_operations *op = inode->i_sb->s_op;
+	int drop;
+
+	WARN_ON(inode->i_state & I_NEW);
+
+	if (op->drop_inode)
+		drop = op->drop_inode(inode);
+	else
+		drop = generic_drop_inode(inode);
+
+	if (!drop && (sb->s_flags & MS_ACTIVE)) {
+		inode->i_state |= I_REFERENCED;
+		inode_add_lru(inode);
+		spin_unlock(&inode->i_lock);
+		return;
+	}
+
+	if (!drop) {
+		inode->i_state |= I_WILL_FREE;
+		spin_unlock(&inode->i_lock);
+		write_inode_now(inode, 1);
+		spin_lock(&inode->i_lock);
+		WARN_ON(inode->i_state & I_NEW);
+		inode->i_state &= ~I_WILL_FREE;
+	}
+
+	inode->i_state |= I_FREEING;
+	if (!list_empty(&inode->i_lru))
+		inode_lru_list_del(inode);
+	spin_unlock(&inode->i_lock);
+
+	evict(inode);
+}
+
+/**
+ *	iput	- put an inode
+ *	@inode: inode to put
+ *
+ *	Puts an inode, dropping its usage count. If the inode use count hits
+ *	zero, the inode is then freed and may also be destroyed.
+ *
+ *	Consequently, iput() can sleep.
+ */
+void iput(struct inode *inode)
+{
+	if (!inode)
+		return;
+	BUG_ON(inode->i_state & I_CLEAR);
+retry:
+	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
+		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
+			atomic_inc(&inode->i_count);
+			inode->i_state &= ~I_DIRTY_TIME;
+			spin_unlock(&inode->i_lock);
+			trace_writeback_lazytime_iput(inode);
+			mark_inode_dirty_sync(inode);
+			goto retry;
+		}
+		iput_final(inode);
+	}
+}
+EXPORT_SYMBOL(iput);
+
+/**
+ *	bmap	- find a block number in a file
+ *	@inode: inode of file
+ *	@block: block to find
+ *
+ *	Returns the block number on the device holding the inode that
+ *	is the disk block number for the block of the file requested.
+ *	That is, asked for block 4 of inode 1 the function will return the
+ *	disk block relative to the disk start that holds that block of the
+ *	file.
+ */
+sector_t bmap(struct inode *inode, sector_t block)
+{
+	sector_t res = 0;
+	if (inode->i_mapping->a_ops->bmap)
+		res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+	return res;
+}
+EXPORT_SYMBOL(bmap);
+
+/*
+ * With relative atime, only update atime if the previous atime is
+ * earlier than either the ctime or mtime or if at least a day has
+ * passed since the last atime update.
+ */
+static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
+			     struct timespec now)
+{
+
+	if (!(mnt->mnt_flags & MNT_RELATIME))
+		return 1;
+	/*
+	 * Is mtime younger than atime? If yes, update atime:
+	 */
+	if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+		return 1;
+	/*
+	 * Is ctime younger than atime? If yes, update atime:
+	 */
+	if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
+		return 1;
+
+	/*
+	 * Is the previous atime value older than a day? If yes,
+	 * update atime:
+	 */
+	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+		return 1;
+	/*
+	 * Good, we can skip the atime update:
+	 */
+	return 0;
+}
+
+int generic_update_time(struct inode *inode, struct timespec *time, int flags)
+{
+	int iflags = I_DIRTY_TIME;
+
+	if (flags & S_ATIME)
+		inode->i_atime = *time;
+	if (flags & S_VERSION)
+		inode_inc_iversion(inode);
+	if (flags & S_CTIME)
+		inode->i_ctime = *time;
+	if (flags & S_MTIME)
+		inode->i_mtime = *time;
+
+	if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
+		iflags |= I_DIRTY_SYNC;
+	__mark_inode_dirty(inode, iflags);
+	return 0;
+}
+EXPORT_SYMBOL(generic_update_time);
+
+/*
+ * This does the actual work of updating an inodes time or version.  Must have
+ * had called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+	int (*update_time)(struct inode *, struct timespec *, int);
+
+	update_time = inode->i_op->update_time ? inode->i_op->update_time :
+		generic_update_time;
+
+	return update_time(inode, time, flags);
+}
+
+/**
+ *	touch_atime	-	update the access time
+ *	@path: the &struct path to update
+ *
+ *	Update the accessed time on an inode and mark it for writeback.
+ *	This function automatically handles read only file systems and media,
+ *	as well as the "noatime" flag and inode specific "noatime" markers.
+ */
+void touch_atime(const struct path *path)
+{
+	struct vfsmount *mnt = path->mnt;
+	struct inode *inode = d_inode(path->dentry);
+	struct timespec now;
+
+	if (inode->i_flags & S_NOATIME)
+		return;
+	if (IS_NOATIME(inode))
+		return;
+	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
+		return;
+
+	if (mnt->mnt_flags & MNT_NOATIME)
+		return;
+	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
+		return;
+
+	now = current_fs_time(inode->i_sb);
+
+	if (!relatime_need_update(mnt, inode, now))
+		return;
+
+	if (timespec_equal(&inode->i_atime, &now))
+		return;
+
+	if (!sb_start_write_trylock(inode->i_sb))
+		return;
+
+	if (__mnt_want_write(mnt))
+		goto skip_update;
+	/*
+	 * File systems can error out when updating inodes if they need to
+	 * allocate new space to modify an inode (such is the case for
+	 * Btrfs), but since we touch atime while walking down the path we
+	 * really don't care if we failed to update the atime of the file,
+	 * so just ignore the return value.
+	 * We may also fail on filesystems that have the ability to make parts
+	 * of the fs read only, e.g. subvolumes in Btrfs.
+	 */
+	update_time(inode, &now, S_ATIME);
+	__mnt_drop_write(mnt);
+skip_update:
+	sb_end_write(inode->i_sb);
+}
+EXPORT_SYMBOL(touch_atime);
+
+/*
+ * The logic we want is
+ *
+ *	if suid or (sgid and xgrp)
+ *		remove privs
+ */
+int should_remove_suid(struct dentry *dentry)
+{
+	umode_t mode = d_inode(dentry)->i_mode;
+	int kill = 0;
+
+	/* suid always must be killed */
+	if (unlikely(mode & S_ISUID))
+		kill = ATTR_KILL_SUID;
+
+	/*
+	 * sgid without any exec bits is just a mandatory locking mark; leave
+	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
+	 */
+	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+		kill |= ATTR_KILL_SGID;
+
+	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+		return kill;
+
+	return 0;
+}
+EXPORT_SYMBOL(should_remove_suid);
+
+static int __remove_suid(struct dentry *dentry, int kill)
+{
+	struct iattr newattrs;
+
+	newattrs.ia_valid = ATTR_FORCE | kill;
+	/*
+	 * Note we call this on write, so notify_change will not
+	 * encounter any conflicting delegations:
+	 */
+	return notify_change(dentry, &newattrs, NULL);
+}
+
+int file_remove_suid(struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = d_inode(dentry);
+	int killsuid;
+	int killpriv;
+	int error = 0;
+
+	/* Fast path for nothing security related */
+	if (IS_NOSEC(inode))
+		return 0;
+
+	killsuid = should_remove_suid(dentry);
+	killpriv = security_inode_need_killpriv(dentry);
+
+	if (killpriv < 0)
+		return killpriv;
+	if (killpriv)
+		error = security_inode_killpriv(dentry);
+	if (!error && killsuid)
+		error = __remove_suid(dentry, killsuid);
+	if (!error)
+		inode_has_no_xattr(inode);
+
+	return error;
+}
+EXPORT_SYMBOL(file_remove_suid);
+
+/**
+ *	file_update_time	-	update mtime and ctime time
+ *	@file: file accessed
+ *
+ *	Update the mtime and ctime members of an inode and mark the inode
+ *	for writeback.  Note that this function is meant exclusively for
+ *	usage in the file write path of filesystems, and filesystems may
+ *	choose to explicitly ignore update via this function with the
+ *	S_NOCMTIME inode flag, e.g. for network filesystem where these
+ *	timestamps are handled by the server.  This can return an error for
+ *	file systems who need to allocate space in order to update an inode.
+ */
+
+int file_update_time(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	struct timespec now;
+	int sync_it = 0;
+	int ret;
+
+	/* First try to exhaust all avenues to not sync */
+	if (IS_NOCMTIME(inode))
+		return 0;
+
+	now = current_fs_time(inode->i_sb);
+	if (!timespec_equal(&inode->i_mtime, &now))
+		sync_it = S_MTIME;
+
+	if (!timespec_equal(&inode->i_ctime, &now))
+		sync_it |= S_CTIME;
+
+	if (IS_I_VERSION(inode))
+		sync_it |= S_VERSION;
+
+	if (!sync_it)
+		return 0;
+
+	/* Finally allowed to write? Takes lock. */
+	if (__mnt_want_write_file(file))
+		return 0;
+
+	ret = update_time(inode, &now, sync_it);
+	__mnt_drop_write_file(file);
+
+	return ret;
+}
+EXPORT_SYMBOL(file_update_time);
+
+int inode_needs_sync(struct inode *inode)
+{
+	if (IS_SYNC(inode))
+		return 1;
+	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(inode_needs_sync);
+
+/*
+ * If we try to find an inode in the inode hash while it is being
+ * deleted, we have to wait until the filesystem completes its
+ * deletion before reporting that it isn't found.  This function waits
+ * until the deletion _might_ have completed.  Callers are responsible
+ * to recheck inode state.
+ *
+ * It doesn't matter if I_NEW is not set initially, a call to
+ * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
+ * will DTRT.
+ */
+static void __wait_on_freeing_inode(struct inode *inode)
+{
+	wait_queue_head_t *wq;
+	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+	wq = bit_waitqueue(&inode->i_state, __I_NEW);
+	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_hash_lock);
+	schedule();
+	finish_wait(wq, &wait.wait);
+	spin_lock(&inode_hash_lock);
+}
+
+static __initdata unsigned long ihash_entries;
+static int __init set_ihash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	ihash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("ihash_entries=", set_ihash_entries);
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
+void __init inode_init_early(void)
+{
+	unsigned int loop;
+
+	/* If hashes are distributed across NUMA nodes, defer
+	 * hash allocation until vmalloc space is available.
+	 */
+	if (hashdist)
+		return;
+
+	inode_hashtable =
+		alloc_large_system_hash("Inode-cache",
+					sizeof(struct hlist_head),
+					ihash_entries,
+					14,
+					HASH_EARLY,
+					&i_hash_shift,
+					&i_hash_mask,
+					0,
+					0);
+
+	for (loop = 0; loop < (1U << i_hash_shift); loop++)
+		INIT_HLIST_HEAD(&inode_hashtable[loop]);
+}
+
+void __init inode_init(void)
+{
+	unsigned int loop;
+
+	/* inode slab cache */
+	inode_cachep = kmem_cache_create("inode_cache",
+					 sizeof(struct inode),
+					 0,
+					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
+					 SLAB_MEM_SPREAD),
+					 init_once);
+
+	/* Hash may have been set up in inode_init_early */
+	if (!hashdist)
+		return;
+
+	inode_hashtable =
+		alloc_large_system_hash("Inode-cache",
+					sizeof(struct hlist_head),
+					ihash_entries,
+					14,
+					0,
+					&i_hash_shift,
+					&i_hash_mask,
+					0,
+					0);
+
+	for (loop = 0; loop < (1U << i_hash_shift); loop++)
+		INIT_HLIST_HEAD(&inode_hashtable[loop]);
+}
+
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+	inode->i_mode = mode;
+	if (S_ISCHR(mode)) {
+		inode->i_fop = &def_chr_fops;
+		inode->i_rdev = rdev;
+	} else if (S_ISBLK(mode)) {
+		inode->i_fop = &def_blk_fops;
+		inode->i_rdev = rdev;
+	} else if (S_ISFIFO(mode))
+		inode->i_fop = &pipefifo_fops;
+	else if (S_ISSOCK(mode))
+		;	/* leave it no_open_fops */
+	else
+		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
+				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
+				  inode->i_ino);
+}
+EXPORT_SYMBOL(init_special_inode);
+
+/**
+ * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
+ * @inode: New inode
+ * @dir: Directory inode
+ * @mode: mode of the new inode
+ */
+void inode_init_owner(struct inode *inode, const struct inode *dir,
+			umode_t mode)
+{
+	inode->i_uid = current_fsuid();
+	if (dir && dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+	inode->i_mode = mode;
+}
+EXPORT_SYMBOL(inode_init_owner);
+
+/**
+ * inode_owner_or_capable - check current task permissions to inode
+ * @inode: inode being checked
+ *
+ * Return true if current either has CAP_FOWNER in a namespace with the
+ * inode owner uid mapped, or owns the file.
+ */
+bool inode_owner_or_capable(const struct inode *inode)
+{
+	struct user_namespace *ns;
+
+	if (uid_eq(current_fsuid(), inode->i_uid))
+		return true;
+
+	ns = current_user_ns();
+	if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid))
+		return true;
+	return false;
+}
+EXPORT_SYMBOL(inode_owner_or_capable);
+
+/*
+ * Direct i/o helper functions
+ */
+static void __inode_dio_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+	do {
+		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&inode->i_dio_count))
+			schedule();
+	} while (atomic_read(&inode->i_dio_count));
+	finish_wait(wq, &q.wait);
+}
+
+/**
+ * inode_dio_wait - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+void inode_dio_wait(struct inode *inode)
+{
+	if (atomic_read(&inode->i_dio_count))
+		__inode_dio_wait(inode);
+}
+EXPORT_SYMBOL(inode_dio_wait);
+
+/*
+ * inode_set_flags - atomically set some inode flags
+ *
+ * Note: the caller should be holding i_mutex, or else be sure that
+ * they have exclusive access to the inode structure (i.e., while the
+ * inode is being instantiated).  The reason for the cmpxchg() loop
+ * --- which wouldn't be necessary if all code paths which modify
+ * i_flags actually followed this rule, is that there is at least one
+ * code path which doesn't today --- for example,
+ * __generic_file_aio_write() calls file_remove_suid() without holding
+ * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+ *
+ * In the long run, i_mutex is overkill, and we should probably look
+ * at using the i_lock spinlock to protect i_flags, and then make sure
+ * it is so documented in include/linux/fs.h and that all code follows
+ * the locking convention!!
+ */
+void inode_set_flags(struct inode *inode, unsigned int flags,
+		     unsigned int mask)
+{
+	unsigned int old_flags, new_flags;
+
+	WARN_ON_ONCE(flags & ~mask);
+	do {
+		old_flags = ACCESS_ONCE(inode->i_flags);
+		new_flags = (old_flags & ~mask) | flags;
+	} while (unlikely(cmpxchg(&inode->i_flags, old_flags,
+				  new_flags) != old_flags));
+}
+EXPORT_SYMBOL(inode_set_flags);
diff --git a/kernel/fs/internal.h b/kernel/fs/internal.h
new file mode 100644
index 000000000..01dce1d14
--- /dev/null
+++ b/kernel/fs/internal.h
@@ -0,0 +1,153 @@
+/* fs/ internal definitions
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+struct super_block;
+struct file_system_type;
+struct linux_binprm;
+struct path;
+struct mount;
+struct shrink_control;
+
+/*
+ * block_dev.c
+ */
+#ifdef CONFIG_BLOCK
+extern void __init bdev_cache_init(void);
+
+extern int __sync_blockdev(struct block_device *bdev, int wait);
+
+#else
+static inline void bdev_cache_init(void)
+{
+}
+
+static inline int __sync_blockdev(struct block_device *bdev, int wait)
+{
+	return 0;
+}
+#endif
+
+/*
+ * buffer.c
+ */
+extern void guard_bio_eod(int rw, struct bio *bio);
+
+/*
+ * char_dev.c
+ */
+extern void __init chrdev_init(void);
+
+/*
+ * namei.c
+ */
+extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
+extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
+			   const char *, unsigned int, struct path *);
+
+/*
+ * namespace.c
+ */
+extern int copy_mount_options(const void __user *, unsigned long *);
+extern char *copy_mount_string(const void __user *);
+
+extern struct vfsmount *lookup_mnt(struct path *);
+extern int finish_automount(struct vfsmount *, struct path *);
+
+extern int sb_prepare_remount_readonly(struct super_block *);
+
+extern void __init mnt_init(void);
+
+extern int __mnt_want_write(struct vfsmount *);
+extern int __mnt_want_write_file(struct file *);
+extern void __mnt_drop_write(struct vfsmount *);
+extern void __mnt_drop_write_file(struct file *);
+
+/*
+ * fs_struct.c
+ */
+extern void chroot_fs_refs(const struct path *, const struct path *);
+
+/*
+ * file_table.c
+ */
+extern struct file *get_empty_filp(void);
+
+/*
+ * super.c
+ */
+extern int do_remount_sb(struct super_block *, int, void *, int);
+extern bool trylock_super(struct super_block *sb);
+extern struct dentry *mount_fs(struct file_system_type *,
+			       int, const char *, void *);
+extern struct super_block *user_get_super(dev_t);
+
+/*
+ * open.c
+ */
+struct open_flags {
+	int open_flag;
+	umode_t mode;
+	int acc_mode;
+	int intent;
+	int lookup_flags;
+};
+extern struct file *do_filp_open(int dfd, struct filename *pathname,
+		const struct open_flags *op);
+extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
+		const char *, const struct open_flags *);
+
+extern long do_handle_open(int mountdirfd,
+			   struct file_handle __user *ufh, int open_flag);
+extern int open_check_o_direct(struct file *f);
+
+/*
+ * inode.c
+ */
+extern spinlock_t inode_sb_list_lock;
+extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
+extern void inode_add_lru(struct inode *inode);
+
+/*
+ * fs-writeback.c
+ */
+extern void inode_wb_list_del(struct inode *inode);
+
+extern long get_nr_dirty_inodes(void);
+extern void evict_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *, bool);
+
+/*
+ * dcache.c
+ */
+extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
+extern int d_set_mounted(struct dentry *dentry);
+extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
+
+/*
+ * read_write.c
+ */
+extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
+
+/*
+ * pipe.c
+ */
+extern const struct file_operations pipefifo_fops;
+
+/*
+ * fs_pin.c
+ */
+extern void group_pin_kill(struct hlist_head *p);
+extern void mnt_pin_kill(struct mount *m);
+
+/*
+ * fs/nsfs.c
+ */
+extern struct dentry_operations ns_dentry_operations;
diff --git a/kernel/fs/ioctl.c b/kernel/fs/ioctl.c
new file mode 100644
index 000000000..5d01d2638
--- /dev/null
+++ b/kernel/fs/ioctl.c
@@ -0,0 +1,625 @@
+/*
+ *  linux/fs/ioctl.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/capability.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/security.h>
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/falloc.h>
+
+#include <asm/ioctls.h>
+
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS	(UINT_MAX / sizeof(struct fiemap_extent))
+
+/**
+ * vfs_ioctl - call filesystem specific ioctl methods
+ * @filp:	open file to invoke ioctl method on
+ * @cmd:	ioctl command to execute
+ * @arg:	command-specific argument for ioctl
+ *
+ * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise
+ * returns -ENOTTY.
+ *
+ * Returns 0 on success, -errno on error.
+ */
+static long vfs_ioctl(struct file *filp, unsigned int cmd,
+		      unsigned long arg)
+{
+	int error = -ENOTTY;
+
+	if (!filp->f_op->unlocked_ioctl)
+		goto out;
+
+	error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
+	if (error == -ENOIOCTLCMD)
+		error = -ENOTTY;
+ out:
+	return error;
+}
+
+static int ioctl_fibmap(struct file *filp, int __user *p)
+{
+	struct address_space *mapping = filp->f_mapping;
+	int res, block;
+
+	/* do we support this mess? */
+	if (!mapping->a_ops->bmap)
+		return -EINVAL;
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+	res = get_user(block, p);
+	if (res)
+		return res;
+	res = mapping->a_ops->bmap(mapping, block);
+	return put_user(res, p);
+}
+
+/**
+ * fiemap_fill_next_extent - Fiemap helper function
+ * @fieinfo:	Fiemap context passed into ->fiemap
+ * @logical:	Extent logical start offset, in bytes
+ * @phys:	Extent physical start offset, in bytes
+ * @len:	Extent length, in bytes
+ * @flags:	FIEMAP_EXTENT flags that describe this extent
+ *
+ * Called from file system ->fiemap callback. Will populate extent
+ * info as passed in via arguments and copy to user memory. On
+ * success, extent count on fieinfo is incremented.
+ *
+ * Returns 0 on success, -errno on error, 1 if this was the last
+ * extent that will fit in user array.
+ */
+#define SET_UNKNOWN_FLAGS	(FIEMAP_EXTENT_DELALLOC)
+#define SET_NO_UNMOUNTED_IO_FLAGS	(FIEMAP_EXTENT_DATA_ENCRYPTED)
+#define SET_NOT_ALIGNED_FLAGS	(FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
+int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
+			    u64 phys, u64 len, u32 flags)
+{
+	struct fiemap_extent extent;
+	struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
+
+	/* only count the extents */
+	if (fieinfo->fi_extents_max == 0) {
+		fieinfo->fi_extents_mapped++;
+		return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+	}
+
+	if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
+		return 1;
+
+	if (flags & SET_UNKNOWN_FLAGS)
+		flags |= FIEMAP_EXTENT_UNKNOWN;
+	if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
+		flags |= FIEMAP_EXTENT_ENCODED;
+	if (flags & SET_NOT_ALIGNED_FLAGS)
+		flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+	memset(&extent, 0, sizeof(extent));
+	extent.fe_logical = logical;
+	extent.fe_physical = phys;
+	extent.fe_length = len;
+	extent.fe_flags = flags;
+
+	dest += fieinfo->fi_extents_mapped;
+	if (copy_to_user(dest, &extent, sizeof(extent)))
+		return -EFAULT;
+
+	fieinfo->fi_extents_mapped++;
+	if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
+		return 1;
+	return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+}
+EXPORT_SYMBOL(fiemap_fill_next_extent);
+
+/**
+ * fiemap_check_flags - check validity of requested flags for fiemap
+ * @fieinfo:	Fiemap context passed into ->fiemap
+ * @fs_flags:	Set of fiemap flags that the file system understands
+ *
+ * Called from file system ->fiemap callback. This will compute the
+ * intersection of valid fiemap flags and those that the fs supports. That
+ * value is then compared against the user supplied flags. In case of bad user
+ * flags, the invalid values will be written into the fieinfo structure, and
+ * -EBADR is returned, which tells ioctl_fiemap() to return those values to
+ * userspace. For this reason, a return code of -EBADR should be preserved.
+ *
+ * Returns 0 on success, -EBADR on bad flags.
+ */
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
+{
+	u32 incompat_flags;
+
+	incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
+	if (incompat_flags) {
+		fieinfo->fi_flags = incompat_flags;
+		return -EBADR;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(fiemap_check_flags);
+
+static int fiemap_check_ranges(struct super_block *sb,
+			       u64 start, u64 len, u64 *new_len)
+{
+	u64 maxbytes = (u64) sb->s_maxbytes;
+
+	*new_len = len;
+
+	if (len == 0)
+		return -EINVAL;
+
+	if (start > maxbytes)
+		return -EFBIG;
+
+	/*
+	 * Shrink request scope to what the fs can actually handle.
+	 */
+	if (len > maxbytes || (maxbytes - len) < start)
+		*new_len = maxbytes - start;
+
+	return 0;
+}
+
+static int ioctl_fiemap(struct file *filp, unsigned long arg)
+{
+	struct fiemap fiemap;
+	struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
+	struct fiemap_extent_info fieinfo = { 0, };
+	struct inode *inode = file_inode(filp);
+	struct super_block *sb = inode->i_sb;
+	u64 len;
+	int error;
+
+	if (!inode->i_op->fiemap)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
+		return -EFAULT;
+
+	if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+		return -EINVAL;
+
+	error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+				    &len);
+	if (error)
+		return error;
+
+	fieinfo.fi_flags = fiemap.fm_flags;
+	fieinfo.fi_extents_max = fiemap.fm_extent_count;
+	fieinfo.fi_extents_start = ufiemap->fm_extents;
+
+	if (fiemap.fm_extent_count != 0 &&
+	    !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+		return -EFAULT;
+
+	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(inode->i_mapping);
+
+	error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
+	fiemap.fm_flags = fieinfo.fi_flags;
+	fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+	if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
+		error = -EFAULT;
+
+	return error;
+}
+
+#ifdef CONFIG_BLOCK
+
+static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
+{
+	return (offset >> inode->i_blkbits);
+}
+
+static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
+{
+	return (blk << inode->i_blkbits);
+}
+
+/**
+ * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
+ * @inode: the inode to map
+ * @fieinfo: the fiemap info struct that will be passed back to userspace
+ * @start: where to start mapping in the inode
+ * @len: how much space to map
+ * @get_block: the fs's get_block function
+ *
+ * This does FIEMAP for block based inodes.  Basically it will just loop
+ * through get_block until we hit the number of extents we want to map, or we
+ * go past the end of the file and hit a hole.
+ *
+ * If it is possible to have data blocks beyond a hole past @inode->i_size, then
+ * please do not use this function, it will stop at the first unmapped block
+ * beyond i_size.
+ *
+ * If you use this function directly, you need to do your own locking. Use
+ * generic_block_fiemap if you want the locking done for you.
+ */
+
+int __generic_block_fiemap(struct inode *inode,
+			   struct fiemap_extent_info *fieinfo, loff_t start,
+			   loff_t len, get_block_t *get_block)
+{
+	struct buffer_head map_bh;
+	sector_t start_blk, last_blk;
+	loff_t isize = i_size_read(inode);
+	u64 logical = 0, phys = 0, size = 0;
+	u32 flags = FIEMAP_EXTENT_MERGED;
+	bool past_eof = false, whole_file = false;
+	int ret = 0;
+
+	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	/*
+	 * Either the i_mutex or other appropriate locking needs to be held
+	 * since we expect isize to not change at all through the duration of
+	 * this call.
+	 */
+	if (len >= isize) {
+		whole_file = true;
+		len = isize;
+	}
+
+	/*
+	 * Some filesystems can't deal with being asked to map less than
+	 * blocksize, so make sure our len is at least block length.
+	 */
+	if (logical_to_blk(inode, len) == 0)
+		len = blk_to_logical(inode, 1);
+
+	start_blk = logical_to_blk(inode, start);
+	last_blk = logical_to_blk(inode, start + len - 1);
+
+	do {
+		/*
+		 * we set b_size to the total size we want so it will map as
+		 * many contiguous blocks as possible at once
+		 */
+		memset(&map_bh, 0, sizeof(struct buffer_head));
+		map_bh.b_size = len;
+
+		ret = get_block(inode, start_blk, &map_bh, 0);
+		if (ret)
+			break;
+
+		/* HOLE */
+		if (!buffer_mapped(&map_bh)) {
+			start_blk++;
+
+			/*
+			 * We want to handle the case where there is an
+			 * allocated block at the front of the file, and then
+			 * nothing but holes up to the end of the file properly,
+			 * to make sure that extent at the front gets properly
+			 * marked with FIEMAP_EXTENT_LAST
+			 */
+			if (!past_eof &&
+			    blk_to_logical(inode, start_blk) >= isize)
+				past_eof = 1;
+
+			/*
+			 * First hole after going past the EOF, this is our
+			 * last extent
+			 */
+			if (past_eof && size) {
+				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+			} else if (size) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size, flags);
+				size = 0;
+			}
+
+			/* if we have holes up to/past EOF then we're done */
+			if (start_blk > last_blk || past_eof || ret)
+				break;
+		} else {
+			/*
+			 * We have gone over the length of what we wanted to
+			 * map, and it wasn't the entire file, so add the extent
+			 * we got last time and exit.
+			 *
+			 * This is for the case where say we want to map all the
+			 * way up to the second to the last block in a file, but
+			 * the last block is a hole, making the second to last
+			 * block FIEMAP_EXTENT_LAST.  In this case we want to
+			 * see if there is a hole after the second to last block
+			 * so we can mark it properly.  If we found data after
+			 * we exceeded the length we were requesting, then we
+			 * are good to go, just add the extent to the fieinfo
+			 * and break
+			 */
+			if (start_blk > last_blk && !whole_file) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				break;
+			}
+
+			/*
+			 * if size != 0 then we know we already have an extent
+			 * to add, so add it.
+			 */
+			if (size) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				if (ret)
+					break;
+			}
+
+			logical = blk_to_logical(inode, start_blk);
+			phys = blk_to_logical(inode, map_bh.b_blocknr);
+			size = map_bh.b_size;
+			flags = FIEMAP_EXTENT_MERGED;
+
+			start_blk += logical_to_blk(inode, size);
+
+			/*
+			 * If we are past the EOF, then we need to make sure as
+			 * soon as we find a hole that the last extent we found
+			 * is marked with FIEMAP_EXTENT_LAST
+			 */
+			if (!past_eof && logical + size >= isize)
+				past_eof = true;
+		}
+		cond_resched();
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+	} while (1);
+
+	/* If ret is 1 then we just hit the end of the extent array */
+	if (ret == 1)
+		ret = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL(__generic_block_fiemap);
+
+/**
+ * generic_block_fiemap - FIEMAP for block based inodes
+ * @inode: The inode to map
+ * @fieinfo: The mapping information
+ * @start: The initial block to map
+ * @len: The length of the extect to attempt to map
+ * @get_block: The block mapping function for the fs
+ *
+ * Calls __generic_block_fiemap to map the inode, after taking
+ * the inode's mutex lock.
+ */
+
+int generic_block_fiemap(struct inode *inode,
+			 struct fiemap_extent_info *fieinfo, u64 start,
+			 u64 len, get_block_t *get_block)
+{
+	int ret;
+	mutex_lock(&inode->i_mutex);
+	ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(generic_block_fiemap);
+
+#endif  /*  CONFIG_BLOCK  */
+
+/*
+ * This provides compatibility with legacy XFS pre-allocation ioctls
+ * which predate the fallocate syscall.
+ *
+ * Only the l_start, l_len and l_whence fields of the 'struct space_resv'
+ * are used here, rest are ignored.
+ */
+int ioctl_preallocate(struct file *filp, void __user *argp)
+{
+	struct inode *inode = file_inode(filp);
+	struct space_resv sr;
+
+	if (copy_from_user(&sr, argp, sizeof(sr)))
+		return -EFAULT;
+
+	switch (sr.l_whence) {
+	case SEEK_SET:
+		break;
+	case SEEK_CUR:
+		sr.l_start += filp->f_pos;
+		break;
+	case SEEK_END:
+		sr.l_start += i_size_read(inode);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
+}
+
+static int file_ioctl(struct file *filp, unsigned int cmd,
+		unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	int __user *p = (int __user *)arg;
+
+	switch (cmd) {
+	case FIBMAP:
+		return ioctl_fibmap(filp, p);
+	case FIONREAD:
+		return put_user(i_size_read(inode) - filp->f_pos, p);
+	case FS_IOC_RESVSP:
+	case FS_IOC_RESVSP64:
+		return ioctl_preallocate(filp, p);
+	}
+
+	return vfs_ioctl(filp, cmd, arg);
+}
+
+static int ioctl_fionbio(struct file *filp, int __user *argp)
+{
+	unsigned int flag;
+	int on, error;
+
+	error = get_user(on, argp);
+	if (error)
+		return error;
+	flag = O_NONBLOCK;
+#ifdef __sparc__
+	/* SunOS compatibility item. */
+	if (O_NONBLOCK != O_NDELAY)
+		flag |= O_NDELAY;
+#endif
+	spin_lock(&filp->f_lock);
+	if (on)
+		filp->f_flags |= flag;
+	else
+		filp->f_flags &= ~flag;
+	spin_unlock(&filp->f_lock);
+	return error;
+}
+
+static int ioctl_fioasync(unsigned int fd, struct file *filp,
+			  int __user *argp)
+{
+	unsigned int flag;
+	int on, error;
+
+	error = get_user(on, argp);
+	if (error)
+		return error;
+	flag = on ? FASYNC : 0;
+
+	/* Did FASYNC state change ? */
+	if ((flag ^ filp->f_flags) & FASYNC) {
+		if (filp->f_op->fasync)
+			/* fasync() adjusts filp->f_flags */
+			error = filp->f_op->fasync(fd, filp, on);
+		else
+			error = -ENOTTY;
+	}
+	return error < 0 ? error : 0;
+}
+
+static int ioctl_fsfreeze(struct file *filp)
+{
+	struct super_block *sb = file_inode(filp)->i_sb;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* If filesystem doesn't support freeze feature, return. */
+	if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL)
+		return -EOPNOTSUPP;
+
+	/* Freeze */
+	if (sb->s_op->freeze_super)
+		return sb->s_op->freeze_super(sb);
+	return freeze_super(sb);
+}
+
+static int ioctl_fsthaw(struct file *filp)
+{
+	struct super_block *sb = file_inode(filp)->i_sb;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Thaw */
+	if (sb->s_op->thaw_super)
+		return sb->s_op->thaw_super(sb);
+	return thaw_super(sb);
+}
+
+/*
+ * When you add any new common ioctls to the switches above and below
+ * please update compat_sys_ioctl() too.
+ *
+ * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
+ * It's just a simple helper for sys_ioctl and compat_sys_ioctl.
+ */
+int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
+	     unsigned long arg)
+{
+	int error = 0;
+	int __user *argp = (int __user *)arg;
+	struct inode *inode = file_inode(filp);
+
+	switch (cmd) {
+	case FIOCLEX:
+		set_close_on_exec(fd, 1);
+		break;
+
+	case FIONCLEX:
+		set_close_on_exec(fd, 0);
+		break;
+
+	case FIONBIO:
+		error = ioctl_fionbio(filp, argp);
+		break;
+
+	case FIOASYNC:
+		error = ioctl_fioasync(fd, filp, argp);
+		break;
+
+	case FIOQSIZE:
+		if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+		    S_ISLNK(inode->i_mode)) {
+			loff_t res = inode_get_bytes(inode);
+			error = copy_to_user(argp, &res, sizeof(res)) ?
+					-EFAULT : 0;
+		} else
+			error = -ENOTTY;
+		break;
+
+	case FIFREEZE:
+		error = ioctl_fsfreeze(filp);
+		break;
+
+	case FITHAW:
+		error = ioctl_fsthaw(filp);
+		break;
+
+	case FS_IOC_FIEMAP:
+		return ioctl_fiemap(filp, arg);
+
+	case FIGETBSZ:
+		return put_user(inode->i_sb->s_blocksize, argp);
+
+	default:
+		if (S_ISREG(inode->i_mode))
+			error = file_ioctl(filp, cmd, arg);
+		else
+			error = vfs_ioctl(filp, cmd, arg);
+		break;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
+{
+	int error;
+	struct fd f = fdget(fd);
+
+	if (!f.file)
+		return -EBADF;
+	error = security_file_ioctl(f.file, cmd, arg);
+	if (!error)
+		error = do_vfs_ioctl(f.file, fd, cmd, arg);
+	fdput(f);
+	return error;
+}
diff --git a/kernel/fs/isofs/Kconfig b/kernel/fs/isofs/Kconfig
new file mode 100644
index 000000000..8ab9878e3
--- /dev/null
+++ b/kernel/fs/isofs/Kconfig
@@ -0,0 +1,39 @@
+config ISO9660_FS
+	tristate "ISO 9660 CDROM file system support"
+	help
+	  This is the standard file system used on CD-ROMs.  It was previously
+	  known as "High Sierra File System" and is called "hsfs" on other
+	  Unix systems.  The so-called Rock-Ridge extensions which allow for
+	  long Unix filenames and symbolic links are also supported by this
+	  driver.  If you have a CD-ROM drive and want to do more with it than
+	  just listen to audio CDs and watch its LEDs, say Y (and read
+	  <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
+	  available from <http://www.tldp.org/docs.html#howto>), thereby
+	  enlarging your kernel by about 27 KB; otherwise say N.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called isofs.
+
+config JOLIET
+	bool "Microsoft Joliet CDROM extensions"
+	depends on ISO9660_FS
+	select NLS
+	help
+	  Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
+	  which allows for long filenames in unicode format (unicode is the
+	  new 16 bit character code, successor to ASCII, which encodes the
+	  characters of almost all languages of the world; see
+	  <http://www.unicode.org/> for more information).  Say Y here if you
+	  want to be able to read Joliet CD-ROMs under Linux.
+
+config ZISOFS
+	bool "Transparent decompression extension"
+	depends on ISO9660_FS
+	select ZLIB_INFLATE
+	help
+	  This is a Linux-specific extension to RockRidge which lets you store
+	  data in compressed form on a CD-ROM and have it transparently
+	  decompressed when the CD-ROM is accessed.  See
+	  <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
+	  necessary to create such a filesystem.  Say Y here if you want to be
+	  able to read such compressed CD-ROMs.
diff --git a/kernel/fs/isofs/Makefile b/kernel/fs/isofs/Makefile
new file mode 100644
index 000000000..bf162f094
--- /dev/null
+++ b/kernel/fs/isofs/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the Linux isofs filesystem routines.
+#
+
+obj-$(CONFIG_ISO9660_FS) += isofs.o
+
+isofs-objs-y 			:= namei.o inode.o dir.o util.o rock.o export.o
+isofs-objs-$(CONFIG_JOLIET)	+= joliet.o
+isofs-objs-$(CONFIG_ZISOFS)	+= compress.o
+isofs-objs			:= $(isofs-objs-y)
diff --git a/kernel/fs/isofs/compress.c b/kernel/fs/isofs/compress.c
new file mode 100644
index 000000000..f311bf084
--- /dev/null
+++ b/kernel/fs/isofs/compress.c
@@ -0,0 +1,380 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *   
+ *   Copyright 2001 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ *   USA; either version 2 of the License, or (at your option) any later
+ *   version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * linux/fs/isofs/compress.c
+ *
+ * Transparent decompression of files on an iso9660 filesystem
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/vmalloc.h>
+#include <linux/zlib.h>
+
+#include "isofs.h"
+#include "zisofs.h"
+
+/* This should probably be global. */
+static char zisofs_sink_page[PAGE_CACHE_SIZE];
+
+/*
+ * This contains the zlib memory allocation and the mutex for the
+ * allocation; this avoids failures at block-decompression time.
+ */
+static void *zisofs_zlib_workspace;
+static DEFINE_MUTEX(zisofs_zlib_lock);
+
+/*
+ * Read data of @inode from @block_start to @block_end and uncompress
+ * to one zisofs block. Store the data in the @pages array with @pcount
+ * entries. Start storing at offset @poffset of the first page.
+ */
+static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
+				      loff_t block_end, int pcount,
+				      struct page **pages, unsigned poffset,
+				      int *errp)
+{
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int bufsize = ISOFS_BUFFER_SIZE(inode);
+	unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
+	unsigned int bufmask = bufsize - 1;
+	int i, block_size = block_end - block_start;
+	z_stream stream = { .total_out = 0,
+			    .avail_in = 0,
+			    .avail_out = 0, };
+	int zerr;
+	int needblocks = (block_size + (block_start & bufmask) + bufmask)
+				>> bufshift;
+	int haveblocks;
+	blkcnt_t blocknum;
+	struct buffer_head *bhs[needblocks + 1];
+	int curbh, curpage;
+
+	if (block_size > deflateBound(1UL << zisofs_block_shift)) {
+		*errp = -EIO;
+		return 0;
+	}
+	/* Empty block? */
+	if (block_size == 0) {
+		for ( i = 0 ; i < pcount ; i++ ) {
+			if (!pages[i])
+				continue;
+			memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE);
+			flush_dcache_page(pages[i]);
+			SetPageUptodate(pages[i]);
+		}
+		return ((loff_t)pcount) << PAGE_CACHE_SHIFT;
+	}
+
+	/* Because zlib is not thread-safe, do all the I/O at the top. */
+	blocknum = block_start >> bufshift;
+	memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *));
+	haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
+	ll_rw_block(READ, haveblocks, bhs);
+
+	curbh = 0;
+	curpage = 0;
+	/*
+	 * First block is special since it may be fractional.  We also wait for
+	 * it before grabbing the zlib mutex; odds are that the subsequent
+	 * blocks are going to come in in short order so we don't hold the zlib
+	 * mutex longer than necessary.
+	 */
+
+	if (!bhs[0])
+		goto b_eio;
+
+	wait_on_buffer(bhs[0]);
+	if (!buffer_uptodate(bhs[0])) {
+		*errp = -EIO;
+		goto b_eio;
+	}
+
+	stream.workspace = zisofs_zlib_workspace;
+	mutex_lock(&zisofs_zlib_lock);
+		
+	zerr = zlib_inflateInit(&stream);
+	if (zerr != Z_OK) {
+		if (zerr == Z_MEM_ERROR)
+			*errp = -ENOMEM;
+		else
+			*errp = -EIO;
+		printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
+			       zerr);
+		goto z_eio;
+	}
+
+	while (curpage < pcount && curbh < haveblocks &&
+	       zerr != Z_STREAM_END) {
+		if (!stream.avail_out) {
+			if (pages[curpage]) {
+				stream.next_out = page_address(pages[curpage])
+						+ poffset;
+				stream.avail_out = PAGE_CACHE_SIZE - poffset;
+				poffset = 0;
+			} else {
+				stream.next_out = (void *)&zisofs_sink_page;
+				stream.avail_out = PAGE_CACHE_SIZE;
+			}
+		}
+		if (!stream.avail_in) {
+			wait_on_buffer(bhs[curbh]);
+			if (!buffer_uptodate(bhs[curbh])) {
+				*errp = -EIO;
+				break;
+			}
+			stream.next_in  = bhs[curbh]->b_data +
+						(block_start & bufmask);
+			stream.avail_in = min_t(unsigned, bufsize -
+						(block_start & bufmask),
+						block_size);
+			block_size -= stream.avail_in;
+			block_start = 0;
+		}
+
+		while (stream.avail_out && stream.avail_in) {
+			zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
+			if (zerr == Z_BUF_ERROR && stream.avail_in == 0)
+				break;
+			if (zerr == Z_STREAM_END)
+				break;
+			if (zerr != Z_OK) {
+				/* EOF, error, or trying to read beyond end of input */
+				if (zerr == Z_MEM_ERROR)
+					*errp = -ENOMEM;
+				else {
+					printk(KERN_DEBUG
+					       "zisofs: zisofs_inflate returned"
+					       " %d, inode = %lu,"
+					       " page idx = %d, bh idx = %d,"
+					       " avail_in = %ld,"
+					       " avail_out = %ld\n",
+					       zerr, inode->i_ino, curpage,
+					       curbh, stream.avail_in,
+					       stream.avail_out);
+					*errp = -EIO;
+				}
+				goto inflate_out;
+			}
+		}
+
+		if (!stream.avail_out) {
+			/* This page completed */
+			if (pages[curpage]) {
+				flush_dcache_page(pages[curpage]);
+				SetPageUptodate(pages[curpage]);
+			}
+			curpage++;
+		}
+		if (!stream.avail_in)
+			curbh++;
+	}
+inflate_out:
+	zlib_inflateEnd(&stream);
+
+z_eio:
+	mutex_unlock(&zisofs_zlib_lock);
+
+b_eio:
+	for (i = 0; i < haveblocks; i++)
+		brelse(bhs[i]);
+	return stream.total_out;
+}
+
+/*
+ * Uncompress data so that pages[full_page] is fully uptodate and possibly
+ * fills in other pages if we have data for them.
+ */
+static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
+			     struct page **pages)
+{
+	loff_t start_off, end_off;
+	loff_t block_start, block_end;
+	unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int blockptr;
+	loff_t poffset = 0;
+	blkcnt_t cstart_block, cend_block;
+	struct buffer_head *bh;
+	unsigned int blkbits = ISOFS_BUFFER_BITS(inode);
+	unsigned int blksize = 1 << blkbits;
+	int err;
+	loff_t ret;
+
+	BUG_ON(!pages[full_page]);
+
+	/*
+	 * We want to read at least 'full_page' page. Because we have to
+	 * uncompress the whole compression block anyway, fill the surrounding
+	 * pages with the data we have anyway...
+	 */
+	start_off = page_offset(pages[full_page]);
+	end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size);
+
+	cstart_block = start_off >> zisofs_block_shift;
+	cend_block = (end_off + (1 << zisofs_block_shift) - 1)
+			>> zisofs_block_shift;
+
+	WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) !=
+		((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK));
+
+	/* Find the pointer to this specific chunk */
+	/* Note: we're not using isonum_731() here because the data is known aligned */
+	/* Note: header_size is in 32-bit words (4 bytes) */
+	blockptr = (header_size + cstart_block) << 2;
+	bh = isofs_bread(inode, blockptr >> blkbits);
+	if (!bh)
+		return -EIO;
+	block_start = le32_to_cpu(*(__le32 *)
+				(bh->b_data + (blockptr & (blksize - 1))));
+
+	while (cstart_block < cend_block && pcount > 0) {
+		/* Load end of the compressed block in the file */
+		blockptr += 4;
+		/* Traversed to next block? */
+		if (!(blockptr & (blksize - 1))) {
+			brelse(bh);
+
+			bh = isofs_bread(inode, blockptr >> blkbits);
+			if (!bh)
+				return -EIO;
+		}
+		block_end = le32_to_cpu(*(__le32 *)
+				(bh->b_data + (blockptr & (blksize - 1))));
+		if (block_start > block_end) {
+			brelse(bh);
+			return -EIO;
+		}
+		err = 0;
+		ret = zisofs_uncompress_block(inode, block_start, block_end,
+					      pcount, pages, poffset, &err);
+		poffset += ret;
+		pages += poffset >> PAGE_CACHE_SHIFT;
+		pcount -= poffset >> PAGE_CACHE_SHIFT;
+		full_page -= poffset >> PAGE_CACHE_SHIFT;
+		poffset &= ~PAGE_CACHE_MASK;
+
+		if (err) {
+			brelse(bh);
+			/*
+			 * Did we finish reading the page we really wanted
+			 * to read?
+			 */
+			if (full_page < 0)
+				return 0;
+			return err;
+		}
+
+		block_start = block_end;
+		cstart_block++;
+	}
+
+	if (poffset && *pages) {
+		memset(page_address(*pages) + poffset, 0,
+		       PAGE_CACHE_SIZE - poffset);
+		flush_dcache_page(*pages);
+		SetPageUptodate(*pages);
+	}
+	return 0;
+}
+
+/*
+ * When decompressing, we typically obtain more than one page
+ * per reference.  We inject the additional pages into the page
+ * cache as a form of readahead.
+ */
+static int zisofs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	int err;
+	int i, pcount, full_page;
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int zisofs_pages_per_cblock =
+		PAGE_CACHE_SHIFT <= zisofs_block_shift ?
+		(1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0;
+	struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
+	pgoff_t index = page->index, end_index;
+
+	end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	/*
+	 * If this page is wholly outside i_size we just return zero;
+	 * do_generic_file_read() will handle this for us
+	 */
+	if (index >= end_index) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (PAGE_CACHE_SHIFT <= zisofs_block_shift) {
+		/* We have already been given one page, this is the one
+		   we must do. */
+		full_page = index & (zisofs_pages_per_cblock - 1);
+		pcount = min_t(int, zisofs_pages_per_cblock,
+			end_index - (index & ~(zisofs_pages_per_cblock - 1)));
+		index -= full_page;
+	} else {
+		full_page = 0;
+		pcount = 1;
+	}
+	pages[full_page] = page;
+
+	for (i = 0; i < pcount; i++, index++) {
+		if (i != full_page)
+			pages[i] = grab_cache_page_nowait(mapping, index);
+		if (pages[i]) {
+			ClearPageError(pages[i]);
+			kmap(pages[i]);
+		}
+	}
+
+	err = zisofs_fill_pages(inode, full_page, pcount, pages);
+
+	/* Release any residual pages, do not SetPageUptodate */
+	for (i = 0; i < pcount; i++) {
+		if (pages[i]) {
+			flush_dcache_page(pages[i]);
+			if (i == full_page && err)
+				SetPageError(pages[i]);
+			kunmap(pages[i]);
+			unlock_page(pages[i]);
+			if (i != full_page)
+				page_cache_release(pages[i]);
+		}
+	}			
+
+	/* At this point, err contains 0 or -EIO depending on the "critical" page */
+	return err;
+}
+
+const struct address_space_operations zisofs_aops = {
+	.readpage = zisofs_readpage,
+	/* No sync_page operation supported? */
+	/* No bmap operation supported */
+};
+
+int __init zisofs_init(void)
+{
+	zisofs_zlib_workspace = vmalloc(zlib_inflate_workspacesize());
+	if ( !zisofs_zlib_workspace )
+		return -ENOMEM;
+
+	return 0;
+}
+
+void zisofs_cleanup(void)
+{
+	vfree(zisofs_zlib_workspace);
+}
diff --git a/kernel/fs/isofs/dir.c b/kernel/fs/isofs/dir.c
new file mode 100644
index 000000000..b943cbd96
--- /dev/null
+++ b/kernel/fs/isofs/dir.c
@@ -0,0 +1,283 @@
+/*
+ *  linux/fs/isofs/dir.c
+ *
+ *  (C) 1992, 1993, 1994  Eric Youngdale Modified for ISO 9660 filesystem.
+ *
+ *  (C) 1991  Linus Torvalds - minix filesystem
+ *
+ *  Steve Beynon		       : Missing last directory entries fixed
+ *  (stephen@askone.demon.co.uk)      : 21st June 1996
+ *
+ *  isofs directory handling functions
+ */
+#include <linux/gfp.h>
+#include "isofs.h"
+
+int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
+{
+	char * old = de->name;
+	int len = de->name_len[0];
+	int i;
+
+	for (i = 0; i < len; i++) {
+		unsigned char c = old[i];
+		if (!c)
+			break;
+
+		if (c >= 'A' && c <= 'Z')
+			c |= 0x20;	/* lower case */
+
+		/* Drop trailing '.;1' (ISO 9660:1988 7.5.1 requires period) */
+		if (c == '.' && i == len - 3 && old[i + 1] == ';' && old[i + 2] == '1')
+			break;
+
+		/* Drop trailing ';1' */
+		if (c == ';' && i == len - 2 && old[i + 1] == '1')
+			break;
+
+		/* Convert remaining ';' to '.' */
+		/* Also '/' to '.' (broken Acorn-generated ISO9660 images) */
+		if (c == ';' || c == '/')
+			c = '.';
+
+		new[i] = c;
+	}
+	return i;
+}
+
+/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */
+int get_acorn_filename(struct iso_directory_record *de,
+			    char *retname, struct inode *inode)
+{
+	int std;
+	unsigned char *chr;
+	int retnamlen = isofs_name_translate(de, retname, inode);
+
+	if (retnamlen == 0)
+		return 0;
+	std = sizeof(struct iso_directory_record) + de->name_len[0];
+	if (std & 1)
+		std++;
+	if ((*((unsigned char *) de) - std) != 32)
+		return retnamlen;
+	chr = ((unsigned char *) de) + std;
+	if (strncmp(chr, "ARCHIMEDES", 10))
+		return retnamlen;
+	if ((*retname == '_') && ((chr[19] & 1) == 1))
+		*retname = '!';
+	if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff)
+		&& ((chr[12] & 0xf0) == 0xf0)) {
+		retname[retnamlen] = ',';
+		sprintf(retname+retnamlen+1, "%3.3x",
+			((chr[12] & 0xf) << 8) | chr[11]);
+		retnamlen += 4;
+	}
+	return retnamlen;
+}
+
+/*
+ * This should _really_ be cleaned up some day..
+ */
+static int do_isofs_readdir(struct inode *inode, struct file *file,
+		struct dir_context *ctx,
+		char *tmpname, struct iso_directory_record *tmpde)
+{
+	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
+	unsigned char bufbits = ISOFS_BUFFER_BITS(inode);
+	unsigned long block, offset, block_saved, offset_saved;
+	unsigned long inode_number = 0;	/* Quiet GCC */
+	struct buffer_head *bh = NULL;
+	int len;
+	int map;
+	int first_de = 1;
+	char *p = NULL;		/* Quiet GCC */
+	struct iso_directory_record *de;
+	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
+
+	offset = ctx->pos & (bufsize - 1);
+	block = ctx->pos >> bufbits;
+
+	while (ctx->pos < inode->i_size) {
+		int de_len;
+
+		if (!bh) {
+			bh = isofs_bread(inode, block);
+			if (!bh)
+				return 0;
+		}
+
+		de = (struct iso_directory_record *) (bh->b_data + offset);
+
+		de_len = *(unsigned char *)de;
+
+		/*
+		 * If the length byte is zero, we should move on to the next
+		 * CDROM sector.  If we are at the end of the directory, we
+		 * kick out of the while loop.
+		 */
+
+		if (de_len == 0) {
+			brelse(bh);
+			bh = NULL;
+			ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
+			block = ctx->pos >> bufbits;
+			offset = 0;
+			continue;
+		}
+
+		block_saved = block;
+		offset_saved = offset;
+		offset += de_len;
+
+		/* Make sure we have a full directory entry */
+		if (offset >= bufsize) {
+			int slop = bufsize - offset + de_len;
+			memcpy(tmpde, de, slop);
+			offset &= bufsize - 1;
+			block++;
+			brelse(bh);
+			bh = NULL;
+			if (offset) {
+				bh = isofs_bread(inode, block);
+				if (!bh)
+					return 0;
+				memcpy((void *) tmpde + slop, bh->b_data, offset);
+			}
+			de = tmpde;
+		}
+		/* Basic sanity check, whether name doesn't exceed dir entry */
+		if (de_len < de->name_len[0] +
+					sizeof(struct iso_directory_record)) {
+			printk(KERN_NOTICE "iso9660: Corrupted directory entry"
+			       " in block %lu of inode %lu\n", block,
+			       inode->i_ino);
+			return -EIO;
+		}
+
+		if (first_de) {
+			isofs_normalize_block_and_offset(de,
+							&block_saved,
+							&offset_saved);
+			inode_number = isofs_get_ino(block_saved,
+							offset_saved, bufbits);
+		}
+
+		if (de->flags[-sbi->s_high_sierra] & 0x80) {
+			first_de = 0;
+			ctx->pos += de_len;
+			continue;
+		}
+		first_de = 1;
+
+		/* Handle the case of the '.' directory */
+		if (de->name_len[0] == 1 && de->name[0] == 0) {
+			if (!dir_emit_dot(file, ctx))
+				break;
+			ctx->pos += de_len;
+			continue;
+		}
+
+		len = 0;
+
+		/* Handle the case of the '..' directory */
+		if (de->name_len[0] == 1 && de->name[0] == 1) {
+			if (!dir_emit_dotdot(file, ctx))
+				break;
+			ctx->pos += de_len;
+			continue;
+		}
+
+		/* Handle everything else.  Do name translation if there
+		   is no Rock Ridge NM field. */
+
+		/*
+		 * Do not report hidden files if so instructed, or associated
+		 * files unless instructed to do so
+		 */
+		if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
+		    (!sbi->s_showassoc &&
+				(de->flags[-sbi->s_high_sierra] & 4))) {
+			ctx->pos += de_len;
+			continue;
+		}
+
+		map = 1;
+		if (sbi->s_rock) {
+			len = get_rock_ridge_filename(de, tmpname, inode);
+			if (len != 0) {		/* may be -1 */
+				p = tmpname;
+				map = 0;
+			}
+		}
+		if (map) {
+#ifdef CONFIG_JOLIET
+			if (sbi->s_joliet_level) {
+				len = get_joliet_filename(de, tmpname, inode);
+				p = tmpname;
+			} else
+#endif
+			if (sbi->s_mapping == 'a') {
+				len = get_acorn_filename(de, tmpname, inode);
+				p = tmpname;
+			} else
+			if (sbi->s_mapping == 'n') {
+				len = isofs_name_translate(de, tmpname, inode);
+				p = tmpname;
+			} else {
+				p = de->name;
+				len = de->name_len[0];
+			}
+		}
+		if (len > 0) {
+			if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN))
+				break;
+		}
+		ctx->pos += de_len;
+
+		continue;
+	}
+	if (bh)
+		brelse(bh);
+	return 0;
+}
+
+/*
+ * Handle allocation of temporary space for name translation and
+ * handling split directory entries.. The real work is done by
+ * "do_isofs_readdir()".
+ */
+static int isofs_readdir(struct file *file, struct dir_context *ctx)
+{
+	int result;
+	char *tmpname;
+	struct iso_directory_record *tmpde;
+	struct inode *inode = file_inode(file);
+
+	tmpname = (char *)__get_free_page(GFP_KERNEL);
+	if (tmpname == NULL)
+		return -ENOMEM;
+
+	tmpde = (struct iso_directory_record *) (tmpname+1024);
+
+	result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
+
+	free_page((unsigned long) tmpname);
+	return result;
+}
+
+const struct file_operations isofs_dir_operations =
+{
+	.llseek = generic_file_llseek,
+	.read = generic_read_dir,
+	.iterate = isofs_readdir,
+};
+
+/*
+ * directories can handle most operations...
+ */
+const struct inode_operations isofs_dir_inode_operations =
+{
+	.lookup = isofs_lookup,
+};
+
+
diff --git a/kernel/fs/isofs/export.c b/kernel/fs/isofs/export.c
new file mode 100644
index 000000000..0c5f721b4
--- /dev/null
+++ b/kernel/fs/isofs/export.c
@@ -0,0 +1,192 @@
+/*
+ * fs/isofs/export.c
+ *
+ *  (C) 2004  Paul Serice - The new inode scheme requires switching
+ *                          from iget() to iget5_locked() which means
+ *                          the NFS export operations have to be hand
+ *                          coded because the default routines rely on
+ *                          iget().
+ *
+ * The following files are helpful:
+ *
+ *     Documentation/filesystems/nfs/Exporting
+ *     fs/exportfs/expfs.c.
+ */
+
+#include "isofs.h"
+
+static struct dentry *
+isofs_export_iget(struct super_block *sb,
+		  unsigned long block,
+		  unsigned long offset,
+		  __u32 generation)
+{
+	struct inode *inode;
+
+	if (block == 0)
+		return ERR_PTR(-ESTALE);
+	inode = isofs_iget(sb, block, offset);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return d_obtain_alias(inode);
+}
+
+/* This function is surprisingly simple.  The trick is understanding
+ * that "child" is always a directory. So, to find its parent, you
+ * simply need to find its ".." entry, normalize its block and offset,
+ * and return the underlying inode.  See the comments for
+ * isofs_normalize_block_and_offset(). */
+static struct dentry *isofs_export_get_parent(struct dentry *child)
+{
+	unsigned long parent_block = 0;
+	unsigned long parent_offset = 0;
+	struct inode *child_inode = d_inode(child);
+	struct iso_inode_info *e_child_inode = ISOFS_I(child_inode);
+	struct iso_directory_record *de = NULL;
+	struct buffer_head * bh = NULL;
+	struct dentry *rv = NULL;
+
+	/* "child" must always be a directory. */
+	if (!S_ISDIR(child_inode->i_mode)) {
+		printk(KERN_ERR "isofs: isofs_export_get_parent(): "
+		       "child is not a directory!\n");
+		rv = ERR_PTR(-EACCES);
+		goto out;
+	}
+
+	/* It is an invariant that the directory offset is zero.  If
+	 * it is not zero, it means the directory failed to be
+	 * normalized for some reason. */
+	if (e_child_inode->i_iget5_offset != 0) {
+		printk(KERN_ERR "isofs: isofs_export_get_parent(): "
+		       "child directory not normalized!\n");
+		rv = ERR_PTR(-EACCES);
+		goto out;
+	}
+
+	/* The child inode has been normalized such that its
+	 * i_iget5_block value points to the "." entry.  Fortunately,
+	 * the ".." entry is located in the same block. */
+	parent_block = e_child_inode->i_iget5_block;
+
+	/* Get the block in question. */
+	bh = sb_bread(child_inode->i_sb, parent_block);
+	if (bh == NULL) {
+		rv = ERR_PTR(-EACCES);
+		goto out;
+	}
+
+	/* This is the "." entry. */
+	de = (struct iso_directory_record*)bh->b_data;
+
+	/* The ".." entry is always the second entry. */
+	parent_offset = (unsigned long)isonum_711(de->length);
+	de = (struct iso_directory_record*)(bh->b_data + parent_offset);
+
+	/* Verify it is in fact the ".." entry. */
+	if ((isonum_711(de->name_len) != 1) || (de->name[0] != 1)) {
+		printk(KERN_ERR "isofs: Unable to find the \"..\" "
+		       "directory for NFS.\n");
+		rv = ERR_PTR(-EACCES);
+		goto out;
+	}
+
+	/* Normalize */
+	isofs_normalize_block_and_offset(de, &parent_block, &parent_offset);
+
+	rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block,
+				     parent_offset));
+ out:
+	if (bh)
+		brelse(bh);
+	return rv;
+}
+
+static int
+isofs_export_encode_fh(struct inode *inode,
+		       __u32 *fh32,
+		       int *max_len,
+		       struct inode *parent)
+{
+	struct iso_inode_info * ei = ISOFS_I(inode);
+	int len = *max_len;
+	int type = 1;
+	__u16 *fh16 = (__u16*)fh32;
+
+	/*
+	 * WARNING: max_len is 5 for NFSv2.  Because of this
+	 * limitation, we use the lower 16 bits of fh32[1] to hold the
+	 * offset of the inode and the upper 16 bits of fh32[1] to
+	 * hold the offset of the parent.
+	 */
+	if (parent && (len < 5)) {
+		*max_len = 5;
+		return FILEID_INVALID;
+	} else if (len < 3) {
+		*max_len = 3;
+		return FILEID_INVALID;
+	}
+
+	len = 3;
+	fh32[0] = ei->i_iget5_block;
+ 	fh16[2] = (__u16)ei->i_iget5_offset;  /* fh16 [sic] */
+	fh16[3] = 0;  /* avoid leaking uninitialized data */
+	fh32[2] = inode->i_generation;
+	if (parent) {
+		struct iso_inode_info *eparent;
+		eparent = ISOFS_I(parent);
+		fh32[3] = eparent->i_iget5_block;
+		fh16[3] = (__u16)eparent->i_iget5_offset;  /* fh16 [sic] */
+		fh32[4] = parent->i_generation;
+		len = 5;
+		type = 2;
+	}
+	*max_len = len;
+	return type;
+}
+
+struct isofs_fid {
+	u32 block;
+	u16 offset;
+	u16 parent_offset;
+	u32 generation;
+	u32 parent_block;
+	u32 parent_generation;
+};
+
+static struct dentry *isofs_fh_to_dentry(struct super_block *sb,
+	struct fid *fid, int fh_len, int fh_type)
+{
+	struct isofs_fid *ifid = (struct isofs_fid *)fid;
+
+	if (fh_len < 3 || fh_type > 2)
+		return NULL;
+
+	return isofs_export_iget(sb, ifid->block, ifid->offset,
+			ifid->generation);
+}
+
+static struct dentry *isofs_fh_to_parent(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct isofs_fid *ifid = (struct isofs_fid *)fid;
+
+	if (fh_len < 2 || fh_type != 2)
+		return NULL;
+
+	return isofs_export_iget(sb,
+			fh_len > 2 ? ifid->parent_block : 0,
+			ifid->parent_offset,
+			fh_len > 4 ? ifid->parent_generation : 0);
+}
+
+const struct export_operations isofs_export_ops = {
+	.encode_fh	= isofs_export_encode_fh,
+	.fh_to_dentry	= isofs_fh_to_dentry,
+	.fh_to_parent	= isofs_fh_to_parent,
+	.get_parent     = isofs_export_get_parent,
+};
diff --git a/kernel/fs/isofs/inode.c b/kernel/fs/isofs/inode.c
new file mode 100644
index 000000000..d67a16f2a
--- /dev/null
+++ b/kernel/fs/isofs/inode.c
@@ -0,0 +1,1557 @@
+/*
+ *  linux/fs/isofs/inode.c
+ *
+ *  (C) 1991  Linus Torvalds - minix filesystem
+ *      1992, 1993, 1994  Eric Youngdale Modified for ISO 9660 filesystem.
+ *      1994  Eberhard Mönkeberg - multi session handling.
+ *      1995  Mark Dobie - allow mounting of some weird VideoCDs and PhotoCDs.
+ *	1997  Gordon Chaffee - Joliet CDs
+ *	1998  Eric Lammerts - ISO 9660 Level 3
+ *	2004  Paul Serice - Inode Support pushed out from 4GB to 128GB
+ *	2004  Paul Serice - NFS Export Operations
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/nls.h>
+#include <linux/ctype.h>
+#include <linux/statfs.h>
+#include <linux/cdrom.h>
+#include <linux/parser.h>
+#include <linux/mpage.h>
+#include <linux/user_namespace.h>
+
+#include "isofs.h"
+#include "zisofs.h"
+
+#define BEQUIET
+
+static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
+static int isofs_dentry_cmpi(const struct dentry *parent,
+		const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name);
+
+#ifdef CONFIG_JOLIET
+static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
+static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
+static int isofs_dentry_cmpi_ms(const struct dentry *parent,
+		const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name);
+static int isofs_dentry_cmp_ms(const struct dentry *parent,
+		const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name);
+#endif
+
+static void isofs_put_super(struct super_block *sb)
+{
+	struct isofs_sb_info *sbi = ISOFS_SB(sb);
+
+#ifdef CONFIG_JOLIET
+	unload_nls(sbi->s_nls_iocharset);
+#endif
+
+	kfree(sbi);
+	sb->s_fs_info = NULL;
+	return;
+}
+
+static int isofs_read_inode(struct inode *, int relocated);
+static int isofs_statfs (struct dentry *, struct kstatfs *);
+
+static struct kmem_cache *isofs_inode_cachep;
+
+static struct inode *isofs_alloc_inode(struct super_block *sb)
+{
+	struct iso_inode_info *ei;
+	ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void isofs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
+}
+
+static void isofs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, isofs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct iso_inode_info *ei = foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
+					sizeof(struct iso_inode_info),
+					0, (SLAB_RECLAIM_ACCOUNT|
+					SLAB_MEM_SPREAD),
+					init_once);
+	if (isofs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(isofs_inode_cachep);
+}
+
+static int isofs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	if (!(*flags & MS_RDONLY))
+		return -EROFS;
+	return 0;
+}
+
+static const struct super_operations isofs_sops = {
+	.alloc_inode	= isofs_alloc_inode,
+	.destroy_inode	= isofs_destroy_inode,
+	.put_super	= isofs_put_super,
+	.statfs		= isofs_statfs,
+	.remount_fs	= isofs_remount,
+	.show_options	= generic_show_options,
+};
+
+
+static const struct dentry_operations isofs_dentry_ops[] = {
+	{
+		.d_hash		= isofs_hashi,
+		.d_compare	= isofs_dentry_cmpi,
+	},
+#ifdef CONFIG_JOLIET
+	{
+		.d_hash		= isofs_hash_ms,
+		.d_compare	= isofs_dentry_cmp_ms,
+	},
+	{
+		.d_hash		= isofs_hashi_ms,
+		.d_compare	= isofs_dentry_cmpi_ms,
+	},
+#endif
+};
+
+struct iso9660_options{
+	unsigned int rock:1;
+	unsigned int joliet:1;
+	unsigned int cruft:1;
+	unsigned int hide:1;
+	unsigned int showassoc:1;
+	unsigned int nocompress:1;
+	unsigned int overriderockperm:1;
+	unsigned int uid_set:1;
+	unsigned int gid_set:1;
+	unsigned int utf8:1;
+	unsigned char map;
+	unsigned char check;
+	unsigned int blocksize;
+	umode_t fmode;
+	umode_t dmode;
+	kgid_t gid;
+	kuid_t uid;
+	char *iocharset;
+	/* LVE */
+	s32 session;
+	s32 sbsector;
+};
+
+/*
+ * Compute the hash for the isofs name corresponding to the dentry.
+ */
+static int
+isofs_hashi_common(struct qstr *qstr, int ms)
+{
+	const char *name;
+	int len;
+	char c;
+	unsigned long hash;
+
+	len = qstr->len;
+	name = qstr->name;
+	if (ms) {
+		while (len && name[len-1] == '.')
+			len--;
+	}
+
+	hash = init_name_hash();
+	while (len--) {
+		c = tolower(*name++);
+		hash = partial_name_hash(c, hash);
+	}
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+/*
+ * Compare of two isofs names.
+ */
+static int isofs_dentry_cmp_common(
+		unsigned int len, const char *str,
+		const struct qstr *name, int ms, int ci)
+{
+	int alen, blen;
+
+	/* A filename cannot end in '.' or we treat it like it has none */
+	alen = name->len;
+	blen = len;
+	if (ms) {
+		while (alen && name->name[alen-1] == '.')
+			alen--;
+		while (blen && str[blen-1] == '.')
+			blen--;
+	}
+	if (alen == blen) {
+		if (ci) {
+			if (strncasecmp(name->name, str, alen) == 0)
+				return 0;
+		} else {
+			if (strncmp(name->name, str, alen) == 0)
+				return 0;
+		}
+	}
+	return 1;
+}
+
+static int
+isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
+{
+	return isofs_hashi_common(qstr, 0);
+}
+
+static int
+isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	return isofs_dentry_cmp_common(len, str, name, 0, 1);
+}
+
+#ifdef CONFIG_JOLIET
+/*
+ * Compute the hash for the isofs name corresponding to the dentry.
+ */
+static int
+isofs_hash_common(struct qstr *qstr, int ms)
+{
+	const char *name;
+	int len;
+
+	len = qstr->len;
+	name = qstr->name;
+	if (ms) {
+		while (len && name[len-1] == '.')
+			len--;
+	}
+
+	qstr->hash = full_name_hash(name, len);
+
+	return 0;
+}
+
+static int
+isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
+{
+	return isofs_hash_common(qstr, 1);
+}
+
+static int
+isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
+{
+	return isofs_hashi_common(qstr, 1);
+}
+
+static int
+isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	return isofs_dentry_cmp_common(len, str, name, 1, 0);
+}
+
+static int
+isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	return isofs_dentry_cmp_common(len, str, name, 1, 1);
+}
+#endif
+
+enum {
+	Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
+	Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
+	Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
+	Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
+};
+
+static const match_table_t tokens = {
+	{Opt_norock, "norock"},
+	{Opt_nojoliet, "nojoliet"},
+	{Opt_unhide, "unhide"},
+	{Opt_hide, "hide"},
+	{Opt_showassoc, "showassoc"},
+	{Opt_cruft, "cruft"},
+	{Opt_utf8, "utf8"},
+	{Opt_iocharset, "iocharset=%s"},
+	{Opt_map_a, "map=acorn"},
+	{Opt_map_a, "map=a"},
+	{Opt_map_n, "map=normal"},
+	{Opt_map_n, "map=n"},
+	{Opt_map_o, "map=off"},
+	{Opt_map_o, "map=o"},
+	{Opt_session, "session=%u"},
+	{Opt_sb, "sbsector=%u"},
+	{Opt_check_r, "check=relaxed"},
+	{Opt_check_r, "check=r"},
+	{Opt_check_s, "check=strict"},
+	{Opt_check_s, "check=s"},
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_mode, "mode=%u"},
+	{Opt_dmode, "dmode=%u"},
+	{Opt_overriderockperm, "overriderockperm"},
+	{Opt_block, "block=%u"},
+	{Opt_ignore, "conv=binary"},
+	{Opt_ignore, "conv=b"},
+	{Opt_ignore, "conv=text"},
+	{Opt_ignore, "conv=t"},
+	{Opt_ignore, "conv=mtext"},
+	{Opt_ignore, "conv=m"},
+	{Opt_ignore, "conv=auto"},
+	{Opt_ignore, "conv=a"},
+	{Opt_nocompress, "nocompress"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, struct iso9660_options *popt)
+{
+	char *p;
+	int option;
+
+	popt->map = 'n';
+	popt->rock = 1;
+	popt->joliet = 1;
+	popt->cruft = 0;
+	popt->hide = 0;
+	popt->showassoc = 0;
+	popt->check = 'u';		/* unset */
+	popt->nocompress = 0;
+	popt->blocksize = 1024;
+	popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
+	popt->uid_set = 0;
+	popt->gid_set = 0;
+	popt->gid = GLOBAL_ROOT_GID;
+	popt->uid = GLOBAL_ROOT_UID;
+	popt->iocharset = NULL;
+	popt->utf8 = 0;
+	popt->overriderockperm = 0;
+	popt->session=-1;
+	popt->sbsector=-1;
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		substring_t args[MAX_OPT_ARGS];
+		unsigned n;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_norock:
+			popt->rock = 0;
+			break;
+		case Opt_nojoliet:
+			popt->joliet = 0;
+			break;
+		case Opt_hide:
+			popt->hide = 1;
+			break;
+		case Opt_unhide:
+		case Opt_showassoc:
+			popt->showassoc = 1;
+			break;
+		case Opt_cruft:
+			popt->cruft = 1;
+			break;
+		case Opt_utf8:
+			popt->utf8 = 1;
+			break;
+#ifdef CONFIG_JOLIET
+		case Opt_iocharset:
+			popt->iocharset = match_strdup(&args[0]);
+			break;
+#endif
+		case Opt_map_a:
+			popt->map = 'a';
+			break;
+		case Opt_map_o:
+			popt->map = 'o';
+			break;
+		case Opt_map_n:
+			popt->map = 'n';
+			break;
+		case Opt_session:
+			if (match_int(&args[0], &option))
+				return 0;
+			n = option;
+			if (n > 99)
+				return 0;
+			popt->session = n + 1;
+			break;
+		case Opt_sb:
+			if (match_int(&args[0], &option))
+				return 0;
+			popt->sbsector = option;
+			break;
+		case Opt_check_r:
+			popt->check = 'r';
+			break;
+		case Opt_check_s:
+			popt->check = 's';
+			break;
+		case Opt_ignore:
+			break;
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return 0;
+			popt->uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(popt->uid))
+				return 0;
+			popt->uid_set = 1;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+			popt->gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(popt->gid))
+				return 0;
+			popt->gid_set = 1;
+			break;
+		case Opt_mode:
+			if (match_int(&args[0], &option))
+				return 0;
+			popt->fmode = option;
+			break;
+		case Opt_dmode:
+			if (match_int(&args[0], &option))
+				return 0;
+			popt->dmode = option;
+			break;
+		case Opt_overriderockperm:
+			popt->overriderockperm = 1;
+			break;
+		case Opt_block:
+			if (match_int(&args[0], &option))
+				return 0;
+			n = option;
+			if (n != 512 && n != 1024 && n != 2048)
+				return 0;
+			popt->blocksize = n;
+			break;
+		case Opt_nocompress:
+			popt->nocompress = 1;
+			break;
+		default:
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * look if the driver can tell the multi session redirection value
+ *
+ * don't change this if you don't know what you do, please!
+ * Multisession is legal only with XA disks.
+ * A non-XA disk with more than one volume descriptor may do it right, but
+ * usually is written in a nowhere standardized "multi-partition" manner.
+ * Multisession uses absolute addressing (solely the first frame of the whole
+ * track is #0), multi-partition uses relative addressing (each first frame of
+ * each track is #0), and a track is not a session.
+ *
+ * A broken CDwriter software or drive firmware does not set new standards,
+ * at least not if conflicting with the existing ones.
+ *
+ * emoenke@gwdg.de
+ */
+#define WE_OBEY_THE_WRITTEN_STANDARDS 1
+
+static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
+{
+	struct cdrom_multisession ms_info;
+	unsigned int vol_desc_start;
+	struct block_device *bdev = sb->s_bdev;
+	int i;
+
+	vol_desc_start=0;
+	ms_info.addr_format=CDROM_LBA;
+	if(session >= 0 && session <= 99) {
+		struct cdrom_tocentry Te;
+		Te.cdte_track=session;
+		Te.cdte_format=CDROM_LBA;
+		i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te);
+		if (!i) {
+			printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n",
+				session, Te.cdte_addr.lba,
+				Te.cdte_ctrl&CDROM_DATA_TRACK);
+			if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4)
+				return Te.cdte_addr.lba;
+		}
+
+		printk(KERN_ERR "ISOFS: Invalid session number or type of track\n");
+	}
+	i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info);
+	if (session > 0)
+		printk(KERN_ERR "ISOFS: Invalid session number\n");
+#if 0
+	printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i);
+	if (i==0) {
+		printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no");
+		printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba);
+	}
+#endif
+	if (i==0)
+#if WE_OBEY_THE_WRITTEN_STANDARDS
+		if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
+#endif
+			vol_desc_start=ms_info.addr.lba;
+	return vol_desc_start;
+}
+
+/*
+ * Check if root directory is empty (has less than 3 files).
+ *
+ * Used to detect broken CDs where ISO root directory is empty but Joliet root
+ * directory is OK. If such CD has Rock Ridge extensions, they will be disabled
+ * (and Joliet used instead) or else no files would be visible.
+ */
+static bool rootdir_empty(struct super_block *sb, unsigned long block)
+{
+	int offset = 0, files = 0, de_len;
+	struct iso_directory_record *de;
+	struct buffer_head *bh;
+
+	bh = sb_bread(sb, block);
+	if (!bh)
+		return true;
+	while (files < 3) {
+		de = (struct iso_directory_record *) (bh->b_data + offset);
+		de_len = *(unsigned char *) de;
+		if (de_len == 0)
+			break;
+		files++;
+		offset += de_len;
+	}
+	brelse(bh);
+	return files < 3;
+}
+
+/*
+ * Initialize the superblock and read the root inode.
+ *
+ * Note: a check_disk_change() has been done immediately prior
+ * to this call, so we don't need to check again.
+ */
+static int isofs_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct buffer_head *bh = NULL, *pri_bh = NULL;
+	struct hs_primary_descriptor *h_pri = NULL;
+	struct iso_primary_descriptor *pri = NULL;
+	struct iso_supplementary_descriptor *sec = NULL;
+	struct iso_directory_record *rootp;
+	struct inode *inode;
+	struct iso9660_options opt;
+	struct isofs_sb_info *sbi;
+	unsigned long first_data_zone;
+	int joliet_level = 0;
+	int iso_blknum, block;
+	int orig_zonesize;
+	int table, error = -EINVAL;
+	unsigned int vol_desc_start;
+
+	save_mount_options(s, data);
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	s->s_fs_info = sbi;
+
+	if (!parse_options((char *)data, &opt))
+		goto out_freesbi;
+
+	/*
+	 * First of all, get the hardware blocksize for this device.
+	 * If we don't know what it is, or the hardware blocksize is
+	 * larger than the blocksize the user specified, then use
+	 * that value.
+	 */
+	/*
+	 * What if bugger tells us to go beyond page size?
+	 */
+	opt.blocksize = sb_min_blocksize(s, opt.blocksize);
+
+	sbi->s_high_sierra = 0; /* default is iso9660 */
+
+	vol_desc_start = (opt.sbsector != -1) ?
+		opt.sbsector : isofs_get_last_session(s,opt.session);
+
+	for (iso_blknum = vol_desc_start+16;
+		iso_blknum < vol_desc_start+100; iso_blknum++) {
+		struct hs_volume_descriptor *hdp;
+		struct iso_volume_descriptor  *vdp;
+
+		block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits);
+		if (!(bh = sb_bread(s, block)))
+			goto out_no_read;
+
+		vdp = (struct iso_volume_descriptor *)bh->b_data;
+		hdp = (struct hs_volume_descriptor *)bh->b_data;
+
+		/*
+		 * Due to the overlapping physical location of the descriptors,
+		 * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure
+		 * proper identification in this case, we first check for ISO.
+		 */
+		if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) {
+			if (isonum_711(vdp->type) == ISO_VD_END)
+				break;
+			if (isonum_711(vdp->type) == ISO_VD_PRIMARY) {
+				if (pri == NULL) {
+					pri = (struct iso_primary_descriptor *)vdp;
+					/* Save the buffer in case we need it ... */
+					pri_bh = bh;
+					bh = NULL;
+				}
+			}
+#ifdef CONFIG_JOLIET
+			else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
+				sec = (struct iso_supplementary_descriptor *)vdp;
+				if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
+					if (opt.joliet) {
+						if (sec->escape[2] == 0x40)
+							joliet_level = 1;
+						else if (sec->escape[2] == 0x43)
+							joliet_level = 2;
+						else if (sec->escape[2] == 0x45)
+							joliet_level = 3;
+
+						printk(KERN_DEBUG "ISO 9660 Extensions: "
+							"Microsoft Joliet Level %d\n",
+							joliet_level);
+					}
+					goto root_found;
+				} else {
+				/* Unknown supplementary volume descriptor */
+				sec = NULL;
+				}
+			}
+#endif
+		} else {
+			if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) {
+				if (isonum_711(hdp->type) != ISO_VD_PRIMARY)
+					goto out_freebh;
+
+				sbi->s_high_sierra = 1;
+				opt.rock = 0;
+				h_pri = (struct hs_primary_descriptor *)vdp;
+				goto root_found;
+			}
+		}
+
+		/* Just skip any volume descriptors we don't recognize */
+
+		brelse(bh);
+		bh = NULL;
+	}
+	/*
+	 * If we fall through, either no volume descriptor was found,
+	 * or else we passed a primary descriptor looking for others.
+	 */
+	if (!pri)
+		goto out_unknown_format;
+	brelse(bh);
+	bh = pri_bh;
+	pri_bh = NULL;
+
+root_found:
+
+	if (joliet_level && (pri == NULL || !opt.rock)) {
+		/* This is the case of Joliet with the norock mount flag.
+		 * A disc with both Joliet and Rock Ridge is handled later
+		 */
+		pri = (struct iso_primary_descriptor *) sec;
+	}
+
+	if(sbi->s_high_sierra){
+		rootp = (struct iso_directory_record *) h_pri->root_directory_record;
+		sbi->s_nzones = isonum_733(h_pri->volume_space_size);
+		sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size);
+		sbi->s_max_size = isonum_733(h_pri->volume_space_size);
+	} else {
+		if (!pri)
+			goto out_freebh;
+		rootp = (struct iso_directory_record *) pri->root_directory_record;
+		sbi->s_nzones = isonum_733(pri->volume_space_size);
+		sbi->s_log_zone_size = isonum_723(pri->logical_block_size);
+		sbi->s_max_size = isonum_733(pri->volume_space_size);
+	}
+
+	sbi->s_ninodes = 0; /* No way to figure this out easily */
+
+	orig_zonesize = sbi->s_log_zone_size;
+	/*
+	 * If the zone size is smaller than the hardware sector size,
+	 * this is a fatal error.  This would occur if the disc drive
+	 * had sectors that were 2048 bytes, but the filesystem had
+	 * blocks that were 512 bytes (which should only very rarely
+	 * happen.)
+	 */
+	if (orig_zonesize < opt.blocksize)
+		goto out_bad_size;
+
+	/* RDE: convert log zone size to bit shift */
+	switch (sbi->s_log_zone_size) {
+	case  512: sbi->s_log_zone_size =  9; break;
+	case 1024: sbi->s_log_zone_size = 10; break;
+	case 2048: sbi->s_log_zone_size = 11; break;
+
+	default:
+		goto out_bad_zone_size;
+	}
+
+	s->s_magic = ISOFS_SUPER_MAGIC;
+
+	/*
+	 * With multi-extent files, file size is only limited by the maximum
+	 * size of a file system, which is 8 TB.
+	 */
+	s->s_maxbytes = 0x80000000000LL;
+
+	/* Set this for reference. Its not currently used except on write
+	   which we don't have .. */
+
+	first_data_zone = isonum_733(rootp->extent) +
+			  isonum_711(rootp->ext_attr_length);
+	sbi->s_firstdatazone = first_data_zone;
+#ifndef BEQUIET
+	printk(KERN_DEBUG "ISOFS: Max size:%ld   Log zone size:%ld\n",
+		sbi->s_max_size, 1UL << sbi->s_log_zone_size);
+	printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone);
+	if(sbi->s_high_sierra)
+		printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n");
+#endif
+
+	/*
+	 * If the Joliet level is set, we _may_ decide to use the
+	 * secondary descriptor, but can't be sure until after we
+	 * read the root inode. But before reading the root inode
+	 * we may need to change the device blocksize, and would
+	 * rather release the old buffer first. So, we cache the
+	 * first_data_zone value from the secondary descriptor.
+	 */
+	if (joliet_level) {
+		pri = (struct iso_primary_descriptor *) sec;
+		rootp = (struct iso_directory_record *)
+			pri->root_directory_record;
+		first_data_zone = isonum_733(rootp->extent) +
+				isonum_711(rootp->ext_attr_length);
+	}
+
+	/*
+	 * We're all done using the volume descriptor, and may need
+	 * to change the device blocksize, so release the buffer now.
+	 */
+	brelse(pri_bh);
+	brelse(bh);
+
+	/*
+	 * Force the blocksize to 512 for 512 byte sectors.  The file
+	 * read primitives really get it wrong in a bad way if we don't
+	 * do this.
+	 *
+	 * Note - we should never be setting the blocksize to something
+	 * less than the hardware sector size for the device.  If we
+	 * do, we would end up having to read larger buffers and split
+	 * out portions to satisfy requests.
+	 *
+	 * Note2- the idea here is that we want to deal with the optimal
+	 * zonesize in the filesystem.  If we have it set to something less,
+	 * then we have horrible problems with trying to piece together
+	 * bits of adjacent blocks in order to properly read directory
+	 * entries.  By forcing the blocksize in this way, we ensure
+	 * that we will never be required to do this.
+	 */
+	sb_set_blocksize(s, orig_zonesize);
+
+	sbi->s_nls_iocharset = NULL;
+
+#ifdef CONFIG_JOLIET
+	if (joliet_level && opt.utf8 == 0) {
+		char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
+		sbi->s_nls_iocharset = load_nls(p);
+		if (! sbi->s_nls_iocharset) {
+			/* Fail only if explicit charset specified */
+			if (opt.iocharset)
+				goto out_freesbi;
+			sbi->s_nls_iocharset = load_nls_default();
+		}
+	}
+#endif
+	s->s_op = &isofs_sops;
+	s->s_export_op = &isofs_export_ops;
+	sbi->s_mapping = opt.map;
+	sbi->s_rock = (opt.rock ? 2 : 0);
+	sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
+	sbi->s_cruft = opt.cruft;
+	sbi->s_hide = opt.hide;
+	sbi->s_showassoc = opt.showassoc;
+	sbi->s_uid = opt.uid;
+	sbi->s_gid = opt.gid;
+	sbi->s_uid_set = opt.uid_set;
+	sbi->s_gid_set = opt.gid_set;
+	sbi->s_utf8 = opt.utf8;
+	sbi->s_nocompress = opt.nocompress;
+	sbi->s_overriderockperm = opt.overriderockperm;
+	/*
+	 * It would be incredibly stupid to allow people to mark every file
+	 * on the disk as suid, so we merely allow them to set the default
+	 * permissions.
+	 */
+	if (opt.fmode != ISOFS_INVALID_MODE)
+		sbi->s_fmode = opt.fmode & 0777;
+	else
+		sbi->s_fmode = ISOFS_INVALID_MODE;
+	if (opt.dmode != ISOFS_INVALID_MODE)
+		sbi->s_dmode = opt.dmode & 0777;
+	else
+		sbi->s_dmode = ISOFS_INVALID_MODE;
+
+	/*
+	 * Read the root inode, which _may_ result in changing
+	 * the s_rock flag. Once we have the final s_rock value,
+	 * we then decide whether to use the Joliet descriptor.
+	 */
+	inode = isofs_iget(s, sbi->s_firstdatazone, 0);
+	if (IS_ERR(inode))
+		goto out_no_root;
+
+	/*
+	 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
+	 * correct Joliet root directory.
+	 */
+	if (sbi->s_rock == 1 && joliet_level &&
+				rootdir_empty(s, sbi->s_firstdatazone)) {
+		printk(KERN_NOTICE
+			"ISOFS: primary root directory is empty. "
+			"Disabling Rock Ridge and switching to Joliet.");
+		sbi->s_rock = 0;
+	}
+
+	/*
+	 * If this disk has both Rock Ridge and Joliet on it, then we
+	 * want to use Rock Ridge by default.  This can be overridden
+	 * by using the norock mount option.  There is still one other
+	 * possibility that is not taken into account: a Rock Ridge
+	 * CD with Unicode names.  Until someone sees such a beast, it
+	 * will not be supported.
+	 */
+	if (sbi->s_rock == 1) {
+		joliet_level = 0;
+	} else if (joliet_level) {
+		sbi->s_rock = 0;
+		if (sbi->s_firstdatazone != first_data_zone) {
+			sbi->s_firstdatazone = first_data_zone;
+			printk(KERN_DEBUG
+				"ISOFS: changing to secondary root\n");
+			iput(inode);
+			inode = isofs_iget(s, sbi->s_firstdatazone, 0);
+			if (IS_ERR(inode))
+				goto out_no_root;
+		}
+	}
+
+	if (opt.check == 'u') {
+		/* Only Joliet is case insensitive by default */
+		if (joliet_level)
+			opt.check = 'r';
+		else
+			opt.check = 's';
+	}
+	sbi->s_joliet_level = joliet_level;
+
+	/* Make sure the root inode is a directory */
+	if (!S_ISDIR(inode->i_mode)) {
+		printk(KERN_WARNING
+			"isofs_fill_super: root inode is not a directory. "
+			"Corrupted media?\n");
+		goto out_iput;
+	}
+
+	table = 0;
+	if (joliet_level)
+		table += 2;
+	if (opt.check == 'r')
+		table++;
+
+	if (table)
+		s->s_d_op = &isofs_dentry_ops[table - 1];
+
+	/* get the root dentry */
+	s->s_root = d_make_root(inode);
+	if (!(s->s_root)) {
+		error = -ENOMEM;
+		goto out_no_inode;
+	}
+
+	kfree(opt.iocharset);
+
+	return 0;
+
+	/*
+	 * Display error messages and free resources.
+	 */
+out_iput:
+	iput(inode);
+	goto out_no_inode;
+out_no_root:
+	error = PTR_ERR(inode);
+	if (error != -ENOMEM)
+		printk(KERN_WARNING "%s: get root inode failed\n", __func__);
+out_no_inode:
+#ifdef CONFIG_JOLIET
+	unload_nls(sbi->s_nls_iocharset);
+#endif
+	goto out_freesbi;
+out_no_read:
+	printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n",
+		__func__, s->s_id, iso_blknum, block);
+	goto out_freebh;
+out_bad_zone_size:
+	printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n",
+		sbi->s_log_zone_size);
+	goto out_freebh;
+out_bad_size:
+	printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n",
+		orig_zonesize, opt.blocksize);
+	goto out_freebh;
+out_unknown_format:
+	if (!silent)
+		printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n");
+
+out_freebh:
+	brelse(bh);
+	brelse(pri_bh);
+out_freesbi:
+	kfree(opt.iocharset);
+	kfree(sbi);
+	s->s_fs_info = NULL;
+	return error;
+}
+
+static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type = ISOFS_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = (ISOFS_SB(sb)->s_nzones
+		<< (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits));
+	buf->f_bfree = 0;
+	buf->f_bavail = 0;
+	buf->f_files = ISOFS_SB(sb)->s_ninodes;
+	buf->f_ffree = 0;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = NAME_MAX;
+	return 0;
+}
+
+/*
+ * Get a set of blocks; filling in buffer_heads if already allocated
+ * or getblk() if they are not.  Returns the number of blocks inserted
+ * (-ve == error.)
+ */
+int isofs_get_blocks(struct inode *inode, sector_t iblock,
+		     struct buffer_head **bh, unsigned long nblocks)
+{
+	unsigned long b_off = iblock;
+	unsigned offset, sect_size;
+	unsigned int firstext;
+	unsigned long nextblk, nextoff;
+	int section, rv, error;
+	struct iso_inode_info *ei = ISOFS_I(inode);
+
+	error = -EIO;
+	rv = 0;
+	if (iblock != b_off) {
+		printk(KERN_DEBUG "%s: block number too large\n", __func__);
+		goto abort;
+	}
+
+
+	offset = 0;
+	firstext = ei->i_first_extent;
+	sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode);
+	nextblk = ei->i_next_section_block;
+	nextoff = ei->i_next_section_offset;
+	section = 0;
+
+	while (nblocks) {
+		/* If we are *way* beyond the end of the file, print a message.
+		 * Access beyond the end of the file up to the next page boundary
+		 * is normal, however because of the way the page cache works.
+		 * In this case, we just return 0 so that we can properly fill
+		 * the page with useless information without generating any
+		 * I/O errors.
+		 */
+		if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
+			printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
+				__func__, b_off,
+				(unsigned long long)inode->i_size);
+			goto abort;
+		}
+
+		/* On the last section, nextblk == 0, section size is likely to
+		 * exceed sect_size by a partial block, and access beyond the
+		 * end of the file will reach beyond the section size, too.
+		 */
+		while (nextblk && (b_off >= (offset + sect_size))) {
+			struct inode *ninode;
+
+			offset += sect_size;
+			ninode = isofs_iget(inode->i_sb, nextblk, nextoff);
+			if (IS_ERR(ninode)) {
+				error = PTR_ERR(ninode);
+				goto abort;
+			}
+			firstext  = ISOFS_I(ninode)->i_first_extent;
+			sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode);
+			nextblk   = ISOFS_I(ninode)->i_next_section_block;
+			nextoff   = ISOFS_I(ninode)->i_next_section_offset;
+			iput(ninode);
+
+			if (++section > 100) {
+				printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
+					" aborting...\n", __func__);
+				printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
+					"nextblk=%lu nextoff=%lu\n", __func__,
+					b_off, firstext, (unsigned) sect_size,
+					nextblk, nextoff);
+				goto abort;
+			}
+		}
+
+		if (*bh) {
+			map_bh(*bh, inode->i_sb, firstext + b_off - offset);
+		} else {
+			*bh = sb_getblk(inode->i_sb, firstext+b_off-offset);
+			if (!*bh)
+				goto abort;
+		}
+		bh++;	/* Next buffer head */
+		b_off++;	/* Next buffer offset */
+		nblocks--;
+		rv++;
+	}
+
+	error = 0;
+abort:
+	return rv != 0 ? rv : error;
+}
+
+/*
+ * Used by the standard interfaces.
+ */
+static int isofs_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
+{
+	int ret;
+
+	if (create) {
+		printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__);
+		return -EROFS;
+	}
+
+	ret = isofs_get_blocks(inode, iblock, &bh_result, 1);
+	return ret < 0 ? ret : 0;
+}
+
+static int isofs_bmap(struct inode *inode, sector_t block)
+{
+	struct buffer_head dummy;
+	int error;
+
+	dummy.b_state = 0;
+	dummy.b_blocknr = -1000;
+	error = isofs_get_block(inode, block, &dummy, 0);
+	if (!error)
+		return dummy.b_blocknr;
+	return 0;
+}
+
+struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
+{
+	sector_t blknr = isofs_bmap(inode, block);
+	if (!blknr)
+		return NULL;
+	return sb_bread(inode->i_sb, blknr);
+}
+
+static int isofs_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, isofs_get_block);
+}
+
+static int isofs_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, isofs_get_block);
+}
+
+static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,isofs_get_block);
+}
+
+static const struct address_space_operations isofs_aops = {
+	.readpage = isofs_readpage,
+	.readpages = isofs_readpages,
+	.bmap = _isofs_bmap
+};
+
+static int isofs_read_level3_size(struct inode *inode)
+{
+	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
+	int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra;
+	struct buffer_head *bh = NULL;
+	unsigned long block, offset, block_saved, offset_saved;
+	int i = 0;
+	int more_entries = 0;
+	struct iso_directory_record *tmpde = NULL;
+	struct iso_inode_info *ei = ISOFS_I(inode);
+
+	inode->i_size = 0;
+
+	/* The first 16 blocks are reserved as the System Area.  Thus,
+	 * no inodes can appear in block 0.  We use this to flag that
+	 * this is the last section. */
+	ei->i_next_section_block = 0;
+	ei->i_next_section_offset = 0;
+
+	block = ei->i_iget5_block;
+	offset = ei->i_iget5_offset;
+
+	do {
+		struct iso_directory_record *de;
+		unsigned int de_len;
+
+		if (!bh) {
+			bh = sb_bread(inode->i_sb, block);
+			if (!bh)
+				goto out_noread;
+		}
+		de = (struct iso_directory_record *) (bh->b_data + offset);
+		de_len = *(unsigned char *) de;
+
+		if (de_len == 0) {
+			brelse(bh);
+			bh = NULL;
+			++block;
+			offset = 0;
+			continue;
+		}
+
+		block_saved = block;
+		offset_saved = offset;
+		offset += de_len;
+
+		/* Make sure we have a full directory entry */
+		if (offset >= bufsize) {
+			int slop = bufsize - offset + de_len;
+			if (!tmpde) {
+				tmpde = kmalloc(256, GFP_KERNEL);
+				if (!tmpde)
+					goto out_nomem;
+			}
+			memcpy(tmpde, de, slop);
+			offset &= bufsize - 1;
+			block++;
+			brelse(bh);
+			bh = NULL;
+			if (offset) {
+				bh = sb_bread(inode->i_sb, block);
+				if (!bh)
+					goto out_noread;
+				memcpy((void *)tmpde+slop, bh->b_data, offset);
+			}
+			de = tmpde;
+		}
+
+		inode->i_size += isonum_733(de->size);
+		if (i == 1) {
+			ei->i_next_section_block = block_saved;
+			ei->i_next_section_offset = offset_saved;
+		}
+
+		more_entries = de->flags[-high_sierra] & 0x80;
+
+		i++;
+		if (i > 100)
+			goto out_toomany;
+	} while (more_entries);
+out:
+	kfree(tmpde);
+	if (bh)
+		brelse(bh);
+	return 0;
+
+out_nomem:
+	if (bh)
+		brelse(bh);
+	return -ENOMEM;
+
+out_noread:
+	printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block);
+	kfree(tmpde);
+	return -EIO;
+
+out_toomany:
+	printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n"
+		"isofs_read_level3_size: inode=%lu\n",
+		__func__, inode->i_ino);
+	goto out;
+}
+
+static int isofs_read_inode(struct inode *inode, int relocated)
+{
+	struct super_block *sb = inode->i_sb;
+	struct isofs_sb_info *sbi = ISOFS_SB(sb);
+	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
+	unsigned long block;
+	int high_sierra = sbi->s_high_sierra;
+	struct buffer_head *bh = NULL;
+	struct iso_directory_record *de;
+	struct iso_directory_record *tmpde = NULL;
+	unsigned int de_len;
+	unsigned long offset;
+	struct iso_inode_info *ei = ISOFS_I(inode);
+	int ret = -EIO;
+
+	block = ei->i_iget5_block;
+	bh = sb_bread(inode->i_sb, block);
+	if (!bh)
+		goto out_badread;
+
+	offset = ei->i_iget5_offset;
+
+	de = (struct iso_directory_record *) (bh->b_data + offset);
+	de_len = *(unsigned char *) de;
+
+	if (offset + de_len > bufsize) {
+		int frag1 = bufsize - offset;
+
+		tmpde = kmalloc(de_len, GFP_KERNEL);
+		if (tmpde == NULL) {
+			printk(KERN_INFO "%s: out of memory\n", __func__);
+			ret = -ENOMEM;
+			goto fail;
+		}
+		memcpy(tmpde, bh->b_data + offset, frag1);
+		brelse(bh);
+		bh = sb_bread(inode->i_sb, ++block);
+		if (!bh)
+			goto out_badread;
+		memcpy((char *)tmpde+frag1, bh->b_data, de_len - frag1);
+		de = tmpde;
+	}
+
+	inode->i_ino = isofs_get_ino(ei->i_iget5_block,
+					ei->i_iget5_offset,
+					ISOFS_BUFFER_BITS(inode));
+
+	/* Assume it is a normal-format file unless told otherwise */
+	ei->i_file_format = isofs_file_normal;
+
+	if (de->flags[-high_sierra] & 2) {
+		if (sbi->s_dmode != ISOFS_INVALID_MODE)
+			inode->i_mode = S_IFDIR | sbi->s_dmode;
+		else
+			inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+		set_nlink(inode, 1);	/*
+					 * Set to 1.  We know there are 2, but
+					 * the find utility tries to optimize
+					 * if it is 2, and it screws up.  It is
+					 * easier to give 1 which tells find to
+					 * do it the hard way.
+					 */
+	} else {
+		if (sbi->s_fmode != ISOFS_INVALID_MODE) {
+			inode->i_mode = S_IFREG | sbi->s_fmode;
+		} else {
+			/*
+			 * Set default permissions: r-x for all.  The disc
+			 * could be shared with DOS machines so virtually
+			 * anything could be a valid executable.
+			 */
+			inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
+		}
+		set_nlink(inode, 1);
+	}
+	inode->i_uid = sbi->s_uid;
+	inode->i_gid = sbi->s_gid;
+	inode->i_blocks = 0;
+
+	ei->i_format_parm[0] = 0;
+	ei->i_format_parm[1] = 0;
+	ei->i_format_parm[2] = 0;
+
+	ei->i_section_size = isonum_733(de->size);
+	if (de->flags[-high_sierra] & 0x80) {
+		ret = isofs_read_level3_size(inode);
+		if (ret < 0)
+			goto fail;
+		ret = -EIO;
+	} else {
+		ei->i_next_section_block = 0;
+		ei->i_next_section_offset = 0;
+		inode->i_size = isonum_733(de->size);
+	}
+
+	/*
+	 * Some dipshit decided to store some other bit of information
+	 * in the high byte of the file length.  Truncate size in case
+	 * this CDROM was mounted with the cruft option.
+	 */
+
+	if (sbi->s_cruft)
+		inode->i_size &= 0x00ffffff;
+
+	if (de->interleave[0]) {
+		printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n");
+		inode->i_size = 0;
+	}
+
+	/* I have no idea what file_unit_size is used for, so
+	   we will flag it for now */
+	if (de->file_unit_size[0] != 0) {
+		printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n",
+			inode->i_ino);
+	}
+
+	/* I have no idea what other flag bits are used for, so
+	   we will flag it for now */
+#ifdef DEBUG
+	if((de->flags[-high_sierra] & ~2)!= 0){
+		printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file "
+				"(%ld %x).\n",
+			inode->i_ino, de->flags[-high_sierra]);
+	}
+#endif
+
+	inode->i_mtime.tv_sec =
+	inode->i_atime.tv_sec =
+	inode->i_ctime.tv_sec = iso_date(de->date, high_sierra);
+	inode->i_mtime.tv_nsec =
+	inode->i_atime.tv_nsec =
+	inode->i_ctime.tv_nsec = 0;
+
+	ei->i_first_extent = (isonum_733(de->extent) +
+			isonum_711(de->ext_attr_length));
+
+	/* Set the number of blocks for stat() - should be done before RR */
+	inode->i_blocks = (inode->i_size + 511) >> 9;
+
+	/*
+	 * Now test for possible Rock Ridge extensions which will override
+	 * some of these numbers in the inode structure.
+	 */
+
+	if (!high_sierra) {
+		parse_rock_ridge_inode(de, inode, relocated);
+		/* if we want uid/gid set, override the rock ridge setting */
+		if (sbi->s_uid_set)
+			inode->i_uid = sbi->s_uid;
+		if (sbi->s_gid_set)
+			inode->i_gid = sbi->s_gid;
+	}
+	/* Now set final access rights if overriding rock ridge setting */
+	if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm &&
+	    sbi->s_dmode != ISOFS_INVALID_MODE)
+		inode->i_mode = S_IFDIR | sbi->s_dmode;
+	if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm &&
+	    sbi->s_fmode != ISOFS_INVALID_MODE)
+		inode->i_mode = S_IFREG | sbi->s_fmode;
+
+	/* Install the inode operations vector */
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_fop = &generic_ro_fops;
+		switch (ei->i_file_format) {
+#ifdef CONFIG_ZISOFS
+		case isofs_file_compressed:
+			inode->i_data.a_ops = &zisofs_aops;
+			break;
+#endif
+		default:
+			inode->i_data.a_ops = &isofs_aops;
+			break;
+		}
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &isofs_dir_inode_operations;
+		inode->i_fop = &isofs_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_data.a_ops = &isofs_symlink_aops;
+	} else
+		/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+
+	ret = 0;
+out:
+	kfree(tmpde);
+	if (bh)
+		brelse(bh);
+	return ret;
+
+out_badread:
+	printk(KERN_WARNING "ISOFS: unable to read i-node block\n");
+fail:
+	goto out;
+}
+
+struct isofs_iget5_callback_data {
+	unsigned long block;
+	unsigned long offset;
+};
+
+static int isofs_iget5_test(struct inode *ino, void *data)
+{
+	struct iso_inode_info *i = ISOFS_I(ino);
+	struct isofs_iget5_callback_data *d =
+		(struct isofs_iget5_callback_data*)data;
+	return (i->i_iget5_block == d->block)
+		&& (i->i_iget5_offset == d->offset);
+}
+
+static int isofs_iget5_set(struct inode *ino, void *data)
+{
+	struct iso_inode_info *i = ISOFS_I(ino);
+	struct isofs_iget5_callback_data *d =
+		(struct isofs_iget5_callback_data*)data;
+	i->i_iget5_block = d->block;
+	i->i_iget5_offset = d->offset;
+	return 0;
+}
+
+/* Store, in the inode's containing structure, the block and block
+ * offset that point to the underlying meta-data for the inode.  The
+ * code below is otherwise similar to the iget() code in
+ * include/linux/fs.h */
+struct inode *__isofs_iget(struct super_block *sb,
+			   unsigned long block,
+			   unsigned long offset,
+			   int relocated)
+{
+	unsigned long hashval;
+	struct inode *inode;
+	struct isofs_iget5_callback_data data;
+	long ret;
+
+	if (offset >= 1ul << sb->s_blocksize_bits)
+		return ERR_PTR(-EINVAL);
+
+	data.block = block;
+	data.offset = offset;
+
+	hashval = (block << sb->s_blocksize_bits) | offset;
+
+	inode = iget5_locked(sb, hashval, &isofs_iget5_test,
+				&isofs_iget5_set, &data);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		ret = isofs_read_inode(inode, relocated);
+		if (ret < 0) {
+			iget_failed(inode);
+			inode = ERR_PTR(ret);
+		} else {
+			unlock_new_inode(inode);
+		}
+	}
+
+	return inode;
+}
+
+static struct dentry *isofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	/* We don't support read-write mounts */
+	if (!(flags & MS_RDONLY))
+		return ERR_PTR(-EACCES);
+	return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
+}
+
+static struct file_system_type iso9660_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "iso9660",
+	.mount		= isofs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("iso9660");
+MODULE_ALIAS("iso9660");
+
+static int __init init_iso9660_fs(void)
+{
+	int err = init_inodecache();
+	if (err)
+		goto out;
+#ifdef CONFIG_ZISOFS
+	err = zisofs_init();
+	if (err)
+		goto out1;
+#endif
+	err = register_filesystem(&iso9660_fs_type);
+	if (err)
+		goto out2;
+	return 0;
+out2:
+#ifdef CONFIG_ZISOFS
+	zisofs_cleanup();
+out1:
+#endif
+	destroy_inodecache();
+out:
+	return err;
+}
+
+static void __exit exit_iso9660_fs(void)
+{
+        unregister_filesystem(&iso9660_fs_type);
+#ifdef CONFIG_ZISOFS
+	zisofs_cleanup();
+#endif
+	destroy_inodecache();
+}
+
+module_init(init_iso9660_fs)
+module_exit(exit_iso9660_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/isofs/isofs.h b/kernel/fs/isofs/isofs.h
new file mode 100644
index 000000000..0ac4c1f73
--- /dev/null
+++ b/kernel/fs/isofs/isofs.h
@@ -0,0 +1,198 @@
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/iso_fs.h>
+#include <asm/unaligned.h>
+
+enum isofs_file_format {
+	isofs_file_normal = 0,
+	isofs_file_sparse = 1,
+	isofs_file_compressed = 2,
+};
+	
+/*
+ * iso fs inode data in memory
+ */
+struct iso_inode_info {
+	unsigned long i_iget5_block;
+	unsigned long i_iget5_offset;
+	unsigned int i_first_extent;
+	unsigned char i_file_format;
+	unsigned char i_format_parm[3];
+	unsigned long i_next_section_block;
+	unsigned long i_next_section_offset;
+	off_t i_section_size;
+	struct inode vfs_inode;
+};
+
+/*
+ * iso9660 super-block data in memory
+ */
+struct isofs_sb_info {
+	unsigned long s_ninodes;
+	unsigned long s_nzones;
+	unsigned long s_firstdatazone;
+	unsigned long s_log_zone_size;
+	unsigned long s_max_size;
+	
+	int           s_rock_offset; /* offset of SUSP fields within SU area */
+	unsigned char s_joliet_level;
+	unsigned char s_mapping;
+	unsigned int  s_high_sierra:1;
+	unsigned int  s_rock:2;
+	unsigned int  s_utf8:1;
+	unsigned int  s_cruft:1; /* Broken disks with high byte of length
+				  * containing junk */
+	unsigned int  s_nocompress:1;
+	unsigned int  s_hide:1;
+	unsigned int  s_showassoc:1;
+	unsigned int  s_overriderockperm:1;
+	unsigned int  s_uid_set:1;
+	unsigned int  s_gid_set:1;
+
+	umode_t s_fmode;
+	umode_t s_dmode;
+	kgid_t s_gid;
+	kuid_t s_uid;
+	struct nls_table *s_nls_iocharset; /* Native language support table */
+};
+
+#define ISOFS_INVALID_MODE ((umode_t) -1)
+
+static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct iso_inode_info *ISOFS_I(struct inode *inode)
+{
+	return container_of(inode, struct iso_inode_info, vfs_inode);
+}
+
+static inline int isonum_711(char *p)
+{
+	return *(u8 *)p;
+}
+static inline int isonum_712(char *p)
+{
+	return *(s8 *)p;
+}
+static inline unsigned int isonum_721(char *p)
+{
+	return get_unaligned_le16(p);
+}
+static inline unsigned int isonum_722(char *p)
+{
+	return get_unaligned_be16(p);
+}
+static inline unsigned int isonum_723(char *p)
+{
+	/* Ignore bigendian datum due to broken mastering programs */
+	return get_unaligned_le16(p);
+}
+static inline unsigned int isonum_731(char *p)
+{
+	return get_unaligned_le32(p);
+}
+static inline unsigned int isonum_732(char *p)
+{
+	return get_unaligned_be32(p);
+}
+static inline unsigned int isonum_733(char *p)
+{
+	/* Ignore bigendian datum due to broken mastering programs */
+	return get_unaligned_le32(p);
+}
+extern int iso_date(char *, int);
+
+struct inode;		/* To make gcc happy */
+
+extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated);
+extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *);
+extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *);
+
+int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *);
+int get_acorn_filename(struct iso_directory_record *, char *, struct inode *);
+
+extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int flags);
+extern struct buffer_head *isofs_bread(struct inode *, sector_t);
+extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long);
+
+struct inode *__isofs_iget(struct super_block *sb,
+			   unsigned long block,
+			   unsigned long offset,
+			   int relocated);
+
+static inline struct inode *isofs_iget(struct super_block *sb,
+				       unsigned long block,
+				       unsigned long offset)
+{
+	return __isofs_iget(sb, block, offset, 0);
+}
+
+static inline struct inode *isofs_iget_reloc(struct super_block *sb,
+					     unsigned long block,
+					     unsigned long offset)
+{
+	return __isofs_iget(sb, block, offset, 1);
+}
+
+/* Because the inode number is no longer relevant to finding the
+ * underlying meta-data for an inode, we are free to choose a more
+ * convenient 32-bit number as the inode number.  The inode numbering
+ * scheme was recommended by Sergey Vlasov and Eric Lammerts. */
+static inline unsigned long isofs_get_ino(unsigned long block,
+					  unsigned long offset,
+					  unsigned long bufbits)
+{
+	return (block << (bufbits - 5)) | (offset >> 5);
+}
+
+/* Every directory can have many redundant directory entries scattered
+ * throughout the directory tree.  First there is the directory entry
+ * with the name of the directory stored in the parent directory.
+ * Then, there is the "." directory entry stored in the directory
+ * itself.  Finally, there are possibly many ".." directory entries
+ * stored in all the subdirectories.
+ *
+ * In order for the NFS get_parent() method to work and for the
+ * general consistency of the dcache, we need to make sure the
+ * "i_iget5_block" and "i_iget5_offset" all point to exactly one of
+ * the many redundant entries for each directory.  We normalize the
+ * block and offset by always making them point to the "."  directory.
+ *
+ * Notice that we do not use the entry for the directory with the name
+ * that is located in the parent directory.  Even though choosing this
+ * first directory is more natural, it is much easier to find the "."
+ * entry in the NFS get_parent() method because it is implicitly
+ * encoded in the "extent + ext_attr_length" fields of _all_ the
+ * redundant entries for the directory.  Thus, it can always be
+ * reached regardless of which directory entry you have in hand.
+ *
+ * This works because the "." entry is simply the first directory
+ * record when you start reading the file that holds all the directory
+ * records, and this file starts at "extent + ext_attr_length" blocks.
+ * Because the "." entry is always the first entry listed in the
+ * directories file, the normalized "offset" value is always 0.
+ *
+ * You should pass the directory entry in "de".  On return, "block"
+ * and "offset" will hold normalized values.  Only directories are
+ * affected making it safe to call even for non-directory file
+ * types. */
+static inline void
+isofs_normalize_block_and_offset(struct iso_directory_record* de,
+				 unsigned long *block,
+				 unsigned long *offset)
+{
+	/* Only directories are normalized. */
+	if (de->flags[0] & 2) {
+		*offset = 0;
+		*block = (unsigned long)isonum_733(de->extent)
+			+ (unsigned long)isonum_711(de->ext_attr_length);
+	}
+}
+
+extern const struct inode_operations isofs_dir_inode_operations;
+extern const struct file_operations isofs_dir_operations;
+extern const struct address_space_operations isofs_symlink_aops;
+extern const struct export_operations isofs_export_ops;
diff --git a/kernel/fs/isofs/joliet.c b/kernel/fs/isofs/joliet.c
new file mode 100644
index 000000000..a048de81c
--- /dev/null
+++ b/kernel/fs/isofs/joliet.c
@@ -0,0 +1,69 @@
+/*
+ *  linux/fs/isofs/joliet.c
+ *
+ *  (C) 1996 Gordon Chaffee
+ *
+ *  Joliet: Microsoft's Unicode extensions to iso9660
+ */
+
+#include <linux/types.h>
+#include <linux/nls.h>
+#include "isofs.h"
+
+/*
+ * Convert Unicode 16 to UTF-8 or ASCII.
+ */
+static int
+uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
+{
+	__be16 *ip, ch;
+	unsigned char *op;
+
+	ip = uni;
+	op = ascii;
+
+	while ((ch = get_unaligned(ip)) && len) {
+		int llen;
+		llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE);
+		if (llen > 0)
+			op += llen;
+		else
+			*op++ = '?';
+		ip++;
+
+		len--;
+	}
+	*op = 0;
+	return (op - ascii);
+}
+
+int
+get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
+{
+	unsigned char utf8;
+	struct nls_table *nls;
+	unsigned char len = 0;
+
+	utf8 = ISOFS_SB(inode->i_sb)->s_utf8;
+	nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
+
+	if (utf8) {
+		len = utf16s_to_utf8s((const wchar_t *) de->name,
+				de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
+				outname, PAGE_SIZE);
+	} else {
+		len = uni16_to_x8(outname, (__be16 *) de->name,
+				de->name_len[0] >> 1, nls);
+	}
+	if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1'))
+		len -= 2;
+
+	/*
+	 * Windows doesn't like periods at the end of a name,
+	 * so neither do we
+	 */
+	while (len >= 2 && (outname[len-1] == '.'))
+		len--;
+
+	return len;
+}
diff --git a/kernel/fs/isofs/namei.c b/kernel/fs/isofs/namei.c
new file mode 100644
index 000000000..7b543e6b6
--- /dev/null
+++ b/kernel/fs/isofs/namei.c
@@ -0,0 +1,172 @@
+/*
+ *  linux/fs/isofs/namei.c
+ *
+ *  (C) 1992  Eric Youngdale Modified for ISO 9660 filesystem.
+ *
+ *  (C) 1991  Linus Torvalds - minix filesystem
+ */
+
+#include <linux/gfp.h>
+#include "isofs.h"
+
+/*
+ * ok, we cannot use strncmp, as the name is not in our data space.
+ * Thus we'll have to use isofs_match. No big problem. Match also makes
+ * some sanity tests.
+ */
+static int
+isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
+{
+	struct qstr qstr;
+	qstr.name = compare;
+	qstr.len = dlen;
+	if (likely(!dentry->d_op))
+		return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen);
+	return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
+}
+
+/*
+ *	isofs_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the inode number of the found entry, or 0 on error.
+ */
+static unsigned long
+isofs_find_entry(struct inode *dir, struct dentry *dentry,
+	unsigned long *block_rv, unsigned long *offset_rv,
+	char *tmpname, struct iso_directory_record *tmpde)
+{
+	unsigned long bufsize = ISOFS_BUFFER_SIZE(dir);
+	unsigned char bufbits = ISOFS_BUFFER_BITS(dir);
+	unsigned long block, f_pos, offset, block_saved, offset_saved;
+	struct buffer_head *bh = NULL;
+	struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
+
+	if (!ISOFS_I(dir)->i_first_extent)
+		return 0;
+
+	f_pos = 0;
+	offset = 0;
+	block = 0;
+
+	while (f_pos < dir->i_size) {
+		struct iso_directory_record *de;
+		int de_len, match, i, dlen;
+		char *dpnt;
+
+		if (!bh) {
+			bh = isofs_bread(dir, block);
+			if (!bh)
+				return 0;
+		}
+
+		de = (struct iso_directory_record *) (bh->b_data + offset);
+
+		de_len = *(unsigned char *) de;
+		if (!de_len) {
+			brelse(bh);
+			bh = NULL;
+			f_pos = (f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
+			block = f_pos >> bufbits;
+			offset = 0;
+			continue;
+		}
+
+		block_saved = bh->b_blocknr;
+		offset_saved = offset;
+		offset += de_len;
+		f_pos += de_len;
+
+		/* Make sure we have a full directory entry */
+		if (offset >= bufsize) {
+			int slop = bufsize - offset + de_len;
+			memcpy(tmpde, de, slop);
+			offset &= bufsize - 1;
+			block++;
+			brelse(bh);
+			bh = NULL;
+			if (offset) {
+				bh = isofs_bread(dir, block);
+				if (!bh)
+					return 0;
+				memcpy((void *) tmpde + slop, bh->b_data, offset);
+			}
+			de = tmpde;
+		}
+
+		dlen = de->name_len[0];
+		dpnt = de->name;
+		/* Basic sanity check, whether name doesn't exceed dir entry */
+		if (de_len < dlen + sizeof(struct iso_directory_record)) {
+			printk(KERN_NOTICE "iso9660: Corrupted directory entry"
+			       " in block %lu of inode %lu\n", block,
+			       dir->i_ino);
+			return 0;
+		}
+
+		if (sbi->s_rock &&
+		    ((i = get_rock_ridge_filename(de, tmpname, dir)))) {
+			dlen = i;	/* possibly -1 */
+			dpnt = tmpname;
+#ifdef CONFIG_JOLIET
+		} else if (sbi->s_joliet_level) {
+			dlen = get_joliet_filename(de, tmpname, dir);
+			dpnt = tmpname;
+#endif
+		} else if (sbi->s_mapping == 'a') {
+			dlen = get_acorn_filename(de, tmpname, dir);
+			dpnt = tmpname;
+		} else if (sbi->s_mapping == 'n') {
+			dlen = isofs_name_translate(de, tmpname, dir);
+			dpnt = tmpname;
+		}
+
+		/*
+		 * Skip hidden or associated files unless hide or showassoc,
+		 * respectively, is set
+		 */
+		match = 0;
+		if (dlen > 0 &&
+			(!sbi->s_hide ||
+				(!(de->flags[-sbi->s_high_sierra] & 1))) &&
+			(sbi->s_showassoc ||
+				(!(de->flags[-sbi->s_high_sierra] & 4)))) {
+			if (dpnt && (dlen > 1 || dpnt[0] > 1))
+				match = (isofs_cmp(dentry, dpnt, dlen) == 0);
+		}
+		if (match) {
+			isofs_normalize_block_and_offset(de,
+							 &block_saved,
+							 &offset_saved);
+			*block_rv = block_saved;
+			*offset_rv = offset_saved;
+			brelse(bh);
+			return 1;
+		}
+	}
+	brelse(bh);
+	return 0;
+}
+
+struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	int found;
+	unsigned long uninitialized_var(block);
+	unsigned long uninitialized_var(offset);
+	struct inode *inode;
+	struct page *page;
+
+	page = alloc_page(GFP_USER);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	found = isofs_find_entry(dir, dentry,
+				&block, &offset,
+				page_address(page),
+				1024 + page_address(page));
+	__free_page(page);
+
+	inode = found ? isofs_iget(dir->i_sb, block, offset) : NULL;
+
+	return d_splice_alias(inode, dentry);
+}
diff --git a/kernel/fs/isofs/rock.c b/kernel/fs/isofs/rock.c
new file mode 100644
index 000000000..735d7522a
--- /dev/null
+++ b/kernel/fs/isofs/rock.c
@@ -0,0 +1,801 @@
+/*
+ *  linux/fs/isofs/rock.c
+ *
+ *  (C) 1992, 1993  Eric Youngdale
+ *
+ *  Rock Ridge Extensions to iso9660
+ */
+
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+
+#include "isofs.h"
+#include "rock.h"
+
+/*
+ * These functions are designed to read the system areas of a directory record
+ * and extract relevant information.  There are different functions provided
+ * depending upon what information we need at the time.  One function fills
+ * out an inode structure, a second one extracts a filename, a third one
+ * returns a symbolic link name, and a fourth one returns the extent number
+ * for the file.
+ */
+
+#define SIG(A,B) ((A) | ((B) << 8))	/* isonum_721() */
+
+struct rock_state {
+	void *buffer;
+	unsigned char *chr;
+	int len;
+	int cont_size;
+	int cont_extent;
+	int cont_offset;
+	int cont_loops;
+	struct inode *inode;
+};
+
+/*
+ * This is a way of ensuring that we have something in the system
+ * use fields that is compatible with Rock Ridge.  Return zero on success.
+ */
+
+static int check_sp(struct rock_ridge *rr, struct inode *inode)
+{
+	if (rr->u.SP.magic[0] != 0xbe)
+		return -1;
+	if (rr->u.SP.magic[1] != 0xef)
+		return -1;
+	ISOFS_SB(inode->i_sb)->s_rock_offset = rr->u.SP.skip;
+	return 0;
+}
+
+static void setup_rock_ridge(struct iso_directory_record *de,
+			struct inode *inode, struct rock_state *rs)
+{
+	rs->len = sizeof(struct iso_directory_record) + de->name_len[0];
+	if (rs->len & 1)
+		(rs->len)++;
+	rs->chr = (unsigned char *)de + rs->len;
+	rs->len = *((unsigned char *)de) - rs->len;
+	if (rs->len < 0)
+		rs->len = 0;
+
+	if (ISOFS_SB(inode->i_sb)->s_rock_offset != -1) {
+		rs->len -= ISOFS_SB(inode->i_sb)->s_rock_offset;
+		rs->chr += ISOFS_SB(inode->i_sb)->s_rock_offset;
+		if (rs->len < 0)
+			rs->len = 0;
+	}
+}
+
+static void init_rock_state(struct rock_state *rs, struct inode *inode)
+{
+	memset(rs, 0, sizeof(*rs));
+	rs->inode = inode;
+}
+
+/* Maximum number of Rock Ridge continuation entries */
+#define RR_MAX_CE_ENTRIES 32
+
+/*
+ * Returns 0 if the caller should continue scanning, 1 if the scan must end
+ * and -ve on error.
+ */
+static int rock_continue(struct rock_state *rs)
+{
+	int ret = 1;
+	int blocksize = 1 << rs->inode->i_blkbits;
+	const int min_de_size = offsetof(struct rock_ridge, u);
+
+	kfree(rs->buffer);
+	rs->buffer = NULL;
+
+	if ((unsigned)rs->cont_offset > blocksize - min_de_size ||
+	    (unsigned)rs->cont_size > blocksize ||
+	    (unsigned)(rs->cont_offset + rs->cont_size) > blocksize) {
+		printk(KERN_NOTICE "rock: corrupted directory entry. "
+			"extent=%d, offset=%d, size=%d\n",
+			rs->cont_extent, rs->cont_offset, rs->cont_size);
+		ret = -EIO;
+		goto out;
+	}
+
+	if (rs->cont_extent) {
+		struct buffer_head *bh;
+
+		rs->buffer = kmalloc(rs->cont_size, GFP_KERNEL);
+		if (!rs->buffer) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = -EIO;
+		if (++rs->cont_loops >= RR_MAX_CE_ENTRIES)
+			goto out;
+		bh = sb_bread(rs->inode->i_sb, rs->cont_extent);
+		if (bh) {
+			memcpy(rs->buffer, bh->b_data + rs->cont_offset,
+					rs->cont_size);
+			put_bh(bh);
+			rs->chr = rs->buffer;
+			rs->len = rs->cont_size;
+			rs->cont_extent = 0;
+			rs->cont_size = 0;
+			rs->cont_offset = 0;
+			return 0;
+		}
+		printk("Unable to read rock-ridge attributes\n");
+	}
+out:
+	kfree(rs->buffer);
+	rs->buffer = NULL;
+	return ret;
+}
+
+/*
+ * We think there's a record of type `sig' at rs->chr.  Parse the signature
+ * and make sure that there's really room for a record of that type.
+ */
+static int rock_check_overflow(struct rock_state *rs, int sig)
+{
+	int len;
+
+	switch (sig) {
+	case SIG('S', 'P'):
+		len = sizeof(struct SU_SP_s);
+		break;
+	case SIG('C', 'E'):
+		len = sizeof(struct SU_CE_s);
+		break;
+	case SIG('E', 'R'):
+		len = sizeof(struct SU_ER_s);
+		break;
+	case SIG('R', 'R'):
+		len = sizeof(struct RR_RR_s);
+		break;
+	case SIG('P', 'X'):
+		len = sizeof(struct RR_PX_s);
+		break;
+	case SIG('P', 'N'):
+		len = sizeof(struct RR_PN_s);
+		break;
+	case SIG('S', 'L'):
+		len = sizeof(struct RR_SL_s);
+		break;
+	case SIG('N', 'M'):
+		len = sizeof(struct RR_NM_s);
+		break;
+	case SIG('C', 'L'):
+		len = sizeof(struct RR_CL_s);
+		break;
+	case SIG('P', 'L'):
+		len = sizeof(struct RR_PL_s);
+		break;
+	case SIG('T', 'F'):
+		len = sizeof(struct RR_TF_s);
+		break;
+	case SIG('Z', 'F'):
+		len = sizeof(struct RR_ZF_s);
+		break;
+	default:
+		len = 0;
+		break;
+	}
+	len += offsetof(struct rock_ridge, u);
+	if (len > rs->len) {
+		printk(KERN_NOTICE "rock: directory entry would overflow "
+				"storage\n");
+		printk(KERN_NOTICE "rock: sig=0x%02x, size=%d, remaining=%d\n",
+				sig, len, rs->len);
+		return -EIO;
+	}
+	return 0;
+}
+
+/*
+ * return length of name field; 0: not found, -1: to be ignored
+ */
+int get_rock_ridge_filename(struct iso_directory_record *de,
+			    char *retname, struct inode *inode)
+{
+	struct rock_state rs;
+	struct rock_ridge *rr;
+	int sig;
+	int retnamlen = 0;
+	int truncate = 0;
+	int ret = 0;
+
+	if (!ISOFS_SB(inode->i_sb)->s_rock)
+		return 0;
+	*retname = 0;
+
+	init_rock_state(&rs, inode);
+	setup_rock_ridge(de, inode, &rs);
+repeat:
+
+	while (rs.len > 2) { /* There may be one byte for padding somewhere */
+		rr = (struct rock_ridge *)rs.chr;
+		/*
+		 * Ignore rock ridge info if rr->len is out of range, but
+		 * don't return -EIO because that would make the file
+		 * invisible.
+		 */
+		if (rr->len < 3)
+			goto out;	/* Something got screwed up here */
+		sig = isonum_721(rs.chr);
+		if (rock_check_overflow(&rs, sig))
+			goto eio;
+		rs.chr += rr->len;
+		rs.len -= rr->len;
+		/*
+		 * As above, just ignore the rock ridge info if rr->len
+		 * is bogus.
+		 */
+		if (rs.len < 0)
+			goto out;	/* Something got screwed up here */
+
+		switch (sig) {
+		case SIG('R', 'R'):
+			if ((rr->u.RR.flags[0] & RR_NM) == 0)
+				goto out;
+			break;
+		case SIG('S', 'P'):
+			if (check_sp(rr, inode))
+				goto out;
+			break;
+		case SIG('C', 'E'):
+			rs.cont_extent = isonum_733(rr->u.CE.extent);
+			rs.cont_offset = isonum_733(rr->u.CE.offset);
+			rs.cont_size = isonum_733(rr->u.CE.size);
+			break;
+		case SIG('N', 'M'):
+			if (truncate)
+				break;
+			if (rr->len < 5)
+				break;
+			/*
+			 * If the flags are 2 or 4, this indicates '.' or '..'.
+			 * We don't want to do anything with this, because it
+			 * screws up the code that calls us.  We don't really
+			 * care anyways, since we can just use the non-RR
+			 * name.
+			 */
+			if (rr->u.NM.flags & 6)
+				break;
+
+			if (rr->u.NM.flags & ~1) {
+				printk("Unsupported NM flag settings (%d)\n",
+					rr->u.NM.flags);
+				break;
+			}
+			if ((strlen(retname) + rr->len - 5) >= 254) {
+				truncate = 1;
+				break;
+			}
+			strncat(retname, rr->u.NM.name, rr->len - 5);
+			retnamlen += rr->len - 5;
+			break;
+		case SIG('R', 'E'):
+			kfree(rs.buffer);
+			return -1;
+		default:
+			break;
+		}
+	}
+	ret = rock_continue(&rs);
+	if (ret == 0)
+		goto repeat;
+	if (ret == 1)
+		return retnamlen; /* If 0, this file did not have a NM field */
+out:
+	kfree(rs.buffer);
+	return ret;
+eio:
+	ret = -EIO;
+	goto out;
+}
+
+#define RR_REGARD_XA 1
+#define RR_RELOC_DE 2
+
+static int
+parse_rock_ridge_inode_internal(struct iso_directory_record *de,
+				struct inode *inode, int flags)
+{
+	int symlink_len = 0;
+	int cnt, sig;
+	unsigned int reloc_block;
+	struct inode *reloc;
+	struct rock_ridge *rr;
+	int rootflag;
+	struct rock_state rs;
+	int ret = 0;
+
+	if (!ISOFS_SB(inode->i_sb)->s_rock)
+		return 0;
+
+	init_rock_state(&rs, inode);
+	setup_rock_ridge(de, inode, &rs);
+	if (flags & RR_REGARD_XA) {
+		rs.chr += 14;
+		rs.len -= 14;
+		if (rs.len < 0)
+			rs.len = 0;
+	}
+
+repeat:
+	while (rs.len > 2) { /* There may be one byte for padding somewhere */
+		rr = (struct rock_ridge *)rs.chr;
+		/*
+		 * Ignore rock ridge info if rr->len is out of range, but
+		 * don't return -EIO because that would make the file
+		 * invisible.
+		 */
+		if (rr->len < 3)
+			goto out;	/* Something got screwed up here */
+		sig = isonum_721(rs.chr);
+		if (rock_check_overflow(&rs, sig))
+			goto eio;
+		rs.chr += rr->len;
+		rs.len -= rr->len;
+		/*
+		 * As above, just ignore the rock ridge info if rr->len
+		 * is bogus.
+		 */
+		if (rs.len < 0)
+			goto out;	/* Something got screwed up here */
+
+		switch (sig) {
+#ifndef CONFIG_ZISOFS		/* No flag for SF or ZF */
+		case SIG('R', 'R'):
+			if ((rr->u.RR.flags[0] &
+			     (RR_PX | RR_TF | RR_SL | RR_CL)) == 0)
+				goto out;
+			break;
+#endif
+		case SIG('S', 'P'):
+			if (check_sp(rr, inode))
+				goto out;
+			break;
+		case SIG('C', 'E'):
+			rs.cont_extent = isonum_733(rr->u.CE.extent);
+			rs.cont_offset = isonum_733(rr->u.CE.offset);
+			rs.cont_size = isonum_733(rr->u.CE.size);
+			break;
+		case SIG('E', 'R'):
+			/* Invalid length of ER tag id? */
+			if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len)
+				goto out;
+			ISOFS_SB(inode->i_sb)->s_rock = 1;
+			printk(KERN_DEBUG "ISO 9660 Extensions: ");
+			{
+				int p;
+				for (p = 0; p < rr->u.ER.len_id; p++)
+					printk("%c", rr->u.ER.data[p]);
+			}
+			printk("\n");
+			break;
+		case SIG('P', 'X'):
+			inode->i_mode = isonum_733(rr->u.PX.mode);
+			set_nlink(inode, isonum_733(rr->u.PX.n_links));
+			i_uid_write(inode, isonum_733(rr->u.PX.uid));
+			i_gid_write(inode, isonum_733(rr->u.PX.gid));
+			break;
+		case SIG('P', 'N'):
+			{
+				int high, low;
+				high = isonum_733(rr->u.PN.dev_high);
+				low = isonum_733(rr->u.PN.dev_low);
+				/*
+				 * The Rock Ridge standard specifies that if
+				 * sizeof(dev_t) <= 4, then the high field is
+				 * unused, and the device number is completely
+				 * stored in the low field.  Some writers may
+				 * ignore this subtlety,
+				 * and as a result we test to see if the entire
+				 * device number is
+				 * stored in the low field, and use that.
+				 */
+				if ((low & ~0xff) && high == 0) {
+					inode->i_rdev =
+					    MKDEV(low >> 8, low & 0xff);
+				} else {
+					inode->i_rdev =
+					    MKDEV(high, low);
+				}
+			}
+			break;
+		case SIG('T', 'F'):
+			/*
+			 * Some RRIP writers incorrectly place ctime in the
+			 * TF_CREATE field. Try to handle this correctly for
+			 * either case.
+			 */
+			/* Rock ridge never appears on a High Sierra disk */
+			cnt = 0;
+			if (rr->u.TF.flags & TF_CREATE) {
+				inode->i_ctime.tv_sec =
+				    iso_date(rr->u.TF.times[cnt++].time,
+					     0);
+				inode->i_ctime.tv_nsec = 0;
+			}
+			if (rr->u.TF.flags & TF_MODIFY) {
+				inode->i_mtime.tv_sec =
+				    iso_date(rr->u.TF.times[cnt++].time,
+					     0);
+				inode->i_mtime.tv_nsec = 0;
+			}
+			if (rr->u.TF.flags & TF_ACCESS) {
+				inode->i_atime.tv_sec =
+				    iso_date(rr->u.TF.times[cnt++].time,
+					     0);
+				inode->i_atime.tv_nsec = 0;
+			}
+			if (rr->u.TF.flags & TF_ATTRIBUTES) {
+				inode->i_ctime.tv_sec =
+				    iso_date(rr->u.TF.times[cnt++].time,
+					     0);
+				inode->i_ctime.tv_nsec = 0;
+			}
+			break;
+		case SIG('S', 'L'):
+			{
+				int slen;
+				struct SL_component *slp;
+				struct SL_component *oldslp;
+				slen = rr->len - 5;
+				slp = &rr->u.SL.link;
+				inode->i_size = symlink_len;
+				while (slen > 1) {
+					rootflag = 0;
+					switch (slp->flags & ~1) {
+					case 0:
+						inode->i_size +=
+						    slp->len;
+						break;
+					case 2:
+						inode->i_size += 1;
+						break;
+					case 4:
+						inode->i_size += 2;
+						break;
+					case 8:
+						rootflag = 1;
+						inode->i_size += 1;
+						break;
+					default:
+						printk("Symlink component flag "
+							"not implemented\n");
+					}
+					slen -= slp->len + 2;
+					oldslp = slp;
+					slp = (struct SL_component *)
+						(((char *)slp) + slp->len + 2);
+
+					if (slen < 2) {
+						if (((rr->u.SL.
+						      flags & 1) != 0)
+						    &&
+						    ((oldslp->
+						      flags & 1) == 0))
+							inode->i_size +=
+							    1;
+						break;
+					}
+
+					/*
+					 * If this component record isn't
+					 * continued, then append a '/'.
+					 */
+					if (!rootflag
+					    && (oldslp->flags & 1) == 0)
+						inode->i_size += 1;
+				}
+			}
+			symlink_len = inode->i_size;
+			break;
+		case SIG('R', 'E'):
+			printk(KERN_WARNING "Attempt to read inode for "
+					"relocated directory\n");
+			goto out;
+		case SIG('C', 'L'):
+			if (flags & RR_RELOC_DE) {
+				printk(KERN_ERR
+				       "ISOFS: Recursive directory relocation "
+				       "is not supported\n");
+				goto eio;
+			}
+			reloc_block = isonum_733(rr->u.CL.location);
+			if (reloc_block == ISOFS_I(inode)->i_iget5_block &&
+			    ISOFS_I(inode)->i_iget5_offset == 0) {
+				printk(KERN_ERR
+				       "ISOFS: Directory relocation points to "
+				       "itself\n");
+				goto eio;
+			}
+			ISOFS_I(inode)->i_first_extent = reloc_block;
+			reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0);
+			if (IS_ERR(reloc)) {
+				ret = PTR_ERR(reloc);
+				goto out;
+			}
+			inode->i_mode = reloc->i_mode;
+			set_nlink(inode, reloc->i_nlink);
+			inode->i_uid = reloc->i_uid;
+			inode->i_gid = reloc->i_gid;
+			inode->i_rdev = reloc->i_rdev;
+			inode->i_size = reloc->i_size;
+			inode->i_blocks = reloc->i_blocks;
+			inode->i_atime = reloc->i_atime;
+			inode->i_ctime = reloc->i_ctime;
+			inode->i_mtime = reloc->i_mtime;
+			iput(reloc);
+			break;
+#ifdef CONFIG_ZISOFS
+		case SIG('Z', 'F'): {
+			int algo;
+
+			if (ISOFS_SB(inode->i_sb)->s_nocompress)
+				break;
+			algo = isonum_721(rr->u.ZF.algorithm);
+			if (algo == SIG('p', 'z')) {
+				int block_shift =
+					isonum_711(&rr->u.ZF.parms[1]);
+				if (block_shift > 17) {
+					printk(KERN_WARNING "isofs: "
+						"Can't handle ZF block "
+						"size of 2^%d\n",
+						block_shift);
+				} else {
+					/*
+					 * Note: we don't change
+					 * i_blocks here
+					 */
+					ISOFS_I(inode)->i_file_format =
+						isofs_file_compressed;
+					/*
+					 * Parameters to compression
+					 * algorithm (header size,
+					 * block size)
+					 */
+					ISOFS_I(inode)->i_format_parm[0] =
+						isonum_711(&rr->u.ZF.parms[0]);
+					ISOFS_I(inode)->i_format_parm[1] =
+						isonum_711(&rr->u.ZF.parms[1]);
+					inode->i_size =
+					    isonum_733(rr->u.ZF.
+						       real_size);
+				}
+			} else {
+				printk(KERN_WARNING
+				       "isofs: Unknown ZF compression "
+						"algorithm: %c%c\n",
+				       rr->u.ZF.algorithm[0],
+				       rr->u.ZF.algorithm[1]);
+			}
+			break;
+		}
+#endif
+		default:
+			break;
+		}
+	}
+	ret = rock_continue(&rs);
+	if (ret == 0)
+		goto repeat;
+	if (ret == 1)
+		ret = 0;
+out:
+	kfree(rs.buffer);
+	return ret;
+eio:
+	ret = -EIO;
+	goto out;
+}
+
+static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit)
+{
+	int slen;
+	int rootflag;
+	struct SL_component *oldslp;
+	struct SL_component *slp;
+	slen = rr->len - 5;
+	slp = &rr->u.SL.link;
+	while (slen > 1) {
+		rootflag = 0;
+		switch (slp->flags & ~1) {
+		case 0:
+			if (slp->len > plimit - rpnt)
+				return NULL;
+			memcpy(rpnt, slp->text, slp->len);
+			rpnt += slp->len;
+			break;
+		case 2:
+			if (rpnt >= plimit)
+				return NULL;
+			*rpnt++ = '.';
+			break;
+		case 4:
+			if (2 > plimit - rpnt)
+				return NULL;
+			*rpnt++ = '.';
+			*rpnt++ = '.';
+			break;
+		case 8:
+			if (rpnt >= plimit)
+				return NULL;
+			rootflag = 1;
+			*rpnt++ = '/';
+			break;
+		default:
+			printk("Symlink component flag not implemented (%d)\n",
+			       slp->flags);
+		}
+		slen -= slp->len + 2;
+		oldslp = slp;
+		slp = (struct SL_component *)((char *)slp + slp->len + 2);
+
+		if (slen < 2) {
+			/*
+			 * If there is another SL record, and this component
+			 * record isn't continued, then add a slash.
+			 */
+			if ((!rootflag) && (rr->u.SL.flags & 1) &&
+			    !(oldslp->flags & 1)) {
+				if (rpnt >= plimit)
+					return NULL;
+				*rpnt++ = '/';
+			}
+			break;
+		}
+
+		/*
+		 * If this component record isn't continued, then append a '/'.
+		 */
+		if (!rootflag && !(oldslp->flags & 1)) {
+			if (rpnt >= plimit)
+				return NULL;
+			*rpnt++ = '/';
+		}
+	}
+	return rpnt;
+}
+
+int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
+			   int relocated)
+{
+	int flags = relocated ? RR_RELOC_DE : 0;
+	int result = parse_rock_ridge_inode_internal(de, inode, flags);
+
+	/*
+	 * if rockridge flag was reset and we didn't look for attributes
+	 * behind eventual XA attributes, have a look there
+	 */
+	if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1)
+	    && (ISOFS_SB(inode->i_sb)->s_rock == 2)) {
+		result = parse_rock_ridge_inode_internal(de, inode,
+							 flags | RR_REGARD_XA);
+	}
+	return result;
+}
+
+/*
+ * readpage() for symlinks: reads symlink contents into the page and either
+ * makes it uptodate and returns 0 or returns error (-EIO)
+ */
+static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct iso_inode_info *ei = ISOFS_I(inode);
+	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
+	char *link = kmap(page);
+	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
+	struct buffer_head *bh;
+	char *rpnt = link;
+	unsigned char *pnt;
+	struct iso_directory_record *raw_de;
+	unsigned long block, offset;
+	int sig;
+	struct rock_ridge *rr;
+	struct rock_state rs;
+	int ret;
+
+	if (!sbi->s_rock)
+		goto error;
+
+	init_rock_state(&rs, inode);
+	block = ei->i_iget5_block;
+	bh = sb_bread(inode->i_sb, block);
+	if (!bh)
+		goto out_noread;
+
+	offset = ei->i_iget5_offset;
+	pnt = (unsigned char *)bh->b_data + offset;
+
+	raw_de = (struct iso_directory_record *)pnt;
+
+	/*
+	 * If we go past the end of the buffer, there is some sort of error.
+	 */
+	if (offset + *pnt > bufsize)
+		goto out_bad_span;
+
+	/*
+	 * Now test for possible Rock Ridge extensions which will override
+	 * some of these numbers in the inode structure.
+	 */
+
+	setup_rock_ridge(raw_de, inode, &rs);
+
+repeat:
+	while (rs.len > 2) { /* There may be one byte for padding somewhere */
+		rr = (struct rock_ridge *)rs.chr;
+		if (rr->len < 3)
+			goto out;	/* Something got screwed up here */
+		sig = isonum_721(rs.chr);
+		if (rock_check_overflow(&rs, sig))
+			goto out;
+		rs.chr += rr->len;
+		rs.len -= rr->len;
+		if (rs.len < 0)
+			goto out;	/* corrupted isofs */
+
+		switch (sig) {
+		case SIG('R', 'R'):
+			if ((rr->u.RR.flags[0] & RR_SL) == 0)
+				goto out;
+			break;
+		case SIG('S', 'P'):
+			if (check_sp(rr, inode))
+				goto out;
+			break;
+		case SIG('S', 'L'):
+			rpnt = get_symlink_chunk(rpnt, rr,
+						 link + (PAGE_SIZE - 1));
+			if (rpnt == NULL)
+				goto out;
+			break;
+		case SIG('C', 'E'):
+			/* This tells is if there is a continuation record */
+			rs.cont_extent = isonum_733(rr->u.CE.extent);
+			rs.cont_offset = isonum_733(rr->u.CE.offset);
+			rs.cont_size = isonum_733(rr->u.CE.size);
+		default:
+			break;
+		}
+	}
+	ret = rock_continue(&rs);
+	if (ret == 0)
+		goto repeat;
+	if (ret < 0)
+		goto fail;
+
+	if (rpnt == link)
+		goto fail;
+	brelse(bh);
+	*rpnt = '\0';
+	SetPageUptodate(page);
+	kunmap(page);
+	unlock_page(page);
+	return 0;
+
+	/* error exit from macro */
+out:
+	kfree(rs.buffer);
+	goto fail;
+out_noread:
+	printk("unable to read i-node block");
+	goto fail;
+out_bad_span:
+	printk("symlink spans iso9660 blocks\n");
+fail:
+	brelse(bh);
+error:
+	SetPageError(page);
+	kunmap(page);
+	unlock_page(page);
+	return -EIO;
+}
+
+const struct address_space_operations isofs_symlink_aops = {
+	.readpage = rock_ridge_symlink_readpage
+};
diff --git a/kernel/fs/isofs/rock.h b/kernel/fs/isofs/rock.h
new file mode 100644
index 000000000..ed09e2b08
--- /dev/null
+++ b/kernel/fs/isofs/rock.h
@@ -0,0 +1,122 @@
+/*
+ * These structs are used by the system-use-sharing protocol, in which the
+ * Rock Ridge extensions are embedded.  It is quite possible that other
+ * extensions are present on the disk, and this is fine as long as they
+ * all use SUSP
+ */
+
+struct SU_SP_s {
+	unsigned char magic[2];
+	unsigned char skip;
+} __attribute__ ((packed));
+
+struct SU_CE_s {
+	char extent[8];
+	char offset[8];
+	char size[8];
+};
+
+struct SU_ER_s {
+	unsigned char len_id;
+	unsigned char len_des;
+	unsigned char len_src;
+	unsigned char ext_ver;
+	char data[0];
+} __attribute__ ((packed));
+
+struct RR_RR_s {
+	char flags[1];
+} __attribute__ ((packed));
+
+struct RR_PX_s {
+	char mode[8];
+	char n_links[8];
+	char uid[8];
+	char gid[8];
+};
+
+struct RR_PN_s {
+	char dev_high[8];
+	char dev_low[8];
+};
+
+struct SL_component {
+	unsigned char flags;
+	unsigned char len;
+	char text[0];
+} __attribute__ ((packed));
+
+struct RR_SL_s {
+	unsigned char flags;
+	struct SL_component link;
+} __attribute__ ((packed));
+
+struct RR_NM_s {
+	unsigned char flags;
+	char name[0];
+} __attribute__ ((packed));
+
+struct RR_CL_s {
+	char location[8];
+};
+
+struct RR_PL_s {
+	char location[8];
+};
+
+struct stamp {
+	char time[7];
+} __attribute__ ((packed));
+
+struct RR_TF_s {
+	char flags;
+	struct stamp times[0];	/* Variable number of these beasts */
+} __attribute__ ((packed));
+
+/* Linux-specific extension for transparent decompression */
+struct RR_ZF_s {
+	char algorithm[2];
+	char parms[2];
+	char real_size[8];
+};
+
+/*
+ * These are the bits and their meanings for flags in the TF structure.
+ */
+#define TF_CREATE 1
+#define TF_MODIFY 2
+#define TF_ACCESS 4
+#define TF_ATTRIBUTES 8
+#define TF_BACKUP 16
+#define TF_EXPIRATION 32
+#define TF_EFFECTIVE 64
+#define TF_LONG_FORM 128
+
+struct rock_ridge {
+	char signature[2];
+	unsigned char len;
+	unsigned char version;
+	union {
+		struct SU_SP_s SP;
+		struct SU_CE_s CE;
+		struct SU_ER_s ER;
+		struct RR_RR_s RR;
+		struct RR_PX_s PX;
+		struct RR_PN_s PN;
+		struct RR_SL_s SL;
+		struct RR_NM_s NM;
+		struct RR_CL_s CL;
+		struct RR_PL_s PL;
+		struct RR_TF_s TF;
+		struct RR_ZF_s ZF;
+	} u;
+};
+
+#define RR_PX 1			/* POSIX attributes */
+#define RR_PN 2			/* POSIX devices */
+#define RR_SL 4			/* Symbolic link */
+#define RR_NM 8			/* Alternate Name */
+#define RR_CL 16		/* Child link */
+#define RR_PL 32		/* Parent link */
+#define RR_RE 64		/* Relocation directory */
+#define RR_TF 128		/* Timestamps */
diff --git a/kernel/fs/isofs/util.c b/kernel/fs/isofs/util.c
new file mode 100644
index 000000000..005a15cfd
--- /dev/null
+++ b/kernel/fs/isofs/util.c
@@ -0,0 +1,70 @@
+/*
+ *  linux/fs/isofs/util.c
+ */
+
+#include <linux/time.h>
+#include "isofs.h"
+
+/* 
+ * We have to convert from a MM/DD/YY format to the Unix ctime format.
+ * We have to take into account leap years and all of that good stuff.
+ * Unfortunately, the kernel does not have the information on hand to
+ * take into account daylight savings time, but it shouldn't matter.
+ * The time stored should be localtime (with or without DST in effect),
+ * and the timezone offset should hold the offset required to get back
+ * to GMT.  Thus  we should always be correct.
+ */
+
+int iso_date(char * p, int flag)
+{
+	int year, month, day, hour, minute, second, tz;
+	int crtime;
+
+	year = p[0];
+	month = p[1];
+	day = p[2];
+	hour = p[3];
+	minute = p[4];
+	second = p[5];
+	if (flag == 0) tz = p[6]; /* High sierra has no time zone */
+	else tz = 0;
+	
+	if (year < 0) {
+		crtime = 0;
+	} else {
+		crtime = mktime64(year+1900, month, day, hour, minute, second);
+
+		/* sign extend */
+		if (tz & 0x80)
+			tz |= (-1 << 8);
+		
+		/* 
+		 * The timezone offset is unreliable on some disks,
+		 * so we make a sanity check.  In no case is it ever
+		 * more than 13 hours from GMT, which is 52*15min.
+		 * The time is always stored in localtime with the
+		 * timezone offset being what get added to GMT to
+		 * get to localtime.  Thus we need to subtract the offset
+		 * to get to true GMT, which is what we store the time
+		 * as internally.  On the local system, the user may set
+		 * their timezone any way they wish, of course, so GMT
+		 * gets converted back to localtime on the receiving
+		 * system.
+		 *
+		 * NOTE: mkisofs in versions prior to mkisofs-1.10 had
+		 * the sign wrong on the timezone offset.  This has now
+		 * been corrected there too, but if you are getting screwy
+		 * results this may be the explanation.  If enough people
+		 * complain, a user configuration option could be added
+		 * to add the timezone offset in with the wrong sign
+		 * for 'compatibility' with older discs, but I cannot see how
+		 * it will matter that much.
+		 *
+		 * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann)
+		 * for pointing out the sign error.
+		 */
+		if (-52 <= tz && tz <= 52)
+			crtime -= tz * 15 * 60;
+	}
+	return crtime;
+}		
diff --git a/kernel/fs/isofs/zisofs.h b/kernel/fs/isofs/zisofs.h
new file mode 100644
index 000000000..273795709
--- /dev/null
+++ b/kernel/fs/isofs/zisofs.h
@@ -0,0 +1,21 @@
+/* ----------------------------------------------------------------------- *
+ *   
+ *   Copyright 2001 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ *   USA; either version 2 of the License, or (at your option) any later
+ *   version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Prototypes for functions exported from the compressed isofs subsystem
+ */
+
+#ifdef CONFIG_ZISOFS
+extern const struct address_space_operations zisofs_aops;
+extern int __init zisofs_init(void);
+extern void zisofs_cleanup(void);
+#endif
diff --git a/kernel/fs/jbd/Kconfig b/kernel/fs/jbd/Kconfig
new file mode 100644
index 000000000..4e28beeed
--- /dev/null
+++ b/kernel/fs/jbd/Kconfig
@@ -0,0 +1,30 @@
+config JBD
+	tristate
+	help
+	  This is a generic journalling layer for block devices.  It is
+	  currently used by the ext3 file system, but it could also be
+	  used to add journal support to other file systems or block
+	  devices such as RAID or LVM.
+
+	  If you are using the ext3 file system, you need to say Y here.
+	  If you are not using ext3 then you will probably want to say N.
+
+	  To compile this device as a module, choose M here: the module will be
+	  called jbd.  If you are compiling ext3 into the kernel, you
+	  cannot compile this code as a module.
+
+config JBD_DEBUG
+	bool "JBD (ext3) debugging support"
+	depends on JBD && DEBUG_FS
+	help
+	  If you are using the ext3 journaled file system (or potentially any
+	  other file system/device using JBD), this option allows you to
+	  enable debugging output while the system is running, in order to
+	  help track down any problems you are having.  By default the
+	  debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
+	  number between 1 and 5, the higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/kernel/fs/jbd/Makefile b/kernel/fs/jbd/Makefile
new file mode 100644
index 000000000..54aca4868
--- /dev/null
+++ b/kernel/fs/jbd/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux journaling routines.
+#
+
+obj-$(CONFIG_JBD) += jbd.o
+
+jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/kernel/fs/jbd/checkpoint.c b/kernel/fs/jbd/checkpoint.c
new file mode 100644
index 000000000..95debd71e
--- /dev/null
+++ b/kernel/fs/jbd/checkpoint.c
@@ -0,0 +1,784 @@
+/*
+ * linux/fs/jbd/checkpoint.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Checkpoint routines for the generic filesystem journaling code.
+ * Part of the ext2fs journaling system.
+ *
+ * Checkpointing is the process of ensuring that a section of the log is
+ * committed fully to disk, so that that portion of the log can be
+ * reused.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <trace/events/jbd.h>
+
+/*
+ * Unlink a buffer from a transaction checkpoint list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink_first(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	jh->b_cpnext->b_cpprev = jh->b_cpprev;
+	jh->b_cpprev->b_cpnext = jh->b_cpnext;
+	if (transaction->t_checkpoint_list == jh) {
+		transaction->t_checkpoint_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
+}
+
+/*
+ * Try to release a checkpointed buffer from its transaction.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
+ * Requires j_list_lock
+ * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
+ */
+static int __try_to_free_cp_buf(struct journal_head *jh)
+{
+	int ret = 0;
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
+		/*
+		 * Get our reference so that bh cannot be freed before
+		 * we unlock it
+		 */
+		get_bh(bh);
+		JBUFFER_TRACE(jh, "remove from checkpoint list");
+		ret = __journal_remove_checkpoint(jh) + 1;
+		jbd_unlock_bh_state(bh);
+		BUFFER_TRACE(bh, "release");
+		__brelse(bh);
+	} else {
+		jbd_unlock_bh_state(bh);
+	}
+	return ret;
+}
+
+/*
+ * __log_wait_for_space: wait until there is space in the journal.
+ *
+ * Called under j-state_lock *only*.  It will be unlocked if we have to wait
+ * for a checkpoint to free up some space in the log.
+ */
+void __log_wait_for_space(journal_t *journal)
+{
+	int nblocks, space_left;
+	assert_spin_locked(&journal->j_state_lock);
+
+	nblocks = jbd_space_needed(journal);
+	while (__log_space_left(journal) < nblocks) {
+		if (journal->j_flags & JFS_ABORT)
+			return;
+		spin_unlock(&journal->j_state_lock);
+		if (current->plug)
+			io_schedule();
+		mutex_lock(&journal->j_checkpoint_mutex);
+
+		/*
+		 * Test again, another process may have checkpointed while we
+		 * were waiting for the checkpoint lock. If there are no
+		 * transactions ready to be checkpointed, try to recover
+		 * journal space by calling cleanup_journal_tail(), and if
+		 * that doesn't work, by waiting for the currently committing
+		 * transaction to complete.  If there is absolutely no way
+		 * to make progress, this is either a BUG or corrupted
+		 * filesystem, so abort the journal and leave a stack
+		 * trace for forensic evidence.
+		 */
+		spin_lock(&journal->j_state_lock);
+		spin_lock(&journal->j_list_lock);
+		nblocks = jbd_space_needed(journal);
+		space_left = __log_space_left(journal);
+		if (space_left < nblocks) {
+			int chkpt = journal->j_checkpoint_transactions != NULL;
+			tid_t tid = 0;
+
+			if (journal->j_committing_transaction)
+				tid = journal->j_committing_transaction->t_tid;
+			spin_unlock(&journal->j_list_lock);
+			spin_unlock(&journal->j_state_lock);
+			if (chkpt) {
+				log_do_checkpoint(journal);
+			} else if (cleanup_journal_tail(journal) == 0) {
+				/* We were able to recover space; yay! */
+				;
+			} else if (tid) {
+				log_wait_commit(journal, tid);
+			} else {
+				printk(KERN_ERR "%s: needed %d blocks and "
+				       "only had %d space available\n",
+				       __func__, nblocks, space_left);
+				printk(KERN_ERR "%s: no way to get more "
+				       "journal space\n", __func__);
+				WARN_ON(1);
+				journal_abort(journal, 0);
+			}
+			spin_lock(&journal->j_state_lock);
+		} else {
+			spin_unlock(&journal->j_list_lock);
+		}
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	}
+}
+
+/*
+ * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
+ * The caller must restart a list walk.  Wait for someone else to run
+ * jbd_unlock_bh_state().
+ */
+static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
+	__releases(journal->j_list_lock)
+{
+	get_bh(bh);
+	spin_unlock(&journal->j_list_lock);
+	jbd_lock_bh_state(bh);
+	jbd_unlock_bh_state(bh);
+	put_bh(bh);
+}
+
+/*
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
+ *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
+ * Called with j_list_lock held.
+ */
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
+{
+	struct journal_head *jh;
+	struct buffer_head *bh;
+	tid_t this_tid;
+	int released = 0;
+	int ret = 0;
+
+	this_tid = transaction->t_tid;
+restart:
+	/* Did somebody clean up the transaction in the meanwhile? */
+	if (journal->j_checkpoint_transactions != transaction ||
+			transaction->t_tid != this_tid)
+		return ret;
+	while (!released && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
+		bh = jh2bh(jh);
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
+		get_bh(bh);
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
+			wait_on_buffer(bh);
+			/* the journal_head may have gone by now */
+			BUFFER_TRACE(bh, "brelse");
+			__brelse(bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
+
+		/*
+		 * Now in whatever state the buffer currently is, we know that
+		 * it has been written out and so we can drop it from the list
+		 */
+		released = __journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
+		__brelse(bh);
+	}
+
+	return ret;
+}
+
+#define NR_BATCH	64
+
+static void
+__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
+{
+	int i;
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
+	for (i = 0; i < *batch_count; i++)
+		write_dirty_buffer(bhs[i], WRITE_SYNC);
+	blk_finish_plug(&plug);
+
+	for (i = 0; i < *batch_count; i++) {
+		struct buffer_head *bh = bhs[i];
+		clear_buffer_jwrite(bh);
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+	}
+	*batch_count = 0;
+}
+
+/*
+ * Try to flush one buffer from the checkpoint list to disk.
+ *
+ * Return 1 if something happened which requires us to abort the current
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
+ *
+ * Called with j_list_lock held and drops it if 1 is returned
+ * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
+ */
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count)
+{
+	struct buffer_head *bh = jh2bh(jh);
+	int ret = 0;
+
+	if (buffer_locked(bh)) {
+		get_bh(bh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		wait_on_buffer(bh);
+		/* the journal_head may have gone by now */
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+		ret = 1;
+	} else if (jh->b_transaction != NULL) {
+		transaction_t *t = jh->b_transaction;
+		tid_t tid = t->t_tid;
+
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		log_start_commit(journal, tid);
+		log_wait_commit(journal, tid);
+		ret = 1;
+	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
+		get_bh(bh);
+		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+		BUFFER_TRACE(bh, "remove from checkpoint");
+		__journal_remove_checkpoint(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		__brelse(bh);
+	} else {
+		/*
+		 * Important: we are about to write the buffer, and
+		 * possibly block, while still holding the journal lock.
+		 * We cannot afford to let the transaction logic start
+		 * messing around with this buffer before we write it to
+		 * disk, as that would break recoverability.
+		 */
+		BUFFER_TRACE(bh, "queue");
+		get_bh(bh);
+		J_ASSERT_BH(bh, !buffer_jwrite(bh));
+		set_buffer_jwrite(bh);
+		bhs[*batch_count] = bh;
+		__buffer_relink_io(jh);
+		jbd_unlock_bh_state(bh);
+		(*batch_count)++;
+		if (*batch_count == NR_BATCH) {
+			spin_unlock(&journal->j_list_lock);
+			__flush_batch(journal, bhs, batch_count);
+			ret = 1;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
+ *
+ * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
+ */
+int log_do_checkpoint(journal_t *journal)
+{
+	transaction_t *transaction;
+	tid_t this_tid;
+	int result;
+
+	jbd_debug(1, "Start checkpoint\n");
+
+	/*
+	 * First thing: if there are any transactions in the log which
+	 * don't need checkpointing, just eliminate them from the
+	 * journal straight away.
+	 */
+	result = cleanup_journal_tail(journal);
+	trace_jbd_checkpoint(journal, result);
+	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+	if (result <= 0)
+		return result;
+
+	/*
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
+	 */
+	result = 0;
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	this_tid = transaction->t_tid;
+restart:
+	/*
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
+	 */
+	if (journal->j_checkpoint_transactions == transaction &&
+			transaction->t_tid == this_tid) {
+		int batch_count = 0;
+		struct buffer_head *bhs[NR_BATCH];
+		struct journal_head *jh;
+		int retry = 0, err;
+
+		while (!retry && transaction->t_checkpoint_list) {
+			struct buffer_head *bh;
+
+			jh = transaction->t_checkpoint_list;
+			bh = jh2bh(jh);
+			if (!jbd_trylock_bh_state(bh)) {
+				jbd_sync_bh(journal, bh);
+				retry = 1;
+				break;
+			}
+			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (retry < 0 && !result)
+				result = retry;
+			if (!retry && (need_resched() ||
+				spin_needbreak(&journal->j_list_lock))) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+				break;
+			}
+		}
+
+		if (batch_count) {
+			if (!retry) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+			}
+			__flush_batch(journal, bhs, &batch_count);
+		}
+
+		if (retry) {
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
+		/*
+		 * Now we have cleaned up the first transaction's checkpoint
+		 * list. Let's clean up the second one
+		 */
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
+	}
+out:
+	spin_unlock(&journal->j_list_lock);
+	if (result < 0)
+		journal_abort(journal, result);
+	else
+		result = cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
+}
+
+/*
+ * Check the list of checkpoint transactions for the journal to see if
+ * we have already got rid of any since the last update of the log tail
+ * in the journal superblock.  If so, we can instantly roll the
+ * superblock forward to remove those transactions from the log.
+ *
+ * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
+ *
+ * This is the only part of the journaling code which really needs to be
+ * aware of transaction aborts.  Checkpointing involves writing to the
+ * main filesystem area rather than to the journal, so it can proceed
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
+ */
+
+int cleanup_journal_tail(journal_t *journal)
+{
+	transaction_t * transaction;
+	tid_t		first_tid;
+	unsigned int	blocknr, freed;
+
+	if (is_journal_aborted(journal))
+		return 1;
+
+	/*
+	 * OK, work out the oldest transaction remaining in the log, and
+	 * the log block it starts at.
+	 *
+	 * If the log is now empty, we need to work out which is the
+	 * next transaction ID we will write, and where it will
+	 * start.
+	 */
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	transaction = journal->j_checkpoint_transactions;
+	if (transaction) {
+		first_tid = transaction->t_tid;
+		blocknr = transaction->t_log_start;
+	} else if ((transaction = journal->j_committing_transaction) != NULL) {
+		first_tid = transaction->t_tid;
+		blocknr = transaction->t_log_start;
+	} else if ((transaction = journal->j_running_transaction) != NULL) {
+		first_tid = transaction->t_tid;
+		blocknr = journal->j_head;
+	} else {
+		first_tid = journal->j_transaction_sequence;
+		blocknr = journal->j_head;
+	}
+	spin_unlock(&journal->j_list_lock);
+	J_ASSERT(blocknr != 0);
+
+	/* If the oldest pinned transaction is at the tail of the log
+           already then there's not much we can do right now. */
+	if (journal->j_tail_sequence == first_tid) {
+		spin_unlock(&journal->j_state_lock);
+		return 1;
+	}
+	spin_unlock(&journal->j_state_lock);
+
+	/*
+	 * We need to make sure that any blocks that were recently written out
+	 * --- perhaps by log_do_checkpoint() --- are flushed out before we
+	 * drop the transactions from the journal. Similarly we need to be sure
+	 * superblock makes it to disk before next transaction starts reusing
+	 * freed space (otherwise we could replay some blocks of the new
+	 * transaction thinking they belong to the old one). So we use
+	 * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially
+	 * with an appropriately sized journal, but we need this to guarantee
+	 * correctness.  Fortunately cleanup_journal_tail() doesn't get called
+	 * all that often.
+	 */
+	journal_update_sb_log_tail(journal, first_tid, blocknr,
+				   WRITE_FLUSH_FUA);
+
+	spin_lock(&journal->j_state_lock);
+	/* OK, update the superblock to recover the freed space.
+	 * Physical blocks come first: have we wrapped beyond the end of
+	 * the log?  */
+	freed = blocknr - journal->j_tail;
+	if (blocknr < journal->j_tail)
+		freed = freed + journal->j_last - journal->j_first;
+
+	trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
+	jbd_debug(1,
+		  "Cleaning journal tail from %d to %d (offset %u), "
+		  "freeing %u\n",
+		  journal->j_tail_sequence, first_tid, blocknr, freed);
+
+	journal->j_free += freed;
+	journal->j_tail_sequence = first_tid;
+	journal->j_tail = blocknr;
+	spin_unlock(&journal->j_state_lock);
+	return 0;
+}
+
+
+/* Checkpoint list management */
+
+/*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release
+ * them.
+ *
+ * Called with j_list_lock held.
+ * Returns number of buffers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret, freed = 0;
+
+	*released = 0;
+	if (!jh)
+		return 0;
+
+	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+		/* Use trylock because of the ranking */
+		if (jbd_trylock_bh_state(jh2bh(jh))) {
+			ret = __try_to_free_cp_buf(jh);
+			if (ret) {
+				freed++;
+				if (ret == 2) {
+					*released = 1;
+					return freed;
+				}
+			}
+		}
+		/*
+		 * This function only frees up some memory
+		 * if possible so we dont have an obligation
+		 * to finish processing. Bail out if preemption
+		 * requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
+/*
+ * journal_clean_checkpoint_list
+ *
+ * Find all the written-back checkpoint buffers in the journal and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of buffers reaped (for debug)
+ */
+
+int __journal_clean_checkpoint_list(journal_t *journal)
+{
+	transaction_t *transaction, *last_transaction, *next_transaction;
+	int ret = 0;
+	int released;
+
+	transaction = journal->j_checkpoint_transactions;
+	if (!transaction)
+		goto out;
+
+	last_transaction = transaction->t_cpprev;
+	next_transaction = transaction;
+	do {
+		transaction = next_transaction;
+		next_transaction = transaction->t_cpnext;
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_list, &released);
+		/*
+		 * This function only frees up some memory if possible so we
+		 * dont have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			goto out;
+		if (released)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list, &released);
+		if (need_resched())
+			goto out;
+	} while (transaction != last_transaction);
+out:
+	return ret;
+}
+
+/*
+ * journal_remove_checkpoint: called after a buffer has been committed
+ * to disk (either by being write-back flushed to disk, or being
+ * committed to the log).
+ *
+ * We cannot safely clean a transaction out of the log until all of the
+ * buffer updates committed in that transaction have safely been stored
+ * elsewhere on disk.  To achieve this, all of the buffers in a
+ * transaction need to be maintained on the transaction's checkpoint
+ * lists until they have been rewritten, at which point this function is
+ * called to remove the buffer from the existing transaction's
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
+ * The function can free jh and bh.
+ *
+ * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
+ */
+
+int __journal_remove_checkpoint(struct journal_head *jh)
+{
+	transaction_t *transaction;
+	journal_t *journal;
+	int ret = 0;
+
+	JBUFFER_TRACE(jh, "entry");
+
+	if ((transaction = jh->b_cp_transaction) == NULL) {
+		JBUFFER_TRACE(jh, "not on transaction");
+		goto out;
+	}
+	journal = transaction->t_journal;
+
+	JBUFFER_TRACE(jh, "removing from transaction");
+	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
+	journal_put_journal_head(jh);
+
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
+		goto out;
+
+	/*
+	 * There is one special case to worry about: if we have just pulled the
+	 * buffer off a running or committing transaction's checkpoing list,
+	 * then even if the checkpoint list is empty, the transaction obviously
+	 * cannot be dropped!
+	 *
+	 * The locking here around t_state is a bit sleazy.
+	 * See the comment at the end of journal_commit_transaction().
+	 */
+	if (transaction->t_state != T_FINISHED)
+		goto out;
+
+	/* OK, that was the last buffer for the transaction: we can now
+	   safely remove this transaction from the log */
+
+	__journal_drop_transaction(journal, transaction);
+
+	/* Just in case anybody was waiting for more transactions to be
+           checkpointed... */
+	wake_up(&journal->j_wait_logspace);
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * journal_insert_checkpoint: put a committed buffer onto a checkpoint
+ * list so that we know when it is safe to clean the transaction out of
+ * the log.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ */
+void __journal_insert_checkpoint(struct journal_head *jh,
+			       transaction_t *transaction)
+{
+	JBUFFER_TRACE(jh, "entry");
+	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
+	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+
+	/* Get reference for checkpointing transaction */
+	journal_grab_journal_head(jh2bh(jh));
+	jh->b_cp_transaction = transaction;
+
+	if (!transaction->t_checkpoint_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_list;
+		jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_list = jh;
+}
+
+/*
+ * We've finished with this transaction structure: adios...
+ *
+ * The transaction must have no links except for the checkpoint by this
+ * point.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ */
+
+void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
+{
+	assert_spin_locked(&journal->j_list_lock);
+	if (transaction->t_cpnext) {
+		transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
+		transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
+		if (journal->j_checkpoint_transactions == transaction)
+			journal->j_checkpoint_transactions =
+				transaction->t_cpnext;
+		if (journal->j_checkpoint_transactions == transaction)
+			journal->j_checkpoint_transactions = NULL;
+	}
+
+	J_ASSERT(transaction->t_state == T_FINISHED);
+	J_ASSERT(transaction->t_buffers == NULL);
+	J_ASSERT(transaction->t_sync_datalist == NULL);
+	J_ASSERT(transaction->t_forget == NULL);
+	J_ASSERT(transaction->t_iobuf_list == NULL);
+	J_ASSERT(transaction->t_shadow_list == NULL);
+	J_ASSERT(transaction->t_log_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
+	J_ASSERT(transaction->t_updates == 0);
+	J_ASSERT(journal->j_committing_transaction != transaction);
+	J_ASSERT(journal->j_running_transaction != transaction);
+
+	trace_jbd_drop_transaction(journal, transaction);
+	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+	kfree(transaction);
+}
diff --git a/kernel/fs/jbd/commit.c b/kernel/fs/jbd/commit.c
new file mode 100644
index 000000000..bb217dcb4
--- /dev/null
+++ b/kernel/fs/jbd/commit.c
@@ -0,0 +1,1021 @@
+/*
+ * linux/fs/jbd/commit.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal commit routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <trace/events/jbd.h>
+
+/*
+ * Default IO end handler for temporary BJ_IO buffer_heads.
+ */
+static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	BUFFER_TRACE(bh, "");
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+	unlock_buffer(bh);
+}
+
+/*
+ * When an ext3-ordered file is truncated, it is possible that many pages are
+ * not successfully freed, because they are attached to a committing transaction.
+ * After the transaction commits, these pages are left on the LRU, with no
+ * ->mapping, and with attached buffers.  These pages are trivially reclaimable
+ * by the VM, but their apparent absence upsets the VM accounting, and it makes
+ * the numbers in /proc/meminfo look odd.
+ *
+ * So here, we have a buffer which has just come off the forget list.  Look to
+ * see if we can strip all buffers from the backing page.
+ *
+ * Called under journal->j_list_lock.  The caller provided us with a ref
+ * against the buffer, and we drop that here.
+ */
+static void release_buffer_page(struct buffer_head *bh)
+{
+	struct page *page;
+
+	if (buffer_dirty(bh))
+		goto nope;
+	if (atomic_read(&bh->b_count) != 1)
+		goto nope;
+	page = bh->b_page;
+	if (!page)
+		goto nope;
+	if (page->mapping)
+		goto nope;
+
+	/* OK, it's a truncated page */
+	if (!trylock_page(page))
+		goto nope;
+
+	page_cache_get(page);
+	__brelse(bh);
+	try_to_free_buffers(page);
+	unlock_page(page);
+	page_cache_release(page);
+	return;
+
+nope:
+	__brelse(bh);
+}
+
+/*
+ * Decrement reference counter for data buffer. If it has been marked
+ * 'BH_Freed', release it and the page to which it belongs if possible.
+ */
+static void release_data_buffer(struct buffer_head *bh)
+{
+	if (buffer_freed(bh)) {
+		WARN_ON_ONCE(buffer_dirty(bh));
+		clear_buffer_freed(bh);
+		clear_buffer_mapped(bh);
+		clear_buffer_new(bh);
+		clear_buffer_req(bh);
+		bh->b_bdev = NULL;
+		release_buffer_page(bh);
+	} else
+		put_bh(bh);
+}
+
+/*
+ * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
+ * held.  For ranking reasons we must trylock.  If we lose, schedule away and
+ * return 0.  j_list_lock is dropped in this case.
+ */
+static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+{
+	if (!jbd_trylock_bh_state(bh)) {
+		spin_unlock(&journal->j_list_lock);
+		schedule();
+		return 0;
+	}
+	return 1;
+}
+
+/* Done it all: now write the commit record.  We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+ * mode we can now just skip the rest of the journal write
+ * entirely.
+ *
+ * Returns 1 if the journal needs to be aborted or 0 on success
+ */
+static int journal_write_commit_record(journal_t *journal,
+					transaction_t *commit_transaction)
+{
+	struct journal_head *descriptor;
+	struct buffer_head *bh;
+	journal_header_t *header;
+	int ret;
+
+	if (is_journal_aborted(journal))
+		return 0;
+
+	descriptor = journal_get_descriptor_buffer(journal);
+	if (!descriptor)
+		return 1;
+
+	bh = jh2bh(descriptor);
+
+	header = (journal_header_t *)(bh->b_data);
+	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+
+	JBUFFER_TRACE(descriptor, "write commit block");
+	set_buffer_dirty(bh);
+
+	if (journal->j_flags & JFS_BARRIER)
+		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
+	else
+		ret = sync_dirty_buffer(bh);
+
+	put_bh(bh);		/* One for getblk() */
+	journal_put_journal_head(descriptor);
+
+	return (ret == -EIO);
+}
+
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
+				   int write_op)
+{
+	int i;
+
+	for (i = 0; i < bufs; i++) {
+		wbuf[i]->b_end_io = end_buffer_write_sync;
+		/*
+		 * Here we write back pagecache data that may be mmaped. Since
+		 * we cannot afford to clean the page and set PageWriteback
+		 * here due to lock ordering (page lock ranks above transaction
+		 * start), the data can change while IO is in flight. Tell the
+		 * block layer it should bounce the bio pages if stable data
+		 * during write is required.
+		 *
+		 * We use up our safety reference in submit_bh().
+		 */
+		_submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
+	}
+}
+
+/*
+ *  Submit all the data buffers to disk
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+				       transaction_t *commit_transaction,
+				       int write_op)
+{
+	struct journal_head *jh;
+	struct buffer_head *bh;
+	int locked;
+	int bufs = 0;
+	struct buffer_head **wbuf = journal->j_wbuf;
+	int err = 0;
+
+	/*
+	 * Whenever we unlock the journal and sleep, things can get added
+	 * onto ->t_sync_datalist, so we have to keep looping back to
+	 * write_out_data until we *know* that the list is empty.
+	 *
+	 * Cleanup any flushed data buffers from the data list.  Even in
+	 * abort mode, we want to flush this out as soon as possible.
+	 */
+write_out_data:
+	cond_resched();
+	spin_lock(&journal->j_list_lock);
+
+	while (commit_transaction->t_sync_datalist) {
+		jh = commit_transaction->t_sync_datalist;
+		bh = jh2bh(jh);
+		locked = 0;
+
+		/* Get reference just to make sure buffer does not disappear
+		 * when we are forced to drop various locks */
+		get_bh(bh);
+		/* If the buffer is dirty, we need to submit IO and hence
+		 * we need the buffer lock. We try to lock the buffer without
+		 * blocking. If we fail, we need to drop j_list_lock and do
+		 * blocking lock_buffer().
+		 */
+		if (buffer_dirty(bh)) {
+			if (!trylock_buffer(bh)) {
+				BUFFER_TRACE(bh, "needs blocking lock");
+				spin_unlock(&journal->j_list_lock);
+				trace_jbd_do_submit_data(journal,
+						     commit_transaction);
+				/* Write out all data to prevent deadlocks */
+				journal_do_submit_data(wbuf, bufs, write_op);
+				bufs = 0;
+				lock_buffer(bh);
+				spin_lock(&journal->j_list_lock);
+			}
+			locked = 1;
+		}
+		/* We have to get bh_state lock. Again out of order, sigh. */
+		if (!inverted_lock(journal, bh)) {
+			jbd_lock_bh_state(bh);
+			spin_lock(&journal->j_list_lock);
+		}
+		/* Someone already cleaned up the buffer? */
+		if (!buffer_jbd(bh) || bh2jh(bh) != jh
+			|| jh->b_transaction != commit_transaction
+			|| jh->b_jlist != BJ_SyncData) {
+			jbd_unlock_bh_state(bh);
+			if (locked)
+				unlock_buffer(bh);
+			BUFFER_TRACE(bh, "already cleaned up");
+			release_data_buffer(bh);
+			continue;
+		}
+		if (locked && test_clear_buffer_dirty(bh)) {
+			BUFFER_TRACE(bh, "needs writeout, adding to array");
+			wbuf[bufs++] = bh;
+			__journal_file_buffer(jh, commit_transaction,
+						BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			if (bufs == journal->j_wbufsize) {
+				spin_unlock(&journal->j_list_lock);
+				trace_jbd_do_submit_data(journal,
+						     commit_transaction);
+				journal_do_submit_data(wbuf, bufs, write_op);
+				bufs = 0;
+				goto write_out_data;
+			}
+		} else if (!locked && buffer_locked(bh)) {
+			__journal_file_buffer(jh, commit_transaction,
+						BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			put_bh(bh);
+		} else {
+			BUFFER_TRACE(bh, "writeout complete: unfile");
+			if (unlikely(!buffer_uptodate(bh)))
+				err = -EIO;
+			__journal_unfile_buffer(jh);
+			jbd_unlock_bh_state(bh);
+			if (locked)
+				unlock_buffer(bh);
+			release_data_buffer(bh);
+		}
+
+		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
+			spin_unlock(&journal->j_list_lock);
+			goto write_out_data;
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+	trace_jbd_do_submit_data(journal, commit_transaction);
+	journal_do_submit_data(wbuf, bufs, write_op);
+
+	return err;
+}
+
+/*
+ * journal_commit_transaction
+ *
+ * The primary function for committing a transaction to the log.  This
+ * function is called by the journal thread to begin a complete commit.
+ */
+void journal_commit_transaction(journal_t *journal)
+{
+	transaction_t *commit_transaction;
+	struct journal_head *jh, *new_jh, *descriptor;
+	struct buffer_head **wbuf = journal->j_wbuf;
+	int bufs;
+	int flags;
+	int err;
+	unsigned int blocknr;
+	ktime_t start_time;
+	u64 commit_time;
+	char *tagp = NULL;
+	journal_header_t *header;
+	journal_block_tag_t *tag = NULL;
+	int space_left = 0;
+	int first_tag = 0;
+	int tag_flag;
+	int i;
+	struct blk_plug plug;
+	int write_op = WRITE;
+
+	/*
+	 * First job: lock down the current transaction and wait for
+	 * all outstanding updates to complete.
+	 */
+
+	/* Do we need to erase the effects of a prior journal_flush? */
+	if (journal->j_flags & JFS_FLUSHED) {
+		jbd_debug(3, "super block updated\n");
+		mutex_lock(&journal->j_checkpoint_mutex);
+		/*
+		 * We hold j_checkpoint_mutex so tail cannot change under us.
+		 * We don't need any special data guarantees for writing sb
+		 * since journal is empty and it is ok for write to be
+		 * flushed only with transaction commit.
+		 */
+		journal_update_sb_log_tail(journal, journal->j_tail_sequence,
+					   journal->j_tail, WRITE_SYNC);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	} else {
+		jbd_debug(3, "superblock not updated\n");
+	}
+
+	J_ASSERT(journal->j_running_transaction != NULL);
+	J_ASSERT(journal->j_committing_transaction == NULL);
+
+	commit_transaction = journal->j_running_transaction;
+
+	trace_jbd_start_commit(journal, commit_transaction);
+	jbd_debug(1, "JBD: starting commit of transaction %d\n",
+			commit_transaction->t_tid);
+
+	spin_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_RUNNING);
+	commit_transaction->t_state = T_LOCKED;
+
+	trace_jbd_commit_locking(journal, commit_transaction);
+	spin_lock(&commit_transaction->t_handle_lock);
+	while (commit_transaction->t_updates) {
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+					TASK_UNINTERRUPTIBLE);
+		if (commit_transaction->t_updates) {
+			spin_unlock(&commit_transaction->t_handle_lock);
+			spin_unlock(&journal->j_state_lock);
+			schedule();
+			spin_lock(&journal->j_state_lock);
+			spin_lock(&commit_transaction->t_handle_lock);
+		}
+		finish_wait(&journal->j_wait_updates, &wait);
+	}
+	spin_unlock(&commit_transaction->t_handle_lock);
+
+	J_ASSERT (commit_transaction->t_outstanding_credits <=
+			journal->j_max_transaction_buffers);
+
+	/*
+	 * First thing we are allowed to do is to discard any remaining
+	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
+	 * that there are no such buffers: if a large filesystem
+	 * operation like a truncate needs to split itself over multiple
+	 * transactions, then it may try to do a journal_restart() while
+	 * there are still BJ_Reserved buffers outstanding.  These must
+	 * be released cleanly from the current transaction.
+	 *
+	 * In this case, the filesystem must still reserve write access
+	 * again before modifying the buffer in the new transaction, but
+	 * we do not require it to remember exactly which old buffers it
+	 * has reserved.  This is consistent with the existing behaviour
+	 * that multiple journal_get_write_access() calls to the same
+	 * buffer are perfectly permissible.
+	 */
+	while (commit_transaction->t_reserved_list) {
+		jh = commit_transaction->t_reserved_list;
+		JBUFFER_TRACE(jh, "reserved, unused: refile");
+		/*
+		 * A journal_get_undo_access()+journal_release_buffer() may
+		 * leave undo-committed data.
+		 */
+		if (jh->b_committed_data) {
+			struct buffer_head *bh = jh2bh(jh);
+
+			jbd_lock_bh_state(bh);
+			jbd_free(jh->b_committed_data, bh->b_size);
+			jh->b_committed_data = NULL;
+			jbd_unlock_bh_state(bh);
+		}
+		journal_refile_buffer(journal, jh);
+	}
+
+	/*
+	 * Now try to drop any written-back buffers from the journal's
+	 * checkpoint lists.  We do this *before* commit because it potentially
+	 * frees some memory
+	 */
+	spin_lock(&journal->j_list_lock);
+	__journal_clean_checkpoint_list(journal);
+	spin_unlock(&journal->j_list_lock);
+
+	jbd_debug (3, "JBD: commit phase 1\n");
+
+	/*
+	 * Clear revoked flag to reflect there is no revoked buffers
+	 * in the next transaction which is going to be started.
+	 */
+	journal_clear_buffer_revoked_flags(journal);
+
+	/*
+	 * Switch to a new revoke table.
+	 */
+	journal_switch_revoke_table(journal);
+
+	trace_jbd_commit_flushing(journal, commit_transaction);
+	commit_transaction->t_state = T_FLUSH;
+	journal->j_committing_transaction = commit_transaction;
+	journal->j_running_transaction = NULL;
+	start_time = ktime_get();
+	commit_transaction->t_log_start = journal->j_head;
+	wake_up(&journal->j_wait_transaction_locked);
+	spin_unlock(&journal->j_state_lock);
+
+	jbd_debug (3, "JBD: commit phase 2\n");
+
+	if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
+		write_op = WRITE_SYNC;
+
+	/*
+	 * Now start flushing things to disk, in the order they appear
+	 * on the transaction lists.  Data blocks go first.
+	 */
+	blk_start_plug(&plug);
+	err = journal_submit_data_buffers(journal, commit_transaction,
+					  write_op);
+	blk_finish_plug(&plug);
+
+	/*
+	 * Wait for all previously submitted IO to complete.
+	 */
+	spin_lock(&journal->j_list_lock);
+	while (commit_transaction->t_locked_list) {
+		struct buffer_head *bh;
+
+		jh = commit_transaction->t_locked_list->b_tprev;
+		bh = jh2bh(jh);
+		get_bh(bh);
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			wait_on_buffer(bh);
+			spin_lock(&journal->j_list_lock);
+		}
+		if (unlikely(!buffer_uptodate(bh))) {
+			if (!trylock_page(bh->b_page)) {
+				spin_unlock(&journal->j_list_lock);
+				lock_page(bh->b_page);
+				spin_lock(&journal->j_list_lock);
+			}
+			if (bh->b_page->mapping)
+				set_bit(AS_EIO, &bh->b_page->mapping->flags);
+
+			unlock_page(bh->b_page);
+			SetPageError(bh->b_page);
+			err = -EIO;
+		}
+		if (!inverted_lock(journal, bh)) {
+			put_bh(bh);
+			spin_lock(&journal->j_list_lock);
+			continue;
+		}
+		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
+		    jh->b_transaction == commit_transaction &&
+		    jh->b_jlist == BJ_Locked)
+			__journal_unfile_buffer(jh);
+		jbd_unlock_bh_state(bh);
+		release_data_buffer(bh);
+		cond_resched_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	if (err) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_WARNING
+			"JBD: Detected IO errors while flushing file data "
+			"on %s\n", bdevname(journal->j_fs_dev, b));
+		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+			journal_abort(journal, err);
+		err = 0;
+	}
+
+	blk_start_plug(&plug);
+
+	journal_write_revoke_records(journal, commit_transaction, write_op);
+
+	/*
+	 * If we found any dirty or locked buffers, then we should have
+	 * looped back up to the write_out_data label.  If there weren't
+	 * any then journal_clean_data_list should have wiped the list
+	 * clean by now, so check that it is in fact empty.
+	 */
+	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+
+	jbd_debug (3, "JBD: commit phase 3\n");
+
+	/*
+	 * Way to go: we have now written out all of the data for a
+	 * transaction!  Now comes the tricky part: we need to write out
+	 * metadata.  Loop over the transaction's entire buffer list:
+	 */
+	spin_lock(&journal->j_state_lock);
+	commit_transaction->t_state = T_COMMIT;
+	spin_unlock(&journal->j_state_lock);
+
+	trace_jbd_commit_logging(journal, commit_transaction);
+	J_ASSERT(commit_transaction->t_nr_buffers <=
+		 commit_transaction->t_outstanding_credits);
+
+	descriptor = NULL;
+	bufs = 0;
+	while (commit_transaction->t_buffers) {
+
+		/* Find the next buffer to be journaled... */
+
+		jh = commit_transaction->t_buffers;
+
+		/* If we're in abort mode, we just un-journal the buffer and
+		   release it. */
+
+		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
+			JBUFFER_TRACE(jh, "journal is aborting: refile");
+			journal_refile_buffer(journal, jh);
+			/* If that was the last one, we need to clean up
+			 * any descriptor buffers which may have been
+			 * already allocated, even if we are now
+			 * aborting. */
+			if (!commit_transaction->t_buffers)
+				goto start_journal_io;
+			continue;
+		}
+
+		/* Make sure we have a descriptor block in which to
+		   record the metadata buffer. */
+
+		if (!descriptor) {
+			struct buffer_head *bh;
+
+			J_ASSERT (bufs == 0);
+
+			jbd_debug(4, "JBD: get descriptor\n");
+
+			descriptor = journal_get_descriptor_buffer(journal);
+			if (!descriptor) {
+				journal_abort(journal, -EIO);
+				continue;
+			}
+
+			bh = jh2bh(descriptor);
+			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
+				(unsigned long long)bh->b_blocknr, bh->b_data);
+			header = (journal_header_t *)&bh->b_data[0];
+			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
+			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
+			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
+
+			tagp = &bh->b_data[sizeof(journal_header_t)];
+			space_left = bh->b_size - sizeof(journal_header_t);
+			first_tag = 1;
+			set_buffer_jwrite(bh);
+			set_buffer_dirty(bh);
+			wbuf[bufs++] = bh;
+
+			/* Record it so that we can wait for IO
+                           completion later */
+			BUFFER_TRACE(bh, "ph3: file as descriptor");
+			journal_file_buffer(descriptor, commit_transaction,
+					BJ_LogCtl);
+		}
+
+		/* Where is the buffer to be written? */
+
+		err = journal_next_log_block(journal, &blocknr);
+		/* If the block mapping failed, just abandon the buffer
+		   and repeat this loop: we'll fall into the
+		   refile-on-abort condition above. */
+		if (err) {
+			journal_abort(journal, err);
+			continue;
+		}
+
+		/*
+		 * start_this_handle() uses t_outstanding_credits to determine
+		 * the free space in the log, but this counter is changed
+		 * by journal_next_log_block() also.
+		 */
+		commit_transaction->t_outstanding_credits--;
+
+		/* Bump b_count to prevent truncate from stumbling over
+                   the shadowed buffer!  @@@ This can go if we ever get
+                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+		get_bh(jh2bh(jh));
+
+		/* Make a temporary IO buffer with which to write it out
+                   (this will requeue both the metadata buffer and the
+                   temporary IO buffer). new_bh goes on BJ_IO*/
+
+		set_buffer_jwrite(jh2bh(jh));
+		/*
+		 * akpm: journal_write_metadata_buffer() sets
+		 * new_bh->b_transaction to commit_transaction.
+		 * We need to clean this up before we release new_bh
+		 * (which is of type BJ_IO)
+		 */
+		JBUFFER_TRACE(jh, "ph3: write metadata");
+		flags = journal_write_metadata_buffer(commit_transaction,
+						      jh, &new_jh, blocknr);
+		set_buffer_jwrite(jh2bh(new_jh));
+		wbuf[bufs++] = jh2bh(new_jh);
+
+		/* Record the new block's tag in the current descriptor
+                   buffer */
+
+		tag_flag = 0;
+		if (flags & 1)
+			tag_flag |= JFS_FLAG_ESCAPE;
+		if (!first_tag)
+			tag_flag |= JFS_FLAG_SAME_UUID;
+
+		tag = (journal_block_tag_t *) tagp;
+		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
+		tag->t_flags = cpu_to_be32(tag_flag);
+		tagp += sizeof(journal_block_tag_t);
+		space_left -= sizeof(journal_block_tag_t);
+
+		if (first_tag) {
+			memcpy (tagp, journal->j_uuid, 16);
+			tagp += 16;
+			space_left -= 16;
+			first_tag = 0;
+		}
+
+		/* If there's no more to do, or if the descriptor is full,
+		   let the IO rip! */
+
+		if (bufs == journal->j_wbufsize ||
+		    commit_transaction->t_buffers == NULL ||
+		    space_left < sizeof(journal_block_tag_t) + 16) {
+
+			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+
+			/* Write an end-of-descriptor marker before
+                           submitting the IOs.  "tag" still points to
+                           the last tag we set up. */
+
+			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
+
+start_journal_io:
+			for (i = 0; i < bufs; i++) {
+				struct buffer_head *bh = wbuf[i];
+				lock_buffer(bh);
+				clear_buffer_dirty(bh);
+				set_buffer_uptodate(bh);
+				bh->b_end_io = journal_end_buffer_io_sync;
+				/*
+				 * In data=journal mode, here we can end up
+				 * writing pagecache data that might be
+				 * mmapped. Since we can't afford to clean the
+				 * page and set PageWriteback (see the comment
+				 * near the other use of _submit_bh()), the
+				 * data can change while the write is in
+				 * flight.  Tell the block layer to bounce the
+				 * bio pages if stable pages are required.
+				 */
+				_submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
+			}
+			cond_resched();
+
+			/* Force a new descriptor to be generated next
+                           time round the loop. */
+			descriptor = NULL;
+			bufs = 0;
+		}
+	}
+
+	blk_finish_plug(&plug);
+
+	/* Lo and behold: we have just managed to send a transaction to
+           the log.  Before we can commit it, wait for the IO so far to
+           complete.  Control buffers being written are on the
+           transaction's t_log_list queue, and metadata buffers are on
+           the t_iobuf_list queue.
+
+	   Wait for the buffers in reverse order.  That way we are
+	   less likely to be woken up until all IOs have completed, and
+	   so we incur less scheduling load.
+	*/
+
+	jbd_debug(3, "JBD: commit phase 4\n");
+
+	/*
+	 * akpm: these are BJ_IO, and j_list_lock is not needed.
+	 * See __journal_try_to_free_buffer.
+	 */
+wait_for_iobuf:
+	while (commit_transaction->t_iobuf_list != NULL) {
+		struct buffer_head *bh;
+
+		jh = commit_transaction->t_iobuf_list->b_tprev;
+		bh = jh2bh(jh);
+		if (buffer_locked(bh)) {
+			wait_on_buffer(bh);
+			goto wait_for_iobuf;
+		}
+		if (cond_resched())
+			goto wait_for_iobuf;
+
+		if (unlikely(!buffer_uptodate(bh)))
+			err = -EIO;
+
+		clear_buffer_jwrite(bh);
+
+		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
+		journal_unfile_buffer(journal, jh);
+
+		/*
+		 * ->t_iobuf_list should contain only dummy buffer_heads
+		 * which were created by journal_write_metadata_buffer().
+		 */
+		BUFFER_TRACE(bh, "dumping temporary bh");
+		journal_put_journal_head(jh);
+		__brelse(bh);
+		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
+		free_buffer_head(bh);
+
+		/* We also have to unlock and free the corresponding
+                   shadowed buffer */
+		jh = commit_transaction->t_shadow_list->b_tprev;
+		bh = jh2bh(jh);
+		clear_buffer_jwrite(bh);
+		J_ASSERT_BH(bh, buffer_jbddirty(bh));
+
+		/* The metadata is now released for reuse, but we need
+                   to remember it against this transaction so that when
+                   we finally commit, we can do any checkpointing
+                   required. */
+		JBUFFER_TRACE(jh, "file as BJ_Forget");
+		journal_file_buffer(jh, commit_transaction, BJ_Forget);
+		/*
+		 * Wake up any transactions which were waiting for this
+		 * IO to complete. The barrier must be here so that changes
+		 * by journal_file_buffer() take effect before wake_up_bit()
+		 * does the waitqueue check.
+		 */
+		smp_mb();
+		wake_up_bit(&bh->b_state, BH_Unshadow);
+		JBUFFER_TRACE(jh, "brelse shadowed buffer");
+		__brelse(bh);
+	}
+
+	J_ASSERT (commit_transaction->t_shadow_list == NULL);
+
+	jbd_debug(3, "JBD: commit phase 5\n");
+
+	/* Here we wait for the revoke record and descriptor record buffers */
+ wait_for_ctlbuf:
+	while (commit_transaction->t_log_list != NULL) {
+		struct buffer_head *bh;
+
+		jh = commit_transaction->t_log_list->b_tprev;
+		bh = jh2bh(jh);
+		if (buffer_locked(bh)) {
+			wait_on_buffer(bh);
+			goto wait_for_ctlbuf;
+		}
+		if (cond_resched())
+			goto wait_for_ctlbuf;
+
+		if (unlikely(!buffer_uptodate(bh)))
+			err = -EIO;
+
+		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+		clear_buffer_jwrite(bh);
+		journal_unfile_buffer(journal, jh);
+		journal_put_journal_head(jh);
+		__brelse(bh);		/* One for getblk */
+		/* AKPM: bforget here */
+	}
+
+	if (err)
+		journal_abort(journal, err);
+
+	jbd_debug(3, "JBD: commit phase 6\n");
+
+	/* All metadata is written, now write commit record and do cleanup */
+	spin_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+	commit_transaction->t_state = T_COMMIT_RECORD;
+	spin_unlock(&journal->j_state_lock);
+
+	if (journal_write_commit_record(journal, commit_transaction))
+		err = -EIO;
+
+	if (err)
+		journal_abort(journal, err);
+
+	/* End of a transaction!  Finally, we can do checkpoint
+           processing: any buffers committed as a result of this
+           transaction can be removed from any checkpoint list it was on
+           before. */
+
+	jbd_debug(3, "JBD: commit phase 7\n");
+
+	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+	J_ASSERT(commit_transaction->t_buffers == NULL);
+	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
+	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
+	J_ASSERT(commit_transaction->t_shadow_list == NULL);
+	J_ASSERT(commit_transaction->t_log_list == NULL);
+
+restart_loop:
+	/*
+	 * As there are other places (journal_unmap_buffer()) adding buffers
+	 * to this list we have to be careful and hold the j_list_lock.
+	 */
+	spin_lock(&journal->j_list_lock);
+	while (commit_transaction->t_forget) {
+		transaction_t *cp_transaction;
+		struct buffer_head *bh;
+		int try_to_free = 0;
+
+		jh = commit_transaction->t_forget;
+		spin_unlock(&journal->j_list_lock);
+		bh = jh2bh(jh);
+		/*
+		 * Get a reference so that bh cannot be freed before we are
+		 * done with it.
+		 */
+		get_bh(bh);
+		jbd_lock_bh_state(bh);
+		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
+			jh->b_transaction == journal->j_running_transaction);
+
+		/*
+		 * If there is undo-protected committed data against
+		 * this buffer, then we can remove it now.  If it is a
+		 * buffer needing such protection, the old frozen_data
+		 * field now points to a committed version of the
+		 * buffer, so rotate that field to the new committed
+		 * data.
+		 *
+		 * Otherwise, we can just throw away the frozen data now.
+		 */
+		if (jh->b_committed_data) {
+			jbd_free(jh->b_committed_data, bh->b_size);
+			jh->b_committed_data = NULL;
+			if (jh->b_frozen_data) {
+				jh->b_committed_data = jh->b_frozen_data;
+				jh->b_frozen_data = NULL;
+			}
+		} else if (jh->b_frozen_data) {
+			jbd_free(jh->b_frozen_data, bh->b_size);
+			jh->b_frozen_data = NULL;
+		}
+
+		spin_lock(&journal->j_list_lock);
+		cp_transaction = jh->b_cp_transaction;
+		if (cp_transaction) {
+			JBUFFER_TRACE(jh, "remove from old cp transaction");
+			__journal_remove_checkpoint(jh);
+		}
+
+		/* Only re-checkpoint the buffer_head if it is marked
+		 * dirty.  If the buffer was added to the BJ_Forget list
+		 * by journal_forget, it may no longer be dirty and
+		 * there's no point in keeping a checkpoint record for
+		 * it. */
+
+		/*
+		 * A buffer which has been freed while still being journaled by
+		 * a previous transaction.
+		 */
+		if (buffer_freed(bh)) {
+			/*
+			 * If the running transaction is the one containing
+			 * "add to orphan" operation (b_next_transaction !=
+			 * NULL), we have to wait for that transaction to
+			 * commit before we can really get rid of the buffer.
+			 * So just clear b_modified to not confuse transaction
+			 * credit accounting and refile the buffer to
+			 * BJ_Forget of the running transaction. If the just
+			 * committed transaction contains "add to orphan"
+			 * operation, we can completely invalidate the buffer
+			 * now. We are rather throughout in that since the
+			 * buffer may be still accessible when blocksize <
+			 * pagesize and it is attached to the last partial
+			 * page.
+			 */
+			jh->b_modified = 0;
+			if (!jh->b_next_transaction) {
+				clear_buffer_freed(bh);
+				clear_buffer_jbddirty(bh);
+				clear_buffer_mapped(bh);
+				clear_buffer_new(bh);
+				clear_buffer_req(bh);
+				bh->b_bdev = NULL;
+			}
+		}
+
+		if (buffer_jbddirty(bh)) {
+			JBUFFER_TRACE(jh, "add to new checkpointing trans");
+			__journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
+		} else {
+			J_ASSERT_BH(bh, !buffer_dirty(bh));
+			/*
+			 * The buffer on BJ_Forget list and not jbddirty means
+			 * it has been freed by this transaction and hence it
+			 * could not have been reallocated until this
+			 * transaction has committed. *BUT* it could be
+			 * reallocated once we have written all the data to
+			 * disk and before we process the buffer on BJ_Forget
+			 * list.
+			 */
+			if (!jh->b_next_transaction)
+				try_to_free = 1;
+		}
+		JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+		__journal_refile_buffer(jh);
+		jbd_unlock_bh_state(bh);
+		if (try_to_free)
+			release_buffer_page(bh);
+		else
+			__brelse(bh);
+		cond_resched_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+	/*
+	 * This is a bit sleazy.  We use j_list_lock to protect transition
+	 * of a transaction into T_FINISHED state and calling
+	 * __journal_drop_transaction(). Otherwise we could race with
+	 * other checkpointing code processing the transaction...
+	 */
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	/*
+	 * Now recheck if some buffers did not get attached to the transaction
+	 * while the lock was dropped...
+	 */
+	if (commit_transaction->t_forget) {
+		spin_unlock(&journal->j_list_lock);
+		spin_unlock(&journal->j_state_lock);
+		goto restart_loop;
+	}
+
+	/* Done with this transaction! */
+
+	jbd_debug(3, "JBD: commit phase 8\n");
+
+	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
+
+	commit_transaction->t_state = T_FINISHED;
+	J_ASSERT(commit_transaction == journal->j_committing_transaction);
+	journal->j_commit_sequence = commit_transaction->t_tid;
+	journal->j_committing_transaction = NULL;
+	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+	/*
+	 * weight the commit time higher than the average time so we don't
+	 * react too strongly to vast changes in commit time
+	 */
+	if (likely(journal->j_average_commit_time))
+		journal->j_average_commit_time = (commit_time*3 +
+				journal->j_average_commit_time) / 4;
+	else
+		journal->j_average_commit_time = commit_time;
+
+	spin_unlock(&journal->j_state_lock);
+
+	if (commit_transaction->t_checkpoint_list == NULL &&
+	    commit_transaction->t_checkpoint_io_list == NULL) {
+		__journal_drop_transaction(journal, commit_transaction);
+	} else {
+		if (journal->j_checkpoint_transactions == NULL) {
+			journal->j_checkpoint_transactions = commit_transaction;
+			commit_transaction->t_cpnext = commit_transaction;
+			commit_transaction->t_cpprev = commit_transaction;
+		} else {
+			commit_transaction->t_cpnext =
+				journal->j_checkpoint_transactions;
+			commit_transaction->t_cpprev =
+				commit_transaction->t_cpnext->t_cpprev;
+			commit_transaction->t_cpnext->t_cpprev =
+				commit_transaction;
+			commit_transaction->t_cpprev->t_cpnext =
+				commit_transaction;
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	trace_jbd_end_commit(journal, commit_transaction);
+	jbd_debug(1, "JBD: commit %d complete, head %d\n",
+		  journal->j_commit_sequence, journal->j_tail_sequence);
+
+	wake_up(&journal->j_wait_done_commit);
+}
diff --git a/kernel/fs/jbd/journal.c b/kernel/fs/jbd/journal.c
new file mode 100644
index 000000000..c46a79adb
--- /dev/null
+++ b/kernel/fs/jbd/journal.c
@@ -0,0 +1,2145 @@
+/*
+ * linux/fs/jbd/journal.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem journal-writing code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages journals: areas of disk reserved for logging
+ * transactional updates.  This includes the kernel journaling thread
+ * which is responsible for scheduling updates to the log.
+ *
+ * We do not actually manage the physical storage of the journal in this
+ * file: that is left to a per-journal policy function, which allows us
+ * to store the journal within a filesystem-specified area for ext2
+ * journaling (ext2 can use a reserved inode for storing the log).
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/freezer.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+#include <linux/poison.h>
+#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/jbd.h>
+
+#include <asm/uaccess.h>
+#include <asm/page.h>
+
+EXPORT_SYMBOL(journal_start);
+EXPORT_SYMBOL(journal_restart);
+EXPORT_SYMBOL(journal_extend);
+EXPORT_SYMBOL(journal_stop);
+EXPORT_SYMBOL(journal_lock_updates);
+EXPORT_SYMBOL(journal_unlock_updates);
+EXPORT_SYMBOL(journal_get_write_access);
+EXPORT_SYMBOL(journal_get_create_access);
+EXPORT_SYMBOL(journal_get_undo_access);
+EXPORT_SYMBOL(journal_dirty_data);
+EXPORT_SYMBOL(journal_dirty_metadata);
+EXPORT_SYMBOL(journal_release_buffer);
+EXPORT_SYMBOL(journal_forget);
+#if 0
+EXPORT_SYMBOL(journal_sync_buffer);
+#endif
+EXPORT_SYMBOL(journal_flush);
+EXPORT_SYMBOL(journal_revoke);
+
+EXPORT_SYMBOL(journal_init_dev);
+EXPORT_SYMBOL(journal_init_inode);
+EXPORT_SYMBOL(journal_update_format);
+EXPORT_SYMBOL(journal_check_used_features);
+EXPORT_SYMBOL(journal_check_available_features);
+EXPORT_SYMBOL(journal_set_features);
+EXPORT_SYMBOL(journal_create);
+EXPORT_SYMBOL(journal_load);
+EXPORT_SYMBOL(journal_destroy);
+EXPORT_SYMBOL(journal_abort);
+EXPORT_SYMBOL(journal_errno);
+EXPORT_SYMBOL(journal_ack_err);
+EXPORT_SYMBOL(journal_clear_err);
+EXPORT_SYMBOL(log_wait_commit);
+EXPORT_SYMBOL(log_start_commit);
+EXPORT_SYMBOL(journal_start_commit);
+EXPORT_SYMBOL(journal_force_commit_nested);
+EXPORT_SYMBOL(journal_wipe);
+EXPORT_SYMBOL(journal_blocks_per_page);
+EXPORT_SYMBOL(journal_invalidatepage);
+EXPORT_SYMBOL(journal_try_to_free_buffers);
+EXPORT_SYMBOL(journal_force_commit);
+
+static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+static void __journal_abort_soft (journal_t *journal, int errno);
+static const char *journal_dev_name(journal_t *journal, char *buffer);
+
+#ifdef CONFIG_JBD_DEBUG
+void __jbd_debug(int level, const char *file, const char *func,
+		 unsigned int line, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (level > journal_enable_debug)
+		return;
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+	va_end(args);
+}
+EXPORT_SYMBOL(__jbd_debug);
+#endif
+
+/*
+ * Helper function used to manage commit timeouts
+ */
+
+static void commit_timeout(unsigned long __data)
+{
+	struct task_struct * p = (struct task_struct *) __data;
+
+	wake_up_process(p);
+}
+
+/*
+ * kjournald: The main thread function used to manage a logging device
+ * journal.
+ *
+ * This kernel thread is responsible for two things:
+ *
+ * 1) COMMIT:  Every so often we need to commit the current state of the
+ *    filesystem to disk.  The journal thread is responsible for writing
+ *    all of the metadata buffers to disk.
+ *
+ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
+ *    of the data in that part of the log has been rewritten elsewhere on
+ *    the disk.  Flushing these old buffers to reclaim space in the log is
+ *    known as checkpointing, and this thread is responsible for that job.
+ */
+
+static int kjournald(void *arg)
+{
+	journal_t *journal = arg;
+	transaction_t *transaction;
+
+	/*
+	 * Set up an interval timer which can be used to trigger a commit wakeup
+	 * after the commit interval expires
+	 */
+	setup_timer(&journal->j_commit_timer, commit_timeout,
+			(unsigned long)current);
+
+	set_freezable();
+
+	/* Record that the journal thread is running */
+	journal->j_task = current;
+	wake_up(&journal->j_wait_done_commit);
+
+	printk(KERN_INFO "kjournald starting.  Commit interval %ld seconds\n",
+			journal->j_commit_interval / HZ);
+
+	/*
+	 * And now, wait forever for commit wakeup events.
+	 */
+	spin_lock(&journal->j_state_lock);
+
+loop:
+	if (journal->j_flags & JFS_UNMOUNT)
+		goto end_loop;
+
+	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+		journal->j_commit_sequence, journal->j_commit_request);
+
+	if (journal->j_commit_sequence != journal->j_commit_request) {
+		jbd_debug(1, "OK, requests differ\n");
+		spin_unlock(&journal->j_state_lock);
+		del_timer_sync(&journal->j_commit_timer);
+		journal_commit_transaction(journal);
+		spin_lock(&journal->j_state_lock);
+		goto loop;
+	}
+
+	wake_up(&journal->j_wait_done_commit);
+	if (freezing(current)) {
+		/*
+		 * The simpler the better. Flushing journal isn't a
+		 * good idea, because that depends on threads that may
+		 * be already stopped.
+		 */
+		jbd_debug(1, "Now suspending kjournald\n");
+		spin_unlock(&journal->j_state_lock);
+		try_to_freeze();
+		spin_lock(&journal->j_state_lock);
+	} else {
+		/*
+		 * We assume on resume that commits are already there,
+		 * so we don't sleep
+		 */
+		DEFINE_WAIT(wait);
+		int should_sleep = 1;
+
+		prepare_to_wait(&journal->j_wait_commit, &wait,
+				TASK_INTERRUPTIBLE);
+		if (journal->j_commit_sequence != journal->j_commit_request)
+			should_sleep = 0;
+		transaction = journal->j_running_transaction;
+		if (transaction && time_after_eq(jiffies,
+						transaction->t_expires))
+			should_sleep = 0;
+		if (journal->j_flags & JFS_UNMOUNT)
+			should_sleep = 0;
+		if (should_sleep) {
+			spin_unlock(&journal->j_state_lock);
+			schedule();
+			spin_lock(&journal->j_state_lock);
+		}
+		finish_wait(&journal->j_wait_commit, &wait);
+	}
+
+	jbd_debug(1, "kjournald wakes\n");
+
+	/*
+	 * Were we woken up by a commit wakeup event?
+	 */
+	transaction = journal->j_running_transaction;
+	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
+		journal->j_commit_request = transaction->t_tid;
+		jbd_debug(1, "woke because of timeout\n");
+	}
+	goto loop;
+
+end_loop:
+	spin_unlock(&journal->j_state_lock);
+	del_timer_sync(&journal->j_commit_timer);
+	journal->j_task = NULL;
+	wake_up(&journal->j_wait_done_commit);
+	jbd_debug(1, "Journal thread exiting.\n");
+	return 0;
+}
+
+static int journal_start_thread(journal_t *journal)
+{
+	struct task_struct *t;
+
+	t = kthread_run(kjournald, journal, "kjournald");
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
+	return 0;
+}
+
+static void journal_kill_thread(journal_t *journal)
+{
+	spin_lock(&journal->j_state_lock);
+	journal->j_flags |= JFS_UNMOUNT;
+
+	while (journal->j_task) {
+		wake_up(&journal->j_wait_commit);
+		spin_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_done_commit,
+				journal->j_task == NULL);
+		spin_lock(&journal->j_state_lock);
+	}
+	spin_unlock(&journal->j_state_lock);
+}
+
+/*
+ * journal_write_metadata_buffer: write a metadata buffer to the journal.
+ *
+ * Writes a metadata buffer to a given disk block.  The actual IO is not
+ * performed but a new buffer_head is constructed which labels the data
+ * to be written with the correct destination disk block.
+ *
+ * Any magic-number escaping which needs to be done will cause a
+ * copy-out here.  If the buffer happens to start with the
+ * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
+ * magic number is only written to the log for descripter blocks.  In
+ * this case, we copy the data and replace the first word with 0, and we
+ * return a result code which indicates that this buffer needs to be
+ * marked as an escaped buffer in the corresponding log descriptor
+ * block.  The missing word can then be restored when the block is read
+ * during recovery.
+ *
+ * If the source buffer has already been modified by a new transaction
+ * since we took the last commit snapshot, we use the frozen copy of
+ * that data for IO.  If we end up using the existing buffer_head's data
+ * for the write, then we *have* to lock the buffer to prevent anyone
+ * else from using and possibly modifying it while the IO is in
+ * progress.
+ *
+ * The function returns a pointer to the buffer_heads to be used for IO.
+ *
+ * We assume that the journal has already been locked in this function.
+ *
+ * Return value:
+ *  <0: Error
+ * >=0: Finished OK
+ *
+ * On success:
+ * Bit 0 set == escape performed on the data
+ * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+ */
+
+int journal_write_metadata_buffer(transaction_t *transaction,
+				  struct journal_head  *jh_in,
+				  struct journal_head **jh_out,
+				  unsigned int blocknr)
+{
+	int need_copy_out = 0;
+	int done_copy_out = 0;
+	int do_escape = 0;
+	char *mapped_data;
+	struct buffer_head *new_bh;
+	struct journal_head *new_jh;
+	struct page *new_page;
+	unsigned int new_offset;
+	struct buffer_head *bh_in = jh2bh(jh_in);
+	journal_t *journal = transaction->t_journal;
+
+	/*
+	 * The buffer really shouldn't be locked: only the current committing
+	 * transaction is allowed to write it, so nobody else is allowed
+	 * to do any IO.
+	 *
+	 * akpm: except if we're journalling data, and write() output is
+	 * also part of a shared mapping, and another thread has
+	 * decided to launch a writepage() against this buffer.
+	 */
+	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
+
+	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+	/* keep subsequent assertions sane */
+	atomic_set(&new_bh->b_count, 1);
+	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */
+
+	/*
+	 * If a new transaction has already done a buffer copy-out, then
+	 * we use that version of the data for the commit.
+	 */
+	jbd_lock_bh_state(bh_in);
+repeat:
+	if (jh_in->b_frozen_data) {
+		done_copy_out = 1;
+		new_page = virt_to_page(jh_in->b_frozen_data);
+		new_offset = offset_in_page(jh_in->b_frozen_data);
+	} else {
+		new_page = jh2bh(jh_in)->b_page;
+		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+	}
+
+	mapped_data = kmap_atomic(new_page);
+	/*
+	 * Check for escaping
+	 */
+	if (*((__be32 *)(mapped_data + new_offset)) ==
+				cpu_to_be32(JFS_MAGIC_NUMBER)) {
+		need_copy_out = 1;
+		do_escape = 1;
+	}
+	kunmap_atomic(mapped_data);
+
+	/*
+	 * Do we need to do a data copy?
+	 */
+	if (need_copy_out && !done_copy_out) {
+		char *tmp;
+
+		jbd_unlock_bh_state(bh_in);
+		tmp = jbd_alloc(bh_in->b_size, GFP_NOFS);
+		jbd_lock_bh_state(bh_in);
+		if (jh_in->b_frozen_data) {
+			jbd_free(tmp, bh_in->b_size);
+			goto repeat;
+		}
+
+		jh_in->b_frozen_data = tmp;
+		mapped_data = kmap_atomic(new_page);
+		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+		kunmap_atomic(mapped_data);
+
+		new_page = virt_to_page(tmp);
+		new_offset = offset_in_page(tmp);
+		done_copy_out = 1;
+	}
+
+	/*
+	 * Did we need to do an escaping?  Now we've done all the
+	 * copying, we can finally do so.
+	 */
+	if (do_escape) {
+		mapped_data = kmap_atomic(new_page);
+		*((unsigned int *)(mapped_data + new_offset)) = 0;
+		kunmap_atomic(mapped_data);
+	}
+
+	set_bh_page(new_bh, new_page, new_offset);
+	new_jh->b_transaction = NULL;
+	new_bh->b_size = jh2bh(jh_in)->b_size;
+	new_bh->b_bdev = transaction->t_journal->j_dev;
+	new_bh->b_blocknr = blocknr;
+	set_buffer_mapped(new_bh);
+	set_buffer_dirty(new_bh);
+
+	*jh_out = new_jh;
+
+	/*
+	 * The to-be-written buffer needs to get moved to the io queue,
+	 * and the original buffer whose contents we are shadowing or
+	 * copying is moved to the transaction's shadow queue.
+	 */
+	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
+	spin_lock(&journal->j_list_lock);
+	__journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh_in);
+
+	JBUFFER_TRACE(new_jh, "file as BJ_IO");
+	journal_file_buffer(new_jh, transaction, BJ_IO);
+
+	return do_escape | (done_copy_out << 1);
+}
+
+/*
+ * Allocation code for the journal file.  Manage the space left in the
+ * journal, so that we can begin checkpointing when appropriate.
+ */
+
+/*
+ * __log_space_left: Return the number of free blocks left in the journal.
+ *
+ * Called with the journal already locked.
+ *
+ * Called under j_state_lock
+ */
+
+int __log_space_left(journal_t *journal)
+{
+	int left = journal->j_free;
+
+	assert_spin_locked(&journal->j_state_lock);
+
+	/*
+	 * Be pessimistic here about the number of those free blocks which
+	 * might be required for log descriptor control blocks.
+	 */
+
+#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
+
+	left -= MIN_LOG_RESERVED_BLOCKS;
+
+	if (left <= 0)
+		return 0;
+	left -= (left >> 3);
+	return left;
+}
+
+/*
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
+ */
+int __log_start_commit(journal_t *journal, tid_t target)
+{
+	/*
+	 * The only transaction we can possibly wait upon is the
+	 * currently running transaction (if it exists).  Otherwise,
+	 * the target tid must be an old one.
+	 */
+	if (journal->j_commit_request != target &&
+	    journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == target) {
+		/*
+		 * We want a new commit: OK, mark the request and wakeup the
+		 * commit thread.  We do _not_ do the commit ourselves.
+		 */
+
+		journal->j_commit_request = target;
+		jbd_debug(1, "JBD: requesting commit %d/%d\n",
+			  journal->j_commit_request,
+			  journal->j_commit_sequence);
+		wake_up(&journal->j_wait_commit);
+		return 1;
+	} else if (!tid_geq(journal->j_commit_request, target))
+		/* This should never happen, but if it does, preserve
+		   the evidence before kjournald goes into a loop and
+		   increments j_commit_sequence beyond all recognition. */
+		WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+		    journal->j_commit_request, journal->j_commit_sequence,
+		    target, journal->j_running_transaction ?
+		    journal->j_running_transaction->t_tid : 0);
+	return 0;
+}
+
+int log_start_commit(journal_t *journal, tid_t tid)
+{
+	int ret;
+
+	spin_lock(&journal->j_state_lock);
+	ret = __log_start_commit(journal, tid);
+	spin_unlock(&journal->j_state_lock);
+	return ret;
+}
+
+/*
+ * Force and wait upon a commit if the calling process is not within
+ * transaction.  This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * We can only force the running transaction if we don't have an active handle;
+ * otherwise, we will deadlock.
+ *
+ * Returns true if a transaction was started.
+ */
+int journal_force_commit_nested(journal_t *journal)
+{
+	transaction_t *transaction = NULL;
+	tid_t tid;
+
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction && !current->journal_info) {
+		transaction = journal->j_running_transaction;
+		__log_start_commit(journal, transaction->t_tid);
+	} else if (journal->j_committing_transaction)
+		transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return 0;	/* Nothing to retry */
+	}
+
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	log_wait_commit(journal, tid);
+	return 1;
+}
+
+/*
+ * Start a commit of the current running transaction (if any).  Returns true
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
+ */
+int journal_start_commit(journal_t *journal, tid_t *ptid)
+{
+	int ret = 0;
+
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction) {
+		tid_t tid = journal->j_running_transaction->t_tid;
+
+		__log_start_commit(journal, tid);
+		/* There's a running transaction and we've just made sure
+		 * it's commit has been scheduled. */
+		if (ptid)
+			*ptid = tid;
+		ret = 1;
+	} else if (journal->j_committing_transaction) {
+		/*
+		 * If commit has been started, then we have to wait for
+		 * completion of that transaction.
+		 */
+		if (ptid)
+			*ptid = journal->j_committing_transaction->t_tid;
+		ret = 1;
+	}
+	spin_unlock(&journal->j_state_lock);
+	return ret;
+}
+
+/*
+ * Wait for a specified commit to complete.
+ * The caller may not hold the journal lock.
+ */
+int log_wait_commit(journal_t *journal, tid_t tid)
+{
+	int err = 0;
+
+#ifdef CONFIG_JBD_DEBUG
+	spin_lock(&journal->j_state_lock);
+	if (!tid_geq(journal->j_commit_request, tid)) {
+		printk(KERN_ERR
+		       "%s: error: j_commit_request=%d, tid=%d\n",
+		       __func__, journal->j_commit_request, tid);
+	}
+	spin_unlock(&journal->j_state_lock);
+#endif
+	spin_lock(&journal->j_state_lock);
+	/*
+	 * Not running or committing trans? Must be already committed. This
+	 * saves us from waiting for a *long* time when tid overflows.
+	 */
+	if (!((journal->j_running_transaction &&
+	       journal->j_running_transaction->t_tid == tid) ||
+	      (journal->j_committing_transaction &&
+	       journal->j_committing_transaction->t_tid == tid)))
+		goto out_unlock;
+
+	if (!tid_geq(journal->j_commit_waited, tid))
+		journal->j_commit_waited = tid;
+	while (tid_gt(tid, journal->j_commit_sequence)) {
+		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+				  tid, journal->j_commit_sequence);
+		wake_up(&journal->j_wait_commit);
+		spin_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_done_commit,
+				!tid_gt(tid, journal->j_commit_sequence));
+		spin_lock(&journal->j_state_lock);
+	}
+out_unlock:
+	spin_unlock(&journal->j_state_lock);
+
+	if (unlikely(is_journal_aborted(journal)))
+		err = -EIO;
+	return err;
+}
+
+/*
+ * Return 1 if a given transaction has not yet sent barrier request
+ * connected with a transaction commit. If 0 is returned, transaction
+ * may or may not have sent the barrier. Used to avoid sending barrier
+ * twice in common cases.
+ */
+int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+	int ret = 0;
+	transaction_t *commit_trans;
+
+	if (!(journal->j_flags & JFS_BARRIER))
+		return 0;
+	spin_lock(&journal->j_state_lock);
+	/* Transaction already committed? */
+	if (tid_geq(journal->j_commit_sequence, tid))
+		goto out;
+	/*
+	 * Transaction is being committed and we already proceeded to
+	 * writing commit record?
+	 */
+	commit_trans = journal->j_committing_transaction;
+	if (commit_trans && commit_trans->t_tid == tid &&
+	    commit_trans->t_state >= T_COMMIT_RECORD)
+		goto out;
+	ret = 1;
+out:
+	spin_unlock(&journal->j_state_lock);
+	return ret;
+}
+EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
+
+/*
+ * Log buffer allocation routines:
+ */
+
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
+{
+	unsigned int blocknr;
+
+	spin_lock(&journal->j_state_lock);
+	J_ASSERT(journal->j_free > 1);
+
+	blocknr = journal->j_head;
+	journal->j_head++;
+	journal->j_free--;
+	if (journal->j_head == journal->j_last)
+		journal->j_head = journal->j_first;
+	spin_unlock(&journal->j_state_lock);
+	return journal_bmap(journal, blocknr, retp);
+}
+
+/*
+ * Conversion of logical to physical block numbers for the journal
+ *
+ * On external journals the journal blocks are identity-mapped, so
+ * this is a no-op.  If needed, we can use j_blk_offset - everything is
+ * ready.
+ */
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+		 unsigned int *retp)
+{
+	int err = 0;
+	unsigned int ret;
+
+	if (journal->j_inode) {
+		ret = bmap(journal->j_inode, blocknr);
+		if (ret)
+			*retp = ret;
+		else {
+			char b[BDEVNAME_SIZE];
+
+			printk(KERN_ALERT "%s: journal block not found "
+					"at offset %u on %s\n",
+				__func__,
+				blocknr,
+				bdevname(journal->j_dev, b));
+			err = -EIO;
+			__journal_abort_soft(journal, err);
+		}
+	} else {
+		*retp = blocknr; /* +journal->j_blk_offset */
+	}
+	return err;
+}
+
+/*
+ * We play buffer_head aliasing tricks to write data/metadata blocks to
+ * the journal without copying their contents, but for journal
+ * descriptor blocks we do need to generate bona fide buffers.
+ *
+ * After the caller of journal_get_descriptor_buffer() has finished modifying
+ * the buffer's contents they really should run flush_dcache_page(bh->b_page).
+ * But we don't bother doing that, so there will be coherency problems with
+ * mmaps of blockdevs which hold live JBD-controlled filesystems.
+ */
+struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
+{
+	struct buffer_head *bh;
+	unsigned int blocknr;
+	int err;
+
+	err = journal_next_log_block(journal, &blocknr);
+
+	if (err)
+		return NULL;
+
+	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return NULL;
+	lock_buffer(bh);
+	memset(bh->b_data, 0, journal->j_blocksize);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	BUFFER_TRACE(bh, "return this buffer");
+	return journal_add_journal_head(bh);
+}
+
+/*
+ * Management for journal control blocks: functions to create and
+ * destroy journal_t structures, and to initialise and read existing
+ * journal blocks from disk.  */
+
+/* First: create and setup a journal_t object in memory.  We initialise
+ * very few fields yet: that has to wait until we have created the
+ * journal structures from from scratch, or loaded them from disk. */
+
+static journal_t * journal_init_common (void)
+{
+	journal_t *journal;
+	int err;
+
+	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
+	if (!journal)
+		goto fail;
+
+	init_waitqueue_head(&journal->j_wait_transaction_locked);
+	init_waitqueue_head(&journal->j_wait_logspace);
+	init_waitqueue_head(&journal->j_wait_done_commit);
+	init_waitqueue_head(&journal->j_wait_checkpoint);
+	init_waitqueue_head(&journal->j_wait_commit);
+	init_waitqueue_head(&journal->j_wait_updates);
+	mutex_init(&journal->j_checkpoint_mutex);
+	spin_lock_init(&journal->j_revoke_lock);
+	spin_lock_init(&journal->j_list_lock);
+	spin_lock_init(&journal->j_state_lock);
+
+	journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
+
+	/* The journal is marked for error until we succeed with recovery! */
+	journal->j_flags = JFS_ABORT;
+
+	/* Set up a default-sized revoke table for the new mount. */
+	err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+	if (err) {
+		kfree(journal);
+		goto fail;
+	}
+	return journal;
+fail:
+	return NULL;
+}
+
+/* journal_init_dev and journal_init_inode:
+ *
+ * Create a journal structure assigned some fixed set of disk blocks to
+ * the journal.  We don't actually touch those disk blocks yet, but we
+ * need to set up all of the mapping information to tell the journaling
+ * system where the journal blocks are.
+ *
+ */
+
+/**
+ *  journal_t * journal_init_dev() - creates and initialises a journal structure
+ *  @bdev: Block device on which to create the journal
+ *  @fs_dev: Device which hold journalled filesystem for this journal.
+ *  @start: Block nr Start of journal.
+ *  @len:  Length of the journal in blocks.
+ *  @blocksize: blocksize of journalling device
+ *
+ *  Returns: a newly created journal_t *
+ *
+ *  journal_init_dev creates a journal which maps a fixed contiguous
+ *  range of blocks on an arbitrary block device.
+ *
+ */
+journal_t * journal_init_dev(struct block_device *bdev,
+			struct block_device *fs_dev,
+			int start, int len, int blocksize)
+{
+	journal_t *journal = journal_init_common();
+	struct buffer_head *bh;
+	int n;
+
+	if (!journal)
+		return NULL;
+
+	/* journal descriptor can store up to n blocks -bzzz */
+	journal->j_blocksize = blocksize;
+	n = journal->j_blocksize / sizeof(journal_block_tag_t);
+	journal->j_wbufsize = n;
+	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
+	if (!journal->j_wbuf) {
+		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
+			__func__);
+		goto out_err;
+	}
+	journal->j_dev = bdev;
+	journal->j_fs_dev = fs_dev;
+	journal->j_blk_offset = start;
+	journal->j_maxlen = len;
+
+	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
+	journal->j_sb_buffer = bh;
+	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+	return journal;
+out_err:
+	kfree(journal->j_wbuf);
+	kfree(journal);
+	return NULL;
+}
+
+/**
+ *  journal_t * journal_init_inode () - creates a journal which maps to a inode.
+ *  @inode: An inode to create the journal in
+ *
+ * journal_init_inode creates a journal which maps an on-disk inode as
+ * the journal.  The inode must exist already, must support bmap() and
+ * must have all data blocks preallocated.
+ */
+journal_t * journal_init_inode (struct inode *inode)
+{
+	struct buffer_head *bh;
+	journal_t *journal = journal_init_common();
+	int err;
+	int n;
+	unsigned int blocknr;
+
+	if (!journal)
+		return NULL;
+
+	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
+	journal->j_inode = inode;
+	jbd_debug(1,
+		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+		  journal, inode->i_sb->s_id, inode->i_ino,
+		  (long long) inode->i_size,
+		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+
+	journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
+	journal->j_blocksize = inode->i_sb->s_blocksize;
+
+	/* journal descriptor can store up to n blocks -bzzz */
+	n = journal->j_blocksize / sizeof(journal_block_tag_t);
+	journal->j_wbufsize = n;
+	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
+	if (!journal->j_wbuf) {
+		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
+			__func__);
+		goto out_err;
+	}
+
+	err = journal_bmap(journal, 0, &blocknr);
+	/* If that failed, give up */
+	if (err) {
+		printk(KERN_ERR "%s: Cannot locate journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
+
+	bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
+	journal->j_sb_buffer = bh;
+	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+	return journal;
+out_err:
+	kfree(journal->j_wbuf);
+	kfree(journal);
+	return NULL;
+}
+
+/*
+ * If the journal init or create aborts, we need to mark the journal
+ * superblock as being NULL to prevent the journal destroy from writing
+ * back a bogus superblock.
+ */
+static void journal_fail_superblock (journal_t *journal)
+{
+	struct buffer_head *bh = journal->j_sb_buffer;
+	brelse(bh);
+	journal->j_sb_buffer = NULL;
+}
+
+/*
+ * Given a journal_t structure, initialise the various fields for
+ * startup of a new journaling session.  We use this both when creating
+ * a journal, and after recovering an old journal to reset it for
+ * subsequent use.
+ */
+
+static int journal_reset(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+	unsigned int first, last;
+
+	first = be32_to_cpu(sb->s_first);
+	last = be32_to_cpu(sb->s_maxlen);
+	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
+		printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
+		       first, last);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
+
+	journal->j_first = first;
+	journal->j_last = last;
+
+	journal->j_head = first;
+	journal->j_tail = first;
+	journal->j_free = last - first;
+
+	journal->j_tail_sequence = journal->j_transaction_sequence;
+	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+	journal->j_commit_request = journal->j_commit_sequence;
+
+	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+
+	/*
+	 * As a special case, if the on-disk copy is already marked as needing
+	 * no recovery (s_start == 0), then we can safely defer the superblock
+	 * update until the next commit by setting JFS_FLUSHED.  This avoids
+	 * attempting a write to a potential-readonly device.
+	 */
+	if (sb->s_start == 0) {
+		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
+			"(start %u, seq %d, errno %d)\n",
+			journal->j_tail, journal->j_tail_sequence,
+			journal->j_errno);
+		journal->j_flags |= JFS_FLUSHED;
+	} else {
+		/* Lock here to make assertions happy... */
+		mutex_lock(&journal->j_checkpoint_mutex);
+		/*
+		 * Update log tail information. We use WRITE_FUA since new
+		 * transaction will start reusing journal space and so we
+		 * must make sure information about current log tail is on
+		 * disk before that.
+		 */
+		journal_update_sb_log_tail(journal,
+					   journal->j_tail_sequence,
+					   journal->j_tail,
+					   WRITE_FUA);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	}
+	return journal_start_thread(journal);
+}
+
+/**
+ * int journal_create() - Initialise the new journal file
+ * @journal: Journal to create. This structure must have been initialised
+ *
+ * Given a journal_t structure which tells us which disk blocks we can
+ * use, create a new journal superblock and initialise all of the
+ * journal fields from scratch.
+ **/
+int journal_create(journal_t *journal)
+{
+	unsigned int blocknr;
+	struct buffer_head *bh;
+	journal_superblock_t *sb;
+	int i, err;
+
+	if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
+		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
+			journal->j_maxlen);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
+
+	if (journal->j_inode == NULL) {
+		/*
+		 * We don't know what block to start at!
+		 */
+		printk(KERN_EMERG
+		       "%s: creation of journal on external device!\n",
+		       __func__);
+		BUG();
+	}
+
+	/* Zero out the entire journal on disk.  We cannot afford to
+	   have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
+	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
+	for (i = 0; i < journal->j_maxlen; i++) {
+		err = journal_bmap(journal, i, &blocknr);
+		if (err)
+			return err;
+		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+		if (unlikely(!bh))
+			return -ENOMEM;
+		lock_buffer(bh);
+		memset (bh->b_data, 0, journal->j_blocksize);
+		BUFFER_TRACE(bh, "marking dirty");
+		mark_buffer_dirty(bh);
+		BUFFER_TRACE(bh, "marking uptodate");
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		__brelse(bh);
+	}
+
+	sync_blockdev(journal->j_dev);
+	jbd_debug(1, "JBD: journal cleared.\n");
+
+	/* OK, fill in the initial static fields in the new superblock */
+	sb = journal->j_superblock;
+
+	sb->s_header.h_magic	 = cpu_to_be32(JFS_MAGIC_NUMBER);
+	sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
+
+	sb->s_blocksize	= cpu_to_be32(journal->j_blocksize);
+	sb->s_maxlen	= cpu_to_be32(journal->j_maxlen);
+	sb->s_first	= cpu_to_be32(1);
+
+	journal->j_transaction_sequence = 1;
+
+	journal->j_flags &= ~JFS_ABORT;
+	journal->j_format_version = 2;
+
+	return journal_reset(journal);
+}
+
+static void journal_write_superblock(journal_t *journal, int write_op)
+{
+	struct buffer_head *bh = journal->j_sb_buffer;
+	int ret;
+
+	trace_journal_write_superblock(journal, write_op);
+	if (!(journal->j_flags & JFS_BARRIER))
+		write_op &= ~(REQ_FUA | REQ_FLUSH);
+	lock_buffer(bh);
+	if (buffer_write_io_error(bh)) {
+		char b[BDEVNAME_SIZE];
+		/*
+		 * Oh, dear.  A previous attempt to write the journal
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "JBD: previous I/O error detected "
+		       "for journal superblock update for %s.\n",
+		       journal_dev_name(journal, b));
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+	}
+
+	get_bh(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	ret = submit_bh(write_op, bh);
+	wait_on_buffer(bh);
+	if (buffer_write_io_error(bh)) {
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+		ret = -EIO;
+	}
+	if (ret) {
+		char b[BDEVNAME_SIZE];
+		printk(KERN_ERR "JBD: Error %d detected "
+		       "when updating journal superblock for %s.\n",
+		       ret, journal_dev_name(journal, b));
+	}
+}
+
+/**
+ * journal_update_sb_log_tail() - Update log tail in journal sb on disk.
+ * @journal: The journal to update.
+ * @tail_tid: TID of the new transaction at the tail of the log
+ * @tail_block: The first block of the transaction at the tail of the log
+ * @write_op: With which operation should we write the journal sb
+ *
+ * Update a journal's superblock information about log tail and write it to
+ * disk, waiting for the IO to complete.
+ */
+void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+				unsigned int tail_block, int write_op)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+	jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n",
+		  tail_block, tail_tid);
+
+	sb->s_sequence = cpu_to_be32(tail_tid);
+	sb->s_start    = cpu_to_be32(tail_block);
+
+	journal_write_superblock(journal, write_op);
+
+	/* Log is no longer empty */
+	spin_lock(&journal->j_state_lock);
+	WARN_ON(!sb->s_sequence);
+	journal->j_flags &= ~JFS_FLUSHED;
+	spin_unlock(&journal->j_state_lock);
+}
+
+/**
+ * mark_journal_empty() - Mark on disk journal as empty.
+ * @journal: The journal to update.
+ *
+ * Update a journal's dynamic superblock fields to show that journal is empty.
+ * Write updated superblock to disk waiting for IO to complete.
+ */
+static void mark_journal_empty(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+	spin_lock(&journal->j_state_lock);
+	/* Is it already empty? */
+	if (sb->s_start == 0) {
+		spin_unlock(&journal->j_state_lock);
+		return;
+	}
+	jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
+        	  journal->j_tail_sequence);
+
+	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
+	sb->s_start    = cpu_to_be32(0);
+	spin_unlock(&journal->j_state_lock);
+
+	journal_write_superblock(journal, WRITE_FUA);
+
+	spin_lock(&journal->j_state_lock);
+	/* Log is empty */
+	journal->j_flags |= JFS_FLUSHED;
+	spin_unlock(&journal->j_state_lock);
+}
+
+/**
+ * journal_update_sb_errno() - Update error in the journal.
+ * @journal: The journal to update.
+ *
+ * Update a journal's errno.  Write updated superblock to disk waiting for IO
+ * to complete.
+ */
+static void journal_update_sb_errno(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	spin_lock(&journal->j_state_lock);
+	jbd_debug(1, "JBD: updating superblock error (errno %d)\n",
+        	  journal->j_errno);
+	sb->s_errno = cpu_to_be32(journal->j_errno);
+	spin_unlock(&journal->j_state_lock);
+
+	journal_write_superblock(journal, WRITE_SYNC);
+}
+
+/*
+ * Read the superblock for a given journal, performing initial
+ * validation of the format.
+ */
+
+static int journal_get_superblock(journal_t *journal)
+{
+	struct buffer_head *bh;
+	journal_superblock_t *sb;
+	int err = -EIO;
+
+	bh = journal->j_sb_buffer;
+
+	J_ASSERT(bh != NULL);
+	if (!buffer_uptodate(bh)) {
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			printk (KERN_ERR
+				"JBD: IO error reading journal superblock\n");
+			goto out;
+		}
+	}
+
+	sb = journal->j_superblock;
+
+	err = -EINVAL;
+
+	if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
+	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
+		printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+		goto out;
+	}
+
+	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
+	case JFS_SUPERBLOCK_V1:
+		journal->j_format_version = 1;
+		break;
+	case JFS_SUPERBLOCK_V2:
+		journal->j_format_version = 2;
+		break;
+	default:
+		printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+		goto out;
+	}
+
+	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
+		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
+	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
+		printk (KERN_WARNING "JBD: journal file too short\n");
+		goto out;
+	}
+
+	if (be32_to_cpu(sb->s_first) == 0 ||
+	    be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
+		printk(KERN_WARNING
+			"JBD: Invalid start block of journal: %u\n",
+			be32_to_cpu(sb->s_first));
+		goto out;
+	}
+
+	return 0;
+
+out:
+	journal_fail_superblock(journal);
+	return err;
+}
+
+/*
+ * Load the on-disk journal superblock and read the key fields into the
+ * journal_t.
+ */
+
+static int load_superblock(journal_t *journal)
+{
+	int err;
+	journal_superblock_t *sb;
+
+	err = journal_get_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+
+	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
+	journal->j_tail = be32_to_cpu(sb->s_start);
+	journal->j_first = be32_to_cpu(sb->s_first);
+	journal->j_last = be32_to_cpu(sb->s_maxlen);
+	journal->j_errno = be32_to_cpu(sb->s_errno);
+
+	return 0;
+}
+
+
+/**
+ * int journal_load() - Read journal from disk.
+ * @journal: Journal to act on.
+ *
+ * Given a journal_t structure which tells us which disk blocks contain
+ * a journal, read the journal from disk to initialise the in-memory
+ * structures.
+ */
+int journal_load(journal_t *journal)
+{
+	int err;
+	journal_superblock_t *sb;
+
+	err = load_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+	/* If this is a V2 superblock, then we have to check the
+	 * features flags on it. */
+
+	if (journal->j_format_version >= 2) {
+		if ((sb->s_feature_ro_compat &
+		     ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
+		    (sb->s_feature_incompat &
+		     ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
+			printk (KERN_WARNING
+				"JBD: Unrecognised features on journal\n");
+			return -EINVAL;
+		}
+	}
+
+	/* Let the recovery code check whether it needs to recover any
+	 * data from the journal. */
+	if (journal_recover(journal))
+		goto recovery_error;
+
+	/* OK, we've finished with the dynamic journal bits:
+	 * reinitialise the dynamic contents of the superblock in memory
+	 * and reset them on disk. */
+	if (journal_reset(journal))
+		goto recovery_error;
+
+	journal->j_flags &= ~JFS_ABORT;
+	journal->j_flags |= JFS_LOADED;
+	return 0;
+
+recovery_error:
+	printk (KERN_WARNING "JBD: recovery failed\n");
+	return -EIO;
+}
+
+/**
+ * void journal_destroy() - Release a journal_t structure.
+ * @journal: Journal to act on.
+ *
+ * Release a journal_t structure once it is no longer in use by the
+ * journaled object.
+ * Return <0 if we couldn't clean up the journal.
+ */
+int journal_destroy(journal_t *journal)
+{
+	int err = 0;
+
+	
+	/* Wait for the commit thread to wake up and die. */
+	journal_kill_thread(journal);
+
+	/* Force a final log commit */
+	if (journal->j_running_transaction)
+		journal_commit_transaction(journal);
+
+	/* Force any old transactions to disk */
+
+	/* We cannot race with anybody but must keep assertions happy */
+	mutex_lock(&journal->j_checkpoint_mutex);
+	/* Totally anal locking here... */
+	spin_lock(&journal->j_list_lock);
+	while (journal->j_checkpoint_transactions != NULL) {
+		spin_unlock(&journal->j_list_lock);
+		log_do_checkpoint(journal);
+		spin_lock(&journal->j_list_lock);
+	}
+
+	J_ASSERT(journal->j_running_transaction == NULL);
+	J_ASSERT(journal->j_committing_transaction == NULL);
+	J_ASSERT(journal->j_checkpoint_transactions == NULL);
+	spin_unlock(&journal->j_list_lock);
+
+	if (journal->j_sb_buffer) {
+		if (!is_journal_aborted(journal)) {
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			mark_journal_empty(journal);
+		} else
+			err = -EIO;
+		brelse(journal->j_sb_buffer);
+	}
+	mutex_unlock(&journal->j_checkpoint_mutex);
+
+	iput(journal->j_inode);
+	if (journal->j_revoke)
+		journal_destroy_revoke(journal);
+	kfree(journal->j_wbuf);
+	kfree(journal);
+
+	return err;
+}
+
+
+/**
+ *int journal_check_used_features () - Check if features specified are used.
+ * @journal: Journal to check.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Check whether the journal uses all of a given set of
+ * features.  Return true (non-zero) if it does.
+ **/
+
+int journal_check_used_features (journal_t *journal, unsigned long compat,
+				 unsigned long ro, unsigned long incompat)
+{
+	journal_superblock_t *sb;
+
+	if (!compat && !ro && !incompat)
+		return 1;
+	if (journal->j_format_version == 1)
+		return 0;
+
+	sb = journal->j_superblock;
+
+	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
+	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
+	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * int journal_check_available_features() - Check feature set in journalling layer
+ * @journal: Journal to check.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Check whether the journaling code supports the use of
+ * all of a given set of features on this journal.  Return true
+ * (non-zero) if it can. */
+
+int journal_check_available_features (journal_t *journal, unsigned long compat,
+				      unsigned long ro, unsigned long incompat)
+{
+	if (!compat && !ro && !incompat)
+		return 1;
+
+	/* We can support any known requested features iff the
+	 * superblock is in version 2.  Otherwise we fail to support any
+	 * extended sb features. */
+
+	if (journal->j_format_version != 2)
+		return 0;
+
+	if ((compat   & JFS_KNOWN_COMPAT_FEATURES) == compat &&
+	    (ro       & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
+	    (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * int journal_set_features () - Mark a given journal feature in the superblock
+ * @journal: Journal to act on.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Mark a given journal feature as present on the
+ * superblock.  Returns true if the requested features could be set.
+ *
+ */
+
+int journal_set_features (journal_t *journal, unsigned long compat,
+			  unsigned long ro, unsigned long incompat)
+{
+	journal_superblock_t *sb;
+
+	if (journal_check_used_features(journal, compat, ro, incompat))
+		return 1;
+
+	if (!journal_check_available_features(journal, compat, ro, incompat))
+		return 0;
+
+	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+		  compat, ro, incompat);
+
+	sb = journal->j_superblock;
+
+	sb->s_feature_compat    |= cpu_to_be32(compat);
+	sb->s_feature_ro_compat |= cpu_to_be32(ro);
+	sb->s_feature_incompat  |= cpu_to_be32(incompat);
+
+	return 1;
+}
+
+
+/**
+ * int journal_update_format () - Update on-disk journal structure.
+ * @journal: Journal to act on.
+ *
+ * Given an initialised but unloaded journal struct, poke about in the
+ * on-disk structure to update it to the most recent supported version.
+ */
+int journal_update_format (journal_t *journal)
+{
+	journal_superblock_t *sb;
+	int err;
+
+	err = journal_get_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+
+	switch (be32_to_cpu(sb->s_header.h_blocktype)) {
+	case JFS_SUPERBLOCK_V2:
+		return 0;
+	case JFS_SUPERBLOCK_V1:
+		return journal_convert_superblock_v1(journal, sb);
+	default:
+		break;
+	}
+	return -EINVAL;
+}
+
+static int journal_convert_superblock_v1(journal_t *journal,
+					 journal_superblock_t *sb)
+{
+	int offset, blocksize;
+	struct buffer_head *bh;
+
+	printk(KERN_WARNING
+		"JBD: Converting superblock from version 1 to 2.\n");
+
+	/* Pre-initialise new fields to zero */
+	offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
+	blocksize = be32_to_cpu(sb->s_blocksize);
+	memset(&sb->s_feature_compat, 0, blocksize-offset);
+
+	sb->s_nr_users = cpu_to_be32(1);
+	sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
+	journal->j_format_version = 2;
+
+	bh = journal->j_sb_buffer;
+	BUFFER_TRACE(bh, "marking dirty");
+	mark_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	return 0;
+}
+
+
+/**
+ * int journal_flush () - Flush journal
+ * @journal: Journal to act on.
+ *
+ * Flush all data for a given journal to disk and empty the journal.
+ * Filesystems can use this when remounting readonly to ensure that
+ * recovery does not need to happen on remount.
+ */
+
+int journal_flush(journal_t *journal)
+{
+	int err = 0;
+	transaction_t *transaction = NULL;
+
+	spin_lock(&journal->j_state_lock);
+
+	/* Force everything buffered to the log... */
+	if (journal->j_running_transaction) {
+		transaction = journal->j_running_transaction;
+		__log_start_commit(journal, transaction->t_tid);
+	} else if (journal->j_committing_transaction)
+		transaction = journal->j_committing_transaction;
+
+	/* Wait for the log commit to complete... */
+	if (transaction) {
+		tid_t tid = transaction->t_tid;
+
+		spin_unlock(&journal->j_state_lock);
+		log_wait_commit(journal, tid);
+	} else {
+		spin_unlock(&journal->j_state_lock);
+	}
+
+	/* ...and flush everything in the log out to disk. */
+	spin_lock(&journal->j_list_lock);
+	while (!err && journal->j_checkpoint_transactions != NULL) {
+		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
+		err = log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+		spin_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
+	mutex_lock(&journal->j_checkpoint_mutex);
+	cleanup_journal_tail(journal);
+
+	/* Finally, mark the journal as really needing no recovery.
+	 * This sets s_start==0 in the underlying superblock, which is
+	 * the magic code for a fully-recovered superblock.  Any future
+	 * commits of data to the journal will restore the current
+	 * s_start value. */
+	mark_journal_empty(journal);
+	mutex_unlock(&journal->j_checkpoint_mutex);
+	spin_lock(&journal->j_state_lock);
+	J_ASSERT(!journal->j_running_transaction);
+	J_ASSERT(!journal->j_committing_transaction);
+	J_ASSERT(!journal->j_checkpoint_transactions);
+	J_ASSERT(journal->j_head == journal->j_tail);
+	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
+	spin_unlock(&journal->j_state_lock);
+	return 0;
+}
+
+/**
+ * int journal_wipe() - Wipe journal contents
+ * @journal: Journal to act on.
+ * @write: flag (see below)
+ *
+ * Wipe out all of the contents of a journal, safely.  This will produce
+ * a warning if the journal contains any valid recovery information.
+ * Must be called between journal_init_*() and journal_load().
+ *
+ * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
+ * we merely suppress recovery.
+ */
+
+int journal_wipe(journal_t *journal, int write)
+{
+	int err = 0;
+
+	J_ASSERT (!(journal->j_flags & JFS_LOADED));
+
+	err = load_superblock(journal);
+	if (err)
+		return err;
+
+	if (!journal->j_tail)
+		goto no_recovery;
+
+	printk (KERN_WARNING "JBD: %s recovery information on journal\n",
+		write ? "Clearing" : "Ignoring");
+
+	err = journal_skip_recovery(journal);
+	if (write) {
+		/* Lock to make assertions happy... */
+		mutex_lock(&journal->j_checkpoint_mutex);
+		mark_journal_empty(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	}
+
+ no_recovery:
+	return err;
+}
+
+/*
+ * journal_dev_name: format a character string to describe on what
+ * device this journal is present.
+ */
+
+static const char *journal_dev_name(journal_t *journal, char *buffer)
+{
+	struct block_device *bdev;
+
+	if (journal->j_inode)
+		bdev = journal->j_inode->i_sb->s_bdev;
+	else
+		bdev = journal->j_dev;
+
+	return bdevname(bdev, buffer);
+}
+
+/*
+ * Journal abort has very specific semantics, which we describe
+ * for journal abort.
+ *
+ * Two internal function, which provide abort to te jbd layer
+ * itself are here.
+ */
+
+/*
+ * Quick version for internal journal use (doesn't lock the journal).
+ * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
+ * and don't attempt to make any other journal updates.
+ */
+static void __journal_abort_hard(journal_t *journal)
+{
+	transaction_t *transaction;
+	char b[BDEVNAME_SIZE];
+
+	if (journal->j_flags & JFS_ABORT)
+		return;
+
+	printk(KERN_ERR "Aborting journal on device %s.\n",
+		journal_dev_name(journal, b));
+
+	spin_lock(&journal->j_state_lock);
+	journal->j_flags |= JFS_ABORT;
+	transaction = journal->j_running_transaction;
+	if (transaction)
+		__log_start_commit(journal, transaction->t_tid);
+	spin_unlock(&journal->j_state_lock);
+}
+
+/* Soft abort: record the abort error status in the journal superblock,
+ * but don't do any other IO. */
+static void __journal_abort_soft (journal_t *journal, int errno)
+{
+	if (journal->j_flags & JFS_ABORT)
+		return;
+
+	if (!journal->j_errno)
+		journal->j_errno = errno;
+
+	__journal_abort_hard(journal);
+
+	if (errno)
+		journal_update_sb_errno(journal);
+}
+
+/**
+ * void journal_abort () - Shutdown the journal immediately.
+ * @journal: the journal to shutdown.
+ * @errno:   an error number to record in the journal indicating
+ *           the reason for the shutdown.
+ *
+ * Perform a complete, immediate shutdown of the ENTIRE
+ * journal (not of a single transaction).  This operation cannot be
+ * undone without closing and reopening the journal.
+ *
+ * The journal_abort function is intended to support higher level error
+ * recovery mechanisms such as the ext2/ext3 remount-readonly error
+ * mode.
+ *
+ * Journal abort has very specific semantics.  Any existing dirty,
+ * unjournaled buffers in the main filesystem will still be written to
+ * disk by bdflush, but the journaling mechanism will be suspended
+ * immediately and no further transaction commits will be honoured.
+ *
+ * Any dirty, journaled buffers will be written back to disk without
+ * hitting the journal.  Atomicity cannot be guaranteed on an aborted
+ * filesystem, but we _do_ attempt to leave as much data as possible
+ * behind for fsck to use for cleanup.
+ *
+ * Any attempt to get a new transaction handle on a journal which is in
+ * ABORT state will just result in an -EROFS error return.  A
+ * journal_stop on an existing handle will return -EIO if we have
+ * entered abort state during the update.
+ *
+ * Recursive transactions are not disturbed by journal abort until the
+ * final journal_stop, which will receive the -EIO error.
+ *
+ * Finally, the journal_abort call allows the caller to supply an errno
+ * which will be recorded (if possible) in the journal superblock.  This
+ * allows a client to record failure conditions in the middle of a
+ * transaction without having to complete the transaction to record the
+ * failure to disk.  ext3_error, for example, now uses this
+ * functionality.
+ *
+ * Errors which originate from within the journaling layer will NOT
+ * supply an errno; a null errno implies that absolutely no further
+ * writes are done to the journal (unless there are any already in
+ * progress).
+ *
+ */
+
+void journal_abort(journal_t *journal, int errno)
+{
+	__journal_abort_soft(journal, errno);
+}
+
+/**
+ * int journal_errno () - returns the journal's error state.
+ * @journal: journal to examine.
+ *
+ * This is the errno numbet set with journal_abort(), the last
+ * time the journal was mounted - if the journal was stopped
+ * without calling abort this will be 0.
+ *
+ * If the journal has been aborted on this mount time -EROFS will
+ * be returned.
+ */
+int journal_errno(journal_t *journal)
+{
+	int err;
+
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_flags & JFS_ABORT)
+		err = -EROFS;
+	else
+		err = journal->j_errno;
+	spin_unlock(&journal->j_state_lock);
+	return err;
+}
+
+/**
+ * int journal_clear_err () - clears the journal's error state
+ * @journal: journal to act on.
+ *
+ * An error must be cleared or Acked to take a FS out of readonly
+ * mode.
+ */
+int journal_clear_err(journal_t *journal)
+{
+	int err = 0;
+
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_flags & JFS_ABORT)
+		err = -EROFS;
+	else
+		journal->j_errno = 0;
+	spin_unlock(&journal->j_state_lock);
+	return err;
+}
+
+/**
+ * void journal_ack_err() - Ack journal err.
+ * @journal: journal to act on.
+ *
+ * An error must be cleared or Acked to take a FS out of readonly
+ * mode.
+ */
+void journal_ack_err(journal_t *journal)
+{
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_errno)
+		journal->j_flags |= JFS_ACK_ERR;
+	spin_unlock(&journal->j_state_lock);
+}
+
+int journal_blocks_per_page(struct inode *inode)
+{
+	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+}
+
+/*
+ * Journal_head storage management
+ */
+static struct kmem_cache *journal_head_cache;
+#ifdef CONFIG_JBD_DEBUG
+static atomic_t nr_journal_heads = ATOMIC_INIT(0);
+#endif
+
+static int journal_init_journal_head_cache(void)
+{
+	int retval;
+
+	J_ASSERT(journal_head_cache == NULL);
+	journal_head_cache = kmem_cache_create("journal_head",
+				sizeof(struct journal_head),
+				0,		/* offset */
+				SLAB_TEMPORARY,	/* flags */
+				NULL);		/* ctor */
+	retval = 0;
+	if (!journal_head_cache) {
+		retval = -ENOMEM;
+		printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+	}
+	return retval;
+}
+
+static void journal_destroy_journal_head_cache(void)
+{
+	if (journal_head_cache) {
+		kmem_cache_destroy(journal_head_cache);
+		journal_head_cache = NULL;
+	}
+}
+
+/*
+ * journal_head splicing and dicing
+ */
+static struct journal_head *journal_alloc_journal_head(void)
+{
+	struct journal_head *ret;
+
+#ifdef CONFIG_JBD_DEBUG
+	atomic_inc(&nr_journal_heads);
+#endif
+	ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
+	if (ret == NULL) {
+		jbd_debug(1, "out of memory for journal_head\n");
+		printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
+				   __func__);
+
+		while (ret == NULL) {
+			yield();
+			ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
+		}
+	}
+	return ret;
+}
+
+static void journal_free_journal_head(struct journal_head *jh)
+{
+#ifdef CONFIG_JBD_DEBUG
+	atomic_dec(&nr_journal_heads);
+	memset(jh, JBD_POISON_FREE, sizeof(*jh));
+#endif
+	kmem_cache_free(journal_head_cache, jh);
+}
+
+/*
+ * A journal_head is attached to a buffer_head whenever JBD has an
+ * interest in the buffer.
+ *
+ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
+ * is set.  This bit is tested in core kernel code where we need to take
+ * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
+ * there.
+ *
+ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
+ *
+ * When a buffer has its BH_JBD bit set it is immune from being released by
+ * core kernel code, mainly via ->b_count.
+ *
+ * A journal_head is detached from its buffer_head when the journal_head's
+ * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
+ * transaction (b_cp_transaction) hold their references to b_jcount.
+ *
+ * Various places in the kernel want to attach a journal_head to a buffer_head
+ * _before_ attaching the journal_head to a transaction.  To protect the
+ * journal_head in this situation, journal_add_journal_head elevates the
+ * journal_head's b_jcount refcount by one.  The caller must call
+ * journal_put_journal_head() to undo this.
+ *
+ * So the typical usage would be:
+ *
+ *	(Attach a journal_head if needed.  Increments b_jcount)
+ *	struct journal_head *jh = journal_add_journal_head(bh);
+ *	...
+ *      (Get another reference for transaction)
+ *      journal_grab_journal_head(bh);
+ *      jh->b_transaction = xxx;
+ *      (Put original reference)
+ *      journal_put_journal_head(jh);
+ */
+
+/*
+ * Give a buffer_head a journal_head.
+ *
+ * May sleep.
+ */
+struct journal_head *journal_add_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh;
+	struct journal_head *new_jh = NULL;
+
+repeat:
+	if (!buffer_jbd(bh))
+		new_jh = journal_alloc_journal_head();
+
+	jbd_lock_bh_journal_head(bh);
+	if (buffer_jbd(bh)) {
+		jh = bh2jh(bh);
+	} else {
+		J_ASSERT_BH(bh,
+			(atomic_read(&bh->b_count) > 0) ||
+			(bh->b_page && bh->b_page->mapping));
+
+		if (!new_jh) {
+			jbd_unlock_bh_journal_head(bh);
+			goto repeat;
+		}
+
+		jh = new_jh;
+		new_jh = NULL;		/* We consumed it */
+		set_buffer_jbd(bh);
+		bh->b_private = jh;
+		jh->b_bh = bh;
+		get_bh(bh);
+		BUFFER_TRACE(bh, "added journal_head");
+	}
+	jh->b_jcount++;
+	jbd_unlock_bh_journal_head(bh);
+	if (new_jh)
+		journal_free_journal_head(new_jh);
+	return bh->b_private;
+}
+
+/*
+ * Grab a ref against this buffer_head's journal_head.  If it ended up not
+ * having a journal_head, return NULL
+ */
+struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh = NULL;
+
+	jbd_lock_bh_journal_head(bh);
+	if (buffer_jbd(bh)) {
+		jh = bh2jh(bh);
+		jh->b_jcount++;
+	}
+	jbd_unlock_bh_journal_head(bh);
+	return jh;
+}
+
+static void __journal_remove_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh = bh2jh(bh);
+
+	J_ASSERT_JH(jh, jh->b_jcount >= 0);
+	J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
+	J_ASSERT_BH(bh, buffer_jbd(bh));
+	J_ASSERT_BH(bh, jh2bh(jh) == bh);
+	BUFFER_TRACE(bh, "remove journal_head");
+	if (jh->b_frozen_data) {
+		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
+		jbd_free(jh->b_frozen_data, bh->b_size);
+	}
+	if (jh->b_committed_data) {
+		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
+		jbd_free(jh->b_committed_data, bh->b_size);
+	}
+	bh->b_private = NULL;
+	jh->b_bh = NULL;	/* debug, really */
+	clear_buffer_jbd(bh);
+	journal_free_journal_head(jh);
+}
+
+/*
+ * Drop a reference on the passed journal_head.  If it fell to zero then
+ * release the journal_head from the buffer_head.
+ */
+void journal_put_journal_head(struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	jbd_lock_bh_journal_head(bh);
+	J_ASSERT_JH(jh, jh->b_jcount > 0);
+	--jh->b_jcount;
+	if (!jh->b_jcount) {
+		__journal_remove_journal_head(bh);
+		jbd_unlock_bh_journal_head(bh);
+		__brelse(bh);
+	} else
+		jbd_unlock_bh_journal_head(bh);
+}
+
+/*
+ * debugfs tunables
+ */
+#ifdef CONFIG_JBD_DEBUG
+
+u8 journal_enable_debug __read_mostly;
+EXPORT_SYMBOL(journal_enable_debug);
+
+static struct dentry *jbd_debugfs_dir;
+static struct dentry *jbd_debug;
+
+static void __init jbd_create_debugfs_entry(void)
+{
+	jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
+	if (jbd_debugfs_dir)
+		jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
+					       jbd_debugfs_dir,
+					       &journal_enable_debug);
+}
+
+static void __exit jbd_remove_debugfs_entry(void)
+{
+	debugfs_remove(jbd_debug);
+	debugfs_remove(jbd_debugfs_dir);
+}
+
+#else
+
+static inline void jbd_create_debugfs_entry(void)
+{
+}
+
+static inline void jbd_remove_debugfs_entry(void)
+{
+}
+
+#endif
+
+struct kmem_cache *jbd_handle_cache;
+
+static int __init journal_init_handle_cache(void)
+{
+	jbd_handle_cache = kmem_cache_create("journal_handle",
+				sizeof(handle_t),
+				0,		/* offset */
+				SLAB_TEMPORARY,	/* flags */
+				NULL);		/* ctor */
+	if (jbd_handle_cache == NULL) {
+		printk(KERN_EMERG "JBD: failed to create handle cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void journal_destroy_handle_cache(void)
+{
+	if (jbd_handle_cache)
+		kmem_cache_destroy(jbd_handle_cache);
+}
+
+/*
+ * Module startup and shutdown
+ */
+
+static int __init journal_init_caches(void)
+{
+	int ret;
+
+	ret = journal_init_revoke_caches();
+	if (ret == 0)
+		ret = journal_init_journal_head_cache();
+	if (ret == 0)
+		ret = journal_init_handle_cache();
+	return ret;
+}
+
+static void journal_destroy_caches(void)
+{
+	journal_destroy_revoke_caches();
+	journal_destroy_journal_head_cache();
+	journal_destroy_handle_cache();
+}
+
+static int __init journal_init(void)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
+
+	ret = journal_init_caches();
+	if (ret != 0)
+		journal_destroy_caches();
+	jbd_create_debugfs_entry();
+	return ret;
+}
+
+static void __exit journal_exit(void)
+{
+#ifdef CONFIG_JBD_DEBUG
+	int n = atomic_read(&nr_journal_heads);
+	if (n)
+		printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
+#endif
+	jbd_remove_debugfs_entry();
+	journal_destroy_caches();
+}
+
+MODULE_LICENSE("GPL");
+module_init(journal_init);
+module_exit(journal_exit);
+
diff --git a/kernel/fs/jbd/recovery.c b/kernel/fs/jbd/recovery.c
new file mode 100644
index 000000000..a748fe214
--- /dev/null
+++ b/kernel/fs/jbd/recovery.c
@@ -0,0 +1,594 @@
+/*
+ * linux/fs/jbd/recovery.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal recovery routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ */
+
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/errno.h>
+#include <linux/blkdev.h>
+#endif
+
+/*
+ * Maintain information about the progress of the recovery job, so that
+ * the different passes can carry information between them.
+ */
+struct recovery_info
+{
+	tid_t		start_transaction;
+	tid_t		end_transaction;
+
+	int		nr_replays;
+	int		nr_revokes;
+	int		nr_revoke_hits;
+};
+
+enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
+static int do_one_pass(journal_t *journal,
+				struct recovery_info *info, enum passtype pass);
+static int scan_revoke_records(journal_t *, struct buffer_head *,
+				tid_t, struct recovery_info *);
+
+#ifdef __KERNEL__
+
+/* Release readahead buffers after use */
+static void journal_brelse_array(struct buffer_head *b[], int n)
+{
+	while (--n >= 0)
+		brelse (b[n]);
+}
+
+
+/*
+ * When reading from the journal, we are going through the block device
+ * layer directly and so there is no readahead being done for us.  We
+ * need to implement any readahead ourselves if we want it to happen at
+ * all.  Recovery is basically one long sequential read, so make sure we
+ * do the IO in reasonably large chunks.
+ *
+ * This is not so critical that we need to be enormously clever about
+ * the readahead size, though.  128K is a purely arbitrary, good-enough
+ * fixed value.
+ */
+
+#define MAXBUF 8
+static int do_readahead(journal_t *journal, unsigned int start)
+{
+	int err;
+	unsigned int max, nbufs, next;
+	unsigned int blocknr;
+	struct buffer_head *bh;
+
+	struct buffer_head * bufs[MAXBUF];
+
+	/* Do up to 128K of readahead */
+	max = start + (128 * 1024 / journal->j_blocksize);
+	if (max > journal->j_maxlen)
+		max = journal->j_maxlen;
+
+	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
+	 * a time to the block device IO layer. */
+
+	nbufs = 0;
+
+	for (next = start; next < max; next++) {
+		err = journal_bmap(journal, next, &blocknr);
+
+		if (err) {
+			printk (KERN_ERR "JBD: bad block at offset %u\n",
+				next);
+			goto failed;
+		}
+
+		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+		if (!bh) {
+			err = -ENOMEM;
+			goto failed;
+		}
+
+		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+			bufs[nbufs++] = bh;
+			if (nbufs == MAXBUF) {
+				ll_rw_block(READ, nbufs, bufs);
+				journal_brelse_array(bufs, nbufs);
+				nbufs = 0;
+			}
+		} else
+			brelse(bh);
+	}
+
+	if (nbufs)
+		ll_rw_block(READ, nbufs, bufs);
+	err = 0;
+
+failed:
+	if (nbufs)
+		journal_brelse_array(bufs, nbufs);
+	return err;
+}
+
+#endif /* __KERNEL__ */
+
+
+/*
+ * Read a block from the journal
+ */
+
+static int jread(struct buffer_head **bhp, journal_t *journal,
+		 unsigned int offset)
+{
+	int err;
+	unsigned int blocknr;
+	struct buffer_head *bh;
+
+	*bhp = NULL;
+
+	if (offset >= journal->j_maxlen) {
+		printk(KERN_ERR "JBD: corrupted journal superblock\n");
+		return -EIO;
+	}
+
+	err = journal_bmap(journal, offset, &blocknr);
+
+	if (err) {
+		printk (KERN_ERR "JBD: bad block at offset %u\n",
+			offset);
+		return err;
+	}
+
+	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return -ENOMEM;
+
+	if (!buffer_uptodate(bh)) {
+		/* If this is a brand new buffer, start readahead.
+                   Otherwise, we assume we are already reading it.  */
+		if (!buffer_req(bh))
+			do_readahead(journal, offset);
+		wait_on_buffer(bh);
+	}
+
+	if (!buffer_uptodate(bh)) {
+		printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+			offset);
+		brelse(bh);
+		return -EIO;
+	}
+
+	*bhp = bh;
+	return 0;
+}
+
+
+/*
+ * Count the number of in-use tags in a journal descriptor block.
+ */
+
+static int count_tags(struct buffer_head *bh, int size)
+{
+	char *			tagp;
+	journal_block_tag_t *	tag;
+	int			nr = 0;
+
+	tagp = &bh->b_data[sizeof(journal_header_t)];
+
+	while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
+		tag = (journal_block_tag_t *) tagp;
+
+		nr++;
+		tagp += sizeof(journal_block_tag_t);
+		if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
+			tagp += 16;
+
+		if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
+			break;
+	}
+
+	return nr;
+}
+
+
+/* Make sure we wrap around the log correctly! */
+#define wrap(journal, var)						\
+do {									\
+	if (var >= (journal)->j_last)					\
+		var -= ((journal)->j_last - (journal)->j_first);	\
+} while (0)
+
+/**
+ * journal_recover - recovers a on-disk journal
+ * @journal: the journal to recover
+ *
+ * The primary function for recovering the log contents when mounting a
+ * journaled device.
+ *
+ * Recovery is done in three passes.  In the first pass, we look for the
+ * end of the log.  In the second, we assemble the list of revoke
+ * blocks.  In the third and final pass, we replay any un-revoked blocks
+ * in the log.
+ */
+int journal_recover(journal_t *journal)
+{
+	int			err, err2;
+	journal_superblock_t *	sb;
+
+	struct recovery_info	info;
+
+	memset(&info, 0, sizeof(info));
+	sb = journal->j_superblock;
+
+	/*
+	 * The journal superblock's s_start field (the current log head)
+	 * is always zero if, and only if, the journal was cleanly
+	 * unmounted.
+	 */
+
+	if (!sb->s_start) {
+		jbd_debug(1, "No recovery required, last transaction %d\n",
+			  be32_to_cpu(sb->s_sequence));
+		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
+		return 0;
+	}
+
+	err = do_one_pass(journal, &info, PASS_SCAN);
+	if (!err)
+		err = do_one_pass(journal, &info, PASS_REVOKE);
+	if (!err)
+		err = do_one_pass(journal, &info, PASS_REPLAY);
+
+	jbd_debug(1, "JBD: recovery, exit status %d, "
+		  "recovered transactions %u to %u\n",
+		  err, info.start_transaction, info.end_transaction);
+	jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
+		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+
+	/* Restart the log at the next transaction ID, thus invalidating
+	 * any existing commit records in the log. */
+	journal->j_transaction_sequence = ++info.end_transaction;
+
+	journal_clear_revoke(journal);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+	/* Flush disk caches to get replayed data on the permanent storage */
+	if (journal->j_flags & JFS_BARRIER) {
+		err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+		if (!err)
+			err = err2;
+	}
+
+	return err;
+}
+
+/**
+ * journal_skip_recovery - Start journal and wipe exiting records
+ * @journal: journal to startup
+ *
+ * Locate any valid recovery information from the journal and set up the
+ * journal structures in memory to ignore it (presumably because the
+ * caller has evidence that it is out of date).
+ * This function does'nt appear to be exorted..
+ *
+ * We perform one pass over the journal to allow us to tell the user how
+ * much recovery information is being erased, and to let us initialise
+ * the journal transaction sequence numbers to the next unused ID.
+ */
+int journal_skip_recovery(journal_t *journal)
+{
+	int			err;
+	struct recovery_info	info;
+
+	memset (&info, 0, sizeof(info));
+
+	err = do_one_pass(journal, &info, PASS_SCAN);
+
+	if (err) {
+		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+		++journal->j_transaction_sequence;
+	} else {
+#ifdef CONFIG_JBD_DEBUG
+		int dropped = info.end_transaction -
+			      be32_to_cpu(journal->j_superblock->s_sequence);
+		jbd_debug(1,
+			  "JBD: ignoring %d transaction%s from the journal.\n",
+			  dropped, (dropped == 1) ? "" : "s");
+#endif
+		journal->j_transaction_sequence = ++info.end_transaction;
+	}
+
+	journal->j_tail = 0;
+	return err;
+}
+
+static int do_one_pass(journal_t *journal,
+			struct recovery_info *info, enum passtype pass)
+{
+	unsigned int		first_commit_ID, next_commit_ID;
+	unsigned int		next_log_block;
+	int			err, success = 0;
+	journal_superblock_t *	sb;
+	journal_header_t *	tmp;
+	struct buffer_head *	bh;
+	unsigned int		sequence;
+	int			blocktype;
+
+	/*
+	 * First thing is to establish what we expect to find in the log
+	 * (in terms of transaction IDs), and where (in terms of log
+	 * block offsets): query the superblock.
+	 */
+
+	sb = journal->j_superblock;
+	next_commit_ID = be32_to_cpu(sb->s_sequence);
+	next_log_block = be32_to_cpu(sb->s_start);
+
+	first_commit_ID = next_commit_ID;
+	if (pass == PASS_SCAN)
+		info->start_transaction = first_commit_ID;
+
+	jbd_debug(1, "Starting recovery pass %d\n", pass);
+
+	/*
+	 * Now we walk through the log, transaction by transaction,
+	 * making sure that each transaction has a commit block in the
+	 * expected place.  Each complete transaction gets replayed back
+	 * into the main filesystem.
+	 */
+
+	while (1) {
+		int			flags;
+		char *			tagp;
+		journal_block_tag_t *	tag;
+		struct buffer_head *	obh;
+		struct buffer_head *	nbh;
+
+		cond_resched();
+
+		/* If we already know where to stop the log traversal,
+		 * check right now that we haven't gone past the end of
+		 * the log. */
+
+		if (pass != PASS_SCAN)
+			if (tid_geq(next_commit_ID, info->end_transaction))
+				break;
+
+		jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
+			  next_commit_ID, next_log_block, journal->j_last);
+
+		/* Skip over each chunk of the transaction looking
+		 * either the next descriptor block or the final commit
+		 * record. */
+
+		jbd_debug(3, "JBD: checking block %u\n", next_log_block);
+		err = jread(&bh, journal, next_log_block);
+		if (err)
+			goto failed;
+
+		next_log_block++;
+		wrap(journal, next_log_block);
+
+		/* What kind of buffer is it?
+		 *
+		 * If it is a descriptor block, check that it has the
+		 * expected sequence number.  Otherwise, we're all done
+		 * here. */
+
+		tmp = (journal_header_t *)bh->b_data;
+
+		if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
+			brelse(bh);
+			break;
+		}
+
+		blocktype = be32_to_cpu(tmp->h_blocktype);
+		sequence = be32_to_cpu(tmp->h_sequence);
+		jbd_debug(3, "Found magic %d, sequence %d\n",
+			  blocktype, sequence);
+
+		if (sequence != next_commit_ID) {
+			brelse(bh);
+			break;
+		}
+
+		/* OK, we have a valid descriptor block which matches
+		 * all of the sequence number checks.  What are we going
+		 * to do with it?  That depends on the pass... */
+
+		switch(blocktype) {
+		case JFS_DESCRIPTOR_BLOCK:
+			/* If it is a valid descriptor block, replay it
+			 * in pass REPLAY; otherwise, just skip over the
+			 * blocks it describes. */
+			if (pass != PASS_REPLAY) {
+				next_log_block +=
+					count_tags(bh, journal->j_blocksize);
+				wrap(journal, next_log_block);
+				brelse(bh);
+				continue;
+			}
+
+			/* A descriptor block: we can now write all of
+			 * the data blocks.  Yay, useful work is finally
+			 * getting done here! */
+
+			tagp = &bh->b_data[sizeof(journal_header_t)];
+			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
+			       <= journal->j_blocksize) {
+				unsigned int io_block;
+
+				tag = (journal_block_tag_t *) tagp;
+				flags = be32_to_cpu(tag->t_flags);
+
+				io_block = next_log_block++;
+				wrap(journal, next_log_block);
+				err = jread(&obh, journal, io_block);
+				if (err) {
+					/* Recover what we can, but
+					 * report failure at the end. */
+					success = err;
+					printk (KERN_ERR
+						"JBD: IO error %d recovering "
+						"block %u in log\n",
+						err, io_block);
+				} else {
+					unsigned int blocknr;
+
+					J_ASSERT(obh != NULL);
+					blocknr = be32_to_cpu(tag->t_blocknr);
+
+					/* If the block has been
+					 * revoked, then we're all done
+					 * here. */
+					if (journal_test_revoke
+					    (journal, blocknr,
+					     next_commit_ID)) {
+						brelse(obh);
+						++info->nr_revoke_hits;
+						goto skip_write;
+					}
+
+					/* Find a buffer for the new
+					 * data being restored */
+					nbh = __getblk(journal->j_fs_dev,
+							blocknr,
+							journal->j_blocksize);
+					if (nbh == NULL) {
+						printk(KERN_ERR
+						       "JBD: Out of memory "
+						       "during recovery.\n");
+						err = -ENOMEM;
+						brelse(bh);
+						brelse(obh);
+						goto failed;
+					}
+
+					lock_buffer(nbh);
+					memcpy(nbh->b_data, obh->b_data,
+							journal->j_blocksize);
+					if (flags & JFS_FLAG_ESCAPE) {
+						*((__be32 *)nbh->b_data) =
+						cpu_to_be32(JFS_MAGIC_NUMBER);
+					}
+
+					BUFFER_TRACE(nbh, "marking dirty");
+					set_buffer_uptodate(nbh);
+					mark_buffer_dirty(nbh);
+					BUFFER_TRACE(nbh, "marking uptodate");
+					++info->nr_replays;
+					/* ll_rw_block(WRITE, 1, &nbh); */
+					unlock_buffer(nbh);
+					brelse(obh);
+					brelse(nbh);
+				}
+
+			skip_write:
+				tagp += sizeof(journal_block_tag_t);
+				if (!(flags & JFS_FLAG_SAME_UUID))
+					tagp += 16;
+
+				if (flags & JFS_FLAG_LAST_TAG)
+					break;
+			}
+
+			brelse(bh);
+			continue;
+
+		case JFS_COMMIT_BLOCK:
+			/* Found an expected commit block: not much to
+			 * do other than move on to the next sequence
+			 * number. */
+			brelse(bh);
+			next_commit_ID++;
+			continue;
+
+		case JFS_REVOKE_BLOCK:
+			/* If we aren't in the REVOKE pass, then we can
+			 * just skip over this block. */
+			if (pass != PASS_REVOKE) {
+				brelse(bh);
+				continue;
+			}
+
+			err = scan_revoke_records(journal, bh,
+						  next_commit_ID, info);
+			brelse(bh);
+			if (err)
+				goto failed;
+			continue;
+
+		default:
+			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+				  blocktype);
+			brelse(bh);
+			goto done;
+		}
+	}
+
+ done:
+	/*
+	 * We broke out of the log scan loop: either we came to the
+	 * known end of the log or we found an unexpected block in the
+	 * log.  If the latter happened, then we know that the "current"
+	 * transaction marks the end of the valid log.
+	 */
+
+	if (pass == PASS_SCAN)
+		info->end_transaction = next_commit_ID;
+	else {
+		/* It's really bad news if different passes end up at
+		 * different places (but possible due to IO errors). */
+		if (info->end_transaction != next_commit_ID) {
+			printk (KERN_ERR "JBD: recovery pass %d ended at "
+				"transaction %u, expected %u\n",
+				pass, next_commit_ID, info->end_transaction);
+			if (!success)
+				success = -EIO;
+		}
+	}
+
+	return success;
+
+ failed:
+	return err;
+}
+
+
+/* Scan a revoke record, marking all blocks mentioned as revoked. */
+
+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
+			       tid_t sequence, struct recovery_info *info)
+{
+	journal_revoke_header_t *header;
+	int offset, max;
+
+	header = (journal_revoke_header_t *) bh->b_data;
+	offset = sizeof(journal_revoke_header_t);
+	max = be32_to_cpu(header->r_count);
+
+	while (offset < max) {
+		unsigned int blocknr;
+		int err;
+
+		blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
+		offset += 4;
+		err = journal_set_revoke(journal, blocknr, sequence);
+		if (err)
+			return err;
+		++info->nr_revokes;
+	}
+	return 0;
+}
diff --git a/kernel/fs/jbd/revoke.c b/kernel/fs/jbd/revoke.c
new file mode 100644
index 000000000..dcead636c
--- /dev/null
+++ b/kernel/fs/jbd/revoke.c
@@ -0,0 +1,733 @@
+/*
+ * linux/fs/jbd/revoke.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
+ *
+ * Copyright 2000 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal revoke routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ *
+ * Revoke is the mechanism used to prevent old log records for deleted
+ * metadata from being replayed on top of newer data using the same
+ * blocks.  The revoke mechanism is used in two separate places:
+ *
+ * + Commit: during commit we write the entire list of the current
+ *   transaction's revoked blocks to the journal
+ *
+ * + Recovery: during recovery we record the transaction ID of all
+ *   revoked blocks.  If there are multiple revoke records in the log
+ *   for a single block, only the last one counts, and if there is a log
+ *   entry for a block beyond the last revoke, then that log entry still
+ *   gets replayed.
+ *
+ * We can get interactions between revokes and new log data within a
+ * single transaction:
+ *
+ * Block is revoked and then journaled:
+ *   The desired end result is the journaling of the new block, so we
+ *   cancel the revoke before the transaction commits.
+ *
+ * Block is journaled and then revoked:
+ *   The revoke must take precedence over the write of the block, so we
+ *   need either to cancel the journal entry or to write the revoke
+ *   later in the log than the log block.  In this case, we choose the
+ *   latter: journaling a block cancels any revoke record for that block
+ *   in the current transaction, so any revoke for that block in the
+ *   transaction must have happened after the block was journaled and so
+ *   the revoke must take precedence.
+ *
+ * Block is revoked and then written as data:
+ *   The data write is allowed to succeed, but the revoke is _not_
+ *   cancelled.  We still need to prevent old log records from
+ *   overwriting the new data.  We don't even need to clear the revoke
+ *   bit here.
+ *
+ * We cache revoke status of a buffer in the current transaction in b_states
+ * bits.  As the name says, revokevalid flag indicates that the cached revoke
+ * status of a buffer is valid and we can rely on the cached status.
+ *
+ * Revoke information on buffers is a tri-state value:
+ *
+ * RevokeValid clear:	no cached revoke status, need to look it up
+ * RevokeValid set, Revoked clear:
+ *			buffer has not been revoked, and cancel_revoke
+ *			need do nothing.
+ * RevokeValid set, Revoked set:
+ *			buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table.  Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment no one else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
+ */
+
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#endif
+#include <linux/log2.h>
+#include <linux/hash.h>
+
+static struct kmem_cache *revoke_record_cache;
+static struct kmem_cache *revoke_table_cache;
+
+/* Each revoke record represents one single revoked block.  During
+   journal replay, this involves recording the transaction ID of the
+   last transaction to revoke this block. */
+
+struct jbd_revoke_record_s
+{
+	struct list_head  hash;
+	tid_t		  sequence;	/* Used for recovery only */
+	unsigned int	  blocknr;
+};
+
+
+/* The revoke table is just a simple hash table of revoke records. */
+struct jbd_revoke_table_s
+{
+	/* It is conceivable that we might want a larger hash table
+	 * for recovery.  Must be a power of two. */
+	int		  hash_size;
+	int		  hash_shift;
+	struct list_head *hash_table;
+};
+
+
+#ifdef __KERNEL__
+static void write_one_revoke_record(journal_t *, transaction_t *,
+				    struct journal_head **, int *,
+				    struct jbd_revoke_record_s *, int);
+static void flush_descriptor(journal_t *, struct journal_head *, int, int);
+#endif
+
+/* Utility functions to maintain the revoke table */
+
+static inline int hash(journal_t *journal, unsigned int block)
+{
+	struct jbd_revoke_table_s *table = journal->j_revoke;
+
+	return hash_32(block, table->hash_shift);
+}
+
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
+			      tid_t seq)
+{
+	struct list_head *hash_list;
+	struct jbd_revoke_record_s *record;
+
+repeat:
+	record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
+	if (!record)
+		goto oom;
+
+	record->sequence = seq;
+	record->blocknr = blocknr;
+	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+	spin_lock(&journal->j_revoke_lock);
+	list_add(&record->hash, hash_list);
+	spin_unlock(&journal->j_revoke_lock);
+	return 0;
+
+oom:
+	if (!journal_oom_retry)
+		return -ENOMEM;
+	jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
+	yield();
+	goto repeat;
+}
+
+/* Find a revoke record in the journal's hash table. */
+
+static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
+						      unsigned int blocknr)
+{
+	struct list_head *hash_list;
+	struct jbd_revoke_record_s *record;
+
+	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+
+	spin_lock(&journal->j_revoke_lock);
+	record = (struct jbd_revoke_record_s *) hash_list->next;
+	while (&(record->hash) != hash_list) {
+		if (record->blocknr == blocknr) {
+			spin_unlock(&journal->j_revoke_lock);
+			return record;
+		}
+		record = (struct jbd_revoke_record_s *) record->hash.next;
+	}
+	spin_unlock(&journal->j_revoke_lock);
+	return NULL;
+}
+
+void journal_destroy_revoke_caches(void)
+{
+	if (revoke_record_cache) {
+		kmem_cache_destroy(revoke_record_cache);
+		revoke_record_cache = NULL;
+	}
+	if (revoke_table_cache) {
+		kmem_cache_destroy(revoke_table_cache);
+		revoke_table_cache = NULL;
+	}
+}
+
+int __init journal_init_revoke_caches(void)
+{
+	J_ASSERT(!revoke_record_cache);
+	J_ASSERT(!revoke_table_cache);
+
+	revoke_record_cache = kmem_cache_create("revoke_record",
+					   sizeof(struct jbd_revoke_record_s),
+					   0,
+					   SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
+					   NULL);
+	if (!revoke_record_cache)
+		goto record_cache_failure;
+
+	revoke_table_cache = kmem_cache_create("revoke_table",
+					   sizeof(struct jbd_revoke_table_s),
+					   0, SLAB_TEMPORARY, NULL);
+	if (!revoke_table_cache)
+		goto table_cache_failure;
+
+	return 0;
+
+table_cache_failure:
+	journal_destroy_revoke_caches();
+record_cache_failure:
+	return -ENOMEM;
+}
+
+static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
+{
+	int i;
+	struct jbd_revoke_table_s *table;
+
+	table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	table->hash_size = hash_size;
+	table->hash_shift = ilog2(hash_size);
+	table->hash_table =
+		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+	if (!table->hash_table) {
+		kmem_cache_free(revoke_table_cache, table);
+		table = NULL;
+		goto out;
+	}
+
+	for (i = 0; i < hash_size; i++)
+		INIT_LIST_HEAD(&table->hash_table[i]);
+
+out:
+	return table;
+}
+
+static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
+{
+	int i;
+	struct list_head *hash_list;
+
+	for (i = 0; i < table->hash_size; i++) {
+		hash_list = &table->hash_table[i];
+		J_ASSERT(list_empty(hash_list));
+	}
+
+	kfree(table->hash_table);
+	kmem_cache_free(revoke_table_cache, table);
+}
+
+/* Initialise the revoke table for a given journal to a given size. */
+int journal_init_revoke(journal_t *journal, int hash_size)
+{
+	J_ASSERT(journal->j_revoke_table[0] == NULL);
+	J_ASSERT(is_power_of_2(hash_size));
+
+	journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[0])
+		goto fail0;
+
+	journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[1])
+		goto fail1;
+
+	journal->j_revoke = journal->j_revoke_table[1];
+
+	spin_lock_init(&journal->j_revoke_lock);
+
+	return 0;
+
+fail1:
+	journal_destroy_revoke_table(journal->j_revoke_table[0]);
+fail0:
+	return -ENOMEM;
+}
+
+/* Destroy a journal's revoke table.  The table must already be empty! */
+void journal_destroy_revoke(journal_t *journal)
+{
+	journal->j_revoke = NULL;
+	if (journal->j_revoke_table[0])
+		journal_destroy_revoke_table(journal->j_revoke_table[0]);
+	if (journal->j_revoke_table[1])
+		journal_destroy_revoke_table(journal->j_revoke_table[1]);
+}
+
+
+#ifdef __KERNEL__
+
+/*
+ * journal_revoke: revoke a given buffer_head from the journal.  This
+ * prevents the block from being replayed during recovery if we take a
+ * crash after this current transaction commits.  Any subsequent
+ * metadata writes of the buffer in this transaction cancel the
+ * revoke.
+ *
+ * Note that this call may block --- it is up to the caller to make
+ * sure that there are no further calls to journal_write_metadata
+ * before the revoke is complete.  In ext3, this implies calling the
+ * revoke before clearing the block bitmap when we are deleting
+ * metadata.
+ *
+ * Revoke performs a journal_forget on any buffer_head passed in as a
+ * parameter, but does _not_ forget the buffer_head if the bh was only
+ * found implicitly.
+ *
+ * bh_in may not be a journalled buffer - it may have come off
+ * the hash tables without an attached journal_head.
+ *
+ * If bh_in is non-zero, journal_revoke() will decrement its b_count
+ * by one.
+ */
+
+int journal_revoke(handle_t *handle, unsigned int blocknr,
+		   struct buffer_head *bh_in)
+{
+	struct buffer_head *bh = NULL;
+	journal_t *journal;
+	struct block_device *bdev;
+	int err;
+
+	might_sleep();
+	if (bh_in)
+		BUFFER_TRACE(bh_in, "enter");
+
+	journal = handle->h_transaction->t_journal;
+	if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
+		J_ASSERT (!"Cannot set revoke feature!");
+		return -EINVAL;
+	}
+
+	bdev = journal->j_fs_dev;
+	bh = bh_in;
+
+	if (!bh) {
+		bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+		if (bh)
+			BUFFER_TRACE(bh, "found on hash");
+	}
+#ifdef JBD_EXPENSIVE_CHECKING
+	else {
+		struct buffer_head *bh2;
+
+		/* If there is a different buffer_head lying around in
+		 * memory anywhere... */
+		bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+		if (bh2) {
+			/* ... and it has RevokeValid status... */
+			if (bh2 != bh && buffer_revokevalid(bh2))
+				/* ...then it better be revoked too,
+				 * since it's illegal to create a revoke
+				 * record against a buffer_head which is
+				 * not marked revoked --- that would
+				 * risk missing a subsequent revoke
+				 * cancel. */
+				J_ASSERT_BH(bh2, buffer_revoked(bh2));
+			put_bh(bh2);
+		}
+	}
+#endif
+
+	/* We really ought not ever to revoke twice in a row without
+           first having the revoke cancelled: it's illegal to free a
+           block twice without allocating it in between! */
+	if (bh) {
+		if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
+				 "inconsistent data on disk")) {
+			if (!bh_in)
+				brelse(bh);
+			return -EIO;
+		}
+		set_buffer_revoked(bh);
+		set_buffer_revokevalid(bh);
+		if (bh_in) {
+			BUFFER_TRACE(bh_in, "call journal_forget");
+			journal_forget(handle, bh_in);
+		} else {
+			BUFFER_TRACE(bh, "call brelse");
+			__brelse(bh);
+		}
+	}
+
+	jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
+	err = insert_revoke_hash(journal, blocknr,
+				handle->h_transaction->t_tid);
+	BUFFER_TRACE(bh_in, "exit");
+	return err;
+}
+
+/*
+ * Cancel an outstanding revoke.  For use only internally by the
+ * journaling code (called from journal_get_write_access).
+ *
+ * We trust buffer_revoked() on the buffer if the buffer is already
+ * being journaled: if there is no revoke pending on the buffer, then we
+ * don't do anything here.
+ *
+ * This would break if it were possible for a buffer to be revoked and
+ * discarded, and then reallocated within the same transaction.  In such
+ * a case we would have lost the revoked bit, but when we arrived here
+ * the second time we would still have a pending revoke to cancel.  So,
+ * do not trust the Revoked bit on buffers unless RevokeValid is also
+ * set.
+ */
+int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+{
+	struct jbd_revoke_record_s *record;
+	journal_t *journal = handle->h_transaction->t_journal;
+	int need_cancel;
+	int did_revoke = 0;	/* akpm: debug */
+	struct buffer_head *bh = jh2bh(jh);
+
+	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+
+	/* Is the existing Revoke bit valid?  If so, we trust it, and
+	 * only perform the full cancel if the revoke bit is set.  If
+	 * not, we can't trust the revoke bit, and we need to do the
+	 * full search for a revoke record. */
+	if (test_set_buffer_revokevalid(bh)) {
+		need_cancel = test_clear_buffer_revoked(bh);
+	} else {
+		need_cancel = 1;
+		clear_buffer_revoked(bh);
+	}
+
+	if (need_cancel) {
+		record = find_revoke_record(journal, bh->b_blocknr);
+		if (record) {
+			jbd_debug(4, "cancelled existing revoke on "
+				  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
+			spin_lock(&journal->j_revoke_lock);
+			list_del(&record->hash);
+			spin_unlock(&journal->j_revoke_lock);
+			kmem_cache_free(revoke_record_cache, record);
+			did_revoke = 1;
+		}
+	}
+
+#ifdef JBD_EXPENSIVE_CHECKING
+	/* There better not be one left behind by now! */
+	record = find_revoke_record(journal, bh->b_blocknr);
+	J_ASSERT_JH(jh, record == NULL);
+#endif
+
+	/* Finally, have we just cleared revoke on an unhashed
+	 * buffer_head?  If so, we'd better make sure we clear the
+	 * revoked status on any hashed alias too, otherwise the revoke
+	 * state machine will get very upset later on. */
+	if (need_cancel) {
+		struct buffer_head *bh2;
+		bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+		if (bh2) {
+			if (bh2 != bh)
+				clear_buffer_revoked(bh2);
+			__brelse(bh2);
+		}
+	}
+	return did_revoke;
+}
+
+/*
+ * journal_clear_revoked_flags clears revoked flag of buffers in
+ * revoke table to reflect there is no revoked buffer in the next
+ * transaction which is going to be started.
+ */
+void journal_clear_buffer_revoked_flags(journal_t *journal)
+{
+	struct jbd_revoke_table_s *revoke = journal->j_revoke;
+	int i = 0;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		struct list_head *hash_list;
+		struct list_head *list_entry;
+		hash_list = &revoke->hash_table[i];
+
+		list_for_each(list_entry, hash_list) {
+			struct jbd_revoke_record_s *record;
+			struct buffer_head *bh;
+			record = (struct jbd_revoke_record_s *)list_entry;
+			bh = __find_get_block(journal->j_fs_dev,
+					      record->blocknr,
+					      journal->j_blocksize);
+			if (bh) {
+				clear_buffer_revoked(bh);
+				__brelse(bh);
+			}
+		}
+	}
+}
+
+/* journal_switch_revoke table select j_revoke for next transaction
+ * we do not want to suspend any processing until all revokes are
+ * written -bzzz
+ */
+void journal_switch_revoke_table(journal_t *journal)
+{
+	int i;
+
+	if (journal->j_revoke == journal->j_revoke_table[0])
+		journal->j_revoke = journal->j_revoke_table[1];
+	else
+		journal->j_revoke = journal->j_revoke_table[0];
+
+	for (i = 0; i < journal->j_revoke->hash_size; i++)
+		INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
+}
+
+/*
+ * Write revoke records to the journal for all entries in the current
+ * revoke hash, deleting the entries as we go.
+ */
+void journal_write_revoke_records(journal_t *journal,
+				  transaction_t *transaction, int write_op)
+{
+	struct journal_head *descriptor;
+	struct jbd_revoke_record_s *record;
+	struct jbd_revoke_table_s *revoke;
+	struct list_head *hash_list;
+	int i, offset, count;
+
+	descriptor = NULL;
+	offset = 0;
+	count = 0;
+
+	/* select revoke table for committing transaction */
+	revoke = journal->j_revoke == journal->j_revoke_table[0] ?
+		journal->j_revoke_table[1] : journal->j_revoke_table[0];
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		hash_list = &revoke->hash_table[i];
+
+		while (!list_empty(hash_list)) {
+			record = (struct jbd_revoke_record_s *)
+				hash_list->next;
+			write_one_revoke_record(journal, transaction,
+						&descriptor, &offset,
+						record, write_op);
+			count++;
+			list_del(&record->hash);
+			kmem_cache_free(revoke_record_cache, record);
+		}
+	}
+	if (descriptor)
+		flush_descriptor(journal, descriptor, offset, write_op);
+	jbd_debug(1, "Wrote %d revoke records\n", count);
+}
+
+/*
+ * Write out one revoke record.  We need to create a new descriptor
+ * block if the old one is full or if we have not already created one.
+ */
+
+static void write_one_revoke_record(journal_t *journal,
+				    transaction_t *transaction,
+				    struct journal_head **descriptorp,
+				    int *offsetp,
+				    struct jbd_revoke_record_s *record,
+				    int write_op)
+{
+	struct journal_head *descriptor;
+	int offset;
+	journal_header_t *header;
+
+	/* If we are already aborting, this all becomes a noop.  We
+           still need to go round the loop in
+           journal_write_revoke_records in order to free all of the
+           revoke records: only the IO to the journal is omitted. */
+	if (is_journal_aborted(journal))
+		return;
+
+	descriptor = *descriptorp;
+	offset = *offsetp;
+
+	/* Make sure we have a descriptor with space left for the record */
+	if (descriptor) {
+		if (offset == journal->j_blocksize) {
+			flush_descriptor(journal, descriptor, offset, write_op);
+			descriptor = NULL;
+		}
+	}
+
+	if (!descriptor) {
+		descriptor = journal_get_descriptor_buffer(journal);
+		if (!descriptor)
+			return;
+		header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+		header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
+		header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
+		header->h_sequence  = cpu_to_be32(transaction->t_tid);
+
+		/* Record it so that we can wait for IO completion later */
+		JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
+		journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+
+		offset = sizeof(journal_revoke_header_t);
+		*descriptorp = descriptor;
+	}
+
+	* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+		cpu_to_be32(record->blocknr);
+	offset += 4;
+	*offsetp = offset;
+}
+
+/*
+ * Flush a revoke descriptor out to the journal.  If we are aborting,
+ * this is a noop; otherwise we are generating a buffer which needs to
+ * be waited for during commit, so it has to go onto the appropriate
+ * journal buffer list.
+ */
+
+static void flush_descriptor(journal_t *journal,
+			     struct journal_head *descriptor,
+			     int offset, int write_op)
+{
+	journal_revoke_header_t *header;
+	struct buffer_head *bh = jh2bh(descriptor);
+
+	if (is_journal_aborted(journal)) {
+		put_bh(bh);
+		return;
+	}
+
+	header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+	header->r_count = cpu_to_be32(offset);
+	set_buffer_jwrite(bh);
+	BUFFER_TRACE(bh, "write");
+	set_buffer_dirty(bh);
+	write_dirty_buffer(bh, write_op);
+}
+#endif
+
+/*
+ * Revoke support for recovery.
+ *
+ * Recovery needs to be able to:
+ *
+ *  record all revoke records, including the tid of the latest instance
+ *  of each revoke in the journal
+ *
+ *  check whether a given block in a given transaction should be replayed
+ *  (ie. has not been revoked by a revoke record in that or a subsequent
+ *  transaction)
+ *
+ *  empty the revoke table after recovery.
+ */
+
+/*
+ * First, setting revoke records.  We create a new revoke record for
+ * every block ever revoked in the log as we scan it for recovery, and
+ * we update the existing records if we find multiple revokes for a
+ * single block.
+ */
+
+int journal_set_revoke(journal_t *journal,
+		       unsigned int blocknr,
+		       tid_t sequence)
+{
+	struct jbd_revoke_record_s *record;
+
+	record = find_revoke_record(journal, blocknr);
+	if (record) {
+		/* If we have multiple occurrences, only record the
+		 * latest sequence number in the hashed record */
+		if (tid_gt(sequence, record->sequence))
+			record->sequence = sequence;
+		return 0;
+	}
+	return insert_revoke_hash(journal, blocknr, sequence);
+}
+
+/*
+ * Test revoke records.  For a given block referenced in the log, has
+ * that block been revoked?  A revoke record with a given transaction
+ * sequence number revokes all blocks in that transaction and earlier
+ * ones, but later transactions still need replayed.
+ */
+
+int journal_test_revoke(journal_t *journal,
+			unsigned int blocknr,
+			tid_t sequence)
+{
+	struct jbd_revoke_record_s *record;
+
+	record = find_revoke_record(journal, blocknr);
+	if (!record)
+		return 0;
+	if (tid_gt(sequence, record->sequence))
+		return 0;
+	return 1;
+}
+
+/*
+ * Finally, once recovery is over, we need to clear the revoke table so
+ * that it can be reused by the running filesystem.
+ */
+
+void journal_clear_revoke(journal_t *journal)
+{
+	int i;
+	struct list_head *hash_list;
+	struct jbd_revoke_record_s *record;
+	struct jbd_revoke_table_s *revoke;
+
+	revoke = journal->j_revoke;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		hash_list = &revoke->hash_table[i];
+		while (!list_empty(hash_list)) {
+			record = (struct jbd_revoke_record_s*) hash_list->next;
+			list_del(&record->hash);
+			kmem_cache_free(revoke_record_cache, record);
+		}
+	}
+}
diff --git a/kernel/fs/jbd/transaction.c b/kernel/fs/jbd/transaction.c
new file mode 100644
index 000000000..1695ba833
--- /dev/null
+++ b/kernel/fs/jbd/transaction.c
@@ -0,0 +1,2237 @@
+/*
+ * linux/fs/jbd/transaction.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem transaction handling code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages transactions (compound commits managed by the
+ * journaling code) and handles (individual atomic operations by the
+ * filesystem).
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hrtimer.h>
+
+static void __journal_temp_unlink_buffer(struct journal_head *jh);
+
+/*
+ * get_transaction: obtain a new transaction_t object.
+ *
+ * Simply allocate and initialise a new transaction.  Create it in
+ * RUNNING state and add it to the current journal (which should not
+ * have an existing running transaction: we only make a new transaction
+ * once we have started to commit the old one).
+ *
+ * Preconditions:
+ *	The journal MUST be locked.  We don't perform atomic mallocs on the
+ *	new transaction	and we can't block without protecting against other
+ *	processes trying to touch the journal while it is in transition.
+ *
+ * Called under j_state_lock
+ */
+
+static transaction_t *
+get_transaction(journal_t *journal, transaction_t *transaction)
+{
+	transaction->t_journal = journal;
+	transaction->t_state = T_RUNNING;
+	transaction->t_start_time = ktime_get();
+	transaction->t_tid = journal->j_transaction_sequence++;
+	transaction->t_expires = jiffies + journal->j_commit_interval;
+	spin_lock_init(&transaction->t_handle_lock);
+
+	/* Set up the commit timer for the new transaction. */
+	journal->j_commit_timer.expires =
+				round_jiffies_up(transaction->t_expires);
+	add_timer(&journal->j_commit_timer);
+
+	J_ASSERT(journal->j_running_transaction == NULL);
+	journal->j_running_transaction = transaction;
+
+	return transaction;
+}
+
+/*
+ * Handle management.
+ *
+ * A handle_t is an object which represents a single atomic update to a
+ * filesystem, and which tracks all of the modifications which form part
+ * of that one update.
+ */
+
+/*
+ * start_this_handle: Given a handle, deal with any locking or stalling
+ * needed to make sure that there is enough journal space for the handle
+ * to begin.  Attach the handle to a transaction and set up the
+ * transaction's buffer credits.
+ */
+
+static int start_this_handle(journal_t *journal, handle_t *handle)
+{
+	transaction_t *transaction;
+	int needed;
+	int nblocks = handle->h_buffer_credits;
+	transaction_t *new_transaction = NULL;
+	int ret = 0;
+
+	if (nblocks > journal->j_max_transaction_buffers) {
+		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
+		       current->comm, nblocks,
+		       journal->j_max_transaction_buffers);
+		ret = -ENOSPC;
+		goto out;
+	}
+
+alloc_transaction:
+	if (!journal->j_running_transaction) {
+		new_transaction = kzalloc(sizeof(*new_transaction),
+						GFP_NOFS|__GFP_NOFAIL);
+		if (!new_transaction) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	jbd_debug(3, "New handle %p going live.\n", handle);
+
+repeat:
+
+	/*
+	 * We need to hold j_state_lock until t_updates has been incremented,
+	 * for proper journal barrier handling
+	 */
+	spin_lock(&journal->j_state_lock);
+repeat_locked:
+	if (is_journal_aborted(journal) ||
+	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
+		spin_unlock(&journal->j_state_lock);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/* Wait on the journal's transaction barrier if necessary */
+	if (journal->j_barrier_count) {
+		spin_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_transaction_locked,
+				journal->j_barrier_count == 0);
+		goto repeat;
+	}
+
+	if (!journal->j_running_transaction) {
+		if (!new_transaction) {
+			spin_unlock(&journal->j_state_lock);
+			goto alloc_transaction;
+		}
+		get_transaction(journal, new_transaction);
+		new_transaction = NULL;
+	}
+
+	transaction = journal->j_running_transaction;
+
+	/*
+	 * If the current transaction is locked down for commit, wait for the
+	 * lock to be released.
+	 */
+	if (transaction->t_state == T_LOCKED) {
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(&journal->j_wait_transaction_locked,
+					&wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_transaction_locked, &wait);
+		goto repeat;
+	}
+
+	/*
+	 * If there is not enough space left in the log to write all potential
+	 * buffers requested by this operation, we need to stall pending a log
+	 * checkpoint to free some more log space.
+	 */
+	spin_lock(&transaction->t_handle_lock);
+	needed = transaction->t_outstanding_credits + nblocks;
+
+	if (needed > journal->j_max_transaction_buffers) {
+		/*
+		 * If the current transaction is already too large, then start
+		 * to commit it: we can then go back and attach this handle to
+		 * a new transaction.
+		 */
+		DEFINE_WAIT(wait);
+
+		jbd_debug(2, "Handle %p starting new commit...\n", handle);
+		spin_unlock(&transaction->t_handle_lock);
+		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+				TASK_UNINTERRUPTIBLE);
+		__log_start_commit(journal, transaction->t_tid);
+		spin_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_transaction_locked, &wait);
+		goto repeat;
+	}
+
+	/*
+	 * The commit code assumes that it can get enough log space
+	 * without forcing a checkpoint.  This is *critical* for
+	 * correctness: a checkpoint of a buffer which is also
+	 * associated with a committing transaction creates a deadlock,
+	 * so commit simply cannot force through checkpoints.
+	 *
+	 * We must therefore ensure the necessary space in the journal
+	 * *before* starting to dirty potentially checkpointed buffers
+	 * in the new transaction.
+	 *
+	 * The worst part is, any transaction currently committing can
+	 * reduce the free space arbitrarily.  Be careful to account for
+	 * those buffers when checkpointing.
+	 */
+
+	/*
+	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
+	 * a _lot_ of headroom: 1/4 of the journal plus the size of
+	 * the committing transaction.  Really, we only need to give it
+	 * committing_transaction->t_outstanding_credits plus "enough" for
+	 * the log control blocks.
+	 * Also, this test is inconsistent with the matching one in
+	 * journal_extend().
+	 */
+	if (__log_space_left(journal) < jbd_space_needed(journal)) {
+		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+		spin_unlock(&transaction->t_handle_lock);
+		__log_wait_for_space(journal);
+		goto repeat_locked;
+	}
+
+	/* OK, account for the buffers that this operation expects to
+	 * use and add the handle to the running transaction. */
+
+	handle->h_transaction = transaction;
+	transaction->t_outstanding_credits += nblocks;
+	transaction->t_updates++;
+	transaction->t_handle_count++;
+	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+		  handle, nblocks, transaction->t_outstanding_credits,
+		  __log_space_left(journal));
+	spin_unlock(&transaction->t_handle_lock);
+	spin_unlock(&journal->j_state_lock);
+
+	lock_map_acquire(&handle->h_lockdep_map);
+out:
+	if (unlikely(new_transaction))		/* It's usually NULL */
+		kfree(new_transaction);
+	return ret;
+}
+
+static struct lock_class_key jbd_handle_key;
+
+/* Allocate a new handle.  This should probably be in a slab... */
+static handle_t *new_handle(int nblocks)
+{
+	handle_t *handle = jbd_alloc_handle(GFP_NOFS);
+	if (!handle)
+		return NULL;
+	handle->h_buffer_credits = nblocks;
+	handle->h_ref = 1;
+
+	lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
+
+	return handle;
+}
+
+/**
+ * handle_t *journal_start() - Obtain a new handle.
+ * @journal: Journal to start transaction on.
+ * @nblocks: number of block buffer we might modify
+ *
+ * We make sure that the transaction can guarantee at least nblocks of
+ * modified buffers in the log.  We block until the log can guarantee
+ * that much space.
+ *
+ * This function is visible to journal users (like ext3fs), so is not
+ * called with the journal already locked.
+ *
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
+ */
+handle_t *journal_start(journal_t *journal, int nblocks)
+{
+	handle_t *handle = journal_current_handle();
+	int err;
+
+	if (!journal)
+		return ERR_PTR(-EROFS);
+
+	if (handle) {
+		J_ASSERT(handle->h_transaction->t_journal == journal);
+		handle->h_ref++;
+		return handle;
+	}
+
+	handle = new_handle(nblocks);
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
+	current->journal_info = handle;
+
+	err = start_this_handle(journal, handle);
+	if (err < 0) {
+		jbd_free_handle(handle);
+		current->journal_info = NULL;
+		handle = ERR_PTR(err);
+	}
+	return handle;
+}
+
+/**
+ * int journal_extend() - extend buffer credits.
+ * @handle:  handle to 'extend'
+ * @nblocks: nr blocks to try to extend by.
+ *
+ * Some transactions, such as large extends and truncates, can be done
+ * atomically all at once or in several stages.  The operation requests
+ * a credit for a number of buffer modications in advance, but can
+ * extend its credit if it needs more.
+ *
+ * journal_extend tries to give the running handle more buffer credits.
+ * It does not guarantee that allocation - this is a best-effort only.
+ * The calling process MUST be able to deal cleanly with a failure to
+ * extend here.
+ *
+ * Return 0 on success, non-zero on failure.
+ *
+ * return code < 0 implies an error
+ * return code > 0 implies normal transaction-full status.
+ */
+int journal_extend(handle_t *handle, int nblocks)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int result;
+	int wanted;
+
+	result = -EIO;
+	if (is_handle_aborted(handle))
+		goto out;
+
+	result = 1;
+
+	spin_lock(&journal->j_state_lock);
+
+	/* Don't extend a locked-down transaction! */
+	if (handle->h_transaction->t_state != T_RUNNING) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "transaction not running\n", handle, nblocks);
+		goto error_out;
+	}
+
+	spin_lock(&transaction->t_handle_lock);
+	wanted = transaction->t_outstanding_credits + nblocks;
+
+	if (wanted > journal->j_max_transaction_buffers) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "transaction too large\n", handle, nblocks);
+		goto unlock;
+	}
+
+	if (wanted > __log_space_left(journal)) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "insufficient log space\n", handle, nblocks);
+		goto unlock;
+	}
+
+	handle->h_buffer_credits += nblocks;
+	transaction->t_outstanding_credits += nblocks;
+	result = 0;
+
+	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+unlock:
+	spin_unlock(&transaction->t_handle_lock);
+error_out:
+	spin_unlock(&journal->j_state_lock);
+out:
+	return result;
+}
+
+
+/**
+ * int journal_restart() - restart a handle.
+ * @handle:  handle to restart
+ * @nblocks: nr credits requested
+ *
+ * Restart a handle for a multi-transaction filesystem
+ * operation.
+ *
+ * If the journal_extend() call above fails to grant new buffer credits
+ * to a running handle, a call to journal_restart will commit the
+ * handle's transaction so far and reattach the handle to a new
+ * transaction capabable of guaranteeing the requested number of
+ * credits.
+ */
+
+int journal_restart(handle_t *handle, int nblocks)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int ret;
+
+	/* If we've had an abort of any type, don't even think about
+	 * actually doing the restart! */
+	if (is_handle_aborted(handle))
+		return 0;
+
+	/*
+	 * First unlink the handle from its current transaction, and start the
+	 * commit on that.
+	 */
+	J_ASSERT(transaction->t_updates > 0);
+	J_ASSERT(journal_current_handle() == handle);
+
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&transaction->t_handle_lock);
+	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+	transaction->t_updates--;
+
+	if (!transaction->t_updates)
+		wake_up(&journal->j_wait_updates);
+	spin_unlock(&transaction->t_handle_lock);
+
+	jbd_debug(2, "restarting handle %p\n", handle);
+	__log_start_commit(journal, transaction->t_tid);
+	spin_unlock(&journal->j_state_lock);
+
+	lock_map_release(&handle->h_lockdep_map);
+	handle->h_buffer_credits = nblocks;
+	ret = start_this_handle(journal, handle);
+	return ret;
+}
+
+
+/**
+ * void journal_lock_updates () - establish a transaction barrier.
+ * @journal:  Journal to establish a barrier on.
+ *
+ * This locks out any further updates from being started, and blocks until all
+ * existing updates have completed, returning only once the journal is in a
+ * quiescent state with no updates running.
+ *
+ * We do not use simple mutex for synchronization as there are syscalls which
+ * want to return with filesystem locked and that trips up lockdep. Also
+ * hibernate needs to lock filesystem but locked mutex then blocks hibernation.
+ * Since locking filesystem is rare operation, we use simple counter and
+ * waitqueue for locking.
+ */
+void journal_lock_updates(journal_t *journal)
+{
+	DEFINE_WAIT(wait);
+
+wait:
+	/* Wait for previous locked operation to finish */
+	wait_event(journal->j_wait_transaction_locked,
+		   journal->j_barrier_count == 0);
+
+	spin_lock(&journal->j_state_lock);
+	/*
+	 * Check reliably under the lock whether we are the ones winning the race
+	 * and locking the journal
+	 */
+	if (journal->j_barrier_count > 0) {
+		spin_unlock(&journal->j_state_lock);
+		goto wait;
+	}
+	++journal->j_barrier_count;
+
+	/* Wait until there are no running updates */
+	while (1) {
+		transaction_t *transaction = journal->j_running_transaction;
+
+		if (!transaction)
+			break;
+
+		spin_lock(&transaction->t_handle_lock);
+		if (!transaction->t_updates) {
+			spin_unlock(&transaction->t_handle_lock);
+			break;
+		}
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+				TASK_UNINTERRUPTIBLE);
+		spin_unlock(&transaction->t_handle_lock);
+		spin_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_updates, &wait);
+		spin_lock(&journal->j_state_lock);
+	}
+	spin_unlock(&journal->j_state_lock);
+}
+
+/**
+ * void journal_unlock_updates (journal_t* journal) - release barrier
+ * @journal:  Journal to release the barrier on.
+ *
+ * Release a transaction barrier obtained with journal_lock_updates().
+ */
+void journal_unlock_updates (journal_t *journal)
+{
+	J_ASSERT(journal->j_barrier_count != 0);
+
+	spin_lock(&journal->j_state_lock);
+	--journal->j_barrier_count;
+	spin_unlock(&journal->j_state_lock);
+	wake_up(&journal->j_wait_transaction_locked);
+}
+
+static void warn_dirty_buffer(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+
+	printk(KERN_WARNING
+	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+	       "There's a risk of filesystem corruption in case of system "
+	       "crash.\n",
+	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
+}
+
+/*
+ * If the buffer is already part of the current transaction, then there
+ * is nothing we need to do.  If it is already part of a prior
+ * transaction which we are still committing to disk, then we need to
+ * make sure that we do not overwrite the old copy: we do copy-out to
+ * preserve the copy going to disk.  We also account the buffer against
+ * the handle's metadata buffer credits (unless the buffer is already
+ * part of the transaction, that is).
+ *
+ */
+static int
+do_get_write_access(handle_t *handle, struct journal_head *jh,
+			int force_copy)
+{
+	struct buffer_head *bh;
+	transaction_t *transaction;
+	journal_t *journal;
+	int error;
+	char *frozen_buffer = NULL;
+	int need_copy = 0;
+
+	if (is_handle_aborted(handle))
+		return -EROFS;
+
+	transaction = handle->h_transaction;
+	journal = transaction->t_journal;
+
+	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
+
+	JBUFFER_TRACE(jh, "entry");
+repeat:
+	bh = jh2bh(jh);
+
+	/* @@@ Need to check for errors here at some point. */
+
+	lock_buffer(bh);
+	jbd_lock_bh_state(bh);
+
+	/* We now hold the buffer lock so it is safe to query the buffer
+	 * state.  Is the buffer dirty?
+	 *
+	 * If so, there are two possibilities.  The buffer may be
+	 * non-journaled, and undergoing a quite legitimate writeback.
+	 * Otherwise, it is journaled, and we don't expect dirty buffers
+	 * in that state (the buffers should be marked JBD_Dirty
+	 * instead.)  So either the IO is being done under our own
+	 * control and this is a bug, or it's a third party IO such as
+	 * dump(8) (which may leave the buffer scheduled for read ---
+	 * ie. locked but not dirty) or tune2fs (which may actually have
+	 * the buffer dirtied, ugh.)  */
+
+	if (buffer_dirty(bh)) {
+		/*
+		 * First question: is this buffer already part of the current
+		 * transaction or the existing committing transaction?
+		 */
+		if (jh->b_transaction) {
+			J_ASSERT_JH(jh,
+				jh->b_transaction == transaction ||
+				jh->b_transaction ==
+					journal->j_committing_transaction);
+			if (jh->b_next_transaction)
+				J_ASSERT_JH(jh, jh->b_next_transaction ==
+							transaction);
+			warn_dirty_buffer(bh);
+		}
+		/*
+		 * In any case we need to clean the dirty flag and we must
+		 * do it under the buffer lock to be sure we don't race
+		 * with running write-out.
+		 */
+		JBUFFER_TRACE(jh, "Journalling dirty buffer");
+		clear_buffer_dirty(bh);
+		set_buffer_jbddirty(bh);
+	}
+
+	unlock_buffer(bh);
+
+	error = -EROFS;
+	if (is_handle_aborted(handle)) {
+		jbd_unlock_bh_state(bh);
+		goto out;
+	}
+	error = 0;
+
+	/*
+	 * The buffer is already part of this transaction if b_transaction or
+	 * b_next_transaction points to it
+	 */
+	if (jh->b_transaction == transaction ||
+	    jh->b_next_transaction == transaction)
+		goto done;
+
+	/*
+	 * this is the first time this transaction is touching this buffer,
+	 * reset the modified flag
+	 */
+	jh->b_modified = 0;
+
+	/*
+	 * If there is already a copy-out version of this buffer, then we don't
+	 * need to make another one
+	 */
+	if (jh->b_frozen_data) {
+		JBUFFER_TRACE(jh, "has frozen data");
+		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+		jh->b_next_transaction = transaction;
+		goto done;
+	}
+
+	/* Is there data here we need to preserve? */
+
+	if (jh->b_transaction && jh->b_transaction != transaction) {
+		JBUFFER_TRACE(jh, "owned by older transaction");
+		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+		J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_committing_transaction);
+
+		/* There is one case we have to be very careful about.
+		 * If the committing transaction is currently writing
+		 * this buffer out to disk and has NOT made a copy-out,
+		 * then we cannot modify the buffer contents at all
+		 * right now.  The essence of copy-out is that it is the
+		 * extra copy, not the primary copy, which gets
+		 * journaled.  If the primary copy is already going to
+		 * disk then we cannot do copy-out here. */
+
+		if (jh->b_jlist == BJ_Shadow) {
+			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
+			wait_queue_head_t *wqh;
+
+			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
+
+			JBUFFER_TRACE(jh, "on shadow: sleep");
+			jbd_unlock_bh_state(bh);
+			/* commit wakes up all shadow buffers after IO */
+			for ( ; ; ) {
+				prepare_to_wait(wqh, &wait.wait,
+						TASK_UNINTERRUPTIBLE);
+				if (jh->b_jlist != BJ_Shadow)
+					break;
+				schedule();
+			}
+			finish_wait(wqh, &wait.wait);
+			goto repeat;
+		}
+
+		/* Only do the copy if the currently-owning transaction
+		 * still needs it.  If it is on the Forget list, the
+		 * committing transaction is past that stage.  The
+		 * buffer had better remain locked during the kmalloc,
+		 * but that should be true --- we hold the journal lock
+		 * still and the buffer is already on the BUF_JOURNAL
+		 * list so won't be flushed.
+		 *
+		 * Subtle point, though: if this is a get_undo_access,
+		 * then we will be relying on the frozen_data to contain
+		 * the new value of the committed_data record after the
+		 * transaction, so we HAVE to force the frozen_data copy
+		 * in that case. */
+
+		if (jh->b_jlist != BJ_Forget || force_copy) {
+			JBUFFER_TRACE(jh, "generate frozen data");
+			if (!frozen_buffer) {
+				JBUFFER_TRACE(jh, "allocate memory for buffer");
+				jbd_unlock_bh_state(bh);
+				frozen_buffer =
+					jbd_alloc(jh2bh(jh)->b_size,
+							 GFP_NOFS);
+				if (!frozen_buffer) {
+					printk(KERN_ERR
+					       "%s: OOM for frozen_buffer\n",
+					       __func__);
+					JBUFFER_TRACE(jh, "oom!");
+					error = -ENOMEM;
+					jbd_lock_bh_state(bh);
+					goto done;
+				}
+				goto repeat;
+			}
+			jh->b_frozen_data = frozen_buffer;
+			frozen_buffer = NULL;
+			need_copy = 1;
+		}
+		jh->b_next_transaction = transaction;
+	}
+
+
+	/*
+	 * Finally, if the buffer is not journaled right now, we need to make
+	 * sure it doesn't get written to disk before the caller actually
+	 * commits the new data
+	 */
+	if (!jh->b_transaction) {
+		JBUFFER_TRACE(jh, "no transaction");
+		J_ASSERT_JH(jh, !jh->b_next_transaction);
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		spin_lock(&journal->j_list_lock);
+		__journal_file_buffer(jh, transaction, BJ_Reserved);
+		spin_unlock(&journal->j_list_lock);
+	}
+
+done:
+	if (need_copy) {
+		struct page *page;
+		int offset;
+		char *source;
+
+		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
+			    "Possible IO failure.\n");
+		page = jh2bh(jh)->b_page;
+		offset = offset_in_page(jh2bh(jh)->b_data);
+		source = kmap_atomic(page);
+		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
+		kunmap_atomic(source);
+	}
+	jbd_unlock_bh_state(bh);
+
+	/*
+	 * If we are about to journal a buffer, then any revoke pending on it is
+	 * no longer valid
+	 */
+	journal_cancel_revoke(handle, jh);
+
+out:
+	if (unlikely(frozen_buffer))	/* It's usually NULL */
+		jbd_free(frozen_buffer, bh->b_size);
+
+	JBUFFER_TRACE(jh, "exit");
+	return error;
+}
+
+/**
+ * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
+ * @handle: transaction to add buffer modifications to
+ * @bh:     bh to be used for metadata writes
+ *
+ * Returns an error code or 0 on success.
+ *
+ * In full data journalling mode the buffer may be of type BJ_AsyncData,
+ * because we're write()ing a buffer which is also part of a shared mapping.
+ */
+
+int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
+{
+	struct journal_head *jh = journal_add_journal_head(bh);
+	int rc;
+
+	/* We do not want to get caught playing with fields which the
+	 * log thread also manipulates.  Make sure that the buffer
+	 * completes any outstanding IO before proceeding. */
+	rc = do_get_write_access(handle, jh, 0);
+	journal_put_journal_head(jh);
+	return rc;
+}
+
+
+/*
+ * When the user wants to journal a newly created buffer_head
+ * (ie. getblk() returned a new buffer and we are going to populate it
+ * manually rather than reading off disk), then we need to keep the
+ * buffer_head locked until it has been completely filled with new
+ * data.  In this case, we should be able to make the assertion that
+ * the bh is not already part of an existing transaction.
+ *
+ * The buffer should already be locked by the caller by this point.
+ * There is no lock ranking violation: it was a newly created,
+ * unlocked buffer beforehand. */
+
+/**
+ * int journal_get_create_access () - notify intent to use newly created bh
+ * @handle: transaction to new buffer to
+ * @bh: new buffer.
+ *
+ * Call this if you create a new bh.
+ */
+int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	struct journal_head *jh = journal_add_journal_head(bh);
+	int err;
+
+	jbd_debug(5, "journal_head %p\n", jh);
+	err = -EROFS;
+	if (is_handle_aborted(handle))
+		goto out;
+	err = 0;
+
+	JBUFFER_TRACE(jh, "entry");
+	/*
+	 * The buffer may already belong to this transaction due to pre-zeroing
+	 * in the filesystem's new_block code.  It may also be on the previous,
+	 * committing transaction's lists, but it HAS to be in Forget state in
+	 * that case: the transaction must have deleted the buffer for it to be
+	 * reused here.
+	 */
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+		jh->b_transaction == NULL ||
+		(jh->b_transaction == journal->j_committing_transaction &&
+			  jh->b_jlist == BJ_Forget)));
+
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+	if (jh->b_transaction == NULL) {
+		/*
+		 * Previous journal_forget() could have left the buffer
+		 * with jbddirty bit set because it was being committed. When
+		 * the commit finished, we've filed the buffer for
+		 * checkpointing and marked it dirty. Now we are reallocating
+		 * the buffer so the transaction freeing it must have
+		 * committed and so it's safe to clear the dirty bit.
+		 */
+		clear_buffer_dirty(jh2bh(jh));
+
+		/* first access by this transaction */
+		jh->b_modified = 0;
+
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		__journal_file_buffer(jh, transaction, BJ_Reserved);
+	} else if (jh->b_transaction == journal->j_committing_transaction) {
+		/* first access by this transaction */
+		jh->b_modified = 0;
+
+		JBUFFER_TRACE(jh, "set next transaction");
+		jh->b_next_transaction = transaction;
+	}
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+
+	/*
+	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
+	 * blocks which contain freed but then revoked metadata.  We need
+	 * to cancel the revoke in case we end up freeing it yet again
+	 * and the reallocating as data - this would cause a second revoke,
+	 * which hits an assertion error.
+	 */
+	JBUFFER_TRACE(jh, "cancelling revoke");
+	journal_cancel_revoke(handle, jh);
+out:
+	journal_put_journal_head(jh);
+	return err;
+}
+
+/**
+ * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
+ * @handle: transaction
+ * @bh: buffer to undo
+ *
+ * Sometimes there is a need to distinguish between metadata which has
+ * been committed to disk and that which has not.  The ext3fs code uses
+ * this for freeing and allocating space, we have to make sure that we
+ * do not reuse freed space until the deallocation has been committed,
+ * since if we overwrote that space we would make the delete
+ * un-rewindable in case of a crash.
+ *
+ * To deal with that, journal_get_undo_access requests write access to a
+ * buffer for parts of non-rewindable operations such as delete
+ * operations on the bitmaps.  The journaling code must keep a copy of
+ * the buffer's contents prior to the undo_access call until such time
+ * as we know that the buffer has definitely been committed to disk.
+ *
+ * We never need to know which transaction the committed data is part
+ * of, buffers touched here are guaranteed to be dirtied later and so
+ * will be committed to a new transaction in due course, at which point
+ * we can discard the old committed data pointer.
+ *
+ * Returns error number or 0 on success.
+ */
+int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
+{
+	int err;
+	struct journal_head *jh = journal_add_journal_head(bh);
+	char *committed_data = NULL;
+
+	JBUFFER_TRACE(jh, "entry");
+
+	/*
+	 * Do this first --- it can drop the journal lock, so we want to
+	 * make sure that obtaining the committed_data is done
+	 * atomically wrt. completion of any outstanding commits.
+	 */
+	err = do_get_write_access(handle, jh, 1);
+	if (err)
+		goto out;
+
+repeat:
+	if (!jh->b_committed_data) {
+		committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
+		if (!committed_data) {
+			printk(KERN_ERR "%s: No memory for committed data\n",
+				__func__);
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+
+	jbd_lock_bh_state(bh);
+	if (!jh->b_committed_data) {
+		/* Copy out the current buffer contents into the
+		 * preserved, committed copy. */
+		JBUFFER_TRACE(jh, "generate b_committed data");
+		if (!committed_data) {
+			jbd_unlock_bh_state(bh);
+			goto repeat;
+		}
+
+		jh->b_committed_data = committed_data;
+		committed_data = NULL;
+		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
+	}
+	jbd_unlock_bh_state(bh);
+out:
+	journal_put_journal_head(jh);
+	if (unlikely(committed_data))
+		jbd_free(committed_data, bh->b_size);
+	return err;
+}
+
+/**
+ * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
+ * @handle: transaction
+ * @bh: bufferhead to mark
+ *
+ * Description:
+ * Mark a buffer as containing dirty data which needs to be flushed before
+ * we can commit the current transaction.
+ *
+ * The buffer is placed on the transaction's data list and is marked as
+ * belonging to the transaction.
+ *
+ * Returns error number or 0 on success.
+ *
+ * journal_dirty_data() can be called via page_launder->ext3_writepage
+ * by kswapd.
+ */
+int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
+{
+	journal_t *journal = handle->h_transaction->t_journal;
+	int need_brelse = 0;
+	struct journal_head *jh;
+	int ret = 0;
+
+	if (is_handle_aborted(handle))
+		return ret;
+
+	jh = journal_add_journal_head(bh);
+	JBUFFER_TRACE(jh, "entry");
+
+	/*
+	 * The buffer could *already* be dirty.  Writeout can start
+	 * at any time.
+	 */
+	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
+
+	/*
+	 * What if the buffer is already part of a running transaction?
+	 *
+	 * There are two cases:
+	 * 1) It is part of the current running transaction.  Refile it,
+	 *    just in case we have allocated it as metadata, deallocated
+	 *    it, then reallocated it as data.
+	 * 2) It is part of the previous, still-committing transaction.
+	 *    If all we want to do is to guarantee that the buffer will be
+	 *    written to disk before this new transaction commits, then
+	 *    being sure that the *previous* transaction has this same
+	 *    property is sufficient for us!  Just leave it on its old
+	 *    transaction.
+	 *
+	 * In case (2), the buffer must not already exist as metadata
+	 * --- that would violate write ordering (a transaction is free
+	 * to write its data at any point, even before the previous
+	 * committing transaction has committed).  The caller must
+	 * never, ever allow this to happen: there's nothing we can do
+	 * about it in this layer.
+	 */
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+
+	/* Now that we have bh_state locked, are we really still mapped? */
+	if (!buffer_mapped(bh)) {
+		JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
+		goto no_journal;
+	}
+
+	if (jh->b_transaction) {
+		JBUFFER_TRACE(jh, "has transaction");
+		if (jh->b_transaction != handle->h_transaction) {
+			JBUFFER_TRACE(jh, "belongs to older transaction");
+			J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_committing_transaction);
+
+			/* @@@ IS THIS TRUE  ? */
+			/*
+			 * Not any more.  Scenario: someone does a write()
+			 * in data=journal mode.  The buffer's transaction has
+			 * moved into commit.  Then someone does another
+			 * write() to the file.  We do the frozen data copyout
+			 * and set b_next_transaction to point to j_running_t.
+			 * And while we're in that state, someone does a
+			 * writepage() in an attempt to pageout the same area
+			 * of the file via a shared mapping.  At present that
+			 * calls journal_dirty_data(), and we get right here.
+			 * It may be too late to journal the data.  Simply
+			 * falling through to the next test will suffice: the
+			 * data will be dirty and wil be checkpointed.  The
+			 * ordering comments in the next comment block still
+			 * apply.
+			 */
+			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+
+			/*
+			 * If we're journalling data, and this buffer was
+			 * subject to a write(), it could be metadata, forget
+			 * or shadow against the committing transaction.  Now,
+			 * someone has dirtied the same darn page via a mapping
+			 * and it is being writepage()'d.
+			 * We *could* just steal the page from commit, with some
+			 * fancy locking there.  Instead, we just skip it -
+			 * don't tie the page's buffers to the new transaction
+			 * at all.
+			 * Implication: if we crash before the writepage() data
+			 * is written into the filesystem, recovery will replay
+			 * the write() data.
+			 */
+			if (jh->b_jlist != BJ_None &&
+					jh->b_jlist != BJ_SyncData &&
+					jh->b_jlist != BJ_Locked) {
+				JBUFFER_TRACE(jh, "Not stealing");
+				goto no_journal;
+			}
+
+			/*
+			 * This buffer may be undergoing writeout in commit.  We
+			 * can't return from here and let the caller dirty it
+			 * again because that can cause the write-out loop in
+			 * commit to never terminate.
+			 */
+			if (buffer_dirty(bh)) {
+				get_bh(bh);
+				spin_unlock(&journal->j_list_lock);
+				jbd_unlock_bh_state(bh);
+				need_brelse = 1;
+				sync_dirty_buffer(bh);
+				jbd_lock_bh_state(bh);
+				spin_lock(&journal->j_list_lock);
+				/* Since we dropped the lock... */
+				if (!buffer_mapped(bh)) {
+					JBUFFER_TRACE(jh, "buffer got unmapped");
+					goto no_journal;
+				}
+				/* The buffer may become locked again at any
+				   time if it is redirtied */
+			}
+
+			/*
+			 * We cannot remove the buffer with io error from the
+			 * committing transaction, because otherwise it would
+			 * miss the error and the commit would not abort.
+			 */
+			if (unlikely(!buffer_uptodate(bh))) {
+				ret = -EIO;
+				goto no_journal;
+			}
+			/* We might have slept so buffer could be refiled now */
+			if (jh->b_transaction != NULL &&
+			    jh->b_transaction != handle->h_transaction) {
+				JBUFFER_TRACE(jh, "unfile from commit");
+				__journal_temp_unlink_buffer(jh);
+				/* It still points to the committing
+				 * transaction; move it to this one so
+				 * that the refile assert checks are
+				 * happy. */
+				jh->b_transaction = handle->h_transaction;
+			}
+			/* The buffer will be refiled below */
+
+		}
+		/*
+		 * Special case --- the buffer might actually have been
+		 * allocated and then immediately deallocated in the previous,
+		 * committing transaction, so might still be left on that
+		 * transaction's metadata lists.
+		 */
+		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
+			JBUFFER_TRACE(jh, "not on correct data list: unfile");
+			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
+			JBUFFER_TRACE(jh, "file as data");
+			__journal_file_buffer(jh, handle->h_transaction,
+						BJ_SyncData);
+		}
+	} else {
+		JBUFFER_TRACE(jh, "not on a transaction");
+		__journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
+	}
+no_journal:
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	if (need_brelse) {
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+	}
+	JBUFFER_TRACE(jh, "exit");
+	journal_put_journal_head(jh);
+	return ret;
+}
+
+/**
+ * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
+ * @handle: transaction to add buffer to.
+ * @bh: buffer to mark
+ *
+ * Mark dirty metadata which needs to be journaled as part of the current
+ * transaction.
+ *
+ * The buffer is placed on the transaction's metadata list and is marked
+ * as belonging to the transaction.
+ *
+ * Returns error number or 0 on success.
+ *
+ * Special care needs to be taken if the buffer already belongs to the
+ * current committing transaction (in which case we should have frozen
+ * data present for that commit).  In that case, we don't relink the
+ * buffer: that only gets done when the old transaction finally
+ * completes its commit.
+ */
+int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	struct journal_head *jh = bh2jh(bh);
+
+	jbd_debug(5, "journal_head %p\n", jh);
+	JBUFFER_TRACE(jh, "entry");
+	if (is_handle_aborted(handle))
+		goto out;
+
+	jbd_lock_bh_state(bh);
+
+	if (jh->b_modified == 0) {
+		/*
+		 * This buffer's got modified and becoming part
+		 * of the transaction. This needs to be done
+		 * once a transaction -bzzz
+		 */
+		jh->b_modified = 1;
+		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+		handle->h_buffer_credits--;
+	}
+
+	/*
+	 * fastpath, to avoid expensive locking.  If this buffer is already
+	 * on the running transaction's metadata list there is nothing to do.
+	 * Nobody can take it off again because there is a handle open.
+	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
+	 * result in this test being false, so we go in and take the locks.
+	 */
+	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
+		JBUFFER_TRACE(jh, "fastpath");
+		J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_running_transaction);
+		goto out_unlock_bh;
+	}
+
+	set_buffer_jbddirty(bh);
+
+	/*
+	 * Metadata already on the current transaction list doesn't
+	 * need to be filed.  Metadata on another transaction's list must
+	 * be committing, and will be refiled once the commit completes:
+	 * leave it alone for now.
+	 */
+	if (jh->b_transaction != transaction) {
+		JBUFFER_TRACE(jh, "already on other transaction");
+		J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_committing_transaction);
+		J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+		/* And this case is illegal: we can't reuse another
+		 * transaction's data buffer, ever. */
+		goto out_unlock_bh;
+	}
+
+	/* That test should have eliminated the following case: */
+	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
+
+	JBUFFER_TRACE(jh, "file as BJ_Metadata");
+	spin_lock(&journal->j_list_lock);
+	__journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+	spin_unlock(&journal->j_list_lock);
+out_unlock_bh:
+	jbd_unlock_bh_state(bh);
+out:
+	JBUFFER_TRACE(jh, "exit");
+	return 0;
+}
+
+/*
+ * journal_release_buffer: undo a get_write_access without any buffer
+ * updates, if the update decided in the end that it didn't need access.
+ *
+ */
+void
+journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+{
+	BUFFER_TRACE(bh, "entry");
+}
+
+/**
+ * void journal_forget() - bforget() for potentially-journaled buffers.
+ * @handle: transaction handle
+ * @bh:     bh to 'forget'
+ *
+ * We can only do the bforget if there are no commits pending against the
+ * buffer.  If the buffer is dirty in the current running transaction we
+ * can safely unlink it.
+ *
+ * bh may not be a journalled buffer at all - it may be a non-JBD
+ * buffer which came off the hashtable.  Check for this.
+ *
+ * Decrements bh->b_count by one.
+ *
+ * Allow this call even if the handle has aborted --- it may be part of
+ * the caller's cleanup after an abort.
+ */
+int journal_forget (handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	struct journal_head *jh;
+	int drop_reserve = 0;
+	int err = 0;
+	int was_modified = 0;
+
+	BUFFER_TRACE(bh, "entry");
+
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+
+	if (!buffer_jbd(bh))
+		goto not_jbd;
+	jh = bh2jh(bh);
+
+	/* Critical error: attempting to delete a bitmap buffer, maybe?
+	 * Don't do any jbd operations, and return an error. */
+	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
+			 "inconsistent data on disk")) {
+		err = -EIO;
+		goto not_jbd;
+	}
+
+	/* keep track of whether or not this transaction modified us */
+	was_modified = jh->b_modified;
+
+	/*
+	 * The buffer's going from the transaction, we must drop
+	 * all references -bzzz
+	 */
+	jh->b_modified = 0;
+
+	if (jh->b_transaction == handle->h_transaction) {
+		J_ASSERT_JH(jh, !jh->b_frozen_data);
+
+		/* If we are forgetting a buffer which is already part
+		 * of this transaction, then we can just drop it from
+		 * the transaction immediately. */
+		clear_buffer_dirty(bh);
+		clear_buffer_jbddirty(bh);
+
+		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+
+		/*
+		 * we only want to drop a reference if this transaction
+		 * modified the buffer
+		 */
+		if (was_modified)
+			drop_reserve = 1;
+
+		/*
+		 * We are no longer going to journal this buffer.
+		 * However, the commit of this transaction is still
+		 * important to the buffer: the delete that we are now
+		 * processing might obsolete an old log entry, so by
+		 * committing, we can satisfy the buffer's checkpoint.
+		 *
+		 * So, if we have a checkpoint on the buffer, we should
+		 * now refile the buffer on our BJ_Forget list so that
+		 * we know to remove the checkpoint after we commit.
+		 */
+
+		if (jh->b_cp_transaction) {
+			__journal_temp_unlink_buffer(jh);
+			__journal_file_buffer(jh, transaction, BJ_Forget);
+		} else {
+			__journal_unfile_buffer(jh);
+			if (!buffer_jbd(bh)) {
+				spin_unlock(&journal->j_list_lock);
+				jbd_unlock_bh_state(bh);
+				__bforget(bh);
+				goto drop;
+			}
+		}
+	} else if (jh->b_transaction) {
+		J_ASSERT_JH(jh, (jh->b_transaction ==
+				 journal->j_committing_transaction));
+		/* However, if the buffer is still owned by a prior
+		 * (committing) transaction, we can't drop it yet... */
+		JBUFFER_TRACE(jh, "belongs to older transaction");
+		/* ... but we CAN drop it from the new transaction if we
+		 * have also modified it since the original commit. */
+
+		if (jh->b_next_transaction) {
+			J_ASSERT(jh->b_next_transaction == transaction);
+			jh->b_next_transaction = NULL;
+
+			/*
+			 * only drop a reference if this transaction modified
+			 * the buffer
+			 */
+			if (was_modified)
+				drop_reserve = 1;
+		}
+	}
+
+not_jbd:
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	__brelse(bh);
+drop:
+	if (drop_reserve) {
+		/* no need to reserve log space for this block -bzzz */
+		handle->h_buffer_credits++;
+	}
+	return err;
+}
+
+/**
+ * int journal_stop() - complete a transaction
+ * @handle: tranaction to complete.
+ *
+ * All done for a particular handle.
+ *
+ * There is not much action needed here.  We just return any remaining
+ * buffer credits to the transaction and remove the handle.  The only
+ * complication is that we need to start a commit operation if the
+ * filesystem is marked for synchronous update.
+ *
+ * journal_stop itself will not usually return an error, but it may
+ * do so in unusual circumstances.  In particular, expect it to
+ * return -EIO if a journal_abort has been executed since the
+ * transaction began.
+ */
+int journal_stop(handle_t *handle)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int err;
+	pid_t pid;
+
+	J_ASSERT(journal_current_handle() == handle);
+
+	if (is_handle_aborted(handle))
+		err = -EIO;
+	else {
+		J_ASSERT(transaction->t_updates > 0);
+		err = 0;
+	}
+
+	if (--handle->h_ref > 0) {
+		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+			  handle->h_ref);
+		return err;
+	}
+
+	jbd_debug(4, "Handle %p going down\n", handle);
+
+	/*
+	 * Implement synchronous transaction batching.  If the handle
+	 * was synchronous, don't force a commit immediately.  Let's
+	 * yield and let another thread piggyback onto this transaction.
+	 * Keep doing that while new threads continue to arrive.
+	 * It doesn't cost much - we're about to run a commit and sleep
+	 * on IO anyway.  Speeds up many-threaded, many-dir operations
+	 * by 30x or more...
+	 *
+	 * We try and optimize the sleep time against what the underlying disk
+	 * can do, instead of having a static sleep time.  This is useful for
+	 * the case where our storage is so fast that it is more optimal to go
+	 * ahead and force a flush and wait for the transaction to be committed
+	 * than it is to wait for an arbitrary amount of time for new writers to
+	 * join the transaction.  We achieve this by measuring how long it takes
+	 * to commit a transaction, and compare it with how long this
+	 * transaction has been running, and if run time < commit time then we
+	 * sleep for the delta and commit.  This greatly helps super fast disks
+	 * that would see slowdowns as more threads started doing fsyncs.
+	 *
+	 * But don't do this if this process was the most recent one to
+	 * perform a synchronous write.  We do this to detect the case where a
+	 * single process is doing a stream of sync writes.  No point in waiting
+	 * for joiners in that case.
+	 */
+	pid = current->pid;
+	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+		u64 commit_time, trans_time;
+
+		journal->j_last_sync_writer = pid;
+
+		spin_lock(&journal->j_state_lock);
+		commit_time = journal->j_average_commit_time;
+		spin_unlock(&journal->j_state_lock);
+
+		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+						   transaction->t_start_time));
+
+		commit_time = min_t(u64, commit_time,
+				    1000*jiffies_to_usecs(1));
+
+		if (trans_time < commit_time) {
+			ktime_t expires = ktime_add_ns(ktime_get(),
+						       commit_time);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
+	}
+
+	current->journal_info = NULL;
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&transaction->t_handle_lock);
+	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+	transaction->t_updates--;
+	if (!transaction->t_updates) {
+		wake_up(&journal->j_wait_updates);
+		if (journal->j_barrier_count)
+			wake_up(&journal->j_wait_transaction_locked);
+	}
+
+	/*
+	 * If the handle is marked SYNC, we need to set another commit
+	 * going!  We also want to force a commit if the current
+	 * transaction is occupying too much of the log, or if the
+	 * transaction is too old now.
+	 */
+	if (handle->h_sync ||
+			transaction->t_outstanding_credits >
+				journal->j_max_transaction_buffers ||
+			time_after_eq(jiffies, transaction->t_expires)) {
+		/* Do this even for aborted journals: an abort still
+		 * completes the commit thread, it just doesn't write
+		 * anything to disk. */
+		tid_t tid = transaction->t_tid;
+
+		spin_unlock(&transaction->t_handle_lock);
+		jbd_debug(2, "transaction too old, requesting commit for "
+					"handle %p\n", handle);
+		/* This is non-blocking */
+		__log_start_commit(journal, transaction->t_tid);
+		spin_unlock(&journal->j_state_lock);
+
+		/*
+		 * Special case: JFS_SYNC synchronous updates require us
+		 * to wait for the commit to complete.
+		 */
+		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+			err = log_wait_commit(journal, tid);
+	} else {
+		spin_unlock(&transaction->t_handle_lock);
+		spin_unlock(&journal->j_state_lock);
+	}
+
+	lock_map_release(&handle->h_lockdep_map);
+
+	jbd_free_handle(handle);
+	return err;
+}
+
+/**
+ * int journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * For synchronous operations: force any uncommitted transactions
+ * to disk.  May seem kludgy, but it reuses all the handle batching
+ * code in a very simple manner.
+ */
+int journal_force_commit(journal_t *journal)
+{
+	handle_t *handle;
+	int ret;
+
+	handle = journal_start(journal, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+	} else {
+		handle->h_sync = 1;
+		ret = journal_stop(handle);
+	}
+	return ret;
+}
+
+/*
+ *
+ * List management code snippets: various functions for manipulating the
+ * transaction buffer lists.
+ *
+ */
+
+/*
+ * Append a buffer to a transaction list, given the transaction's list head
+ * pointer.
+ *
+ * j_list_lock is held.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+{
+	if (!*list) {
+		jh->b_tnext = jh->b_tprev = jh;
+		*list = jh;
+	} else {
+		/* Insert at the tail of the list to preserve order */
+		struct journal_head *first = *list, *last = first->b_tprev;
+		jh->b_tprev = last;
+		jh->b_tnext = first;
+		last->b_tnext = first->b_tprev = jh;
+	}
+}
+
+/*
+ * Remove a buffer from a transaction list, given the transaction's list
+ * head pointer.
+ *
+ * Called with j_list_lock held, and the journal may not be locked.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+{
+	if (*list == jh) {
+		*list = jh->b_tnext;
+		if (*list == jh)
+			*list = NULL;
+	}
+	jh->b_tprev->b_tnext = jh->b_tnext;
+	jh->b_tnext->b_tprev = jh->b_tprev;
+}
+
+/*
+ * Remove a buffer from the appropriate transaction list.
+ *
+ * Note that this function can *change* the value of
+ * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
+ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+ * is holding onto a copy of one of thee pointers, it could go bad.
+ * Generally the caller needs to re-read the pointer from the transaction_t.
+ *
+ * Called under j_list_lock.  The journal may not be locked.
+ */
+static void __journal_temp_unlink_buffer(struct journal_head *jh)
+{
+	struct journal_head **list = NULL;
+	transaction_t *transaction;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	transaction = jh->b_transaction;
+	if (transaction)
+		assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+	if (jh->b_jlist != BJ_None)
+		J_ASSERT_JH(jh, transaction != NULL);
+
+	switch (jh->b_jlist) {
+	case BJ_None:
+		return;
+	case BJ_SyncData:
+		list = &transaction->t_sync_datalist;
+		break;
+	case BJ_Metadata:
+		transaction->t_nr_buffers--;
+		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+		list = &transaction->t_buffers;
+		break;
+	case BJ_Forget:
+		list = &transaction->t_forget;
+		break;
+	case BJ_IO:
+		list = &transaction->t_iobuf_list;
+		break;
+	case BJ_Shadow:
+		list = &transaction->t_shadow_list;
+		break;
+	case BJ_LogCtl:
+		list = &transaction->t_log_list;
+		break;
+	case BJ_Reserved:
+		list = &transaction->t_reserved_list;
+		break;
+	case BJ_Locked:
+		list = &transaction->t_locked_list;
+		break;
+	}
+
+	__blist_del_buffer(list, jh);
+	jh->b_jlist = BJ_None;
+	if (test_clear_buffer_jbddirty(bh))
+		mark_buffer_dirty(bh);	/* Expose it to the VM */
+}
+
+/*
+ * Remove buffer from all transactions.
+ *
+ * Called with bh_state lock and j_list_lock
+ *
+ * jh and bh may be already freed when this function returns.
+ */
+void __journal_unfile_buffer(struct journal_head *jh)
+{
+	__journal_temp_unlink_buffer(jh);
+	jh->b_transaction = NULL;
+	journal_put_journal_head(jh);
+}
+
+void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	/* Get reference so that buffer cannot be freed before we unlock it */
+	get_bh(bh);
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	__journal_unfile_buffer(jh);
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	__brelse(bh);
+}
+
+/*
+ * Called from journal_try_to_free_buffers().
+ *
+ * Called under jbd_lock_bh_state(bh)
+ */
+static void
+__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
+{
+	struct journal_head *jh;
+
+	jh = bh2jh(bh);
+
+	if (buffer_locked(bh) || buffer_dirty(bh))
+		goto out;
+
+	if (jh->b_next_transaction != NULL)
+		goto out;
+
+	spin_lock(&journal->j_list_lock);
+	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
+		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
+			/* A written-back ordered data buffer */
+			JBUFFER_TRACE(jh, "release data");
+			__journal_unfile_buffer(jh);
+		}
+	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+		/* written-back checkpointed metadata buffer */
+		if (jh->b_jlist == BJ_None) {
+			JBUFFER_TRACE(jh, "remove from checkpoint list");
+			__journal_remove_checkpoint(jh);
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+out:
+	return;
+}
+
+/**
+ * int journal_try_to_free_buffers() - try to free page buffers.
+ * @journal: journal for operation
+ * @page: to try and free
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
+ *
+ *
+ * For all the buffers on this page,
+ * if they are fully written out ordered data, move them onto BUF_CLEAN
+ * so try_to_free_buffers() can reap them.
+ *
+ * This function returns non-zero if we wish try_to_free_buffers()
+ * to be called. We do this if the page is releasable by try_to_free_buffers().
+ * We also do it if the page has locked or dirty buffers and the caller wants
+ * us to perform sync or async writeout.
+ *
+ * This complicates JBD locking somewhat.  We aren't protected by the
+ * BKL here.  We wish to remove the buffer from its committing or
+ * running transaction's ->t_datalist via __journal_unfile_buffer.
+ *
+ * This may *change* the value of transaction_t->t_datalist, so anyone
+ * who looks at t_datalist needs to lock against this function.
+ *
+ * Even worse, someone may be doing a journal_dirty_data on this
+ * buffer.  So we need to lock against that.  journal_dirty_data()
+ * will come out of the lock with the buffer dirty, which makes it
+ * ineligible for release here.
+ *
+ * Who else is affected by this?  hmm...  Really the only contender
+ * is do_get_write_access() - it could be looking at the buffer while
+ * journal_try_to_free_buffer() is changing its state.  But that
+ * cannot happen because we never reallocate freed data as metadata
+ * while the data is part of a transaction.  Yes?
+ *
+ * Return 0 on failure, 1 on success
+ */
+int journal_try_to_free_buffers(journal_t *journal,
+				struct page *page, gfp_t gfp_mask)
+{
+	struct buffer_head *head;
+	struct buffer_head *bh;
+	int ret = 0;
+
+	J_ASSERT(PageLocked(page));
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		struct journal_head *jh;
+
+		/*
+		 * We take our own ref against the journal_head here to avoid
+		 * having to add tons of locking around each instance of
+		 * journal_put_journal_head().
+		 */
+		jh = journal_grab_journal_head(bh);
+		if (!jh)
+			continue;
+
+		jbd_lock_bh_state(bh);
+		__journal_try_to_free_buffer(journal, bh);
+		journal_put_journal_head(jh);
+		jbd_unlock_bh_state(bh);
+		if (buffer_jbd(bh))
+			goto busy;
+	} while ((bh = bh->b_this_page) != head);
+
+	ret = try_to_free_buffers(page);
+
+busy:
+	return ret;
+}
+
+/*
+ * This buffer is no longer needed.  If it is on an older transaction's
+ * checkpoint list we need to record it on this transaction's forget list
+ * to pin this buffer (and hence its checkpointing transaction) down until
+ * this transaction commits.  If the buffer isn't on a checkpoint list, we
+ * release it.
+ * Returns non-zero if JBD no longer has an interest in the buffer.
+ *
+ * Called under j_list_lock.
+ *
+ * Called under jbd_lock_bh_state(bh).
+ */
+static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
+{
+	int may_free = 1;
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (jh->b_cp_transaction) {
+		JBUFFER_TRACE(jh, "on running+cp transaction");
+		__journal_temp_unlink_buffer(jh);
+		/*
+		 * We don't want to write the buffer anymore, clear the
+		 * bit so that we don't confuse checks in
+		 * __journal_file_buffer
+		 */
+		clear_buffer_dirty(bh);
+		__journal_file_buffer(jh, transaction, BJ_Forget);
+		may_free = 0;
+	} else {
+		JBUFFER_TRACE(jh, "on running transaction");
+		__journal_unfile_buffer(jh);
+	}
+	return may_free;
+}
+
+/*
+ * journal_invalidatepage
+ *
+ * This code is tricky.  It has a number of cases to deal with.
+ *
+ * There are two invariants which this code relies on:
+ *
+ * i_size must be updated on disk before we start calling invalidatepage on the
+ * data.
+ *
+ *  This is done in ext3 by defining an ext3_setattr method which
+ *  updates i_size before truncate gets going.  By maintaining this
+ *  invariant, we can be sure that it is safe to throw away any buffers
+ *  attached to the current transaction: once the transaction commits,
+ *  we know that the data will not be needed.
+ *
+ *  Note however that we can *not* throw away data belonging to the
+ *  previous, committing transaction!
+ *
+ * Any disk blocks which *are* part of the previous, committing
+ * transaction (and which therefore cannot be discarded immediately) are
+ * not going to be reused in the new running transaction
+ *
+ *  The bitmap committed_data images guarantee this: any block which is
+ *  allocated in one transaction and removed in the next will be marked
+ *  as in-use in the committed_data bitmap, so cannot be reused until
+ *  the next transaction to delete the block commits.  This means that
+ *  leaving committing buffers dirty is quite safe: the disk blocks
+ *  cannot be reallocated to a different file and so buffer aliasing is
+ *  not possible.
+ *
+ *
+ * The above applies mainly to ordered data mode.  In writeback mode we
+ * don't make guarantees about the order in which data hits disk --- in
+ * particular we don't guarantee that new dirty data is flushed before
+ * transaction commit --- so it is always safe just to discard data
+ * immediately in that mode.  --sct
+ */
+
+/*
+ * The journal_unmap_buffer helper function returns zero if the buffer
+ * concerned remains pinned as an anonymous buffer belonging to an older
+ * transaction.
+ *
+ * We're outside-transaction here.  Either or both of j_running_transaction
+ * and j_committing_transaction may be NULL.
+ */
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
+				int partial_page)
+{
+	transaction_t *transaction;
+	struct journal_head *jh;
+	int may_free = 1;
+
+	BUFFER_TRACE(bh, "entry");
+
+retry:
+	/*
+	 * It is safe to proceed here without the j_list_lock because the
+	 * buffers cannot be stolen by try_to_free_buffers as long as we are
+	 * holding the page lock. --sct
+	 */
+
+	if (!buffer_jbd(bh))
+		goto zap_buffer_unlocked;
+
+	spin_lock(&journal->j_state_lock);
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+
+	jh = journal_grab_journal_head(bh);
+	if (!jh)
+		goto zap_buffer_no_jh;
+
+	/*
+	 * We cannot remove the buffer from checkpoint lists until the
+	 * transaction adding inode to orphan list (let's call it T)
+	 * is committed.  Otherwise if the transaction changing the
+	 * buffer would be cleaned from the journal before T is
+	 * committed, a crash will cause that the correct contents of
+	 * the buffer will be lost.  On the other hand we have to
+	 * clear the buffer dirty bit at latest at the moment when the
+	 * transaction marking the buffer as freed in the filesystem
+	 * structures is committed because from that moment on the
+	 * block can be reallocated and used by a different page.
+	 * Since the block hasn't been freed yet but the inode has
+	 * already been added to orphan list, it is safe for us to add
+	 * the buffer to BJ_Forget list of the newest transaction.
+	 *
+	 * Also we have to clear buffer_mapped flag of a truncated buffer
+	 * because the buffer_head may be attached to the page straddling
+	 * i_size (can happen only when blocksize < pagesize) and thus the
+	 * buffer_head can be reused when the file is extended again. So we end
+	 * up keeping around invalidated buffers attached to transactions'
+	 * BJ_Forget list just to stop checkpointing code from cleaning up
+	 * the transaction this buffer was modified in.
+	 */
+	transaction = jh->b_transaction;
+	if (transaction == NULL) {
+		/* First case: not on any transaction.  If it
+		 * has no checkpoint link, then we can zap it:
+		 * it's a writeback-mode buffer so we don't care
+		 * if it hits disk safely. */
+		if (!jh->b_cp_transaction) {
+			JBUFFER_TRACE(jh, "not on any transaction: zap");
+			goto zap_buffer;
+		}
+
+		if (!buffer_dirty(bh)) {
+			/* bdflush has written it.  We can drop it now */
+			goto zap_buffer;
+		}
+
+		/* OK, it must be in the journal but still not
+		 * written fully to disk: it's metadata or
+		 * journaled data... */
+
+		if (journal->j_running_transaction) {
+			/* ... and once the current transaction has
+			 * committed, the buffer won't be needed any
+			 * longer. */
+			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
+			may_free = __dispose_buffer(jh,
+					journal->j_running_transaction);
+			goto zap_buffer;
+		} else {
+			/* There is no currently-running transaction. So the
+			 * orphan record which we wrote for this file must have
+			 * passed into commit.  We must attach this buffer to
+			 * the committing transaction, if it exists. */
+			if (journal->j_committing_transaction) {
+				JBUFFER_TRACE(jh, "give to committing trans");
+				may_free = __dispose_buffer(jh,
+					journal->j_committing_transaction);
+				goto zap_buffer;
+			} else {
+				/* The orphan record's transaction has
+				 * committed.  We can cleanse this buffer */
+				clear_buffer_jbddirty(bh);
+				goto zap_buffer;
+			}
+		}
+	} else if (transaction == journal->j_committing_transaction) {
+		JBUFFER_TRACE(jh, "on committing transaction");
+		if (jh->b_jlist == BJ_Locked) {
+			/*
+			 * The buffer is on the committing transaction's locked
+			 * list.  We have the buffer locked, so I/O has
+			 * completed.  So we can nail the buffer now.
+			 */
+			may_free = __dispose_buffer(jh, transaction);
+			goto zap_buffer;
+		}
+		/*
+		 * The buffer is committing, we simply cannot touch
+		 * it. If the page is straddling i_size we have to wait
+		 * for commit and try again.
+		 */
+		if (partial_page) {
+			tid_t tid = journal->j_committing_transaction->t_tid;
+
+			journal_put_journal_head(jh);
+			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
+			spin_unlock(&journal->j_state_lock);
+			unlock_buffer(bh);
+			log_wait_commit(journal, tid);
+			lock_buffer(bh);
+			goto retry;
+		}
+		/*
+		 * OK, buffer won't be reachable after truncate. We just set
+		 * j_next_transaction to the running transaction (if there is
+		 * one) and mark buffer as freed so that commit code knows it
+		 * should clear dirty bits when it is done with the buffer.
+		 */
+		set_buffer_freed(bh);
+		if (journal->j_running_transaction && buffer_jbddirty(bh))
+			jh->b_next_transaction = journal->j_running_transaction;
+		journal_put_journal_head(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		spin_unlock(&journal->j_state_lock);
+		return 0;
+	} else {
+		/* Good, the buffer belongs to the running transaction.
+		 * We are writing our own transaction's data, not any
+		 * previous one's, so it is safe to throw it away
+		 * (remember that we expect the filesystem to have set
+		 * i_size already for this truncate so recovery will not
+		 * expose the disk blocks we are discarding here.) */
+		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+		JBUFFER_TRACE(jh, "on running transaction");
+		may_free = __dispose_buffer(jh, transaction);
+	}
+
+zap_buffer:
+	/*
+	 * This is tricky. Although the buffer is truncated, it may be reused
+	 * if blocksize < pagesize and it is attached to the page straddling
+	 * EOF. Since the buffer might have been added to BJ_Forget list of the
+	 * running transaction, journal_get_write_access() won't clear
+	 * b_modified and credit accounting gets confused. So clear b_modified
+	 * here. */
+	jh->b_modified = 0;
+	journal_put_journal_head(jh);
+zap_buffer_no_jh:
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	spin_unlock(&journal->j_state_lock);
+zap_buffer_unlocked:
+	clear_buffer_dirty(bh);
+	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	bh->b_bdev = NULL;
+	return may_free;
+}
+
+/**
+ * void journal_invalidatepage() - invalidate a journal page
+ * @journal: journal to use for flush
+ * @page:    page to flush
+ * @offset:  offset of the range to invalidate
+ * @length:  length of the range to invalidate
+ *
+ * Reap page buffers containing data in specified range in page.
+ */
+void journal_invalidatepage(journal_t *journal,
+		      struct page *page,
+		      unsigned int offset,
+		      unsigned int length)
+{
+	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
+	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
+	int may_free = 1;
+
+	if (!PageLocked(page))
+		BUG();
+	if (!page_has_buffers(page))
+		return;
+
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
+	/* We will potentially be playing with lists other than just the
+	 * data lists (especially for journaled data mode), so be
+	 * cautious in our locking. */
+
+	head = bh = page_buffers(page);
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
+
+		if (next_off > stop)
+			return;
+
+		if (offset <= curr_off) {
+			/* This block is wholly outside the truncation point */
+			lock_buffer(bh);
+			may_free &= journal_unmap_buffer(journal, bh,
+							 partial_page);
+			unlock_buffer(bh);
+		}
+		curr_off = next_off;
+		bh = next;
+
+	} while (bh != head);
+
+	if (!partial_page) {
+		if (may_free && try_to_free_buffers(page))
+			J_ASSERT(!page_has_buffers(page));
+	}
+}
+
+/*
+ * File a buffer on the given transaction list.
+ */
+void __journal_file_buffer(struct journal_head *jh,
+			transaction_t *transaction, int jlist)
+{
+	struct journal_head **list = NULL;
+	int was_dirty = 0;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+				jh->b_transaction == NULL);
+
+	if (jh->b_transaction && jh->b_jlist == jlist)
+		return;
+
+	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		/*
+		 * For metadata buffers, we track dirty bit in buffer_jbddirty
+		 * instead of buffer_dirty. We should not see a dirty bit set
+		 * here because we clear it in do_get_write_access but e.g.
+		 * tune2fs can modify the sb and set the dirty bit at any time
+		 * so we try to gracefully handle that.
+		 */
+		if (buffer_dirty(bh))
+			warn_dirty_buffer(bh);
+		if (test_clear_buffer_dirty(bh) ||
+		    test_clear_buffer_jbddirty(bh))
+			was_dirty = 1;
+	}
+
+	if (jh->b_transaction)
+		__journal_temp_unlink_buffer(jh);
+	else
+		journal_grab_journal_head(bh);
+	jh->b_transaction = transaction;
+
+	switch (jlist) {
+	case BJ_None:
+		J_ASSERT_JH(jh, !jh->b_committed_data);
+		J_ASSERT_JH(jh, !jh->b_frozen_data);
+		return;
+	case BJ_SyncData:
+		list = &transaction->t_sync_datalist;
+		break;
+	case BJ_Metadata:
+		transaction->t_nr_buffers++;
+		list = &transaction->t_buffers;
+		break;
+	case BJ_Forget:
+		list = &transaction->t_forget;
+		break;
+	case BJ_IO:
+		list = &transaction->t_iobuf_list;
+		break;
+	case BJ_Shadow:
+		list = &transaction->t_shadow_list;
+		break;
+	case BJ_LogCtl:
+		list = &transaction->t_log_list;
+		break;
+	case BJ_Reserved:
+		list = &transaction->t_reserved_list;
+		break;
+	case BJ_Locked:
+		list =  &transaction->t_locked_list;
+		break;
+	}
+
+	__blist_add_buffer(list, jh);
+	jh->b_jlist = jlist;
+
+	if (was_dirty)
+		set_buffer_jbddirty(bh);
+}
+
+void journal_file_buffer(struct journal_head *jh,
+				transaction_t *transaction, int jlist)
+{
+	jbd_lock_bh_state(jh2bh(jh));
+	spin_lock(&transaction->t_journal->j_list_lock);
+	__journal_file_buffer(jh, transaction, jlist);
+	spin_unlock(&transaction->t_journal->j_list_lock);
+	jbd_unlock_bh_state(jh2bh(jh));
+}
+
+/*
+ * Remove a buffer from its current buffer list in preparation for
+ * dropping it from its current transaction entirely.  If the buffer has
+ * already started to be used by a subsequent transaction, refile the
+ * buffer on that transaction's metadata list.
+ *
+ * Called under j_list_lock
+ * Called under jbd_lock_bh_state(jh2bh(jh))
+ *
+ * jh and bh may be already free when this function returns
+ */
+void __journal_refile_buffer(struct journal_head *jh)
+{
+	int was_dirty, jlist;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	if (jh->b_transaction)
+		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
+
+	/* If the buffer is now unused, just drop it. */
+	if (jh->b_next_transaction == NULL) {
+		__journal_unfile_buffer(jh);
+		return;
+	}
+
+	/*
+	 * It has been modified by a later transaction: add it to the new
+	 * transaction's metadata list.
+	 */
+
+	was_dirty = test_clear_buffer_jbddirty(bh);
+	__journal_temp_unlink_buffer(jh);
+	/*
+	 * We set b_transaction here because b_next_transaction will inherit
+	 * our jh reference and thus __journal_file_buffer() must not take a
+	 * new one.
+	 */
+	jh->b_transaction = jh->b_next_transaction;
+	jh->b_next_transaction = NULL;
+	if (buffer_freed(bh))
+		jlist = BJ_Forget;
+	else if (jh->b_modified)
+		jlist = BJ_Metadata;
+	else
+		jlist = BJ_Reserved;
+	__journal_file_buffer(jh, jh->b_transaction, jlist);
+	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+
+	if (was_dirty)
+		set_buffer_jbddirty(bh);
+}
+
+/*
+ * __journal_refile_buffer() with necessary locking added. We take our bh
+ * reference so that we can safely unlock bh.
+ *
+ * The jh and bh may be freed by this call.
+ */
+void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	/* Get reference so that buffer cannot be freed before we unlock it */
+	get_bh(bh);
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	__journal_refile_buffer(jh);
+	jbd_unlock_bh_state(bh);
+	spin_unlock(&journal->j_list_lock);
+	__brelse(bh);
+}
diff --git a/kernel/fs/jbd2/Kconfig b/kernel/fs/jbd2/Kconfig
new file mode 100644
index 000000000..5a9f5534d
--- /dev/null
+++ b/kernel/fs/jbd2/Kconfig
@@ -0,0 +1,35 @@
+config JBD2
+	tristate
+	select CRC32
+	select CRYPTO
+	select CRYPTO_CRC32C
+	help
+	  This is a generic journaling layer for block devices that support
+	  both 32-bit and 64-bit block numbers.  It is currently used by
+	  the ext4 and OCFS2 filesystems, but it could also be used to add
+	  journal support to other file systems or block devices such
+	  as RAID or LVM.
+
+	  If you are using ext4 or OCFS2, you need to say Y here.
+	  If you are not using ext4 or OCFS2 then you will
+	  probably want to say N.
+
+	  To compile this device as a module, choose M here. The module will be
+	  called jbd2.  If you are compiling ext4 or OCFS2 into the kernel,
+	  you cannot compile this code as a module.
+
+config JBD2_DEBUG
+	bool "JBD2 (ext4) debugging support"
+	depends on JBD2
+	help
+	  If you are using the ext4 journaled file system (or
+	  potentially any other filesystem/device using JBD2), this option
+	  allows you to enable debugging output while the system is running,
+	  in order to help track down any problems you are having.
+	  By default, the debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
+	  number between 1 and 5. The higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/kernel/fs/jbd2/Makefile b/kernel/fs/jbd2/Makefile
new file mode 100644
index 000000000..802a34138
--- /dev/null
+++ b/kernel/fs/jbd2/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux journaling routines.
+#
+
+obj-$(CONFIG_JBD2) += jbd2.o
+
+jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/kernel/fs/jbd2/checkpoint.c b/kernel/fs/jbd2/checkpoint.c
new file mode 100644
index 000000000..6f691c289
--- /dev/null
+++ b/kernel/fs/jbd2/checkpoint.c
@@ -0,0 +1,647 @@
+/*
+ * linux/fs/jbd2/checkpoint.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Checkpoint routines for the generic filesystem journaling code.
+ * Part of the ext2fs journaling system.
+ *
+ * Checkpointing is the process of ensuring that a section of the log is
+ * committed fully to disk, so that that portion of the log can be
+ * reused.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <trace/events/jbd2.h>
+
+/*
+ * Unlink a buffer from a transaction checkpoint list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink_first(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	jh->b_cpnext->b_cpprev = jh->b_cpprev;
+	jh->b_cpprev->b_cpnext = jh->b_cpnext;
+	if (transaction->t_checkpoint_list == jh) {
+		transaction->t_checkpoint_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
+}
+
+/*
+ * Try to release a checkpointed buffer from its transaction.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
+ * Requires j_list_lock
+ */
+static int __try_to_free_cp_buf(struct journal_head *jh)
+{
+	int ret = 0;
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (jh->b_transaction == NULL && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
+		JBUFFER_TRACE(jh, "remove from checkpoint list");
+		ret = __jbd2_journal_remove_checkpoint(jh) + 1;
+	}
+	return ret;
+}
+
+/*
+ * __jbd2_log_wait_for_space: wait until there is space in the journal.
+ *
+ * Called under j-state_lock *only*.  It will be unlocked if we have to wait
+ * for a checkpoint to free up some space in the log.
+ */
+void __jbd2_log_wait_for_space(journal_t *journal)
+{
+	int nblocks, space_left;
+	/* assert_spin_locked(&journal->j_state_lock); */
+
+	nblocks = jbd2_space_needed(journal);
+	while (jbd2_log_space_left(journal) < nblocks) {
+		write_unlock(&journal->j_state_lock);
+		if (current->plug)
+			io_schedule();
+		mutex_lock(&journal->j_checkpoint_mutex);
+
+		/*
+		 * Test again, another process may have checkpointed while we
+		 * were waiting for the checkpoint lock. If there are no
+		 * transactions ready to be checkpointed, try to recover
+		 * journal space by calling cleanup_journal_tail(), and if
+		 * that doesn't work, by waiting for the currently committing
+		 * transaction to complete.  If there is absolutely no way
+		 * to make progress, this is either a BUG or corrupted
+		 * filesystem, so abort the journal and leave a stack
+		 * trace for forensic evidence.
+		 */
+		write_lock(&journal->j_state_lock);
+		if (journal->j_flags & JBD2_ABORT) {
+			mutex_unlock(&journal->j_checkpoint_mutex);
+			return;
+		}
+		spin_lock(&journal->j_list_lock);
+		nblocks = jbd2_space_needed(journal);
+		space_left = jbd2_log_space_left(journal);
+		if (space_left < nblocks) {
+			int chkpt = journal->j_checkpoint_transactions != NULL;
+			tid_t tid = 0;
+
+			if (journal->j_committing_transaction)
+				tid = journal->j_committing_transaction->t_tid;
+			spin_unlock(&journal->j_list_lock);
+			write_unlock(&journal->j_state_lock);
+			if (chkpt) {
+				jbd2_log_do_checkpoint(journal);
+			} else if (jbd2_cleanup_journal_tail(journal) == 0) {
+				/* We were able to recover space; yay! */
+				;
+			} else if (tid) {
+				/*
+				 * jbd2_journal_commit_transaction() may want
+				 * to take the checkpoint_mutex if JBD2_FLUSHED
+				 * is set.  So we need to temporarily drop it.
+				 */
+				mutex_unlock(&journal->j_checkpoint_mutex);
+				jbd2_log_wait_commit(journal, tid);
+				write_lock(&journal->j_state_lock);
+				continue;
+			} else {
+				printk(KERN_ERR "%s: needed %d blocks and "
+				       "only had %d space available\n",
+				       __func__, nblocks, space_left);
+				printk(KERN_ERR "%s: no way to get more "
+				       "journal space in %s\n", __func__,
+				       journal->j_devname);
+				WARN_ON(1);
+				jbd2_journal_abort(journal, 0);
+			}
+			write_lock(&journal->j_state_lock);
+		} else {
+			spin_unlock(&journal->j_list_lock);
+		}
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	}
+}
+
+static void
+__flush_batch(journal_t *journal, int *batch_count)
+{
+	int i;
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
+	for (i = 0; i < *batch_count; i++)
+		write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
+	blk_finish_plug(&plug);
+
+	for (i = 0; i < *batch_count; i++) {
+		struct buffer_head *bh = journal->j_chkpt_bhs[i];
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+	}
+	*batch_count = 0;
+}
+
+/*
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
+ *
+ * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
+ */
+int jbd2_log_do_checkpoint(journal_t *journal)
+{
+	struct journal_head	*jh;
+	struct buffer_head	*bh;
+	transaction_t		*transaction;
+	tid_t			this_tid;
+	int			result, batch_count = 0;
+
+	jbd_debug(1, "Start checkpoint\n");
+
+	/*
+	 * First thing: if there are any transactions in the log which
+	 * don't need checkpointing, just eliminate them from the
+	 * journal straight away.
+	 */
+	result = jbd2_cleanup_journal_tail(journal);
+	trace_jbd2_checkpoint(journal, result);
+	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+	if (result <= 0)
+		return result;
+
+	/*
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
+	 */
+	result = 0;
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	if (transaction->t_chp_stats.cs_chp_time == 0)
+		transaction->t_chp_stats.cs_chp_time = jiffies;
+	this_tid = transaction->t_tid;
+restart:
+	/*
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
+	 */
+	if (journal->j_checkpoint_transactions != transaction ||
+	    transaction->t_tid != this_tid)
+		goto out;
+
+	/* checkpoint all of the transaction's buffers */
+	while (transaction->t_checkpoint_list) {
+		jh = transaction->t_checkpoint_list;
+		bh = jh2bh(jh);
+
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			get_bh(bh);
+			wait_on_buffer(bh);
+			/* the journal_head may have gone by now */
+			BUFFER_TRACE(bh, "brelse");
+			__brelse(bh);
+			goto retry;
+		}
+		if (jh->b_transaction != NULL) {
+			transaction_t *t = jh->b_transaction;
+			tid_t tid = t->t_tid;
+
+			transaction->t_chp_stats.cs_forced_to_close++;
+			spin_unlock(&journal->j_list_lock);
+			if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+				/*
+				 * The journal thread is dead; so
+				 * starting and waiting for a commit
+				 * to finish will cause us to wait for
+				 * a _very_ long time.
+				 */
+				printk(KERN_ERR
+		"JBD2: %s: Waiting for Godot: block %llu\n",
+		journal->j_devname, (unsigned long long) bh->b_blocknr);
+
+			jbd2_log_start_commit(journal, tid);
+			jbd2_log_wait_commit(journal, tid);
+			goto retry;
+		}
+		if (!buffer_dirty(bh)) {
+			if (unlikely(buffer_write_io_error(bh)) && !result)
+				result = -EIO;
+			BUFFER_TRACE(bh, "remove from checkpoint");
+			if (__jbd2_journal_remove_checkpoint(jh))
+				/* The transaction was released; we're done */
+				goto out;
+			continue;
+		}
+		/*
+		 * Important: we are about to write the buffer, and
+		 * possibly block, while still holding the journal
+		 * lock.  We cannot afford to let the transaction
+		 * logic start messing around with this buffer before
+		 * we write it to disk, as that would break
+		 * recoverability.
+		 */
+		BUFFER_TRACE(bh, "queue");
+		get_bh(bh);
+		J_ASSERT_BH(bh, !buffer_jwrite(bh));
+		journal->j_chkpt_bhs[batch_count++] = bh;
+		__buffer_relink_io(jh);
+		transaction->t_chp_stats.cs_written++;
+		if ((batch_count == JBD2_NR_BATCH) ||
+		    need_resched() ||
+		    spin_needbreak(&journal->j_list_lock))
+			goto unlock_and_flush;
+	}
+
+	if (batch_count) {
+		unlock_and_flush:
+			spin_unlock(&journal->j_list_lock);
+		retry:
+			if (batch_count)
+				__flush_batch(journal, &batch_count);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+	}
+
+	/*
+	 * Now we issued all of the transaction's buffers, let's deal
+	 * with the buffers that are out for I/O.
+	 */
+restart2:
+	/* Did somebody clean up the transaction in the meanwhile? */
+	if (journal->j_checkpoint_transactions != transaction ||
+	    transaction->t_tid != this_tid)
+		goto out;
+
+	while (transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
+		bh = jh2bh(jh);
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			get_bh(bh);
+			wait_on_buffer(bh);
+			/* the journal_head may have gone by now */
+			BUFFER_TRACE(bh, "brelse");
+			__brelse(bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart2;
+		}
+		if (unlikely(buffer_write_io_error(bh)) && !result)
+			result = -EIO;
+
+		/*
+		 * Now in whatever state the buffer currently is, we
+		 * know that it has been written out and so we can
+		 * drop it from the list
+		 */
+		if (__jbd2_journal_remove_checkpoint(jh))
+			break;
+	}
+out:
+	spin_unlock(&journal->j_list_lock);
+	if (result < 0)
+		jbd2_journal_abort(journal, result);
+	else
+		result = jbd2_cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
+}
+
+/*
+ * Check the list of checkpoint transactions for the journal to see if
+ * we have already got rid of any since the last update of the log tail
+ * in the journal superblock.  If so, we can instantly roll the
+ * superblock forward to remove those transactions from the log.
+ *
+ * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
+ *
+ * Called with the journal lock held.
+ *
+ * This is the only part of the journaling code which really needs to be
+ * aware of transaction aborts.  Checkpointing involves writing to the
+ * main filesystem area rather than to the journal, so it can proceed
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
+ */
+
+int jbd2_cleanup_journal_tail(journal_t *journal)
+{
+	tid_t		first_tid;
+	unsigned long	blocknr;
+
+	if (is_journal_aborted(journal))
+		return 1;
+
+	if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
+		return 1;
+	J_ASSERT(blocknr != 0);
+
+	/*
+	 * We need to make sure that any blocks that were recently written out
+	 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
+	 * we drop the transactions from the journal. It's unlikely this will
+	 * be necessary, especially with an appropriately sized journal, but we
+	 * need this to guarantee correctness.  Fortunately
+	 * jbd2_cleanup_journal_tail() doesn't get called all that often.
+	 */
+	if (journal->j_flags & JBD2_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+
+	__jbd2_update_log_tail(journal, first_tid, blocknr);
+	return 0;
+}
+
+
+/* Checkpoint list management */
+
+/*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and
+ * release them.
+ *
+ * Called with j_list_lock held.
+ * Returns 1 if we freed the transaction, 0 otherwise.
+ */
+static int journal_clean_one_cp_list(struct journal_head *jh)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret;
+	int freed = 0;
+
+	if (!jh)
+		return 0;
+
+	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+		ret = __try_to_free_cp_buf(jh);
+		if (!ret)
+			return freed;
+		if (ret == 2)
+			return 1;
+		freed = 1;
+		/*
+		 * This function only frees up some memory
+		 * if possible so we dont have an obligation
+		 * to finish processing. Bail out if preemption
+		 * requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
+/*
+ * journal_clean_checkpoint_list
+ *
+ * Find all the written-back checkpoint buffers in the journal and release them.
+ *
+ * Called with j_list_lock held.
+ */
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
+{
+	transaction_t *transaction, *last_transaction, *next_transaction;
+	int ret;
+
+	transaction = journal->j_checkpoint_transactions;
+	if (!transaction)
+		return;
+
+	last_transaction = transaction->t_cpprev;
+	next_transaction = transaction;
+	do {
+		transaction = next_transaction;
+		next_transaction = transaction->t_cpnext;
+		ret = journal_clean_one_cp_list(transaction->t_checkpoint_list);
+		/*
+		 * This function only frees up some memory if possible so we
+		 * dont have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			return;
+		if (ret)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret = journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list);
+		if (need_resched())
+			return;
+		/*
+		 * Stop scanning if we couldn't free the transaction. This
+		 * avoids pointless scanning of transactions which still
+		 * weren't checkpointed.
+		 */
+		if (!ret)
+			return;
+	} while (transaction != last_transaction);
+}
+
+/*
+ * journal_remove_checkpoint: called after a buffer has been committed
+ * to disk (either by being write-back flushed to disk, or being
+ * committed to the log).
+ *
+ * We cannot safely clean a transaction out of the log until all of the
+ * buffer updates committed in that transaction have safely been stored
+ * elsewhere on disk.  To achieve this, all of the buffers in a
+ * transaction need to be maintained on the transaction's checkpoint
+ * lists until they have been rewritten, at which point this function is
+ * called to remove the buffer from the existing transaction's
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
+ * The function can free jh and bh.
+ *
+ * This function is called with j_list_lock held.
+ */
+int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
+{
+	struct transaction_chp_stats_s *stats;
+	transaction_t *transaction;
+	journal_t *journal;
+	int ret = 0;
+
+	JBUFFER_TRACE(jh, "entry");
+
+	if ((transaction = jh->b_cp_transaction) == NULL) {
+		JBUFFER_TRACE(jh, "not on transaction");
+		goto out;
+	}
+	journal = transaction->t_journal;
+
+	JBUFFER_TRACE(jh, "removing from transaction");
+	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
+	jbd2_journal_put_journal_head(jh);
+
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
+		goto out;
+
+	/*
+	 * There is one special case to worry about: if we have just pulled the
+	 * buffer off a running or committing transaction's checkpoing list,
+	 * then even if the checkpoint list is empty, the transaction obviously
+	 * cannot be dropped!
+	 *
+	 * The locking here around t_state is a bit sleazy.
+	 * See the comment at the end of jbd2_journal_commit_transaction().
+	 */
+	if (transaction->t_state != T_FINISHED)
+		goto out;
+
+	/* OK, that was the last buffer for the transaction: we can now
+	   safely remove this transaction from the log */
+	stats = &transaction->t_chp_stats;
+	if (stats->cs_chp_time)
+		stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
+						    jiffies);
+	trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
+				    transaction->t_tid, stats);
+
+	__jbd2_journal_drop_transaction(journal, transaction);
+	jbd2_journal_free_transaction(transaction);
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * journal_insert_checkpoint: put a committed buffer onto a checkpoint
+ * list so that we know when it is safe to clean the transaction out of
+ * the log.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ */
+void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
+			       transaction_t *transaction)
+{
+	JBUFFER_TRACE(jh, "entry");
+	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
+	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+
+	/* Get reference for checkpointing transaction */
+	jbd2_journal_grab_journal_head(jh2bh(jh));
+	jh->b_cp_transaction = transaction;
+
+	if (!transaction->t_checkpoint_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_list;
+		jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_list = jh;
+}
+
+/*
+ * We've finished with this transaction structure: adios...
+ *
+ * The transaction must have no links except for the checkpoint by this
+ * point.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ */
+
+void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
+{
+	assert_spin_locked(&journal->j_list_lock);
+	if (transaction->t_cpnext) {
+		transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
+		transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
+		if (journal->j_checkpoint_transactions == transaction)
+			journal->j_checkpoint_transactions =
+				transaction->t_cpnext;
+		if (journal->j_checkpoint_transactions == transaction)
+			journal->j_checkpoint_transactions = NULL;
+	}
+
+	J_ASSERT(transaction->t_state == T_FINISHED);
+	J_ASSERT(transaction->t_buffers == NULL);
+	J_ASSERT(transaction->t_forget == NULL);
+	J_ASSERT(transaction->t_shadow_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
+	J_ASSERT(atomic_read(&transaction->t_updates) == 0);
+	J_ASSERT(journal->j_committing_transaction != transaction);
+	J_ASSERT(journal->j_running_transaction != transaction);
+
+	trace_jbd2_drop_transaction(journal, transaction);
+
+	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+}
diff --git a/kernel/fs/jbd2/commit.c b/kernel/fs/jbd2/commit.c
new file mode 100644
index 000000000..b73e0215b
--- /dev/null
+++ b/kernel/fs/jbd2/commit.c
@@ -0,0 +1,1164 @@
+/*
+ * linux/fs/jbd2/commit.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal commit routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/jiffies.h>
+#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/bitops.h>
+#include <trace/events/jbd2.h>
+
+/*
+ * IO end handler for temporary buffer_heads handling writes to the journal.
+ */
+static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	struct buffer_head *orig_bh = bh->b_private;
+
+	BUFFER_TRACE(bh, "");
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+	if (orig_bh) {
+		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+		smp_mb__after_atomic();
+		wake_up_bit(&orig_bh->b_state, BH_Shadow);
+	}
+	unlock_buffer(bh);
+}
+
+/*
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
+ * After the transaction commits, these pages are left on the LRU, with no
+ * ->mapping, and with attached buffers.  These pages are trivially reclaimable
+ * by the VM, but their apparent absence upsets the VM accounting, and it makes
+ * the numbers in /proc/meminfo look odd.
+ *
+ * So here, we have a buffer which has just come off the forget list.  Look to
+ * see if we can strip all buffers from the backing page.
+ *
+ * Called under lock_journal(), and possibly under journal_datalist_lock.  The
+ * caller provided us with a ref against the buffer, and we drop that here.
+ */
+static void release_buffer_page(struct buffer_head *bh)
+{
+	struct page *page;
+
+	if (buffer_dirty(bh))
+		goto nope;
+	if (atomic_read(&bh->b_count) != 1)
+		goto nope;
+	page = bh->b_page;
+	if (!page)
+		goto nope;
+	if (page->mapping)
+		goto nope;
+
+	/* OK, it's a truncated page */
+	if (!trylock_page(page))
+		goto nope;
+
+	page_cache_get(page);
+	__brelse(bh);
+	try_to_free_buffers(page);
+	unlock_page(page);
+	page_cache_release(page);
+	return;
+
+nope:
+	__brelse(bh);
+}
+
+static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
+{
+	struct commit_header *h;
+	__u32 csum;
+
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return;
+
+	h = (struct commit_header *)(bh->b_data);
+	h->h_chksum_type = 0;
+	h->h_chksum_size = 0;
+	h->h_chksum[0] = 0;
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+	h->h_chksum[0] = cpu_to_be32(csum);
+}
+
+/*
+ * Done it all: now submit the commit record.  We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+ * mode we can now just skip the rest of the journal write
+ * entirely.
+ *
+ * Returns 1 if the journal needs to be aborted or 0 on success
+ */
+static int journal_submit_commit_record(journal_t *journal,
+					transaction_t *commit_transaction,
+					struct buffer_head **cbh,
+					__u32 crc32_sum)
+{
+	struct commit_header *tmp;
+	struct buffer_head *bh;
+	int ret;
+	struct timespec now = current_kernel_time();
+
+	*cbh = NULL;
+
+	if (is_journal_aborted(journal))
+		return 0;
+
+	bh = jbd2_journal_get_descriptor_buffer(journal);
+	if (!bh)
+		return 1;
+
+	tmp = (struct commit_header *)bh->b_data;
+	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
+	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
+
+	if (JBD2_HAS_COMPAT_FEATURE(journal,
+				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
+		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
+		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
+	}
+	jbd2_commit_block_csum_set(journal, bh);
+
+	BUFFER_TRACE(bh, "submit commit block");
+	lock_buffer(bh);
+	clear_buffer_dirty(bh);
+	set_buffer_uptodate(bh);
+	bh->b_end_io = journal_end_buffer_io_sync;
+
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
+	else
+		ret = submit_bh(WRITE_SYNC, bh);
+
+	*cbh = bh;
+	return ret;
+}
+
+/*
+ * This function along with journal_submit_commit_record
+ * allows to write the commit record asynchronously.
+ */
+static int journal_wait_on_commit_record(journal_t *journal,
+					 struct buffer_head *bh)
+{
+	int ret = 0;
+
+	clear_buffer_dirty(bh);
+	wait_on_buffer(bh);
+
+	if (unlikely(!buffer_uptodate(bh)))
+		ret = -EIO;
+	put_bh(bh);            /* One for getblk() */
+
+	return ret;
+}
+
+/*
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
+ */
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
+{
+	int ret;
+	struct writeback_control wbc = {
+		.sync_mode =  WB_SYNC_ALL,
+		.nr_to_write = mapping->nrpages * 2,
+		.range_start = 0,
+		.range_end = i_size_read(mapping->host),
+	};
+
+	ret = generic_writepages(mapping, &wbc);
+	return ret;
+}
+
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+		transaction_t *commit_transaction)
+{
+	struct jbd2_inode *jinode;
+	int err, ret = 0;
+	struct address_space *mapping;
+
+	spin_lock(&journal->j_list_lock);
+	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		mapping = jinode->i_vfs_inode->i_mapping;
+		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		spin_unlock(&journal->j_list_lock);
+		/*
+		 * submit the inode data buffers. We use writepage
+		 * instead of writepages. Because writepages can do
+		 * block allocation  with delalloc. We need to write
+		 * only allocated blocks here.
+		 */
+		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
+		err = journal_submit_inode_data_buffers(mapping);
+		if (!ret)
+			ret = err;
+		spin_lock(&journal->j_list_lock);
+		J_ASSERT(jinode->i_transaction == commit_transaction);
+		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		smp_mb__after_atomic();
+		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+	}
+	spin_unlock(&journal->j_list_lock);
+	return ret;
+}
+
+/*
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
+ */
+static int journal_finish_inode_data_buffers(journal_t *journal,
+		transaction_t *commit_transaction)
+{
+	struct jbd2_inode *jinode, *next_i;
+	int err, ret = 0;
+
+	/* For locking, see the comment in journal_submit_data_buffers() */
+	spin_lock(&journal->j_list_lock);
+	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		spin_unlock(&journal->j_list_lock);
+		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+		if (err) {
+			/*
+			 * Because AS_EIO is cleared by
+			 * filemap_fdatawait_range(), set it again so
+			 * that user process can get -EIO from fsync().
+			 */
+			set_bit(AS_EIO,
+				&jinode->i_vfs_inode->i_mapping->flags);
+
+			if (!ret)
+				ret = err;
+		}
+		spin_lock(&journal->j_list_lock);
+		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		smp_mb__after_atomic();
+		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+	}
+
+	/* Now refile inode to proper lists */
+	list_for_each_entry_safe(jinode, next_i,
+				 &commit_transaction->t_inode_list, i_list) {
+		list_del(&jinode->i_list);
+		if (jinode->i_next_transaction) {
+			jinode->i_transaction = jinode->i_next_transaction;
+			jinode->i_next_transaction = NULL;
+			list_add(&jinode->i_list,
+				&jinode->i_transaction->t_inode_list);
+		} else {
+			jinode->i_transaction = NULL;
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	return ret;
+}
+
+static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+	char *addr;
+	__u32 checksum;
+
+	addr = kmap_atomic(page);
+	checksum = crc32_be(crc32_sum,
+		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
+	kunmap_atomic(addr);
+
+	return checksum;
+}
+
+static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
+				   unsigned long long block)
+{
+	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
+	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
+		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
+}
+
+static void jbd2_descr_block_csum_set(journal_t *j,
+				      struct buffer_head *bh)
+{
+	struct jbd2_journal_block_tail *tail;
+	__u32 csum;
+
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return;
+
+	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
+			sizeof(struct jbd2_journal_block_tail));
+	tail->t_checksum = 0;
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+	tail->t_checksum = cpu_to_be32(csum);
+}
+
+static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
+				    struct buffer_head *bh, __u32 sequence)
+{
+	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
+	struct page *page = bh->b_page;
+	__u8 *addr;
+	__u32 csum32;
+	__be32 seq;
+
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return;
+
+	seq = cpu_to_be32(sequence);
+	addr = kmap_atomic(page);
+	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
+			     bh->b_size);
+	kunmap_atomic(addr);
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+		tag3->t_checksum = cpu_to_be32(csum32);
+	else
+		tag->t_checksum = cpu_to_be16(csum32);
+}
+/*
+ * jbd2_journal_commit_transaction
+ *
+ * The primary function for committing a transaction to the log.  This
+ * function is called by the journal thread to begin a complete commit.
+ */
+void jbd2_journal_commit_transaction(journal_t *journal)
+{
+	struct transaction_stats_s stats;
+	transaction_t *commit_transaction;
+	struct journal_head *jh;
+	struct buffer_head *descriptor;
+	struct buffer_head **wbuf = journal->j_wbuf;
+	int bufs;
+	int flags;
+	int err;
+	unsigned long long blocknr;
+	ktime_t start_time;
+	u64 commit_time;
+	char *tagp = NULL;
+	journal_header_t *header;
+	journal_block_tag_t *tag = NULL;
+	int space_left = 0;
+	int first_tag = 0;
+	int tag_flag;
+	int i;
+	int tag_bytes = journal_tag_bytes(journal);
+	struct buffer_head *cbh = NULL; /* For transactional checksums */
+	__u32 crc32_sum = ~0;
+	struct blk_plug plug;
+	/* Tail of the journal */
+	unsigned long first_block;
+	tid_t first_tid;
+	int update_tail;
+	int csum_size = 0;
+	LIST_HEAD(io_bufs);
+	LIST_HEAD(log_bufs);
+
+	if (jbd2_journal_has_csum_v2or3(journal))
+		csum_size = sizeof(struct jbd2_journal_block_tail);
+
+	/*
+	 * First job: lock down the current transaction and wait for
+	 * all outstanding updates to complete.
+	 */
+
+	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
+	if (journal->j_flags & JBD2_FLUSHED) {
+		jbd_debug(3, "super block updated\n");
+		mutex_lock(&journal->j_checkpoint_mutex);
+		/*
+		 * We hold j_checkpoint_mutex so tail cannot change under us.
+		 * We don't need any special data guarantees for writing sb
+		 * since journal is empty and it is ok for write to be
+		 * flushed only with transaction commit.
+		 */
+		jbd2_journal_update_sb_log_tail(journal,
+						journal->j_tail_sequence,
+						journal->j_tail,
+						WRITE_SYNC);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	} else {
+		jbd_debug(3, "superblock not updated\n");
+	}
+
+	J_ASSERT(journal->j_running_transaction != NULL);
+	J_ASSERT(journal->j_committing_transaction == NULL);
+
+	commit_transaction = journal->j_running_transaction;
+
+	trace_jbd2_start_commit(journal, commit_transaction);
+	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
+			commit_transaction->t_tid);
+
+	write_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_RUNNING);
+	commit_transaction->t_state = T_LOCKED;
+
+	trace_jbd2_commit_locking(journal, commit_transaction);
+	stats.run.rs_wait = commit_transaction->t_max_wait;
+	stats.run.rs_request_delay = 0;
+	stats.run.rs_locked = jiffies;
+	if (commit_transaction->t_requested)
+		stats.run.rs_request_delay =
+			jbd2_time_diff(commit_transaction->t_requested,
+				       stats.run.rs_locked);
+	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
+					      stats.run.rs_locked);
+
+	spin_lock(&commit_transaction->t_handle_lock);
+	while (atomic_read(&commit_transaction->t_updates)) {
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+					TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&commit_transaction->t_updates)) {
+			spin_unlock(&commit_transaction->t_handle_lock);
+			write_unlock(&journal->j_state_lock);
+			schedule();
+			write_lock(&journal->j_state_lock);
+			spin_lock(&commit_transaction->t_handle_lock);
+		}
+		finish_wait(&journal->j_wait_updates, &wait);
+	}
+	spin_unlock(&commit_transaction->t_handle_lock);
+
+	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
+			journal->j_max_transaction_buffers);
+
+	/*
+	 * First thing we are allowed to do is to discard any remaining
+	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
+	 * that there are no such buffers: if a large filesystem
+	 * operation like a truncate needs to split itself over multiple
+	 * transactions, then it may try to do a jbd2_journal_restart() while
+	 * there are still BJ_Reserved buffers outstanding.  These must
+	 * be released cleanly from the current transaction.
+	 *
+	 * In this case, the filesystem must still reserve write access
+	 * again before modifying the buffer in the new transaction, but
+	 * we do not require it to remember exactly which old buffers it
+	 * has reserved.  This is consistent with the existing behaviour
+	 * that multiple jbd2_journal_get_write_access() calls to the same
+	 * buffer are perfectly permissible.
+	 */
+	while (commit_transaction->t_reserved_list) {
+		jh = commit_transaction->t_reserved_list;
+		JBUFFER_TRACE(jh, "reserved, unused: refile");
+		/*
+		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
+		 * leave undo-committed data.
+		 */
+		if (jh->b_committed_data) {
+			struct buffer_head *bh = jh2bh(jh);
+
+			jbd_lock_bh_state(bh);
+			jbd2_free(jh->b_committed_data, bh->b_size);
+			jh->b_committed_data = NULL;
+			jbd_unlock_bh_state(bh);
+		}
+		jbd2_journal_refile_buffer(journal, jh);
+	}
+
+	/*
+	 * Now try to drop any written-back buffers from the journal's
+	 * checkpoint lists.  We do this *before* commit because it potentially
+	 * frees some memory
+	 */
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_clean_checkpoint_list(journal);
+	spin_unlock(&journal->j_list_lock);
+
+	jbd_debug(3, "JBD2: commit phase 1\n");
+
+	/*
+	 * Clear revoked flag to reflect there is no revoked buffers
+	 * in the next transaction which is going to be started.
+	 */
+	jbd2_clear_buffer_revoked_flags(journal);
+
+	/*
+	 * Switch to a new revoke table.
+	 */
+	jbd2_journal_switch_revoke_table(journal);
+
+	/*
+	 * Reserved credits cannot be claimed anymore, free them
+	 */
+	atomic_sub(atomic_read(&journal->j_reserved_credits),
+		   &commit_transaction->t_outstanding_credits);
+
+	trace_jbd2_commit_flushing(journal, commit_transaction);
+	stats.run.rs_flushing = jiffies;
+	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
+					     stats.run.rs_flushing);
+
+	commit_transaction->t_state = T_FLUSH;
+	journal->j_committing_transaction = commit_transaction;
+	journal->j_running_transaction = NULL;
+	start_time = ktime_get();
+	commit_transaction->t_log_start = journal->j_head;
+	wake_up(&journal->j_wait_transaction_locked);
+	write_unlock(&journal->j_state_lock);
+
+	jbd_debug(3, "JBD2: commit phase 2a\n");
+
+	/*
+	 * Now start flushing things to disk, in the order they appear
+	 * on the transaction lists.  Data blocks go first.
+	 */
+	err = journal_submit_data_buffers(journal, commit_transaction);
+	if (err)
+		jbd2_journal_abort(journal, err);
+
+	blk_start_plug(&plug);
+	jbd2_journal_write_revoke_records(journal, commit_transaction,
+					  &log_bufs, WRITE_SYNC);
+
+	jbd_debug(3, "JBD2: commit phase 2b\n");
+
+	/*
+	 * Way to go: we have now written out all of the data for a
+	 * transaction!  Now comes the tricky part: we need to write out
+	 * metadata.  Loop over the transaction's entire buffer list:
+	 */
+	write_lock(&journal->j_state_lock);
+	commit_transaction->t_state = T_COMMIT;
+	write_unlock(&journal->j_state_lock);
+
+	trace_jbd2_commit_logging(journal, commit_transaction);
+	stats.run.rs_logging = jiffies;
+	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
+					       stats.run.rs_logging);
+	stats.run.rs_blocks =
+		atomic_read(&commit_transaction->t_outstanding_credits);
+	stats.run.rs_blocks_logged = 0;
+
+	J_ASSERT(commit_transaction->t_nr_buffers <=
+		 atomic_read(&commit_transaction->t_outstanding_credits));
+
+	err = 0;
+	bufs = 0;
+	descriptor = NULL;
+	while (commit_transaction->t_buffers) {
+
+		/* Find the next buffer to be journaled... */
+
+		jh = commit_transaction->t_buffers;
+
+		/* If we're in abort mode, we just un-journal the buffer and
+		   release it. */
+
+		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
+			JBUFFER_TRACE(jh, "journal is aborting: refile");
+			jbd2_buffer_abort_trigger(jh,
+						  jh->b_frozen_data ?
+						  jh->b_frozen_triggers :
+						  jh->b_triggers);
+			jbd2_journal_refile_buffer(journal, jh);
+			/* If that was the last one, we need to clean up
+			 * any descriptor buffers which may have been
+			 * already allocated, even if we are now
+			 * aborting. */
+			if (!commit_transaction->t_buffers)
+				goto start_journal_io;
+			continue;
+		}
+
+		/* Make sure we have a descriptor block in which to
+		   record the metadata buffer. */
+
+		if (!descriptor) {
+			J_ASSERT (bufs == 0);
+
+			jbd_debug(4, "JBD2: get descriptor\n");
+
+			descriptor = jbd2_journal_get_descriptor_buffer(journal);
+			if (!descriptor) {
+				jbd2_journal_abort(journal, -EIO);
+				continue;
+			}
+
+			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
+				(unsigned long long)descriptor->b_blocknr,
+				descriptor->b_data);
+			header = (journal_header_t *)descriptor->b_data;
+			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
+			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
+			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
+
+			tagp = &descriptor->b_data[sizeof(journal_header_t)];
+			space_left = descriptor->b_size -
+						sizeof(journal_header_t);
+			first_tag = 1;
+			set_buffer_jwrite(descriptor);
+			set_buffer_dirty(descriptor);
+			wbuf[bufs++] = descriptor;
+
+			/* Record it so that we can wait for IO
+                           completion later */
+			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
+			jbd2_file_log_bh(&log_bufs, descriptor);
+		}
+
+		/* Where is the buffer to be written? */
+
+		err = jbd2_journal_next_log_block(journal, &blocknr);
+		/* If the block mapping failed, just abandon the buffer
+		   and repeat this loop: we'll fall into the
+		   refile-on-abort condition above. */
+		if (err) {
+			jbd2_journal_abort(journal, err);
+			continue;
+		}
+
+		/*
+		 * start_this_handle() uses t_outstanding_credits to determine
+		 * the free space in the log, but this counter is changed
+		 * by jbd2_journal_next_log_block() also.
+		 */
+		atomic_dec(&commit_transaction->t_outstanding_credits);
+
+		/* Bump b_count to prevent truncate from stumbling over
+                   the shadowed buffer!  @@@ This can go if we ever get
+                   rid of the shadow pairing of buffers. */
+		atomic_inc(&jh2bh(jh)->b_count);
+
+		/*
+		 * Make a temporary IO buffer with which to write it out
+		 * (this will requeue the metadata buffer to BJ_Shadow).
+		 */
+		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+		JBUFFER_TRACE(jh, "ph3: write metadata");
+		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
+						jh, &wbuf[bufs], blocknr);
+		if (flags < 0) {
+			jbd2_journal_abort(journal, flags);
+			continue;
+		}
+		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
+
+		/* Record the new block's tag in the current descriptor
+                   buffer */
+
+		tag_flag = 0;
+		if (flags & 1)
+			tag_flag |= JBD2_FLAG_ESCAPE;
+		if (!first_tag)
+			tag_flag |= JBD2_FLAG_SAME_UUID;
+
+		tag = (journal_block_tag_t *) tagp;
+		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
+		tag->t_flags = cpu_to_be16(tag_flag);
+		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
+					commit_transaction->t_tid);
+		tagp += tag_bytes;
+		space_left -= tag_bytes;
+		bufs++;
+
+		if (first_tag) {
+			memcpy (tagp, journal->j_uuid, 16);
+			tagp += 16;
+			space_left -= 16;
+			first_tag = 0;
+		}
+
+		/* If there's no more to do, or if the descriptor is full,
+		   let the IO rip! */
+
+		if (bufs == journal->j_wbufsize ||
+		    commit_transaction->t_buffers == NULL ||
+		    space_left < tag_bytes + 16 + csum_size) {
+
+			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
+
+			/* Write an end-of-descriptor marker before
+                           submitting the IOs.  "tag" still points to
+                           the last tag we set up. */
+
+			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
+
+			jbd2_descr_block_csum_set(journal, descriptor);
+start_journal_io:
+			for (i = 0; i < bufs; i++) {
+				struct buffer_head *bh = wbuf[i];
+				/*
+				 * Compute checksum.
+				 */
+				if (JBD2_HAS_COMPAT_FEATURE(journal,
+					JBD2_FEATURE_COMPAT_CHECKSUM)) {
+					crc32_sum =
+					    jbd2_checksum_data(crc32_sum, bh);
+				}
+
+				lock_buffer(bh);
+				clear_buffer_dirty(bh);
+				set_buffer_uptodate(bh);
+				bh->b_end_io = journal_end_buffer_io_sync;
+				submit_bh(WRITE_SYNC, bh);
+			}
+			cond_resched();
+			stats.run.rs_blocks_logged += bufs;
+
+			/* Force a new descriptor to be generated next
+                           time round the loop. */
+			descriptor = NULL;
+			bufs = 0;
+		}
+	}
+
+	err = journal_finish_inode_data_buffers(journal, commit_transaction);
+	if (err) {
+		printk(KERN_WARNING
+			"JBD2: Detected IO errors while flushing file data "
+		       "on %s\n", journal->j_devname);
+		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+			jbd2_journal_abort(journal, err);
+		err = 0;
+	}
+
+	/*
+	 * Get current oldest transaction in the log before we issue flush
+	 * to the filesystem device. After the flush we can be sure that
+	 * blocks of all older transactions are checkpointed to persistent
+	 * storage and we will be safe to update journal start in the
+	 * superblock with the numbers we get here.
+	 */
+	update_tail =
+		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
+
+	write_lock(&journal->j_state_lock);
+	if (update_tail) {
+		long freed = first_block - journal->j_tail;
+
+		if (first_block < journal->j_tail)
+			freed += journal->j_last - journal->j_first;
+		/* Update tail only if we free significant amount of space */
+		if (freed < journal->j_maxlen / 4)
+			update_tail = 0;
+	}
+	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+	commit_transaction->t_state = T_COMMIT_DFLUSH;
+	write_unlock(&journal->j_state_lock);
+
+	/* 
+	 * If the journal is not located on the file system device,
+	 * then we must flush the file system device before we issue
+	 * the commit record
+	 */
+	if (commit_transaction->t_need_data_flush &&
+	    (journal->j_fs_dev != journal->j_dev) &&
+	    (journal->j_flags & JBD2_BARRIER))
+		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
+
+	/* Done it all: now write the commit record asynchronously. */
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+		err = journal_submit_commit_record(journal, commit_transaction,
+						 &cbh, crc32_sum);
+		if (err)
+			__jbd2_journal_abort_hard(journal);
+	}
+
+	blk_finish_plug(&plug);
+
+	/* Lo and behold: we have just managed to send a transaction to
+           the log.  Before we can commit it, wait for the IO so far to
+           complete.  Control buffers being written are on the
+           transaction's t_log_list queue, and metadata buffers are on
+           the io_bufs list.
+
+	   Wait for the buffers in reverse order.  That way we are
+	   less likely to be woken up until all IOs have completed, and
+	   so we incur less scheduling load.
+	*/
+
+	jbd_debug(3, "JBD2: commit phase 3\n");
+
+	while (!list_empty(&io_bufs)) {
+		struct buffer_head *bh = list_entry(io_bufs.prev,
+						    struct buffer_head,
+						    b_assoc_buffers);
+
+		wait_on_buffer(bh);
+		cond_resched();
+
+		if (unlikely(!buffer_uptodate(bh)))
+			err = -EIO;
+		jbd2_unfile_log_bh(bh);
+
+		/*
+		 * The list contains temporary buffer heads created by
+		 * jbd2_journal_write_metadata_buffer().
+		 */
+		BUFFER_TRACE(bh, "dumping temporary bh");
+		__brelse(bh);
+		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
+		free_buffer_head(bh);
+
+		/* We also have to refile the corresponding shadowed buffer */
+		jh = commit_transaction->t_shadow_list->b_tprev;
+		bh = jh2bh(jh);
+		clear_buffer_jwrite(bh);
+		J_ASSERT_BH(bh, buffer_jbddirty(bh));
+		J_ASSERT_BH(bh, !buffer_shadow(bh));
+
+		/* The metadata is now released for reuse, but we need
+                   to remember it against this transaction so that when
+                   we finally commit, we can do any checkpointing
+                   required. */
+		JBUFFER_TRACE(jh, "file as BJ_Forget");
+		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
+		JBUFFER_TRACE(jh, "brelse shadowed buffer");
+		__brelse(bh);
+	}
+
+	J_ASSERT (commit_transaction->t_shadow_list == NULL);
+
+	jbd_debug(3, "JBD2: commit phase 4\n");
+
+	/* Here we wait for the revoke record and descriptor record buffers */
+	while (!list_empty(&log_bufs)) {
+		struct buffer_head *bh;
+
+		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
+		wait_on_buffer(bh);
+		cond_resched();
+
+		if (unlikely(!buffer_uptodate(bh)))
+			err = -EIO;
+
+		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+		clear_buffer_jwrite(bh);
+		jbd2_unfile_log_bh(bh);
+		__brelse(bh);		/* One for getblk */
+		/* AKPM: bforget here */
+	}
+
+	if (err)
+		jbd2_journal_abort(journal, err);
+
+	jbd_debug(3, "JBD2: commit phase 5\n");
+	write_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
+	commit_transaction->t_state = T_COMMIT_JFLUSH;
+	write_unlock(&journal->j_state_lock);
+
+	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+		err = journal_submit_commit_record(journal, commit_transaction,
+						&cbh, crc32_sum);
+		if (err)
+			__jbd2_journal_abort_hard(journal);
+	}
+	if (cbh)
+		err = journal_wait_on_commit_record(journal, cbh);
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+	    journal->j_flags & JBD2_BARRIER) {
+		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
+	}
+
+	if (err)
+		jbd2_journal_abort(journal, err);
+
+	/*
+	 * Now disk caches for filesystem device are flushed so we are safe to
+	 * erase checkpointed transactions from the log by updating journal
+	 * superblock.
+	 */
+	if (update_tail)
+		jbd2_update_log_tail(journal, first_tid, first_block);
+
+	/* End of a transaction!  Finally, we can do checkpoint
+           processing: any buffers committed as a result of this
+           transaction can be removed from any checkpoint list it was on
+           before. */
+
+	jbd_debug(3, "JBD2: commit phase 6\n");
+
+	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
+	J_ASSERT(commit_transaction->t_buffers == NULL);
+	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
+	J_ASSERT(commit_transaction->t_shadow_list == NULL);
+
+restart_loop:
+	/*
+	 * As there are other places (journal_unmap_buffer()) adding buffers
+	 * to this list we have to be careful and hold the j_list_lock.
+	 */
+	spin_lock(&journal->j_list_lock);
+	while (commit_transaction->t_forget) {
+		transaction_t *cp_transaction;
+		struct buffer_head *bh;
+		int try_to_free = 0;
+
+		jh = commit_transaction->t_forget;
+		spin_unlock(&journal->j_list_lock);
+		bh = jh2bh(jh);
+		/*
+		 * Get a reference so that bh cannot be freed before we are
+		 * done with it.
+		 */
+		get_bh(bh);
+		jbd_lock_bh_state(bh);
+		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
+
+		/*
+		 * If there is undo-protected committed data against
+		 * this buffer, then we can remove it now.  If it is a
+		 * buffer needing such protection, the old frozen_data
+		 * field now points to a committed version of the
+		 * buffer, so rotate that field to the new committed
+		 * data.
+		 *
+		 * Otherwise, we can just throw away the frozen data now.
+		 *
+		 * We also know that the frozen data has already fired
+		 * its triggers if they exist, so we can clear that too.
+		 */
+		if (jh->b_committed_data) {
+			jbd2_free(jh->b_committed_data, bh->b_size);
+			jh->b_committed_data = NULL;
+			if (jh->b_frozen_data) {
+				jh->b_committed_data = jh->b_frozen_data;
+				jh->b_frozen_data = NULL;
+				jh->b_frozen_triggers = NULL;
+			}
+		} else if (jh->b_frozen_data) {
+			jbd2_free(jh->b_frozen_data, bh->b_size);
+			jh->b_frozen_data = NULL;
+			jh->b_frozen_triggers = NULL;
+		}
+
+		spin_lock(&journal->j_list_lock);
+		cp_transaction = jh->b_cp_transaction;
+		if (cp_transaction) {
+			JBUFFER_TRACE(jh, "remove from old cp transaction");
+			cp_transaction->t_chp_stats.cs_dropped++;
+			__jbd2_journal_remove_checkpoint(jh);
+		}
+
+		/* Only re-checkpoint the buffer_head if it is marked
+		 * dirty.  If the buffer was added to the BJ_Forget list
+		 * by jbd2_journal_forget, it may no longer be dirty and
+		 * there's no point in keeping a checkpoint record for
+		 * it. */
+
+		/*
+		* A buffer which has been freed while still being journaled by
+		* a previous transaction.
+		*/
+		if (buffer_freed(bh)) {
+			/*
+			 * If the running transaction is the one containing
+			 * "add to orphan" operation (b_next_transaction !=
+			 * NULL), we have to wait for that transaction to
+			 * commit before we can really get rid of the buffer.
+			 * So just clear b_modified to not confuse transaction
+			 * credit accounting and refile the buffer to
+			 * BJ_Forget of the running transaction. If the just
+			 * committed transaction contains "add to orphan"
+			 * operation, we can completely invalidate the buffer
+			 * now. We are rather through in that since the
+			 * buffer may be still accessible when blocksize <
+			 * pagesize and it is attached to the last partial
+			 * page.
+			 */
+			jh->b_modified = 0;
+			if (!jh->b_next_transaction) {
+				clear_buffer_freed(bh);
+				clear_buffer_jbddirty(bh);
+				clear_buffer_mapped(bh);
+				clear_buffer_new(bh);
+				clear_buffer_req(bh);
+				bh->b_bdev = NULL;
+			}
+		}
+
+		if (buffer_jbddirty(bh)) {
+			JBUFFER_TRACE(jh, "add to new checkpointing trans");
+			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
+		} else {
+			J_ASSERT_BH(bh, !buffer_dirty(bh));
+			/*
+			 * The buffer on BJ_Forget list and not jbddirty means
+			 * it has been freed by this transaction and hence it
+			 * could not have been reallocated until this
+			 * transaction has committed. *BUT* it could be
+			 * reallocated once we have written all the data to
+			 * disk and before we process the buffer on BJ_Forget
+			 * list.
+			 */
+			if (!jh->b_next_transaction)
+				try_to_free = 1;
+		}
+		JBUFFER_TRACE(jh, "refile or unfile buffer");
+		__jbd2_journal_refile_buffer(jh);
+		jbd_unlock_bh_state(bh);
+		if (try_to_free)
+			release_buffer_page(bh);	/* Drops bh reference */
+		else
+			__brelse(bh);
+		cond_resched_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+	/*
+	 * This is a bit sleazy.  We use j_list_lock to protect transition
+	 * of a transaction into T_FINISHED state and calling
+	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
+	 * other checkpointing code processing the transaction...
+	 */
+	write_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	/*
+	 * Now recheck if some buffers did not get attached to the transaction
+	 * while the lock was dropped...
+	 */
+	if (commit_transaction->t_forget) {
+		spin_unlock(&journal->j_list_lock);
+		write_unlock(&journal->j_state_lock);
+		goto restart_loop;
+	}
+
+	/* Add the transaction to the checkpoint list
+	 * __journal_remove_checkpoint() can not destroy transaction
+	 * under us because it is not marked as T_FINISHED yet */
+	if (journal->j_checkpoint_transactions == NULL) {
+		journal->j_checkpoint_transactions = commit_transaction;
+		commit_transaction->t_cpnext = commit_transaction;
+		commit_transaction->t_cpprev = commit_transaction;
+	} else {
+		commit_transaction->t_cpnext =
+			journal->j_checkpoint_transactions;
+		commit_transaction->t_cpprev =
+			commit_transaction->t_cpnext->t_cpprev;
+		commit_transaction->t_cpnext->t_cpprev =
+			commit_transaction;
+		commit_transaction->t_cpprev->t_cpnext =
+				commit_transaction;
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	/* Done with this transaction! */
+
+	jbd_debug(3, "JBD2: commit phase 7\n");
+
+	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
+
+	commit_transaction->t_start = jiffies;
+	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
+					      commit_transaction->t_start);
+
+	/*
+	 * File the transaction statistics
+	 */
+	stats.ts_tid = commit_transaction->t_tid;
+	stats.run.rs_handle_count =
+		atomic_read(&commit_transaction->t_handle_count);
+	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
+			     commit_transaction->t_tid, &stats.run);
+	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
+
+	commit_transaction->t_state = T_COMMIT_CALLBACK;
+	J_ASSERT(commit_transaction == journal->j_committing_transaction);
+	journal->j_commit_sequence = commit_transaction->t_tid;
+	journal->j_committing_transaction = NULL;
+	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+	/*
+	 * weight the commit time higher than the average time so we don't
+	 * react too strongly to vast changes in the commit time
+	 */
+	if (likely(journal->j_average_commit_time))
+		journal->j_average_commit_time = (commit_time +
+				journal->j_average_commit_time*3) / 4;
+	else
+		journal->j_average_commit_time = commit_time;
+
+	write_unlock(&journal->j_state_lock);
+
+	if (journal->j_commit_callback)
+		journal->j_commit_callback(journal, commit_transaction);
+
+	trace_jbd2_end_commit(journal, commit_transaction);
+	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+		  journal->j_commit_sequence, journal->j_tail_sequence);
+
+	write_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	commit_transaction->t_state = T_FINISHED;
+	/* Check if the transaction can be dropped now that we are finished */
+	if (commit_transaction->t_checkpoint_list == NULL &&
+	    commit_transaction->t_checkpoint_io_list == NULL) {
+		__jbd2_journal_drop_transaction(journal, commit_transaction);
+		jbd2_journal_free_transaction(commit_transaction);
+	}
+	spin_unlock(&journal->j_list_lock);
+	write_unlock(&journal->j_state_lock);
+	wake_up(&journal->j_wait_done_commit);
+
+	/*
+	 * Calculate overall stats
+	 */
+	spin_lock(&journal->j_history_lock);
+	journal->j_stats.ts_tid++;
+	journal->j_stats.ts_requested += stats.ts_requested;
+	journal->j_stats.run.rs_wait += stats.run.rs_wait;
+	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
+	journal->j_stats.run.rs_running += stats.run.rs_running;
+	journal->j_stats.run.rs_locked += stats.run.rs_locked;
+	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
+	journal->j_stats.run.rs_logging += stats.run.rs_logging;
+	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
+	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
+	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
+	spin_unlock(&journal->j_history_lock);
+}
diff --git a/kernel/fs/jbd2/journal.c b/kernel/fs/jbd2/journal.c
new file mode 100644
index 000000000..b96bd8076
--- /dev/null
+++ b/kernel/fs/jbd2/journal.c
@@ -0,0 +1,2671 @@
+/*
+ * linux/fs/jbd2/journal.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem journal-writing code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages journals: areas of disk reserved for logging
+ * transactional updates.  This includes the kernel journaling thread
+ * which is responsible for scheduling updates to the log.
+ *
+ * We do not actually manage the physical storage of the journal in this
+ * file: that is left to a per-journal policy function, which allows us
+ * to store the journal within a filesystem-specified area for ext2
+ * journaling (ext2 can use a reserved inode for storing the log).
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/freezer.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+#include <linux/poison.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/math64.h>
+#include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>
+#include <linux/backing-dev.h>
+#include <linux/bitops.h>
+#include <linux/ratelimit.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/jbd2.h>
+
+#include <asm/uaccess.h>
+#include <asm/page.h>
+
+#ifdef CONFIG_JBD2_DEBUG
+ushort jbd2_journal_enable_debug __read_mostly;
+EXPORT_SYMBOL(jbd2_journal_enable_debug);
+
+module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
+MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
+#endif
+
+EXPORT_SYMBOL(jbd2_journal_extend);
+EXPORT_SYMBOL(jbd2_journal_stop);
+EXPORT_SYMBOL(jbd2_journal_lock_updates);
+EXPORT_SYMBOL(jbd2_journal_unlock_updates);
+EXPORT_SYMBOL(jbd2_journal_get_write_access);
+EXPORT_SYMBOL(jbd2_journal_get_create_access);
+EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_set_triggers);
+EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
+EXPORT_SYMBOL(jbd2_journal_forget);
+#if 0
+EXPORT_SYMBOL(journal_sync_buffer);
+#endif
+EXPORT_SYMBOL(jbd2_journal_flush);
+EXPORT_SYMBOL(jbd2_journal_revoke);
+
+EXPORT_SYMBOL(jbd2_journal_init_dev);
+EXPORT_SYMBOL(jbd2_journal_init_inode);
+EXPORT_SYMBOL(jbd2_journal_check_used_features);
+EXPORT_SYMBOL(jbd2_journal_check_available_features);
+EXPORT_SYMBOL(jbd2_journal_set_features);
+EXPORT_SYMBOL(jbd2_journal_load);
+EXPORT_SYMBOL(jbd2_journal_destroy);
+EXPORT_SYMBOL(jbd2_journal_abort);
+EXPORT_SYMBOL(jbd2_journal_errno);
+EXPORT_SYMBOL(jbd2_journal_ack_err);
+EXPORT_SYMBOL(jbd2_journal_clear_err);
+EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
+EXPORT_SYMBOL(jbd2_journal_start_commit);
+EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
+EXPORT_SYMBOL(jbd2_journal_wipe);
+EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
+EXPORT_SYMBOL(jbd2_journal_invalidatepage);
+EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
+EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
+EXPORT_SYMBOL(jbd2_inode_cache);
+
+static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);
+
+#ifdef CONFIG_JBD2_DEBUG
+void __jbd2_debug(int level, const char *file, const char *func,
+		  unsigned int line, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (level > jbd2_journal_enable_debug)
+		return;
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+	va_end(args);
+}
+EXPORT_SYMBOL(__jbd2_debug);
+#endif
+
+/* Checksumming functions */
+static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
+{
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return 1;
+
+	return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
+}
+
+static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+{
+	__u32 csum;
+	__be32 old_csum;
+
+	old_csum = sb->s_checksum;
+	sb->s_checksum = 0;
+	csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
+	sb->s_checksum = old_csum;
+
+	return cpu_to_be32(csum);
+}
+
+static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
+{
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return 1;
+
+	return sb->s_checksum == jbd2_superblock_csum(j, sb);
+}
+
+static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
+{
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return;
+
+	sb->s_checksum = jbd2_superblock_csum(j, sb);
+}
+
+/*
+ * Helper function used to manage commit timeouts
+ */
+
+static void commit_timeout(unsigned long __data)
+{
+	struct task_struct * p = (struct task_struct *) __data;
+
+	wake_up_process(p);
+}
+
+/*
+ * kjournald2: The main thread function used to manage a logging device
+ * journal.
+ *
+ * This kernel thread is responsible for two things:
+ *
+ * 1) COMMIT:  Every so often we need to commit the current state of the
+ *    filesystem to disk.  The journal thread is responsible for writing
+ *    all of the metadata buffers to disk.
+ *
+ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
+ *    of the data in that part of the log has been rewritten elsewhere on
+ *    the disk.  Flushing these old buffers to reclaim space in the log is
+ *    known as checkpointing, and this thread is responsible for that job.
+ */
+
+static int kjournald2(void *arg)
+{
+	journal_t *journal = arg;
+	transaction_t *transaction;
+
+	/*
+	 * Set up an interval timer which can be used to trigger a commit wakeup
+	 * after the commit interval expires
+	 */
+	setup_timer(&journal->j_commit_timer, commit_timeout,
+			(unsigned long)current);
+
+	set_freezable();
+
+	/* Record that the journal thread is running */
+	journal->j_task = current;
+	wake_up(&journal->j_wait_done_commit);
+
+	/*
+	 * And now, wait forever for commit wakeup events.
+	 */
+	write_lock(&journal->j_state_lock);
+
+loop:
+	if (journal->j_flags & JBD2_UNMOUNT)
+		goto end_loop;
+
+	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+		journal->j_commit_sequence, journal->j_commit_request);
+
+	if (journal->j_commit_sequence != journal->j_commit_request) {
+		jbd_debug(1, "OK, requests differ\n");
+		write_unlock(&journal->j_state_lock);
+		del_timer_sync(&journal->j_commit_timer);
+		jbd2_journal_commit_transaction(journal);
+		write_lock(&journal->j_state_lock);
+		goto loop;
+	}
+
+	wake_up(&journal->j_wait_done_commit);
+	if (freezing(current)) {
+		/*
+		 * The simpler the better. Flushing journal isn't a
+		 * good idea, because that depends on threads that may
+		 * be already stopped.
+		 */
+		jbd_debug(1, "Now suspending kjournald2\n");
+		write_unlock(&journal->j_state_lock);
+		try_to_freeze();
+		write_lock(&journal->j_state_lock);
+	} else {
+		/*
+		 * We assume on resume that commits are already there,
+		 * so we don't sleep
+		 */
+		DEFINE_WAIT(wait);
+		int should_sleep = 1;
+
+		prepare_to_wait(&journal->j_wait_commit, &wait,
+				TASK_INTERRUPTIBLE);
+		if (journal->j_commit_sequence != journal->j_commit_request)
+			should_sleep = 0;
+		transaction = journal->j_running_transaction;
+		if (transaction && time_after_eq(jiffies,
+						transaction->t_expires))
+			should_sleep = 0;
+		if (journal->j_flags & JBD2_UNMOUNT)
+			should_sleep = 0;
+		if (should_sleep) {
+			write_unlock(&journal->j_state_lock);
+			schedule();
+			write_lock(&journal->j_state_lock);
+		}
+		finish_wait(&journal->j_wait_commit, &wait);
+	}
+
+	jbd_debug(1, "kjournald2 wakes\n");
+
+	/*
+	 * Were we woken up by a commit wakeup event?
+	 */
+	transaction = journal->j_running_transaction;
+	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
+		journal->j_commit_request = transaction->t_tid;
+		jbd_debug(1, "woke because of timeout\n");
+	}
+	goto loop;
+
+end_loop:
+	write_unlock(&journal->j_state_lock);
+	del_timer_sync(&journal->j_commit_timer);
+	journal->j_task = NULL;
+	wake_up(&journal->j_wait_done_commit);
+	jbd_debug(1, "Journal thread exiting.\n");
+	return 0;
+}
+
+static int jbd2_journal_start_thread(journal_t *journal)
+{
+	struct task_struct *t;
+
+	t = kthread_run(kjournald2, journal, "jbd2/%s",
+			journal->j_devname);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
+	return 0;
+}
+
+static void journal_kill_thread(journal_t *journal)
+{
+	write_lock(&journal->j_state_lock);
+	journal->j_flags |= JBD2_UNMOUNT;
+
+	while (journal->j_task) {
+		write_unlock(&journal->j_state_lock);
+		wake_up(&journal->j_wait_commit);
+		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
+		write_lock(&journal->j_state_lock);
+	}
+	write_unlock(&journal->j_state_lock);
+}
+
+/*
+ * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
+ *
+ * Writes a metadata buffer to a given disk block.  The actual IO is not
+ * performed but a new buffer_head is constructed which labels the data
+ * to be written with the correct destination disk block.
+ *
+ * Any magic-number escaping which needs to be done will cause a
+ * copy-out here.  If the buffer happens to start with the
+ * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
+ * magic number is only written to the log for descripter blocks.  In
+ * this case, we copy the data and replace the first word with 0, and we
+ * return a result code which indicates that this buffer needs to be
+ * marked as an escaped buffer in the corresponding log descriptor
+ * block.  The missing word can then be restored when the block is read
+ * during recovery.
+ *
+ * If the source buffer has already been modified by a new transaction
+ * since we took the last commit snapshot, we use the frozen copy of
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we have to make sure nobody modifies it while the
+ * IO is in progress. do_get_write_access() handles this.
+ *
+ * The function returns a pointer to the buffer_head to be used for IO.
+ * 
+ *
+ * Return value:
+ *  <0: Error
+ * >=0: Finished OK
+ *
+ * On success:
+ * Bit 0 set == escape performed on the data
+ * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+ */
+
+int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
+				  struct journal_head  *jh_in,
+				  struct buffer_head **bh_out,
+				  sector_t blocknr)
+{
+	int need_copy_out = 0;
+	int done_copy_out = 0;
+	int do_escape = 0;
+	char *mapped_data;
+	struct buffer_head *new_bh;
+	struct page *new_page;
+	unsigned int new_offset;
+	struct buffer_head *bh_in = jh2bh(jh_in);
+	journal_t *journal = transaction->t_journal;
+
+	/*
+	 * The buffer really shouldn't be locked: only the current committing
+	 * transaction is allowed to write it, so nobody else is allowed
+	 * to do any IO.
+	 *
+	 * akpm: except if we're journalling data, and write() output is
+	 * also part of a shared mapping, and another thread has
+	 * decided to launch a writepage() against this buffer.
+	 */
+	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
+
+retry_alloc:
+	new_bh = alloc_buffer_head(GFP_NOFS);
+	if (!new_bh) {
+		/*
+		 * Failure is not an option, but __GFP_NOFAIL is going
+		 * away; so we retry ourselves here.
+		 */
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		goto retry_alloc;
+	}
+
+	/* keep subsequent assertions sane */
+	atomic_set(&new_bh->b_count, 1);
+
+	jbd_lock_bh_state(bh_in);
+repeat:
+	/*
+	 * If a new transaction has already done a buffer copy-out, then
+	 * we use that version of the data for the commit.
+	 */
+	if (jh_in->b_frozen_data) {
+		done_copy_out = 1;
+		new_page = virt_to_page(jh_in->b_frozen_data);
+		new_offset = offset_in_page(jh_in->b_frozen_data);
+	} else {
+		new_page = jh2bh(jh_in)->b_page;
+		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+	}
+
+	mapped_data = kmap_atomic(new_page);
+	/*
+	 * Fire data frozen trigger if data already wasn't frozen.  Do this
+	 * before checking for escaping, as the trigger may modify the magic
+	 * offset.  If a copy-out happens afterwards, it will have the correct
+	 * data in the buffer.
+	 */
+	if (!done_copy_out)
+		jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+					   jh_in->b_triggers);
+
+	/*
+	 * Check for escaping
+	 */
+	if (*((__be32 *)(mapped_data + new_offset)) ==
+				cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+		need_copy_out = 1;
+		do_escape = 1;
+	}
+	kunmap_atomic(mapped_data);
+
+	/*
+	 * Do we need to do a data copy?
+	 */
+	if (need_copy_out && !done_copy_out) {
+		char *tmp;
+
+		jbd_unlock_bh_state(bh_in);
+		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
+		if (!tmp) {
+			brelse(new_bh);
+			return -ENOMEM;
+		}
+		jbd_lock_bh_state(bh_in);
+		if (jh_in->b_frozen_data) {
+			jbd2_free(tmp, bh_in->b_size);
+			goto repeat;
+		}
+
+		jh_in->b_frozen_data = tmp;
+		mapped_data = kmap_atomic(new_page);
+		memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
+		kunmap_atomic(mapped_data);
+
+		new_page = virt_to_page(tmp);
+		new_offset = offset_in_page(tmp);
+		done_copy_out = 1;
+
+		/*
+		 * This isn't strictly necessary, as we're using frozen
+		 * data for the escaping, but it keeps consistency with
+		 * b_frozen_data usage.
+		 */
+		jh_in->b_frozen_triggers = jh_in->b_triggers;
+	}
+
+	/*
+	 * Did we need to do an escaping?  Now we've done all the
+	 * copying, we can finally do so.
+	 */
+	if (do_escape) {
+		mapped_data = kmap_atomic(new_page);
+		*((unsigned int *)(mapped_data + new_offset)) = 0;
+		kunmap_atomic(mapped_data);
+	}
+
+	set_bh_page(new_bh, new_page, new_offset);
+	new_bh->b_size = bh_in->b_size;
+	new_bh->b_bdev = journal->j_dev;
+	new_bh->b_blocknr = blocknr;
+	new_bh->b_private = bh_in;
+	set_buffer_mapped(new_bh);
+	set_buffer_dirty(new_bh);
+
+	*bh_out = new_bh;
+
+	/*
+	 * The to-be-written buffer needs to get moved to the io queue,
+	 * and the original buffer whose contents we are shadowing or
+	 * copying is moved to the transaction's shadow queue.
+	 */
+	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	spin_unlock(&journal->j_list_lock);
+	set_buffer_shadow(bh_in);
+	jbd_unlock_bh_state(bh_in);
+
+	return do_escape | (done_copy_out << 1);
+}
+
+/*
+ * Allocation code for the journal file.  Manage the space left in the
+ * journal, so that we can begin checkpointing when appropriate.
+ */
+
+/*
+ * Called with j_state_lock locked for writing.
+ * Returns true if a transaction commit was started.
+ */
+int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+{
+	/* Return if the txn has already requested to be committed */
+	if (journal->j_commit_request == target)
+		return 0;
+
+	/*
+	 * The only transaction we can possibly wait upon is the
+	 * currently running transaction (if it exists).  Otherwise,
+	 * the target tid must be an old one.
+	 */
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == target) {
+		/*
+		 * We want a new commit: OK, mark the request and wakeup the
+		 * commit thread.  We do _not_ do the commit ourselves.
+		 */
+
+		journal->j_commit_request = target;
+		jbd_debug(1, "JBD2: requesting commit %d/%d\n",
+			  journal->j_commit_request,
+			  journal->j_commit_sequence);
+		journal->j_running_transaction->t_requested = jiffies;
+		wake_up(&journal->j_wait_commit);
+		return 1;
+	} else if (!tid_geq(journal->j_commit_request, target))
+		/* This should never happen, but if it does, preserve
+		   the evidence before kjournald goes into a loop and
+		   increments j_commit_sequence beyond all recognition. */
+		WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
+			  journal->j_commit_request,
+			  journal->j_commit_sequence,
+			  target, journal->j_running_transaction ? 
+			  journal->j_running_transaction->t_tid : 0);
+	return 0;
+}
+
+int jbd2_log_start_commit(journal_t *journal, tid_t tid)
+{
+	int ret;
+
+	write_lock(&journal->j_state_lock);
+	ret = __jbd2_log_start_commit(journal, tid);
+	write_unlock(&journal->j_state_lock);
+	return ret;
+}
+
+/*
+ * Force and wait any uncommitted transactions.  We can only force the running
+ * transaction if we don't have an active handle, otherwise, we will deadlock.
+ * Returns: <0 in case of error,
+ *           0 if nothing to commit,
+ *           1 if transaction was successfully committed.
+ */
+static int __jbd2_journal_force_commit(journal_t *journal)
+{
+	transaction_t *transaction = NULL;
+	tid_t tid;
+	int need_to_start = 0, ret = 0;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction && !current->journal_info) {
+		transaction = journal->j_running_transaction;
+		if (!tid_geq(journal->j_commit_request, transaction->t_tid))
+			need_to_start = 1;
+	} else if (journal->j_committing_transaction)
+		transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		/* Nothing to commit */
+		read_unlock(&journal->j_state_lock);
+		return 0;
+	}
+	tid = transaction->t_tid;
+	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
+	ret = jbd2_log_wait_commit(journal, tid);
+	if (!ret)
+		ret = 1;
+
+	return ret;
+}
+
+/**
+ * Force and wait upon a commit if the calling process is not within
+ * transaction.  This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * @journal: journal to force
+ * Returns true if progress was made.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+	int ret;
+
+	ret = __jbd2_journal_force_commit(journal);
+	return ret > 0;
+}
+
+/**
+ * int journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * Caller want unconditional commit. We can only force the running transaction
+ * if we don't have an active handle, otherwise, we will deadlock.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+	int ret;
+
+	J_ASSERT(!current->journal_info);
+	ret = __jbd2_journal_force_commit(journal);
+	if (ret > 0)
+		ret = 0;
+	return ret;
+}
+
+/*
+ * Start a commit of the current running transaction (if any).  Returns true
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
+ */
+int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
+{
+	int ret = 0;
+
+	write_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction) {
+		tid_t tid = journal->j_running_transaction->t_tid;
+
+		__jbd2_log_start_commit(journal, tid);
+		/* There's a running transaction and we've just made sure
+		 * it's commit has been scheduled. */
+		if (ptid)
+			*ptid = tid;
+		ret = 1;
+	} else if (journal->j_committing_transaction) {
+		/*
+		 * If commit has been started, then we have to wait for
+		 * completion of that transaction.
+		 */
+		if (ptid)
+			*ptid = journal->j_committing_transaction->t_tid;
+		ret = 1;
+	}
+	write_unlock(&journal->j_state_lock);
+	return ret;
+}
+
+/*
+ * Return 1 if a given transaction has not yet sent barrier request
+ * connected with a transaction commit. If 0 is returned, transaction
+ * may or may not have sent the barrier. Used to avoid sending barrier
+ * twice in common cases.
+ */
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+	int ret = 0;
+	transaction_t *commit_trans;
+
+	if (!(journal->j_flags & JBD2_BARRIER))
+		return 0;
+	read_lock(&journal->j_state_lock);
+	/* Transaction already committed? */
+	if (tid_geq(journal->j_commit_sequence, tid))
+		goto out;
+	commit_trans = journal->j_committing_transaction;
+	if (!commit_trans || commit_trans->t_tid != tid) {
+		ret = 1;
+		goto out;
+	}
+	/*
+	 * Transaction is being committed and we already proceeded to
+	 * submitting a flush to fs partition?
+	 */
+	if (journal->j_fs_dev != journal->j_dev) {
+		if (!commit_trans->t_need_data_flush ||
+		    commit_trans->t_state >= T_COMMIT_DFLUSH)
+			goto out;
+	} else {
+		if (commit_trans->t_state >= T_COMMIT_JFLUSH)
+			goto out;
+	}
+	ret = 1;
+out:
+	read_unlock(&journal->j_state_lock);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
+
+/*
+ * Wait for a specified commit to complete.
+ * The caller may not hold the journal lock.
+ */
+int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
+{
+	int err = 0;
+
+	read_lock(&journal->j_state_lock);
+#ifdef CONFIG_JBD2_DEBUG
+	if (!tid_geq(journal->j_commit_request, tid)) {
+		printk(KERN_ERR
+		       "%s: error: j_commit_request=%d, tid=%d\n",
+		       __func__, journal->j_commit_request, tid);
+	}
+#endif
+	while (tid_gt(tid, journal->j_commit_sequence)) {
+		jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
+				  tid, journal->j_commit_sequence);
+		read_unlock(&journal->j_state_lock);
+		wake_up(&journal->j_wait_commit);
+		wait_event(journal->j_wait_done_commit,
+				!tid_gt(tid, journal->j_commit_sequence));
+		read_lock(&journal->j_state_lock);
+	}
+	read_unlock(&journal->j_state_lock);
+
+	if (unlikely(is_journal_aborted(journal)))
+		err = -EIO;
+	return err;
+}
+
+/*
+ * When this function returns the transaction corresponding to tid
+ * will be completed.  If the transaction has currently running, start
+ * committing that transaction before waiting for it to complete.  If
+ * the transaction id is stale, it is by definition already completed,
+ * so just return SUCCESS.
+ */
+int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+{
+	int	need_to_wait = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid) {
+		if (journal->j_commit_request != tid) {
+			/* transaction not yet started, so request it */
+			read_unlock(&journal->j_state_lock);
+			jbd2_log_start_commit(journal, tid);
+			goto wait_commit;
+		}
+	} else if (!(journal->j_committing_transaction &&
+		     journal->j_committing_transaction->t_tid == tid))
+		need_to_wait = 0;
+	read_unlock(&journal->j_state_lock);
+	if (!need_to_wait)
+		return 0;
+wait_commit:
+	return jbd2_log_wait_commit(journal, tid);
+}
+EXPORT_SYMBOL(jbd2_complete_transaction);
+
+/*
+ * Log buffer allocation routines:
+ */
+
+int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
+{
+	unsigned long blocknr;
+
+	write_lock(&journal->j_state_lock);
+	J_ASSERT(journal->j_free > 1);
+
+	blocknr = journal->j_head;
+	journal->j_head++;
+	journal->j_free--;
+	if (journal->j_head == journal->j_last)
+		journal->j_head = journal->j_first;
+	write_unlock(&journal->j_state_lock);
+	return jbd2_journal_bmap(journal, blocknr, retp);
+}
+
+/*
+ * Conversion of logical to physical block numbers for the journal
+ *
+ * On external journals the journal blocks are identity-mapped, so
+ * this is a no-op.  If needed, we can use j_blk_offset - everything is
+ * ready.
+ */
+int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
+		 unsigned long long *retp)
+{
+	int err = 0;
+	unsigned long long ret;
+
+	if (journal->j_inode) {
+		ret = bmap(journal->j_inode, blocknr);
+		if (ret)
+			*retp = ret;
+		else {
+			printk(KERN_ALERT "%s: journal block not found "
+					"at offset %lu on %s\n",
+			       __func__, blocknr, journal->j_devname);
+			err = -EIO;
+			__journal_abort_soft(journal, err);
+		}
+	} else {
+		*retp = blocknr; /* +journal->j_blk_offset */
+	}
+	return err;
+}
+
+/*
+ * We play buffer_head aliasing tricks to write data/metadata blocks to
+ * the journal without copying their contents, but for journal
+ * descriptor blocks we do need to generate bona fide buffers.
+ *
+ * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
+ * the buffer's contents they really should run flush_dcache_page(bh->b_page).
+ * But we don't bother doing that, so there will be coherency problems with
+ * mmaps of blockdevs which hold live JBD-controlled filesystems.
+ */
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+{
+	struct buffer_head *bh;
+	unsigned long long blocknr;
+	int err;
+
+	err = jbd2_journal_next_log_block(journal, &blocknr);
+
+	if (err)
+		return NULL;
+
+	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return NULL;
+	lock_buffer(bh);
+	memset(bh->b_data, 0, journal->j_blocksize);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	BUFFER_TRACE(bh, "return this buffer");
+	return bh;
+}
+
+/*
+ * Return tid of the oldest transaction in the journal and block in the journal
+ * where the transaction starts.
+ *
+ * If the journal is now empty, return which will be the next transaction ID
+ * we will write and where will that transaction start.
+ *
+ * The return value is 0 if journal tail cannot be pushed any further, 1 if
+ * it can.
+ */
+int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
+			      unsigned long *block)
+{
+	transaction_t *transaction;
+	int ret;
+
+	read_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	transaction = journal->j_checkpoint_transactions;
+	if (transaction) {
+		*tid = transaction->t_tid;
+		*block = transaction->t_log_start;
+	} else if ((transaction = journal->j_committing_transaction) != NULL) {
+		*tid = transaction->t_tid;
+		*block = transaction->t_log_start;
+	} else if ((transaction = journal->j_running_transaction) != NULL) {
+		*tid = transaction->t_tid;
+		*block = journal->j_head;
+	} else {
+		*tid = journal->j_transaction_sequence;
+		*block = journal->j_head;
+	}
+	ret = tid_gt(*tid, journal->j_tail_sequence);
+	spin_unlock(&journal->j_list_lock);
+	read_unlock(&journal->j_state_lock);
+
+	return ret;
+}
+
+/*
+ * Update information in journal structure and in on disk journal superblock
+ * about log tail. This function does not check whether information passed in
+ * really pushes log tail further. It's responsibility of the caller to make
+ * sure provided log tail information is valid (e.g. by holding
+ * j_checkpoint_mutex all the time between computing log tail and calling this
+ * function as is the case with jbd2_cleanup_journal_tail()).
+ *
+ * Requires j_checkpoint_mutex
+ */
+void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+{
+	unsigned long freed;
+
+	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+
+	/*
+	 * We cannot afford for write to remain in drive's caches since as
+	 * soon as we update j_tail, next transaction can start reusing journal
+	 * space and if we lose sb update during power failure we'd replay
+	 * old transaction with possibly newly overwritten data.
+	 */
+	jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+	write_lock(&journal->j_state_lock);
+	freed = block - journal->j_tail;
+	if (block < journal->j_tail)
+		freed += journal->j_last - journal->j_first;
+
+	trace_jbd2_update_log_tail(journal, tid, block, freed);
+	jbd_debug(1,
+		  "Cleaning journal tail from %d to %d (offset %lu), "
+		  "freeing %lu\n",
+		  journal->j_tail_sequence, tid, block, freed);
+
+	journal->j_free += freed;
+	journal->j_tail_sequence = tid;
+	journal->j_tail = block;
+	write_unlock(&journal->j_state_lock);
+}
+
+/*
+ * This is a variaon of __jbd2_update_log_tail which checks for validity of
+ * provided log tail and locks j_checkpoint_mutex. So it is safe against races
+ * with other threads updating log tail.
+ */
+void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+{
+	mutex_lock(&journal->j_checkpoint_mutex);
+	if (tid_gt(tid, journal->j_tail_sequence))
+		__jbd2_update_log_tail(journal, tid, block);
+	mutex_unlock(&journal->j_checkpoint_mutex);
+}
+
+struct jbd2_stats_proc_session {
+	journal_t *journal;
+	struct transaction_stats_s *stats;
+	int start;
+	int max;
+};
+
+static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
+{
+	return *pos ? NULL : SEQ_START_TOKEN;
+}
+
+static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return NULL;
+}
+
+static int jbd2_seq_info_show(struct seq_file *seq, void *v)
+{
+	struct jbd2_stats_proc_session *s = seq->private;
+
+	if (v != SEQ_START_TOKEN)
+		return 0;
+	seq_printf(seq, "%lu transactions (%lu requested), "
+		   "each up to %u blocks\n",
+		   s->stats->ts_tid, s->stats->ts_requested,
+		   s->journal->j_max_transaction_buffers);
+	if (s->stats->ts_tid == 0)
+		return 0;
+	seq_printf(seq, "average: \n  %ums waiting for transaction\n",
+	    jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
+	seq_printf(seq, "  %ums request delay\n",
+	    (s->stats->ts_requested == 0) ? 0 :
+	    jiffies_to_msecs(s->stats->run.rs_request_delay /
+			     s->stats->ts_requested));
+	seq_printf(seq, "  %ums running transaction\n",
+	    jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
+	seq_printf(seq, "  %ums transaction was being locked\n",
+	    jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
+	seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
+	    jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
+	seq_printf(seq, "  %ums logging transaction\n",
+	    jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
+	seq_printf(seq, "  %lluus average transaction commit time\n",
+		   div_u64(s->journal->j_average_commit_time, 1000));
+	seq_printf(seq, "  %lu handles per transaction\n",
+	    s->stats->run.rs_handle_count / s->stats->ts_tid);
+	seq_printf(seq, "  %lu blocks per transaction\n",
+	    s->stats->run.rs_blocks / s->stats->ts_tid);
+	seq_printf(seq, "  %lu logged blocks per transaction\n",
+	    s->stats->run.rs_blocks_logged / s->stats->ts_tid);
+	return 0;
+}
+
+static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations jbd2_seq_info_ops = {
+	.start  = jbd2_seq_info_start,
+	.next   = jbd2_seq_info_next,
+	.stop   = jbd2_seq_info_stop,
+	.show   = jbd2_seq_info_show,
+};
+
+static int jbd2_seq_info_open(struct inode *inode, struct file *file)
+{
+	journal_t *journal = PDE_DATA(inode);
+	struct jbd2_stats_proc_session *s;
+	int rc, size;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL)
+		return -ENOMEM;
+	size = sizeof(struct transaction_stats_s);
+	s->stats = kmalloc(size, GFP_KERNEL);
+	if (s->stats == NULL) {
+		kfree(s);
+		return -ENOMEM;
+	}
+	spin_lock(&journal->j_history_lock);
+	memcpy(s->stats, &journal->j_stats, size);
+	s->journal = journal;
+	spin_unlock(&journal->j_history_lock);
+
+	rc = seq_open(file, &jbd2_seq_info_ops);
+	if (rc == 0) {
+		struct seq_file *m = file->private_data;
+		m->private = s;
+	} else {
+		kfree(s->stats);
+		kfree(s);
+	}
+	return rc;
+
+}
+
+static int jbd2_seq_info_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct jbd2_stats_proc_session *s = seq->private;
+	kfree(s->stats);
+	kfree(s);
+	return seq_release(inode, file);
+}
+
+static const struct file_operations jbd2_seq_info_fops = {
+	.owner		= THIS_MODULE,
+	.open           = jbd2_seq_info_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = jbd2_seq_info_release,
+};
+
+static struct proc_dir_entry *proc_jbd2_stats;
+
+static void jbd2_stats_proc_init(journal_t *journal)
+{
+	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
+	if (journal->j_proc_entry) {
+		proc_create_data("info", S_IRUGO, journal->j_proc_entry,
+				 &jbd2_seq_info_fops, journal);
+	}
+}
+
+static void jbd2_stats_proc_exit(journal_t *journal)
+{
+	remove_proc_entry("info", journal->j_proc_entry);
+	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
+}
+
+/*
+ * Management for journal control blocks: functions to create and
+ * destroy journal_t structures, and to initialise and read existing
+ * journal blocks from disk.  */
+
+/* First: create and setup a journal_t object in memory.  We initialise
+ * very few fields yet: that has to wait until we have created the
+ * journal structures from from scratch, or loaded them from disk. */
+
+static journal_t * journal_init_common (void)
+{
+	journal_t *journal;
+	int err;
+
+	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
+	if (!journal)
+		return NULL;
+
+	init_waitqueue_head(&journal->j_wait_transaction_locked);
+	init_waitqueue_head(&journal->j_wait_done_commit);
+	init_waitqueue_head(&journal->j_wait_commit);
+	init_waitqueue_head(&journal->j_wait_updates);
+	init_waitqueue_head(&journal->j_wait_reserved);
+	mutex_init(&journal->j_barrier);
+	mutex_init(&journal->j_checkpoint_mutex);
+	spin_lock_init(&journal->j_revoke_lock);
+	spin_lock_init(&journal->j_list_lock);
+	rwlock_init(&journal->j_state_lock);
+
+	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+	journal->j_min_batch_time = 0;
+	journal->j_max_batch_time = 15000; /* 15ms */
+	atomic_set(&journal->j_reserved_credits, 0);
+
+	/* The journal is marked for error until we succeed with recovery! */
+	journal->j_flags = JBD2_ABORT;
+
+	/* Set up a default-sized revoke table for the new mount. */
+	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+	if (err) {
+		kfree(journal);
+		return NULL;
+	}
+
+	spin_lock_init(&journal->j_history_lock);
+
+	return journal;
+}
+
+/* jbd2_journal_init_dev and jbd2_journal_init_inode:
+ *
+ * Create a journal structure assigned some fixed set of disk blocks to
+ * the journal.  We don't actually touch those disk blocks yet, but we
+ * need to set up all of the mapping information to tell the journaling
+ * system where the journal blocks are.
+ *
+ */
+
+/**
+ *  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
+ *  @bdev: Block device on which to create the journal
+ *  @fs_dev: Device which hold journalled filesystem for this journal.
+ *  @start: Block nr Start of journal.
+ *  @len:  Length of the journal in blocks.
+ *  @blocksize: blocksize of journalling device
+ *
+ *  Returns: a newly created journal_t *
+ *
+ *  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
+ *  range of blocks on an arbitrary block device.
+ *
+ */
+journal_t * jbd2_journal_init_dev(struct block_device *bdev,
+			struct block_device *fs_dev,
+			unsigned long long start, int len, int blocksize)
+{
+	journal_t *journal = journal_init_common();
+	struct buffer_head *bh;
+	char *p;
+	int n;
+
+	if (!journal)
+		return NULL;
+
+	/* journal descriptor can store up to n blocks -bzzz */
+	journal->j_blocksize = blocksize;
+	journal->j_dev = bdev;
+	journal->j_fs_dev = fs_dev;
+	journal->j_blk_offset = start;
+	journal->j_maxlen = len;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
+	jbd2_stats_proc_init(journal);
+	n = journal->j_blocksize / sizeof(journal_block_tag_t);
+	journal->j_wbufsize = n;
+	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
+	if (!journal->j_wbuf) {
+		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
+			__func__);
+		goto out_err;
+	}
+
+	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
+	journal->j_sb_buffer = bh;
+	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+	return journal;
+out_err:
+	kfree(journal->j_wbuf);
+	jbd2_stats_proc_exit(journal);
+	kfree(journal);
+	return NULL;
+}
+
+/**
+ *  journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
+ *  @inode: An inode to create the journal in
+ *
+ * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
+ * the journal.  The inode must exist already, must support bmap() and
+ * must have all data blocks preallocated.
+ */
+journal_t * jbd2_journal_init_inode (struct inode *inode)
+{
+	struct buffer_head *bh;
+	journal_t *journal = journal_init_common();
+	char *p;
+	int err;
+	int n;
+	unsigned long long blocknr;
+
+	if (!journal)
+		return NULL;
+
+	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
+	journal->j_inode = inode;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
+	p = journal->j_devname + strlen(journal->j_devname);
+	sprintf(p, "-%lu", journal->j_inode->i_ino);
+	jbd_debug(1,
+		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+		  journal, inode->i_sb->s_id, inode->i_ino,
+		  (long long) inode->i_size,
+		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+
+	journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
+	journal->j_blocksize = inode->i_sb->s_blocksize;
+	jbd2_stats_proc_init(journal);
+
+	/* journal descriptor can store up to n blocks -bzzz */
+	n = journal->j_blocksize / sizeof(journal_block_tag_t);
+	journal->j_wbufsize = n;
+	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
+	if (!journal->j_wbuf) {
+		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
+			__func__);
+		goto out_err;
+	}
+
+	err = jbd2_journal_bmap(journal, 0, &blocknr);
+	/* If that failed, give up */
+	if (err) {
+		printk(KERN_ERR "%s: Cannot locate journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
+
+	bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
+	journal->j_sb_buffer = bh;
+	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+	return journal;
+out_err:
+	kfree(journal->j_wbuf);
+	jbd2_stats_proc_exit(journal);
+	kfree(journal);
+	return NULL;
+}
+
+/*
+ * If the journal init or create aborts, we need to mark the journal
+ * superblock as being NULL to prevent the journal destroy from writing
+ * back a bogus superblock.
+ */
+static void journal_fail_superblock (journal_t *journal)
+{
+	struct buffer_head *bh = journal->j_sb_buffer;
+	brelse(bh);
+	journal->j_sb_buffer = NULL;
+}
+
+/*
+ * Given a journal_t structure, initialise the various fields for
+ * startup of a new journaling session.  We use this both when creating
+ * a journal, and after recovering an old journal to reset it for
+ * subsequent use.
+ */
+
+static int journal_reset(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+	unsigned long long first, last;
+
+	first = be32_to_cpu(sb->s_first);
+	last = be32_to_cpu(sb->s_maxlen);
+	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+		printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
+		       first, last);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
+
+	journal->j_first = first;
+	journal->j_last = last;
+
+	journal->j_head = first;
+	journal->j_tail = first;
+	journal->j_free = last - first;
+
+	journal->j_tail_sequence = journal->j_transaction_sequence;
+	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+	journal->j_commit_request = journal->j_commit_sequence;
+
+	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+
+	/*
+	 * As a special case, if the on-disk copy is already marked as needing
+	 * no recovery (s_start == 0), then we can safely defer the superblock
+	 * update until the next commit by setting JBD2_FLUSHED.  This avoids
+	 * attempting a write to a potential-readonly device.
+	 */
+	if (sb->s_start == 0) {
+		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
+			"(start %ld, seq %d, errno %d)\n",
+			journal->j_tail, journal->j_tail_sequence,
+			journal->j_errno);
+		journal->j_flags |= JBD2_FLUSHED;
+	} else {
+		/* Lock here to make assertions happy... */
+		mutex_lock(&journal->j_checkpoint_mutex);
+		/*
+		 * Update log tail information. We use WRITE_FUA since new
+		 * transaction will start reusing journal space and so we
+		 * must make sure information about current log tail is on
+		 * disk before that.
+		 */
+		jbd2_journal_update_sb_log_tail(journal,
+						journal->j_tail_sequence,
+						journal->j_tail,
+						WRITE_FUA);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	}
+	return jbd2_journal_start_thread(journal);
+}
+
+static void jbd2_write_superblock(journal_t *journal, int write_op)
+{
+	struct buffer_head *bh = journal->j_sb_buffer;
+	journal_superblock_t *sb = journal->j_superblock;
+	int ret;
+
+	trace_jbd2_write_superblock(journal, write_op);
+	if (!(journal->j_flags & JBD2_BARRIER))
+		write_op &= ~(REQ_FUA | REQ_FLUSH);
+	lock_buffer(bh);
+	if (buffer_write_io_error(bh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the journal
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "JBD2: previous I/O error detected "
+		       "for journal superblock update for %s.\n",
+		       journal->j_devname);
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+	}
+	jbd2_superblock_csum_set(journal, sb);
+	get_bh(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	ret = submit_bh(write_op, bh);
+	wait_on_buffer(bh);
+	if (buffer_write_io_error(bh)) {
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+		ret = -EIO;
+	}
+	if (ret) {
+		printk(KERN_ERR "JBD2: Error %d detected when updating "
+		       "journal superblock for %s.\n", ret,
+		       journal->j_devname);
+	}
+}
+
+/**
+ * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
+ * @journal: The journal to update.
+ * @tail_tid: TID of the new transaction at the tail of the log
+ * @tail_block: The first block of the transaction at the tail of the log
+ * @write_op: With which operation should we write the journal sb
+ *
+ * Update a journal's superblock information about log tail and write it to
+ * disk, waiting for the IO to complete.
+ */
+void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+				     unsigned long tail_block, int write_op)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+		  tail_block, tail_tid);
+
+	sb->s_sequence = cpu_to_be32(tail_tid);
+	sb->s_start    = cpu_to_be32(tail_block);
+
+	jbd2_write_superblock(journal, write_op);
+
+	/* Log is no longer empty */
+	write_lock(&journal->j_state_lock);
+	WARN_ON(!sb->s_sequence);
+	journal->j_flags &= ~JBD2_FLUSHED;
+	write_unlock(&journal->j_state_lock);
+}
+
+/**
+ * jbd2_mark_journal_empty() - Mark on disk journal as empty.
+ * @journal: The journal to update.
+ *
+ * Update a journal's dynamic superblock fields to show that journal is empty.
+ * Write updated superblock to disk waiting for IO to complete.
+ */
+static void jbd2_mark_journal_empty(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+	read_lock(&journal->j_state_lock);
+	/* Is it already empty? */
+	if (sb->s_start == 0) {
+		read_unlock(&journal->j_state_lock);
+		return;
+	}
+	jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
+		  journal->j_tail_sequence);
+
+	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
+	sb->s_start    = cpu_to_be32(0);
+	read_unlock(&journal->j_state_lock);
+
+	jbd2_write_superblock(journal, WRITE_FUA);
+
+	/* Log is no longer empty */
+	write_lock(&journal->j_state_lock);
+	journal->j_flags |= JBD2_FLUSHED;
+	write_unlock(&journal->j_state_lock);
+}
+
+
+/**
+ * jbd2_journal_update_sb_errno() - Update error in the journal.
+ * @journal: The journal to update.
+ *
+ * Update a journal's errno.  Write updated superblock to disk waiting for IO
+ * to complete.
+ */
+void jbd2_journal_update_sb_errno(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	read_lock(&journal->j_state_lock);
+	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
+		  journal->j_errno);
+	sb->s_errno    = cpu_to_be32(journal->j_errno);
+	read_unlock(&journal->j_state_lock);
+
+	jbd2_write_superblock(journal, WRITE_SYNC);
+}
+EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
+
+/*
+ * Read the superblock for a given journal, performing initial
+ * validation of the format.
+ */
+static int journal_get_superblock(journal_t *journal)
+{
+	struct buffer_head *bh;
+	journal_superblock_t *sb;
+	int err = -EIO;
+
+	bh = journal->j_sb_buffer;
+
+	J_ASSERT(bh != NULL);
+	if (!buffer_uptodate(bh)) {
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			printk(KERN_ERR
+				"JBD2: IO error reading journal superblock\n");
+			goto out;
+		}
+	}
+
+	if (buffer_verified(bh))
+		return 0;
+
+	sb = journal->j_superblock;
+
+	err = -EINVAL;
+
+	if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
+	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
+		printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
+		goto out;
+	}
+
+	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
+	case JBD2_SUPERBLOCK_V1:
+		journal->j_format_version = 1;
+		break;
+	case JBD2_SUPERBLOCK_V2:
+		journal->j_format_version = 2;
+		break;
+	default:
+		printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
+		goto out;
+	}
+
+	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
+		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
+	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
+		printk(KERN_WARNING "JBD2: journal file too short\n");
+		goto out;
+	}
+
+	if (be32_to_cpu(sb->s_first) == 0 ||
+	    be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
+		printk(KERN_WARNING
+			"JBD2: Invalid start block of journal: %u\n",
+			be32_to_cpu(sb->s_first));
+		goto out;
+	}
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
+	    JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+		/* Can't have checksum v2 and v3 at the same time! */
+		printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
+		       "at the same time!\n");
+		goto out;
+	}
+
+	if (jbd2_journal_has_csum_v2or3(journal) &&
+	    JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
+		/* Can't have checksum v1 and v2 on at the same time! */
+		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
+		       "at the same time!\n");
+		goto out;
+	}
+
+	if (!jbd2_verify_csum_type(journal, sb)) {
+		printk(KERN_ERR "JBD2: Unknown checksum type\n");
+		goto out;
+	}
+
+	/* Load the checksum driver */
+	if (jbd2_journal_has_csum_v2or3(journal)) {
+		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+		if (IS_ERR(journal->j_chksum_driver)) {
+			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+			err = PTR_ERR(journal->j_chksum_driver);
+			journal->j_chksum_driver = NULL;
+			goto out;
+		}
+	}
+
+	/* Check superblock checksum */
+	if (!jbd2_superblock_csum_verify(journal, sb)) {
+		printk(KERN_ERR "JBD2: journal checksum error\n");
+		goto out;
+	}
+
+	/* Precompute checksum seed for all metadata */
+	if (jbd2_journal_has_csum_v2or3(journal))
+		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+						   sizeof(sb->s_uuid));
+
+	set_buffer_verified(bh);
+
+	return 0;
+
+out:
+	journal_fail_superblock(journal);
+	return err;
+}
+
+/*
+ * Load the on-disk journal superblock and read the key fields into the
+ * journal_t.
+ */
+
+static int load_superblock(journal_t *journal)
+{
+	int err;
+	journal_superblock_t *sb;
+
+	err = journal_get_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+
+	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
+	journal->j_tail = be32_to_cpu(sb->s_start);
+	journal->j_first = be32_to_cpu(sb->s_first);
+	journal->j_last = be32_to_cpu(sb->s_maxlen);
+	journal->j_errno = be32_to_cpu(sb->s_errno);
+
+	return 0;
+}
+
+
+/**
+ * int jbd2_journal_load() - Read journal from disk.
+ * @journal: Journal to act on.
+ *
+ * Given a journal_t structure which tells us which disk blocks contain
+ * a journal, read the journal from disk to initialise the in-memory
+ * structures.
+ */
+int jbd2_journal_load(journal_t *journal)
+{
+	int err;
+	journal_superblock_t *sb;
+
+	err = load_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+	/* If this is a V2 superblock, then we have to check the
+	 * features flags on it. */
+
+	if (journal->j_format_version >= 2) {
+		if ((sb->s_feature_ro_compat &
+		     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
+		    (sb->s_feature_incompat &
+		     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
+			printk(KERN_WARNING
+				"JBD2: Unrecognised features on journal\n");
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Create a slab for this blocksize
+	 */
+	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+	if (err)
+		return err;
+
+	/* Let the recovery code check whether it needs to recover any
+	 * data from the journal. */
+	if (jbd2_journal_recover(journal))
+		goto recovery_error;
+
+	if (journal->j_failed_commit) {
+		printk(KERN_ERR "JBD2: journal transaction %u on %s "
+		       "is corrupt.\n", journal->j_failed_commit,
+		       journal->j_devname);
+		return -EIO;
+	}
+
+	/* OK, we've finished with the dynamic journal bits:
+	 * reinitialise the dynamic contents of the superblock in memory
+	 * and reset them on disk. */
+	if (journal_reset(journal))
+		goto recovery_error;
+
+	journal->j_flags &= ~JBD2_ABORT;
+	journal->j_flags |= JBD2_LOADED;
+	return 0;
+
+recovery_error:
+	printk(KERN_WARNING "JBD2: recovery failed\n");
+	return -EIO;
+}
+
+/**
+ * void jbd2_journal_destroy() - Release a journal_t structure.
+ * @journal: Journal to act on.
+ *
+ * Release a journal_t structure once it is no longer in use by the
+ * journaled object.
+ * Return <0 if we couldn't clean up the journal.
+ */
+int jbd2_journal_destroy(journal_t *journal)
+{
+	int err = 0;
+
+	/* Wait for the commit thread to wake up and die. */
+	journal_kill_thread(journal);
+
+	/* Force a final log commit */
+	if (journal->j_running_transaction)
+		jbd2_journal_commit_transaction(journal);
+
+	/* Force any old transactions to disk */
+
+	/* Totally anal locking here... */
+	spin_lock(&journal->j_list_lock);
+	while (journal->j_checkpoint_transactions != NULL) {
+		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
+		jbd2_log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+		spin_lock(&journal->j_list_lock);
+	}
+
+	J_ASSERT(journal->j_running_transaction == NULL);
+	J_ASSERT(journal->j_committing_transaction == NULL);
+	J_ASSERT(journal->j_checkpoint_transactions == NULL);
+	spin_unlock(&journal->j_list_lock);
+
+	if (journal->j_sb_buffer) {
+		if (!is_journal_aborted(journal)) {
+			mutex_lock(&journal->j_checkpoint_mutex);
+			jbd2_mark_journal_empty(journal);
+			mutex_unlock(&journal->j_checkpoint_mutex);
+		} else
+			err = -EIO;
+		brelse(journal->j_sb_buffer);
+	}
+
+	if (journal->j_proc_entry)
+		jbd2_stats_proc_exit(journal);
+	iput(journal->j_inode);
+	if (journal->j_revoke)
+		jbd2_journal_destroy_revoke(journal);
+	if (journal->j_chksum_driver)
+		crypto_free_shash(journal->j_chksum_driver);
+	kfree(journal->j_wbuf);
+	kfree(journal);
+
+	return err;
+}
+
+
+/**
+ *int jbd2_journal_check_used_features () - Check if features specified are used.
+ * @journal: Journal to check.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Check whether the journal uses all of a given set of
+ * features.  Return true (non-zero) if it does.
+ **/
+
+int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
+				 unsigned long ro, unsigned long incompat)
+{
+	journal_superblock_t *sb;
+
+	if (!compat && !ro && !incompat)
+		return 1;
+	/* Load journal superblock if it is not loaded yet. */
+	if (journal->j_format_version == 0 &&
+	    journal_get_superblock(journal) != 0)
+		return 0;
+	if (journal->j_format_version == 1)
+		return 0;
+
+	sb = journal->j_superblock;
+
+	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
+	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
+	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * int jbd2_journal_check_available_features() - Check feature set in journalling layer
+ * @journal: Journal to check.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Check whether the journaling code supports the use of
+ * all of a given set of features on this journal.  Return true
+ * (non-zero) if it can. */
+
+int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
+				      unsigned long ro, unsigned long incompat)
+{
+	if (!compat && !ro && !incompat)
+		return 1;
+
+	/* We can support any known requested features iff the
+	 * superblock is in version 2.  Otherwise we fail to support any
+	 * extended sb features. */
+
+	if (journal->j_format_version != 2)
+		return 0;
+
+	if ((compat   & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
+	    (ro       & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
+	    (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
+ * @journal: Journal to act on.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Mark a given journal feature as present on the
+ * superblock.  Returns true if the requested features could be set.
+ *
+ */
+
+int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
+			  unsigned long ro, unsigned long incompat)
+{
+#define INCOMPAT_FEATURE_ON(f) \
+		((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
+#define COMPAT_FEATURE_ON(f) \
+		((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
+	journal_superblock_t *sb;
+
+	if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
+		return 1;
+
+	if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
+		return 0;
+
+	/* If enabling v2 checksums, turn on v3 instead */
+	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
+		incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
+		incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
+	}
+
+	/* Asking for checksumming v3 and v1?  Only give them v3. */
+	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
+	    compat & JBD2_FEATURE_COMPAT_CHECKSUM)
+		compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
+
+	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+		  compat, ro, incompat);
+
+	sb = journal->j_superblock;
+
+	/* If enabling v3 checksums, update superblock */
+	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+		sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
+		sb->s_feature_compat &=
+			~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
+
+		/* Load the checksum driver */
+		if (journal->j_chksum_driver == NULL) {
+			journal->j_chksum_driver = crypto_alloc_shash("crc32c",
+								      0, 0);
+			if (IS_ERR(journal->j_chksum_driver)) {
+				printk(KERN_ERR "JBD2: Cannot load crc32c "
+				       "driver.\n");
+				journal->j_chksum_driver = NULL;
+				return 0;
+			}
+
+			/* Precompute checksum seed for all metadata */
+			journal->j_csum_seed = jbd2_chksum(journal, ~0,
+							   sb->s_uuid,
+							   sizeof(sb->s_uuid));
+		}
+	}
+
+	/* If enabling v1 checksums, downgrade superblock */
+	if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
+		sb->s_feature_incompat &=
+			~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
+				     JBD2_FEATURE_INCOMPAT_CSUM_V3);
+
+	sb->s_feature_compat    |= cpu_to_be32(compat);
+	sb->s_feature_ro_compat |= cpu_to_be32(ro);
+	sb->s_feature_incompat  |= cpu_to_be32(incompat);
+
+	return 1;
+#undef COMPAT_FEATURE_ON
+#undef INCOMPAT_FEATURE_ON
+}
+
+/*
+ * jbd2_journal_clear_features () - Clear a given journal feature in the
+ * 				    superblock
+ * @journal: Journal to act on.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Clear a given journal feature as present on the
+ * superblock.
+ */
+void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
+				unsigned long ro, unsigned long incompat)
+{
+	journal_superblock_t *sb;
+
+	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+		  compat, ro, incompat);
+
+	sb = journal->j_superblock;
+
+	sb->s_feature_compat    &= ~cpu_to_be32(compat);
+	sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
+	sb->s_feature_incompat  &= ~cpu_to_be32(incompat);
+}
+EXPORT_SYMBOL(jbd2_journal_clear_features);
+
+/**
+ * int jbd2_journal_flush () - Flush journal
+ * @journal: Journal to act on.
+ *
+ * Flush all data for a given journal to disk and empty the journal.
+ * Filesystems can use this when remounting readonly to ensure that
+ * recovery does not need to happen on remount.
+ */
+
+int jbd2_journal_flush(journal_t *journal)
+{
+	int err = 0;
+	transaction_t *transaction = NULL;
+
+	write_lock(&journal->j_state_lock);
+
+	/* Force everything buffered to the log... */
+	if (journal->j_running_transaction) {
+		transaction = journal->j_running_transaction;
+		__jbd2_log_start_commit(journal, transaction->t_tid);
+	} else if (journal->j_committing_transaction)
+		transaction = journal->j_committing_transaction;
+
+	/* Wait for the log commit to complete... */
+	if (transaction) {
+		tid_t tid = transaction->t_tid;
+
+		write_unlock(&journal->j_state_lock);
+		jbd2_log_wait_commit(journal, tid);
+	} else {
+		write_unlock(&journal->j_state_lock);
+	}
+
+	/* ...and flush everything in the log out to disk. */
+	spin_lock(&journal->j_list_lock);
+	while (!err && journal->j_checkpoint_transactions != NULL) {
+		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
+		err = jbd2_log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+		spin_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
+	mutex_lock(&journal->j_checkpoint_mutex);
+	jbd2_cleanup_journal_tail(journal);
+
+	/* Finally, mark the journal as really needing no recovery.
+	 * This sets s_start==0 in the underlying superblock, which is
+	 * the magic code for a fully-recovered superblock.  Any future
+	 * commits of data to the journal will restore the current
+	 * s_start value. */
+	jbd2_mark_journal_empty(journal);
+	mutex_unlock(&journal->j_checkpoint_mutex);
+	write_lock(&journal->j_state_lock);
+	J_ASSERT(!journal->j_running_transaction);
+	J_ASSERT(!journal->j_committing_transaction);
+	J_ASSERT(!journal->j_checkpoint_transactions);
+	J_ASSERT(journal->j_head == journal->j_tail);
+	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
+	write_unlock(&journal->j_state_lock);
+	return 0;
+}
+
+/**
+ * int jbd2_journal_wipe() - Wipe journal contents
+ * @journal: Journal to act on.
+ * @write: flag (see below)
+ *
+ * Wipe out all of the contents of a journal, safely.  This will produce
+ * a warning if the journal contains any valid recovery information.
+ * Must be called between journal_init_*() and jbd2_journal_load().
+ *
+ * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
+ * we merely suppress recovery.
+ */
+
+int jbd2_journal_wipe(journal_t *journal, int write)
+{
+	int err = 0;
+
+	J_ASSERT (!(journal->j_flags & JBD2_LOADED));
+
+	err = load_superblock(journal);
+	if (err)
+		return err;
+
+	if (!journal->j_tail)
+		goto no_recovery;
+
+	printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
+		write ? "Clearing" : "Ignoring");
+
+	err = jbd2_journal_skip_recovery(journal);
+	if (write) {
+		/* Lock to make assertions happy... */
+		mutex_lock(&journal->j_checkpoint_mutex);
+		jbd2_mark_journal_empty(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	}
+
+ no_recovery:
+	return err;
+}
+
+/*
+ * Journal abort has very specific semantics, which we describe
+ * for journal abort.
+ *
+ * Two internal functions, which provide abort to the jbd layer
+ * itself are here.
+ */
+
+/*
+ * Quick version for internal journal use (doesn't lock the journal).
+ * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
+ * and don't attempt to make any other journal updates.
+ */
+void __jbd2_journal_abort_hard(journal_t *journal)
+{
+	transaction_t *transaction;
+
+	if (journal->j_flags & JBD2_ABORT)
+		return;
+
+	printk(KERN_ERR "Aborting journal on device %s.\n",
+	       journal->j_devname);
+
+	write_lock(&journal->j_state_lock);
+	journal->j_flags |= JBD2_ABORT;
+	transaction = journal->j_running_transaction;
+	if (transaction)
+		__jbd2_log_start_commit(journal, transaction->t_tid);
+	write_unlock(&journal->j_state_lock);
+}
+
+/* Soft abort: record the abort error status in the journal superblock,
+ * but don't do any other IO. */
+static void __journal_abort_soft (journal_t *journal, int errno)
+{
+	if (journal->j_flags & JBD2_ABORT)
+		return;
+
+	if (!journal->j_errno)
+		journal->j_errno = errno;
+
+	__jbd2_journal_abort_hard(journal);
+
+	if (errno)
+		jbd2_journal_update_sb_errno(journal);
+}
+
+/**
+ * void jbd2_journal_abort () - Shutdown the journal immediately.
+ * @journal: the journal to shutdown.
+ * @errno:   an error number to record in the journal indicating
+ *           the reason for the shutdown.
+ *
+ * Perform a complete, immediate shutdown of the ENTIRE
+ * journal (not of a single transaction).  This operation cannot be
+ * undone without closing and reopening the journal.
+ *
+ * The jbd2_journal_abort function is intended to support higher level error
+ * recovery mechanisms such as the ext2/ext3 remount-readonly error
+ * mode.
+ *
+ * Journal abort has very specific semantics.  Any existing dirty,
+ * unjournaled buffers in the main filesystem will still be written to
+ * disk by bdflush, but the journaling mechanism will be suspended
+ * immediately and no further transaction commits will be honoured.
+ *
+ * Any dirty, journaled buffers will be written back to disk without
+ * hitting the journal.  Atomicity cannot be guaranteed on an aborted
+ * filesystem, but we _do_ attempt to leave as much data as possible
+ * behind for fsck to use for cleanup.
+ *
+ * Any attempt to get a new transaction handle on a journal which is in
+ * ABORT state will just result in an -EROFS error return.  A
+ * jbd2_journal_stop on an existing handle will return -EIO if we have
+ * entered abort state during the update.
+ *
+ * Recursive transactions are not disturbed by journal abort until the
+ * final jbd2_journal_stop, which will receive the -EIO error.
+ *
+ * Finally, the jbd2_journal_abort call allows the caller to supply an errno
+ * which will be recorded (if possible) in the journal superblock.  This
+ * allows a client to record failure conditions in the middle of a
+ * transaction without having to complete the transaction to record the
+ * failure to disk.  ext3_error, for example, now uses this
+ * functionality.
+ *
+ * Errors which originate from within the journaling layer will NOT
+ * supply an errno; a null errno implies that absolutely no further
+ * writes are done to the journal (unless there are any already in
+ * progress).
+ *
+ */
+
+void jbd2_journal_abort(journal_t *journal, int errno)
+{
+	__journal_abort_soft(journal, errno);
+}
+
+/**
+ * int jbd2_journal_errno () - returns the journal's error state.
+ * @journal: journal to examine.
+ *
+ * This is the errno number set with jbd2_journal_abort(), the last
+ * time the journal was mounted - if the journal was stopped
+ * without calling abort this will be 0.
+ *
+ * If the journal has been aborted on this mount time -EROFS will
+ * be returned.
+ */
+int jbd2_journal_errno(journal_t *journal)
+{
+	int err;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_flags & JBD2_ABORT)
+		err = -EROFS;
+	else
+		err = journal->j_errno;
+	read_unlock(&journal->j_state_lock);
+	return err;
+}
+
+/**
+ * int jbd2_journal_clear_err () - clears the journal's error state
+ * @journal: journal to act on.
+ *
+ * An error must be cleared or acked to take a FS out of readonly
+ * mode.
+ */
+int jbd2_journal_clear_err(journal_t *journal)
+{
+	int err = 0;
+
+	write_lock(&journal->j_state_lock);
+	if (journal->j_flags & JBD2_ABORT)
+		err = -EROFS;
+	else
+		journal->j_errno = 0;
+	write_unlock(&journal->j_state_lock);
+	return err;
+}
+
+/**
+ * void jbd2_journal_ack_err() - Ack journal err.
+ * @journal: journal to act on.
+ *
+ * An error must be cleared or acked to take a FS out of readonly
+ * mode.
+ */
+void jbd2_journal_ack_err(journal_t *journal)
+{
+	write_lock(&journal->j_state_lock);
+	if (journal->j_errno)
+		journal->j_flags |= JBD2_ACK_ERR;
+	write_unlock(&journal->j_state_lock);
+}
+
+int jbd2_journal_blocks_per_page(struct inode *inode)
+{
+	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+}
+
+/*
+ * helper functions to deal with 32 or 64bit block numbers.
+ */
+size_t journal_tag_bytes(journal_t *journal)
+{
+	size_t sz;
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+		return sizeof(journal_block_tag3_t);
+
+	sz = sizeof(journal_block_tag_t);
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+		sz += sizeof(__u16);
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+		return sz;
+	else
+		return sz - sizeof(__u32);
+}
+
+/*
+ * JBD memory management
+ *
+ * These functions are used to allocate block-sized chunks of memory
+ * used for making copies of buffer_head data.  Very often it will be
+ * page-sized chunks of data, but sometimes it will be in
+ * sub-page-size chunks.  (For example, 16k pages on Power systems
+ * with a 4k block file system.)  For blocks smaller than a page, we
+ * use a SLAB allocator.  There are slab caches for each block size,
+ * which are allocated at mount time, if necessary, and we only free
+ * (all of) the slab caches when/if the jbd2 module is unloaded.  For
+ * this reason we don't need to a mutex to protect access to
+ * jbd2_slab[] allocating or releasing memory; only in
+ * jbd2_journal_create_slab().
+ */
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < JBD2_MAX_SLABS; i++) {
+		if (jbd2_slab[i])
+			kmem_cache_destroy(jbd2_slab[i]);
+		jbd2_slab[i] = NULL;
+	}
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+	static DEFINE_MUTEX(jbd2_slab_create_mutex);
+	int i = order_base_2(size) - 10;
+	size_t slab_size;
+
+	if (size == PAGE_SIZE)
+		return 0;
+
+	if (i >= JBD2_MAX_SLABS)
+		return -EINVAL;
+
+	if (unlikely(i < 0))
+		i = 0;
+	mutex_lock(&jbd2_slab_create_mutex);
+	if (jbd2_slab[i]) {
+		mutex_unlock(&jbd2_slab_create_mutex);
+		return 0;	/* Already created */
+	}
+
+	slab_size = 1 << (i+10);
+	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+					 slab_size, 0, NULL);
+	mutex_unlock(&jbd2_slab_create_mutex);
+	if (!jbd2_slab[i]) {
+		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+
+	BUG_ON(i >= JBD2_MAX_SLABS);
+	if (unlikely(i < 0))
+		i = 0;
+	BUG_ON(jbd2_slab[i] == NULL);
+	return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+	void *ptr;
+
+	BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+	flags |= __GFP_REPEAT;
+	if (size == PAGE_SIZE)
+		ptr = (void *)__get_free_pages(flags, 0);
+	else if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			ptr = (void *)__get_free_pages(flags, order);
+		else
+			ptr = vmalloc(size);
+	} else
+		ptr = kmem_cache_alloc(get_slab(size), flags);
+
+	/* Check alignment; SLUB has gotten this wrong in the past,
+	 * and this can lead to user data corruption! */
+	BUG_ON(((unsigned long) ptr) & (size-1));
+
+	return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+	if (size == PAGE_SIZE) {
+		free_pages((unsigned long)ptr, 0);
+		return;
+	}
+	if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			free_pages((unsigned long)ptr, order);
+		else
+			vfree(ptr);
+		return;
+	}
+	kmem_cache_free(get_slab(size), ptr);
+};
+
+/*
+ * Journal_head storage management
+ */
+static struct kmem_cache *jbd2_journal_head_cache;
+#ifdef CONFIG_JBD2_DEBUG
+static atomic_t nr_journal_heads = ATOMIC_INIT(0);
+#endif
+
+static int jbd2_journal_init_journal_head_cache(void)
+{
+	int retval;
+
+	J_ASSERT(jbd2_journal_head_cache == NULL);
+	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
+				sizeof(struct journal_head),
+				0,		/* offset */
+				SLAB_TEMPORARY,	/* flags */
+				NULL);		/* ctor */
+	retval = 0;
+	if (!jbd2_journal_head_cache) {
+		retval = -ENOMEM;
+		printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
+	}
+	return retval;
+}
+
+static void jbd2_journal_destroy_journal_head_cache(void)
+{
+	if (jbd2_journal_head_cache) {
+		kmem_cache_destroy(jbd2_journal_head_cache);
+		jbd2_journal_head_cache = NULL;
+	}
+}
+
+/*
+ * journal_head splicing and dicing
+ */
+static struct journal_head *journal_alloc_journal_head(void)
+{
+	struct journal_head *ret;
+
+#ifdef CONFIG_JBD2_DEBUG
+	atomic_inc(&nr_journal_heads);
+#endif
+	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
+	if (!ret) {
+		jbd_debug(1, "out of memory for journal_head\n");
+		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
+		while (!ret) {
+			yield();
+			ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
+		}
+	}
+	return ret;
+}
+
+static void journal_free_journal_head(struct journal_head *jh)
+{
+#ifdef CONFIG_JBD2_DEBUG
+	atomic_dec(&nr_journal_heads);
+	memset(jh, JBD2_POISON_FREE, sizeof(*jh));
+#endif
+	kmem_cache_free(jbd2_journal_head_cache, jh);
+}
+
+/*
+ * A journal_head is attached to a buffer_head whenever JBD has an
+ * interest in the buffer.
+ *
+ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
+ * is set.  This bit is tested in core kernel code where we need to take
+ * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
+ * there.
+ *
+ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
+ *
+ * When a buffer has its BH_JBD bit set it is immune from being released by
+ * core kernel code, mainly via ->b_count.
+ *
+ * A journal_head is detached from its buffer_head when the journal_head's
+ * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
+ * transaction (b_cp_transaction) hold their references to b_jcount.
+ *
+ * Various places in the kernel want to attach a journal_head to a buffer_head
+ * _before_ attaching the journal_head to a transaction.  To protect the
+ * journal_head in this situation, jbd2_journal_add_journal_head elevates the
+ * journal_head's b_jcount refcount by one.  The caller must call
+ * jbd2_journal_put_journal_head() to undo this.
+ *
+ * So the typical usage would be:
+ *
+ *	(Attach a journal_head if needed.  Increments b_jcount)
+ *	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ *	...
+ *      (Get another reference for transaction)
+ *	jbd2_journal_grab_journal_head(bh);
+ *	jh->b_transaction = xxx;
+ *	(Put original reference)
+ *	jbd2_journal_put_journal_head(jh);
+ */
+
+/*
+ * Give a buffer_head a journal_head.
+ *
+ * May sleep.
+ */
+struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh;
+	struct journal_head *new_jh = NULL;
+
+repeat:
+	if (!buffer_jbd(bh))
+		new_jh = journal_alloc_journal_head();
+
+	jbd_lock_bh_journal_head(bh);
+	if (buffer_jbd(bh)) {
+		jh = bh2jh(bh);
+	} else {
+		J_ASSERT_BH(bh,
+			(atomic_read(&bh->b_count) > 0) ||
+			(bh->b_page && bh->b_page->mapping));
+
+		if (!new_jh) {
+			jbd_unlock_bh_journal_head(bh);
+			goto repeat;
+		}
+
+		jh = new_jh;
+		new_jh = NULL;		/* We consumed it */
+		set_buffer_jbd(bh);
+		bh->b_private = jh;
+		jh->b_bh = bh;
+		get_bh(bh);
+		BUFFER_TRACE(bh, "added journal_head");
+	}
+	jh->b_jcount++;
+	jbd_unlock_bh_journal_head(bh);
+	if (new_jh)
+		journal_free_journal_head(new_jh);
+	return bh->b_private;
+}
+
+/*
+ * Grab a ref against this buffer_head's journal_head.  If it ended up not
+ * having a journal_head, return NULL
+ */
+struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh = NULL;
+
+	jbd_lock_bh_journal_head(bh);
+	if (buffer_jbd(bh)) {
+		jh = bh2jh(bh);
+		jh->b_jcount++;
+	}
+	jbd_unlock_bh_journal_head(bh);
+	return jh;
+}
+
+static void __journal_remove_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh = bh2jh(bh);
+
+	J_ASSERT_JH(jh, jh->b_jcount >= 0);
+	J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
+	J_ASSERT_BH(bh, buffer_jbd(bh));
+	J_ASSERT_BH(bh, jh2bh(jh) == bh);
+	BUFFER_TRACE(bh, "remove journal_head");
+	if (jh->b_frozen_data) {
+		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
+		jbd2_free(jh->b_frozen_data, bh->b_size);
+	}
+	if (jh->b_committed_data) {
+		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
+		jbd2_free(jh->b_committed_data, bh->b_size);
+	}
+	bh->b_private = NULL;
+	jh->b_bh = NULL;	/* debug, really */
+	clear_buffer_jbd(bh);
+	journal_free_journal_head(jh);
+}
+
+/*
+ * Drop a reference on the passed journal_head.  If it fell to zero then
+ * release the journal_head from the buffer_head.
+ */
+void jbd2_journal_put_journal_head(struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	jbd_lock_bh_journal_head(bh);
+	J_ASSERT_JH(jh, jh->b_jcount > 0);
+	--jh->b_jcount;
+	if (!jh->b_jcount) {
+		__journal_remove_journal_head(bh);
+		jbd_unlock_bh_journal_head(bh);
+		__brelse(bh);
+	} else
+		jbd_unlock_bh_journal_head(bh);
+}
+
+/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+	jinode->i_transaction = NULL;
+	jinode->i_next_transaction = NULL;
+	jinode->i_vfs_inode = inode;
+	jinode->i_flags = 0;
+	INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+				    struct jbd2_inode *jinode)
+{
+	if (!journal)
+		return;
+restart:
+	spin_lock(&journal->j_list_lock);
+	/* Is commit writing out inode - we have to wait */
+	if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
+		wait_queue_head_t *wq;
+		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&journal->j_list_lock);
+		schedule();
+		finish_wait(wq, &wait.wait);
+		goto restart;
+	}
+
+	if (jinode->i_transaction) {
+		list_del(&jinode->i_list);
+		jinode->i_transaction = NULL;
+	}
+	spin_unlock(&journal->j_list_lock);
+}
+
+
+#ifdef CONFIG_PROC_FS
+
+#define JBD2_STATS_PROC_NAME "fs/jbd2"
+
+static void __init jbd2_create_jbd_stats_proc_entry(void)
+{
+	proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
+}
+
+static void __exit jbd2_remove_jbd_stats_proc_entry(void)
+{
+	if (proc_jbd2_stats)
+		remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
+}
+
+#else
+
+#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
+#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)
+
+#endif
+
+struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
+
+static int __init jbd2_journal_init_handle_cache(void)
+{
+	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
+	if (jbd2_handle_cache == NULL) {
+		printk(KERN_EMERG "JBD2: failed to create handle cache\n");
+		return -ENOMEM;
+	}
+	jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
+	if (jbd2_inode_cache == NULL) {
+		printk(KERN_EMERG "JBD2: failed to create inode cache\n");
+		kmem_cache_destroy(jbd2_handle_cache);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void jbd2_journal_destroy_handle_cache(void)
+{
+	if (jbd2_handle_cache)
+		kmem_cache_destroy(jbd2_handle_cache);
+	if (jbd2_inode_cache)
+		kmem_cache_destroy(jbd2_inode_cache);
+
+}
+
+/*
+ * Module startup and shutdown
+ */
+
+static int __init journal_init_caches(void)
+{
+	int ret;
+
+	ret = jbd2_journal_init_revoke_caches();
+	if (ret == 0)
+		ret = jbd2_journal_init_journal_head_cache();
+	if (ret == 0)
+		ret = jbd2_journal_init_handle_cache();
+	if (ret == 0)
+		ret = jbd2_journal_init_transaction_cache();
+	return ret;
+}
+
+static void jbd2_journal_destroy_caches(void)
+{
+	jbd2_journal_destroy_revoke_caches();
+	jbd2_journal_destroy_journal_head_cache();
+	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_transaction_cache();
+	jbd2_journal_destroy_slabs();
+}
+
+static int __init journal_init(void)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
+
+	ret = journal_init_caches();
+	if (ret == 0) {
+		jbd2_create_jbd_stats_proc_entry();
+	} else {
+		jbd2_journal_destroy_caches();
+	}
+	return ret;
+}
+
+static void __exit journal_exit(void)
+{
+#ifdef CONFIG_JBD2_DEBUG
+	int n = atomic_read(&nr_journal_heads);
+	if (n)
+		printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
+#endif
+	jbd2_remove_jbd_stats_proc_entry();
+	jbd2_journal_destroy_caches();
+}
+
+MODULE_LICENSE("GPL");
+module_init(journal_init);
+module_exit(journal_exit);
+
diff --git a/kernel/fs/jbd2/recovery.c b/kernel/fs/jbd2/recovery.c
new file mode 100644
index 000000000..a9079d035
--- /dev/null
+++ b/kernel/fs/jbd2/recovery.c
@@ -0,0 +1,880 @@
+/*
+ * linux/fs/jbd2/recovery.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal recovery routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ */
+
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/crc32.h>
+#include <linux/blkdev.h>
+#endif
+
+/*
+ * Maintain information about the progress of the recovery job, so that
+ * the different passes can carry information between them.
+ */
+struct recovery_info
+{
+	tid_t		start_transaction;
+	tid_t		end_transaction;
+
+	int		nr_replays;
+	int		nr_revokes;
+	int		nr_revoke_hits;
+};
+
+enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
+static int do_one_pass(journal_t *journal,
+				struct recovery_info *info, enum passtype pass);
+static int scan_revoke_records(journal_t *, struct buffer_head *,
+				tid_t, struct recovery_info *);
+
+#ifdef __KERNEL__
+
+/* Release readahead buffers after use */
+static void journal_brelse_array(struct buffer_head *b[], int n)
+{
+	while (--n >= 0)
+		brelse (b[n]);
+}
+
+
+/*
+ * When reading from the journal, we are going through the block device
+ * layer directly and so there is no readahead being done for us.  We
+ * need to implement any readahead ourselves if we want it to happen at
+ * all.  Recovery is basically one long sequential read, so make sure we
+ * do the IO in reasonably large chunks.
+ *
+ * This is not so critical that we need to be enormously clever about
+ * the readahead size, though.  128K is a purely arbitrary, good-enough
+ * fixed value.
+ */
+
+#define MAXBUF 8
+static int do_readahead(journal_t *journal, unsigned int start)
+{
+	int err;
+	unsigned int max, nbufs, next;
+	unsigned long long blocknr;
+	struct buffer_head *bh;
+
+	struct buffer_head * bufs[MAXBUF];
+
+	/* Do up to 128K of readahead */
+	max = start + (128 * 1024 / journal->j_blocksize);
+	if (max > journal->j_maxlen)
+		max = journal->j_maxlen;
+
+	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
+	 * a time to the block device IO layer. */
+
+	nbufs = 0;
+
+	for (next = start; next < max; next++) {
+		err = jbd2_journal_bmap(journal, next, &blocknr);
+
+		if (err) {
+			printk(KERN_ERR "JBD2: bad block at offset %u\n",
+				next);
+			goto failed;
+		}
+
+		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+		if (!bh) {
+			err = -ENOMEM;
+			goto failed;
+		}
+
+		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+			bufs[nbufs++] = bh;
+			if (nbufs == MAXBUF) {
+				ll_rw_block(READ, nbufs, bufs);
+				journal_brelse_array(bufs, nbufs);
+				nbufs = 0;
+			}
+		} else
+			brelse(bh);
+	}
+
+	if (nbufs)
+		ll_rw_block(READ, nbufs, bufs);
+	err = 0;
+
+failed:
+	if (nbufs)
+		journal_brelse_array(bufs, nbufs);
+	return err;
+}
+
+#endif /* __KERNEL__ */
+
+
+/*
+ * Read a block from the journal
+ */
+
+static int jread(struct buffer_head **bhp, journal_t *journal,
+		 unsigned int offset)
+{
+	int err;
+	unsigned long long blocknr;
+	struct buffer_head *bh;
+
+	*bhp = NULL;
+
+	if (offset >= journal->j_maxlen) {
+		printk(KERN_ERR "JBD2: corrupted journal superblock\n");
+		return -EIO;
+	}
+
+	err = jbd2_journal_bmap(journal, offset, &blocknr);
+
+	if (err) {
+		printk(KERN_ERR "JBD2: bad block at offset %u\n",
+			offset);
+		return err;
+	}
+
+	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return -ENOMEM;
+
+	if (!buffer_uptodate(bh)) {
+		/* If this is a brand new buffer, start readahead.
+                   Otherwise, we assume we are already reading it.  */
+		if (!buffer_req(bh))
+			do_readahead(journal, offset);
+		wait_on_buffer(bh);
+	}
+
+	if (!buffer_uptodate(bh)) {
+		printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
+			offset);
+		brelse(bh);
+		return -EIO;
+	}
+
+	*bhp = bh;
+	return 0;
+}
+
+static int jbd2_descr_block_csum_verify(journal_t *j,
+					void *buf)
+{
+	struct jbd2_journal_block_tail *tail;
+	__be32 provided;
+	__u32 calculated;
+
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return 1;
+
+	tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
+			sizeof(struct jbd2_journal_block_tail));
+	provided = tail->t_checksum;
+	tail->t_checksum = 0;
+	calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+	tail->t_checksum = provided;
+
+	return provided == cpu_to_be32(calculated);
+}
+
+/*
+ * Count the number of in-use tags in a journal descriptor block.
+ */
+
+static int count_tags(journal_t *journal, struct buffer_head *bh)
+{
+	char *			tagp;
+	journal_block_tag_t *	tag;
+	int			nr = 0, size = journal->j_blocksize;
+	int			tag_bytes = journal_tag_bytes(journal);
+
+	if (jbd2_journal_has_csum_v2or3(journal))
+		size -= sizeof(struct jbd2_journal_block_tail);
+
+	tagp = &bh->b_data[sizeof(journal_header_t)];
+
+	while ((tagp - bh->b_data + tag_bytes) <= size) {
+		tag = (journal_block_tag_t *) tagp;
+
+		nr++;
+		tagp += tag_bytes;
+		if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
+			tagp += 16;
+
+		if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
+			break;
+	}
+
+	return nr;
+}
+
+
+/* Make sure we wrap around the log correctly! */
+#define wrap(journal, var)						\
+do {									\
+	if (var >= (journal)->j_last)					\
+		var -= ((journal)->j_last - (journal)->j_first);	\
+} while (0)
+
+/**
+ * jbd2_journal_recover - recovers a on-disk journal
+ * @journal: the journal to recover
+ *
+ * The primary function for recovering the log contents when mounting a
+ * journaled device.
+ *
+ * Recovery is done in three passes.  In the first pass, we look for the
+ * end of the log.  In the second, we assemble the list of revoke
+ * blocks.  In the third and final pass, we replay any un-revoked blocks
+ * in the log.
+ */
+int jbd2_journal_recover(journal_t *journal)
+{
+	int			err, err2;
+	journal_superblock_t *	sb;
+
+	struct recovery_info	info;
+
+	memset(&info, 0, sizeof(info));
+	sb = journal->j_superblock;
+
+	/*
+	 * The journal superblock's s_start field (the current log head)
+	 * is always zero if, and only if, the journal was cleanly
+	 * unmounted.
+	 */
+
+	if (!sb->s_start) {
+		jbd_debug(1, "No recovery required, last transaction %d\n",
+			  be32_to_cpu(sb->s_sequence));
+		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
+		return 0;
+	}
+
+	err = do_one_pass(journal, &info, PASS_SCAN);
+	if (!err)
+		err = do_one_pass(journal, &info, PASS_REVOKE);
+	if (!err)
+		err = do_one_pass(journal, &info, PASS_REPLAY);
+
+	jbd_debug(1, "JBD2: recovery, exit status %d, "
+		  "recovered transactions %u to %u\n",
+		  err, info.start_transaction, info.end_transaction);
+	jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
+		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+
+	/* Restart the log at the next transaction ID, thus invalidating
+	 * any existing commit records in the log. */
+	journal->j_transaction_sequence = ++info.end_transaction;
+
+	jbd2_journal_clear_revoke(journal);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+	/* Make sure all replayed data is on permanent storage */
+	if (journal->j_flags & JBD2_BARRIER) {
+		err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+		if (!err)
+			err = err2;
+	}
+	return err;
+}
+
+/**
+ * jbd2_journal_skip_recovery - Start journal and wipe exiting records
+ * @journal: journal to startup
+ *
+ * Locate any valid recovery information from the journal and set up the
+ * journal structures in memory to ignore it (presumably because the
+ * caller has evidence that it is out of date).
+ * This function does'nt appear to be exorted..
+ *
+ * We perform one pass over the journal to allow us to tell the user how
+ * much recovery information is being erased, and to let us initialise
+ * the journal transaction sequence numbers to the next unused ID.
+ */
+int jbd2_journal_skip_recovery(journal_t *journal)
+{
+	int			err;
+
+	struct recovery_info	info;
+
+	memset (&info, 0, sizeof(info));
+
+	err = do_one_pass(journal, &info, PASS_SCAN);
+
+	if (err) {
+		printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
+		++journal->j_transaction_sequence;
+	} else {
+#ifdef CONFIG_JBD2_DEBUG
+		int dropped = info.end_transaction - 
+			be32_to_cpu(journal->j_superblock->s_sequence);
+		jbd_debug(1,
+			  "JBD2: ignoring %d transaction%s from the journal.\n",
+			  dropped, (dropped == 1) ? "" : "s");
+#endif
+		journal->j_transaction_sequence = ++info.end_transaction;
+	}
+
+	journal->j_tail = 0;
+	return err;
+}
+
+static inline unsigned long long read_tag_block(journal_t *journal,
+						journal_block_tag_t *tag)
+{
+	unsigned long long block = be32_to_cpu(tag->t_blocknr);
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+		block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
+	return block;
+}
+
+/*
+ * calc_chksums calculates the checksums for the blocks described in the
+ * descriptor block.
+ */
+static int calc_chksums(journal_t *journal, struct buffer_head *bh,
+			unsigned long *next_log_block, __u32 *crc32_sum)
+{
+	int i, num_blks, err;
+	unsigned long io_block;
+	struct buffer_head *obh;
+
+	num_blks = count_tags(journal, bh);
+	/* Calculate checksum of the descriptor block. */
+	*crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
+
+	for (i = 0; i < num_blks; i++) {
+		io_block = (*next_log_block)++;
+		wrap(journal, *next_log_block);
+		err = jread(&obh, journal, io_block);
+		if (err) {
+			printk(KERN_ERR "JBD2: IO error %d recovering block "
+				"%lu in log\n", err, io_block);
+			return 1;
+		} else {
+			*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
+				     obh->b_size);
+		}
+		put_bh(obh);
+	}
+	return 0;
+}
+
+static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
+{
+	struct commit_header *h;
+	__be32 provided;
+	__u32 calculated;
+
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return 1;
+
+	h = buf;
+	provided = h->h_chksum[0];
+	h->h_chksum[0] = 0;
+	calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+	h->h_chksum[0] = provided;
+
+	return provided == cpu_to_be32(calculated);
+}
+
+static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
+				      void *buf, __u32 sequence)
+{
+	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
+	__u32 csum32;
+	__be32 seq;
+
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return 1;
+
+	seq = cpu_to_be32(sequence);
+	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+	csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+		return tag3->t_checksum == cpu_to_be32(csum32);
+	else
+		return tag->t_checksum == cpu_to_be16(csum32);
+}
+
+static int do_one_pass(journal_t *journal,
+			struct recovery_info *info, enum passtype pass)
+{
+	unsigned int		first_commit_ID, next_commit_ID;
+	unsigned long		next_log_block;
+	int			err, success = 0;
+	journal_superblock_t *	sb;
+	journal_header_t *	tmp;
+	struct buffer_head *	bh;
+	unsigned int		sequence;
+	int			blocktype;
+	int			tag_bytes = journal_tag_bytes(journal);
+	__u32			crc32_sum = ~0; /* Transactional Checksums */
+	int			descr_csum_size = 0;
+	int			block_error = 0;
+
+	/*
+	 * First thing is to establish what we expect to find in the log
+	 * (in terms of transaction IDs), and where (in terms of log
+	 * block offsets): query the superblock.
+	 */
+
+	sb = journal->j_superblock;
+	next_commit_ID = be32_to_cpu(sb->s_sequence);
+	next_log_block = be32_to_cpu(sb->s_start);
+
+	first_commit_ID = next_commit_ID;
+	if (pass == PASS_SCAN)
+		info->start_transaction = first_commit_ID;
+
+	jbd_debug(1, "Starting recovery pass %d\n", pass);
+
+	/*
+	 * Now we walk through the log, transaction by transaction,
+	 * making sure that each transaction has a commit block in the
+	 * expected place.  Each complete transaction gets replayed back
+	 * into the main filesystem.
+	 */
+
+	while (1) {
+		int			flags;
+		char *			tagp;
+		journal_block_tag_t *	tag;
+		struct buffer_head *	obh;
+		struct buffer_head *	nbh;
+
+		cond_resched();
+
+		/* If we already know where to stop the log traversal,
+		 * check right now that we haven't gone past the end of
+		 * the log. */
+
+		if (pass != PASS_SCAN)
+			if (tid_geq(next_commit_ID, info->end_transaction))
+				break;
+
+		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+			  next_commit_ID, next_log_block, journal->j_last);
+
+		/* Skip over each chunk of the transaction looking
+		 * either the next descriptor block or the final commit
+		 * record. */
+
+		jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
+		err = jread(&bh, journal, next_log_block);
+		if (err)
+			goto failed;
+
+		next_log_block++;
+		wrap(journal, next_log_block);
+
+		/* What kind of buffer is it?
+		 *
+		 * If it is a descriptor block, check that it has the
+		 * expected sequence number.  Otherwise, we're all done
+		 * here. */
+
+		tmp = (journal_header_t *)bh->b_data;
+
+		if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+			brelse(bh);
+			break;
+		}
+
+		blocktype = be32_to_cpu(tmp->h_blocktype);
+		sequence = be32_to_cpu(tmp->h_sequence);
+		jbd_debug(3, "Found magic %d, sequence %d\n",
+			  blocktype, sequence);
+
+		if (sequence != next_commit_ID) {
+			brelse(bh);
+			break;
+		}
+
+		/* OK, we have a valid descriptor block which matches
+		 * all of the sequence number checks.  What are we going
+		 * to do with it?  That depends on the pass... */
+
+		switch(blocktype) {
+		case JBD2_DESCRIPTOR_BLOCK:
+			/* Verify checksum first */
+			if (jbd2_journal_has_csum_v2or3(journal))
+				descr_csum_size =
+					sizeof(struct jbd2_journal_block_tail);
+			if (descr_csum_size > 0 &&
+			    !jbd2_descr_block_csum_verify(journal,
+							  bh->b_data)) {
+				printk(KERN_ERR "JBD2: Invalid checksum "
+				       "recovering block %lu in log\n",
+				       next_log_block);
+				err = -EIO;
+				brelse(bh);
+				goto failed;
+			}
+
+			/* If it is a valid descriptor block, replay it
+			 * in pass REPLAY; if journal_checksums enabled, then
+			 * calculate checksums in PASS_SCAN, otherwise,
+			 * just skip over the blocks it describes. */
+			if (pass != PASS_REPLAY) {
+				if (pass == PASS_SCAN &&
+				    JBD2_HAS_COMPAT_FEATURE(journal,
+					    JBD2_FEATURE_COMPAT_CHECKSUM) &&
+				    !info->end_transaction) {
+					if (calc_chksums(journal, bh,
+							&next_log_block,
+							&crc32_sum)) {
+						put_bh(bh);
+						break;
+					}
+					put_bh(bh);
+					continue;
+				}
+				next_log_block += count_tags(journal, bh);
+				wrap(journal, next_log_block);
+				put_bh(bh);
+				continue;
+			}
+
+			/* A descriptor block: we can now write all of
+			 * the data blocks.  Yay, useful work is finally
+			 * getting done here! */
+
+			tagp = &bh->b_data[sizeof(journal_header_t)];
+			while ((tagp - bh->b_data + tag_bytes)
+			       <= journal->j_blocksize - descr_csum_size) {
+				unsigned long io_block;
+
+				tag = (journal_block_tag_t *) tagp;
+				flags = be16_to_cpu(tag->t_flags);
+
+				io_block = next_log_block++;
+				wrap(journal, next_log_block);
+				err = jread(&obh, journal, io_block);
+				if (err) {
+					/* Recover what we can, but
+					 * report failure at the end. */
+					success = err;
+					printk(KERN_ERR
+						"JBD2: IO error %d recovering "
+						"block %ld in log\n",
+						err, io_block);
+				} else {
+					unsigned long long blocknr;
+
+					J_ASSERT(obh != NULL);
+					blocknr = read_tag_block(journal,
+								 tag);
+
+					/* If the block has been
+					 * revoked, then we're all done
+					 * here. */
+					if (jbd2_journal_test_revoke
+					    (journal, blocknr,
+					     next_commit_ID)) {
+						brelse(obh);
+						++info->nr_revoke_hits;
+						goto skip_write;
+					}
+
+					/* Look for block corruption */
+					if (!jbd2_block_tag_csum_verify(
+						journal, tag, obh->b_data,
+						be32_to_cpu(tmp->h_sequence))) {
+						brelse(obh);
+						success = -EIO;
+						printk(KERN_ERR "JBD2: Invalid "
+						       "checksum recovering "
+						       "block %llu in log\n",
+						       blocknr);
+						block_error = 1;
+						goto skip_write;
+					}
+
+					/* Find a buffer for the new
+					 * data being restored */
+					nbh = __getblk(journal->j_fs_dev,
+							blocknr,
+							journal->j_blocksize);
+					if (nbh == NULL) {
+						printk(KERN_ERR
+						       "JBD2: Out of memory "
+						       "during recovery.\n");
+						err = -ENOMEM;
+						brelse(bh);
+						brelse(obh);
+						goto failed;
+					}
+
+					lock_buffer(nbh);
+					memcpy(nbh->b_data, obh->b_data,
+							journal->j_blocksize);
+					if (flags & JBD2_FLAG_ESCAPE) {
+						*((__be32 *)nbh->b_data) =
+						cpu_to_be32(JBD2_MAGIC_NUMBER);
+					}
+
+					BUFFER_TRACE(nbh, "marking dirty");
+					set_buffer_uptodate(nbh);
+					mark_buffer_dirty(nbh);
+					BUFFER_TRACE(nbh, "marking uptodate");
+					++info->nr_replays;
+					/* ll_rw_block(WRITE, 1, &nbh); */
+					unlock_buffer(nbh);
+					brelse(obh);
+					brelse(nbh);
+				}
+
+			skip_write:
+				tagp += tag_bytes;
+				if (!(flags & JBD2_FLAG_SAME_UUID))
+					tagp += 16;
+
+				if (flags & JBD2_FLAG_LAST_TAG)
+					break;
+			}
+
+			brelse(bh);
+			continue;
+
+		case JBD2_COMMIT_BLOCK:
+			/*     How to differentiate between interrupted commit
+			 *               and journal corruption ?
+			 *
+			 * {nth transaction}
+			 *        Checksum Verification Failed
+			 *			 |
+			 *		 ____________________
+			 *		|		     |
+			 * 	async_commit             sync_commit
+			 *     		|                    |
+			 *		| GO TO NEXT    "Journal Corruption"
+			 *		| TRANSACTION
+			 *		|
+			 * {(n+1)th transanction}
+			 *		|
+			 * 	 _______|______________
+			 * 	|	 	      |
+			 * Commit block found	Commit block not found
+			 *      |		      |
+			 * "Journal Corruption"       |
+			 *		 _____________|_________
+			 *     		|	           	|
+			 *	nth trans corrupt	OR   nth trans
+			 *	and (n+1)th interrupted     interrupted
+			 *	before commit block
+			 *      could reach the disk.
+			 *	(Cannot find the difference in above
+			 *	 mentioned conditions. Hence assume
+			 *	 "Interrupted Commit".)
+			 */
+
+			/* Found an expected commit block: if checksums
+			 * are present verify them in PASS_SCAN; else not
+			 * much to do other than move on to the next sequence
+			 * number. */
+			if (pass == PASS_SCAN &&
+			    JBD2_HAS_COMPAT_FEATURE(journal,
+				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+				int chksum_err, chksum_seen;
+				struct commit_header *cbh =
+					(struct commit_header *)bh->b_data;
+				unsigned found_chksum =
+					be32_to_cpu(cbh->h_chksum[0]);
+
+				chksum_err = chksum_seen = 0;
+
+				if (info->end_transaction) {
+					journal->j_failed_commit =
+						info->end_transaction;
+					brelse(bh);
+					break;
+				}
+
+				if (crc32_sum == found_chksum &&
+				    cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
+				    cbh->h_chksum_size ==
+						JBD2_CRC32_CHKSUM_SIZE)
+				       chksum_seen = 1;
+				else if (!(cbh->h_chksum_type == 0 &&
+					     cbh->h_chksum_size == 0 &&
+					     found_chksum == 0 &&
+					     !chksum_seen))
+				/*
+				 * If fs is mounted using an old kernel and then
+				 * kernel with journal_chksum is used then we
+				 * get a situation where the journal flag has
+				 * checksum flag set but checksums are not
+				 * present i.e chksum = 0, in the individual
+				 * commit blocks.
+				 * Hence to avoid checksum failures, in this
+				 * situation, this extra check is added.
+				 */
+						chksum_err = 1;
+
+				if (chksum_err) {
+					info->end_transaction = next_commit_ID;
+
+					if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+					   JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+						journal->j_failed_commit =
+							next_commit_ID;
+						brelse(bh);
+						break;
+					}
+				}
+				crc32_sum = ~0;
+			}
+			if (pass == PASS_SCAN &&
+			    !jbd2_commit_block_csum_verify(journal,
+							   bh->b_data)) {
+				info->end_transaction = next_commit_ID;
+
+				if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+				     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+					journal->j_failed_commit =
+						next_commit_ID;
+					brelse(bh);
+					break;
+				}
+			}
+			brelse(bh);
+			next_commit_ID++;
+			continue;
+
+		case JBD2_REVOKE_BLOCK:
+			/* If we aren't in the REVOKE pass, then we can
+			 * just skip over this block. */
+			if (pass != PASS_REVOKE) {
+				brelse(bh);
+				continue;
+			}
+
+			err = scan_revoke_records(journal, bh,
+						  next_commit_ID, info);
+			brelse(bh);
+			if (err)
+				goto failed;
+			continue;
+
+		default:
+			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+				  blocktype);
+			brelse(bh);
+			goto done;
+		}
+	}
+
+ done:
+	/*
+	 * We broke out of the log scan loop: either we came to the
+	 * known end of the log or we found an unexpected block in the
+	 * log.  If the latter happened, then we know that the "current"
+	 * transaction marks the end of the valid log.
+	 */
+
+	if (pass == PASS_SCAN) {
+		if (!info->end_transaction)
+			info->end_transaction = next_commit_ID;
+	} else {
+		/* It's really bad news if different passes end up at
+		 * different places (but possible due to IO errors). */
+		if (info->end_transaction != next_commit_ID) {
+			printk(KERN_ERR "JBD2: recovery pass %d ended at "
+				"transaction %u, expected %u\n",
+				pass, next_commit_ID, info->end_transaction);
+			if (!success)
+				success = -EIO;
+		}
+	}
+	if (block_error && success == 0)
+		success = -EIO;
+	return success;
+
+ failed:
+	return err;
+}
+
+static int jbd2_revoke_block_csum_verify(journal_t *j,
+					 void *buf)
+{
+	struct jbd2_journal_revoke_tail *tail;
+	__be32 provided;
+	__u32 calculated;
+
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return 1;
+
+	tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
+			sizeof(struct jbd2_journal_revoke_tail));
+	provided = tail->r_checksum;
+	tail->r_checksum = 0;
+	calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+	tail->r_checksum = provided;
+
+	return provided == cpu_to_be32(calculated);
+}
+
+/* Scan a revoke record, marking all blocks mentioned as revoked. */
+
+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
+			       tid_t sequence, struct recovery_info *info)
+{
+	jbd2_journal_revoke_header_t *header;
+	int offset, max;
+	int csum_size = 0;
+	__u32 rcount;
+	int record_len = 4;
+
+	header = (jbd2_journal_revoke_header_t *) bh->b_data;
+	offset = sizeof(jbd2_journal_revoke_header_t);
+	rcount = be32_to_cpu(header->r_count);
+
+	if (!jbd2_revoke_block_csum_verify(journal, header))
+		return -EINVAL;
+
+	if (jbd2_journal_has_csum_v2or3(journal))
+		csum_size = sizeof(struct jbd2_journal_revoke_tail);
+	if (rcount > journal->j_blocksize - csum_size)
+		return -EINVAL;
+	max = rcount;
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+		record_len = 8;
+
+	while (offset + record_len <= max) {
+		unsigned long long blocknr;
+		int err;
+
+		if (record_len == 4)
+			blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
+		else
+			blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
+		offset += record_len;
+		err = jbd2_journal_set_revoke(journal, blocknr, sequence);
+		if (err)
+			return err;
+		++info->nr_revokes;
+	}
+	return 0;
+}
diff --git a/kernel/fs/jbd2/revoke.c b/kernel/fs/jbd2/revoke.c
new file mode 100644
index 000000000..14214da80
--- /dev/null
+++ b/kernel/fs/jbd2/revoke.c
@@ -0,0 +1,764 @@
+/*
+ * linux/fs/jbd2/revoke.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
+ *
+ * Copyright 2000 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal revoke routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ *
+ * Revoke is the mechanism used to prevent old log records for deleted
+ * metadata from being replayed on top of newer data using the same
+ * blocks.  The revoke mechanism is used in two separate places:
+ *
+ * + Commit: during commit we write the entire list of the current
+ *   transaction's revoked blocks to the journal
+ *
+ * + Recovery: during recovery we record the transaction ID of all
+ *   revoked blocks.  If there are multiple revoke records in the log
+ *   for a single block, only the last one counts, and if there is a log
+ *   entry for a block beyond the last revoke, then that log entry still
+ *   gets replayed.
+ *
+ * We can get interactions between revokes and new log data within a
+ * single transaction:
+ *
+ * Block is revoked and then journaled:
+ *   The desired end result is the journaling of the new block, so we
+ *   cancel the revoke before the transaction commits.
+ *
+ * Block is journaled and then revoked:
+ *   The revoke must take precedence over the write of the block, so we
+ *   need either to cancel the journal entry or to write the revoke
+ *   later in the log than the log block.  In this case, we choose the
+ *   latter: journaling a block cancels any revoke record for that block
+ *   in the current transaction, so any revoke for that block in the
+ *   transaction must have happened after the block was journaled and so
+ *   the revoke must take precedence.
+ *
+ * Block is revoked and then written as data:
+ *   The data write is allowed to succeed, but the revoke is _not_
+ *   cancelled.  We still need to prevent old log records from
+ *   overwriting the new data.  We don't even need to clear the revoke
+ *   bit here.
+ *
+ * We cache revoke status of a buffer in the current transaction in b_states
+ * bits.  As the name says, revokevalid flag indicates that the cached revoke
+ * status of a buffer is valid and we can rely on the cached status.
+ *
+ * Revoke information on buffers is a tri-state value:
+ *
+ * RevokeValid clear:	no cached revoke status, need to look it up
+ * RevokeValid set, Revoked clear:
+ *			buffer has not been revoked, and cancel_revoke
+ *			need do nothing.
+ * RevokeValid set, Revoked set:
+ *			buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table.  Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment no one else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
+ */
+
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/log2.h>
+#include <linux/hash.h>
+#endif
+
+static struct kmem_cache *jbd2_revoke_record_cache;
+static struct kmem_cache *jbd2_revoke_table_cache;
+
+/* Each revoke record represents one single revoked block.  During
+   journal replay, this involves recording the transaction ID of the
+   last transaction to revoke this block. */
+
+struct jbd2_revoke_record_s
+{
+	struct list_head  hash;
+	tid_t		  sequence;	/* Used for recovery only */
+	unsigned long long	  blocknr;
+};
+
+
+/* The revoke table is just a simple hash table of revoke records. */
+struct jbd2_revoke_table_s
+{
+	/* It is conceivable that we might want a larger hash table
+	 * for recovery.  Must be a power of two. */
+	int		  hash_size;
+	int		  hash_shift;
+	struct list_head *hash_table;
+};
+
+
+#ifdef __KERNEL__
+static void write_one_revoke_record(journal_t *, transaction_t *,
+				    struct list_head *,
+				    struct buffer_head **, int *,
+				    struct jbd2_revoke_record_s *, int);
+static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
+#endif
+
+/* Utility functions to maintain the revoke table */
+
+static inline int hash(journal_t *journal, unsigned long long block)
+{
+	return hash_64(block, journal->j_revoke->hash_shift);
+}
+
+static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
+			      tid_t seq)
+{
+	struct list_head *hash_list;
+	struct jbd2_revoke_record_s *record;
+
+repeat:
+	record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
+	if (!record)
+		goto oom;
+
+	record->sequence = seq;
+	record->blocknr = blocknr;
+	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+	spin_lock(&journal->j_revoke_lock);
+	list_add(&record->hash, hash_list);
+	spin_unlock(&journal->j_revoke_lock);
+	return 0;
+
+oom:
+	if (!journal_oom_retry)
+		return -ENOMEM;
+	jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
+	yield();
+	goto repeat;
+}
+
+/* Find a revoke record in the journal's hash table. */
+
+static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
+						      unsigned long long blocknr)
+{
+	struct list_head *hash_list;
+	struct jbd2_revoke_record_s *record;
+
+	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+
+	spin_lock(&journal->j_revoke_lock);
+	record = (struct jbd2_revoke_record_s *) hash_list->next;
+	while (&(record->hash) != hash_list) {
+		if (record->blocknr == blocknr) {
+			spin_unlock(&journal->j_revoke_lock);
+			return record;
+		}
+		record = (struct jbd2_revoke_record_s *) record->hash.next;
+	}
+	spin_unlock(&journal->j_revoke_lock);
+	return NULL;
+}
+
+void jbd2_journal_destroy_revoke_caches(void)
+{
+	if (jbd2_revoke_record_cache) {
+		kmem_cache_destroy(jbd2_revoke_record_cache);
+		jbd2_revoke_record_cache = NULL;
+	}
+	if (jbd2_revoke_table_cache) {
+		kmem_cache_destroy(jbd2_revoke_table_cache);
+		jbd2_revoke_table_cache = NULL;
+	}
+}
+
+int __init jbd2_journal_init_revoke_caches(void)
+{
+	J_ASSERT(!jbd2_revoke_record_cache);
+	J_ASSERT(!jbd2_revoke_table_cache);
+
+	jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
+					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
+	if (!jbd2_revoke_record_cache)
+		goto record_cache_failure;
+
+	jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
+					     SLAB_TEMPORARY);
+	if (!jbd2_revoke_table_cache)
+		goto table_cache_failure;
+	return 0;
+table_cache_failure:
+	jbd2_journal_destroy_revoke_caches();
+record_cache_failure:
+		return -ENOMEM;
+}
+
+static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
+{
+	int shift = 0;
+	int tmp = hash_size;
+	struct jbd2_revoke_table_s *table;
+
+	table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	while((tmp >>= 1UL) != 0UL)
+		shift++;
+
+	table->hash_size = hash_size;
+	table->hash_shift = shift;
+	table->hash_table =
+		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+	if (!table->hash_table) {
+		kmem_cache_free(jbd2_revoke_table_cache, table);
+		table = NULL;
+		goto out;
+	}
+
+	for (tmp = 0; tmp < hash_size; tmp++)
+		INIT_LIST_HEAD(&table->hash_table[tmp]);
+
+out:
+	return table;
+}
+
+static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
+{
+	int i;
+	struct list_head *hash_list;
+
+	for (i = 0; i < table->hash_size; i++) {
+		hash_list = &table->hash_table[i];
+		J_ASSERT(list_empty(hash_list));
+	}
+
+	kfree(table->hash_table);
+	kmem_cache_free(jbd2_revoke_table_cache, table);
+}
+
+/* Initialise the revoke table for a given journal to a given size. */
+int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
+{
+	J_ASSERT(journal->j_revoke_table[0] == NULL);
+	J_ASSERT(is_power_of_2(hash_size));
+
+	journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[0])
+		goto fail0;
+
+	journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[1])
+		goto fail1;
+
+	journal->j_revoke = journal->j_revoke_table[1];
+
+	spin_lock_init(&journal->j_revoke_lock);
+
+	return 0;
+
+fail1:
+	jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
+fail0:
+	return -ENOMEM;
+}
+
+/* Destroy a journal's revoke table.  The table must already be empty! */
+void jbd2_journal_destroy_revoke(journal_t *journal)
+{
+	journal->j_revoke = NULL;
+	if (journal->j_revoke_table[0])
+		jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
+	if (journal->j_revoke_table[1])
+		jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]);
+}
+
+
+#ifdef __KERNEL__
+
+/*
+ * jbd2_journal_revoke: revoke a given buffer_head from the journal.  This
+ * prevents the block from being replayed during recovery if we take a
+ * crash after this current transaction commits.  Any subsequent
+ * metadata writes of the buffer in this transaction cancel the
+ * revoke.
+ *
+ * Note that this call may block --- it is up to the caller to make
+ * sure that there are no further calls to journal_write_metadata
+ * before the revoke is complete.  In ext3, this implies calling the
+ * revoke before clearing the block bitmap when we are deleting
+ * metadata.
+ *
+ * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
+ * parameter, but does _not_ forget the buffer_head if the bh was only
+ * found implicitly.
+ *
+ * bh_in may not be a journalled buffer - it may have come off
+ * the hash tables without an attached journal_head.
+ *
+ * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count
+ * by one.
+ */
+
+int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
+		   struct buffer_head *bh_in)
+{
+	struct buffer_head *bh = NULL;
+	journal_t *journal;
+	struct block_device *bdev;
+	int err;
+
+	might_sleep();
+	if (bh_in)
+		BUFFER_TRACE(bh_in, "enter");
+
+	journal = handle->h_transaction->t_journal;
+	if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
+		J_ASSERT (!"Cannot set revoke feature!");
+		return -EINVAL;
+	}
+
+	bdev = journal->j_fs_dev;
+	bh = bh_in;
+
+	if (!bh) {
+		bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+		if (bh)
+			BUFFER_TRACE(bh, "found on hash");
+	}
+#ifdef JBD2_EXPENSIVE_CHECKING
+	else {
+		struct buffer_head *bh2;
+
+		/* If there is a different buffer_head lying around in
+		 * memory anywhere... */
+		bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+		if (bh2) {
+			/* ... and it has RevokeValid status... */
+			if (bh2 != bh && buffer_revokevalid(bh2))
+				/* ...then it better be revoked too,
+				 * since it's illegal to create a revoke
+				 * record against a buffer_head which is
+				 * not marked revoked --- that would
+				 * risk missing a subsequent revoke
+				 * cancel. */
+				J_ASSERT_BH(bh2, buffer_revoked(bh2));
+			put_bh(bh2);
+		}
+	}
+#endif
+
+	/* We really ought not ever to revoke twice in a row without
+           first having the revoke cancelled: it's illegal to free a
+           block twice without allocating it in between! */
+	if (bh) {
+		if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
+				 "inconsistent data on disk")) {
+			if (!bh_in)
+				brelse(bh);
+			return -EIO;
+		}
+		set_buffer_revoked(bh);
+		set_buffer_revokevalid(bh);
+		if (bh_in) {
+			BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
+			jbd2_journal_forget(handle, bh_in);
+		} else {
+			BUFFER_TRACE(bh, "call brelse");
+			__brelse(bh);
+		}
+	}
+
+	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
+	err = insert_revoke_hash(journal, blocknr,
+				handle->h_transaction->t_tid);
+	BUFFER_TRACE(bh_in, "exit");
+	return err;
+}
+
+/*
+ * Cancel an outstanding revoke.  For use only internally by the
+ * journaling code (called from jbd2_journal_get_write_access).
+ *
+ * We trust buffer_revoked() on the buffer if the buffer is already
+ * being journaled: if there is no revoke pending on the buffer, then we
+ * don't do anything here.
+ *
+ * This would break if it were possible for a buffer to be revoked and
+ * discarded, and then reallocated within the same transaction.  In such
+ * a case we would have lost the revoked bit, but when we arrived here
+ * the second time we would still have a pending revoke to cancel.  So,
+ * do not trust the Revoked bit on buffers unless RevokeValid is also
+ * set.
+ */
+int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+{
+	struct jbd2_revoke_record_s *record;
+	journal_t *journal = handle->h_transaction->t_journal;
+	int need_cancel;
+	int did_revoke = 0;	/* akpm: debug */
+	struct buffer_head *bh = jh2bh(jh);
+
+	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+
+	/* Is the existing Revoke bit valid?  If so, we trust it, and
+	 * only perform the full cancel if the revoke bit is set.  If
+	 * not, we can't trust the revoke bit, and we need to do the
+	 * full search for a revoke record. */
+	if (test_set_buffer_revokevalid(bh)) {
+		need_cancel = test_clear_buffer_revoked(bh);
+	} else {
+		need_cancel = 1;
+		clear_buffer_revoked(bh);
+	}
+
+	if (need_cancel) {
+		record = find_revoke_record(journal, bh->b_blocknr);
+		if (record) {
+			jbd_debug(4, "cancelled existing revoke on "
+				  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
+			spin_lock(&journal->j_revoke_lock);
+			list_del(&record->hash);
+			spin_unlock(&journal->j_revoke_lock);
+			kmem_cache_free(jbd2_revoke_record_cache, record);
+			did_revoke = 1;
+		}
+	}
+
+#ifdef JBD2_EXPENSIVE_CHECKING
+	/* There better not be one left behind by now! */
+	record = find_revoke_record(journal, bh->b_blocknr);
+	J_ASSERT_JH(jh, record == NULL);
+#endif
+
+	/* Finally, have we just cleared revoke on an unhashed
+	 * buffer_head?  If so, we'd better make sure we clear the
+	 * revoked status on any hashed alias too, otherwise the revoke
+	 * state machine will get very upset later on. */
+	if (need_cancel) {
+		struct buffer_head *bh2;
+		bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+		if (bh2) {
+			if (bh2 != bh)
+				clear_buffer_revoked(bh2);
+			__brelse(bh2);
+		}
+	}
+	return did_revoke;
+}
+
+/*
+ * journal_clear_revoked_flag clears revoked flag of buffers in
+ * revoke table to reflect there is no revoked buffers in the next
+ * transaction which is going to be started.
+ */
+void jbd2_clear_buffer_revoked_flags(journal_t *journal)
+{
+	struct jbd2_revoke_table_s *revoke = journal->j_revoke;
+	int i = 0;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		struct list_head *hash_list;
+		struct list_head *list_entry;
+		hash_list = &revoke->hash_table[i];
+
+		list_for_each(list_entry, hash_list) {
+			struct jbd2_revoke_record_s *record;
+			struct buffer_head *bh;
+			record = (struct jbd2_revoke_record_s *)list_entry;
+			bh = __find_get_block(journal->j_fs_dev,
+					      record->blocknr,
+					      journal->j_blocksize);
+			if (bh) {
+				clear_buffer_revoked(bh);
+				__brelse(bh);
+			}
+		}
+	}
+}
+
+/* journal_switch_revoke table select j_revoke for next transaction
+ * we do not want to suspend any processing until all revokes are
+ * written -bzzz
+ */
+void jbd2_journal_switch_revoke_table(journal_t *journal)
+{
+	int i;
+
+	if (journal->j_revoke == journal->j_revoke_table[0])
+		journal->j_revoke = journal->j_revoke_table[1];
+	else
+		journal->j_revoke = journal->j_revoke_table[0];
+
+	for (i = 0; i < journal->j_revoke->hash_size; i++)
+		INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
+}
+
+/*
+ * Write revoke records to the journal for all entries in the current
+ * revoke hash, deleting the entries as we go.
+ */
+void jbd2_journal_write_revoke_records(journal_t *journal,
+				       transaction_t *transaction,
+				       struct list_head *log_bufs,
+				       int write_op)
+{
+	struct buffer_head *descriptor;
+	struct jbd2_revoke_record_s *record;
+	struct jbd2_revoke_table_s *revoke;
+	struct list_head *hash_list;
+	int i, offset, count;
+
+	descriptor = NULL;
+	offset = 0;
+	count = 0;
+
+	/* select revoke table for committing transaction */
+	revoke = journal->j_revoke == journal->j_revoke_table[0] ?
+		journal->j_revoke_table[1] : journal->j_revoke_table[0];
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		hash_list = &revoke->hash_table[i];
+
+		while (!list_empty(hash_list)) {
+			record = (struct jbd2_revoke_record_s *)
+				hash_list->next;
+			write_one_revoke_record(journal, transaction, log_bufs,
+						&descriptor, &offset,
+						record, write_op);
+			count++;
+			list_del(&record->hash);
+			kmem_cache_free(jbd2_revoke_record_cache, record);
+		}
+	}
+	if (descriptor)
+		flush_descriptor(journal, descriptor, offset, write_op);
+	jbd_debug(1, "Wrote %d revoke records\n", count);
+}
+
+/*
+ * Write out one revoke record.  We need to create a new descriptor
+ * block if the old one is full or if we have not already created one.
+ */
+
+static void write_one_revoke_record(journal_t *journal,
+				    transaction_t *transaction,
+				    struct list_head *log_bufs,
+				    struct buffer_head **descriptorp,
+				    int *offsetp,
+				    struct jbd2_revoke_record_s *record,
+				    int write_op)
+{
+	int csum_size = 0;
+	struct buffer_head *descriptor;
+	int sz, offset;
+	journal_header_t *header;
+
+	/* If we are already aborting, this all becomes a noop.  We
+           still need to go round the loop in
+           jbd2_journal_write_revoke_records in order to free all of the
+           revoke records: only the IO to the journal is omitted. */
+	if (is_journal_aborted(journal))
+		return;
+
+	descriptor = *descriptorp;
+	offset = *offsetp;
+
+	/* Do we need to leave space at the end for a checksum? */
+	if (jbd2_journal_has_csum_v2or3(journal))
+		csum_size = sizeof(struct jbd2_journal_revoke_tail);
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+		sz = 8;
+	else
+		sz = 4;
+
+	/* Make sure we have a descriptor with space left for the record */
+	if (descriptor) {
+		if (offset + sz > journal->j_blocksize - csum_size) {
+			flush_descriptor(journal, descriptor, offset, write_op);
+			descriptor = NULL;
+		}
+	}
+
+	if (!descriptor) {
+		descriptor = jbd2_journal_get_descriptor_buffer(journal);
+		if (!descriptor)
+			return;
+		header = (journal_header_t *)descriptor->b_data;
+		header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
+		header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
+		header->h_sequence  = cpu_to_be32(transaction->t_tid);
+
+		/* Record it so that we can wait for IO completion later */
+		BUFFER_TRACE(descriptor, "file in log_bufs");
+		jbd2_file_log_bh(log_bufs, descriptor);
+
+		offset = sizeof(jbd2_journal_revoke_header_t);
+		*descriptorp = descriptor;
+	}
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+		* ((__be64 *)(&descriptor->b_data[offset])) =
+			cpu_to_be64(record->blocknr);
+	else
+		* ((__be32 *)(&descriptor->b_data[offset])) =
+			cpu_to_be32(record->blocknr);
+	offset += sz;
+
+	*offsetp = offset;
+}
+
+static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
+{
+	struct jbd2_journal_revoke_tail *tail;
+	__u32 csum;
+
+	if (!jbd2_journal_has_csum_v2or3(j))
+		return;
+
+	tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
+			sizeof(struct jbd2_journal_revoke_tail));
+	tail->r_checksum = 0;
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+	tail->r_checksum = cpu_to_be32(csum);
+}
+
+/*
+ * Flush a revoke descriptor out to the journal.  If we are aborting,
+ * this is a noop; otherwise we are generating a buffer which needs to
+ * be waited for during commit, so it has to go onto the appropriate
+ * journal buffer list.
+ */
+
+static void flush_descriptor(journal_t *journal,
+			     struct buffer_head *descriptor,
+			     int offset, int write_op)
+{
+	jbd2_journal_revoke_header_t *header;
+
+	if (is_journal_aborted(journal)) {
+		put_bh(descriptor);
+		return;
+	}
+
+	header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
+	header->r_count = cpu_to_be32(offset);
+	jbd2_revoke_csum_set(journal, descriptor);
+
+	set_buffer_jwrite(descriptor);
+	BUFFER_TRACE(descriptor, "write");
+	set_buffer_dirty(descriptor);
+	write_dirty_buffer(descriptor, write_op);
+}
+#endif
+
+/*
+ * Revoke support for recovery.
+ *
+ * Recovery needs to be able to:
+ *
+ *  record all revoke records, including the tid of the latest instance
+ *  of each revoke in the journal
+ *
+ *  check whether a given block in a given transaction should be replayed
+ *  (ie. has not been revoked by a revoke record in that or a subsequent
+ *  transaction)
+ *
+ *  empty the revoke table after recovery.
+ */
+
+/*
+ * First, setting revoke records.  We create a new revoke record for
+ * every block ever revoked in the log as we scan it for recovery, and
+ * we update the existing records if we find multiple revokes for a
+ * single block.
+ */
+
+int jbd2_journal_set_revoke(journal_t *journal,
+		       unsigned long long blocknr,
+		       tid_t sequence)
+{
+	struct jbd2_revoke_record_s *record;
+
+	record = find_revoke_record(journal, blocknr);
+	if (record) {
+		/* If we have multiple occurrences, only record the
+		 * latest sequence number in the hashed record */
+		if (tid_gt(sequence, record->sequence))
+			record->sequence = sequence;
+		return 0;
+	}
+	return insert_revoke_hash(journal, blocknr, sequence);
+}
+
+/*
+ * Test revoke records.  For a given block referenced in the log, has
+ * that block been revoked?  A revoke record with a given transaction
+ * sequence number revokes all blocks in that transaction and earlier
+ * ones, but later transactions still need replayed.
+ */
+
+int jbd2_journal_test_revoke(journal_t *journal,
+			unsigned long long blocknr,
+			tid_t sequence)
+{
+	struct jbd2_revoke_record_s *record;
+
+	record = find_revoke_record(journal, blocknr);
+	if (!record)
+		return 0;
+	if (tid_gt(sequence, record->sequence))
+		return 0;
+	return 1;
+}
+
+/*
+ * Finally, once recovery is over, we need to clear the revoke table so
+ * that it can be reused by the running filesystem.
+ */
+
+void jbd2_journal_clear_revoke(journal_t *journal)
+{
+	int i;
+	struct list_head *hash_list;
+	struct jbd2_revoke_record_s *record;
+	struct jbd2_revoke_table_s *revoke;
+
+	revoke = journal->j_revoke;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		hash_list = &revoke->hash_table[i];
+		while (!list_empty(hash_list)) {
+			record = (struct jbd2_revoke_record_s*) hash_list->next;
+			list_del(&record->hash);
+			kmem_cache_free(jbd2_revoke_record_cache, record);
+		}
+	}
+}
diff --git a/kernel/fs/jbd2/transaction.c b/kernel/fs/jbd2/transaction.c
new file mode 100644
index 000000000..ff2f2e6ad
--- /dev/null
+++ b/kernel/fs/jbd2/transaction.c
@@ -0,0 +1,2487 @@
+/*
+ * linux/fs/jbd2/transaction.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem transaction handling code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages transactions (compound commits managed by the
+ * journaling code) and handles (individual atomic operations by the
+ * filesystem).
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hrtimer.h>
+#include <linux/backing-dev.h>
+#include <linux/bug.h>
+#include <linux/module.h>
+
+#include <trace/events/jbd2.h>
+
+static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
+static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
+
+static struct kmem_cache *transaction_cache;
+int __init jbd2_journal_init_transaction_cache(void)
+{
+	J_ASSERT(!transaction_cache);
+	transaction_cache = kmem_cache_create("jbd2_transaction_s",
+					sizeof(transaction_t),
+					0,
+					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
+					NULL);
+	if (transaction_cache)
+		return 0;
+	return -ENOMEM;
+}
+
+void jbd2_journal_destroy_transaction_cache(void)
+{
+	if (transaction_cache) {
+		kmem_cache_destroy(transaction_cache);
+		transaction_cache = NULL;
+	}
+}
+
+void jbd2_journal_free_transaction(transaction_t *transaction)
+{
+	if (unlikely(ZERO_OR_NULL_PTR(transaction)))
+		return;
+	kmem_cache_free(transaction_cache, transaction);
+}
+
+/*
+ * jbd2_get_transaction: obtain a new transaction_t object.
+ *
+ * Simply allocate and initialise a new transaction.  Create it in
+ * RUNNING state and add it to the current journal (which should not
+ * have an existing running transaction: we only make a new transaction
+ * once we have started to commit the old one).
+ *
+ * Preconditions:
+ *	The journal MUST be locked.  We don't perform atomic mallocs on the
+ *	new transaction	and we can't block without protecting against other
+ *	processes trying to touch the journal while it is in transition.
+ *
+ */
+
+static transaction_t *
+jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+{
+	transaction->t_journal = journal;
+	transaction->t_state = T_RUNNING;
+	transaction->t_start_time = ktime_get();
+	transaction->t_tid = journal->j_transaction_sequence++;
+	transaction->t_expires = jiffies + journal->j_commit_interval;
+	spin_lock_init(&transaction->t_handle_lock);
+	atomic_set(&transaction->t_updates, 0);
+	atomic_set(&transaction->t_outstanding_credits,
+		   atomic_read(&journal->j_reserved_credits));
+	atomic_set(&transaction->t_handle_count, 0);
+	INIT_LIST_HEAD(&transaction->t_inode_list);
+	INIT_LIST_HEAD(&transaction->t_private_list);
+
+	/* Set up the commit timer for the new transaction. */
+	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
+	add_timer(&journal->j_commit_timer);
+
+	J_ASSERT(journal->j_running_transaction == NULL);
+	journal->j_running_transaction = transaction;
+	transaction->t_max_wait = 0;
+	transaction->t_start = jiffies;
+	transaction->t_requested = 0;
+
+	return transaction;
+}
+
+/*
+ * Handle management.
+ *
+ * A handle_t is an object which represents a single atomic update to a
+ * filesystem, and which tracks all of the modifications which form part
+ * of that one update.
+ */
+
+/*
+ * Update transaction's maximum wait time, if debugging is enabled.
+ *
+ * In order for t_max_wait to be reliable, it must be protected by a
+ * lock.  But doing so will mean that start_this_handle() can not be
+ * run in parallel on SMP systems, which limits our scalability.  So
+ * unless debugging is enabled, we no longer update t_max_wait, which
+ * means that maximum wait time reported by the jbd2_run_stats
+ * tracepoint will always be zero.
+ */
+static inline void update_t_max_wait(transaction_t *transaction,
+				     unsigned long ts)
+{
+#ifdef CONFIG_JBD2_DEBUG
+	if (jbd2_journal_enable_debug &&
+	    time_after(transaction->t_start, ts)) {
+		ts = jbd2_time_diff(ts, transaction->t_start);
+		spin_lock(&transaction->t_handle_lock);
+		if (ts > transaction->t_max_wait)
+			transaction->t_max_wait = ts;
+		spin_unlock(&transaction->t_handle_lock);
+	}
+#endif
+}
+
+/*
+ * Wait until running transaction passes T_LOCKED state. Also starts the commit
+ * if needed. The function expects running transaction to exist and releases
+ * j_state_lock.
+ */
+static void wait_transaction_locked(journal_t *journal)
+	__releases(journal->j_state_lock)
+{
+	DEFINE_WAIT(wait);
+	int need_to_start;
+	tid_t tid = journal->j_running_transaction->t_tid;
+
+	prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+			TASK_UNINTERRUPTIBLE);
+	need_to_start = !tid_geq(journal->j_commit_request, tid);
+	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
+	schedule();
+	finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+	atomic_sub(blocks, &journal->j_reserved_credits);
+	wake_up(&journal->j_wait_reserved);
+}
+
+/*
+ * Wait until we can add credits for handle to the running transaction.  Called
+ * with j_state_lock held for reading. Returns 0 if handle joined the running
+ * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
+ * caller must retry.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+				   int rsv_blocks)
+{
+	transaction_t *t = journal->j_running_transaction;
+	int needed;
+	int total = blocks + rsv_blocks;
+
+	/*
+	 * If the current transaction is locked down for commit, wait
+	 * for the lock to be released.
+	 */
+	if (t->t_state == T_LOCKED) {
+		wait_transaction_locked(journal);
+		return 1;
+	}
+
+	/*
+	 * If there is not enough space left in the log to write all
+	 * potential buffers requested by this operation, we need to
+	 * stall pending a log checkpoint to free some more log space.
+	 */
+	needed = atomic_add_return(total, &t->t_outstanding_credits);
+	if (needed > journal->j_max_transaction_buffers) {
+		/*
+		 * If the current transaction is already too large,
+		 * then start to commit it: we can then go back and
+		 * attach this handle to a new transaction.
+		 */
+		atomic_sub(total, &t->t_outstanding_credits);
+		wait_transaction_locked(journal);
+		return 1;
+	}
+
+	/*
+	 * The commit code assumes that it can get enough log space
+	 * without forcing a checkpoint.  This is *critical* for
+	 * correctness: a checkpoint of a buffer which is also
+	 * associated with a committing transaction creates a deadlock,
+	 * so commit simply cannot force through checkpoints.
+	 *
+	 * We must therefore ensure the necessary space in the journal
+	 * *before* starting to dirty potentially checkpointed buffers
+	 * in the new transaction.
+	 */
+	if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+		atomic_sub(total, &t->t_outstanding_credits);
+		read_unlock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
+		if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+			__jbd2_log_wait_for_space(journal);
+		write_unlock(&journal->j_state_lock);
+		return 1;
+	}
+
+	/* No reservation? We are done... */
+	if (!rsv_blocks)
+		return 0;
+
+	needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
+	/* We allow at most half of a transaction to be reserved */
+	if (needed > journal->j_max_transaction_buffers / 2) {
+		sub_reserved_credits(journal, rsv_blocks);
+		atomic_sub(total, &t->t_outstanding_credits);
+		read_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_reserved,
+			 atomic_read(&journal->j_reserved_credits) + rsv_blocks
+			 <= journal->j_max_transaction_buffers / 2);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * start_this_handle: Given a handle, deal with any locking or stalling
+ * needed to make sure that there is enough journal space for the handle
+ * to begin.  Attach the handle to a transaction and set up the
+ * transaction's buffer credits.
+ */
+
+static int start_this_handle(journal_t *journal, handle_t *handle,
+			     gfp_t gfp_mask)
+{
+	transaction_t	*transaction, *new_transaction = NULL;
+	int		blocks = handle->h_buffer_credits;
+	int		rsv_blocks = 0;
+	unsigned long ts = jiffies;
+
+	/*
+	 * 1/2 of transaction can be reserved so we can practically handle
+	 * only 1/2 of maximum transaction size per operation
+	 */
+	if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
+		printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
+		       current->comm, blocks,
+		       journal->j_max_transaction_buffers / 2);
+		return -ENOSPC;
+	}
+
+	if (handle->h_rsv_handle)
+		rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
+alloc_transaction:
+	if (!journal->j_running_transaction) {
+		new_transaction = kmem_cache_zalloc(transaction_cache,
+						    gfp_mask);
+		if (!new_transaction) {
+			/*
+			 * If __GFP_FS is not present, then we may be
+			 * being called from inside the fs writeback
+			 * layer, so we MUST NOT fail.  Since
+			 * __GFP_NOFAIL is going away, we will arrange
+			 * to retry the allocation ourselves.
+			 */
+			if ((gfp_mask & __GFP_FS) == 0) {
+				congestion_wait(BLK_RW_ASYNC, HZ/50);
+				goto alloc_transaction;
+			}
+			return -ENOMEM;
+		}
+	}
+
+	jbd_debug(3, "New handle %p going live.\n", handle);
+
+	/*
+	 * We need to hold j_state_lock until t_updates has been incremented,
+	 * for proper journal barrier handling
+	 */
+repeat:
+	read_lock(&journal->j_state_lock);
+	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
+	if (is_journal_aborted(journal) ||
+	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
+		read_unlock(&journal->j_state_lock);
+		jbd2_journal_free_transaction(new_transaction);
+		return -EROFS;
+	}
+
+	/*
+	 * Wait on the journal's transaction barrier if necessary. Specifically
+	 * we allow reserved handles to proceed because otherwise commit could
+	 * deadlock on page writeback not being able to complete.
+	 */
+	if (!handle->h_reserved && journal->j_barrier_count) {
+		read_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_transaction_locked,
+				journal->j_barrier_count == 0);
+		goto repeat;
+	}
+
+	if (!journal->j_running_transaction) {
+		read_unlock(&journal->j_state_lock);
+		if (!new_transaction)
+			goto alloc_transaction;
+		write_lock(&journal->j_state_lock);
+		if (!journal->j_running_transaction &&
+		    (handle->h_reserved || !journal->j_barrier_count)) {
+			jbd2_get_transaction(journal, new_transaction);
+			new_transaction = NULL;
+		}
+		write_unlock(&journal->j_state_lock);
+		goto repeat;
+	}
+
+	transaction = journal->j_running_transaction;
+
+	if (!handle->h_reserved) {
+		/* We may have dropped j_state_lock - restart in that case */
+		if (add_transaction_credits(journal, blocks, rsv_blocks))
+			goto repeat;
+	} else {
+		/*
+		 * We have handle reserved so we are allowed to join T_LOCKED
+		 * transaction and we don't have to check for transaction size
+		 * and journal space.
+		 */
+		sub_reserved_credits(journal, blocks);
+		handle->h_reserved = 0;
+	}
+
+	/* OK, account for the buffers that this operation expects to
+	 * use and add the handle to the running transaction. 
+	 */
+	update_t_max_wait(transaction, ts);
+	handle->h_transaction = transaction;
+	handle->h_requested_credits = blocks;
+	handle->h_start_jiffies = jiffies;
+	atomic_inc(&transaction->t_updates);
+	atomic_inc(&transaction->t_handle_count);
+	jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+		  handle, blocks,
+		  atomic_read(&transaction->t_outstanding_credits),
+		  jbd2_log_space_left(journal));
+	read_unlock(&journal->j_state_lock);
+	current->journal_info = handle;
+
+	lock_map_acquire(&handle->h_lockdep_map);
+	jbd2_journal_free_transaction(new_transaction);
+	return 0;
+}
+
+static struct lock_class_key jbd2_handle_key;
+
+/* Allocate a new handle.  This should probably be in a slab... */
+static handle_t *new_handle(int nblocks)
+{
+	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
+	if (!handle)
+		return NULL;
+	handle->h_buffer_credits = nblocks;
+	handle->h_ref = 1;
+
+	lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
+						&jbd2_handle_key, 0);
+
+	return handle;
+}
+
+/**
+ * handle_t *jbd2_journal_start() - Obtain a new handle.
+ * @journal: Journal to start transaction on.
+ * @nblocks: number of block buffer we might modify
+ *
+ * We make sure that the transaction can guarantee at least nblocks of
+ * modified buffers in the log.  We block until the log can guarantee
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * is stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
+ *
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
+ */
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+			      gfp_t gfp_mask, unsigned int type,
+			      unsigned int line_no)
+{
+	handle_t *handle = journal_current_handle();
+	int err;
+
+	if (!journal)
+		return ERR_PTR(-EROFS);
+
+	if (handle) {
+		J_ASSERT(handle->h_transaction->t_journal == journal);
+		handle->h_ref++;
+		return handle;
+	}
+
+	handle = new_handle(nblocks);
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+	if (rsv_blocks) {
+		handle_t *rsv_handle;
+
+		rsv_handle = new_handle(rsv_blocks);
+		if (!rsv_handle) {
+			jbd2_free_handle(handle);
+			return ERR_PTR(-ENOMEM);
+		}
+		rsv_handle->h_reserved = 1;
+		rsv_handle->h_journal = journal;
+		handle->h_rsv_handle = rsv_handle;
+	}
+
+	err = start_this_handle(journal, handle, gfp_mask);
+	if (err < 0) {
+		if (handle->h_rsv_handle)
+			jbd2_free_handle(handle->h_rsv_handle);
+		jbd2_free_handle(handle);
+		return ERR_PTR(err);
+	}
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
+				handle->h_transaction->t_tid, type,
+				line_no, nblocks);
+	return handle;
+}
+EXPORT_SYMBOL(jbd2__journal_start);
+
+
+handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+{
+	return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
+}
+EXPORT_SYMBOL(jbd2_journal_start);
+
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+	journal_t *journal = handle->h_journal;
+
+	WARN_ON(!handle->h_reserved);
+	sub_reserved_credits(journal, handle->h_buffer_credits);
+	jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
+ * @handle: handle to start
+ *
+ * Start handle that has been previously reserved with jbd2_journal_reserve().
+ * This attaches @handle to the running transaction (or creates one if there's
+ * not transaction running). Unlike jbd2_journal_start() this function cannot
+ * block on journal commit, checkpointing, or similar stuff. It can block on
+ * memory allocation or frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+				unsigned int line_no)
+{
+	journal_t *journal = handle->h_journal;
+	int ret = -EIO;
+
+	if (WARN_ON(!handle->h_reserved)) {
+		/* Someone passed in normal handle? Just stop it. */
+		jbd2_journal_stop(handle);
+		return ret;
+	}
+	/*
+	 * Usefulness of mixing of reserved and unreserved handles is
+	 * questionable. So far nobody seems to need it so just error out.
+	 */
+	if (WARN_ON(current->journal_info)) {
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+
+	handle->h_journal = NULL;
+	/*
+	 * GFP_NOFS is here because callers are likely from writeback or
+	 * similarly constrained call sites
+	 */
+	ret = start_this_handle(journal, handle, GFP_NOFS);
+	if (ret < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	return 0;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
+
+/**
+ * int jbd2_journal_extend() - extend buffer credits.
+ * @handle:  handle to 'extend'
+ * @nblocks: nr blocks to try to extend by.
+ *
+ * Some transactions, such as large extends and truncates, can be done
+ * atomically all at once or in several stages.  The operation requests
+ * a credit for a number of buffer modications in advance, but can
+ * extend its credit if it needs more.
+ *
+ * jbd2_journal_extend tries to give the running handle more buffer credits.
+ * It does not guarantee that allocation - this is a best-effort only.
+ * The calling process MUST be able to deal cleanly with a failure to
+ * extend here.
+ *
+ * Return 0 on success, non-zero on failure.
+ *
+ * return code < 0 implies an error
+ * return code > 0 implies normal transaction-full status.
+ */
+int jbd2_journal_extend(handle_t *handle, int nblocks)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	int result;
+	int wanted;
+
+	if (is_handle_aborted(handle))
+		return -EROFS;
+	journal = transaction->t_journal;
+
+	result = 1;
+
+	read_lock(&journal->j_state_lock);
+
+	/* Don't extend a locked-down transaction! */
+	if (transaction->t_state != T_RUNNING) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "transaction not running\n", handle, nblocks);
+		goto error_out;
+	}
+
+	spin_lock(&transaction->t_handle_lock);
+	wanted = atomic_add_return(nblocks,
+				   &transaction->t_outstanding_credits);
+
+	if (wanted > journal->j_max_transaction_buffers) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "transaction too large\n", handle, nblocks);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
+		goto unlock;
+	}
+
+	if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+	    jbd2_log_space_left(journal)) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "insufficient log space\n", handle, nblocks);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
+		goto unlock;
+	}
+
+	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
+				 transaction->t_tid,
+				 handle->h_type, handle->h_line_no,
+				 handle->h_buffer_credits,
+				 nblocks);
+
+	handle->h_buffer_credits += nblocks;
+	handle->h_requested_credits += nblocks;
+	result = 0;
+
+	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+unlock:
+	spin_unlock(&transaction->t_handle_lock);
+error_out:
+	read_unlock(&journal->j_state_lock);
+	return result;
+}
+
+
+/**
+ * int jbd2_journal_restart() - restart a handle .
+ * @handle:  handle to restart
+ * @nblocks: nr credits requested
+ *
+ * Restart a handle for a multi-transaction filesystem
+ * operation.
+ *
+ * If the jbd2_journal_extend() call above fails to grant new buffer credits
+ * to a running handle, a call to jbd2_journal_restart will commit the
+ * handle's transaction so far and reattach the handle to a new
+ * transaction capabable of guaranteeing the requested number of
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
+ */
+int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	tid_t		tid;
+	int		need_to_start, ret;
+
+	/* If we've had an abort of any type, don't even think about
+	 * actually doing the restart! */
+	if (is_handle_aborted(handle))
+		return 0;
+	journal = transaction->t_journal;
+
+	/*
+	 * First unlink the handle from its current transaction, and start the
+	 * commit on that.
+	 */
+	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+	J_ASSERT(journal_current_handle() == handle);
+
+	read_lock(&journal->j_state_lock);
+	spin_lock(&transaction->t_handle_lock);
+	atomic_sub(handle->h_buffer_credits,
+		   &transaction->t_outstanding_credits);
+	if (handle->h_rsv_handle) {
+		sub_reserved_credits(journal,
+				     handle->h_rsv_handle->h_buffer_credits);
+	}
+	if (atomic_dec_and_test(&transaction->t_updates))
+		wake_up(&journal->j_wait_updates);
+	tid = transaction->t_tid;
+	spin_unlock(&transaction->t_handle_lock);
+	handle->h_transaction = NULL;
+	current->journal_info = NULL;
+
+	jbd_debug(2, "restarting handle %p\n", handle);
+	need_to_start = !tid_geq(journal->j_commit_request, tid);
+	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
+
+	lock_map_release(&handle->h_lockdep_map);
+	handle->h_buffer_credits = nblocks;
+	ret = start_this_handle(journal, handle, gfp_mask);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2__journal_restart);
+
+
+int jbd2_journal_restart(handle_t *handle, int nblocks)
+{
+	return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_restart);
+
+/**
+ * void jbd2_journal_lock_updates () - establish a transaction barrier.
+ * @journal:  Journal to establish a barrier on.
+ *
+ * This locks out any further updates from being started, and blocks
+ * until all existing updates have completed, returning only once the
+ * journal is in a quiescent state with no updates running.
+ *
+ * The journal lock should not be held on entry.
+ */
+void jbd2_journal_lock_updates(journal_t *journal)
+{
+	DEFINE_WAIT(wait);
+
+	write_lock(&journal->j_state_lock);
+	++journal->j_barrier_count;
+
+	/* Wait until there are no reserved handles */
+	if (atomic_read(&journal->j_reserved_credits)) {
+		write_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_reserved,
+			   atomic_read(&journal->j_reserved_credits) == 0);
+		write_lock(&journal->j_state_lock);
+	}
+
+	/* Wait until there are no running updates */
+	while (1) {
+		transaction_t *transaction = journal->j_running_transaction;
+
+		if (!transaction)
+			break;
+
+		spin_lock(&transaction->t_handle_lock);
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (!atomic_read(&transaction->t_updates)) {
+			spin_unlock(&transaction->t_handle_lock);
+			finish_wait(&journal->j_wait_updates, &wait);
+			break;
+		}
+		spin_unlock(&transaction->t_handle_lock);
+		write_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_updates, &wait);
+		write_lock(&journal->j_state_lock);
+	}
+	write_unlock(&journal->j_state_lock);
+
+	/*
+	 * We have now established a barrier against other normal updates, but
+	 * we also need to barrier against other jbd2_journal_lock_updates() calls
+	 * to make sure that we serialise special journal-locked operations
+	 * too.
+	 */
+	mutex_lock(&journal->j_barrier);
+}
+
+/**
+ * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
+ * @journal:  Journal to release the barrier on.
+ *
+ * Release a transaction barrier obtained with jbd2_journal_lock_updates().
+ *
+ * Should be called without the journal lock held.
+ */
+void jbd2_journal_unlock_updates (journal_t *journal)
+{
+	J_ASSERT(journal->j_barrier_count != 0);
+
+	mutex_unlock(&journal->j_barrier);
+	write_lock(&journal->j_state_lock);
+	--journal->j_barrier_count;
+	write_unlock(&journal->j_state_lock);
+	wake_up(&journal->j_wait_transaction_locked);
+}
+
+static void warn_dirty_buffer(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+
+	printk(KERN_WARNING
+	       "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+	       "There's a risk of filesystem corruption in case of system "
+	       "crash.\n",
+	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
+}
+
+/*
+ * If the buffer is already part of the current transaction, then there
+ * is nothing we need to do.  If it is already part of a prior
+ * transaction which we are still committing to disk, then we need to
+ * make sure that we do not overwrite the old copy: we do copy-out to
+ * preserve the copy going to disk.  We also account the buffer against
+ * the handle's metadata buffer credits (unless the buffer is already
+ * part of the transaction, that is).
+ *
+ */
+static int
+do_get_write_access(handle_t *handle, struct journal_head *jh,
+			int force_copy)
+{
+	struct buffer_head *bh;
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	int error;
+	char *frozen_buffer = NULL;
+	int need_copy = 0;
+	unsigned long start_lock, time_lock;
+
+	if (is_handle_aborted(handle))
+		return -EROFS;
+	journal = transaction->t_journal;
+
+	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
+
+	JBUFFER_TRACE(jh, "entry");
+repeat:
+	bh = jh2bh(jh);
+
+	/* @@@ Need to check for errors here at some point. */
+
+ 	start_lock = jiffies;
+	lock_buffer(bh);
+	jbd_lock_bh_state(bh);
+
+	/* If it takes too long to lock the buffer, trace it */
+	time_lock = jbd2_time_diff(start_lock, jiffies);
+	if (time_lock > HZ/10)
+		trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
+			jiffies_to_msecs(time_lock));
+
+	/* We now hold the buffer lock so it is safe to query the buffer
+	 * state.  Is the buffer dirty?
+	 *
+	 * If so, there are two possibilities.  The buffer may be
+	 * non-journaled, and undergoing a quite legitimate writeback.
+	 * Otherwise, it is journaled, and we don't expect dirty buffers
+	 * in that state (the buffers should be marked JBD_Dirty
+	 * instead.)  So either the IO is being done under our own
+	 * control and this is a bug, or it's a third party IO such as
+	 * dump(8) (which may leave the buffer scheduled for read ---
+	 * ie. locked but not dirty) or tune2fs (which may actually have
+	 * the buffer dirtied, ugh.)  */
+
+	if (buffer_dirty(bh)) {
+		/*
+		 * First question: is this buffer already part of the current
+		 * transaction or the existing committing transaction?
+		 */
+		if (jh->b_transaction) {
+			J_ASSERT_JH(jh,
+				jh->b_transaction == transaction ||
+				jh->b_transaction ==
+					journal->j_committing_transaction);
+			if (jh->b_next_transaction)
+				J_ASSERT_JH(jh, jh->b_next_transaction ==
+							transaction);
+			warn_dirty_buffer(bh);
+		}
+		/*
+		 * In any case we need to clean the dirty flag and we must
+		 * do it under the buffer lock to be sure we don't race
+		 * with running write-out.
+		 */
+		JBUFFER_TRACE(jh, "Journalling dirty buffer");
+		clear_buffer_dirty(bh);
+		set_buffer_jbddirty(bh);
+	}
+
+	unlock_buffer(bh);
+
+	error = -EROFS;
+	if (is_handle_aborted(handle)) {
+		jbd_unlock_bh_state(bh);
+		goto out;
+	}
+	error = 0;
+
+	/*
+	 * The buffer is already part of this transaction if b_transaction or
+	 * b_next_transaction points to it
+	 */
+	if (jh->b_transaction == transaction ||
+	    jh->b_next_transaction == transaction)
+		goto done;
+
+	/*
+	 * this is the first time this transaction is touching this buffer,
+	 * reset the modified flag
+	 */
+       jh->b_modified = 0;
+
+	/*
+	 * If there is already a copy-out version of this buffer, then we don't
+	 * need to make another one
+	 */
+	if (jh->b_frozen_data) {
+		JBUFFER_TRACE(jh, "has frozen data");
+		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+		jh->b_next_transaction = transaction;
+		goto done;
+	}
+
+	/* Is there data here we need to preserve? */
+
+	if (jh->b_transaction && jh->b_transaction != transaction) {
+		JBUFFER_TRACE(jh, "owned by older transaction");
+		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+		J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_committing_transaction);
+
+		/* There is one case we have to be very careful about.
+		 * If the committing transaction is currently writing
+		 * this buffer out to disk and has NOT made a copy-out,
+		 * then we cannot modify the buffer contents at all
+		 * right now.  The essence of copy-out is that it is the
+		 * extra copy, not the primary copy, which gets
+		 * journaled.  If the primary copy is already going to
+		 * disk then we cannot do copy-out here. */
+
+		if (buffer_shadow(bh)) {
+			JBUFFER_TRACE(jh, "on shadow: sleep");
+			jbd_unlock_bh_state(bh);
+			wait_on_bit_io(&bh->b_state, BH_Shadow,
+				       TASK_UNINTERRUPTIBLE);
+			goto repeat;
+		}
+
+		/*
+		 * Only do the copy if the currently-owning transaction still
+		 * needs it. If buffer isn't on BJ_Metadata list, the
+		 * committing transaction is past that stage (here we use the
+		 * fact that BH_Shadow is set under bh_state lock together with
+		 * refiling to BJ_Shadow list and at this point we know the
+		 * buffer doesn't have BH_Shadow set).
+		 *
+		 * Subtle point, though: if this is a get_undo_access,
+		 * then we will be relying on the frozen_data to contain
+		 * the new value of the committed_data record after the
+		 * transaction, so we HAVE to force the frozen_data copy
+		 * in that case.
+		 */
+		if (jh->b_jlist == BJ_Metadata || force_copy) {
+			JBUFFER_TRACE(jh, "generate frozen data");
+			if (!frozen_buffer) {
+				JBUFFER_TRACE(jh, "allocate memory for buffer");
+				jbd_unlock_bh_state(bh);
+				frozen_buffer =
+					jbd2_alloc(jh2bh(jh)->b_size,
+							 GFP_NOFS);
+				if (!frozen_buffer) {
+					printk(KERN_ERR
+					       "%s: OOM for frozen_buffer\n",
+					       __func__);
+					JBUFFER_TRACE(jh, "oom!");
+					error = -ENOMEM;
+					jbd_lock_bh_state(bh);
+					goto done;
+				}
+				goto repeat;
+			}
+			jh->b_frozen_data = frozen_buffer;
+			frozen_buffer = NULL;
+			need_copy = 1;
+		}
+		jh->b_next_transaction = transaction;
+	}
+
+
+	/*
+	 * Finally, if the buffer is not journaled right now, we need to make
+	 * sure it doesn't get written to disk before the caller actually
+	 * commits the new data
+	 */
+	if (!jh->b_transaction) {
+		JBUFFER_TRACE(jh, "no transaction");
+		J_ASSERT_JH(jh, !jh->b_next_transaction);
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		spin_lock(&journal->j_list_lock);
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+		spin_unlock(&journal->j_list_lock);
+	}
+
+done:
+	if (need_copy) {
+		struct page *page;
+		int offset;
+		char *source;
+
+		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
+			    "Possible IO failure.\n");
+		page = jh2bh(jh)->b_page;
+		offset = offset_in_page(jh2bh(jh)->b_data);
+		source = kmap_atomic(page);
+		/* Fire data frozen trigger just before we copy the data */
+		jbd2_buffer_frozen_trigger(jh, source + offset,
+					   jh->b_triggers);
+		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
+		kunmap_atomic(source);
+
+		/*
+		 * Now that the frozen data is saved off, we need to store
+		 * any matching triggers.
+		 */
+		jh->b_frozen_triggers = jh->b_triggers;
+	}
+	jbd_unlock_bh_state(bh);
+
+	/*
+	 * If we are about to journal a buffer, then any revoke pending on it is
+	 * no longer valid
+	 */
+	jbd2_journal_cancel_revoke(handle, jh);
+
+out:
+	if (unlikely(frozen_buffer))	/* It's usually NULL */
+		jbd2_free(frozen_buffer, bh->b_size);
+
+	JBUFFER_TRACE(jh, "exit");
+	return error;
+}
+
+/**
+ * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
+ * @handle: transaction to add buffer modifications to
+ * @bh:     bh to be used for metadata writes
+ *
+ * Returns an error code or 0 on success.
+ *
+ * In full data journalling mode the buffer may be of type BJ_AsyncData,
+ * because we're write()ing a buffer which is also part of a shared mapping.
+ */
+
+int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
+{
+	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	int rc;
+
+	/* We do not want to get caught playing with fields which the
+	 * log thread also manipulates.  Make sure that the buffer
+	 * completes any outstanding IO before proceeding. */
+	rc = do_get_write_access(handle, jh, 0);
+	jbd2_journal_put_journal_head(jh);
+	return rc;
+}
+
+
+/*
+ * When the user wants to journal a newly created buffer_head
+ * (ie. getblk() returned a new buffer and we are going to populate it
+ * manually rather than reading off disk), then we need to keep the
+ * buffer_head locked until it has been completely filled with new
+ * data.  In this case, we should be able to make the assertion that
+ * the bh is not already part of an existing transaction.
+ *
+ * The buffer should already be locked by the caller by this point.
+ * There is no lock ranking violation: it was a newly created,
+ * unlocked buffer beforehand. */
+
+/**
+ * int jbd2_journal_get_create_access () - notify intent to use newly created bh
+ * @handle: transaction to new buffer to
+ * @bh: new buffer.
+ *
+ * Call this if you create a new bh.
+ */
+int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	int err;
+
+	jbd_debug(5, "journal_head %p\n", jh);
+	err = -EROFS;
+	if (is_handle_aborted(handle))
+		goto out;
+	journal = transaction->t_journal;
+	err = 0;
+
+	JBUFFER_TRACE(jh, "entry");
+	/*
+	 * The buffer may already belong to this transaction due to pre-zeroing
+	 * in the filesystem's new_block code.  It may also be on the previous,
+	 * committing transaction's lists, but it HAS to be in Forget state in
+	 * that case: the transaction must have deleted the buffer for it to be
+	 * reused here.
+	 */
+	jbd_lock_bh_state(bh);
+	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+		jh->b_transaction == NULL ||
+		(jh->b_transaction == journal->j_committing_transaction &&
+			  jh->b_jlist == BJ_Forget)));
+
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+	if (jh->b_transaction == NULL) {
+		/*
+		 * Previous jbd2_journal_forget() could have left the buffer
+		 * with jbddirty bit set because it was being committed. When
+		 * the commit finished, we've filed the buffer for
+		 * checkpointing and marked it dirty. Now we are reallocating
+		 * the buffer so the transaction freeing it must have
+		 * committed and so it's safe to clear the dirty bit.
+		 */
+		clear_buffer_dirty(jh2bh(jh));
+		/* first access by this transaction */
+		jh->b_modified = 0;
+
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		spin_lock(&journal->j_list_lock);
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+	} else if (jh->b_transaction == journal->j_committing_transaction) {
+		/* first access by this transaction */
+		jh->b_modified = 0;
+
+		JBUFFER_TRACE(jh, "set next transaction");
+		spin_lock(&journal->j_list_lock);
+		jh->b_next_transaction = transaction;
+	}
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+
+	/*
+	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
+	 * blocks which contain freed but then revoked metadata.  We need
+	 * to cancel the revoke in case we end up freeing it yet again
+	 * and the reallocating as data - this would cause a second revoke,
+	 * which hits an assertion error.
+	 */
+	JBUFFER_TRACE(jh, "cancelling revoke");
+	jbd2_journal_cancel_revoke(handle, jh);
+out:
+	jbd2_journal_put_journal_head(jh);
+	return err;
+}
+
+/**
+ * int jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
+ *     non-rewindable consequences
+ * @handle: transaction
+ * @bh: buffer to undo
+ *
+ * Sometimes there is a need to distinguish between metadata which has
+ * been committed to disk and that which has not.  The ext3fs code uses
+ * this for freeing and allocating space, we have to make sure that we
+ * do not reuse freed space until the deallocation has been committed,
+ * since if we overwrote that space we would make the delete
+ * un-rewindable in case of a crash.
+ *
+ * To deal with that, jbd2_journal_get_undo_access requests write access to a
+ * buffer for parts of non-rewindable operations such as delete
+ * operations on the bitmaps.  The journaling code must keep a copy of
+ * the buffer's contents prior to the undo_access call until such time
+ * as we know that the buffer has definitely been committed to disk.
+ *
+ * We never need to know which transaction the committed data is part
+ * of, buffers touched here are guaranteed to be dirtied later and so
+ * will be committed to a new transaction in due course, at which point
+ * we can discard the old committed data pointer.
+ *
+ * Returns error number or 0 on success.
+ */
+int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
+{
+	int err;
+	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	char *committed_data = NULL;
+
+	JBUFFER_TRACE(jh, "entry");
+
+	/*
+	 * Do this first --- it can drop the journal lock, so we want to
+	 * make sure that obtaining the committed_data is done
+	 * atomically wrt. completion of any outstanding commits.
+	 */
+	err = do_get_write_access(handle, jh, 1);
+	if (err)
+		goto out;
+
+repeat:
+	if (!jh->b_committed_data) {
+		committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
+		if (!committed_data) {
+			printk(KERN_ERR "%s: No memory for committed data\n",
+				__func__);
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+
+	jbd_lock_bh_state(bh);
+	if (!jh->b_committed_data) {
+		/* Copy out the current buffer contents into the
+		 * preserved, committed copy. */
+		JBUFFER_TRACE(jh, "generate b_committed data");
+		if (!committed_data) {
+			jbd_unlock_bh_state(bh);
+			goto repeat;
+		}
+
+		jh->b_committed_data = committed_data;
+		committed_data = NULL;
+		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
+	}
+	jbd_unlock_bh_state(bh);
+out:
+	jbd2_journal_put_journal_head(jh);
+	if (unlikely(committed_data))
+		jbd2_free(committed_data, bh->b_size);
+	return err;
+}
+
+/**
+ * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * @bh: buffer to trigger on
+ * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
+ *
+ * Set any triggers on this journal_head.  This is always safe, because
+ * triggers for a committing buffer will be saved off, and triggers for
+ * a running transaction will match the buffer in that transaction.
+ *
+ * Call with NULL to clear the triggers.
+ */
+void jbd2_journal_set_triggers(struct buffer_head *bh,
+			       struct jbd2_buffer_trigger_type *type)
+{
+	struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
+
+	if (WARN_ON(!jh))
+		return;
+	jh->b_triggers = type;
+	jbd2_journal_put_journal_head(jh);
+}
+
+void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
+				struct jbd2_buffer_trigger_type *triggers)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (!triggers || !triggers->t_frozen)
+		return;
+
+	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
+}
+
+void jbd2_buffer_abort_trigger(struct journal_head *jh,
+			       struct jbd2_buffer_trigger_type *triggers)
+{
+	if (!triggers || !triggers->t_abort)
+		return;
+
+	triggers->t_abort(triggers, jh2bh(jh));
+}
+
+
+
+/**
+ * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
+ * @handle: transaction to add buffer to.
+ * @bh: buffer to mark
+ *
+ * mark dirty metadata which needs to be journaled as part of the current
+ * transaction.
+ *
+ * The buffer must have previously had jbd2_journal_get_write_access()
+ * called so that it has a valid journal_head attached to the buffer
+ * head.
+ *
+ * The buffer is placed on the transaction's metadata list and is marked
+ * as belonging to the transaction.
+ *
+ * Returns error number or 0 on success.
+ *
+ * Special care needs to be taken if the buffer already belongs to the
+ * current committing transaction (in which case we should have frozen
+ * data present for that commit).  In that case, we don't relink the
+ * buffer: that only gets done when the old transaction finally
+ * completes its commit.
+ */
+int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	struct journal_head *jh;
+	int ret = 0;
+
+	if (is_handle_aborted(handle))
+		return -EROFS;
+	journal = transaction->t_journal;
+	jh = jbd2_journal_grab_journal_head(bh);
+	if (!jh) {
+		ret = -EUCLEAN;
+		goto out;
+	}
+	jbd_debug(5, "journal_head %p\n", jh);
+	JBUFFER_TRACE(jh, "entry");
+
+	jbd_lock_bh_state(bh);
+
+	if (jh->b_modified == 0) {
+		/*
+		 * This buffer's got modified and becoming part
+		 * of the transaction. This needs to be done
+		 * once a transaction -bzzz
+		 */
+		jh->b_modified = 1;
+		if (handle->h_buffer_credits <= 0) {
+			ret = -ENOSPC;
+			goto out_unlock_bh;
+		}
+		handle->h_buffer_credits--;
+	}
+
+	/*
+	 * fastpath, to avoid expensive locking.  If this buffer is already
+	 * on the running transaction's metadata list there is nothing to do.
+	 * Nobody can take it off again because there is a handle open.
+	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
+	 * result in this test being false, so we go in and take the locks.
+	 */
+	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
+		JBUFFER_TRACE(jh, "fastpath");
+		if (unlikely(jh->b_transaction !=
+			     journal->j_running_transaction)) {
+			printk(KERN_ERR "JBD2: %s: "
+			       "jh->b_transaction (%llu, %p, %u) != "
+			       "journal->j_running_transaction (%p, %u)\n",
+			       journal->j_devname,
+			       (unsigned long long) bh->b_blocknr,
+			       jh->b_transaction,
+			       jh->b_transaction ? jh->b_transaction->t_tid : 0,
+			       journal->j_running_transaction,
+			       journal->j_running_transaction ?
+			       journal->j_running_transaction->t_tid : 0);
+			ret = -EINVAL;
+		}
+		goto out_unlock_bh;
+	}
+
+	set_buffer_jbddirty(bh);
+
+	/*
+	 * Metadata already on the current transaction list doesn't
+	 * need to be filed.  Metadata on another transaction's list must
+	 * be committing, and will be refiled once the commit completes:
+	 * leave it alone for now.
+	 */
+	if (jh->b_transaction != transaction) {
+		JBUFFER_TRACE(jh, "already on other transaction");
+		if (unlikely(((jh->b_transaction !=
+			       journal->j_committing_transaction)) ||
+			     (jh->b_next_transaction != transaction))) {
+			printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
+			       "bad jh for block %llu: "
+			       "transaction (%p, %u), "
+			       "jh->b_transaction (%p, %u), "
+			       "jh->b_next_transaction (%p, %u), jlist %u\n",
+			       journal->j_devname,
+			       (unsigned long long) bh->b_blocknr,
+			       transaction, transaction->t_tid,
+			       jh->b_transaction,
+			       jh->b_transaction ?
+			       jh->b_transaction->t_tid : 0,
+			       jh->b_next_transaction,
+			       jh->b_next_transaction ?
+			       jh->b_next_transaction->t_tid : 0,
+			       jh->b_jlist);
+			WARN_ON(1);
+			ret = -EINVAL;
+		}
+		/* And this case is illegal: we can't reuse another
+		 * transaction's data buffer, ever. */
+		goto out_unlock_bh;
+	}
+
+	/* That test should have eliminated the following case: */
+	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
+
+	JBUFFER_TRACE(jh, "file as BJ_Metadata");
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
+	spin_unlock(&journal->j_list_lock);
+out_unlock_bh:
+	jbd_unlock_bh_state(bh);
+	jbd2_journal_put_journal_head(jh);
+out:
+	JBUFFER_TRACE(jh, "exit");
+	return ret;
+}
+
+/**
+ * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
+ * @handle: transaction handle
+ * @bh:     bh to 'forget'
+ *
+ * We can only do the bforget if there are no commits pending against the
+ * buffer.  If the buffer is dirty in the current running transaction we
+ * can safely unlink it.
+ *
+ * bh may not be a journalled buffer at all - it may be a non-JBD
+ * buffer which came off the hashtable.  Check for this.
+ *
+ * Decrements bh->b_count by one.
+ *
+ * Allow this call even if the handle has aborted --- it may be part of
+ * the caller's cleanup after an abort.
+ */
+int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	struct journal_head *jh;
+	int drop_reserve = 0;
+	int err = 0;
+	int was_modified = 0;
+
+	if (is_handle_aborted(handle))
+		return -EROFS;
+	journal = transaction->t_journal;
+
+	BUFFER_TRACE(bh, "entry");
+
+	jbd_lock_bh_state(bh);
+
+	if (!buffer_jbd(bh))
+		goto not_jbd;
+	jh = bh2jh(bh);
+
+	/* Critical error: attempting to delete a bitmap buffer, maybe?
+	 * Don't do any jbd operations, and return an error. */
+	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
+			 "inconsistent data on disk")) {
+		err = -EIO;
+		goto not_jbd;
+	}
+
+	/* keep track of whether or not this transaction modified us */
+	was_modified = jh->b_modified;
+
+	/*
+	 * The buffer's going from the transaction, we must drop
+	 * all references -bzzz
+	 */
+	jh->b_modified = 0;
+
+	if (jh->b_transaction == transaction) {
+		J_ASSERT_JH(jh, !jh->b_frozen_data);
+
+		/* If we are forgetting a buffer which is already part
+		 * of this transaction, then we can just drop it from
+		 * the transaction immediately. */
+		clear_buffer_dirty(bh);
+		clear_buffer_jbddirty(bh);
+
+		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+
+		/*
+		 * we only want to drop a reference if this transaction
+		 * modified the buffer
+		 */
+		if (was_modified)
+			drop_reserve = 1;
+
+		/*
+		 * We are no longer going to journal this buffer.
+		 * However, the commit of this transaction is still
+		 * important to the buffer: the delete that we are now
+		 * processing might obsolete an old log entry, so by
+		 * committing, we can satisfy the buffer's checkpoint.
+		 *
+		 * So, if we have a checkpoint on the buffer, we should
+		 * now refile the buffer on our BJ_Forget list so that
+		 * we know to remove the checkpoint after we commit.
+		 */
+
+		spin_lock(&journal->j_list_lock);
+		if (jh->b_cp_transaction) {
+			__jbd2_journal_temp_unlink_buffer(jh);
+			__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+		} else {
+			__jbd2_journal_unfile_buffer(jh);
+			if (!buffer_jbd(bh)) {
+				spin_unlock(&journal->j_list_lock);
+				jbd_unlock_bh_state(bh);
+				__bforget(bh);
+				goto drop;
+			}
+		}
+		spin_unlock(&journal->j_list_lock);
+	} else if (jh->b_transaction) {
+		J_ASSERT_JH(jh, (jh->b_transaction ==
+				 journal->j_committing_transaction));
+		/* However, if the buffer is still owned by a prior
+		 * (committing) transaction, we can't drop it yet... */
+		JBUFFER_TRACE(jh, "belongs to older transaction");
+		/* ... but we CAN drop it from the new transaction if we
+		 * have also modified it since the original commit. */
+
+		if (jh->b_next_transaction) {
+			J_ASSERT(jh->b_next_transaction == transaction);
+			spin_lock(&journal->j_list_lock);
+			jh->b_next_transaction = NULL;
+			spin_unlock(&journal->j_list_lock);
+
+			/*
+			 * only drop a reference if this transaction modified
+			 * the buffer
+			 */
+			if (was_modified)
+				drop_reserve = 1;
+		}
+	}
+
+not_jbd:
+	jbd_unlock_bh_state(bh);
+	__brelse(bh);
+drop:
+	if (drop_reserve) {
+		/* no need to reserve log space for this block -bzzz */
+		handle->h_buffer_credits++;
+	}
+	return err;
+}
+
+/**
+ * int jbd2_journal_stop() - complete a transaction
+ * @handle: tranaction to complete.
+ *
+ * All done for a particular handle.
+ *
+ * There is not much action needed here.  We just return any remaining
+ * buffer credits to the transaction and remove the handle.  The only
+ * complication is that we need to start a commit operation if the
+ * filesystem is marked for synchronous update.
+ *
+ * jbd2_journal_stop itself will not usually return an error, but it may
+ * do so in unusual circumstances.  In particular, expect it to
+ * return -EIO if a jbd2_journal_abort has been executed since the
+ * transaction began.
+ */
+int jbd2_journal_stop(handle_t *handle)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+	int err = 0, wait_for_commit = 0;
+	tid_t tid;
+	pid_t pid;
+
+	if (!transaction) {
+		/*
+		 * Handle is already detached from the transaction so
+		 * there is nothing to do other than decrease a refcount,
+		 * or free the handle if refcount drops to zero
+		 */
+		if (--handle->h_ref > 0) {
+			jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+							 handle->h_ref);
+			return err;
+		} else {
+			if (handle->h_rsv_handle)
+				jbd2_free_handle(handle->h_rsv_handle);
+			goto free_and_exit;
+		}
+	}
+	journal = transaction->t_journal;
+
+	J_ASSERT(journal_current_handle() == handle);
+
+	if (is_handle_aborted(handle))
+		err = -EIO;
+	else
+		J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+
+	if (--handle->h_ref > 0) {
+		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+			  handle->h_ref);
+		return err;
+	}
+
+	jbd_debug(4, "Handle %p going down\n", handle);
+	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
+				transaction->t_tid,
+				handle->h_type, handle->h_line_no,
+				jiffies - handle->h_start_jiffies,
+				handle->h_sync, handle->h_requested_credits,
+				(handle->h_requested_credits -
+				 handle->h_buffer_credits));
+
+	/*
+	 * Implement synchronous transaction batching.  If the handle
+	 * was synchronous, don't force a commit immediately.  Let's
+	 * yield and let another thread piggyback onto this
+	 * transaction.  Keep doing that while new threads continue to
+	 * arrive.  It doesn't cost much - we're about to run a commit
+	 * and sleep on IO anyway.  Speeds up many-threaded, many-dir
+	 * operations by 30x or more...
+	 *
+	 * We try and optimize the sleep time against what the
+	 * underlying disk can do, instead of having a static sleep
+	 * time.  This is useful for the case where our storage is so
+	 * fast that it is more optimal to go ahead and force a flush
+	 * and wait for the transaction to be committed than it is to
+	 * wait for an arbitrary amount of time for new writers to
+	 * join the transaction.  We achieve this by measuring how
+	 * long it takes to commit a transaction, and compare it with
+	 * how long this transaction has been running, and if run time
+	 * < commit time then we sleep for the delta and commit.  This
+	 * greatly helps super fast disks that would see slowdowns as
+	 * more threads started doing fsyncs.
+	 *
+	 * But don't do this if this process was the most recent one
+	 * to perform a synchronous write.  We do this to detect the
+	 * case where a single process is doing a stream of sync
+	 * writes.  No point in waiting for joiners in that case.
+	 *
+	 * Setting max_batch_time to 0 disables this completely.
+	 */
+	pid = current->pid;
+	if (handle->h_sync && journal->j_last_sync_writer != pid &&
+	    journal->j_max_batch_time) {
+		u64 commit_time, trans_time;
+
+		journal->j_last_sync_writer = pid;
+
+		read_lock(&journal->j_state_lock);
+		commit_time = journal->j_average_commit_time;
+		read_unlock(&journal->j_state_lock);
+
+		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+						   transaction->t_start_time));
+
+		commit_time = max_t(u64, commit_time,
+				    1000*journal->j_min_batch_time);
+		commit_time = min_t(u64, commit_time,
+				    1000*journal->j_max_batch_time);
+
+		if (trans_time < commit_time) {
+			ktime_t expires = ktime_add_ns(ktime_get(),
+						       commit_time);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
+	}
+
+	if (handle->h_sync)
+		transaction->t_synchronous_commit = 1;
+	current->journal_info = NULL;
+	atomic_sub(handle->h_buffer_credits,
+		   &transaction->t_outstanding_credits);
+
+	/*
+	 * If the handle is marked SYNC, we need to set another commit
+	 * going!  We also want to force a commit if the current
+	 * transaction is occupying too much of the log, or if the
+	 * transaction is too old now.
+	 */
+	if (handle->h_sync ||
+	    (atomic_read(&transaction->t_outstanding_credits) >
+	     journal->j_max_transaction_buffers) ||
+	    time_after_eq(jiffies, transaction->t_expires)) {
+		/* Do this even for aborted journals: an abort still
+		 * completes the commit thread, it just doesn't write
+		 * anything to disk. */
+
+		jbd_debug(2, "transaction too old, requesting commit for "
+					"handle %p\n", handle);
+		/* This is non-blocking */
+		jbd2_log_start_commit(journal, transaction->t_tid);
+
+		/*
+		 * Special case: JBD2_SYNC synchronous updates require us
+		 * to wait for the commit to complete.
+		 */
+		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+			wait_for_commit = 1;
+	}
+
+	/*
+	 * Once we drop t_updates, if it goes to zero the transaction
+	 * could start committing on us and eventually disappear.  So
+	 * once we do this, we must not dereference transaction
+	 * pointer again.
+	 */
+	tid = transaction->t_tid;
+	if (atomic_dec_and_test(&transaction->t_updates)) {
+		wake_up(&journal->j_wait_updates);
+		if (journal->j_barrier_count)
+			wake_up(&journal->j_wait_transaction_locked);
+	}
+
+	if (wait_for_commit)
+		err = jbd2_log_wait_commit(journal, tid);
+
+	lock_map_release(&handle->h_lockdep_map);
+
+	if (handle->h_rsv_handle)
+		jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
+	jbd2_free_handle(handle);
+	return err;
+}
+
+/*
+ *
+ * List management code snippets: various functions for manipulating the
+ * transaction buffer lists.
+ *
+ */
+
+/*
+ * Append a buffer to a transaction list, given the transaction's list head
+ * pointer.
+ *
+ * j_list_lock is held.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+{
+	if (!*list) {
+		jh->b_tnext = jh->b_tprev = jh;
+		*list = jh;
+	} else {
+		/* Insert at the tail of the list to preserve order */
+		struct journal_head *first = *list, *last = first->b_tprev;
+		jh->b_tprev = last;
+		jh->b_tnext = first;
+		last->b_tnext = first->b_tprev = jh;
+	}
+}
+
+/*
+ * Remove a buffer from a transaction list, given the transaction's list
+ * head pointer.
+ *
+ * Called with j_list_lock held, and the journal may not be locked.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+{
+	if (*list == jh) {
+		*list = jh->b_tnext;
+		if (*list == jh)
+			*list = NULL;
+	}
+	jh->b_tprev->b_tnext = jh->b_tnext;
+	jh->b_tnext->b_tprev = jh->b_tprev;
+}
+
+/*
+ * Remove a buffer from the appropriate transaction list.
+ *
+ * Note that this function can *change* the value of
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list.  If the caller is holding onto a copy of one of these
+ * pointers, it could go bad.  Generally the caller needs to re-read the
+ * pointer from the transaction_t.
+ *
+ * Called under j_list_lock.
+ */
+static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
+{
+	struct journal_head **list = NULL;
+	transaction_t *transaction;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	transaction = jh->b_transaction;
+	if (transaction)
+		assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+	if (jh->b_jlist != BJ_None)
+		J_ASSERT_JH(jh, transaction != NULL);
+
+	switch (jh->b_jlist) {
+	case BJ_None:
+		return;
+	case BJ_Metadata:
+		transaction->t_nr_buffers--;
+		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+		list = &transaction->t_buffers;
+		break;
+	case BJ_Forget:
+		list = &transaction->t_forget;
+		break;
+	case BJ_Shadow:
+		list = &transaction->t_shadow_list;
+		break;
+	case BJ_Reserved:
+		list = &transaction->t_reserved_list;
+		break;
+	}
+
+	__blist_del_buffer(list, jh);
+	jh->b_jlist = BJ_None;
+	if (test_clear_buffer_jbddirty(bh))
+		mark_buffer_dirty(bh);	/* Expose it to the VM */
+}
+
+/*
+ * Remove buffer from all transactions.
+ *
+ * Called with bh_state lock and j_list_lock
+ *
+ * jh and bh may be already freed when this function returns.
+ */
+static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
+{
+	__jbd2_journal_temp_unlink_buffer(jh);
+	jh->b_transaction = NULL;
+	jbd2_journal_put_journal_head(jh);
+}
+
+void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	/* Get reference so that buffer cannot be freed before we unlock it */
+	get_bh(bh);
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_unfile_buffer(jh);
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	__brelse(bh);
+}
+
+/*
+ * Called from jbd2_journal_try_to_free_buffers().
+ *
+ * Called under jbd_lock_bh_state(bh)
+ */
+static void
+__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
+{
+	struct journal_head *jh;
+
+	jh = bh2jh(bh);
+
+	if (buffer_locked(bh) || buffer_dirty(bh))
+		goto out;
+
+	if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
+		goto out;
+
+	spin_lock(&journal->j_list_lock);
+	if (jh->b_cp_transaction != NULL) {
+		/* written-back checkpointed metadata buffer */
+		JBUFFER_TRACE(jh, "remove from checkpoint list");
+		__jbd2_journal_remove_checkpoint(jh);
+	}
+	spin_unlock(&journal->j_list_lock);
+out:
+	return;
+}
+
+/**
+ * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
+ * @journal: journal for operation
+ * @page: to try and free
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
+ *
+ *
+ * For all the buffers on this page,
+ * if they are fully written out ordered data, move them onto BUF_CLEAN
+ * so try_to_free_buffers() can reap them.
+ *
+ * This function returns non-zero if we wish try_to_free_buffers()
+ * to be called. We do this if the page is releasable by try_to_free_buffers().
+ * We also do it if the page has locked or dirty buffers and the caller wants
+ * us to perform sync or async writeout.
+ *
+ * This complicates JBD locking somewhat.  We aren't protected by the
+ * BKL here.  We wish to remove the buffer from its committing or
+ * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
+ *
+ * This may *change* the value of transaction_t->t_datalist, so anyone
+ * who looks at t_datalist needs to lock against this function.
+ *
+ * Even worse, someone may be doing a jbd2_journal_dirty_data on this
+ * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
+ * will come out of the lock with the buffer dirty, which makes it
+ * ineligible for release here.
+ *
+ * Who else is affected by this?  hmm...  Really the only contender
+ * is do_get_write_access() - it could be looking at the buffer while
+ * journal_try_to_free_buffer() is changing its state.  But that
+ * cannot happen because we never reallocate freed data as metadata
+ * while the data is part of a transaction.  Yes?
+ *
+ * Return 0 on failure, 1 on success
+ */
+int jbd2_journal_try_to_free_buffers(journal_t *journal,
+				struct page *page, gfp_t gfp_mask)
+{
+	struct buffer_head *head;
+	struct buffer_head *bh;
+	int ret = 0;
+
+	J_ASSERT(PageLocked(page));
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		struct journal_head *jh;
+
+		/*
+		 * We take our own ref against the journal_head here to avoid
+		 * having to add tons of locking around each instance of
+		 * jbd2_journal_put_journal_head().
+		 */
+		jh = jbd2_journal_grab_journal_head(bh);
+		if (!jh)
+			continue;
+
+		jbd_lock_bh_state(bh);
+		__journal_try_to_free_buffer(journal, bh);
+		jbd2_journal_put_journal_head(jh);
+		jbd_unlock_bh_state(bh);
+		if (buffer_jbd(bh))
+			goto busy;
+	} while ((bh = bh->b_this_page) != head);
+
+	ret = try_to_free_buffers(page);
+
+busy:
+	return ret;
+}
+
+/*
+ * This buffer is no longer needed.  If it is on an older transaction's
+ * checkpoint list we need to record it on this transaction's forget list
+ * to pin this buffer (and hence its checkpointing transaction) down until
+ * this transaction commits.  If the buffer isn't on a checkpoint list, we
+ * release it.
+ * Returns non-zero if JBD no longer has an interest in the buffer.
+ *
+ * Called under j_list_lock.
+ *
+ * Called under jbd_lock_bh_state(bh).
+ */
+static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
+{
+	int may_free = 1;
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (jh->b_cp_transaction) {
+		JBUFFER_TRACE(jh, "on running+cp transaction");
+		__jbd2_journal_temp_unlink_buffer(jh);
+		/*
+		 * We don't want to write the buffer anymore, clear the
+		 * bit so that we don't confuse checks in
+		 * __journal_file_buffer
+		 */
+		clear_buffer_dirty(bh);
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+		may_free = 0;
+	} else {
+		JBUFFER_TRACE(jh, "on running transaction");
+		__jbd2_journal_unfile_buffer(jh);
+	}
+	return may_free;
+}
+
+/*
+ * jbd2_journal_invalidatepage
+ *
+ * This code is tricky.  It has a number of cases to deal with.
+ *
+ * There are two invariants which this code relies on:
+ *
+ * i_size must be updated on disk before we start calling invalidatepage on the
+ * data.
+ *
+ *  This is done in ext3 by defining an ext3_setattr method which
+ *  updates i_size before truncate gets going.  By maintaining this
+ *  invariant, we can be sure that it is safe to throw away any buffers
+ *  attached to the current transaction: once the transaction commits,
+ *  we know that the data will not be needed.
+ *
+ *  Note however that we can *not* throw away data belonging to the
+ *  previous, committing transaction!
+ *
+ * Any disk blocks which *are* part of the previous, committing
+ * transaction (and which therefore cannot be discarded immediately) are
+ * not going to be reused in the new running transaction
+ *
+ *  The bitmap committed_data images guarantee this: any block which is
+ *  allocated in one transaction and removed in the next will be marked
+ *  as in-use in the committed_data bitmap, so cannot be reused until
+ *  the next transaction to delete the block commits.  This means that
+ *  leaving committing buffers dirty is quite safe: the disk blocks
+ *  cannot be reallocated to a different file and so buffer aliasing is
+ *  not possible.
+ *
+ *
+ * The above applies mainly to ordered data mode.  In writeback mode we
+ * don't make guarantees about the order in which data hits disk --- in
+ * particular we don't guarantee that new dirty data is flushed before
+ * transaction commit --- so it is always safe just to discard data
+ * immediately in that mode.  --sct
+ */
+
+/*
+ * The journal_unmap_buffer helper function returns zero if the buffer
+ * concerned remains pinned as an anonymous buffer belonging to an older
+ * transaction.
+ *
+ * We're outside-transaction here.  Either or both of j_running_transaction
+ * and j_committing_transaction may be NULL.
+ */
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
+				int partial_page)
+{
+	transaction_t *transaction;
+	struct journal_head *jh;
+	int may_free = 1;
+
+	BUFFER_TRACE(bh, "entry");
+
+	/*
+	 * It is safe to proceed here without the j_list_lock because the
+	 * buffers cannot be stolen by try_to_free_buffers as long as we are
+	 * holding the page lock. --sct
+	 */
+
+	if (!buffer_jbd(bh))
+		goto zap_buffer_unlocked;
+
+	/* OK, we have data buffer in journaled mode */
+	write_lock(&journal->j_state_lock);
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+
+	jh = jbd2_journal_grab_journal_head(bh);
+	if (!jh)
+		goto zap_buffer_no_jh;
+
+	/*
+	 * We cannot remove the buffer from checkpoint lists until the
+	 * transaction adding inode to orphan list (let's call it T)
+	 * is committed.  Otherwise if the transaction changing the
+	 * buffer would be cleaned from the journal before T is
+	 * committed, a crash will cause that the correct contents of
+	 * the buffer will be lost.  On the other hand we have to
+	 * clear the buffer dirty bit at latest at the moment when the
+	 * transaction marking the buffer as freed in the filesystem
+	 * structures is committed because from that moment on the
+	 * block can be reallocated and used by a different page.
+	 * Since the block hasn't been freed yet but the inode has
+	 * already been added to orphan list, it is safe for us to add
+	 * the buffer to BJ_Forget list of the newest transaction.
+	 *
+	 * Also we have to clear buffer_mapped flag of a truncated buffer
+	 * because the buffer_head may be attached to the page straddling
+	 * i_size (can happen only when blocksize < pagesize) and thus the
+	 * buffer_head can be reused when the file is extended again. So we end
+	 * up keeping around invalidated buffers attached to transactions'
+	 * BJ_Forget list just to stop checkpointing code from cleaning up
+	 * the transaction this buffer was modified in.
+	 */
+	transaction = jh->b_transaction;
+	if (transaction == NULL) {
+		/* First case: not on any transaction.  If it
+		 * has no checkpoint link, then we can zap it:
+		 * it's a writeback-mode buffer so we don't care
+		 * if it hits disk safely. */
+		if (!jh->b_cp_transaction) {
+			JBUFFER_TRACE(jh, "not on any transaction: zap");
+			goto zap_buffer;
+		}
+
+		if (!buffer_dirty(bh)) {
+			/* bdflush has written it.  We can drop it now */
+			goto zap_buffer;
+		}
+
+		/* OK, it must be in the journal but still not
+		 * written fully to disk: it's metadata or
+		 * journaled data... */
+
+		if (journal->j_running_transaction) {
+			/* ... and once the current transaction has
+			 * committed, the buffer won't be needed any
+			 * longer. */
+			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
+			may_free = __dispose_buffer(jh,
+					journal->j_running_transaction);
+			goto zap_buffer;
+		} else {
+			/* There is no currently-running transaction. So the
+			 * orphan record which we wrote for this file must have
+			 * passed into commit.  We must attach this buffer to
+			 * the committing transaction, if it exists. */
+			if (journal->j_committing_transaction) {
+				JBUFFER_TRACE(jh, "give to committing trans");
+				may_free = __dispose_buffer(jh,
+					journal->j_committing_transaction);
+				goto zap_buffer;
+			} else {
+				/* The orphan record's transaction has
+				 * committed.  We can cleanse this buffer */
+				clear_buffer_jbddirty(bh);
+				goto zap_buffer;
+			}
+		}
+	} else if (transaction == journal->j_committing_transaction) {
+		JBUFFER_TRACE(jh, "on committing transaction");
+		/*
+		 * The buffer is committing, we simply cannot touch
+		 * it. If the page is straddling i_size we have to wait
+		 * for commit and try again.
+		 */
+		if (partial_page) {
+			jbd2_journal_put_journal_head(jh);
+			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
+			write_unlock(&journal->j_state_lock);
+			return -EBUSY;
+		}
+		/*
+		 * OK, buffer won't be reachable after truncate. We just set
+		 * j_next_transaction to the running transaction (if there is
+		 * one) and mark buffer as freed so that commit code knows it
+		 * should clear dirty bits when it is done with the buffer.
+		 */
+		set_buffer_freed(bh);
+		if (journal->j_running_transaction && buffer_jbddirty(bh))
+			jh->b_next_transaction = journal->j_running_transaction;
+		jbd2_journal_put_journal_head(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		write_unlock(&journal->j_state_lock);
+		return 0;
+	} else {
+		/* Good, the buffer belongs to the running transaction.
+		 * We are writing our own transaction's data, not any
+		 * previous one's, so it is safe to throw it away
+		 * (remember that we expect the filesystem to have set
+		 * i_size already for this truncate so recovery will not
+		 * expose the disk blocks we are discarding here.) */
+		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+		JBUFFER_TRACE(jh, "on running transaction");
+		may_free = __dispose_buffer(jh, transaction);
+	}
+
+zap_buffer:
+	/*
+	 * This is tricky. Although the buffer is truncated, it may be reused
+	 * if blocksize < pagesize and it is attached to the page straddling
+	 * EOF. Since the buffer might have been added to BJ_Forget list of the
+	 * running transaction, journal_get_write_access() won't clear
+	 * b_modified and credit accounting gets confused. So clear b_modified
+	 * here.
+	 */
+	jh->b_modified = 0;
+	jbd2_journal_put_journal_head(jh);
+zap_buffer_no_jh:
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	write_unlock(&journal->j_state_lock);
+zap_buffer_unlocked:
+	clear_buffer_dirty(bh);
+	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	clear_buffer_delay(bh);
+	clear_buffer_unwritten(bh);
+	bh->b_bdev = NULL;
+	return may_free;
+}
+
+/**
+ * void jbd2_journal_invalidatepage()
+ * @journal: journal to use for flush...
+ * @page:    page to flush
+ * @offset:  start of the range to invalidate
+ * @length:  length of the range to invalidate
+ *
+ * Reap page buffers containing data after in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
+ */
+int jbd2_journal_invalidatepage(journal_t *journal,
+				struct page *page,
+				unsigned int offset,
+				unsigned int length)
+{
+	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
+	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
+	int may_free = 1;
+	int ret = 0;
+
+	if (!PageLocked(page))
+		BUG();
+	if (!page_has_buffers(page))
+		return 0;
+
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
+	/* We will potentially be playing with lists other than just the
+	 * data lists (especially for journaled data mode), so be
+	 * cautious in our locking. */
+
+	head = bh = page_buffers(page);
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
+
+		if (next_off > stop)
+			return 0;
+
+		if (offset <= curr_off) {
+			/* This block is wholly outside the truncation point */
+			lock_buffer(bh);
+			ret = journal_unmap_buffer(journal, bh, partial_page);
+			unlock_buffer(bh);
+			if (ret < 0)
+				return ret;
+			may_free &= ret;
+		}
+		curr_off = next_off;
+		bh = next;
+
+	} while (bh != head);
+
+	if (!partial_page) {
+		if (may_free && try_to_free_buffers(page))
+			J_ASSERT(!page_has_buffers(page));
+	}
+	return 0;
+}
+
+/*
+ * File a buffer on the given transaction list.
+ */
+void __jbd2_journal_file_buffer(struct journal_head *jh,
+			transaction_t *transaction, int jlist)
+{
+	struct journal_head **list = NULL;
+	int was_dirty = 0;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+				jh->b_transaction == NULL);
+
+	if (jh->b_transaction && jh->b_jlist == jlist)
+		return;
+
+	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		/*
+		 * For metadata buffers, we track dirty bit in buffer_jbddirty
+		 * instead of buffer_dirty. We should not see a dirty bit set
+		 * here because we clear it in do_get_write_access but e.g.
+		 * tune2fs can modify the sb and set the dirty bit at any time
+		 * so we try to gracefully handle that.
+		 */
+		if (buffer_dirty(bh))
+			warn_dirty_buffer(bh);
+		if (test_clear_buffer_dirty(bh) ||
+		    test_clear_buffer_jbddirty(bh))
+			was_dirty = 1;
+	}
+
+	if (jh->b_transaction)
+		__jbd2_journal_temp_unlink_buffer(jh);
+	else
+		jbd2_journal_grab_journal_head(bh);
+	jh->b_transaction = transaction;
+
+	switch (jlist) {
+	case BJ_None:
+		J_ASSERT_JH(jh, !jh->b_committed_data);
+		J_ASSERT_JH(jh, !jh->b_frozen_data);
+		return;
+	case BJ_Metadata:
+		transaction->t_nr_buffers++;
+		list = &transaction->t_buffers;
+		break;
+	case BJ_Forget:
+		list = &transaction->t_forget;
+		break;
+	case BJ_Shadow:
+		list = &transaction->t_shadow_list;
+		break;
+	case BJ_Reserved:
+		list = &transaction->t_reserved_list;
+		break;
+	}
+
+	__blist_add_buffer(list, jh);
+	jh->b_jlist = jlist;
+
+	if (was_dirty)
+		set_buffer_jbddirty(bh);
+}
+
+void jbd2_journal_file_buffer(struct journal_head *jh,
+				transaction_t *transaction, int jlist)
+{
+	jbd_lock_bh_state(jh2bh(jh));
+	spin_lock(&transaction->t_journal->j_list_lock);
+	__jbd2_journal_file_buffer(jh, transaction, jlist);
+	spin_unlock(&transaction->t_journal->j_list_lock);
+	jbd_unlock_bh_state(jh2bh(jh));
+}
+
+/*
+ * Remove a buffer from its current buffer list in preparation for
+ * dropping it from its current transaction entirely.  If the buffer has
+ * already started to be used by a subsequent transaction, refile the
+ * buffer on that transaction's metadata list.
+ *
+ * Called under j_list_lock
+ * Called under jbd_lock_bh_state(jh2bh(jh))
+ *
+ * jh and bh may be already free when this function returns
+ */
+void __jbd2_journal_refile_buffer(struct journal_head *jh)
+{
+	int was_dirty, jlist;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	if (jh->b_transaction)
+		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
+
+	/* If the buffer is now unused, just drop it. */
+	if (jh->b_next_transaction == NULL) {
+		__jbd2_journal_unfile_buffer(jh);
+		return;
+	}
+
+	/*
+	 * It has been modified by a later transaction: add it to the new
+	 * transaction's metadata list.
+	 */
+
+	was_dirty = test_clear_buffer_jbddirty(bh);
+	__jbd2_journal_temp_unlink_buffer(jh);
+	/*
+	 * We set b_transaction here because b_next_transaction will inherit
+	 * our jh reference and thus __jbd2_journal_file_buffer() must not
+	 * take a new one.
+	 */
+	jh->b_transaction = jh->b_next_transaction;
+	jh->b_next_transaction = NULL;
+	if (buffer_freed(bh))
+		jlist = BJ_Forget;
+	else if (jh->b_modified)
+		jlist = BJ_Metadata;
+	else
+		jlist = BJ_Reserved;
+	__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
+	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+
+	if (was_dirty)
+		set_buffer_jbddirty(bh);
+}
+
+/*
+ * __jbd2_journal_refile_buffer() with necessary locking added. We take our
+ * bh reference so that we can safely unlock bh.
+ *
+ * The jh and bh may be freed by this call.
+ */
+void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	/* Get reference so that buffer cannot be freed before we unlock it */
+	get_bh(bh);
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_refile_buffer(jh);
+	jbd_unlock_bh_state(bh);
+	spin_unlock(&journal->j_list_lock);
+	__brelse(bh);
+}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal;
+
+	if (is_handle_aborted(handle))
+		return -EROFS;
+	journal = transaction->t_journal;
+
+	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+			transaction->t_tid);
+
+	/*
+	 * First check whether inode isn't already on the transaction's
+	 * lists without taking the lock. Note that this check is safe
+	 * without the lock as we cannot race with somebody removing inode
+	 * from the transaction. The reason is that we remove inode from the
+	 * transaction only in journal_release_jbd_inode() and when we commit
+	 * the transaction. We are guarded from the first case by holding
+	 * a reference to the inode. We are safe against the second case
+	 * because if jinode->i_transaction == transaction, commit code
+	 * cannot touch the transaction because we hold reference to it,
+	 * and if jinode->i_next_transaction == transaction, commit code
+	 * will only file the inode where we want it.
+	 */
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		return 0;
+
+	spin_lock(&journal->j_list_lock);
+
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		goto done;
+
+	/*
+	 * We only ever set this variable to 1 so the test is safe. Since
+	 * t_need_data_flush is likely to be set, we do the test to save some
+	 * cacheline bouncing
+	 */
+	if (!transaction->t_need_data_flush)
+		transaction->t_need_data_flush = 1;
+	/* On some different transaction's list - should be
+	 * the committing one */
+	if (jinode->i_transaction) {
+		J_ASSERT(jinode->i_next_transaction == NULL);
+		J_ASSERT(jinode->i_transaction ==
+					journal->j_committing_transaction);
+		jinode->i_next_transaction = transaction;
+		goto done;
+	}
+	/* Not on any transaction list... */
+	J_ASSERT(!jinode->i_next_transaction);
+	jinode->i_transaction = transaction;
+	list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+	spin_unlock(&journal->j_list_lock);
+
+	return 0;
+}
+
+/*
+ * File truncate and transaction commit interact with each other in a
+ * non-trivial way.  If a transaction writing data block A is
+ * committing, we cannot discard the data by truncate until we have
+ * written them.  Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A.  This function is a
+ * helper to solve this problem.  It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
+ */
+int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+					struct jbd2_inode *jinode,
+					loff_t new_size)
+{
+	transaction_t *inode_trans, *commit_trans;
+	int ret = 0;
+
+	/* This is a quick check to avoid locking if not necessary */
+	if (!jinode->i_transaction)
+		goto out;
+	/* Locks are here just to force reading of recent values, it is
+	 * enough that the transaction was not committing before we started
+	 * a transaction adding the inode to orphan list */
+	read_lock(&journal->j_state_lock);
+	commit_trans = journal->j_committing_transaction;
+	read_unlock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	inode_trans = jinode->i_transaction;
+	spin_unlock(&journal->j_list_lock);
+	if (inode_trans == commit_trans) {
+		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
+			new_size, LLONG_MAX);
+		if (ret)
+			jbd2_journal_abort(journal, ret);
+	}
+out:
+	return ret;
+}
diff --git a/kernel/fs/jffs2/Kconfig b/kernel/fs/jffs2/Kconfig
new file mode 100644
index 000000000..d8bb6c411
--- /dev/null
+++ b/kernel/fs/jffs2/Kconfig
@@ -0,0 +1,188 @@
+config JFFS2_FS
+	tristate "Journalling Flash File System v2 (JFFS2) support"
+	select CRC32
+	depends on MTD
+	help
+	  JFFS2 is the second generation of the Journalling Flash File System
+	  for use on diskless embedded devices. It provides improved wear
+	  levelling, compression and support for hard links. You cannot use
+	  this on normal block devices, only on 'MTD' devices.
+
+	  Further information on the design and implementation of JFFS2 is
+	  available at <http://sources.redhat.com/jffs2/>.
+
+config JFFS2_FS_DEBUG
+	int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
+	depends on JFFS2_FS
+	default "0"
+	help
+	  This controls the amount of debugging messages produced by the JFFS2
+	  code. Set it to zero for use in production systems. For evaluation,
+	  testing and debugging, it's advisable to set it to one. This will
+	  enable a few assertions and will print debugging messages at the
+	  KERN_DEBUG loglevel, where they won't normally be visible. Level 2
+	  is unlikely to be useful - it enables extra debugging in certain
+	  areas which at one point needed debugging, but when the bugs were
+	  located and fixed, the detailed messages were relegated to level 2.
+
+	  If reporting bugs, please try to have available a full dump of the
+	  messages at debug level 1 while the misbehaviour was occurring.
+
+config JFFS2_FS_WRITEBUFFER
+	bool "JFFS2 write-buffering support"
+	depends on JFFS2_FS
+	default y
+	help
+	  This enables the write-buffering support in JFFS2.
+
+	  This functionality is required to support JFFS2 on the following
+	  types of flash devices:
+	    - NAND flash
+	    - NOR flash with transparent ECC
+	    - DataFlash
+
+config JFFS2_FS_WBUF_VERIFY
+	bool "Verify JFFS2 write-buffer reads"
+	depends on JFFS2_FS_WRITEBUFFER
+	default n
+	help
+	  This causes JFFS2 to read back every page written through the
+	  write-buffer, and check for errors.
+
+config JFFS2_SUMMARY
+	bool "JFFS2 summary support"
+	depends on JFFS2_FS
+	default n
+	help
+	  This feature makes it possible to use summary information
+	  for faster filesystem mount.
+
+	  The summary information can be inserted into a filesystem image
+	  by the utility 'sumtool'.
+
+	  If unsure, say 'N'.
+
+config JFFS2_FS_XATTR
+	bool "JFFS2 XATTR support"
+	depends on JFFS2_FS
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config JFFS2_FS_POSIX_ACL
+	bool "JFFS2 POSIX Access Control Lists"
+	depends on JFFS2_FS_XATTR
+	default y
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config JFFS2_FS_SECURITY
+	bool "JFFS2 Security Labels"
+	depends on JFFS2_FS_XATTR
+	default y
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jffs2 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config JFFS2_COMPRESSION_OPTIONS
+	bool "Advanced compression options for JFFS2"
+	depends on JFFS2_FS
+	default n
+	help
+	  Enabling this option allows you to explicitly choose which
+	  compression modules, if any, are enabled in JFFS2. Removing
+	  compressors can mean you cannot read existing file systems,
+	  and enabling experimental compressors can mean that you
+	  write a file system which cannot be read by a standard kernel.
+
+	  If unsure, you should _definitely_ say 'N'.
+
+config JFFS2_ZLIB
+	bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	depends on JFFS2_FS
+	default y
+	help
+	  Zlib is designed to be a free, general-purpose, legally unencumbered,
+	  lossless data-compression library for use on virtually any computer
+	  hardware and operating system. See <http://www.gzip.org/zlib/> for
+	  further information.
+
+	  Say 'Y' if unsure.
+
+config JFFS2_LZO
+	bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	depends on JFFS2_FS
+	default n
+	help
+	  minilzo-based compression. Generally works better than Zlib.
+
+	  This feature was added in July, 2007. Say 'N' if you need
+	  compatibility with older bootloaders or kernels.
+
+config JFFS2_RTIME
+	bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
+	depends on JFFS2_FS
+	default y
+	help
+	  Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
+
+config JFFS2_RUBIN
+	bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
+	depends on JFFS2_FS
+	default n
+	help
+	  RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
+
+choice
+	prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
+	default JFFS2_CMODE_PRIORITY
+	depends on JFFS2_FS
+	help
+	  You can set here the default compression mode of JFFS2 from
+	  the available compression modes. Don't touch if unsure.
+
+config JFFS2_CMODE_NONE
+	bool "no compression"
+	help
+	  Uses no compression.
+
+config JFFS2_CMODE_PRIORITY
+	bool "priority"
+	help
+	  Tries the compressors in a predefined order and chooses the first
+	  successful one.
+
+config JFFS2_CMODE_SIZE
+	bool "size"
+	help
+	  Tries all compressors and chooses the one which has the smallest
+	  result.
+
+config JFFS2_CMODE_FAVOURLZO
+	bool "Favour LZO"
+	help
+	  Tries all compressors and chooses the one which has the smallest
+	  result but gives some preference to LZO (which has faster
+	  decompression) at the expense of size.
+
+endchoice
diff --git a/kernel/fs/jffs2/LICENCE b/kernel/fs/jffs2/LICENCE
new file mode 100644
index 000000000..562885908
--- /dev/null
+++ b/kernel/fs/jffs2/LICENCE
@@ -0,0 +1,30 @@
+The files in this directory and elsewhere which refer to this LICENCE
+file are part of JFFS2, the Journalling Flash File System v2.
+
+	Copyright © 2001-2007 Red Hat, Inc. and others
+
+JFFS2 is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 or (at your option) any later 
+version.
+
+JFFS2 is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License along
+with JFFS2; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+
+As a special exception, if other files instantiate templates or use
+macros or inline functions from these files, or you compile these
+files and link them with other works to produce a work based on these
+files, these files do not by themselves cause the resulting work to be
+covered by the GNU General Public License. However the source code for
+these files must still be made available in accordance with section (3)
+of the GNU General Public License.
+
+This exception does not invalidate any other reasons why a work based on
+this file might be covered by the GNU General Public License.
+
diff --git a/kernel/fs/jffs2/Makefile b/kernel/fs/jffs2/Makefile
new file mode 100644
index 000000000..60e5d49ca
--- /dev/null
+++ b/kernel/fs/jffs2/Makefile
@@ -0,0 +1,21 @@
+#
+# Makefile for the Linux Journalling Flash File System v2 (JFFS2)
+#
+#
+
+obj-$(CONFIG_JFFS2_FS) += jffs2.o
+
+jffs2-y	:= compr.o dir.o file.o ioctl.o nodelist.o malloc.o
+jffs2-y	+= read.o nodemgmt.o readinode.o write.o scan.o gc.o
+jffs2-y	+= symlink.o build.o erase.o background.o fs.o writev.o
+jffs2-y	+= super.o debug.o
+
+jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER)	+= wbuf.o
+jffs2-$(CONFIG_JFFS2_FS_XATTR)		+= xattr.o xattr_trusted.o xattr_user.o
+jffs2-$(CONFIG_JFFS2_FS_SECURITY)	+= security.o
+jffs2-$(CONFIG_JFFS2_FS_POSIX_ACL)	+= acl.o
+jffs2-$(CONFIG_JFFS2_RUBIN)	+= compr_rubin.o
+jffs2-$(CONFIG_JFFS2_RTIME)	+= compr_rtime.o
+jffs2-$(CONFIG_JFFS2_ZLIB)	+= compr_zlib.o
+jffs2-$(CONFIG_JFFS2_LZO)	+= compr_lzo.o
+jffs2-$(CONFIG_JFFS2_SUMMARY)   += summary.o
diff --git a/kernel/fs/jffs2/README.Locking b/kernel/fs/jffs2/README.Locking
new file mode 100644
index 000000000..3ea365541
--- /dev/null
+++ b/kernel/fs/jffs2/README.Locking
@@ -0,0 +1,172 @@
+
+	JFFS2 LOCKING DOCUMENTATION
+	---------------------------
+
+At least theoretically, JFFS2 does not require the Big Kernel Lock
+(BKL), which was always helpfully obtained for it by Linux 2.4 VFS
+code. It has its own locking, as described below.
+
+This document attempts to describe the existing locking rules for
+JFFS2. It is not expected to remain perfectly up to date, but ought to
+be fairly close.
+
+
+	alloc_sem
+	---------
+
+The alloc_sem is a per-filesystem mutex, used primarily to ensure
+contiguous allocation of space on the medium. It is automatically
+obtained during space allocations (jffs2_reserve_space()) and freed
+upon write completion (jffs2_complete_reservation()). Note that
+the garbage collector will obtain this right at the beginning of
+jffs2_garbage_collect_pass() and release it at the end, thereby
+preventing any other write activity on the file system during a
+garbage collect pass.
+
+When writing new nodes, the alloc_sem must be held until the new nodes
+have been properly linked into the data structures for the inode to
+which they belong. This is for the benefit of NAND flash - adding new
+nodes to an inode may obsolete old ones, and by holding the alloc_sem
+until this happens we ensure that any data in the write-buffer at the
+time this happens are part of the new node, not just something that
+was written afterwards. Hence, we can ensure the newly-obsoleted nodes
+don't actually get erased until the write-buffer has been flushed to
+the medium.
+
+With the introduction of NAND flash support and the write-buffer, 
+the alloc_sem is also used to protect the wbuf-related members of the
+jffs2_sb_info structure. Atomically reading the wbuf_len member to see
+if the wbuf is currently holding any data is permitted, though.
+
+Ordering constraints: See f->sem.
+
+
+	File Mutex f->sem
+	---------------------
+
+This is the JFFS2-internal equivalent of the inode mutex i->i_sem.
+It protects the contents of the jffs2_inode_info private inode data,
+including the linked list of node fragments (but see the notes below on
+erase_completion_lock), etc.
+
+The reason that the i_sem itself isn't used for this purpose is to
+avoid deadlocks with garbage collection -- the VFS will lock the i_sem
+before calling a function which may need to allocate space. The
+allocation may trigger garbage-collection, which may need to move a
+node belonging to the inode which was locked in the first place by the
+VFS. If the garbage collection code were to attempt to lock the i_sem
+of the inode from which it's garbage-collecting a physical node, this
+lead to deadlock, unless we played games with unlocking the i_sem
+before calling the space allocation functions.
+
+Instead of playing such games, we just have an extra internal
+mutex, which is obtained by the garbage collection code and also
+by the normal file system code _after_ allocation of space.
+
+Ordering constraints: 
+
+	1. Never attempt to allocate space or lock alloc_sem with 
+	   any f->sem held.
+	2. Never attempt to lock two file mutexes in one thread.
+	   No ordering rules have been made for doing so.
+
+
+	erase_completion_lock spinlock
+	------------------------------
+
+This is used to serialise access to the eraseblock lists, to the
+per-eraseblock lists of physical jffs2_raw_node_ref structures, and
+(NB) the per-inode list of physical nodes. The latter is a special
+case - see below.
+
+As the MTD API no longer permits erase-completion callback functions
+to be called from bottom-half (timer) context (on the basis that nobody
+ever actually implemented such a thing), it's now sufficient to use
+a simple spin_lock() rather than spin_lock_bh().
+
+Note that the per-inode list of physical nodes (f->nodes) is a special
+case. Any changes to _valid_ nodes (i.e. ->flash_offset & 1 == 0) in
+the list are protected by the file mutex f->sem. But the erase code
+may remove _obsolete_ nodes from the list while holding only the
+erase_completion_lock. So you can walk the list only while holding the
+erase_completion_lock, and can drop the lock temporarily mid-walk as
+long as the pointer you're holding is to a _valid_ node, not an
+obsolete one.
+
+The erase_completion_lock is also used to protect the c->gc_task
+pointer when the garbage collection thread exits. The code to kill the
+GC thread locks it, sends the signal, then unlocks it - while the GC
+thread itself locks it, zeroes c->gc_task, then unlocks on the exit path.
+
+
+	inocache_lock spinlock
+	----------------------
+
+This spinlock protects the hashed list (c->inocache_list) of the
+in-core jffs2_inode_cache objects (each inode in JFFS2 has the
+correspondent jffs2_inode_cache object). So, the inocache_lock
+has to be locked while walking the c->inocache_list hash buckets.
+
+This spinlock also covers allocation of new inode numbers, which is
+currently just '++->highest_ino++', but might one day get more complicated
+if we need to deal with wrapping after 4 milliard inode numbers are used.
+
+Note, the f->sem guarantees that the correspondent jffs2_inode_cache
+will not be removed. So, it is allowed to access it without locking
+the inocache_lock spinlock. 
+
+Ordering constraints: 
+
+	If both erase_completion_lock and inocache_lock are needed, the
+	c->erase_completion has to be acquired first.
+
+
+	erase_free_sem
+	--------------
+
+This mutex is only used by the erase code which frees obsolete node
+references and the jffs2_garbage_collect_deletion_dirent() function.
+The latter function on NAND flash must read _obsolete_ nodes to
+determine whether the 'deletion dirent' under consideration can be
+discarded or whether it is still required to show that an inode has
+been unlinked. Because reading from the flash may sleep, the
+erase_completion_lock cannot be held, so an alternative, more
+heavyweight lock was required to prevent the erase code from freeing
+the jffs2_raw_node_ref structures in question while the garbage
+collection code is looking at them.
+
+Suggestions for alternative solutions to this problem would be welcomed.
+
+
+	wbuf_sem
+	--------
+
+This read/write semaphore protects against concurrent access to the
+write-behind buffer ('wbuf') used for flash chips where we must write
+in blocks. It protects both the contents of the wbuf and the metadata
+which indicates which flash region (if any) is currently covered by 
+the buffer.
+
+Ordering constraints:
+	Lock wbuf_sem last, after the alloc_sem or and f->sem.
+
+
+	c->xattr_sem
+	------------
+
+This read/write semaphore protects against concurrent access to the
+xattr related objects which include stuff in superblock and ic->xref.
+In read-only path, write-semaphore is too much exclusion. It's enough
+by read-semaphore. But you must hold write-semaphore when updating,
+creating or deleting any xattr related object.
+
+Once xattr_sem released, there would be no assurance for the existence
+of those objects. Thus, a series of processes is often required to retry,
+when updating such a object is necessary under holding read semaphore.
+For example, do_jffs2_getxattr() holds read-semaphore to scan xref and
+xdatum at first. But it retries this process with holding write-semaphore
+after release read-semaphore, if it's necessary to load name/value pair
+from medium.
+
+Ordering constraints:
+	Lock xattr_sem last, after the alloc_sem.
diff --git a/kernel/fs/jffs2/TODO b/kernel/fs/jffs2/TODO
new file mode 100644
index 000000000..ca28964ab
--- /dev/null
+++ b/kernel/fs/jffs2/TODO
@@ -0,0 +1,37 @@
+
+ - support asynchronous operation -- add a per-fs 'reserved_space' count,
+   let each outstanding write reserve the _maximum_ amount of physical
+   space it could take. Let GC flush the outstanding writes because the
+   reservations will necessarily be pessimistic. With this we could even
+   do shared writable mmap, if we can have a fs hook for do_wp_page() to
+   make the reservation.
+ - disable compression in commit_write()?
+ - fine-tune the allocation / GC thresholds
+ - chattr support - turning on/off and tuning compression per-inode
+ - checkpointing (do we need this? scan is quite fast)
+ - make the scan code populate real inodes so read_inode just after 
+	mount doesn't have to read the flash twice for large files.
+	Make this a per-inode option, changeable with chattr, so you can
+	decide which inodes should be in-core immediately after mount.
+ - test, test, test
+
+ - NAND flash support:
+	- almost done :)
+	- use bad block check instead of the hardwired byte check
+
+ - Optimisations:
+   - Split writes so they go to two separate blocks rather than just c->nextblock.
+	By writing _new_ nodes to one block, and garbage-collected REF_PRISTINE
+	nodes to a different one, we can separate clean nodes from those which
+	are likely to become dirty, and end up with blocks which are each far
+	closer to 100% or 0% clean, hence speeding up later GC progress dramatically.
+   - Stop keeping name in-core with struct jffs2_full_dirent. If we keep the hash in 
+     the full dirent, we only need to go to the flash in lookup() when we think we've
+     got a match, and in readdir(). 
+   - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately?
+   - Remove size from jffs2_raw_node_frag. 
+
+dedekind:
+1. __jffs2_flush_wbuf() has a strange 'pad' parameter. Eliminate.
+2. get_sb()->build_fs()->scan() path... Why get_sb() removes scan()'s crap in
+   case of failure? scan() does not clean everything. Fix.
diff --git a/kernel/fs/jffs2/acl.c b/kernel/fs/jffs2/acl.c
new file mode 100644
index 000000000..2f7a3c090
--- /dev/null
+++ b/kernel/fs/jffs2/acl.c
@@ -0,0 +1,309 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static size_t jffs2_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(struct jffs2_acl_header)
+		       + count * sizeof(struct jffs2_acl_entry_short);
+	} else {
+		return sizeof(struct jffs2_acl_header)
+		       + 4 * sizeof(struct jffs2_acl_entry_short)
+		       + (count - 4) * sizeof(struct jffs2_acl_entry);
+	}
+}
+
+static int jffs2_acl_count(size_t size)
+{
+	size_t s;
+
+	size -= sizeof(struct jffs2_acl_header);
+	if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
+		if (size % sizeof(struct jffs2_acl_entry_short))
+			return -1;
+		return size / sizeof(struct jffs2_acl_entry_short);
+	} else {
+		s = size - 4 * sizeof(struct jffs2_acl_entry_short);
+		if (s % sizeof(struct jffs2_acl_entry))
+			return -1;
+		return s / sizeof(struct jffs2_acl_entry) + 4;
+	}
+}
+
+static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size)
+{
+	void *end = value + size;
+	struct jffs2_acl_header *header = value;
+	struct jffs2_acl_entry *entry;
+	struct posix_acl *acl;
+	uint32_t ver;
+	int i, count;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct jffs2_acl_header))
+		return ERR_PTR(-EINVAL);
+	ver = je32_to_cpu(header->a_version);
+	if (ver != JFFS2_ACL_VERSION) {
+		JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver);
+		return ERR_PTR(-EINVAL);
+	}
+
+	value += sizeof(struct jffs2_acl_header);
+	count = jffs2_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	for (i=0; i < count; i++) {
+		entry = value;
+		if (value + sizeof(struct jffs2_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag);
+		acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm);
+		switch (acl->a_entries[i].e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				value += sizeof(struct jffs2_acl_entry_short);
+				break;
+
+			case ACL_USER:
+				value += sizeof(struct jffs2_acl_entry);
+				if (value > end)
+					goto fail;
+				acl->a_entries[i].e_uid =
+					make_kuid(&init_user_ns,
+						  je32_to_cpu(entry->e_id));
+				break;
+			case ACL_GROUP:
+				value += sizeof(struct jffs2_acl_entry);
+				if (value > end)
+					goto fail;
+				acl->a_entries[i].e_gid =
+					make_kgid(&init_user_ns,
+						  je32_to_cpu(entry->e_id));
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+ fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
+{
+	struct jffs2_acl_header *header;
+	struct jffs2_acl_entry *entry;
+	void *e;
+	size_t i;
+
+	*size = jffs2_acl_size(acl->a_count);
+	header = kmalloc(sizeof(*header) + acl->a_count * sizeof(*entry), GFP_KERNEL);
+	if (!header)
+		return ERR_PTR(-ENOMEM);
+	header->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
+	e = header + 1;
+	for (i=0; i < acl->a_count; i++) {
+		const struct posix_acl_entry *acl_e = &acl->a_entries[i];
+		entry = e;
+		entry->e_tag = cpu_to_je16(acl_e->e_tag);
+		entry->e_perm = cpu_to_je16(acl_e->e_perm);
+		switch(acl_e->e_tag) {
+			case ACL_USER:
+				entry->e_id = cpu_to_je32(
+					from_kuid(&init_user_ns, acl_e->e_uid));
+				e += sizeof(struct jffs2_acl_entry);
+				break;
+			case ACL_GROUP:
+				entry->e_id = cpu_to_je32(
+					from_kgid(&init_user_ns, acl_e->e_gid));
+				e += sizeof(struct jffs2_acl_entry);
+				break;
+
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				e += sizeof(struct jffs2_acl_entry_short);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	return header;
+ fail:
+	kfree(header);
+	return ERR_PTR(-EINVAL);
+}
+
+struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	char *value = NULL;
+	int rc, xprefix;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
+	if (rc > 0) {
+		value = kmalloc(rc, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		rc = do_jffs2_getxattr(inode, xprefix, "", value, rc);
+	}
+	if (rc > 0) {
+		acl = jffs2_acl_from_medium(value, rc);
+	} else if (rc == -ENODATA || rc == -ENOSYS) {
+		acl = NULL;
+	} else {
+		acl = ERR_PTR(rc);
+	}
+	kfree(value);
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+	return acl;
+}
+
+static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *acl)
+{
+	char *value = NULL;
+	size_t size = 0;
+	int rc;
+
+	if (acl) {
+		value = jffs2_acl_to_medium(acl, &size);
+		if (IS_ERR(value))
+			return PTR_ERR(value);
+	}
+	rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0);
+	if (!value && rc == -ENODATA)
+		rc = 0;
+	kfree(value);
+
+	return rc;
+}
+
+int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int rc, xprefix;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+		if (acl) {
+			umode_t mode = inode->i_mode;
+			rc = posix_acl_equiv_mode(acl, &mode);
+			if (rc < 0)
+				return rc;
+			if (inode->i_mode != mode) {
+				struct iattr attr;
+
+				attr.ia_valid = ATTR_MODE | ATTR_CTIME;
+				attr.ia_mode = mode;
+				attr.ia_ctime = CURRENT_TIME_SEC;
+				rc = jffs2_do_setattr(inode, &attr);
+				if (rc < 0)
+					return rc;
+			}
+			if (rc == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+	rc = __jffs2_set_acl(inode, xprefix, acl);
+	if (!rc)
+		set_cached_acl(inode, type, acl);
+	return rc;
+}
+
+int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode)
+{
+	struct posix_acl *default_acl, *acl;
+	int rc;
+
+	cache_no_acl(inode);
+
+	rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl);
+	if (rc)
+		return rc;
+
+	if (default_acl) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+		posix_acl_release(acl);
+	}
+	return 0;
+}
+
+int jffs2_init_acl_post(struct inode *inode)
+{
+	int rc;
+
+	if (inode->i_default_acl) {
+		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, inode->i_default_acl);
+		if (rc)
+			return rc;
+	}
+
+	if (inode->i_acl) {
+		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, inode->i_acl);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/jffs2/acl.h b/kernel/fs/jffs2/acl.h
new file mode 100644
index 000000000..2e2b5745c
--- /dev/null
+++ b/kernel/fs/jffs2/acl.h
@@ -0,0 +1,41 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+struct jffs2_acl_entry {
+	jint16_t	e_tag;
+	jint16_t	e_perm;
+	jint32_t	e_id;
+};
+
+struct jffs2_acl_entry_short {
+	jint16_t	e_tag;
+	jint16_t	e_perm;
+};
+
+struct jffs2_acl_header {
+	jint32_t	a_version;
+};
+
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+
+struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
+int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
+extern int jffs2_init_acl_post(struct inode *);
+
+#else
+
+#define jffs2_get_acl				(NULL)
+#define jffs2_set_acl				(NULL)
+#define jffs2_init_acl_pre(dir_i,inode,mode)	(0)
+#define jffs2_init_acl_post(inode)		(0)
+
+#endif	/* CONFIG_JFFS2_FS_POSIX_ACL */
diff --git a/kernel/fs/jffs2/background.c b/kernel/fs/jffs2/background.c
new file mode 100644
index 000000000..bb9cebc9c
--- /dev/null
+++ b/kernel/fs/jffs2/background.c
@@ -0,0 +1,168 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/jffs2.h>
+#include <linux/mtd/mtd.h>
+#include <linux/completion.h>
+#include <linux/sched.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include "nodelist.h"
+
+
+static int jffs2_garbage_collect_thread(void *);
+
+void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
+{
+	assert_spin_locked(&c->erase_completion_lock);
+	if (c->gc_task && jffs2_thread_should_wake(c))
+		send_sig(SIGHUP, c->gc_task, 1);
+}
+
+/* This must only ever be called when no GC thread is currently running */
+int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
+{
+	struct task_struct *tsk;
+	int ret = 0;
+
+	BUG_ON(c->gc_task);
+
+	init_completion(&c->gc_thread_start);
+	init_completion(&c->gc_thread_exit);
+
+	tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index);
+	if (IS_ERR(tsk)) {
+		pr_warn("fork failed for JFFS2 garbage collect thread: %ld\n",
+			-PTR_ERR(tsk));
+		complete(&c->gc_thread_exit);
+		ret = PTR_ERR(tsk);
+	} else {
+		/* Wait for it... */
+		jffs2_dbg(1, "Garbage collect thread is pid %d\n", tsk->pid);
+		wait_for_completion(&c->gc_thread_start);
+		ret = tsk->pid;
+	}
+
+	return ret;
+}
+
+void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c)
+{
+	int wait = 0;
+	spin_lock(&c->erase_completion_lock);
+	if (c->gc_task) {
+		jffs2_dbg(1, "Killing GC task %d\n", c->gc_task->pid);
+		send_sig(SIGKILL, c->gc_task, 1);
+		wait = 1;
+	}
+	spin_unlock(&c->erase_completion_lock);
+	if (wait)
+		wait_for_completion(&c->gc_thread_exit);
+}
+
+static int jffs2_garbage_collect_thread(void *_c)
+{
+	struct jffs2_sb_info *c = _c;
+	sigset_t hupmask;
+
+	siginitset(&hupmask, sigmask(SIGHUP));
+	allow_signal(SIGKILL);
+	allow_signal(SIGSTOP);
+	allow_signal(SIGCONT);
+	allow_signal(SIGHUP);
+
+	c->gc_task = current;
+	complete(&c->gc_thread_start);
+
+	set_user_nice(current, 10);
+
+	set_freezable();
+	for (;;) {
+		sigprocmask(SIG_UNBLOCK, &hupmask, NULL);
+	again:
+		spin_lock(&c->erase_completion_lock);
+		if (!jffs2_thread_should_wake(c)) {
+			set_current_state (TASK_INTERRUPTIBLE);
+			spin_unlock(&c->erase_completion_lock);
+			jffs2_dbg(1, "%s(): sleeping...\n", __func__);
+			schedule();
+		} else {
+			spin_unlock(&c->erase_completion_lock);
+		}
+		/* Problem - immediately after bootup, the GCD spends a lot
+		 * of time in places like jffs2_kill_fragtree(); so much so
+		 * that userspace processes (like gdm and X) are starved
+		 * despite plenty of cond_resched()s and renicing.  Yield()
+		 * doesn't help, either (presumably because userspace and GCD
+		 * are generally competing for a higher latency resource -
+		 * disk).
+		 * This forces the GCD to slow the hell down.   Pulling an
+		 * inode in with read_inode() is much preferable to having
+		 * the GC thread get there first. */
+		schedule_timeout_interruptible(msecs_to_jiffies(50));
+
+		if (kthread_should_stop()) {
+			jffs2_dbg(1, "%s(): kthread_stop() called\n", __func__);
+			goto die;
+		}
+
+		/* Put_super will send a SIGKILL and then wait on the sem.
+		 */
+		while (signal_pending(current) || freezing(current)) {
+			siginfo_t info;
+			unsigned long signr;
+
+			if (try_to_freeze())
+				goto again;
+
+			signr = dequeue_signal_lock(current, &current->blocked, &info);
+
+			switch(signr) {
+			case SIGSTOP:
+				jffs2_dbg(1, "%s(): SIGSTOP received\n",
+					  __func__);
+				set_current_state(TASK_STOPPED);
+				schedule();
+				break;
+
+			case SIGKILL:
+				jffs2_dbg(1, "%s(): SIGKILL received\n",
+					  __func__);
+				goto die;
+
+			case SIGHUP:
+				jffs2_dbg(1, "%s(): SIGHUP received\n",
+					  __func__);
+				break;
+			default:
+				jffs2_dbg(1, "%s(): signal %ld received\n",
+					  __func__, signr);
+			}
+		}
+		/* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */
+		sigprocmask(SIG_BLOCK, &hupmask, NULL);
+
+		jffs2_dbg(1, "%s(): pass\n", __func__);
+		if (jffs2_garbage_collect_pass(c) == -ENOSPC) {
+			pr_notice("No space for garbage collection. Aborting GC thread\n");
+			goto die;
+		}
+	}
+ die:
+	spin_lock(&c->erase_completion_lock);
+	c->gc_task = NULL;
+	spin_unlock(&c->erase_completion_lock);
+	complete_and_exit(&c->gc_thread_exit, 0);
+}
diff --git a/kernel/fs/jffs2/build.c b/kernel/fs/jffs2/build.c
new file mode 100644
index 000000000..a3750f902
--- /dev/null
+++ b/kernel/fs/jffs2/build.c
@@ -0,0 +1,394 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
+		struct jffs2_inode_cache *, struct jffs2_full_dirent **);
+
+static inline struct jffs2_inode_cache *
+first_inode_chain(int *i, struct jffs2_sb_info *c)
+{
+	for (; *i < c->inocache_hashsize; (*i)++) {
+		if (c->inocache_list[*i])
+			return c->inocache_list[*i];
+	}
+	return NULL;
+}
+
+static inline struct jffs2_inode_cache *
+next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c)
+{
+	/* More in this chain? */
+	if (ic->next)
+		return ic->next;
+	(*i)++;
+	return first_inode_chain(i, c);
+}
+
+#define for_each_inode(i, c, ic)			\
+	for (i = 0, ic = first_inode_chain(&i, (c));	\
+	     ic;					\
+	     ic = next_inode(&i, ic, (c)))
+
+
+static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
+				    struct jffs2_inode_cache *ic)
+{
+	struct jffs2_full_dirent *fd;
+
+	dbg_fsbuild("building directory inode #%u\n", ic->ino);
+
+	/* For each child, increase nlink */
+	for(fd = ic->scan_dents; fd; fd = fd->next) {
+		struct jffs2_inode_cache *child_ic;
+		if (!fd->ino)
+			continue;
+
+		/* we can get high latency here with huge directories */
+
+		child_ic = jffs2_get_ino_cache(c, fd->ino);
+		if (!child_ic) {
+			dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n",
+				  fd->name, fd->ino, ic->ino);
+			jffs2_mark_node_obsolete(c, fd->raw);
+			continue;
+		}
+
+		if (fd->type == DT_DIR) {
+			if (child_ic->pino_nlink) {
+				JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n",
+					    fd->name, fd->ino, ic->ino);
+				/* TODO: What do we do about it? */
+			} else {
+				child_ic->pino_nlink = ic->ino;
+			}
+		} else
+			child_ic->pino_nlink++;
+
+		dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino);
+		/* Can't free scan_dents so far. We might need them in pass 2 */
+	}
+}
+
+/* Scan plan:
+ - Scan physical nodes. Build map of inodes/dirents. Allocate inocaches as we go
+ - Scan directory tree from top down, setting nlink in inocaches
+ - Scan inocaches for inodes with nlink==0
+*/
+static int jffs2_build_filesystem(struct jffs2_sb_info *c)
+{
+	int ret;
+	int i;
+	struct jffs2_inode_cache *ic;
+	struct jffs2_full_dirent *fd;
+	struct jffs2_full_dirent *dead_fds = NULL;
+
+	dbg_fsbuild("build FS data structures\n");
+
+	/* First, scan the medium and build all the inode caches with
+	   lists of physical nodes */
+
+	c->flags |= JFFS2_SB_FLAG_SCANNING;
+	ret = jffs2_scan_medium(c);
+	c->flags &= ~JFFS2_SB_FLAG_SCANNING;
+	if (ret)
+		goto exit;
+
+	dbg_fsbuild("scanned flash completely\n");
+	jffs2_dbg_dump_block_lists_nolock(c);
+
+	dbg_fsbuild("pass 1 starting\n");
+	c->flags |= JFFS2_SB_FLAG_BUILDING;
+	/* Now scan the directory tree, increasing nlink according to every dirent found. */
+	for_each_inode(i, c, ic) {
+		if (ic->scan_dents) {
+			jffs2_build_inode_pass1(c, ic);
+			cond_resched();
+		}
+	}
+
+	dbg_fsbuild("pass 1 complete\n");
+
+	/* Next, scan for inodes with nlink == 0 and remove them. If
+	   they were directories, then decrement the nlink of their
+	   children too, and repeat the scan. As that's going to be
+	   a fairly uncommon occurrence, it's not so evil to do it this
+	   way. Recursion bad. */
+	dbg_fsbuild("pass 2 starting\n");
+
+	for_each_inode(i, c, ic) {
+		if (ic->pino_nlink)
+			continue;
+
+		jffs2_build_remove_unlinked_inode(c, ic, &dead_fds);
+		cond_resched();
+	}
+
+	dbg_fsbuild("pass 2a starting\n");
+
+	while (dead_fds) {
+		fd = dead_fds;
+		dead_fds = fd->next;
+
+		ic = jffs2_get_ino_cache(c, fd->ino);
+
+		if (ic)
+			jffs2_build_remove_unlinked_inode(c, ic, &dead_fds);
+		jffs2_free_full_dirent(fd);
+	}
+
+	dbg_fsbuild("pass 2a complete\n");
+	dbg_fsbuild("freeing temporary data structures\n");
+
+	/* Finally, we can scan again and free the dirent structs */
+	for_each_inode(i, c, ic) {
+		while(ic->scan_dents) {
+			fd = ic->scan_dents;
+			ic->scan_dents = fd->next;
+			jffs2_free_full_dirent(fd);
+		}
+		ic->scan_dents = NULL;
+		cond_resched();
+	}
+	jffs2_build_xattr_subsystem(c);
+	c->flags &= ~JFFS2_SB_FLAG_BUILDING;
+
+	dbg_fsbuild("FS build complete\n");
+
+	/* Rotate the lists by some number to ensure wear levelling */
+	jffs2_rotate_lists(c);
+
+	ret = 0;
+
+exit:
+	if (ret) {
+		for_each_inode(i, c, ic) {
+			while(ic->scan_dents) {
+				fd = ic->scan_dents;
+				ic->scan_dents = fd->next;
+				jffs2_free_full_dirent(fd);
+			}
+		}
+		jffs2_clear_xattr_subsystem(c);
+	}
+
+	return ret;
+}
+
+static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c,
+					struct jffs2_inode_cache *ic,
+					struct jffs2_full_dirent **dead_fds)
+{
+	struct jffs2_raw_node_ref *raw;
+	struct jffs2_full_dirent *fd;
+
+	dbg_fsbuild("removing ino #%u with nlink == zero.\n", ic->ino);
+
+	raw = ic->nodes;
+	while (raw != (void *)ic) {
+		struct jffs2_raw_node_ref *next = raw->next_in_ino;
+		dbg_fsbuild("obsoleting node at 0x%08x\n", ref_offset(raw));
+		jffs2_mark_node_obsolete(c, raw);
+		raw = next;
+	}
+
+	if (ic->scan_dents) {
+		int whinged = 0;
+		dbg_fsbuild("inode #%u was a directory which may have children...\n", ic->ino);
+
+		while(ic->scan_dents) {
+			struct jffs2_inode_cache *child_ic;
+
+			fd = ic->scan_dents;
+			ic->scan_dents = fd->next;
+
+			if (!fd->ino) {
+				/* It's a deletion dirent. Ignore it */
+				dbg_fsbuild("child \"%s\" is a deletion dirent, skipping...\n", fd->name);
+				jffs2_free_full_dirent(fd);
+				continue;
+			}
+			if (!whinged)
+				whinged = 1;
+
+			dbg_fsbuild("removing child \"%s\", ino #%u\n", fd->name, fd->ino);
+
+			child_ic = jffs2_get_ino_cache(c, fd->ino);
+			if (!child_ic) {
+				dbg_fsbuild("cannot remove child \"%s\", ino #%u, because it doesn't exist\n",
+						fd->name, fd->ino);
+				jffs2_free_full_dirent(fd);
+				continue;
+			}
+
+			/* Reduce nlink of the child. If it's now zero, stick it on the
+			   dead_fds list to be cleaned up later. Else just free the fd */
+
+			if (fd->type == DT_DIR)
+				child_ic->pino_nlink = 0;
+			else
+				child_ic->pino_nlink--;
+
+			if (!child_ic->pino_nlink) {
+				dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n",
+					  fd->ino, fd->name);
+				fd->next = *dead_fds;
+				*dead_fds = fd;
+			} else {
+				dbg_fsbuild("inode #%u (\"%s\") has now got nlink %d. Ignoring.\n",
+					  fd->ino, fd->name, child_ic->pino_nlink);
+				jffs2_free_full_dirent(fd);
+			}
+		}
+	}
+
+	/*
+	   We don't delete the inocache from the hash list and free it yet.
+	   The erase code will do that, when all the nodes are completely gone.
+	*/
+}
+
+static void jffs2_calc_trigger_levels(struct jffs2_sb_info *c)
+{
+	uint32_t size;
+
+	/* Deletion should almost _always_ be allowed. We're fairly
+	   buggered once we stop allowing people to delete stuff
+	   because there's not enough free space... */
+	c->resv_blocks_deletion = 2;
+
+	/* Be conservative about how much space we need before we allow writes.
+	   On top of that which is required for deletia, require an extra 2%
+	   of the medium to be available, for overhead caused by nodes being
+	   split across blocks, etc. */
+
+	size = c->flash_size / 50; /* 2% of flash size */
+	size += c->nr_blocks * 100; /* And 100 bytes per eraseblock */
+	size += c->sector_size - 1; /* ... and round up */
+
+	c->resv_blocks_write = c->resv_blocks_deletion + (size / c->sector_size);
+
+	/* When do we let the GC thread run in the background */
+
+	c->resv_blocks_gctrigger = c->resv_blocks_write + 1;
+
+	/* When do we allow garbage collection to merge nodes to make
+	   long-term progress at the expense of short-term space exhaustion? */
+	c->resv_blocks_gcmerge = c->resv_blocks_deletion + 1;
+
+	/* When do we allow garbage collection to eat from bad blocks rather
+	   than actually making progress? */
+	c->resv_blocks_gcbad = 0;//c->resv_blocks_deletion + 2;
+
+	/* What number of 'very dirty' eraseblocks do we allow before we
+	   trigger the GC thread even if we don't _need_ the space. When we
+	   can't mark nodes obsolete on the medium, the old dirty nodes cause
+	   performance problems because we have to inspect and discard them. */
+	c->vdirty_blocks_gctrigger = c->resv_blocks_gctrigger;
+	if (jffs2_can_mark_obsolete(c))
+		c->vdirty_blocks_gctrigger *= 10;
+
+	/* If there's less than this amount of dirty space, don't bother
+	   trying to GC to make more space. It'll be a fruitless task */
+	c->nospc_dirty_size = c->sector_size + (c->flash_size / 100);
+
+	dbg_fsbuild("trigger levels (size %d KiB, block size %d KiB, %d blocks)\n",
+		    c->flash_size / 1024, c->sector_size / 1024, c->nr_blocks);
+	dbg_fsbuild("Blocks required to allow deletion:    %d (%d KiB)\n",
+		  c->resv_blocks_deletion, c->resv_blocks_deletion*c->sector_size/1024);
+	dbg_fsbuild("Blocks required to allow writes:      %d (%d KiB)\n",
+		  c->resv_blocks_write, c->resv_blocks_write*c->sector_size/1024);
+	dbg_fsbuild("Blocks required to quiesce GC thread: %d (%d KiB)\n",
+		  c->resv_blocks_gctrigger, c->resv_blocks_gctrigger*c->sector_size/1024);
+	dbg_fsbuild("Blocks required to allow GC merges:   %d (%d KiB)\n",
+		  c->resv_blocks_gcmerge, c->resv_blocks_gcmerge*c->sector_size/1024);
+	dbg_fsbuild("Blocks required to GC bad blocks:     %d (%d KiB)\n",
+		  c->resv_blocks_gcbad, c->resv_blocks_gcbad*c->sector_size/1024);
+	dbg_fsbuild("Amount of dirty space required to GC: %d bytes\n",
+		  c->nospc_dirty_size);
+	dbg_fsbuild("Very dirty blocks before GC triggered: %d\n",
+		  c->vdirty_blocks_gctrigger);
+}
+
+int jffs2_do_mount_fs(struct jffs2_sb_info *c)
+{
+	int ret;
+	int i;
+	int size;
+
+	c->free_size = c->flash_size;
+	c->nr_blocks = c->flash_size / c->sector_size;
+	size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
+#ifndef __ECOS
+	if (jffs2_blocks_use_vmalloc(c))
+		c->blocks = vzalloc(size);
+	else
+#endif
+		c->blocks = kzalloc(size, GFP_KERNEL);
+	if (!c->blocks)
+		return -ENOMEM;
+
+	for (i=0; i<c->nr_blocks; i++) {
+		INIT_LIST_HEAD(&c->blocks[i].list);
+		c->blocks[i].offset = i * c->sector_size;
+		c->blocks[i].free_size = c->sector_size;
+	}
+
+	INIT_LIST_HEAD(&c->clean_list);
+	INIT_LIST_HEAD(&c->very_dirty_list);
+	INIT_LIST_HEAD(&c->dirty_list);
+	INIT_LIST_HEAD(&c->erasable_list);
+	INIT_LIST_HEAD(&c->erasing_list);
+	INIT_LIST_HEAD(&c->erase_checking_list);
+	INIT_LIST_HEAD(&c->erase_pending_list);
+	INIT_LIST_HEAD(&c->erasable_pending_wbuf_list);
+	INIT_LIST_HEAD(&c->erase_complete_list);
+	INIT_LIST_HEAD(&c->free_list);
+	INIT_LIST_HEAD(&c->bad_list);
+	INIT_LIST_HEAD(&c->bad_used_list);
+	c->highest_ino = 1;
+	c->summary = NULL;
+
+	ret = jffs2_sum_init(c);
+	if (ret)
+		goto out_free;
+
+	if (jffs2_build_filesystem(c)) {
+		dbg_fsbuild("build_fs failed\n");
+		jffs2_free_ino_caches(c);
+		jffs2_free_raw_node_refs(c);
+		ret = -EIO;
+		goto out_free;
+	}
+
+	jffs2_calc_trigger_levels(c);
+
+	return 0;
+
+ out_free:
+#ifndef __ECOS
+	if (jffs2_blocks_use_vmalloc(c))
+		vfree(c->blocks);
+	else
+#endif
+		kfree(c->blocks);
+
+	return ret;
+}
diff --git a/kernel/fs/jffs2/compr.c b/kernel/fs/jffs2/compr.c
new file mode 100644
index 000000000..4849a4c9a
--- /dev/null
+++ b/kernel/fs/jffs2/compr.c
@@ -0,0 +1,418 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ *		    University of Szeged, Hungary
+ *
+ * Created by Arjan van de Ven <arjan@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "compr.h"
+
+static DEFINE_SPINLOCK(jffs2_compressor_list_lock);
+
+/* Available compressors are on this list */
+static LIST_HEAD(jffs2_compressor_list);
+
+/* Actual compression mode */
+static int jffs2_compression_mode = JFFS2_COMPR_MODE_PRIORITY;
+
+/* Statistics for blocks stored without compression */
+static uint32_t none_stat_compr_blocks=0,none_stat_decompr_blocks=0,none_stat_compr_size=0;
+
+
+/*
+ * Return 1 to use this compression
+ */
+static int jffs2_is_best_compression(struct jffs2_compressor *this,
+		struct jffs2_compressor *best, uint32_t size, uint32_t bestsize)
+{
+	switch (jffs2_compression_mode) {
+	case JFFS2_COMPR_MODE_SIZE:
+		if (bestsize > size)
+			return 1;
+		return 0;
+	case JFFS2_COMPR_MODE_FAVOURLZO:
+		if ((this->compr == JFFS2_COMPR_LZO) && (bestsize > size))
+			return 1;
+		if ((best->compr != JFFS2_COMPR_LZO) && (bestsize > size))
+			return 1;
+		if ((this->compr == JFFS2_COMPR_LZO) && (bestsize > (size * FAVOUR_LZO_PERCENT / 100)))
+			return 1;
+		if ((bestsize * FAVOUR_LZO_PERCENT / 100) > size)
+			return 1;
+
+		return 0;
+	}
+	/* Shouldn't happen */
+	return 0;
+}
+
+/*
+ * jffs2_selected_compress:
+ * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB).
+ *	If 0, just take the first available compression mode.
+ * @data_in: Pointer to uncompressed data
+ * @cpage_out: Pointer to returned pointer to buffer for compressed data
+ * @datalen: On entry, holds the amount of data available for compression.
+ *	On exit, expected to hold the amount of data actually compressed.
+ * @cdatalen: On entry, holds the amount of space available for compressed
+ *	data. On exit, expected to hold the actual size of the compressed
+ *	data.
+ *
+ * Returns: the compression type used.  Zero is used to show that the data
+ * could not be compressed; probably because we couldn't find the requested
+ * compression mode.
+ */
+static int jffs2_selected_compress(u8 compr, unsigned char *data_in,
+		unsigned char **cpage_out, u32 *datalen, u32 *cdatalen)
+{
+	struct jffs2_compressor *this;
+	int err, ret = JFFS2_COMPR_NONE;
+	uint32_t orig_slen, orig_dlen;
+	char *output_buf;
+
+	output_buf = kmalloc(*cdatalen, GFP_KERNEL);
+	if (!output_buf) {
+		pr_warn("No memory for compressor allocation. Compression failed.\n");
+		return ret;
+	}
+	orig_slen = *datalen;
+	orig_dlen = *cdatalen;
+	spin_lock(&jffs2_compressor_list_lock);
+	list_for_each_entry(this, &jffs2_compressor_list, list) {
+		/* Skip decompress-only and disabled modules */
+		if (!this->compress || this->disabled)
+			continue;
+
+		/* Skip if not the desired compression type */
+		if (compr && (compr != this->compr))
+			continue;
+
+		/*
+		 * Either compression type was unspecified, or we found our
+		 * compressor; either way, we're good to go.
+		 */
+		this->usecount++;
+		spin_unlock(&jffs2_compressor_list_lock);
+
+		*datalen  = orig_slen;
+		*cdatalen = orig_dlen;
+		err = this->compress(data_in, output_buf, datalen, cdatalen);
+
+		spin_lock(&jffs2_compressor_list_lock);
+		this->usecount--;
+		if (!err) {
+			/* Success */
+			ret = this->compr;
+			this->stat_compr_blocks++;
+			this->stat_compr_orig_size += *datalen;
+			this->stat_compr_new_size += *cdatalen;
+			break;
+		}
+	}
+	spin_unlock(&jffs2_compressor_list_lock);
+	if (ret == JFFS2_COMPR_NONE)
+		kfree(output_buf);
+	else
+		*cpage_out = output_buf;
+
+	return ret;
+}
+
+/* jffs2_compress:
+ * @data_in: Pointer to uncompressed data
+ * @cpage_out: Pointer to returned pointer to buffer for compressed data
+ * @datalen: On entry, holds the amount of data available for compression.
+ *	On exit, expected to hold the amount of data actually compressed.
+ * @cdatalen: On entry, holds the amount of space available for compressed
+ *	data. On exit, expected to hold the actual size of the compressed
+ *	data.
+ *
+ * Returns: Lower byte to be stored with data indicating compression type used.
+ * Zero is used to show that the data could not be compressed - the
+ * compressed version was actually larger than the original.
+ * Upper byte will be used later. (soon)
+ *
+ * If the cdata buffer isn't large enough to hold all the uncompressed data,
+ * jffs2_compress should compress as much as will fit, and should set
+ * *datalen accordingly to show the amount of data which were compressed.
+ */
+uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+			unsigned char *data_in, unsigned char **cpage_out,
+			uint32_t *datalen, uint32_t *cdatalen)
+{
+	int ret = JFFS2_COMPR_NONE;
+	int mode, compr_ret;
+	struct jffs2_compressor *this, *best=NULL;
+	unsigned char *output_buf = NULL, *tmp_buf;
+	uint32_t orig_slen, orig_dlen;
+	uint32_t best_slen=0, best_dlen=0;
+
+	if (c->mount_opts.override_compr)
+		mode = c->mount_opts.compr;
+	else
+		mode = jffs2_compression_mode;
+
+	switch (mode) {
+	case JFFS2_COMPR_MODE_NONE:
+		break;
+	case JFFS2_COMPR_MODE_PRIORITY:
+		ret = jffs2_selected_compress(0, data_in, cpage_out, datalen,
+				cdatalen);
+		break;
+	case JFFS2_COMPR_MODE_SIZE:
+	case JFFS2_COMPR_MODE_FAVOURLZO:
+		orig_slen = *datalen;
+		orig_dlen = *cdatalen;
+		spin_lock(&jffs2_compressor_list_lock);
+		list_for_each_entry(this, &jffs2_compressor_list, list) {
+			/* Skip decompress-only backwards-compatibility and disabled modules */
+			if ((!this->compress)||(this->disabled))
+				continue;
+			/* Allocating memory for output buffer if necessary */
+			if ((this->compr_buf_size < orig_slen) && (this->compr_buf)) {
+				spin_unlock(&jffs2_compressor_list_lock);
+				kfree(this->compr_buf);
+				spin_lock(&jffs2_compressor_list_lock);
+				this->compr_buf_size=0;
+				this->compr_buf=NULL;
+			}
+			if (!this->compr_buf) {
+				spin_unlock(&jffs2_compressor_list_lock);
+				tmp_buf = kmalloc(orig_slen, GFP_KERNEL);
+				spin_lock(&jffs2_compressor_list_lock);
+				if (!tmp_buf) {
+					pr_warn("No memory for compressor allocation. (%d bytes)\n",
+						orig_slen);
+					continue;
+				}
+				else {
+					this->compr_buf = tmp_buf;
+					this->compr_buf_size = orig_slen;
+				}
+			}
+			this->usecount++;
+			spin_unlock(&jffs2_compressor_list_lock);
+			*datalen  = orig_slen;
+			*cdatalen = orig_dlen;
+			compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen);
+			spin_lock(&jffs2_compressor_list_lock);
+			this->usecount--;
+			if (!compr_ret) {
+				if (((!best_dlen) || jffs2_is_best_compression(this, best, *cdatalen, best_dlen))
+						&& (*cdatalen < *datalen)) {
+					best_dlen = *cdatalen;
+					best_slen = *datalen;
+					best = this;
+				}
+			}
+		}
+		if (best_dlen) {
+			*cdatalen = best_dlen;
+			*datalen  = best_slen;
+			output_buf = best->compr_buf;
+			best->compr_buf = NULL;
+			best->compr_buf_size = 0;
+			best->stat_compr_blocks++;
+			best->stat_compr_orig_size += best_slen;
+			best->stat_compr_new_size  += best_dlen;
+			ret = best->compr;
+			*cpage_out = output_buf;
+		}
+		spin_unlock(&jffs2_compressor_list_lock);
+		break;
+	case JFFS2_COMPR_MODE_FORCELZO:
+		ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in,
+				cpage_out, datalen, cdatalen);
+		break;
+	case JFFS2_COMPR_MODE_FORCEZLIB:
+		ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
+				cpage_out, datalen, cdatalen);
+		break;
+	default:
+		pr_err("unknown compression mode\n");
+	}
+
+	if (ret == JFFS2_COMPR_NONE) {
+		*cpage_out = data_in;
+		*datalen = *cdatalen;
+		none_stat_compr_blocks++;
+		none_stat_compr_size += *datalen;
+	}
+	return ret;
+}
+
+int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+		     uint16_t comprtype, unsigned char *cdata_in,
+		     unsigned char *data_out, uint32_t cdatalen, uint32_t datalen)
+{
+	struct jffs2_compressor *this;
+	int ret;
+
+	/* Older code had a bug where it would write non-zero 'usercompr'
+	   fields. Deal with it. */
+	if ((comprtype & 0xff) <= JFFS2_COMPR_ZLIB)
+		comprtype &= 0xff;
+
+	switch (comprtype & 0xff) {
+	case JFFS2_COMPR_NONE:
+		/* This should be special-cased elsewhere, but we might as well deal with it */
+		memcpy(data_out, cdata_in, datalen);
+		none_stat_decompr_blocks++;
+		break;
+	case JFFS2_COMPR_ZERO:
+		memset(data_out, 0, datalen);
+		break;
+	default:
+		spin_lock(&jffs2_compressor_list_lock);
+		list_for_each_entry(this, &jffs2_compressor_list, list) {
+			if (comprtype == this->compr) {
+				this->usecount++;
+				spin_unlock(&jffs2_compressor_list_lock);
+				ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
+				spin_lock(&jffs2_compressor_list_lock);
+				if (ret) {
+					pr_warn("Decompressor \"%s\" returned %d\n",
+						this->name, ret);
+				}
+				else {
+					this->stat_decompr_blocks++;
+				}
+				this->usecount--;
+				spin_unlock(&jffs2_compressor_list_lock);
+				return ret;
+			}
+		}
+		pr_warn("compression type 0x%02x not available\n", comprtype);
+		spin_unlock(&jffs2_compressor_list_lock);
+		return -EIO;
+	}
+	return 0;
+}
+
+int jffs2_register_compressor(struct jffs2_compressor *comp)
+{
+	struct jffs2_compressor *this;
+
+	if (!comp->name) {
+		pr_warn("NULL compressor name at registering JFFS2 compressor. Failed.\n");
+		return -1;
+	}
+	comp->compr_buf_size=0;
+	comp->compr_buf=NULL;
+	comp->usecount=0;
+	comp->stat_compr_orig_size=0;
+	comp->stat_compr_new_size=0;
+	comp->stat_compr_blocks=0;
+	comp->stat_decompr_blocks=0;
+	jffs2_dbg(1, "Registering JFFS2 compressor \"%s\"\n", comp->name);
+
+	spin_lock(&jffs2_compressor_list_lock);
+
+	list_for_each_entry(this, &jffs2_compressor_list, list) {
+		if (this->priority < comp->priority) {
+			list_add(&comp->list, this->list.prev);
+			goto out;
+		}
+	}
+	list_add_tail(&comp->list, &jffs2_compressor_list);
+out:
+	D2(list_for_each_entry(this, &jffs2_compressor_list, list) {
+		printk(KERN_DEBUG "Compressor \"%s\", prio %d\n", this->name, this->priority);
+	})
+
+	spin_unlock(&jffs2_compressor_list_lock);
+
+	return 0;
+}
+
+int jffs2_unregister_compressor(struct jffs2_compressor *comp)
+{
+	D2(struct jffs2_compressor *this);
+
+	jffs2_dbg(1, "Unregistering JFFS2 compressor \"%s\"\n", comp->name);
+
+	spin_lock(&jffs2_compressor_list_lock);
+
+	if (comp->usecount) {
+		spin_unlock(&jffs2_compressor_list_lock);
+		pr_warn("Compressor module is in use. Unregister failed.\n");
+		return -1;
+	}
+	list_del(&comp->list);
+
+	D2(list_for_each_entry(this, &jffs2_compressor_list, list) {
+		printk(KERN_DEBUG "Compressor \"%s\", prio %d\n", this->name, this->priority);
+	})
+	spin_unlock(&jffs2_compressor_list_lock);
+	return 0;
+}
+
+void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig)
+{
+	if (orig != comprbuf)
+		kfree(comprbuf);
+}
+
+int __init jffs2_compressors_init(void)
+{
+/* Registering compressors */
+#ifdef CONFIG_JFFS2_ZLIB
+	jffs2_zlib_init();
+#endif
+#ifdef CONFIG_JFFS2_RTIME
+	jffs2_rtime_init();
+#endif
+#ifdef CONFIG_JFFS2_RUBIN
+	jffs2_rubinmips_init();
+	jffs2_dynrubin_init();
+#endif
+#ifdef CONFIG_JFFS2_LZO
+	jffs2_lzo_init();
+#endif
+/* Setting default compression mode */
+#ifdef CONFIG_JFFS2_CMODE_NONE
+	jffs2_compression_mode = JFFS2_COMPR_MODE_NONE;
+	jffs2_dbg(1, "default compression mode: none\n");
+#else
+#ifdef CONFIG_JFFS2_CMODE_SIZE
+	jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE;
+	jffs2_dbg(1, "default compression mode: size\n");
+#else
+#ifdef CONFIG_JFFS2_CMODE_FAVOURLZO
+	jffs2_compression_mode = JFFS2_COMPR_MODE_FAVOURLZO;
+	jffs2_dbg(1, "default compression mode: favourlzo\n");
+#else
+	jffs2_dbg(1, "default compression mode: priority\n");
+#endif
+#endif
+#endif
+	return 0;
+}
+
+int jffs2_compressors_exit(void)
+{
+/* Unregistering compressors */
+#ifdef CONFIG_JFFS2_LZO
+	jffs2_lzo_exit();
+#endif
+#ifdef CONFIG_JFFS2_RUBIN
+	jffs2_dynrubin_exit();
+	jffs2_rubinmips_exit();
+#endif
+#ifdef CONFIG_JFFS2_RTIME
+	jffs2_rtime_exit();
+#endif
+#ifdef CONFIG_JFFS2_ZLIB
+	jffs2_zlib_exit();
+#endif
+	return 0;
+}
diff --git a/kernel/fs/jffs2/compr.h b/kernel/fs/jffs2/compr.h
new file mode 100644
index 000000000..5e91d578f
--- /dev/null
+++ b/kernel/fs/jffs2/compr.h
@@ -0,0 +1,105 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2004   Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ *		      University of Szeged, Hungary
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#ifndef __JFFS2_COMPR_H__
+#define __JFFS2_COMPR_H__
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include "jffs2_fs_i.h"
+#include "jffs2_fs_sb.h"
+#include "nodelist.h"
+
+#define JFFS2_RUBINMIPS_PRIORITY 10
+#define JFFS2_DYNRUBIN_PRIORITY  20
+#define JFFS2_LZARI_PRIORITY     30
+#define JFFS2_RTIME_PRIORITY     50
+#define JFFS2_ZLIB_PRIORITY      60
+#define JFFS2_LZO_PRIORITY       80
+
+
+#define JFFS2_RUBINMIPS_DISABLED /* RUBINs will be used only */
+#define JFFS2_DYNRUBIN_DISABLED  /*	   for decompression */
+
+#define JFFS2_COMPR_MODE_NONE       0
+#define JFFS2_COMPR_MODE_PRIORITY   1
+#define JFFS2_COMPR_MODE_SIZE       2
+#define JFFS2_COMPR_MODE_FAVOURLZO  3
+#define JFFS2_COMPR_MODE_FORCELZO   4
+#define JFFS2_COMPR_MODE_FORCEZLIB  5
+
+#define FAVOUR_LZO_PERCENT 80
+
+struct jffs2_compressor {
+	struct list_head list;
+	int priority;			/* used by prirority comr. mode */
+	char *name;
+	char compr;			/* JFFS2_COMPR_XXX */
+	int (*compress)(unsigned char *data_in, unsigned char *cpage_out,
+			uint32_t *srclen, uint32_t *destlen);
+	int (*decompress)(unsigned char *cdata_in, unsigned char *data_out,
+			  uint32_t cdatalen, uint32_t datalen);
+	int usecount;
+	int disabled;		/* if set the compressor won't compress */
+	unsigned char *compr_buf;	/* used by size compr. mode */
+	uint32_t compr_buf_size;	/* used by size compr. mode */
+	uint32_t stat_compr_orig_size;
+	uint32_t stat_compr_new_size;
+	uint32_t stat_compr_blocks;
+	uint32_t stat_decompr_blocks;
+};
+
+int jffs2_register_compressor(struct jffs2_compressor *comp);
+int jffs2_unregister_compressor(struct jffs2_compressor *comp);
+
+int jffs2_compressors_init(void);
+int jffs2_compressors_exit(void);
+
+uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+			unsigned char *data_in, unsigned char **cpage_out,
+			uint32_t *datalen, uint32_t *cdatalen);
+
+int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+		     uint16_t comprtype, unsigned char *cdata_in,
+		     unsigned char *data_out, uint32_t cdatalen, uint32_t datalen);
+
+void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig);
+
+/* Compressor modules */
+/* These functions will be called by jffs2_compressors_init/exit */
+
+#ifdef CONFIG_JFFS2_RUBIN
+int jffs2_rubinmips_init(void);
+void jffs2_rubinmips_exit(void);
+int jffs2_dynrubin_init(void);
+void jffs2_dynrubin_exit(void);
+#endif
+#ifdef CONFIG_JFFS2_RTIME
+int jffs2_rtime_init(void);
+void jffs2_rtime_exit(void);
+#endif
+#ifdef CONFIG_JFFS2_ZLIB
+int jffs2_zlib_init(void);
+void jffs2_zlib_exit(void);
+#endif
+#ifdef CONFIG_JFFS2_LZO
+int jffs2_lzo_init(void);
+void jffs2_lzo_exit(void);
+#endif
+
+#endif /* __JFFS2_COMPR_H__ */
diff --git a/kernel/fs/jffs2/compr_lzo.c b/kernel/fs/jffs2/compr_lzo.c
new file mode 100644
index 000000000..c553bd650
--- /dev/null
+++ b/kernel/fs/jffs2/compr_lzo.c
@@ -0,0 +1,110 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2007 Nokia Corporation. All rights reserved.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by Richard Purdie <rpurdie@openedhand.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/lzo.h>
+#include "compr.h"
+
+static void *lzo_mem;
+static void *lzo_compress_buf;
+static DEFINE_MUTEX(deflate_mutex);	/* for lzo_mem and lzo_compress_buf */
+
+static void free_workspace(void)
+{
+	vfree(lzo_mem);
+	vfree(lzo_compress_buf);
+}
+
+static int __init alloc_workspace(void)
+{
+	lzo_mem = vmalloc(LZO1X_MEM_COMPRESS);
+	lzo_compress_buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
+
+	if (!lzo_mem || !lzo_compress_buf) {
+		free_workspace();
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
+			      uint32_t *sourcelen, uint32_t *dstlen)
+{
+	size_t compress_size;
+	int ret;
+
+	mutex_lock(&deflate_mutex);
+	ret = lzo1x_1_compress(data_in, *sourcelen, lzo_compress_buf, &compress_size, lzo_mem);
+	if (ret != LZO_E_OK)
+		goto fail;
+
+	if (compress_size > *dstlen)
+		goto fail;
+
+	memcpy(cpage_out, lzo_compress_buf, compress_size);
+	mutex_unlock(&deflate_mutex);
+
+	*dstlen = compress_size;
+	return 0;
+
+ fail:
+	mutex_unlock(&deflate_mutex);
+	return -1;
+}
+
+static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
+				 uint32_t srclen, uint32_t destlen)
+{
+	size_t dl = destlen;
+	int ret;
+
+	ret = lzo1x_decompress_safe(data_in, srclen, cpage_out, &dl);
+
+	if (ret != LZO_E_OK || dl != destlen)
+		return -1;
+
+	return 0;
+}
+
+static struct jffs2_compressor jffs2_lzo_comp = {
+	.priority = JFFS2_LZO_PRIORITY,
+	.name = "lzo",
+	.compr = JFFS2_COMPR_LZO,
+	.compress = &jffs2_lzo_compress,
+	.decompress = &jffs2_lzo_decompress,
+	.disabled = 0,
+};
+
+int __init jffs2_lzo_init(void)
+{
+	int ret;
+
+	ret = alloc_workspace();
+	if (ret < 0)
+		return ret;
+
+	ret = jffs2_register_compressor(&jffs2_lzo_comp);
+	if (ret)
+		free_workspace();
+
+	return ret;
+}
+
+void jffs2_lzo_exit(void)
+{
+	jffs2_unregister_compressor(&jffs2_lzo_comp);
+	free_workspace();
+}
diff --git a/kernel/fs/jffs2/compr_rtime.c b/kernel/fs/jffs2/compr_rtime.c
new file mode 100644
index 000000000..406d9cc84
--- /dev/null
+++ b/kernel/fs/jffs2/compr_rtime.c
@@ -0,0 +1,130 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by Arjan van de Ven <arjanv@redhat.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ *
+ *
+ * Very simple lz77-ish encoder.
+ *
+ * Theory of operation: Both encoder and decoder have a list of "last
+ * occurrences" for every possible source-value; after sending the
+ * first source-byte, the second byte indicated the "run" length of
+ * matches
+ *
+ * The algorithm is intended to only send "whole bytes", no bit-messing.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/jffs2.h>
+#include "compr.h"
+
+/* _compress returns the compressed size, -1 if bigger */
+static int jffs2_rtime_compress(unsigned char *data_in,
+				unsigned char *cpage_out,
+				uint32_t *sourcelen, uint32_t *dstlen)
+{
+	unsigned short positions[256];
+	int outpos = 0;
+	int pos=0;
+
+	memset(positions,0,sizeof(positions));
+
+	while (pos < (*sourcelen) && outpos <= (*dstlen)-2) {
+		int backpos, runlen=0;
+		unsigned char value;
+
+		value = data_in[pos];
+
+		cpage_out[outpos++] = data_in[pos++];
+
+		backpos = positions[value];
+		positions[value]=pos;
+
+		while ((backpos < pos) && (pos < (*sourcelen)) &&
+		       (data_in[pos]==data_in[backpos++]) && (runlen<255)) {
+			pos++;
+			runlen++;
+		}
+		cpage_out[outpos++] = runlen;
+	}
+
+	if (outpos >= pos) {
+		/* We failed */
+		return -1;
+	}
+
+	/* Tell the caller how much we managed to compress, and how much space it took */
+	*sourcelen = pos;
+	*dstlen = outpos;
+	return 0;
+}
+
+
+static int jffs2_rtime_decompress(unsigned char *data_in,
+				  unsigned char *cpage_out,
+				  uint32_t srclen, uint32_t destlen)
+{
+	unsigned short positions[256];
+	int outpos = 0;
+	int pos=0;
+
+	memset(positions,0,sizeof(positions));
+
+	while (outpos<destlen) {
+		unsigned char value;
+		int backoffs;
+		int repeat;
+
+		value = data_in[pos++];
+		cpage_out[outpos++] = value; /* first the verbatim copied byte */
+		repeat = data_in[pos++];
+		backoffs = positions[value];
+
+		positions[value]=outpos;
+		if (repeat) {
+			if (backoffs + repeat >= outpos) {
+				while(repeat) {
+					cpage_out[outpos++] = cpage_out[backoffs++];
+					repeat--;
+				}
+			} else {
+				memcpy(&cpage_out[outpos],&cpage_out[backoffs],repeat);
+				outpos+=repeat;
+			}
+		}
+	}
+	return 0;
+}
+
+static struct jffs2_compressor jffs2_rtime_comp = {
+    .priority = JFFS2_RTIME_PRIORITY,
+    .name = "rtime",
+    .compr = JFFS2_COMPR_RTIME,
+    .compress = &jffs2_rtime_compress,
+    .decompress = &jffs2_rtime_decompress,
+#ifdef JFFS2_RTIME_DISABLED
+    .disabled = 1,
+#else
+    .disabled = 0,
+#endif
+};
+
+int jffs2_rtime_init(void)
+{
+    return jffs2_register_compressor(&jffs2_rtime_comp);
+}
+
+void jffs2_rtime_exit(void)
+{
+    jffs2_unregister_compressor(&jffs2_rtime_comp);
+}
diff --git a/kernel/fs/jffs2/compr_rubin.c b/kernel/fs/jffs2/compr_rubin.c
new file mode 100644
index 000000000..556de100e
--- /dev/null
+++ b/kernel/fs/jffs2/compr_rubin.c
@@ -0,0 +1,452 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by Arjan van de Ven <arjanv@redhat.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/jffs2.h>
+#include <linux/errno.h>
+#include "compr.h"
+
+
+#define RUBIN_REG_SIZE   16
+#define UPPER_BIT_RUBIN    (((long) 1)<<(RUBIN_REG_SIZE-1))
+#define LOWER_BITS_RUBIN   ((((long) 1)<<(RUBIN_REG_SIZE-1))-1)
+
+
+#define BIT_DIVIDER_MIPS 1043
+static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241};
+
+struct pushpull {
+	unsigned char *buf;
+	unsigned int buflen;
+	unsigned int ofs;
+	unsigned int reserve;
+};
+
+struct rubin_state {
+	unsigned long p;
+	unsigned long q;
+	unsigned long rec_q;
+	long bit_number;
+	struct pushpull pp;
+	int bit_divider;
+	int bits[8];
+};
+
+static inline void init_pushpull(struct pushpull *pp, char *buf,
+				 unsigned buflen, unsigned ofs,
+				 unsigned reserve)
+{
+	pp->buf = buf;
+	pp->buflen = buflen;
+	pp->ofs = ofs;
+	pp->reserve = reserve;
+}
+
+static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
+{
+	if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve))
+		return -ENOSPC;
+
+	if (bit)
+		pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7)));
+	else
+		pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7)));
+
+	pp->ofs++;
+
+	return 0;
+}
+
+static inline int pushedbits(struct pushpull *pp)
+{
+	return pp->ofs;
+}
+
+static inline int pullbit(struct pushpull *pp)
+{
+	int bit;
+
+	bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1;
+
+	pp->ofs++;
+	return bit;
+}
+
+
+static void init_rubin(struct rubin_state *rs, int div, int *bits)
+{
+	int c;
+
+	rs->q = 0;
+	rs->p = (long) (2 * UPPER_BIT_RUBIN);
+	rs->bit_number = (long) 0;
+	rs->bit_divider = div;
+
+	for (c=0; c<8; c++)
+		rs->bits[c] = bits[c];
+}
+
+
+static int encode(struct rubin_state *rs, long A, long B, int symbol)
+{
+
+	long i0, i1;
+	int ret;
+
+	while ((rs->q >= UPPER_BIT_RUBIN) ||
+	       ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
+		rs->bit_number++;
+
+		ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0);
+		if (ret)
+			return ret;
+		rs->q &= LOWER_BITS_RUBIN;
+		rs->q <<= 1;
+		rs->p <<= 1;
+	}
+	i0 = A * rs->p / (A + B);
+	if (i0 <= 0)
+		i0 = 1;
+
+	if (i0 >= rs->p)
+		i0 = rs->p - 1;
+
+	i1 = rs->p - i0;
+
+	if (symbol == 0)
+		rs->p = i0;
+	else {
+		rs->p = i1;
+		rs->q += i0;
+	}
+	return 0;
+}
+
+
+static void end_rubin(struct rubin_state *rs)
+{
+
+	int i;
+
+	for (i = 0; i < RUBIN_REG_SIZE; i++) {
+		pushbit(&rs->pp, (UPPER_BIT_RUBIN & rs->q) ? 1 : 0, 1);
+		rs->q &= LOWER_BITS_RUBIN;
+		rs->q <<= 1;
+	}
+}
+
+
+static void init_decode(struct rubin_state *rs, int div, int *bits)
+{
+	init_rubin(rs, div, bits);
+
+	/* behalve lower */
+	rs->rec_q = 0;
+
+	for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE;
+	     rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
+		;
+}
+
+static void __do_decode(struct rubin_state *rs, unsigned long p,
+			unsigned long q)
+{
+	register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN;
+	unsigned long rec_q;
+	int c, bits = 0;
+
+	/*
+	 * First, work out how many bits we need from the input stream.
+	 * Note that we have already done the initial check on this
+	 * loop prior to calling this function.
+	 */
+	do {
+		bits++;
+		q &= lower_bits_rubin;
+		q <<= 1;
+		p <<= 1;
+	} while ((q >= UPPER_BIT_RUBIN) || ((p + q) <= UPPER_BIT_RUBIN));
+
+	rs->p = p;
+	rs->q = q;
+
+	rs->bit_number += bits;
+
+	/*
+	 * Now get the bits.  We really want this to be "get n bits".
+	 */
+	rec_q = rs->rec_q;
+	do {
+		c = pullbit(&rs->pp);
+		rec_q &= lower_bits_rubin;
+		rec_q <<= 1;
+		rec_q += c;
+	} while (--bits);
+	rs->rec_q = rec_q;
+}
+
+static int decode(struct rubin_state *rs, long A, long B)
+{
+	unsigned long p = rs->p, q = rs->q;
+	long i0, threshold;
+	int symbol;
+
+	if (q >= UPPER_BIT_RUBIN || ((p + q) <= UPPER_BIT_RUBIN))
+		__do_decode(rs, p, q);
+
+	i0 = A * rs->p / (A + B);
+	if (i0 <= 0)
+		i0 = 1;
+
+	if (i0 >= rs->p)
+		i0 = rs->p - 1;
+
+	threshold = rs->q + i0;
+	symbol = rs->rec_q >= threshold;
+	if (rs->rec_q >= threshold) {
+		rs->q += i0;
+		i0 = rs->p - i0;
+	}
+
+	rs->p = i0;
+
+	return symbol;
+}
+
+
+
+static int out_byte(struct rubin_state *rs, unsigned char byte)
+{
+	int i, ret;
+	struct rubin_state rs_copy;
+	rs_copy = *rs;
+
+	for (i=0; i<8; i++) {
+		ret = encode(rs, rs->bit_divider-rs->bits[i],
+			     rs->bits[i], byte & 1);
+		if (ret) {
+			/* Failed. Restore old state */
+			*rs = rs_copy;
+			return ret;
+		}
+		byte >>= 1 ;
+	}
+	return 0;
+}
+
+static int in_byte(struct rubin_state *rs)
+{
+	int i, result = 0, bit_divider = rs->bit_divider;
+
+	for (i = 0; i < 8; i++)
+		result |= decode(rs, bit_divider - rs->bits[i],
+				 rs->bits[i]) << i;
+
+	return result;
+}
+
+
+
+static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
+			     unsigned char *cpage_out, uint32_t *sourcelen,
+			     uint32_t *dstlen)
+	{
+	int outpos = 0;
+	int pos=0;
+	struct rubin_state rs;
+
+	init_pushpull(&rs.pp, cpage_out, *dstlen * 8, 0, 32);
+
+	init_rubin(&rs, bit_divider, bits);
+
+	while (pos < (*sourcelen) && !out_byte(&rs, data_in[pos]))
+		pos++;
+
+	end_rubin(&rs);
+
+	if (outpos > pos) {
+		/* We failed */
+		return -1;
+	}
+
+	/* Tell the caller how much we managed to compress,
+	 * and how much space it took */
+
+	outpos = (pushedbits(&rs.pp)+7)/8;
+
+	if (outpos >= pos)
+		return -1; /* We didn't actually compress */
+	*sourcelen = pos;
+	*dstlen = outpos;
+	return 0;
+}
+#if 0
+/* _compress returns the compressed size, -1 if bigger */
+int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
+		   uint32_t *sourcelen, uint32_t *dstlen)
+{
+	return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
+				 cpage_out, sourcelen, dstlen);
+}
+#endif
+static int jffs2_dynrubin_compress(unsigned char *data_in,
+				   unsigned char *cpage_out,
+				   uint32_t *sourcelen, uint32_t *dstlen)
+{
+	int bits[8];
+	unsigned char histo[256];
+	int i;
+	int ret;
+	uint32_t mysrclen, mydstlen;
+
+	mysrclen = *sourcelen;
+	mydstlen = *dstlen - 8;
+
+	if (*dstlen <= 12)
+		return -1;
+
+	memset(histo, 0, 256);
+	for (i=0; i<mysrclen; i++)
+		histo[data_in[i]]++;
+	memset(bits, 0, sizeof(int)*8);
+	for (i=0; i<256; i++) {
+		if (i&128)
+			bits[7] += histo[i];
+		if (i&64)
+			bits[6] += histo[i];
+		if (i&32)
+			bits[5] += histo[i];
+		if (i&16)
+			bits[4] += histo[i];
+		if (i&8)
+			bits[3] += histo[i];
+		if (i&4)
+			bits[2] += histo[i];
+		if (i&2)
+			bits[1] += histo[i];
+		if (i&1)
+			bits[0] += histo[i];
+	}
+
+	for (i=0; i<8; i++) {
+		bits[i] = (bits[i] * 256) / mysrclen;
+		if (!bits[i]) bits[i] = 1;
+		if (bits[i] > 255) bits[i] = 255;
+		cpage_out[i] = bits[i];
+	}
+
+	ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen,
+				&mydstlen);
+	if (ret)
+		return ret;
+
+	/* Add back the 8 bytes we took for the probabilities */
+	mydstlen += 8;
+
+	if (mysrclen <= mydstlen) {
+		/* We compressed */
+		return -1;
+	}
+
+	*sourcelen = mysrclen;
+	*dstlen = mydstlen;
+	return 0;
+}
+
+static void rubin_do_decompress(int bit_divider, int *bits,
+				unsigned char *cdata_in, 
+				unsigned char *page_out, uint32_t srclen,
+				uint32_t destlen)
+{
+	int outpos = 0;
+	struct rubin_state rs;
+
+	init_pushpull(&rs.pp, cdata_in, srclen, 0, 0);
+	init_decode(&rs, bit_divider, bits);
+
+	while (outpos < destlen)
+		page_out[outpos++] = in_byte(&rs);
+}
+
+
+static int jffs2_rubinmips_decompress(unsigned char *data_in,
+				      unsigned char *cpage_out,
+				      uint32_t sourcelen, uint32_t dstlen)
+{
+	rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
+			    cpage_out, sourcelen, dstlen);
+	return 0;
+}
+
+static int jffs2_dynrubin_decompress(unsigned char *data_in,
+				     unsigned char *cpage_out,
+				     uint32_t sourcelen, uint32_t dstlen)
+{
+	int bits[8];
+	int c;
+
+	for (c=0; c<8; c++)
+		bits[c] = data_in[c];
+
+	rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8,
+			    dstlen);
+	return 0;
+}
+
+static struct jffs2_compressor jffs2_rubinmips_comp = {
+	.priority = JFFS2_RUBINMIPS_PRIORITY,
+	.name = "rubinmips",
+	.compr = JFFS2_COMPR_DYNRUBIN,
+	.compress = NULL, /*&jffs2_rubinmips_compress,*/
+	.decompress = &jffs2_rubinmips_decompress,
+#ifdef JFFS2_RUBINMIPS_DISABLED
+	.disabled = 1,
+#else
+	.disabled = 0,
+#endif
+};
+
+int jffs2_rubinmips_init(void)
+{
+	return jffs2_register_compressor(&jffs2_rubinmips_comp);
+}
+
+void jffs2_rubinmips_exit(void)
+{
+	jffs2_unregister_compressor(&jffs2_rubinmips_comp);
+}
+
+static struct jffs2_compressor jffs2_dynrubin_comp = {
+	.priority = JFFS2_DYNRUBIN_PRIORITY,
+	.name = "dynrubin",
+	.compr = JFFS2_COMPR_RUBINMIPS,
+	.compress = jffs2_dynrubin_compress,
+	.decompress = &jffs2_dynrubin_decompress,
+#ifdef JFFS2_DYNRUBIN_DISABLED
+	.disabled = 1,
+#else
+	.disabled = 0,
+#endif
+};
+
+int jffs2_dynrubin_init(void)
+{
+	return jffs2_register_compressor(&jffs2_dynrubin_comp);
+}
+
+void jffs2_dynrubin_exit(void)
+{
+	jffs2_unregister_compressor(&jffs2_dynrubin_comp);
+}
diff --git a/kernel/fs/jffs2/compr_zlib.c b/kernel/fs/jffs2/compr_zlib.c
new file mode 100644
index 000000000..5698dae5d
--- /dev/null
+++ b/kernel/fs/jffs2/compr_zlib.c
@@ -0,0 +1,222 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#if !defined(__KERNEL__) && !defined(__ECOS)
+#error "The userspace support got too messy and was removed. Update your mkfs.jffs2"
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include "nodelist.h"
+#include "compr.h"
+
+	/* Plan: call deflate() with avail_in == *sourcelen,
+		avail_out = *dstlen - 12 and flush == Z_FINISH.
+		If it doesn't manage to finish,	call it again with
+		avail_in == 0 and avail_out set to the remaining 12
+		bytes for it to clean up.
+	   Q: Is 12 bytes sufficient?
+	*/
+#define STREAM_END_SPACE 12
+
+static DEFINE_MUTEX(deflate_mutex);
+static DEFINE_MUTEX(inflate_mutex);
+static z_stream inf_strm, def_strm;
+
+#ifdef __KERNEL__ /* Linux-only */
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+
+static int __init alloc_workspaces(void)
+{
+	def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
+							MAX_MEM_LEVEL));
+	if (!def_strm.workspace)
+		return -ENOMEM;
+
+	jffs2_dbg(1, "Allocated %d bytes for deflate workspace\n",
+		  zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL));
+	inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+	if (!inf_strm.workspace) {
+		vfree(def_strm.workspace);
+		return -ENOMEM;
+	}
+	jffs2_dbg(1, "Allocated %d bytes for inflate workspace\n",
+		  zlib_inflate_workspacesize());
+	return 0;
+}
+
+static void free_workspaces(void)
+{
+	vfree(def_strm.workspace);
+	vfree(inf_strm.workspace);
+}
+#else
+#define alloc_workspaces() (0)
+#define free_workspaces() do { } while(0)
+#endif /* __KERNEL__ */
+
+static int jffs2_zlib_compress(unsigned char *data_in,
+			       unsigned char *cpage_out,
+			       uint32_t *sourcelen, uint32_t *dstlen)
+{
+	int ret;
+
+	if (*dstlen <= STREAM_END_SPACE)
+		return -1;
+
+	mutex_lock(&deflate_mutex);
+
+	if (Z_OK != zlib_deflateInit(&def_strm, 3)) {
+		pr_warn("deflateInit failed\n");
+		mutex_unlock(&deflate_mutex);
+		return -1;
+	}
+
+	def_strm.next_in = data_in;
+	def_strm.total_in = 0;
+
+	def_strm.next_out = cpage_out;
+	def_strm.total_out = 0;
+
+	while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) {
+		def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE);
+		def_strm.avail_in = min_t(unsigned long,
+			(*sourcelen-def_strm.total_in), def_strm.avail_out);
+		jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n",
+			  def_strm.avail_in, def_strm.avail_out);
+		ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH);
+		jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n",
+			  def_strm.avail_in, def_strm.avail_out,
+			  def_strm.total_in, def_strm.total_out);
+		if (ret != Z_OK) {
+			jffs2_dbg(1, "deflate in loop returned %d\n", ret);
+			zlib_deflateEnd(&def_strm);
+			mutex_unlock(&deflate_mutex);
+			return -1;
+		}
+	}
+	def_strm.avail_out += STREAM_END_SPACE;
+	def_strm.avail_in = 0;
+	ret = zlib_deflate(&def_strm, Z_FINISH);
+	zlib_deflateEnd(&def_strm);
+
+	if (ret != Z_STREAM_END) {
+		jffs2_dbg(1, "final deflate returned %d\n", ret);
+		ret = -1;
+		goto out;
+	}
+
+	if (def_strm.total_out >= def_strm.total_in) {
+		jffs2_dbg(1, "zlib compressed %ld bytes into %ld; failing\n",
+			  def_strm.total_in, def_strm.total_out);
+		ret = -1;
+		goto out;
+	}
+
+	jffs2_dbg(1, "zlib compressed %ld bytes into %ld\n",
+		  def_strm.total_in, def_strm.total_out);
+
+	*dstlen = def_strm.total_out;
+	*sourcelen = def_strm.total_in;
+	ret = 0;
+ out:
+	mutex_unlock(&deflate_mutex);
+	return ret;
+}
+
+static int jffs2_zlib_decompress(unsigned char *data_in,
+				 unsigned char *cpage_out,
+				 uint32_t srclen, uint32_t destlen)
+{
+	int ret;
+	int wbits = MAX_WBITS;
+
+	mutex_lock(&inflate_mutex);
+
+	inf_strm.next_in = data_in;
+	inf_strm.avail_in = srclen;
+	inf_strm.total_in = 0;
+
+	inf_strm.next_out = cpage_out;
+	inf_strm.avail_out = destlen;
+	inf_strm.total_out = 0;
+
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		jffs2_dbg(2, "inflate skipping adler32\n");
+		wbits = -((data_in[0] >> 4) + 8);
+		inf_strm.next_in += 2;
+		inf_strm.avail_in -= 2;
+	} else {
+		/* Let this remain D1 for now -- it should never happen */
+		jffs2_dbg(1, "inflate not skipping adler32\n");
+	}
+
+
+	if (Z_OK != zlib_inflateInit2(&inf_strm, wbits)) {
+		pr_warn("inflateInit failed\n");
+		mutex_unlock(&inflate_mutex);
+		return 1;
+	}
+
+	while((ret = zlib_inflate(&inf_strm, Z_FINISH)) == Z_OK)
+		;
+	if (ret != Z_STREAM_END) {
+		pr_notice("inflate returned %d\n", ret);
+	}
+	zlib_inflateEnd(&inf_strm);
+	mutex_unlock(&inflate_mutex);
+	return 0;
+}
+
+static struct jffs2_compressor jffs2_zlib_comp = {
+    .priority = JFFS2_ZLIB_PRIORITY,
+    .name = "zlib",
+    .compr = JFFS2_COMPR_ZLIB,
+    .compress = &jffs2_zlib_compress,
+    .decompress = &jffs2_zlib_decompress,
+#ifdef JFFS2_ZLIB_DISABLED
+    .disabled = 1,
+#else
+    .disabled = 0,
+#endif
+};
+
+int __init jffs2_zlib_init(void)
+{
+    int ret;
+
+    ret = alloc_workspaces();
+    if (ret)
+	    return ret;
+
+    ret = jffs2_register_compressor(&jffs2_zlib_comp);
+    if (ret)
+	    free_workspaces();
+
+    return ret;
+}
+
+void jffs2_zlib_exit(void)
+{
+    jffs2_unregister_compressor(&jffs2_zlib_comp);
+    free_workspaces();
+}
diff --git a/kernel/fs/jffs2/debug.c b/kernel/fs/jffs2/debug.c
new file mode 100644
index 000000000..1090eb64b
--- /dev/null
+++ b/kernel/fs/jffs2/debug.c
@@ -0,0 +1,866 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/pagemap.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/mtd/mtd.h>
+#include <linux/slab.h>
+#include "nodelist.h"
+#include "debug.h"
+
+#ifdef JFFS2_DBG_SANITY_CHECKS
+
+void
+__jffs2_dbg_acct_sanity_check_nolock(struct jffs2_sb_info *c,
+				     struct jffs2_eraseblock *jeb)
+{
+	if (unlikely(jeb && jeb->used_size + jeb->dirty_size +
+			jeb->free_size + jeb->wasted_size +
+			jeb->unchecked_size != c->sector_size)) {
+		JFFS2_ERROR("eeep, space accounting for block at 0x%08x is screwed.\n", jeb->offset);
+		JFFS2_ERROR("free %#08x + dirty %#08x + used %#08x + wasted %#08x + unchecked %#08x != total %#08x.\n",
+			jeb->free_size, jeb->dirty_size, jeb->used_size,
+			jeb->wasted_size, jeb->unchecked_size, c->sector_size);
+		BUG();
+	}
+
+	if (unlikely(c->used_size + c->dirty_size + c->free_size + c->erasing_size + c->bad_size
+				+ c->wasted_size + c->unchecked_size != c->flash_size)) {
+		JFFS2_ERROR("eeep, space accounting superblock info is screwed.\n");
+		JFFS2_ERROR("free %#08x + dirty %#08x + used %#08x + erasing %#08x + bad %#08x + wasted %#08x + unchecked %#08x != total %#08x.\n",
+			c->free_size, c->dirty_size, c->used_size, c->erasing_size, c->bad_size,
+			c->wasted_size, c->unchecked_size, c->flash_size);
+		BUG();
+	}
+}
+
+void
+__jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c,
+			      struct jffs2_eraseblock *jeb)
+{
+	spin_lock(&c->erase_completion_lock);
+	jffs2_dbg_acct_sanity_check_nolock(c, jeb);
+	spin_unlock(&c->erase_completion_lock);
+}
+
+#endif /* JFFS2_DBG_SANITY_CHECKS */
+
+#ifdef JFFS2_DBG_PARANOIA_CHECKS
+/*
+ * Check the fragtree.
+ */
+void
+__jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f)
+{
+	mutex_lock(&f->sem);
+	__jffs2_dbg_fragtree_paranoia_check_nolock(f);
+	mutex_unlock(&f->sem);
+}
+
+void
+__jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f)
+{
+	struct jffs2_node_frag *frag;
+	int bitched = 0;
+
+	for (frag = frag_first(&f->fragtree); frag; frag = frag_next(frag)) {
+		struct jffs2_full_dnode *fn = frag->node;
+
+		if (!fn || !fn->raw)
+			continue;
+
+		if (ref_flags(fn->raw) == REF_PRISTINE) {
+			if (fn->frags > 1) {
+				JFFS2_ERROR("REF_PRISTINE node at 0x%08x had %d frags. Tell dwmw2.\n",
+					ref_offset(fn->raw), fn->frags);
+				bitched = 1;
+			}
+
+			/* A hole node which isn't multi-page should be garbage-collected
+			   and merged anyway, so we just check for the frag size here,
+			   rather than mucking around with actually reading the node
+			   and checking the compression type, which is the real way
+			   to tell a hole node. */
+			if (frag->ofs & (PAGE_CACHE_SIZE-1) && frag_prev(frag)
+					&& frag_prev(frag)->size < PAGE_CACHE_SIZE && frag_prev(frag)->node) {
+				JFFS2_ERROR("REF_PRISTINE node at 0x%08x had a previous non-hole frag in the same page. Tell dwmw2.\n",
+					ref_offset(fn->raw));
+				bitched = 1;
+			}
+
+			if ((frag->ofs+frag->size) & (PAGE_CACHE_SIZE-1) && frag_next(frag)
+					&& frag_next(frag)->size < PAGE_CACHE_SIZE && frag_next(frag)->node) {
+				JFFS2_ERROR("REF_PRISTINE node at 0x%08x (%08x-%08x) had a following non-hole frag in the same page. Tell dwmw2.\n",
+				       ref_offset(fn->raw), frag->ofs, frag->ofs+frag->size);
+				bitched = 1;
+			}
+		}
+	}
+
+	if (bitched) {
+		JFFS2_ERROR("fragtree is corrupted.\n");
+		__jffs2_dbg_dump_fragtree_nolock(f);
+		BUG();
+	}
+}
+
+/*
+ * Check if the flash contains all 0xFF before we start writing.
+ */
+void
+__jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c,
+				    uint32_t ofs, int len)
+{
+	size_t retlen;
+	int ret, i;
+	unsigned char *buf;
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		return;
+
+	ret = jffs2_flash_read(c, ofs, len, &retlen, buf);
+	if (ret || (retlen != len)) {
+		JFFS2_WARNING("read %d bytes failed or short. ret %d, retlen %zd.\n",
+				len, ret, retlen);
+		kfree(buf);
+		return;
+	}
+
+	ret = 0;
+	for (i = 0; i < len; i++)
+		if (buf[i] != 0xff)
+			ret = 1;
+
+	if (ret) {
+		JFFS2_ERROR("argh, about to write node to %#08x on flash, but there are data already there. The first corrupted byte is at %#08x offset.\n",
+			ofs, ofs + i);
+		__jffs2_dbg_dump_buffer(buf, len, ofs);
+		kfree(buf);
+		BUG();
+	}
+
+	kfree(buf);
+}
+
+void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c)
+{
+	struct jffs2_eraseblock *jeb;
+	uint32_t free = 0, dirty = 0, used = 0, wasted = 0,
+		erasing = 0, bad = 0, unchecked = 0;
+	int nr_counted = 0;
+	int dump = 0;
+
+	if (c->gcblock) {
+		nr_counted++;
+		free += c->gcblock->free_size;
+		dirty += c->gcblock->dirty_size;
+		used += c->gcblock->used_size;
+		wasted += c->gcblock->wasted_size;
+		unchecked += c->gcblock->unchecked_size;
+	}
+	if (c->nextblock) {
+		nr_counted++;
+		free += c->nextblock->free_size;
+		dirty += c->nextblock->dirty_size;
+		used += c->nextblock->used_size;
+		wasted += c->nextblock->wasted_size;
+		unchecked += c->nextblock->unchecked_size;
+	}
+	list_for_each_entry(jeb, &c->clean_list, list) {
+		nr_counted++;
+		free += jeb->free_size;
+		dirty += jeb->dirty_size;
+		used += jeb->used_size;
+		wasted += jeb->wasted_size;
+		unchecked += jeb->unchecked_size;
+	}
+	list_for_each_entry(jeb, &c->very_dirty_list, list) {
+		nr_counted++;
+		free += jeb->free_size;
+		dirty += jeb->dirty_size;
+		used += jeb->used_size;
+		wasted += jeb->wasted_size;
+		unchecked += jeb->unchecked_size;
+	}
+	list_for_each_entry(jeb, &c->dirty_list, list) {
+		nr_counted++;
+		free += jeb->free_size;
+		dirty += jeb->dirty_size;
+		used += jeb->used_size;
+		wasted += jeb->wasted_size;
+		unchecked += jeb->unchecked_size;
+	}
+	list_for_each_entry(jeb, &c->erasable_list, list) {
+		nr_counted++;
+		free += jeb->free_size;
+		dirty += jeb->dirty_size;
+		used += jeb->used_size;
+		wasted += jeb->wasted_size;
+		unchecked += jeb->unchecked_size;
+	}
+	list_for_each_entry(jeb, &c->erasable_pending_wbuf_list, list) {
+		nr_counted++;
+		free += jeb->free_size;
+		dirty += jeb->dirty_size;
+		used += jeb->used_size;
+		wasted += jeb->wasted_size;
+		unchecked += jeb->unchecked_size;
+	}
+	list_for_each_entry(jeb, &c->erase_pending_list, list) {
+		nr_counted++;
+		free += jeb->free_size;
+		dirty += jeb->dirty_size;
+		used += jeb->used_size;
+		wasted += jeb->wasted_size;
+		unchecked += jeb->unchecked_size;
+	}
+	list_for_each_entry(jeb, &c->free_list, list) {
+		nr_counted++;
+		free += jeb->free_size;
+		dirty += jeb->dirty_size;
+		used += jeb->used_size;
+		wasted += jeb->wasted_size;
+		unchecked += jeb->unchecked_size;
+	}
+	list_for_each_entry(jeb, &c->bad_used_list, list) {
+		nr_counted++;
+		free += jeb->free_size;
+		dirty += jeb->dirty_size;
+		used += jeb->used_size;
+		wasted += jeb->wasted_size;
+		unchecked += jeb->unchecked_size;
+	}
+
+	list_for_each_entry(jeb, &c->erasing_list, list) {
+		nr_counted++;
+		erasing += c->sector_size;
+	}
+	list_for_each_entry(jeb, &c->erase_checking_list, list) {
+		nr_counted++;
+		erasing += c->sector_size;
+	}
+	list_for_each_entry(jeb, &c->erase_complete_list, list) {
+		nr_counted++;
+		erasing += c->sector_size;
+	}
+	list_for_each_entry(jeb, &c->bad_list, list) {
+		nr_counted++;
+		bad += c->sector_size;
+	}
+
+#define check(sz)							\
+do {									\
+	if (sz != c->sz##_size) {					\
+		pr_warn("%s_size mismatch counted 0x%x, c->%s_size 0x%x\n", \
+			#sz, sz, #sz, c->sz##_size);			\
+		dump = 1;						\
+	}								\
+} while (0)
+
+	check(free);
+	check(dirty);
+	check(used);
+	check(wasted);
+	check(unchecked);
+	check(bad);
+	check(erasing);
+
+#undef check
+
+	if (nr_counted != c->nr_blocks) {
+		pr_warn("%s counted only 0x%x blocks of 0x%x. Where are the others?\n",
+			__func__, nr_counted, c->nr_blocks);
+		dump = 1;
+	}
+
+	if (dump) {
+		__jffs2_dbg_dump_block_lists_nolock(c);
+		BUG();
+	}
+}
+
+/*
+ * Check the space accounting and node_ref list correctness for the JFFS2 erasable block 'jeb'.
+ */
+void
+__jffs2_dbg_acct_paranoia_check(struct jffs2_sb_info *c,
+				struct jffs2_eraseblock *jeb)
+{
+	spin_lock(&c->erase_completion_lock);
+	__jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+	spin_unlock(&c->erase_completion_lock);
+}
+
+void
+__jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c,
+				       struct jffs2_eraseblock *jeb)
+{
+	uint32_t my_used_size = 0;
+	uint32_t my_unchecked_size = 0;
+	uint32_t my_dirty_size = 0;
+	struct jffs2_raw_node_ref *ref2 = jeb->first_node;
+
+	while (ref2) {
+		uint32_t totlen = ref_totlen(c, jeb, ref2);
+
+		if (ref_offset(ref2) < jeb->offset ||
+				ref_offset(ref2) > jeb->offset + c->sector_size) {
+			JFFS2_ERROR("node_ref %#08x shouldn't be in block at %#08x.\n",
+				ref_offset(ref2), jeb->offset);
+			goto error;
+
+		}
+		if (ref_flags(ref2) == REF_UNCHECKED)
+			my_unchecked_size += totlen;
+		else if (!ref_obsolete(ref2))
+			my_used_size += totlen;
+		else
+			my_dirty_size += totlen;
+
+		if ((!ref_next(ref2)) != (ref2 == jeb->last_node)) {
+			JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next at %#08x (mem %p), last_node is at %#08x (mem %p).\n",
+				    ref_offset(ref2), ref2, ref_offset(ref_next(ref2)), ref_next(ref2),
+				    ref_offset(jeb->last_node), jeb->last_node);
+			goto error;
+		}
+		ref2 = ref_next(ref2);
+	}
+
+	if (my_used_size != jeb->used_size) {
+		JFFS2_ERROR("Calculated used size %#08x != stored used size %#08x.\n",
+			my_used_size, jeb->used_size);
+		goto error;
+	}
+
+	if (my_unchecked_size != jeb->unchecked_size) {
+		JFFS2_ERROR("Calculated unchecked size %#08x != stored unchecked size %#08x.\n",
+			my_unchecked_size, jeb->unchecked_size);
+		goto error;
+	}
+
+#if 0
+	/* This should work when we implement ref->__totlen elemination */
+	if (my_dirty_size != jeb->dirty_size + jeb->wasted_size) {
+		JFFS2_ERROR("Calculated dirty+wasted size %#08x != stored dirty + wasted size %#08x\n",
+			my_dirty_size, jeb->dirty_size + jeb->wasted_size);
+		goto error;
+	}
+
+	if (jeb->free_size == 0
+		&& my_used_size + my_unchecked_size + my_dirty_size != c->sector_size) {
+		JFFS2_ERROR("The sum of all nodes in block (%#x) != size of block (%#x)\n",
+			my_used_size + my_unchecked_size + my_dirty_size,
+			c->sector_size);
+		goto error;
+	}
+#endif
+
+	if (!(c->flags & (JFFS2_SB_FLAG_BUILDING|JFFS2_SB_FLAG_SCANNING)))
+		__jffs2_dbg_superblock_counts(c);
+
+	return;
+
+error:
+	__jffs2_dbg_dump_node_refs_nolock(c, jeb);
+	__jffs2_dbg_dump_jeb_nolock(jeb);
+	__jffs2_dbg_dump_block_lists_nolock(c);
+	BUG();
+
+}
+#endif /* JFFS2_DBG_PARANOIA_CHECKS */
+
+#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS)
+/*
+ * Dump the node_refs of the 'jeb' JFFS2 eraseblock.
+ */
+void
+__jffs2_dbg_dump_node_refs(struct jffs2_sb_info *c,
+			   struct jffs2_eraseblock *jeb)
+{
+	spin_lock(&c->erase_completion_lock);
+	__jffs2_dbg_dump_node_refs_nolock(c, jeb);
+	spin_unlock(&c->erase_completion_lock);
+}
+
+void
+__jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c,
+				  struct jffs2_eraseblock *jeb)
+{
+	struct jffs2_raw_node_ref *ref;
+	int i = 0;
+
+	printk(JFFS2_DBG_MSG_PREFIX " Dump node_refs of the eraseblock %#08x\n", jeb->offset);
+	if (!jeb->first_node) {
+		printk(JFFS2_DBG_MSG_PREFIX " no nodes in the eraseblock %#08x\n", jeb->offset);
+		return;
+	}
+
+	printk(JFFS2_DBG);
+	for (ref = jeb->first_node; ; ref = ref_next(ref)) {
+		printk("%#08x", ref_offset(ref));
+#ifdef TEST_TOTLEN
+		printk("(%x)", ref->__totlen);
+#endif
+		if (ref_next(ref))
+			printk("->");
+		else
+			break;
+		if (++i == 4) {
+			i = 0;
+			printk("\n" JFFS2_DBG);
+		}
+	}
+	printk("\n");
+}
+
+/*
+ * Dump an eraseblock's space accounting.
+ */
+void
+__jffs2_dbg_dump_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	spin_lock(&c->erase_completion_lock);
+	__jffs2_dbg_dump_jeb_nolock(jeb);
+	spin_unlock(&c->erase_completion_lock);
+}
+
+void
+__jffs2_dbg_dump_jeb_nolock(struct jffs2_eraseblock *jeb)
+{
+	if (!jeb)
+		return;
+
+	printk(JFFS2_DBG_MSG_PREFIX " dump space accounting for the eraseblock at %#08x:\n",
+			jeb->offset);
+
+	printk(JFFS2_DBG "used_size: %#08x\n",		jeb->used_size);
+	printk(JFFS2_DBG "dirty_size: %#08x\n",		jeb->dirty_size);
+	printk(JFFS2_DBG "wasted_size: %#08x\n",	jeb->wasted_size);
+	printk(JFFS2_DBG "unchecked_size: %#08x\n",	jeb->unchecked_size);
+	printk(JFFS2_DBG "free_size: %#08x\n",		jeb->free_size);
+}
+
+void
+__jffs2_dbg_dump_block_lists(struct jffs2_sb_info *c)
+{
+	spin_lock(&c->erase_completion_lock);
+	__jffs2_dbg_dump_block_lists_nolock(c);
+	spin_unlock(&c->erase_completion_lock);
+}
+
+void
+__jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c)
+{
+	printk(JFFS2_DBG_MSG_PREFIX " dump JFFS2 blocks lists:\n");
+
+	printk(JFFS2_DBG "flash_size: %#08x\n",		c->flash_size);
+	printk(JFFS2_DBG "used_size: %#08x\n",		c->used_size);
+	printk(JFFS2_DBG "dirty_size: %#08x\n",		c->dirty_size);
+	printk(JFFS2_DBG "wasted_size: %#08x\n",	c->wasted_size);
+	printk(JFFS2_DBG "unchecked_size: %#08x\n",	c->unchecked_size);
+	printk(JFFS2_DBG "free_size: %#08x\n",		c->free_size);
+	printk(JFFS2_DBG "erasing_size: %#08x\n",	c->erasing_size);
+	printk(JFFS2_DBG "bad_size: %#08x\n",		c->bad_size);
+	printk(JFFS2_DBG "sector_size: %#08x\n",	c->sector_size);
+	printk(JFFS2_DBG "jffs2_reserved_blocks size: %#08x\n",
+				c->sector_size * c->resv_blocks_write);
+
+	if (c->nextblock)
+		printk(JFFS2_DBG "nextblock: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+			c->nextblock->offset, c->nextblock->used_size,
+			c->nextblock->dirty_size, c->nextblock->wasted_size,
+			c->nextblock->unchecked_size, c->nextblock->free_size);
+	else
+		printk(JFFS2_DBG "nextblock: NULL\n");
+
+	if (c->gcblock)
+		printk(JFFS2_DBG "gcblock: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+			c->gcblock->offset, c->gcblock->used_size, c->gcblock->dirty_size,
+			c->gcblock->wasted_size, c->gcblock->unchecked_size, c->gcblock->free_size);
+	else
+		printk(JFFS2_DBG "gcblock: NULL\n");
+
+	if (list_empty(&c->clean_list)) {
+		printk(JFFS2_DBG "clean_list: empty\n");
+	} else {
+		struct list_head *this;
+		int numblocks = 0;
+		uint32_t dirty = 0;
+
+		list_for_each(this, &c->clean_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+			numblocks ++;
+			dirty += jeb->wasted_size;
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "clean_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+
+		printk (JFFS2_DBG "Contains %d blocks with total wasted size %u, average wasted size: %u\n",
+			numblocks, dirty, dirty / numblocks);
+	}
+
+	if (list_empty(&c->very_dirty_list)) {
+		printk(JFFS2_DBG "very_dirty_list: empty\n");
+	} else {
+		struct list_head *this;
+		int numblocks = 0;
+		uint32_t dirty = 0;
+
+		list_for_each(this, &c->very_dirty_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			numblocks ++;
+			dirty += jeb->dirty_size;
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "very_dirty_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+
+		printk (JFFS2_DBG "Contains %d blocks with total dirty size %u, average dirty size: %u\n",
+			numblocks, dirty, dirty / numblocks);
+	}
+
+	if (list_empty(&c->dirty_list)) {
+		printk(JFFS2_DBG "dirty_list: empty\n");
+	} else {
+		struct list_head *this;
+		int numblocks = 0;
+		uint32_t dirty = 0;
+
+		list_for_each(this, &c->dirty_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			numblocks ++;
+			dirty += jeb->dirty_size;
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "dirty_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+
+		printk (JFFS2_DBG "contains %d blocks with total dirty size %u, average dirty size: %u\n",
+			numblocks, dirty, dirty / numblocks);
+	}
+
+	if (list_empty(&c->erasable_list)) {
+		printk(JFFS2_DBG "erasable_list: empty\n");
+	} else {
+		struct list_head *this;
+
+		list_for_each(this, &c->erasable_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "erasable_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+	}
+
+	if (list_empty(&c->erasing_list)) {
+		printk(JFFS2_DBG "erasing_list: empty\n");
+	} else {
+		struct list_head *this;
+
+		list_for_each(this, &c->erasing_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "erasing_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+	}
+	if (list_empty(&c->erase_checking_list)) {
+		printk(JFFS2_DBG "erase_checking_list: empty\n");
+	} else {
+		struct list_head *this;
+
+		list_for_each(this, &c->erase_checking_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "erase_checking_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+	}
+
+	if (list_empty(&c->erase_pending_list)) {
+		printk(JFFS2_DBG "erase_pending_list: empty\n");
+	} else {
+		struct list_head *this;
+
+		list_for_each(this, &c->erase_pending_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "erase_pending_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+	}
+
+	if (list_empty(&c->erasable_pending_wbuf_list)) {
+		printk(JFFS2_DBG "erasable_pending_wbuf_list: empty\n");
+	} else {
+		struct list_head *this;
+
+		list_for_each(this, &c->erasable_pending_wbuf_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "erasable_pending_wbuf_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+	}
+
+	if (list_empty(&c->free_list)) {
+		printk(JFFS2_DBG "free_list: empty\n");
+	} else {
+		struct list_head *this;
+
+		list_for_each(this, &c->free_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "free_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+	}
+
+	if (list_empty(&c->bad_list)) {
+		printk(JFFS2_DBG "bad_list: empty\n");
+	} else {
+		struct list_head *this;
+
+		list_for_each(this, &c->bad_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "bad_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+	}
+
+	if (list_empty(&c->bad_used_list)) {
+		printk(JFFS2_DBG "bad_used_list: empty\n");
+	} else {
+		struct list_head *this;
+
+		list_for_each(this, &c->bad_used_list) {
+			struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+			if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) {
+				printk(JFFS2_DBG "bad_used_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n",
+					jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size,
+					jeb->unchecked_size, jeb->free_size);
+			}
+		}
+	}
+}
+
+void
+__jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f)
+{
+	mutex_lock(&f->sem);
+	jffs2_dbg_dump_fragtree_nolock(f);
+	mutex_unlock(&f->sem);
+}
+
+void
+__jffs2_dbg_dump_fragtree_nolock(struct jffs2_inode_info *f)
+{
+	struct jffs2_node_frag *this = frag_first(&f->fragtree);
+	uint32_t lastofs = 0;
+	int buggy = 0;
+
+	printk(JFFS2_DBG_MSG_PREFIX " dump fragtree of ino #%u\n", f->inocache->ino);
+	while(this) {
+		if (this->node)
+			printk(JFFS2_DBG "frag %#04x-%#04x: %#08x(%d) on flash (*%p), left (%p), right (%p), parent (%p)\n",
+				this->ofs, this->ofs+this->size, ref_offset(this->node->raw),
+				ref_flags(this->node->raw), this, frag_left(this), frag_right(this),
+				frag_parent(this));
+		else
+			printk(JFFS2_DBG "frag %#04x-%#04x: hole (*%p). left (%p), right (%p), parent (%p)\n",
+				this->ofs, this->ofs+this->size, this, frag_left(this),
+				frag_right(this), frag_parent(this));
+		if (this->ofs != lastofs)
+			buggy = 1;
+		lastofs = this->ofs + this->size;
+		this = frag_next(this);
+	}
+
+	if (f->metadata)
+		printk(JFFS2_DBG "metadata at 0x%08x\n", ref_offset(f->metadata->raw));
+
+	if (buggy) {
+		JFFS2_ERROR("frag tree got a hole in it.\n");
+		BUG();
+	}
+}
+
+#define JFFS2_BUFDUMP_BYTES_PER_LINE	32
+void
+__jffs2_dbg_dump_buffer(unsigned char *buf, int len, uint32_t offs)
+{
+	int skip;
+	int i;
+
+	printk(JFFS2_DBG_MSG_PREFIX " dump from offset %#08x to offset %#08x (%x bytes).\n",
+		offs, offs + len, len);
+	i = skip = offs % JFFS2_BUFDUMP_BYTES_PER_LINE;
+	offs = offs & ~(JFFS2_BUFDUMP_BYTES_PER_LINE - 1);
+
+	if (skip != 0)
+		printk(JFFS2_DBG "%#08x: ", offs);
+
+	while (skip--)
+		printk("   ");
+
+	while (i < len) {
+		if ((i % JFFS2_BUFDUMP_BYTES_PER_LINE) == 0 && i != len -1) {
+			if (i != 0)
+				printk("\n");
+			offs += JFFS2_BUFDUMP_BYTES_PER_LINE;
+			printk(JFFS2_DBG "%0#8x: ", offs);
+		}
+
+		printk("%02x ", buf[i]);
+
+		i += 1;
+	}
+
+	printk("\n");
+}
+
+/*
+ * Dump a JFFS2 node.
+ */
+void
+__jffs2_dbg_dump_node(struct jffs2_sb_info *c, uint32_t ofs)
+{
+	union jffs2_node_union node;
+	int len = sizeof(union jffs2_node_union);
+	size_t retlen;
+	uint32_t crc;
+	int ret;
+
+	printk(JFFS2_DBG_MSG_PREFIX " dump node at offset %#08x.\n", ofs);
+
+	ret = jffs2_flash_read(c, ofs, len, &retlen, (unsigned char *)&node);
+	if (ret || (retlen != len)) {
+		JFFS2_ERROR("read %d bytes failed or short. ret %d, retlen %zd.\n",
+			len, ret, retlen);
+		return;
+	}
+
+	printk(JFFS2_DBG "magic:\t%#04x\n", je16_to_cpu(node.u.magic));
+	printk(JFFS2_DBG "nodetype:\t%#04x\n", je16_to_cpu(node.u.nodetype));
+	printk(JFFS2_DBG "totlen:\t%#08x\n", je32_to_cpu(node.u.totlen));
+	printk(JFFS2_DBG "hdr_crc:\t%#08x\n", je32_to_cpu(node.u.hdr_crc));
+
+	crc = crc32(0, &node.u, sizeof(node.u) - 4);
+	if (crc != je32_to_cpu(node.u.hdr_crc)) {
+		JFFS2_ERROR("wrong common header CRC.\n");
+		return;
+	}
+
+	if (je16_to_cpu(node.u.magic) != JFFS2_MAGIC_BITMASK &&
+		je16_to_cpu(node.u.magic) != JFFS2_OLD_MAGIC_BITMASK)
+	{
+		JFFS2_ERROR("wrong node magic: %#04x instead of %#04x.\n",
+			je16_to_cpu(node.u.magic), JFFS2_MAGIC_BITMASK);
+		return;
+	}
+
+	switch(je16_to_cpu(node.u.nodetype)) {
+
+	case JFFS2_NODETYPE_INODE:
+
+		printk(JFFS2_DBG "the node is inode node\n");
+		printk(JFFS2_DBG "ino:\t%#08x\n", je32_to_cpu(node.i.ino));
+		printk(JFFS2_DBG "version:\t%#08x\n", je32_to_cpu(node.i.version));
+		printk(JFFS2_DBG "mode:\t%#08x\n", node.i.mode.m);
+		printk(JFFS2_DBG "uid:\t%#04x\n", je16_to_cpu(node.i.uid));
+		printk(JFFS2_DBG "gid:\t%#04x\n", je16_to_cpu(node.i.gid));
+		printk(JFFS2_DBG "isize:\t%#08x\n", je32_to_cpu(node.i.isize));
+		printk(JFFS2_DBG "atime:\t%#08x\n", je32_to_cpu(node.i.atime));
+		printk(JFFS2_DBG "mtime:\t%#08x\n", je32_to_cpu(node.i.mtime));
+		printk(JFFS2_DBG "ctime:\t%#08x\n", je32_to_cpu(node.i.ctime));
+		printk(JFFS2_DBG "offset:\t%#08x\n", je32_to_cpu(node.i.offset));
+		printk(JFFS2_DBG "csize:\t%#08x\n", je32_to_cpu(node.i.csize));
+		printk(JFFS2_DBG "dsize:\t%#08x\n", je32_to_cpu(node.i.dsize));
+		printk(JFFS2_DBG "compr:\t%#02x\n", node.i.compr);
+		printk(JFFS2_DBG "usercompr:\t%#02x\n", node.i.usercompr);
+		printk(JFFS2_DBG "flags:\t%#04x\n", je16_to_cpu(node.i.flags));
+		printk(JFFS2_DBG "data_crc:\t%#08x\n", je32_to_cpu(node.i.data_crc));
+		printk(JFFS2_DBG "node_crc:\t%#08x\n", je32_to_cpu(node.i.node_crc));
+
+		crc = crc32(0, &node.i, sizeof(node.i) - 8);
+		if (crc != je32_to_cpu(node.i.node_crc)) {
+			JFFS2_ERROR("wrong node header CRC.\n");
+			return;
+		}
+		break;
+
+	case JFFS2_NODETYPE_DIRENT:
+
+		printk(JFFS2_DBG "the node is dirent node\n");
+		printk(JFFS2_DBG "pino:\t%#08x\n", je32_to_cpu(node.d.pino));
+		printk(JFFS2_DBG "version:\t%#08x\n", je32_to_cpu(node.d.version));
+		printk(JFFS2_DBG "ino:\t%#08x\n", je32_to_cpu(node.d.ino));
+		printk(JFFS2_DBG "mctime:\t%#08x\n", je32_to_cpu(node.d.mctime));
+		printk(JFFS2_DBG "nsize:\t%#02x\n", node.d.nsize);
+		printk(JFFS2_DBG "type:\t%#02x\n", node.d.type);
+		printk(JFFS2_DBG "node_crc:\t%#08x\n", je32_to_cpu(node.d.node_crc));
+		printk(JFFS2_DBG "name_crc:\t%#08x\n", je32_to_cpu(node.d.name_crc));
+
+		node.d.name[node.d.nsize] = '\0';
+		printk(JFFS2_DBG "name:\t\"%s\"\n", node.d.name);
+
+		crc = crc32(0, &node.d, sizeof(node.d) - 8);
+		if (crc != je32_to_cpu(node.d.node_crc)) {
+			JFFS2_ERROR("wrong node header CRC.\n");
+			return;
+		}
+		break;
+
+	default:
+		printk(JFFS2_DBG "node type is unknown\n");
+		break;
+	}
+}
+#endif /* JFFS2_DBG_DUMPS || JFFS2_DBG_PARANOIA_CHECKS */
diff --git a/kernel/fs/jffs2/debug.h b/kernel/fs/jffs2/debug.h
new file mode 100644
index 000000000..4fd9be4cb
--- /dev/null
+++ b/kernel/fs/jffs2/debug.h
@@ -0,0 +1,275 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#ifndef _JFFS2_DEBUG_H_
+#define _JFFS2_DEBUG_H_
+
+#include <linux/sched.h>
+
+#ifndef CONFIG_JFFS2_FS_DEBUG
+#define CONFIG_JFFS2_FS_DEBUG 0
+#endif
+
+#if CONFIG_JFFS2_FS_DEBUG > 0
+/* Enable "paranoia" checks and dumps */
+#define JFFS2_DBG_PARANOIA_CHECKS
+#define JFFS2_DBG_DUMPS
+
+/*
+ * By defining/undefining the below macros one may select debugging messages
+ * fro specific JFFS2 subsystems.
+ */
+#define JFFS2_DBG_READINODE_MESSAGES
+#define JFFS2_DBG_FRAGTREE_MESSAGES
+#define JFFS2_DBG_DENTLIST_MESSAGES
+#define JFFS2_DBG_NODEREF_MESSAGES
+#define JFFS2_DBG_INOCACHE_MESSAGES
+#define JFFS2_DBG_SUMMARY_MESSAGES
+#define JFFS2_DBG_FSBUILD_MESSAGES
+#endif
+
+#if CONFIG_JFFS2_FS_DEBUG > 1
+#define JFFS2_DBG_FRAGTREE2_MESSAGES
+#define JFFS2_DBG_READINODE2_MESSAGES
+#define JFFS2_DBG_MEMALLOC_MESSAGES
+#endif
+
+/* Sanity checks are supposed to be light-weight and enabled by default */
+#define JFFS2_DBG_SANITY_CHECKS
+
+/*
+ * Dx() are mainly used for debugging messages, they must go away and be
+ * superseded by nicer dbg_xxx() macros...
+ */
+#if CONFIG_JFFS2_FS_DEBUG > 0
+#define DEBUG
+#define D1(x) x
+#else
+#define D1(x)
+#endif
+
+#if CONFIG_JFFS2_FS_DEBUG > 1
+#define D2(x) x
+#else
+#define D2(x)
+#endif
+
+#define jffs2_dbg(level, fmt, ...)		\
+do {						\
+	if (CONFIG_JFFS2_FS_DEBUG >= level)	\
+		pr_debug(fmt, ##__VA_ARGS__);	\
+} while (0)
+
+/* The prefixes of JFFS2 messages */
+#define JFFS2_DBG		KERN_DEBUG
+#define JFFS2_DBG_PREFIX	"[JFFS2 DBG]"
+#define JFFS2_DBG_MSG_PREFIX	JFFS2_DBG JFFS2_DBG_PREFIX
+
+/* JFFS2 message macros */
+#define JFFS2_ERROR(fmt, ...)					\
+	pr_err("error: (%d) %s: " fmt,				\
+	       task_pid_nr(current), __func__, ##__VA_ARGS__)
+
+#define JFFS2_WARNING(fmt, ...)						\
+	pr_warn("warning: (%d) %s: " fmt,				\
+		task_pid_nr(current), __func__, ##__VA_ARGS__)
+
+#define JFFS2_NOTICE(fmt, ...)						\
+	pr_notice("notice: (%d) %s: " fmt,				\
+		  task_pid_nr(current), __func__, ##__VA_ARGS__)
+
+#define JFFS2_DEBUG(fmt, ...)						\
+	printk(KERN_DEBUG "[JFFS2 DBG] (%d) %s: " fmt,			\
+	       task_pid_nr(current), __func__, ##__VA_ARGS__)
+
+/*
+ * We split our debugging messages on several parts, depending on the JFFS2
+ * subsystem the message belongs to.
+ */
+/* Read inode debugging messages */
+#ifdef JFFS2_DBG_READINODE_MESSAGES
+#define dbg_readinode(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_readinode(fmt, ...)
+#endif
+#ifdef JFFS2_DBG_READINODE2_MESSAGES
+#define dbg_readinode2(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_readinode2(fmt, ...)
+#endif
+
+/* Fragtree build debugging messages */
+#ifdef JFFS2_DBG_FRAGTREE_MESSAGES
+#define dbg_fragtree(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_fragtree(fmt, ...)
+#endif
+#ifdef JFFS2_DBG_FRAGTREE2_MESSAGES
+#define dbg_fragtree2(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_fragtree2(fmt, ...)
+#endif
+
+/* Directory entry list manilulation debugging messages */
+#ifdef JFFS2_DBG_DENTLIST_MESSAGES
+#define dbg_dentlist(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_dentlist(fmt, ...)
+#endif
+
+/* Print the messages about manipulating node_refs */
+#ifdef JFFS2_DBG_NODEREF_MESSAGES
+#define dbg_noderef(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_noderef(fmt, ...)
+#endif
+
+/* Manipulations with the list of inodes (JFFS2 inocache) */
+#ifdef JFFS2_DBG_INOCACHE_MESSAGES
+#define dbg_inocache(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_inocache(fmt, ...)
+#endif
+
+/* Summary debugging messages */
+#ifdef JFFS2_DBG_SUMMARY_MESSAGES
+#define dbg_summary(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_summary(fmt, ...)
+#endif
+
+/* File system build messages */
+#ifdef JFFS2_DBG_FSBUILD_MESSAGES
+#define dbg_fsbuild(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_fsbuild(fmt, ...)
+#endif
+
+/* Watch the object allocations */
+#ifdef JFFS2_DBG_MEMALLOC_MESSAGES
+#define dbg_memalloc(fmt, ...)	JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_memalloc(fmt, ...)
+#endif
+
+/* Watch the XATTR subsystem */
+#ifdef JFFS2_DBG_XATTR_MESSAGES
+#define dbg_xattr(fmt, ...)  JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_xattr(fmt, ...)
+#endif 
+
+/* "Sanity" checks */
+void
+__jffs2_dbg_acct_sanity_check_nolock(struct jffs2_sb_info *c,
+				     struct jffs2_eraseblock *jeb);
+void
+__jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c,
+			      struct jffs2_eraseblock *jeb);
+
+/* "Paranoia" checks */
+void
+__jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f);
+void
+__jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f);
+void
+__jffs2_dbg_acct_paranoia_check(struct jffs2_sb_info *c,
+			   	struct jffs2_eraseblock *jeb);
+void
+__jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c,
+				       struct jffs2_eraseblock *jeb);
+void
+__jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c,
+				    uint32_t ofs, int len);
+
+/* "Dump" functions */
+void
+__jffs2_dbg_dump_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+void
+__jffs2_dbg_dump_jeb_nolock(struct jffs2_eraseblock *jeb);
+void
+__jffs2_dbg_dump_block_lists(struct jffs2_sb_info *c);
+void
+__jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c);
+void
+__jffs2_dbg_dump_node_refs(struct jffs2_sb_info *c,
+		 	   struct jffs2_eraseblock *jeb);
+void
+__jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c,
+				  struct jffs2_eraseblock *jeb);
+void
+__jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f);
+void
+__jffs2_dbg_dump_fragtree_nolock(struct jffs2_inode_info *f);
+void
+__jffs2_dbg_dump_buffer(unsigned char *buf, int len, uint32_t offs);
+void
+__jffs2_dbg_dump_node(struct jffs2_sb_info *c, uint32_t ofs);
+
+#ifdef JFFS2_DBG_PARANOIA_CHECKS
+#define jffs2_dbg_fragtree_paranoia_check(f)			\
+	__jffs2_dbg_fragtree_paranoia_check(f)
+#define jffs2_dbg_fragtree_paranoia_check_nolock(f)		\
+	__jffs2_dbg_fragtree_paranoia_check_nolock(f)
+#define jffs2_dbg_acct_paranoia_check(c, jeb)			\
+	__jffs2_dbg_acct_paranoia_check(c,jeb)
+#define jffs2_dbg_acct_paranoia_check_nolock(c, jeb)		\
+	__jffs2_dbg_acct_paranoia_check_nolock(c,jeb)
+#define jffs2_dbg_prewrite_paranoia_check(c, ofs, len)		\
+	__jffs2_dbg_prewrite_paranoia_check(c, ofs, len)
+#else
+#define jffs2_dbg_fragtree_paranoia_check(f)
+#define jffs2_dbg_fragtree_paranoia_check_nolock(f)
+#define jffs2_dbg_acct_paranoia_check(c, jeb)
+#define jffs2_dbg_acct_paranoia_check_nolock(c, jeb)
+#define jffs2_dbg_prewrite_paranoia_check(c, ofs, len)
+#endif /* !JFFS2_PARANOIA_CHECKS */
+
+#ifdef JFFS2_DBG_DUMPS
+#define jffs2_dbg_dump_jeb(c, jeb)				\
+	__jffs2_dbg_dump_jeb(c, jeb);
+#define jffs2_dbg_dump_jeb_nolock(jeb)				\
+	__jffs2_dbg_dump_jeb_nolock(jeb);
+#define jffs2_dbg_dump_block_lists(c)				\
+	__jffs2_dbg_dump_block_lists(c)
+#define jffs2_dbg_dump_block_lists_nolock(c)			\
+	__jffs2_dbg_dump_block_lists_nolock(c)
+#define jffs2_dbg_dump_fragtree(f)				\
+	__jffs2_dbg_dump_fragtree(f);
+#define jffs2_dbg_dump_fragtree_nolock(f)			\
+	__jffs2_dbg_dump_fragtree_nolock(f);
+#define jffs2_dbg_dump_buffer(buf, len, offs)			\
+	__jffs2_dbg_dump_buffer(*buf, len, offs);
+#define jffs2_dbg_dump_node(c, ofs)				\
+	__jffs2_dbg_dump_node(c, ofs);
+#else
+#define jffs2_dbg_dump_jeb(c, jeb)
+#define jffs2_dbg_dump_jeb_nolock(jeb)
+#define jffs2_dbg_dump_block_lists(c)
+#define jffs2_dbg_dump_block_lists_nolock(c)
+#define jffs2_dbg_dump_fragtree(f)
+#define jffs2_dbg_dump_fragtree_nolock(f)
+#define jffs2_dbg_dump_buffer(buf, len, offs)
+#define jffs2_dbg_dump_node(c, ofs)
+#endif /* !JFFS2_DBG_DUMPS */
+
+#ifdef JFFS2_DBG_SANITY_CHECKS
+#define jffs2_dbg_acct_sanity_check(c, jeb)			\
+	__jffs2_dbg_acct_sanity_check(c, jeb)
+#define jffs2_dbg_acct_sanity_check_nolock(c, jeb)		\
+	__jffs2_dbg_acct_sanity_check_nolock(c, jeb)
+#else
+#define jffs2_dbg_acct_sanity_check(c, jeb)
+#define jffs2_dbg_acct_sanity_check_nolock(c, jeb)
+#endif /* !JFFS2_DBG_SANITY_CHECKS */
+
+#endif /* _JFFS2_DEBUG_H_ */
diff --git a/kernel/fs/jffs2/dir.c b/kernel/fs/jffs2/dir.c
new file mode 100644
index 000000000..1ba5c9794
--- /dev/null
+++ b/kernel/fs/jffs2/dir.c
@@ -0,0 +1,862 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include "jffs2_fs_i.h"
+#include "jffs2_fs_sb.h"
+#include <linux/time.h>
+#include "nodelist.h"
+
+static int jffs2_readdir (struct file *, struct dir_context *);
+
+static int jffs2_create (struct inode *,struct dentry *,umode_t,
+			 bool);
+static struct dentry *jffs2_lookup (struct inode *,struct dentry *,
+				    unsigned int);
+static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
+static int jffs2_unlink (struct inode *,struct dentry *);
+static int jffs2_symlink (struct inode *,struct dentry *,const char *);
+static int jffs2_mkdir (struct inode *,struct dentry *,umode_t);
+static int jffs2_rmdir (struct inode *,struct dentry *);
+static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t);
+static int jffs2_rename (struct inode *, struct dentry *,
+			 struct inode *, struct dentry *);
+
+const struct file_operations jffs2_dir_operations =
+{
+	.read =		generic_read_dir,
+	.iterate =	jffs2_readdir,
+	.unlocked_ioctl=jffs2_ioctl,
+	.fsync =	jffs2_fsync,
+	.llseek =	generic_file_llseek,
+};
+
+
+const struct inode_operations jffs2_dir_inode_operations =
+{
+	.create =	jffs2_create,
+	.lookup =	jffs2_lookup,
+	.link =		jffs2_link,
+	.unlink =	jffs2_unlink,
+	.symlink =	jffs2_symlink,
+	.mkdir =	jffs2_mkdir,
+	.rmdir =	jffs2_rmdir,
+	.mknod =	jffs2_mknod,
+	.rename =	jffs2_rename,
+	.get_acl =	jffs2_get_acl,
+	.set_acl =	jffs2_set_acl,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
+};
+
+/***********************************************************************/
+
+
+/* We keep the dirent list sorted in increasing order of name hash,
+   and we use the same hash function as the dentries. Makes this
+   nice and simple
+*/
+static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
+				   unsigned int flags)
+{
+	struct jffs2_inode_info *dir_f;
+	struct jffs2_full_dirent *fd = NULL, *fd_list;
+	uint32_t ino = 0;
+	struct inode *inode = NULL;
+
+	jffs2_dbg(1, "jffs2_lookup()\n");
+
+	if (target->d_name.len > JFFS2_MAX_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	dir_f = JFFS2_INODE_INFO(dir_i);
+
+	mutex_lock(&dir_f->sem);
+
+	/* NB: The 2.2 backport will need to explicitly check for '.' and '..' here */
+	for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) {
+		if (fd_list->nhash == target->d_name.hash &&
+		    (!fd || fd_list->version > fd->version) &&
+		    strlen(fd_list->name) == target->d_name.len &&
+		    !strncmp(fd_list->name, target->d_name.name, target->d_name.len)) {
+			fd = fd_list;
+		}
+	}
+	if (fd)
+		ino = fd->ino;
+	mutex_unlock(&dir_f->sem);
+	if (ino) {
+		inode = jffs2_iget(dir_i->i_sb, ino);
+		if (IS_ERR(inode))
+			pr_warn("iget() failed for ino #%u\n", ino);
+	}
+
+	return d_splice_alias(inode, target);
+}
+
+/***********************************************************************/
+
+
+static int jffs2_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_full_dirent *fd;
+	unsigned long curofs = 1;
+
+	jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino);
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	mutex_lock(&f->sem);
+	for (fd = f->dents; fd; fd = fd->next) {
+		curofs++;
+		/* First loop: curofs = 2; pos = 2 */
+		if (curofs < ctx->pos) {
+			jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
+				  fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos);
+			continue;
+		}
+		if (!fd->ino) {
+			jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
+				  fd->name);
+			ctx->pos++;
+			continue;
+		}
+		jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
+			  (unsigned long)ctx->pos, fd->name, fd->ino, fd->type);
+		if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type))
+			break;
+		ctx->pos++;
+	}
+	mutex_unlock(&f->sem);
+	return 0;
+}
+
+/***********************************************************************/
+
+
+static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
+			umode_t mode, bool excl)
+{
+	struct jffs2_raw_inode *ri;
+	struct jffs2_inode_info *f, *dir_f;
+	struct jffs2_sb_info *c;
+	struct inode *inode;
+	int ret;
+
+	ri = jffs2_alloc_raw_inode();
+	if (!ri)
+		return -ENOMEM;
+
+	c = JFFS2_SB_INFO(dir_i->i_sb);
+
+	jffs2_dbg(1, "%s()\n", __func__);
+
+	inode = jffs2_new_inode(dir_i, mode, ri);
+
+	if (IS_ERR(inode)) {
+		jffs2_dbg(1, "jffs2_new_inode() failed\n");
+		jffs2_free_raw_inode(ri);
+		return PTR_ERR(inode);
+	}
+
+	inode->i_op = &jffs2_file_inode_operations;
+	inode->i_fop = &jffs2_file_operations;
+	inode->i_mapping->a_ops = &jffs2_file_address_operations;
+	inode->i_mapping->nrpages = 0;
+
+	f = JFFS2_INODE_INFO(inode);
+	dir_f = JFFS2_INODE_INFO(dir_i);
+
+	/* jffs2_do_create() will want to lock it, _after_ reserving
+	   space and taking c-alloc_sem. If we keep it locked here,
+	   lockdep gets unhappy (although it's a false positive;
+	   nothing else will be looking at this inode yet so there's
+	   no chance of AB-BA deadlock involving its f->sem). */
+	mutex_unlock(&f->sem);
+
+	ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name);
+	if (ret)
+		goto fail;
+
+	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
+
+	jffs2_free_raw_inode(ri);
+
+	jffs2_dbg(1, "%s(): Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
+		  __func__, inode->i_ino, inode->i_mode, inode->i_nlink,
+		  f->inocache->pino_nlink, inode->i_mapping->nrpages);
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+ fail:
+	iget_failed(inode);
+	jffs2_free_raw_inode(ri);
+	return ret;
+}
+
+/***********************************************************************/
+
+
+static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb);
+	struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i);
+	struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(d_inode(dentry));
+	int ret;
+	uint32_t now = get_seconds();
+
+	ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
+			      dentry->d_name.len, dead_f, now);
+	if (dead_f->inocache)
+		set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink);
+	if (!ret)
+		dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+	return ret;
+}
+/***********************************************************************/
+
+
+static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb);
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry));
+	struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i);
+	int ret;
+	uint8_t type;
+	uint32_t now;
+
+	/* Don't let people make hard links to bad inodes. */
+	if (!f->inocache)
+		return -EIO;
+
+	if (d_is_dir(old_dentry))
+		return -EPERM;
+
+	/* XXX: This is ugly */
+	type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12;
+	if (!type) type = DT_REG;
+
+	now = get_seconds();
+	ret = jffs2_do_link(c, dir_f, f->inocache->ino, type, dentry->d_name.name, dentry->d_name.len, now);
+
+	if (!ret) {
+		mutex_lock(&f->sem);
+		set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink);
+		mutex_unlock(&f->sem);
+		d_instantiate(dentry, d_inode(old_dentry));
+		dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+		ihold(d_inode(old_dentry));
+	}
+	return ret;
+}
+
+/***********************************************************************/
+
+static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char *target)
+{
+	struct jffs2_inode_info *f, *dir_f;
+	struct jffs2_sb_info *c;
+	struct inode *inode;
+	struct jffs2_raw_inode *ri;
+	struct jffs2_raw_dirent *rd;
+	struct jffs2_full_dnode *fn;
+	struct jffs2_full_dirent *fd;
+	int namelen;
+	uint32_t alloclen;
+	int ret, targetlen = strlen(target);
+
+	/* FIXME: If you care. We'd need to use frags for the target
+	   if it grows much more than this */
+	if (targetlen > 254)
+		return -ENAMETOOLONG;
+
+	ri = jffs2_alloc_raw_inode();
+
+	if (!ri)
+		return -ENOMEM;
+
+	c = JFFS2_SB_INFO(dir_i->i_sb);
+
+	/* Try to reserve enough space for both node and dirent.
+	 * Just the node will do for now, though
+	 */
+	namelen = dentry->d_name.len;
+	ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+
+	if (ret) {
+		jffs2_free_raw_inode(ri);
+		return ret;
+	}
+
+	inode = jffs2_new_inode(dir_i, S_IFLNK | S_IRWXUGO, ri);
+
+	if (IS_ERR(inode)) {
+		jffs2_free_raw_inode(ri);
+		jffs2_complete_reservation(c);
+		return PTR_ERR(inode);
+	}
+
+	inode->i_op = &jffs2_symlink_inode_operations;
+
+	f = JFFS2_INODE_INFO(inode);
+
+	inode->i_size = targetlen;
+	ri->isize = ri->dsize = ri->csize = cpu_to_je32(inode->i_size);
+	ri->totlen = cpu_to_je32(sizeof(*ri) + inode->i_size);
+	ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4));
+
+	ri->compr = JFFS2_COMPR_NONE;
+	ri->data_crc = cpu_to_je32(crc32(0, target, targetlen));
+	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
+
+	fn = jffs2_write_dnode(c, f, ri, target, targetlen, ALLOC_NORMAL);
+
+	jffs2_free_raw_inode(ri);
+
+	if (IS_ERR(fn)) {
+		/* Eeek. Wave bye bye */
+		mutex_unlock(&f->sem);
+		jffs2_complete_reservation(c);
+		ret = PTR_ERR(fn);
+		goto fail;
+	}
+
+	/* We use f->target field to store the target path. */
+	f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
+	if (!f->target) {
+		pr_warn("Can't allocate %d bytes of memory\n", targetlen + 1);
+		mutex_unlock(&f->sem);
+		jffs2_complete_reservation(c);
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	jffs2_dbg(1, "%s(): symlink's target '%s' cached\n",
+		  __func__, (char *)f->target);
+
+	/* No data here. Only a metadata node, which will be
+	   obsoleted by the first data write
+	*/
+	f->metadata = fn;
+	mutex_unlock(&f->sem);
+
+	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
+	if (ret)
+		goto fail;
+
+	ret = jffs2_init_acl_post(inode);
+	if (ret)
+		goto fail;
+
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+	if (ret)
+		goto fail;
+
+	rd = jffs2_alloc_raw_dirent();
+	if (!rd) {
+		/* Argh. Now we treat it like a normal delete */
+		jffs2_complete_reservation(c);
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	dir_f = JFFS2_INODE_INFO(dir_i);
+	mutex_lock(&dir_f->sem);
+
+	rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
+	rd->totlen = cpu_to_je32(sizeof(*rd) + namelen);
+	rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
+
+	rd->pino = cpu_to_je32(dir_i->i_ino);
+	rd->version = cpu_to_je32(++dir_f->highest_version);
+	rd->ino = cpu_to_je32(inode->i_ino);
+	rd->mctime = cpu_to_je32(get_seconds());
+	rd->nsize = namelen;
+	rd->type = DT_LNK;
+	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
+	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
+
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
+
+	if (IS_ERR(fd)) {
+		/* dirent failed to write. Delete the inode normally
+		   as if it were the final unlink() */
+		jffs2_complete_reservation(c);
+		jffs2_free_raw_dirent(rd);
+		mutex_unlock(&dir_f->sem);
+		ret = PTR_ERR(fd);
+		goto fail;
+	}
+
+	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+
+	jffs2_free_raw_dirent(rd);
+
+	/* Link the fd into the inode's list, obsoleting an old
+	   one if necessary. */
+	jffs2_add_fd_to_list(c, fd, &dir_f->dents);
+
+	mutex_unlock(&dir_f->sem);
+	jffs2_complete_reservation(c);
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+ fail:
+	iget_failed(inode);
+	return ret;
+}
+
+
+static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode)
+{
+	struct jffs2_inode_info *f, *dir_f;
+	struct jffs2_sb_info *c;
+	struct inode *inode;
+	struct jffs2_raw_inode *ri;
+	struct jffs2_raw_dirent *rd;
+	struct jffs2_full_dnode *fn;
+	struct jffs2_full_dirent *fd;
+	int namelen;
+	uint32_t alloclen;
+	int ret;
+
+	mode |= S_IFDIR;
+
+	ri = jffs2_alloc_raw_inode();
+	if (!ri)
+		return -ENOMEM;
+
+	c = JFFS2_SB_INFO(dir_i->i_sb);
+
+	/* Try to reserve enough space for both node and dirent.
+	 * Just the node will do for now, though
+	 */
+	namelen = dentry->d_name.len;
+	ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
+				  JFFS2_SUMMARY_INODE_SIZE);
+
+	if (ret) {
+		jffs2_free_raw_inode(ri);
+		return ret;
+	}
+
+	inode = jffs2_new_inode(dir_i, mode, ri);
+
+	if (IS_ERR(inode)) {
+		jffs2_free_raw_inode(ri);
+		jffs2_complete_reservation(c);
+		return PTR_ERR(inode);
+	}
+
+	inode->i_op = &jffs2_dir_inode_operations;
+	inode->i_fop = &jffs2_dir_operations;
+
+	f = JFFS2_INODE_INFO(inode);
+
+	/* Directories get nlink 2 at start */
+	set_nlink(inode, 2);
+	/* but ic->pino_nlink is the parent ino# */
+	f->inocache->pino_nlink = dir_i->i_ino;
+
+	ri->data_crc = cpu_to_je32(0);
+	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
+
+	fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
+
+	jffs2_free_raw_inode(ri);
+
+	if (IS_ERR(fn)) {
+		/* Eeek. Wave bye bye */
+		mutex_unlock(&f->sem);
+		jffs2_complete_reservation(c);
+		ret = PTR_ERR(fn);
+		goto fail;
+	}
+	/* No data here. Only a metadata node, which will be
+	   obsoleted by the first data write
+	*/
+	f->metadata = fn;
+	mutex_unlock(&f->sem);
+
+	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
+	if (ret)
+		goto fail;
+
+	ret = jffs2_init_acl_post(inode);
+	if (ret)
+		goto fail;
+
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+	if (ret)
+		goto fail;
+
+	rd = jffs2_alloc_raw_dirent();
+	if (!rd) {
+		/* Argh. Now we treat it like a normal delete */
+		jffs2_complete_reservation(c);
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	dir_f = JFFS2_INODE_INFO(dir_i);
+	mutex_lock(&dir_f->sem);
+
+	rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
+	rd->totlen = cpu_to_je32(sizeof(*rd) + namelen);
+	rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
+
+	rd->pino = cpu_to_je32(dir_i->i_ino);
+	rd->version = cpu_to_je32(++dir_f->highest_version);
+	rd->ino = cpu_to_je32(inode->i_ino);
+	rd->mctime = cpu_to_je32(get_seconds());
+	rd->nsize = namelen;
+	rd->type = DT_DIR;
+	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
+	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
+
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
+
+	if (IS_ERR(fd)) {
+		/* dirent failed to write. Delete the inode normally
+		   as if it were the final unlink() */
+		jffs2_complete_reservation(c);
+		jffs2_free_raw_dirent(rd);
+		mutex_unlock(&dir_f->sem);
+		ret = PTR_ERR(fd);
+		goto fail;
+	}
+
+	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+	inc_nlink(dir_i);
+
+	jffs2_free_raw_dirent(rd);
+
+	/* Link the fd into the inode's list, obsoleting an old
+	   one if necessary. */
+	jffs2_add_fd_to_list(c, fd, &dir_f->dents);
+
+	mutex_unlock(&dir_f->sem);
+	jffs2_complete_reservation(c);
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+ fail:
+	iget_failed(inode);
+	return ret;
+}
+
+static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb);
+	struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i);
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry));
+	struct jffs2_full_dirent *fd;
+	int ret;
+	uint32_t now = get_seconds();
+
+	for (fd = f->dents ; fd; fd = fd->next) {
+		if (fd->ino)
+			return -ENOTEMPTY;
+	}
+
+	ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
+			      dentry->d_name.len, f, now);
+	if (!ret) {
+		dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+		clear_nlink(d_inode(dentry));
+		drop_nlink(dir_i);
+	}
+	return ret;
+}
+
+static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	struct jffs2_inode_info *f, *dir_f;
+	struct jffs2_sb_info *c;
+	struct inode *inode;
+	struct jffs2_raw_inode *ri;
+	struct jffs2_raw_dirent *rd;
+	struct jffs2_full_dnode *fn;
+	struct jffs2_full_dirent *fd;
+	int namelen;
+	union jffs2_device_node dev;
+	int devlen = 0;
+	uint32_t alloclen;
+	int ret;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	ri = jffs2_alloc_raw_inode();
+	if (!ri)
+		return -ENOMEM;
+
+	c = JFFS2_SB_INFO(dir_i->i_sb);
+
+	if (S_ISBLK(mode) || S_ISCHR(mode))
+		devlen = jffs2_encode_dev(&dev, rdev);
+
+	/* Try to reserve enough space for both node and dirent.
+	 * Just the node will do for now, though
+	 */
+	namelen = dentry->d_name.len;
+	ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+
+	if (ret) {
+		jffs2_free_raw_inode(ri);
+		return ret;
+	}
+
+	inode = jffs2_new_inode(dir_i, mode, ri);
+
+	if (IS_ERR(inode)) {
+		jffs2_free_raw_inode(ri);
+		jffs2_complete_reservation(c);
+		return PTR_ERR(inode);
+	}
+	inode->i_op = &jffs2_file_inode_operations;
+	init_special_inode(inode, inode->i_mode, rdev);
+
+	f = JFFS2_INODE_INFO(inode);
+
+	ri->dsize = ri->csize = cpu_to_je32(devlen);
+	ri->totlen = cpu_to_je32(sizeof(*ri) + devlen);
+	ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4));
+
+	ri->compr = JFFS2_COMPR_NONE;
+	ri->data_crc = cpu_to_je32(crc32(0, &dev, devlen));
+	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
+
+	fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, ALLOC_NORMAL);
+
+	jffs2_free_raw_inode(ri);
+
+	if (IS_ERR(fn)) {
+		/* Eeek. Wave bye bye */
+		mutex_unlock(&f->sem);
+		jffs2_complete_reservation(c);
+		ret = PTR_ERR(fn);
+		goto fail;
+	}
+	/* No data here. Only a metadata node, which will be
+	   obsoleted by the first data write
+	*/
+	f->metadata = fn;
+	mutex_unlock(&f->sem);
+
+	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
+	if (ret)
+		goto fail;
+
+	ret = jffs2_init_acl_post(inode);
+	if (ret)
+		goto fail;
+
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+	if (ret)
+		goto fail;
+
+	rd = jffs2_alloc_raw_dirent();
+	if (!rd) {
+		/* Argh. Now we treat it like a normal delete */
+		jffs2_complete_reservation(c);
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	dir_f = JFFS2_INODE_INFO(dir_i);
+	mutex_lock(&dir_f->sem);
+
+	rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
+	rd->totlen = cpu_to_je32(sizeof(*rd) + namelen);
+	rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
+
+	rd->pino = cpu_to_je32(dir_i->i_ino);
+	rd->version = cpu_to_je32(++dir_f->highest_version);
+	rd->ino = cpu_to_je32(inode->i_ino);
+	rd->mctime = cpu_to_je32(get_seconds());
+	rd->nsize = namelen;
+
+	/* XXX: This is ugly. */
+	rd->type = (mode & S_IFMT) >> 12;
+
+	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
+	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
+
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
+
+	if (IS_ERR(fd)) {
+		/* dirent failed to write. Delete the inode normally
+		   as if it were the final unlink() */
+		jffs2_complete_reservation(c);
+		jffs2_free_raw_dirent(rd);
+		mutex_unlock(&dir_f->sem);
+		ret = PTR_ERR(fd);
+		goto fail;
+	}
+
+	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+
+	jffs2_free_raw_dirent(rd);
+
+	/* Link the fd into the inode's list, obsoleting an old
+	   one if necessary. */
+	jffs2_add_fd_to_list(c, fd, &dir_f->dents);
+
+	mutex_unlock(&dir_f->sem);
+	jffs2_complete_reservation(c);
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+ fail:
+	iget_failed(inode);
+	return ret;
+}
+
+static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
+			 struct inode *new_dir_i, struct dentry *new_dentry)
+{
+	int ret;
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dir_i->i_sb);
+	struct jffs2_inode_info *victim_f = NULL;
+	uint8_t type;
+	uint32_t now;
+
+	/* The VFS will check for us and prevent trying to rename a
+	 * file over a directory and vice versa, but if it's a directory,
+	 * the VFS can't check whether the victim is empty. The filesystem
+	 * needs to do that for itself.
+	 */
+	if (d_really_is_positive(new_dentry)) {
+		victim_f = JFFS2_INODE_INFO(d_inode(new_dentry));
+		if (d_is_dir(new_dentry)) {
+			struct jffs2_full_dirent *fd;
+
+			mutex_lock(&victim_f->sem);
+			for (fd = victim_f->dents; fd; fd = fd->next) {
+				if (fd->ino) {
+					mutex_unlock(&victim_f->sem);
+					return -ENOTEMPTY;
+				}
+			}
+			mutex_unlock(&victim_f->sem);
+		}
+	}
+
+	/* XXX: We probably ought to alloc enough space for
+	   both nodes at the same time. Writing the new link,
+	   then getting -ENOSPC, is quite bad :)
+	*/
+
+	/* Make a hard link */
+
+	/* XXX: This is ugly */
+	type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12;
+	if (!type) type = DT_REG;
+
+	now = get_seconds();
+	ret = jffs2_do_link(c, JFFS2_INODE_INFO(new_dir_i),
+			    d_inode(old_dentry)->i_ino, type,
+			    new_dentry->d_name.name, new_dentry->d_name.len, now);
+
+	if (ret)
+		return ret;
+
+	if (victim_f) {
+		/* There was a victim. Kill it off nicely */
+		if (d_is_dir(new_dentry))
+			clear_nlink(d_inode(new_dentry));
+		else
+			drop_nlink(d_inode(new_dentry));
+		/* Don't oops if the victim was a dirent pointing to an
+		   inode which didn't exist. */
+		if (victim_f->inocache) {
+			mutex_lock(&victim_f->sem);
+			if (d_is_dir(new_dentry))
+				victim_f->inocache->pino_nlink = 0;
+			else
+				victim_f->inocache->pino_nlink--;
+			mutex_unlock(&victim_f->sem);
+		}
+	}
+
+	/* If it was a directory we moved, and there was no victim,
+	   increase i_nlink on its new parent */
+	if (d_is_dir(old_dentry) && !victim_f)
+		inc_nlink(new_dir_i);
+
+	/* Unlink the original */
+	ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i),
+			      old_dentry->d_name.name, old_dentry->d_name.len, NULL, now);
+
+	/* We don't touch inode->i_nlink */
+
+	if (ret) {
+		/* Oh shit. We really ought to make a single node which can do both atomically */
+		struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry));
+		mutex_lock(&f->sem);
+		inc_nlink(d_inode(old_dentry));
+		if (f->inocache && !d_is_dir(old_dentry))
+			f->inocache->pino_nlink++;
+		mutex_unlock(&f->sem);
+
+		pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
+			  __func__, ret);
+		/* Might as well let the VFS know */
+		d_instantiate(new_dentry, d_inode(old_dentry));
+		ihold(d_inode(old_dentry));
+		new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
+		return ret;
+	}
+
+	if (d_is_dir(old_dentry))
+		drop_nlink(old_dir_i);
+
+	new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now);
+
+	return 0;
+}
+
diff --git a/kernel/fs/jffs2/erase.c b/kernel/fs/jffs2/erase.c
new file mode 100644
index 000000000..4a6cf289b
--- /dev/null
+++ b/kernel/fs/jffs2/erase.c
@@ -0,0 +1,515 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mtd/mtd.h>
+#include <linux/compiler.h>
+#include <linux/crc32.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include "nodelist.h"
+
+struct erase_priv_struct {
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_sb_info *c;
+};
+
+#ifndef __ECOS
+static void jffs2_erase_callback(struct erase_info *);
+#endif
+static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset);
+static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+
+static void jffs2_erase_block(struct jffs2_sb_info *c,
+			      struct jffs2_eraseblock *jeb)
+{
+	int ret;
+	uint32_t bad_offset;
+#ifdef __ECOS
+       ret = jffs2_flash_erase(c, jeb);
+       if (!ret) {
+	       jffs2_erase_succeeded(c, jeb);
+	       return;
+       }
+       bad_offset = jeb->offset;
+#else /* Linux */
+	struct erase_info *instr;
+
+	jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n",
+		  __func__,
+		  jeb->offset, jeb->offset, jeb->offset + c->sector_size);
+	instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL);
+	if (!instr) {
+		pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
+		mutex_lock(&c->erase_free_sem);
+		spin_lock(&c->erase_completion_lock);
+		list_move(&jeb->list, &c->erase_pending_list);
+		c->erasing_size -= c->sector_size;
+		c->dirty_size += c->sector_size;
+		jeb->dirty_size = c->sector_size;
+		spin_unlock(&c->erase_completion_lock);
+		mutex_unlock(&c->erase_free_sem);
+		return;
+	}
+
+	memset(instr, 0, sizeof(*instr));
+
+	instr->mtd = c->mtd;
+	instr->addr = jeb->offset;
+	instr->len = c->sector_size;
+	instr->callback = jffs2_erase_callback;
+	instr->priv = (unsigned long)(&instr[1]);
+
+	((struct erase_priv_struct *)instr->priv)->jeb = jeb;
+	((struct erase_priv_struct *)instr->priv)->c = c;
+
+	ret = mtd_erase(c->mtd, instr);
+	if (!ret)
+		return;
+
+	bad_offset = instr->fail_addr;
+	kfree(instr);
+#endif /* __ECOS */
+
+	if (ret == -ENOMEM || ret == -EAGAIN) {
+		/* Erase failed immediately. Refile it on the list */
+		jffs2_dbg(1, "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n",
+			  jeb->offset, ret);
+		mutex_lock(&c->erase_free_sem);
+		spin_lock(&c->erase_completion_lock);
+		list_move(&jeb->list, &c->erase_pending_list);
+		c->erasing_size -= c->sector_size;
+		c->dirty_size += c->sector_size;
+		jeb->dirty_size = c->sector_size;
+		spin_unlock(&c->erase_completion_lock);
+		mutex_unlock(&c->erase_free_sem);
+		return;
+	}
+
+	if (ret == -EROFS)
+		pr_warn("Erase at 0x%08x failed immediately: -EROFS. Is the sector locked?\n",
+			jeb->offset);
+	else
+		pr_warn("Erase at 0x%08x failed immediately: errno %d\n",
+			jeb->offset, ret);
+
+	jffs2_erase_failed(c, jeb, bad_offset);
+}
+
+int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
+{
+	struct jffs2_eraseblock *jeb;
+	int work_done = 0;
+
+	mutex_lock(&c->erase_free_sem);
+
+	spin_lock(&c->erase_completion_lock);
+
+	while (!list_empty(&c->erase_complete_list) ||
+	       !list_empty(&c->erase_pending_list)) {
+
+		if (!list_empty(&c->erase_complete_list)) {
+			jeb = list_entry(c->erase_complete_list.next, struct jffs2_eraseblock, list);
+			list_move(&jeb->list, &c->erase_checking_list);
+			spin_unlock(&c->erase_completion_lock);
+			mutex_unlock(&c->erase_free_sem);
+			jffs2_mark_erased_block(c, jeb);
+
+			work_done++;
+			if (!--count) {
+				jffs2_dbg(1, "Count reached. jffs2_erase_pending_blocks leaving\n");
+				goto done;
+			}
+
+		} else if (!list_empty(&c->erase_pending_list)) {
+			jeb = list_entry(c->erase_pending_list.next, struct jffs2_eraseblock, list);
+			jffs2_dbg(1, "Starting erase of pending block 0x%08x\n",
+				  jeb->offset);
+			list_del(&jeb->list);
+			c->erasing_size += c->sector_size;
+			c->wasted_size -= jeb->wasted_size;
+			c->free_size -= jeb->free_size;
+			c->used_size -= jeb->used_size;
+			c->dirty_size -= jeb->dirty_size;
+			jeb->wasted_size = jeb->used_size = jeb->dirty_size = jeb->free_size = 0;
+			jffs2_free_jeb_node_refs(c, jeb);
+			list_add(&jeb->list, &c->erasing_list);
+			spin_unlock(&c->erase_completion_lock);
+			mutex_unlock(&c->erase_free_sem);
+
+			jffs2_erase_block(c, jeb);
+
+		} else {
+			BUG();
+		}
+
+		/* Be nice */
+		cond_resched();
+		mutex_lock(&c->erase_free_sem);
+		spin_lock(&c->erase_completion_lock);
+	}
+
+	spin_unlock(&c->erase_completion_lock);
+	mutex_unlock(&c->erase_free_sem);
+ done:
+	jffs2_dbg(1, "jffs2_erase_pending_blocks completed\n");
+	return work_done;
+}
+
+static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	jffs2_dbg(1, "Erase completed successfully at 0x%08x\n", jeb->offset);
+	mutex_lock(&c->erase_free_sem);
+	spin_lock(&c->erase_completion_lock);
+	list_move_tail(&jeb->list, &c->erase_complete_list);
+	/* Wake the GC thread to mark them clean */
+	jffs2_garbage_collect_trigger(c);
+	spin_unlock(&c->erase_completion_lock);
+	mutex_unlock(&c->erase_free_sem);
+	wake_up(&c->erase_wait);
+}
+
+static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
+{
+	/* For NAND, if the failure did not occur at the device level for a
+	   specific physical page, don't bother updating the bad block table. */
+	if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) {
+		/* We had a device-level failure to erase.  Let's see if we've
+		   failed too many times. */
+		if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
+			/* We'd like to give this block another try. */
+			mutex_lock(&c->erase_free_sem);
+			spin_lock(&c->erase_completion_lock);
+			list_move(&jeb->list, &c->erase_pending_list);
+			c->erasing_size -= c->sector_size;
+			c->dirty_size += c->sector_size;
+			jeb->dirty_size = c->sector_size;
+			spin_unlock(&c->erase_completion_lock);
+			mutex_unlock(&c->erase_free_sem);
+			return;
+		}
+	}
+
+	mutex_lock(&c->erase_free_sem);
+	spin_lock(&c->erase_completion_lock);
+	c->erasing_size -= c->sector_size;
+	c->bad_size += c->sector_size;
+	list_move(&jeb->list, &c->bad_list);
+	c->nr_erasing_blocks--;
+	spin_unlock(&c->erase_completion_lock);
+	mutex_unlock(&c->erase_free_sem);
+	wake_up(&c->erase_wait);
+}
+
+#ifndef __ECOS
+static void jffs2_erase_callback(struct erase_info *instr)
+{
+	struct erase_priv_struct *priv = (void *)instr->priv;
+
+	if(instr->state != MTD_ERASE_DONE) {
+		pr_warn("Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
+			(unsigned long long)instr->addr, instr->state);
+		jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
+	} else {
+		jffs2_erase_succeeded(priv->c, priv->jeb);
+	}
+	kfree(instr);
+}
+#endif /* !__ECOS */
+
+/* Hmmm. Maybe we should accept the extra space it takes and make
+   this a standard doubly-linked list? */
+static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
+			struct jffs2_raw_node_ref *ref, struct jffs2_eraseblock *jeb)
+{
+	struct jffs2_inode_cache *ic = NULL;
+	struct jffs2_raw_node_ref **prev;
+
+	prev = &ref->next_in_ino;
+
+	/* Walk the inode's list once, removing any nodes from this eraseblock */
+	while (1) {
+		if (!(*prev)->next_in_ino) {
+			/* We're looking at the jffs2_inode_cache, which is
+			   at the end of the linked list. Stash it and continue
+			   from the beginning of the list */
+			ic = (struct jffs2_inode_cache *)(*prev);
+			prev = &ic->nodes;
+			continue;
+		}
+
+		if (SECTOR_ADDR((*prev)->flash_offset) == jeb->offset) {
+			/* It's in the block we're erasing */
+			struct jffs2_raw_node_ref *this;
+
+			this = *prev;
+			*prev = this->next_in_ino;
+			this->next_in_ino = NULL;
+
+			if (this == ref)
+				break;
+
+			continue;
+		}
+		/* Not to be deleted. Skip */
+		prev = &((*prev)->next_in_ino);
+	}
+
+	/* PARANOIA */
+	if (!ic) {
+		JFFS2_WARNING("inode_cache/xattr_datum/xattr_ref"
+			      " not found in remove_node_refs()!!\n");
+		return;
+	}
+
+	jffs2_dbg(1, "Removed nodes in range 0x%08x-0x%08x from ino #%u\n",
+		  jeb->offset, jeb->offset + c->sector_size, ic->ino);
+
+	D2({
+		int i=0;
+		struct jffs2_raw_node_ref *this;
+		printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n");
+
+		this = ic->nodes;
+
+		printk(KERN_DEBUG);
+		while(this) {
+			pr_cont("0x%08x(%d)->",
+			       ref_offset(this), ref_flags(this));
+			if (++i == 5) {
+				printk(KERN_DEBUG);
+				i=0;
+			}
+			this = this->next_in_ino;
+		}
+		pr_cont("\n");
+	});
+
+	switch (ic->class) {
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case RAWNODE_CLASS_XATTR_DATUM:
+			jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+			break;
+		case RAWNODE_CLASS_XATTR_REF:
+			jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+			break;
+#endif
+		default:
+			if (ic->nodes == (void *)ic && ic->pino_nlink == 0)
+				jffs2_del_ino_cache(c, ic);
+	}
+}
+
+void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	struct jffs2_raw_node_ref *block, *ref;
+	jffs2_dbg(1, "Freeing all node refs for eraseblock offset 0x%08x\n",
+		  jeb->offset);
+
+	block = ref = jeb->first_node;
+
+	while (ref) {
+		if (ref->flash_offset == REF_LINK_NODE) {
+			ref = ref->next_in_ino;
+			jffs2_free_refblock(block);
+			block = ref;
+			continue;
+		}
+		if (ref->flash_offset != REF_EMPTY_NODE && ref->next_in_ino)
+			jffs2_remove_node_refs_from_ino_list(c, ref, jeb);
+		/* else it was a non-inode node or already removed, so don't bother */
+
+		ref++;
+	}
+	jeb->first_node = jeb->last_node = NULL;
+}
+
+static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset)
+{
+	void *ebuf;
+	uint32_t ofs;
+	size_t retlen;
+	int ret;
+	unsigned long *wordebuf;
+
+	ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen,
+			&ebuf, NULL);
+	if (ret != -EOPNOTSUPP) {
+		if (ret) {
+			jffs2_dbg(1, "MTD point failed %d\n", ret);
+			goto do_flash_read;
+		}
+		if (retlen < c->sector_size) {
+			/* Don't muck about if it won't let us point to the whole erase sector */
+			jffs2_dbg(1, "MTD point returned len too short: 0x%zx\n",
+				  retlen);
+			mtd_unpoint(c->mtd, jeb->offset, retlen);
+			goto do_flash_read;
+		}
+		wordebuf = ebuf-sizeof(*wordebuf);
+		retlen /= sizeof(*wordebuf);
+		do {
+		   if (*++wordebuf != ~0)
+			   break;
+		} while(--retlen);
+		mtd_unpoint(c->mtd, jeb->offset, c->sector_size);
+		if (retlen) {
+			pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08tx\n",
+				*wordebuf,
+				jeb->offset +
+				c->sector_size-retlen * sizeof(*wordebuf));
+			return -EIO;
+		}
+		return 0;
+	}
+ do_flash_read:
+	ebuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!ebuf) {
+		pr_warn("Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n",
+			jeb->offset);
+		return -EAGAIN;
+	}
+
+	jffs2_dbg(1, "Verifying erase at 0x%08x\n", jeb->offset);
+
+	for (ofs = jeb->offset; ofs < jeb->offset + c->sector_size; ) {
+		uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs);
+		int i;
+
+		*bad_offset = ofs;
+
+		ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf);
+		if (ret) {
+			pr_warn("Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n",
+				ofs, ret);
+			ret = -EIO;
+			goto fail;
+		}
+		if (retlen != readlen) {
+			pr_warn("Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n",
+				ofs, readlen, retlen);
+			ret = -EIO;
+			goto fail;
+		}
+		for (i=0; i<readlen; i += sizeof(unsigned long)) {
+			/* It's OK. We know it's properly aligned */
+			unsigned long *datum = ebuf + i;
+			if (*datum + 1) {
+				*bad_offset += i;
+				pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08x\n",
+					*datum, *bad_offset);
+				ret = -EIO;
+				goto fail;
+			}
+		}
+		ofs += readlen;
+		cond_resched();
+	}
+	ret = 0;
+fail:
+	kfree(ebuf);
+	return ret;
+}
+
+static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	size_t retlen;
+	int ret;
+	uint32_t uninitialized_var(bad_offset);
+
+	switch (jffs2_block_check_erase(c, jeb, &bad_offset)) {
+	case -EAGAIN:	goto refile;
+	case -EIO:	goto filebad;
+	}
+
+	/* Write the erase complete marker */
+	jffs2_dbg(1, "Writing erased marker to block at 0x%08x\n", jeb->offset);
+	bad_offset = jeb->offset;
+
+	/* Cleanmarker in oob area or no cleanmarker at all ? */
+	if (jffs2_cleanmarker_oob(c) || c->cleanmarker_size == 0) {
+
+		if (jffs2_cleanmarker_oob(c)) {
+			if (jffs2_write_nand_cleanmarker(c, jeb))
+				goto filebad;
+		}
+	} else {
+
+		struct kvec vecs[1];
+		struct jffs2_unknown_node marker = {
+			.magic =	cpu_to_je16(JFFS2_MAGIC_BITMASK),
+			.nodetype =	cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER),
+			.totlen =	cpu_to_je32(c->cleanmarker_size)
+		};
+
+		jffs2_prealloc_raw_node_refs(c, jeb, 1);
+
+		marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
+
+		vecs[0].iov_base = (unsigned char *) &marker;
+		vecs[0].iov_len = sizeof(marker);
+		ret = jffs2_flash_direct_writev(c, vecs, 1, jeb->offset, &retlen);
+
+		if (ret || retlen != sizeof(marker)) {
+			if (ret)
+				pr_warn("Write clean marker to block at 0x%08x failed: %d\n",
+				       jeb->offset, ret);
+			else
+				pr_warn("Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n",
+				       jeb->offset, sizeof(marker), retlen);
+
+			goto filebad;
+		}
+	}
+	/* Everything else got zeroed before the erase */
+	jeb->free_size = c->sector_size;
+
+	mutex_lock(&c->erase_free_sem);
+	spin_lock(&c->erase_completion_lock);
+
+	c->erasing_size -= c->sector_size;
+	c->free_size += c->sector_size;
+
+	/* Account for cleanmarker now, if it's in-band */
+	if (c->cleanmarker_size && !jffs2_cleanmarker_oob(c))
+		jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL);
+
+	list_move_tail(&jeb->list, &c->free_list);
+	c->nr_erasing_blocks--;
+	c->nr_free_blocks++;
+
+	jffs2_dbg_acct_sanity_check_nolock(c, jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+
+	spin_unlock(&c->erase_completion_lock);
+	mutex_unlock(&c->erase_free_sem);
+	wake_up(&c->erase_wait);
+	return;
+
+filebad:
+	jffs2_erase_failed(c, jeb, bad_offset);
+	return;
+
+refile:
+	/* Stick it back on the list from whence it came and come back later */
+	mutex_lock(&c->erase_free_sem);
+	spin_lock(&c->erase_completion_lock);
+	jffs2_garbage_collect_trigger(c);
+	list_move(&jeb->list, &c->erase_complete_list);
+	spin_unlock(&c->erase_completion_lock);
+	mutex_unlock(&c->erase_free_sem);
+	return;
+}
diff --git a/kernel/fs/jffs2/file.c b/kernel/fs/jffs2/file.c
new file mode 100644
index 000000000..f509f62e1
--- /dev/null
+++ b/kernel/fs/jffs2/file.c
@@ -0,0 +1,337 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include "nodelist.h"
+
+static int jffs2_write_end(struct file *filp, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *pg, void *fsdata);
+static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata);
+static int jffs2_readpage (struct file *filp, struct page *pg);
+
+int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	int ret;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+	/* Trigger GC to flush any pending writes for this inode */
+	jffs2_flush_wbuf_gc(c, inode->i_ino);
+	mutex_unlock(&inode->i_mutex);
+
+	return 0;
+}
+
+const struct file_operations jffs2_file_operations =
+{
+	.llseek =	generic_file_llseek,
+	.open =		generic_file_open,
+ 	.read_iter =	generic_file_read_iter,
+ 	.write_iter =	generic_file_write_iter,
+	.unlocked_ioctl=jffs2_ioctl,
+	.mmap =		generic_file_readonly_mmap,
+	.fsync =	jffs2_fsync,
+	.splice_read =	generic_file_splice_read,
+};
+
+/* jffs2_file_inode_operations */
+
+const struct inode_operations jffs2_file_inode_operations =
+{
+	.get_acl =	jffs2_get_acl,
+	.set_acl =	jffs2_set_acl,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
+};
+
+const struct address_space_operations jffs2_file_address_operations =
+{
+	.readpage =	jffs2_readpage,
+	.write_begin =	jffs2_write_begin,
+	.write_end =	jffs2_write_end,
+};
+
+static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	unsigned char *pg_buf;
+	int ret;
+
+	jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n",
+		  __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT);
+
+	BUG_ON(!PageLocked(pg));
+
+	pg_buf = kmap(pg);
+	/* FIXME: Can kmap fail? */
+
+	ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE);
+
+	if (ret) {
+		ClearPageUptodate(pg);
+		SetPageError(pg);
+	} else {
+		SetPageUptodate(pg);
+		ClearPageError(pg);
+	}
+
+	flush_dcache_page(pg);
+	kunmap(pg);
+
+	jffs2_dbg(2, "readpage finished\n");
+	return ret;
+}
+
+int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
+{
+	int ret = jffs2_do_readpage_nolock(inode, pg);
+	unlock_page(pg);
+	return ret;
+}
+
+
+static int jffs2_readpage (struct file *filp, struct page *pg)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host);
+	int ret;
+
+	mutex_lock(&f->sem);
+	ret = jffs2_do_readpage_unlock(pg->mapping->host, pg);
+	mutex_unlock(&f->sem);
+	return ret;
+}
+
+static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	struct page *pg;
+	struct inode *inode = mapping->host;
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_raw_inode ri;
+	uint32_t alloc_len = 0;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	uint32_t pageofs = index << PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	jffs2_dbg(1, "%s()\n", __func__);
+
+	if (pageofs > inode->i_size) {
+		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+		if (ret)
+			return ret;
+	}
+
+	mutex_lock(&f->sem);
+	pg = grab_cache_page_write_begin(mapping, index, flags);
+	if (!pg) {
+		if (alloc_len)
+			jffs2_complete_reservation(c);
+		mutex_unlock(&f->sem);
+		return -ENOMEM;
+	}
+	*pagep = pg;
+
+	if (alloc_len) {
+		/* Make new hole frag from old EOF to new page */
+		struct jffs2_full_dnode *fn;
+
+		jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
+			  (unsigned int)inode->i_size, pageofs);
+
+		memset(&ri, 0, sizeof(ri));
+
+		ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+		ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE);
+		ri.totlen = cpu_to_je32(sizeof(ri));
+		ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4));
+
+		ri.ino = cpu_to_je32(f->inocache->ino);
+		ri.version = cpu_to_je32(++f->highest_version);
+		ri.mode = cpu_to_jemode(inode->i_mode);
+		ri.uid = cpu_to_je16(i_uid_read(inode));
+		ri.gid = cpu_to_je16(i_gid_read(inode));
+		ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs));
+		ri.atime = ri.ctime = ri.mtime = cpu_to_je32(get_seconds());
+		ri.offset = cpu_to_je32(inode->i_size);
+		ri.dsize = cpu_to_je32(pageofs - inode->i_size);
+		ri.csize = cpu_to_je32(0);
+		ri.compr = JFFS2_COMPR_ZERO;
+		ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
+		ri.data_crc = cpu_to_je32(0);
+
+		fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_NORMAL);
+
+		if (IS_ERR(fn)) {
+			ret = PTR_ERR(fn);
+			jffs2_complete_reservation(c);
+			goto out_page;
+		}
+		ret = jffs2_add_full_dnode_to_inode(c, f, fn);
+		if (f->metadata) {
+			jffs2_mark_node_obsolete(c, f->metadata->raw);
+			jffs2_free_full_dnode(f->metadata);
+			f->metadata = NULL;
+		}
+		if (ret) {
+			jffs2_dbg(1, "Eep. add_full_dnode_to_inode() failed in write_begin, returned %d\n",
+				  ret);
+			jffs2_mark_node_obsolete(c, fn->raw);
+			jffs2_free_full_dnode(fn);
+			jffs2_complete_reservation(c);
+			goto out_page;
+		}
+		jffs2_complete_reservation(c);
+		inode->i_size = pageofs;
+	}
+
+	/*
+	 * Read in the page if it wasn't already present. Cannot optimize away
+	 * the whole page write case until jffs2_write_end can handle the
+	 * case of a short-copy.
+	 */
+	if (!PageUptodate(pg)) {
+		ret = jffs2_do_readpage_nolock(inode, pg);
+		if (ret)
+			goto out_page;
+	}
+	mutex_unlock(&f->sem);
+	jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
+	return ret;
+
+out_page:
+	unlock_page(pg);
+	page_cache_release(pg);
+	mutex_unlock(&f->sem);
+	return ret;
+}
+
+static int jffs2_write_end(struct file *filp, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *pg, void *fsdata)
+{
+	/* Actually commit the write from the page cache page we're looking at.
+	 * For now, we write the full page out each time. It sucks, but it's simple
+	 */
+	struct inode *inode = mapping->host;
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_raw_inode *ri;
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned end = start + copied;
+	unsigned aligned_start = start & ~3;
+	int ret = 0;
+	uint32_t writtenlen = 0;
+
+	jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
+		  __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT,
+		  start, end, pg->flags);
+
+	/* We need to avoid deadlock with page_cache_read() in
+	   jffs2_garbage_collect_pass(). So the page must be
+	   up to date to prevent page_cache_read() from trying
+	   to re-lock it. */
+	BUG_ON(!PageUptodate(pg));
+
+	if (end == PAGE_CACHE_SIZE) {
+		/* When writing out the end of a page, write out the
+		   _whole_ page. This helps to reduce the number of
+		   nodes in files which have many short writes, like
+		   syslog files. */
+		aligned_start = 0;
+	}
+
+	ri = jffs2_alloc_raw_inode();
+
+	if (!ri) {
+		jffs2_dbg(1, "%s(): Allocation of raw inode failed\n",
+			  __func__);
+		unlock_page(pg);
+		page_cache_release(pg);
+		return -ENOMEM;
+	}
+
+	/* Set the fields that the generic jffs2_write_inode_range() code can't find */
+	ri->ino = cpu_to_je32(inode->i_ino);
+	ri->mode = cpu_to_jemode(inode->i_mode);
+	ri->uid = cpu_to_je16(i_uid_read(inode));
+	ri->gid = cpu_to_je16(i_gid_read(inode));
+	ri->isize = cpu_to_je32((uint32_t)inode->i_size);
+	ri->atime = ri->ctime = ri->mtime = cpu_to_je32(get_seconds());
+
+	/* In 2.4, it was already kmapped by generic_file_write(). Doesn't
+	   hurt to do it again. The alternative is ifdefs, which are ugly. */
+	kmap(pg);
+
+	ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start,
+				      (pg->index << PAGE_CACHE_SHIFT) + aligned_start,
+				      end - aligned_start, &writtenlen);
+
+	kunmap(pg);
+
+	if (ret) {
+		/* There was an error writing. */
+		SetPageError(pg);
+	}
+
+	/* Adjust writtenlen for the padding we did, so we don't confuse our caller */
+	writtenlen -= min(writtenlen, (start - aligned_start));
+
+	if (writtenlen) {
+		if (inode->i_size < pos + writtenlen) {
+			inode->i_size = pos + writtenlen;
+			inode->i_blocks = (inode->i_size + 511) >> 9;
+
+			inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime));
+		}
+	}
+
+	jffs2_free_raw_inode(ri);
+
+	if (start+writtenlen < end) {
+		/* generic_file_write has written more to the page cache than we've
+		   actually written to the medium. Mark the page !Uptodate so that
+		   it gets reread */
+		jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n",
+			__func__);
+		SetPageError(pg);
+		ClearPageUptodate(pg);
+	}
+
+	jffs2_dbg(1, "%s() returning %d\n",
+		  __func__, writtenlen > 0 ? writtenlen : ret);
+	unlock_page(pg);
+	page_cache_release(pg);
+	return writtenlen > 0 ? writtenlen : ret;
+}
diff --git a/kernel/fs/jffs2/fs.c b/kernel/fs/jffs2/fs.c
new file mode 100644
index 000000000..fe5ea080b
--- /dev/null
+++ b/kernel/fs/jffs2/fs.c
@@ -0,0 +1,766 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/mtd/mtd.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/vfs.h>
+#include <linux/crc32.h>
+#include "nodelist.h"
+
+static int jffs2_flash_setup(struct jffs2_sb_info *c);
+
+int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
+{
+	struct jffs2_full_dnode *old_metadata, *new_metadata;
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_raw_inode *ri;
+	union jffs2_device_node dev;
+	unsigned char *mdata = NULL;
+	int mdatalen = 0;
+	unsigned int ivalid;
+	uint32_t alloclen;
+	int ret;
+	int alloc_type = ALLOC_NORMAL;
+
+	jffs2_dbg(1, "%s(): ino #%lu\n", __func__, inode->i_ino);
+
+	/* Special cases - we don't want more than one data node
+	   for these types on the medium at any time. So setattr
+	   must read the original data associated with the node
+	   (i.e. the device numbers or the target name) and write
+	   it out again with the appropriate data attached */
+	if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
+		/* For these, we don't actually need to read the old node */
+		mdatalen = jffs2_encode_dev(&dev, inode->i_rdev);
+		mdata = (char *)&dev;
+		jffs2_dbg(1, "%s(): Writing %d bytes of kdev_t\n",
+			  __func__, mdatalen);
+	} else if (S_ISLNK(inode->i_mode)) {
+		mutex_lock(&f->sem);
+		mdatalen = f->metadata->size;
+		mdata = kmalloc(f->metadata->size, GFP_USER);
+		if (!mdata) {
+			mutex_unlock(&f->sem);
+			return -ENOMEM;
+		}
+		ret = jffs2_read_dnode(c, f, f->metadata, mdata, 0, mdatalen);
+		if (ret) {
+			mutex_unlock(&f->sem);
+			kfree(mdata);
+			return ret;
+		}
+		mutex_unlock(&f->sem);
+		jffs2_dbg(1, "%s(): Writing %d bytes of symlink target\n",
+			  __func__, mdatalen);
+	}
+
+	ri = jffs2_alloc_raw_inode();
+	if (!ri) {
+		if (S_ISLNK(inode->i_mode))
+			kfree(mdata);
+		return -ENOMEM;
+	}
+
+	ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+	if (ret) {
+		jffs2_free_raw_inode(ri);
+		if (S_ISLNK(inode->i_mode))
+			 kfree(mdata);
+		return ret;
+	}
+	mutex_lock(&f->sem);
+	ivalid = iattr->ia_valid;
+
+	ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE);
+	ri->totlen = cpu_to_je32(sizeof(*ri) + mdatalen);
+	ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4));
+
+	ri->ino = cpu_to_je32(inode->i_ino);
+	ri->version = cpu_to_je32(++f->highest_version);
+
+	ri->uid = cpu_to_je16((ivalid & ATTR_UID)?
+		from_kuid(&init_user_ns, iattr->ia_uid):i_uid_read(inode));
+	ri->gid = cpu_to_je16((ivalid & ATTR_GID)?
+		from_kgid(&init_user_ns, iattr->ia_gid):i_gid_read(inode));
+
+	if (ivalid & ATTR_MODE)
+		ri->mode = cpu_to_jemode(iattr->ia_mode);
+	else
+		ri->mode = cpu_to_jemode(inode->i_mode);
+
+
+	ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size);
+	ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime));
+	ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime));
+	ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime));
+
+	ri->offset = cpu_to_je32(0);
+	ri->csize = ri->dsize = cpu_to_je32(mdatalen);
+	ri->compr = JFFS2_COMPR_NONE;
+	if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) {
+		/* It's an extension. Make it a hole node */
+		ri->compr = JFFS2_COMPR_ZERO;
+		ri->dsize = cpu_to_je32(iattr->ia_size - inode->i_size);
+		ri->offset = cpu_to_je32(inode->i_size);
+	} else if (ivalid & ATTR_SIZE && !iattr->ia_size) {
+		/* For truncate-to-zero, treat it as deletion because
+		   it'll always be obsoleting all previous nodes */
+		alloc_type = ALLOC_DELETION;
+	}
+	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
+	if (mdatalen)
+		ri->data_crc = cpu_to_je32(crc32(0, mdata, mdatalen));
+	else
+		ri->data_crc = cpu_to_je32(0);
+
+	new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, alloc_type);
+	if (S_ISLNK(inode->i_mode))
+		kfree(mdata);
+
+	if (IS_ERR(new_metadata)) {
+		jffs2_complete_reservation(c);
+		jffs2_free_raw_inode(ri);
+		mutex_unlock(&f->sem);
+		return PTR_ERR(new_metadata);
+	}
+	/* It worked. Update the inode */
+	inode->i_atime = ITIME(je32_to_cpu(ri->atime));
+	inode->i_ctime = ITIME(je32_to_cpu(ri->ctime));
+	inode->i_mtime = ITIME(je32_to_cpu(ri->mtime));
+	inode->i_mode = jemode_to_cpu(ri->mode);
+	i_uid_write(inode, je16_to_cpu(ri->uid));
+	i_gid_write(inode, je16_to_cpu(ri->gid));
+
+
+	old_metadata = f->metadata;
+
+	if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size)
+		jffs2_truncate_fragtree (c, &f->fragtree, iattr->ia_size);
+
+	if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) {
+		jffs2_add_full_dnode_to_inode(c, f, new_metadata);
+		inode->i_size = iattr->ia_size;
+		inode->i_blocks = (inode->i_size + 511) >> 9;
+		f->metadata = NULL;
+	} else {
+		f->metadata = new_metadata;
+	}
+	if (old_metadata) {
+		jffs2_mark_node_obsolete(c, old_metadata->raw);
+		jffs2_free_full_dnode(old_metadata);
+	}
+	jffs2_free_raw_inode(ri);
+
+	mutex_unlock(&f->sem);
+	jffs2_complete_reservation(c);
+
+	/* We have to do the truncate_setsize() without f->sem held, since
+	   some pages may be locked and waiting for it in readpage().
+	   We are protected from a simultaneous write() extending i_size
+	   back past iattr->ia_size, because do_truncate() holds the
+	   generic inode semaphore. */
+	if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
+		truncate_setsize(inode, iattr->ia_size);
+		inode->i_blocks = (inode->i_size + 511) >> 9;
+	}	
+
+	return 0;
+}
+
+int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = d_inode(dentry);
+	int rc;
+
+	rc = inode_change_ok(inode, iattr);
+	if (rc)
+		return rc;
+
+	rc = jffs2_do_setattr(inode, iattr);
+	if (!rc && (iattr->ia_valid & ATTR_MODE))
+		rc = posix_acl_chmod(inode, inode->i_mode);
+
+	return rc;
+}
+
+int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb);
+	unsigned long avail;
+
+	buf->f_type = JFFS2_SUPER_MAGIC;
+	buf->f_bsize = 1 << PAGE_SHIFT;
+	buf->f_blocks = c->flash_size >> PAGE_SHIFT;
+	buf->f_files = 0;
+	buf->f_ffree = 0;
+	buf->f_namelen = JFFS2_MAX_NAME_LEN;
+	buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC;
+	buf->f_fsid.val[1] = c->mtd->index;
+
+	spin_lock(&c->erase_completion_lock);
+	avail = c->dirty_size + c->free_size;
+	if (avail > c->sector_size * c->resv_blocks_write)
+		avail -= c->sector_size * c->resv_blocks_write;
+	else
+		avail = 0;
+	spin_unlock(&c->erase_completion_lock);
+
+	buf->f_bavail = buf->f_bfree = avail >> PAGE_SHIFT;
+
+	return 0;
+}
+
+
+void jffs2_evict_inode (struct inode *inode)
+{
+	/* We can forget about this inode for now - drop all
+	 *  the nodelists associated with it, etc.
+	 */
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+
+	jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
+		  __func__, inode->i_ino, inode->i_mode);
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	jffs2_do_clear_inode(c, f);
+}
+
+struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
+{
+	struct jffs2_inode_info *f;
+	struct jffs2_sb_info *c;
+	struct jffs2_raw_inode latest_node;
+	union jffs2_device_node jdev;
+	struct inode *inode;
+	dev_t rdev = 0;
+	int ret;
+
+	jffs2_dbg(1, "%s(): ino == %lu\n", __func__, ino);
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	f = JFFS2_INODE_INFO(inode);
+	c = JFFS2_SB_INFO(inode->i_sb);
+
+	jffs2_init_inode_info(f);
+	mutex_lock(&f->sem);
+
+	ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node);
+
+	if (ret) {
+		mutex_unlock(&f->sem);
+		iget_failed(inode);
+		return ERR_PTR(ret);
+	}
+	inode->i_mode = jemode_to_cpu(latest_node.mode);
+	i_uid_write(inode, je16_to_cpu(latest_node.uid));
+	i_gid_write(inode, je16_to_cpu(latest_node.gid));
+	inode->i_size = je32_to_cpu(latest_node.isize);
+	inode->i_atime = ITIME(je32_to_cpu(latest_node.atime));
+	inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
+	inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime));
+
+	set_nlink(inode, f->inocache->pino_nlink);
+
+	inode->i_blocks = (inode->i_size + 511) >> 9;
+
+	switch (inode->i_mode & S_IFMT) {
+
+	case S_IFLNK:
+		inode->i_op = &jffs2_symlink_inode_operations;
+		break;
+
+	case S_IFDIR:
+	{
+		struct jffs2_full_dirent *fd;
+		set_nlink(inode, 2); /* parent and '.' */
+
+		for (fd=f->dents; fd; fd = fd->next) {
+			if (fd->type == DT_DIR && fd->ino)
+				inc_nlink(inode);
+		}
+		/* Root dir gets i_nlink 3 for some reason */
+		if (inode->i_ino == 1)
+			inc_nlink(inode);
+
+		inode->i_op = &jffs2_dir_inode_operations;
+		inode->i_fop = &jffs2_dir_operations;
+		break;
+	}
+	case S_IFREG:
+		inode->i_op = &jffs2_file_inode_operations;
+		inode->i_fop = &jffs2_file_operations;
+		inode->i_mapping->a_ops = &jffs2_file_address_operations;
+		inode->i_mapping->nrpages = 0;
+		break;
+
+	case S_IFBLK:
+	case S_IFCHR:
+		/* Read the device numbers from the media */
+		if (f->metadata->size != sizeof(jdev.old_id) &&
+		    f->metadata->size != sizeof(jdev.new_id)) {
+			pr_notice("Device node has strange size %d\n",
+				  f->metadata->size);
+			goto error_io;
+		}
+		jffs2_dbg(1, "Reading device numbers from flash\n");
+		ret = jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size);
+		if (ret < 0) {
+			/* Eep */
+			pr_notice("Read device numbers for inode %lu failed\n",
+				  (unsigned long)inode->i_ino);
+			goto error;
+		}
+		if (f->metadata->size == sizeof(jdev.old_id))
+			rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
+		else
+			rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
+
+	case S_IFSOCK:
+	case S_IFIFO:
+		inode->i_op = &jffs2_file_inode_operations;
+		init_special_inode(inode, inode->i_mode, rdev);
+		break;
+
+	default:
+		pr_warn("%s(): Bogus i_mode %o for ino %lu\n",
+			__func__, inode->i_mode, (unsigned long)inode->i_ino);
+	}
+
+	mutex_unlock(&f->sem);
+
+	jffs2_dbg(1, "jffs2_read_inode() returning\n");
+	unlock_new_inode(inode);
+	return inode;
+
+error_io:
+	ret = -EIO;
+error:
+	mutex_unlock(&f->sem);
+	jffs2_do_clear_inode(c, f);
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+
+void jffs2_dirty_inode(struct inode *inode, int flags)
+{
+	struct iattr iattr;
+
+	if (!(inode->i_state & I_DIRTY_DATASYNC)) {
+		jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n",
+			  __func__, inode->i_ino);
+		return;
+	}
+
+	jffs2_dbg(1, "%s(): calling setattr() for ino #%lu\n",
+		  __func__, inode->i_ino);
+
+	iattr.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_MTIME|ATTR_CTIME;
+	iattr.ia_mode = inode->i_mode;
+	iattr.ia_uid = inode->i_uid;
+	iattr.ia_gid = inode->i_gid;
+	iattr.ia_atime = inode->i_atime;
+	iattr.ia_mtime = inode->i_mtime;
+	iattr.ia_ctime = inode->i_ctime;
+
+	jffs2_do_setattr(inode, &iattr);
+}
+
+int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+
+	if (c->flags & JFFS2_SB_FLAG_RO && !(sb->s_flags & MS_RDONLY))
+		return -EROFS;
+
+	/* We stop if it was running, then restart if it needs to.
+	   This also catches the case where it was stopped and this
+	   is just a remount to restart it.
+	   Flush the writebuffer, if neccecary, else we loose it */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		jffs2_stop_garbage_collect_thread(c);
+		mutex_lock(&c->alloc_sem);
+		jffs2_flush_wbuf_pad(c);
+		mutex_unlock(&c->alloc_sem);
+	}
+
+	if (!(*flags & MS_RDONLY))
+		jffs2_start_garbage_collect_thread(c);
+
+	*flags |= MS_NOATIME;
+	return 0;
+}
+
+/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
+   fill in the raw_inode while you're at it. */
+struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri)
+{
+	struct inode *inode;
+	struct super_block *sb = dir_i->i_sb;
+	struct jffs2_sb_info *c;
+	struct jffs2_inode_info *f;
+	int ret;
+
+	jffs2_dbg(1, "%s(): dir_i %ld, mode 0x%x\n",
+		  __func__, dir_i->i_ino, mode);
+
+	c = JFFS2_SB_INFO(sb);
+
+	inode = new_inode(sb);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	f = JFFS2_INODE_INFO(inode);
+	jffs2_init_inode_info(f);
+	mutex_lock(&f->sem);
+
+	memset(ri, 0, sizeof(*ri));
+	/* Set OS-specific defaults for new inodes */
+	ri->uid = cpu_to_je16(from_kuid(&init_user_ns, current_fsuid()));
+
+	if (dir_i->i_mode & S_ISGID) {
+		ri->gid = cpu_to_je16(i_gid_read(dir_i));
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else {
+		ri->gid = cpu_to_je16(from_kgid(&init_user_ns, current_fsgid()));
+	}
+
+	/* POSIX ACLs have to be processed now, at least partly.
+	   The umask is only applied if there's no default ACL */
+	ret = jffs2_init_acl_pre(dir_i, inode, &mode);
+	if (ret) {
+		mutex_unlock(&f->sem);
+		make_bad_inode(inode);
+		iput(inode);
+		return ERR_PTR(ret);
+	}
+	ret = jffs2_do_new_inode (c, f, mode, ri);
+	if (ret) {
+		mutex_unlock(&f->sem);
+		make_bad_inode(inode);
+		iput(inode);
+		return ERR_PTR(ret);
+	}
+	set_nlink(inode, 1);
+	inode->i_ino = je32_to_cpu(ri->ino);
+	inode->i_mode = jemode_to_cpu(ri->mode);
+	i_gid_write(inode, je16_to_cpu(ri->gid));
+	i_uid_write(inode, je16_to_cpu(ri->uid));
+	inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
+
+	inode->i_blocks = 0;
+	inode->i_size = 0;
+
+	if (insert_inode_locked(inode) < 0) {
+		mutex_unlock(&f->sem);
+		make_bad_inode(inode);
+		iput(inode);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return inode;
+}
+
+static int calculate_inocache_hashsize(uint32_t flash_size)
+{
+	/*
+	 * Pick a inocache hash size based on the size of the medium.
+	 * Count how many megabytes we're dealing with, apply a hashsize twice
+	 * that size, but rounding down to the usual big powers of 2. And keep
+	 * to sensible bounds.
+	 */
+
+	int size_mb = flash_size / 1024 / 1024;
+	int hashsize = (size_mb * 2) & ~0x3f;
+
+	if (hashsize < INOCACHE_HASHSIZE_MIN)
+		return INOCACHE_HASHSIZE_MIN;
+	if (hashsize > INOCACHE_HASHSIZE_MAX)
+		return INOCACHE_HASHSIZE_MAX;
+
+	return hashsize;
+}
+
+int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct jffs2_sb_info *c;
+	struct inode *root_i;
+	int ret;
+	size_t blocks;
+
+	c = JFFS2_SB_INFO(sb);
+
+	/* Do not support the MLC nand */
+	if (c->mtd->type == MTD_MLCNANDFLASH)
+		return -EINVAL;
+
+#ifndef CONFIG_JFFS2_FS_WRITEBUFFER
+	if (c->mtd->type == MTD_NANDFLASH) {
+		pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n");
+		return -EINVAL;
+	}
+	if (c->mtd->type == MTD_DATAFLASH) {
+		pr_err("Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in\n");
+		return -EINVAL;
+	}
+#endif
+
+	c->flash_size = c->mtd->size;
+	c->sector_size = c->mtd->erasesize;
+	blocks = c->flash_size / c->sector_size;
+
+	/*
+	 * Size alignment check
+	 */
+	if ((c->sector_size * blocks) != c->flash_size) {
+		c->flash_size = c->sector_size * blocks;
+		pr_info("Flash size not aligned to erasesize, reducing to %dKiB\n",
+			c->flash_size / 1024);
+	}
+
+	if (c->flash_size < 5*c->sector_size) {
+		pr_err("Too few erase blocks (%d)\n",
+		       c->flash_size / c->sector_size);
+		return -EINVAL;
+	}
+
+	c->cleanmarker_size = sizeof(struct jffs2_unknown_node);
+
+	/* NAND (or other bizarre) flash... do setup accordingly */
+	ret = jffs2_flash_setup(c);
+	if (ret)
+		return ret;
+
+	c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size);
+	c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
+	if (!c->inocache_list) {
+		ret = -ENOMEM;
+		goto out_wbuf;
+	}
+
+	jffs2_init_xattr_subsystem(c);
+
+	if ((ret = jffs2_do_mount_fs(c)))
+		goto out_inohash;
+
+	jffs2_dbg(1, "%s(): Getting root inode\n", __func__);
+	root_i = jffs2_iget(sb, 1);
+	if (IS_ERR(root_i)) {
+		jffs2_dbg(1, "get root inode failed\n");
+		ret = PTR_ERR(root_i);
+		goto out_root;
+	}
+
+	ret = -ENOMEM;
+
+	jffs2_dbg(1, "%s(): d_make_root()\n", __func__);
+	sb->s_root = d_make_root(root_i);
+	if (!sb->s_root)
+		goto out_root;
+
+	sb->s_maxbytes = 0xFFFFFFFF;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = JFFS2_SUPER_MAGIC;
+	if (!(sb->s_flags & MS_RDONLY))
+		jffs2_start_garbage_collect_thread(c);
+	return 0;
+
+out_root:
+	jffs2_free_ino_caches(c);
+	jffs2_free_raw_node_refs(c);
+	if (jffs2_blocks_use_vmalloc(c))
+		vfree(c->blocks);
+	else
+		kfree(c->blocks);
+ out_inohash:
+	jffs2_clear_xattr_subsystem(c);
+	kfree(c->inocache_list);
+ out_wbuf:
+	jffs2_flash_cleanup(c);
+
+	return ret;
+}
+
+void jffs2_gc_release_inode(struct jffs2_sb_info *c,
+				   struct jffs2_inode_info *f)
+{
+	iput(OFNI_EDONI_2SFFJ(f));
+}
+
+struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c,
+					      int inum, int unlinked)
+{
+	struct inode *inode;
+	struct jffs2_inode_cache *ic;
+
+	if (unlinked) {
+		/* The inode has zero nlink but its nodes weren't yet marked
+		   obsolete. This has to be because we're still waiting for
+		   the final (close() and) iput() to happen.
+
+		   There's a possibility that the final iput() could have
+		   happened while we were contemplating. In order to ensure
+		   that we don't cause a new read_inode() (which would fail)
+		   for the inode in question, we use ilookup() in this case
+		   instead of iget().
+
+		   The nlink can't _become_ zero at this point because we're
+		   holding the alloc_sem, and jffs2_do_unlink() would also
+		   need that while decrementing nlink on any inode.
+		*/
+		inode = ilookup(OFNI_BS_2SFFJ(c), inum);
+		if (!inode) {
+			jffs2_dbg(1, "ilookup() failed for ino #%u; inode is probably deleted.\n",
+				  inum);
+
+			spin_lock(&c->inocache_lock);
+			ic = jffs2_get_ino_cache(c, inum);
+			if (!ic) {
+				jffs2_dbg(1, "Inode cache for ino #%u is gone\n",
+					  inum);
+				spin_unlock(&c->inocache_lock);
+				return NULL;
+			}
+			if (ic->state != INO_STATE_CHECKEDABSENT) {
+				/* Wait for progress. Don't just loop */
+				jffs2_dbg(1, "Waiting for ino #%u in state %d\n",
+					  ic->ino, ic->state);
+				sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
+			} else {
+				spin_unlock(&c->inocache_lock);
+			}
+
+			return NULL;
+		}
+	} else {
+		/* Inode has links to it still; they're not going away because
+		   jffs2_do_unlink() would need the alloc_sem and we have it.
+		   Just iget() it, and if read_inode() is necessary that's OK.
+		*/
+		inode = jffs2_iget(OFNI_BS_2SFFJ(c), inum);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	if (is_bad_inode(inode)) {
+		pr_notice("Eep. read_inode() failed for ino #%u. unlinked %d\n",
+			  inum, unlinked);
+		/* NB. This will happen again. We need to do something appropriate here. */
+		iput(inode);
+		return ERR_PTR(-EIO);
+	}
+
+	return JFFS2_INODE_INFO(inode);
+}
+
+unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
+				   struct jffs2_inode_info *f,
+				   unsigned long offset,
+				   unsigned long *priv)
+{
+	struct inode *inode = OFNI_EDONI_2SFFJ(f);
+	struct page *pg;
+
+	pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+			     (void *)jffs2_do_readpage_unlock, inode);
+	if (IS_ERR(pg))
+		return (void *)pg;
+
+	*priv = (unsigned long)pg;
+	return kmap(pg);
+}
+
+void jffs2_gc_release_page(struct jffs2_sb_info *c,
+			   unsigned char *ptr,
+			   unsigned long *priv)
+{
+	struct page *pg = (void *)*priv;
+
+	kunmap(pg);
+	page_cache_release(pg);
+}
+
+static int jffs2_flash_setup(struct jffs2_sb_info *c) {
+	int ret = 0;
+
+	if (jffs2_cleanmarker_oob(c)) {
+		/* NAND flash... do setup accordingly */
+		ret = jffs2_nand_flash_setup(c);
+		if (ret)
+			return ret;
+	}
+
+	/* and Dataflash */
+	if (jffs2_dataflash(c)) {
+		ret = jffs2_dataflash_setup(c);
+		if (ret)
+			return ret;
+	}
+
+	/* and Intel "Sibley" flash */
+	if (jffs2_nor_wbuf_flash(c)) {
+		ret = jffs2_nor_wbuf_flash_setup(c);
+		if (ret)
+			return ret;
+	}
+
+	/* and an UBI volume */
+	if (jffs2_ubivol(c)) {
+		ret = jffs2_ubivol_setup(c);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+
+void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
+
+	if (jffs2_cleanmarker_oob(c)) {
+		jffs2_nand_flash_cleanup(c);
+	}
+
+	/* and DataFlash */
+	if (jffs2_dataflash(c)) {
+		jffs2_dataflash_cleanup(c);
+	}
+
+	/* and Intel "Sibley" flash */
+	if (jffs2_nor_wbuf_flash(c)) {
+		jffs2_nor_wbuf_flash_cleanup(c);
+	}
+
+	/* and an UBI volume */
+	if (jffs2_ubivol(c)) {
+		jffs2_ubivol_cleanup(c);
+	}
+}
diff --git a/kernel/fs/jffs2/gc.c b/kernel/fs/jffs2/gc.c
new file mode 100644
index 000000000..5a2dec2b0
--- /dev/null
+++ b/kernel/fs/jffs2/gc.c
@@ -0,0 +1,1378 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/mtd/mtd.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/crc32.h>
+#include <linux/compiler.h>
+#include <linux/stat.h>
+#include "nodelist.h"
+#include "compr.h"
+
+static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
+					  struct jffs2_inode_cache *ic,
+					  struct jffs2_raw_node_ref *raw);
+static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+					struct jffs2_inode_info *f, struct jffs2_full_dnode *fd);
+static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+					struct jffs2_inode_info *f, struct jffs2_full_dirent *fd);
+static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+					struct jffs2_inode_info *f, struct jffs2_full_dirent *fd);
+static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				      struct jffs2_inode_info *f, struct jffs2_full_dnode *fn,
+				      uint32_t start, uint32_t end);
+static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				       struct jffs2_inode_info *f, struct jffs2_full_dnode *fn,
+				       uint32_t start, uint32_t end);
+static int jffs2_garbage_collect_live(struct jffs2_sb_info *c,  struct jffs2_eraseblock *jeb,
+			       struct jffs2_raw_node_ref *raw, struct jffs2_inode_info *f);
+
+/* Called with erase_completion_lock held */
+static struct jffs2_eraseblock *jffs2_find_gc_block(struct jffs2_sb_info *c)
+{
+	struct jffs2_eraseblock *ret;
+	struct list_head *nextlist = NULL;
+	int n = jiffies % 128;
+
+	/* Pick an eraseblock to garbage collect next. This is where we'll
+	   put the clever wear-levelling algorithms. Eventually.  */
+	/* We possibly want to favour the dirtier blocks more when the
+	   number of free blocks is low. */
+again:
+	if (!list_empty(&c->bad_used_list) && c->nr_free_blocks > c->resv_blocks_gcbad) {
+		jffs2_dbg(1, "Picking block from bad_used_list to GC next\n");
+		nextlist = &c->bad_used_list;
+	} else if (n < 50 && !list_empty(&c->erasable_list)) {
+		/* Note that most of them will have gone directly to be erased.
+		   So don't favour the erasable_list _too_ much. */
+		jffs2_dbg(1, "Picking block from erasable_list to GC next\n");
+		nextlist = &c->erasable_list;
+	} else if (n < 110 && !list_empty(&c->very_dirty_list)) {
+		/* Most of the time, pick one off the very_dirty list */
+		jffs2_dbg(1, "Picking block from very_dirty_list to GC next\n");
+		nextlist = &c->very_dirty_list;
+	} else if (n < 126 && !list_empty(&c->dirty_list)) {
+		jffs2_dbg(1, "Picking block from dirty_list to GC next\n");
+		nextlist = &c->dirty_list;
+	} else if (!list_empty(&c->clean_list)) {
+		jffs2_dbg(1, "Picking block from clean_list to GC next\n");
+		nextlist = &c->clean_list;
+	} else if (!list_empty(&c->dirty_list)) {
+		jffs2_dbg(1, "Picking block from dirty_list to GC next (clean_list was empty)\n");
+
+		nextlist = &c->dirty_list;
+	} else if (!list_empty(&c->very_dirty_list)) {
+		jffs2_dbg(1, "Picking block from very_dirty_list to GC next (clean_list and dirty_list were empty)\n");
+		nextlist = &c->very_dirty_list;
+	} else if (!list_empty(&c->erasable_list)) {
+		jffs2_dbg(1, "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n");
+
+		nextlist = &c->erasable_list;
+	} else if (!list_empty(&c->erasable_pending_wbuf_list)) {
+		/* There are blocks are wating for the wbuf sync */
+		jffs2_dbg(1, "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n");
+		spin_unlock(&c->erase_completion_lock);
+		jffs2_flush_wbuf_pad(c);
+		spin_lock(&c->erase_completion_lock);
+		goto again;
+	} else {
+		/* Eep. All were empty */
+		jffs2_dbg(1, "No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n");
+		return NULL;
+	}
+
+	ret = list_entry(nextlist->next, struct jffs2_eraseblock, list);
+	list_del(&ret->list);
+	c->gcblock = ret;
+	ret->gc_node = ret->first_node;
+	if (!ret->gc_node) {
+		pr_warn("Eep. ret->gc_node for block at 0x%08x is NULL\n",
+			ret->offset);
+		BUG();
+	}
+
+	/* Have we accidentally picked a clean block with wasted space ? */
+	if (ret->wasted_size) {
+		jffs2_dbg(1, "Converting wasted_size %08x to dirty_size\n",
+			  ret->wasted_size);
+		ret->dirty_size += ret->wasted_size;
+		c->wasted_size -= ret->wasted_size;
+		c->dirty_size += ret->wasted_size;
+		ret->wasted_size = 0;
+	}
+
+	return ret;
+}
+
+/* jffs2_garbage_collect_pass
+ * Make a single attempt to progress GC. Move one node, and possibly
+ * start erasing one eraseblock.
+ */
+int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
+{
+	struct jffs2_inode_info *f;
+	struct jffs2_inode_cache *ic;
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
+	uint32_t gcblock_dirty;
+	int ret = 0, inum, nlink;
+	int xattr = 0;
+
+	if (mutex_lock_interruptible(&c->alloc_sem))
+		return -EINTR;
+
+	for (;;) {
+		spin_lock(&c->erase_completion_lock);
+		if (!c->unchecked_size)
+			break;
+
+		/* We can't start doing GC yet. We haven't finished checking
+		   the node CRCs etc. Do it now. */
+
+		/* checked_ino is protected by the alloc_sem */
+		if (c->checked_ino > c->highest_ino && xattr) {
+			pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
+				c->unchecked_size);
+			jffs2_dbg_dump_block_lists_nolock(c);
+			spin_unlock(&c->erase_completion_lock);
+			mutex_unlock(&c->alloc_sem);
+			return -ENOSPC;
+		}
+
+		spin_unlock(&c->erase_completion_lock);
+
+		if (!xattr)
+			xattr = jffs2_verify_xattr(c);
+
+		spin_lock(&c->inocache_lock);
+
+		ic = jffs2_get_ino_cache(c, c->checked_ino++);
+
+		if (!ic) {
+			spin_unlock(&c->inocache_lock);
+			continue;
+		}
+
+		if (!ic->pino_nlink) {
+			jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n",
+				  ic->ino);
+			spin_unlock(&c->inocache_lock);
+			jffs2_xattr_delete_inode(c, ic);
+			continue;
+		}
+		switch(ic->state) {
+		case INO_STATE_CHECKEDABSENT:
+		case INO_STATE_PRESENT:
+			jffs2_dbg(1, "Skipping ino #%u already checked\n",
+				  ic->ino);
+			spin_unlock(&c->inocache_lock);
+			continue;
+
+		case INO_STATE_GC:
+		case INO_STATE_CHECKING:
+			pr_warn("Inode #%u is in state %d during CRC check phase!\n",
+				ic->ino, ic->state);
+			spin_unlock(&c->inocache_lock);
+			BUG();
+
+		case INO_STATE_READING:
+			/* We need to wait for it to finish, lest we move on
+			   and trigger the BUG() above while we haven't yet
+			   finished checking all its nodes */
+			jffs2_dbg(1, "Waiting for ino #%u to finish reading\n",
+				  ic->ino);
+			/* We need to come back again for the _same_ inode. We've
+			 made no progress in this case, but that should be OK */
+			c->checked_ino--;
+
+			mutex_unlock(&c->alloc_sem);
+			sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
+			return 0;
+
+		default:
+			BUG();
+
+		case INO_STATE_UNCHECKED:
+			;
+		}
+		ic->state = INO_STATE_CHECKING;
+		spin_unlock(&c->inocache_lock);
+
+		jffs2_dbg(1, "%s(): triggering inode scan of ino#%u\n",
+			  __func__, ic->ino);
+
+		ret = jffs2_do_crccheck_inode(c, ic);
+		if (ret)
+			pr_warn("Returned error for crccheck of ino #%u. Expect badness...\n",
+				ic->ino);
+
+		jffs2_set_inocache_state(c, ic, INO_STATE_CHECKEDABSENT);
+		mutex_unlock(&c->alloc_sem);
+		return ret;
+	}
+
+	/* If there are any blocks which need erasing, erase them now */
+	if (!list_empty(&c->erase_complete_list) ||
+	    !list_empty(&c->erase_pending_list)) {
+		spin_unlock(&c->erase_completion_lock);
+		mutex_unlock(&c->alloc_sem);
+		jffs2_dbg(1, "%s(): erasing pending blocks\n", __func__);
+		if (jffs2_erase_pending_blocks(c, 1))
+			return 0;
+
+		jffs2_dbg(1, "No progress from erasing block; doing GC anyway\n");
+		mutex_lock(&c->alloc_sem);
+		spin_lock(&c->erase_completion_lock);
+	}
+
+	/* First, work out which block we're garbage-collecting */
+	jeb = c->gcblock;
+
+	if (!jeb)
+		jeb = jffs2_find_gc_block(c);
+
+	if (!jeb) {
+		/* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
+		if (c->nr_erasing_blocks) {
+			spin_unlock(&c->erase_completion_lock);
+			mutex_unlock(&c->alloc_sem);
+			return -EAGAIN;
+		}
+		jffs2_dbg(1, "Couldn't find erase block to garbage collect!\n");
+		spin_unlock(&c->erase_completion_lock);
+		mutex_unlock(&c->alloc_sem);
+		return -EIO;
+	}
+
+	jffs2_dbg(1, "GC from block %08x, used_size %08x, dirty_size %08x, free_size %08x\n",
+		  jeb->offset, jeb->used_size, jeb->dirty_size, jeb->free_size);
+	D1(if (c->nextblock)
+	   printk(KERN_DEBUG "Nextblock at  %08x, used_size %08x, dirty_size %08x, wasted_size %08x, free_size %08x\n", c->nextblock->offset, c->nextblock->used_size, c->nextblock->dirty_size, c->nextblock->wasted_size, c->nextblock->free_size));
+
+	if (!jeb->used_size) {
+		mutex_unlock(&c->alloc_sem);
+		goto eraseit;
+	}
+
+	raw = jeb->gc_node;
+	gcblock_dirty = jeb->dirty_size;
+
+	while(ref_obsolete(raw)) {
+		jffs2_dbg(1, "Node at 0x%08x is obsolete... skipping\n",
+			  ref_offset(raw));
+		raw = ref_next(raw);
+		if (unlikely(!raw)) {
+			pr_warn("eep. End of raw list while still supposedly nodes to GC\n");
+			pr_warn("erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n",
+				jeb->offset, jeb->free_size,
+				jeb->dirty_size, jeb->used_size);
+			jeb->gc_node = raw;
+			spin_unlock(&c->erase_completion_lock);
+			mutex_unlock(&c->alloc_sem);
+			BUG();
+		}
+	}
+	jeb->gc_node = raw;
+
+	jffs2_dbg(1, "Going to garbage collect node at 0x%08x\n",
+		  ref_offset(raw));
+
+	if (!raw->next_in_ino) {
+		/* Inode-less node. Clean marker, snapshot or something like that */
+		spin_unlock(&c->erase_completion_lock);
+		if (ref_flags(raw) == REF_PRISTINE) {
+			/* It's an unknown node with JFFS2_FEATURE_RWCOMPAT_COPY */
+			jffs2_garbage_collect_pristine(c, NULL, raw);
+		} else {
+			/* Just mark it obsolete */
+			jffs2_mark_node_obsolete(c, raw);
+		}
+		mutex_unlock(&c->alloc_sem);
+		goto eraseit_lock;
+	}
+
+	ic = jffs2_raw_ref_to_ic(raw);
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+	/* When 'ic' refers xattr_datum/xattr_ref, this node is GCed as xattr.
+	 * We can decide whether this node is inode or xattr by ic->class.     */
+	if (ic->class == RAWNODE_CLASS_XATTR_DATUM
+	    || ic->class == RAWNODE_CLASS_XATTR_REF) {
+		spin_unlock(&c->erase_completion_lock);
+
+		if (ic->class == RAWNODE_CLASS_XATTR_DATUM) {
+			ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic, raw);
+		} else {
+			ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic, raw);
+		}
+		goto test_gcnode;
+	}
+#endif
+
+	/* We need to hold the inocache. Either the erase_completion_lock or
+	   the inocache_lock are sufficient; we trade down since the inocache_lock
+	   causes less contention. */
+	spin_lock(&c->inocache_lock);
+
+	spin_unlock(&c->erase_completion_lock);
+
+	jffs2_dbg(1, "%s(): collecting from block @0x%08x. Node @0x%08x(%d), ino #%u\n",
+		  __func__, jeb->offset, ref_offset(raw), ref_flags(raw),
+		  ic->ino);
+
+	/* Three possibilities:
+	   1. Inode is already in-core. We must iget it and do proper
+	      updating to its fragtree, etc.
+	   2. Inode is not in-core, node is REF_PRISTINE. We lock the
+	      inocache to prevent a read_inode(), copy the node intact.
+	   3. Inode is not in-core, node is not pristine. We must iget()
+	      and take the slow path.
+	*/
+
+	switch(ic->state) {
+	case INO_STATE_CHECKEDABSENT:
+		/* It's been checked, but it's not currently in-core.
+		   We can just copy any pristine nodes, but have
+		   to prevent anyone else from doing read_inode() while
+		   we're at it, so we set the state accordingly */
+		if (ref_flags(raw) == REF_PRISTINE)
+			ic->state = INO_STATE_GC;
+		else {
+			jffs2_dbg(1, "Ino #%u is absent but node not REF_PRISTINE. Reading.\n",
+				  ic->ino);
+		}
+		break;
+
+	case INO_STATE_PRESENT:
+		/* It's in-core. GC must iget() it. */
+		break;
+
+	case INO_STATE_UNCHECKED:
+	case INO_STATE_CHECKING:
+	case INO_STATE_GC:
+		/* Should never happen. We should have finished checking
+		   by the time we actually start doing any GC, and since
+		   we're holding the alloc_sem, no other garbage collection
+		   can happen.
+		*/
+		pr_crit("Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n",
+			ic->ino, ic->state);
+		mutex_unlock(&c->alloc_sem);
+		spin_unlock(&c->inocache_lock);
+		BUG();
+
+	case INO_STATE_READING:
+		/* Someone's currently trying to read it. We must wait for
+		   them to finish and then go through the full iget() route
+		   to do the GC. However, sometimes read_inode() needs to get
+		   the alloc_sem() (for marking nodes invalid) so we must
+		   drop the alloc_sem before sleeping. */
+
+		mutex_unlock(&c->alloc_sem);
+		jffs2_dbg(1, "%s(): waiting for ino #%u in state %d\n",
+			  __func__, ic->ino, ic->state);
+		sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
+		/* And because we dropped the alloc_sem we must start again from the
+		   beginning. Ponder chance of livelock here -- we're returning success
+		   without actually making any progress.
+
+		   Q: What are the chances that the inode is back in INO_STATE_READING
+		   again by the time we next enter this function? And that this happens
+		   enough times to cause a real delay?
+
+		   A: Small enough that I don't care :)
+		*/
+		return 0;
+	}
+
+	/* OK. Now if the inode is in state INO_STATE_GC, we are going to copy the
+	   node intact, and we don't have to muck about with the fragtree etc.
+	   because we know it's not in-core. If it _was_ in-core, we go through
+	   all the iget() crap anyway */
+
+	if (ic->state == INO_STATE_GC) {
+		spin_unlock(&c->inocache_lock);
+
+		ret = jffs2_garbage_collect_pristine(c, ic, raw);
+
+		spin_lock(&c->inocache_lock);
+		ic->state = INO_STATE_CHECKEDABSENT;
+		wake_up(&c->inocache_wq);
+
+		if (ret != -EBADFD) {
+			spin_unlock(&c->inocache_lock);
+			goto test_gcnode;
+		}
+
+		/* Fall through if it wanted us to, with inocache_lock held */
+	}
+
+	/* Prevent the fairly unlikely race where the gcblock is
+	   entirely obsoleted by the final close of a file which had
+	   the only valid nodes in the block, followed by erasure,
+	   followed by freeing of the ic because the erased block(s)
+	   held _all_ the nodes of that inode.... never been seen but
+	   it's vaguely possible. */
+
+	inum = ic->ino;
+	nlink = ic->pino_nlink;
+	spin_unlock(&c->inocache_lock);
+
+	f = jffs2_gc_fetch_inode(c, inum, !nlink);
+	if (IS_ERR(f)) {
+		ret = PTR_ERR(f);
+		goto release_sem;
+	}
+	if (!f) {
+		ret = 0;
+		goto release_sem;
+	}
+
+	ret = jffs2_garbage_collect_live(c, jeb, raw, f);
+
+	jffs2_gc_release_inode(c, f);
+
+ test_gcnode:
+	if (jeb->dirty_size == gcblock_dirty && !ref_obsolete(jeb->gc_node)) {
+		/* Eep. This really should never happen. GC is broken */
+		pr_err("Error garbage collecting node at %08x!\n",
+		       ref_offset(jeb->gc_node));
+		ret = -ENOSPC;
+	}
+ release_sem:
+	mutex_unlock(&c->alloc_sem);
+
+ eraseit_lock:
+	/* If we've finished this block, start it erasing */
+	spin_lock(&c->erase_completion_lock);
+
+ eraseit:
+	if (c->gcblock && !c->gcblock->used_size) {
+		jffs2_dbg(1, "Block at 0x%08x completely obsoleted by GC. Moving to erase_pending_list\n",
+			  c->gcblock->offset);
+		/* We're GC'ing an empty block? */
+		list_add_tail(&c->gcblock->list, &c->erase_pending_list);
+		c->gcblock = NULL;
+		c->nr_erasing_blocks++;
+		jffs2_garbage_collect_trigger(c);
+	}
+	spin_unlock(&c->erase_completion_lock);
+
+	return ret;
+}
+
+static int jffs2_garbage_collect_live(struct jffs2_sb_info *c,  struct jffs2_eraseblock *jeb,
+				      struct jffs2_raw_node_ref *raw, struct jffs2_inode_info *f)
+{
+	struct jffs2_node_frag *frag;
+	struct jffs2_full_dnode *fn = NULL;
+	struct jffs2_full_dirent *fd;
+	uint32_t start = 0, end = 0, nrfrags = 0;
+	int ret = 0;
+
+	mutex_lock(&f->sem);
+
+	/* Now we have the lock for this inode. Check that it's still the one at the head
+	   of the list. */
+
+	spin_lock(&c->erase_completion_lock);
+
+	if (c->gcblock != jeb) {
+		spin_unlock(&c->erase_completion_lock);
+		jffs2_dbg(1, "GC block is no longer gcblock. Restart\n");
+		goto upnout;
+	}
+	if (ref_obsolete(raw)) {
+		spin_unlock(&c->erase_completion_lock);
+		jffs2_dbg(1, "node to be GC'd was obsoleted in the meantime.\n");
+		/* They'll call again */
+		goto upnout;
+	}
+	spin_unlock(&c->erase_completion_lock);
+
+	/* OK. Looks safe. And nobody can get us now because we have the semaphore. Move the block */
+	if (f->metadata && f->metadata->raw == raw) {
+		fn = f->metadata;
+		ret = jffs2_garbage_collect_metadata(c, jeb, f, fn);
+		goto upnout;
+	}
+
+	/* FIXME. Read node and do lookup? */
+	for (frag = frag_first(&f->fragtree); frag; frag = frag_next(frag)) {
+		if (frag->node && frag->node->raw == raw) {
+			fn = frag->node;
+			end = frag->ofs + frag->size;
+			if (!nrfrags++)
+				start = frag->ofs;
+			if (nrfrags == frag->node->frags)
+				break; /* We've found them all */
+		}
+	}
+	if (fn) {
+		if (ref_flags(raw) == REF_PRISTINE) {
+			ret = jffs2_garbage_collect_pristine(c, f->inocache, raw);
+			if (!ret) {
+				/* Urgh. Return it sensibly. */
+				frag->node->raw = f->inocache->nodes;
+			}
+			if (ret != -EBADFD)
+				goto upnout;
+		}
+		/* We found a datanode. Do the GC */
+		if((start >> PAGE_CACHE_SHIFT) < ((end-1) >> PAGE_CACHE_SHIFT)) {
+			/* It crosses a page boundary. Therefore, it must be a hole. */
+			ret = jffs2_garbage_collect_hole(c, jeb, f, fn, start, end);
+		} else {
+			/* It could still be a hole. But we GC the page this way anyway */
+			ret = jffs2_garbage_collect_dnode(c, jeb, f, fn, start, end);
+		}
+		goto upnout;
+	}
+
+	/* Wasn't a dnode. Try dirent */
+	for (fd = f->dents; fd; fd=fd->next) {
+		if (fd->raw == raw)
+			break;
+	}
+
+	if (fd && fd->ino) {
+		ret = jffs2_garbage_collect_dirent(c, jeb, f, fd);
+	} else if (fd) {
+		ret = jffs2_garbage_collect_deletion_dirent(c, jeb, f, fd);
+	} else {
+		pr_warn("Raw node at 0x%08x wasn't in node lists for ino #%u\n",
+			ref_offset(raw), f->inocache->ino);
+		if (ref_obsolete(raw)) {
+			pr_warn("But it's obsolete so we don't mind too much\n");
+		} else {
+			jffs2_dbg_dump_node(c, ref_offset(raw));
+			BUG();
+		}
+	}
+ upnout:
+	mutex_unlock(&f->sem);
+
+	return ret;
+}
+
+static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
+					  struct jffs2_inode_cache *ic,
+					  struct jffs2_raw_node_ref *raw)
+{
+	union jffs2_node_union *node;
+	size_t retlen;
+	int ret;
+	uint32_t phys_ofs, alloclen;
+	uint32_t crc, rawlen;
+	int retried = 0;
+
+	jffs2_dbg(1, "Going to GC REF_PRISTINE node at 0x%08x\n",
+		  ref_offset(raw));
+
+	alloclen = rawlen = ref_totlen(c, c->gcblock, raw);
+
+	/* Ask for a small amount of space (or the totlen if smaller) because we
+	   don't want to force wastage of the end of a block if splitting would
+	   work. */
+	if (ic && alloclen > sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN)
+		alloclen = sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN;
+
+	ret = jffs2_reserve_space_gc(c, alloclen, &alloclen, rawlen);
+	/* 'rawlen' is not the exact summary size; it is only an upper estimation */
+
+	if (ret)
+		return ret;
+
+	if (alloclen < rawlen) {
+		/* Doesn't fit untouched. We'll go the old route and split it */
+		return -EBADFD;
+	}
+
+	node = kmalloc(rawlen, GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)node);
+	if (!ret && retlen != rawlen)
+		ret = -EIO;
+	if (ret)
+		goto out_node;
+
+	crc = crc32(0, node, sizeof(struct jffs2_unknown_node)-4);
+	if (je32_to_cpu(node->u.hdr_crc) != crc) {
+		pr_warn("Header CRC failed on REF_PRISTINE node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+			ref_offset(raw), je32_to_cpu(node->u.hdr_crc), crc);
+		goto bail;
+	}
+
+	switch(je16_to_cpu(node->u.nodetype)) {
+	case JFFS2_NODETYPE_INODE:
+		crc = crc32(0, node, sizeof(node->i)-8);
+		if (je32_to_cpu(node->i.node_crc) != crc) {
+			pr_warn("Node CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+				ref_offset(raw), je32_to_cpu(node->i.node_crc),
+				crc);
+			goto bail;
+		}
+
+		if (je32_to_cpu(node->i.dsize)) {
+			crc = crc32(0, node->i.data, je32_to_cpu(node->i.csize));
+			if (je32_to_cpu(node->i.data_crc) != crc) {
+				pr_warn("Data CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+					ref_offset(raw),
+					je32_to_cpu(node->i.data_crc), crc);
+				goto bail;
+			}
+		}
+		break;
+
+	case JFFS2_NODETYPE_DIRENT:
+		crc = crc32(0, node, sizeof(node->d)-8);
+		if (je32_to_cpu(node->d.node_crc) != crc) {
+			pr_warn("Node CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+				ref_offset(raw),
+				je32_to_cpu(node->d.node_crc), crc);
+			goto bail;
+		}
+
+		if (strnlen(node->d.name, node->d.nsize) != node->d.nsize) {
+			pr_warn("Name in dirent node at 0x%08x contains zeroes\n",
+				ref_offset(raw));
+			goto bail;
+		}
+
+		if (node->d.nsize) {
+			crc = crc32(0, node->d.name, node->d.nsize);
+			if (je32_to_cpu(node->d.name_crc) != crc) {
+				pr_warn("Name CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+					ref_offset(raw),
+					je32_to_cpu(node->d.name_crc), crc);
+				goto bail;
+			}
+		}
+		break;
+	default:
+		/* If it's inode-less, we don't _know_ what it is. Just copy it intact */
+		if (ic) {
+			pr_warn("Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
+				ref_offset(raw), je16_to_cpu(node->u.nodetype));
+			goto bail;
+		}
+	}
+
+	/* OK, all the CRCs are good; this node can just be copied as-is. */
+ retry:
+	phys_ofs = write_ofs(c);
+
+	ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node);
+
+	if (ret || (retlen != rawlen)) {
+		pr_notice("Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n",
+			  rawlen, phys_ofs, ret, retlen);
+		if (retlen) {
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL);
+		} else {
+			pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n",
+				  phys_ofs);
+		}
+		if (!retried) {
+			/* Try to reallocate space and retry */
+			uint32_t dummy;
+			struct jffs2_eraseblock *jeb = &c->blocks[phys_ofs / c->sector_size];
+
+			retried = 1;
+
+			jffs2_dbg(1, "Retrying failed write of REF_PRISTINE node.\n");
+
+			jffs2_dbg_acct_sanity_check(c,jeb);
+			jffs2_dbg_acct_paranoia_check(c, jeb);
+
+			ret = jffs2_reserve_space_gc(c, rawlen, &dummy, rawlen);
+						/* this is not the exact summary size of it,
+							it is only an upper estimation */
+
+			if (!ret) {
+				jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write.\n",
+					  phys_ofs);
+
+				jffs2_dbg_acct_sanity_check(c,jeb);
+				jffs2_dbg_acct_paranoia_check(c, jeb);
+
+				goto retry;
+			}
+			jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n",
+				  ret);
+		}
+
+		if (!ret)
+			ret = -EIO;
+		goto out_node;
+	}
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic);
+
+	jffs2_mark_node_obsolete(c, raw);
+	jffs2_dbg(1, "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n",
+		  ref_offset(raw));
+
+ out_node:
+	kfree(node);
+	return ret;
+ bail:
+	ret = -EBADFD;
+	goto out_node;
+}
+
+static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+					struct jffs2_inode_info *f, struct jffs2_full_dnode *fn)
+{
+	struct jffs2_full_dnode *new_fn;
+	struct jffs2_raw_inode ri;
+	struct jffs2_node_frag *last_frag;
+	union jffs2_device_node dev;
+	char *mdata = NULL;
+	int mdatalen = 0;
+	uint32_t alloclen, ilen;
+	int ret;
+
+	if (S_ISBLK(JFFS2_F_I_MODE(f)) ||
+	    S_ISCHR(JFFS2_F_I_MODE(f)) ) {
+		/* For these, we don't actually need to read the old node */
+		mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f));
+		mdata = (char *)&dev;
+		jffs2_dbg(1, "%s(): Writing %d bytes of kdev_t\n",
+			  __func__, mdatalen);
+	} else if (S_ISLNK(JFFS2_F_I_MODE(f))) {
+		mdatalen = fn->size;
+		mdata = kmalloc(fn->size, GFP_KERNEL);
+		if (!mdata) {
+			pr_warn("kmalloc of mdata failed in jffs2_garbage_collect_metadata()\n");
+			return -ENOMEM;
+		}
+		ret = jffs2_read_dnode(c, f, fn, mdata, 0, mdatalen);
+		if (ret) {
+			pr_warn("read of old metadata failed in jffs2_garbage_collect_metadata(): %d\n",
+				ret);
+			kfree(mdata);
+			return ret;
+		}
+		jffs2_dbg(1, "%s(): Writing %d bites of symlink target\n",
+			  __func__, mdatalen);
+
+	}
+
+	ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen,
+				JFFS2_SUMMARY_INODE_SIZE);
+	if (ret) {
+		pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n",
+			sizeof(ri) + mdatalen, ret);
+		goto out;
+	}
+
+	last_frag = frag_last(&f->fragtree);
+	if (last_frag)
+		/* Fetch the inode length from the fragtree rather then
+		 * from i_size since i_size may have not been updated yet */
+		ilen = last_frag->ofs + last_frag->size;
+	else
+		ilen = JFFS2_F_I_SIZE(f);
+
+	memset(&ri, 0, sizeof(ri));
+	ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE);
+	ri.totlen = cpu_to_je32(sizeof(ri) + mdatalen);
+	ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4));
+
+	ri.ino = cpu_to_je32(f->inocache->ino);
+	ri.version = cpu_to_je32(++f->highest_version);
+	ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f));
+	ri.uid = cpu_to_je16(JFFS2_F_I_UID(f));
+	ri.gid = cpu_to_je16(JFFS2_F_I_GID(f));
+	ri.isize = cpu_to_je32(ilen);
+	ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f));
+	ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f));
+	ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f));
+	ri.offset = cpu_to_je32(0);
+	ri.csize = cpu_to_je32(mdatalen);
+	ri.dsize = cpu_to_je32(mdatalen);
+	ri.compr = JFFS2_COMPR_NONE;
+	ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
+	ri.data_crc = cpu_to_je32(crc32(0, mdata, mdatalen));
+
+	new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC);
+
+	if (IS_ERR(new_fn)) {
+		pr_warn("Error writing new dnode: %ld\n", PTR_ERR(new_fn));
+		ret = PTR_ERR(new_fn);
+		goto out;
+	}
+	jffs2_mark_node_obsolete(c, fn->raw);
+	jffs2_free_full_dnode(fn);
+	f->metadata = new_fn;
+ out:
+	if (S_ISLNK(JFFS2_F_I_MODE(f)))
+		kfree(mdata);
+	return ret;
+}
+
+static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+					struct jffs2_inode_info *f, struct jffs2_full_dirent *fd)
+{
+	struct jffs2_full_dirent *new_fd;
+	struct jffs2_raw_dirent rd;
+	uint32_t alloclen;
+	int ret;
+
+	rd.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rd.nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
+	rd.nsize = strlen(fd->name);
+	rd.totlen = cpu_to_je32(sizeof(rd) + rd.nsize);
+	rd.hdr_crc = cpu_to_je32(crc32(0, &rd, sizeof(struct jffs2_unknown_node)-4));
+
+	rd.pino = cpu_to_je32(f->inocache->ino);
+	rd.version = cpu_to_je32(++f->highest_version);
+	rd.ino = cpu_to_je32(fd->ino);
+	/* If the times on this inode were set by explicit utime() they can be different,
+	   so refrain from splatting them. */
+	if (JFFS2_F_I_MTIME(f) == JFFS2_F_I_CTIME(f))
+		rd.mctime = cpu_to_je32(JFFS2_F_I_MTIME(f));
+	else
+		rd.mctime = cpu_to_je32(0);
+	rd.type = fd->type;
+	rd.node_crc = cpu_to_je32(crc32(0, &rd, sizeof(rd)-8));
+	rd.name_crc = cpu_to_je32(crc32(0, fd->name, rd.nsize));
+
+	ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen,
+				JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize));
+	if (ret) {
+		pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n",
+			sizeof(rd)+rd.nsize, ret);
+		return ret;
+	}
+	new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC);
+
+	if (IS_ERR(new_fd)) {
+		pr_warn("jffs2_write_dirent in garbage_collect_dirent failed: %ld\n",
+			PTR_ERR(new_fd));
+		return PTR_ERR(new_fd);
+	}
+	jffs2_add_fd_to_list(c, new_fd, &f->dents);
+	return 0;
+}
+
+static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+					struct jffs2_inode_info *f, struct jffs2_full_dirent *fd)
+{
+	struct jffs2_full_dirent **fdp = &f->dents;
+	int found = 0;
+
+	/* On a medium where we can't actually mark nodes obsolete
+	   pernamently, such as NAND flash, we need to work out
+	   whether this deletion dirent is still needed to actively
+	   delete a 'real' dirent with the same name that's still
+	   somewhere else on the flash. */
+	if (!jffs2_can_mark_obsolete(c)) {
+		struct jffs2_raw_dirent *rd;
+		struct jffs2_raw_node_ref *raw;
+		int ret;
+		size_t retlen;
+		int name_len = strlen(fd->name);
+		uint32_t name_crc = crc32(0, fd->name, name_len);
+		uint32_t rawlen = ref_totlen(c, jeb, fd->raw);
+
+		rd = kmalloc(rawlen, GFP_KERNEL);
+		if (!rd)
+			return -ENOMEM;
+
+		/* Prevent the erase code from nicking the obsolete node refs while
+		   we're looking at them. I really don't like this extra lock but
+		   can't see any alternative. Suggestions on a postcard to... */
+		mutex_lock(&c->erase_free_sem);
+
+		for (raw = f->inocache->nodes; raw != (void *)f->inocache; raw = raw->next_in_ino) {
+
+			cond_resched();
+
+			/* We only care about obsolete ones */
+			if (!(ref_obsolete(raw)))
+				continue;
+
+			/* Any dirent with the same name is going to have the same length... */
+			if (ref_totlen(c, NULL, raw) != rawlen)
+				continue;
+
+			/* Doesn't matter if there's one in the same erase block. We're going to
+			   delete it too at the same time. */
+			if (SECTOR_ADDR(raw->flash_offset) == SECTOR_ADDR(fd->raw->flash_offset))
+				continue;
+
+			jffs2_dbg(1, "Check potential deletion dirent at %08x\n",
+				  ref_offset(raw));
+
+			/* This is an obsolete node belonging to the same directory, and it's of the right
+			   length. We need to take a closer look...*/
+			ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)rd);
+			if (ret) {
+				pr_warn("%s(): Read error (%d) reading obsolete node at %08x\n",
+					__func__, ret, ref_offset(raw));
+				/* If we can't read it, we don't need to continue to obsolete it. Continue */
+				continue;
+			}
+			if (retlen != rawlen) {
+				pr_warn("%s(): Short read (%zd not %u) reading header from obsolete node at %08x\n",
+					__func__, retlen, rawlen,
+					ref_offset(raw));
+				continue;
+			}
+
+			if (je16_to_cpu(rd->nodetype) != JFFS2_NODETYPE_DIRENT)
+				continue;
+
+			/* If the name CRC doesn't match, skip */
+			if (je32_to_cpu(rd->name_crc) != name_crc)
+				continue;
+
+			/* If the name length doesn't match, or it's another deletion dirent, skip */
+			if (rd->nsize != name_len || !je32_to_cpu(rd->ino))
+				continue;
+
+			/* OK, check the actual name now */
+			if (memcmp(rd->name, fd->name, name_len))
+				continue;
+
+			/* OK. The name really does match. There really is still an older node on
+			   the flash which our deletion dirent obsoletes. So we have to write out
+			   a new deletion dirent to replace it */
+			mutex_unlock(&c->erase_free_sem);
+
+			jffs2_dbg(1, "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n",
+				  ref_offset(fd->raw), fd->name,
+				  ref_offset(raw), je32_to_cpu(rd->ino));
+			kfree(rd);
+
+			return jffs2_garbage_collect_dirent(c, jeb, f, fd);
+		}
+
+		mutex_unlock(&c->erase_free_sem);
+		kfree(rd);
+	}
+
+	/* FIXME: If we're deleting a dirent which contains the current mtime and ctime,
+	   we should update the metadata node with those times accordingly */
+
+	/* No need for it any more. Just mark it obsolete and remove it from the list */
+	while (*fdp) {
+		if ((*fdp) == fd) {
+			found = 1;
+			*fdp = fd->next;
+			break;
+		}
+		fdp = &(*fdp)->next;
+	}
+	if (!found) {
+		pr_warn("Deletion dirent \"%s\" not found in list for ino #%u\n",
+			fd->name, f->inocache->ino);
+	}
+	jffs2_mark_node_obsolete(c, fd->raw);
+	jffs2_free_full_dirent(fd);
+	return 0;
+}
+
+static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				      struct jffs2_inode_info *f, struct jffs2_full_dnode *fn,
+				      uint32_t start, uint32_t end)
+{
+	struct jffs2_raw_inode ri;
+	struct jffs2_node_frag *frag;
+	struct jffs2_full_dnode *new_fn;
+	uint32_t alloclen, ilen;
+	int ret;
+
+	jffs2_dbg(1, "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n",
+		  f->inocache->ino, start, end);
+
+	memset(&ri, 0, sizeof(ri));
+
+	if(fn->frags > 1) {
+		size_t readlen;
+		uint32_t crc;
+		/* It's partially obsoleted by a later write. So we have to
+		   write it out again with the _same_ version as before */
+		ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(ri), &readlen, (char *)&ri);
+		if (readlen != sizeof(ri) || ret) {
+			pr_warn("Node read failed in jffs2_garbage_collect_hole. Ret %d, retlen %zd. Data will be lost by writing new hole node\n",
+				ret, readlen);
+			goto fill;
+		}
+		if (je16_to_cpu(ri.nodetype) != JFFS2_NODETYPE_INODE) {
+			pr_warn("%s(): Node at 0x%08x had node type 0x%04x instead of JFFS2_NODETYPE_INODE(0x%04x)\n",
+				__func__, ref_offset(fn->raw),
+				je16_to_cpu(ri.nodetype), JFFS2_NODETYPE_INODE);
+			return -EIO;
+		}
+		if (je32_to_cpu(ri.totlen) != sizeof(ri)) {
+			pr_warn("%s(): Node at 0x%08x had totlen 0x%x instead of expected 0x%zx\n",
+				__func__, ref_offset(fn->raw),
+				je32_to_cpu(ri.totlen), sizeof(ri));
+			return -EIO;
+		}
+		crc = crc32(0, &ri, sizeof(ri)-8);
+		if (crc != je32_to_cpu(ri.node_crc)) {
+			pr_warn("%s: Node at 0x%08x had CRC 0x%08x which doesn't match calculated CRC 0x%08x\n",
+				__func__, ref_offset(fn->raw),
+				je32_to_cpu(ri.node_crc), crc);
+			/* FIXME: We could possibly deal with this by writing new holes for each frag */
+			pr_warn("Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n",
+				start, end, f->inocache->ino);
+			goto fill;
+		}
+		if (ri.compr != JFFS2_COMPR_ZERO) {
+			pr_warn("%s(): Node 0x%08x wasn't a hole node!\n",
+				__func__, ref_offset(fn->raw));
+			pr_warn("Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n",
+				start, end, f->inocache->ino);
+			goto fill;
+		}
+	} else {
+	fill:
+		ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+		ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE);
+		ri.totlen = cpu_to_je32(sizeof(ri));
+		ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4));
+
+		ri.ino = cpu_to_je32(f->inocache->ino);
+		ri.version = cpu_to_je32(++f->highest_version);
+		ri.offset = cpu_to_je32(start);
+		ri.dsize = cpu_to_je32(end - start);
+		ri.csize = cpu_to_je32(0);
+		ri.compr = JFFS2_COMPR_ZERO;
+	}
+
+	frag = frag_last(&f->fragtree);
+	if (frag)
+		/* Fetch the inode length from the fragtree rather then
+		 * from i_size since i_size may have not been updated yet */
+		ilen = frag->ofs + frag->size;
+	else
+		ilen = JFFS2_F_I_SIZE(f);
+
+	ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f));
+	ri.uid = cpu_to_je16(JFFS2_F_I_UID(f));
+	ri.gid = cpu_to_je16(JFFS2_F_I_GID(f));
+	ri.isize = cpu_to_je32(ilen);
+	ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f));
+	ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f));
+	ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f));
+	ri.data_crc = cpu_to_je32(0);
+	ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
+
+	ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen,
+				     JFFS2_SUMMARY_INODE_SIZE);
+	if (ret) {
+		pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n",
+			sizeof(ri), ret);
+		return ret;
+	}
+	new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC);
+
+	if (IS_ERR(new_fn)) {
+		pr_warn("Error writing new hole node: %ld\n", PTR_ERR(new_fn));
+		return PTR_ERR(new_fn);
+	}
+	if (je32_to_cpu(ri.version) == f->highest_version) {
+		jffs2_add_full_dnode_to_inode(c, f, new_fn);
+		if (f->metadata) {
+			jffs2_mark_node_obsolete(c, f->metadata->raw);
+			jffs2_free_full_dnode(f->metadata);
+			f->metadata = NULL;
+		}
+		return 0;
+	}
+
+	/*
+	 * We should only get here in the case where the node we are
+	 * replacing had more than one frag, so we kept the same version
+	 * number as before. (Except in case of error -- see 'goto fill;'
+	 * above.)
+	 */
+	D1(if(unlikely(fn->frags <= 1)) {
+			pr_warn("%s(): Replacing fn with %d frag(s) but new ver %d != highest_version %d of ino #%d\n",
+				__func__, fn->frags, je32_to_cpu(ri.version),
+				f->highest_version, je32_to_cpu(ri.ino));
+	});
+
+	/* This is a partially-overlapped hole node. Mark it REF_NORMAL not REF_PRISTINE */
+	mark_ref_normal(new_fn->raw);
+
+	for (frag = jffs2_lookup_node_frag(&f->fragtree, fn->ofs);
+	     frag; frag = frag_next(frag)) {
+		if (frag->ofs > fn->size + fn->ofs)
+			break;
+		if (frag->node == fn) {
+			frag->node = new_fn;
+			new_fn->frags++;
+			fn->frags--;
+		}
+	}
+	if (fn->frags) {
+		pr_warn("%s(): Old node still has frags!\n", __func__);
+		BUG();
+	}
+	if (!new_fn->frags) {
+		pr_warn("%s(): New node has no frags!\n", __func__);
+		BUG();
+	}
+
+	jffs2_mark_node_obsolete(c, fn->raw);
+	jffs2_free_full_dnode(fn);
+
+	return 0;
+}
+
+static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *orig_jeb,
+				       struct jffs2_inode_info *f, struct jffs2_full_dnode *fn,
+				       uint32_t start, uint32_t end)
+{
+	struct jffs2_full_dnode *new_fn;
+	struct jffs2_raw_inode ri;
+	uint32_t alloclen, offset, orig_end, orig_start;
+	int ret = 0;
+	unsigned char *comprbuf = NULL, *writebuf;
+	unsigned long pg;
+	unsigned char *pg_ptr;
+
+	memset(&ri, 0, sizeof(ri));
+
+	jffs2_dbg(1, "Writing replacement dnode for ino #%u from offset 0x%x to 0x%x\n",
+		  f->inocache->ino, start, end);
+
+	orig_end = end;
+	orig_start = start;
+
+	if (c->nr_free_blocks + c->nr_erasing_blocks > c->resv_blocks_gcmerge) {
+		/* Attempt to do some merging. But only expand to cover logically
+		   adjacent frags if the block containing them is already considered
+		   to be dirty. Otherwise we end up with GC just going round in
+		   circles dirtying the nodes it already wrote out, especially
+		   on NAND where we have small eraseblocks and hence a much higher
+		   chance of nodes having to be split to cross boundaries. */
+
+		struct jffs2_node_frag *frag;
+		uint32_t min, max;
+
+		min = start & ~(PAGE_CACHE_SIZE-1);
+		max = min + PAGE_CACHE_SIZE;
+
+		frag = jffs2_lookup_node_frag(&f->fragtree, start);
+
+		/* BUG_ON(!frag) but that'll happen anyway... */
+
+		BUG_ON(frag->ofs != start);
+
+		/* First grow down... */
+		while((frag = frag_prev(frag)) && frag->ofs >= min) {
+
+			/* If the previous frag doesn't even reach the beginning, there's
+			   excessive fragmentation. Just merge. */
+			if (frag->ofs > min) {
+				jffs2_dbg(1, "Expanding down to cover partial frag (0x%x-0x%x)\n",
+					  frag->ofs, frag->ofs+frag->size);
+				start = frag->ofs;
+				continue;
+			}
+			/* OK. This frag holds the first byte of the page. */
+			if (!frag->node || !frag->node->raw) {
+				jffs2_dbg(1, "First frag in page is hole (0x%x-0x%x). Not expanding down.\n",
+					  frag->ofs, frag->ofs+frag->size);
+				break;
+			} else {
+
+				/* OK, it's a frag which extends to the beginning of the page. Does it live
+				   in a block which is still considered clean? If so, don't obsolete it.
+				   If not, cover it anyway. */
+
+				struct jffs2_raw_node_ref *raw = frag->node->raw;
+				struct jffs2_eraseblock *jeb;
+
+				jeb = &c->blocks[raw->flash_offset / c->sector_size];
+
+				if (jeb == c->gcblock) {
+					jffs2_dbg(1, "Expanding down to cover frag (0x%x-0x%x) in gcblock at %08x\n",
+						  frag->ofs,
+						  frag->ofs + frag->size,
+						  ref_offset(raw));
+					start = frag->ofs;
+					break;
+				}
+				if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) {
+					jffs2_dbg(1, "Not expanding down to cover frag (0x%x-0x%x) in clean block %08x\n",
+						  frag->ofs,
+						  frag->ofs + frag->size,
+						  jeb->offset);
+					break;
+				}
+
+				jffs2_dbg(1, "Expanding down to cover frag (0x%x-0x%x) in dirty block %08x\n",
+					  frag->ofs,
+					  frag->ofs + frag->size,
+					  jeb->offset);
+				start = frag->ofs;
+				break;
+			}
+		}
+
+		/* ... then up */
+
+		/* Find last frag which is actually part of the node we're to GC. */
+		frag = jffs2_lookup_node_frag(&f->fragtree, end-1);
+
+		while((frag = frag_next(frag)) && frag->ofs+frag->size <= max) {
+
+			/* If the previous frag doesn't even reach the beginning, there's lots
+			   of fragmentation. Just merge. */
+			if (frag->ofs+frag->size < max) {
+				jffs2_dbg(1, "Expanding up to cover partial frag (0x%x-0x%x)\n",
+					  frag->ofs, frag->ofs+frag->size);
+				end = frag->ofs + frag->size;
+				continue;
+			}
+
+			if (!frag->node || !frag->node->raw) {
+				jffs2_dbg(1, "Last frag in page is hole (0x%x-0x%x). Not expanding up.\n",
+					  frag->ofs, frag->ofs+frag->size);
+				break;
+			} else {
+
+				/* OK, it's a frag which extends to the beginning of the page. Does it live
+				   in a block which is still considered clean? If so, don't obsolete it.
+				   If not, cover it anyway. */
+
+				struct jffs2_raw_node_ref *raw = frag->node->raw;
+				struct jffs2_eraseblock *jeb;
+
+				jeb = &c->blocks[raw->flash_offset / c->sector_size];
+
+				if (jeb == c->gcblock) {
+					jffs2_dbg(1, "Expanding up to cover frag (0x%x-0x%x) in gcblock at %08x\n",
+						  frag->ofs,
+						  frag->ofs + frag->size,
+						  ref_offset(raw));
+					end = frag->ofs + frag->size;
+					break;
+				}
+				if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) {
+					jffs2_dbg(1, "Not expanding up to cover frag (0x%x-0x%x) in clean block %08x\n",
+						  frag->ofs,
+						  frag->ofs + frag->size,
+						  jeb->offset);
+					break;
+				}
+
+				jffs2_dbg(1, "Expanding up to cover frag (0x%x-0x%x) in dirty block %08x\n",
+					  frag->ofs,
+					  frag->ofs + frag->size,
+					  jeb->offset);
+				end = frag->ofs + frag->size;
+				break;
+			}
+		}
+		jffs2_dbg(1, "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n",
+			  orig_start, orig_end, start, end);
+
+		D1(BUG_ON(end > frag_last(&f->fragtree)->ofs + frag_last(&f->fragtree)->size));
+		BUG_ON(end < orig_end);
+		BUG_ON(start > orig_start);
+	}
+
+	/* First, use readpage() to read the appropriate page into the page cache */
+	/* Q: What happens if we actually try to GC the _same_ page for which commit_write()
+	 *    triggered garbage collection in the first place?
+	 * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the
+	 *    page OK. We'll actually write it out again in commit_write, which is a little
+	 *    suboptimal, but at least we're correct.
+	 */
+	pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
+
+	if (IS_ERR(pg_ptr)) {
+		pr_warn("read_cache_page() returned error: %ld\n",
+			PTR_ERR(pg_ptr));
+		return PTR_ERR(pg_ptr);
+	}
+
+	offset = start;
+	while(offset < orig_end) {
+		uint32_t datalen;
+		uint32_t cdatalen;
+		uint16_t comprtype = JFFS2_COMPR_NONE;
+
+		ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN,
+					&alloclen, JFFS2_SUMMARY_INODE_SIZE);
+
+		if (ret) {
+			pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_dnode failed: %d\n",
+				sizeof(ri) + JFFS2_MIN_DATA_LEN, ret);
+			break;
+		}
+		cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset);
+		datalen = end - offset;
+
+		writebuf = pg_ptr + (offset & (PAGE_CACHE_SIZE -1));
+
+		comprtype = jffs2_compress(c, f, writebuf, &comprbuf, &datalen, &cdatalen);
+
+		ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+		ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE);
+		ri.totlen = cpu_to_je32(sizeof(ri) + cdatalen);
+		ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4));
+
+		ri.ino = cpu_to_je32(f->inocache->ino);
+		ri.version = cpu_to_je32(++f->highest_version);
+		ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f));
+		ri.uid = cpu_to_je16(JFFS2_F_I_UID(f));
+		ri.gid = cpu_to_je16(JFFS2_F_I_GID(f));
+		ri.isize = cpu_to_je32(JFFS2_F_I_SIZE(f));
+		ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f));
+		ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f));
+		ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f));
+		ri.offset = cpu_to_je32(offset);
+		ri.csize = cpu_to_je32(cdatalen);
+		ri.dsize = cpu_to_je32(datalen);
+		ri.compr = comprtype & 0xff;
+		ri.usercompr = (comprtype >> 8) & 0xff;
+		ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
+		ri.data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen));
+
+		new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, ALLOC_GC);
+
+		jffs2_free_comprbuf(comprbuf, writebuf);
+
+		if (IS_ERR(new_fn)) {
+			pr_warn("Error writing new dnode: %ld\n",
+				PTR_ERR(new_fn));
+			ret = PTR_ERR(new_fn);
+			break;
+		}
+		ret = jffs2_add_full_dnode_to_inode(c, f, new_fn);
+		offset += datalen;
+		if (f->metadata) {
+			jffs2_mark_node_obsolete(c, f->metadata->raw);
+			jffs2_free_full_dnode(f->metadata);
+			f->metadata = NULL;
+		}
+	}
+
+	jffs2_gc_release_page(c, pg_ptr, &pg);
+	return ret;
+}
diff --git a/kernel/fs/jffs2/ioctl.c b/kernel/fs/jffs2/ioctl.c
new file mode 100644
index 000000000..859a598af
--- /dev/null
+++ b/kernel/fs/jffs2/ioctl.c
@@ -0,0 +1,22 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#include <linux/fs.h>
+#include "nodelist.h"
+
+long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	/* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which
+	   will include compression support etc. */
+	return -ENOTTY;
+}
+
diff --git a/kernel/fs/jffs2/jffs2_fs_i.h b/kernel/fs/jffs2/jffs2_fs_i.h
new file mode 100644
index 000000000..2e4a86763
--- /dev/null
+++ b/kernel/fs/jffs2/jffs2_fs_i.h
@@ -0,0 +1,56 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#ifndef _JFFS2_FS_I
+#define _JFFS2_FS_I
+
+#include <linux/rbtree.h>
+#include <linux/posix_acl.h>
+#include <linux/mutex.h>
+
+struct jffs2_inode_info {
+	/* We need an internal mutex similar to inode->i_mutex.
+	   Unfortunately, we can't used the existing one, because
+	   either the GC would deadlock, or we'd have to release it
+	   before letting GC proceed. Or we'd have to put ugliness
+	   into the GC code so it didn't attempt to obtain the i_mutex
+	   for the inode(s) which are already locked */
+	struct mutex sem;
+
+	/* The highest (datanode) version number used for this ino */
+	uint32_t highest_version;
+
+	/* List of data fragments which make up the file */
+	struct rb_root fragtree;
+
+	/* There may be one datanode which isn't referenced by any of the
+	   above fragments, if it contains a metadata update but no actual
+	   data - or if this is a directory inode */
+	/* This also holds the _only_ dnode for symlinks/device nodes,
+	   etc. */
+	struct jffs2_full_dnode *metadata;
+
+	/* Directory entries */
+	struct jffs2_full_dirent *dents;
+
+	/* The target path if this is the inode of a symlink */
+	unsigned char *target;
+
+	/* Some stuff we just have to keep in-core at all times, for each inode. */
+	struct jffs2_inode_cache *inocache;
+
+	uint16_t flags;
+	uint8_t usercompr;
+	struct inode vfs_inode;
+};
+
+#endif /* _JFFS2_FS_I */
diff --git a/kernel/fs/jffs2/jffs2_fs_sb.h b/kernel/fs/jffs2/jffs2_fs_sb.h
new file mode 100644
index 000000000..046fee8b6
--- /dev/null
+++ b/kernel/fs/jffs2/jffs2_fs_sb.h
@@ -0,0 +1,162 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#ifndef _JFFS2_FS_SB
+#define _JFFS2_FS_SB
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+
+#define JFFS2_SB_FLAG_RO 1
+#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */
+#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */
+
+struct jffs2_inodirty;
+
+struct jffs2_mount_opts {
+	bool override_compr;
+	unsigned int compr;
+
+	/* The size of the reserved pool. The reserved pool is the JFFS2 flash
+	 * space which may only be used by root cannot be used by the other
+	 * users. This is implemented simply by means of not allowing the
+	 * latter users to write to the file system if the amount if the
+	 * available space is less then 'rp_size'. */
+	unsigned int rp_size;
+};
+
+/* A struct for the overall file system control.  Pointers to
+   jffs2_sb_info structs are named `c' in the source code.
+   Nee jffs_control
+*/
+struct jffs2_sb_info {
+	struct mtd_info *mtd;
+
+	uint32_t highest_ino;
+	uint32_t checked_ino;
+
+	unsigned int flags;
+
+	struct task_struct *gc_task;	/* GC task struct */
+	struct completion gc_thread_start; /* GC thread start completion */
+	struct completion gc_thread_exit; /* GC thread exit completion port */
+
+	struct mutex alloc_sem;		/* Used to protect all the following
+					   fields, and also to protect against
+					   out-of-order writing of nodes. And GC. */
+	uint32_t cleanmarker_size;	/* Size of an _inline_ CLEANMARKER
+					 (i.e. zero for OOB CLEANMARKER */
+
+	uint32_t flash_size;
+	uint32_t used_size;
+	uint32_t dirty_size;
+	uint32_t wasted_size;
+	uint32_t free_size;
+	uint32_t erasing_size;
+	uint32_t bad_size;
+	uint32_t sector_size;
+	uint32_t unchecked_size;
+
+	uint32_t nr_free_blocks;
+	uint32_t nr_erasing_blocks;
+
+	/* Number of free blocks there must be before we... */
+	uint8_t resv_blocks_write;	/* ... allow a normal filesystem write */
+	uint8_t resv_blocks_deletion;	/* ... allow a normal filesystem deletion */
+	uint8_t resv_blocks_gctrigger;	/* ... wake up the GC thread */
+	uint8_t resv_blocks_gcbad;	/* ... pick a block from the bad_list to GC */
+	uint8_t resv_blocks_gcmerge;	/* ... merge pages when garbage collecting */
+	/* Number of 'very dirty' blocks before we trigger immediate GC */
+	uint8_t vdirty_blocks_gctrigger;
+
+	uint32_t nospc_dirty_size;
+
+	uint32_t nr_blocks;
+	struct jffs2_eraseblock *blocks;	/* The whole array of blocks. Used for getting blocks
+						 * from the offset (blocks[ofs / sector_size]) */
+	struct jffs2_eraseblock *nextblock;	/* The block we're currently filling */
+
+	struct jffs2_eraseblock *gcblock;	/* The block we're currently garbage-collecting */
+
+	struct list_head clean_list;		/* Blocks 100% full of clean data */
+	struct list_head very_dirty_list;	/* Blocks with lots of dirty space */
+	struct list_head dirty_list;		/* Blocks with some dirty space */
+	struct list_head erasable_list;		/* Blocks which are completely dirty, and need erasing */
+	struct list_head erasable_pending_wbuf_list;	/* Blocks which need erasing but only after the current wbuf is flushed */
+	struct list_head erasing_list;		/* Blocks which are currently erasing */
+	struct list_head erase_checking_list;	/* Blocks which are being checked and marked */
+	struct list_head erase_pending_list;	/* Blocks which need erasing now */
+	struct list_head erase_complete_list;	/* Blocks which are erased and need the clean marker written to them */
+	struct list_head free_list;		/* Blocks which are free and ready to be used */
+	struct list_head bad_list;		/* Bad blocks. */
+	struct list_head bad_used_list;		/* Bad blocks with valid data in. */
+
+	spinlock_t erase_completion_lock;	/* Protect free_list and erasing_list
+						   against erase completion handler */
+	wait_queue_head_t erase_wait;		/* For waiting for erases to complete */
+
+	wait_queue_head_t inocache_wq;
+	int inocache_hashsize;
+	struct jffs2_inode_cache **inocache_list;
+	spinlock_t inocache_lock;
+
+	/* Sem to allow jffs2_garbage_collect_deletion_dirent to
+	   drop the erase_completion_lock while it's holding a pointer
+	   to an obsoleted node. I don't like this. Alternatives welcomed. */
+	struct mutex erase_free_sem;
+
+	uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */
+
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	unsigned char *wbuf_verify; /* read-back buffer for verification */
+#endif
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	unsigned char *wbuf; /* Write-behind buffer for NAND flash */
+	uint32_t wbuf_ofs;
+	uint32_t wbuf_len;
+	struct jffs2_inodirty *wbuf_inodes;
+	struct rw_semaphore wbuf_sem;	/* Protects the write buffer */
+
+	struct delayed_work wbuf_dwork; /* write-buffer write-out work */
+
+	unsigned char *oobbuf;
+	int oobavail; /* How many bytes are available for JFFS2 in OOB */
+#endif
+
+	struct jffs2_summary *summary;		/* Summary information */
+	struct jffs2_mount_opts mount_opts;
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+#define XATTRINDEX_HASHSIZE	(57)
+	uint32_t highest_xid;
+	uint32_t highest_xseqno;
+	struct list_head xattrindex[XATTRINDEX_HASHSIZE];
+	struct list_head xattr_unchecked;
+	struct list_head xattr_dead_list;
+	struct jffs2_xattr_ref *xref_dead_list;
+	struct jffs2_xattr_ref *xref_temp;
+	struct rw_semaphore xattr_sem;
+	uint32_t xdatum_mem_usage;
+	uint32_t xdatum_mem_threshold;
+#endif
+	/* OS-private pointer for getting back to master superblock info */
+	void *os_priv;
+};
+
+#endif /* _JFFS2_FS_SB */
diff --git a/kernel/fs/jffs2/malloc.c b/kernel/fs/jffs2/malloc.c
new file mode 100644
index 000000000..b8fd65130
--- /dev/null
+++ b/kernel/fs/jffs2/malloc.c
@@ -0,0 +1,324 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/jffs2.h>
+#include "nodelist.h"
+
+/* These are initialised to NULL in the kernel startup code.
+   If you're porting to other operating systems, beware */
+static struct kmem_cache *full_dnode_slab;
+static struct kmem_cache *raw_dirent_slab;
+static struct kmem_cache *raw_inode_slab;
+static struct kmem_cache *tmp_dnode_info_slab;
+static struct kmem_cache *raw_node_ref_slab;
+static struct kmem_cache *node_frag_slab;
+static struct kmem_cache *inode_cache_slab;
+#ifdef CONFIG_JFFS2_FS_XATTR
+static struct kmem_cache *xattr_datum_cache;
+static struct kmem_cache *xattr_ref_cache;
+#endif
+
+int __init jffs2_create_slab_caches(void)
+{
+	full_dnode_slab = kmem_cache_create("jffs2_full_dnode",
+					    sizeof(struct jffs2_full_dnode),
+					    0, 0, NULL);
+	if (!full_dnode_slab)
+		goto err;
+
+	raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent",
+					    sizeof(struct jffs2_raw_dirent),
+					    0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!raw_dirent_slab)
+		goto err;
+
+	raw_inode_slab = kmem_cache_create("jffs2_raw_inode",
+					   sizeof(struct jffs2_raw_inode),
+					   0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!raw_inode_slab)
+		goto err;
+
+	tmp_dnode_info_slab = kmem_cache_create("jffs2_tmp_dnode",
+						sizeof(struct jffs2_tmp_dnode_info),
+						0, 0, NULL);
+	if (!tmp_dnode_info_slab)
+		goto err;
+
+	raw_node_ref_slab = kmem_cache_create("jffs2_refblock",
+					      sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1),
+					      0, 0, NULL);
+	if (!raw_node_ref_slab)
+		goto err;
+
+	node_frag_slab = kmem_cache_create("jffs2_node_frag",
+					   sizeof(struct jffs2_node_frag),
+					   0, 0, NULL);
+	if (!node_frag_slab)
+		goto err;
+
+	inode_cache_slab = kmem_cache_create("jffs2_inode_cache",
+					     sizeof(struct jffs2_inode_cache),
+					     0, 0, NULL);
+	if (!inode_cache_slab)
+		goto err;
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+	xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum",
+					     sizeof(struct jffs2_xattr_datum),
+					     0, 0, NULL);
+	if (!xattr_datum_cache)
+		goto err;
+
+	xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref",
+					   sizeof(struct jffs2_xattr_ref),
+					   0, 0, NULL);
+	if (!xattr_ref_cache)
+		goto err;
+#endif
+
+	return 0;
+ err:
+	jffs2_destroy_slab_caches();
+	return -ENOMEM;
+}
+
+void jffs2_destroy_slab_caches(void)
+{
+	if(full_dnode_slab)
+		kmem_cache_destroy(full_dnode_slab);
+	if(raw_dirent_slab)
+		kmem_cache_destroy(raw_dirent_slab);
+	if(raw_inode_slab)
+		kmem_cache_destroy(raw_inode_slab);
+	if(tmp_dnode_info_slab)
+		kmem_cache_destroy(tmp_dnode_info_slab);
+	if(raw_node_ref_slab)
+		kmem_cache_destroy(raw_node_ref_slab);
+	if(node_frag_slab)
+		kmem_cache_destroy(node_frag_slab);
+	if(inode_cache_slab)
+		kmem_cache_destroy(inode_cache_slab);
+#ifdef CONFIG_JFFS2_FS_XATTR
+	if (xattr_datum_cache)
+		kmem_cache_destroy(xattr_datum_cache);
+	if (xattr_ref_cache)
+		kmem_cache_destroy(xattr_ref_cache);
+#endif
+}
+
+struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize)
+{
+	struct jffs2_full_dirent *ret;
+	ret = kmalloc(sizeof(struct jffs2_full_dirent) + namesize, GFP_KERNEL);
+	dbg_memalloc("%p\n", ret);
+	return ret;
+}
+
+void jffs2_free_full_dirent(struct jffs2_full_dirent *x)
+{
+	dbg_memalloc("%p\n", x);
+	kfree(x);
+}
+
+struct jffs2_full_dnode *jffs2_alloc_full_dnode(void)
+{
+	struct jffs2_full_dnode *ret;
+	ret = kmem_cache_alloc(full_dnode_slab, GFP_KERNEL);
+	dbg_memalloc("%p\n", ret);
+	return ret;
+}
+
+void jffs2_free_full_dnode(struct jffs2_full_dnode *x)
+{
+	dbg_memalloc("%p\n", x);
+	kmem_cache_free(full_dnode_slab, x);
+}
+
+struct jffs2_raw_dirent *jffs2_alloc_raw_dirent(void)
+{
+	struct jffs2_raw_dirent *ret;
+	ret = kmem_cache_alloc(raw_dirent_slab, GFP_KERNEL);
+	dbg_memalloc("%p\n", ret);
+	return ret;
+}
+
+void jffs2_free_raw_dirent(struct jffs2_raw_dirent *x)
+{
+	dbg_memalloc("%p\n", x);
+	kmem_cache_free(raw_dirent_slab, x);
+}
+
+struct jffs2_raw_inode *jffs2_alloc_raw_inode(void)
+{
+	struct jffs2_raw_inode *ret;
+	ret = kmem_cache_alloc(raw_inode_slab, GFP_KERNEL);
+	dbg_memalloc("%p\n", ret);
+	return ret;
+}
+
+void jffs2_free_raw_inode(struct jffs2_raw_inode *x)
+{
+	dbg_memalloc("%p\n", x);
+	kmem_cache_free(raw_inode_slab, x);
+}
+
+struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void)
+{
+	struct jffs2_tmp_dnode_info *ret;
+	ret = kmem_cache_alloc(tmp_dnode_info_slab, GFP_KERNEL);
+	dbg_memalloc("%p\n",
+		ret);
+	return ret;
+}
+
+void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x)
+{
+	dbg_memalloc("%p\n", x);
+	kmem_cache_free(tmp_dnode_info_slab, x);
+}
+
+static struct jffs2_raw_node_ref *jffs2_alloc_refblock(void)
+{
+	struct jffs2_raw_node_ref *ret;
+
+	ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL);
+	if (ret) {
+		int i = 0;
+		for (i=0; i < REFS_PER_BLOCK; i++) {
+			ret[i].flash_offset = REF_EMPTY_NODE;
+			ret[i].next_in_ino = NULL;
+		}
+		ret[i].flash_offset = REF_LINK_NODE;
+		ret[i].next_in_ino = NULL;
+	}
+	return ret;
+}
+
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb, int nr)
+{
+	struct jffs2_raw_node_ref **p, *ref;
+	int i = nr;
+
+	dbg_memalloc("%d\n", nr);
+
+	p = &jeb->last_node;
+	ref = *p;
+
+	dbg_memalloc("Reserving %d refs for block @0x%08x\n", nr, jeb->offset);
+
+	/* If jeb->last_node is really a valid node then skip over it */
+	if (ref && ref->flash_offset != REF_EMPTY_NODE)
+		ref++;
+
+	while (i) {
+		if (!ref) {
+			dbg_memalloc("Allocating new refblock linked from %p\n", p);
+			ref = *p = jffs2_alloc_refblock();
+			if (!ref)
+				return -ENOMEM;
+		}
+		if (ref->flash_offset == REF_LINK_NODE) {
+			p = &ref->next_in_ino;
+			ref = *p;
+			continue;
+		}
+		i--;
+		ref++;
+	}
+	jeb->allocated_refs = nr;
+
+	dbg_memalloc("Reserved %d refs for block @0x%08x, last_node is %p (%08x,%p)\n",
+		  nr, jeb->offset, jeb->last_node, jeb->last_node->flash_offset,
+		  jeb->last_node->next_in_ino);
+
+	return 0;
+}
+
+void jffs2_free_refblock(struct jffs2_raw_node_ref *x)
+{
+	dbg_memalloc("%p\n", x);
+	kmem_cache_free(raw_node_ref_slab, x);
+}
+
+struct jffs2_node_frag *jffs2_alloc_node_frag(void)
+{
+	struct jffs2_node_frag *ret;
+	ret = kmem_cache_alloc(node_frag_slab, GFP_KERNEL);
+	dbg_memalloc("%p\n", ret);
+	return ret;
+}
+
+void jffs2_free_node_frag(struct jffs2_node_frag *x)
+{
+	dbg_memalloc("%p\n", x);
+	kmem_cache_free(node_frag_slab, x);
+}
+
+struct jffs2_inode_cache *jffs2_alloc_inode_cache(void)
+{
+	struct jffs2_inode_cache *ret;
+	ret = kmem_cache_alloc(inode_cache_slab, GFP_KERNEL);
+	dbg_memalloc("%p\n", ret);
+	return ret;
+}
+
+void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
+{
+	dbg_memalloc("%p\n", x);
+	kmem_cache_free(inode_cache_slab, x);
+}
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
+{
+	struct jffs2_xattr_datum *xd;
+	xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
+	dbg_memalloc("%p\n", xd);
+	if (!xd)
+		return NULL;
+
+	xd->class = RAWNODE_CLASS_XATTR_DATUM;
+	xd->node = (void *)xd;
+	INIT_LIST_HEAD(&xd->xindex);
+	return xd;
+}
+
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
+{
+	dbg_memalloc("%p\n", xd);
+	kmem_cache_free(xattr_datum_cache, xd);
+}
+
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
+{
+	struct jffs2_xattr_ref *ref;
+	ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
+	dbg_memalloc("%p\n", ref);
+	if (!ref)
+		return NULL;
+
+	ref->class = RAWNODE_CLASS_XATTR_REF;
+	ref->node = (void *)ref;
+	return ref;
+}
+
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *ref)
+{
+	dbg_memalloc("%p\n", ref);
+	kmem_cache_free(xattr_ref_cache, ref);
+}
+#endif
diff --git a/kernel/fs/jffs2/nodelist.c b/kernel/fs/jffs2/nodelist.c
new file mode 100644
index 000000000..9a5449bc3
--- /dev/null
+++ b/kernel/fs/jffs2/nodelist.c
@@ -0,0 +1,755 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/mtd/mtd.h>
+#include <linux/rbtree.h>
+#include <linux/crc32.h>
+#include <linux/pagemap.h>
+#include "nodelist.h"
+
+static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c,
+				     struct jffs2_node_frag *this);
+
+void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list)
+{
+	struct jffs2_full_dirent **prev = list;
+
+	dbg_dentlist("add dirent \"%s\", ino #%u\n", new->name, new->ino);
+
+	while ((*prev) && (*prev)->nhash <= new->nhash) {
+		if ((*prev)->nhash == new->nhash && !strcmp((*prev)->name, new->name)) {
+			/* Duplicate. Free one */
+			if (new->version < (*prev)->version) {
+				dbg_dentlist("Eep! Marking new dirent node obsolete, old is \"%s\", ino #%u\n",
+					(*prev)->name, (*prev)->ino);
+				jffs2_mark_node_obsolete(c, new->raw);
+				jffs2_free_full_dirent(new);
+			} else {
+				dbg_dentlist("marking old dirent \"%s\", ino #%u obsolete\n",
+					(*prev)->name, (*prev)->ino);
+				new->next = (*prev)->next;
+				/* It may have been a 'placeholder' deletion dirent, 
+				   if jffs2_can_mark_obsolete() (see jffs2_do_unlink()) */
+				if ((*prev)->raw)
+					jffs2_mark_node_obsolete(c, ((*prev)->raw));
+				jffs2_free_full_dirent(*prev);
+				*prev = new;
+			}
+			return;
+		}
+		prev = &((*prev)->next);
+	}
+	new->next = *prev;
+	*prev = new;
+}
+
+uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size)
+{
+	struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size);
+
+	dbg_fragtree("truncating fragtree to 0x%08x bytes\n", size);
+
+	/* We know frag->ofs <= size. That's what lookup does for us */
+	if (frag && frag->ofs != size) {
+		if (frag->ofs+frag->size > size) {
+			frag->size = size - frag->ofs;
+		}
+		frag = frag_next(frag);
+	}
+	while (frag && frag->ofs >= size) {
+		struct jffs2_node_frag *next = frag_next(frag);
+
+		frag_erase(frag, list);
+		jffs2_obsolete_node_frag(c, frag);
+		frag = next;
+	}
+
+	if (size == 0)
+		return 0;
+
+	frag = frag_last(list);
+
+	/* Sanity check for truncation to longer than we started with... */
+	if (!frag)
+		return 0;
+	if (frag->ofs + frag->size < size)
+		return frag->ofs + frag->size;
+
+	/* If the last fragment starts at the RAM page boundary, it is
+	 * REF_PRISTINE irrespective of its size. */
+	if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) {
+		dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n",
+			frag->ofs, frag->ofs + frag->size);
+		frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE;
+	}
+	return size;
+}
+
+static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c,
+				     struct jffs2_node_frag *this)
+{
+	if (this->node) {
+		this->node->frags--;
+		if (!this->node->frags) {
+			/* The node has no valid frags left. It's totally obsoleted */
+			dbg_fragtree2("marking old node @0x%08x (0x%04x-0x%04x) obsolete\n",
+				ref_offset(this->node->raw), this->node->ofs, this->node->ofs+this->node->size);
+			jffs2_mark_node_obsolete(c, this->node->raw);
+			jffs2_free_full_dnode(this->node);
+		} else {
+			dbg_fragtree2("marking old node @0x%08x (0x%04x-0x%04x) REF_NORMAL. frags is %d\n",
+				ref_offset(this->node->raw), this->node->ofs, this->node->ofs+this->node->size, this->node->frags);
+			mark_ref_normal(this->node->raw);
+		}
+
+	}
+	jffs2_free_node_frag(this);
+}
+
+static void jffs2_fragtree_insert(struct jffs2_node_frag *newfrag, struct jffs2_node_frag *base)
+{
+	struct rb_node *parent = &base->rb;
+	struct rb_node **link = &parent;
+
+	dbg_fragtree2("insert frag (0x%04x-0x%04x)\n", newfrag->ofs, newfrag->ofs + newfrag->size);
+
+	while (*link) {
+		parent = *link;
+		base = rb_entry(parent, struct jffs2_node_frag, rb);
+
+		if (newfrag->ofs > base->ofs)
+			link = &base->rb.rb_right;
+		else if (newfrag->ofs < base->ofs)
+			link = &base->rb.rb_left;
+		else {
+			JFFS2_ERROR("duplicate frag at %08x (%p,%p)\n", newfrag->ofs, newfrag, base);
+			BUG();
+		}
+	}
+
+	rb_link_node(&newfrag->rb, &base->rb, link);
+}
+
+/*
+ * Allocate and initializes a new fragment.
+ */
+static struct jffs2_node_frag * new_fragment(struct jffs2_full_dnode *fn, uint32_t ofs, uint32_t size)
+{
+	struct jffs2_node_frag *newfrag;
+
+	newfrag = jffs2_alloc_node_frag();
+	if (likely(newfrag)) {
+		newfrag->ofs = ofs;
+		newfrag->size = size;
+		newfrag->node = fn;
+	} else {
+		JFFS2_ERROR("cannot allocate a jffs2_node_frag object\n");
+	}
+
+	return newfrag;
+}
+
+/*
+ * Called when there is no overlapping fragment exist. Inserts a hole before the new
+ * fragment and inserts the new fragment to the fragtree.
+ */
+static int no_overlapping_node(struct jffs2_sb_info *c, struct rb_root *root,
+		 	       struct jffs2_node_frag *newfrag,
+			       struct jffs2_node_frag *this, uint32_t lastend)
+{
+	if (lastend < newfrag->node->ofs) {
+		/* put a hole in before the new fragment */
+		struct jffs2_node_frag *holefrag;
+
+		holefrag= new_fragment(NULL, lastend, newfrag->node->ofs - lastend);
+		if (unlikely(!holefrag)) {
+			jffs2_free_node_frag(newfrag);
+			return -ENOMEM;
+		}
+
+		if (this) {
+			/* By definition, the 'this' node has no right-hand child,
+			   because there are no frags with offset greater than it.
+			   So that's where we want to put the hole */
+			dbg_fragtree2("add hole frag %#04x-%#04x on the right of the new frag.\n",
+				holefrag->ofs, holefrag->ofs + holefrag->size);
+			rb_link_node(&holefrag->rb, &this->rb, &this->rb.rb_right);
+		} else {
+			dbg_fragtree2("Add hole frag %#04x-%#04x to the root of the tree.\n",
+				holefrag->ofs, holefrag->ofs + holefrag->size);
+			rb_link_node(&holefrag->rb, NULL, &root->rb_node);
+		}
+		rb_insert_color(&holefrag->rb, root);
+		this = holefrag;
+	}
+
+	if (this) {
+		/* By definition, the 'this' node has no right-hand child,
+		   because there are no frags with offset greater than it.
+		   So that's where we want to put new fragment */
+		dbg_fragtree2("add the new node at the right\n");
+		rb_link_node(&newfrag->rb, &this->rb, &this->rb.rb_right);
+	} else {
+		dbg_fragtree2("insert the new node at the root of the tree\n");
+		rb_link_node(&newfrag->rb, NULL, &root->rb_node);
+	}
+	rb_insert_color(&newfrag->rb, root);
+
+	return 0;
+}
+
+/* Doesn't set inode->i_size */
+static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *root, struct jffs2_node_frag *newfrag)
+{
+	struct jffs2_node_frag *this;
+	uint32_t lastend;
+
+	/* Skip all the nodes which are completed before this one starts */
+	this = jffs2_lookup_node_frag(root, newfrag->node->ofs);
+
+	if (this) {
+		dbg_fragtree2("lookup gave frag 0x%04x-0x%04x; phys 0x%08x (*%p)\n",
+			  this->ofs, this->ofs+this->size, this->node?(ref_offset(this->node->raw)):0xffffffff, this);
+		lastend = this->ofs + this->size;
+	} else {
+		dbg_fragtree2("lookup gave no frag\n");
+		lastend = 0;
+	}
+
+	/* See if we ran off the end of the fragtree */
+	if (lastend <= newfrag->ofs) {
+		/* We did */
+
+		/* Check if 'this' node was on the same page as the new node.
+		   If so, both 'this' and the new node get marked REF_NORMAL so
+		   the GC can take a look.
+		*/
+		if (lastend && (lastend-1) >> PAGE_CACHE_SHIFT == newfrag->ofs >> PAGE_CACHE_SHIFT) {
+			if (this->node)
+				mark_ref_normal(this->node->raw);
+			mark_ref_normal(newfrag->node->raw);
+		}
+
+		return no_overlapping_node(c, root, newfrag, this, lastend);
+	}
+
+	if (this->node)
+		dbg_fragtree2("dealing with frag %u-%u, phys %#08x(%d).\n",
+		this->ofs, this->ofs + this->size,
+		ref_offset(this->node->raw), ref_flags(this->node->raw));
+	else
+		dbg_fragtree2("dealing with hole frag %u-%u.\n",
+		this->ofs, this->ofs + this->size);
+
+	/* OK. 'this' is pointing at the first frag that newfrag->ofs at least partially obsoletes,
+	 * - i.e. newfrag->ofs < this->ofs+this->size && newfrag->ofs >= this->ofs
+	 */
+	if (newfrag->ofs > this->ofs) {
+		/* This node isn't completely obsoleted. The start of it remains valid */
+
+		/* Mark the new node and the partially covered node REF_NORMAL -- let
+		   the GC take a look at them */
+		mark_ref_normal(newfrag->node->raw);
+		if (this->node)
+			mark_ref_normal(this->node->raw);
+
+		if (this->ofs + this->size > newfrag->ofs + newfrag->size) {
+			/* The new node splits 'this' frag into two */
+			struct jffs2_node_frag *newfrag2;
+
+			if (this->node)
+				dbg_fragtree2("split old frag 0x%04x-0x%04x, phys 0x%08x\n",
+					this->ofs, this->ofs+this->size, ref_offset(this->node->raw));
+			else
+				dbg_fragtree2("split old hole frag 0x%04x-0x%04x\n",
+					this->ofs, this->ofs+this->size);
+
+			/* New second frag pointing to this's node */
+			newfrag2 = new_fragment(this->node, newfrag->ofs + newfrag->size,
+						this->ofs + this->size - newfrag->ofs - newfrag->size);
+			if (unlikely(!newfrag2))
+				return -ENOMEM;
+			if (this->node)
+				this->node->frags++;
+
+			/* Adjust size of original 'this' */
+			this->size = newfrag->ofs - this->ofs;
+
+			/* Now, we know there's no node with offset
+			   greater than this->ofs but smaller than
+			   newfrag2->ofs or newfrag->ofs, for obvious
+			   reasons. So we can do a tree insert from
+			   'this' to insert newfrag, and a tree insert
+			   from newfrag to insert newfrag2. */
+			jffs2_fragtree_insert(newfrag, this);
+			rb_insert_color(&newfrag->rb, root);
+
+			jffs2_fragtree_insert(newfrag2, newfrag);
+			rb_insert_color(&newfrag2->rb, root);
+
+			return 0;
+		}
+		/* New node just reduces 'this' frag in size, doesn't split it */
+		this->size = newfrag->ofs - this->ofs;
+
+		/* Again, we know it lives down here in the tree */
+		jffs2_fragtree_insert(newfrag, this);
+		rb_insert_color(&newfrag->rb, root);
+	} else {
+		/* New frag starts at the same point as 'this' used to. Replace
+		   it in the tree without doing a delete and insertion */
+		dbg_fragtree2("inserting newfrag (*%p),%d-%d in before 'this' (*%p),%d-%d\n",
+			  newfrag, newfrag->ofs, newfrag->ofs+newfrag->size, this, this->ofs, this->ofs+this->size);
+
+		rb_replace_node(&this->rb, &newfrag->rb, root);
+
+		if (newfrag->ofs + newfrag->size >= this->ofs+this->size) {
+			dbg_fragtree2("obsoleting node frag %p (%x-%x)\n", this, this->ofs, this->ofs+this->size);
+			jffs2_obsolete_node_frag(c, this);
+		} else {
+			this->ofs += newfrag->size;
+			this->size -= newfrag->size;
+
+			jffs2_fragtree_insert(this, newfrag);
+			rb_insert_color(&this->rb, root);
+			return 0;
+		}
+	}
+	/* OK, now we have newfrag added in the correct place in the tree, but
+	   frag_next(newfrag) may be a fragment which is overlapped by it
+	*/
+	while ((this = frag_next(newfrag)) && newfrag->ofs + newfrag->size >= this->ofs + this->size) {
+		/* 'this' frag is obsoleted completely. */
+		dbg_fragtree2("obsoleting node frag %p (%x-%x) and removing from tree\n",
+			this, this->ofs, this->ofs+this->size);
+		rb_erase(&this->rb, root);
+		jffs2_obsolete_node_frag(c, this);
+	}
+	/* Now we're pointing at the first frag which isn't totally obsoleted by
+	   the new frag */
+
+	if (!this || newfrag->ofs + newfrag->size == this->ofs)
+		return 0;
+
+	/* Still some overlap but we don't need to move it in the tree */
+	this->size = (this->ofs + this->size) - (newfrag->ofs + newfrag->size);
+	this->ofs = newfrag->ofs + newfrag->size;
+
+	/* And mark them REF_NORMAL so the GC takes a look at them */
+	if (this->node)
+		mark_ref_normal(this->node->raw);
+	mark_ref_normal(newfrag->node->raw);
+
+	return 0;
+}
+
+/*
+ * Given an inode, probably with existing tree of fragments, add the new node
+ * to the fragment tree.
+ */
+int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn)
+{
+	int ret;
+	struct jffs2_node_frag *newfrag;
+
+	if (unlikely(!fn->size))
+		return 0;
+
+	newfrag = new_fragment(fn, fn->ofs, fn->size);
+	if (unlikely(!newfrag))
+		return -ENOMEM;
+	newfrag->node->frags = 1;
+
+	dbg_fragtree("adding node %#04x-%#04x @0x%08x on flash, newfrag *%p\n",
+		  fn->ofs, fn->ofs+fn->size, ref_offset(fn->raw), newfrag);
+
+	ret = jffs2_add_frag_to_fragtree(c, &f->fragtree, newfrag);
+	if (unlikely(ret))
+		return ret;
+
+	/* If we now share a page with other nodes, mark either previous
+	   or next node REF_NORMAL, as appropriate.  */
+	if (newfrag->ofs & (PAGE_CACHE_SIZE-1)) {
+		struct jffs2_node_frag *prev = frag_prev(newfrag);
+
+		mark_ref_normal(fn->raw);
+		/* If we don't start at zero there's _always_ a previous */
+		if (prev->node)
+			mark_ref_normal(prev->node->raw);
+	}
+
+	if ((newfrag->ofs+newfrag->size) & (PAGE_CACHE_SIZE-1)) {
+		struct jffs2_node_frag *next = frag_next(newfrag);
+
+		if (next) {
+			mark_ref_normal(fn->raw);
+			if (next->node)
+				mark_ref_normal(next->node->raw);
+		}
+	}
+	jffs2_dbg_fragtree_paranoia_check_nolock(f);
+
+	return 0;
+}
+
+void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state)
+{
+	spin_lock(&c->inocache_lock);
+	ic->state = state;
+	wake_up(&c->inocache_wq);
+	spin_unlock(&c->inocache_lock);
+}
+
+/* During mount, this needs no locking. During normal operation, its
+   callers want to do other stuff while still holding the inocache_lock.
+   Rather than introducing special case get_ino_cache functions or
+   callbacks, we just let the caller do the locking itself. */
+
+struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t ino)
+{
+	struct jffs2_inode_cache *ret;
+
+	ret = c->inocache_list[ino % c->inocache_hashsize];
+	while (ret && ret->ino < ino) {
+		ret = ret->next;
+	}
+
+	if (ret && ret->ino != ino)
+		ret = NULL;
+
+	return ret;
+}
+
+void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new)
+{
+	struct jffs2_inode_cache **prev;
+
+	spin_lock(&c->inocache_lock);
+	if (!new->ino)
+		new->ino = ++c->highest_ino;
+
+	dbg_inocache("add %p (ino #%u)\n", new, new->ino);
+
+	prev = &c->inocache_list[new->ino % c->inocache_hashsize];
+
+	while ((*prev) && (*prev)->ino < new->ino) {
+		prev = &(*prev)->next;
+	}
+	new->next = *prev;
+	*prev = new;
+
+	spin_unlock(&c->inocache_lock);
+}
+
+void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
+{
+	struct jffs2_inode_cache **prev;
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+	BUG_ON(old->xref);
+#endif
+	dbg_inocache("del %p (ino #%u)\n", old, old->ino);
+	spin_lock(&c->inocache_lock);
+
+	prev = &c->inocache_list[old->ino % c->inocache_hashsize];
+
+	while ((*prev) && (*prev)->ino < old->ino) {
+		prev = &(*prev)->next;
+	}
+	if ((*prev) == old) {
+		*prev = old->next;
+	}
+
+	/* Free it now unless it's in READING or CLEARING state, which
+	   are the transitions upon read_inode() and clear_inode(). The
+	   rest of the time we know nobody else is looking at it, and
+	   if it's held by read_inode() or clear_inode() they'll free it
+	   for themselves. */
+	if (old->state != INO_STATE_READING && old->state != INO_STATE_CLEARING)
+		jffs2_free_inode_cache(old);
+
+	spin_unlock(&c->inocache_lock);
+}
+
+void jffs2_free_ino_caches(struct jffs2_sb_info *c)
+{
+	int i;
+	struct jffs2_inode_cache *this, *next;
+
+	for (i=0; i < c->inocache_hashsize; i++) {
+		this = c->inocache_list[i];
+		while (this) {
+			next = this->next;
+			jffs2_xattr_free_inode(c, this);
+			jffs2_free_inode_cache(this);
+			this = next;
+		}
+		c->inocache_list[i] = NULL;
+	}
+}
+
+void jffs2_free_raw_node_refs(struct jffs2_sb_info *c)
+{
+	int i;
+	struct jffs2_raw_node_ref *this, *next;
+
+	for (i=0; i<c->nr_blocks; i++) {
+		this = c->blocks[i].first_node;
+		while (this) {
+			if (this[REFS_PER_BLOCK].flash_offset == REF_LINK_NODE)
+				next = this[REFS_PER_BLOCK].next_in_ino;
+			else
+				next = NULL;
+
+			jffs2_free_refblock(this);
+			this = next;
+		}
+		c->blocks[i].first_node = c->blocks[i].last_node = NULL;
+	}
+}
+
+struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset)
+{
+	/* The common case in lookup is that there will be a node
+	   which precisely matches. So we go looking for that first */
+	struct rb_node *next;
+	struct jffs2_node_frag *prev = NULL;
+	struct jffs2_node_frag *frag = NULL;
+
+	dbg_fragtree2("root %p, offset %d\n", fragtree, offset);
+
+	next = fragtree->rb_node;
+
+	while(next) {
+		frag = rb_entry(next, struct jffs2_node_frag, rb);
+
+		if (frag->ofs + frag->size <= offset) {
+			/* Remember the closest smaller match on the way down */
+			if (!prev || frag->ofs > prev->ofs)
+				prev = frag;
+			next = frag->rb.rb_right;
+		} else if (frag->ofs > offset) {
+			next = frag->rb.rb_left;
+		} else {
+			return frag;
+		}
+	}
+
+	/* Exact match not found. Go back up looking at each parent,
+	   and return the closest smaller one */
+
+	if (prev)
+		dbg_fragtree2("no match. Returning frag %#04x-%#04x, closest previous\n",
+			  prev->ofs, prev->ofs+prev->size);
+	else
+		dbg_fragtree2("returning NULL, empty fragtree\n");
+
+	return prev;
+}
+
+/* Pass 'c' argument to indicate that nodes should be marked obsolete as
+   they're killed. */
+void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
+{
+	struct jffs2_node_frag *frag, *next;
+
+	dbg_fragtree("killing\n");
+	rbtree_postorder_for_each_entry_safe(frag, next, root, rb) {
+		if (frag->node && !(--frag->node->frags)) {
+			/* Not a hole, and it's the final remaining frag
+			   of this node. Free the node */
+			if (c)
+				jffs2_mark_node_obsolete(c, frag->node->raw);
+
+			jffs2_free_full_dnode(frag->node);
+		}
+
+		jffs2_free_node_frag(frag);
+		cond_resched();
+	}
+}
+
+struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
+					       struct jffs2_eraseblock *jeb,
+					       uint32_t ofs, uint32_t len,
+					       struct jffs2_inode_cache *ic)
+{
+	struct jffs2_raw_node_ref *ref;
+
+	BUG_ON(!jeb->allocated_refs);
+	jeb->allocated_refs--;
+
+	ref = jeb->last_node;
+
+	dbg_noderef("Last node at %p is (%08x,%p)\n", ref, ref->flash_offset,
+		    ref->next_in_ino);
+
+	while (ref->flash_offset != REF_EMPTY_NODE) {
+		if (ref->flash_offset == REF_LINK_NODE)
+			ref = ref->next_in_ino;
+		else
+			ref++;
+	}
+
+	dbg_noderef("New ref is %p (%08x becomes %08x,%p) len 0x%x\n", ref, 
+		    ref->flash_offset, ofs, ref->next_in_ino, len);
+
+	ref->flash_offset = ofs;
+
+	if (!jeb->first_node) {
+		jeb->first_node = ref;
+		BUG_ON(ref_offset(ref) != jeb->offset);
+	} else if (unlikely(ref_offset(ref) != jeb->offset + c->sector_size - jeb->free_size)) {
+		uint32_t last_len = ref_totlen(c, jeb, jeb->last_node);
+
+		JFFS2_ERROR("Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n",
+			    ref, ref_offset(ref), ref_offset(ref)+len,
+			    ref_offset(jeb->last_node), 
+			    ref_offset(jeb->last_node)+last_len);
+		BUG();
+	}
+	jeb->last_node = ref;
+
+	if (ic) {
+		ref->next_in_ino = ic->nodes;
+		ic->nodes = ref;
+	} else {
+		ref->next_in_ino = NULL;
+	}
+
+	switch(ref_flags(ref)) {
+	case REF_UNCHECKED:
+		c->unchecked_size += len;
+		jeb->unchecked_size += len;
+		break;
+
+	case REF_NORMAL:
+	case REF_PRISTINE:
+		c->used_size += len;
+		jeb->used_size += len;
+		break;
+
+	case REF_OBSOLETE:
+		c->dirty_size += len;
+		jeb->dirty_size += len;
+		break;
+	}
+	c->free_size -= len;
+	jeb->free_size -= len;
+
+#ifdef TEST_TOTLEN
+	/* Set (and test) __totlen field... for now */
+	ref->__totlen = len;
+	ref_totlen(c, jeb, ref);
+#endif
+	return ref;
+}
+
+/* No locking, no reservation of 'ref'. Do not use on a live file system */
+int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			   uint32_t size)
+{
+	if (!size)
+		return 0;
+	if (unlikely(size > jeb->free_size)) {
+		pr_crit("Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n",
+			size, jeb->free_size, jeb->wasted_size);
+		BUG();
+	}
+	/* REF_EMPTY_NODE is !obsolete, so that works OK */
+	if (jeb->last_node && ref_obsolete(jeb->last_node)) {
+#ifdef TEST_TOTLEN
+		jeb->last_node->__totlen += size;
+#endif
+		c->dirty_size += size;
+		c->free_size -= size;
+		jeb->dirty_size += size;
+		jeb->free_size -= size;
+	} else {
+		uint32_t ofs = jeb->offset + c->sector_size - jeb->free_size;
+		ofs |= REF_OBSOLETE;
+
+		jffs2_link_node_ref(c, jeb, ofs, size, NULL);
+	}
+
+	return 0;
+}
+
+/* Calculate totlen from surrounding nodes or eraseblock */
+static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
+				    struct jffs2_eraseblock *jeb,
+				    struct jffs2_raw_node_ref *ref)
+{
+	uint32_t ref_end;
+	struct jffs2_raw_node_ref *next_ref = ref_next(ref);
+
+	if (next_ref)
+		ref_end = ref_offset(next_ref);
+	else {
+		if (!jeb)
+			jeb = &c->blocks[ref->flash_offset / c->sector_size];
+
+		/* Last node in block. Use free_space */
+		if (unlikely(ref != jeb->last_node)) {
+			pr_crit("ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n",
+				ref, ref_offset(ref), jeb->last_node,
+				jeb->last_node ?
+				ref_offset(jeb->last_node) : 0);
+			BUG();
+		}
+		ref_end = jeb->offset + c->sector_size - jeb->free_size;
+	}
+	return ref_end - ref_offset(ref);
+}
+
+uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			    struct jffs2_raw_node_ref *ref)
+{
+	uint32_t ret;
+
+	ret = __ref_totlen(c, jeb, ref);
+
+#ifdef TEST_TOTLEN
+	if (unlikely(ret != ref->__totlen)) {
+		if (!jeb)
+			jeb = &c->blocks[ref->flash_offset / c->sector_size];
+
+		pr_crit("Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
+			ref, ref_offset(ref), ref_offset(ref) + ref->__totlen,
+			ret, ref->__totlen);
+		if (ref_next(ref)) {
+			pr_crit("next %p (0x%08x-0x%08x)\n",
+				ref_next(ref), ref_offset(ref_next(ref)),
+				ref_offset(ref_next(ref)) + ref->__totlen);
+		} else 
+			pr_crit("No next ref. jeb->last_node is %p\n",
+				jeb->last_node);
+
+		pr_crit("jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n",
+			jeb->wasted_size, jeb->dirty_size, jeb->used_size,
+			jeb->free_size);
+
+#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS)
+		__jffs2_dbg_dump_node_refs_nolock(c, jeb);
+#endif
+
+		WARN_ON(1);
+
+		ret = ref->__totlen;
+	}
+#endif /* TEST_TOTLEN */
+	return ret;
+}
diff --git a/kernel/fs/jffs2/nodelist.h b/kernel/fs/jffs2/nodelist.h
new file mode 100644
index 000000000..fa35ff79a
--- /dev/null
+++ b/kernel/fs/jffs2/nodelist.h
@@ -0,0 +1,480 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#ifndef __JFFS2_NODELIST_H__
+#define __JFFS2_NODELIST_H__
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/jffs2.h>
+#include "jffs2_fs_sb.h"
+#include "jffs2_fs_i.h"
+#include "xattr.h"
+#include "acl.h"
+#include "summary.h"
+
+#ifdef __ECOS
+#include "os-ecos.h"
+#else
+#include "os-linux.h"
+#endif
+
+#define JFFS2_NATIVE_ENDIAN
+
+/* Note we handle mode bits conversion from JFFS2 (i.e. Linux) to/from
+   whatever OS we're actually running on here too. */
+
+#if defined(JFFS2_NATIVE_ENDIAN)
+#define cpu_to_je16(x) ((jint16_t){x})
+#define cpu_to_je32(x) ((jint32_t){x})
+#define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)})
+
+#define constant_cpu_to_je16(x) ((jint16_t){x})
+#define constant_cpu_to_je32(x) ((jint32_t){x})
+
+#define je16_to_cpu(x) ((x).v16)
+#define je32_to_cpu(x) ((x).v32)
+#define jemode_to_cpu(x) (jffs2_to_os_mode((x).m))
+#elif defined(JFFS2_BIG_ENDIAN)
+#define cpu_to_je16(x) ((jint16_t){cpu_to_be16(x)})
+#define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)})
+#define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))})
+
+#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_be16(x)})
+#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_be32(x)})
+
+#define je16_to_cpu(x) (be16_to_cpu(x.v16))
+#define je32_to_cpu(x) (be32_to_cpu(x.v32))
+#define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m)))
+#elif defined(JFFS2_LITTLE_ENDIAN)
+#define cpu_to_je16(x) ((jint16_t){cpu_to_le16(x)})
+#define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)})
+#define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))})
+
+#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_le16(x)})
+#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_le32(x)})
+
+#define je16_to_cpu(x) (le16_to_cpu(x.v16))
+#define je32_to_cpu(x) (le32_to_cpu(x.v32))
+#define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m)))
+#else
+#error wibble
+#endif
+
+/* The minimal node header size */
+#define JFFS2_MIN_NODE_HEADER sizeof(struct jffs2_raw_dirent)
+
+/*
+  This is all we need to keep in-core for each raw node during normal
+  operation. As and when we do read_inode on a particular inode, we can
+  scan the nodes which are listed for it and build up a proper map of
+  which nodes are currently valid. JFFSv1 always used to keep that whole
+  map in core for each inode.
+*/
+struct jffs2_raw_node_ref
+{
+	struct jffs2_raw_node_ref *next_in_ino; /* Points to the next raw_node_ref
+		for this object. If this _is_ the last, it points to the inode_cache,
+		xattr_ref or xattr_datum instead. The common part of those structures
+		has NULL in the first word. See jffs2_raw_ref_to_ic() below */
+	uint32_t flash_offset;
+#undef TEST_TOTLEN
+#ifdef TEST_TOTLEN
+	uint32_t __totlen; /* This may die; use ref_totlen(c, jeb, ) below */
+#endif
+};
+
+#define REF_LINK_NODE ((int32_t)-1)
+#define REF_EMPTY_NODE ((int32_t)-2)
+
+/* Use blocks of about 256 bytes */
+#define REFS_PER_BLOCK ((255/sizeof(struct jffs2_raw_node_ref))-1)
+
+static inline struct jffs2_raw_node_ref *ref_next(struct jffs2_raw_node_ref *ref)
+{
+	ref++;
+
+	/* Link to another block of refs */
+	if (ref->flash_offset == REF_LINK_NODE) {
+		ref = ref->next_in_ino;
+		if (!ref)
+			return ref;
+	}
+
+	/* End of chain */
+	if (ref->flash_offset == REF_EMPTY_NODE)
+		return NULL;
+
+	return ref;
+}
+
+static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
+{
+	while(raw->next_in_ino)
+		raw = raw->next_in_ino;
+
+	/* NB. This can be a jffs2_xattr_datum or jffs2_xattr_ref and
+	   not actually a jffs2_inode_cache. Check ->class */
+	return ((struct jffs2_inode_cache *)raw);
+}
+
+	/* flash_offset & 3 always has to be zero, because nodes are
+	   always aligned at 4 bytes. So we have a couple of extra bits
+	   to play with, which indicate the node's status; see below: */
+#define REF_UNCHECKED	0	/* We haven't yet checked the CRC or built its inode */
+#define REF_OBSOLETE	1	/* Obsolete, can be completely ignored */
+#define REF_PRISTINE	2	/* Completely clean. GC without looking */
+#define REF_NORMAL	3	/* Possibly overlapped. Read the page and write again on GC */
+#define ref_flags(ref)		((ref)->flash_offset & 3)
+#define ref_offset(ref)		((ref)->flash_offset & ~3)
+#define ref_obsolete(ref)	(((ref)->flash_offset & 3) == REF_OBSOLETE)
+#define mark_ref_normal(ref)    do { (ref)->flash_offset = ref_offset(ref) | REF_NORMAL; } while(0)
+
+/* Dirent nodes should be REF_PRISTINE only if they are not a deletion
+   dirent. Deletion dirents should be REF_NORMAL so that GC gets to
+   throw them away when appropriate */
+#define dirent_node_state(rd)	( (je32_to_cpu((rd)->ino)?REF_PRISTINE:REF_NORMAL) )
+
+/* NB: REF_PRISTINE for an inode-less node (ref->next_in_ino == NULL) indicates
+   it is an unknown node of type JFFS2_NODETYPE_RWCOMPAT_COPY, so it'll get
+   copied. If you need to do anything different to GC inode-less nodes, then
+   you need to modify gc.c accordingly. */
+
+/* For each inode in the filesystem, we need to keep a record of
+   nlink, because it would be a PITA to scan the whole directory tree
+   at read_inode() time to calculate it, and to keep sufficient information
+   in the raw_node_ref (basically both parent and child inode number for
+   dirent nodes) would take more space than this does. We also keep
+   a pointer to the first physical node which is part of this inode, too.
+*/
+struct jffs2_inode_cache {
+	/* First part of structure is shared with other objects which
+	   can terminate the raw node refs' next_in_ino list -- which
+	   currently struct jffs2_xattr_datum and struct jffs2_xattr_ref. */
+
+	struct jffs2_full_dirent *scan_dents; /* Used during scan to hold
+		temporary lists of dirents, and later must be set to
+		NULL to mark the end of the raw_node_ref->next_in_ino
+		chain. */
+	struct jffs2_raw_node_ref *nodes;
+	uint8_t class;	/* It's used for identification */
+
+	/* end of shared structure */
+
+	uint8_t flags;
+	uint16_t state;
+	uint32_t ino;
+	struct jffs2_inode_cache *next;
+#ifdef CONFIG_JFFS2_FS_XATTR
+	struct jffs2_xattr_ref *xref;
+#endif
+	uint32_t pino_nlink;	/* Directories store parent inode
+				   here; other inodes store nlink.
+				   Zero always means that it's
+				   completely unlinked. */
+};
+
+/* Inode states for 'state' above. We need the 'GC' state to prevent
+   someone from doing a read_inode() while we're moving a 'REF_PRISTINE'
+   node without going through all the iget() nonsense */
+#define INO_STATE_UNCHECKED	0	/* CRC checks not yet done */
+#define INO_STATE_CHECKING	1	/* CRC checks in progress */
+#define INO_STATE_PRESENT	2	/* In core */
+#define INO_STATE_CHECKEDABSENT	3	/* Checked, cleared again */
+#define INO_STATE_GC		4	/* GCing a 'pristine' node */
+#define INO_STATE_READING	5	/* In read_inode() */
+#define INO_STATE_CLEARING	6	/* In clear_inode() */
+
+#define INO_FLAGS_XATTR_CHECKED	0x01	/* has no duplicate xattr_ref */
+
+#define RAWNODE_CLASS_INODE_CACHE	0
+#define RAWNODE_CLASS_XATTR_DATUM	1
+#define RAWNODE_CLASS_XATTR_REF		2
+
+#define INOCACHE_HASHSIZE_MIN 128
+#define INOCACHE_HASHSIZE_MAX 1024
+
+#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
+
+/*
+  Larger representation of a raw node, kept in-core only when the
+  struct inode for this particular ino is instantiated.
+*/
+
+struct jffs2_full_dnode
+{
+	struct jffs2_raw_node_ref *raw;
+	uint32_t ofs; /* The offset to which the data of this node belongs */
+	uint32_t size;
+	uint32_t frags; /* Number of fragments which currently refer
+			to this node. When this reaches zero,
+			the node is obsolete.  */
+};
+
+/*
+   Even larger representation of a raw node, kept in-core only while
+   we're actually building up the original map of which nodes go where,
+   in read_inode()
+*/
+struct jffs2_tmp_dnode_info
+{
+	struct rb_node rb;
+	struct jffs2_full_dnode *fn;
+	uint32_t version;
+	uint32_t data_crc;
+	uint32_t partial_crc;
+	uint32_t csize;
+	uint16_t overlapped;
+};
+
+/* Temporary data structure used during readinode. */
+struct jffs2_readinode_info
+{
+	struct rb_root tn_root;
+	struct jffs2_tmp_dnode_info *mdata_tn;
+	uint32_t highest_version;
+	uint32_t latest_mctime;
+	uint32_t mctime_ver;
+	struct jffs2_full_dirent *fds;
+	struct jffs2_raw_node_ref *latest_ref;
+};
+
+struct jffs2_full_dirent
+{
+	struct jffs2_raw_node_ref *raw;
+	struct jffs2_full_dirent *next;
+	uint32_t version;
+	uint32_t ino; /* == zero for unlink */
+	unsigned int nhash;
+	unsigned char type;
+	unsigned char name[0];
+};
+
+/*
+  Fragments - used to build a map of which raw node to obtain
+  data from for each part of the ino
+*/
+struct jffs2_node_frag
+{
+	struct rb_node rb;
+	struct jffs2_full_dnode *node; /* NULL for holes */
+	uint32_t size;
+	uint32_t ofs; /* The offset to which this fragment belongs */
+};
+
+struct jffs2_eraseblock
+{
+	struct list_head list;
+	int bad_count;
+	uint32_t offset;		/* of this block in the MTD */
+
+	uint32_t unchecked_size;
+	uint32_t used_size;
+	uint32_t dirty_size;
+	uint32_t wasted_size;
+	uint32_t free_size;	/* Note that sector_size - free_size
+				   is the address of the first free space */
+	uint32_t allocated_refs;
+	struct jffs2_raw_node_ref *first_node;
+	struct jffs2_raw_node_ref *last_node;
+
+	struct jffs2_raw_node_ref *gc_node;	/* Next node to be garbage collected */
+};
+
+static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
+{
+	return ((c->flash_size / c->sector_size) * sizeof (struct jffs2_eraseblock)) > (128 * 1024);
+}
+
+#define ref_totlen(a, b, c) __jffs2_ref_totlen((a), (b), (c))
+
+#define ALLOC_NORMAL	0	/* Normal allocation */
+#define ALLOC_DELETION	1	/* Deletion node. Best to allow it */
+#define ALLOC_GC	2	/* Space requested for GC. Give it or die */
+#define ALLOC_NORETRY	3	/* For jffs2_write_dnode: On failure, return -EAGAIN instead of retrying */
+
+/* How much dirty space before it goes on the very_dirty_list */
+#define VERYDIRTY(c, size) ((size) >= ((c)->sector_size / 2))
+
+/* check if dirty space is more than 255 Byte */
+#define ISDIRTY(size) ((size) >  sizeof (struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN)
+
+#define PAD(x) (((x)+3)&~3)
+
+static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
+{
+	if (old_valid_dev(rdev)) {
+		jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
+		return sizeof(jdev->old_id);
+	} else {
+		jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
+		return sizeof(jdev->new_id);
+	}
+}
+
+static inline struct jffs2_node_frag *frag_first(struct rb_root *root)
+{
+	struct rb_node *node = rb_first(root);
+
+	if (!node)
+		return NULL;
+
+	return rb_entry(node, struct jffs2_node_frag, rb);
+}
+
+static inline struct jffs2_node_frag *frag_last(struct rb_root *root)
+{
+	struct rb_node *node = rb_last(root);
+
+	if (!node)
+		return NULL;
+
+	return rb_entry(node, struct jffs2_node_frag, rb);
+}
+
+#define frag_next(frag) rb_entry(rb_next(&(frag)->rb), struct jffs2_node_frag, rb)
+#define frag_prev(frag) rb_entry(rb_prev(&(frag)->rb), struct jffs2_node_frag, rb)
+#define frag_parent(frag) rb_entry(rb_parent(&(frag)->rb), struct jffs2_node_frag, rb)
+#define frag_left(frag) rb_entry((frag)->rb.rb_left, struct jffs2_node_frag, rb)
+#define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb)
+#define frag_erase(frag, list) rb_erase(&frag->rb, list);
+
+#define tn_next(tn) rb_entry(rb_next(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
+#define tn_prev(tn) rb_entry(rb_prev(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
+#define tn_parent(tn) rb_entry(rb_parent(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
+#define tn_left(tn) rb_entry((tn)->rb.rb_left, struct jffs2_tmp_dnode_info, rb)
+#define tn_right(tn) rb_entry((tn)->rb.rb_right, struct jffs2_tmp_dnode_info, rb)
+#define tn_erase(tn, list) rb_erase(&tn->rb, list);
+#define tn_last(list) rb_entry(rb_last(list), struct jffs2_tmp_dnode_info, rb)
+#define tn_first(list) rb_entry(rb_first(list), struct jffs2_tmp_dnode_info, rb)
+
+/* nodelist.c */
+void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list);
+void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state);
+struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t ino);
+void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new);
+void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old);
+void jffs2_free_ino_caches(struct jffs2_sb_info *c);
+void jffs2_free_raw_node_refs(struct jffs2_sb_info *c);
+struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset);
+void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
+int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
+uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
+struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
+					       struct jffs2_eraseblock *jeb,
+					       uint32_t ofs, uint32_t len,
+					       struct jffs2_inode_cache *ic);
+extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c,
+				   struct jffs2_eraseblock *jeb,
+				   struct jffs2_raw_node_ref *ref);
+
+/* nodemgmt.c */
+int jffs2_thread_should_wake(struct jffs2_sb_info *c);
+int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
+			uint32_t *len, int prio, uint32_t sumsize);
+int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
+			uint32_t *len, uint32_t sumsize);
+struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, 
+						       uint32_t ofs, uint32_t len,
+						       struct jffs2_inode_cache *ic);
+void jffs2_complete_reservation(struct jffs2_sb_info *c);
+void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw);
+
+/* write.c */
+int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t mode, struct jffs2_raw_inode *ri);
+
+struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					   struct jffs2_raw_inode *ri, const unsigned char *data,
+					   uint32_t datalen, int alloc_mode);
+struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					     struct jffs2_raw_dirent *rd, const unsigned char *name,
+					     uint32_t namelen, int alloc_mode);
+int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+			    struct jffs2_raw_inode *ri, unsigned char *buf,
+			    uint32_t offset, uint32_t writelen, uint32_t *retlen);
+int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
+		    struct jffs2_raw_inode *ri, const struct qstr *qstr);
+int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
+		    int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
+int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
+		   uint8_t type, const char *name, int namelen, uint32_t time);
+
+
+/* readinode.c */
+int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+			uint32_t ino, struct jffs2_raw_inode *latest_node);
+int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f);
+
+/* malloc.c */
+int jffs2_create_slab_caches(void);
+void jffs2_destroy_slab_caches(void);
+
+struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize);
+void jffs2_free_full_dirent(struct jffs2_full_dirent *);
+struct jffs2_full_dnode *jffs2_alloc_full_dnode(void);
+void jffs2_free_full_dnode(struct jffs2_full_dnode *);
+struct jffs2_raw_dirent *jffs2_alloc_raw_dirent(void);
+void jffs2_free_raw_dirent(struct jffs2_raw_dirent *);
+struct jffs2_raw_inode *jffs2_alloc_raw_inode(void);
+void jffs2_free_raw_inode(struct jffs2_raw_inode *);
+struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void);
+void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *);
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb, int nr);
+void jffs2_free_refblock(struct jffs2_raw_node_ref *);
+struct jffs2_node_frag *jffs2_alloc_node_frag(void);
+void jffs2_free_node_frag(struct jffs2_node_frag *);
+struct jffs2_inode_cache *jffs2_alloc_inode_cache(void);
+void jffs2_free_inode_cache(struct jffs2_inode_cache *);
+#ifdef CONFIG_JFFS2_FS_XATTR
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void);
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *);
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void);
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *);
+#endif
+
+/* gc.c */
+int jffs2_garbage_collect_pass(struct jffs2_sb_info *c);
+
+/* read.c */
+int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+		     struct jffs2_full_dnode *fd, unsigned char *buf,
+		     int ofs, int len);
+int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+			   unsigned char *buf, uint32_t offset, uint32_t len);
+char *jffs2_getlink(struct jffs2_sb_info *c, struct jffs2_inode_info *f);
+
+/* scan.c */
+int jffs2_scan_medium(struct jffs2_sb_info *c);
+void jffs2_rotate_lists(struct jffs2_sb_info *c);
+struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino);
+int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size);
+
+/* build.c */
+int jffs2_do_mount_fs(struct jffs2_sb_info *c);
+
+/* erase.c */
+int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
+void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+/* wbuf.c */
+int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino);
+int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c);
+int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+#endif
+
+#include "debug.h"
+
+#endif /* __JFFS2_NODELIST_H__ */
diff --git a/kernel/fs/jffs2/nodemgmt.c b/kernel/fs/jffs2/nodemgmt.c
new file mode 100644
index 000000000..b6bd4affd
--- /dev/null
+++ b/kernel/fs/jffs2/nodemgmt.c
@@ -0,0 +1,883 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/mtd/mtd.h>
+#include <linux/compiler.h>
+#include <linux/sched.h> /* For cond_resched() */
+#include "nodelist.h"
+#include "debug.h"
+
+/*
+ * Check whether the user is allowed to write.
+ */
+static int jffs2_rp_can_write(struct jffs2_sb_info *c)
+{
+	uint32_t avail;
+	struct jffs2_mount_opts *opts = &c->mount_opts;
+
+	avail = c->dirty_size + c->free_size + c->unchecked_size +
+		c->erasing_size - c->resv_blocks_write * c->sector_size
+		- c->nospc_dirty_size;
+
+	if (avail < 2 * opts->rp_size)
+		jffs2_dbg(1, "rpsize %u, dirty_size %u, free_size %u, "
+			  "erasing_size %u, unchecked_size %u, "
+			  "nr_erasing_blocks %u, avail %u, resrv %u\n",
+			  opts->rp_size, c->dirty_size, c->free_size,
+			  c->erasing_size, c->unchecked_size,
+			  c->nr_erasing_blocks, avail, c->nospc_dirty_size);
+
+	if (avail > opts->rp_size)
+		return 1;
+
+	/* Always allow root */
+	if (capable(CAP_SYS_RESOURCE))
+		return 1;
+
+	jffs2_dbg(1, "forbid writing\n");
+	return 0;
+}
+
+/**
+ *	jffs2_reserve_space - request physical space to write nodes to flash
+ *	@c: superblock info
+ *	@minsize: Minimum acceptable size of allocation
+ *	@len: Returned value of allocation length
+ *	@prio: Allocation type - ALLOC_{NORMAL,DELETION}
+ *
+ *	Requests a block of physical space on the flash. Returns zero for success
+ *	and puts 'len' into the appropriate place, or returns -ENOSPC or other 
+ *	error if appropriate. Doesn't return len since that's 
+ *
+ *	If it returns zero, jffs2_reserve_space() also downs the per-filesystem
+ *	allocation semaphore, to prevent more than one allocation from being
+ *	active at any time. The semaphore is later released by jffs2_commit_allocation()
+ *
+ *	jffs2_reserve_space() may trigger garbage collection in order to make room
+ *	for the requested allocation.
+ */
+
+static int jffs2_do_reserve_space(struct jffs2_sb_info *c,  uint32_t minsize,
+				  uint32_t *len, uint32_t sumsize);
+
+int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
+			uint32_t *len, int prio, uint32_t sumsize)
+{
+	int ret = -EAGAIN;
+	int blocksneeded = c->resv_blocks_write;
+	/* align it */
+	minsize = PAD(minsize);
+
+	jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
+	mutex_lock(&c->alloc_sem);
+
+	jffs2_dbg(1, "%s(): alloc sem got\n", __func__);
+
+	spin_lock(&c->erase_completion_lock);
+
+	/*
+	 * Check if the free space is greater then size of the reserved pool.
+	 * If not, only allow root to proceed with writing.
+	 */
+	if (prio != ALLOC_DELETION && !jffs2_rp_can_write(c)) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	/* this needs a little more thought (true <tglx> :)) */
+	while(ret == -EAGAIN) {
+		while(c->nr_free_blocks + c->nr_erasing_blocks < blocksneeded) {
+			uint32_t dirty, avail;
+
+			/* calculate real dirty size
+			 * dirty_size contains blocks on erase_pending_list
+			 * those blocks are counted in c->nr_erasing_blocks.
+			 * If one block is actually erased, it is not longer counted as dirty_space
+			 * but it is counted in c->nr_erasing_blocks, so we add it and subtract it
+			 * with c->nr_erasing_blocks * c->sector_size again.
+			 * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks
+			 * This helps us to force gc and pick eventually a clean block to spread the load.
+			 * We add unchecked_size here, as we hopefully will find some space to use.
+			 * This will affect the sum only once, as gc first finishes checking
+			 * of nodes.
+			 */
+			dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size + c->unchecked_size;
+			if (dirty < c->nospc_dirty_size) {
+				if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) {
+					jffs2_dbg(1, "%s(): Low on dirty space to GC, but it's a deletion. Allowing...\n",
+						  __func__);
+					break;
+				}
+				jffs2_dbg(1, "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n",
+					  dirty, c->unchecked_size,
+					  c->sector_size);
+
+				spin_unlock(&c->erase_completion_lock);
+				mutex_unlock(&c->alloc_sem);
+				return -ENOSPC;
+			}
+
+			/* Calc possibly available space. Possibly available means that we
+			 * don't know, if unchecked size contains obsoleted nodes, which could give us some
+			 * more usable space. This will affect the sum only once, as gc first finishes checking
+			 * of nodes.
+			 + Return -ENOSPC, if the maximum possibly available space is less or equal than
+			 * blocksneeded * sector_size.
+			 * This blocks endless gc looping on a filesystem, which is nearly full, even if
+			 * the check above passes.
+			 */
+			avail = c->free_size + c->dirty_size + c->erasing_size + c->unchecked_size;
+			if ( (avail / c->sector_size) <= blocksneeded) {
+				if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) {
+					jffs2_dbg(1, "%s(): Low on possibly available space, but it's a deletion. Allowing...\n",
+						  __func__);
+					break;
+				}
+
+				jffs2_dbg(1, "max. available size 0x%08x  < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n",
+					  avail, blocksneeded * c->sector_size);
+				spin_unlock(&c->erase_completion_lock);
+				mutex_unlock(&c->alloc_sem);
+				return -ENOSPC;
+			}
+
+			mutex_unlock(&c->alloc_sem);
+
+			jffs2_dbg(1, "Triggering GC pass. nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n",
+				  c->nr_free_blocks, c->nr_erasing_blocks,
+				  c->free_size, c->dirty_size, c->wasted_size,
+				  c->used_size, c->erasing_size, c->bad_size,
+				  c->free_size + c->dirty_size +
+				  c->wasted_size + c->used_size +
+				  c->erasing_size + c->bad_size,
+				  c->flash_size);
+			spin_unlock(&c->erase_completion_lock);
+
+			ret = jffs2_garbage_collect_pass(c);
+
+			if (ret == -EAGAIN) {
+				spin_lock(&c->erase_completion_lock);
+				if (c->nr_erasing_blocks &&
+				    list_empty(&c->erase_pending_list) &&
+				    list_empty(&c->erase_complete_list)) {
+					DECLARE_WAITQUEUE(wait, current);
+					set_current_state(TASK_UNINTERRUPTIBLE);
+					add_wait_queue(&c->erase_wait, &wait);
+					jffs2_dbg(1, "%s waiting for erase to complete\n",
+						  __func__);
+					spin_unlock(&c->erase_completion_lock);
+
+					schedule();
+					remove_wait_queue(&c->erase_wait, &wait);
+				} else
+					spin_unlock(&c->erase_completion_lock);
+			} else if (ret)
+				return ret;
+
+			cond_resched();
+
+			if (signal_pending(current))
+				return -EINTR;
+
+			mutex_lock(&c->alloc_sem);
+			spin_lock(&c->erase_completion_lock);
+		}
+
+		ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
+		if (ret) {
+			jffs2_dbg(1, "%s(): ret is %d\n", __func__, ret);
+		}
+	}
+
+out:
+	spin_unlock(&c->erase_completion_lock);
+	if (!ret)
+		ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+	if (ret)
+		mutex_unlock(&c->alloc_sem);
+	return ret;
+}
+
+int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
+			   uint32_t *len, uint32_t sumsize)
+{
+	int ret;
+	minsize = PAD(minsize);
+
+	jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
+
+	while (true) {
+		spin_lock(&c->erase_completion_lock);
+		ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
+		if (ret) {
+			jffs2_dbg(1, "%s(): looping, ret is %d\n",
+				  __func__, ret);
+		}
+		spin_unlock(&c->erase_completion_lock);
+
+		if (ret == -EAGAIN)
+			cond_resched();
+		else
+			break;
+	}
+	if (!ret)
+		ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+
+	return ret;
+}
+
+
+/* Classify nextblock (clean, dirty of verydirty) and force to select an other one */
+
+static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+
+	if (c->nextblock == NULL) {
+		jffs2_dbg(1, "%s(): Erase block at 0x%08x has already been placed in a list\n",
+			  __func__, jeb->offset);
+		return;
+	}
+	/* Check, if we have a dirty block now, or if it was dirty already */
+	if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) {
+		c->dirty_size += jeb->wasted_size;
+		c->wasted_size -= jeb->wasted_size;
+		jeb->dirty_size += jeb->wasted_size;
+		jeb->wasted_size = 0;
+		if (VERYDIRTY(c, jeb->dirty_size)) {
+			jffs2_dbg(1, "Adding full erase block at 0x%08x to very_dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
+				  jeb->offset, jeb->free_size, jeb->dirty_size,
+				  jeb->used_size);
+			list_add_tail(&jeb->list, &c->very_dirty_list);
+		} else {
+			jffs2_dbg(1, "Adding full erase block at 0x%08x to dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
+				  jeb->offset, jeb->free_size, jeb->dirty_size,
+				  jeb->used_size);
+			list_add_tail(&jeb->list, &c->dirty_list);
+		}
+	} else {
+		jffs2_dbg(1, "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
+			  jeb->offset, jeb->free_size, jeb->dirty_size,
+			  jeb->used_size);
+		list_add_tail(&jeb->list, &c->clean_list);
+	}
+	c->nextblock = NULL;
+
+}
+
+/* Select a new jeb for nextblock */
+
+static int jffs2_find_nextblock(struct jffs2_sb_info *c)
+{
+	struct list_head *next;
+
+	/* Take the next block off the 'free' list */
+
+	if (list_empty(&c->free_list)) {
+
+		if (!c->nr_erasing_blocks &&
+			!list_empty(&c->erasable_list)) {
+			struct jffs2_eraseblock *ejeb;
+
+			ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
+			list_move_tail(&ejeb->list, &c->erase_pending_list);
+			c->nr_erasing_blocks++;
+			jffs2_garbage_collect_trigger(c);
+			jffs2_dbg(1, "%s(): Triggering erase of erasable block at 0x%08x\n",
+				  __func__, ejeb->offset);
+		}
+
+		if (!c->nr_erasing_blocks &&
+			!list_empty(&c->erasable_pending_wbuf_list)) {
+			jffs2_dbg(1, "%s(): Flushing write buffer\n",
+				  __func__);
+			/* c->nextblock is NULL, no update to c->nextblock allowed */
+			spin_unlock(&c->erase_completion_lock);
+			jffs2_flush_wbuf_pad(c);
+			spin_lock(&c->erase_completion_lock);
+			/* Have another go. It'll be on the erasable_list now */
+			return -EAGAIN;
+		}
+
+		if (!c->nr_erasing_blocks) {
+			/* Ouch. We're in GC, or we wouldn't have got here.
+			   And there's no space left. At all. */
+			pr_crit("Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n",
+				c->nr_erasing_blocks, c->nr_free_blocks,
+				list_empty(&c->erasable_list) ? "yes" : "no",
+				list_empty(&c->erasing_list) ? "yes" : "no",
+				list_empty(&c->erase_pending_list) ? "yes" : "no");
+			return -ENOSPC;
+		}
+
+		spin_unlock(&c->erase_completion_lock);
+		/* Don't wait for it; just erase one right now */
+		jffs2_erase_pending_blocks(c, 1);
+		spin_lock(&c->erase_completion_lock);
+
+		/* An erase may have failed, decreasing the
+		   amount of free space available. So we must
+		   restart from the beginning */
+		return -EAGAIN;
+	}
+
+	next = c->free_list.next;
+	list_del(next);
+	c->nextblock = list_entry(next, struct jffs2_eraseblock, list);
+	c->nr_free_blocks--;
+
+	jffs2_sum_reset_collected(c->summary); /* reset collected summary */
+
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	/* adjust write buffer offset, else we get a non contiguous write bug */
+	if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
+		c->wbuf_ofs = 0xffffffff;
+#endif
+
+	jffs2_dbg(1, "%s(): new nextblock = 0x%08x\n",
+		  __func__, c->nextblock->offset);
+
+	return 0;
+}
+
+/* Called with alloc sem _and_ erase_completion_lock */
+static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
+				  uint32_t *len, uint32_t sumsize)
+{
+	struct jffs2_eraseblock *jeb = c->nextblock;
+	uint32_t reserved_size;				/* for summary information at the end of the jeb */
+	int ret;
+
+ restart:
+	reserved_size = 0;
+
+	if (jffs2_sum_active() && (sumsize != JFFS2_SUMMARY_NOSUM_SIZE)) {
+							/* NOSUM_SIZE means not to generate summary */
+
+		if (jeb) {
+			reserved_size = PAD(sumsize + c->summary->sum_size + JFFS2_SUMMARY_FRAME_SIZE);
+			dbg_summary("minsize=%d , jeb->free=%d ,"
+						"summary->size=%d , sumsize=%d\n",
+						minsize, jeb->free_size,
+						c->summary->sum_size, sumsize);
+		}
+
+		/* Is there enough space for writing out the current node, or we have to
+		   write out summary information now, close this jeb and select new nextblock? */
+		if (jeb && (PAD(minsize) + PAD(c->summary->sum_size + sumsize +
+					JFFS2_SUMMARY_FRAME_SIZE) > jeb->free_size)) {
+
+			/* Has summary been disabled for this jeb? */
+			if (jffs2_sum_is_disabled(c->summary)) {
+				sumsize = JFFS2_SUMMARY_NOSUM_SIZE;
+				goto restart;
+			}
+
+			/* Writing out the collected summary information */
+			dbg_summary("generating summary for 0x%08x.\n", jeb->offset);
+			ret = jffs2_sum_write_sumnode(c);
+
+			if (ret)
+				return ret;
+
+			if (jffs2_sum_is_disabled(c->summary)) {
+				/* jffs2_write_sumnode() couldn't write out the summary information
+				   diabling summary for this jeb and free the collected information
+				 */
+				sumsize = JFFS2_SUMMARY_NOSUM_SIZE;
+				goto restart;
+			}
+
+			jffs2_close_nextblock(c, jeb);
+			jeb = NULL;
+			/* keep always valid value in reserved_size */
+			reserved_size = PAD(sumsize + c->summary->sum_size + JFFS2_SUMMARY_FRAME_SIZE);
+		}
+	} else {
+		if (jeb && minsize > jeb->free_size) {
+			uint32_t waste;
+
+			/* Skip the end of this block and file it as having some dirty space */
+			/* If there's a pending write to it, flush now */
+
+			if (jffs2_wbuf_dirty(c)) {
+				spin_unlock(&c->erase_completion_lock);
+				jffs2_dbg(1, "%s(): Flushing write buffer\n",
+					  __func__);
+				jffs2_flush_wbuf_pad(c);
+				spin_lock(&c->erase_completion_lock);
+				jeb = c->nextblock;
+				goto restart;
+			}
+
+			spin_unlock(&c->erase_completion_lock);
+
+			ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+
+			/* Just lock it again and continue. Nothing much can change because
+			   we hold c->alloc_sem anyway. In fact, it's not entirely clear why
+			   we hold c->erase_completion_lock in the majority of this function...
+			   but that's a question for another (more caffeine-rich) day. */
+			spin_lock(&c->erase_completion_lock);
+
+			if (ret)
+				return ret;
+
+			waste = jeb->free_size;
+			jffs2_link_node_ref(c, jeb,
+					    (jeb->offset + c->sector_size - waste) | REF_OBSOLETE,
+					    waste, NULL);
+			/* FIXME: that made it count as dirty. Convert to wasted */
+			jeb->dirty_size -= waste;
+			c->dirty_size -= waste;
+			jeb->wasted_size += waste;
+			c->wasted_size += waste;
+
+			jffs2_close_nextblock(c, jeb);
+			jeb = NULL;
+		}
+	}
+
+	if (!jeb) {
+
+		ret = jffs2_find_nextblock(c);
+		if (ret)
+			return ret;
+
+		jeb = c->nextblock;
+
+		if (jeb->free_size != c->sector_size - c->cleanmarker_size) {
+			pr_warn("Eep. Block 0x%08x taken from free_list had free_size of 0x%08x!!\n",
+				jeb->offset, jeb->free_size);
+			goto restart;
+		}
+	}
+	/* OK, jeb (==c->nextblock) is now pointing at a block which definitely has
+	   enough space */
+	*len = jeb->free_size - reserved_size;
+
+	if (c->cleanmarker_size && jeb->used_size == c->cleanmarker_size &&
+	    !jeb->first_node->next_in_ino) {
+		/* Only node in it beforehand was a CLEANMARKER node (we think).
+		   So mark it obsolete now that there's going to be another node
+		   in the block. This will reduce used_size to zero but We've
+		   already set c->nextblock so that jffs2_mark_node_obsolete()
+		   won't try to refile it to the dirty_list.
+		*/
+		spin_unlock(&c->erase_completion_lock);
+		jffs2_mark_node_obsolete(c, jeb->first_node);
+		spin_lock(&c->erase_completion_lock);
+	}
+
+	jffs2_dbg(1, "%s(): Giving 0x%x bytes at 0x%x\n",
+		  __func__,
+		  *len, jeb->offset + (c->sector_size - jeb->free_size));
+	return 0;
+}
+
+/**
+ *	jffs2_add_physical_node_ref - add a physical node reference to the list
+ *	@c: superblock info
+ *	@new: new node reference to add
+ *	@len: length of this physical node
+ *
+ *	Should only be used to report nodes for which space has been allocated
+ *	by jffs2_reserve_space.
+ *
+ *	Must be called with the alloc_sem held.
+ */
+
+struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
+						       uint32_t ofs, uint32_t len,
+						       struct jffs2_inode_cache *ic)
+{
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *new;
+
+	jeb = &c->blocks[ofs / c->sector_size];
+
+	jffs2_dbg(1, "%s(): Node at 0x%x(%d), size 0x%x\n",
+		  __func__, ofs & ~3, ofs & 3, len);
+#if 1
+	/* Allow non-obsolete nodes only to be added at the end of c->nextblock, 
+	   if c->nextblock is set. Note that wbuf.c will file obsolete nodes
+	   even after refiling c->nextblock */
+	if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE))
+	    && (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) {
+		pr_warn("argh. node added in wrong place at 0x%08x(%d)\n",
+			ofs & ~3, ofs & 3);
+		if (c->nextblock)
+			pr_warn("nextblock 0x%08x", c->nextblock->offset);
+		else
+			pr_warn("No nextblock");
+		pr_cont(", expected at %08x\n",
+			jeb->offset + (c->sector_size - jeb->free_size));
+		return ERR_PTR(-EINVAL);
+	}
+#endif
+	spin_lock(&c->erase_completion_lock);
+
+	new = jffs2_link_node_ref(c, jeb, ofs, len, ic);
+
+	if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) {
+		/* If it lives on the dirty_list, jffs2_reserve_space will put it there */
+		jffs2_dbg(1, "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n",
+			  jeb->offset, jeb->free_size, jeb->dirty_size,
+			  jeb->used_size);
+		if (jffs2_wbuf_dirty(c)) {
+			/* Flush the last write in the block if it's outstanding */
+			spin_unlock(&c->erase_completion_lock);
+			jffs2_flush_wbuf_pad(c);
+			spin_lock(&c->erase_completion_lock);
+		}
+
+		list_add_tail(&jeb->list, &c->clean_list);
+		c->nextblock = NULL;
+	}
+	jffs2_dbg_acct_sanity_check_nolock(c,jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+
+	spin_unlock(&c->erase_completion_lock);
+
+	return new;
+}
+
+
+void jffs2_complete_reservation(struct jffs2_sb_info *c)
+{
+	jffs2_dbg(1, "jffs2_complete_reservation()\n");
+	spin_lock(&c->erase_completion_lock);
+	jffs2_garbage_collect_trigger(c);
+	spin_unlock(&c->erase_completion_lock);
+	mutex_unlock(&c->alloc_sem);
+}
+
+static inline int on_list(struct list_head *obj, struct list_head *head)
+{
+	struct list_head *this;
+
+	list_for_each(this, head) {
+		if (this == obj) {
+			jffs2_dbg(1, "%p is on list at %p\n", obj, head);
+			return 1;
+
+		}
+	}
+	return 0;
+}
+
+void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref)
+{
+	struct jffs2_eraseblock *jeb;
+	int blocknr;
+	struct jffs2_unknown_node n;
+	int ret, addedsize;
+	size_t retlen;
+	uint32_t freed_len;
+
+	if(unlikely(!ref)) {
+		pr_notice("EEEEEK. jffs2_mark_node_obsolete called with NULL node\n");
+		return;
+	}
+	if (ref_obsolete(ref)) {
+		jffs2_dbg(1, "%s(): called with already obsolete node at 0x%08x\n",
+			  __func__, ref_offset(ref));
+		return;
+	}
+	blocknr = ref->flash_offset / c->sector_size;
+	if (blocknr >= c->nr_blocks) {
+		pr_notice("raw node at 0x%08x is off the end of device!\n",
+			  ref->flash_offset);
+		BUG();
+	}
+	jeb = &c->blocks[blocknr];
+
+	if (jffs2_can_mark_obsolete(c) && !jffs2_is_readonly(c) &&
+	    !(c->flags & (JFFS2_SB_FLAG_SCANNING | JFFS2_SB_FLAG_BUILDING))) {
+		/* Hm. This may confuse static lock analysis. If any of the above
+		   three conditions is false, we're going to return from this
+		   function without actually obliterating any nodes or freeing
+		   any jffs2_raw_node_refs. So we don't need to stop erases from
+		   happening, or protect against people holding an obsolete
+		   jffs2_raw_node_ref without the erase_completion_lock. */
+		mutex_lock(&c->erase_free_sem);
+	}
+
+	spin_lock(&c->erase_completion_lock);
+
+	freed_len = ref_totlen(c, jeb, ref);
+
+	if (ref_flags(ref) == REF_UNCHECKED) {
+		D1(if (unlikely(jeb->unchecked_size < freed_len)) {
+				pr_notice("raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n",
+					  freed_len, blocknr,
+					  ref->flash_offset, jeb->used_size);
+			BUG();
+		})
+			jffs2_dbg(1, "Obsoleting previously unchecked node at 0x%08x of len %x\n",
+				  ref_offset(ref), freed_len);
+		jeb->unchecked_size -= freed_len;
+		c->unchecked_size -= freed_len;
+	} else {
+		D1(if (unlikely(jeb->used_size < freed_len)) {
+				pr_notice("raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n",
+					  freed_len, blocknr,
+					  ref->flash_offset, jeb->used_size);
+			BUG();
+		})
+			jffs2_dbg(1, "Obsoleting node at 0x%08x of len %#x: ",
+				  ref_offset(ref), freed_len);
+		jeb->used_size -= freed_len;
+		c->used_size -= freed_len;
+	}
+
+	// Take care, that wasted size is taken into concern
+	if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) {
+		jffs2_dbg(1, "Dirtying\n");
+		addedsize = freed_len;
+		jeb->dirty_size += freed_len;
+		c->dirty_size += freed_len;
+
+		/* Convert wasted space to dirty, if not a bad block */
+		if (jeb->wasted_size) {
+			if (on_list(&jeb->list, &c->bad_used_list)) {
+				jffs2_dbg(1, "Leaving block at %08x on the bad_used_list\n",
+					  jeb->offset);
+				addedsize = 0; /* To fool the refiling code later */
+			} else {
+				jffs2_dbg(1, "Converting %d bytes of wasted space to dirty in block at %08x\n",
+					  jeb->wasted_size, jeb->offset);
+				addedsize += jeb->wasted_size;
+				jeb->dirty_size += jeb->wasted_size;
+				c->dirty_size += jeb->wasted_size;
+				c->wasted_size -= jeb->wasted_size;
+				jeb->wasted_size = 0;
+			}
+		}
+	} else {
+		jffs2_dbg(1, "Wasting\n");
+		addedsize = 0;
+		jeb->wasted_size += freed_len;
+		c->wasted_size += freed_len;
+	}
+	ref->flash_offset = ref_offset(ref) | REF_OBSOLETE;
+
+	jffs2_dbg_acct_sanity_check_nolock(c, jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+
+	if (c->flags & JFFS2_SB_FLAG_SCANNING) {
+		/* Flash scanning is in progress. Don't muck about with the block
+		   lists because they're not ready yet, and don't actually
+		   obliterate nodes that look obsolete. If they weren't
+		   marked obsolete on the flash at the time they _became_
+		   obsolete, there was probably a reason for that. */
+		spin_unlock(&c->erase_completion_lock);
+		/* We didn't lock the erase_free_sem */
+		return;
+	}
+
+	if (jeb == c->nextblock) {
+		jffs2_dbg(2, "Not moving nextblock 0x%08x to dirty/erase_pending list\n",
+			  jeb->offset);
+	} else if (!jeb->used_size && !jeb->unchecked_size) {
+		if (jeb == c->gcblock) {
+			jffs2_dbg(1, "gcblock at 0x%08x completely dirtied. Clearing gcblock...\n",
+				  jeb->offset);
+			c->gcblock = NULL;
+		} else {
+			jffs2_dbg(1, "Eraseblock at 0x%08x completely dirtied. Removing from (dirty?) list...\n",
+				  jeb->offset);
+			list_del(&jeb->list);
+		}
+		if (jffs2_wbuf_dirty(c)) {
+			jffs2_dbg(1, "...and adding to erasable_pending_wbuf_list\n");
+			list_add_tail(&jeb->list, &c->erasable_pending_wbuf_list);
+		} else {
+			if (jiffies & 127) {
+				/* Most of the time, we just erase it immediately. Otherwise we
+				   spend ages scanning it on mount, etc. */
+				jffs2_dbg(1, "...and adding to erase_pending_list\n");
+				list_add_tail(&jeb->list, &c->erase_pending_list);
+				c->nr_erasing_blocks++;
+				jffs2_garbage_collect_trigger(c);
+			} else {
+				/* Sometimes, however, we leave it elsewhere so it doesn't get
+				   immediately reused, and we spread the load a bit. */
+				jffs2_dbg(1, "...and adding to erasable_list\n");
+				list_add_tail(&jeb->list, &c->erasable_list);
+			}
+		}
+		jffs2_dbg(1, "Done OK\n");
+	} else if (jeb == c->gcblock) {
+		jffs2_dbg(2, "Not moving gcblock 0x%08x to dirty_list\n",
+			  jeb->offset);
+	} else if (ISDIRTY(jeb->dirty_size) && !ISDIRTY(jeb->dirty_size - addedsize)) {
+		jffs2_dbg(1, "Eraseblock at 0x%08x is freshly dirtied. Removing from clean list...\n",
+			  jeb->offset);
+		list_del(&jeb->list);
+		jffs2_dbg(1, "...and adding to dirty_list\n");
+		list_add_tail(&jeb->list, &c->dirty_list);
+	} else if (VERYDIRTY(c, jeb->dirty_size) &&
+		   !VERYDIRTY(c, jeb->dirty_size - addedsize)) {
+		jffs2_dbg(1, "Eraseblock at 0x%08x is now very dirty. Removing from dirty list...\n",
+			  jeb->offset);
+		list_del(&jeb->list);
+		jffs2_dbg(1, "...and adding to very_dirty_list\n");
+		list_add_tail(&jeb->list, &c->very_dirty_list);
+	} else {
+		jffs2_dbg(1, "Eraseblock at 0x%08x not moved anywhere. (free 0x%08x, dirty 0x%08x, used 0x%08x)\n",
+			  jeb->offset, jeb->free_size, jeb->dirty_size,
+			  jeb->used_size);
+	}
+
+	spin_unlock(&c->erase_completion_lock);
+
+	if (!jffs2_can_mark_obsolete(c) || jffs2_is_readonly(c) ||
+		(c->flags & JFFS2_SB_FLAG_BUILDING)) {
+		/* We didn't lock the erase_free_sem */
+		return;
+	}
+
+	/* The erase_free_sem is locked, and has been since before we marked the node obsolete
+	   and potentially put its eraseblock onto the erase_pending_list. Thus, we know that
+	   the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet
+	   by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */
+
+	jffs2_dbg(1, "obliterating obsoleted node at 0x%08x\n",
+		  ref_offset(ref));
+	ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n);
+	if (ret) {
+		pr_warn("Read error reading from obsoleted node at 0x%08x: %d\n",
+			ref_offset(ref), ret);
+		goto out_erase_sem;
+	}
+	if (retlen != sizeof(n)) {
+		pr_warn("Short read from obsoleted node at 0x%08x: %zd\n",
+			ref_offset(ref), retlen);
+		goto out_erase_sem;
+	}
+	if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) {
+		pr_warn("Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n",
+			je32_to_cpu(n.totlen), freed_len);
+		goto out_erase_sem;
+	}
+	if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) {
+		jffs2_dbg(1, "Node at 0x%08x was already marked obsolete (nodetype 0x%04x)\n",
+			  ref_offset(ref), je16_to_cpu(n.nodetype));
+		goto out_erase_sem;
+	}
+	/* XXX FIXME: This is ugly now */
+	n.nodetype = cpu_to_je16(je16_to_cpu(n.nodetype) & ~JFFS2_NODE_ACCURATE);
+	ret = jffs2_flash_write(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n);
+	if (ret) {
+		pr_warn("Write error in obliterating obsoleted node at 0x%08x: %d\n",
+			ref_offset(ref), ret);
+		goto out_erase_sem;
+	}
+	if (retlen != sizeof(n)) {
+		pr_warn("Short write in obliterating obsoleted node at 0x%08x: %zd\n",
+			ref_offset(ref), retlen);
+		goto out_erase_sem;
+	}
+
+	/* Nodes which have been marked obsolete no longer need to be
+	   associated with any inode. Remove them from the per-inode list.
+
+	   Note we can't do this for NAND at the moment because we need
+	   obsolete dirent nodes to stay on the lists, because of the
+	   horridness in jffs2_garbage_collect_deletion_dirent(). Also
+	   because we delete the inocache, and on NAND we need that to
+	   stay around until all the nodes are actually erased, in order
+	   to stop us from giving the same inode number to another newly
+	   created inode. */
+	if (ref->next_in_ino) {
+		struct jffs2_inode_cache *ic;
+		struct jffs2_raw_node_ref **p;
+
+		spin_lock(&c->erase_completion_lock);
+
+		ic = jffs2_raw_ref_to_ic(ref);
+		for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino))
+			;
+
+		*p = ref->next_in_ino;
+		ref->next_in_ino = NULL;
+
+		switch (ic->class) {
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case RAWNODE_CLASS_XATTR_DATUM:
+				jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+				break;
+			case RAWNODE_CLASS_XATTR_REF:
+				jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+				break;
+#endif
+			default:
+				if (ic->nodes == (void *)ic && ic->pino_nlink == 0)
+					jffs2_del_ino_cache(c, ic);
+				break;
+		}
+		spin_unlock(&c->erase_completion_lock);
+	}
+
+ out_erase_sem:
+	mutex_unlock(&c->erase_free_sem);
+}
+
+int jffs2_thread_should_wake(struct jffs2_sb_info *c)
+{
+	int ret = 0;
+	uint32_t dirty;
+	int nr_very_dirty = 0;
+	struct jffs2_eraseblock *jeb;
+
+	if (!list_empty(&c->erase_complete_list) ||
+	    !list_empty(&c->erase_pending_list))
+		return 1;
+
+	if (c->unchecked_size) {
+		jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
+			  c->unchecked_size, c->checked_ino);
+		return 1;
+	}
+
+	/* dirty_size contains blocks on erase_pending_list
+	 * those blocks are counted in c->nr_erasing_blocks.
+	 * If one block is actually erased, it is not longer counted as dirty_space
+	 * but it is counted in c->nr_erasing_blocks, so we add it and subtract it
+	 * with c->nr_erasing_blocks * c->sector_size again.
+	 * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks
+	 * This helps us to force gc and pick eventually a clean block to spread the load.
+	 */
+	dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size;
+
+	if (c->nr_free_blocks + c->nr_erasing_blocks < c->resv_blocks_gctrigger &&
+			(dirty > c->nospc_dirty_size))
+		ret = 1;
+
+	list_for_each_entry(jeb, &c->very_dirty_list, list) {
+		nr_very_dirty++;
+		if (nr_very_dirty == c->vdirty_blocks_gctrigger) {
+			ret = 1;
+			/* In debug mode, actually go through and count them all */
+			D1(continue);
+			break;
+		}
+	}
+
+	jffs2_dbg(1, "%s(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n",
+		  __func__, c->nr_free_blocks, c->nr_erasing_blocks,
+		  c->dirty_size, nr_very_dirty, ret ? "yes" : "no");
+
+	return ret;
+}
diff --git a/kernel/fs/jffs2/os-linux.h b/kernel/fs/jffs2/os-linux.h
new file mode 100644
index 000000000..d200a9b8f
--- /dev/null
+++ b/kernel/fs/jffs2/os-linux.h
@@ -0,0 +1,199 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#ifndef __JFFS2_OS_LINUX_H__
+#define __JFFS2_OS_LINUX_H__
+
+/* JFFS2 uses Linux mode bits natively -- no need for conversion */
+#define os_to_jffs2_mode(x) (x)
+#define jffs2_to_os_mode(x) (x)
+
+struct kstatfs;
+struct kvec;
+
+#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode))
+#define OFNI_EDONI_2SFFJ(f)  (&(f)->vfs_inode)
+#define JFFS2_SB_INFO(sb) (sb->s_fs_info)
+#define OFNI_BS_2SFFJ(c)  ((struct super_block *)c->os_priv)
+
+
+#define JFFS2_F_I_SIZE(f) (OFNI_EDONI_2SFFJ(f)->i_size)
+#define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode)
+#define JFFS2_F_I_UID(f) (i_uid_read(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_GID(f) (i_gid_read(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev)
+
+#define ITIME(sec) ((struct timespec){sec, 0})
+#define I_SEC(tv) ((tv).tv_sec)
+#define JFFS2_F_I_CTIME(f) (OFNI_EDONI_2SFFJ(f)->i_ctime.tv_sec)
+#define JFFS2_F_I_MTIME(f) (OFNI_EDONI_2SFFJ(f)->i_mtime.tv_sec)
+#define JFFS2_F_I_ATIME(f) (OFNI_EDONI_2SFFJ(f)->i_atime.tv_sec)
+
+#define sleep_on_spinunlock(wq, s)				\
+	do {							\
+		DECLARE_WAITQUEUE(__wait, current);		\
+		add_wait_queue((wq), &__wait);			\
+		set_current_state(TASK_UNINTERRUPTIBLE);	\
+		spin_unlock(s);					\
+		schedule();					\
+		remove_wait_queue((wq), &__wait);		\
+	} while(0)
+
+static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
+{
+	f->highest_version = 0;
+	f->fragtree = RB_ROOT;
+	f->metadata = NULL;
+	f->dents = NULL;
+	f->target = NULL;
+	f->flags = 0;
+	f->usercompr = 0;
+}
+
+
+#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY)
+
+#define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) )
+#ifndef CONFIG_JFFS2_FS_WRITEBUFFER
+
+
+#ifdef CONFIG_JFFS2_SUMMARY
+#define jffs2_can_mark_obsolete(c) (0)
+#else
+#define jffs2_can_mark_obsolete(c) (1)
+#endif
+
+#define jffs2_is_writebuffered(c) (0)
+#define jffs2_cleanmarker_oob(c) (0)
+#define jffs2_write_nand_cleanmarker(c,jeb) (-EIO)
+
+#define jffs2_flash_write(c, ofs, len, retlen, buf) jffs2_flash_direct_write(c, ofs, len, retlen, buf)
+#define jffs2_flash_read(c, ofs, len, retlen, buf) (mtd_read((c)->mtd, ofs, len, retlen, buf))
+#define jffs2_flush_wbuf_pad(c) ({ do{} while(0); (void)(c), 0; })
+#define jffs2_flush_wbuf_gc(c, i) ({ do{} while(0); (void)(c), (void) i, 0; })
+#define jffs2_write_nand_badblock(c,jeb,bad_offset) (1)
+#define jffs2_nand_flash_setup(c) (0)
+#define jffs2_nand_flash_cleanup(c) do {} while(0)
+#define jffs2_wbuf_dirty(c) (0)
+#define jffs2_flash_writev(a,b,c,d,e,f) jffs2_flash_direct_writev(a,b,c,d,e)
+#define jffs2_wbuf_timeout NULL
+#define jffs2_wbuf_process NULL
+#define jffs2_dataflash(c) (0)
+#define jffs2_dataflash_setup(c) (0)
+#define jffs2_dataflash_cleanup(c) do {} while (0)
+#define jffs2_nor_wbuf_flash(c) (0)
+#define jffs2_nor_wbuf_flash_setup(c) (0)
+#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
+#define jffs2_ubivol(c) (0)
+#define jffs2_ubivol_setup(c) (0)
+#define jffs2_ubivol_cleanup(c) do {} while (0)
+#define jffs2_dirty_trigger(c) do {} while (0)
+
+#else /* NAND and/or ECC'd NOR support present */
+
+#define jffs2_is_writebuffered(c) (c->wbuf != NULL)
+
+#ifdef CONFIG_JFFS2_SUMMARY
+#define jffs2_can_mark_obsolete(c) (0)
+#else
+#define jffs2_can_mark_obsolete(c) (c->mtd->flags & (MTD_BIT_WRITEABLE))
+#endif
+
+#define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH)
+
+#define jffs2_wbuf_dirty(c) (!!(c)->wbuf_len)
+
+/* wbuf.c */
+int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino);
+int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf);
+int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, u_char *buf);
+int jffs2_check_oob_empty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,int mode);
+int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset);
+void jffs2_wbuf_timeout(unsigned long data);
+void jffs2_wbuf_process(void *data);
+int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino);
+int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c);
+int jffs2_nand_flash_setup(struct jffs2_sb_info *c);
+void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
+
+#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
+int jffs2_dataflash_setup(struct jffs2_sb_info *c);
+void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
+#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME)
+int jffs2_ubivol_setup(struct jffs2_sb_info *c);
+void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
+
+#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
+int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
+void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
+void jffs2_dirty_trigger(struct jffs2_sb_info *c);
+
+#endif /* WRITEBUFFER */
+
+/* background.c */
+int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c);
+void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c);
+void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c);
+
+/* dir.c */
+extern const struct file_operations jffs2_dir_operations;
+extern const struct inode_operations jffs2_dir_inode_operations;
+
+/* file.c */
+extern const struct file_operations jffs2_file_operations;
+extern const struct inode_operations jffs2_file_inode_operations;
+extern const struct address_space_operations jffs2_file_address_operations;
+int jffs2_fsync(struct file *, loff_t, loff_t, int);
+int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
+
+/* ioctl.c */
+long jffs2_ioctl(struct file *, unsigned int, unsigned long);
+
+/* symlink.c */
+extern const struct inode_operations jffs2_symlink_inode_operations;
+
+/* fs.c */
+int jffs2_setattr (struct dentry *, struct iattr *);
+int jffs2_do_setattr (struct inode *, struct iattr *);
+struct inode *jffs2_iget(struct super_block *, unsigned long);
+void jffs2_evict_inode (struct inode *);
+void jffs2_dirty_inode(struct inode *inode, int flags);
+struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
+			       struct jffs2_raw_inode *ri);
+int jffs2_statfs (struct dentry *, struct kstatfs *);
+int jffs2_do_remount_fs(struct super_block *, int *, char *);
+int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
+void jffs2_gc_release_inode(struct jffs2_sb_info *c,
+			    struct jffs2_inode_info *f);
+struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c,
+					      int inum, int unlinked);
+
+unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
+				   struct jffs2_inode_info *f,
+				   unsigned long offset,
+				   unsigned long *priv);
+void jffs2_gc_release_page(struct jffs2_sb_info *c,
+			   unsigned char *pg,
+			   unsigned long *priv);
+void jffs2_flash_cleanup(struct jffs2_sb_info *c);
+
+
+/* writev.c */
+int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs,
+		       unsigned long count, loff_t to, size_t *retlen);
+int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
+			size_t *retlen, const u_char *buf);
+
+#endif /* __JFFS2_OS_LINUX_H__ */
+
+
diff --git a/kernel/fs/jffs2/read.c b/kernel/fs/jffs2/read.c
new file mode 100644
index 000000000..0b042b1fc
--- /dev/null
+++ b/kernel/fs/jffs2/read.c
@@ -0,0 +1,228 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include <linux/compiler.h>
+#include "nodelist.h"
+#include "compr.h"
+
+int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+		     struct jffs2_full_dnode *fd, unsigned char *buf,
+		     int ofs, int len)
+{
+	struct jffs2_raw_inode *ri;
+	size_t readlen;
+	uint32_t crc;
+	unsigned char *decomprbuf = NULL;
+	unsigned char *readbuf = NULL;
+	int ret = 0;
+
+	ri = jffs2_alloc_raw_inode();
+	if (!ri)
+		return -ENOMEM;
+
+	ret = jffs2_flash_read(c, ref_offset(fd->raw), sizeof(*ri), &readlen, (char *)ri);
+	if (ret) {
+		jffs2_free_raw_inode(ri);
+		pr_warn("Error reading node from 0x%08x: %d\n",
+			ref_offset(fd->raw), ret);
+		return ret;
+	}
+	if (readlen != sizeof(*ri)) {
+		jffs2_free_raw_inode(ri);
+		pr_warn("Short read from 0x%08x: wanted 0x%zx bytes, got 0x%zx\n",
+			ref_offset(fd->raw), sizeof(*ri), readlen);
+		return -EIO;
+	}
+	crc = crc32(0, ri, sizeof(*ri)-8);
+
+	jffs2_dbg(1, "Node read from %08x: node_crc %08x, calculated CRC %08x. dsize %x, csize %x, offset %x, buf %p\n",
+		  ref_offset(fd->raw), je32_to_cpu(ri->node_crc),
+		  crc, je32_to_cpu(ri->dsize), je32_to_cpu(ri->csize),
+		  je32_to_cpu(ri->offset), buf);
+	if (crc != je32_to_cpu(ri->node_crc)) {
+		pr_warn("Node CRC %08x != calculated CRC %08x for node at %08x\n",
+			je32_to_cpu(ri->node_crc), crc, ref_offset(fd->raw));
+		ret = -EIO;
+		goto out_ri;
+	}
+	/* There was a bug where we wrote hole nodes out with csize/dsize
+	   swapped. Deal with it */
+	if (ri->compr == JFFS2_COMPR_ZERO && !je32_to_cpu(ri->dsize) &&
+	    je32_to_cpu(ri->csize)) {
+		ri->dsize = ri->csize;
+		ri->csize = cpu_to_je32(0);
+	}
+
+	D1(if(ofs + len > je32_to_cpu(ri->dsize)) {
+			pr_warn("jffs2_read_dnode() asked for %d bytes at %d from %d-byte node\n",
+				len, ofs, je32_to_cpu(ri->dsize));
+		ret = -EINVAL;
+		goto out_ri;
+	});
+
+
+	if (ri->compr == JFFS2_COMPR_ZERO) {
+		memset(buf, 0, len);
+		goto out_ri;
+	}
+
+	/* Cases:
+	   Reading whole node and it's uncompressed - read directly to buffer provided, check CRC.
+	   Reading whole node and it's compressed - read into comprbuf, check CRC and decompress to buffer provided
+	   Reading partial node and it's uncompressed - read into readbuf, check CRC, and copy
+	   Reading partial node and it's compressed - read into readbuf, check checksum, decompress to decomprbuf and copy
+	*/
+	if (ri->compr == JFFS2_COMPR_NONE && len == je32_to_cpu(ri->dsize)) {
+		readbuf = buf;
+	} else {
+		readbuf = kmalloc(je32_to_cpu(ri->csize), GFP_KERNEL);
+		if (!readbuf) {
+			ret = -ENOMEM;
+			goto out_ri;
+		}
+	}
+	if (ri->compr != JFFS2_COMPR_NONE) {
+		if (len < je32_to_cpu(ri->dsize)) {
+			decomprbuf = kmalloc(je32_to_cpu(ri->dsize), GFP_KERNEL);
+			if (!decomprbuf) {
+				ret = -ENOMEM;
+				goto out_readbuf;
+			}
+		} else {
+			decomprbuf = buf;
+		}
+	} else {
+		decomprbuf = readbuf;
+	}
+
+	jffs2_dbg(2, "Read %d bytes to %p\n", je32_to_cpu(ri->csize),
+		  readbuf);
+	ret = jffs2_flash_read(c, (ref_offset(fd->raw)) + sizeof(*ri),
+			       je32_to_cpu(ri->csize), &readlen, readbuf);
+
+	if (!ret && readlen != je32_to_cpu(ri->csize))
+		ret = -EIO;
+	if (ret)
+		goto out_decomprbuf;
+
+	crc = crc32(0, readbuf, je32_to_cpu(ri->csize));
+	if (crc != je32_to_cpu(ri->data_crc)) {
+		pr_warn("Data CRC %08x != calculated CRC %08x for node at %08x\n",
+			je32_to_cpu(ri->data_crc), crc, ref_offset(fd->raw));
+		ret = -EIO;
+		goto out_decomprbuf;
+	}
+	jffs2_dbg(2, "Data CRC matches calculated CRC %08x\n", crc);
+	if (ri->compr != JFFS2_COMPR_NONE) {
+		jffs2_dbg(2, "Decompress %d bytes from %p to %d bytes at %p\n",
+			  je32_to_cpu(ri->csize), readbuf,
+			  je32_to_cpu(ri->dsize), decomprbuf);
+		ret = jffs2_decompress(c, f, ri->compr | (ri->usercompr << 8), readbuf, decomprbuf, je32_to_cpu(ri->csize), je32_to_cpu(ri->dsize));
+		if (ret) {
+			pr_warn("Error: jffs2_decompress returned %d\n", ret);
+			goto out_decomprbuf;
+		}
+	}
+
+	if (len < je32_to_cpu(ri->dsize)) {
+		memcpy(buf, decomprbuf+ofs, len);
+	}
+ out_decomprbuf:
+	if(decomprbuf != buf && decomprbuf != readbuf)
+		kfree(decomprbuf);
+ out_readbuf:
+	if(readbuf != buf)
+		kfree(readbuf);
+ out_ri:
+	jffs2_free_raw_inode(ri);
+
+	return ret;
+}
+
+int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+			   unsigned char *buf, uint32_t offset, uint32_t len)
+{
+	uint32_t end = offset + len;
+	struct jffs2_node_frag *frag;
+	int ret;
+
+	jffs2_dbg(1, "%s(): ino #%u, range 0x%08x-0x%08x\n",
+		  __func__, f->inocache->ino, offset, offset + len);
+
+	frag = jffs2_lookup_node_frag(&f->fragtree, offset);
+
+	/* XXX FIXME: Where a single physical node actually shows up in two
+	   frags, we read it twice. Don't do that. */
+	/* Now we're pointing at the first frag which overlaps our page
+	 * (or perhaps is before it, if we've been asked to read off the
+	 * end of the file). */
+	while(offset < end) {
+		jffs2_dbg(2, "%s(): offset %d, end %d\n",
+			  __func__, offset, end);
+		if (unlikely(!frag || frag->ofs > offset ||
+			     frag->ofs + frag->size <= offset)) {
+			uint32_t holesize = end - offset;
+			if (frag && frag->ofs > offset) {
+				jffs2_dbg(1, "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n",
+					  f->inocache->ino, frag->ofs, offset);
+				holesize = min(holesize, frag->ofs - offset);
+			}
+			jffs2_dbg(1, "Filling non-frag hole from %d-%d\n",
+				  offset, offset + holesize);
+			memset(buf, 0, holesize);
+			buf += holesize;
+			offset += holesize;
+			continue;
+		} else if (unlikely(!frag->node)) {
+			uint32_t holeend = min(end, frag->ofs + frag->size);
+			jffs2_dbg(1, "Filling frag hole from %d-%d (frag 0x%x 0x%x)\n",
+				  offset, holeend, frag->ofs,
+				  frag->ofs + frag->size);
+			memset(buf, 0, holeend - offset);
+			buf += holeend - offset;
+			offset = holeend;
+			frag = frag_next(frag);
+			continue;
+		} else {
+			uint32_t readlen;
+			uint32_t fragofs; /* offset within the frag to start reading */
+
+			fragofs = offset - frag->ofs;
+			readlen = min(frag->size - fragofs, end - offset);
+			jffs2_dbg(1, "Reading %d-%d from node at 0x%08x (%d)\n",
+				  frag->ofs+fragofs,
+				  frag->ofs + fragofs+readlen,
+				  ref_offset(frag->node->raw),
+				  ref_flags(frag->node->raw));
+			ret = jffs2_read_dnode(c, f, frag->node, buf, fragofs + frag->ofs - frag->node->ofs, readlen);
+			jffs2_dbg(2, "node read done\n");
+			if (ret) {
+				jffs2_dbg(1, "%s(): error %d\n",
+					  __func__, ret);
+				memset(buf, 0, readlen);
+				return ret;
+			}
+			buf += readlen;
+			offset += readlen;
+			frag = frag_next(frag);
+			jffs2_dbg(2, "node read was OK. Looping\n");
+		}
+	}
+	return 0;
+}
+
diff --git a/kernel/fs/jffs2/readinode.c b/kernel/fs/jffs2/readinode.c
new file mode 100644
index 000000000..dddbde4f5
--- /dev/null
+++ b/kernel/fs/jffs2/readinode.c
@@ -0,0 +1,1451 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/crc32.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include <linux/compiler.h>
+#include "nodelist.h"
+
+/*
+ * Check the data CRC of the node.
+ *
+ * Returns: 0 if the data CRC is correct;
+ * 	    1 - if incorrect;
+ *	    error code if an error occurred.
+ */
+static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
+{
+	struct jffs2_raw_node_ref *ref = tn->fn->raw;
+	int err = 0, pointed = 0;
+	struct jffs2_eraseblock *jeb;
+	unsigned char *buffer;
+	uint32_t crc, ofs, len;
+	size_t retlen;
+
+	BUG_ON(tn->csize == 0);
+
+	/* Calculate how many bytes were already checked */
+	ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode);
+	len = tn->csize;
+
+	if (jffs2_is_writebuffered(c)) {
+		int adj = ofs % c->wbuf_pagesize;
+		if (likely(adj))
+			adj = c->wbuf_pagesize - adj;
+
+		if (adj >= tn->csize) {
+			dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n",
+				      ref_offset(ref), tn->csize, ofs);
+			goto adj_acc;
+		}
+
+		ofs += adj;
+		len -= adj;
+	}
+
+	dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n",
+		ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len);
+
+#ifndef __ECOS
+	/* TODO: instead, incapsulate point() stuff to jffs2_flash_read(),
+	 * adding and jffs2_flash_read_end() interface. */
+	err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, NULL);
+	if (!err && retlen < len) {
+		JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
+		mtd_unpoint(c->mtd, ofs, retlen);
+	} else if (err) {
+		if (err != -EOPNOTSUPP)
+			JFFS2_WARNING("MTD point failed: error code %d.\n", err);
+	} else
+		pointed = 1; /* succefully pointed to device */
+#endif
+
+	if (!pointed) {
+		buffer = kmalloc(len, GFP_KERNEL);
+		if (unlikely(!buffer))
+			return -ENOMEM;
+
+		/* TODO: this is very frequent pattern, make it a separate
+		 * routine */
+		err = jffs2_flash_read(c, ofs, len, &retlen, buffer);
+		if (err) {
+			JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err);
+			goto free_out;
+		}
+
+		if (retlen != len) {
+			JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
+			err = -EIO;
+			goto free_out;
+		}
+	}
+
+	/* Continue calculating CRC */
+	crc = crc32(tn->partial_crc, buffer, len);
+	if(!pointed)
+		kfree(buffer);
+#ifndef __ECOS
+	else
+		mtd_unpoint(c->mtd, ofs, len);
+#endif
+
+	if (crc != tn->data_crc) {
+		JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
+			     ref_offset(ref), tn->data_crc, crc);
+		return 1;
+	}
+
+adj_acc:
+	jeb = &c->blocks[ref->flash_offset / c->sector_size];
+	len = ref_totlen(c, jeb, ref);
+	/* If it should be REF_NORMAL, it'll get marked as such when
+	   we build the fragtree, shortly. No need to worry about GC
+	   moving it while it's marked REF_PRISTINE -- GC won't happen
+	   till we've finished checking every inode anyway. */
+	ref->flash_offset |= REF_PRISTINE;
+	/*
+	 * Mark the node as having been checked and fix the
+	 * accounting accordingly.
+	 */
+	spin_lock(&c->erase_completion_lock);
+	jeb->used_size += len;
+	jeb->unchecked_size -= len;
+	c->used_size += len;
+	c->unchecked_size -= len;
+	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+	spin_unlock(&c->erase_completion_lock);
+
+	return 0;
+
+free_out:
+	if(!pointed)
+		kfree(buffer);
+#ifndef __ECOS
+	else
+		mtd_unpoint(c->mtd, ofs, len);
+#endif
+	return err;
+}
+
+/*
+ * Helper function for jffs2_add_older_frag_to_fragtree().
+ *
+ * Checks the node if we are in the checking stage.
+ */
+static int check_tn_node(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
+{
+	int ret;
+
+	BUG_ON(ref_obsolete(tn->fn->raw));
+
+	/* We only check the data CRC of unchecked nodes */
+	if (ref_flags(tn->fn->raw) != REF_UNCHECKED)
+		return 0;
+
+	dbg_readinode("check node %#04x-%#04x, phys offs %#08x\n",
+		      tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw));
+
+	ret = check_node_data(c, tn);
+	if (unlikely(ret < 0)) {
+		JFFS2_ERROR("check_node_data() returned error: %d.\n",
+			ret);
+	} else if (unlikely(ret > 0)) {
+		dbg_readinode("CRC error, mark it obsolete.\n");
+		jffs2_mark_node_obsolete(c, tn->fn->raw);
+	}
+
+	return ret;
+}
+
+static struct jffs2_tmp_dnode_info *jffs2_lookup_tn(struct rb_root *tn_root, uint32_t offset)
+{
+	struct rb_node *next;
+	struct jffs2_tmp_dnode_info *tn = NULL;
+
+	dbg_readinode("root %p, offset %d\n", tn_root, offset);
+
+	next = tn_root->rb_node;
+
+	while (next) {
+		tn = rb_entry(next, struct jffs2_tmp_dnode_info, rb);
+
+		if (tn->fn->ofs < offset)
+			next = tn->rb.rb_right;
+		else if (tn->fn->ofs >= offset)
+			next = tn->rb.rb_left;
+		else
+			break;
+	}
+
+	return tn;
+}
+
+
+static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
+{
+	jffs2_mark_node_obsolete(c, tn->fn->raw);
+	jffs2_free_full_dnode(tn->fn);
+	jffs2_free_tmp_dnode_info(tn);
+}
+/*
+ * This function is used when we read an inode. Data nodes arrive in
+ * arbitrary order -- they may be older or newer than the nodes which
+ * are already in the tree. Where overlaps occur, the older node can
+ * be discarded as long as the newer passes the CRC check. We don't
+ * bother to keep track of holes in this rbtree, and neither do we deal
+ * with frags -- we can have multiple entries starting at the same
+ * offset, and the one with the smallest length will come first in the
+ * ordering.
+ *
+ * Returns 0 if the node was handled (including marking it obsolete)
+ *	 < 0 an if error occurred
+ */
+static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
+				struct jffs2_readinode_info *rii,
+				struct jffs2_tmp_dnode_info *tn)
+{
+	uint32_t fn_end = tn->fn->ofs + tn->fn->size;
+	struct jffs2_tmp_dnode_info *this, *ptn;
+
+	dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw));
+
+	/* If a node has zero dsize, we only have to keep it if it might be the
+	   node with highest version -- i.e. the one which will end up as f->metadata.
+	   Note that such nodes won't be REF_UNCHECKED since there are no data to
+	   check anyway. */
+	if (!tn->fn->size) {
+		if (rii->mdata_tn) {
+			if (rii->mdata_tn->version < tn->version) {
+				/* We had a candidate mdata node already */
+				dbg_readinode("kill old mdata with ver %d\n", rii->mdata_tn->version);
+				jffs2_kill_tn(c, rii->mdata_tn);
+			} else {
+				dbg_readinode("kill new mdata with ver %d (older than existing %d\n",
+					      tn->version, rii->mdata_tn->version);
+				jffs2_kill_tn(c, tn);
+				return 0;
+			}
+		}
+		rii->mdata_tn = tn;
+		dbg_readinode("keep new mdata with ver %d\n", tn->version);
+		return 0;
+	}
+
+	/* Find the earliest node which _may_ be relevant to this one */
+	this = jffs2_lookup_tn(&rii->tn_root, tn->fn->ofs);
+	if (this) {
+		/* If the node is coincident with another at a lower address,
+		   back up until the other node is found. It may be relevant */
+		while (this->overlapped) {
+			ptn = tn_prev(this);
+			if (!ptn) {
+				/*
+				 * We killed a node which set the overlapped
+				 * flags during the scan. Fix it up.
+				 */
+				this->overlapped = 0;
+				break;
+			}
+			this = ptn;
+		}
+		dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
+	}
+
+	while (this) {
+		if (this->fn->ofs > fn_end)
+			break;
+		dbg_readinode("Ponder this ver %d, 0x%x-0x%x\n",
+			      this->version, this->fn->ofs, this->fn->size);
+
+		if (this->version == tn->version) {
+			/* Version number collision means REF_PRISTINE GC. Accept either of them
+			   as long as the CRC is correct. Check the one we have already...  */
+			if (!check_tn_node(c, this)) {
+				/* The one we already had was OK. Keep it and throw away the new one */
+				dbg_readinode("Like old node. Throw away new\n");
+				jffs2_kill_tn(c, tn);
+				return 0;
+			} else {
+				/* Who cares if the new one is good; keep it for now anyway. */
+				dbg_readinode("Like new node. Throw away old\n");
+				rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
+				jffs2_kill_tn(c, this);
+				/* Same overlapping from in front and behind */
+				return 0;
+			}
+		}
+		if (this->version < tn->version &&
+		    this->fn->ofs >= tn->fn->ofs &&
+		    this->fn->ofs + this->fn->size <= fn_end) {
+			/* New node entirely overlaps 'this' */
+			if (check_tn_node(c, tn)) {
+				dbg_readinode("new node bad CRC\n");
+				jffs2_kill_tn(c, tn);
+				return 0;
+			}
+			/* ... and is good. Kill 'this' and any subsequent nodes which are also overlapped */
+			while (this && this->fn->ofs + this->fn->size <= fn_end) {
+				struct jffs2_tmp_dnode_info *next = tn_next(this);
+				if (this->version < tn->version) {
+					tn_erase(this, &rii->tn_root);
+					dbg_readinode("Kill overlapped ver %d, 0x%x-0x%x\n",
+						      this->version, this->fn->ofs,
+						      this->fn->ofs+this->fn->size);
+					jffs2_kill_tn(c, this);
+				}
+				this = next;
+			}
+			dbg_readinode("Done killing overlapped nodes\n");
+			continue;
+		}
+		if (this->version > tn->version &&
+		    this->fn->ofs <= tn->fn->ofs &&
+		    this->fn->ofs+this->fn->size >= fn_end) {
+			/* New node entirely overlapped by 'this' */
+			if (!check_tn_node(c, this)) {
+				dbg_readinode("Good CRC on old node. Kill new\n");
+				jffs2_kill_tn(c, tn);
+				return 0;
+			}
+			/* ... but 'this' was bad. Replace it... */
+			dbg_readinode("Bad CRC on old overlapping node. Kill it\n");
+			tn_erase(this, &rii->tn_root);
+			jffs2_kill_tn(c, this);
+			break;
+		}
+
+		this = tn_next(this);
+	}
+
+	/* We neither completely obsoleted nor were completely
+	   obsoleted by an earlier node. Insert into the tree */
+	{
+		struct rb_node *parent;
+		struct rb_node **link = &rii->tn_root.rb_node;
+		struct jffs2_tmp_dnode_info *insert_point = NULL;
+
+		while (*link) {
+			parent = *link;
+			insert_point = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
+			if (tn->fn->ofs > insert_point->fn->ofs)
+				link = &insert_point->rb.rb_right;
+			else if (tn->fn->ofs < insert_point->fn->ofs ||
+				 tn->fn->size < insert_point->fn->size)
+				link = &insert_point->rb.rb_left;
+			else
+				link = &insert_point->rb.rb_right;
+		}
+		rb_link_node(&tn->rb, &insert_point->rb, link);
+		rb_insert_color(&tn->rb, &rii->tn_root);
+	}
+
+	/* If there's anything behind that overlaps us, note it */
+	this = tn_prev(tn);
+	if (this) {
+		while (1) {
+			if (this->fn->ofs + this->fn->size > tn->fn->ofs) {
+				dbg_readinode("Node is overlapped by %p (v %d, 0x%x-0x%x)\n",
+					      this, this->version, this->fn->ofs,
+					      this->fn->ofs+this->fn->size);
+				tn->overlapped = 1;
+				break;
+			}
+			if (!this->overlapped)
+				break;
+
+			ptn = tn_prev(this);
+			if (!ptn) {
+				/*
+				 * We killed a node which set the overlapped
+				 * flags during the scan. Fix it up.
+				 */
+				this->overlapped = 0;
+				break;
+			}
+			this = ptn;
+		}
+	}
+
+	/* If the new node overlaps anything ahead, note it */
+	this = tn_next(tn);
+	while (this && this->fn->ofs < fn_end) {
+		this->overlapped = 1;
+		dbg_readinode("Node ver %d, 0x%x-0x%x is overlapped\n",
+			      this->version, this->fn->ofs,
+			      this->fn->ofs+this->fn->size);
+		this = tn_next(this);
+	}
+	return 0;
+}
+
+/* Trivial function to remove the last node in the tree. Which by definition
+   has no right-hand child — so can be removed just by making its left-hand
+   child (if any) take its place under its parent. Since this is only done
+   when we're consuming the whole tree, there's no need to use rb_erase()
+   and let it worry about adjusting colours and balancing the tree. That
+   would just be a waste of time. */
+static void eat_last(struct rb_root *root, struct rb_node *node)
+{
+	struct rb_node *parent = rb_parent(node);
+	struct rb_node **link;
+
+	/* LAST! */
+	BUG_ON(node->rb_right);
+
+	if (!parent)
+		link = &root->rb_node;
+	else if (node == parent->rb_left)
+		link = &parent->rb_left;
+	else
+		link = &parent->rb_right;
+
+	*link = node->rb_left;
+	if (node->rb_left)
+		node->rb_left->__rb_parent_color = node->__rb_parent_color;
+}
+
+/* We put the version tree in reverse order, so we can use the same eat_last()
+   function that we use to consume the tmpnode tree (tn_root). */
+static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn)
+{
+	struct rb_node **link = &ver_root->rb_node;
+	struct rb_node *parent = NULL;
+	struct jffs2_tmp_dnode_info *this_tn;
+
+	while (*link) {
+		parent = *link;
+		this_tn = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
+
+		if (tn->version > this_tn->version)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+	dbg_readinode("Link new node at %p (root is %p)\n", link, ver_root);
+	rb_link_node(&tn->rb, parent, link);
+	rb_insert_color(&tn->rb, ver_root);
+}
+
+/* Build final, normal fragtree from tn tree. It doesn't matter which order
+   we add nodes to the real fragtree, as long as they don't overlap. And
+   having thrown away the majority of overlapped nodes as we went, there
+   really shouldn't be many sets of nodes which do overlap. If we start at
+   the end, we can use the overlap markers -- we can just eat nodes which
+   aren't overlapped, and when we encounter nodes which _do_ overlap we
+   sort them all into a temporary tree in version order before replaying them. */
+static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
+				      struct jffs2_inode_info *f,
+				      struct jffs2_readinode_info *rii)
+{
+	struct jffs2_tmp_dnode_info *pen, *last, *this;
+	struct rb_root ver_root = RB_ROOT;
+	uint32_t high_ver = 0;
+
+	if (rii->mdata_tn) {
+		dbg_readinode("potential mdata is ver %d at %p\n", rii->mdata_tn->version, rii->mdata_tn);
+		high_ver = rii->mdata_tn->version;
+		rii->latest_ref = rii->mdata_tn->fn->raw;
+	}
+#ifdef JFFS2_DBG_READINODE_MESSAGES
+	this = tn_last(&rii->tn_root);
+	while (this) {
+		dbg_readinode("tn %p ver %d range 0x%x-0x%x ov %d\n", this, this->version, this->fn->ofs,
+			      this->fn->ofs+this->fn->size, this->overlapped);
+		this = tn_prev(this);
+	}
+#endif
+	pen = tn_last(&rii->tn_root);
+	while ((last = pen)) {
+		pen = tn_prev(last);
+
+		eat_last(&rii->tn_root, &last->rb);
+		ver_insert(&ver_root, last);
+
+		if (unlikely(last->overlapped)) {
+			if (pen)
+				continue;
+			/*
+			 * We killed a node which set the overlapped
+			 * flags during the scan. Fix it up.
+			 */
+			last->overlapped = 0;
+		}
+
+		/* Now we have a bunch of nodes in reverse version
+		   order, in the tree at ver_root. Most of the time,
+		   there'll actually be only one node in the 'tree',
+		   in fact. */
+		this = tn_last(&ver_root);
+
+		while (this) {
+			struct jffs2_tmp_dnode_info *vers_next;
+			int ret;
+			vers_next = tn_prev(this);
+			eat_last(&ver_root, &this->rb);
+			if (check_tn_node(c, this)) {
+				dbg_readinode("node ver %d, 0x%x-0x%x failed CRC\n",
+					     this->version, this->fn->ofs,
+					     this->fn->ofs+this->fn->size);
+				jffs2_kill_tn(c, this);
+			} else {
+				if (this->version > high_ver) {
+					/* Note that this is different from the other
+					   highest_version, because this one is only
+					   counting _valid_ nodes which could give the
+					   latest inode metadata */
+					high_ver = this->version;
+					rii->latest_ref = this->fn->raw;
+				}
+				dbg_readinode("Add %p (v %d, 0x%x-0x%x, ov %d) to fragtree\n",
+					     this, this->version, this->fn->ofs,
+					     this->fn->ofs+this->fn->size, this->overlapped);
+
+				ret = jffs2_add_full_dnode_to_inode(c, f, this->fn);
+				if (ret) {
+					/* Free the nodes in vers_root; let the caller
+					   deal with the rest */
+					JFFS2_ERROR("Add node to tree failed %d\n", ret);
+					while (1) {
+						vers_next = tn_prev(this);
+						if (check_tn_node(c, this))
+							jffs2_mark_node_obsolete(c, this->fn->raw);
+						jffs2_free_full_dnode(this->fn);
+						jffs2_free_tmp_dnode_info(this);
+						this = vers_next;
+						if (!this)
+							break;
+						eat_last(&ver_root, &vers_next->rb);
+					}
+					return ret;
+				}
+				jffs2_free_tmp_dnode_info(this);
+			}
+			this = vers_next;
+		}
+	}
+	return 0;
+}
+
+static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
+{
+	struct jffs2_tmp_dnode_info *tn, *next;
+
+	rbtree_postorder_for_each_entry_safe(tn, next, list, rb) {
+			jffs2_free_full_dnode(tn->fn);
+			jffs2_free_tmp_dnode_info(tn);
+	}
+
+	*list = RB_ROOT;
+}
+
+static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
+{
+	struct jffs2_full_dirent *next;
+
+	while (fd) {
+		next = fd->next;
+		jffs2_free_full_dirent(fd);
+		fd = next;
+	}
+}
+
+/* Returns first valid node after 'ref'. May return 'ref' */
+static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_ref *ref)
+{
+	while (ref && ref->next_in_ino) {
+		if (!ref_obsolete(ref))
+			return ref;
+		dbg_noderef("node at 0x%08x is obsoleted. Ignoring.\n", ref_offset(ref));
+		ref = ref->next_in_ino;
+	}
+	return NULL;
+}
+
+/*
+ * Helper function for jffs2_get_inode_nodes().
+ * It is called every time an directory entry node is found.
+ *
+ * Returns: 0 on success;
+ * 	    negative error code on failure.
+ */
+static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
+				struct jffs2_raw_dirent *rd, size_t read,
+				struct jffs2_readinode_info *rii)
+{
+	struct jffs2_full_dirent *fd;
+	uint32_t crc;
+
+	/* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
+	BUG_ON(ref_obsolete(ref));
+
+	crc = crc32(0, rd, sizeof(*rd) - 8);
+	if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
+		JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n",
+			     ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
+		jffs2_mark_node_obsolete(c, ref);
+		return 0;
+	}
+
+	/* If we've never checked the CRCs on this node, check them now */
+	if (ref_flags(ref) == REF_UNCHECKED) {
+		struct jffs2_eraseblock *jeb;
+		int len;
+
+		/* Sanity check */
+		if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
+			JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
+				    ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
+			jffs2_mark_node_obsolete(c, ref);
+			return 0;
+		}
+
+		jeb = &c->blocks[ref->flash_offset / c->sector_size];
+		len = ref_totlen(c, jeb, ref);
+
+		spin_lock(&c->erase_completion_lock);
+		jeb->used_size += len;
+		jeb->unchecked_size -= len;
+		c->used_size += len;
+		c->unchecked_size -= len;
+		ref->flash_offset = ref_offset(ref) | dirent_node_state(rd);
+		spin_unlock(&c->erase_completion_lock);
+	}
+
+	fd = jffs2_alloc_full_dirent(rd->nsize + 1);
+	if (unlikely(!fd))
+		return -ENOMEM;
+
+	fd->raw = ref;
+	fd->version = je32_to_cpu(rd->version);
+	fd->ino = je32_to_cpu(rd->ino);
+	fd->type = rd->type;
+
+	if (fd->version > rii->highest_version)
+		rii->highest_version = fd->version;
+
+	/* Pick out the mctime of the latest dirent */
+	if(fd->version > rii->mctime_ver && je32_to_cpu(rd->mctime)) {
+		rii->mctime_ver = fd->version;
+		rii->latest_mctime = je32_to_cpu(rd->mctime);
+	}
+
+	/*
+	 * Copy as much of the name as possible from the raw
+	 * dirent we've already read from the flash.
+	 */
+	if (read > sizeof(*rd))
+		memcpy(&fd->name[0], &rd->name[0],
+		       min_t(uint32_t, rd->nsize, (read - sizeof(*rd)) ));
+
+	/* Do we need to copy any more of the name directly from the flash? */
+	if (rd->nsize + sizeof(*rd) > read) {
+		/* FIXME: point() */
+		int err;
+		int already = read - sizeof(*rd);
+
+		err = jffs2_flash_read(c, (ref_offset(ref)) + read,
+				rd->nsize - already, &read, &fd->name[already]);
+		if (unlikely(read != rd->nsize - already) && likely(!err))
+			return -EIO;
+
+		if (unlikely(err)) {
+			JFFS2_ERROR("read remainder of name: error %d\n", err);
+			jffs2_free_full_dirent(fd);
+			return -EIO;
+		}
+	}
+
+	fd->nhash = full_name_hash(fd->name, rd->nsize);
+	fd->next = NULL;
+	fd->name[rd->nsize] = '\0';
+
+	/*
+	 * Wheee. We now have a complete jffs2_full_dirent structure, with
+	 * the name in it and everything. Link it into the list
+	 */
+	jffs2_add_fd_to_list(c, fd, &rii->fds);
+
+	return 0;
+}
+
+/*
+ * Helper function for jffs2_get_inode_nodes().
+ * It is called every time an inode node is found.
+ *
+ * Returns: 0 on success (possibly after marking a bad node obsolete);
+ * 	    negative error code on failure.
+ */
+static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
+			     struct jffs2_raw_inode *rd, int rdlen,
+			     struct jffs2_readinode_info *rii)
+{
+	struct jffs2_tmp_dnode_info *tn;
+	uint32_t len, csize;
+	int ret = 0;
+	uint32_t crc;
+
+	/* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
+	BUG_ON(ref_obsolete(ref));
+
+	crc = crc32(0, rd, sizeof(*rd) - 8);
+	if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
+		JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n",
+			     ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
+		jffs2_mark_node_obsolete(c, ref);
+		return 0;
+	}
+
+	tn = jffs2_alloc_tmp_dnode_info();
+	if (!tn) {
+		JFFS2_ERROR("failed to allocate tn (%zu bytes).\n", sizeof(*tn));
+		return -ENOMEM;
+	}
+
+	tn->partial_crc = 0;
+	csize = je32_to_cpu(rd->csize);
+
+	/* If we've never checked the CRCs on this node, check them now */
+	if (ref_flags(ref) == REF_UNCHECKED) {
+
+		/* Sanity checks */
+		if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) ||
+		    unlikely(PAD(je32_to_cpu(rd->csize) + sizeof(*rd)) != PAD(je32_to_cpu(rd->totlen)))) {
+			JFFS2_WARNING("inode node header CRC is corrupted at %#08x\n", ref_offset(ref));
+			jffs2_dbg_dump_node(c, ref_offset(ref));
+			jffs2_mark_node_obsolete(c, ref);
+			goto free_out;
+		}
+
+		if (jffs2_is_writebuffered(c) && csize != 0) {
+			/* At this point we are supposed to check the data CRC
+			 * of our unchecked node. But thus far, we do not
+			 * know whether the node is valid or obsolete. To
+			 * figure this out, we need to walk all the nodes of
+			 * the inode and build the inode fragtree. We don't
+			 * want to spend time checking data of nodes which may
+			 * later be found to be obsolete. So we put off the full
+			 * data CRC checking until we have read all the inode
+			 * nodes and have started building the fragtree.
+			 *
+			 * The fragtree is being built starting with nodes
+			 * having the highest version number, so we'll be able
+			 * to detect whether a node is valid (i.e., it is not
+			 * overlapped by a node with higher version) or not.
+			 * And we'll be able to check only those nodes, which
+			 * are not obsolete.
+			 *
+			 * Of course, this optimization only makes sense in case
+			 * of NAND flashes (or other flashes with
+			 * !jffs2_can_mark_obsolete()), since on NOR flashes
+			 * nodes are marked obsolete physically.
+			 *
+			 * Since NAND flashes (or other flashes with
+			 * jffs2_is_writebuffered(c)) are anyway read by
+			 * fractions of c->wbuf_pagesize, and we have just read
+			 * the node header, it is likely that the starting part
+			 * of the node data is also read when we read the
+			 * header. So we don't mind to check the CRC of the
+			 * starting part of the data of the node now, and check
+			 * the second part later (in jffs2_check_node_data()).
+			 * Of course, we will not need to re-read and re-check
+			 * the NAND page which we have just read. This is why we
+			 * read the whole NAND page at jffs2_get_inode_nodes(),
+			 * while we needed only the node header.
+			 */
+			unsigned char *buf;
+
+			/* 'buf' will point to the start of data */
+			buf = (unsigned char *)rd + sizeof(*rd);
+			/* len will be the read data length */
+			len = min_t(uint32_t, rdlen - sizeof(*rd), csize);
+			tn->partial_crc = crc32(0, buf, len);
+
+			dbg_readinode("Calculates CRC (%#08x) for %d bytes, csize %d\n", tn->partial_crc, len, csize);
+
+			/* If we actually calculated the whole data CRC
+			 * and it is wrong, drop the node. */
+			if (len >= csize && unlikely(tn->partial_crc != je32_to_cpu(rd->data_crc))) {
+				JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
+					ref_offset(ref), tn->partial_crc, je32_to_cpu(rd->data_crc));
+				jffs2_mark_node_obsolete(c, ref);
+				goto free_out;
+			}
+
+		} else if (csize == 0) {
+			/*
+			 * We checked the header CRC. If the node has no data, adjust
+			 * the space accounting now. For other nodes this will be done
+			 * later either when the node is marked obsolete or when its
+			 * data is checked.
+			 */
+			struct jffs2_eraseblock *jeb;
+
+			dbg_readinode("the node has no data.\n");
+			jeb = &c->blocks[ref->flash_offset / c->sector_size];
+			len = ref_totlen(c, jeb, ref);
+
+			spin_lock(&c->erase_completion_lock);
+			jeb->used_size += len;
+			jeb->unchecked_size -= len;
+			c->used_size += len;
+			c->unchecked_size -= len;
+			ref->flash_offset = ref_offset(ref) | REF_NORMAL;
+			spin_unlock(&c->erase_completion_lock);
+		}
+	}
+
+	tn->fn = jffs2_alloc_full_dnode();
+	if (!tn->fn) {
+		JFFS2_ERROR("alloc fn failed\n");
+		ret = -ENOMEM;
+		goto free_out;
+	}
+
+	tn->version = je32_to_cpu(rd->version);
+	tn->fn->ofs = je32_to_cpu(rd->offset);
+	tn->data_crc = je32_to_cpu(rd->data_crc);
+	tn->csize = csize;
+	tn->fn->raw = ref;
+	tn->overlapped = 0;
+
+	if (tn->version > rii->highest_version)
+		rii->highest_version = tn->version;
+
+	/* There was a bug where we wrote hole nodes out with
+	   csize/dsize swapped. Deal with it */
+	if (rd->compr == JFFS2_COMPR_ZERO && !je32_to_cpu(rd->dsize) && csize)
+		tn->fn->size = csize;
+	else // normal case...
+		tn->fn->size = je32_to_cpu(rd->dsize);
+
+	dbg_readinode2("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n",
+		       ref_offset(ref), je32_to_cpu(rd->version),
+		       je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize);
+
+	ret = jffs2_add_tn_to_tree(c, rii, tn);
+
+	if (ret) {
+		jffs2_free_full_dnode(tn->fn);
+	free_out:
+		jffs2_free_tmp_dnode_info(tn);
+		return ret;
+	}
+#ifdef JFFS2_DBG_READINODE2_MESSAGES
+	dbg_readinode2("After adding ver %d:\n", je32_to_cpu(rd->version));
+	tn = tn_first(&rii->tn_root);
+	while (tn) {
+		dbg_readinode2("%p: v %d r 0x%x-0x%x ov %d\n",
+			       tn, tn->version, tn->fn->ofs,
+			       tn->fn->ofs+tn->fn->size, tn->overlapped);
+		tn = tn_next(tn);
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Helper function for jffs2_get_inode_nodes().
+ * It is called every time an unknown node is found.
+ *
+ * Returns: 0 on success;
+ * 	    negative error code on failure.
+ */
+static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, struct jffs2_unknown_node *un)
+{
+	/* We don't mark unknown nodes as REF_UNCHECKED */
+	if (ref_flags(ref) == REF_UNCHECKED) {
+		JFFS2_ERROR("REF_UNCHECKED but unknown node at %#08x\n",
+			    ref_offset(ref));
+		JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n",
+			    je16_to_cpu(un->magic), je16_to_cpu(un->nodetype),
+			    je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc));
+		jffs2_mark_node_obsolete(c, ref);
+		return 0;
+	}
+
+	un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype));
+
+	switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) {
+
+	case JFFS2_FEATURE_INCOMPAT:
+		JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n",
+			    je16_to_cpu(un->nodetype), ref_offset(ref));
+		/* EEP */
+		BUG();
+		break;
+
+	case JFFS2_FEATURE_ROCOMPAT:
+		JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n",
+			    je16_to_cpu(un->nodetype), ref_offset(ref));
+		BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO));
+		break;
+
+	case JFFS2_FEATURE_RWCOMPAT_COPY:
+		JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n",
+			     je16_to_cpu(un->nodetype), ref_offset(ref));
+		break;
+
+	case JFFS2_FEATURE_RWCOMPAT_DELETE:
+		JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
+			     je16_to_cpu(un->nodetype), ref_offset(ref));
+		jffs2_mark_node_obsolete(c, ref);
+		return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Helper function for jffs2_get_inode_nodes().
+ * The function detects whether more data should be read and reads it if yes.
+ *
+ * Returns: 0 on success;
+ * 	    negative error code on failure.
+ */
+static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
+		     int needed_len, int *rdlen, unsigned char *buf)
+{
+	int err, to_read = needed_len - *rdlen;
+	size_t retlen;
+	uint32_t offs;
+
+	if (jffs2_is_writebuffered(c)) {
+		int rem = to_read % c->wbuf_pagesize;
+
+		if (rem)
+			to_read += c->wbuf_pagesize - rem;
+	}
+
+	/* We need to read more data */
+	offs = ref_offset(ref) + *rdlen;
+
+	dbg_readinode("read more %d bytes\n", to_read);
+
+	err = jffs2_flash_read(c, offs, to_read, &retlen, buf + *rdlen);
+	if (err) {
+		JFFS2_ERROR("can not read %d bytes from 0x%08x, "
+			"error code: %d.\n", to_read, offs, err);
+		return err;
+	}
+
+	if (retlen < to_read) {
+		JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n",
+				offs, retlen, to_read);
+		return -EIO;
+	}
+
+	*rdlen += to_read;
+	return 0;
+}
+
+/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated
+   with this ino. Perform a preliminary ordering on data nodes, throwing away
+   those which are completely obsoleted by newer ones. The naïve approach we
+   use to take of just returning them _all_ in version order will cause us to
+   run out of memory in certain degenerate cases. */
+static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+				 struct jffs2_readinode_info *rii)
+{
+	struct jffs2_raw_node_ref *ref, *valid_ref;
+	unsigned char *buf = NULL;
+	union jffs2_node_union *node;
+	size_t retlen;
+	int len, err;
+
+	rii->mctime_ver = 0;
+
+	dbg_readinode("ino #%u\n", f->inocache->ino);
+
+	/* FIXME: in case of NOR and available ->point() this
+	 * needs to be fixed. */
+	len = sizeof(union jffs2_node_union) + c->wbuf_pagesize;
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	spin_lock(&c->erase_completion_lock);
+	valid_ref = jffs2_first_valid_node(f->inocache->nodes);
+	if (!valid_ref && f->inocache->ino != 1)
+		JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino);
+	while (valid_ref) {
+		/* We can hold a pointer to a non-obsolete node without the spinlock,
+		   but _obsolete_ nodes may disappear at any time, if the block
+		   they're in gets erased. So if we mark 'ref' obsolete while we're
+		   not holding the lock, it can go away immediately. For that reason,
+		   we find the next valid node first, before processing 'ref'.
+		*/
+		ref = valid_ref;
+		valid_ref = jffs2_first_valid_node(ref->next_in_ino);
+		spin_unlock(&c->erase_completion_lock);
+
+		cond_resched();
+
+		/*
+		 * At this point we don't know the type of the node we're going
+		 * to read, so we do not know the size of its header. In order
+		 * to minimize the amount of flash IO we assume the header is
+		 * of size = JFFS2_MIN_NODE_HEADER.
+		 */
+		len = JFFS2_MIN_NODE_HEADER;
+		if (jffs2_is_writebuffered(c)) {
+			int end, rem;
+
+			/*
+			 * We are about to read JFFS2_MIN_NODE_HEADER bytes,
+			 * but this flash has some minimal I/O unit. It is
+			 * possible that we'll need to read more soon, so read
+			 * up to the next min. I/O unit, in order not to
+			 * re-read the same min. I/O unit twice.
+			 */
+			end = ref_offset(ref) + len;
+			rem = end % c->wbuf_pagesize;
+			if (rem)
+				end += c->wbuf_pagesize - rem;
+			len = end - ref_offset(ref);
+		}
+
+		dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref));
+
+		/* FIXME: point() */
+		err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf);
+		if (err) {
+			JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ref_offset(ref), err);
+			goto free_out;
+		}
+
+		if (retlen < len) {
+			JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", ref_offset(ref), retlen, len);
+			err = -EIO;
+			goto free_out;
+		}
+
+		node = (union jffs2_node_union *)buf;
+
+		/* No need to mask in the valid bit; it shouldn't be invalid */
+		if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) {
+			JFFS2_NOTICE("Node header CRC failed at %#08x. {%04x,%04x,%08x,%08x}\n",
+				     ref_offset(ref), je16_to_cpu(node->u.magic),
+				     je16_to_cpu(node->u.nodetype),
+				     je32_to_cpu(node->u.totlen),
+				     je32_to_cpu(node->u.hdr_crc));
+			jffs2_dbg_dump_node(c, ref_offset(ref));
+			jffs2_mark_node_obsolete(c, ref);
+			goto cont;
+		}
+		if (je16_to_cpu(node->u.magic) != JFFS2_MAGIC_BITMASK) {
+			/* Not a JFFS2 node, whinge and move on */
+			JFFS2_NOTICE("Wrong magic bitmask 0x%04x in node header at %#08x.\n",
+				     je16_to_cpu(node->u.magic), ref_offset(ref));
+			jffs2_mark_node_obsolete(c, ref);
+			goto cont;
+		}
+
+		switch (je16_to_cpu(node->u.nodetype)) {
+
+		case JFFS2_NODETYPE_DIRENT:
+
+			if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent) &&
+			    len < sizeof(struct jffs2_raw_dirent)) {
+				err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf);
+				if (unlikely(err))
+					goto free_out;
+			}
+
+			err = read_direntry(c, ref, &node->d, retlen, rii);
+			if (unlikely(err))
+				goto free_out;
+
+			break;
+
+		case JFFS2_NODETYPE_INODE:
+
+			if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode) &&
+			    len < sizeof(struct jffs2_raw_inode)) {
+				err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf);
+				if (unlikely(err))
+					goto free_out;
+			}
+
+			err = read_dnode(c, ref, &node->i, len, rii);
+			if (unlikely(err))
+				goto free_out;
+
+			break;
+
+		default:
+			if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node) &&
+			    len < sizeof(struct jffs2_unknown_node)) {
+				err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf);
+				if (unlikely(err))
+					goto free_out;
+			}
+
+			err = read_unknown(c, ref, &node->u);
+			if (unlikely(err))
+				goto free_out;
+
+		}
+	cont:
+		spin_lock(&c->erase_completion_lock);
+	}
+
+	spin_unlock(&c->erase_completion_lock);
+	kfree(buf);
+
+	f->highest_version = rii->highest_version;
+
+	dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n",
+		      f->inocache->ino, rii->highest_version, rii->latest_mctime,
+		      rii->mctime_ver);
+	return 0;
+
+ free_out:
+	jffs2_free_tmp_dnode_info_list(&rii->tn_root);
+	jffs2_free_full_dirent_list(rii->fds);
+	rii->fds = NULL;
+	kfree(buf);
+	return err;
+}
+
+static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
+					struct jffs2_inode_info *f,
+					struct jffs2_raw_inode *latest_node)
+{
+	struct jffs2_readinode_info rii;
+	uint32_t crc, new_size;
+	size_t retlen;
+	int ret;
+
+	dbg_readinode("ino #%u pino/nlink is %d\n", f->inocache->ino,
+		      f->inocache->pino_nlink);
+
+	memset(&rii, 0, sizeof(rii));
+
+	/* Grab all nodes relevant to this ino */
+	ret = jffs2_get_inode_nodes(c, f, &rii);
+
+	if (ret) {
+		JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret);
+		if (f->inocache->state == INO_STATE_READING)
+			jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
+		return ret;
+	}
+
+	ret = jffs2_build_inode_fragtree(c, f, &rii);
+	if (ret) {
+		JFFS2_ERROR("Failed to build final fragtree for inode #%u: error %d\n",
+			    f->inocache->ino, ret);
+		if (f->inocache->state == INO_STATE_READING)
+			jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
+		jffs2_free_tmp_dnode_info_list(&rii.tn_root);
+		/* FIXME: We could at least crc-check them all */
+		if (rii.mdata_tn) {
+			jffs2_free_full_dnode(rii.mdata_tn->fn);
+			jffs2_free_tmp_dnode_info(rii.mdata_tn);
+			rii.mdata_tn = NULL;
+		}
+		return ret;
+	}
+
+	if (rii.mdata_tn) {
+		if (rii.mdata_tn->fn->raw == rii.latest_ref) {
+			f->metadata = rii.mdata_tn->fn;
+			jffs2_free_tmp_dnode_info(rii.mdata_tn);
+		} else {
+			jffs2_kill_tn(c, rii.mdata_tn);
+		}
+		rii.mdata_tn = NULL;
+	}
+
+	f->dents = rii.fds;
+
+	jffs2_dbg_fragtree_paranoia_check_nolock(f);
+
+	if (unlikely(!rii.latest_ref)) {
+		/* No data nodes for this inode. */
+		if (f->inocache->ino != 1) {
+			JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino);
+			if (!rii.fds) {
+				if (f->inocache->state == INO_STATE_READING)
+					jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
+				return -EIO;
+			}
+			JFFS2_NOTICE("but it has children so we fake some modes for it\n");
+		}
+		latest_node->mode = cpu_to_jemode(S_IFDIR|S_IRUGO|S_IWUSR|S_IXUGO);
+		latest_node->version = cpu_to_je32(0);
+		latest_node->atime = latest_node->ctime = latest_node->mtime = cpu_to_je32(0);
+		latest_node->isize = cpu_to_je32(0);
+		latest_node->gid = cpu_to_je16(0);
+		latest_node->uid = cpu_to_je16(0);
+		if (f->inocache->state == INO_STATE_READING)
+			jffs2_set_inocache_state(c, f->inocache, INO_STATE_PRESENT);
+		return 0;
+	}
+
+	ret = jffs2_flash_read(c, ref_offset(rii.latest_ref), sizeof(*latest_node), &retlen, (void *)latest_node);
+	if (ret || retlen != sizeof(*latest_node)) {
+		JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n",
+			ret, retlen, sizeof(*latest_node));
+		/* FIXME: If this fails, there seems to be a memory leak. Find it. */
+		mutex_unlock(&f->sem);
+		jffs2_do_clear_inode(c, f);
+		return ret?ret:-EIO;
+	}
+
+	crc = crc32(0, latest_node, sizeof(*latest_node)-8);
+	if (crc != je32_to_cpu(latest_node->node_crc)) {
+		JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n",
+			f->inocache->ino, ref_offset(rii.latest_ref));
+		mutex_unlock(&f->sem);
+		jffs2_do_clear_inode(c, f);
+		return -EIO;
+	}
+
+	switch(jemode_to_cpu(latest_node->mode) & S_IFMT) {
+	case S_IFDIR:
+		if (rii.mctime_ver > je32_to_cpu(latest_node->version)) {
+			/* The times in the latest_node are actually older than
+			   mctime in the latest dirent. Cheat. */
+			latest_node->ctime = latest_node->mtime = cpu_to_je32(rii.latest_mctime);
+		}
+		break;
+
+
+	case S_IFREG:
+		/* If it was a regular file, truncate it to the latest node's isize */
+		new_size = jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize));
+		if (new_size != je32_to_cpu(latest_node->isize)) {
+			JFFS2_WARNING("Truncating ino #%u to %d bytes failed because it only had %d bytes to start with!\n",
+				      f->inocache->ino, je32_to_cpu(latest_node->isize), new_size);
+			latest_node->isize = cpu_to_je32(new_size);
+		}
+		break;
+
+	case S_IFLNK:
+		/* Hack to work around broken isize in old symlink code.
+		   Remove this when dwmw2 comes to his senses and stops
+		   symlinks from being an entirely gratuitous special
+		   case. */
+		if (!je32_to_cpu(latest_node->isize))
+			latest_node->isize = latest_node->dsize;
+
+		if (f->inocache->state != INO_STATE_CHECKING) {
+			/* Symlink's inode data is the target path. Read it and
+			 * keep in RAM to facilitate quick follow symlink
+			 * operation. */
+			uint32_t csize = je32_to_cpu(latest_node->csize);
+			if (csize > JFFS2_MAX_NAME_LEN) {
+				mutex_unlock(&f->sem);
+				jffs2_do_clear_inode(c, f);
+				return -ENAMETOOLONG;
+			}
+			f->target = kmalloc(csize + 1, GFP_KERNEL);
+			if (!f->target) {
+				JFFS2_ERROR("can't allocate %u bytes of memory for the symlink target path cache\n", csize);
+				mutex_unlock(&f->sem);
+				jffs2_do_clear_inode(c, f);
+				return -ENOMEM;
+			}
+
+			ret = jffs2_flash_read(c, ref_offset(rii.latest_ref) + sizeof(*latest_node),
+					       csize, &retlen, (char *)f->target);
+
+			if (ret || retlen != csize) {
+				if (retlen != csize)
+					ret = -EIO;
+				kfree(f->target);
+				f->target = NULL;
+				mutex_unlock(&f->sem);
+				jffs2_do_clear_inode(c, f);
+				return ret;
+			}
+
+			f->target[csize] = '\0';
+			dbg_readinode("symlink's target '%s' cached\n", f->target);
+		}
+
+		/* fall through... */
+
+	case S_IFBLK:
+	case S_IFCHR:
+		/* Certain inode types should have only one data node, and it's
+		   kept as the metadata node */
+		if (f->metadata) {
+			JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n",
+			       f->inocache->ino, jemode_to_cpu(latest_node->mode));
+			mutex_unlock(&f->sem);
+			jffs2_do_clear_inode(c, f);
+			return -EIO;
+		}
+		if (!frag_first(&f->fragtree)) {
+			JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n",
+			       f->inocache->ino, jemode_to_cpu(latest_node->mode));
+			mutex_unlock(&f->sem);
+			jffs2_do_clear_inode(c, f);
+			return -EIO;
+		}
+		/* ASSERT: f->fraglist != NULL */
+		if (frag_next(frag_first(&f->fragtree))) {
+			JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n",
+			       f->inocache->ino, jemode_to_cpu(latest_node->mode));
+			/* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */
+			mutex_unlock(&f->sem);
+			jffs2_do_clear_inode(c, f);
+			return -EIO;
+		}
+		/* OK. We're happy */
+		f->metadata = frag_first(&f->fragtree)->node;
+		jffs2_free_node_frag(frag_first(&f->fragtree));
+		f->fragtree = RB_ROOT;
+		break;
+	}
+	if (f->inocache->state == INO_STATE_READING)
+		jffs2_set_inocache_state(c, f->inocache, INO_STATE_PRESENT);
+
+	return 0;
+}
+
+/* Scan the list of all nodes present for this ino, build map of versions, etc. */
+int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+			uint32_t ino, struct jffs2_raw_inode *latest_node)
+{
+	dbg_readinode("read inode #%u\n", ino);
+
+ retry_inocache:
+	spin_lock(&c->inocache_lock);
+	f->inocache = jffs2_get_ino_cache(c, ino);
+
+	if (f->inocache) {
+		/* Check its state. We may need to wait before we can use it */
+		switch(f->inocache->state) {
+		case INO_STATE_UNCHECKED:
+		case INO_STATE_CHECKEDABSENT:
+			f->inocache->state = INO_STATE_READING;
+			break;
+
+		case INO_STATE_CHECKING:
+		case INO_STATE_GC:
+			/* If it's in either of these states, we need
+			   to wait for whoever's got it to finish and
+			   put it back. */
+			dbg_readinode("waiting for ino #%u in state %d\n", ino, f->inocache->state);
+			sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
+			goto retry_inocache;
+
+		case INO_STATE_READING:
+		case INO_STATE_PRESENT:
+			/* Eep. This should never happen. It can
+			happen if Linux calls read_inode() again
+			before clear_inode() has finished though. */
+			JFFS2_ERROR("Eep. Trying to read_inode #%u when it's already in state %d!\n", ino, f->inocache->state);
+			/* Fail. That's probably better than allowing it to succeed */
+			f->inocache = NULL;
+			break;
+
+		default:
+			BUG();
+		}
+	}
+	spin_unlock(&c->inocache_lock);
+
+	if (!f->inocache && ino == 1) {
+		/* Special case - no root inode on medium */
+		f->inocache = jffs2_alloc_inode_cache();
+		if (!f->inocache) {
+			JFFS2_ERROR("cannot allocate inocache for root inode\n");
+			return -ENOMEM;
+		}
+		dbg_readinode("creating inocache for root inode\n");
+		memset(f->inocache, 0, sizeof(struct jffs2_inode_cache));
+		f->inocache->ino = f->inocache->pino_nlink = 1;
+		f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
+		f->inocache->state = INO_STATE_READING;
+		jffs2_add_ino_cache(c, f->inocache);
+	}
+	if (!f->inocache) {
+		JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino);
+		return -ENOENT;
+	}
+
+	return jffs2_do_read_inode_internal(c, f, latest_node);
+}
+
+int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	struct jffs2_raw_inode n;
+	struct jffs2_inode_info *f = kzalloc(sizeof(*f), GFP_KERNEL);
+	int ret;
+
+	if (!f)
+		return -ENOMEM;
+
+	mutex_init(&f->sem);
+	mutex_lock(&f->sem);
+	f->inocache = ic;
+
+	ret = jffs2_do_read_inode_internal(c, f, &n);
+	if (!ret) {
+		mutex_unlock(&f->sem);
+		jffs2_do_clear_inode(c, f);
+	}
+	jffs2_xattr_do_crccheck_inode(c, ic);
+	kfree (f);
+	return ret;
+}
+
+void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
+{
+	struct jffs2_full_dirent *fd, *fds;
+	int deleted;
+
+	jffs2_xattr_delete_inode(c, f->inocache);
+	mutex_lock(&f->sem);
+	deleted = f->inocache && !f->inocache->pino_nlink;
+
+	if (f->inocache && f->inocache->state != INO_STATE_CHECKING)
+		jffs2_set_inocache_state(c, f->inocache, INO_STATE_CLEARING);
+
+	if (f->metadata) {
+		if (deleted)
+			jffs2_mark_node_obsolete(c, f->metadata->raw);
+		jffs2_free_full_dnode(f->metadata);
+	}
+
+	jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL);
+
+	if (f->target) {
+		kfree(f->target);
+		f->target = NULL;
+	}
+
+	fds = f->dents;
+	while(fds) {
+		fd = fds;
+		fds = fd->next;
+		jffs2_free_full_dirent(fd);
+	}
+
+	if (f->inocache && f->inocache->state != INO_STATE_CHECKING) {
+		jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
+		if (f->inocache->nodes == (void *)f->inocache)
+			jffs2_del_ino_cache(c, f->inocache);
+	}
+
+	mutex_unlock(&f->sem);
+}
diff --git a/kernel/fs/jffs2/scan.c b/kernel/fs/jffs2/scan.c
new file mode 100644
index 000000000..9ad5ba4b2
--- /dev/null
+++ b/kernel/fs/jffs2/scan.c
@@ -0,0 +1,1176 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mtd/mtd.h>
+#include <linux/pagemap.h>
+#include <linux/crc32.h>
+#include <linux/compiler.h>
+#include "nodelist.h"
+#include "summary.h"
+#include "debug.h"
+
+#define DEFAULT_EMPTY_SCAN_SIZE 256
+
+#define noisy_printk(noise, fmt, ...)					\
+do {									\
+	if (*(noise)) {							\
+		pr_notice(fmt, ##__VA_ARGS__);				\
+		(*(noise))--;						\
+		if (!(*(noise)))					\
+			pr_notice("Further such events for this erase block will not be printed\n"); \
+	}								\
+} while (0)
+
+static uint32_t pseudo_random;
+
+static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				  unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s);
+
+/* These helper functions _must_ increase ofs and also do the dirty/used space accounting.
+ * Returning an error will abort the mount - bad checksums etc. should just mark the space
+ * as dirty.
+ */
+static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s);
+static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				 struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s);
+
+static inline int min_free(struct jffs2_sb_info *c)
+{
+	uint32_t min = 2 * sizeof(struct jffs2_raw_inode);
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	if (!jffs2_can_mark_obsolete(c) && min < c->wbuf_pagesize)
+		return c->wbuf_pagesize;
+#endif
+	return min;
+
+}
+
+static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) {
+	if (sector_size < DEFAULT_EMPTY_SCAN_SIZE)
+		return sector_size;
+	else
+		return DEFAULT_EMPTY_SCAN_SIZE;
+}
+
+static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	int ret;
+
+	if ((ret = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
+		return ret;
+	if ((ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size)))
+		return ret;
+	/* Turned wasted size into dirty, since we apparently 
+	   think it's recoverable now. */
+	jeb->dirty_size += jeb->wasted_size;
+	c->dirty_size += jeb->wasted_size;
+	c->wasted_size -= jeb->wasted_size;
+	jeb->wasted_size = 0;
+	if (VERYDIRTY(c, jeb->dirty_size)) {
+		list_add(&jeb->list, &c->very_dirty_list);
+	} else {
+		list_add(&jeb->list, &c->dirty_list);
+	}
+	return 0;
+}
+
+int jffs2_scan_medium(struct jffs2_sb_info *c)
+{
+	int i, ret;
+	uint32_t empty_blocks = 0, bad_blocks = 0;
+	unsigned char *flashbuf = NULL;
+	uint32_t buf_size = 0;
+	struct jffs2_summary *s = NULL; /* summary info collected by the scan process */
+#ifndef __ECOS
+	size_t pointlen, try_size;
+
+	ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen,
+			(void **)&flashbuf, NULL);
+	if (!ret && pointlen < c->mtd->size) {
+		/* Don't muck about if it won't let us point to the whole flash */
+		jffs2_dbg(1, "MTD point returned len too short: 0x%zx\n",
+			  pointlen);
+		mtd_unpoint(c->mtd, 0, pointlen);
+		flashbuf = NULL;
+	}
+	if (ret && ret != -EOPNOTSUPP)
+		jffs2_dbg(1, "MTD point failed %d\n", ret);
+#endif
+	if (!flashbuf) {
+		/* For NAND it's quicker to read a whole eraseblock at a time,
+		   apparently */
+		if (jffs2_cleanmarker_oob(c))
+			try_size = c->sector_size;
+		else
+			try_size = PAGE_SIZE;
+
+		jffs2_dbg(1, "Trying to allocate readbuf of %zu "
+			  "bytes\n", try_size);
+
+		flashbuf = mtd_kmalloc_up_to(c->mtd, &try_size);
+		if (!flashbuf)
+			return -ENOMEM;
+
+		jffs2_dbg(1, "Allocated readbuf of %zu bytes\n",
+			  try_size);
+
+		buf_size = (uint32_t)try_size;
+	}
+
+	if (jffs2_sum_active()) {
+		s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
+		if (!s) {
+			JFFS2_WARNING("Can't allocate memory for summary\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	for (i=0; i<c->nr_blocks; i++) {
+		struct jffs2_eraseblock *jeb = &c->blocks[i];
+
+		cond_resched();
+
+		/* reset summary info for next eraseblock scan */
+		jffs2_sum_reset_collected(s);
+
+		ret = jffs2_scan_eraseblock(c, jeb, buf_size?flashbuf:(flashbuf+jeb->offset),
+						buf_size, s);
+
+		if (ret < 0)
+			goto out;
+
+		jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+
+		/* Now decide which list to put it on */
+		switch(ret) {
+		case BLK_STATE_ALLFF:
+			/*
+			 * Empty block.   Since we can't be sure it
+			 * was entirely erased, we just queue it for erase
+			 * again.  It will be marked as such when the erase
+			 * is complete.  Meanwhile we still count it as empty
+			 * for later checks.
+			 */
+			empty_blocks++;
+			list_add(&jeb->list, &c->erase_pending_list);
+			c->nr_erasing_blocks++;
+			break;
+
+		case BLK_STATE_CLEANMARKER:
+			/* Only a CLEANMARKER node is valid */
+			if (!jeb->dirty_size) {
+				/* It's actually free */
+				list_add(&jeb->list, &c->free_list);
+				c->nr_free_blocks++;
+			} else {
+				/* Dirt */
+				jffs2_dbg(1, "Adding all-dirty block at 0x%08x to erase_pending_list\n",
+					  jeb->offset);
+				list_add(&jeb->list, &c->erase_pending_list);
+				c->nr_erasing_blocks++;
+			}
+			break;
+
+		case BLK_STATE_CLEAN:
+			/* Full (or almost full) of clean data. Clean list */
+			list_add(&jeb->list, &c->clean_list);
+			break;
+
+		case BLK_STATE_PARTDIRTY:
+			/* Some data, but not full. Dirty list. */
+			/* We want to remember the block with most free space
+			and stick it in the 'nextblock' position to start writing to it. */
+			if (jeb->free_size > min_free(c) &&
+					(!c->nextblock || c->nextblock->free_size < jeb->free_size)) {
+				/* Better candidate for the next writes to go to */
+				if (c->nextblock) {
+					ret = file_dirty(c, c->nextblock);
+					if (ret)
+						goto out;
+					/* deleting summary information of the old nextblock */
+					jffs2_sum_reset_collected(c->summary);
+				}
+				/* update collected summary information for the current nextblock */
+				jffs2_sum_move_collected(c, s);
+				jffs2_dbg(1, "%s(): new nextblock = 0x%08x\n",
+					  __func__, jeb->offset);
+				c->nextblock = jeb;
+			} else {
+				ret = file_dirty(c, jeb);
+				if (ret)
+					goto out;
+			}
+			break;
+
+		case BLK_STATE_ALLDIRTY:
+			/* Nothing valid - not even a clean marker. Needs erasing. */
+			/* For now we just put it on the erasing list. We'll start the erases later */
+			jffs2_dbg(1, "Erase block at 0x%08x is not formatted. It will be erased\n",
+				  jeb->offset);
+			list_add(&jeb->list, &c->erase_pending_list);
+			c->nr_erasing_blocks++;
+			break;
+
+		case BLK_STATE_BADBLOCK:
+			jffs2_dbg(1, "Block at 0x%08x is bad\n", jeb->offset);
+			list_add(&jeb->list, &c->bad_list);
+			c->bad_size += c->sector_size;
+			c->free_size -= c->sector_size;
+			bad_blocks++;
+			break;
+		default:
+			pr_warn("%s(): unknown block state\n", __func__);
+			BUG();
+		}
+	}
+
+	/* Nextblock dirty is always seen as wasted, because we cannot recycle it now */
+	if (c->nextblock && (c->nextblock->dirty_size)) {
+		c->nextblock->wasted_size += c->nextblock->dirty_size;
+		c->wasted_size += c->nextblock->dirty_size;
+		c->dirty_size -= c->nextblock->dirty_size;
+		c->nextblock->dirty_size = 0;
+	}
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) {
+		/* If we're going to start writing into a block which already
+		   contains data, and the end of the data isn't page-aligned,
+		   skip a little and align it. */
+
+		uint32_t skip = c->nextblock->free_size % c->wbuf_pagesize;
+
+		jffs2_dbg(1, "%s(): Skipping %d bytes in nextblock to ensure page alignment\n",
+			  __func__, skip);
+		jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+		jffs2_scan_dirty_space(c, c->nextblock, skip);
+	}
+#endif
+	if (c->nr_erasing_blocks) {
+		if ( !c->used_size && ((c->nr_free_blocks+empty_blocks+bad_blocks)!= c->nr_blocks || bad_blocks == c->nr_blocks) ) {
+			pr_notice("Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n");
+			pr_notice("empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n",
+				  empty_blocks, bad_blocks, c->nr_blocks);
+			ret = -EIO;
+			goto out;
+		}
+		spin_lock(&c->erase_completion_lock);
+		jffs2_garbage_collect_trigger(c);
+		spin_unlock(&c->erase_completion_lock);
+	}
+	ret = 0;
+ out:
+	if (buf_size)
+		kfree(flashbuf);
+#ifndef __ECOS
+	else
+		mtd_unpoint(c->mtd, 0, c->mtd->size);
+#endif
+	kfree(s);
+	return ret;
+}
+
+static int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf,
+			       uint32_t ofs, uint32_t len)
+{
+	int ret;
+	size_t retlen;
+
+	ret = jffs2_flash_read(c, ofs, len, &retlen, buf);
+	if (ret) {
+		jffs2_dbg(1, "mtd->read(0x%x bytes from 0x%x) returned %d\n",
+			  len, ofs, ret);
+		return ret;
+	}
+	if (retlen < len) {
+		jffs2_dbg(1, "Read at 0x%x gave only 0x%zx bytes\n",
+			  ofs, retlen);
+		return -EIO;
+	}
+	return 0;
+}
+
+int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	if ((jeb->used_size + jeb->unchecked_size) == PAD(c->cleanmarker_size) && !jeb->dirty_size
+	    && (!jeb->first_node || !ref_next(jeb->first_node)) )
+		return BLK_STATE_CLEANMARKER;
+
+	/* move blocks with max 4 byte dirty space to cleanlist */
+	else if (!ISDIRTY(c->sector_size - (jeb->used_size + jeb->unchecked_size))) {
+		c->dirty_size -= jeb->dirty_size;
+		c->wasted_size += jeb->dirty_size;
+		jeb->wasted_size += jeb->dirty_size;
+		jeb->dirty_size = 0;
+		return BLK_STATE_CLEAN;
+	} else if (jeb->used_size || jeb->unchecked_size)
+		return BLK_STATE_PARTDIRTY;
+	else
+		return BLK_STATE_ALLDIRTY;
+}
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				 struct jffs2_raw_xattr *rx, uint32_t ofs,
+				 struct jffs2_summary *s)
+{
+	struct jffs2_xattr_datum *xd;
+	uint32_t xid, version, totlen, crc;
+	int err;
+
+	crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4);
+	if (crc != je32_to_cpu(rx->node_crc)) {
+		JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			      ofs, je32_to_cpu(rx->node_crc), crc);
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
+			return err;
+		return 0;
+	}
+
+	xid = je32_to_cpu(rx->xid);
+	version = je32_to_cpu(rx->version);
+
+	totlen = PAD(sizeof(struct jffs2_raw_xattr)
+			+ rx->name_len + 1 + je16_to_cpu(rx->value_len));
+	if (totlen != je32_to_cpu(rx->totlen)) {
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
+			      ofs, je32_to_cpu(rx->totlen), totlen);
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
+			return err;
+		return 0;
+	}
+
+	xd = jffs2_setup_xattr_datum(c, xid, version);
+	if (IS_ERR(xd))
+		return PTR_ERR(xd);
+
+	if (xd->version > version) {
+		struct jffs2_raw_node_ref *raw
+			= jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL);
+		raw->next_in_ino = xd->node->next_in_ino;
+		xd->node->next_in_ino = raw;
+	} else {
+		xd->version = version;
+		xd->xprefix = rx->xprefix;
+		xd->name_len = rx->name_len;
+		xd->value_len = je16_to_cpu(rx->value_len);
+		xd->data_crc = je32_to_cpu(rx->data_crc);
+
+		jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, (void *)xd);
+	}
+
+	if (jffs2_sum_active())
+		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
+	dbg_xattr("scanning xdatum at %#08x (xid=%u, version=%u)\n",
+		  ofs, xd->xid, xd->version);
+	return 0;
+}
+
+static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				struct jffs2_raw_xref *rr, uint32_t ofs,
+				struct jffs2_summary *s)
+{
+	struct jffs2_xattr_ref *ref;
+	uint32_t crc;
+	int err;
+
+	crc = crc32(0, rr, sizeof(*rr) - 4);
+	if (crc != je32_to_cpu(rr->node_crc)) {
+		JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			      ofs, je32_to_cpu(rr->node_crc), crc);
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen)))))
+			return err;
+		return 0;
+	}
+
+	if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) {
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%zd\n",
+			      ofs, je32_to_cpu(rr->totlen),
+			      PAD(sizeof(struct jffs2_raw_xref)));
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rr->totlen))))
+			return err;
+		return 0;
+	}
+
+	ref = jffs2_alloc_xattr_ref();
+	if (!ref)
+		return -ENOMEM;
+
+	/* BEFORE jffs2_build_xattr_subsystem() called, 
+	 * and AFTER xattr_ref is marked as a dead xref,
+	 * ref->xid is used to store 32bit xid, xd is not used
+	 * ref->ino is used to store 32bit inode-number, ic is not used
+	 * Thoes variables are declared as union, thus using those
+	 * are exclusive. In a similar way, ref->next is temporarily
+	 * used to chain all xattr_ref object. It's re-chained to
+	 * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly.
+	 */
+	ref->ino = je32_to_cpu(rr->ino);
+	ref->xid = je32_to_cpu(rr->xid);
+	ref->xseqno = je32_to_cpu(rr->xseqno);
+	if (ref->xseqno > c->highest_xseqno)
+		c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER);
+	ref->next = c->xref_temp;
+	c->xref_temp = ref;
+
+	jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), (void *)ref);
+
+	if (jffs2_sum_active())
+		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
+	dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n",
+		  ofs, ref->xid, ref->ino);
+	return 0;
+}
+#endif
+
+/* Called with 'buf_size == 0' if buf is in fact a pointer _directly_ into
+   the flash, XIP-style */
+static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				  unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
+	struct jffs2_unknown_node *node;
+	struct jffs2_unknown_node crcnode;
+	uint32_t ofs, prevofs, max_ofs;
+	uint32_t hdr_crc, buf_ofs, buf_len;
+	int err;
+	int noise = 0;
+
+
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	int cleanmarkerfound = 0;
+#endif
+
+	ofs = jeb->offset;
+	prevofs = jeb->offset - 1;
+
+	jffs2_dbg(1, "%s(): Scanning block at 0x%x\n", __func__, ofs);
+
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	if (jffs2_cleanmarker_oob(c)) {
+		int ret;
+
+		if (mtd_block_isbad(c->mtd, jeb->offset))
+			return BLK_STATE_BADBLOCK;
+
+		ret = jffs2_check_nand_cleanmarker(c, jeb);
+		jffs2_dbg(2, "jffs_check_nand_cleanmarker returned %d\n", ret);
+
+		/* Even if it's not found, we still scan to see
+		   if the block is empty. We use this information
+		   to decide whether to erase it or not. */
+		switch (ret) {
+		case 0:		cleanmarkerfound = 1; break;
+		case 1: 	break;
+		default: 	return ret;
+		}
+	}
+#endif
+
+	if (jffs2_sum_active()) {
+		struct jffs2_sum_marker *sm;
+		void *sumptr = NULL;
+		uint32_t sumlen;
+	      
+		if (!buf_size) {
+			/* XIP case. Just look, point at the summary if it's there */
+			sm = (void *)buf + c->sector_size - sizeof(*sm);
+			if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
+				sumptr = buf + je32_to_cpu(sm->offset);
+				sumlen = c->sector_size - je32_to_cpu(sm->offset);
+			}
+		} else {
+			/* If NAND flash, read a whole page of it. Else just the end */
+			if (c->wbuf_pagesize)
+				buf_len = c->wbuf_pagesize;
+			else
+				buf_len = sizeof(*sm);
+
+			/* Read as much as we want into the _end_ of the preallocated buffer */
+			err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len, 
+						  jeb->offset + c->sector_size - buf_len,
+						  buf_len);				
+			if (err)
+				return err;
+
+			sm = (void *)buf + buf_size - sizeof(*sm);
+			if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
+				sumlen = c->sector_size - je32_to_cpu(sm->offset);
+				sumptr = buf + buf_size - sumlen;
+
+				/* sm->offset maybe wrong but MAGIC maybe right */
+				if (sumlen > c->sector_size)
+					goto full_scan;
+
+				/* Now, make sure the summary itself is available */
+				if (sumlen > buf_size) {
+					/* Need to kmalloc for this. */
+					sumptr = kmalloc(sumlen, GFP_KERNEL);
+					if (!sumptr)
+						return -ENOMEM;
+					memcpy(sumptr + sumlen - buf_len, buf + buf_size - buf_len, buf_len);
+				}
+				if (buf_len < sumlen) {
+					/* Need to read more so that the entire summary node is present */
+					err = jffs2_fill_scan_buf(c, sumptr, 
+								  jeb->offset + c->sector_size - sumlen,
+								  sumlen - buf_len);				
+					if (err)
+						return err;
+				}
+			}
+
+		}
+
+		if (sumptr) {
+			err = jffs2_sum_scan_sumnode(c, jeb, sumptr, sumlen, &pseudo_random);
+
+			if (buf_size && sumlen > buf_size)
+				kfree(sumptr);
+			/* If it returns with a real error, bail. 
+			   If it returns positive, that's a block classification
+			   (i.e. BLK_STATE_xxx) so return that too.
+			   If it returns zero, fall through to full scan. */
+			if (err)
+				return err;
+		}
+	}
+
+full_scan:
+	buf_ofs = jeb->offset;
+
+	if (!buf_size) {
+		/* This is the XIP case -- we're reading _directly_ from the flash chip */
+		buf_len = c->sector_size;
+	} else {
+		buf_len = EMPTY_SCAN_SIZE(c->sector_size);
+		err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len);
+		if (err)
+			return err;
+	}
+
+	/* We temporarily use 'ofs' as a pointer into the buffer/jeb */
+	ofs = 0;
+	max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
+	/* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */
+	while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
+		ofs += 4;
+
+	if (ofs == max_ofs) {
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+		if (jffs2_cleanmarker_oob(c)) {
+			/* scan oob, take care of cleanmarker */
+			int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound);
+			jffs2_dbg(2, "jffs2_check_oob_empty returned %d\n",
+				  ret);
+			switch (ret) {
+			case 0:		return cleanmarkerfound ? BLK_STATE_CLEANMARKER : BLK_STATE_ALLFF;
+			case 1: 	return BLK_STATE_ALLDIRTY;
+			default: 	return ret;
+			}
+		}
+#endif
+		jffs2_dbg(1, "Block at 0x%08x is empty (erased)\n",
+			  jeb->offset);
+		if (c->cleanmarker_size == 0)
+			return BLK_STATE_CLEANMARKER;	/* don't bother with re-erase */
+		else
+			return BLK_STATE_ALLFF;	/* OK to erase if all blocks are like this */
+	}
+	if (ofs) {
+		jffs2_dbg(1, "Free space at %08x ends at %08x\n", jeb->offset,
+			  jeb->offset + ofs);
+		if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
+			return err;
+		if ((err = jffs2_scan_dirty_space(c, jeb, ofs)))
+			return err;
+	}
+
+	/* Now ofs is a complete physical flash offset as it always was... */
+	ofs += jeb->offset;
+
+	noise = 10;
+
+	dbg_summary("no summary found in jeb 0x%08x. Apply original scan.\n",jeb->offset);
+
+scan_more:
+	while(ofs < jeb->offset + c->sector_size) {
+
+		jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+
+		/* Make sure there are node refs available for use */
+		err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+		if (err)
+			return err;
+
+		cond_resched();
+
+		if (ofs & 3) {
+			pr_warn("Eep. ofs 0x%08x not word-aligned!\n", ofs);
+			ofs = PAD(ofs);
+			continue;
+		}
+		if (ofs == prevofs) {
+			pr_warn("ofs 0x%08x has already been seen. Skipping\n",
+				ofs);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
+			ofs += 4;
+			continue;
+		}
+		prevofs = ofs;
+
+		if (jeb->offset + c->sector_size < ofs + sizeof(*node)) {
+			jffs2_dbg(1, "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n",
+				  sizeof(struct jffs2_unknown_node),
+				  jeb->offset, c->sector_size, ofs,
+				  sizeof(*node));
+			if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs)))
+				return err;
+			break;
+		}
+
+		if (buf_ofs + buf_len < ofs + sizeof(*node)) {
+			buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+			jffs2_dbg(1, "Fewer than %zd bytes (node header) left to end of buf. Reading 0x%x at 0x%08x\n",
+				  sizeof(struct jffs2_unknown_node),
+				  buf_len, ofs);
+			err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+			if (err)
+				return err;
+			buf_ofs = ofs;
+		}
+
+		node = (struct jffs2_unknown_node *)&buf[ofs-buf_ofs];
+
+		if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) {
+			uint32_t inbuf_ofs;
+			uint32_t empty_start, scan_end;
+
+			empty_start = ofs;
+			ofs += 4;
+			scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len);
+
+			jffs2_dbg(1, "Found empty flash at 0x%08x\n", ofs);
+		more_empty:
+			inbuf_ofs = ofs - buf_ofs;
+			while (inbuf_ofs < scan_end) {
+				if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) {
+					pr_warn("Empty flash at 0x%08x ends at 0x%08x\n",
+						empty_start, ofs);
+					if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start)))
+						return err;
+					goto scan_more;
+				}
+
+				inbuf_ofs+=4;
+				ofs += 4;
+			}
+			/* Ran off end. */
+			jffs2_dbg(1, "Empty flash to end of buffer at 0x%08x\n",
+				  ofs);
+
+			/* If we're only checking the beginning of a block with a cleanmarker,
+			   bail now */
+			if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) &&
+			    c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) {
+				jffs2_dbg(1, "%d bytes at start of block seems clean... assuming all clean\n",
+					  EMPTY_SCAN_SIZE(c->sector_size));
+				return BLK_STATE_CLEANMARKER;
+			}
+			if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */
+				scan_end = buf_len;
+				goto more_empty;
+			}
+			
+			/* See how much more there is to read in this eraseblock... */
+			buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+			if (!buf_len) {
+				/* No more to read. Break out of main loop without marking
+				   this range of empty space as dirty (because it's not) */
+				jffs2_dbg(1, "Empty flash at %08x runs to end of block. Treating as free_space\n",
+					  empty_start);
+				break;
+			}
+			/* point never reaches here */
+			scan_end = buf_len;
+			jffs2_dbg(1, "Reading another 0x%x at 0x%08x\n",
+				  buf_len, ofs);
+			err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+			if (err)
+				return err;
+			buf_ofs = ofs;
+			goto more_empty;
+		}
+
+		if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) {
+			pr_warn("Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n",
+				ofs);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
+			ofs += 4;
+			continue;
+		}
+		if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) {
+			jffs2_dbg(1, "Dirty bitmask at 0x%08x\n", ofs);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
+			ofs += 4;
+			continue;
+		}
+		if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) {
+			pr_warn("Old JFFS2 bitmask found at 0x%08x\n", ofs);
+			pr_warn("You cannot use older JFFS2 filesystems with newer kernels\n");
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
+			ofs += 4;
+			continue;
+		}
+		if (je16_to_cpu(node->magic) != JFFS2_MAGIC_BITMASK) {
+			/* OK. We're out of possibilities. Whinge and move on */
+			noisy_printk(&noise, "%s(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n",
+				     __func__,
+				     JFFS2_MAGIC_BITMASK, ofs,
+				     je16_to_cpu(node->magic));
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
+			ofs += 4;
+			continue;
+		}
+		/* We seem to have a node of sorts. Check the CRC */
+		crcnode.magic = node->magic;
+		crcnode.nodetype = cpu_to_je16( je16_to_cpu(node->nodetype) | JFFS2_NODE_ACCURATE);
+		crcnode.totlen = node->totlen;
+		hdr_crc = crc32(0, &crcnode, sizeof(crcnode)-4);
+
+		if (hdr_crc != je32_to_cpu(node->hdr_crc)) {
+			noisy_printk(&noise, "%s(): Node at 0x%08x {0x%04x, 0x%04x, 0x%08x) has invalid CRC 0x%08x (calculated 0x%08x)\n",
+				     __func__,
+				     ofs, je16_to_cpu(node->magic),
+				     je16_to_cpu(node->nodetype),
+				     je32_to_cpu(node->totlen),
+				     je32_to_cpu(node->hdr_crc),
+				     hdr_crc);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
+			ofs += 4;
+			continue;
+		}
+
+		if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) {
+			/* Eep. Node goes over the end of the erase block. */
+			pr_warn("Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
+				ofs, je32_to_cpu(node->totlen));
+			pr_warn("Perhaps the file system was created with the wrong erase size?\n");
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
+			ofs += 4;
+			continue;
+		}
+
+		if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) {
+			/* Wheee. This is an obsoleted node */
+			jffs2_dbg(2, "Node at 0x%08x is obsolete. Skipping\n",
+				  ofs);
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			continue;
+		}
+
+		switch(je16_to_cpu(node->nodetype)) {
+		case JFFS2_NODETYPE_INODE:
+			if (buf_ofs + buf_len < ofs + sizeof(struct jffs2_raw_inode)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				jffs2_dbg(1, "Fewer than %zd bytes (inode node) left to end of buf. Reading 0x%x at 0x%08x\n",
+					  sizeof(struct jffs2_raw_inode),
+					  buf_len, ofs);
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_inode_node(c, jeb, (void *)node, ofs, s);
+			if (err) return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+
+		case JFFS2_NODETYPE_DIRENT:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				jffs2_dbg(1, "Fewer than %d bytes (dirent node) left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len,
+					  ofs);
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_dirent_node(c, jeb, (void *)node, ofs, s);
+			if (err) return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				jffs2_dbg(1, "Fewer than %d bytes (xattr node) left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len,
+					  ofs);
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_xattr_node(c, jeb, (void *)node, ofs, s);
+			if (err)
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+		case JFFS2_NODETYPE_XREF:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				jffs2_dbg(1, "Fewer than %d bytes (xref node) left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len,
+					  ofs);
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_xref_node(c, jeb, (void *)node, ofs, s);
+			if (err)
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+#endif	/* CONFIG_JFFS2_FS_XATTR */
+
+		case JFFS2_NODETYPE_CLEANMARKER:
+			jffs2_dbg(1, "CLEANMARKER node found at 0x%08x\n", ofs);
+			if (je32_to_cpu(node->totlen) != c->cleanmarker_size) {
+				pr_notice("CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n",
+					  ofs, je32_to_cpu(node->totlen),
+					  c->cleanmarker_size);
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
+					return err;
+				ofs += PAD(sizeof(struct jffs2_unknown_node));
+			} else if (jeb->first_node) {
+				pr_notice("CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n",
+					  ofs, jeb->offset);
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
+					return err;
+				ofs += PAD(sizeof(struct jffs2_unknown_node));
+			} else {
+				jffs2_link_node_ref(c, jeb, ofs | REF_NORMAL, c->cleanmarker_size, NULL);
+
+				ofs += PAD(c->cleanmarker_size);
+			}
+			break;
+
+		case JFFS2_NODETYPE_PADDING:
+			if (jffs2_sum_active())
+				jffs2_sum_add_padding_mem(s, je32_to_cpu(node->totlen));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+
+		default:
+			switch (je16_to_cpu(node->nodetype) & JFFS2_COMPAT_MASK) {
+			case JFFS2_FEATURE_ROCOMPAT:
+				pr_notice("Read-only compatible feature node (0x%04x) found at offset 0x%08x\n",
+					  je16_to_cpu(node->nodetype), ofs);
+				c->flags |= JFFS2_SB_FLAG_RO;
+				if (!(jffs2_is_readonly(c)))
+					return -EROFS;
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+					return err;
+				ofs += PAD(je32_to_cpu(node->totlen));
+				break;
+
+			case JFFS2_FEATURE_INCOMPAT:
+				pr_notice("Incompatible feature node (0x%04x) found at offset 0x%08x\n",
+					  je16_to_cpu(node->nodetype), ofs);
+				return -EINVAL;
+
+			case JFFS2_FEATURE_RWCOMPAT_DELETE:
+				jffs2_dbg(1, "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n",
+					  je16_to_cpu(node->nodetype), ofs);
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+					return err;
+				ofs += PAD(je32_to_cpu(node->totlen));
+				break;
+
+			case JFFS2_FEATURE_RWCOMPAT_COPY: {
+				jffs2_dbg(1, "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n",
+					  je16_to_cpu(node->nodetype), ofs);
+
+				jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL);
+
+				/* We can't summarise nodes we don't grok */
+				jffs2_sum_disable_collecting(s);
+				ofs += PAD(je32_to_cpu(node->totlen));
+				break;
+				}
+			}
+		}
+	}
+
+	if (jffs2_sum_active()) {
+		if (PAD(s->sum_size + JFFS2_SUMMARY_FRAME_SIZE) > jeb->free_size) {
+			dbg_summary("There is not enough space for "
+				"summary information, disabling for this jeb!\n");
+			jffs2_sum_disable_collecting(s);
+		}
+	}
+
+	jffs2_dbg(1, "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n",
+		  jeb->offset, jeb->free_size, jeb->dirty_size,
+		  jeb->unchecked_size, jeb->used_size, jeb->wasted_size);
+	
+	/* mark_node_obsolete can add to wasted !! */
+	if (jeb->wasted_size) {
+		jeb->dirty_size += jeb->wasted_size;
+		c->dirty_size += jeb->wasted_size;
+		c->wasted_size -= jeb->wasted_size;
+		jeb->wasted_size = 0;
+	}
+
+	return jffs2_scan_classify_jeb(c, jeb);
+}
+
+struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino)
+{
+	struct jffs2_inode_cache *ic;
+
+	ic = jffs2_get_ino_cache(c, ino);
+	if (ic)
+		return ic;
+
+	if (ino > c->highest_ino)
+		c->highest_ino = ino;
+
+	ic = jffs2_alloc_inode_cache();
+	if (!ic) {
+		pr_notice("%s(): allocation of inode cache failed\n", __func__);
+		return NULL;
+	}
+	memset(ic, 0, sizeof(*ic));
+
+	ic->ino = ino;
+	ic->nodes = (void *)ic;
+	jffs2_add_ino_cache(c, ic);
+	if (ino == 1)
+		ic->pino_nlink = 1;
+	return ic;
+}
+
+static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s)
+{
+	struct jffs2_inode_cache *ic;
+	uint32_t crc, ino = je32_to_cpu(ri->ino);
+
+	jffs2_dbg(1, "%s(): Node at 0x%08x\n", __func__, ofs);
+
+	/* We do very little here now. Just check the ino# to which we should attribute
+	   this node; we can do all the CRC checking etc. later. There's a tradeoff here --
+	   we used to scan the flash once only, reading everything we want from it into
+	   memory, then building all our in-core data structures and freeing the extra
+	   information. Now we allow the first part of the mount to complete a lot quicker,
+	   but we have to go _back_ to the flash in order to finish the CRC checking, etc.
+	   Which means that the _full_ amount of time to get to proper write mode with GC
+	   operational may actually be _longer_ than before. Sucks to be me. */
+
+	/* Check the node CRC in any case. */
+	crc = crc32(0, ri, sizeof(*ri)-8);
+	if (crc != je32_to_cpu(ri->node_crc)) {
+		pr_notice("%s(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+			  __func__, ofs, je32_to_cpu(ri->node_crc), crc);
+		/*
+		 * We believe totlen because the CRC on the node
+		 * _header_ was OK, just the node itself failed.
+		 */
+		return jffs2_scan_dirty_space(c, jeb,
+					      PAD(je32_to_cpu(ri->totlen)));
+	}
+
+	ic = jffs2_get_ino_cache(c, ino);
+	if (!ic) {
+		ic = jffs2_scan_make_ino_cache(c, ino);
+		if (!ic)
+			return -ENOMEM;
+	}
+
+	/* Wheee. It worked */
+	jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic);
+
+	jffs2_dbg(1, "Node is ino #%u, version %d. Range 0x%x-0x%x\n",
+		  je32_to_cpu(ri->ino), je32_to_cpu(ri->version),
+		  je32_to_cpu(ri->offset),
+		  je32_to_cpu(ri->offset)+je32_to_cpu(ri->dsize));
+
+	pseudo_random += je32_to_cpu(ri->version);
+
+	if (jffs2_sum_active()) {
+		jffs2_sum_add_inode_mem(s, ri, ofs - jeb->offset);
+	}
+
+	return 0;
+}
+
+static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				  struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s)
+{
+	struct jffs2_full_dirent *fd;
+	struct jffs2_inode_cache *ic;
+	uint32_t checkedlen;
+	uint32_t crc;
+	int err;
+
+	jffs2_dbg(1, "%s(): Node at 0x%08x\n", __func__, ofs);
+
+	/* We don't get here unless the node is still valid, so we don't have to
+	   mask in the ACCURATE bit any more. */
+	crc = crc32(0, rd, sizeof(*rd)-8);
+
+	if (crc != je32_to_cpu(rd->node_crc)) {
+		pr_notice("%s(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+			  __func__, ofs, je32_to_cpu(rd->node_crc), crc);
+		/* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
+			return err;
+		return 0;
+	}
+
+	pseudo_random += je32_to_cpu(rd->version);
+
+	/* Should never happen. Did. (OLPC trac #4184)*/
+	checkedlen = strnlen(rd->name, rd->nsize);
+	if (checkedlen < rd->nsize) {
+		pr_err("Dirent at %08x has zeroes in name. Truncating to %d chars\n",
+		       ofs, checkedlen);
+	}
+	fd = jffs2_alloc_full_dirent(checkedlen+1);
+	if (!fd) {
+		return -ENOMEM;
+	}
+	memcpy(&fd->name, rd->name, checkedlen);
+	fd->name[checkedlen] = 0;
+
+	crc = crc32(0, fd->name, rd->nsize);
+	if (crc != je32_to_cpu(rd->name_crc)) {
+		pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+			  __func__, ofs, je32_to_cpu(rd->name_crc), crc);
+		jffs2_dbg(1, "Name for which CRC failed is (now) '%s', ino #%d\n",
+			  fd->name, je32_to_cpu(rd->ino));
+		jffs2_free_full_dirent(fd);
+		/* FIXME: Why do we believe totlen? */
+		/* We believe totlen because the CRC on the node _header_ was OK, just the name failed. */
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
+			return err;
+		return 0;
+	}
+	ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(rd->pino));
+	if (!ic) {
+		jffs2_free_full_dirent(fd);
+		return -ENOMEM;
+	}
+
+	fd->raw = jffs2_link_node_ref(c, jeb, ofs | dirent_node_state(rd),
+				      PAD(je32_to_cpu(rd->totlen)), ic);
+
+	fd->next = NULL;
+	fd->version = je32_to_cpu(rd->version);
+	fd->ino = je32_to_cpu(rd->ino);
+	fd->nhash = full_name_hash(fd->name, checkedlen);
+	fd->type = rd->type;
+	jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
+
+	if (jffs2_sum_active()) {
+		jffs2_sum_add_dirent_mem(s, rd, ofs - jeb->offset);
+	}
+
+	return 0;
+}
+
+static int count_list(struct list_head *l)
+{
+	uint32_t count = 0;
+	struct list_head *tmp;
+
+	list_for_each(tmp, l) {
+		count++;
+	}
+	return count;
+}
+
+/* Note: This breaks if list_empty(head). I don't care. You
+   might, if you copy this code and use it elsewhere :) */
+static void rotate_list(struct list_head *head, uint32_t count)
+{
+	struct list_head *n = head->next;
+
+	list_del(head);
+	while(count--) {
+		n = n->next;
+	}
+	list_add(head, n);
+}
+
+void jffs2_rotate_lists(struct jffs2_sb_info *c)
+{
+	uint32_t x;
+	uint32_t rotateby;
+
+	x = count_list(&c->clean_list);
+	if (x) {
+		rotateby = pseudo_random % x;
+		rotate_list((&c->clean_list), rotateby);
+	}
+
+	x = count_list(&c->very_dirty_list);
+	if (x) {
+		rotateby = pseudo_random % x;
+		rotate_list((&c->very_dirty_list), rotateby);
+	}
+
+	x = count_list(&c->dirty_list);
+	if (x) {
+		rotateby = pseudo_random % x;
+		rotate_list((&c->dirty_list), rotateby);
+	}
+
+	x = count_list(&c->erasable_list);
+	if (x) {
+		rotateby = pseudo_random % x;
+		rotate_list((&c->erasable_list), rotateby);
+	}
+
+	if (c->nr_erasing_blocks) {
+		rotateby = pseudo_random % c->nr_erasing_blocks;
+		rotate_list((&c->erase_pending_list), rotateby);
+	}
+
+	if (c->nr_free_blocks) {
+		rotateby = pseudo_random % c->nr_free_blocks;
+		rotate_list((&c->free_list), rotateby);
+	}
+}
diff --git a/kernel/fs/jffs2/security.c b/kernel/fs/jffs2/security.c
new file mode 100644
index 000000000..d4b43fb7a
--- /dev/null
+++ b/kernel/fs/jffs2/security.c
@@ -0,0 +1,89 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include <linux/security.h>
+#include "nodelist.h"
+
+/* ---- Initial Security Label(s) Attachment callback --- */
+static int jffs2_initxattrs(struct inode *inode,
+			    const struct xattr *xattr_array, void *fs_info)
+{
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
+					xattr->name, xattr->value,
+					xattr->value_len, 0);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+/* ---- Initial Security Label(s) Attachment ----------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir,
+			const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &jffs2_initxattrs, NULL);
+}
+
+/* ---- XATTR Handler for "security.*" ----------------- */
+static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
+				   void *buffer, size_t size, int type)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+				 name, buffer, size);
+}
+
+static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+				 name, buffer, size, flags);
+}
+
+static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen <= list_size) {
+		strcpy(list, XATTR_SECURITY_PREFIX);
+		strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+const struct xattr_handler jffs2_security_xattr_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list = jffs2_security_listxattr,
+	.set = jffs2_security_setxattr,
+	.get = jffs2_security_getxattr
+};
diff --git a/kernel/fs/jffs2/summary.c b/kernel/fs/jffs2/summary.c
new file mode 100644
index 000000000..bc5385471
--- /dev/null
+++ b/kernel/fs/jffs2/summary.c
@@ -0,0 +1,874 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2004  Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ *		     Zoltan Sogor <weth@inf.u-szeged.hu>,
+ *		     Patrik Kluba <pajko@halom.u-szeged.hu>,
+ *		     University of Szeged, Hungary
+ *	       2006  KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mtd/mtd.h>
+#include <linux/pagemap.h>
+#include <linux/crc32.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include "nodelist.h"
+#include "debug.h"
+
+int jffs2_sum_init(struct jffs2_sb_info *c)
+{
+	uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
+
+	c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
+
+	if (!c->summary) {
+		JFFS2_WARNING("Can't allocate memory for summary information!\n");
+		return -ENOMEM;
+	}
+
+	c->summary->sum_buf = kmalloc(sum_size, GFP_KERNEL);
+
+	if (!c->summary->sum_buf) {
+		JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n");
+		kfree(c->summary);
+		return -ENOMEM;
+	}
+
+	dbg_summary("returned successfully\n");
+
+	return 0;
+}
+
+void jffs2_sum_exit(struct jffs2_sb_info *c)
+{
+	dbg_summary("called\n");
+
+	jffs2_sum_disable_collecting(c->summary);
+
+	kfree(c->summary->sum_buf);
+	c->summary->sum_buf = NULL;
+
+	kfree(c->summary);
+	c->summary = NULL;
+}
+
+static int jffs2_sum_add_mem(struct jffs2_summary *s, union jffs2_sum_mem *item)
+{
+	if (!s->sum_list_head)
+		s->sum_list_head = (union jffs2_sum_mem *) item;
+	if (s->sum_list_tail)
+		s->sum_list_tail->u.next = (union jffs2_sum_mem *) item;
+	s->sum_list_tail = (union jffs2_sum_mem *) item;
+
+	switch (je16_to_cpu(item->u.nodetype)) {
+		case JFFS2_NODETYPE_INODE:
+			s->sum_size += JFFS2_SUMMARY_INODE_SIZE;
+			s->sum_num++;
+			dbg_summary("inode (%u) added to summary\n",
+						je32_to_cpu(item->i.inode));
+			break;
+		case JFFS2_NODETYPE_DIRENT:
+			s->sum_size += JFFS2_SUMMARY_DIRENT_SIZE(item->d.nsize);
+			s->sum_num++;
+			dbg_summary("dirent (%u) added to summary\n",
+						je32_to_cpu(item->d.ino));
+			break;
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR:
+			s->sum_size += JFFS2_SUMMARY_XATTR_SIZE;
+			s->sum_num++;
+			dbg_summary("xattr (xid=%u, version=%u) added to summary\n",
+				    je32_to_cpu(item->x.xid), je32_to_cpu(item->x.version));
+			break;
+		case JFFS2_NODETYPE_XREF:
+			s->sum_size += JFFS2_SUMMARY_XREF_SIZE;
+			s->sum_num++;
+			dbg_summary("xref added to summary\n");
+			break;
+#endif
+		default:
+			JFFS2_WARNING("UNKNOWN node type %u\n",
+					    je16_to_cpu(item->u.nodetype));
+			return 1;
+	}
+	return 0;
+}
+
+
+/* The following 3 functions are called from scan.c to collect summary info for not closed jeb */
+
+int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size)
+{
+	dbg_summary("called with %u\n", size);
+	s->sum_padded += size;
+	return 0;
+}
+
+int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri,
+				uint32_t ofs)
+{
+	struct jffs2_sum_inode_mem *temp = kmalloc(sizeof(struct jffs2_sum_inode_mem), GFP_KERNEL);
+
+	if (!temp)
+		return -ENOMEM;
+
+	temp->nodetype = ri->nodetype;
+	temp->inode = ri->ino;
+	temp->version = ri->version;
+	temp->offset = cpu_to_je32(ofs); /* relative offset from the beginning of the jeb */
+	temp->totlen = ri->totlen;
+	temp->next = NULL;
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+
+int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd,
+				uint32_t ofs)
+{
+	struct jffs2_sum_dirent_mem *temp =
+		kmalloc(sizeof(struct jffs2_sum_dirent_mem) + rd->nsize, GFP_KERNEL);
+
+	if (!temp)
+		return -ENOMEM;
+
+	temp->nodetype = rd->nodetype;
+	temp->totlen = rd->totlen;
+	temp->offset = cpu_to_je32(ofs);	/* relative from the beginning of the jeb */
+	temp->pino = rd->pino;
+	temp->version = rd->version;
+	temp->ino = rd->ino;
+	temp->nsize = rd->nsize;
+	temp->type = rd->type;
+	temp->next = NULL;
+
+	memcpy(temp->name, rd->name, rd->nsize);
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs)
+{
+	struct jffs2_sum_xattr_mem *temp;
+
+	temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+	if (!temp)
+		return -ENOMEM;
+
+	temp->nodetype = rx->nodetype;
+	temp->xid = rx->xid;
+	temp->version = rx->version;
+	temp->offset = cpu_to_je32(ofs);
+	temp->totlen = rx->totlen;
+	temp->next = NULL;
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs)
+{
+	struct jffs2_sum_xref_mem *temp;
+
+	temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+	if (!temp)
+		return -ENOMEM;
+
+	temp->nodetype = rr->nodetype;
+	temp->offset = cpu_to_je32(ofs);
+	temp->next = NULL;
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+#endif
+/* Cleanup every collected summary information */
+
+static void jffs2_sum_clean_collected(struct jffs2_summary *s)
+{
+	union jffs2_sum_mem *temp;
+
+	if (!s->sum_list_head) {
+		dbg_summary("already empty\n");
+	}
+	while (s->sum_list_head) {
+		temp = s->sum_list_head;
+		s->sum_list_head = s->sum_list_head->u.next;
+		kfree(temp);
+	}
+	s->sum_list_tail = NULL;
+	s->sum_padded = 0;
+	s->sum_num = 0;
+}
+
+void jffs2_sum_reset_collected(struct jffs2_summary *s)
+{
+	dbg_summary("called\n");
+	jffs2_sum_clean_collected(s);
+	s->sum_size = 0;
+}
+
+void jffs2_sum_disable_collecting(struct jffs2_summary *s)
+{
+	dbg_summary("called\n");
+	jffs2_sum_clean_collected(s);
+	s->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
+}
+
+int jffs2_sum_is_disabled(struct jffs2_summary *s)
+{
+	return (s->sum_size == JFFS2_SUMMARY_NOSUM_SIZE);
+}
+
+/* Move the collected summary information into sb (called from scan.c) */
+
+void jffs2_sum_move_collected(struct jffs2_sb_info *c, struct jffs2_summary *s)
+{
+	dbg_summary("oldsize=0x%x oldnum=%u => newsize=0x%x newnum=%u\n",
+				c->summary->sum_size, c->summary->sum_num,
+				s->sum_size, s->sum_num);
+
+	c->summary->sum_size = s->sum_size;
+	c->summary->sum_num = s->sum_num;
+	c->summary->sum_padded = s->sum_padded;
+	c->summary->sum_list_head = s->sum_list_head;
+	c->summary->sum_list_tail = s->sum_list_tail;
+
+	s->sum_list_head = s->sum_list_tail = NULL;
+}
+
+/* Called from wbuf.c to collect writed node info */
+
+int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
+				unsigned long count, uint32_t ofs)
+{
+	union jffs2_node_union *node;
+	struct jffs2_eraseblock *jeb;
+
+	if (c->summary->sum_size == JFFS2_SUMMARY_NOSUM_SIZE) {
+		dbg_summary("Summary is disabled for this jeb! Skipping summary info!\n");
+		return 0;
+	}
+
+	node = invecs[0].iov_base;
+	jeb = &c->blocks[ofs / c->sector_size];
+	ofs -= jeb->offset;
+
+	switch (je16_to_cpu(node->u.nodetype)) {
+		case JFFS2_NODETYPE_INODE: {
+			struct jffs2_sum_inode_mem *temp =
+				kmalloc(sizeof(struct jffs2_sum_inode_mem), GFP_KERNEL);
+
+			if (!temp)
+				goto no_mem;
+
+			temp->nodetype = node->i.nodetype;
+			temp->inode = node->i.ino;
+			temp->version = node->i.version;
+			temp->offset = cpu_to_je32(ofs);
+			temp->totlen = node->i.totlen;
+			temp->next = NULL;
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+
+		case JFFS2_NODETYPE_DIRENT: {
+			struct jffs2_sum_dirent_mem *temp =
+				kmalloc(sizeof(struct jffs2_sum_dirent_mem) + node->d.nsize, GFP_KERNEL);
+
+			if (!temp)
+				goto no_mem;
+
+			temp->nodetype = node->d.nodetype;
+			temp->totlen = node->d.totlen;
+			temp->offset = cpu_to_je32(ofs);
+			temp->pino = node->d.pino;
+			temp->version = node->d.version;
+			temp->ino = node->d.ino;
+			temp->nsize = node->d.nsize;
+			temp->type = node->d.type;
+			temp->next = NULL;
+
+			switch (count) {
+				case 1:
+					memcpy(temp->name,node->d.name,node->d.nsize);
+					break;
+
+				case 2:
+					memcpy(temp->name,invecs[1].iov_base,node->d.nsize);
+					break;
+
+				default:
+					BUG();	/* impossible count value */
+					break;
+			}
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR: {
+			struct jffs2_sum_xattr_mem *temp;
+			temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+			if (!temp)
+				goto no_mem;
+
+			temp->nodetype = node->x.nodetype;
+			temp->xid = node->x.xid;
+			temp->version = node->x.version;
+			temp->totlen = node->x.totlen;
+			temp->offset = cpu_to_je32(ofs);
+			temp->next = NULL;
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+		case JFFS2_NODETYPE_XREF: {
+			struct jffs2_sum_xref_mem *temp;
+			temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+			if (!temp)
+				goto no_mem;
+			temp->nodetype = node->r.nodetype;
+			temp->offset = cpu_to_je32(ofs);
+			temp->next = NULL;
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+#endif
+		case JFFS2_NODETYPE_PADDING:
+			dbg_summary("node PADDING\n");
+			c->summary->sum_padded += je32_to_cpu(node->u.totlen);
+			break;
+
+		case JFFS2_NODETYPE_CLEANMARKER:
+			dbg_summary("node CLEANMARKER\n");
+			break;
+
+		case JFFS2_NODETYPE_SUMMARY:
+			dbg_summary("node SUMMARY\n");
+			break;
+
+		default:
+			/* If you implement a new node type you should also implement
+			   summary support for it or disable summary.
+			*/
+			BUG();
+			break;
+	}
+
+	return 0;
+
+no_mem:
+	JFFS2_WARNING("MEMORY ALLOCATION ERROR!");
+	return -ENOMEM;
+}
+
+static struct jffs2_raw_node_ref *sum_link_node_ref(struct jffs2_sb_info *c,
+						    struct jffs2_eraseblock *jeb,
+						    uint32_t ofs, uint32_t len,
+						    struct jffs2_inode_cache *ic)
+{
+	/* If there was a gap, mark it dirty */
+	if ((ofs & ~3) > c->sector_size - jeb->free_size) {
+		/* Ew. Summary doesn't actually tell us explicitly about dirty space */
+		jffs2_scan_dirty_space(c, jeb, (ofs & ~3) - (c->sector_size - jeb->free_size));
+	}
+
+	return jffs2_link_node_ref(c, jeb, jeb->offset + ofs, len, ic);
+}
+
+/* Process the stored summary information - helper function for jffs2_sum_scan_sumnode() */
+
+static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				struct jffs2_raw_summary *summary, uint32_t *pseudo_random)
+{
+	struct jffs2_inode_cache *ic;
+	struct jffs2_full_dirent *fd;
+	void *sp;
+	int i, ino;
+	int err;
+
+	sp = summary->sum;
+
+	for (i=0; i<je32_to_cpu(summary->sum_num); i++) {
+		dbg_summary("processing summary index %d\n", i);
+
+		cond_resched();
+
+		/* Make sure there's a spare ref for dirty space */
+		err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+		if (err)
+			return err;
+
+		switch (je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype)) {
+			case JFFS2_NODETYPE_INODE: {
+				struct jffs2_sum_inode_flash *spi;
+				spi = sp;
+
+				ino = je32_to_cpu(spi->inode);
+
+				dbg_summary("Inode at 0x%08x-0x%08x\n",
+					    jeb->offset + je32_to_cpu(spi->offset),
+					    jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spi->totlen));
+
+				ic = jffs2_scan_make_ino_cache(c, ino);
+				if (!ic) {
+					JFFS2_NOTICE("scan_make_ino_cache failed\n");
+					return -ENOMEM;
+				}
+
+				sum_link_node_ref(c, jeb, je32_to_cpu(spi->offset) | REF_UNCHECKED,
+						  PAD(je32_to_cpu(spi->totlen)), ic);
+
+				*pseudo_random += je32_to_cpu(spi->version);
+
+				sp += JFFS2_SUMMARY_INODE_SIZE;
+
+				break;
+			}
+
+			case JFFS2_NODETYPE_DIRENT: {
+				struct jffs2_sum_dirent_flash *spd;
+				int checkedlen;
+				spd = sp;
+
+				dbg_summary("Dirent at 0x%08x-0x%08x\n",
+					    jeb->offset + je32_to_cpu(spd->offset),
+					    jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen));
+
+
+				/* This should never happen, but https://dev.laptop.org/ticket/4184 */
+				checkedlen = strnlen(spd->name, spd->nsize);
+				if (!checkedlen) {
+					pr_err("Dirent at %08x has zero at start of name. Aborting mount.\n",
+					       jeb->offset +
+					       je32_to_cpu(spd->offset));
+					return -EIO;
+				}
+				if (checkedlen < spd->nsize) {
+					pr_err("Dirent at %08x has zeroes in name. Truncating to %d chars\n",
+					       jeb->offset +
+					       je32_to_cpu(spd->offset),
+					       checkedlen);
+				}
+
+
+				fd = jffs2_alloc_full_dirent(checkedlen+1);
+				if (!fd)
+					return -ENOMEM;
+
+				memcpy(&fd->name, spd->name, checkedlen);
+				fd->name[checkedlen] = 0;
+
+				ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(spd->pino));
+				if (!ic) {
+					jffs2_free_full_dirent(fd);
+					return -ENOMEM;
+				}
+
+				fd->raw = sum_link_node_ref(c, jeb,  je32_to_cpu(spd->offset) | REF_UNCHECKED,
+							    PAD(je32_to_cpu(spd->totlen)), ic);
+
+				fd->next = NULL;
+				fd->version = je32_to_cpu(spd->version);
+				fd->ino = je32_to_cpu(spd->ino);
+				fd->nhash = full_name_hash(fd->name, checkedlen);
+				fd->type = spd->type;
+
+				jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
+
+				*pseudo_random += je32_to_cpu(spd->version);
+
+				sp += JFFS2_SUMMARY_DIRENT_SIZE(spd->nsize);
+
+				break;
+			}
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case JFFS2_NODETYPE_XATTR: {
+				struct jffs2_xattr_datum *xd;
+				struct jffs2_sum_xattr_flash *spx;
+
+				spx = (struct jffs2_sum_xattr_flash *)sp;
+				dbg_summary("xattr at %#08x-%#08x (xid=%u, version=%u)\n", 
+					    jeb->offset + je32_to_cpu(spx->offset),
+					    jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen),
+					    je32_to_cpu(spx->xid), je32_to_cpu(spx->version));
+
+				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
+								je32_to_cpu(spx->version));
+				if (IS_ERR(xd))
+					return PTR_ERR(xd);
+				if (xd->version > je32_to_cpu(spx->version)) {
+					/* node is not the newest one */
+					struct jffs2_raw_node_ref *raw
+						= sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+								    PAD(je32_to_cpu(spx->totlen)), NULL);
+					raw->next_in_ino = xd->node->next_in_ino;
+					xd->node->next_in_ino = raw;
+				} else {
+					xd->version = je32_to_cpu(spx->version);
+					sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+							  PAD(je32_to_cpu(spx->totlen)), (void *)xd);
+				}
+				*pseudo_random += je32_to_cpu(spx->xid);
+				sp += JFFS2_SUMMARY_XATTR_SIZE;
+
+				break;
+			}
+			case JFFS2_NODETYPE_XREF: {
+				struct jffs2_xattr_ref *ref;
+				struct jffs2_sum_xref_flash *spr;
+
+				spr = (struct jffs2_sum_xref_flash *)sp;
+				dbg_summary("xref at %#08x-%#08x\n",
+					    jeb->offset + je32_to_cpu(spr->offset),
+					    jeb->offset + je32_to_cpu(spr->offset) + 
+					    (uint32_t)PAD(sizeof(struct jffs2_raw_xref)));
+
+				ref = jffs2_alloc_xattr_ref();
+				if (!ref) {
+					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+					return -ENOMEM;
+				}
+				ref->next = c->xref_temp;
+				c->xref_temp = ref;
+
+				sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
+						  PAD(sizeof(struct jffs2_raw_xref)), (void *)ref);
+
+				*pseudo_random += ref->node->flash_offset;
+				sp += JFFS2_SUMMARY_XREF_SIZE;
+
+				break;
+			}
+#endif
+			default : {
+				uint16_t nodetype = je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype);
+				JFFS2_WARNING("Unsupported node type %x found in summary! Exiting...\n", nodetype);
+				if ((nodetype & JFFS2_COMPAT_MASK) == JFFS2_FEATURE_INCOMPAT)
+					return -EIO;
+
+				/* For compatible node types, just fall back to the full scan */
+				c->wasted_size -= jeb->wasted_size;
+				c->free_size += c->sector_size - jeb->free_size;
+				c->used_size -= jeb->used_size;
+				c->dirty_size -= jeb->dirty_size;
+				jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0;
+				jeb->free_size = c->sector_size;
+
+				jffs2_free_jeb_node_refs(c, jeb);
+				return -ENOTRECOVERABLE;
+			}
+		}
+	}
+	return 0;
+}
+
+/* Process the summary node - called from jffs2_scan_eraseblock() */
+int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			   struct jffs2_raw_summary *summary, uint32_t sumsize,
+			   uint32_t *pseudo_random)
+{
+	struct jffs2_unknown_node crcnode;
+	int ret, ofs;
+	uint32_t crc;
+
+	ofs = c->sector_size - sumsize;
+
+	dbg_summary("summary found for 0x%08x at 0x%08x (0x%x bytes)\n",
+		    jeb->offset, jeb->offset + ofs, sumsize);
+
+	/* OK, now check for node validity and CRC */
+	crcnode.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	crcnode.nodetype = cpu_to_je16(JFFS2_NODETYPE_SUMMARY);
+	crcnode.totlen = summary->totlen;
+	crc = crc32(0, &crcnode, sizeof(crcnode)-4);
+
+	if (je32_to_cpu(summary->hdr_crc) != crc) {
+		dbg_summary("Summary node header is corrupt (bad CRC or "
+				"no summary at all)\n");
+		goto crc_err;
+	}
+
+	if (je32_to_cpu(summary->totlen) != sumsize) {
+		dbg_summary("Summary node is corrupt (wrong erasesize?)\n");
+		goto crc_err;
+	}
+
+	crc = crc32(0, summary, sizeof(struct jffs2_raw_summary)-8);
+
+	if (je32_to_cpu(summary->node_crc) != crc) {
+		dbg_summary("Summary node is corrupt (bad CRC)\n");
+		goto crc_err;
+	}
+
+	crc = crc32(0, summary->sum, sumsize - sizeof(struct jffs2_raw_summary));
+
+	if (je32_to_cpu(summary->sum_crc) != crc) {
+		dbg_summary("Summary node data is corrupt (bad CRC)\n");
+		goto crc_err;
+	}
+
+	if ( je32_to_cpu(summary->cln_mkr) ) {
+
+		dbg_summary("Summary : CLEANMARKER node \n");
+
+		ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+		if (ret)
+			return ret;
+
+		if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) {
+			dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n",
+				je32_to_cpu(summary->cln_mkr), c->cleanmarker_size);
+			if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+				return ret;
+		} else if (jeb->first_node) {
+			dbg_summary("CLEANMARKER node not first node in block "
+					"(0x%08x)\n", jeb->offset);
+			if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+				return ret;
+		} else {
+			jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL,
+					    je32_to_cpu(summary->cln_mkr), NULL);
+		}
+	}
+
+	ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random);
+	/* -ENOTRECOVERABLE isn't a fatal error -- it means we should do a full
+	   scan of this eraseblock. So return zero */
+	if (ret == -ENOTRECOVERABLE)
+		return 0;
+	if (ret)
+		return ret;		/* real error */
+
+	/* for PARANOIA_CHECK */
+	ret = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+	if (ret)
+		return ret;
+
+	sum_link_node_ref(c, jeb, ofs | REF_NORMAL, sumsize, NULL);
+
+	if (unlikely(jeb->free_size)) {
+		JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n",
+			      jeb->free_size, jeb->offset);
+		jeb->wasted_size += jeb->free_size;
+		c->wasted_size += jeb->free_size;
+		c->free_size -= jeb->free_size;
+		jeb->free_size = 0;
+	}
+
+	return jffs2_scan_classify_jeb(c, jeb);
+
+crc_err:
+	JFFS2_WARNING("Summary node crc error, skipping summary information.\n");
+
+	return 0;
+}
+
+/* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */
+
+static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				uint32_t infosize, uint32_t datasize, int padsize)
+{
+	struct jffs2_raw_summary isum;
+	union jffs2_sum_mem *temp;
+	struct jffs2_sum_marker *sm;
+	struct kvec vecs[2];
+	uint32_t sum_ofs;
+	void *wpage;
+	int ret;
+	size_t retlen;
+
+	if (padsize + datasize > MAX_SUMMARY_SIZE) {
+		/* It won't fit in the buffer. Abort summary for this jeb */
+		jffs2_sum_disable_collecting(c->summary);
+
+		JFFS2_WARNING("Summary too big (%d data, %d pad) in eraseblock at %08x\n",
+			      datasize, padsize, jeb->offset);
+		/* Non-fatal */
+		return 0;
+	}
+	/* Is there enough space for summary? */
+	if (padsize < 0) {
+		/* don't try to write out summary for this jeb */
+		jffs2_sum_disable_collecting(c->summary);
+
+		JFFS2_WARNING("Not enough space for summary, padsize = %d\n",
+			      padsize);
+		/* Non-fatal */
+		return 0;
+	}
+
+	memset(c->summary->sum_buf, 0xff, datasize);
+	memset(&isum, 0, sizeof(isum));
+
+	isum.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	isum.nodetype = cpu_to_je16(JFFS2_NODETYPE_SUMMARY);
+	isum.totlen = cpu_to_je32(infosize);
+	isum.hdr_crc = cpu_to_je32(crc32(0, &isum, sizeof(struct jffs2_unknown_node) - 4));
+	isum.padded = cpu_to_je32(c->summary->sum_padded);
+	isum.cln_mkr = cpu_to_je32(c->cleanmarker_size);
+	isum.sum_num = cpu_to_je32(c->summary->sum_num);
+	wpage = c->summary->sum_buf;
+
+	while (c->summary->sum_num) {
+		temp = c->summary->sum_list_head;
+
+		switch (je16_to_cpu(temp->u.nodetype)) {
+			case JFFS2_NODETYPE_INODE: {
+				struct jffs2_sum_inode_flash *sino_ptr = wpage;
+
+				sino_ptr->nodetype = temp->i.nodetype;
+				sino_ptr->inode = temp->i.inode;
+				sino_ptr->version = temp->i.version;
+				sino_ptr->offset = temp->i.offset;
+				sino_ptr->totlen = temp->i.totlen;
+
+				wpage += JFFS2_SUMMARY_INODE_SIZE;
+
+				break;
+			}
+
+			case JFFS2_NODETYPE_DIRENT: {
+				struct jffs2_sum_dirent_flash *sdrnt_ptr = wpage;
+
+				sdrnt_ptr->nodetype = temp->d.nodetype;
+				sdrnt_ptr->totlen = temp->d.totlen;
+				sdrnt_ptr->offset = temp->d.offset;
+				sdrnt_ptr->pino = temp->d.pino;
+				sdrnt_ptr->version = temp->d.version;
+				sdrnt_ptr->ino = temp->d.ino;
+				sdrnt_ptr->nsize = temp->d.nsize;
+				sdrnt_ptr->type = temp->d.type;
+
+				memcpy(sdrnt_ptr->name, temp->d.name,
+							temp->d.nsize);
+
+				wpage += JFFS2_SUMMARY_DIRENT_SIZE(temp->d.nsize);
+
+				break;
+			}
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case JFFS2_NODETYPE_XATTR: {
+				struct jffs2_sum_xattr_flash *sxattr_ptr = wpage;
+
+				temp = c->summary->sum_list_head;
+				sxattr_ptr->nodetype = temp->x.nodetype;
+				sxattr_ptr->xid = temp->x.xid;
+				sxattr_ptr->version = temp->x.version;
+				sxattr_ptr->offset = temp->x.offset;
+				sxattr_ptr->totlen = temp->x.totlen;
+
+				wpage += JFFS2_SUMMARY_XATTR_SIZE;
+				break;
+			}
+			case JFFS2_NODETYPE_XREF: {
+				struct jffs2_sum_xref_flash *sxref_ptr = wpage;
+
+				temp = c->summary->sum_list_head;
+				sxref_ptr->nodetype = temp->r.nodetype;
+				sxref_ptr->offset = temp->r.offset;
+
+				wpage += JFFS2_SUMMARY_XREF_SIZE;
+				break;
+			}
+#endif
+			default : {
+				if ((je16_to_cpu(temp->u.nodetype) & JFFS2_COMPAT_MASK)
+				    == JFFS2_FEATURE_RWCOMPAT_COPY) {
+					dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n",
+						    je16_to_cpu(temp->u.nodetype));
+					jffs2_sum_disable_collecting(c->summary);
+				} else {
+					BUG();	/* unknown node in summary information */
+				}
+			}
+		}
+
+		c->summary->sum_list_head = temp->u.next;
+		kfree(temp);
+
+		c->summary->sum_num--;
+	}
+
+	jffs2_sum_reset_collected(c->summary);
+
+	wpage += padsize;
+
+	sm = wpage;
+	sm->offset = cpu_to_je32(c->sector_size - jeb->free_size);
+	sm->magic = cpu_to_je32(JFFS2_SUM_MAGIC);
+
+	isum.sum_crc = cpu_to_je32(crc32(0, c->summary->sum_buf, datasize));
+	isum.node_crc = cpu_to_je32(crc32(0, &isum, sizeof(isum) - 8));
+
+	vecs[0].iov_base = &isum;
+	vecs[0].iov_len = sizeof(isum);
+	vecs[1].iov_base = c->summary->sum_buf;
+	vecs[1].iov_len = datasize;
+
+	sum_ofs = jeb->offset + c->sector_size - jeb->free_size;
+
+	dbg_summary("writing out data to flash to pos : 0x%08x\n", sum_ofs);
+
+	ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0);
+
+	if (ret || (retlen != infosize)) {
+
+		JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n",
+			      infosize, sum_ofs, ret, retlen);
+
+		if (retlen) {
+			/* Waste remaining space */
+			spin_lock(&c->erase_completion_lock);
+			jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL);
+			spin_unlock(&c->erase_completion_lock);
+		}
+
+		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
+
+		return 0;
+	}
+
+	spin_lock(&c->erase_completion_lock);
+	jffs2_link_node_ref(c, jeb, sum_ofs | REF_NORMAL, infosize, NULL);
+	spin_unlock(&c->erase_completion_lock);
+
+	return 0;
+}
+
+/* Write out summary information - called from jffs2_do_reserve_space */
+
+int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
+	__must_hold(&c->erase_completion_block)
+{
+	int datasize, infosize, padsize;
+	struct jffs2_eraseblock *jeb;
+	int ret = 0;
+
+	dbg_summary("called\n");
+
+	spin_unlock(&c->erase_completion_lock);
+
+	jeb = c->nextblock;
+	jffs2_prealloc_raw_node_refs(c, jeb, 1);
+
+	if (!c->summary->sum_num || !c->summary->sum_list_head) {
+		JFFS2_WARNING("Empty summary info!!!\n");
+		BUG();
+	}
+
+	datasize = c->summary->sum_size + sizeof(struct jffs2_sum_marker);
+	infosize = sizeof(struct jffs2_raw_summary) + datasize;
+	padsize = jeb->free_size - infosize;
+	infosize += padsize;
+	datasize += padsize;
+
+	ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
+	spin_lock(&c->erase_completion_lock);
+	return ret;
+}
diff --git a/kernel/fs/jffs2/summary.h b/kernel/fs/jffs2/summary.h
new file mode 100644
index 000000000..60207a2ae
--- /dev/null
+++ b/kernel/fs/jffs2/summary.h
@@ -0,0 +1,213 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2004  Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ *		     Zoltan Sogor <weth@inf.u-szeged.hu>,
+ *		     Patrik Kluba <pajko@halom.u-szeged.hu>,
+ *		     University of Szeged, Hungary
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#ifndef JFFS2_SUMMARY_H
+#define JFFS2_SUMMARY_H
+
+/* Limit summary size to 64KiB so that we can kmalloc it. If the summary
+   is larger than that, we have to just ditch it and avoid using summary
+   for the eraseblock in question... and it probably doesn't hurt us much
+   anyway. */
+#define MAX_SUMMARY_SIZE 65536
+
+#include <linux/uio.h>
+#include <linux/jffs2.h>
+
+#define BLK_STATE_ALLFF		0
+#define BLK_STATE_CLEAN		1
+#define BLK_STATE_PARTDIRTY	2
+#define BLK_STATE_CLEANMARKER	3
+#define BLK_STATE_ALLDIRTY	4
+#define BLK_STATE_BADBLOCK	5
+
+#define JFFS2_SUMMARY_NOSUM_SIZE 0xffffffff
+#define JFFS2_SUMMARY_INODE_SIZE (sizeof(struct jffs2_sum_inode_flash))
+#define JFFS2_SUMMARY_DIRENT_SIZE(x) (sizeof(struct jffs2_sum_dirent_flash) + (x))
+#define JFFS2_SUMMARY_XATTR_SIZE (sizeof(struct jffs2_sum_xattr_flash))
+#define JFFS2_SUMMARY_XREF_SIZE (sizeof(struct jffs2_sum_xref_flash))
+
+/* Summary structures used on flash */
+
+struct jffs2_sum_unknown_flash
+{
+	jint16_t nodetype;	/* node type */
+};
+
+struct jffs2_sum_inode_flash
+{
+	jint16_t nodetype;	/* node type */
+	jint32_t inode;		/* inode number */
+	jint32_t version;	/* inode version */
+	jint32_t offset;	/* offset on jeb */
+	jint32_t totlen; 	/* record length */
+} __attribute__((packed));
+
+struct jffs2_sum_dirent_flash
+{
+	jint16_t nodetype;	/* == JFFS_NODETYPE_DIRENT */
+	jint32_t totlen;	/* record length */
+	jint32_t offset;	/* offset on jeb */
+	jint32_t pino;		/* parent inode */
+	jint32_t version;	/* dirent version */
+	jint32_t ino; 		/* == zero for unlink */
+	uint8_t nsize;		/* dirent name size */
+	uint8_t type;		/* dirent type */
+	uint8_t name[0];	/* dirent name */
+} __attribute__((packed));
+
+struct jffs2_sum_xattr_flash
+{
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XATR */
+	jint32_t xid;		/* xattr identifier */
+	jint32_t version;	/* version number */
+	jint32_t offset;	/* offset on jeb */
+	jint32_t totlen;	/* node length */
+} __attribute__((packed));
+
+struct jffs2_sum_xref_flash
+{
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XREF */
+	jint32_t offset;	/* offset on jeb */
+} __attribute__((packed));
+
+union jffs2_sum_flash
+{
+	struct jffs2_sum_unknown_flash u;
+	struct jffs2_sum_inode_flash i;
+	struct jffs2_sum_dirent_flash d;
+	struct jffs2_sum_xattr_flash x;
+	struct jffs2_sum_xref_flash r;
+};
+
+/* Summary structures used in the memory */
+
+struct jffs2_sum_unknown_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;	/* node type */
+};
+
+struct jffs2_sum_inode_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;	/* node type */
+	jint32_t inode;		/* inode number */
+	jint32_t version;	/* inode version */
+	jint32_t offset;	/* offset on jeb */
+	jint32_t totlen; 	/* record length */
+} __attribute__((packed));
+
+struct jffs2_sum_dirent_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;	/* == JFFS_NODETYPE_DIRENT */
+	jint32_t totlen;	/* record length */
+	jint32_t offset;	/* ofset on jeb */
+	jint32_t pino;		/* parent inode */
+	jint32_t version;	/* dirent version */
+	jint32_t ino; 		/* == zero for unlink */
+	uint8_t nsize;		/* dirent name size */
+	uint8_t type;		/* dirent type */
+	uint8_t name[0];	/* dirent name */
+} __attribute__((packed));
+
+struct jffs2_sum_xattr_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;
+	jint32_t xid;
+	jint32_t version;
+	jint32_t offset;
+	jint32_t totlen;
+} __attribute__((packed));
+
+struct jffs2_sum_xref_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;
+	jint32_t offset;
+} __attribute__((packed));
+
+union jffs2_sum_mem
+{
+	struct jffs2_sum_unknown_mem u;
+	struct jffs2_sum_inode_mem i;
+	struct jffs2_sum_dirent_mem d;
+	struct jffs2_sum_xattr_mem x;
+	struct jffs2_sum_xref_mem r;
+};
+
+/* Summary related information stored in superblock */
+
+struct jffs2_summary
+{
+	uint32_t sum_size;      /* collected summary information for nextblock */
+	uint32_t sum_num;
+	uint32_t sum_padded;
+	union jffs2_sum_mem *sum_list_head;
+	union jffs2_sum_mem *sum_list_tail;
+
+	jint32_t *sum_buf;	/* buffer for writing out summary */
+};
+
+/* Summary marker is stored at the end of every sumarized erase block */
+
+struct jffs2_sum_marker
+{
+	jint32_t offset;	/* offset of the summary node in the jeb */
+	jint32_t magic; 	/* == JFFS2_SUM_MAGIC */
+};
+
+#define JFFS2_SUMMARY_FRAME_SIZE (sizeof(struct jffs2_raw_summary) + sizeof(struct jffs2_sum_marker))
+
+#ifdef CONFIG_JFFS2_SUMMARY	/* SUMMARY SUPPORT ENABLED */
+
+#define jffs2_sum_active() (1)
+int jffs2_sum_init(struct jffs2_sb_info *c);
+void jffs2_sum_exit(struct jffs2_sb_info *c);
+void jffs2_sum_disable_collecting(struct jffs2_summary *s);
+int jffs2_sum_is_disabled(struct jffs2_summary *s);
+void jffs2_sum_reset_collected(struct jffs2_summary *s);
+void jffs2_sum_move_collected(struct jffs2_sb_info *c, struct jffs2_summary *s);
+int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
+			unsigned long count,  uint32_t to);
+int jffs2_sum_write_sumnode(struct jffs2_sb_info *c);
+int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size);
+int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs);
+int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs);
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs);
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs);
+int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			   struct jffs2_raw_summary *summary, uint32_t sumlen,
+			   uint32_t *pseudo_random);
+
+#else				/* SUMMARY DISABLED */
+
+#define jffs2_sum_active() (0)
+#define jffs2_sum_init(a) (0)
+#define jffs2_sum_exit(a)
+#define jffs2_sum_disable_collecting(a)
+#define jffs2_sum_is_disabled(a) (0)
+#define jffs2_sum_reset_collected(a)
+#define jffs2_sum_add_kvec(a,b,c,d) (0)
+#define jffs2_sum_move_collected(a,b)
+#define jffs2_sum_write_sumnode(a) (0)
+#define jffs2_sum_add_padding_mem(a,b)
+#define jffs2_sum_add_inode_mem(a,b,c)
+#define jffs2_sum_add_dirent_mem(a,b,c)
+#define jffs2_sum_add_xattr_mem(a,b,c)
+#define jffs2_sum_add_xref_mem(a,b,c)
+#define jffs2_sum_scan_sumnode(a,b,c,d,e) (0)
+
+#endif /* CONFIG_JFFS2_SUMMARY */
+
+#endif /* JFFS2_SUMMARY_H */
diff --git a/kernel/fs/jffs2/super.c b/kernel/fs/jffs2/super.c
new file mode 100644
index 000000000..d86c5e317
--- /dev/null
+++ b/kernel/fs/jffs2/super.c
@@ -0,0 +1,442 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/jffs2.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/super.h>
+#include <linux/ctype.h>
+#include <linux/namei.h>
+#include <linux/seq_file.h>
+#include <linux/exportfs.h>
+#include "compr.h"
+#include "nodelist.h"
+
+static void jffs2_put_super(struct super_block *);
+
+static struct kmem_cache *jffs2_inode_cachep;
+
+static struct inode *jffs2_alloc_inode(struct super_block *sb)
+{
+	struct jffs2_inode_info *f;
+
+	f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL);
+	if (!f)
+		return NULL;
+	return &f->vfs_inode;
+}
+
+static void jffs2_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
+}
+
+static void jffs2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, jffs2_i_callback);
+}
+
+static void jffs2_i_init_once(void *foo)
+{
+	struct jffs2_inode_info *f = foo;
+
+	mutex_init(&f->sem);
+	inode_init_once(&f->vfs_inode);
+}
+
+static const char *jffs2_compr_name(unsigned int compr)
+{
+	switch (compr) {
+	case JFFS2_COMPR_MODE_NONE:
+		return "none";
+#ifdef CONFIG_JFFS2_LZO
+	case JFFS2_COMPR_MODE_FORCELZO:
+		return "lzo";
+#endif
+#ifdef CONFIG_JFFS2_ZLIB
+	case JFFS2_COMPR_MODE_FORCEZLIB:
+		return "zlib";
+#endif
+	default:
+		/* should never happen; programmer error */
+		WARN_ON(1);
+		return "";
+	}
+}
+
+static int jffs2_show_options(struct seq_file *s, struct dentry *root)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(root->d_sb);
+	struct jffs2_mount_opts *opts = &c->mount_opts;
+
+	if (opts->override_compr)
+		seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr));
+	if (opts->rp_size)
+		seq_printf(s, ",rp_size=%u", opts->rp_size / 1024);
+
+	return 0;
+}
+
+static int jffs2_sync_fs(struct super_block *sb, int wait)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	cancel_delayed_work_sync(&c->wbuf_dwork);
+#endif
+
+	mutex_lock(&c->alloc_sem);
+	jffs2_flush_wbuf_pad(c);
+	mutex_unlock(&c->alloc_sem);
+	return 0;
+}
+
+static struct inode *jffs2_nfs_get_inode(struct super_block *sb, uint64_t ino,
+					 uint32_t generation)
+{
+	/* We don't care about i_generation. We'll destroy the flash
+	   before we start re-using inode numbers anyway. And even
+	   if that wasn't true, we'd have other problems...*/
+	return jffs2_iget(sb, ino);
+}
+
+static struct dentry *jffs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+					 int fh_len, int fh_type)
+{
+        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+                                    jffs2_nfs_get_inode);
+}
+
+static struct dentry *jffs2_fh_to_parent(struct super_block *sb, struct fid *fid,
+					 int fh_len, int fh_type)
+{
+        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+                                    jffs2_nfs_get_inode);
+}
+
+static struct dentry *jffs2_get_parent(struct dentry *child)
+{
+	struct jffs2_inode_info *f;
+	uint32_t pino;
+
+	BUG_ON(!d_is_dir(child));
+
+	f = JFFS2_INODE_INFO(d_inode(child));
+
+	pino = f->inocache->pino_nlink;
+
+	JFFS2_DEBUG("Parent of directory ino #%u is #%u\n",
+		    f->inocache->ino, pino);
+
+	return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino));
+}
+
+static const struct export_operations jffs2_export_ops = {
+	.get_parent = jffs2_get_parent,
+	.fh_to_dentry = jffs2_fh_to_dentry,
+	.fh_to_parent = jffs2_fh_to_parent,
+};
+
+/*
+ * JFFS2 mount options.
+ *
+ * Opt_override_compr: override default compressor
+ * Opt_rp_size: size of reserved pool in KiB
+ * Opt_err: just end of array marker
+ */
+enum {
+	Opt_override_compr,
+	Opt_rp_size,
+	Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_override_compr, "compr=%s"},
+	{Opt_rp_size, "rp_size=%u"},
+	{Opt_err, NULL},
+};
+
+static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *p, *name;
+	unsigned int opt;
+
+	if (!data)
+		return 0;
+
+	while ((p = strsep(&data, ","))) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_override_compr:
+			name = match_strdup(&args[0]);
+
+			if (!name)
+				return -ENOMEM;
+			if (!strcmp(name, "none"))
+				c->mount_opts.compr = JFFS2_COMPR_MODE_NONE;
+#ifdef CONFIG_JFFS2_LZO
+			else if (!strcmp(name, "lzo"))
+				c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO;
+#endif
+#ifdef CONFIG_JFFS2_ZLIB
+			else if (!strcmp(name, "zlib"))
+				c->mount_opts.compr =
+						JFFS2_COMPR_MODE_FORCEZLIB;
+#endif
+			else {
+				pr_err("Error: unknown compressor \"%s\"\n",
+				       name);
+				kfree(name);
+				return -EINVAL;
+			}
+			kfree(name);
+			c->mount_opts.override_compr = true;
+			break;
+		case Opt_rp_size:
+			if (match_int(&args[0], &opt))
+				return -EINVAL;
+			opt *= 1024;
+			if (opt > c->mtd->size) {
+				pr_warn("Too large reserve pool specified, max "
+					"is %llu KB\n", c->mtd->size / 1024);
+				return -EINVAL;
+			}
+			c->mount_opts.rp_size = opt;
+			break;
+		default:
+			pr_err("Error: unrecognized mount option '%s' or missing value\n",
+			       p);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	int err;
+
+	sync_filesystem(sb);
+	err = jffs2_parse_options(c, data);
+	if (err)
+		return -EINVAL;
+
+	return jffs2_do_remount_fs(sb, flags, data);
+}
+
+static const struct super_operations jffs2_super_operations =
+{
+	.alloc_inode =	jffs2_alloc_inode,
+	.destroy_inode =jffs2_destroy_inode,
+	.put_super =	jffs2_put_super,
+	.statfs =	jffs2_statfs,
+	.remount_fs =	jffs2_remount_fs,
+	.evict_inode =	jffs2_evict_inode,
+	.dirty_inode =	jffs2_dirty_inode,
+	.show_options =	jffs2_show_options,
+	.sync_fs =	jffs2_sync_fs,
+};
+
+/*
+ * fill in the superblock
+ */
+static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct jffs2_sb_info *c;
+	int ret;
+
+	jffs2_dbg(1, "jffs2_get_sb_mtd():"
+		  " New superblock for device %d (\"%s\")\n",
+		  sb->s_mtd->index, sb->s_mtd->name);
+
+	c = kzalloc(sizeof(*c), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+
+	c->mtd = sb->s_mtd;
+	c->os_priv = sb;
+	sb->s_fs_info = c;
+
+	ret = jffs2_parse_options(c, data);
+	if (ret) {
+		kfree(c);
+		return -EINVAL;
+	}
+
+	/* Initialize JFFS2 superblock locks, the further initialization will
+	 * be done later */
+	mutex_init(&c->alloc_sem);
+	mutex_init(&c->erase_free_sem);
+	init_waitqueue_head(&c->erase_wait);
+	init_waitqueue_head(&c->inocache_wq);
+	spin_lock_init(&c->erase_completion_lock);
+	spin_lock_init(&c->inocache_lock);
+
+	sb->s_op = &jffs2_super_operations;
+	sb->s_export_op = &jffs2_export_ops;
+	sb->s_flags = sb->s_flags | MS_NOATIME;
+	sb->s_xattr = jffs2_xattr_handlers;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	sb->s_flags |= MS_POSIXACL;
+#endif
+	ret = jffs2_do_fill_super(sb, data, silent);
+	return ret;
+}
+
+static struct dentry *jffs2_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data)
+{
+	return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
+}
+
+static void jffs2_put_super (struct super_block *sb)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+
+	jffs2_dbg(2, "%s()\n", __func__);
+
+	mutex_lock(&c->alloc_sem);
+	jffs2_flush_wbuf_pad(c);
+	mutex_unlock(&c->alloc_sem);
+
+	jffs2_sum_exit(c);
+
+	jffs2_free_ino_caches(c);
+	jffs2_free_raw_node_refs(c);
+	if (jffs2_blocks_use_vmalloc(c))
+		vfree(c->blocks);
+	else
+		kfree(c->blocks);
+	jffs2_flash_cleanup(c);
+	kfree(c->inocache_list);
+	jffs2_clear_xattr_subsystem(c);
+	mtd_sync(c->mtd);
+	jffs2_dbg(1, "%s(): returning\n", __func__);
+}
+
+static void jffs2_kill_sb(struct super_block *sb)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	if (!(sb->s_flags & MS_RDONLY))
+		jffs2_stop_garbage_collect_thread(c);
+	kill_mtd_super(sb);
+	kfree(c);
+}
+
+static struct file_system_type jffs2_fs_type = {
+	.owner =	THIS_MODULE,
+	.name =		"jffs2",
+	.mount =	jffs2_mount,
+	.kill_sb =	jffs2_kill_sb,
+};
+MODULE_ALIAS_FS("jffs2");
+
+static int __init init_jffs2_fs(void)
+{
+	int ret;
+
+	/* Paranoia checks for on-medium structures. If we ask GCC
+	   to pack them with __attribute__((packed)) then it _also_
+	   assumes that they're not aligned -- so it emits crappy
+	   code on some architectures. Ideally we want an attribute
+	   which means just 'no padding', without the alignment
+	   thing. But GCC doesn't have that -- we have to just
+	   hope the structs are the right sizes, instead. */
+	BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
+
+	pr_info("version 2.2."
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	       " (NAND)"
+#endif
+#ifdef CONFIG_JFFS2_SUMMARY
+	       " (SUMMARY) "
+#endif
+	       " © 2001-2006 Red Hat, Inc.\n");
+
+	jffs2_inode_cachep = kmem_cache_create("jffs2_i",
+					     sizeof(struct jffs2_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     jffs2_i_init_once);
+	if (!jffs2_inode_cachep) {
+		pr_err("error: Failed to initialise inode cache\n");
+		return -ENOMEM;
+	}
+	ret = jffs2_compressors_init();
+	if (ret) {
+		pr_err("error: Failed to initialise compressors\n");
+		goto out;
+	}
+	ret = jffs2_create_slab_caches();
+	if (ret) {
+		pr_err("error: Failed to initialise slab caches\n");
+		goto out_compressors;
+	}
+	ret = register_filesystem(&jffs2_fs_type);
+	if (ret) {
+		pr_err("error: Failed to register filesystem\n");
+		goto out_slab;
+	}
+	return 0;
+
+ out_slab:
+	jffs2_destroy_slab_caches();
+ out_compressors:
+	jffs2_compressors_exit();
+ out:
+	kmem_cache_destroy(jffs2_inode_cachep);
+	return ret;
+}
+
+static void __exit exit_jffs2_fs(void)
+{
+	unregister_filesystem(&jffs2_fs_type);
+	jffs2_destroy_slab_caches();
+	jffs2_compressors_exit();
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(jffs2_inode_cachep);
+}
+
+module_init(init_jffs2_fs);
+module_exit(exit_jffs2_fs);
+
+MODULE_DESCRIPTION("The Journalling Flash File System, v2");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL"); // Actually dual-licensed, but it doesn't matter for
+		       // the sake of this tag. It's Free Software.
diff --git a/kernel/fs/jffs2/symlink.c b/kernel/fs/jffs2/symlink.c
new file mode 100644
index 000000000..1fefa25d0
--- /dev/null
+++ b/kernel/fs/jffs2/symlink.c
@@ -0,0 +1,66 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include "nodelist.h"
+
+static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd);
+
+const struct inode_operations jffs2_symlink_inode_operations =
+{
+	.readlink =	generic_readlink,
+	.follow_link =	jffs2_follow_link,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
+};
+
+static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry));
+	char *p = (char *)f->target;
+
+	/*
+	 * We don't acquire the f->sem mutex here since the only data we
+	 * use is f->target.
+	 *
+	 * 1. If we are here the inode has already built and f->target has
+	 * to point to the target path.
+	 * 2. Nobody uses f->target (if the inode is symlink's inode). The
+	 * exception is inode freeing function which frees f->target. But
+	 * it can't be called while we are here and before VFS has
+	 * stopped using our f->target string which we provide by means of
+	 * nd_set_link() call.
+	 */
+
+	if (!p) {
+		pr_err("%s(): can't find symlink target\n", __func__);
+		p = ERR_PTR(-EIO);
+	}
+	jffs2_dbg(1, "%s(): target path is '%s'\n",
+		  __func__, (char *)f->target);
+
+	nd_set_link(nd, p);
+
+	/*
+	 * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe
+	 * since the only way that may cause f->target to be changed is iput() operation.
+	 * But VFS will not use f->target after iput() has been called.
+	 */
+	return NULL;
+}
+
diff --git a/kernel/fs/jffs2/wbuf.c b/kernel/fs/jffs2/wbuf.c
new file mode 100644
index 000000000..09ed55190
--- /dev/null
+++ b/kernel/fs/jffs2/wbuf.c
@@ -0,0 +1,1353 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ * Modified debugged and enhanced by Thomas Gleixner <tglx@linutronix.de>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mtd/mtd.h>
+#include <linux/crc32.h>
+#include <linux/mtd/nand.h>
+#include <linux/jiffies.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+
+#include "nodelist.h"
+
+/* For testing write failures */
+#undef BREAKME
+#undef BREAKMEHEADER
+
+#ifdef BREAKME
+static unsigned char *brokenbuf;
+#endif
+
+#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(c->wbuf_pagesize)) * (unsigned long)(c->wbuf_pagesize) )
+#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(c->wbuf_pagesize) )
+
+/* max. erase failures before we mark a block bad */
+#define MAX_ERASE_FAILURES 	2
+
+struct jffs2_inodirty {
+	uint32_t ino;
+	struct jffs2_inodirty *next;
+};
+
+static struct jffs2_inodirty inodirty_nomem;
+
+static int jffs2_wbuf_pending_for_ino(struct jffs2_sb_info *c, uint32_t ino)
+{
+	struct jffs2_inodirty *this = c->wbuf_inodes;
+
+	/* If a malloc failed, consider _everything_ dirty */
+	if (this == &inodirty_nomem)
+		return 1;
+
+	/* If ino == 0, _any_ non-GC writes mean 'yes' */
+	if (this && !ino)
+		return 1;
+
+	/* Look to see if the inode in question is pending in the wbuf */
+	while (this) {
+		if (this->ino == ino)
+			return 1;
+		this = this->next;
+	}
+	return 0;
+}
+
+static void jffs2_clear_wbuf_ino_list(struct jffs2_sb_info *c)
+{
+	struct jffs2_inodirty *this;
+
+	this = c->wbuf_inodes;
+
+	if (this != &inodirty_nomem) {
+		while (this) {
+			struct jffs2_inodirty *next = this->next;
+			kfree(this);
+			this = next;
+		}
+	}
+	c->wbuf_inodes = NULL;
+}
+
+static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
+{
+	struct jffs2_inodirty *new;
+
+	/* Schedule delayed write-buffer write-out */
+	jffs2_dirty_trigger(c);
+
+	if (jffs2_wbuf_pending_for_ino(c, ino))
+		return;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new) {
+		jffs2_dbg(1, "No memory to allocate inodirty. Fallback to all considered dirty\n");
+		jffs2_clear_wbuf_ino_list(c);
+		c->wbuf_inodes = &inodirty_nomem;
+		return;
+	}
+	new->ino = ino;
+	new->next = c->wbuf_inodes;
+	c->wbuf_inodes = new;
+	return;
+}
+
+static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
+{
+	struct list_head *this, *next;
+	static int n;
+
+	if (list_empty(&c->erasable_pending_wbuf_list))
+		return;
+
+	list_for_each_safe(this, next, &c->erasable_pending_wbuf_list) {
+		struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list);
+
+		jffs2_dbg(1, "Removing eraseblock at 0x%08x from erasable_pending_wbuf_list...\n",
+			  jeb->offset);
+		list_del(this);
+		if ((jiffies + (n++)) & 127) {
+			/* Most of the time, we just erase it immediately. Otherwise we
+			   spend ages scanning it on mount, etc. */
+			jffs2_dbg(1, "...and adding to erase_pending_list\n");
+			list_add_tail(&jeb->list, &c->erase_pending_list);
+			c->nr_erasing_blocks++;
+			jffs2_garbage_collect_trigger(c);
+		} else {
+			/* Sometimes, however, we leave it elsewhere so it doesn't get
+			   immediately reused, and we spread the load a bit. */
+			jffs2_dbg(1, "...and adding to erasable_list\n");
+			list_add_tail(&jeb->list, &c->erasable_list);
+		}
+	}
+}
+
+#define REFILE_NOTEMPTY 0
+#define REFILE_ANYWAY   1
+
+static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int allow_empty)
+{
+	jffs2_dbg(1, "About to refile bad block at %08x\n", jeb->offset);
+
+	/* File the existing block on the bad_used_list.... */
+	if (c->nextblock == jeb)
+		c->nextblock = NULL;
+	else /* Not sure this should ever happen... need more coffee */
+		list_del(&jeb->list);
+	if (jeb->first_node) {
+		jffs2_dbg(1, "Refiling block at %08x to bad_used_list\n",
+			  jeb->offset);
+		list_add(&jeb->list, &c->bad_used_list);
+	} else {
+		BUG_ON(allow_empty == REFILE_NOTEMPTY);
+		/* It has to have had some nodes or we couldn't be here */
+		jffs2_dbg(1, "Refiling block at %08x to erase_pending_list\n",
+			  jeb->offset);
+		list_add(&jeb->list, &c->erase_pending_list);
+		c->nr_erasing_blocks++;
+		jffs2_garbage_collect_trigger(c);
+	}
+
+	if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
+		uint32_t oldfree = jeb->free_size;
+
+		jffs2_link_node_ref(c, jeb, 
+				    (jeb->offset+c->sector_size-oldfree) | REF_OBSOLETE,
+				    oldfree, NULL);
+		/* convert to wasted */
+		c->wasted_size += oldfree;
+		jeb->wasted_size += oldfree;
+		c->dirty_size -= oldfree;
+		jeb->dirty_size -= oldfree;
+	}
+
+	jffs2_dbg_dump_block_lists_nolock(c);
+	jffs2_dbg_acct_sanity_check_nolock(c,jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+}
+
+static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info *c,
+							    struct jffs2_inode_info *f,
+							    struct jffs2_raw_node_ref *raw,
+							    union jffs2_node_union *node)
+{
+	struct jffs2_node_frag *frag;
+	struct jffs2_full_dirent *fd;
+
+	dbg_noderef("incore_replace_raw: node at %p is {%04x,%04x}\n",
+		    node, je16_to_cpu(node->u.magic), je16_to_cpu(node->u.nodetype));
+
+	BUG_ON(je16_to_cpu(node->u.magic) != 0x1985 &&
+	       je16_to_cpu(node->u.magic) != 0);
+
+	switch (je16_to_cpu(node->u.nodetype)) {
+	case JFFS2_NODETYPE_INODE:
+		if (f->metadata && f->metadata->raw == raw) {
+			dbg_noderef("Will replace ->raw in f->metadata at %p\n", f->metadata);
+			return &f->metadata->raw;
+		}
+		frag = jffs2_lookup_node_frag(&f->fragtree, je32_to_cpu(node->i.offset));
+		BUG_ON(!frag);
+		/* Find a frag which refers to the full_dnode we want to modify */
+		while (!frag->node || frag->node->raw != raw) {
+			frag = frag_next(frag);
+			BUG_ON(!frag);
+		}
+		dbg_noderef("Will replace ->raw in full_dnode at %p\n", frag->node);
+		return &frag->node->raw;
+
+	case JFFS2_NODETYPE_DIRENT:
+		for (fd = f->dents; fd; fd = fd->next) {
+			if (fd->raw == raw) {
+				dbg_noderef("Will replace ->raw in full_dirent at %p\n", fd);
+				return &fd->raw;
+			}
+		}
+		BUG();
+
+	default:
+		dbg_noderef("Don't care about replacing raw for nodetype %x\n",
+			    je16_to_cpu(node->u.nodetype));
+		break;
+	}
+	return NULL;
+}
+
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf,
+			      uint32_t ofs)
+{
+	int ret;
+	size_t retlen;
+	char *eccstr;
+
+	ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify);
+	if (ret && ret != -EUCLEAN && ret != -EBADMSG) {
+		pr_warn("%s(): Read back of page at %08x failed: %d\n",
+			__func__, c->wbuf_ofs, ret);
+		return ret;
+	} else if (retlen != c->wbuf_pagesize) {
+		pr_warn("%s(): Read back of page at %08x gave short read: %zd not %d\n",
+			__func__, ofs, retlen, c->wbuf_pagesize);
+		return -EIO;
+	}
+	if (!memcmp(buf, c->wbuf_verify, c->wbuf_pagesize))
+		return 0;
+
+	if (ret == -EUCLEAN)
+		eccstr = "corrected";
+	else if (ret == -EBADMSG)
+		eccstr = "correction failed";
+	else
+		eccstr = "OK or unused";
+
+	pr_warn("Write verify error (ECC %s) at %08x. Wrote:\n",
+		eccstr, c->wbuf_ofs);
+	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1,
+		       c->wbuf, c->wbuf_pagesize, 0);
+
+	pr_warn("Read back:\n");
+	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1,
+		       c->wbuf_verify, c->wbuf_pagesize, 0);
+
+	return -EIO;
+}
+#else
+#define jffs2_verify_write(c,b,o) (0)
+#endif
+
+/* Recover from failure to write wbuf. Recover the nodes up to the
+ * wbuf, not the one which we were starting to try to write. */
+
+static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
+{
+	struct jffs2_eraseblock *jeb, *new_jeb;
+	struct jffs2_raw_node_ref *raw, *next, *first_raw = NULL;
+	size_t retlen;
+	int ret;
+	int nr_refile = 0;
+	unsigned char *buf;
+	uint32_t start, end, ofs, len;
+
+	jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
+
+	spin_lock(&c->erase_completion_lock);
+	if (c->wbuf_ofs % c->mtd->erasesize)
+		jffs2_block_refile(c, jeb, REFILE_NOTEMPTY);
+	else
+		jffs2_block_refile(c, jeb, REFILE_ANYWAY);
+	spin_unlock(&c->erase_completion_lock);
+
+	BUG_ON(!ref_obsolete(jeb->last_node));
+
+	/* Find the first node to be recovered, by skipping over every
+	   node which ends before the wbuf starts, or which is obsolete. */
+	for (next = raw = jeb->first_node; next; raw = next) {
+		next = ref_next(raw);
+
+		if (ref_obsolete(raw) || 
+		    (next && ref_offset(next) <= c->wbuf_ofs)) {
+			dbg_noderef("Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n",
+				    ref_offset(raw), ref_flags(raw),
+				    (ref_offset(raw) + ref_totlen(c, jeb, raw)),
+				    c->wbuf_ofs);
+			continue;
+		}
+		dbg_noderef("First node to be recovered is at 0x%08x(%d)-0x%08x\n",
+			    ref_offset(raw), ref_flags(raw),
+			    (ref_offset(raw) + ref_totlen(c, jeb, raw)));
+
+		first_raw = raw;
+		break;
+	}
+
+	if (!first_raw) {
+		/* All nodes were obsolete. Nothing to recover. */
+		jffs2_dbg(1, "No non-obsolete nodes to be recovered. Just filing block bad\n");
+		c->wbuf_len = 0;
+		return;
+	}
+
+	start = ref_offset(first_raw);
+	end = ref_offset(jeb->last_node);
+	nr_refile = 1;
+
+	/* Count the number of refs which need to be copied */
+	while ((raw = ref_next(raw)) != jeb->last_node)
+		nr_refile++;
+
+	dbg_noderef("wbuf recover %08x-%08x (%d bytes in %d nodes)\n",
+		    start, end, end - start, nr_refile);
+
+	buf = NULL;
+	if (start < c->wbuf_ofs) {
+		/* First affected node was already partially written.
+		 * Attempt to reread the old data into our buffer. */
+
+		buf = kmalloc(end - start, GFP_KERNEL);
+		if (!buf) {
+			pr_crit("Malloc failure in wbuf recovery. Data loss ensues.\n");
+
+			goto read_failed;
+		}
+
+		/* Do the read... */
+		ret = mtd_read(c->mtd, start, c->wbuf_ofs - start, &retlen,
+			       buf);
+
+		/* ECC recovered ? */
+		if ((ret == -EUCLEAN || ret == -EBADMSG) &&
+		    (retlen == c->wbuf_ofs - start))
+			ret = 0;
+
+		if (ret || retlen != c->wbuf_ofs - start) {
+			pr_crit("Old data are already lost in wbuf recovery. Data loss ensues.\n");
+
+			kfree(buf);
+			buf = NULL;
+		read_failed:
+			first_raw = ref_next(first_raw);
+			nr_refile--;
+			while (first_raw && ref_obsolete(first_raw)) {
+				first_raw = ref_next(first_raw);
+				nr_refile--;
+			}
+
+			/* If this was the only node to be recovered, give up */
+			if (!first_raw) {
+				c->wbuf_len = 0;
+				return;
+			}
+
+			/* It wasn't. Go on and try to recover nodes complete in the wbuf */
+			start = ref_offset(first_raw);
+			dbg_noderef("wbuf now recover %08x-%08x (%d bytes in %d nodes)\n",
+				    start, end, end - start, nr_refile);
+
+		} else {
+			/* Read succeeded. Copy the remaining data from the wbuf */
+			memcpy(buf + (c->wbuf_ofs - start), c->wbuf, end - c->wbuf_ofs);
+		}
+	}
+	/* OK... we're to rewrite (end-start) bytes of data from first_raw onwards.
+	   Either 'buf' contains the data, or we find it in the wbuf */
+
+	/* ... and get an allocation of space from a shiny new block instead */
+	ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE);
+	if (ret) {
+		pr_warn("Failed to allocate space for wbuf recovery. Data loss ensues.\n");
+		kfree(buf);
+		return;
+	}
+
+	/* The summary is not recovered, so it must be disabled for this erase block */
+	jffs2_sum_disable_collecting(c->summary);
+
+	ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile);
+	if (ret) {
+		pr_warn("Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
+		kfree(buf);
+		return;
+	}
+
+	ofs = write_ofs(c);
+
+	if (end-start >= c->wbuf_pagesize) {
+		/* Need to do another write immediately, but it's possible
+		   that this is just because the wbuf itself is completely
+		   full, and there's nothing earlier read back from the
+		   flash. Hence 'buf' isn't necessarily what we're writing
+		   from. */
+		unsigned char *rewrite_buf = buf?:c->wbuf;
+		uint32_t towrite = (end-start) - ((end-start)%c->wbuf_pagesize);
+
+		jffs2_dbg(1, "Write 0x%x bytes at 0x%08x in wbuf recover\n",
+			  towrite, ofs);
+
+#ifdef BREAKMEHEADER
+		static int breakme;
+		if (breakme++ == 20) {
+			pr_notice("Faking write error at 0x%08x\n", ofs);
+			breakme = 0;
+			mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf);
+			ret = -EIO;
+		} else
+#endif
+			ret = mtd_write(c->mtd, ofs, towrite, &retlen,
+					rewrite_buf);
+
+		if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) {
+			/* Argh. We tried. Really we did. */
+			pr_crit("Recovery of wbuf failed due to a second write error\n");
+			kfree(buf);
+
+			if (retlen)
+				jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, first_raw), NULL);
+
+			return;
+		}
+		pr_notice("Recovery of wbuf succeeded to %08x\n", ofs);
+
+		c->wbuf_len = (end - start) - towrite;
+		c->wbuf_ofs = ofs + towrite;
+		memmove(c->wbuf, rewrite_buf + towrite, c->wbuf_len);
+		/* Don't muck about with c->wbuf_inodes. False positives are harmless. */
+	} else {
+		/* OK, now we're left with the dregs in whichever buffer we're using */
+		if (buf) {
+			memcpy(c->wbuf, buf, end-start);
+		} else {
+			memmove(c->wbuf, c->wbuf + (start - c->wbuf_ofs), end - start);
+		}
+		c->wbuf_ofs = ofs;
+		c->wbuf_len = end - start;
+	}
+
+	/* Now sort out the jffs2_raw_node_refs, moving them from the old to the next block */
+	new_jeb = &c->blocks[ofs / c->sector_size];
+
+	spin_lock(&c->erase_completion_lock);
+	for (raw = first_raw; raw != jeb->last_node; raw = ref_next(raw)) {
+		uint32_t rawlen = ref_totlen(c, jeb, raw);
+		struct jffs2_inode_cache *ic;
+		struct jffs2_raw_node_ref *new_ref;
+		struct jffs2_raw_node_ref **adjust_ref = NULL;
+		struct jffs2_inode_info *f = NULL;
+
+		jffs2_dbg(1, "Refiling block of %08x at %08x(%d) to %08x\n",
+			  rawlen, ref_offset(raw), ref_flags(raw), ofs);
+
+		ic = jffs2_raw_ref_to_ic(raw);
+
+		/* Ick. This XATTR mess should be fixed shortly... */
+		if (ic && ic->class == RAWNODE_CLASS_XATTR_DATUM) {
+			struct jffs2_xattr_datum *xd = (void *)ic;
+			BUG_ON(xd->node != raw);
+			adjust_ref = &xd->node;
+			raw->next_in_ino = NULL;
+			ic = NULL;
+		} else if (ic && ic->class == RAWNODE_CLASS_XATTR_REF) {
+			struct jffs2_xattr_datum *xr = (void *)ic;
+			BUG_ON(xr->node != raw);
+			adjust_ref = &xr->node;
+			raw->next_in_ino = NULL;
+			ic = NULL;
+		} else if (ic && ic->class == RAWNODE_CLASS_INODE_CACHE) {
+			struct jffs2_raw_node_ref **p = &ic->nodes;
+
+			/* Remove the old node from the per-inode list */
+			while (*p && *p != (void *)ic) {
+				if (*p == raw) {
+					(*p) = (raw->next_in_ino);
+					raw->next_in_ino = NULL;
+					break;
+				}
+				p = &((*p)->next_in_ino);
+			}
+
+			if (ic->state == INO_STATE_PRESENT && !ref_obsolete(raw)) {
+				/* If it's an in-core inode, then we have to adjust any
+				   full_dirent or full_dnode structure to point to the
+				   new version instead of the old */
+				f = jffs2_gc_fetch_inode(c, ic->ino, !ic->pino_nlink);
+				if (IS_ERR(f)) {
+					/* Should never happen; it _must_ be present */
+					JFFS2_ERROR("Failed to iget() ino #%u, err %ld\n",
+						    ic->ino, PTR_ERR(f));
+					BUG();
+				}
+				/* We don't lock f->sem. There's a number of ways we could
+				   end up in here with it already being locked, and nobody's
+				   going to modify it on us anyway because we hold the
+				   alloc_sem. We're only changing one ->raw pointer too,
+				   which we can get away with without upsetting readers. */
+				adjust_ref = jffs2_incore_replace_raw(c, f, raw,
+								      (void *)(buf?:c->wbuf) + (ref_offset(raw) - start));
+			} else if (unlikely(ic->state != INO_STATE_PRESENT &&
+					    ic->state != INO_STATE_CHECKEDABSENT &&
+					    ic->state != INO_STATE_GC)) {
+				JFFS2_ERROR("Inode #%u is in strange state %d!\n", ic->ino, ic->state);
+				BUG();
+			}
+		}
+
+		new_ref = jffs2_link_node_ref(c, new_jeb, ofs | ref_flags(raw), rawlen, ic);
+
+		if (adjust_ref) {
+			BUG_ON(*adjust_ref != raw);
+			*adjust_ref = new_ref;
+		}
+		if (f)
+			jffs2_gc_release_inode(c, f);
+
+		if (!ref_obsolete(raw)) {
+			jeb->dirty_size += rawlen;
+			jeb->used_size  -= rawlen;
+			c->dirty_size += rawlen;
+			c->used_size -= rawlen;
+			raw->flash_offset = ref_offset(raw) | REF_OBSOLETE;
+			BUG_ON(raw->next_in_ino);
+		}
+		ofs += rawlen;
+	}
+
+	kfree(buf);
+
+	/* Fix up the original jeb now it's on the bad_list */
+	if (first_raw == jeb->first_node) {
+		jffs2_dbg(1, "Failing block at %08x is now empty. Moving to erase_pending_list\n",
+			  jeb->offset);
+		list_move(&jeb->list, &c->erase_pending_list);
+		c->nr_erasing_blocks++;
+		jffs2_garbage_collect_trigger(c);
+	}
+
+	jffs2_dbg_acct_sanity_check_nolock(c, jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+
+	jffs2_dbg_acct_sanity_check_nolock(c, new_jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb);
+
+	spin_unlock(&c->erase_completion_lock);
+
+	jffs2_dbg(1, "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n",
+		  c->wbuf_ofs, c->wbuf_len);
+
+}
+
+/* Meaning of pad argument:
+   0: Do not pad. Probably pointless - we only ever use this when we can't pad anyway.
+   1: Pad, do not adjust nextblock free_size
+   2: Pad, adjust nextblock free_size
+*/
+#define NOPAD		0
+#define PAD_NOACCOUNT	1
+#define PAD_ACCOUNTING	2
+
+static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
+{
+	struct jffs2_eraseblock *wbuf_jeb;
+	int ret;
+	size_t retlen;
+
+	/* Nothing to do if not write-buffering the flash. In particular, we shouldn't
+	   del_timer() the timer we never initialised. */
+	if (!jffs2_is_writebuffered(c))
+		return 0;
+
+	if (!mutex_is_locked(&c->alloc_sem)) {
+		pr_crit("jffs2_flush_wbuf() called with alloc_sem not locked!\n");
+		BUG();
+	}
+
+	if (!c->wbuf_len)	/* already checked c->wbuf above */
+		return 0;
+
+	wbuf_jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
+	if (jffs2_prealloc_raw_node_refs(c, wbuf_jeb, c->nextblock->allocated_refs + 1))
+		return -ENOMEM;
+
+	/* claim remaining space on the page
+	   this happens, if we have a change to a new block,
+	   or if fsync forces us to flush the writebuffer.
+	   if we have a switch to next page, we will not have
+	   enough remaining space for this.
+	*/
+	if (pad ) {
+		c->wbuf_len = PAD(c->wbuf_len);
+
+		/* Pad with JFFS2_DIRTY_BITMASK initially.  this helps out ECC'd NOR
+		   with 8 byte page size */
+		memset(c->wbuf + c->wbuf_len, 0, c->wbuf_pagesize - c->wbuf_len);
+
+		if ( c->wbuf_len + sizeof(struct jffs2_unknown_node) < c->wbuf_pagesize) {
+			struct jffs2_unknown_node *padnode = (void *)(c->wbuf + c->wbuf_len);
+			padnode->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+			padnode->nodetype = cpu_to_je16(JFFS2_NODETYPE_PADDING);
+			padnode->totlen = cpu_to_je32(c->wbuf_pagesize - c->wbuf_len);
+			padnode->hdr_crc = cpu_to_je32(crc32(0, padnode, sizeof(*padnode)-4));
+		}
+	}
+	/* else jffs2_flash_writev has actually filled in the rest of the
+	   buffer for us, and will deal with the node refs etc. later. */
+
+#ifdef BREAKME
+	static int breakme;
+	if (breakme++ == 20) {
+		pr_notice("Faking write error at 0x%08x\n", c->wbuf_ofs);
+		breakme = 0;
+		mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
+			  brokenbuf);
+		ret = -EIO;
+	} else
+#endif
+
+		ret = mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize,
+				&retlen, c->wbuf);
+
+	if (ret) {
+		pr_warn("jffs2_flush_wbuf(): Write failed with %d\n", ret);
+		goto wfail;
+	} else if (retlen != c->wbuf_pagesize) {
+		pr_warn("jffs2_flush_wbuf(): Write was short: %zd instead of %d\n",
+			retlen, c->wbuf_pagesize);
+		ret = -EIO;
+		goto wfail;
+	} else if ((ret = jffs2_verify_write(c, c->wbuf, c->wbuf_ofs))) {
+	wfail:
+		jffs2_wbuf_recover(c);
+
+		return ret;
+	}
+
+	/* Adjust free size of the block if we padded. */
+	if (pad) {
+		uint32_t waste = c->wbuf_pagesize - c->wbuf_len;
+
+		jffs2_dbg(1, "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n",
+			  (wbuf_jeb == c->nextblock) ? "next" : "",
+			  wbuf_jeb->offset);
+
+		/* wbuf_pagesize - wbuf_len is the amount of space that's to be
+		   padded. If there is less free space in the block than that,
+		   something screwed up */
+		if (wbuf_jeb->free_size < waste) {
+			pr_crit("jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n",
+				c->wbuf_ofs, c->wbuf_len, waste);
+			pr_crit("jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n",
+				wbuf_jeb->offset, wbuf_jeb->free_size);
+			BUG();
+		}
+
+		spin_lock(&c->erase_completion_lock);
+
+		jffs2_link_node_ref(c, wbuf_jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL);
+		/* FIXME: that made it count as dirty. Convert to wasted */
+		wbuf_jeb->dirty_size -= waste;
+		c->dirty_size -= waste;
+		wbuf_jeb->wasted_size += waste;
+		c->wasted_size += waste;
+	} else
+		spin_lock(&c->erase_completion_lock);
+
+	/* Stick any now-obsoleted blocks on the erase_pending_list */
+	jffs2_refile_wbuf_blocks(c);
+	jffs2_clear_wbuf_ino_list(c);
+	spin_unlock(&c->erase_completion_lock);
+
+	memset(c->wbuf,0xff,c->wbuf_pagesize);
+	/* adjust write buffer offset, else we get a non contiguous write bug */
+	c->wbuf_ofs += c->wbuf_pagesize;
+	c->wbuf_len = 0;
+	return 0;
+}
+
+/* Trigger garbage collection to flush the write-buffer.
+   If ino arg is zero, do it if _any_ real (i.e. not GC) writes are
+   outstanding. If ino arg non-zero, do it only if a write for the
+   given inode is outstanding. */
+int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino)
+{
+	uint32_t old_wbuf_ofs;
+	uint32_t old_wbuf_len;
+	int ret = 0;
+
+	jffs2_dbg(1, "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino);
+
+	if (!c->wbuf)
+		return 0;
+
+	mutex_lock(&c->alloc_sem);
+	if (!jffs2_wbuf_pending_for_ino(c, ino)) {
+		jffs2_dbg(1, "Ino #%d not pending in wbuf. Returning\n", ino);
+		mutex_unlock(&c->alloc_sem);
+		return 0;
+	}
+
+	old_wbuf_ofs = c->wbuf_ofs;
+	old_wbuf_len = c->wbuf_len;
+
+	if (c->unchecked_size) {
+		/* GC won't make any progress for a while */
+		jffs2_dbg(1, "%s(): padding. Not finished checking\n",
+			  __func__);
+		down_write(&c->wbuf_sem);
+		ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING);
+		/* retry flushing wbuf in case jffs2_wbuf_recover
+		   left some data in the wbuf */
+		if (ret)
+			ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING);
+		up_write(&c->wbuf_sem);
+	} else while (old_wbuf_len &&
+		      old_wbuf_ofs == c->wbuf_ofs) {
+
+		mutex_unlock(&c->alloc_sem);
+
+		jffs2_dbg(1, "%s(): calls gc pass\n", __func__);
+
+		ret = jffs2_garbage_collect_pass(c);
+		if (ret) {
+			/* GC failed. Flush it with padding instead */
+			mutex_lock(&c->alloc_sem);
+			down_write(&c->wbuf_sem);
+			ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING);
+			/* retry flushing wbuf in case jffs2_wbuf_recover
+			   left some data in the wbuf */
+			if (ret)
+				ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING);
+			up_write(&c->wbuf_sem);
+			break;
+		}
+		mutex_lock(&c->alloc_sem);
+	}
+
+	jffs2_dbg(1, "%s(): ends...\n", __func__);
+
+	mutex_unlock(&c->alloc_sem);
+	return ret;
+}
+
+/* Pad write-buffer to end and write it, wasting space. */
+int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c)
+{
+	int ret;
+
+	if (!c->wbuf)
+		return 0;
+
+	down_write(&c->wbuf_sem);
+	ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT);
+	/* retry - maybe wbuf recover left some data in wbuf. */
+	if (ret)
+		ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT);
+	up_write(&c->wbuf_sem);
+
+	return ret;
+}
+
+static size_t jffs2_fill_wbuf(struct jffs2_sb_info *c, const uint8_t *buf,
+			      size_t len)
+{
+	if (len && !c->wbuf_len && (len >= c->wbuf_pagesize))
+		return 0;
+
+	if (len > (c->wbuf_pagesize - c->wbuf_len))
+		len = c->wbuf_pagesize - c->wbuf_len;
+	memcpy(c->wbuf + c->wbuf_len, buf, len);
+	c->wbuf_len += (uint32_t) len;
+	return len;
+}
+
+int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
+		       unsigned long count, loff_t to, size_t *retlen,
+		       uint32_t ino)
+{
+	struct jffs2_eraseblock *jeb;
+	size_t wbuf_retlen, donelen = 0;
+	uint32_t outvec_to = to;
+	int ret, invec;
+
+	/* If not writebuffered flash, don't bother */
+	if (!jffs2_is_writebuffered(c))
+		return jffs2_flash_direct_writev(c, invecs, count, to, retlen);
+
+	down_write(&c->wbuf_sem);
+
+	/* If wbuf_ofs is not initialized, set it to target address */
+	if (c->wbuf_ofs == 0xFFFFFFFF) {
+		c->wbuf_ofs = PAGE_DIV(to);
+		c->wbuf_len = PAGE_MOD(to);
+		memset(c->wbuf,0xff,c->wbuf_pagesize);
+	}
+
+	/*
+	 * Sanity checks on target address.  It's permitted to write
+	 * at PAD(c->wbuf_len+c->wbuf_ofs), and it's permitted to
+	 * write at the beginning of a new erase block. Anything else,
+	 * and you die.  New block starts at xxx000c (0-b = block
+	 * header)
+	 */
+	if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) {
+		/* It's a write to a new block */
+		if (c->wbuf_len) {
+			jffs2_dbg(1, "%s(): to 0x%lx causes flush of wbuf at 0x%08x\n",
+				  __func__, (unsigned long)to, c->wbuf_ofs);
+			ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT);
+			if (ret)
+				goto outerr;
+		}
+		/* set pointer to new block */
+		c->wbuf_ofs = PAGE_DIV(to);
+		c->wbuf_len = PAGE_MOD(to);
+	}
+
+	if (to != PAD(c->wbuf_ofs + c->wbuf_len)) {
+		/* We're not writing immediately after the writebuffer. Bad. */
+		pr_crit("%s(): Non-contiguous write to %08lx\n",
+			__func__, (unsigned long)to);
+		if (c->wbuf_len)
+			pr_crit("wbuf was previously %08x-%08x\n",
+				c->wbuf_ofs, c->wbuf_ofs + c->wbuf_len);
+		BUG();
+	}
+
+	/* adjust alignment offset */
+	if (c->wbuf_len != PAGE_MOD(to)) {
+		c->wbuf_len = PAGE_MOD(to);
+		/* take care of alignment to next page */
+		if (!c->wbuf_len) {
+			c->wbuf_len = c->wbuf_pagesize;
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
+		}
+	}
+
+	for (invec = 0; invec < count; invec++) {
+		int vlen = invecs[invec].iov_len;
+		uint8_t *v = invecs[invec].iov_base;
+
+		wbuf_retlen = jffs2_fill_wbuf(c, v, vlen);
+
+		if (c->wbuf_len == c->wbuf_pagesize) {
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
+		}
+		vlen -= wbuf_retlen;
+		outvec_to += wbuf_retlen;
+		donelen += wbuf_retlen;
+		v += wbuf_retlen;
+
+		if (vlen >= c->wbuf_pagesize) {
+			ret = mtd_write(c->mtd, outvec_to, PAGE_DIV(vlen),
+					&wbuf_retlen, v);
+			if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen))
+				goto outfile;
+
+			vlen -= wbuf_retlen;
+			outvec_to += wbuf_retlen;
+			c->wbuf_ofs = outvec_to;
+			donelen += wbuf_retlen;
+			v += wbuf_retlen;
+		}
+
+		wbuf_retlen = jffs2_fill_wbuf(c, v, vlen);
+		if (c->wbuf_len == c->wbuf_pagesize) {
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
+		}
+
+		outvec_to += wbuf_retlen;
+		donelen += wbuf_retlen;
+	}
+
+	/*
+	 * If there's a remainder in the wbuf and it's a non-GC write,
+	 * remember that the wbuf affects this ino
+	 */
+	*retlen = donelen;
+
+	if (jffs2_sum_active()) {
+		int res = jffs2_sum_add_kvec(c, invecs, count, (uint32_t) to);
+		if (res)
+			return res;
+	}
+
+	if (c->wbuf_len && ino)
+		jffs2_wbuf_dirties_inode(c, ino);
+
+	ret = 0;
+	up_write(&c->wbuf_sem);
+	return ret;
+
+outfile:
+	/*
+	 * At this point we have no problem, c->wbuf is empty. However
+	 * refile nextblock to avoid writing again to same address.
+	 */
+
+	spin_lock(&c->erase_completion_lock);
+
+	jeb = &c->blocks[outvec_to / c->sector_size];
+	jffs2_block_refile(c, jeb, REFILE_ANYWAY);
+
+	spin_unlock(&c->erase_completion_lock);
+
+outerr:
+	*retlen = 0;
+	up_write(&c->wbuf_sem);
+	return ret;
+}
+
+/*
+ *	This is the entry for flash write.
+ *	Check, if we work on NAND FLASH, if so build an kvec and write it via vritev
+*/
+int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
+		      size_t *retlen, const u_char *buf)
+{
+	struct kvec vecs[1];
+
+	if (!jffs2_is_writebuffered(c))
+		return jffs2_flash_direct_write(c, ofs, len, retlen, buf);
+
+	vecs[0].iov_base = (unsigned char *) buf;
+	vecs[0].iov_len = len;
+	return jffs2_flash_writev(c, vecs, 1, ofs, retlen, 0);
+}
+
+/*
+	Handle readback from writebuffer and ECC failure return
+*/
+int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, u_char *buf)
+{
+	loff_t	orbf = 0, owbf = 0, lwbf = 0;
+	int	ret;
+
+	if (!jffs2_is_writebuffered(c))
+		return mtd_read(c->mtd, ofs, len, retlen, buf);
+
+	/* Read flash */
+	down_read(&c->wbuf_sem);
+	ret = mtd_read(c->mtd, ofs, len, retlen, buf);
+
+	if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) {
+		if (ret == -EBADMSG)
+			pr_warn("mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n",
+				len, ofs);
+		/*
+		 * We have the raw data without ECC correction in the buffer,
+		 * maybe we are lucky and all data or parts are correct. We
+		 * check the node.  If data are corrupted node check will sort
+		 * it out.  We keep this block, it will fail on write or erase
+		 * and the we mark it bad. Or should we do that now? But we
+		 * should give him a chance.  Maybe we had a system crash or
+		 * power loss before the ecc write or a erase was completed.
+		 * So we return success. :)
+		 */
+		ret = 0;
+	}
+
+	/* if no writebuffer available or write buffer empty, return */
+	if (!c->wbuf_pagesize || !c->wbuf_len)
+		goto exit;
+
+	/* if we read in a different block, return */
+	if (SECTOR_ADDR(ofs) != SECTOR_ADDR(c->wbuf_ofs))
+		goto exit;
+
+	if (ofs >= c->wbuf_ofs) {
+		owbf = (ofs - c->wbuf_ofs);	/* offset in write buffer */
+		if (owbf > c->wbuf_len)		/* is read beyond write buffer ? */
+			goto exit;
+		lwbf = c->wbuf_len - owbf;	/* number of bytes to copy */
+		if (lwbf > len)
+			lwbf = len;
+	} else {
+		orbf = (c->wbuf_ofs - ofs);	/* offset in read buffer */
+		if (orbf > len)			/* is write beyond write buffer ? */
+			goto exit;
+		lwbf = len - orbf;		/* number of bytes to copy */
+		if (lwbf > c->wbuf_len)
+			lwbf = c->wbuf_len;
+	}
+	if (lwbf > 0)
+		memcpy(buf+orbf,c->wbuf+owbf,lwbf);
+
+exit:
+	up_read(&c->wbuf_sem);
+	return ret;
+}
+
+#define NR_OOB_SCAN_PAGES 4
+
+/* For historical reasons we use only 8 bytes for OOB clean marker */
+#define OOB_CM_SIZE 8
+
+static const struct jffs2_unknown_node oob_cleanmarker =
+{
+	.magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK),
+	.nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER),
+	.totlen = constant_cpu_to_je32(8)
+};
+
+/*
+ * Check, if the out of band area is empty. This function knows about the clean
+ * marker and if it is present in OOB, treats the OOB as empty anyway.
+ */
+int jffs2_check_oob_empty(struct jffs2_sb_info *c,
+			  struct jffs2_eraseblock *jeb, int mode)
+{
+	int i, ret;
+	int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
+	struct mtd_oob_ops ops;
+
+	ops.mode = MTD_OPS_AUTO_OOB;
+	ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
+	ops.oobbuf = c->oobbuf;
+	ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
+	ops.datbuf = NULL;
+
+	ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
+	if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
+		pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
+		       jeb->offset, ops.ooblen, ops.oobretlen, ret);
+		if (!ret || mtd_is_bitflip(ret))
+			ret = -EIO;
+		return ret;
+	}
+
+	for(i = 0; i < ops.ooblen; i++) {
+		if (mode && i < cmlen)
+			/* Yeah, we know about the cleanmarker */
+			continue;
+
+		if (ops.oobbuf[i] != 0xFF) {
+			jffs2_dbg(2, "Found %02x at %x in OOB for "
+				  "%08x\n", ops.oobbuf[i], i, jeb->offset);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Check for a valid cleanmarker.
+ * Returns: 0 if a valid cleanmarker was found
+ *	    1 if no cleanmarker was found
+ *	    negative error code if an error occurred
+ */
+int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb)
+{
+	struct mtd_oob_ops ops;
+	int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
+
+	ops.mode = MTD_OPS_AUTO_OOB;
+	ops.ooblen = cmlen;
+	ops.oobbuf = c->oobbuf;
+	ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
+	ops.datbuf = NULL;
+
+	ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
+	if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
+		pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
+		       jeb->offset, ops.ooblen, ops.oobretlen, ret);
+		if (!ret || mtd_is_bitflip(ret))
+			ret = -EIO;
+		return ret;
+	}
+
+	return !!memcmp(&oob_cleanmarker, c->oobbuf, cmlen);
+}
+
+int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb)
+{
+	int ret;
+	struct mtd_oob_ops ops;
+	int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
+
+	ops.mode = MTD_OPS_AUTO_OOB;
+	ops.ooblen = cmlen;
+	ops.oobbuf = (uint8_t *)&oob_cleanmarker;
+	ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
+	ops.datbuf = NULL;
+
+	ret = mtd_write_oob(c->mtd, jeb->offset, &ops);
+	if (ret || ops.oobretlen != ops.ooblen) {
+		pr_err("cannot write OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
+		       jeb->offset, ops.ooblen, ops.oobretlen, ret);
+		if (!ret)
+			ret = -EIO;
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * On NAND we try to mark this block bad. If the block was erased more
+ * than MAX_ERASE_FAILURES we mark it finally bad.
+ * Don't care about failures. This block remains on the erase-pending
+ * or badblock list as long as nobody manipulates the flash with
+ * a bootloader or something like that.
+ */
+
+int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
+{
+	int 	ret;
+
+	/* if the count is < max, we try to write the counter to the 2nd page oob area */
+	if( ++jeb->bad_count < MAX_ERASE_FAILURES)
+		return 0;
+
+	pr_warn("marking eraseblock at %08x as bad\n", bad_offset);
+	ret = mtd_block_markbad(c->mtd, bad_offset);
+
+	if (ret) {
+		jffs2_dbg(1, "%s(): Write failed for block at %08x: error %d\n",
+			  __func__, jeb->offset, ret);
+		return ret;
+	}
+	return 1;
+}
+
+static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
+{
+	struct delayed_work *dwork;
+
+	dwork = container_of(work, struct delayed_work, work);
+	return container_of(dwork, struct jffs2_sb_info, wbuf_dwork);
+}
+
+static void delayed_wbuf_sync(struct work_struct *work)
+{
+	struct jffs2_sb_info *c = work_to_sb(work);
+	struct super_block *sb = OFNI_BS_2SFFJ(c);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		jffs2_dbg(1, "%s()\n", __func__);
+		jffs2_flush_wbuf_gc(c, 0);
+	}
+}
+
+void jffs2_dirty_trigger(struct jffs2_sb_info *c)
+{
+	struct super_block *sb = OFNI_BS_2SFFJ(c);
+	unsigned long delay;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+	if (queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay))
+		jffs2_dbg(1, "%s()\n", __func__);
+}
+
+int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
+{
+	struct nand_ecclayout *oinfo = c->mtd->ecclayout;
+
+	if (!c->mtd->oobsize)
+		return 0;
+
+	/* Cleanmarker is out-of-band, so inline size zero */
+	c->cleanmarker_size = 0;
+
+	if (!oinfo || oinfo->oobavail == 0) {
+		pr_err("inconsistent device description\n");
+		return -EINVAL;
+	}
+
+	jffs2_dbg(1, "using OOB on NAND\n");
+
+	c->oobavail = oinfo->oobavail;
+
+	/* Initialise write buffer */
+	init_rwsem(&c->wbuf_sem);
+	INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
+	c->wbuf_pagesize = c->mtd->writesize;
+	c->wbuf_ofs = 0xFFFFFFFF;
+
+	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf)
+		return -ENOMEM;
+
+	c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->oobavail, GFP_KERNEL);
+	if (!c->oobbuf) {
+		kfree(c->wbuf);
+		return -ENOMEM;
+	}
+
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf_verify) {
+		kfree(c->oobbuf);
+		kfree(c->wbuf);
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c)
+{
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	kfree(c->wbuf_verify);
+#endif
+	kfree(c->wbuf);
+	kfree(c->oobbuf);
+}
+
+int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
+	c->cleanmarker_size = 0;		/* No cleanmarkers needed */
+
+	/* Initialize write buffer */
+	init_rwsem(&c->wbuf_sem);
+	INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
+	c->wbuf_pagesize =  c->mtd->erasesize;
+
+	/* Find a suitable c->sector_size
+	 * - Not too much sectors
+	 * - Sectors have to be at least 4 K + some bytes
+	 * - All known dataflashes have erase sizes of 528 or 1056
+	 * - we take at least 8 eraseblocks and want to have at least 8K size
+	 * - The concatenation should be a power of 2
+	*/
+
+	c->sector_size = 8 * c->mtd->erasesize;
+
+	while (c->sector_size < 8192) {
+		c->sector_size *= 2;
+	}
+
+	/* It may be necessary to adjust the flash size */
+	c->flash_size = c->mtd->size;
+
+	if ((c->flash_size % c->sector_size) != 0) {
+		c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
+		pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
+	};
+
+	c->wbuf_ofs = 0xFFFFFFFF;
+	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf)
+		return -ENOMEM;
+
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf_verify) {
+		kfree(c->oobbuf);
+		kfree(c->wbuf);
+		return -ENOMEM;
+	}
+#endif
+
+	pr_info("write-buffering enabled buffer (%d) erasesize (%d)\n",
+		c->wbuf_pagesize, c->sector_size);
+
+	return 0;
+}
+
+void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) {
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	kfree(c->wbuf_verify);
+#endif
+	kfree(c->wbuf);
+}
+
+int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
+	/* Cleanmarker currently occupies whole programming regions,
+	 * either one or 2 for 8Byte STMicro flashes. */
+	c->cleanmarker_size = max(16u, c->mtd->writesize);
+
+	/* Initialize write buffer */
+	init_rwsem(&c->wbuf_sem);
+	INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
+
+	c->wbuf_pagesize = c->mtd->writesize;
+	c->wbuf_ofs = 0xFFFFFFFF;
+
+	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf)
+		return -ENOMEM;
+
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf_verify) {
+		kfree(c->wbuf);
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	kfree(c->wbuf_verify);
+#endif
+	kfree(c->wbuf);
+}
+
+int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
+	c->cleanmarker_size = 0;
+
+	if (c->mtd->writesize == 1)
+		/* We do not need write-buffer */
+		return 0;
+
+	init_rwsem(&c->wbuf_sem);
+	INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
+
+	c->wbuf_pagesize =  c->mtd->writesize;
+	c->wbuf_ofs = 0xFFFFFFFF;
+	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf)
+		return -ENOMEM;
+
+	pr_info("write-buffering enabled buffer (%d) erasesize (%d)\n",
+		c->wbuf_pagesize, c->sector_size);
+
+	return 0;
+}
+
+void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) {
+	kfree(c->wbuf);
+}
diff --git a/kernel/fs/jffs2/write.c b/kernel/fs/jffs2/write.c
new file mode 100644
index 000000000..b634de4c8
--- /dev/null
+++ b/kernel/fs/jffs2/write.c
@@ -0,0 +1,722 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/crc32.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+#include "compr.h"
+
+
+int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+		       uint32_t mode, struct jffs2_raw_inode *ri)
+{
+	struct jffs2_inode_cache *ic;
+
+	ic = jffs2_alloc_inode_cache();
+	if (!ic) {
+		return -ENOMEM;
+	}
+
+	memset(ic, 0, sizeof(*ic));
+
+	f->inocache = ic;
+	f->inocache->pino_nlink = 1; /* Will be overwritten shortly for directories */
+	f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
+	f->inocache->state = INO_STATE_PRESENT;
+
+	jffs2_add_ino_cache(c, f->inocache);
+	jffs2_dbg(1, "%s(): Assigned ino# %d\n", __func__, f->inocache->ino);
+	ri->ino = cpu_to_je32(f->inocache->ino);
+
+	ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE);
+	ri->totlen = cpu_to_je32(PAD(sizeof(*ri)));
+	ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4));
+	ri->mode = cpu_to_jemode(mode);
+
+	f->highest_version = 1;
+	ri->version = cpu_to_je32(f->highest_version);
+
+	return 0;
+}
+
+/* jffs2_write_dnode - given a raw_inode, allocate a full_dnode for it,
+   write it to the flash, link it into the existing inode/fragment list */
+
+struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					   struct jffs2_raw_inode *ri, const unsigned char *data,
+					   uint32_t datalen, int alloc_mode)
+
+{
+	struct jffs2_full_dnode *fn;
+	size_t retlen;
+	uint32_t flash_ofs;
+	struct kvec vecs[2];
+	int ret;
+	int retried = 0;
+	unsigned long cnt = 2;
+
+	D1(if(je32_to_cpu(ri->hdr_crc) != crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)) {
+		pr_crit("Eep. CRC not correct in jffs2_write_dnode()\n");
+		BUG();
+	}
+	   );
+	vecs[0].iov_base = ri;
+	vecs[0].iov_len = sizeof(*ri);
+	vecs[1].iov_base = (unsigned char *)data;
+	vecs[1].iov_len = datalen;
+
+	if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) {
+		pr_warn("%s(): ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n",
+			__func__, je32_to_cpu(ri->totlen),
+			sizeof(*ri), datalen);
+	}
+
+	fn = jffs2_alloc_full_dnode();
+	if (!fn)
+		return ERR_PTR(-ENOMEM);
+
+	/* check number of valid vecs */
+	if (!datalen || !data)
+		cnt = 1;
+ retry:
+	flash_ofs = write_ofs(c);
+
+	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
+
+	if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) {
+		BUG_ON(!retried);
+		jffs2_dbg(1, "%s(): dnode_version %d, highest version %d -> updating dnode\n",
+			  __func__,
+			  je32_to_cpu(ri->version), f->highest_version);
+		ri->version = cpu_to_je32(++f->highest_version);
+		ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
+	}
+
+	ret = jffs2_flash_writev(c, vecs, cnt, flash_ofs, &retlen,
+				 (alloc_mode==ALLOC_GC)?0:f->inocache->ino);
+
+	if (ret || (retlen != sizeof(*ri) + datalen)) {
+		pr_notice("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
+			  sizeof(*ri) + datalen, flash_ofs, ret, retlen);
+
+		/* Mark the space as dirtied */
+		if (retlen) {
+			/* Don't change raw->size to match retlen. We may have
+			   written the node header already, and only the data will
+			   seem corrupted, in which case the scan would skip over
+			   any node we write before the original intended end of
+			   this node */
+			jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL);
+		} else {
+			pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n",
+				  flash_ofs);
+		}
+		if (!retried && alloc_mode != ALLOC_NORETRY) {
+			/* Try to reallocate space and retry */
+			uint32_t dummy;
+			struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size];
+
+			retried = 1;
+
+			jffs2_dbg(1, "Retrying failed write.\n");
+
+			jffs2_dbg_acct_sanity_check(c,jeb);
+			jffs2_dbg_acct_paranoia_check(c, jeb);
+
+			if (alloc_mode == ALLOC_GC) {
+				ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &dummy,
+							     JFFS2_SUMMARY_INODE_SIZE);
+			} else {
+				/* Locking pain */
+				mutex_unlock(&f->sem);
+				jffs2_complete_reservation(c);
+
+				ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &dummy,
+							  alloc_mode, JFFS2_SUMMARY_INODE_SIZE);
+				mutex_lock(&f->sem);
+			}
+
+			if (!ret) {
+				flash_ofs = write_ofs(c);
+				jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write.\n",
+					  flash_ofs);
+
+				jffs2_dbg_acct_sanity_check(c,jeb);
+				jffs2_dbg_acct_paranoia_check(c, jeb);
+
+				goto retry;
+			}
+			jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n",
+				  ret);
+		}
+		/* Release the full_dnode which is now useless, and return */
+		jffs2_free_full_dnode(fn);
+		return ERR_PTR(ret?ret:-EIO);
+	}
+	/* Mark the space used */
+	/* If node covers at least a whole page, or if it starts at the
+	   beginning of a page and runs to the end of the file, or if
+	   it's a hole node, mark it REF_PRISTINE, else REF_NORMAL.
+	*/
+	if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) ||
+	    ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) &&
+	      (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) ==  je32_to_cpu(ri->isize)))) {
+		flash_ofs |= REF_PRISTINE;
+	} else {
+		flash_ofs |= REF_NORMAL;
+	}
+	fn->raw = jffs2_add_physical_node_ref(c, flash_ofs, PAD(sizeof(*ri)+datalen), f->inocache);
+	if (IS_ERR(fn->raw)) {
+		void *hold_err = fn->raw;
+		/* Release the full_dnode which is now useless, and return */
+		jffs2_free_full_dnode(fn);
+		return ERR_CAST(hold_err);
+	}
+	fn->ofs = je32_to_cpu(ri->offset);
+	fn->size = je32_to_cpu(ri->dsize);
+	fn->frags = 0;
+
+	jffs2_dbg(1, "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n",
+		  flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize),
+		  je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc),
+		  je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen));
+
+	if (retried) {
+		jffs2_dbg_acct_sanity_check(c,NULL);
+	}
+
+	return fn;
+}
+
+struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					     struct jffs2_raw_dirent *rd, const unsigned char *name,
+					     uint32_t namelen, int alloc_mode)
+{
+	struct jffs2_full_dirent *fd;
+	size_t retlen;
+	struct kvec vecs[2];
+	uint32_t flash_ofs;
+	int retried = 0;
+	int ret;
+
+	jffs2_dbg(1, "%s(ino #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x)\n",
+		  __func__,
+		  je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino),
+		  je32_to_cpu(rd->name_crc));
+
+	D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) {
+		pr_crit("Eep. CRC not correct in jffs2_write_dirent()\n");
+		BUG();
+	   });
+
+	if (strnlen(name, namelen) != namelen) {
+		/* This should never happen, but seems to have done on at least one
+		   occasion: https://dev.laptop.org/ticket/4184 */
+		pr_crit("Error in jffs2_write_dirent() -- name contains zero bytes!\n");
+		pr_crit("Directory inode #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x\n",
+			je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino),
+			je32_to_cpu(rd->name_crc));
+		WARN_ON(1);
+		return ERR_PTR(-EIO);
+	}
+
+	vecs[0].iov_base = rd;
+	vecs[0].iov_len = sizeof(*rd);
+	vecs[1].iov_base = (unsigned char *)name;
+	vecs[1].iov_len = namelen;
+
+	fd = jffs2_alloc_full_dirent(namelen+1);
+	if (!fd)
+		return ERR_PTR(-ENOMEM);
+
+	fd->version = je32_to_cpu(rd->version);
+	fd->ino = je32_to_cpu(rd->ino);
+	fd->nhash = full_name_hash(name, namelen);
+	fd->type = rd->type;
+	memcpy(fd->name, name, namelen);
+	fd->name[namelen]=0;
+
+ retry:
+	flash_ofs = write_ofs(c);
+
+	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
+
+	if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) {
+		BUG_ON(!retried);
+		jffs2_dbg(1, "%s(): dirent_version %d, highest version %d -> updating dirent\n",
+			  __func__,
+			  je32_to_cpu(rd->version), f->highest_version);
+		rd->version = cpu_to_je32(++f->highest_version);
+		fd->version = je32_to_cpu(rd->version);
+		rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
+	}
+
+	ret = jffs2_flash_writev(c, vecs, 2, flash_ofs, &retlen,
+				 (alloc_mode==ALLOC_GC)?0:je32_to_cpu(rd->pino));
+	if (ret || (retlen != sizeof(*rd) + namelen)) {
+		pr_notice("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
+			  sizeof(*rd) + namelen, flash_ofs, ret, retlen);
+		/* Mark the space as dirtied */
+		if (retlen) {
+			jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL);
+		} else {
+			pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n",
+				  flash_ofs);
+		}
+		if (!retried) {
+			/* Try to reallocate space and retry */
+			uint32_t dummy;
+			struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size];
+
+			retried = 1;
+
+			jffs2_dbg(1, "Retrying failed write.\n");
+
+			jffs2_dbg_acct_sanity_check(c,jeb);
+			jffs2_dbg_acct_paranoia_check(c, jeb);
+
+			if (alloc_mode == ALLOC_GC) {
+				ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &dummy,
+							     JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+			} else {
+				/* Locking pain */
+				mutex_unlock(&f->sem);
+				jffs2_complete_reservation(c);
+
+				ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &dummy,
+							  alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+				mutex_lock(&f->sem);
+			}
+
+			if (!ret) {
+				flash_ofs = write_ofs(c);
+				jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write\n",
+					  flash_ofs);
+				jffs2_dbg_acct_sanity_check(c,jeb);
+				jffs2_dbg_acct_paranoia_check(c, jeb);
+				goto retry;
+			}
+			jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n",
+				  ret);
+		}
+		/* Release the full_dnode which is now useless, and return */
+		jffs2_free_full_dirent(fd);
+		return ERR_PTR(ret?ret:-EIO);
+	}
+	/* Mark the space used */
+	fd->raw = jffs2_add_physical_node_ref(c, flash_ofs | dirent_node_state(rd),
+					      PAD(sizeof(*rd)+namelen), f->inocache);
+	if (IS_ERR(fd->raw)) {
+		void *hold_err = fd->raw;
+		/* Release the full_dirent which is now useless, and return */
+		jffs2_free_full_dirent(fd);
+		return ERR_CAST(hold_err);
+	}
+
+	if (retried) {
+		jffs2_dbg_acct_sanity_check(c,NULL);
+	}
+
+	return fd;
+}
+
+/* The OS-specific code fills in the metadata in the jffs2_raw_inode for us, so that
+   we don't have to go digging in struct inode or its equivalent. It should set:
+   mode, uid, gid, (starting)isize, atime, ctime, mtime */
+int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+			    struct jffs2_raw_inode *ri, unsigned char *buf,
+			    uint32_t offset, uint32_t writelen, uint32_t *retlen)
+{
+	int ret = 0;
+	uint32_t writtenlen = 0;
+
+	jffs2_dbg(1, "%s(): Ino #%u, ofs 0x%x, len 0x%x\n",
+		  __func__, f->inocache->ino, offset, writelen);
+
+	while(writelen) {
+		struct jffs2_full_dnode *fn;
+		unsigned char *comprbuf = NULL;
+		uint16_t comprtype = JFFS2_COMPR_NONE;
+		uint32_t alloclen;
+		uint32_t datalen, cdatalen;
+		int retried = 0;
+
+	retry:
+		jffs2_dbg(2, "jffs2_commit_write() loop: 0x%x to write to 0x%x\n",
+			  writelen, offset);
+
+		ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN,
+					&alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+		if (ret) {
+			jffs2_dbg(1, "jffs2_reserve_space returned %d\n", ret);
+			break;
+		}
+		mutex_lock(&f->sem);
+		datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1)));
+		cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen);
+
+		comprtype = jffs2_compress(c, f, buf, &comprbuf, &datalen, &cdatalen);
+
+		ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+		ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE);
+		ri->totlen = cpu_to_je32(sizeof(*ri) + cdatalen);
+		ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4));
+
+		ri->ino = cpu_to_je32(f->inocache->ino);
+		ri->version = cpu_to_je32(++f->highest_version);
+		ri->isize = cpu_to_je32(max(je32_to_cpu(ri->isize), offset + datalen));
+		ri->offset = cpu_to_je32(offset);
+		ri->csize = cpu_to_je32(cdatalen);
+		ri->dsize = cpu_to_je32(datalen);
+		ri->compr = comprtype & 0xff;
+		ri->usercompr = (comprtype >> 8 ) & 0xff;
+		ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
+		ri->data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen));
+
+		fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, ALLOC_NORETRY);
+
+		jffs2_free_comprbuf(comprbuf, buf);
+
+		if (IS_ERR(fn)) {
+			ret = PTR_ERR(fn);
+			mutex_unlock(&f->sem);
+			jffs2_complete_reservation(c);
+			if (!retried) {
+				/* Write error to be retried */
+				retried = 1;
+				jffs2_dbg(1, "Retrying node write in jffs2_write_inode_range()\n");
+				goto retry;
+			}
+			break;
+		}
+		ret = jffs2_add_full_dnode_to_inode(c, f, fn);
+		if (f->metadata) {
+			jffs2_mark_node_obsolete(c, f->metadata->raw);
+			jffs2_free_full_dnode(f->metadata);
+			f->metadata = NULL;
+		}
+		if (ret) {
+			/* Eep */
+			jffs2_dbg(1, "Eep. add_full_dnode_to_inode() failed in commit_write, returned %d\n",
+				  ret);
+			jffs2_mark_node_obsolete(c, fn->raw);
+			jffs2_free_full_dnode(fn);
+
+			mutex_unlock(&f->sem);
+			jffs2_complete_reservation(c);
+			break;
+		}
+		mutex_unlock(&f->sem);
+		jffs2_complete_reservation(c);
+		if (!datalen) {
+			pr_warn("Eep. We didn't actually write any data in jffs2_write_inode_range()\n");
+			ret = -EIO;
+			break;
+		}
+		jffs2_dbg(1, "increasing writtenlen by %d\n", datalen);
+		writtenlen += datalen;
+		offset += datalen;
+		writelen -= datalen;
+		buf += datalen;
+	}
+	*retlen = writtenlen;
+	return ret;
+}
+
+int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
+		    struct jffs2_inode_info *f, struct jffs2_raw_inode *ri,
+		    const struct qstr *qstr)
+{
+	struct jffs2_raw_dirent *rd;
+	struct jffs2_full_dnode *fn;
+	struct jffs2_full_dirent *fd;
+	uint32_t alloclen;
+	int ret;
+
+	/* Try to reserve enough space for both node and dirent.
+	 * Just the node will do for now, though
+	 */
+	ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
+				JFFS2_SUMMARY_INODE_SIZE);
+	jffs2_dbg(1, "%s(): reserved 0x%x bytes\n", __func__, alloclen);
+	if (ret)
+		return ret;
+
+	mutex_lock(&f->sem);
+
+	ri->data_crc = cpu_to_je32(0);
+	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
+
+	fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
+
+	jffs2_dbg(1, "jffs2_do_create created file with mode 0x%x\n",
+		  jemode_to_cpu(ri->mode));
+
+	if (IS_ERR(fn)) {
+		jffs2_dbg(1, "jffs2_write_dnode() failed\n");
+		/* Eeek. Wave bye bye */
+		mutex_unlock(&f->sem);
+		jffs2_complete_reservation(c);
+		return PTR_ERR(fn);
+	}
+	/* No data here. Only a metadata node, which will be
+	   obsoleted by the first data write
+	*/
+	f->metadata = fn;
+
+	mutex_unlock(&f->sem);
+	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr);
+	if (ret)
+		return ret;
+	ret = jffs2_init_acl_post(&f->vfs_inode);
+	if (ret)
+		return ret;
+
+	ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen,
+				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len));
+
+	if (ret) {
+		/* Eep. */
+		jffs2_dbg(1, "jffs2_reserve_space() for dirent failed\n");
+		return ret;
+	}
+
+	rd = jffs2_alloc_raw_dirent();
+	if (!rd) {
+		/* Argh. Now we treat it like a normal delete */
+		jffs2_complete_reservation(c);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&dir_f->sem);
+
+	rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
+	rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len);
+	rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
+
+	rd->pino = cpu_to_je32(dir_f->inocache->ino);
+	rd->version = cpu_to_je32(++dir_f->highest_version);
+	rd->ino = ri->ino;
+	rd->mctime = ri->ctime;
+	rd->nsize = qstr->len;
+	rd->type = DT_REG;
+	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
+	rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len));
+
+	fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL);
+
+	jffs2_free_raw_dirent(rd);
+
+	if (IS_ERR(fd)) {
+		/* dirent failed to write. Delete the inode normally
+		   as if it were the final unlink() */
+		jffs2_complete_reservation(c);
+		mutex_unlock(&dir_f->sem);
+		return PTR_ERR(fd);
+	}
+
+	/* Link the fd into the inode's list, obsoleting an old
+	   one if necessary. */
+	jffs2_add_fd_to_list(c, fd, &dir_f->dents);
+
+	jffs2_complete_reservation(c);
+	mutex_unlock(&dir_f->sem);
+
+	return 0;
+}
+
+
+int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
+		    const char *name, int namelen, struct jffs2_inode_info *dead_f,
+		    uint32_t time)
+{
+	struct jffs2_raw_dirent *rd;
+	struct jffs2_full_dirent *fd;
+	uint32_t alloclen;
+	int ret;
+
+	if (!jffs2_can_mark_obsolete(c)) {
+		/* We can't mark stuff obsolete on the medium. We need to write a deletion dirent */
+
+		rd = jffs2_alloc_raw_dirent();
+		if (!rd)
+			return -ENOMEM;
+
+		ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+					ALLOC_DELETION, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+		if (ret) {
+			jffs2_free_raw_dirent(rd);
+			return ret;
+		}
+
+		mutex_lock(&dir_f->sem);
+
+		/* Build a deletion node */
+		rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+		rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
+		rd->totlen = cpu_to_je32(sizeof(*rd) + namelen);
+		rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
+
+		rd->pino = cpu_to_je32(dir_f->inocache->ino);
+		rd->version = cpu_to_je32(++dir_f->highest_version);
+		rd->ino = cpu_to_je32(0);
+		rd->mctime = cpu_to_je32(time);
+		rd->nsize = namelen;
+		rd->type = DT_UNKNOWN;
+		rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
+		rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
+
+		fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_DELETION);
+
+		jffs2_free_raw_dirent(rd);
+
+		if (IS_ERR(fd)) {
+			jffs2_complete_reservation(c);
+			mutex_unlock(&dir_f->sem);
+			return PTR_ERR(fd);
+		}
+
+		/* File it. This will mark the old one obsolete. */
+		jffs2_add_fd_to_list(c, fd, &dir_f->dents);
+		mutex_unlock(&dir_f->sem);
+	} else {
+		uint32_t nhash = full_name_hash(name, namelen);
+
+		fd = dir_f->dents;
+		/* We don't actually want to reserve any space, but we do
+		   want to be holding the alloc_sem when we write to flash */
+		mutex_lock(&c->alloc_sem);
+		mutex_lock(&dir_f->sem);
+
+		for (fd = dir_f->dents; fd; fd = fd->next) {
+			if (fd->nhash == nhash &&
+			    !memcmp(fd->name, name, namelen) &&
+			    !fd->name[namelen]) {
+
+				jffs2_dbg(1, "Marking old dirent node (ino #%u) @%08x obsolete\n",
+					  fd->ino, ref_offset(fd->raw));
+				jffs2_mark_node_obsolete(c, fd->raw);
+				/* We don't want to remove it from the list immediately,
+				   because that screws up getdents()/seek() semantics even
+				   more than they're screwed already. Turn it into a
+				   node-less deletion dirent instead -- a placeholder */
+				fd->raw = NULL;
+				fd->ino = 0;
+				break;
+			}
+		}
+		mutex_unlock(&dir_f->sem);
+	}
+
+	/* dead_f is NULL if this was a rename not a real unlink */
+	/* Also catch the !f->inocache case, where there was a dirent
+	   pointing to an inode which didn't exist. */
+	if (dead_f && dead_f->inocache) {
+
+		mutex_lock(&dead_f->sem);
+
+		if (S_ISDIR(OFNI_EDONI_2SFFJ(dead_f)->i_mode)) {
+			while (dead_f->dents) {
+				/* There can be only deleted ones */
+				fd = dead_f->dents;
+
+				dead_f->dents = fd->next;
+
+				if (fd->ino) {
+					pr_warn("Deleting inode #%u with active dentry \"%s\"->ino #%u\n",
+						dead_f->inocache->ino,
+						fd->name, fd->ino);
+				} else {
+					jffs2_dbg(1, "Removing deletion dirent for \"%s\" from dir ino #%u\n",
+						  fd->name,
+						  dead_f->inocache->ino);
+				}
+				if (fd->raw)
+					jffs2_mark_node_obsolete(c, fd->raw);
+				jffs2_free_full_dirent(fd);
+			}
+			dead_f->inocache->pino_nlink = 0;
+		} else
+			dead_f->inocache->pino_nlink--;
+		/* NB: Caller must set inode nlink if appropriate */
+		mutex_unlock(&dead_f->sem);
+	}
+
+	jffs2_complete_reservation(c);
+
+	return 0;
+}
+
+
+int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, uint8_t type, const char *name, int namelen, uint32_t time)
+{
+	struct jffs2_raw_dirent *rd;
+	struct jffs2_full_dirent *fd;
+	uint32_t alloclen;
+	int ret;
+
+	rd = jffs2_alloc_raw_dirent();
+	if (!rd)
+		return -ENOMEM;
+
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+	if (ret) {
+		jffs2_free_raw_dirent(rd);
+		return ret;
+	}
+
+	mutex_lock(&dir_f->sem);
+
+	/* Build a deletion node */
+	rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
+	rd->totlen = cpu_to_je32(sizeof(*rd) + namelen);
+	rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
+
+	rd->pino = cpu_to_je32(dir_f->inocache->ino);
+	rd->version = cpu_to_je32(++dir_f->highest_version);
+	rd->ino = cpu_to_je32(ino);
+	rd->mctime = cpu_to_je32(time);
+	rd->nsize = namelen;
+
+	rd->type = type;
+
+	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
+	rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
+
+	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL);
+
+	jffs2_free_raw_dirent(rd);
+
+	if (IS_ERR(fd)) {
+		jffs2_complete_reservation(c);
+		mutex_unlock(&dir_f->sem);
+		return PTR_ERR(fd);
+	}
+
+	/* File it. This will mark the old one obsolete. */
+	jffs2_add_fd_to_list(c, fd, &dir_f->dents);
+
+	jffs2_complete_reservation(c);
+	mutex_unlock(&dir_f->sem);
+
+	return 0;
+}
diff --git a/kernel/fs/jffs2/writev.c b/kernel/fs/jffs2/writev.c
new file mode 100644
index 000000000..a1bda9dab
--- /dev/null
+++ b/kernel/fs/jffs2/writev.c
@@ -0,0 +1,51 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs,
+			      unsigned long count, loff_t to, size_t *retlen)
+{
+	if (!jffs2_is_writebuffered(c)) {
+		if (jffs2_sum_active()) {
+			int res;
+			res = jffs2_sum_add_kvec(c, vecs, count, (uint32_t) to);
+			if (res) {
+				return res;
+			}
+		}
+	}
+
+	return mtd_writev(c->mtd, vecs, count, to, retlen);
+}
+
+int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
+			size_t *retlen, const u_char *buf)
+{
+	int ret;
+	ret = mtd_write(c->mtd, ofs, len, retlen, buf);
+
+	if (jffs2_sum_active()) {
+		struct kvec vecs[1];
+		int res;
+
+		vecs[0].iov_base = (unsigned char *) buf;
+		vecs[0].iov_len = len;
+
+		res = jffs2_sum_add_kvec(c, vecs, 1, (uint32_t) ofs);
+		if (res) {
+			return res;
+		}
+	}
+	return ret;
+}
diff --git a/kernel/fs/jffs2/xattr.c b/kernel/fs/jffs2/xattr.c
new file mode 100644
index 000000000..f092fee5b
--- /dev/null
+++ b/kernel/fs/jffs2/xattr.c
@@ -0,0 +1,1340 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define JFFS2_XATTR_IS_CORRUPTED	1
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* -------- xdatum related functions ----------------
+ * xattr_datum_hashkey(xprefix, xname, xvalue, xsize)
+ *   is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is
+ *   the index of the xattr name/value pair cache (c->xattrindex).
+ * is_xattr_datum_unchecked(c, xd)
+ *   returns 1, if xdatum contains any unchecked raw nodes. if all raw nodes are not
+ *   unchecked, it returns 0.
+ * unload_xattr_datum(c, xd)
+ *   is used to release xattr name/value pair and detach from c->xattrindex.
+ * reclaim_xattr_datum(c)
+ *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
+ *   is hard coded as 32KiB.
+ * do_verify_xattr_datum(c, xd)
+ *   is used to load the xdatum informations without name/value pair from the medium.
+ *   It's necessary once, because those informations are not collected during mounting
+ *   process when EBS is enabled.
+ *   0 will be returned, if success. An negative return value means recoverable error, and
+ *   positive return value means unrecoverable error. Thus, caller must remove this xdatum
+ *   and xref when it returned positive value.
+ * do_load_xattr_datum(c, xd)
+ *   is used to load name/value pair from the medium.
+ *   The meanings of return value is same as do_verify_xattr_datum().
+ * load_xattr_datum(c, xd)
+ *   is used to be as a wrapper of do_verify_xattr_datum() and do_load_xattr_datum().
+ *   If xd need to call do_verify_xattr_datum() at first, it's called before calling
+ *   do_load_xattr_datum(). The meanings of return value is same as do_verify_xattr_datum().
+ * save_xattr_datum(c, xd)
+ *   is used to write xdatum to medium. xd->version will be incremented.
+ * create_xattr_datum(c, xprefix, xname, xvalue, xsize)
+ *   is used to create new xdatum and write to medium.
+ * unrefer_xattr_datum(c, xd)
+ *   is used to delete a xdatum. When nobody refers this xdatum, JFFS2_XFLAGS_DEAD
+ *   is set on xd->flags and chained xattr_dead_list or release it immediately.
+ *   In the first case, the garbage collector release it later.
+ * -------------------------------------------------- */
+static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
+{
+	int name_len = strlen(xname);
+
+	return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize);
+}
+
+static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	struct jffs2_raw_node_ref *raw;
+	int rc = 0;
+
+	spin_lock(&c->erase_completion_lock);
+	for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			rc = 1;
+			break;
+		}
+	}
+	spin_unlock(&c->erase_completion_lock);
+	return rc;
+}
+
+static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	D1(dbg_xattr("%s: xid=%u, version=%u\n", __func__, xd->xid, xd->version));
+	if (xd->xname) {
+		c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len);
+		kfree(xd->xname);
+	}
+
+	list_del_init(&xd->xindex);
+	xd->hashkey = 0;
+	xd->xname = NULL;
+	xd->xvalue = NULL;
+}
+
+static void reclaim_xattr_datum(struct jffs2_sb_info *c)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd, *_xd;
+	uint32_t target, before;
+	static int index = 0;
+	int count;
+
+	if (c->xdatum_mem_threshold > c->xdatum_mem_usage)
+		return;
+
+	before = c->xdatum_mem_usage;
+	target = c->xdatum_mem_usage * 4 / 5; /* 20% reduction */
+	for (count = 0; count < XATTRINDEX_HASHSIZE; count++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[index], xindex) {
+			if (xd->flags & JFFS2_XFLAGS_HOT) {
+				xd->flags &= ~JFFS2_XFLAGS_HOT;
+			} else if (!(xd->flags & JFFS2_XFLAGS_BIND)) {
+				unload_xattr_datum(c, xd);
+			}
+			if (c->xdatum_mem_usage <= target)
+				goto out;
+		}
+		index = (index+1) % XATTRINDEX_HASHSIZE;
+	}
+ out:
+	JFFS2_NOTICE("xdatum_mem_usage from %u byte to %u byte (%u byte reclaimed)\n",
+		     before, c->xdatum_mem_usage, before - c->xdatum_mem_usage);
+}
+
+static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
+	struct jffs2_raw_xattr rx;
+	size_t readlen;
+	uint32_t crc, offset, totlen;
+	int rc;
+
+	spin_lock(&c->erase_completion_lock);
+	offset = ref_offset(xd->node);
+	if (ref_flags(xd->node) == REF_PRISTINE)
+		goto complete;
+	spin_unlock(&c->erase_completion_lock);
+
+	rc = jffs2_flash_read(c, offset, sizeof(rx), &readlen, (char *)&rx);
+	if (rc || readlen != sizeof(rx)) {
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
+			      rc, sizeof(rx), readlen, offset);
+		return rc ? rc : -EIO;
+	}
+	crc = crc32(0, &rx, sizeof(rx) - 4);
+	if (crc != je32_to_cpu(rx.node_crc)) {
+		JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			    offset, je32_to_cpu(rx.hdr_crc), crc);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
+		return JFFS2_XATTR_IS_CORRUPTED;
+	}
+	totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
+	if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
+	    || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR
+	    || je32_to_cpu(rx.totlen) != totlen
+	    || je32_to_cpu(rx.xid) != xd->xid
+	    || je32_to_cpu(rx.version) != xd->version) {
+		JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, "
+			    "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n",
+			    offset, je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
+			    je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR,
+			    je32_to_cpu(rx.totlen), totlen,
+			    je32_to_cpu(rx.xid), xd->xid,
+			    je32_to_cpu(rx.version), xd->version);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
+		return JFFS2_XATTR_IS_CORRUPTED;
+	}
+	xd->xprefix = rx.xprefix;
+	xd->name_len = rx.name_len;
+	xd->value_len = je16_to_cpu(rx.value_len);
+	xd->data_crc = je32_to_cpu(rx.data_crc);
+
+	spin_lock(&c->erase_completion_lock);
+ complete:
+	for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+		jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+		totlen = PAD(ref_totlen(c, jeb, raw));
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+		}
+		raw->flash_offset = ref_offset(raw) | ((xd->node==raw) ? REF_PRISTINE : REF_NORMAL);
+	}
+	spin_unlock(&c->erase_completion_lock);
+
+	/* unchecked xdatum is chained with c->xattr_unchecked */
+	list_del_init(&xd->xindex);
+
+	dbg_xattr("success on verifying xdatum (xid=%u, version=%u)\n",
+		  xd->xid, xd->version);
+
+	return 0;
+}
+
+static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	char *data;
+	size_t readlen;
+	uint32_t crc, length;
+	int i, ret, retry = 0;
+
+	BUG_ON(ref_flags(xd->node) != REF_PRISTINE);
+	BUG_ON(!list_empty(&xd->xindex));
+ retry:
+	length = xd->name_len + 1 + xd->value_len;
+	data = kmalloc(length, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ret = jffs2_flash_read(c, ref_offset(xd->node)+sizeof(struct jffs2_raw_xattr),
+			       length, &readlen, data);
+
+	if (ret || length!=readlen) {
+		JFFS2_WARNING("jffs2_flash_read() returned %d, request=%d, readlen=%zu, at %#08x\n",
+			      ret, length, readlen, ref_offset(xd->node));
+		kfree(data);
+		return ret ? ret : -EIO;
+	}
+
+	data[xd->name_len] = '\0';
+	crc = crc32(0, data, length);
+	if (crc != xd->data_crc) {
+		JFFS2_WARNING("node CRC failed (JFFS2_NODETYPE_XATTR)"
+			      " at %#08x, read: 0x%08x calculated: 0x%08x\n",
+			      ref_offset(xd->node), xd->data_crc, crc);
+		kfree(data);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
+		return JFFS2_XATTR_IS_CORRUPTED;
+	}
+
+	xd->flags |= JFFS2_XFLAGS_HOT;
+	xd->xname = data;
+	xd->xvalue = data + xd->name_len+1;
+
+	c->xdatum_mem_usage += length;
+
+	xd->hashkey = xattr_datum_hashkey(xd->xprefix, xd->xname, xd->xvalue, xd->value_len);
+	i = xd->hashkey % XATTRINDEX_HASHSIZE;
+	list_add(&xd->xindex, &c->xattrindex[i]);
+	if (!retry) {
+		retry = 1;
+		reclaim_xattr_datum(c);
+		if (!xd->xname)
+			goto retry;
+	}
+
+	dbg_xattr("success on loading xdatum (xid=%u, xprefix=%u, xname='%s')\n",
+		  xd->xid, xd->xprefix, xd->xname);
+
+	return 0;
+}
+
+static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem);
+	 * rc < 0 : recoverable error, try again
+	 * rc = 0 : success
+	 * rc > 0 : Unrecoverable error, this node should be deleted.
+	 */
+	int rc = 0;
+
+	BUG_ON(xd->flags & JFFS2_XFLAGS_DEAD);
+	if (xd->xname)
+		return 0;
+	if (xd->flags & JFFS2_XFLAGS_INVALID)
+		return JFFS2_XATTR_IS_CORRUPTED;
+	if (unlikely(is_xattr_datum_unchecked(c, xd)))
+		rc = do_verify_xattr_datum(c, xd);
+	if (!rc)
+		rc = do_load_xattr_datum(c, xd);
+	return rc;
+}
+
+static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_xattr rx;
+	struct kvec vecs[2];
+	size_t length;
+	int rc, totlen;
+	uint32_t phys_ofs = write_ofs(c);
+
+	BUG_ON(!xd->xname);
+	BUG_ON(xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID));
+
+	vecs[0].iov_base = &rx;
+	vecs[0].iov_len = sizeof(rx);
+	vecs[1].iov_base = xd->xname;
+	vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
+	totlen = vecs[0].iov_len + vecs[1].iov_len;
+
+	/* Setup raw-xattr */
+	memset(&rx, 0, sizeof(rx));
+	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR);
+	rx.totlen = cpu_to_je32(PAD(totlen));
+	rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4));
+
+	rx.xid = cpu_to_je32(xd->xid);
+	rx.version = cpu_to_je32(++xd->version);
+	rx.xprefix = xd->xprefix;
+	rx.name_len = xd->name_len;
+	rx.value_len = cpu_to_je16(xd->value_len);
+	rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
+	rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4));
+
+	rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
+	if (rc || totlen != length) {
+		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n",
+			      rc, totlen, length, phys_ofs);
+		rc = rc ? rc : -EIO;
+		if (length)
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(totlen), NULL);
+
+		return rc;
+	}
+	/* success */
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), (void *)xd);
+
+	dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n",
+		  xd->xid, xd->version, xd->xprefix, xd->xname);
+
+	return 0;
+}
+
+static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
+						    int xprefix, const char *xname,
+						    const char *xvalue, int xsize)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+	uint32_t hashkey, name_len;
+	char *data;
+	int i, rc;
+
+	/* Search xattr_datum has same xname/xvalue by index */
+	hashkey = xattr_datum_hashkey(xprefix, xname, xvalue, xsize);
+	i = hashkey % XATTRINDEX_HASHSIZE;
+	list_for_each_entry(xd, &c->xattrindex[i], xindex) {
+		if (xd->hashkey==hashkey
+		    && xd->xprefix==xprefix
+		    && xd->value_len==xsize
+		    && !strcmp(xd->xname, xname)
+		    && !memcmp(xd->xvalue, xvalue, xsize)) {
+			atomic_inc(&xd->refcnt);
+			return xd;
+		}
+	}
+
+	/* Not found, Create NEW XATTR-Cache */
+	name_len = strlen(xname);
+
+	xd = jffs2_alloc_xattr_datum();
+	if (!xd)
+		return ERR_PTR(-ENOMEM);
+
+	data = kmalloc(name_len + 1 + xsize, GFP_KERNEL);
+	if (!data) {
+		jffs2_free_xattr_datum(xd);
+		return ERR_PTR(-ENOMEM);
+	}
+	strcpy(data, xname);
+	memcpy(data + name_len + 1, xvalue, xsize);
+
+	atomic_set(&xd->refcnt, 1);
+	xd->xid = ++c->highest_xid;
+	xd->flags |= JFFS2_XFLAGS_HOT;
+	xd->xprefix = xprefix;
+
+	xd->hashkey = hashkey;
+	xd->xname = data;
+	xd->xvalue = data + name_len + 1;
+	xd->name_len = name_len;
+	xd->value_len = xsize;
+	xd->data_crc = crc32(0, data, xd->name_len + 1 + xd->value_len);
+
+	rc = save_xattr_datum(c, xd);
+	if (rc) {
+		kfree(xd->xname);
+		jffs2_free_xattr_datum(xd);
+		return ERR_PTR(rc);
+	}
+
+	/* Insert Hash Index */
+	i = hashkey % XATTRINDEX_HASHSIZE;
+	list_add(&xd->xindex, &c->xattrindex[i]);
+
+	c->xdatum_mem_usage += (xd->name_len + 1 + xd->value_len);
+	reclaim_xattr_datum(c);
+
+	return xd;
+}
+
+static void unrefer_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	if (atomic_dec_and_lock(&xd->refcnt, &c->erase_completion_lock)) {
+		unload_xattr_datum(c, xd);
+		xd->flags |= JFFS2_XFLAGS_DEAD;
+		if (xd->node == (void *)xd) {
+			BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID));
+			jffs2_free_xattr_datum(xd);
+		} else {
+			list_add(&xd->xindex, &c->xattr_dead_list);
+		}
+		spin_unlock(&c->erase_completion_lock);
+
+		dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n",
+			  xd->xid, xd->version);
+	}
+}
+
+/* -------- xref related functions ------------------
+ * verify_xattr_ref(c, ref)
+ *   is used to load xref information from medium. Because summary data does not
+ *   contain xid/ino, it's necessary to verify once while mounting process.
+ * save_xattr_ref(c, ref)
+ *   is used to write xref to medium. If delete marker is marked, it write
+ *   a delete marker of xref into medium.
+ * create_xattr_ref(c, ic, xd)
+ *   is used to create a new xref and write to medium.
+ * delete_xattr_ref(c, ref)
+ *   is used to delete jffs2_xattr_ref. It marks xref XREF_DELETE_MARKER,
+ *   and allows GC to reclaim those physical nodes.
+ * jffs2_xattr_delete_inode(c, ic)
+ *   is called to remove xrefs related to obsolete inode when inode is unlinked.
+ * jffs2_xattr_free_inode(c, ic)
+ *   is called to release xattr related objects when unmounting. 
+ * check_xattr_ref_inode(c, ic)
+ *   is used to confirm inode does not have duplicate xattr name/value pair.
+ * jffs2_xattr_do_crccheck_inode(c, ic)
+ *   is used to force xattr data integrity check during the initial gc scan.
+ * -------------------------------------------------- */
+static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
+	struct jffs2_raw_xref rr;
+	size_t readlen;
+	uint32_t crc, offset, totlen;
+	int rc;
+
+	spin_lock(&c->erase_completion_lock);
+	if (ref_flags(ref->node) != REF_UNCHECKED)
+		goto complete;
+	offset = ref_offset(ref->node);
+	spin_unlock(&c->erase_completion_lock);
+
+	rc = jffs2_flash_read(c, offset, sizeof(rr), &readlen, (char *)&rr);
+	if (rc || sizeof(rr) != readlen) {
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n",
+			      rc, sizeof(rr), readlen, offset);
+		return rc ? rc : -EIO;
+	}
+	/* obsolete node */
+	crc = crc32(0, &rr, sizeof(rr) - 4);
+	if (crc != je32_to_cpu(rr.node_crc)) {
+		JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			    offset, je32_to_cpu(rr.node_crc), crc);
+		return JFFS2_XATTR_IS_CORRUPTED;
+	}
+	if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
+	    || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
+	    || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) {
+		JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, "
+			    "nodetype=%#04x/%#04x, totlen=%u/%zu\n",
+			    offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
+			    je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
+			    je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
+		return JFFS2_XATTR_IS_CORRUPTED;
+	}
+	ref->ino = je32_to_cpu(rr.ino);
+	ref->xid = je32_to_cpu(rr.xid);
+	ref->xseqno = je32_to_cpu(rr.xseqno);
+	if (ref->xseqno > c->highest_xseqno)
+		c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER);
+
+	spin_lock(&c->erase_completion_lock);
+ complete:
+	for (raw=ref->node; raw != (void *)ref; raw=raw->next_in_ino) {
+		jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+		totlen = PAD(ref_totlen(c, jeb, raw));
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+		}
+		raw->flash_offset = ref_offset(raw) | ((ref->node==raw) ? REF_PRISTINE : REF_NORMAL);
+	}
+	spin_unlock(&c->erase_completion_lock);
+
+	dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n",
+		  ref->ino, ref->xid, ref_offset(ref->node));
+	return 0;
+}
+
+static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_xref rr;
+	size_t length;
+	uint32_t xseqno, phys_ofs = write_ofs(c);
+	int ret;
+
+	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF);
+	rr.totlen = cpu_to_je32(PAD(sizeof(rr)));
+	rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4));
+
+	xseqno = (c->highest_xseqno += 2);
+	if (is_xattr_ref_dead(ref)) {
+		xseqno |= XREF_DELETE_MARKER;
+		rr.ino = cpu_to_je32(ref->ino);
+		rr.xid = cpu_to_je32(ref->xid);
+	} else {
+		rr.ino = cpu_to_je32(ref->ic->ino);
+		rr.xid = cpu_to_je32(ref->xd->xid);
+	}
+	rr.xseqno = cpu_to_je32(xseqno);
+	rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4));
+
+	ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr);
+	if (ret || sizeof(rr) != length) {
+		JFFS2_WARNING("jffs2_flash_write() returned %d, request=%zu, retlen=%zu, at %#08x\n",
+			      ret, sizeof(rr), length, phys_ofs);
+		ret = ret ? ret : -EIO;
+		if (length)
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(sizeof(rr)), NULL);
+
+		return ret;
+	}
+	/* success */
+	ref->xseqno = xseqno;
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), (void *)ref);
+
+	dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid);
+
+	return 0;
+}
+
+static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic,
+						struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_ref *ref;
+	int ret;
+
+	ref = jffs2_alloc_xattr_ref();
+	if (!ref)
+		return ERR_PTR(-ENOMEM);
+	ref->ic = ic;
+	ref->xd = xd;
+
+	ret = save_xattr_ref(c, ref);
+	if (ret) {
+		jffs2_free_xattr_ref(ref);
+		return ERR_PTR(ret);
+	}
+
+	/* Chain to inode */
+	ref->next = ic->xref;
+	ic->xref = ref;
+
+	return ref; /* success */
+}
+
+static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+
+	xd = ref->xd;
+	ref->xseqno |= XREF_DELETE_MARKER;
+	ref->ino = ref->ic->ino;
+	ref->xid = ref->xd->xid;
+	spin_lock(&c->erase_completion_lock);
+	ref->next = c->xref_dead_list;
+	c->xref_dead_list = ref;
+	spin_unlock(&c->erase_completion_lock);
+
+	dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n",
+		  ref->ino, ref->xid, ref->xseqno);
+
+	unrefer_xattr_datum(c, xd);
+}
+
+void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* It's called from jffs2_evict_inode() on inode removing.
+	   When an inode with XATTR is removed, those XATTRs must be removed. */
+	struct jffs2_xattr_ref *ref, *_ref;
+
+	if (!ic || ic->pino_nlink > 0)
+		return;
+
+	down_write(&c->xattr_sem);
+	for (ref = ic->xref; ref; ref = _ref) {
+		_ref = ref->next;
+		delete_xattr_ref(c, ref);
+	}
+	ic->xref = NULL;
+	up_write(&c->xattr_sem);
+}
+
+void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* It's called from jffs2_free_ino_caches() until unmounting FS. */
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, *_ref;
+
+	down_write(&c->xattr_sem);
+	for (ref = ic->xref; ref; ref = _ref) {
+		_ref = ref->next;
+		xd = ref->xd;
+		if (atomic_dec_and_test(&xd->refcnt)) {
+			unload_xattr_datum(c, xd);
+			jffs2_free_xattr_datum(xd);
+		}
+		jffs2_free_xattr_ref(ref);
+	}
+	ic->xref = NULL;
+	up_write(&c->xattr_sem);
+}
+
+static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* success of check_xattr_ref_inode() means that inode (ic) dose not have
+	 * duplicate name/value pairs. If duplicate name/value pair would be found,
+	 * one will be removed.
+	 */
+	struct jffs2_xattr_ref *ref, *cmp, **pref, **pcmp;
+	int rc = 0;
+
+	if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED))
+		return 0;
+	down_write(&c->xattr_sem);
+ retry:
+	rc = 0;
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+		if (!ref->xd->xname) {
+			rc = load_xattr_datum(c, ref->xd);
+			if (unlikely(rc > 0)) {
+				*pref = ref->next;
+				delete_xattr_ref(c, ref);
+				goto retry;
+			} else if (unlikely(rc < 0))
+				goto out;
+		}
+		for (cmp=ref->next, pcmp=&ref->next; cmp; pcmp=&cmp->next, cmp=cmp->next) {
+			if (!cmp->xd->xname) {
+				ref->xd->flags |= JFFS2_XFLAGS_BIND;
+				rc = load_xattr_datum(c, cmp->xd);
+				ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
+				if (unlikely(rc > 0)) {
+					*pcmp = cmp->next;
+					delete_xattr_ref(c, cmp);
+					goto retry;
+				} else if (unlikely(rc < 0))
+					goto out;
+			}
+			if (ref->xd->xprefix == cmp->xd->xprefix
+			    && !strcmp(ref->xd->xname, cmp->xd->xname)) {
+				if (ref->xseqno > cmp->xseqno) {
+					*pcmp = cmp->next;
+					delete_xattr_ref(c, cmp);
+				} else {
+					*pref = ref->next;
+					delete_xattr_ref(c, ref);
+				}
+				goto retry;
+			}
+		}
+	}
+	ic->flags |= INO_FLAGS_XATTR_CHECKED;
+ out:
+	up_write(&c->xattr_sem);
+
+	return rc;
+}
+
+void jffs2_xattr_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	check_xattr_ref_inode(c, ic);
+}
+
+/* -------- xattr subsystem functions ---------------
+ * jffs2_init_xattr_subsystem(c)
+ *   is used to initialize semaphore and list_head, and some variables.
+ * jffs2_find_xattr_datum(c, xid)
+ *   is used to lookup xdatum while scanning process.
+ * jffs2_clear_xattr_subsystem(c)
+ *   is used to release any xattr related objects.
+ * jffs2_build_xattr_subsystem(c)
+ *   is used to associate xdatum and xref while super block building process.
+ * jffs2_setup_xattr_datum(c, xid, version)
+ *   is used to insert xdatum while scanning process.
+ * -------------------------------------------------- */
+void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	int i;
+
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++)
+		INIT_LIST_HEAD(&c->xattrindex[i]);
+	INIT_LIST_HEAD(&c->xattr_unchecked);
+	INIT_LIST_HEAD(&c->xattr_dead_list);
+	c->xref_dead_list = NULL;
+	c->xref_temp = NULL;
+
+	init_rwsem(&c->xattr_sem);
+	c->highest_xid = 0;
+	c->highest_xseqno = 0;
+	c->xdatum_mem_usage = 0;
+	c->xdatum_mem_threshold = 32 * 1024;	/* Default 32KB */
+}
+
+static struct jffs2_xattr_datum *jffs2_find_xattr_datum(struct jffs2_sb_info *c, uint32_t xid)
+{
+	struct jffs2_xattr_datum *xd;
+	int i = xid % XATTRINDEX_HASHSIZE;
+
+	/* It's only used in scanning/building process. */
+	BUG_ON(!(c->flags & (JFFS2_SB_FLAG_SCANNING|JFFS2_SB_FLAG_BUILDING)));
+
+	list_for_each_entry(xd, &c->xattrindex[i], xindex) {
+		if (xd->xid==xid)
+			return xd;
+	}
+	return NULL;
+}
+
+void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_xattr_ref *ref, *_ref;
+	int i;
+
+	for (ref=c->xref_temp; ref; ref = _ref) {
+		_ref = ref->next;
+		jffs2_free_xattr_ref(ref);
+	}
+
+	for (ref=c->xref_dead_list; ref; ref = _ref) {
+		_ref = ref->next;
+		jffs2_free_xattr_ref(ref);
+	}
+
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			list_del(&xd->xindex);
+			kfree(xd->xname);
+			jffs2_free_xattr_datum(xd);
+		}
+	}
+
+	list_for_each_entry_safe(xd, _xd, &c->xattr_dead_list, xindex) {
+		list_del(&xd->xindex);
+		jffs2_free_xattr_datum(xd);
+	}
+	list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) {
+		list_del(&xd->xindex);
+		jffs2_free_xattr_datum(xd);
+	}
+}
+
+#define XREF_TMPHASH_SIZE	(128)
+void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_ref *ref, *_ref;
+	struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE];
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_inode_cache *ic;
+	struct jffs2_raw_node_ref *raw;
+	int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0;
+	int xdatum_orphan_count = 0, xref_orphan_count = 0, xref_dead_count = 0;
+
+	BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+
+	/* Phase.1 : Merge same xref */
+	for (i=0; i < XREF_TMPHASH_SIZE; i++)
+		xref_tmphash[i] = NULL;
+	for (ref=c->xref_temp; ref; ref=_ref) {
+		struct jffs2_xattr_ref *tmp;
+
+		_ref = ref->next;
+		if (ref_flags(ref->node) != REF_PRISTINE) {
+			if (verify_xattr_ref(c, ref)) {
+				BUG_ON(ref->node->next_in_ino != (void *)ref);
+				ref->node->next_in_ino = NULL;
+				jffs2_mark_node_obsolete(c, ref->node);
+				jffs2_free_xattr_ref(ref);
+				continue;
+			}
+		}
+
+		i = (ref->ino ^ ref->xid) % XREF_TMPHASH_SIZE;
+		for (tmp=xref_tmphash[i]; tmp; tmp=tmp->next) {
+			if (tmp->ino == ref->ino && tmp->xid == ref->xid)
+				break;
+		}
+		if (tmp) {
+			raw = ref->node;
+			if (ref->xseqno > tmp->xseqno) {
+				tmp->xseqno = ref->xseqno;
+				raw->next_in_ino = tmp->node;
+				tmp->node = raw;
+			} else {
+				raw->next_in_ino = tmp->node->next_in_ino;
+				tmp->node->next_in_ino = raw;
+			}
+			jffs2_free_xattr_ref(ref);
+			continue;
+		} else {
+			ref->next = xref_tmphash[i];
+			xref_tmphash[i] = ref;
+		}
+	}
+	c->xref_temp = NULL;
+
+	/* Phase.2 : Bind xref with inode_cache and xattr_datum */
+	for (i=0; i < XREF_TMPHASH_SIZE; i++) {
+		for (ref=xref_tmphash[i]; ref; ref=_ref) {
+			xref_count++;
+			_ref = ref->next;
+			if (is_xattr_ref_dead(ref)) {
+				ref->next = c->xref_dead_list;
+				c->xref_dead_list = ref;
+				xref_dead_count++;
+				continue;
+			}
+			/* At this point, ref->xid and ref->ino contain XID and inode number.
+			   ref->xd and ref->ic are not valid yet. */
+			xd = jffs2_find_xattr_datum(c, ref->xid);
+			ic = jffs2_get_ino_cache(c, ref->ino);
+			if (!xd || !ic || !ic->pino_nlink) {
+				dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) is orphan.\n",
+					  ref->ino, ref->xid, ref->xseqno);
+				ref->xseqno |= XREF_DELETE_MARKER;
+				ref->next = c->xref_dead_list;
+				c->xref_dead_list = ref;
+				xref_orphan_count++;
+				continue;
+			}
+			ref->xd = xd;
+			ref->ic = ic;
+			atomic_inc(&xd->refcnt);
+			ref->next = ic->xref;
+			ic->xref = ref;
+		}
+	}
+
+	/* Phase.3 : Link unchecked xdatum to xattr_unchecked list */
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			xdatum_count++;
+			list_del_init(&xd->xindex);
+			if (!atomic_read(&xd->refcnt)) {
+				dbg_xattr("xdatum(xid=%u, version=%u) is orphan.\n",
+					  xd->xid, xd->version);
+				xd->flags |= JFFS2_XFLAGS_DEAD;
+				list_add(&xd->xindex, &c->xattr_unchecked);
+				xdatum_orphan_count++;
+				continue;
+			}
+			if (is_xattr_datum_unchecked(c, xd)) {
+				dbg_xattr("unchecked xdatum(xid=%u, version=%u)\n",
+					  xd->xid, xd->version);
+				list_add(&xd->xindex, &c->xattr_unchecked);
+				xdatum_unchecked_count++;
+			}
+		}
+	}
+	/* build complete */
+	JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum"
+		     " (%u unchecked, %u orphan) and "
+		     "%u of xref (%u dead, %u orphan) found.\n",
+		     xdatum_count, xdatum_unchecked_count, xdatum_orphan_count,
+		     xref_count, xref_dead_count, xref_orphan_count);
+}
+
+struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+						  uint32_t xid, uint32_t version)
+{
+	struct jffs2_xattr_datum *xd;
+
+	xd = jffs2_find_xattr_datum(c, xid);
+	if (!xd) {
+		xd = jffs2_alloc_xattr_datum();
+		if (!xd)
+			return ERR_PTR(-ENOMEM);
+		xd->xid = xid;
+		xd->version = version;
+		if (xd->xid > c->highest_xid)
+			c->highest_xid = xd->xid;
+		list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
+	}
+	return xd;
+}
+
+/* -------- xattr subsystem functions ---------------
+ * xprefix_to_handler(xprefix)
+ *   is used to translate xprefix into xattr_handler.
+ * jffs2_listxattr(dentry, buffer, size)
+ *   is an implementation of listxattr handler on jffs2.
+ * do_jffs2_getxattr(inode, xprefix, xname, buffer, size)
+ *   is an implementation of getxattr handler on jffs2.
+ * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
+ *   is an implementation of setxattr handler on jffs2.
+ * -------------------------------------------------- */
+const struct xattr_handler *jffs2_xattr_handlers[] = {
+	&jffs2_user_xattr_handler,
+#ifdef CONFIG_JFFS2_FS_SECURITY
+	&jffs2_security_xattr_handler,
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	&jffs2_trusted_xattr_handler,
+	NULL
+};
+
+static const struct xattr_handler *xprefix_to_handler(int xprefix) {
+	const struct xattr_handler *ret;
+
+	switch (xprefix) {
+	case JFFS2_XPREFIX_USER:
+		ret = &jffs2_user_xattr_handler;
+		break;
+#ifdef CONFIG_JFFS2_FS_SECURITY
+	case JFFS2_XPREFIX_SECURITY:
+		ret = &jffs2_security_xattr_handler;
+		break;
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	case JFFS2_XPREFIX_ACL_ACCESS:
+		ret = &posix_acl_access_xattr_handler;
+		break;
+	case JFFS2_XPREFIX_ACL_DEFAULT:
+		ret = &posix_acl_default_xattr_handler;
+		break;
+#endif
+	case JFFS2_XPREFIX_TRUSTED:
+		ret = &jffs2_trusted_xattr_handler;
+		break;
+	default:
+		ret = NULL;
+		break;
+	}
+	return ret;
+}
+
+ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_ref *ref, **pref;
+	struct jffs2_xattr_datum *xd;
+	const struct xattr_handler *xhandle;
+	ssize_t len, rc;
+	int retry = 0;
+
+	rc = check_xattr_ref_inode(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	down_read(&c->xattr_sem);
+ retry:
+	len = 0;
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+		BUG_ON(ref->ic != ic);
+		xd = ref->xd;
+		if (!xd->xname) {
+			/* xdatum is unchached */
+			if (!retry) {
+				retry = 1;
+				up_read(&c->xattr_sem);
+				down_write(&c->xattr_sem);
+				goto retry;
+			} else {
+				rc = load_xattr_datum(c, xd);
+				if (unlikely(rc > 0)) {
+					*pref = ref->next;
+					delete_xattr_ref(c, ref);
+					goto retry;
+				} else if (unlikely(rc < 0))
+					goto out;
+			}
+		}
+		xhandle = xprefix_to_handler(xd->xprefix);
+		if (!xhandle)
+			continue;
+		if (buffer) {
+			rc = xhandle->list(dentry, buffer+len, size-len,
+					   xd->xname, xd->name_len, xd->flags);
+		} else {
+			rc = xhandle->list(dentry, NULL, 0, xd->xname,
+					   xd->name_len, xd->flags);
+		}
+		if (rc < 0)
+			goto out;
+		len += rc;
+	}
+	rc = len;
+ out:
+	if (!retry) {
+		up_read(&c->xattr_sem);
+	} else {
+		up_write(&c->xattr_sem);
+	}
+	return rc;
+}
+
+int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+		      char *buffer, size_t size)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, **pref;
+	int rc, retry = 0;
+
+	rc = check_xattr_ref_inode(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	down_read(&c->xattr_sem);
+ retry:
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+		BUG_ON(ref->ic!=ic);
+
+		xd = ref->xd;
+		if (xd->xprefix != xprefix)
+			continue;
+		if (!xd->xname) {
+			/* xdatum is unchached */
+			if (!retry) {
+				retry = 1;
+				up_read(&c->xattr_sem);
+				down_write(&c->xattr_sem);
+				goto retry;
+			} else {
+				rc = load_xattr_datum(c, xd);
+				if (unlikely(rc > 0)) {
+					*pref = ref->next;
+					delete_xattr_ref(c, ref);
+					goto retry;
+				} else if (unlikely(rc < 0)) {
+					goto out;
+				}
+			}
+		}
+		if (!strcmp(xname, xd->xname)) {
+			rc = xd->value_len;
+			if (buffer) {
+				if (size < rc) {
+					rc = -ERANGE;
+				} else {
+					memcpy(buffer, xd->xvalue, rc);
+				}
+			}
+			goto out;
+		}
+	}
+	rc = -ENODATA;
+ out:
+	if (!retry) {
+		up_read(&c->xattr_sem);
+	} else {
+		up_write(&c->xattr_sem);
+	}
+	return rc;
+}
+
+int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+		      const char *buffer, size_t size, int flags)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, *newref, **pref;
+	uint32_t length, request;
+	int rc;
+
+	rc = check_xattr_ref_inode(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size);
+	rc = jffs2_reserve_space(c, request, &length,
+				 ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+		return rc;
+	}
+
+	/* Find existing xattr */
+	down_write(&c->xattr_sem);
+ retry:
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+		xd = ref->xd;
+		if (xd->xprefix != xprefix)
+			continue;
+		if (!xd->xname) {
+			rc = load_xattr_datum(c, xd);
+			if (unlikely(rc > 0)) {
+				*pref = ref->next;
+				delete_xattr_ref(c, ref);
+				goto retry;
+			} else if (unlikely(rc < 0))
+				goto out;
+		}
+		if (!strcmp(xd->xname, xname)) {
+			if (flags & XATTR_CREATE) {
+				rc = -EEXIST;
+				goto out;
+			}
+			if (!buffer) {
+				ref->ino = ic->ino;
+				ref->xid = xd->xid;
+				ref->xseqno |= XREF_DELETE_MARKER;
+				rc = save_xattr_ref(c, ref);
+				if (!rc) {
+					*pref = ref->next;
+					spin_lock(&c->erase_completion_lock);
+					ref->next = c->xref_dead_list;
+					c->xref_dead_list = ref;
+					spin_unlock(&c->erase_completion_lock);
+					unrefer_xattr_datum(c, xd);
+				} else {
+					ref->ic = ic;
+					ref->xd = xd;
+					ref->xseqno &= ~XREF_DELETE_MARKER;
+				}
+				goto out;
+			}
+			goto found;
+		}
+	}
+	/* not found */
+	if (flags & XATTR_REPLACE) {
+		rc = -ENODATA;
+		goto out;
+	}
+	if (!buffer) {
+		rc = -ENODATA;
+		goto out;
+	}
+ found:
+	xd = create_xattr_datum(c, xprefix, xname, buffer, size);
+	if (IS_ERR(xd)) {
+		rc = PTR_ERR(xd);
+		goto out;
+	}
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+
+	/* create xattr_ref */
+	request = PAD(sizeof(struct jffs2_raw_xref));
+	rc = jffs2_reserve_space(c, request, &length,
+				 ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE);
+	down_write(&c->xattr_sem);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+		unrefer_xattr_datum(c, xd);
+		up_write(&c->xattr_sem);
+		return rc;
+	}
+	if (ref)
+		*pref = ref->next;
+	newref = create_xattr_ref(c, ic, xd);
+	if (IS_ERR(newref)) {
+		if (ref) {
+			ref->next = ic->xref;
+			ic->xref = ref;
+		}
+		rc = PTR_ERR(newref);
+		unrefer_xattr_datum(c, xd);
+	} else if (ref) {
+		delete_xattr_ref(c, ref);
+	}
+ out:
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+	return rc;
+}
+
+/* -------- garbage collector functions -------------
+ * jffs2_garbage_collect_xattr_datum(c, xd, raw)
+ *   is used to move xdatum into new node.
+ * jffs2_garbage_collect_xattr_ref(c, ref, raw)
+ *   is used to move xref into new node.
+ * jffs2_verify_xattr(c)
+ *   is used to call do_verify_xattr_datum() before garbage collecting.
+ * jffs2_release_xattr_datum(c, xd)
+ *   is used to release an in-memory object of xdatum.
+ * jffs2_release_xattr_ref(c, ref)
+ *   is used to release an in-memory object of xref.
+ * -------------------------------------------------- */
+int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
+				      struct jffs2_raw_node_ref *raw)
+{
+	uint32_t totlen, length, old_ofs;
+	int rc = 0;
+
+	down_write(&c->xattr_sem);
+	if (xd->node != raw)
+		goto out;
+	if (xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID))
+		goto out;
+
+	rc = load_xattr_datum(c, xd);
+	if (unlikely(rc)) {
+		rc = (rc > 0) ? 0 : rc;
+		goto out;
+	}
+	old_ofs = ref_offset(xd->node);
+	totlen = PAD(sizeof(struct jffs2_raw_xattr)
+			+ xd->name_len + 1 + xd->value_len);
+	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen);
+		goto out;
+	}
+	rc = save_xattr_datum(c, xd);
+	if (!rc)
+		dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
+			  xd->xid, xd->version, old_ofs, ref_offset(xd->node));
+ out:
+	if (!rc)
+		jffs2_mark_node_obsolete(c, raw);
+	up_write(&c->xattr_sem);
+	return rc;
+}
+
+int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
+				    struct jffs2_raw_node_ref *raw)
+{
+	uint32_t totlen, length, old_ofs;
+	int rc = 0;
+
+	down_write(&c->xattr_sem);
+	BUG_ON(!ref->node);
+
+	if (ref->node != raw)
+		goto out;
+	if (is_xattr_ref_dead(ref) && (raw->next_in_ino == (void *)ref))
+		goto out;
+
+	old_ofs = ref_offset(ref->node);
+	totlen = ref_totlen(c, c->gcblock, ref->node);
+
+	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE);
+	if (rc) {
+		JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n",
+			      __func__, rc, totlen);
+		goto out;
+	}
+	rc = save_xattr_ref(c, ref);
+	if (!rc)
+		dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
+			  ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
+ out:
+	if (!rc)
+		jffs2_mark_node_obsolete(c, raw);
+	up_write(&c->xattr_sem);
+	return rc;
+}
+
+int jffs2_verify_xattr(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
+	uint32_t totlen;
+	int rc;
+
+	down_write(&c->xattr_sem);
+	list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) {
+		rc = do_verify_xattr_datum(c, xd);
+		if (rc < 0)
+			continue;
+		list_del_init(&xd->xindex);
+		spin_lock(&c->erase_completion_lock);
+		for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+			if (ref_flags(raw) != REF_UNCHECKED)
+				continue;
+			jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+			totlen = PAD(ref_totlen(c, jeb, raw));
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+			raw->flash_offset = ref_offset(raw)
+				| ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL);
+		}
+		if (xd->flags & JFFS2_XFLAGS_DEAD)
+			list_add(&xd->xindex, &c->xattr_dead_list);
+		spin_unlock(&c->erase_completion_lock);
+	}
+	up_write(&c->xattr_sem);
+	return list_empty(&c->xattr_unchecked) ? 1 : 0;
+}
+
+void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under spin_lock(&c->erase_completion_lock) */
+	if (atomic_read(&xd->refcnt) || xd->node != (void *)xd)
+		return;
+
+	list_del(&xd->xindex);
+	jffs2_free_xattr_datum(xd);
+}
+
+void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under spin_lock(&c->erase_completion_lock) */
+	struct jffs2_xattr_ref *tmp, **ptmp;
+
+	if (ref->node != (void *)ref)
+		return;
+
+	for (tmp=c->xref_dead_list, ptmp=&c->xref_dead_list; tmp; ptmp=&tmp->next, tmp=tmp->next) {
+		if (ref == tmp) {
+			*ptmp = tmp->next;
+			break;
+		}
+	}
+	jffs2_free_xattr_ref(ref);
+}
diff --git a/kernel/fs/jffs2/xattr.h b/kernel/fs/jffs2/xattr.h
new file mode 100644
index 000000000..467ff376e
--- /dev/null
+++ b/kernel/fs/jffs2/xattr.h
@@ -0,0 +1,133 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#ifndef _JFFS2_FS_XATTR_H_
+#define _JFFS2_FS_XATTR_H_
+
+#include <linux/xattr.h>
+#include <linux/list.h>
+
+#define JFFS2_XFLAGS_HOT	(0x01)	/* This datum is HOT */
+#define JFFS2_XFLAGS_BIND	(0x02)	/* This datum is not reclaimed */
+#define JFFS2_XFLAGS_DEAD	(0x40)	/* This datum is already dead */
+#define JFFS2_XFLAGS_INVALID	(0x80)	/* This datum contains crc error */
+
+struct jffs2_xattr_datum
+{
+	void *always_null;
+	struct jffs2_raw_node_ref *node;
+	uint8_t class;
+	uint8_t flags;
+	uint16_t xprefix;		/* see JFFS2_XATTR_PREFIX_* */
+
+	struct list_head xindex;	/* chained from c->xattrindex[n] */
+	atomic_t refcnt;		/* # of xattr_ref refers this */
+	uint32_t xid;
+	uint32_t version;
+
+	uint32_t data_crc;
+	uint32_t hashkey;
+	char *xname;		/* XATTR name without prefix */
+	uint32_t name_len;	/* length of xname */
+	char *xvalue;		/* XATTR value */
+	uint32_t value_len;	/* length of xvalue */
+};
+
+struct jffs2_inode_cache;
+struct jffs2_xattr_ref
+{
+	void *always_null;
+	struct jffs2_raw_node_ref *node;
+	uint8_t class;
+	uint8_t flags;		/* Currently unused */
+	u16 unused;
+
+	uint32_t xseqno;
+	union {
+		struct jffs2_inode_cache *ic;	/* reference to jffs2_inode_cache */
+		uint32_t ino;			/* only used in scanning/building  */
+	};
+	union {
+		struct jffs2_xattr_datum *xd;	/* reference to jffs2_xattr_datum */
+		uint32_t xid;			/* only used in sccanning/building */
+	};
+	struct jffs2_xattr_ref *next;		/* chained from ic->xref_list */
+};
+
+#define XREF_DELETE_MARKER	(0x00000001)
+static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref)
+{
+	return ((ref->xseqno & XREF_DELETE_MARKER) != 0);
+}
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+
+extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c);
+
+extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+							 uint32_t xid, uint32_t version);
+
+extern void jffs2_xattr_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+
+extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
+					     struct jffs2_raw_node_ref *raw);
+extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
+					   struct jffs2_raw_node_ref *raw);
+extern int jffs2_verify_xattr(struct jffs2_sb_info *c);
+extern void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd);
+extern void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref);
+
+extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+			     char *buffer, size_t size);
+extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+			     const char *buffer, size_t size, int flags);
+
+extern const struct xattr_handler *jffs2_xattr_handlers[];
+extern const struct xattr_handler jffs2_user_xattr_handler;
+extern const struct xattr_handler jffs2_trusted_xattr_handler;
+
+extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
+#define jffs2_getxattr		generic_getxattr
+#define jffs2_setxattr		generic_setxattr
+#define jffs2_removexattr	generic_removexattr
+
+#else
+
+#define jffs2_init_xattr_subsystem(c)
+#define jffs2_build_xattr_subsystem(c)
+#define jffs2_clear_xattr_subsystem(c)
+
+#define jffs2_xattr_do_crccheck_inode(c, ic)
+#define jffs2_xattr_delete_inode(c, ic)
+#define jffs2_xattr_free_inode(c, ic)
+#define jffs2_verify_xattr(c)			(1)
+
+#define jffs2_xattr_handlers	NULL
+#define jffs2_listxattr		NULL
+#define jffs2_getxattr		NULL
+#define jffs2_setxattr		NULL
+#define jffs2_removexattr	NULL
+
+#endif /* CONFIG_JFFS2_FS_XATTR */
+
+#ifdef CONFIG_JFFS2_FS_SECURITY
+extern int jffs2_init_security(struct inode *inode, struct inode *dir,
+			       const struct qstr *qstr);
+extern const struct xattr_handler jffs2_security_xattr_handler;
+#else
+#define jffs2_init_security(inode,dir,qstr)	(0)
+#endif /* CONFIG_JFFS2_FS_SECURITY */
+
+#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/kernel/fs/jffs2/xattr_trusted.c b/kernel/fs/jffs2/xattr_trusted.c
new file mode 100644
index 000000000..ceaf9c693
--- /dev/null
+++ b/kernel/fs/jffs2/xattr_trusted.c
@@ -0,0 +1,55 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+				 name, buffer, size);
+}
+
+static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+				 name, buffer, size, flags);
+}
+
+static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen<=list_size) {
+		strcpy(list, XATTR_TRUSTED_PREFIX);
+		strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+const struct xattr_handler jffs2_trusted_xattr_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.list = jffs2_trusted_listxattr,
+	.set = jffs2_trusted_setxattr,
+	.get = jffs2_trusted_getxattr
+};
diff --git a/kernel/fs/jffs2/xattr_user.c b/kernel/fs/jffs2/xattr_user.c
new file mode 100644
index 000000000..a71391eba
--- /dev/null
+++ b/kernel/fs/jffs2/xattr_user.c
@@ -0,0 +1,55 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+				 name, buffer, size);
+}
+
+static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+				 name, buffer, size, flags);
+}
+
+static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen <= list_size) {
+		strcpy(list, XATTR_USER_PREFIX);
+		strcpy(list + XATTR_USER_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+const struct xattr_handler jffs2_user_xattr_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.list = jffs2_user_listxattr,
+	.set = jffs2_user_setxattr,
+	.get = jffs2_user_getxattr
+};
diff --git a/kernel/fs/jfs/Kconfig b/kernel/fs/jfs/Kconfig
new file mode 100644
index 000000000..57cef1995
--- /dev/null
+++ b/kernel/fs/jfs/Kconfig
@@ -0,0 +1,50 @@
+config JFS_FS
+	tristate "JFS filesystem support"
+	select NLS
+	select CRC32
+	help
+	  This is a port of IBM's Journaled Filesystem .  More information is
+	  available in the file <file:Documentation/filesystems/jfs.txt>.
+
+	  If you do not intend to use the JFS filesystem, say N.
+
+config JFS_POSIX_ACL
+	bool "JFS POSIX Access Control Lists"
+	depends on JFS_FS
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config JFS_SECURITY
+	bool "JFS Security Labels"
+	depends on JFS_FS
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jfs filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config JFS_DEBUG
+	bool "JFS debugging"
+	depends on JFS_FS
+	help
+	  If you are experiencing any problems with the JFS filesystem, say
+	  Y here.  This will result in additional debugging messages to be
+	  written to the system log.  Under normal circumstances, this
+	  results in very little overhead.
+
+config JFS_STATISTICS
+	bool "JFS statistics"
+	depends on JFS_FS
+	help
+	  Enabling this option will cause statistics from the JFS file system
+	  to be made available to the user in the /proc/fs/jfs/ directory.
diff --git a/kernel/fs/jfs/Makefile b/kernel/fs/jfs/Makefile
new file mode 100644
index 000000000..d20d4737b
--- /dev/null
+++ b/kernel/fs/jfs/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for the Linux JFS filesystem routines.
+#
+
+obj-$(CONFIG_JFS_FS) += jfs.o
+
+jfs-y    := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
+	    jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \
+	    jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \
+	    jfs_extent.o symlink.o jfs_metapage.o \
+	    jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \
+	    resize.o xattr.o ioctl.o
+
+jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
+
+ccflags-y := -D_JFS_4K
diff --git a/kernel/fs/jfs/acl.c b/kernel/fs/jfs/acl.c
new file mode 100644
index 000000000..0c8ca830b
--- /dev/null
+++ b/kernel/fs/jfs/acl.c
@@ -0,0 +1,161 @@
+/*
+ *   Copyright (C) International Business Machines  Corp., 2002-2004
+ *   Copyright (C) Andreas Gruenbacher, 2001
+ *   Copyright (C) Linus Torvalds, 1991, 1992
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/posix_acl_xattr.h>
+#include "jfs_incore.h"
+#include "jfs_txnmgr.h"
+#include "jfs_xattr.h"
+#include "jfs_acl.h"
+
+struct posix_acl *jfs_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	char *ea_name;
+	int size;
+	char *value = NULL;
+
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			ea_name = POSIX_ACL_XATTR_ACCESS;
+			break;
+		case ACL_TYPE_DEFAULT:
+			ea_name = POSIX_ACL_XATTR_DEFAULT;
+			break;
+		default:
+			return ERR_PTR(-EINVAL);
+	}
+
+	size = __jfs_getxattr(inode, ea_name, NULL, 0);
+
+	if (size > 0) {
+		value = kmalloc(size, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = __jfs_getxattr(inode, ea_name, value, size);
+	}
+
+	if (size < 0) {
+		if (size == -ENODATA)
+			acl = NULL;
+		else
+			acl = ERR_PTR(size);
+	} else {
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+	}
+	kfree(value);
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+	return acl;
+}
+
+static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
+		       struct posix_acl *acl)
+{
+	char *ea_name;
+	int rc;
+	int size = 0;
+	char *value = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		ea_name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			rc = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (rc < 0)
+				return rc;
+			inode->i_ctime = CURRENT_TIME;
+			mark_inode_dirty(inode);
+			if (rc == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		ea_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_KERNEL);
+		if (!value)
+			return -ENOMEM;
+		rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (rc < 0)
+			goto out;
+	}
+	rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0);
+out:
+	kfree(value);
+
+	if (!rc)
+		set_cached_acl(inode, type, acl);
+
+	return rc;
+}
+
+int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int rc;
+	tid_t tid;
+
+	tid = txBegin(inode->i_sb, 0);
+	mutex_lock(&JFS_IP(inode)->commit_mutex);
+	rc = __jfs_set_acl(tid, inode, type, acl);
+	if (!rc)
+		rc = txCommit(tid, 1, &inode, 0);
+	txEnd(tid);
+	mutex_unlock(&JFS_IP(inode)->commit_mutex);
+	return rc;
+}
+
+int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *default_acl, *acl;
+	int rc = 0;
+
+	rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (rc)
+		return rc;
+
+	if (default_acl) {
+		rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl);
+		posix_acl_release(default_acl);
+	}
+
+	if (acl) {
+		if (!rc)
+			rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
+		posix_acl_release(acl);
+	}
+
+	JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
+			       inode->i_mode;
+
+	return rc;
+}
diff --git a/kernel/fs/jfs/file.c b/kernel/fs/jfs/file.c
new file mode 100644
index 000000000..e98d39d75
--- /dev/null
+++ b/kernel/fs/jfs/file.c
@@ -0,0 +1,165 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_dmap.h"
+#include "jfs_txnmgr.h"
+#include "jfs_xattr.h"
+#include "jfs_acl.h"
+#include "jfs_debug.h"
+
+int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	int rc = 0;
+
+	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (rc)
+		return rc;
+
+	mutex_lock(&inode->i_mutex);
+	if (!(inode->i_state & I_DIRTY_ALL) ||
+	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
+		/* Make sure committed changes hit the disk */
+		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
+		mutex_unlock(&inode->i_mutex);
+		return rc;
+	}
+
+	rc |= jfs_commit_inode(inode, 1);
+	mutex_unlock(&inode->i_mutex);
+
+	return rc ? -EIO : 0;
+}
+
+static int jfs_open(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	if ((rc = dquot_file_open(inode, file)))
+		return rc;
+
+	/*
+	 * We attempt to allow only one "active" file open per aggregate
+	 * group.  Otherwise, appending to files in parallel can cause
+	 * fragmentation within the files.
+	 *
+	 * If the file is empty, it was probably just created and going
+	 * to be written to.  If it has a size, we'll hold off until the
+	 * file is actually grown.
+	 */
+	if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE &&
+	    (inode->i_size == 0)) {
+		struct jfs_inode_info *ji = JFS_IP(inode);
+		spin_lock_irq(&ji->ag_lock);
+		if (ji->active_ag == -1) {
+			struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb);
+			ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb);
+			atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]);
+		}
+		spin_unlock_irq(&ji->ag_lock);
+	}
+
+	return 0;
+}
+static int jfs_release(struct inode *inode, struct file *file)
+{
+	struct jfs_inode_info *ji = JFS_IP(inode);
+
+	spin_lock_irq(&ji->ag_lock);
+	if (ji->active_ag != -1) {
+		struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
+		atomic_dec(&bmap->db_active[ji->active_ag]);
+		ji->active_ag = -1;
+	}
+	spin_unlock_irq(&ji->ag_lock);
+
+	return 0;
+}
+
+int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = d_inode(dentry);
+	int rc;
+
+	rc = inode_change_ok(inode, iattr);
+	if (rc)
+		return rc;
+
+	if (is_quota_modification(inode, iattr))
+		dquot_initialize(inode);
+	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
+	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
+		rc = dquot_transfer(inode, iattr);
+		if (rc)
+			return rc;
+	}
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		inode_dio_wait(inode);
+
+		rc = inode_newsize_ok(inode, iattr->ia_size);
+		if (rc)
+			return rc;
+
+		truncate_setsize(inode, iattr->ia_size);
+		jfs_truncate(inode);
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	if (iattr->ia_valid & ATTR_MODE)
+		rc = posix_acl_chmod(inode, inode->i_mode);
+	return rc;
+}
+
+const struct inode_operations jfs_file_inode_operations = {
+	.setxattr	= jfs_setxattr,
+	.getxattr	= jfs_getxattr,
+	.listxattr	= jfs_listxattr,
+	.removexattr	= jfs_removexattr,
+	.setattr	= jfs_setattr,
+#ifdef CONFIG_JFS_POSIX_ACL
+	.get_acl	= jfs_get_acl,
+	.set_acl	= jfs_set_acl,
+#endif
+};
+
+const struct file_operations jfs_file_operations = {
+	.open		= jfs_open,
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fsync		= jfs_fsync,
+	.release	= jfs_release,
+	.unlocked_ioctl = jfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= jfs_compat_ioctl,
+#endif
+};
diff --git a/kernel/fs/jfs/inode.c b/kernel/fs/jfs/inode.c
new file mode 100644
index 000000000..070dc4b33
--- /dev/null
+++ b/kernel/fs/jfs/inode.c
@@ -0,0 +1,423 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/mpage.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_filsys.h"
+#include "jfs_imap.h"
+#include "jfs_extent.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+
+
+struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode;
+	int ret;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	ret = diRead(inode);
+	if (ret < 0) {
+		iget_failed(inode);
+		return ERR_PTR(ret);
+	}
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &jfs_file_inode_operations;
+		inode->i_fop = &jfs_file_operations;
+		inode->i_mapping->a_ops = &jfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &jfs_dir_inode_operations;
+		inode->i_fop = &jfs_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (inode->i_size >= IDATASIZE) {
+			inode->i_op = &page_symlink_inode_operations;
+			inode->i_mapping->a_ops = &jfs_aops;
+		} else {
+			inode->i_op = &jfs_fast_symlink_inode_operations;
+			/*
+			 * The inline data should be null-terminated, but
+			 * don't let on-disk corruption crash the kernel
+			 */
+			JFS_IP(inode)->i_inline[inode->i_size] = '\0';
+		}
+	} else {
+		inode->i_op = &jfs_file_inode_operations;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+	}
+	unlock_new_inode(inode);
+	return inode;
+}
+
+/*
+ * Workhorse of both fsync & write_inode
+ */
+int jfs_commit_inode(struct inode *inode, int wait)
+{
+	int rc = 0;
+	tid_t tid;
+	static int noisy = 5;
+
+	jfs_info("In jfs_commit_inode, inode = 0x%p", inode);
+
+	/*
+	 * Don't commit if inode has been committed since last being
+	 * marked dirty, or if it has been deleted.
+	 */
+	if (inode->i_nlink == 0 || !test_cflag(COMMIT_Dirty, inode))
+		return 0;
+
+	if (isReadOnly(inode)) {
+		/* kernel allows writes to devices on read-only
+		 * partitions and may think inode is dirty
+		 */
+		if (!special_file(inode->i_mode) && noisy) {
+			jfs_err("jfs_commit_inode(0x%p) called on "
+				   "read-only volume", inode);
+			jfs_err("Is remount racy?");
+			noisy--;
+		}
+		return 0;
+	}
+
+	tid = txBegin(inode->i_sb, COMMIT_INODE);
+	mutex_lock(&JFS_IP(inode)->commit_mutex);
+
+	/*
+	 * Retest inode state after taking commit_mutex
+	 */
+	if (inode->i_nlink && test_cflag(COMMIT_Dirty, inode))
+		rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0);
+
+	txEnd(tid);
+	mutex_unlock(&JFS_IP(inode)->commit_mutex);
+	return rc;
+}
+
+int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int wait = wbc->sync_mode == WB_SYNC_ALL;
+
+	if (inode->i_nlink == 0)
+		return 0;
+	/*
+	 * If COMMIT_DIRTY is not set, the inode isn't really dirty.
+	 * It has been committed since the last change, but was still
+	 * on the dirty inode list.
+	 */
+	 if (!test_cflag(COMMIT_Dirty, inode)) {
+		/* Make sure committed changes hit the disk */
+		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait);
+		return 0;
+	 }
+
+	if (jfs_commit_inode(inode, wait)) {
+		jfs_err("jfs_write_inode: jfs_commit_inode failed!");
+		return -EIO;
+	} else
+		return 0;
+}
+
+void jfs_evict_inode(struct inode *inode)
+{
+	jfs_info("In jfs_evict_inode, inode = 0x%p", inode);
+
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		dquot_initialize(inode);
+
+		if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
+			truncate_inode_pages_final(&inode->i_data);
+
+			if (test_cflag(COMMIT_Freewmap, inode))
+				jfs_free_zero_link(inode);
+
+			diFree(inode);
+
+			/*
+			 * Free the inode from the quota allocation.
+			 */
+			dquot_initialize(inode);
+			dquot_free_inode(inode);
+		}
+	} else {
+		truncate_inode_pages_final(&inode->i_data);
+	}
+	clear_inode(inode);
+	dquot_drop(inode);
+}
+
+void jfs_dirty_inode(struct inode *inode, int flags)
+{
+	static int noisy = 5;
+
+	if (isReadOnly(inode)) {
+		if (!special_file(inode->i_mode) && noisy) {
+			/* kernel allows writes to devices on read-only
+			 * partitions and may try to mark inode dirty
+			 */
+			jfs_err("jfs_dirty_inode called on read-only volume");
+			jfs_err("Is remount racy?");
+			noisy--;
+		}
+		return;
+	}
+
+	set_cflag(COMMIT_Dirty, inode);
+}
+
+int jfs_get_block(struct inode *ip, sector_t lblock,
+		  struct buffer_head *bh_result, int create)
+{
+	s64 lblock64 = lblock;
+	int rc = 0;
+	xad_t xad;
+	s64 xaddr;
+	int xflag;
+	s32 xlen = bh_result->b_size >> ip->i_blkbits;
+
+	/*
+	 * Take appropriate lock on inode
+	 */
+	if (create)
+		IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
+	else
+		IREAD_LOCK(ip, RDWRLOCK_NORMAL);
+
+	if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
+	    (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) &&
+	    xaddr) {
+		if (xflag & XAD_NOTRECORDED) {
+			if (!create)
+				/*
+				 * Allocated but not recorded, read treats
+				 * this as a hole
+				 */
+				goto unlock;
+#ifdef _JFS_4K
+			XADoffset(&xad, lblock64);
+			XADlength(&xad, xlen);
+			XADaddress(&xad, xaddr);
+#else				/* _JFS_4K */
+			/*
+			 * As long as block size = 4K, this isn't a problem.
+			 * We should mark the whole page not ABNR, but how
+			 * will we know to mark the other blocks BH_New?
+			 */
+			BUG();
+#endif				/* _JFS_4K */
+			rc = extRecord(ip, &xad);
+			if (rc)
+				goto unlock;
+			set_buffer_new(bh_result);
+		}
+
+		map_bh(bh_result, ip->i_sb, xaddr);
+		bh_result->b_size = xlen << ip->i_blkbits;
+		goto unlock;
+	}
+	if (!create)
+		goto unlock;
+
+	/*
+	 * Allocate a new block
+	 */
+#ifdef _JFS_4K
+	if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
+		goto unlock;
+	rc = extAlloc(ip, xlen, lblock64, &xad, false);
+	if (rc)
+		goto unlock;
+
+	set_buffer_new(bh_result);
+	map_bh(bh_result, ip->i_sb, addressXAD(&xad));
+	bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits;
+
+#else				/* _JFS_4K */
+	/*
+	 * We need to do whatever it takes to keep all but the last buffers
+	 * in 4K pages - see jfs_write.c
+	 */
+	BUG();
+#endif				/* _JFS_4K */
+
+      unlock:
+	/*
+	 * Release lock on inode
+	 */
+	if (create)
+		IWRITE_UNLOCK(ip);
+	else
+		IREAD_UNLOCK(ip);
+	return rc;
+}
+
+static int jfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, jfs_get_block, wbc);
+}
+
+static int jfs_writepages(struct address_space *mapping,
+			struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, jfs_get_block);
+}
+
+static int jfs_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, jfs_get_block);
+}
+
+static int jfs_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
+}
+
+static void jfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		jfs_truncate(inode);
+	}
+}
+
+static int jfs_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
+				jfs_get_block);
+	if (unlikely(ret))
+		jfs_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, jfs_get_block);
+}
+
+static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			     loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = file->f_mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + count;
+
+		if (end > isize)
+			jfs_write_failed(mapping, end);
+	}
+
+	return ret;
+}
+
+const struct address_space_operations jfs_aops = {
+	.readpage	= jfs_readpage,
+	.readpages	= jfs_readpages,
+	.writepage	= jfs_writepage,
+	.writepages	= jfs_writepages,
+	.write_begin	= jfs_write_begin,
+	.write_end	= nobh_write_end,
+	.bmap		= jfs_bmap,
+	.direct_IO	= jfs_direct_IO,
+};
+
+/*
+ * Guts of jfs_truncate.  Called with locks already held.  Can be called
+ * with directory for truncating directory index table.
+ */
+void jfs_truncate_nolock(struct inode *ip, loff_t length)
+{
+	loff_t newsize;
+	tid_t tid;
+
+	ASSERT(length >= 0);
+
+	if (test_cflag(COMMIT_Nolink, ip)) {
+		xtTruncate(0, ip, length, COMMIT_WMAP);
+		return;
+	}
+
+	do {
+		tid = txBegin(ip->i_sb, 0);
+
+		/*
+		 * The commit_mutex cannot be taken before txBegin.
+		 * txBegin may block and there is a chance the inode
+		 * could be marked dirty and need to be committed
+		 * before txBegin unblocks
+		 */
+		mutex_lock(&JFS_IP(ip)->commit_mutex);
+
+		newsize = xtTruncate(tid, ip, length,
+				     COMMIT_TRUNCATE | COMMIT_PWMAP);
+		if (newsize < 0) {
+			txEnd(tid);
+			mutex_unlock(&JFS_IP(ip)->commit_mutex);
+			break;
+		}
+
+		ip->i_mtime = ip->i_ctime = CURRENT_TIME;
+		mark_inode_dirty(ip);
+
+		txCommit(tid, 1, &ip, 0);
+		txEnd(tid);
+		mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	} while (newsize > length);	/* Truncate isn't always atomic */
+}
+
+void jfs_truncate(struct inode *ip)
+{
+	jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size);
+
+	nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block);
+
+	IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
+	jfs_truncate_nolock(ip, ip->i_size);
+	IWRITE_UNLOCK(ip);
+}
diff --git a/kernel/fs/jfs/ioctl.c b/kernel/fs/jfs/ioctl.c
new file mode 100644
index 000000000..93a123289
--- /dev/null
+++ b/kernel/fs/jfs/ioctl.c
@@ -0,0 +1,189 @@
+/*
+ * linux/fs/jfs/ioctl.c
+ *
+ * Copyright (C) 2006 Herbert Poetzl
+ * adapted from Remy Card's ext2/ioctl.c
+ */
+
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/capability.h>
+#include <linux/mount.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+
+#include "jfs_filsys.h"
+#include "jfs_debug.h"
+#include "jfs_incore.h"
+#include "jfs_dinode.h"
+#include "jfs_inode.h"
+#include "jfs_dmap.h"
+#include "jfs_discard.h"
+
+static struct {
+	long jfs_flag;
+	long ext2_flag;
+} jfs_map[] = {
+	{JFS_NOATIME_FL,	FS_NOATIME_FL},
+	{JFS_DIRSYNC_FL,	FS_DIRSYNC_FL},
+	{JFS_SYNC_FL,		FS_SYNC_FL},
+	{JFS_SECRM_FL,		FS_SECRM_FL},
+	{JFS_UNRM_FL,		FS_UNRM_FL},
+	{JFS_APPEND_FL,		FS_APPEND_FL},
+	{JFS_IMMUTABLE_FL,	FS_IMMUTABLE_FL},
+	{0, 0},
+};
+
+static long jfs_map_ext2(unsigned long flags, int from)
+{
+	int index=0;
+	long mapped=0;
+
+	while (jfs_map[index].jfs_flag) {
+		if (from) {
+			if (jfs_map[index].ext2_flag & flags)
+				mapped |= jfs_map[index].jfs_flag;
+		} else {
+			if (jfs_map[index].jfs_flag & flags)
+				mapped |= jfs_map[index].ext2_flag;
+		}
+		index++;
+	}
+	return mapped;
+}
+
+
+long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct jfs_inode_info *jfs_inode = JFS_IP(inode);
+	unsigned int flags;
+
+	switch (cmd) {
+	case JFS_IOC_GETFLAGS:
+		jfs_get_inode_flags(jfs_inode);
+		flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
+		flags = jfs_map_ext2(flags, 0);
+		return put_user(flags, (int __user *) arg);
+	case JFS_IOC_SETFLAGS: {
+		unsigned int oldflags;
+		int err;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+
+		if (!inode_owner_or_capable(inode)) {
+			err = -EACCES;
+			goto setflags_out;
+		}
+		if (get_user(flags, (int __user *) arg)) {
+			err = -EFAULT;
+			goto setflags_out;
+		}
+
+		flags = jfs_map_ext2(flags, 1);
+		if (!S_ISDIR(inode->i_mode))
+			flags &= ~JFS_DIRSYNC_FL;
+
+		/* Is it quota file? Do not allow user to mess with it */
+		if (IS_NOQUOTA(inode)) {
+			err = -EPERM;
+			goto setflags_out;
+		}
+
+		/* Lock against other parallel changes of flags */
+		mutex_lock(&inode->i_mutex);
+
+		jfs_get_inode_flags(jfs_inode);
+		oldflags = jfs_inode->mode2;
+
+		/*
+		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+		 * the relevant capability.
+		 */
+		if ((oldflags & JFS_IMMUTABLE_FL) ||
+			((flags ^ oldflags) &
+			(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				mutex_unlock(&inode->i_mutex);
+				err = -EPERM;
+				goto setflags_out;
+			}
+		}
+
+		flags = flags & JFS_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~JFS_FL_USER_MODIFIABLE;
+		jfs_inode->mode2 = flags;
+
+		jfs_set_inode_flags(inode);
+		mutex_unlock(&inode->i_mutex);
+		inode->i_ctime = CURRENT_TIME_SEC;
+		mark_inode_dirty(inode);
+setflags_out:
+		mnt_drop_write_file(filp);
+		return err;
+	}
+
+	case FITRIM:
+	{
+		struct super_block *sb = inode->i_sb;
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		struct fstrim_range range;
+		s64 ret = 0;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (!blk_queue_discard(q)) {
+			jfs_warn("FITRIM not supported on device");
+			return -EOPNOTSUPP;
+		}
+
+		if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+		    sizeof(range)))
+			return -EFAULT;
+
+		range.minlen = max_t(unsigned int, range.minlen,
+			q->limits.discard_granularity);
+
+		ret = jfs_ioc_trim(inode, &range);
+		if (ret < 0)
+			return ret;
+
+		if (copy_to_user((struct fstrim_range __user *)arg, &range,
+		    sizeof(range)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	/* While these ioctl numbers defined with 'long' and have different
+	 * numbers than the 64bit ABI,
+	 * the actual implementation only deals with ints and is compatible.
+	 */
+	switch (cmd) {
+	case JFS_IOC_GETFLAGS32:
+		cmd = JFS_IOC_GETFLAGS;
+		break;
+	case JFS_IOC_SETFLAGS32:
+		cmd = JFS_IOC_SETFLAGS;
+		break;
+	case FITRIM:
+		cmd = FITRIM;
+		break;
+	}
+	return jfs_ioctl(filp, cmd, arg);
+}
+#endif
diff --git a/kernel/fs/jfs/jfs_acl.h b/kernel/fs/jfs/jfs_acl.h
new file mode 100644
index 000000000..489f993b7
--- /dev/null
+++ b/kernel/fs/jfs/jfs_acl.h
@@ -0,0 +1,36 @@
+/*
+ *   Copyright (C) International Business Machines  Corp., 2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_ACL
+#define _H_JFS_ACL
+
+#ifdef CONFIG_JFS_POSIX_ACL
+
+struct posix_acl *jfs_get_acl(struct inode *inode, int type);
+int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int jfs_init_acl(tid_t, struct inode *, struct inode *);
+
+#else
+
+static inline int jfs_init_acl(tid_t tid, struct inode *inode,
+			       struct inode *dir)
+{
+	return 0;
+}
+
+#endif
+#endif		/* _H_JFS_ACL */
diff --git a/kernel/fs/jfs/jfs_btree.h b/kernel/fs/jfs/jfs_btree.h
new file mode 100644
index 000000000..79c61805b
--- /dev/null
+++ b/kernel/fs/jfs/jfs_btree.h
@@ -0,0 +1,172 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef	_H_JFS_BTREE
+#define _H_JFS_BTREE
+
+/*
+ *	jfs_btree.h: B+-tree
+ *
+ * JFS B+-tree (dtree and xtree) common definitions
+ */
+
+/*
+ *	basic btree page - btpage
+ *
+struct btpage {
+	s64 next;		right sibling bn
+	s64 prev;		left sibling bn
+
+	u8 flag;
+	u8 rsrvd[7];		type specific
+	s64 self;		self address
+
+	u8 entry[4064];
+};						*/
+
+/* btpaget_t flag */
+#define BT_TYPE		0x07	/* B+-tree index */
+#define	BT_ROOT		0x01	/* root page */
+#define	BT_LEAF		0x02	/* leaf page */
+#define	BT_INTERNAL	0x04	/* internal page */
+#define	BT_RIGHTMOST	0x10	/* rightmost page */
+#define	BT_LEFTMOST	0x20	/* leftmost page */
+#define	BT_SWAPPED	0x80	/* used by fsck for endian swapping */
+
+/* btorder (in inode) */
+#define	BT_RANDOM		0x0000
+#define	BT_SEQUENTIAL		0x0001
+#define	BT_LOOKUP		0x0010
+#define	BT_INSERT		0x0020
+#define	BT_DELETE		0x0040
+
+/*
+ *	btree page buffer cache access
+ */
+#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0)
+
+/* get page from buffer page */
+#define BT_PAGE(IP, MP, TYPE, ROOT)\
+	(BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data)
+
+/* get the page buffer and the page for specified block address */
+#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\
+{\
+	if ((BN) == 0)\
+	{\
+		MP = (struct metapage *)&JFS_IP(IP)->bxflag;\
+		P = (TYPE *)&JFS_IP(IP)->ROOT;\
+		RC = 0;\
+	}\
+	else\
+	{\
+		MP = read_metapage((IP), BN, SIZE, 1);\
+		if (MP) {\
+			RC = 0;\
+			P = (MP)->data;\
+		} else {\
+			P = NULL;\
+			jfs_err("bread failed!");\
+			RC = -EIO;\
+		}\
+	}\
+}
+
+#define BT_MARK_DIRTY(MP, IP)\
+{\
+	if (BT_IS_ROOT(MP))\
+		mark_inode_dirty(IP);\
+	else\
+		mark_metapage_dirty(MP);\
+}
+
+/* put the page buffer */
+#define BT_PUTPAGE(MP)\
+{\
+	if (! BT_IS_ROOT(MP)) \
+		release_metapage(MP); \
+}
+
+
+/*
+ *	btree traversal stack
+ *
+ * record the path traversed during the search;
+ * top frame record the leaf page/entry selected.
+ */
+struct btframe {	/* stack frame */
+	s64 bn;			/* 8: */
+	s16 index;		/* 2: */
+	s16 lastindex;		/* 2: unused */
+	struct metapage *mp;	/* 4/8: */
+};				/* (16/24) */
+
+struct btstack {
+	struct btframe *top;
+	int nsplit;
+	struct btframe stack[MAXTREEHEIGHT];
+};
+
+#define BT_CLR(btstack)\
+	(btstack)->top = (btstack)->stack
+
+#define BT_STACK_FULL(btstack)\
+	( (btstack)->top == &((btstack)->stack[MAXTREEHEIGHT-1]))
+
+#define BT_PUSH(BTSTACK, BN, INDEX)\
+{\
+	assert(!BT_STACK_FULL(BTSTACK));\
+	(BTSTACK)->top->bn = BN;\
+	(BTSTACK)->top->index = INDEX;\
+	++(BTSTACK)->top;\
+}
+
+#define BT_POP(btstack)\
+	( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top )
+
+#define BT_STACK(btstack)\
+	( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top )
+
+static inline void BT_STACK_DUMP(struct btstack *btstack)
+{
+	int i;
+	printk("btstack dump:\n");
+	for (i = 0; i < MAXTREEHEIGHT; i++)
+		printk(KERN_ERR "bn = %Lx, index = %d\n",
+		       (long long)btstack->stack[i].bn,
+		       btstack->stack[i].index);
+}
+
+/* retrieve search results */
+#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\
+{\
+	BN = (LEAF)->bn;\
+	MP = (LEAF)->mp;\
+	if (BN)\
+		P = (TYPE *)MP->data;\
+	else\
+		P = (TYPE *)&JFS_IP(IP)->ROOT;\
+	INDEX = (LEAF)->index;\
+}
+
+/* put the page buffer of search */
+#define BT_PUTSEARCH(BTSTACK)\
+{\
+	if (! BT_IS_ROOT((BTSTACK)->top->mp))\
+		release_metapage((BTSTACK)->top->mp);\
+}
+#endif				/* _H_JFS_BTREE */
diff --git a/kernel/fs/jfs/jfs_debug.c b/kernel/fs/jfs/jfs_debug.c
new file mode 100644
index 000000000..dd824d9b0
--- /dev/null
+++ b/kernel/fs/jfs/jfs_debug.c
@@ -0,0 +1,109 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_debug.h"
+
+#ifdef PROC_FS_JFS /* see jfs_debug.h */
+
+static struct proc_dir_entry *base;
+#ifdef CONFIG_JFS_DEBUG
+static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", jfsloglevel);
+	return 0;
+}
+
+static int jfs_loglevel_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, jfs_loglevel_proc_show, NULL);
+}
+
+static ssize_t jfs_loglevel_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
+{
+	char c;
+
+	if (get_user(c, buffer))
+		return -EFAULT;
+
+	/* yes, I know this is an ASCIIism.  --hch */
+	if (c < '0' || c > '9')
+		return -EINVAL;
+	jfsloglevel = c - '0';
+	return count;
+}
+
+static const struct file_operations jfs_loglevel_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= jfs_loglevel_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= jfs_loglevel_proc_write,
+};
+#endif
+
+static struct {
+	const char	*name;
+	const struct file_operations *proc_fops;
+} Entries[] = {
+#ifdef CONFIG_JFS_STATISTICS
+	{ "lmstats",	&jfs_lmstats_proc_fops, },
+	{ "txstats",	&jfs_txstats_proc_fops, },
+	{ "xtstat",	&jfs_xtstat_proc_fops, },
+	{ "mpstat",	&jfs_mpstat_proc_fops, },
+#endif
+#ifdef CONFIG_JFS_DEBUG
+	{ "TxAnchor",	&jfs_txanchor_proc_fops, },
+	{ "loglevel",	&jfs_loglevel_proc_fops }
+#endif
+};
+#define NPROCENT	ARRAY_SIZE(Entries)
+
+void jfs_proc_init(void)
+{
+	int i;
+
+	if (!(base = proc_mkdir("fs/jfs", NULL)))
+		return;
+
+	for (i = 0; i < NPROCENT; i++)
+		proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
+}
+
+void jfs_proc_clean(void)
+{
+	int i;
+
+	if (base) {
+		for (i = 0; i < NPROCENT; i++)
+			remove_proc_entry(Entries[i].name, base);
+		remove_proc_entry("fs/jfs", NULL);
+	}
+}
+
+#endif /* PROC_FS_JFS */
diff --git a/kernel/fs/jfs/jfs_debug.h b/kernel/fs/jfs/jfs_debug.h
new file mode 100644
index 000000000..eafd1300a
--- /dev/null
+++ b/kernel/fs/jfs/jfs_debug.h
@@ -0,0 +1,122 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_DEBUG
+#define _H_JFS_DEBUG
+
+/*
+ *	jfs_debug.h
+ *
+ * global debug message, data structure/macro definitions
+ * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS;
+ */
+
+/*
+ * Create /proc/fs/jfs if procfs is enabled andeither
+ * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined
+ */
+#if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS))
+#define PROC_FS_JFS
+extern void jfs_proc_init(void);
+extern void jfs_proc_clean(void);
+#endif
+
+/*
+ *	assert with traditional printf/panic
+ */
+#define assert(p) do {	\
+	if (!(p)) {	\
+		printk(KERN_CRIT "BUG at %s:%d assert(%s)\n",	\
+		       __FILE__, __LINE__, #p);			\
+		BUG();	\
+	}		\
+} while (0)
+
+/*
+ *	debug ON
+ *	--------
+ */
+#ifdef CONFIG_JFS_DEBUG
+#define ASSERT(p) assert(p)
+
+/* printk verbosity */
+#define JFS_LOGLEVEL_ERR 1
+#define JFS_LOGLEVEL_WARN 2
+#define JFS_LOGLEVEL_DEBUG 3
+#define JFS_LOGLEVEL_INFO 4
+
+extern int jfsloglevel;
+
+extern const struct file_operations jfs_txanchor_proc_fops;
+
+/* information message: e.g., configuration, major event */
+#define jfs_info(fmt, arg...) do {			\
+	if (jfsloglevel >= JFS_LOGLEVEL_INFO)		\
+		printk(KERN_INFO fmt "\n", ## arg);	\
+} while (0)
+
+/* debug message: ad hoc */
+#define jfs_debug(fmt, arg...) do {			\
+	if (jfsloglevel >= JFS_LOGLEVEL_DEBUG)		\
+		printk(KERN_DEBUG fmt "\n", ## arg);	\
+} while (0)
+
+/* warn message: */
+#define jfs_warn(fmt, arg...) do {			\
+	if (jfsloglevel >= JFS_LOGLEVEL_WARN)		\
+		printk(KERN_WARNING fmt "\n", ## arg);	\
+} while (0)
+
+/* error event message: e.g., i/o error */
+#define jfs_err(fmt, arg...) do {			\
+	if (jfsloglevel >= JFS_LOGLEVEL_ERR)		\
+		printk(KERN_ERR fmt "\n", ## arg);	\
+} while (0)
+
+/*
+ *	debug OFF
+ *	---------
+ */
+#else				/* CONFIG_JFS_DEBUG */
+#define ASSERT(p) do {} while (0)
+#define jfs_info(fmt, arg...) do {} while (0)
+#define jfs_debug(fmt, arg...) do {} while (0)
+#define jfs_warn(fmt, arg...) do {} while (0)
+#define jfs_err(fmt, arg...) do {} while (0)
+#endif				/* CONFIG_JFS_DEBUG */
+
+/*
+ *	statistics
+ *	----------
+ */
+#ifdef	CONFIG_JFS_STATISTICS
+extern const struct file_operations jfs_lmstats_proc_fops;
+extern const struct file_operations jfs_txstats_proc_fops;
+extern const struct file_operations jfs_mpstat_proc_fops;
+extern const struct file_operations jfs_xtstat_proc_fops;
+
+#define	INCREMENT(x)		((x)++)
+#define	DECREMENT(x)		((x)--)
+#define	HIGHWATERMARK(x,y)	((x) = max((x), (y)))
+#else
+#define	INCREMENT(x)
+#define	DECREMENT(x)
+#define	HIGHWATERMARK(x,y)
+#endif				/* CONFIG_JFS_STATISTICS */
+
+#endif				/* _H_JFS_DEBUG */
diff --git a/kernel/fs/jfs/jfs_dinode.h b/kernel/fs/jfs/jfs_dinode.h
new file mode 100644
index 000000000..395c4c0d0
--- /dev/null
+++ b/kernel/fs/jfs/jfs_dinode.h
@@ -0,0 +1,176 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_DINODE
+#define _H_JFS_DINODE
+
+/*
+ *	jfs_dinode.h: on-disk inode manager
+ */
+
+#define INODESLOTSIZE		128
+#define L2INODESLOTSIZE		7
+#define log2INODESIZE		9	/* log2(bytes per dinode) */
+
+
+/*
+ *	on-disk inode : 512 bytes
+ *
+ * note: align 64-bit fields on 8-byte boundary.
+ */
+struct dinode {
+	/*
+	 *	I. base area (128 bytes)
+	 *	------------------------
+	 *
+	 * define generic/POSIX attributes
+	 */
+	__le32 di_inostamp;	/* 4: stamp to show inode belongs to fileset */
+	__le32 di_fileset;	/* 4: fileset number */
+	__le32 di_number;	/* 4: inode number, aka file serial number */
+	__le32 di_gen;		/* 4: inode generation number */
+
+	pxd_t di_ixpxd;		/* 8: inode extent descriptor */
+
+	__le64 di_size;		/* 8: size */
+	__le64 di_nblocks;	/* 8: number of blocks allocated */
+
+	__le32 di_nlink;	/* 4: number of links to the object */
+
+	__le32 di_uid;		/* 4: user id of owner */
+	__le32 di_gid;		/* 4: group id of owner */
+
+	__le32 di_mode;		/* 4: attribute, format and permission */
+
+	struct timestruc_t di_atime;	/* 8: time last data accessed */
+	struct timestruc_t di_ctime;	/* 8: time last status changed */
+	struct timestruc_t di_mtime;	/* 8: time last data modified */
+	struct timestruc_t di_otime;	/* 8: time created */
+
+	dxd_t di_acl;		/* 16: acl descriptor */
+
+	dxd_t di_ea;		/* 16: ea descriptor */
+
+	__le32 di_next_index;	/* 4: Next available dir_table index */
+
+	__le32 di_acltype;	/* 4: Type of ACL */
+
+	/*
+	 *	Extension Areas.
+	 *
+	 *	Historically, the inode was partitioned into 4 128-byte areas,
+	 *	the last 3 being defined as unions which could have multiple
+	 *	uses.  The first 96 bytes had been completely unused until
+	 *	an index table was added to the directory.  It is now more
+	 *	useful to describe the last 3/4 of the inode as a single
+	 *	union.  We would probably be better off redesigning the
+	 *	entire structure from scratch, but we don't want to break
+	 *	commonality with OS/2's JFS at this time.
+	 */
+	union {
+		struct {
+			/*
+			 * This table contains the information needed to
+			 * find a directory entry from a 32-bit index.
+			 * If the index is small enough, the table is inline,
+			 * otherwise, an x-tree root overlays this table
+			 */
+			struct dir_table_slot _table[12]; /* 96: inline */
+
+			dtroot_t _dtroot;		/* 288: dtree root */
+		} _dir;					/* (384) */
+#define di_dirtable	u._dir._table
+#define di_dtroot	u._dir._dtroot
+#define di_parent	di_dtroot.header.idotdot
+#define di_DASD		di_dtroot.header.DASD
+
+		struct {
+			union {
+				u8 _data[96];		/* 96: unused */
+				struct {
+					void *_imap;	/* 4: unused */
+					__le32 _gengen;	/* 4: generator */
+				} _imap;
+			} _u1;				/* 96: */
+#define di_gengen	u._file._u1._imap._gengen
+
+			union {
+				xtpage_t _xtroot;
+				struct {
+					u8 unused[16];	/* 16: */
+					dxd_t _dxd;	/* 16: */
+					union {
+						__le32 _rdev;	/* 4: */
+						u8 _fastsymlink[128];
+					} _u;
+					u8 _inlineea[128];
+				} _special;
+			} _u2;
+		} _file;
+#define di_xtroot	u._file._u2._xtroot
+#define di_dxd		u._file._u2._special._dxd
+#define di_btroot	di_xtroot
+#define di_inlinedata	u._file._u2._special._u
+#define di_rdev		u._file._u2._special._u._rdev
+#define di_fastsymlink	u._file._u2._special._u._fastsymlink
+#define di_inlineea	u._file._u2._special._inlineea
+	} u;
+};
+
+/* extended mode bits (on-disk inode di_mode) */
+#define IFJOURNAL	0x00010000	/* journalled file */
+#define ISPARSE		0x00020000	/* sparse file enabled */
+#define INLINEEA	0x00040000	/* inline EA area free */
+#define ISWAPFILE	0x00800000	/* file open for pager swap space */
+
+/* more extended mode bits: attributes for OS/2 */
+#define IREADONLY	0x02000000	/* no write access to file */
+#define IHIDDEN		0x04000000	/* hidden file */
+#define ISYSTEM		0x08000000	/* system file */
+
+#define IDIRECTORY	0x20000000	/* directory (shadow of real bit) */
+#define IARCHIVE	0x40000000	/* file archive bit */
+#define INEWNAME	0x80000000	/* non-8.3 filename format */
+
+#define IRASH		0x4E000000	/* mask for changeable attributes */
+#define ATTRSHIFT	25	/* bits to shift to move attribute
+				   specification to mode position */
+
+/* extended attributes for Linux */
+
+#define JFS_NOATIME_FL		0x00080000 /* do not update atime */
+
+#define JFS_DIRSYNC_FL		0x00100000 /* dirsync behaviour */
+#define JFS_SYNC_FL		0x00200000 /* Synchronous updates */
+#define JFS_SECRM_FL		0x00400000 /* Secure deletion */
+#define JFS_UNRM_FL		0x00800000 /* allow for undelete */
+
+#define JFS_APPEND_FL		0x01000000 /* writes to file may only append */
+#define JFS_IMMUTABLE_FL	0x02000000 /* Immutable file */
+
+#define JFS_FL_USER_VISIBLE	0x03F80000
+#define JFS_FL_USER_MODIFIABLE	0x03F80000
+#define JFS_FL_INHERIT		0x03C80000
+
+/* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */
+#define JFS_IOC_GETFLAGS	_IOR('f', 1, long)
+#define JFS_IOC_SETFLAGS	_IOW('f', 2, long)
+
+#define JFS_IOC_GETFLAGS32	_IOR('f', 1, int)
+#define JFS_IOC_SETFLAGS32	_IOW('f', 2, int)
+
+#endif /*_H_JFS_DINODE */
diff --git a/kernel/fs/jfs/jfs_discard.c b/kernel/fs/jfs/jfs_discard.c
new file mode 100644
index 000000000..dfcd50304
--- /dev/null
+++ b/kernel/fs/jfs/jfs_discard.c
@@ -0,0 +1,121 @@
+/*
+ *   Copyright (C) Tino Reichardt, 2012
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_discard.h"
+#include "jfs_dmap.h"
+#include "jfs_debug.h"
+
+
+/*
+ * NAME:	jfs_issue_discard()
+ *
+ * FUNCTION:	TRIM the specified block range on device, if supported
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode
+ *	blkno	- starting block number to be trimmed (0..N)
+ *	nblocks	- number of blocks to be trimmed
+ *
+ * RETURN VALUES:
+ *	none
+ *
+ * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
+ */
+void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks)
+{
+	struct super_block *sb = ip->i_sb;
+	int r = 0;
+
+	r = sb_issue_discard(sb, blkno, nblocks, GFP_NOFS, 0);
+	if (unlikely(r != 0)) {
+		jfs_err("JFS: sb_issue_discard" \
+			"(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!\n",
+			sb, (unsigned long long)blkno,
+			(unsigned long long)nblocks, r);
+	}
+
+	jfs_info("JFS: sb_issue_discard" \
+		"(%p, %llu, %llu, GFP_NOFS, 0) = %d\n",
+		sb, (unsigned long long)blkno,
+		(unsigned long long)nblocks, r);
+
+	return;
+}
+
+/*
+ * NAME:	jfs_ioc_trim()
+ *
+ * FUNCTION:	attempt to discard (TRIM) all free blocks from the
+ *              filesystem.
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode;
+ *	range	- the range, given by user space
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ */
+int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range)
+{
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+	struct super_block *sb = ipbmap->i_sb;
+	int agno, agno_end;
+	u64 start, end, minlen;
+	u64 trimmed = 0;
+
+	/**
+	 * convert byte values to block size of filesystem:
+	 * start:	First Byte to trim
+	 * len:		number of Bytes to trim from start
+	 * minlen:	minimum extent length in Bytes
+	 */
+	start = range->start >> sb->s_blocksize_bits;
+	end = start + (range->len >> sb->s_blocksize_bits) - 1;
+	minlen = range->minlen >> sb->s_blocksize_bits;
+	if (minlen == 0)
+		minlen = 1;
+
+	if (minlen > bmp->db_agsize ||
+	    start >= bmp->db_mapsize ||
+	    range->len < sb->s_blocksize)
+		return -EINVAL;
+
+	if (end >= bmp->db_mapsize)
+		end = bmp->db_mapsize - 1;
+
+	/**
+	 * we trim all ag's within the range
+	 */
+	agno = BLKTOAG(start, JFS_SBI(ip->i_sb));
+	agno_end = BLKTOAG(end, JFS_SBI(ip->i_sb));
+	while (agno <= agno_end) {
+		trimmed += dbDiscardAG(ip, agno, minlen);
+		agno++;
+	}
+	range->len = trimmed << sb->s_blocksize_bits;
+
+	return 0;
+}
diff --git a/kernel/fs/jfs/jfs_discard.h b/kernel/fs/jfs/jfs_discard.h
new file mode 100644
index 000000000..40d1ee608
--- /dev/null
+++ b/kernel/fs/jfs/jfs_discard.h
@@ -0,0 +1,26 @@
+/*
+ *   Copyright (C) Tino Reichardt, 2012
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_DISCARD
+#define _H_JFS_DISCARD
+
+struct fstrim_range;
+
+extern void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks);
+extern int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range);
+
+#endif /* _H_JFS_DISCARD */
diff --git a/kernel/fs/jfs/jfs_dmap.c b/kernel/fs/jfs/jfs_dmap.c
new file mode 100644
index 000000000..2d514c7af
--- /dev/null
+++ b/kernel/fs/jfs/jfs_dmap.c
@@ -0,0 +1,4092 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Tino Reichardt, 2012
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_lock.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+#include "jfs_discard.h"
+
+/*
+ *	SERIALIZATION of the Block Allocation Map.
+ *
+ *	the working state of the block allocation map is accessed in
+ *	two directions:
+ *
+ *	1) allocation and free requests that start at the dmap
+ *	   level and move up through the dmap control pages (i.e.
+ *	   the vast majority of requests).
+ *
+ *	2) allocation requests that start at dmap control page
+ *	   level and work down towards the dmaps.
+ *
+ *	the serialization scheme used here is as follows.
+ *
+ *	requests which start at the bottom are serialized against each
+ *	other through buffers and each requests holds onto its buffers
+ *	as it works it way up from a single dmap to the required level
+ *	of dmap control page.
+ *	requests that start at the top are serialized against each other
+ *	and request that start from the bottom by the multiple read/single
+ *	write inode lock of the bmap inode. requests starting at the top
+ *	take this lock in write mode while request starting at the bottom
+ *	take the lock in read mode.  a single top-down request may proceed
+ *	exclusively while multiple bottoms-up requests may proceed
+ *	simultaneously (under the protection of busy buffers).
+ *
+ *	in addition to information found in dmaps and dmap control pages,
+ *	the working state of the block allocation map also includes read/
+ *	write information maintained in the bmap descriptor (i.e. total
+ *	free block count, allocation group level free block counts).
+ *	a single exclusive lock (BMAP_LOCK) is used to guard this information
+ *	in the face of multiple-bottoms up requests.
+ *	(lock ordering: IREAD_LOCK, BMAP_LOCK);
+ *
+ *	accesses to the persistent state of the block allocation map (limited
+ *	to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
+ */
+
+#define BMAP_LOCK_INIT(bmp)	mutex_init(&bmp->db_bmaplock)
+#define BMAP_LOCK(bmp)		mutex_lock(&bmp->db_bmaplock)
+#define BMAP_UNLOCK(bmp)	mutex_unlock(&bmp->db_bmaplock)
+
+/*
+ * forward references
+ */
+static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+			int nblocks);
+static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
+static int dbBackSplit(dmtree_t * tp, int leafno);
+static int dbJoin(dmtree_t * tp, int leafno, int newval);
+static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
+static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
+		    int level);
+static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
+static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		       int nblocks);
+static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		       int nblocks,
+		       int l2nb, s64 * results);
+static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		       int nblocks);
+static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
+			  int l2nb,
+			  s64 * results);
+static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
+		     s64 * results);
+static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
+		      s64 * results);
+static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
+static int dbFindBits(u32 word, int l2nb);
+static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
+static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
+static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		      int nblocks);
+static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		      int nblocks);
+static int dbMaxBud(u8 * cp);
+static int blkstol2(s64 nb);
+
+static int cntlz(u32 value);
+static int cnttz(u32 word);
+
+static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
+			 int nblocks);
+static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
+static int dbInitDmapTree(struct dmap * dp);
+static int dbInitTree(struct dmaptree * dtp);
+static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
+static int dbGetL2AGSize(s64 nblocks);
+
+/*
+ *	buddy table
+ *
+ * table used for determining buddy sizes within characters of
+ * dmap bitmap words.  the characters themselves serve as indexes
+ * into the table, with the table elements yielding the maximum
+ * binary buddy of free bits within the character.
+ */
+static const s8 budtab[256] = {
+	3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
+};
+
+/*
+ * NAME:	dbMount()
+ *
+ * FUNCTION:	initializate the block allocation map.
+ *
+ *		memory is allocated for the in-core bmap descriptor and
+ *		the in-core descriptor is initialized from disk.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOMEM	- insufficient memory
+ *	-EIO	- i/o error
+ */
+int dbMount(struct inode *ipbmap)
+{
+	struct bmap *bmp;
+	struct dbmap_disk *dbmp_le;
+	struct metapage *mp;
+	int i;
+
+	/*
+	 * allocate/initialize the in-memory bmap descriptor
+	 */
+	/* allocate memory for the in-memory bmap descriptor */
+	bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
+	if (bmp == NULL)
+		return -ENOMEM;
+
+	/* read the on-disk bmap descriptor. */
+	mp = read_metapage(ipbmap,
+			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
+			   PSIZE, 0);
+	if (mp == NULL) {
+		kfree(bmp);
+		return -EIO;
+	}
+
+	/* copy the on-disk bmap descriptor to its in-memory version. */
+	dbmp_le = (struct dbmap_disk *) mp->data;
+	bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
+	bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
+	bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
+	bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
+	bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
+	bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
+	bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
+	bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
+	bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
+	bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
+	bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
+	bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
+	for (i = 0; i < MAXAG; i++)
+		bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
+	bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
+	bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
+
+	/* release the buffer. */
+	release_metapage(mp);
+
+	/* bind the bmap inode and the bmap descriptor to each other. */
+	bmp->db_ipbmap = ipbmap;
+	JFS_SBI(ipbmap->i_sb)->bmap = bmp;
+
+	memset(bmp->db_active, 0, sizeof(bmp->db_active));
+
+	/*
+	 * allocate/initialize the bmap lock
+	 */
+	BMAP_LOCK_INIT(bmp);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbUnmount()
+ *
+ * FUNCTION:	terminate the block allocation map in preparation for
+ *		file system unmount.
+ *
+ *		the in-core bmap descriptor is written to disk and
+ *		the memory for this descriptor is freed.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ */
+int dbUnmount(struct inode *ipbmap, int mounterror)
+{
+	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+
+	if (!(mounterror || isReadOnly(ipbmap)))
+		dbSync(ipbmap);
+
+	/*
+	 * Invalidate the page cache buffers
+	 */
+	truncate_inode_pages(ipbmap->i_mapping, 0);
+
+	/* free the memory for the in-memory bmap. */
+	kfree(bmp);
+
+	return (0);
+}
+
+/*
+ *	dbSync()
+ */
+int dbSync(struct inode *ipbmap)
+{
+	struct dbmap_disk *dbmp_le;
+	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+	struct metapage *mp;
+	int i;
+
+	/*
+	 * write bmap global control page
+	 */
+	/* get the buffer for the on-disk bmap descriptor. */
+	mp = read_metapage(ipbmap,
+			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
+			   PSIZE, 0);
+	if (mp == NULL) {
+		jfs_err("dbSync: read_metapage failed!");
+		return -EIO;
+	}
+	/* copy the in-memory version of the bmap to the on-disk version */
+	dbmp_le = (struct dbmap_disk *) mp->data;
+	dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
+	dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree);
+	dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
+	dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag);
+	dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
+	dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
+	dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
+	dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
+	dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
+	dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
+	dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
+	dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
+	for (i = 0; i < MAXAG; i++)
+		dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]);
+	dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize);
+	dbmp_le->dn_maxfreebud = bmp->db_maxfreebud;
+
+	/* write the buffer */
+	write_metapage(mp);
+
+	/*
+	 * write out dirty pages of bmap
+	 */
+	filemap_write_and_wait(ipbmap->i_mapping);
+
+	diWriteSpecial(ipbmap, 0);
+
+	return (0);
+}
+
+/*
+ * NAME:	dbFree()
+ *
+ * FUNCTION:	free the specified block range from the working block
+ *		allocation map.
+ *
+ *		the blocks will be free from the working map one dmap
+ *		at a time.
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode;
+ *	blkno	- starting block number to be freed.
+ *	nblocks	- number of blocks to be freed.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ */
+int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
+{
+	struct metapage *mp;
+	struct dmap *dp;
+	int nb, rc;
+	s64 lblkno, rem;
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+	struct super_block *sb = ipbmap->i_sb;
+
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
+
+	/* block to be freed better be within the mapsize. */
+	if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
+		IREAD_UNLOCK(ipbmap);
+		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
+		       (unsigned long long) blkno,
+		       (unsigned long long) nblocks);
+		jfs_error(ip->i_sb, "block to be freed is outside the map\n");
+		return -EIO;
+	}
+
+	/**
+	 * TRIM the blocks, when mounted with discard option
+	 */
+	if (JFS_SBI(sb)->flag & JFS_DISCARD)
+		if (JFS_SBI(sb)->minblks_trim <= nblocks)
+			jfs_issue_discard(ipbmap, blkno, nblocks);
+
+	/*
+	 * free the blocks a dmap at a time.
+	 */
+	mp = NULL;
+	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
+		/* release previous dmap if any */
+		if (mp) {
+			write_metapage(mp);
+		}
+
+		/* get the buffer for the current dmap. */
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL) {
+			IREAD_UNLOCK(ipbmap);
+			return -EIO;
+		}
+		dp = (struct dmap *) mp->data;
+
+		/* determine the number of blocks to be freed from
+		 * this dmap.
+		 */
+		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
+
+		/* free the blocks. */
+		if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
+			jfs_error(ip->i_sb, "error in block map\n");
+			release_metapage(mp);
+			IREAD_UNLOCK(ipbmap);
+			return (rc);
+		}
+	}
+
+	/* write the last buffer. */
+	write_metapage(mp);
+
+	IREAD_UNLOCK(ipbmap);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbUpdatePMap()
+ *
+ * FUNCTION:	update the allocation state (free or allocate) of the
+ *		specified block range in the persistent block allocation map.
+ *
+ *		the blocks will be updated in the persistent map one
+ *		dmap at a time.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *	free	- 'true' if block range is to be freed from the persistent
+ *		  map; 'false' if it is to be allocated.
+ *	blkno	- starting block number of the range.
+ *	nblocks	- number of contiguous blocks in the range.
+ *	tblk	- transaction block;
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ */
+int
+dbUpdatePMap(struct inode *ipbmap,
+	     int free, s64 blkno, s64 nblocks, struct tblock * tblk)
+{
+	int nblks, dbitno, wbitno, rbits;
+	int word, nbits, nwords;
+	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+	s64 lblkno, rem, lastlblkno;
+	u32 mask;
+	struct dmap *dp;
+	struct metapage *mp;
+	struct jfs_log *log;
+	int lsn, difft, diffp;
+	unsigned long flags;
+
+	/* the blocks better be within the mapsize. */
+	if (blkno + nblocks > bmp->db_mapsize) {
+		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
+		       (unsigned long long) blkno,
+		       (unsigned long long) nblocks);
+		jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
+		return -EIO;
+	}
+
+	/* compute delta of transaction lsn from log syncpt */
+	lsn = tblk->lsn;
+	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
+	logdiff(difft, lsn, log);
+
+	/*
+	 * update the block state a dmap at a time.
+	 */
+	mp = NULL;
+	lastlblkno = 0;
+	for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
+		/* get the buffer for the current dmap. */
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		if (lblkno != lastlblkno) {
+			if (mp) {
+				write_metapage(mp);
+			}
+
+			mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
+					   0);
+			if (mp == NULL)
+				return -EIO;
+			metapage_wait_for_io(mp);
+		}
+		dp = (struct dmap *) mp->data;
+
+		/* determine the bit number and word within the dmap of
+		 * the starting block.  also determine how many blocks
+		 * are to be updated within this dmap.
+		 */
+		dbitno = blkno & (BPERDMAP - 1);
+		word = dbitno >> L2DBWORD;
+		nblks = min(rem, (s64)BPERDMAP - dbitno);
+
+		/* update the bits of the dmap words. the first and last
+		 * words may only have a subset of their bits updated. if
+		 * this is the case, we'll work against that word (i.e.
+		 * partial first and/or last) only in a single pass.  a
+		 * single pass will also be used to update all words that
+		 * are to have all their bits updated.
+		 */
+		for (rbits = nblks; rbits > 0;
+		     rbits -= nbits, dbitno += nbits) {
+			/* determine the bit number within the word and
+			 * the number of bits within the word.
+			 */
+			wbitno = dbitno & (DBWORD - 1);
+			nbits = min(rbits, DBWORD - wbitno);
+
+			/* check if only part of the word is to be updated. */
+			if (nbits < DBWORD) {
+				/* update (free or allocate) the bits
+				 * in this word.
+				 */
+				mask =
+				    (ONES << (DBWORD - nbits) >> wbitno);
+				if (free)
+					dp->pmap[word] &=
+					    cpu_to_le32(~mask);
+				else
+					dp->pmap[word] |=
+					    cpu_to_le32(mask);
+
+				word += 1;
+			} else {
+				/* one or more words are to have all
+				 * their bits updated.  determine how
+				 * many words and how many bits.
+				 */
+				nwords = rbits >> L2DBWORD;
+				nbits = nwords << L2DBWORD;
+
+				/* update (free or allocate) the bits
+				 * in these words.
+				 */
+				if (free)
+					memset(&dp->pmap[word], 0,
+					       nwords * 4);
+				else
+					memset(&dp->pmap[word], (int) ONES,
+					       nwords * 4);
+
+				word += nwords;
+			}
+		}
+
+		/*
+		 * update dmap lsn
+		 */
+		if (lblkno == lastlblkno)
+			continue;
+
+		lastlblkno = lblkno;
+
+		LOGSYNC_LOCK(log, flags);
+		if (mp->lsn != 0) {
+			/* inherit older/smaller lsn */
+			logdiff(diffp, mp->lsn, log);
+			if (difft < diffp) {
+				mp->lsn = lsn;
+
+				/* move bp after tblock in logsync list */
+				list_move(&mp->synclist, &tblk->synclist);
+			}
+
+			/* inherit younger/larger clsn */
+			logdiff(difft, tblk->clsn, log);
+			logdiff(diffp, mp->clsn, log);
+			if (difft > diffp)
+				mp->clsn = tblk->clsn;
+		} else {
+			mp->log = log;
+			mp->lsn = lsn;
+
+			/* insert bp after tblock in logsync list */
+			log->count++;
+			list_add(&mp->synclist, &tblk->synclist);
+
+			mp->clsn = tblk->clsn;
+		}
+		LOGSYNC_UNLOCK(log, flags);
+	}
+
+	/* write the last buffer. */
+	if (mp) {
+		write_metapage(mp);
+	}
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbNextAG()
+ *
+ * FUNCTION:	find the preferred allocation group for new allocations.
+ *
+ *		Within the allocation groups, we maintain a preferred
+ *		allocation group which consists of a group with at least
+ *		average free space.  It is the preferred group that we target
+ *		new inode allocation towards.  The tie-in between inode
+ *		allocation and block allocation occurs as we allocate the
+ *		first (data) block of an inode and specify the inode (block)
+ *		as the allocation hint for this block.
+ *
+ *		We try to avoid having more than one open file growing in
+ *		an allocation group, as this will lead to fragmentation.
+ *		This differs from the old OS/2 method of trying to keep
+ *		empty ags around for large allocations.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *	the preferred allocation group number.
+ */
+int dbNextAG(struct inode *ipbmap)
+{
+	s64 avgfree;
+	int agpref;
+	s64 hwm = 0;
+	int i;
+	int next_best = -1;
+	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+
+	BMAP_LOCK(bmp);
+
+	/* determine the average number of free blocks within the ags. */
+	avgfree = (u32)bmp->db_nfree / bmp->db_numag;
+
+	/*
+	 * if the current preferred ag does not have an active allocator
+	 * and has at least average freespace, return it
+	 */
+	agpref = bmp->db_agpref;
+	if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
+	    (bmp->db_agfree[agpref] >= avgfree))
+		goto unlock;
+
+	/* From the last preferred ag, find the next one with at least
+	 * average free space.
+	 */
+	for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
+		if (agpref == bmp->db_numag)
+			agpref = 0;
+
+		if (atomic_read(&bmp->db_active[agpref]))
+			/* open file is currently growing in this ag */
+			continue;
+		if (bmp->db_agfree[agpref] >= avgfree) {
+			/* Return this one */
+			bmp->db_agpref = agpref;
+			goto unlock;
+		} else if (bmp->db_agfree[agpref] > hwm) {
+			/* Less than avg. freespace, but best so far */
+			hwm = bmp->db_agfree[agpref];
+			next_best = agpref;
+		}
+	}
+
+	/*
+	 * If no inactive ag was found with average freespace, use the
+	 * next best
+	 */
+	if (next_best != -1)
+		bmp->db_agpref = next_best;
+	/* else leave db_agpref unchanged */
+unlock:
+	BMAP_UNLOCK(bmp);
+
+	/* return the preferred group.
+	 */
+	return (bmp->db_agpref);
+}
+
+/*
+ * NAME:	dbAlloc()
+ *
+ * FUNCTION:	attempt to allocate a specified number of contiguous free
+ *		blocks from the working allocation block map.
+ *
+ *		the block allocation policy uses hints and a multi-step
+ *		approach.
+ *
+ *		for allocation requests smaller than the number of blocks
+ *		per dmap, we first try to allocate the new blocks
+ *		immediately following the hint.  if these blocks are not
+ *		available, we try to allocate blocks near the hint.  if
+ *		no blocks near the hint are available, we next try to
+ *		allocate within the same dmap as contains the hint.
+ *
+ *		if no blocks are available in the dmap or the allocation
+ *		request is larger than the dmap size, we try to allocate
+ *		within the same allocation group as contains the hint. if
+ *		this does not succeed, we finally try to allocate anywhere
+ *		within the aggregate.
+ *
+ *		we also try to allocate anywhere within the aggregate for
+ *		for allocation requests larger than the allocation group
+ *		size or requests that specify no hint value.
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode;
+ *	hint	- allocation hint.
+ *	nblocks	- number of contiguous blocks in the range.
+ *	results	- on successful return, set to the starting block number
+ *		  of the newly allocated contiguous range.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ */
+int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
+{
+	int rc, agno;
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct bmap *bmp;
+	struct metapage *mp;
+	s64 lblkno, blkno;
+	struct dmap *dp;
+	int l2nb;
+	s64 mapSize;
+	int writers;
+
+	/* assert that nblocks is valid */
+	assert(nblocks > 0);
+
+	/* get the log2 number of blocks to be allocated.
+	 * if the number of blocks is not a log2 multiple,
+	 * it will be rounded up to the next log2 multiple.
+	 */
+	l2nb = BLKSTOL2(nblocks);
+
+	bmp = JFS_SBI(ip->i_sb)->bmap;
+
+	mapSize = bmp->db_mapsize;
+
+	/* the hint should be within the map */
+	if (hint >= mapSize) {
+		jfs_error(ip->i_sb, "the hint is outside the map\n");
+		return -EIO;
+	}
+
+	/* if the number of blocks to be allocated is greater than the
+	 * allocation group size, try to allocate anywhere.
+	 */
+	if (l2nb > bmp->db_agl2size) {
+		IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
+
+		rc = dbAllocAny(bmp, nblocks, l2nb, results);
+
+		goto write_unlock;
+	}
+
+	/*
+	 * If no hint, let dbNextAG recommend an allocation group
+	 */
+	if (hint == 0)
+		goto pref_ag;
+
+	/* we would like to allocate close to the hint.  adjust the
+	 * hint to the block following the hint since the allocators
+	 * will start looking for free space starting at this point.
+	 */
+	blkno = hint + 1;
+
+	if (blkno >= bmp->db_mapsize)
+		goto pref_ag;
+
+	agno = blkno >> bmp->db_agl2size;
+
+	/* check if blkno crosses over into a new allocation group.
+	 * if so, check if we should allow allocations within this
+	 * allocation group.
+	 */
+	if ((blkno & (bmp->db_agsize - 1)) == 0)
+		/* check if the AG is currently being written to.
+		 * if so, call dbNextAG() to find a non-busy
+		 * AG with sufficient free space.
+		 */
+		if (atomic_read(&bmp->db_active[agno]))
+			goto pref_ag;
+
+	/* check if the allocation request size can be satisfied from a
+	 * single dmap.  if so, try to allocate from the dmap containing
+	 * the hint using a tiered strategy.
+	 */
+	if (nblocks <= BPERDMAP) {
+		IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
+
+		/* get the buffer for the dmap containing the hint.
+		 */
+		rc = -EIO;
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL)
+			goto read_unlock;
+
+		dp = (struct dmap *) mp->data;
+
+		/* first, try to satisfy the allocation request with the
+		 * blocks beginning at the hint.
+		 */
+		if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
+		    != -ENOSPC) {
+			if (rc == 0) {
+				*results = blkno;
+				mark_metapage_dirty(mp);
+			}
+
+			release_metapage(mp);
+			goto read_unlock;
+		}
+
+		writers = atomic_read(&bmp->db_active[agno]);
+		if ((writers > 1) ||
+		    ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
+			/*
+			 * Someone else is writing in this allocation
+			 * group.  To avoid fragmenting, try another ag
+			 */
+			release_metapage(mp);
+			IREAD_UNLOCK(ipbmap);
+			goto pref_ag;
+		}
+
+		/* next, try to satisfy the allocation request with blocks
+		 * near the hint.
+		 */
+		if ((rc =
+		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
+		    != -ENOSPC) {
+			if (rc == 0)
+				mark_metapage_dirty(mp);
+
+			release_metapage(mp);
+			goto read_unlock;
+		}
+
+		/* try to satisfy the allocation request with blocks within
+		 * the same dmap as the hint.
+		 */
+		if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
+		    != -ENOSPC) {
+			if (rc == 0)
+				mark_metapage_dirty(mp);
+
+			release_metapage(mp);
+			goto read_unlock;
+		}
+
+		release_metapage(mp);
+		IREAD_UNLOCK(ipbmap);
+	}
+
+	/* try to satisfy the allocation request with blocks within
+	 * the same allocation group as the hint.
+	 */
+	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
+	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
+		goto write_unlock;
+
+	IWRITE_UNLOCK(ipbmap);
+
+
+      pref_ag:
+	/*
+	 * Let dbNextAG recommend a preferred allocation group
+	 */
+	agno = dbNextAG(ipbmap);
+	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
+
+	/* Try to allocate within this allocation group.  if that fails, try to
+	 * allocate anywhere in the map.
+	 */
+	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
+		rc = dbAllocAny(bmp, nblocks, l2nb, results);
+
+      write_unlock:
+	IWRITE_UNLOCK(ipbmap);
+
+	return (rc);
+
+      read_unlock:
+	IREAD_UNLOCK(ipbmap);
+
+	return (rc);
+}
+
+#ifdef _NOTYET
+/*
+ * NAME:	dbAllocExact()
+ *
+ * FUNCTION:	try to allocate the requested extent;
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode;
+ *	blkno	- extent address;
+ *	nblocks	- extent length;
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ */
+int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
+{
+	int rc;
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+	struct dmap *dp;
+	s64 lblkno;
+	struct metapage *mp;
+
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
+
+	/*
+	 * validate extent request:
+	 *
+	 * note: defragfs policy:
+	 *  max 64 blocks will be moved.
+	 *  allocation request size must be satisfied from a single dmap.
+	 */
+	if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
+		IREAD_UNLOCK(ipbmap);
+		return -EINVAL;
+	}
+
+	if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
+		/* the free space is no longer available */
+		IREAD_UNLOCK(ipbmap);
+		return -ENOSPC;
+	}
+
+	/* read in the dmap covering the extent */
+	lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+	mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+	if (mp == NULL) {
+		IREAD_UNLOCK(ipbmap);
+		return -EIO;
+	}
+	dp = (struct dmap *) mp->data;
+
+	/* try to allocate the requested extent */
+	rc = dbAllocNext(bmp, dp, blkno, nblocks);
+
+	IREAD_UNLOCK(ipbmap);
+
+	if (rc == 0)
+		mark_metapage_dirty(mp);
+
+	release_metapage(mp);
+
+	return (rc);
+}
+#endif /* _NOTYET */
+
+/*
+ * NAME:	dbReAlloc()
+ *
+ * FUNCTION:	attempt to extend a current allocation by a specified
+ *		number of blocks.
+ *
+ *		this routine attempts to satisfy the allocation request
+ *		by first trying to extend the existing allocation in
+ *		place by allocating the additional blocks as the blocks
+ *		immediately following the current allocation.  if these
+ *		blocks are not available, this routine will attempt to
+ *		allocate a new set of contiguous blocks large enough
+ *		to cover the existing allocation plus the additional
+ *		number of blocks required.
+ *
+ * PARAMETERS:
+ *	ip	    -  pointer to in-core inode requiring allocation.
+ *	blkno	    -  starting block of the current allocation.
+ *	nblocks	    -  number of contiguous blocks within the current
+ *		       allocation.
+ *	addnblocks  -  number of blocks to add to the allocation.
+ *	results	-      on successful return, set to the starting block number
+ *		       of the existing allocation if the existing allocation
+ *		       was extended in place or to a newly allocated contiguous
+ *		       range if the existing allocation could not be extended
+ *		       in place.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ */
+int
+dbReAlloc(struct inode *ip,
+	  s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
+{
+	int rc;
+
+	/* try to extend the allocation in place.
+	 */
+	if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) {
+		*results = blkno;
+		return (0);
+	} else {
+		if (rc != -ENOSPC)
+			return (rc);
+	}
+
+	/* could not extend the allocation in place, so allocate a
+	 * new set of blocks for the entire request (i.e. try to get
+	 * a range of contiguous blocks large enough to cover the
+	 * existing allocation plus the additional blocks.)
+	 */
+	return (dbAlloc
+		(ip, blkno + nblocks - 1, addnblocks + nblocks, results));
+}
+
+
+/*
+ * NAME:	dbExtend()
+ *
+ * FUNCTION:	attempt to extend a current allocation by a specified
+ *		number of blocks.
+ *
+ *		this routine attempts to satisfy the allocation request
+ *		by first trying to extend the existing allocation in
+ *		place by allocating the additional blocks as the blocks
+ *		immediately following the current allocation.
+ *
+ * PARAMETERS:
+ *	ip	    -  pointer to in-core inode requiring allocation.
+ *	blkno	    -  starting block of the current allocation.
+ *	nblocks	    -  number of contiguous blocks within the current
+ *		       allocation.
+ *	addnblocks  -  number of blocks to add to the allocation.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ */
+static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	s64 lblkno, lastblkno, extblkno;
+	uint rel_block;
+	struct metapage *mp;
+	struct dmap *dp;
+	int rc;
+	struct inode *ipbmap = sbi->ipbmap;
+	struct bmap *bmp;
+
+	/*
+	 * We don't want a non-aligned extent to cross a page boundary
+	 */
+	if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
+	    (rel_block + nblocks + addnblocks > sbi->nbperpage))
+		return -ENOSPC;
+
+	/* get the last block of the current allocation */
+	lastblkno = blkno + nblocks - 1;
+
+	/* determine the block number of the block following
+	 * the existing allocation.
+	 */
+	extblkno = lastblkno + 1;
+
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
+
+	/* better be within the file system */
+	bmp = sbi->bmap;
+	if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
+		IREAD_UNLOCK(ipbmap);
+		jfs_error(ip->i_sb, "the block is outside the filesystem\n");
+		return -EIO;
+	}
+
+	/* we'll attempt to extend the current allocation in place by
+	 * allocating the additional blocks as the blocks immediately
+	 * following the current allocation.  we only try to extend the
+	 * current allocation in place if the number of additional blocks
+	 * can fit into a dmap, the last block of the current allocation
+	 * is not the last block of the file system, and the start of the
+	 * inplace extension is not on an allocation group boundary.
+	 */
+	if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
+	    (extblkno & (bmp->db_agsize - 1)) == 0) {
+		IREAD_UNLOCK(ipbmap);
+		return -ENOSPC;
+	}
+
+	/* get the buffer for the dmap containing the first block
+	 * of the extension.
+	 */
+	lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
+	mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+	if (mp == NULL) {
+		IREAD_UNLOCK(ipbmap);
+		return -EIO;
+	}
+
+	dp = (struct dmap *) mp->data;
+
+	/* try to allocate the blocks immediately following the
+	 * current allocation.
+	 */
+	rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks);
+
+	IREAD_UNLOCK(ipbmap);
+
+	/* were we successful ? */
+	if (rc == 0)
+		write_metapage(mp);
+	else
+		/* we were not successful */
+		release_metapage(mp);
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	dbAllocNext()
+ *
+ * FUNCTION:	attempt to allocate the blocks of the specified block
+ *		range within a dmap.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	dp	-  pointer to dmap.
+ *	blkno	-  starting block number of the range.
+ *	nblocks	-  number of contiguous free blocks of the range.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		       int nblocks)
+{
+	int dbitno, word, rembits, nb, nwords, wbitno, nw;
+	int l2size;
+	s8 *leaf;
+	u32 mask;
+
+	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
+		jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
+		return -EIO;
+	}
+
+	/* pick up a pointer to the leaves of the dmap tree.
+	 */
+	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
+
+	/* determine the bit number and word within the dmap of the
+	 * starting block.
+	 */
+	dbitno = blkno & (BPERDMAP - 1);
+	word = dbitno >> L2DBWORD;
+
+	/* check if the specified block range is contained within
+	 * this dmap.
+	 */
+	if (dbitno + nblocks > BPERDMAP)
+		return -ENOSPC;
+
+	/* check if the starting leaf indicates that anything
+	 * is free.
+	 */
+	if (leaf[word] == NOFREE)
+		return -ENOSPC;
+
+	/* check the dmaps words corresponding to block range to see
+	 * if the block range is free.  not all bits of the first and
+	 * last words may be contained within the block range.  if this
+	 * is the case, we'll work against those words (i.e. partial first
+	 * and/or last) on an individual basis (a single pass) and examine
+	 * the actual bits to determine if they are free.  a single pass
+	 * will be used for all dmap words fully contained within the
+	 * specified range.  within this pass, the leaves of the dmap
+	 * tree will be examined to determine if the blocks are free. a
+	 * single leaf may describe the free space of multiple dmap
+	 * words, so we may visit only a subset of the actual leaves
+	 * corresponding to the dmap words of the block range.
+	 */
+	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+		/* determine the bit number within the word and
+		 * the number of bits within the word.
+		 */
+		wbitno = dbitno & (DBWORD - 1);
+		nb = min(rembits, DBWORD - wbitno);
+
+		/* check if only part of the word is to be examined.
+		 */
+		if (nb < DBWORD) {
+			/* check if the bits are free.
+			 */
+			mask = (ONES << (DBWORD - nb) >> wbitno);
+			if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
+				return -ENOSPC;
+
+			word += 1;
+		} else {
+			/* one or more dmap words are fully contained
+			 * within the block range.  determine how many
+			 * words and how many bits.
+			 */
+			nwords = rembits >> L2DBWORD;
+			nb = nwords << L2DBWORD;
+
+			/* now examine the appropriate leaves to determine
+			 * if the blocks are free.
+			 */
+			while (nwords > 0) {
+				/* does the leaf describe any free space ?
+				 */
+				if (leaf[word] < BUDMIN)
+					return -ENOSPC;
+
+				/* determine the l2 number of bits provided
+				 * by this leaf.
+				 */
+				l2size =
+				    min_t(int, leaf[word], NLSTOL2BSZ(nwords));
+
+				/* determine how many words were handled.
+				 */
+				nw = BUDSIZE(l2size, BUDMIN);
+
+				nwords -= nw;
+				word += nw;
+			}
+		}
+	}
+
+	/* allocate the blocks.
+	 */
+	return (dbAllocDmap(bmp, dp, blkno, nblocks));
+}
+
+
+/*
+ * NAME:	dbAllocNear()
+ *
+ * FUNCTION:	attempt to allocate a number of contiguous free blocks near
+ *		a specified block (hint) within a dmap.
+ *
+ *		starting with the dmap leaf that covers the hint, we'll
+ *		check the next four contiguous leaves for sufficient free
+ *		space.  if sufficient free space is found, we'll allocate
+ *		the desired free space.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	dp	-  pointer to dmap.
+ *	blkno	-  block number to allocate near.
+ *	nblocks	-  actual number of contiguous free blocks desired.
+ *	l2nb	-  log2 number of contiguous free blocks desired.
+ *	results	-  on successful return, set to the starting block number
+ *		   of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAllocNear(struct bmap * bmp,
+	    struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
+{
+	int word, lword, rc;
+	s8 *leaf;
+
+	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
+		jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
+		return -EIO;
+	}
+
+	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
+
+	/* determine the word within the dmap that holds the hint
+	 * (i.e. blkno).  also, determine the last word in the dmap
+	 * that we'll include in our examination.
+	 */
+	word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
+	lword = min(word + 4, LPERDMAP);
+
+	/* examine the leaves for sufficient free space.
+	 */
+	for (; word < lword; word++) {
+		/* does the leaf describe sufficient free space ?
+		 */
+		if (leaf[word] < l2nb)
+			continue;
+
+		/* determine the block number within the file system
+		 * of the first block described by this dmap word.
+		 */
+		blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
+
+		/* if not all bits of the dmap word are free, get the
+		 * starting bit number within the dmap word of the required
+		 * string of free bits and adjust the block number with the
+		 * value.
+		 */
+		if (leaf[word] < BUDMIN)
+			blkno +=
+			    dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
+
+		/* allocate the blocks.
+		 */
+		if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
+			*results = blkno;
+
+		return (rc);
+	}
+
+	return -ENOSPC;
+}
+
+
+/*
+ * NAME:	dbAllocAG()
+ *
+ * FUNCTION:	attempt to allocate the specified number of contiguous
+ *		free blocks within the specified allocation group.
+ *
+ *		unless the allocation group size is equal to the number
+ *		of blocks per dmap, the dmap control pages will be used to
+ *		find the required free space, if available.  we start the
+ *		search at the highest dmap control page level which
+ *		distinctly describes the allocation group's free space
+ *		(i.e. the highest level at which the allocation group's
+ *		free space is not mixed in with that of any other group).
+ *		in addition, we start the search within this level at a
+ *		height of the dmapctl dmtree at which the nodes distinctly
+ *		describe the allocation group's free space.  at this height,
+ *		the allocation group's free space may be represented by 1
+ *		or two sub-trees, depending on the allocation group size.
+ *		we search the top nodes of these subtrees left to right for
+ *		sufficient free space.  if sufficient free space is found,
+ *		the subtree is searched to find the leftmost leaf that
+ *		has free space.  once we have made it to the leaf, we
+ *		move the search to the next lower level dmap control page
+ *		corresponding to this leaf.  we continue down the dmap control
+ *		pages until we find the dmap that contains or starts the
+ *		sufficient free space and we allocate at this dmap.
+ *
+ *		if the allocation group size is equal to the dmap size,
+ *		we'll start at the dmap corresponding to the allocation
+ *		group and attempt the allocation at this level.
+ *
+ *		the dmap control page search is also not performed if the
+ *		allocation group is completely free and we go to the first
+ *		dmap of the allocation group to do the allocation.  this is
+ *		done because the allocation group may be part (not the first
+ *		part) of a larger binary buddy system, causing the dmap
+ *		control pages to indicate no free space (NOFREE) within
+ *		the allocation group.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	agno	- allocation group number.
+ *	nblocks	-  actual number of contiguous free blocks desired.
+ *	l2nb	-  log2 number of contiguous free blocks desired.
+ *	results	-  on successful return, set to the starting block number
+ *		   of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ *
+ * note: IWRITE_LOCK(ipmap) held on entry/exit;
+ */
+static int
+dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
+{
+	struct metapage *mp;
+	struct dmapctl *dcp;
+	int rc, ti, i, k, m, n, agperlev;
+	s64 blkno, lblkno;
+	int budmin;
+
+	/* allocation request should not be for more than the
+	 * allocation group size.
+	 */
+	if (l2nb > bmp->db_agl2size) {
+		jfs_error(bmp->db_ipbmap->i_sb,
+			  "allocation request is larger than the allocation group size\n");
+		return -EIO;
+	}
+
+	/* determine the starting block number of the allocation
+	 * group.
+	 */
+	blkno = (s64) agno << bmp->db_agl2size;
+
+	/* check if the allocation group size is the minimum allocation
+	 * group size or if the allocation group is completely free. if
+	 * the allocation group size is the minimum size of BPERDMAP (i.e.
+	 * 1 dmap), there is no need to search the dmap control page (below)
+	 * that fully describes the allocation group since the allocation
+	 * group is already fully described by a dmap.  in this case, we
+	 * just call dbAllocCtl() to search the dmap tree and allocate the
+	 * required space if available.
+	 *
+	 * if the allocation group is completely free, dbAllocCtl() is
+	 * also called to allocate the required space.  this is done for
+	 * two reasons.  first, it makes no sense searching the dmap control
+	 * pages for free space when we know that free space exists.  second,
+	 * the dmap control pages may indicate that the allocation group
+	 * has no free space if the allocation group is part (not the first
+	 * part) of a larger binary buddy system.
+	 */
+	if (bmp->db_agsize == BPERDMAP
+	    || bmp->db_agfree[agno] == bmp->db_agsize) {
+		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
+		if ((rc == -ENOSPC) &&
+		    (bmp->db_agfree[agno] == bmp->db_agsize)) {
+			printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n",
+			       (unsigned long long) blkno,
+			       (unsigned long long) nblocks);
+			jfs_error(bmp->db_ipbmap->i_sb,
+				  "dbAllocCtl failed in free AG\n");
+		}
+		return (rc);
+	}
+
+	/* the buffer for the dmap control page that fully describes the
+	 * allocation group.
+	 */
+	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
+	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+	if (mp == NULL)
+		return -EIO;
+	dcp = (struct dmapctl *) mp->data;
+	budmin = dcp->budmin;
+
+	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
+		jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
+		release_metapage(mp);
+		return -EIO;
+	}
+
+	/* search the subtree(s) of the dmap control page that describes
+	 * the allocation group, looking for sufficient free space.  to begin,
+	 * determine how many allocation groups are represented in a dmap
+	 * control page at the control page level (i.e. L0, L1, L2) that
+	 * fully describes an allocation group. next, determine the starting
+	 * tree index of this allocation group within the control page.
+	 */
+	agperlev =
+	    (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
+	ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
+
+	/* dmap control page trees fan-out by 4 and a single allocation
+	 * group may be described by 1 or 2 subtrees within the ag level
+	 * dmap control page, depending upon the ag size. examine the ag's
+	 * subtrees for sufficient free space, starting with the leftmost
+	 * subtree.
+	 */
+	for (i = 0; i < bmp->db_agwidth; i++, ti++) {
+		/* is there sufficient free space ?
+		 */
+		if (l2nb > dcp->stree[ti])
+			continue;
+
+		/* sufficient free space found in a subtree. now search down
+		 * the subtree to find the leftmost leaf that describes this
+		 * free space.
+		 */
+		for (k = bmp->db_agheight; k > 0; k--) {
+			for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
+				if (l2nb <= dcp->stree[m + n]) {
+					ti = m + n;
+					break;
+				}
+			}
+			if (n == 4) {
+				jfs_error(bmp->db_ipbmap->i_sb,
+					  "failed descending stree\n");
+				release_metapage(mp);
+				return -EIO;
+			}
+		}
+
+		/* determine the block number within the file system
+		 * that corresponds to this leaf.
+		 */
+		if (bmp->db_aglevel == 2)
+			blkno = 0;
+		else if (bmp->db_aglevel == 1)
+			blkno &= ~(MAXL1SIZE - 1);
+		else		/* bmp->db_aglevel == 0 */
+			blkno &= ~(MAXL0SIZE - 1);
+
+		blkno +=
+		    ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
+
+		/* release the buffer in preparation for going down
+		 * the next level of dmap control pages.
+		 */
+		release_metapage(mp);
+
+		/* check if we need to continue to search down the lower
+		 * level dmap control pages.  we need to if the number of
+		 * blocks required is less than maximum number of blocks
+		 * described at the next lower level.
+		 */
+		if (l2nb < budmin) {
+
+			/* search the lower level dmap control pages to get
+			 * the starting block number of the dmap that
+			 * contains or starts off the free space.
+			 */
+			if ((rc =
+			     dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
+				       &blkno))) {
+				if (rc == -ENOSPC) {
+					jfs_error(bmp->db_ipbmap->i_sb,
+						  "control page inconsistent\n");
+					return -EIO;
+				}
+				return (rc);
+			}
+		}
+
+		/* allocate the blocks.
+		 */
+		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
+		if (rc == -ENOSPC) {
+			jfs_error(bmp->db_ipbmap->i_sb,
+				  "unable to allocate blocks\n");
+			rc = -EIO;
+		}
+		return (rc);
+	}
+
+	/* no space in the allocation group.  release the buffer and
+	 * return -ENOSPC.
+	 */
+	release_metapage(mp);
+
+	return -ENOSPC;
+}
+
+
+/*
+ * NAME:	dbAllocAny()
+ *
+ * FUNCTION:	attempt to allocate the specified number of contiguous
+ *		free blocks anywhere in the file system.
+ *
+ *		dbAllocAny() attempts to find the sufficient free space by
+ *		searching down the dmap control pages, starting with the
+ *		highest level (i.e. L0, L1, L2) control page.  if free space
+ *		large enough to satisfy the desired free space is found, the
+ *		desired free space is allocated.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	nblocks	 -  actual number of contiguous free blocks desired.
+ *	l2nb	 -  log2 number of contiguous free blocks desired.
+ *	results	-  on successful return, set to the starting block number
+ *		   of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
+{
+	int rc;
+	s64 blkno = 0;
+
+	/* starting with the top level dmap control page, search
+	 * down the dmap control levels for sufficient free space.
+	 * if free space is found, dbFindCtl() returns the starting
+	 * block number of the dmap that contains or starts off the
+	 * range of free space.
+	 */
+	if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
+		return (rc);
+
+	/* allocate the blocks.
+	 */
+	rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
+	if (rc == -ENOSPC) {
+		jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
+		return -EIO;
+	}
+	return (rc);
+}
+
+
+/*
+ * NAME:	dbDiscardAG()
+ *
+ * FUNCTION:	attempt to discard (TRIM) all free blocks of specific AG
+ *
+ *		algorithm:
+ *		1) allocate blocks, as large as possible and save them
+ *		   while holding IWRITE_LOCK on ipbmap
+ *		2) trim all these saved block/length values
+ *		3) mark the blocks free again
+ *
+ *		benefit:
+ *		- we work only on one ag at some time, minimizing how long we
+ *		  need to lock ipbmap
+ *		- reading / writing the fs is possible most time, even on
+ *		  trimming
+ *
+ *		downside:
+ *		- we write two times to the dmapctl and dmap pages
+ *		- but for me, this seems the best way, better ideas?
+ *		/TR 2012
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode
+ *	agno	- ag to trim
+ *	minlen	- minimum value of contiguous blocks
+ *
+ * RETURN VALUES:
+ *	s64	- actual number of blocks trimmed
+ */
+s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
+{
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+	s64 nblocks, blkno;
+	u64 trimmed = 0;
+	int rc, l2nb;
+	struct super_block *sb = ipbmap->i_sb;
+
+	struct range2trim {
+		u64 blkno;
+		u64 nblocks;
+	} *totrim, *tt;
+
+	/* max blkno / nblocks pairs to trim */
+	int count = 0, range_cnt;
+	u64 max_ranges;
+
+	/* prevent others from writing new stuff here, while trimming */
+	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
+
+	nblocks = bmp->db_agfree[agno];
+	max_ranges = nblocks;
+	do_div(max_ranges, minlen);
+	range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
+	totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
+	if (totrim == NULL) {
+		jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
+		IWRITE_UNLOCK(ipbmap);
+		return 0;
+	}
+
+	tt = totrim;
+	while (nblocks >= minlen) {
+		l2nb = BLKSTOL2(nblocks);
+
+		/* 0 = okay, -EIO = fatal, -ENOSPC -> try smaller block */
+		rc = dbAllocAG(bmp, agno, nblocks, l2nb, &blkno);
+		if (rc == 0) {
+			tt->blkno = blkno;
+			tt->nblocks = nblocks;
+			tt++; count++;
+
+			/* the whole ag is free, trim now */
+			if (bmp->db_agfree[agno] == 0)
+				break;
+
+			/* give a hint for the next while */
+			nblocks = bmp->db_agfree[agno];
+			continue;
+		} else if (rc == -ENOSPC) {
+			/* search for next smaller log2 block */
+			l2nb = BLKSTOL2(nblocks) - 1;
+			nblocks = 1 << l2nb;
+		} else {
+			/* Trim any already allocated blocks */
+			jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
+			break;
+		}
+
+		/* check, if our trim array is full */
+		if (unlikely(count >= range_cnt - 1))
+			break;
+	}
+	IWRITE_UNLOCK(ipbmap);
+
+	tt->nblocks = 0; /* mark the current end */
+	for (tt = totrim; tt->nblocks != 0; tt++) {
+		/* when mounted with online discard, dbFree() will
+		 * call jfs_issue_discard() itself */
+		if (!(JFS_SBI(sb)->flag & JFS_DISCARD))
+			jfs_issue_discard(ip, tt->blkno, tt->nblocks);
+		dbFree(ip, tt->blkno, tt->nblocks);
+		trimmed += tt->nblocks;
+	}
+	kfree(totrim);
+
+	return trimmed;
+}
+
+/*
+ * NAME:	dbFindCtl()
+ *
+ * FUNCTION:	starting at a specified dmap control page level and block
+ *		number, search down the dmap control levels for a range of
+ *		contiguous free blocks large enough to satisfy an allocation
+ *		request for the specified number of free blocks.
+ *
+ *		if sufficient contiguous free blocks are found, this routine
+ *		returns the starting block number within a dmap page that
+ *		contains or starts a range of contiqious free blocks that
+ *		is sufficient in size.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	level	-  starting dmap control page level.
+ *	l2nb	-  log2 number of contiguous free blocks desired.
+ *	*blkno	-  on entry, starting block number for conducting the search.
+ *		   on successful return, the first block within a dmap page
+ *		   that contains or starts a range of contiguous free blocks.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
+{
+	int rc, leafidx, lev;
+	s64 b, lblkno;
+	struct dmapctl *dcp;
+	int budmin;
+	struct metapage *mp;
+
+	/* starting at the specified dmap control page level and block
+	 * number, search down the dmap control levels for the starting
+	 * block number of a dmap page that contains or starts off
+	 * sufficient free blocks.
+	 */
+	for (lev = level, b = *blkno; lev >= 0; lev--) {
+		/* get the buffer of the dmap control page for the block
+		 * number and level (i.e. L0, L1, L2).
+		 */
+		lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
+		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL)
+			return -EIO;
+		dcp = (struct dmapctl *) mp->data;
+		budmin = dcp->budmin;
+
+		if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
+			jfs_error(bmp->db_ipbmap->i_sb,
+				  "Corrupt dmapctl page\n");
+			release_metapage(mp);
+			return -EIO;
+		}
+
+		/* search the tree within the dmap control page for
+		 * sufficient free space.  if sufficient free space is found,
+		 * dbFindLeaf() returns the index of the leaf at which
+		 * free space was found.
+		 */
+		rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
+
+		/* release the buffer.
+		 */
+		release_metapage(mp);
+
+		/* space found ?
+		 */
+		if (rc) {
+			if (lev != level) {
+				jfs_error(bmp->db_ipbmap->i_sb,
+					  "dmap inconsistent\n");
+				return -EIO;
+			}
+			return -ENOSPC;
+		}
+
+		/* adjust the block number to reflect the location within
+		 * the dmap control page (i.e. the leaf) at which free
+		 * space was found.
+		 */
+		b += (((s64) leafidx) << budmin);
+
+		/* we stop the search at this dmap control page level if
+		 * the number of blocks required is greater than or equal
+		 * to the maximum number of blocks described at the next
+		 * (lower) level.
+		 */
+		if (l2nb >= budmin)
+			break;
+	}
+
+	*blkno = b;
+	return (0);
+}
+
+
+/*
+ * NAME:	dbAllocCtl()
+ *
+ * FUNCTION:	attempt to allocate a specified number of contiguous
+ *		blocks starting within a specific dmap.
+ *
+ *		this routine is called by higher level routines that search
+ *		the dmap control pages above the actual dmaps for contiguous
+ *		free space.  the result of successful searches by these
+ *		routines are the starting block numbers within dmaps, with
+ *		the dmaps themselves containing the desired contiguous free
+ *		space or starting a contiguous free space of desired size
+ *		that is made up of the blocks of one or more dmaps. these
+ *		calls should not fail due to insufficent resources.
+ *
+ *		this routine is called in some cases where it is not known
+ *		whether it will fail due to insufficient resources.  more
+ *		specifically, this occurs when allocating from an allocation
+ *		group whose size is equal to the number of blocks per dmap.
+ *		in this case, the dmap control pages are not examined prior
+ *		to calling this routine (to save pathlength) and the call
+ *		might fail.
+ *
+ *		for a request size that fits within a dmap, this routine relies
+ *		upon the dmap's dmtree to find the requested contiguous free
+ *		space.  for request sizes that are larger than a dmap, the
+ *		requested free space will start at the first block of the
+ *		first dmap (i.e. blkno).
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	nblocks	 -  actual number of contiguous free blocks to allocate.
+ *	l2nb	 -  log2 number of contiguous free blocks to allocate.
+ *	blkno	 -  starting block number of the dmap to start the allocation
+ *		    from.
+ *	results	-  on successful return, set to the starting block number
+ *		   of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
+{
+	int rc, nb;
+	s64 b, lblkno, n;
+	struct metapage *mp;
+	struct dmap *dp;
+
+	/* check if the allocation request is confined to a single dmap.
+	 */
+	if (l2nb <= L2BPERDMAP) {
+		/* get the buffer for the dmap.
+		 */
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL)
+			return -EIO;
+		dp = (struct dmap *) mp->data;
+
+		/* try to allocate the blocks.
+		 */
+		rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
+		if (rc == 0)
+			mark_metapage_dirty(mp);
+
+		release_metapage(mp);
+
+		return (rc);
+	}
+
+	/* allocation request involving multiple dmaps. it must start on
+	 * a dmap boundary.
+	 */
+	assert((blkno & (BPERDMAP - 1)) == 0);
+
+	/* allocate the blocks dmap by dmap.
+	 */
+	for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
+		/* get the buffer for the dmap.
+		 */
+		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
+		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL) {
+			rc = -EIO;
+			goto backout;
+		}
+		dp = (struct dmap *) mp->data;
+
+		/* the dmap better be all free.
+		 */
+		if (dp->tree.stree[ROOT] != L2BPERDMAP) {
+			release_metapage(mp);
+			jfs_error(bmp->db_ipbmap->i_sb,
+				  "the dmap is not all free\n");
+			rc = -EIO;
+			goto backout;
+		}
+
+		/* determine how many blocks to allocate from this dmap.
+		 */
+		nb = min_t(s64, n, BPERDMAP);
+
+		/* allocate the blocks from the dmap.
+		 */
+		if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
+			release_metapage(mp);
+			goto backout;
+		}
+
+		/* write the buffer.
+		 */
+		write_metapage(mp);
+	}
+
+	/* set the results (starting block number) and return.
+	 */
+	*results = blkno;
+	return (0);
+
+	/* something failed in handling an allocation request involving
+	 * multiple dmaps.  we'll try to clean up by backing out any
+	 * allocation that has already happened for this request.  if
+	 * we fail in backing out the allocation, we'll mark the file
+	 * system to indicate that blocks have been leaked.
+	 */
+      backout:
+
+	/* try to backout the allocations dmap by dmap.
+	 */
+	for (n = nblocks - n, b = blkno; n > 0;
+	     n -= BPERDMAP, b += BPERDMAP) {
+		/* get the buffer for this dmap.
+		 */
+		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
+		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL) {
+			/* could not back out.  mark the file system
+			 * to indicate that we have leaked blocks.
+			 */
+			jfs_error(bmp->db_ipbmap->i_sb,
+				  "I/O Error: Block Leakage\n");
+			continue;
+		}
+		dp = (struct dmap *) mp->data;
+
+		/* free the blocks is this dmap.
+		 */
+		if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
+			/* could not back out.  mark the file system
+			 * to indicate that we have leaked blocks.
+			 */
+			release_metapage(mp);
+			jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
+			continue;
+		}
+
+		/* write the buffer.
+		 */
+		write_metapage(mp);
+	}
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	dbAllocDmapLev()
+ *
+ * FUNCTION:	attempt to allocate a specified number of contiguous blocks
+ *		from a specified dmap.
+ *
+ *		this routine checks if the contiguous blocks are available.
+ *		if so, nblocks of blocks are allocated; otherwise, ENOSPC is
+ *		returned.
+ *
+ * PARAMETERS:
+ *	mp	-  pointer to bmap descriptor
+ *	dp	-  pointer to dmap to attempt to allocate blocks from.
+ *	l2nb	-  log2 number of contiguous block desired.
+ *	nblocks	-  actual number of contiguous block desired.
+ *	results	-  on successful return, set to the starting block number
+ *		   of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient disk resources
+ *	-EIO	- i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
+ *	IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
+ */
+static int
+dbAllocDmapLev(struct bmap * bmp,
+	       struct dmap * dp, int nblocks, int l2nb, s64 * results)
+{
+	s64 blkno;
+	int leafidx, rc;
+
+	/* can't be more than a dmaps worth of blocks */
+	assert(l2nb <= L2BPERDMAP);
+
+	/* search the tree within the dmap page for sufficient
+	 * free space.  if sufficient free space is found, dbFindLeaf()
+	 * returns the index of the leaf at which free space was found.
+	 */
+	if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
+		return -ENOSPC;
+
+	/* determine the block number within the file system corresponding
+	 * to the leaf at which free space was found.
+	 */
+	blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
+
+	/* if not all bits of the dmap word are free, get the starting
+	 * bit number within the dmap word of the required string of free
+	 * bits and adjust the block number with this value.
+	 */
+	if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
+		blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
+
+	/* allocate the blocks */
+	if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
+		*results = blkno;
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	dbAllocDmap()
+ *
+ * FUNCTION:	adjust the disk allocation map to reflect the allocation
+ *		of a specified block range within a dmap.
+ *
+ *		this routine allocates the specified blocks from the dmap
+ *		through a call to dbAllocBits(). if the allocation of the
+ *		block range causes the maximum string of free blocks within
+ *		the dmap to change (i.e. the value of the root of the dmap's
+ *		dmtree), this routine will cause this change to be reflected
+ *		up through the appropriate levels of the dmap control pages
+ *		by a call to dbAdjCtl() for the L0 dmap control page that
+ *		covers this dmap.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	dp	-  pointer to dmap to allocate the block range from.
+ *	blkno	-  starting block number of the block to be allocated.
+ *	nblocks	-  number of blocks to be allocated.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		       int nblocks)
+{
+	s8 oldroot;
+	int rc;
+
+	/* save the current value of the root (i.e. maximum free string)
+	 * of the dmap tree.
+	 */
+	oldroot = dp->tree.stree[ROOT];
+
+	/* allocate the specified (blocks) bits */
+	dbAllocBits(bmp, dp, blkno, nblocks);
+
+	/* if the root has not changed, done. */
+	if (dp->tree.stree[ROOT] == oldroot)
+		return (0);
+
+	/* root changed. bubble the change up to the dmap control pages.
+	 * if the adjustment of the upper level control pages fails,
+	 * backout the bit allocation (thus making everything consistent).
+	 */
+	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0)))
+		dbFreeBits(bmp, dp, blkno, nblocks);
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	dbFreeDmap()
+ *
+ * FUNCTION:	adjust the disk allocation map to reflect the allocation
+ *		of a specified block range within a dmap.
+ *
+ *		this routine frees the specified blocks from the dmap through
+ *		a call to dbFreeBits(). if the deallocation of the block range
+ *		causes the maximum string of free blocks within the dmap to
+ *		change (i.e. the value of the root of the dmap's dmtree), this
+ *		routine will cause this change to be reflected up through the
+ *		appropriate levels of the dmap control pages by a call to
+ *		dbAdjCtl() for the L0 dmap control page that covers this dmap.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	dp	-  pointer to dmap to free the block range from.
+ *	blkno	-  starting block number of the block to be freed.
+ *	nblocks	-  number of blocks to be freed.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		      int nblocks)
+{
+	s8 oldroot;
+	int rc = 0, word;
+
+	/* save the current value of the root (i.e. maximum free string)
+	 * of the dmap tree.
+	 */
+	oldroot = dp->tree.stree[ROOT];
+
+	/* free the specified (blocks) bits */
+	rc = dbFreeBits(bmp, dp, blkno, nblocks);
+
+	/* if error or the root has not changed, done. */
+	if (rc || (dp->tree.stree[ROOT] == oldroot))
+		return (rc);
+
+	/* root changed. bubble the change up to the dmap control pages.
+	 * if the adjustment of the upper level control pages fails,
+	 * backout the deallocation.
+	 */
+	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
+		word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
+
+		/* as part of backing out the deallocation, we will have
+		 * to back split the dmap tree if the deallocation caused
+		 * the freed blocks to become part of a larger binary buddy
+		 * system.
+		 */
+		if (dp->tree.stree[word] == NOFREE)
+			dbBackSplit((dmtree_t *) & dp->tree, word);
+
+		dbAllocBits(bmp, dp, blkno, nblocks);
+	}
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	dbAllocBits()
+ *
+ * FUNCTION:	allocate a specified block range from a dmap.
+ *
+ *		this routine updates the dmap to reflect the working
+ *		state allocation of the specified block range. it directly
+ *		updates the bits of the working map and causes the adjustment
+ *		of the binary buddy system described by the dmap's dmtree
+ *		leaves to reflect the bits allocated.  it also causes the
+ *		dmap's dmtree, as a whole, to reflect the allocated range.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	dp	-  pointer to dmap to allocate bits from.
+ *	blkno	-  starting block number of the bits to be allocated.
+ *	nblocks	-  number of bits to be allocated.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+			int nblocks)
+{
+	int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
+	dmtree_t *tp = (dmtree_t *) & dp->tree;
+	int size;
+	s8 *leaf;
+
+	/* pick up a pointer to the leaves of the dmap tree */
+	leaf = dp->tree.stree + LEAFIND;
+
+	/* determine the bit number and word within the dmap of the
+	 * starting block.
+	 */
+	dbitno = blkno & (BPERDMAP - 1);
+	word = dbitno >> L2DBWORD;
+
+	/* block range better be within the dmap */
+	assert(dbitno + nblocks <= BPERDMAP);
+
+	/* allocate the bits of the dmap's words corresponding to the block
+	 * range. not all bits of the first and last words may be contained
+	 * within the block range.  if this is the case, we'll work against
+	 * those words (i.e. partial first and/or last) on an individual basis
+	 * (a single pass), allocating the bits of interest by hand and
+	 * updating the leaf corresponding to the dmap word. a single pass
+	 * will be used for all dmap words fully contained within the
+	 * specified range.  within this pass, the bits of all fully contained
+	 * dmap words will be marked as free in a single shot and the leaves
+	 * will be updated. a single leaf may describe the free space of
+	 * multiple dmap words, so we may update only a subset of the actual
+	 * leaves corresponding to the dmap words of the block range.
+	 */
+	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+		/* determine the bit number within the word and
+		 * the number of bits within the word.
+		 */
+		wbitno = dbitno & (DBWORD - 1);
+		nb = min(rembits, DBWORD - wbitno);
+
+		/* check if only part of a word is to be allocated.
+		 */
+		if (nb < DBWORD) {
+			/* allocate (set to 1) the appropriate bits within
+			 * this dmap word.
+			 */
+			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
+						      >> wbitno);
+
+			/* update the leaf for this dmap word. in addition
+			 * to setting the leaf value to the binary buddy max
+			 * of the updated dmap word, dbSplit() will split
+			 * the binary system of the leaves if need be.
+			 */
+			dbSplit(tp, word, BUDMIN,
+				dbMaxBud((u8 *) & dp->wmap[word]));
+
+			word += 1;
+		} else {
+			/* one or more dmap words are fully contained
+			 * within the block range.  determine how many
+			 * words and allocate (set to 1) the bits of these
+			 * words.
+			 */
+			nwords = rembits >> L2DBWORD;
+			memset(&dp->wmap[word], (int) ONES, nwords * 4);
+
+			/* determine how many bits.
+			 */
+			nb = nwords << L2DBWORD;
+
+			/* now update the appropriate leaves to reflect
+			 * the allocated words.
+			 */
+			for (; nwords > 0; nwords -= nw) {
+				if (leaf[word] < BUDMIN) {
+					jfs_error(bmp->db_ipbmap->i_sb,
+						  "leaf page corrupt\n");
+					break;
+				}
+
+				/* determine what the leaf value should be
+				 * updated to as the minimum of the l2 number
+				 * of bits being allocated and the l2 number
+				 * of bits currently described by this leaf.
+				 */
+				size = min_t(int, leaf[word],
+					     NLSTOL2BSZ(nwords));
+
+				/* update the leaf to reflect the allocation.
+				 * in addition to setting the leaf value to
+				 * NOFREE, dbSplit() will split the binary
+				 * system of the leaves to reflect the current
+				 * allocation (size).
+				 */
+				dbSplit(tp, word, size, NOFREE);
+
+				/* get the number of dmap words handled */
+				nw = BUDSIZE(size, BUDMIN);
+				word += nw;
+			}
+		}
+	}
+
+	/* update the free count for this dmap */
+	le32_add_cpu(&dp->nfree, -nblocks);
+
+	BMAP_LOCK(bmp);
+
+	/* if this allocation group is completely free,
+	 * update the maximum allocation group number if this allocation
+	 * group is the new max.
+	 */
+	agno = blkno >> bmp->db_agl2size;
+	if (agno > bmp->db_maxag)
+		bmp->db_maxag = agno;
+
+	/* update the free count for the allocation group and map */
+	bmp->db_agfree[agno] -= nblocks;
+	bmp->db_nfree -= nblocks;
+
+	BMAP_UNLOCK(bmp);
+}
+
+
+/*
+ * NAME:	dbFreeBits()
+ *
+ * FUNCTION:	free a specified block range from a dmap.
+ *
+ *		this routine updates the dmap to reflect the working
+ *		state allocation of the specified block range. it directly
+ *		updates the bits of the working map and causes the adjustment
+ *		of the binary buddy system described by the dmap's dmtree
+ *		leaves to reflect the bits freed.  it also causes the dmap's
+ *		dmtree, as a whole, to reflect the deallocated range.
+ *
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	dp	-  pointer to dmap to free bits from.
+ *	blkno	-  starting block number of the bits to be freed.
+ *	nblocks	-  number of bits to be freed.
+ *
+ * RETURN VALUES: 0 for success
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+		       int nblocks)
+{
+	int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
+	dmtree_t *tp = (dmtree_t *) & dp->tree;
+	int rc = 0;
+	int size;
+
+	/* determine the bit number and word within the dmap of the
+	 * starting block.
+	 */
+	dbitno = blkno & (BPERDMAP - 1);
+	word = dbitno >> L2DBWORD;
+
+	/* block range better be within the dmap.
+	 */
+	assert(dbitno + nblocks <= BPERDMAP);
+
+	/* free the bits of the dmaps words corresponding to the block range.
+	 * not all bits of the first and last words may be contained within
+	 * the block range.  if this is the case, we'll work against those
+	 * words (i.e. partial first and/or last) on an individual basis
+	 * (a single pass), freeing the bits of interest by hand and updating
+	 * the leaf corresponding to the dmap word. a single pass will be used
+	 * for all dmap words fully contained within the specified range.
+	 * within this pass, the bits of all fully contained dmap words will
+	 * be marked as free in a single shot and the leaves will be updated. a
+	 * single leaf may describe the free space of multiple dmap words,
+	 * so we may update only a subset of the actual leaves corresponding
+	 * to the dmap words of the block range.
+	 *
+	 * dbJoin() is used to update leaf values and will join the binary
+	 * buddy system of the leaves if the new leaf values indicate this
+	 * should be done.
+	 */
+	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+		/* determine the bit number within the word and
+		 * the number of bits within the word.
+		 */
+		wbitno = dbitno & (DBWORD - 1);
+		nb = min(rembits, DBWORD - wbitno);
+
+		/* check if only part of a word is to be freed.
+		 */
+		if (nb < DBWORD) {
+			/* free (zero) the appropriate bits within this
+			 * dmap word.
+			 */
+			dp->wmap[word] &=
+			    cpu_to_le32(~(ONES << (DBWORD - nb)
+					  >> wbitno));
+
+			/* update the leaf for this dmap word.
+			 */
+			rc = dbJoin(tp, word,
+				    dbMaxBud((u8 *) & dp->wmap[word]));
+			if (rc)
+				return rc;
+
+			word += 1;
+		} else {
+			/* one or more dmap words are fully contained
+			 * within the block range.  determine how many
+			 * words and free (zero) the bits of these words.
+			 */
+			nwords = rembits >> L2DBWORD;
+			memset(&dp->wmap[word], 0, nwords * 4);
+
+			/* determine how many bits.
+			 */
+			nb = nwords << L2DBWORD;
+
+			/* now update the appropriate leaves to reflect
+			 * the freed words.
+			 */
+			for (; nwords > 0; nwords -= nw) {
+				/* determine what the leaf value should be
+				 * updated to as the minimum of the l2 number
+				 * of bits being freed and the l2 (max) number
+				 * of bits that can be described by this leaf.
+				 */
+				size =
+				    min(LITOL2BSZ
+					(word, L2LPERDMAP, BUDMIN),
+					NLSTOL2BSZ(nwords));
+
+				/* update the leaf.
+				 */
+				rc = dbJoin(tp, word, size);
+				if (rc)
+					return rc;
+
+				/* get the number of dmap words handled.
+				 */
+				nw = BUDSIZE(size, BUDMIN);
+				word += nw;
+			}
+		}
+	}
+
+	/* update the free count for this dmap.
+	 */
+	le32_add_cpu(&dp->nfree, nblocks);
+
+	BMAP_LOCK(bmp);
+
+	/* update the free count for the allocation group and
+	 * map.
+	 */
+	agno = blkno >> bmp->db_agl2size;
+	bmp->db_nfree += nblocks;
+	bmp->db_agfree[agno] += nblocks;
+
+	/* check if this allocation group is not completely free and
+	 * if it is currently the maximum (rightmost) allocation group.
+	 * if so, establish the new maximum allocation group number by
+	 * searching left for the first allocation group with allocation.
+	 */
+	if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
+	    (agno == bmp->db_numag - 1 &&
+	     bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
+		while (bmp->db_maxag > 0) {
+			bmp->db_maxag -= 1;
+			if (bmp->db_agfree[bmp->db_maxag] !=
+			    bmp->db_agsize)
+				break;
+		}
+
+		/* re-establish the allocation group preference if the
+		 * current preference is right of the maximum allocation
+		 * group.
+		 */
+		if (bmp->db_agpref > bmp->db_maxag)
+			bmp->db_agpref = bmp->db_maxag;
+	}
+
+	BMAP_UNLOCK(bmp);
+
+	return 0;
+}
+
+
+/*
+ * NAME:	dbAdjCtl()
+ *
+ * FUNCTION:	adjust a dmap control page at a specified level to reflect
+ *		the change in a lower level dmap or dmap control page's
+ *		maximum string of free blocks (i.e. a change in the root
+ *		of the lower level object's dmtree) due to the allocation
+ *		or deallocation of a range of blocks with a single dmap.
+ *
+ *		on entry, this routine is provided with the new value of
+ *		the lower level dmap or dmap control page root and the
+ *		starting block number of the block range whose allocation
+ *		or deallocation resulted in the root change.  this range
+ *		is respresented by a single leaf of the current dmapctl
+ *		and the leaf will be updated with this value, possibly
+ *		causing a binary buddy system within the leaves to be
+ *		split or joined.  the update may also cause the dmapctl's
+ *		dmtree to be updated.
+ *
+ *		if the adjustment of the dmap control page, itself, causes its
+ *		root to change, this change will be bubbled up to the next dmap
+ *		control level by a recursive call to this routine, specifying
+ *		the new root value and the next dmap control page level to
+ *		be adjusted.
+ * PARAMETERS:
+ *	bmp	-  pointer to bmap descriptor
+ *	blkno	-  the first block of a block range within a dmap.  it is
+ *		   the allocation or deallocation of this block range that
+ *		   requires the dmap control page to be adjusted.
+ *	newval	-  the new value of the lower level dmap or dmap control
+ *		   page root.
+ *	alloc	-  'true' if adjustment is due to an allocation.
+ *	level	-  current level of dmap control page (i.e. L0, L1, L2) to
+ *		   be adjusted.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
+{
+	struct metapage *mp;
+	s8 oldroot;
+	int oldval;
+	s64 lblkno;
+	struct dmapctl *dcp;
+	int rc, leafno, ti;
+
+	/* get the buffer for the dmap control page for the specified
+	 * block number and control page level.
+	 */
+	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
+	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+	if (mp == NULL)
+		return -EIO;
+	dcp = (struct dmapctl *) mp->data;
+
+	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
+		jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
+		release_metapage(mp);
+		return -EIO;
+	}
+
+	/* determine the leaf number corresponding to the block and
+	 * the index within the dmap control tree.
+	 */
+	leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
+	ti = leafno + le32_to_cpu(dcp->leafidx);
+
+	/* save the current leaf value and the current root level (i.e.
+	 * maximum l2 free string described by this dmapctl).
+	 */
+	oldval = dcp->stree[ti];
+	oldroot = dcp->stree[ROOT];
+
+	/* check if this is a control page update for an allocation.
+	 * if so, update the leaf to reflect the new leaf value using
+	 * dbSplit(); otherwise (deallocation), use dbJoin() to update
+	 * the leaf with the new value.  in addition to updating the
+	 * leaf, dbSplit() will also split the binary buddy system of
+	 * the leaves, if required, and bubble new values within the
+	 * dmapctl tree, if required.  similarly, dbJoin() will join
+	 * the binary buddy system of leaves and bubble new values up
+	 * the dmapctl tree as required by the new leaf value.
+	 */
+	if (alloc) {
+		/* check if we are in the middle of a binary buddy
+		 * system.  this happens when we are performing the
+		 * first allocation out of an allocation group that
+		 * is part (not the first part) of a larger binary
+		 * buddy system.  if we are in the middle, back split
+		 * the system prior to calling dbSplit() which assumes
+		 * that it is at the front of a binary buddy system.
+		 */
+		if (oldval == NOFREE) {
+			rc = dbBackSplit((dmtree_t *) dcp, leafno);
+			if (rc)
+				return rc;
+			oldval = dcp->stree[ti];
+		}
+		dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
+	} else {
+		rc = dbJoin((dmtree_t *) dcp, leafno, newval);
+		if (rc)
+			return rc;
+	}
+
+	/* check if the root of the current dmap control page changed due
+	 * to the update and if the current dmap control page is not at
+	 * the current top level (i.e. L0, L1, L2) of the map.  if so (i.e.
+	 * root changed and this is not the top level), call this routine
+	 * again (recursion) for the next higher level of the mapping to
+	 * reflect the change in root for the current dmap control page.
+	 */
+	if (dcp->stree[ROOT] != oldroot) {
+		/* are we below the top level of the map.  if so,
+		 * bubble the root up to the next higher level.
+		 */
+		if (level < bmp->db_maxlevel) {
+			/* bubble up the new root of this dmap control page to
+			 * the next level.
+			 */
+			if ((rc =
+			     dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
+				      level + 1))) {
+				/* something went wrong in bubbling up the new
+				 * root value, so backout the changes to the
+				 * current dmap control page.
+				 */
+				if (alloc) {
+					dbJoin((dmtree_t *) dcp, leafno,
+					       oldval);
+				} else {
+					/* the dbJoin() above might have
+					 * caused a larger binary buddy system
+					 * to form and we may now be in the
+					 * middle of it.  if this is the case,
+					 * back split the buddies.
+					 */
+					if (dcp->stree[ti] == NOFREE)
+						dbBackSplit((dmtree_t *)
+							    dcp, leafno);
+					dbSplit((dmtree_t *) dcp, leafno,
+						dcp->budmin, oldval);
+				}
+
+				/* release the buffer and return the error.
+				 */
+				release_metapage(mp);
+				return (rc);
+			}
+		} else {
+			/* we're at the top level of the map. update
+			 * the bmap control page to reflect the size
+			 * of the maximum free buddy system.
+			 */
+			assert(level == bmp->db_maxlevel);
+			if (bmp->db_maxfreebud != oldroot) {
+				jfs_error(bmp->db_ipbmap->i_sb,
+					  "the maximum free buddy is not the old root\n");
+			}
+			bmp->db_maxfreebud = dcp->stree[ROOT];
+		}
+	}
+
+	/* write the buffer.
+	 */
+	write_metapage(mp);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbSplit()
+ *
+ * FUNCTION:	update the leaf of a dmtree with a new value, splitting
+ *		the leaf from the binary buddy system of the dmtree's
+ *		leaves, as required.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree containing the leaf.
+ *	leafno	- the number of the leaf to be updated.
+ *	splitsz	- the size the binary buddy system starting at the leaf
+ *		  must be split to, specified as the log2 number of blocks.
+ *	newval	- the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
+{
+	int budsz;
+	int cursz;
+	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+
+	/* check if the leaf needs to be split.
+	 */
+	if (leaf[leafno] > tp->dmt_budmin) {
+		/* the split occurs by cutting the buddy system in half
+		 * at the specified leaf until we reach the specified
+		 * size.  pick up the starting split size (current size
+		 * - 1 in l2) and the corresponding buddy size.
+		 */
+		cursz = leaf[leafno] - 1;
+		budsz = BUDSIZE(cursz, tp->dmt_budmin);
+
+		/* split until we reach the specified size.
+		 */
+		while (cursz >= splitsz) {
+			/* update the buddy's leaf with its new value.
+			 */
+			dbAdjTree(tp, leafno ^ budsz, cursz);
+
+			/* on to the next size and buddy.
+			 */
+			cursz -= 1;
+			budsz >>= 1;
+		}
+	}
+
+	/* adjust the dmap tree to reflect the specified leaf's new
+	 * value.
+	 */
+	dbAdjTree(tp, leafno, newval);
+}
+
+
+/*
+ * NAME:	dbBackSplit()
+ *
+ * FUNCTION:	back split the binary buddy system of dmtree leaves
+ *		that hold a specified leaf until the specified leaf
+ *		starts its own binary buddy system.
+ *
+ *		the allocators typically perform allocations at the start
+ *		of binary buddy systems and dbSplit() is used to accomplish
+ *		any required splits.  in some cases, however, allocation
+ *		may occur in the middle of a binary system and requires a
+ *		back split, with the split proceeding out from the middle of
+ *		the system (less efficient) rather than the start of the
+ *		system (more efficient).  the cases in which a back split
+ *		is required are rare and are limited to the first allocation
+ *		within an allocation group which is a part (not first part)
+ *		of a larger binary buddy system and a few exception cases
+ *		in which a previous join operation must be backed out.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree containing the leaf.
+ *	leafno	- the number of the leaf to be updated.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbBackSplit(dmtree_t * tp, int leafno)
+{
+	int budsz, bud, w, bsz, size;
+	int cursz;
+	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+
+	/* leaf should be part (not first part) of a binary
+	 * buddy system.
+	 */
+	assert(leaf[leafno] == NOFREE);
+
+	/* the back split is accomplished by iteratively finding the leaf
+	 * that starts the buddy system that contains the specified leaf and
+	 * splitting that system in two.  this iteration continues until
+	 * the specified leaf becomes the start of a buddy system.
+	 *
+	 * determine maximum possible l2 size for the specified leaf.
+	 */
+	size =
+	    LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
+		      tp->dmt_budmin);
+
+	/* determine the number of leaves covered by this size.  this
+	 * is the buddy size that we will start with as we search for
+	 * the buddy system that contains the specified leaf.
+	 */
+	budsz = BUDSIZE(size, tp->dmt_budmin);
+
+	/* back split.
+	 */
+	while (leaf[leafno] == NOFREE) {
+		/* find the leftmost buddy leaf.
+		 */
+		for (w = leafno, bsz = budsz;; bsz <<= 1,
+		     w = (w < bud) ? w : bud) {
+			if (bsz >= le32_to_cpu(tp->dmt_nleafs)) {
+				jfs_err("JFS: block map error in dbBackSplit");
+				return -EIO;
+			}
+
+			/* determine the buddy.
+			 */
+			bud = w ^ bsz;
+
+			/* check if this buddy is the start of the system.
+			 */
+			if (leaf[bud] != NOFREE) {
+				/* split the leaf at the start of the
+				 * system in two.
+				 */
+				cursz = leaf[bud] - 1;
+				dbSplit(tp, bud, cursz, cursz);
+				break;
+			}
+		}
+	}
+
+	if (leaf[leafno] != size) {
+		jfs_err("JFS: wrong leaf value in dbBackSplit");
+		return -EIO;
+	}
+	return 0;
+}
+
+
+/*
+ * NAME:	dbJoin()
+ *
+ * FUNCTION:	update the leaf of a dmtree with a new value, joining
+ *		the leaf with other leaves of the dmtree into a multi-leaf
+ *		binary buddy system, as required.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree containing the leaf.
+ *	leafno	- the number of the leaf to be updated.
+ *	newval	- the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ */
+static int dbJoin(dmtree_t * tp, int leafno, int newval)
+{
+	int budsz, buddy;
+	s8 *leaf;
+
+	/* can the new leaf value require a join with other leaves ?
+	 */
+	if (newval >= tp->dmt_budmin) {
+		/* pickup a pointer to the leaves of the tree.
+		 */
+		leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+
+		/* try to join the specified leaf into a large binary
+		 * buddy system.  the join proceeds by attempting to join
+		 * the specified leafno with its buddy (leaf) at new value.
+		 * if the join occurs, we attempt to join the left leaf
+		 * of the joined buddies with its buddy at new value + 1.
+		 * we continue to join until we find a buddy that cannot be
+		 * joined (does not have a value equal to the size of the
+		 * last join) or until all leaves have been joined into a
+		 * single system.
+		 *
+		 * get the buddy size (number of words covered) of
+		 * the new value.
+		 */
+		budsz = BUDSIZE(newval, tp->dmt_budmin);
+
+		/* try to join.
+		 */
+		while (budsz < le32_to_cpu(tp->dmt_nleafs)) {
+			/* get the buddy leaf.
+			 */
+			buddy = leafno ^ budsz;
+
+			/* if the leaf's new value is greater than its
+			 * buddy's value, we join no more.
+			 */
+			if (newval > leaf[buddy])
+				break;
+
+			/* It shouldn't be less */
+			if (newval < leaf[buddy])
+				return -EIO;
+
+			/* check which (leafno or buddy) is the left buddy.
+			 * the left buddy gets to claim the blocks resulting
+			 * from the join while the right gets to claim none.
+			 * the left buddy is also eligible to participate in
+			 * a join at the next higher level while the right
+			 * is not.
+			 *
+			 */
+			if (leafno < buddy) {
+				/* leafno is the left buddy.
+				 */
+				dbAdjTree(tp, buddy, NOFREE);
+			} else {
+				/* buddy is the left buddy and becomes
+				 * leafno.
+				 */
+				dbAdjTree(tp, leafno, NOFREE);
+				leafno = buddy;
+			}
+
+			/* on to try the next join.
+			 */
+			newval += 1;
+			budsz <<= 1;
+		}
+	}
+
+	/* update the leaf value.
+	 */
+	dbAdjTree(tp, leafno, newval);
+
+	return 0;
+}
+
+
+/*
+ * NAME:	dbAdjTree()
+ *
+ * FUNCTION:	update a leaf of a dmtree with a new value, adjusting
+ *		the dmtree, as required, to reflect the new leaf value.
+ *		the combination of any buddies must already be done before
+ *		this is called.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree to be adjusted.
+ *	leafno	- the number of the leaf to be updated.
+ *	newval	- the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ */
+static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
+{
+	int lp, pp, k;
+	int max;
+
+	/* pick up the index of the leaf for this leafno.
+	 */
+	lp = leafno + le32_to_cpu(tp->dmt_leafidx);
+
+	/* is the current value the same as the old value ?  if so,
+	 * there is nothing to do.
+	 */
+	if (tp->dmt_stree[lp] == newval)
+		return;
+
+	/* set the new value.
+	 */
+	tp->dmt_stree[lp] = newval;
+
+	/* bubble the new value up the tree as required.
+	 */
+	for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
+		/* get the index of the first leaf of the 4 leaf
+		 * group containing the specified leaf (leafno).
+		 */
+		lp = ((lp - 1) & ~0x03) + 1;
+
+		/* get the index of the parent of this 4 leaf group.
+		 */
+		pp = (lp - 1) >> 2;
+
+		/* determine the maximum of the 4 leaves.
+		 */
+		max = TREEMAX(&tp->dmt_stree[lp]);
+
+		/* if the maximum of the 4 is the same as the
+		 * parent's value, we're done.
+		 */
+		if (tp->dmt_stree[pp] == max)
+			break;
+
+		/* parent gets new value.
+		 */
+		tp->dmt_stree[pp] = max;
+
+		/* parent becomes leaf for next go-round.
+		 */
+		lp = pp;
+	}
+}
+
+
+/*
+ * NAME:	dbFindLeaf()
+ *
+ * FUNCTION:	search a dmtree_t for sufficient free blocks, returning
+ *		the index of a leaf describing the free blocks if
+ *		sufficient free blocks are found.
+ *
+ *		the search starts at the top of the dmtree_t tree and
+ *		proceeds down the tree to the leftmost leaf with sufficient
+ *		free space.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree to be searched.
+ *	l2nb	- log2 number of free blocks to search for.
+ *	leafidx	- return pointer to be set to the index of the leaf
+ *		  describing at least l2nb free blocks if sufficient
+ *		  free blocks are found.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOSPC	- insufficient free blocks.
+ */
+static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
+{
+	int ti, n = 0, k, x = 0;
+
+	/* first check the root of the tree to see if there is
+	 * sufficient free space.
+	 */
+	if (l2nb > tp->dmt_stree[ROOT])
+		return -ENOSPC;
+
+	/* sufficient free space available. now search down the tree
+	 * starting at the next level for the leftmost leaf that
+	 * describes sufficient free space.
+	 */
+	for (k = le32_to_cpu(tp->dmt_height), ti = 1;
+	     k > 0; k--, ti = ((ti + n) << 2) + 1) {
+		/* search the four nodes at this level, starting from
+		 * the left.
+		 */
+		for (x = ti, n = 0; n < 4; n++) {
+			/* sufficient free space found.  move to the next
+			 * level (or quit if this is the last level).
+			 */
+			if (l2nb <= tp->dmt_stree[x + n])
+				break;
+		}
+
+		/* better have found something since the higher
+		 * levels of the tree said it was here.
+		 */
+		assert(n < 4);
+	}
+
+	/* set the return to the leftmost leaf describing sufficient
+	 * free space.
+	 */
+	*leafidx = x + n - le32_to_cpu(tp->dmt_leafidx);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbFindBits()
+ *
+ * FUNCTION:	find a specified number of binary buddy free bits within a
+ *		dmap bitmap word value.
+ *
+ *		this routine searches the bitmap value for (1 << l2nb) free
+ *		bits at (1 << l2nb) alignments within the value.
+ *
+ * PARAMETERS:
+ *	word	-  dmap bitmap word value.
+ *	l2nb	-  number of free bits specified as a log2 number.
+ *
+ * RETURN VALUES:
+ *	starting bit number of free bits.
+ */
+static int dbFindBits(u32 word, int l2nb)
+{
+	int bitno, nb;
+	u32 mask;
+
+	/* get the number of bits.
+	 */
+	nb = 1 << l2nb;
+	assert(nb <= DBWORD);
+
+	/* complement the word so we can use a mask (i.e. 0s represent
+	 * free bits) and compute the mask.
+	 */
+	word = ~word;
+	mask = ONES << (DBWORD - nb);
+
+	/* scan the word for nb free bits at nb alignments.
+	 */
+	for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) {
+		if ((mask & word) == mask)
+			break;
+	}
+
+	ASSERT(bitno < 32);
+
+	/* return the bit number.
+	 */
+	return (bitno);
+}
+
+
+/*
+ * NAME:	dbMaxBud(u8 *cp)
+ *
+ * FUNCTION:	determine the largest binary buddy string of free
+ *		bits within 32-bits of the map.
+ *
+ * PARAMETERS:
+ *	cp	-  pointer to the 32-bit value.
+ *
+ * RETURN VALUES:
+ *	largest binary buddy of free bits within a dmap word.
+ */
+static int dbMaxBud(u8 * cp)
+{
+	signed char tmp1, tmp2;
+
+	/* check if the wmap word is all free. if so, the
+	 * free buddy size is BUDMIN.
+	 */
+	if (*((uint *) cp) == 0)
+		return (BUDMIN);
+
+	/* check if the wmap word is half free. if so, the
+	 * free buddy size is BUDMIN-1.
+	 */
+	if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0)
+		return (BUDMIN - 1);
+
+	/* not all free or half free. determine the free buddy
+	 * size thru table lookup using quarters of the wmap word.
+	 */
+	tmp1 = max(budtab[cp[2]], budtab[cp[3]]);
+	tmp2 = max(budtab[cp[0]], budtab[cp[1]]);
+	return (max(tmp1, tmp2));
+}
+
+
+/*
+ * NAME:	cnttz(uint word)
+ *
+ * FUNCTION:	determine the number of trailing zeros within a 32-bit
+ *		value.
+ *
+ * PARAMETERS:
+ *	value	-  32-bit value to be examined.
+ *
+ * RETURN VALUES:
+ *	count of trailing zeros
+ */
+static int cnttz(u32 word)
+{
+	int n;
+
+	for (n = 0; n < 32; n++, word >>= 1) {
+		if (word & 0x01)
+			break;
+	}
+
+	return (n);
+}
+
+
+/*
+ * NAME:	cntlz(u32 value)
+ *
+ * FUNCTION:	determine the number of leading zeros within a 32-bit
+ *		value.
+ *
+ * PARAMETERS:
+ *	value	-  32-bit value to be examined.
+ *
+ * RETURN VALUES:
+ *	count of leading zeros
+ */
+static int cntlz(u32 value)
+{
+	int n;
+
+	for (n = 0; n < 32; n++, value <<= 1) {
+		if (value & HIGHORDER)
+			break;
+	}
+	return (n);
+}
+
+
+/*
+ * NAME:	blkstol2(s64 nb)
+ *
+ * FUNCTION:	convert a block count to its log2 value. if the block
+ *		count is not a l2 multiple, it is rounded up to the next
+ *		larger l2 multiple.
+ *
+ * PARAMETERS:
+ *	nb	-  number of blocks
+ *
+ * RETURN VALUES:
+ *	log2 number of blocks
+ */
+static int blkstol2(s64 nb)
+{
+	int l2nb;
+	s64 mask;		/* meant to be signed */
+
+	mask = (s64) 1 << (64 - 1);
+
+	/* count the leading bits.
+	 */
+	for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
+		/* leading bit found.
+		 */
+		if (nb & mask) {
+			/* determine the l2 value.
+			 */
+			l2nb = (64 - 1) - l2nb;
+
+			/* check if we need to round up.
+			 */
+			if (~mask & nb)
+				l2nb++;
+
+			return (l2nb);
+		}
+	}
+	assert(0);
+	return 0;		/* fix compiler warning */
+}
+
+
+/*
+ * NAME:	dbAllocBottomUp()
+ *
+ * FUNCTION:	alloc the specified block range from the working block
+ *		allocation map.
+ *
+ *		the blocks will be alloc from the working map one dmap
+ *		at a time.
+ *
+ * PARAMETERS:
+ *	ip	-  pointer to in-core inode;
+ *	blkno	-  starting block number to be freed.
+ *	nblocks	-  number of blocks to be freed.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ */
+int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
+{
+	struct metapage *mp;
+	struct dmap *dp;
+	int nb, rc;
+	s64 lblkno, rem;
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
+
+	/* block to be allocated better be within the mapsize. */
+	ASSERT(nblocks <= bmp->db_mapsize - blkno);
+
+	/*
+	 * allocate the blocks a dmap at a time.
+	 */
+	mp = NULL;
+	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
+		/* release previous dmap if any */
+		if (mp) {
+			write_metapage(mp);
+		}
+
+		/* get the buffer for the current dmap. */
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL) {
+			IREAD_UNLOCK(ipbmap);
+			return -EIO;
+		}
+		dp = (struct dmap *) mp->data;
+
+		/* determine the number of blocks to be allocated from
+		 * this dmap.
+		 */
+		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
+
+		/* allocate the blocks. */
+		if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
+			release_metapage(mp);
+			IREAD_UNLOCK(ipbmap);
+			return (rc);
+		}
+	}
+
+	/* write the last buffer. */
+	write_metapage(mp);
+
+	IREAD_UNLOCK(ipbmap);
+
+	return (0);
+}
+
+
+static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
+			 int nblocks)
+{
+	int rc;
+	int dbitno, word, rembits, nb, nwords, wbitno, agno;
+	s8 oldroot;
+	struct dmaptree *tp = (struct dmaptree *) & dp->tree;
+
+	/* save the current value of the root (i.e. maximum free string)
+	 * of the dmap tree.
+	 */
+	oldroot = tp->stree[ROOT];
+
+	/* determine the bit number and word within the dmap of the
+	 * starting block.
+	 */
+	dbitno = blkno & (BPERDMAP - 1);
+	word = dbitno >> L2DBWORD;
+
+	/* block range better be within the dmap */
+	assert(dbitno + nblocks <= BPERDMAP);
+
+	/* allocate the bits of the dmap's words corresponding to the block
+	 * range. not all bits of the first and last words may be contained
+	 * within the block range.  if this is the case, we'll work against
+	 * those words (i.e. partial first and/or last) on an individual basis
+	 * (a single pass), allocating the bits of interest by hand and
+	 * updating the leaf corresponding to the dmap word. a single pass
+	 * will be used for all dmap words fully contained within the
+	 * specified range.  within this pass, the bits of all fully contained
+	 * dmap words will be marked as free in a single shot and the leaves
+	 * will be updated. a single leaf may describe the free space of
+	 * multiple dmap words, so we may update only a subset of the actual
+	 * leaves corresponding to the dmap words of the block range.
+	 */
+	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+		/* determine the bit number within the word and
+		 * the number of bits within the word.
+		 */
+		wbitno = dbitno & (DBWORD - 1);
+		nb = min(rembits, DBWORD - wbitno);
+
+		/* check if only part of a word is to be allocated.
+		 */
+		if (nb < DBWORD) {
+			/* allocate (set to 1) the appropriate bits within
+			 * this dmap word.
+			 */
+			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
+						      >> wbitno);
+
+			word++;
+		} else {
+			/* one or more dmap words are fully contained
+			 * within the block range.  determine how many
+			 * words and allocate (set to 1) the bits of these
+			 * words.
+			 */
+			nwords = rembits >> L2DBWORD;
+			memset(&dp->wmap[word], (int) ONES, nwords * 4);
+
+			/* determine how many bits */
+			nb = nwords << L2DBWORD;
+			word += nwords;
+		}
+	}
+
+	/* update the free count for this dmap */
+	le32_add_cpu(&dp->nfree, -nblocks);
+
+	/* reconstruct summary tree */
+	dbInitDmapTree(dp);
+
+	BMAP_LOCK(bmp);
+
+	/* if this allocation group is completely free,
+	 * update the highest active allocation group number
+	 * if this allocation group is the new max.
+	 */
+	agno = blkno >> bmp->db_agl2size;
+	if (agno > bmp->db_maxag)
+		bmp->db_maxag = agno;
+
+	/* update the free count for the allocation group and map */
+	bmp->db_agfree[agno] -= nblocks;
+	bmp->db_nfree -= nblocks;
+
+	BMAP_UNLOCK(bmp);
+
+	/* if the root has not changed, done. */
+	if (tp->stree[ROOT] == oldroot)
+		return (0);
+
+	/* root changed. bubble the change up to the dmap control pages.
+	 * if the adjustment of the upper level control pages fails,
+	 * backout the bit allocation (thus making everything consistent).
+	 */
+	if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
+		dbFreeBits(bmp, dp, blkno, nblocks);
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	dbExtendFS()
+ *
+ * FUNCTION:	extend bmap from blkno for nblocks;
+ *		dbExtendFS() updates bmap ready for dbAllocBottomUp();
+ *
+ * L2
+ *  |
+ *   L1---------------------------------L1
+ *    |					 |
+ *     L0---------L0---------L0		  L0---------L0---------L0
+ *      |	   |	      |		   |	      |		 |
+ *	 d0,...,dn  d0,...,dn  d0,...,dn    d0,...,dn  d0,...,dn  d0,.,dm;
+ * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
+ *
+ * <---old---><----------------------------extend----------------------->
+ */
+int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
+	int nbperpage = sbi->nbperpage;
+	int i, i0 = true, j, j0 = true, k, n;
+	s64 newsize;
+	s64 p;
+	struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
+	struct dmapctl *l2dcp, *l1dcp, *l0dcp;
+	struct dmap *dp;
+	s8 *l0leaf, *l1leaf, *l2leaf;
+	struct bmap *bmp = sbi->bmap;
+	int agno, l2agsize, oldl2agsize;
+	s64 ag_rem;
+
+	newsize = blkno + nblocks;
+
+	jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
+		 (long long) blkno, (long long) nblocks, (long long) newsize);
+
+	/*
+	 *	initialize bmap control page.
+	 *
+	 * all the data in bmap control page should exclude
+	 * the mkfs hidden dmap page.
+	 */
+
+	/* update mapsize */
+	bmp->db_mapsize = newsize;
+	bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
+
+	/* compute new AG size */
+	l2agsize = dbGetL2AGSize(newsize);
+	oldl2agsize = bmp->db_agl2size;
+
+	bmp->db_agl2size = l2agsize;
+	bmp->db_agsize = 1 << l2agsize;
+
+	/* compute new number of AG */
+	agno = bmp->db_numag;
+	bmp->db_numag = newsize >> l2agsize;
+	bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
+
+	/*
+	 *	reconfigure db_agfree[]
+	 * from old AG configuration to new AG configuration;
+	 *
+	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
+	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
+	 * note: new AG size = old AG size * (2**x).
+	 */
+	if (l2agsize == oldl2agsize)
+		goto extend;
+	k = 1 << (l2agsize - oldl2agsize);
+	ag_rem = bmp->db_agfree[0];	/* save agfree[0] */
+	for (i = 0, n = 0; i < agno; n++) {
+		bmp->db_agfree[n] = 0;	/* init collection point */
+
+		/* coalesce contiguous k AGs; */
+		for (j = 0; j < k && i < agno; j++, i++) {
+			/* merge AGi to AGn */
+			bmp->db_agfree[n] += bmp->db_agfree[i];
+		}
+	}
+	bmp->db_agfree[0] += ag_rem;	/* restore agfree[0] */
+
+	for (; n < MAXAG; n++)
+		bmp->db_agfree[n] = 0;
+
+	/*
+	 * update highest active ag number
+	 */
+
+	bmp->db_maxag = bmp->db_maxag / k;
+
+	/*
+	 *	extend bmap
+	 *
+	 * update bit maps and corresponding level control pages;
+	 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
+	 */
+      extend:
+	/* get L2 page */
+	p = BMAPBLKNO + nbperpage;	/* L2 page */
+	l2mp = read_metapage(ipbmap, p, PSIZE, 0);
+	if (!l2mp) {
+		jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
+		return -EIO;
+	}
+	l2dcp = (struct dmapctl *) l2mp->data;
+
+	/* compute start L1 */
+	k = blkno >> L2MAXL1SIZE;
+	l2leaf = l2dcp->stree + CTLLEAFIND + k;
+	p = BLKTOL1(blkno, sbi->l2nbperpage);	/* L1 page */
+
+	/*
+	 * extend each L1 in L2
+	 */
+	for (; k < LPERCTL; k++, p += nbperpage) {
+		/* get L1 page */
+		if (j0) {
+			/* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
+			l1mp = read_metapage(ipbmap, p, PSIZE, 0);
+			if (l1mp == NULL)
+				goto errout;
+			l1dcp = (struct dmapctl *) l1mp->data;
+
+			/* compute start L0 */
+			j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
+			l1leaf = l1dcp->stree + CTLLEAFIND + j;
+			p = BLKTOL0(blkno, sbi->l2nbperpage);
+			j0 = false;
+		} else {
+			/* assign/init L1 page */
+			l1mp = get_metapage(ipbmap, p, PSIZE, 0);
+			if (l1mp == NULL)
+				goto errout;
+
+			l1dcp = (struct dmapctl *) l1mp->data;
+
+			/* compute start L0 */
+			j = 0;
+			l1leaf = l1dcp->stree + CTLLEAFIND;
+			p += nbperpage;	/* 1st L0 of L1.k */
+		}
+
+		/*
+		 * extend each L0 in L1
+		 */
+		for (; j < LPERCTL; j++) {
+			/* get L0 page */
+			if (i0) {
+				/* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
+
+				l0mp = read_metapage(ipbmap, p, PSIZE, 0);
+				if (l0mp == NULL)
+					goto errout;
+				l0dcp = (struct dmapctl *) l0mp->data;
+
+				/* compute start dmap */
+				i = (blkno & (MAXL0SIZE - 1)) >>
+				    L2BPERDMAP;
+				l0leaf = l0dcp->stree + CTLLEAFIND + i;
+				p = BLKTODMAP(blkno,
+					      sbi->l2nbperpage);
+				i0 = false;
+			} else {
+				/* assign/init L0 page */
+				l0mp = get_metapage(ipbmap, p, PSIZE, 0);
+				if (l0mp == NULL)
+					goto errout;
+
+				l0dcp = (struct dmapctl *) l0mp->data;
+
+				/* compute start dmap */
+				i = 0;
+				l0leaf = l0dcp->stree + CTLLEAFIND;
+				p += nbperpage;	/* 1st dmap of L0.j */
+			}
+
+			/*
+			 * extend each dmap in L0
+			 */
+			for (; i < LPERCTL; i++) {
+				/*
+				 * reconstruct the dmap page, and
+				 * initialize corresponding parent L0 leaf
+				 */
+				if ((n = blkno & (BPERDMAP - 1))) {
+					/* read in dmap page: */
+					mp = read_metapage(ipbmap, p,
+							   PSIZE, 0);
+					if (mp == NULL)
+						goto errout;
+					n = min(nblocks, (s64)BPERDMAP - n);
+				} else {
+					/* assign/init dmap page */
+					mp = read_metapage(ipbmap, p,
+							   PSIZE, 0);
+					if (mp == NULL)
+						goto errout;
+
+					n = min_t(s64, nblocks, BPERDMAP);
+				}
+
+				dp = (struct dmap *) mp->data;
+				*l0leaf = dbInitDmap(dp, blkno, n);
+
+				bmp->db_nfree += n;
+				agno = le64_to_cpu(dp->start) >> l2agsize;
+				bmp->db_agfree[agno] += n;
+
+				write_metapage(mp);
+
+				l0leaf++;
+				p += nbperpage;
+
+				blkno += n;
+				nblocks -= n;
+				if (nblocks == 0)
+					break;
+			}	/* for each dmap in a L0 */
+
+			/*
+			 * build current L0 page from its leaves, and
+			 * initialize corresponding parent L1 leaf
+			 */
+			*l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
+			write_metapage(l0mp);
+			l0mp = NULL;
+
+			if (nblocks)
+				l1leaf++;	/* continue for next L0 */
+			else {
+				/* more than 1 L0 ? */
+				if (j > 0)
+					break;	/* build L1 page */
+				else {
+					/* summarize in global bmap page */
+					bmp->db_maxfreebud = *l1leaf;
+					release_metapage(l1mp);
+					release_metapage(l2mp);
+					goto finalize;
+				}
+			}
+		}		/* for each L0 in a L1 */
+
+		/*
+		 * build current L1 page from its leaves, and
+		 * initialize corresponding parent L2 leaf
+		 */
+		*l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
+		write_metapage(l1mp);
+		l1mp = NULL;
+
+		if (nblocks)
+			l2leaf++;	/* continue for next L1 */
+		else {
+			/* more than 1 L1 ? */
+			if (k > 0)
+				break;	/* build L2 page */
+			else {
+				/* summarize in global bmap page */
+				bmp->db_maxfreebud = *l2leaf;
+				release_metapage(l2mp);
+				goto finalize;
+			}
+		}
+	}			/* for each L1 in a L2 */
+
+	jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
+errout:
+	if (l0mp)
+		release_metapage(l0mp);
+	if (l1mp)
+		release_metapage(l1mp);
+	release_metapage(l2mp);
+	return -EIO;
+
+	/*
+	 *	finalize bmap control page
+	 */
+finalize:
+
+	return 0;
+}
+
+
+/*
+ *	dbFinalizeBmap()
+ */
+void dbFinalizeBmap(struct inode *ipbmap)
+{
+	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+	int actags, inactags, l2nl;
+	s64 ag_rem, actfree, inactfree, avgfree;
+	int i, n;
+
+	/*
+	 *	finalize bmap control page
+	 */
+//finalize:
+	/*
+	 * compute db_agpref: preferred ag to allocate from
+	 * (the leftmost ag with average free space in it);
+	 */
+//agpref:
+	/* get the number of active ags and inacitve ags */
+	actags = bmp->db_maxag + 1;
+	inactags = bmp->db_numag - actags;
+	ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1);	/* ??? */
+
+	/* determine how many blocks are in the inactive allocation
+	 * groups. in doing this, we must account for the fact that
+	 * the rightmost group might be a partial group (i.e. file
+	 * system size is not a multiple of the group size).
+	 */
+	inactfree = (inactags && ag_rem) ?
+	    ((inactags - 1) << bmp->db_agl2size) + ag_rem
+	    : inactags << bmp->db_agl2size;
+
+	/* determine how many free blocks are in the active
+	 * allocation groups plus the average number of free blocks
+	 * within the active ags.
+	 */
+	actfree = bmp->db_nfree - inactfree;
+	avgfree = (u32) actfree / (u32) actags;
+
+	/* if the preferred allocation group has not average free space.
+	 * re-establish the preferred group as the leftmost
+	 * group with average free space.
+	 */
+	if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
+		for (bmp->db_agpref = 0; bmp->db_agpref < actags;
+		     bmp->db_agpref++) {
+			if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
+				break;
+		}
+		if (bmp->db_agpref >= bmp->db_numag) {
+			jfs_error(ipbmap->i_sb,
+				  "cannot find ag with average freespace\n");
+		}
+	}
+
+	/*
+	 * compute db_aglevel, db_agheight, db_width, db_agstart:
+	 * an ag is covered in aglevel dmapctl summary tree,
+	 * at agheight level height (from leaf) with agwidth number of nodes
+	 * each, which starts at agstart index node of the smmary tree node
+	 * array;
+	 */
+	bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
+	l2nl =
+	    bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
+	bmp->db_agheight = l2nl >> 1;
+	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
+	for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
+	     i--) {
+		bmp->db_agstart += n;
+		n <<= 2;
+	}
+
+}
+
+
+/*
+ * NAME:	dbInitDmap()/ujfs_idmap_page()
+ *
+ * FUNCTION:	initialize working/persistent bitmap of the dmap page
+ *		for the specified number of blocks:
+ *
+ *		at entry, the bitmaps had been initialized as free (ZEROS);
+ *		The number of blocks will only account for the actually
+ *		existing blocks. Blocks which don't actually exist in
+ *		the aggregate will be marked as allocated (ONES);
+ *
+ * PARAMETERS:
+ *	dp	- pointer to page of map
+ *	nblocks	- number of blocks this page
+ *
+ * RETURNS: NONE
+ */
+static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
+{
+	int blkno, w, b, r, nw, nb, i;
+
+	/* starting block number within the dmap */
+	blkno = Blkno & (BPERDMAP - 1);
+
+	if (blkno == 0) {
+		dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
+		dp->start = cpu_to_le64(Blkno);
+
+		if (nblocks == BPERDMAP) {
+			memset(&dp->wmap[0], 0, LPERDMAP * 4);
+			memset(&dp->pmap[0], 0, LPERDMAP * 4);
+			goto initTree;
+		}
+	} else {
+		le32_add_cpu(&dp->nblocks, nblocks);
+		le32_add_cpu(&dp->nfree, nblocks);
+	}
+
+	/* word number containing start block number */
+	w = blkno >> L2DBWORD;
+
+	/*
+	 * free the bits corresponding to the block range (ZEROS):
+	 * note: not all bits of the first and last words may be contained
+	 * within the block range.
+	 */
+	for (r = nblocks; r > 0; r -= nb, blkno += nb) {
+		/* number of bits preceding range to be freed in the word */
+		b = blkno & (DBWORD - 1);
+		/* number of bits to free in the word */
+		nb = min(r, DBWORD - b);
+
+		/* is partial word to be freed ? */
+		if (nb < DBWORD) {
+			/* free (set to 0) from the bitmap word */
+			dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
+						     >> b));
+			dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
+						     >> b));
+
+			/* skip the word freed */
+			w++;
+		} else {
+			/* free (set to 0) contiguous bitmap words */
+			nw = r >> L2DBWORD;
+			memset(&dp->wmap[w], 0, nw * 4);
+			memset(&dp->pmap[w], 0, nw * 4);
+
+			/* skip the words freed */
+			nb = nw << L2DBWORD;
+			w += nw;
+		}
+	}
+
+	/*
+	 * mark bits following the range to be freed (non-existing
+	 * blocks) as allocated (ONES)
+	 */
+
+	if (blkno == BPERDMAP)
+		goto initTree;
+
+	/* the first word beyond the end of existing blocks */
+	w = blkno >> L2DBWORD;
+
+	/* does nblocks fall on a 32-bit boundary ? */
+	b = blkno & (DBWORD - 1);
+	if (b) {
+		/* mark a partial word allocated */
+		dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
+		w++;
+	}
+
+	/* set the rest of the words in the page to allocated (ONES) */
+	for (i = w; i < LPERDMAP; i++)
+		dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES);
+
+	/*
+	 * init tree
+	 */
+      initTree:
+	return (dbInitDmapTree(dp));
+}
+
+
+/*
+ * NAME:	dbInitDmapTree()/ujfs_complete_dmap()
+ *
+ * FUNCTION:	initialize summary tree of the specified dmap:
+ *
+ *		at entry, bitmap of the dmap has been initialized;
+ *
+ * PARAMETERS:
+ *	dp	- dmap to complete
+ *	blkno	- starting block number for this dmap
+ *	treemax	- will be filled in with max free for this dmap
+ *
+ * RETURNS:	max free string at the root of the tree
+ */
+static int dbInitDmapTree(struct dmap * dp)
+{
+	struct dmaptree *tp;
+	s8 *cp;
+	int i;
+
+	/* init fixed info of tree */
+	tp = &dp->tree;
+	tp->nleafs = cpu_to_le32(LPERDMAP);
+	tp->l2nleafs = cpu_to_le32(L2LPERDMAP);
+	tp->leafidx = cpu_to_le32(LEAFIND);
+	tp->height = cpu_to_le32(4);
+	tp->budmin = BUDMIN;
+
+	/* init each leaf from corresponding wmap word:
+	 * note: leaf is set to NOFREE(-1) if all blocks of corresponding
+	 * bitmap word are allocated.
+	 */
+	cp = tp->stree + le32_to_cpu(tp->leafidx);
+	for (i = 0; i < LPERDMAP; i++)
+		*cp++ = dbMaxBud((u8 *) & dp->wmap[i]);
+
+	/* build the dmap's binary buddy summary tree */
+	return (dbInitTree(tp));
+}
+
+
+/*
+ * NAME:	dbInitTree()/ujfs_adjtree()
+ *
+ * FUNCTION:	initialize binary buddy summary tree of a dmap or dmapctl.
+ *
+ *		at entry, the leaves of the tree has been initialized
+ *		from corresponding bitmap word or root of summary tree
+ *		of the child control page;
+ *		configure binary buddy system at the leaf level, then
+ *		bubble up the values of the leaf nodes up the tree.
+ *
+ * PARAMETERS:
+ *	cp	- Pointer to the root of the tree
+ *	l2leaves- Number of leaf nodes as a power of 2
+ *	l2min	- Number of blocks that can be covered by a leaf
+ *		  as a power of 2
+ *
+ * RETURNS: max free string at the root of the tree
+ */
+static int dbInitTree(struct dmaptree * dtp)
+{
+	int l2max, l2free, bsize, nextb, i;
+	int child, parent, nparent;
+	s8 *tp, *cp, *cp1;
+
+	tp = dtp->stree;
+
+	/* Determine the maximum free string possible for the leaves */
+	l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
+
+	/*
+	 * configure the leaf levevl into binary buddy system
+	 *
+	 * Try to combine buddies starting with a buddy size of 1
+	 * (i.e. two leaves). At a buddy size of 1 two buddy leaves
+	 * can be combined if both buddies have a maximum free of l2min;
+	 * the combination will result in the left-most buddy leaf having
+	 * a maximum free of l2min+1.
+	 * After processing all buddies for a given size, process buddies
+	 * at the next higher buddy size (i.e. current size * 2) and
+	 * the next maximum free (current free + 1).
+	 * This continues until the maximum possible buddy combination
+	 * yields maximum free.
+	 */
+	for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
+	     l2free++, bsize = nextb) {
+		/* get next buddy size == current buddy pair size */
+		nextb = bsize << 1;
+
+		/* scan each adjacent buddy pair at current buddy size */
+		for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
+		     i < le32_to_cpu(dtp->nleafs);
+		     i += nextb, cp += nextb) {
+			/* coalesce if both adjacent buddies are max free */
+			if (*cp == l2free && *(cp + bsize) == l2free) {
+				*cp = l2free + 1;	/* left take right */
+				*(cp + bsize) = -1;	/* right give left */
+			}
+		}
+	}
+
+	/*
+	 * bubble summary information of leaves up the tree.
+	 *
+	 * Starting at the leaf node level, the four nodes described by
+	 * the higher level parent node are compared for a maximum free and
+	 * this maximum becomes the value of the parent node.
+	 * when all lower level nodes are processed in this fashion then
+	 * move up to the next level (parent becomes a lower level node) and
+	 * continue the process for that level.
+	 */
+	for (child = le32_to_cpu(dtp->leafidx),
+	     nparent = le32_to_cpu(dtp->nleafs) >> 2;
+	     nparent > 0; nparent >>= 2, child = parent) {
+		/* get index of 1st node of parent level */
+		parent = (child - 1) >> 2;
+
+		/* set the value of the parent node as the maximum
+		 * of the four nodes of the current level.
+		 */
+		for (i = 0, cp = tp + child, cp1 = tp + parent;
+		     i < nparent; i++, cp += 4, cp1++)
+			*cp1 = TREEMAX(cp);
+	}
+
+	return (*tp);
+}
+
+
+/*
+ *	dbInitDmapCtl()
+ *
+ * function: initialize dmapctl page
+ */
+static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
+{				/* start leaf index not covered by range */
+	s8 *cp;
+
+	dcp->nleafs = cpu_to_le32(LPERCTL);
+	dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
+	dcp->leafidx = cpu_to_le32(CTLLEAFIND);
+	dcp->height = cpu_to_le32(5);
+	dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
+
+	/*
+	 * initialize the leaves of current level that were not covered
+	 * by the specified input block range (i.e. the leaves have no
+	 * low level dmapctl or dmap).
+	 */
+	cp = &dcp->stree[CTLLEAFIND + i];
+	for (; i < LPERCTL; i++)
+		*cp++ = NOFREE;
+
+	/* build the dmap's binary buddy summary tree */
+	return (dbInitTree((struct dmaptree *) dcp));
+}
+
+
+/*
+ * NAME:	dbGetL2AGSize()/ujfs_getagl2size()
+ *
+ * FUNCTION:	Determine log2(allocation group size) from aggregate size
+ *
+ * PARAMETERS:
+ *	nblocks	- Number of blocks in aggregate
+ *
+ * RETURNS: log2(allocation group size) in aggregate blocks
+ */
+static int dbGetL2AGSize(s64 nblocks)
+{
+	s64 sz;
+	s64 m;
+	int l2sz;
+
+	if (nblocks < BPERDMAP * MAXAG)
+		return (L2BPERDMAP);
+
+	/* round up aggregate size to power of 2 */
+	m = ((u64) 1 << (64 - 1));
+	for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
+		if (m & nblocks)
+			break;
+	}
+
+	sz = (s64) 1 << l2sz;
+	if (sz < nblocks)
+		l2sz += 1;
+
+	/* agsize = roundupSize/max_number_of_ag */
+	return (l2sz - L2MAXAG);
+}
+
+
+/*
+ * NAME:	dbMapFileSizeToMapSize()
+ *
+ * FUNCTION:	compute number of blocks the block allocation map file
+ *		can cover from the map file size;
+ *
+ * RETURNS:	Number of blocks which can be covered by this block map file;
+ */
+
+/*
+ * maximum number of map pages at each level including control pages
+ */
+#define MAXL0PAGES	(1 + LPERCTL)
+#define MAXL1PAGES	(1 + LPERCTL * MAXL0PAGES)
+#define MAXL2PAGES	(1 + LPERCTL * MAXL1PAGES)
+
+/*
+ * convert number of map pages to the zero origin top dmapctl level
+ */
+#define BMAPPGTOLEV(npages)	\
+	(((npages) <= 3 + MAXL0PAGES) ? 0 : \
+	 ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
+
+s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
+{
+	struct super_block *sb = ipbmap->i_sb;
+	s64 nblocks;
+	s64 npages, ndmaps;
+	int level, i;
+	int complete, factor;
+
+	nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
+	npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
+	level = BMAPPGTOLEV(npages);
+
+	/* At each level, accumulate the number of dmap pages covered by
+	 * the number of full child levels below it;
+	 * repeat for the last incomplete child level.
+	 */
+	ndmaps = 0;
+	npages--;		/* skip the first global control page */
+	/* skip higher level control pages above top level covered by map */
+	npages -= (2 - level);
+	npages--;		/* skip top level's control page */
+	for (i = level; i >= 0; i--) {
+		factor =
+		    (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
+		complete = (u32) npages / factor;
+		ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL :
+				      ((i == 1) ? LPERCTL : 1));
+
+		/* pages in last/incomplete child */
+		npages = (u32) npages % factor;
+		/* skip incomplete child's level control page */
+		npages--;
+	}
+
+	/* convert the number of dmaps into the number of blocks
+	 * which can be covered by the dmaps;
+	 */
+	nblocks = ndmaps << L2BPERDMAP;
+
+	return (nblocks);
+}
diff --git a/kernel/fs/jfs/jfs_dmap.h b/kernel/fs/jfs/jfs_dmap.h
new file mode 100644
index 000000000..562b9a7e4
--- /dev/null
+++ b/kernel/fs/jfs/jfs_dmap.h
@@ -0,0 +1,316 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef	_H_JFS_DMAP
+#define _H_JFS_DMAP
+
+#include "jfs_txnmgr.h"
+
+#define BMAPVERSION	1	/* version number */
+#define	TREESIZE	(256+64+16+4+1)	/* size of a dmap tree */
+#define	LEAFIND		(64+16+4+1)	/* index of 1st leaf of a dmap tree */
+#define LPERDMAP	256	/* num leaves per dmap tree */
+#define L2LPERDMAP	8	/* l2 number of leaves per dmap tree */
+#define	DBWORD		32	/* # of blks covered by a map word */
+#define	L2DBWORD	5	/* l2 # of blks covered by a mword */
+#define BUDMIN		L2DBWORD	/* max free string in a map word */
+#define BPERDMAP	(LPERDMAP * DBWORD)	/* num of blks per dmap */
+#define L2BPERDMAP	13	/* l2 num of blks per dmap */
+#define CTLTREESIZE	(1024+256+64+16+4+1)	/* size of a dmapctl tree */
+#define CTLLEAFIND	(256+64+16+4+1)	/* idx of 1st leaf of a dmapctl tree */
+#define LPERCTL		1024	/* num of leaves per dmapctl tree */
+#define L2LPERCTL	10	/* l2 num of leaves per dmapctl tree */
+#define	ROOT		0	/* index of the root of a tree */
+#define	NOFREE		((s8) -1)	/* no blocks free */
+#define	MAXAG		128	/* max number of allocation groups */
+#define L2MAXAG		7	/* l2 max num of AG */
+#define L2MINAGSZ	25	/* l2 of minimum AG size in bytes */
+#define	BMAPBLKNO	0	/* lblkno of bmap within the map */
+
+/*
+ * maximum l2 number of disk blocks at the various dmapctl levels.
+ */
+#define	L2MAXL0SIZE	(L2BPERDMAP + 1 * L2LPERCTL)
+#define	L2MAXL1SIZE	(L2BPERDMAP + 2 * L2LPERCTL)
+#define	L2MAXL2SIZE	(L2BPERDMAP + 3 * L2LPERCTL)
+
+/*
+ * maximum number of disk blocks at the various dmapctl levels.
+ */
+#define	MAXL0SIZE	((s64)1 << L2MAXL0SIZE)
+#define	MAXL1SIZE	((s64)1 << L2MAXL1SIZE)
+#define	MAXL2SIZE	((s64)1 << L2MAXL2SIZE)
+
+#define	MAXMAPSIZE	MAXL2SIZE	/* maximum aggregate map size */
+
+/*
+ * determine the maximum free string for four (lower level) nodes
+ * of the tree.
+ */
+static inline signed char TREEMAX(signed char *cp)
+{
+	signed char tmp1, tmp2;
+
+	tmp1 = max(*(cp+2), *(cp+3));
+	tmp2 = max(*(cp), *(cp+1));
+
+	return max(tmp1, tmp2);
+}
+
+/*
+ * convert disk block number to the logical block number of the dmap
+ * describing the disk block.  s is the log2(number of logical blocks per page)
+ *
+ * The calculation figures out how many logical pages are in front of the dmap.
+ *	- the number of dmaps preceding it
+ *	- the number of L0 pages preceding its L0 page
+ *	- the number of L1 pages preceding its L1 page
+ *	- 3 is added to account for the L2, L1, and L0 page for this dmap
+ *	- 1 is added to account for the control page of the map.
+ */
+#define BLKTODMAP(b,s)    \
+	((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))
+
+/*
+ * convert disk block number to the logical block number of the LEVEL 0
+ * dmapctl describing the disk block.  s is the log2(number of logical blocks
+ * per page)
+ *
+ * The calculation figures out how many logical pages are in front of the L0.
+ *	- the number of dmap pages preceding it
+ *	- the number of L0 pages preceding it
+ *	- the number of L1 pages preceding its L1 page
+ *	- 2 is added to account for the L2, and L1 page for this L0
+ *	- 1 is added to account for the control page of the map.
+ */
+#define BLKTOL0(b,s)      \
+	(((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s))
+
+/*
+ * convert disk block number to the logical block number of the LEVEL 1
+ * dmapctl describing the disk block.  s is the log2(number of logical blocks
+ * per page)
+ *
+ * The calculation figures out how many logical pages are in front of the L1.
+ *	- the number of dmap pages preceding it
+ *	- the number of L0 pages preceding it
+ *	- the number of L1 pages preceding it
+ *	- 1 is added to account for the L2 page
+ *	- 1 is added to account for the control page of the map.
+ */
+#define BLKTOL1(b,s)      \
+     (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s))
+
+/*
+ * convert disk block number to the logical block number of the dmapctl
+ * at the specified level which describes the disk block.
+ */
+#define BLKTOCTL(b,s,l)   \
+	(((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
+
+/*
+ * convert aggregate map size to the zero origin dmapctl level of the
+ * top dmapctl.
+ */
+#define	BMAPSZTOLEV(size)	\
+	(((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2)
+
+/* convert disk block number to allocation group number.
+ */
+#define BLKTOAG(b,sbi)	((b) >> ((sbi)->bmap->db_agl2size))
+
+/* convert allocation group number to starting disk block
+ * number.
+ */
+#define AGTOBLK(a,ip)	\
+	((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size))
+
+/*
+ *	dmap summary tree
+ *
+ * dmaptree must be consistent with dmapctl.
+ */
+struct dmaptree {
+	__le32 nleafs;		/* 4: number of tree leafs	*/
+	__le32 l2nleafs;	/* 4: l2 number of tree leafs	*/
+	__le32 leafidx;		/* 4: index of first tree leaf	*/
+	__le32 height;		/* 4: height of the tree	*/
+	s8 budmin;		/* 1: min l2 tree leaf value to combine */
+	s8 stree[TREESIZE];	/* TREESIZE: tree		*/
+	u8 pad[2];		/* 2: pad to word boundary	*/
+};				/* - 360 -			*/
+
+/*
+ *	dmap page per 8K blocks bitmap
+ */
+struct dmap {
+	__le32 nblocks;		/* 4: num blks covered by this dmap	*/
+	__le32 nfree;		/* 4: num of free blks in this dmap	*/
+	__le64 start;		/* 8: starting blkno for this dmap	*/
+	struct dmaptree tree;	/* 360: dmap tree			*/
+	u8 pad[1672];		/* 1672: pad to 2048 bytes		*/
+	__le32 wmap[LPERDMAP];	/* 1024: bits of the working map	*/
+	__le32 pmap[LPERDMAP];	/* 1024: bits of the persistent map	*/
+};				/* - 4096 -				*/
+
+/*
+ *	disk map control page per level.
+ *
+ * dmapctl must be consistent with dmaptree.
+ */
+struct dmapctl {
+	__le32 nleafs;		/* 4: number of tree leafs	*/
+	__le32 l2nleafs;	/* 4: l2 number of tree leafs	*/
+	__le32 leafidx;		/* 4: index of the first tree leaf	*/
+	__le32 height;		/* 4: height of tree		*/
+	s8 budmin;		/* 1: minimum l2 tree leaf value	*/
+	s8 stree[CTLTREESIZE];	/* CTLTREESIZE: dmapctl tree	*/
+	u8 pad[2714];		/* 2714: pad to 4096		*/
+};				/* - 4096 -			*/
+
+/*
+ *	common definition for dmaptree within dmap and dmapctl
+ */
+typedef union dmtree {
+	struct dmaptree t1;
+	struct dmapctl t2;
+} dmtree_t;
+
+/* macros for accessing fields within dmtree */
+#define	dmt_nleafs	t1.nleafs
+#define	dmt_l2nleafs	t1.l2nleafs
+#define	dmt_leafidx	t1.leafidx
+#define	dmt_height	t1.height
+#define	dmt_budmin	t1.budmin
+#define	dmt_stree	t1.stree
+
+/*
+ *	on-disk aggregate disk allocation map descriptor.
+ */
+struct dbmap_disk {
+	__le64 dn_mapsize;	/* 8: number of blocks in aggregate	*/
+	__le64 dn_nfree;	/* 8: num free blks in aggregate map	*/
+	__le32 dn_l2nbperpage;	/* 4: number of blks per page		*/
+	__le32 dn_numag;	/* 4: total number of ags		*/
+	__le32 dn_maxlevel;	/* 4: number of active ags		*/
+	__le32 dn_maxag;	/* 4: max active alloc group number	*/
+	__le32 dn_agpref;	/* 4: preferred alloc group (hint)	*/
+	__le32 dn_aglevel;	/* 4: dmapctl level holding the AG	*/
+	__le32 dn_agheight;	/* 4: height in dmapctl of the AG	*/
+	__le32 dn_agwidth;	/* 4: width in dmapctl of the AG	*/
+	__le32 dn_agstart;	/* 4: start tree index at AG height	*/
+	__le32 dn_agl2size;	/* 4: l2 num of blks per alloc group	*/
+	__le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count		*/
+	__le64 dn_agsize;	/* 8: num of blks per alloc group	*/
+	s8 dn_maxfreebud;	/* 1: max free buddy system		*/
+	u8 pad[3007];		/* 3007: pad to 4096			*/
+};				/* - 4096 -				*/
+
+struct dbmap {
+	s64 dn_mapsize;		/* number of blocks in aggregate	*/
+	s64 dn_nfree;		/* num free blks in aggregate map	*/
+	int dn_l2nbperpage;	/* number of blks per page		*/
+	int dn_numag;		/* total number of ags			*/
+	int dn_maxlevel;	/* number of active ags			*/
+	int dn_maxag;		/* max active alloc group number	*/
+	int dn_agpref;		/* preferred alloc group (hint)		*/
+	int dn_aglevel;		/* dmapctl level holding the AG		*/
+	int dn_agheight;	/* height in dmapctl of the AG		*/
+	int dn_agwidth;		/* width in dmapctl of the AG		*/
+	int dn_agstart;		/* start tree index at AG height	*/
+	int dn_agl2size;	/* l2 num of blks per alloc group	*/
+	s64 dn_agfree[MAXAG];	/* per AG free count			*/
+	s64 dn_agsize;		/* num of blks per alloc group		*/
+	signed char dn_maxfreebud;	/* max free buddy system	*/
+};				/* - 4096 -				*/
+/*
+ *	in-memory aggregate disk allocation map descriptor.
+ */
+struct bmap {
+	struct dbmap db_bmap;		/* on-disk aggregate map descriptor */
+	struct inode *db_ipbmap;	/* ptr to aggregate map incore inode */
+	struct mutex db_bmaplock;	/* aggregate map lock */
+	atomic_t db_active[MAXAG];	/* count of active, open files in AG */
+	u32 *db_DBmap;
+};
+
+/* macros for accessing fields within in-memory aggregate map descriptor */
+#define	db_mapsize	db_bmap.dn_mapsize
+#define	db_nfree	db_bmap.dn_nfree
+#define	db_agfree	db_bmap.dn_agfree
+#define	db_agsize	db_bmap.dn_agsize
+#define	db_agl2size	db_bmap.dn_agl2size
+#define	db_agwidth	db_bmap.dn_agwidth
+#define	db_agheight	db_bmap.dn_agheight
+#define	db_agstart	db_bmap.dn_agstart
+#define	db_numag	db_bmap.dn_numag
+#define	db_maxlevel	db_bmap.dn_maxlevel
+#define	db_aglevel	db_bmap.dn_aglevel
+#define	db_agpref	db_bmap.dn_agpref
+#define	db_maxag	db_bmap.dn_maxag
+#define	db_maxfreebud	db_bmap.dn_maxfreebud
+#define	db_l2nbperpage	db_bmap.dn_l2nbperpage
+
+/*
+ * macros for various conversions needed by the allocators.
+ * blkstol2(), cntlz(), and cnttz() are operating system dependent functions.
+ */
+/* convert number of blocks to log2 number of blocks, rounding up to
+ * the next log2 value if blocks is not a l2 multiple.
+ */
+#define	BLKSTOL2(d)		(blkstol2(d))
+
+/* convert number of leafs to log2 leaf value */
+#define	NLSTOL2BSZ(n)		(31 - cntlz((n)) + BUDMIN)
+
+/* convert leaf index to log2 leaf value */
+#define	LITOL2BSZ(n,m,b)	((((n) == 0) ? (m) : cnttz((n))) + (b))
+
+/* convert a block number to a dmap control leaf index */
+#define BLKTOCTLLEAF(b,m)	\
+	(((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m))
+
+/* convert log2 leaf value to buddy size */
+#define	BUDSIZE(s,m)		(1 << ((s) - (m)))
+
+/*
+ *	external references.
+ */
+extern int dbMount(struct inode *ipbmap);
+
+extern int dbUnmount(struct inode *ipbmap, int mounterror);
+
+extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks);
+
+extern int dbUpdatePMap(struct inode *ipbmap,
+			int free, s64 blkno, s64 nblocks, struct tblock * tblk);
+
+extern int dbNextAG(struct inode *ipbmap);
+
+extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results);
+
+extern int dbReAlloc(struct inode *ipbmap,
+		     s64 blkno, s64 nblocks, s64 addnblocks, s64 * results);
+
+extern int dbSync(struct inode *ipbmap);
+extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks);
+extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks);
+extern void dbFinalizeBmap(struct inode *ipbmap);
+extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
+extern s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen);
+
+#endif				/* _H_JFS_DMAP */
diff --git a/kernel/fs/jfs/jfs_dtree.c b/kernel/fs/jfs/jfs_dtree.c
new file mode 100644
index 000000000..d88576e23
--- /dev/null
+++ b/kernel/fs/jfs/jfs_dtree.c
@@ -0,0 +1,4576 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ *	jfs_dtree.c: directory B+-tree manager
+ *
+ * B+-tree with variable length key directory:
+ *
+ * each directory page is structured as an array of 32-byte
+ * directory entry slots initialized as a freelist
+ * to avoid search/compaction of free space at insertion.
+ * when an entry is inserted, a number of slots are allocated
+ * from the freelist as required to store variable length data
+ * of the entry; when the entry is deleted, slots of the entry
+ * are returned to freelist.
+ *
+ * leaf entry stores full name as key and file serial number
+ * (aka inode number) as data.
+ * internal/router entry stores sufffix compressed name
+ * as key and simple extent descriptor as data.
+ *
+ * each directory page maintains a sorted entry index table
+ * which stores the start slot index of sorted entries
+ * to allow binary search on the table.
+ *
+ * directory starts as a root/leaf page in on-disk inode
+ * inline data area.
+ * when it becomes full, it starts a leaf of a external extent
+ * of length of 1 block. each time the first leaf becomes full,
+ * it is extended rather than split (its size is doubled),
+ * until its length becoms 4 KBytes, from then the extent is split
+ * with new 4 Kbyte extent when it becomes full
+ * to reduce external fragmentation of small directories.
+ *
+ * blah, blah, blah, for linear scan of directory in pieces by
+ * readdir().
+ *
+ *
+ *	case-insensitive directory file system
+ *
+ * names are stored in case-sensitive way in leaf entry.
+ * but stored, searched and compared in case-insensitive (uppercase) order
+ * (i.e., both search key and entry key are folded for search/compare):
+ * (note that case-sensitive order is BROKEN in storage, e.g.,
+ *  sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
+ *
+ *  entries which folds to the same key makes up a equivalent class
+ *  whose members are stored as contiguous cluster (may cross page boundary)
+ *  but whose order is arbitrary and acts as duplicate, e.g.,
+ *  abc, Abc, aBc, abC)
+ *
+ * once match is found at leaf, requires scan forward/backward
+ * either for, in case-insensitive search, duplicate
+ * or for, in case-sensitive search, for exact match
+ *
+ * router entry must be created/stored in case-insensitive way
+ * in internal entry:
+ * (right most key of left page and left most key of right page
+ * are folded, and its suffix compression is propagated as router
+ * key in parent)
+ * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB>
+ * should be made the router key for the split)
+ *
+ * case-insensitive search:
+ *
+ *	fold search key;
+ *
+ *	case-insensitive search of B-tree:
+ *	for internal entry, router key is already folded;
+ *	for leaf entry, fold the entry key before comparison.
+ *
+ *	if (leaf entry case-insensitive match found)
+ *		if (next entry satisfies case-insensitive match)
+ *			return EDUPLICATE;
+ *		if (prev entry satisfies case-insensitive match)
+ *			return EDUPLICATE;
+ *		return match;
+ *	else
+ *		return no match;
+ *
+ *	serialization:
+ * target directory inode lock is being held on entry/exit
+ * of all main directory service routines.
+ *
+ *	log based recovery:
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dmap.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+
+/* dtree split parameter */
+struct dtsplit {
+	struct metapage *mp;
+	s16 index;
+	s16 nslot;
+	struct component_name *key;
+	ddata_t *data;
+	struct pxdlist *pxdlist;
+};
+
+#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
+
+/* get page buffer for specified block address */
+#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)				\
+do {									\
+	BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot);	\
+	if (!(RC)) {							\
+		if (((P)->header.nextindex >				\
+		     (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
+		    ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) {	\
+			BT_PUTPAGE(MP);					\
+			jfs_error((IP)->i_sb,				\
+				  "DT_GETPAGE: dtree page corrupt\n");	\
+			MP = NULL;					\
+			RC = -EIO;					\
+		}							\
+	}								\
+} while (0)
+
+/* for consistency */
+#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
+
+#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
+	BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
+
+/*
+ * forward references
+ */
+static int dtSplitUp(tid_t tid, struct inode *ip,
+		     struct dtsplit * split, struct btstack * btstack);
+
+static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
+		       struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
+
+static int dtExtendPage(tid_t tid, struct inode *ip,
+			struct dtsplit * split, struct btstack * btstack);
+
+static int dtSplitRoot(tid_t tid, struct inode *ip,
+		       struct dtsplit * split, struct metapage ** rmpp);
+
+static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
+		      dtpage_t * fp, struct btstack * btstack);
+
+static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
+
+static int dtReadFirst(struct inode *ip, struct btstack * btstack);
+
+static int dtReadNext(struct inode *ip,
+		      loff_t * offset, struct btstack * btstack);
+
+static int dtCompare(struct component_name * key, dtpage_t * p, int si);
+
+static int ciCompare(struct component_name * key, dtpage_t * p, int si,
+		     int flag);
+
+static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
+		     int flag);
+
+static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
+			      int ri, struct component_name * key, int flag);
+
+static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
+			  ddata_t * data, struct dt_lock **);
+
+static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
+			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
+			int do_index);
+
+static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
+
+static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
+
+static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
+
+#define ciToUpper(c)	UniStrupr((c)->name)
+
+/*
+ *	read_index_page()
+ *
+ *	Reads a page of a directory's index table.
+ *	Having metadata mapped into the directory inode's address space
+ *	presents a multitude of problems.  We avoid this by mapping to
+ *	the absolute address space outside of the *_metapage routines
+ */
+static struct metapage *read_index_page(struct inode *inode, s64 blkno)
+{
+	int rc;
+	s64 xaddr;
+	int xflag;
+	s32 xlen;
+
+	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
+	if (rc || (xaddr == 0))
+		return NULL;
+
+	return read_metapage(inode, xaddr, PSIZE, 1);
+}
+
+/*
+ *	get_index_page()
+ *
+ *	Same as get_index_page(), but get's a new page without reading
+ */
+static struct metapage *get_index_page(struct inode *inode, s64 blkno)
+{
+	int rc;
+	s64 xaddr;
+	int xflag;
+	s32 xlen;
+
+	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
+	if (rc || (xaddr == 0))
+		return NULL;
+
+	return get_metapage(inode, xaddr, PSIZE, 1);
+}
+
+/*
+ *	find_index()
+ *
+ *	Returns dtree page containing directory table entry for specified
+ *	index and pointer to its entry.
+ *
+ *	mp must be released by caller.
+ */
+static struct dir_table_slot *find_index(struct inode *ip, u32 index,
+					 struct metapage ** mp, s64 *lblock)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	s64 blkno;
+	s64 offset;
+	int page_offset;
+	struct dir_table_slot *slot;
+	static int maxWarnings = 10;
+
+	if (index < 2) {
+		if (maxWarnings) {
+			jfs_warn("find_entry called with index = %d", index);
+			maxWarnings--;
+		}
+		return NULL;
+	}
+
+	if (index >= jfs_ip->next_index) {
+		jfs_warn("find_entry called with index >= next_index");
+		return NULL;
+	}
+
+	if (jfs_dirtable_inline(ip)) {
+		/*
+		 * Inline directory table
+		 */
+		*mp = NULL;
+		slot = &jfs_ip->i_dirtable[index - 2];
+	} else {
+		offset = (index - 2) * sizeof(struct dir_table_slot);
+		page_offset = offset & (PSIZE - 1);
+		blkno = ((offset + 1) >> L2PSIZE) <<
+		    JFS_SBI(ip->i_sb)->l2nbperpage;
+
+		if (*mp && (*lblock != blkno)) {
+			release_metapage(*mp);
+			*mp = NULL;
+		}
+		if (!(*mp)) {
+			*lblock = blkno;
+			*mp = read_index_page(ip, blkno);
+		}
+		if (!(*mp)) {
+			jfs_err("free_index: error reading directory table");
+			return NULL;
+		}
+
+		slot =
+		    (struct dir_table_slot *) ((char *) (*mp)->data +
+					       page_offset);
+	}
+	return slot;
+}
+
+static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
+			      u32 index)
+{
+	struct tlock *tlck;
+	struct linelock *llck;
+	struct lv *lv;
+
+	tlck = txLock(tid, ip, mp, tlckDATA);
+	llck = (struct linelock *) tlck->lock;
+
+	if (llck->index >= llck->maxcnt)
+		llck = txLinelock(llck);
+	lv = &llck->lv[llck->index];
+
+	/*
+	 *	Linelock slot size is twice the size of directory table
+	 *	slot size.  512 entries per page.
+	 */
+	lv->offset = ((index - 2) & 511) >> 1;
+	lv->length = 1;
+	llck->index++;
+}
+
+/*
+ *	add_index()
+ *
+ *	Adds an entry to the directory index table.  This is used to provide
+ *	each directory entry with a persistent index in which to resume
+ *	directory traversals
+ */
+static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
+{
+	struct super_block *sb = ip->i_sb;
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	u64 blkno;
+	struct dir_table_slot *dirtab_slot;
+	u32 index;
+	struct linelock *llck;
+	struct lv *lv;
+	struct metapage *mp;
+	s64 offset;
+	uint page_offset;
+	struct tlock *tlck;
+	s64 xaddr;
+
+	ASSERT(DO_INDEX(ip));
+
+	if (jfs_ip->next_index < 2) {
+		jfs_warn("add_index: next_index = %d.  Resetting!",
+			   jfs_ip->next_index);
+		jfs_ip->next_index = 2;
+	}
+
+	index = jfs_ip->next_index++;
+
+	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
+		/*
+		 * i_size reflects size of index table, or 8 bytes per entry.
+		 */
+		ip->i_size = (loff_t) (index - 1) << 3;
+
+		/*
+		 * dir table fits inline within inode
+		 */
+		dirtab_slot = &jfs_ip->i_dirtable[index-2];
+		dirtab_slot->flag = DIR_INDEX_VALID;
+		dirtab_slot->slot = slot;
+		DTSaddress(dirtab_slot, bn);
+
+		set_cflag(COMMIT_Dirtable, ip);
+
+		return index;
+	}
+	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
+		struct dir_table_slot temp_table[12];
+
+		/*
+		 * It's time to move the inline table to an external
+		 * page and begin to build the xtree
+		 */
+		if (dquot_alloc_block(ip, sbi->nbperpage))
+			goto clean_up;
+		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
+			dquot_free_block(ip, sbi->nbperpage);
+			goto clean_up;
+		}
+
+		/*
+		 * Save the table, we're going to overwrite it with the
+		 * xtree root
+		 */
+		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
+
+		/*
+		 * Initialize empty x-tree
+		 */
+		xtInitRoot(tid, ip);
+
+		/*
+		 * Add the first block to the xtree
+		 */
+		if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
+			/* This really shouldn't fail */
+			jfs_warn("add_index: xtInsert failed!");
+			memcpy(&jfs_ip->i_dirtable, temp_table,
+			       sizeof (temp_table));
+			dbFree(ip, xaddr, sbi->nbperpage);
+			dquot_free_block(ip, sbi->nbperpage);
+			goto clean_up;
+		}
+		ip->i_size = PSIZE;
+
+		mp = get_index_page(ip, 0);
+		if (!mp) {
+			jfs_err("add_index: get_metapage failed!");
+			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+			memcpy(&jfs_ip->i_dirtable, temp_table,
+			       sizeof (temp_table));
+			goto clean_up;
+		}
+		tlck = txLock(tid, ip, mp, tlckDATA);
+		llck = (struct linelock *) & tlck->lock;
+		ASSERT(llck->index == 0);
+		lv = &llck->lv[0];
+
+		lv->offset = 0;
+		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
+		llck->index++;
+
+		memcpy(mp->data, temp_table, sizeof(temp_table));
+
+		mark_metapage_dirty(mp);
+		release_metapage(mp);
+
+		/*
+		 * Logging is now directed by xtree tlocks
+		 */
+		clear_cflag(COMMIT_Dirtable, ip);
+	}
+
+	offset = (index - 2) * sizeof(struct dir_table_slot);
+	page_offset = offset & (PSIZE - 1);
+	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
+	if (page_offset == 0) {
+		/*
+		 * This will be the beginning of a new page
+		 */
+		xaddr = 0;
+		if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
+			jfs_warn("add_index: xtInsert failed!");
+			goto clean_up;
+		}
+		ip->i_size += PSIZE;
+
+		if ((mp = get_index_page(ip, blkno)))
+			memset(mp->data, 0, PSIZE);	/* Just looks better */
+		else
+			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
+	} else
+		mp = read_index_page(ip, blkno);
+
+	if (!mp) {
+		jfs_err("add_index: get/read_metapage failed!");
+		goto clean_up;
+	}
+
+	lock_index(tid, ip, mp, index);
+
+	dirtab_slot =
+	    (struct dir_table_slot *) ((char *) mp->data + page_offset);
+	dirtab_slot->flag = DIR_INDEX_VALID;
+	dirtab_slot->slot = slot;
+	DTSaddress(dirtab_slot, bn);
+
+	mark_metapage_dirty(mp);
+	release_metapage(mp);
+
+	return index;
+
+      clean_up:
+
+	jfs_ip->next_index--;
+
+	return 0;
+}
+
+/*
+ *	free_index()
+ *
+ *	Marks an entry to the directory index table as free.
+ */
+static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
+{
+	struct dir_table_slot *dirtab_slot;
+	s64 lblock;
+	struct metapage *mp = NULL;
+
+	dirtab_slot = find_index(ip, index, &mp, &lblock);
+
+	if (!dirtab_slot)
+		return;
+
+	dirtab_slot->flag = DIR_INDEX_FREE;
+	dirtab_slot->slot = dirtab_slot->addr1 = 0;
+	dirtab_slot->addr2 = cpu_to_le32(next);
+
+	if (mp) {
+		lock_index(tid, ip, mp, index);
+		mark_metapage_dirty(mp);
+		release_metapage(mp);
+	} else
+		set_cflag(COMMIT_Dirtable, ip);
+}
+
+/*
+ *	modify_index()
+ *
+ *	Changes an entry in the directory index table
+ */
+static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
+			 int slot, struct metapage ** mp, s64 *lblock)
+{
+	struct dir_table_slot *dirtab_slot;
+
+	dirtab_slot = find_index(ip, index, mp, lblock);
+
+	if (!dirtab_slot)
+		return;
+
+	DTSaddress(dirtab_slot, bn);
+	dirtab_slot->slot = slot;
+
+	if (*mp) {
+		lock_index(tid, ip, *mp, index);
+		mark_metapage_dirty(*mp);
+	} else
+		set_cflag(COMMIT_Dirtable, ip);
+}
+
+/*
+ *	read_index()
+ *
+ *	reads a directory table slot
+ */
+static int read_index(struct inode *ip, u32 index,
+		     struct dir_table_slot * dirtab_slot)
+{
+	s64 lblock;
+	struct metapage *mp = NULL;
+	struct dir_table_slot *slot;
+
+	slot = find_index(ip, index, &mp, &lblock);
+	if (!slot) {
+		return -EIO;
+	}
+
+	memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot));
+
+	if (mp)
+		release_metapage(mp);
+
+	return 0;
+}
+
+/*
+ *	dtSearch()
+ *
+ * function:
+ *	Search for the entry with specified key
+ *
+ * parameter:
+ *
+ * return: 0 - search result on stack, leaf page pinned;
+ *	   errno - I/O error
+ */
+int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
+	     struct btstack * btstack, int flag)
+{
+	int rc = 0;
+	int cmp = 1;		/* init for empty page */
+	s64 bn;
+	struct metapage *mp;
+	dtpage_t *p;
+	s8 *stbl;
+	int base, index, lim;
+	struct btframe *btsp;
+	pxd_t *pxd;
+	int psize = 288;	/* initial in-line directory */
+	ino_t inumber;
+	struct component_name ciKey;
+	struct super_block *sb = ip->i_sb;
+
+	ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
+	if (!ciKey.name) {
+		rc = -ENOMEM;
+		goto dtSearch_Exit2;
+	}
+
+
+	/* uppercase search key for c-i directory */
+	UniStrcpy(ciKey.name, key->name);
+	ciKey.namlen = key->namlen;
+
+	/* only uppercase if case-insensitive support is on */
+	if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
+		ciToUpper(&ciKey);
+	}
+	BT_CLR(btstack);	/* reset stack */
+
+	/* init level count for max pages to split */
+	btstack->nsplit = 1;
+
+	/*
+	 *	search down tree from root:
+	 *
+	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+	 *
+	 * if entry with search key K is not found
+	 * internal page search find the entry with largest key Ki
+	 * less than K which point to the child page to search;
+	 * leaf page search find the entry with smallest key Kj
+	 * greater than K so that the returned index is the position of
+	 * the entry to be shifted right for insertion of new entry.
+	 * for empty tree, search key is greater than any key of the tree.
+	 *
+	 * by convention, root bn = 0.
+	 */
+	for (bn = 0;;) {
+		/* get/pin the page to search */
+		DT_GETPAGE(ip, bn, mp, psize, p, rc);
+		if (rc)
+			goto dtSearch_Exit1;
+
+		/* get sorted entry table of the page */
+		stbl = DT_GETSTBL(p);
+
+		/*
+		 * binary search with search key K on the current page.
+		 */
+		for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
+			index = base + (lim >> 1);
+
+			if (p->header.flag & BT_LEAF) {
+				/* uppercase leaf name to compare */
+				cmp =
+				    ciCompare(&ciKey, p, stbl[index],
+					      JFS_SBI(sb)->mntflag);
+			} else {
+				/* router key is in uppercase */
+
+				cmp = dtCompare(&ciKey, p, stbl[index]);
+
+
+			}
+			if (cmp == 0) {
+				/*
+				 *	search hit
+				 */
+				/* search hit - leaf page:
+				 * return the entry found
+				 */
+				if (p->header.flag & BT_LEAF) {
+					inumber = le32_to_cpu(
+			((struct ldtentry *) & p->slot[stbl[index]])->inumber);
+
+					/*
+					 * search for JFS_LOOKUP
+					 */
+					if (flag == JFS_LOOKUP) {
+						*data = inumber;
+						rc = 0;
+						goto out;
+					}
+
+					/*
+					 * search for JFS_CREATE
+					 */
+					if (flag == JFS_CREATE) {
+						*data = inumber;
+						rc = -EEXIST;
+						goto out;
+					}
+
+					/*
+					 * search for JFS_REMOVE or JFS_RENAME
+					 */
+					if ((flag == JFS_REMOVE ||
+					     flag == JFS_RENAME) &&
+					    *data != inumber) {
+						rc = -ESTALE;
+						goto out;
+					}
+
+					/*
+					 * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
+					 */
+					/* save search result */
+					*data = inumber;
+					btsp = btstack->top;
+					btsp->bn = bn;
+					btsp->index = index;
+					btsp->mp = mp;
+
+					rc = 0;
+					goto dtSearch_Exit1;
+				}
+
+				/* search hit - internal page:
+				 * descend/search its child page
+				 */
+				goto getChild;
+			}
+
+			if (cmp > 0) {
+				base = index + 1;
+				--lim;
+			}
+		}
+
+		/*
+		 *	search miss
+		 *
+		 * base is the smallest index with key (Kj) greater than
+		 * search key (K) and may be zero or (maxindex + 1) index.
+		 */
+		/*
+		 * search miss - leaf page
+		 *
+		 * return location of entry (base) where new entry with
+		 * search key K is to be inserted.
+		 */
+		if (p->header.flag & BT_LEAF) {
+			/*
+			 * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
+			 */
+			if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
+			    flag == JFS_RENAME) {
+				rc = -ENOENT;
+				goto out;
+			}
+
+			/*
+			 * search for JFS_CREATE|JFS_FINDDIR:
+			 *
+			 * save search result
+			 */
+			*data = 0;
+			btsp = btstack->top;
+			btsp->bn = bn;
+			btsp->index = base;
+			btsp->mp = mp;
+
+			rc = 0;
+			goto dtSearch_Exit1;
+		}
+
+		/*
+		 * search miss - internal page
+		 *
+		 * if base is non-zero, decrement base by one to get the parent
+		 * entry of the child page to search.
+		 */
+		index = base ? base - 1 : base;
+
+		/*
+		 * go down to child page
+		 */
+	      getChild:
+		/* update max. number of pages to split */
+		if (BT_STACK_FULL(btstack)) {
+			/* Something's corrupted, mark filesystem dirty so
+			 * chkdsk will fix it.
+			 */
+			jfs_error(sb, "stack overrun!\n");
+			BT_STACK_DUMP(btstack);
+			rc = -EIO;
+			goto out;
+		}
+		btstack->nsplit++;
+
+		/* push (bn, index) of the parent page/entry */
+		BT_PUSH(btstack, bn, index);
+
+		/* get the child page block number */
+		pxd = (pxd_t *) & p->slot[stbl[index]];
+		bn = addressPXD(pxd);
+		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
+
+		/* unpin the parent page */
+		DT_PUTPAGE(mp);
+	}
+
+      out:
+	DT_PUTPAGE(mp);
+
+      dtSearch_Exit1:
+
+	kfree(ciKey.name);
+
+      dtSearch_Exit2:
+
+	return rc;
+}
+
+
+/*
+ *	dtInsert()
+ *
+ * function: insert an entry to directory tree
+ *
+ * parameter:
+ *
+ * return: 0 - success;
+ *	   errno - failure;
+ */
+int dtInsert(tid_t tid, struct inode *ip,
+	 struct component_name * name, ino_t * fsn, struct btstack * btstack)
+{
+	int rc = 0;
+	struct metapage *mp;	/* meta-page buffer */
+	dtpage_t *p;		/* base B+-tree index page */
+	s64 bn;
+	int index;
+	struct dtsplit split;	/* split information */
+	ddata_t data;
+	struct dt_lock *dtlck;
+	int n;
+	struct tlock *tlck;
+	struct lv *lv;
+
+	/*
+	 *	retrieve search result
+	 *
+	 * dtSearch() returns (leaf page pinned, index at which to insert).
+	 * n.b. dtSearch() may return index of (maxindex + 1) of
+	 * the full page.
+	 */
+	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
+
+	/*
+	 *	insert entry for new key
+	 */
+	if (DO_INDEX(ip)) {
+		if (JFS_IP(ip)->next_index == DIREND) {
+			DT_PUTPAGE(mp);
+			return -EMLINK;
+		}
+		n = NDTLEAF(name->namlen);
+		data.leaf.tid = tid;
+		data.leaf.ip = ip;
+	} else {
+		n = NDTLEAF_LEGACY(name->namlen);
+		data.leaf.ip = NULL;	/* signifies legacy directory format */
+	}
+	data.leaf.ino = *fsn;
+
+	/*
+	 *	leaf page does not have enough room for new entry:
+	 *
+	 *	extend/split the leaf page;
+	 *
+	 * dtSplitUp() will insert the entry and unpin the leaf page.
+	 */
+	if (n > p->header.freecnt) {
+		split.mp = mp;
+		split.index = index;
+		split.nslot = n;
+		split.key = name;
+		split.data = &data;
+		rc = dtSplitUp(tid, ip, &split, btstack);
+		return rc;
+	}
+
+	/*
+	 *	leaf page does have enough room for new entry:
+	 *
+	 *	insert the new data entry into the leaf page;
+	 */
+	BT_MARK_DIRTY(mp, ip);
+	/*
+	 * acquire a transaction lock on the leaf page
+	 */
+	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+	dtlck = (struct dt_lock *) & tlck->lock;
+	ASSERT(dtlck->index == 0);
+	lv = & dtlck->lv[0];
+
+	/* linelock header */
+	lv->offset = 0;
+	lv->length = 1;
+	dtlck->index++;
+
+	dtInsertEntry(p, index, name, &data, &dtlck);
+
+	/* linelock stbl of non-root leaf page */
+	if (!(p->header.flag & BT_ROOT)) {
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (struct dt_lock *) txLinelock(dtlck);
+		lv = & dtlck->lv[dtlck->index];
+		n = index >> L2DTSLOTSIZE;
+		lv->offset = p->header.stblindex + n;
+		lv->length =
+		    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
+		dtlck->index++;
+	}
+
+	/* unpin the leaf page */
+	DT_PUTPAGE(mp);
+
+	return 0;
+}
+
+
+/*
+ *	dtSplitUp()
+ *
+ * function: propagate insertion bottom up;
+ *
+ * parameter:
+ *
+ * return: 0 - success;
+ *	   errno - failure;
+ *	leaf page unpinned;
+ */
+static int dtSplitUp(tid_t tid,
+	  struct inode *ip, struct dtsplit * split, struct btstack * btstack)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	int rc = 0;
+	struct metapage *smp;
+	dtpage_t *sp;		/* split page */
+	struct metapage *rmp;
+	dtpage_t *rp;		/* new right page split from sp */
+	pxd_t rpxd;		/* new right page extent descriptor */
+	struct metapage *lmp;
+	dtpage_t *lp;		/* left child page */
+	int skip;		/* index of entry of insertion */
+	struct btframe *parent;	/* parent page entry on traverse stack */
+	s64 xaddr, nxaddr;
+	int xlen, xsize;
+	struct pxdlist pxdlist;
+	pxd_t *pxd;
+	struct component_name key = { 0, NULL };
+	ddata_t *data = split->data;
+	int n;
+	struct dt_lock *dtlck;
+	struct tlock *tlck;
+	struct lv *lv;
+	int quota_allocation = 0;
+
+	/* get split page */
+	smp = split->mp;
+	sp = DT_PAGE(ip, smp);
+
+	key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
+	if (!key.name) {
+		DT_PUTPAGE(smp);
+		rc = -ENOMEM;
+		goto dtSplitUp_Exit;
+	}
+
+	/*
+	 *	split leaf page
+	 *
+	 * The split routines insert the new entry, and
+	 * acquire txLock as appropriate.
+	 */
+	/*
+	 *	split root leaf page:
+	 */
+	if (sp->header.flag & BT_ROOT) {
+		/*
+		 * allocate a single extent child page
+		 */
+		xlen = 1;
+		n = sbi->bsize >> L2DTSLOTSIZE;
+		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
+		n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
+		if (n <= split->nslot)
+			xlen++;
+		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
+			DT_PUTPAGE(smp);
+			goto freeKeyName;
+		}
+
+		pxdlist.maxnpxd = 1;
+		pxdlist.npxd = 0;
+		pxd = &pxdlist.pxd[0];
+		PXDaddress(pxd, xaddr);
+		PXDlength(pxd, xlen);
+		split->pxdlist = &pxdlist;
+		rc = dtSplitRoot(tid, ip, split, &rmp);
+
+		if (rc)
+			dbFree(ip, xaddr, xlen);
+		else
+			DT_PUTPAGE(rmp);
+
+		DT_PUTPAGE(smp);
+
+		if (!DO_INDEX(ip))
+			ip->i_size = xlen << sbi->l2bsize;
+
+		goto freeKeyName;
+	}
+
+	/*
+	 *	extend first leaf page
+	 *
+	 * extend the 1st extent if less than buffer page size
+	 * (dtExtendPage() reurns leaf page unpinned)
+	 */
+	pxd = &sp->header.self;
+	xlen = lengthPXD(pxd);
+	xsize = xlen << sbi->l2bsize;
+	if (xsize < PSIZE) {
+		xaddr = addressPXD(pxd);
+		n = xsize >> L2DTSLOTSIZE;
+		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
+		if ((n + sp->header.freecnt) <= split->nslot)
+			n = xlen + (xlen << 1);
+		else
+			n = xlen;
+
+		/* Allocate blocks to quota. */
+		rc = dquot_alloc_block(ip, n);
+		if (rc)
+			goto extendOut;
+		quota_allocation += n;
+
+		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
+				    (s64) n, &nxaddr)))
+			goto extendOut;
+
+		pxdlist.maxnpxd = 1;
+		pxdlist.npxd = 0;
+		pxd = &pxdlist.pxd[0];
+		PXDaddress(pxd, nxaddr);
+		PXDlength(pxd, xlen + n);
+		split->pxdlist = &pxdlist;
+		if ((rc = dtExtendPage(tid, ip, split, btstack))) {
+			nxaddr = addressPXD(pxd);
+			if (xaddr != nxaddr) {
+				/* free relocated extent */
+				xlen = lengthPXD(pxd);
+				dbFree(ip, nxaddr, (s64) xlen);
+			} else {
+				/* free extended delta */
+				xlen = lengthPXD(pxd) - n;
+				xaddr = addressPXD(pxd) + xlen;
+				dbFree(ip, xaddr, (s64) n);
+			}
+		} else if (!DO_INDEX(ip))
+			ip->i_size = lengthPXD(pxd) << sbi->l2bsize;
+
+
+	      extendOut:
+		DT_PUTPAGE(smp);
+		goto freeKeyName;
+	}
+
+	/*
+	 *	split leaf page <sp> into <sp> and a new right page <rp>.
+	 *
+	 * return <rp> pinned and its extent descriptor <rpxd>
+	 */
+	/*
+	 * allocate new directory page extent and
+	 * new index page(s) to cover page split(s)
+	 *
+	 * allocation hint: ?
+	 */
+	n = btstack->nsplit;
+	pxdlist.maxnpxd = pxdlist.npxd = 0;
+	xlen = sbi->nbperpage;
+	for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
+		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
+			PXDaddress(pxd, xaddr);
+			PXDlength(pxd, xlen);
+			pxdlist.maxnpxd++;
+			continue;
+		}
+
+		DT_PUTPAGE(smp);
+
+		/* undo allocation */
+		goto splitOut;
+	}
+
+	split->pxdlist = &pxdlist;
+	if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
+		DT_PUTPAGE(smp);
+
+		/* undo allocation */
+		goto splitOut;
+	}
+
+	if (!DO_INDEX(ip))
+		ip->i_size += PSIZE;
+
+	/*
+	 * propagate up the router entry for the leaf page just split
+	 *
+	 * insert a router entry for the new page into the parent page,
+	 * propagate the insert/split up the tree by walking back the stack
+	 * of (bn of parent page, index of child page entry in parent page)
+	 * that were traversed during the search for the page that split.
+	 *
+	 * the propagation of insert/split up the tree stops if the root
+	 * splits or the page inserted into doesn't have to split to hold
+	 * the new entry.
+	 *
+	 * the parent entry for the split page remains the same, and
+	 * a new entry is inserted at its right with the first key and
+	 * block number of the new right page.
+	 *
+	 * There are a maximum of 4 pages pinned at any time:
+	 * two children, left parent and right parent (when the parent splits).
+	 * keep the child pages pinned while working on the parent.
+	 * make sure that all pins are released at exit.
+	 */
+	while ((parent = BT_POP(btstack)) != NULL) {
+		/* parent page specified by stack frame <parent> */
+
+		/* keep current child pages (<lp>, <rp>) pinned */
+		lmp = smp;
+		lp = sp;
+
+		/*
+		 * insert router entry in parent for new right child page <rp>
+		 */
+		/* get the parent page <sp> */
+		DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
+		if (rc) {
+			DT_PUTPAGE(lmp);
+			DT_PUTPAGE(rmp);
+			goto splitOut;
+		}
+
+		/*
+		 * The new key entry goes ONE AFTER the index of parent entry,
+		 * because the split was to the right.
+		 */
+		skip = parent->index + 1;
+
+		/*
+		 * compute the key for the router entry
+		 *
+		 * key suffix compression:
+		 * for internal pages that have leaf pages as children,
+		 * retain only what's needed to distinguish between
+		 * the new entry and the entry on the page to its left.
+		 * If the keys compare equal, retain the entire key.
+		 *
+		 * note that compression is performed only at computing
+		 * router key at the lowest internal level.
+		 * further compression of the key between pairs of higher
+		 * level internal pages loses too much information and
+		 * the search may fail.
+		 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
+		 * results in two adjacent parent entries (a)(xx).
+		 * if split occurs between these two entries, and
+		 * if compression is applied, the router key of parent entry
+		 * of right page (x) will divert search for x into right
+		 * subtree and miss x in the left subtree.)
+		 *
+		 * the entire key must be retained for the next-to-leftmost
+		 * internal key at any level of the tree, or search may fail
+		 * (e.g., ?)
+		 */
+		switch (rp->header.flag & BT_TYPE) {
+		case BT_LEAF:
+			/*
+			 * compute the length of prefix for suffix compression
+			 * between last entry of left page and first entry
+			 * of right page
+			 */
+			if ((sp->header.flag & BT_ROOT && skip > 1) ||
+			    sp->header.prev != 0 || skip > 1) {
+				/* compute uppercase router prefix key */
+				rc = ciGetLeafPrefixKey(lp,
+							lp->header.nextindex-1,
+							rp, 0, &key,
+							sbi->mntflag);
+				if (rc) {
+					DT_PUTPAGE(lmp);
+					DT_PUTPAGE(rmp);
+					DT_PUTPAGE(smp);
+					goto splitOut;
+				}
+			} else {
+				/* next to leftmost entry of
+				   lowest internal level */
+
+				/* compute uppercase router key */
+				dtGetKey(rp, 0, &key, sbi->mntflag);
+				key.name[key.namlen] = 0;
+
+				if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
+					ciToUpper(&key);
+			}
+
+			n = NDTINTERNAL(key.namlen);
+			break;
+
+		case BT_INTERNAL:
+			dtGetKey(rp, 0, &key, sbi->mntflag);
+			n = NDTINTERNAL(key.namlen);
+			break;
+
+		default:
+			jfs_err("dtSplitUp(): UFO!");
+			break;
+		}
+
+		/* unpin left child page */
+		DT_PUTPAGE(lmp);
+
+		/*
+		 * compute the data for the router entry
+		 */
+		data->xd = rpxd;	/* child page xd */
+
+		/*
+		 * parent page is full - split the parent page
+		 */
+		if (n > sp->header.freecnt) {
+			/* init for parent page split */
+			split->mp = smp;
+			split->index = skip;	/* index at insert */
+			split->nslot = n;
+			split->key = &key;
+			/* split->data = data; */
+
+			/* unpin right child page */
+			DT_PUTPAGE(rmp);
+
+			/* The split routines insert the new entry,
+			 * acquire txLock as appropriate.
+			 * return <rp> pinned and its block number <rbn>.
+			 */
+			rc = (sp->header.flag & BT_ROOT) ?
+			    dtSplitRoot(tid, ip, split, &rmp) :
+			    dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
+			if (rc) {
+				DT_PUTPAGE(smp);
+				goto splitOut;
+			}
+
+			/* smp and rmp are pinned */
+		}
+		/*
+		 * parent page is not full - insert router entry in parent page
+		 */
+		else {
+			BT_MARK_DIRTY(smp, ip);
+			/*
+			 * acquire a transaction lock on the parent page
+			 */
+			tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
+			dtlck = (struct dt_lock *) & tlck->lock;
+			ASSERT(dtlck->index == 0);
+			lv = & dtlck->lv[0];
+
+			/* linelock header */
+			lv->offset = 0;
+			lv->length = 1;
+			dtlck->index++;
+
+			/* linelock stbl of non-root parent page */
+			if (!(sp->header.flag & BT_ROOT)) {
+				lv++;
+				n = skip >> L2DTSLOTSIZE;
+				lv->offset = sp->header.stblindex + n;
+				lv->length =
+				    ((sp->header.nextindex -
+				      1) >> L2DTSLOTSIZE) - n + 1;
+				dtlck->index++;
+			}
+
+			dtInsertEntry(sp, skip, &key, data, &dtlck);
+
+			/* exit propagate up */
+			break;
+		}
+	}
+
+	/* unpin current split and its right page */
+	DT_PUTPAGE(smp);
+	DT_PUTPAGE(rmp);
+
+	/*
+	 * free remaining extents allocated for split
+	 */
+      splitOut:
+	n = pxdlist.npxd;
+	pxd = &pxdlist.pxd[n];
+	for (; n < pxdlist.maxnpxd; n++, pxd++)
+		dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
+
+      freeKeyName:
+	kfree(key.name);
+
+	/* Rollback quota allocation */
+	if (rc && quota_allocation)
+		dquot_free_block(ip, quota_allocation);
+
+      dtSplitUp_Exit:
+
+	return rc;
+}
+
+
+/*
+ *	dtSplitPage()
+ *
+ * function: Split a non-root page of a btree.
+ *
+ * parameter:
+ *
+ * return: 0 - success;
+ *	   errno - failure;
+ *	return split and new page pinned;
+ */
+static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
+	    struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
+{
+	int rc = 0;
+	struct metapage *smp;
+	dtpage_t *sp;
+	struct metapage *rmp;
+	dtpage_t *rp;		/* new right page allocated */
+	s64 rbn;		/* new right page block number */
+	struct metapage *mp;
+	dtpage_t *p;
+	s64 nextbn;
+	struct pxdlist *pxdlist;
+	pxd_t *pxd;
+	int skip, nextindex, half, left, nxt, off, si;
+	struct ldtentry *ldtentry;
+	struct idtentry *idtentry;
+	u8 *stbl;
+	struct dtslot *f;
+	int fsi, stblsize;
+	int n;
+	struct dt_lock *sdtlck, *rdtlck;
+	struct tlock *tlck;
+	struct dt_lock *dtlck;
+	struct lv *slv, *rlv, *lv;
+
+	/* get split page */
+	smp = split->mp;
+	sp = DT_PAGE(ip, smp);
+
+	/*
+	 * allocate the new right page for the split
+	 */
+	pxdlist = split->pxdlist;
+	pxd = &pxdlist->pxd[pxdlist->npxd];
+	pxdlist->npxd++;
+	rbn = addressPXD(pxd);
+	rmp = get_metapage(ip, rbn, PSIZE, 1);
+	if (rmp == NULL)
+		return -EIO;
+
+	/* Allocate blocks to quota. */
+	rc = dquot_alloc_block(ip, lengthPXD(pxd));
+	if (rc) {
+		release_metapage(rmp);
+		return rc;
+	}
+
+	jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
+
+	BT_MARK_DIRTY(rmp, ip);
+	/*
+	 * acquire a transaction lock on the new right page
+	 */
+	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
+	rdtlck = (struct dt_lock *) & tlck->lock;
+
+	rp = (dtpage_t *) rmp->data;
+	*rpp = rp;
+	rp->header.self = *pxd;
+
+	BT_MARK_DIRTY(smp, ip);
+	/*
+	 * acquire a transaction lock on the split page
+	 *
+	 * action:
+	 */
+	tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
+	sdtlck = (struct dt_lock *) & tlck->lock;
+
+	/* linelock header of split page */
+	ASSERT(sdtlck->index == 0);
+	slv = & sdtlck->lv[0];
+	slv->offset = 0;
+	slv->length = 1;
+	sdtlck->index++;
+
+	/*
+	 * initialize/update sibling pointers between sp and rp
+	 */
+	nextbn = le64_to_cpu(sp->header.next);
+	rp->header.next = cpu_to_le64(nextbn);
+	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
+	sp->header.next = cpu_to_le64(rbn);
+
+	/*
+	 * initialize new right page
+	 */
+	rp->header.flag = sp->header.flag;
+
+	/* compute sorted entry table at start of extent data area */
+	rp->header.nextindex = 0;
+	rp->header.stblindex = 1;
+
+	n = PSIZE >> L2DTSLOTSIZE;
+	rp->header.maxslot = n;
+	stblsize = (n + 31) >> L2DTSLOTSIZE;	/* in unit of slot */
+
+	/* init freelist */
+	fsi = rp->header.stblindex + stblsize;
+	rp->header.freelist = fsi;
+	rp->header.freecnt = rp->header.maxslot - fsi;
+
+	/*
+	 *	sequential append at tail: append without split
+	 *
+	 * If splitting the last page on a level because of appending
+	 * a entry to it (skip is maxentry), it's likely that the access is
+	 * sequential. Adding an empty page on the side of the level is less
+	 * work and can push the fill factor much higher than normal.
+	 * If we're wrong it's no big deal, we'll just do the split the right
+	 * way next time.
+	 * (It may look like it's equally easy to do a similar hack for
+	 * reverse sorted data, that is, split the tree left,
+	 * but it's not. Be my guest.)
+	 */
+	if (nextbn == 0 && split->index == sp->header.nextindex) {
+		/* linelock header + stbl (first slot) of new page */
+		rlv = & rdtlck->lv[rdtlck->index];
+		rlv->offset = 0;
+		rlv->length = 2;
+		rdtlck->index++;
+
+		/*
+		 * initialize freelist of new right page
+		 */
+		f = &rp->slot[fsi];
+		for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+			f->next = fsi;
+		f->next = -1;
+
+		/* insert entry at the first entry of the new right page */
+		dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
+
+		goto out;
+	}
+
+	/*
+	 *	non-sequential insert (at possibly middle page)
+	 */
+
+	/*
+	 * update prev pointer of previous right sibling page;
+	 */
+	if (nextbn != 0) {
+		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+		if (rc) {
+			discard_metapage(rmp);
+			return rc;
+		}
+
+		BT_MARK_DIRTY(mp, ip);
+		/*
+		 * acquire a transaction lock on the next page
+		 */
+		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
+		jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p",
+			tlck, ip, mp);
+		dtlck = (struct dt_lock *) & tlck->lock;
+
+		/* linelock header of previous right sibling page */
+		lv = & dtlck->lv[dtlck->index];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		p->header.prev = cpu_to_le64(rbn);
+
+		DT_PUTPAGE(mp);
+	}
+
+	/*
+	 * split the data between the split and right pages.
+	 */
+	skip = split->index;
+	half = (PSIZE >> L2DTSLOTSIZE) >> 1;	/* swag */
+	left = 0;
+
+	/*
+	 *	compute fill factor for split pages
+	 *
+	 * <nxt> traces the next entry to move to rp
+	 * <off> traces the next entry to stay in sp
+	 */
+	stbl = (u8 *) & sp->slot[sp->header.stblindex];
+	nextindex = sp->header.nextindex;
+	for (nxt = off = 0; nxt < nextindex; ++off) {
+		if (off == skip)
+			/* check for fill factor with new entry size */
+			n = split->nslot;
+		else {
+			si = stbl[nxt];
+			switch (sp->header.flag & BT_TYPE) {
+			case BT_LEAF:
+				ldtentry = (struct ldtentry *) & sp->slot[si];
+				if (DO_INDEX(ip))
+					n = NDTLEAF(ldtentry->namlen);
+				else
+					n = NDTLEAF_LEGACY(ldtentry->
+							   namlen);
+				break;
+
+			case BT_INTERNAL:
+				idtentry = (struct idtentry *) & sp->slot[si];
+				n = NDTINTERNAL(idtentry->namlen);
+				break;
+
+			default:
+				break;
+			}
+
+			++nxt;	/* advance to next entry to move in sp */
+		}
+
+		left += n;
+		if (left >= half)
+			break;
+	}
+
+	/* <nxt> poins to the 1st entry to move */
+
+	/*
+	 *	move entries to right page
+	 *
+	 * dtMoveEntry() initializes rp and reserves entry for insertion
+	 *
+	 * split page moved out entries are linelocked;
+	 * new/right page moved in entries are linelocked;
+	 */
+	/* linelock header + stbl of new right page */
+	rlv = & rdtlck->lv[rdtlck->index];
+	rlv->offset = 0;
+	rlv->length = 5;
+	rdtlck->index++;
+
+	dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
+
+	sp->header.nextindex = nxt;
+
+	/*
+	 * finalize freelist of new right page
+	 */
+	fsi = rp->header.freelist;
+	f = &rp->slot[fsi];
+	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+		f->next = fsi;
+	f->next = -1;
+
+	/*
+	 * Update directory index table for entries now in right page
+	 */
+	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
+		s64 lblock;
+
+		mp = NULL;
+		stbl = DT_GETSTBL(rp);
+		for (n = 0; n < rp->header.nextindex; n++) {
+			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
+			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
+				     rbn, n, &mp, &lblock);
+		}
+		if (mp)
+			release_metapage(mp);
+	}
+
+	/*
+	 * the skipped index was on the left page,
+	 */
+	if (skip <= off) {
+		/* insert the new entry in the split page */
+		dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
+
+		/* linelock stbl of split page */
+		if (sdtlck->index >= sdtlck->maxcnt)
+			sdtlck = (struct dt_lock *) txLinelock(sdtlck);
+		slv = & sdtlck->lv[sdtlck->index];
+		n = skip >> L2DTSLOTSIZE;
+		slv->offset = sp->header.stblindex + n;
+		slv->length =
+		    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
+		sdtlck->index++;
+	}
+	/*
+	 * the skipped index was on the right page,
+	 */
+	else {
+		/* adjust the skip index to reflect the new position */
+		skip -= nxt;
+
+		/* insert the new entry in the right page */
+		dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
+	}
+
+      out:
+	*rmpp = rmp;
+	*rpxdp = *pxd;
+
+	return rc;
+}
+
+
+/*
+ *	dtExtendPage()
+ *
+ * function: extend 1st/only directory leaf page
+ *
+ * parameter:
+ *
+ * return: 0 - success;
+ *	   errno - failure;
+ *	return extended page pinned;
+ */
+static int dtExtendPage(tid_t tid,
+	     struct inode *ip, struct dtsplit * split, struct btstack * btstack)
+{
+	struct super_block *sb = ip->i_sb;
+	int rc;
+	struct metapage *smp, *pmp, *mp;
+	dtpage_t *sp, *pp;
+	struct pxdlist *pxdlist;
+	pxd_t *pxd, *tpxd;
+	int xlen, xsize;
+	int newstblindex, newstblsize;
+	int oldstblindex, oldstblsize;
+	int fsi, last;
+	struct dtslot *f;
+	struct btframe *parent;
+	int n;
+	struct dt_lock *dtlck;
+	s64 xaddr, txaddr;
+	struct tlock *tlck;
+	struct pxd_lock *pxdlock;
+	struct lv *lv;
+	uint type;
+	struct ldtentry *ldtentry;
+	u8 *stbl;
+
+	/* get page to extend */
+	smp = split->mp;
+	sp = DT_PAGE(ip, smp);
+
+	/* get parent/root page */
+	parent = BT_POP(btstack);
+	DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
+	if (rc)
+		return (rc);
+
+	/*
+	 *	extend the extent
+	 */
+	pxdlist = split->pxdlist;
+	pxd = &pxdlist->pxd[pxdlist->npxd];
+	pxdlist->npxd++;
+
+	xaddr = addressPXD(pxd);
+	tpxd = &sp->header.self;
+	txaddr = addressPXD(tpxd);
+	/* in-place extension */
+	if (xaddr == txaddr) {
+		type = tlckEXTEND;
+	}
+	/* relocation */
+	else {
+		type = tlckNEW;
+
+		/* save moved extent descriptor for later free */
+		tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
+		pxdlock = (struct pxd_lock *) & tlck->lock;
+		pxdlock->flag = mlckFREEPXD;
+		pxdlock->pxd = sp->header.self;
+		pxdlock->index = 1;
+
+		/*
+		 * Update directory index table to reflect new page address
+		 */
+		if (DO_INDEX(ip)) {
+			s64 lblock;
+
+			mp = NULL;
+			stbl = DT_GETSTBL(sp);
+			for (n = 0; n < sp->header.nextindex; n++) {
+				ldtentry =
+				    (struct ldtentry *) & sp->slot[stbl[n]];
+				modify_index(tid, ip,
+					     le32_to_cpu(ldtentry->index),
+					     xaddr, n, &mp, &lblock);
+			}
+			if (mp)
+				release_metapage(mp);
+		}
+	}
+
+	/*
+	 *	extend the page
+	 */
+	sp->header.self = *pxd;
+
+	jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp);
+
+	BT_MARK_DIRTY(smp, ip);
+	/*
+	 * acquire a transaction lock on the extended/leaf page
+	 */
+	tlck = txLock(tid, ip, smp, tlckDTREE | type);
+	dtlck = (struct dt_lock *) & tlck->lock;
+	lv = & dtlck->lv[0];
+
+	/* update buffer extent descriptor of extended page */
+	xlen = lengthPXD(pxd);
+	xsize = xlen << JFS_SBI(sb)->l2bsize;
+
+	/*
+	 * copy old stbl to new stbl at start of extended area
+	 */
+	oldstblindex = sp->header.stblindex;
+	oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
+	newstblindex = sp->header.maxslot;
+	n = xsize >> L2DTSLOTSIZE;
+	newstblsize = (n + 31) >> L2DTSLOTSIZE;
+	memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
+	       sp->header.nextindex);
+
+	/*
+	 * in-line extension: linelock old area of extended page
+	 */
+	if (type == tlckEXTEND) {
+		/* linelock header */
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+		lv++;
+
+		/* linelock new stbl of extended page */
+		lv->offset = newstblindex;
+		lv->length = newstblsize;
+	}
+	/*
+	 * relocation: linelock whole relocated area
+	 */
+	else {
+		lv->offset = 0;
+		lv->length = sp->header.maxslot + newstblsize;
+	}
+
+	dtlck->index++;
+
+	sp->header.maxslot = n;
+	sp->header.stblindex = newstblindex;
+	/* sp->header.nextindex remains the same */
+
+	/*
+	 * add old stbl region at head of freelist
+	 */
+	fsi = oldstblindex;
+	f = &sp->slot[fsi];
+	last = sp->header.freelist;
+	for (n = 0; n < oldstblsize; n++, fsi++, f++) {
+		f->next = last;
+		last = fsi;
+	}
+	sp->header.freelist = last;
+	sp->header.freecnt += oldstblsize;
+
+	/*
+	 * append free region of newly extended area at tail of freelist
+	 */
+	/* init free region of newly extended area */
+	fsi = n = newstblindex + newstblsize;
+	f = &sp->slot[fsi];
+	for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
+		f->next = fsi;
+	f->next = -1;
+
+	/* append new free region at tail of old freelist */
+	fsi = sp->header.freelist;
+	if (fsi == -1)
+		sp->header.freelist = n;
+	else {
+		do {
+			f = &sp->slot[fsi];
+			fsi = f->next;
+		} while (fsi != -1);
+
+		f->next = n;
+	}
+
+	sp->header.freecnt += sp->header.maxslot - n;
+
+	/*
+	 * insert the new entry
+	 */
+	dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
+
+	BT_MARK_DIRTY(pmp, ip);
+	/*
+	 * linelock any freeslots residing in old extent
+	 */
+	if (type == tlckEXTEND) {
+		n = sp->header.maxslot >> 2;
+		if (sp->header.freelist < n)
+			dtLinelockFreelist(sp, n, &dtlck);
+	}
+
+	/*
+	 *	update parent entry on the parent/root page
+	 */
+	/*
+	 * acquire a transaction lock on the parent/root page
+	 */
+	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
+	dtlck = (struct dt_lock *) & tlck->lock;
+	lv = & dtlck->lv[dtlck->index];
+
+	/* linelock parent entry - 1st slot */
+	lv->offset = 1;
+	lv->length = 1;
+	dtlck->index++;
+
+	/* update the parent pxd for page extension */
+	tpxd = (pxd_t *) & pp->slot[1];
+	*tpxd = *pxd;
+
+	DT_PUTPAGE(pmp);
+	return 0;
+}
+
+
+/*
+ *	dtSplitRoot()
+ *
+ * function:
+ *	split the full root page into
+ *	original/root/split page and new right page
+ *	i.e., root remains fixed in tree anchor (inode) and
+ *	the root is copied to a single new right child page
+ *	since root page << non-root page, and
+ *	the split root page contains a single entry for the
+ *	new right child page.
+ *
+ * parameter:
+ *
+ * return: 0 - success;
+ *	   errno - failure;
+ *	return new page pinned;
+ */
+static int dtSplitRoot(tid_t tid,
+	    struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
+{
+	struct super_block *sb = ip->i_sb;
+	struct metapage *smp;
+	dtroot_t *sp;
+	struct metapage *rmp;
+	dtpage_t *rp;
+	s64 rbn;
+	int xlen;
+	int xsize;
+	struct dtslot *f;
+	s8 *stbl;
+	int fsi, stblsize, n;
+	struct idtentry *s;
+	pxd_t *ppxd;
+	struct pxdlist *pxdlist;
+	pxd_t *pxd;
+	struct dt_lock *dtlck;
+	struct tlock *tlck;
+	struct lv *lv;
+	int rc;
+
+	/* get split root page */
+	smp = split->mp;
+	sp = &JFS_IP(ip)->i_dtroot;
+
+	/*
+	 *	allocate/initialize a single (right) child page
+	 *
+	 * N.B. at first split, a one (or two) block to fit new entry
+	 * is allocated; at subsequent split, a full page is allocated;
+	 */
+	pxdlist = split->pxdlist;
+	pxd = &pxdlist->pxd[pxdlist->npxd];
+	pxdlist->npxd++;
+	rbn = addressPXD(pxd);
+	xlen = lengthPXD(pxd);
+	xsize = xlen << JFS_SBI(sb)->l2bsize;
+	rmp = get_metapage(ip, rbn, xsize, 1);
+	if (!rmp)
+		return -EIO;
+
+	rp = rmp->data;
+
+	/* Allocate blocks to quota. */
+	rc = dquot_alloc_block(ip, lengthPXD(pxd));
+	if (rc) {
+		release_metapage(rmp);
+		return rc;
+	}
+
+	BT_MARK_DIRTY(rmp, ip);
+	/*
+	 * acquire a transaction lock on the new right page
+	 */
+	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
+	dtlck = (struct dt_lock *) & tlck->lock;
+
+	rp->header.flag =
+	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
+	rp->header.self = *pxd;
+
+	/* initialize sibling pointers */
+	rp->header.next = 0;
+	rp->header.prev = 0;
+
+	/*
+	 *	move in-line root page into new right page extent
+	 */
+	/* linelock header + copied entries + new stbl (1st slot) in new page */
+	ASSERT(dtlck->index == 0);
+	lv = & dtlck->lv[0];
+	lv->offset = 0;
+	lv->length = 10;	/* 1 + 8 + 1 */
+	dtlck->index++;
+
+	n = xsize >> L2DTSLOTSIZE;
+	rp->header.maxslot = n;
+	stblsize = (n + 31) >> L2DTSLOTSIZE;
+
+	/* copy old stbl to new stbl at start of extended area */
+	rp->header.stblindex = DTROOTMAXSLOT;
+	stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
+	memcpy(stbl, sp->header.stbl, sp->header.nextindex);
+	rp->header.nextindex = sp->header.nextindex;
+
+	/* copy old data area to start of new data area */
+	memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
+
+	/*
+	 * append free region of newly extended area at tail of freelist
+	 */
+	/* init free region of newly extended area */
+	fsi = n = DTROOTMAXSLOT + stblsize;
+	f = &rp->slot[fsi];
+	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+		f->next = fsi;
+	f->next = -1;
+
+	/* append new free region at tail of old freelist */
+	fsi = sp->header.freelist;
+	if (fsi == -1)
+		rp->header.freelist = n;
+	else {
+		rp->header.freelist = fsi;
+
+		do {
+			f = &rp->slot[fsi];
+			fsi = f->next;
+		} while (fsi != -1);
+
+		f->next = n;
+	}
+
+	rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
+
+	/*
+	 * Update directory index table for entries now in right page
+	 */
+	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
+		s64 lblock;
+		struct metapage *mp = NULL;
+		struct ldtentry *ldtentry;
+
+		stbl = DT_GETSTBL(rp);
+		for (n = 0; n < rp->header.nextindex; n++) {
+			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
+			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
+				     rbn, n, &mp, &lblock);
+		}
+		if (mp)
+			release_metapage(mp);
+	}
+	/*
+	 * insert the new entry into the new right/child page
+	 * (skip index in the new right page will not change)
+	 */
+	dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
+
+	/*
+	 *	reset parent/root page
+	 *
+	 * set the 1st entry offset to 0, which force the left-most key
+	 * at any level of the tree to be less than any search key.
+	 *
+	 * The btree comparison code guarantees that the left-most key on any
+	 * level of the tree is never used, so it doesn't need to be filled in.
+	 */
+	BT_MARK_DIRTY(smp, ip);
+	/*
+	 * acquire a transaction lock on the root page (in-memory inode)
+	 */
+	tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
+	dtlck = (struct dt_lock *) & tlck->lock;
+
+	/* linelock root */
+	ASSERT(dtlck->index == 0);
+	lv = & dtlck->lv[0];
+	lv->offset = 0;
+	lv->length = DTROOTMAXSLOT;
+	dtlck->index++;
+
+	/* update page header of root */
+	if (sp->header.flag & BT_LEAF) {
+		sp->header.flag &= ~BT_LEAF;
+		sp->header.flag |= BT_INTERNAL;
+	}
+
+	/* init the first entry */
+	s = (struct idtentry *) & sp->slot[DTENTRYSTART];
+	ppxd = (pxd_t *) s;
+	*ppxd = *pxd;
+	s->next = -1;
+	s->namlen = 0;
+
+	stbl = sp->header.stbl;
+	stbl[0] = DTENTRYSTART;
+	sp->header.nextindex = 1;
+
+	/* init freelist */
+	fsi = DTENTRYSTART + 1;
+	f = &sp->slot[fsi];
+
+	/* init free region of remaining area */
+	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
+		f->next = fsi;
+	f->next = -1;
+
+	sp->header.freelist = DTENTRYSTART + 1;
+	sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
+
+	*rmpp = rmp;
+
+	return 0;
+}
+
+
+/*
+ *	dtDelete()
+ *
+ * function: delete the entry(s) referenced by a key.
+ *
+ * parameter:
+ *
+ * return:
+ */
+int dtDelete(tid_t tid,
+	 struct inode *ip, struct component_name * key, ino_t * ino, int flag)
+{
+	int rc = 0;
+	s64 bn;
+	struct metapage *mp, *imp;
+	dtpage_t *p;
+	int index;
+	struct btstack btstack;
+	struct dt_lock *dtlck;
+	struct tlock *tlck;
+	struct lv *lv;
+	int i;
+	struct ldtentry *ldtentry;
+	u8 *stbl;
+	u32 table_index, next_index;
+	struct metapage *nmp;
+	dtpage_t *np;
+
+	/*
+	 *	search for the entry to delete:
+	 *
+	 * dtSearch() returns (leaf page pinned, index at which to delete).
+	 */
+	if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
+		return rc;
+
+	/* retrieve search result */
+	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+	/*
+	 * We need to find put the index of the next entry into the
+	 * directory index table in order to resume a readdir from this
+	 * entry.
+	 */
+	if (DO_INDEX(ip)) {
+		stbl = DT_GETSTBL(p);
+		ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
+		table_index = le32_to_cpu(ldtentry->index);
+		if (index == (p->header.nextindex - 1)) {
+			/*
+			 * Last entry in this leaf page
+			 */
+			if ((p->header.flag & BT_ROOT)
+			    || (p->header.next == 0))
+				next_index = -1;
+			else {
+				/* Read next leaf page */
+				DT_GETPAGE(ip, le64_to_cpu(p->header.next),
+					   nmp, PSIZE, np, rc);
+				if (rc)
+					next_index = -1;
+				else {
+					stbl = DT_GETSTBL(np);
+					ldtentry =
+					    (struct ldtentry *) & np->
+					    slot[stbl[0]];
+					next_index =
+					    le32_to_cpu(ldtentry->index);
+					DT_PUTPAGE(nmp);
+				}
+			}
+		} else {
+			ldtentry =
+			    (struct ldtentry *) & p->slot[stbl[index + 1]];
+			next_index = le32_to_cpu(ldtentry->index);
+		}
+		free_index(tid, ip, table_index, next_index);
+	}
+	/*
+	 * the leaf page becomes empty, delete the page
+	 */
+	if (p->header.nextindex == 1) {
+		/* delete empty page */
+		rc = dtDeleteUp(tid, ip, mp, p, &btstack);
+	}
+	/*
+	 * the leaf page has other entries remaining:
+	 *
+	 * delete the entry from the leaf page.
+	 */
+	else {
+		BT_MARK_DIRTY(mp, ip);
+		/*
+		 * acquire a transaction lock on the leaf page
+		 */
+		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+		dtlck = (struct dt_lock *) & tlck->lock;
+
+		/*
+		 * Do not assume that dtlck->index will be zero.  During a
+		 * rename within a directory, this transaction may have
+		 * modified this page already when adding the new entry.
+		 */
+
+		/* linelock header */
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (struct dt_lock *) txLinelock(dtlck);
+		lv = & dtlck->lv[dtlck->index];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		/* linelock stbl of non-root leaf page */
+		if (!(p->header.flag & BT_ROOT)) {
+			if (dtlck->index >= dtlck->maxcnt)
+				dtlck = (struct dt_lock *) txLinelock(dtlck);
+			lv = & dtlck->lv[dtlck->index];
+			i = index >> L2DTSLOTSIZE;
+			lv->offset = p->header.stblindex + i;
+			lv->length =
+			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
+			    i + 1;
+			dtlck->index++;
+		}
+
+		/* free the leaf entry */
+		dtDeleteEntry(p, index, &dtlck);
+
+		/*
+		 * Update directory index table for entries moved in stbl
+		 */
+		if (DO_INDEX(ip) && index < p->header.nextindex) {
+			s64 lblock;
+
+			imp = NULL;
+			stbl = DT_GETSTBL(p);
+			for (i = index; i < p->header.nextindex; i++) {
+				ldtentry =
+				    (struct ldtentry *) & p->slot[stbl[i]];
+				modify_index(tid, ip,
+					     le32_to_cpu(ldtentry->index),
+					     bn, i, &imp, &lblock);
+			}
+			if (imp)
+				release_metapage(imp);
+		}
+
+		DT_PUTPAGE(mp);
+	}
+
+	return rc;
+}
+
+
+/*
+ *	dtDeleteUp()
+ *
+ * function:
+ *	free empty pages as propagating deletion up the tree
+ *
+ * parameter:
+ *
+ * return:
+ */
+static int dtDeleteUp(tid_t tid, struct inode *ip,
+	   struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
+{
+	int rc = 0;
+	struct metapage *mp;
+	dtpage_t *p;
+	int index, nextindex;
+	int xlen;
+	struct btframe *parent;
+	struct dt_lock *dtlck;
+	struct tlock *tlck;
+	struct lv *lv;
+	struct pxd_lock *pxdlock;
+	int i;
+
+	/*
+	 *	keep the root leaf page which has become empty
+	 */
+	if (BT_IS_ROOT(fmp)) {
+		/*
+		 * reset the root
+		 *
+		 * dtInitRoot() acquires txlock on the root
+		 */
+		dtInitRoot(tid, ip, PARENT(ip));
+
+		DT_PUTPAGE(fmp);
+
+		return 0;
+	}
+
+	/*
+	 *	free the non-root leaf page
+	 */
+	/*
+	 * acquire a transaction lock on the page
+	 *
+	 * write FREEXTENT|NOREDOPAGE log record
+	 * N.B. linelock is overlaid as freed extent descriptor, and
+	 * the buffer page is freed;
+	 */
+	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
+	pxdlock = (struct pxd_lock *) & tlck->lock;
+	pxdlock->flag = mlckFREEPXD;
+	pxdlock->pxd = fp->header.self;
+	pxdlock->index = 1;
+
+	/* update sibling pointers */
+	if ((rc = dtRelink(tid, ip, fp))) {
+		BT_PUTPAGE(fmp);
+		return rc;
+	}
+
+	xlen = lengthPXD(&fp->header.self);
+
+	/* Free quota allocation. */
+	dquot_free_block(ip, xlen);
+
+	/* free/invalidate its buffer page */
+	discard_metapage(fmp);
+
+	/*
+	 *	propagate page deletion up the directory tree
+	 *
+	 * If the delete from the parent page makes it empty,
+	 * continue all the way up the tree.
+	 * stop if the root page is reached (which is never deleted) or
+	 * if the entry deletion does not empty the page.
+	 */
+	while ((parent = BT_POP(btstack)) != NULL) {
+		/* pin the parent page <sp> */
+		DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/*
+		 * free the extent of the child page deleted
+		 */
+		index = parent->index;
+
+		/*
+		 * delete the entry for the child page from parent
+		 */
+		nextindex = p->header.nextindex;
+
+		/*
+		 * the parent has the single entry being deleted:
+		 *
+		 * free the parent page which has become empty.
+		 */
+		if (nextindex == 1) {
+			/*
+			 * keep the root internal page which has become empty
+			 */
+			if (p->header.flag & BT_ROOT) {
+				/*
+				 * reset the root
+				 *
+				 * dtInitRoot() acquires txlock on the root
+				 */
+				dtInitRoot(tid, ip, PARENT(ip));
+
+				DT_PUTPAGE(mp);
+
+				return 0;
+			}
+			/*
+			 * free the parent page
+			 */
+			else {
+				/*
+				 * acquire a transaction lock on the page
+				 *
+				 * write FREEXTENT|NOREDOPAGE log record
+				 */
+				tlck =
+				    txMaplock(tid, ip,
+					      tlckDTREE | tlckFREE);
+				pxdlock = (struct pxd_lock *) & tlck->lock;
+				pxdlock->flag = mlckFREEPXD;
+				pxdlock->pxd = p->header.self;
+				pxdlock->index = 1;
+
+				/* update sibling pointers */
+				if ((rc = dtRelink(tid, ip, p))) {
+					DT_PUTPAGE(mp);
+					return rc;
+				}
+
+				xlen = lengthPXD(&p->header.self);
+
+				/* Free quota allocation */
+				dquot_free_block(ip, xlen);
+
+				/* free/invalidate its buffer page */
+				discard_metapage(mp);
+
+				/* propagate up */
+				continue;
+			}
+		}
+
+		/*
+		 * the parent has other entries remaining:
+		 *
+		 * delete the router entry from the parent page.
+		 */
+		BT_MARK_DIRTY(mp, ip);
+		/*
+		 * acquire a transaction lock on the page
+		 *
+		 * action: router entry deletion
+		 */
+		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+		dtlck = (struct dt_lock *) & tlck->lock;
+
+		/* linelock header */
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (struct dt_lock *) txLinelock(dtlck);
+		lv = & dtlck->lv[dtlck->index];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		/* linelock stbl of non-root leaf page */
+		if (!(p->header.flag & BT_ROOT)) {
+			if (dtlck->index < dtlck->maxcnt)
+				lv++;
+			else {
+				dtlck = (struct dt_lock *) txLinelock(dtlck);
+				lv = & dtlck->lv[0];
+			}
+			i = index >> L2DTSLOTSIZE;
+			lv->offset = p->header.stblindex + i;
+			lv->length =
+			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
+			    i + 1;
+			dtlck->index++;
+		}
+
+		/* free the router entry */
+		dtDeleteEntry(p, index, &dtlck);
+
+		/* reset key of new leftmost entry of level (for consistency) */
+		if (index == 0 &&
+		    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
+			dtTruncateEntry(p, 0, &dtlck);
+
+		/* unpin the parent page */
+		DT_PUTPAGE(mp);
+
+		/* exit propagation up */
+		break;
+	}
+
+	if (!DO_INDEX(ip))
+		ip->i_size -= PSIZE;
+
+	return 0;
+}
+
+#ifdef _NOTYET
+/*
+ * NAME:	dtRelocate()
+ *
+ * FUNCTION:	relocate dtpage (internal or leaf) of directory;
+ *		This function is mainly used by defragfs utility.
+ */
+int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
+	       s64 nxaddr)
+{
+	int rc = 0;
+	struct metapage *mp, *pmp, *lmp, *rmp;
+	dtpage_t *p, *pp, *rp = 0, *lp= 0;
+	s64 bn;
+	int index;
+	struct btstack btstack;
+	pxd_t *pxd;
+	s64 oxaddr, nextbn, prevbn;
+	int xlen, xsize;
+	struct tlock *tlck;
+	struct dt_lock *dtlck;
+	struct pxd_lock *pxdlock;
+	s8 *stbl;
+	struct lv *lv;
+
+	oxaddr = addressPXD(opxd);
+	xlen = lengthPXD(opxd);
+
+	jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d",
+		   (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
+		   xlen);
+
+	/*
+	 *	1. get the internal parent dtpage covering
+	 *	router entry for the tartget page to be relocated;
+	 */
+	rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
+	if (rc)
+		return rc;
+
+	/* retrieve search result */
+	DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+	jfs_info("dtRelocate: parent router entry validated.");
+
+	/*
+	 *	2. relocate the target dtpage
+	 */
+	/* read in the target page from src extent */
+	DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
+	if (rc) {
+		/* release the pinned parent page */
+		DT_PUTPAGE(pmp);
+		return rc;
+	}
+
+	/*
+	 * read in sibling pages if any to update sibling pointers;
+	 */
+	rmp = NULL;
+	if (p->header.next) {
+		nextbn = le64_to_cpu(p->header.next);
+		DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
+		if (rc) {
+			DT_PUTPAGE(mp);
+			DT_PUTPAGE(pmp);
+			return (rc);
+		}
+	}
+
+	lmp = NULL;
+	if (p->header.prev) {
+		prevbn = le64_to_cpu(p->header.prev);
+		DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
+		if (rc) {
+			DT_PUTPAGE(mp);
+			DT_PUTPAGE(pmp);
+			if (rmp)
+				DT_PUTPAGE(rmp);
+			return (rc);
+		}
+	}
+
+	/* at this point, all xtpages to be updated are in memory */
+
+	/*
+	 * update sibling pointers of sibling dtpages if any;
+	 */
+	if (lmp) {
+		tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
+		dtlck = (struct dt_lock *) & tlck->lock;
+		/* linelock header */
+		ASSERT(dtlck->index == 0);
+		lv = & dtlck->lv[0];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		lp->header.next = cpu_to_le64(nxaddr);
+		DT_PUTPAGE(lmp);
+	}
+
+	if (rmp) {
+		tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
+		dtlck = (struct dt_lock *) & tlck->lock;
+		/* linelock header */
+		ASSERT(dtlck->index == 0);
+		lv = & dtlck->lv[0];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		rp->header.prev = cpu_to_le64(nxaddr);
+		DT_PUTPAGE(rmp);
+	}
+
+	/*
+	 * update the target dtpage to be relocated
+	 *
+	 * write LOG_REDOPAGE of LOG_NEW type for dst page
+	 * for the whole target page (logredo() will apply
+	 * after image and update bmap for allocation of the
+	 * dst extent), and update bmap for allocation of
+	 * the dst extent;
+	 */
+	tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
+	dtlck = (struct dt_lock *) & tlck->lock;
+	/* linelock header */
+	ASSERT(dtlck->index == 0);
+	lv = & dtlck->lv[0];
+
+	/* update the self address in the dtpage header */
+	pxd = &p->header.self;
+	PXDaddress(pxd, nxaddr);
+
+	/* the dst page is the same as the src page, i.e.,
+	 * linelock for afterimage of the whole page;
+	 */
+	lv->offset = 0;
+	lv->length = p->header.maxslot;
+	dtlck->index++;
+
+	/* update the buffer extent descriptor of the dtpage */
+	xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+
+	/* unpin the relocated page */
+	DT_PUTPAGE(mp);
+	jfs_info("dtRelocate: target dtpage relocated.");
+
+	/* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
+	 * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
+	 * will also force a bmap update ).
+	 */
+
+	/*
+	 *	3. acquire maplock for the source extent to be freed;
+	 */
+	/* for dtpage relocation, write a LOG_NOREDOPAGE record
+	 * for the source dtpage (logredo() will init NoRedoPage
+	 * filter and will also update bmap for free of the source
+	 * dtpage), and upadte bmap for free of the source dtpage;
+	 */
+	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
+	pxdlock = (struct pxd_lock *) & tlck->lock;
+	pxdlock->flag = mlckFREEPXD;
+	PXDaddress(&pxdlock->pxd, oxaddr);
+	PXDlength(&pxdlock->pxd, xlen);
+	pxdlock->index = 1;
+
+	/*
+	 *	4. update the parent router entry for relocation;
+	 *
+	 * acquire tlck for the parent entry covering the target dtpage;
+	 * write LOG_REDOPAGE to apply after image only;
+	 */
+	jfs_info("dtRelocate: update parent router entry.");
+	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
+	dtlck = (struct dt_lock *) & tlck->lock;
+	lv = & dtlck->lv[dtlck->index];
+
+	/* update the PXD with the new address */
+	stbl = DT_GETSTBL(pp);
+	pxd = (pxd_t *) & pp->slot[stbl[index]];
+	PXDaddress(pxd, nxaddr);
+	lv->offset = stbl[index];
+	lv->length = 1;
+	dtlck->index++;
+
+	/* unpin the parent dtpage */
+	DT_PUTPAGE(pmp);
+
+	return rc;
+}
+
+/*
+ * NAME:	dtSearchNode()
+ *
+ * FUNCTION:	Search for an dtpage containing a specified address
+ *		This function is mainly used by defragfs utility.
+ *
+ * NOTE:	Search result on stack, the found page is pinned at exit.
+ *		The result page must be an internal dtpage.
+ *		lmxaddr give the address of the left most page of the
+ *		dtree level, in which the required dtpage resides.
+ */
+static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
+			struct btstack * btstack)
+{
+	int rc = 0;
+	s64 bn;
+	struct metapage *mp;
+	dtpage_t *p;
+	int psize = 288;	/* initial in-line directory */
+	s8 *stbl;
+	int i;
+	pxd_t *pxd;
+	struct btframe *btsp;
+
+	BT_CLR(btstack);	/* reset stack */
+
+	/*
+	 *	descend tree to the level with specified leftmost page
+	 *
+	 *  by convention, root bn = 0.
+	 */
+	for (bn = 0;;) {
+		/* get/pin the page to search */
+		DT_GETPAGE(ip, bn, mp, psize, p, rc);
+		if (rc)
+			return rc;
+
+		/* does the xaddr of leftmost page of the levevl
+		 * matches levevl search key ?
+		 */
+		if (p->header.flag & BT_ROOT) {
+			if (lmxaddr == 0)
+				break;
+		} else if (addressPXD(&p->header.self) == lmxaddr)
+			break;
+
+		/*
+		 * descend down to leftmost child page
+		 */
+		if (p->header.flag & BT_LEAF) {
+			DT_PUTPAGE(mp);
+			return -ESTALE;
+		}
+
+		/* get the leftmost entry */
+		stbl = DT_GETSTBL(p);
+		pxd = (pxd_t *) & p->slot[stbl[0]];
+
+		/* get the child page block address */
+		bn = addressPXD(pxd);
+		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
+		/* unpin the parent page */
+		DT_PUTPAGE(mp);
+	}
+
+	/*
+	 *	search each page at the current levevl
+	 */
+      loop:
+	stbl = DT_GETSTBL(p);
+	for (i = 0; i < p->header.nextindex; i++) {
+		pxd = (pxd_t *) & p->slot[stbl[i]];
+
+		/* found the specified router entry */
+		if (addressPXD(pxd) == addressPXD(kpxd) &&
+		    lengthPXD(pxd) == lengthPXD(kpxd)) {
+			btsp = btstack->top;
+			btsp->bn = bn;
+			btsp->index = i;
+			btsp->mp = mp;
+
+			return 0;
+		}
+	}
+
+	/* get the right sibling page if any */
+	if (p->header.next)
+		bn = le64_to_cpu(p->header.next);
+	else {
+		DT_PUTPAGE(mp);
+		return -ESTALE;
+	}
+
+	/* unpin current page */
+	DT_PUTPAGE(mp);
+
+	/* get the right sibling page */
+	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+	if (rc)
+		return rc;
+
+	goto loop;
+}
+#endif /* _NOTYET */
+
+/*
+ *	dtRelink()
+ *
+ * function:
+ *	link around a freed page.
+ *
+ * parameter:
+ *	fp:	page to be freed
+ *
+ * return:
+ */
+static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
+{
+	int rc;
+	struct metapage *mp;
+	s64 nextbn, prevbn;
+	struct tlock *tlck;
+	struct dt_lock *dtlck;
+	struct lv *lv;
+
+	nextbn = le64_to_cpu(p->header.next);
+	prevbn = le64_to_cpu(p->header.prev);
+
+	/* update prev pointer of the next page */
+	if (nextbn != 0) {
+		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		BT_MARK_DIRTY(mp, ip);
+		/*
+		 * acquire a transaction lock on the next page
+		 *
+		 * action: update prev pointer;
+		 */
+		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
+		jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
+			tlck, ip, mp);
+		dtlck = (struct dt_lock *) & tlck->lock;
+
+		/* linelock header */
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (struct dt_lock *) txLinelock(dtlck);
+		lv = & dtlck->lv[dtlck->index];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		p->header.prev = cpu_to_le64(prevbn);
+		DT_PUTPAGE(mp);
+	}
+
+	/* update next pointer of the previous page */
+	if (prevbn != 0) {
+		DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		BT_MARK_DIRTY(mp, ip);
+		/*
+		 * acquire a transaction lock on the prev page
+		 *
+		 * action: update next pointer;
+		 */
+		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
+		jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
+			tlck, ip, mp);
+		dtlck = (struct dt_lock *) & tlck->lock;
+
+		/* linelock header */
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (struct dt_lock *) txLinelock(dtlck);
+		lv = & dtlck->lv[dtlck->index];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		p->header.next = cpu_to_le64(nextbn);
+		DT_PUTPAGE(mp);
+	}
+
+	return 0;
+}
+
+
+/*
+ *	dtInitRoot()
+ *
+ * initialize directory root (inline in inode)
+ */
+void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	dtroot_t *p;
+	int fsi;
+	struct dtslot *f;
+	struct tlock *tlck;
+	struct dt_lock *dtlck;
+	struct lv *lv;
+	u16 xflag_save;
+
+	/*
+	 * If this was previously an non-empty directory, we need to remove
+	 * the old directory table.
+	 */
+	if (DO_INDEX(ip)) {
+		if (!jfs_dirtable_inline(ip)) {
+			struct tblock *tblk = tid_to_tblock(tid);
+			/*
+			 * We're playing games with the tid's xflag.  If
+			 * we're removing a regular file, the file's xtree
+			 * is committed with COMMIT_PMAP, but we always
+			 * commit the directories xtree with COMMIT_PWMAP.
+			 */
+			xflag_save = tblk->xflag;
+			tblk->xflag = 0;
+			/*
+			 * xtTruncate isn't guaranteed to fully truncate
+			 * the xtree.  The caller needs to check i_size
+			 * after committing the transaction to see if
+			 * additional truncation is needed.  The
+			 * COMMIT_Stale flag tells caller that we
+			 * initiated the truncation.
+			 */
+			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+			set_cflag(COMMIT_Stale, ip);
+
+			tblk->xflag = xflag_save;
+		} else
+			ip->i_size = 1;
+
+		jfs_ip->next_index = 2;
+	} else
+		ip->i_size = IDATASIZE;
+
+	/*
+	 * acquire a transaction lock on the root
+	 *
+	 * action: directory initialization;
+	 */
+	tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
+		      tlckDTREE | tlckENTRY | tlckBTROOT);
+	dtlck = (struct dt_lock *) & tlck->lock;
+
+	/* linelock root */
+	ASSERT(dtlck->index == 0);
+	lv = & dtlck->lv[0];
+	lv->offset = 0;
+	lv->length = DTROOTMAXSLOT;
+	dtlck->index++;
+
+	p = &jfs_ip->i_dtroot;
+
+	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
+
+	p->header.nextindex = 0;
+
+	/* init freelist */
+	fsi = 1;
+	f = &p->slot[fsi];
+
+	/* init data area of root */
+	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
+		f->next = fsi;
+	f->next = -1;
+
+	p->header.freelist = 1;
+	p->header.freecnt = 8;
+
+	/* init '..' entry */
+	p->header.idotdot = cpu_to_le32(idotdot);
+
+	return;
+}
+
+/*
+ *	add_missing_indices()
+ *
+ * function: Fix dtree page in which one or more entries has an invalid index.
+ *	     fsck.jfs should really fix this, but it currently does not.
+ *	     Called from jfs_readdir when bad index is detected.
+ */
+static void add_missing_indices(struct inode *inode, s64 bn)
+{
+	struct ldtentry *d;
+	struct dt_lock *dtlck;
+	int i;
+	uint index;
+	struct lv *lv;
+	struct metapage *mp;
+	dtpage_t *p;
+	int rc;
+	s8 *stbl;
+	tid_t tid;
+	struct tlock *tlck;
+
+	tid = txBegin(inode->i_sb, 0);
+
+	DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
+
+	if (rc) {
+		printk(KERN_ERR "DT_GETPAGE failed!\n");
+		goto end;
+	}
+	BT_MARK_DIRTY(mp, inode);
+
+	ASSERT(p->header.flag & BT_LEAF);
+
+	tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
+	if (BT_IS_ROOT(mp))
+		tlck->type |= tlckBTROOT;
+
+	dtlck = (struct dt_lock *) &tlck->lock;
+
+	stbl = DT_GETSTBL(p);
+	for (i = 0; i < p->header.nextindex; i++) {
+		d = (struct ldtentry *) &p->slot[stbl[i]];
+		index = le32_to_cpu(d->index);
+		if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
+			d->index = cpu_to_le32(add_index(tid, inode, bn, i));
+			if (dtlck->index >= dtlck->maxcnt)
+				dtlck = (struct dt_lock *) txLinelock(dtlck);
+			lv = &dtlck->lv[dtlck->index];
+			lv->offset = stbl[i];
+			lv->length = 1;
+			dtlck->index++;
+		}
+	}
+
+	DT_PUTPAGE(mp);
+	(void) txCommit(tid, 1, &inode, 0);
+end:
+	txEnd(tid);
+}
+
+/*
+ * Buffer to hold directory entry info while traversing a dtree page
+ * before being fed to the filldir function
+ */
+struct jfs_dirent {
+	loff_t position;
+	int ino;
+	u16 name_len;
+	char name[0];
+};
+
+/*
+ * function to determine next variable-sized jfs_dirent in buffer
+ */
+static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
+{
+	return (struct jfs_dirent *)
+		((char *)dirent +
+		 ((sizeof (struct jfs_dirent) + dirent->name_len + 1 +
+		   sizeof (loff_t) - 1) &
+		  ~(sizeof (loff_t) - 1)));
+}
+
+/*
+ *	jfs_readdir()
+ *
+ * function: read directory entries sequentially
+ *	from the specified entry offset
+ *
+ * parameter:
+ *
+ * return: offset = (pn, index) of start entry
+ *	of next jfs_readdir()/dtRead()
+ */
+int jfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *ip = file_inode(file);
+	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
+	int rc = 0;
+	loff_t dtpos;	/* legacy OS/2 style position */
+	struct dtoffset {
+		s16 pn;
+		s16 index;
+		s32 unused;
+	} *dtoffset = (struct dtoffset *) &dtpos;
+	s64 bn;
+	struct metapage *mp;
+	dtpage_t *p;
+	int index;
+	s8 *stbl;
+	struct btstack btstack;
+	int i, next;
+	struct ldtentry *d;
+	struct dtslot *t;
+	int d_namleft, len, outlen;
+	unsigned long dirent_buf;
+	char *name_ptr;
+	u32 dir_index;
+	int do_index = 0;
+	uint loop_count = 0;
+	struct jfs_dirent *jfs_dirent;
+	int jfs_dirents;
+	int overflow, fix_page, page_fixed = 0;
+	static int unique_pos = 2;	/* If we can't fix broken index */
+
+	if (ctx->pos == DIREND)
+		return 0;
+
+	if (DO_INDEX(ip)) {
+		/*
+		 * persistent index is stored in directory entries.
+		 * Special cases:	 0 = .
+		 *			 1 = ..
+		 *			-1 = End of directory
+		 */
+		do_index = 1;
+
+		dir_index = (u32) ctx->pos;
+
+		/*
+		 * NFSv4 reserves cookies 1 and 2 for . and .. so the value
+		 * we return to the vfs is one greater than the one we use
+		 * internally.
+		 */
+		if (dir_index)
+			dir_index--;
+
+		if (dir_index > 1) {
+			struct dir_table_slot dirtab_slot;
+
+			if (dtEmpty(ip) ||
+			    (dir_index >= JFS_IP(ip)->next_index)) {
+				/* Stale position.  Directory has shrunk */
+				ctx->pos = DIREND;
+				return 0;
+			}
+		      repeat:
+			rc = read_index(ip, dir_index, &dirtab_slot);
+			if (rc) {
+				ctx->pos = DIREND;
+				return rc;
+			}
+			if (dirtab_slot.flag == DIR_INDEX_FREE) {
+				if (loop_count++ > JFS_IP(ip)->next_index) {
+					jfs_err("jfs_readdir detected "
+						   "infinite loop!");
+					ctx->pos = DIREND;
+					return 0;
+				}
+				dir_index = le32_to_cpu(dirtab_slot.addr2);
+				if (dir_index == -1) {
+					ctx->pos = DIREND;
+					return 0;
+				}
+				goto repeat;
+			}
+			bn = addressDTS(&dirtab_slot);
+			index = dirtab_slot.slot;
+			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+			if (rc) {
+				ctx->pos = DIREND;
+				return 0;
+			}
+			if (p->header.flag & BT_INTERNAL) {
+				jfs_err("jfs_readdir: bad index table");
+				DT_PUTPAGE(mp);
+				ctx->pos = DIREND;
+				return 0;
+			}
+		} else {
+			if (dir_index == 0) {
+				/*
+				 * self "."
+				 */
+				ctx->pos = 1;
+				if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
+					return 0;
+			}
+			/*
+			 * parent ".."
+			 */
+			ctx->pos = 2;
+			if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
+				return 0;
+
+			/*
+			 * Find first entry of left-most leaf
+			 */
+			if (dtEmpty(ip)) {
+				ctx->pos = DIREND;
+				return 0;
+			}
+
+			if ((rc = dtReadFirst(ip, &btstack)))
+				return rc;
+
+			DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+		}
+	} else {
+		/*
+		 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
+		 *
+		 * pn = 0; index = 1:	First entry "."
+		 * pn = 0; index = 2:	Second entry ".."
+		 * pn > 0:		Real entries, pn=1 -> leftmost page
+		 * pn = index = -1:	No more entries
+		 */
+		dtpos = ctx->pos;
+		if (dtpos < 2) {
+			/* build "." entry */
+			ctx->pos = 1;
+			if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
+				return 0;
+			dtoffset->index = 2;
+			ctx->pos = dtpos;
+		}
+
+		if (dtoffset->pn == 0) {
+			if (dtoffset->index == 2) {
+				/* build ".." entry */
+				if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
+					return 0;
+			} else {
+				jfs_err("jfs_readdir called with "
+					"invalid offset!");
+			}
+			dtoffset->pn = 1;
+			dtoffset->index = 0;
+			ctx->pos = dtpos;
+		}
+
+		if (dtEmpty(ip)) {
+			ctx->pos = DIREND;
+			return 0;
+		}
+
+		if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
+			jfs_err("jfs_readdir: unexpected rc = %d "
+				"from dtReadNext", rc);
+			ctx->pos = DIREND;
+			return 0;
+		}
+		/* get start leaf page and index */
+		DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+		/* offset beyond directory eof ? */
+		if (bn < 0) {
+			ctx->pos = DIREND;
+			return 0;
+		}
+	}
+
+	dirent_buf = __get_free_page(GFP_KERNEL);
+	if (dirent_buf == 0) {
+		DT_PUTPAGE(mp);
+		jfs_warn("jfs_readdir: __get_free_page failed!");
+		ctx->pos = DIREND;
+		return -ENOMEM;
+	}
+
+	while (1) {
+		jfs_dirent = (struct jfs_dirent *) dirent_buf;
+		jfs_dirents = 0;
+		overflow = fix_page = 0;
+
+		stbl = DT_GETSTBL(p);
+
+		for (i = index; i < p->header.nextindex; i++) {
+			d = (struct ldtentry *) & p->slot[stbl[i]];
+
+			if (((long) jfs_dirent + d->namlen + 1) >
+			    (dirent_buf + PAGE_SIZE)) {
+				/* DBCS codepages could overrun dirent_buf */
+				index = i;
+				overflow = 1;
+				break;
+			}
+
+			d_namleft = d->namlen;
+			name_ptr = jfs_dirent->name;
+			jfs_dirent->ino = le32_to_cpu(d->inumber);
+
+			if (do_index) {
+				len = min(d_namleft, DTLHDRDATALEN);
+				jfs_dirent->position = le32_to_cpu(d->index);
+				/*
+				 * d->index should always be valid, but it
+				 * isn't.  fsck.jfs doesn't create the
+				 * directory index for the lost+found
+				 * directory.  Rather than let it go,
+				 * we can try to fix it.
+				 */
+				if ((jfs_dirent->position < 2) ||
+				    (jfs_dirent->position >=
+				     JFS_IP(ip)->next_index)) {
+					if (!page_fixed && !isReadOnly(ip)) {
+						fix_page = 1;
+						/*
+						 * setting overflow and setting
+						 * index to i will cause the
+						 * same page to be processed
+						 * again starting here
+						 */
+						overflow = 1;
+						index = i;
+						break;
+					}
+					jfs_dirent->position = unique_pos++;
+				}
+				/*
+				 * We add 1 to the index because we may
+				 * use a value of 2 internally, and NFSv4
+				 * doesn't like that.
+				 */
+				jfs_dirent->position++;
+			} else {
+				jfs_dirent->position = dtpos;
+				len = min(d_namleft, DTLHDRDATALEN_LEGACY);
+			}
+
+			/* copy the name of head/only segment */
+			outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
+						   codepage);
+			jfs_dirent->name_len = outlen;
+
+			/* copy name in the additional segment(s) */
+			next = d->next;
+			while (next >= 0) {
+				t = (struct dtslot *) & p->slot[next];
+				name_ptr += outlen;
+				d_namleft -= len;
+				/* Sanity Check */
+				if (d_namleft == 0) {
+					jfs_error(ip->i_sb,
+						  "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
+						  (long)ip->i_ino,
+						  (long long)bn,
+						  i);
+					goto skip_one;
+				}
+				len = min(d_namleft, DTSLOTDATALEN);
+				outlen = jfs_strfromUCS_le(name_ptr, t->name,
+							   len, codepage);
+				jfs_dirent->name_len += outlen;
+
+				next = t->next;
+			}
+
+			jfs_dirents++;
+			jfs_dirent = next_jfs_dirent(jfs_dirent);
+skip_one:
+			if (!do_index)
+				dtoffset->index++;
+		}
+
+		if (!overflow) {
+			/* Point to next leaf page */
+			if (p->header.flag & BT_ROOT)
+				bn = 0;
+			else {
+				bn = le64_to_cpu(p->header.next);
+				index = 0;
+				/* update offset (pn:index) for new page */
+				if (!do_index) {
+					dtoffset->pn++;
+					dtoffset->index = 0;
+				}
+			}
+			page_fixed = 0;
+		}
+
+		/* unpin previous leaf page */
+		DT_PUTPAGE(mp);
+
+		jfs_dirent = (struct jfs_dirent *) dirent_buf;
+		while (jfs_dirents--) {
+			ctx->pos = jfs_dirent->position;
+			if (!dir_emit(ctx, jfs_dirent->name,
+				    jfs_dirent->name_len,
+				    jfs_dirent->ino, DT_UNKNOWN))
+				goto out;
+			jfs_dirent = next_jfs_dirent(jfs_dirent);
+		}
+
+		if (fix_page) {
+			add_missing_indices(ip, bn);
+			page_fixed = 1;
+		}
+
+		if (!overflow && (bn == 0)) {
+			ctx->pos = DIREND;
+			break;
+		}
+
+		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc) {
+			free_page(dirent_buf);
+			return rc;
+		}
+	}
+
+      out:
+	free_page(dirent_buf);
+
+	return rc;
+}
+
+
+/*
+ *	dtReadFirst()
+ *
+ * function: get the leftmost page of the directory
+ */
+static int dtReadFirst(struct inode *ip, struct btstack * btstack)
+{
+	int rc = 0;
+	s64 bn;
+	int psize = 288;	/* initial in-line directory */
+	struct metapage *mp;
+	dtpage_t *p;
+	s8 *stbl;
+	struct btframe *btsp;
+	pxd_t *xd;
+
+	BT_CLR(btstack);	/* reset stack */
+
+	/*
+	 *	descend leftmost path of the tree
+	 *
+	 * by convention, root bn = 0.
+	 */
+	for (bn = 0;;) {
+		DT_GETPAGE(ip, bn, mp, psize, p, rc);
+		if (rc)
+			return rc;
+
+		/*
+		 * leftmost leaf page
+		 */
+		if (p->header.flag & BT_LEAF) {
+			/* return leftmost entry */
+			btsp = btstack->top;
+			btsp->bn = bn;
+			btsp->index = 0;
+			btsp->mp = mp;
+
+			return 0;
+		}
+
+		/*
+		 * descend down to leftmost child page
+		 */
+		if (BT_STACK_FULL(btstack)) {
+			DT_PUTPAGE(mp);
+			jfs_error(ip->i_sb, "btstack overrun\n");
+			BT_STACK_DUMP(btstack);
+			return -EIO;
+		}
+		/* push (bn, index) of the parent page/entry */
+		BT_PUSH(btstack, bn, 0);
+
+		/* get the leftmost entry */
+		stbl = DT_GETSTBL(p);
+		xd = (pxd_t *) & p->slot[stbl[0]];
+
+		/* get the child page block address */
+		bn = addressPXD(xd);
+		psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
+
+		/* unpin the parent page */
+		DT_PUTPAGE(mp);
+	}
+}
+
+
+/*
+ *	dtReadNext()
+ *
+ * function: get the page of the specified offset (pn:index)
+ *
+ * return: if (offset > eof), bn = -1;
+ *
+ * note: if index > nextindex of the target leaf page,
+ * start with 1st entry of next leaf page;
+ */
+static int dtReadNext(struct inode *ip, loff_t * offset,
+		      struct btstack * btstack)
+{
+	int rc = 0;
+	struct dtoffset {
+		s16 pn;
+		s16 index;
+		s32 unused;
+	} *dtoffset = (struct dtoffset *) offset;
+	s64 bn;
+	struct metapage *mp;
+	dtpage_t *p;
+	int index;
+	int pn;
+	s8 *stbl;
+	struct btframe *btsp, *parent;
+	pxd_t *xd;
+
+	/*
+	 * get leftmost leaf page pinned
+	 */
+	if ((rc = dtReadFirst(ip, btstack)))
+		return rc;
+
+	/* get leaf page */
+	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
+
+	/* get the start offset (pn:index) */
+	pn = dtoffset->pn - 1;	/* Now pn = 0 represents leftmost leaf */
+	index = dtoffset->index;
+
+	/* start at leftmost page ? */
+	if (pn == 0) {
+		/* offset beyond eof ? */
+		if (index < p->header.nextindex)
+			goto out;
+
+		if (p->header.flag & BT_ROOT) {
+			bn = -1;
+			goto out;
+		}
+
+		/* start with 1st entry of next leaf page */
+		dtoffset->pn++;
+		dtoffset->index = index = 0;
+		goto a;
+	}
+
+	/* start at non-leftmost page: scan parent pages for large pn */
+	if (p->header.flag & BT_ROOT) {
+		bn = -1;
+		goto out;
+	}
+
+	/* start after next leaf page ? */
+	if (pn > 1)
+		goto b;
+
+	/* get leaf page pn = 1 */
+      a:
+	bn = le64_to_cpu(p->header.next);
+
+	/* unpin leaf page */
+	DT_PUTPAGE(mp);
+
+	/* offset beyond eof ? */
+	if (bn == 0) {
+		bn = -1;
+		goto out;
+	}
+
+	goto c;
+
+	/*
+	 * scan last internal page level to get target leaf page
+	 */
+      b:
+	/* unpin leftmost leaf page */
+	DT_PUTPAGE(mp);
+
+	/* get left most parent page */
+	btsp = btstack->top;
+	parent = btsp - 1;
+	bn = parent->bn;
+	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+	if (rc)
+		return rc;
+
+	/* scan parent pages at last internal page level */
+	while (pn >= p->header.nextindex) {
+		pn -= p->header.nextindex;
+
+		/* get next parent page address */
+		bn = le64_to_cpu(p->header.next);
+
+		/* unpin current parent page */
+		DT_PUTPAGE(mp);
+
+		/* offset beyond eof ? */
+		if (bn == 0) {
+			bn = -1;
+			goto out;
+		}
+
+		/* get next parent page */
+		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/* update parent page stack frame */
+		parent->bn = bn;
+	}
+
+	/* get leaf page address */
+	stbl = DT_GETSTBL(p);
+	xd = (pxd_t *) & p->slot[stbl[pn]];
+	bn = addressPXD(xd);
+
+	/* unpin parent page */
+	DT_PUTPAGE(mp);
+
+	/*
+	 * get target leaf page
+	 */
+      c:
+	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+	if (rc)
+		return rc;
+
+	/*
+	 * leaf page has been completed:
+	 * start with 1st entry of next leaf page
+	 */
+	if (index >= p->header.nextindex) {
+		bn = le64_to_cpu(p->header.next);
+
+		/* unpin leaf page */
+		DT_PUTPAGE(mp);
+
+		/* offset beyond eof ? */
+		if (bn == 0) {
+			bn = -1;
+			goto out;
+		}
+
+		/* get next leaf page */
+		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/* start with 1st entry of next leaf page */
+		dtoffset->pn++;
+		dtoffset->index = 0;
+	}
+
+      out:
+	/* return target leaf page pinned */
+	btsp = btstack->top;
+	btsp->bn = bn;
+	btsp->index = dtoffset->index;
+	btsp->mp = mp;
+
+	return 0;
+}
+
+
+/*
+ *	dtCompare()
+ *
+ * function: compare search key with an internal entry
+ *
+ * return:
+ *	< 0 if k is < record
+ *	= 0 if k is = record
+ *	> 0 if k is > record
+ */
+static int dtCompare(struct component_name * key,	/* search key */
+		     dtpage_t * p,	/* directory page */
+		     int si)
+{				/* entry slot index */
+	wchar_t *kname;
+	__le16 *name;
+	int klen, namlen, len, rc;
+	struct idtentry *ih;
+	struct dtslot *t;
+
+	/*
+	 * force the left-most key on internal pages, at any level of
+	 * the tree, to be less than any search key.
+	 * this obviates having to update the leftmost key on an internal
+	 * page when the user inserts a new key in the tree smaller than
+	 * anything that has been stored.
+	 *
+	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
+	 * at any internal page at any level of the tree,
+	 * it descends to child of the entry anyway -
+	 * ? make the entry as min size dummy entry)
+	 *
+	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
+	 * return (1);
+	 */
+
+	kname = key->name;
+	klen = key->namlen;
+
+	ih = (struct idtentry *) & p->slot[si];
+	si = ih->next;
+	name = ih->name;
+	namlen = ih->namlen;
+	len = min(namlen, DTIHDRDATALEN);
+
+	/* compare with head/only segment */
+	len = min(klen, len);
+	if ((rc = UniStrncmp_le(kname, name, len)))
+		return rc;
+
+	klen -= len;
+	namlen -= len;
+
+	/* compare with additional segment(s) */
+	kname += len;
+	while (klen > 0 && namlen > 0) {
+		/* compare with next name segment */
+		t = (struct dtslot *) & p->slot[si];
+		len = min(namlen, DTSLOTDATALEN);
+		len = min(klen, len);
+		name = t->name;
+		if ((rc = UniStrncmp_le(kname, name, len)))
+			return rc;
+
+		klen -= len;
+		namlen -= len;
+		kname += len;
+		si = t->next;
+	}
+
+	return (klen - namlen);
+}
+
+
+
+
+/*
+ *	ciCompare()
+ *
+ * function: compare search key with an (leaf/internal) entry
+ *
+ * return:
+ *	< 0 if k is < record
+ *	= 0 if k is = record
+ *	> 0 if k is > record
+ */
+static int ciCompare(struct component_name * key,	/* search key */
+		     dtpage_t * p,	/* directory page */
+		     int si,	/* entry slot index */
+		     int flag)
+{
+	wchar_t *kname, x;
+	__le16 *name;
+	int klen, namlen, len, rc;
+	struct ldtentry *lh;
+	struct idtentry *ih;
+	struct dtslot *t;
+	int i;
+
+	/*
+	 * force the left-most key on internal pages, at any level of
+	 * the tree, to be less than any search key.
+	 * this obviates having to update the leftmost key on an internal
+	 * page when the user inserts a new key in the tree smaller than
+	 * anything that has been stored.
+	 *
+	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
+	 * at any internal page at any level of the tree,
+	 * it descends to child of the entry anyway -
+	 * ? make the entry as min size dummy entry)
+	 *
+	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
+	 * return (1);
+	 */
+
+	kname = key->name;
+	klen = key->namlen;
+
+	/*
+	 * leaf page entry
+	 */
+	if (p->header.flag & BT_LEAF) {
+		lh = (struct ldtentry *) & p->slot[si];
+		si = lh->next;
+		name = lh->name;
+		namlen = lh->namlen;
+		if (flag & JFS_DIR_INDEX)
+			len = min(namlen, DTLHDRDATALEN);
+		else
+			len = min(namlen, DTLHDRDATALEN_LEGACY);
+	}
+	/*
+	 * internal page entry
+	 */
+	else {
+		ih = (struct idtentry *) & p->slot[si];
+		si = ih->next;
+		name = ih->name;
+		namlen = ih->namlen;
+		len = min(namlen, DTIHDRDATALEN);
+	}
+
+	/* compare with head/only segment */
+	len = min(klen, len);
+	for (i = 0; i < len; i++, kname++, name++) {
+		/* only uppercase if case-insensitive support is on */
+		if ((flag & JFS_OS2) == JFS_OS2)
+			x = UniToupper(le16_to_cpu(*name));
+		else
+			x = le16_to_cpu(*name);
+		if ((rc = *kname - x))
+			return rc;
+	}
+
+	klen -= len;
+	namlen -= len;
+
+	/* compare with additional segment(s) */
+	while (klen > 0 && namlen > 0) {
+		/* compare with next name segment */
+		t = (struct dtslot *) & p->slot[si];
+		len = min(namlen, DTSLOTDATALEN);
+		len = min(klen, len);
+		name = t->name;
+		for (i = 0; i < len; i++, kname++, name++) {
+			/* only uppercase if case-insensitive support is on */
+			if ((flag & JFS_OS2) == JFS_OS2)
+				x = UniToupper(le16_to_cpu(*name));
+			else
+				x = le16_to_cpu(*name);
+
+			if ((rc = *kname - x))
+				return rc;
+		}
+
+		klen -= len;
+		namlen -= len;
+		si = t->next;
+	}
+
+	return (klen - namlen);
+}
+
+
+/*
+ *	ciGetLeafPrefixKey()
+ *
+ * function: compute prefix of suffix compression
+ *	     from two adjacent leaf entries
+ *	     across page boundary
+ *
+ * return: non-zero on error
+ *
+ */
+static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
+			       int ri, struct component_name * key, int flag)
+{
+	int klen, namlen;
+	wchar_t *pl, *pr, *kname;
+	struct component_name lkey;
+	struct component_name rkey;
+
+	lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
+					GFP_KERNEL);
+	if (lkey.name == NULL)
+		return -ENOMEM;
+
+	rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
+					GFP_KERNEL);
+	if (rkey.name == NULL) {
+		kfree(lkey.name);
+		return -ENOMEM;
+	}
+
+	/* get left and right key */
+	dtGetKey(lp, li, &lkey, flag);
+	lkey.name[lkey.namlen] = 0;
+
+	if ((flag & JFS_OS2) == JFS_OS2)
+		ciToUpper(&lkey);
+
+	dtGetKey(rp, ri, &rkey, flag);
+	rkey.name[rkey.namlen] = 0;
+
+
+	if ((flag & JFS_OS2) == JFS_OS2)
+		ciToUpper(&rkey);
+
+	/* compute prefix */
+	klen = 0;
+	kname = key->name;
+	namlen = min(lkey.namlen, rkey.namlen);
+	for (pl = lkey.name, pr = rkey.name;
+	     namlen; pl++, pr++, namlen--, klen++, kname++) {
+		*kname = *pr;
+		if (*pl != *pr) {
+			key->namlen = klen + 1;
+			goto free_names;
+		}
+	}
+
+	/* l->namlen <= r->namlen since l <= r */
+	if (lkey.namlen < rkey.namlen) {
+		*kname = *pr;
+		key->namlen = klen + 1;
+	} else			/* l->namelen == r->namelen */
+		key->namlen = klen;
+
+free_names:
+	kfree(lkey.name);
+	kfree(rkey.name);
+	return 0;
+}
+
+
+
+/*
+ *	dtGetKey()
+ *
+ * function: get key of the entry
+ */
+static void dtGetKey(dtpage_t * p, int i,	/* entry index */
+		     struct component_name * key, int flag)
+{
+	int si;
+	s8 *stbl;
+	struct ldtentry *lh;
+	struct idtentry *ih;
+	struct dtslot *t;
+	int namlen, len;
+	wchar_t *kname;
+	__le16 *name;
+
+	/* get entry */
+	stbl = DT_GETSTBL(p);
+	si = stbl[i];
+	if (p->header.flag & BT_LEAF) {
+		lh = (struct ldtentry *) & p->slot[si];
+		si = lh->next;
+		namlen = lh->namlen;
+		name = lh->name;
+		if (flag & JFS_DIR_INDEX)
+			len = min(namlen, DTLHDRDATALEN);
+		else
+			len = min(namlen, DTLHDRDATALEN_LEGACY);
+	} else {
+		ih = (struct idtentry *) & p->slot[si];
+		si = ih->next;
+		namlen = ih->namlen;
+		name = ih->name;
+		len = min(namlen, DTIHDRDATALEN);
+	}
+
+	key->namlen = namlen;
+	kname = key->name;
+
+	/*
+	 * move head/only segment
+	 */
+	UniStrncpy_from_le(kname, name, len);
+
+	/*
+	 * move additional segment(s)
+	 */
+	while (si >= 0) {
+		/* get next segment */
+		t = &p->slot[si];
+		kname += len;
+		namlen -= len;
+		len = min(namlen, DTSLOTDATALEN);
+		UniStrncpy_from_le(kname, t->name, len);
+
+		si = t->next;
+	}
+}
+
+
+/*
+ *	dtInsertEntry()
+ *
+ * function: allocate free slot(s) and
+ *	     write a leaf/internal entry
+ *
+ * return: entry slot index
+ */
+static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
+			  ddata_t * data, struct dt_lock ** dtlock)
+{
+	struct dtslot *h, *t;
+	struct ldtentry *lh = NULL;
+	struct idtentry *ih = NULL;
+	int hsi, fsi, klen, len, nextindex;
+	wchar_t *kname;
+	__le16 *name;
+	s8 *stbl;
+	pxd_t *xd;
+	struct dt_lock *dtlck = *dtlock;
+	struct lv *lv;
+	int xsi, n;
+	s64 bn = 0;
+	struct metapage *mp = NULL;
+
+	klen = key->namlen;
+	kname = key->name;
+
+	/* allocate a free slot */
+	hsi = fsi = p->header.freelist;
+	h = &p->slot[fsi];
+	p->header.freelist = h->next;
+	--p->header.freecnt;
+
+	/* open new linelock */
+	if (dtlck->index >= dtlck->maxcnt)
+		dtlck = (struct dt_lock *) txLinelock(dtlck);
+
+	lv = & dtlck->lv[dtlck->index];
+	lv->offset = hsi;
+
+	/* write head/only segment */
+	if (p->header.flag & BT_LEAF) {
+		lh = (struct ldtentry *) h;
+		lh->next = h->next;
+		lh->inumber = cpu_to_le32(data->leaf.ino);
+		lh->namlen = klen;
+		name = lh->name;
+		if (data->leaf.ip) {
+			len = min(klen, DTLHDRDATALEN);
+			if (!(p->header.flag & BT_ROOT))
+				bn = addressPXD(&p->header.self);
+			lh->index = cpu_to_le32(add_index(data->leaf.tid,
+							  data->leaf.ip,
+							  bn, index));
+		} else
+			len = min(klen, DTLHDRDATALEN_LEGACY);
+	} else {
+		ih = (struct idtentry *) h;
+		ih->next = h->next;
+		xd = (pxd_t *) ih;
+		*xd = data->xd;
+		ih->namlen = klen;
+		name = ih->name;
+		len = min(klen, DTIHDRDATALEN);
+	}
+
+	UniStrncpy_to_le(name, kname, len);
+
+	n = 1;
+	xsi = hsi;
+
+	/* write additional segment(s) */
+	t = h;
+	klen -= len;
+	while (klen) {
+		/* get free slot */
+		fsi = p->header.freelist;
+		t = &p->slot[fsi];
+		p->header.freelist = t->next;
+		--p->header.freecnt;
+
+		/* is next slot contiguous ? */
+		if (fsi != xsi + 1) {
+			/* close current linelock */
+			lv->length = n;
+			dtlck->index++;
+
+			/* open new linelock */
+			if (dtlck->index < dtlck->maxcnt)
+				lv++;
+			else {
+				dtlck = (struct dt_lock *) txLinelock(dtlck);
+				lv = & dtlck->lv[0];
+			}
+
+			lv->offset = fsi;
+			n = 0;
+		}
+
+		kname += len;
+		len = min(klen, DTSLOTDATALEN);
+		UniStrncpy_to_le(t->name, kname, len);
+
+		n++;
+		xsi = fsi;
+		klen -= len;
+	}
+
+	/* close current linelock */
+	lv->length = n;
+	dtlck->index++;
+
+	*dtlock = dtlck;
+
+	/* terminate last/only segment */
+	if (h == t) {
+		/* single segment entry */
+		if (p->header.flag & BT_LEAF)
+			lh->next = -1;
+		else
+			ih->next = -1;
+	} else
+		/* multi-segment entry */
+		t->next = -1;
+
+	/* if insert into middle, shift right succeeding entries in stbl */
+	stbl = DT_GETSTBL(p);
+	nextindex = p->header.nextindex;
+	if (index < nextindex) {
+		memmove(stbl + index + 1, stbl + index, nextindex - index);
+
+		if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
+			s64 lblock;
+
+			/*
+			 * Need to update slot number for entries that moved
+			 * in the stbl
+			 */
+			mp = NULL;
+			for (n = index + 1; n <= nextindex; n++) {
+				lh = (struct ldtentry *) & (p->slot[stbl[n]]);
+				modify_index(data->leaf.tid, data->leaf.ip,
+					     le32_to_cpu(lh->index), bn, n,
+					     &mp, &lblock);
+			}
+			if (mp)
+				release_metapage(mp);
+		}
+	}
+
+	stbl[index] = hsi;
+
+	/* advance next available entry index of stbl */
+	++p->header.nextindex;
+}
+
+
+/*
+ *	dtMoveEntry()
+ *
+ * function: move entries from split/left page to new/right page
+ *
+ *	nextindex of dst page and freelist/freecnt of both pages
+ *	are updated.
+ */
+static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
+			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
+			int do_index)
+{
+	int ssi, next;		/* src slot index */
+	int di;			/* dst entry index */
+	int dsi;		/* dst slot index */
+	s8 *sstbl, *dstbl;	/* sorted entry table */
+	int snamlen, len;
+	struct ldtentry *slh, *dlh = NULL;
+	struct idtentry *sih, *dih = NULL;
+	struct dtslot *h, *s, *d;
+	struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
+	struct lv *slv, *dlv;
+	int xssi, ns, nd;
+	int sfsi;
+
+	sstbl = (s8 *) & sp->slot[sp->header.stblindex];
+	dstbl = (s8 *) & dp->slot[dp->header.stblindex];
+
+	dsi = dp->header.freelist;	/* first (whole page) free slot */
+	sfsi = sp->header.freelist;
+
+	/* linelock destination entry slot */
+	dlv = & ddtlck->lv[ddtlck->index];
+	dlv->offset = dsi;
+
+	/* linelock source entry slot */
+	slv = & sdtlck->lv[sdtlck->index];
+	slv->offset = sstbl[si];
+	xssi = slv->offset - 1;
+
+	/*
+	 * move entries
+	 */
+	ns = nd = 0;
+	for (di = 0; si < sp->header.nextindex; si++, di++) {
+		ssi = sstbl[si];
+		dstbl[di] = dsi;
+
+		/* is next slot contiguous ? */
+		if (ssi != xssi + 1) {
+			/* close current linelock */
+			slv->length = ns;
+			sdtlck->index++;
+
+			/* open new linelock */
+			if (sdtlck->index < sdtlck->maxcnt)
+				slv++;
+			else {
+				sdtlck = (struct dt_lock *) txLinelock(sdtlck);
+				slv = & sdtlck->lv[0];
+			}
+
+			slv->offset = ssi;
+			ns = 0;
+		}
+
+		/*
+		 * move head/only segment of an entry
+		 */
+		/* get dst slot */
+		h = d = &dp->slot[dsi];
+
+		/* get src slot and move */
+		s = &sp->slot[ssi];
+		if (sp->header.flag & BT_LEAF) {
+			/* get source entry */
+			slh = (struct ldtentry *) s;
+			dlh = (struct ldtentry *) h;
+			snamlen = slh->namlen;
+
+			if (do_index) {
+				len = min(snamlen, DTLHDRDATALEN);
+				dlh->index = slh->index; /* little-endian */
+			} else
+				len = min(snamlen, DTLHDRDATALEN_LEGACY);
+
+			memcpy(dlh, slh, 6 + len * 2);
+
+			next = slh->next;
+
+			/* update dst head/only segment next field */
+			dsi++;
+			dlh->next = dsi;
+		} else {
+			sih = (struct idtentry *) s;
+			snamlen = sih->namlen;
+
+			len = min(snamlen, DTIHDRDATALEN);
+			dih = (struct idtentry *) h;
+			memcpy(dih, sih, 10 + len * 2);
+			next = sih->next;
+
+			dsi++;
+			dih->next = dsi;
+		}
+
+		/* free src head/only segment */
+		s->next = sfsi;
+		s->cnt = 1;
+		sfsi = ssi;
+
+		ns++;
+		nd++;
+		xssi = ssi;
+
+		/*
+		 * move additional segment(s) of the entry
+		 */
+		snamlen -= len;
+		while ((ssi = next) >= 0) {
+			/* is next slot contiguous ? */
+			if (ssi != xssi + 1) {
+				/* close current linelock */
+				slv->length = ns;
+				sdtlck->index++;
+
+				/* open new linelock */
+				if (sdtlck->index < sdtlck->maxcnt)
+					slv++;
+				else {
+					sdtlck =
+					    (struct dt_lock *)
+					    txLinelock(sdtlck);
+					slv = & sdtlck->lv[0];
+				}
+
+				slv->offset = ssi;
+				ns = 0;
+			}
+
+			/* get next source segment */
+			s = &sp->slot[ssi];
+
+			/* get next destination free slot */
+			d++;
+
+			len = min(snamlen, DTSLOTDATALEN);
+			UniStrncpy_le(d->name, s->name, len);
+
+			ns++;
+			nd++;
+			xssi = ssi;
+
+			dsi++;
+			d->next = dsi;
+
+			/* free source segment */
+			next = s->next;
+			s->next = sfsi;
+			s->cnt = 1;
+			sfsi = ssi;
+
+			snamlen -= len;
+		}		/* end while */
+
+		/* terminate dst last/only segment */
+		if (h == d) {
+			/* single segment entry */
+			if (dp->header.flag & BT_LEAF)
+				dlh->next = -1;
+			else
+				dih->next = -1;
+		} else
+			/* multi-segment entry */
+			d->next = -1;
+	}			/* end for */
+
+	/* close current linelock */
+	slv->length = ns;
+	sdtlck->index++;
+	*sdtlock = sdtlck;
+
+	dlv->length = nd;
+	ddtlck->index++;
+	*ddtlock = ddtlck;
+
+	/* update source header */
+	sp->header.freelist = sfsi;
+	sp->header.freecnt += nd;
+
+	/* update destination header */
+	dp->header.nextindex = di;
+
+	dp->header.freelist = dsi;
+	dp->header.freecnt -= nd;
+}
+
+
+/*
+ *	dtDeleteEntry()
+ *
+ * function: free a (leaf/internal) entry
+ *
+ * log freelist header, stbl, and each segment slot of entry
+ * (even though last/only segment next field is modified,
+ * physical image logging requires all segment slots of
+ * the entry logged to avoid applying previous updates
+ * to the same slots)
+ */
+static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
+{
+	int fsi;		/* free entry slot index */
+	s8 *stbl;
+	struct dtslot *t;
+	int si, freecnt;
+	struct dt_lock *dtlck = *dtlock;
+	struct lv *lv;
+	int xsi, n;
+
+	/* get free entry slot index */
+	stbl = DT_GETSTBL(p);
+	fsi = stbl[fi];
+
+	/* open new linelock */
+	if (dtlck->index >= dtlck->maxcnt)
+		dtlck = (struct dt_lock *) txLinelock(dtlck);
+	lv = & dtlck->lv[dtlck->index];
+
+	lv->offset = fsi;
+
+	/* get the head/only segment */
+	t = &p->slot[fsi];
+	if (p->header.flag & BT_LEAF)
+		si = ((struct ldtentry *) t)->next;
+	else
+		si = ((struct idtentry *) t)->next;
+	t->next = si;
+	t->cnt = 1;
+
+	n = freecnt = 1;
+	xsi = fsi;
+
+	/* find the last/only segment */
+	while (si >= 0) {
+		/* is next slot contiguous ? */
+		if (si != xsi + 1) {
+			/* close current linelock */
+			lv->length = n;
+			dtlck->index++;
+
+			/* open new linelock */
+			if (dtlck->index < dtlck->maxcnt)
+				lv++;
+			else {
+				dtlck = (struct dt_lock *) txLinelock(dtlck);
+				lv = & dtlck->lv[0];
+			}
+
+			lv->offset = si;
+			n = 0;
+		}
+
+		n++;
+		xsi = si;
+		freecnt++;
+
+		t = &p->slot[si];
+		t->cnt = 1;
+		si = t->next;
+	}
+
+	/* close current linelock */
+	lv->length = n;
+	dtlck->index++;
+
+	*dtlock = dtlck;
+
+	/* update freelist */
+	t->next = p->header.freelist;
+	p->header.freelist = fsi;
+	p->header.freecnt += freecnt;
+
+	/* if delete from middle,
+	 * shift left the succedding entries in the stbl
+	 */
+	si = p->header.nextindex;
+	if (fi < si - 1)
+		memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
+
+	p->header.nextindex--;
+}
+
+
+/*
+ *	dtTruncateEntry()
+ *
+ * function: truncate a (leaf/internal) entry
+ *
+ * log freelist header, stbl, and each segment slot of entry
+ * (even though last/only segment next field is modified,
+ * physical image logging requires all segment slots of
+ * the entry logged to avoid applying previous updates
+ * to the same slots)
+ */
+static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
+{
+	int tsi;		/* truncate entry slot index */
+	s8 *stbl;
+	struct dtslot *t;
+	int si, freecnt;
+	struct dt_lock *dtlck = *dtlock;
+	struct lv *lv;
+	int fsi, xsi, n;
+
+	/* get free entry slot index */
+	stbl = DT_GETSTBL(p);
+	tsi = stbl[ti];
+
+	/* open new linelock */
+	if (dtlck->index >= dtlck->maxcnt)
+		dtlck = (struct dt_lock *) txLinelock(dtlck);
+	lv = & dtlck->lv[dtlck->index];
+
+	lv->offset = tsi;
+
+	/* get the head/only segment */
+	t = &p->slot[tsi];
+	ASSERT(p->header.flag & BT_INTERNAL);
+	((struct idtentry *) t)->namlen = 0;
+	si = ((struct idtentry *) t)->next;
+	((struct idtentry *) t)->next = -1;
+
+	n = 1;
+	freecnt = 0;
+	fsi = si;
+	xsi = tsi;
+
+	/* find the last/only segment */
+	while (si >= 0) {
+		/* is next slot contiguous ? */
+		if (si != xsi + 1) {
+			/* close current linelock */
+			lv->length = n;
+			dtlck->index++;
+
+			/* open new linelock */
+			if (dtlck->index < dtlck->maxcnt)
+				lv++;
+			else {
+				dtlck = (struct dt_lock *) txLinelock(dtlck);
+				lv = & dtlck->lv[0];
+			}
+
+			lv->offset = si;
+			n = 0;
+		}
+
+		n++;
+		xsi = si;
+		freecnt++;
+
+		t = &p->slot[si];
+		t->cnt = 1;
+		si = t->next;
+	}
+
+	/* close current linelock */
+	lv->length = n;
+	dtlck->index++;
+
+	*dtlock = dtlck;
+
+	/* update freelist */
+	if (freecnt == 0)
+		return;
+	t->next = p->header.freelist;
+	p->header.freelist = fsi;
+	p->header.freecnt += freecnt;
+}
+
+
+/*
+ *	dtLinelockFreelist()
+ */
+static void dtLinelockFreelist(dtpage_t * p,	/* directory page */
+			       int m,	/* max slot index */
+			       struct dt_lock ** dtlock)
+{
+	int fsi;		/* free entry slot index */
+	struct dtslot *t;
+	int si;
+	struct dt_lock *dtlck = *dtlock;
+	struct lv *lv;
+	int xsi, n;
+
+	/* get free entry slot index */
+	fsi = p->header.freelist;
+
+	/* open new linelock */
+	if (dtlck->index >= dtlck->maxcnt)
+		dtlck = (struct dt_lock *) txLinelock(dtlck);
+	lv = & dtlck->lv[dtlck->index];
+
+	lv->offset = fsi;
+
+	n = 1;
+	xsi = fsi;
+
+	t = &p->slot[fsi];
+	si = t->next;
+
+	/* find the last/only segment */
+	while (si < m && si >= 0) {
+		/* is next slot contiguous ? */
+		if (si != xsi + 1) {
+			/* close current linelock */
+			lv->length = n;
+			dtlck->index++;
+
+			/* open new linelock */
+			if (dtlck->index < dtlck->maxcnt)
+				lv++;
+			else {
+				dtlck = (struct dt_lock *) txLinelock(dtlck);
+				lv = & dtlck->lv[0];
+			}
+
+			lv->offset = si;
+			n = 0;
+		}
+
+		n++;
+		xsi = si;
+
+		t = &p->slot[si];
+		si = t->next;
+	}
+
+	/* close current linelock */
+	lv->length = n;
+	dtlck->index++;
+
+	*dtlock = dtlck;
+}
+
+
+/*
+ * NAME: dtModify
+ *
+ * FUNCTION: Modify the inode number part of a directory entry
+ *
+ * PARAMETERS:
+ *	tid	- Transaction id
+ *	ip	- Inode of parent directory
+ *	key	- Name of entry to be modified
+ *	orig_ino	- Original inode number expected in entry
+ *	new_ino	- New inode number to put into entry
+ *	flag	- JFS_RENAME
+ *
+ * RETURNS:
+ *	-ESTALE	- If entry found does not match orig_ino passed in
+ *	-ENOENT	- If no entry can be found to match key
+ *	0	- If successfully modified entry
+ */
+int dtModify(tid_t tid, struct inode *ip,
+	 struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
+{
+	int rc;
+	s64 bn;
+	struct metapage *mp;
+	dtpage_t *p;
+	int index;
+	struct btstack btstack;
+	struct tlock *tlck;
+	struct dt_lock *dtlck;
+	struct lv *lv;
+	s8 *stbl;
+	int entry_si;		/* entry slot index */
+	struct ldtentry *entry;
+
+	/*
+	 *	search for the entry to modify:
+	 *
+	 * dtSearch() returns (leaf page pinned, index at which to modify).
+	 */
+	if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag)))
+		return rc;
+
+	/* retrieve search result */
+	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+	BT_MARK_DIRTY(mp, ip);
+	/*
+	 * acquire a transaction lock on the leaf page of named entry
+	 */
+	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+	dtlck = (struct dt_lock *) & tlck->lock;
+
+	/* get slot index of the entry */
+	stbl = DT_GETSTBL(p);
+	entry_si = stbl[index];
+
+	/* linelock entry */
+	ASSERT(dtlck->index == 0);
+	lv = & dtlck->lv[0];
+	lv->offset = entry_si;
+	lv->length = 1;
+	dtlck->index++;
+
+	/* get the head/only segment */
+	entry = (struct ldtentry *) & p->slot[entry_si];
+
+	/* substitute the inode number of the entry */
+	entry->inumber = cpu_to_le32(new_ino);
+
+	/* unpin the leaf page */
+	DT_PUTPAGE(mp);
+
+	return 0;
+}
diff --git a/kernel/fs/jfs/jfs_dtree.h b/kernel/fs/jfs/jfs_dtree.h
new file mode 100644
index 000000000..fd4169e6e
--- /dev/null
+++ b/kernel/fs/jfs/jfs_dtree.h
@@ -0,0 +1,269 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_DTREE
+#define	_H_JFS_DTREE
+
+/*
+ *	jfs_dtree.h: directory B+-tree manager
+ */
+
+#include "jfs_btree.h"
+
+typedef union {
+	struct {
+		tid_t tid;
+		struct inode *ip;
+		u32 ino;
+	} leaf;
+	pxd_t xd;
+} ddata_t;
+
+
+/*
+ *	entry segment/slot
+ *
+ * an entry consists of type dependent head/only segment/slot and
+ * additional segments/slots linked vi next field;
+ * N.B. last/only segment of entry is terminated by next = -1;
+ */
+/*
+ *	directory page slot
+ */
+struct dtslot {
+	s8 next;		/* 1: */
+	s8 cnt;			/* 1: */
+	__le16 name[15];	/* 30: */
+};				/* (32) */
+
+
+#define DATASLOTSIZE	16
+#define L2DATASLOTSIZE	4
+#define	DTSLOTSIZE	32
+#define	L2DTSLOTSIZE	5
+#define DTSLOTHDRSIZE	2
+#define DTSLOTDATASIZE	30
+#define DTSLOTDATALEN	15
+
+/*
+ *	 internal node entry head/only segment
+ */
+struct idtentry {
+	pxd_t xd;		/* 8: child extent descriptor */
+
+	s8 next;		/* 1: */
+	u8 namlen;		/* 1: */
+	__le16 name[11];	/* 22: 2-byte aligned */
+};				/* (32) */
+
+#define DTIHDRSIZE	10
+#define DTIHDRDATALEN	11
+
+/* compute number of slots for entry */
+#define	NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15))
+
+
+/*
+ *	leaf node entry head/only segment
+ *
+ *	For legacy filesystems, name contains 13 wchars -- no index field
+ */
+struct ldtentry {
+	__le32 inumber;		/* 4: 4-byte aligned */
+	s8 next;		/* 1: */
+	u8 namlen;		/* 1: */
+	__le16 name[11];	/* 22: 2-byte aligned */
+	__le32 index;		/* 4: index into dir_table */
+};				/* (32) */
+
+#define DTLHDRSIZE	6
+#define DTLHDRDATALEN_LEGACY	13	/* Old (OS/2) format */
+#define DTLHDRDATALEN	11
+
+/*
+ * dir_table used for directory traversal during readdir
+ */
+
+/*
+ * Keep persistent index for directory entries
+ */
+#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX)
+
+/*
+ * Maximum entry in inline directory table
+ */
+#define MAX_INLINE_DIRTABLE_ENTRY 13
+
+struct dir_table_slot {
+	u8 rsrvd;		/* 1: */
+	u8 flag;		/* 1: 0 if free */
+	u8 slot;		/* 1: slot within leaf page of entry */
+	u8 addr1;		/* 1: upper 8 bits of leaf page address */
+	__le32 addr2;		/* 4: lower 32 bits of leaf page address -OR-
+				   index of next entry when this entry was deleted */
+};				/* (8) */
+
+/*
+ * flag values
+ */
+#define DIR_INDEX_VALID 1
+#define DIR_INDEX_FREE 0
+
+#define DTSaddress(dir_table_slot, address64)\
+{\
+	(dir_table_slot)->addr1 = ((u64)address64) >> 32;\
+	(dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
+}
+
+#define addressDTS(dts)\
+	( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
+
+/* compute number of slots for entry */
+#define	NDTLEAF_LEGACY(klen)	(DIV_ROUND_UP((2 + (klen)), 15))
+#define	NDTLEAF	NDTINTERNAL
+
+
+/*
+ *	directory root page (in-line in on-disk inode):
+ *
+ * cf. dtpage_t below.
+ */
+typedef union {
+	struct {
+		struct dasd DASD; /* 16: DASD limit/usage info */
+
+		u8 flag;	/* 1: */
+		u8 nextindex;	/* 1: next free entry in stbl */
+		s8 freecnt;	/* 1: free count */
+		s8 freelist;	/* 1: freelist header */
+
+		__le32 idotdot;	/* 4: parent inode number */
+
+		s8 stbl[8];	/* 8: sorted entry index table */
+	} header;		/* (32) */
+
+	struct dtslot slot[9];
+} dtroot_t;
+
+#define PARENT(IP) \
+	(le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot))
+
+#define DTROOTMAXSLOT	9
+
+#define	dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0)
+
+
+/*
+ *	directory regular page:
+ *
+ *	entry slot array of 32 byte slot
+ *
+ * sorted entry slot index table (stbl):
+ * contiguous slots at slot specified by stblindex,
+ * 1-byte per entry
+ *   512 byte block:  16 entry tbl (1 slot)
+ *  1024 byte block:  32 entry tbl (1 slot)
+ *  2048 byte block:  64 entry tbl (2 slot)
+ *  4096 byte block: 128 entry tbl (4 slot)
+ *
+ * data area:
+ *   512 byte block:  16 - 2 =  14 slot
+ *  1024 byte block:  32 - 2 =  30 slot
+ *  2048 byte block:  64 - 3 =  61 slot
+ *  4096 byte block: 128 - 5 = 123 slot
+ *
+ * N.B. index is 0-based; index fields refer to slot index
+ * except nextindex which refers to entry index in stbl;
+ * end of entry stot list or freelist is marked with -1.
+ */
+typedef union {
+	struct {
+		__le64 next;	/* 8: next sibling */
+		__le64 prev;	/* 8: previous sibling */
+
+		u8 flag;	/* 1: */
+		u8 nextindex;	/* 1: next entry index in stbl */
+		s8 freecnt;	/* 1: */
+		s8 freelist;	/* 1: slot index of head of freelist */
+
+		u8 maxslot;	/* 1: number of slots in page slot[] */
+		u8 stblindex;	/* 1: slot index of start of stbl */
+		u8 rsrvd[2];	/* 2: */
+
+		pxd_t self;	/* 8: self pxd */
+	} header;		/* (32) */
+
+	struct dtslot slot[128];
+} dtpage_t;
+
+#define DTPAGEMAXSLOT        128
+
+#define DT8THPGNODEBYTES     512
+#define DT8THPGNODETSLOTS      1
+#define DT8THPGNODESLOTS      16
+
+#define DTQTRPGNODEBYTES    1024
+#define DTQTRPGNODETSLOTS      1
+#define DTQTRPGNODESLOTS      32
+
+#define DTHALFPGNODEBYTES   2048
+#define DTHALFPGNODETSLOTS     2
+#define DTHALFPGNODESLOTS     64
+
+#define DTFULLPGNODEBYTES   4096
+#define DTFULLPGNODETSLOTS     4
+#define DTFULLPGNODESLOTS    128
+
+#define DTENTRYSTART	1
+
+/* get sorted entry table of the page */
+#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\
+	((dtroot_t *)(p))->header.stbl : \
+	(s8 *)&(p)->slot[(p)->header.stblindex] )
+
+/*
+ * Flags for dtSearch
+ */
+#define JFS_CREATE 1
+#define JFS_LOOKUP 2
+#define JFS_REMOVE 3
+#define JFS_RENAME 4
+
+/*
+ * Maximum file offset for directories.
+ */
+#define DIREND	INT_MAX
+
+/*
+ *	external declarations
+ */
+extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot);
+
+extern int dtSearch(struct inode *ip, struct component_name * key,
+		    ino_t * data, struct btstack * btstack, int flag);
+
+extern int dtInsert(tid_t tid, struct inode *ip, struct component_name * key,
+		    ino_t * ino, struct btstack * btstack);
+
+extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
+		    ino_t * data, int flag);
+
+extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
+		    ino_t * orig_ino, ino_t new_ino, int flag);
+
+extern int jfs_readdir(struct file *file, struct dir_context *ctx);
+#endif				/* !_H_JFS_DTREE */
diff --git a/kernel/fs/jfs/jfs_extent.c b/kernel/fs/jfs/jfs_extent.c
new file mode 100644
index 000000000..2ae7d59ab
--- /dev/null
+++ b/kernel/fs/jfs/jfs_extent.c
@@ -0,0 +1,651 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_extent.h"
+#include "jfs_debug.h"
+
+/*
+ * forward references
+ */
+static int extBalloc(struct inode *, s64, s64 *, s64 *);
+#ifdef _NOTYET
+static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
+#endif
+static s64 extRoundDown(s64 nb);
+
+#define DPD(a)		(printk("(a): %d\n",(a)))
+#define DPC(a)		(printk("(a): %c\n",(a)))
+#define DPL1(a)					\
+{						\
+	if ((a) >> 32)				\
+		printk("(a): %x%08x  ",(a));	\
+	else					\
+		printk("(a): %x  ",(a) << 32);	\
+}
+#define DPL(a)					\
+{						\
+	if ((a) >> 32)				\
+		printk("(a): %x%08x\n",(a));	\
+	else					\
+		printk("(a): %x\n",(a) << 32);	\
+}
+
+#define DPD1(a)		(printk("(a): %d  ",(a)))
+#define DPX(a)		(printk("(a): %08x\n",(a)))
+#define DPX1(a)		(printk("(a): %08x  ",(a)))
+#define DPS(a)		(printk("%s\n",(a)))
+#define DPE(a)		(printk("\nENTERING: %s\n",(a)))
+#define DPE1(a)		(printk("\nENTERING: %s",(a)))
+#define DPS1(a)		(printk("  %s  ",(a)))
+
+
+/*
+ * NAME:	extAlloc()
+ *
+ * FUNCTION:	allocate an extent for a specified page range within a
+ *		file.
+ *
+ * PARAMETERS:
+ *	ip	- the inode of the file.
+ *	xlen	- requested extent length.
+ *	pno	- the starting page number with the file.
+ *	xp	- pointer to an xad.  on entry, xad describes an
+ *		  extent that is used as an allocation hint if the
+ *		  xaddr of the xad is non-zero.  on successful exit,
+ *		  the xad describes the newly allocated extent.
+ *	abnr	- bool indicating whether the newly allocated extent
+ *		  should be marked as allocated but not recorded.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ *	-ENOSPC	- insufficient disk resources.
+ */
+int
+extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	s64 nxlen, nxaddr, xoff, hint, xaddr = 0;
+	int rc;
+	int xflag;
+
+	/* This blocks if we are low on resources */
+	txBeginAnon(ip->i_sb);
+
+	/* Avoid race with jfs_commit_inode() */
+	mutex_lock(&JFS_IP(ip)->commit_mutex);
+
+	/* validate extent length */
+	if (xlen > MAXXLEN)
+		xlen = MAXXLEN;
+
+	/* get the page's starting extent offset */
+	xoff = pno << sbi->l2nbperpage;
+
+	/* check if an allocation hint was provided */
+	if ((hint = addressXAD(xp))) {
+		/* get the size of the extent described by the hint */
+		nxlen = lengthXAD(xp);
+
+		/* check if the hint is for the portion of the file
+		 * immediately previous to the current allocation
+		 * request and if hint extent has the same abnr
+		 * value as the current request.  if so, we can
+		 * extend the hint extent to include the current
+		 * extent if we can allocate the blocks immediately
+		 * following the hint extent.
+		 */
+		if (offsetXAD(xp) + nxlen == xoff &&
+		    abnr == ((xp->flag & XAD_NOTRECORDED) ? true : false))
+			xaddr = hint + nxlen;
+
+		/* adjust the hint to the last block of the extent */
+		hint += (nxlen - 1);
+	}
+
+	/* allocate the disk blocks for the extent.  initially, extBalloc()
+	 * will try to allocate disk blocks for the requested size (xlen).
+	 * if this fails (xlen contiguous free blocks not available), it'll
+	 * try to allocate a smaller number of blocks (producing a smaller
+	 * extent), with this smaller number of blocks consisting of the
+	 * requested number of blocks rounded down to the next smaller
+	 * power of 2 number (i.e. 16 -> 8).  it'll continue to round down
+	 * and retry the allocation until the number of blocks to allocate
+	 * is smaller than the number of blocks per page.
+	 */
+	nxlen = xlen;
+	if ((rc = extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) {
+		mutex_unlock(&JFS_IP(ip)->commit_mutex);
+		return (rc);
+	}
+
+	/* Allocate blocks to quota. */
+	rc = dquot_alloc_block(ip, nxlen);
+	if (rc) {
+		dbFree(ip, nxaddr, (s64) nxlen);
+		mutex_unlock(&JFS_IP(ip)->commit_mutex);
+		return rc;
+	}
+
+	/* determine the value of the extent flag */
+	xflag = abnr ? XAD_NOTRECORDED : 0;
+
+	/* if we can extend the hint extent to cover the current request,
+	 * extend it.  otherwise, insert a new extent to
+	 * cover the current request.
+	 */
+	if (xaddr && xaddr == nxaddr)
+		rc = xtExtend(0, ip, xoff, (int) nxlen, 0);
+	else
+		rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0);
+
+	/* if the extend or insert failed,
+	 * free the newly allocated blocks and return the error.
+	 */
+	if (rc) {
+		dbFree(ip, nxaddr, nxlen);
+		dquot_free_block(ip, nxlen);
+		mutex_unlock(&JFS_IP(ip)->commit_mutex);
+		return (rc);
+	}
+
+	/* set the results of the extent allocation */
+	XADaddress(xp, nxaddr);
+	XADlength(xp, nxlen);
+	XADoffset(xp, xoff);
+	xp->flag = xflag;
+
+	mark_inode_dirty(ip);
+
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	/*
+	 * COMMIT_SyncList flags an anonymous tlock on page that is on
+	 * sync list.
+	 * We need to commit the inode to get the page written disk.
+	 */
+	if (test_and_clear_cflag(COMMIT_Synclist,ip))
+		jfs_commit_inode(ip, 0);
+
+	return (0);
+}
+
+
+#ifdef _NOTYET
+/*
+ * NAME:	extRealloc()
+ *
+ * FUNCTION:	extend the allocation of a file extent containing a
+ *		partial back last page.
+ *
+ * PARAMETERS:
+ *	ip	- the inode of the file.
+ *	cp	- cbuf for the partial backed last page.
+ *	xlen	- request size of the resulting extent.
+ *	xp	- pointer to an xad. on successful exit, the xad
+ *		  describes the newly allocated extent.
+ *	abnr	- bool indicating whether the newly allocated extent
+ *		  should be marked as allocated but not recorded.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ *	-ENOSPC	- insufficient disk resources.
+ */
+int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
+{
+	struct super_block *sb = ip->i_sb;
+	s64 xaddr, xlen, nxaddr, delta, xoff;
+	s64 ntail, nextend, ninsert;
+	int rc, nbperpage = JFS_SBI(sb)->nbperpage;
+	int xflag;
+
+	/* This blocks if we are low on resources */
+	txBeginAnon(ip->i_sb);
+
+	mutex_lock(&JFS_IP(ip)->commit_mutex);
+	/* validate extent length */
+	if (nxlen > MAXXLEN)
+		nxlen = MAXXLEN;
+
+	/* get the extend (partial) page's disk block address and
+	 * number of blocks.
+	 */
+	xaddr = addressXAD(xp);
+	xlen = lengthXAD(xp);
+	xoff = offsetXAD(xp);
+
+	/* if the extend page is abnr and if the request is for
+	 * the extent to be allocated and recorded,
+	 * make the page allocated and recorded.
+	 */
+	if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
+		xp->flag = 0;
+		if ((rc = xtUpdate(0, ip, xp)))
+			goto exit;
+	}
+
+	/* try to allocated the request number of blocks for the
+	 * extent.  dbRealloc() first tries to satisfy the request
+	 * by extending the allocation in place. otherwise, it will
+	 * try to allocate a new set of blocks large enough for the
+	 * request.  in satisfying a request, dbReAlloc() may allocate
+	 * less than what was request but will always allocate enough
+	 * space as to satisfy the extend page.
+	 */
+	if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
+		goto exit;
+
+	/* Allocat blocks to quota. */
+	rc = dquot_alloc_block(ip, nxlen);
+	if (rc) {
+		dbFree(ip, nxaddr, (s64) nxlen);
+		mutex_unlock(&JFS_IP(ip)->commit_mutex);
+		return rc;
+	}
+
+	delta = nxlen - xlen;
+
+	/* check if the extend page is not abnr but the request is abnr
+	 * and the allocated disk space is for more than one page.  if this
+	 * is the case, there is a miss match of abnr between the extend page
+	 * and the one or more pages following the extend page.  as a result,
+	 * two extents will have to be manipulated. the first will be that
+	 * of the extent of the extend page and will be manipulated thru
+	 * an xtExtend() or an xtTailgate(), depending upon whether the
+	 * disk allocation occurred as an inplace extension.  the second
+	 * extent will be manipulated (created) through an xtInsert() and
+	 * will be for the pages following the extend page.
+	 */
+	if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) {
+		ntail = nbperpage;
+		nextend = ntail - xlen;
+		ninsert = nxlen - nbperpage;
+
+		xflag = XAD_NOTRECORDED;
+	} else {
+		ntail = nxlen;
+		nextend = delta;
+		ninsert = 0;
+
+		xflag = xp->flag;
+	}
+
+	/* if we were able to extend the disk allocation in place,
+	 * extend the extent.  otherwise, move the extent to a
+	 * new disk location.
+	 */
+	if (xaddr == nxaddr) {
+		/* extend the extent */
+		if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
+			dbFree(ip, xaddr + xlen, delta);
+			dquot_free_block(ip, nxlen);
+			goto exit;
+		}
+	} else {
+		/*
+		 * move the extent to a new location:
+		 *
+		 * xtTailgate() accounts for relocated tail extent;
+		 */
+		if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
+			dbFree(ip, nxaddr, nxlen);
+			dquot_free_block(ip, nxlen);
+			goto exit;
+		}
+	}
+
+
+	/* check if we need to also insert a new extent */
+	if (ninsert) {
+		/* perform the insert.  if it fails, free the blocks
+		 * to be inserted and make it appear that we only did
+		 * the xtExtend() or xtTailgate() above.
+		 */
+		xaddr = nxaddr + ntail;
+		if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert,
+			      &xaddr, 0)) {
+			dbFree(ip, xaddr, (s64) ninsert);
+			delta = nextend;
+			nxlen = ntail;
+			xflag = 0;
+		}
+	}
+
+	/* set the return results */
+	XADaddress(xp, nxaddr);
+	XADlength(xp, nxlen);
+	XADoffset(xp, xoff);
+	xp->flag = xflag;
+
+	mark_inode_dirty(ip);
+exit:
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	return (rc);
+}
+#endif			/* _NOTYET */
+
+
+/*
+ * NAME:	extHint()
+ *
+ * FUNCTION:	produce an extent allocation hint for a file offset.
+ *
+ * PARAMETERS:
+ *	ip	- the inode of the file.
+ *	offset  - file offset for which the hint is needed.
+ *	xp	- pointer to the xad that is to be filled in with
+ *		  the hint.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ */
+int extHint(struct inode *ip, s64 offset, xad_t * xp)
+{
+	struct super_block *sb = ip->i_sb;
+	int nbperpage = JFS_SBI(sb)->nbperpage;
+	s64 prev;
+	int rc = 0;
+	s64 xaddr;
+	int xlen;
+	int xflag;
+
+	/* init the hint as "no hint provided" */
+	XADaddress(xp, 0);
+
+	/* determine the starting extent offset of the page previous
+	 * to the page containing the offset.
+	 */
+	prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage;
+
+	/* if the offset is in the first page of the file, no hint provided.
+	 */
+	if (prev < 0)
+		goto out;
+
+	rc = xtLookup(ip, prev, nbperpage, &xflag, &xaddr, &xlen, 0);
+
+	if ((rc == 0) && xlen) {
+		if (xlen != nbperpage) {
+			jfs_error(ip->i_sb, "corrupt xtree\n");
+			rc = -EIO;
+		}
+		XADaddress(xp, xaddr);
+		XADlength(xp, xlen);
+		XADoffset(xp, prev);
+		/*
+		 * only preserve the abnr flag within the xad flags
+		 * of the returned hint.
+		 */
+		xp->flag  = xflag & XAD_NOTRECORDED;
+	} else
+		rc = 0;
+
+out:
+	return (rc);
+}
+
+
+/*
+ * NAME:	extRecord()
+ *
+ * FUNCTION:	change a page with a file from not recorded to recorded.
+ *
+ * PARAMETERS:
+ *	ip	- inode of the file.
+ *	cp	- cbuf of the file page.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ *	-ENOSPC	- insufficient disk resources.
+ */
+int extRecord(struct inode *ip, xad_t * xp)
+{
+	int rc;
+
+	txBeginAnon(ip->i_sb);
+
+	mutex_lock(&JFS_IP(ip)->commit_mutex);
+
+	/* update the extent */
+	rc = xtUpdate(0, ip, xp);
+
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	return rc;
+}
+
+
+#ifdef _NOTYET
+/*
+ * NAME:	extFill()
+ *
+ * FUNCTION:	allocate disk space for a file page that represents
+ *		a file hole.
+ *
+ * PARAMETERS:
+ *	ip	- the inode of the file.
+ *	cp	- cbuf of the file page represent the hole.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ *	-ENOSPC	- insufficient disk resources.
+ */
+int extFill(struct inode *ip, xad_t * xp)
+{
+	int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
+	s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
+
+//	assert(ISSPARSE(ip));
+
+	/* initialize the extent allocation hint */
+	XADaddress(xp, 0);
+
+	/* allocate an extent to fill the hole */
+	if ((rc = extAlloc(ip, nbperpage, blkno, xp, false)))
+		return (rc);
+
+	assert(lengthPXD(xp) == nbperpage);
+
+	return (0);
+}
+#endif			/* _NOTYET */
+
+
+/*
+ * NAME:	extBalloc()
+ *
+ * FUNCTION:	allocate disk blocks to form an extent.
+ *
+ *		initially, we will try to allocate disk blocks for the
+ *		requested size (nblocks).  if this fails (nblocks
+ *		contiguous free blocks not available), we'll try to allocate
+ *		a smaller number of blocks (producing a smaller extent), with
+ *		this smaller number of blocks consisting of the requested
+ *		number of blocks rounded down to the next smaller power of 2
+ *		number (i.e. 16 -> 8).  we'll continue to round down and
+ *		retry the allocation until the number of blocks to allocate
+ *		is smaller than the number of blocks per page.
+ *
+ * PARAMETERS:
+ *	ip	 - the inode of the file.
+ *	hint	 - disk block number to be used as an allocation hint.
+ *	*nblocks - pointer to an s64 value.  on entry, this value specifies
+ *		   the desired number of block to be allocated. on successful
+ *		   exit, this value is set to the number of blocks actually
+ *		   allocated.
+ *	blkno	 - pointer to a block address that is filled in on successful
+ *		   return with the starting block number of the newly
+ *		   allocated block range.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ *	-ENOSPC	- insufficient disk resources.
+ */
+static int
+extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
+{
+	struct jfs_inode_info *ji = JFS_IP(ip);
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	s64 nb, nblks, daddr, max;
+	int rc, nbperpage = sbi->nbperpage;
+	struct bmap *bmp = sbi->bmap;
+	int ag;
+
+	/* get the number of blocks to initially attempt to allocate.
+	 * we'll first try the number of blocks requested unless this
+	 * number is greater than the maximum number of contiguous free
+	 * blocks in the map. in that case, we'll start off with the
+	 * maximum free.
+	 */
+	max = (s64) 1 << bmp->db_maxfreebud;
+	if (*nblocks >= max && *nblocks > nbperpage)
+		nb = nblks = (max > nbperpage) ? max : nbperpage;
+	else
+		nb = nblks = *nblocks;
+
+	/* try to allocate blocks */
+	while ((rc = dbAlloc(ip, hint, nb, &daddr)) != 0) {
+		/* if something other than an out of space error,
+		 * stop and return this error.
+		 */
+		if (rc != -ENOSPC)
+			return (rc);
+
+		/* decrease the allocation request size */
+		nb = min(nblks, extRoundDown(nb));
+
+		/* give up if we cannot cover a page */
+		if (nb < nbperpage)
+			return (rc);
+	}
+
+	*nblocks = nb;
+	*blkno = daddr;
+
+	if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) {
+		ag = BLKTOAG(daddr, sbi);
+		spin_lock_irq(&ji->ag_lock);
+		if (ji->active_ag == -1) {
+			atomic_inc(&bmp->db_active[ag]);
+			ji->active_ag = ag;
+		} else if (ji->active_ag != ag) {
+			atomic_dec(&bmp->db_active[ji->active_ag]);
+			atomic_inc(&bmp->db_active[ag]);
+			ji->active_ag = ag;
+		}
+		spin_unlock_irq(&ji->ag_lock);
+	}
+
+	return (0);
+}
+
+
+#ifdef _NOTYET
+/*
+ * NAME:	extBrealloc()
+ *
+ * FUNCTION:	attempt to extend an extent's allocation.
+ *
+ *		Initially, we will try to extend the extent's allocation
+ *		in place.  If this fails, we'll try to move the extent
+ *		to a new set of blocks.  If moving the extent, we initially
+ *		will try to allocate disk blocks for the requested size
+ *		(newnblks).  if this fails (new contiguous free blocks not
+ *		available), we'll try to allocate a smaller number of
+ *		blocks (producing a smaller extent), with this smaller
+ *		number of blocks consisting of the requested number of
+ *		blocks rounded down to the next smaller power of 2
+ *		number (i.e. 16 -> 8).  We'll continue to round down and
+ *		retry the allocation until the number of blocks to allocate
+ *		is smaller than the number of blocks per page.
+ *
+ * PARAMETERS:
+ *	ip	 - the inode of the file.
+ *	blkno	 - starting block number of the extents current allocation.
+ *	nblks	 - number of blocks within the extents current allocation.
+ *	newnblks - pointer to a s64 value.  on entry, this value is the
+ *		   the new desired extent size (number of blocks).  on
+ *		   successful exit, this value is set to the extent's actual
+ *		   new size (new number of blocks).
+ *	newblkno - the starting block number of the extents new allocation.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ *	-ENOSPC	- insufficient disk resources.
+ */
+static int
+extBrealloc(struct inode *ip,
+	    s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno)
+{
+	int rc;
+
+	/* try to extend in place */
+	if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) {
+		*newblkno = blkno;
+		return (0);
+	} else {
+		if (rc != -ENOSPC)
+			return (rc);
+	}
+
+	/* in place extension not possible.
+	 * try to move the extent to a new set of blocks.
+	 */
+	return (extBalloc(ip, blkno, newnblks, newblkno));
+}
+#endif			/* _NOTYET */
+
+
+/*
+ * NAME:	extRoundDown()
+ *
+ * FUNCTION:	round down a specified number of blocks to the next
+ *		smallest power of 2 number.
+ *
+ * PARAMETERS:
+ *	nb	- the inode of the file.
+ *
+ * RETURN VALUES:
+ *	next smallest power of 2 number.
+ */
+static s64 extRoundDown(s64 nb)
+{
+	int i;
+	u64 m, k;
+
+	for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) {
+		if (m & nb)
+			break;
+	}
+
+	i = 63 - i;
+	k = (u64) 1 << i;
+	k = ((k - 1) & nb) ? k : k >> 1;
+
+	return (k);
+}
diff --git a/kernel/fs/jfs/jfs_extent.h b/kernel/fs/jfs/jfs_extent.h
new file mode 100644
index 000000000..b567e12c5
--- /dev/null
+++ b/kernel/fs/jfs/jfs_extent.h
@@ -0,0 +1,31 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef	_H_JFS_EXTENT
+#define _H_JFS_EXTENT
+
+/*  get block allocation allocation hint as location of disk inode */
+#define	INOHINT(ip)	\
+	(addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
+
+extern int	extAlloc(struct inode *, s64, s64, xad_t *, bool);
+extern int	extFill(struct inode *, xad_t *);
+extern int	extHint(struct inode *, s64, xad_t *);
+extern int	extRealloc(struct inode *, s64, xad_t *, bool);
+extern int	extRecord(struct inode *, xad_t *);
+
+#endif	/* _H_JFS_EXTENT */
diff --git a/kernel/fs/jfs/jfs_filsys.h b/kernel/fs/jfs/jfs_filsys.h
new file mode 100644
index 000000000..b67d64671
--- /dev/null
+++ b/kernel/fs/jfs/jfs_filsys.h
@@ -0,0 +1,285 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2003
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_FILSYS
+#define _H_JFS_FILSYS
+
+/*
+ *	jfs_filsys.h
+ *
+ * file system (implementation-dependent) constants
+ *
+ * refer to <limits.h> for system wide implementation-dependent constants
+ */
+
+/*
+ *	 file system option (superblock flag)
+ */
+
+/* directory option */
+#define JFS_UNICODE	0x00000001	/* unicode name */
+
+/* mount time flags for error handling */
+#define JFS_ERR_REMOUNT_RO 0x00000002	/* remount read-only */
+#define JFS_ERR_CONTINUE   0x00000004	/* continue */
+#define JFS_ERR_PANIC      0x00000008	/* panic */
+
+/* Quota support */
+#define	JFS_USRQUOTA	0x00000010
+#define	JFS_GRPQUOTA	0x00000020
+
+/* mount time flag to disable journaling to disk */
+#define JFS_NOINTEGRITY 0x00000040
+
+/* mount time flag to enable TRIM to ssd disks */
+#define JFS_DISCARD     0x00000080
+
+/* commit option */
+#define	JFS_COMMIT	0x00000f00	/* commit option mask */
+#define	JFS_GROUPCOMMIT	0x00000100	/* group (of 1) commit */
+#define	JFS_LAZYCOMMIT	0x00000200	/* lazy commit */
+#define	JFS_TMPFS	0x00000400	/* temporary file system -
+					 * do not log/commit:
+					 * Never implemented
+					 */
+
+/* log logical volume option */
+#define	JFS_INLINELOG	0x00000800	/* inline log within file system */
+#define JFS_INLINEMOVE	0x00001000	/* inline log being moved */
+
+/* Secondary aggregate inode table */
+#define JFS_BAD_SAIT	0x00010000	/* current secondary ait is bad */
+
+/* sparse regular file support */
+#define JFS_SPARSE	0x00020000	/* sparse regular file */
+
+/* DASD Limits		F226941 */
+#define JFS_DASD_ENABLED 0x00040000	/* DASD limits enabled */
+#define	JFS_DASD_PRIME	0x00080000	/* Prime DASD usage on boot */
+
+/* big endian flag */
+#define	JFS_SWAP_BYTES	0x00100000	/* running on big endian computer */
+
+/* Directory index */
+#define JFS_DIR_INDEX	0x00200000	/* Persistent index for */
+
+/* platform options */
+#define JFS_LINUX	0x10000000	/* Linux support */
+#define JFS_DFS		0x20000000	/* DCE DFS LFS support */
+/*	Never implemented */
+
+#define JFS_OS2		0x40000000	/* OS/2 support */
+/*	case-insensitive name/directory support */
+
+#define JFS_AIX		0x80000000	/* AIX support */
+
+/*
+ *	buffer cache configuration
+ */
+/* page size */
+#ifdef PSIZE
+#undef PSIZE
+#endif
+#define	PSIZE		4096	/* page size (in byte) */
+#define	L2PSIZE		12	/* log2(PSIZE) */
+#define	POFFSET		4095	/* offset within page */
+
+/* buffer page size */
+#define BPSIZE	PSIZE
+
+/*
+ *	fs fundamental size
+ *
+ * PSIZE >= file system block size >= PBSIZE >= DISIZE
+ */
+#define	PBSIZE		512	/* physical block size (in byte) */
+#define	L2PBSIZE	9	/* log2(PBSIZE) */
+
+#define DISIZE		512	/* on-disk inode size (in byte) */
+#define L2DISIZE	9	/* log2(DISIZE) */
+
+#define IDATASIZE	256	/* inode inline data size */
+#define	IXATTRSIZE	128	/* inode inline extended attribute size */
+
+#define XTPAGE_SIZE	4096
+#define log2_PAGESIZE	12
+
+#define IAG_SIZE	4096
+#define IAG_EXTENT_SIZE 4096
+#define	INOSPERIAG	4096	/* number of disk inodes per iag */
+#define	L2INOSPERIAG	12	/* l2 number of disk inodes per iag */
+#define INOSPEREXT	32	/* number of disk inode per extent */
+#define L2INOSPEREXT	5	/* l2 number of disk inode per extent */
+#define	IXSIZE		(DISIZE * INOSPEREXT)	/* inode extent size */
+#define	INOSPERPAGE	8	/* number of disk inodes per 4K page */
+#define	L2INOSPERPAGE	3	/* log2(INOSPERPAGE) */
+
+#define	IAGFREELIST_LWM	64
+
+#define INODE_EXTENT_SIZE	IXSIZE	/* inode extent size */
+#define NUM_INODE_PER_EXTENT	INOSPEREXT
+#define NUM_INODE_PER_IAG	INOSPERIAG
+
+#define MINBLOCKSIZE		512
+#define MAXBLOCKSIZE		4096
+#define	MAXFILESIZE		((s64)1 << 52)
+
+#define JFS_LINK_MAX		0xffffffff
+
+/* Minimum number of bytes supported for a JFS partition */
+#define MINJFS			(0x1000000)
+#define MINJFSTEXT		"16"
+
+/*
+ * file system block size -> physical block size
+ */
+#define LBOFFSET(x)	((x) & (PBSIZE - 1))
+#define LBNUMBER(x)	((x) >> L2PBSIZE)
+#define	LBLK2PBLK(sb,b)	((b) << (sb->s_blocksize_bits - L2PBSIZE))
+#define	PBLK2LBLK(sb,b)	((b) >> (sb->s_blocksize_bits - L2PBSIZE))
+/* size in byte -> last page number */
+#define	SIZE2PN(size)	( ((s64)((size) - 1)) >> (L2PSIZE) )
+/* size in byte -> last file system block number */
+#define	SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) )
+
+/*
+ * fixed physical block address (physical block size = 512 byte)
+ *
+ * NOTE: since we can't guarantee a physical block size of 512 bytes the use of
+ *	 these macros should be removed and the byte offset macros used instead.
+ */
+#define SUPER1_B	64	/* primary superblock */
+#define	AIMAP_B		(SUPER1_B + 8)	/* 1st extent of aggregate inode map */
+#define	AITBL_B		(AIMAP_B + 16)	/*
+					 * 1st extent of aggregate inode table
+					 */
+#define	SUPER2_B	(AITBL_B + 32)	/* 2ndary superblock pbn */
+#define	BMAP_B		(SUPER2_B + 8)	/* block allocation map */
+
+/*
+ * SIZE_OF_SUPER defines the total amount of space reserved on disk for the
+ * superblock.  This is not the same as the superblock structure, since all of
+ * this space is not currently being used.
+ */
+#define SIZE_OF_SUPER	PSIZE
+
+/*
+ * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table
+ */
+#define SIZE_OF_AG_TABLE	PSIZE
+
+/*
+ * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of
+ * the inode allocation map (to hold iag)
+ */
+#define SIZE_OF_MAP_PAGE	PSIZE
+
+/*
+ * fixed byte offset address
+ */
+#define SUPER1_OFF	0x8000	/* primary superblock */
+#define AIMAP_OFF	(SUPER1_OFF + SIZE_OF_SUPER)
+					/*
+					 * Control page of aggregate inode map
+					 * followed by 1st extent of map
+					 */
+#define AITBL_OFF	(AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1))
+					/*
+					 * 1st extent of aggregate inode table
+					 */
+#define SUPER2_OFF	(AITBL_OFF + INODE_EXTENT_SIZE)
+					/*
+					 * secondary superblock
+					 */
+#define BMAP_OFF	(SUPER2_OFF + SIZE_OF_SUPER)
+					/*
+					 * block allocation map
+					 */
+
+/*
+ * The following macro is used to indicate the number of reserved disk blocks at
+ * the front of an aggregate, in terms of physical blocks.  This value is
+ * currently defined to be 32K.  This turns out to be the same as the primary
+ * superblock's address, since it directly follows the reserved blocks.
+ */
+#define AGGR_RSVD_BLOCKS	SUPER1_B
+
+/*
+ * The following macro is used to indicate the number of reserved bytes at the
+ * front of an aggregate.  This value is currently defined to be 32K.  This
+ * turns out to be the same as the primary superblock's byte offset, since it
+ * directly follows the reserved blocks.
+ */
+#define AGGR_RSVD_BYTES	SUPER1_OFF
+
+/*
+ * The following macro defines the byte offset for the first inode extent in
+ * the aggregate inode table.  This allows us to find the self inode to find the
+ * rest of the table.  Currently this value is 44K.
+ */
+#define AGGR_INODE_TABLE_START	AITBL_OFF
+
+/*
+ *	fixed reserved inode number
+ */
+/* aggregate inode */
+#define AGGR_RESERVED_I	0	/* aggregate inode (reserved) */
+#define	AGGREGATE_I	1	/* aggregate inode map inode */
+#define	BMAP_I		2	/* aggregate block allocation map inode */
+#define	LOG_I		3	/* aggregate inline log inode */
+#define BADBLOCK_I	4	/* aggregate bad block inode */
+#define	FILESYSTEM_I	16	/* 1st/only fileset inode in ait:
+				 * fileset inode map inode
+				 */
+
+/* per fileset inode */
+#define FILESET_RSVD_I	0	/* fileset inode (reserved) */
+#define FILESET_EXT_I	1	/* fileset inode extension */
+#define	ROOT_I		2	/* fileset root inode */
+#define ACL_I		3	/* fileset ACL inode */
+
+#define FILESET_OBJECT_I 4	/* the first fileset inode available for a file
+				 * or directory or link...
+				 */
+#define FIRST_FILESET_INO 16	/* the first aggregate inode which describes
+				 * an inode.  (To fsck this is also the first
+				 * inode in part 2 of the agg inode table.)
+				 */
+
+/*
+ *	directory configuration
+ */
+#define JFS_NAME_MAX	255
+#define JFS_PATH_MAX	BPSIZE
+
+
+/*
+ *	file system state (superblock state)
+ */
+#define FM_CLEAN 0x00000000	/* file system is unmounted and clean */
+#define FM_MOUNT 0x00000001	/* file system is mounted cleanly */
+#define FM_DIRTY 0x00000002	/* file system was not unmounted and clean
+				 * when mounted or
+				 * commit failure occurred while being mounted:
+				 * fsck() must be run to repair
+				 */
+#define	FM_LOGREDO 0x00000004	/* log based recovery (logredo()) failed:
+				 * fsck() must be run to repair
+				 */
+#define	FM_EXTENDFS 0x00000008	/* file system extendfs() in progress */
+
+#endif				/* _H_JFS_FILSYS */
diff --git a/kernel/fs/jfs/jfs_imap.c b/kernel/fs/jfs/jfs_imap.c
new file mode 100644
index 000000000..f321986e7
--- /dev/null
+++ b/kernel/fs/jfs/jfs_imap.c
@@ -0,0 +1,3178 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ *	jfs_imap.c: inode allocation map manager
+ *
+ * Serialization:
+ *   Each AG has a simple lock which is used to control the serialization of
+ *	the AG level lists.  This lock should be taken first whenever an AG
+ *	level list will be modified or accessed.
+ *
+ *   Each IAG is locked by obtaining the buffer for the IAG page.
+ *
+ *   There is also a inode lock for the inode map inode.  A read lock needs to
+ *	be taken whenever an IAG is read from the map or the global level
+ *	information is read.  A write lock needs to be taken whenever the global
+ *	level information is modified or an atomic operation needs to be used.
+ *
+ *	If more than one IAG is read at one time, the read lock may not
+ *	be given up until all of the IAG's are read.  Otherwise, a deadlock
+ *	may occur when trying to obtain the read lock while another thread
+ *	holding the read lock is waiting on the IAG already being held.
+ *
+ *   The control page of the inode map is read into memory by diMount().
+ *	Thereafter it should only be modified in memory and then it will be
+ *	written out when the filesystem is unmounted by diUnmount().
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_filsys.h"
+#include "jfs_dinode.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * imap locks
+ */
+/* iag free list lock */
+#define IAGFREE_LOCK_INIT(imap)		mutex_init(&imap->im_freelock)
+#define IAGFREE_LOCK(imap)		mutex_lock(&imap->im_freelock)
+#define IAGFREE_UNLOCK(imap)		mutex_unlock(&imap->im_freelock)
+
+/* per ag iag list locks */
+#define AG_LOCK_INIT(imap,index)	mutex_init(&(imap->im_aglock[index]))
+#define AG_LOCK(imap,agno)		mutex_lock(&imap->im_aglock[agno])
+#define AG_UNLOCK(imap,agno)		mutex_unlock(&imap->im_aglock[agno])
+
+/*
+ * forward references
+ */
+static int diAllocAG(struct inomap *, int, bool, struct inode *);
+static int diAllocAny(struct inomap *, int, bool, struct inode *);
+static int diAllocBit(struct inomap *, struct iag *, int);
+static int diAllocExt(struct inomap *, int, struct inode *);
+static int diAllocIno(struct inomap *, int, struct inode *);
+static int diFindFree(u32, int);
+static int diNewExt(struct inomap *, struct iag *, int);
+static int diNewIAG(struct inomap *, int *, int, struct metapage **);
+static void duplicateIXtree(struct super_block *, s64, int, s64 *);
+
+static int diIAGRead(struct inomap * imap, int, struct metapage **);
+static int copy_from_dinode(struct dinode *, struct inode *);
+static void copy_to_dinode(struct dinode *, struct inode *);
+
+/*
+ * NAME:	diMount()
+ *
+ * FUNCTION:	initialize the incore inode map control structures for
+ *		a fileset or aggregate init time.
+ *
+ *		the inode map's control structure (dinomap) is
+ *		brought in from disk and placed in virtual memory.
+ *
+ * PARAMETERS:
+ *	ipimap	- pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOMEM	- insufficient free virtual memory.
+ *	-EIO	- i/o error.
+ */
+int diMount(struct inode *ipimap)
+{
+	struct inomap *imap;
+	struct metapage *mp;
+	int index;
+	struct dinomap_disk *dinom_le;
+
+	/*
+	 * allocate/initialize the in-memory inode map control structure
+	 */
+	/* allocate the in-memory inode map control structure. */
+	imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
+	if (imap == NULL) {
+		jfs_err("diMount: kmalloc returned NULL!");
+		return -ENOMEM;
+	}
+
+	/* read the on-disk inode map control structure. */
+
+	mp = read_metapage(ipimap,
+			   IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+			   PSIZE, 0);
+	if (mp == NULL) {
+		kfree(imap);
+		return -EIO;
+	}
+
+	/* copy the on-disk version to the in-memory version. */
+	dinom_le = (struct dinomap_disk *) mp->data;
+	imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag);
+	imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag);
+	atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos));
+	atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree));
+	imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext);
+	imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext);
+	for (index = 0; index < MAXAG; index++) {
+		imap->im_agctl[index].inofree =
+		    le32_to_cpu(dinom_le->in_agctl[index].inofree);
+		imap->im_agctl[index].extfree =
+		    le32_to_cpu(dinom_le->in_agctl[index].extfree);
+		imap->im_agctl[index].numinos =
+		    le32_to_cpu(dinom_le->in_agctl[index].numinos);
+		imap->im_agctl[index].numfree =
+		    le32_to_cpu(dinom_le->in_agctl[index].numfree);
+	}
+
+	/* release the buffer. */
+	release_metapage(mp);
+
+	/*
+	 * allocate/initialize inode allocation map locks
+	 */
+	/* allocate and init iag free list lock */
+	IAGFREE_LOCK_INIT(imap);
+
+	/* allocate and init ag list locks */
+	for (index = 0; index < MAXAG; index++) {
+		AG_LOCK_INIT(imap, index);
+	}
+
+	/* bind the inode map inode and inode map control structure
+	 * to each other.
+	 */
+	imap->im_ipimap = ipimap;
+	JFS_IP(ipimap)->i_imap = imap;
+
+	return (0);
+}
+
+
+/*
+ * NAME:	diUnmount()
+ *
+ * FUNCTION:	write to disk the incore inode map control structures for
+ *		a fileset or aggregate at unmount time.
+ *
+ * PARAMETERS:
+ *	ipimap	- pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOMEM	- insufficient free virtual memory.
+ *	-EIO	- i/o error.
+ */
+int diUnmount(struct inode *ipimap, int mounterror)
+{
+	struct inomap *imap = JFS_IP(ipimap)->i_imap;
+
+	/*
+	 * update the on-disk inode map control structure
+	 */
+
+	if (!(mounterror || isReadOnly(ipimap)))
+		diSync(ipimap);
+
+	/*
+	 * Invalidate the page cache buffers
+	 */
+	truncate_inode_pages(ipimap->i_mapping, 0);
+
+	/*
+	 * free in-memory control structure
+	 */
+	kfree(imap);
+
+	return (0);
+}
+
+
+/*
+ *	diSync()
+ */
+int diSync(struct inode *ipimap)
+{
+	struct dinomap_disk *dinom_le;
+	struct inomap *imp = JFS_IP(ipimap)->i_imap;
+	struct metapage *mp;
+	int index;
+
+	/*
+	 * write imap global conrol page
+	 */
+	/* read the on-disk inode map control structure */
+	mp = get_metapage(ipimap,
+			  IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+			  PSIZE, 0);
+	if (mp == NULL) {
+		jfs_err("diSync: get_metapage failed!");
+		return -EIO;
+	}
+
+	/* copy the in-memory version to the on-disk version */
+	dinom_le = (struct dinomap_disk *) mp->data;
+	dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag);
+	dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag);
+	dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos));
+	dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree));
+	dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext);
+	dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext);
+	for (index = 0; index < MAXAG; index++) {
+		dinom_le->in_agctl[index].inofree =
+		    cpu_to_le32(imp->im_agctl[index].inofree);
+		dinom_le->in_agctl[index].extfree =
+		    cpu_to_le32(imp->im_agctl[index].extfree);
+		dinom_le->in_agctl[index].numinos =
+		    cpu_to_le32(imp->im_agctl[index].numinos);
+		dinom_le->in_agctl[index].numfree =
+		    cpu_to_le32(imp->im_agctl[index].numfree);
+	}
+
+	/* write out the control structure */
+	write_metapage(mp);
+
+	/*
+	 * write out dirty pages of imap
+	 */
+	filemap_write_and_wait(ipimap->i_mapping);
+
+	diWriteSpecial(ipimap, 0);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	diRead()
+ *
+ * FUNCTION:	initialize an incore inode from disk.
+ *
+ *		on entry, the specifed incore inode should itself
+ *		specify the disk inode number corresponding to the
+ *		incore inode (i.e. i_number should be initialized).
+ *
+ *		this routine handles incore inode initialization for
+ *		both "special" and "regular" inodes.  special inodes
+ *		are those required early in the mount process and
+ *		require special handling since much of the file system
+ *		is not yet initialized.  these "special" inodes are
+ *		identified by a NULL inode map inode pointer and are
+ *		actually initialized by a call to diReadSpecial().
+ *
+ *		for regular inodes, the iag describing the disk inode
+ *		is read from disk to determine the inode extent address
+ *		for the disk inode.  with the inode extent address in
+ *		hand, the page of the extent that contains the disk
+ *		inode is read and the disk inode is copied to the
+ *		incore inode.
+ *
+ * PARAMETERS:
+ *	ip	-  pointer to incore inode to be initialized from disk.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ *	-ENOMEM	- insufficient memory
+ *
+ */
+int diRead(struct inode *ip)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	int iagno, ino, extno, rc;
+	struct inode *ipimap;
+	struct dinode *dp;
+	struct iag *iagp;
+	struct metapage *mp;
+	s64 blkno, agstart;
+	struct inomap *imap;
+	int block_offset;
+	int inodes_left;
+	unsigned long pageno;
+	int rel_inode;
+
+	jfs_info("diRead: ino = %ld", ip->i_ino);
+
+	ipimap = sbi->ipimap;
+	JFS_IP(ip)->ipimap = ipimap;
+
+	/* determine the iag number for this inode (number) */
+	iagno = INOTOIAG(ip->i_ino);
+
+	/* read the iag */
+	imap = JFS_IP(ipimap)->i_imap;
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+	rc = diIAGRead(imap, iagno, &mp);
+	IREAD_UNLOCK(ipimap);
+	if (rc) {
+		jfs_err("diRead: diIAGRead returned %d", rc);
+		return (rc);
+	}
+
+	iagp = (struct iag *) mp->data;
+
+	/* determine inode extent that holds the disk inode */
+	ino = ip->i_ino & (INOSPERIAG - 1);
+	extno = ino >> L2INOSPEREXT;
+
+	if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) ||
+	    (addressPXD(&iagp->inoext[extno]) == 0)) {
+		release_metapage(mp);
+		return -ESTALE;
+	}
+
+	/* get disk block number of the page within the inode extent
+	 * that holds the disk inode.
+	 */
+	blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage);
+
+	/* get the ag for the iag */
+	agstart = le64_to_cpu(iagp->agstart);
+
+	release_metapage(mp);
+
+	rel_inode = (ino & (INOSPERPAGE - 1));
+	pageno = blkno >> sbi->l2nbperpage;
+
+	if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
+		/*
+		 * OS/2 didn't always align inode extents on page boundaries
+		 */
+		inodes_left =
+		     (sbi->nbperpage - block_offset) << sbi->l2niperblk;
+
+		if (rel_inode < inodes_left)
+			rel_inode += block_offset << sbi->l2niperblk;
+		else {
+			pageno += 1;
+			rel_inode -= inodes_left;
+		}
+	}
+
+	/* read the page of disk inode */
+	mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
+	if (!mp) {
+		jfs_err("diRead: read_metapage failed");
+		return -EIO;
+	}
+
+	/* locate the disk inode requested */
+	dp = (struct dinode *) mp->data;
+	dp += rel_inode;
+
+	if (ip->i_ino != le32_to_cpu(dp->di_number)) {
+		jfs_error(ip->i_sb, "i_ino != di_number\n");
+		rc = -EIO;
+	} else if (le32_to_cpu(dp->di_nlink) == 0)
+		rc = -ESTALE;
+	else
+		/* copy the disk inode to the in-memory inode */
+		rc = copy_from_dinode(dp, ip);
+
+	release_metapage(mp);
+
+	/* set the ag for the inode */
+	JFS_IP(ip)->agstart = agstart;
+	JFS_IP(ip)->active_ag = -1;
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	diReadSpecial()
+ *
+ * FUNCTION:	initialize a 'special' inode from disk.
+ *
+ *		this routines handles aggregate level inodes.  The
+ *		inode cache cannot differentiate between the
+ *		aggregate inodes and the filesystem inodes, so we
+ *		handle these here.  We don't actually use the aggregate
+ *		inode map, since these inodes are at a fixed location
+ *		and in some cases the aggregate inode map isn't initialized
+ *		yet.
+ *
+ * PARAMETERS:
+ *	sb - filesystem superblock
+ *	inum - aggregate inode number
+ *	secondary - 1 if secondary aggregate inode table
+ *
+ * RETURN VALUES:
+ *	new inode	- success
+ *	NULL		- i/o error.
+ */
+struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	uint address;
+	struct dinode *dp;
+	struct inode *ip;
+	struct metapage *mp;
+
+	ip = new_inode(sb);
+	if (ip == NULL) {
+		jfs_err("diReadSpecial: new_inode returned NULL!");
+		return ip;
+	}
+
+	if (secondary) {
+		address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
+		JFS_IP(ip)->ipimap = sbi->ipaimap2;
+	} else {
+		address = AITBL_OFF >> L2PSIZE;
+		JFS_IP(ip)->ipimap = sbi->ipaimap;
+	}
+
+	ASSERT(inum < INOSPEREXT);
+
+	ip->i_ino = inum;
+
+	address += inum >> 3;	/* 8 inodes per 4K page */
+
+	/* read the page of fixed disk inode (AIT) in raw mode */
+	mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
+	if (mp == NULL) {
+		set_nlink(ip, 1);	/* Don't want iput() deleting it */
+		iput(ip);
+		return (NULL);
+	}
+
+	/* get the pointer to the disk inode of interest */
+	dp = (struct dinode *) (mp->data);
+	dp += inum % 8;		/* 8 inodes per 4K page */
+
+	/* copy on-disk inode to in-memory inode */
+	if ((copy_from_dinode(dp, ip)) != 0) {
+		/* handle bad return by returning NULL for ip */
+		set_nlink(ip, 1);	/* Don't want iput() deleting it */
+		iput(ip);
+		/* release the page */
+		release_metapage(mp);
+		return (NULL);
+
+	}
+
+	ip->i_mapping->a_ops = &jfs_metapage_aops;
+	mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS);
+
+	/* Allocations to metadata inodes should not affect quotas */
+	ip->i_flags |= S_NOQUOTA;
+
+	if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) {
+		sbi->gengen = le32_to_cpu(dp->di_gengen);
+		sbi->inostamp = le32_to_cpu(dp->di_inostamp);
+	}
+
+	/* release the page */
+	release_metapage(mp);
+
+	/*
+	 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
+	 * want special inodes in the fileset inode space, we make them
+	 * appear hashed, but do not put on any lists.  hlist_del()
+	 * will work fine and require no locking.
+	 */
+	hlist_add_fake(&ip->i_hash);
+
+	return (ip);
+}
+
+/*
+ * NAME:	diWriteSpecial()
+ *
+ * FUNCTION:	Write the special inode to disk
+ *
+ * PARAMETERS:
+ *	ip - special inode
+ *	secondary - 1 if secondary aggregate inode table
+ *
+ * RETURN VALUES: none
+ */
+
+void diWriteSpecial(struct inode *ip, int secondary)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	uint address;
+	struct dinode *dp;
+	ino_t inum = ip->i_ino;
+	struct metapage *mp;
+
+	if (secondary)
+		address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
+	else
+		address = AITBL_OFF >> L2PSIZE;
+
+	ASSERT(inum < INOSPEREXT);
+
+	address += inum >> 3;	/* 8 inodes per 4K page */
+
+	/* read the page of fixed disk inode (AIT) in raw mode */
+	mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
+	if (mp == NULL) {
+		jfs_err("diWriteSpecial: failed to read aggregate inode "
+			"extent!");
+		return;
+	}
+
+	/* get the pointer to the disk inode of interest */
+	dp = (struct dinode *) (mp->data);
+	dp += inum % 8;		/* 8 inodes per 4K page */
+
+	/* copy on-disk inode to in-memory inode */
+	copy_to_dinode(dp, ip);
+	memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288);
+
+	if (inum == FILESYSTEM_I)
+		dp->di_gengen = cpu_to_le32(sbi->gengen);
+
+	/* write the page */
+	write_metapage(mp);
+}
+
+/*
+ * NAME:	diFreeSpecial()
+ *
+ * FUNCTION:	Free allocated space for special inode
+ */
+void diFreeSpecial(struct inode *ip)
+{
+	if (ip == NULL) {
+		jfs_err("diFreeSpecial called with NULL ip!");
+		return;
+	}
+	filemap_write_and_wait(ip->i_mapping);
+	truncate_inode_pages(ip->i_mapping, 0);
+	iput(ip);
+}
+
+
+
+/*
+ * NAME:	diWrite()
+ *
+ * FUNCTION:	write the on-disk inode portion of the in-memory inode
+ *		to its corresponding on-disk inode.
+ *
+ *		on entry, the specifed incore inode should itself
+ *		specify the disk inode number corresponding to the
+ *		incore inode (i.e. i_number should be initialized).
+ *
+ *		the inode contains the inode extent address for the disk
+ *		inode.  with the inode extent address in hand, the
+ *		page of the extent that contains the disk inode is
+ *		read and the disk inode portion of the incore inode
+ *		is copied to the disk inode.
+ *
+ * PARAMETERS:
+ *	tid -  transacation id
+ *	ip  -  pointer to incore inode to be written to the inode extent.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ */
+int diWrite(tid_t tid, struct inode *ip)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	int rc = 0;
+	s32 ino;
+	struct dinode *dp;
+	s64 blkno;
+	int block_offset;
+	int inodes_left;
+	struct metapage *mp;
+	unsigned long pageno;
+	int rel_inode;
+	int dioffset;
+	struct inode *ipimap;
+	uint type;
+	lid_t lid;
+	struct tlock *ditlck, *tlck;
+	struct linelock *dilinelock, *ilinelock;
+	struct lv *lv;
+	int n;
+
+	ipimap = jfs_ip->ipimap;
+
+	ino = ip->i_ino & (INOSPERIAG - 1);
+
+	if (!addressPXD(&(jfs_ip->ixpxd)) ||
+	    (lengthPXD(&(jfs_ip->ixpxd)) !=
+	     JFS_IP(ipimap)->i_imap->im_nbperiext)) {
+		jfs_error(ip->i_sb, "ixpxd invalid\n");
+		return -EIO;
+	}
+
+	/*
+	 * read the page of disk inode containing the specified inode:
+	 */
+	/* compute the block address of the page */
+	blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage);
+
+	rel_inode = (ino & (INOSPERPAGE - 1));
+	pageno = blkno >> sbi->l2nbperpage;
+
+	if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
+		/*
+		 * OS/2 didn't always align inode extents on page boundaries
+		 */
+		inodes_left =
+		    (sbi->nbperpage - block_offset) << sbi->l2niperblk;
+
+		if (rel_inode < inodes_left)
+			rel_inode += block_offset << sbi->l2niperblk;
+		else {
+			pageno += 1;
+			rel_inode -= inodes_left;
+		}
+	}
+	/* read the page of disk inode */
+      retry:
+	mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
+	if (!mp)
+		return -EIO;
+
+	/* get the pointer to the disk inode */
+	dp = (struct dinode *) mp->data;
+	dp += rel_inode;
+
+	dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE;
+
+	/*
+	 * acquire transaction lock on the on-disk inode;
+	 * N.B. tlock is acquired on ipimap not ip;
+	 */
+	if ((ditlck =
+	     txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL)
+		goto retry;
+	dilinelock = (struct linelock *) & ditlck->lock;
+
+	/*
+	 * copy btree root from in-memory inode to on-disk inode
+	 *
+	 * (tlock is taken from inline B+-tree root in in-memory
+	 * inode when the B+-tree root is updated, which is pointed
+	 * by jfs_ip->blid as well as being on tx tlock list)
+	 *
+	 * further processing of btree root is based on the copy
+	 * in in-memory inode, where txLog() will log from, and,
+	 * for xtree root, txUpdateMap() will update map and reset
+	 * XAD_NEW bit;
+	 */
+
+	if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) {
+		/*
+		 * This is the special xtree inside the directory for storing
+		 * the directory table
+		 */
+		xtpage_t *p, *xp;
+		xad_t *xad;
+
+		jfs_ip->xtlid = 0;
+		tlck = lid_to_tlock(lid);
+		assert(tlck->type & tlckXTREE);
+		tlck->type |= tlckBTROOT;
+		tlck->mp = mp;
+		ilinelock = (struct linelock *) & tlck->lock;
+
+		/*
+		 * copy xtree root from inode to dinode:
+		 */
+		p = &jfs_ip->i_xtroot;
+		xp = (xtpage_t *) &dp->di_dirtable;
+		lv = ilinelock->lv;
+		for (n = 0; n < ilinelock->index; n++, lv++) {
+			memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
+			       lv->length << L2XTSLOTSIZE);
+		}
+
+		/* reset on-disk (metadata page) xtree XAD_NEW bit */
+		xad = &xp->xad[XTENTRYSTART];
+		for (n = XTENTRYSTART;
+		     n < le16_to_cpu(xp->header.nextindex); n++, xad++)
+			if (xad->flag & (XAD_NEW | XAD_EXTENDED))
+				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+	}
+
+	if ((lid = jfs_ip->blid) == 0)
+		goto inlineData;
+	jfs_ip->blid = 0;
+
+	tlck = lid_to_tlock(lid);
+	type = tlck->type;
+	tlck->type |= tlckBTROOT;
+	tlck->mp = mp;
+	ilinelock = (struct linelock *) & tlck->lock;
+
+	/*
+	 *	regular file: 16 byte (XAD slot) granularity
+	 */
+	if (type & tlckXTREE) {
+		xtpage_t *p, *xp;
+		xad_t *xad;
+
+		/*
+		 * copy xtree root from inode to dinode:
+		 */
+		p = &jfs_ip->i_xtroot;
+		xp = &dp->di_xtroot;
+		lv = ilinelock->lv;
+		for (n = 0; n < ilinelock->index; n++, lv++) {
+			memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
+			       lv->length << L2XTSLOTSIZE);
+		}
+
+		/* reset on-disk (metadata page) xtree XAD_NEW bit */
+		xad = &xp->xad[XTENTRYSTART];
+		for (n = XTENTRYSTART;
+		     n < le16_to_cpu(xp->header.nextindex); n++, xad++)
+			if (xad->flag & (XAD_NEW | XAD_EXTENDED))
+				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+	}
+	/*
+	 *	directory: 32 byte (directory entry slot) granularity
+	 */
+	else if (type & tlckDTREE) {
+		dtpage_t *p, *xp;
+
+		/*
+		 * copy dtree root from inode to dinode:
+		 */
+		p = (dtpage_t *) &jfs_ip->i_dtroot;
+		xp = (dtpage_t *) & dp->di_dtroot;
+		lv = ilinelock->lv;
+		for (n = 0; n < ilinelock->index; n++, lv++) {
+			memcpy(&xp->slot[lv->offset], &p->slot[lv->offset],
+			       lv->length << L2DTSLOTSIZE);
+		}
+	} else {
+		jfs_err("diWrite: UFO tlock");
+	}
+
+      inlineData:
+	/*
+	 * copy inline symlink from in-memory inode to on-disk inode
+	 */
+	if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) {
+		lv = & dilinelock->lv[dilinelock->index];
+		lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
+		lv->length = 2;
+		memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
+		dilinelock->index++;
+	}
+	/*
+	 * copy inline data from in-memory inode to on-disk inode:
+	 * 128 byte slot granularity
+	 */
+	if (test_cflag(COMMIT_Inlineea, ip)) {
+		lv = & dilinelock->lv[dilinelock->index];
+		lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE;
+		lv->length = 1;
+		memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE);
+		dilinelock->index++;
+
+		clear_cflag(COMMIT_Inlineea, ip);
+	}
+
+	/*
+	 *	lock/copy inode base: 128 byte slot granularity
+	 */
+	lv = & dilinelock->lv[dilinelock->index];
+	lv->offset = dioffset >> L2INODESLOTSIZE;
+	copy_to_dinode(dp, ip);
+	if (test_and_clear_cflag(COMMIT_Dirtable, ip)) {
+		lv->length = 2;
+		memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96);
+	} else
+		lv->length = 1;
+	dilinelock->index++;
+
+	/* release the buffer holding the updated on-disk inode.
+	 * the buffer will be later written by commit processing.
+	 */
+	write_metapage(mp);
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	diFree(ip)
+ *
+ * FUNCTION:	free a specified inode from the inode working map
+ *		for a fileset or aggregate.
+ *
+ *		if the inode to be freed represents the first (only)
+ *		free inode within the iag, the iag will be placed on
+ *		the ag free inode list.
+ *
+ *		freeing the inode will cause the inode extent to be
+ *		freed if the inode is the only allocated inode within
+ *		the extent.  in this case all the disk resource backing
+ *		up the inode extent will be freed. in addition, the iag
+ *		will be placed on the ag extent free list if the extent
+ *		is the first free extent in the iag.  if freeing the
+ *		extent also means that no free inodes will exist for
+ *		the iag, the iag will also be removed from the ag free
+ *		inode list.
+ *
+ *		the iag describing the inode will be freed if the extent
+ *		is to be freed and it is the only backed extent within
+ *		the iag.  in this case, the iag will be removed from the
+ *		ag free extent list and ag free inode list and placed on
+ *		the inode map's free iag list.
+ *
+ *		a careful update approach is used to provide consistency
+ *		in the face of updates to multiple buffers.  under this
+ *		approach, all required buffers are obtained before making
+ *		any updates and are held until all updates are complete.
+ *
+ * PARAMETERS:
+ *	ip	- inode to be freed.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error.
+ */
+int diFree(struct inode *ip)
+{
+	int rc;
+	ino_t inum = ip->i_ino;
+	struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
+	struct metapage *mp, *amp, *bmp, *cmp, *dmp;
+	int iagno, ino, extno, bitno, sword, agno;
+	int back, fwd;
+	u32 bitmap, mask;
+	struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
+	struct inomap *imap = JFS_IP(ipimap)->i_imap;
+	pxd_t freepxd;
+	tid_t tid;
+	struct inode *iplist[3];
+	struct tlock *tlck;
+	struct pxd_lock *pxdlock;
+
+	/*
+	 * This is just to suppress compiler warnings.  The same logic that
+	 * references these variables is used to initialize them.
+	 */
+	aiagp = biagp = ciagp = diagp = NULL;
+
+	/* get the iag number containing the inode.
+	 */
+	iagno = INOTOIAG(inum);
+
+	/* make sure that the iag is contained within
+	 * the map.
+	 */
+	if (iagno >= imap->im_nextiag) {
+		print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
+			       imap, 32, 0);
+		jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
+			  (uint) inum, iagno, imap->im_nextiag);
+		return -EIO;
+	}
+
+	/* get the allocation group for this ino.
+	 */
+	agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb));
+
+	/* Lock the AG specific inode map information
+	 */
+	AG_LOCK(imap, agno);
+
+	/* Obtain read lock in imap inode.  Don't release it until we have
+	 * read all of the IAG's that we are going to.
+	 */
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+
+	/* read the iag.
+	 */
+	if ((rc = diIAGRead(imap, iagno, &mp))) {
+		IREAD_UNLOCK(ipimap);
+		AG_UNLOCK(imap, agno);
+		return (rc);
+	}
+	iagp = (struct iag *) mp->data;
+
+	/* get the inode number and extent number of the inode within
+	 * the iag and the inode number within the extent.
+	 */
+	ino = inum & (INOSPERIAG - 1);
+	extno = ino >> L2INOSPEREXT;
+	bitno = ino & (INOSPEREXT - 1);
+	mask = HIGHORDER >> bitno;
+
+	if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+		jfs_error(ip->i_sb, "wmap shows inode already free\n");
+	}
+
+	if (!addressPXD(&iagp->inoext[extno])) {
+		release_metapage(mp);
+		IREAD_UNLOCK(ipimap);
+		AG_UNLOCK(imap, agno);
+		jfs_error(ip->i_sb, "invalid inoext\n");
+		return -EIO;
+	}
+
+	/* compute the bitmap for the extent reflecting the freed inode.
+	 */
+	bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
+
+	if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
+		release_metapage(mp);
+		IREAD_UNLOCK(ipimap);
+		AG_UNLOCK(imap, agno);
+		jfs_error(ip->i_sb, "numfree > numinos\n");
+		return -EIO;
+	}
+	/*
+	 *	inode extent still has some inodes or below low water mark:
+	 *	keep the inode extent;
+	 */
+	if (bitmap ||
+	    imap->im_agctl[agno].numfree < 96 ||
+	    (imap->im_agctl[agno].numfree < 288 &&
+	     (((imap->im_agctl[agno].numfree * 100) /
+	       imap->im_agctl[agno].numinos) <= 25))) {
+		/* if the iag currently has no free inodes (i.e.,
+		 * the inode being freed is the first free inode of iag),
+		 * insert the iag at head of the inode free list for the ag.
+		 */
+		if (iagp->nfreeinos == 0) {
+			/* check if there are any iags on the ag inode
+			 * free list.  if so, read the first one so that
+			 * we can link the current iag onto the list at
+			 * the head.
+			 */
+			if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
+				/* read the iag that currently is the head
+				 * of the list.
+				 */
+				if ((rc = diIAGRead(imap, fwd, &amp))) {
+					IREAD_UNLOCK(ipimap);
+					AG_UNLOCK(imap, agno);
+					release_metapage(mp);
+					return (rc);
+				}
+				aiagp = (struct iag *) amp->data;
+
+				/* make current head point back to the iag.
+				 */
+				aiagp->inofreeback = cpu_to_le32(iagno);
+
+				write_metapage(amp);
+			}
+
+			/* iag points forward to current head and iag
+			 * becomes the new head of the list.
+			 */
+			iagp->inofreefwd =
+			    cpu_to_le32(imap->im_agctl[agno].inofree);
+			iagp->inofreeback = cpu_to_le32(-1);
+			imap->im_agctl[agno].inofree = iagno;
+		}
+		IREAD_UNLOCK(ipimap);
+
+		/* update the free inode summary map for the extent if
+		 * freeing the inode means the extent will now have free
+		 * inodes (i.e., the inode being freed is the first free
+		 * inode of extent),
+		 */
+		if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
+			sword = extno >> L2EXTSPERSUM;
+			bitno = extno & (EXTSPERSUM - 1);
+			iagp->inosmap[sword] &=
+			    cpu_to_le32(~(HIGHORDER >> bitno));
+		}
+
+		/* update the bitmap.
+		 */
+		iagp->wmap[extno] = cpu_to_le32(bitmap);
+
+		/* update the free inode counts at the iag, ag and
+		 * map level.
+		 */
+		le32_add_cpu(&iagp->nfreeinos, 1);
+		imap->im_agctl[agno].numfree += 1;
+		atomic_inc(&imap->im_numfree);
+
+		/* release the AG inode map lock
+		 */
+		AG_UNLOCK(imap, agno);
+
+		/* write the iag */
+		write_metapage(mp);
+
+		return (0);
+	}
+
+
+	/*
+	 *	inode extent has become free and above low water mark:
+	 *	free the inode extent;
+	 */
+
+	/*
+	 *	prepare to update iag list(s) (careful update step 1)
+	 */
+	amp = bmp = cmp = dmp = NULL;
+	fwd = back = -1;
+
+	/* check if the iag currently has no free extents.  if so,
+	 * it will be placed on the head of the ag extent free list.
+	 */
+	if (iagp->nfreeexts == 0) {
+		/* check if the ag extent free list has any iags.
+		 * if so, read the iag at the head of the list now.
+		 * this (head) iag will be updated later to reflect
+		 * the addition of the current iag at the head of
+		 * the list.
+		 */
+		if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+			if ((rc = diIAGRead(imap, fwd, &amp)))
+				goto error_out;
+			aiagp = (struct iag *) amp->data;
+		}
+	} else {
+		/* iag has free extents. check if the addition of a free
+		 * extent will cause all extents to be free within this
+		 * iag.  if so, the iag will be removed from the ag extent
+		 * free list and placed on the inode map's free iag list.
+		 */
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+			/* in preparation for removing the iag from the
+			 * ag extent free list, read the iags preceding
+			 * and following the iag on the ag extent free
+			 * list.
+			 */
+			if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+				if ((rc = diIAGRead(imap, fwd, &amp)))
+					goto error_out;
+				aiagp = (struct iag *) amp->data;
+			}
+
+			if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+				if ((rc = diIAGRead(imap, back, &bmp)))
+					goto error_out;
+				biagp = (struct iag *) bmp->data;
+			}
+		}
+	}
+
+	/* remove the iag from the ag inode free list if freeing
+	 * this extent cause the iag to have no free inodes.
+	 */
+	if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+		int inofreeback = le32_to_cpu(iagp->inofreeback);
+		int inofreefwd = le32_to_cpu(iagp->inofreefwd);
+
+		/* in preparation for removing the iag from the
+		 * ag inode free list, read the iags preceding
+		 * and following the iag on the ag inode free
+		 * list.  before reading these iags, we must make
+		 * sure that we already don't have them in hand
+		 * from up above, since re-reading an iag (buffer)
+		 * we are currently holding would cause a deadlock.
+		 */
+		if (inofreefwd >= 0) {
+
+			if (inofreefwd == fwd)
+				ciagp = (struct iag *) amp->data;
+			else if (inofreefwd == back)
+				ciagp = (struct iag *) bmp->data;
+			else {
+				if ((rc =
+				     diIAGRead(imap, inofreefwd, &cmp)))
+					goto error_out;
+				ciagp = (struct iag *) cmp->data;
+			}
+			assert(ciagp != NULL);
+		}
+
+		if (inofreeback >= 0) {
+			if (inofreeback == fwd)
+				diagp = (struct iag *) amp->data;
+			else if (inofreeback == back)
+				diagp = (struct iag *) bmp->data;
+			else {
+				if ((rc =
+				     diIAGRead(imap, inofreeback, &dmp)))
+					goto error_out;
+				diagp = (struct iag *) dmp->data;
+			}
+			assert(diagp != NULL);
+		}
+	}
+
+	IREAD_UNLOCK(ipimap);
+
+	/*
+	 * invalidate any page of the inode extent freed from buffer cache;
+	 */
+	freepxd = iagp->inoext[extno];
+	invalidate_pxd_metapages(ip, freepxd);
+
+	/*
+	 *	update iag list(s) (careful update step 2)
+	 */
+	/* add the iag to the ag extent free list if this is the
+	 * first free extent for the iag.
+	 */
+	if (iagp->nfreeexts == 0) {
+		if (fwd >= 0)
+			aiagp->extfreeback = cpu_to_le32(iagno);
+
+		iagp->extfreefwd =
+		    cpu_to_le32(imap->im_agctl[agno].extfree);
+		iagp->extfreeback = cpu_to_le32(-1);
+		imap->im_agctl[agno].extfree = iagno;
+	} else {
+		/* remove the iag from the ag extent list if all extents
+		 * are now free and place it on the inode map iag free list.
+		 */
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+			if (fwd >= 0)
+				aiagp->extfreeback = iagp->extfreeback;
+
+			if (back >= 0)
+				biagp->extfreefwd = iagp->extfreefwd;
+			else
+				imap->im_agctl[agno].extfree =
+				    le32_to_cpu(iagp->extfreefwd);
+
+			iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
+
+			IAGFREE_LOCK(imap);
+			iagp->iagfree = cpu_to_le32(imap->im_freeiag);
+			imap->im_freeiag = iagno;
+			IAGFREE_UNLOCK(imap);
+		}
+	}
+
+	/* remove the iag from the ag inode free list if freeing
+	 * this extent causes the iag to have no free inodes.
+	 */
+	if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+		if ((int) le32_to_cpu(iagp->inofreefwd) >= 0)
+			ciagp->inofreeback = iagp->inofreeback;
+
+		if ((int) le32_to_cpu(iagp->inofreeback) >= 0)
+			diagp->inofreefwd = iagp->inofreefwd;
+		else
+			imap->im_agctl[agno].inofree =
+			    le32_to_cpu(iagp->inofreefwd);
+
+		iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
+	}
+
+	/* update the inode extent address and working map
+	 * to reflect the free extent.
+	 * the permanent map should have been updated already
+	 * for the inode being freed.
+	 */
+	if (iagp->pmap[extno] != 0) {
+		jfs_error(ip->i_sb, "the pmap does not show inode free\n");
+	}
+	iagp->wmap[extno] = 0;
+	PXDlength(&iagp->inoext[extno], 0);
+	PXDaddress(&iagp->inoext[extno], 0);
+
+	/* update the free extent and free inode summary maps
+	 * to reflect the freed extent.
+	 * the inode summary map is marked to indicate no inodes
+	 * available for the freed extent.
+	 */
+	sword = extno >> L2EXTSPERSUM;
+	bitno = extno & (EXTSPERSUM - 1);
+	mask = HIGHORDER >> bitno;
+	iagp->inosmap[sword] |= cpu_to_le32(mask);
+	iagp->extsmap[sword] &= cpu_to_le32(~mask);
+
+	/* update the number of free inodes and number of free extents
+	 * for the iag.
+	 */
+	le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1));
+	le32_add_cpu(&iagp->nfreeexts, 1);
+
+	/* update the number of free inodes and backed inodes
+	 * at the ag and inode map level.
+	 */
+	imap->im_agctl[agno].numfree -= (INOSPEREXT - 1);
+	imap->im_agctl[agno].numinos -= INOSPEREXT;
+	atomic_sub(INOSPEREXT - 1, &imap->im_numfree);
+	atomic_sub(INOSPEREXT, &imap->im_numinos);
+
+	if (amp)
+		write_metapage(amp);
+	if (bmp)
+		write_metapage(bmp);
+	if (cmp)
+		write_metapage(cmp);
+	if (dmp)
+		write_metapage(dmp);
+
+	/*
+	 * start transaction to update block allocation map
+	 * for the inode extent freed;
+	 *
+	 * N.B. AG_LOCK is released and iag will be released below, and
+	 * other thread may allocate inode from/reusing the ixad freed
+	 * BUT with new/different backing inode extent from the extent
+	 * to be freed by the transaction;
+	 */
+	tid = txBegin(ipimap->i_sb, COMMIT_FORCE);
+	mutex_lock(&JFS_IP(ipimap)->commit_mutex);
+
+	/* acquire tlock of the iag page of the freed ixad
+	 * to force the page NOHOMEOK (even though no data is
+	 * logged from the iag page) until NOREDOPAGE|FREEXTENT log
+	 * for the free of the extent is committed;
+	 * write FREEXTENT|NOREDOPAGE log record
+	 * N.B. linelock is overlaid as freed extent descriptor;
+	 */
+	tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE);
+	pxdlock = (struct pxd_lock *) & tlck->lock;
+	pxdlock->flag = mlckFREEPXD;
+	pxdlock->pxd = freepxd;
+	pxdlock->index = 1;
+
+	write_metapage(mp);
+
+	iplist[0] = ipimap;
+
+	/*
+	 * logredo needs the IAG number and IAG extent index in order
+	 * to ensure that the IMap is consistent.  The least disruptive
+	 * way to pass these values through  to the transaction manager
+	 * is in the iplist array.
+	 *
+	 * It's not pretty, but it works.
+	 */
+	iplist[1] = (struct inode *) (size_t)iagno;
+	iplist[2] = (struct inode *) (size_t)extno;
+
+	rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
+
+	txEnd(tid);
+	mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
+
+	/* unlock the AG inode map information */
+	AG_UNLOCK(imap, agno);
+
+	return (0);
+
+      error_out:
+	IREAD_UNLOCK(ipimap);
+
+	if (amp)
+		release_metapage(amp);
+	if (bmp)
+		release_metapage(bmp);
+	if (cmp)
+		release_metapage(cmp);
+	if (dmp)
+		release_metapage(dmp);
+
+	AG_UNLOCK(imap, agno);
+
+	release_metapage(mp);
+
+	return (rc);
+}
+
+/*
+ * There are several places in the diAlloc* routines where we initialize
+ * the inode.
+ */
+static inline void
+diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+
+	ip->i_ino = (iagno << L2INOSPERIAG) + ino;
+	jfs_ip->ixpxd = iagp->inoext[extno];
+	jfs_ip->agstart = le64_to_cpu(iagp->agstart);
+	jfs_ip->active_ag = -1;
+}
+
+
+/*
+ * NAME:	diAlloc(pip,dir,ip)
+ *
+ * FUNCTION:	allocate a disk inode from the inode working map
+ *		for a fileset or aggregate.
+ *
+ * PARAMETERS:
+ *	pip	- pointer to incore inode for the parent inode.
+ *	dir	- 'true' if the new disk inode is for a directory.
+ *	ip	- pointer to a new inode
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-ENOSPC	- insufficient disk resources.
+ *	-EIO	- i/o error.
+ */
+int diAlloc(struct inode *pip, bool dir, struct inode *ip)
+{
+	int rc, ino, iagno, addext, extno, bitno, sword;
+	int nwords, rem, i, agno;
+	u32 mask, inosmap, extsmap;
+	struct inode *ipimap;
+	struct metapage *mp;
+	ino_t inum;
+	struct iag *iagp;
+	struct inomap *imap;
+
+	/* get the pointers to the inode map inode and the
+	 * corresponding imap control structure.
+	 */
+	ipimap = JFS_SBI(pip->i_sb)->ipimap;
+	imap = JFS_IP(ipimap)->i_imap;
+	JFS_IP(ip)->ipimap = ipimap;
+	JFS_IP(ip)->fileset = FILESYSTEM_I;
+
+	/* for a directory, the allocation policy is to start
+	 * at the ag level using the preferred ag.
+	 */
+	if (dir) {
+		agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
+		AG_LOCK(imap, agno);
+		goto tryag;
+	}
+
+	/* for files, the policy starts off by trying to allocate from
+	 * the same iag containing the parent disk inode:
+	 * try to allocate the new disk inode close to the parent disk
+	 * inode, using parent disk inode number + 1 as the allocation
+	 * hint.  (we use a left-to-right policy to attempt to avoid
+	 * moving backward on the disk.)  compute the hint within the
+	 * file system and the iag.
+	 */
+
+	/* get the ag number of this iag */
+	agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb));
+
+	if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
+		/*
+		 * There is an open file actively growing.  We want to
+		 * allocate new inodes from a different ag to avoid
+		 * fragmentation problems.
+		 */
+		agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
+		AG_LOCK(imap, agno);
+		goto tryag;
+	}
+
+	inum = pip->i_ino + 1;
+	ino = inum & (INOSPERIAG - 1);
+
+	/* back off the hint if it is outside of the iag */
+	if (ino == 0)
+		inum = pip->i_ino;
+
+	/* lock the AG inode map information */
+	AG_LOCK(imap, agno);
+
+	/* Get read lock on imap inode */
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+
+	/* get the iag number and read the iag */
+	iagno = INOTOIAG(inum);
+	if ((rc = diIAGRead(imap, iagno, &mp))) {
+		IREAD_UNLOCK(ipimap);
+		AG_UNLOCK(imap, agno);
+		return (rc);
+	}
+	iagp = (struct iag *) mp->data;
+
+	/* determine if new inode extent is allowed to be added to the iag.
+	 * new inode extent can be added to the iag if the ag
+	 * has less than 32 free disk inodes and the iag has free extents.
+	 */
+	addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
+
+	/*
+	 *	try to allocate from the IAG
+	 */
+	/* check if the inode may be allocated from the iag
+	 * (i.e. the inode has free inodes or new extent can be added).
+	 */
+	if (iagp->nfreeinos || addext) {
+		/* determine the extent number of the hint.
+		 */
+		extno = ino >> L2INOSPEREXT;
+
+		/* check if the extent containing the hint has backed
+		 * inodes.  if so, try to allocate within this extent.
+		 */
+		if (addressPXD(&iagp->inoext[extno])) {
+			bitno = ino & (INOSPEREXT - 1);
+			if ((bitno =
+			     diFindFree(le32_to_cpu(iagp->wmap[extno]),
+					bitno))
+			    < INOSPEREXT) {
+				ino = (extno << L2INOSPEREXT) + bitno;
+
+				/* a free inode (bit) was found within this
+				 * extent, so allocate it.
+				 */
+				rc = diAllocBit(imap, iagp, ino);
+				IREAD_UNLOCK(ipimap);
+				if (rc) {
+					assert(rc == -EIO);
+				} else {
+					/* set the results of the allocation
+					 * and write the iag.
+					 */
+					diInitInode(ip, iagno, ino, extno,
+						    iagp);
+					mark_metapage_dirty(mp);
+				}
+				release_metapage(mp);
+
+				/* free the AG lock and return.
+				 */
+				AG_UNLOCK(imap, agno);
+				return (rc);
+			}
+
+			if (!addext)
+				extno =
+				    (extno ==
+				     EXTSPERIAG - 1) ? 0 : extno + 1;
+		}
+
+		/*
+		 * no free inodes within the extent containing the hint.
+		 *
+		 * try to allocate from the backed extents following
+		 * hint or, if appropriate (i.e. addext is true), allocate
+		 * an extent of free inodes at or following the extent
+		 * containing the hint.
+		 *
+		 * the free inode and free extent summary maps are used
+		 * here, so determine the starting summary map position
+		 * and the number of words we'll have to examine.  again,
+		 * the approach is to allocate following the hint, so we
+		 * might have to initially ignore prior bits of the summary
+		 * map that represent extents prior to the extent containing
+		 * the hint and later revisit these bits.
+		 */
+		bitno = extno & (EXTSPERSUM - 1);
+		nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1;
+		sword = extno >> L2EXTSPERSUM;
+
+		/* mask any prior bits for the starting words of the
+		 * summary map.
+		 */
+		mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno));
+		inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
+		extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
+
+		/* scan the free inode and free extent summary maps for
+		 * free resources.
+		 */
+		for (i = 0; i < nwords; i++) {
+			/* check if this word of the free inode summary
+			 * map describes an extent with free inodes.
+			 */
+			if (~inosmap) {
+				/* an extent with free inodes has been
+				 * found. determine the extent number
+				 * and the inode number within the extent.
+				 */
+				rem = diFindFree(inosmap, 0);
+				extno = (sword << L2EXTSPERSUM) + rem;
+				rem = diFindFree(le32_to_cpu(iagp->wmap[extno]),
+						 0);
+				if (rem >= INOSPEREXT) {
+					IREAD_UNLOCK(ipimap);
+					release_metapage(mp);
+					AG_UNLOCK(imap, agno);
+					jfs_error(ip->i_sb,
+						  "can't find free bit in wmap\n");
+					return -EIO;
+				}
+
+				/* determine the inode number within the
+				 * iag and allocate the inode from the
+				 * map.
+				 */
+				ino = (extno << L2INOSPEREXT) + rem;
+				rc = diAllocBit(imap, iagp, ino);
+				IREAD_UNLOCK(ipimap);
+				if (rc)
+					assert(rc == -EIO);
+				else {
+					/* set the results of the allocation
+					 * and write the iag.
+					 */
+					diInitInode(ip, iagno, ino, extno,
+						    iagp);
+					mark_metapage_dirty(mp);
+				}
+				release_metapage(mp);
+
+				/* free the AG lock and return.
+				 */
+				AG_UNLOCK(imap, agno);
+				return (rc);
+
+			}
+
+			/* check if we may allocate an extent of free
+			 * inodes and whether this word of the free
+			 * extents summary map describes a free extent.
+			 */
+			if (addext && ~extsmap) {
+				/* a free extent has been found.  determine
+				 * the extent number.
+				 */
+				rem = diFindFree(extsmap, 0);
+				extno = (sword << L2EXTSPERSUM) + rem;
+
+				/* allocate an extent of free inodes.
+				 */
+				if ((rc = diNewExt(imap, iagp, extno))) {
+					/* if there is no disk space for a
+					 * new extent, try to allocate the
+					 * disk inode from somewhere else.
+					 */
+					if (rc == -ENOSPC)
+						break;
+
+					assert(rc == -EIO);
+				} else {
+					/* set the results of the allocation
+					 * and write the iag.
+					 */
+					diInitInode(ip, iagno,
+						    extno << L2INOSPEREXT,
+						    extno, iagp);
+					mark_metapage_dirty(mp);
+				}
+				release_metapage(mp);
+				/* free the imap inode & the AG lock & return.
+				 */
+				IREAD_UNLOCK(ipimap);
+				AG_UNLOCK(imap, agno);
+				return (rc);
+			}
+
+			/* move on to the next set of summary map words.
+			 */
+			sword = (sword == SMAPSZ - 1) ? 0 : sword + 1;
+			inosmap = le32_to_cpu(iagp->inosmap[sword]);
+			extsmap = le32_to_cpu(iagp->extsmap[sword]);
+		}
+	}
+	/* unlock imap inode */
+	IREAD_UNLOCK(ipimap);
+
+	/* nothing doing in this iag, so release it. */
+	release_metapage(mp);
+
+      tryag:
+	/*
+	 * try to allocate anywhere within the same AG as the parent inode.
+	 */
+	rc = diAllocAG(imap, agno, dir, ip);
+
+	AG_UNLOCK(imap, agno);
+
+	if (rc != -ENOSPC)
+		return (rc);
+
+	/*
+	 * try to allocate in any AG.
+	 */
+	return (diAllocAny(imap, agno, dir, ip));
+}
+
+
+/*
+ * NAME:	diAllocAG(imap,agno,dir,ip)
+ *
+ * FUNCTION:	allocate a disk inode from the allocation group.
+ *
+ *		this routine first determines if a new extent of free
+ *		inodes should be added for the allocation group, with
+ *		the current request satisfied from this extent. if this
+ *		is the case, an attempt will be made to do just that.  if
+ *		this attempt fails or it has been determined that a new
+ *		extent should not be added, an attempt is made to satisfy
+ *		the request by allocating an existing (backed) free inode
+ *		from the allocation group.
+ *
+ * PRE CONDITION: Already have the AG lock for this AG.
+ *
+ * PARAMETERS:
+ *	imap	- pointer to inode map control structure.
+ *	agno	- allocation group to allocate from.
+ *	dir	- 'true' if the new disk inode is for a directory.
+ *	ip	- pointer to the new inode to be filled in on successful return
+ *		  with the disk inode number allocated, its extent address
+ *		  and the start of the ag.
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-ENOSPC	- insufficient disk resources.
+ *	-EIO	- i/o error.
+ */
+static int
+diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
+{
+	int rc, addext, numfree, numinos;
+
+	/* get the number of free and the number of backed disk
+	 * inodes currently within the ag.
+	 */
+	numfree = imap->im_agctl[agno].numfree;
+	numinos = imap->im_agctl[agno].numinos;
+
+	if (numfree > numinos) {
+		jfs_error(ip->i_sb, "numfree > numinos\n");
+		return -EIO;
+	}
+
+	/* determine if we should allocate a new extent of free inodes
+	 * within the ag: for directory inodes, add a new extent
+	 * if there are a small number of free inodes or number of free
+	 * inodes is a small percentage of the number of backed inodes.
+	 */
+	if (dir)
+		addext = (numfree < 64 ||
+			  (numfree < 256
+			   && ((numfree * 100) / numinos) <= 20));
+	else
+		addext = (numfree == 0);
+
+	/*
+	 * try to allocate a new extent of free inodes.
+	 */
+	if (addext) {
+		/* if free space is not available for this new extent, try
+		 * below to allocate a free and existing (already backed)
+		 * inode from the ag.
+		 */
+		if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC)
+			return (rc);
+	}
+
+	/*
+	 * try to allocate an existing free inode from the ag.
+	 */
+	return (diAllocIno(imap, agno, ip));
+}
+
+
+/*
+ * NAME:	diAllocAny(imap,agno,dir,iap)
+ *
+ * FUNCTION:	allocate a disk inode from any other allocation group.
+ *
+ *		this routine is called when an allocation attempt within
+ *		the primary allocation group has failed. if attempts to
+ *		allocate an inode from any allocation group other than the
+ *		specified primary group.
+ *
+ * PARAMETERS:
+ *	imap	- pointer to inode map control structure.
+ *	agno	- primary allocation group (to avoid).
+ *	dir	- 'true' if the new disk inode is for a directory.
+ *	ip	- pointer to a new inode to be filled in on successful return
+ *		  with the disk inode number allocated, its extent address
+ *		  and the start of the ag.
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-ENOSPC	- insufficient disk resources.
+ *	-EIO	- i/o error.
+ */
+static int
+diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
+{
+	int ag, rc;
+	int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag;
+
+
+	/* try to allocate from the ags following agno up to
+	 * the maximum ag number.
+	 */
+	for (ag = agno + 1; ag <= maxag; ag++) {
+		AG_LOCK(imap, ag);
+
+		rc = diAllocAG(imap, ag, dir, ip);
+
+		AG_UNLOCK(imap, ag);
+
+		if (rc != -ENOSPC)
+			return (rc);
+	}
+
+	/* try to allocate from the ags in front of agno.
+	 */
+	for (ag = 0; ag < agno; ag++) {
+		AG_LOCK(imap, ag);
+
+		rc = diAllocAG(imap, ag, dir, ip);
+
+		AG_UNLOCK(imap, ag);
+
+		if (rc != -ENOSPC)
+			return (rc);
+	}
+
+	/* no free disk inodes.
+	 */
+	return -ENOSPC;
+}
+
+
+/*
+ * NAME:	diAllocIno(imap,agno,ip)
+ *
+ * FUNCTION:	allocate a disk inode from the allocation group's free
+ *		inode list, returning an error if this free list is
+ *		empty (i.e. no iags on the list).
+ *
+ *		allocation occurs from the first iag on the list using
+ *		the iag's free inode summary map to find the leftmost
+ *		free inode in the iag.
+ *
+ * PRE CONDITION: Already have AG lock for this AG.
+ *
+ * PARAMETERS:
+ *	imap	- pointer to inode map control structure.
+ *	agno	- allocation group.
+ *	ip	- pointer to new inode to be filled in on successful return
+ *		  with the disk inode number allocated, its extent address
+ *		  and the start of the ag.
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-ENOSPC	- insufficient disk resources.
+ *	-EIO	- i/o error.
+ */
+static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
+{
+	int iagno, ino, rc, rem, extno, sword;
+	struct metapage *mp;
+	struct iag *iagp;
+
+	/* check if there are iags on the ag's free inode list.
+	 */
+	if ((iagno = imap->im_agctl[agno].inofree) < 0)
+		return -ENOSPC;
+
+	/* obtain read lock on imap inode */
+	IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
+
+	/* read the iag at the head of the list.
+	 */
+	if ((rc = diIAGRead(imap, iagno, &mp))) {
+		IREAD_UNLOCK(imap->im_ipimap);
+		return (rc);
+	}
+	iagp = (struct iag *) mp->data;
+
+	/* better be free inodes in this iag if it is on the
+	 * list.
+	 */
+	if (!iagp->nfreeinos) {
+		IREAD_UNLOCK(imap->im_ipimap);
+		release_metapage(mp);
+		jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
+		return -EIO;
+	}
+
+	/* scan the free inode summary map to find an extent
+	 * with free inodes.
+	 */
+	for (sword = 0;; sword++) {
+		if (sword >= SMAPSZ) {
+			IREAD_UNLOCK(imap->im_ipimap);
+			release_metapage(mp);
+			jfs_error(ip->i_sb,
+				  "free inode not found in summary map\n");
+			return -EIO;
+		}
+
+		if (~iagp->inosmap[sword])
+			break;
+	}
+
+	/* found a extent with free inodes. determine
+	 * the extent number.
+	 */
+	rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0);
+	if (rem >= EXTSPERSUM) {
+		IREAD_UNLOCK(imap->im_ipimap);
+		release_metapage(mp);
+		jfs_error(ip->i_sb, "no free extent found\n");
+		return -EIO;
+	}
+	extno = (sword << L2EXTSPERSUM) + rem;
+
+	/* find the first free inode in the extent.
+	 */
+	rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0);
+	if (rem >= INOSPEREXT) {
+		IREAD_UNLOCK(imap->im_ipimap);
+		release_metapage(mp);
+		jfs_error(ip->i_sb, "free inode not found\n");
+		return -EIO;
+	}
+
+	/* compute the inode number within the iag.
+	 */
+	ino = (extno << L2INOSPEREXT) + rem;
+
+	/* allocate the inode.
+	 */
+	rc = diAllocBit(imap, iagp, ino);
+	IREAD_UNLOCK(imap->im_ipimap);
+	if (rc) {
+		release_metapage(mp);
+		return (rc);
+	}
+
+	/* set the results of the allocation and write the iag.
+	 */
+	diInitInode(ip, iagno, ino, extno, iagp);
+	write_metapage(mp);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	diAllocExt(imap,agno,ip)
+ *
+ * FUNCTION:	add a new extent of free inodes to an iag, allocating
+ *		an inode from this extent to satisfy the current allocation
+ *		request.
+ *
+ *		this routine first tries to find an existing iag with free
+ *		extents through the ag free extent list.  if list is not
+ *		empty, the head of the list will be selected as the home
+ *		of the new extent of free inodes.  otherwise (the list is
+ *		empty), a new iag will be allocated for the ag to contain
+ *		the extent.
+ *
+ *		once an iag has been selected, the free extent summary map
+ *		is used to locate a free extent within the iag and diNewExt()
+ *		is called to initialize the extent, with initialization
+ *		including the allocation of the first inode of the extent
+ *		for the purpose of satisfying this request.
+ *
+ * PARAMETERS:
+ *	imap	- pointer to inode map control structure.
+ *	agno	- allocation group number.
+ *	ip	- pointer to new inode to be filled in on successful return
+ *		  with the disk inode number allocated, its extent address
+ *		  and the start of the ag.
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-ENOSPC	- insufficient disk resources.
+ *	-EIO	- i/o error.
+ */
+static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
+{
+	int rem, iagno, sword, extno, rc;
+	struct metapage *mp;
+	struct iag *iagp;
+
+	/* check if the ag has any iags with free extents.  if not,
+	 * allocate a new iag for the ag.
+	 */
+	if ((iagno = imap->im_agctl[agno].extfree) < 0) {
+		/* If successful, diNewIAG will obtain the read lock on the
+		 * imap inode.
+		 */
+		if ((rc = diNewIAG(imap, &iagno, agno, &mp))) {
+			return (rc);
+		}
+		iagp = (struct iag *) mp->data;
+
+		/* set the ag number if this a brand new iag
+		 */
+		iagp->agstart =
+		    cpu_to_le64(AGTOBLK(agno, imap->im_ipimap));
+	} else {
+		/* read the iag.
+		 */
+		IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
+		if ((rc = diIAGRead(imap, iagno, &mp))) {
+			IREAD_UNLOCK(imap->im_ipimap);
+			jfs_error(ip->i_sb, "error reading iag\n");
+			return rc;
+		}
+		iagp = (struct iag *) mp->data;
+	}
+
+	/* using the free extent summary map, find a free extent.
+	 */
+	for (sword = 0;; sword++) {
+		if (sword >= SMAPSZ) {
+			release_metapage(mp);
+			IREAD_UNLOCK(imap->im_ipimap);
+			jfs_error(ip->i_sb, "free ext summary map not found\n");
+			return -EIO;
+		}
+		if (~iagp->extsmap[sword])
+			break;
+	}
+
+	/* determine the extent number of the free extent.
+	 */
+	rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0);
+	if (rem >= EXTSPERSUM) {
+		release_metapage(mp);
+		IREAD_UNLOCK(imap->im_ipimap);
+		jfs_error(ip->i_sb, "free extent not found\n");
+		return -EIO;
+	}
+	extno = (sword << L2EXTSPERSUM) + rem;
+
+	/* initialize the new extent.
+	 */
+	rc = diNewExt(imap, iagp, extno);
+	IREAD_UNLOCK(imap->im_ipimap);
+	if (rc) {
+		/* something bad happened.  if a new iag was allocated,
+		 * place it back on the inode map's iag free list, and
+		 * clear the ag number information.
+		 */
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+			IAGFREE_LOCK(imap);
+			iagp->iagfree = cpu_to_le32(imap->im_freeiag);
+			imap->im_freeiag = iagno;
+			IAGFREE_UNLOCK(imap);
+		}
+		write_metapage(mp);
+		return (rc);
+	}
+
+	/* set the results of the allocation and write the iag.
+	 */
+	diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp);
+
+	write_metapage(mp);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	diAllocBit(imap,iagp,ino)
+ *
+ * FUNCTION:	allocate a backed inode from an iag.
+ *
+ *		this routine performs the mechanics of allocating a
+ *		specified inode from a backed extent.
+ *
+ *		if the inode to be allocated represents the last free
+ *		inode within the iag, the iag will be removed from the
+ *		ag free inode list.
+ *
+ *		a careful update approach is used to provide consistency
+ *		in the face of updates to multiple buffers.  under this
+ *		approach, all required buffers are obtained before making
+ *		any updates and are held all are updates are complete.
+ *
+ * PRE CONDITION: Already have buffer lock on iagp.  Already have AG lock on
+ *	this AG.  Must have read lock on imap inode.
+ *
+ * PARAMETERS:
+ *	imap	- pointer to inode map control structure.
+ *	iagp	- pointer to iag.
+ *	ino	- inode number to be allocated within the iag.
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-ENOSPC	- insufficient disk resources.
+ *	-EIO	- i/o error.
+ */
+static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
+{
+	int extno, bitno, agno, sword, rc;
+	struct metapage *amp = NULL, *bmp = NULL;
+	struct iag *aiagp = NULL, *biagp = NULL;
+	u32 mask;
+
+	/* check if this is the last free inode within the iag.
+	 * if so, it will have to be removed from the ag free
+	 * inode list, so get the iags preceding and following
+	 * it on the list.
+	 */
+	if (iagp->nfreeinos == cpu_to_le32(1)) {
+		if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) {
+			if ((rc =
+			     diIAGRead(imap, le32_to_cpu(iagp->inofreefwd),
+				       &amp)))
+				return (rc);
+			aiagp = (struct iag *) amp->data;
+		}
+
+		if ((int) le32_to_cpu(iagp->inofreeback) >= 0) {
+			if ((rc =
+			     diIAGRead(imap,
+				       le32_to_cpu(iagp->inofreeback),
+				       &bmp))) {
+				if (amp)
+					release_metapage(amp);
+				return (rc);
+			}
+			biagp = (struct iag *) bmp->data;
+		}
+	}
+
+	/* get the ag number, extent number, inode number within
+	 * the extent.
+	 */
+	agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb));
+	extno = ino >> L2INOSPEREXT;
+	bitno = ino & (INOSPEREXT - 1);
+
+	/* compute the mask for setting the map.
+	 */
+	mask = HIGHORDER >> bitno;
+
+	/* the inode should be free and backed.
+	 */
+	if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) ||
+	    ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) ||
+	    (addressPXD(&iagp->inoext[extno]) == 0)) {
+		if (amp)
+			release_metapage(amp);
+		if (bmp)
+			release_metapage(bmp);
+
+		jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
+		return -EIO;
+	}
+
+	/* mark the inode as allocated in the working map.
+	 */
+	iagp->wmap[extno] |= cpu_to_le32(mask);
+
+	/* check if all inodes within the extent are now
+	 * allocated.  if so, update the free inode summary
+	 * map to reflect this.
+	 */
+	if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
+		sword = extno >> L2EXTSPERSUM;
+		bitno = extno & (EXTSPERSUM - 1);
+		iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno);
+	}
+
+	/* if this was the last free inode in the iag, remove the
+	 * iag from the ag free inode list.
+	 */
+	if (iagp->nfreeinos == cpu_to_le32(1)) {
+		if (amp) {
+			aiagp->inofreeback = iagp->inofreeback;
+			write_metapage(amp);
+		}
+
+		if (bmp) {
+			biagp->inofreefwd = iagp->inofreefwd;
+			write_metapage(bmp);
+		} else {
+			imap->im_agctl[agno].inofree =
+			    le32_to_cpu(iagp->inofreefwd);
+		}
+		iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
+	}
+
+	/* update the free inode count at the iag, ag, inode
+	 * map levels.
+	 */
+	le32_add_cpu(&iagp->nfreeinos, -1);
+	imap->im_agctl[agno].numfree -= 1;
+	atomic_dec(&imap->im_numfree);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	diNewExt(imap,iagp,extno)
+ *
+ * FUNCTION:	initialize a new extent of inodes for an iag, allocating
+ *		the first inode of the extent for use for the current
+ *		allocation request.
+ *
+ *		disk resources are allocated for the new extent of inodes
+ *		and the inodes themselves are initialized to reflect their
+ *		existence within the extent (i.e. their inode numbers and
+ *		inode extent addresses are set) and their initial state
+ *		(mode and link count are set to zero).
+ *
+ *		if the iag is new, it is not yet on an ag extent free list
+ *		but will now be placed on this list.
+ *
+ *		if the allocation of the new extent causes the iag to
+ *		have no free extent, the iag will be removed from the
+ *		ag extent free list.
+ *
+ *		if the iag has no free backed inodes, it will be placed
+ *		on the ag free inode list, since the addition of the new
+ *		extent will now cause it to have free inodes.
+ *
+ *		a careful update approach is used to provide consistency
+ *		(i.e. list consistency) in the face of updates to multiple
+ *		buffers.  under this approach, all required buffers are
+ *		obtained before making any updates and are held until all
+ *		updates are complete.
+ *
+ * PRE CONDITION: Already have buffer lock on iagp.  Already have AG lock on
+ *	this AG.  Must have read lock on imap inode.
+ *
+ * PARAMETERS:
+ *	imap	- pointer to inode map control structure.
+ *	iagp	- pointer to iag.
+ *	extno	- extent number.
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-ENOSPC	- insufficient disk resources.
+ *	-EIO	- i/o error.
+ */
+static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
+{
+	int agno, iagno, fwd, back, freei = 0, sword, rc;
+	struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL;
+	struct metapage *amp, *bmp, *cmp, *dmp;
+	struct inode *ipimap;
+	s64 blkno, hint;
+	int i, j;
+	u32 mask;
+	ino_t ino;
+	struct dinode *dp;
+	struct jfs_sb_info *sbi;
+
+	/* better have free extents.
+	 */
+	if (!iagp->nfreeexts) {
+		jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
+		return -EIO;
+	}
+
+	/* get the inode map inode.
+	 */
+	ipimap = imap->im_ipimap;
+	sbi = JFS_SBI(ipimap->i_sb);
+
+	amp = bmp = cmp = NULL;
+
+	/* get the ag and iag numbers for this iag.
+	 */
+	agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+	iagno = le32_to_cpu(iagp->iagnum);
+
+	/* check if this is the last free extent within the
+	 * iag.  if so, the iag must be removed from the ag
+	 * free extent list, so get the iags preceding and
+	 * following the iag on this list.
+	 */
+	if (iagp->nfreeexts == cpu_to_le32(1)) {
+		if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+			if ((rc = diIAGRead(imap, fwd, &amp)))
+				return (rc);
+			aiagp = (struct iag *) amp->data;
+		}
+
+		if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+			if ((rc = diIAGRead(imap, back, &bmp)))
+				goto error_out;
+			biagp = (struct iag *) bmp->data;
+		}
+	} else {
+		/* the iag has free extents.  if all extents are free
+		 * (as is the case for a newly allocated iag), the iag
+		 * must be added to the ag free extent list, so get
+		 * the iag at the head of the list in preparation for
+		 * adding this iag to this list.
+		 */
+		fwd = back = -1;
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+			if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+				if ((rc = diIAGRead(imap, fwd, &amp)))
+					goto error_out;
+				aiagp = (struct iag *) amp->data;
+			}
+		}
+	}
+
+	/* check if the iag has no free inodes.  if so, the iag
+	 * will have to be added to the ag free inode list, so get
+	 * the iag at the head of the list in preparation for
+	 * adding this iag to this list.  in doing this, we must
+	 * check if we already have the iag at the head of
+	 * the list in hand.
+	 */
+	if (iagp->nfreeinos == 0) {
+		freei = imap->im_agctl[agno].inofree;
+
+		if (freei >= 0) {
+			if (freei == fwd) {
+				ciagp = aiagp;
+			} else if (freei == back) {
+				ciagp = biagp;
+			} else {
+				if ((rc = diIAGRead(imap, freei, &cmp)))
+					goto error_out;
+				ciagp = (struct iag *) cmp->data;
+			}
+			if (ciagp == NULL) {
+				jfs_error(imap->im_ipimap->i_sb,
+					  "ciagp == NULL\n");
+				rc = -EIO;
+				goto error_out;
+			}
+		}
+	}
+
+	/* allocate disk space for the inode extent.
+	 */
+	if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0))
+		hint = ((s64) agno << sbi->bmap->db_agl2size) - 1;
+	else
+		hint = addressPXD(&iagp->inoext[extno - 1]) +
+		    lengthPXD(&iagp->inoext[extno - 1]) - 1;
+
+	if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno)))
+		goto error_out;
+
+	/* compute the inode number of the first inode within the
+	 * extent.
+	 */
+	ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT);
+
+	/* initialize the inodes within the newly allocated extent a
+	 * page at a time.
+	 */
+	for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) {
+		/* get a buffer for this page of disk inodes.
+		 */
+		dmp = get_metapage(ipimap, blkno + i, PSIZE, 1);
+		if (dmp == NULL) {
+			rc = -EIO;
+			goto error_out;
+		}
+		dp = (struct dinode *) dmp->data;
+
+		/* initialize the inode number, mode, link count and
+		 * inode extent address.
+		 */
+		for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) {
+			dp->di_inostamp = cpu_to_le32(sbi->inostamp);
+			dp->di_number = cpu_to_le32(ino);
+			dp->di_fileset = cpu_to_le32(FILESYSTEM_I);
+			dp->di_mode = 0;
+			dp->di_nlink = 0;
+			PXDaddress(&(dp->di_ixpxd), blkno);
+			PXDlength(&(dp->di_ixpxd), imap->im_nbperiext);
+		}
+		write_metapage(dmp);
+	}
+
+	/* if this is the last free extent within the iag, remove the
+	 * iag from the ag free extent list.
+	 */
+	if (iagp->nfreeexts == cpu_to_le32(1)) {
+		if (fwd >= 0)
+			aiagp->extfreeback = iagp->extfreeback;
+
+		if (back >= 0)
+			biagp->extfreefwd = iagp->extfreefwd;
+		else
+			imap->im_agctl[agno].extfree =
+			    le32_to_cpu(iagp->extfreefwd);
+
+		iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
+	} else {
+		/* if the iag has all free extents (newly allocated iag),
+		 * add the iag to the ag free extent list.
+		 */
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+			if (fwd >= 0)
+				aiagp->extfreeback = cpu_to_le32(iagno);
+
+			iagp->extfreefwd = cpu_to_le32(fwd);
+			iagp->extfreeback = cpu_to_le32(-1);
+			imap->im_agctl[agno].extfree = iagno;
+		}
+	}
+
+	/* if the iag has no free inodes, add the iag to the
+	 * ag free inode list.
+	 */
+	if (iagp->nfreeinos == 0) {
+		if (freei >= 0)
+			ciagp->inofreeback = cpu_to_le32(iagno);
+
+		iagp->inofreefwd =
+		    cpu_to_le32(imap->im_agctl[agno].inofree);
+		iagp->inofreeback = cpu_to_le32(-1);
+		imap->im_agctl[agno].inofree = iagno;
+	}
+
+	/* initialize the extent descriptor of the extent. */
+	PXDlength(&iagp->inoext[extno], imap->im_nbperiext);
+	PXDaddress(&iagp->inoext[extno], blkno);
+
+	/* initialize the working and persistent map of the extent.
+	 * the working map will be initialized such that
+	 * it indicates the first inode of the extent is allocated.
+	 */
+	iagp->wmap[extno] = cpu_to_le32(HIGHORDER);
+	iagp->pmap[extno] = 0;
+
+	/* update the free inode and free extent summary maps
+	 * for the extent to indicate the extent has free inodes
+	 * and no longer represents a free extent.
+	 */
+	sword = extno >> L2EXTSPERSUM;
+	mask = HIGHORDER >> (extno & (EXTSPERSUM - 1));
+	iagp->extsmap[sword] |= cpu_to_le32(mask);
+	iagp->inosmap[sword] &= cpu_to_le32(~mask);
+
+	/* update the free inode and free extent counts for the
+	 * iag.
+	 */
+	le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1));
+	le32_add_cpu(&iagp->nfreeexts, -1);
+
+	/* update the free and backed inode counts for the ag.
+	 */
+	imap->im_agctl[agno].numfree += (INOSPEREXT - 1);
+	imap->im_agctl[agno].numinos += INOSPEREXT;
+
+	/* update the free and backed inode counts for the inode map.
+	 */
+	atomic_add(INOSPEREXT - 1, &imap->im_numfree);
+	atomic_add(INOSPEREXT, &imap->im_numinos);
+
+	/* write the iags.
+	 */
+	if (amp)
+		write_metapage(amp);
+	if (bmp)
+		write_metapage(bmp);
+	if (cmp)
+		write_metapage(cmp);
+
+	return (0);
+
+      error_out:
+
+	/* release the iags.
+	 */
+	if (amp)
+		release_metapage(amp);
+	if (bmp)
+		release_metapage(bmp);
+	if (cmp)
+		release_metapage(cmp);
+
+	return (rc);
+}
+
+
+/*
+ * NAME:	diNewIAG(imap,iagnop,agno)
+ *
+ * FUNCTION:	allocate a new iag for an allocation group.
+ *
+ *		first tries to allocate the iag from the inode map
+ *		iagfree list:
+ *		if the list has free iags, the head of the list is removed
+ *		and returned to satisfy the request.
+ *		if the inode map's iag free list is empty, the inode map
+ *		is extended to hold a new iag. this new iag is initialized
+ *		and returned to satisfy the request.
+ *
+ * PARAMETERS:
+ *	imap	- pointer to inode map control structure.
+ *	iagnop	- pointer to an iag number set with the number of the
+ *		  newly allocated iag upon successful return.
+ *	agno	- allocation group number.
+ *	bpp	- Buffer pointer to be filled in with new IAG's buffer
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-ENOSPC	- insufficient disk resources.
+ *	-EIO	- i/o error.
+ *
+ * serialization:
+ *	AG lock held on entry/exit;
+ *	write lock on the map is held inside;
+ *	read lock on the map is held on successful completion;
+ *
+ * note: new iag transaction:
+ * . synchronously write iag;
+ * . write log of xtree and inode of imap;
+ * . commit;
+ * . synchronous write of xtree (right to left, bottom to top);
+ * . at start of logredo(): init in-memory imap with one additional iag page;
+ * . at end of logredo(): re-read imap inode to determine
+ *   new imap size;
+ */
+static int
+diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
+{
+	int rc;
+	int iagno, i, xlen;
+	struct inode *ipimap;
+	struct super_block *sb;
+	struct jfs_sb_info *sbi;
+	struct metapage *mp;
+	struct iag *iagp;
+	s64 xaddr = 0;
+	s64 blkno;
+	tid_t tid;
+	struct inode *iplist[1];
+
+	/* pick up pointers to the inode map and mount inodes */
+	ipimap = imap->im_ipimap;
+	sb = ipimap->i_sb;
+	sbi = JFS_SBI(sb);
+
+	/* acquire the free iag lock */
+	IAGFREE_LOCK(imap);
+
+	/* if there are any iags on the inode map free iag list,
+	 * allocate the iag from the head of the list.
+	 */
+	if (imap->im_freeiag >= 0) {
+		/* pick up the iag number at the head of the list */
+		iagno = imap->im_freeiag;
+
+		/* determine the logical block number of the iag */
+		blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
+	} else {
+		/* no free iags. the inode map will have to be extented
+		 * to include a new iag.
+		 */
+
+		/* acquire inode map lock */
+		IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
+
+		if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
+			IWRITE_UNLOCK(ipimap);
+			IAGFREE_UNLOCK(imap);
+			jfs_error(imap->im_ipimap->i_sb,
+				  "ipimap->i_size is wrong\n");
+			return -EIO;
+		}
+
+
+		/* get the next available iag number */
+		iagno = imap->im_nextiag;
+
+		/* make sure that we have not exceeded the maximum inode
+		 * number limit.
+		 */
+		if (iagno > (MAXIAGS - 1)) {
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			rc = -ENOSPC;
+			goto out;
+		}
+
+		/*
+		 * synchronously append new iag page.
+		 */
+		/* determine the logical address of iag page to append */
+		blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
+
+		/* Allocate extent for new iag page */
+		xlen = sbi->nbperpage;
+		if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) {
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			goto out;
+		}
+
+		/*
+		 * start transaction of update of the inode map
+		 * addressing structure pointing to the new iag page;
+		 */
+		tid = txBegin(sb, COMMIT_FORCE);
+		mutex_lock(&JFS_IP(ipimap)->commit_mutex);
+
+		/* update the inode map addressing structure to point to it */
+		if ((rc =
+		     xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) {
+			txEnd(tid);
+			mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
+			/* Free the blocks allocated for the iag since it was
+			 * not successfully added to the inode map
+			 */
+			dbFree(ipimap, xaddr, (s64) xlen);
+
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			goto out;
+		}
+
+		/* update the inode map's inode to reflect the extension */
+		ipimap->i_size += PSIZE;
+		inode_add_bytes(ipimap, PSIZE);
+
+		/* assign a buffer for the page */
+		mp = get_metapage(ipimap, blkno, PSIZE, 0);
+		if (!mp) {
+			/*
+			 * This is very unlikely since we just created the
+			 * extent, but let's try to handle it correctly
+			 */
+			xtTruncate(tid, ipimap, ipimap->i_size - PSIZE,
+				   COMMIT_PWMAP);
+
+			txAbort(tid, 0);
+			txEnd(tid);
+			mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
+
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			rc = -EIO;
+			goto out;
+		}
+		iagp = (struct iag *) mp->data;
+
+		/* init the iag */
+		memset(iagp, 0, sizeof(struct iag));
+		iagp->iagnum = cpu_to_le32(iagno);
+		iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
+		iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
+		iagp->iagfree = cpu_to_le32(-1);
+		iagp->nfreeinos = 0;
+		iagp->nfreeexts = cpu_to_le32(EXTSPERIAG);
+
+		/* initialize the free inode summary map (free extent
+		 * summary map initialization handled by bzero).
+		 */
+		for (i = 0; i < SMAPSZ; i++)
+			iagp->inosmap[i] = cpu_to_le32(ONES);
+
+		/*
+		 * Write and sync the metapage
+		 */
+		flush_metapage(mp);
+
+		/*
+		 * txCommit(COMMIT_FORCE) will synchronously write address
+		 * index pages and inode after commit in careful update order
+		 * of address index pages (right to left, bottom up);
+		 */
+		iplist[0] = ipimap;
+		rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
+
+		txEnd(tid);
+		mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
+
+		duplicateIXtree(sb, blkno, xlen, &xaddr);
+
+		/* update the next available iag number */
+		imap->im_nextiag += 1;
+
+		/* Add the iag to the iag free list so we don't lose the iag
+		 * if a failure happens now.
+		 */
+		imap->im_freeiag = iagno;
+
+		/* Until we have logredo working, we want the imap inode &
+		 * control page to be up to date.
+		 */
+		diSync(ipimap);
+
+		/* release the inode map lock */
+		IWRITE_UNLOCK(ipimap);
+	}
+
+	/* obtain read lock on map */
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+
+	/* read the iag */
+	if ((rc = diIAGRead(imap, iagno, &mp))) {
+		IREAD_UNLOCK(ipimap);
+		rc = -EIO;
+		goto out;
+	}
+	iagp = (struct iag *) mp->data;
+
+	/* remove the iag from the iag free list */
+	imap->im_freeiag = le32_to_cpu(iagp->iagfree);
+	iagp->iagfree = cpu_to_le32(-1);
+
+	/* set the return iag number and buffer pointer */
+	*iagnop = iagno;
+	*mpp = mp;
+
+      out:
+	/* release the iag free lock */
+	IAGFREE_UNLOCK(imap);
+
+	return (rc);
+}
+
+/*
+ * NAME:	diIAGRead()
+ *
+ * FUNCTION:	get the buffer for the specified iag within a fileset
+ *		or aggregate inode map.
+ *
+ * PARAMETERS:
+ *	imap	- pointer to inode map control structure.
+ *	iagno	- iag number.
+ *	bpp	- point to buffer pointer to be filled in on successful
+ *		  exit.
+ *
+ * SERIALIZATION:
+ *	must have read lock on imap inode
+ *	(When called by diExtendFS, the filesystem is quiesced, therefore
+ *	 the read lock is unnecessary.)
+ *
+ * RETURN VALUES:
+ *	0	- success.
+ *	-EIO	- i/o error.
+ */
+static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
+{
+	struct inode *ipimap = imap->im_ipimap;
+	s64 blkno;
+
+	/* compute the logical block number of the iag. */
+	blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage);
+
+	/* read the iag. */
+	*mpp = read_metapage(ipimap, blkno, PSIZE, 0);
+	if (*mpp == NULL) {
+		return -EIO;
+	}
+
+	return (0);
+}
+
+/*
+ * NAME:	diFindFree()
+ *
+ * FUNCTION:	find the first free bit in a word starting at
+ *		the specified bit position.
+ *
+ * PARAMETERS:
+ *	word	- word to be examined.
+ *	start	- starting bit position.
+ *
+ * RETURN VALUES:
+ *	bit position of first free bit in the word or 32 if
+ *	no free bits were found.
+ */
+static int diFindFree(u32 word, int start)
+{
+	int bitno;
+	assert(start < 32);
+	/* scan the word for the first free bit. */
+	for (word <<= start, bitno = start; bitno < 32;
+	     bitno++, word <<= 1) {
+		if ((word & HIGHORDER) == 0)
+			break;
+	}
+	return (bitno);
+}
+
+/*
+ * NAME:	diUpdatePMap()
+ *
+ * FUNCTION: Update the persistent map in an IAG for the allocation or
+ *	freeing of the specified inode.
+ *
+ * PRE CONDITIONS: Working map has already been updated for allocate.
+ *
+ * PARAMETERS:
+ *	ipimap	- Incore inode map inode
+ *	inum	- Number of inode to mark in permanent map
+ *	is_free	- If 'true' indicates inode should be marked freed, otherwise
+ *		  indicates inode should be marked allocated.
+ *
+ * RETURN VALUES:
+ *		0 for success
+ */
+int
+diUpdatePMap(struct inode *ipimap,
+	     unsigned long inum, bool is_free, struct tblock * tblk)
+{
+	int rc;
+	struct iag *iagp;
+	struct metapage *mp;
+	int iagno, ino, extno, bitno;
+	struct inomap *imap;
+	u32 mask;
+	struct jfs_log *log;
+	int lsn, difft, diffp;
+	unsigned long flags;
+
+	imap = JFS_IP(ipimap)->i_imap;
+	/* get the iag number containing the inode */
+	iagno = INOTOIAG(inum);
+	/* make sure that the iag is contained within the map */
+	if (iagno >= imap->im_nextiag) {
+		jfs_error(ipimap->i_sb, "the iag is outside the map\n");
+		return -EIO;
+	}
+	/* read the iag */
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+	rc = diIAGRead(imap, iagno, &mp);
+	IREAD_UNLOCK(ipimap);
+	if (rc)
+		return (rc);
+	metapage_wait_for_io(mp);
+	iagp = (struct iag *) mp->data;
+	/* get the inode number and extent number of the inode within
+	 * the iag and the inode number within the extent.
+	 */
+	ino = inum & (INOSPERIAG - 1);
+	extno = ino >> L2INOSPEREXT;
+	bitno = ino & (INOSPEREXT - 1);
+	mask = HIGHORDER >> bitno;
+	/*
+	 * mark the inode free in persistent map:
+	 */
+	if (is_free) {
+		/* The inode should have been allocated both in working
+		 * map and in persistent map;
+		 * the inode will be freed from working map at the release
+		 * of last reference release;
+		 */
+		if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+			jfs_error(ipimap->i_sb,
+				  "inode %ld not marked as allocated in wmap!\n",
+				  inum);
+		}
+		if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
+			jfs_error(ipimap->i_sb,
+				  "inode %ld not marked as allocated in pmap!\n",
+				  inum);
+		}
+		/* update the bitmap for the extent of the freed inode */
+		iagp->pmap[extno] &= cpu_to_le32(~mask);
+	}
+	/*
+	 * mark the inode allocated in persistent map:
+	 */
+	else {
+		/* The inode should be already allocated in the working map
+		 * and should be free in persistent map;
+		 */
+		if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+			release_metapage(mp);
+			jfs_error(ipimap->i_sb,
+				  "the inode is not allocated in the working map\n");
+			return -EIO;
+		}
+		if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
+			release_metapage(mp);
+			jfs_error(ipimap->i_sb,
+				  "the inode is not free in the persistent map\n");
+			return -EIO;
+		}
+		/* update the bitmap for the extent of the allocated inode */
+		iagp->pmap[extno] |= cpu_to_le32(mask);
+	}
+	/*
+	 * update iag lsn
+	 */
+	lsn = tblk->lsn;
+	log = JFS_SBI(tblk->sb)->log;
+	LOGSYNC_LOCK(log, flags);
+	if (mp->lsn != 0) {
+		/* inherit older/smaller lsn */
+		logdiff(difft, lsn, log);
+		logdiff(diffp, mp->lsn, log);
+		if (difft < diffp) {
+			mp->lsn = lsn;
+			/* move mp after tblock in logsync list */
+			list_move(&mp->synclist, &tblk->synclist);
+		}
+		/* inherit younger/larger clsn */
+		assert(mp->clsn);
+		logdiff(difft, tblk->clsn, log);
+		logdiff(diffp, mp->clsn, log);
+		if (difft > diffp)
+			mp->clsn = tblk->clsn;
+	} else {
+		mp->log = log;
+		mp->lsn = lsn;
+		/* insert mp after tblock in logsync list */
+		log->count++;
+		list_add(&mp->synclist, &tblk->synclist);
+		mp->clsn = tblk->clsn;
+	}
+	LOGSYNC_UNLOCK(log, flags);
+	write_metapage(mp);
+	return (0);
+}
+
+/*
+ *	diExtendFS()
+ *
+ * function: update imap for extendfs();
+ *
+ * note: AG size has been increased s.t. each k old contiguous AGs are
+ * coalesced into a new AG;
+ */
+int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
+{
+	int rc, rcx = 0;
+	struct inomap *imap = JFS_IP(ipimap)->i_imap;
+	struct iag *iagp = NULL, *hiagp = NULL;
+	struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap;
+	struct metapage *bp, *hbp;
+	int i, n, head;
+	int numinos, xnuminos = 0, xnumfree = 0;
+	s64 agstart;
+
+	jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d",
+		   imap->im_nextiag, atomic_read(&imap->im_numinos),
+		   atomic_read(&imap->im_numfree));
+
+	/*
+	 *	reconstruct imap
+	 *
+	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
+	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
+	 * note: new AG size = old AG size * (2**x).
+	 */
+
+	/* init per AG control information im_agctl[] */
+	for (i = 0; i < MAXAG; i++) {
+		imap->im_agctl[i].inofree = -1;
+		imap->im_agctl[i].extfree = -1;
+		imap->im_agctl[i].numinos = 0;	/* number of backed inodes */
+		imap->im_agctl[i].numfree = 0;	/* number of free backed inodes */
+	}
+
+	/*
+	 *	process each iag page of the map.
+	 *
+	 * rebuild AG Free Inode List, AG Free Inode Extent List;
+	 */
+	for (i = 0; i < imap->im_nextiag; i++) {
+		if ((rc = diIAGRead(imap, i, &bp))) {
+			rcx = rc;
+			continue;
+		}
+		iagp = (struct iag *) bp->data;
+		if (le32_to_cpu(iagp->iagnum) != i) {
+			release_metapage(bp);
+			jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
+			return -EIO;
+		}
+
+		/* leave free iag in the free iag list */
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+			release_metapage(bp);
+			continue;
+		}
+
+		agstart = le64_to_cpu(iagp->agstart);
+		n = agstart >> mp->db_agl2size;
+		iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size);
+
+		/* compute backed inodes */
+		numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts))
+		    << L2INOSPEREXT;
+		if (numinos > 0) {
+			/* merge AG backed inodes */
+			imap->im_agctl[n].numinos += numinos;
+			xnuminos += numinos;
+		}
+
+		/* if any backed free inodes, insert at AG free inode list */
+		if ((int) le32_to_cpu(iagp->nfreeinos) > 0) {
+			if ((head = imap->im_agctl[n].inofree) == -1) {
+				iagp->inofreefwd = cpu_to_le32(-1);
+				iagp->inofreeback = cpu_to_le32(-1);
+			} else {
+				if ((rc = diIAGRead(imap, head, &hbp))) {
+					rcx = rc;
+					goto nextiag;
+				}
+				hiagp = (struct iag *) hbp->data;
+				hiagp->inofreeback = iagp->iagnum;
+				iagp->inofreefwd = cpu_to_le32(head);
+				iagp->inofreeback = cpu_to_le32(-1);
+				write_metapage(hbp);
+			}
+
+			imap->im_agctl[n].inofree =
+			    le32_to_cpu(iagp->iagnum);
+
+			/* merge AG backed free inodes */
+			imap->im_agctl[n].numfree +=
+			    le32_to_cpu(iagp->nfreeinos);
+			xnumfree += le32_to_cpu(iagp->nfreeinos);
+		}
+
+		/* if any free extents, insert at AG free extent list */
+		if (le32_to_cpu(iagp->nfreeexts) > 0) {
+			if ((head = imap->im_agctl[n].extfree) == -1) {
+				iagp->extfreefwd = cpu_to_le32(-1);
+				iagp->extfreeback = cpu_to_le32(-1);
+			} else {
+				if ((rc = diIAGRead(imap, head, &hbp))) {
+					rcx = rc;
+					goto nextiag;
+				}
+				hiagp = (struct iag *) hbp->data;
+				hiagp->extfreeback = iagp->iagnum;
+				iagp->extfreefwd = cpu_to_le32(head);
+				iagp->extfreeback = cpu_to_le32(-1);
+				write_metapage(hbp);
+			}
+
+			imap->im_agctl[n].extfree =
+			    le32_to_cpu(iagp->iagnum);
+		}
+
+	      nextiag:
+		write_metapage(bp);
+	}
+
+	if (xnuminos != atomic_read(&imap->im_numinos) ||
+	    xnumfree != atomic_read(&imap->im_numfree)) {
+		jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
+		return -EIO;
+	}
+
+	return rcx;
+}
+
+
+/*
+ *	duplicateIXtree()
+ *
+ * serialization: IWRITE_LOCK held on entry/exit
+ *
+ * note: shadow page with regular inode (rel.2);
+ */
+static void duplicateIXtree(struct super_block *sb, s64 blkno,
+			    int xlen, s64 *xaddr)
+{
+	struct jfs_superblock *j_sb;
+	struct buffer_head *bh;
+	struct inode *ip;
+	tid_t tid;
+
+	/* if AIT2 ipmap2 is bad, do not try to update it */
+	if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT)	/* s_flag */
+		return;
+	ip = diReadSpecial(sb, FILESYSTEM_I, 1);
+	if (ip == NULL) {
+		JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
+		if (readSuper(sb, &bh))
+			return;
+		j_sb = (struct jfs_superblock *)bh->b_data;
+		j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT);
+
+		mark_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
+		brelse(bh);
+		return;
+	}
+
+	/* start transaction */
+	tid = txBegin(sb, COMMIT_FORCE);
+	/* update the inode map addressing structure to point to it */
+	if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) {
+		JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
+		txAbort(tid, 1);
+		goto cleanup;
+
+	}
+	/* update the inode map's inode to reflect the extension */
+	ip->i_size += PSIZE;
+	inode_add_bytes(ip, PSIZE);
+	txCommit(tid, 1, &ip, COMMIT_FORCE);
+      cleanup:
+	txEnd(tid);
+	diFreeSpecial(ip);
+}
+
+/*
+ * NAME:	copy_from_dinode()
+ *
+ * FUNCTION:	Copies inode info from disk inode to in-memory inode
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOMEM	- insufficient memory
+ */
+static int copy_from_dinode(struct dinode * dip, struct inode *ip)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+
+	jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
+	jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
+	jfs_set_inode_flags(ip);
+
+	ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff;
+	if (sbi->umask != -1) {
+		ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask);
+		/* For directories, add x permission if r is allowed by umask */
+		if (S_ISDIR(ip->i_mode)) {
+			if (ip->i_mode & 0400)
+				ip->i_mode |= 0100;
+			if (ip->i_mode & 0040)
+				ip->i_mode |= 0010;
+			if (ip->i_mode & 0004)
+				ip->i_mode |= 0001;
+		}
+	}
+	set_nlink(ip, le32_to_cpu(dip->di_nlink));
+
+	jfs_ip->saved_uid = make_kuid(&init_user_ns, le32_to_cpu(dip->di_uid));
+	if (!uid_valid(sbi->uid))
+		ip->i_uid = jfs_ip->saved_uid;
+	else {
+		ip->i_uid = sbi->uid;
+	}
+
+	jfs_ip->saved_gid = make_kgid(&init_user_ns, le32_to_cpu(dip->di_gid));
+	if (!gid_valid(sbi->gid))
+		ip->i_gid = jfs_ip->saved_gid;
+	else {
+		ip->i_gid = sbi->gid;
+	}
+
+	ip->i_size = le64_to_cpu(dip->di_size);
+	ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
+	ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
+	ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
+	ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
+	ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
+	ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
+	ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
+	ip->i_generation = le32_to_cpu(dip->di_gen);
+
+	jfs_ip->ixpxd = dip->di_ixpxd;	/* in-memory pxd's are little-endian */
+	jfs_ip->acl = dip->di_acl;	/* as are dxd's */
+	jfs_ip->ea = dip->di_ea;
+	jfs_ip->next_index = le32_to_cpu(dip->di_next_index);
+	jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec);
+	jfs_ip->acltype = le32_to_cpu(dip->di_acltype);
+
+	if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) {
+		jfs_ip->dev = le32_to_cpu(dip->di_rdev);
+		ip->i_rdev = new_decode_dev(jfs_ip->dev);
+	}
+
+	if (S_ISDIR(ip->i_mode)) {
+		memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384);
+	} else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) {
+		memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288);
+	} else
+		memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128);
+
+	/* Zero the in-memory-only stuff */
+	jfs_ip->cflag = 0;
+	jfs_ip->btindex = 0;
+	jfs_ip->btorder = 0;
+	jfs_ip->bxflag = 0;
+	jfs_ip->blid = 0;
+	jfs_ip->atlhead = 0;
+	jfs_ip->atltail = 0;
+	jfs_ip->xtlid = 0;
+	return (0);
+}
+
+/*
+ * NAME:	copy_to_dinode()
+ *
+ * FUNCTION:	Copies inode info from in-memory inode to disk inode
+ */
+static void copy_to_dinode(struct dinode * dip, struct inode *ip)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+
+	dip->di_fileset = cpu_to_le32(jfs_ip->fileset);
+	dip->di_inostamp = cpu_to_le32(sbi->inostamp);
+	dip->di_number = cpu_to_le32(ip->i_ino);
+	dip->di_gen = cpu_to_le32(ip->i_generation);
+	dip->di_size = cpu_to_le64(ip->i_size);
+	dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks));
+	dip->di_nlink = cpu_to_le32(ip->i_nlink);
+	if (!uid_valid(sbi->uid))
+		dip->di_uid = cpu_to_le32(i_uid_read(ip));
+	else
+		dip->di_uid =cpu_to_le32(from_kuid(&init_user_ns,
+						   jfs_ip->saved_uid));
+	if (!gid_valid(sbi->gid))
+		dip->di_gid = cpu_to_le32(i_gid_read(ip));
+	else
+		dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns,
+						    jfs_ip->saved_gid));
+	jfs_get_inode_flags(jfs_ip);
+	/*
+	 * mode2 is only needed for storing the higher order bits.
+	 * Trust i_mode for the lower order ones
+	 */
+	if (sbi->umask == -1)
+		dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) |
+					   ip->i_mode);
+	else /* Leave the original permissions alone */
+		dip->di_mode = cpu_to_le32(jfs_ip->mode2);
+
+	dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
+	dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
+	dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec);
+	dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec);
+	dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
+	dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
+	dip->di_ixpxd = jfs_ip->ixpxd;	/* in-memory pxd's are little-endian */
+	dip->di_acl = jfs_ip->acl;	/* as are dxd's */
+	dip->di_ea = jfs_ip->ea;
+	dip->di_next_index = cpu_to_le32(jfs_ip->next_index);
+	dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime);
+	dip->di_otime.tv_nsec = 0;
+	dip->di_acltype = cpu_to_le32(jfs_ip->acltype);
+	if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode))
+		dip->di_rdev = cpu_to_le32(jfs_ip->dev);
+}
diff --git a/kernel/fs/jfs/jfs_imap.h b/kernel/fs/jfs/jfs_imap.h
new file mode 100644
index 000000000..610a0e9d8
--- /dev/null
+++ b/kernel/fs/jfs/jfs_imap.h
@@ -0,0 +1,175 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef	_H_JFS_IMAP
+#define _H_JFS_IMAP
+
+#include "jfs_txnmgr.h"
+
+/*
+ *	jfs_imap.h: disk inode manager
+ */
+
+#define	EXTSPERIAG	128	/* number of disk inode extent per iag	*/
+#define IMAPBLKNO	0	/* lblkno of dinomap within inode map	*/
+#define SMAPSZ		4	/* number of words per summary map	*/
+#define	EXTSPERSUM	32	/* number of extents per summary map entry */
+#define	L2EXTSPERSUM	5	/* l2 number of extents per summary map */
+#define	PGSPERIEXT	4	/* number of 4K pages per dinode extent */
+#define	MAXIAGS		((1<<20)-1)	/* maximum number of iags	*/
+#define	MAXAG		128	/* maximum number of allocation groups	*/
+
+#define AMAPSIZE	512	/* bytes in the IAG allocation maps */
+#define SMAPSIZE	16	/* bytes in the IAG summary maps */
+
+/* convert inode number to iag number */
+#define	INOTOIAG(ino)	((ino) >> L2INOSPERIAG)
+
+/* convert iag number to logical block number of the iag page */
+#define IAGTOLBLK(iagno,l2nbperpg)	(((iagno) + 1) << (l2nbperpg))
+
+/* get the starting block number of the 4K page of an inode extent
+ * that contains ino.
+ */
+#define INOPBLK(pxd,ino,l2nbperpg)	(addressPXD((pxd)) +		\
+	((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg)))
+
+/*
+ *	inode allocation map:
+ *
+ * inode allocation map consists of
+ * . the inode map control page and
+ * . inode allocation group pages (per 4096 inodes)
+ * which are addressed by standard JFS xtree.
+ */
+/*
+ *	inode allocation group page (per 4096 inodes of an AG)
+ */
+struct iag {
+	__le64 agstart;		/* 8: starting block of ag		*/
+	__le32 iagnum;		/* 4: inode allocation group number	*/
+	__le32 inofreefwd;	/* 4: ag inode free list forward	*/
+	__le32 inofreeback;	/* 4: ag inode free list back		*/
+	__le32 extfreefwd;	/* 4: ag inode extent free list forward	*/
+	__le32 extfreeback;	/* 4: ag inode extent free list back	*/
+	__le32 iagfree;		/* 4: iag free list			*/
+
+	/* summary map: 1 bit per inode extent */
+	__le32 inosmap[SMAPSZ];	/* 16: sum map of mapwords w/ free inodes;
+				 *	note: this indicates free and backed
+				 *	inodes, if the extent is not backed the
+				 *	value will be 1.  if the extent is
+				 *	backed but all inodes are being used the
+				 *	value will be 1.  if the extent is
+				 *	backed but at least one of the inodes is
+				 *	free the value will be 0.
+				 */
+	__le32 extsmap[SMAPSZ];	/* 16: sum map of mapwords w/ free extents */
+	__le32 nfreeinos;	/* 4: number of free inodes		*/
+	__le32 nfreeexts;	/* 4: number of free extents		*/
+	/* (72) */
+	u8 pad[1976];		/* 1976: pad to 2048 bytes */
+	/* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */
+	__le32 wmap[EXTSPERIAG];	/* 512: working allocation map */
+	__le32 pmap[EXTSPERIAG];	/* 512: persistent allocation map */
+	pxd_t inoext[EXTSPERIAG];	/* 1024: inode extent addresses */
+};				/* (4096) */
+
+/*
+ *	per AG control information (in inode map control page)
+ */
+struct iagctl_disk {
+	__le32 inofree;		/* 4: free inode list anchor		*/
+	__le32 extfree;		/* 4: free extent list anchor		*/
+	__le32 numinos;		/* 4: number of backed inodes		*/
+	__le32 numfree;		/* 4: number of free inodes		*/
+};				/* (16) */
+
+struct iagctl {
+	int inofree;		/* free inode list anchor		*/
+	int extfree;		/* free extent list anchor		*/
+	int numinos;		/* number of backed inodes		*/
+	int numfree;		/* number of free inodes		*/
+};
+
+/*
+ *	per fileset/aggregate inode map control page
+ */
+struct dinomap_disk {
+	__le32 in_freeiag;	/* 4: free iag list anchor	*/
+	__le32 in_nextiag;	/* 4: next free iag number	*/
+	__le32 in_numinos;	/* 4: num of backed inodes	*/
+	__le32 in_numfree;	/* 4: num of free backed inodes */
+	__le32 in_nbperiext;	/* 4: num of blocks per inode extent */
+	__le32 in_l2nbperiext;	/* 4: l2 of in_nbperiext	*/
+	__le32 in_diskblock;	/* 4: for standalone test driver */
+	__le32 in_maxag;	/* 4: for standalone test driver */
+	u8 pad[2016];		/* 2016: pad to 2048		*/
+	struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */
+};				/* (4096) */
+
+struct dinomap {
+	int in_freeiag;		/* free iag list anchor		*/
+	int in_nextiag;		/* next free iag number		*/
+	int in_numinos;		/* num of backed inodes		*/
+	int in_numfree;		/* num of free backed inodes	*/
+	int in_nbperiext;	/* num of blocks per inode extent */
+	int in_l2nbperiext;	/* l2 of in_nbperiext		*/
+	int in_diskblock;	/* for standalone test driver	*/
+	int in_maxag;		/* for standalone test driver	*/
+	struct iagctl in_agctl[MAXAG];	/* AG control information */
+};
+
+/*
+ *	In-core inode map control page
+ */
+struct inomap {
+	struct dinomap im_imap;		/* 4096: inode allocation control */
+	struct inode *im_ipimap;	/* 4: ptr to inode for imap	*/
+	struct mutex im_freelock;	/* 4: iag free list lock	*/
+	struct mutex im_aglock[MAXAG];	/* 512: per AG locks		*/
+	u32 *im_DBGdimap;
+	atomic_t im_numinos;	/* num of backed inodes */
+	atomic_t im_numfree;	/* num of free backed inodes */
+};
+
+#define	im_freeiag	im_imap.in_freeiag
+#define	im_nextiag	im_imap.in_nextiag
+#define	im_agctl	im_imap.in_agctl
+#define	im_nbperiext	im_imap.in_nbperiext
+#define	im_l2nbperiext	im_imap.in_l2nbperiext
+
+/* for standalone testdriver
+ */
+#define	im_diskblock	im_imap.in_diskblock
+#define	im_maxag	im_imap.in_maxag
+
+extern int diFree(struct inode *);
+extern int diAlloc(struct inode *, bool, struct inode *);
+extern int diSync(struct inode *);
+/* external references */
+extern int diUpdatePMap(struct inode *ipimap, unsigned long inum,
+			bool is_free, struct tblock * tblk);
+extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap);
+extern int diMount(struct inode *);
+extern int diUnmount(struct inode *, int);
+extern int diRead(struct inode *);
+extern struct inode *diReadSpecial(struct super_block *, ino_t, int);
+extern void diWriteSpecial(struct inode *, int);
+extern void diFreeSpecial(struct inode *);
+extern int diWrite(tid_t tid, struct inode *);
+#endif				/* _H_JFS_IMAP */
diff --git a/kernel/fs/jfs/jfs_incore.h b/kernel/fs/jfs/jfs_incore.h
new file mode 100644
index 000000000..fa7e795bd
--- /dev/null
+++ b/kernel/fs/jfs/jfs_incore.h
@@ -0,0 +1,228 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_INCORE
+#define _H_JFS_INCORE
+
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include "jfs_types.h"
+#include "jfs_xtree.h"
+#include "jfs_dtree.h"
+
+/*
+ * JFS magic number
+ */
+#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */
+
+/*
+ * JFS-private inode information
+ */
+struct jfs_inode_info {
+	int	fileset;	/* fileset number (always 16)*/
+	uint	mode2;		/* jfs-specific mode		*/
+	kuid_t	saved_uid;	/* saved for uid mount option */
+	kgid_t	saved_gid;	/* saved for gid mount option */
+	pxd_t	ixpxd;		/* inode extent descriptor	*/
+	dxd_t	acl;		/* dxd describing acl	*/
+	dxd_t	ea;		/* dxd describing ea	*/
+	time_t	otime;		/* time created	*/
+	uint	next_index;	/* next available directory entry index */
+	int	acltype;	/* Type of ACL	*/
+	short	btorder;	/* access order	*/
+	short	btindex;	/* btpage entry index*/
+	struct inode *ipimap;	/* inode map			*/
+	unsigned long cflag;	/* commit flags		*/
+	u64	agstart;	/* agstart of the containing IAG */
+	u16	bxflag;		/* xflag of pseudo buffer?	*/
+	unchar	pad;
+	signed char active_ag;	/* ag currently allocating from	*/
+	lid_t	blid;		/* lid of pseudo buffer?	*/
+	lid_t	atlhead;	/* anonymous tlock list head	*/
+	lid_t	atltail;	/* anonymous tlock list tail	*/
+	spinlock_t ag_lock;	/* protects active_ag		*/
+	struct list_head anon_inode_list; /* inodes having anonymous txns */
+	/*
+	 * rdwrlock serializes xtree between reads & writes and synchronizes
+	 * changes to special inodes.  It's use would be redundant on
+	 * directories since the i_mutex taken in the VFS is sufficient.
+	 */
+	struct rw_semaphore rdwrlock;
+	/*
+	 * commit_mutex serializes transaction processing on an inode.
+	 * It must be taken after beginning a transaction (txBegin), since
+	 * dirty inodes may be committed while a new transaction on the
+	 * inode is blocked in txBegin or TxBeginAnon
+	 */
+	struct mutex commit_mutex;
+	/* xattr_sem allows us to access the xattrs without taking i_mutex */
+	struct rw_semaphore xattr_sem;
+	lid_t	xtlid;		/* lid of xtree lock on directory */
+	union {
+		struct {
+			xtpage_t _xtroot;	/* 288: xtree root */
+			struct inomap *_imap;	/* 4: inode map header	*/
+		} file;
+		struct {
+			struct dir_table_slot _table[12]; /* 96: dir index */
+			dtroot_t _dtroot;	/* 288: dtree root */
+		} dir;
+		struct {
+			unchar _unused[16];	/* 16: */
+			dxd_t _dxd;		/* 16: */
+			unchar _inline[128];	/* 128: inline symlink */
+			/* _inline_ea may overlay the last part of
+			 * file._xtroot if maxentry = XTROOTINITSLOT
+			 */
+			unchar _inline_ea[128];	/* 128: inline extended attr */
+		} link;
+	} u;
+#ifdef CONFIG_QUOTA
+	struct dquot *i_dquot[MAXQUOTAS];
+#endif
+	u32 dev;	/* will die when we get wide dev_t */
+	struct inode	vfs_inode;
+};
+#define i_xtroot u.file._xtroot
+#define i_imap u.file._imap
+#define i_dirtable u.dir._table
+#define i_dtroot u.dir._dtroot
+#define i_inline u.link._inline
+#define i_inline_ea u.link._inline_ea
+
+#define IREAD_LOCK(ip, subclass) \
+	down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
+#define IREAD_UNLOCK(ip)	up_read(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_LOCK(ip, subclass) \
+	down_write_nested(&JFS_IP(ip)->rdwrlock, subclass)
+#define IWRITE_UNLOCK(ip)	up_write(&JFS_IP(ip)->rdwrlock)
+
+/*
+ * cflag
+ */
+enum cflags {
+	COMMIT_Nolink,		/* inode committed with zero link count */
+	COMMIT_Inlineea,	/* commit inode inline EA */
+	COMMIT_Freewmap,	/* free WMAP at iClose() */
+	COMMIT_Dirty,		/* Inode is really dirty */
+	COMMIT_Dirtable,	/* commit changes to di_dirtable */
+	COMMIT_Stale,		/* data extent is no longer valid */
+	COMMIT_Synclist,	/* metadata pages on group commit synclist */
+};
+
+/*
+ * commit_mutex nesting subclasses:
+ */
+enum commit_mutex_class
+{
+	COMMIT_MUTEX_PARENT,
+	COMMIT_MUTEX_CHILD,
+	COMMIT_MUTEX_SECOND_PARENT,	/* Renaming */
+	COMMIT_MUTEX_VICTIM		/* Inode being unlinked due to rename */
+};
+
+/*
+ * rdwrlock subclasses:
+ * The dmap inode may be locked while a normal inode or the imap inode are
+ * locked.
+ */
+enum rdwrlock_class
+{
+	RDWRLOCK_NORMAL,
+	RDWRLOCK_IMAP,
+	RDWRLOCK_DMAP
+};
+
+#define set_cflag(flag, ip)	set_bit(flag, &(JFS_IP(ip)->cflag))
+#define clear_cflag(flag, ip)	clear_bit(flag, &(JFS_IP(ip)->cflag))
+#define test_cflag(flag, ip)	test_bit(flag, &(JFS_IP(ip)->cflag))
+#define test_and_clear_cflag(flag, ip) \
+	test_and_clear_bit(flag, &(JFS_IP(ip)->cflag))
+/*
+ * JFS-private superblock information.
+ */
+struct jfs_sb_info {
+	struct super_block *sb;		/* Point back to vfs super block */
+	unsigned long	mntflag;	/* aggregate attributes	*/
+	struct inode	*ipbmap;	/* block map inode		*/
+	struct inode	*ipaimap;	/* aggregate inode map inode	*/
+	struct inode	*ipaimap2;	/* secondary aimap inode	*/
+	struct inode	*ipimap;	/* aggregate inode map inode	*/
+	struct jfs_log	*log;		/* log			*/
+	struct list_head log_list;	/* volumes associated with a journal */
+	short		bsize;		/* logical block size	*/
+	short		l2bsize;	/* log2 logical block size	*/
+	short		nbperpage;	/* blocks per page		*/
+	short		l2nbperpage;	/* log2 blocks per page	*/
+	short		l2niperblk;	/* log2 inodes per page	*/
+	dev_t		logdev;		/* external log device	*/
+	uint		aggregate;	/* volume identifier in log record */
+	pxd_t		logpxd;		/* pxd describing log	*/
+	pxd_t		fsckpxd;	/* pxd describing fsck wkspc */
+	pxd_t		ait2;		/* pxd describing AIT copy	*/
+	char		uuid[16];	/* 128-bit uuid for volume	*/
+	char		loguuid[16];	/* 128-bit uuid for log	*/
+	/*
+	 * commit_state is used for synchronization of the jfs_commit
+	 * threads.  It is protected by LAZY_LOCK().
+	 */
+	int		commit_state;	/* commit state */
+	/* Formerly in ipimap */
+	uint		gengen;		/* inode generation generator*/
+	uint		inostamp;	/* shows inode belongs to fileset*/
+
+	/* Formerly in ipbmap */
+	struct bmap	*bmap;		/* incore bmap descriptor	*/
+	struct nls_table *nls_tab;	/* current codepage		*/
+	struct inode *direct_inode;	/* metadata inode */
+	uint		state;		/* mount/recovery state	*/
+	unsigned long	flag;		/* mount time flags */
+	uint		p_state;	/* state prior to going no integrity */
+	kuid_t		uid;		/* uid to override on-disk uid */
+	kgid_t		gid;		/* gid to override on-disk gid */
+	uint		umask;		/* umask to override on-disk umask */
+	uint		minblks_trim;	/* minimum blocks, for online trim */
+};
+
+/* jfs_sb_info commit_state */
+#define IN_LAZYCOMMIT 1
+
+static inline struct jfs_inode_info *JFS_IP(struct inode *inode)
+{
+	return list_entry(inode, struct jfs_inode_info, vfs_inode);
+}
+
+static inline int jfs_dirtable_inline(struct inode *inode)
+{
+	return (JFS_IP(inode)->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1));
+}
+
+static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline int isReadOnly(struct inode *inode)
+{
+	if (JFS_SBI(inode->i_sb)->log)
+		return 0;
+	return 1;
+}
+#endif /* _H_JFS_INCORE */
diff --git a/kernel/fs/jfs/jfs_inode.c b/kernel/fs/jfs/jfs_inode.c
new file mode 100644
index 000000000..6b0f81620
--- /dev/null
+++ b/kernel/fs/jfs/jfs_inode.c
@@ -0,0 +1,165 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_filsys.h"
+#include "jfs_imap.h"
+#include "jfs_dinode.h"
+#include "jfs_debug.h"
+
+
+void jfs_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = JFS_IP(inode)->mode2;
+	unsigned int new_fl = 0;
+
+	if (flags & JFS_IMMUTABLE_FL)
+		new_fl |= S_IMMUTABLE;
+	if (flags & JFS_APPEND_FL)
+		new_fl |= S_APPEND;
+	if (flags & JFS_NOATIME_FL)
+		new_fl |= S_NOATIME;
+	if (flags & JFS_DIRSYNC_FL)
+		new_fl |= S_DIRSYNC;
+	if (flags & JFS_SYNC_FL)
+		new_fl |= S_SYNC;
+	inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND | S_NOATIME |
+			S_DIRSYNC | S_SYNC);
+}
+
+void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip)
+{
+	unsigned int flags = jfs_ip->vfs_inode.i_flags;
+
+	jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL |
+			   JFS_DIRSYNC_FL | JFS_SYNC_FL);
+	if (flags & S_IMMUTABLE)
+		jfs_ip->mode2 |= JFS_IMMUTABLE_FL;
+	if (flags & S_APPEND)
+		jfs_ip->mode2 |= JFS_APPEND_FL;
+	if (flags & S_NOATIME)
+		jfs_ip->mode2 |= JFS_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		jfs_ip->mode2 |= JFS_DIRSYNC_FL;
+	if (flags & S_SYNC)
+		jfs_ip->mode2 |= JFS_SYNC_FL;
+}
+
+/*
+ * NAME:	ialloc()
+ *
+ * FUNCTION:	Allocate a new inode
+ *
+ */
+struct inode *ialloc(struct inode *parent, umode_t mode)
+{
+	struct super_block *sb = parent->i_sb;
+	struct inode *inode;
+	struct jfs_inode_info *jfs_inode;
+	int rc;
+
+	inode = new_inode(sb);
+	if (!inode) {
+		jfs_warn("ialloc: new_inode returned NULL!");
+		rc = -ENOMEM;
+		goto fail;
+	}
+
+	jfs_inode = JFS_IP(inode);
+
+	rc = diAlloc(parent, S_ISDIR(mode), inode);
+	if (rc) {
+		jfs_warn("ialloc: diAlloc returned %d!", rc);
+		if (rc == -EIO)
+			make_bad_inode(inode);
+		goto fail_put;
+	}
+
+	if (insert_inode_locked(inode) < 0) {
+		rc = -EINVAL;
+		goto fail_put;
+	}
+
+	inode_init_owner(inode, parent, mode);
+	/*
+	 * New inodes need to save sane values on disk when
+	 * uid & gid mount options are used
+	 */
+	jfs_inode->saved_uid = inode->i_uid;
+	jfs_inode->saved_gid = inode->i_gid;
+
+	/*
+	 * Allocate inode to quota.
+	 */
+	dquot_initialize(inode);
+	rc = dquot_alloc_inode(inode);
+	if (rc)
+		goto fail_drop;
+
+	/* inherit flags from parent */
+	jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
+
+	if (S_ISDIR(mode)) {
+		jfs_inode->mode2 |= IDIRECTORY;
+		jfs_inode->mode2 &= ~JFS_DIRSYNC_FL;
+	}
+	else {
+		jfs_inode->mode2 |= INLINEEA | ISPARSE;
+		if (S_ISLNK(mode))
+			jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
+	}
+	jfs_inode->mode2 |= inode->i_mode;
+
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	jfs_inode->otime = inode->i_ctime.tv_sec;
+	inode->i_generation = JFS_SBI(sb)->gengen++;
+
+	jfs_inode->cflag = 0;
+
+	/* Zero remaining fields */
+	memset(&jfs_inode->acl, 0, sizeof(dxd_t));
+	memset(&jfs_inode->ea, 0, sizeof(dxd_t));
+	jfs_inode->next_index = 0;
+	jfs_inode->acltype = 0;
+	jfs_inode->btorder = 0;
+	jfs_inode->btindex = 0;
+	jfs_inode->bxflag = 0;
+	jfs_inode->blid = 0;
+	jfs_inode->atlhead = 0;
+	jfs_inode->atltail = 0;
+	jfs_inode->xtlid = 0;
+	jfs_set_inode_flags(inode);
+
+	jfs_info("ialloc returns inode = 0x%p\n", inode);
+
+	return inode;
+
+fail_drop:
+	dquot_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+fail_put:
+	iput(inode);
+fail:
+	return ERR_PTR(rc);
+}
diff --git a/kernel/fs/jfs/jfs_inode.h b/kernel/fs/jfs/jfs_inode.h
new file mode 100644
index 000000000..9271cfe4a
--- /dev/null
+++ b/kernel/fs/jfs/jfs_inode.h
@@ -0,0 +1,53 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef	_H_JFS_INODE
+#define _H_JFS_INODE
+
+struct fid;
+
+extern struct inode *ialloc(struct inode *, umode_t);
+extern int jfs_fsync(struct file *, loff_t, loff_t, int);
+extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
+extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
+extern struct inode *jfs_iget(struct super_block *, unsigned long);
+extern int jfs_commit_inode(struct inode *, int);
+extern int jfs_write_inode(struct inode *, struct writeback_control *);
+extern void jfs_evict_inode(struct inode *);
+extern void jfs_dirty_inode(struct inode *, int);
+extern void jfs_truncate(struct inode *);
+extern void jfs_truncate_nolock(struct inode *, loff_t);
+extern void jfs_free_zero_link(struct inode *);
+extern struct dentry *jfs_get_parent(struct dentry *dentry);
+extern void jfs_get_inode_flags(struct jfs_inode_info *);
+extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+	int fh_len, int fh_type);
+extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+	int fh_len, int fh_type);
+extern void jfs_set_inode_flags(struct inode *);
+extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern int jfs_setattr(struct dentry *, struct iattr *);
+
+extern const struct address_space_operations jfs_aops;
+extern const struct inode_operations jfs_dir_inode_operations;
+extern const struct file_operations jfs_dir_operations;
+extern const struct inode_operations jfs_file_inode_operations;
+extern const struct file_operations jfs_file_operations;
+extern const struct inode_operations jfs_symlink_inode_operations;
+extern const struct inode_operations jfs_fast_symlink_inode_operations;
+extern const struct dentry_operations jfs_ci_dentry_operations;
+#endif				/* _H_JFS_INODE */
diff --git a/kernel/fs/jfs/jfs_lock.h b/kernel/fs/jfs/jfs_lock.h
new file mode 100644
index 000000000..ecf048822
--- /dev/null
+++ b/kernel/fs/jfs/jfs_lock.h
@@ -0,0 +1,52 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2001
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_LOCK
+#define _H_JFS_LOCK
+
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+
+/*
+ *	jfs_lock.h
+ */
+
+/*
+ * Conditional sleep where condition is protected by spinlock
+ *
+ * lock_cmd and unlock_cmd take and release the spinlock
+ */
+#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd)	\
+do {							\
+	DECLARE_WAITQUEUE(__wait, current);		\
+							\
+	add_wait_queue(&wq, &__wait);			\
+	for (;;) {					\
+		set_current_state(TASK_UNINTERRUPTIBLE);\
+		if (cond)				\
+			break;				\
+		unlock_cmd;				\
+		io_schedule();				\
+		lock_cmd;				\
+	}						\
+	__set_current_state(TASK_RUNNING);			\
+	remove_wait_queue(&wq, &__wait);		\
+} while (0)
+
+#endif				/* _H_JFS_LOCK */
diff --git a/kernel/fs/jfs/jfs_logmgr.c b/kernel/fs/jfs/jfs_logmgr.c
new file mode 100644
index 000000000..bc462dcd7
--- /dev/null
+++ b/kernel/fs/jfs/jfs_logmgr.c
@@ -0,0 +1,2533 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ *	jfs_logmgr.c: log manager
+ *
+ * for related information, see transaction manager (jfs_txnmgr.c), and
+ * recovery manager (jfs_logredo.c).
+ *
+ * note: for detail, RTFS.
+ *
+ *	log buffer manager:
+ * special purpose buffer manager supporting log i/o requirements.
+ * per log serial pageout of logpage
+ * queuing i/o requests and redrive i/o at iodone
+ * maintain current logpage buffer
+ * no caching since append only
+ * appropriate jfs buffer cache buffers as needed
+ *
+ *	group commit:
+ * transactions which wrote COMMIT records in the same in-memory
+ * log page during the pageout of previous/current log page(s) are
+ * committed together by the pageout of the page.
+ *
+ *	TBD lazy commit:
+ * transactions are committed asynchronously when the log page
+ * containing it COMMIT is paged out when it becomes full;
+ *
+ *	serialization:
+ * . a per log lock serialize log write.
+ * . a per log lock serialize group commit.
+ * . a per log lock serialize log open/close;
+ *
+ *	TBD log integrity:
+ * careful-write (ping-pong) of last logpage to recover from crash
+ * in overwrite.
+ * detection of split (out-of-order) write of physical sectors
+ * of last logpage via timestamp at end of each sector
+ * with its mirror data array at trailer).
+ *
+ *	alternatives:
+ * lsn - 64-bit monotonically increasing integer vs
+ * 32-bit lspn and page eor.
+ */
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/buffer_head.h>		/* for sync_blockdev() */
+#include <linux/bio.h>
+#include <linux/freezer.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+
+
+/*
+ * lbuf's ready to be redriven.  Protected by log_redrive_lock (jfsIO thread)
+ */
+static struct lbuf *log_redrive_list;
+static DEFINE_SPINLOCK(log_redrive_lock);
+
+
+/*
+ *	log read/write serialization (per log)
+ */
+#define LOG_LOCK_INIT(log)	mutex_init(&(log)->loglock)
+#define LOG_LOCK(log)		mutex_lock(&((log)->loglock))
+#define LOG_UNLOCK(log)		mutex_unlock(&((log)->loglock))
+
+
+/*
+ *	log group commit serialization (per log)
+ */
+
+#define LOGGC_LOCK_INIT(log)	spin_lock_init(&(log)->gclock)
+#define LOGGC_LOCK(log)		spin_lock_irq(&(log)->gclock)
+#define LOGGC_UNLOCK(log)	spin_unlock_irq(&(log)->gclock)
+#define LOGGC_WAKEUP(tblk)	wake_up_all(&(tblk)->gcwait)
+
+/*
+ *	log sync serialization (per log)
+ */
+#define	LOGSYNC_DELTA(logsize)		min((logsize)/8, 128*LOGPSIZE)
+#define	LOGSYNC_BARRIER(logsize)	((logsize)/4)
+/*
+#define	LOGSYNC_DELTA(logsize)		min((logsize)/4, 256*LOGPSIZE)
+#define	LOGSYNC_BARRIER(logsize)	((logsize)/2)
+*/
+
+
+/*
+ *	log buffer cache synchronization
+ */
+static DEFINE_SPINLOCK(jfsLCacheLock);
+
+#define	LCACHE_LOCK(flags)	spin_lock_irqsave(&jfsLCacheLock, flags)
+#define	LCACHE_UNLOCK(flags)	spin_unlock_irqrestore(&jfsLCacheLock, flags)
+
+/*
+ * See __SLEEP_COND in jfs_locks.h
+ */
+#define LCACHE_SLEEP_COND(wq, cond, flags)	\
+do {						\
+	if (cond)				\
+		break;				\
+	__SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \
+} while (0)
+
+#define	LCACHE_WAKEUP(event)	wake_up(event)
+
+
+/*
+ *	lbuf buffer cache (lCache) control
+ */
+/* log buffer manager pageout control (cumulative, inclusive) */
+#define	lbmREAD		0x0001
+#define	lbmWRITE	0x0002	/* enqueue at tail of write queue;
+				 * init pageout if at head of queue;
+				 */
+#define	lbmRELEASE	0x0004	/* remove from write queue
+				 * at completion of pageout;
+				 * do not free/recycle it yet:
+				 * caller will free it;
+				 */
+#define	lbmSYNC		0x0008	/* do not return to freelist
+				 * when removed from write queue;
+				 */
+#define lbmFREE		0x0010	/* return to freelist
+				 * at completion of pageout;
+				 * the buffer may be recycled;
+				 */
+#define	lbmDONE		0x0020
+#define	lbmERROR	0x0040
+#define lbmGC		0x0080	/* lbmIODone to perform post-GC processing
+				 * of log page
+				 */
+#define lbmDIRECT	0x0100
+
+/*
+ * Global list of active external journals
+ */
+static LIST_HEAD(jfs_external_logs);
+static struct jfs_log *dummy_log;
+static DEFINE_MUTEX(jfs_log_mutex);
+
+/*
+ * forward references
+ */
+static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk,
+			 struct lrd * lrd, struct tlock * tlck);
+
+static int lmNextPage(struct jfs_log * log);
+static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
+			   int activate);
+
+static int open_inline_log(struct super_block *sb);
+static int open_dummy_log(struct super_block *sb);
+static int lbmLogInit(struct jfs_log * log);
+static void lbmLogShutdown(struct jfs_log * log);
+static struct lbuf *lbmAllocate(struct jfs_log * log, int);
+static void lbmFree(struct lbuf * bp);
+static void lbmfree(struct lbuf * bp);
+static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp);
+static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block);
+static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag);
+static int lbmIOWait(struct lbuf * bp, int flag);
+static bio_end_io_t lbmIODone;
+static void lbmStartIO(struct lbuf * bp);
+static void lmGCwrite(struct jfs_log * log, int cant_block);
+static int lmLogSync(struct jfs_log * log, int hard_sync);
+
+
+
+/*
+ *	statistics
+ */
+#ifdef CONFIG_JFS_STATISTICS
+static struct lmStat {
+	uint commit;		/* # of commit */
+	uint pagedone;		/* # of page written */
+	uint submitted;		/* # of pages submitted */
+	uint full_page;		/* # of full pages submitted */
+	uint partial_page;	/* # of partial pages submitted */
+} lmStat;
+#endif
+
+static void write_special_inodes(struct jfs_log *log,
+				 int (*writer)(struct address_space *))
+{
+	struct jfs_sb_info *sbi;
+
+	list_for_each_entry(sbi, &log->sb_list, log_list) {
+		writer(sbi->ipbmap->i_mapping);
+		writer(sbi->ipimap->i_mapping);
+		writer(sbi->direct_inode->i_mapping);
+	}
+}
+
+/*
+ * NAME:	lmLog()
+ *
+ * FUNCTION:	write a log record;
+ *
+ * PARAMETER:
+ *
+ * RETURN:	lsn - offset to the next log record to write (end-of-log);
+ *		-1  - error;
+ *
+ * note: todo: log error handler
+ */
+int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+	  struct tlock * tlck)
+{
+	int lsn;
+	int diffp, difft;
+	struct metapage *mp = NULL;
+	unsigned long flags;
+
+	jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p",
+		 log, tblk, lrd, tlck);
+
+	LOG_LOCK(log);
+
+	/* log by (out-of-transaction) JFS ? */
+	if (tblk == NULL)
+		goto writeRecord;
+
+	/* log from page ? */
+	if (tlck == NULL ||
+	    tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL)
+		goto writeRecord;
+
+	/*
+	 *	initialize/update page/transaction recovery lsn
+	 */
+	lsn = log->lsn;
+
+	LOGSYNC_LOCK(log, flags);
+
+	/*
+	 * initialize page lsn if first log write of the page
+	 */
+	if (mp->lsn == 0) {
+		mp->log = log;
+		mp->lsn = lsn;
+		log->count++;
+
+		/* insert page at tail of logsynclist */
+		list_add_tail(&mp->synclist, &log->synclist);
+	}
+
+	/*
+	 *	initialize/update lsn of tblock of the page
+	 *
+	 * transaction inherits oldest lsn of pages associated
+	 * with allocation/deallocation of resources (their
+	 * log records are used to reconstruct allocation map
+	 * at recovery time: inode for inode allocation map,
+	 * B+-tree index of extent descriptors for block
+	 * allocation map);
+	 * allocation map pages inherit transaction lsn at
+	 * commit time to allow forwarding log syncpt past log
+	 * records associated with allocation/deallocation of
+	 * resources only after persistent map of these map pages
+	 * have been updated and propagated to home.
+	 */
+	/*
+	 * initialize transaction lsn:
+	 */
+	if (tblk->lsn == 0) {
+		/* inherit lsn of its first page logged */
+		tblk->lsn = mp->lsn;
+		log->count++;
+
+		/* insert tblock after the page on logsynclist */
+		list_add(&tblk->synclist, &mp->synclist);
+	}
+	/*
+	 * update transaction lsn:
+	 */
+	else {
+		/* inherit oldest/smallest lsn of page */
+		logdiff(diffp, mp->lsn, log);
+		logdiff(difft, tblk->lsn, log);
+		if (diffp < difft) {
+			/* update tblock lsn with page lsn */
+			tblk->lsn = mp->lsn;
+
+			/* move tblock after page on logsynclist */
+			list_move(&tblk->synclist, &mp->synclist);
+		}
+	}
+
+	LOGSYNC_UNLOCK(log, flags);
+
+	/*
+	 *	write the log record
+	 */
+      writeRecord:
+	lsn = lmWriteRecord(log, tblk, lrd, tlck);
+
+	/*
+	 * forward log syncpt if log reached next syncpt trigger
+	 */
+	logdiff(diffp, lsn, log);
+	if (diffp >= log->nextsync)
+		lsn = lmLogSync(log, 0);
+
+	/* update end-of-log lsn */
+	log->lsn = lsn;
+
+	LOG_UNLOCK(log);
+
+	/* return end-of-log address */
+	return lsn;
+}
+
+/*
+ * NAME:	lmWriteRecord()
+ *
+ * FUNCTION:	move the log record to current log page
+ *
+ * PARAMETER:	cd	- commit descriptor
+ *
+ * RETURN:	end-of-log address
+ *
+ * serialization: LOG_LOCK() held on entry/exit
+ */
+static int
+lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+	      struct tlock * tlck)
+{
+	int lsn = 0;		/* end-of-log address */
+	struct lbuf *bp;	/* dst log page buffer */
+	struct logpage *lp;	/* dst log page */
+	caddr_t dst;		/* destination address in log page */
+	int dstoffset;		/* end-of-log offset in log page */
+	int freespace;		/* free space in log page */
+	caddr_t p;		/* src meta-data page */
+	caddr_t src;
+	int srclen;
+	int nbytes;		/* number of bytes to move */
+	int i;
+	int len;
+	struct linelock *linelock;
+	struct lv *lv;
+	struct lvd *lvd;
+	int l2linesize;
+
+	len = 0;
+
+	/* retrieve destination log page to write */
+	bp = (struct lbuf *) log->bp;
+	lp = (struct logpage *) bp->l_ldata;
+	dstoffset = log->eor;
+
+	/* any log data to write ? */
+	if (tlck == NULL)
+		goto moveLrd;
+
+	/*
+	 *	move log record data
+	 */
+	/* retrieve source meta-data page to log */
+	if (tlck->flag & tlckPAGELOCK) {
+		p = (caddr_t) (tlck->mp->data);
+		linelock = (struct linelock *) & tlck->lock;
+	}
+	/* retrieve source in-memory inode to log */
+	else if (tlck->flag & tlckINODELOCK) {
+		if (tlck->type & tlckDTREE)
+			p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot;
+		else
+			p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
+		linelock = (struct linelock *) & tlck->lock;
+	}
+#ifdef	_JFS_WIP
+	else if (tlck->flag & tlckINLINELOCK) {
+
+		inlinelock = (struct inlinelock *) & tlck;
+		p = (caddr_t) & inlinelock->pxd;
+		linelock = (struct linelock *) & tlck;
+	}
+#endif				/* _JFS_WIP */
+	else {
+		jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
+		return 0;	/* Probably should trap */
+	}
+	l2linesize = linelock->l2linesize;
+
+      moveData:
+	ASSERT(linelock->index <= linelock->maxcnt);
+
+	lv = linelock->lv;
+	for (i = 0; i < linelock->index; i++, lv++) {
+		if (lv->length == 0)
+			continue;
+
+		/* is page full ? */
+		if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) {
+			/* page become full: move on to next page */
+			lmNextPage(log);
+
+			bp = log->bp;
+			lp = (struct logpage *) bp->l_ldata;
+			dstoffset = LOGPHDRSIZE;
+		}
+
+		/*
+		 * move log vector data
+		 */
+		src = (u8 *) p + (lv->offset << l2linesize);
+		srclen = lv->length << l2linesize;
+		len += srclen;
+		while (srclen > 0) {
+			freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
+			nbytes = min(freespace, srclen);
+			dst = (caddr_t) lp + dstoffset;
+			memcpy(dst, src, nbytes);
+			dstoffset += nbytes;
+
+			/* is page not full ? */
+			if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
+				break;
+
+			/* page become full: move on to next page */
+			lmNextPage(log);
+
+			bp = (struct lbuf *) log->bp;
+			lp = (struct logpage *) bp->l_ldata;
+			dstoffset = LOGPHDRSIZE;
+
+			srclen -= nbytes;
+			src += nbytes;
+		}
+
+		/*
+		 * move log vector descriptor
+		 */
+		len += 4;
+		lvd = (struct lvd *) ((caddr_t) lp + dstoffset);
+		lvd->offset = cpu_to_le16(lv->offset);
+		lvd->length = cpu_to_le16(lv->length);
+		dstoffset += 4;
+		jfs_info("lmWriteRecord: lv offset:%d length:%d",
+			 lv->offset, lv->length);
+	}
+
+	if ((i = linelock->next)) {
+		linelock = (struct linelock *) lid_to_tlock(i);
+		goto moveData;
+	}
+
+	/*
+	 *	move log record descriptor
+	 */
+      moveLrd:
+	lrd->length = cpu_to_le16(len);
+
+	src = (caddr_t) lrd;
+	srclen = LOGRDSIZE;
+
+	while (srclen > 0) {
+		freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
+		nbytes = min(freespace, srclen);
+		dst = (caddr_t) lp + dstoffset;
+		memcpy(dst, src, nbytes);
+
+		dstoffset += nbytes;
+		srclen -= nbytes;
+
+		/* are there more to move than freespace of page ? */
+		if (srclen)
+			goto pageFull;
+
+		/*
+		 * end of log record descriptor
+		 */
+
+		/* update last log record eor */
+		log->eor = dstoffset;
+		bp->l_eor = dstoffset;
+		lsn = (log->page << L2LOGPSIZE) + dstoffset;
+
+		if (lrd->type & cpu_to_le16(LOG_COMMIT)) {
+			tblk->clsn = lsn;
+			jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn,
+				 bp->l_eor);
+
+			INCREMENT(lmStat.commit);	/* # of commit */
+
+			/*
+			 * enqueue tblock for group commit:
+			 *
+			 * enqueue tblock of non-trivial/synchronous COMMIT
+			 * at tail of group commit queue
+			 * (trivial/asynchronous COMMITs are ignored by
+			 * group commit.)
+			 */
+			LOGGC_LOCK(log);
+
+			/* init tblock gc state */
+			tblk->flag = tblkGC_QUEUE;
+			tblk->bp = log->bp;
+			tblk->pn = log->page;
+			tblk->eor = log->eor;
+
+			/* enqueue transaction to commit queue */
+			list_add_tail(&tblk->cqueue, &log->cqueue);
+
+			LOGGC_UNLOCK(log);
+		}
+
+		jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x",
+			le16_to_cpu(lrd->type), log->bp, log->page, dstoffset);
+
+		/* page not full ? */
+		if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
+			return lsn;
+
+	      pageFull:
+		/* page become full: move on to next page */
+		lmNextPage(log);
+
+		bp = (struct lbuf *) log->bp;
+		lp = (struct logpage *) bp->l_ldata;
+		dstoffset = LOGPHDRSIZE;
+		src += nbytes;
+	}
+
+	return lsn;
+}
+
+
+/*
+ * NAME:	lmNextPage()
+ *
+ * FUNCTION:	write current page and allocate next page.
+ *
+ * PARAMETER:	log
+ *
+ * RETURN:	0
+ *
+ * serialization: LOG_LOCK() held on entry/exit
+ */
+static int lmNextPage(struct jfs_log * log)
+{
+	struct logpage *lp;
+	int lspn;		/* log sequence page number */
+	int pn;			/* current page number */
+	struct lbuf *bp;
+	struct lbuf *nextbp;
+	struct tblock *tblk;
+
+	/* get current log page number and log sequence page number */
+	pn = log->page;
+	bp = log->bp;
+	lp = (struct logpage *) bp->l_ldata;
+	lspn = le32_to_cpu(lp->h.page);
+
+	LOGGC_LOCK(log);
+
+	/*
+	 *	write or queue the full page at the tail of write queue
+	 */
+	/* get the tail tblk on commit queue */
+	if (list_empty(&log->cqueue))
+		tblk = NULL;
+	else
+		tblk = list_entry(log->cqueue.prev, struct tblock, cqueue);
+
+	/* every tblk who has COMMIT record on the current page,
+	 * and has not been committed, must be on commit queue
+	 * since tblk is queued at commit queueu at the time
+	 * of writing its COMMIT record on the page before
+	 * page becomes full (even though the tblk thread
+	 * who wrote COMMIT record may have been suspended
+	 * currently);
+	 */
+
+	/* is page bound with outstanding tail tblk ? */
+	if (tblk && tblk->pn == pn) {
+		/* mark tblk for end-of-page */
+		tblk->flag |= tblkGC_EOP;
+
+		if (log->cflag & logGC_PAGEOUT) {
+			/* if page is not already on write queue,
+			 * just enqueue (no lbmWRITE to prevent redrive)
+			 * buffer to wqueue to ensure correct serial order
+			 * of the pages since log pages will be added
+			 * continuously
+			 */
+			if (bp->l_wqnext == NULL)
+				lbmWrite(log, bp, 0, 0);
+		} else {
+			/*
+			 * No current GC leader, initiate group commit
+			 */
+			log->cflag |= logGC_PAGEOUT;
+			lmGCwrite(log, 0);
+		}
+	}
+	/* page is not bound with outstanding tblk:
+	 * init write or mark it to be redriven (lbmWRITE)
+	 */
+	else {
+		/* finalize the page */
+		bp->l_ceor = bp->l_eor;
+		lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+		lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0);
+	}
+	LOGGC_UNLOCK(log);
+
+	/*
+	 *	allocate/initialize next page
+	 */
+	/* if log wraps, the first data page of log is 2
+	 * (0 never used, 1 is superblock).
+	 */
+	log->page = (pn == log->size - 1) ? 2 : pn + 1;
+	log->eor = LOGPHDRSIZE;	/* ? valid page empty/full at logRedo() */
+
+	/* allocate/initialize next log page buffer */
+	nextbp = lbmAllocate(log, log->page);
+	nextbp->l_eor = log->eor;
+	log->bp = nextbp;
+
+	/* initialize next log page */
+	lp = (struct logpage *) nextbp->l_ldata;
+	lp->h.page = lp->t.page = cpu_to_le32(lspn + 1);
+	lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
+
+	return 0;
+}
+
+
+/*
+ * NAME:	lmGroupCommit()
+ *
+ * FUNCTION:	group commit
+ *	initiate pageout of the pages with COMMIT in the order of
+ *	page number - redrive pageout of the page at the head of
+ *	pageout queue until full page has been written.
+ *
+ * RETURN:
+ *
+ * NOTE:
+ *	LOGGC_LOCK serializes log group commit queue, and
+ *	transaction blocks on the commit queue.
+ *	N.B. LOG_LOCK is NOT held during lmGroupCommit().
+ */
+int lmGroupCommit(struct jfs_log * log, struct tblock * tblk)
+{
+	int rc = 0;
+
+	LOGGC_LOCK(log);
+
+	/* group committed already ? */
+	if (tblk->flag & tblkGC_COMMITTED) {
+		if (tblk->flag & tblkGC_ERROR)
+			rc = -EIO;
+
+		LOGGC_UNLOCK(log);
+		return rc;
+	}
+	jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc);
+
+	if (tblk->xflag & COMMIT_LAZY)
+		tblk->flag |= tblkGC_LAZY;
+
+	if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) &&
+	    (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag)
+	     || jfs_tlocks_low)) {
+		/*
+		 * No pageout in progress
+		 *
+		 * start group commit as its group leader.
+		 */
+		log->cflag |= logGC_PAGEOUT;
+
+		lmGCwrite(log, 0);
+	}
+
+	if (tblk->xflag & COMMIT_LAZY) {
+		/*
+		 * Lazy transactions can leave now
+		 */
+		LOGGC_UNLOCK(log);
+		return 0;
+	}
+
+	/* lmGCwrite gives up LOGGC_LOCK, check again */
+
+	if (tblk->flag & tblkGC_COMMITTED) {
+		if (tblk->flag & tblkGC_ERROR)
+			rc = -EIO;
+
+		LOGGC_UNLOCK(log);
+		return rc;
+	}
+
+	/* upcount transaction waiting for completion
+	 */
+	log->gcrtc++;
+	tblk->flag |= tblkGC_READY;
+
+	__SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED),
+		     LOGGC_LOCK(log), LOGGC_UNLOCK(log));
+
+	/* removed from commit queue */
+	if (tblk->flag & tblkGC_ERROR)
+		rc = -EIO;
+
+	LOGGC_UNLOCK(log);
+	return rc;
+}
+
+/*
+ * NAME:	lmGCwrite()
+ *
+ * FUNCTION:	group commit write
+ *	initiate write of log page, building a group of all transactions
+ *	with commit records on that page.
+ *
+ * RETURN:	None
+ *
+ * NOTE:
+ *	LOGGC_LOCK must be held by caller.
+ *	N.B. LOG_LOCK is NOT held during lmGroupCommit().
+ */
+static void lmGCwrite(struct jfs_log * log, int cant_write)
+{
+	struct lbuf *bp;
+	struct logpage *lp;
+	int gcpn;		/* group commit page number */
+	struct tblock *tblk;
+	struct tblock *xtblk = NULL;
+
+	/*
+	 * build the commit group of a log page
+	 *
+	 * scan commit queue and make a commit group of all
+	 * transactions with COMMIT records on the same log page.
+	 */
+	/* get the head tblk on the commit queue */
+	gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn;
+
+	list_for_each_entry(tblk, &log->cqueue, cqueue) {
+		if (tblk->pn != gcpn)
+			break;
+
+		xtblk = tblk;
+
+		/* state transition: (QUEUE, READY) -> COMMIT */
+		tblk->flag |= tblkGC_COMMIT;
+	}
+	tblk = xtblk;		/* last tblk of the page */
+
+	/*
+	 * pageout to commit transactions on the log page.
+	 */
+	bp = (struct lbuf *) tblk->bp;
+	lp = (struct logpage *) bp->l_ldata;
+	/* is page already full ? */
+	if (tblk->flag & tblkGC_EOP) {
+		/* mark page to free at end of group commit of the page */
+		tblk->flag &= ~tblkGC_EOP;
+		tblk->flag |= tblkGC_FREE;
+		bp->l_ceor = bp->l_eor;
+		lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+		lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC,
+			 cant_write);
+		INCREMENT(lmStat.full_page);
+	}
+	/* page is not yet full */
+	else {
+		bp->l_ceor = tblk->eor;	/* ? bp->l_ceor = bp->l_eor; */
+		lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+		lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write);
+		INCREMENT(lmStat.partial_page);
+	}
+}
+
+/*
+ * NAME:	lmPostGC()
+ *
+ * FUNCTION:	group commit post-processing
+ *	Processes transactions after their commit records have been written
+ *	to disk, redriving log I/O if necessary.
+ *
+ * RETURN:	None
+ *
+ * NOTE:
+ *	This routine is called a interrupt time by lbmIODone
+ */
+static void lmPostGC(struct lbuf * bp)
+{
+	unsigned long flags;
+	struct jfs_log *log = bp->l_log;
+	struct logpage *lp;
+	struct tblock *tblk, *temp;
+
+	//LOGGC_LOCK(log);
+	spin_lock_irqsave(&log->gclock, flags);
+	/*
+	 * current pageout of group commit completed.
+	 *
+	 * remove/wakeup transactions from commit queue who were
+	 * group committed with the current log page
+	 */
+	list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) {
+		if (!(tblk->flag & tblkGC_COMMIT))
+			break;
+		/* if transaction was marked GC_COMMIT then
+		 * it has been shipped in the current pageout
+		 * and made it to disk - it is committed.
+		 */
+
+		if (bp->l_flag & lbmERROR)
+			tblk->flag |= tblkGC_ERROR;
+
+		/* remove it from the commit queue */
+		list_del(&tblk->cqueue);
+		tblk->flag &= ~tblkGC_QUEUE;
+
+		if (tblk == log->flush_tblk) {
+			/* we can stop flushing the log now */
+			clear_bit(log_FLUSH, &log->flag);
+			log->flush_tblk = NULL;
+		}
+
+		jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk,
+			 tblk->flag);
+
+		if (!(tblk->xflag & COMMIT_FORCE))
+			/*
+			 * Hand tblk over to lazy commit thread
+			 */
+			txLazyUnlock(tblk);
+		else {
+			/* state transition: COMMIT -> COMMITTED */
+			tblk->flag |= tblkGC_COMMITTED;
+
+			if (tblk->flag & tblkGC_READY)
+				log->gcrtc--;
+
+			LOGGC_WAKEUP(tblk);
+		}
+
+		/* was page full before pageout ?
+		 * (and this is the last tblk bound with the page)
+		 */
+		if (tblk->flag & tblkGC_FREE)
+			lbmFree(bp);
+		/* did page become full after pageout ?
+		 * (and this is the last tblk bound with the page)
+		 */
+		else if (tblk->flag & tblkGC_EOP) {
+			/* finalize the page */
+			lp = (struct logpage *) bp->l_ldata;
+			bp->l_ceor = bp->l_eor;
+			lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
+			jfs_info("lmPostGC: calling lbmWrite");
+			lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE,
+				 1);
+		}
+
+	}
+
+	/* are there any transactions who have entered lnGroupCommit()
+	 * (whose COMMITs are after that of the last log page written.
+	 * They are waiting for new group commit (above at (SLEEP 1))
+	 * or lazy transactions are on a full (queued) log page,
+	 * select the latest ready transaction as new group leader and
+	 * wake her up to lead her group.
+	 */
+	if ((!list_empty(&log->cqueue)) &&
+	    ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) ||
+	     test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low))
+		/*
+		 * Call lmGCwrite with new group leader
+		 */
+		lmGCwrite(log, 1);
+
+	/* no transaction are ready yet (transactions are only just
+	 * queued (GC_QUEUE) and not entered for group commit yet).
+	 * the first transaction entering group commit
+	 * will elect herself as new group leader.
+	 */
+	else
+		log->cflag &= ~logGC_PAGEOUT;
+
+	//LOGGC_UNLOCK(log);
+	spin_unlock_irqrestore(&log->gclock, flags);
+	return;
+}
+
+/*
+ * NAME:	lmLogSync()
+ *
+ * FUNCTION:	write log SYNCPT record for specified log
+ *	if new sync address is available
+ *	(normally the case if sync() is executed by back-ground
+ *	process).
+ *	calculate new value of i_nextsync which determines when
+ *	this code is called again.
+ *
+ * PARAMETERS:	log	- log structure
+ *		hard_sync - 1 to force all metadata to be written
+ *
+ * RETURN:	0
+ *
+ * serialization: LOG_LOCK() held on entry/exit
+ */
+static int lmLogSync(struct jfs_log * log, int hard_sync)
+{
+	int logsize;
+	int written;		/* written since last syncpt */
+	int free;		/* free space left available */
+	int delta;		/* additional delta to write normally */
+	int more;		/* additional write granted */
+	struct lrd lrd;
+	int lsn;
+	struct logsyncblk *lp;
+	unsigned long flags;
+
+	/* push dirty metapages out to disk */
+	if (hard_sync)
+		write_special_inodes(log, filemap_fdatawrite);
+	else
+		write_special_inodes(log, filemap_flush);
+
+	/*
+	 *	forward syncpt
+	 */
+	/* if last sync is same as last syncpt,
+	 * invoke sync point forward processing to update sync.
+	 */
+
+	if (log->sync == log->syncpt) {
+		LOGSYNC_LOCK(log, flags);
+		if (list_empty(&log->synclist))
+			log->sync = log->lsn;
+		else {
+			lp = list_entry(log->synclist.next,
+					struct logsyncblk, synclist);
+			log->sync = lp->lsn;
+		}
+		LOGSYNC_UNLOCK(log, flags);
+
+	}
+
+	/* if sync is different from last syncpt,
+	 * write a SYNCPT record with syncpt = sync.
+	 * reset syncpt = sync
+	 */
+	if (log->sync != log->syncpt) {
+		lrd.logtid = 0;
+		lrd.backchain = 0;
+		lrd.type = cpu_to_le16(LOG_SYNCPT);
+		lrd.length = 0;
+		lrd.log.syncpt.sync = cpu_to_le32(log->sync);
+		lsn = lmWriteRecord(log, NULL, &lrd, NULL);
+
+		log->syncpt = log->sync;
+	} else
+		lsn = log->lsn;
+
+	/*
+	 *	setup next syncpt trigger (SWAG)
+	 */
+	logsize = log->logsize;
+
+	logdiff(written, lsn, log);
+	free = logsize - written;
+	delta = LOGSYNC_DELTA(logsize);
+	more = min(free / 2, delta);
+	if (more < 2 * LOGPSIZE) {
+		jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
+		/*
+		 *	log wrapping
+		 *
+		 * option 1 - panic ? No.!
+		 * option 2 - shutdown file systems
+		 *	      associated with log ?
+		 * option 3 - extend log ?
+		 * option 4 - second chance
+		 *
+		 * mark log wrapped, and continue.
+		 * when all active transactions are completed,
+		 * mark log valid for recovery.
+		 * if crashed during invalid state, log state
+		 * implies invalid log, forcing fsck().
+		 */
+		/* mark log state log wrap in log superblock */
+		/* log->state = LOGWRAP; */
+
+		/* reset sync point computation */
+		log->syncpt = log->sync = lsn;
+		log->nextsync = delta;
+	} else
+		/* next syncpt trigger = written + more */
+		log->nextsync = written + more;
+
+	/* if number of bytes written from last sync point is more
+	 * than 1/4 of the log size, stop new transactions from
+	 * starting until all current transactions are completed
+	 * by setting syncbarrier flag.
+	 */
+	if (!test_bit(log_SYNCBARRIER, &log->flag) &&
+	    (written > LOGSYNC_BARRIER(logsize)) && log->active) {
+		set_bit(log_SYNCBARRIER, &log->flag);
+		jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn,
+			 log->syncpt);
+		/*
+		 * We may have to initiate group commit
+		 */
+		jfs_flush_journal(log, 0);
+	}
+
+	return lsn;
+}
+
+/*
+ * NAME:	jfs_syncpt
+ *
+ * FUNCTION:	write log SYNCPT record for specified log
+ *
+ * PARAMETERS:	log	  - log structure
+ *		hard_sync - set to 1 to force metadata to be written
+ */
+void jfs_syncpt(struct jfs_log *log, int hard_sync)
+{	LOG_LOCK(log);
+	if (!test_bit(log_QUIESCE, &log->flag))
+		lmLogSync(log, hard_sync);
+	LOG_UNLOCK(log);
+}
+
+/*
+ * NAME:	lmLogOpen()
+ *
+ * FUNCTION:	open the log on first open;
+ *	insert filesystem in the active list of the log.
+ *
+ * PARAMETER:	ipmnt	- file system mount inode
+ *		iplog	- log inode (out)
+ *
+ * RETURN:
+ *
+ * serialization:
+ */
+int lmLogOpen(struct super_block *sb)
+{
+	int rc;
+	struct block_device *bdev;
+	struct jfs_log *log;
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+
+	if (sbi->flag & JFS_NOINTEGRITY)
+		return open_dummy_log(sb);
+
+	if (sbi->mntflag & JFS_INLINELOG)
+		return open_inline_log(sb);
+
+	mutex_lock(&jfs_log_mutex);
+	list_for_each_entry(log, &jfs_external_logs, journal_list) {
+		if (log->bdev->bd_dev == sbi->logdev) {
+			if (memcmp(log->uuid, sbi->loguuid,
+				   sizeof(log->uuid))) {
+				jfs_warn("wrong uuid on JFS journal\n");
+				mutex_unlock(&jfs_log_mutex);
+				return -EINVAL;
+			}
+			/*
+			 * add file system to log active file system list
+			 */
+			if ((rc = lmLogFileSystem(log, sbi, 1))) {
+				mutex_unlock(&jfs_log_mutex);
+				return rc;
+			}
+			goto journal_found;
+		}
+	}
+
+	if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) {
+		mutex_unlock(&jfs_log_mutex);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&log->sb_list);
+	init_waitqueue_head(&log->syncwait);
+
+	/*
+	 *	external log as separate logical volume
+	 *
+	 * file systems to log may have n-to-1 relationship;
+	 */
+
+	bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				 log);
+	if (IS_ERR(bdev)) {
+		rc = PTR_ERR(bdev);
+		goto free;
+	}
+
+	log->bdev = bdev;
+	memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
+
+	/*
+	 * initialize log:
+	 */
+	if ((rc = lmLogInit(log)))
+		goto close;
+
+	list_add(&log->journal_list, &jfs_external_logs);
+
+	/*
+	 * add file system to log active file system list
+	 */
+	if ((rc = lmLogFileSystem(log, sbi, 1)))
+		goto shutdown;
+
+journal_found:
+	LOG_LOCK(log);
+	list_add(&sbi->log_list, &log->sb_list);
+	sbi->log = log;
+	LOG_UNLOCK(log);
+
+	mutex_unlock(&jfs_log_mutex);
+	return 0;
+
+	/*
+	 *	unwind on error
+	 */
+      shutdown:		/* unwind lbmLogInit() */
+	list_del(&log->journal_list);
+	lbmLogShutdown(log);
+
+      close:		/* close external log device */
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+
+      free:		/* free log descriptor */
+	mutex_unlock(&jfs_log_mutex);
+	kfree(log);
+
+	jfs_warn("lmLogOpen: exit(%d)", rc);
+	return rc;
+}
+
+static int open_inline_log(struct super_block *sb)
+{
+	struct jfs_log *log;
+	int rc;
+
+	if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL)))
+		return -ENOMEM;
+	INIT_LIST_HEAD(&log->sb_list);
+	init_waitqueue_head(&log->syncwait);
+
+	set_bit(log_INLINELOG, &log->flag);
+	log->bdev = sb->s_bdev;
+	log->base = addressPXD(&JFS_SBI(sb)->logpxd);
+	log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
+	    (L2LOGPSIZE - sb->s_blocksize_bits);
+	log->l2bsize = sb->s_blocksize_bits;
+	ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits);
+
+	/*
+	 * initialize log.
+	 */
+	if ((rc = lmLogInit(log))) {
+		kfree(log);
+		jfs_warn("lmLogOpen: exit(%d)", rc);
+		return rc;
+	}
+
+	list_add(&JFS_SBI(sb)->log_list, &log->sb_list);
+	JFS_SBI(sb)->log = log;
+
+	return rc;
+}
+
+static int open_dummy_log(struct super_block *sb)
+{
+	int rc;
+
+	mutex_lock(&jfs_log_mutex);
+	if (!dummy_log) {
+		dummy_log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL);
+		if (!dummy_log) {
+			mutex_unlock(&jfs_log_mutex);
+			return -ENOMEM;
+		}
+		INIT_LIST_HEAD(&dummy_log->sb_list);
+		init_waitqueue_head(&dummy_log->syncwait);
+		dummy_log->no_integrity = 1;
+		/* Make up some stuff */
+		dummy_log->base = 0;
+		dummy_log->size = 1024;
+		rc = lmLogInit(dummy_log);
+		if (rc) {
+			kfree(dummy_log);
+			dummy_log = NULL;
+			mutex_unlock(&jfs_log_mutex);
+			return rc;
+		}
+	}
+
+	LOG_LOCK(dummy_log);
+	list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list);
+	JFS_SBI(sb)->log = dummy_log;
+	LOG_UNLOCK(dummy_log);
+	mutex_unlock(&jfs_log_mutex);
+
+	return 0;
+}
+
+/*
+ * NAME:	lmLogInit()
+ *
+ * FUNCTION:	log initialization at first log open.
+ *
+ *	logredo() (or logformat()) should have been run previously.
+ *	initialize the log from log superblock.
+ *	set the log state in the superblock to LOGMOUNT and
+ *	write SYNCPT log record.
+ *
+ * PARAMETER:	log	- log structure
+ *
+ * RETURN:	0	- if ok
+ *		-EINVAL	- bad log magic number or superblock dirty
+ *		error returned from logwait()
+ *
+ * serialization: single first open thread
+ */
+int lmLogInit(struct jfs_log * log)
+{
+	int rc = 0;
+	struct lrd lrd;
+	struct logsuper *logsuper;
+	struct lbuf *bpsuper;
+	struct lbuf *bp;
+	struct logpage *lp;
+	int lsn = 0;
+
+	jfs_info("lmLogInit: log:0x%p", log);
+
+	/* initialize the group commit serialization lock */
+	LOGGC_LOCK_INIT(log);
+
+	/* allocate/initialize the log write serialization lock */
+	LOG_LOCK_INIT(log);
+
+	LOGSYNC_LOCK_INIT(log);
+
+	INIT_LIST_HEAD(&log->synclist);
+
+	INIT_LIST_HEAD(&log->cqueue);
+	log->flush_tblk = NULL;
+
+	log->count = 0;
+
+	/*
+	 * initialize log i/o
+	 */
+	if ((rc = lbmLogInit(log)))
+		return rc;
+
+	if (!test_bit(log_INLINELOG, &log->flag))
+		log->l2bsize = L2LOGPSIZE;
+
+	/* check for disabled journaling to disk */
+	if (log->no_integrity) {
+		/*
+		 * Journal pages will still be filled.  When the time comes
+		 * to actually do the I/O, the write is not done, and the
+		 * endio routine is called directly.
+		 */
+		bp = lbmAllocate(log , 0);
+		log->bp = bp;
+		bp->l_pn = bp->l_eor = 0;
+	} else {
+		/*
+		 * validate log superblock
+		 */
+		if ((rc = lbmRead(log, 1, &bpsuper)))
+			goto errout10;
+
+		logsuper = (struct logsuper *) bpsuper->l_ldata;
+
+		if (logsuper->magic != cpu_to_le32(LOGMAGIC)) {
+			jfs_warn("*** Log Format Error ! ***");
+			rc = -EINVAL;
+			goto errout20;
+		}
+
+		/* logredo() should have been run successfully. */
+		if (logsuper->state != cpu_to_le32(LOGREDONE)) {
+			jfs_warn("*** Log Is Dirty ! ***");
+			rc = -EINVAL;
+			goto errout20;
+		}
+
+		/* initialize log from log superblock */
+		if (test_bit(log_INLINELOG,&log->flag)) {
+			if (log->size != le32_to_cpu(logsuper->size)) {
+				rc = -EINVAL;
+				goto errout20;
+			}
+			jfs_info("lmLogInit: inline log:0x%p base:0x%Lx "
+				 "size:0x%x", log,
+				 (unsigned long long) log->base, log->size);
+		} else {
+			if (memcmp(logsuper->uuid, log->uuid, 16)) {
+				jfs_warn("wrong uuid on JFS log device");
+				goto errout20;
+			}
+			log->size = le32_to_cpu(logsuper->size);
+			log->l2bsize = le32_to_cpu(logsuper->l2bsize);
+			jfs_info("lmLogInit: external log:0x%p base:0x%Lx "
+				 "size:0x%x", log,
+				 (unsigned long long) log->base, log->size);
+		}
+
+		log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
+		log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page);
+
+		/*
+		 * initialize for log append write mode
+		 */
+		/* establish current/end-of-log page/buffer */
+		if ((rc = lbmRead(log, log->page, &bp)))
+			goto errout20;
+
+		lp = (struct logpage *) bp->l_ldata;
+
+		jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d",
+			 le32_to_cpu(logsuper->end), log->page, log->eor,
+			 le16_to_cpu(lp->h.eor));
+
+		log->bp = bp;
+		bp->l_pn = log->page;
+		bp->l_eor = log->eor;
+
+		/* if current page is full, move on to next page */
+		if (log->eor >= LOGPSIZE - LOGPTLRSIZE)
+			lmNextPage(log);
+
+		/*
+		 * initialize log syncpoint
+		 */
+		/*
+		 * write the first SYNCPT record with syncpoint = 0
+		 * (i.e., log redo up to HERE !);
+		 * remove current page from lbm write queue at end of pageout
+		 * (to write log superblock update), but do not release to
+		 * freelist;
+		 */
+		lrd.logtid = 0;
+		lrd.backchain = 0;
+		lrd.type = cpu_to_le16(LOG_SYNCPT);
+		lrd.length = 0;
+		lrd.log.syncpt.sync = 0;
+		lsn = lmWriteRecord(log, NULL, &lrd, NULL);
+		bp = log->bp;
+		bp->l_ceor = bp->l_eor;
+		lp = (struct logpage *) bp->l_ldata;
+		lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
+		lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0);
+		if ((rc = lbmIOWait(bp, 0)))
+			goto errout30;
+
+		/*
+		 * update/write superblock
+		 */
+		logsuper->state = cpu_to_le32(LOGMOUNT);
+		log->serial = le32_to_cpu(logsuper->serial) + 1;
+		logsuper->serial = cpu_to_le32(log->serial);
+		lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+		if ((rc = lbmIOWait(bpsuper, lbmFREE)))
+			goto errout30;
+	}
+
+	/* initialize logsync parameters */
+	log->logsize = (log->size - 2) << L2LOGPSIZE;
+	log->lsn = lsn;
+	log->syncpt = lsn;
+	log->sync = log->syncpt;
+	log->nextsync = LOGSYNC_DELTA(log->logsize);
+
+	jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x",
+		 log->lsn, log->syncpt, log->sync);
+
+	/*
+	 * initialize for lazy/group commit
+	 */
+	log->clsn = lsn;
+
+	return 0;
+
+	/*
+	 *	unwind on error
+	 */
+      errout30:		/* release log page */
+	log->wqueue = NULL;
+	bp->l_wqnext = NULL;
+	lbmFree(bp);
+
+      errout20:		/* release log superblock */
+	lbmFree(bpsuper);
+
+      errout10:		/* unwind lbmLogInit() */
+	lbmLogShutdown(log);
+
+	jfs_warn("lmLogInit: exit(%d)", rc);
+	return rc;
+}
+
+
+/*
+ * NAME:	lmLogClose()
+ *
+ * FUNCTION:	remove file system <ipmnt> from active list of log <iplog>
+ *		and close it on last close.
+ *
+ * PARAMETER:	sb	- superblock
+ *
+ * RETURN:	errors from subroutines
+ *
+ * serialization:
+ */
+int lmLogClose(struct super_block *sb)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_log *log = sbi->log;
+	struct block_device *bdev;
+	int rc = 0;
+
+	jfs_info("lmLogClose: log:0x%p", log);
+
+	mutex_lock(&jfs_log_mutex);
+	LOG_LOCK(log);
+	list_del(&sbi->log_list);
+	LOG_UNLOCK(log);
+	sbi->log = NULL;
+
+	/*
+	 * We need to make sure all of the "written" metapages
+	 * actually make it to disk
+	 */
+	sync_blockdev(sb->s_bdev);
+
+	if (test_bit(log_INLINELOG, &log->flag)) {
+		/*
+		 *	in-line log in host file system
+		 */
+		rc = lmLogShutdown(log);
+		kfree(log);
+		goto out;
+	}
+
+	if (!log->no_integrity)
+		lmLogFileSystem(log, sbi, 0);
+
+	if (!list_empty(&log->sb_list))
+		goto out;
+
+	/*
+	 * TODO: ensure that the dummy_log is in a state to allow
+	 * lbmLogShutdown to deallocate all the buffers and call
+	 * kfree against dummy_log.  For now, leave dummy_log & its
+	 * buffers in memory, and resuse if another no-integrity mount
+	 * is requested.
+	 */
+	if (log->no_integrity)
+		goto out;
+
+	/*
+	 *	external log as separate logical volume
+	 */
+	list_del(&log->journal_list);
+	bdev = log->bdev;
+	rc = lmLogShutdown(log);
+
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+
+	kfree(log);
+
+      out:
+	mutex_unlock(&jfs_log_mutex);
+	jfs_info("lmLogClose: exit(%d)", rc);
+	return rc;
+}
+
+
+/*
+ * NAME:	jfs_flush_journal()
+ *
+ * FUNCTION:	initiate write of any outstanding transactions to the journal
+ *		and optionally wait until they are all written to disk
+ *
+ *		wait == 0  flush until latest txn is committed, don't wait
+ *		wait == 1  flush until latest txn is committed, wait
+ *		wait > 1   flush until all txn's are complete, wait
+ */
+void jfs_flush_journal(struct jfs_log *log, int wait)
+{
+	int i;
+	struct tblock *target = NULL;
+
+	/* jfs_write_inode may call us during read-only mount */
+	if (!log)
+		return;
+
+	jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait);
+
+	LOGGC_LOCK(log);
+
+	if (!list_empty(&log->cqueue)) {
+		/*
+		 * This ensures that we will keep writing to the journal as long
+		 * as there are unwritten commit records
+		 */
+		target = list_entry(log->cqueue.prev, struct tblock, cqueue);
+
+		if (test_bit(log_FLUSH, &log->flag)) {
+			/*
+			 * We're already flushing.
+			 * if flush_tblk is NULL, we are flushing everything,
+			 * so leave it that way.  Otherwise, update it to the
+			 * latest transaction
+			 */
+			if (log->flush_tblk)
+				log->flush_tblk = target;
+		} else {
+			/* Only flush until latest transaction is committed */
+			log->flush_tblk = target;
+			set_bit(log_FLUSH, &log->flag);
+
+			/*
+			 * Initiate I/O on outstanding transactions
+			 */
+			if (!(log->cflag & logGC_PAGEOUT)) {
+				log->cflag |= logGC_PAGEOUT;
+				lmGCwrite(log, 0);
+			}
+		}
+	}
+	if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) {
+		/* Flush until all activity complete */
+		set_bit(log_FLUSH, &log->flag);
+		log->flush_tblk = NULL;
+	}
+
+	if (wait && target && !(target->flag & tblkGC_COMMITTED)) {
+		DECLARE_WAITQUEUE(__wait, current);
+
+		add_wait_queue(&target->gcwait, &__wait);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		LOGGC_UNLOCK(log);
+		schedule();
+		LOGGC_LOCK(log);
+		remove_wait_queue(&target->gcwait, &__wait);
+	}
+	LOGGC_UNLOCK(log);
+
+	if (wait < 2)
+		return;
+
+	write_special_inodes(log, filemap_fdatawrite);
+
+	/*
+	 * If there was recent activity, we may need to wait
+	 * for the lazycommit thread to catch up
+	 */
+	if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
+		for (i = 0; i < 200; i++) {	/* Too much? */
+			msleep(250);
+			write_special_inodes(log, filemap_fdatawrite);
+			if (list_empty(&log->cqueue) &&
+			    list_empty(&log->synclist))
+				break;
+		}
+	}
+	assert(list_empty(&log->cqueue));
+
+#ifdef CONFIG_JFS_DEBUG
+	if (!list_empty(&log->synclist)) {
+		struct logsyncblk *lp;
+
+		printk(KERN_ERR "jfs_flush_journal: synclist not empty\n");
+		list_for_each_entry(lp, &log->synclist, synclist) {
+			if (lp->xflag & COMMIT_PAGE) {
+				struct metapage *mp = (struct metapage *)lp;
+				print_hex_dump(KERN_ERR, "metapage: ",
+					       DUMP_PREFIX_ADDRESS, 16, 4,
+					       mp, sizeof(struct metapage), 0);
+				print_hex_dump(KERN_ERR, "page: ",
+					       DUMP_PREFIX_ADDRESS, 16,
+					       sizeof(long), mp->page,
+					       sizeof(struct page), 0);
+			} else
+				print_hex_dump(KERN_ERR, "tblock:",
+					       DUMP_PREFIX_ADDRESS, 16, 4,
+					       lp, sizeof(struct tblock), 0);
+		}
+	}
+#else
+	WARN_ON(!list_empty(&log->synclist));
+#endif
+	clear_bit(log_FLUSH, &log->flag);
+}
+
+/*
+ * NAME:	lmLogShutdown()
+ *
+ * FUNCTION:	log shutdown at last LogClose().
+ *
+ *		write log syncpt record.
+ *		update super block to set redone flag to 0.
+ *
+ * PARAMETER:	log	- log inode
+ *
+ * RETURN:	0	- success
+ *
+ * serialization: single last close thread
+ */
+int lmLogShutdown(struct jfs_log * log)
+{
+	int rc;
+	struct lrd lrd;
+	int lsn;
+	struct logsuper *logsuper;
+	struct lbuf *bpsuper;
+	struct lbuf *bp;
+	struct logpage *lp;
+
+	jfs_info("lmLogShutdown: log:0x%p", log);
+
+	jfs_flush_journal(log, 2);
+
+	/*
+	 * write the last SYNCPT record with syncpoint = 0
+	 * (i.e., log redo up to HERE !)
+	 */
+	lrd.logtid = 0;
+	lrd.backchain = 0;
+	lrd.type = cpu_to_le16(LOG_SYNCPT);
+	lrd.length = 0;
+	lrd.log.syncpt.sync = 0;
+
+	lsn = lmWriteRecord(log, NULL, &lrd, NULL);
+	bp = log->bp;
+	lp = (struct logpage *) bp->l_ldata;
+	lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
+	lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0);
+	lbmIOWait(log->bp, lbmFREE);
+	log->bp = NULL;
+
+	/*
+	 * synchronous update log superblock
+	 * mark log state as shutdown cleanly
+	 * (i.e., Log does not need to be replayed).
+	 */
+	if ((rc = lbmRead(log, 1, &bpsuper)))
+		goto out;
+
+	logsuper = (struct logsuper *) bpsuper->l_ldata;
+	logsuper->state = cpu_to_le32(LOGREDONE);
+	logsuper->end = cpu_to_le32(lsn);
+	lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+	rc = lbmIOWait(bpsuper, lbmFREE);
+
+	jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d",
+		 lsn, log->page, log->eor);
+
+      out:
+	/*
+	 * shutdown per log i/o
+	 */
+	lbmLogShutdown(log);
+
+	if (rc) {
+		jfs_warn("lmLogShutdown: exit(%d)", rc);
+	}
+	return rc;
+}
+
+
+/*
+ * NAME:	lmLogFileSystem()
+ *
+ * FUNCTION:	insert (<activate> = true)/remove (<activate> = false)
+ *	file system into/from log active file system list.
+ *
+ * PARAMETE:	log	- pointer to logs inode.
+ *		fsdev	- kdev_t of filesystem.
+ *		serial	- pointer to returned log serial number
+ *		activate - insert/remove device from active list.
+ *
+ * RETURN:	0	- success
+ *		errors returned by vms_iowait().
+ */
+static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
+			   int activate)
+{
+	int rc = 0;
+	int i;
+	struct logsuper *logsuper;
+	struct lbuf *bpsuper;
+	char *uuid = sbi->uuid;
+
+	/*
+	 * insert/remove file system device to log active file system list.
+	 */
+	if ((rc = lbmRead(log, 1, &bpsuper)))
+		return rc;
+
+	logsuper = (struct logsuper *) bpsuper->l_ldata;
+	if (activate) {
+		for (i = 0; i < MAX_ACTIVE; i++)
+			if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) {
+				memcpy(logsuper->active[i].uuid, uuid, 16);
+				sbi->aggregate = i;
+				break;
+			}
+		if (i == MAX_ACTIVE) {
+			jfs_warn("Too many file systems sharing journal!");
+			lbmFree(bpsuper);
+			return -EMFILE;	/* Is there a better rc? */
+		}
+	} else {
+		for (i = 0; i < MAX_ACTIVE; i++)
+			if (!memcmp(logsuper->active[i].uuid, uuid, 16)) {
+				memcpy(logsuper->active[i].uuid, NULL_UUID, 16);
+				break;
+			}
+		if (i == MAX_ACTIVE) {
+			jfs_warn("Somebody stomped on the journal!");
+			lbmFree(bpsuper);
+			return -EIO;
+		}
+
+	}
+
+	/*
+	 * synchronous write log superblock:
+	 *
+	 * write sidestream bypassing write queue:
+	 * at file system mount, log super block is updated for
+	 * activation of the file system before any log record
+	 * (MOUNT record) of the file system, and at file system
+	 * unmount, all meta data for the file system has been
+	 * flushed before log super block is updated for deactivation
+	 * of the file system.
+	 */
+	lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+	rc = lbmIOWait(bpsuper, lbmFREE);
+
+	return rc;
+}
+
+/*
+ *		log buffer manager (lbm)
+ *		------------------------
+ *
+ * special purpose buffer manager supporting log i/o requirements.
+ *
+ * per log write queue:
+ * log pageout occurs in serial order by fifo write queue and
+ * restricting to a single i/o in pregress at any one time.
+ * a circular singly-linked list
+ * (log->wrqueue points to the tail, and buffers are linked via
+ * bp->wrqueue field), and
+ * maintains log page in pageout ot waiting for pageout in serial pageout.
+ */
+
+/*
+ *	lbmLogInit()
+ *
+ * initialize per log I/O setup at lmLogInit()
+ */
+static int lbmLogInit(struct jfs_log * log)
+{				/* log inode */
+	int i;
+	struct lbuf *lbuf;
+
+	jfs_info("lbmLogInit: log:0x%p", log);
+
+	/* initialize current buffer cursor */
+	log->bp = NULL;
+
+	/* initialize log device write queue */
+	log->wqueue = NULL;
+
+	/*
+	 * Each log has its own buffer pages allocated to it.  These are
+	 * not managed by the page cache.  This ensures that a transaction
+	 * writing to the log does not block trying to allocate a page from
+	 * the page cache (for the log).  This would be bad, since page
+	 * allocation waits on the kswapd thread that may be committing inodes
+	 * which would cause log activity.  Was that clear?  I'm trying to
+	 * avoid deadlock here.
+	 */
+	init_waitqueue_head(&log->free_wait);
+
+	log->lbuf_free = NULL;
+
+	for (i = 0; i < LOGPAGES;) {
+		char *buffer;
+		uint offset;
+		struct page *page;
+
+		buffer = (char *) get_zeroed_page(GFP_KERNEL);
+		if (buffer == NULL)
+			goto error;
+		page = virt_to_page(buffer);
+		for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) {
+			lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
+			if (lbuf == NULL) {
+				if (offset == 0)
+					free_page((unsigned long) buffer);
+				goto error;
+			}
+			if (offset) /* we already have one reference */
+				get_page(page);
+			lbuf->l_offset = offset;
+			lbuf->l_ldata = buffer + offset;
+			lbuf->l_page = page;
+			lbuf->l_log = log;
+			init_waitqueue_head(&lbuf->l_ioevent);
+
+			lbuf->l_freelist = log->lbuf_free;
+			log->lbuf_free = lbuf;
+			i++;
+		}
+	}
+
+	return (0);
+
+      error:
+	lbmLogShutdown(log);
+	return -ENOMEM;
+}
+
+
+/*
+ *	lbmLogShutdown()
+ *
+ * finalize per log I/O setup at lmLogShutdown()
+ */
+static void lbmLogShutdown(struct jfs_log * log)
+{
+	struct lbuf *lbuf;
+
+	jfs_info("lbmLogShutdown: log:0x%p", log);
+
+	lbuf = log->lbuf_free;
+	while (lbuf) {
+		struct lbuf *next = lbuf->l_freelist;
+		__free_page(lbuf->l_page);
+		kfree(lbuf);
+		lbuf = next;
+	}
+}
+
+
+/*
+ *	lbmAllocate()
+ *
+ * allocate an empty log buffer
+ */
+static struct lbuf *lbmAllocate(struct jfs_log * log, int pn)
+{
+	struct lbuf *bp;
+	unsigned long flags;
+
+	/*
+	 * recycle from log buffer freelist if any
+	 */
+	LCACHE_LOCK(flags);
+	LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags);
+	log->lbuf_free = bp->l_freelist;
+	LCACHE_UNLOCK(flags);
+
+	bp->l_flag = 0;
+
+	bp->l_wqnext = NULL;
+	bp->l_freelist = NULL;
+
+	bp->l_pn = pn;
+	bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize));
+	bp->l_ceor = 0;
+
+	return bp;
+}
+
+
+/*
+ *	lbmFree()
+ *
+ * release a log buffer to freelist
+ */
+static void lbmFree(struct lbuf * bp)
+{
+	unsigned long flags;
+
+	LCACHE_LOCK(flags);
+
+	lbmfree(bp);
+
+	LCACHE_UNLOCK(flags);
+}
+
+static void lbmfree(struct lbuf * bp)
+{
+	struct jfs_log *log = bp->l_log;
+
+	assert(bp->l_wqnext == NULL);
+
+	/*
+	 * return the buffer to head of freelist
+	 */
+	bp->l_freelist = log->lbuf_free;
+	log->lbuf_free = bp;
+
+	wake_up(&log->free_wait);
+	return;
+}
+
+
+/*
+ * NAME:	lbmRedrive
+ *
+ * FUNCTION:	add a log buffer to the log redrive list
+ *
+ * PARAMETER:
+ *	bp	- log buffer
+ *
+ * NOTES:
+ *	Takes log_redrive_lock.
+ */
+static inline void lbmRedrive(struct lbuf *bp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&log_redrive_lock, flags);
+	bp->l_redrive_next = log_redrive_list;
+	log_redrive_list = bp;
+	spin_unlock_irqrestore(&log_redrive_lock, flags);
+
+	wake_up_process(jfsIOthread);
+}
+
+
+/*
+ *	lbmRead()
+ */
+static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
+{
+	struct bio *bio;
+	struct lbuf *bp;
+
+	/*
+	 * allocate a log buffer
+	 */
+	*bpp = bp = lbmAllocate(log, pn);
+	jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn);
+
+	bp->l_flag |= lbmREAD;
+
+	bio = bio_alloc(GFP_NOFS, 1);
+
+	bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
+	bio->bi_bdev = log->bdev;
+	bio->bi_io_vec[0].bv_page = bp->l_page;
+	bio->bi_io_vec[0].bv_len = LOGPSIZE;
+	bio->bi_io_vec[0].bv_offset = bp->l_offset;
+
+	bio->bi_vcnt = 1;
+	bio->bi_iter.bi_size = LOGPSIZE;
+
+	bio->bi_end_io = lbmIODone;
+	bio->bi_private = bp;
+	/*check if journaling to disk has been disabled*/
+	if (log->no_integrity) {
+		bio->bi_iter.bi_size = 0;
+		lbmIODone(bio, 0);
+	} else {
+		submit_bio(READ_SYNC, bio);
+	}
+
+	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
+
+	return 0;
+}
+
+
+/*
+ *	lbmWrite()
+ *
+ * buffer at head of pageout queue stays after completion of
+ * partial-page pageout and redriven by explicit initiation of
+ * pageout by caller until full-page pageout is completed and
+ * released.
+ *
+ * device driver i/o done redrives pageout of new buffer at
+ * head of pageout queue when current buffer at head of pageout
+ * queue is released at the completion of its full-page pageout.
+ *
+ * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
+ * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
+ */
+static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
+		     int cant_block)
+{
+	struct lbuf *tail;
+	unsigned long flags;
+
+	jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn);
+
+	/* map the logical block address to physical block address */
+	bp->l_blkno =
+	    log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
+
+	LCACHE_LOCK(flags);		/* disable+lock */
+
+	/*
+	 * initialize buffer for device driver
+	 */
+	bp->l_flag = flag;
+
+	/*
+	 *	insert bp at tail of write queue associated with log
+	 *
+	 * (request is either for bp already/currently at head of queue
+	 * or new bp to be inserted at tail)
+	 */
+	tail = log->wqueue;
+
+	/* is buffer not already on write queue ? */
+	if (bp->l_wqnext == NULL) {
+		/* insert at tail of wqueue */
+		if (tail == NULL) {
+			log->wqueue = bp;
+			bp->l_wqnext = bp;
+		} else {
+			log->wqueue = bp;
+			bp->l_wqnext = tail->l_wqnext;
+			tail->l_wqnext = bp;
+		}
+
+		tail = bp;
+	}
+
+	/* is buffer at head of wqueue and for write ? */
+	if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
+		LCACHE_UNLOCK(flags);	/* unlock+enable */
+		return;
+	}
+
+	LCACHE_UNLOCK(flags);	/* unlock+enable */
+
+	if (cant_block)
+		lbmRedrive(bp);
+	else if (flag & lbmSYNC)
+		lbmStartIO(bp);
+	else {
+		LOGGC_UNLOCK(log);
+		lbmStartIO(bp);
+		LOGGC_LOCK(log);
+	}
+}
+
+
+/*
+ *	lbmDirectWrite()
+ *
+ * initiate pageout bypassing write queue for sidestream
+ * (e.g., log superblock) write;
+ */
+static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
+{
+	jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x",
+		 bp, flag, bp->l_pn);
+
+	/*
+	 * initialize buffer for device driver
+	 */
+	bp->l_flag = flag | lbmDIRECT;
+
+	/* map the logical block address to physical block address */
+	bp->l_blkno =
+	    log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
+
+	/*
+	 *	initiate pageout of the page
+	 */
+	lbmStartIO(bp);
+}
+
+
+/*
+ * NAME:	lbmStartIO()
+ *
+ * FUNCTION:	Interface to DD strategy routine
+ *
+ * RETURN:	none
+ *
+ * serialization: LCACHE_LOCK() is NOT held during log i/o;
+ */
+static void lbmStartIO(struct lbuf * bp)
+{
+	struct bio *bio;
+	struct jfs_log *log = bp->l_log;
+
+	jfs_info("lbmStartIO\n");
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
+	bio->bi_bdev = log->bdev;
+	bio->bi_io_vec[0].bv_page = bp->l_page;
+	bio->bi_io_vec[0].bv_len = LOGPSIZE;
+	bio->bi_io_vec[0].bv_offset = bp->l_offset;
+
+	bio->bi_vcnt = 1;
+	bio->bi_iter.bi_size = LOGPSIZE;
+
+	bio->bi_end_io = lbmIODone;
+	bio->bi_private = bp;
+
+	/* check if journaling to disk has been disabled */
+	if (log->no_integrity) {
+		bio->bi_iter.bi_size = 0;
+		lbmIODone(bio, 0);
+	} else {
+		submit_bio(WRITE_SYNC, bio);
+		INCREMENT(lmStat.submitted);
+	}
+}
+
+
+/*
+ *	lbmIOWait()
+ */
+static int lbmIOWait(struct lbuf * bp, int flag)
+{
+	unsigned long flags;
+	int rc = 0;
+
+	jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
+
+	LCACHE_LOCK(flags);		/* disable+lock */
+
+	LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags);
+
+	rc = (bp->l_flag & lbmERROR) ? -EIO : 0;
+
+	if (flag & lbmFREE)
+		lbmfree(bp);
+
+	LCACHE_UNLOCK(flags);	/* unlock+enable */
+
+	jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
+	return rc;
+}
+
+/*
+ *	lbmIODone()
+ *
+ * executed at INTIODONE level
+ */
+static void lbmIODone(struct bio *bio, int error)
+{
+	struct lbuf *bp = bio->bi_private;
+	struct lbuf *nextbp, *tail;
+	struct jfs_log *log;
+	unsigned long flags;
+
+	/*
+	 * get back jfs buffer bound to the i/o buffer
+	 */
+	jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag);
+
+	LCACHE_LOCK(flags);		/* disable+lock */
+
+	bp->l_flag |= lbmDONE;
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		bp->l_flag |= lbmERROR;
+
+		jfs_err("lbmIODone: I/O error in JFS log");
+	}
+
+	bio_put(bio);
+
+	/*
+	 *	pagein completion
+	 */
+	if (bp->l_flag & lbmREAD) {
+		bp->l_flag &= ~lbmREAD;
+
+		LCACHE_UNLOCK(flags);	/* unlock+enable */
+
+		/* wakeup I/O initiator */
+		LCACHE_WAKEUP(&bp->l_ioevent);
+
+		return;
+	}
+
+	/*
+	 *	pageout completion
+	 *
+	 * the bp at the head of write queue has completed pageout.
+	 *
+	 * if single-commit/full-page pageout, remove the current buffer
+	 * from head of pageout queue, and redrive pageout with
+	 * the new buffer at head of pageout queue;
+	 * otherwise, the partial-page pageout buffer stays at
+	 * the head of pageout queue to be redriven for pageout
+	 * by lmGroupCommit() until full-page pageout is completed.
+	 */
+	bp->l_flag &= ~lbmWRITE;
+	INCREMENT(lmStat.pagedone);
+
+	/* update committed lsn */
+	log = bp->l_log;
+	log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;
+
+	if (bp->l_flag & lbmDIRECT) {
+		LCACHE_WAKEUP(&bp->l_ioevent);
+		LCACHE_UNLOCK(flags);
+		return;
+	}
+
+	tail = log->wqueue;
+
+	/* single element queue */
+	if (bp == tail) {
+		/* remove head buffer of full-page pageout
+		 * from log device write queue
+		 */
+		if (bp->l_flag & lbmRELEASE) {
+			log->wqueue = NULL;
+			bp->l_wqnext = NULL;
+		}
+	}
+	/* multi element queue */
+	else {
+		/* remove head buffer of full-page pageout
+		 * from log device write queue
+		 */
+		if (bp->l_flag & lbmRELEASE) {
+			nextbp = tail->l_wqnext = bp->l_wqnext;
+			bp->l_wqnext = NULL;
+
+			/*
+			 * redrive pageout of next page at head of write queue:
+			 * redrive next page without any bound tblk
+			 * (i.e., page w/o any COMMIT records), or
+			 * first page of new group commit which has been
+			 * queued after current page (subsequent pageout
+			 * is performed synchronously, except page without
+			 * any COMMITs) by lmGroupCommit() as indicated
+			 * by lbmWRITE flag;
+			 */
+			if (nextbp->l_flag & lbmWRITE) {
+				/*
+				 * We can't do the I/O at interrupt time.
+				 * The jfsIO thread can do it
+				 */
+				lbmRedrive(nextbp);
+			}
+		}
+	}
+
+	/*
+	 *	synchronous pageout:
+	 *
+	 * buffer has not necessarily been removed from write queue
+	 * (e.g., synchronous write of partial-page with COMMIT):
+	 * leave buffer for i/o initiator to dispose
+	 */
+	if (bp->l_flag & lbmSYNC) {
+		LCACHE_UNLOCK(flags);	/* unlock+enable */
+
+		/* wakeup I/O initiator */
+		LCACHE_WAKEUP(&bp->l_ioevent);
+	}
+
+	/*
+	 *	Group Commit pageout:
+	 */
+	else if (bp->l_flag & lbmGC) {
+		LCACHE_UNLOCK(flags);
+		lmPostGC(bp);
+	}
+
+	/*
+	 *	asynchronous pageout:
+	 *
+	 * buffer must have been removed from write queue:
+	 * insert buffer at head of freelist where it can be recycled
+	 */
+	else {
+		assert(bp->l_flag & lbmRELEASE);
+		assert(bp->l_flag & lbmFREE);
+		lbmfree(bp);
+
+		LCACHE_UNLOCK(flags);	/* unlock+enable */
+	}
+}
+
+int jfsIOWait(void *arg)
+{
+	struct lbuf *bp;
+
+	do {
+		spin_lock_irq(&log_redrive_lock);
+		while ((bp = log_redrive_list)) {
+			log_redrive_list = bp->l_redrive_next;
+			bp->l_redrive_next = NULL;
+			spin_unlock_irq(&log_redrive_lock);
+			lbmStartIO(bp);
+			spin_lock_irq(&log_redrive_lock);
+		}
+
+		if (freezing(current)) {
+			spin_unlock_irq(&log_redrive_lock);
+			try_to_freeze();
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&log_redrive_lock);
+			schedule();
+		}
+	} while (!kthread_should_stop());
+
+	jfs_info("jfsIOWait being killed!");
+	return 0;
+}
+
+/*
+ * NAME:	lmLogFormat()/jfs_logform()
+ *
+ * FUNCTION:	format file system log
+ *
+ * PARAMETERS:
+ *	log	- volume log
+ *	logAddress - start address of log space in FS block
+ *	logSize	- length of log space in FS block;
+ *
+ * RETURN:	0	- success
+ *		-EIO	- i/o error
+ *
+ * XXX: We're synchronously writing one page at a time.  This needs to
+ *	be improved by writing multiple pages at once.
+ */
+int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
+{
+	int rc = -EIO;
+	struct jfs_sb_info *sbi;
+	struct logsuper *logsuper;
+	struct logpage *lp;
+	int lspn;		/* log sequence page number */
+	struct lrd *lrd_ptr;
+	int npages = 0;
+	struct lbuf *bp;
+
+	jfs_info("lmLogFormat: logAddress:%Ld logSize:%d",
+		 (long long)logAddress, logSize);
+
+	sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list);
+
+	/* allocate a log buffer */
+	bp = lbmAllocate(log, 1);
+
+	npages = logSize >> sbi->l2nbperpage;
+
+	/*
+	 *	log space:
+	 *
+	 * page 0 - reserved;
+	 * page 1 - log superblock;
+	 * page 2 - log data page: A SYNC log record is written
+	 *	    into this page at logform time;
+	 * pages 3-N - log data page: set to empty log data pages;
+	 */
+	/*
+	 *	init log superblock: log page 1
+	 */
+	logsuper = (struct logsuper *) bp->l_ldata;
+
+	logsuper->magic = cpu_to_le32(LOGMAGIC);
+	logsuper->version = cpu_to_le32(LOGVERSION);
+	logsuper->state = cpu_to_le32(LOGREDONE);
+	logsuper->flag = cpu_to_le32(sbi->mntflag);	/* ? */
+	logsuper->size = cpu_to_le32(npages);
+	logsuper->bsize = cpu_to_le32(sbi->bsize);
+	logsuper->l2bsize = cpu_to_le32(sbi->l2bsize);
+	logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE);
+
+	bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
+	bp->l_blkno = logAddress + sbi->nbperpage;
+	lbmStartIO(bp);
+	if ((rc = lbmIOWait(bp, 0)))
+		goto exit;
+
+	/*
+	 *	init pages 2 to npages-1 as log data pages:
+	 *
+	 * log page sequence number (lpsn) initialization:
+	 *
+	 * pn:   0     1     2     3                 n-1
+	 *       +-----+-----+=====+=====+===.....===+=====+
+	 * lspn:             N-1   0     1           N-2
+	 *                   <--- N page circular file ---->
+	 *
+	 * the N (= npages-2) data pages of the log is maintained as
+	 * a circular file for the log records;
+	 * lpsn grows by 1 monotonically as each log page is written
+	 * to the circular file of the log;
+	 * and setLogpage() will not reset the page number even if
+	 * the eor is equal to LOGPHDRSIZE. In order for binary search
+	 * still work in find log end process, we have to simulate the
+	 * log wrap situation at the log format time.
+	 * The 1st log page written will have the highest lpsn. Then
+	 * the succeeding log pages will have ascending order of
+	 * the lspn starting from 0, ... (N-2)
+	 */
+	lp = (struct logpage *) bp->l_ldata;
+	/*
+	 * initialize 1st log page to be written: lpsn = N - 1,
+	 * write a SYNCPT log record is written to this page
+	 */
+	lp->h.page = lp->t.page = cpu_to_le32(npages - 3);
+	lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE);
+
+	lrd_ptr = (struct lrd *) &lp->data;
+	lrd_ptr->logtid = 0;
+	lrd_ptr->backchain = 0;
+	lrd_ptr->type = cpu_to_le16(LOG_SYNCPT);
+	lrd_ptr->length = 0;
+	lrd_ptr->log.syncpt.sync = 0;
+
+	bp->l_blkno += sbi->nbperpage;
+	bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
+	lbmStartIO(bp);
+	if ((rc = lbmIOWait(bp, 0)))
+		goto exit;
+
+	/*
+	 *	initialize succeeding log pages: lpsn = 0, 1, ..., (N-2)
+	 */
+	for (lspn = 0; lspn < npages - 3; lspn++) {
+		lp->h.page = lp->t.page = cpu_to_le32(lspn);
+		lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
+
+		bp->l_blkno += sbi->nbperpage;
+		bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
+		lbmStartIO(bp);
+		if ((rc = lbmIOWait(bp, 0)))
+			goto exit;
+	}
+
+	rc = 0;
+exit:
+	/*
+	 *	finalize log
+	 */
+	/* release the buffer */
+	lbmFree(bp);
+
+	return rc;
+}
+
+#ifdef CONFIG_JFS_STATISTICS
+static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m,
+		       "JFS Logmgr stats\n"
+		       "================\n"
+		       "commits = %d\n"
+		       "writes submitted = %d\n"
+		       "writes completed = %d\n"
+		       "full pages submitted = %d\n"
+		       "partial pages submitted = %d\n",
+		       lmStat.commit,
+		       lmStat.submitted,
+		       lmStat.pagedone,
+		       lmStat.full_page,
+		       lmStat.partial_page);
+	return 0;
+}
+
+static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, jfs_lmstats_proc_show, NULL);
+}
+
+const struct file_operations jfs_lmstats_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= jfs_lmstats_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif /* CONFIG_JFS_STATISTICS */
diff --git a/kernel/fs/jfs/jfs_logmgr.h b/kernel/fs/jfs/jfs_logmgr.h
new file mode 100644
index 000000000..e38c21598
--- /dev/null
+++ b/kernel/fs/jfs/jfs_logmgr.h
@@ -0,0 +1,513 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef	_H_JFS_LOGMGR
+#define _H_JFS_LOGMGR
+
+#include "jfs_filsys.h"
+#include "jfs_lock.h"
+
+/*
+ *	log manager configuration parameters
+ */
+
+/* log page size */
+#define	LOGPSIZE	4096
+#define	L2LOGPSIZE	12
+
+#define LOGPAGES	16	/* Log pages per mounted file system */
+
+/*
+ *	log logical volume
+ *
+ * a log is used to make the commit operation on journalled
+ * files within the same logical volume group atomic.
+ * a log is implemented with a logical volume.
+ * there is one log per logical volume group.
+ *
+ * block 0 of the log logical volume is not used (ipl etc).
+ * block 1 contains a log "superblock" and is used by logFormat(),
+ * lmLogInit(), lmLogShutdown(), and logRedo() to record status
+ * of the log but is not otherwise used during normal processing.
+ * blocks 2 - (N-1) are used to contain log records.
+ *
+ * when a volume group is varied-on-line, logRedo() must have
+ * been executed before the file systems (logical volumes) in
+ * the volume group can be mounted.
+ */
+/*
+ *	log superblock (block 1 of logical volume)
+ */
+#define	LOGSUPER_B	1
+#define	LOGSTART_B	2
+
+#define	LOGMAGIC	0x87654321
+#define	LOGVERSION	1
+
+#define MAX_ACTIVE	128	/* Max active file systems sharing log */
+
+struct logsuper {
+	__le32 magic;		/* 4: log lv identifier */
+	__le32 version;		/* 4: version number */
+	__le32 serial;		/* 4: log open/mount counter */
+	__le32 size;		/* 4: size in number of LOGPSIZE blocks */
+	__le32 bsize;		/* 4: logical block size in byte */
+	__le32 l2bsize;		/* 4: log2 of bsize */
+
+	__le32 flag;		/* 4: option */
+	__le32 state;		/* 4: state - see below */
+
+	__le32 end;		/* 4: addr of last log record set by logredo */
+	char uuid[16];		/* 16: 128-bit journal uuid */
+	char label[16];		/* 16: journal label */
+	struct {
+		char uuid[16];
+	} active[MAX_ACTIVE];	/* 2048: active file systems list */
+};
+
+#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+/* log flag: commit option (see jfs_filsys.h) */
+
+/* log state */
+#define	LOGMOUNT	0	/* log mounted by lmLogInit() */
+#define LOGREDONE	1	/* log shutdown by lmLogShutdown().
+				 * log redo completed by logredo().
+				 */
+#define LOGWRAP		2	/* log wrapped */
+#define LOGREADERR	3	/* log read error detected in logredo() */
+
+
+/*
+ *	log logical page
+ *
+ * (this comment should be rewritten !)
+ * the header and trailer structures (h,t) will normally have
+ * the same page and eor value.
+ * An exception to this occurs when a complete page write is not
+ * accomplished on a power failure. Since the hardware may "split write"
+ * sectors in the page, any out of order sequence may occur during powerfail
+ * and needs to be recognized during log replay.  The xor value is
+ * an "exclusive or" of all log words in the page up to eor.  This
+ * 32 bit eor is stored with the top 16 bits in the header and the
+ * bottom 16 bits in the trailer.  logredo can easily recognize pages
+ * that were not completed by reconstructing this eor and checking
+ * the log page.
+ *
+ * Previous versions of the operating system did not allow split
+ * writes and detected partially written records in logredo by
+ * ordering the updates to the header, trailer, and the move of data
+ * into the logdata area.  The order: (1) data is moved (2) header
+ * is updated (3) trailer is updated.  In logredo, when the header
+ * differed from the trailer, the header and trailer were reconciled
+ * as follows: if h.page != t.page they were set to the smaller of
+ * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only)
+ * h.eor != t.eor they were set to the smaller of their two values.
+ */
+struct logpage {
+	struct {		/* header */
+		__le32 page;	/* 4: log sequence page number */
+		__le16 rsrvd;	/* 2: */
+		__le16 eor;	/* 2: end-of-log offset of lasrt record write */
+	} h;
+
+	__le32 data[LOGPSIZE / 4 - 4];	/* log record area */
+
+	struct {		/* trailer */
+		__le32 page;	/* 4: normally the same as h.page */
+		__le16 rsrvd;	/* 2: */
+		__le16 eor;	/* 2: normally the same as h.eor */
+	} t;
+};
+
+#define LOGPHDRSIZE	8	/* log page header size */
+#define LOGPTLRSIZE	8	/* log page trailer size */
+
+
+/*
+ *	log record
+ *
+ * (this comment should be rewritten !)
+ * jfs uses only "after" log records (only a single writer is allowed
+ * in a page, pages are written to temporary paging space if
+ * if they must be written to disk before commit, and i/o is
+ * scheduled for modified pages to their home location after
+ * the log records containing the after values and the commit
+ * record is written to the log on disk, undo discards the copy
+ * in main-memory.)
+ *
+ * a log record consists of a data area of variable length followed by
+ * a descriptor of fixed size LOGRDSIZE bytes.
+ * the data area is rounded up to an integral number of 4-bytes and
+ * must be no longer than LOGPSIZE.
+ * the descriptor is of size of multiple of 4-bytes and aligned on a
+ * 4-byte boundary.
+ * records are packed one after the other in the data area of log pages.
+ * (sometimes a DUMMY record is inserted so that at least one record ends
+ * on every page or the longest record is placed on at most two pages).
+ * the field eor in page header/trailer points to the byte following
+ * the last record on a page.
+ */
+
+/* log record types */
+#define LOG_COMMIT		0x8000
+#define LOG_SYNCPT		0x4000
+#define LOG_MOUNT		0x2000
+#define LOG_REDOPAGE		0x0800
+#define LOG_NOREDOPAGE		0x0080
+#define LOG_NOREDOINOEXT	0x0040
+#define LOG_UPDATEMAP		0x0008
+#define LOG_NOREDOFILE		0x0001
+
+/* REDOPAGE/NOREDOPAGE log record data type */
+#define	LOG_INODE		0x0001
+#define	LOG_XTREE		0x0002
+#define	LOG_DTREE		0x0004
+#define	LOG_BTROOT		0x0010
+#define	LOG_EA			0x0020
+#define	LOG_ACL			0x0040
+#define	LOG_DATA		0x0080
+#define	LOG_NEW			0x0100
+#define	LOG_EXTEND		0x0200
+#define LOG_RELOCATE		0x0400
+#define LOG_DIR_XTREE		0x0800	/* Xtree is in directory inode */
+
+/* UPDATEMAP log record descriptor type */
+#define	LOG_ALLOCXADLIST	0x0080
+#define	LOG_ALLOCPXDLIST	0x0040
+#define	LOG_ALLOCXAD		0x0020
+#define	LOG_ALLOCPXD		0x0010
+#define	LOG_FREEXADLIST		0x0008
+#define	LOG_FREEPXDLIST		0x0004
+#define	LOG_FREEXAD		0x0002
+#define	LOG_FREEPXD		0x0001
+
+
+struct lrd {
+	/*
+	 * type independent area
+	 */
+	__le32 logtid;		/* 4: log transaction identifier */
+	__le32 backchain;	/* 4: ptr to prev record of same transaction */
+	__le16 type;		/* 2: record type */
+	__le16 length;		/* 2: length of data in record (in byte) */
+	__le32 aggregate;	/* 4: file system lv/aggregate */
+	/* (16) */
+
+	/*
+	 * type dependent area (20)
+	 */
+	union {
+
+		/*
+		 *	COMMIT: commit
+		 *
+		 * transaction commit: no type-dependent information;
+		 */
+
+		/*
+		 *	REDOPAGE: after-image
+		 *
+		 * apply after-image;
+		 *
+		 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+		 */
+		struct {
+			__le32 fileset;	/* 4: fileset number */
+			__le32 inode;	/* 4: inode number */
+			__le16 type;	/* 2: REDOPAGE record type */
+			__le16 l2linesize;	/* 2: log2 of line size */
+			pxd_t pxd;	/* 8: on-disk page pxd */
+		} redopage;	/* (20) */
+
+		/*
+		 *	NOREDOPAGE: the page is freed
+		 *
+		 * do not apply after-image records which precede this record
+		 * in the log with the same page block number to this page.
+		 *
+		 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+		 */
+		struct {
+			__le32 fileset;	/* 4: fileset number */
+			__le32 inode;	/* 4: inode number */
+			__le16 type;	/* 2: NOREDOPAGE record type */
+			__le16 rsrvd;	/* 2: reserved */
+			pxd_t pxd;	/* 8: on-disk page pxd */
+		} noredopage;	/* (20) */
+
+		/*
+		 *	UPDATEMAP: update block allocation map
+		 *
+		 * either in-line PXD,
+		 * or     out-of-line  XADLIST;
+		 *
+		 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+		 */
+		struct {
+			__le32 fileset;	/* 4: fileset number */
+			__le32 inode;	/* 4: inode number */
+			__le16 type;	/* 2: UPDATEMAP record type */
+			__le16 nxd;	/* 2: number of extents */
+			pxd_t pxd;	/* 8: pxd */
+		} updatemap;	/* (20) */
+
+		/*
+		 *	NOREDOINOEXT: the inode extent is freed
+		 *
+		 * do not apply after-image records which precede this
+		 * record in the log with the any of the 4 page block
+		 * numbers in this inode extent.
+		 *
+		 * NOTE: The fileset and pxd fields MUST remain in
+		 *       the same fields in the REDOPAGE record format.
+		 *
+		 */
+		struct {
+			__le32 fileset;	/* 4: fileset number */
+			__le32 iagnum;	/* 4: IAG number     */
+			__le32 inoext_idx;	/* 4: inode extent index */
+			pxd_t pxd;	/* 8: on-disk page pxd */
+		} noredoinoext;	/* (20) */
+
+		/*
+		 *	SYNCPT: log sync point
+		 *
+		 * replay log up to syncpt address specified;
+		 */
+		struct {
+			__le32 sync;	/* 4: syncpt address (0 = here) */
+		} syncpt;
+
+		/*
+		 *	MOUNT: file system mount
+		 *
+		 * file system mount: no type-dependent information;
+		 */
+
+		/*
+		 *	? FREEXTENT: free specified extent(s)
+		 *
+		 * free specified extent(s) from block allocation map
+		 * N.B.: nextents should be length of data/sizeof(xad_t)
+		 */
+		struct {
+			__le32 type;	/* 4: FREEXTENT record type */
+			__le32 nextent;	/* 4: number of extents */
+
+			/* data: PXD or XAD list */
+		} freextent;
+
+		/*
+		 *	? NOREDOFILE: this file is freed
+		 *
+		 * do not apply records which precede this record in the log
+		 * with the same inode number.
+		 *
+		 * NOREDOFILE must be the first to be written at commit
+		 * (last to be read in logredo()) - it prevents
+		 * replay of preceding updates of all preceding generations
+		 * of the inumber esp. the on-disk inode itself.
+		 */
+		struct {
+			__le32 fileset;	/* 4: fileset number */
+			__le32 inode;	/* 4: inode number */
+		} noredofile;
+
+		/*
+		 *	? NEWPAGE:
+		 *
+		 * metadata type dependent
+		 */
+		struct {
+			__le32 fileset;	/* 4: fileset number */
+			__le32 inode;	/* 4: inode number */
+			__le32 type;	/* 4: NEWPAGE record type */
+			pxd_t pxd;	/* 8: on-disk page pxd */
+		} newpage;
+
+		/*
+		 *	? DUMMY: filler
+		 *
+		 * no type-dependent information
+		 */
+	} log;
+};					/* (36) */
+
+#define	LOGRDSIZE	(sizeof(struct lrd))
+
+/*
+ *	line vector descriptor
+ */
+struct lvd {
+	__le16 offset;
+	__le16 length;
+};
+
+
+/*
+ *	log logical volume
+ */
+struct jfs_log {
+
+	struct list_head sb_list;/*  This is used to sync metadata
+				 *    before writing syncpt.
+				 */
+	struct list_head journal_list; /* Global list */
+	struct block_device *bdev; /* 4: log lv pointer */
+	int serial;		/* 4: log mount serial number */
+
+	s64 base;		/* @8: log extent address (inline log ) */
+	int size;		/* 4: log size in log page (in page) */
+	int l2bsize;		/* 4: log2 of bsize */
+
+	unsigned long flag;	/* 4: flag */
+
+	struct lbuf *lbuf_free;	/* 4: free lbufs */
+	wait_queue_head_t free_wait;	/* 4: */
+
+	/* log write */
+	int logtid;		/* 4: log tid */
+	int page;		/* 4: page number of eol page */
+	int eor;		/* 4: eor of last record in eol page */
+	struct lbuf *bp;	/* 4: current log page buffer */
+
+	struct mutex loglock;	/* 4: log write serialization lock */
+
+	/* syncpt */
+	int nextsync;		/* 4: bytes to write before next syncpt */
+	int active;		/* 4: */
+	wait_queue_head_t syncwait;	/* 4: */
+
+	/* commit */
+	uint cflag;		/* 4: */
+	struct list_head cqueue; /* FIFO commit queue */
+	struct tblock *flush_tblk; /* tblk we're waiting on for flush */
+	int gcrtc;		/* 4: GC_READY transaction count */
+	struct tblock *gclrt;	/* 4: latest GC_READY transaction */
+	spinlock_t gclock;	/* 4: group commit lock */
+	int logsize;		/* 4: log data area size in byte */
+	int lsn;		/* 4: end-of-log */
+	int clsn;		/* 4: clsn */
+	int syncpt;		/* 4: addr of last syncpt record */
+	int sync;		/* 4: addr from last logsync() */
+	struct list_head synclist;	/* 8: logsynclist anchor */
+	spinlock_t synclock;	/* 4: synclist lock */
+	struct lbuf *wqueue;	/* 4: log pageout queue */
+	int count;		/* 4: count */
+	char uuid[16];		/* 16: 128-bit uuid of log device */
+
+	int no_integrity;	/* 3: flag to disable journaling to disk */
+};
+
+/*
+ * Log flag
+ */
+#define log_INLINELOG	1
+#define log_SYNCBARRIER	2
+#define log_QUIESCE	3
+#define log_FLUSH	4
+
+/*
+ * group commit flag
+ */
+/* jfs_log */
+#define logGC_PAGEOUT	0x00000001
+
+/* tblock/lbuf */
+#define tblkGC_QUEUE		0x0001
+#define tblkGC_READY		0x0002
+#define tblkGC_COMMIT		0x0004
+#define tblkGC_COMMITTED	0x0008
+#define tblkGC_EOP		0x0010
+#define tblkGC_FREE		0x0020
+#define tblkGC_LEADER		0x0040
+#define tblkGC_ERROR		0x0080
+#define tblkGC_LAZY		0x0100	// D230860
+#define tblkGC_UNLOCKED		0x0200	// D230860
+
+/*
+ *		log cache buffer header
+ */
+struct lbuf {
+	struct jfs_log *l_log;	/* 4: log associated with buffer */
+
+	/*
+	 * data buffer base area
+	 */
+	uint l_flag;		/* 4: pageout control flags */
+
+	struct lbuf *l_wqnext;	/* 4: write queue link */
+	struct lbuf *l_freelist;	/* 4: freelistlink */
+
+	int l_pn;		/* 4: log page number */
+	int l_eor;		/* 4: log record eor */
+	int l_ceor;		/* 4: committed log record eor */
+
+	s64 l_blkno;		/* 8: log page block number */
+	caddr_t l_ldata;	/* 4: data page */
+	struct page *l_page;	/* The page itself */
+	uint l_offset;		/* Offset of l_ldata within the page */
+
+	wait_queue_head_t l_ioevent;	/* 4: i/o done event */
+};
+
+/* Reuse l_freelist for redrive list */
+#define l_redrive_next l_freelist
+
+/*
+ *	logsynclist block
+ *
+ * common logsyncblk prefix for jbuf_t and tblock
+ */
+struct logsyncblk {
+	u16 xflag;		/* flags */
+	u16 flag;		/* only meaninful in tblock */
+	lid_t lid;		/* lock id */
+	s32 lsn;		/* log sequence number */
+	struct list_head synclist;	/* log sync list link */
+};
+
+/*
+ *	logsynclist serialization (per log)
+ */
+
+#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock)
+#define LOGSYNC_LOCK(log, flags) spin_lock_irqsave(&(log)->synclock, flags)
+#define LOGSYNC_UNLOCK(log, flags) \
+	spin_unlock_irqrestore(&(log)->synclock, flags)
+
+/* compute the difference in bytes of lsn from sync point */
+#define logdiff(diff, lsn, log)\
+{\
+	diff = (lsn) - (log)->syncpt;\
+	if (diff < 0)\
+		diff += (log)->logsize;\
+}
+
+extern int lmLogOpen(struct super_block *sb);
+extern int lmLogClose(struct super_block *sb);
+extern int lmLogShutdown(struct jfs_log * log);
+extern int lmLogInit(struct jfs_log * log);
+extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize);
+extern int lmGroupCommit(struct jfs_log *, struct tblock *);
+extern int jfsIOWait(void *);
+extern void jfs_flush_journal(struct jfs_log * log, int wait);
+extern void jfs_syncpt(struct jfs_log *log, int hard_sync);
+
+#endif				/* _H_JFS_LOGMGR */
diff --git a/kernel/fs/jfs/jfs_metapage.c b/kernel/fs/jfs/jfs_metapage.c
new file mode 100644
index 000000000..16a0922be
--- /dev/null
+++ b/kernel/fs/jfs/jfs_metapage.c
@@ -0,0 +1,837 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2005
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/buffer_head.h>
+#include <linux/mempool.h>
+#include <linux/seq_file.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+
+#ifdef CONFIG_JFS_STATISTICS
+static struct {
+	uint	pagealloc;	/* # of page allocations */
+	uint	pagefree;	/* # of page frees */
+	uint	lockwait;	/* # of sleeping lock_metapage() calls */
+} mpStat;
+#endif
+
+#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag)
+#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag)
+
+static inline void unlock_metapage(struct metapage *mp)
+{
+	clear_bit_unlock(META_locked, &mp->flag);
+	wake_up(&mp->wait);
+}
+
+static inline void __lock_metapage(struct metapage *mp)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	INCREMENT(mpStat.lockwait);
+	add_wait_queue_exclusive(&mp->wait, &wait);
+	do {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (metapage_locked(mp)) {
+			unlock_page(mp->page);
+			io_schedule();
+			lock_page(mp->page);
+		}
+	} while (trylock_metapage(mp));
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&mp->wait, &wait);
+}
+
+/*
+ * Must have mp->page locked
+ */
+static inline void lock_metapage(struct metapage *mp)
+{
+	if (trylock_metapage(mp))
+		__lock_metapage(mp);
+}
+
+#define METAPOOL_MIN_PAGES 32
+static struct kmem_cache *metapage_cache;
+static mempool_t *metapage_mempool;
+
+#define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE)
+
+#if MPS_PER_PAGE > 1
+
+struct meta_anchor {
+	int mp_count;
+	atomic_t io_count;
+	struct metapage *mp[MPS_PER_PAGE];
+};
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
+
+static inline struct metapage *page_to_mp(struct page *page, int offset)
+{
+	if (!PagePrivate(page))
+		return NULL;
+	return mp_anchor(page)->mp[offset >> L2PSIZE];
+}
+
+static inline int insert_metapage(struct page *page, struct metapage *mp)
+{
+	struct meta_anchor *a;
+	int index;
+	int l2mp_blocks;	/* log2 blocks per metapage */
+
+	if (PagePrivate(page))
+		a = mp_anchor(page);
+	else {
+		a = kzalloc(sizeof(struct meta_anchor), GFP_NOFS);
+		if (!a)
+			return -ENOMEM;
+		set_page_private(page, (unsigned long)a);
+		SetPagePrivate(page);
+		kmap(page);
+	}
+
+	if (mp) {
+		l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits;
+		index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1);
+		a->mp_count++;
+		a->mp[index] = mp;
+	}
+
+	return 0;
+}
+
+static inline void remove_metapage(struct page *page, struct metapage *mp)
+{
+	struct meta_anchor *a = mp_anchor(page);
+	int l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits;
+	int index;
+
+	index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1);
+
+	BUG_ON(a->mp[index] != mp);
+
+	a->mp[index] = NULL;
+	if (--a->mp_count == 0) {
+		kfree(a);
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		kunmap(page);
+	}
+}
+
+static inline void inc_io(struct page *page)
+{
+	atomic_inc(&mp_anchor(page)->io_count);
+}
+
+static inline void dec_io(struct page *page, void (*handler) (struct page *))
+{
+	if (atomic_dec_and_test(&mp_anchor(page)->io_count))
+		handler(page);
+}
+
+#else
+static inline struct metapage *page_to_mp(struct page *page, int offset)
+{
+	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
+}
+
+static inline int insert_metapage(struct page *page, struct metapage *mp)
+{
+	if (mp) {
+		set_page_private(page, (unsigned long)mp);
+		SetPagePrivate(page);
+		kmap(page);
+	}
+	return 0;
+}
+
+static inline void remove_metapage(struct page *page, struct metapage *mp)
+{
+	set_page_private(page, 0);
+	ClearPagePrivate(page);
+	kunmap(page);
+}
+
+#define inc_io(page) do {} while(0)
+#define dec_io(page, handler) handler(page)
+
+#endif
+
+static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
+{
+	struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask);
+
+	if (mp) {
+		mp->lid = 0;
+		mp->lsn = 0;
+		mp->data = NULL;
+		mp->clsn = 0;
+		mp->log = NULL;
+		init_waitqueue_head(&mp->wait);
+	}
+	return mp;
+}
+
+static inline void free_metapage(struct metapage *mp)
+{
+	mempool_free(mp, metapage_mempool);
+}
+
+int __init metapage_init(void)
+{
+	/*
+	 * Allocate the metapage structures
+	 */
+	metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage),
+					   0, 0, NULL);
+	if (metapage_cache == NULL)
+		return -ENOMEM;
+
+	metapage_mempool = mempool_create_slab_pool(METAPOOL_MIN_PAGES,
+						    metapage_cache);
+
+	if (metapage_mempool == NULL) {
+		kmem_cache_destroy(metapage_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void metapage_exit(void)
+{
+	mempool_destroy(metapage_mempool);
+	kmem_cache_destroy(metapage_cache);
+}
+
+static inline void drop_metapage(struct page *page, struct metapage *mp)
+{
+	if (mp->count || mp->nohomeok || test_bit(META_dirty, &mp->flag) ||
+	    test_bit(META_io, &mp->flag))
+		return;
+	remove_metapage(page, mp);
+	INCREMENT(mpStat.pagefree);
+	free_metapage(mp);
+}
+
+/*
+ * Metapage address space operations
+ */
+
+static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
+				    int *len)
+{
+	int rc = 0;
+	int xflag;
+	s64 xaddr;
+	sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+			       inode->i_blkbits;
+
+	if (lblock >= file_blocks)
+		return 0;
+	if (lblock + *len > file_blocks)
+		*len = file_blocks - lblock;
+
+	if (inode->i_ino) {
+		rc = xtLookup(inode, (s64)lblock, *len, &xflag, &xaddr, len, 0);
+		if ((rc == 0) && *len)
+			lblock = (sector_t)xaddr;
+		else
+			lblock = 0;
+	} /* else no mapping */
+
+	return lblock;
+}
+
+static void last_read_complete(struct page *page)
+{
+	if (!PageError(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+}
+
+static void metapage_read_end_io(struct bio *bio, int err)
+{
+	struct page *page = bio->bi_private;
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		printk(KERN_ERR "metapage_read_end_io: I/O error\n");
+		SetPageError(page);
+	}
+
+	dec_io(page, last_read_complete);
+	bio_put(bio);
+}
+
+static void remove_from_logsync(struct metapage *mp)
+{
+	struct jfs_log *log = mp->log;
+	unsigned long flags;
+/*
+ * This can race.  Recheck that log hasn't been set to null, and after
+ * acquiring logsync lock, recheck lsn
+ */
+	if (!log)
+		return;
+
+	LOGSYNC_LOCK(log, flags);
+	if (mp->lsn) {
+		mp->log = NULL;
+		mp->lsn = 0;
+		mp->clsn = 0;
+		log->count--;
+		list_del(&mp->synclist);
+	}
+	LOGSYNC_UNLOCK(log, flags);
+}
+
+static void last_write_complete(struct page *page)
+{
+	struct metapage *mp;
+	unsigned int offset;
+
+	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+		mp = page_to_mp(page, offset);
+		if (mp && test_bit(META_io, &mp->flag)) {
+			if (mp->lsn)
+				remove_from_logsync(mp);
+			clear_bit(META_io, &mp->flag);
+		}
+		/*
+		 * I'd like to call drop_metapage here, but I don't think it's
+		 * safe unless I have the page locked
+		 */
+	}
+	end_page_writeback(page);
+}
+
+static void metapage_write_end_io(struct bio *bio, int err)
+{
+	struct page *page = bio->bi_private;
+
+	BUG_ON(!PagePrivate(page));
+
+	if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		printk(KERN_ERR "metapage_write_end_io: I/O error\n");
+		SetPageError(page);
+	}
+	dec_io(page, last_write_complete);
+	bio_put(bio);
+}
+
+static int metapage_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct bio *bio = NULL;
+	int block_offset;	/* block offset of mp within page */
+	struct inode *inode = page->mapping->host;
+	int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
+	int len;
+	int xlen;
+	struct metapage *mp;
+	int redirty = 0;
+	sector_t lblock;
+	int nr_underway = 0;
+	sector_t pblock;
+	sector_t next_block = 0;
+	sector_t page_start;
+	unsigned long bio_bytes = 0;
+	unsigned long bio_offset = 0;
+	int offset;
+	int bad_blocks = 0;
+
+	page_start = (sector_t)page->index <<
+		     (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+		mp = page_to_mp(page, offset);
+
+		if (!mp || !test_bit(META_dirty, &mp->flag))
+			continue;
+
+		if (mp->nohomeok && !test_bit(META_forcewrite, &mp->flag)) {
+			redirty = 1;
+			/*
+			 * Make sure this page isn't blocked indefinitely.
+			 * If the journal isn't undergoing I/O, push it
+			 */
+			if (mp->log && !(mp->log->cflag & logGC_PAGEOUT))
+				jfs_flush_journal(mp->log, 0);
+			continue;
+		}
+
+		clear_bit(META_dirty, &mp->flag);
+		set_bit(META_io, &mp->flag);
+		block_offset = offset >> inode->i_blkbits;
+		lblock = page_start + block_offset;
+		if (bio) {
+			if (xlen && lblock == next_block) {
+				/* Contiguous, in memory & on disk */
+				len = min(xlen, blocks_per_mp);
+				xlen -= len;
+				bio_bytes += len << inode->i_blkbits;
+				continue;
+			}
+			/* Not contiguous */
+			if (bio_add_page(bio, page, bio_bytes, bio_offset) <
+			    bio_bytes)
+				goto add_failed;
+			/*
+			 * Increment counter before submitting i/o to keep
+			 * count from hitting zero before we're through
+			 */
+			inc_io(page);
+			if (!bio->bi_iter.bi_size)
+				goto dump_bio;
+			submit_bio(WRITE, bio);
+			nr_underway++;
+			bio = NULL;
+		} else
+			inc_io(page);
+		xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
+		pblock = metapage_get_blocks(inode, lblock, &xlen);
+		if (!pblock) {
+			printk(KERN_ERR "JFS: metapage_get_blocks failed\n");
+			/*
+			 * We already called inc_io(), but can't cancel it
+			 * with dec_io() until we're done with the page
+			 */
+			bad_blocks++;
+			continue;
+		}
+		len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
+
+		bio = bio_alloc(GFP_NOFS, 1);
+		bio->bi_bdev = inode->i_sb->s_bdev;
+		bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
+		bio->bi_end_io = metapage_write_end_io;
+		bio->bi_private = page;
+
+		/* Don't call bio_add_page yet, we may add to this vec */
+		bio_offset = offset;
+		bio_bytes = len << inode->i_blkbits;
+
+		xlen -= len;
+		next_block = lblock + len;
+	}
+	if (bio) {
+		if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes)
+				goto add_failed;
+		if (!bio->bi_iter.bi_size)
+			goto dump_bio;
+
+		submit_bio(WRITE, bio);
+		nr_underway++;
+	}
+	if (redirty)
+		redirty_page_for_writepage(wbc, page);
+
+	unlock_page(page);
+
+	if (bad_blocks)
+		goto err_out;
+
+	if (nr_underway == 0)
+		end_page_writeback(page);
+
+	return 0;
+add_failed:
+	/* We should never reach here, since we're only adding one vec */
+	printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n");
+	goto skip;
+dump_bio:
+	print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16,
+		       4, bio, sizeof(*bio), 0);
+skip:
+	bio_put(bio);
+	unlock_page(page);
+	dec_io(page, last_write_complete);
+err_out:
+	while (bad_blocks--)
+		dec_io(page, last_write_complete);
+	return -EIO;
+}
+
+static int metapage_readpage(struct file *fp, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct bio *bio = NULL;
+	int block_offset;
+	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+	sector_t page_start;	/* address of page in fs blocks */
+	sector_t pblock;
+	int xlen;
+	unsigned int len;
+	int offset;
+
+	BUG_ON(!PageLocked(page));
+	page_start = (sector_t)page->index <<
+		     (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	block_offset = 0;
+	while (block_offset < blocks_per_page) {
+		xlen = blocks_per_page - block_offset;
+		pblock = metapage_get_blocks(inode, page_start + block_offset,
+					     &xlen);
+		if (pblock) {
+			if (!PagePrivate(page))
+				insert_metapage(page, NULL);
+			inc_io(page);
+			if (bio)
+				submit_bio(READ, bio);
+
+			bio = bio_alloc(GFP_NOFS, 1);
+			bio->bi_bdev = inode->i_sb->s_bdev;
+			bio->bi_iter.bi_sector =
+				pblock << (inode->i_blkbits - 9);
+			bio->bi_end_io = metapage_read_end_io;
+			bio->bi_private = page;
+			len = xlen << inode->i_blkbits;
+			offset = block_offset << inode->i_blkbits;
+			if (bio_add_page(bio, page, len, offset) < len)
+				goto add_failed;
+			block_offset += xlen;
+		} else
+			block_offset++;
+	}
+	if (bio)
+		submit_bio(READ, bio);
+	else
+		unlock_page(page);
+
+	return 0;
+
+add_failed:
+	printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n");
+	bio_put(bio);
+	dec_io(page, last_read_complete);
+	return -EIO;
+}
+
+static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	struct metapage *mp;
+	int ret = 1;
+	int offset;
+
+	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+		mp = page_to_mp(page, offset);
+
+		if (!mp)
+			continue;
+
+		jfs_info("metapage_releasepage: mp = 0x%p", mp);
+		if (mp->count || mp->nohomeok ||
+		    test_bit(META_dirty, &mp->flag)) {
+			jfs_info("count = %ld, nohomeok = %d", mp->count,
+				 mp->nohomeok);
+			ret = 0;
+			continue;
+		}
+		if (mp->lsn)
+			remove_from_logsync(mp);
+		remove_metapage(page, mp);
+		INCREMENT(mpStat.pagefree);
+		free_metapage(mp);
+	}
+	return ret;
+}
+
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
+{
+	BUG_ON(offset || length < PAGE_CACHE_SIZE);
+
+	BUG_ON(PageWriteback(page));
+
+	metapage_releasepage(page, 0);
+}
+
+const struct address_space_operations jfs_metapage_aops = {
+	.readpage	= metapage_readpage,
+	.writepage	= metapage_writepage,
+	.releasepage	= metapage_releasepage,
+	.invalidatepage	= metapage_invalidatepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+};
+
+struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
+				unsigned int size, int absolute,
+				unsigned long new)
+{
+	int l2BlocksPerPage;
+	int l2bsize;
+	struct address_space *mapping;
+	struct metapage *mp = NULL;
+	struct page *page;
+	unsigned long page_index;
+	unsigned long page_offset;
+
+	jfs_info("__get_metapage: ino = %ld, lblock = 0x%lx, abs=%d",
+		 inode->i_ino, lblock, absolute);
+
+	l2bsize = inode->i_blkbits;
+	l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
+	page_index = lblock >> l2BlocksPerPage;
+	page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize;
+	if ((page_offset + size) > PAGE_CACHE_SIZE) {
+		jfs_err("MetaData crosses page boundary!!");
+		jfs_err("lblock = %lx, size  = %d", lblock, size);
+		dump_stack();
+		return NULL;
+	}
+	if (absolute)
+		mapping = JFS_SBI(inode->i_sb)->direct_inode->i_mapping;
+	else {
+		/*
+		 * If an nfs client tries to read an inode that is larger
+		 * than any existing inodes, we may try to read past the
+		 * end of the inode map
+		 */
+		if ((lblock << inode->i_blkbits) >= inode->i_size)
+			return NULL;
+		mapping = inode->i_mapping;
+	}
+
+	if (new && (PSIZE == PAGE_CACHE_SIZE)) {
+		page = grab_cache_page(mapping, page_index);
+		if (!page) {
+			jfs_err("grab_cache_page failed!");
+			return NULL;
+		}
+		SetPageUptodate(page);
+	} else {
+		page = read_mapping_page(mapping, page_index, NULL);
+		if (IS_ERR(page) || !PageUptodate(page)) {
+			jfs_err("read_mapping_page failed!");
+			return NULL;
+		}
+		lock_page(page);
+	}
+
+	mp = page_to_mp(page, page_offset);
+	if (mp) {
+		if (mp->logical_size != size) {
+			jfs_error(inode->i_sb,
+				  "get_mp->logical_size != size\n");
+			jfs_err("logical_size = %d, size = %d",
+				mp->logical_size, size);
+			dump_stack();
+			goto unlock;
+		}
+		mp->count++;
+		lock_metapage(mp);
+		if (test_bit(META_discard, &mp->flag)) {
+			if (!new) {
+				jfs_error(inode->i_sb,
+					  "using a discarded metapage\n");
+				discard_metapage(mp);
+				goto unlock;
+			}
+			clear_bit(META_discard, &mp->flag);
+		}
+	} else {
+		INCREMENT(mpStat.pagealloc);
+		mp = alloc_metapage(GFP_NOFS);
+		mp->page = page;
+		mp->flag = 0;
+		mp->xflag = COMMIT_PAGE;
+		mp->count = 1;
+		mp->nohomeok = 0;
+		mp->logical_size = size;
+		mp->data = page_address(page) + page_offset;
+		mp->index = lblock;
+		if (unlikely(insert_metapage(page, mp))) {
+			free_metapage(mp);
+			goto unlock;
+		}
+		lock_metapage(mp);
+	}
+
+	if (new) {
+		jfs_info("zeroing mp = 0x%p", mp);
+		memset(mp->data, 0, PSIZE);
+	}
+
+	unlock_page(page);
+	jfs_info("__get_metapage: returning = 0x%p data = 0x%p", mp, mp->data);
+	return mp;
+
+unlock:
+	unlock_page(page);
+	return NULL;
+}
+
+void grab_metapage(struct metapage * mp)
+{
+	jfs_info("grab_metapage: mp = 0x%p", mp);
+	page_cache_get(mp->page);
+	lock_page(mp->page);
+	mp->count++;
+	lock_metapage(mp);
+	unlock_page(mp->page);
+}
+
+void force_metapage(struct metapage *mp)
+{
+	struct page *page = mp->page;
+	jfs_info("force_metapage: mp = 0x%p", mp);
+	set_bit(META_forcewrite, &mp->flag);
+	clear_bit(META_sync, &mp->flag);
+	page_cache_get(page);
+	lock_page(page);
+	set_page_dirty(page);
+	write_one_page(page, 1);
+	clear_bit(META_forcewrite, &mp->flag);
+	page_cache_release(page);
+}
+
+void hold_metapage(struct metapage *mp)
+{
+	lock_page(mp->page);
+}
+
+void put_metapage(struct metapage *mp)
+{
+	if (mp->count || mp->nohomeok) {
+		/* Someone else will release this */
+		unlock_page(mp->page);
+		return;
+	}
+	page_cache_get(mp->page);
+	mp->count++;
+	lock_metapage(mp);
+	unlock_page(mp->page);
+	release_metapage(mp);
+}
+
+void release_metapage(struct metapage * mp)
+{
+	struct page *page = mp->page;
+	jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag);
+
+	BUG_ON(!page);
+
+	lock_page(page);
+	unlock_metapage(mp);
+
+	assert(mp->count);
+	if (--mp->count || mp->nohomeok) {
+		unlock_page(page);
+		page_cache_release(page);
+		return;
+	}
+
+	if (test_bit(META_dirty, &mp->flag)) {
+		set_page_dirty(page);
+		if (test_bit(META_sync, &mp->flag)) {
+			clear_bit(META_sync, &mp->flag);
+			write_one_page(page, 1);
+			lock_page(page); /* write_one_page unlocks the page */
+		}
+	} else if (mp->lsn)	/* discard_metapage doesn't remove it */
+		remove_from_logsync(mp);
+
+	/* Try to keep metapages from using up too much memory */
+	drop_metapage(page, mp);
+
+	unlock_page(page);
+	page_cache_release(page);
+}
+
+void __invalidate_metapages(struct inode *ip, s64 addr, int len)
+{
+	sector_t lblock;
+	int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits;
+	int BlocksPerPage = 1 << l2BlocksPerPage;
+	/* All callers are interested in block device's mapping */
+	struct address_space *mapping =
+		JFS_SBI(ip->i_sb)->direct_inode->i_mapping;
+	struct metapage *mp;
+	struct page *page;
+	unsigned int offset;
+
+	/*
+	 * Mark metapages to discard.  They will eventually be
+	 * released, but should not be written.
+	 */
+	for (lblock = addr & ~(BlocksPerPage - 1); lblock < addr + len;
+	     lblock += BlocksPerPage) {
+		page = find_lock_page(mapping, lblock >> l2BlocksPerPage);
+		if (!page)
+			continue;
+		for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+			mp = page_to_mp(page, offset);
+			if (!mp)
+				continue;
+			if (mp->index < addr)
+				continue;
+			if (mp->index >= addr + len)
+				break;
+
+			clear_bit(META_dirty, &mp->flag);
+			set_bit(META_discard, &mp->flag);
+			if (mp->lsn)
+				remove_from_logsync(mp);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+}
+
+#ifdef CONFIG_JFS_STATISTICS
+static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m,
+		       "JFS Metapage statistics\n"
+		       "=======================\n"
+		       "page allocations = %d\n"
+		       "page frees = %d\n"
+		       "lock waits = %d\n",
+		       mpStat.pagealloc,
+		       mpStat.pagefree,
+		       mpStat.lockwait);
+	return 0;
+}
+
+static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, jfs_mpstat_proc_show, NULL);
+}
+
+const struct file_operations jfs_mpstat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= jfs_mpstat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
diff --git a/kernel/fs/jfs/jfs_metapage.h b/kernel/fs/jfs/jfs_metapage.h
new file mode 100644
index 000000000..337e9e51a
--- /dev/null
+++ b/kernel/fs/jfs/jfs_metapage.h
@@ -0,0 +1,154 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef	_H_JFS_METAPAGE
+#define _H_JFS_METAPAGE
+
+#include <linux/pagemap.h>
+
+struct metapage {
+	/* Common logsyncblk prefix (see jfs_logmgr.h) */
+	u16 xflag;
+	u16 unused;
+	lid_t lid;
+	int lsn;
+	struct list_head synclist;
+	/* End of logsyncblk prefix */
+
+	unsigned long flag;	/* See Below */
+	unsigned long count;	/* Reference count */
+	void *data;		/* Data pointer */
+	sector_t index;		/* block address of page */
+	wait_queue_head_t wait;
+
+	/* implementation */
+	struct page *page;
+	unsigned int logical_size;
+
+	/* Journal management */
+	int clsn;
+	int nohomeok;
+	struct jfs_log *log;
+};
+
+/* metapage flag */
+#define META_locked	0
+#define META_dirty	2
+#define META_sync	3
+#define META_discard	4
+#define META_forcewrite	5
+#define META_io		6
+
+#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag)
+
+/* function prototypes */
+extern int metapage_init(void);
+extern void metapage_exit(void);
+extern struct metapage *__get_metapage(struct inode *inode,
+				  unsigned long lblock, unsigned int size,
+				  int absolute, unsigned long new);
+
+#define read_metapage(inode, lblock, size, absolute)\
+	 __get_metapage(inode, lblock, size, absolute, false)
+
+#define get_metapage(inode, lblock, size, absolute)\
+	 __get_metapage(inode, lblock, size, absolute, true)
+
+extern void release_metapage(struct metapage *);
+extern void grab_metapage(struct metapage *);
+extern void force_metapage(struct metapage *);
+
+/*
+ * hold_metapage and put_metapage are used in conjunction.  The page lock
+ * is not dropped between the two, so no other threads can get or release
+ * the metapage
+ */
+extern void hold_metapage(struct metapage *);
+extern void put_metapage(struct metapage *);
+
+static inline void write_metapage(struct metapage *mp)
+{
+	set_bit(META_dirty, &mp->flag);
+	release_metapage(mp);
+}
+
+static inline void flush_metapage(struct metapage *mp)
+{
+	set_bit(META_sync, &mp->flag);
+	write_metapage(mp);
+}
+
+static inline void discard_metapage(struct metapage *mp)
+{
+	clear_bit(META_dirty, &mp->flag);
+	set_bit(META_discard, &mp->flag);
+	release_metapage(mp);
+}
+
+static inline void metapage_nohomeok(struct metapage *mp)
+{
+	struct page *page = mp->page;
+	lock_page(page);
+	if (!mp->nohomeok++) {
+		mark_metapage_dirty(mp);
+		page_cache_get(page);
+		wait_on_page_writeback(page);
+	}
+	unlock_page(page);
+}
+
+/*
+ * This serializes access to mp->lsn when metapages are added to logsynclist
+ * without setting nohomeok.  i.e. updating imap & dmap
+ */
+static inline void metapage_wait_for_io(struct metapage *mp)
+{
+	if (test_bit(META_io, &mp->flag))
+		wait_on_page_writeback(mp->page);
+}
+
+/*
+ * This is called when already holding the metapage
+ */
+static inline void _metapage_homeok(struct metapage *mp)
+{
+	if (!--mp->nohomeok)
+		page_cache_release(mp->page);
+}
+
+static inline void metapage_homeok(struct metapage *mp)
+{
+	hold_metapage(mp);
+	_metapage_homeok(mp);
+	put_metapage(mp);
+}
+
+extern const struct address_space_operations jfs_metapage_aops;
+
+/*
+ * This routines invalidate all pages for an extent.
+ */
+extern void __invalidate_metapages(struct inode *, s64, int);
+#define invalidate_pxd_metapages(ip, pxd) \
+	__invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd)))
+#define invalidate_dxd_metapages(ip, dxd) \
+	__invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd)))
+#define invalidate_xad_metapages(ip, xad) \
+	__invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad)))
+
+#endif				/* _H_JFS_METAPAGE */
diff --git a/kernel/fs/jfs/jfs_mount.c b/kernel/fs/jfs/jfs_mount.c
new file mode 100644
index 000000000..9895595fd
--- /dev/null
+++ b/kernel/fs/jfs/jfs_mount.c
@@ -0,0 +1,507 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * Module: jfs_mount.c
+ *
+ * note: file system in transition to aggregate/fileset:
+ *
+ * file system mount is interpreted as the mount of aggregate,
+ * if not already mounted, and mount of the single/only fileset in
+ * the aggregate;
+ *
+ * a file system/aggregate is represented by an internal inode
+ * (aka mount inode) initialized with aggregate superblock;
+ * each vfs represents a fileset, and points to its "fileset inode
+ * allocation map inode" (aka fileset inode):
+ * (an aggregate itself is structured recursively as a filset:
+ * an internal vfs is constructed and points to its "fileset inode
+ * allocation map inode" (aka aggregate inode) where each inode
+ * represents a fileset inode) so that inode number is mapped to
+ * on-disk inode in uniform way at both aggregate and fileset level;
+ *
+ * each vnode/inode of a fileset is linked to its vfs (to facilitate
+ * per fileset inode operations, e.g., unmount of a fileset, etc.);
+ * each inode points to the mount inode (to facilitate access to
+ * per aggregate information, e.g., block size, etc.) as well as
+ * its file set inode.
+ *
+ *   aggregate
+ *   ipmnt
+ *   mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap;
+ *             fileset vfs     -> vp(1) <-> ... <-> vp(n) <->vproot;
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+
+
+/*
+ * forward references
+ */
+static int chkSuper(struct super_block *);
+static int logMOUNT(struct super_block *sb);
+
+/*
+ * NAME:	jfs_mount(sb)
+ *
+ * FUNCTION:	vfs_mount()
+ *
+ * PARAMETER:	sb	- super block
+ *
+ * RETURN:	-EBUSY	- device already mounted or open for write
+ *		-EBUSY	- cvrdvp already mounted;
+ *		-EBUSY	- mount table full
+ *		-ENOTDIR- cvrdvp not directory on a device mount
+ *		-ENXIO	- device open failure
+ */
+int jfs_mount(struct super_block *sb)
+{
+	int rc = 0;		/* Return code */
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct inode *ipaimap = NULL;
+	struct inode *ipaimap2 = NULL;
+	struct inode *ipimap = NULL;
+	struct inode *ipbmap = NULL;
+
+	/*
+	 * read/validate superblock
+	 * (initialize mount inode from the superblock)
+	 */
+	if ((rc = chkSuper(sb))) {
+		goto errout20;
+	}
+
+	ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
+	if (ipaimap == NULL) {
+		jfs_err("jfs_mount: Failed to read AGGREGATE_I");
+		rc = -EIO;
+		goto errout20;
+	}
+	sbi->ipaimap = ipaimap;
+
+	jfs_info("jfs_mount: ipaimap:0x%p", ipaimap);
+
+	/*
+	 * initialize aggregate inode allocation map
+	 */
+	if ((rc = diMount(ipaimap))) {
+		jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc);
+		goto errout21;
+	}
+
+	/*
+	 * open aggregate block allocation map
+	 */
+	ipbmap = diReadSpecial(sb, BMAP_I, 0);
+	if (ipbmap == NULL) {
+		rc = -EIO;
+		goto errout22;
+	}
+
+	jfs_info("jfs_mount: ipbmap:0x%p", ipbmap);
+
+	sbi->ipbmap = ipbmap;
+
+	/*
+	 * initialize aggregate block allocation map
+	 */
+	if ((rc = dbMount(ipbmap))) {
+		jfs_err("jfs_mount: dbMount failed w/rc = %d", rc);
+		goto errout22;
+	}
+
+	/*
+	 * open the secondary aggregate inode allocation map
+	 *
+	 * This is a duplicate of the aggregate inode allocation map.
+	 *
+	 * hand craft a vfs in the same fashion as we did to read ipaimap.
+	 * By adding INOSPEREXT (32) to the inode number, we are telling
+	 * diReadSpecial that we are reading from the secondary aggregate
+	 * inode table.  This also creates a unique entry in the inode hash
+	 * table.
+	 */
+	if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
+		ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
+		if (!ipaimap2) {
+			jfs_err("jfs_mount: Failed to read AGGREGATE_I");
+			rc = -EIO;
+			goto errout35;
+		}
+		sbi->ipaimap2 = ipaimap2;
+
+		jfs_info("jfs_mount: ipaimap2:0x%p", ipaimap2);
+
+		/*
+		 * initialize secondary aggregate inode allocation map
+		 */
+		if ((rc = diMount(ipaimap2))) {
+			jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d",
+				rc);
+			goto errout35;
+		}
+	} else
+		/* Secondary aggregate inode table is not valid */
+		sbi->ipaimap2 = NULL;
+
+	/*
+	 *	mount (the only/single) fileset
+	 */
+	/*
+	 * open fileset inode allocation map (aka fileset inode)
+	 */
+	ipimap = diReadSpecial(sb, FILESYSTEM_I, 0);
+	if (ipimap == NULL) {
+		jfs_err("jfs_mount: Failed to read FILESYSTEM_I");
+		/* open fileset secondary inode allocation map */
+		rc = -EIO;
+		goto errout40;
+	}
+	jfs_info("jfs_mount: ipimap:0x%p", ipimap);
+
+	/* map further access of per fileset inodes by the fileset inode */
+	sbi->ipimap = ipimap;
+
+	/* initialize fileset inode allocation map */
+	if ((rc = diMount(ipimap))) {
+		jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
+		goto errout41;
+	}
+
+	goto out;
+
+	/*
+	 *	unwind on error
+	 */
+      errout41:		/* close fileset inode allocation map inode */
+	diFreeSpecial(ipimap);
+
+      errout40:		/* fileset closed */
+
+	/* close secondary aggregate inode allocation map */
+	if (ipaimap2) {
+		diUnmount(ipaimap2, 1);
+		diFreeSpecial(ipaimap2);
+	}
+
+      errout35:
+
+	/* close aggregate block allocation map */
+	dbUnmount(ipbmap, 1);
+	diFreeSpecial(ipbmap);
+
+      errout22:		/* close aggregate inode allocation map */
+
+	diUnmount(ipaimap, 1);
+
+      errout21:		/* close aggregate inodes */
+	diFreeSpecial(ipaimap);
+      errout20:		/* aggregate closed */
+
+      out:
+
+	if (rc)
+		jfs_err("Mount JFS Failure: %d", rc);
+
+	return rc;
+}
+
+/*
+ * NAME:	jfs_mount_rw(sb, remount)
+ *
+ * FUNCTION:	Completes read-write mount, or remounts read-only volume
+ *		as read-write
+ */
+int jfs_mount_rw(struct super_block *sb, int remount)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	int rc;
+
+	/*
+	 * If we are re-mounting a previously read-only volume, we want to
+	 * re-read the inode and block maps, since fsck.jfs may have updated
+	 * them.
+	 */
+	if (remount) {
+		if (chkSuper(sb) || (sbi->state != FM_CLEAN))
+			return -EINVAL;
+
+		truncate_inode_pages(sbi->ipimap->i_mapping, 0);
+		truncate_inode_pages(sbi->ipbmap->i_mapping, 0);
+		diUnmount(sbi->ipimap, 1);
+		if ((rc = diMount(sbi->ipimap))) {
+			jfs_err("jfs_mount_rw: diMount failed!");
+			return rc;
+		}
+
+		dbUnmount(sbi->ipbmap, 1);
+		if ((rc = dbMount(sbi->ipbmap))) {
+			jfs_err("jfs_mount_rw: dbMount failed!");
+			return rc;
+		}
+	}
+
+	/*
+	 * open/initialize log
+	 */
+	if ((rc = lmLogOpen(sb)))
+		return rc;
+
+	/*
+	 * update file system superblock;
+	 */
+	if ((rc = updateSuper(sb, FM_MOUNT))) {
+		jfs_err("jfs_mount: updateSuper failed w/rc = %d", rc);
+		lmLogClose(sb);
+		return rc;
+	}
+
+	/*
+	 * write MOUNT log record of the file system
+	 */
+	logMOUNT(sb);
+
+	return rc;
+}
+
+/*
+ *	chkSuper()
+ *
+ * validate the superblock of the file system to be mounted and
+ * get the file system parameters.
+ *
+ * returns
+ *	0 with fragsize set if check successful
+ *	error code if not successful
+ */
+static int chkSuper(struct super_block *sb)
+{
+	int rc = 0;
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_superblock *j_sb;
+	struct buffer_head *bh;
+	int AIM_bytesize, AIT_bytesize;
+	int expected_AIM_bytesize, expected_AIT_bytesize;
+	s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr;
+	s64 byte_addr_diff0, byte_addr_diff1;
+	s32 bsize;
+
+	if ((rc = readSuper(sb, &bh)))
+		return rc;
+	j_sb = (struct jfs_superblock *)bh->b_data;
+
+	/*
+	 * validate superblock
+	 */
+	/* validate fs signature */
+	if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) ||
+	    le32_to_cpu(j_sb->s_version) > JFS_VERSION) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	bsize = le32_to_cpu(j_sb->s_bsize);
+#ifdef _JFS_4K
+	if (bsize != PSIZE) {
+		jfs_err("Currently only 4K block size supported!");
+		rc = -EINVAL;
+		goto out;
+	}
+#endif				/* _JFS_4K */
+
+	jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx",
+		 le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state),
+		 (unsigned long long) le64_to_cpu(j_sb->s_size));
+
+	/* validate the descriptors for Secondary AIM and AIT */
+	if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) !=
+	    cpu_to_le32(JFS_BAD_SAIT)) {
+		expected_AIM_bytesize = 2 * PSIZE;
+		AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize;
+		expected_AIT_bytesize = 4 * PSIZE;
+		AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize;
+		AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize;
+		AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize;
+		byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr;
+		fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize;
+		byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr;
+		if ((AIM_bytesize != expected_AIM_bytesize) ||
+		    (AIT_bytesize != expected_AIT_bytesize) ||
+		    (byte_addr_diff0 != AIM_bytesize) ||
+		    (byte_addr_diff1 <= AIT_bytesize))
+			j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT);
+	}
+
+	if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) !=
+	    cpu_to_le32(JFS_GROUPCOMMIT))
+		j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT);
+
+	/* validate fs state */
+	if (j_sb->s_state != cpu_to_le32(FM_CLEAN) &&
+	    !(sb->s_flags & MS_RDONLY)) {
+		jfs_err("jfs_mount: Mount Failure: File System Dirty.");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	sbi->state = le32_to_cpu(j_sb->s_state);
+	sbi->mntflag = le32_to_cpu(j_sb->s_flag);
+
+	/*
+	 * JFS always does I/O by 4K pages.  Don't tell the buffer cache
+	 * that we use anything else (leave s_blocksize alone).
+	 */
+	sbi->bsize = bsize;
+	sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize);
+
+	/*
+	 * For now, ignore s_pbsize, l2bfactor.  All I/O going through buffer
+	 * cache.
+	 */
+	sbi->nbperpage = PSIZE >> sbi->l2bsize;
+	sbi->l2nbperpage = L2PSIZE - sbi->l2bsize;
+	sbi->l2niperblk = sbi->l2bsize - L2DISIZE;
+	if (sbi->mntflag & JFS_INLINELOG)
+		sbi->logpxd = j_sb->s_logpxd;
+	else {
+		sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev));
+		memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid));
+		memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid));
+	}
+	sbi->fsckpxd = j_sb->s_fsckpxd;
+	sbi->ait2 = j_sb->s_ait2;
+
+      out:
+	brelse(bh);
+	return rc;
+}
+
+
+/*
+ *	updateSuper()
+ *
+ * update synchronously superblock if it is mounted read-write.
+ */
+int updateSuper(struct super_block *sb, uint state)
+{
+	struct jfs_superblock *j_sb;
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct buffer_head *bh;
+	int rc;
+
+	if (sbi->flag & JFS_NOINTEGRITY) {
+		if (state == FM_DIRTY) {
+			sbi->p_state = state;
+			return 0;
+		} else if (state == FM_MOUNT) {
+			sbi->p_state = sbi->state;
+			state = FM_DIRTY;
+		} else if (state == FM_CLEAN) {
+			state = sbi->p_state;
+		} else
+			jfs_err("updateSuper: bad state");
+	} else if (sbi->state == FM_DIRTY)
+		return 0;
+
+	if ((rc = readSuper(sb, &bh)))
+		return rc;
+
+	j_sb = (struct jfs_superblock *)bh->b_data;
+
+	j_sb->s_state = cpu_to_le32(state);
+	sbi->state = state;
+
+	if (state == FM_MOUNT) {
+		/* record log's dev_t and mount serial number */
+		j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev));
+		j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
+	} else if (state == FM_CLEAN) {
+		/*
+		 * If this volume is shared with OS/2, OS/2 will need to
+		 * recalculate DASD usage, since we don't deal with it.
+		 */
+		if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED))
+			j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME);
+	}
+
+	mark_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+
+	return 0;
+}
+
+
+/*
+ *	readSuper()
+ *
+ * read superblock by raw sector address
+ */
+int readSuper(struct super_block *sb, struct buffer_head **bpp)
+{
+	/* read in primary superblock */
+	*bpp = sb_bread(sb, SUPER1_OFF >> sb->s_blocksize_bits);
+	if (*bpp)
+		return 0;
+
+	/* read in secondary/replicated superblock */
+	*bpp = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits);
+	if (*bpp)
+		return 0;
+
+	return -EIO;
+}
+
+
+/*
+ *	logMOUNT()
+ *
+ * function: write a MOUNT log record for file system.
+ *
+ * MOUNT record keeps logredo() from processing log records
+ * for this file system past this point in log.
+ * it is harmless if mount fails.
+ *
+ * note: MOUNT record is at aggregate level, not at fileset level,
+ * since log records of previous mounts of a fileset
+ * (e.g., AFTER record of extent allocation) have to be processed
+ * to update block allocation map at aggregate level.
+ */
+static int logMOUNT(struct super_block *sb)
+{
+	struct jfs_log *log = JFS_SBI(sb)->log;
+	struct lrd lrd;
+
+	lrd.logtid = 0;
+	lrd.backchain = 0;
+	lrd.type = cpu_to_le16(LOG_MOUNT);
+	lrd.length = 0;
+	lrd.aggregate = cpu_to_le32(new_encode_dev(sb->s_bdev->bd_dev));
+	lmLog(log, NULL, &lrd, NULL);
+
+	return 0;
+}
diff --git a/kernel/fs/jfs/jfs_superblock.h b/kernel/fs/jfs/jfs_superblock.h
new file mode 100644
index 000000000..04847b8d3
--- /dev/null
+++ b/kernel/fs/jfs/jfs_superblock.h
@@ -0,0 +1,122 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2003
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef	_H_JFS_SUPERBLOCK
+#define _H_JFS_SUPERBLOCK
+
+/*
+ * make the magic number something a human could read
+ */
+#define JFS_MAGIC	"JFS1"	/* Magic word */
+
+#define JFS_VERSION	2	/* Version number: Version 2 */
+
+#define LV_NAME_SIZE	11	/* MUST BE 11 for OS/2 boot sector */
+
+/*
+ *	aggregate superblock
+ *
+ * The name superblock is too close to super_block, so the name has been
+ * changed to jfs_superblock.  The utilities are still using the old name.
+ */
+struct jfs_superblock {
+	char s_magic[4];	/* 4: magic number */
+	__le32 s_version;	/* 4: version number */
+
+	__le64 s_size;		/* 8: aggregate size in hardware/LVM blocks;
+				 * VFS: number of blocks
+				 */
+	__le32 s_bsize;		/* 4: aggregate block size in bytes;
+				 * VFS: fragment size
+				 */
+	__le16 s_l2bsize;	/* 2: log2 of s_bsize */
+	__le16 s_l2bfactor;	/* 2: log2(s_bsize/hardware block size) */
+	__le32 s_pbsize;	/* 4: hardware/LVM block size in bytes */
+	__le16 s_l2pbsize;	/* 2: log2 of s_pbsize */
+	__le16 pad;		/* 2: padding necessary for alignment */
+
+	__le32 s_agsize;	/* 4: allocation group size in aggr. blocks */
+
+	__le32 s_flag;		/* 4: aggregate attributes:
+				 *    see jfs_filsys.h
+				 */
+	__le32 s_state;		/* 4: mount/unmount/recovery state:
+				 *    see jfs_filsys.h
+				 */
+	__le32 s_compress;		/* 4: > 0 if data compression */
+
+	pxd_t s_ait2;		/* 8: first extent of secondary
+				 *    aggregate inode table
+				 */
+
+	pxd_t s_aim2;		/* 8: first extent of secondary
+				 *    aggregate inode map
+				 */
+	__le32 s_logdev;		/* 4: device address of log */
+	__le32 s_logserial;	/* 4: log serial number at aggregate mount */
+	pxd_t s_logpxd;		/* 8: inline log extent */
+
+	pxd_t s_fsckpxd;	/* 8: inline fsck work space extent */
+
+	struct timestruc_t s_time;	/* 8: time last updated */
+
+	__le32 s_fsckloglen;	/* 4: Number of filesystem blocks reserved for
+				 *    the fsck service log.
+				 *    N.B. These blocks are divided among the
+				 *         versions kept.  This is not a per
+				 *         version size.
+				 *    N.B. These blocks are included in the
+				 *         length field of s_fsckpxd.
+				 */
+	s8 s_fscklog;		/* 1: which fsck service log is most recent
+				 *    0 => no service log data yet
+				 *    1 => the first one
+				 *    2 => the 2nd one
+				 */
+	char s_fpack[11];	/* 11: file system volume name
+				 *     N.B. This must be 11 bytes to
+				 *          conform with the OS/2 BootSector
+				 *          requirements
+				 *          Only used when s_version is 1
+				 */
+
+	/* extendfs() parameter under s_state & FM_EXTENDFS */
+	__le64 s_xsize;		/* 8: extendfs s_size */
+	pxd_t s_xfsckpxd;	/* 8: extendfs fsckpxd */
+	pxd_t s_xlogpxd;	/* 8: extendfs logpxd */
+	/* - 128 byte boundary - */
+
+	char s_uuid[16];	/* 16: 128-bit uuid for volume */
+	char s_label[16];	/* 16: volume label */
+	char s_loguuid[16];	/* 16: 128-bit uuid for log device */
+
+};
+
+extern int readSuper(struct super_block *, struct buffer_head **);
+extern int updateSuper(struct super_block *, uint);
+__printf(2, 3)
+extern void jfs_error(struct super_block *, const char *, ...);
+extern int jfs_mount(struct super_block *);
+extern int jfs_mount_rw(struct super_block *, int);
+extern int jfs_umount(struct super_block *);
+extern int jfs_umount_rw(struct super_block *);
+extern int jfs_extendfs(struct super_block *, s64, int);
+
+extern struct task_struct *jfsIOthread;
+extern struct task_struct *jfsSyncThread;
+
+#endif /*_H_JFS_SUPERBLOCK */
diff --git a/kernel/fs/jfs/jfs_txnmgr.c b/kernel/fs/jfs/jfs_txnmgr.c
new file mode 100644
index 000000000..d59585645
--- /dev/null
+++ b/kernel/fs/jfs/jfs_txnmgr.c
@@ -0,0 +1,3093 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2005
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ *	jfs_txnmgr.c: transaction manager
+ *
+ * notes:
+ * transaction starts with txBegin() and ends with txCommit()
+ * or txAbort().
+ *
+ * tlock is acquired at the time of update;
+ * (obviate scan at commit time for xtree and dtree)
+ * tlock and mp points to each other;
+ * (no hashlist for mp -> tlock).
+ *
+ * special cases:
+ * tlock on in-memory inode:
+ * in-place tlock in the in-memory inode itself;
+ * converted to page lock by iWrite() at commit time.
+ *
+ * tlock during write()/mmap() under anonymous transaction (tid = 0):
+ * transferred (?) to transaction at commit time.
+ *
+ * use the page itself to update allocation maps
+ * (obviate intermediate replication of allocation/deallocation data)
+ * hold on to mp+lock thru update of maps
+ */
+
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/completion.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kthread.h>
+#include <linux/seq_file.h>
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dinode.h"
+#include "jfs_imap.h"
+#include "jfs_dmap.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ *	transaction management structures
+ */
+static struct {
+	int freetid;		/* index of a free tid structure */
+	int freelock;		/* index first free lock word */
+	wait_queue_head_t freewait;	/* eventlist of free tblock */
+	wait_queue_head_t freelockwait;	/* eventlist of free tlock */
+	wait_queue_head_t lowlockwait;	/* eventlist of ample tlocks */
+	int tlocksInUse;	/* Number of tlocks in use */
+	spinlock_t LazyLock;	/* synchronize sync_queue & unlock_queue */
+/*	struct tblock *sync_queue; * Transactions waiting for data sync */
+	struct list_head unlock_queue;	/* Txns waiting to be released */
+	struct list_head anon_list;	/* inodes having anonymous txns */
+	struct list_head anon_list2;	/* inodes having anonymous txns
+					   that couldn't be sync'ed */
+} TxAnchor;
+
+int jfs_tlocks_low;		/* Indicates low number of available tlocks */
+
+#ifdef CONFIG_JFS_STATISTICS
+static struct {
+	uint txBegin;
+	uint txBegin_barrier;
+	uint txBegin_lockslow;
+	uint txBegin_freetid;
+	uint txBeginAnon;
+	uint txBeginAnon_barrier;
+	uint txBeginAnon_lockslow;
+	uint txLockAlloc;
+	uint txLockAlloc_freelock;
+} TxStat;
+#endif
+
+static int nTxBlock = -1;	/* number of transaction blocks */
+module_param(nTxBlock, int, 0);
+MODULE_PARM_DESC(nTxBlock,
+		 "Number of transaction blocks (max:65536)");
+
+static int nTxLock = -1;	/* number of transaction locks */
+module_param(nTxLock, int, 0);
+MODULE_PARM_DESC(nTxLock,
+		 "Number of transaction locks (max:65536)");
+
+struct tblock *TxBlock;	/* transaction block table */
+static int TxLockLWM;	/* Low water mark for number of txLocks used */
+static int TxLockHWM;	/* High water mark for number of txLocks used */
+static int TxLockVHWM;	/* Very High water mark */
+struct tlock *TxLock;	/* transaction lock table */
+
+/*
+ *	transaction management lock
+ */
+static DEFINE_SPINLOCK(jfsTxnLock);
+
+#define TXN_LOCK()		spin_lock(&jfsTxnLock)
+#define TXN_UNLOCK()		spin_unlock(&jfsTxnLock)
+
+#define LAZY_LOCK_INIT()	spin_lock_init(&TxAnchor.LazyLock);
+#define LAZY_LOCK(flags)	spin_lock_irqsave(&TxAnchor.LazyLock, flags)
+#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
+
+static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
+static int jfs_commit_thread_waking;
+
+/*
+ * Retry logic exist outside these macros to protect from spurrious wakeups.
+ */
+static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(event, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	TXN_UNLOCK();
+	io_schedule();
+	remove_wait_queue(event, &wait);
+}
+
+#define TXN_SLEEP(event)\
+{\
+	TXN_SLEEP_DROP_LOCK(event);\
+	TXN_LOCK();\
+}
+
+#define TXN_WAKEUP(event) wake_up_all(event)
+
+/*
+ *	statistics
+ */
+static struct {
+	tid_t maxtid;		/* 4: biggest tid ever used */
+	lid_t maxlid;		/* 4: biggest lid ever used */
+	int ntid;		/* 4: # of transactions performed */
+	int nlid;		/* 4: # of tlocks acquired */
+	int waitlock;		/* 4: # of tlock wait */
+} stattx;
+
+/*
+ * forward references
+ */
+static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+		struct tlock * tlck, struct commit * cd);
+static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+		struct tlock * tlck);
+static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+		struct tlock * tlck);
+static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+		struct tlock * tlck);
+static void txAllocPMap(struct inode *ip, struct maplock * maplock,
+		struct tblock * tblk);
+static void txForce(struct tblock * tblk);
+static int txLog(struct jfs_log * log, struct tblock * tblk,
+		struct commit * cd);
+static void txUpdateMap(struct tblock * tblk);
+static void txRelease(struct tblock * tblk);
+static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+	   struct tlock * tlck);
+static void LogSyncRelease(struct metapage * mp);
+
+/*
+ *		transaction block/lock management
+ *		---------------------------------
+ */
+
+/*
+ * Get a transaction lock from the free list.  If the number in use is
+ * greater than the high water mark, wake up the sync daemon.  This should
+ * free some anonymous transaction locks.  (TXN_LOCK must be held.)
+ */
+static lid_t txLockAlloc(void)
+{
+	lid_t lid;
+
+	INCREMENT(TxStat.txLockAlloc);
+	if (!TxAnchor.freelock) {
+		INCREMENT(TxStat.txLockAlloc_freelock);
+	}
+
+	while (!(lid = TxAnchor.freelock))
+		TXN_SLEEP(&TxAnchor.freelockwait);
+	TxAnchor.freelock = TxLock[lid].next;
+	HIGHWATERMARK(stattx.maxlid, lid);
+	if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) {
+		jfs_info("txLockAlloc tlocks low");
+		jfs_tlocks_low = 1;
+		wake_up_process(jfsSyncThread);
+	}
+
+	return lid;
+}
+
+static void txLockFree(lid_t lid)
+{
+	TxLock[lid].tid = 0;
+	TxLock[lid].next = TxAnchor.freelock;
+	TxAnchor.freelock = lid;
+	TxAnchor.tlocksInUse--;
+	if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) {
+		jfs_info("txLockFree jfs_tlocks_low no more");
+		jfs_tlocks_low = 0;
+		TXN_WAKEUP(&TxAnchor.lowlockwait);
+	}
+	TXN_WAKEUP(&TxAnchor.freelockwait);
+}
+
+/*
+ * NAME:	txInit()
+ *
+ * FUNCTION:	initialize transaction management structures
+ *
+ * RETURN:
+ *
+ * serialization: single thread at jfs_init()
+ */
+int txInit(void)
+{
+	int k, size;
+	struct sysinfo si;
+
+	/* Set defaults for nTxLock and nTxBlock if unset */
+
+	if (nTxLock == -1) {
+		if (nTxBlock == -1) {
+			/* Base default on memory size */
+			si_meminfo(&si);
+			if (si.totalram > (256 * 1024)) /* 1 GB */
+				nTxLock = 64 * 1024;
+			else
+				nTxLock = si.totalram >> 2;
+		} else if (nTxBlock > (8 * 1024))
+			nTxLock = 64 * 1024;
+		else
+			nTxLock = nTxBlock << 3;
+	}
+	if (nTxBlock == -1)
+		nTxBlock = nTxLock >> 3;
+
+	/* Verify tunable parameters */
+	if (nTxBlock < 16)
+		nTxBlock = 16;	/* No one should set it this low */
+	if (nTxBlock > 65536)
+		nTxBlock = 65536;
+	if (nTxLock < 256)
+		nTxLock = 256;	/* No one should set it this low */
+	if (nTxLock > 65536)
+		nTxLock = 65536;
+
+	printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n",
+	       nTxBlock, nTxLock);
+	/*
+	 * initialize transaction block (tblock) table
+	 *
+	 * transaction id (tid) = tblock index
+	 * tid = 0 is reserved.
+	 */
+	TxLockLWM = (nTxLock * 4) / 10;
+	TxLockHWM = (nTxLock * 7) / 10;
+	TxLockVHWM = (nTxLock * 8) / 10;
+
+	size = sizeof(struct tblock) * nTxBlock;
+	TxBlock = vmalloc(size);
+	if (TxBlock == NULL)
+		return -ENOMEM;
+
+	for (k = 1; k < nTxBlock - 1; k++) {
+		TxBlock[k].next = k + 1;
+		init_waitqueue_head(&TxBlock[k].gcwait);
+		init_waitqueue_head(&TxBlock[k].waitor);
+	}
+	TxBlock[k].next = 0;
+	init_waitqueue_head(&TxBlock[k].gcwait);
+	init_waitqueue_head(&TxBlock[k].waitor);
+
+	TxAnchor.freetid = 1;
+	init_waitqueue_head(&TxAnchor.freewait);
+
+	stattx.maxtid = 1;	/* statistics */
+
+	/*
+	 * initialize transaction lock (tlock) table
+	 *
+	 * transaction lock id = tlock index
+	 * tlock id = 0 is reserved.
+	 */
+	size = sizeof(struct tlock) * nTxLock;
+	TxLock = vmalloc(size);
+	if (TxLock == NULL) {
+		vfree(TxBlock);
+		return -ENOMEM;
+	}
+
+	/* initialize tlock table */
+	for (k = 1; k < nTxLock - 1; k++)
+		TxLock[k].next = k + 1;
+	TxLock[k].next = 0;
+	init_waitqueue_head(&TxAnchor.freelockwait);
+	init_waitqueue_head(&TxAnchor.lowlockwait);
+
+	TxAnchor.freelock = 1;
+	TxAnchor.tlocksInUse = 0;
+	INIT_LIST_HEAD(&TxAnchor.anon_list);
+	INIT_LIST_HEAD(&TxAnchor.anon_list2);
+
+	LAZY_LOCK_INIT();
+	INIT_LIST_HEAD(&TxAnchor.unlock_queue);
+
+	stattx.maxlid = 1;	/* statistics */
+
+	return 0;
+}
+
+/*
+ * NAME:	txExit()
+ *
+ * FUNCTION:	clean up when module is unloaded
+ */
+void txExit(void)
+{
+	vfree(TxLock);
+	TxLock = NULL;
+	vfree(TxBlock);
+	TxBlock = NULL;
+}
+
+/*
+ * NAME:	txBegin()
+ *
+ * FUNCTION:	start a transaction.
+ *
+ * PARAMETER:	sb	- superblock
+ *		flag	- force for nested tx;
+ *
+ * RETURN:	tid	- transaction id
+ *
+ * note: flag force allows to start tx for nested tx
+ * to prevent deadlock on logsync barrier;
+ */
+tid_t txBegin(struct super_block *sb, int flag)
+{
+	tid_t t;
+	struct tblock *tblk;
+	struct jfs_log *log;
+
+	jfs_info("txBegin: flag = 0x%x", flag);
+	log = JFS_SBI(sb)->log;
+
+	TXN_LOCK();
+
+	INCREMENT(TxStat.txBegin);
+
+      retry:
+	if (!(flag & COMMIT_FORCE)) {
+		/*
+		 * synchronize with logsync barrier
+		 */
+		if (test_bit(log_SYNCBARRIER, &log->flag) ||
+		    test_bit(log_QUIESCE, &log->flag)) {
+			INCREMENT(TxStat.txBegin_barrier);
+			TXN_SLEEP(&log->syncwait);
+			goto retry;
+		}
+	}
+	if (flag == 0) {
+		/*
+		 * Don't begin transaction if we're getting starved for tlocks
+		 * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately
+		 * free tlocks)
+		 */
+		if (TxAnchor.tlocksInUse > TxLockVHWM) {
+			INCREMENT(TxStat.txBegin_lockslow);
+			TXN_SLEEP(&TxAnchor.lowlockwait);
+			goto retry;
+		}
+	}
+
+	/*
+	 * allocate transaction id/block
+	 */
+	if ((t = TxAnchor.freetid) == 0) {
+		jfs_info("txBegin: waiting for free tid");
+		INCREMENT(TxStat.txBegin_freetid);
+		TXN_SLEEP(&TxAnchor.freewait);
+		goto retry;
+	}
+
+	tblk = tid_to_tblock(t);
+
+	if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) {
+		/* Don't let a non-forced transaction take the last tblk */
+		jfs_info("txBegin: waiting for free tid");
+		INCREMENT(TxStat.txBegin_freetid);
+		TXN_SLEEP(&TxAnchor.freewait);
+		goto retry;
+	}
+
+	TxAnchor.freetid = tblk->next;
+
+	/*
+	 * initialize transaction
+	 */
+
+	/*
+	 * We can't zero the whole thing or we screw up another thread being
+	 * awakened after sleeping on tblk->waitor
+	 *
+	 * memset(tblk, 0, sizeof(struct tblock));
+	 */
+	tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0;
+
+	tblk->sb = sb;
+	++log->logtid;
+	tblk->logtid = log->logtid;
+
+	++log->active;
+
+	HIGHWATERMARK(stattx.maxtid, t);	/* statistics */
+	INCREMENT(stattx.ntid);	/* statistics */
+
+	TXN_UNLOCK();
+
+	jfs_info("txBegin: returning tid = %d", t);
+
+	return t;
+}
+
+/*
+ * NAME:	txBeginAnon()
+ *
+ * FUNCTION:	start an anonymous transaction.
+ *		Blocks if logsync or available tlocks are low to prevent
+ *		anonymous tlocks from depleting supply.
+ *
+ * PARAMETER:	sb	- superblock
+ *
+ * RETURN:	none
+ */
+void txBeginAnon(struct super_block *sb)
+{
+	struct jfs_log *log;
+
+	log = JFS_SBI(sb)->log;
+
+	TXN_LOCK();
+	INCREMENT(TxStat.txBeginAnon);
+
+      retry:
+	/*
+	 * synchronize with logsync barrier
+	 */
+	if (test_bit(log_SYNCBARRIER, &log->flag) ||
+	    test_bit(log_QUIESCE, &log->flag)) {
+		INCREMENT(TxStat.txBeginAnon_barrier);
+		TXN_SLEEP(&log->syncwait);
+		goto retry;
+	}
+
+	/*
+	 * Don't begin transaction if we're getting starved for tlocks
+	 */
+	if (TxAnchor.tlocksInUse > TxLockVHWM) {
+		INCREMENT(TxStat.txBeginAnon_lockslow);
+		TXN_SLEEP(&TxAnchor.lowlockwait);
+		goto retry;
+	}
+	TXN_UNLOCK();
+}
+
+/*
+ *	txEnd()
+ *
+ * function: free specified transaction block.
+ *
+ *	logsync barrier processing:
+ *
+ * serialization:
+ */
+void txEnd(tid_t tid)
+{
+	struct tblock *tblk = tid_to_tblock(tid);
+	struct jfs_log *log;
+
+	jfs_info("txEnd: tid = %d", tid);
+	TXN_LOCK();
+
+	/*
+	 * wakeup transactions waiting on the page locked
+	 * by the current transaction
+	 */
+	TXN_WAKEUP(&tblk->waitor);
+
+	log = JFS_SBI(tblk->sb)->log;
+
+	/*
+	 * Lazy commit thread can't free this guy until we mark it UNLOCKED,
+	 * otherwise, we would be left with a transaction that may have been
+	 * reused.
+	 *
+	 * Lazy commit thread will turn off tblkGC_LAZY before calling this
+	 * routine.
+	 */
+	if (tblk->flag & tblkGC_LAZY) {
+		jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk);
+		TXN_UNLOCK();
+
+		spin_lock_irq(&log->gclock);	// LOGGC_LOCK
+		tblk->flag |= tblkGC_UNLOCKED;
+		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
+		return;
+	}
+
+	jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk);
+
+	assert(tblk->next == 0);
+
+	/*
+	 * insert tblock back on freelist
+	 */
+	tblk->next = TxAnchor.freetid;
+	TxAnchor.freetid = tid;
+
+	/*
+	 * mark the tblock not active
+	 */
+	if (--log->active == 0) {
+		clear_bit(log_FLUSH, &log->flag);
+
+		/*
+		 * synchronize with logsync barrier
+		 */
+		if (test_bit(log_SYNCBARRIER, &log->flag)) {
+			TXN_UNLOCK();
+
+			/* write dirty metadata & forward log syncpt */
+			jfs_syncpt(log, 1);
+
+			jfs_info("log barrier off: 0x%x", log->lsn);
+
+			/* enable new transactions start */
+			clear_bit(log_SYNCBARRIER, &log->flag);
+
+			/* wakeup all waitors for logsync barrier */
+			TXN_WAKEUP(&log->syncwait);
+
+			goto wakeup;
+		}
+	}
+
+	TXN_UNLOCK();
+wakeup:
+	/*
+	 * wakeup all waitors for a free tblock
+	 */
+	TXN_WAKEUP(&TxAnchor.freewait);
+}
+
+/*
+ *	txLock()
+ *
+ * function: acquire a transaction lock on the specified <mp>
+ *
+ * parameter:
+ *
+ * return:	transaction lock id
+ *
+ * serialization:
+ */
+struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
+		     int type)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	int dir_xtree = 0;
+	lid_t lid;
+	tid_t xtid;
+	struct tlock *tlck;
+	struct xtlock *xtlck;
+	struct linelock *linelock;
+	xtpage_t *p;
+	struct tblock *tblk;
+
+	TXN_LOCK();
+
+	if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) &&
+	    !(mp->xflag & COMMIT_PAGE)) {
+		/*
+		 * Directory inode is special.  It can have both an xtree tlock
+		 * and a dtree tlock associated with it.
+		 */
+		dir_xtree = 1;
+		lid = jfs_ip->xtlid;
+	} else
+		lid = mp->lid;
+
+	/* is page not locked by a transaction ? */
+	if (lid == 0)
+		goto allocateLock;
+
+	jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid);
+
+	/* is page locked by the requester transaction ? */
+	tlck = lid_to_tlock(lid);
+	if ((xtid = tlck->tid) == tid) {
+		TXN_UNLOCK();
+		goto grantLock;
+	}
+
+	/*
+	 * is page locked by anonymous transaction/lock ?
+	 *
+	 * (page update without transaction (i.e., file write) is
+	 * locked under anonymous transaction tid = 0:
+	 * anonymous tlocks maintained on anonymous tlock list of
+	 * the inode of the page and available to all anonymous
+	 * transactions until txCommit() time at which point
+	 * they are transferred to the transaction tlock list of
+	 * the committing transaction of the inode)
+	 */
+	if (xtid == 0) {
+		tlck->tid = tid;
+		TXN_UNLOCK();
+		tblk = tid_to_tblock(tid);
+		/*
+		 * The order of the tlocks in the transaction is important
+		 * (during truncate, child xtree pages must be freed before
+		 * parent's tlocks change the working map).
+		 * Take tlock off anonymous list and add to tail of
+		 * transaction list
+		 *
+		 * Note:  We really need to get rid of the tid & lid and
+		 * use list_head's.  This code is getting UGLY!
+		 */
+		if (jfs_ip->atlhead == lid) {
+			if (jfs_ip->atltail == lid) {
+				/* only anonymous txn.
+				 * Remove from anon_list
+				 */
+				TXN_LOCK();
+				list_del_init(&jfs_ip->anon_inode_list);
+				TXN_UNLOCK();
+			}
+			jfs_ip->atlhead = tlck->next;
+		} else {
+			lid_t last;
+			for (last = jfs_ip->atlhead;
+			     lid_to_tlock(last)->next != lid;
+			     last = lid_to_tlock(last)->next) {
+				assert(last);
+			}
+			lid_to_tlock(last)->next = tlck->next;
+			if (jfs_ip->atltail == lid)
+				jfs_ip->atltail = last;
+		}
+
+		/* insert the tlock at tail of transaction tlock list */
+
+		if (tblk->next)
+			lid_to_tlock(tblk->last)->next = lid;
+		else
+			tblk->next = lid;
+		tlck->next = 0;
+		tblk->last = lid;
+
+		goto grantLock;
+	}
+
+	goto waitLock;
+
+	/*
+	 * allocate a tlock
+	 */
+      allocateLock:
+	lid = txLockAlloc();
+	tlck = lid_to_tlock(lid);
+
+	/*
+	 * initialize tlock
+	 */
+	tlck->tid = tid;
+
+	TXN_UNLOCK();
+
+	/* mark tlock for meta-data page */
+	if (mp->xflag & COMMIT_PAGE) {
+
+		tlck->flag = tlckPAGELOCK;
+
+		/* mark the page dirty and nohomeok */
+		metapage_nohomeok(mp);
+
+		jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
+			 mp, mp->nohomeok, tid, tlck);
+
+		/* if anonymous transaction, and buffer is on the group
+		 * commit synclist, mark inode to show this.  This will
+		 * prevent the buffer from being marked nohomeok for too
+		 * long a time.
+		 */
+		if ((tid == 0) && mp->lsn)
+			set_cflag(COMMIT_Synclist, ip);
+	}
+	/* mark tlock for in-memory inode */
+	else
+		tlck->flag = tlckINODELOCK;
+
+	if (S_ISDIR(ip->i_mode))
+		tlck->flag |= tlckDIRECTORY;
+
+	tlck->type = 0;
+
+	/* bind the tlock and the page */
+	tlck->ip = ip;
+	tlck->mp = mp;
+	if (dir_xtree)
+		jfs_ip->xtlid = lid;
+	else
+		mp->lid = lid;
+
+	/*
+	 * enqueue transaction lock to transaction/inode
+	 */
+	/* insert the tlock at tail of transaction tlock list */
+	if (tid) {
+		tblk = tid_to_tblock(tid);
+		if (tblk->next)
+			lid_to_tlock(tblk->last)->next = lid;
+		else
+			tblk->next = lid;
+		tlck->next = 0;
+		tblk->last = lid;
+	}
+	/* anonymous transaction:
+	 * insert the tlock at head of inode anonymous tlock list
+	 */
+	else {
+		tlck->next = jfs_ip->atlhead;
+		jfs_ip->atlhead = lid;
+		if (tlck->next == 0) {
+			/* This inode's first anonymous transaction */
+			jfs_ip->atltail = lid;
+			TXN_LOCK();
+			list_add_tail(&jfs_ip->anon_inode_list,
+				      &TxAnchor.anon_list);
+			TXN_UNLOCK();
+		}
+	}
+
+	/* initialize type dependent area for linelock */
+	linelock = (struct linelock *) & tlck->lock;
+	linelock->next = 0;
+	linelock->flag = tlckLINELOCK;
+	linelock->maxcnt = TLOCKSHORT;
+	linelock->index = 0;
+
+	switch (type & tlckTYPE) {
+	case tlckDTREE:
+		linelock->l2linesize = L2DTSLOTSIZE;
+		break;
+
+	case tlckXTREE:
+		linelock->l2linesize = L2XTSLOTSIZE;
+
+		xtlck = (struct xtlock *) linelock;
+		xtlck->header.offset = 0;
+		xtlck->header.length = 2;
+
+		if (type & tlckNEW) {
+			xtlck->lwm.offset = XTENTRYSTART;
+		} else {
+			if (mp->xflag & COMMIT_PAGE)
+				p = (xtpage_t *) mp->data;
+			else
+				p = &jfs_ip->i_xtroot;
+			xtlck->lwm.offset =
+			    le16_to_cpu(p->header.nextindex);
+		}
+		xtlck->lwm.length = 0;	/* ! */
+		xtlck->twm.offset = 0;
+		xtlck->hwm.offset = 0;
+
+		xtlck->index = 2;
+		break;
+
+	case tlckINODE:
+		linelock->l2linesize = L2INODESLOTSIZE;
+		break;
+
+	case tlckDATA:
+		linelock->l2linesize = L2DATASLOTSIZE;
+		break;
+
+	default:
+		jfs_err("UFO tlock:0x%p", tlck);
+	}
+
+	/*
+	 * update tlock vector
+	 */
+      grantLock:
+	tlck->type |= type;
+
+	return tlck;
+
+	/*
+	 * page is being locked by another transaction:
+	 */
+      waitLock:
+	/* Only locks on ipimap or ipaimap should reach here */
+	/* assert(jfs_ip->fileset == AGGREGATE_I); */
+	if (jfs_ip->fileset != AGGREGATE_I) {
+		printk(KERN_ERR "txLock: trying to lock locked page!");
+		print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4,
+			       ip, sizeof(*ip), 0);
+		print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4,
+			       mp, sizeof(*mp), 0);
+		print_hex_dump(KERN_ERR, "Locker's tblock: ",
+			       DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid),
+			       sizeof(struct tblock), 0);
+		print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4,
+			       tlck, sizeof(*tlck), 0);
+		BUG();
+	}
+	INCREMENT(stattx.waitlock);	/* statistics */
+	TXN_UNLOCK();
+	release_metapage(mp);
+	TXN_LOCK();
+	xtid = tlck->tid;	/* reacquire after dropping TXN_LOCK */
+
+	jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
+		 tid, xtid, lid);
+
+	/* Recheck everything since dropping TXN_LOCK */
+	if (xtid && (tlck->mp == mp) && (mp->lid == lid))
+		TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor);
+	else
+		TXN_UNLOCK();
+	jfs_info("txLock: awakened     tid = %d, lid = %d", tid, lid);
+
+	return NULL;
+}
+
+/*
+ * NAME:	txRelease()
+ *
+ * FUNCTION:	Release buffers associated with transaction locks, but don't
+ *		mark homeok yet.  The allows other transactions to modify
+ *		buffers, but won't let them go to disk until commit record
+ *		actually gets written.
+ *
+ * PARAMETER:
+ *		tblk	-
+ *
+ * RETURN:	Errors from subroutines.
+ */
+static void txRelease(struct tblock * tblk)
+{
+	struct metapage *mp;
+	lid_t lid;
+	struct tlock *tlck;
+
+	TXN_LOCK();
+
+	for (lid = tblk->next; lid; lid = tlck->next) {
+		tlck = lid_to_tlock(lid);
+		if ((mp = tlck->mp) != NULL &&
+		    (tlck->type & tlckBTROOT) == 0) {
+			assert(mp->xflag & COMMIT_PAGE);
+			mp->lid = 0;
+		}
+	}
+
+	/*
+	 * wakeup transactions waiting on a page locked
+	 * by the current transaction
+	 */
+	TXN_WAKEUP(&tblk->waitor);
+
+	TXN_UNLOCK();
+}
+
+/*
+ * NAME:	txUnlock()
+ *
+ * FUNCTION:	Initiates pageout of pages modified by tid in journalled
+ *		objects and frees their lockwords.
+ */
+static void txUnlock(struct tblock * tblk)
+{
+	struct tlock *tlck;
+	struct linelock *linelock;
+	lid_t lid, next, llid, k;
+	struct metapage *mp;
+	struct jfs_log *log;
+	int difft, diffp;
+	unsigned long flags;
+
+	jfs_info("txUnlock: tblk = 0x%p", tblk);
+	log = JFS_SBI(tblk->sb)->log;
+
+	/*
+	 * mark page under tlock homeok (its log has been written):
+	 */
+	for (lid = tblk->next; lid; lid = next) {
+		tlck = lid_to_tlock(lid);
+		next = tlck->next;
+
+		jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck);
+
+		/* unbind page from tlock */
+		if ((mp = tlck->mp) != NULL &&
+		    (tlck->type & tlckBTROOT) == 0) {
+			assert(mp->xflag & COMMIT_PAGE);
+
+			/* hold buffer
+			 */
+			hold_metapage(mp);
+
+			assert(mp->nohomeok > 0);
+			_metapage_homeok(mp);
+
+			/* inherit younger/larger clsn */
+			LOGSYNC_LOCK(log, flags);
+			if (mp->clsn) {
+				logdiff(difft, tblk->clsn, log);
+				logdiff(diffp, mp->clsn, log);
+				if (difft > diffp)
+					mp->clsn = tblk->clsn;
+			} else
+				mp->clsn = tblk->clsn;
+			LOGSYNC_UNLOCK(log, flags);
+
+			assert(!(tlck->flag & tlckFREEPAGE));
+
+			put_metapage(mp);
+		}
+
+		/* insert tlock, and linelock(s) of the tlock if any,
+		 * at head of freelist
+		 */
+		TXN_LOCK();
+
+		llid = ((struct linelock *) & tlck->lock)->next;
+		while (llid) {
+			linelock = (struct linelock *) lid_to_tlock(llid);
+			k = linelock->next;
+			txLockFree(llid);
+			llid = k;
+		}
+		txLockFree(lid);
+
+		TXN_UNLOCK();
+	}
+	tblk->next = tblk->last = 0;
+
+	/*
+	 * remove tblock from logsynclist
+	 * (allocation map pages inherited lsn of tblk and
+	 * has been inserted in logsync list at txUpdateMap())
+	 */
+	if (tblk->lsn) {
+		LOGSYNC_LOCK(log, flags);
+		log->count--;
+		list_del(&tblk->synclist);
+		LOGSYNC_UNLOCK(log, flags);
+	}
+}
+
+/*
+ *	txMaplock()
+ *
+ * function: allocate a transaction lock for freed page/entry;
+ *	for freed page, maplock is used as xtlock/dtlock type;
+ */
+struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	lid_t lid;
+	struct tblock *tblk;
+	struct tlock *tlck;
+	struct maplock *maplock;
+
+	TXN_LOCK();
+
+	/*
+	 * allocate a tlock
+	 */
+	lid = txLockAlloc();
+	tlck = lid_to_tlock(lid);
+
+	/*
+	 * initialize tlock
+	 */
+	tlck->tid = tid;
+
+	/* bind the tlock and the object */
+	tlck->flag = tlckINODELOCK;
+	if (S_ISDIR(ip->i_mode))
+		tlck->flag |= tlckDIRECTORY;
+	tlck->ip = ip;
+	tlck->mp = NULL;
+
+	tlck->type = type;
+
+	/*
+	 * enqueue transaction lock to transaction/inode
+	 */
+	/* insert the tlock at tail of transaction tlock list */
+	if (tid) {
+		tblk = tid_to_tblock(tid);
+		if (tblk->next)
+			lid_to_tlock(tblk->last)->next = lid;
+		else
+			tblk->next = lid;
+		tlck->next = 0;
+		tblk->last = lid;
+	}
+	/* anonymous transaction:
+	 * insert the tlock at head of inode anonymous tlock list
+	 */
+	else {
+		tlck->next = jfs_ip->atlhead;
+		jfs_ip->atlhead = lid;
+		if (tlck->next == 0) {
+			/* This inode's first anonymous transaction */
+			jfs_ip->atltail = lid;
+			list_add_tail(&jfs_ip->anon_inode_list,
+				      &TxAnchor.anon_list);
+		}
+	}
+
+	TXN_UNLOCK();
+
+	/* initialize type dependent area for maplock */
+	maplock = (struct maplock *) & tlck->lock;
+	maplock->next = 0;
+	maplock->maxcnt = 0;
+	maplock->index = 0;
+
+	return tlck;
+}
+
+/*
+ *	txLinelock()
+ *
+ * function: allocate a transaction lock for log vector list
+ */
+struct linelock *txLinelock(struct linelock * tlock)
+{
+	lid_t lid;
+	struct tlock *tlck;
+	struct linelock *linelock;
+
+	TXN_LOCK();
+
+	/* allocate a TxLock structure */
+	lid = txLockAlloc();
+	tlck = lid_to_tlock(lid);
+
+	TXN_UNLOCK();
+
+	/* initialize linelock */
+	linelock = (struct linelock *) tlck;
+	linelock->next = 0;
+	linelock->flag = tlckLINELOCK;
+	linelock->maxcnt = TLOCKLONG;
+	linelock->index = 0;
+	if (tlck->flag & tlckDIRECTORY)
+		linelock->flag |= tlckDIRECTORY;
+
+	/* append linelock after tlock */
+	linelock->next = tlock->next;
+	tlock->next = lid;
+
+	return linelock;
+}
+
+/*
+ *		transaction commit management
+ *		-----------------------------
+ */
+
+/*
+ * NAME:	txCommit()
+ *
+ * FUNCTION:	commit the changes to the objects specified in
+ *		clist.  For journalled segments only the
+ *		changes of the caller are committed, ie by tid.
+ *		for non-journalled segments the data are flushed to
+ *		disk and then the change to the disk inode and indirect
+ *		blocks committed (so blocks newly allocated to the
+ *		segment will be made a part of the segment atomically).
+ *
+ *		all of the segments specified in clist must be in
+ *		one file system. no more than 6 segments are needed
+ *		to handle all unix svcs.
+ *
+ *		if the i_nlink field (i.e. disk inode link count)
+ *		is zero, and the type of inode is a regular file or
+ *		directory, or symbolic link , the inode is truncated
+ *		to zero length. the truncation is committed but the
+ *		VM resources are unaffected until it is closed (see
+ *		iput and iclose).
+ *
+ * PARAMETER:
+ *
+ * RETURN:
+ *
+ * serialization:
+ *		on entry the inode lock on each segment is assumed
+ *		to be held.
+ *
+ * i/o error:
+ */
+int txCommit(tid_t tid,		/* transaction identifier */
+	     int nip,		/* number of inodes to commit */
+	     struct inode **iplist,	/* list of inode to commit */
+	     int flag)
+{
+	int rc = 0;
+	struct commit cd;
+	struct jfs_log *log;
+	struct tblock *tblk;
+	struct lrd *lrd;
+	struct inode *ip;
+	struct jfs_inode_info *jfs_ip;
+	int k, n;
+	ino_t top;
+	struct super_block *sb;
+
+	jfs_info("txCommit, tid = %d, flag = %d", tid, flag);
+	/* is read-only file system ? */
+	if (isReadOnly(iplist[0])) {
+		rc = -EROFS;
+		goto TheEnd;
+	}
+
+	sb = cd.sb = iplist[0]->i_sb;
+	cd.tid = tid;
+
+	if (tid == 0)
+		tid = txBegin(sb, 0);
+	tblk = tid_to_tblock(tid);
+
+	/*
+	 * initialize commit structure
+	 */
+	log = JFS_SBI(sb)->log;
+	cd.log = log;
+
+	/* initialize log record descriptor in commit */
+	lrd = &cd.lrd;
+	lrd->logtid = cpu_to_le32(tblk->logtid);
+	lrd->backchain = 0;
+
+	tblk->xflag |= flag;
+
+	if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
+		tblk->xflag |= COMMIT_LAZY;
+	/*
+	 *	prepare non-journaled objects for commit
+	 *
+	 * flush data pages of non-journaled file
+	 * to prevent the file getting non-initialized disk blocks
+	 * in case of crash.
+	 * (new blocks - )
+	 */
+	cd.iplist = iplist;
+	cd.nip = nip;
+
+	/*
+	 *	acquire transaction lock on (on-disk) inodes
+	 *
+	 * update on-disk inode from in-memory inode
+	 * acquiring transaction locks for AFTER records
+	 * on the on-disk inode of file object
+	 *
+	 * sort the inodes array by inode number in descending order
+	 * to prevent deadlock when acquiring transaction lock
+	 * of on-disk inodes on multiple on-disk inode pages by
+	 * multiple concurrent transactions
+	 */
+	for (k = 0; k < cd.nip; k++) {
+		top = (cd.iplist[k])->i_ino;
+		for (n = k + 1; n < cd.nip; n++) {
+			ip = cd.iplist[n];
+			if (ip->i_ino > top) {
+				top = ip->i_ino;
+				cd.iplist[n] = cd.iplist[k];
+				cd.iplist[k] = ip;
+			}
+		}
+
+		ip = cd.iplist[k];
+		jfs_ip = JFS_IP(ip);
+
+		/*
+		 * BUGBUG - This code has temporarily been removed.  The
+		 * intent is to ensure that any file data is written before
+		 * the metadata is committed to the journal.  This prevents
+		 * uninitialized data from appearing in a file after the
+		 * journal has been replayed.  (The uninitialized data
+		 * could be sensitive data removed by another user.)
+		 *
+		 * The problem now is that we are holding the IWRITELOCK
+		 * on the inode, and calling filemap_fdatawrite on an
+		 * unmapped page will cause a deadlock in jfs_get_block.
+		 *
+		 * The long term solution is to pare down the use of
+		 * IWRITELOCK.  We are currently holding it too long.
+		 * We could also be smarter about which data pages need
+		 * to be written before the transaction is committed and
+		 * when we don't need to worry about it at all.
+		 *
+		 * if ((!S_ISDIR(ip->i_mode))
+		 *    && (tblk->flag & COMMIT_DELETE) == 0)
+		 *	filemap_write_and_wait(ip->i_mapping);
+		 */
+
+		/*
+		 * Mark inode as not dirty.  It will still be on the dirty
+		 * inode list, but we'll know not to commit it again unless
+		 * it gets marked dirty again
+		 */
+		clear_cflag(COMMIT_Dirty, ip);
+
+		/* inherit anonymous tlock(s) of inode */
+		if (jfs_ip->atlhead) {
+			lid_to_tlock(jfs_ip->atltail)->next = tblk->next;
+			tblk->next = jfs_ip->atlhead;
+			if (!tblk->last)
+				tblk->last = jfs_ip->atltail;
+			jfs_ip->atlhead = jfs_ip->atltail = 0;
+			TXN_LOCK();
+			list_del_init(&jfs_ip->anon_inode_list);
+			TXN_UNLOCK();
+		}
+
+		/*
+		 * acquire transaction lock on on-disk inode page
+		 * (become first tlock of the tblk's tlock list)
+		 */
+		if (((rc = diWrite(tid, ip))))
+			goto out;
+	}
+
+	/*
+	 *	write log records from transaction locks
+	 *
+	 * txUpdateMap() resets XAD_NEW in XAD.
+	 */
+	if ((rc = txLog(log, tblk, &cd)))
+		goto TheEnd;
+
+	/*
+	 * Ensure that inode isn't reused before
+	 * lazy commit thread finishes processing
+	 */
+	if (tblk->xflag & COMMIT_DELETE) {
+		ihold(tblk->u.ip);
+		/*
+		 * Avoid a rare deadlock
+		 *
+		 * If the inode is locked, we may be blocked in
+		 * jfs_commit_inode.  If so, we don't want the
+		 * lazy_commit thread doing the last iput() on the inode
+		 * since that may block on the locked inode.  Instead,
+		 * commit the transaction synchronously, so the last iput
+		 * will be done by the calling thread (or later)
+		 */
+		/*
+		 * I believe this code is no longer needed.  Splitting I_LOCK
+		 * into two bits, I_NEW and I_SYNC should prevent this
+		 * deadlock as well.  But since I don't have a JFS testload
+		 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
+		 * Joern
+		 */
+		if (tblk->u.ip->i_state & I_SYNC)
+			tblk->xflag &= ~COMMIT_LAZY;
+	}
+
+	ASSERT((!(tblk->xflag & COMMIT_DELETE)) ||
+	       ((tblk->u.ip->i_nlink == 0) &&
+		!test_cflag(COMMIT_Nolink, tblk->u.ip)));
+
+	/*
+	 *	write COMMIT log record
+	 */
+	lrd->type = cpu_to_le16(LOG_COMMIT);
+	lrd->length = 0;
+	lmLog(log, tblk, lrd, NULL);
+
+	lmGroupCommit(log, tblk);
+
+	/*
+	 *	- transaction is now committed -
+	 */
+
+	/*
+	 * force pages in careful update
+	 * (imap addressing structure update)
+	 */
+	if (flag & COMMIT_FORCE)
+		txForce(tblk);
+
+	/*
+	 *	update allocation map.
+	 *
+	 * update inode allocation map and inode:
+	 * free pager lock on memory object of inode if any.
+	 * update block allocation map.
+	 *
+	 * txUpdateMap() resets XAD_NEW in XAD.
+	 */
+	if (tblk->xflag & COMMIT_FORCE)
+		txUpdateMap(tblk);
+
+	/*
+	 *	free transaction locks and pageout/free pages
+	 */
+	txRelease(tblk);
+
+	if ((tblk->flag & tblkGC_LAZY) == 0)
+		txUnlock(tblk);
+
+
+	/*
+	 *	reset in-memory object state
+	 */
+	for (k = 0; k < cd.nip; k++) {
+		ip = cd.iplist[k];
+		jfs_ip = JFS_IP(ip);
+
+		/*
+		 * reset in-memory inode state
+		 */
+		jfs_ip->bxflag = 0;
+		jfs_ip->blid = 0;
+	}
+
+      out:
+	if (rc != 0)
+		txAbort(tid, 1);
+
+      TheEnd:
+	jfs_info("txCommit: tid = %d, returning %d", tid, rc);
+	return rc;
+}
+
+/*
+ * NAME:	txLog()
+ *
+ * FUNCTION:	Writes AFTER log records for all lines modified
+ *		by tid for segments specified by inodes in comdata.
+ *		Code assumes only WRITELOCKS are recorded in lockwords.
+ *
+ * PARAMETERS:
+ *
+ * RETURN :
+ */
+static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
+{
+	int rc = 0;
+	struct inode *ip;
+	lid_t lid;
+	struct tlock *tlck;
+	struct lrd *lrd = &cd->lrd;
+
+	/*
+	 * write log record(s) for each tlock of transaction,
+	 */
+	for (lid = tblk->next; lid; lid = tlck->next) {
+		tlck = lid_to_tlock(lid);
+
+		tlck->flag |= tlckLOG;
+
+		/* initialize lrd common */
+		ip = tlck->ip;
+		lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate);
+		lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset);
+		lrd->log.redopage.inode = cpu_to_le32(ip->i_ino);
+
+		/* write log record of page from the tlock */
+		switch (tlck->type & tlckTYPE) {
+		case tlckXTREE:
+			xtLog(log, tblk, lrd, tlck);
+			break;
+
+		case tlckDTREE:
+			dtLog(log, tblk, lrd, tlck);
+			break;
+
+		case tlckINODE:
+			diLog(log, tblk, lrd, tlck, cd);
+			break;
+
+		case tlckMAP:
+			mapLog(log, tblk, lrd, tlck);
+			break;
+
+		case tlckDATA:
+			dataLog(log, tblk, lrd, tlck);
+			break;
+
+		default:
+			jfs_err("UFO tlock:0x%p", tlck);
+		}
+	}
+
+	return rc;
+}
+
+/*
+ *	diLog()
+ *
+ * function:	log inode tlock and format maplock to update bmap;
+ */
+static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+		 struct tlock * tlck, struct commit * cd)
+{
+	int rc = 0;
+	struct metapage *mp;
+	pxd_t *pxd;
+	struct pxd_lock *pxdlock;
+
+	mp = tlck->mp;
+
+	/* initialize as REDOPAGE record format */
+	lrd->log.redopage.type = cpu_to_le16(LOG_INODE);
+	lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE);
+
+	pxd = &lrd->log.redopage.pxd;
+
+	/*
+	 *	inode after image
+	 */
+	if (tlck->type & tlckENTRY) {
+		/* log after-image for logredo(): */
+		lrd->type = cpu_to_le16(LOG_REDOPAGE);
+		PXDaddress(pxd, mp->index);
+		PXDlength(pxd,
+			  mp->logical_size >> tblk->sb->s_blocksize_bits);
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+
+		/* mark page as homeward bound */
+		tlck->flag |= tlckWRITEPAGE;
+	} else if (tlck->type & tlckFREE) {
+		/*
+		 *	free inode extent
+		 *
+		 * (pages of the freed inode extent have been invalidated and
+		 * a maplock for free of the extent has been formatted at
+		 * txLock() time);
+		 *
+		 * the tlock had been acquired on the inode allocation map page
+		 * (iag) that specifies the freed extent, even though the map
+		 * page is not itself logged, to prevent pageout of the map
+		 * page before the log;
+		 */
+
+		/* log LOG_NOREDOINOEXT of the freed inode extent for
+		 * logredo() to start NoRedoPage filters, and to update
+		 * imap and bmap for free of the extent;
+		 */
+		lrd->type = cpu_to_le16(LOG_NOREDOINOEXT);
+		/*
+		 * For the LOG_NOREDOINOEXT record, we need
+		 * to pass the IAG number and inode extent
+		 * index (within that IAG) from which the
+		 * the extent being released.  These have been
+		 * passed to us in the iplist[1] and iplist[2].
+		 */
+		lrd->log.noredoinoext.iagnum =
+		    cpu_to_le32((u32) (size_t) cd->iplist[1]);
+		lrd->log.noredoinoext.inoext_idx =
+		    cpu_to_le32((u32) (size_t) cd->iplist[2]);
+
+		pxdlock = (struct pxd_lock *) & tlck->lock;
+		*pxd = pxdlock->pxd;
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+
+		/* update bmap */
+		tlck->flag |= tlckUPDATEMAP;
+
+		/* mark page as homeward bound */
+		tlck->flag |= tlckWRITEPAGE;
+	} else
+		jfs_err("diLog: UFO type tlck:0x%p", tlck);
+#ifdef  _JFS_WIP
+	/*
+	 *	alloc/free external EA extent
+	 *
+	 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
+	 * of the extent has been formatted at txLock() time;
+	 */
+	else {
+		assert(tlck->type & tlckEA);
+
+		/* log LOG_UPDATEMAP for logredo() to update bmap for
+		 * alloc of new (and free of old) external EA extent;
+		 */
+		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+		pxdlock = (struct pxd_lock *) & tlck->lock;
+		nlock = pxdlock->index;
+		for (i = 0; i < nlock; i++, pxdlock++) {
+			if (pxdlock->flag & mlckALLOCPXD)
+				lrd->log.updatemap.type =
+				    cpu_to_le16(LOG_ALLOCPXD);
+			else
+				lrd->log.updatemap.type =
+				    cpu_to_le16(LOG_FREEPXD);
+			lrd->log.updatemap.nxd = cpu_to_le16(1);
+			lrd->log.updatemap.pxd = pxdlock->pxd;
+			lrd->backchain =
+			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+		}
+
+		/* update bmap */
+		tlck->flag |= tlckUPDATEMAP;
+	}
+#endif				/* _JFS_WIP */
+
+	return rc;
+}
+
+/*
+ *	dataLog()
+ *
+ * function:	log data tlock
+ */
+static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+	    struct tlock * tlck)
+{
+	struct metapage *mp;
+	pxd_t *pxd;
+
+	mp = tlck->mp;
+
+	/* initialize as REDOPAGE record format */
+	lrd->log.redopage.type = cpu_to_le16(LOG_DATA);
+	lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE);
+
+	pxd = &lrd->log.redopage.pxd;
+
+	/* log after-image for logredo(): */
+	lrd->type = cpu_to_le16(LOG_REDOPAGE);
+
+	if (jfs_dirtable_inline(tlck->ip)) {
+		/*
+		 * The table has been truncated, we've must have deleted
+		 * the last entry, so don't bother logging this
+		 */
+		mp->lid = 0;
+		grab_metapage(mp);
+		metapage_homeok(mp);
+		discard_metapage(mp);
+		tlck->mp = NULL;
+		return 0;
+	}
+
+	PXDaddress(pxd, mp->index);
+	PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);
+
+	lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+
+	/* mark page as homeward bound */
+	tlck->flag |= tlckWRITEPAGE;
+
+	return 0;
+}
+
+/*
+ *	dtLog()
+ *
+ * function:	log dtree tlock and format maplock to update bmap;
+ */
+static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+	   struct tlock * tlck)
+{
+	struct metapage *mp;
+	struct pxd_lock *pxdlock;
+	pxd_t *pxd;
+
+	mp = tlck->mp;
+
+	/* initialize as REDOPAGE/NOREDOPAGE record format */
+	lrd->log.redopage.type = cpu_to_le16(LOG_DTREE);
+	lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE);
+
+	pxd = &lrd->log.redopage.pxd;
+
+	if (tlck->type & tlckBTROOT)
+		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
+
+	/*
+	 *	page extension via relocation: entry insertion;
+	 *	page extension in-place: entry insertion;
+	 *	new right page from page split, reinitialized in-line
+	 *	root from root page split: entry insertion;
+	 */
+	if (tlck->type & (tlckNEW | tlckEXTEND)) {
+		/* log after-image of the new page for logredo():
+		 * mark log (LOG_NEW) for logredo() to initialize
+		 * freelist and update bmap for alloc of the new page;
+		 */
+		lrd->type = cpu_to_le16(LOG_REDOPAGE);
+		if (tlck->type & tlckEXTEND)
+			lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND);
+		else
+			lrd->log.redopage.type |= cpu_to_le16(LOG_NEW);
+		PXDaddress(pxd, mp->index);
+		PXDlength(pxd,
+			  mp->logical_size >> tblk->sb->s_blocksize_bits);
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+
+		/* format a maplock for txUpdateMap() to update bPMAP for
+		 * alloc of the new page;
+		 */
+		if (tlck->type & tlckBTROOT)
+			return;
+		tlck->flag |= tlckUPDATEMAP;
+		pxdlock = (struct pxd_lock *) & tlck->lock;
+		pxdlock->flag = mlckALLOCPXD;
+		pxdlock->pxd = *pxd;
+
+		pxdlock->index = 1;
+
+		/* mark page as homeward bound */
+		tlck->flag |= tlckWRITEPAGE;
+		return;
+	}
+
+	/*
+	 *	entry insertion/deletion,
+	 *	sibling page link update (old right page before split);
+	 */
+	if (tlck->type & (tlckENTRY | tlckRELINK)) {
+		/* log after-image for logredo(): */
+		lrd->type = cpu_to_le16(LOG_REDOPAGE);
+		PXDaddress(pxd, mp->index);
+		PXDlength(pxd,
+			  mp->logical_size >> tblk->sb->s_blocksize_bits);
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+
+		/* mark page as homeward bound */
+		tlck->flag |= tlckWRITEPAGE;
+		return;
+	}
+
+	/*
+	 *	page deletion: page has been invalidated
+	 *	page relocation: source extent
+	 *
+	 *	a maplock for free of the page has been formatted
+	 *	at txLock() time);
+	 */
+	if (tlck->type & (tlckFREE | tlckRELOCATE)) {
+		/* log LOG_NOREDOPAGE of the deleted page for logredo()
+		 * to start NoRedoPage filter and to update bmap for free
+		 * of the deletd page
+		 */
+		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
+		pxdlock = (struct pxd_lock *) & tlck->lock;
+		*pxd = pxdlock->pxd;
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+
+		/* a maplock for txUpdateMap() for free of the page
+		 * has been formatted at txLock() time;
+		 */
+		tlck->flag |= tlckUPDATEMAP;
+	}
+	return;
+}
+
+/*
+ *	xtLog()
+ *
+ * function:	log xtree tlock and format maplock to update bmap;
+ */
+static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+	   struct tlock * tlck)
+{
+	struct inode *ip;
+	struct metapage *mp;
+	xtpage_t *p;
+	struct xtlock *xtlck;
+	struct maplock *maplock;
+	struct xdlistlock *xadlock;
+	struct pxd_lock *pxdlock;
+	pxd_t *page_pxd;
+	int next, lwm, hwm;
+
+	ip = tlck->ip;
+	mp = tlck->mp;
+
+	/* initialize as REDOPAGE/NOREDOPAGE record format */
+	lrd->log.redopage.type = cpu_to_le16(LOG_XTREE);
+	lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE);
+
+	page_pxd = &lrd->log.redopage.pxd;
+
+	if (tlck->type & tlckBTROOT) {
+		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
+		p = &JFS_IP(ip)->i_xtroot;
+		if (S_ISDIR(ip->i_mode))
+			lrd->log.redopage.type |=
+			    cpu_to_le16(LOG_DIR_XTREE);
+	} else
+		p = (xtpage_t *) mp->data;
+	next = le16_to_cpu(p->header.nextindex);
+
+	xtlck = (struct xtlock *) & tlck->lock;
+
+	maplock = (struct maplock *) & tlck->lock;
+	xadlock = (struct xdlistlock *) maplock;
+
+	/*
+	 *	entry insertion/extension;
+	 *	sibling page link update (old right page before split);
+	 */
+	if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
+		/* log after-image for logredo():
+		 * logredo() will update bmap for alloc of new/extended
+		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
+		 * after-image of XADlist;
+		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
+		 * applying the after-image to the meta-data page.
+		 */
+		lrd->type = cpu_to_le16(LOG_REDOPAGE);
+		PXDaddress(page_pxd, mp->index);
+		PXDlength(page_pxd,
+			  mp->logical_size >> tblk->sb->s_blocksize_bits);
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+
+		/* format a maplock for txUpdateMap() to update bPMAP
+		 * for alloc of new/extended extents of XAD[lwm:next)
+		 * from the page itself;
+		 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
+		 */
+		lwm = xtlck->lwm.offset;
+		if (lwm == 0)
+			lwm = XTPAGEMAXSLOT;
+
+		if (lwm == next)
+			goto out;
+		if (lwm > next) {
+			jfs_err("xtLog: lwm > next\n");
+			goto out;
+		}
+		tlck->flag |= tlckUPDATEMAP;
+		xadlock->flag = mlckALLOCXADLIST;
+		xadlock->count = next - lwm;
+		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
+			int i;
+			pxd_t *pxd;
+			/*
+			 * Lazy commit may allow xtree to be modified before
+			 * txUpdateMap runs.  Copy xad into linelock to
+			 * preserve correct data.
+			 *
+			 * We can fit twice as may pxd's as xads in the lock
+			 */
+			xadlock->flag = mlckALLOCPXDLIST;
+			pxd = xadlock->xdlist = &xtlck->pxdlock;
+			for (i = 0; i < xadlock->count; i++) {
+				PXDaddress(pxd, addressXAD(&p->xad[lwm + i]));
+				PXDlength(pxd, lengthXAD(&p->xad[lwm + i]));
+				p->xad[lwm + i].flag &=
+				    ~(XAD_NEW | XAD_EXTENDED);
+				pxd++;
+			}
+		} else {
+			/*
+			 * xdlist will point to into inode's xtree, ensure
+			 * that transaction is not committed lazily.
+			 */
+			xadlock->flag = mlckALLOCXADLIST;
+			xadlock->xdlist = &p->xad[lwm];
+			tblk->xflag &= ~COMMIT_LAZY;
+		}
+		jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d "
+			 "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count);
+
+		maplock->index = 1;
+
+	      out:
+		/* mark page as homeward bound */
+		tlck->flag |= tlckWRITEPAGE;
+
+		return;
+	}
+
+	/*
+	 *	page deletion: file deletion/truncation (ref. xtTruncate())
+	 *
+	 * (page will be invalidated after log is written and bmap
+	 * is updated from the page);
+	 */
+	if (tlck->type & tlckFREE) {
+		/* LOG_NOREDOPAGE log for NoRedoPage filter:
+		 * if page free from file delete, NoRedoFile filter from
+		 * inode image of zero link count will subsume NoRedoPage
+		 * filters for each page;
+		 * if page free from file truncattion, write NoRedoPage
+		 * filter;
+		 *
+		 * upadte of block allocation map for the page itself:
+		 * if page free from deletion and truncation, LOG_UPDATEMAP
+		 * log for the page itself is generated from processing
+		 * its parent page xad entries;
+		 */
+		/* if page free from file truncation, log LOG_NOREDOPAGE
+		 * of the deleted page for logredo() to start NoRedoPage
+		 * filter for the page;
+		 */
+		if (tblk->xflag & COMMIT_TRUNCATE) {
+			/* write NOREDOPAGE for the page */
+			lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
+			PXDaddress(page_pxd, mp->index);
+			PXDlength(page_pxd,
+				  mp->logical_size >> tblk->sb->
+				  s_blocksize_bits);
+			lrd->backchain =
+			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+
+			if (tlck->type & tlckBTROOT) {
+				/* Empty xtree must be logged */
+				lrd->type = cpu_to_le16(LOG_REDOPAGE);
+				lrd->backchain =
+				    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+			}
+		}
+
+		/* init LOG_UPDATEMAP of the freed extents
+		 * XAD[XTENTRYSTART:hwm) from the deleted page itself
+		 * for logredo() to update bmap;
+		 */
+		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST);
+		xtlck = (struct xtlock *) & tlck->lock;
+		hwm = xtlck->hwm.offset;
+		lrd->log.updatemap.nxd =
+		    cpu_to_le16(hwm - XTENTRYSTART + 1);
+		/* reformat linelock for lmLog() */
+		xtlck->header.offset = XTENTRYSTART;
+		xtlck->header.length = hwm - XTENTRYSTART + 1;
+		xtlck->index = 1;
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+
+		/* format a maplock for txUpdateMap() to update bmap
+		 * to free extents of XAD[XTENTRYSTART:hwm) from the
+		 * deleted page itself;
+		 */
+		tlck->flag |= tlckUPDATEMAP;
+		xadlock->count = hwm - XTENTRYSTART + 1;
+		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
+			int i;
+			pxd_t *pxd;
+			/*
+			 * Lazy commit may allow xtree to be modified before
+			 * txUpdateMap runs.  Copy xad into linelock to
+			 * preserve correct data.
+			 *
+			 * We can fit twice as may pxd's as xads in the lock
+			 */
+			xadlock->flag = mlckFREEPXDLIST;
+			pxd = xadlock->xdlist = &xtlck->pxdlock;
+			for (i = 0; i < xadlock->count; i++) {
+				PXDaddress(pxd,
+					addressXAD(&p->xad[XTENTRYSTART + i]));
+				PXDlength(pxd,
+					lengthXAD(&p->xad[XTENTRYSTART + i]));
+				pxd++;
+			}
+		} else {
+			/*
+			 * xdlist will point to into inode's xtree, ensure
+			 * that transaction is not committed lazily.
+			 */
+			xadlock->flag = mlckFREEXADLIST;
+			xadlock->xdlist = &p->xad[XTENTRYSTART];
+			tblk->xflag &= ~COMMIT_LAZY;
+		}
+		jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2",
+			 tlck->ip, mp, xadlock->count);
+
+		maplock->index = 1;
+
+		/* mark page as invalid */
+		if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode))
+		    && !(tlck->type & tlckBTROOT))
+			tlck->flag |= tlckFREEPAGE;
+		/*
+		   else (tblk->xflag & COMMIT_PMAP)
+		   ? release the page;
+		 */
+		return;
+	}
+
+	/*
+	 *	page/entry truncation: file truncation (ref. xtTruncate())
+	 *
+	 *	|----------+------+------+---------------|
+	 *		   |      |      |
+	 *		   |      |     hwm - hwm before truncation
+	 *		   |     next - truncation point
+	 *		  lwm - lwm before truncation
+	 * header ?
+	 */
+	if (tlck->type & tlckTRUNCATE) {
+		/* This odd declaration suppresses a bogus gcc warning */
+		pxd_t pxd = pxd;	/* truncated extent of xad */
+		int twm;
+
+		/*
+		 * For truncation the entire linelock may be used, so it would
+		 * be difficult to store xad list in linelock itself.
+		 * Therefore, we'll just force transaction to be committed
+		 * synchronously, so that xtree pages won't be changed before
+		 * txUpdateMap runs.
+		 */
+		tblk->xflag &= ~COMMIT_LAZY;
+		lwm = xtlck->lwm.offset;
+		if (lwm == 0)
+			lwm = XTPAGEMAXSLOT;
+		hwm = xtlck->hwm.offset;
+		twm = xtlck->twm.offset;
+
+		/*
+		 *	write log records
+		 */
+		/* log after-image for logredo():
+		 *
+		 * logredo() will update bmap for alloc of new/extended
+		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
+		 * after-image of XADlist;
+		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
+		 * applying the after-image to the meta-data page.
+		 */
+		lrd->type = cpu_to_le16(LOG_REDOPAGE);
+		PXDaddress(page_pxd, mp->index);
+		PXDlength(page_pxd,
+			  mp->logical_size >> tblk->sb->s_blocksize_bits);
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+
+		/*
+		 * truncate entry XAD[twm == next - 1]:
+		 */
+		if (twm == next - 1) {
+			/* init LOG_UPDATEMAP for logredo() to update bmap for
+			 * free of truncated delta extent of the truncated
+			 * entry XAD[next - 1]:
+			 * (xtlck->pxdlock = truncated delta extent);
+			 */
+			pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
+			/* assert(pxdlock->type & tlckTRUNCATE); */
+			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+			lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
+			lrd->log.updatemap.nxd = cpu_to_le16(1);
+			lrd->log.updatemap.pxd = pxdlock->pxd;
+			pxd = pxdlock->pxd;	/* save to format maplock */
+			lrd->backchain =
+			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+		}
+
+		/*
+		 * free entries XAD[next:hwm]:
+		 */
+		if (hwm >= next) {
+			/* init LOG_UPDATEMAP of the freed extents
+			 * XAD[next:hwm] from the deleted page itself
+			 * for logredo() to update bmap;
+			 */
+			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+			lrd->log.updatemap.type =
+			    cpu_to_le16(LOG_FREEXADLIST);
+			xtlck = (struct xtlock *) & tlck->lock;
+			hwm = xtlck->hwm.offset;
+			lrd->log.updatemap.nxd =
+			    cpu_to_le16(hwm - next + 1);
+			/* reformat linelock for lmLog() */
+			xtlck->header.offset = next;
+			xtlck->header.length = hwm - next + 1;
+			xtlck->index = 1;
+			lrd->backchain =
+			    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+		}
+
+		/*
+		 *	format maplock(s) for txUpdateMap() to update bmap
+		 */
+		maplock->index = 0;
+
+		/*
+		 * allocate entries XAD[lwm:next):
+		 */
+		if (lwm < next) {
+			/* format a maplock for txUpdateMap() to update bPMAP
+			 * for alloc of new/extended extents of XAD[lwm:next)
+			 * from the page itself;
+			 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
+			 */
+			tlck->flag |= tlckUPDATEMAP;
+			xadlock->flag = mlckALLOCXADLIST;
+			xadlock->count = next - lwm;
+			xadlock->xdlist = &p->xad[lwm];
+
+			jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d "
+				 "lwm:%d next:%d",
+				 tlck->ip, mp, xadlock->count, lwm, next);
+			maplock->index++;
+			xadlock++;
+		}
+
+		/*
+		 * truncate entry XAD[twm == next - 1]:
+		 */
+		if (twm == next - 1) {
+			/* format a maplock for txUpdateMap() to update bmap
+			 * to free truncated delta extent of the truncated
+			 * entry XAD[next - 1];
+			 * (xtlck->pxdlock = truncated delta extent);
+			 */
+			tlck->flag |= tlckUPDATEMAP;
+			pxdlock = (struct pxd_lock *) xadlock;
+			pxdlock->flag = mlckFREEPXD;
+			pxdlock->count = 1;
+			pxdlock->pxd = pxd;
+
+			jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d "
+				 "hwm:%d", ip, mp, pxdlock->count, hwm);
+			maplock->index++;
+			xadlock++;
+		}
+
+		/*
+		 * free entries XAD[next:hwm]:
+		 */
+		if (hwm >= next) {
+			/* format a maplock for txUpdateMap() to update bmap
+			 * to free extents of XAD[next:hwm] from thedeleted
+			 * page itself;
+			 */
+			tlck->flag |= tlckUPDATEMAP;
+			xadlock->flag = mlckFREEXADLIST;
+			xadlock->count = hwm - next + 1;
+			xadlock->xdlist = &p->xad[next];
+
+			jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d "
+				 "next:%d hwm:%d",
+				 tlck->ip, mp, xadlock->count, next, hwm);
+			maplock->index++;
+		}
+
+		/* mark page as homeward bound */
+		tlck->flag |= tlckWRITEPAGE;
+	}
+	return;
+}
+
+/*
+ *	mapLog()
+ *
+ * function:	log from maplock of freed data extents;
+ */
+static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+		   struct tlock * tlck)
+{
+	struct pxd_lock *pxdlock;
+	int i, nlock;
+	pxd_t *pxd;
+
+	/*
+	 *	page relocation: free the source page extent
+	 *
+	 * a maplock for txUpdateMap() for free of the page
+	 * has been formatted at txLock() time saving the src
+	 * relocated page address;
+	 */
+	if (tlck->type & tlckRELOCATE) {
+		/* log LOG_NOREDOPAGE of the old relocated page
+		 * for logredo() to start NoRedoPage filter;
+		 */
+		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
+		pxdlock = (struct pxd_lock *) & tlck->lock;
+		pxd = &lrd->log.redopage.pxd;
+		*pxd = pxdlock->pxd;
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+
+		/* (N.B. currently, logredo() does NOT update bmap
+		 * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE);
+		 * if page free from relocation, LOG_UPDATEMAP log is
+		 * specifically generated now for logredo()
+		 * to update bmap for free of src relocated page;
+		 * (new flag LOG_RELOCATE may be introduced which will
+		 * inform logredo() to start NORedoPage filter and also
+		 * update block allocation map at the same time, thus
+		 * avoiding an extra log write);
+		 */
+		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
+		lrd->log.updatemap.nxd = cpu_to_le16(1);
+		lrd->log.updatemap.pxd = pxdlock->pxd;
+		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+
+		/* a maplock for txUpdateMap() for free of the page
+		 * has been formatted at txLock() time;
+		 */
+		tlck->flag |= tlckUPDATEMAP;
+		return;
+	}
+	/*
+
+	 * Otherwise it's not a relocate request
+	 *
+	 */
+	else {
+		/* log LOG_UPDATEMAP for logredo() to update bmap for
+		 * free of truncated/relocated delta extent of the data;
+		 * e.g.: external EA extent, relocated/truncated extent
+		 * from xtTailgate();
+		 */
+		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+		pxdlock = (struct pxd_lock *) & tlck->lock;
+		nlock = pxdlock->index;
+		for (i = 0; i < nlock; i++, pxdlock++) {
+			if (pxdlock->flag & mlckALLOCPXD)
+				lrd->log.updatemap.type =
+				    cpu_to_le16(LOG_ALLOCPXD);
+			else
+				lrd->log.updatemap.type =
+				    cpu_to_le16(LOG_FREEPXD);
+			lrd->log.updatemap.nxd = cpu_to_le16(1);
+			lrd->log.updatemap.pxd = pxdlock->pxd;
+			lrd->backchain =
+			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+			jfs_info("mapLog: xaddr:0x%lx xlen:0x%x",
+				 (ulong) addressPXD(&pxdlock->pxd),
+				 lengthPXD(&pxdlock->pxd));
+		}
+
+		/* update bmap */
+		tlck->flag |= tlckUPDATEMAP;
+	}
+}
+
+/*
+ *	txEA()
+ *
+ * function:	acquire maplock for EA/ACL extents or
+ *		set COMMIT_INLINE flag;
+ */
+void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
+{
+	struct tlock *tlck = NULL;
+	struct pxd_lock *maplock = NULL, *pxdlock = NULL;
+
+	/*
+	 * format maplock for alloc of new EA extent
+	 */
+	if (newea) {
+		/* Since the newea could be a completely zeroed entry we need to
+		 * check for the two flags which indicate we should actually
+		 * commit new EA data
+		 */
+		if (newea->flag & DXD_EXTENT) {
+			tlck = txMaplock(tid, ip, tlckMAP);
+			maplock = (struct pxd_lock *) & tlck->lock;
+			pxdlock = (struct pxd_lock *) maplock;
+			pxdlock->flag = mlckALLOCPXD;
+			PXDaddress(&pxdlock->pxd, addressDXD(newea));
+			PXDlength(&pxdlock->pxd, lengthDXD(newea));
+			pxdlock++;
+			maplock->index = 1;
+		} else if (newea->flag & DXD_INLINE) {
+			tlck = NULL;
+
+			set_cflag(COMMIT_Inlineea, ip);
+		}
+	}
+
+	/*
+	 * format maplock for free of old EA extent
+	 */
+	if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) {
+		if (tlck == NULL) {
+			tlck = txMaplock(tid, ip, tlckMAP);
+			maplock = (struct pxd_lock *) & tlck->lock;
+			pxdlock = (struct pxd_lock *) maplock;
+			maplock->index = 0;
+		}
+		pxdlock->flag = mlckFREEPXD;
+		PXDaddress(&pxdlock->pxd, addressDXD(oldea));
+		PXDlength(&pxdlock->pxd, lengthDXD(oldea));
+		maplock->index++;
+	}
+}
+
+/*
+ *	txForce()
+ *
+ * function: synchronously write pages locked by transaction
+ *	     after txLog() but before txUpdateMap();
+ */
+static void txForce(struct tblock * tblk)
+{
+	struct tlock *tlck;
+	lid_t lid, next;
+	struct metapage *mp;
+
+	/*
+	 * reverse the order of transaction tlocks in
+	 * careful update order of address index pages
+	 * (right to left, bottom up)
+	 */
+	tlck = lid_to_tlock(tblk->next);
+	lid = tlck->next;
+	tlck->next = 0;
+	while (lid) {
+		tlck = lid_to_tlock(lid);
+		next = tlck->next;
+		tlck->next = tblk->next;
+		tblk->next = lid;
+		lid = next;
+	}
+
+	/*
+	 * synchronously write the page, and
+	 * hold the page for txUpdateMap();
+	 */
+	for (lid = tblk->next; lid; lid = next) {
+		tlck = lid_to_tlock(lid);
+		next = tlck->next;
+
+		if ((mp = tlck->mp) != NULL &&
+		    (tlck->type & tlckBTROOT) == 0) {
+			assert(mp->xflag & COMMIT_PAGE);
+
+			if (tlck->flag & tlckWRITEPAGE) {
+				tlck->flag &= ~tlckWRITEPAGE;
+
+				/* do not release page to freelist */
+				force_metapage(mp);
+#if 0
+				/*
+				 * The "right" thing to do here is to
+				 * synchronously write the metadata.
+				 * With the current implementation this
+				 * is hard since write_metapage requires
+				 * us to kunmap & remap the page.  If we
+				 * have tlocks pointing into the metadata
+				 * pages, we don't want to do this.  I think
+				 * we can get by with synchronously writing
+				 * the pages when they are released.
+				 */
+				assert(mp->nohomeok);
+				set_bit(META_dirty, &mp->flag);
+				set_bit(META_sync, &mp->flag);
+#endif
+			}
+		}
+	}
+}
+
+/*
+ *	txUpdateMap()
+ *
+ * function:	update persistent allocation map (and working map
+ *		if appropriate);
+ *
+ * parameter:
+ */
+static void txUpdateMap(struct tblock * tblk)
+{
+	struct inode *ip;
+	struct inode *ipimap;
+	lid_t lid;
+	struct tlock *tlck;
+	struct maplock *maplock;
+	struct pxd_lock pxdlock;
+	int maptype;
+	int k, nlock;
+	struct metapage *mp = NULL;
+
+	ipimap = JFS_SBI(tblk->sb)->ipimap;
+
+	maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP;
+
+
+	/*
+	 *	update block allocation map
+	 *
+	 * update allocation state in pmap (and wmap) and
+	 * update lsn of the pmap page;
+	 */
+	/*
+	 * scan each tlock/page of transaction for block allocation/free:
+	 *
+	 * for each tlock/page of transaction, update map.
+	 *  ? are there tlock for pmap and pwmap at the same time ?
+	 */
+	for (lid = tblk->next; lid; lid = tlck->next) {
+		tlck = lid_to_tlock(lid);
+
+		if ((tlck->flag & tlckUPDATEMAP) == 0)
+			continue;
+
+		if (tlck->flag & tlckFREEPAGE) {
+			/*
+			 * Another thread may attempt to reuse freed space
+			 * immediately, so we want to get rid of the metapage
+			 * before anyone else has a chance to get it.
+			 * Lock metapage, update maps, then invalidate
+			 * the metapage.
+			 */
+			mp = tlck->mp;
+			ASSERT(mp->xflag & COMMIT_PAGE);
+			grab_metapage(mp);
+		}
+
+		/*
+		 * extent list:
+		 * . in-line PXD list:
+		 * . out-of-line XAD list:
+		 */
+		maplock = (struct maplock *) & tlck->lock;
+		nlock = maplock->index;
+
+		for (k = 0; k < nlock; k++, maplock++) {
+			/*
+			 * allocate blocks in persistent map:
+			 *
+			 * blocks have been allocated from wmap at alloc time;
+			 */
+			if (maplock->flag & mlckALLOC) {
+				txAllocPMap(ipimap, maplock, tblk);
+			}
+			/*
+			 * free blocks in persistent and working map:
+			 * blocks will be freed in pmap and then in wmap;
+			 *
+			 * ? tblock specifies the PMAP/PWMAP based upon
+			 * transaction
+			 *
+			 * free blocks in persistent map:
+			 * blocks will be freed from wmap at last reference
+			 * release of the object for regular files;
+			 *
+			 * Alway free blocks from both persistent & working
+			 * maps for directories
+			 */
+			else {	/* (maplock->flag & mlckFREE) */
+
+				if (tlck->flag & tlckDIRECTORY)
+					txFreeMap(ipimap, maplock,
+						  tblk, COMMIT_PWMAP);
+				else
+					txFreeMap(ipimap, maplock,
+						  tblk, maptype);
+			}
+		}
+		if (tlck->flag & tlckFREEPAGE) {
+			if (!(tblk->flag & tblkGC_LAZY)) {
+				/* This is equivalent to txRelease */
+				ASSERT(mp->lid == lid);
+				tlck->mp->lid = 0;
+			}
+			assert(mp->nohomeok == 1);
+			metapage_homeok(mp);
+			discard_metapage(mp);
+			tlck->mp = NULL;
+		}
+	}
+	/*
+	 *	update inode allocation map
+	 *
+	 * update allocation state in pmap and
+	 * update lsn of the pmap page;
+	 * update in-memory inode flag/state
+	 *
+	 * unlock mapper/write lock
+	 */
+	if (tblk->xflag & COMMIT_CREATE) {
+		diUpdatePMap(ipimap, tblk->ino, false, tblk);
+		/* update persistent block allocation map
+		 * for the allocation of inode extent;
+		 */
+		pxdlock.flag = mlckALLOCPXD;
+		pxdlock.pxd = tblk->u.ixpxd;
+		pxdlock.index = 1;
+		txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk);
+	} else if (tblk->xflag & COMMIT_DELETE) {
+		ip = tblk->u.ip;
+		diUpdatePMap(ipimap, ip->i_ino, true, tblk);
+		iput(ip);
+	}
+}
+
+/*
+ *	txAllocPMap()
+ *
+ * function: allocate from persistent map;
+ *
+ * parameter:
+ *	ipbmap	-
+ *	malock	-
+ *		xad list:
+ *		pxd:
+ *
+ *	maptype -
+ *		allocate from persistent map;
+ *		free from persistent map;
+ *		(e.g., tmp file - free from working map at releae
+ *		 of last reference);
+ *		free from persistent and working map;
+ *
+ *	lsn	- log sequence number;
+ */
+static void txAllocPMap(struct inode *ip, struct maplock * maplock,
+			struct tblock * tblk)
+{
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct xdlistlock *xadlistlock;
+	xad_t *xad;
+	s64 xaddr;
+	int xlen;
+	struct pxd_lock *pxdlock;
+	struct xdlistlock *pxdlistlock;
+	pxd_t *pxd;
+	int n;
+
+	/*
+	 * allocate from persistent map;
+	 */
+	if (maplock->flag & mlckALLOCXADLIST) {
+		xadlistlock = (struct xdlistlock *) maplock;
+		xad = xadlistlock->xdlist;
+		for (n = 0; n < xadlistlock->count; n++, xad++) {
+			if (xad->flag & (XAD_NEW | XAD_EXTENDED)) {
+				xaddr = addressXAD(xad);
+				xlen = lengthXAD(xad);
+				dbUpdatePMap(ipbmap, false, xaddr,
+					     (s64) xlen, tblk);
+				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+				jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
+					 (ulong) xaddr, xlen);
+			}
+		}
+	} else if (maplock->flag & mlckALLOCPXD) {
+		pxdlock = (struct pxd_lock *) maplock;
+		xaddr = addressPXD(&pxdlock->pxd);
+		xlen = lengthPXD(&pxdlock->pxd);
+		dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk);
+		jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen);
+	} else {		/* (maplock->flag & mlckALLOCPXDLIST) */
+
+		pxdlistlock = (struct xdlistlock *) maplock;
+		pxd = pxdlistlock->xdlist;
+		for (n = 0; n < pxdlistlock->count; n++, pxd++) {
+			xaddr = addressPXD(pxd);
+			xlen = lengthPXD(pxd);
+			dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen,
+				     tblk);
+			jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
+				 (ulong) xaddr, xlen);
+		}
+	}
+}
+
+/*
+ *	txFreeMap()
+ *
+ * function:	free from persistent and/or working map;
+ *
+ * todo: optimization
+ */
+void txFreeMap(struct inode *ip,
+	       struct maplock * maplock, struct tblock * tblk, int maptype)
+{
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct xdlistlock *xadlistlock;
+	xad_t *xad;
+	s64 xaddr;
+	int xlen;
+	struct pxd_lock *pxdlock;
+	struct xdlistlock *pxdlistlock;
+	pxd_t *pxd;
+	int n;
+
+	jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x",
+		 tblk, maplock, maptype);
+
+	/*
+	 * free from persistent map;
+	 */
+	if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) {
+		if (maplock->flag & mlckFREEXADLIST) {
+			xadlistlock = (struct xdlistlock *) maplock;
+			xad = xadlistlock->xdlist;
+			for (n = 0; n < xadlistlock->count; n++, xad++) {
+				if (!(xad->flag & XAD_NEW)) {
+					xaddr = addressXAD(xad);
+					xlen = lengthXAD(xad);
+					dbUpdatePMap(ipbmap, true, xaddr,
+						     (s64) xlen, tblk);
+					jfs_info("freePMap: xaddr:0x%lx "
+						 "xlen:%d",
+						 (ulong) xaddr, xlen);
+				}
+			}
+		} else if (maplock->flag & mlckFREEPXD) {
+			pxdlock = (struct pxd_lock *) maplock;
+			xaddr = addressPXD(&pxdlock->pxd);
+			xlen = lengthPXD(&pxdlock->pxd);
+			dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen,
+				     tblk);
+			jfs_info("freePMap: xaddr:0x%lx xlen:%d",
+				 (ulong) xaddr, xlen);
+		} else {	/* (maplock->flag & mlckALLOCPXDLIST) */
+
+			pxdlistlock = (struct xdlistlock *) maplock;
+			pxd = pxdlistlock->xdlist;
+			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
+				xaddr = addressPXD(pxd);
+				xlen = lengthPXD(pxd);
+				dbUpdatePMap(ipbmap, true, xaddr,
+					     (s64) xlen, tblk);
+				jfs_info("freePMap: xaddr:0x%lx xlen:%d",
+					 (ulong) xaddr, xlen);
+			}
+		}
+	}
+
+	/*
+	 * free from working map;
+	 */
+	if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) {
+		if (maplock->flag & mlckFREEXADLIST) {
+			xadlistlock = (struct xdlistlock *) maplock;
+			xad = xadlistlock->xdlist;
+			for (n = 0; n < xadlistlock->count; n++, xad++) {
+				xaddr = addressXAD(xad);
+				xlen = lengthXAD(xad);
+				dbFree(ip, xaddr, (s64) xlen);
+				xad->flag = 0;
+				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
+					 (ulong) xaddr, xlen);
+			}
+		} else if (maplock->flag & mlckFREEPXD) {
+			pxdlock = (struct pxd_lock *) maplock;
+			xaddr = addressPXD(&pxdlock->pxd);
+			xlen = lengthPXD(&pxdlock->pxd);
+			dbFree(ip, xaddr, (s64) xlen);
+			jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
+				 (ulong) xaddr, xlen);
+		} else {	/* (maplock->flag & mlckFREEPXDLIST) */
+
+			pxdlistlock = (struct xdlistlock *) maplock;
+			pxd = pxdlistlock->xdlist;
+			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
+				xaddr = addressPXD(pxd);
+				xlen = lengthPXD(pxd);
+				dbFree(ip, xaddr, (s64) xlen);
+				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
+					 (ulong) xaddr, xlen);
+			}
+		}
+	}
+}
+
+/*
+ *	txFreelock()
+ *
+ * function:	remove tlock from inode anonymous locklist
+ */
+void txFreelock(struct inode *ip)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	struct tlock *xtlck, *tlck;
+	lid_t xlid = 0, lid;
+
+	if (!jfs_ip->atlhead)
+		return;
+
+	TXN_LOCK();
+	xtlck = (struct tlock *) &jfs_ip->atlhead;
+
+	while ((lid = xtlck->next) != 0) {
+		tlck = lid_to_tlock(lid);
+		if (tlck->flag & tlckFREELOCK) {
+			xtlck->next = tlck->next;
+			txLockFree(lid);
+		} else {
+			xtlck = tlck;
+			xlid = lid;
+		}
+	}
+
+	if (jfs_ip->atlhead)
+		jfs_ip->atltail = xlid;
+	else {
+		jfs_ip->atltail = 0;
+		/*
+		 * If inode was on anon_list, remove it
+		 */
+		list_del_init(&jfs_ip->anon_inode_list);
+	}
+	TXN_UNLOCK();
+}
+
+/*
+ *	txAbort()
+ *
+ * function: abort tx before commit;
+ *
+ * frees line-locks and segment locks for all
+ * segments in comdata structure.
+ * Optionally sets state of file-system to FM_DIRTY in super-block.
+ * log age of page-frames in memory for which caller has
+ * are reset to 0 (to avoid logwarap).
+ */
+void txAbort(tid_t tid, int dirty)
+{
+	lid_t lid, next;
+	struct metapage *mp;
+	struct tblock *tblk = tid_to_tblock(tid);
+	struct tlock *tlck;
+
+	/*
+	 * free tlocks of the transaction
+	 */
+	for (lid = tblk->next; lid; lid = next) {
+		tlck = lid_to_tlock(lid);
+		next = tlck->next;
+		mp = tlck->mp;
+		JFS_IP(tlck->ip)->xtlid = 0;
+
+		if (mp) {
+			mp->lid = 0;
+
+			/*
+			 * reset lsn of page to avoid logwarap:
+			 *
+			 * (page may have been previously committed by another
+			 * transaction(s) but has not been paged, i.e.,
+			 * it may be on logsync list even though it has not
+			 * been logged for the current tx.)
+			 */
+			if (mp->xflag & COMMIT_PAGE && mp->lsn)
+				LogSyncRelease(mp);
+		}
+		/* insert tlock at head of freelist */
+		TXN_LOCK();
+		txLockFree(lid);
+		TXN_UNLOCK();
+	}
+
+	/* caller will free the transaction block */
+
+	tblk->next = tblk->last = 0;
+
+	/*
+	 * mark filesystem dirty
+	 */
+	if (dirty)
+		jfs_error(tblk->sb, "\n");
+
+	return;
+}
+
+/*
+ *	txLazyCommit(void)
+ *
+ *	All transactions except those changing ipimap (COMMIT_FORCE) are
+ *	processed by this routine.  This insures that the inode and block
+ *	allocation maps are updated in order.  For synchronous transactions,
+ *	let the user thread finish processing after txUpdateMap() is called.
+ */
+static void txLazyCommit(struct tblock * tblk)
+{
+	struct jfs_log *log;
+
+	while (((tblk->flag & tblkGC_READY) == 0) &&
+	       ((tblk->flag & tblkGC_UNLOCKED) == 0)) {
+		/* We must have gotten ahead of the user thread
+		 */
+		jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk);
+		yield();
+	}
+
+	jfs_info("txLazyCommit: processing tblk 0x%p", tblk);
+
+	txUpdateMap(tblk);
+
+	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
+
+	spin_lock_irq(&log->gclock);	// LOGGC_LOCK
+
+	tblk->flag |= tblkGC_COMMITTED;
+
+	if (tblk->flag & tblkGC_READY)
+		log->gcrtc--;
+
+	wake_up_all(&tblk->gcwait);	// LOGGC_WAKEUP
+
+	/*
+	 * Can't release log->gclock until we've tested tblk->flag
+	 */
+	if (tblk->flag & tblkGC_LAZY) {
+		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
+		txUnlock(tblk);
+		tblk->flag &= ~tblkGC_LAZY;
+		txEnd(tblk - TxBlock);	/* Convert back to tid */
+	} else
+		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
+
+	jfs_info("txLazyCommit: done: tblk = 0x%p", tblk);
+}
+
+/*
+ *	jfs_lazycommit(void)
+ *
+ *	To be run as a kernel daemon.  If lbmIODone is called in an interrupt
+ *	context, or where blocking is not wanted, this routine will process
+ *	committed transactions from the unlock queue.
+ */
+int jfs_lazycommit(void *arg)
+{
+	int WorkDone;
+	struct tblock *tblk;
+	unsigned long flags;
+	struct jfs_sb_info *sbi;
+
+	do {
+		LAZY_LOCK(flags);
+		jfs_commit_thread_waking = 0;	/* OK to wake another thread */
+		while (!list_empty(&TxAnchor.unlock_queue)) {
+			WorkDone = 0;
+			list_for_each_entry(tblk, &TxAnchor.unlock_queue,
+					    cqueue) {
+
+				sbi = JFS_SBI(tblk->sb);
+				/*
+				 * For each volume, the transactions must be
+				 * handled in order.  If another commit thread
+				 * is handling a tblk for this superblock,
+				 * skip it
+				 */
+				if (sbi->commit_state & IN_LAZYCOMMIT)
+					continue;
+
+				sbi->commit_state |= IN_LAZYCOMMIT;
+				WorkDone = 1;
+
+				/*
+				 * Remove transaction from queue
+				 */
+				list_del(&tblk->cqueue);
+
+				LAZY_UNLOCK(flags);
+				txLazyCommit(tblk);
+				LAZY_LOCK(flags);
+
+				sbi->commit_state &= ~IN_LAZYCOMMIT;
+				/*
+				 * Don't continue in the for loop.  (We can't
+				 * anyway, it's unsafe!)  We want to go back to
+				 * the beginning of the list.
+				 */
+				break;
+			}
+
+			/* If there was nothing to do, don't continue */
+			if (!WorkDone)
+				break;
+		}
+		/* In case a wakeup came while all threads were active */
+		jfs_commit_thread_waking = 0;
+
+		if (freezing(current)) {
+			LAZY_UNLOCK(flags);
+			try_to_freeze();
+		} else {
+			DECLARE_WAITQUEUE(wq, current);
+
+			add_wait_queue(&jfs_commit_thread_wait, &wq);
+			set_current_state(TASK_INTERRUPTIBLE);
+			LAZY_UNLOCK(flags);
+			schedule();
+			remove_wait_queue(&jfs_commit_thread_wait, &wq);
+		}
+	} while (!kthread_should_stop());
+
+	if (!list_empty(&TxAnchor.unlock_queue))
+		jfs_err("jfs_lazycommit being killed w/pending transactions!");
+	else
+		jfs_info("jfs_lazycommit being killed\n");
+	return 0;
+}
+
+void txLazyUnlock(struct tblock * tblk)
+{
+	unsigned long flags;
+
+	LAZY_LOCK(flags);
+
+	list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue);
+	/*
+	 * Don't wake up a commit thread if there is already one servicing
+	 * this superblock, or if the last one we woke up hasn't started yet.
+	 */
+	if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) &&
+	    !jfs_commit_thread_waking) {
+		jfs_commit_thread_waking = 1;
+		wake_up(&jfs_commit_thread_wait);
+	}
+	LAZY_UNLOCK(flags);
+}
+
+static void LogSyncRelease(struct metapage * mp)
+{
+	struct jfs_log *log = mp->log;
+
+	assert(mp->nohomeok);
+	assert(log);
+	metapage_homeok(mp);
+}
+
+/*
+ *	txQuiesce
+ *
+ *	Block all new transactions and push anonymous transactions to
+ *	completion
+ *
+ *	This does almost the same thing as jfs_sync below.  We don't
+ *	worry about deadlocking when jfs_tlocks_low is set, since we would
+ *	expect jfs_sync to get us out of that jam.
+ */
+void txQuiesce(struct super_block *sb)
+{
+	struct inode *ip;
+	struct jfs_inode_info *jfs_ip;
+	struct jfs_log *log = JFS_SBI(sb)->log;
+	tid_t tid;
+
+	set_bit(log_QUIESCE, &log->flag);
+
+	TXN_LOCK();
+restart:
+	while (!list_empty(&TxAnchor.anon_list)) {
+		jfs_ip = list_entry(TxAnchor.anon_list.next,
+				    struct jfs_inode_info,
+				    anon_inode_list);
+		ip = &jfs_ip->vfs_inode;
+
+		/*
+		 * inode will be removed from anonymous list
+		 * when it is committed
+		 */
+		TXN_UNLOCK();
+		tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE);
+		mutex_lock(&jfs_ip->commit_mutex);
+		txCommit(tid, 1, &ip, 0);
+		txEnd(tid);
+		mutex_unlock(&jfs_ip->commit_mutex);
+		/*
+		 * Just to be safe.  I don't know how
+		 * long we can run without blocking
+		 */
+		cond_resched();
+		TXN_LOCK();
+	}
+
+	/*
+	 * If jfs_sync is running in parallel, there could be some inodes
+	 * on anon_list2.  Let's check.
+	 */
+	if (!list_empty(&TxAnchor.anon_list2)) {
+		list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
+		INIT_LIST_HEAD(&TxAnchor.anon_list2);
+		goto restart;
+	}
+	TXN_UNLOCK();
+
+	/*
+	 * We may need to kick off the group commit
+	 */
+	jfs_flush_journal(log, 0);
+}
+
+/*
+ * txResume()
+ *
+ * Allows transactions to start again following txQuiesce
+ */
+void txResume(struct super_block *sb)
+{
+	struct jfs_log *log = JFS_SBI(sb)->log;
+
+	clear_bit(log_QUIESCE, &log->flag);
+	TXN_WAKEUP(&log->syncwait);
+}
+
+/*
+ *	jfs_sync(void)
+ *
+ *	To be run as a kernel daemon.  This is awakened when tlocks run low.
+ *	We write any inodes that have anonymous tlocks so they will become
+ *	available.
+ */
+int jfs_sync(void *arg)
+{
+	struct inode *ip;
+	struct jfs_inode_info *jfs_ip;
+	tid_t tid;
+
+	do {
+		/*
+		 * write each inode on the anonymous inode list
+		 */
+		TXN_LOCK();
+		while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) {
+			jfs_ip = list_entry(TxAnchor.anon_list.next,
+					    struct jfs_inode_info,
+					    anon_inode_list);
+			ip = &jfs_ip->vfs_inode;
+
+			if (! igrab(ip)) {
+				/*
+				 * Inode is being freed
+				 */
+				list_del_init(&jfs_ip->anon_inode_list);
+			} else if (mutex_trylock(&jfs_ip->commit_mutex)) {
+				/*
+				 * inode will be removed from anonymous list
+				 * when it is committed
+				 */
+				TXN_UNLOCK();
+				tid = txBegin(ip->i_sb, COMMIT_INODE);
+				txCommit(tid, 1, &ip, 0);
+				txEnd(tid);
+				mutex_unlock(&jfs_ip->commit_mutex);
+
+				iput(ip);
+				/*
+				 * Just to be safe.  I don't know how
+				 * long we can run without blocking
+				 */
+				cond_resched();
+				TXN_LOCK();
+			} else {
+				/* We can't get the commit mutex.  It may
+				 * be held by a thread waiting for tlock's
+				 * so let's not block here.  Save it to
+				 * put back on the anon_list.
+				 */
+
+				/* Move from anon_list to anon_list2 */
+				list_move(&jfs_ip->anon_inode_list,
+					  &TxAnchor.anon_list2);
+
+				TXN_UNLOCK();
+				iput(ip);
+				TXN_LOCK();
+			}
+		}
+		/* Add anon_list2 back to anon_list */
+		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
+
+		if (freezing(current)) {
+			TXN_UNLOCK();
+			try_to_freeze();
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			TXN_UNLOCK();
+			schedule();
+		}
+	} while (!kthread_should_stop());
+
+	jfs_info("jfs_sync being killed");
+	return 0;
+}
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
+static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
+{
+	char *freewait;
+	char *freelockwait;
+	char *lowlockwait;
+
+	freewait =
+	    waitqueue_active(&TxAnchor.freewait) ? "active" : "empty";
+	freelockwait =
+	    waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty";
+	lowlockwait =
+	    waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
+
+	seq_printf(m,
+		       "JFS TxAnchor\n"
+		       "============\n"
+		       "freetid = %d\n"
+		       "freewait = %s\n"
+		       "freelock = %d\n"
+		       "freelockwait = %s\n"
+		       "lowlockwait = %s\n"
+		       "tlocksInUse = %d\n"
+		       "jfs_tlocks_low = %d\n"
+		       "unlock_queue is %sempty\n",
+		       TxAnchor.freetid,
+		       freewait,
+		       TxAnchor.freelock,
+		       freelockwait,
+		       lowlockwait,
+		       TxAnchor.tlocksInUse,
+		       jfs_tlocks_low,
+		       list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
+	return 0;
+}
+
+static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, jfs_txanchor_proc_show, NULL);
+}
+
+const struct file_operations jfs_txanchor_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= jfs_txanchor_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
+static int jfs_txstats_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m,
+		       "JFS TxStats\n"
+		       "===========\n"
+		       "calls to txBegin = %d\n"
+		       "txBegin blocked by sync barrier = %d\n"
+		       "txBegin blocked by tlocks low = %d\n"
+		       "txBegin blocked by no free tid = %d\n"
+		       "calls to txBeginAnon = %d\n"
+		       "txBeginAnon blocked by sync barrier = %d\n"
+		       "txBeginAnon blocked by tlocks low = %d\n"
+		       "calls to txLockAlloc = %d\n"
+		       "tLockAlloc blocked by no free lock = %d\n",
+		       TxStat.txBegin,
+		       TxStat.txBegin_barrier,
+		       TxStat.txBegin_lockslow,
+		       TxStat.txBegin_freetid,
+		       TxStat.txBeginAnon,
+		       TxStat.txBeginAnon_barrier,
+		       TxStat.txBeginAnon_lockslow,
+		       TxStat.txLockAlloc,
+		       TxStat.txLockAlloc_freelock);
+	return 0;
+}
+
+static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, jfs_txstats_proc_show, NULL);
+}
+
+const struct file_operations jfs_txstats_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= jfs_txstats_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
diff --git a/kernel/fs/jfs/jfs_txnmgr.h b/kernel/fs/jfs/jfs_txnmgr.h
new file mode 100644
index 000000000..ab7288937
--- /dev/null
+++ b/kernel/fs/jfs/jfs_txnmgr.h
@@ -0,0 +1,311 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_TXNMGR
+#define _H_JFS_TXNMGR
+
+#include "jfs_logmgr.h"
+
+/*
+ * Hide implementation of TxBlock and TxLock
+ */
+#define tid_to_tblock(tid) (&TxBlock[tid])
+
+#define lid_to_tlock(lid) (&TxLock[lid])
+
+/*
+ *	transaction block
+ */
+struct tblock {
+	/*
+	 * tblock and jbuf_t common area: struct logsyncblk
+	 *
+	 * the following 5 fields are the same as struct logsyncblk
+	 * which is common to tblock and jbuf to form logsynclist
+	 */
+	u16 xflag;		/* tx commit type */
+	u16 flag;		/* tx commit state */
+	lid_t dummy;		/* Must keep structures common */
+	s32 lsn;		/* recovery lsn */
+	struct list_head synclist;	/* logsynclist link */
+
+	/* lock management */
+	struct super_block *sb;	/* super block */
+	lid_t next;		/* index of first tlock of tid */
+	lid_t last;		/* index of last tlock of tid */
+	wait_queue_head_t waitor;	/* tids waiting on this tid */
+
+	/* log management */
+	u32 logtid;		/* log transaction id */
+
+	/* commit management */
+	struct list_head cqueue;	/* commit queue list */
+	s32 clsn;		/* commit lsn */
+	struct lbuf *bp;
+	s32 pn;			/* commit record log page number */
+	s32 eor;		/* commit record eor */
+	wait_queue_head_t gcwait;	/* group commit event list:
+					 * ready transactions wait on this
+					 * event for group commit completion.
+					 */
+	union {
+		struct inode *ip; /* inode being deleted */
+		pxd_t ixpxd;	/* pxd of inode extent for created inode */
+	} u;
+	u32 ino;		/* inode number being created */
+};
+
+extern struct tblock *TxBlock;	/* transaction block table */
+
+/* commit flags: tblk->xflag */
+#define	COMMIT_SYNC	0x0001	/* synchronous commit */
+#define	COMMIT_FORCE	0x0002	/* force pageout at end of commit */
+#define	COMMIT_FLUSH	0x0004	/* init flush at end of commit */
+#define COMMIT_MAP	0x00f0
+#define	COMMIT_PMAP	0x0010	/* update pmap */
+#define	COMMIT_WMAP	0x0020	/* update wmap */
+#define	COMMIT_PWMAP	0x0040	/* update pwmap */
+#define	COMMIT_FREE	0x0f00
+#define	COMMIT_DELETE	0x0100	/* inode delete */
+#define	COMMIT_TRUNCATE	0x0200	/* file truncation */
+#define	COMMIT_CREATE	0x0400	/* inode create */
+#define	COMMIT_LAZY	0x0800	/* lazy commit */
+#define COMMIT_PAGE	0x1000	/* Identifies element as metapage */
+#define COMMIT_INODE	0x2000	/* Identifies element as inode */
+
+/* group commit flags tblk->flag: see jfs_logmgr.h */
+
+/*
+ *	transaction lock
+ */
+struct tlock {
+	lid_t next;		/* 2: index next lockword on tid locklist
+				 *	    next lockword on freelist
+				 */
+	tid_t tid;		/* 2: transaction id holding lock */
+
+	u16 flag;		/* 2: lock control */
+	u16 type;		/* 2: log type */
+
+	struct metapage *mp;	/* 4/8: object page buffer locked */
+	struct inode *ip;	/* 4/8: object */
+	/* (16) */
+
+	s16 lock[24];		/* 48: overlay area */
+};				/* (64) */
+
+extern struct tlock *TxLock;	/* transaction lock table */
+
+/*
+ * tlock flag
+ */
+/* txLock state */
+#define tlckPAGELOCK		0x8000
+#define tlckINODELOCK		0x4000
+#define tlckLINELOCK		0x2000
+#define tlckINLINELOCK		0x1000
+/* lmLog state */
+#define tlckLOG			0x0800
+/* updateMap state */
+#define	tlckUPDATEMAP		0x0080
+#define	tlckDIRECTORY		0x0040
+/* freeLock state */
+#define tlckFREELOCK		0x0008
+#define tlckWRITEPAGE		0x0004
+#define tlckFREEPAGE		0x0002
+
+/*
+ * tlock type
+ */
+#define	tlckTYPE		0xfe00
+#define	tlckINODE		0x8000
+#define	tlckXTREE		0x4000
+#define	tlckDTREE		0x2000
+#define	tlckMAP			0x1000
+#define	tlckEA			0x0800
+#define	tlckACL			0x0400
+#define	tlckDATA		0x0200
+#define	tlckBTROOT		0x0100
+
+#define	tlckOPERATION		0x00ff
+#define tlckGROW		0x0001	/* file grow */
+#define tlckREMOVE		0x0002	/* file delete */
+#define tlckTRUNCATE		0x0004	/* file truncate */
+#define tlckRELOCATE		0x0008	/* file/directory relocate */
+#define tlckENTRY		0x0001	/* directory insert/delete */
+#define tlckEXTEND		0x0002	/* directory extend in-line */
+#define tlckSPLIT		0x0010	/* splited page */
+#define tlckNEW			0x0020	/* new page from split */
+#define tlckFREE		0x0040	/* free page */
+#define tlckRELINK		0x0080	/* update sibling pointer */
+
+/*
+ *	linelock for lmLog()
+ *
+ * note: linelock and its variations are overlaid
+ * at tlock.lock: watch for alignment;
+ */
+struct lv {
+	u8 offset;		/* 1: */
+	u8 length;		/* 1: */
+};				/* (2) */
+
+#define	TLOCKSHORT	20
+#define	TLOCKLONG	28
+
+struct linelock {
+	lid_t next;		/* 2: next linelock */
+
+	s8 maxcnt;		/* 1: */
+	s8 index;		/* 1: */
+
+	u16 flag;		/* 2: */
+	u8 type;		/* 1: */
+	u8 l2linesize;		/* 1: log2 of linesize */
+	/* (8) */
+
+	struct lv lv[20];	/* 40: */
+};				/* (48) */
+
+#define dt_lock	linelock
+
+struct xtlock {
+	lid_t next;		/* 2: */
+
+	s8 maxcnt;		/* 1: */
+	s8 index;		/* 1: */
+
+	u16 flag;		/* 2: */
+	u8 type;		/* 1: */
+	u8 l2linesize;		/* 1: log2 of linesize */
+				/* (8) */
+
+	struct lv header;	/* 2: */
+	struct lv lwm;		/* 2: low water mark */
+	struct lv hwm;		/* 2: high water mark */
+	struct lv twm;		/* 2: */
+				/* (16) */
+
+	s32 pxdlock[8];		/* 32: */
+};				/* (48) */
+
+
+/*
+ *	maplock for txUpdateMap()
+ *
+ * note: maplock and its variations are overlaid
+ * at tlock.lock/linelock: watch for alignment;
+ * N.B. next field may be set by linelock, and should not
+ * be modified by maplock;
+ * N.B. index of the first pxdlock specifies index of next
+ * free maplock (i.e., number of maplock) in the tlock;
+ */
+struct maplock {
+	lid_t next;		/* 2: */
+
+	u8 maxcnt;		/* 2: */
+	u8 index;		/* 2: next free maplock index */
+
+	u16 flag;		/* 2: */
+	u8 type;		/* 1: */
+	u8 count;		/* 1: number of pxd/xad */
+				/* (8) */
+
+	pxd_t pxd;		/* 8: */
+};				/* (16): */
+
+/* maplock flag */
+#define	mlckALLOC		0x00f0
+#define	mlckALLOCXADLIST	0x0080
+#define	mlckALLOCPXDLIST	0x0040
+#define	mlckALLOCXAD		0x0020
+#define	mlckALLOCPXD		0x0010
+#define	mlckFREE		0x000f
+#define	mlckFREEXADLIST		0x0008
+#define	mlckFREEPXDLIST		0x0004
+#define	mlckFREEXAD		0x0002
+#define	mlckFREEPXD		0x0001
+
+#define	pxd_lock	maplock
+
+struct xdlistlock {
+	lid_t next;		/* 2: */
+
+	u8 maxcnt;		/* 2: */
+	u8 index;		/* 2: */
+
+	u16 flag;		/* 2: */
+	u8 type;		/* 1: */
+	u8 count;		/* 1: number of pxd/xad */
+				/* (8) */
+
+	/*
+	 * We need xdlist to be 64 bits (8 bytes), regardless of
+	 * whether void * is 32 or 64 bits
+	 */
+	union {
+		void *_xdlist;	/* pxd/xad list */
+		s64 pad;	/* 8: Force 64-bit xdlist size */
+	} union64;
+};				/* (16): */
+
+#define xdlist union64._xdlist
+
+/*
+ *	commit
+ *
+ * parameter to the commit manager routines
+ */
+struct commit {
+	tid_t tid;		/* tid = index of tblock */
+	int flag;		/* flags */
+	struct jfs_log *log;	/* log */
+	struct super_block *sb;	/* superblock */
+
+	int nip;		/* number of entries in iplist */
+	struct inode **iplist;	/* list of pointers to inodes */
+
+	/* log record descriptor on 64-bit boundary */
+	struct lrd lrd;		/* : log record descriptor */
+};
+
+/*
+ * external declarations
+ */
+extern int jfs_tlocks_low;
+
+extern int txInit(void);
+extern void txExit(void);
+extern struct tlock *txLock(tid_t, struct inode *, struct metapage *, int);
+extern struct tlock *txMaplock(tid_t, struct inode *, int);
+extern int txCommit(tid_t, int, struct inode **, int);
+extern tid_t txBegin(struct super_block *, int);
+extern void txBeginAnon(struct super_block *);
+extern void txEnd(tid_t);
+extern void txAbort(tid_t, int);
+extern struct linelock *txLinelock(struct linelock *);
+extern void txFreeMap(struct inode *, struct maplock *, struct tblock *, int);
+extern void txEA(tid_t, struct inode *, dxd_t *, dxd_t *);
+extern void txFreelock(struct inode *);
+extern int lmLog(struct jfs_log *, struct tblock *, struct lrd *,
+		 struct tlock *);
+extern void txQuiesce(struct super_block *);
+extern void txResume(struct super_block *);
+extern void txLazyUnlock(struct tblock *);
+extern int jfs_lazycommit(void *);
+extern int jfs_sync(void *);
+#endif				/* _H_JFS_TXNMGR */
diff --git a/kernel/fs/jfs/jfs_types.h b/kernel/fs/jfs/jfs_types.h
new file mode 100644
index 000000000..8f602dcb5
--- /dev/null
+++ b/kernel/fs/jfs/jfs_types.h
@@ -0,0 +1,170 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_TYPES
+#define	_H_JFS_TYPES
+
+/*
+ *	jfs_types.h:
+ *
+ * basic type/utility definitions
+ *
+ * note: this header file must be the 1st include file
+ * of JFS include list in all JFS .c file.
+ */
+
+#include <linux/types.h>
+#include <linux/nls.h>
+
+/*
+ * transaction and lock id's
+ *
+ * Don't change these without carefully considering the impact on the
+ * size and alignment of all of the linelock variants
+ */
+typedef u16 tid_t;
+typedef u16 lid_t;
+
+/*
+ * Almost identical to Linux's timespec, but not quite
+ */
+struct timestruc_t {
+	__le32 tv_sec;
+	__le32 tv_nsec;
+};
+
+/*
+ *	handy
+ */
+
+#define LEFTMOSTONE	0x80000000
+#define	HIGHORDER	0x80000000u	/* high order bit on	*/
+#define	ONES		0xffffffffu	/* all bit on		*/
+
+/*
+ *	physical xd (pxd)
+ *
+ *	The leftmost 24 bits of len_addr are the extent length.
+ *	The rightmost 8 bits of len_addr are the most signficant bits of
+ *	the extent address
+ */
+typedef struct {
+	__le32 len_addr;
+	__le32 addr2;
+} pxd_t;
+
+/* xd_t field construction */
+
+static inline void PXDlength(pxd_t *pxd, __u32 len)
+{
+	pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) |
+			cpu_to_le32(len & 0xffffff);
+}
+
+static inline void PXDaddress(pxd_t *pxd, __u64 addr)
+{
+	pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) |
+			cpu_to_le32((addr >> 32)<<24);
+	pxd->addr2 = cpu_to_le32(addr & 0xffffffff);
+}
+
+/* xd_t field extraction */
+static inline __u32 lengthPXD(pxd_t *pxd)
+{
+	return le32_to_cpu((pxd)->len_addr) & 0xffffff;
+}
+
+static inline __u64 addressPXD(pxd_t *pxd)
+{
+	__u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff;
+	return (n << 8) + le32_to_cpu(pxd->addr2);
+}
+
+#define MAXTREEHEIGHT 8
+/* pxd list */
+struct pxdlist {
+	s16 maxnpxd;
+	s16 npxd;
+	pxd_t pxd[MAXTREEHEIGHT];
+};
+
+
+/*
+ *	data extent descriptor (dxd)
+ */
+typedef struct {
+	__u8 flag;	/* 1: flags */
+	__u8 rsrvd[3];
+	__le32 size;		/* 4: size in byte */
+	pxd_t loc;		/* 8: address and length in unit of fsblksize */
+} dxd_t;			/* - 16 - */
+
+/* dxd_t flags */
+#define	DXD_INDEX	0x80	/* B+-tree index */
+#define	DXD_INLINE	0x40	/* in-line data extent */
+#define	DXD_EXTENT	0x20	/* out-of-line single extent */
+#define	DXD_FILE	0x10	/* out-of-line file (inode) */
+#define DXD_CORRUPT	0x08	/* Inconsistency detected */
+
+/* dxd_t field construction
+ */
+#define	DXDlength(dxd, len)	PXDlength(&(dxd)->loc, len)
+#define	DXDaddress(dxd, addr)	PXDaddress(&(dxd)->loc, addr)
+#define	lengthDXD(dxd)	lengthPXD(&(dxd)->loc)
+#define	addressDXD(dxd)	addressPXD(&(dxd)->loc)
+#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
+#define sizeDXD(dxd)	le32_to_cpu((dxd)->size)
+
+/*
+ *	directory entry argument
+ */
+struct component_name {
+	int namlen;
+	wchar_t *name;
+};
+
+
+/*
+ *	DASD limit information - stored in directory inode
+ */
+struct dasd {
+	u8 thresh;		/* Alert Threshold (in percent)		*/
+	u8 delta;		/* Alert Threshold delta (in percent)	*/
+	u8 rsrvd1;
+	u8 limit_hi;		/* DASD limit (in logical blocks)	*/
+	__le32 limit_lo;	/* DASD limit (in logical blocks)	*/
+	u8 rsrvd2[3];
+	u8 used_hi;		/* DASD usage (in logical blocks)	*/
+	__le32 used_lo;		/* DASD usage (in logical blocks)	*/
+};
+
+#define DASDLIMIT(dasdp) \
+	(((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo))
+#define setDASDLIMIT(dasdp, limit)\
+{\
+	(dasdp)->limit_hi = ((u64)limit) >> 32;\
+	(dasdp)->limit_lo = __cpu_to_le32(limit);\
+}
+#define DASDUSED(dasdp) \
+	(((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo))
+#define setDASDUSED(dasdp, used)\
+{\
+	(dasdp)->used_hi = ((u64)used) >> 32;\
+	(dasdp)->used_lo = __cpu_to_le32(used);\
+}
+
+#endif				/* !_H_JFS_TYPES */
diff --git a/kernel/fs/jfs/jfs_umount.c b/kernel/fs/jfs/jfs_umount.c
new file mode 100644
index 000000000..7971f3753
--- /dev/null
+++ b/kernel/fs/jfs/jfs_umount.c
@@ -0,0 +1,168 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ *	jfs_umount.c
+ *
+ * note: file system in transition to aggregate/fileset:
+ * (ref. jfs_mount.c)
+ *
+ * file system unmount is interpreted as mount of the single/only
+ * fileset in the aggregate and, if unmount of the last fileset,
+ * as unmount of the aggerate;
+ */
+
+#include <linux/fs.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+
+/*
+ * NAME:	jfs_umount(vfsp, flags, crp)
+ *
+ * FUNCTION:	vfs_umount()
+ *
+ * PARAMETERS:	vfsp	- virtual file system pointer
+ *		flags	- unmount for shutdown
+ *		crp	- credential
+ *
+ * RETURN :	EBUSY	- device has open files
+ */
+int jfs_umount(struct super_block *sb)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct inode *ipbmap = sbi->ipbmap;
+	struct inode *ipimap = sbi->ipimap;
+	struct inode *ipaimap = sbi->ipaimap;
+	struct inode *ipaimap2 = sbi->ipaimap2;
+	struct jfs_log *log;
+	int rc = 0;
+
+	jfs_info("UnMount JFS: sb:0x%p", sb);
+
+	/*
+	 *	update superblock and close log
+	 *
+	 * if mounted read-write and log based recovery was enabled
+	 */
+	if ((log = sbi->log))
+		/*
+		 * Wait for outstanding transactions to be written to log:
+		 */
+		jfs_flush_journal(log, 2);
+
+	/*
+	 * close fileset inode allocation map (aka fileset inode)
+	 */
+	diUnmount(ipimap, 0);
+
+	diFreeSpecial(ipimap);
+	sbi->ipimap = NULL;
+
+	/*
+	 * close secondary aggregate inode allocation map
+	 */
+	ipaimap2 = sbi->ipaimap2;
+	if (ipaimap2) {
+		diUnmount(ipaimap2, 0);
+		diFreeSpecial(ipaimap2);
+		sbi->ipaimap2 = NULL;
+	}
+
+	/*
+	 * close aggregate inode allocation map
+	 */
+	ipaimap = sbi->ipaimap;
+	diUnmount(ipaimap, 0);
+	diFreeSpecial(ipaimap);
+	sbi->ipaimap = NULL;
+
+	/*
+	 * close aggregate block allocation map
+	 */
+	dbUnmount(ipbmap, 0);
+
+	diFreeSpecial(ipbmap);
+	sbi->ipimap = NULL;
+
+	/*
+	 * Make sure all metadata makes it to disk before we mark
+	 * the superblock as clean
+	 */
+	filemap_write_and_wait(sbi->direct_inode->i_mapping);
+
+	/*
+	 * ensure all file system file pages are propagated to their
+	 * home blocks on disk (and their in-memory buffer pages are
+	 * invalidated) BEFORE updating file system superblock state
+	 * (to signify file system is unmounted cleanly, and thus in
+	 * consistent state) and log superblock active file system
+	 * list (to signify skip logredo()).
+	 */
+	if (log) {		/* log = NULL if read-only mount */
+		updateSuper(sb, FM_CLEAN);
+
+		/*
+		 * close log:
+		 *
+		 * remove file system from log active file system list.
+		 */
+		rc = lmLogClose(sb);
+	}
+	jfs_info("UnMount JFS Complete: rc = %d", rc);
+	return rc;
+}
+
+
+int jfs_umount_rw(struct super_block *sb)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_log *log = sbi->log;
+
+	if (!log)
+		return 0;
+
+	/*
+	 * close log:
+	 *
+	 * remove file system from log active file system list.
+	 */
+	jfs_flush_journal(log, 2);
+
+	/*
+	 * Make sure all metadata makes it to disk
+	 */
+	dbSync(sbi->ipbmap);
+	diSync(sbi->ipimap);
+
+	/*
+	 * Note that we have to do this even if sync_blockdev() will
+	 * do exactly the same a few instructions later:  We can't
+	 * mark the superblock clean before everything is flushed to
+	 * disk.
+	 */
+	filemap_write_and_wait(sbi->direct_inode->i_mapping);
+
+	updateSuper(sb, FM_CLEAN);
+
+	return lmLogClose(sb);
+}
diff --git a/kernel/fs/jfs/jfs_unicode.c b/kernel/fs/jfs/jfs_unicode.c
new file mode 100644
index 000000000..c7de6f5bb
--- /dev/null
+++ b/kernel/fs/jfs/jfs_unicode.c
@@ -0,0 +1,138 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+
+/*
+ * NAME:	jfs_strfromUCS()
+ *
+ * FUNCTION:	Convert little-endian unicode string to character string
+ *
+ */
+int jfs_strfromUCS_le(char *to, const __le16 * from,
+		      int len, struct nls_table *codepage)
+{
+	int i;
+	int outlen = 0;
+	static int warn_again = 5;	/* Only warn up to 5 times total */
+	int warn = !!warn_again;	/* once per string */
+
+	if (codepage) {
+		for (i = 0; (i < len) && from[i]; i++) {
+			int charlen;
+			charlen =
+			    codepage->uni2char(le16_to_cpu(from[i]),
+					       &to[outlen],
+					       NLS_MAX_CHARSET_SIZE);
+			if (charlen > 0)
+				outlen += charlen;
+			else
+				to[outlen++] = '?';
+		}
+	} else {
+		for (i = 0; (i < len) && from[i]; i++) {
+			if (unlikely(le16_to_cpu(from[i]) & 0xff00)) {
+				to[i] = '?';
+				if (unlikely(warn)) {
+					warn--;
+					warn_again--;
+					printk(KERN_ERR
+			"non-latin1 character 0x%x found in JFS file name\n",
+					       le16_to_cpu(from[i]));
+					printk(KERN_ERR
+				"mount with iocharset=utf8 to access\n");
+				}
+
+			}
+			else
+				to[i] = (char) (le16_to_cpu(from[i]));
+		}
+		outlen = i;
+	}
+	to[outlen] = 0;
+	return outlen;
+}
+
+/*
+ * NAME:	jfs_strtoUCS()
+ *
+ * FUNCTION:	Convert character string to unicode string
+ *
+ */
+static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len,
+		struct nls_table *codepage)
+{
+	int charlen;
+	int i;
+
+	if (codepage) {
+		for (i = 0; len && *from; i++, from += charlen, len -= charlen)
+		{
+			charlen = codepage->char2uni(from, len, &to[i]);
+			if (charlen < 1) {
+				jfs_err("jfs_strtoUCS: char2uni returned %d.",
+					charlen);
+				jfs_err("charset = %s, char = 0x%x",
+					codepage->charset, *from);
+				return charlen;
+			}
+		}
+	} else {
+		for (i = 0; (i < len) && from[i]; i++)
+			to[i] = (wchar_t) from[i];
+	}
+
+	to[i] = 0;
+	return i;
+}
+
+/*
+ * NAME:	get_UCSname()
+ *
+ * FUNCTION:	Allocate and translate to unicode string
+ *
+ */
+int get_UCSname(struct component_name * uniName, struct dentry *dentry)
+{
+	struct nls_table *nls_tab = JFS_SBI(dentry->d_sb)->nls_tab;
+	int length = dentry->d_name.len;
+
+	if (length > JFS_NAME_MAX)
+		return -ENAMETOOLONG;
+
+	uniName->name =
+	    kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS);
+
+	if (uniName->name == NULL)
+		return -ENOMEM;
+
+	uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name,
+				       length, nls_tab);
+
+	if (uniName->namlen < 0) {
+		kfree(uniName->name);
+		return uniName->namlen;
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/jfs/jfs_unicode.h b/kernel/fs/jfs/jfs_unicode.h
new file mode 100644
index 000000000..8f0f02cb6
--- /dev/null
+++ b/kernel/fs/jfs/jfs_unicode.h
@@ -0,0 +1,156 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_UNICODE
+#define _H_JFS_UNICODE
+
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+#include "jfs_types.h"
+
+typedef struct {
+	wchar_t start;
+	wchar_t end;
+	signed char *table;
+} UNICASERANGE;
+
+extern signed char UniUpperTable[512];
+extern UNICASERANGE UniUpperRange[];
+extern int get_UCSname(struct component_name *, struct dentry *);
+extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *);
+
+#define free_UCSname(COMP) kfree((COMP)->name)
+
+/*
+ * UniStrcpy:  Copy a string
+ */
+static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2)
+{
+	wchar_t *anchor = ucs1;	/* save the start of result string */
+
+	while ((*ucs1++ = *ucs2++));
+	return anchor;
+}
+
+
+
+/*
+ * UniStrncpy:  Copy length limited string with pad
+ */
+static inline __le16 *UniStrncpy_le(__le16 * ucs1, const __le16 * ucs2,
+				  size_t n)
+{
+	__le16 *anchor = ucs1;
+
+	while (n-- && *ucs2)	/* Copy the strings */
+		*ucs1++ = *ucs2++;
+
+	n++;
+	while (n--)		/* Pad with nulls */
+		*ucs1++ = 0;
+	return anchor;
+}
+
+/*
+ * UniStrncmp_le:  Compare length limited string - native to little-endian
+ */
+static inline int UniStrncmp_le(const wchar_t * ucs1, const __le16 * ucs2,
+				size_t n)
+{
+	if (!n)
+		return 0;	/* Null strings are equal */
+	while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) {
+		ucs1++;
+		ucs2++;
+	}
+	return (int) *ucs1 - (int) __le16_to_cpu(*ucs2);
+}
+
+/*
+ * UniStrncpy_to_le:  Copy length limited string with pad to little-endian
+ */
+static inline __le16 *UniStrncpy_to_le(__le16 * ucs1, const wchar_t * ucs2,
+				       size_t n)
+{
+	__le16 *anchor = ucs1;
+
+	while (n-- && *ucs2)	/* Copy the strings */
+		*ucs1++ = cpu_to_le16(*ucs2++);
+
+	n++;
+	while (n--)		/* Pad with nulls */
+		*ucs1++ = 0;
+	return anchor;
+}
+
+/*
+ * UniStrncpy_from_le:  Copy length limited string with pad from little-endian
+ */
+static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2,
+					  size_t n)
+{
+	wchar_t *anchor = ucs1;
+
+	while (n-- && *ucs2)	/* Copy the strings */
+		*ucs1++ = __le16_to_cpu(*ucs2++);
+
+	n++;
+	while (n--)		/* Pad with nulls */
+		*ucs1++ = 0;
+	return anchor;
+}
+
+/*
+ * UniToupper:  Convert a unicode character to upper case
+ */
+static inline wchar_t UniToupper(wchar_t uc)
+{
+	UNICASERANGE *rp;
+
+	if (uc < sizeof(UniUpperTable)) {	/* Latin characters */
+		return uc + UniUpperTable[uc];	/* Use base tables */
+	} else {
+		rp = UniUpperRange;	/* Use range tables */
+		while (rp->start) {
+			if (uc < rp->start)	/* Before start of range */
+				return uc;	/* Uppercase = input */
+			if (uc <= rp->end)	/* In range */
+				return uc + rp->table[uc - rp->start];
+			rp++;	/* Try next range */
+		}
+	}
+	return uc;		/* Past last range */
+}
+
+
+/*
+ * UniStrupr:  Upper case a unicode string
+ */
+static inline wchar_t *UniStrupr(wchar_t * upin)
+{
+	wchar_t *up;
+
+	up = upin;
+	while (*up) {		/* For all characters */
+		*up = UniToupper(*up);
+		up++;
+	}
+	return upin;		/* Return input pointer */
+}
+
+#endif				/* !_H_JFS_UNICODE */
diff --git a/kernel/fs/jfs/jfs_uniupr.c b/kernel/fs/jfs/jfs_uniupr.c
new file mode 100644
index 000000000..cfe50666d
--- /dev/null
+++ b/kernel/fs/jfs/jfs_uniupr.c
@@ -0,0 +1,134 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include "jfs_unicode.h"
+
+/*
+ * Latin upper case
+ */
+signed char UniUpperTable[512] = {
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 000-00f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 010-01f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 020-02f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 030-03f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 040-04f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 050-05f */
+   0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,  0,  0,  0,  0,  0, /* 070-07f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 080-08f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 090-09f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0a0-0af */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0b0-0bf */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0c0-0cf */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0d0-0df */
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */
+ -32,-32,-32,-32,-32,-32,-32,  0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 100-10f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 110-11f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 120-12f */
+   0,  0,  0, -1,  0, -1,  0, -1,  0,  0, -1,  0, -1,  0, -1,  0, /* 130-13f */
+  -1,  0, -1,  0, -1,  0, -1,  0, -1,  0,  0, -1,  0, -1,  0, -1, /* 140-14f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 150-15f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 160-16f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0,  0, -1,  0, -1,  0, -1,  0, /* 170-17f */
+   0,  0,  0, -1,  0, -1,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, /* 180-18f */
+   0,  0, -1,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0, /* 190-19f */
+   0, -1,  0, -1,  0, -1,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0, /* 1a0-1af */
+  -1,  0,  0,  0, -1,  0, -1,  0,  0, -1,  0,  0,  0, -1,  0,  0, /* 1b0-1bf */
+   0,  0,  0,  0,  0, -1, -2,  0, -1, -2,  0, -1, -2,  0, -1,  0, /* 1c0-1cf */
+  -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,-79,  0, -1, /* 1d0-1df */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e0-1ef */
+   0,  0, -1, -2,  0, -1,  0,  0,  0, -1,  0, -1,  0, -1,  0, -1, /* 1f0-1ff */
+};
+
+/* Upper case range - Greek */
+static signed char UniCaseRangeU03a0[47] = {
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,-38,-37,-37,-37, /* 3a0-3af */
+   0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */
+ -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63,
+};
+
+/* Upper case range - Cyrillic */
+static signed char UniCaseRangeU0430[48] = {
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */
+   0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,  0,-80,-80, /* 450-45f */
+};
+
+/* Upper case range - Extended cyrillic */
+static signed char UniCaseRangeU0490[61] = {
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 490-49f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 4a0-4af */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 4b0-4bf */
+   0,  0, -1,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,
+};
+
+/* Upper case range - Extended latin and greek */
+static signed char UniCaseRangeU1e00[509] = {
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e00-1e0f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e10-1e1f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e20-1e2f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e30-1e3f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e40-1e4f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e50-1e5f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e60-1e6f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e70-1e7f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e80-1e8f */
+   0, -1,  0, -1,  0, -1,  0,  0,  0,  0,  0,-59,  0, -1,  0, -1, /* 1e90-1e9f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1ea0-1eaf */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1eb0-1ebf */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1ec0-1ecf */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1ed0-1edf */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1ee0-1eef */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0, /* 1ef0-1eff */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f00-1f0f */
+   8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f10-1f1f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f20-1f2f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f30-1f3f */
+   8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f40-1f4f */
+   0,  8,  0,  8,  0,  8,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f50-1f5f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f60-1f6f */
+  74, 74, 86, 86, 86, 86,100,100,  0,  0,112,112,126,126,  0,  0, /* 1f70-1f7f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f80-1f8f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f90-1f9f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fa0-1faf */
+   8,  8,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fb0-1fbf */
+   0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fc0-1fcf */
+   8,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fd0-1fdf */
+   8,  8,  0,  0,  0,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fe0-1fef */
+   0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+};
+
+/* Upper case range - Wide latin */
+static signed char UniCaseRangeUff40[27] = {
+   0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,
+};
+
+/*
+ * Upper Case Range
+ */
+UNICASERANGE UniUpperRange[] = {
+    { 0x03a0,  0x03ce,  UniCaseRangeU03a0 },
+    { 0x0430,  0x045f,  UniCaseRangeU0430 },
+    { 0x0490,  0x04cc,  UniCaseRangeU0490 },
+    { 0x1e00,  0x1ffc,  UniCaseRangeU1e00 },
+    { 0xff40,  0xff5a,  UniCaseRangeUff40 },
+    { 0 }
+};
diff --git a/kernel/fs/jfs/jfs_xattr.h b/kernel/fs/jfs/jfs_xattr.h
new file mode 100644
index 000000000..e8d717dab
--- /dev/null
+++ b/kernel/fs/jfs/jfs_xattr.h
@@ -0,0 +1,77 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef H_JFS_XATTR
+#define H_JFS_XATTR
+
+/*
+ * jfs_ea_list describe the on-disk format of the extended attributes.
+ * I know the null-terminator is redundant since namelen is stored, but
+ * I am maintaining compatibility with OS/2 where possible.
+ */
+struct jfs_ea {
+	u8 flag;	/* Unused? */
+	u8 namelen;	/* Length of name */
+	__le16 valuelen;	/* Length of value */
+	char name[0];	/* Attribute name (includes null-terminator) */
+};			/* Value immediately follows name */
+
+struct jfs_ea_list {
+	__le32 size;		/* overall size */
+	struct jfs_ea ea[0];	/* Variable length list */
+};
+
+/* Macros for defining maxiumum number of bytes supported for EAs */
+#define MAXEASIZE	65535
+#define MAXEALISTSIZE	MAXEASIZE
+
+/*
+ * some macros for dealing with variable length EA lists.
+ */
+#define EA_SIZE(ea) \
+	(sizeof (struct jfs_ea) + (ea)->namelen + 1 + \
+	 le16_to_cpu((ea)->valuelen))
+#define	NEXT_EA(ea) ((struct jfs_ea *) (((char *) (ea)) + (EA_SIZE (ea))))
+#define	FIRST_EA(ealist) ((ealist)->ea)
+#define	EALIST_SIZE(ealist) le32_to_cpu((ealist)->size)
+#define	END_EALIST(ealist) \
+	((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist)))
+
+extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *,
+			  size_t, int);
+extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
+			int);
+extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
+extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
+extern int jfs_removexattr(struct dentry *, const char *);
+
+extern const struct xattr_handler *jfs_xattr_handlers[];
+
+#ifdef CONFIG_JFS_SECURITY
+extern int jfs_init_security(tid_t, struct inode *, struct inode *,
+			     const struct qstr *);
+#else
+static inline int jfs_init_security(tid_t tid, struct inode *inode,
+				    struct inode *dir, const struct qstr *qstr)
+{
+	return 0;
+}
+#endif
+
+#endif	/* H_JFS_XATTR */
diff --git a/kernel/fs/jfs/jfs_xtree.c b/kernel/fs/jfs/jfs_xtree.c
new file mode 100644
index 000000000..5ad774886
--- /dev/null
+++ b/kernel/fs/jfs/jfs_xtree.c
@@ -0,0 +1,3903 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2005
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ *	jfs_xtree.c: extent allocation descriptor B+-tree manager
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dmap.h"
+#include "jfs_dinode.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * xtree local flag
+ */
+#define XT_INSERT	0x00000001
+
+/*
+ *	xtree key/entry comparison: extent offset
+ *
+ * return:
+ *	-1: k < start of extent
+ *	 0: start_of_extent <= k <= end_of_extent
+ *	 1: k > end_of_extent
+ */
+#define XT_CMP(CMP, K, X, OFFSET64)\
+{\
+	OFFSET64 = offsetXAD(X);\
+	(CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
+		((K) < OFFSET64) ? -1 : 0;\
+}
+
+/* write a xad entry */
+#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
+{\
+	(XAD)->flag = (FLAG);\
+	XADoffset((XAD), (OFF));\
+	XADlength((XAD), (LEN));\
+	XADaddress((XAD), (ADDR));\
+}
+
+#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
+
+/* get page buffer for specified block address */
+/* ToDo: Replace this ugly macro with a function */
+#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)				\
+do {									\
+	BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot);	\
+	if (!(RC)) {							\
+		if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
+		    (le16_to_cpu((P)->header.nextindex) >		\
+		     le16_to_cpu((P)->header.maxentry)) ||		\
+		    (le16_to_cpu((P)->header.maxentry) >		\
+		     (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
+			jfs_error((IP)->i_sb,				\
+				  "XT_GETPAGE: xtree page corrupt\n");	\
+			BT_PUTPAGE(MP);					\
+			MP = NULL;					\
+			RC = -EIO;					\
+		}							\
+	}								\
+} while (0)
+
+/* for consistency */
+#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
+
+#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
+	BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
+/* xtree entry parameter descriptor */
+struct xtsplit {
+	struct metapage *mp;
+	s16 index;
+	u8 flag;
+	s64 off;
+	s64 addr;
+	int len;
+	struct pxdlist *pxdlist;
+};
+
+
+/*
+ *	statistics
+ */
+#ifdef CONFIG_JFS_STATISTICS
+static struct {
+	uint search;
+	uint fastSearch;
+	uint split;
+} xtStat;
+#endif
+
+
+/*
+ * forward references
+ */
+static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp,
+		    struct btstack * btstack, int flag);
+
+static int xtSplitUp(tid_t tid,
+		     struct inode *ip,
+		     struct xtsplit * split, struct btstack * btstack);
+
+static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
+		       struct metapage ** rmpp, s64 * rbnp);
+
+static int xtSplitRoot(tid_t tid, struct inode *ip,
+		       struct xtsplit * split, struct metapage ** rmpp);
+
+#ifdef _STILL_TO_PORT
+static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
+		      xtpage_t * fp, struct btstack * btstack);
+
+static int xtSearchNode(struct inode *ip,
+			xad_t * xad,
+			int *cmpp, struct btstack * btstack, int flag);
+
+static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
+#endif				/*  _STILL_TO_PORT */
+
+/*
+ *	xtLookup()
+ *
+ * function: map a single page into a physical extent;
+ */
+int xtLookup(struct inode *ip, s64 lstart,
+	     s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check)
+{
+	int rc = 0;
+	struct btstack btstack;
+	int cmp;
+	s64 bn;
+	struct metapage *mp;
+	xtpage_t *p;
+	int index;
+	xad_t *xad;
+	s64 next, size, xoff, xend;
+	int xlen;
+	s64 xaddr;
+
+	*paddr = 0;
+	*plen = llen;
+
+	if (!no_check) {
+		/* is lookup offset beyond eof ? */
+		size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
+		    JFS_SBI(ip->i_sb)->l2bsize;
+		if (lstart >= size)
+			return 0;
+	}
+
+	/*
+	 * search for the xad entry covering the logical extent
+	 */
+//search:
+	if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) {
+		jfs_err("xtLookup: xtSearch returned %d", rc);
+		return rc;
+	}
+
+	/*
+	 *	compute the physical extent covering logical extent
+	 *
+	 * N.B. search may have failed (e.g., hole in sparse file),
+	 * and returned the index of the next entry.
+	 */
+	/* retrieve search result */
+	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+	/* is xad found covering start of logical extent ?
+	 * lstart is a page start address,
+	 * i.e., lstart cannot start in a hole;
+	 */
+	if (cmp) {
+		if (next)
+			*plen = min(next - lstart, llen);
+		goto out;
+	}
+
+	/*
+	 * lxd covered by xad
+	 */
+	xad = &p->xad[index];
+	xoff = offsetXAD(xad);
+	xlen = lengthXAD(xad);
+	xend = xoff + xlen;
+	xaddr = addressXAD(xad);
+
+	/* initialize new pxd */
+	*pflag = xad->flag;
+	*paddr = xaddr + (lstart - xoff);
+	/* a page must be fully covered by an xad */
+	*plen = min(xend - lstart, llen);
+
+      out:
+	XT_PUTPAGE(mp);
+
+	return rc;
+}
+
+/*
+ *	xtSearch()
+ *
+ * function:	search for the xad entry covering specified offset.
+ *
+ * parameters:
+ *	ip	- file object;
+ *	xoff	- extent offset;
+ *	nextp	- address of next extent (if any) for search miss
+ *	cmpp	- comparison result:
+ *	btstack - traverse stack;
+ *	flag	- search process flag (XT_INSERT);
+ *
+ * returns:
+ *	btstack contains (bn, index) of search path traversed to the entry.
+ *	*cmpp is set to result of comparison with the entry returned.
+ *	the page containing the entry is pinned at exit.
+ */
+static int xtSearch(struct inode *ip, s64 xoff,	s64 *nextp,
+		    int *cmpp, struct btstack * btstack, int flag)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	int rc = 0;
+	int cmp = 1;		/* init for empty page */
+	s64 bn;			/* block number */
+	struct metapage *mp;	/* page buffer */
+	xtpage_t *p;		/* page */
+	xad_t *xad;
+	int base, index, lim, btindex;
+	struct btframe *btsp;
+	int nsplit = 0;		/* number of pages to split */
+	s64 t64;
+	s64 next = 0;
+
+	INCREMENT(xtStat.search);
+
+	BT_CLR(btstack);
+
+	btstack->nsplit = 0;
+
+	/*
+	 *	search down tree from root:
+	 *
+	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+	 *
+	 * if entry with search key K is not found
+	 * internal page search find the entry with largest key Ki
+	 * less than K which point to the child page to search;
+	 * leaf page search find the entry with smallest key Kj
+	 * greater than K so that the returned index is the position of
+	 * the entry to be shifted right for insertion of new entry.
+	 * for empty tree, search key is greater than any key of the tree.
+	 *
+	 * by convention, root bn = 0.
+	 */
+	for (bn = 0;;) {
+		/* get/pin the page to search */
+		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/* try sequential access heuristics with the previous
+		 * access entry in target leaf page:
+		 * once search narrowed down into the target leaf,
+		 * key must either match an entry in the leaf or
+		 * key entry does not exist in the tree;
+		 */
+//fastSearch:
+		if ((jfs_ip->btorder & BT_SEQUENTIAL) &&
+		    (p->header.flag & BT_LEAF) &&
+		    (index = jfs_ip->btindex) <
+		    le16_to_cpu(p->header.nextindex)) {
+			xad = &p->xad[index];
+			t64 = offsetXAD(xad);
+			if (xoff < t64 + lengthXAD(xad)) {
+				if (xoff >= t64) {
+					*cmpp = 0;
+					goto out;
+				}
+
+				/* stop sequential access heuristics */
+				goto binarySearch;
+			} else {	/* (t64 + lengthXAD(xad)) <= xoff */
+
+				/* try next sequential entry */
+				index++;
+				if (index <
+				    le16_to_cpu(p->header.nextindex)) {
+					xad++;
+					t64 = offsetXAD(xad);
+					if (xoff < t64 + lengthXAD(xad)) {
+						if (xoff >= t64) {
+							*cmpp = 0;
+							goto out;
+						}
+
+						/* miss: key falls between
+						 * previous and this entry
+						 */
+						*cmpp = 1;
+						next = t64;
+						goto out;
+					}
+
+					/* (xoff >= t64 + lengthXAD(xad));
+					 * matching entry may be further out:
+					 * stop heuristic search
+					 */
+					/* stop sequential access heuristics */
+					goto binarySearch;
+				}
+
+				/* (index == p->header.nextindex);
+				 * miss: key entry does not exist in
+				 * the target leaf/tree
+				 */
+				*cmpp = 1;
+				goto out;
+			}
+
+			/*
+			 * if hit, return index of the entry found, and
+			 * if miss, where new entry with search key is
+			 * to be inserted;
+			 */
+		      out:
+			/* compute number of pages to split */
+			if (flag & XT_INSERT) {
+				if (p->header.nextindex ==	/* little-endian */
+				    p->header.maxentry)
+					nsplit++;
+				else
+					nsplit = 0;
+				btstack->nsplit = nsplit;
+			}
+
+			/* save search result */
+			btsp = btstack->top;
+			btsp->bn = bn;
+			btsp->index = index;
+			btsp->mp = mp;
+
+			/* update sequential access heuristics */
+			jfs_ip->btindex = index;
+
+			if (nextp)
+				*nextp = next;
+
+			INCREMENT(xtStat.fastSearch);
+			return 0;
+		}
+
+		/* well, ... full search now */
+	      binarySearch:
+		lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
+
+		/*
+		 * binary search with search key K on the current page
+		 */
+		for (base = XTENTRYSTART; lim; lim >>= 1) {
+			index = base + (lim >> 1);
+
+			XT_CMP(cmp, xoff, &p->xad[index], t64);
+			if (cmp == 0) {
+				/*
+				 *	search hit
+				 */
+				/* search hit - leaf page:
+				 * return the entry found
+				 */
+				if (p->header.flag & BT_LEAF) {
+					*cmpp = cmp;
+
+					/* compute number of pages to split */
+					if (flag & XT_INSERT) {
+						if (p->header.nextindex ==
+						    p->header.maxentry)
+							nsplit++;
+						else
+							nsplit = 0;
+						btstack->nsplit = nsplit;
+					}
+
+					/* save search result */
+					btsp = btstack->top;
+					btsp->bn = bn;
+					btsp->index = index;
+					btsp->mp = mp;
+
+					/* init sequential access heuristics */
+					btindex = jfs_ip->btindex;
+					if (index == btindex ||
+					    index == btindex + 1)
+						jfs_ip->btorder = BT_SEQUENTIAL;
+					else
+						jfs_ip->btorder = BT_RANDOM;
+					jfs_ip->btindex = index;
+
+					return 0;
+				}
+				/* search hit - internal page:
+				 * descend/search its child page
+				 */
+				if (index < le16_to_cpu(p->header.nextindex)-1)
+					next = offsetXAD(&p->xad[index + 1]);
+				goto next;
+			}
+
+			if (cmp > 0) {
+				base = index + 1;
+				--lim;
+			}
+		}
+
+		/*
+		 *	search miss
+		 *
+		 * base is the smallest index with key (Kj) greater than
+		 * search key (K) and may be zero or maxentry index.
+		 */
+		if (base < le16_to_cpu(p->header.nextindex))
+			next = offsetXAD(&p->xad[base]);
+		/*
+		 * search miss - leaf page:
+		 *
+		 * return location of entry (base) where new entry with
+		 * search key K is to be inserted.
+		 */
+		if (p->header.flag & BT_LEAF) {
+			*cmpp = cmp;
+
+			/* compute number of pages to split */
+			if (flag & XT_INSERT) {
+				if (p->header.nextindex ==
+				    p->header.maxentry)
+					nsplit++;
+				else
+					nsplit = 0;
+				btstack->nsplit = nsplit;
+			}
+
+			/* save search result */
+			btsp = btstack->top;
+			btsp->bn = bn;
+			btsp->index = base;
+			btsp->mp = mp;
+
+			/* init sequential access heuristics */
+			btindex = jfs_ip->btindex;
+			if (base == btindex || base == btindex + 1)
+				jfs_ip->btorder = BT_SEQUENTIAL;
+			else
+				jfs_ip->btorder = BT_RANDOM;
+			jfs_ip->btindex = base;
+
+			if (nextp)
+				*nextp = next;
+
+			return 0;
+		}
+
+		/*
+		 * search miss - non-leaf page:
+		 *
+		 * if base is non-zero, decrement base by one to get the parent
+		 * entry of the child page to search.
+		 */
+		index = base ? base - 1 : base;
+
+		/*
+		 * go down to child page
+		 */
+	      next:
+		/* update number of pages to split */
+		if (p->header.nextindex == p->header.maxentry)
+			nsplit++;
+		else
+			nsplit = 0;
+
+		/* push (bn, index) of the parent page/entry */
+		if (BT_STACK_FULL(btstack)) {
+			jfs_error(ip->i_sb, "stack overrun!\n");
+			XT_PUTPAGE(mp);
+			return -EIO;
+		}
+		BT_PUSH(btstack, bn, index);
+
+		/* get the child page block number */
+		bn = addressXAD(&p->xad[index]);
+
+		/* unpin the parent page */
+		XT_PUTPAGE(mp);
+	}
+}
+
+/*
+ *	xtInsert()
+ *
+ * function:
+ *
+ * parameter:
+ *	tid	- transaction id;
+ *	ip	- file object;
+ *	xflag	- extent flag (XAD_NOTRECORDED):
+ *	xoff	- extent offset;
+ *	xlen	- extent length;
+ *	xaddrp	- extent address pointer (in/out):
+ *		if (*xaddrp)
+ *			caller allocated data extent at *xaddrp;
+ *		else
+ *			allocate data extent and return its xaddr;
+ *	flag	-
+ *
+ * return:
+ */
+int xtInsert(tid_t tid,		/* transaction id */
+	     struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp,
+	     int flag)
+{
+	int rc = 0;
+	s64 xaddr, hint;
+	struct metapage *mp;	/* meta-page buffer */
+	xtpage_t *p;		/* base B+-tree index page */
+	s64 bn;
+	int index, nextindex;
+	struct btstack btstack;	/* traverse stack */
+	struct xtsplit split;	/* split information */
+	xad_t *xad;
+	int cmp;
+	s64 next;
+	struct tlock *tlck;
+	struct xtlock *xtlck;
+
+	jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
+
+	/*
+	 *	search for the entry location at which to insert:
+	 *
+	 * xtFastSearch() and xtSearch() both returns (leaf page
+	 * pinned, index at which to insert).
+	 * n.b. xtSearch() may return index of maxentry of
+	 * the full page.
+	 */
+	if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT)))
+		return rc;
+
+	/* retrieve search result */
+	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+	/* This test must follow XT_GETSEARCH since mp must be valid if
+	 * we branch to out: */
+	if ((cmp == 0) || (next && (xlen > next - xoff))) {
+		rc = -EEXIST;
+		goto out;
+	}
+
+	/*
+	 * allocate data extent requested
+	 *
+	 * allocation hint: last xad
+	 */
+	if ((xaddr = *xaddrp) == 0) {
+		if (index > XTENTRYSTART) {
+			xad = &p->xad[index - 1];
+			hint = addressXAD(xad) + lengthXAD(xad) - 1;
+		} else
+			hint = 0;
+		if ((rc = dquot_alloc_block(ip, xlen)))
+			goto out;
+		if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
+			dquot_free_block(ip, xlen);
+			goto out;
+		}
+	}
+
+	/*
+	 *	insert entry for new extent
+	 */
+	xflag |= XAD_NEW;
+
+	/*
+	 *	if the leaf page is full, split the page and
+	 *	propagate up the router entry for the new page from split
+	 *
+	 * The xtSplitUp() will insert the entry and unpin the leaf page.
+	 */
+	nextindex = le16_to_cpu(p->header.nextindex);
+	if (nextindex == le16_to_cpu(p->header.maxentry)) {
+		split.mp = mp;
+		split.index = index;
+		split.flag = xflag;
+		split.off = xoff;
+		split.len = xlen;
+		split.addr = xaddr;
+		split.pxdlist = NULL;
+		if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
+			/* undo data extent allocation */
+			if (*xaddrp == 0) {
+				dbFree(ip, xaddr, (s64) xlen);
+				dquot_free_block(ip, xlen);
+			}
+			return rc;
+		}
+
+		*xaddrp = xaddr;
+		return 0;
+	}
+
+	/*
+	 *	insert the new entry into the leaf page
+	 */
+	/*
+	 * acquire a transaction lock on the leaf page;
+	 *
+	 * action: xad insertion/extension;
+	 */
+	BT_MARK_DIRTY(mp, ip);
+
+	/* if insert into middle, shift right remaining entries. */
+	if (index < nextindex)
+		memmove(&p->xad[index + 1], &p->xad[index],
+			(nextindex - index) * sizeof(xad_t));
+
+	/* insert the new entry: mark the entry NEW */
+	xad = &p->xad[index];
+	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
+
+	/* advance next available entry index */
+	le16_add_cpu(&p->header.nextindex, 1);
+
+	/* Don't log it if there are no links to the file */
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+		xtlck = (struct xtlock *) & tlck->lock;
+		xtlck->lwm.offset =
+		    (xtlck->lwm.offset) ? min(index,
+					      (int)xtlck->lwm.offset) : index;
+		xtlck->lwm.length =
+		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
+	}
+
+	*xaddrp = xaddr;
+
+      out:
+	/* unpin the leaf page */
+	XT_PUTPAGE(mp);
+
+	return rc;
+}
+
+
+/*
+ *	xtSplitUp()
+ *
+ * function:
+ *	split full pages as propagating insertion up the tree
+ *
+ * parameter:
+ *	tid	- transaction id;
+ *	ip	- file object;
+ *	split	- entry parameter descriptor;
+ *	btstack - traverse stack from xtSearch()
+ *
+ * return:
+ */
+static int
+xtSplitUp(tid_t tid,
+	  struct inode *ip, struct xtsplit * split, struct btstack * btstack)
+{
+	int rc = 0;
+	struct metapage *smp;
+	xtpage_t *sp;		/* split page */
+	struct metapage *rmp;
+	s64 rbn;		/* new right page block number */
+	struct metapage *rcmp;
+	xtpage_t *rcp;		/* right child page */
+	s64 rcbn;		/* right child page block number */
+	int skip;		/* index of entry of insertion */
+	int nextindex;		/* next available entry index of p */
+	struct btframe *parent;	/* parent page entry on traverse stack */
+	xad_t *xad;
+	s64 xaddr;
+	int xlen;
+	int nsplit;		/* number of pages split */
+	struct pxdlist pxdlist;
+	pxd_t *pxd;
+	struct tlock *tlck;
+	struct xtlock *xtlck;
+
+	smp = split->mp;
+	sp = XT_PAGE(ip, smp);
+
+	/* is inode xtree root extension/inline EA area free ? */
+	if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) &&
+	    (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) &&
+	    (JFS_IP(ip)->mode2 & INLINEEA)) {
+		sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT);
+		JFS_IP(ip)->mode2 &= ~INLINEEA;
+
+		BT_MARK_DIRTY(smp, ip);
+		/*
+		 * acquire a transaction lock on the leaf page;
+		 *
+		 * action: xad insertion/extension;
+		 */
+
+		/* if insert into middle, shift right remaining entries. */
+		skip = split->index;
+		nextindex = le16_to_cpu(sp->header.nextindex);
+		if (skip < nextindex)
+			memmove(&sp->xad[skip + 1], &sp->xad[skip],
+				(nextindex - skip) * sizeof(xad_t));
+
+		/* insert the new entry: mark the entry NEW */
+		xad = &sp->xad[skip];
+		XT_PUTENTRY(xad, split->flag, split->off, split->len,
+			    split->addr);
+
+		/* advance next available entry index */
+		le16_add_cpu(&sp->header.nextindex, 1);
+
+		/* Don't log it if there are no links to the file */
+		if (!test_cflag(COMMIT_Nolink, ip)) {
+			tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
+			xtlck = (struct xtlock *) & tlck->lock;
+			xtlck->lwm.offset = (xtlck->lwm.offset) ?
+			    min(skip, (int)xtlck->lwm.offset) : skip;
+			xtlck->lwm.length =
+			    le16_to_cpu(sp->header.nextindex) -
+			    xtlck->lwm.offset;
+		}
+
+		return 0;
+	}
+
+	/*
+	 * allocate new index blocks to cover index page split(s)
+	 *
+	 * allocation hint: ?
+	 */
+	if (split->pxdlist == NULL) {
+		nsplit = btstack->nsplit;
+		split->pxdlist = &pxdlist;
+		pxdlist.maxnpxd = pxdlist.npxd = 0;
+		pxd = &pxdlist.pxd[0];
+		xlen = JFS_SBI(ip->i_sb)->nbperpage;
+		for (; nsplit > 0; nsplit--, pxd++) {
+			if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr))
+			    == 0) {
+				PXDaddress(pxd, xaddr);
+				PXDlength(pxd, xlen);
+
+				pxdlist.maxnpxd++;
+
+				continue;
+			}
+
+			/* undo allocation */
+
+			XT_PUTPAGE(smp);
+			return rc;
+		}
+	}
+
+	/*
+	 * Split leaf page <sp> into <sp> and a new right page <rp>.
+	 *
+	 * The split routines insert the new entry into the leaf page,
+	 * and acquire txLock as appropriate.
+	 * return <rp> pinned and its block number <rpbn>.
+	 */
+	rc = (sp->header.flag & BT_ROOT) ?
+	    xtSplitRoot(tid, ip, split, &rmp) :
+	    xtSplitPage(tid, ip, split, &rmp, &rbn);
+
+	XT_PUTPAGE(smp);
+
+	if (rc)
+		return -EIO;
+	/*
+	 * propagate up the router entry for the leaf page just split
+	 *
+	 * insert a router entry for the new page into the parent page,
+	 * propagate the insert/split up the tree by walking back the stack
+	 * of (bn of parent page, index of child page entry in parent page)
+	 * that were traversed during the search for the page that split.
+	 *
+	 * the propagation of insert/split up the tree stops if the root
+	 * splits or the page inserted into doesn't have to split to hold
+	 * the new entry.
+	 *
+	 * the parent entry for the split page remains the same, and
+	 * a new entry is inserted at its right with the first key and
+	 * block number of the new right page.
+	 *
+	 * There are a maximum of 3 pages pinned at any time:
+	 * right child, left parent and right parent (when the parent splits)
+	 * to keep the child page pinned while working on the parent.
+	 * make sure that all pins are released at exit.
+	 */
+	while ((parent = BT_POP(btstack)) != NULL) {
+		/* parent page specified by stack frame <parent> */
+
+		/* keep current child pages <rcp> pinned */
+		rcmp = rmp;
+		rcbn = rbn;
+		rcp = XT_PAGE(ip, rcmp);
+
+		/*
+		 * insert router entry in parent for new right child page <rp>
+		 */
+		/* get/pin the parent page <sp> */
+		XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
+		if (rc) {
+			XT_PUTPAGE(rcmp);
+			return rc;
+		}
+
+		/*
+		 * The new key entry goes ONE AFTER the index of parent entry,
+		 * because the split was to the right.
+		 */
+		skip = parent->index + 1;
+
+		/*
+		 * split or shift right remaining entries of the parent page
+		 */
+		nextindex = le16_to_cpu(sp->header.nextindex);
+		/*
+		 * parent page is full - split the parent page
+		 */
+		if (nextindex == le16_to_cpu(sp->header.maxentry)) {
+			/* init for parent page split */
+			split->mp = smp;
+			split->index = skip;	/* index at insert */
+			split->flag = XAD_NEW;
+			split->off = offsetXAD(&rcp->xad[XTENTRYSTART]);
+			split->len = JFS_SBI(ip->i_sb)->nbperpage;
+			split->addr = rcbn;
+
+			/* unpin previous right child page */
+			XT_PUTPAGE(rcmp);
+
+			/* The split routines insert the new entry,
+			 * and acquire txLock as appropriate.
+			 * return <rp> pinned and its block number <rpbn>.
+			 */
+			rc = (sp->header.flag & BT_ROOT) ?
+			    xtSplitRoot(tid, ip, split, &rmp) :
+			    xtSplitPage(tid, ip, split, &rmp, &rbn);
+			if (rc) {
+				XT_PUTPAGE(smp);
+				return rc;
+			}
+
+			XT_PUTPAGE(smp);
+			/* keep new child page <rp> pinned */
+		}
+		/*
+		 * parent page is not full - insert in parent page
+		 */
+		else {
+			/*
+			 * insert router entry in parent for the right child
+			 * page from the first entry of the right child page:
+			 */
+			/*
+			 * acquire a transaction lock on the parent page;
+			 *
+			 * action: router xad insertion;
+			 */
+			BT_MARK_DIRTY(smp, ip);
+
+			/*
+			 * if insert into middle, shift right remaining entries
+			 */
+			if (skip < nextindex)
+				memmove(&sp->xad[skip + 1], &sp->xad[skip],
+					(nextindex -
+					 skip) << L2XTSLOTSIZE);
+
+			/* insert the router entry */
+			xad = &sp->xad[skip];
+			XT_PUTENTRY(xad, XAD_NEW,
+				    offsetXAD(&rcp->xad[XTENTRYSTART]),
+				    JFS_SBI(ip->i_sb)->nbperpage, rcbn);
+
+			/* advance next available entry index. */
+			le16_add_cpu(&sp->header.nextindex, 1);
+
+			/* Don't log it if there are no links to the file */
+			if (!test_cflag(COMMIT_Nolink, ip)) {
+				tlck = txLock(tid, ip, smp,
+					      tlckXTREE | tlckGROW);
+				xtlck = (struct xtlock *) & tlck->lock;
+				xtlck->lwm.offset = (xtlck->lwm.offset) ?
+				    min(skip, (int)xtlck->lwm.offset) : skip;
+				xtlck->lwm.length =
+				    le16_to_cpu(sp->header.nextindex) -
+				    xtlck->lwm.offset;
+			}
+
+			/* unpin parent page */
+			XT_PUTPAGE(smp);
+
+			/* exit propagate up */
+			break;
+		}
+	}
+
+	/* unpin current right page */
+	XT_PUTPAGE(rmp);
+
+	return 0;
+}
+
+
+/*
+ *	xtSplitPage()
+ *
+ * function:
+ *	split a full non-root page into
+ *	original/split/left page and new right page
+ *	i.e., the original/split page remains as left page.
+ *
+ * parameter:
+ *	int		tid,
+ *	struct inode	*ip,
+ *	struct xtsplit	*split,
+ *	struct metapage	**rmpp,
+ *	u64		*rbnp,
+ *
+ * return:
+ *	Pointer to page in which to insert or NULL on error.
+ */
+static int
+xtSplitPage(tid_t tid, struct inode *ip,
+	    struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp)
+{
+	int rc = 0;
+	struct metapage *smp;
+	xtpage_t *sp;
+	struct metapage *rmp;
+	xtpage_t *rp;		/* new right page allocated */
+	s64 rbn;		/* new right page block number */
+	struct metapage *mp;
+	xtpage_t *p;
+	s64 nextbn;
+	int skip, maxentry, middle, righthalf, n;
+	xad_t *xad;
+	struct pxdlist *pxdlist;
+	pxd_t *pxd;
+	struct tlock *tlck;
+	struct xtlock *sxtlck = NULL, *rxtlck = NULL;
+	int quota_allocation = 0;
+
+	smp = split->mp;
+	sp = XT_PAGE(ip, smp);
+
+	INCREMENT(xtStat.split);
+
+	pxdlist = split->pxdlist;
+	pxd = &pxdlist->pxd[pxdlist->npxd];
+	pxdlist->npxd++;
+	rbn = addressPXD(pxd);
+
+	/* Allocate blocks to quota. */
+	rc = dquot_alloc_block(ip, lengthPXD(pxd));
+	if (rc)
+		goto clean_up;
+
+	quota_allocation += lengthPXD(pxd);
+
+	/*
+	 * allocate the new right page for the split
+	 */
+	rmp = get_metapage(ip, rbn, PSIZE, 1);
+	if (rmp == NULL) {
+		rc = -EIO;
+		goto clean_up;
+	}
+
+	jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
+
+	BT_MARK_DIRTY(rmp, ip);
+	/*
+	 * action: new page;
+	 */
+
+	rp = (xtpage_t *) rmp->data;
+	rp->header.self = *pxd;
+	rp->header.flag = sp->header.flag & BT_TYPE;
+	rp->header.maxentry = sp->header.maxentry;	/* little-endian */
+	rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+
+	BT_MARK_DIRTY(smp, ip);
+	/* Don't log it if there are no links to the file */
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		/*
+		 * acquire a transaction lock on the new right page;
+		 */
+		tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
+		rxtlck = (struct xtlock *) & tlck->lock;
+		rxtlck->lwm.offset = XTENTRYSTART;
+		/*
+		 * acquire a transaction lock on the split page
+		 */
+		tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
+		sxtlck = (struct xtlock *) & tlck->lock;
+	}
+
+	/*
+	 * initialize/update sibling pointers of <sp> and <rp>
+	 */
+	nextbn = le64_to_cpu(sp->header.next);
+	rp->header.next = cpu_to_le64(nextbn);
+	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
+	sp->header.next = cpu_to_le64(rbn);
+
+	skip = split->index;
+
+	/*
+	 *	sequential append at tail (after last entry of last page)
+	 *
+	 * if splitting the last page on a level because of appending
+	 * a entry to it (skip is maxentry), it's likely that the access is
+	 * sequential. adding an empty page on the side of the level is less
+	 * work and can push the fill factor much higher than normal.
+	 * if we're wrong it's no big deal -  we will do the split the right
+	 * way next time.
+	 * (it may look like it's equally easy to do a similar hack for
+	 * reverse sorted data, that is, split the tree left, but it's not.
+	 * Be my guest.)
+	 */
+	if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) {
+		/*
+		 * acquire a transaction lock on the new/right page;
+		 *
+		 * action: xad insertion;
+		 */
+		/* insert entry at the first entry of the new right page */
+		xad = &rp->xad[XTENTRYSTART];
+		XT_PUTENTRY(xad, split->flag, split->off, split->len,
+			    split->addr);
+
+		rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
+
+		if (!test_cflag(COMMIT_Nolink, ip)) {
+			/* rxtlck->lwm.offset = XTENTRYSTART; */
+			rxtlck->lwm.length = 1;
+		}
+
+		*rmpp = rmp;
+		*rbnp = rbn;
+
+		jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
+		return 0;
+	}
+
+	/*
+	 *	non-sequential insert (at possibly middle page)
+	 */
+
+	/*
+	 * update previous pointer of old next/right page of <sp>
+	 */
+	if (nextbn != 0) {
+		XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+		if (rc) {
+			XT_PUTPAGE(rmp);
+			goto clean_up;
+		}
+
+		BT_MARK_DIRTY(mp, ip);
+		/*
+		 * acquire a transaction lock on the next page;
+		 *
+		 * action:sibling pointer update;
+		 */
+		if (!test_cflag(COMMIT_Nolink, ip))
+			tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+
+		p->header.prev = cpu_to_le64(rbn);
+
+		/* sibling page may have been updated previously, or
+		 * it may be updated later;
+		 */
+
+		XT_PUTPAGE(mp);
+	}
+
+	/*
+	 * split the data between the split and new/right pages
+	 */
+	maxentry = le16_to_cpu(sp->header.maxentry);
+	middle = maxentry >> 1;
+	righthalf = maxentry - middle;
+
+	/*
+	 * skip index in old split/left page - insert into left page:
+	 */
+	if (skip <= middle) {
+		/* move right half of split page to the new right page */
+		memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
+			righthalf << L2XTSLOTSIZE);
+
+		/* shift right tail of left half to make room for new entry */
+		if (skip < middle)
+			memmove(&sp->xad[skip + 1], &sp->xad[skip],
+				(middle - skip) << L2XTSLOTSIZE);
+
+		/* insert new entry */
+		xad = &sp->xad[skip];
+		XT_PUTENTRY(xad, split->flag, split->off, split->len,
+			    split->addr);
+
+		/* update page header */
+		sp->header.nextindex = cpu_to_le16(middle + 1);
+		if (!test_cflag(COMMIT_Nolink, ip)) {
+			sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
+			    min(skip, (int)sxtlck->lwm.offset) : skip;
+		}
+
+		rp->header.nextindex =
+		    cpu_to_le16(XTENTRYSTART + righthalf);
+	}
+	/*
+	 * skip index in new right page - insert into right page:
+	 */
+	else {
+		/* move left head of right half to right page */
+		n = skip - middle;
+		memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
+			n << L2XTSLOTSIZE);
+
+		/* insert new entry */
+		n += XTENTRYSTART;
+		xad = &rp->xad[n];
+		XT_PUTENTRY(xad, split->flag, split->off, split->len,
+			    split->addr);
+
+		/* move right tail of right half to right page */
+		if (skip < maxentry)
+			memmove(&rp->xad[n + 1], &sp->xad[skip],
+				(maxentry - skip) << L2XTSLOTSIZE);
+
+		/* update page header */
+		sp->header.nextindex = cpu_to_le16(middle);
+		if (!test_cflag(COMMIT_Nolink, ip)) {
+			sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
+			    min(middle, (int)sxtlck->lwm.offset) : middle;
+		}
+
+		rp->header.nextindex = cpu_to_le16(XTENTRYSTART +
+						   righthalf + 1);
+	}
+
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) -
+		    sxtlck->lwm.offset;
+
+		/* rxtlck->lwm.offset = XTENTRYSTART; */
+		rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
+		    XTENTRYSTART;
+	}
+
+	*rmpp = rmp;
+	*rbnp = rbn;
+
+	jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
+	return rc;
+
+      clean_up:
+
+	/* Rollback quota allocation. */
+	if (quota_allocation)
+		dquot_free_block(ip, quota_allocation);
+
+	return (rc);
+}
+
+
+/*
+ *	xtSplitRoot()
+ *
+ * function:
+ *	split the full root page into original/root/split page and new
+ *	right page
+ *	i.e., root remains fixed in tree anchor (inode) and the root is
+ *	copied to a single new right child page since root page <<
+ *	non-root page, and the split root page contains a single entry
+ *	for the new right child page.
+ *
+ * parameter:
+ *	int		tid,
+ *	struct inode	*ip,
+ *	struct xtsplit	*split,
+ *	struct metapage	**rmpp)
+ *
+ * return:
+ *	Pointer to page in which to insert or NULL on error.
+ */
+static int
+xtSplitRoot(tid_t tid,
+	    struct inode *ip, struct xtsplit * split, struct metapage ** rmpp)
+{
+	xtpage_t *sp;
+	struct metapage *rmp;
+	xtpage_t *rp;
+	s64 rbn;
+	int skip, nextindex;
+	xad_t *xad;
+	pxd_t *pxd;
+	struct pxdlist *pxdlist;
+	struct tlock *tlck;
+	struct xtlock *xtlck;
+	int rc;
+
+	sp = &JFS_IP(ip)->i_xtroot;
+
+	INCREMENT(xtStat.split);
+
+	/*
+	 *	allocate a single (right) child page
+	 */
+	pxdlist = split->pxdlist;
+	pxd = &pxdlist->pxd[pxdlist->npxd];
+	pxdlist->npxd++;
+	rbn = addressPXD(pxd);
+	rmp = get_metapage(ip, rbn, PSIZE, 1);
+	if (rmp == NULL)
+		return -EIO;
+
+	/* Allocate blocks to quota. */
+	rc = dquot_alloc_block(ip, lengthPXD(pxd));
+	if (rc) {
+		release_metapage(rmp);
+		return rc;
+	}
+
+	jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
+
+	/*
+	 * acquire a transaction lock on the new right page;
+	 *
+	 * action: new page;
+	 */
+	BT_MARK_DIRTY(rmp, ip);
+
+	rp = (xtpage_t *) rmp->data;
+	rp->header.flag =
+	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
+	rp->header.self = *pxd;
+	rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+	rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE);
+
+	/* initialize sibling pointers */
+	rp->header.next = 0;
+	rp->header.prev = 0;
+
+	/*
+	 * copy the in-line root page into new right page extent
+	 */
+	nextindex = le16_to_cpu(sp->header.maxentry);
+	memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART],
+		(nextindex - XTENTRYSTART) << L2XTSLOTSIZE);
+
+	/*
+	 * insert the new entry into the new right/child page
+	 * (skip index in the new right page will not change)
+	 */
+	skip = split->index;
+	/* if insert into middle, shift right remaining entries */
+	if (skip != nextindex)
+		memmove(&rp->xad[skip + 1], &rp->xad[skip],
+			(nextindex - skip) * sizeof(xad_t));
+
+	xad = &rp->xad[skip];
+	XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr);
+
+	/* update page header */
+	rp->header.nextindex = cpu_to_le16(nextindex + 1);
+
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
+		xtlck = (struct xtlock *) & tlck->lock;
+		xtlck->lwm.offset = XTENTRYSTART;
+		xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
+		    XTENTRYSTART;
+	}
+
+	/*
+	 *	reset the root
+	 *
+	 * init root with the single entry for the new right page
+	 * set the 1st entry offset to 0, which force the left-most key
+	 * at any level of the tree to be less than any search key.
+	 */
+	/*
+	 * acquire a transaction lock on the root page (in-memory inode);
+	 *
+	 * action: root split;
+	 */
+	BT_MARK_DIRTY(split->mp, ip);
+
+	xad = &sp->xad[XTENTRYSTART];
+	XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn);
+
+	/* update page header of root */
+	sp->header.flag &= ~BT_LEAF;
+	sp->header.flag |= BT_INTERNAL;
+
+	sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
+
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW);
+		xtlck = (struct xtlock *) & tlck->lock;
+		xtlck->lwm.offset = XTENTRYSTART;
+		xtlck->lwm.length = 1;
+	}
+
+	*rmpp = rmp;
+
+	jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp);
+	return 0;
+}
+
+
+/*
+ *	xtExtend()
+ *
+ * function: extend in-place;
+ *
+ * note: existing extent may or may not have been committed.
+ * caller is responsible for pager buffer cache update, and
+ * working block allocation map update;
+ * update pmap: alloc whole extended extent;
+ */
+int xtExtend(tid_t tid,		/* transaction id */
+	     struct inode *ip, s64 xoff,	/* delta extent offset */
+	     s32 xlen,		/* delta extent length */
+	     int flag)
+{
+	int rc = 0;
+	int cmp;
+	struct metapage *mp;	/* meta-page buffer */
+	xtpage_t *p;		/* base B+-tree index page */
+	s64 bn;
+	int index, nextindex, len;
+	struct btstack btstack;	/* traverse stack */
+	struct xtsplit split;	/* split information */
+	xad_t *xad;
+	s64 xaddr;
+	struct tlock *tlck;
+	struct xtlock *xtlck = NULL;
+
+	jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
+
+	/* there must exist extent to be extended */
+	if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT)))
+		return rc;
+
+	/* retrieve search result */
+	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+	if (cmp != 0) {
+		XT_PUTPAGE(mp);
+		jfs_error(ip->i_sb, "xtSearch did not find extent\n");
+		return -EIO;
+	}
+
+	/* extension must be contiguous */
+	xad = &p->xad[index];
+	if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
+		XT_PUTPAGE(mp);
+		jfs_error(ip->i_sb, "extension is not contiguous\n");
+		return -EIO;
+	}
+
+	/*
+	 * acquire a transaction lock on the leaf page;
+	 *
+	 * action: xad insertion/extension;
+	 */
+	BT_MARK_DIRTY(mp, ip);
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+		xtlck = (struct xtlock *) & tlck->lock;
+	}
+
+	/* extend will overflow extent ? */
+	xlen = lengthXAD(xad) + xlen;
+	if ((len = xlen - MAXXLEN) <= 0)
+		goto extendOld;
+
+	/*
+	 *	extent overflow: insert entry for new extent
+	 */
+//insertNew:
+	xoff = offsetXAD(xad) + MAXXLEN;
+	xaddr = addressXAD(xad) + MAXXLEN;
+	nextindex = le16_to_cpu(p->header.nextindex);
+
+	/*
+	 *	if the leaf page is full, insert the new entry and
+	 *	propagate up the router entry for the new page from split
+	 *
+	 * The xtSplitUp() will insert the entry and unpin the leaf page.
+	 */
+	if (nextindex == le16_to_cpu(p->header.maxentry)) {
+		/* xtSpliUp() unpins leaf pages */
+		split.mp = mp;
+		split.index = index + 1;
+		split.flag = XAD_NEW;
+		split.off = xoff;	/* split offset */
+		split.len = len;
+		split.addr = xaddr;
+		split.pxdlist = NULL;
+		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+			return rc;
+
+		/* get back old page */
+		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+		/*
+		 * if leaf root has been split, original root has been
+		 * copied to new child page, i.e., original entry now
+		 * resides on the new child page;
+		 */
+		if (p->header.flag & BT_INTERNAL) {
+			ASSERT(p->header.nextindex ==
+			       cpu_to_le16(XTENTRYSTART + 1));
+			xad = &p->xad[XTENTRYSTART];
+			bn = addressXAD(xad);
+			XT_PUTPAGE(mp);
+
+			/* get new child page */
+			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+			if (rc)
+				return rc;
+
+			BT_MARK_DIRTY(mp, ip);
+			if (!test_cflag(COMMIT_Nolink, ip)) {
+				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+				xtlck = (struct xtlock *) & tlck->lock;
+			}
+		}
+	}
+	/*
+	 *	insert the new entry into the leaf page
+	 */
+	else {
+		/* insert the new entry: mark the entry NEW */
+		xad = &p->xad[index + 1];
+		XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
+
+		/* advance next available entry index */
+		le16_add_cpu(&p->header.nextindex, 1);
+	}
+
+	/* get back old entry */
+	xad = &p->xad[index];
+	xlen = MAXXLEN;
+
+	/*
+	 * extend old extent
+	 */
+      extendOld:
+	XADlength(xad, xlen);
+	if (!(xad->flag & XAD_NEW))
+		xad->flag |= XAD_EXTENDED;
+
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		xtlck->lwm.offset =
+		    (xtlck->lwm.offset) ? min(index,
+					      (int)xtlck->lwm.offset) : index;
+		xtlck->lwm.length =
+		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
+	}
+
+	/* unpin the leaf page */
+	XT_PUTPAGE(mp);
+
+	return rc;
+}
+
+#ifdef _NOTYET
+/*
+ *	xtTailgate()
+ *
+ * function: split existing 'tail' extent
+ *	(split offset >= start offset of tail extent), and
+ *	relocate and extend the split tail half;
+ *
+ * note: existing extent may or may not have been committed.
+ * caller is responsible for pager buffer cache update, and
+ * working block allocation map update;
+ * update pmap: free old split tail extent, alloc new extent;
+ */
+int xtTailgate(tid_t tid,		/* transaction id */
+	       struct inode *ip, s64 xoff,	/* split/new extent offset */
+	       s32 xlen,	/* new extent length */
+	       s64 xaddr,	/* new extent address */
+	       int flag)
+{
+	int rc = 0;
+	int cmp;
+	struct metapage *mp;	/* meta-page buffer */
+	xtpage_t *p;		/* base B+-tree index page */
+	s64 bn;
+	int index, nextindex, llen, rlen;
+	struct btstack btstack;	/* traverse stack */
+	struct xtsplit split;	/* split information */
+	xad_t *xad;
+	struct tlock *tlck;
+	struct xtlock *xtlck = 0;
+	struct tlock *mtlck;
+	struct maplock *pxdlock;
+
+/*
+printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
+	(ulong)xoff, xlen, (ulong)xaddr);
+*/
+
+	/* there must exist extent to be tailgated */
+	if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT)))
+		return rc;
+
+	/* retrieve search result */
+	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+	if (cmp != 0) {
+		XT_PUTPAGE(mp);
+		jfs_error(ip->i_sb, "couldn't find extent\n");
+		return -EIO;
+	}
+
+	/* entry found must be last entry */
+	nextindex = le16_to_cpu(p->header.nextindex);
+	if (index != nextindex - 1) {
+		XT_PUTPAGE(mp);
+		jfs_error(ip->i_sb, "the entry found is not the last entry\n");
+		return -EIO;
+	}
+
+	BT_MARK_DIRTY(mp, ip);
+	/*
+	 * acquire tlock of the leaf page containing original entry
+	 */
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+		xtlck = (struct xtlock *) & tlck->lock;
+	}
+
+	/* completely replace extent ? */
+	xad = &p->xad[index];
+/*
+printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
+	(ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
+*/
+	if ((llen = xoff - offsetXAD(xad)) == 0)
+		goto updateOld;
+
+	/*
+	 *	partially replace extent: insert entry for new extent
+	 */
+//insertNew:
+	/*
+	 *	if the leaf page is full, insert the new entry and
+	 *	propagate up the router entry for the new page from split
+	 *
+	 * The xtSplitUp() will insert the entry and unpin the leaf page.
+	 */
+	if (nextindex == le16_to_cpu(p->header.maxentry)) {
+		/* xtSpliUp() unpins leaf pages */
+		split.mp = mp;
+		split.index = index + 1;
+		split.flag = XAD_NEW;
+		split.off = xoff;	/* split offset */
+		split.len = xlen;
+		split.addr = xaddr;
+		split.pxdlist = NULL;
+		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+			return rc;
+
+		/* get back old page */
+		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+		/*
+		 * if leaf root has been split, original root has been
+		 * copied to new child page, i.e., original entry now
+		 * resides on the new child page;
+		 */
+		if (p->header.flag & BT_INTERNAL) {
+			ASSERT(p->header.nextindex ==
+			       cpu_to_le16(XTENTRYSTART + 1));
+			xad = &p->xad[XTENTRYSTART];
+			bn = addressXAD(xad);
+			XT_PUTPAGE(mp);
+
+			/* get new child page */
+			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+			if (rc)
+				return rc;
+
+			BT_MARK_DIRTY(mp, ip);
+			if (!test_cflag(COMMIT_Nolink, ip)) {
+				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+				xtlck = (struct xtlock *) & tlck->lock;
+			}
+		}
+	}
+	/*
+	 *	insert the new entry into the leaf page
+	 */
+	else {
+		/* insert the new entry: mark the entry NEW */
+		xad = &p->xad[index + 1];
+		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
+
+		/* advance next available entry index */
+		le16_add_cpu(&p->header.nextindex, 1);
+	}
+
+	/* get back old XAD */
+	xad = &p->xad[index];
+
+	/*
+	 * truncate/relocate old extent at split offset
+	 */
+      updateOld:
+	/* update dmap for old/committed/truncated extent */
+	rlen = lengthXAD(xad) - llen;
+	if (!(xad->flag & XAD_NEW)) {
+		/* free from PWMAP at commit */
+		if (!test_cflag(COMMIT_Nolink, ip)) {
+			mtlck = txMaplock(tid, ip, tlckMAP);
+			pxdlock = (struct maplock *) & mtlck->lock;
+			pxdlock->flag = mlckFREEPXD;
+			PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
+			PXDlength(&pxdlock->pxd, rlen);
+			pxdlock->index = 1;
+		}
+	} else
+		/* free from WMAP */
+		dbFree(ip, addressXAD(xad) + llen, (s64) rlen);
+
+	if (llen)
+		/* truncate */
+		XADlength(xad, llen);
+	else
+		/* replace */
+		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
+
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		xtlck->lwm.offset = (xtlck->lwm.offset) ?
+		    min(index, (int)xtlck->lwm.offset) : index;
+		xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+		    xtlck->lwm.offset;
+	}
+
+	/* unpin the leaf page */
+	XT_PUTPAGE(mp);
+
+	return rc;
+}
+#endif /* _NOTYET */
+
+/*
+ *	xtUpdate()
+ *
+ * function: update XAD;
+ *
+ *	update extent for allocated_but_not_recorded or
+ *	compressed extent;
+ *
+ * parameter:
+ *	nxad	- new XAD;
+ *		logical extent of the specified XAD must be completely
+ *		contained by an existing XAD;
+ */
+int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
+{				/* new XAD */
+	int rc = 0;
+	int cmp;
+	struct metapage *mp;	/* meta-page buffer */
+	xtpage_t *p;		/* base B+-tree index page */
+	s64 bn;
+	int index0, index, newindex, nextindex;
+	struct btstack btstack;	/* traverse stack */
+	struct xtsplit split;	/* split information */
+	xad_t *xad, *lxad, *rxad;
+	int xflag;
+	s64 nxoff, xoff;
+	int nxlen, xlen, lxlen, rxlen;
+	s64 nxaddr, xaddr;
+	struct tlock *tlck;
+	struct xtlock *xtlck = NULL;
+	int newpage = 0;
+
+	/* there must exist extent to be tailgated */
+	nxoff = offsetXAD(nxad);
+	nxlen = lengthXAD(nxad);
+	nxaddr = addressXAD(nxad);
+
+	if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT)))
+		return rc;
+
+	/* retrieve search result */
+	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
+
+	if (cmp != 0) {
+		XT_PUTPAGE(mp);
+		jfs_error(ip->i_sb, "Could not find extent\n");
+		return -EIO;
+	}
+
+	BT_MARK_DIRTY(mp, ip);
+	/*
+	 * acquire tlock of the leaf page containing original entry
+	 */
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+		xtlck = (struct xtlock *) & tlck->lock;
+	}
+
+	xad = &p->xad[index0];
+	xflag = xad->flag;
+	xoff = offsetXAD(xad);
+	xlen = lengthXAD(xad);
+	xaddr = addressXAD(xad);
+
+	/* nXAD must be completely contained within XAD */
+	if ((xoff > nxoff) ||
+	    (nxoff + nxlen > xoff + xlen)) {
+		XT_PUTPAGE(mp);
+		jfs_error(ip->i_sb,
+			  "nXAD in not completely contained within XAD\n");
+		return -EIO;
+	}
+
+	index = index0;
+	newindex = index + 1;
+	nextindex = le16_to_cpu(p->header.nextindex);
+
+#ifdef  _JFS_WIP_NOCOALESCE
+	if (xoff < nxoff)
+		goto updateRight;
+
+	/*
+	 * replace XAD with nXAD
+	 */
+      replace:			/* (nxoff == xoff) */
+	if (nxlen == xlen) {
+		/* replace XAD with nXAD:recorded */
+		*xad = *nxad;
+		xad->flag = xflag & ~XAD_NOTRECORDED;
+
+		goto out;
+	} else			/* (nxlen < xlen) */
+		goto updateLeft;
+#endif				/* _JFS_WIP_NOCOALESCE */
+
+/* #ifdef _JFS_WIP_COALESCE */
+	if (xoff < nxoff)
+		goto coalesceRight;
+
+	/*
+	 * coalesce with left XAD
+	 */
+//coalesceLeft: /* (xoff == nxoff) */
+	/* is XAD first entry of page ? */
+	if (index == XTENTRYSTART)
+		goto replace;
+
+	/* is nXAD logically and physically contiguous with lXAD ? */
+	lxad = &p->xad[index - 1];
+	lxlen = lengthXAD(lxad);
+	if (!(lxad->flag & XAD_NOTRECORDED) &&
+	    (nxoff == offsetXAD(lxad) + lxlen) &&
+	    (nxaddr == addressXAD(lxad) + lxlen) &&
+	    (lxlen + nxlen < MAXXLEN)) {
+		/* extend right lXAD */
+		index0 = index - 1;
+		XADlength(lxad, lxlen + nxlen);
+
+		/* If we just merged two extents together, need to make sure the
+		 * right extent gets logged.  If the left one is marked XAD_NEW,
+		 * then we know it will be logged.  Otherwise, mark as
+		 * XAD_EXTENDED
+		 */
+		if (!(lxad->flag & XAD_NEW))
+			lxad->flag |= XAD_EXTENDED;
+
+		if (xlen > nxlen) {
+			/* truncate XAD */
+			XADoffset(xad, xoff + nxlen);
+			XADlength(xad, xlen - nxlen);
+			XADaddress(xad, xaddr + nxlen);
+			goto out;
+		} else {	/* (xlen == nxlen) */
+
+			/* remove XAD */
+			if (index < nextindex - 1)
+				memmove(&p->xad[index], &p->xad[index + 1],
+					(nextindex - index -
+					 1) << L2XTSLOTSIZE);
+
+			p->header.nextindex =
+			    cpu_to_le16(le16_to_cpu(p->header.nextindex) -
+					1);
+
+			index = index0;
+			newindex = index + 1;
+			nextindex = le16_to_cpu(p->header.nextindex);
+			xoff = nxoff = offsetXAD(lxad);
+			xlen = nxlen = lxlen + nxlen;
+			xaddr = nxaddr = addressXAD(lxad);
+			goto coalesceRight;
+		}
+	}
+
+	/*
+	 * replace XAD with nXAD
+	 */
+      replace:			/* (nxoff == xoff) */
+	if (nxlen == xlen) {
+		/* replace XAD with nXAD:recorded */
+		*xad = *nxad;
+		xad->flag = xflag & ~XAD_NOTRECORDED;
+
+		goto coalesceRight;
+	} else			/* (nxlen < xlen) */
+		goto updateLeft;
+
+	/*
+	 * coalesce with right XAD
+	 */
+      coalesceRight:		/* (xoff <= nxoff) */
+	/* is XAD last entry of page ? */
+	if (newindex == nextindex) {
+		if (xoff == nxoff)
+			goto out;
+		goto updateRight;
+	}
+
+	/* is nXAD logically and physically contiguous with rXAD ? */
+	rxad = &p->xad[index + 1];
+	rxlen = lengthXAD(rxad);
+	if (!(rxad->flag & XAD_NOTRECORDED) &&
+	    (nxoff + nxlen == offsetXAD(rxad)) &&
+	    (nxaddr + nxlen == addressXAD(rxad)) &&
+	    (rxlen + nxlen < MAXXLEN)) {
+		/* extend left rXAD */
+		XADoffset(rxad, nxoff);
+		XADlength(rxad, rxlen + nxlen);
+		XADaddress(rxad, nxaddr);
+
+		/* If we just merged two extents together, need to make sure
+		 * the left extent gets logged.  If the right one is marked
+		 * XAD_NEW, then we know it will be logged.  Otherwise, mark as
+		 * XAD_EXTENDED
+		 */
+		if (!(rxad->flag & XAD_NEW))
+			rxad->flag |= XAD_EXTENDED;
+
+		if (xlen > nxlen)
+			/* truncate XAD */
+			XADlength(xad, xlen - nxlen);
+		else {		/* (xlen == nxlen) */
+
+			/* remove XAD */
+			memmove(&p->xad[index], &p->xad[index + 1],
+				(nextindex - index - 1) << L2XTSLOTSIZE);
+
+			p->header.nextindex =
+			    cpu_to_le16(le16_to_cpu(p->header.nextindex) -
+					1);
+		}
+
+		goto out;
+	} else if (xoff == nxoff)
+		goto out;
+
+	if (xoff >= nxoff) {
+		XT_PUTPAGE(mp);
+		jfs_error(ip->i_sb, "xoff >= nxoff\n");
+		return -EIO;
+	}
+/* #endif _JFS_WIP_COALESCE */
+
+	/*
+	 * split XAD into (lXAD, nXAD):
+	 *
+	 *          |---nXAD--->
+	 * --|----------XAD----------|--
+	 *   |-lXAD-|
+	 */
+      updateRight:		/* (xoff < nxoff) */
+	/* truncate old XAD as lXAD:not_recorded */
+	xad = &p->xad[index];
+	XADlength(xad, nxoff - xoff);
+
+	/* insert nXAD:recorded */
+	if (nextindex == le16_to_cpu(p->header.maxentry)) {
+
+		/* xtSpliUp() unpins leaf pages */
+		split.mp = mp;
+		split.index = newindex;
+		split.flag = xflag & ~XAD_NOTRECORDED;
+		split.off = nxoff;
+		split.len = nxlen;
+		split.addr = nxaddr;
+		split.pxdlist = NULL;
+		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+			return rc;
+
+		/* get back old page */
+		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+		/*
+		 * if leaf root has been split, original root has been
+		 * copied to new child page, i.e., original entry now
+		 * resides on the new child page;
+		 */
+		if (p->header.flag & BT_INTERNAL) {
+			ASSERT(p->header.nextindex ==
+			       cpu_to_le16(XTENTRYSTART + 1));
+			xad = &p->xad[XTENTRYSTART];
+			bn = addressXAD(xad);
+			XT_PUTPAGE(mp);
+
+			/* get new child page */
+			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+			if (rc)
+				return rc;
+
+			BT_MARK_DIRTY(mp, ip);
+			if (!test_cflag(COMMIT_Nolink, ip)) {
+				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+				xtlck = (struct xtlock *) & tlck->lock;
+			}
+		} else {
+			/* is nXAD on new page ? */
+			if (newindex >
+			    (le16_to_cpu(p->header.maxentry) >> 1)) {
+				newindex =
+				    newindex -
+				    le16_to_cpu(p->header.nextindex) +
+				    XTENTRYSTART;
+				newpage = 1;
+			}
+		}
+	} else {
+		/* if insert into middle, shift right remaining entries */
+		if (newindex < nextindex)
+			memmove(&p->xad[newindex + 1], &p->xad[newindex],
+				(nextindex - newindex) << L2XTSLOTSIZE);
+
+		/* insert the entry */
+		xad = &p->xad[newindex];
+		*xad = *nxad;
+		xad->flag = xflag & ~XAD_NOTRECORDED;
+
+		/* advance next available entry index. */
+		p->header.nextindex =
+		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+	}
+
+	/*
+	 * does nXAD force 3-way split ?
+	 *
+	 *          |---nXAD--->|
+	 * --|----------XAD-------------|--
+	 *   |-lXAD-|           |-rXAD -|
+	 */
+	if (nxoff + nxlen == xoff + xlen)
+		goto out;
+
+	/* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */
+	if (newpage) {
+		/* close out old page */
+		if (!test_cflag(COMMIT_Nolink, ip)) {
+			xtlck->lwm.offset = (xtlck->lwm.offset) ?
+			    min(index0, (int)xtlck->lwm.offset) : index0;
+			xtlck->lwm.length =
+			    le16_to_cpu(p->header.nextindex) -
+			    xtlck->lwm.offset;
+		}
+
+		bn = le64_to_cpu(p->header.next);
+		XT_PUTPAGE(mp);
+
+		/* get new right page */
+		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		BT_MARK_DIRTY(mp, ip);
+		if (!test_cflag(COMMIT_Nolink, ip)) {
+			tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+			xtlck = (struct xtlock *) & tlck->lock;
+		}
+
+		index0 = index = newindex;
+	} else
+		index++;
+
+	newindex = index + 1;
+	nextindex = le16_to_cpu(p->header.nextindex);
+	xlen = xlen - (nxoff - xoff);
+	xoff = nxoff;
+	xaddr = nxaddr;
+
+	/* recompute split pages */
+	if (nextindex == le16_to_cpu(p->header.maxentry)) {
+		XT_PUTPAGE(mp);
+
+		if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT)))
+			return rc;
+
+		/* retrieve search result */
+		XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
+
+		if (cmp != 0) {
+			XT_PUTPAGE(mp);
+			jfs_error(ip->i_sb, "xtSearch failed\n");
+			return -EIO;
+		}
+
+		if (index0 != index) {
+			XT_PUTPAGE(mp);
+			jfs_error(ip->i_sb, "unexpected value of index\n");
+			return -EIO;
+		}
+	}
+
+	/*
+	 * split XAD into (nXAD, rXAD)
+	 *
+	 *          ---nXAD---|
+	 * --|----------XAD----------|--
+	 *                    |-rXAD-|
+	 */
+      updateLeft:		/* (nxoff == xoff) && (nxlen < xlen) */
+	/* update old XAD with nXAD:recorded */
+	xad = &p->xad[index];
+	*xad = *nxad;
+	xad->flag = xflag & ~XAD_NOTRECORDED;
+
+	/* insert rXAD:not_recorded */
+	xoff = xoff + nxlen;
+	xlen = xlen - nxlen;
+	xaddr = xaddr + nxlen;
+	if (nextindex == le16_to_cpu(p->header.maxentry)) {
+/*
+printf("xtUpdate.updateLeft.split p:0x%p\n", p);
+*/
+		/* xtSpliUp() unpins leaf pages */
+		split.mp = mp;
+		split.index = newindex;
+		split.flag = xflag;
+		split.off = xoff;
+		split.len = xlen;
+		split.addr = xaddr;
+		split.pxdlist = NULL;
+		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+			return rc;
+
+		/* get back old page */
+		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/*
+		 * if leaf root has been split, original root has been
+		 * copied to new child page, i.e., original entry now
+		 * resides on the new child page;
+		 */
+		if (p->header.flag & BT_INTERNAL) {
+			ASSERT(p->header.nextindex ==
+			       cpu_to_le16(XTENTRYSTART + 1));
+			xad = &p->xad[XTENTRYSTART];
+			bn = addressXAD(xad);
+			XT_PUTPAGE(mp);
+
+			/* get new child page */
+			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+			if (rc)
+				return rc;
+
+			BT_MARK_DIRTY(mp, ip);
+			if (!test_cflag(COMMIT_Nolink, ip)) {
+				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+				xtlck = (struct xtlock *) & tlck->lock;
+			}
+		}
+	} else {
+		/* if insert into middle, shift right remaining entries */
+		if (newindex < nextindex)
+			memmove(&p->xad[newindex + 1], &p->xad[newindex],
+				(nextindex - newindex) << L2XTSLOTSIZE);
+
+		/* insert the entry */
+		xad = &p->xad[newindex];
+		XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
+
+		/* advance next available entry index. */
+		p->header.nextindex =
+		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+	}
+
+      out:
+	if (!test_cflag(COMMIT_Nolink, ip)) {
+		xtlck->lwm.offset = (xtlck->lwm.offset) ?
+		    min(index0, (int)xtlck->lwm.offset) : index0;
+		xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+		    xtlck->lwm.offset;
+	}
+
+	/* unpin the leaf page */
+	XT_PUTPAGE(mp);
+
+	return rc;
+}
+
+
+/*
+ *	xtAppend()
+ *
+ * function: grow in append mode from contiguous region specified ;
+ *
+ * parameter:
+ *	tid		- transaction id;
+ *	ip		- file object;
+ *	xflag		- extent flag:
+ *	xoff		- extent offset;
+ *	maxblocks	- max extent length;
+ *	xlen		- extent length (in/out);
+ *	xaddrp		- extent address pointer (in/out):
+ *	flag		-
+ *
+ * return:
+ */
+int xtAppend(tid_t tid,		/* transaction id */
+	     struct inode *ip, int xflag, s64 xoff, s32 maxblocks,
+	     s32 * xlenp,	/* (in/out) */
+	     s64 * xaddrp,	/* (in/out) */
+	     int flag)
+{
+	int rc = 0;
+	struct metapage *mp;	/* meta-page buffer */
+	xtpage_t *p;		/* base B+-tree index page */
+	s64 bn, xaddr;
+	int index, nextindex;
+	struct btstack btstack;	/* traverse stack */
+	struct xtsplit split;	/* split information */
+	xad_t *xad;
+	int cmp;
+	struct tlock *tlck;
+	struct xtlock *xtlck;
+	int nsplit, nblocks, xlen;
+	struct pxdlist pxdlist;
+	pxd_t *pxd;
+	s64 next;
+
+	xaddr = *xaddrp;
+	xlen = *xlenp;
+	jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx",
+		 (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
+
+	/*
+	 *	search for the entry location at which to insert:
+	 *
+	 * xtFastSearch() and xtSearch() both returns (leaf page
+	 * pinned, index at which to insert).
+	 * n.b. xtSearch() may return index of maxentry of
+	 * the full page.
+	 */
+	if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT)))
+		return rc;
+
+	/* retrieve search result */
+	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+	if (cmp == 0) {
+		rc = -EEXIST;
+		goto out;
+	}
+
+	if (next)
+		xlen = min(xlen, (int)(next - xoff));
+//insert:
+	/*
+	 *	insert entry for new extent
+	 */
+	xflag |= XAD_NEW;
+
+	/*
+	 *	if the leaf page is full, split the page and
+	 *	propagate up the router entry for the new page from split
+	 *
+	 * The xtSplitUp() will insert the entry and unpin the leaf page.
+	 */
+	nextindex = le16_to_cpu(p->header.nextindex);
+	if (nextindex < le16_to_cpu(p->header.maxentry))
+		goto insertLeaf;
+
+	/*
+	 * allocate new index blocks to cover index page split(s)
+	 */
+	nsplit = btstack.nsplit;
+	split.pxdlist = &pxdlist;
+	pxdlist.maxnpxd = pxdlist.npxd = 0;
+	pxd = &pxdlist.pxd[0];
+	nblocks = JFS_SBI(ip->i_sb)->nbperpage;
+	for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {
+		if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) {
+			PXDaddress(pxd, xaddr);
+			PXDlength(pxd, nblocks);
+
+			pxdlist.maxnpxd++;
+
+			continue;
+		}
+
+		/* undo allocation */
+
+		goto out;
+	}
+
+	xlen = min(xlen, maxblocks);
+
+	/*
+	 * allocate data extent requested
+	 */
+	if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
+		goto out;
+
+	split.mp = mp;
+	split.index = index;
+	split.flag = xflag;
+	split.off = xoff;
+	split.len = xlen;
+	split.addr = xaddr;
+	if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
+		/* undo data extent allocation */
+		dbFree(ip, *xaddrp, (s64) * xlenp);
+
+		return rc;
+	}
+
+	*xaddrp = xaddr;
+	*xlenp = xlen;
+	return 0;
+
+	/*
+	 *	insert the new entry into the leaf page
+	 */
+      insertLeaf:
+	/*
+	 * allocate data extent requested
+	 */
+	if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
+		goto out;
+
+	BT_MARK_DIRTY(mp, ip);
+	/*
+	 * acquire a transaction lock on the leaf page;
+	 *
+	 * action: xad insertion/extension;
+	 */
+	tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+	xtlck = (struct xtlock *) & tlck->lock;
+
+	/* insert the new entry: mark the entry NEW */
+	xad = &p->xad[index];
+	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
+
+	/* advance next available entry index */
+	le16_add_cpu(&p->header.nextindex, 1);
+
+	xtlck->lwm.offset =
+	    (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
+	xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+	    xtlck->lwm.offset;
+
+	*xaddrp = xaddr;
+	*xlenp = xlen;
+
+      out:
+	/* unpin the leaf page */
+	XT_PUTPAGE(mp);
+
+	return rc;
+}
+#ifdef _STILL_TO_PORT
+
+/* - TBD for defragmentaion/reorganization -
+ *
+ *	xtDelete()
+ *
+ * function:
+ *	delete the entry with the specified key.
+ *
+ *	N.B.: whole extent of the entry is assumed to be deleted.
+ *
+ * parameter:
+ *
+ * return:
+ *	ENOENT: if the entry is not found.
+ *
+ * exception:
+ */
+int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
+{
+	int rc = 0;
+	struct btstack btstack;
+	int cmp;
+	s64 bn;
+	struct metapage *mp;
+	xtpage_t *p;
+	int index, nextindex;
+	struct tlock *tlck;
+	struct xtlock *xtlck;
+
+	/*
+	 * find the matching entry; xtSearch() pins the page
+	 */
+	if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
+		return rc;
+
+	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+	if (cmp) {
+		/* unpin the leaf page */
+		XT_PUTPAGE(mp);
+		return -ENOENT;
+	}
+
+	/*
+	 * delete the entry from the leaf page
+	 */
+	nextindex = le16_to_cpu(p->header.nextindex);
+	le16_add_cpu(&p->header.nextindex, -1);
+
+	/*
+	 * if the leaf page bocome empty, free the page
+	 */
+	if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
+		return (xtDeleteUp(tid, ip, mp, p, &btstack));
+
+	BT_MARK_DIRTY(mp, ip);
+	/*
+	 * acquire a transaction lock on the leaf page;
+	 *
+	 * action:xad deletion;
+	 */
+	tlck = txLock(tid, ip, mp, tlckXTREE);
+	xtlck = (struct xtlock *) & tlck->lock;
+	xtlck->lwm.offset =
+	    (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index;
+
+	/* if delete from middle, shift left/compact the remaining entries */
+	if (index < nextindex - 1)
+		memmove(&p->xad[index], &p->xad[index + 1],
+			(nextindex - index - 1) * sizeof(xad_t));
+
+	XT_PUTPAGE(mp);
+
+	return 0;
+}
+
+
+/* - TBD for defragmentaion/reorganization -
+ *
+ *	xtDeleteUp()
+ *
+ * function:
+ *	free empty pages as propagating deletion up the tree
+ *
+ * parameter:
+ *
+ * return:
+ */
+static int
+xtDeleteUp(tid_t tid, struct inode *ip,
+	   struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
+{
+	int rc = 0;
+	struct metapage *mp;
+	xtpage_t *p;
+	int index, nextindex;
+	s64 xaddr;
+	int xlen;
+	struct btframe *parent;
+	struct tlock *tlck;
+	struct xtlock *xtlck;
+
+	/*
+	 * keep root leaf page which has become empty
+	 */
+	if (fp->header.flag & BT_ROOT) {
+		/* keep the root page */
+		fp->header.flag &= ~BT_INTERNAL;
+		fp->header.flag |= BT_LEAF;
+		fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+
+		/* XT_PUTPAGE(fmp); */
+
+		return 0;
+	}
+
+	/*
+	 * free non-root leaf page
+	 */
+	if ((rc = xtRelink(tid, ip, fp))) {
+		XT_PUTPAGE(fmp);
+		return rc;
+	}
+
+	xaddr = addressPXD(&fp->header.self);
+	xlen = lengthPXD(&fp->header.self);
+	/* free the page extent */
+	dbFree(ip, xaddr, (s64) xlen);
+
+	/* free the buffer page */
+	discard_metapage(fmp);
+
+	/*
+	 * propagate page deletion up the index tree
+	 *
+	 * If the delete from the parent page makes it empty,
+	 * continue all the way up the tree.
+	 * stop if the root page is reached (which is never deleted) or
+	 * if the entry deletion does not empty the page.
+	 */
+	while ((parent = BT_POP(btstack)) != NULL) {
+		/* get/pin the parent page <sp> */
+		XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		index = parent->index;
+
+		/* delete the entry for the freed child page from parent.
+		 */
+		nextindex = le16_to_cpu(p->header.nextindex);
+
+		/*
+		 * the parent has the single entry being deleted:
+		 * free the parent page which has become empty.
+		 */
+		if (nextindex == 1) {
+			if (p->header.flag & BT_ROOT) {
+				/* keep the root page */
+				p->header.flag &= ~BT_INTERNAL;
+				p->header.flag |= BT_LEAF;
+				p->header.nextindex =
+				    cpu_to_le16(XTENTRYSTART);
+
+				/* XT_PUTPAGE(mp); */
+
+				break;
+			} else {
+				/* free the parent page */
+				if ((rc = xtRelink(tid, ip, p)))
+					return rc;
+
+				xaddr = addressPXD(&p->header.self);
+				/* free the page extent */
+				dbFree(ip, xaddr,
+				       (s64) JFS_SBI(ip->i_sb)->nbperpage);
+
+				/* unpin/free the buffer page */
+				discard_metapage(mp);
+
+				/* propagate up */
+				continue;
+			}
+		}
+		/*
+		 * the parent has other entries remaining:
+		 * delete the router entry from the parent page.
+		 */
+		else {
+			BT_MARK_DIRTY(mp, ip);
+			/*
+			 * acquire a transaction lock on the leaf page;
+			 *
+			 * action:xad deletion;
+			 */
+			tlck = txLock(tid, ip, mp, tlckXTREE);
+			xtlck = (struct xtlock *) & tlck->lock;
+			xtlck->lwm.offset =
+			    (xtlck->lwm.offset) ? min(index,
+						      xtlck->lwm.
+						      offset) : index;
+
+			/* if delete from middle,
+			 * shift left/compact the remaining entries in the page
+			 */
+			if (index < nextindex - 1)
+				memmove(&p->xad[index], &p->xad[index + 1],
+					(nextindex - index -
+					 1) << L2XTSLOTSIZE);
+
+			le16_add_cpu(&p->header.nextindex, -1);
+			jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
+				 (ulong) parent->bn, index);
+		}
+
+		/* unpin the parent page */
+		XT_PUTPAGE(mp);
+
+		/* exit propagation up */
+		break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * NAME:	xtRelocate()
+ *
+ * FUNCTION:	relocate xtpage or data extent of regular file;
+ *		This function is mainly used by defragfs utility.
+ *
+ * NOTE:	This routine does not have the logic to handle
+ *		uncommitted allocated extent. The caller should call
+ *		txCommit() to commit all the allocation before call
+ *		this routine.
+ */
+int
+xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad,	/* old XAD */
+	   s64 nxaddr,		/* new xaddr */
+	   int xtype)
+{				/* extent type: XTPAGE or DATAEXT */
+	int rc = 0;
+	struct tblock *tblk;
+	struct tlock *tlck;
+	struct xtlock *xtlck;
+	struct metapage *mp, *pmp, *lmp, *rmp;	/* meta-page buffer */
+	xtpage_t *p, *pp, *rp, *lp;	/* base B+-tree index page */
+	xad_t *xad;
+	pxd_t *pxd;
+	s64 xoff, xsize;
+	int xlen;
+	s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
+	cbuf_t *cp;
+	s64 offset, nbytes, nbrd, pno;
+	int nb, npages, nblks;
+	s64 bn;
+	int cmp;
+	int index;
+	struct pxd_lock *pxdlock;
+	struct btstack btstack;	/* traverse stack */
+
+	xtype = xtype & EXTENT_TYPE;
+
+	xoff = offsetXAD(oxad);
+	oxaddr = addressXAD(oxad);
+	xlen = lengthXAD(oxad);
+
+	/* validate extent offset */
+	offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
+	if (offset >= ip->i_size)
+		return -ESTALE;	/* stale extent */
+
+	jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx",
+		 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
+
+	/*
+	 *	1. get and validate the parent xtpage/xad entry
+	 *	covering the source extent to be relocated;
+	 */
+	if (xtype == DATAEXT) {
+		/* search in leaf entry */
+		rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
+		if (rc)
+			return rc;
+
+		/* retrieve search result */
+		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+
+		if (cmp) {
+			XT_PUTPAGE(pmp);
+			return -ESTALE;
+		}
+
+		/* validate for exact match with a single entry */
+		xad = &pp->xad[index];
+		if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
+			XT_PUTPAGE(pmp);
+			return -ESTALE;
+		}
+	} else {		/* (xtype == XTPAGE) */
+
+		/* search in internal entry */
+		rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
+		if (rc)
+			return rc;
+
+		/* retrieve search result */
+		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+
+		if (cmp) {
+			XT_PUTPAGE(pmp);
+			return -ESTALE;
+		}
+
+		/* xtSearchNode() validated for exact match with a single entry
+		 */
+		xad = &pp->xad[index];
+	}
+	jfs_info("xtRelocate: parent xad entry validated.");
+
+	/*
+	 *	2. relocate the extent
+	 */
+	if (xtype == DATAEXT) {
+		/* if the extent is allocated-but-not-recorded
+		 * there is no real data to be moved in this extent,
+		 */
+		if (xad->flag & XAD_NOTRECORDED)
+			goto out;
+		else
+			/* release xtpage for cmRead()/xtLookup() */
+			XT_PUTPAGE(pmp);
+
+		/*
+		 *	cmRelocate()
+		 *
+		 * copy target data pages to be relocated;
+		 *
+		 * data extent must start at page boundary and
+		 * multiple of page size (except the last data extent);
+		 * read in each page of the source data extent into cbuf,
+		 * update the cbuf extent descriptor of the page to be
+		 * homeward bound to new dst data extent
+		 * copy the data from the old extent to new extent.
+		 * copy is essential for compressed files to avoid problems
+		 * that can arise if there was a change in compression
+		 * algorithms.
+		 * it is a good strategy because it may disrupt cache
+		 * policy to keep the pages in memory afterwards.
+		 */
+		offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
+		assert((offset & CM_OFFSET) == 0);
+		nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+		pno = offset >> CM_L2BSIZE;
+		npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
+/*
+		npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
+			  (offset >> CM_L2BSIZE) + 1;
+*/
+		sxaddr = oxaddr;
+		dxaddr = nxaddr;
+
+		/* process the request one cache buffer at a time */
+		for (nbrd = 0; nbrd < nbytes; nbrd += nb,
+		     offset += nb, pno++, npages--) {
+			/* compute page size */
+			nb = min(nbytes - nbrd, CM_BSIZE);
+
+			/* get the cache buffer of the page */
+			if (rc = cmRead(ip, offset, npages, &cp))
+				break;
+
+			assert(addressPXD(&cp->cm_pxd) == sxaddr);
+			assert(!cp->cm_modified);
+
+			/* bind buffer with the new extent address */
+			nblks = nb >> JFS_IP(ip->i_sb)->l2bsize;
+			cmSetXD(ip, cp, pno, dxaddr, nblks);
+
+			/* release the cbuf, mark it as modified */
+			cmPut(cp, true);
+
+			dxaddr += nblks;
+			sxaddr += nblks;
+		}
+
+		/* get back parent page */
+		if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
+			return rc;
+
+		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+		jfs_info("xtRelocate: target data extent relocated.");
+	} else {		/* (xtype == XTPAGE) */
+
+		/*
+		 * read in the target xtpage from the source extent;
+		 */
+		XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
+		if (rc) {
+			XT_PUTPAGE(pmp);
+			return rc;
+		}
+
+		/*
+		 * read in sibling pages if any to update sibling pointers;
+		 */
+		rmp = NULL;
+		if (p->header.next) {
+			nextbn = le64_to_cpu(p->header.next);
+			XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
+			if (rc) {
+				XT_PUTPAGE(pmp);
+				XT_PUTPAGE(mp);
+				return (rc);
+			}
+		}
+
+		lmp = NULL;
+		if (p->header.prev) {
+			prevbn = le64_to_cpu(p->header.prev);
+			XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
+			if (rc) {
+				XT_PUTPAGE(pmp);
+				XT_PUTPAGE(mp);
+				if (rmp)
+					XT_PUTPAGE(rmp);
+				return (rc);
+			}
+		}
+
+		/* at this point, all xtpages to be updated are in memory */
+
+		/*
+		 * update sibling pointers of sibling xtpages if any;
+		 */
+		if (lmp) {
+			BT_MARK_DIRTY(lmp, ip);
+			tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
+			lp->header.next = cpu_to_le64(nxaddr);
+			XT_PUTPAGE(lmp);
+		}
+
+		if (rmp) {
+			BT_MARK_DIRTY(rmp, ip);
+			tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
+			rp->header.prev = cpu_to_le64(nxaddr);
+			XT_PUTPAGE(rmp);
+		}
+
+		/*
+		 * update the target xtpage to be relocated
+		 *
+		 * update the self address of the target page
+		 * and write to destination extent;
+		 * redo image covers the whole xtpage since it is new page
+		 * to the destination extent;
+		 * update of bmap for the free of source extent
+		 * of the target xtpage itself:
+		 * update of bmap for the allocation of destination extent
+		 * of the target xtpage itself:
+		 * update of bmap for the extents covered by xad entries in
+		 * the target xtpage is not necessary since they are not
+		 * updated;
+		 * if not committed before this relocation,
+		 * target page may contain XAD_NEW entries which must
+		 * be scanned for bmap update (logredo() always
+		 * scan xtpage REDOPAGE image for bmap update);
+		 * if committed before this relocation (tlckRELOCATE),
+		 * scan may be skipped by commit() and logredo();
+		 */
+		BT_MARK_DIRTY(mp, ip);
+		/* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
+		tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
+		xtlck = (struct xtlock *) & tlck->lock;
+
+		/* update the self address in the xtpage header */
+		pxd = &p->header.self;
+		PXDaddress(pxd, nxaddr);
+
+		/* linelock for the after image of the whole page */
+		xtlck->lwm.length =
+		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
+
+		/* update the buffer extent descriptor of target xtpage */
+		xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+		bmSetXD(mp, nxaddr, xsize);
+
+		/* unpin the target page to new homeward bound */
+		XT_PUTPAGE(mp);
+		jfs_info("xtRelocate: target xtpage relocated.");
+	}
+
+	/*
+	 *	3. acquire maplock for the source extent to be freed;
+	 *
+	 * acquire a maplock saving the src relocated extent address;
+	 * to free of the extent at commit time;
+	 */
+      out:
+	/* if DATAEXT relocation, write a LOG_UPDATEMAP record for
+	 * free PXD of the source data extent (logredo() will update
+	 * bmap for free of source data extent), and update bmap for
+	 * free of the source data extent;
+	 */
+	if (xtype == DATAEXT)
+		tlck = txMaplock(tid, ip, tlckMAP);
+	/* if XTPAGE relocation, write a LOG_NOREDOPAGE record
+	 * for the source xtpage (logredo() will init NoRedoPage
+	 * filter and will also update bmap for free of the source
+	 * xtpage), and update bmap for free of the source xtpage;
+	 * N.B. We use tlckMAP instead of tlkcXTREE because there
+	 *      is no buffer associated with this lock since the buffer
+	 *      has been redirected to the target location.
+	 */
+	else			/* (xtype == XTPAGE) */
+		tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
+
+	pxdlock = (struct pxd_lock *) & tlck->lock;
+	pxdlock->flag = mlckFREEPXD;
+	PXDaddress(&pxdlock->pxd, oxaddr);
+	PXDlength(&pxdlock->pxd, xlen);
+	pxdlock->index = 1;
+
+	/*
+	 *	4. update the parent xad entry for relocation;
+	 *
+	 * acquire tlck for the parent entry with XAD_NEW as entry
+	 * update which will write LOG_REDOPAGE and update bmap for
+	 * allocation of XAD_NEW destination extent;
+	 */
+	jfs_info("xtRelocate: update parent xad entry.");
+	BT_MARK_DIRTY(pmp, ip);
+	tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
+	xtlck = (struct xtlock *) & tlck->lock;
+
+	/* update the XAD with the new destination extent; */
+	xad = &pp->xad[index];
+	xad->flag |= XAD_NEW;
+	XADaddress(xad, nxaddr);
+
+	xtlck->lwm.offset = min(index, xtlck->lwm.offset);
+	xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
+	    xtlck->lwm.offset;
+
+	/* unpin the parent xtpage */
+	XT_PUTPAGE(pmp);
+
+	return rc;
+}
+
+
+/*
+ *	xtSearchNode()
+ *
+ * function:	search for the internal xad entry covering specified extent.
+ *		This function is mainly used by defragfs utility.
+ *
+ * parameters:
+ *	ip	- file object;
+ *	xad	- extent to find;
+ *	cmpp	- comparison result:
+ *	btstack - traverse stack;
+ *	flag	- search process flag;
+ *
+ * returns:
+ *	btstack contains (bn, index) of search path traversed to the entry.
+ *	*cmpp is set to result of comparison with the entry returned.
+ *	the page containing the entry is pinned at exit.
+ */
+static int xtSearchNode(struct inode *ip, xad_t * xad,	/* required XAD entry */
+			int *cmpp, struct btstack * btstack, int flag)
+{
+	int rc = 0;
+	s64 xoff, xaddr;
+	int xlen;
+	int cmp = 1;		/* init for empty page */
+	s64 bn;			/* block number */
+	struct metapage *mp;	/* meta-page buffer */
+	xtpage_t *p;		/* page */
+	int base, index, lim;
+	struct btframe *btsp;
+	s64 t64;
+
+	BT_CLR(btstack);
+
+	xoff = offsetXAD(xad);
+	xlen = lengthXAD(xad);
+	xaddr = addressXAD(xad);
+
+	/*
+	 *	search down tree from root:
+	 *
+	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+	 *
+	 * if entry with search key K is not found
+	 * internal page search find the entry with largest key Ki
+	 * less than K which point to the child page to search;
+	 * leaf page search find the entry with smallest key Kj
+	 * greater than K so that the returned index is the position of
+	 * the entry to be shifted right for insertion of new entry.
+	 * for empty tree, search key is greater than any key of the tree.
+	 *
+	 * by convention, root bn = 0.
+	 */
+	for (bn = 0;;) {
+		/* get/pin the page to search */
+		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+		if (p->header.flag & BT_LEAF) {
+			XT_PUTPAGE(mp);
+			return -ESTALE;
+		}
+
+		lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
+
+		/*
+		 * binary search with search key K on the current page
+		 */
+		for (base = XTENTRYSTART; lim; lim >>= 1) {
+			index = base + (lim >> 1);
+
+			XT_CMP(cmp, xoff, &p->xad[index], t64);
+			if (cmp == 0) {
+				/*
+				 *	search hit
+				 *
+				 * verify for exact match;
+				 */
+				if (xaddr == addressXAD(&p->xad[index]) &&
+				    xoff == offsetXAD(&p->xad[index])) {
+					*cmpp = cmp;
+
+					/* save search result */
+					btsp = btstack->top;
+					btsp->bn = bn;
+					btsp->index = index;
+					btsp->mp = mp;
+
+					return 0;
+				}
+
+				/* descend/search its child page */
+				goto next;
+			}
+
+			if (cmp > 0) {
+				base = index + 1;
+				--lim;
+			}
+		}
+
+		/*
+		 *	search miss - non-leaf page:
+		 *
+		 * base is the smallest index with key (Kj) greater than
+		 * search key (K) and may be zero or maxentry index.
+		 * if base is non-zero, decrement base by one to get the parent
+		 * entry of the child page to search.
+		 */
+		index = base ? base - 1 : base;
+
+		/*
+		 * go down to child page
+		 */
+	      next:
+		/* get the child page block number */
+		bn = addressXAD(&p->xad[index]);
+
+		/* unpin the parent page */
+		XT_PUTPAGE(mp);
+	}
+}
+
+
+/*
+ *	xtRelink()
+ *
+ * function:
+ *	link around a freed page.
+ *
+ * Parameter:
+ *	int		tid,
+ *	struct inode	*ip,
+ *	xtpage_t	*p)
+ *
+ * returns:
+ */
+static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
+{
+	int rc = 0;
+	struct metapage *mp;
+	s64 nextbn, prevbn;
+	struct tlock *tlck;
+
+	nextbn = le64_to_cpu(p->header.next);
+	prevbn = le64_to_cpu(p->header.prev);
+
+	/* update prev pointer of the next page */
+	if (nextbn != 0) {
+		XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/*
+		 * acquire a transaction lock on the page;
+		 *
+		 * action: update prev pointer;
+		 */
+		BT_MARK_DIRTY(mp, ip);
+		tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+
+		/* the page may already have been tlock'd */
+
+		p->header.prev = cpu_to_le64(prevbn);
+
+		XT_PUTPAGE(mp);
+	}
+
+	/* update next pointer of the previous page */
+	if (prevbn != 0) {
+		XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/*
+		 * acquire a transaction lock on the page;
+		 *
+		 * action: update next pointer;
+		 */
+		BT_MARK_DIRTY(mp, ip);
+		tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+
+		/* the page may already have been tlock'd */
+
+		p->header.next = le64_to_cpu(nextbn);
+
+		XT_PUTPAGE(mp);
+	}
+
+	return 0;
+}
+#endif				/*  _STILL_TO_PORT */
+
+
+/*
+ *	xtInitRoot()
+ *
+ * initialize file root (inline in inode)
+ */
+void xtInitRoot(tid_t tid, struct inode *ip)
+{
+	xtpage_t *p;
+
+	/*
+	 * acquire a transaction lock on the root
+	 *
+	 * action:
+	 */
+	txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag,
+		      tlckXTREE | tlckNEW);
+	p = &JFS_IP(ip)->i_xtroot;
+
+	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
+	p->header.nextindex = cpu_to_le16(XTENTRYSTART);
+
+	if (S_ISDIR(ip->i_mode))
+		p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR);
+	else {
+		p->header.maxentry = cpu_to_le16(XTROOTINITSLOT);
+		ip->i_size = 0;
+	}
+
+
+	return;
+}
+
+
+/*
+ * We can run into a deadlock truncating a file with a large number of
+ * xtree pages (large fragmented file).  A robust fix would entail a
+ * reservation system where we would reserve a number of metadata pages
+ * and tlocks which we would be guaranteed without a deadlock.  Without
+ * this, a partial fix is to limit number of metadata pages we will lock
+ * in a single transaction.  Currently we will truncate the file so that
+ * no more than 50 leaf pages will be locked.  The caller of xtTruncate
+ * will be responsible for ensuring that the current transaction gets
+ * committed, and that subsequent transactions are created to truncate
+ * the file further if needed.
+ */
+#define MAX_TRUNCATE_LEAVES 50
+
+/*
+ *	xtTruncate()
+ *
+ * function:
+ *	traverse for truncation logging backward bottom up;
+ *	terminate at the last extent entry at the current subtree
+ *	root page covering new down size.
+ *	truncation may occur within the last extent entry.
+ *
+ * parameter:
+ *	int		tid,
+ *	struct inode	*ip,
+ *	s64		newsize,
+ *	int		type)	{PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
+ *
+ * return:
+ *
+ * note:
+ *	PWMAP:
+ *	 1. truncate (non-COMMIT_NOLINK file)
+ *	    by jfs_truncate() or jfs_open(O_TRUNC):
+ *	    xtree is updated;
+ *	 2. truncate index table of directory when last entry removed
+ *	map update via tlock at commit time;
+ *	PMAP:
+ *	 Call xtTruncate_pmap instead
+ *	WMAP:
+ *	 1. remove (free zero link count) on last reference release
+ *	    (pmap has been freed at commit zero link count);
+ *	 2. truncate (COMMIT_NOLINK file, i.e., tmp file):
+ *	    xtree is updated;
+ *	 map update directly at truncation time;
+ *
+ *	if (DELETE)
+ *		no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
+ *	else if (TRUNCATE)
+ *		must write LOG_NOREDOPAGE for deleted index page;
+ *
+ * pages may already have been tlocked by anonymous transactions
+ * during file growth (i.e., write) before truncation;
+ *
+ * except last truncated entry, deleted entries remains as is
+ * in the page (nextindex is updated) for other use
+ * (e.g., log/update allocation map): this avoid copying the page
+ * info but delay free of pages;
+ *
+ */
+s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
+{
+	int rc = 0;
+	s64 teof;
+	struct metapage *mp;
+	xtpage_t *p;
+	s64 bn;
+	int index, nextindex;
+	xad_t *xad;
+	s64 xoff, xaddr;
+	int xlen, len, freexlen;
+	struct btstack btstack;
+	struct btframe *parent;
+	struct tblock *tblk = NULL;
+	struct tlock *tlck = NULL;
+	struct xtlock *xtlck = NULL;
+	struct xdlistlock xadlock;	/* maplock for COMMIT_WMAP */
+	struct pxd_lock *pxdlock;		/* maplock for COMMIT_WMAP */
+	s64 nfreed;
+	int freed, log;
+	int locked_leaves = 0;
+
+	/* save object truncation type */
+	if (tid) {
+		tblk = tid_to_tblock(tid);
+		tblk->xflag |= flag;
+	}
+
+	nfreed = 0;
+
+	flag &= COMMIT_MAP;
+	assert(flag != COMMIT_PMAP);
+
+	if (flag == COMMIT_PWMAP)
+		log = 1;
+	else {
+		log = 0;
+		xadlock.flag = mlckFREEXADLIST;
+		xadlock.index = 1;
+	}
+
+	/*
+	 * if the newsize is not an integral number of pages,
+	 * the file between newsize and next page boundary will
+	 * be cleared.
+	 * if truncating into a file hole, it will cause
+	 * a full block to be allocated for the logical block.
+	 */
+
+	/*
+	 * release page blocks of truncated region <teof, eof>
+	 *
+	 * free the data blocks from the leaf index blocks.
+	 * delete the parent index entries corresponding to
+	 * the freed child data/index blocks.
+	 * free the index blocks themselves which aren't needed
+	 * in new sized file.
+	 *
+	 * index blocks are updated only if the blocks are to be
+	 * retained in the new sized file.
+	 * if type is PMAP, the data and index pages are NOT
+	 * freed, and the data and index blocks are NOT freed
+	 * from working map.
+	 * (this will allow continued access of data/index of
+	 * temporary file (zerolink count file truncated to zero-length)).
+	 */
+	teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
+	    JFS_SBI(ip->i_sb)->l2bsize;
+
+	/* clear stack */
+	BT_CLR(&btstack);
+
+	/*
+	 * start with root
+	 *
+	 * root resides in the inode
+	 */
+	bn = 0;
+
+	/*
+	 * first access of each page:
+	 */
+      getPage:
+	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+	if (rc)
+		return rc;
+
+	/* process entries backward from last index */
+	index = le16_to_cpu(p->header.nextindex) - 1;
+
+
+	/* Since this is the rightmost page at this level, and we may have
+	 * already freed a page that was formerly to the right, let's make
+	 * sure that the next pointer is zero.
+	 */
+	if (p->header.next) {
+		if (log)
+			/*
+			 * Make sure this change to the header is logged.
+			 * If we really truncate this leaf, the flag
+			 * will be changed to tlckTRUNCATE
+			 */
+			tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+		BT_MARK_DIRTY(mp, ip);
+		p->header.next = 0;
+	}
+
+	if (p->header.flag & BT_INTERNAL)
+		goto getChild;
+
+	/*
+	 *	leaf page
+	 */
+	freed = 0;
+
+	/* does region covered by leaf page precede Teof ? */
+	xad = &p->xad[index];
+	xoff = offsetXAD(xad);
+	xlen = lengthXAD(xad);
+	if (teof >= xoff + xlen) {
+		XT_PUTPAGE(mp);
+		goto getParent;
+	}
+
+	/* (re)acquire tlock of the leaf page */
+	if (log) {
+		if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
+			/*
+			 * We need to limit the size of the transaction
+			 * to avoid exhausting pagecache & tlocks
+			 */
+			XT_PUTPAGE(mp);
+			newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
+			goto getParent;
+		}
+		tlck = txLock(tid, ip, mp, tlckXTREE);
+		tlck->type = tlckXTREE | tlckTRUNCATE;
+		xtlck = (struct xtlock *) & tlck->lock;
+		xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
+	}
+	BT_MARK_DIRTY(mp, ip);
+
+	/*
+	 * scan backward leaf page entries
+	 */
+	for (; index >= XTENTRYSTART; index--) {
+		xad = &p->xad[index];
+		xoff = offsetXAD(xad);
+		xlen = lengthXAD(xad);
+		xaddr = addressXAD(xad);
+
+		/*
+		 * The "data" for a directory is indexed by the block
+		 * device's address space.  This metadata must be invalidated
+		 * here
+		 */
+		if (S_ISDIR(ip->i_mode) && (teof == 0))
+			invalidate_xad_metapages(ip, *xad);
+		/*
+		 * entry beyond eof: continue scan of current page
+		 *          xad
+		 * ---|---=======------->
+		 *   eof
+		 */
+		if (teof < xoff) {
+			nfreed += xlen;
+			continue;
+		}
+
+		/*
+		 * (xoff <= teof): last entry to be deleted from page;
+		 * If other entries remain in page: keep and update the page.
+		 */
+
+		/*
+		 * eof == entry_start: delete the entry
+		 *           xad
+		 * -------|=======------->
+		 *       eof
+		 *
+		 */
+		if (teof == xoff) {
+			nfreed += xlen;
+
+			if (index == XTENTRYSTART)
+				break;
+
+			nextindex = index;
+		}
+		/*
+		 * eof within the entry: truncate the entry.
+		 *          xad
+		 * -------===|===------->
+		 *          eof
+		 */
+		else if (teof < xoff + xlen) {
+			/* update truncated entry */
+			len = teof - xoff;
+			freexlen = xlen - len;
+			XADlength(xad, len);
+
+			/* save pxd of truncated extent in tlck */
+			xaddr += len;
+			if (log) {	/* COMMIT_PWMAP */
+				xtlck->lwm.offset = (xtlck->lwm.offset) ?
+				    min(index, (int)xtlck->lwm.offset) : index;
+				xtlck->lwm.length = index + 1 -
+				    xtlck->lwm.offset;
+				xtlck->twm.offset = index;
+				pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
+				pxdlock->flag = mlckFREEPXD;
+				PXDaddress(&pxdlock->pxd, xaddr);
+				PXDlength(&pxdlock->pxd, freexlen);
+			}
+			/* free truncated extent */
+			else {	/* COMMIT_WMAP */
+
+				pxdlock = (struct pxd_lock *) & xadlock;
+				pxdlock->flag = mlckFREEPXD;
+				PXDaddress(&pxdlock->pxd, xaddr);
+				PXDlength(&pxdlock->pxd, freexlen);
+				txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
+
+				/* reset map lock */
+				xadlock.flag = mlckFREEXADLIST;
+			}
+
+			/* current entry is new last entry; */
+			nextindex = index + 1;
+
+			nfreed += freexlen;
+		}
+		/*
+		 * eof beyond the entry:
+		 *          xad
+		 * -------=======---|--->
+		 *                 eof
+		 */
+		else {		/* (xoff + xlen < teof) */
+
+			nextindex = index + 1;
+		}
+
+		if (nextindex < le16_to_cpu(p->header.nextindex)) {
+			if (!log) {	/* COMMIT_WAMP */
+				xadlock.xdlist = &p->xad[nextindex];
+				xadlock.count =
+				    le16_to_cpu(p->header.nextindex) -
+				    nextindex;
+				txFreeMap(ip, (struct maplock *) & xadlock,
+					  NULL, COMMIT_WMAP);
+			}
+			p->header.nextindex = cpu_to_le16(nextindex);
+		}
+
+		XT_PUTPAGE(mp);
+
+		/* assert(freed == 0); */
+		goto getParent;
+	}			/* end scan of leaf page entries */
+
+	freed = 1;
+
+	/*
+	 * leaf page become empty: free the page if type != PMAP
+	 */
+	if (log) {		/* COMMIT_PWMAP */
+		/* txCommit() with tlckFREE:
+		 * free data extents covered by leaf [XTENTRYSTART:hwm);
+		 * invalidate leaf if COMMIT_PWMAP;
+		 * if (TRUNCATE), will write LOG_NOREDOPAGE;
+		 */
+		tlck->type = tlckXTREE | tlckFREE;
+	} else {		/* COMMIT_WAMP */
+
+		/* free data extents covered by leaf */
+		xadlock.xdlist = &p->xad[XTENTRYSTART];
+		xadlock.count =
+		    le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
+		txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP);
+	}
+
+	if (p->header.flag & BT_ROOT) {
+		p->header.flag &= ~BT_INTERNAL;
+		p->header.flag |= BT_LEAF;
+		p->header.nextindex = cpu_to_le16(XTENTRYSTART);
+
+		XT_PUTPAGE(mp);	/* debug */
+		goto out;
+	} else {
+		if (log) {	/* COMMIT_PWMAP */
+			/* page will be invalidated at tx completion
+			 */
+			XT_PUTPAGE(mp);
+		} else {	/* COMMIT_WMAP */
+
+			if (mp->lid)
+				lid_to_tlock(mp->lid)->flag |= tlckFREELOCK;
+
+			/* invalidate empty leaf page */
+			discard_metapage(mp);
+		}
+	}
+
+	/*
+	 * the leaf page become empty: delete the parent entry
+	 * for the leaf page if the parent page is to be kept
+	 * in the new sized file.
+	 */
+
+	/*
+	 * go back up to the parent page
+	 */
+      getParent:
+	/* pop/restore parent entry for the current child page */
+	if ((parent = BT_POP(&btstack)) == NULL)
+		/* current page must have been root */
+		goto out;
+
+	/* get back the parent page */
+	bn = parent->bn;
+	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+	if (rc)
+		return rc;
+
+	index = parent->index;
+
+	/*
+	 * child page was not empty:
+	 */
+	if (freed == 0) {
+		/* has any entry deleted from parent ? */
+		if (index < le16_to_cpu(p->header.nextindex) - 1) {
+			/* (re)acquire tlock on the parent page */
+			if (log) {	/* COMMIT_PWMAP */
+				/* txCommit() with tlckTRUNCATE:
+				 * free child extents covered by parent [);
+				 */
+				tlck = txLock(tid, ip, mp, tlckXTREE);
+				xtlck = (struct xtlock *) & tlck->lock;
+				if (!(tlck->type & tlckTRUNCATE)) {
+					xtlck->hwm.offset =
+					    le16_to_cpu(p->header.
+							nextindex) - 1;
+					tlck->type =
+					    tlckXTREE | tlckTRUNCATE;
+				}
+			} else {	/* COMMIT_WMAP */
+
+				/* free child extents covered by parent */
+				xadlock.xdlist = &p->xad[index + 1];
+				xadlock.count =
+				    le16_to_cpu(p->header.nextindex) -
+				    index - 1;
+				txFreeMap(ip, (struct maplock *) & xadlock,
+					  NULL, COMMIT_WMAP);
+			}
+			BT_MARK_DIRTY(mp, ip);
+
+			p->header.nextindex = cpu_to_le16(index + 1);
+		}
+		XT_PUTPAGE(mp);
+		goto getParent;
+	}
+
+	/*
+	 * child page was empty:
+	 */
+	nfreed += lengthXAD(&p->xad[index]);
+
+	/*
+	 * During working map update, child page's tlock must be handled
+	 * before parent's.  This is because the parent's tlock will cause
+	 * the child's disk space to be marked available in the wmap, so
+	 * it's important that the child page be released by that time.
+	 *
+	 * ToDo:  tlocks should be on doubly-linked list, so we can
+	 * quickly remove it and add it to the end.
+	 */
+
+	/*
+	 * Move parent page's tlock to the end of the tid's tlock list
+	 */
+	if (log && mp->lid && (tblk->last != mp->lid) &&
+	    lid_to_tlock(mp->lid)->tid) {
+		lid_t lid = mp->lid;
+		struct tlock *prev;
+
+		tlck = lid_to_tlock(lid);
+
+		if (tblk->next == lid)
+			tblk->next = tlck->next;
+		else {
+			for (prev = lid_to_tlock(tblk->next);
+			     prev->next != lid;
+			     prev = lid_to_tlock(prev->next)) {
+				assert(prev->next);
+			}
+			prev->next = tlck->next;
+		}
+		lid_to_tlock(tblk->last)->next = lid;
+		tlck->next = 0;
+		tblk->last = lid;
+	}
+
+	/*
+	 * parent page become empty: free the page
+	 */
+	if (index == XTENTRYSTART) {
+		if (log) {	/* COMMIT_PWMAP */
+			/* txCommit() with tlckFREE:
+			 * free child extents covered by parent;
+			 * invalidate parent if COMMIT_PWMAP;
+			 */
+			tlck = txLock(tid, ip, mp, tlckXTREE);
+			xtlck = (struct xtlock *) & tlck->lock;
+			xtlck->hwm.offset =
+			    le16_to_cpu(p->header.nextindex) - 1;
+			tlck->type = tlckXTREE | tlckFREE;
+		} else {	/* COMMIT_WMAP */
+
+			/* free child extents covered by parent */
+			xadlock.xdlist = &p->xad[XTENTRYSTART];
+			xadlock.count =
+			    le16_to_cpu(p->header.nextindex) -
+			    XTENTRYSTART;
+			txFreeMap(ip, (struct maplock *) & xadlock, NULL,
+				  COMMIT_WMAP);
+		}
+		BT_MARK_DIRTY(mp, ip);
+
+		if (p->header.flag & BT_ROOT) {
+			p->header.flag &= ~BT_INTERNAL;
+			p->header.flag |= BT_LEAF;
+			p->header.nextindex = cpu_to_le16(XTENTRYSTART);
+			if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) {
+				/*
+				 * Shrink root down to allow inline
+				 * EA (otherwise fsck complains)
+				 */
+				p->header.maxentry =
+				    cpu_to_le16(XTROOTINITSLOT);
+				JFS_IP(ip)->mode2 |= INLINEEA;
+			}
+
+			XT_PUTPAGE(mp);	/* debug */
+			goto out;
+		} else {
+			if (log) {	/* COMMIT_PWMAP */
+				/* page will be invalidated at tx completion
+				 */
+				XT_PUTPAGE(mp);
+			} else {	/* COMMIT_WMAP */
+
+				if (mp->lid)
+					lid_to_tlock(mp->lid)->flag |=
+						tlckFREELOCK;
+
+				/* invalidate parent page */
+				discard_metapage(mp);
+			}
+
+			/* parent has become empty and freed:
+			 * go back up to its parent page
+			 */
+			/* freed = 1; */
+			goto getParent;
+		}
+	}
+	/*
+	 * parent page still has entries for front region;
+	 */
+	else {
+		/* try truncate region covered by preceding entry
+		 * (process backward)
+		 */
+		index--;
+
+		/* go back down to the child page corresponding
+		 * to the entry
+		 */
+		goto getChild;
+	}
+
+	/*
+	 *	internal page: go down to child page of current entry
+	 */
+      getChild:
+	/* save current parent entry for the child page */
+	if (BT_STACK_FULL(&btstack)) {
+		jfs_error(ip->i_sb, "stack overrun!\n");
+		XT_PUTPAGE(mp);
+		return -EIO;
+	}
+	BT_PUSH(&btstack, bn, index);
+
+	/* get child page */
+	xad = &p->xad[index];
+	bn = addressXAD(xad);
+
+	/*
+	 * first access of each internal entry:
+	 */
+	/* release parent page */
+	XT_PUTPAGE(mp);
+
+	/* process the child page */
+	goto getPage;
+
+      out:
+	/*
+	 * update file resource stat
+	 */
+	/* set size
+	 */
+	if (S_ISDIR(ip->i_mode) && !newsize)
+		ip->i_size = 1;	/* fsck hates zero-length directories */
+	else
+		ip->i_size = newsize;
+
+	/* update quota allocation to reflect freed blocks */
+	dquot_free_block(ip, nfreed);
+
+	/*
+	 * free tlock of invalidated pages
+	 */
+	if (flag == COMMIT_WMAP)
+		txFreelock(ip);
+
+	return newsize;
+}
+
+
+/*
+ *	xtTruncate_pmap()
+ *
+ * function:
+ *	Perform truncate to zero length for deleted file, leaving the
+ *	the xtree and working map untouched.  This allows the file to
+ *	be accessed via open file handles, while the delete of the file
+ *	is committed to disk.
+ *
+ * parameter:
+ *	tid_t		tid,
+ *	struct inode	*ip,
+ *	s64		committed_size)
+ *
+ * return: new committed size
+ *
+ * note:
+ *
+ *	To avoid deadlock by holding too many transaction locks, the
+ *	truncation may be broken up into multiple transactions.
+ *	The committed_size keeps track of part of the file has been
+ *	freed from the pmaps.
+ */
+s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
+{
+	s64 bn;
+	struct btstack btstack;
+	int cmp;
+	int index;
+	int locked_leaves = 0;
+	struct metapage *mp;
+	xtpage_t *p;
+	struct btframe *parent;
+	int rc;
+	struct tblock *tblk;
+	struct tlock *tlck = NULL;
+	xad_t *xad;
+	int xlen;
+	s64 xoff;
+	struct xtlock *xtlck = NULL;
+
+	/* save object truncation type */
+	tblk = tid_to_tblock(tid);
+	tblk->xflag |= COMMIT_PMAP;
+
+	/* clear stack */
+	BT_CLR(&btstack);
+
+	if (committed_size) {
+		xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1;
+		rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
+		if (rc)
+			return rc;
+
+		XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+		if (cmp != 0) {
+			XT_PUTPAGE(mp);
+			jfs_error(ip->i_sb, "did not find extent\n");
+			return -EIO;
+		}
+	} else {
+		/*
+		 * start with root
+		 *
+		 * root resides in the inode
+		 */
+		bn = 0;
+
+		/*
+		 * first access of each page:
+		 */
+      getPage:
+		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/* process entries backward from last index */
+		index = le16_to_cpu(p->header.nextindex) - 1;
+
+		if (p->header.flag & BT_INTERNAL)
+			goto getChild;
+	}
+
+	/*
+	 *	leaf page
+	 */
+
+	if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
+		/*
+		 * We need to limit the size of the transaction
+		 * to avoid exhausting pagecache & tlocks
+		 */
+		xad = &p->xad[index];
+		xoff = offsetXAD(xad);
+		xlen = lengthXAD(xad);
+		XT_PUTPAGE(mp);
+		return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
+	}
+	tlck = txLock(tid, ip, mp, tlckXTREE);
+	tlck->type = tlckXTREE | tlckFREE;
+	xtlck = (struct xtlock *) & tlck->lock;
+	xtlck->hwm.offset = index;
+
+
+	XT_PUTPAGE(mp);
+
+	/*
+	 * go back up to the parent page
+	 */
+      getParent:
+	/* pop/restore parent entry for the current child page */
+	if ((parent = BT_POP(&btstack)) == NULL)
+		/* current page must have been root */
+		goto out;
+
+	/* get back the parent page */
+	bn = parent->bn;
+	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+	if (rc)
+		return rc;
+
+	index = parent->index;
+
+	/*
+	 * parent page become empty: free the page
+	 */
+	if (index == XTENTRYSTART) {
+		/* txCommit() with tlckFREE:
+		 * free child extents covered by parent;
+		 * invalidate parent if COMMIT_PWMAP;
+		 */
+		tlck = txLock(tid, ip, mp, tlckXTREE);
+		xtlck = (struct xtlock *) & tlck->lock;
+		xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
+		tlck->type = tlckXTREE | tlckFREE;
+
+		XT_PUTPAGE(mp);
+
+		if (p->header.flag & BT_ROOT) {
+
+			goto out;
+		} else {
+			goto getParent;
+		}
+	}
+	/*
+	 * parent page still has entries for front region;
+	 */
+	else
+		index--;
+	/*
+	 *	internal page: go down to child page of current entry
+	 */
+      getChild:
+	/* save current parent entry for the child page */
+	if (BT_STACK_FULL(&btstack)) {
+		jfs_error(ip->i_sb, "stack overrun!\n");
+		XT_PUTPAGE(mp);
+		return -EIO;
+	}
+	BT_PUSH(&btstack, bn, index);
+
+	/* get child page */
+	xad = &p->xad[index];
+	bn = addressXAD(xad);
+
+	/*
+	 * first access of each internal entry:
+	 */
+	/* release parent page */
+	XT_PUTPAGE(mp);
+
+	/* process the child page */
+	goto getPage;
+
+      out:
+
+	return 0;
+}
+
+#ifdef CONFIG_JFS_STATISTICS
+static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m,
+		       "JFS Xtree statistics\n"
+		       "====================\n"
+		       "searches = %d\n"
+		       "fast searches = %d\n"
+		       "splits = %d\n",
+		       xtStat.search,
+		       xtStat.fastSearch,
+		       xtStat.split);
+	return 0;
+}
+
+static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, jfs_xtstat_proc_show, NULL);
+}
+
+const struct file_operations jfs_xtstat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= jfs_xtstat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
diff --git a/kernel/fs/jfs/jfs_xtree.h b/kernel/fs/jfs/jfs_xtree.h
new file mode 100644
index 000000000..1e0987986
--- /dev/null
+++ b/kernel/fs/jfs/jfs_xtree.h
@@ -0,0 +1,125 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_XTREE
+#define _H_JFS_XTREE
+
+/*
+ *	jfs_xtree.h: extent allocation descriptor B+-tree manager
+ */
+
+#include "jfs_btree.h"
+
+
+/*
+ *	extent allocation descriptor (xad)
+ */
+typedef struct xad {
+	__u8 flag;	/* 1: flag */
+	__u8 rsvrd[2];	/* 2: reserved */
+	__u8 off1;	/* 1: offset in unit of fsblksize */
+	__le32 off2;	/* 4: offset in unit of fsblksize */
+	pxd_t loc;	/* 8: length and address in unit of fsblksize */
+} xad_t;			/* (16) */
+
+#define MAXXLEN		((1 << 24) - 1)
+
+#define XTSLOTSIZE	16
+#define L2XTSLOTSIZE	4
+
+/* xad_t field construction */
+#define XADoffset(xad, offset64)\
+{\
+	(xad)->off1 = ((u64)offset64) >> 32;\
+	(xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
+}
+#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64)
+#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32)
+
+/* xad_t field extraction */
+#define offsetXAD(xad)\
+	( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
+#define addressXAD(xad) addressPXD(&(xad)->loc)
+#define lengthXAD(xad) lengthPXD(&(xad)->loc)
+
+/* xad list */
+struct xadlist {
+	s16 maxnxad;
+	s16 nxad;
+	xad_t *xad;
+};
+
+/* xad_t flags */
+#define XAD_NEW		0x01	/* new */
+#define XAD_EXTENDED	0x02	/* extended */
+#define XAD_COMPRESSED	0x04	/* compressed with recorded length */
+#define XAD_NOTRECORDED 0x08	/* allocated but not recorded */
+#define XAD_COW		0x10	/* copy-on-write */
+
+
+/* possible values for maxentry */
+#define XTROOTINITSLOT_DIR 6
+#define XTROOTINITSLOT	10
+#define XTROOTMAXSLOT	18
+#define XTPAGEMAXSLOT	256
+#define XTENTRYSTART	2
+
+/*
+ *	xtree page:
+ */
+typedef union {
+	struct xtheader {
+		__le64 next;	/* 8: */
+		__le64 prev;	/* 8: */
+
+		u8 flag;	/* 1: */
+		u8 rsrvd1;	/* 1: */
+		__le16 nextindex;	/* 2: next index = number of entries */
+		__le16 maxentry;	/* 2: max number of entries */
+		__le16 rsrvd2;	/* 2: */
+
+		pxd_t self;	/* 8: self */
+	} header;		/* (32) */
+
+	xad_t xad[XTROOTMAXSLOT];	/* 16 * maxentry: xad array */
+} xtpage_t;
+
+/*
+ *	external declaration
+ */
+extern int xtLookup(struct inode *ip, s64 lstart, s64 llen,
+		    int *pflag, s64 * paddr, int *plen, int flag);
+extern void xtInitRoot(tid_t tid, struct inode *ip);
+extern int xtInsert(tid_t tid, struct inode *ip,
+		    int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag);
+extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen,
+		    int flag);
+#ifdef _NOTYET
+extern int xtTailgate(tid_t tid, struct inode *ip,
+		      s64 xoff, int xlen, s64 xaddr, int flag);
+#endif
+extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad);
+extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen,
+		    int flag);
+extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type);
+extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size);
+extern int xtRelocate(tid_t tid, struct inode *ip,
+		      xad_t * oxad, s64 nxaddr, int xtype);
+extern int xtAppend(tid_t tid,
+		    struct inode *ip, int xflag, s64 xoff, int maxblocks,
+		    int *xlenp, s64 * xaddrp, int flag);
+#endif				/* !_H_JFS_XTREE */
diff --git a/kernel/fs/jfs/namei.c b/kernel/fs/jfs/namei.c
new file mode 100644
index 000000000..66db7bc0e
--- /dev/null
+++ b/kernel/fs/jfs/namei.c
@@ -0,0 +1,1606 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/ctype.h>
+#include <linux/quotaops.h>
+#include <linux/exportfs.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_inode.h"
+#include "jfs_dinode.h"
+#include "jfs_dmap.h"
+#include "jfs_unicode.h"
+#include "jfs_metapage.h"
+#include "jfs_xattr.h"
+#include "jfs_acl.h"
+#include "jfs_debug.h"
+
+/*
+ * forward references
+ */
+const struct dentry_operations jfs_ci_dentry_operations;
+
+static s64 commitZeroLink(tid_t, struct inode *);
+
+/*
+ * NAME:	free_ea_wmap(inode)
+ *
+ * FUNCTION:	free uncommitted extended attributes from working map
+ *
+ */
+static inline void free_ea_wmap(struct inode *inode)
+{
+	dxd_t *ea = &JFS_IP(inode)->ea;
+
+	if (ea->flag & DXD_EXTENT) {
+		/* free EA pages from cache */
+		invalidate_dxd_metapages(inode, *ea);
+		dbFree(inode, addressDXD(ea), lengthDXD(ea));
+	}
+	ea->flag = 0;
+}
+
+/*
+ * NAME:	jfs_create(dip, dentry, mode)
+ *
+ * FUNCTION:	create a regular file in the parent directory <dip>
+ *		with name = <from dentry> and mode = <mode>
+ *
+ * PARAMETER:	dip	- parent directory vnode
+ *		dentry	- dentry of new file
+ *		mode	- create mode (rwxrwxrwx).
+ *		nd- nd struct
+ *
+ * RETURN:	Errors from subroutines
+ *
+ */
+static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
+		bool excl)
+{
+	int rc = 0;
+	tid_t tid;		/* transaction id */
+	struct inode *ip = NULL;	/* child directory inode */
+	ino_t ino;
+	struct component_name dname;	/* child directory name */
+	struct btstack btstack;
+	struct inode *iplist[2];
+	struct tblock *tblk;
+
+	jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry);
+
+	dquot_initialize(dip);
+
+	/*
+	 * search parent directory for entry/freespace
+	 * (dtSearch() returns parent directory page pinned)
+	 */
+	if ((rc = get_UCSname(&dname, dentry)))
+		goto out1;
+
+	/*
+	 * Either iAlloc() or txBegin() may block.  Deadlock can occur if we
+	 * block there while holding dtree page, so we allocate the inode &
+	 * begin the transaction before we search the directory.
+	 */
+	ip = ialloc(dip, mode);
+	if (IS_ERR(ip)) {
+		rc = PTR_ERR(ip);
+		goto out2;
+	}
+
+	tid = txBegin(dip->i_sb, 0);
+
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
+
+	rc = jfs_init_acl(tid, ip, dip);
+	if (rc)
+		goto out3;
+
+	rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
+	if (rc) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
+	if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) {
+		jfs_err("jfs_create: dtSearch returned %d", rc);
+		txAbort(tid, 0);
+		goto out3;
+	}
+
+	tblk = tid_to_tblock(tid);
+	tblk->xflag |= COMMIT_CREATE;
+	tblk->ino = ip->i_ino;
+	tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
+
+	iplist[0] = dip;
+	iplist[1] = ip;
+
+	/*
+	 * initialize the child XAD tree root in-line in inode
+	 */
+	xtInitRoot(tid, ip);
+
+	/*
+	 * create entry in parent directory for child directory
+	 * (dtInsert() releases parent directory page)
+	 */
+	ino = ip->i_ino;
+	if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) {
+		if (rc == -EIO) {
+			jfs_err("jfs_create: dtInsert returned -EIO");
+			txAbort(tid, 1);	/* Marks Filesystem dirty */
+		} else
+			txAbort(tid, 0);	/* Filesystem full */
+		goto out3;
+	}
+
+	ip->i_op = &jfs_file_inode_operations;
+	ip->i_fop = &jfs_file_operations;
+	ip->i_mapping->a_ops = &jfs_aops;
+
+	mark_inode_dirty(ip);
+
+	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+
+	mark_inode_dirty(dip);
+
+	rc = txCommit(tid, 2, &iplist[0], 0);
+
+      out3:
+	txEnd(tid);
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	mutex_unlock(&JFS_IP(dip)->commit_mutex);
+	if (rc) {
+		free_ea_wmap(ip);
+		clear_nlink(ip);
+		unlock_new_inode(ip);
+		iput(ip);
+	} else {
+		unlock_new_inode(ip);
+		d_instantiate(dentry, ip);
+	}
+
+      out2:
+	free_UCSname(&dname);
+
+      out1:
+
+	jfs_info("jfs_create: rc:%d", rc);
+	return rc;
+}
+
+
+/*
+ * NAME:	jfs_mkdir(dip, dentry, mode)
+ *
+ * FUNCTION:	create a child directory in the parent directory <dip>
+ *		with name = <from dentry> and mode = <mode>
+ *
+ * PARAMETER:	dip	- parent directory vnode
+ *		dentry	- dentry of child directory
+ *		mode	- create mode (rwxrwxrwx).
+ *
+ * RETURN:	Errors from subroutines
+ *
+ * note:
+ * EACCESS: user needs search+write permission on the parent directory
+ */
+static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
+{
+	int rc = 0;
+	tid_t tid;		/* transaction id */
+	struct inode *ip = NULL;	/* child directory inode */
+	ino_t ino;
+	struct component_name dname;	/* child directory name */
+	struct btstack btstack;
+	struct inode *iplist[2];
+	struct tblock *tblk;
+
+	jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry);
+
+	dquot_initialize(dip);
+
+	/*
+	 * search parent directory for entry/freespace
+	 * (dtSearch() returns parent directory page pinned)
+	 */
+	if ((rc = get_UCSname(&dname, dentry)))
+		goto out1;
+
+	/*
+	 * Either iAlloc() or txBegin() may block.  Deadlock can occur if we
+	 * block there while holding dtree page, so we allocate the inode &
+	 * begin the transaction before we search the directory.
+	 */
+	ip = ialloc(dip, S_IFDIR | mode);
+	if (IS_ERR(ip)) {
+		rc = PTR_ERR(ip);
+		goto out2;
+	}
+
+	tid = txBegin(dip->i_sb, 0);
+
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
+
+	rc = jfs_init_acl(tid, ip, dip);
+	if (rc)
+		goto out3;
+
+	rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
+	if (rc) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
+	if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) {
+		jfs_err("jfs_mkdir: dtSearch returned %d", rc);
+		txAbort(tid, 0);
+		goto out3;
+	}
+
+	tblk = tid_to_tblock(tid);
+	tblk->xflag |= COMMIT_CREATE;
+	tblk->ino = ip->i_ino;
+	tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
+
+	iplist[0] = dip;
+	iplist[1] = ip;
+
+	/*
+	 * initialize the child directory in-line in inode
+	 */
+	dtInitRoot(tid, ip, dip->i_ino);
+
+	/*
+	 * create entry in parent directory for child directory
+	 * (dtInsert() releases parent directory page)
+	 */
+	ino = ip->i_ino;
+	if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) {
+		if (rc == -EIO) {
+			jfs_err("jfs_mkdir: dtInsert returned -EIO");
+			txAbort(tid, 1);	/* Marks Filesystem dirty */
+		} else
+			txAbort(tid, 0);	/* Filesystem full */
+		goto out3;
+	}
+
+	set_nlink(ip, 2);	/* for '.' */
+	ip->i_op = &jfs_dir_inode_operations;
+	ip->i_fop = &jfs_dir_operations;
+
+	mark_inode_dirty(ip);
+
+	/* update parent directory inode */
+	inc_nlink(dip);		/* for '..' from child directory */
+	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+	mark_inode_dirty(dip);
+
+	rc = txCommit(tid, 2, &iplist[0], 0);
+
+      out3:
+	txEnd(tid);
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	mutex_unlock(&JFS_IP(dip)->commit_mutex);
+	if (rc) {
+		free_ea_wmap(ip);
+		clear_nlink(ip);
+		unlock_new_inode(ip);
+		iput(ip);
+	} else {
+		unlock_new_inode(ip);
+		d_instantiate(dentry, ip);
+	}
+
+      out2:
+	free_UCSname(&dname);
+
+
+      out1:
+
+	jfs_info("jfs_mkdir: rc:%d", rc);
+	return rc;
+}
+
+/*
+ * NAME:	jfs_rmdir(dip, dentry)
+ *
+ * FUNCTION:	remove a link to child directory
+ *
+ * PARAMETER:	dip	- parent inode
+ *		dentry	- child directory dentry
+ *
+ * RETURN:	-EINVAL	- if name is . or ..
+ *		-EINVAL - if . or .. exist but are invalid.
+ *		errors from subroutines
+ *
+ * note:
+ * if other threads have the directory open when the last link
+ * is removed, the "." and ".." entries, if present, are removed before
+ * rmdir() returns and no new entries may be created in the directory,
+ * but the directory is not removed until the last reference to
+ * the directory is released (cf.unlink() of regular file).
+ */
+static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
+{
+	int rc;
+	tid_t tid;		/* transaction id */
+	struct inode *ip = d_inode(dentry);
+	ino_t ino;
+	struct component_name dname;
+	struct inode *iplist[2];
+	struct tblock *tblk;
+
+	jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry);
+
+	/* Init inode for quota operations. */
+	dquot_initialize(dip);
+	dquot_initialize(ip);
+
+	/* directory must be empty to be removed */
+	if (!dtEmpty(ip)) {
+		rc = -ENOTEMPTY;
+		goto out;
+	}
+
+	if ((rc = get_UCSname(&dname, dentry))) {
+		goto out;
+	}
+
+	tid = txBegin(dip->i_sb, 0);
+
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
+
+	iplist[0] = dip;
+	iplist[1] = ip;
+
+	tblk = tid_to_tblock(tid);
+	tblk->xflag |= COMMIT_DELETE;
+	tblk->u.ip = ip;
+
+	/*
+	 * delete the entry of target directory from parent directory
+	 */
+	ino = ip->i_ino;
+	if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) {
+		jfs_err("jfs_rmdir: dtDelete returned %d", rc);
+		if (rc == -EIO)
+			txAbort(tid, 1);
+		txEnd(tid);
+		mutex_unlock(&JFS_IP(ip)->commit_mutex);
+		mutex_unlock(&JFS_IP(dip)->commit_mutex);
+
+		goto out2;
+	}
+
+	/* update parent directory's link count corresponding
+	 * to ".." entry of the target directory deleted
+	 */
+	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+	inode_dec_link_count(dip);
+
+	/*
+	 * OS/2 could have created EA and/or ACL
+	 */
+	/* free EA from both persistent and working map */
+	if (JFS_IP(ip)->ea.flag & DXD_EXTENT) {
+		/* free EA pages */
+		txEA(tid, ip, &JFS_IP(ip)->ea, NULL);
+	}
+	JFS_IP(ip)->ea.flag = 0;
+
+	/* free ACL from both persistent and working map */
+	if (JFS_IP(ip)->acl.flag & DXD_EXTENT) {
+		/* free ACL pages */
+		txEA(tid, ip, &JFS_IP(ip)->acl, NULL);
+	}
+	JFS_IP(ip)->acl.flag = 0;
+
+	/* mark the target directory as deleted */
+	clear_nlink(ip);
+	mark_inode_dirty(ip);
+
+	rc = txCommit(tid, 2, &iplist[0], 0);
+
+	txEnd(tid);
+
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	mutex_unlock(&JFS_IP(dip)->commit_mutex);
+
+	/*
+	 * Truncating the directory index table is not guaranteed.  It
+	 * may need to be done iteratively
+	 */
+	if (test_cflag(COMMIT_Stale, dip)) {
+		if (dip->i_size > 1)
+			jfs_truncate_nolock(dip, 0);
+
+		clear_cflag(COMMIT_Stale, dip);
+	}
+
+      out2:
+	free_UCSname(&dname);
+
+      out:
+	jfs_info("jfs_rmdir: rc:%d", rc);
+	return rc;
+}
+
+/*
+ * NAME:	jfs_unlink(dip, dentry)
+ *
+ * FUNCTION:	remove a link to object <vp> named by <name>
+ *		from parent directory <dvp>
+ *
+ * PARAMETER:	dip	- inode of parent directory
+ *		dentry	- dentry of object to be removed
+ *
+ * RETURN:	errors from subroutines
+ *
+ * note:
+ * temporary file: if one or more processes have the file open
+ * when the last link is removed, the link will be removed before
+ * unlink() returns, but the removal of the file contents will be
+ * postponed until all references to the files are closed.
+ *
+ * JFS does NOT support unlink() on directories.
+ *
+ */
+static int jfs_unlink(struct inode *dip, struct dentry *dentry)
+{
+	int rc;
+	tid_t tid;		/* transaction id */
+	struct inode *ip = d_inode(dentry);
+	ino_t ino;
+	struct component_name dname;	/* object name */
+	struct inode *iplist[2];
+	struct tblock *tblk;
+	s64 new_size = 0;
+	int commit_flag;
+
+	jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry);
+
+	/* Init inode for quota operations. */
+	dquot_initialize(dip);
+	dquot_initialize(ip);
+
+	if ((rc = get_UCSname(&dname, dentry)))
+		goto out;
+
+	IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
+
+	tid = txBegin(dip->i_sb, 0);
+
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
+
+	iplist[0] = dip;
+	iplist[1] = ip;
+
+	/*
+	 * delete the entry of target file from parent directory
+	 */
+	ino = ip->i_ino;
+	if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) {
+		jfs_err("jfs_unlink: dtDelete returned %d", rc);
+		if (rc == -EIO)
+			txAbort(tid, 1);	/* Marks FS Dirty */
+		txEnd(tid);
+		mutex_unlock(&JFS_IP(ip)->commit_mutex);
+		mutex_unlock(&JFS_IP(dip)->commit_mutex);
+		IWRITE_UNLOCK(ip);
+		goto out1;
+	}
+
+	ASSERT(ip->i_nlink);
+
+	ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+	mark_inode_dirty(dip);
+
+	/* update target's inode */
+	inode_dec_link_count(ip);
+
+	/*
+	 *	commit zero link count object
+	 */
+	if (ip->i_nlink == 0) {
+		assert(!test_cflag(COMMIT_Nolink, ip));
+		/* free block resources */
+		if ((new_size = commitZeroLink(tid, ip)) < 0) {
+			txAbort(tid, 1);	/* Marks FS Dirty */
+			txEnd(tid);
+			mutex_unlock(&JFS_IP(ip)->commit_mutex);
+			mutex_unlock(&JFS_IP(dip)->commit_mutex);
+			IWRITE_UNLOCK(ip);
+			rc = new_size;
+			goto out1;
+		}
+		tblk = tid_to_tblock(tid);
+		tblk->xflag |= COMMIT_DELETE;
+		tblk->u.ip = ip;
+	}
+
+	/*
+	 * Incomplete truncate of file data can
+	 * result in timing problems unless we synchronously commit the
+	 * transaction.
+	 */
+	if (new_size)
+		commit_flag = COMMIT_SYNC;
+	else
+		commit_flag = 0;
+
+	/*
+	 * If xtTruncate was incomplete, commit synchronously to avoid
+	 * timing complications
+	 */
+	rc = txCommit(tid, 2, &iplist[0], commit_flag);
+
+	txEnd(tid);
+
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	mutex_unlock(&JFS_IP(dip)->commit_mutex);
+
+	while (new_size && (rc == 0)) {
+		tid = txBegin(dip->i_sb, 0);
+		mutex_lock(&JFS_IP(ip)->commit_mutex);
+		new_size = xtTruncate_pmap(tid, ip, new_size);
+		if (new_size < 0) {
+			txAbort(tid, 1);	/* Marks FS Dirty */
+			rc = new_size;
+		} else
+			rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC);
+		txEnd(tid);
+		mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	}
+
+	if (ip->i_nlink == 0)
+		set_cflag(COMMIT_Nolink, ip);
+
+	IWRITE_UNLOCK(ip);
+
+	/*
+	 * Truncating the directory index table is not guaranteed.  It
+	 * may need to be done iteratively
+	 */
+	if (test_cflag(COMMIT_Stale, dip)) {
+		if (dip->i_size > 1)
+			jfs_truncate_nolock(dip, 0);
+
+		clear_cflag(COMMIT_Stale, dip);
+	}
+
+      out1:
+	free_UCSname(&dname);
+      out:
+	jfs_info("jfs_unlink: rc:%d", rc);
+	return rc;
+}
+
+/*
+ * NAME:	commitZeroLink()
+ *
+ * FUNCTION:	for non-directory, called by jfs_remove(),
+ *		truncate a regular file, directory or symbolic
+ *		link to zero length. return 0 if type is not
+ *		one of these.
+ *
+ *		if the file is currently associated with a VM segment
+ *		only permanent disk and inode map resources are freed,
+ *		and neither the inode nor indirect blocks are modified
+ *		so that the resources can be later freed in the work
+ *		map by ctrunc1.
+ *		if there is no VM segment on entry, the resources are
+ *		freed in both work and permanent map.
+ *		(? for temporary file - memory object is cached even
+ *		after no reference:
+ *		reference count > 0 -   )
+ *
+ * PARAMETERS:	cd	- pointer to commit data structure.
+ *			  current inode is the one to truncate.
+ *
+ * RETURN:	Errors from subroutines
+ */
+static s64 commitZeroLink(tid_t tid, struct inode *ip)
+{
+	int filetype;
+	struct tblock *tblk;
+
+	jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip);
+
+	filetype = ip->i_mode & S_IFMT;
+	switch (filetype) {
+	case S_IFREG:
+		break;
+	case S_IFLNK:
+		/* fast symbolic link */
+		if (ip->i_size < IDATASIZE) {
+			ip->i_size = 0;
+			return 0;
+		}
+		break;
+	default:
+		assert(filetype != S_IFDIR);
+		return 0;
+	}
+
+	set_cflag(COMMIT_Freewmap, ip);
+
+	/* mark transaction of block map update type */
+	tblk = tid_to_tblock(tid);
+	tblk->xflag |= COMMIT_PMAP;
+
+	/*
+	 * free EA
+	 */
+	if (JFS_IP(ip)->ea.flag & DXD_EXTENT)
+		/* acquire maplock on EA to be freed from block map */
+		txEA(tid, ip, &JFS_IP(ip)->ea, NULL);
+
+	/*
+	 * free ACL
+	 */
+	if (JFS_IP(ip)->acl.flag & DXD_EXTENT)
+		/* acquire maplock on EA to be freed from block map */
+		txEA(tid, ip, &JFS_IP(ip)->acl, NULL);
+
+	/*
+	 * free xtree/data (truncate to zero length):
+	 * free xtree/data pages from cache if COMMIT_PWMAP,
+	 * free xtree/data blocks from persistent block map, and
+	 * free xtree/data blocks from working block map if COMMIT_PWMAP;
+	 */
+	if (ip->i_size)
+		return xtTruncate_pmap(tid, ip, 0);
+
+	return 0;
+}
+
+
+/*
+ * NAME:	jfs_free_zero_link()
+ *
+ * FUNCTION:	for non-directory, called by iClose(),
+ *		free resources of a file from cache and WORKING map
+ *		for a file previously committed with zero link count
+ *		while associated with a pager object,
+ *
+ * PARAMETER:	ip	- pointer to inode of file.
+ */
+void jfs_free_zero_link(struct inode *ip)
+{
+	int type;
+
+	jfs_info("jfs_free_zero_link: ip = 0x%p", ip);
+
+	/* return if not reg or symbolic link or if size is
+	 * already ok.
+	 */
+	type = ip->i_mode & S_IFMT;
+
+	switch (type) {
+	case S_IFREG:
+		break;
+	case S_IFLNK:
+		/* if its contained in inode nothing to do */
+		if (ip->i_size < IDATASIZE)
+			return;
+		break;
+	default:
+		return;
+	}
+
+	/*
+	 * free EA
+	 */
+	if (JFS_IP(ip)->ea.flag & DXD_EXTENT) {
+		s64 xaddr = addressDXD(&JFS_IP(ip)->ea);
+		int xlen = lengthDXD(&JFS_IP(ip)->ea);
+		struct maplock maplock;	/* maplock for COMMIT_WMAP */
+		struct pxd_lock *pxdlock;	/* maplock for COMMIT_WMAP */
+
+		/* free EA pages from cache */
+		invalidate_dxd_metapages(ip, JFS_IP(ip)->ea);
+
+		/* free EA extent from working block map */
+		maplock.index = 1;
+		pxdlock = (struct pxd_lock *) & maplock;
+		pxdlock->flag = mlckFREEPXD;
+		PXDaddress(&pxdlock->pxd, xaddr);
+		PXDlength(&pxdlock->pxd, xlen);
+		txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
+	}
+
+	/*
+	 * free ACL
+	 */
+	if (JFS_IP(ip)->acl.flag & DXD_EXTENT) {
+		s64 xaddr = addressDXD(&JFS_IP(ip)->acl);
+		int xlen = lengthDXD(&JFS_IP(ip)->acl);
+		struct maplock maplock;	/* maplock for COMMIT_WMAP */
+		struct pxd_lock *pxdlock;	/* maplock for COMMIT_WMAP */
+
+		invalidate_dxd_metapages(ip, JFS_IP(ip)->acl);
+
+		/* free ACL extent from working block map */
+		maplock.index = 1;
+		pxdlock = (struct pxd_lock *) & maplock;
+		pxdlock->flag = mlckFREEPXD;
+		PXDaddress(&pxdlock->pxd, xaddr);
+		PXDlength(&pxdlock->pxd, xlen);
+		txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
+	}
+
+	/*
+	 * free xtree/data (truncate to zero length):
+	 * free xtree/data pages from cache, and
+	 * free xtree/data blocks from working block map;
+	 */
+	if (ip->i_size)
+		xtTruncate(0, ip, 0, COMMIT_WMAP);
+}
+
+/*
+ * NAME:	jfs_link(vp, dvp, name, crp)
+ *
+ * FUNCTION:	create a link to <vp> by the name = <name>
+ *		in the parent directory <dvp>
+ *
+ * PARAMETER:	vp	- target object
+ *		dvp	- parent directory of new link
+ *		name	- name of new link to target object
+ *		crp	- credential
+ *
+ * RETURN:	Errors from subroutines
+ *
+ * note:
+ * JFS does NOT support link() on directories (to prevent circular
+ * path in the directory hierarchy);
+ * EPERM: the target object is a directory, and either the caller
+ * does not have appropriate privileges or the implementation prohibits
+ * using link() on directories [XPG4.2].
+ *
+ * JFS does NOT support links between file systems:
+ * EXDEV: target object and new link are on different file systems and
+ * implementation does not support links between file systems [XPG4.2].
+ */
+static int jfs_link(struct dentry *old_dentry,
+	     struct inode *dir, struct dentry *dentry)
+{
+	int rc;
+	tid_t tid;
+	struct inode *ip = d_inode(old_dentry);
+	ino_t ino;
+	struct component_name dname;
+	struct btstack btstack;
+	struct inode *iplist[2];
+
+	jfs_info("jfs_link: %pd %pd", old_dentry, dentry);
+
+	dquot_initialize(dir);
+
+	tid = txBegin(ip->i_sb, 0);
+
+	mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
+
+	/*
+	 * scan parent directory for entry/freespace
+	 */
+	if ((rc = get_UCSname(&dname, dentry)))
+		goto out;
+
+	if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE)))
+		goto free_dname;
+
+	/*
+	 * create entry for new link in parent directory
+	 */
+	ino = ip->i_ino;
+	if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack)))
+		goto free_dname;
+
+	/* update object inode */
+	inc_nlink(ip);		/* for new link */
+	ip->i_ctime = CURRENT_TIME;
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+	ihold(ip);
+
+	iplist[0] = ip;
+	iplist[1] = dir;
+	rc = txCommit(tid, 2, &iplist[0], 0);
+
+	if (rc) {
+		drop_nlink(ip); /* never instantiated */
+		iput(ip);
+	} else
+		d_instantiate(dentry, ip);
+
+      free_dname:
+	free_UCSname(&dname);
+
+      out:
+	txEnd(tid);
+
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	mutex_unlock(&JFS_IP(dir)->commit_mutex);
+
+	jfs_info("jfs_link: rc:%d", rc);
+	return rc;
+}
+
+/*
+ * NAME:	jfs_symlink(dip, dentry, name)
+ *
+ * FUNCTION:	creates a symbolic link to <symlink> by name <name>
+ *			in directory <dip>
+ *
+ * PARAMETER:	dip	- parent directory vnode
+ *		dentry	- dentry of symbolic link
+ *		name	- the path name of the existing object
+ *			  that will be the source of the link
+ *
+ * RETURN:	errors from subroutines
+ *
+ * note:
+ * ENAMETOOLONG: pathname resolution of a symbolic link produced
+ * an intermediate result whose length exceeds PATH_MAX [XPG4.2]
+*/
+
+static int jfs_symlink(struct inode *dip, struct dentry *dentry,
+		const char *name)
+{
+	int rc;
+	tid_t tid;
+	ino_t ino = 0;
+	struct component_name dname;
+	int ssize;		/* source pathname size */
+	struct btstack btstack;
+	struct inode *ip = d_inode(dentry);
+	unchar *i_fastsymlink;
+	s64 xlen = 0;
+	int bmask = 0, xsize;
+	s64 xaddr;
+	struct metapage *mp;
+	struct super_block *sb;
+	struct tblock *tblk;
+
+	struct inode *iplist[2];
+
+	jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
+
+	dquot_initialize(dip);
+
+	ssize = strlen(name) + 1;
+
+	/*
+	 * search parent directory for entry/freespace
+	 * (dtSearch() returns parent directory page pinned)
+	 */
+
+	if ((rc = get_UCSname(&dname, dentry)))
+		goto out1;
+
+	/*
+	 * allocate on-disk/in-memory inode for symbolic link:
+	 * (iAlloc() returns new, locked inode)
+	 */
+	ip = ialloc(dip, S_IFLNK | 0777);
+	if (IS_ERR(ip)) {
+		rc = PTR_ERR(ip);
+		goto out2;
+	}
+
+	tid = txBegin(dip->i_sb, 0);
+
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
+
+	rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
+	if (rc)
+		goto out3;
+
+	tblk = tid_to_tblock(tid);
+	tblk->xflag |= COMMIT_CREATE;
+	tblk->ino = ip->i_ino;
+	tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
+
+	/* fix symlink access permission
+	 * (dir_create() ANDs in the u.u_cmask,
+	 * but symlinks really need to be 777 access)
+	 */
+	ip->i_mode |= 0777;
+
+	/*
+	 * write symbolic link target path name
+	 */
+	xtInitRoot(tid, ip);
+
+	/*
+	 * write source path name inline in on-disk inode (fast symbolic link)
+	 */
+
+	if (ssize <= IDATASIZE) {
+		ip->i_op = &jfs_fast_symlink_inode_operations;
+
+		i_fastsymlink = JFS_IP(ip)->i_inline;
+		memcpy(i_fastsymlink, name, ssize);
+		ip->i_size = ssize - 1;
+
+		/*
+		 * if symlink is > 128 bytes, we don't have the space to
+		 * store inline extended attributes
+		 */
+		if (ssize > sizeof (JFS_IP(ip)->i_inline))
+			JFS_IP(ip)->mode2 &= ~INLINEEA;
+
+		jfs_info("jfs_symlink: fast symlink added  ssize:%d name:%s ",
+			 ssize, name);
+	}
+	/*
+	 * write source path name in a single extent
+	 */
+	else {
+		jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
+
+		ip->i_op = &jfs_symlink_inode_operations;
+		ip->i_mapping->a_ops = &jfs_aops;
+
+		/*
+		 * even though the data of symlink object (source
+		 * path name) is treated as non-journaled user data,
+		 * it is read/written thru buffer cache for performance.
+		 */
+		sb = ip->i_sb;
+		bmask = JFS_SBI(sb)->bsize - 1;
+		xsize = (ssize + bmask) & ~bmask;
+		xaddr = 0;
+		xlen = xsize >> JFS_SBI(sb)->l2bsize;
+		if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) {
+			txAbort(tid, 0);
+			goto out3;
+		}
+		ip->i_size = ssize - 1;
+		while (ssize) {
+			/* This is kind of silly since PATH_MAX == 4K */
+			int copy_size = min(ssize, PSIZE);
+
+			mp = get_metapage(ip, xaddr, PSIZE, 1);
+
+			if (mp == NULL) {
+				xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+				rc = -EIO;
+				txAbort(tid, 0);
+				goto out3;
+			}
+			memcpy(mp->data, name, copy_size);
+			flush_metapage(mp);
+			ssize -= copy_size;
+			name += copy_size;
+			xaddr += JFS_SBI(sb)->nbperpage;
+		}
+	}
+
+	/*
+	 * create entry for symbolic link in parent directory
+	 */
+	rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE);
+	if (rc == 0) {
+		ino = ip->i_ino;
+		rc = dtInsert(tid, dip, &dname, &ino, &btstack);
+	}
+	if (rc) {
+		if (xlen)
+			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+		txAbort(tid, 0);
+		/* discard new inode */
+		goto out3;
+	}
+
+	mark_inode_dirty(ip);
+
+	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+	mark_inode_dirty(dip);
+	/*
+	 * commit update of parent directory and link object
+	 */
+
+	iplist[0] = dip;
+	iplist[1] = ip;
+	rc = txCommit(tid, 2, &iplist[0], 0);
+
+      out3:
+	txEnd(tid);
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	mutex_unlock(&JFS_IP(dip)->commit_mutex);
+	if (rc) {
+		free_ea_wmap(ip);
+		clear_nlink(ip);
+		unlock_new_inode(ip);
+		iput(ip);
+	} else {
+		unlock_new_inode(ip);
+		d_instantiate(dentry, ip);
+	}
+
+      out2:
+	free_UCSname(&dname);
+
+      out1:
+	jfs_info("jfs_symlink: rc:%d", rc);
+	return rc;
+}
+
+
+/*
+ * NAME:	jfs_rename
+ *
+ * FUNCTION:	rename a file or directory
+ */
+static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+	       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct btstack btstack;
+	ino_t ino;
+	struct component_name new_dname;
+	struct inode *new_ip;
+	struct component_name old_dname;
+	struct inode *old_ip;
+	int rc;
+	tid_t tid;
+	struct tlock *tlck;
+	struct dt_lock *dtlck;
+	struct lv *lv;
+	int ipcount;
+	struct inode *iplist[4];
+	struct tblock *tblk;
+	s64 new_size = 0;
+	int commit_flag;
+
+
+	jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry);
+
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
+	old_ip = d_inode(old_dentry);
+	new_ip = d_inode(new_dentry);
+
+	if ((rc = get_UCSname(&old_dname, old_dentry)))
+		goto out1;
+
+	if ((rc = get_UCSname(&new_dname, new_dentry)))
+		goto out2;
+
+	/*
+	 * Make sure source inode number is what we think it is
+	 */
+	rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP);
+	if (rc || (ino != old_ip->i_ino)) {
+		rc = -ENOENT;
+		goto out3;
+	}
+
+	/*
+	 * Make sure dest inode number (if any) is what we think it is
+	 */
+	rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
+	if (!rc) {
+		if ((!new_ip) || (ino != new_ip->i_ino)) {
+			rc = -ESTALE;
+			goto out3;
+		}
+	} else if (rc != -ENOENT)
+		goto out3;
+	else if (new_ip) {
+		/* no entry exists, but one was expected */
+		rc = -ESTALE;
+		goto out3;
+	}
+
+	if (S_ISDIR(old_ip->i_mode)) {
+		if (new_ip) {
+			if (!dtEmpty(new_ip)) {
+				rc = -ENOTEMPTY;
+				goto out3;
+			}
+		}
+	} else if (new_ip) {
+		IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
+		/* Init inode for quota operations. */
+		dquot_initialize(new_ip);
+	}
+
+	/*
+	 * The real work starts here
+	 */
+	tid = txBegin(new_dir->i_sb, 0);
+
+	/*
+	 * How do we know the locking is safe from deadlocks?
+	 * The vfs does the hard part for us.  Any time we are taking nested
+	 * commit_mutexes, the vfs already has i_mutex held on the parent.
+	 * Here, the vfs has already taken i_mutex on both old_dir and new_dir.
+	 */
+	mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD);
+	if (old_dir != new_dir)
+		mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex,
+				  COMMIT_MUTEX_SECOND_PARENT);
+
+	if (new_ip) {
+		mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex,
+				  COMMIT_MUTEX_VICTIM);
+		/*
+		 * Change existing directory entry to new inode number
+		 */
+		ino = new_ip->i_ino;
+		rc = dtModify(tid, new_dir, &new_dname, &ino,
+			      old_ip->i_ino, JFS_RENAME);
+		if (rc)
+			goto out4;
+		drop_nlink(new_ip);
+		if (S_ISDIR(new_ip->i_mode)) {
+			drop_nlink(new_ip);
+			if (new_ip->i_nlink) {
+				mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
+				if (old_dir != new_dir)
+					mutex_unlock(&JFS_IP(old_dir)->commit_mutex);
+				mutex_unlock(&JFS_IP(old_ip)->commit_mutex);
+				mutex_unlock(&JFS_IP(new_dir)->commit_mutex);
+				if (!S_ISDIR(old_ip->i_mode) && new_ip)
+					IWRITE_UNLOCK(new_ip);
+				jfs_error(new_ip->i_sb,
+					  "new_ip->i_nlink != 0\n");
+				return -EIO;
+			}
+			tblk = tid_to_tblock(tid);
+			tblk->xflag |= COMMIT_DELETE;
+			tblk->u.ip = new_ip;
+		} else if (new_ip->i_nlink == 0) {
+			assert(!test_cflag(COMMIT_Nolink, new_ip));
+			/* free block resources */
+			if ((new_size = commitZeroLink(tid, new_ip)) < 0) {
+				txAbort(tid, 1);	/* Marks FS Dirty */
+				rc = new_size;
+				goto out4;
+			}
+			tblk = tid_to_tblock(tid);
+			tblk->xflag |= COMMIT_DELETE;
+			tblk->u.ip = new_ip;
+		} else {
+			new_ip->i_ctime = CURRENT_TIME;
+			mark_inode_dirty(new_ip);
+		}
+	} else {
+		/*
+		 * Add new directory entry
+		 */
+		rc = dtSearch(new_dir, &new_dname, &ino, &btstack,
+			      JFS_CREATE);
+		if (rc) {
+			jfs_err("jfs_rename didn't expect dtSearch to fail "
+				"w/rc = %d", rc);
+			goto out4;
+		}
+
+		ino = old_ip->i_ino;
+		rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack);
+		if (rc) {
+			if (rc == -EIO)
+				jfs_err("jfs_rename: dtInsert returned -EIO");
+			goto out4;
+		}
+		if (S_ISDIR(old_ip->i_mode))
+			inc_nlink(new_dir);
+	}
+	/*
+	 * Remove old directory entry
+	 */
+
+	ino = old_ip->i_ino;
+	rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE);
+	if (rc) {
+		jfs_err("jfs_rename did not expect dtDelete to return rc = %d",
+			rc);
+		txAbort(tid, 1);	/* Marks Filesystem dirty */
+		goto out4;
+	}
+	if (S_ISDIR(old_ip->i_mode)) {
+		drop_nlink(old_dir);
+		if (old_dir != new_dir) {
+			/*
+			 * Change inode number of parent for moved directory
+			 */
+
+			JFS_IP(old_ip)->i_dtroot.header.idotdot =
+				cpu_to_le32(new_dir->i_ino);
+
+			/* Linelock header of dtree */
+			tlck = txLock(tid, old_ip,
+				    (struct metapage *) &JFS_IP(old_ip)->bxflag,
+				      tlckDTREE | tlckBTROOT | tlckRELINK);
+			dtlck = (struct dt_lock *) & tlck->lock;
+			ASSERT(dtlck->index == 0);
+			lv = & dtlck->lv[0];
+			lv->offset = 0;
+			lv->length = 1;
+			dtlck->index++;
+		}
+	}
+
+	/*
+	 * Update ctime on changed/moved inodes & mark dirty
+	 */
+	old_ip->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(old_ip);
+
+	new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb);
+	mark_inode_dirty(new_dir);
+
+	/* Build list of inodes modified by this transaction */
+	ipcount = 0;
+	iplist[ipcount++] = old_ip;
+	if (new_ip)
+		iplist[ipcount++] = new_ip;
+	iplist[ipcount++] = old_dir;
+
+	if (old_dir != new_dir) {
+		iplist[ipcount++] = new_dir;
+		old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+		mark_inode_dirty(old_dir);
+	}
+
+	/*
+	 * Incomplete truncate of file data can
+	 * result in timing problems unless we synchronously commit the
+	 * transaction.
+	 */
+	if (new_size)
+		commit_flag = COMMIT_SYNC;
+	else
+		commit_flag = 0;
+
+	rc = txCommit(tid, ipcount, iplist, commit_flag);
+
+      out4:
+	txEnd(tid);
+	if (new_ip)
+		mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
+	if (old_dir != new_dir)
+		mutex_unlock(&JFS_IP(old_dir)->commit_mutex);
+	mutex_unlock(&JFS_IP(old_ip)->commit_mutex);
+	mutex_unlock(&JFS_IP(new_dir)->commit_mutex);
+
+	while (new_size && (rc == 0)) {
+		tid = txBegin(new_ip->i_sb, 0);
+		mutex_lock(&JFS_IP(new_ip)->commit_mutex);
+		new_size = xtTruncate_pmap(tid, new_ip, new_size);
+		if (new_size < 0) {
+			txAbort(tid, 1);
+			rc = new_size;
+		} else
+			rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC);
+		txEnd(tid);
+		mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
+	}
+	if (new_ip && (new_ip->i_nlink == 0))
+		set_cflag(COMMIT_Nolink, new_ip);
+      out3:
+	free_UCSname(&new_dname);
+      out2:
+	free_UCSname(&old_dname);
+      out1:
+	if (new_ip && !S_ISDIR(new_ip->i_mode))
+		IWRITE_UNLOCK(new_ip);
+	/*
+	 * Truncating the directory index table is not guaranteed.  It
+	 * may need to be done iteratively
+	 */
+	if (test_cflag(COMMIT_Stale, old_dir)) {
+		if (old_dir->i_size > 1)
+			jfs_truncate_nolock(old_dir, 0);
+
+		clear_cflag(COMMIT_Stale, old_dir);
+	}
+
+	jfs_info("jfs_rename: returning %d", rc);
+	return rc;
+}
+
+
+/*
+ * NAME:	jfs_mknod
+ *
+ * FUNCTION:	Create a special file (device)
+ */
+static int jfs_mknod(struct inode *dir, struct dentry *dentry,
+		umode_t mode, dev_t rdev)
+{
+	struct jfs_inode_info *jfs_ip;
+	struct btstack btstack;
+	struct component_name dname;
+	ino_t ino;
+	struct inode *ip;
+	struct inode *iplist[2];
+	int rc;
+	tid_t tid;
+	struct tblock *tblk;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	jfs_info("jfs_mknod: %pd", dentry);
+
+	dquot_initialize(dir);
+
+	if ((rc = get_UCSname(&dname, dentry)))
+		goto out;
+
+	ip = ialloc(dir, mode);
+	if (IS_ERR(ip)) {
+		rc = PTR_ERR(ip);
+		goto out1;
+	}
+	jfs_ip = JFS_IP(ip);
+
+	tid = txBegin(dir->i_sb, 0);
+
+	mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
+
+	rc = jfs_init_acl(tid, ip, dir);
+	if (rc)
+		goto out3;
+
+	rc = jfs_init_security(tid, ip, dir, &dentry->d_name);
+	if (rc) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
+	if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
+	tblk = tid_to_tblock(tid);
+	tblk->xflag |= COMMIT_CREATE;
+	tblk->ino = ip->i_ino;
+	tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
+
+	ino = ip->i_ino;
+	if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
+	ip->i_op = &jfs_file_inode_operations;
+	jfs_ip->dev = new_encode_dev(rdev);
+	init_special_inode(ip, ip->i_mode, rdev);
+
+	mark_inode_dirty(ip);
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+	mark_inode_dirty(dir);
+
+	iplist[0] = dir;
+	iplist[1] = ip;
+	rc = txCommit(tid, 2, iplist, 0);
+
+      out3:
+	txEnd(tid);
+	mutex_unlock(&JFS_IP(ip)->commit_mutex);
+	mutex_unlock(&JFS_IP(dir)->commit_mutex);
+	if (rc) {
+		free_ea_wmap(ip);
+		clear_nlink(ip);
+		unlock_new_inode(ip);
+		iput(ip);
+	} else {
+		unlock_new_inode(ip);
+		d_instantiate(dentry, ip);
+	}
+
+      out1:
+	free_UCSname(&dname);
+
+      out:
+	jfs_info("jfs_mknod: returning %d", rc);
+	return rc;
+}
+
+static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags)
+{
+	struct btstack btstack;
+	ino_t inum;
+	struct inode *ip;
+	struct component_name key;
+	int rc;
+
+	jfs_info("jfs_lookup: name = %pd", dentry);
+
+	if ((rc = get_UCSname(&key, dentry)))
+		return ERR_PTR(rc);
+	rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP);
+	free_UCSname(&key);
+	if (rc == -ENOENT) {
+		ip = NULL;
+	} else if (rc) {
+		jfs_err("jfs_lookup: dtSearch returned %d", rc);
+		ip = ERR_PTR(rc);
+	} else {
+		ip = jfs_iget(dip->i_sb, inum);
+		if (IS_ERR(ip))
+			jfs_err("jfs_lookup: iget failed on inum %d", (uint)inum);
+	}
+
+	return d_splice_alias(ip, dentry);
+}
+
+static struct inode *jfs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	if (ino == 0)
+		return ERR_PTR(-ESTALE);
+	inode = jfs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    jfs_nfs_get_inode);
+}
+
+struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    jfs_nfs_get_inode);
+}
+
+struct dentry *jfs_get_parent(struct dentry *dentry)
+{
+	unsigned long parent_ino;
+
+	parent_ino =
+		le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot);
+
+	return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino));
+}
+
+const struct inode_operations jfs_dir_inode_operations = {
+	.create		= jfs_create,
+	.lookup		= jfs_lookup,
+	.link		= jfs_link,
+	.unlink		= jfs_unlink,
+	.symlink	= jfs_symlink,
+	.mkdir		= jfs_mkdir,
+	.rmdir		= jfs_rmdir,
+	.mknod		= jfs_mknod,
+	.rename		= jfs_rename,
+	.setxattr	= jfs_setxattr,
+	.getxattr	= jfs_getxattr,
+	.listxattr	= jfs_listxattr,
+	.removexattr	= jfs_removexattr,
+	.setattr	= jfs_setattr,
+#ifdef CONFIG_JFS_POSIX_ACL
+	.get_acl	= jfs_get_acl,
+	.set_acl	= jfs_set_acl,
+#endif
+};
+
+const struct file_operations jfs_dir_operations = {
+	.read		= generic_read_dir,
+	.iterate	= jfs_readdir,
+	.fsync		= jfs_fsync,
+	.unlocked_ioctl = jfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= jfs_compat_ioctl,
+#endif
+	.llseek		= generic_file_llseek,
+};
+
+static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
+{
+	unsigned long hash;
+	int i;
+
+	hash = init_name_hash();
+	for (i=0; i < this->len; i++)
+		hash = partial_name_hash(tolower(this->name[i]), hash);
+	this->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	int i, result = 1;
+
+	if (len != name->len)
+		goto out;
+	for (i=0; i < len; i++) {
+		if (tolower(str[i]) != tolower(name->name[i]))
+			goto out;
+	}
+	result = 0;
+out:
+	return result;
+}
+
+static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	/*
+	 * This is not negative dentry. Always valid.
+	 *
+	 * Note, rename() to existing directory entry will have ->d_inode,
+	 * and will use existing name which isn't specified name by user.
+	 *
+	 * We may be able to drop this positive dentry here. But dropping
+	 * positive dentry isn't good idea. So it's unsupported like
+	 * rename("filename", "FILENAME") for now.
+	 */
+	if (d_really_is_positive(dentry))
+		return 1;
+
+	/*
+	 * This may be nfsd (or something), anyway, we can't see the
+	 * intent of this. So, since this can be for creation, drop it.
+	 */
+	if (!flags)
+		return 0;
+
+	/*
+	 * Drop the negative dentry, in order to make sure to use the
+	 * case sensitive name which is specified by user if this is
+	 * for creation.
+	 */
+	if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+		return 0;
+	return 1;
+}
+
+const struct dentry_operations jfs_ci_dentry_operations =
+{
+	.d_hash = jfs_ci_hash,
+	.d_compare = jfs_ci_compare,
+	.d_revalidate = jfs_ci_revalidate,
+};
diff --git a/kernel/fs/jfs/resize.c b/kernel/fs/jfs/resize.c
new file mode 100644
index 000000000..90b3bc21e
--- /dev/null
+++ b/kernel/fs/jfs/resize.c
@@ -0,0 +1,543 @@
+/*
+ *   Copyright (C) International Business Machines  Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dinode.h"
+#include "jfs_imap.h"
+#include "jfs_dmap.h"
+#include "jfs_superblock.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+
+#define BITSPERPAGE	(PSIZE << 3)
+#define L2MEGABYTE	20
+#define MEGABYTE	(1 << L2MEGABYTE)
+#define MEGABYTE32	(MEGABYTE << 5)
+
+/* convert block number to bmap file page number */
+#define BLKTODMAPN(b)\
+	(((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1)
+
+/*
+ *	jfs_extendfs()
+ *
+ * function: extend file system;
+ *
+ *   |-------------------------------|----------|----------|
+ *   file system space               fsck       inline log
+ *                                   workspace  space
+ *
+ * input:
+ *	new LVSize: in LV blocks (required)
+ *	new LogSize: in LV blocks (optional)
+ *	new FSSize: in LV blocks (optional)
+ *
+ * new configuration:
+ * 1. set new LogSize as specified or default from new LVSize;
+ * 2. compute new FSCKSize from new LVSize;
+ * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where
+ *    assert(new FSSize >= old FSSize),
+ *    i.e., file system must not be shrunk;
+ */
+int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
+{
+	int rc = 0;
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct inode *ipbmap = sbi->ipbmap;
+	struct inode *ipbmap2;
+	struct inode *ipimap = sbi->ipimap;
+	struct jfs_log *log = sbi->log;
+	struct bmap *bmp = sbi->bmap;
+	s64 newLogAddress, newFSCKAddress;
+	int newFSCKSize;
+	s64 newMapSize = 0, mapSize;
+	s64 XAddress, XSize, nblocks, xoff, xaddr, t64;
+	s64 oldLVSize;
+	s64 newFSSize;
+	s64 VolumeSize;
+	int newNpages = 0, nPages, newPage, xlen, t32;
+	int tid;
+	int log_formatted = 0;
+	struct inode *iplist[1];
+	struct jfs_superblock *j_sb, *j_sb2;
+	s64 old_agsize;
+	int agsizechanged = 0;
+	struct buffer_head *bh, *bh2;
+
+	/* If the volume hasn't grown, get out now */
+
+	if (sbi->mntflag & JFS_INLINELOG)
+		oldLVSize = addressPXD(&sbi->logpxd) + lengthPXD(&sbi->logpxd);
+	else
+		oldLVSize = addressPXD(&sbi->fsckpxd) +
+		    lengthPXD(&sbi->fsckpxd);
+
+	if (oldLVSize >= newLVSize) {
+		printk(KERN_WARNING
+		       "jfs_extendfs: volume hasn't grown, returning\n");
+		goto out;
+	}
+
+	VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+
+	if (VolumeSize) {
+		if (newLVSize > VolumeSize) {
+			printk(KERN_WARNING "jfs_extendfs: invalid size\n");
+			rc = -EINVAL;
+			goto out;
+		}
+	} else {
+		/* check the device */
+		bh = sb_bread(sb, newLVSize - 1);
+		if (!bh) {
+			printk(KERN_WARNING "jfs_extendfs: invalid size\n");
+			rc = -EINVAL;
+			goto out;
+		}
+		bforget(bh);
+	}
+
+	/* Can't extend write-protected drive */
+
+	if (isReadOnly(ipbmap)) {
+		printk(KERN_WARNING "jfs_extendfs: read-only file system\n");
+		rc = -EROFS;
+		goto out;
+	}
+
+	/*
+	 *	reconfigure LV spaces
+	 *	---------------------
+	 *
+	 * validate new size, or, if not specified, determine new size
+	 */
+
+	/*
+	 * reconfigure inline log space:
+	 */
+	if ((sbi->mntflag & JFS_INLINELOG)) {
+		if (newLogSize == 0) {
+			/*
+			 * no size specified: default to 1/256 of aggregate
+			 * size; rounded up to a megabyte boundary;
+			 */
+			newLogSize = newLVSize >> 8;
+			t32 = (1 << (20 - sbi->l2bsize)) - 1;
+			newLogSize = (newLogSize + t32) & ~t32;
+			newLogSize =
+			    min(newLogSize, MEGABYTE32 >> sbi->l2bsize);
+		} else {
+			/*
+			 * convert the newLogSize to fs blocks.
+			 *
+			 * Since this is given in megabytes, it will always be
+			 * an even number of pages.
+			 */
+			newLogSize = (newLogSize * MEGABYTE) >> sbi->l2bsize;
+		}
+
+	} else
+		newLogSize = 0;
+
+	newLogAddress = newLVSize - newLogSize;
+
+	/*
+	 * reconfigure fsck work space:
+	 *
+	 * configure it to the end of the logical volume regardless of
+	 * whether file system extends to the end of the aggregate;
+	 * Need enough 4k pages to cover:
+	 *  - 1 bit per block in aggregate rounded up to BPERDMAP boundary
+	 *  - 1 extra page to handle control page and intermediate level pages
+	 *  - 50 extra pages for the chkdsk service log
+	 */
+	t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP)
+	    << L2BPERDMAP;
+	t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50;
+	newFSCKSize = t32 << sbi->l2nbperpage;
+	newFSCKAddress = newLogAddress - newFSCKSize;
+
+	/*
+	 * compute new file system space;
+	 */
+	newFSSize = newLVSize - newLogSize - newFSCKSize;
+
+	/* file system cannot be shrunk */
+	if (newFSSize < bmp->db_mapsize) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * If we're expanding enough that the inline log does not overlap
+	 * the old one, we can format the new log before we quiesce the
+	 * filesystem.
+	 */
+	if ((sbi->mntflag & JFS_INLINELOG) && (newLogAddress > oldLVSize)) {
+		if ((rc = lmLogFormat(log, newLogAddress, newLogSize)))
+			goto out;
+		log_formatted = 1;
+	}
+	/*
+	 *	quiesce file system
+	 *
+	 * (prepare to move the inline log and to prevent map update)
+	 *
+	 * block any new transactions and wait for completion of
+	 * all wip transactions and flush modified pages s.t.
+	 * on-disk file system is in consistent state and
+	 * log is not required for recovery.
+	 */
+	txQuiesce(sb);
+
+	/* Reset size of direct inode */
+	sbi->direct_inode->i_size =  sb->s_bdev->bd_inode->i_size;
+
+	if (sbi->mntflag & JFS_INLINELOG) {
+		/*
+		 * deactivate old inline log
+		 */
+		lmLogShutdown(log);
+
+		/*
+		 * mark on-disk super block for fs in transition;
+		 *
+		 * update on-disk superblock for the new space configuration
+		 * of inline log space and fsck work space descriptors:
+		 * N.B. FS descriptor is NOT updated;
+		 *
+		 * crash recovery:
+		 * logredo(): if FM_EXTENDFS, return to fsck() for cleanup;
+		 * fsck(): if FM_EXTENDFS, reformat inline log and fsck
+		 * workspace from superblock inline log descriptor and fsck
+		 * workspace descriptor;
+		 */
+
+		/* read in superblock */
+		if ((rc = readSuper(sb, &bh)))
+			goto error_out;
+		j_sb = (struct jfs_superblock *)bh->b_data;
+
+		/* mark extendfs() in progress */
+		j_sb->s_state |= cpu_to_le32(FM_EXTENDFS);
+		j_sb->s_xsize = cpu_to_le64(newFSSize);
+		PXDaddress(&j_sb->s_xfsckpxd, newFSCKAddress);
+		PXDlength(&j_sb->s_xfsckpxd, newFSCKSize);
+		PXDaddress(&j_sb->s_xlogpxd, newLogAddress);
+		PXDlength(&j_sb->s_xlogpxd, newLogSize);
+
+		/* synchronously update superblock */
+		mark_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
+		brelse(bh);
+
+		/*
+		 * format new inline log synchronously;
+		 *
+		 * crash recovery: if log move in progress,
+		 * reformat log and exit success;
+		 */
+		if (!log_formatted)
+			if ((rc = lmLogFormat(log, newLogAddress, newLogSize)))
+				goto error_out;
+
+		/*
+		 * activate new log
+		 */
+		log->base = newLogAddress;
+		log->size = newLogSize >> (L2LOGPSIZE - sb->s_blocksize_bits);
+		if ((rc = lmLogInit(log)))
+			goto error_out;
+	}
+
+	/*
+	 *	extend block allocation map
+	 *	---------------------------
+	 *
+	 * extendfs() for new extension, retry after crash recovery;
+	 *
+	 * note: both logredo() and fsck() rebuild map from
+	 * the bitmap and configuration parameter from superblock
+	 * (disregarding all other control information in the map);
+	 *
+	 * superblock:
+	 *  s_size: aggregate size in physical blocks;
+	 */
+	/*
+	 *	compute the new block allocation map configuration
+	 *
+	 * map dinode:
+	 *  di_size: map file size in byte;
+	 *  di_nblocks: number of blocks allocated for map file;
+	 *  di_mapsize: number of blocks in aggregate (covered by map);
+	 * map control page:
+	 *  db_mapsize: number of blocks in aggregate (covered by map);
+	 */
+	newMapSize = newFSSize;
+	/* number of data pages of new bmap file:
+	 * roundup new size to full dmap page boundary and
+	 * add 1 extra dmap page for next extendfs()
+	 */
+	t64 = (newMapSize - 1) + BPERDMAP;
+	newNpages = BLKTODMAPN(t64) + 1;
+
+	/*
+	 *	extend map from current map (WITHOUT growing mapfile)
+	 *
+	 * map new extension with unmapped part of the last partial
+	 * dmap page, if applicable, and extra page(s) allocated
+	 * at end of bmap by mkfs() or previous extendfs();
+	 */
+      extendBmap:
+	/* compute number of blocks requested to extend */
+	mapSize = bmp->db_mapsize;
+	XAddress = mapSize;	/* eXtension Address */
+	XSize = newMapSize - mapSize;	/* eXtension Size */
+	old_agsize = bmp->db_agsize;	/* We need to know if this changes */
+
+	/* compute number of blocks that can be extended by current mapfile */
+	t64 = dbMapFileSizeToMapSize(ipbmap);
+	if (mapSize > t64) {
+		printk(KERN_ERR "jfs_extendfs: mapSize (0x%Lx) > t64 (0x%Lx)\n",
+		       (long long) mapSize, (long long) t64);
+		rc = -EIO;
+		goto error_out;
+	}
+	nblocks = min(t64 - mapSize, XSize);
+
+	/*
+	 * update map pages for new extension:
+	 *
+	 * update/init dmap and bubble up the control hierarchy
+	 * incrementally fold up dmaps into upper levels;
+	 * update bmap control page;
+	 */
+	if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
+		goto error_out;
+
+	agsizechanged |= (bmp->db_agsize != old_agsize);
+
+	/*
+	 * the map now has extended to cover additional nblocks:
+	 * dn_mapsize = oldMapsize + nblocks;
+	 */
+	/* ipbmap->i_mapsize += nblocks; */
+	XSize -= nblocks;
+
+	/*
+	 *	grow map file to cover remaining extension
+	 *	and/or one extra dmap page for next extendfs();
+	 *
+	 * allocate new map pages and its backing blocks, and
+	 * update map file xtree
+	 */
+	/* compute number of data pages of current bmap file */
+	nPages = ipbmap->i_size >> L2PSIZE;
+
+	/* need to grow map file ? */
+	if (nPages == newNpages)
+		goto finalizeBmap;
+
+	/*
+	 * grow bmap file for the new map pages required:
+	 *
+	 * allocate growth at the start of newly extended region;
+	 * bmap file only grows sequentially, i.e., both data pages
+	 * and possibly xtree index pages may grow in append mode,
+	 * s.t. logredo() can reconstruct pre-extension state
+	 * by washing away bmap file of pages outside s_size boundary;
+	 */
+	/*
+	 * journal map file growth as if a regular file growth:
+	 * (note: bmap is created with di_mode = IFJOURNAL|IFREG);
+	 *
+	 * journaling of bmap file growth is not required since
+	 * logredo() do/can not use log records of bmap file growth
+	 * but it provides careful write semantics, pmap update, etc.;
+	 */
+	/* synchronous write of data pages: bmap data pages are
+	 * cached in meta-data cache, and not written out
+	 * by txCommit();
+	 */
+	filemap_fdatawait(ipbmap->i_mapping);
+	filemap_write_and_wait(ipbmap->i_mapping);
+	diWriteSpecial(ipbmap, 0);
+
+	newPage = nPages;	/* first new page number */
+	xoff = newPage << sbi->l2nbperpage;
+	xlen = (newNpages - nPages) << sbi->l2nbperpage;
+	xlen = min(xlen, (int) nblocks) & ~(sbi->nbperpage - 1);
+	xaddr = XAddress;
+
+	tid = txBegin(sb, COMMIT_FORCE);
+
+	if ((rc = xtAppend(tid, ipbmap, 0, xoff, nblocks, &xlen, &xaddr, 0))) {
+		txEnd(tid);
+		goto error_out;
+	}
+	/* update bmap file size */
+	ipbmap->i_size += xlen << sbi->l2bsize;
+	inode_add_bytes(ipbmap, xlen << sbi->l2bsize);
+
+	iplist[0] = ipbmap;
+	rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
+
+	txEnd(tid);
+
+	if (rc)
+		goto error_out;
+
+	/*
+	 * map file has been grown now to cover extension to further out;
+	 * di_size = new map file size;
+	 *
+	 * if huge extension, the previous extension based on previous
+	 * map file size may not have been sufficient to cover whole extension
+	 * (it could have been used up for new map pages),
+	 * but the newly grown map file now covers lot bigger new free space
+	 * available for further extension of map;
+	 */
+	/* any more blocks to extend ? */
+	if (XSize)
+		goto extendBmap;
+
+      finalizeBmap:
+	/* finalize bmap */
+	dbFinalizeBmap(ipbmap);
+
+	/*
+	 *	update inode allocation map
+	 *	---------------------------
+	 *
+	 * move iag lists from old to new iag;
+	 * agstart field is not updated for logredo() to reconstruct
+	 * iag lists if system crash occurs.
+	 * (computation of ag number from agstart based on agsize
+	 * will correctly identify the new ag);
+	 */
+	/* if new AG size the same as old AG size, done! */
+	if (agsizechanged) {
+		if ((rc = diExtendFS(ipimap, ipbmap)))
+			goto error_out;
+
+		/* finalize imap */
+		if ((rc = diSync(ipimap)))
+			goto error_out;
+	}
+
+	/*
+	 *	finalize
+	 *	--------
+	 *
+	 * extension is committed when on-disk super block is
+	 * updated with new descriptors: logredo will recover
+	 * crash before it to pre-extension state;
+	 */
+
+	/* sync log to skip log replay of bmap file growth transaction; */
+	/* lmLogSync(log, 1); */
+
+	/*
+	 * synchronous write bmap global control page;
+	 * for crash before completion of write
+	 * logredo() will recover to pre-extendfs state;
+	 * for crash after completion of write,
+	 * logredo() will recover post-extendfs state;
+	 */
+	if ((rc = dbSync(ipbmap)))
+		goto error_out;
+
+	/*
+	 * copy primary bmap inode to secondary bmap inode
+	 */
+
+	ipbmap2 = diReadSpecial(sb, BMAP_I, 1);
+	if (ipbmap2 == NULL) {
+		printk(KERN_ERR "jfs_extendfs: diReadSpecial(bmap) failed\n");
+		goto error_out;
+	}
+	memcpy(&JFS_IP(ipbmap2)->i_xtroot, &JFS_IP(ipbmap)->i_xtroot, 288);
+	ipbmap2->i_size = ipbmap->i_size;
+	ipbmap2->i_blocks = ipbmap->i_blocks;
+
+	diWriteSpecial(ipbmap2, 1);
+	diFreeSpecial(ipbmap2);
+
+	/*
+	 *	update superblock
+	 */
+	if ((rc = readSuper(sb, &bh)))
+		goto error_out;
+	j_sb = (struct jfs_superblock *)bh->b_data;
+
+	/* mark extendfs() completion */
+	j_sb->s_state &= cpu_to_le32(~FM_EXTENDFS);
+	j_sb->s_size = cpu_to_le64(bmp->db_mapsize <<
+				   le16_to_cpu(j_sb->s_l2bfactor));
+	j_sb->s_agsize = cpu_to_le32(bmp->db_agsize);
+
+	/* update inline log space descriptor */
+	if (sbi->mntflag & JFS_INLINELOG) {
+		PXDaddress(&(j_sb->s_logpxd), newLogAddress);
+		PXDlength(&(j_sb->s_logpxd), newLogSize);
+	}
+
+	/* record log's mount serial number */
+	j_sb->s_logserial = cpu_to_le32(log->serial);
+
+	/* update fsck work space descriptor */
+	PXDaddress(&(j_sb->s_fsckpxd), newFSCKAddress);
+	PXDlength(&(j_sb->s_fsckpxd), newFSCKSize);
+	j_sb->s_fscklog = 1;
+	/* sb->s_fsckloglen remains the same */
+
+	/* Update secondary superblock */
+	bh2 = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits);
+	if (bh2) {
+		j_sb2 = (struct jfs_superblock *)bh2->b_data;
+		memcpy(j_sb2, j_sb, sizeof (struct jfs_superblock));
+
+		mark_buffer_dirty(bh);
+		sync_dirty_buffer(bh2);
+		brelse(bh2);
+	}
+
+	/* write primary superblock */
+	mark_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+
+	goto resume;
+
+      error_out:
+	jfs_error(sb, "\n");
+
+      resume:
+	/*
+	 *	resume file system transactions
+	 */
+	txResume(sb);
+
+      out:
+	return rc;
+}
diff --git a/kernel/fs/jfs/super.c b/kernel/fs/jfs/super.c
new file mode 100644
index 000000000..4cd9798f4
--- /dev/null
+++ b/kernel/fs/jfs/super.c
@@ -0,0 +1,1012 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <linux/completion.h>
+#include <linux/vfs.h>
+#include <linux/quotaops.h>
+#include <linux/mount.h>
+#include <linux/moduleparam.h>
+#include <linux/kthread.h>
+#include <linux/posix_acl.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_inode.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_acl.h"
+#include "jfs_debug.h"
+#include "jfs_xattr.h"
+
+MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
+MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
+MODULE_LICENSE("GPL");
+
+static struct kmem_cache *jfs_inode_cachep;
+
+static const struct super_operations jfs_super_operations;
+static const struct export_operations jfs_export_operations;
+static struct file_system_type jfs_fs_type;
+
+#define MAX_COMMIT_THREADS 64
+static int commit_threads;
+module_param(commit_threads, int, 0);
+MODULE_PARM_DESC(commit_threads, "Number of commit threads");
+
+static struct task_struct *jfsCommitThread[MAX_COMMIT_THREADS];
+struct task_struct *jfsIOthread;
+struct task_struct *jfsSyncThread;
+
+#ifdef CONFIG_JFS_DEBUG
+int jfsloglevel = JFS_LOGLEVEL_WARN;
+module_param(jfsloglevel, int, 0644);
+MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)");
+#endif
+
+static void jfs_handle_error(struct super_block *sb)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	updateSuper(sb, FM_DIRTY);
+
+	if (sbi->flag & JFS_ERR_PANIC)
+		panic("JFS (device %s): panic forced after error\n",
+			sb->s_id);
+	else if (sbi->flag & JFS_ERR_REMOUNT_RO) {
+		jfs_err("ERROR: (device %s): remounting filesystem as read-only\n",
+			sb->s_id);
+		sb->s_flags |= MS_RDONLY;
+	}
+
+	/* nothing is done for continue beyond marking the superblock dirty */
+}
+
+void jfs_error(struct super_block *sb, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_err("ERROR: (device %s): %ps: %pV\n",
+	       sb->s_id, __builtin_return_address(0), &vaf);
+
+	va_end(args);
+
+	jfs_handle_error(sb);
+}
+
+static struct inode *jfs_alloc_inode(struct super_block *sb)
+{
+	struct jfs_inode_info *jfs_inode;
+
+	jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
+	if (!jfs_inode)
+		return NULL;
+#ifdef CONFIG_QUOTA
+	memset(&jfs_inode->i_dquot, 0, sizeof(jfs_inode->i_dquot));
+#endif
+	return &jfs_inode->vfs_inode;
+}
+
+static void jfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	kmem_cache_free(jfs_inode_cachep, ji);
+}
+
+static void jfs_destroy_inode(struct inode *inode)
+{
+	struct jfs_inode_info *ji = JFS_IP(inode);
+
+	BUG_ON(!list_empty(&ji->anon_inode_list));
+
+	spin_lock_irq(&ji->ag_lock);
+	if (ji->active_ag != -1) {
+		struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
+		atomic_dec(&bmap->db_active[ji->active_ag]);
+		ji->active_ag = -1;
+	}
+	spin_unlock_irq(&ji->ag_lock);
+	call_rcu(&inode->i_rcu, jfs_i_callback);
+}
+
+static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
+	s64 maxinodes;
+	struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap;
+
+	jfs_info("In jfs_statfs");
+	buf->f_type = JFS_SUPER_MAGIC;
+	buf->f_bsize = sbi->bsize;
+	buf->f_blocks = sbi->bmap->db_mapsize;
+	buf->f_bfree = sbi->bmap->db_nfree;
+	buf->f_bavail = sbi->bmap->db_nfree;
+	/*
+	 * If we really return the number of allocated & free inodes, some
+	 * applications will fail because they won't see enough free inodes.
+	 * We'll try to calculate some guess as to how many inodes we can
+	 * really allocate
+	 *
+	 * buf->f_files = atomic_read(&imap->im_numinos);
+	 * buf->f_ffree = atomic_read(&imap->im_numfree);
+	 */
+	maxinodes = min((s64) atomic_read(&imap->im_numinos) +
+			((sbi->bmap->db_nfree >> imap->im_l2nbperiext)
+			 << L2INOSPEREXT), (s64) 0xffffffffLL);
+	buf->f_files = maxinodes;
+	buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) -
+				    atomic_read(&imap->im_numfree));
+	buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2);
+	buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2,
+					sizeof(sbi->uuid)/2);
+
+	buf->f_namelen = JFS_NAME_MAX;
+	return 0;
+}
+
+static void jfs_put_super(struct super_block *sb)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	int rc;
+
+	jfs_info("In jfs_put_super");
+
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
+	rc = jfs_umount(sb);
+	if (rc)
+		jfs_err("jfs_umount failed with return code %d", rc);
+
+	unload_nls(sbi->nls_tab);
+
+	truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
+	iput(sbi->direct_inode);
+
+	kfree(sbi);
+}
+
+enum {
+	Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize,
+	Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota,
+	Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask,
+	Opt_discard, Opt_nodiscard, Opt_discard_minblk
+};
+
+static const match_table_t tokens = {
+	{Opt_integrity, "integrity"},
+	{Opt_nointegrity, "nointegrity"},
+	{Opt_iocharset, "iocharset=%s"},
+	{Opt_resize, "resize=%u"},
+	{Opt_resize_nosize, "resize"},
+	{Opt_errors, "errors=%s"},
+	{Opt_ignore, "noquota"},
+	{Opt_ignore, "quota"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_umask, "umask=%u"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
+	{Opt_discard_minblk, "discard=%u"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
+			 int *flag)
+{
+	void *nls_map = (void *)-1;	/* -1: no change;  NULL: none */
+	char *p;
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+
+	*newLVSize = 0;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_integrity:
+			*flag &= ~JFS_NOINTEGRITY;
+			break;
+		case Opt_nointegrity:
+			*flag |= JFS_NOINTEGRITY;
+			break;
+		case Opt_ignore:
+			/* Silently ignore the quota options */
+			/* Don't do anything ;-) */
+			break;
+		case Opt_iocharset:
+			if (nls_map && nls_map != (void *) -1)
+				unload_nls(nls_map);
+			if (!strcmp(args[0].from, "none"))
+				nls_map = NULL;
+			else {
+				nls_map = load_nls(args[0].from);
+				if (!nls_map) {
+					pr_err("JFS: charset not found\n");
+					goto cleanup;
+				}
+			}
+			break;
+		case Opt_resize:
+		{
+			char *resize = args[0].from;
+			int rc = kstrtoll(resize, 0, newLVSize);
+
+			if (rc)
+				goto cleanup;
+			break;
+		}
+		case Opt_resize_nosize:
+		{
+			*newLVSize = sb->s_bdev->bd_inode->i_size >>
+				sb->s_blocksize_bits;
+			if (*newLVSize == 0)
+				pr_err("JFS: Cannot determine volume size\n");
+			break;
+		}
+		case Opt_errors:
+		{
+			char *errors = args[0].from;
+			if (!errors || !*errors)
+				goto cleanup;
+			if (!strcmp(errors, "continue")) {
+				*flag &= ~JFS_ERR_REMOUNT_RO;
+				*flag &= ~JFS_ERR_PANIC;
+				*flag |= JFS_ERR_CONTINUE;
+			} else if (!strcmp(errors, "remount-ro")) {
+				*flag &= ~JFS_ERR_CONTINUE;
+				*flag &= ~JFS_ERR_PANIC;
+				*flag |= JFS_ERR_REMOUNT_RO;
+			} else if (!strcmp(errors, "panic")) {
+				*flag &= ~JFS_ERR_CONTINUE;
+				*flag &= ~JFS_ERR_REMOUNT_RO;
+				*flag |= JFS_ERR_PANIC;
+			} else {
+				pr_err("JFS: %s is an invalid error handler\n",
+				       errors);
+				goto cleanup;
+			}
+			break;
+		}
+
+#ifdef CONFIG_QUOTA
+		case Opt_quota:
+		case Opt_usrquota:
+			*flag |= JFS_USRQUOTA;
+			break;
+		case Opt_grpquota:
+			*flag |= JFS_GRPQUOTA;
+			break;
+#else
+		case Opt_usrquota:
+		case Opt_grpquota:
+		case Opt_quota:
+			pr_err("JFS: quota operations not supported\n");
+			break;
+#endif
+		case Opt_uid:
+		{
+			char *uid = args[0].from;
+			uid_t val;
+			int rc = kstrtouint(uid, 0, &val);
+
+			if (rc)
+				goto cleanup;
+			sbi->uid = make_kuid(current_user_ns(), val);
+			if (!uid_valid(sbi->uid))
+				goto cleanup;
+			break;
+		}
+
+		case Opt_gid:
+		{
+			char *gid = args[0].from;
+			gid_t val;
+			int rc = kstrtouint(gid, 0, &val);
+
+			if (rc)
+				goto cleanup;
+			sbi->gid = make_kgid(current_user_ns(), val);
+			if (!gid_valid(sbi->gid))
+				goto cleanup;
+			break;
+		}
+
+		case Opt_umask:
+		{
+			char *umask = args[0].from;
+			int rc = kstrtouint(umask, 8, &sbi->umask);
+
+			if (rc)
+				goto cleanup;
+			if (sbi->umask & ~0777) {
+				pr_err("JFS: Invalid value of umask\n");
+				goto cleanup;
+			}
+			break;
+		}
+
+		case Opt_discard:
+		{
+			struct request_queue *q = bdev_get_queue(sb->s_bdev);
+			/* if set to 1, even copying files will cause
+			 * trimming :O
+			 * -> user has more control over the online trimming
+			 */
+			sbi->minblks_trim = 64;
+			if (blk_queue_discard(q))
+				*flag |= JFS_DISCARD;
+			else
+				pr_err("JFS: discard option not supported on device\n");
+			break;
+		}
+
+		case Opt_nodiscard:
+			*flag &= ~JFS_DISCARD;
+			break;
+
+		case Opt_discard_minblk:
+		{
+			struct request_queue *q = bdev_get_queue(sb->s_bdev);
+			char *minblks_trim = args[0].from;
+			int rc;
+			if (blk_queue_discard(q)) {
+				*flag |= JFS_DISCARD;
+				rc = kstrtouint(minblks_trim, 0,
+						&sbi->minblks_trim);
+				if (rc)
+					goto cleanup;
+			} else
+				pr_err("JFS: discard option not supported on device\n");
+			break;
+		}
+
+		default:
+			printk("jfs: Unrecognized mount option \"%s\" or missing value\n",
+			       p);
+			goto cleanup;
+		}
+	}
+
+	if (nls_map != (void *) -1) {
+		/* Discard old (if remount) */
+		unload_nls(sbi->nls_tab);
+		sbi->nls_tab = nls_map;
+	}
+	return 1;
+
+cleanup:
+	if (nls_map && nls_map != (void *) -1)
+		unload_nls(nls_map);
+	return 0;
+}
+
+static int jfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	s64 newLVSize = 0;
+	int rc = 0;
+	int flag = JFS_SBI(sb)->flag;
+	int ret;
+
+	sync_filesystem(sb);
+	if (!parse_options(data, sb, &newLVSize, &flag))
+		return -EINVAL;
+
+	if (newLVSize) {
+		if (sb->s_flags & MS_RDONLY) {
+			pr_err("JFS: resize requires volume to be mounted read-write\n");
+			return -EROFS;
+		}
+		rc = jfs_extendfs(sb, newLVSize, 0);
+		if (rc)
+			return rc;
+	}
+
+	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+		/*
+		 * Invalidate any previously read metadata.  fsck may have
+		 * changed the on-disk data since we mounted r/o
+		 */
+		truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
+
+		JFS_SBI(sb)->flag = flag;
+		ret = jfs_mount_rw(sb, 1);
+
+		/* mark the fs r/w for quota activity */
+		sb->s_flags &= ~MS_RDONLY;
+
+		dquot_resume(sb, -1);
+		return ret;
+	}
+	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
+		rc = dquot_suspend(sb, -1);
+		if (rc < 0)
+			return rc;
+		rc = jfs_umount_rw(sb);
+		JFS_SBI(sb)->flag = flag;
+		return rc;
+	}
+	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
+		if (!(sb->s_flags & MS_RDONLY)) {
+			rc = jfs_umount_rw(sb);
+			if (rc)
+				return rc;
+
+			JFS_SBI(sb)->flag = flag;
+			ret = jfs_mount_rw(sb, 1);
+			return ret;
+		}
+	JFS_SBI(sb)->flag = flag;
+
+	return 0;
+}
+
+static int jfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct jfs_sb_info *sbi;
+	struct inode *inode;
+	int rc;
+	s64 newLVSize = 0;
+	int flag, ret = -EINVAL;
+
+	jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
+
+	if (!new_valid_dev(sb->s_bdev->bd_dev))
+		return -EOVERFLOW;
+
+	sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sb->s_fs_info = sbi;
+	sb->s_max_links = JFS_LINK_MAX;
+	sbi->sb = sb;
+	sbi->uid = INVALID_UID;
+	sbi->gid = INVALID_GID;
+	sbi->umask = -1;
+
+	/* initialize the mount flag and determine the default error handler */
+	flag = JFS_ERR_REMOUNT_RO;
+
+	if (!parse_options((char *) data, sb, &newLVSize, &flag))
+		goto out_kfree;
+	sbi->flag = flag;
+
+#ifdef CONFIG_JFS_POSIX_ACL
+	sb->s_flags |= MS_POSIXACL;
+#endif
+
+	if (newLVSize) {
+		pr_err("resize option for remount only\n");
+		goto out_kfree;
+	}
+
+	/*
+	 * Initialize blocksize to 4K.
+	 */
+	sb_set_blocksize(sb, PSIZE);
+
+	/*
+	 * Set method vectors.
+	 */
+	sb->s_op = &jfs_super_operations;
+	sb->s_export_op = &jfs_export_operations;
+	sb->s_xattr = jfs_xattr_handlers;
+#ifdef CONFIG_QUOTA
+	sb->dq_op = &dquot_operations;
+	sb->s_qcop = &dquot_quotactl_ops;
+	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+#endif
+
+	/*
+	 * Initialize direct-mapping inode/address-space
+	 */
+	inode = new_inode(sb);
+	if (inode == NULL) {
+		ret = -ENOMEM;
+		goto out_unload;
+	}
+	inode->i_ino = 0;
+	inode->i_size = sb->s_bdev->bd_inode->i_size;
+	inode->i_mapping->a_ops = &jfs_metapage_aops;
+	hlist_add_fake(&inode->i_hash);
+	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+
+	sbi->direct_inode = inode;
+
+	rc = jfs_mount(sb);
+	if (rc) {
+		if (!silent)
+			jfs_err("jfs_mount failed w/return code = %d", rc);
+		goto out_mount_failed;
+	}
+	if (sb->s_flags & MS_RDONLY)
+		sbi->log = NULL;
+	else {
+		rc = jfs_mount_rw(sb, 0);
+		if (rc) {
+			if (!silent) {
+				jfs_err("jfs_mount_rw failed, return code = %d",
+					rc);
+			}
+			goto out_no_rw;
+		}
+	}
+
+	sb->s_magic = JFS_SUPER_MAGIC;
+
+	if (sbi->mntflag & JFS_OS2)
+		sb->s_d_op = &jfs_ci_dentry_operations;
+
+	inode = jfs_iget(sb, ROOT_I);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto out_no_rw;
+	}
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		goto out_no_root;
+
+	/* logical blocks are represented by 40 bits in pxd_t, etc. */
+	sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
+#if BITS_PER_LONG == 32
+	/*
+	 * Page cache is indexed by long.
+	 * I would use MAX_LFS_FILESIZE, but it's only half as big
+	 */
+	sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1,
+			     (u64)sb->s_maxbytes);
+#endif
+	sb->s_time_gran = 1;
+	return 0;
+
+out_no_root:
+	jfs_err("jfs_read_super: get root dentry failed");
+
+out_no_rw:
+	rc = jfs_umount(sb);
+	if (rc)
+		jfs_err("jfs_umount failed with return code %d", rc);
+out_mount_failed:
+	filemap_write_and_wait(sbi->direct_inode->i_mapping);
+	truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
+	make_bad_inode(sbi->direct_inode);
+	iput(sbi->direct_inode);
+	sbi->direct_inode = NULL;
+out_unload:
+	unload_nls(sbi->nls_tab);
+out_kfree:
+	kfree(sbi);
+	return ret;
+}
+
+static int jfs_freeze(struct super_block *sb)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_log *log = sbi->log;
+	int rc = 0;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		txQuiesce(sb);
+		rc = lmLogShutdown(log);
+		if (rc) {
+			jfs_error(sb, "lmLogShutdown failed\n");
+
+			/* let operations fail rather than hang */
+			txResume(sb);
+
+			return rc;
+		}
+		rc = updateSuper(sb, FM_CLEAN);
+		if (rc) {
+			jfs_err("jfs_freeze: updateSuper failed\n");
+			/*
+			 * Don't fail here. Everything succeeded except
+			 * marking the superblock clean, so there's really
+			 * no harm in leaving it frozen for now.
+			 */
+		}
+	}
+	return 0;
+}
+
+static int jfs_unfreeze(struct super_block *sb)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_log *log = sbi->log;
+	int rc = 0;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		rc = updateSuper(sb, FM_MOUNT);
+		if (rc) {
+			jfs_error(sb, "updateSuper failed\n");
+			goto out;
+		}
+		rc = lmLogInit(log);
+		if (rc)
+			jfs_error(sb, "lmLogInit failed\n");
+out:
+		txResume(sb);
+	}
+	return rc;
+}
+
+static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+}
+
+static int jfs_sync_fs(struct super_block *sb, int wait)
+{
+	struct jfs_log *log = JFS_SBI(sb)->log;
+
+	/* log == NULL indicates read-only mount */
+	if (log) {
+		/*
+		 * Write quota structures to quota file, sync_blockdev() will
+		 * write them to disk later
+		 */
+		dquot_writeback_dquots(sb, -1);
+		jfs_flush_journal(log, wait);
+		jfs_syncpt(log, 0);
+	}
+
+	return 0;
+}
+
+static int jfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(root->d_sb);
+
+	if (uid_valid(sbi->uid))
+		seq_printf(seq, ",uid=%d", from_kuid(&init_user_ns, sbi->uid));
+	if (gid_valid(sbi->gid))
+		seq_printf(seq, ",gid=%d", from_kgid(&init_user_ns, sbi->gid));
+	if (sbi->umask != -1)
+		seq_printf(seq, ",umask=%03o", sbi->umask);
+	if (sbi->flag & JFS_NOINTEGRITY)
+		seq_puts(seq, ",nointegrity");
+	if (sbi->flag & JFS_DISCARD)
+		seq_printf(seq, ",discard=%u", sbi->minblks_trim);
+	if (sbi->nls_tab)
+		seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset);
+	if (sbi->flag & JFS_ERR_CONTINUE)
+		seq_printf(seq, ",errors=continue");
+	if (sbi->flag & JFS_ERR_PANIC)
+		seq_printf(seq, ",errors=panic");
+
+#ifdef CONFIG_QUOTA
+	if (sbi->flag & JFS_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->flag & JFS_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+
+	return 0;
+}
+
+#ifdef CONFIG_QUOTA
+
+/* Read data from quotafile - avoid pagecache and such because we cannot afford
+ * acquiring the locks... As quota files are never truncated and quota code
+ * itself serializes the operations (and no one else should touch the files)
+ * we don't have to be afraid of races */
+static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
+			      size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0;
+	int offset = off & (sb->s_blocksize - 1);
+	int tocopy;
+	size_t toread;
+	struct buffer_head tmp_bh;
+	struct buffer_head *bh;
+	loff_t i_size = i_size_read(inode);
+
+	if (off > i_size)
+		return 0;
+	if (off+len > i_size)
+		len = i_size-off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = sb->s_blocksize - offset < toread ?
+				sb->s_blocksize - offset : toread;
+
+		tmp_bh.b_state = 0;
+		tmp_bh.b_size = 1 << inode->i_blkbits;
+		err = jfs_get_block(inode, blk, &tmp_bh, 0);
+		if (err)
+			return err;
+		if (!buffer_mapped(&tmp_bh))	/* A hole? */
+			memset(data, 0, tocopy);
+		else {
+			bh = sb_bread(sb, tmp_bh.b_blocknr);
+			if (!bh)
+				return -EIO;
+			memcpy(data, bh->b_data+offset, tocopy);
+			brelse(bh);
+		}
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write to quotafile */
+static ssize_t jfs_quota_write(struct super_block *sb, int type,
+			       const char *data, size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0;
+	int offset = off & (sb->s_blocksize - 1);
+	int tocopy;
+	size_t towrite = len;
+	struct buffer_head tmp_bh;
+	struct buffer_head *bh;
+
+	mutex_lock(&inode->i_mutex);
+	while (towrite > 0) {
+		tocopy = sb->s_blocksize - offset < towrite ?
+				sb->s_blocksize - offset : towrite;
+
+		tmp_bh.b_state = 0;
+		tmp_bh.b_size = 1 << inode->i_blkbits;
+		err = jfs_get_block(inode, blk, &tmp_bh, 1);
+		if (err)
+			goto out;
+		if (offset || tocopy != sb->s_blocksize)
+			bh = sb_bread(sb, tmp_bh.b_blocknr);
+		else
+			bh = sb_getblk(sb, tmp_bh.b_blocknr);
+		if (!bh) {
+			err = -EIO;
+			goto out;
+		}
+		lock_buffer(bh);
+		memcpy(bh->b_data+offset, data, tocopy);
+		flush_dcache_page(bh->b_page);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		brelse(bh);
+		offset = 0;
+		towrite -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+out:
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
+		return err;
+	}
+	if (inode->i_size < off+len-towrite)
+		i_size_write(inode, off+len-towrite);
+	inode->i_version++;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	mutex_unlock(&inode->i_mutex);
+	return len - towrite;
+}
+
+static struct dquot **jfs_get_dquots(struct inode *inode)
+{
+	return JFS_IP(inode)->i_dquot;
+}
+#endif
+
+static const struct super_operations jfs_super_operations = {
+	.alloc_inode	= jfs_alloc_inode,
+	.destroy_inode	= jfs_destroy_inode,
+	.dirty_inode	= jfs_dirty_inode,
+	.write_inode	= jfs_write_inode,
+	.evict_inode	= jfs_evict_inode,
+	.put_super	= jfs_put_super,
+	.sync_fs	= jfs_sync_fs,
+	.freeze_fs	= jfs_freeze,
+	.unfreeze_fs	= jfs_unfreeze,
+	.statfs		= jfs_statfs,
+	.remount_fs	= jfs_remount,
+	.show_options	= jfs_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read	= jfs_quota_read,
+	.quota_write	= jfs_quota_write,
+	.get_dquots	= jfs_get_dquots,
+#endif
+};
+
+static const struct export_operations jfs_export_operations = {
+	.fh_to_dentry	= jfs_fh_to_dentry,
+	.fh_to_parent	= jfs_fh_to_parent,
+	.get_parent	= jfs_get_parent,
+};
+
+static struct file_system_type jfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "jfs",
+	.mount		= jfs_do_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("jfs");
+
+static void init_once(void *foo)
+{
+	struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
+
+	memset(jfs_ip, 0, sizeof(struct jfs_inode_info));
+	INIT_LIST_HEAD(&jfs_ip->anon_inode_list);
+	init_rwsem(&jfs_ip->rdwrlock);
+	mutex_init(&jfs_ip->commit_mutex);
+	init_rwsem(&jfs_ip->xattr_sem);
+	spin_lock_init(&jfs_ip->ag_lock);
+	jfs_ip->active_ag = -1;
+	inode_init_once(&jfs_ip->vfs_inode);
+}
+
+static int __init init_jfs_fs(void)
+{
+	int i;
+	int rc;
+
+	jfs_inode_cachep =
+	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
+			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			    init_once);
+	if (jfs_inode_cachep == NULL)
+		return -ENOMEM;
+
+	/*
+	 * Metapage initialization
+	 */
+	rc = metapage_init();
+	if (rc) {
+		jfs_err("metapage_init failed w/rc = %d", rc);
+		goto free_slab;
+	}
+
+	/*
+	 * Transaction Manager initialization
+	 */
+	rc = txInit();
+	if (rc) {
+		jfs_err("txInit failed w/rc = %d", rc);
+		goto free_metapage;
+	}
+
+	/*
+	 * I/O completion thread (endio)
+	 */
+	jfsIOthread = kthread_run(jfsIOWait, NULL, "jfsIO");
+	if (IS_ERR(jfsIOthread)) {
+		rc = PTR_ERR(jfsIOthread);
+		jfs_err("init_jfs_fs: fork failed w/rc = %d", rc);
+		goto end_txmngr;
+	}
+
+	if (commit_threads < 1)
+		commit_threads = num_online_cpus();
+	if (commit_threads > MAX_COMMIT_THREADS)
+		commit_threads = MAX_COMMIT_THREADS;
+
+	for (i = 0; i < commit_threads; i++) {
+		jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL,
+						 "jfsCommit");
+		if (IS_ERR(jfsCommitThread[i])) {
+			rc = PTR_ERR(jfsCommitThread[i]);
+			jfs_err("init_jfs_fs: fork failed w/rc = %d", rc);
+			commit_threads = i;
+			goto kill_committask;
+		}
+	}
+
+	jfsSyncThread = kthread_run(jfs_sync, NULL, "jfsSync");
+	if (IS_ERR(jfsSyncThread)) {
+		rc = PTR_ERR(jfsSyncThread);
+		jfs_err("init_jfs_fs: fork failed w/rc = %d", rc);
+		goto kill_committask;
+	}
+
+#ifdef PROC_FS_JFS
+	jfs_proc_init();
+#endif
+
+	rc = register_filesystem(&jfs_fs_type);
+	if (!rc)
+		return 0;
+
+#ifdef PROC_FS_JFS
+	jfs_proc_clean();
+#endif
+	kthread_stop(jfsSyncThread);
+kill_committask:
+	for (i = 0; i < commit_threads; i++)
+		kthread_stop(jfsCommitThread[i]);
+	kthread_stop(jfsIOthread);
+end_txmngr:
+	txExit();
+free_metapage:
+	metapage_exit();
+free_slab:
+	kmem_cache_destroy(jfs_inode_cachep);
+	return rc;
+}
+
+static void __exit exit_jfs_fs(void)
+{
+	int i;
+
+	jfs_info("exit_jfs_fs called");
+
+	txExit();
+	metapage_exit();
+
+	kthread_stop(jfsIOthread);
+	for (i = 0; i < commit_threads; i++)
+		kthread_stop(jfsCommitThread[i]);
+	kthread_stop(jfsSyncThread);
+#ifdef PROC_FS_JFS
+	jfs_proc_clean();
+#endif
+	unregister_filesystem(&jfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(jfs_inode_cachep);
+}
+
+module_init(init_jfs_fs)
+module_exit(exit_jfs_fs)
diff --git a/kernel/fs/jfs/symlink.c b/kernel/fs/jfs/symlink.c
new file mode 100644
index 000000000..80f42bcc4
--- /dev/null
+++ b/kernel/fs/jfs/symlink.c
@@ -0,0 +1,52 @@
+/*
+ *   Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_xattr.h"
+
+static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *s = JFS_IP(d_inode(dentry))->i_inline;
+	nd_set_link(nd, s);
+	return NULL;
+}
+
+const struct inode_operations jfs_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= jfs_follow_link,
+	.setattr	= jfs_setattr,
+	.setxattr	= jfs_setxattr,
+	.getxattr	= jfs_getxattr,
+	.listxattr	= jfs_listxattr,
+	.removexattr	= jfs_removexattr,
+};
+
+const struct inode_operations jfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= jfs_setattr,
+	.setxattr	= jfs_setxattr,
+	.getxattr	= jfs_getxattr,
+	.listxattr	= jfs_listxattr,
+	.removexattr	= jfs_removexattr,
+};
+
diff --git a/kernel/fs/jfs/xattr.c b/kernel/fs/jfs/xattr.c
new file mode 100644
index 000000000..48b15a6e5
--- /dev/null
+++ b/kernel/fs/jfs/xattr.c
@@ -0,0 +1,1106 @@
+/*
+ *   Copyright (C) International Business Machines  Corp., 2000-2004
+ *   Copyright (C) Christoph Hellwig, 2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_debug.h"
+#include "jfs_dinode.h"
+#include "jfs_extent.h"
+#include "jfs_metapage.h"
+#include "jfs_xattr.h"
+#include "jfs_acl.h"
+
+/*
+ *	jfs_xattr.c: extended attribute service
+ *
+ * Overall design --
+ *
+ * Format:
+ *
+ *   Extended attribute lists (jfs_ea_list) consist of an overall size (32 bit
+ *   value) and a variable (0 or more) number of extended attribute
+ *   entries.  Each extended attribute entry (jfs_ea) is a <name,value> double
+ *   where <name> is constructed from a null-terminated ascii string
+ *   (1 ... 255 bytes in the name) and <value> is arbitrary 8 bit data
+ *   (1 ... 65535 bytes).  The in-memory format is
+ *
+ *   0       1        2        4                4 + namelen + 1
+ *   +-------+--------+--------+----------------+-------------------+
+ *   | Flags | Name   | Value  | Name String \0 | Data . . . .      |
+ *   |       | Length | Length |                |                   |
+ *   +-------+--------+--------+----------------+-------------------+
+ *
+ *   A jfs_ea_list then is structured as
+ *
+ *   0            4                   4 + EA_SIZE(ea1)
+ *   +------------+-------------------+--------------------+-----
+ *   | Overall EA | First FEA Element | Second FEA Element | .....
+ *   | List Size  |                   |                    |
+ *   +------------+-------------------+--------------------+-----
+ *
+ *   On-disk:
+ *
+ *	FEALISTs are stored on disk using blocks allocated by dbAlloc() and
+ *	written directly. An EA list may be in-lined in the inode if there is
+ *	sufficient room available.
+ */
+
+struct ea_buffer {
+	int flag;		/* Indicates what storage xattr points to */
+	int max_size;		/* largest xattr that fits in current buffer */
+	dxd_t new_ea;		/* dxd to replace ea when modifying xattr */
+	struct metapage *mp;	/* metapage containing ea list */
+	struct jfs_ea_list *xattr;	/* buffer containing ea list */
+};
+
+/*
+ * ea_buffer.flag values
+ */
+#define EA_INLINE	0x0001
+#define EA_EXTENT	0x0002
+#define EA_NEW		0x0004
+#define EA_MALLOC	0x0008
+
+
+static int is_known_namespace(const char *name)
+{
+	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		return false;
+
+	return true;
+}
+
+/*
+ * These three routines are used to recognize on-disk extended attributes
+ * that are in a recognized namespace.  If the attribute is not recognized,
+ * "os2." is prepended to the name
+ */
+static int is_os2_xattr(struct jfs_ea *ea)
+{
+	return !is_known_namespace(ea->name);
+}
+
+static inline int name_size(struct jfs_ea *ea)
+{
+	if (is_os2_xattr(ea))
+		return ea->namelen + XATTR_OS2_PREFIX_LEN;
+	else
+		return ea->namelen;
+}
+
+static inline int copy_name(char *buffer, struct jfs_ea *ea)
+{
+	int len = ea->namelen;
+
+	if (is_os2_xattr(ea)) {
+		memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN);
+		buffer += XATTR_OS2_PREFIX_LEN;
+		len += XATTR_OS2_PREFIX_LEN;
+	}
+	memcpy(buffer, ea->name, ea->namelen);
+	buffer[ea->namelen] = 0;
+
+	return len;
+}
+
+/* Forward references */
+static void ea_release(struct inode *inode, struct ea_buffer *ea_buf);
+
+/*
+ * NAME: ea_write_inline
+ *
+ * FUNCTION: Attempt to write an EA inline if area is available
+ *
+ * PRE CONDITIONS:
+ *	Already verified that the specified EA is small enough to fit inline
+ *
+ * PARAMETERS:
+ *	ip	- Inode pointer
+ *	ealist	- EA list pointer
+ *	size	- size of ealist in bytes
+ *	ea	- dxd_t structure to be filled in with necessary EA information
+ *		  if we successfully copy the EA inline
+ *
+ * NOTES:
+ *	Checks if the inode's inline area is available.  If so, copies EA inline
+ *	and sets <ea> fields appropriately.  Otherwise, returns failure, EA will
+ *	have to be put into an extent.
+ *
+ * RETURNS: 0 for successful copy to inline area; -1 if area not available
+ */
+static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist,
+			   int size, dxd_t * ea)
+{
+	struct jfs_inode_info *ji = JFS_IP(ip);
+
+	/*
+	 * Make sure we have an EA -- the NULL EA list is valid, but you
+	 * can't copy it!
+	 */
+	if (ealist && size > sizeof (struct jfs_ea_list)) {
+		assert(size <= sizeof (ji->i_inline_ea));
+
+		/*
+		 * See if the space is available or if it is already being
+		 * used for an inline EA.
+		 */
+		if (!(ji->mode2 & INLINEEA) && !(ji->ea.flag & DXD_INLINE))
+			return -EPERM;
+
+		DXDsize(ea, size);
+		DXDlength(ea, 0);
+		DXDaddress(ea, 0);
+		memcpy(ji->i_inline_ea, ealist, size);
+		ea->flag = DXD_INLINE;
+		ji->mode2 &= ~INLINEEA;
+	} else {
+		ea->flag = 0;
+		DXDsize(ea, 0);
+		DXDlength(ea, 0);
+		DXDaddress(ea, 0);
+
+		/* Free up INLINE area */
+		if (ji->ea.flag & DXD_INLINE)
+			ji->mode2 |= INLINEEA;
+	}
+
+	return 0;
+}
+
+/*
+ * NAME: ea_write
+ *
+ * FUNCTION: Write an EA for an inode
+ *
+ * PRE CONDITIONS: EA has been verified
+ *
+ * PARAMETERS:
+ *	ip	- Inode pointer
+ *	ealist	- EA list pointer
+ *	size	- size of ealist in bytes
+ *	ea	- dxd_t structure to be filled in appropriately with where the
+ *		  EA was copied
+ *
+ * NOTES: Will write EA inline if able to, otherwise allocates blocks for an
+ *	extent and synchronously writes it to those blocks.
+ *
+ * RETURNS: 0 for success; Anything else indicates failure
+ */
+static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
+		       dxd_t * ea)
+{
+	struct super_block *sb = ip->i_sb;
+	struct jfs_inode_info *ji = JFS_IP(ip);
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	int nblocks;
+	s64 blkno;
+	int rc = 0, i;
+	char *cp;
+	s32 nbytes, nb;
+	s32 bytes_to_write;
+	struct metapage *mp;
+
+	/*
+	 * Quick check to see if this is an in-linable EA.  Short EAs
+	 * and empty EAs are all in-linable, provided the space exists.
+	 */
+	if (!ealist || size <= sizeof (ji->i_inline_ea)) {
+		if (!ea_write_inline(ip, ealist, size, ea))
+			return 0;
+	}
+
+	/* figure out how many blocks we need */
+	nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
+
+	/* Allocate new blocks to quota. */
+	rc = dquot_alloc_block(ip, nblocks);
+	if (rc)
+		return rc;
+
+	rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
+	if (rc) {
+		/*Rollback quota allocation. */
+		dquot_free_block(ip, nblocks);
+		return rc;
+	}
+
+	/*
+	 * Now have nblocks worth of storage to stuff into the FEALIST.
+	 * loop over the FEALIST copying data into the buffer one page at
+	 * a time.
+	 */
+	cp = (char *) ealist;
+	nbytes = size;
+	for (i = 0; i < nblocks; i += sbi->nbperpage) {
+		/*
+		 * Determine how many bytes for this request, and round up to
+		 * the nearest aggregate block size
+		 */
+		nb = min(PSIZE, nbytes);
+		bytes_to_write =
+		    ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits))
+		    << sb->s_blocksize_bits;
+
+		if (!(mp = get_metapage(ip, blkno + i, bytes_to_write, 1))) {
+			rc = -EIO;
+			goto failed;
+		}
+
+		memcpy(mp->data, cp, nb);
+
+		/*
+		 * We really need a way to propagate errors for
+		 * forced writes like this one.  --hch
+		 *
+		 * (__write_metapage => release_metapage => flush_metapage)
+		 */
+#ifdef _JFS_FIXME
+		if ((rc = flush_metapage(mp))) {
+			/*
+			 * the write failed -- this means that the buffer
+			 * is still assigned and the blocks are not being
+			 * used.  this seems like the best error recovery
+			 * we can get ...
+			 */
+			goto failed;
+		}
+#else
+		flush_metapage(mp);
+#endif
+
+		cp += PSIZE;
+		nbytes -= nb;
+	}
+
+	ea->flag = DXD_EXTENT;
+	DXDsize(ea, le32_to_cpu(ealist->size));
+	DXDlength(ea, nblocks);
+	DXDaddress(ea, blkno);
+
+	/* Free up INLINE area */
+	if (ji->ea.flag & DXD_INLINE)
+		ji->mode2 |= INLINEEA;
+
+	return 0;
+
+      failed:
+	/* Rollback quota allocation. */
+	dquot_free_block(ip, nblocks);
+
+	dbFree(ip, blkno, nblocks);
+	return rc;
+}
+
+/*
+ * NAME: ea_read_inline
+ *
+ * FUNCTION: Read an inlined EA into user's buffer
+ *
+ * PARAMETERS:
+ *	ip	- Inode pointer
+ *	ealist	- Pointer to buffer to fill in with EA
+ *
+ * RETURNS: 0
+ */
+static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist)
+{
+	struct jfs_inode_info *ji = JFS_IP(ip);
+	int ea_size = sizeDXD(&ji->ea);
+
+	if (ea_size == 0) {
+		ealist->size = 0;
+		return 0;
+	}
+
+	/* Sanity Check */
+	if ((sizeDXD(&ji->ea) > sizeof (ji->i_inline_ea)))
+		return -EIO;
+	if (le32_to_cpu(((struct jfs_ea_list *) &ji->i_inline_ea)->size)
+	    != ea_size)
+		return -EIO;
+
+	memcpy(ealist, ji->i_inline_ea, ea_size);
+	return 0;
+}
+
+/*
+ * NAME: ea_read
+ *
+ * FUNCTION: copy EA data into user's buffer
+ *
+ * PARAMETERS:
+ *	ip	- Inode pointer
+ *	ealist	- Pointer to buffer to fill in with EA
+ *
+ * NOTES:  If EA is inline calls ea_read_inline() to copy EA.
+ *
+ * RETURNS: 0 for success; other indicates failure
+ */
+static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
+{
+	struct super_block *sb = ip->i_sb;
+	struct jfs_inode_info *ji = JFS_IP(ip);
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	int nblocks;
+	s64 blkno;
+	char *cp = (char *) ealist;
+	int i;
+	int nbytes, nb;
+	s32 bytes_to_read;
+	struct metapage *mp;
+
+	/* quick check for in-line EA */
+	if (ji->ea.flag & DXD_INLINE)
+		return ea_read_inline(ip, ealist);
+
+	nbytes = sizeDXD(&ji->ea);
+	if (!nbytes) {
+		jfs_error(sb, "nbytes is 0\n");
+		return -EIO;
+	}
+
+	/*
+	 * Figure out how many blocks were allocated when this EA list was
+	 * originally written to disk.
+	 */
+	nblocks = lengthDXD(&ji->ea) << sbi->l2nbperpage;
+	blkno = addressDXD(&ji->ea) << sbi->l2nbperpage;
+
+	/*
+	 * I have found the disk blocks which were originally used to store
+	 * the FEALIST.  now i loop over each contiguous block copying the
+	 * data into the buffer.
+	 */
+	for (i = 0; i < nblocks; i += sbi->nbperpage) {
+		/*
+		 * Determine how many bytes for this request, and round up to
+		 * the nearest aggregate block size
+		 */
+		nb = min(PSIZE, nbytes);
+		bytes_to_read =
+		    ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits))
+		    << sb->s_blocksize_bits;
+
+		if (!(mp = read_metapage(ip, blkno + i, bytes_to_read, 1)))
+			return -EIO;
+
+		memcpy(cp, mp->data, nb);
+		release_metapage(mp);
+
+		cp += PSIZE;
+		nbytes -= nb;
+	}
+
+	return 0;
+}
+
+/*
+ * NAME: ea_get
+ *
+ * FUNCTION: Returns buffer containing existing extended attributes.
+ *	     The size of the buffer will be the larger of the existing
+ *	     attributes size, or min_size.
+ *
+ *	     The buffer, which may be inlined in the inode or in the
+ *	     page cache must be release by calling ea_release or ea_put
+ *
+ * PARAMETERS:
+ *	inode	- Inode pointer
+ *	ea_buf	- Structure to be populated with ealist and its metadata
+ *	min_size- minimum size of buffer to be returned
+ *
+ * RETURNS: 0 for success; Other indicates failure
+ */
+static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
+{
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	struct super_block *sb = inode->i_sb;
+	int size;
+	int ea_size = sizeDXD(&ji->ea);
+	int blocks_needed, current_blocks;
+	s64 blkno;
+	int rc;
+	int quota_allocation = 0;
+
+	/* When fsck.jfs clears a bad ea, it doesn't clear the size */
+	if (ji->ea.flag == 0)
+		ea_size = 0;
+
+	if (ea_size == 0) {
+		if (min_size == 0) {
+			ea_buf->flag = 0;
+			ea_buf->max_size = 0;
+			ea_buf->xattr = NULL;
+			return 0;
+		}
+		if ((min_size <= sizeof (ji->i_inline_ea)) &&
+		    (ji->mode2 & INLINEEA)) {
+			ea_buf->flag = EA_INLINE | EA_NEW;
+			ea_buf->max_size = sizeof (ji->i_inline_ea);
+			ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea;
+			DXDlength(&ea_buf->new_ea, 0);
+			DXDaddress(&ea_buf->new_ea, 0);
+			ea_buf->new_ea.flag = DXD_INLINE;
+			DXDsize(&ea_buf->new_ea, min_size);
+			return 0;
+		}
+		current_blocks = 0;
+	} else if (ji->ea.flag & DXD_INLINE) {
+		if (min_size <= sizeof (ji->i_inline_ea)) {
+			ea_buf->flag = EA_INLINE;
+			ea_buf->max_size = sizeof (ji->i_inline_ea);
+			ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea;
+			goto size_check;
+		}
+		current_blocks = 0;
+	} else {
+		if (!(ji->ea.flag & DXD_EXTENT)) {
+			jfs_error(sb, "invalid ea.flag\n");
+			return -EIO;
+		}
+		current_blocks = (ea_size + sb->s_blocksize - 1) >>
+		    sb->s_blocksize_bits;
+	}
+	size = max(min_size, ea_size);
+
+	if (size > PSIZE) {
+		/*
+		 * To keep the rest of the code simple.  Allocate a
+		 * contiguous buffer to work with
+		 */
+		ea_buf->xattr = kmalloc(size, GFP_KERNEL);
+		if (ea_buf->xattr == NULL)
+			return -ENOMEM;
+
+		ea_buf->flag = EA_MALLOC;
+		ea_buf->max_size = (size + sb->s_blocksize - 1) &
+		    ~(sb->s_blocksize - 1);
+
+		if (ea_size == 0)
+			return 0;
+
+		if ((rc = ea_read(inode, ea_buf->xattr))) {
+			kfree(ea_buf->xattr);
+			ea_buf->xattr = NULL;
+			return rc;
+		}
+		goto size_check;
+	}
+	blocks_needed = (min_size + sb->s_blocksize - 1) >>
+	    sb->s_blocksize_bits;
+
+	if (blocks_needed > current_blocks) {
+		/* Allocate new blocks to quota. */
+		rc = dquot_alloc_block(inode, blocks_needed);
+		if (rc)
+			return -EDQUOT;
+
+		quota_allocation = blocks_needed;
+
+		rc = dbAlloc(inode, INOHINT(inode), (s64) blocks_needed,
+			     &blkno);
+		if (rc)
+			goto clean_up;
+
+		DXDlength(&ea_buf->new_ea, blocks_needed);
+		DXDaddress(&ea_buf->new_ea, blkno);
+		ea_buf->new_ea.flag = DXD_EXTENT;
+		DXDsize(&ea_buf->new_ea, min_size);
+
+		ea_buf->flag = EA_EXTENT | EA_NEW;
+
+		ea_buf->mp = get_metapage(inode, blkno,
+					  blocks_needed << sb->s_blocksize_bits,
+					  1);
+		if (ea_buf->mp == NULL) {
+			dbFree(inode, blkno, (s64) blocks_needed);
+			rc = -EIO;
+			goto clean_up;
+		}
+		ea_buf->xattr = ea_buf->mp->data;
+		ea_buf->max_size = (min_size + sb->s_blocksize - 1) &
+		    ~(sb->s_blocksize - 1);
+		if (ea_size == 0)
+			return 0;
+		if ((rc = ea_read(inode, ea_buf->xattr))) {
+			discard_metapage(ea_buf->mp);
+			dbFree(inode, blkno, (s64) blocks_needed);
+			goto clean_up;
+		}
+		goto size_check;
+	}
+	ea_buf->flag = EA_EXTENT;
+	ea_buf->mp = read_metapage(inode, addressDXD(&ji->ea),
+				   lengthDXD(&ji->ea) << sb->s_blocksize_bits,
+				   1);
+	if (ea_buf->mp == NULL) {
+		rc = -EIO;
+		goto clean_up;
+	}
+	ea_buf->xattr = ea_buf->mp->data;
+	ea_buf->max_size = (ea_size + sb->s_blocksize - 1) &
+	    ~(sb->s_blocksize - 1);
+
+      size_check:
+	if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
+		printk(KERN_ERR "ea_get: invalid extended attribute\n");
+		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
+				     ea_buf->xattr, ea_size, 1);
+		ea_release(inode, ea_buf);
+		rc = -EIO;
+		goto clean_up;
+	}
+
+	return ea_size;
+
+      clean_up:
+	/* Rollback quota allocation */
+	if (quota_allocation)
+		dquot_free_block(inode, quota_allocation);
+
+	return (rc);
+}
+
+static void ea_release(struct inode *inode, struct ea_buffer *ea_buf)
+{
+	if (ea_buf->flag & EA_MALLOC)
+		kfree(ea_buf->xattr);
+	else if (ea_buf->flag & EA_EXTENT) {
+		assert(ea_buf->mp);
+		release_metapage(ea_buf->mp);
+
+		if (ea_buf->flag & EA_NEW)
+			dbFree(inode, addressDXD(&ea_buf->new_ea),
+			       lengthDXD(&ea_buf->new_ea));
+	}
+}
+
+static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
+		  int new_size)
+{
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	unsigned long old_blocks, new_blocks;
+	int rc = 0;
+
+	if (new_size == 0) {
+		ea_release(inode, ea_buf);
+		ea_buf = NULL;
+	} else if (ea_buf->flag & EA_INLINE) {
+		assert(new_size <= sizeof (ji->i_inline_ea));
+		ji->mode2 &= ~INLINEEA;
+		ea_buf->new_ea.flag = DXD_INLINE;
+		DXDsize(&ea_buf->new_ea, new_size);
+		DXDaddress(&ea_buf->new_ea, 0);
+		DXDlength(&ea_buf->new_ea, 0);
+	} else if (ea_buf->flag & EA_MALLOC) {
+		rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea);
+		kfree(ea_buf->xattr);
+	} else if (ea_buf->flag & EA_NEW) {
+		/* We have already allocated a new dxd */
+		flush_metapage(ea_buf->mp);
+	} else {
+		/* ->xattr must point to original ea's metapage */
+		rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea);
+		discard_metapage(ea_buf->mp);
+	}
+	if (rc)
+		return rc;
+
+	old_blocks = new_blocks = 0;
+
+	if (ji->ea.flag & DXD_EXTENT) {
+		invalidate_dxd_metapages(inode, ji->ea);
+		old_blocks = lengthDXD(&ji->ea);
+	}
+
+	if (ea_buf) {
+		txEA(tid, inode, &ji->ea, &ea_buf->new_ea);
+		if (ea_buf->new_ea.flag & DXD_EXTENT) {
+			new_blocks = lengthDXD(&ea_buf->new_ea);
+			if (ji->ea.flag & DXD_INLINE)
+				ji->mode2 |= INLINEEA;
+		}
+		ji->ea = ea_buf->new_ea;
+	} else {
+		txEA(tid, inode, &ji->ea, NULL);
+		if (ji->ea.flag & DXD_INLINE)
+			ji->mode2 |= INLINEEA;
+		ji->ea.flag = 0;
+		ji->ea.size = 0;
+	}
+
+	/* If old blocks exist, they must be removed from quota allocation. */
+	if (old_blocks)
+		dquot_free_block(inode, old_blocks);
+
+	inode->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+
+/*
+ * Most of the permission checking is done by xattr_permission in the vfs.
+ * We also need to verify that this is a namespace that we recognize.
+ */
+static int can_set_xattr(struct inode *inode, const char *name,
+			 const void *value, size_t value_len)
+{
+	if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
+		/*
+		 * This makes sure that we aren't trying to set an
+		 * attribute in a different namespace by prefixing it
+		 * with "os2."
+		 */
+		if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
+			return -EOPNOTSUPP;
+		return 0;
+	}
+
+	/*
+	 * Don't allow setting an attribute in an unknown namespace.
+	 */
+	if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
+	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
+		   const void *value, size_t value_len, int flags)
+{
+	struct jfs_ea_list *ealist;
+	struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL;
+	struct ea_buffer ea_buf;
+	int old_ea_size = 0;
+	int xattr_size;
+	int new_size;
+	int namelen = strlen(name);
+	char *os2name = NULL;
+	int found = 0;
+	int rc;
+	int length;
+
+	if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
+		os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
+				  GFP_KERNEL);
+		if (!os2name)
+			return -ENOMEM;
+		strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
+		name = os2name;
+		namelen -= XATTR_OS2_PREFIX_LEN;
+	}
+
+	down_write(&JFS_IP(inode)->xattr_sem);
+
+	xattr_size = ea_get(inode, &ea_buf, 0);
+	if (xattr_size < 0) {
+		rc = xattr_size;
+		goto out;
+	}
+
+      again:
+	ealist = (struct jfs_ea_list *) ea_buf.xattr;
+	new_size = sizeof (struct jfs_ea_list);
+
+	if (xattr_size) {
+		for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist);
+		     ea = NEXT_EA(ea)) {
+			if ((namelen == ea->namelen) &&
+			    (memcmp(name, ea->name, namelen) == 0)) {
+				found = 1;
+				if (flags & XATTR_CREATE) {
+					rc = -EEXIST;
+					goto release;
+				}
+				old_ea = ea;
+				old_ea_size = EA_SIZE(ea);
+				next_ea = NEXT_EA(ea);
+			} else
+				new_size += EA_SIZE(ea);
+		}
+	}
+
+	if (!found) {
+		if (flags & XATTR_REPLACE) {
+			rc = -ENODATA;
+			goto release;
+		}
+		if (value == NULL) {
+			rc = 0;
+			goto release;
+		}
+	}
+	if (value)
+		new_size += sizeof (struct jfs_ea) + namelen + 1 + value_len;
+
+	if (new_size > ea_buf.max_size) {
+		/*
+		 * We need to allocate more space for merged ea list.
+		 * We should only have loop to again: once.
+		 */
+		ea_release(inode, &ea_buf);
+		xattr_size = ea_get(inode, &ea_buf, new_size);
+		if (xattr_size < 0) {
+			rc = xattr_size;
+			goto out;
+		}
+		goto again;
+	}
+
+	/* Remove old ea of the same name */
+	if (found) {
+		/* number of bytes following target EA */
+		length = (char *) END_EALIST(ealist) - (char *) next_ea;
+		if (length > 0)
+			memmove(old_ea, next_ea, length);
+		xattr_size -= old_ea_size;
+	}
+
+	/* Add new entry to the end */
+	if (value) {
+		if (xattr_size == 0)
+			/* Completely new ea list */
+			xattr_size = sizeof (struct jfs_ea_list);
+
+		/*
+		 * The size of EA value is limitted by on-disk format up to
+		 *  __le16, there would be an overflow if the size is equal
+		 * to XATTR_SIZE_MAX (65536).  In order to avoid this issue,
+		 * we can pre-checkup the value size against USHRT_MAX, and
+		 * return -E2BIG in this case, which is consistent with the
+		 * VFS setxattr interface.
+		 */
+		if (value_len >= USHRT_MAX) {
+			rc = -E2BIG;
+			goto release;
+		}
+
+		ea = (struct jfs_ea *) ((char *) ealist + xattr_size);
+		ea->flag = 0;
+		ea->namelen = namelen;
+		ea->valuelen = (cpu_to_le16(value_len));
+		memcpy(ea->name, name, namelen);
+		ea->name[namelen] = 0;
+		if (value_len)
+			memcpy(&ea->name[namelen + 1], value, value_len);
+		xattr_size += EA_SIZE(ea);
+	}
+
+	/* DEBUG - If we did this right, these number match */
+	if (xattr_size != new_size) {
+		printk(KERN_ERR
+		       "__jfs_setxattr: xattr_size = %d, new_size = %d\n",
+		       xattr_size, new_size);
+
+		rc = -EINVAL;
+		goto release;
+	}
+
+	/*
+	 * If we're left with an empty list, there's no ea
+	 */
+	if (new_size == sizeof (struct jfs_ea_list))
+		new_size = 0;
+
+	ealist->size = cpu_to_le32(new_size);
+
+	rc = ea_put(tid, inode, &ea_buf, new_size);
+
+	goto out;
+      release:
+	ea_release(inode, &ea_buf);
+      out:
+	up_write(&JFS_IP(inode)->xattr_sem);
+
+	kfree(os2name);
+
+	return rc;
+}
+
+int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		 size_t value_len, int flags)
+{
+	struct inode *inode = d_inode(dentry);
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	int rc;
+	tid_t tid;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_setxattr(dentry, name, value, value_len, flags);
+
+	if ((rc = can_set_xattr(inode, name, value, value_len)))
+		return rc;
+
+	if (value == NULL) {	/* empty EA, do not remove */
+		value = "";
+		value_len = 0;
+	}
+
+	tid = txBegin(inode->i_sb, 0);
+	mutex_lock(&ji->commit_mutex);
+	rc = __jfs_setxattr(tid, d_inode(dentry), name, value, value_len,
+			    flags);
+	if (!rc)
+		rc = txCommit(tid, 1, &inode, 0);
+	txEnd(tid);
+	mutex_unlock(&ji->commit_mutex);
+
+	return rc;
+}
+
+ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
+		       size_t buf_size)
+{
+	struct jfs_ea_list *ealist;
+	struct jfs_ea *ea;
+	struct ea_buffer ea_buf;
+	int xattr_size;
+	ssize_t size;
+	int namelen = strlen(name);
+	char *value;
+
+	down_read(&JFS_IP(inode)->xattr_sem);
+
+	xattr_size = ea_get(inode, &ea_buf, 0);
+
+	if (xattr_size < 0) {
+		size = xattr_size;
+		goto out;
+	}
+
+	if (xattr_size == 0)
+		goto not_found;
+
+	ealist = (struct jfs_ea_list *) ea_buf.xattr;
+
+	/* Find the named attribute */
+	for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea))
+		if ((namelen == ea->namelen) &&
+		    memcmp(name, ea->name, namelen) == 0) {
+			/* Found it */
+			size = le16_to_cpu(ea->valuelen);
+			if (!data)
+				goto release;
+			else if (size > buf_size) {
+				size = -ERANGE;
+				goto release;
+			}
+			value = ((char *) &ea->name) + ea->namelen + 1;
+			memcpy(data, value, size);
+			goto release;
+		}
+      not_found:
+	size = -ENODATA;
+      release:
+	ea_release(inode, &ea_buf);
+      out:
+	up_read(&JFS_IP(inode)->xattr_sem);
+
+	return size;
+}
+
+ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
+		     size_t buf_size)
+{
+	int err;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_getxattr(dentry, name, data, buf_size);
+
+	if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
+		/*
+		 * skip past "os2." prefix
+		 */
+		name += XATTR_OS2_PREFIX_LEN;
+		/*
+		 * Don't allow retrieving properly prefixed attributes
+		 * by prepending them with "os2."
+		 */
+		if (is_known_namespace(name))
+			return -EOPNOTSUPP;
+	}
+
+	err = __jfs_getxattr(d_inode(dentry), name, data, buf_size);
+
+	return err;
+}
+
+/*
+ * No special permissions are needed to list attributes except for trusted.*
+ */
+static inline int can_list(struct jfs_ea *ea)
+{
+	return (strncmp(ea->name, XATTR_TRUSTED_PREFIX,
+			    XATTR_TRUSTED_PREFIX_LEN) ||
+		capable(CAP_SYS_ADMIN));
+}
+
+ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
+{
+	struct inode *inode = d_inode(dentry);
+	char *buffer;
+	ssize_t size = 0;
+	int xattr_size;
+	struct jfs_ea_list *ealist;
+	struct jfs_ea *ea;
+	struct ea_buffer ea_buf;
+
+	down_read(&JFS_IP(inode)->xattr_sem);
+
+	xattr_size = ea_get(inode, &ea_buf, 0);
+	if (xattr_size < 0) {
+		size = xattr_size;
+		goto out;
+	}
+
+	if (xattr_size == 0)
+		goto release;
+
+	ealist = (struct jfs_ea_list *) ea_buf.xattr;
+
+	/* compute required size of list */
+	for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) {
+		if (can_list(ea))
+			size += name_size(ea) + 1;
+	}
+
+	if (!data)
+		goto release;
+
+	if (size > buf_size) {
+		size = -ERANGE;
+		goto release;
+	}
+
+	/* Copy attribute names to buffer */
+	buffer = data;
+	for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) {
+		if (can_list(ea)) {
+			int namelen = copy_name(buffer, ea);
+			buffer += namelen + 1;
+		}
+	}
+
+      release:
+	ea_release(inode, &ea_buf);
+      out:
+	up_read(&JFS_IP(inode)->xattr_sem);
+	return size;
+}
+
+int jfs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = d_inode(dentry);
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	int rc;
+	tid_t tid;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_removexattr(dentry, name);
+
+	if ((rc = can_set_xattr(inode, name, NULL, 0)))
+		return rc;
+
+	tid = txBegin(inode->i_sb, 0);
+	mutex_lock(&ji->commit_mutex);
+	rc = __jfs_setxattr(tid, d_inode(dentry), name, NULL, 0, XATTR_REPLACE);
+	if (!rc)
+		rc = txCommit(tid, 1, &inode, 0);
+	txEnd(tid);
+	mutex_unlock(&ji->commit_mutex);
+
+	return rc;
+}
+
+/*
+ * List of handlers for synthetic system.* attributes.  All real ondisk
+ * attributes are handled directly.
+ */
+const struct xattr_handler *jfs_xattr_handlers[] = {
+#ifdef CONFIG_JFS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	NULL,
+};
+
+
+#ifdef CONFIG_JFS_SECURITY
+static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+			  void *fs_info)
+{
+	const struct xattr *xattr;
+	tid_t *tid = fs_info;
+	char *name;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+			       strlen(xattr->name) + 1, GFP_NOFS);
+		if (!name) {
+			err = -ENOMEM;
+			break;
+		}
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+		err = __jfs_setxattr(*tid, inode, name,
+				     xattr->value, xattr->value_len, 0);
+		kfree(name);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+		      const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &jfs_initxattrs, &tid);
+}
+#endif
diff --git a/kernel/fs/kernfs/Kconfig b/kernel/fs/kernfs/Kconfig
new file mode 100644
index 000000000..397b5f7a7
--- /dev/null
+++ b/kernel/fs/kernfs/Kconfig
@@ -0,0 +1,7 @@
+#
+# KERNFS should be selected by its users
+#
+
+config KERNFS
+	bool
+	default n
diff --git a/kernel/fs/kernfs/Makefile b/kernel/fs/kernfs/Makefile
new file mode 100644
index 000000000..674337c76
--- /dev/null
+++ b/kernel/fs/kernfs/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the kernfs pseudo filesystem
+#
+
+obj-y		:= mount.o inode.o dir.o file.o symlink.o
diff --git a/kernel/fs/kernfs/dir.c b/kernel/fs/kernfs/dir.c
new file mode 100644
index 000000000..2d48d28e1
--- /dev/null
+++ b/kernel/fs/kernfs/dir.c
@@ -0,0 +1,1464 @@
+/*
+ * fs/kernfs/dir.c - kernfs directory implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/hash.h>
+
+#include "kernfs-internal.h"
+
+DEFINE_MUTEX(kernfs_mutex);
+static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
+static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */
+
+#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
+
+static bool kernfs_active(struct kernfs_node *kn)
+{
+	lockdep_assert_held(&kernfs_mutex);
+	return atomic_read(&kn->active) >= 0;
+}
+
+static bool kernfs_lockdep(struct kernfs_node *kn)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	return kn->flags & KERNFS_LOCKDEP;
+#else
+	return false;
+#endif
+}
+
+static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+	return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
+}
+
+static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
+					      size_t buflen)
+{
+	char *p = buf + buflen;
+	int len;
+
+	*--p = '\0';
+
+	do {
+		len = strlen(kn->name);
+		if (p - buf < len + 1) {
+			buf[0] = '\0';
+			p = NULL;
+			break;
+		}
+		p -= len;
+		memcpy(p, kn->name, len);
+		*--p = '/';
+		kn = kn->parent;
+	} while (kn && kn->parent);
+
+	return p;
+}
+
+/**
+ * kernfs_name - obtain the name of a given node
+ * @kn: kernfs_node of interest
+ * @buf: buffer to copy @kn's name into
+ * @buflen: size of @buf
+ *
+ * Copies the name of @kn into @buf of @buflen bytes.  The behavior is
+ * similar to strlcpy().  It returns the length of @kn's name and if @buf
+ * isn't long enough, it's filled upto @buflen-1 and nul terminated.
+ *
+ * This function can be called from any context.
+ */
+int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	ret = kernfs_name_locked(kn, buf, buflen);
+	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	return ret;
+}
+
+/**
+ * kernfs_path - build full path of a given node
+ * @kn: kernfs_node of interest
+ * @buf: buffer to copy @kn's name into
+ * @buflen: size of @buf
+ *
+ * Builds and returns the full path of @kn in @buf of @buflen bytes.  The
+ * path is built from the end of @buf so the returned pointer usually
+ * doesn't match @buf.  If @buf isn't long enough, @buf is nul terminated
+ * and %NULL is returned.
+ */
+char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+	unsigned long flags;
+	char *p;
+
+	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	p = kernfs_path_locked(kn, buf, buflen);
+	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	return p;
+}
+EXPORT_SYMBOL_GPL(kernfs_path);
+
+/**
+ * pr_cont_kernfs_name - pr_cont name of a kernfs_node
+ * @kn: kernfs_node of interest
+ *
+ * This function can be called from any context.
+ */
+void pr_cont_kernfs_name(struct kernfs_node *kn)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+	kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
+	pr_cont("%s", kernfs_pr_cont_buf);
+
+	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+}
+
+/**
+ * pr_cont_kernfs_path - pr_cont path of a kernfs_node
+ * @kn: kernfs_node of interest
+ *
+ * This function can be called from any context.
+ */
+void pr_cont_kernfs_path(struct kernfs_node *kn)
+{
+	unsigned long flags;
+	char *p;
+
+	spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+	p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
+			       sizeof(kernfs_pr_cont_buf));
+	if (p)
+		pr_cont("%s", p);
+	else
+		pr_cont("<name too long>");
+
+	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+}
+
+/**
+ * kernfs_get_parent - determine the parent node and pin it
+ * @kn: kernfs_node of interest
+ *
+ * Determines @kn's parent, pins and returns it.  This function can be
+ * called from any context.
+ */
+struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
+{
+	struct kernfs_node *parent;
+	unsigned long flags;
+
+	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	parent = kn->parent;
+	kernfs_get(parent);
+	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+	return parent;
+}
+
+/**
+ *	kernfs_name_hash
+ *	@name: Null terminated string to hash
+ *	@ns:   Namespace tag to hash
+ *
+ *	Returns 31 bit hash of ns + name (so it fits in an off_t )
+ */
+static unsigned int kernfs_name_hash(const char *name, const void *ns)
+{
+	unsigned long hash = init_name_hash();
+	unsigned int len = strlen(name);
+	while (len--)
+		hash = partial_name_hash(*name++, hash);
+	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
+	hash &= 0x7fffffffU;
+	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
+	if (hash < 2)
+		hash += 2;
+	if (hash >= INT_MAX)
+		hash = INT_MAX - 1;
+	return hash;
+}
+
+static int kernfs_name_compare(unsigned int hash, const char *name,
+			       const void *ns, const struct kernfs_node *kn)
+{
+	if (hash < kn->hash)
+		return -1;
+	if (hash > kn->hash)
+		return 1;
+	if (ns < kn->ns)
+		return -1;
+	if (ns > kn->ns)
+		return 1;
+	return strcmp(name, kn->name);
+}
+
+static int kernfs_sd_compare(const struct kernfs_node *left,
+			     const struct kernfs_node *right)
+{
+	return kernfs_name_compare(left->hash, left->name, left->ns, right);
+}
+
+/**
+ *	kernfs_link_sibling - link kernfs_node into sibling rbtree
+ *	@kn: kernfs_node of interest
+ *
+ *	Link @kn into its sibling rbtree which starts from
+ *	@kn->parent->dir.children.
+ *
+ *	Locking:
+ *	mutex_lock(kernfs_mutex)
+ *
+ *	RETURNS:
+ *	0 on susccess -EEXIST on failure.
+ */
+static int kernfs_link_sibling(struct kernfs_node *kn)
+{
+	struct rb_node **node = &kn->parent->dir.children.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*node) {
+		struct kernfs_node *pos;
+		int result;
+
+		pos = rb_to_kn(*node);
+		parent = *node;
+		result = kernfs_sd_compare(kn, pos);
+		if (result < 0)
+			node = &pos->rb.rb_left;
+		else if (result > 0)
+			node = &pos->rb.rb_right;
+		else
+			return -EEXIST;
+	}
+
+	/* add new node and rebalance the tree */
+	rb_link_node(&kn->rb, parent, node);
+	rb_insert_color(&kn->rb, &kn->parent->dir.children);
+
+	/* successfully added, account subdir number */
+	if (kernfs_type(kn) == KERNFS_DIR)
+		kn->parent->dir.subdirs++;
+
+	return 0;
+}
+
+/**
+ *	kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
+ *	@kn: kernfs_node of interest
+ *
+ *	Try to unlink @kn from its sibling rbtree which starts from
+ *	kn->parent->dir.children.  Returns %true if @kn was actually
+ *	removed, %false if @kn wasn't on the rbtree.
+ *
+ *	Locking:
+ *	mutex_lock(kernfs_mutex)
+ */
+static bool kernfs_unlink_sibling(struct kernfs_node *kn)
+{
+	if (RB_EMPTY_NODE(&kn->rb))
+		return false;
+
+	if (kernfs_type(kn) == KERNFS_DIR)
+		kn->parent->dir.subdirs--;
+
+	rb_erase(&kn->rb, &kn->parent->dir.children);
+	RB_CLEAR_NODE(&kn->rb);
+	return true;
+}
+
+/**
+ *	kernfs_get_active - get an active reference to kernfs_node
+ *	@kn: kernfs_node to get an active reference to
+ *
+ *	Get an active reference of @kn.  This function is noop if @kn
+ *	is NULL.
+ *
+ *	RETURNS:
+ *	Pointer to @kn on success, NULL on failure.
+ */
+struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
+{
+	if (unlikely(!kn))
+		return NULL;
+
+	if (!atomic_inc_unless_negative(&kn->active))
+		return NULL;
+
+	if (kernfs_lockdep(kn))
+		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
+	return kn;
+}
+
+/**
+ *	kernfs_put_active - put an active reference to kernfs_node
+ *	@kn: kernfs_node to put an active reference to
+ *
+ *	Put an active reference to @kn.  This function is noop if @kn
+ *	is NULL.
+ */
+void kernfs_put_active(struct kernfs_node *kn)
+{
+	struct kernfs_root *root = kernfs_root(kn);
+	int v;
+
+	if (unlikely(!kn))
+		return;
+
+	if (kernfs_lockdep(kn))
+		rwsem_release(&kn->dep_map, 1, _RET_IP_);
+	v = atomic_dec_return(&kn->active);
+	if (likely(v != KN_DEACTIVATED_BIAS))
+		return;
+
+	wake_up_all(&root->deactivate_waitq);
+}
+
+/**
+ * kernfs_drain - drain kernfs_node
+ * @kn: kernfs_node to drain
+ *
+ * Drain existing usages and nuke all existing mmaps of @kn.  Mutiple
+ * removers may invoke this function concurrently on @kn and all will
+ * return after draining is complete.
+ */
+static void kernfs_drain(struct kernfs_node *kn)
+	__releases(&kernfs_mutex) __acquires(&kernfs_mutex)
+{
+	struct kernfs_root *root = kernfs_root(kn);
+
+	lockdep_assert_held(&kernfs_mutex);
+	WARN_ON_ONCE(kernfs_active(kn));
+
+	mutex_unlock(&kernfs_mutex);
+
+	if (kernfs_lockdep(kn)) {
+		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
+		if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
+			lock_contended(&kn->dep_map, _RET_IP_);
+	}
+
+	/* but everyone should wait for draining */
+	wait_event(root->deactivate_waitq,
+		   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
+
+	if (kernfs_lockdep(kn)) {
+		lock_acquired(&kn->dep_map, _RET_IP_);
+		rwsem_release(&kn->dep_map, 1, _RET_IP_);
+	}
+
+	kernfs_unmap_bin_file(kn);
+
+	mutex_lock(&kernfs_mutex);
+}
+
+/**
+ * kernfs_get - get a reference count on a kernfs_node
+ * @kn: the target kernfs_node
+ */
+void kernfs_get(struct kernfs_node *kn)
+{
+	if (kn) {
+		WARN_ON(!atomic_read(&kn->count));
+		atomic_inc(&kn->count);
+	}
+}
+EXPORT_SYMBOL_GPL(kernfs_get);
+
+/**
+ * kernfs_put - put a reference count on a kernfs_node
+ * @kn: the target kernfs_node
+ *
+ * Put a reference count of @kn and destroy it if it reached zero.
+ */
+void kernfs_put(struct kernfs_node *kn)
+{
+	struct kernfs_node *parent;
+	struct kernfs_root *root;
+
+	if (!kn || !atomic_dec_and_test(&kn->count))
+		return;
+	root = kernfs_root(kn);
+ repeat:
+	/*
+	 * Moving/renaming is always done while holding reference.
+	 * kn->parent won't change beneath us.
+	 */
+	parent = kn->parent;
+
+	WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
+		  "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
+		  parent ? parent->name : "", kn->name, atomic_read(&kn->active));
+
+	if (kernfs_type(kn) == KERNFS_LINK)
+		kernfs_put(kn->symlink.target_kn);
+
+	kfree_const(kn->name);
+
+	if (kn->iattr) {
+		if (kn->iattr->ia_secdata)
+			security_release_secctx(kn->iattr->ia_secdata,
+						kn->iattr->ia_secdata_len);
+		simple_xattrs_free(&kn->iattr->xattrs);
+	}
+	kfree(kn->iattr);
+	ida_simple_remove(&root->ino_ida, kn->ino);
+	kmem_cache_free(kernfs_node_cache, kn);
+
+	kn = parent;
+	if (kn) {
+		if (atomic_dec_and_test(&kn->count))
+			goto repeat;
+	} else {
+		/* just released the root kn, free @root too */
+		ida_destroy(&root->ino_ida);
+		kfree(root);
+	}
+}
+EXPORT_SYMBOL_GPL(kernfs_put);
+
+static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct kernfs_node *kn;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	/* Always perform fresh lookup for negatives */
+	if (d_really_is_negative(dentry))
+		goto out_bad_unlocked;
+
+	kn = dentry->d_fsdata;
+	mutex_lock(&kernfs_mutex);
+
+	/* The kernfs node has been deactivated */
+	if (!kernfs_active(kn))
+		goto out_bad;
+
+	/* The kernfs node has been moved? */
+	if (dentry->d_parent->d_fsdata != kn->parent)
+		goto out_bad;
+
+	/* The kernfs node has been renamed */
+	if (strcmp(dentry->d_name.name, kn->name) != 0)
+		goto out_bad;
+
+	/* The kernfs node has been moved to a different namespace */
+	if (kn->parent && kernfs_ns_enabled(kn->parent) &&
+	    kernfs_info(dentry->d_sb)->ns != kn->ns)
+		goto out_bad;
+
+	mutex_unlock(&kernfs_mutex);
+	return 1;
+out_bad:
+	mutex_unlock(&kernfs_mutex);
+out_bad_unlocked:
+	return 0;
+}
+
+static void kernfs_dop_release(struct dentry *dentry)
+{
+	kernfs_put(dentry->d_fsdata);
+}
+
+const struct dentry_operations kernfs_dops = {
+	.d_revalidate	= kernfs_dop_revalidate,
+	.d_release	= kernfs_dop_release,
+};
+
+/**
+ * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
+ * @dentry: the dentry in question
+ *
+ * Return the kernfs_node associated with @dentry.  If @dentry is not a
+ * kernfs one, %NULL is returned.
+ *
+ * While the returned kernfs_node will stay accessible as long as @dentry
+ * is accessible, the returned node can be in any state and the caller is
+ * fully responsible for determining what's accessible.
+ */
+struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
+{
+	if (dentry->d_sb->s_op == &kernfs_sops)
+		return dentry->d_fsdata;
+	return NULL;
+}
+
+static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
+					     const char *name, umode_t mode,
+					     unsigned flags)
+{
+	struct kernfs_node *kn;
+	int ret;
+
+	name = kstrdup_const(name, GFP_KERNEL);
+	if (!name)
+		return NULL;
+
+	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
+	if (!kn)
+		goto err_out1;
+
+	/*
+	 * If the ino of the sysfs entry created for a kmem cache gets
+	 * allocated from an ida layer, which is accounted to the memcg that
+	 * owns the cache, the memcg will get pinned forever. So do not account
+	 * ino ida allocations.
+	 */
+	ret = ida_simple_get(&root->ino_ida, 1, 0,
+			     GFP_KERNEL | __GFP_NOACCOUNT);
+	if (ret < 0)
+		goto err_out2;
+	kn->ino = ret;
+
+	atomic_set(&kn->count, 1);
+	atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
+	RB_CLEAR_NODE(&kn->rb);
+
+	kn->name = name;
+	kn->mode = mode;
+	kn->flags = flags;
+
+	return kn;
+
+ err_out2:
+	kmem_cache_free(kernfs_node_cache, kn);
+ err_out1:
+	kfree_const(name);
+	return NULL;
+}
+
+struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
+				    const char *name, umode_t mode,
+				    unsigned flags)
+{
+	struct kernfs_node *kn;
+
+	kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
+	if (kn) {
+		kernfs_get(parent);
+		kn->parent = parent;
+	}
+	return kn;
+}
+
+/**
+ *	kernfs_add_one - add kernfs_node to parent without warning
+ *	@kn: kernfs_node to be added
+ *
+ *	The caller must already have initialized @kn->parent.  This
+ *	function increments nlink of the parent's inode if @kn is a
+ *	directory and link into the children list of the parent.
+ *
+ *	RETURNS:
+ *	0 on success, -EEXIST if entry with the given name already
+ *	exists.
+ */
+int kernfs_add_one(struct kernfs_node *kn)
+{
+	struct kernfs_node *parent = kn->parent;
+	struct kernfs_iattrs *ps_iattr;
+	bool has_ns;
+	int ret;
+
+	mutex_lock(&kernfs_mutex);
+
+	ret = -EINVAL;
+	has_ns = kernfs_ns_enabled(parent);
+	if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
+		 has_ns ? "required" : "invalid", parent->name, kn->name))
+		goto out_unlock;
+
+	if (kernfs_type(parent) != KERNFS_DIR)
+		goto out_unlock;
+
+	ret = -ENOENT;
+	if (parent->flags & KERNFS_EMPTY_DIR)
+		goto out_unlock;
+
+	if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
+		goto out_unlock;
+
+	kn->hash = kernfs_name_hash(kn->name, kn->ns);
+
+	ret = kernfs_link_sibling(kn);
+	if (ret)
+		goto out_unlock;
+
+	/* Update timestamps on the parent */
+	ps_iattr = parent->iattr;
+	if (ps_iattr) {
+		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
+		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+	}
+
+	mutex_unlock(&kernfs_mutex);
+
+	/*
+	 * Activate the new node unless CREATE_DEACTIVATED is requested.
+	 * If not activated here, the kernfs user is responsible for
+	 * activating the node with kernfs_activate().  A node which hasn't
+	 * been activated is not visible to userland and its removal won't
+	 * trigger deactivation.
+	 */
+	if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
+		kernfs_activate(kn);
+	return 0;
+
+out_unlock:
+	mutex_unlock(&kernfs_mutex);
+	return ret;
+}
+
+/**
+ * kernfs_find_ns - find kernfs_node with the given name
+ * @parent: kernfs_node to search under
+ * @name: name to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with name @name under @parent.  Returns pointer to
+ * the found kernfs_node on success, %NULL on failure.
+ */
+static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
+					  const unsigned char *name,
+					  const void *ns)
+{
+	struct rb_node *node = parent->dir.children.rb_node;
+	bool has_ns = kernfs_ns_enabled(parent);
+	unsigned int hash;
+
+	lockdep_assert_held(&kernfs_mutex);
+
+	if (has_ns != (bool)ns) {
+		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
+		     has_ns ? "required" : "invalid", parent->name, name);
+		return NULL;
+	}
+
+	hash = kernfs_name_hash(name, ns);
+	while (node) {
+		struct kernfs_node *kn;
+		int result;
+
+		kn = rb_to_kn(node);
+		result = kernfs_name_compare(hash, name, ns, kn);
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return kn;
+	}
+	return NULL;
+}
+
+/**
+ * kernfs_find_and_get_ns - find and get kernfs_node with the given name
+ * @parent: kernfs_node to search under
+ * @name: name to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with name @name under @parent and get a reference
+ * if found.  This function may sleep and returns pointer to the found
+ * kernfs_node on success, %NULL on failure.
+ */
+struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
+					   const char *name, const void *ns)
+{
+	struct kernfs_node *kn;
+
+	mutex_lock(&kernfs_mutex);
+	kn = kernfs_find_ns(parent, name, ns);
+	kernfs_get(kn);
+	mutex_unlock(&kernfs_mutex);
+
+	return kn;
+}
+EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
+
+/**
+ * kernfs_create_root - create a new kernfs hierarchy
+ * @scops: optional syscall operations for the hierarchy
+ * @flags: KERNFS_ROOT_* flags
+ * @priv: opaque data associated with the new directory
+ *
+ * Returns the root of the new hierarchy on success, ERR_PTR() value on
+ * failure.
+ */
+struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
+				       unsigned int flags, void *priv)
+{
+	struct kernfs_root *root;
+	struct kernfs_node *kn;
+
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	ida_init(&root->ino_ida);
+	INIT_LIST_HEAD(&root->supers);
+
+	kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
+			       KERNFS_DIR);
+	if (!kn) {
+		ida_destroy(&root->ino_ida);
+		kfree(root);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	kn->priv = priv;
+	kn->dir.root = root;
+
+	root->syscall_ops = scops;
+	root->flags = flags;
+	root->kn = kn;
+	init_waitqueue_head(&root->deactivate_waitq);
+
+	if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
+		kernfs_activate(kn);
+
+	return root;
+}
+
+/**
+ * kernfs_destroy_root - destroy a kernfs hierarchy
+ * @root: root of the hierarchy to destroy
+ *
+ * Destroy the hierarchy anchored at @root by removing all existing
+ * directories and destroying @root.
+ */
+void kernfs_destroy_root(struct kernfs_root *root)
+{
+	kernfs_remove(root->kn);	/* will also free @root */
+}
+
+/**
+ * kernfs_create_dir_ns - create a directory
+ * @parent: parent in which to create a new directory
+ * @name: name of the new directory
+ * @mode: mode of the new directory
+ * @priv: opaque data associated with the new directory
+ * @ns: optional namespace tag of the directory
+ *
+ * Returns the created node on success, ERR_PTR() value on failure.
+ */
+struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
+					 const char *name, umode_t mode,
+					 void *priv, const void *ns)
+{
+	struct kernfs_node *kn;
+	int rc;
+
+	/* allocate */
+	kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	kn->dir.root = parent->dir.root;
+	kn->ns = ns;
+	kn->priv = priv;
+
+	/* link in */
+	rc = kernfs_add_one(kn);
+	if (!rc)
+		return kn;
+
+	kernfs_put(kn);
+	return ERR_PTR(rc);
+}
+
+/**
+ * kernfs_create_empty_dir - create an always empty directory
+ * @parent: parent in which to create a new directory
+ * @name: name of the new directory
+ *
+ * Returns the created node on success, ERR_PTR() value on failure.
+ */
+struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
+					    const char *name)
+{
+	struct kernfs_node *kn;
+	int rc;
+
+	/* allocate */
+	kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	kn->flags |= KERNFS_EMPTY_DIR;
+	kn->dir.root = parent->dir.root;
+	kn->ns = NULL;
+	kn->priv = NULL;
+
+	/* link in */
+	rc = kernfs_add_one(kn);
+	if (!rc)
+		return kn;
+
+	kernfs_put(kn);
+	return ERR_PTR(rc);
+}
+
+static struct dentry *kernfs_iop_lookup(struct inode *dir,
+					struct dentry *dentry,
+					unsigned int flags)
+{
+	struct dentry *ret;
+	struct kernfs_node *parent = dentry->d_parent->d_fsdata;
+	struct kernfs_node *kn;
+	struct inode *inode;
+	const void *ns = NULL;
+
+	mutex_lock(&kernfs_mutex);
+
+	if (kernfs_ns_enabled(parent))
+		ns = kernfs_info(dir->i_sb)->ns;
+
+	kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
+
+	/* no such entry */
+	if (!kn || !kernfs_active(kn)) {
+		ret = NULL;
+		goto out_unlock;
+	}
+	kernfs_get(kn);
+	dentry->d_fsdata = kn;
+
+	/* attach dentry and inode */
+	inode = kernfs_get_inode(dir->i_sb, kn);
+	if (!inode) {
+		ret = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	/* instantiate and hash dentry */
+	ret = d_splice_alias(inode, dentry);
+ out_unlock:
+	mutex_unlock(&kernfs_mutex);
+	return ret;
+}
+
+static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
+			    umode_t mode)
+{
+	struct kernfs_node *parent = dir->i_private;
+	struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
+	int ret;
+
+	if (!scops || !scops->mkdir)
+		return -EPERM;
+
+	if (!kernfs_get_active(parent))
+		return -ENODEV;
+
+	ret = scops->mkdir(parent, dentry->d_name.name, mode);
+
+	kernfs_put_active(parent);
+	return ret;
+}
+
+static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct kernfs_node *kn  = dentry->d_fsdata;
+	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
+	int ret;
+
+	if (!scops || !scops->rmdir)
+		return -EPERM;
+
+	if (!kernfs_get_active(kn))
+		return -ENODEV;
+
+	ret = scops->rmdir(kn);
+
+	kernfs_put_active(kn);
+	return ret;
+}
+
+static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct kernfs_node *kn  = old_dentry->d_fsdata;
+	struct kernfs_node *new_parent = new_dir->i_private;
+	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
+	int ret;
+
+	if (!scops || !scops->rename)
+		return -EPERM;
+
+	if (!kernfs_get_active(kn))
+		return -ENODEV;
+
+	if (!kernfs_get_active(new_parent)) {
+		kernfs_put_active(kn);
+		return -ENODEV;
+	}
+
+	ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
+
+	kernfs_put_active(new_parent);
+	kernfs_put_active(kn);
+	return ret;
+}
+
+const struct inode_operations kernfs_dir_iops = {
+	.lookup		= kernfs_iop_lookup,
+	.permission	= kernfs_iop_permission,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+
+	.mkdir		= kernfs_iop_mkdir,
+	.rmdir		= kernfs_iop_rmdir,
+	.rename		= kernfs_iop_rename,
+};
+
+static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
+{
+	struct kernfs_node *last;
+
+	while (true) {
+		struct rb_node *rbn;
+
+		last = pos;
+
+		if (kernfs_type(pos) != KERNFS_DIR)
+			break;
+
+		rbn = rb_first(&pos->dir.children);
+		if (!rbn)
+			break;
+
+		pos = rb_to_kn(rbn);
+	}
+
+	return last;
+}
+
+/**
+ * kernfs_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: kernfs_node whose descendants to walk
+ *
+ * Find the next descendant to visit for post-order traversal of @root's
+ * descendants.  @root is included in the iteration and the last node to be
+ * visited.
+ */
+static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
+						       struct kernfs_node *root)
+{
+	struct rb_node *rbn;
+
+	lockdep_assert_held(&kernfs_mutex);
+
+	/* if first iteration, visit leftmost descendant which may be root */
+	if (!pos)
+		return kernfs_leftmost_descendant(root);
+
+	/* if we visited @root, we're done */
+	if (pos == root)
+		return NULL;
+
+	/* if there's an unvisited sibling, visit its leftmost descendant */
+	rbn = rb_next(&pos->rb);
+	if (rbn)
+		return kernfs_leftmost_descendant(rb_to_kn(rbn));
+
+	/* no sibling left, visit parent */
+	return pos->parent;
+}
+
+/**
+ * kernfs_activate - activate a node which started deactivated
+ * @kn: kernfs_node whose subtree is to be activated
+ *
+ * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
+ * needs to be explicitly activated.  A node which hasn't been activated
+ * isn't visible to userland and deactivation is skipped during its
+ * removal.  This is useful to construct atomic init sequences where
+ * creation of multiple nodes should either succeed or fail atomically.
+ *
+ * The caller is responsible for ensuring that this function is not called
+ * after kernfs_remove*() is invoked on @kn.
+ */
+void kernfs_activate(struct kernfs_node *kn)
+{
+	struct kernfs_node *pos;
+
+	mutex_lock(&kernfs_mutex);
+
+	pos = NULL;
+	while ((pos = kernfs_next_descendant_post(pos, kn))) {
+		if (!pos || (pos->flags & KERNFS_ACTIVATED))
+			continue;
+
+		WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
+		WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
+
+		atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
+		pos->flags |= KERNFS_ACTIVATED;
+	}
+
+	mutex_unlock(&kernfs_mutex);
+}
+
+static void __kernfs_remove(struct kernfs_node *kn)
+{
+	struct kernfs_node *pos;
+
+	lockdep_assert_held(&kernfs_mutex);
+
+	/*
+	 * Short-circuit if non-root @kn has already finished removal.
+	 * This is for kernfs_remove_self() which plays with active ref
+	 * after removal.
+	 */
+	if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
+		return;
+
+	pr_debug("kernfs %s: removing\n", kn->name);
+
+	/* prevent any new usage under @kn by deactivating all nodes */
+	pos = NULL;
+	while ((pos = kernfs_next_descendant_post(pos, kn)))
+		if (kernfs_active(pos))
+			atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
+
+	/* deactivate and unlink the subtree node-by-node */
+	do {
+		pos = kernfs_leftmost_descendant(kn);
+
+		/*
+		 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
+		 * base ref could have been put by someone else by the time
+		 * the function returns.  Make sure it doesn't go away
+		 * underneath us.
+		 */
+		kernfs_get(pos);
+
+		/*
+		 * Drain iff @kn was activated.  This avoids draining and
+		 * its lockdep annotations for nodes which have never been
+		 * activated and allows embedding kernfs_remove() in create
+		 * error paths without worrying about draining.
+		 */
+		if (kn->flags & KERNFS_ACTIVATED)
+			kernfs_drain(pos);
+		else
+			WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+		/*
+		 * kernfs_unlink_sibling() succeeds once per node.  Use it
+		 * to decide who's responsible for cleanups.
+		 */
+		if (!pos->parent || kernfs_unlink_sibling(pos)) {
+			struct kernfs_iattrs *ps_iattr =
+				pos->parent ? pos->parent->iattr : NULL;
+
+			/* update timestamps on the parent */
+			if (ps_iattr) {
+				ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
+				ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
+			}
+
+			kernfs_put(pos);
+		}
+
+		kernfs_put(pos);
+	} while (pos != kn);
+}
+
+/**
+ * kernfs_remove - remove a kernfs_node recursively
+ * @kn: the kernfs_node to remove
+ *
+ * Remove @kn along with all its subdirectories and files.
+ */
+void kernfs_remove(struct kernfs_node *kn)
+{
+	mutex_lock(&kernfs_mutex);
+	__kernfs_remove(kn);
+	mutex_unlock(&kernfs_mutex);
+}
+
+/**
+ * kernfs_break_active_protection - break out of active protection
+ * @kn: the self kernfs_node
+ *
+ * The caller must be running off of a kernfs operation which is invoked
+ * with an active reference - e.g. one of kernfs_ops.  Each invocation of
+ * this function must also be matched with an invocation of
+ * kernfs_unbreak_active_protection().
+ *
+ * This function releases the active reference of @kn the caller is
+ * holding.  Once this function is called, @kn may be removed at any point
+ * and the caller is solely responsible for ensuring that the objects it
+ * dereferences are accessible.
+ */
+void kernfs_break_active_protection(struct kernfs_node *kn)
+{
+	/*
+	 * Take out ourself out of the active ref dependency chain.  If
+	 * we're called without an active ref, lockdep will complain.
+	 */
+	kernfs_put_active(kn);
+}
+
+/**
+ * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
+ * @kn: the self kernfs_node
+ *
+ * If kernfs_break_active_protection() was called, this function must be
+ * invoked before finishing the kernfs operation.  Note that while this
+ * function restores the active reference, it doesn't and can't actually
+ * restore the active protection - @kn may already or be in the process of
+ * being removed.  Once kernfs_break_active_protection() is invoked, that
+ * protection is irreversibly gone for the kernfs operation instance.
+ *
+ * While this function may be called at any point after
+ * kernfs_break_active_protection() is invoked, its most useful location
+ * would be right before the enclosing kernfs operation returns.
+ */
+void kernfs_unbreak_active_protection(struct kernfs_node *kn)
+{
+	/*
+	 * @kn->active could be in any state; however, the increment we do
+	 * here will be undone as soon as the enclosing kernfs operation
+	 * finishes and this temporary bump can't break anything.  If @kn
+	 * is alive, nothing changes.  If @kn is being deactivated, the
+	 * soon-to-follow put will either finish deactivation or restore
+	 * deactivated state.  If @kn is already removed, the temporary
+	 * bump is guaranteed to be gone before @kn is released.
+	 */
+	atomic_inc(&kn->active);
+	if (kernfs_lockdep(kn))
+		rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
+}
+
+/**
+ * kernfs_remove_self - remove a kernfs_node from its own method
+ * @kn: the self kernfs_node to remove
+ *
+ * The caller must be running off of a kernfs operation which is invoked
+ * with an active reference - e.g. one of kernfs_ops.  This can be used to
+ * implement a file operation which deletes itself.
+ *
+ * For example, the "delete" file for a sysfs device directory can be
+ * implemented by invoking kernfs_remove_self() on the "delete" file
+ * itself.  This function breaks the circular dependency of trying to
+ * deactivate self while holding an active ref itself.  It isn't necessary
+ * to modify the usual removal path to use kernfs_remove_self().  The
+ * "delete" implementation can simply invoke kernfs_remove_self() on self
+ * before proceeding with the usual removal path.  kernfs will ignore later
+ * kernfs_remove() on self.
+ *
+ * kernfs_remove_self() can be called multiple times concurrently on the
+ * same kernfs_node.  Only the first one actually performs removal and
+ * returns %true.  All others will wait until the kernfs operation which
+ * won self-removal finishes and return %false.  Note that the losers wait
+ * for the completion of not only the winning kernfs_remove_self() but also
+ * the whole kernfs_ops which won the arbitration.  This can be used to
+ * guarantee, for example, all concurrent writes to a "delete" file to
+ * finish only after the whole operation is complete.
+ */
+bool kernfs_remove_self(struct kernfs_node *kn)
+{
+	bool ret;
+
+	mutex_lock(&kernfs_mutex);
+	kernfs_break_active_protection(kn);
+
+	/*
+	 * SUICIDAL is used to arbitrate among competing invocations.  Only
+	 * the first one will actually perform removal.  When the removal
+	 * is complete, SUICIDED is set and the active ref is restored
+	 * while holding kernfs_mutex.  The ones which lost arbitration
+	 * waits for SUICDED && drained which can happen only after the
+	 * enclosing kernfs operation which executed the winning instance
+	 * of kernfs_remove_self() finished.
+	 */
+	if (!(kn->flags & KERNFS_SUICIDAL)) {
+		kn->flags |= KERNFS_SUICIDAL;
+		__kernfs_remove(kn);
+		kn->flags |= KERNFS_SUICIDED;
+		ret = true;
+	} else {
+		wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
+		DEFINE_WAIT(wait);
+
+		while (true) {
+			prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
+
+			if ((kn->flags & KERNFS_SUICIDED) &&
+			    atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
+				break;
+
+			mutex_unlock(&kernfs_mutex);
+			schedule();
+			mutex_lock(&kernfs_mutex);
+		}
+		finish_wait(waitq, &wait);
+		WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
+		ret = false;
+	}
+
+	/*
+	 * This must be done while holding kernfs_mutex; otherwise, waiting
+	 * for SUICIDED && deactivated could finish prematurely.
+	 */
+	kernfs_unbreak_active_protection(kn);
+
+	mutex_unlock(&kernfs_mutex);
+	return ret;
+}
+
+/**
+ * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
+ * @parent: parent of the target
+ * @name: name of the kernfs_node to remove
+ * @ns: namespace tag of the kernfs_node to remove
+ *
+ * Look for the kernfs_node with @name and @ns under @parent and remove it.
+ * Returns 0 on success, -ENOENT if such entry doesn't exist.
+ */
+int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
+			     const void *ns)
+{
+	struct kernfs_node *kn;
+
+	if (!parent) {
+		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
+			name);
+		return -ENOENT;
+	}
+
+	mutex_lock(&kernfs_mutex);
+
+	kn = kernfs_find_ns(parent, name, ns);
+	if (kn)
+		__kernfs_remove(kn);
+
+	mutex_unlock(&kernfs_mutex);
+
+	if (kn)
+		return 0;
+	else
+		return -ENOENT;
+}
+
+/**
+ * kernfs_rename_ns - move and rename a kernfs_node
+ * @kn: target node
+ * @new_parent: new parent to put @sd under
+ * @new_name: new name
+ * @new_ns: new namespace tag
+ */
+int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
+		     const char *new_name, const void *new_ns)
+{
+	struct kernfs_node *old_parent;
+	const char *old_name = NULL;
+	int error;
+
+	/* can't move or rename root */
+	if (!kn->parent)
+		return -EINVAL;
+
+	mutex_lock(&kernfs_mutex);
+
+	error = -ENOENT;
+	if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
+	    (new_parent->flags & KERNFS_EMPTY_DIR))
+		goto out;
+
+	error = 0;
+	if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
+	    (strcmp(kn->name, new_name) == 0))
+		goto out;	/* nothing to rename */
+
+	error = -EEXIST;
+	if (kernfs_find_ns(new_parent, new_name, new_ns))
+		goto out;
+
+	/* rename kernfs_node */
+	if (strcmp(kn->name, new_name) != 0) {
+		error = -ENOMEM;
+		new_name = kstrdup_const(new_name, GFP_KERNEL);
+		if (!new_name)
+			goto out;
+	} else {
+		new_name = NULL;
+	}
+
+	/*
+	 * Move to the appropriate place in the appropriate directories rbtree.
+	 */
+	kernfs_unlink_sibling(kn);
+	kernfs_get(new_parent);
+
+	/* rename_lock protects ->parent and ->name accessors */
+	spin_lock_irq(&kernfs_rename_lock);
+
+	old_parent = kn->parent;
+	kn->parent = new_parent;
+
+	kn->ns = new_ns;
+	if (new_name) {
+		old_name = kn->name;
+		kn->name = new_name;
+	}
+
+	spin_unlock_irq(&kernfs_rename_lock);
+
+	kn->hash = kernfs_name_hash(kn->name, kn->ns);
+	kernfs_link_sibling(kn);
+
+	kernfs_put(old_parent);
+	kfree_const(old_name);
+
+	error = 0;
+ out:
+	mutex_unlock(&kernfs_mutex);
+	return error;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct kernfs_node *kn)
+{
+	return (kn->mode >> 12) & 15;
+}
+
+static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
+{
+	kernfs_put(filp->private_data);
+	return 0;
+}
+
+static struct kernfs_node *kernfs_dir_pos(const void *ns,
+	struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
+{
+	if (pos) {
+		int valid = kernfs_active(pos) &&
+			pos->parent == parent && hash == pos->hash;
+		kernfs_put(pos);
+		if (!valid)
+			pos = NULL;
+	}
+	if (!pos && (hash > 1) && (hash < INT_MAX)) {
+		struct rb_node *node = parent->dir.children.rb_node;
+		while (node) {
+			pos = rb_to_kn(node);
+
+			if (hash < pos->hash)
+				node = node->rb_left;
+			else if (hash > pos->hash)
+				node = node->rb_right;
+			else
+				break;
+		}
+	}
+	/* Skip over entries which are dying/dead or in the wrong namespace */
+	while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
+		struct rb_node *node = rb_next(&pos->rb);
+		if (!node)
+			pos = NULL;
+		else
+			pos = rb_to_kn(node);
+	}
+	return pos;
+}
+
+static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
+	struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
+{
+	pos = kernfs_dir_pos(ns, parent, ino, pos);
+	if (pos) {
+		do {
+			struct rb_node *node = rb_next(&pos->rb);
+			if (!node)
+				pos = NULL;
+			else
+				pos = rb_to_kn(node);
+		} while (pos && (!kernfs_active(pos) || pos->ns != ns));
+	}
+	return pos;
+}
+
+static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct kernfs_node *parent = dentry->d_fsdata;
+	struct kernfs_node *pos = file->private_data;
+	const void *ns = NULL;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	mutex_lock(&kernfs_mutex);
+
+	if (kernfs_ns_enabled(parent))
+		ns = kernfs_info(dentry->d_sb)->ns;
+
+	for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
+	     pos;
+	     pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
+		const char *name = pos->name;
+		unsigned int type = dt_type(pos);
+		int len = strlen(name);
+		ino_t ino = pos->ino;
+
+		ctx->pos = pos->hash;
+		file->private_data = pos;
+		kernfs_get(pos);
+
+		mutex_unlock(&kernfs_mutex);
+		if (!dir_emit(ctx, name, len, ino, type))
+			return 0;
+		mutex_lock(&kernfs_mutex);
+	}
+	mutex_unlock(&kernfs_mutex);
+	file->private_data = NULL;
+	ctx->pos = INT_MAX;
+	return 0;
+}
+
+static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
+				    int whence)
+{
+	struct inode *inode = file_inode(file);
+	loff_t ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = generic_file_llseek(file, offset, whence);
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
+
+const struct file_operations kernfs_dir_fops = {
+	.read		= generic_read_dir,
+	.iterate	= kernfs_fop_readdir,
+	.release	= kernfs_dir_fop_release,
+	.llseek		= kernfs_dir_fop_llseek,
+};
diff --git a/kernel/fs/kernfs/file.c b/kernel/fs/kernfs/file.c
new file mode 100644
index 000000000..2bacb9988
--- /dev/null
+++ b/kernel/fs/kernfs/file.c
@@ -0,0 +1,954 @@
+/*
+ * fs/kernfs/file.c - kernfs file implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/fsnotify.h>
+
+#include "kernfs-internal.h"
+
+/*
+ * There's one kernfs_open_file for each open file and one kernfs_open_node
+ * for each kernfs_node with one or more open files.
+ *
+ * kernfs_node->attr.open points to kernfs_open_node.  attr.open is
+ * protected by kernfs_open_node_lock.
+ *
+ * filp->private_data points to seq_file whose ->private points to
+ * kernfs_open_file.  kernfs_open_files are chained at
+ * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
+ */
+static DEFINE_SPINLOCK(kernfs_open_node_lock);
+static DEFINE_MUTEX(kernfs_open_file_mutex);
+
+struct kernfs_open_node {
+	atomic_t		refcnt;
+	atomic_t		event;
+	wait_queue_head_t	poll;
+	struct list_head	files; /* goes through kernfs_open_file.list */
+};
+
+/*
+ * kernfs_notify() may be called from any context and bounces notifications
+ * through a work item.  To minimize space overhead in kernfs_node, the
+ * pending queue is implemented as a singly linked list of kernfs_nodes.
+ * The list is terminated with the self pointer so that whether a
+ * kernfs_node is on the list or not can be determined by testing the next
+ * pointer for NULL.
+ */
+#define KERNFS_NOTIFY_EOL			((void *)&kernfs_notify_list)
+
+static DEFINE_SPINLOCK(kernfs_notify_lock);
+static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+
+static struct kernfs_open_file *kernfs_of(struct file *file)
+{
+	return ((struct seq_file *)file->private_data)->private;
+}
+
+/*
+ * Determine the kernfs_ops for the given kernfs_node.  This function must
+ * be called while holding an active reference.
+ */
+static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
+{
+	if (kn->flags & KERNFS_LOCKDEP)
+		lockdep_assert_held(kn);
+	return kn->attr.ops;
+}
+
+/*
+ * As kernfs_seq_stop() is also called after kernfs_seq_start() or
+ * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
+ * a seq_file iteration which is fully initialized with an active reference
+ * or an aborted kernfs_seq_start() due to get_active failure.  The
+ * position pointer is the only context for each seq_file iteration and
+ * thus the stop condition should be encoded in it.  As the return value is
+ * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
+ * choice to indicate get_active failure.
+ *
+ * Unfortunately, this is complicated due to the optional custom seq_file
+ * operations which may return ERR_PTR(-ENODEV) too.  kernfs_seq_stop()
+ * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
+ * custom seq_file operations and thus can't decide whether put_active
+ * should be performed or not only on ERR_PTR(-ENODEV).
+ *
+ * This is worked around by factoring out the custom seq_stop() and
+ * put_active part into kernfs_seq_stop_active(), skipping it from
+ * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
+ * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
+ * that kernfs_seq_stop_active() is skipped only after get_active failure.
+ */
+static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+	if (ops->seq_stop)
+		ops->seq_stop(sf, v);
+	kernfs_put_active(of->kn);
+}
+
+static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops;
+
+	/*
+	 * @of->mutex nests outside active ref and is primarily to ensure that
+	 * the ops aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!kernfs_get_active(of->kn))
+		return ERR_PTR(-ENODEV);
+
+	ops = kernfs_ops(of->kn);
+	if (ops->seq_start) {
+		void *next = ops->seq_start(sf, ppos);
+		/* see the comment above kernfs_seq_stop_active() */
+		if (next == ERR_PTR(-ENODEV))
+			kernfs_seq_stop_active(sf, next);
+		return next;
+	} else {
+		/*
+		 * The same behavior and code as single_open().  Returns
+		 * !NULL if pos is at the beginning; otherwise, NULL.
+		 */
+		return NULL + !*ppos;
+	}
+}
+
+static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+	if (ops->seq_next) {
+		void *next = ops->seq_next(sf, v, ppos);
+		/* see the comment above kernfs_seq_stop_active() */
+		if (next == ERR_PTR(-ENODEV))
+			kernfs_seq_stop_active(sf, next);
+		return next;
+	} else {
+		/*
+		 * The same behavior and code as single_open(), always
+		 * terminate after the initial read.
+		 */
+		++*ppos;
+		return NULL;
+	}
+}
+
+static void kernfs_seq_stop(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+
+	if (v != ERR_PTR(-ENODEV))
+		kernfs_seq_stop_active(sf, v);
+	mutex_unlock(&of->mutex);
+}
+
+static int kernfs_seq_show(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+
+	of->event = atomic_read(&of->kn->attr.open->event);
+
+	return of->kn->attr.ops->seq_show(sf, v);
+}
+
+static const struct seq_operations kernfs_seq_ops = {
+	.start = kernfs_seq_start,
+	.next = kernfs_seq_next,
+	.stop = kernfs_seq_stop,
+	.show = kernfs_seq_show,
+};
+
+/*
+ * As reading a bin file can have side-effects, the exact offset and bytes
+ * specified in read(2) call should be passed to the read callback making
+ * it difficult to use seq_file.  Implement simplistic custom buffering for
+ * bin files.
+ */
+static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
+				       char __user *user_buf, size_t count,
+				       loff_t *ppos)
+{
+	ssize_t len = min_t(size_t, count, PAGE_SIZE);
+	const struct kernfs_ops *ops;
+	char *buf;
+
+	buf = of->prealloc_buf;
+	if (!buf)
+		buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/*
+	 * @of->mutex nests outside active ref and is used both to ensure that
+	 * the ops aren't called concurrently for the same open file, and
+	 * to provide exclusive access to ->prealloc_buf (when that exists).
+	 */
+	mutex_lock(&of->mutex);
+	if (!kernfs_get_active(of->kn)) {
+		len = -ENODEV;
+		mutex_unlock(&of->mutex);
+		goto out_free;
+	}
+
+	of->event = atomic_read(&of->kn->attr.open->event);
+	ops = kernfs_ops(of->kn);
+	if (ops->read)
+		len = ops->read(of, buf, len, *ppos);
+	else
+		len = -EINVAL;
+
+	if (len < 0)
+		goto out_unlock;
+
+	if (copy_to_user(user_buf, buf, len)) {
+		len = -EFAULT;
+		goto out_unlock;
+	}
+
+	*ppos += len;
+
+ out_unlock:
+	kernfs_put_active(of->kn);
+	mutex_unlock(&of->mutex);
+ out_free:
+	if (buf != of->prealloc_buf)
+		kfree(buf);
+	return len;
+}
+
+/**
+ * kernfs_fop_read - kernfs vfs read callback
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ */
+static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
+			       size_t count, loff_t *ppos)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+
+	if (of->kn->flags & KERNFS_HAS_SEQ_SHOW)
+		return seq_read(file, user_buf, count, ppos);
+	else
+		return kernfs_file_direct_read(of, user_buf, count, ppos);
+}
+
+/**
+ * kernfs_fop_write - kernfs vfs write callback
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Copy data in from userland and pass it to the matching kernfs write
+ * operation.
+ *
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on
+ * the first write.  Hint: if you're writing a value, first read the file,
+ * modify only the the value you're changing, then write entire buffer
+ * back.
+ */
+static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+	const struct kernfs_ops *ops;
+	size_t len;
+	char *buf;
+
+	if (of->atomic_write_len) {
+		len = count;
+		if (len > of->atomic_write_len)
+			return -E2BIG;
+	} else {
+		len = min_t(size_t, count, PAGE_SIZE);
+	}
+
+	buf = of->prealloc_buf;
+	if (!buf)
+		buf = kmalloc(len + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/*
+	 * @of->mutex nests outside active ref and is used both to ensure that
+	 * the ops aren't called concurrently for the same open file, and
+	 * to provide exclusive access to ->prealloc_buf (when that exists).
+	 */
+	mutex_lock(&of->mutex);
+	if (!kernfs_get_active(of->kn)) {
+		mutex_unlock(&of->mutex);
+		len = -ENODEV;
+		goto out_free;
+	}
+
+	if (copy_from_user(buf, user_buf, len)) {
+		len = -EFAULT;
+		goto out_unlock;
+	}
+	buf[len] = '\0';	/* guarantee string termination */
+
+	ops = kernfs_ops(of->kn);
+	if (ops->write)
+		len = ops->write(of, buf, len, *ppos);
+	else
+		len = -EINVAL;
+
+	if (len > 0)
+		*ppos += len;
+
+out_unlock:
+	kernfs_put_active(of->kn);
+	mutex_unlock(&of->mutex);
+out_free:
+	if (buf != of->prealloc_buf)
+		kfree(buf);
+	return len;
+}
+
+static void kernfs_vma_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+
+	if (!of->vm_ops)
+		return;
+
+	if (!kernfs_get_active(of->kn))
+		return;
+
+	if (of->vm_ops->open)
+		of->vm_ops->open(vma);
+
+	kernfs_put_active(of->kn);
+}
+
+static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!kernfs_get_active(of->kn))
+		return VM_FAULT_SIGBUS;
+
+	ret = VM_FAULT_SIGBUS;
+	if (of->vm_ops->fault)
+		ret = of->vm_ops->fault(vma, vmf);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+
+static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
+				   struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!kernfs_get_active(of->kn))
+		return VM_FAULT_SIGBUS;
+
+	ret = 0;
+	if (of->vm_ops->page_mkwrite)
+		ret = of->vm_ops->page_mkwrite(vma, vmf);
+	else
+		file_update_time(file);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+
+static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
+			     void *buf, int len, int write)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return -EINVAL;
+
+	if (!kernfs_get_active(of->kn))
+		return -EINVAL;
+
+	ret = -EINVAL;
+	if (of->vm_ops->access)
+		ret = of->vm_ops->access(vma, addr, buf, len, write);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int kernfs_vma_set_policy(struct vm_area_struct *vma,
+				 struct mempolicy *new)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return 0;
+
+	if (!kernfs_get_active(of->kn))
+		return -EINVAL;
+
+	ret = 0;
+	if (of->vm_ops->set_policy)
+		ret = of->vm_ops->set_policy(vma, new);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+
+static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
+					       unsigned long addr)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	struct mempolicy *pol;
+
+	if (!of->vm_ops)
+		return vma->vm_policy;
+
+	if (!kernfs_get_active(of->kn))
+		return vma->vm_policy;
+
+	pol = vma->vm_policy;
+	if (of->vm_ops->get_policy)
+		pol = of->vm_ops->get_policy(vma, addr);
+
+	kernfs_put_active(of->kn);
+	return pol;
+}
+
+#endif
+
+static const struct vm_operations_struct kernfs_vm_ops = {
+	.open		= kernfs_vma_open,
+	.fault		= kernfs_vma_fault,
+	.page_mkwrite	= kernfs_vma_page_mkwrite,
+	.access		= kernfs_vma_access,
+#ifdef CONFIG_NUMA
+	.set_policy	= kernfs_vma_set_policy,
+	.get_policy	= kernfs_vma_get_policy,
+#endif
+};
+
+static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+	const struct kernfs_ops *ops;
+	int rc;
+
+	/*
+	 * mmap path and of->mutex are prone to triggering spurious lockdep
+	 * warnings and we don't want to add spurious locking dependency
+	 * between the two.  Check whether mmap is actually implemented
+	 * without grabbing @of->mutex by testing HAS_MMAP flag.  See the
+	 * comment in kernfs_file_open() for more details.
+	 */
+	if (!(of->kn->flags & KERNFS_HAS_MMAP))
+		return -ENODEV;
+
+	mutex_lock(&of->mutex);
+
+	rc = -ENODEV;
+	if (!kernfs_get_active(of->kn))
+		goto out_unlock;
+
+	ops = kernfs_ops(of->kn);
+	rc = ops->mmap(of, vma);
+	if (rc)
+		goto out_put;
+
+	/*
+	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
+	 * to satisfy versions of X which crash if the mmap fails: that
+	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
+	 */
+	if (vma->vm_file != file)
+		goto out_put;
+
+	rc = -EINVAL;
+	if (of->mmapped && of->vm_ops != vma->vm_ops)
+		goto out_put;
+
+	/*
+	 * It is not possible to successfully wrap close.
+	 * So error if someone is trying to use close.
+	 */
+	rc = -EINVAL;
+	if (vma->vm_ops && vma->vm_ops->close)
+		goto out_put;
+
+	rc = 0;
+	of->mmapped = 1;
+	of->vm_ops = vma->vm_ops;
+	vma->vm_ops = &kernfs_vm_ops;
+out_put:
+	kernfs_put_active(of->kn);
+out_unlock:
+	mutex_unlock(&of->mutex);
+
+	return rc;
+}
+
+/**
+ *	kernfs_get_open_node - get or create kernfs_open_node
+ *	@kn: target kernfs_node
+ *	@of: kernfs_open_file for this instance of open
+ *
+ *	If @kn->attr.open exists, increment its reference count; otherwise,
+ *	create one.  @of is chained to the files list.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	0 on success, -errno on failure.
+ */
+static int kernfs_get_open_node(struct kernfs_node *kn,
+				struct kernfs_open_file *of)
+{
+	struct kernfs_open_node *on, *new_on = NULL;
+
+ retry:
+	mutex_lock(&kernfs_open_file_mutex);
+	spin_lock_irq(&kernfs_open_node_lock);
+
+	if (!kn->attr.open && new_on) {
+		kn->attr.open = new_on;
+		new_on = NULL;
+	}
+
+	on = kn->attr.open;
+	if (on) {
+		atomic_inc(&on->refcnt);
+		list_add_tail(&of->list, &on->files);
+	}
+
+	spin_unlock_irq(&kernfs_open_node_lock);
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	if (on) {
+		kfree(new_on);
+		return 0;
+	}
+
+	/* not there, initialize a new one and retry */
+	new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
+	if (!new_on)
+		return -ENOMEM;
+
+	atomic_set(&new_on->refcnt, 0);
+	atomic_set(&new_on->event, 1);
+	init_waitqueue_head(&new_on->poll);
+	INIT_LIST_HEAD(&new_on->files);
+	goto retry;
+}
+
+/**
+ *	kernfs_put_open_node - put kernfs_open_node
+ *	@kn: target kernfs_nodet
+ *	@of: associated kernfs_open_file
+ *
+ *	Put @kn->attr.open and unlink @of from the files list.  If
+ *	reference count reaches zero, disassociate and free it.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void kernfs_put_open_node(struct kernfs_node *kn,
+				 struct kernfs_open_file *of)
+{
+	struct kernfs_open_node *on = kn->attr.open;
+	unsigned long flags;
+
+	mutex_lock(&kernfs_open_file_mutex);
+	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+	if (of)
+		list_del(&of->list);
+
+	if (atomic_dec_and_test(&on->refcnt))
+		kn->attr.open = NULL;
+	else
+		on = NULL;
+
+	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	kfree(on);
+}
+
+static int kernfs_fop_open(struct inode *inode, struct file *file)
+{
+	struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+	struct kernfs_root *root = kernfs_root(kn);
+	const struct kernfs_ops *ops;
+	struct kernfs_open_file *of;
+	bool has_read, has_write, has_mmap;
+	int error = -EACCES;
+
+	if (!kernfs_get_active(kn))
+		return -ENODEV;
+
+	ops = kernfs_ops(kn);
+
+	has_read = ops->seq_show || ops->read || ops->mmap;
+	has_write = ops->write || ops->mmap;
+	has_mmap = ops->mmap;
+
+	/* see the flag definition for details */
+	if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
+		if ((file->f_mode & FMODE_WRITE) &&
+		    (!(inode->i_mode & S_IWUGO) || !has_write))
+			goto err_out;
+
+		if ((file->f_mode & FMODE_READ) &&
+		    (!(inode->i_mode & S_IRUGO) || !has_read))
+			goto err_out;
+	}
+
+	/* allocate a kernfs_open_file for the file */
+	error = -ENOMEM;
+	of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
+	if (!of)
+		goto err_out;
+
+	/*
+	 * The following is done to give a different lockdep key to
+	 * @of->mutex for files which implement mmap.  This is a rather
+	 * crude way to avoid false positive lockdep warning around
+	 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
+	 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
+	 * which mm->mmap_sem nests, while holding @of->mutex.  As each
+	 * open file has a separate mutex, it's okay as long as those don't
+	 * happen on the same file.  At this point, we can't easily give
+	 * each file a separate locking class.  Let's differentiate on
+	 * whether the file has mmap or not for now.
+	 *
+	 * Both paths of the branch look the same.  They're supposed to
+	 * look that way and give @of->mutex different static lockdep keys.
+	 */
+	if (has_mmap)
+		mutex_init(&of->mutex);
+	else
+		mutex_init(&of->mutex);
+
+	of->kn = kn;
+	of->file = file;
+
+	/*
+	 * Write path needs to atomic_write_len outside active reference.
+	 * Cache it in open_file.  See kernfs_fop_write() for details.
+	 */
+	of->atomic_write_len = ops->atomic_write_len;
+
+	error = -EINVAL;
+	/*
+	 * ->seq_show is incompatible with ->prealloc,
+	 * as seq_read does its own allocation.
+	 * ->read must be used instead.
+	 */
+	if (ops->prealloc && ops->seq_show)
+		goto err_free;
+	if (ops->prealloc) {
+		int len = of->atomic_write_len ?: PAGE_SIZE;
+		of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
+		error = -ENOMEM;
+		if (!of->prealloc_buf)
+			goto err_free;
+	}
+
+	/*
+	 * Always instantiate seq_file even if read access doesn't use
+	 * seq_file or is not requested.  This unifies private data access
+	 * and readable regular files are the vast majority anyway.
+	 */
+	if (ops->seq_show)
+		error = seq_open(file, &kernfs_seq_ops);
+	else
+		error = seq_open(file, NULL);
+	if (error)
+		goto err_free;
+
+	((struct seq_file *)file->private_data)->private = of;
+
+	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
+	if (file->f_mode & FMODE_WRITE)
+		file->f_mode |= FMODE_PWRITE;
+
+	/* make sure we have open node struct */
+	error = kernfs_get_open_node(kn, of);
+	if (error)
+		goto err_close;
+
+	/* open succeeded, put active references */
+	kernfs_put_active(kn);
+	return 0;
+
+err_close:
+	seq_release(inode, file);
+err_free:
+	kfree(of->prealloc_buf);
+	kfree(of);
+err_out:
+	kernfs_put_active(kn);
+	return error;
+}
+
+static int kernfs_fop_release(struct inode *inode, struct file *filp)
+{
+	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+	struct kernfs_open_file *of = kernfs_of(filp);
+
+	kernfs_put_open_node(kn, of);
+	seq_release(inode, filp);
+	kfree(of->prealloc_buf);
+	kfree(of);
+
+	return 0;
+}
+
+void kernfs_unmap_bin_file(struct kernfs_node *kn)
+{
+	struct kernfs_open_node *on;
+	struct kernfs_open_file *of;
+
+	if (!(kn->flags & KERNFS_HAS_MMAP))
+		return;
+
+	spin_lock_irq(&kernfs_open_node_lock);
+	on = kn->attr.open;
+	if (on)
+		atomic_inc(&on->refcnt);
+	spin_unlock_irq(&kernfs_open_node_lock);
+	if (!on)
+		return;
+
+	mutex_lock(&kernfs_open_file_mutex);
+	list_for_each_entry(of, &on->files, list) {
+		struct inode *inode = file_inode(of->file);
+		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+	}
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	kernfs_put_open_node(kn, NULL);
+}
+
+/*
+ * Kernfs attribute files are pollable.  The idea is that you read
+ * the content and then you use 'poll' or 'select' to wait for
+ * the content to change.  When the content changes (assuming the
+ * manager for the kobject supports notification), poll will
+ * return POLLERR|POLLPRI, and select will return the fd whether
+ * it is waiting for read, write, or exceptions.
+ * Once poll/select indicates that the value has changed, you
+ * need to close and re-open the file, or seek to 0 and read again.
+ * Reminder: this only works for attributes which actively support
+ * it, and it is not possible to test an attribute from userspace
+ * to see if it supports poll (Neither 'poll' nor 'select' return
+ * an appropriate error code).  When in doubt, set a suitable timeout value.
+ */
+static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
+{
+	struct kernfs_open_file *of = kernfs_of(filp);
+	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+	struct kernfs_open_node *on = kn->attr.open;
+
+	/* need parent for the kobj, grab both */
+	if (!kernfs_get_active(kn))
+		goto trigger;
+
+	poll_wait(filp, &on->poll, wait);
+
+	kernfs_put_active(kn);
+
+	if (of->event != atomic_read(&on->event))
+		goto trigger;
+
+	return DEFAULT_POLLMASK;
+
+ trigger:
+	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
+}
+
+static void kernfs_notify_workfn(struct work_struct *work)
+{
+	struct kernfs_node *kn;
+	struct kernfs_open_node *on;
+	struct kernfs_super_info *info;
+repeat:
+	/* pop one off the notify_list */
+	spin_lock_irq(&kernfs_notify_lock);
+	kn = kernfs_notify_list;
+	if (kn == KERNFS_NOTIFY_EOL) {
+		spin_unlock_irq(&kernfs_notify_lock);
+		return;
+	}
+	kernfs_notify_list = kn->attr.notify_next;
+	kn->attr.notify_next = NULL;
+	spin_unlock_irq(&kernfs_notify_lock);
+
+	/* kick poll */
+	spin_lock_irq(&kernfs_open_node_lock);
+
+	on = kn->attr.open;
+	if (on) {
+		atomic_inc(&on->event);
+		wake_up_interruptible(&on->poll);
+	}
+
+	spin_unlock_irq(&kernfs_open_node_lock);
+
+	/* kick fsnotify */
+	mutex_lock(&kernfs_mutex);
+
+	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
+		struct inode *inode;
+		struct dentry *dentry;
+
+		inode = ilookup(info->sb, kn->ino);
+		if (!inode)
+			continue;
+
+		dentry = d_find_any_alias(inode);
+		if (dentry) {
+			fsnotify_parent(NULL, dentry, FS_MODIFY);
+			fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
+				 NULL, 0);
+			dput(dentry);
+		}
+
+		iput(inode);
+	}
+
+	mutex_unlock(&kernfs_mutex);
+	kernfs_put(kn);
+	goto repeat;
+}
+
+/**
+ * kernfs_notify - notify a kernfs file
+ * @kn: file to notify
+ *
+ * Notify @kn such that poll(2) on @kn wakes up.  Maybe be called from any
+ * context.
+ */
+void kernfs_notify(struct kernfs_node *kn)
+{
+	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
+	unsigned long flags;
+
+	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+		return;
+
+	spin_lock_irqsave(&kernfs_notify_lock, flags);
+	if (!kn->attr.notify_next) {
+		kernfs_get(kn);
+		kn->attr.notify_next = kernfs_notify_list;
+		kernfs_notify_list = kn;
+		schedule_work(&kernfs_notify_work);
+	}
+	spin_unlock_irqrestore(&kernfs_notify_lock, flags);
+}
+EXPORT_SYMBOL_GPL(kernfs_notify);
+
+const struct file_operations kernfs_file_fops = {
+	.read		= kernfs_fop_read,
+	.write		= kernfs_fop_write,
+	.llseek		= generic_file_llseek,
+	.mmap		= kernfs_fop_mmap,
+	.open		= kernfs_fop_open,
+	.release	= kernfs_fop_release,
+	.poll		= kernfs_fop_poll,
+};
+
+/**
+ * __kernfs_create_file - kernfs internal function to create a file
+ * @parent: directory to create the file in
+ * @name: name of the file
+ * @mode: mode of the file
+ * @size: size of the file
+ * @ops: kernfs operations for the file
+ * @priv: private data for the file
+ * @ns: optional namespace tag of the file
+ * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
+ *
+ * Returns the created node on success, ERR_PTR() value on error.
+ */
+struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
+					 const char *name,
+					 umode_t mode, loff_t size,
+					 const struct kernfs_ops *ops,
+					 void *priv, const void *ns,
+					 struct lock_class_key *key)
+{
+	struct kernfs_node *kn;
+	unsigned flags;
+	int rc;
+
+	flags = KERNFS_FILE;
+
+	kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	kn->attr.ops = ops;
+	kn->attr.size = size;
+	kn->ns = ns;
+	kn->priv = priv;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (key) {
+		lockdep_init_map(&kn->dep_map, "s_active", key, 0);
+		kn->flags |= KERNFS_LOCKDEP;
+	}
+#endif
+
+	/*
+	 * kn->attr.ops is accesible only while holding active ref.  We
+	 * need to know whether some ops are implemented outside active
+	 * ref.  Cache their existence in flags.
+	 */
+	if (ops->seq_show)
+		kn->flags |= KERNFS_HAS_SEQ_SHOW;
+	if (ops->mmap)
+		kn->flags |= KERNFS_HAS_MMAP;
+
+	rc = kernfs_add_one(kn);
+	if (rc) {
+		kernfs_put(kn);
+		return ERR_PTR(rc);
+	}
+	return kn;
+}
diff --git a/kernel/fs/kernfs/inode.c b/kernel/fs/kernfs/inode.c
new file mode 100644
index 000000000..756dd56aa
--- /dev/null
+++ b/kernel/fs/kernfs/inode.c
@@ -0,0 +1,372 @@
+/*
+ * fs/kernfs/inode.c - kernfs inode implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
+#include "kernfs-internal.h"
+
+static const struct address_space_operations kernfs_aops = {
+	.readpage	= simple_readpage,
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
+};
+
+static const struct inode_operations kernfs_iops = {
+	.permission	= kernfs_iop_permission,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+};
+
+static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
+{
+	static DEFINE_MUTEX(iattr_mutex);
+	struct kernfs_iattrs *ret;
+	struct iattr *iattrs;
+
+	mutex_lock(&iattr_mutex);
+
+	if (kn->iattr)
+		goto out_unlock;
+
+	kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+	if (!kn->iattr)
+		goto out_unlock;
+	iattrs = &kn->iattr->ia_iattr;
+
+	/* assign default attributes */
+	iattrs->ia_mode = kn->mode;
+	iattrs->ia_uid = GLOBAL_ROOT_UID;
+	iattrs->ia_gid = GLOBAL_ROOT_GID;
+	iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+	simple_xattrs_init(&kn->iattr->xattrs);
+out_unlock:
+	ret = kn->iattr;
+	mutex_unlock(&iattr_mutex);
+	return ret;
+}
+
+static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+{
+	struct kernfs_iattrs *attrs;
+	struct iattr *iattrs;
+	unsigned int ia_valid = iattr->ia_valid;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	iattrs = &attrs->ia_iattr;
+
+	if (ia_valid & ATTR_UID)
+		iattrs->ia_uid = iattr->ia_uid;
+	if (ia_valid & ATTR_GID)
+		iattrs->ia_gid = iattr->ia_gid;
+	if (ia_valid & ATTR_ATIME)
+		iattrs->ia_atime = iattr->ia_atime;
+	if (ia_valid & ATTR_MTIME)
+		iattrs->ia_mtime = iattr->ia_mtime;
+	if (ia_valid & ATTR_CTIME)
+		iattrs->ia_ctime = iattr->ia_ctime;
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = iattr->ia_mode;
+		iattrs->ia_mode = kn->mode = mode;
+	}
+	return 0;
+}
+
+/**
+ * kernfs_setattr - set iattr on a node
+ * @kn: target node
+ * @iattr: iattr to set
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+{
+	int ret;
+
+	mutex_lock(&kernfs_mutex);
+	ret = __kernfs_setattr(kn, iattr);
+	mutex_unlock(&kernfs_mutex);
+	return ret;
+}
+
+int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct kernfs_node *kn = dentry->d_fsdata;
+	int error;
+
+	if (!kn)
+		return -EINVAL;
+
+	mutex_lock(&kernfs_mutex);
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		goto out;
+
+	error = __kernfs_setattr(kn, iattr);
+	if (error)
+		goto out;
+
+	/* this ignores size changes */
+	setattr_copy(inode, iattr);
+
+out:
+	mutex_unlock(&kernfs_mutex);
+	return error;
+}
+
+static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
+				  u32 *secdata_len)
+{
+	struct kernfs_iattrs *attrs;
+	void *old_secdata;
+	size_t old_secdata_len;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	old_secdata = attrs->ia_secdata;
+	old_secdata_len = attrs->ia_secdata_len;
+
+	attrs->ia_secdata = *secdata;
+	attrs->ia_secdata_len = *secdata_len;
+
+	*secdata = old_secdata;
+	*secdata_len = old_secdata_len;
+	return 0;
+}
+
+int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+	void *secdata;
+	int error;
+	u32 secdata_len = 0;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+		error = security_inode_setsecurity(d_inode(dentry), suffix,
+						value, size, flags);
+		if (error)
+			return error;
+		error = security_inode_getsecctx(d_inode(dentry),
+						&secdata, &secdata_len);
+		if (error)
+			return error;
+
+		mutex_lock(&kernfs_mutex);
+		error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
+		mutex_unlock(&kernfs_mutex);
+
+		if (secdata)
+			security_release_secctx(secdata, secdata_len);
+		return error;
+	} else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+		return simple_xattr_set(&attrs->xattrs, name, value, size,
+					flags);
+	}
+
+	return -EINVAL;
+}
+
+int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_remove(&attrs->xattrs, name);
+}
+
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
+			    size_t size)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_get(&attrs->xattrs, name, buf, size);
+}
+
+ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_list(&attrs->xattrs, buf, size);
+}
+
+static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
+{
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+}
+
+static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
+{
+	inode->i_uid = iattr->ia_uid;
+	inode->i_gid = iattr->ia_gid;
+	inode->i_atime = iattr->ia_atime;
+	inode->i_mtime = iattr->ia_mtime;
+	inode->i_ctime = iattr->ia_ctime;
+}
+
+static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
+{
+	struct kernfs_iattrs *attrs = kn->iattr;
+
+	inode->i_mode = kn->mode;
+	if (attrs) {
+		/*
+		 * kernfs_node has non-default attributes get them from
+		 * persistent copy in kernfs_node.
+		 */
+		set_inode_attr(inode, &attrs->ia_iattr);
+		security_inode_notifysecctx(inode, attrs->ia_secdata,
+					    attrs->ia_secdata_len);
+	}
+
+	if (kernfs_type(kn) == KERNFS_DIR)
+		set_nlink(inode, kn->dir.subdirs + 2);
+}
+
+int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		   struct kstat *stat)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct inode *inode = d_inode(dentry);
+
+	mutex_lock(&kernfs_mutex);
+	kernfs_refresh_inode(kn, inode);
+	mutex_unlock(&kernfs_mutex);
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
+{
+	kernfs_get(kn);
+	inode->i_private = kn;
+	inode->i_mapping->a_ops = &kernfs_aops;
+	inode->i_op = &kernfs_iops;
+
+	set_default_inode_attr(inode, kn->mode);
+	kernfs_refresh_inode(kn, inode);
+
+	/* initialize inode according to type */
+	switch (kernfs_type(kn)) {
+	case KERNFS_DIR:
+		inode->i_op = &kernfs_dir_iops;
+		inode->i_fop = &kernfs_dir_fops;
+		if (kn->flags & KERNFS_EMPTY_DIR)
+			make_empty_dir_inode(inode);
+		break;
+	case KERNFS_FILE:
+		inode->i_size = kn->attr.size;
+		inode->i_fop = &kernfs_file_fops;
+		break;
+	case KERNFS_LINK:
+		inode->i_op = &kernfs_symlink_iops;
+		break;
+	default:
+		BUG();
+	}
+
+	unlock_new_inode(inode);
+}
+
+/**
+ *	kernfs_get_inode - get inode for kernfs_node
+ *	@sb: super block
+ *	@kn: kernfs_node to allocate inode for
+ *
+ *	Get inode for @kn.  If such inode doesn't exist, a new inode is
+ *	allocated and basics are initialized.  New inode is returned
+ *	locked.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	Pointer to allocated inode on success, NULL on failure.
+ */
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
+{
+	struct inode *inode;
+
+	inode = iget_locked(sb, kn->ino);
+	if (inode && (inode->i_state & I_NEW))
+		kernfs_init_inode(kn, inode);
+
+	return inode;
+}
+
+/*
+ * The kernfs_node serves as both an inode and a directory entry for
+ * kernfs.  To prevent the kernfs inode numbers from being freed
+ * prematurely we take a reference to kernfs_node from the kernfs inode.  A
+ * super_operations.evict_inode() implementation is needed to drop that
+ * reference upon inode destruction.
+ */
+void kernfs_evict_inode(struct inode *inode)
+{
+	struct kernfs_node *kn = inode->i_private;
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	kernfs_put(kn);
+}
+
+int kernfs_iop_permission(struct inode *inode, int mask)
+{
+	struct kernfs_node *kn;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	kn = inode->i_private;
+
+	mutex_lock(&kernfs_mutex);
+	kernfs_refresh_inode(kn, inode);
+	mutex_unlock(&kernfs_mutex);
+
+	return generic_permission(inode, mask);
+}
diff --git a/kernel/fs/kernfs/kernfs-internal.h b/kernel/fs/kernfs/kernfs-internal.h
new file mode 100644
index 000000000..af9fa7499
--- /dev/null
+++ b/kernel/fs/kernfs/kernfs-internal.h
@@ -0,0 +1,119 @@
+/*
+ * fs/kernfs/kernfs-internal.h - kernfs internal header file
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef __KERNFS_INTERNAL_H
+#define __KERNFS_INTERNAL_H
+
+#include <linux/lockdep.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/xattr.h>
+
+#include <linux/kernfs.h>
+
+struct kernfs_iattrs {
+	struct iattr		ia_iattr;
+	void			*ia_secdata;
+	u32			ia_secdata_len;
+
+	struct simple_xattrs	xattrs;
+};
+
+/* +1 to avoid triggering overflow warning when negating it */
+#define KN_DEACTIVATED_BIAS		(INT_MIN + 1)
+
+/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
+
+/**
+ * kernfs_root - find out the kernfs_root a kernfs_node belongs to
+ * @kn: kernfs_node of interest
+ *
+ * Return the kernfs_root @kn belongs to.
+ */
+static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
+{
+	/* if parent exists, it's always a dir; otherwise, @sd is a dir */
+	if (kn->parent)
+		kn = kn->parent;
+	return kn->dir.root;
+}
+
+/*
+ * mount.c
+ */
+struct kernfs_super_info {
+	struct super_block	*sb;
+
+	/*
+	 * The root associated with this super_block.  Each super_block is
+	 * identified by the root and ns it's associated with.
+	 */
+	struct kernfs_root	*root;
+
+	/*
+	 * Each sb is associated with one namespace tag, currently the
+	 * network namespace of the task which mounted this kernfs
+	 * instance.  If multiple tags become necessary, make the following
+	 * an array and compare kernfs_node tag against every entry.
+	 */
+	const void		*ns;
+
+	/* anchored at kernfs_root->supers, protected by kernfs_mutex */
+	struct list_head	node;
+};
+#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
+
+extern const struct super_operations kernfs_sops;
+extern struct kmem_cache *kernfs_node_cache;
+
+/*
+ * inode.c
+ */
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
+void kernfs_evict_inode(struct inode *inode);
+int kernfs_iop_permission(struct inode *inode, int mask);
+int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
+int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat);
+int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
+			size_t size, int flags);
+int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
+			    size_t size);
+ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
+
+/*
+ * dir.c
+ */
+extern struct mutex kernfs_mutex;
+extern const struct dentry_operations kernfs_dops;
+extern const struct file_operations kernfs_dir_fops;
+extern const struct inode_operations kernfs_dir_iops;
+
+struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
+void kernfs_put_active(struct kernfs_node *kn);
+int kernfs_add_one(struct kernfs_node *kn);
+struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
+				    const char *name, umode_t mode,
+				    unsigned flags);
+
+/*
+ * file.c
+ */
+extern const struct file_operations kernfs_file_fops;
+
+void kernfs_unmap_bin_file(struct kernfs_node *kn);
+
+/*
+ * symlink.c
+ */
+extern const struct inode_operations kernfs_symlink_iops;
+
+#endif	/* __KERNFS_INTERNAL_H */
diff --git a/kernel/fs/kernfs/mount.c b/kernel/fs/kernfs/mount.c
new file mode 100644
index 000000000..8eaf41718
--- /dev/null
+++ b/kernel/fs/kernfs/mount.c
@@ -0,0 +1,249 @@
+/*
+ * fs/kernfs/mount.c - kernfs mount implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+
+#include "kernfs-internal.h"
+
+struct kmem_cache *kernfs_node_cache;
+
+static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct kernfs_root *root = kernfs_info(sb)->root;
+	struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+	if (scops && scops->remount_fs)
+		return scops->remount_fs(root, flags, data);
+	return 0;
+}
+
+static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
+{
+	struct kernfs_root *root = kernfs_root(dentry->d_fsdata);
+	struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+	if (scops && scops->show_options)
+		return scops->show_options(sf, root);
+	return 0;
+}
+
+const struct super_operations kernfs_sops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= kernfs_evict_inode,
+
+	.remount_fs	= kernfs_sop_remount_fs,
+	.show_options	= kernfs_sop_show_options,
+};
+
+/**
+ * kernfs_root_from_sb - determine kernfs_root associated with a super_block
+ * @sb: the super_block in question
+ *
+ * Return the kernfs_root associated with @sb.  If @sb is not a kernfs one,
+ * %NULL is returned.
+ */
+struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
+{
+	if (sb->s_op == &kernfs_sops)
+		return kernfs_info(sb)->root;
+	return NULL;
+}
+
+static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
+{
+	struct kernfs_super_info *info = kernfs_info(sb);
+	struct inode *inode;
+	struct dentry *root;
+
+	info->sb = sb;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = magic;
+	sb->s_op = &kernfs_sops;
+	sb->s_time_gran = 1;
+
+	/* get root inode, initialize and unlock it */
+	mutex_lock(&kernfs_mutex);
+	inode = kernfs_get_inode(sb, info->root->kn);
+	mutex_unlock(&kernfs_mutex);
+	if (!inode) {
+		pr_debug("kernfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	/* instantiate and link root dentry */
+	root = d_make_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n", __func__);
+		return -ENOMEM;
+	}
+	kernfs_get(info->root->kn);
+	root->d_fsdata = info->root->kn;
+	sb->s_root = root;
+	sb->s_d_op = &kernfs_dops;
+	return 0;
+}
+
+static int kernfs_test_super(struct super_block *sb, void *data)
+{
+	struct kernfs_super_info *sb_info = kernfs_info(sb);
+	struct kernfs_super_info *info = data;
+
+	return sb_info->root == info->root && sb_info->ns == info->ns;
+}
+
+static int kernfs_set_super(struct super_block *sb, void *data)
+{
+	int error;
+	error = set_anon_super(sb, data);
+	if (!error)
+		sb->s_fs_info = data;
+	return error;
+}
+
+/**
+ * kernfs_super_ns - determine the namespace tag of a kernfs super_block
+ * @sb: super_block of interest
+ *
+ * Return the namespace tag associated with kernfs super_block @sb.
+ */
+const void *kernfs_super_ns(struct super_block *sb)
+{
+	struct kernfs_super_info *info = kernfs_info(sb);
+
+	return info->ns;
+}
+
+/**
+ * kernfs_mount_ns - kernfs mount helper
+ * @fs_type: file_system_type of the fs being mounted
+ * @flags: mount flags specified for the mount
+ * @root: kernfs_root of the hierarchy being mounted
+ * @magic: file system specific magic number
+ * @new_sb_created: tell the caller if we allocated a new superblock
+ * @ns: optional namespace tag of the mount
+ *
+ * This is to be called from each kernfs user's file_system_type->mount()
+ * implementation, which should pass through the specified @fs_type and
+ * @flags, and specify the hierarchy and namespace tag to mount via @root
+ * and @ns, respectively.
+ *
+ * The return value can be passed to the vfs layer verbatim.
+ */
+struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
+				struct kernfs_root *root, unsigned long magic,
+				bool *new_sb_created, const void *ns)
+{
+	struct super_block *sb;
+	struct kernfs_super_info *info;
+	int error;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	info->root = root;
+	info->ns = ns;
+
+	sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+	if (IS_ERR(sb) || sb->s_fs_info != info)
+		kfree(info);
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
+
+	if (new_sb_created)
+		*new_sb_created = !sb->s_root;
+
+	if (!sb->s_root) {
+		struct kernfs_super_info *info = kernfs_info(sb);
+
+		error = kernfs_fill_super(sb, magic);
+		if (error) {
+			deactivate_locked_super(sb);
+			return ERR_PTR(error);
+		}
+		sb->s_flags |= MS_ACTIVE;
+
+		mutex_lock(&kernfs_mutex);
+		list_add(&info->node, &root->supers);
+		mutex_unlock(&kernfs_mutex);
+	}
+
+	return dget(sb->s_root);
+}
+
+/**
+ * kernfs_kill_sb - kill_sb for kernfs
+ * @sb: super_block being killed
+ *
+ * This can be used directly for file_system_type->kill_sb().  If a kernfs
+ * user needs extra cleanup, it can implement its own kill_sb() and call
+ * this function at the end.
+ */
+void kernfs_kill_sb(struct super_block *sb)
+{
+	struct kernfs_super_info *info = kernfs_info(sb);
+	struct kernfs_node *root_kn = sb->s_root->d_fsdata;
+
+	mutex_lock(&kernfs_mutex);
+	list_del(&info->node);
+	mutex_unlock(&kernfs_mutex);
+
+	/*
+	 * Remove the superblock from fs_supers/s_instances
+	 * so we can't find it, before freeing kernfs_super_info.
+	 */
+	kill_anon_super(sb);
+	kfree(info);
+	kernfs_put(root_kn);
+}
+
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @kernfs_root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations.  This can be used to block ->kill_sb() which may be useful
+ * for kernfs users which dynamically manage superblocks.
+ *
+ * Returns NULL if there's no superblock associated to this kernfs_root, or
+ * -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+	struct kernfs_super_info *info;
+	struct super_block *sb = NULL;
+
+	mutex_lock(&kernfs_mutex);
+	list_for_each_entry(info, &root->supers, node) {
+		if (info->ns == ns) {
+			sb = info->sb;
+			if (!atomic_inc_not_zero(&info->sb->s_active))
+				sb = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	mutex_unlock(&kernfs_mutex);
+	return sb;
+}
+
+void __init kernfs_init(void)
+{
+	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
+					      sizeof(struct kernfs_node),
+					      0, SLAB_PANIC, NULL);
+}
diff --git a/kernel/fs/kernfs/symlink.c b/kernel/fs/kernfs/symlink.c
new file mode 100644
index 000000000..8a198898e
--- /dev/null
+++ b/kernel/fs/kernfs/symlink.c
@@ -0,0 +1,147 @@
+/*
+ * fs/kernfs/symlink.c - kernfs symlink implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/namei.h>
+
+#include "kernfs-internal.h"
+
+/**
+ * kernfs_create_link - create a symlink
+ * @parent: directory to create the symlink in
+ * @name: name of the symlink
+ * @target: target node for the symlink to point to
+ *
+ * Returns the created node on success, ERR_PTR() value on error.
+ */
+struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
+				       const char *name,
+				       struct kernfs_node *target)
+{
+	struct kernfs_node *kn;
+	int error;
+
+	kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	if (kernfs_ns_enabled(parent))
+		kn->ns = target->ns;
+	kn->symlink.target_kn = target;
+	kernfs_get(target);	/* ref owned by symlink */
+
+	error = kernfs_add_one(kn);
+	if (!error)
+		return kn;
+
+	kernfs_put(kn);
+	return ERR_PTR(error);
+}
+
+static int kernfs_get_target_path(struct kernfs_node *parent,
+				  struct kernfs_node *target, char *path)
+{
+	struct kernfs_node *base, *kn;
+	char *s = path;
+	int len = 0;
+
+	/* go up to the root, stop at the base */
+	base = parent;
+	while (base->parent) {
+		kn = target->parent;
+		while (kn->parent && base != kn)
+			kn = kn->parent;
+
+		if (base == kn)
+			break;
+
+		strcpy(s, "../");
+		s += 3;
+		base = base->parent;
+	}
+
+	/* determine end of target string for reverse fillup */
+	kn = target;
+	while (kn->parent && kn != base) {
+		len += strlen(kn->name) + 1;
+		kn = kn->parent;
+	}
+
+	/* check limits */
+	if (len < 2)
+		return -EINVAL;
+	len--;
+	if ((s - path) + len > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	/* reverse fillup of target string from target to base */
+	kn = target;
+	while (kn->parent && kn != base) {
+		int slen = strlen(kn->name);
+
+		len -= slen;
+		strncpy(s + len, kn->name, slen);
+		if (len)
+			s[--len] = '/';
+
+		kn = kn->parent;
+	}
+
+	return 0;
+}
+
+static int kernfs_getlink(struct dentry *dentry, char *path)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_node *parent = kn->parent;
+	struct kernfs_node *target = kn->symlink.target_kn;
+	int error;
+
+	mutex_lock(&kernfs_mutex);
+	error = kernfs_get_target_path(parent, target, path);
+	mutex_unlock(&kernfs_mutex);
+
+	return error;
+}
+
+static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int error = -ENOMEM;
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	if (page) {
+		error = kernfs_getlink(dentry, (char *) page);
+		if (error < 0)
+			free_page((unsigned long)page);
+	}
+	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
+	return NULL;
+}
+
+static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *page = nd_get_link(nd);
+	if (!IS_ERR(page))
+		free_page((unsigned long)page);
+}
+
+const struct inode_operations kernfs_symlink_iops = {
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+	.readlink	= generic_readlink,
+	.follow_link	= kernfs_iop_follow_link,
+	.put_link	= kernfs_iop_put_link,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.permission	= kernfs_iop_permission,
+};
diff --git a/kernel/fs/libfs.c b/kernel/fs/libfs.c
new file mode 100644
index 000000000..02813592e
--- /dev/null
+++ b/kernel/fs/libfs.c
@@ -0,0 +1,1191 @@
+/*
+ *	fs/libfs.c
+ *	Library for filesystems writers.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/export.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/quotaops.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h> /* sync_mapping_buffers */
+
+#include <asm/uaccess.h>
+
+#include "internal.h"
+
+static inline int simple_positive(struct dentry *dentry)
+{
+	return d_really_is_positive(dentry) && !d_unhashed(dentry);
+}
+
+int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		   struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	generic_fillattr(inode, stat);
+	stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+	return 0;
+}
+EXPORT_SYMBOL(simple_getattr);
+
+int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	buf->f_type = dentry->d_sb->s_magic;
+	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_namelen = NAME_MAX;
+	return 0;
+}
+EXPORT_SYMBOL(simple_statfs);
+
+/*
+ * Retaining negative dentries for an in-memory filesystem just wastes
+ * memory and lookup time: arrange for them to be deleted immediately.
+ */
+int always_delete_dentry(const struct dentry *dentry)
+{
+	return 1;
+}
+EXPORT_SYMBOL(always_delete_dentry);
+
+const struct dentry_operations simple_dentry_operations = {
+	.d_delete = always_delete_dentry,
+};
+EXPORT_SYMBOL(simple_dentry_operations);
+
+/*
+ * Lookup the data. This is trivial - if the dentry didn't already
+ * exist, we know it is negative.  Set d_op to delete negative dentries.
+ */
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	if (!dentry->d_sb->s_d_op)
+		d_set_d_op(dentry, &simple_dentry_operations);
+	d_add(dentry, NULL);
+	return NULL;
+}
+EXPORT_SYMBOL(simple_lookup);
+
+int dcache_dir_open(struct inode *inode, struct file *file)
+{
+	static struct qstr cursor_name = QSTR_INIT(".", 1);
+
+	file->private_data = d_alloc(file->f_path.dentry, &cursor_name);
+
+	return file->private_data ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(dcache_dir_open);
+
+int dcache_dir_close(struct inode *inode, struct file *file)
+{
+	dput(file->private_data);
+	return 0;
+}
+EXPORT_SYMBOL(dcache_dir_close);
+
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	mutex_lock(&d_inode(dentry)->i_mutex);
+	switch (whence) {
+		case 1:
+			offset += file->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			mutex_unlock(&d_inode(dentry)->i_mutex);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {
+			struct list_head *p;
+			struct dentry *cursor = file->private_data;
+			loff_t n = file->f_pos - 2;
+
+			spin_lock(&dentry->d_lock);
+			/* d_lock not required for cursor */
+			list_del(&cursor->d_child);
+			p = dentry->d_subdirs.next;
+			while (n && p != &dentry->d_subdirs) {
+				struct dentry *next;
+				next = list_entry(p, struct dentry, d_child);
+				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+				if (simple_positive(next))
+					n--;
+				spin_unlock(&next->d_lock);
+				p = p->next;
+			}
+			list_add_tail(&cursor->d_child, p);
+			spin_unlock(&dentry->d_lock);
+		}
+	}
+	mutex_unlock(&d_inode(dentry)->i_mutex);
+	return offset;
+}
+EXPORT_SYMBOL(dcache_dir_lseek);
+
+/* Relationship between i_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct inode *inode)
+{
+	return (inode->i_mode >> 12) & 15;
+}
+
+/*
+ * Directory is locked and all positive dentries in it are safe, since
+ * for ramfs-type trees they can't go away without unlink() or rmdir(),
+ * both impossible due to the lock on directory.
+ */
+
+int dcache_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct dentry *cursor = file->private_data;
+	struct list_head *p, *q = &cursor->d_child;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	spin_lock(&dentry->d_lock);
+	if (ctx->pos == 2)
+		list_move(q, &dentry->d_subdirs);
+
+	for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
+		struct dentry *next = list_entry(p, struct dentry, d_child);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		if (!simple_positive(next)) {
+			spin_unlock(&next->d_lock);
+			continue;
+		}
+
+		spin_unlock(&next->d_lock);
+		spin_unlock(&dentry->d_lock);
+		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+			      d_inode(next)->i_ino, dt_type(d_inode(next))))
+			return 0;
+		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		/* next is still alive */
+		list_move(q, p);
+		spin_unlock(&next->d_lock);
+		p = q;
+		ctx->pos++;
+	}
+	spin_unlock(&dentry->d_lock);
+	return 0;
+}
+EXPORT_SYMBOL(dcache_readdir);
+
+ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
+{
+	return -EISDIR;
+}
+EXPORT_SYMBOL(generic_read_dir);
+
+const struct file_operations simple_dir_operations = {
+	.open		= dcache_dir_open,
+	.release	= dcache_dir_close,
+	.llseek		= dcache_dir_lseek,
+	.read		= generic_read_dir,
+	.iterate	= dcache_readdir,
+	.fsync		= noop_fsync,
+};
+EXPORT_SYMBOL(simple_dir_operations);
+
+const struct inode_operations simple_dir_inode_operations = {
+	.lookup		= simple_lookup,
+};
+EXPORT_SYMBOL(simple_dir_inode_operations);
+
+static const struct super_operations simple_super_operations = {
+	.statfs		= simple_statfs,
+};
+
+/*
+ * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
+ * will never be mountable)
+ */
+struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
+	const struct super_operations *ops,
+	const struct dentry_operations *dops, unsigned long magic)
+{
+	struct super_block *s;
+	struct dentry *dentry;
+	struct inode *root;
+	struct qstr d_name = QSTR_INIT(name, strlen(name));
+
+	s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL);
+	if (IS_ERR(s))
+		return ERR_CAST(s);
+
+	s->s_maxbytes = MAX_LFS_FILESIZE;
+	s->s_blocksize = PAGE_SIZE;
+	s->s_blocksize_bits = PAGE_SHIFT;
+	s->s_magic = magic;
+	s->s_op = ops ? ops : &simple_super_operations;
+	s->s_time_gran = 1;
+	root = new_inode(s);
+	if (!root)
+		goto Enomem;
+	/*
+	 * since this is the first inode, make it number 1. New inodes created
+	 * after this must take care not to collide with it (by passing
+	 * max_reserved of 1 to iunique).
+	 */
+	root->i_ino = 1;
+	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
+	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+	dentry = __d_alloc(s, &d_name);
+	if (!dentry) {
+		iput(root);
+		goto Enomem;
+	}
+	d_instantiate(dentry, root);
+	s->s_root = dentry;
+	s->s_d_op = dops;
+	s->s_flags |= MS_ACTIVE;
+	return dget(s->s_root);
+
+Enomem:
+	deactivate_locked_super(s);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(mount_pseudo);
+
+int simple_open(struct inode *inode, struct file *file)
+{
+	if (inode->i_private)
+		file->private_data = inode->i_private;
+	return 0;
+}
+EXPORT_SYMBOL(simple_open);
+
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inc_nlink(inode);
+	ihold(inode);
+	dget(dentry);
+	d_instantiate(dentry, inode);
+	return 0;
+}
+EXPORT_SYMBOL(simple_link);
+
+int simple_empty(struct dentry *dentry)
+{
+	struct dentry *child;
+	int ret = 0;
+
+	spin_lock(&dentry->d_lock);
+	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+		if (simple_positive(child)) {
+			spin_unlock(&child->d_lock);
+			goto out;
+		}
+		spin_unlock(&child->d_lock);
+	}
+	ret = 1;
+out:
+	spin_unlock(&dentry->d_lock);
+	return ret;
+}
+EXPORT_SYMBOL(simple_empty);
+
+int simple_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	drop_nlink(inode);
+	dput(dentry);
+	return 0;
+}
+EXPORT_SYMBOL(simple_unlink);
+
+int simple_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	if (!simple_empty(dentry))
+		return -ENOTEMPTY;
+
+	drop_nlink(d_inode(dentry));
+	simple_unlink(dir, dentry);
+	drop_nlink(dir);
+	return 0;
+}
+EXPORT_SYMBOL(simple_rmdir);
+
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+	int they_are_dirs = d_is_dir(old_dentry);
+
+	if (!simple_empty(new_dentry))
+		return -ENOTEMPTY;
+
+	if (d_really_is_positive(new_dentry)) {
+		simple_unlink(new_dir, new_dentry);
+		if (they_are_dirs) {
+			drop_nlink(d_inode(new_dentry));
+			drop_nlink(old_dir);
+		}
+	} else if (they_are_dirs) {
+		drop_nlink(old_dir);
+		inc_nlink(new_dir);
+	}
+
+	old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
+		new_dir->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+EXPORT_SYMBOL(simple_rename);
+
+/**
+ * simple_setattr - setattr for simple filesystem
+ * @dentry: dentry
+ * @iattr: iattr structure
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setattr is a simple ->setattr implementation without a proper
+ * implementation of size changes.
+ *
+ * It can either be used for in-memory filesystems or special files
+ * on simple regular filesystems.  Anything that needs to change on-disk
+ * or wire state on size changes needs its own setattr method.
+ */
+int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	if (iattr->ia_valid & ATTR_SIZE)
+		truncate_setsize(inode, iattr->ia_size);
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+EXPORT_SYMBOL(simple_setattr);
+
+int simple_readpage(struct file *file, struct page *page)
+{
+	clear_highpage(page);
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	unlock_page(page);
+	return 0;
+}
+EXPORT_SYMBOL(simple_readpage);
+
+int simple_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	struct page *page;
+	pgoff_t index;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	*pagep = page;
+
+	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+		zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(simple_write_begin);
+
+/**
+ * simple_write_end - .write_end helper for non-block-device FSes
+ * @available: See .write_end of address_space_operations
+ * @file: 		"
+ * @mapping: 		"
+ * @pos: 		"
+ * @len: 		"
+ * @copied: 		"
+ * @page: 		"
+ * @fsdata: 		"
+ *
+ * simple_write_end does the minimum needed for updating a page after writing is
+ * done. It has the same API signature as the .write_end of
+ * address_space_operations vector. So it can just be set onto .write_end for
+ * FSes that don't need any other processing. i_mutex is assumed to be held.
+ * Block based filesystems should use generic_write_end().
+ * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
+ * is not called, so a filesystem that actually does store data in .write_inode
+ * should extend on what's done here with a call to mark_inode_dirty() in the
+ * case that i_size has changed.
+ */
+int simple_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t last_pos = pos + copied;
+
+	/* zero the stale part of the page if we did a short copy */
+	if (copied < len) {
+		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+		zero_user(page, from + copied, len - copied);
+	}
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold the i_mutex.
+	 */
+	if (last_pos > inode->i_size)
+		i_size_write(inode, last_pos);
+
+	set_page_dirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+EXPORT_SYMBOL(simple_write_end);
+
+/*
+ * the inodes created here are not hashed. If you use iunique to generate
+ * unique inode values later for this filesystem, then you must take care
+ * to pass it an appropriate max_reserved value to avoid collisions.
+ */
+int simple_fill_super(struct super_block *s, unsigned long magic,
+		      struct tree_descr *files)
+{
+	struct inode *inode;
+	struct dentry *root;
+	struct dentry *dentry;
+	int i;
+
+	s->s_blocksize = PAGE_CACHE_SIZE;
+	s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	s->s_magic = magic;
+	s->s_op = &simple_super_operations;
+	s->s_time_gran = 1;
+
+	inode = new_inode(s);
+	if (!inode)
+		return -ENOMEM;
+	/*
+	 * because the root inode is 1, the files array must not contain an
+	 * entry at index 1
+	 */
+	inode->i_ino = 1;
+	inode->i_mode = S_IFDIR | 0755;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	set_nlink(inode, 2);
+	root = d_make_root(inode);
+	if (!root)
+		return -ENOMEM;
+	for (i = 0; !files->name || files->name[0]; i++, files++) {
+		if (!files->name)
+			continue;
+
+		/* warn if it tries to conflict with the root inode */
+		if (unlikely(i == 1))
+			printk(KERN_WARNING "%s: %s passed in a files array"
+				"with an index of 1!\n", __func__,
+				s->s_type->name);
+
+		dentry = d_alloc_name(root, files->name);
+		if (!dentry)
+			goto out;
+		inode = new_inode(s);
+		if (!inode) {
+			dput(dentry);
+			goto out;
+		}
+		inode->i_mode = S_IFREG | files->mode;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_fop = files->ops;
+		inode->i_ino = i;
+		d_add(dentry, inode);
+	}
+	s->s_root = root;
+	return 0;
+out:
+	d_genocide(root);
+	shrink_dcache_parent(root);
+	dput(root);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(simple_fill_super);
+
+static DEFINE_SPINLOCK(pin_fs_lock);
+
+int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
+{
+	struct vfsmount *mnt = NULL;
+	spin_lock(&pin_fs_lock);
+	if (unlikely(!*mount)) {
+		spin_unlock(&pin_fs_lock);
+		mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL);
+		if (IS_ERR(mnt))
+			return PTR_ERR(mnt);
+		spin_lock(&pin_fs_lock);
+		if (!*mount)
+			*mount = mnt;
+	}
+	mntget(*mount);
+	++*count;
+	spin_unlock(&pin_fs_lock);
+	mntput(mnt);
+	return 0;
+}
+EXPORT_SYMBOL(simple_pin_fs);
+
+void simple_release_fs(struct vfsmount **mount, int *count)
+{
+	struct vfsmount *mnt;
+	spin_lock(&pin_fs_lock);
+	mnt = *mount;
+	if (!--*count)
+		*mount = NULL;
+	spin_unlock(&pin_fs_lock);
+	mntput(mnt);
+}
+EXPORT_SYMBOL(simple_release_fs);
+
+/**
+ * simple_read_from_buffer - copy data from the buffer to user space
+ * @to: the user space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The simple_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the user space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
+ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
+				const void *from, size_t available)
+{
+	loff_t pos = *ppos;
+	size_t ret;
+
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= available || !count)
+		return 0;
+	if (count > available - pos)
+		count = available - pos;
+	ret = copy_to_user(to, from + pos, count);
+	if (ret == count)
+		return -EFAULT;
+	count -= ret;
+	*ppos = pos + count;
+	return count;
+}
+EXPORT_SYMBOL(simple_read_from_buffer);
+
+/**
+ * simple_write_to_buffer - copy data from user space to the buffer
+ * @to: the buffer to write to
+ * @available: the size of the buffer
+ * @ppos: the current position in the buffer
+ * @from: the user space buffer to read from
+ * @count: the maximum number of bytes to read
+ *
+ * The simple_write_to_buffer() function reads up to @count bytes from the user
+ * space address starting at @from into the buffer @to at offset @ppos.
+ *
+ * On success, the number of bytes written is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
+ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
+		const void __user *from, size_t count)
+{
+	loff_t pos = *ppos;
+	size_t res;
+
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= available || !count)
+		return 0;
+	if (count > available - pos)
+		count = available - pos;
+	res = copy_from_user(to + pos, from, count);
+	if (res == count)
+		return -EFAULT;
+	count -= res;
+	*ppos = pos + count;
+	return count;
+}
+EXPORT_SYMBOL(simple_write_to_buffer);
+
+/**
+ * memory_read_from_buffer - copy data from the buffer
+ * @to: the kernel space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The memory_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the kernel space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
+ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
+				const void *from, size_t available)
+{
+	loff_t pos = *ppos;
+
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= available)
+		return 0;
+	if (count > available - pos)
+		count = available - pos;
+	memcpy(to, from + pos, count);
+	*ppos = pos + count;
+
+	return count;
+}
+EXPORT_SYMBOL(memory_read_from_buffer);
+
+/*
+ * Transaction based IO.
+ * The file expects a single write which triggers the transaction, and then
+ * possibly a read which collects the result - which is stored in a
+ * file-local buffer.
+ */
+
+void simple_transaction_set(struct file *file, size_t n)
+{
+	struct simple_transaction_argresp *ar = file->private_data;
+
+	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
+
+	/*
+	 * The barrier ensures that ar->size will really remain zero until
+	 * ar->data is ready for reading.
+	 */
+	smp_mb();
+	ar->size = n;
+}
+EXPORT_SYMBOL(simple_transaction_set);
+
+char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
+{
+	struct simple_transaction_argresp *ar;
+	static DEFINE_SPINLOCK(simple_transaction_lock);
+
+	if (size > SIMPLE_TRANSACTION_LIMIT - 1)
+		return ERR_PTR(-EFBIG);
+
+	ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
+	if (!ar)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock(&simple_transaction_lock);
+
+	/* only one write allowed per open */
+	if (file->private_data) {
+		spin_unlock(&simple_transaction_lock);
+		free_page((unsigned long)ar);
+		return ERR_PTR(-EBUSY);
+	}
+
+	file->private_data = ar;
+
+	spin_unlock(&simple_transaction_lock);
+
+	if (copy_from_user(ar->data, buf, size))
+		return ERR_PTR(-EFAULT);
+
+	return ar->data;
+}
+EXPORT_SYMBOL(simple_transaction_get);
+
+ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
+{
+	struct simple_transaction_argresp *ar = file->private_data;
+
+	if (!ar)
+		return 0;
+	return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
+}
+EXPORT_SYMBOL(simple_transaction_read);
+
+int simple_transaction_release(struct inode *inode, struct file *file)
+{
+	free_page((unsigned long)file->private_data);
+	return 0;
+}
+EXPORT_SYMBOL(simple_transaction_release);
+
+/* Simple attribute files */
+
+struct simple_attr {
+	int (*get)(void *, u64 *);
+	int (*set)(void *, u64);
+	char get_buf[24];	/* enough to store a u64 and "\n\0" */
+	char set_buf[24];
+	void *data;
+	const char *fmt;	/* format for read operation */
+	struct mutex mutex;	/* protects access to these buffers */
+};
+
+/* simple_attr_open is called by an actual attribute open file operation
+ * to set the attribute specific access operations. */
+int simple_attr_open(struct inode *inode, struct file *file,
+		     int (*get)(void *, u64 *), int (*set)(void *, u64),
+		     const char *fmt)
+{
+	struct simple_attr *attr;
+
+	attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	attr->get = get;
+	attr->set = set;
+	attr->data = inode->i_private;
+	attr->fmt = fmt;
+	mutex_init(&attr->mutex);
+
+	file->private_data = attr;
+
+	return nonseekable_open(inode, file);
+}
+EXPORT_SYMBOL_GPL(simple_attr_open);
+
+int simple_attr_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(simple_attr_release);	/* GPL-only?  This?  Really? */
+
+/* read from the buffer that is filled with the get function */
+ssize_t simple_attr_read(struct file *file, char __user *buf,
+			 size_t len, loff_t *ppos)
+{
+	struct simple_attr *attr;
+	size_t size;
+	ssize_t ret;
+
+	attr = file->private_data;
+
+	if (!attr->get)
+		return -EACCES;
+
+	ret = mutex_lock_interruptible(&attr->mutex);
+	if (ret)
+		return ret;
+
+	if (*ppos) {		/* continued read */
+		size = strlen(attr->get_buf);
+	} else {		/* first read */
+		u64 val;
+		ret = attr->get(attr->data, &val);
+		if (ret)
+			goto out;
+
+		size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
+				 attr->fmt, (unsigned long long)val);
+	}
+
+	ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
+out:
+	mutex_unlock(&attr->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(simple_attr_read);
+
+/* interpret the buffer as a number to call the set function with */
+ssize_t simple_attr_write(struct file *file, const char __user *buf,
+			  size_t len, loff_t *ppos)
+{
+	struct simple_attr *attr;
+	u64 val;
+	size_t size;
+	ssize_t ret;
+
+	attr = file->private_data;
+	if (!attr->set)
+		return -EACCES;
+
+	ret = mutex_lock_interruptible(&attr->mutex);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	size = min(sizeof(attr->set_buf) - 1, len);
+	if (copy_from_user(attr->set_buf, buf, size))
+		goto out;
+
+	attr->set_buf[size] = '\0';
+	val = simple_strtoll(attr->set_buf, NULL, 0);
+	ret = attr->set(attr->data, val);
+	if (ret == 0)
+		ret = len; /* on success, claim we got the whole input */
+out:
+	mutex_unlock(&attr->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(simple_attr_write);
+
+/**
+ * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
+ * @sb:		filesystem to do the file handle conversion on
+ * @fid:	file handle to convert
+ * @fh_len:	length of the file handle in bytes
+ * @fh_type:	type of file handle
+ * @get_inode:	filesystem callback to retrieve inode
+ *
+ * This function decodes @fid as long as it has one of the well-known
+ * Linux filehandle types and calls @get_inode on it to retrieve the
+ * inode for the object specified in the file handle.
+ */
+struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type, struct inode *(*get_inode)
+			(struct super_block *sb, u64 ino, u32 gen))
+{
+	struct inode *inode = NULL;
+
+	if (fh_len < 2)
+		return NULL;
+
+	switch (fh_type) {
+	case FILEID_INO32_GEN:
+	case FILEID_INO32_GEN_PARENT:
+		inode = get_inode(sb, fid->i32.ino, fid->i32.gen);
+		break;
+	}
+
+	return d_obtain_alias(inode);
+}
+EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
+
+/**
+ * generic_fh_to_parent - generic helper for the fh_to_parent export operation
+ * @sb:		filesystem to do the file handle conversion on
+ * @fid:	file handle to convert
+ * @fh_len:	length of the file handle in bytes
+ * @fh_type:	type of file handle
+ * @get_inode:	filesystem callback to retrieve inode
+ *
+ * This function decodes @fid as long as it has one of the well-known
+ * Linux filehandle types and calls @get_inode on it to retrieve the
+ * inode for the _parent_ object specified in the file handle if it
+ * is specified in the file handle, or NULL otherwise.
+ */
+struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type, struct inode *(*get_inode)
+			(struct super_block *sb, u64 ino, u32 gen))
+{
+	struct inode *inode = NULL;
+
+	if (fh_len <= 2)
+		return NULL;
+
+	switch (fh_type) {
+	case FILEID_INO32_GEN_PARENT:
+		inode = get_inode(sb, fid->i32.parent_ino,
+				  (fh_len > 3 ? fid->i32.parent_gen : 0));
+		break;
+	}
+
+	return d_obtain_alias(inode);
+}
+EXPORT_SYMBOL_GPL(generic_fh_to_parent);
+
+/**
+ * __generic_file_fsync - generic fsync implementation for simple filesystems
+ *
+ * @file:	file to synchronize
+ * @start:	start offset in bytes
+ * @end:	end offset in bytes (inclusive)
+ * @datasync:	only synchronize essential metadata if true
+ *
+ * This is a generic implementation of the fsync method for simple
+ * filesystems which track all non-inode metadata in the buffers list
+ * hanging off the address_space structure.
+ */
+int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
+				 int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	int err;
+	int ret;
+
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY_ALL))
+		goto out;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		goto out;
+
+	err = sync_inode_metadata(inode, 1);
+	if (ret == 0)
+		ret = err;
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(__generic_file_fsync);
+
+/**
+ * generic_file_fsync - generic fsync implementation for simple filesystems
+ *			with flush
+ * @file:	file to synchronize
+ * @start:	start offset in bytes
+ * @end:	end offset in bytes (inclusive)
+ * @datasync:	only synchronize essential metadata if true
+ *
+ */
+
+int generic_file_fsync(struct file *file, loff_t start, loff_t end,
+		       int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	int err;
+
+	err = __generic_file_fsync(file, start, end, datasync);
+	if (err)
+		return err;
+	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+}
+EXPORT_SYMBOL(generic_file_fsync);
+
+/**
+ * generic_check_addressable - Check addressability of file system
+ * @blocksize_bits:	log of file system block size
+ * @num_blocks:		number of blocks in file system
+ *
+ * Determine whether a file system with @num_blocks blocks (and a
+ * block size of 2**@blocksize_bits) is addressable by the sector_t
+ * and page cache of the system.  Return 0 if so and -EFBIG otherwise.
+ */
+int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
+{
+	u64 last_fs_block = num_blocks - 1;
+	u64 last_fs_page =
+		last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
+
+	if (unlikely(num_blocks == 0))
+		return 0;
+
+	if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+		return -EINVAL;
+
+	if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
+	    (last_fs_page > (pgoff_t)(~0ULL))) {
+		return -EFBIG;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_check_addressable);
+
+/*
+ * No-op implementation of ->fsync for in-memory filesystems.
+ */
+int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	return 0;
+}
+EXPORT_SYMBOL(noop_fsync);
+
+void kfree_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
+}
+EXPORT_SYMBOL(kfree_put_link);
+
+/*
+ * nop .set_page_dirty method so that people can use .page_mkwrite on
+ * anon inodes.
+ */
+static int anon_set_page_dirty(struct page *page)
+{
+	return 0;
+};
+
+/*
+ * A single inode exists for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no associated per-instance data, so we need
+ * only allocate one of them.
+ */
+struct inode *alloc_anon_inode(struct super_block *s)
+{
+	static const struct address_space_operations anon_aops = {
+		.set_page_dirty = anon_set_page_dirty,
+	};
+	struct inode *inode = new_inode_pseudo(s);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_ino = get_next_ino();
+	inode->i_mapping->a_ops = &anon_aops;
+
+	/*
+	 * Mark the inode dirty from the very beginning,
+	 * that way it will never be moved to the dirty
+	 * list because mark_inode_dirty() will think
+	 * that it already _is_ on the dirty list.
+	 */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_flags |= S_PRIVATE;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	return inode;
+}
+EXPORT_SYMBOL(alloc_anon_inode);
+
+/**
+ * simple_nosetlease - generic helper for prohibiting leases
+ * @filp: file pointer
+ * @arg: type of lease to obtain
+ * @flp: new lease supplied for insertion
+ * @priv: private data for lm_setup operation
+ *
+ * Generic helper for filesystems that do not wish to allow leases to be set.
+ * All arguments are ignored and it just returns -EINVAL.
+ */
+int
+simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
+		  void **priv)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL(simple_nosetlease);
+
+
+/*
+ * Operations for a permanently empty directory.
+ */
+static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	return ERR_PTR(-ENOENT);
+}
+
+static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	return -EPERM;
+}
+
+static int empty_dir_setxattr(struct dentry *dentry, const char *name,
+			      const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name,
+				  void *value, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static int empty_dir_removexattr(struct dentry *dentry, const char *name)
+{
+	return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct inode_operations empty_dir_inode_operations = {
+	.lookup		= empty_dir_lookup,
+	.permission	= generic_permission,
+	.setattr	= empty_dir_setattr,
+	.getattr	= empty_dir_getattr,
+	.setxattr	= empty_dir_setxattr,
+	.getxattr	= empty_dir_getxattr,
+	.removexattr	= empty_dir_removexattr,
+	.listxattr	= empty_dir_listxattr,
+};
+
+static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	/* An empty directory has two entries . and .. at offsets 0 and 1 */
+	return generic_file_llseek_size(file, offset, whence, 2, 2);
+}
+
+static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+	dir_emit_dots(file, ctx);
+	return 0;
+}
+
+static const struct file_operations empty_dir_operations = {
+	.llseek		= empty_dir_llseek,
+	.read		= generic_read_dir,
+	.iterate	= empty_dir_readdir,
+	.fsync		= noop_fsync,
+};
+
+
+void make_empty_dir_inode(struct inode *inode)
+{
+	set_nlink(inode, 2);
+	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	inode->i_uid = GLOBAL_ROOT_UID;
+	inode->i_gid = GLOBAL_ROOT_GID;
+	inode->i_rdev = 0;
+	inode->i_size = 2;
+	inode->i_blkbits = PAGE_SHIFT;
+	inode->i_blocks = 0;
+
+	inode->i_op = &empty_dir_inode_operations;
+	inode->i_fop = &empty_dir_operations;
+}
+
+bool is_empty_dir_inode(struct inode *inode)
+{
+	return (inode->i_fop == &empty_dir_operations) &&
+		(inode->i_op == &empty_dir_inode_operations);
+}
diff --git a/kernel/fs/lockd/Makefile b/kernel/fs/lockd/Makefile
new file mode 100644
index 000000000..9b320cc2a
--- /dev/null
+++ b/kernel/fs/lockd/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the linux lock manager stuff
+#
+
+obj-$(CONFIG_LOCKD) += lockd.o
+
+lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
+	        svcshare.o svcproc.o svcsubs.o mon.o xdr.o
+lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
+lockd-objs-$(CONFIG_PROC_FS) += procfs.o
+lockd-objs		      := $(lockd-objs-y)
diff --git a/kernel/fs/lockd/clnt4xdr.c b/kernel/fs/lockd/clnt4xdr.c
new file mode 100644
index 000000000..d3e40db28
--- /dev/null
+++ b/kernel/fs/lockd/clnt4xdr.c
@@ -0,0 +1,599 @@
+/*
+ * linux/fs/lockd/clnt4xdr.c
+ *
+ * XDR functions to encode/decode NLM version 4 RPC arguments and results.
+ *
+ * NLM client-side only.
+ *
+ * Copyright (C) 2010, Oracle.  All rights reserved.
+ */
+
+#include <linux/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/lockd/lockd.h>
+
+#include <uapi/linux/nfs3.h>
+
+#define NLMDBG_FACILITY		NLMDBG_XDR
+
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+
+#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
+#  error "NLM host name cannot be larger than NLM's maximum string length!"
+#endif
+
+/*
+ * Declare the space requirements for NLM arguments and replies as
+ * number of 32bit-words
+ */
+#define NLM4_void_sz		(0)
+#define NLM4_cookie_sz		(1+(NLM_MAXCOOKIELEN>>2))
+#define NLM4_caller_sz		(1+(NLMCLNT_OHSIZE>>2))
+#define NLM4_owner_sz		(1+(NLMCLNT_OHSIZE>>2))
+#define NLM4_fhandle_sz		(1+(NFS3_FHSIZE>>2))
+#define NLM4_lock_sz		(5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
+#define NLM4_holder_sz		(6+NLM4_owner_sz)
+
+#define NLM4_testargs_sz	(NLM4_cookie_sz+1+NLM4_lock_sz)
+#define NLM4_lockargs_sz	(NLM4_cookie_sz+4+NLM4_lock_sz)
+#define NLM4_cancargs_sz	(NLM4_cookie_sz+2+NLM4_lock_sz)
+#define NLM4_unlockargs_sz	(NLM4_cookie_sz+NLM4_lock_sz)
+
+#define NLM4_testres_sz		(NLM4_cookie_sz+1+NLM4_holder_sz)
+#define NLM4_res_sz		(NLM4_cookie_sz+1)
+#define NLM4_norep_sz		(0)
+
+
+static s64 loff_t_to_s64(loff_t offset)
+{
+	s64 res;
+
+	if (offset >= NLM4_OFFSET_MAX)
+		res = NLM4_OFFSET_MAX;
+	else if (offset <= -NLM4_OFFSET_MAX)
+		res = -NLM4_OFFSET_MAX;
+	else
+		res = offset;
+	return res;
+}
+
+static void nlm4_compute_offsets(const struct nlm_lock *lock,
+				 u64 *l_offset, u64 *l_len)
+{
+	const struct file_lock *fl = &lock->fl;
+
+	*l_offset = loff_t_to_s64(fl->fl_start);
+	if (fl->fl_end == OFFSET_MAX)
+		*l_len = 0;
+	else
+		*l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
+}
+
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("lockd: %s prematurely hit the end of our receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+
+/*
+ * Encode/decode NLMv4 basic data types
+ *
+ * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
+ * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
+ * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+static void encode_bool(struct xdr_stream *xdr, const int value)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = value ? xdr_one : xdr_zero;
+}
+
+static void encode_int32(struct xdr_stream *xdr, const s32 value)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(value);
+}
+
+/*
+ *	typedef opaque netobj<MAXNETOBJ_SZ>
+ */
+static void encode_netobj(struct xdr_stream *xdr,
+			  const u8 *data, const unsigned int length)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, data, length);
+}
+
+static int decode_netobj(struct xdr_stream *xdr,
+			 struct xdr_netobj *obj)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p++);
+	if (unlikely(length > XDR_MAX_NETOBJ))
+		goto out_size;
+	obj->len = length;
+	obj->data = (u8 *)p;
+	return 0;
+out_size:
+	dprintk("NFS: returned netobj was too long: %u\n", length);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	netobj cookie;
+ */
+static void encode_cookie(struct xdr_stream *xdr,
+			  const struct nlm_cookie *cookie)
+{
+	encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
+}
+
+static int decode_cookie(struct xdr_stream *xdr,
+			     struct nlm_cookie *cookie)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p++);
+	/* apparently HPUX can return empty cookies */
+	if (length == 0)
+		goto out_hpux;
+	if (length > NLM_MAXCOOKIELEN)
+		goto out_size;
+	p = xdr_inline_decode(xdr, length);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	cookie->len = length;
+	memcpy(cookie->data, p, length);
+	return 0;
+out_hpux:
+	cookie->len = 4;
+	memset(cookie->data, 0, 4);
+	return 0;
+out_size:
+	dprintk("NFS: returned cookie was too long: %u\n", length);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	netobj fh;
+ */
+static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	encode_netobj(xdr, (u8 *)&fh->data, fh->size);
+}
+
+/*
+ *	enum nlm4_stats {
+ *		NLM4_GRANTED = 0,
+ *		NLM4_DENIED = 1,
+ *		NLM4_DENIED_NOLOCKS = 2,
+ *		NLM4_BLOCKED = 3,
+ *		NLM4_DENIED_GRACE_PERIOD = 4,
+ *		NLM4_DEADLCK = 5,
+ *		NLM4_ROFS = 6,
+ *		NLM4_STALE_FH = 7,
+ *		NLM4_FBIG = 8,
+ *		NLM4_FAILED = 9
+ *	};
+ *
+ *	struct nlm4_stat {
+ *		nlm4_stats stat;
+ *	};
+ *
+ * NB: we don't swap bytes for the NLM status values.  The upper
+ * layers deal directly with the status value in network byte
+ * order.
+ */
+static void encode_nlm4_stat(struct xdr_stream *xdr,
+			     const __be32 stat)
+{
+	__be32 *p;
+
+	BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
+	p = xdr_reserve_space(xdr, 4);
+	*p = stat;
+}
+
+static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (unlikely(ntohl(*p) > ntohl(nlm4_failed)))
+		goto out_bad_xdr;
+	*stat = *p;
+	return 0;
+out_bad_xdr:
+	dprintk("%s: server returned invalid nlm4_stats value: %u\n",
+			__func__, be32_to_cpup(p));
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	struct nlm4_holder {
+ *		bool	exclusive;
+ *		int32	svid;
+ *		netobj	oh;
+ *		uint64	l_offset;
+ *		uint64	l_len;
+ *	};
+ */
+static void encode_nlm4_holder(struct xdr_stream *xdr,
+			       const struct nlm_res *result)
+{
+	const struct nlm_lock *lock = &result->lock;
+	u64 l_offset, l_len;
+	__be32 *p;
+
+	encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+	encode_int32(xdr, lock->svid);
+	encode_netobj(xdr, lock->oh.data, lock->oh.len);
+
+	p = xdr_reserve_space(xdr, 4 + 4);
+	nlm4_compute_offsets(lock, &l_offset, &l_len);
+	p = xdr_encode_hyper(p, l_offset);
+	xdr_encode_hyper(p, l_len);
+}
+
+static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
+{
+	struct nlm_lock *lock = &result->lock;
+	struct file_lock *fl = &lock->fl;
+	u64 l_offset, l_len;
+	u32 exclusive;
+	int error;
+	__be32 *p;
+	s32 end;
+
+	memset(lock, 0, sizeof(*lock));
+	locks_init_lock(fl);
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	exclusive = be32_to_cpup(p++);
+	lock->svid = be32_to_cpup(p);
+	fl->fl_pid = (pid_t)lock->svid;
+
+	error = decode_netobj(xdr, &lock->oh);
+	if (unlikely(error))
+		goto out;
+
+	p = xdr_inline_decode(xdr, 8 + 8);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
+	fl->fl_flags = FL_POSIX;
+	fl->fl_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
+	p = xdr_decode_hyper(p, &l_offset);
+	xdr_decode_hyper(p, &l_len);
+	end = l_offset + l_len - 1;
+
+	fl->fl_start = (loff_t)l_offset;
+	if (l_len == 0 || end < 0)
+		fl->fl_end = OFFSET_MAX;
+	else
+		fl->fl_end = (loff_t)end;
+	error = 0;
+out:
+	return error;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	string caller_name<LM_MAXSTRLEN>;
+ */
+static void encode_caller_name(struct xdr_stream *xdr, const char *name)
+{
+	/* NB: client-side does not set lock->len */
+	u32 length = strlen(name);
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, name, length);
+}
+
+/*
+ *	struct nlm4_lock {
+ *		string	caller_name<LM_MAXSTRLEN>;
+ *		netobj	fh;
+ *		netobj	oh;
+ *		int32	svid;
+ *		uint64	l_offset;
+ *		uint64	l_len;
+ *	};
+ */
+static void encode_nlm4_lock(struct xdr_stream *xdr,
+			     const struct nlm_lock *lock)
+{
+	u64 l_offset, l_len;
+	__be32 *p;
+
+	encode_caller_name(xdr, lock->caller);
+	encode_fh(xdr, &lock->fh);
+	encode_netobj(xdr, lock->oh.data, lock->oh.len);
+
+	p = xdr_reserve_space(xdr, 4 + 8 + 8);
+	*p++ = cpu_to_be32(lock->svid);
+
+	nlm4_compute_offsets(lock, &l_offset, &l_len);
+	p = xdr_encode_hyper(p, l_offset);
+	xdr_encode_hyper(p, l_len);
+}
+
+
+/*
+ * NLMv4 XDR encode functions
+ *
+ * NLMv4 argument types are defined in Appendix II of RFC 1813:
+ * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+
+/*
+ *	struct nlm4_testargs {
+ *		netobj cookie;
+ *		bool exclusive;
+ *		struct nlm4_lock alock;
+ *	};
+ */
+static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nlm_args *args)
+{
+	const struct nlm_lock *lock = &args->lock;
+
+	encode_cookie(xdr, &args->cookie);
+	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_nlm4_lock(xdr, lock);
+}
+
+/*
+ *	struct nlm4_lockargs {
+ *		netobj cookie;
+ *		bool block;
+ *		bool exclusive;
+ *		struct nlm4_lock alock;
+ *		bool reclaim;
+ *		int state;
+ *	};
+ */
+static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nlm_args *args)
+{
+	const struct nlm_lock *lock = &args->lock;
+
+	encode_cookie(xdr, &args->cookie);
+	encode_bool(xdr, args->block);
+	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_nlm4_lock(xdr, lock);
+	encode_bool(xdr, args->reclaim);
+	encode_int32(xdr, args->state);
+}
+
+/*
+ *	struct nlm4_cancargs {
+ *		netobj cookie;
+ *		bool block;
+ *		bool exclusive;
+ *		struct nlm4_lock alock;
+ *	};
+ */
+static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nlm_args *args)
+{
+	const struct nlm_lock *lock = &args->lock;
+
+	encode_cookie(xdr, &args->cookie);
+	encode_bool(xdr, args->block);
+	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_nlm4_lock(xdr, lock);
+}
+
+/*
+ *	struct nlm4_unlockargs {
+ *		netobj cookie;
+ *		struct nlm4_lock alock;
+ *	};
+ */
+static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nlm_args *args)
+{
+	const struct nlm_lock *lock = &args->lock;
+
+	encode_cookie(xdr, &args->cookie);
+	encode_nlm4_lock(xdr, lock);
+}
+
+/*
+ *	struct nlm4_res {
+ *		netobj cookie;
+ *		nlm4_stat stat;
+ *	};
+ */
+static void nlm4_xdr_enc_res(struct rpc_rqst *req,
+			     struct xdr_stream *xdr,
+			     const struct nlm_res *result)
+{
+	encode_cookie(xdr, &result->cookie);
+	encode_nlm4_stat(xdr, result->status);
+}
+
+/*
+ *	union nlm4_testrply switch (nlm4_stats stat) {
+ *	case NLM4_DENIED:
+ *		struct nlm4_holder holder;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct nlm4_testres {
+ *		netobj cookie;
+ *		nlm4_testrply test_stat;
+ *	};
+ */
+static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 const struct nlm_res *result)
+{
+	encode_cookie(xdr, &result->cookie);
+	encode_nlm4_stat(xdr, result->status);
+	if (result->status == nlm_lck_denied)
+		encode_nlm4_holder(xdr, result);
+}
+
+
+/*
+ * NLMv4 XDR decode functions
+ *
+ * NLMv4 argument types are defined in Appendix II of RFC 1813:
+ * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+
+/*
+ *	union nlm4_testrply switch (nlm4_stats stat) {
+ *	case NLM4_DENIED:
+ *		struct nlm4_holder holder;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct nlm4_testres {
+ *		netobj cookie;
+ *		nlm4_testrply test_stat;
+ *	};
+ */
+static int decode_nlm4_testrply(struct xdr_stream *xdr,
+				struct nlm_res *result)
+{
+	int error;
+
+	error = decode_nlm4_stat(xdr, &result->status);
+	if (unlikely(error))
+		goto out;
+	if (result->status == nlm_lck_denied)
+		error = decode_nlm4_holder(xdr, result);
+out:
+	return error;
+}
+
+static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct nlm_res *result)
+{
+	int error;
+
+	error = decode_cookie(xdr, &result->cookie);
+	if (unlikely(error))
+		goto out;
+	error = decode_nlm4_testrply(xdr, result);
+out:
+	return error;
+}
+
+/*
+ *	struct nlm4_res {
+ *		netobj cookie;
+ *		nlm4_stat stat;
+ *	};
+ */
+static int nlm4_xdr_dec_res(struct rpc_rqst *req,
+			    struct xdr_stream *xdr,
+			    struct nlm_res *result)
+{
+	int error;
+
+	error = decode_cookie(xdr, &result->cookie);
+	if (unlikely(error))
+		goto out;
+	error = decode_nlm4_stat(xdr, &result->status);
+out:
+	return error;
+}
+
+
+/*
+ * For NLM, a void procedure really returns nothing
+ */
+#define nlm4_xdr_dec_norep	NULL
+
+#define PROC(proc, argtype, restype)					\
+[NLMPROC_##proc] = {							\
+	.p_proc      = NLMPROC_##proc,					\
+	.p_encode    = (kxdreproc_t)nlm4_xdr_enc_##argtype,		\
+	.p_decode    = (kxdrdproc_t)nlm4_xdr_dec_##restype,		\
+	.p_arglen    = NLM4_##argtype##_sz,				\
+	.p_replen    = NLM4_##restype##_sz,				\
+	.p_statidx   = NLMPROC_##proc,					\
+	.p_name      = #proc,						\
+	}
+
+static struct rpc_procinfo	nlm4_procedures[] = {
+	PROC(TEST,		testargs,	testres),
+	PROC(LOCK,		lockargs,	res),
+	PROC(CANCEL,		cancargs,	res),
+	PROC(UNLOCK,		unlockargs,	res),
+	PROC(GRANTED,		testargs,	res),
+	PROC(TEST_MSG,		testargs,	norep),
+	PROC(LOCK_MSG,		lockargs,	norep),
+	PROC(CANCEL_MSG,	cancargs,	norep),
+	PROC(UNLOCK_MSG,	unlockargs,	norep),
+	PROC(GRANTED_MSG,	testargs,	norep),
+	PROC(TEST_RES,		testres,	norep),
+	PROC(LOCK_RES,		res,		norep),
+	PROC(CANCEL_RES,	res,		norep),
+	PROC(UNLOCK_RES,	res,		norep),
+	PROC(GRANTED_RES,	res,		norep),
+};
+
+const struct rpc_version nlm_version4 = {
+	.number		= 4,
+	.nrprocs	= ARRAY_SIZE(nlm4_procedures),
+	.procs		= nlm4_procedures,
+};
diff --git a/kernel/fs/lockd/clntlock.c b/kernel/fs/lockd/clntlock.c
new file mode 100644
index 000000000..41e491b8e
--- /dev/null
+++ b/kernel/fs/lockd/clntlock.c
@@ -0,0 +1,301 @@
+/*
+ * linux/fs/lockd/clntlock.c
+ *
+ * Lock handling for the client side NLM implementation
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/nfs_fs.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/lockd/lockd.h>
+#include <linux/kthread.h>
+
+#define NLMDBG_FACILITY		NLMDBG_CLIENT
+
+/*
+ * Local function prototypes
+ */
+static int			reclaimer(void *ptr);
+
+/*
+ * The following functions handle blocking and granting from the
+ * client perspective.
+ */
+
+/*
+ * This is the representation of a blocked client lock.
+ */
+struct nlm_wait {
+	struct list_head	b_list;		/* linked list */
+	wait_queue_head_t	b_wait;		/* where to wait on */
+	struct nlm_host *	b_host;
+	struct file_lock *	b_lock;		/* local file lock */
+	unsigned short		b_reclaim;	/* got to reclaim lock */
+	__be32			b_status;	/* grant callback status */
+};
+
+static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
+
+/**
+ * nlmclnt_init - Set up per-NFS mount point lockd data structures
+ * @nlm_init: pointer to arguments structure
+ *
+ * Returns pointer to an appropriate nlm_host struct,
+ * or an ERR_PTR value.
+ */
+struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
+{
+	struct nlm_host *host;
+	u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
+	int status;
+
+	status = lockd_up(nlm_init->net);
+	if (status < 0)
+		return ERR_PTR(status);
+
+	host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
+				   nlm_init->protocol, nlm_version,
+				   nlm_init->hostname, nlm_init->noresvport,
+				   nlm_init->net);
+	if (host == NULL)
+		goto out_nohost;
+	if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL)
+		goto out_nobind;
+
+	return host;
+out_nobind:
+	nlmclnt_release_host(host);
+out_nohost:
+	lockd_down(nlm_init->net);
+	return ERR_PTR(-ENOLCK);
+}
+EXPORT_SYMBOL_GPL(nlmclnt_init);
+
+/**
+ * nlmclnt_done - Release resources allocated by nlmclnt_init()
+ * @host: nlm_host structure reserved by nlmclnt_init()
+ *
+ */
+void nlmclnt_done(struct nlm_host *host)
+{
+	struct net *net = host->net;
+
+	nlmclnt_release_host(host);
+	lockd_down(net);
+}
+EXPORT_SYMBOL_GPL(nlmclnt_done);
+
+/*
+ * Queue up a lock for blocking so that the GRANTED request can see it
+ */
+struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl)
+{
+	struct nlm_wait *block;
+
+	block = kmalloc(sizeof(*block), GFP_KERNEL);
+	if (block != NULL) {
+		block->b_host = host;
+		block->b_lock = fl;
+		init_waitqueue_head(&block->b_wait);
+		block->b_status = nlm_lck_blocked;
+
+		spin_lock(&nlm_blocked_lock);
+		list_add(&block->b_list, &nlm_blocked);
+		spin_unlock(&nlm_blocked_lock);
+	}
+	return block;
+}
+
+void nlmclnt_finish_block(struct nlm_wait *block)
+{
+	if (block == NULL)
+		return;
+	spin_lock(&nlm_blocked_lock);
+	list_del(&block->b_list);
+	spin_unlock(&nlm_blocked_lock);
+	kfree(block);
+}
+
+/*
+ * Block on a lock
+ */
+int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
+{
+	long ret;
+
+	/* A borken server might ask us to block even if we didn't
+	 * request it. Just say no!
+	 */
+	if (block == NULL)
+		return -EAGAIN;
+
+	/* Go to sleep waiting for GRANT callback. Some servers seem
+	 * to lose callbacks, however, so we're going to poll from
+	 * time to time just to make sure.
+	 *
+	 * For now, the retry frequency is pretty high; normally 
+	 * a 1 minute timeout would do. See the comment before
+	 * nlmclnt_lock for an explanation.
+	 */
+	ret = wait_event_interruptible_timeout(block->b_wait,
+			block->b_status != nlm_lck_blocked,
+			timeout);
+	if (ret < 0)
+		return -ERESTARTSYS;
+	/* Reset the lock status after a server reboot so we resend */
+	if (block->b_status == nlm_lck_denied_grace_period)
+		block->b_status = nlm_lck_blocked;
+	req->a_res.status = block->b_status;
+	return 0;
+}
+
+/*
+ * The server lockd has called us back to tell us the lock was granted
+ */
+__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
+{
+	const struct file_lock *fl = &lock->fl;
+	const struct nfs_fh *fh = &lock->fh;
+	struct nlm_wait	*block;
+	__be32 res = nlm_lck_denied;
+
+	/*
+	 * Look up blocked request based on arguments. 
+	 * Warning: must not use cookie to match it!
+	 */
+	spin_lock(&nlm_blocked_lock);
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		struct file_lock *fl_blocked = block->b_lock;
+
+		if (fl_blocked->fl_start != fl->fl_start)
+			continue;
+		if (fl_blocked->fl_end != fl->fl_end)
+			continue;
+		/*
+		 * Careful! The NLM server will return the 32-bit "pid" that
+		 * we put on the wire: in this case the lockowner "pid".
+		 */
+		if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
+			continue;
+		if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
+			continue;
+		if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)) ,fh) != 0)
+			continue;
+		/* Alright, we found a lock. Set the return status
+		 * and wake up the caller
+		 */
+		block->b_status = nlm_granted;
+		wake_up(&block->b_wait);
+		res = nlm_granted;
+	}
+	spin_unlock(&nlm_blocked_lock);
+	return res;
+}
+
+/*
+ * The following procedures deal with the recovery of locks after a
+ * server crash.
+ */
+
+/*
+ * Reclaim all locks on server host. We do this by spawning a separate
+ * reclaimer thread.
+ */
+void
+nlmclnt_recovery(struct nlm_host *host)
+{
+	struct task_struct *task;
+
+	if (!host->h_reclaiming++) {
+		nlm_get_host(host);
+		task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name);
+		if (IS_ERR(task))
+			printk(KERN_ERR "lockd: unable to spawn reclaimer "
+				"thread. Locks for %s won't be reclaimed! "
+				"(%ld)\n", host->h_name, PTR_ERR(task));
+	}
+}
+
+static int
+reclaimer(void *ptr)
+{
+	struct nlm_host	  *host = (struct nlm_host *) ptr;
+	struct nlm_wait	  *block;
+	struct nlm_rqst   *req;
+	struct file_lock *fl, *next;
+	u32 nsmstate;
+	struct net *net = host->net;
+
+	req = kmalloc(sizeof(*req), GFP_KERNEL);
+	if (!req) {
+		printk(KERN_ERR "lockd: reclaimer unable to alloc memory."
+				" Locks for %s won't be reclaimed!\n",
+				host->h_name);
+		return 0;
+	}
+
+	allow_signal(SIGKILL);
+
+	down_write(&host->h_rwsem);
+	lockd_up(net);	/* note: this cannot fail as lockd is already running */
+
+	dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
+
+restart:
+	nsmstate = host->h_nsmstate;
+
+	/* Force a portmap getport - the peer's lockd will
+	 * most likely end up on a different port.
+	 */
+	host->h_nextrebind = jiffies;
+	nlm_rebind_host(host);
+
+	/* First, reclaim all locks that have been granted. */
+	list_splice_init(&host->h_granted, &host->h_reclaim);
+	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
+		list_del_init(&fl->fl_u.nfs_fl.list);
+
+		/*
+		 * sending this thread a SIGKILL will result in any unreclaimed
+		 * locks being removed from the h_granted list. This means that
+		 * the kernel will not attempt to reclaim them again if a new
+		 * reclaimer thread is spawned for this host.
+		 */
+		if (signalled())
+			continue;
+		if (nlmclnt_reclaim(host, fl, req) != 0)
+			continue;
+		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
+		if (host->h_nsmstate != nsmstate) {
+			/* Argh! The server rebooted again! */
+			goto restart;
+		}
+	}
+
+	host->h_reclaiming = 0;
+	up_write(&host->h_rwsem);
+	dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
+
+	/* Now, wake up all processes that sleep on a blocked lock */
+	spin_lock(&nlm_blocked_lock);
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		if (block->b_host == host) {
+			block->b_status = nlm_lck_denied_grace_period;
+			wake_up(&block->b_wait);
+		}
+	}
+	spin_unlock(&nlm_blocked_lock);
+
+	/* Release host handle after use */
+	nlmclnt_release_host(host);
+	lockd_down(net);
+	kfree(req);
+	return 0;
+}
diff --git a/kernel/fs/lockd/clntproc.c b/kernel/fs/lockd/clntproc.c
new file mode 100644
index 000000000..acd394716
--- /dev/null
+++ b/kernel/fs/lockd/clntproc.c
@@ -0,0 +1,849 @@
+/*
+ * linux/fs/lockd/clntproc.c
+ *
+ * RPC procedures for the client side NLM implementation
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+#include <linux/utsname.h>
+#include <linux/freezer.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/lockd/lockd.h>
+
+#define NLMDBG_FACILITY		NLMDBG_CLIENT
+#define NLMCLNT_GRACE_WAIT	(5*HZ)
+#define NLMCLNT_POLL_TIMEOUT	(30*HZ)
+#define NLMCLNT_MAX_RETRIES	3
+
+static int	nlmclnt_test(struct nlm_rqst *, struct file_lock *);
+static int	nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
+static int	nlmclnt_unlock(struct nlm_rqst *, struct file_lock *);
+static int	nlm_stat_to_errno(__be32 stat);
+static void	nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host);
+static int	nlmclnt_cancel(struct nlm_host *, int , struct file_lock *);
+
+static const struct rpc_call_ops nlmclnt_unlock_ops;
+static const struct rpc_call_ops nlmclnt_cancel_ops;
+
+/*
+ * Cookie counter for NLM requests
+ */
+static atomic_t	nlm_cookie = ATOMIC_INIT(0x1234);
+
+void nlmclnt_next_cookie(struct nlm_cookie *c)
+{
+	u32	cookie = atomic_inc_return(&nlm_cookie);
+
+	memcpy(c->data, &cookie, 4);
+	c->len=4;
+}
+
+static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner)
+{
+	atomic_inc(&lockowner->count);
+	return lockowner;
+}
+
+static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
+{
+	if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
+		return;
+	list_del(&lockowner->list);
+	spin_unlock(&lockowner->host->h_lock);
+	nlmclnt_release_host(lockowner->host);
+	kfree(lockowner);
+}
+
+static inline int nlm_pidbusy(struct nlm_host *host, uint32_t pid)
+{
+	struct nlm_lockowner *lockowner;
+	list_for_each_entry(lockowner, &host->h_lockowners, list) {
+		if (lockowner->pid == pid)
+			return -EBUSY;
+	}
+	return 0;
+}
+
+static inline uint32_t __nlm_alloc_pid(struct nlm_host *host)
+{
+	uint32_t res;
+	do {
+		res = host->h_pidcount++;
+	} while (nlm_pidbusy(host, res) < 0);
+	return res;
+}
+
+static struct nlm_lockowner *__nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner)
+{
+	struct nlm_lockowner *lockowner;
+	list_for_each_entry(lockowner, &host->h_lockowners, list) {
+		if (lockowner->owner != owner)
+			continue;
+		return nlm_get_lockowner(lockowner);
+	}
+	return NULL;
+}
+
+static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner)
+{
+	struct nlm_lockowner *res, *new = NULL;
+
+	spin_lock(&host->h_lock);
+	res = __nlm_find_lockowner(host, owner);
+	if (res == NULL) {
+		spin_unlock(&host->h_lock);
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		spin_lock(&host->h_lock);
+		res = __nlm_find_lockowner(host, owner);
+		if (res == NULL && new != NULL) {
+			res = new;
+			atomic_set(&new->count, 1);
+			new->owner = owner;
+			new->pid = __nlm_alloc_pid(host);
+			new->host = nlm_get_host(host);
+			list_add(&new->list, &host->h_lockowners);
+			new = NULL;
+		}
+	}
+	spin_unlock(&host->h_lock);
+	kfree(new);
+	return res;
+}
+
+/*
+ * Initialize arguments for TEST/LOCK/UNLOCK/CANCEL calls
+ */
+static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
+{
+	struct nlm_args	*argp = &req->a_args;
+	struct nlm_lock	*lock = &argp->lock;
+	char *nodename = req->a_host->h_rpcclnt->cl_nodename;
+
+	nlmclnt_next_cookie(&argp->cookie);
+	memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
+	lock->caller  = nodename;
+	lock->oh.data = req->a_owner;
+	lock->oh.len  = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
+				(unsigned int)fl->fl_u.nfs_fl.owner->pid,
+				nodename);
+	lock->svid = fl->fl_u.nfs_fl.owner->pid;
+	lock->fl.fl_start = fl->fl_start;
+	lock->fl.fl_end = fl->fl_end;
+	lock->fl.fl_type = fl->fl_type;
+}
+
+static void nlmclnt_release_lockargs(struct nlm_rqst *req)
+{
+	WARN_ON_ONCE(req->a_args.lock.fl.fl_ops != NULL);
+}
+
+/**
+ * nlmclnt_proc - Perform a single client-side lock request
+ * @host: address of a valid nlm_host context representing the NLM server
+ * @cmd: fcntl-style file lock operation to perform
+ * @fl: address of arguments for the lock operation
+ *
+ */
+int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
+{
+	struct nlm_rqst		*call;
+	int			status;
+
+	call = nlm_alloc_call(host);
+	if (call == NULL)
+		return -ENOMEM;
+
+	nlmclnt_locks_init_private(fl, host);
+	if (!fl->fl_u.nfs_fl.owner) {
+		/* lockowner allocation has failed */
+		nlmclnt_release_call(call);
+		return -ENOMEM;
+	}
+	/* Set up the argument struct */
+	nlmclnt_setlockargs(call, fl);
+
+	if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
+		if (fl->fl_type != F_UNLCK) {
+			call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
+			status = nlmclnt_lock(call, fl);
+		} else
+			status = nlmclnt_unlock(call, fl);
+	} else if (IS_GETLK(cmd))
+		status = nlmclnt_test(call, fl);
+	else
+		status = -EINVAL;
+	fl->fl_ops->fl_release_private(fl);
+	fl->fl_ops = NULL;
+
+	dprintk("lockd: clnt proc returns %d\n", status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(nlmclnt_proc);
+
+/*
+ * Allocate an NLM RPC call struct
+ */
+struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
+{
+	struct nlm_rqst	*call;
+
+	for(;;) {
+		call = kzalloc(sizeof(*call), GFP_KERNEL);
+		if (call != NULL) {
+			atomic_set(&call->a_count, 1);
+			locks_init_lock(&call->a_args.lock.fl);
+			locks_init_lock(&call->a_res.lock.fl);
+			call->a_host = nlm_get_host(host);
+			return call;
+		}
+		if (signalled())
+			break;
+		printk("nlm_alloc_call: failed, waiting for memory\n");
+		schedule_timeout_interruptible(5*HZ);
+	}
+	return NULL;
+}
+
+void nlmclnt_release_call(struct nlm_rqst *call)
+{
+	if (!atomic_dec_and_test(&call->a_count))
+		return;
+	nlmclnt_release_host(call->a_host);
+	nlmclnt_release_lockargs(call);
+	kfree(call);
+}
+
+static void nlmclnt_rpc_release(void *data)
+{
+	nlmclnt_release_call(data);
+}
+
+static int nlm_wait_on_grace(wait_queue_head_t *queue)
+{
+	DEFINE_WAIT(wait);
+	int status = -EINTR;
+
+	prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE);
+	if (!signalled ()) {
+		schedule_timeout(NLMCLNT_GRACE_WAIT);
+		try_to_freeze();
+		if (!signalled ())
+			status = 0;
+	}
+	finish_wait(queue, &wait);
+	return status;
+}
+
+/*
+ * Generic NLM call
+ */
+static int
+nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc)
+{
+	struct nlm_host	*host = req->a_host;
+	struct rpc_clnt	*clnt;
+	struct nlm_args	*argp = &req->a_args;
+	struct nlm_res	*resp = &req->a_res;
+	struct rpc_message msg = {
+		.rpc_argp	= argp,
+		.rpc_resp	= resp,
+		.rpc_cred	= cred,
+	};
+	int		status;
+
+	dprintk("lockd: call procedure %d on %s\n",
+			(int)proc, host->h_name);
+
+	do {
+		if (host->h_reclaiming && !argp->reclaim)
+			goto in_grace_period;
+
+		/* If we have no RPC client yet, create one. */
+		if ((clnt = nlm_bind_host(host)) == NULL)
+			return -ENOLCK;
+		msg.rpc_proc = &clnt->cl_procinfo[proc];
+
+		/* Perform the RPC call. If an error occurs, try again */
+		if ((status = rpc_call_sync(clnt, &msg, 0)) < 0) {
+			dprintk("lockd: rpc_call returned error %d\n", -status);
+			switch (status) {
+			case -EPROTONOSUPPORT:
+				status = -EINVAL;
+				break;
+			case -ECONNREFUSED:
+			case -ETIMEDOUT:
+			case -ENOTCONN:
+				nlm_rebind_host(host);
+				status = -EAGAIN;
+				break;
+			case -ERESTARTSYS:
+				return signalled () ? -EINTR : status;
+			default:
+				break;
+			}
+			break;
+		} else
+		if (resp->status == nlm_lck_denied_grace_period) {
+			dprintk("lockd: server in grace period\n");
+			if (argp->reclaim) {
+				printk(KERN_WARNING
+				     "lockd: spurious grace period reject?!\n");
+				return -ENOLCK;
+			}
+		} else {
+			if (!argp->reclaim) {
+				/* We appear to be out of the grace period */
+				wake_up_all(&host->h_gracewait);
+			}
+			dprintk("lockd: server returns status %d\n",
+				ntohl(resp->status));
+			return 0;	/* Okay, call complete */
+		}
+
+in_grace_period:
+		/*
+		 * The server has rebooted and appears to be in the grace
+		 * period during which locks are only allowed to be
+		 * reclaimed.
+		 * We can only back off and try again later.
+		 */
+		status = nlm_wait_on_grace(&host->h_gracewait);
+	} while (status == 0);
+
+	return status;
+}
+
+/*
+ * Generic NLM call, async version.
+ */
+static struct rpc_task *__nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops)
+{
+	struct nlm_host	*host = req->a_host;
+	struct rpc_clnt	*clnt;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_message = msg,
+		.callback_ops = tk_ops,
+		.callback_data = req,
+		.flags = RPC_TASK_ASYNC,
+	};
+
+	dprintk("lockd: call procedure %d on %s (async)\n",
+			(int)proc, host->h_name);
+
+	/* If we have no RPC client yet, create one. */
+	clnt = nlm_bind_host(host);
+	if (clnt == NULL)
+		goto out_err;
+	msg->rpc_proc = &clnt->cl_procinfo[proc];
+	task_setup_data.rpc_client = clnt;
+
+        /* bootstrap and kick off the async RPC call */
+	return rpc_run_task(&task_setup_data);
+out_err:
+	tk_ops->rpc_release(req);
+	return ERR_PTR(-ENOLCK);
+}
+
+static int nlm_do_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_task *task;
+
+	task = __nlm_async_call(req, proc, msg, tk_ops);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
+}
+
+/*
+ * NLM asynchronous call.
+ */
+int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_message msg = {
+		.rpc_argp	= &req->a_args,
+		.rpc_resp	= &req->a_res,
+	};
+	return nlm_do_async_call(req, proc, &msg, tk_ops);
+}
+
+int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_message msg = {
+		.rpc_argp	= &req->a_res,
+	};
+	return nlm_do_async_call(req, proc, &msg, tk_ops);
+}
+
+/*
+ * NLM client asynchronous call.
+ *
+ * Note that although the calls are asynchronous, and are therefore
+ *      guaranteed to complete, we still always attempt to wait for
+ *      completion in order to be able to correctly track the lock
+ *      state.
+ */
+static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_message msg = {
+		.rpc_argp	= &req->a_args,
+		.rpc_resp	= &req->a_res,
+		.rpc_cred	= cred,
+	};
+	struct rpc_task *task;
+	int err;
+
+	task = __nlm_async_call(req, proc, &msg, tk_ops);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	err = rpc_wait_for_completion_task(task);
+	rpc_put_task(task);
+	return err;
+}
+
+/*
+ * TEST for the presence of a conflicting lock
+ */
+static int
+nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
+{
+	int	status;
+
+	status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST);
+	if (status < 0)
+		goto out;
+
+	switch (req->a_res.status) {
+		case nlm_granted:
+			fl->fl_type = F_UNLCK;
+			break;
+		case nlm_lck_denied:
+			/*
+			 * Report the conflicting lock back to the application.
+			 */
+			fl->fl_start = req->a_res.lock.fl.fl_start;
+			fl->fl_end = req->a_res.lock.fl.fl_end;
+			fl->fl_type = req->a_res.lock.fl.fl_type;
+			fl->fl_pid = 0;
+			break;
+		default:
+			status = nlm_stat_to_errno(req->a_res.status);
+	}
+out:
+	nlmclnt_release_call(req);
+	return status;
+}
+
+static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
+{
+	spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
+	new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
+	new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
+	list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
+	spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
+}
+
+static void nlmclnt_locks_release_private(struct file_lock *fl)
+{
+	spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
+	list_del(&fl->fl_u.nfs_fl.list);
+	spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
+	nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
+}
+
+static const struct file_lock_operations nlmclnt_lock_ops = {
+	.fl_copy_lock = nlmclnt_locks_copy_lock,
+	.fl_release_private = nlmclnt_locks_release_private,
+};
+
+static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
+{
+	fl->fl_u.nfs_fl.state = 0;
+	fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner);
+	INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
+	fl->fl_ops = &nlmclnt_lock_ops;
+}
+
+static int do_vfs_lock(struct file_lock *fl)
+{
+	int res = 0;
+	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+		case FL_POSIX:
+			res = posix_lock_file_wait(fl->fl_file, fl);
+			break;
+		case FL_FLOCK:
+			res = flock_lock_file_wait(fl->fl_file, fl);
+			break;
+		default:
+			BUG();
+	}
+	return res;
+}
+
+/*
+ * LOCK: Try to create a lock
+ *
+ *			Programmer Harassment Alert
+ *
+ * When given a blocking lock request in a sync RPC call, the HPUX lockd
+ * will faithfully return LCK_BLOCKED but never cares to notify us when
+ * the lock could be granted. This way, our local process could hang
+ * around forever waiting for the callback.
+ *
+ *  Solution A:	Implement busy-waiting
+ *  Solution B: Use the async version of the call (NLM_LOCK_{MSG,RES})
+ *
+ * For now I am implementing solution A, because I hate the idea of
+ * re-implementing lockd for a third time in two months. The async
+ * calls shouldn't be too hard to do, however.
+ *
+ * This is one of the lovely things about standards in the NFS area:
+ * they're so soft and squishy you can't really blame HP for doing this.
+ */
+static int
+nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
+{
+	struct rpc_cred *cred = nfs_file_cred(fl->fl_file);
+	struct nlm_host	*host = req->a_host;
+	struct nlm_res	*resp = &req->a_res;
+	struct nlm_wait *block = NULL;
+	unsigned char fl_flags = fl->fl_flags;
+	unsigned char fl_type;
+	int status = -ENOLCK;
+
+	if (nsm_monitor(host) < 0)
+		goto out;
+	req->a_args.state = nsm_local_state;
+
+	fl->fl_flags |= FL_ACCESS;
+	status = do_vfs_lock(fl);
+	fl->fl_flags = fl_flags;
+	if (status < 0)
+		goto out;
+
+	block = nlmclnt_prepare_block(host, fl);
+again:
+	/*
+	 * Initialise resp->status to a valid non-zero value,
+	 * since 0 == nlm_lck_granted
+	 */
+	resp->status = nlm_lck_blocked;
+	for(;;) {
+		/* Reboot protection */
+		fl->fl_u.nfs_fl.state = host->h_state;
+		status = nlmclnt_call(cred, req, NLMPROC_LOCK);
+		if (status < 0)
+			break;
+		/* Did a reclaimer thread notify us of a server reboot? */
+		if (resp->status ==  nlm_lck_denied_grace_period)
+			continue;
+		if (resp->status != nlm_lck_blocked)
+			break;
+		/* Wait on an NLM blocking lock */
+		status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
+		if (status < 0)
+			break;
+		if (resp->status != nlm_lck_blocked)
+			break;
+	}
+
+	/* if we were interrupted while blocking, then cancel the lock request
+	 * and exit
+	 */
+	if (resp->status == nlm_lck_blocked) {
+		if (!req->a_args.block)
+			goto out_unlock;
+		if (nlmclnt_cancel(host, req->a_args.block, fl) == 0)
+			goto out_unblock;
+	}
+
+	if (resp->status == nlm_granted) {
+		down_read(&host->h_rwsem);
+		/* Check whether or not the server has rebooted */
+		if (fl->fl_u.nfs_fl.state != host->h_state) {
+			up_read(&host->h_rwsem);
+			goto again;
+		}
+		/* Ensure the resulting lock will get added to granted list */
+		fl->fl_flags |= FL_SLEEP;
+		if (do_vfs_lock(fl) < 0)
+			printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
+		up_read(&host->h_rwsem);
+		fl->fl_flags = fl_flags;
+		status = 0;
+	}
+	if (status < 0)
+		goto out_unlock;
+	/*
+	 * EAGAIN doesn't make sense for sleeping locks, and in some
+	 * cases NLM_LCK_DENIED is returned for a permanent error.  So
+	 * turn it into an ENOLCK.
+	 */
+	if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP))
+		status = -ENOLCK;
+	else
+		status = nlm_stat_to_errno(resp->status);
+out_unblock:
+	nlmclnt_finish_block(block);
+out:
+	nlmclnt_release_call(req);
+	return status;
+out_unlock:
+	/* Fatal error: ensure that we remove the lock altogether */
+	dprintk("lockd: lock attempt ended in fatal error.\n"
+		"       Attempting to unlock.\n");
+	nlmclnt_finish_block(block);
+	fl_type = fl->fl_type;
+	fl->fl_type = F_UNLCK;
+	down_read(&host->h_rwsem);
+	do_vfs_lock(fl);
+	up_read(&host->h_rwsem);
+	fl->fl_type = fl_type;
+	fl->fl_flags = fl_flags;
+	nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+	return status;
+}
+
+/*
+ * RECLAIM: Try to reclaim a lock
+ */
+int
+nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl,
+		struct nlm_rqst *req)
+{
+	int		status;
+
+	memset(req, 0, sizeof(*req));
+	locks_init_lock(&req->a_args.lock.fl);
+	locks_init_lock(&req->a_res.lock.fl);
+	req->a_host  = host;
+
+	/* Set up the argument struct */
+	nlmclnt_setlockargs(req, fl);
+	req->a_args.reclaim = 1;
+
+	status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK);
+	if (status >= 0 && req->a_res.status == nlm_granted)
+		return 0;
+
+	printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d "
+				"(errno %d, status %d)\n", fl->fl_pid,
+				status, ntohl(req->a_res.status));
+
+	/*
+	 * FIXME: This is a serious failure. We can
+	 *
+	 *  a.	Ignore the problem
+	 *  b.	Send the owning process some signal (Linux doesn't have
+	 *	SIGLOST, though...)
+	 *  c.	Retry the operation
+	 *
+	 * Until someone comes up with a simple implementation
+	 * for b or c, I'll choose option a.
+	 */
+
+	return -ENOLCK;
+}
+
+/*
+ * UNLOCK: remove an existing lock
+ */
+static int
+nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
+{
+	struct nlm_host	*host = req->a_host;
+	struct nlm_res	*resp = &req->a_res;
+	int status;
+	unsigned char fl_flags = fl->fl_flags;
+
+	/*
+	 * Note: the server is supposed to either grant us the unlock
+	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
+	 * case, we want to unlock.
+	 */
+	fl->fl_flags |= FL_EXISTS;
+	down_read(&host->h_rwsem);
+	status = do_vfs_lock(fl);
+	up_read(&host->h_rwsem);
+	fl->fl_flags = fl_flags;
+	if (status == -ENOENT) {
+		status = 0;
+		goto out;
+	}
+
+	atomic_inc(&req->a_count);
+	status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
+			NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+	if (status < 0)
+		goto out;
+
+	if (resp->status == nlm_granted)
+		goto out;
+
+	if (resp->status != nlm_lck_denied_nolocks)
+		printk("lockd: unexpected unlock status: %d\n",
+			ntohl(resp->status));
+	/* What to do now? I'm out of my depth... */
+	status = -ENOLCK;
+out:
+	nlmclnt_release_call(req);
+	return status;
+}
+
+static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
+{
+	struct nlm_rqst	*req = data;
+	u32 status = ntohl(req->a_res.status);
+
+	if (RPC_ASSASSINATED(task))
+		goto die;
+
+	if (task->tk_status < 0) {
+		dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status);
+		switch (task->tk_status) {
+		case -EACCES:
+		case -EIO:
+			goto die;
+		default:
+			goto retry_rebind;
+		}
+	}
+	if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
+		rpc_delay(task, NLMCLNT_GRACE_WAIT);
+		goto retry_unlock;
+	}
+	if (status != NLM_LCK_GRANTED)
+		printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status);
+die:
+	return;
+ retry_rebind:
+	nlm_rebind_host(req->a_host);
+ retry_unlock:
+	rpc_restart_call(task);
+}
+
+static const struct rpc_call_ops nlmclnt_unlock_ops = {
+	.rpc_call_done = nlmclnt_unlock_callback,
+	.rpc_release = nlmclnt_rpc_release,
+};
+
+/*
+ * Cancel a blocked lock request.
+ * We always use an async RPC call for this in order not to hang a
+ * process that has been Ctrl-C'ed.
+ */
+static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl)
+{
+	struct nlm_rqst	*req;
+	int status;
+
+	dprintk("lockd: blocking lock attempt was interrupted by a signal.\n"
+		"       Attempting to cancel lock.\n");
+
+	req = nlm_alloc_call(host);
+	if (!req)
+		return -ENOMEM;
+	req->a_flags = RPC_TASK_ASYNC;
+
+	nlmclnt_setlockargs(req, fl);
+	req->a_args.block = block;
+
+	atomic_inc(&req->a_count);
+	status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
+			NLMPROC_CANCEL, &nlmclnt_cancel_ops);
+	if (status == 0 && req->a_res.status == nlm_lck_denied)
+		status = -ENOLCK;
+	nlmclnt_release_call(req);
+	return status;
+}
+
+static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
+{
+	struct nlm_rqst	*req = data;
+	u32 status = ntohl(req->a_res.status);
+
+	if (RPC_ASSASSINATED(task))
+		goto die;
+
+	if (task->tk_status < 0) {
+		dprintk("lockd: CANCEL call error %d, retrying.\n",
+					task->tk_status);
+		goto retry_cancel;
+	}
+
+	dprintk("lockd: cancel status %u (task %u)\n",
+			status, task->tk_pid);
+
+	switch (status) {
+	case NLM_LCK_GRANTED:
+	case NLM_LCK_DENIED_GRACE_PERIOD:
+	case NLM_LCK_DENIED:
+		/* Everything's good */
+		break;
+	case NLM_LCK_DENIED_NOLOCKS:
+		dprintk("lockd: CANCEL failed (server has no locks)\n");
+		goto retry_cancel;
+	default:
+		printk(KERN_NOTICE "lockd: weird return %d for CANCEL call\n",
+			status);
+	}
+
+die:
+	return;
+
+retry_cancel:
+	/* Don't ever retry more than 3 times */
+	if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
+		goto die;
+	nlm_rebind_host(req->a_host);
+	rpc_restart_call(task);
+	rpc_delay(task, 30 * HZ);
+}
+
+static const struct rpc_call_ops nlmclnt_cancel_ops = {
+	.rpc_call_done = nlmclnt_cancel_callback,
+	.rpc_release = nlmclnt_rpc_release,
+};
+
+/*
+ * Convert an NLM status code to a generic kernel errno
+ */
+static int
+nlm_stat_to_errno(__be32 status)
+{
+	switch(ntohl(status)) {
+	case NLM_LCK_GRANTED:
+		return 0;
+	case NLM_LCK_DENIED:
+		return -EAGAIN;
+	case NLM_LCK_DENIED_NOLOCKS:
+	case NLM_LCK_DENIED_GRACE_PERIOD:
+		return -ENOLCK;
+	case NLM_LCK_BLOCKED:
+		printk(KERN_NOTICE "lockd: unexpected status NLM_BLOCKED\n");
+		return -ENOLCK;
+#ifdef CONFIG_LOCKD_V4
+	case NLM_DEADLCK:
+		return -EDEADLK;
+	case NLM_ROFS:
+		return -EROFS;
+	case NLM_STALE_FH:
+		return -ESTALE;
+	case NLM_FBIG:
+		return -EOVERFLOW;
+	case NLM_FAILED:
+		return -ENOLCK;
+#endif
+	}
+	printk(KERN_NOTICE "lockd: unexpected server status %d\n",
+		 ntohl(status));
+	return -ENOLCK;
+}
diff --git a/kernel/fs/lockd/clntxdr.c b/kernel/fs/lockd/clntxdr.c
new file mode 100644
index 000000000..3e9f7874b
--- /dev/null
+++ b/kernel/fs/lockd/clntxdr.c
@@ -0,0 +1,621 @@
+/*
+ * linux/fs/lockd/clntxdr.c
+ *
+ * XDR functions to encode/decode NLM version 3 RPC arguments and results.
+ * NLM version 3 is backwards compatible with NLM versions 1 and 2.
+ *
+ * NLM client-side only.
+ *
+ * Copyright (C) 2010, Oracle.  All rights reserved.
+ */
+
+#include <linux/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/lockd/lockd.h>
+
+#include <uapi/linux/nfs2.h>
+
+#define NLMDBG_FACILITY		NLMDBG_XDR
+
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+
+/*
+ * Declare the space requirements for NLM arguments and replies as
+ * number of 32bit-words
+ */
+#define NLM_cookie_sz		(1+(NLM_MAXCOOKIELEN>>2))
+#define NLM_caller_sz		(1+(NLMCLNT_OHSIZE>>2))
+#define NLM_owner_sz		(1+(NLMCLNT_OHSIZE>>2))
+#define NLM_fhandle_sz		(1+(NFS2_FHSIZE>>2))
+#define NLM_lock_sz		(3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
+#define NLM_holder_sz		(4+NLM_owner_sz)
+
+#define NLM_testargs_sz		(NLM_cookie_sz+1+NLM_lock_sz)
+#define NLM_lockargs_sz		(NLM_cookie_sz+4+NLM_lock_sz)
+#define NLM_cancargs_sz		(NLM_cookie_sz+2+NLM_lock_sz)
+#define NLM_unlockargs_sz	(NLM_cookie_sz+NLM_lock_sz)
+
+#define NLM_testres_sz		(NLM_cookie_sz+1+NLM_holder_sz)
+#define NLM_res_sz		(NLM_cookie_sz+1)
+#define NLM_norep_sz		(0)
+
+
+static s32 loff_t_to_s32(loff_t offset)
+{
+	s32 res;
+
+	if (offset >= NLM_OFFSET_MAX)
+		res = NLM_OFFSET_MAX;
+	else if (offset <= -NLM_OFFSET_MAX)
+		res = -NLM_OFFSET_MAX;
+	else
+		res = offset;
+	return res;
+}
+
+static void nlm_compute_offsets(const struct nlm_lock *lock,
+				u32 *l_offset, u32 *l_len)
+{
+	const struct file_lock *fl = &lock->fl;
+
+	*l_offset = loff_t_to_s32(fl->fl_start);
+	if (fl->fl_end == OFFSET_MAX)
+		*l_len = 0;
+	else
+		*l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
+}
+
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("lockd: %s prematurely hit the end of our receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+
+/*
+ * Encode/decode NLMv3 basic data types
+ *
+ * Basic NLMv3 data types are not defined in an IETF standards
+ * document.  X/Open has a description of these data types that
+ * is useful.  See Chapter 10 of "Protocols for Interworking:
+ * XNFS, Version 3W".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+static void encode_bool(struct xdr_stream *xdr, const int value)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = value ? xdr_one : xdr_zero;
+}
+
+static void encode_int32(struct xdr_stream *xdr, const s32 value)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(value);
+}
+
+/*
+ *	typedef opaque netobj<MAXNETOBJ_SZ>
+ */
+static void encode_netobj(struct xdr_stream *xdr,
+			  const u8 *data, const unsigned int length)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, data, length);
+}
+
+static int decode_netobj(struct xdr_stream *xdr,
+			 struct xdr_netobj *obj)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p++);
+	if (unlikely(length > XDR_MAX_NETOBJ))
+		goto out_size;
+	obj->len = length;
+	obj->data = (u8 *)p;
+	return 0;
+out_size:
+	dprintk("NFS: returned netobj was too long: %u\n", length);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	netobj cookie;
+ */
+static void encode_cookie(struct xdr_stream *xdr,
+			  const struct nlm_cookie *cookie)
+{
+	encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
+}
+
+static int decode_cookie(struct xdr_stream *xdr,
+			 struct nlm_cookie *cookie)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p++);
+	/* apparently HPUX can return empty cookies */
+	if (length == 0)
+		goto out_hpux;
+	if (length > NLM_MAXCOOKIELEN)
+		goto out_size;
+	p = xdr_inline_decode(xdr, length);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	cookie->len = length;
+	memcpy(cookie->data, p, length);
+	return 0;
+out_hpux:
+	cookie->len = 4;
+	memset(cookie->data, 0, 4);
+	return 0;
+out_size:
+	dprintk("NFS: returned cookie was too long: %u\n", length);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	netobj fh;
+ */
+static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
+}
+
+/*
+ *	enum nlm_stats {
+ *		LCK_GRANTED = 0,
+ *		LCK_DENIED = 1,
+ *		LCK_DENIED_NOLOCKS = 2,
+ *		LCK_BLOCKED = 3,
+ *		LCK_DENIED_GRACE_PERIOD = 4
+ *	};
+ *
+ *
+ *	struct nlm_stat {
+ *		nlm_stats stat;
+ *	};
+ *
+ * NB: we don't swap bytes for the NLM status values.  The upper
+ * layers deal directly with the status value in network byte
+ * order.
+ */
+
+static void encode_nlm_stat(struct xdr_stream *xdr,
+			    const __be32 stat)
+{
+	__be32 *p;
+
+	WARN_ON_ONCE(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
+	p = xdr_reserve_space(xdr, 4);
+	*p = stat;
+}
+
+static int decode_nlm_stat(struct xdr_stream *xdr,
+			   __be32 *stat)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (unlikely(ntohl(*p) > ntohl(nlm_lck_denied_grace_period)))
+		goto out_enum;
+	*stat = *p;
+	return 0;
+out_enum:
+	dprintk("%s: server returned invalid nlm_stats value: %u\n",
+		__func__, be32_to_cpup(p));
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	struct nlm_holder {
+ *		bool exclusive;
+ *		int uppid;
+ *		netobj oh;
+ *		unsigned l_offset;
+ *		unsigned l_len;
+ *	};
+ */
+static void encode_nlm_holder(struct xdr_stream *xdr,
+			      const struct nlm_res *result)
+{
+	const struct nlm_lock *lock = &result->lock;
+	u32 l_offset, l_len;
+	__be32 *p;
+
+	encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+	encode_int32(xdr, lock->svid);
+	encode_netobj(xdr, lock->oh.data, lock->oh.len);
+
+	p = xdr_reserve_space(xdr, 4 + 4);
+	nlm_compute_offsets(lock, &l_offset, &l_len);
+	*p++ = cpu_to_be32(l_offset);
+	*p   = cpu_to_be32(l_len);
+}
+
+static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
+{
+	struct nlm_lock *lock = &result->lock;
+	struct file_lock *fl = &lock->fl;
+	u32 exclusive, l_offset, l_len;
+	int error;
+	__be32 *p;
+	s32 end;
+
+	memset(lock, 0, sizeof(*lock));
+	locks_init_lock(fl);
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	exclusive = be32_to_cpup(p++);
+	lock->svid = be32_to_cpup(p);
+	fl->fl_pid = (pid_t)lock->svid;
+
+	error = decode_netobj(xdr, &lock->oh);
+	if (unlikely(error))
+		goto out;
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
+	fl->fl_flags = FL_POSIX;
+	fl->fl_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
+	l_offset = be32_to_cpup(p++);
+	l_len = be32_to_cpup(p);
+	end = l_offset + l_len - 1;
+
+	fl->fl_start = (loff_t)l_offset;
+	if (l_len == 0 || end < 0)
+		fl->fl_end = OFFSET_MAX;
+	else
+		fl->fl_end = (loff_t)end;
+	error = 0;
+out:
+	return error;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	string caller_name<LM_MAXSTRLEN>;
+ */
+static void encode_caller_name(struct xdr_stream *xdr, const char *name)
+{
+	/* NB: client-side does not set lock->len */
+	u32 length = strlen(name);
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, name, length);
+}
+
+/*
+ *	struct nlm_lock {
+ *		string caller_name<LM_MAXSTRLEN>;
+ *		netobj fh;
+ *		netobj oh;
+ *		int uppid;
+ *		unsigned l_offset;
+ *		unsigned l_len;
+ *	};
+ */
+static void encode_nlm_lock(struct xdr_stream *xdr,
+			    const struct nlm_lock *lock)
+{
+	u32 l_offset, l_len;
+	__be32 *p;
+
+	encode_caller_name(xdr, lock->caller);
+	encode_fh(xdr, &lock->fh);
+	encode_netobj(xdr, lock->oh.data, lock->oh.len);
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4);
+	*p++ = cpu_to_be32(lock->svid);
+
+	nlm_compute_offsets(lock, &l_offset, &l_len);
+	*p++ = cpu_to_be32(l_offset);
+	*p   = cpu_to_be32(l_len);
+}
+
+
+/*
+ * NLMv3 XDR encode functions
+ *
+ * NLMv3 argument types are defined in Chapter 10 of The Open Group's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+
+/*
+ *	struct nlm_testargs {
+ *		netobj cookie;
+ *		bool exclusive;
+ *		struct nlm_lock alock;
+ *	};
+ */
+static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 const struct nlm_args *args)
+{
+	const struct nlm_lock *lock = &args->lock;
+
+	encode_cookie(xdr, &args->cookie);
+	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_nlm_lock(xdr, lock);
+}
+
+/*
+ *	struct nlm_lockargs {
+ *		netobj cookie;
+ *		bool block;
+ *		bool exclusive;
+ *		struct nlm_lock alock;
+ *		bool reclaim;
+ *		int state;
+ *	};
+ */
+static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 const struct nlm_args *args)
+{
+	const struct nlm_lock *lock = &args->lock;
+
+	encode_cookie(xdr, &args->cookie);
+	encode_bool(xdr, args->block);
+	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_nlm_lock(xdr, lock);
+	encode_bool(xdr, args->reclaim);
+	encode_int32(xdr, args->state);
+}
+
+/*
+ *	struct nlm_cancargs {
+ *		netobj cookie;
+ *		bool block;
+ *		bool exclusive;
+ *		struct nlm_lock alock;
+ *	};
+ */
+static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 const struct nlm_args *args)
+{
+	const struct nlm_lock *lock = &args->lock;
+
+	encode_cookie(xdr, &args->cookie);
+	encode_bool(xdr, args->block);
+	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_nlm_lock(xdr, lock);
+}
+
+/*
+ *	struct nlm_unlockargs {
+ *		netobj cookie;
+ *		struct nlm_lock alock;
+ *	};
+ */
+static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nlm_args *args)
+{
+	const struct nlm_lock *lock = &args->lock;
+
+	encode_cookie(xdr, &args->cookie);
+	encode_nlm_lock(xdr, lock);
+}
+
+/*
+ *	struct nlm_res {
+ *		netobj cookie;
+ *		nlm_stat stat;
+ *	};
+ */
+static void nlm_xdr_enc_res(struct rpc_rqst *req,
+			    struct xdr_stream *xdr,
+			    const struct nlm_res *result)
+{
+	encode_cookie(xdr, &result->cookie);
+	encode_nlm_stat(xdr, result->status);
+}
+
+/*
+ *	union nlm_testrply switch (nlm_stats stat) {
+ *	case LCK_DENIED:
+ *		struct nlm_holder holder;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct nlm_testres {
+ *		netobj cookie;
+ *		nlm_testrply test_stat;
+ *	};
+ */
+static void encode_nlm_testrply(struct xdr_stream *xdr,
+				const struct nlm_res *result)
+{
+	if (result->status == nlm_lck_denied)
+		encode_nlm_holder(xdr, result);
+}
+
+static void nlm_xdr_enc_testres(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				const struct nlm_res *result)
+{
+	encode_cookie(xdr, &result->cookie);
+	encode_nlm_stat(xdr, result->status);
+	encode_nlm_testrply(xdr, result);
+}
+
+
+/*
+ * NLMv3 XDR decode functions
+ *
+ * NLMv3 result types are defined in Chapter 10 of The Open Group's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+
+/*
+ *	union nlm_testrply switch (nlm_stats stat) {
+ *	case LCK_DENIED:
+ *		struct nlm_holder holder;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct nlm_testres {
+ *		netobj cookie;
+ *		nlm_testrply test_stat;
+ *	};
+ */
+static int decode_nlm_testrply(struct xdr_stream *xdr,
+			       struct nlm_res *result)
+{
+	int error;
+
+	error = decode_nlm_stat(xdr, &result->status);
+	if (unlikely(error))
+		goto out;
+	if (result->status == nlm_lck_denied)
+		error = decode_nlm_holder(xdr, result);
+out:
+	return error;
+}
+
+static int nlm_xdr_dec_testres(struct rpc_rqst *req,
+			       struct xdr_stream *xdr,
+			       struct nlm_res *result)
+{
+	int error;
+
+	error = decode_cookie(xdr, &result->cookie);
+	if (unlikely(error))
+		goto out;
+	error = decode_nlm_testrply(xdr, result);
+out:
+	return error;
+}
+
+/*
+ *	struct nlm_res {
+ *		netobj cookie;
+ *		nlm_stat stat;
+ *	};
+ */
+static int nlm_xdr_dec_res(struct rpc_rqst *req,
+			   struct xdr_stream *xdr,
+			   struct nlm_res *result)
+{
+	int error;
+
+	error = decode_cookie(xdr, &result->cookie);
+	if (unlikely(error))
+		goto out;
+	error = decode_nlm_stat(xdr, &result->status);
+out:
+	return error;
+}
+
+
+/*
+ * For NLM, a void procedure really returns nothing
+ */
+#define nlm_xdr_dec_norep	NULL
+
+#define PROC(proc, argtype, restype)	\
+[NLMPROC_##proc] = {							\
+	.p_proc      = NLMPROC_##proc,					\
+	.p_encode    = (kxdreproc_t)nlm_xdr_enc_##argtype,		\
+	.p_decode    = (kxdrdproc_t)nlm_xdr_dec_##restype,		\
+	.p_arglen    = NLM_##argtype##_sz,				\
+	.p_replen    = NLM_##restype##_sz,				\
+	.p_statidx   = NLMPROC_##proc,					\
+	.p_name      = #proc,						\
+	}
+
+static struct rpc_procinfo	nlm_procedures[] = {
+	PROC(TEST,		testargs,	testres),
+	PROC(LOCK,		lockargs,	res),
+	PROC(CANCEL,		cancargs,	res),
+	PROC(UNLOCK,		unlockargs,	res),
+	PROC(GRANTED,		testargs,	res),
+	PROC(TEST_MSG,		testargs,	norep),
+	PROC(LOCK_MSG,		lockargs,	norep),
+	PROC(CANCEL_MSG,	cancargs,	norep),
+	PROC(UNLOCK_MSG,	unlockargs,	norep),
+	PROC(GRANTED_MSG,	testargs,	norep),
+	PROC(TEST_RES,		testres,	norep),
+	PROC(LOCK_RES,		res,		norep),
+	PROC(CANCEL_RES,	res,		norep),
+	PROC(UNLOCK_RES,	res,		norep),
+	PROC(GRANTED_RES,	res,		norep),
+};
+
+static const struct rpc_version	nlm_version1 = {
+		.number		= 1,
+		.nrprocs	= ARRAY_SIZE(nlm_procedures),
+		.procs		= nlm_procedures,
+};
+
+static const struct rpc_version	nlm_version3 = {
+		.number		= 3,
+		.nrprocs	= ARRAY_SIZE(nlm_procedures),
+		.procs		= nlm_procedures,
+};
+
+static const struct rpc_version	*nlm_versions[] = {
+	[1] = &nlm_version1,
+	[3] = &nlm_version3,
+#ifdef CONFIG_LOCKD_V4
+	[4] = &nlm_version4,
+#endif
+};
+
+static struct rpc_stat		nlm_rpc_stats;
+
+const struct rpc_program	nlm_program = {
+		.name		= "lockd",
+		.number		= NLM_PROGRAM,
+		.nrvers		= ARRAY_SIZE(nlm_versions),
+		.version	= nlm_versions,
+		.stats		= &nlm_rpc_stats,
+};
diff --git a/kernel/fs/lockd/host.c b/kernel/fs/lockd/host.c
new file mode 100644
index 000000000..969d589c8
--- /dev/null
+++ b/kernel/fs/lockd/host.c
@@ -0,0 +1,675 @@
+/*
+ * linux/fs/lockd/host.c
+ *
+ * Management for NLM peer hosts. The nlm_host struct is shared
+ * between client and server implementation. The only reason to
+ * do so is to reduce code bloat.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/lockd/lockd.h>
+#include <linux/mutex.h>
+
+#include <linux/sunrpc/svc_xprt.h>
+
+#include <net/ipv6.h>
+
+#include "netns.h"
+
+#define NLMDBG_FACILITY		NLMDBG_HOSTCACHE
+#define NLM_HOST_NRHASH		32
+#define NLM_HOST_REBIND		(60 * HZ)
+#define NLM_HOST_EXPIRE		(300 * HZ)
+#define NLM_HOST_COLLECT	(120 * HZ)
+
+static struct hlist_head	nlm_server_hosts[NLM_HOST_NRHASH];
+static struct hlist_head	nlm_client_hosts[NLM_HOST_NRHASH];
+
+#define for_each_host(host, chain, table) \
+	for ((chain) = (table); \
+	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+		hlist_for_each_entry((host), (chain), h_hash)
+
+#define for_each_host_safe(host, next, chain, table) \
+	for ((chain) = (table); \
+	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+		hlist_for_each_entry_safe((host), (next), \
+						(chain), h_hash)
+
+static unsigned long		nrhosts;
+static DEFINE_MUTEX(nlm_host_mutex);
+
+static void			nlm_gc_hosts(struct net *net);
+
+struct nlm_lookup_host_info {
+	const int		server;		/* search for server|client */
+	const struct sockaddr	*sap;		/* address to search for */
+	const size_t		salen;		/* it's length */
+	const unsigned short	protocol;	/* transport to search for*/
+	const u32		version;	/* NLM version to search for */
+	const char		*hostname;	/* remote's hostname */
+	const size_t		hostname_len;	/* it's length */
+	const int		noresvport;	/* use non-priv port */
+	struct net		*net;		/* network namespace to bind */
+};
+
+/*
+ * Hash function must work well on big- and little-endian platforms
+ */
+static unsigned int __nlm_hash32(const __be32 n)
+{
+	unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
+	return hash ^ (hash >> 8);
+}
+
+static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	return __nlm_hash32(sin->sin_addr.s_addr);
+}
+
+static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	const struct in6_addr addr = sin6->sin6_addr;
+	return __nlm_hash32(addr.s6_addr32[0]) ^
+	       __nlm_hash32(addr.s6_addr32[1]) ^
+	       __nlm_hash32(addr.s6_addr32[2]) ^
+	       __nlm_hash32(addr.s6_addr32[3]);
+}
+
+static unsigned int nlm_hash_address(const struct sockaddr *sap)
+{
+	unsigned int hash;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		hash = __nlm_hash_addr4(sap);
+		break;
+	case AF_INET6:
+		hash = __nlm_hash_addr6(sap);
+		break;
+	default:
+		hash = 0;
+	}
+	return hash & (NLM_HOST_NRHASH - 1);
+}
+
+/*
+ * Allocate and initialize an nlm_host.  Common to both client and server.
+ */
+static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
+				       struct nsm_handle *nsm)
+{
+	struct nlm_host *host = NULL;
+	unsigned long now = jiffies;
+
+	if (nsm != NULL)
+		atomic_inc(&nsm->sm_count);
+	else {
+		host = NULL;
+		nsm = nsm_get_handle(ni->sap, ni->salen,
+					ni->hostname, ni->hostname_len);
+		if (unlikely(nsm == NULL)) {
+			dprintk("lockd: %s failed; no nsm handle\n",
+				__func__);
+			goto out;
+		}
+	}
+
+	host = kmalloc(sizeof(*host), GFP_KERNEL);
+	if (unlikely(host == NULL)) {
+		dprintk("lockd: %s failed; no memory\n", __func__);
+		nsm_release(nsm);
+		goto out;
+	}
+
+	memcpy(nlm_addr(host), ni->sap, ni->salen);
+	host->h_addrlen    = ni->salen;
+	rpc_set_port(nlm_addr(host), 0);
+	host->h_srcaddrlen = 0;
+
+	host->h_rpcclnt    = NULL;
+	host->h_name	   = nsm->sm_name;
+	host->h_version    = ni->version;
+	host->h_proto      = ni->protocol;
+	host->h_reclaiming = 0;
+	host->h_server     = ni->server;
+	host->h_noresvport = ni->noresvport;
+	host->h_inuse      = 0;
+	init_waitqueue_head(&host->h_gracewait);
+	init_rwsem(&host->h_rwsem);
+	host->h_state      = 0;
+	host->h_nsmstate   = 0;
+	host->h_pidcount   = 0;
+	atomic_set(&host->h_count, 1);
+	mutex_init(&host->h_mutex);
+	host->h_nextrebind = now + NLM_HOST_REBIND;
+	host->h_expires    = now + NLM_HOST_EXPIRE;
+	INIT_LIST_HEAD(&host->h_lockowners);
+	spin_lock_init(&host->h_lock);
+	INIT_LIST_HEAD(&host->h_granted);
+	INIT_LIST_HEAD(&host->h_reclaim);
+	host->h_nsmhandle  = nsm;
+	host->h_addrbuf    = nsm->sm_addrbuf;
+	host->net	   = ni->net;
+
+out:
+	return host;
+}
+
+/*
+ * Destroy an nlm_host and free associated resources
+ *
+ * Caller must hold nlm_host_mutex.
+ */
+static void nlm_destroy_host_locked(struct nlm_host *host)
+{
+	struct rpc_clnt	*clnt;
+	struct lockd_net *ln = net_generic(host->net, lockd_net_id);
+
+	dprintk("lockd: destroy host %s\n", host->h_name);
+
+	hlist_del_init(&host->h_hash);
+
+	nsm_unmonitor(host);
+	nsm_release(host->h_nsmhandle);
+
+	clnt = host->h_rpcclnt;
+	if (clnt != NULL)
+		rpc_shutdown_client(clnt);
+	kfree(host);
+
+	ln->nrhosts--;
+	nrhosts--;
+}
+
+/**
+ * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
+ * @sap: network address of server
+ * @salen: length of server address
+ * @protocol: transport protocol to use
+ * @version: NLM protocol version
+ * @hostname: '\0'-terminated hostname of server
+ * @noresvport: 1 if non-privileged port should be used
+ *
+ * Returns an nlm_host structure that matches the passed-in
+ * [server address, transport protocol, NLM version, server hostname].
+ * If one doesn't already exist in the host cache, a new handle is
+ * created and returned.
+ */
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
+				     const size_t salen,
+				     const unsigned short protocol,
+				     const u32 version,
+				     const char *hostname,
+				     int noresvport,
+				     struct net *net)
+{
+	struct nlm_lookup_host_info ni = {
+		.server		= 0,
+		.sap		= sap,
+		.salen		= salen,
+		.protocol	= protocol,
+		.version	= version,
+		.hostname	= hostname,
+		.hostname_len	= strlen(hostname),
+		.noresvport	= noresvport,
+		.net		= net,
+	};
+	struct hlist_head *chain;
+	struct nlm_host	*host;
+	struct nsm_handle *nsm = NULL;
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+	dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
+			(hostname ? hostname : "<none>"), version,
+			(protocol == IPPROTO_UDP ? "udp" : "tcp"));
+
+	mutex_lock(&nlm_host_mutex);
+
+	chain = &nlm_client_hosts[nlm_hash_address(sap)];
+	hlist_for_each_entry(host, chain, h_hash) {
+		if (host->net != net)
+			continue;
+		if (!rpc_cmp_addr(nlm_addr(host), sap))
+			continue;
+
+		/* Same address. Share an NSM handle if we already have one */
+		if (nsm == NULL)
+			nsm = host->h_nsmhandle;
+
+		if (host->h_proto != protocol)
+			continue;
+		if (host->h_version != version)
+			continue;
+
+		nlm_get_host(host);
+		dprintk("lockd: %s found host %s (%s)\n", __func__,
+			host->h_name, host->h_addrbuf);
+		goto out;
+	}
+
+	host = nlm_alloc_host(&ni, nsm);
+	if (unlikely(host == NULL))
+		goto out;
+
+	hlist_add_head(&host->h_hash, chain);
+	ln->nrhosts++;
+	nrhosts++;
+
+	dprintk("lockd: %s created host %s (%s)\n", __func__,
+		host->h_name, host->h_addrbuf);
+
+out:
+	mutex_unlock(&nlm_host_mutex);
+	return host;
+}
+
+/**
+ * nlmclnt_release_host - release client nlm_host
+ * @host: nlm_host to release
+ *
+ */
+void nlmclnt_release_host(struct nlm_host *host)
+{
+	if (host == NULL)
+		return;
+
+	dprintk("lockd: release client host %s\n", host->h_name);
+
+	WARN_ON_ONCE(host->h_server);
+
+	if (atomic_dec_and_test(&host->h_count)) {
+		WARN_ON_ONCE(!list_empty(&host->h_lockowners));
+		WARN_ON_ONCE(!list_empty(&host->h_granted));
+		WARN_ON_ONCE(!list_empty(&host->h_reclaim));
+
+		mutex_lock(&nlm_host_mutex);
+		nlm_destroy_host_locked(host);
+		mutex_unlock(&nlm_host_mutex);
+	}
+}
+
+/**
+ * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
+ * @rqstp: incoming NLM request
+ * @hostname: name of client host
+ * @hostname_len: length of client hostname
+ *
+ * Returns an nlm_host structure that matches the [client address,
+ * transport protocol, NLM version, client hostname] of the passed-in
+ * NLM request.  If one doesn't already exist in the host cache, a
+ * new handle is created and returned.
+ *
+ * Before possibly creating a new nlm_host, construct a sockaddr
+ * for a specific source address in case the local system has
+ * multiple network addresses.  The family of the address in
+ * rq_daddr is guaranteed to be the same as the family of the
+ * address in rq_addr, so it's safe to use the same family for
+ * the source address.
+ */
+struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
+				    const char *hostname,
+				    const size_t hostname_len)
+{
+	struct hlist_head *chain;
+	struct nlm_host	*host = NULL;
+	struct nsm_handle *nsm = NULL;
+	struct sockaddr *src_sap = svc_daddr(rqstp);
+	size_t src_len = rqstp->rq_daddrlen;
+	struct net *net = SVC_NET(rqstp);
+	struct nlm_lookup_host_info ni = {
+		.server		= 1,
+		.sap		= svc_addr(rqstp),
+		.salen		= rqstp->rq_addrlen,
+		.protocol	= rqstp->rq_prot,
+		.version	= rqstp->rq_vers,
+		.hostname	= hostname,
+		.hostname_len	= hostname_len,
+		.net		= net,
+	};
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+	dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
+			(int)hostname_len, hostname, rqstp->rq_vers,
+			(rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
+
+	mutex_lock(&nlm_host_mutex);
+
+	if (time_after_eq(jiffies, ln->next_gc))
+		nlm_gc_hosts(net);
+
+	chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
+	hlist_for_each_entry(host, chain, h_hash) {
+		if (host->net != net)
+			continue;
+		if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
+			continue;
+
+		/* Same address. Share an NSM handle if we already have one */
+		if (nsm == NULL)
+			nsm = host->h_nsmhandle;
+
+		if (host->h_proto != ni.protocol)
+			continue;
+		if (host->h_version != ni.version)
+			continue;
+		if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
+			continue;
+
+		/* Move to head of hash chain. */
+		hlist_del(&host->h_hash);
+		hlist_add_head(&host->h_hash, chain);
+
+		nlm_get_host(host);
+		dprintk("lockd: %s found host %s (%s)\n",
+			__func__, host->h_name, host->h_addrbuf);
+		goto out;
+	}
+
+	host = nlm_alloc_host(&ni, nsm);
+	if (unlikely(host == NULL))
+		goto out;
+
+	memcpy(nlm_srcaddr(host), src_sap, src_len);
+	host->h_srcaddrlen = src_len;
+	hlist_add_head(&host->h_hash, chain);
+	ln->nrhosts++;
+	nrhosts++;
+
+	dprintk("lockd: %s created host %s (%s)\n",
+		__func__, host->h_name, host->h_addrbuf);
+
+out:
+	mutex_unlock(&nlm_host_mutex);
+	return host;
+}
+
+/**
+ * nlmsvc_release_host - release server nlm_host
+ * @host: nlm_host to release
+ *
+ * Host is destroyed later in nlm_gc_host().
+ */
+void nlmsvc_release_host(struct nlm_host *host)
+{
+	if (host == NULL)
+		return;
+
+	dprintk("lockd: release server host %s\n", host->h_name);
+
+	WARN_ON_ONCE(!host->h_server);
+	atomic_dec(&host->h_count);
+}
+
+/*
+ * Create the NLM RPC client for an NLM peer
+ */
+struct rpc_clnt *
+nlm_bind_host(struct nlm_host *host)
+{
+	struct rpc_clnt	*clnt;
+
+	dprintk("lockd: nlm_bind_host %s (%s)\n",
+			host->h_name, host->h_addrbuf);
+
+	/* Lock host handle */
+	mutex_lock(&host->h_mutex);
+
+	/* If we've already created an RPC client, check whether
+	 * RPC rebind is required
+	 */
+	if ((clnt = host->h_rpcclnt) != NULL) {
+		if (time_after_eq(jiffies, host->h_nextrebind)) {
+			rpc_force_rebind(clnt);
+			host->h_nextrebind = jiffies + NLM_HOST_REBIND;
+			dprintk("lockd: next rebind in %lu jiffies\n",
+					host->h_nextrebind - jiffies);
+		}
+	} else {
+		unsigned long increment = nlmsvc_timeout;
+		struct rpc_timeout timeparms = {
+			.to_initval	= increment,
+			.to_increment	= increment,
+			.to_maxval	= increment * 6UL,
+			.to_retries	= 5U,
+		};
+		struct rpc_create_args args = {
+			.net		= host->net,
+			.protocol	= host->h_proto,
+			.address	= nlm_addr(host),
+			.addrsize	= host->h_addrlen,
+			.timeout	= &timeparms,
+			.servername	= host->h_name,
+			.program	= &nlm_program,
+			.version	= host->h_version,
+			.authflavor	= RPC_AUTH_UNIX,
+			.flags		= (RPC_CLNT_CREATE_NOPING |
+					   RPC_CLNT_CREATE_AUTOBIND),
+		};
+
+		/*
+		 * lockd retries server side blocks automatically so we want
+		 * those to be soft RPC calls. Client side calls need to be
+		 * hard RPC tasks.
+		 */
+		if (!host->h_server)
+			args.flags |= RPC_CLNT_CREATE_HARDRTRY;
+		if (host->h_noresvport)
+			args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+		if (host->h_srcaddrlen)
+			args.saddress = nlm_srcaddr(host);
+
+		clnt = rpc_create(&args);
+		if (!IS_ERR(clnt))
+			host->h_rpcclnt = clnt;
+		else {
+			printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
+			clnt = NULL;
+		}
+	}
+
+	mutex_unlock(&host->h_mutex);
+	return clnt;
+}
+
+/*
+ * Force a portmap lookup of the remote lockd port
+ */
+void
+nlm_rebind_host(struct nlm_host *host)
+{
+	dprintk("lockd: rebind host %s\n", host->h_name);
+	if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) {
+		rpc_force_rebind(host->h_rpcclnt);
+		host->h_nextrebind = jiffies + NLM_HOST_REBIND;
+	}
+}
+
+/*
+ * Increment NLM host count
+ */
+struct nlm_host * nlm_get_host(struct nlm_host *host)
+{
+	if (host) {
+		dprintk("lockd: get host %s\n", host->h_name);
+		atomic_inc(&host->h_count);
+		host->h_expires = jiffies + NLM_HOST_EXPIRE;
+	}
+	return host;
+}
+
+static struct nlm_host *next_host_state(struct hlist_head *cache,
+					struct nsm_handle *nsm,
+					const struct nlm_reboot *info)
+{
+	struct nlm_host *host;
+	struct hlist_head *chain;
+
+	mutex_lock(&nlm_host_mutex);
+	for_each_host(host, chain, cache) {
+		if (host->h_nsmhandle == nsm
+		    && host->h_nsmstate != info->state) {
+			host->h_nsmstate = info->state;
+			host->h_state++;
+
+			nlm_get_host(host);
+			mutex_unlock(&nlm_host_mutex);
+			return host;
+		}
+	}
+
+	mutex_unlock(&nlm_host_mutex);
+	return NULL;
+}
+
+/**
+ * nlm_host_rebooted - Release all resources held by rebooted host
+ * @info: pointer to decoded results of NLM_SM_NOTIFY call
+ *
+ * We were notified that the specified host has rebooted.  Release
+ * all resources held by that peer.
+ */
+void nlm_host_rebooted(const struct nlm_reboot *info)
+{
+	struct nsm_handle *nsm;
+	struct nlm_host	*host;
+
+	nsm = nsm_reboot_lookup(info);
+	if (unlikely(nsm == NULL))
+		return;
+
+	/* Mark all hosts tied to this NSM state as having rebooted.
+	 * We run the loop repeatedly, because we drop the host table
+	 * lock for this.
+	 * To avoid processing a host several times, we match the nsmstate.
+	 */
+	while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
+		nlmsvc_free_host_resources(host);
+		nlmsvc_release_host(host);
+	}
+	while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
+		nlmclnt_recovery(host);
+		nlmclnt_release_host(host);
+	}
+
+	nsm_release(nsm);
+}
+
+static void nlm_complain_hosts(struct net *net)
+{
+	struct hlist_head *chain;
+	struct nlm_host	*host;
+
+	if (net) {
+		struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+		if (ln->nrhosts == 0)
+			return;
+		printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net);
+		dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net);
+	} else {
+		if (nrhosts == 0)
+			return;
+		printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
+		dprintk("lockd: %lu hosts left:\n", nrhosts);
+	}
+
+	for_each_host(host, chain, nlm_server_hosts) {
+		if (net && host->net != net)
+			continue;
+		dprintk("       %s (cnt %d use %d exp %ld net %p)\n",
+			host->h_name, atomic_read(&host->h_count),
+			host->h_inuse, host->h_expires, host->net);
+	}
+}
+
+void
+nlm_shutdown_hosts_net(struct net *net)
+{
+	struct hlist_head *chain;
+	struct nlm_host	*host;
+
+	mutex_lock(&nlm_host_mutex);
+
+	/* First, make all hosts eligible for gc */
+	dprintk("lockd: nuking all hosts in net %p...\n", net);
+	for_each_host(host, chain, nlm_server_hosts) {
+		if (net && host->net != net)
+			continue;
+		host->h_expires = jiffies - 1;
+		if (host->h_rpcclnt) {
+			rpc_shutdown_client(host->h_rpcclnt);
+			host->h_rpcclnt = NULL;
+		}
+	}
+
+	/* Then, perform a garbage collection pass */
+	nlm_gc_hosts(net);
+	mutex_unlock(&nlm_host_mutex);
+
+	nlm_complain_hosts(net);
+}
+
+/*
+ * Shut down the hosts module.
+ * Note that this routine is called only at server shutdown time.
+ */
+void
+nlm_shutdown_hosts(void)
+{
+	dprintk("lockd: shutting down host module\n");
+	nlm_shutdown_hosts_net(NULL);
+}
+
+/*
+ * Garbage collect any unused NLM hosts.
+ * This GC combines reference counting for async operations with
+ * mark & sweep for resources held by remote clients.
+ */
+static void
+nlm_gc_hosts(struct net *net)
+{
+	struct hlist_head *chain;
+	struct hlist_node *next;
+	struct nlm_host	*host;
+
+	dprintk("lockd: host garbage collection for net %p\n", net);
+	for_each_host(host, chain, nlm_server_hosts) {
+		if (net && host->net != net)
+			continue;
+		host->h_inuse = 0;
+	}
+
+	/* Mark all hosts that hold locks, blocks or shares */
+	nlmsvc_mark_resources(net);
+
+	for_each_host_safe(host, next, chain, nlm_server_hosts) {
+		if (net && host->net != net)
+			continue;
+		if (atomic_read(&host->h_count) || host->h_inuse
+		 || time_before(jiffies, host->h_expires)) {
+			dprintk("nlm_gc_hosts skipping %s "
+				"(cnt %d use %d exp %ld net %p)\n",
+				host->h_name, atomic_read(&host->h_count),
+				host->h_inuse, host->h_expires, host->net);
+			continue;
+		}
+		nlm_destroy_host_locked(host);
+	}
+
+	if (net) {
+		struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+		ln->next_gc = jiffies + NLM_HOST_COLLECT;
+	}
+}
diff --git a/kernel/fs/lockd/mon.c b/kernel/fs/lockd/mon.c
new file mode 100644
index 000000000..47a32b6d9
--- /dev/null
+++ b/kernel/fs/lockd/mon.c
@@ -0,0 +1,622 @@
+/*
+ * linux/fs/lockd/mon.c
+ *
+ * The kernel statd client.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/slab.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/lockd/lockd.h>
+
+#include <asm/unaligned.h>
+
+#include "netns.h"
+
+#define NLMDBG_FACILITY		NLMDBG_MONITOR
+#define NSM_PROGRAM		100024
+#define NSM_VERSION		1
+
+enum {
+	NSMPROC_NULL,
+	NSMPROC_STAT,
+	NSMPROC_MON,
+	NSMPROC_UNMON,
+	NSMPROC_UNMON_ALL,
+	NSMPROC_SIMU_CRASH,
+	NSMPROC_NOTIFY,
+};
+
+struct nsm_args {
+	struct nsm_private	*priv;
+	u32			prog;		/* RPC callback info */
+	u32			vers;
+	u32			proc;
+
+	char			*mon_name;
+	char			*nodename;
+};
+
+struct nsm_res {
+	u32			status;
+	u32			state;
+};
+
+static const struct rpc_program	nsm_program;
+static				LIST_HEAD(nsm_handles);
+static				DEFINE_SPINLOCK(nsm_lock);
+
+/*
+ * Local NSM state
+ */
+u32	__read_mostly		nsm_local_state;
+bool	__read_mostly		nsm_use_hostnames;
+
+static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
+{
+	return (struct sockaddr *)&nsm->sm_addr;
+}
+
+static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
+{
+	struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
+	};
+	struct rpc_create_args args = {
+		.net			= net,
+		.protocol		= XPRT_TRANSPORT_TCP,
+		.address		= (struct sockaddr *)&sin,
+		.addrsize		= sizeof(sin),
+		.servername		= "rpc.statd",
+		.nodename		= nodename,
+		.program		= &nsm_program,
+		.version		= NSM_VERSION,
+		.authflavor		= RPC_AUTH_NULL,
+		.flags			= RPC_CLNT_CREATE_NOPING,
+	};
+
+	return rpc_create(&args);
+}
+
+static struct rpc_clnt *nsm_client_set(struct lockd_net *ln,
+		struct rpc_clnt *clnt)
+{
+	spin_lock(&ln->nsm_clnt_lock);
+	if (ln->nsm_users == 0) {
+		if (clnt == NULL)
+			goto out;
+		ln->nsm_clnt = clnt;
+	}
+	clnt = ln->nsm_clnt;
+	ln->nsm_users++;
+out:
+	spin_unlock(&ln->nsm_clnt_lock);
+	return clnt;
+}
+
+static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
+{
+	struct rpc_clnt	*clnt, *new;
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+	clnt = nsm_client_set(ln, NULL);
+	if (clnt != NULL)
+		goto out;
+
+	clnt = new = nsm_create(net, nodename);
+	if (IS_ERR(clnt))
+		goto out;
+
+	clnt = nsm_client_set(ln, new);
+	if (clnt != new)
+		rpc_shutdown_client(new);
+out:
+	return clnt;
+}
+
+static void nsm_client_put(struct net *net)
+{
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
+	struct rpc_clnt	*clnt = NULL;
+
+	spin_lock(&ln->nsm_clnt_lock);
+	ln->nsm_users--;
+	if (ln->nsm_users == 0) {
+		clnt = ln->nsm_clnt;
+		ln->nsm_clnt = NULL;
+	}
+	spin_unlock(&ln->nsm_clnt_lock);
+	if (clnt != NULL)
+		rpc_shutdown_client(clnt);
+}
+
+static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
+			 struct rpc_clnt *clnt)
+{
+	int		status;
+	struct nsm_args args = {
+		.priv		= &nsm->sm_priv,
+		.prog		= NLM_PROGRAM,
+		.vers		= 3,
+		.proc		= NLMPROC_NSM_NOTIFY,
+		.mon_name	= nsm->sm_mon_name,
+		.nodename	= clnt->cl_nodename,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= &args,
+		.rpc_resp	= res,
+	};
+
+	memset(res, 0, sizeof(*res));
+
+	msg.rpc_proc = &clnt->cl_procinfo[proc];
+	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+	if (status == -ECONNREFUSED) {
+		dprintk("lockd:	NSM upcall RPC failed, status=%d, forcing rebind\n",
+				status);
+		rpc_force_rebind(clnt);
+		status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+	}
+	if (status < 0)
+		dprintk("lockd: NSM upcall RPC failed, status=%d\n",
+				status);
+	else
+		status = 0;
+	return status;
+}
+
+/**
+ * nsm_monitor - Notify a peer in case we reboot
+ * @host: pointer to nlm_host of peer to notify
+ *
+ * If this peer is not already monitored, this function sends an
+ * upcall to the local rpc.statd to record the name/address of
+ * the peer to notify in case we reboot.
+ *
+ * Returns zero if the peer is monitored by the local rpc.statd;
+ * otherwise a negative errno value is returned.
+ */
+int nsm_monitor(const struct nlm_host *host)
+{
+	struct nsm_handle *nsm = host->h_nsmhandle;
+	struct nsm_res	res;
+	int		status;
+	struct rpc_clnt *clnt;
+	const char *nodename = NULL;
+
+	dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
+
+	if (nsm->sm_monitored)
+		return 0;
+
+	if (host->h_rpcclnt)
+		nodename = host->h_rpcclnt->cl_nodename;
+
+	/*
+	 * Choose whether to record the caller_name or IP address of
+	 * this peer in the local rpc.statd's database.
+	 */
+	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
+
+	clnt = nsm_client_get(host->net, nodename);
+	if (IS_ERR(clnt)) {
+		status = PTR_ERR(clnt);
+		dprintk("lockd: failed to create NSM upcall transport, "
+				"status=%d, net=%p\n", status, host->net);
+		return status;
+	}
+
+	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
+	if (unlikely(res.status != 0))
+		status = -EIO;
+	if (unlikely(status < 0)) {
+		pr_notice_ratelimited("lockd: cannot monitor %s\n", nsm->sm_name);
+		return status;
+	}
+
+	nsm->sm_monitored = 1;
+	if (unlikely(nsm_local_state != res.state)) {
+		nsm_local_state = res.state;
+		dprintk("lockd: NSM state changed to %d\n", nsm_local_state);
+	}
+	return 0;
+}
+
+/**
+ * nsm_unmonitor - Unregister peer notification
+ * @host: pointer to nlm_host of peer to stop monitoring
+ *
+ * If this peer is monitored, this function sends an upcall to
+ * tell the local rpc.statd not to send this peer a notification
+ * when we reboot.
+ */
+void nsm_unmonitor(const struct nlm_host *host)
+{
+	struct nsm_handle *nsm = host->h_nsmhandle;
+	struct nsm_res	res;
+	int status;
+
+	if (atomic_read(&nsm->sm_count) == 1
+	 && nsm->sm_monitored && !nsm->sm_sticky) {
+		struct lockd_net *ln = net_generic(host->net, lockd_net_id);
+
+		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
+
+		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
+		if (res.status != 0)
+			status = -EIO;
+		if (status < 0)
+			printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
+					nsm->sm_name);
+		else
+			nsm->sm_monitored = 0;
+
+		nsm_client_put(host->net);
+	}
+}
+
+static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
+					      const size_t len)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (strlen(nsm->sm_name) == len &&
+		    memcmp(nsm->sm_name, hostname, len) == 0)
+			return nsm;
+	return NULL;
+}
+
+static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (rpc_cmp_addr(nsm_addr(nsm), sap))
+			return nsm;
+	return NULL;
+}
+
+static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (memcmp(nsm->sm_priv.data, priv->data,
+					sizeof(priv->data)) == 0)
+			return nsm;
+	return NULL;
+}
+
+/*
+ * Construct a unique cookie to match this nsm_handle to this monitored
+ * host.  It is passed to the local rpc.statd via NSMPROC_MON, and
+ * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
+ * requests.
+ *
+ * The NSM protocol requires that these cookies be unique while the
+ * system is running.  We prefer a stronger requirement of making them
+ * unique across reboots.  If user space bugs cause a stale cookie to
+ * be sent to the kernel, it could cause the wrong host to lose its
+ * lock state if cookies were not unique across reboots.
+ *
+ * The cookies are exposed only to local user space via loopback.  They
+ * do not appear on the physical network.  If we want greater security
+ * for some reason, nsm_init_private() could perform a one-way hash to
+ * obscure the contents of the cookie.
+ */
+static void nsm_init_private(struct nsm_handle *nsm)
+{
+	u64 *p = (u64 *)&nsm->sm_priv.data;
+	s64 ns;
+
+	ns = ktime_get_ns();
+	put_unaligned(ns, p);
+	put_unaligned((unsigned long)nsm, p + 1);
+}
+
+static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
+					    const size_t salen,
+					    const char *hostname,
+					    const size_t hostname_len)
+{
+	struct nsm_handle *new;
+
+	new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
+	if (unlikely(new == NULL))
+		return NULL;
+
+	atomic_set(&new->sm_count, 1);
+	new->sm_name = (char *)(new + 1);
+	memcpy(nsm_addr(new), sap, salen);
+	new->sm_addrlen = salen;
+	nsm_init_private(new);
+
+	if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
+					sizeof(new->sm_addrbuf)) == 0)
+		(void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
+				"unsupported address family");
+	memcpy(new->sm_name, hostname, hostname_len);
+	new->sm_name[hostname_len] = '\0';
+
+	return new;
+}
+
+/**
+ * nsm_get_handle - Find or create a cached nsm_handle
+ * @sap: pointer to socket address of handle to find
+ * @salen: length of socket address
+ * @hostname: pointer to C string containing hostname to find
+ * @hostname_len: length of C string
+ *
+ * Behavior is modulated by the global nsm_use_hostnames variable.
+ *
+ * Returns a cached nsm_handle after bumping its ref count, or
+ * returns a fresh nsm_handle if a handle that matches @sap and/or
+ * @hostname cannot be found in the handle cache.  Returns NULL if
+ * an error occurs.
+ */
+struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+				  const size_t salen, const char *hostname,
+				  const size_t hostname_len)
+{
+	struct nsm_handle *cached, *new = NULL;
+
+	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
+		if (printk_ratelimit()) {
+			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
+					    "in NFS lock request\n",
+				(int)hostname_len, hostname);
+		}
+		return NULL;
+	}
+
+retry:
+	spin_lock(&nsm_lock);
+
+	if (nsm_use_hostnames && hostname != NULL)
+		cached = nsm_lookup_hostname(hostname, hostname_len);
+	else
+		cached = nsm_lookup_addr(sap);
+
+	if (cached != NULL) {
+		atomic_inc(&cached->sm_count);
+		spin_unlock(&nsm_lock);
+		kfree(new);
+		dprintk("lockd: found nsm_handle for %s (%s), "
+				"cnt %d\n", cached->sm_name,
+				cached->sm_addrbuf,
+				atomic_read(&cached->sm_count));
+		return cached;
+	}
+
+	if (new != NULL) {
+		list_add(&new->sm_link, &nsm_handles);
+		spin_unlock(&nsm_lock);
+		dprintk("lockd: created nsm_handle for %s (%s)\n",
+				new->sm_name, new->sm_addrbuf);
+		return new;
+	}
+
+	spin_unlock(&nsm_lock);
+
+	new = nsm_create_handle(sap, salen, hostname, hostname_len);
+	if (unlikely(new == NULL))
+		return NULL;
+	goto retry;
+}
+
+/**
+ * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @info: pointer to NLMPROC_SM_NOTIFY arguments
+ *
+ * Returns a matching nsm_handle if found in the nsm cache. The returned
+ * nsm_handle's reference count is bumped. Otherwise returns NULL if some
+ * error occurred.
+ */
+struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+{
+	struct nsm_handle *cached;
+
+	spin_lock(&nsm_lock);
+
+	cached = nsm_lookup_priv(&info->priv);
+	if (unlikely(cached == NULL)) {
+		spin_unlock(&nsm_lock);
+		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+				info->len, info->mon);
+		return cached;
+	}
+
+	atomic_inc(&cached->sm_count);
+	spin_unlock(&nsm_lock);
+
+	dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
+			cached->sm_name, cached->sm_addrbuf,
+			atomic_read(&cached->sm_count));
+	return cached;
+}
+
+/**
+ * nsm_release - Release an NSM handle
+ * @nsm: pointer to handle to be released
+ *
+ */
+void nsm_release(struct nsm_handle *nsm)
+{
+	if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
+		list_del(&nsm->sm_link);
+		spin_unlock(&nsm_lock);
+		dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
+				nsm->sm_name, nsm->sm_addrbuf);
+		kfree(nsm);
+	}
+}
+
+/*
+ * XDR functions for NSM.
+ *
+ * See http://www.opengroup.org/ for details on the Network
+ * Status Monitor wire protocol.
+ */
+
+static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
+{
+	const u32 len = strlen(string);
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 + len);
+	xdr_encode_opaque(p, string, len);
+}
+
+/*
+ * "mon_name" specifies the host to be monitored.
+ */
+static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
+{
+	encode_nsm_string(xdr, argp->mon_name);
+}
+
+/*
+ * The "my_id" argument specifies the hostname and RPC procedure
+ * to be called when the status manager receives notification
+ * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
+ * has changed.
+ */
+static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+{
+	__be32 *p;
+
+	encode_nsm_string(xdr, argp->nodename);
+	p = xdr_reserve_space(xdr, 4 + 4 + 4);
+	*p++ = cpu_to_be32(argp->prog);
+	*p++ = cpu_to_be32(argp->vers);
+	*p = cpu_to_be32(argp->proc);
+}
+
+/*
+ * The "mon_id" argument specifies the non-private arguments
+ * of an NSMPROC_MON or NSMPROC_UNMON call.
+ */
+static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+{
+	encode_mon_name(xdr, argp);
+	encode_my_id(xdr, argp);
+}
+
+/*
+ * The "priv" argument may contain private information required
+ * by the NSMPROC_MON call. This information will be supplied in the
+ * NLMPROC_SM_NOTIFY call.
+ */
+static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
+	xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
+}
+
+static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
+			    const struct nsm_args *argp)
+{
+	encode_mon_id(xdr, argp);
+	encode_priv(xdr, argp);
+}
+
+static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      const struct nsm_args *argp)
+{
+	encode_mon_id(xdr, argp);
+}
+
+static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nsm_res *resp)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+	resp->status = be32_to_cpup(p++);
+	resp->state = be32_to_cpup(p);
+
+	dprintk("lockd: %s status %d state %d\n",
+		__func__, resp->status, resp->state);
+	return 0;
+}
+
+static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
+			    struct xdr_stream *xdr,
+			    struct nsm_res *resp)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+	resp->state = be32_to_cpup(p);
+
+	dprintk("lockd: %s state %d\n", __func__, resp->state);
+	return 0;
+}
+
+#define SM_my_name_sz	(1+XDR_QUADLEN(SM_MAXSTRLEN))
+#define SM_my_id_sz	(SM_my_name_sz+3)
+#define SM_mon_name_sz	(1+XDR_QUADLEN(SM_MAXSTRLEN))
+#define SM_mon_id_sz	(SM_mon_name_sz+SM_my_id_sz)
+#define SM_priv_sz	(XDR_QUADLEN(SM_PRIV_SIZE))
+#define SM_mon_sz	(SM_mon_id_sz+SM_priv_sz)
+#define SM_monres_sz	2
+#define SM_unmonres_sz	1
+
+static struct rpc_procinfo	nsm_procedures[] = {
+[NSMPROC_MON] = {
+		.p_proc		= NSMPROC_MON,
+		.p_encode	= (kxdreproc_t)nsm_xdr_enc_mon,
+		.p_decode	= (kxdrdproc_t)nsm_xdr_dec_stat_res,
+		.p_arglen	= SM_mon_sz,
+		.p_replen	= SM_monres_sz,
+		.p_statidx	= NSMPROC_MON,
+		.p_name		= "MONITOR",
+	},
+[NSMPROC_UNMON] = {
+		.p_proc		= NSMPROC_UNMON,
+		.p_encode	= (kxdreproc_t)nsm_xdr_enc_unmon,
+		.p_decode	= (kxdrdproc_t)nsm_xdr_dec_stat,
+		.p_arglen	= SM_mon_id_sz,
+		.p_replen	= SM_unmonres_sz,
+		.p_statidx	= NSMPROC_UNMON,
+		.p_name		= "UNMONITOR",
+	},
+};
+
+static const struct rpc_version nsm_version1 = {
+		.number		= 1,
+		.nrprocs	= ARRAY_SIZE(nsm_procedures),
+		.procs		= nsm_procedures
+};
+
+static const struct rpc_version *nsm_version[] = {
+	[1] = &nsm_version1,
+};
+
+static struct rpc_stat		nsm_stats;
+
+static const struct rpc_program nsm_program = {
+		.name		= "statd",
+		.number		= NSM_PROGRAM,
+		.nrvers		= ARRAY_SIZE(nsm_version),
+		.version	= nsm_version,
+		.stats		= &nsm_stats
+};
diff --git a/kernel/fs/lockd/netns.h b/kernel/fs/lockd/netns.h
new file mode 100644
index 000000000..097bfa3ad
--- /dev/null
+++ b/kernel/fs/lockd/netns.h
@@ -0,0 +1,22 @@
+#ifndef __LOCKD_NETNS_H__
+#define __LOCKD_NETNS_H__
+
+#include <linux/fs.h>
+#include <net/netns/generic.h>
+
+struct lockd_net {
+	unsigned int nlmsvc_users;
+	unsigned long next_gc;
+	unsigned long nrhosts;
+
+	struct delayed_work grace_period_end;
+	struct lock_manager lockd_manager;
+
+	spinlock_t nsm_clnt_lock;
+	unsigned int nsm_users;
+	struct rpc_clnt *nsm_clnt;
+};
+
+extern int lockd_net_id;
+
+#endif
diff --git a/kernel/fs/lockd/procfs.c b/kernel/fs/lockd/procfs.c
new file mode 100644
index 000000000..2a0a98480
--- /dev/null
+++ b/kernel/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+
+#include "netns.h"
+#include "procfs.h"
+
+/*
+ * We only allow strings that start with 'Y', 'y', or '1'.
+ */
+static ssize_t
+nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
+		    loff_t *pos)
+{
+	char *data;
+	struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+					   lockd_net_id);
+
+	if (size < 1)
+		return -EINVAL;
+
+	data = simple_transaction_get(file, buf, size);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	switch(data[0]) {
+	case 'Y':
+	case 'y':
+	case '1':
+		locks_end_grace(&ln->lockd_manager);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return size;
+}
+
+static ssize_t
+nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
+		   loff_t *pos)
+{
+	struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+					   lockd_net_id);
+	char resp[3];
+
+	resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
+	resp[1] = '\n';
+	resp[2] = '\0';
+
+	return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
+}
+
+static const struct file_operations lockd_end_grace_operations = {
+	.write		= nlm_end_grace_write,
+	.read		= nlm_end_grace_read,
+	.llseek		= default_llseek,
+	.release	= simple_transaction_release,
+	.owner		= THIS_MODULE,
+};
+
+int __init
+lockd_create_procfs(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_mkdir("fs/lockd", NULL);
+	if (!entry)
+		return -ENOMEM;
+	entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
+				 &lockd_end_grace_operations);
+	if (!entry) {
+		remove_proc_entry("fs/lockd", NULL);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void __exit
+lockd_remove_procfs(void)
+{
+	remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
+	remove_proc_entry("fs/lockd", NULL);
+}
diff --git a/kernel/fs/lockd/procfs.h b/kernel/fs/lockd/procfs.h
new file mode 100644
index 000000000..2257a1311
--- /dev/null
+++ b/kernel/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+#ifndef _LOCKD_PROCFS_H
+#define _LOCKD_PROCFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_PROC_FS)
+int lockd_create_procfs(void);
+void lockd_remove_procfs(void);
+#else
+static inline int
+lockd_create_procfs(void)
+{
+	return 0;
+}
+
+static inline void
+lockd_remove_procfs(void)
+{
+	return;
+}
+#endif /* IS_ENABLED(CONFIG_PROC_FS) */
+
+#endif /* _LOCKD_PROCFS_H */
diff --git a/kernel/fs/lockd/svc.c b/kernel/fs/lockd/svc.c
new file mode 100644
index 000000000..55505cbe1
--- /dev/null
+++ b/kernel/fs/lockd/svc.c
@@ -0,0 +1,695 @@
+/*
+ * linux/fs/lockd/svc.c
+ *
+ * This is the central lockd service.
+ *
+ * FIXME: Separate the lockd NFS server functionality from the lockd NFS
+ * 	  client functionality. Oh why didn't Sun create two separate
+ *	  services in the first place?
+ *
+ * Authors:	Olaf Kirch (okir@monad.swb.de)
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/moduleparam.h>
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/uio.h>
+#include <linux/smp.h>
+#include <linux/mutex.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcsock.h>
+#include <net/ip.h>
+#include <linux/lockd/lockd.h>
+#include <linux/nfs.h>
+
+#include "netns.h"
+#include "procfs.h"
+
+#define NLMDBG_FACILITY		NLMDBG_SVC
+#define LOCKD_BUFSIZE		(1024 + NLMSVC_XDRSIZE)
+#define ALLOWED_SIGS		(sigmask(SIGKILL))
+
+static struct svc_program	nlmsvc_program;
+
+struct nlmsvc_binding *		nlmsvc_ops;
+EXPORT_SYMBOL_GPL(nlmsvc_ops);
+
+static DEFINE_MUTEX(nlmsvc_mutex);
+static unsigned int		nlmsvc_users;
+static struct task_struct	*nlmsvc_task;
+static struct svc_rqst		*nlmsvc_rqst;
+unsigned long			nlmsvc_timeout;
+
+int lockd_net_id;
+
+/*
+ * These can be set at insmod time (useful for NFS as root filesystem),
+ * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003
+ */
+static unsigned long		nlm_grace_period;
+static unsigned long		nlm_timeout = LOCKD_DFLT_TIMEO;
+static int			nlm_udpport, nlm_tcpport;
+
+/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
+static unsigned int		nlm_max_connections = 1024;
+
+/*
+ * Constants needed for the sysctl interface.
+ */
+static const unsigned long	nlm_grace_period_min = 0;
+static const unsigned long	nlm_grace_period_max = 240;
+static const unsigned long	nlm_timeout_min = 3;
+static const unsigned long	nlm_timeout_max = 20;
+static const int		nlm_port_min = 0, nlm_port_max = 65535;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header * nlm_sysctl_table;
+#endif
+
+static unsigned long get_lockd_grace_period(void)
+{
+	/* Note: nlm_timeout should always be nonzero */
+	if (nlm_grace_period)
+		return roundup(nlm_grace_period, nlm_timeout) * HZ;
+	else
+		return nlm_timeout * 5 * HZ;
+}
+
+static void grace_ender(struct work_struct *grace)
+{
+	struct delayed_work *dwork = container_of(grace, struct delayed_work,
+						  work);
+	struct lockd_net *ln = container_of(dwork, struct lockd_net,
+					    grace_period_end);
+
+	locks_end_grace(&ln->lockd_manager);
+}
+
+static void set_grace_period(struct net *net)
+{
+	unsigned long grace_period = get_lockd_grace_period();
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+	locks_start_grace(net, &ln->lockd_manager);
+	cancel_delayed_work_sync(&ln->grace_period_end);
+	schedule_delayed_work(&ln->grace_period_end, grace_period);
+}
+
+static void restart_grace(void)
+{
+	if (nlmsvc_ops) {
+		struct net *net = &init_net;
+		struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+		cancel_delayed_work_sync(&ln->grace_period_end);
+		locks_end_grace(&ln->lockd_manager);
+		nlmsvc_invalidate_all();
+		set_grace_period(net);
+	}
+}
+
+/*
+ * This is the lockd kernel thread
+ */
+static int
+lockd(void *vrqstp)
+{
+	int		err = 0;
+	struct svc_rqst *rqstp = vrqstp;
+
+	/* try_to_freeze() is called from svc_recv() */
+	set_freezable();
+
+	/* Allow SIGKILL to tell lockd to drop all of its locks */
+	allow_signal(SIGKILL);
+
+	dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
+
+	/*
+	 * The main request loop. We don't terminate until the last
+	 * NFS mount or NFS daemon has gone away.
+	 */
+	while (!kthread_should_stop()) {
+		long timeout = MAX_SCHEDULE_TIMEOUT;
+		RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+
+		/* update sv_maxconn if it has changed */
+		rqstp->rq_server->sv_maxconn = nlm_max_connections;
+
+		if (signalled()) {
+			flush_signals(current);
+			restart_grace();
+			continue;
+		}
+
+		timeout = nlmsvc_retry_blocked();
+
+		/*
+		 * Find a socket with data available and call its
+		 * recvfrom routine.
+		 */
+		err = svc_recv(rqstp, timeout);
+		if (err == -EAGAIN || err == -EINTR)
+			continue;
+		dprintk("lockd: request from %s\n",
+				svc_print_addr(rqstp, buf, sizeof(buf)));
+
+		svc_process(rqstp);
+	}
+	flush_signals(current);
+	if (nlmsvc_ops)
+		nlmsvc_invalidate_all();
+	nlm_shutdown_hosts();
+	return 0;
+}
+
+static int create_lockd_listener(struct svc_serv *serv, const char *name,
+				 struct net *net, const int family,
+				 const unsigned short port)
+{
+	struct svc_xprt *xprt;
+
+	xprt = svc_find_xprt(serv, name, net, family, 0);
+	if (xprt == NULL)
+		return svc_create_xprt(serv, name, net, family, port,
+						SVC_SOCK_DEFAULTS);
+	svc_xprt_put(xprt);
+	return 0;
+}
+
+static int create_lockd_family(struct svc_serv *serv, struct net *net,
+			       const int family)
+{
+	int err;
+
+	err = create_lockd_listener(serv, "udp", net, family, nlm_udpport);
+	if (err < 0)
+		return err;
+
+	return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport);
+}
+
+/*
+ * Ensure there are active UDP and TCP listeners for lockd.
+ *
+ * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
+ * local services (such as rpc.statd) still require UDP, and
+ * some NFS servers do not yet support NLM over TCP.
+ *
+ * Returns zero if all listeners are available; otherwise a
+ * negative errno value is returned.
+ */
+static int make_socks(struct svc_serv *serv, struct net *net)
+{
+	static int warned;
+	int err;
+
+	err = create_lockd_family(serv, net, PF_INET);
+	if (err < 0)
+		goto out_err;
+
+	err = create_lockd_family(serv, net, PF_INET6);
+	if (err < 0 && err != -EAFNOSUPPORT)
+		goto out_err;
+
+	warned = 0;
+	return 0;
+
+out_err:
+	if (warned++ == 0)
+		printk(KERN_WARNING
+			"lockd_up: makesock failed, error=%d\n", err);
+	svc_shutdown_net(serv, net);
+	return err;
+}
+
+static int lockd_up_net(struct svc_serv *serv, struct net *net)
+{
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
+	int error;
+
+	if (ln->nlmsvc_users++)
+		return 0;
+
+	error = svc_bind(serv, net);
+	if (error)
+		goto err_bind;
+
+	error = make_socks(serv, net);
+	if (error < 0)
+		goto err_bind;
+	set_grace_period(net);
+	dprintk("lockd_up_net: per-net data created; net=%p\n", net);
+	return 0;
+
+err_bind:
+	ln->nlmsvc_users--;
+	return error;
+}
+
+static void lockd_down_net(struct svc_serv *serv, struct net *net)
+{
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+	if (ln->nlmsvc_users) {
+		if (--ln->nlmsvc_users == 0) {
+			nlm_shutdown_hosts_net(net);
+			cancel_delayed_work_sync(&ln->grace_period_end);
+			locks_end_grace(&ln->lockd_manager);
+			svc_shutdown_net(serv, net);
+			dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
+		}
+	} else {
+		printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
+				nlmsvc_task, net);
+		BUG();
+	}
+}
+
+static int lockd_start_svc(struct svc_serv *serv)
+{
+	int error;
+
+	if (nlmsvc_rqst)
+		return 0;
+
+	/*
+	 * Create the kernel thread and wait for it to start.
+	 */
+	nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+	if (IS_ERR(nlmsvc_rqst)) {
+		error = PTR_ERR(nlmsvc_rqst);
+		printk(KERN_WARNING
+			"lockd_up: svc_rqst allocation failed, error=%d\n",
+			error);
+		goto out_rqst;
+	}
+
+	svc_sock_update_bufs(serv);
+	serv->sv_maxconn = nlm_max_connections;
+
+	nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name);
+	if (IS_ERR(nlmsvc_task)) {
+		error = PTR_ERR(nlmsvc_task);
+		printk(KERN_WARNING
+			"lockd_up: kthread_run failed, error=%d\n", error);
+		goto out_task;
+	}
+	nlmsvc_rqst->rq_task = nlmsvc_task;
+	wake_up_process(nlmsvc_task);
+
+	dprintk("lockd_up: service started\n");
+	return 0;
+
+out_task:
+	svc_exit_thread(nlmsvc_rqst);
+	nlmsvc_task = NULL;
+out_rqst:
+	nlmsvc_rqst = NULL;
+	return error;
+}
+
+static struct svc_serv *lockd_create_svc(void)
+{
+	struct svc_serv *serv;
+
+	/*
+	 * Check whether we're already up and running.
+	 */
+	if (nlmsvc_rqst) {
+		/*
+		 * Note: increase service usage, because later in case of error
+		 * svc_destroy() will be called.
+		 */
+		svc_get(nlmsvc_rqst->rq_server);
+		return nlmsvc_rqst->rq_server;
+	}
+
+	/*
+	 * Sanity check: if there's no pid,
+	 * we should be the first user ...
+	 */
+	if (nlmsvc_users)
+		printk(KERN_WARNING
+			"lockd_up: no pid, %d users??\n", nlmsvc_users);
+
+	if (!nlm_timeout)
+		nlm_timeout = LOCKD_DFLT_TIMEO;
+	nlmsvc_timeout = nlm_timeout * HZ;
+
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup);
+	if (!serv) {
+		printk(KERN_WARNING "lockd_up: create service failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	dprintk("lockd_up: service created\n");
+	return serv;
+}
+
+/*
+ * Bring up the lockd process if it's not already up.
+ */
+int lockd_up(struct net *net)
+{
+	struct svc_serv *serv;
+	int error;
+
+	mutex_lock(&nlmsvc_mutex);
+
+	serv = lockd_create_svc();
+	if (IS_ERR(serv)) {
+		error = PTR_ERR(serv);
+		goto err_create;
+	}
+
+	error = lockd_up_net(serv, net);
+	if (error < 0)
+		goto err_net;
+
+	error = lockd_start_svc(serv);
+	if (error < 0)
+		goto err_start;
+
+	nlmsvc_users++;
+	/*
+	 * Note: svc_serv structures have an initial use count of 1,
+	 * so we exit through here on both success and failure.
+	 */
+err_net:
+	svc_destroy(serv);
+err_create:
+	mutex_unlock(&nlmsvc_mutex);
+	return error;
+
+err_start:
+	lockd_down_net(serv, net);
+	goto err_net;
+}
+EXPORT_SYMBOL_GPL(lockd_up);
+
+/*
+ * Decrement the user count and bring down lockd if we're the last.
+ */
+void
+lockd_down(struct net *net)
+{
+	mutex_lock(&nlmsvc_mutex);
+	lockd_down_net(nlmsvc_rqst->rq_server, net);
+	if (nlmsvc_users) {
+		if (--nlmsvc_users)
+			goto out;
+	} else {
+		printk(KERN_ERR "lockd_down: no users! task=%p\n",
+			nlmsvc_task);
+		BUG();
+	}
+
+	if (!nlmsvc_task) {
+		printk(KERN_ERR "lockd_down: no lockd running.\n");
+		BUG();
+	}
+	kthread_stop(nlmsvc_task);
+	dprintk("lockd_down: service stopped\n");
+	svc_exit_thread(nlmsvc_rqst);
+	dprintk("lockd_down: service destroyed\n");
+	nlmsvc_task = NULL;
+	nlmsvc_rqst = NULL;
+out:
+	mutex_unlock(&nlmsvc_mutex);
+}
+EXPORT_SYMBOL_GPL(lockd_down);
+
+#ifdef CONFIG_SYSCTL
+
+/*
+ * Sysctl parameters (same as module parameters, different interface).
+ */
+
+static struct ctl_table nlm_sysctls[] = {
+	{
+		.procname	= "nlm_grace_period",
+		.data		= &nlm_grace_period,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= (unsigned long *) &nlm_grace_period_min,
+		.extra2		= (unsigned long *) &nlm_grace_period_max,
+	},
+	{
+		.procname	= "nlm_timeout",
+		.data		= &nlm_timeout,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= (unsigned long *) &nlm_timeout_min,
+		.extra2		= (unsigned long *) &nlm_timeout_max,
+	},
+	{
+		.procname	= "nlm_udpport",
+		.data		= &nlm_udpport,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= (int *) &nlm_port_min,
+		.extra2		= (int *) &nlm_port_max,
+	},
+	{
+		.procname	= "nlm_tcpport",
+		.data		= &nlm_tcpport,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= (int *) &nlm_port_min,
+		.extra2		= (int *) &nlm_port_max,
+	},
+	{
+		.procname	= "nsm_use_hostnames",
+		.data		= &nsm_use_hostnames,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "nsm_local_state",
+		.data		= &nsm_local_state,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static struct ctl_table nlm_sysctl_dir[] = {
+	{
+		.procname	= "nfs",
+		.mode		= 0555,
+		.child		= nlm_sysctls,
+	},
+	{ }
+};
+
+static struct ctl_table nlm_sysctl_root[] = {
+	{
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= nlm_sysctl_dir,
+	},
+	{ }
+};
+
+#endif	/* CONFIG_SYSCTL */
+
+/*
+ * Module (and sysfs) parameters.
+ */
+
+#define param_set_min_max(name, type, which_strtol, min, max)		\
+static int param_set_##name(const char *val, struct kernel_param *kp)	\
+{									\
+	char *endp;							\
+	__typeof__(type) num = which_strtol(val, &endp, 0);		\
+	if (endp == val || *endp || num < (min) || num > (max))		\
+		return -EINVAL;						\
+	*((type *) kp->arg) = num;					\
+	return 0;							\
+}
+
+static inline int is_callback(u32 proc)
+{
+	return proc == NLMPROC_GRANTED
+		|| proc == NLMPROC_GRANTED_MSG
+		|| proc == NLMPROC_TEST_RES
+		|| proc == NLMPROC_LOCK_RES
+		|| proc == NLMPROC_CANCEL_RES
+		|| proc == NLMPROC_UNLOCK_RES
+		|| proc == NLMPROC_NSM_NOTIFY;
+}
+
+
+static int lockd_authenticate(struct svc_rqst *rqstp)
+{
+	rqstp->rq_client = NULL;
+	switch (rqstp->rq_authop->flavour) {
+		case RPC_AUTH_NULL:
+		case RPC_AUTH_UNIX:
+			if (rqstp->rq_proc == 0)
+				return SVC_OK;
+			if (is_callback(rqstp->rq_proc)) {
+				/* Leave it to individual procedures to
+				 * call nlmsvc_lookup_host(rqstp)
+				 */
+				return SVC_OK;
+			}
+			return svc_set_client(rqstp);
+	}
+	return SVC_DENIED;
+}
+
+
+param_set_min_max(port, int, simple_strtol, 0, 65535)
+param_set_min_max(grace_period, unsigned long, simple_strtoul,
+		  nlm_grace_period_min, nlm_grace_period_max)
+param_set_min_max(timeout, unsigned long, simple_strtoul,
+		  nlm_timeout_min, nlm_timeout_max)
+
+MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_DESCRIPTION("NFS file locking service version " LOCKD_VERSION ".");
+MODULE_LICENSE("GPL");
+
+module_param_call(nlm_grace_period, param_set_grace_period, param_get_ulong,
+		  &nlm_grace_period, 0644);
+module_param_call(nlm_timeout, param_set_timeout, param_get_ulong,
+		  &nlm_timeout, 0644);
+module_param_call(nlm_udpport, param_set_port, param_get_int,
+		  &nlm_udpport, 0644);
+module_param_call(nlm_tcpport, param_set_port, param_get_int,
+		  &nlm_tcpport, 0644);
+module_param(nsm_use_hostnames, bool, 0644);
+module_param(nlm_max_connections, uint, 0644);
+
+static int lockd_init_net(struct net *net)
+{
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+	INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
+	INIT_LIST_HEAD(&ln->lockd_manager.list);
+	spin_lock_init(&ln->nsm_clnt_lock);
+	return 0;
+}
+
+static void lockd_exit_net(struct net *net)
+{
+}
+
+static struct pernet_operations lockd_net_ops = {
+	.init = lockd_init_net,
+	.exit = lockd_exit_net,
+	.id = &lockd_net_id,
+	.size = sizeof(struct lockd_net),
+};
+
+
+/*
+ * Initialising and terminating the module.
+ */
+
+static int __init init_nlm(void)
+{
+	int err;
+
+#ifdef CONFIG_SYSCTL
+	err = -ENOMEM;
+	nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
+	if (nlm_sysctl_table == NULL)
+		goto err_sysctl;
+#endif
+	err = register_pernet_subsys(&lockd_net_ops);
+	if (err)
+		goto err_pernet;
+
+	err = lockd_create_procfs();
+	if (err)
+		goto err_procfs;
+
+	return 0;
+
+err_procfs:
+	unregister_pernet_subsys(&lockd_net_ops);
+err_pernet:
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(nlm_sysctl_table);
+err_sysctl:
+#endif
+	return err;
+}
+
+static void __exit exit_nlm(void)
+{
+	/* FIXME: delete all NLM clients */
+	nlm_shutdown_hosts();
+	lockd_remove_procfs();
+	unregister_pernet_subsys(&lockd_net_ops);
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(nlm_sysctl_table);
+#endif
+}
+
+module_init(init_nlm);
+module_exit(exit_nlm);
+
+/*
+ * Define NLM program and procedures
+ */
+static struct svc_version	nlmsvc_version1 = {
+		.vs_vers	= 1,
+		.vs_nproc	= 17,
+		.vs_proc	= nlmsvc_procedures,
+		.vs_xdrsize	= NLMSVC_XDRSIZE,
+};
+static struct svc_version	nlmsvc_version3 = {
+		.vs_vers	= 3,
+		.vs_nproc	= 24,
+		.vs_proc	= nlmsvc_procedures,
+		.vs_xdrsize	= NLMSVC_XDRSIZE,
+};
+#ifdef CONFIG_LOCKD_V4
+static struct svc_version	nlmsvc_version4 = {
+		.vs_vers	= 4,
+		.vs_nproc	= 24,
+		.vs_proc	= nlmsvc_procedures4,
+		.vs_xdrsize	= NLMSVC_XDRSIZE,
+};
+#endif
+static struct svc_version *	nlmsvc_version[] = {
+	[1] = &nlmsvc_version1,
+	[3] = &nlmsvc_version3,
+#ifdef CONFIG_LOCKD_V4
+	[4] = &nlmsvc_version4,
+#endif
+};
+
+static struct svc_stat		nlmsvc_stats;
+
+#define NLM_NRVERS	ARRAY_SIZE(nlmsvc_version)
+static struct svc_program	nlmsvc_program = {
+	.pg_prog		= NLM_PROGRAM,		/* program number */
+	.pg_nvers		= NLM_NRVERS,		/* number of entries in nlmsvc_version */
+	.pg_vers		= nlmsvc_version,	/* version table */
+	.pg_name		= "lockd",		/* service name */
+	.pg_class		= "nfsd",		/* share authentication with nfsd */
+	.pg_stats		= &nlmsvc_stats,	/* stats table */
+	.pg_authenticate = &lockd_authenticate	/* export authentication */
+};
diff --git a/kernel/fs/lockd/svc4proc.c b/kernel/fs/lockd/svc4proc.c
new file mode 100644
index 000000000..b147d1ae7
--- /dev/null
+++ b/kernel/fs/lockd/svc4proc.c
@@ -0,0 +1,505 @@
+/*
+ * linux/fs/lockd/svc4proc.c
+ *
+ * Lockd server procedures. We don't implement the NLM_*_RES 
+ * procedures because we don't use the async procedures.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/lockd/lockd.h>
+#include <linux/lockd/share.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#define NLMDBG_FACILITY		NLMDBG_CLIENT
+
+/*
+ * Obtain client and file from arguments
+ */
+static __be32
+nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
+			struct nlm_host **hostp, struct nlm_file **filp)
+{
+	struct nlm_host		*host = NULL;
+	struct nlm_file		*file = NULL;
+	struct nlm_lock		*lock = &argp->lock;
+	__be32			error = 0;
+
+	/* nfsd callbacks must have been installed for this procedure */
+	if (!nlmsvc_ops)
+		return nlm_lck_denied_nolocks;
+
+	/* Obtain host handle */
+	if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
+	 || (argp->monitor && nsm_monitor(host) < 0))
+		goto no_locks;
+	*hostp = host;
+
+	/* Obtain file pointer. Not used by FREE_ALL call. */
+	if (filp != NULL) {
+		if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0)
+			goto no_locks;
+		*filp = file;
+
+		/* Set up the missing parts of the file_lock structure */
+		lock->fl.fl_file  = file->f_file;
+		lock->fl.fl_owner = (fl_owner_t) host;
+		lock->fl.fl_lmops = &nlmsvc_lock_operations;
+	}
+
+	return 0;
+
+no_locks:
+	nlmsvc_release_host(host);
+ 	if (error)
+		return error;	
+	return nlm_lck_denied_nolocks;
+}
+
+/*
+ * NULL: Test for presence of service
+ */
+static __be32
+nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	dprintk("lockd: NULL          called\n");
+	return rpc_success;
+}
+
+/*
+ * TEST: Check for conflicting lock
+ */
+static __be32
+nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
+				         struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+	__be32 rc = rpc_success;
+
+	dprintk("lockd: TEST4        called\n");
+	resp->cookie = argp->cookie;
+
+	/* Obtain client and file */
+	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Now check for conflicting locks */
+	resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
+	if (resp->status == nlm_drop_reply)
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
+
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rc;
+}
+
+static __be32
+nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
+				         struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+	__be32 rc = rpc_success;
+
+	dprintk("lockd: LOCK          called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Obtain client and file */
+	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+#if 0
+	/* If supplied state doesn't match current state, we assume it's
+	 * an old request that time-warped somehow. Any error return would
+	 * do in this case because it's irrelevant anyway.
+	 *
+	 * NB: We don't retrieve the remote host's state yet.
+	 */
+	if (host->h_nsmstate && host->h_nsmstate != argp->state) {
+		resp->status = nlm_lck_denied_nolocks;
+	} else
+#endif
+
+	/* Now try to lock the file */
+	resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
+					argp->block, &argp->cookie,
+					argp->reclaim);
+	if (resp->status == nlm_drop_reply)
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
+
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rc;
+}
+
+static __be32
+nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
+				           struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+
+	dprintk("lockd: CANCEL        called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Don't accept requests during grace period */
+	if (locks_in_grace(SVC_NET(rqstp))) {
+		resp->status = nlm_lck_denied_grace_period;
+		return rpc_success;
+	}
+
+	/* Obtain client and file */
+	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Try to cancel request. */
+	resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock);
+
+	dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rpc_success;
+}
+
+/*
+ * UNLOCK: release a lock
+ */
+static __be32
+nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
+				           struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+
+	dprintk("lockd: UNLOCK        called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Don't accept new lock requests during grace period */
+	if (locks_in_grace(SVC_NET(rqstp))) {
+		resp->status = nlm_lck_denied_grace_period;
+		return rpc_success;
+	}
+
+	/* Obtain client and file */
+	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Now try to remove the lock */
+	resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock);
+
+	dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rpc_success;
+}
+
+/*
+ * GRANTED: A server calls us to tell that a process' lock request
+ * was granted
+ */
+static __be32
+nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
+				            struct nlm_res  *resp)
+{
+	resp->cookie = argp->cookie;
+
+	dprintk("lockd: GRANTED       called\n");
+	resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
+	dprintk("lockd: GRANTED       status %d\n", ntohl(resp->status));
+	return rpc_success;
+}
+
+/*
+ * This is the generic lockd callback for async RPC calls
+ */
+static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
+{
+	dprintk("lockd: %5u callback returned %d\n", task->tk_pid,
+			-task->tk_status);
+}
+
+static void nlm4svc_callback_release(void *data)
+{
+	nlmsvc_release_call(data);
+}
+
+static const struct rpc_call_ops nlm4svc_callback_ops = {
+	.rpc_call_done = nlm4svc_callback_exit,
+	.rpc_release = nlm4svc_callback_release,
+};
+
+/*
+ * `Async' versions of the above service routines. They aren't really,
+ * because we send the callback before the reply proper. I hope this
+ * doesn't break any clients.
+ */
+static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
+		__be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res  *))
+{
+	struct nlm_host	*host;
+	struct nlm_rqst	*call;
+	__be32 stat;
+
+	host = nlmsvc_lookup_host(rqstp,
+				  argp->lock.caller,
+				  argp->lock.len);
+	if (host == NULL)
+		return rpc_system_err;
+
+	call = nlm_alloc_call(host);
+	nlmsvc_release_host(host);
+	if (call == NULL)
+		return rpc_system_err;
+
+	stat = func(rqstp, argp, &call->a_res);
+	if (stat != 0) {
+		nlmsvc_release_call(call);
+		return stat;
+	}
+
+	call->a_flags = RPC_TASK_ASYNC;
+	if (nlm_async_reply(call, proc, &nlm4svc_callback_ops) < 0)
+		return rpc_system_err;
+	return rpc_success;
+}
+
+static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+					     void	     *resp)
+{
+	dprintk("lockd: TEST_MSG      called\n");
+	return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test);
+}
+
+static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+					     void	     *resp)
+{
+	dprintk("lockd: LOCK_MSG      called\n");
+	return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock);
+}
+
+static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+					       void	       *resp)
+{
+	dprintk("lockd: CANCEL_MSG    called\n");
+	return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel);
+}
+
+static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+                                               void            *resp)
+{
+	dprintk("lockd: UNLOCK_MSG    called\n");
+	return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock);
+}
+
+static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+                                                void            *resp)
+{
+	dprintk("lockd: GRANTED_MSG   called\n");
+	return nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlm4svc_proc_granted);
+}
+
+/*
+ * SHARE: create a DOS share or alter existing share.
+ */
+static __be32
+nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
+				          struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+
+	dprintk("lockd: SHARE         called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Don't accept new lock requests during grace period */
+	if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) {
+		resp->status = nlm_lck_denied_grace_period;
+		return rpc_success;
+	}
+
+	/* Obtain client and file */
+	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Now try to create the share */
+	resp->status = nlmsvc_share_file(host, file, argp);
+
+	dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rpc_success;
+}
+
+/*
+ * UNSHARE: Release a DOS share.
+ */
+static __be32
+nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
+				            struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+
+	dprintk("lockd: UNSHARE       called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Don't accept requests during grace period */
+	if (locks_in_grace(SVC_NET(rqstp))) {
+		resp->status = nlm_lck_denied_grace_period;
+		return rpc_success;
+	}
+
+	/* Obtain client and file */
+	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Now try to lock the file */
+	resp->status = nlmsvc_unshare_file(host, file, argp);
+
+	dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rpc_success;
+}
+
+/*
+ * NM_LOCK: Create an unmonitored lock
+ */
+static __be32
+nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
+				            struct nlm_res  *resp)
+{
+	dprintk("lockd: NM_LOCK       called\n");
+
+	argp->monitor = 0;		/* just clean the monitor flag */
+	return nlm4svc_proc_lock(rqstp, argp, resp);
+}
+
+/*
+ * FREE_ALL: Release all locks and shares held by client
+ */
+static __be32
+nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
+					     void            *resp)
+{
+	struct nlm_host	*host;
+
+	/* Obtain client */
+	if (nlm4svc_retrieve_args(rqstp, argp, &host, NULL))
+		return rpc_success;
+
+	nlmsvc_free_host_resources(host);
+	nlmsvc_release_host(host);
+	return rpc_success;
+}
+
+/*
+ * SM_NOTIFY: private callback from statd (not part of official NLM proto)
+ */
+static __be32
+nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
+					      void	        *resp)
+{
+	dprintk("lockd: SM_NOTIFY     called\n");
+
+	if (!nlm_privileged_requester(rqstp)) {
+		char buf[RPC_MAX_ADDRBUFLEN];
+		printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
+				svc_print_addr(rqstp, buf, sizeof(buf)));
+		return rpc_system_err;
+	}
+
+	nlm_host_rebooted(argp);
+	return rpc_success;
+}
+
+/*
+ * client sent a GRANTED_RES, let's remove the associated block
+ */
+static __be32
+nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
+                                                void            *resp)
+{
+        if (!nlmsvc_ops)
+                return rpc_success;
+
+        dprintk("lockd: GRANTED_RES   called\n");
+
+        nlmsvc_grant_reply(&argp->cookie, argp->status);
+        return rpc_success;
+}
+
+
+/*
+ * NLM Server procedures.
+ */
+
+#define nlm4svc_encode_norep	nlm4svc_encode_void
+#define nlm4svc_decode_norep	nlm4svc_decode_void
+#define nlm4svc_decode_testres	nlm4svc_decode_void
+#define nlm4svc_decode_lockres	nlm4svc_decode_void
+#define nlm4svc_decode_unlockres	nlm4svc_decode_void
+#define nlm4svc_decode_cancelres	nlm4svc_decode_void
+#define nlm4svc_decode_grantedres	nlm4svc_decode_void
+
+#define nlm4svc_proc_none	nlm4svc_proc_null
+#define nlm4svc_proc_test_res	nlm4svc_proc_null
+#define nlm4svc_proc_lock_res	nlm4svc_proc_null
+#define nlm4svc_proc_cancel_res	nlm4svc_proc_null
+#define nlm4svc_proc_unlock_res	nlm4svc_proc_null
+
+struct nlm_void			{ int dummy; };
+
+#define PROC(name, xargt, xrest, argt, rest, respsize)	\
+ { .pc_func	= (svc_procfunc) nlm4svc_proc_##name,	\
+   .pc_decode	= (kxdrproc_t) nlm4svc_decode_##xargt,	\
+   .pc_encode	= (kxdrproc_t) nlm4svc_encode_##xrest,	\
+   .pc_release	= NULL,					\
+   .pc_argsize	= sizeof(struct nlm_##argt),		\
+   .pc_ressize	= sizeof(struct nlm_##rest),		\
+   .pc_xdrressize = respsize,				\
+ }
+#define	Ck	(1+XDR_QUADLEN(NLM_MAXCOOKIELEN))	/* cookie */
+#define	No	(1+1024/4)				/* netobj */
+#define	St	1					/* status */
+#define	Rg	4					/* range (offset + length) */
+struct svc_procedure		nlmsvc_procedures4[] = {
+  PROC(null,		void,		void,		void,	void, 1),
+  PROC(test,		testargs,	testres,	args,	res, Ck+St+2+No+Rg),
+  PROC(lock,		lockargs,	res,		args,	res, Ck+St),
+  PROC(cancel,		cancargs,	res,		args,	res, Ck+St),
+  PROC(unlock,		unlockargs,	res,		args,	res, Ck+St),
+  PROC(granted,		testargs,	res,		args,	res, Ck+St),
+  PROC(test_msg,	testargs,	norep,		args,	void, 1),
+  PROC(lock_msg,	lockargs,	norep,		args,	void, 1),
+  PROC(cancel_msg,	cancargs,	norep,		args,	void, 1),
+  PROC(unlock_msg,	unlockargs,	norep,		args,	void, 1),
+  PROC(granted_msg,	testargs,	norep,		args,	void, 1),
+  PROC(test_res,	testres,	norep,		res,	void, 1),
+  PROC(lock_res,	lockres,	norep,		res,	void, 1),
+  PROC(cancel_res,	cancelres,	norep,		res,	void, 1),
+  PROC(unlock_res,	unlockres,	norep,		res,	void, 1),
+  PROC(granted_res,	res,		norep,		res,	void, 1),
+  /* statd callback */
+  PROC(sm_notify,	reboot,		void,		reboot,	void, 1),
+  PROC(none,		void,		void,		void,	void, 0),
+  PROC(none,		void,		void,		void,	void, 0),
+  PROC(none,		void,		void,		void,	void, 0),
+  PROC(share,		shareargs,	shareres,	args,	res, Ck+St+1),
+  PROC(unshare,		shareargs,	shareres,	args,	res, Ck+St+1),
+  PROC(nm_lock,		lockargs,	res,		args,	res, Ck+St),
+  PROC(free_all,	notify,		void,		args,	void, 1),
+
+};
diff --git a/kernel/fs/lockd/svclock.c b/kernel/fs/lockd/svclock.c
new file mode 100644
index 000000000..5581e0206
--- /dev/null
+++ b/kernel/fs/lockd/svclock.c
@@ -0,0 +1,939 @@
+/*
+ * linux/fs/lockd/svclock.c
+ *
+ * Handling of server-side locks, mostly of the blocked variety.
+ * This is the ugliest part of lockd because we tread on very thin ice.
+ * GRANT and CANCEL calls may get stuck, meet in mid-flight, etc.
+ * IMNSHO introducing the grant callback into the NLM protocol was one
+ * of the worst ideas Sun ever had. Except maybe for the idea of doing
+ * NFS file locking at all.
+ *
+ * I'm trying hard to avoid race conditions by protecting most accesses
+ * to a file's list of blocked locks through a semaphore. The global
+ * list of blocked locks is not protected in this fashion however.
+ * Therefore, some functions (such as the RPC callback for the async grant
+ * call) move blocked locks towards the head of the list *while some other
+ * process might be traversing it*. This should not be a problem in
+ * practice, because this will only cause functions traversing the list
+ * to visit some blocks twice.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/lockd/nlm.h>
+#include <linux/lockd/lockd.h>
+#include <linux/kthread.h>
+
+#define NLMDBG_FACILITY		NLMDBG_SVCLOCK
+
+#ifdef CONFIG_LOCKD_V4
+#define nlm_deadlock	nlm4_deadlock
+#else
+#define nlm_deadlock	nlm_lck_denied
+#endif
+
+static void nlmsvc_release_block(struct nlm_block *block);
+static void	nlmsvc_insert_block(struct nlm_block *block, unsigned long);
+static void	nlmsvc_remove_block(struct nlm_block *block);
+
+static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
+static void nlmsvc_freegrantargs(struct nlm_rqst *call);
+static const struct rpc_call_ops nlmsvc_grant_ops;
+
+/*
+ * The list of blocked locks to retry
+ */
+static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
+{
+	/*
+	 * We can get away with a static buffer because this is only called
+	 * from lockd, which is single-threaded.
+	 */
+	static char buf[2*NLM_MAXCOOKIELEN+1];
+	unsigned int i, len = sizeof(buf);
+	char *p = buf;
+
+	len--;	/* allow for trailing \0 */
+	if (len < 3)
+		return "???";
+	for (i = 0 ; i < cookie->len ; i++) {
+		if (len < 2) {
+			strcpy(p-3, "...");
+			break;
+		}
+		sprintf(p, "%02x", cookie->data[i]);
+		p += 2;
+		len -= 2;
+	}
+	*p = '\0';
+
+	return buf;
+}
+#endif
+
+/*
+ * Insert a blocked lock into the global list
+ */
+static void
+nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
+{
+	struct nlm_block *b;
+	struct list_head *pos;
+
+	dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when);
+	if (list_empty(&block->b_list)) {
+		kref_get(&block->b_count);
+	} else {
+		list_del_init(&block->b_list);
+	}
+
+	pos = &nlm_blocked;
+	if (when != NLM_NEVER) {
+		if ((when += jiffies) == NLM_NEVER)
+			when ++;
+		list_for_each(pos, &nlm_blocked) {
+			b = list_entry(pos, struct nlm_block, b_list);
+			if (time_after(b->b_when,when) || b->b_when == NLM_NEVER)
+				break;
+		}
+		/* On normal exit from the loop, pos == &nlm_blocked,
+		 * so we will be adding to the end of the list - good
+		 */
+	}
+
+	list_add_tail(&block->b_list, pos);
+	block->b_when = when;
+}
+
+static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+{
+	spin_lock(&nlm_blocked_lock);
+	nlmsvc_insert_block_locked(block, when);
+	spin_unlock(&nlm_blocked_lock);
+}
+
+/*
+ * Remove a block from the global list
+ */
+static inline void
+nlmsvc_remove_block(struct nlm_block *block)
+{
+	if (!list_empty(&block->b_list)) {
+		spin_lock(&nlm_blocked_lock);
+		list_del_init(&block->b_list);
+		spin_unlock(&nlm_blocked_lock);
+		nlmsvc_release_block(block);
+	}
+}
+
+/*
+ * Find a block for a given lock
+ */
+static struct nlm_block *
+nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
+{
+	struct nlm_block	*block;
+	struct file_lock	*fl;
+
+	dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n",
+				file, lock->fl.fl_pid,
+				(long long)lock->fl.fl_start,
+				(long long)lock->fl.fl_end, lock->fl.fl_type);
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		fl = &block->b_call->a_args.lock.fl;
+		dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
+				block->b_file, fl->fl_pid,
+				(long long)fl->fl_start,
+				(long long)fl->fl_end, fl->fl_type,
+				nlmdbg_cookie2a(&block->b_call->a_args.cookie));
+		if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) {
+			kref_get(&block->b_count);
+			return block;
+		}
+	}
+
+	return NULL;
+}
+
+static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b)
+{
+	if (a->len != b->len)
+		return 0;
+	if (memcmp(a->data, b->data, a->len))
+		return 0;
+	return 1;
+}
+
+/*
+ * Find a block with a given NLM cookie.
+ */
+static inline struct nlm_block *
+nlmsvc_find_block(struct nlm_cookie *cookie)
+{
+	struct nlm_block *block;
+
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie))
+			goto found;
+	}
+
+	return NULL;
+
+found:
+	dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block);
+	kref_get(&block->b_count);
+	return block;
+}
+
+/*
+ * Create a block and initialize it.
+ *
+ * Note: we explicitly set the cookie of the grant reply to that of
+ * the blocked lock request. The spec explicitly mentions that the client
+ * should _not_ rely on the callback containing the same cookie as the
+ * request, but (as I found out later) that's because some implementations
+ * do just this. Never mind the standards comittees, they support our
+ * logging industries.
+ *
+ * 10 years later: I hope we can safely ignore these old and broken
+ * clients by now. Let's fix this so we can uniquely identify an incoming
+ * GRANTED_RES message by cookie, without having to rely on the client's IP
+ * address. --okir
+ */
+static struct nlm_block *
+nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
+		    struct nlm_file *file, struct nlm_lock *lock,
+		    struct nlm_cookie *cookie)
+{
+	struct nlm_block	*block;
+	struct nlm_rqst		*call = NULL;
+
+	call = nlm_alloc_call(host);
+	if (call == NULL)
+		return NULL;
+
+	/* Allocate memory for block, and initialize arguments */
+	block = kzalloc(sizeof(*block), GFP_KERNEL);
+	if (block == NULL)
+		goto failed;
+	kref_init(&block->b_count);
+	INIT_LIST_HEAD(&block->b_list);
+	INIT_LIST_HEAD(&block->b_flist);
+
+	if (!nlmsvc_setgrantargs(call, lock))
+		goto failed_free;
+
+	/* Set notifier function for VFS, and init args */
+	call->a_args.lock.fl.fl_flags |= FL_SLEEP;
+	call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
+	nlmclnt_next_cookie(&call->a_args.cookie);
+
+	dprintk("lockd: created block %p...\n", block);
+
+	/* Create and initialize the block */
+	block->b_daemon = rqstp->rq_server;
+	block->b_host   = host;
+	block->b_file   = file;
+	file->f_count++;
+
+	/* Add to file's list of blocks */
+	list_add(&block->b_flist, &file->f_blocks);
+
+	/* Set up RPC arguments for callback */
+	block->b_call = call;
+	call->a_flags   = RPC_TASK_ASYNC;
+	call->a_block = block;
+
+	return block;
+
+failed_free:
+	kfree(block);
+failed:
+	nlmsvc_release_call(call);
+	return NULL;
+}
+
+/*
+ * Delete a block.
+ * It is the caller's responsibility to check whether the file
+ * can be closed hereafter.
+ */
+static int nlmsvc_unlink_block(struct nlm_block *block)
+{
+	int status;
+	dprintk("lockd: unlinking block %p...\n", block);
+
+	/* Remove block from list */
+	status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
+	nlmsvc_remove_block(block);
+	return status;
+}
+
+static void nlmsvc_free_block(struct kref *kref)
+{
+	struct nlm_block *block = container_of(kref, struct nlm_block, b_count);
+	struct nlm_file		*file = block->b_file;
+
+	dprintk("lockd: freeing block %p...\n", block);
+
+	/* Remove block from file's list of blocks */
+	list_del_init(&block->b_flist);
+	mutex_unlock(&file->f_mutex);
+
+	nlmsvc_freegrantargs(block->b_call);
+	nlmsvc_release_call(block->b_call);
+	nlm_release_file(block->b_file);
+	kfree(block);
+}
+
+static void nlmsvc_release_block(struct nlm_block *block)
+{
+	if (block != NULL)
+		kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex);
+}
+
+/*
+ * Loop over all blocks and delete blocks held by
+ * a matching host.
+ */
+void nlmsvc_traverse_blocks(struct nlm_host *host,
+			struct nlm_file *file,
+			nlm_host_match_fn_t match)
+{
+	struct nlm_block *block, *next;
+
+restart:
+	mutex_lock(&file->f_mutex);
+	list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) {
+		if (!match(block->b_host, host))
+			continue;
+		/* Do not destroy blocks that are not on
+		 * the global retry list - why? */
+		if (list_empty(&block->b_list))
+			continue;
+		kref_get(&block->b_count);
+		mutex_unlock(&file->f_mutex);
+		nlmsvc_unlink_block(block);
+		nlmsvc_release_block(block);
+		goto restart;
+	}
+	mutex_unlock(&file->f_mutex);
+}
+
+/*
+ * Initialize arguments for GRANTED call. The nlm_rqst structure
+ * has been cleared already.
+ */
+static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
+{
+	locks_copy_lock(&call->a_args.lock.fl, &lock->fl);
+	memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh));
+	call->a_args.lock.caller = utsname()->nodename;
+	call->a_args.lock.oh.len = lock->oh.len;
+
+	/* set default data area */
+	call->a_args.lock.oh.data = call->a_owner;
+	call->a_args.lock.svid = lock->fl.fl_pid;
+
+	if (lock->oh.len > NLMCLNT_OHSIZE) {
+		void *data = kmalloc(lock->oh.len, GFP_KERNEL);
+		if (!data)
+			return 0;
+		call->a_args.lock.oh.data = (u8 *) data;
+	}
+
+	memcpy(call->a_args.lock.oh.data, lock->oh.data, lock->oh.len);
+	return 1;
+}
+
+static void nlmsvc_freegrantargs(struct nlm_rqst *call)
+{
+	if (call->a_args.lock.oh.data != call->a_owner)
+		kfree(call->a_args.lock.oh.data);
+
+	locks_release_private(&call->a_args.lock.fl);
+}
+
+/*
+ * Deferred lock request handling for non-blocking lock
+ */
+static __be32
+nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
+{
+	__be32 status = nlm_lck_denied_nolocks;
+
+	block->b_flags |= B_QUEUED;
+
+	nlmsvc_insert_block(block, NLM_TIMEOUT);
+
+	block->b_cache_req = &rqstp->rq_chandle;
+	if (rqstp->rq_chandle.defer) {
+		block->b_deferred_req =
+			rqstp->rq_chandle.defer(block->b_cache_req);
+		if (block->b_deferred_req != NULL)
+			status = nlm_drop_reply;
+	}
+	dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n",
+		block, block->b_flags, ntohl(status));
+
+	return status;
+}
+
+/*
+ * Attempt to establish a lock, and if it can't be granted, block it
+ * if required.
+ */
+__be32
+nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
+	    struct nlm_host *host, struct nlm_lock *lock, int wait,
+	    struct nlm_cookie *cookie, int reclaim)
+{
+	struct nlm_block	*block = NULL;
+	int			error;
+	__be32			ret;
+
+	dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
+				file_inode(file->f_file)->i_sb->s_id,
+				file_inode(file->f_file)->i_ino,
+				lock->fl.fl_type, lock->fl.fl_pid,
+				(long long)lock->fl.fl_start,
+				(long long)lock->fl.fl_end,
+				wait);
+
+	/* Lock file against concurrent access */
+	mutex_lock(&file->f_mutex);
+	/* Get existing block (in case client is busy-waiting)
+	 * or create new block
+	 */
+	block = nlmsvc_lookup_block(file, lock);
+	if (block == NULL) {
+		block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
+		ret = nlm_lck_denied_nolocks;
+		if (block == NULL)
+			goto out;
+		lock = &block->b_call->a_args.lock;
+	} else
+		lock->fl.fl_flags &= ~FL_SLEEP;
+
+	if (block->b_flags & B_QUEUED) {
+		dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n",
+							block, block->b_flags);
+		if (block->b_granted) {
+			nlmsvc_unlink_block(block);
+			ret = nlm_granted;
+			goto out;
+		}
+		if (block->b_flags & B_TIMED_OUT) {
+			nlmsvc_unlink_block(block);
+			ret = nlm_lck_denied;
+			goto out;
+		}
+		ret = nlm_drop_reply;
+		goto out;
+	}
+
+	if (locks_in_grace(SVC_NET(rqstp)) && !reclaim) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
+	if (reclaim && !locks_in_grace(SVC_NET(rqstp))) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
+
+	if (!wait)
+		lock->fl.fl_flags &= ~FL_SLEEP;
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
+	lock->fl.fl_flags &= ~FL_SLEEP;
+
+	dprintk("lockd: vfs_lock_file returned %d\n", error);
+	switch (error) {
+		case 0:
+			ret = nlm_granted;
+			goto out;
+		case -EAGAIN:
+			/*
+			 * If this is a blocking request for an
+			 * already pending lock request then we need
+			 * to put it back on lockd's block list
+			 */
+			if (wait)
+				break;
+			ret = nlm_lck_denied;
+			goto out;
+		case FILE_LOCK_DEFERRED:
+			if (wait)
+				break;
+			/* Filesystem lock operation is in progress
+			   Add it to the queue waiting for callback */
+			ret = nlmsvc_defer_lock_rqst(rqstp, block);
+			goto out;
+		case -EDEADLK:
+			ret = nlm_deadlock;
+			goto out;
+		default:			/* includes ENOLCK */
+			ret = nlm_lck_denied_nolocks;
+			goto out;
+	}
+
+	ret = nlm_lck_blocked;
+
+	/* Append to list of blocked */
+	nlmsvc_insert_block(block, NLM_NEVER);
+out:
+	mutex_unlock(&file->f_mutex);
+	nlmsvc_release_block(block);
+	dprintk("lockd: nlmsvc_lock returned %u\n", ret);
+	return ret;
+}
+
+/*
+ * Test for presence of a conflicting lock.
+ */
+__be32
+nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
+		struct nlm_host *host, struct nlm_lock *lock,
+		struct nlm_lock *conflock, struct nlm_cookie *cookie)
+{
+	int			error;
+	__be32			ret;
+
+	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
+				file_inode(file->f_file)->i_sb->s_id,
+				file_inode(file->f_file)->i_ino,
+				lock->fl.fl_type,
+				(long long)lock->fl.fl_start,
+				(long long)lock->fl.fl_end);
+
+	if (locks_in_grace(SVC_NET(rqstp))) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
+
+	error = vfs_test_lock(file->f_file, &lock->fl);
+	if (error) {
+		/* We can't currently deal with deferred test requests */
+		if (error == FILE_LOCK_DEFERRED)
+			WARN_ON_ONCE(1);
+
+		ret = nlm_lck_denied_nolocks;
+		goto out;
+	}
+
+	if (lock->fl.fl_type == F_UNLCK) {
+		ret = nlm_granted;
+		goto out;
+	}
+
+	dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
+		lock->fl.fl_type, (long long)lock->fl.fl_start,
+		(long long)lock->fl.fl_end);
+	conflock->caller = "somehost";	/* FIXME */
+	conflock->len = strlen(conflock->caller);
+	conflock->oh.len = 0;		/* don't return OH info */
+	conflock->svid = lock->fl.fl_pid;
+	conflock->fl.fl_type = lock->fl.fl_type;
+	conflock->fl.fl_start = lock->fl.fl_start;
+	conflock->fl.fl_end = lock->fl.fl_end;
+	locks_release_private(&lock->fl);
+	ret = nlm_lck_denied;
+out:
+	return ret;
+}
+
+/*
+ * Remove a lock.
+ * This implies a CANCEL call: We send a GRANT_MSG, the client replies
+ * with a GRANT_RES call which gets lost, and calls UNLOCK immediately
+ * afterwards. In this case the block will still be there, and hence
+ * must be removed.
+ */
+__be32
+nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
+{
+	int	error;
+
+	dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n",
+				file_inode(file->f_file)->i_sb->s_id,
+				file_inode(file->f_file)->i_ino,
+				lock->fl.fl_pid,
+				(long long)lock->fl.fl_start,
+				(long long)lock->fl.fl_end);
+
+	/* First, cancel any lock that might be there */
+	nlmsvc_cancel_blocked(net, file, lock);
+
+	lock->fl.fl_type = F_UNLCK;
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
+
+	return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
+}
+
+/*
+ * Cancel a previously blocked request.
+ *
+ * A cancel request always overrides any grant that may currently
+ * be in progress.
+ * The calling procedure must check whether the file can be closed.
+ */
+__be32
+nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
+{
+	struct nlm_block	*block;
+	int status = 0;
+
+	dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
+				file_inode(file->f_file)->i_sb->s_id,
+				file_inode(file->f_file)->i_ino,
+				lock->fl.fl_pid,
+				(long long)lock->fl.fl_start,
+				(long long)lock->fl.fl_end);
+
+	if (locks_in_grace(net))
+		return nlm_lck_denied_grace_period;
+
+	mutex_lock(&file->f_mutex);
+	block = nlmsvc_lookup_block(file, lock);
+	mutex_unlock(&file->f_mutex);
+	if (block != NULL) {
+		vfs_cancel_lock(block->b_file->f_file,
+				&block->b_call->a_args.lock.fl);
+		status = nlmsvc_unlink_block(block);
+		nlmsvc_release_block(block);
+	}
+	return status ? nlm_lck_denied : nlm_granted;
+}
+
+/*
+ * This is a callback from the filesystem for VFS file lock requests.
+ * It will be used if lm_grant is defined and the filesystem can not
+ * respond to the request immediately.
+ * For SETLK or SETLKW request it will get the local posix lock.
+ * In all cases it will move the block to the head of nlm_blocked q where
+ * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the
+ * deferred rpc for GETLK and SETLK.
+ */
+static void
+nlmsvc_update_deferred_block(struct nlm_block *block, int result)
+{
+	block->b_flags |= B_GOT_CALLBACK;
+	if (result == 0)
+		block->b_granted = 1;
+	else
+		block->b_flags |= B_TIMED_OUT;
+}
+
+static int nlmsvc_grant_deferred(struct file_lock *fl, int result)
+{
+	struct nlm_block *block;
+	int rc = -ENOENT;
+
+	spin_lock(&nlm_blocked_lock);
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
+			dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
+							block, block->b_flags);
+			if (block->b_flags & B_QUEUED) {
+				if (block->b_flags & B_TIMED_OUT) {
+					rc = -ENOLCK;
+					break;
+				}
+				nlmsvc_update_deferred_block(block, result);
+			} else if (result == 0)
+				block->b_granted = 1;
+
+			nlmsvc_insert_block_locked(block, 0);
+			svc_wake_up(block->b_daemon);
+			rc = 0;
+			break;
+		}
+	}
+	spin_unlock(&nlm_blocked_lock);
+	if (rc == -ENOENT)
+		printk(KERN_WARNING "lockd: grant for unknown block\n");
+	return rc;
+}
+
+/*
+ * Unblock a blocked lock request. This is a callback invoked from the
+ * VFS layer when a lock on which we blocked is removed.
+ *
+ * This function doesn't grant the blocked lock instantly, but rather moves
+ * the block to the head of nlm_blocked where it can be picked up by lockd.
+ */
+static void
+nlmsvc_notify_blocked(struct file_lock *fl)
+{
+	struct nlm_block	*block;
+
+	dprintk("lockd: VFS unblock notification for block %p\n", fl);
+	spin_lock(&nlm_blocked_lock);
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
+			nlmsvc_insert_block_locked(block, 0);
+			spin_unlock(&nlm_blocked_lock);
+			svc_wake_up(block->b_daemon);
+			return;
+		}
+	}
+	spin_unlock(&nlm_blocked_lock);
+	printk(KERN_WARNING "lockd: notification for unknown block!\n");
+}
+
+static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
+{
+	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
+}
+
+/*
+ * Since NLM uses two "keys" for tracking locks, we need to hash them down
+ * to one for the blocked_hash. Here, we're just xor'ing the host address
+ * with the pid in order to create a key value for picking a hash bucket.
+ */
+static unsigned long
+nlmsvc_owner_key(struct file_lock *fl)
+{
+	return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
+}
+
+const struct lock_manager_operations nlmsvc_lock_operations = {
+	.lm_compare_owner = nlmsvc_same_owner,
+	.lm_owner_key = nlmsvc_owner_key,
+	.lm_notify = nlmsvc_notify_blocked,
+	.lm_grant = nlmsvc_grant_deferred,
+};
+
+/*
+ * Try to claim a lock that was previously blocked.
+ *
+ * Note that we use both the RPC_GRANTED_MSG call _and_ an async
+ * RPC thread when notifying the client. This seems like overkill...
+ * Here's why:
+ *  -	we don't want to use a synchronous RPC thread, otherwise
+ *	we might find ourselves hanging on a dead portmapper.
+ *  -	Some lockd implementations (e.g. HP) don't react to
+ *	RPC_GRANTED calls; they seem to insist on RPC_GRANTED_MSG calls.
+ */
+static void
+nlmsvc_grant_blocked(struct nlm_block *block)
+{
+	struct nlm_file		*file = block->b_file;
+	struct nlm_lock		*lock = &block->b_call->a_args.lock;
+	int			error;
+	loff_t			fl_start, fl_end;
+
+	dprintk("lockd: grant blocked lock %p\n", block);
+
+	kref_get(&block->b_count);
+
+	/* Unlink block request from list */
+	nlmsvc_unlink_block(block);
+
+	/* If b_granted is true this means we've been here before.
+	 * Just retry the grant callback, possibly refreshing the RPC
+	 * binding */
+	if (block->b_granted) {
+		nlm_rebind_host(block->b_host);
+		goto callback;
+	}
+
+	/* Try the lock operation again */
+	/* vfs_lock_file() can mangle fl_start and fl_end, but we need
+	 * them unchanged for the GRANT_MSG
+	 */
+	lock->fl.fl_flags |= FL_SLEEP;
+	fl_start = lock->fl.fl_start;
+	fl_end = lock->fl.fl_end;
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
+	lock->fl.fl_flags &= ~FL_SLEEP;
+	lock->fl.fl_start = fl_start;
+	lock->fl.fl_end = fl_end;
+
+	switch (error) {
+	case 0:
+		break;
+	case FILE_LOCK_DEFERRED:
+		dprintk("lockd: lock still blocked error %d\n", error);
+		nlmsvc_insert_block(block, NLM_NEVER);
+		nlmsvc_release_block(block);
+		return;
+	default:
+		printk(KERN_WARNING "lockd: unexpected error %d in %s!\n",
+				-error, __func__);
+		nlmsvc_insert_block(block, 10 * HZ);
+		nlmsvc_release_block(block);
+		return;
+	}
+
+callback:
+	/* Lock was granted by VFS. */
+	dprintk("lockd: GRANTing blocked lock.\n");
+	block->b_granted = 1;
+
+	/* keep block on the list, but don't reattempt until the RPC
+	 * completes or the submission fails
+	 */
+	nlmsvc_insert_block(block, NLM_NEVER);
+
+	/* Call the client -- use a soft RPC task since nlmsvc_retry_blocked
+	 * will queue up a new one if this one times out
+	 */
+	error = nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG,
+				&nlmsvc_grant_ops);
+
+	/* RPC submission failed, wait a bit and retry */
+	if (error < 0)
+		nlmsvc_insert_block(block, 10 * HZ);
+}
+
+/*
+ * This is the callback from the RPC layer when the NLM_GRANTED_MSG
+ * RPC call has succeeded or timed out.
+ * Like all RPC callbacks, it is invoked by the rpciod process, so it
+ * better not sleep. Therefore, we put the blocked lock on the nlm_blocked
+ * chain once more in order to have it removed by lockd itself (which can
+ * then sleep on the file semaphore without disrupting e.g. the nfs client).
+ */
+static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
+{
+	struct nlm_rqst		*call = data;
+	struct nlm_block	*block = call->a_block;
+	unsigned long		timeout;
+
+	dprintk("lockd: GRANT_MSG RPC callback\n");
+
+	spin_lock(&nlm_blocked_lock);
+	/* if the block is not on a list at this point then it has
+	 * been invalidated. Don't try to requeue it.
+	 *
+	 * FIXME: it's possible that the block is removed from the list
+	 * after this check but before the nlmsvc_insert_block. In that
+	 * case it will be added back. Perhaps we need better locking
+	 * for nlm_blocked?
+	 */
+	if (list_empty(&block->b_list))
+		goto out;
+
+	/* Technically, we should down the file semaphore here. Since we
+	 * move the block towards the head of the queue only, no harm
+	 * can be done, though. */
+	if (task->tk_status < 0) {
+		/* RPC error: Re-insert for retransmission */
+		timeout = 10 * HZ;
+	} else {
+		/* Call was successful, now wait for client callback */
+		timeout = 60 * HZ;
+	}
+	nlmsvc_insert_block_locked(block, timeout);
+	svc_wake_up(block->b_daemon);
+out:
+	spin_unlock(&nlm_blocked_lock);
+}
+
+/*
+ * FIXME: nlmsvc_release_block() grabs a mutex.  This is not allowed for an
+ * .rpc_release rpc_call_op
+ */
+static void nlmsvc_grant_release(void *data)
+{
+	struct nlm_rqst		*call = data;
+	nlmsvc_release_block(call->a_block);
+}
+
+static const struct rpc_call_ops nlmsvc_grant_ops = {
+	.rpc_call_done = nlmsvc_grant_callback,
+	.rpc_release = nlmsvc_grant_release,
+};
+
+/*
+ * We received a GRANT_RES callback. Try to find the corresponding
+ * block.
+ */
+void
+nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
+{
+	struct nlm_block	*block;
+
+	dprintk("grant_reply: looking for cookie %x, s=%d \n",
+		*(unsigned int *)(cookie->data), status);
+	if (!(block = nlmsvc_find_block(cookie)))
+		return;
+
+	if (block) {
+		if (status == nlm_lck_denied_grace_period) {
+			/* Try again in a couple of seconds */
+			nlmsvc_insert_block(block, 10 * HZ);
+		} else {
+			/* Lock is now held by client, or has been rejected.
+			 * In both cases, the block should be removed. */
+			nlmsvc_unlink_block(block);
+		}
+	}
+	nlmsvc_release_block(block);
+}
+
+/* Helper function to handle retry of a deferred block.
+ * If it is a blocking lock, call grant_blocked.
+ * For a non-blocking lock or test lock, revisit the request.
+ */
+static void
+retry_deferred_block(struct nlm_block *block)
+{
+	if (!(block->b_flags & B_GOT_CALLBACK))
+		block->b_flags |= B_TIMED_OUT;
+	nlmsvc_insert_block(block, NLM_TIMEOUT);
+	dprintk("revisit block %p flags %d\n",	block, block->b_flags);
+	if (block->b_deferred_req) {
+		block->b_deferred_req->revisit(block->b_deferred_req, 0);
+		block->b_deferred_req = NULL;
+	}
+}
+
+/*
+ * Retry all blocked locks that have been notified. This is where lockd
+ * picks up locks that can be granted, or grant notifications that must
+ * be retransmitted.
+ */
+unsigned long
+nlmsvc_retry_blocked(void)
+{
+	unsigned long	timeout = MAX_SCHEDULE_TIMEOUT;
+	struct nlm_block *block;
+
+	spin_lock(&nlm_blocked_lock);
+	while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
+		block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
+
+		if (block->b_when == NLM_NEVER)
+			break;
+		if (time_after(block->b_when, jiffies)) {
+			timeout = block->b_when - jiffies;
+			break;
+		}
+		spin_unlock(&nlm_blocked_lock);
+
+		dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
+			block, block->b_when);
+		if (block->b_flags & B_QUEUED) {
+			dprintk("nlmsvc_retry_blocked delete block (%p, granted=%d, flags=%d)\n",
+				block, block->b_granted, block->b_flags);
+			retry_deferred_block(block);
+		} else
+			nlmsvc_grant_blocked(block);
+		spin_lock(&nlm_blocked_lock);
+	}
+	spin_unlock(&nlm_blocked_lock);
+
+	return timeout;
+}
diff --git a/kernel/fs/lockd/svcproc.c b/kernel/fs/lockd/svcproc.c
new file mode 100644
index 000000000..21171f0c6
--- /dev/null
+++ b/kernel/fs/lockd/svcproc.c
@@ -0,0 +1,549 @@
+/*
+ * linux/fs/lockd/svcproc.c
+ *
+ * Lockd server procedures. We don't implement the NLM_*_RES 
+ * procedures because we don't use the async procedures.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/lockd/lockd.h>
+#include <linux/lockd/share.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#define NLMDBG_FACILITY		NLMDBG_CLIENT
+
+#ifdef CONFIG_LOCKD_V4
+static __be32
+cast_to_nlm(__be32 status, u32 vers)
+{
+	/* Note: status is assumed to be in network byte order !!! */
+	if (vers != 4){
+		switch (status) {
+		case nlm_granted:
+		case nlm_lck_denied:
+		case nlm_lck_denied_nolocks:
+		case nlm_lck_blocked:
+		case nlm_lck_denied_grace_period:
+		case nlm_drop_reply:
+			break;
+		case nlm4_deadlock:
+			status = nlm_lck_denied;
+			break;
+		default:
+			status = nlm_lck_denied_nolocks;
+		}
+	}
+
+	return (status);
+}
+#define	cast_status(status) (cast_to_nlm(status, rqstp->rq_vers))
+#else
+#define cast_status(status) (status)
+#endif
+
+/*
+ * Obtain client and file from arguments
+ */
+static __be32
+nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
+			struct nlm_host **hostp, struct nlm_file **filp)
+{
+	struct nlm_host		*host = NULL;
+	struct nlm_file		*file = NULL;
+	struct nlm_lock		*lock = &argp->lock;
+	__be32			error = 0;
+
+	/* nfsd callbacks must have been installed for this procedure */
+	if (!nlmsvc_ops)
+		return nlm_lck_denied_nolocks;
+
+	/* Obtain host handle */
+	if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
+	 || (argp->monitor && nsm_monitor(host) < 0))
+		goto no_locks;
+	*hostp = host;
+
+	/* Obtain file pointer. Not used by FREE_ALL call. */
+	if (filp != NULL) {
+		error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh));
+		if (error != 0)
+			goto no_locks;
+		*filp = file;
+
+		/* Set up the missing parts of the file_lock structure */
+		lock->fl.fl_file  = file->f_file;
+		lock->fl.fl_owner = (fl_owner_t) host;
+		lock->fl.fl_lmops = &nlmsvc_lock_operations;
+	}
+
+	return 0;
+
+no_locks:
+	nlmsvc_release_host(host);
+	if (error)
+		return error;
+	return nlm_lck_denied_nolocks;
+}
+
+/*
+ * NULL: Test for presence of service
+ */
+static __be32
+nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	dprintk("lockd: NULL          called\n");
+	return rpc_success;
+}
+
+/*
+ * TEST: Check for conflicting lock
+ */
+static __be32
+nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
+				         struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+	__be32 rc = rpc_success;
+
+	dprintk("lockd: TEST          called\n");
+	resp->cookie = argp->cookie;
+
+	/* Obtain client and file */
+	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Now check for conflicting locks */
+	resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
+	if (resp->status == nlm_drop_reply)
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: TEST          status %d vers %d\n",
+			ntohl(resp->status), rqstp->rq_vers);
+
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rc;
+}
+
+static __be32
+nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
+				         struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+	__be32 rc = rpc_success;
+
+	dprintk("lockd: LOCK          called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Obtain client and file */
+	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+#if 0
+	/* If supplied state doesn't match current state, we assume it's
+	 * an old request that time-warped somehow. Any error return would
+	 * do in this case because it's irrelevant anyway.
+	 *
+	 * NB: We don't retrieve the remote host's state yet.
+	 */
+	if (host->h_nsmstate && host->h_nsmstate != argp->state) {
+		resp->status = nlm_lck_denied_nolocks;
+	} else
+#endif
+
+	/* Now try to lock the file */
+	resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
+					       argp->block, &argp->cookie,
+					       argp->reclaim));
+	if (resp->status == nlm_drop_reply)
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
+
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rc;
+}
+
+static __be32
+nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
+				           struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+	struct net *net = SVC_NET(rqstp);
+
+	dprintk("lockd: CANCEL        called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Don't accept requests during grace period */
+	if (locks_in_grace(net)) {
+		resp->status = nlm_lck_denied_grace_period;
+		return rpc_success;
+	}
+
+	/* Obtain client and file */
+	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Try to cancel request. */
+	resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock));
+
+	dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rpc_success;
+}
+
+/*
+ * UNLOCK: release a lock
+ */
+static __be32
+nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
+				           struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+	struct net *net = SVC_NET(rqstp);
+
+	dprintk("lockd: UNLOCK        called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Don't accept new lock requests during grace period */
+	if (locks_in_grace(net)) {
+		resp->status = nlm_lck_denied_grace_period;
+		return rpc_success;
+	}
+
+	/* Obtain client and file */
+	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Now try to remove the lock */
+	resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock));
+
+	dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rpc_success;
+}
+
+/*
+ * GRANTED: A server calls us to tell that a process' lock request
+ * was granted
+ */
+static __be32
+nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
+				            struct nlm_res  *resp)
+{
+	resp->cookie = argp->cookie;
+
+	dprintk("lockd: GRANTED       called\n");
+	resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
+	dprintk("lockd: GRANTED       status %d\n", ntohl(resp->status));
+	return rpc_success;
+}
+
+/*
+ * This is the generic lockd callback for async RPC calls
+ */
+static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
+{
+	dprintk("lockd: %5u callback returned %d\n", task->tk_pid,
+			-task->tk_status);
+}
+
+void nlmsvc_release_call(struct nlm_rqst *call)
+{
+	if (!atomic_dec_and_test(&call->a_count))
+		return;
+	nlmsvc_release_host(call->a_host);
+	kfree(call);
+}
+
+static void nlmsvc_callback_release(void *data)
+{
+	nlmsvc_release_call(data);
+}
+
+static const struct rpc_call_ops nlmsvc_callback_ops = {
+	.rpc_call_done = nlmsvc_callback_exit,
+	.rpc_release = nlmsvc_callback_release,
+};
+
+/*
+ * `Async' versions of the above service routines. They aren't really,
+ * because we send the callback before the reply proper. I hope this
+ * doesn't break any clients.
+ */
+static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
+		__be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res  *))
+{
+	struct nlm_host	*host;
+	struct nlm_rqst	*call;
+	__be32 stat;
+
+	host = nlmsvc_lookup_host(rqstp,
+				  argp->lock.caller,
+				  argp->lock.len);
+	if (host == NULL)
+		return rpc_system_err;
+
+	call = nlm_alloc_call(host);
+	nlmsvc_release_host(host);
+	if (call == NULL)
+		return rpc_system_err;
+
+	stat = func(rqstp, argp, &call->a_res);
+	if (stat != 0) {
+		nlmsvc_release_call(call);
+		return stat;
+	}
+
+	call->a_flags = RPC_TASK_ASYNC;
+	if (nlm_async_reply(call, proc, &nlmsvc_callback_ops) < 0)
+		return rpc_system_err;
+	return rpc_success;
+}
+
+static __be32 nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+					     void	     *resp)
+{
+	dprintk("lockd: TEST_MSG      called\n");
+	return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test);
+}
+
+static __be32 nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+					     void	     *resp)
+{
+	dprintk("lockd: LOCK_MSG      called\n");
+	return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock);
+}
+
+static __be32 nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+					       void	       *resp)
+{
+	dprintk("lockd: CANCEL_MSG    called\n");
+	return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel);
+}
+
+static __be32
+nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+                                               void            *resp)
+{
+	dprintk("lockd: UNLOCK_MSG    called\n");
+	return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock);
+}
+
+static __be32
+nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+                                                void            *resp)
+{
+	dprintk("lockd: GRANTED_MSG   called\n");
+	return nlmsvc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlmsvc_proc_granted);
+}
+
+/*
+ * SHARE: create a DOS share or alter existing share.
+ */
+static __be32
+nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
+				          struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+
+	dprintk("lockd: SHARE         called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Don't accept new lock requests during grace period */
+	if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) {
+		resp->status = nlm_lck_denied_grace_period;
+		return rpc_success;
+	}
+
+	/* Obtain client and file */
+	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Now try to create the share */
+	resp->status = cast_status(nlmsvc_share_file(host, file, argp));
+
+	dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rpc_success;
+}
+
+/*
+ * UNSHARE: Release a DOS share.
+ */
+static __be32
+nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
+				            struct nlm_res  *resp)
+{
+	struct nlm_host	*host;
+	struct nlm_file	*file;
+
+	dprintk("lockd: UNSHARE       called\n");
+
+	resp->cookie = argp->cookie;
+
+	/* Don't accept requests during grace period */
+	if (locks_in_grace(SVC_NET(rqstp))) {
+		resp->status = nlm_lck_denied_grace_period;
+		return rpc_success;
+	}
+
+	/* Obtain client and file */
+	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
+	/* Now try to unshare the file */
+	resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
+
+	dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
+	nlmsvc_release_host(host);
+	nlm_release_file(file);
+	return rpc_success;
+}
+
+/*
+ * NM_LOCK: Create an unmonitored lock
+ */
+static __be32
+nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
+				            struct nlm_res  *resp)
+{
+	dprintk("lockd: NM_LOCK       called\n");
+
+	argp->monitor = 0;		/* just clean the monitor flag */
+	return nlmsvc_proc_lock(rqstp, argp, resp);
+}
+
+/*
+ * FREE_ALL: Release all locks and shares held by client
+ */
+static __be32
+nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
+					     void            *resp)
+{
+	struct nlm_host	*host;
+
+	/* Obtain client */
+	if (nlmsvc_retrieve_args(rqstp, argp, &host, NULL))
+		return rpc_success;
+
+	nlmsvc_free_host_resources(host);
+	nlmsvc_release_host(host);
+	return rpc_success;
+}
+
+/*
+ * SM_NOTIFY: private callback from statd (not part of official NLM proto)
+ */
+static __be32
+nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
+					      void	        *resp)
+{
+	dprintk("lockd: SM_NOTIFY     called\n");
+
+	if (!nlm_privileged_requester(rqstp)) {
+		char buf[RPC_MAX_ADDRBUFLEN];
+		printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
+				svc_print_addr(rqstp, buf, sizeof(buf)));
+		return rpc_system_err;
+	}
+
+	nlm_host_rebooted(argp);
+	return rpc_success;
+}
+
+/*
+ * client sent a GRANTED_RES, let's remove the associated block
+ */
+static __be32
+nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
+                                                void            *resp)
+{
+	if (!nlmsvc_ops)
+		return rpc_success;
+
+	dprintk("lockd: GRANTED_RES   called\n");
+
+	nlmsvc_grant_reply(&argp->cookie, argp->status);
+	return rpc_success;
+}
+
+/*
+ * NLM Server procedures.
+ */
+
+#define nlmsvc_encode_norep	nlmsvc_encode_void
+#define nlmsvc_decode_norep	nlmsvc_decode_void
+#define nlmsvc_decode_testres	nlmsvc_decode_void
+#define nlmsvc_decode_lockres	nlmsvc_decode_void
+#define nlmsvc_decode_unlockres	nlmsvc_decode_void
+#define nlmsvc_decode_cancelres	nlmsvc_decode_void
+#define nlmsvc_decode_grantedres	nlmsvc_decode_void
+
+#define nlmsvc_proc_none	nlmsvc_proc_null
+#define nlmsvc_proc_test_res	nlmsvc_proc_null
+#define nlmsvc_proc_lock_res	nlmsvc_proc_null
+#define nlmsvc_proc_cancel_res	nlmsvc_proc_null
+#define nlmsvc_proc_unlock_res	nlmsvc_proc_null
+
+struct nlm_void			{ int dummy; };
+
+#define PROC(name, xargt, xrest, argt, rest, respsize)	\
+ { .pc_func	= (svc_procfunc) nlmsvc_proc_##name,	\
+   .pc_decode	= (kxdrproc_t) nlmsvc_decode_##xargt,	\
+   .pc_encode	= (kxdrproc_t) nlmsvc_encode_##xrest,	\
+   .pc_release	= NULL,					\
+   .pc_argsize	= sizeof(struct nlm_##argt),		\
+   .pc_ressize	= sizeof(struct nlm_##rest),		\
+   .pc_xdrressize = respsize,				\
+ }
+
+#define	Ck	(1+XDR_QUADLEN(NLM_MAXCOOKIELEN))	/* cookie */
+#define	St	1				/* status */
+#define	No	(1+1024/4)			/* Net Obj */
+#define	Rg	2				/* range - offset + size */
+
+struct svc_procedure		nlmsvc_procedures[] = {
+  PROC(null,		void,		void,		void,	void, 1),
+  PROC(test,		testargs,	testres,	args,	res, Ck+St+2+No+Rg),
+  PROC(lock,		lockargs,	res,		args,	res, Ck+St),
+  PROC(cancel,		cancargs,	res,		args,	res, Ck+St),
+  PROC(unlock,		unlockargs,	res,		args,	res, Ck+St),
+  PROC(granted,		testargs,	res,		args,	res, Ck+St),
+  PROC(test_msg,	testargs,	norep,		args,	void, 1),
+  PROC(lock_msg,	lockargs,	norep,		args,	void, 1),
+  PROC(cancel_msg,	cancargs,	norep,		args,	void, 1),
+  PROC(unlock_msg,	unlockargs,	norep,		args,	void, 1),
+  PROC(granted_msg,	testargs,	norep,		args,	void, 1),
+  PROC(test_res,	testres,	norep,		res,	void, 1),
+  PROC(lock_res,	lockres,	norep,		res,	void, 1),
+  PROC(cancel_res,	cancelres,	norep,		res,	void, 1),
+  PROC(unlock_res,	unlockres,	norep,		res,	void, 1),
+  PROC(granted_res,	res,		norep,		res,	void, 1),
+  /* statd callback */
+  PROC(sm_notify,	reboot,		void,		reboot,	void, 1),
+  PROC(none,		void,		void,		void,	void, 1),
+  PROC(none,		void,		void,		void,	void, 1),
+  PROC(none,		void,		void,		void,	void, 1),
+  PROC(share,		shareargs,	shareres,	args,	res, Ck+St+1),
+  PROC(unshare,		shareargs,	shareres,	args,	res, Ck+St+1),
+  PROC(nm_lock,		lockargs,	res,		args,	res, Ck+St),
+  PROC(free_all,	notify,		void,		args,	void, 0),
+
+};
diff --git a/kernel/fs/lockd/svcshare.c b/kernel/fs/lockd/svcshare.c
new file mode 100644
index 000000000..b0ae07008
--- /dev/null
+++ b/kernel/fs/lockd/svcshare.c
@@ -0,0 +1,106 @@
+/*
+ * linux/fs/lockd/svcshare.c
+ *
+ * Management of DOS shares.
+ *
+ * Copyright (C) 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/time.h>
+#include <linux/unistd.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/lockd/lockd.h>
+#include <linux/lockd/share.h>
+
+static inline int
+nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh)
+{
+	return share->s_owner.len == oh->len
+	    && !memcmp(share->s_owner.data, oh->data, oh->len);
+}
+
+__be32
+nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file,
+			struct nlm_args *argp)
+{
+	struct nlm_share	*share;
+	struct xdr_netobj	*oh = &argp->lock.oh;
+	u8			*ohdata;
+
+	for (share = file->f_shares; share; share = share->s_next) {
+		if (share->s_host == host && nlm_cmp_owner(share, oh))
+			goto update;
+		if ((argp->fsm_access & share->s_mode)
+		 || (argp->fsm_mode   & share->s_access ))
+			return nlm_lck_denied;
+	}
+
+	share = kmalloc(sizeof(*share) + oh->len,
+						GFP_KERNEL);
+	if (share == NULL)
+		return nlm_lck_denied_nolocks;
+
+	/* Copy owner handle */
+	ohdata = (u8 *) (share + 1);
+	memcpy(ohdata, oh->data, oh->len);
+
+	share->s_file	    = file;
+	share->s_host       = host;
+	share->s_owner.data = ohdata;
+	share->s_owner.len  = oh->len;
+	share->s_next       = file->f_shares;
+	file->f_shares      = share;
+
+update:
+	share->s_access = argp->fsm_access;
+	share->s_mode   = argp->fsm_mode;
+	return nlm_granted;
+}
+
+/*
+ * Delete a share.
+ */
+__be32
+nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
+			struct nlm_args *argp)
+{
+	struct nlm_share	*share, **shpp;
+	struct xdr_netobj	*oh = &argp->lock.oh;
+
+	for (shpp = &file->f_shares; (share = *shpp) != NULL;
+					shpp = &share->s_next) {
+		if (share->s_host == host && nlm_cmp_owner(share, oh)) {
+			*shpp = share->s_next;
+			kfree(share);
+			return nlm_granted;
+		}
+	}
+
+	/* X/Open spec says return success even if there was no
+	 * corresponding share. */
+	return nlm_granted;
+}
+
+/*
+ * Traverse all shares for a given file, and delete
+ * those owned by the given (type of) host
+ */
+void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file,
+		nlm_host_match_fn_t match)
+{
+	struct nlm_share	*share, **shpp;
+
+	shpp = &file->f_shares;
+	while ((share = *shpp) !=  NULL) {
+		if (match(share->s_host, host)) {
+			*shpp = share->s_next;
+			kfree(share);
+			continue;
+		}
+		shpp = &share->s_next;
+	}
+}
diff --git a/kernel/fs/lockd/svcsubs.c b/kernel/fs/lockd/svcsubs.c
new file mode 100644
index 000000000..a563ddbc1
--- /dev/null
+++ b/kernel/fs/lockd/svcsubs.c
@@ -0,0 +1,457 @@
+/*
+ * linux/fs/lockd/svcsubs.c
+ *
+ * Various support routines for the NLM server.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/lockd/lockd.h>
+#include <linux/lockd/share.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <uapi/linux/nfs2.h>
+
+#define NLMDBG_FACILITY		NLMDBG_SVCSUBS
+
+
+/*
+ * Global file hash table
+ */
+#define FILE_HASH_BITS		7
+#define FILE_NRHASH		(1<<FILE_HASH_BITS)
+static struct hlist_head	nlm_files[FILE_NRHASH];
+static DEFINE_MUTEX(nlm_file_mutex);
+
+#ifdef CONFIG_SUNRPC_DEBUG
+static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
+{
+	u32 *fhp = (u32*)f->data;
+
+	/* print the first 32 bytes of the fh */
+	dprintk("lockd: %s (%08x %08x %08x %08x %08x %08x %08x %08x)\n",
+		msg, fhp[0], fhp[1], fhp[2], fhp[3],
+		fhp[4], fhp[5], fhp[6], fhp[7]);
+}
+
+static inline void nlm_debug_print_file(char *msg, struct nlm_file *file)
+{
+	struct inode *inode = file_inode(file->f_file);
+
+	dprintk("lockd: %s %s/%ld\n",
+		msg, inode->i_sb->s_id, inode->i_ino);
+}
+#else
+static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
+{
+	return;
+}
+
+static inline void nlm_debug_print_file(char *msg, struct nlm_file *file)
+{
+	return;
+}
+#endif
+
+static inline unsigned int file_hash(struct nfs_fh *f)
+{
+	unsigned int tmp=0;
+	int i;
+	for (i=0; i<NFS2_FHSIZE;i++)
+		tmp += f->data[i];
+	return tmp & (FILE_NRHASH - 1);
+}
+
+/*
+ * Lookup file info. If it doesn't exist, create a file info struct
+ * and open a (VFS) file for the given inode.
+ *
+ * FIXME:
+ * Note that we open the file O_RDONLY even when creating write locks.
+ * This is not quite right, but for now, we assume the client performs
+ * the proper R/W checking.
+ */
+__be32
+nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
+					struct nfs_fh *f)
+{
+	struct nlm_file	*file;
+	unsigned int	hash;
+	__be32		nfserr;
+
+	nlm_debug_print_fh("nlm_lookup_file", f);
+
+	hash = file_hash(f);
+
+	/* Lock file table */
+	mutex_lock(&nlm_file_mutex);
+
+	hlist_for_each_entry(file, &nlm_files[hash], f_list)
+		if (!nfs_compare_fh(&file->f_handle, f))
+			goto found;
+
+	nlm_debug_print_fh("creating file for", f);
+
+	nfserr = nlm_lck_denied_nolocks;
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	if (!file)
+		goto out_unlock;
+
+	memcpy(&file->f_handle, f, sizeof(struct nfs_fh));
+	mutex_init(&file->f_mutex);
+	INIT_HLIST_NODE(&file->f_list);
+	INIT_LIST_HEAD(&file->f_blocks);
+
+	/* Open the file. Note that this must not sleep for too long, else
+	 * we would lock up lockd:-) So no NFS re-exports, folks.
+	 *
+	 * We have to make sure we have the right credential to open
+	 * the file.
+	 */
+	if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) {
+		dprintk("lockd: open failed (error %d)\n", nfserr);
+		goto out_free;
+	}
+
+	hlist_add_head(&file->f_list, &nlm_files[hash]);
+
+found:
+	dprintk("lockd: found file %p (count %d)\n", file, file->f_count);
+	*result = file;
+	file->f_count++;
+	nfserr = 0;
+
+out_unlock:
+	mutex_unlock(&nlm_file_mutex);
+	return nfserr;
+
+out_free:
+	kfree(file);
+	goto out_unlock;
+}
+
+/*
+ * Delete a file after having released all locks, blocks and shares
+ */
+static inline void
+nlm_delete_file(struct nlm_file *file)
+{
+	nlm_debug_print_file("closing file", file);
+	if (!hlist_unhashed(&file->f_list)) {
+		hlist_del(&file->f_list);
+		nlmsvc_ops->fclose(file->f_file);
+		kfree(file);
+	} else {
+		printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
+	}
+}
+
+/*
+ * Loop over all locks on the given file and perform the specified
+ * action.
+ */
+static int
+nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
+			nlm_host_match_fn_t match)
+{
+	struct inode	 *inode = nlmsvc_file_inode(file);
+	struct file_lock *fl;
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct nlm_host	 *lockhost;
+
+	if (!flctx || list_empty_careful(&flctx->flc_posix))
+		return 0;
+again:
+	file->f_locks = 0;
+	spin_lock(&flctx->flc_lock);
+	list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+		if (fl->fl_lmops != &nlmsvc_lock_operations)
+			continue;
+
+		/* update current lock count */
+		file->f_locks++;
+
+		lockhost = (struct nlm_host *) fl->fl_owner;
+		if (match(lockhost, host)) {
+			struct file_lock lock = *fl;
+
+			spin_unlock(&flctx->flc_lock);
+			lock.fl_type  = F_UNLCK;
+			lock.fl_start = 0;
+			lock.fl_end   = OFFSET_MAX;
+			if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) {
+				printk("lockd: unlock failure in %s:%d\n",
+						__FILE__, __LINE__);
+				return 1;
+			}
+			goto again;
+		}
+	}
+	spin_unlock(&flctx->flc_lock);
+
+	return 0;
+}
+
+static int
+nlmsvc_always_match(void *dummy1, struct nlm_host *dummy2)
+{
+	return 1;
+}
+
+/*
+ * Inspect a single file
+ */
+static inline int
+nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match)
+{
+	nlmsvc_traverse_blocks(host, file, match);
+	nlmsvc_traverse_shares(host, file, match);
+	return nlm_traverse_locks(host, file, match);
+}
+
+/*
+ * Quick check whether there are still any locks, blocks or
+ * shares on a given file.
+ */
+static inline int
+nlm_file_inuse(struct nlm_file *file)
+{
+	struct inode	 *inode = nlmsvc_file_inode(file);
+	struct file_lock *fl;
+	struct file_lock_context *flctx = inode->i_flctx;
+
+	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
+		return 1;
+
+	if (flctx && !list_empty_careful(&flctx->flc_posix)) {
+		spin_lock(&flctx->flc_lock);
+		list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+			if (fl->fl_lmops == &nlmsvc_lock_operations) {
+				spin_unlock(&flctx->flc_lock);
+				return 1;
+			}
+		}
+		spin_unlock(&flctx->flc_lock);
+	}
+	file->f_locks = 0;
+	return 0;
+}
+
+/*
+ * Loop over all files in the file table.
+ */
+static int
+nlm_traverse_files(void *data, nlm_host_match_fn_t match,
+		int (*is_failover_file)(void *data, struct nlm_file *file))
+{
+	struct hlist_node *next;
+	struct nlm_file	*file;
+	int i, ret = 0;
+
+	mutex_lock(&nlm_file_mutex);
+	for (i = 0; i < FILE_NRHASH; i++) {
+		hlist_for_each_entry_safe(file, next, &nlm_files[i], f_list) {
+			if (is_failover_file && !is_failover_file(data, file))
+				continue;
+			file->f_count++;
+			mutex_unlock(&nlm_file_mutex);
+
+			/* Traverse locks, blocks and shares of this file
+			 * and update file->f_locks count */
+			if (nlm_inspect_file(data, file, match))
+				ret = 1;
+
+			mutex_lock(&nlm_file_mutex);
+			file->f_count--;
+			/* No more references to this file. Let go of it. */
+			if (list_empty(&file->f_blocks) && !file->f_locks
+			 && !file->f_shares && !file->f_count) {
+				hlist_del(&file->f_list);
+				nlmsvc_ops->fclose(file->f_file);
+				kfree(file);
+			}
+		}
+	}
+	mutex_unlock(&nlm_file_mutex);
+	return ret;
+}
+
+/*
+ * Release file. If there are no more remote locks on this file,
+ * close it and free the handle.
+ *
+ * Note that we can't do proper reference counting without major
+ * contortions because the code in fs/locks.c creates, deletes and
+ * splits locks without notification. Our only way is to walk the
+ * entire lock list each time we remove a lock.
+ */
+void
+nlm_release_file(struct nlm_file *file)
+{
+	dprintk("lockd: nlm_release_file(%p, ct = %d)\n",
+				file, file->f_count);
+
+	/* Lock file table */
+	mutex_lock(&nlm_file_mutex);
+
+	/* If there are no more locks etc, delete the file */
+	if (--file->f_count == 0 && !nlm_file_inuse(file))
+		nlm_delete_file(file);
+
+	mutex_unlock(&nlm_file_mutex);
+}
+
+/*
+ * Helpers function for resource traversal
+ *
+ * nlmsvc_mark_host:
+ *	used by the garbage collector; simply sets h_inuse only for those
+ *	hosts, which passed network check.
+ *	Always returns 0.
+ *
+ * nlmsvc_same_host:
+ *	returns 1 iff the two hosts match. Used to release
+ *	all resources bound to a specific host.
+ *
+ * nlmsvc_is_client:
+ *	returns 1 iff the host is a client.
+ *	Used by nlmsvc_invalidate_all
+ */
+
+static int
+nlmsvc_mark_host(void *data, struct nlm_host *hint)
+{
+	struct nlm_host *host = data;
+
+	if ((hint->net == NULL) ||
+	    (host->net == hint->net))
+		host->h_inuse = 1;
+	return 0;
+}
+
+static int
+nlmsvc_same_host(void *data, struct nlm_host *other)
+{
+	struct nlm_host *host = data;
+
+	return host == other;
+}
+
+static int
+nlmsvc_is_client(void *data, struct nlm_host *dummy)
+{
+	struct nlm_host *host = data;
+
+	if (host->h_server) {
+		/* we are destroying locks even though the client
+		 * hasn't asked us too, so don't unmonitor the
+		 * client
+		 */
+		if (host->h_nsmhandle)
+			host->h_nsmhandle->sm_sticky = 1;
+		return 1;
+	} else
+		return 0;
+}
+
+/*
+ * Mark all hosts that still hold resources
+ */
+void
+nlmsvc_mark_resources(struct net *net)
+{
+	struct nlm_host hint;
+
+	dprintk("lockd: nlmsvc_mark_resources for net %p\n", net);
+	hint.net = net;
+	nlm_traverse_files(&hint, nlmsvc_mark_host, NULL);
+}
+
+/*
+ * Release all resources held by the given client
+ */
+void
+nlmsvc_free_host_resources(struct nlm_host *host)
+{
+	dprintk("lockd: nlmsvc_free_host_resources\n");
+
+	if (nlm_traverse_files(host, nlmsvc_same_host, NULL)) {
+		printk(KERN_WARNING
+			"lockd: couldn't remove all locks held by %s\n",
+			host->h_name);
+		BUG();
+	}
+}
+
+/**
+ * nlmsvc_invalidate_all - remove all locks held for clients
+ *
+ * Release all locks held by NFS clients.
+ *
+ */
+void
+nlmsvc_invalidate_all(void)
+{
+	/*
+	 * Previously, the code would call
+	 * nlmsvc_free_host_resources for each client in
+	 * turn, which is about as inefficient as it gets.
+	 * Now we just do it once in nlm_traverse_files.
+	 */
+	nlm_traverse_files(NULL, nlmsvc_is_client, NULL);
+}
+
+static int
+nlmsvc_match_sb(void *datap, struct nlm_file *file)
+{
+	struct super_block *sb = datap;
+
+	return sb == file_inode(file->f_file)->i_sb;
+}
+
+/**
+ * nlmsvc_unlock_all_by_sb - release locks held on this file system
+ * @sb: super block
+ *
+ * Release all locks held by clients accessing this file system.
+ */
+int
+nlmsvc_unlock_all_by_sb(struct super_block *sb)
+{
+	int ret;
+
+	ret = nlm_traverse_files(sb, nlmsvc_always_match, nlmsvc_match_sb);
+	return ret ? -EIO : 0;
+}
+EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
+
+static int
+nlmsvc_match_ip(void *datap, struct nlm_host *host)
+{
+	return rpc_cmp_addr(nlm_srcaddr(host), datap);
+}
+
+/**
+ * nlmsvc_unlock_all_by_ip - release local locks by IP address
+ * @server_addr: server's IP address as seen by clients
+ *
+ * Release all locks held by clients accessing this host
+ * via the passed in IP address.
+ */
+int
+nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr)
+{
+	int ret;
+
+	ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL);
+	return ret ? -EIO : 0;
+}
+EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip);
diff --git a/kernel/fs/lockd/xdr.c b/kernel/fs/lockd/xdr.c
new file mode 100644
index 000000000..5b651daad
--- /dev/null
+++ b/kernel/fs/lockd/xdr.c
@@ -0,0 +1,337 @@
+/*
+ * linux/fs/lockd/xdr.c
+ *
+ * XDR support for lockd and the lock client.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/nfs.h>
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/lockd/lockd.h>
+
+#include <uapi/linux/nfs2.h>
+
+#define NLMDBG_FACILITY		NLMDBG_XDR
+
+
+static inline loff_t
+s32_to_loff_t(__s32 offset)
+{
+	return (loff_t)offset;
+}
+
+static inline __s32
+loff_t_to_s32(loff_t offset)
+{
+	__s32 res;
+	if (offset >= NLM_OFFSET_MAX)
+		res = NLM_OFFSET_MAX;
+	else if (offset <= -NLM_OFFSET_MAX)
+		res = -NLM_OFFSET_MAX;
+	else
+		res = offset;
+	return res;
+}
+
+/*
+ * XDR functions for basic NLM types
+ */
+static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
+{
+	unsigned int	len;
+
+	len = ntohl(*p++);
+	
+	if(len==0)
+	{
+		c->len=4;
+		memset(c->data, 0, 4);	/* hockeypux brain damage */
+	}
+	else if(len<=NLM_MAXCOOKIELEN)
+	{
+		c->len=len;
+		memcpy(c->data, p, len);
+		p+=XDR_QUADLEN(len);
+	}
+	else 
+	{
+		dprintk("lockd: bad cookie size %d (only cookies under "
+			"%d bytes are supported.)\n",
+				len, NLM_MAXCOOKIELEN);
+		return NULL;
+	}
+	return p;
+}
+
+static inline __be32 *
+nlm_encode_cookie(__be32 *p, struct nlm_cookie *c)
+{
+	*p++ = htonl(c->len);
+	memcpy(p, c->data, c->len);
+	p+=XDR_QUADLEN(c->len);
+	return p;
+}
+
+static __be32 *
+nlm_decode_fh(__be32 *p, struct nfs_fh *f)
+{
+	unsigned int	len;
+
+	if ((len = ntohl(*p++)) != NFS2_FHSIZE) {
+		dprintk("lockd: bad fhandle size %d (should be %d)\n",
+			len, NFS2_FHSIZE);
+		return NULL;
+	}
+	f->size = NFS2_FHSIZE;
+	memset(f->data, 0, sizeof(f->data));
+	memcpy(f->data, p, NFS2_FHSIZE);
+	return p + XDR_QUADLEN(NFS2_FHSIZE);
+}
+
+/*
+ * Encode and decode owner handle
+ */
+static inline __be32 *
+nlm_decode_oh(__be32 *p, struct xdr_netobj *oh)
+{
+	return xdr_decode_netobj(p, oh);
+}
+
+static inline __be32 *
+nlm_encode_oh(__be32 *p, struct xdr_netobj *oh)
+{
+	return xdr_encode_netobj(p, oh);
+}
+
+static __be32 *
+nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
+{
+	struct file_lock	*fl = &lock->fl;
+	s32			start, len, end;
+
+	if (!(p = xdr_decode_string_inplace(p, &lock->caller,
+					    &lock->len,
+					    NLM_MAXSTRLEN))
+	 || !(p = nlm_decode_fh(p, &lock->fh))
+	 || !(p = nlm_decode_oh(p, &lock->oh)))
+		return NULL;
+	lock->svid  = ntohl(*p++);
+
+	locks_init_lock(fl);
+	fl->fl_owner = current->files;
+	fl->fl_pid   = (pid_t)lock->svid;
+	fl->fl_flags = FL_POSIX;
+	fl->fl_type  = F_RDLCK;		/* as good as anything else */
+	start = ntohl(*p++);
+	len = ntohl(*p++);
+	end = start + len - 1;
+
+	fl->fl_start = s32_to_loff_t(start);
+
+	if (len == 0 || end < 0)
+		fl->fl_end = OFFSET_MAX;
+	else
+		fl->fl_end = s32_to_loff_t(end);
+	return p;
+}
+
+/*
+ * Encode result of a TEST/TEST_MSG call
+ */
+static __be32 *
+nlm_encode_testres(__be32 *p, struct nlm_res *resp)
+{
+	s32		start, len;
+
+	if (!(p = nlm_encode_cookie(p, &resp->cookie)))
+		return NULL;
+	*p++ = resp->status;
+
+	if (resp->status == nlm_lck_denied) {
+		struct file_lock	*fl = &resp->lock.fl;
+
+		*p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one;
+		*p++ = htonl(resp->lock.svid);
+
+		/* Encode owner handle. */
+		if (!(p = xdr_encode_netobj(p, &resp->lock.oh)))
+			return NULL;
+
+		start = loff_t_to_s32(fl->fl_start);
+		if (fl->fl_end == OFFSET_MAX)
+			len = 0;
+		else
+			len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
+
+		*p++ = htonl(start);
+		*p++ = htonl(len);
+	}
+
+	return p;
+}
+
+
+/*
+ * First, the server side XDR functions
+ */
+int
+nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	u32	exclusive;
+
+	if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+		return 0;
+
+	exclusive = ntohl(*p++);
+	if (!(p = nlm_decode_lock(p, &argp->lock)))
+		return 0;
+	if (exclusive)
+		argp->lock.fl.fl_type = F_WRLCK;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
+{
+	if (!(p = nlm_encode_testres(p, resp)))
+		return 0;
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	u32	exclusive;
+
+	if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+		return 0;
+	argp->block  = ntohl(*p++);
+	exclusive    = ntohl(*p++);
+	if (!(p = nlm_decode_lock(p, &argp->lock)))
+		return 0;
+	if (exclusive)
+		argp->lock.fl.fl_type = F_WRLCK;
+	argp->reclaim = ntohl(*p++);
+	argp->state   = ntohl(*p++);
+	argp->monitor = 1;		/* monitor client by default */
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	u32	exclusive;
+
+	if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+		return 0;
+	argp->block = ntohl(*p++);
+	exclusive = ntohl(*p++);
+	if (!(p = nlm_decode_lock(p, &argp->lock)))
+		return 0;
+	if (exclusive)
+		argp->lock.fl.fl_type = F_WRLCK;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	if (!(p = nlm_decode_cookie(p, &argp->cookie))
+	 || !(p = nlm_decode_lock(p, &argp->lock)))
+		return 0;
+	argp->lock.fl.fl_type = F_UNLCK;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	struct nlm_lock	*lock = &argp->lock;
+
+	memset(lock, 0, sizeof(*lock));
+	locks_init_lock(&lock->fl);
+	lock->svid = ~(u32) 0;
+	lock->fl.fl_pid = (pid_t)lock->svid;
+
+	if (!(p = nlm_decode_cookie(p, &argp->cookie))
+	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
+					    &lock->len, NLM_MAXSTRLEN))
+	 || !(p = nlm_decode_fh(p, &lock->fh))
+	 || !(p = nlm_decode_oh(p, &lock->oh)))
+		return 0;
+	argp->fsm_mode = ntohl(*p++);
+	argp->fsm_access = ntohl(*p++);
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
+{
+	if (!(p = nlm_encode_cookie(p, &resp->cookie)))
+		return 0;
+	*p++ = resp->status;
+	*p++ = xdr_zero;		/* sequence argument */
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
+{
+	if (!(p = nlm_encode_cookie(p, &resp->cookie)))
+		return 0;
+	*p++ = resp->status;
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
+{
+	struct nlm_lock	*lock = &argp->lock;
+
+	if (!(p = xdr_decode_string_inplace(p, &lock->caller,
+					    &lock->len, NLM_MAXSTRLEN)))
+		return 0;
+	argp->state = ntohl(*p++);
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
+{
+	if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
+		return 0;
+	argp->state = ntohl(*p++);
+	memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+	p += XDR_QUADLEN(SM_PRIV_SIZE);
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
+{
+	if (!(p = nlm_decode_cookie(p, &resp->cookie)))
+		return 0;
+	resp->status = *p++;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_ressize_check(rqstp, p);
+}
diff --git a/kernel/fs/lockd/xdr4.c b/kernel/fs/lockd/xdr4.c
new file mode 100644
index 000000000..dfa4789cd
--- /dev/null
+++ b/kernel/fs/lockd/xdr4.c
@@ -0,0 +1,334 @@
+/*
+ * linux/fs/lockd/xdr4.c
+ *
+ * XDR support for lockd and the lock client.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ * Copyright (C) 1999, Trond Myklebust <trond.myklebust@fys.uio.no>
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/nfs.h>
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/lockd/lockd.h>
+
+#define NLMDBG_FACILITY		NLMDBG_XDR
+
+static inline loff_t
+s64_to_loff_t(__s64 offset)
+{
+	return (loff_t)offset;
+}
+
+
+static inline s64
+loff_t_to_s64(loff_t offset)
+{
+	s64 res;
+	if (offset > NLM4_OFFSET_MAX)
+		res = NLM4_OFFSET_MAX;
+	else if (offset < -NLM4_OFFSET_MAX)
+		res = -NLM4_OFFSET_MAX;
+	else
+		res = offset;
+	return res;
+}
+
+/*
+ * XDR functions for basic NLM types
+ */
+static __be32 *
+nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
+{
+	unsigned int	len;
+
+	len = ntohl(*p++);
+	
+	if(len==0)
+	{
+		c->len=4;
+		memset(c->data, 0, 4);	/* hockeypux brain damage */
+	}
+	else if(len<=NLM_MAXCOOKIELEN)
+	{
+		c->len=len;
+		memcpy(c->data, p, len);
+		p+=XDR_QUADLEN(len);
+	}
+	else 
+	{
+		dprintk("lockd: bad cookie size %d (only cookies under "
+			"%d bytes are supported.)\n",
+				len, NLM_MAXCOOKIELEN);
+		return NULL;
+	}
+	return p;
+}
+
+static __be32 *
+nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c)
+{
+	*p++ = htonl(c->len);
+	memcpy(p, c->data, c->len);
+	p+=XDR_QUADLEN(c->len);
+	return p;
+}
+
+static __be32 *
+nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
+{
+	memset(f->data, 0, sizeof(f->data));
+	f->size = ntohl(*p++);
+	if (f->size > NFS_MAXFHSIZE) {
+		dprintk("lockd: bad fhandle size %d (should be <=%d)\n",
+			f->size, NFS_MAXFHSIZE);
+		return NULL;
+	}
+      	memcpy(f->data, p, f->size);
+	return p + XDR_QUADLEN(f->size);
+}
+
+/*
+ * Encode and decode owner handle
+ */
+static __be32 *
+nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
+{
+	return xdr_decode_netobj(p, oh);
+}
+
+static __be32 *
+nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
+{
+	struct file_lock	*fl = &lock->fl;
+	__u64			len, start;
+	__s64			end;
+
+	if (!(p = xdr_decode_string_inplace(p, &lock->caller,
+					    &lock->len, NLM_MAXSTRLEN))
+	 || !(p = nlm4_decode_fh(p, &lock->fh))
+	 || !(p = nlm4_decode_oh(p, &lock->oh)))
+		return NULL;
+	lock->svid  = ntohl(*p++);
+
+	locks_init_lock(fl);
+	fl->fl_owner = current->files;
+	fl->fl_pid   = (pid_t)lock->svid;
+	fl->fl_flags = FL_POSIX;
+	fl->fl_type  = F_RDLCK;		/* as good as anything else */
+	p = xdr_decode_hyper(p, &start);
+	p = xdr_decode_hyper(p, &len);
+	end = start + len - 1;
+
+	fl->fl_start = s64_to_loff_t(start);
+
+	if (len == 0 || end < 0)
+		fl->fl_end = OFFSET_MAX;
+	else
+		fl->fl_end = s64_to_loff_t(end);
+	return p;
+}
+
+/*
+ * Encode result of a TEST/TEST_MSG call
+ */
+static __be32 *
+nlm4_encode_testres(__be32 *p, struct nlm_res *resp)
+{
+	s64		start, len;
+
+	dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp);
+	if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
+		return NULL;
+	*p++ = resp->status;
+
+	if (resp->status == nlm_lck_denied) {
+		struct file_lock	*fl = &resp->lock.fl;
+
+		*p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one;
+		*p++ = htonl(resp->lock.svid);
+
+		/* Encode owner handle. */
+		if (!(p = xdr_encode_netobj(p, &resp->lock.oh)))
+			return NULL;
+
+		start = loff_t_to_s64(fl->fl_start);
+		if (fl->fl_end == OFFSET_MAX)
+			len = 0;
+		else
+			len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
+		
+		p = xdr_encode_hyper(p, start);
+		p = xdr_encode_hyper(p, len);
+		dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n",
+			resp->status, (int)resp->lock.svid, fl->fl_type,
+			(long long)fl->fl_start,  (long long)fl->fl_end);
+	}
+
+	dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp);
+	return p;
+}
+
+
+/*
+ * First, the server side XDR functions
+ */
+int
+nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	u32	exclusive;
+
+	if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+		return 0;
+
+	exclusive = ntohl(*p++);
+	if (!(p = nlm4_decode_lock(p, &argp->lock)))
+		return 0;
+	if (exclusive)
+		argp->lock.fl.fl_type = F_WRLCK;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
+{
+	if (!(p = nlm4_encode_testres(p, resp)))
+		return 0;
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	u32	exclusive;
+
+	if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+		return 0;
+	argp->block  = ntohl(*p++);
+	exclusive    = ntohl(*p++);
+	if (!(p = nlm4_decode_lock(p, &argp->lock)))
+		return 0;
+	if (exclusive)
+		argp->lock.fl.fl_type = F_WRLCK;
+	argp->reclaim = ntohl(*p++);
+	argp->state   = ntohl(*p++);
+	argp->monitor = 1;		/* monitor client by default */
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	u32	exclusive;
+
+	if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+		return 0;
+	argp->block = ntohl(*p++);
+	exclusive = ntohl(*p++);
+	if (!(p = nlm4_decode_lock(p, &argp->lock)))
+		return 0;
+	if (exclusive)
+		argp->lock.fl.fl_type = F_WRLCK;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	if (!(p = nlm4_decode_cookie(p, &argp->cookie))
+	 || !(p = nlm4_decode_lock(p, &argp->lock)))
+		return 0;
+	argp->lock.fl.fl_type = F_UNLCK;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
+{
+	struct nlm_lock	*lock = &argp->lock;
+
+	memset(lock, 0, sizeof(*lock));
+	locks_init_lock(&lock->fl);
+	lock->svid = ~(u32) 0;
+	lock->fl.fl_pid = (pid_t)lock->svid;
+
+	if (!(p = nlm4_decode_cookie(p, &argp->cookie))
+	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
+					    &lock->len, NLM_MAXSTRLEN))
+	 || !(p = nlm4_decode_fh(p, &lock->fh))
+	 || !(p = nlm4_decode_oh(p, &lock->oh)))
+		return 0;
+	argp->fsm_mode = ntohl(*p++);
+	argp->fsm_access = ntohl(*p++);
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
+{
+	if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
+		return 0;
+	*p++ = resp->status;
+	*p++ = xdr_zero;		/* sequence argument */
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
+{
+	if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
+		return 0;
+	*p++ = resp->status;
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
+{
+	struct nlm_lock	*lock = &argp->lock;
+
+	if (!(p = xdr_decode_string_inplace(p, &lock->caller,
+					    &lock->len, NLM_MAXSTRLEN)))
+		return 0;
+	argp->state = ntohl(*p++);
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
+{
+	if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
+		return 0;
+	argp->state = ntohl(*p++);
+	memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+	p += XDR_QUADLEN(SM_PRIV_SIZE);
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
+{
+	if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
+		return 0;
+	resp->status = *p++;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_ressize_check(rqstp, p);
+}
diff --git a/kernel/fs/locks.c b/kernel/fs/locks.c
new file mode 100644
index 000000000..653faabb0
--- /dev/null
+++ b/kernel/fs/locks.c
@@ -0,0 +1,2703 @@
+/*
+ *  linux/fs/locks.c
+ *
+ *  Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
+ *  Doug Evans (dje@spiff.uucp), August 07, 1992
+ *
+ *  Deadlock detection added.
+ *  FIXME: one thing isn't handled yet:
+ *	- mandatory locks (requires lots of changes elsewhere)
+ *  Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
+ *
+ *  Miscellaneous edits, and a total rewrite of posix_lock_file() code.
+ *  Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
+ *  
+ *  Converted file_lock_table to a linked list from an array, which eliminates
+ *  the limits on how many active file locks are open.
+ *  Chad Page (pageone@netcom.com), November 27, 1994
+ * 
+ *  Removed dependency on file descriptors. dup()'ed file descriptors now
+ *  get the same locks as the original file descriptors, and a close() on
+ *  any file descriptor removes ALL the locks on the file for the current
+ *  process. Since locks still depend on the process id, locks are inherited
+ *  after an exec() but not after a fork(). This agrees with POSIX, and both
+ *  BSD and SVR4 practice.
+ *  Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
+ *
+ *  Scrapped free list which is redundant now that we allocate locks
+ *  dynamically with kmalloc()/kfree().
+ *  Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
+ *
+ *  Implemented two lock personalities - FL_FLOCK and FL_POSIX.
+ *
+ *  FL_POSIX locks are created with calls to fcntl() and lockf() through the
+ *  fcntl() system call. They have the semantics described above.
+ *
+ *  FL_FLOCK locks are created with calls to flock(), through the flock()
+ *  system call, which is new. Old C libraries implement flock() via fcntl()
+ *  and will continue to use the old, broken implementation.
+ *
+ *  FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
+ *  with a file pointer (filp). As a result they can be shared by a parent
+ *  process and its children after a fork(). They are removed when the last
+ *  file descriptor referring to the file pointer is closed (unless explicitly
+ *  unlocked). 
+ *
+ *  FL_FLOCK locks never deadlock, an existing lock is always removed before
+ *  upgrading from shared to exclusive (or vice versa). When this happens
+ *  any processes blocked by the current lock are woken up and allowed to
+ *  run before the new lock is applied.
+ *  Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
+ *
+ *  Removed some race conditions in flock_lock_file(), marked other possible
+ *  races. Just grep for FIXME to see them. 
+ *  Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
+ *
+ *  Addressed Dmitry's concerns. Deadlock checking no longer recursive.
+ *  Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
+ *  once we've checked for blocking and deadlocking.
+ *  Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
+ *
+ *  Initial implementation of mandatory locks. SunOS turned out to be
+ *  a rotten model, so I implemented the "obvious" semantics.
+ *  See 'Documentation/filesystems/mandatory-locking.txt' for details.
+ *  Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
+ *
+ *  Don't allow mandatory locks on mmap()'ed files. Added simple functions to
+ *  check if a file has mandatory locks, used by mmap(), open() and creat() to
+ *  see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
+ *  Manual, Section 2.
+ *  Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
+ *
+ *  Tidied up block list handling. Added '/proc/locks' interface.
+ *  Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
+ *
+ *  Fixed deadlock condition for pathological code that mixes calls to
+ *  flock() and fcntl().
+ *  Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
+ *
+ *  Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
+ *  for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
+ *  guarantee sensible behaviour in the case where file system modules might
+ *  be compiled with different options than the kernel itself.
+ *  Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
+ *
+ *  Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
+ *  (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
+ *  Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
+ *
+ *  Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
+ *  locks. Changed process synchronisation to avoid dereferencing locks that
+ *  have already been freed.
+ *  Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
+ *
+ *  Made the block list a circular list to minimise searching in the list.
+ *  Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
+ *
+ *  Made mandatory locking a mount option. Default is not to allow mandatory
+ *  locking.
+ *  Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
+ *
+ *  Some adaptations for NFS support.
+ *  Olaf Kirch (okir@monad.swb.de), Dec 1996,
+ *
+ *  Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
+ *  Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
+ *
+ *  Use slab allocator instead of kmalloc/kfree.
+ *  Use generic list implementation from <linux/list.h>.
+ *  Sped up posix_locks_deadlock by only considering blocked locks.
+ *  Matthew Wilcox <willy@debian.org>, March, 2000.
+ *
+ *  Leases and LOCK_MAND
+ *  Matthew Wilcox <willy@debian.org>, June, 2000.
+ *  Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
+ */
+
+#include <linux/capability.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/time.h>
+#include <linux/rcupdate.h>
+#include <linux/pid_namespace.h>
+#include <linux/hashtable.h>
+#include <linux/percpu.h>
+#include <linux/lglock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/filelock.h>
+
+#include <asm/uaccess.h>
+
+#define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
+#define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
+#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
+#define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
+
+static bool lease_breaking(struct file_lock *fl)
+{
+	return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
+}
+
+static int target_leasetype(struct file_lock *fl)
+{
+	if (fl->fl_flags & FL_UNLOCK_PENDING)
+		return F_UNLCK;
+	if (fl->fl_flags & FL_DOWNGRADE_PENDING)
+		return F_RDLCK;
+	return fl->fl_type;
+}
+
+int leases_enable = 1;
+int lease_break_time = 45;
+
+/*
+ * The global file_lock_list is only used for displaying /proc/locks, so we
+ * keep a list on each CPU, with each list protected by its own spinlock via
+ * the file_lock_lglock. Note that alterations to the list also require that
+ * the relevant flc_lock is held.
+ */
+DEFINE_STATIC_LGLOCK(file_lock_lglock);
+static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+
+/*
+ * The blocked_hash is used to find POSIX lock loops for deadlock detection.
+ * It is protected by blocked_lock_lock.
+ *
+ * We hash locks by lockowner in order to optimize searching for the lock a
+ * particular lockowner is waiting on.
+ *
+ * FIXME: make this value scale via some heuristic? We generally will want more
+ * buckets when we have more lockowners holding locks, but that's a little
+ * difficult to determine without knowing what the workload will look like.
+ */
+#define BLOCKED_HASH_BITS	7
+static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
+
+/*
+ * This lock protects the blocked_hash. Generally, if you're accessing it, you
+ * want to be holding this lock.
+ *
+ * In addition, it also protects the fl->fl_block list, and the fl->fl_next
+ * pointer for file_lock structures that are acting as lock requests (in
+ * contrast to those that are acting as records of acquired locks).
+ *
+ * Note that when we acquire this lock in order to change the above fields,
+ * we often hold the flc_lock as well. In certain cases, when reading the fields
+ * protected by this lock, we can skip acquiring it iff we already hold the
+ * flc_lock.
+ *
+ * In particular, adding an entry to the fl_block list requires that you hold
+ * both the flc_lock and the blocked_lock_lock (acquired in that order).
+ * Deleting an entry from the list however only requires the file_lock_lock.
+ */
+static DEFINE_SPINLOCK(blocked_lock_lock);
+
+static struct kmem_cache *flctx_cache __read_mostly;
+static struct kmem_cache *filelock_cache __read_mostly;
+
+static struct file_lock_context *
+locks_get_lock_context(struct inode *inode, int type)
+{
+	struct file_lock_context *new;
+
+	if (likely(inode->i_flctx) || type == F_UNLCK)
+		goto out;
+
+	new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+	if (!new)
+		goto out;
+
+	spin_lock_init(&new->flc_lock);
+	INIT_LIST_HEAD(&new->flc_flock);
+	INIT_LIST_HEAD(&new->flc_posix);
+	INIT_LIST_HEAD(&new->flc_lease);
+
+	/*
+	 * Assign the pointer if it's not already assigned. If it is, then
+	 * free the context we just allocated.
+	 */
+	if (cmpxchg(&inode->i_flctx, NULL, new))
+		kmem_cache_free(flctx_cache, new);
+out:
+	return inode->i_flctx;
+}
+
+void
+locks_free_lock_context(struct file_lock_context *ctx)
+{
+	if (ctx) {
+		WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
+		WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
+		WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
+		kmem_cache_free(flctx_cache, ctx);
+	}
+}
+
+static void locks_init_lock_heads(struct file_lock *fl)
+{
+	INIT_HLIST_NODE(&fl->fl_link);
+	INIT_LIST_HEAD(&fl->fl_list);
+	INIT_LIST_HEAD(&fl->fl_block);
+	init_waitqueue_head(&fl->fl_wait);
+}
+
+/* Allocate an empty lock structure. */
+struct file_lock *locks_alloc_lock(void)
+{
+	struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
+
+	if (fl)
+		locks_init_lock_heads(fl);
+
+	return fl;
+}
+EXPORT_SYMBOL_GPL(locks_alloc_lock);
+
+void locks_release_private(struct file_lock *fl)
+{
+	if (fl->fl_ops) {
+		if (fl->fl_ops->fl_release_private)
+			fl->fl_ops->fl_release_private(fl);
+		fl->fl_ops = NULL;
+	}
+
+	if (fl->fl_lmops) {
+		if (fl->fl_lmops->lm_put_owner) {
+			fl->fl_lmops->lm_put_owner(fl->fl_owner);
+			fl->fl_owner = NULL;
+		}
+		fl->fl_lmops = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(locks_release_private);
+
+/* Free a lock which is not in use. */
+void locks_free_lock(struct file_lock *fl)
+{
+	BUG_ON(waitqueue_active(&fl->fl_wait));
+	BUG_ON(!list_empty(&fl->fl_list));
+	BUG_ON(!list_empty(&fl->fl_block));
+	BUG_ON(!hlist_unhashed(&fl->fl_link));
+
+	locks_release_private(fl);
+	kmem_cache_free(filelock_cache, fl);
+}
+EXPORT_SYMBOL(locks_free_lock);
+
+static void
+locks_dispose_list(struct list_head *dispose)
+{
+	struct file_lock *fl;
+
+	while (!list_empty(dispose)) {
+		fl = list_first_entry(dispose, struct file_lock, fl_list);
+		list_del_init(&fl->fl_list);
+		locks_free_lock(fl);
+	}
+}
+
+void locks_init_lock(struct file_lock *fl)
+{
+	memset(fl, 0, sizeof(struct file_lock));
+	locks_init_lock_heads(fl);
+}
+
+EXPORT_SYMBOL(locks_init_lock);
+
+/*
+ * Initialize a new lock from an existing file_lock structure.
+ */
+void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
+{
+	new->fl_owner = fl->fl_owner;
+	new->fl_pid = fl->fl_pid;
+	new->fl_file = NULL;
+	new->fl_flags = fl->fl_flags;
+	new->fl_type = fl->fl_type;
+	new->fl_start = fl->fl_start;
+	new->fl_end = fl->fl_end;
+	new->fl_lmops = fl->fl_lmops;
+	new->fl_ops = NULL;
+
+	if (fl->fl_lmops) {
+		if (fl->fl_lmops->lm_get_owner)
+			fl->fl_lmops->lm_get_owner(fl->fl_owner);
+	}
+}
+EXPORT_SYMBOL(locks_copy_conflock);
+
+void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
+{
+	/* "new" must be a freshly-initialized lock */
+	WARN_ON_ONCE(new->fl_ops);
+
+	locks_copy_conflock(new, fl);
+
+	new->fl_file = fl->fl_file;
+	new->fl_ops = fl->fl_ops;
+
+	if (fl->fl_ops) {
+		if (fl->fl_ops->fl_copy_lock)
+			fl->fl_ops->fl_copy_lock(new, fl);
+	}
+}
+
+EXPORT_SYMBOL(locks_copy_lock);
+
+static inline int flock_translate_cmd(int cmd) {
+	if (cmd & LOCK_MAND)
+		return cmd & (LOCK_MAND | LOCK_RW);
+	switch (cmd) {
+	case LOCK_SH:
+		return F_RDLCK;
+	case LOCK_EX:
+		return F_WRLCK;
+	case LOCK_UN:
+		return F_UNLCK;
+	}
+	return -EINVAL;
+}
+
+/* Fill in a file_lock structure with an appropriate FLOCK lock. */
+static struct file_lock *
+flock_make_lock(struct file *filp, unsigned int cmd)
+{
+	struct file_lock *fl;
+	int type = flock_translate_cmd(cmd);
+
+	if (type < 0)
+		return ERR_PTR(type);
+	
+	fl = locks_alloc_lock();
+	if (fl == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	fl->fl_file = filp;
+	fl->fl_owner = filp;
+	fl->fl_pid = current->tgid;
+	fl->fl_flags = FL_FLOCK;
+	fl->fl_type = type;
+	fl->fl_end = OFFSET_MAX;
+	
+	return fl;
+}
+
+static int assign_type(struct file_lock *fl, long type)
+{
+	switch (type) {
+	case F_RDLCK:
+	case F_WRLCK:
+	case F_UNLCK:
+		fl->fl_type = type;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
+				 struct flock64 *l)
+{
+	switch (l->l_whence) {
+	case SEEK_SET:
+		fl->fl_start = 0;
+		break;
+	case SEEK_CUR:
+		fl->fl_start = filp->f_pos;
+		break;
+	case SEEK_END:
+		fl->fl_start = i_size_read(file_inode(filp));
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (l->l_start > OFFSET_MAX - fl->fl_start)
+		return -EOVERFLOW;
+	fl->fl_start += l->l_start;
+	if (fl->fl_start < 0)
+		return -EINVAL;
+
+	/* POSIX-1996 leaves the case l->l_len < 0 undefined;
+	   POSIX-2001 defines it. */
+	if (l->l_len > 0) {
+		if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
+			return -EOVERFLOW;
+		fl->fl_end = fl->fl_start + l->l_len - 1;
+
+	} else if (l->l_len < 0) {
+		if (fl->fl_start + l->l_len < 0)
+			return -EINVAL;
+		fl->fl_end = fl->fl_start - 1;
+		fl->fl_start += l->l_len;
+	} else
+		fl->fl_end = OFFSET_MAX;
+
+	fl->fl_owner = current->files;
+	fl->fl_pid = current->tgid;
+	fl->fl_file = filp;
+	fl->fl_flags = FL_POSIX;
+	fl->fl_ops = NULL;
+	fl->fl_lmops = NULL;
+
+	return assign_type(fl, l->l_type);
+}
+
+/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
+ * style lock.
+ */
+static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
+			       struct flock *l)
+{
+	struct flock64 ll = {
+		.l_type = l->l_type,
+		.l_whence = l->l_whence,
+		.l_start = l->l_start,
+		.l_len = l->l_len,
+	};
+
+	return flock64_to_posix_lock(filp, fl, &ll);
+}
+
+/* default lease lock manager operations */
+static bool
+lease_break_callback(struct file_lock *fl)
+{
+	kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
+	return false;
+}
+
+static void
+lease_setup(struct file_lock *fl, void **priv)
+{
+	struct file *filp = fl->fl_file;
+	struct fasync_struct *fa = *priv;
+
+	/*
+	 * fasync_insert_entry() returns the old entry if any. If there was no
+	 * old entry, then it used "priv" and inserted it into the fasync list.
+	 * Clear the pointer to indicate that it shouldn't be freed.
+	 */
+	if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa))
+		*priv = NULL;
+
+	__f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+}
+
+static const struct lock_manager_operations lease_manager_ops = {
+	.lm_break = lease_break_callback,
+	.lm_change = lease_modify,
+	.lm_setup = lease_setup,
+};
+
+/*
+ * Initialize a lease, use the default lock manager operations
+ */
+static int lease_init(struct file *filp, long type, struct file_lock *fl)
+ {
+	if (assign_type(fl, type) != 0)
+		return -EINVAL;
+
+	fl->fl_owner = filp;
+	fl->fl_pid = current->tgid;
+
+	fl->fl_file = filp;
+	fl->fl_flags = FL_LEASE;
+	fl->fl_start = 0;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_ops = NULL;
+	fl->fl_lmops = &lease_manager_ops;
+	return 0;
+}
+
+/* Allocate a file_lock initialised to this type of lease */
+static struct file_lock *lease_alloc(struct file *filp, long type)
+{
+	struct file_lock *fl = locks_alloc_lock();
+	int error = -ENOMEM;
+
+	if (fl == NULL)
+		return ERR_PTR(error);
+
+	error = lease_init(filp, type, fl);
+	if (error) {
+		locks_free_lock(fl);
+		return ERR_PTR(error);
+	}
+	return fl;
+}
+
+/* Check if two locks overlap each other.
+ */
+static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
+{
+	return ((fl1->fl_end >= fl2->fl_start) &&
+		(fl2->fl_end >= fl1->fl_start));
+}
+
+/*
+ * Check whether two locks have the same owner.
+ */
+static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
+{
+	if (fl1->fl_lmops && fl1->fl_lmops->lm_compare_owner)
+		return fl2->fl_lmops == fl1->fl_lmops &&
+			fl1->fl_lmops->lm_compare_owner(fl1, fl2);
+	return fl1->fl_owner == fl2->fl_owner;
+}
+
+/* Must be called with the flc_lock held! */
+static void locks_insert_global_locks(struct file_lock *fl)
+{
+	lg_local_lock(&file_lock_lglock);
+	fl->fl_link_cpu = smp_processor_id();
+	hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
+	lg_local_unlock(&file_lock_lglock);
+}
+
+/* Must be called with the flc_lock held! */
+static void locks_delete_global_locks(struct file_lock *fl)
+{
+	/*
+	 * Avoid taking lock if already unhashed. This is safe since this check
+	 * is done while holding the flc_lock, and new insertions into the list
+	 * also require that it be held.
+	 */
+	if (hlist_unhashed(&fl->fl_link))
+		return;
+	lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+	hlist_del_init(&fl->fl_link);
+	lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+}
+
+static unsigned long
+posix_owner_key(struct file_lock *fl)
+{
+	if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
+		return fl->fl_lmops->lm_owner_key(fl);
+	return (unsigned long)fl->fl_owner;
+}
+
+static void locks_insert_global_blocked(struct file_lock *waiter)
+{
+	lockdep_assert_held(&blocked_lock_lock);
+
+	hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+}
+
+static void locks_delete_global_blocked(struct file_lock *waiter)
+{
+	lockdep_assert_held(&blocked_lock_lock);
+
+	hash_del(&waiter->fl_link);
+}
+
+/* Remove waiter from blocker's block list.
+ * When blocker ends up pointing to itself then the list is empty.
+ *
+ * Must be called with blocked_lock_lock held.
+ */
+static void __locks_delete_block(struct file_lock *waiter)
+{
+	locks_delete_global_blocked(waiter);
+	list_del_init(&waiter->fl_block);
+	waiter->fl_next = NULL;
+}
+
+static void locks_delete_block(struct file_lock *waiter)
+{
+	spin_lock(&blocked_lock_lock);
+	__locks_delete_block(waiter);
+	spin_unlock(&blocked_lock_lock);
+}
+
+/* Insert waiter into blocker's block list.
+ * We use a circular list so that processes can be easily woken up in
+ * the order they blocked. The documentation doesn't require this but
+ * it seems like the reasonable thing to do.
+ *
+ * Must be called with both the flc_lock and blocked_lock_lock held. The
+ * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
+ * that the flc_lock is also held on insertions we can avoid taking the
+ * blocked_lock_lock in some cases when we see that the fl_block list is empty.
+ */
+static void __locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	BUG_ON(!list_empty(&waiter->fl_block));
+	waiter->fl_next = blocker;
+	list_add_tail(&waiter->fl_block, &blocker->fl_block);
+	if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
+		locks_insert_global_blocked(waiter);
+}
+
+/* Must be called with flc_lock held. */
+static void locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	spin_lock(&blocked_lock_lock);
+	__locks_insert_block(blocker, waiter);
+	spin_unlock(&blocked_lock_lock);
+}
+
+/*
+ * Wake up processes blocked waiting for blocker.
+ *
+ * Must be called with the inode->flc_lock held!
+ */
+static void locks_wake_up_blocks(struct file_lock *blocker)
+{
+	/*
+	 * Avoid taking global lock if list is empty. This is safe since new
+	 * blocked requests are only added to the list under the flc_lock, and
+	 * the flc_lock is always held here. Note that removal from the fl_block
+	 * list does not require the flc_lock, so we must recheck list_empty()
+	 * after acquiring the blocked_lock_lock.
+	 */
+	if (list_empty(&blocker->fl_block))
+		return;
+
+	spin_lock(&blocked_lock_lock);
+	while (!list_empty(&blocker->fl_block)) {
+		struct file_lock *waiter;
+
+		waiter = list_first_entry(&blocker->fl_block,
+				struct file_lock, fl_block);
+		__locks_delete_block(waiter);
+		if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
+			waiter->fl_lmops->lm_notify(waiter);
+		else
+			wake_up(&waiter->fl_wait);
+	}
+	spin_unlock(&blocked_lock_lock);
+}
+
+static void
+locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
+{
+	fl->fl_nspid = get_pid(task_tgid(current));
+	list_add_tail(&fl->fl_list, before);
+	locks_insert_global_locks(fl);
+}
+
+static void
+locks_unlink_lock_ctx(struct file_lock *fl)
+{
+	locks_delete_global_locks(fl);
+	list_del_init(&fl->fl_list);
+	if (fl->fl_nspid) {
+		put_pid(fl->fl_nspid);
+		fl->fl_nspid = NULL;
+	}
+	locks_wake_up_blocks(fl);
+}
+
+static void
+locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
+{
+	locks_unlink_lock_ctx(fl);
+	if (dispose)
+		list_add(&fl->fl_list, dispose);
+	else
+		locks_free_lock(fl);
+}
+
+/* Determine if lock sys_fl blocks lock caller_fl. Common functionality
+ * checks for shared/exclusive status of overlapping locks.
+ */
+static int locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+{
+	if (sys_fl->fl_type == F_WRLCK)
+		return 1;
+	if (caller_fl->fl_type == F_WRLCK)
+		return 1;
+	return 0;
+}
+
+/* Determine if lock sys_fl blocks lock caller_fl. POSIX specific
+ * checking before calling the locks_conflict().
+ */
+static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+{
+	/* POSIX locks owned by the same process do not conflict with
+	 * each other.
+	 */
+	if (posix_same_owner(caller_fl, sys_fl))
+		return (0);
+
+	/* Check whether they overlap */
+	if (!locks_overlap(caller_fl, sys_fl))
+		return 0;
+
+	return (locks_conflict(caller_fl, sys_fl));
+}
+
+/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
+ * checking before calling the locks_conflict().
+ */
+static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+{
+	/* FLOCK locks referring to the same filp do not conflict with
+	 * each other.
+	 */
+	if (caller_fl->fl_file == sys_fl->fl_file)
+		return (0);
+	if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
+		return 0;
+
+	return (locks_conflict(caller_fl, sys_fl));
+}
+
+void
+posix_test_lock(struct file *filp, struct file_lock *fl)
+{
+	struct file_lock *cfl;
+	struct file_lock_context *ctx;
+	struct inode *inode = file_inode(filp);
+
+	ctx = inode->i_flctx;
+	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
+		fl->fl_type = F_UNLCK;
+		return;
+	}
+
+	spin_lock(&ctx->flc_lock);
+	list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
+		if (posix_locks_conflict(fl, cfl)) {
+			locks_copy_conflock(fl, cfl);
+			if (cfl->fl_nspid)
+				fl->fl_pid = pid_vnr(cfl->fl_nspid);
+			goto out;
+		}
+	}
+	fl->fl_type = F_UNLCK;
+out:
+	spin_unlock(&ctx->flc_lock);
+	return;
+}
+EXPORT_SYMBOL(posix_test_lock);
+
+/*
+ * Deadlock detection:
+ *
+ * We attempt to detect deadlocks that are due purely to posix file
+ * locks.
+ *
+ * We assume that a task can be waiting for at most one lock at a time.
+ * So for any acquired lock, the process holding that lock may be
+ * waiting on at most one other lock.  That lock in turns may be held by
+ * someone waiting for at most one other lock.  Given a requested lock
+ * caller_fl which is about to wait for a conflicting lock block_fl, we
+ * follow this chain of waiters to ensure we are not about to create a
+ * cycle.
+ *
+ * Since we do this before we ever put a process to sleep on a lock, we
+ * are ensured that there is never a cycle; that is what guarantees that
+ * the while() loop in posix_locks_deadlock() eventually completes.
+ *
+ * Note: the above assumption may not be true when handling lock
+ * requests from a broken NFS client. It may also fail in the presence
+ * of tasks (such as posix threads) sharing the same open file table.
+ * To handle those cases, we just bail out after a few iterations.
+ *
+ * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
+ * Because the owner is not even nominally tied to a thread of
+ * execution, the deadlock detection below can't reasonably work well. Just
+ * skip it for those.
+ *
+ * In principle, we could do a more limited deadlock detection on FL_OFDLCK
+ * locks that just checks for the case where two tasks are attempting to
+ * upgrade from read to write locks on the same inode.
+ */
+
+#define MAX_DEADLK_ITERATIONS 10
+
+/* Find a lock that the owner of the given block_fl is blocking on. */
+static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
+{
+	struct file_lock *fl;
+
+	hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
+		if (posix_same_owner(fl, block_fl))
+			return fl->fl_next;
+	}
+	return NULL;
+}
+
+/* Must be called with the blocked_lock_lock held! */
+static int posix_locks_deadlock(struct file_lock *caller_fl,
+				struct file_lock *block_fl)
+{
+	int i = 0;
+
+	lockdep_assert_held(&blocked_lock_lock);
+
+	/*
+	 * This deadlock detector can't reasonably detect deadlocks with
+	 * FL_OFDLCK locks, since they aren't owned by a process, per-se.
+	 */
+	if (IS_OFDLCK(caller_fl))
+		return 0;
+
+	while ((block_fl = what_owner_is_waiting_for(block_fl))) {
+		if (i++ > MAX_DEADLK_ITERATIONS)
+			return 0;
+		if (posix_same_owner(caller_fl, block_fl))
+			return 1;
+	}
+	return 0;
+}
+
+/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
+ * after any leases, but before any posix locks.
+ *
+ * Note that if called with an FL_EXISTS argument, the caller may determine
+ * whether or not a lock was successfully freed by testing the return
+ * value for -ENOENT.
+ */
+static int flock_lock_file(struct file *filp, struct file_lock *request)
+{
+	struct file_lock *new_fl = NULL;
+	struct file_lock *fl;
+	struct file_lock_context *ctx;
+	struct inode *inode = file_inode(filp);
+	int error = 0;
+	bool found = false;
+	LIST_HEAD(dispose);
+
+	ctx = locks_get_lock_context(inode, request->fl_type);
+	if (!ctx) {
+		if (request->fl_type != F_UNLCK)
+			return -ENOMEM;
+		return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0;
+	}
+
+	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
+		new_fl = locks_alloc_lock();
+		if (!new_fl)
+			return -ENOMEM;
+	}
+
+	spin_lock(&ctx->flc_lock);
+	if (request->fl_flags & FL_ACCESS)
+		goto find_conflict;
+
+	list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
+		if (filp != fl->fl_file)
+			continue;
+		if (request->fl_type == fl->fl_type)
+			goto out;
+		found = true;
+		locks_delete_lock_ctx(fl, &dispose);
+		break;
+	}
+
+	if (request->fl_type == F_UNLCK) {
+		if ((request->fl_flags & FL_EXISTS) && !found)
+			error = -ENOENT;
+		goto out;
+	}
+
+find_conflict:
+	list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
+		if (!flock_locks_conflict(request, fl))
+			continue;
+		error = -EAGAIN;
+		if (!(request->fl_flags & FL_SLEEP))
+			goto out;
+		error = FILE_LOCK_DEFERRED;
+		locks_insert_block(fl, request);
+		goto out;
+	}
+	if (request->fl_flags & FL_ACCESS)
+		goto out;
+	locks_copy_lock(new_fl, request);
+	locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
+	new_fl = NULL;
+	error = 0;
+
+out:
+	spin_unlock(&ctx->flc_lock);
+	if (new_fl)
+		locks_free_lock(new_fl);
+	locks_dispose_list(&dispose);
+	return error;
+}
+
+static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
+{
+	struct file_lock *fl, *tmp;
+	struct file_lock *new_fl = NULL;
+	struct file_lock *new_fl2 = NULL;
+	struct file_lock *left = NULL;
+	struct file_lock *right = NULL;
+	struct file_lock_context *ctx;
+	int error;
+	bool added = false;
+	LIST_HEAD(dispose);
+
+	ctx = locks_get_lock_context(inode, request->fl_type);
+	if (!ctx)
+		return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM;
+
+	/*
+	 * We may need two file_lock structures for this operation,
+	 * so we get them in advance to avoid races.
+	 *
+	 * In some cases we can be sure, that no new locks will be needed
+	 */
+	if (!(request->fl_flags & FL_ACCESS) &&
+	    (request->fl_type != F_UNLCK ||
+	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
+		new_fl = locks_alloc_lock();
+		new_fl2 = locks_alloc_lock();
+	}
+
+	spin_lock(&ctx->flc_lock);
+	/*
+	 * New lock request. Walk all POSIX locks and look for conflicts. If
+	 * there are any, either return error or put the request on the
+	 * blocker's list of waiters and the global blocked_hash.
+	 */
+	if (request->fl_type != F_UNLCK) {
+		list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
+			if (!posix_locks_conflict(request, fl))
+				continue;
+			if (conflock)
+				locks_copy_conflock(conflock, fl);
+			error = -EAGAIN;
+			if (!(request->fl_flags & FL_SLEEP))
+				goto out;
+			/*
+			 * Deadlock detection and insertion into the blocked
+			 * locks list must be done while holding the same lock!
+			 */
+			error = -EDEADLK;
+			spin_lock(&blocked_lock_lock);
+			if (likely(!posix_locks_deadlock(request, fl))) {
+				error = FILE_LOCK_DEFERRED;
+				__locks_insert_block(fl, request);
+			}
+			spin_unlock(&blocked_lock_lock);
+			goto out;
+  		}
+  	}
+
+	/* If we're just looking for a conflict, we're done. */
+	error = 0;
+	if (request->fl_flags & FL_ACCESS)
+		goto out;
+
+	/* Find the first old lock with the same owner as the new lock */
+	list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
+		if (posix_same_owner(request, fl))
+			break;
+	}
+
+	/* Process locks with this owner. */
+	list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
+		if (!posix_same_owner(request, fl))
+			break;
+
+		/* Detect adjacent or overlapping regions (if same lock type) */
+		if (request->fl_type == fl->fl_type) {
+			/* In all comparisons of start vs end, use
+			 * "start - 1" rather than "end + 1". If end
+			 * is OFFSET_MAX, end + 1 will become negative.
+			 */
+			if (fl->fl_end < request->fl_start - 1)
+				continue;
+			/* If the next lock in the list has entirely bigger
+			 * addresses than the new one, insert the lock here.
+			 */
+			if (fl->fl_start - 1 > request->fl_end)
+				break;
+
+			/* If we come here, the new and old lock are of the
+			 * same type and adjacent or overlapping. Make one
+			 * lock yielding from the lower start address of both
+			 * locks to the higher end address.
+			 */
+			if (fl->fl_start > request->fl_start)
+				fl->fl_start = request->fl_start;
+			else
+				request->fl_start = fl->fl_start;
+			if (fl->fl_end < request->fl_end)
+				fl->fl_end = request->fl_end;
+			else
+				request->fl_end = fl->fl_end;
+			if (added) {
+				locks_delete_lock_ctx(fl, &dispose);
+				continue;
+			}
+			request = fl;
+			added = true;
+		} else {
+			/* Processing for different lock types is a bit
+			 * more complex.
+			 */
+			if (fl->fl_end < request->fl_start)
+				continue;
+			if (fl->fl_start > request->fl_end)
+				break;
+			if (request->fl_type == F_UNLCK)
+				added = true;
+			if (fl->fl_start < request->fl_start)
+				left = fl;
+			/* If the next lock in the list has a higher end
+			 * address than the new one, insert the new one here.
+			 */
+			if (fl->fl_end > request->fl_end) {
+				right = fl;
+				break;
+			}
+			if (fl->fl_start >= request->fl_start) {
+				/* The new lock completely replaces an old
+				 * one (This may happen several times).
+				 */
+				if (added) {
+					locks_delete_lock_ctx(fl, &dispose);
+					continue;
+				}
+				/*
+				 * Replace the old lock with new_fl, and
+				 * remove the old one. It's safe to do the
+				 * insert here since we know that we won't be
+				 * using new_fl later, and that the lock is
+				 * just replacing an existing lock.
+				 */
+				error = -ENOLCK;
+				if (!new_fl)
+					goto out;
+				locks_copy_lock(new_fl, request);
+				request = new_fl;
+				new_fl = NULL;
+				locks_insert_lock_ctx(request, &fl->fl_list);
+				locks_delete_lock_ctx(fl, &dispose);
+				added = true;
+			}
+		}
+	}
+
+	/*
+	 * The above code only modifies existing locks in case of merging or
+	 * replacing. If new lock(s) need to be inserted all modifications are
+	 * done below this, so it's safe yet to bail out.
+	 */
+	error = -ENOLCK; /* "no luck" */
+	if (right && left == right && !new_fl2)
+		goto out;
+
+	error = 0;
+	if (!added) {
+		if (request->fl_type == F_UNLCK) {
+			if (request->fl_flags & FL_EXISTS)
+				error = -ENOENT;
+			goto out;
+		}
+
+		if (!new_fl) {
+			error = -ENOLCK;
+			goto out;
+		}
+		locks_copy_lock(new_fl, request);
+		locks_insert_lock_ctx(new_fl, &fl->fl_list);
+		fl = new_fl;
+		new_fl = NULL;
+	}
+	if (right) {
+		if (left == right) {
+			/* The new lock breaks the old one in two pieces,
+			 * so we have to use the second new lock.
+			 */
+			left = new_fl2;
+			new_fl2 = NULL;
+			locks_copy_lock(left, right);
+			locks_insert_lock_ctx(left, &fl->fl_list);
+		}
+		right->fl_start = request->fl_end + 1;
+		locks_wake_up_blocks(right);
+	}
+	if (left) {
+		left->fl_end = request->fl_start - 1;
+		locks_wake_up_blocks(left);
+	}
+ out:
+	spin_unlock(&ctx->flc_lock);
+	/*
+	 * Free any unused locks.
+	 */
+	if (new_fl)
+		locks_free_lock(new_fl);
+	if (new_fl2)
+		locks_free_lock(new_fl2);
+	locks_dispose_list(&dispose);
+	return error;
+}
+
+/**
+ * posix_lock_file - Apply a POSIX-style lock to a file
+ * @filp: The file to apply the lock to
+ * @fl: The lock to be applied
+ * @conflock: Place to return a copy of the conflicting lock, if found.
+ *
+ * Add a POSIX style lock to a file.
+ * We merge adjacent & overlapping locks whenever possible.
+ * POSIX locks are sorted by owner task, then by starting address
+ *
+ * Note that if called with an FL_EXISTS argument, the caller may determine
+ * whether or not a lock was successfully freed by testing the return
+ * value for -ENOENT.
+ */
+int posix_lock_file(struct file *filp, struct file_lock *fl,
+			struct file_lock *conflock)
+{
+	return __posix_lock_file(file_inode(filp), fl, conflock);
+}
+EXPORT_SYMBOL(posix_lock_file);
+
+/**
+ * posix_lock_file_wait - Apply a POSIX-style lock to a file
+ * @filp: The file to apply the lock to
+ * @fl: The lock to be applied
+ *
+ * Add a POSIX style lock to a file.
+ * We merge adjacent & overlapping locks whenever possible.
+ * POSIX locks are sorted by owner task, then by starting address
+ */
+int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	int error;
+	might_sleep ();
+	for (;;) {
+		error = posix_lock_file(filp, fl, NULL);
+		if (error != FILE_LOCK_DEFERRED)
+			break;
+		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
+		if (!error)
+			continue;
+
+		locks_delete_block(fl);
+		break;
+	}
+	return error;
+}
+EXPORT_SYMBOL(posix_lock_file_wait);
+
+/**
+ * locks_mandatory_locked - Check for an active lock
+ * @file: the file to check
+ *
+ * Searches the inode's list of locks to find any POSIX locks which conflict.
+ * This function is called from locks_verify_locked() only.
+ */
+int locks_mandatory_locked(struct file *file)
+{
+	int ret;
+	struct inode *inode = file_inode(file);
+	struct file_lock_context *ctx;
+	struct file_lock *fl;
+
+	ctx = inode->i_flctx;
+	if (!ctx || list_empty_careful(&ctx->flc_posix))
+		return 0;
+
+	/*
+	 * Search the lock list for this inode for any POSIX locks.
+	 */
+	spin_lock(&ctx->flc_lock);
+	ret = 0;
+	list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
+		if (fl->fl_owner != current->files &&
+		    fl->fl_owner != file) {
+			ret = -EAGAIN;
+			break;
+		}
+	}
+	spin_unlock(&ctx->flc_lock);
+	return ret;
+}
+
+/**
+ * locks_mandatory_area - Check for a conflicting lock
+ * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ
+ *		for shared
+ * @inode:      the file to check
+ * @filp:       how the file was opened (if it was)
+ * @offset:     start of area to check
+ * @count:      length of area to check
+ *
+ * Searches the inode's list of locks to find any POSIX locks which conflict.
+ * This function is called from rw_verify_area() and
+ * locks_verify_truncate().
+ */
+int locks_mandatory_area(int read_write, struct inode *inode,
+			 struct file *filp, loff_t offset,
+			 size_t count)
+{
+	struct file_lock fl;
+	int error;
+	bool sleep = false;
+
+	locks_init_lock(&fl);
+	fl.fl_pid = current->tgid;
+	fl.fl_file = filp;
+	fl.fl_flags = FL_POSIX | FL_ACCESS;
+	if (filp && !(filp->f_flags & O_NONBLOCK))
+		sleep = true;
+	fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
+	fl.fl_start = offset;
+	fl.fl_end = offset + count - 1;
+
+	for (;;) {
+		if (filp) {
+			fl.fl_owner = filp;
+			fl.fl_flags &= ~FL_SLEEP;
+			error = __posix_lock_file(inode, &fl, NULL);
+			if (!error)
+				break;
+		}
+
+		if (sleep)
+			fl.fl_flags |= FL_SLEEP;
+		fl.fl_owner = current->files;
+		error = __posix_lock_file(inode, &fl, NULL);
+		if (error != FILE_LOCK_DEFERRED)
+			break;
+		error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
+		if (!error) {
+			/*
+			 * If we've been sleeping someone might have
+			 * changed the permissions behind our back.
+			 */
+			if (__mandatory_lock(inode))
+				continue;
+		}
+
+		locks_delete_block(&fl);
+		break;
+	}
+
+	return error;
+}
+
+EXPORT_SYMBOL(locks_mandatory_area);
+
+static void lease_clear_pending(struct file_lock *fl, int arg)
+{
+	switch (arg) {
+	case F_UNLCK:
+		fl->fl_flags &= ~FL_UNLOCK_PENDING;
+		/* fall through: */
+	case F_RDLCK:
+		fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
+	}
+}
+
+/* We already had a lease on this file; just change its type */
+int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
+{
+	int error = assign_type(fl, arg);
+
+	if (error)
+		return error;
+	lease_clear_pending(fl, arg);
+	locks_wake_up_blocks(fl);
+	if (arg == F_UNLCK) {
+		struct file *filp = fl->fl_file;
+
+		f_delown(filp);
+		filp->f_owner.signum = 0;
+		fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
+		if (fl->fl_fasync != NULL) {
+			printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
+			fl->fl_fasync = NULL;
+		}
+		locks_delete_lock_ctx(fl, dispose);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(lease_modify);
+
+static bool past_time(unsigned long then)
+{
+	if (!then)
+		/* 0 is a special value meaning "this never expires": */
+		return false;
+	return time_after(jiffies, then);
+}
+
+static void time_out_leases(struct inode *inode, struct list_head *dispose)
+{
+	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock *fl, *tmp;
+
+	lockdep_assert_held(&ctx->flc_lock);
+
+	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
+		trace_time_out_leases(inode, fl);
+		if (past_time(fl->fl_downgrade_time))
+			lease_modify(fl, F_RDLCK, dispose);
+		if (past_time(fl->fl_break_time))
+			lease_modify(fl, F_UNLCK, dispose);
+	}
+}
+
+static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
+{
+	if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
+		return false;
+	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
+		return false;
+	return locks_conflict(breaker, lease);
+}
+
+static bool
+any_leases_conflict(struct inode *inode, struct file_lock *breaker)
+{
+	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock *fl;
+
+	lockdep_assert_held(&ctx->flc_lock);
+
+	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+		if (leases_conflict(fl, breaker))
+			return true;
+	}
+	return false;
+}
+
+/**
+ *	__break_lease	-	revoke all outstanding leases on file
+ *	@inode: the inode of the file to return
+ *	@mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
+ *	    break all leases
+ *	@type: FL_LEASE: break leases and delegations; FL_DELEG: break
+ *	    only delegations
+ *
+ *	break_lease (inlined for speed) has checked there already is at least
+ *	some kind of lock (maybe a lease) on this file.  Leases are broken on
+ *	a call to open() or truncate().  This function can sleep unless you
+ *	specified %O_NONBLOCK to your open().
+ */
+int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
+{
+	int error = 0;
+	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock *new_fl, *fl, *tmp;
+	unsigned long break_time;
+	int want_write = (mode & O_ACCMODE) != O_RDONLY;
+	LIST_HEAD(dispose);
+
+	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
+	if (IS_ERR(new_fl))
+		return PTR_ERR(new_fl);
+	new_fl->fl_flags = type;
+
+	/* typically we will check that ctx is non-NULL before calling */
+	if (!ctx) {
+		WARN_ON_ONCE(1);
+		return error;
+	}
+
+	spin_lock(&ctx->flc_lock);
+
+	time_out_leases(inode, &dispose);
+
+	if (!any_leases_conflict(inode, new_fl))
+		goto out;
+
+	break_time = 0;
+	if (lease_break_time > 0) {
+		break_time = jiffies + lease_break_time * HZ;
+		if (break_time == 0)
+			break_time++;	/* so that 0 means no break time */
+	}
+
+	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
+		if (!leases_conflict(fl, new_fl))
+			continue;
+		if (want_write) {
+			if (fl->fl_flags & FL_UNLOCK_PENDING)
+				continue;
+			fl->fl_flags |= FL_UNLOCK_PENDING;
+			fl->fl_break_time = break_time;
+		} else {
+			if (lease_breaking(fl))
+				continue;
+			fl->fl_flags |= FL_DOWNGRADE_PENDING;
+			fl->fl_downgrade_time = break_time;
+		}
+		if (fl->fl_lmops->lm_break(fl))
+			locks_delete_lock_ctx(fl, &dispose);
+	}
+
+	if (list_empty(&ctx->flc_lease))
+		goto out;
+
+	if (mode & O_NONBLOCK) {
+		trace_break_lease_noblock(inode, new_fl);
+		error = -EWOULDBLOCK;
+		goto out;
+	}
+
+restart:
+	fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
+	break_time = fl->fl_break_time;
+	if (break_time != 0)
+		break_time -= jiffies;
+	if (break_time == 0)
+		break_time++;
+	locks_insert_block(fl, new_fl);
+	trace_break_lease_block(inode, new_fl);
+	spin_unlock(&ctx->flc_lock);
+	locks_dispose_list(&dispose);
+	error = wait_event_interruptible_timeout(new_fl->fl_wait,
+						!new_fl->fl_next, break_time);
+	spin_lock(&ctx->flc_lock);
+	trace_break_lease_unblock(inode, new_fl);
+	locks_delete_block(new_fl);
+	if (error >= 0) {
+		/*
+		 * Wait for the next conflicting lease that has not been
+		 * broken yet
+		 */
+		if (error == 0)
+			time_out_leases(inode, &dispose);
+		if (any_leases_conflict(inode, new_fl))
+			goto restart;
+		error = 0;
+	}
+out:
+	spin_unlock(&ctx->flc_lock);
+	locks_dispose_list(&dispose);
+	locks_free_lock(new_fl);
+	return error;
+}
+
+EXPORT_SYMBOL(__break_lease);
+
+/**
+ *	lease_get_mtime - get the last modified time of an inode
+ *	@inode: the inode
+ *      @time:  pointer to a timespec which will contain the last modified time
+ *
+ * This is to force NFS clients to flush their caches for files with
+ * exclusive leases.  The justification is that if someone has an
+ * exclusive lease, then they could be modifying it.
+ */
+void lease_get_mtime(struct inode *inode, struct timespec *time)
+{
+	bool has_lease = false;
+	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock *fl;
+
+	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
+		spin_lock(&ctx->flc_lock);
+		if (!list_empty(&ctx->flc_lease)) {
+			fl = list_first_entry(&ctx->flc_lease,
+						struct file_lock, fl_list);
+			if (fl->fl_type == F_WRLCK)
+				has_lease = true;
+		}
+		spin_unlock(&ctx->flc_lock);
+	}
+
+	if (has_lease)
+		*time = current_fs_time(inode->i_sb);
+	else
+		*time = inode->i_mtime;
+}
+
+EXPORT_SYMBOL(lease_get_mtime);
+
+/**
+ *	fcntl_getlease - Enquire what lease is currently active
+ *	@filp: the file
+ *
+ *	The value returned by this function will be one of
+ *	(if no lease break is pending):
+ *
+ *	%F_RDLCK to indicate a shared lease is held.
+ *
+ *	%F_WRLCK to indicate an exclusive lease is held.
+ *
+ *	%F_UNLCK to indicate no lease is held.
+ *
+ *	(if a lease break is pending):
+ *
+ *	%F_RDLCK to indicate an exclusive lease needs to be
+ *		changed to a shared lease (or removed).
+ *
+ *	%F_UNLCK to indicate the lease needs to be removed.
+ *
+ *	XXX: sfr & willy disagree over whether F_INPROGRESS
+ *	should be returned to userspace.
+ */
+int fcntl_getlease(struct file *filp)
+{
+	struct file_lock *fl;
+	struct inode *inode = file_inode(filp);
+	struct file_lock_context *ctx = inode->i_flctx;
+	int type = F_UNLCK;
+	LIST_HEAD(dispose);
+
+	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
+		spin_lock(&ctx->flc_lock);
+		time_out_leases(file_inode(filp), &dispose);
+		list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+			if (fl->fl_file != filp)
+				continue;
+			type = target_leasetype(fl);
+			break;
+		}
+		spin_unlock(&ctx->flc_lock);
+		locks_dispose_list(&dispose);
+	}
+	return type;
+}
+
+/**
+ * check_conflicting_open - see if the given dentry points to a file that has
+ * 			    an existing open that would conflict with the
+ * 			    desired lease.
+ * @dentry:	dentry to check
+ * @arg:	type of lease that we're trying to acquire
+ *
+ * Check to see if there's an existing open fd on this file that would
+ * conflict with the lease we're trying to set.
+ */
+static int
+check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
+{
+	int ret = 0;
+	struct inode *inode = dentry->d_inode;
+
+	if (flags & FL_LAYOUT)
+		return 0;
+
+	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
+		return -EAGAIN;
+
+	if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
+	    (atomic_read(&inode->i_count) > 1)))
+		ret = -EAGAIN;
+
+	return ret;
+}
+
+static int
+generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
+{
+	struct file_lock *fl, *my_fl = NULL, *lease;
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct file_lock_context *ctx;
+	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
+	int error;
+	LIST_HEAD(dispose);
+
+	lease = *flp;
+	trace_generic_add_lease(inode, lease);
+
+	/* Note that arg is never F_UNLCK here */
+	ctx = locks_get_lock_context(inode, arg);
+	if (!ctx)
+		return -ENOMEM;
+
+	/*
+	 * In the delegation case we need mutual exclusion with
+	 * a number of operations that take the i_mutex.  We trylock
+	 * because delegations are an optional optimization, and if
+	 * there's some chance of a conflict--we'd rather not
+	 * bother, maybe that's a sign this just isn't a good file to
+	 * hand out a delegation on.
+	 */
+	if (is_deleg && !mutex_trylock(&inode->i_mutex))
+		return -EAGAIN;
+
+	if (is_deleg && arg == F_WRLCK) {
+		/* Write delegations are not currently supported: */
+		mutex_unlock(&inode->i_mutex);
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+
+	spin_lock(&ctx->flc_lock);
+	time_out_leases(inode, &dispose);
+	error = check_conflicting_open(dentry, arg, lease->fl_flags);
+	if (error)
+		goto out;
+
+	/*
+	 * At this point, we know that if there is an exclusive
+	 * lease on this file, then we hold it on this filp
+	 * (otherwise our open of this file would have blocked).
+	 * And if we are trying to acquire an exclusive lease,
+	 * then the file is not open by anyone (including us)
+	 * except for this filp.
+	 */
+	error = -EAGAIN;
+	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+		if (fl->fl_file == filp &&
+		    fl->fl_owner == lease->fl_owner) {
+			my_fl = fl;
+			continue;
+		}
+
+		/*
+		 * No exclusive leases if someone else has a lease on
+		 * this file:
+		 */
+		if (arg == F_WRLCK)
+			goto out;
+		/*
+		 * Modifying our existing lease is OK, but no getting a
+		 * new lease if someone else is opening for write:
+		 */
+		if (fl->fl_flags & FL_UNLOCK_PENDING)
+			goto out;
+	}
+
+	if (my_fl != NULL) {
+		lease = my_fl;
+		error = lease->fl_lmops->lm_change(lease, arg, &dispose);
+		if (error)
+			goto out;
+		goto out_setup;
+	}
+
+	error = -EINVAL;
+	if (!leases_enable)
+		goto out;
+
+	locks_insert_lock_ctx(lease, &ctx->flc_lease);
+	/*
+	 * The check in break_lease() is lockless. It's possible for another
+	 * open to race in after we did the earlier check for a conflicting
+	 * open but before the lease was inserted. Check again for a
+	 * conflicting open and cancel the lease if there is one.
+	 *
+	 * We also add a barrier here to ensure that the insertion of the lock
+	 * precedes these checks.
+	 */
+	smp_mb();
+	error = check_conflicting_open(dentry, arg, lease->fl_flags);
+	if (error) {
+		locks_unlink_lock_ctx(lease);
+		goto out;
+	}
+
+out_setup:
+	if (lease->fl_lmops->lm_setup)
+		lease->fl_lmops->lm_setup(lease, priv);
+out:
+	spin_unlock(&ctx->flc_lock);
+	locks_dispose_list(&dispose);
+	if (is_deleg)
+		mutex_unlock(&inode->i_mutex);
+	if (!error && !my_fl)
+		*flp = NULL;
+	return error;
+}
+
+static int generic_delete_lease(struct file *filp, void *owner)
+{
+	int error = -EAGAIN;
+	struct file_lock *fl, *victim = NULL;
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct file_lock_context *ctx = inode->i_flctx;
+	LIST_HEAD(dispose);
+
+	if (!ctx) {
+		trace_generic_delete_lease(inode, NULL);
+		return error;
+	}
+
+	spin_lock(&ctx->flc_lock);
+	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+		if (fl->fl_file == filp &&
+		    fl->fl_owner == owner) {
+			victim = fl;
+			break;
+		}
+	}
+	trace_generic_delete_lease(inode, victim);
+	if (victim)
+		error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
+	spin_unlock(&ctx->flc_lock);
+	locks_dispose_list(&dispose);
+	return error;
+}
+
+/**
+ *	generic_setlease	-	sets a lease on an open file
+ *	@filp:	file pointer
+ *	@arg:	type of lease to obtain
+ *	@flp:	input - file_lock to use, output - file_lock inserted
+ *	@priv:	private data for lm_setup (may be NULL if lm_setup
+ *		doesn't require it)
+ *
+ *	The (input) flp->fl_lmops->lm_break function is required
+ *	by break_lease().
+ */
+int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
+			void **priv)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
+		return -EACCES;
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+	error = security_file_lock(filp, arg);
+	if (error)
+		return error;
+
+	switch (arg) {
+	case F_UNLCK:
+		return generic_delete_lease(filp, *priv);
+	case F_RDLCK:
+	case F_WRLCK:
+		if (!(*flp)->fl_lmops->lm_break) {
+			WARN_ON_ONCE(1);
+			return -ENOLCK;
+		}
+
+		return generic_add_lease(filp, arg, flp, priv);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(generic_setlease);
+
+/**
+ * vfs_setlease        -       sets a lease on an open file
+ * @filp:	file pointer
+ * @arg:	type of lease to obtain
+ * @lease:	file_lock to use when adding a lease
+ * @priv:	private info for lm_setup when adding a lease (may be
+ * 		NULL if lm_setup doesn't require it)
+ *
+ * Call this to establish a lease on the file. The "lease" argument is not
+ * used for F_UNLCK requests and may be NULL. For commands that set or alter
+ * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set;
+ * if not, this function will return -ENOLCK (and generate a scary-looking
+ * stack trace).
+ *
+ * The "priv" pointer is passed directly to the lm_setup function as-is. It
+ * may be NULL if the lm_setup operation doesn't require it.
+ */
+int
+vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
+{
+	if (filp->f_op->setlease)
+		return filp->f_op->setlease(filp, arg, lease, priv);
+	else
+		return generic_setlease(filp, arg, lease, priv);
+}
+EXPORT_SYMBOL_GPL(vfs_setlease);
+
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
+{
+	struct file_lock *fl;
+	struct fasync_struct *new;
+	int error;
+
+	fl = lease_alloc(filp, arg);
+	if (IS_ERR(fl))
+		return PTR_ERR(fl);
+
+	new = fasync_alloc();
+	if (!new) {
+		locks_free_lock(fl);
+		return -ENOMEM;
+	}
+	new->fa_fd = fd;
+
+	error = vfs_setlease(filp, arg, &fl, (void **)&new);
+	if (fl)
+		locks_free_lock(fl);
+	if (new)
+		fasync_free(new);
+	return error;
+}
+
+/**
+ *	fcntl_setlease	-	sets a lease on an open file
+ *	@fd: open file descriptor
+ *	@filp: file pointer
+ *	@arg: type of lease to obtain
+ *
+ *	Call this fcntl to establish a lease on the file.
+ *	Note that you also need to call %F_SETSIG to
+ *	receive a signal when the lease is broken.
+ */
+int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+{
+	if (arg == F_UNLCK)
+		return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
+	return do_fcntl_add_lease(fd, filp, arg);
+}
+
+/**
+ * flock_lock_file_wait - Apply a FLOCK-style lock to a file
+ * @filp: The file to apply the lock to
+ * @fl: The lock to be applied
+ *
+ * Add a FLOCK style lock to a file.
+ */
+int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	int error;
+	might_sleep();
+	for (;;) {
+		error = flock_lock_file(filp, fl);
+		if (error != FILE_LOCK_DEFERRED)
+			break;
+		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
+		if (!error)
+			continue;
+
+		locks_delete_block(fl);
+		break;
+	}
+	return error;
+}
+
+EXPORT_SYMBOL(flock_lock_file_wait);
+
+/**
+ *	sys_flock: - flock() system call.
+ *	@fd: the file descriptor to lock.
+ *	@cmd: the type of lock to apply.
+ *
+ *	Apply a %FL_FLOCK style lock to an open file descriptor.
+ *	The @cmd can be one of
+ *
+ *	%LOCK_SH -- a shared lock.
+ *
+ *	%LOCK_EX -- an exclusive lock.
+ *
+ *	%LOCK_UN -- remove an existing lock.
+ *
+ *	%LOCK_MAND -- a `mandatory' flock.  This exists to emulate Windows Share Modes.
+ *
+ *	%LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
+ *	processes read and write access respectively.
+ */
+SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
+{
+	struct fd f = fdget(fd);
+	struct file_lock *lock;
+	int can_sleep, unlock;
+	int error;
+
+	error = -EBADF;
+	if (!f.file)
+		goto out;
+
+	can_sleep = !(cmd & LOCK_NB);
+	cmd &= ~LOCK_NB;
+	unlock = (cmd == LOCK_UN);
+
+	if (!unlock && !(cmd & LOCK_MAND) &&
+	    !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+		goto out_putf;
+
+	lock = flock_make_lock(f.file, cmd);
+	if (IS_ERR(lock)) {
+		error = PTR_ERR(lock);
+		goto out_putf;
+	}
+
+	if (can_sleep)
+		lock->fl_flags |= FL_SLEEP;
+
+	error = security_file_lock(f.file, lock->fl_type);
+	if (error)
+		goto out_free;
+
+	if (f.file->f_op->flock)
+		error = f.file->f_op->flock(f.file,
+					  (can_sleep) ? F_SETLKW : F_SETLK,
+					  lock);
+	else
+		error = flock_lock_file_wait(f.file, lock);
+
+ out_free:
+	locks_free_lock(lock);
+
+ out_putf:
+	fdput(f);
+ out:
+	return error;
+}
+
+/**
+ * vfs_test_lock - test file byte range lock
+ * @filp: The file to test lock for
+ * @fl: The lock to test; also used to hold result
+ *
+ * Returns -ERRNO on failure.  Indicates presence of conflicting lock by
+ * setting conf->fl_type to something other than F_UNLCK.
+ */
+int vfs_test_lock(struct file *filp, struct file_lock *fl)
+{
+	if (filp->f_op->lock)
+		return filp->f_op->lock(filp, F_GETLK, fl);
+	posix_test_lock(filp, fl);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfs_test_lock);
+
+static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
+{
+	flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
+#if BITS_PER_LONG == 32
+	/*
+	 * Make sure we can represent the posix lock via
+	 * legacy 32bit flock.
+	 */
+	if (fl->fl_start > OFFT_OFFSET_MAX)
+		return -EOVERFLOW;
+	if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX)
+		return -EOVERFLOW;
+#endif
+	flock->l_start = fl->fl_start;
+	flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
+		fl->fl_end - fl->fl_start + 1;
+	flock->l_whence = 0;
+	flock->l_type = fl->fl_type;
+	return 0;
+}
+
+#if BITS_PER_LONG == 32
+static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
+{
+	flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
+	flock->l_start = fl->fl_start;
+	flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
+		fl->fl_end - fl->fl_start + 1;
+	flock->l_whence = 0;
+	flock->l_type = fl->fl_type;
+}
+#endif
+
+/* Report the first existing lock that would conflict with l.
+ * This implements the F_GETLK command of fcntl().
+ */
+int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
+{
+	struct file_lock file_lock;
+	struct flock flock;
+	int error;
+
+	error = -EFAULT;
+	if (copy_from_user(&flock, l, sizeof(flock)))
+		goto out;
+	error = -EINVAL;
+	if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK))
+		goto out;
+
+	error = flock_to_posix_lock(filp, &file_lock, &flock);
+	if (error)
+		goto out;
+
+	if (cmd == F_OFD_GETLK) {
+		error = -EINVAL;
+		if (flock.l_pid != 0)
+			goto out;
+
+		cmd = F_GETLK;
+		file_lock.fl_flags |= FL_OFDLCK;
+		file_lock.fl_owner = filp;
+	}
+
+	error = vfs_test_lock(filp, &file_lock);
+	if (error)
+		goto out;
+ 
+	flock.l_type = file_lock.fl_type;
+	if (file_lock.fl_type != F_UNLCK) {
+		error = posix_lock_to_flock(&flock, &file_lock);
+		if (error)
+			goto rel_priv;
+	}
+	error = -EFAULT;
+	if (!copy_to_user(l, &flock, sizeof(flock)))
+		error = 0;
+rel_priv:
+	locks_release_private(&file_lock);
+out:
+	return error;
+}
+
+/**
+ * vfs_lock_file - file byte range lock
+ * @filp: The file to apply the lock to
+ * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.)
+ * @fl: The lock to be applied
+ * @conf: Place to return a copy of the conflicting lock, if found.
+ *
+ * A caller that doesn't care about the conflicting lock may pass NULL
+ * as the final argument.
+ *
+ * If the filesystem defines a private ->lock() method, then @conf will
+ * be left unchanged; so a caller that cares should initialize it to
+ * some acceptable default.
+ *
+ * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
+ * locks, the ->lock() interface may return asynchronously, before the lock has
+ * been granted or denied by the underlying filesystem, if (and only if)
+ * lm_grant is set. Callers expecting ->lock() to return asynchronously
+ * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
+ * the request is for a blocking lock. When ->lock() does return asynchronously,
+ * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
+ * request completes.
+ * If the request is for non-blocking lock the file system should return
+ * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
+ * with the result. If the request timed out the callback routine will return a
+ * nonzero return code and the file system should release the lock. The file
+ * system is also responsible to keep a corresponding posix lock when it
+ * grants a lock so the VFS can find out which locks are locally held and do
+ * the correct lock cleanup when required.
+ * The underlying filesystem must not drop the kernel lock or call
+ * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED
+ * return code.
+ */
+int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
+{
+	if (filp->f_op->lock)
+		return filp->f_op->lock(filp, cmd, fl);
+	else
+		return posix_lock_file(filp, fl, conf);
+}
+EXPORT_SYMBOL_GPL(vfs_lock_file);
+
+static int do_lock_file_wait(struct file *filp, unsigned int cmd,
+			     struct file_lock *fl)
+{
+	int error;
+
+	error = security_file_lock(filp, fl->fl_type);
+	if (error)
+		return error;
+
+	for (;;) {
+		error = vfs_lock_file(filp, cmd, fl, NULL);
+		if (error != FILE_LOCK_DEFERRED)
+			break;
+		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
+		if (!error)
+			continue;
+
+		locks_delete_block(fl);
+		break;
+	}
+
+	return error;
+}
+
+/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */
+static int
+check_fmode_for_setlk(struct file_lock *fl)
+{
+	switch (fl->fl_type) {
+	case F_RDLCK:
+		if (!(fl->fl_file->f_mode & FMODE_READ))
+			return -EBADF;
+		break;
+	case F_WRLCK:
+		if (!(fl->fl_file->f_mode & FMODE_WRITE))
+			return -EBADF;
+	}
+	return 0;
+}
+
+/* Apply the lock described by l to an open file descriptor.
+ * This implements both the F_SETLK and F_SETLKW commands of fcntl().
+ */
+int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+		struct flock __user *l)
+{
+	struct file_lock *file_lock = locks_alloc_lock();
+	struct flock flock;
+	struct inode *inode;
+	struct file *f;
+	int error;
+
+	if (file_lock == NULL)
+		return -ENOLCK;
+
+	/*
+	 * This might block, so we do it before checking the inode.
+	 */
+	error = -EFAULT;
+	if (copy_from_user(&flock, l, sizeof(flock)))
+		goto out;
+
+	inode = file_inode(filp);
+
+	/* Don't allow mandatory locks on files that may be memory mapped
+	 * and shared.
+	 */
+	if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
+		error = -EAGAIN;
+		goto out;
+	}
+
+again:
+	error = flock_to_posix_lock(filp, file_lock, &flock);
+	if (error)
+		goto out;
+
+	error = check_fmode_for_setlk(file_lock);
+	if (error)
+		goto out;
+
+	/*
+	 * If the cmd is requesting file-private locks, then set the
+	 * FL_OFDLCK flag and override the owner.
+	 */
+	switch (cmd) {
+	case F_OFD_SETLK:
+		error = -EINVAL;
+		if (flock.l_pid != 0)
+			goto out;
+
+		cmd = F_SETLK;
+		file_lock->fl_flags |= FL_OFDLCK;
+		file_lock->fl_owner = filp;
+		break;
+	case F_OFD_SETLKW:
+		error = -EINVAL;
+		if (flock.l_pid != 0)
+			goto out;
+
+		cmd = F_SETLKW;
+		file_lock->fl_flags |= FL_OFDLCK;
+		file_lock->fl_owner = filp;
+		/* Fallthrough */
+	case F_SETLKW:
+		file_lock->fl_flags |= FL_SLEEP;
+	}
+
+	error = do_lock_file_wait(filp, cmd, file_lock);
+
+	/*
+	 * Attempt to detect a close/fcntl race and recover by
+	 * releasing the lock that was just acquired.
+	 */
+	/*
+	 * we need that spin_lock here - it prevents reordering between
+	 * update of i_flctx->flc_posix and check for it done in close().
+	 * rcu_read_lock() wouldn't do.
+	 */
+	spin_lock(&current->files->file_lock);
+	f = fcheck(fd);
+	spin_unlock(&current->files->file_lock);
+	if (!error && f != filp && flock.l_type != F_UNLCK) {
+		flock.l_type = F_UNLCK;
+		goto again;
+	}
+
+out:
+	locks_free_lock(file_lock);
+	return error;
+}
+
+#if BITS_PER_LONG == 32
+/* Report the first existing lock that would conflict with l.
+ * This implements the F_GETLK command of fcntl().
+ */
+int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
+{
+	struct file_lock file_lock;
+	struct flock64 flock;
+	int error;
+
+	error = -EFAULT;
+	if (copy_from_user(&flock, l, sizeof(flock)))
+		goto out;
+	error = -EINVAL;
+	if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK))
+		goto out;
+
+	error = flock64_to_posix_lock(filp, &file_lock, &flock);
+	if (error)
+		goto out;
+
+	if (cmd == F_OFD_GETLK) {
+		error = -EINVAL;
+		if (flock.l_pid != 0)
+			goto out;
+
+		cmd = F_GETLK64;
+		file_lock.fl_flags |= FL_OFDLCK;
+		file_lock.fl_owner = filp;
+	}
+
+	error = vfs_test_lock(filp, &file_lock);
+	if (error)
+		goto out;
+
+	flock.l_type = file_lock.fl_type;
+	if (file_lock.fl_type != F_UNLCK)
+		posix_lock_to_flock64(&flock, &file_lock);
+
+	error = -EFAULT;
+	if (!copy_to_user(l, &flock, sizeof(flock)))
+		error = 0;
+
+	locks_release_private(&file_lock);
+out:
+	return error;
+}
+
+/* Apply the lock described by l to an open file descriptor.
+ * This implements both the F_SETLK and F_SETLKW commands of fcntl().
+ */
+int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+		struct flock64 __user *l)
+{
+	struct file_lock *file_lock = locks_alloc_lock();
+	struct flock64 flock;
+	struct inode *inode;
+	struct file *f;
+	int error;
+
+	if (file_lock == NULL)
+		return -ENOLCK;
+
+	/*
+	 * This might block, so we do it before checking the inode.
+	 */
+	error = -EFAULT;
+	if (copy_from_user(&flock, l, sizeof(flock)))
+		goto out;
+
+	inode = file_inode(filp);
+
+	/* Don't allow mandatory locks on files that may be memory mapped
+	 * and shared.
+	 */
+	if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
+		error = -EAGAIN;
+		goto out;
+	}
+
+again:
+	error = flock64_to_posix_lock(filp, file_lock, &flock);
+	if (error)
+		goto out;
+
+	error = check_fmode_for_setlk(file_lock);
+	if (error)
+		goto out;
+
+	/*
+	 * If the cmd is requesting file-private locks, then set the
+	 * FL_OFDLCK flag and override the owner.
+	 */
+	switch (cmd) {
+	case F_OFD_SETLK:
+		error = -EINVAL;
+		if (flock.l_pid != 0)
+			goto out;
+
+		cmd = F_SETLK64;
+		file_lock->fl_flags |= FL_OFDLCK;
+		file_lock->fl_owner = filp;
+		break;
+	case F_OFD_SETLKW:
+		error = -EINVAL;
+		if (flock.l_pid != 0)
+			goto out;
+
+		cmd = F_SETLKW64;
+		file_lock->fl_flags |= FL_OFDLCK;
+		file_lock->fl_owner = filp;
+		/* Fallthrough */
+	case F_SETLKW64:
+		file_lock->fl_flags |= FL_SLEEP;
+	}
+
+	error = do_lock_file_wait(filp, cmd, file_lock);
+
+	/*
+	 * Attempt to detect a close/fcntl race and recover by
+	 * releasing the lock that was just acquired.
+	 */
+	spin_lock(&current->files->file_lock);
+	f = fcheck(fd);
+	spin_unlock(&current->files->file_lock);
+	if (!error && f != filp && flock.l_type != F_UNLCK) {
+		flock.l_type = F_UNLCK;
+		goto again;
+	}
+
+out:
+	locks_free_lock(file_lock);
+	return error;
+}
+#endif /* BITS_PER_LONG == 32 */
+
+/*
+ * This function is called when the file is being removed
+ * from the task's fd array.  POSIX locks belonging to this task
+ * are deleted at this time.
+ */
+void locks_remove_posix(struct file *filp, fl_owner_t owner)
+{
+	struct file_lock lock;
+	struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+
+	/*
+	 * If there are no locks held on this file, we don't need to call
+	 * posix_lock_file().  Another process could be setting a lock on this
+	 * file at the same time, but we wouldn't remove that lock anyway.
+	 */
+	if (!ctx || list_empty(&ctx->flc_posix))
+		return;
+
+	lock.fl_type = F_UNLCK;
+	lock.fl_flags = FL_POSIX | FL_CLOSE;
+	lock.fl_start = 0;
+	lock.fl_end = OFFSET_MAX;
+	lock.fl_owner = owner;
+	lock.fl_pid = current->tgid;
+	lock.fl_file = filp;
+	lock.fl_ops = NULL;
+	lock.fl_lmops = NULL;
+
+	vfs_lock_file(filp, F_SETLK, &lock, NULL);
+
+	if (lock.fl_ops && lock.fl_ops->fl_release_private)
+		lock.fl_ops->fl_release_private(&lock);
+}
+
+EXPORT_SYMBOL(locks_remove_posix);
+
+/* The i_flctx must be valid when calling into here */
+static void
+locks_remove_flock(struct file *filp)
+{
+	struct file_lock fl = {
+		.fl_owner = filp,
+		.fl_pid = current->tgid,
+		.fl_file = filp,
+		.fl_flags = FL_FLOCK,
+		.fl_type = F_UNLCK,
+		.fl_end = OFFSET_MAX,
+	};
+	struct file_lock_context *flctx = file_inode(filp)->i_flctx;
+
+	if (list_empty(&flctx->flc_flock))
+		return;
+
+	if (filp->f_op->flock)
+		filp->f_op->flock(filp, F_SETLKW, &fl);
+	else
+		flock_lock_file(filp, &fl);
+
+	if (fl.fl_ops && fl.fl_ops->fl_release_private)
+		fl.fl_ops->fl_release_private(&fl);
+}
+
+/* The i_flctx must be valid when calling into here */
+static void
+locks_remove_lease(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock *fl, *tmp;
+	LIST_HEAD(dispose);
+
+	if (list_empty(&ctx->flc_lease))
+		return;
+
+	spin_lock(&ctx->flc_lock);
+	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
+		if (filp == fl->fl_file)
+			lease_modify(fl, F_UNLCK, &dispose);
+	spin_unlock(&ctx->flc_lock);
+	locks_dispose_list(&dispose);
+}
+
+/*
+ * This function is called on the last close of an open file.
+ */
+void locks_remove_file(struct file *filp)
+{
+	if (!file_inode(filp)->i_flctx)
+		return;
+
+	/* remove any OFD locks */
+	locks_remove_posix(filp, filp);
+
+	/* remove flock locks */
+	locks_remove_flock(filp);
+
+	/* remove any leases */
+	locks_remove_lease(filp);
+}
+
+/**
+ *	posix_unblock_lock - stop waiting for a file lock
+ *	@waiter: the lock which was waiting
+ *
+ *	lockd needs to block waiting for locks.
+ */
+int
+posix_unblock_lock(struct file_lock *waiter)
+{
+	int status = 0;
+
+	spin_lock(&blocked_lock_lock);
+	if (waiter->fl_next)
+		__locks_delete_block(waiter);
+	else
+		status = -ENOENT;
+	spin_unlock(&blocked_lock_lock);
+	return status;
+}
+EXPORT_SYMBOL(posix_unblock_lock);
+
+/**
+ * vfs_cancel_lock - file byte range unblock lock
+ * @filp: The file to apply the unblock to
+ * @fl: The lock to be unblocked
+ *
+ * Used by lock managers to cancel blocked requests
+ */
+int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
+{
+	if (filp->f_op->lock)
+		return filp->f_op->lock(filp, F_CANCELLK, fl);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(vfs_cancel_lock);
+
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+struct locks_iterator {
+	int	li_cpu;
+	loff_t	li_pos;
+};
+
+static void lock_get_status(struct seq_file *f, struct file_lock *fl,
+			    loff_t id, char *pfx)
+{
+	struct inode *inode = NULL;
+	unsigned int fl_pid;
+
+	if (fl->fl_nspid)
+		fl_pid = pid_vnr(fl->fl_nspid);
+	else
+		fl_pid = fl->fl_pid;
+
+	if (fl->fl_file != NULL)
+		inode = file_inode(fl->fl_file);
+
+	seq_printf(f, "%lld:%s ", id, pfx);
+	if (IS_POSIX(fl)) {
+		if (fl->fl_flags & FL_ACCESS)
+			seq_puts(f, "ACCESS");
+		else if (IS_OFDLCK(fl))
+			seq_puts(f, "OFDLCK");
+		else
+			seq_puts(f, "POSIX ");
+
+		seq_printf(f, " %s ",
+			     (inode == NULL) ? "*NOINODE*" :
+			     mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
+	} else if (IS_FLOCK(fl)) {
+		if (fl->fl_type & LOCK_MAND) {
+			seq_puts(f, "FLOCK  MSNFS     ");
+		} else {
+			seq_puts(f, "FLOCK  ADVISORY  ");
+		}
+	} else if (IS_LEASE(fl)) {
+		if (fl->fl_flags & FL_DELEG)
+			seq_puts(f, "DELEG  ");
+		else
+			seq_puts(f, "LEASE  ");
+
+		if (lease_breaking(fl))
+			seq_puts(f, "BREAKING  ");
+		else if (fl->fl_file)
+			seq_puts(f, "ACTIVE    ");
+		else
+			seq_puts(f, "BREAKER   ");
+	} else {
+		seq_puts(f, "UNKNOWN UNKNOWN  ");
+	}
+	if (fl->fl_type & LOCK_MAND) {
+		seq_printf(f, "%s ",
+			       (fl->fl_type & LOCK_READ)
+			       ? (fl->fl_type & LOCK_WRITE) ? "RW   " : "READ "
+			       : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
+	} else {
+		seq_printf(f, "%s ",
+			       (lease_breaking(fl))
+			       ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ "
+			       : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ ");
+	}
+	if (inode) {
+		/* userspace relies on this representation of dev_t */
+		seq_printf(f, "%d %02x:%02x:%ld ", fl_pid,
+				MAJOR(inode->i_sb->s_dev),
+				MINOR(inode->i_sb->s_dev), inode->i_ino);
+	} else {
+		seq_printf(f, "%d <none>:0 ", fl_pid);
+	}
+	if (IS_POSIX(fl)) {
+		if (fl->fl_end == OFFSET_MAX)
+			seq_printf(f, "%Ld EOF\n", fl->fl_start);
+		else
+			seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end);
+	} else {
+		seq_puts(f, "0 EOF\n");
+	}
+}
+
+static int locks_show(struct seq_file *f, void *v)
+{
+	struct locks_iterator *iter = f->private;
+	struct file_lock *fl, *bfl;
+
+	fl = hlist_entry(v, struct file_lock, fl_link);
+
+	lock_get_status(f, fl, iter->li_pos, "");
+
+	list_for_each_entry(bfl, &fl->fl_block, fl_block)
+		lock_get_status(f, bfl, iter->li_pos, " ->");
+
+	return 0;
+}
+
+static void __show_fd_locks(struct seq_file *f,
+			struct list_head *head, int *id,
+			struct file *filp, struct files_struct *files)
+{
+	struct file_lock *fl;
+
+	list_for_each_entry(fl, head, fl_list) {
+
+		if (filp != fl->fl_file)
+			continue;
+		if (fl->fl_owner != files &&
+		    fl->fl_owner != filp)
+			continue;
+
+		(*id)++;
+		seq_puts(f, "lock:\t");
+		lock_get_status(f, fl, *id, "");
+	}
+}
+
+void show_fd_locks(struct seq_file *f,
+		  struct file *filp, struct files_struct *files)
+{
+	struct inode *inode = file_inode(filp);
+	struct file_lock_context *ctx;
+	int id = 0;
+
+	ctx = inode->i_flctx;
+	if (!ctx)
+		return;
+
+	spin_lock(&ctx->flc_lock);
+	__show_fd_locks(f, &ctx->flc_flock, &id, filp, files);
+	__show_fd_locks(f, &ctx->flc_posix, &id, filp, files);
+	__show_fd_locks(f, &ctx->flc_lease, &id, filp, files);
+	spin_unlock(&ctx->flc_lock);
+}
+
+static void *locks_start(struct seq_file *f, loff_t *pos)
+	__acquires(&blocked_lock_lock)
+{
+	struct locks_iterator *iter = f->private;
+
+	iter->li_pos = *pos + 1;
+	lg_global_lock(&file_lock_lglock);
+	spin_lock(&blocked_lock_lock);
+	return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
+}
+
+static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	struct locks_iterator *iter = f->private;
+
+	++iter->li_pos;
+	return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
+}
+
+static void locks_stop(struct seq_file *f, void *v)
+	__releases(&blocked_lock_lock)
+{
+	spin_unlock(&blocked_lock_lock);
+	lg_global_unlock(&file_lock_lglock);
+}
+
+static const struct seq_operations locks_seq_operations = {
+	.start	= locks_start,
+	.next	= locks_next,
+	.stop	= locks_stop,
+	.show	= locks_show,
+};
+
+static int locks_open(struct inode *inode, struct file *filp)
+{
+	return seq_open_private(filp, &locks_seq_operations,
+					sizeof(struct locks_iterator));
+}
+
+static const struct file_operations proc_locks_operations = {
+	.open		= locks_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+static int __init proc_locks_init(void)
+{
+	proc_create("locks", 0, NULL, &proc_locks_operations);
+	return 0;
+}
+module_init(proc_locks_init);
+#endif
+
+static int __init filelock_init(void)
+{
+	int i;
+
+	flctx_cache = kmem_cache_create("file_lock_ctx",
+			sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
+
+	filelock_cache = kmem_cache_create("file_lock_cache",
+			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+
+	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+
+	for_each_possible_cpu(i)
+		INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+
+	return 0;
+}
+
+core_initcall(filelock_init);
diff --git a/kernel/fs/logfs/Kconfig b/kernel/fs/logfs/Kconfig
new file mode 100644
index 000000000..09ed066c0
--- /dev/null
+++ b/kernel/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
+config LOGFS
+	tristate "LogFS file system"
+	depends on (MTD || BLOCK)
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	select CRC32
+	select BTREE
+	help
+	  Flash filesystem aimed to scale efficiently to large devices.
+	  In comparison to JFFS2 it offers significantly faster mount
+	  times and potentially less RAM usage, although the latter has
+	  not been measured yet.
+
+	  In its current state it is still very experimental and should
+	  not be used for other than testing purposes.
+
+	  If unsure, say N.
diff --git a/kernel/fs/logfs/Makefile b/kernel/fs/logfs/Makefile
new file mode 100644
index 000000000..482002778
--- /dev/null
+++ b/kernel/fs/logfs/Makefile
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LOGFS)	+= logfs.o
+
+logfs-y	+= compr.o
+logfs-y	+= dir.o
+logfs-y	+= file.o
+logfs-y	+= gc.o
+logfs-y	+= inode.o
+logfs-y	+= journal.o
+logfs-y	+= readwrite.o
+logfs-y	+= segment.o
+logfs-y	+= super.o
+logfs-$(CONFIG_BLOCK)	+= dev_bdev.o
+logfs-$(CONFIG_MTD)	+= dev_mtd.o
diff --git a/kernel/fs/logfs/compr.c b/kernel/fs/logfs/compr.c
new file mode 100644
index 000000000..961f02b86
--- /dev/null
+++ b/kernel/fs/logfs/compr.c
@@ -0,0 +1,95 @@
+/*
+ * fs/logfs/compr.c	- compression routines
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/vmalloc.h>
+#include <linux/zlib.h>
+
+#define COMPR_LEVEL 3
+
+static DEFINE_MUTEX(compr_mutex);
+static struct z_stream_s stream;
+
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	mutex_lock(&compr_mutex);
+	err = zlib_deflateInit(&stream, COMPR_LEVEL);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	mutex_unlock(&compr_mutex);
+	return ret;
+}
+
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	mutex_lock(&compr_mutex);
+	err = zlib_inflateInit(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_inflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	ret = 0;
+error:
+	mutex_unlock(&compr_mutex);
+	return ret;
+}
+
+int __init logfs_compr_init(void)
+{
+	size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+			zlib_inflate_workspacesize());
+	stream.workspace = vmalloc(size);
+	if (!stream.workspace)
+		return -ENOMEM;
+	return 0;
+}
+
+void logfs_compr_exit(void)
+{
+	vfree(stream.workspace);
+}
diff --git a/kernel/fs/logfs/dev_bdev.c b/kernel/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000..76279e119
--- /dev/null
+++ b/kernel/fs/logfs/dev_bdev.c
@@ -0,0 +1,321 @@
+/*
+ * fs/logfs/dev_bdev.c	- Device access methods for block devices
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/gfp.h>
+#include <linux/prefetch.h>
+
+#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
+
+static int sync_request(struct page *page, struct block_device *bdev, int rw)
+{
+	struct bio bio;
+	struct bio_vec bio_vec;
+
+	bio_init(&bio);
+	bio.bi_max_vecs = 1;
+	bio.bi_io_vec = &bio_vec;
+	bio_vec.bv_page = page;
+	bio_vec.bv_len = PAGE_SIZE;
+	bio_vec.bv_offset = 0;
+	bio.bi_vcnt = 1;
+	bio.bi_bdev = bdev;
+	bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
+	bio.bi_iter.bi_size = PAGE_SIZE;
+
+	return submit_bio_wait(rw, &bio);
+}
+
+static int bdev_readpage(void *_sb, struct page *page)
+{
+	struct super_block *sb = _sb;
+	struct block_device *bdev = logfs_super(sb)->s_bdev;
+	int err;
+
+	err = sync_request(page, bdev, READ);
+	if (err) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+	unlock_page(page);
+	return err;
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(wq);
+
+static void writeseg_end_io(struct bio *bio, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec;
+	int i;
+	struct super_block *sb = bio->bi_private;
+	struct logfs_super *super = logfs_super(sb);
+
+	BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
+	BUG_ON(err);
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		end_page_writeback(bvec->bv_page);
+		page_cache_release(bvec->bv_page);
+	}
+	bio_put(bio);
+	if (atomic_dec_and_test(&super->s_pending_writes))
+		wake_up(&wq);
+}
+
+static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
+		size_t nr_pages)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	struct bio *bio;
+	struct page *page;
+	unsigned int max_pages;
+	int i;
+
+	max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
+
+	bio = bio_alloc(GFP_NOFS, max_pages);
+	BUG_ON(!bio);
+
+	for (i = 0; i < nr_pages; i++) {
+		if (i >= max_pages) {
+			/* Block layer cannot split bios :( */
+			bio->bi_vcnt = i;
+			bio->bi_iter.bi_size = i * PAGE_SIZE;
+			bio->bi_bdev = super->s_bdev;
+			bio->bi_iter.bi_sector = ofs >> 9;
+			bio->bi_private = sb;
+			bio->bi_end_io = writeseg_end_io;
+			atomic_inc(&super->s_pending_writes);
+			submit_bio(WRITE, bio);
+
+			ofs += i * PAGE_SIZE;
+			index += i;
+			nr_pages -= i;
+			i = 0;
+
+			bio = bio_alloc(GFP_NOFS, max_pages);
+			BUG_ON(!bio);
+		}
+		page = find_lock_page(mapping, index + i);
+		BUG_ON(!page);
+		bio->bi_io_vec[i].bv_page = page;
+		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+		bio->bi_io_vec[i].bv_offset = 0;
+
+		BUG_ON(PageWriteback(page));
+		set_page_writeback(page);
+		unlock_page(page);
+	}
+	bio->bi_vcnt = nr_pages;
+	bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
+	bio->bi_bdev = super->s_bdev;
+	bio->bi_iter.bi_sector = ofs >> 9;
+	bio->bi_private = sb;
+	bio->bi_end_io = writeseg_end_io;
+	atomic_inc(&super->s_pending_writes);
+	submit_bio(WRITE, bio);
+	return 0;
+}
+
+static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int head;
+
+	BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
+
+	if (len == 0) {
+		/* This can happen when the object fit perfectly into a
+		 * segment, the segment gets written per sync and subsequently
+		 * closed.
+		 */
+		return;
+	}
+	head = ofs & (PAGE_SIZE - 1);
+	if (head) {
+		ofs -= head;
+		len += head;
+	}
+	len = PAGE_ALIGN(len);
+	__bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
+}
+
+
+static void erase_end_io(struct bio *bio, int err) 
+{ 
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 
+	struct super_block *sb = bio->bi_private; 
+	struct logfs_super *super = logfs_super(sb); 
+
+	BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ 
+	BUG_ON(err); 
+	BUG_ON(bio->bi_vcnt == 0); 
+	bio_put(bio); 
+	if (atomic_dec_and_test(&super->s_pending_writes))
+		wake_up(&wq); 
+} 
+
+static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
+		size_t nr_pages)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct bio *bio;
+	unsigned int max_pages;
+	int i;
+
+	max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
+
+	bio = bio_alloc(GFP_NOFS, max_pages);
+	BUG_ON(!bio);
+
+	for (i = 0; i < nr_pages; i++) {
+		if (i >= max_pages) {
+			/* Block layer cannot split bios :( */
+			bio->bi_vcnt = i;
+			bio->bi_iter.bi_size = i * PAGE_SIZE;
+			bio->bi_bdev = super->s_bdev;
+			bio->bi_iter.bi_sector = ofs >> 9;
+			bio->bi_private = sb;
+			bio->bi_end_io = erase_end_io;
+			atomic_inc(&super->s_pending_writes);
+			submit_bio(WRITE, bio);
+
+			ofs += i * PAGE_SIZE;
+			index += i;
+			nr_pages -= i;
+			i = 0;
+
+			bio = bio_alloc(GFP_NOFS, max_pages);
+			BUG_ON(!bio);
+		}
+		bio->bi_io_vec[i].bv_page = super->s_erase_page;
+		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+		bio->bi_io_vec[i].bv_offset = 0;
+	}
+	bio->bi_vcnt = nr_pages;
+	bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
+	bio->bi_bdev = super->s_bdev;
+	bio->bi_iter.bi_sector = ofs >> 9;
+	bio->bi_private = sb;
+	bio->bi_end_io = erase_end_io;
+	atomic_inc(&super->s_pending_writes);
+	submit_bio(WRITE, bio);
+	return 0;
+}
+
+static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
+		int ensure_write)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	BUG_ON(to & (PAGE_SIZE - 1));
+	BUG_ON(len & (PAGE_SIZE - 1));
+
+	if (super->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	if (ensure_write) {
+		/*
+		 * Object store doesn't care whether erases happen or not.
+		 * But for the journal they are required.  Otherwise a scan
+		 * can find an old commit entry and assume it is the current
+		 * one, travelling back in time.
+		 */
+		do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
+	}
+
+	return 0;
+}
+
+static void bdev_sync(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
+}
+
+static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = bdev_readpage;
+
+	*ofs = 0;
+	return read_cache_page(mapping, 0, filler, sb);
+}
+
+static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = bdev_readpage;
+	u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
+	pgoff_t index = pos >> PAGE_SHIFT;
+
+	*ofs = pos;
+	return read_cache_page(mapping, index, filler, sb);
+}
+
+static int bdev_write_sb(struct super_block *sb, struct page *page)
+{
+	struct block_device *bdev = logfs_super(sb)->s_bdev;
+
+	/* Nothing special to do for block devices. */
+	return sync_request(page, bdev, WRITE);
+}
+
+static void bdev_put_device(struct logfs_super *s)
+{
+	blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	return 0;
+}
+
+static const struct logfs_device_ops bd_devops = {
+	.find_first_sb	= bdev_find_first_sb,
+	.find_last_sb	= bdev_find_last_sb,
+	.write_sb	= bdev_write_sb,
+	.readpage	= bdev_readpage,
+	.writeseg	= bdev_writeseg,
+	.erase		= bdev_erase,
+	.can_write_buf	= bdev_can_write_buf,
+	.sync		= bdev_sync,
+	.put_device	= bdev_put_device,
+};
+
+int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
+		const char *devname)
+{
+	struct block_device *bdev;
+
+	bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				  type);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
+		int mtdnr = MINOR(bdev->bd_dev);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+		return logfs_get_sb_mtd(p, mtdnr);
+	}
+
+	p->s_bdev = bdev;
+	p->s_mtd = NULL;
+	p->s_devops = &bd_devops;
+	return 0;
+}
diff --git a/kernel/fs/logfs/dev_mtd.c b/kernel/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000..9c5014494
--- /dev/null
+++ b/kernel/fs/logfs/dev_mtd.c
@@ -0,0 +1,274 @@
+/*
+ * fs/logfs/dev_mtd.c	- Device access methods for MTD
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/completion.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
+
+static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len,
+			void *buf)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+	size_t retlen;
+	int ret;
+
+	ret = mtd_read(mtd, ofs, len, &retlen, buf);
+	BUG_ON(ret == -EINVAL);
+	if (ret)
+		return ret;
+
+	/* Not sure if we should loop instead. */
+	if (retlen != len)
+		return -EIO;
+
+	return 0;
+}
+
+static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
+			void *buf)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct mtd_info *mtd = super->s_mtd;
+	size_t retlen;
+	loff_t page_start, page_end;
+	int ret;
+
+	if (super->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
+	BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
+	BUG_ON(len > PAGE_CACHE_SIZE);
+	page_start = ofs & PAGE_CACHE_MASK;
+	page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
+	ret = mtd_write(mtd, ofs, len, &retlen, buf);
+	if (ret || (retlen != len))
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * For as long as I can remember (since about 2001) mtd->erase has been an
+ * asynchronous interface lacking the first driver to actually use the
+ * asynchronous properties.  So just to prevent the first implementor of such
+ * a thing from breaking logfs in 2350, we do the usual pointless dance to
+ * declare a completion variable and wait for completion before returning
+ * from logfs_mtd_erase().  What an exercise in futility!
+ */
+static void logfs_erase_callback(struct erase_info *ei)
+{
+	complete((struct completion *)ei->priv);
+}
+
+static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
+				size_t len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	struct page *page;
+	pgoff_t index = ofs >> PAGE_SHIFT;
+
+	for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
+		page = find_get_page(mapping, index);
+		if (!page)
+			continue;
+		memset(page_address(page), 0xFF, PAGE_SIZE);
+		page_cache_release(page);
+	}
+	return 0;
+}
+
+static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
+		int ensure_write)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+	struct erase_info ei;
+	DECLARE_COMPLETION_ONSTACK(complete);
+	int ret;
+
+	BUG_ON(len % mtd->erasesize);
+	if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	memset(&ei, 0, sizeof(ei));
+	ei.mtd = mtd;
+	ei.addr = ofs;
+	ei.len = len;
+	ei.callback = logfs_erase_callback;
+	ei.priv = (long)&complete;
+	ret = mtd_erase(mtd, &ei);
+	if (ret)
+		return -EIO;
+
+	wait_for_completion(&complete);
+	if (ei.state != MTD_ERASE_DONE)
+		return -EIO;
+	return logfs_mtd_erase_mapping(sb, ofs, len);
+}
+
+static void logfs_mtd_sync(struct super_block *sb)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+
+	mtd_sync(mtd);
+}
+
+static int logfs_mtd_readpage(void *_sb, struct page *page)
+{
+	struct super_block *sb = _sb;
+	int err;
+
+	err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
+			page_address(page));
+	if (err == -EUCLEAN || err == -EBADMSG) {
+		/* -EBADMSG happens regularly on power failures */
+		err = 0;
+		/* FIXME: force GC this segment */
+	}
+	if (err) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+	unlock_page(page);
+	return err;
+}
+
+static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = logfs_mtd_readpage;
+	struct mtd_info *mtd = super->s_mtd;
+
+	*ofs = 0;
+	while (mtd_block_isbad(mtd, *ofs)) {
+		*ofs += mtd->erasesize;
+		if (*ofs >= mtd->size)
+			return NULL;
+	}
+	BUG_ON(*ofs & ~PAGE_MASK);
+	return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
+}
+
+static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = logfs_mtd_readpage;
+	struct mtd_info *mtd = super->s_mtd;
+
+	*ofs = mtd->size - mtd->erasesize;
+	while (mtd_block_isbad(mtd, *ofs)) {
+		*ofs -= mtd->erasesize;
+		if (*ofs <= 0)
+			return NULL;
+	}
+	*ofs = *ofs + mtd->erasesize - 0x1000;
+	BUG_ON(*ofs & ~PAGE_MASK);
+	return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
+}
+
+static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
+		size_t nr_pages)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	struct page *page;
+	int i, err;
+
+	for (i = 0; i < nr_pages; i++) {
+		page = find_lock_page(mapping, index + i);
+		BUG_ON(!page);
+
+		err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
+					page_address(page));
+		unlock_page(page);
+		page_cache_release(page);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int head;
+
+	if (super->s_flags & LOGFS_SB_FLAG_RO)
+		return;
+
+	if (len == 0) {
+		/* This can happen when the object fit perfectly into a
+		 * segment, the segment gets written per sync and subsequently
+		 * closed.
+		 */
+		return;
+	}
+	head = ofs & (PAGE_SIZE - 1);
+	if (head) {
+		ofs -= head;
+		len += head;
+	}
+	len = PAGE_ALIGN(len);
+	__logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
+}
+
+static void logfs_mtd_put_device(struct logfs_super *s)
+{
+	put_mtd_device(s->s_mtd);
+}
+
+static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *buf;
+	int err;
+
+	buf = kmalloc(super->s_writesize, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	err = logfs_mtd_read(sb, ofs, super->s_writesize, buf);
+	if (err)
+		goto out;
+	if (memchr_inv(buf, 0xff, super->s_writesize))
+		err = -EIO;
+	kfree(buf);
+out:
+	return err;
+}
+
+static const struct logfs_device_ops mtd_devops = {
+	.find_first_sb	= logfs_mtd_find_first_sb,
+	.find_last_sb	= logfs_mtd_find_last_sb,
+	.readpage	= logfs_mtd_readpage,
+	.writeseg	= logfs_mtd_writeseg,
+	.erase		= logfs_mtd_erase,
+	.can_write_buf	= logfs_mtd_can_write_buf,
+	.sync		= logfs_mtd_sync,
+	.put_device	= logfs_mtd_put_device,
+};
+
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
+{
+	struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
+	if (IS_ERR(mtd))
+		return PTR_ERR(mtd);
+
+	s->s_bdev = NULL;
+	s->s_mtd = mtd;
+	s->s_devops = &mtd_devops;
+	return 0;
+}
diff --git a/kernel/fs/logfs/dir.c b/kernel/fs/logfs/dir.c
new file mode 100644
index 000000000..4cf38f118
--- /dev/null
+++ b/kernel/fs/logfs/dir.c
@@ -0,0 +1,801 @@
+/*
+ * fs/logfs/dir.c	- directory-related code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/slab.h>
+
+/*
+ * Atomic dir operations
+ *
+ * Directory operations are by default not atomic.  Dentries and Inodes are
+ * created/removed/altered in separate operations.  Therefore we need to do
+ * a small amount of journaling.
+ *
+ * Create, link, mkdir, mknod and symlink all share the same function to do
+ * the work: __logfs_create.  This function works in two atomic steps:
+ * 1. allocate inode (remember in journal)
+ * 2. allocate dentry (clear journal)
+ *
+ * As we can only get interrupted between the two, when the inode we just
+ * created is simply stored in the anchor.  On next mount, if we were
+ * interrupted, we delete the inode.  From a users point of view the
+ * operation never happened.
+ *
+ * Unlink and rmdir also share the same function: unlink.  Again, this
+ * function works in two atomic steps
+ * 1. remove dentry (remember inode in journal)
+ * 2. unlink inode (clear journal)
+ *
+ * And again, on the next mount, if we were interrupted, we delete the inode.
+ * From a users point of view the operation succeeded.
+ *
+ * Rename is the real pain to deal with, harder than all the other methods
+ * combined.  Depending on the circumstances we can run into three cases.
+ * A "target rename" where the target dentry already existed, a "local
+ * rename" where both parent directories are identical or a "cross-directory
+ * rename" in the remaining case.
+ *
+ * Local rename is atomic, as the old dentry is simply rewritten with a new
+ * name.
+ *
+ * Cross-directory rename works in two steps, similar to __logfs_create and
+ * logfs_unlink:
+ * 1. Write new dentry (remember old dentry in journal)
+ * 2. Remove old dentry (clear journal)
+ *
+ * Here we remember a dentry instead of an inode.  On next mount, if we were
+ * interrupted, we delete the dentry.  From a users point of view, the
+ * operation succeeded.
+ *
+ * Target rename works in three atomic steps:
+ * 1. Attach old inode to new dentry (remember old dentry and new inode)
+ * 2. Remove old dentry (still remember the new inode)
+ * 3. Remove victim inode
+ *
+ * Here we remember both an inode an a dentry.  If we get interrupted
+ * between steps 1 and 2, we delete both the dentry and the inode.  If
+ * we get interrupted between steps 2 and 3, we delete just the inode.
+ * In either case, the remaining objects are deleted on next mount.  From
+ * a users point of view, the operation succeeded.
+ */
+
+static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
+		loff_t pos)
+{
+	return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
+}
+
+static int write_inode(struct inode *inode)
+{
+	return __logfs_write_inode(inode, NULL, WF_LOCK);
+}
+
+static s64 dir_seek_data(struct inode *inode, s64 pos)
+{
+	s64 new_pos = logfs_seek_data(inode, pos);
+
+	return max(pos, new_pos - 1);
+}
+
+static int beyond_eof(struct inode *inode, loff_t bix)
+{
+	loff_t pos = bix << inode->i_sb->s_blocksize_bits;
+	return pos >= i_size_read(inode);
+}
+
+/*
+ * Prime value was chosen to be roughly 256 + 26.  r5 hash uses 11,
+ * so short names (len <= 9) don't even occupy the complete 32bit name
+ * space.  A prime >256 ensures short names quickly spread the 32bit
+ * name space.  Add about 26 for the estimated amount of information
+ * of each character and pick a prime nearby, preferably a bit-sparse
+ * one.
+ */
+static u32 hash_32(const char *s, int len, u32 seed)
+{
+	u32 hash = seed;
+	int i;
+
+	for (i = 0; i < len; i++)
+		hash = hash * 293 + s[i];
+	return hash;
+}
+
+/*
+ * We have to satisfy several conflicting requirements here.  Small
+ * directories should stay fairly compact and not require too many
+ * indirect blocks.  The number of possible locations for a given hash
+ * should be small to make lookup() fast.  And we should try hard not
+ * to overflow the 32bit name space or nfs and 32bit host systems will
+ * be unhappy.
+ *
+ * So we use the following scheme.  First we reduce the hash to 0..15
+ * and try a direct block.  If that is occupied we reduce the hash to
+ * 16..255 and try an indirect block.  Same for 2x and 3x indirect
+ * blocks.  Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
+ * but use buckets containing eight entries instead of a single one.
+ *
+ * Using 16 entries should allow for a reasonable amount of hash
+ * collisions, so the 32bit name space can be packed fairly tight
+ * before overflowing.  Oh and currently we don't overflow but return
+ * and error.
+ *
+ * How likely are collisions?  Doing the appropriate math is beyond me
+ * and the Bronstein textbook.  But running a test program to brute
+ * force collisions for a couple of days showed that on average the
+ * first collision occurs after 598M entries, with 290M being the
+ * smallest result.  Obviously 21 entries could already cause a
+ * collision if all entries are carefully chosen.
+ */
+static pgoff_t hash_index(u32 hash, int round)
+{
+	u32 i0_blocks = I0_BLOCKS;
+	u32 i1_blocks = I1_BLOCKS;
+	u32 i2_blocks = I2_BLOCKS;
+	u32 i3_blocks = I3_BLOCKS;
+
+	switch (round) {
+	case 0:
+		return hash % i0_blocks;
+	case 1:
+		return i0_blocks + hash % (i1_blocks - i0_blocks);
+	case 2:
+		return i1_blocks + hash % (i2_blocks - i1_blocks);
+	case 3:
+		return i2_blocks + hash % (i3_blocks - i2_blocks);
+	case 4 ... 19:
+		return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
+			+ round - 4;
+	}
+	BUG();
+}
+
+static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
+{
+	struct qstr *name = &dentry->d_name;
+	struct page *page;
+	struct logfs_disk_dentry *dd;
+	u32 hash = hash_32(name->name, name->len, 0);
+	pgoff_t index;
+	int round;
+
+	if (name->len > LOGFS_MAX_NAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	for (round = 0; round < 20; round++) {
+		index = hash_index(hash, round);
+
+		if (beyond_eof(dir, index))
+			return NULL;
+		if (!logfs_exist_block(dir, index))
+			continue;
+		page = read_cache_page(dir->i_mapping, index,
+				(filler_t *)logfs_readpage, NULL);
+		if (IS_ERR(page))
+			return page;
+		dd = kmap_atomic(page);
+		BUG_ON(dd->namelen == 0);
+
+		if (name->len != be16_to_cpu(dd->namelen) ||
+				memcmp(name->name, dd->name, name->len)) {
+			kunmap_atomic(dd);
+			page_cache_release(page);
+			continue;
+		}
+
+		kunmap_atomic(dd);
+		return page;
+	}
+	return NULL;
+}
+
+static int logfs_remove_inode(struct inode *inode)
+{
+	int ret;
+
+	drop_nlink(inode);
+	ret = write_inode(inode);
+	LOGFS_BUG_ON(ret, inode->i_sb);
+	return ret;
+}
+
+static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	if (logfs_inode(inode)->li_block)
+		logfs_inode(inode)->li_block->ta = NULL;
+	kfree(ta);
+}
+
+static int logfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct logfs_super *super = logfs_super(dir->i_sb);
+	struct inode *inode = d_inode(dentry);
+	struct logfs_transaction *ta;
+	struct page *page;
+	pgoff_t index;
+	int ret;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = UNLINK_1;
+	ta->ino = inode->i_ino;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+	page = logfs_get_dd_page(dir, dentry);
+	if (!page) {
+		kfree(ta);
+		return -ENOENT;
+	}
+	if (IS_ERR(page)) {
+		kfree(ta);
+		return PTR_ERR(page);
+	}
+	index = page->index;
+	page_cache_release(page);
+
+	mutex_lock(&super->s_dirop_mutex);
+	logfs_add_transaction(dir, ta);
+
+	ret = logfs_delete(dir, index, NULL);
+	if (!ret)
+		ret = write_inode(dir);
+
+	if (ret) {
+		abort_transaction(dir, ta);
+		printk(KERN_ERR"LOGFS: unable to delete inode\n");
+		goto out;
+	}
+
+	ta->state = UNLINK_2;
+	logfs_add_transaction(inode, ta);
+	ret = logfs_remove_inode(inode);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return ret;
+}
+
+static inline int logfs_empty_dir(struct inode *dir)
+{
+	u64 data;
+
+	data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
+	return data >= i_size_read(dir);
+}
+
+static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (!logfs_empty_dir(inode))
+		return -ENOTEMPTY;
+
+	return logfs_unlink(dir, dentry);
+}
+
+/* FIXME: readdir currently has it's own dir_walk code.  I don't see a good
+ * way to combine the two copies */
+static int logfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *dir = file_inode(file);
+	loff_t pos;
+	struct page *page;
+	struct logfs_disk_dentry *dd;
+
+	if (ctx->pos < 0)
+		return -EINVAL;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	pos = ctx->pos - 2;
+	BUG_ON(pos < 0);
+	for (;; pos++, ctx->pos++) {
+		bool full;
+		if (beyond_eof(dir, pos))
+			break;
+		if (!logfs_exist_block(dir, pos)) {
+			/* deleted dentry */
+			pos = dir_seek_data(dir, pos);
+			continue;
+		}
+		page = read_cache_page(dir->i_mapping, pos,
+				(filler_t *)logfs_readpage, NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		dd = kmap(page);
+		BUG_ON(dd->namelen == 0);
+
+		full = !dir_emit(ctx, (char *)dd->name,
+				be16_to_cpu(dd->namelen),
+				be64_to_cpu(dd->ino), dd->type);
+		kunmap(page);
+		page_cache_release(page);
+		if (full)
+			break;
+	}
+	return 0;
+}
+
+static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
+{
+	dd->namelen = cpu_to_be16(name->len);
+	memcpy(dd->name, name->name, name->len);
+}
+
+static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
+		unsigned int flags)
+{
+	struct page *page;
+	struct logfs_disk_dentry *dd;
+	pgoff_t index;
+	u64 ino = 0;
+	struct inode *inode;
+
+	page = logfs_get_dd_page(dir, dentry);
+	if (IS_ERR(page))
+		return ERR_CAST(page);
+	if (!page) {
+		d_add(dentry, NULL);
+		return NULL;
+	}
+	index = page->index;
+	dd = kmap_atomic(page);
+	ino = be64_to_cpu(dd->ino);
+	kunmap_atomic(dd);
+	page_cache_release(page);
+
+	inode = logfs_iget(dir->i_sb, ino);
+	if (IS_ERR(inode))
+		printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
+				ino, dir->i_ino, index);
+	return d_splice_alias(inode, dentry);
+}
+
+static void grow_dir(struct inode *dir, loff_t index)
+{
+	index = (index + 1) << dir->i_sb->s_blocksize_bits;
+	if (i_size_read(dir) < index)
+		i_size_write(dir, index);
+}
+
+static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
+		struct inode *inode)
+{
+	struct page *page;
+	struct logfs_disk_dentry *dd;
+	u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
+	pgoff_t index;
+	int round, err;
+
+	for (round = 0; round < 20; round++) {
+		index = hash_index(hash, round);
+
+		if (logfs_exist_block(dir, index))
+			continue;
+		page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+
+		dd = kmap_atomic(page);
+		memset(dd, 0, sizeof(*dd));
+		dd->ino = cpu_to_be64(inode->i_ino);
+		dd->type = logfs_type(inode);
+		logfs_set_name(dd, &dentry->d_name);
+		kunmap_atomic(dd);
+
+		err = logfs_write_buf(dir, page, WF_LOCK);
+		unlock_page(page);
+		page_cache_release(page);
+		if (!err)
+			grow_dir(dir, index);
+		return err;
+	}
+	/* FIXME: Is there a better return value?  In most cases neither
+	 * the filesystem nor the directory are full.  But we have had
+	 * too many collisions for this particular hash and no fallback.
+	 */
+	return -ENOSPC;
+}
+
+static int __logfs_create(struct inode *dir, struct dentry *dentry,
+		struct inode *inode, const char *dest, long destlen)
+{
+	struct logfs_super *super = logfs_super(dir->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_transaction *ta;
+	int ret;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta) {
+		drop_nlink(inode);
+		iput(inode);
+		return -ENOMEM;
+	}
+
+	ta->state = CREATE_1;
+	ta->ino = inode->i_ino;
+	mutex_lock(&super->s_dirop_mutex);
+	logfs_add_transaction(inode, ta);
+
+	if (dest) {
+		/* symlink */
+		ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
+		if (!ret)
+			ret = write_inode(inode);
+	} else {
+		/* creat/mkdir/mknod */
+		ret = write_inode(inode);
+	}
+	if (ret) {
+		abort_transaction(inode, ta);
+		li->li_flags |= LOGFS_IF_STILLBORN;
+		/* FIXME: truncate symlink */
+		drop_nlink(inode);
+		iput(inode);
+		goto out;
+	}
+
+	ta->state = CREATE_2;
+	logfs_add_transaction(dir, ta);
+	ret = logfs_write_dir(dir, dentry, inode);
+	/* sync directory */
+	if (!ret)
+		ret = write_inode(dir);
+
+	if (ret) {
+		logfs_del_transaction(dir, ta);
+		ta->state = CREATE_2;
+		logfs_add_transaction(inode, ta);
+		logfs_remove_inode(inode);
+		iput(inode);
+		goto out;
+	}
+	d_instantiate(dentry, inode);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return ret;
+}
+
+static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+
+	/*
+	 * FIXME: why do we have to fill in S_IFDIR, while the mode is
+	 * correct for mknod, creat, etc.?  Smells like the vfs *should*
+	 * do it for us but for some reason fails to do so.
+	 */
+	inode = logfs_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_dir_iops;
+	inode->i_fop = &logfs_dir_fops;
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool excl)
+{
+	struct inode *inode;
+
+	inode = logfs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_reg_iops;
+	inode->i_fop = &logfs_reg_fops;
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		dev_t rdev)
+{
+	struct inode *inode;
+
+	if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
+		return -ENAMETOOLONG;
+
+	inode = logfs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	init_special_inode(inode, mode, rdev);
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_symlink(struct inode *dir, struct dentry *dentry,
+		const char *target)
+{
+	struct inode *inode;
+	size_t destlen = strlen(target) + 1;
+
+	if (destlen > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+	inode = logfs_new_inode(dir, S_IFLNK | 0777);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_symlink_iops;
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+
+	return __logfs_create(dir, dentry, inode, target, destlen);
+}
+
+static int logfs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	ihold(inode);
+	inc_nlink(inode);
+	mark_inode_dirty_sync(inode);
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t *pos)
+{
+	struct page *page;
+	void *map;
+
+	page = logfs_get_dd_page(dir, dentry);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+	*pos = page->index;
+	map = kmap_atomic(page);
+	memcpy(dd, map, sizeof(*dd));
+	kunmap_atomic(map);
+	page_cache_release(page);
+	return 0;
+}
+
+static int logfs_delete_dd(struct inode *dir, loff_t pos)
+{
+	/*
+	 * Getting called with pos somewhere beyond eof is either a goofup
+	 * within this file or means someone maliciously edited the
+	 * (crc-protected) journal.
+	 */
+	BUG_ON(beyond_eof(dir, pos));
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
+	return logfs_delete(dir, pos, NULL);
+}
+
+/*
+ * Cross-directory rename, target does not exist.  Just a little nasty.
+ * Create a new dentry in the target dir, then remove the old dentry,
+ * all the while taking care to remember our operation in the journal.
+ */
+static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
+			      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct logfs_super *super = logfs_super(old_dir->i_sb);
+	struct logfs_disk_dentry dd;
+	struct logfs_transaction *ta;
+	loff_t pos;
+	int err;
+
+	/* 1. locate source dd */
+	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = CROSS_RENAME_1;
+	ta->dir = old_dir->i_ino;
+	ta->pos = pos;
+
+	/* 2. write target dd */
+	mutex_lock(&super->s_dirop_mutex);
+	logfs_add_transaction(new_dir, ta);
+	err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry));
+	if (!err)
+		err = write_inode(new_dir);
+
+	if (err) {
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		abort_transaction(new_dir, ta);
+		goto out;
+	}
+
+	/* 3. remove source dd */
+	ta->state = CROSS_RENAME_2;
+	logfs_add_transaction(old_dir, ta);
+	err = logfs_delete_dd(old_dir, pos);
+	if (!err)
+		err = write_inode(old_dir);
+	LOGFS_BUG_ON(err, old_dir->i_sb);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return err;
+}
+
+static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, struct inode *inode)
+{
+	loff_t pos;
+	int err;
+
+	err = logfs_get_dd(dir, dentry, dd, &pos);
+	if (err)
+		return err;
+	dd->ino = cpu_to_be64(inode->i_ino);
+	dd->type = logfs_type(inode);
+
+	err = write_dir(dir, dd, pos);
+	if (err)
+		return err;
+	log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
+			dd->name, be64_to_cpu(dd->ino));
+	return write_inode(dir);
+}
+
+/* Target dentry exists - the worst case.  We need to attach the source
+ * inode to the target dentry, then remove the orphaned target inode and
+ * source dentry.
+ */
+static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
+			       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct logfs_super *super = logfs_super(old_dir->i_sb);
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	int isdir = S_ISDIR(old_inode->i_mode);
+	struct logfs_disk_dentry dd;
+	struct logfs_transaction *ta;
+	loff_t pos;
+	int err;
+
+	BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
+	if (isdir) {
+		if (!logfs_empty_dir(new_inode))
+			return -ENOTEMPTY;
+	}
+
+	/* 1. locate source dd */
+	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = TARGET_RENAME_1;
+	ta->dir = old_dir->i_ino;
+	ta->pos = pos;
+	ta->ino = new_inode->i_ino;
+
+	/* 2. attach source inode to target dd */
+	mutex_lock(&super->s_dirop_mutex);
+	logfs_add_transaction(new_dir, ta);
+	err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
+	if (err) {
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		super->s_victim_ino = 0;
+		abort_transaction(new_dir, ta);
+		goto out;
+	}
+
+	/* 3. remove source dd */
+	ta->state = TARGET_RENAME_2;
+	logfs_add_transaction(old_dir, ta);
+	err = logfs_delete_dd(old_dir, pos);
+	if (!err)
+		err = write_inode(old_dir);
+	LOGFS_BUG_ON(err, old_dir->i_sb);
+
+	/* 4. remove target inode */
+	ta->state = TARGET_RENAME_3;
+	logfs_add_transaction(new_inode, ta);
+	err = logfs_remove_inode(new_inode);
+
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return err;
+}
+
+static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (d_really_is_positive(new_dentry))
+		return logfs_rename_target(old_dir, old_dentry,
+					   new_dir, new_dentry);
+	return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+/* No locking done here, as this is called before .get_sb() returns. */
+int logfs_replay_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	u64 ino, pos;
+	int err;
+
+	if (super->s_victim_ino) {
+		/* delete victim inode */
+		ino = super->s_victim_ino;
+		printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
+		inode = logfs_iget(sb, ino);
+		if (IS_ERR(inode))
+			goto fail;
+
+		LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
+		super->s_victim_ino = 0;
+		err = logfs_remove_inode(inode);
+		iput(inode);
+		if (err) {
+			super->s_victim_ino = ino;
+			goto fail;
+		}
+	}
+	if (super->s_rename_dir) {
+		/* delete old dd from rename */
+		ino = super->s_rename_dir;
+		pos = super->s_rename_pos;
+		printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
+				ino, pos);
+		inode = logfs_iget(sb, ino);
+		if (IS_ERR(inode))
+			goto fail;
+
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		err = logfs_delete_dd(inode, pos);
+		iput(inode);
+		if (err) {
+			super->s_rename_dir = ino;
+			super->s_rename_pos = pos;
+			goto fail;
+		}
+	}
+	return 0;
+fail:
+	LOGFS_BUG(sb);
+	return -EIO;
+}
+
+const struct inode_operations logfs_symlink_iops = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+};
+
+const struct inode_operations logfs_dir_iops = {
+	.create		= logfs_create,
+	.link		= logfs_link,
+	.lookup		= logfs_lookup,
+	.mkdir		= logfs_mkdir,
+	.mknod		= logfs_mknod,
+	.rename		= logfs_rename,
+	.rmdir		= logfs_rmdir,
+	.symlink	= logfs_symlink,
+	.unlink		= logfs_unlink,
+};
+const struct file_operations logfs_dir_fops = {
+	.fsync		= logfs_fsync,
+	.unlocked_ioctl	= logfs_ioctl,
+	.iterate	= logfs_readdir,
+	.read		= generic_read_dir,
+	.llseek		= default_llseek,
+};
diff --git a/kernel/fs/logfs/file.c b/kernel/fs/logfs/file.c
new file mode 100644
index 000000000..1a6f0167b
--- /dev/null
+++ b/kernel/fs/logfs/file.c
@@ -0,0 +1,285 @@
+/*
+ * fs/logfs/file.c	- prepare_write, commit_write and friends
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/writeback.h>
+
+static int logfs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+		return 0;
+	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+		unsigned end = start + len;
+
+		/* Reading beyond i_size is simple: memset to zero */
+		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+		return 0;
+	}
+	return logfs_readpage_nolock(page);
+}
+
+static int logfs_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied, struct page *page,
+		void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t index = page->index;
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned end = start + copied;
+	int ret = 0;
+
+	BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+	BUG_ON(page->index > I3_BLOCKS);
+
+	if (copied < len) {
+		/*
+		 * Short write of a non-initialized paged.  Just tell userspace
+		 * to retry the entire page.
+		 */
+		if (!PageUptodate(page)) {
+			copied = 0;
+			goto out;
+		}
+	}
+	if (copied == 0)
+		goto out; /* FIXME: do we need to update inode? */
+
+	if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
+		i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+		mark_inode_dirty_sync(inode);
+	}
+
+	SetPageUptodate(page);
+	if (!PageDirty(page)) {
+		if (!get_page_reserve(inode, page))
+			__set_page_dirty_nobuffers(page);
+		else
+			ret = logfs_write_buf(inode, page, WF_LOCK);
+	}
+out:
+	unlock_page(page);
+	page_cache_release(page);
+	return ret ? ret : copied;
+}
+
+int logfs_readpage(struct file *file, struct page *page)
+{
+	int ret;
+
+	ret = logfs_readpage_nolock(page);
+	unlock_page(page);
+	return ret;
+}
+
+/* Clear the page's dirty flag in the radix tree. */
+/* TODO: mucking with PageWriteback is silly.  Add a generic function to clear
+ * the dirty bit from the radix tree for filesystems that don't have to wait
+ * for page writeback to finish (i.e. any compressing filesystem).
+ */
+static void clear_radix_tree_dirty(struct page *page)
+{
+	BUG_ON(PagePrivate(page) || page->private);
+	set_page_writeback(page);
+	end_page_writeback(page);
+}
+
+static int __logfs_writepage(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int err;
+
+	err = logfs_write_buf(inode, page, WF_LOCK);
+	if (err)
+		set_page_dirty(page);
+	else
+		clear_radix_tree_dirty(page);
+	unlock_page(page);
+	return err;
+}
+
+static int logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	u64 bix;
+	level_t level;
+
+	log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
+			page);
+
+	logfs_unpack_index(page->index, &bix, &level);
+
+	/* Indirect blocks are never truncated */
+	if (level != 0)
+		return __logfs_writepage(page);
+
+	/*
+	 * TODO: everything below is a near-verbatim copy of nobh_writepage().
+	 * The relevant bits should be factored out after logfs is merged.
+	 */
+
+	/* Is the page fully inside i_size? */
+	if (bix < end_index)
+		return __logfs_writepage(page);
+
+	 /* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (bix > end_index || offset == 0) {
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invokation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the  page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	return __logfs_writepage(page);
+}
+
+static void logfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
+{
+	struct logfs_block *block = logfs_block(page);
+
+	if (block->reserved_bytes) {
+		struct super_block *sb = page->mapping->host->i_sb;
+		struct logfs_super *super = logfs_super(sb);
+
+		super->s_dirty_pages -= block->reserved_bytes;
+		block->ops->free_block(sb, block);
+		BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
+	} else
+		move_page_to_btree(page);
+	BUG_ON(PagePrivate(page) || page->private);
+}
+
+static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
+{
+	return 0; /* None of these are easy to release */
+}
+
+
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned int oldflags, flags;
+	int err;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *)arg);
+	case FS_IOC_SETFLAGS:
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		err = get_user(flags, (int __user *)arg);
+		if (err)
+			return err;
+
+		mutex_lock(&inode->i_mutex);
+		oldflags = li->li_flags;
+		flags &= LOGFS_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
+		li->li_flags = flags;
+		mutex_unlock(&inode->i_mutex);
+
+		inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty_sync(inode);
+		return 0;
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct super_block *sb = file->f_mapping->host->i_sb;
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+	logfs_get_wblocks(sb, NULL, WF_LOCK);
+	logfs_write_anchor(sb);
+	logfs_put_wblocks(sb, NULL, WF_LOCK);
+	mutex_unlock(&inode->i_mutex);
+
+	return 0;
+}
+
+static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int err = 0;
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		err = logfs_truncate(inode, attr->ia_size);
+		if (err)
+			return err;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+const struct inode_operations logfs_reg_iops = {
+	.setattr	= logfs_setattr,
+};
+
+const struct file_operations logfs_reg_fops = {
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.fsync		= logfs_fsync,
+	.unlocked_ioctl	= logfs_ioctl,
+	.llseek		= generic_file_llseek,
+	.mmap		= generic_file_readonly_mmap,
+	.open		= generic_file_open,
+};
+
+const struct address_space_operations logfs_reg_aops = {
+	.invalidatepage	= logfs_invalidatepage,
+	.readpage	= logfs_readpage,
+	.releasepage	= logfs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.writepage	= logfs_writepage,
+	.writepages	= generic_writepages,
+	.write_begin	= logfs_write_begin,
+	.write_end	= logfs_write_end,
+};
diff --git a/kernel/fs/logfs/gc.c b/kernel/fs/logfs/gc.c
new file mode 100644
index 000000000..d4efb061b
--- /dev/null
+++ b/kernel/fs/logfs/gc.c
@@ -0,0 +1,732 @@
+/*
+ * fs/logfs/gc.c	- garbage collection code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * Wear leveling needs to kick in when the difference between low erase
+ * counts and high erase counts gets too big.  A good value for "too big"
+ * may be somewhat below 10% of maximum erase count for the device.
+ * Why not 397, to pick a nice round number with no specific meaning? :)
+ *
+ * WL_RATELIMIT is the minimum time between two wear level events.  A huge
+ * number of segments may fulfil the requirements for wear leveling at the
+ * same time.  If that happens we don't want to cause a latency from hell,
+ * but just gently pick one segment every so often and minimize overhead.
+ */
+#define WL_DELTA 397
+#define WL_RATELIMIT 100
+#define MAX_OBJ_ALIASES	2600
+#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
+#define LIST_SIZE 64	/* base size of candidate lists */
+#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
+#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
+
+static int no_free_segments(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	return super->s_free_list.count;
+}
+
+/* journal has distance -1, top-most ifile layer distance 0 */
+static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u8 gc_level = (__force u8)__gc_level;
+
+	switch (gc_level) {
+	case 0: /* fall through */
+	case 1: /* fall through */
+	case 2: /* fall through */
+	case 3:
+		/* file data or indirect blocks */
+		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
+	case 6: /* fall through */
+	case 7: /* fall through */
+	case 8: /* fall through */
+	case 9:
+		/* inode file data or indirect blocks */
+		return super->s_ifile_levels - (gc_level - 6);
+	default:
+		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
+				gc_level);
+		WARN_ON(1);
+		return super->s_ifile_levels + super->s_iblock_levels;
+	}
+}
+
+static int segment_is_reserved(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area;
+	void *reserved;
+	int i;
+
+	/* Some segments are reserved.  Just pretend they were all valid */
+	reserved = btree_lookup32(&super->s_reserved_segments, segno);
+	if (reserved)
+		return 1;
+
+	/* Currently open segments */
+	for_each_area(i) {
+		area = super->s_area[i];
+		if (area->a_is_open && area->a_segno == segno)
+			return 1;
+	}
+
+	return 0;
+}
+
+static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	BUG();
+}
+
+/*
+ * Returns the bytes consumed by valid objects in this segment.  Object headers
+ * are counted, the segment header is not.
+ */
+static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
+		gc_level_t *gc_level)
+{
+	struct logfs_segment_entry se;
+	u32 ec_level;
+
+	logfs_get_segment_entry(sb, segno, &se);
+	if (se.ec_level == cpu_to_be32(BADSEG) ||
+			se.valid == cpu_to_be32(RESERVED))
+		return RESERVED;
+
+	ec_level = be32_to_cpu(se.ec_level);
+	*ec = ec_level >> 4;
+	*gc_level = GC_LEVEL(ec_level & 0xf);
+	return be32_to_cpu(se.valid);
+}
+
+static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
+		u64 bix, gc_level_t gc_level)
+{
+	struct inode *inode;
+	int err, cookie;
+
+	inode = logfs_safe_iget(sb, ino, &cookie);
+	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
+	BUG_ON(err);
+	logfs_safe_iput(inode, cookie);
+}
+
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, logical_segno, cleaned = 0;
+	int err, len, valid;
+	gc_level_t gc_level;
+
+	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
+
+	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
+	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
+	BUG_ON(err);
+	gc_level = GC_LEVEL(sh.level);
+	logical_segno = be32_to_cpu(sh.segno);
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+		logfs_mark_segment_bad(sb, segno);
+		cleaned = -1;
+		goto out;
+	}
+
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize; ) {
+		ofs = dev_ofs(sb, logical_segno, seg_ofs);
+		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
+				&oh);
+		BUG_ON(err);
+
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+			logfs_mark_segment_bad(sb, segno);
+			cleaned = super->s_segsize - 1;
+			goto out;
+		}
+
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		len = sizeof(oh) + be16_to_cpu(oh.len);
+		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
+		if (valid == 1) {
+			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
+			cleaned += len;
+		} else if (valid == 2) {
+			/* Will be invalid upon journal commit */
+			cleaned += len;
+		}
+		seg_ofs += len;
+	}
+out:
+	btree_remove32(&super->s_reserved_segments, segno);
+	return cleaned;
+}
+
+static struct gc_candidate *add_list(struct gc_candidate *cand,
+		struct candidate_list *list)
+{
+	struct rb_node **p = &list->rb_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct gc_candidate *cur;
+	int comp;
+
+	cand->list = list;
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(parent, struct gc_candidate, rb_node);
+
+		if (list->sort_by_ec)
+			comp = cand->erase_count < cur->erase_count;
+		else
+			comp = cand->valid < cur->valid;
+
+		if (comp)
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+	rb_link_node(&cand->rb_node, parent, p);
+	rb_insert_color(&cand->rb_node, &list->rb_tree);
+
+	if (list->count <= list->maxcount) {
+		list->count++;
+		return NULL;
+	}
+	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
+	rb_erase(&cand->rb_node, &list->rb_tree);
+	cand->list = NULL;
+	return cand;
+}
+
+static void remove_from_list(struct gc_candidate *cand)
+{
+	struct candidate_list *list = cand->list;
+
+	rb_erase(&cand->rb_node, &list->rb_tree);
+	list->count--;
+}
+
+static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_remove32(&super->s_cand_tree, cand->segno);
+	kfree(cand);
+}
+
+u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
+{
+	struct gc_candidate *cand;
+	u32 segno;
+
+	BUG_ON(list->count == 0);
+
+	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
+	remove_from_list(cand);
+	segno = cand->segno;
+	if (ec)
+		*ec = cand->erase_count;
+	free_candidate(sb, cand);
+	return segno;
+}
+
+/*
+ * We have several lists to manage segments with.  The reserve_list is used to
+ * deal with bad blocks.  We try to keep the best (lowest ec) segments on this
+ * list.
+ * The free_list contains free segments for normal usage.  It usually gets the
+ * second pick after the reserve_list.  But when the free_list is running short
+ * it is more important to keep the free_list full than to keep a reserve.
+ *
+ * Segments that are not free are put onto a per-level low_list.  If we have
+ * to run garbage collection, we pick a candidate from there.  All segments on
+ * those lists should have at least some free space so GC will make progress.
+ *
+ * And last we have the ec_list, which is used to pick segments for wear
+ * leveling.
+ *
+ * If all appropriate lists are full, we simply free the candidate and forget
+ * about that segment for a while.  We have better candidates for each purpose.
+ */
+static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
+
+	if (cand->valid == 0) {
+		/* 100% free segments */
+		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
+				cand->segno, cand->erase_count,
+				dev_ofs(sb, cand->segno, 0));
+		cand = add_list(cand, &super->s_reserve_list);
+		if (cand) {
+			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
+					cand->segno, cand->erase_count,
+					dev_ofs(sb, cand->segno, 0));
+			cand = add_list(cand, &super->s_free_list);
+		}
+	} else {
+		/* good candidates for Garbage Collection */
+		if (cand->valid < full)
+			cand = add_list(cand, &super->s_low_list[cand->dist]);
+		/* good candidates for wear leveling,
+		 * segments that were recently written get ignored */
+		if (cand)
+			cand = add_list(cand, &super->s_ec_list);
+	}
+	if (cand)
+		free_candidate(sb, cand);
+}
+
+static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
+		u8 dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+
+	cand = kmalloc(sizeof(*cand), GFP_NOFS);
+	if (!cand)
+		return -ENOMEM;
+
+	cand->segno = segno;
+	cand->valid = valid;
+	cand->erase_count = ec;
+	cand->dist = dist;
+
+	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
+	__add_candidate(sb, cand);
+	return 0;
+}
+
+static void remove_segment_from_lists(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+
+	cand = btree_lookup32(&super->s_cand_tree, segno);
+	if (cand) {
+		remove_from_list(cand);
+		free_candidate(sb, cand);
+	}
+}
+
+static void scan_segment(struct super_block *sb, u32 segno)
+{
+	u32 valid, ec = 0;
+	gc_level_t gc_level = 0;
+	u8 dist;
+
+	if (segment_is_reserved(sb, segno))
+		return;
+
+	remove_segment_from_lists(sb, segno);
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	if (valid == RESERVED)
+		return;
+
+	dist = root_distance(sb, gc_level);
+	add_candidate(sb, segno, valid, ec, dist);
+}
+
+static struct gc_candidate *first_in_list(struct candidate_list *list)
+{
+	if (list->count == 0)
+		return NULL;
+	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
+}
+
+/*
+ * Find the best segment for garbage collection.  Main criterion is
+ * the segment requiring the least effort to clean.  Secondary
+ * criterion is to GC on the lowest level available.
+ *
+ * So we search the least effort segment on the lowest level first,
+ * then move up and pick another segment iff is requires significantly
+ * less effort.  Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
+ */
+static struct gc_candidate *get_candidate(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i, max_dist;
+	struct gc_candidate *cand = NULL, *this;
+
+	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
+
+	for (i = max_dist; i >= 0; i--) {
+		this = first_in_list(&super->s_low_list[i]);
+		if (!this)
+			continue;
+		if (!cand)
+			cand = this;
+		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
+			cand = this;
+	}
+	return cand;
+}
+
+static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	gc_level_t gc_level;
+	u32 cleaned, valid, segno, ec;
+	u8 dist;
+
+	if (!cand) {
+		log_gc("GC attempted, but no candidate found\n");
+		return 0;
+	}
+
+	segno = cand->segno;
+	dist = cand->dist;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	free_candidate(sb, cand);
+	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
+			segno, (u64)segno << super->s_segshift,
+			dist, no_free_segments(sb), valid,
+			super->s_free_bytes);
+	cleaned = logfs_gc_segment(sb, segno);
+	log_gc("GC segment #%02x complete - now %x valid\n", segno,
+			valid - cleaned);
+	BUG_ON(cleaned != valid);
+	return 1;
+}
+
+static int logfs_gc_once(struct super_block *sb)
+{
+	struct gc_candidate *cand;
+
+	cand = get_candidate(sb);
+	if (cand)
+		remove_from_list(cand);
+	return __logfs_gc_once(sb, cand);
+}
+
+/* returns 1 if a wrap occurs, 0 otherwise */
+static int logfs_scan_some(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno;
+	int i, ret = 0;
+
+	segno = super->s_sweeper;
+	for (i = SCAN_RATIO; i > 0; i--) {
+		segno++;
+		if (segno >= super->s_no_segs) {
+			segno = 0;
+			ret = 1;
+			/* Break out of the loop.  We want to read a single
+			 * block from the segment size on next invocation if
+			 * SCAN_RATIO is set to match block size
+			 */
+			break;
+		}
+
+		scan_segment(sb, segno);
+	}
+	super->s_sweeper = segno;
+	return ret;
+}
+
+/*
+ * In principle, this function should loop forever, looking for GC candidates
+ * and moving data.  LogFS is designed in such a way that this loop is
+ * guaranteed to terminate.
+ *
+ * Limiting the loop to some iterations serves purely to catch cases when
+ * these guarantees have failed.  An actual endless loop is an obvious bug
+ * and should be reported as such.
+ */
+static void __logfs_gc_pass(struct super_block *sb, int target)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	int round, progress, last_progress = 0;
+
+	/*
+	 * Doing too many changes to the segfile at once would result
+	 * in a large number of aliases.  Write the journal before
+	 * things get out of hand.
+	 */
+	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
+		logfs_write_anchor(sb);
+
+	if (no_free_segments(sb) >= target &&
+			super->s_no_object_aliases < MAX_OBJ_ALIASES)
+		return;
+
+	log_gc("__logfs_gc_pass(%x)\n", target);
+	for (round = 0; round < SCAN_ROUNDS; ) {
+		if (no_free_segments(sb) >= target)
+			goto write_alias;
+
+		/* Sync in-memory state with on-medium state in case they
+		 * diverged */
+		logfs_write_anchor(sb);
+		round += logfs_scan_some(sb);
+		if (no_free_segments(sb) >= target)
+			goto write_alias;
+		progress = logfs_gc_once(sb);
+		if (progress)
+			last_progress = round;
+		else if (round - last_progress > 2)
+			break;
+		continue;
+
+		/*
+		 * The goto logic is nasty, I just don't know a better way to
+		 * code it.  GC is supposed to ensure two things:
+		 * 1. Enough free segments are available.
+		 * 2. The number of aliases is bounded.
+		 * When 1. is achieved, we take a look at 2. and write back
+		 * some alias-containing blocks, if necessary.  However, after
+		 * each such write we need to go back to 1., as writes can
+		 * consume free segments.
+		 */
+write_alias:
+		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
+			return;
+		if (list_empty(&super->s_object_alias)) {
+			/* All aliases are still in btree */
+			return;
+		}
+		log_gc("Write back one alias\n");
+		block = list_entry(super->s_object_alias.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+		/*
+		 * To round off the nasty goto logic, we reset round here.  It
+		 * is a safety-net for GC not making any progress and limited
+		 * to something reasonably small.  If incremented it for every
+		 * single alias, the loop could terminate rather quickly.
+		 */
+		round = 0;
+	}
+	LOGFS_BUG(sb);
+}
+
+static int wl_ratelimit(struct super_block *sb, u64 *next_event)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (*next_event < super->s_gec) {
+		*next_event = super->s_gec + WL_RATELIMIT;
+		return 0;
+	}
+	return 1;
+}
+
+static void logfs_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *wl_cand, *free_cand;
+
+	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
+		return;
+
+	wl_cand = first_in_list(&super->s_ec_list);
+	if (!wl_cand)
+		return;
+	free_cand = first_in_list(&super->s_free_list);
+	if (!free_cand)
+		return;
+
+	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
+		remove_from_list(wl_cand);
+		__logfs_gc_once(sb, wl_cand);
+	}
+}
+
+/*
+ * The journal needs wear leveling as well.  But moving the journal is an
+ * expensive operation so we try to avoid it as much as possible.  And if we
+ * have to do it, we move the whole journal, not individual segments.
+ *
+ * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
+ * calculations.  First we check whether moving the journal would be a
+ * significant improvement.  That means that a) the current journal segments
+ * have more wear than the future journal segments and b) the current journal
+ * segments have more wear than normal ostore segments.
+ * Rationale for b) is that we don't have to move the journal if it is aging
+ * less than the ostore, even if the reserve segments age even less (they are
+ * excluded from wear leveling, after all).
+ * Next we check that the superblocks have less wear than the journal.  Since
+ * moving the journal requires writing the superblocks, we have to protect the
+ * superblocks even more than the journal.
+ *
+ * Also we double the acceptable wear difference, compared to ostore wear
+ * leveling.  Journal data is read and rewritten rapidly, comparatively.  So
+ * soft errors have much less time to accumulate and we allow the journal to
+ * be a bit worse than the ostore.
+ */
+static void logfs_journal_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+	u32 min_journal_ec = -1, max_reserve_ec = 0;
+	int i;
+
+	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
+		return;
+
+	if (super->s_reserve_list.count < super->s_no_journal_segs) {
+		/* Reserve is not full enough to move complete journal */
+		return;
+	}
+
+	journal_for_each(i)
+		if (super->s_journal_seg[i])
+			min_journal_ec = min(min_journal_ec,
+					super->s_journal_ec[i]);
+	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
+			struct gc_candidate, rb_node);
+	max_reserve_ec = cand->erase_count;
+	for (i = 0; i < 2; i++) {
+		struct logfs_segment_entry se;
+		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
+		u32 ec;
+
+		logfs_get_segment_entry(sb, segno, &se);
+		ec = be32_to_cpu(se.ec_level) >> 4;
+		max_reserve_ec = max(max_reserve_ec, ec);
+	}
+
+	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
+		do_logfs_journal_wl_pass(sb);
+	}
+}
+
+void logfs_gc_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
+	/* Write journal before free space is getting saturated with dirty
+	 * objects.
+	 */
+	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
+			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
+		logfs_write_anchor(sb);
+	__logfs_gc_pass(sb, super->s_total_levels);
+	logfs_wl_pass(sb);
+	logfs_journal_wl_pass(sb);
+}
+
+static int check_area(struct super_block *sb, int i)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[i];
+	gc_level_t gc_level;
+	u32 cleaned, valid, ec;
+	u32 segno = area->a_segno;
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+
+	if (!area->a_is_open)
+		return 0;
+
+	if (super->s_devops->can_write_buf(sb, ofs) == 0)
+		return 0;
+
+	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
+	/*
+	 * The device cannot write back the write buffer.  Most likely the
+	 * wbuf was already written out and the system crashed at some point
+	 * before the journal commit happened.  In that case we wouldn't have
+	 * to do anything.  But if the crash happened before the wbuf was
+	 * written out correctly, we must GC this segment.  So assume the
+	 * worst and always do the GC run.
+	 */
+	area->a_is_open = 0;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	cleaned = logfs_gc_segment(sb, segno);
+	if (cleaned != valid)
+		return -EIO;
+	return 0;
+}
+
+int logfs_check_areas(struct super_block *sb)
+{
+	int i, err;
+
+	for_each_area(i) {
+		err = check_area(sb, i);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void logfs_init_candlist(struct candidate_list *list, int maxcount,
+		int sort_by_ec)
+{
+	list->count = 0;
+	list->maxcount = maxcount;
+	list->sort_by_ec = sort_by_ec;
+	list->rb_tree = RB_ROOT;
+}
+
+int logfs_init_gc(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
+	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
+	logfs_init_candlist(&super->s_reserve_list,
+			super->s_bad_seg_reserve, 1);
+	for_each_area(i)
+		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
+	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
+	return 0;
+}
+
+static void logfs_cleanup_list(struct super_block *sb,
+		struct candidate_list *list)
+{
+	struct gc_candidate *cand;
+
+	while (list->count) {
+		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
+				rb_node);
+		remove_from_list(cand);
+		free_candidate(sb, cand);
+	}
+	BUG_ON(list->rb_tree.rb_node);
+}
+
+void logfs_cleanup_gc(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	if (!super->s_free_list.count)
+		return;
+
+	/*
+	 * FIXME: The btree may still contain a single empty node.  So we
+	 * call the grim visitor to clean up that mess.  Btree code should
+	 * do it for us, really.
+	 */
+	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
+	logfs_cleanup_list(sb, &super->s_free_list);
+	logfs_cleanup_list(sb, &super->s_reserve_list);
+	for_each_area(i)
+		logfs_cleanup_list(sb, &super->s_low_list[i]);
+	logfs_cleanup_list(sb, &super->s_ec_list);
+}
diff --git a/kernel/fs/logfs/inode.c b/kernel/fs/logfs/inode.c
new file mode 100644
index 000000000..af49e2d69
--- /dev/null
+++ b/kernel/fs/logfs/inode.c
@@ -0,0 +1,426 @@
+/*
+ * fs/logfs/inode.c	- inode handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+/*
+ * How soon to reuse old inode numbers?  LogFS doesn't store deleted inodes
+ * on the medium.  It therefore also lacks a method to store the previous
+ * generation number for deleted inodes.  Instead a single generation number
+ * is stored which will be used for new inodes.  Being just a 32bit counter,
+ * this can obvious wrap relatively quickly.  So we only reuse inodes if we
+ * know that a fair number of inodes can be created before we have to increment
+ * the generation again - effectively adding some bits to the counter.
+ * But being too aggressive here means we keep a very large and very sparse
+ * inode file, wasting space on indirect blocks.
+ * So what is a good value?  Beats me.  64k seems moderately bad on both
+ * fronts, so let's use that for now...
+ *
+ * NFS sucks, as everyone already knows.
+ */
+#define INOS_PER_WRAP (0x10000)
+
+/*
+ * Logfs' requirement to read inodes for garbage collection makes life a bit
+ * harder.  GC may have to read inodes that are in I_FREEING state, when they
+ * are being written out - and waiting for GC to make progress, naturally.
+ *
+ * So we cannot just call iget() or some variant of it, but first have to check
+ * whether the inode in question might be in I_FREEING state.  Therefore we
+ * maintain our own per-sb list of "almost deleted" inodes and check against
+ * that list first.  Normally this should be at most 1-2 entries long.
+ *
+ * Also, inodes have logfs-specific reference counting on top of what the vfs
+ * does.  When .destroy_inode is called, normally the reference count will drop
+ * to zero and the inode gets deleted.  But if GC accessed the inode, its
+ * refcount will remain nonzero and final deletion will have to wait.
+ *
+ * As a result we have two sets of functions to get/put inodes:
+ * logfs_safe_iget/logfs_safe_iput	- safe to call from GC context
+ * logfs_iget/iput			- normal version
+ */
+static struct kmem_cache *logfs_inode_cache;
+
+static DEFINE_SPINLOCK(logfs_inode_lock);
+
+static void logfs_inode_setops(struct inode *inode)
+{
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_op = &logfs_dir_iops;
+		inode->i_fop = &logfs_dir_fops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	case S_IFREG:
+		inode->i_op = &logfs_reg_iops;
+		inode->i_fop = &logfs_reg_fops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	case S_IFLNK:
+		inode->i_op = &logfs_symlink_iops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	case S_IFSOCK:	/* fall through */
+	case S_IFBLK:	/* fall through */
+	case S_IFCHR:	/* fall through */
+	case S_IFIFO:
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
+{
+	struct inode *inode = iget_locked(sb, ino);
+	int err;
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = logfs_read_inode(inode);
+	if (err || inode->i_nlink == 0) {
+		/* inode->i_nlink == 0 can be true when called from
+		 * block validator */
+		/* set i_nlink to 0 to prevent caching */
+		clear_nlink(inode);
+		logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
+		iget_failed(inode);
+		if (!err)
+			err = -ENOENT;
+		return ERR_PTR(err);
+	}
+
+	logfs_inode_setops(inode);
+	unlock_new_inode(inode);
+	return inode;
+}
+
+struct inode *logfs_iget(struct super_block *sb, ino_t ino)
+{
+	BUG_ON(ino == LOGFS_INO_MASTER);
+	BUG_ON(ino == LOGFS_INO_SEGFILE);
+	return __logfs_iget(sb, ino);
+}
+
+/*
+ * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
+ * this allows logfs_iput to do the right thing later
+ */
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_inode *li;
+
+	if (ino == LOGFS_INO_MASTER)
+		return super->s_master_inode;
+	if (ino == LOGFS_INO_SEGFILE)
+		return super->s_segfile_inode;
+
+	spin_lock(&logfs_inode_lock);
+	list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
+		if (li->vfs_inode.i_ino == ino) {
+			li->li_refcount++;
+			spin_unlock(&logfs_inode_lock);
+			*is_cached = 1;
+			return &li->vfs_inode;
+		}
+	spin_unlock(&logfs_inode_lock);
+
+	*is_cached = 0;
+	return __logfs_iget(sb, ino);
+}
+
+static void logfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
+}
+
+static void __logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	BUG_ON(li->li_block);
+	list_del(&li->li_freeing_list);
+	call_rcu(&inode->i_rcu, logfs_i_callback);
+}
+
+static void __logfs_destroy_meta_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	BUG_ON(li->li_block);
+	call_rcu(&inode->i_rcu, logfs_i_callback);
+}
+
+static void logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (inode->i_ino < LOGFS_RESERVED_INOS) {
+		/*
+		 * The reserved inodes are never destroyed unless we are in
+		 * unmont path.
+		 */
+		__logfs_destroy_meta_inode(inode);
+		return;
+	}
+
+	BUG_ON(list_empty(&li->li_freeing_list));
+	spin_lock(&logfs_inode_lock);
+	li->li_refcount--;
+	if (li->li_refcount == 0)
+		__logfs_destroy_inode(inode);
+	spin_unlock(&logfs_inode_lock);
+}
+
+void logfs_safe_iput(struct inode *inode, int is_cached)
+{
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		return;
+	if (inode->i_ino == LOGFS_INO_SEGFILE)
+		return;
+
+	if (is_cached) {
+		logfs_destroy_inode(inode);
+		return;
+	}
+
+	iput(inode);
+}
+
+static void logfs_init_inode(struct super_block *sb, struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	li->li_flags	= 0;
+	li->li_height	= 0;
+	li->li_used_bytes = 0;
+	li->li_block	= NULL;
+	i_uid_write(inode, 0);
+	i_gid_write(inode, 0);
+	inode->i_size	= 0;
+	inode->i_blocks	= 0;
+	inode->i_ctime	= CURRENT_TIME;
+	inode->i_mtime	= CURRENT_TIME;
+	li->li_refcount = 1;
+	INIT_LIST_HEAD(&li->li_freeing_list);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+
+	return;
+}
+
+static struct inode *logfs_alloc_inode(struct super_block *sb)
+{
+	struct logfs_inode *li;
+
+	li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
+	if (!li)
+		return NULL;
+	logfs_init_inode(sb, &li->vfs_inode);
+	return &li->vfs_inode;
+}
+
+/*
+ * In logfs inodes are written to an inode file.  The inode file, like any
+ * other file, is managed with a inode.  The inode file's inode, aka master
+ * inode, requires special handling in several respects.  First, it cannot be
+ * written to the inode file, so it is stored in the journal instead.
+ *
+ * Secondly, this inode cannot be written back and destroyed before all other
+ * inodes have been written.  The ordering is important.  Linux' VFS is happily
+ * unaware of the ordering constraint and would ordinarily destroy the master
+ * inode at umount time while other inodes are still in use and dirty.  Not
+ * good.
+ *
+ * So logfs makes sure the master inode is not written until all other inodes
+ * have been destroyed.  Sadly, this method has another side-effect.  The VFS
+ * will notice one remaining inode and print a frightening warning message.
+ * Worse, it is impossible to judge whether such a warning was caused by the
+ * master inode or any other inodes have leaked as well.
+ *
+ * Our attempt of solving this is with logfs_new_meta_inode() below.  Its
+ * purpose is to create a new inode that will not trigger the warning if such
+ * an inode is still in use.  An ugly hack, no doubt.  Suggections for
+ * improvement are welcome.
+ *
+ * AV: that's what ->put_super() is for...
+ */
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_mode = S_IFREG;
+	inode->i_ino = ino;
+	inode->i_data.a_ops = &logfs_reg_aops;
+	mapping_set_gfp_mask(&inode->i_data, GFP_NOFS);
+
+	return inode;
+}
+
+struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
+{
+	struct inode *inode;
+	int err;
+
+	inode = logfs_new_meta_inode(sb, ino);
+	if (IS_ERR(inode))
+		return inode;
+
+	err = logfs_read_inode(inode);
+	if (err) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	logfs_inode_setops(inode);
+	return inode;
+}
+
+static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret;
+	long flags = WF_LOCK;
+
+	/* Can only happen if creat() failed.  Safe to skip. */
+	if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
+		return 0;
+
+	ret = __logfs_write_inode(inode, NULL, flags);
+	LOGFS_BUG_ON(ret, inode->i_sb);
+	return ret;
+}
+
+/* called with inode->i_lock held */
+static int logfs_drop_inode(struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+
+	spin_lock(&logfs_inode_lock);
+	list_move(&li->li_freeing_list, &super->s_freeing_list);
+	spin_unlock(&logfs_inode_lock);
+	return generic_drop_inode(inode);
+}
+
+static void logfs_set_ino_generation(struct super_block *sb,
+		struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 ino;
+
+	mutex_lock(&super->s_journal_mutex);
+	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
+	super->s_last_ino = ino;
+	super->s_inos_till_wrap--;
+	if (super->s_inos_till_wrap < 0) {
+		super->s_last_ino = LOGFS_RESERVED_INOS;
+		super->s_generation++;
+		super->s_inos_till_wrap = INOS_PER_WRAP;
+	}
+	inode->i_ino = ino;
+	inode->i_generation = super->s_generation;
+	mutex_unlock(&super->s_journal_mutex);
+}
+
+struct inode *logfs_new_inode(struct inode *dir, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	logfs_init_inode(sb, inode);
+
+	/* inherit parent flags */
+	logfs_inode(inode)->li_flags |=
+		logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
+
+	inode->i_mode = mode;
+	logfs_set_ino_generation(sb, inode);
+
+	inode_init_owner(inode, dir, mode);
+	logfs_inode_setops(inode);
+	insert_inode_hash(inode);
+
+	return inode;
+}
+
+static void logfs_init_once(void *_li)
+{
+	struct logfs_inode *li = _li;
+	int i;
+
+	li->li_flags = 0;
+	li->li_used_bytes = 0;
+	li->li_refcount = 1;
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+	inode_init_once(&li->vfs_inode);
+}
+
+static int logfs_sync_fs(struct super_block *sb, int wait)
+{
+	logfs_get_wblocks(sb, NULL, WF_LOCK);
+	logfs_write_anchor(sb);
+	logfs_put_wblocks(sb, NULL, WF_LOCK);
+	return 0;
+}
+
+static void logfs_put_super(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	/* kill the meta-inodes */
+	iput(super->s_segfile_inode);
+	iput(super->s_master_inode);
+	iput(super->s_mapping_inode);
+}
+
+const struct super_operations logfs_super_operations = {
+	.alloc_inode	= logfs_alloc_inode,
+	.destroy_inode	= logfs_destroy_inode,
+	.evict_inode	= logfs_evict_inode,
+	.drop_inode	= logfs_drop_inode,
+	.put_super	= logfs_put_super,
+	.write_inode	= logfs_write_inode,
+	.statfs		= logfs_statfs,
+	.sync_fs	= logfs_sync_fs,
+};
+
+int logfs_init_inode_cache(void)
+{
+	logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
+			sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+			logfs_init_once);
+	if (!logfs_inode_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void logfs_destroy_inode_cache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(logfs_inode_cache);
+}
diff --git a/kernel/fs/logfs/journal.c b/kernel/fs/logfs/journal.c
new file mode 100644
index 000000000..2a09b8d73
--- /dev/null
+++ b/kernel/fs/logfs/journal.c
@@ -0,0 +1,894 @@
+/*
+ * fs/logfs/journal.c	- journal handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/slab.h>
+
+static void logfs_calc_free(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 reserve, no_segs = super->s_no_segs;
+	s64 free;
+	int i;
+
+	/* superblock segments */
+	no_segs -= 2;
+	super->s_no_journal_segs = 0;
+	/* journal */
+	journal_for_each(i)
+		if (super->s_journal_seg[i]) {
+			no_segs--;
+			super->s_no_journal_segs++;
+		}
+
+	/* open segments plus one extra per level for GC */
+	no_segs -= 2 * super->s_total_levels;
+
+	free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
+	free -= super->s_used_bytes;
+	/* just a bit extra */
+	free -= super->s_total_levels * 4096;
+
+	/* Bad blocks are 'paid' for with speed reserve - the filesystem
+	 * simply gets slower as bad blocks accumulate.  Until the bad blocks
+	 * exceed the speed reserve - then the filesystem gets smaller.
+	 */
+	reserve = super->s_bad_segments + super->s_bad_seg_reserve;
+	reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
+	reserve = max(reserve, super->s_speed_reserve);
+	free -= reserve;
+	if (free < 0)
+		free = 0;
+
+	super->s_free_bytes = free;
+}
+
+static void reserve_sb_and_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head32 *head = &super->s_reserved_segments;
+	int i, err;
+
+	err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
+			GFP_KERNEL);
+	BUG_ON(err);
+
+	err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
+			GFP_KERNEL);
+	BUG_ON(err);
+
+	journal_for_each(i) {
+		if (!super->s_journal_seg[i])
+			continue;
+		err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
+				GFP_KERNEL);
+		BUG_ON(err);
+	}
+}
+
+static void read_dynsb(struct super_block *sb,
+		struct logfs_je_dynsb *dynsb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	super->s_gec		= be64_to_cpu(dynsb->ds_gec);
+	super->s_sweeper	= be64_to_cpu(dynsb->ds_sweeper);
+	super->s_victim_ino	= be64_to_cpu(dynsb->ds_victim_ino);
+	super->s_rename_dir	= be64_to_cpu(dynsb->ds_rename_dir);
+	super->s_rename_pos	= be64_to_cpu(dynsb->ds_rename_pos);
+	super->s_used_bytes	= be64_to_cpu(dynsb->ds_used_bytes);
+	super->s_generation	= be32_to_cpu(dynsb->ds_generation);
+}
+
+static void read_anchor(struct super_block *sb,
+		struct logfs_je_anchor *da)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	super->s_last_ino = be64_to_cpu(da->da_last_ino);
+	li->li_flags	= 0;
+	li->li_height	= da->da_height;
+	i_size_write(inode, be64_to_cpu(da->da_size));
+	li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = be64_to_cpu(da->da_data[i]);
+}
+
+static void read_erasecount(struct super_block *sb,
+		struct logfs_je_journal_ec *ec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	journal_for_each(i)
+		super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
+}
+
+static int read_area(struct super_block *sb, struct logfs_je_area *a)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[a->gc_level];
+	u64 ofs;
+	u32 writemask = ~(super->s_writesize - 1);
+
+	if (a->gc_level >= LOGFS_NO_AREAS)
+		return -EIO;
+	if (a->vim != VIM_DEFAULT)
+		return -EIO; /* TODO: close area and continue */
+
+	area->a_used_bytes = be32_to_cpu(a->used_bytes);
+	area->a_written_bytes = area->a_used_bytes & writemask;
+	area->a_segno = be32_to_cpu(a->segno);
+	if (area->a_segno)
+		area->a_is_open = 1;
+
+	ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+	if (super->s_writesize > 1)
+		return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
+	else
+		return logfs_buf_recover(area, ofs, NULL, 0);
+}
+
+static void *unpack(void *from, void *to)
+{
+	struct logfs_journal_header *jh = from;
+	void *data = from + sizeof(struct logfs_journal_header);
+	int err;
+	size_t inlen, outlen;
+
+	inlen = be16_to_cpu(jh->h_len);
+	outlen = be16_to_cpu(jh->h_datalen);
+
+	if (jh->h_compr == COMPR_NONE)
+		memcpy(to, data, inlen);
+	else {
+		err = logfs_uncompress(data, to, inlen, outlen);
+		BUG_ON(err);
+	}
+	return to;
+}
+
+static int __read_je_header(struct super_block *sb, u64 ofs,
+		struct logfs_journal_header *jh)
+{
+	struct logfs_super *super = logfs_super(sb);
+	size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
+		+ MAX_JOURNAL_HEADER;
+	u16 type, len, datalen;
+	int err;
+
+	/* read header only */
+	err = wbuf_read(sb, ofs, sizeof(*jh), jh);
+	if (err)
+		return err;
+	type = be16_to_cpu(jh->h_type);
+	len = be16_to_cpu(jh->h_len);
+	datalen = be16_to_cpu(jh->h_datalen);
+	if (len > sb->s_blocksize)
+		return -EIO;
+	if ((type < JE_FIRST) || (type > JE_LAST))
+		return -EIO;
+	if (datalen > bufsize)
+		return -EIO;
+	return 0;
+}
+
+static int __read_je_payload(struct super_block *sb, u64 ofs,
+		struct logfs_journal_header *jh)
+{
+	u16 len;
+	int err;
+
+	len = be16_to_cpu(jh->h_len);
+	err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
+	if (err)
+		return err;
+	if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
+		/* Old code was confused.  It forgot about the header length
+		 * and stopped calculating the crc 16 bytes before the end
+		 * of data - ick!
+		 * FIXME: Remove this hack once the old code is fixed.
+		 */
+		if (jh->h_crc == logfs_crc32(jh, len, 4))
+			WARN_ON_ONCE(1);
+		else
+			return -EIO;
+	}
+	return 0;
+}
+
+/*
+ * jh needs to be large enough to hold the complete entry, not just the header
+ */
+static int __read_je(struct super_block *sb, u64 ofs,
+		struct logfs_journal_header *jh)
+{
+	int err;
+
+	err = __read_je_header(sb, ofs, jh);
+	if (err)
+		return err;
+	return __read_je_payload(sb, ofs, jh);
+}
+
+static int read_je(struct super_block *sb, u64 ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_journal_header *jh = super->s_compressed_je;
+	void *scratch = super->s_je;
+	u16 type, datalen;
+	int err;
+
+	err = __read_je(sb, ofs, jh);
+	if (err)
+		return err;
+	type = be16_to_cpu(jh->h_type);
+	datalen = be16_to_cpu(jh->h_datalen);
+
+	switch (type) {
+	case JE_DYNSB:
+		read_dynsb(sb, unpack(jh, scratch));
+		break;
+	case JE_ANCHOR:
+		read_anchor(sb, unpack(jh, scratch));
+		break;
+	case JE_ERASECOUNT:
+		read_erasecount(sb, unpack(jh, scratch));
+		break;
+	case JE_AREA:
+		err = read_area(sb, unpack(jh, scratch));
+		break;
+	case JE_OBJ_ALIAS:
+		err = logfs_load_object_aliases(sb, unpack(jh, scratch),
+				datalen);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+	return err;
+}
+
+static int logfs_read_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_journal_header *jh = super->s_compressed_je;
+	u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
+	u32 h_ofs, last_ofs = 0;
+	u16 len, datalen, last_len = 0;
+	int i, err;
+
+	/* search for most recent commit */
+	for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
+		ofs = seg_ofs + h_ofs;
+		err = __read_je_header(sb, ofs, jh);
+		if (err)
+			continue;
+		if (jh->h_type != cpu_to_be16(JE_COMMIT))
+			continue;
+		err = __read_je_payload(sb, ofs, jh);
+		if (err)
+			continue;
+		len = be16_to_cpu(jh->h_len);
+		datalen = be16_to_cpu(jh->h_datalen);
+		if ((datalen > sizeof(super->s_je_array)) ||
+				(datalen % sizeof(__be64)))
+			continue;
+		last_ofs = h_ofs;
+		last_len = datalen;
+		h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
+	}
+	/* read commit */
+	if (last_ofs == 0)
+		return -ENOENT;
+	ofs = seg_ofs + last_ofs;
+	log_journal("Read commit from %llx\n", ofs);
+	err = __read_je(sb, ofs, jh);
+	BUG_ON(err); /* We should have caught it in the scan loop already */
+	if (err)
+		return err;
+	/* uncompress */
+	unpack(jh, super->s_je_array);
+	super->s_no_je = last_len / sizeof(__be64);
+	/* iterate over array */
+	for (i = 0; i < super->s_no_je; i++) {
+		err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
+		if (err)
+			return err;
+	}
+	super->s_journal_area->a_segno = segno;
+	return 0;
+}
+
+static u64 read_gec(struct super_block *sb, u32 segno)
+{
+	struct logfs_segment_header sh;
+	__be32 crc;
+	int err;
+
+	if (!segno)
+		return 0;
+	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
+	if (err)
+		return 0;
+	crc = logfs_crc32(&sh, sizeof(sh), 4);
+	if (crc != sh.crc) {
+		WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
+		/* Most likely it was just erased */
+		return 0;
+	}
+	return be64_to_cpu(sh.gec);
+}
+
+static int logfs_read_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 gec[LOGFS_JOURNAL_SEGS], max;
+	u32 segno;
+	int i, max_i;
+
+	max = 0;
+	max_i = -1;
+	journal_for_each(i) {
+		segno = super->s_journal_seg[i];
+		gec[i] = read_gec(sb, super->s_journal_seg[i]);
+		if (gec[i] > max) {
+			max = gec[i];
+			max_i = i;
+		}
+	}
+	if (max_i == -1)
+		return -EIO;
+	/* FIXME: Try older segments in case of error */
+	return logfs_read_segment(sb, super->s_journal_seg[max_i]);
+}
+
+/*
+ * First search the current segment (outer loop), then pick the next segment
+ * in the array, skipping any zero entries (inner loop).
+ */
+static void journal_get_free_segment(struct logfs_area *area)
+{
+	struct logfs_super *super = logfs_super(area->a_sb);
+	int i;
+
+	journal_for_each(i) {
+		if (area->a_segno != super->s_journal_seg[i])
+			continue;
+
+		do {
+			i++;
+			if (i == LOGFS_JOURNAL_SEGS)
+				i = 0;
+		} while (!super->s_journal_seg[i]);
+
+		area->a_segno = super->s_journal_seg[i];
+		area->a_erase_count = ++(super->s_journal_ec[i]);
+		log_journal("Journal now at %x (ec %x)\n", area->a_segno,
+				area->a_erase_count);
+		return;
+	}
+	BUG();
+}
+
+static void journal_get_erase_count(struct logfs_area *area)
+{
+	/* erase count is stored globally and incremented in
+	 * journal_get_free_segment() - nothing to do here */
+}
+
+static int journal_erase_segment(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	union {
+		struct logfs_segment_header sh;
+		unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
+	} u;
+	u64 ofs;
+	int err;
+
+	err = logfs_erase_segment(sb, area->a_segno, 1);
+	if (err)
+		return err;
+
+	memset(&u, 0, sizeof(u));
+	u.sh.pad = 0;
+	u.sh.type = SEG_JOURNAL;
+	u.sh.level = 0;
+	u.sh.segno = cpu_to_be32(area->a_segno);
+	u.sh.ec = cpu_to_be32(area->a_erase_count);
+	u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
+	u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
+
+	/* This causes a bug in segment.c.  Not yet. */
+	//logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
+
+	ofs = dev_ofs(sb, area->a_segno, 0);
+	area->a_used_bytes = sizeof(u);
+	logfs_buf_write(area, ofs, &u, sizeof(u));
+	return 0;
+}
+
+static size_t __logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *jh, size_t len, size_t datalen,
+		u16 type, u8 compr)
+{
+	jh->h_len	= cpu_to_be16(len);
+	jh->h_type	= cpu_to_be16(type);
+	jh->h_datalen	= cpu_to_be16(datalen);
+	jh->h_compr	= compr;
+	jh->h_pad[0]	= 'H';
+	jh->h_pad[1]	= 'E';
+	jh->h_pad[2]	= 'A';
+	jh->h_pad[3]	= 'D';
+	jh->h_pad[4]	= 'R';
+	jh->h_crc	= logfs_crc32(jh, len + sizeof(*jh), 4);
+	return ALIGN(len, 16) + sizeof(*jh);
+}
+
+static size_t logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *jh, size_t datalen, u16 type)
+{
+	size_t len = datalen;
+
+	return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
+}
+
+static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
+{
+	return LOGFS_JOURNAL_SEGS * sizeof(__be32);
+}
+
+static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_journal_ec *ec = _ec;
+	int i;
+
+	journal_for_each(i)
+		ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
+	*type = JE_ERASECOUNT;
+	*len = logfs_journal_erasecount_size(super);
+	return ec;
+}
+
+static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
+		size_t ignore2)
+{
+	struct logfs_shadow *shadow = _shadow;
+	struct super_block *sb = (void *)_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	/* consume new space */
+	super->s_free_bytes	  -= shadow->new_len;
+	super->s_used_bytes	  += shadow->new_len;
+	super->s_dirty_used_bytes -= shadow->new_len;
+
+	/* free up old space */
+	super->s_free_bytes	  += shadow->old_len;
+	super->s_used_bytes	  -= shadow->old_len;
+	super->s_dirty_free_bytes -= shadow->old_len;
+
+	logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
+	logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
+
+	log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
+			shadow->ino, shadow->bix, shadow->gc_level,
+			shadow->old_ofs, shadow->new_ofs,
+			shadow->old_len, shadow->new_len);
+	mempool_free(shadow, super->s_shadow_pool);
+}
+
+static void account_shadows(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	struct shadow_tree *tree = &super->s_shadow_tree;
+
+	btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
+	btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
+	btree_grim_visitor32(&tree->segment_map, 0, NULL);
+	tree->no_shadowed_segments = 0;
+
+	if (li->li_block) {
+		/*
+		 * We never actually use the structure, when attached to the
+		 * master inode.  But it is easier to always free it here than
+		 * to have checks in several places elsewhere when allocating
+		 * it.
+		 */
+		li->li_block->ops->free_block(sb, li->li_block);
+	}
+	BUG_ON((s64)li->li_used_bytes < 0);
+}
+
+static void *__logfs_write_anchor(struct super_block *sb, void *_da,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_anchor *da = _da;
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	da->da_height	= li->li_height;
+	da->da_last_ino = cpu_to_be64(super->s_last_ino);
+	da->da_size	= cpu_to_be64(i_size_read(inode));
+	da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		da->da_data[i] = cpu_to_be64(li->li_data[i]);
+	*type = JE_ANCHOR;
+	*len = sizeof(*da);
+	return da;
+}
+
+static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_dynsb *dynsb = _dynsb;
+
+	dynsb->ds_gec		= cpu_to_be64(super->s_gec);
+	dynsb->ds_sweeper	= cpu_to_be64(super->s_sweeper);
+	dynsb->ds_victim_ino	= cpu_to_be64(super->s_victim_ino);
+	dynsb->ds_rename_dir	= cpu_to_be64(super->s_rename_dir);
+	dynsb->ds_rename_pos	= cpu_to_be64(super->s_rename_pos);
+	dynsb->ds_used_bytes	= cpu_to_be64(super->s_used_bytes);
+	dynsb->ds_generation	= cpu_to_be32(super->s_generation);
+	*type = JE_DYNSB;
+	*len = sizeof(*dynsb);
+	return dynsb;
+}
+
+static void write_wbuf(struct super_block *sb, struct logfs_area *area,
+		void *wbuf)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	u64 ofs;
+	pgoff_t index;
+	int page_ofs;
+	struct page *page;
+
+	ofs = dev_ofs(sb, area->a_segno,
+			area->a_used_bytes & ~(super->s_writesize - 1));
+	index = ofs >> PAGE_SHIFT;
+	page_ofs = ofs & (PAGE_SIZE - 1);
+
+	page = find_or_create_page(mapping, index, GFP_NOFS);
+	BUG_ON(!page);
+	memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
+	unlock_page(page);
+}
+
+static void *logfs_write_area(struct super_block *sb, void *_a,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[super->s_sum_index];
+	struct logfs_je_area *a = _a;
+
+	a->vim = VIM_DEFAULT;
+	a->gc_level = super->s_sum_index;
+	a->used_bytes = cpu_to_be32(area->a_used_bytes);
+	a->segno = cpu_to_be32(area->a_segno);
+	if (super->s_writesize > 1)
+		write_wbuf(sb, area, a + 1);
+
+	*type = JE_AREA;
+	*len = sizeof(*a) + super->s_writesize;
+	return a;
+}
+
+static void *logfs_write_commit(struct super_block *sb, void *h,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	*type = JE_COMMIT;
+	*len = super->s_no_je * sizeof(__be64);
+	return super->s_je_array;
+}
+
+static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
+		size_t len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *header = super->s_compressed_je;
+	void *data = header + sizeof(struct logfs_journal_header);
+	ssize_t compr_len, pad_len;
+	u8 compr = COMPR_ZLIB;
+
+	if (len == 0)
+		return logfs_write_header(super, header, 0, type);
+
+	compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
+	if (compr_len < 0 || type == JE_ANCHOR) {
+		memcpy(data, buf, len);
+		compr_len = len;
+		compr = COMPR_NONE;
+	}
+
+	pad_len = ALIGN(compr_len, 16);
+	memset(data + compr_len, 0, pad_len - compr_len);
+
+	return __logfs_write_header(super, header, compr_len, len, type, compr);
+}
+
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
+		int must_pad)
+{
+	u32 writesize = logfs_super(area->a_sb)->s_writesize;
+	s32 ofs;
+	int ret;
+
+	ret = logfs_open_area(area, *bytes);
+	if (ret)
+		return -EAGAIN;
+
+	ofs = area->a_used_bytes;
+	area->a_used_bytes += *bytes;
+
+	if (must_pad) {
+		area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
+		*bytes = area->a_used_bytes - ofs;
+	}
+
+	return dev_ofs(area->a_sb, area->a_segno, ofs);
+}
+
+static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
+		size_t buf_len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	struct logfs_journal_header *jh = super->s_compressed_je;
+	size_t len;
+	int must_pad = 0;
+	s64 ofs;
+
+	len = __logfs_write_je(sb, buf, type, buf_len);
+	if (jh->h_type == cpu_to_be16(JE_COMMIT))
+		must_pad = 1;
+
+	ofs = logfs_get_free_bytes(area, &len, must_pad);
+	if (ofs < 0)
+		return ofs;
+	logfs_buf_write(area, ofs, super->s_compressed_je, len);
+	BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
+	super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
+	return 0;
+}
+
+static int logfs_write_je(struct super_block *sb,
+		void* (*write)(struct super_block *sb, void *scratch,
+			u16 *type, size_t *len))
+{
+	void *buf;
+	size_t len;
+	u16 type;
+
+	buf = write(sb, logfs_super(sb)->s_je, &type, &len);
+	return logfs_write_je_buf(sb, buf, type, len);
+}
+
+int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_obj_alias *oa = super->s_je;
+	int err = 0, fill = super->s_je_fill;
+
+	log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
+			fill, ino, bix, level, child_no, be64_to_cpu(val));
+	oa[fill].ino = cpu_to_be64(ino);
+	oa[fill].bix = cpu_to_be64(bix);
+	oa[fill].val = val;
+	oa[fill].level = (__force u8)level;
+	oa[fill].child_no = cpu_to_be16(child_no);
+	fill++;
+	if (fill >= sb->s_blocksize / sizeof(*oa)) {
+		err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
+		fill = 0;
+	}
+
+	super->s_je_fill = fill;
+	return err;
+}
+
+static int logfs_write_obj_aliases(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int err;
+
+	log_journal("logfs_write_obj_aliases: %d aliases to write\n",
+			super->s_no_object_aliases);
+	super->s_je_fill = 0;
+	err = logfs_write_obj_aliases_pagecache(sb);
+	if (err)
+		return err;
+
+	if (super->s_je_fill)
+		err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
+				super->s_je_fill
+				* sizeof(struct logfs_obj_alias));
+	return err;
+}
+
+/*
+ * Write all journal entries.  The goto logic ensures that all journal entries
+ * are written whenever a new segment is used.  It is ugly and potentially a
+ * bit wasteful, but robustness is more important.  With this we can *always*
+ * erase all journal segments except the one containing the most recent commit.
+ */
+void logfs_write_anchor(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	int i, err;
+
+	if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
+		return;
+	super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
+
+	BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+	mutex_lock(&super->s_journal_mutex);
+
+	/* Do this first or suffer corruption */
+	logfs_sync_segments(sb);
+	account_shadows(sb);
+
+again:
+	super->s_no_je = 0;
+	for_each_area(i) {
+		if (!super->s_area[i]->a_is_open)
+			continue;
+		super->s_sum_index = i;
+		err = logfs_write_je(sb, logfs_write_area);
+		if (err)
+			goto again;
+	}
+	err = logfs_write_obj_aliases(sb);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_erasecount);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, __logfs_write_anchor);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_dynsb);
+	if (err)
+		goto again;
+	/*
+	 * Order is imperative.  First we sync all writes, including the
+	 * non-committed journal writes.  Then we write the final commit and
+	 * sync the current journal segment.
+	 * There is a theoretical bug here.  Syncing the journal segment will
+	 * write a number of journal entries and the final commit.  All these
+	 * are written in a single operation.  If the device layer writes the
+	 * data back-to-front, the commit will precede the other journal
+	 * entries, leaving a race window.
+	 * Two fixes are possible.  Preferred is to fix the device layer to
+	 * ensure writes happen front-to-back.  Alternatively we can insert
+	 * another logfs_sync_area() super->s_devops->sync() combo before
+	 * writing the commit.
+	 */
+	/*
+	 * On another subject, super->s_devops->sync is usually not necessary.
+	 * Unless called from sys_sync or friends, a barrier would suffice.
+	 */
+	super->s_devops->sync(sb);
+	err = logfs_write_je(sb, logfs_write_commit);
+	if (err)
+		goto again;
+	log_journal("Write commit to %llx\n",
+			be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
+	logfs_sync_area(area);
+	BUG_ON(area->a_used_bytes != area->a_written_bytes);
+	super->s_devops->sync(sb);
+
+	mutex_unlock(&super->s_journal_mutex);
+	return;
+}
+
+void do_logfs_journal_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	struct btree_head32 *head = &super->s_reserved_segments;
+	u32 segno, ec;
+	int i, err;
+
+	log_journal("Journal requires wear-leveling.\n");
+	/* Drop old segments */
+	journal_for_each(i)
+		if (super->s_journal_seg[i]) {
+			btree_remove32(head, super->s_journal_seg[i]);
+			logfs_set_segment_unreserved(sb,
+					super->s_journal_seg[i],
+					super->s_journal_ec[i]);
+			super->s_journal_seg[i] = 0;
+			super->s_journal_ec[i] = 0;
+		}
+	/* Get new segments */
+	for (i = 0; i < super->s_no_journal_segs; i++) {
+		segno = get_best_cand(sb, &super->s_reserve_list, &ec);
+		super->s_journal_seg[i] = segno;
+		super->s_journal_ec[i] = ec;
+		logfs_set_segment_reserved(sb, segno);
+		err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
+		BUG_ON(err); /* mempool should prevent this */
+		err = logfs_erase_segment(sb, segno, 1);
+		BUG_ON(err); /* FIXME: remount-ro would be nicer */
+	}
+	/* Manually move journal_area */
+	freeseg(sb, area->a_segno);
+	area->a_segno = super->s_journal_seg[0];
+	area->a_is_open = 0;
+	area->a_used_bytes = 0;
+	/* Write journal */
+	logfs_write_anchor(sb);
+	/* Write superblocks */
+	err = logfs_write_sb(sb);
+	BUG_ON(err);
+}
+
+static const struct logfs_area_ops journal_area_ops = {
+	.get_free_segment	= journal_get_free_segment,
+	.get_erase_count	= journal_get_erase_count,
+	.erase_segment		= journal_erase_segment,
+};
+
+int logfs_init_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
+		+ MAX_JOURNAL_HEADER;
+	int ret = -ENOMEM;
+
+	mutex_init(&super->s_journal_mutex);
+	btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
+
+	super->s_je = kzalloc(bufsize, GFP_KERNEL);
+	if (!super->s_je)
+		return ret;
+
+	super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
+	if (!super->s_compressed_je)
+		return ret;
+
+	super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
+	if (IS_ERR(super->s_master_inode))
+		return PTR_ERR(super->s_master_inode);
+
+	ret = logfs_read_journal(sb);
+	if (ret)
+		return -EIO;
+
+	reserve_sb_and_journal(sb);
+	logfs_calc_free(sb);
+
+	super->s_journal_area->a_ops = &journal_area_ops;
+	return 0;
+}
+
+void logfs_cleanup_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
+
+	kfree(super->s_compressed_je);
+	kfree(super->s_je);
+}
diff --git a/kernel/fs/logfs/logfs.h b/kernel/fs/logfs/logfs.h
new file mode 100644
index 000000000..5f0937609
--- /dev/null
+++ b/kernel/fs/logfs/logfs.h
@@ -0,0 +1,736 @@
+/*
+ * fs/logfs/logfs.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Private header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_H
+#define FS_LOGFS_LOGFS_H
+
+#undef __CHECK_ENDIAN__
+#define __CHECK_ENDIAN__
+
+#include <linux/btree.h>
+#include <linux/crc32.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include "logfs_abi.h"
+
+#define LOGFS_DEBUG_SUPER	(0x0001)
+#define LOGFS_DEBUG_SEGMENT	(0x0002)
+#define LOGFS_DEBUG_JOURNAL	(0x0004)
+#define LOGFS_DEBUG_DIR		(0x0008)
+#define LOGFS_DEBUG_FILE	(0x0010)
+#define LOGFS_DEBUG_INODE	(0x0020)
+#define LOGFS_DEBUG_READWRITE	(0x0040)
+#define LOGFS_DEBUG_GC		(0x0080)
+#define LOGFS_DEBUG_GC_NOISY	(0x0100)
+#define LOGFS_DEBUG_ALIASES	(0x0200)
+#define LOGFS_DEBUG_BLOCKMOVE	(0x0400)
+#define LOGFS_DEBUG_ALL		(0xffffffff)
+
+#define LOGFS_DEBUG		(0x01)
+/*
+ * To enable specific log messages, simply define LOGFS_DEBUG to match any
+ * or all of the above.
+ */
+#ifndef LOGFS_DEBUG
+#define LOGFS_DEBUG		(0)
+#endif
+
+#define log_cond(cond, fmt, arg...) do {	\
+	if (cond)				\
+		printk(KERN_DEBUG fmt, ##arg);	\
+} while (0)
+
+#define log_super(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
+#define log_segment(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
+#define log_journal(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
+#define log_dir(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
+#define log_file(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
+#define log_inode(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
+#define log_readwrite(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
+#define log_gc(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
+#define log_gc_noisy(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
+#define log_aliases(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
+#define log_blockmove(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
+
+#define PG_pre_locked		PG_owner_priv_1
+#define PagePreLocked(page)	test_bit(PG_pre_locked, &(page)->flags)
+#define SetPagePreLocked(page)	set_bit(PG_pre_locked, &(page)->flags)
+#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
+
+/* FIXME: This should really be somewhere in the 64bit area. */
+#define LOGFS_LINK_MAX		(1<<30)
+
+/* Read-only filesystem */
+#define LOGFS_SB_FLAG_RO	0x0001
+#define LOGFS_SB_FLAG_DIRTY	0x0002
+#define LOGFS_SB_FLAG_OBJ_ALIAS	0x0004
+#define LOGFS_SB_FLAG_SHUTDOWN	0x0008
+
+/* Write Control Flags */
+#define WF_LOCK			0x01 /* take write lock */
+#define WF_WRITE		0x02 /* write block */
+#define WF_DELETE		0x04 /* delete old block */
+
+typedef u8 __bitwise level_t;
+typedef u8 __bitwise gc_level_t;
+
+#define LEVEL(level) ((__force level_t)(level))
+#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
+
+#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)),	\
+		(__force level_t)((__force u8)(level) - 1) )
+
+/**
+ * struct logfs_area - area management information
+ *
+ * @a_sb:			the superblock this area belongs to
+ * @a_is_open:			1 if the area is currently open, else 0
+ * @a_segno:			segment number of area
+ * @a_written_bytes:		number of bytes already written back
+ * @a_used_bytes:		number of used bytes
+ * @a_ops:			area operations (either journal or ostore)
+ * @a_erase_count:		erase count
+ * @a_level:			GC level
+ */
+struct logfs_area { /* a segment open for writing */
+	struct super_block *a_sb;
+	int	a_is_open;
+	u32	a_segno;
+	u32	a_written_bytes;
+	u32	a_used_bytes;
+	const struct logfs_area_ops *a_ops;
+	u32	a_erase_count;
+	gc_level_t a_level;
+};
+
+/**
+ * struct logfs_area_ops - area operations
+ *
+ * @get_free_segment:		fill area->ofs with the offset of a free segment
+ * @get_erase_count:		fill area->erase_count (needs area->ofs)
+ * @erase_segment:		erase and setup segment
+ */
+struct logfs_area_ops {
+	void	(*get_free_segment)(struct logfs_area *area);
+	void	(*get_erase_count)(struct logfs_area *area);
+	int	(*erase_segment)(struct logfs_area *area);
+};
+
+struct logfs_super;	/* forward */
+/**
+ * struct logfs_device_ops - device access operations
+ *
+ * @readpage:			read one page (mm page)
+ * @writeseg:			write one segment.  may be a partial segment
+ * @erase:			erase one segment
+ * @read:			read from the device
+ * @erase:			erase part of the device
+ * @can_write_buf:		decide whether wbuf can be written to ofs
+ */
+struct logfs_device_ops {
+	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
+	struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
+	int (*write_sb)(struct super_block *sb, struct page *page);
+	int (*readpage)(void *_sb, struct page *page);
+	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
+	int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
+			int ensure_write);
+	int (*can_write_buf)(struct super_block *sb, u64 ofs);
+	void (*sync)(struct super_block *sb);
+	void (*put_device)(struct logfs_super *s);
+};
+
+/**
+ * struct candidate_list - list of similar candidates
+ */
+struct candidate_list {
+	struct rb_root rb_tree;
+	int count;
+	int maxcount;
+	int sort_by_ec;
+};
+
+/**
+ * struct gc_candidate - "candidate" segment to be garbage collected next
+ *
+ * @list:			list (either free of low)
+ * @segno:			segment number
+ * @valid:			number of valid bytes
+ * @erase_count:		erase count of segment
+ * @dist:			distance from tree root
+ *
+ * Candidates can be on two lists.  The free list contains electees rather
+ * than candidates - segments that no longer contain any valid data.  The
+ * low list contains candidates to be picked for GC.  It should be kept
+ * short.  It is not required to always pick a perfect candidate.  In the
+ * worst case GC will have to move more data than absolutely necessary.
+ */
+struct gc_candidate {
+	struct rb_node rb_node;
+	struct candidate_list *list;
+	u32	segno;
+	u32	valid;
+	u32	erase_count;
+	u8	dist;
+};
+
+/**
+ * struct logfs_journal_entry - temporary structure used during journal scan
+ *
+ * @used:
+ * @version:			normalized version
+ * @len:			length
+ * @offset:			offset
+ */
+struct logfs_journal_entry {
+	int used;
+	s16 version;
+	u16 len;
+	u16 datalen;
+	u64 offset;
+};
+
+enum transaction_state {
+	CREATE_1 = 1,
+	CREATE_2,
+	UNLINK_1,
+	UNLINK_2,
+	CROSS_RENAME_1,
+	CROSS_RENAME_2,
+	TARGET_RENAME_1,
+	TARGET_RENAME_2,
+	TARGET_RENAME_3
+};
+
+/**
+ * struct logfs_transaction - essential fields to support atomic dirops
+ *
+ * @ino:			target inode
+ * @dir:			inode of directory containing dentry
+ * @pos:			pos of dentry in directory
+ */
+struct logfs_transaction {
+	enum transaction_state state;
+	u64	 ino;
+	u64	 dir;
+	u64	 pos;
+};
+
+/**
+ * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
+ * @old_ofs:			offset of old block on medium
+ * @new_ofs:			offset of new block on medium
+ * @ino:			inode number
+ * @bix:			block index
+ * @old_len:			size of old block, including header
+ * @new_len:			size of new block, including header
+ * @level:			block level
+ */
+struct logfs_shadow {
+	u64 old_ofs;
+	u64 new_ofs;
+	u64 ino;
+	u64 bix;
+	int old_len;
+	int new_len;
+	gc_level_t gc_level;
+};
+
+/**
+ * struct shadow_tree
+ * @new:			shadows where old_ofs==0, indexed by new_ofs
+ * @old:			shadows where old_ofs!=0, indexed by old_ofs
+ * @segment_map:		bitfield of segments containing shadows
+ * @no_shadowed_segment:	number of segments containing shadows
+ */
+struct shadow_tree {
+	struct btree_head64 new;
+	struct btree_head64 old;
+	struct btree_head32 segment_map;
+	int no_shadowed_segments;
+};
+
+struct object_alias_item {
+	struct list_head list;
+	__be64 val;
+	int child_no;
+};
+
+/**
+ * struct logfs_block - contains any block state
+ * @type:			indirect block or inode
+ * @full:			number of fully populated children
+ * @partial:			number of partially populated children
+ *
+ * Most blocks are directly represented by page cache pages.  But when a block
+ * becomes dirty, is part of a transaction, contains aliases or is otherwise
+ * special, a struct logfs_block is allocated to track the additional state.
+ * Inodes are very similar to indirect blocks, so they can also get one of
+ * these structures added when appropriate.
+ */
+#define BLOCK_INDIRECT	1	/* Indirect block */
+#define BLOCK_INODE	2	/* Inode */
+struct logfs_block_ops;
+struct logfs_block {
+	struct list_head alias_list;
+	struct list_head item_list;
+	struct super_block *sb;
+	u64 ino;
+	u64 bix;
+	level_t level;
+	struct page *page;
+	struct inode *inode;
+	struct logfs_transaction *ta;
+	unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
+	struct logfs_block_ops *ops;
+	int full;
+	int partial;
+	int reserved_bytes;
+};
+
+typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val);
+struct logfs_block_ops {
+	void	(*write_block)(struct logfs_block *block);
+	void	(*free_block)(struct super_block *sb, struct logfs_block*block);
+	int	(*write_alias)(struct super_block *sb,
+			struct logfs_block *block,
+			write_alias_t *write_one_alias);
+};
+
+#define MAX_JOURNAL_ENTRIES 256
+
+struct logfs_super {
+	struct mtd_info *s_mtd;			/* underlying device */
+	struct block_device *s_bdev;		/* underlying device */
+	const struct logfs_device_ops *s_devops;/* device access */
+	struct inode	*s_master_inode;	/* inode file */
+	struct inode	*s_segfile_inode;	/* segment file */
+	struct inode *s_mapping_inode;		/* device mapping */
+	atomic_t s_pending_writes;		/* outstanting bios */
+	long	 s_flags;
+	mempool_t *s_btree_pool;		/* for btree nodes */
+	mempool_t *s_alias_pool;		/* aliases in segment.c */
+	u64	 s_feature_incompat;
+	u64	 s_feature_ro_compat;
+	u64	 s_feature_compat;
+	u64	 s_feature_flags;
+	u64	 s_sb_ofs[2];
+	struct page *s_erase_page;		/* for dev_bdev.c */
+	/* alias.c fields */
+	struct btree_head32 s_segment_alias;	/* remapped segments */
+	int	 s_no_object_aliases;
+	struct list_head s_object_alias;	/* remapped objects */
+	struct btree_head128 s_object_alias_tree; /* remapped objects */
+	struct mutex s_object_alias_mutex;
+	/* dir.c fields */
+	struct mutex s_dirop_mutex;		/* for creat/unlink/rename */
+	u64	 s_victim_ino;			/* used for atomic dir-ops */
+	u64	 s_rename_dir;			/* source directory ino */
+	u64	 s_rename_pos;			/* position of source dd */
+	/* gc.c fields */
+	long	 s_segsize;			/* size of a segment */
+	int	 s_segshift;			/* log2 of segment size */
+	long	 s_segmask;			/* 1 << s_segshift - 1 */
+	long	 s_no_segs;			/* segments on device */
+	long	 s_no_journal_segs;		/* segments used for journal */
+	long	 s_no_blocks;			/* blocks per segment */
+	long	 s_writesize;			/* minimum write size */
+	int	 s_writeshift;			/* log2 of write size */
+	u64	 s_size;			/* filesystem size */
+	struct logfs_area *s_area[LOGFS_NO_AREAS];	/* open segment array */
+	u64	 s_gec;				/* global erase count */
+	u64	 s_wl_gec_ostore;		/* time of last wl event */
+	u64	 s_wl_gec_journal;		/* time of last wl event */
+	u64	 s_sweeper;			/* current sweeper pos */
+	u8	 s_ifile_levels;		/* max level of ifile */
+	u8	 s_iblock_levels;		/* max level of regular files */
+	u8	 s_data_levels;			/* # of segments to leaf block*/
+	u8	 s_total_levels;		/* sum of above three */
+	struct btree_head32 s_cand_tree;	/* all candidates */
+	struct candidate_list s_free_list;	/* 100% free segments */
+	struct candidate_list s_reserve_list;	/* Bad segment reserve */
+	struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
+	struct candidate_list s_ec_list;	/* wear level candidates */
+	struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
+	/* inode.c fields */
+	u64	 s_last_ino;			/* highest ino used */
+	long	 s_inos_till_wrap;
+	u32	 s_generation;			/* i_generation for new files */
+	struct list_head s_freeing_list;	/* inodes being freed */
+	/* journal.c fields */
+	struct mutex s_journal_mutex;
+	void	*s_je;				/* journal entry to compress */
+	void	*s_compressed_je;		/* block to write to journal */
+	u32	 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
+	u32	 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
+	u64	 s_last_version;
+	struct logfs_area *s_journal_area;	/* open journal segment */
+	__be64	s_je_array[MAX_JOURNAL_ENTRIES];
+	int	s_no_je;
+
+	int	 s_sum_index;			/* for the 12 summaries */
+	struct shadow_tree s_shadow_tree;
+	int	 s_je_fill;			/* index of current je */
+	/* readwrite.c fields */
+	struct mutex s_write_mutex;
+	int	 s_lock_count;
+	mempool_t *s_block_pool;		/* struct logfs_block pool */
+	mempool_t *s_shadow_pool;		/* struct logfs_shadow pool */
+	struct list_head s_writeback_list;	/* writeback pages */
+	/*
+	 * Space accounting:
+	 * - s_used_bytes specifies space used to store valid data objects.
+	 * - s_dirty_used_bytes is space used to store non-committed data
+	 *   objects.  Those objects have already been written themselves,
+	 *   but they don't become valid until all indirect blocks up to the
+	 *   journal have been written as well.
+	 * - s_dirty_free_bytes is space used to store the old copy of a
+	 *   replaced object, as long as the replacement is non-committed.
+	 *   In other words, it is the amount of space freed when all dirty
+	 *   blocks are written back.
+	 * - s_free_bytes is the amount of free space available for any
+	 *   purpose.
+	 * - s_root_reserve is the amount of free space available only to
+	 *   the root user.  Non-privileged users can no longer write once
+	 *   this watermark has been reached.
+	 * - s_speed_reserve is space which remains unused to speed up
+	 *   garbage collection performance.
+	 * - s_dirty_pages is the space reserved for currently dirty pages.
+	 *   It is a pessimistic estimate, so some/most will get freed on
+	 *   page writeback.
+	 *
+	 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
+	 */
+	u64	 s_free_bytes;
+	u64	 s_used_bytes;
+	u64	 s_dirty_free_bytes;
+	u64	 s_dirty_used_bytes;
+	u64	 s_root_reserve;
+	u64	 s_speed_reserve;
+	u64	 s_dirty_pages;
+	/* Bad block handling:
+	 * - s_bad_seg_reserve is a number of segments usually kept
+	 *   free.  When encountering bad blocks, the affected segment's data
+	 *   is _temporarily_ moved to a reserved segment.
+	 * - s_bad_segments is the number of known bad segments.
+	 */
+	u32	 s_bad_seg_reserve;
+	u32	 s_bad_segments;
+};
+
+/**
+ * struct logfs_inode - in-memory inode
+ *
+ * @vfs_inode:			struct inode
+ * @li_data:			data pointers
+ * @li_used_bytes:		number of used bytes
+ * @li_freeing_list:		used to track inodes currently being freed
+ * @li_flags:			inode flags
+ * @li_refcount:		number of internal (GC-induced) references
+ */
+struct logfs_inode {
+	struct inode vfs_inode;
+	u64	li_data[LOGFS_EMBEDDED_FIELDS];
+	u64	li_used_bytes;
+	struct list_head li_freeing_list;
+	struct logfs_block *li_block;
+	u32	li_flags;
+	u8	li_height;
+	int	li_refcount;
+};
+
+#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
+#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
+#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
+
+/* compr.c */
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
+int __init logfs_compr_init(void);
+void logfs_compr_exit(void);
+
+/* dev_bdev.c */
+#ifdef CONFIG_BLOCK
+int logfs_get_sb_bdev(struct logfs_super *s,
+		struct file_system_type *type,
+		const char *devname);
+#else
+static inline int logfs_get_sb_bdev(struct logfs_super *s,
+		struct file_system_type *type,
+		const char *devname)
+{
+	return -ENODEV;
+}
+#endif
+
+/* dev_mtd.c */
+#ifdef CONFIG_MTD
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
+#else
+static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
+{
+	return -ENODEV;
+}
+#endif
+
+/* dir.c */
+extern const struct inode_operations logfs_symlink_iops;
+extern const struct inode_operations logfs_dir_iops;
+extern const struct file_operations logfs_dir_fops;
+int logfs_replay_journal(struct super_block *sb);
+
+/* file.c */
+extern const struct inode_operations logfs_reg_iops;
+extern const struct file_operations logfs_reg_fops;
+extern const struct address_space_operations logfs_reg_aops;
+int logfs_readpage(struct file *file, struct page *page);
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
+
+/* gc.c */
+u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
+void logfs_gc_pass(struct super_block *sb);
+int logfs_check_areas(struct super_block *sb);
+int logfs_init_gc(struct super_block *sb);
+void logfs_cleanup_gc(struct super_block *sb);
+
+/* inode.c */
+extern const struct super_operations logfs_super_operations;
+struct inode *logfs_iget(struct super_block *sb, ino_t ino);
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
+void logfs_safe_iput(struct inode *inode, int cookie);
+struct inode *logfs_new_inode(struct inode *dir, umode_t mode);
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
+struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
+int logfs_init_inode_cache(void);
+void logfs_destroy_inode_cache(void);
+void logfs_set_blocks(struct inode *inode, u64 no);
+/* these logically belong into inode.c but actually reside in readwrite.c */
+int logfs_read_inode(struct inode *inode);
+int __logfs_write_inode(struct inode *inode, struct page *, long flags);
+void logfs_evict_inode(struct inode *inode);
+
+/* journal.c */
+void logfs_write_anchor(struct super_block *sb);
+int logfs_init_journal(struct super_block *sb);
+void logfs_cleanup_journal(struct super_block *sb);
+int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val);
+void do_logfs_journal_wl_pass(struct super_block *sb);
+
+/* readwrite.c */
+pgoff_t logfs_pack_index(u64 bix, level_t level);
+void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+		loff_t bix, long flags, struct shadow_tree *shadow_tree);
+int logfs_readpage_nolock(struct page *page);
+int logfs_write_buf(struct inode *inode, struct page *page, long flags);
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree);
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
+		gc_level_t gc_level, long flags);
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+		gc_level_t gc_level);
+int logfs_truncate(struct inode *inode, u64 size);
+u64 logfs_seek_hole(struct inode *inode, u64 bix);
+u64 logfs_seek_data(struct inode *inode, u64 bix);
+int logfs_open_segfile(struct super_block *sb);
+int logfs_init_rw(struct super_block *sb);
+void logfs_cleanup_rw(struct super_block *sb);
+void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
+void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
+void logfs_write_block(struct logfs_block *block, long flags);
+int logfs_write_obj_aliases_pagecache(struct super_block *sb);
+void logfs_get_segment_entry(struct super_block *sb, u32 segno,
+		struct logfs_segment_entry *se);
+void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
+void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
+		gc_level_t gc_level);
+void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
+void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
+struct logfs_block *__alloc_block(struct super_block *sb,
+		u64 ino, u64 bix, level_t level);
+void __free_block(struct super_block *sb, struct logfs_block *block);
+void btree_write_block(struct logfs_block *block);
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+		__be64 *array, int page_is_empty);
+int logfs_exist_block(struct inode *inode, u64 bix);
+int get_page_reserve(struct inode *inode, struct page *page);
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
+extern struct logfs_block_ops indirect_block_ops;
+
+/* segment.c */
+int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
+int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
+		level_t level);
+int logfs_segment_write(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow);
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
+int logfs_load_object_aliases(struct super_block *sb,
+		struct logfs_obj_alias *oa, int count);
+void move_page_to_btree(struct page *page);
+int logfs_init_mapping(struct super_block *sb);
+void logfs_sync_area(struct logfs_area *area);
+void logfs_sync_segments(struct super_block *sb);
+void freeseg(struct super_block *sb, u32 segno);
+void free_areas(struct super_block *sb);
+
+/* area handling */
+int logfs_init_areas(struct super_block *sb);
+void logfs_cleanup_areas(struct super_block *sb);
+int logfs_open_area(struct logfs_area *area, size_t bytes);
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+		int use_filler);
+
+static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
+		void *buf, size_t len)
+{
+	return __logfs_buf_write(area, ofs, buf, len, 0);
+}
+
+static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
+		void *buf, size_t len)
+{
+	return __logfs_buf_write(area, ofs, buf, len, 1);
+}
+
+/* super.c */
+struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
+void emergency_read_end(struct page *page);
+void logfs_crash_dump(struct super_block *sb);
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
+int logfs_check_ds(struct logfs_disk_super *ds);
+int logfs_write_sb(struct super_block *sb);
+
+static inline struct logfs_super *logfs_super(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct logfs_inode *logfs_inode(struct inode *inode)
+{
+	return container_of(inode, struct logfs_inode, vfs_inode);
+}
+
+static inline void logfs_set_ro(struct super_block *sb)
+{
+	logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
+}
+
+#define LOGFS_BUG(sb) do {					\
+	struct super_block *__sb = sb;				\
+	logfs_crash_dump(__sb);					\
+	logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO;		\
+	BUG();							\
+} while (0)
+
+#define LOGFS_BUG_ON(condition, sb) \
+	do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
+
+static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
+{
+	return cpu_to_be32(crc32(~0, data+skip, len-skip));
+}
+
+static inline u8 logfs_type(struct inode *inode)
+{
+	return (inode->i_mode >> 12) & 15;
+}
+
+static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
+{
+	return pos >> sb->s_blocksize_bits;
+}
+
+static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
+{
+	return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
+}
+
+static inline u32 seg_no(struct super_block *sb, u64 ofs)
+{
+	return ofs >> logfs_super(sb)->s_segshift;
+}
+
+static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
+{
+	return ofs & logfs_super(sb)->s_segmask;
+}
+
+static inline u64 seg_align(struct super_block *sb, u64 ofs)
+{
+	return ofs & ~logfs_super(sb)->s_segmask;
+}
+
+static inline struct logfs_block *logfs_block(struct page *page)
+{
+	return (void *)page->private;
+}
+
+static inline level_t shrink_level(gc_level_t __level)
+{
+	u8 level = (__force u8)__level;
+
+	if (level >= LOGFS_MAX_LEVELS)
+		level -= LOGFS_MAX_LEVELS;
+	return (__force level_t)level;
+}
+
+static inline gc_level_t expand_level(u64 ino, level_t __level)
+{
+	u8 level = (__force u8)__level;
+
+	if (ino == LOGFS_INO_MASTER) {
+		/* ifile has separate areas */
+		level += LOGFS_MAX_LEVELS;
+	}
+	return (__force gc_level_t)level;
+}
+
+static inline int logfs_block_shift(struct super_block *sb, level_t level)
+{
+	level = shrink_level((__force gc_level_t)level);
+	return (__force int)level * (sb->s_blocksize_bits - 3);
+}
+
+static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
+{
+	return ~0ull << logfs_block_shift(sb, level);
+}
+
+static inline struct logfs_area *get_area(struct super_block *sb,
+		gc_level_t gc_level)
+{
+	return logfs_super(sb)->s_area[(__force u8)gc_level];
+}
+
+static inline void logfs_mempool_destroy(mempool_t *pool)
+{
+	if (pool)
+		mempool_destroy(pool);
+}
+
+#endif
diff --git a/kernel/fs/logfs/logfs_abi.h b/kernel/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000..ae960519c
--- /dev/null
+++ b/kernel/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
+/*
+ * fs/logfs/logfs_abi.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Public header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_ABI_H
+#define FS_LOGFS_LOGFS_ABI_H
+
+/* For out-of-kernel compiles */
+#ifndef BUILD_BUG_ON
+#define BUILD_BUG_ON(condition) /**/
+#endif
+
+#define SIZE_CHECK(type, size)					\
+static inline void check_##type(void)				\
+{								\
+	BUILD_BUG_ON(sizeof(struct type) != (size));		\
+}
+
+/*
+ * Throughout the logfs code, we're constantly dealing with blocks at
+ * various positions or offsets.  To remove confusion, we stricly
+ * distinguish between a "position" - the logical position within a
+ * file and an "offset" - the physical location within the device.
+ *
+ * Any usage of the term offset for a logical location or position for
+ * a physical one is a bug and should get fixed.
+ */
+
+/*
+ * Block are allocated in one of several segments depending on their
+ * level.  The following levels are used:
+ *  0	- regular data block
+ *  1	- i1 indirect blocks
+ *  2	- i2 indirect blocks
+ *  3	- i3 indirect blocks
+ *  4	- i4 indirect blocks
+ *  5	- i5 indirect blocks
+ *  6	- ifile data blocks
+ *  7	- ifile i1 indirect blocks
+ *  8	- ifile i2 indirect blocks
+ *  9	- ifile i3 indirect blocks
+ * 10	- ifile i4 indirect blocks
+ * 11	- ifile i5 indirect blocks
+ * Potential levels to be used in the future:
+ * 12	- gc recycled blocks, long-lived data
+ * 13	- replacement blocks, short-lived data
+ *
+ * Levels 1-11 are necessary for robust gc operations and help separate
+ * short-lived metadata from longer-lived file data.  In the future,
+ * file data should get separated into several segments based on simple
+ * heuristics.  Old data recycled during gc operation is expected to be
+ * long-lived.  New data is of uncertain life expectancy.  New data
+ * used to replace older blocks in existing files is expected to be
+ * short-lived.
+ */
+
+
+/* Magic numbers.  64bit for superblock, 32bit for statfs f_type */
+#define LOGFS_MAGIC		0x7a3a8e5cb9d5bf67ull
+#define LOGFS_MAGIC_U32		0xc97e8168u
+
+/*
+ * Various blocksize related macros.  Blocksize is currently fixed at 4KiB.
+ * Sooner or later that should become configurable and the macros replaced
+ * by something superblock-dependent.  Pointers in indirect blocks are and
+ * will remain 64bit.
+ *
+ * LOGFS_BLOCKSIZE	- self-explaining
+ * LOGFS_BLOCK_FACTOR	- number of pointers per indirect block
+ * LOGFS_BLOCK_BITS	- log2 of LOGFS_BLOCK_FACTOR, used for shifts
+ */
+#define LOGFS_BLOCKSIZE		(4096ull)
+#define LOGFS_BLOCK_FACTOR	(LOGFS_BLOCKSIZE / sizeof(u64))
+#define LOGFS_BLOCK_BITS	(9)
+
+/*
+ * Number of blocks at various levels of indirection.  There are 16 direct
+ * block pointers plus a single indirect pointer.
+ */
+#define I0_BLOCKS		(16)
+#define I1_BLOCKS		LOGFS_BLOCK_FACTOR
+#define I2_BLOCKS		(LOGFS_BLOCK_FACTOR * I1_BLOCKS)
+#define I3_BLOCKS		(LOGFS_BLOCK_FACTOR * I2_BLOCKS)
+#define I4_BLOCKS		(LOGFS_BLOCK_FACTOR * I3_BLOCKS)
+#define I5_BLOCKS		(LOGFS_BLOCK_FACTOR * I4_BLOCKS)
+
+#define INDIRECT_INDEX		I0_BLOCKS
+#define LOGFS_EMBEDDED_FIELDS	(I0_BLOCKS + 1)
+
+/*
+ * Sizes at which files require another level of indirection.  Files smaller
+ * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
+ * similar like ext2 fast symlinks.
+ *
+ * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
+ * direct pointers, else through the 1x indirect pointer and so forth.
+ */
+#define LOGFS_EMBEDDED_SIZE	(LOGFS_EMBEDDED_FIELDS * sizeof(u64))
+#define LOGFS_I0_SIZE		(I0_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I1_SIZE		(I1_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I2_SIZE		(I2_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I3_SIZE		(I3_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I4_SIZE		(I4_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I5_SIZE		(I5_BLOCKS * LOGFS_BLOCKSIZE)
+
+/*
+ * Each indirect block pointer must have this flag set, if all block pointers
+ * behind it are set, i.e. there is no hole hidden in the shadow of this
+ * indirect block pointer.
+ */
+#define LOGFS_FULLY_POPULATED (1ULL << 63)
+#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
+
+/*
+ * LogFS needs to separate data into levels.  Each level is defined as the
+ * maximal possible distance from the master inode (inode of the inode file).
+ * Data blocks reside on level 0, 1x indirect block on level 1, etc.
+ * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
+ * This effort is necessary to guarantee garbage collection to always make
+ * progress.
+ *
+ * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
+ * LOGFS_MAX_LEVELS is one more for the actual data level of a file.  It is
+ * the maximal number of levels for one file.
+ * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
+ * effectively stacked on top of each other.
+ */
+#define LOGFS_MAX_INDIRECT	(5)
+#define LOGFS_MAX_LEVELS	(LOGFS_MAX_INDIRECT + 1)
+#define LOGFS_NO_AREAS		(2 * LOGFS_MAX_LEVELS)
+
+/* Maximum size of filenames */
+#define LOGFS_MAX_NAMELEN	(255)
+
+/* Number of segments in the primary journal. */
+#define LOGFS_JOURNAL_SEGS	(16)
+
+/* Maximum number of free/erased/etc. segments in journal entries */
+#define MAX_CACHED_SEGS		(64)
+
+
+/*
+ * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
+ * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
+ * its header,
+ * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
+ * its segment header and the padded space at the end when no further objects
+ * fit.
+ */
+#define LOGFS_OBJECT_HEADERSIZE	(0x1c)
+#define LOGFS_SEGMENT_HEADERSIZE (0x18)
+#define LOGFS_MAX_OBJECTSIZE	(LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
+#define LOGFS_SEGMENT_RESERVE	\
+	(LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
+
+/*
+ * Segment types:
+ * SEG_SUPER	- Data or indirect block
+ * SEG_JOURNAL	- Inode
+ * SEG_OSTORE	- Dentry
+ */
+enum {
+	SEG_SUPER	= 0x01,
+	SEG_JOURNAL	= 0x02,
+	SEG_OSTORE	= 0x03,
+};
+
+/**
+ * struct logfs_segment_header - per-segment header in the ostore
+ *
+ * @crc:			crc32 of header (there is no data)
+ * @pad:			unused, must be 0
+ * @type:			segment type, see above
+ * @level:			GC level for all objects in this segment
+ * @segno:			segment number
+ * @ec:				erase count for this segment
+ * @gec:			global erase count at time of writing
+ */
+struct logfs_segment_header {
+	__be32	crc;
+	__be16	pad;
+	__u8	type;
+	__u8	level;
+	__be32	segno;
+	__be32	ec;
+	__be64	gec;
+};
+
+SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
+
+#define LOGFS_FEATURES_INCOMPAT		(0ull)
+#define LOGFS_FEATURES_RO_COMPAT	(0ull)
+#define LOGFS_FEATURES_COMPAT		(0ull)
+
+/**
+ * struct logfs_disk_super - on-medium superblock
+ *
+ * @ds_magic:			magic number, must equal LOGFS_MAGIC
+ * @ds_crc:			crc32 of structure starting with the next field
+ * @ds_ifile_levels:		maximum number of levels for ifile
+ * @ds_iblock_levels:		maximum number of levels for regular files
+ * @ds_data_levels:		number of separate levels for data
+ * @pad0:			reserved, must be 0
+ * @ds_feature_incompat:	incompatible filesystem features
+ * @ds_feature_ro_compat:	read-only compatible filesystem features
+ * @ds_feature_compat:		compatible filesystem features
+ * @ds_flags:			flags
+ * @ds_segment_shift:		log2 of segment size
+ * @ds_block_shift:		log2 of block size
+ * @ds_write_shift:		log2 of write size
+ * @pad1:			reserved, must be 0
+ * @ds_journal_seg:		segments used by primary journal
+ * @ds_root_reserve:		bytes reserved for the superuser
+ * @ds_speed_reserve:		bytes reserved to speed up GC
+ * @ds_bad_seg_reserve:		number of segments reserved to handle bad blocks
+ * @pad2:			reserved, must be 0
+ * @pad3:			reserved, must be 0
+ *
+ * Contains only read-only fields.  Read-write fields like the amount of used
+ * space is tracked in the dynamic superblock, which is stored in the journal.
+ */
+struct logfs_disk_super {
+	struct logfs_segment_header ds_sh;
+	__be64	ds_magic;
+
+	__be32	ds_crc;
+	__u8	ds_ifile_levels;
+	__u8	ds_iblock_levels;
+	__u8	ds_data_levels;
+	__u8	ds_segment_shift;
+	__u8	ds_block_shift;
+	__u8	ds_write_shift;
+	__u8	pad0[6];
+
+	__be64	ds_filesystem_size;
+	__be32	ds_segment_size;
+	__be32  ds_bad_seg_reserve;
+
+	__be64	ds_feature_incompat;
+	__be64	ds_feature_ro_compat;
+
+	__be64	ds_feature_compat;
+	__be64	ds_feature_flags;
+
+	__be64	ds_root_reserve;
+	__be64  ds_speed_reserve;
+
+	__be32	ds_journal_seg[LOGFS_JOURNAL_SEGS];
+
+	__be64	ds_super_ofs[2];
+	__be64	pad3[8];
+};
+
+SIZE_CHECK(logfs_disk_super, 256);
+
+/*
+ * Object types:
+ * OBJ_BLOCK	- Data or indirect block
+ * OBJ_INODE	- Inode
+ * OBJ_DENTRY	- Dentry
+ */
+enum {
+	OBJ_BLOCK	= 0x04,
+	OBJ_INODE	= 0x05,
+	OBJ_DENTRY	= 0x06,
+};
+
+/**
+ * struct logfs_object_header - per-object header in the ostore
+ *
+ * @crc:			crc32 of header, excluding data_crc
+ * @len:			length of data
+ * @type:			object type, see above
+ * @compr:			compression type
+ * @ino:			inode number
+ * @bix:			block index
+ * @data_crc:			crc32 of payload
+ */
+struct logfs_object_header {
+	__be32	crc;
+	__be16	len;
+	__u8	type;
+	__u8	compr;
+	__be64	ino;
+	__be64	bix;
+	__be32	data_crc;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
+
+/*
+ * Reserved inode numbers:
+ * LOGFS_INO_MASTER	- master inode (for inode file)
+ * LOGFS_INO_ROOT	- root directory
+ * LOGFS_INO_SEGFILE	- per-segment used bytes and erase count
+ */
+enum {
+	LOGFS_INO_MAPPING	= 0x00,
+	LOGFS_INO_MASTER	= 0x01,
+	LOGFS_INO_ROOT		= 0x02,
+	LOGFS_INO_SEGFILE	= 0x03,
+	LOGFS_RESERVED_INOS	= 0x10,
+};
+
+/*
+ * Inode flags.  High bits should never be written to the medium.  They are
+ * reserved for in-memory usage.
+ * Low bits should either remain in sync with the corresponding FS_*_FL or
+ * reuse slots that obviously don't make sense for logfs.
+ *
+ * LOGFS_IF_DIRTY	Inode must be written back
+ * LOGFS_IF_ZOMBIE	Inode has been deleted
+ * LOGFS_IF_STILLBORN	-ENOSPC happened when creating inode
+ */
+#define LOGFS_IF_COMPRESSED	0x00000004 /* == FS_COMPR_FL */
+#define LOGFS_IF_DIRTY		0x20000000
+#define LOGFS_IF_ZOMBIE		0x40000000
+#define LOGFS_IF_STILLBORN	0x80000000
+
+/* Flags available to chattr */
+#define LOGFS_FL_USER_VISIBLE	(LOGFS_IF_COMPRESSED)
+#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
+/* Flags inherited from parent directory on file/directory creation */
+#define LOGFS_FL_INHERITED	(LOGFS_IF_COMPRESSED)
+
+/**
+ * struct logfs_disk_inode - on-medium inode
+ *
+ * @di_mode:			file mode
+ * @di_pad:			reserved, must be 0
+ * @di_flags:			inode flags, see above
+ * @di_uid:			user id
+ * @di_gid:			group id
+ * @di_ctime:			change time
+ * @di_mtime:			modify time
+ * @di_refcount:		reference count (aka nlink or link count)
+ * @di_generation:		inode generation, for nfs
+ * @di_used_bytes:		number of bytes used
+ * @di_size:			file size
+ * @di_data:			data pointers
+ */
+struct logfs_disk_inode {
+	__be16	di_mode;
+	__u8	di_height;
+	__u8	di_pad;
+	__be32	di_flags;
+	__be32	di_uid;
+	__be32	di_gid;
+
+	__be64	di_ctime;
+	__be64	di_mtime;
+
+	__be64	di_atime;
+	__be32	di_refcount;
+	__be32	di_generation;
+
+	__be64	di_used_bytes;
+	__be64	di_size;
+
+	__be64	di_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_disk_inode, 200);
+
+#define INODE_POINTER_OFS \
+	(offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
+#define INODE_USED_OFS \
+	(offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
+#define INODE_SIZE_OFS \
+	(offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
+#define INODE_HEIGHT_OFS	(0)
+
+/**
+ * struct logfs_disk_dentry - on-medium dentry structure
+ *
+ * @ino:			inode number
+ * @namelen:			length of file name
+ * @type:			file type, identical to bits 12..15 of mode
+ * @name:			file name
+ */
+/* FIXME: add 6 bytes of padding to remove the __packed */
+struct logfs_disk_dentry {
+	__be64	ino;
+	__be16	namelen;
+	__u8	type;
+	__u8	name[LOGFS_MAX_NAMELEN];
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_disk_dentry, 266);
+
+#define RESERVED		0xffffffff
+#define BADSEG			0xffffffff
+/**
+ * struct logfs_segment_entry - segment file entry
+ *
+ * @ec_level:			erase count and level
+ * @valid:			number of valid bytes
+ *
+ * Segment file contains one entry for every segment.  ec_level contains the
+ * erasecount in the upper 28 bits and the level in the lower 4 bits.  An
+ * ec_level of BADSEG (-1) identifies bad segments.  valid contains the number
+ * of valid bytes or RESERVED (-1 again) if the segment is used for either the
+ * superblock or the journal, or when the segment is bad.
+ */
+struct logfs_segment_entry {
+	__be32	ec_level;
+	__be32	valid;
+};
+
+SIZE_CHECK(logfs_segment_entry, 8);
+
+/**
+ * struct logfs_journal_header - header for journal entries (JEs)
+ *
+ * @h_crc:			crc32 of journal entry
+ * @h_len:			length of compressed journal entry,
+ *				not including header
+ * @h_datalen:			length of uncompressed data
+ * @h_type:			JE type
+ * @h_compr:			compression type
+ * @h_pad:			reserved
+ */
+struct logfs_journal_header {
+	__be32	h_crc;
+	__be16	h_len;
+	__be16	h_datalen;
+	__be16	h_type;
+	__u8	h_compr;
+	__u8	h_pad[5];
+};
+
+SIZE_CHECK(logfs_journal_header, 16);
+
+/*
+ * Life expectency of data.
+ * VIM_DEFAULT		- default vim
+ * VIM_SEGFILE		- for segment file only - very short-living
+ * VIM_GC		- GC'd data - likely long-living
+ */
+enum logfs_vim {
+	VIM_DEFAULT	= 0,
+	VIM_SEGFILE	= 1,
+};
+
+/**
+ * struct logfs_je_area - wbuf header
+ *
+ * @segno:			segment number of area
+ * @used_bytes:			number of bytes already used
+ * @gc_level:			GC level
+ * @vim:			life expectancy of data
+ *
+ * "Areas" are segments currently being used for writing.  There is at least
+ * one area per GC level.  Several may be used to separate long-living from
+ * short-living data.  If an area with unknown vim is encountered, it can
+ * simply be closed.
+ * The write buffer immediately follow this header.
+ */
+struct logfs_je_area {
+	__be32	segno;
+	__be32	used_bytes;
+	__u8	gc_level;
+	__u8	vim;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_je_area, 10);
+
+#define MAX_JOURNAL_HEADER \
+	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
+
+/**
+ * struct logfs_je_dynsb - dynamic superblock
+ *
+ * @ds_gec:			global erase count
+ * @ds_sweeper:			current position of GC "sweeper"
+ * @ds_rename_dir:		source directory ino (see dir.c documentation)
+ * @ds_rename_pos:		position of source dd (see dir.c documentation)
+ * @ds_victim_ino:		victims of incomplete dir operation (see dir.c)
+ * @ds_victim_ino:		parent inode of victim (see dir.c)
+ * @ds_used_bytes:		number of used bytes
+ */
+struct logfs_je_dynsb {
+	__be64	ds_gec;
+	__be64	ds_sweeper;
+
+	__be64	ds_rename_dir;
+	__be64	ds_rename_pos;
+
+	__be64	ds_victim_ino;
+	__be64	ds_victim_parent; /* XXX */
+
+	__be64	ds_used_bytes;
+	__be32	ds_generation;
+	__be32	pad;
+};
+
+SIZE_CHECK(logfs_je_dynsb, 64);
+
+/**
+ * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
+ *
+ * @da_size:			size of inode file
+ * @da_last_ino:		last created inode
+ * @da_used_bytes:		number of bytes used
+ * @da_data:			data pointers
+ */
+struct logfs_je_anchor {
+	__be64	da_size;
+	__be64	da_last_ino;
+
+	__be64	da_used_bytes;
+	u8	da_height;
+	u8	pad[7];
+
+	__be64	da_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_je_anchor, 168);
+
+/**
+ * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
+ *
+ * @so_segment:			segments used for 2nd journal
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_spillout {
+	__be64	so_segment[0];
+};
+
+SIZE_CHECK(logfs_je_spillout, 0);
+
+/**
+ * struct logfs_je_journal_ec - erase counts for all journal segments
+ *
+ * @ec:				erase count
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_journal_ec {
+	__be32	ec[0];
+};
+
+SIZE_CHECK(logfs_je_journal_ec, 0);
+
+/**
+ * struct logfs_je_free_segments - list of free segmetns with erase count
+ */
+struct logfs_je_free_segments {
+	__be32	segno;
+	__be32	ec;
+};
+
+SIZE_CHECK(logfs_je_free_segments, 8);
+
+/**
+ * struct logfs_seg_alias - list of segment aliases
+ */
+struct logfs_seg_alias {
+	__be32	old_segno;
+	__be32	new_segno;
+};
+
+SIZE_CHECK(logfs_seg_alias, 8);
+
+/**
+ * struct logfs_obj_alias - list of object aliases
+ */
+struct logfs_obj_alias {
+	__be64	ino;
+	__be64	bix;
+	__be64	val;
+	u8	level;
+	u8	pad[5];
+	__be16	child_no;
+};
+
+SIZE_CHECK(logfs_obj_alias, 32);
+
+/**
+ * Compression types.
+ *
+ * COMPR_NONE	- uncompressed
+ * COMPR_ZLIB	- compressed with zlib
+ */
+enum {
+	COMPR_NONE	= 0,
+	COMPR_ZLIB	= 1,
+};
+
+/*
+ * Journal entries come in groups of 16.  First group contains unique
+ * entries, next groups contain one entry per level
+ *
+ * JE_FIRST	- smallest possible journal entry number
+ *
+ * JEG_BASE	- base group, containing unique entries
+ * JE_COMMIT	- commit entry, validates all previous entries
+ * JE_DYNSB	- dynamic superblock, anything that ought to be in the
+ *		  superblock but cannot because it is read-write data
+ * JE_ANCHOR	- anchor aka master inode aka inode file's inode
+ * JE_ERASECOUNT  erasecounts for all journal segments
+ * JE_SPILLOUT	- unused
+ * JE_SEG_ALIAS	- aliases segments
+ * JE_AREA	- area description
+ *
+ * JE_LAST	- largest possible journal entry number
+ */
+enum {
+	JE_FIRST	= 0x01,
+
+	JEG_BASE	= 0x00,
+	JE_COMMIT	= 0x02,
+	JE_DYNSB	= 0x03,
+	JE_ANCHOR	= 0x04,
+	JE_ERASECOUNT	= 0x05,
+	JE_SPILLOUT	= 0x06,
+	JE_OBJ_ALIAS	= 0x0d,
+	JE_AREA		= 0x0e,
+
+	JE_LAST		= 0x0e,
+};
+
+#endif
diff --git a/kernel/fs/logfs/readwrite.c b/kernel/fs/logfs/readwrite.c
new file mode 100644
index 000000000..380d86e1a
--- /dev/null
+++ b/kernel/fs/logfs/readwrite.c
@@ -0,0 +1,2298 @@
+/*
+ * fs/logfs/readwrite.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ *
+ * Actually contains five sets of very similar functions:
+ * read		read blocks from a file
+ * seek_hole	find next hole
+ * seek_data	find next data block
+ * valid	check whether a block still belongs to a file
+ * write	write blocks to a file
+ * delete	delete a block (for directories and ifile)
+ * rewrite	move existing blocks of a file to a new location (gc helper)
+ * truncate	truncate a file
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+static u64 adjust_bix(u64 bix, level_t level)
+{
+	switch (level) {
+	case 0:
+		return bix;
+	case LEVEL(1):
+		return max_t(u64, bix, I0_BLOCKS);
+	case LEVEL(2):
+		return max_t(u64, bix, I1_BLOCKS);
+	case LEVEL(3):
+		return max_t(u64, bix, I2_BLOCKS);
+	case LEVEL(4):
+		return max_t(u64, bix, I3_BLOCKS);
+	case LEVEL(5):
+		return max_t(u64, bix, I4_BLOCKS);
+	default:
+		WARN_ON(1);
+		return bix;
+	}
+}
+
+static inline u64 maxbix(u8 height)
+{
+	return 1ULL << (LOGFS_BLOCK_BITS * height);
+}
+
+/**
+ * The inode address space is cut in two halves.  Lower half belongs to data
+ * pages, upper half to indirect blocks.  If the high bit (INDIRECT_BIT) is
+ * set, the actual block index (bix) and level can be derived from the page
+ * index.
+ *
+ * The lowest three bits of the block index are set to 0 after packing and
+ * unpacking.  Since the lowest n bits (9 for 4KiB blocksize) are ignored
+ * anyway this is harmless.
+ */
+#define ARCH_SHIFT	(BITS_PER_LONG - 32)
+#define INDIRECT_BIT	(0x80000000UL << ARCH_SHIFT)
+#define LEVEL_SHIFT	(28 + ARCH_SHIFT)
+static inline pgoff_t first_indirect_block(void)
+{
+	return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
+}
+
+pgoff_t logfs_pack_index(u64 bix, level_t level)
+{
+	pgoff_t index;
+
+	BUG_ON(bix >= INDIRECT_BIT);
+	if (level == 0)
+		return bix;
+
+	index  = INDIRECT_BIT;
+	index |= (__force long)level << LEVEL_SHIFT;
+	index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
+	return index;
+}
+
+void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
+{
+	u8 __level;
+
+	if (!(index & INDIRECT_BIT)) {
+		*bix = index;
+		*level = 0;
+		return;
+	}
+
+	__level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
+	*level = LEVEL(__level);
+	*bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
+	*bix = adjust_bix(*bix, *level);
+	return;
+}
+#undef ARCH_SHIFT
+#undef INDIRECT_BIT
+#undef LEVEL_SHIFT
+
+/*
+ * Time is stored as nanoseconds since the epoch.
+ */
+static struct timespec be64_to_timespec(__be64 betime)
+{
+	return ns_to_timespec(be64_to_cpu(betime));
+}
+
+static __be64 timespec_to_be64(struct timespec tsp)
+{
+	return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
+}
+
+static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	inode->i_mode	= be16_to_cpu(di->di_mode);
+	li->li_height	= di->di_height;
+	li->li_flags	= be32_to_cpu(di->di_flags);
+	i_uid_write(inode, be32_to_cpu(di->di_uid));
+	i_gid_write(inode, be32_to_cpu(di->di_gid));
+	inode->i_size	= be64_to_cpu(di->di_size);
+	logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
+	inode->i_atime	= be64_to_timespec(di->di_atime);
+	inode->i_ctime	= be64_to_timespec(di->di_ctime);
+	inode->i_mtime	= be64_to_timespec(di->di_mtime);
+	set_nlink(inode, be32_to_cpu(di->di_refcount));
+	inode->i_generation = be32_to_cpu(di->di_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFSOCK:	/* fall through */
+	case S_IFBLK:	/* fall through */
+	case S_IFCHR:	/* fall through */
+	case S_IFIFO:
+		inode->i_rdev = be64_to_cpu(di->di_data[0]);
+		break;
+	case S_IFDIR:	/* fall through */
+	case S_IFREG:	/* fall through */
+	case S_IFLNK:
+		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+			li->li_data[i] = be64_to_cpu(di->di_data[i]);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	di->di_mode	= cpu_to_be16(inode->i_mode);
+	di->di_height	= li->li_height;
+	di->di_pad	= 0;
+	di->di_flags	= cpu_to_be32(li->li_flags);
+	di->di_uid	= cpu_to_be32(i_uid_read(inode));
+	di->di_gid	= cpu_to_be32(i_gid_read(inode));
+	di->di_size	= cpu_to_be64(i_size_read(inode));
+	di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
+	di->di_atime	= timespec_to_be64(inode->i_atime);
+	di->di_ctime	= timespec_to_be64(inode->i_ctime);
+	di->di_mtime	= timespec_to_be64(inode->i_mtime);
+	di->di_refcount	= cpu_to_be32(inode->i_nlink);
+	di->di_generation = cpu_to_be32(inode->i_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFSOCK:	/* fall through */
+	case S_IFBLK:	/* fall through */
+	case S_IFCHR:	/* fall through */
+	case S_IFIFO:
+		di->di_data[0] = cpu_to_be64(inode->i_rdev);
+		break;
+	case S_IFDIR:	/* fall through */
+	case S_IFREG:	/* fall through */
+	case S_IFLNK:
+		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+			di->di_data[i] = cpu_to_be64(li->li_data[i]);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void __logfs_set_blocks(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_inode *li = logfs_inode(inode);
+
+	inode->i_blocks = ULONG_MAX;
+	if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
+		inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
+}
+
+void logfs_set_blocks(struct inode *inode, u64 bytes)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	li->li_used_bytes = bytes;
+	__logfs_set_blocks(inode);
+}
+
+static void prelock_page(struct super_block *sb, struct page *page, int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	BUG_ON(!PageLocked(page));
+	if (lock) {
+		BUG_ON(PagePreLocked(page));
+		SetPagePreLocked(page);
+	} else {
+		/* We are in GC path. */
+		if (PagePreLocked(page))
+			super->s_lock_count++;
+		else
+			SetPagePreLocked(page);
+	}
+}
+
+static void preunlock_page(struct super_block *sb, struct page *page, int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	BUG_ON(!PageLocked(page));
+	if (lock)
+		ClearPagePreLocked(page);
+	else {
+		/* We are in GC path. */
+		BUG_ON(!PagePreLocked(page));
+		if (super->s_lock_count)
+			super->s_lock_count--;
+		else
+			ClearPagePreLocked(page);
+	}
+}
+
+/*
+ * Logfs is prone to an AB-BA deadlock where one task tries to acquire
+ * s_write_mutex with a locked page and GC tries to get that page while holding
+ * s_write_mutex.
+ * To solve this issue logfs will ignore the page lock iff the page in question
+ * is waiting for s_write_mutex.  We annotate this fact by setting PG_pre_locked
+ * in addition to PG_locked.
+ */
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (page)
+		prelock_page(sb, page, lock);
+
+	if (lock) {
+		mutex_lock(&super->s_write_mutex);
+		logfs_gc_pass(sb);
+		/* FIXME: We also have to check for shadowed space
+		 * and mempool fill grade */
+	}
+}
+
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (page)
+		preunlock_page(sb, page, lock);
+	/* Order matters - we must clear PG_pre_locked before releasing
+	 * s_write_mutex or we could race against another task. */
+	if (lock)
+		mutex_unlock(&super->s_write_mutex);
+}
+
+static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
+		level_t level)
+{
+	return find_or_create_page(inode->i_mapping,
+			logfs_pack_index(bix, level), GFP_NOFS);
+}
+
+static void logfs_put_read_page(struct page *page)
+{
+	unlock_page(page);
+	page_cache_release(page);
+}
+
+static void logfs_lock_write_page(struct page *page)
+{
+	int loop = 0;
+
+	while (unlikely(!trylock_page(page))) {
+		if (loop++ > 0x1000) {
+			/* Has been observed once so far... */
+			printk(KERN_ERR "stack at %p\n", &loop);
+			BUG();
+		}
+		if (PagePreLocked(page)) {
+			/* Holder of page lock is waiting for us, it
+			 * is safe to use this page. */
+			break;
+		}
+		/* Some other process has this page locked and has
+		 * nothing to do with us.  Wait for it to finish.
+		 */
+		schedule();
+	}
+	BUG_ON(!PageLocked(page));
+}
+
+static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
+		level_t level)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index = logfs_pack_index(bix, level);
+	struct page *page;
+	int err;
+
+repeat:
+	page = find_get_page(mapping, index);
+	if (!page) {
+		page = __page_cache_alloc(GFP_NOFS);
+		if (!page)
+			return NULL;
+		err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
+		if (unlikely(err)) {
+			page_cache_release(page);
+			if (err == -EEXIST)
+				goto repeat;
+			return NULL;
+		}
+	} else logfs_lock_write_page(page);
+	BUG_ON(!PageLocked(page));
+	return page;
+}
+
+static void logfs_unlock_write_page(struct page *page)
+{
+	if (!PagePreLocked(page))
+		unlock_page(page);
+}
+
+static void logfs_put_write_page(struct page *page)
+{
+	logfs_unlock_write_page(page);
+	page_cache_release(page);
+}
+
+static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
+		int rw)
+{
+	if (rw == READ)
+		return logfs_get_read_page(inode, bix, level);
+	else
+		return logfs_get_write_page(inode, bix, level);
+}
+
+static void logfs_put_page(struct page *page, int rw)
+{
+	if (rw == READ)
+		logfs_put_read_page(page);
+	else
+		logfs_put_write_page(page);
+}
+
+static unsigned long __get_bits(u64 val, int skip, int no)
+{
+	u64 ret = val;
+
+	ret >>= skip * no;
+	ret <<= 64 - no;
+	ret >>= 64 - no;
+	return ret;
+}
+
+static unsigned long get_bits(u64 val, level_t skip)
+{
+	return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
+}
+
+static inline void init_shadow_tree(struct super_block *sb,
+		struct shadow_tree *tree)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_init_mempool64(&tree->new, super->s_btree_pool);
+	btree_init_mempool64(&tree->old, super->s_btree_pool);
+}
+
+static void indirect_write_block(struct logfs_block *block)
+{
+	struct page *page;
+	struct inode *inode;
+	int ret;
+
+	page = block->page;
+	inode = page->mapping->host;
+	logfs_lock_write_page(page);
+	ret = logfs_write_buf(inode, page, 0);
+	logfs_unlock_write_page(page);
+	/*
+	 * This needs some rework.  Unless you want your filesystem to run
+	 * completely synchronously (you don't), the filesystem will always
+	 * report writes as 'successful' before the actual work has been
+	 * done.  The actual work gets done here and this is where any errors
+	 * will show up.  And there isn't much we can do about it, really.
+	 *
+	 * Some attempts to fix the errors (move from bad blocks, retry io,...)
+	 * have already been done, so anything left should be either a broken
+	 * device or a bug somewhere in logfs itself.  Being relatively new,
+	 * the odds currently favor a bug, so for now the line below isn't
+	 * entirely tasteles.
+	 */
+	BUG_ON(ret);
+}
+
+static void inode_write_block(struct logfs_block *block)
+{
+	struct inode *inode;
+	int ret;
+
+	inode = block->inode;
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		logfs_write_anchor(inode->i_sb);
+	else {
+		ret = __logfs_write_inode(inode, NULL, 0);
+		/* see indirect_write_block comment */
+		BUG_ON(ret);
+	}
+}
+
+/*
+ * This silences a false, yet annoying gcc warning.  I hate it when my editor
+ * jumps into bitops.h each time I recompile this file.
+ * TODO: Complain to gcc folks about this and upgrade compiler.
+ */
+static unsigned long fnb(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
+{
+	return find_next_bit(addr, size, offset);
+}
+
+static __be64 inode_val0(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 val;
+
+	/*
+	 * Explicit shifting generates good code, but must match the format
+	 * of the structure.  Add some paranoia just in case.
+	 */
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
+
+	val =	(u64)inode->i_mode << 48 |
+		(u64)li->li_height << 40 |
+		(u64)li->li_flags;
+	return cpu_to_be64(val);
+}
+
+static int inode_write_alias(struct super_block *sb,
+		struct logfs_block *block, write_alias_t *write_one_alias)
+{
+	struct inode *inode = block->inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned long pos;
+	u64 ino , bix;
+	__be64 val;
+	level_t level;
+	int err;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
+			return 0;
+
+		switch (pos) {
+		case INODE_HEIGHT_OFS:
+			val = inode_val0(inode);
+			break;
+		case INODE_USED_OFS:
+			val = cpu_to_be64(li->li_used_bytes);
+			break;
+		case INODE_SIZE_OFS:
+			val = cpu_to_be64(i_size_read(inode));
+			break;
+		case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
+			val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
+			break;
+		default:
+			BUG();
+		}
+
+		ino = LOGFS_INO_MASTER;
+		bix = inode->i_ino;
+		level = LEVEL(0);
+		err = write_one_alias(sb, ino, bix, level, pos, val);
+		if (err)
+			return err;
+	}
+}
+
+static int indirect_write_alias(struct super_block *sb,
+		struct logfs_block *block, write_alias_t *write_one_alias)
+{
+	unsigned long pos;
+	struct page *page = block->page;
+	u64 ino , bix;
+	__be64 *child, val;
+	level_t level;
+	int err;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_BLOCK_FACTOR)
+			return 0;
+
+		ino = page->mapping->host->i_ino;
+		logfs_unpack_index(page->index, &bix, &level);
+		child = kmap_atomic(page);
+		val = child[pos];
+		kunmap_atomic(child);
+		err = write_one_alias(sb, ino, bix, level, pos, val);
+		if (err)
+			return err;
+	}
+}
+
+int logfs_write_obj_aliases_pagecache(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	int err;
+
+	list_for_each_entry(block, &super->s_object_alias, alias_list) {
+		err = block->ops->write_alias(sb, block, write_alias_journal);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+void __free_block(struct super_block *sb, struct logfs_block *block)
+{
+	BUG_ON(!list_empty(&block->item_list));
+	list_del(&block->alias_list);
+	mempool_free(block, logfs_super(sb)->s_block_pool);
+}
+
+static void inode_free_block(struct super_block *sb, struct logfs_block *block)
+{
+	struct inode *inode = block->inode;
+
+	logfs_inode(inode)->li_block = NULL;
+	__free_block(sb, block);
+}
+
+static void indirect_free_block(struct super_block *sb,
+		struct logfs_block *block)
+{
+	struct page *page = block->page;
+
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		page_cache_release(page);
+		set_page_private(page, 0);
+	}
+	__free_block(sb, block);
+}
+
+
+static struct logfs_block_ops inode_block_ops = {
+	.write_block = inode_write_block,
+	.free_block = inode_free_block,
+	.write_alias = inode_write_alias,
+};
+
+struct logfs_block_ops indirect_block_ops = {
+	.write_block = indirect_write_block,
+	.free_block = indirect_free_block,
+	.write_alias = indirect_write_alias,
+};
+
+struct logfs_block *__alloc_block(struct super_block *sb,
+		u64 ino, u64 bix, level_t level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+
+	block = mempool_alloc(super->s_block_pool, GFP_NOFS);
+	memset(block, 0, sizeof(*block));
+	INIT_LIST_HEAD(&block->alias_list);
+	INIT_LIST_HEAD(&block->item_list);
+	block->sb = sb;
+	block->ino = ino;
+	block->bix = bix;
+	block->level = level;
+	return block;
+}
+
+static void alloc_inode_block(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block;
+
+	if (li->li_block)
+		return;
+
+	block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
+	block->inode = inode;
+	li->li_block = block;
+	block->ops = &inode_block_ops;
+}
+
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+		__be64 *array, int page_is_empty)
+{
+	u64 ptr;
+	int i, start;
+
+	block->partial = 0;
+	block->full = 0;
+	start = 0;
+	if (page->index < first_indirect_block()) {
+		/* Counters are pointless on level 0 */
+		return;
+	}
+	if (page->index == first_indirect_block()) {
+		/* Skip unused pointers */
+		start = I0_BLOCKS;
+		block->full = I0_BLOCKS;
+	}
+	if (!page_is_empty) {
+		for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
+			ptr = be64_to_cpu(array[i]);
+			if (ptr)
+				block->partial++;
+			if (ptr & LOGFS_FULLY_POPULATED)
+				block->full++;
+		}
+	}
+}
+
+static void alloc_data_block(struct inode *inode, struct page *page)
+{
+	struct logfs_block *block;
+	u64 bix;
+	level_t level;
+
+	if (PagePrivate(page))
+		return;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
+	block->page = page;
+
+	SetPagePrivate(page);
+	page_cache_get(page);
+	set_page_private(page, (unsigned long) block);
+
+	block->ops = &indirect_block_ops;
+}
+
+static void alloc_indirect_block(struct inode *inode, struct page *page,
+		int page_is_empty)
+{
+	struct logfs_block *block;
+	__be64 *array;
+
+	if (PagePrivate(page))
+		return;
+
+	alloc_data_block(inode, page);
+
+	block = logfs_block(page);
+	array = kmap_atomic(page);
+	initialize_block_counters(page, block, array, page_is_empty);
+	kunmap_atomic(array);
+}
+
+static void block_set_pointer(struct page *page, int index, u64 ptr)
+{
+	struct logfs_block *block = logfs_block(page);
+	__be64 *array;
+	u64 oldptr;
+
+	BUG_ON(!block);
+	array = kmap_atomic(page);
+	oldptr = be64_to_cpu(array[index]);
+	array[index] = cpu_to_be64(ptr);
+	kunmap_atomic(array);
+	SetPageUptodate(page);
+
+	block->full += !!(ptr & LOGFS_FULLY_POPULATED)
+		- !!(oldptr & LOGFS_FULLY_POPULATED);
+	block->partial += !!ptr - !!oldptr;
+}
+
+static u64 block_get_pointer(struct page *page, int index)
+{
+	__be64 *block;
+	u64 ptr;
+
+	block = kmap_atomic(page);
+	ptr = be64_to_cpu(block[index]);
+	kunmap_atomic(block);
+	return ptr;
+}
+
+static int logfs_read_empty(struct page *page)
+{
+	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	return 0;
+}
+
+static int logfs_read_direct(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	pgoff_t index = page->index;
+	u64 block;
+
+	block = li->li_data[index];
+	if (!block)
+		return logfs_read_empty(page);
+
+	return logfs_segment_read(inode, page, block, index, 0);
+}
+
+static int logfs_read_loop(struct inode *inode, struct page *page,
+		int rw_context)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bix, bofs = li->li_data[INDIRECT_INDEX];
+	level_t level, target_level;
+	int ret;
+	struct page *ipage;
+
+	logfs_unpack_index(page->index, &bix, &target_level);
+	if (!bofs)
+		return logfs_read_empty(page);
+
+	if (bix >= maxbix(li->li_height))
+		return logfs_read_empty(page);
+
+	for (level = LEVEL(li->li_height);
+			(__force u8)level > (__force u8)target_level;
+			level = SUBLEVEL(level)){
+		ipage = logfs_get_page(inode, bix, level, rw_context);
+		if (!ipage)
+			return -ENOMEM;
+
+		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(ipage);
+			return ret;
+		}
+
+		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_page(ipage, rw_context);
+		if (!bofs)
+			return logfs_read_empty(page);
+	}
+
+	return logfs_segment_read(inode, page, bofs, bix, 0);
+}
+
+static int logfs_read_block(struct inode *inode, struct page *page,
+		int rw_context)
+{
+	pgoff_t index = page->index;
+
+	if (index < I0_BLOCKS)
+		return logfs_read_direct(inode, page);
+	return logfs_read_loop(inode, page, rw_context);
+}
+
+static int logfs_exist_loop(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bofs = li->li_data[INDIRECT_INDEX];
+	level_t level;
+	int ret;
+	struct page *ipage;
+
+	if (!bofs)
+		return 0;
+	if (bix >= maxbix(li->li_height))
+		return 0;
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+		ipage = logfs_get_read_page(inode, bix, level);
+		if (!ipage)
+			return -ENOMEM;
+
+		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(ipage);
+			return ret;
+		}
+
+		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_read_page(ipage);
+		if (!bofs)
+			return 0;
+	}
+
+	return 1;
+}
+
+int logfs_exist_block(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS)
+		return !!li->li_data[bix];
+	return logfs_exist_loop(inode, bix);
+}
+
+static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	for (; bix < I0_BLOCKS; bix++)
+		if (data ^ (li->li_data[bix] == 0))
+			return bix;
+	return I0_BLOCKS;
+}
+
+static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	__be64 *rblock;
+	u64 increment, bofs = li->li_data[INDIRECT_INDEX];
+	level_t level;
+	int ret, slot;
+	struct page *page;
+
+	BUG_ON(!bofs);
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+		increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
+		page = logfs_get_read_page(inode, bix, level);
+		if (!page)
+			return bix;
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(page);
+			return bix;
+		}
+
+		slot = get_bits(bix, SUBLEVEL(level));
+		rblock = kmap_atomic(page);
+		while (slot < LOGFS_BLOCK_FACTOR) {
+			if (data && (rblock[slot] != 0))
+				break;
+			if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
+				break;
+			slot++;
+			bix += increment;
+			bix &= ~(increment - 1);
+		}
+		if (slot >= LOGFS_BLOCK_FACTOR) {
+			kunmap_atomic(rblock);
+			logfs_put_read_page(page);
+			return bix;
+		}
+		bofs = be64_to_cpu(rblock[slot]);
+		kunmap_atomic(rblock);
+		logfs_put_read_page(page);
+		if (!bofs) {
+			BUG_ON(data);
+			return bix;
+		}
+	}
+	return bix;
+}
+
+/**
+ * logfs_seek_hole - find next hole starting at a given block index
+ * @inode:		inode to search in
+ * @bix:		block index to start searching
+ *
+ * Returns next hole.  If the file doesn't contain any further holes, the
+ * block address next to eof is returned instead.
+ */
+u64 logfs_seek_hole(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS) {
+		bix = seek_holedata_direct(inode, bix, 0);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+	if (!li->li_data[INDIRECT_INDEX])
+		return bix;
+	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
+		bix = maxbix(li->li_height);
+	else if (bix >= maxbix(li->li_height))
+		return bix;
+	else {
+		bix = seek_holedata_loop(inode, bix, 0);
+		if (bix < maxbix(li->li_height))
+			return bix;
+		/* Should not happen anymore.  But if some port writes semi-
+		 * corrupt images (as this one used to) we might run into it.
+		 */
+		WARN_ON_ONCE(bix == maxbix(li->li_height));
+	}
+
+	return bix;
+}
+
+static u64 __logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS) {
+		bix = seek_holedata_direct(inode, bix, 1);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+	if (bix < maxbix(li->li_height)) {
+		if (!li->li_data[INDIRECT_INDEX])
+			bix = maxbix(li->li_height);
+		else
+			return seek_holedata_loop(inode, bix, 1);
+	}
+
+	return bix;
+}
+
+/**
+ * logfs_seek_data - find next data block after a given block index
+ * @inode:		inode to search in
+ * @bix:		block index to start searching
+ *
+ * Returns next data block.  If the file doesn't contain any further data
+ * blocks, the last block in the file is returned instead.
+ */
+u64 logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct super_block *sb = inode->i_sb;
+	u64 ret, end;
+
+	ret = __logfs_seek_data(inode, bix);
+	end = i_size_read(inode) >> sb->s_blocksize_bits;
+	if (ret >= end)
+		ret = max(bix, end);
+	return ret;
+}
+
+static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
+{
+	return pure_ofs(li->li_data[bix]) == ofs;
+}
+
+static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
+		u64 ofs, u64 bofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	level_t level;
+	int ret;
+	struct page *page;
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
+		page = logfs_get_write_page(inode, bix, level);
+		BUG_ON(!page);
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_write_page(page);
+			return 0;
+		}
+
+		bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_write_page(page);
+		if (!bofs)
+			return 0;
+
+		if (pure_ofs(bofs) == ofs)
+			return 1;
+	}
+	return 0;
+}
+
+static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bofs = li->li_data[INDIRECT_INDEX];
+
+	if (!bofs)
+		return 0;
+
+	if (bix >= maxbix(li->li_height))
+		return 0;
+
+	if (pure_ofs(bofs) == ofs)
+		return 1;
+
+	return __logfs_is_valid_loop(inode, bix, ofs, bofs);
+}
+
+static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
+		return 0;
+
+	if (bix < I0_BLOCKS)
+		return logfs_is_valid_direct(li, bix, ofs);
+	return logfs_is_valid_loop(inode, bix, ofs);
+}
+
+/**
+ * logfs_is_valid_block - check whether this block is still valid
+ *
+ * @sb:		superblock
+ * @ofs:	block physical offset
+ * @ino:	block inode number
+ * @bix:	block index
+ * @gc_level:	block level
+ *
+ * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
+ * become invalid once the journal is written.
+ */
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+		gc_level_t gc_level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	int ret, cookie;
+
+	/* Umount closes a segment with free blocks remaining.  Those
+	 * blocks are by definition invalid. */
+	if (ino == -1)
+		return 0;
+
+	LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
+
+	inode = logfs_safe_iget(sb, ino, &cookie);
+	if (IS_ERR(inode))
+		goto invalid;
+
+	ret = __logfs_is_valid_block(inode, bix, ofs);
+	logfs_safe_iput(inode, cookie);
+	if (ret)
+		return ret;
+
+invalid:
+	/* Block is nominally invalid, but may still sit in the shadow tree,
+	 * waiting for a journal commit.
+	 */
+	if (btree_lookup64(&super->s_shadow_tree.old, ofs))
+		return 2;
+	return 0;
+}
+
+int logfs_readpage_nolock(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int ret = -EIO;
+
+	ret = logfs_read_block(inode, page, READ);
+
+	if (ret) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+	flush_dcache_page(page);
+
+	return ret;
+}
+
+static int logfs_reserve_bytes(struct inode *inode, int bytes)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	u64 available = super->s_free_bytes + super->s_dirty_free_bytes
+			- super->s_dirty_used_bytes - super->s_dirty_pages;
+
+	if (!bytes)
+		return 0;
+
+	if (available < bytes)
+		return -ENOSPC;
+
+	if (available < bytes + super->s_root_reserve &&
+			!capable(CAP_SYS_RESOURCE))
+		return -ENOSPC;
+
+	return 0;
+}
+
+int get_page_reserve(struct inode *inode, struct page *page)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
+	int ret;
+
+	if (block && block->reserved_bytes)
+		return 0;
+
+	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
+	while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
+			!list_empty(&super->s_writeback_list)) {
+		block = list_entry(super->s_writeback_list.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+	}
+	if (!ret) {
+		alloc_data_block(inode, page);
+		block = logfs_block(page);
+		block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+		list_move_tail(&block->alias_list, &super->s_writeback_list);
+	}
+	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
+	return ret;
+}
+
+/*
+ * We are protected by write lock.  Push victims up to superblock level
+ * and release transaction when appropriate.
+ */
+/* FIXME: This is currently called from the wrong spots. */
+static void logfs_handle_transaction(struct inode *inode,
+		struct logfs_transaction *ta)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	if (!ta)
+		return;
+	logfs_inode(inode)->li_block->ta = NULL;
+
+	if (inode->i_ino != LOGFS_INO_MASTER) {
+		BUG(); /* FIXME: Yes, this needs more thought */
+		/* just remember the transaction until inode is written */
+		//BUG_ON(logfs_inode(inode)->li_transaction);
+		//logfs_inode(inode)->li_transaction = ta;
+		return;
+	}
+
+	switch (ta->state) {
+	case CREATE_1: /* fall through */
+	case UNLINK_1:
+		BUG_ON(super->s_victim_ino);
+		super->s_victim_ino = ta->ino;
+		break;
+	case CREATE_2: /* fall through */
+	case UNLINK_2:
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_victim_ino = 0;
+		/* transaction ends here - free it */
+		kfree(ta);
+		break;
+	case CROSS_RENAME_1:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		super->s_rename_dir = ta->dir;
+		super->s_rename_pos = ta->pos;
+		break;
+	case CROSS_RENAME_2:
+		BUG_ON(super->s_rename_dir != ta->dir);
+		BUG_ON(super->s_rename_pos != ta->pos);
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		kfree(ta);
+		break;
+	case TARGET_RENAME_1:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		BUG_ON(super->s_victim_ino);
+		super->s_rename_dir = ta->dir;
+		super->s_rename_pos = ta->pos;
+		super->s_victim_ino = ta->ino;
+		break;
+	case TARGET_RENAME_2:
+		BUG_ON(super->s_rename_dir != ta->dir);
+		BUG_ON(super->s_rename_pos != ta->pos);
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		break;
+	case TARGET_RENAME_3:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_victim_ino = 0;
+		kfree(ta);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Not strictly a reservation, but rather a check that we still have enough
+ * space to satisfy the write.
+ */
+static int logfs_reserve_blocks(struct inode *inode, int blocks)
+{
+	return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
+}
+
+struct write_control {
+	u64 ofs;
+	long flags;
+};
+
+static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
+		level_t level, u64 old_ofs)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_shadow *shadow;
+
+	shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
+	memset(shadow, 0, sizeof(*shadow));
+	shadow->ino = inode->i_ino;
+	shadow->bix = bix;
+	shadow->gc_level = expand_level(inode->i_ino, level);
+	shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
+	return shadow;
+}
+
+static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	mempool_free(shadow, super->s_shadow_pool);
+}
+
+static void mark_segment(struct shadow_tree *tree, u32 segno)
+{
+	int err;
+
+	if (!btree_lookup32(&tree->segment_map, segno)) {
+		err = btree_insert32(&tree->segment_map, segno, (void *)1,
+				GFP_NOFS);
+		BUG_ON(err);
+		tree->no_shadowed_segments++;
+	}
+}
+
+/**
+ * fill_shadow_tree - Propagate shadow tree changes due to a write
+ * @inode:	Inode owning the page
+ * @page:	Struct page that was written
+ * @shadow:	Shadow for the current write
+ *
+ * Writes in logfs can result in two semi-valid objects.  The old object
+ * is still valid as long as it can be reached by following pointers on
+ * the medium.  Only when writes propagate all the way up to the journal
+ * has the new object safely replaced the old one.
+ *
+ * To handle this problem, a struct logfs_shadow is used to represent
+ * every single write.  It is attached to the indirect block, which is
+ * marked dirty.  When the indirect block is written, its shadows are
+ * handed up to the next indirect block (or inode).  Untimately they
+ * will reach the master inode and be freed upon journal commit.
+ *
+ * This function handles a single step in the propagation.  It adds the
+ * shadow for the current write to the tree, along with any shadows in
+ * the page's tree, in case it was an indirect block.  If a page is
+ * written, the inode parameter is left NULL, if an inode is written,
+ * the page parameter is left NULL.
+ */
+static void fill_shadow_tree(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
+	struct shadow_tree *tree = &super->s_shadow_tree;
+
+	if (PagePrivate(page)) {
+		if (block->alias_map)
+			super->s_no_object_aliases -= bitmap_weight(
+					block->alias_map, LOGFS_BLOCK_FACTOR);
+		logfs_handle_transaction(inode, block->ta);
+		block->ops->free_block(inode->i_sb, block);
+	}
+	if (shadow) {
+		if (shadow->old_ofs)
+			btree_insert64(&tree->old, shadow->old_ofs, shadow,
+					GFP_NOFS);
+		else
+			btree_insert64(&tree->new, shadow->new_ofs, shadow,
+					GFP_NOFS);
+
+		super->s_dirty_used_bytes += shadow->new_len;
+		super->s_dirty_free_bytes += shadow->old_len;
+		mark_segment(tree, shadow->old_ofs >> super->s_segshift);
+		mark_segment(tree, shadow->new_ofs >> super->s_segshift);
+	}
+}
+
+static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
+		long child_no)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
+		/* Aliases in the master inode are pointless. */
+		return;
+	}
+
+	if (!test_bit(child_no, block->alias_map)) {
+		set_bit(child_no, block->alias_map);
+		super->s_no_object_aliases++;
+	}
+	list_move_tail(&block->alias_list, &super->s_object_alias);
+}
+
+/*
+ * Object aliases can and often do change the size and occupied space of a
+ * file.  So not only do we have to change the pointers, we also have to
+ * change inode->i_size and li->li_used_bytes.  Which is done by setting
+ * another two object aliases for the inode itself.
+ */
+static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (shadow->new_len == shadow->old_len)
+		return;
+
+	alloc_inode_block(inode);
+	li->li_used_bytes += shadow->new_len - shadow->old_len;
+	__logfs_set_blocks(inode);
+	logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
+	logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
+}
+
+static int logfs_write_i0(struct inode *inode, struct page *page,
+		struct write_control *wc)
+{
+	struct logfs_shadow *shadow;
+	u64 bix;
+	level_t level;
+	int full, err = 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	if (wc->ofs == 0)
+		if (logfs_reserve_blocks(inode, 1))
+			return -ENOSPC;
+
+	shadow = alloc_shadow(inode, bix, level, wc->ofs);
+	if (wc->flags & WF_WRITE)
+		err = logfs_segment_write(inode, page, shadow);
+	if (wc->flags & WF_DELETE)
+		logfs_segment_delete(inode, shadow);
+	if (err) {
+		free_shadow(inode, shadow);
+		return err;
+	}
+
+	set_iused(inode, shadow);
+	full = 1;
+	if (level != 0) {
+		alloc_indirect_block(inode, page, 0);
+		full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
+	}
+	fill_shadow_tree(inode, page, shadow);
+	wc->ofs = shadow->new_ofs;
+	if (wc->ofs && full)
+		wc->ofs |= LOGFS_FULLY_POPULATED;
+	return 0;
+}
+
+static int logfs_write_direct(struct inode *inode, struct page *page,
+		long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc = {
+		.ofs = li->li_data[page->index],
+		.flags = flags,
+	};
+	int err;
+
+	alloc_inode_block(inode);
+
+	err = logfs_write_i0(inode, page, &wc);
+	if (err)
+		return err;
+
+	li->li_data[page->index] = wc.ofs;
+	logfs_set_alias(inode->i_sb, li->li_block,
+			page->index + INODE_POINTER_OFS);
+	return 0;
+}
+
+static int ptr_change(u64 ofs, struct page *page)
+{
+	struct logfs_block *block = logfs_block(page);
+	int empty0, empty1, full0, full1;
+
+	empty0 = ofs == 0;
+	empty1 = block->partial == 0;
+	if (empty0 != empty1)
+		return 1;
+
+	/* The !! is necessary to shrink result to int */
+	full0 = !!(ofs & LOGFS_FULLY_POPULATED);
+	full1 = block->full == LOGFS_BLOCK_FACTOR;
+	if (full0 != full1)
+		return 1;
+	return 0;
+}
+
+static int __logfs_write_rec(struct inode *inode, struct page *page,
+		struct write_control *this_wc,
+		pgoff_t bix, level_t target_level, level_t level)
+{
+	int ret, page_empty = 0;
+	int child_no = get_bits(bix, SUBLEVEL(level));
+	struct page *ipage;
+	struct write_control child_wc = {
+		.flags = this_wc->flags,
+	};
+
+	ipage = logfs_get_write_page(inode, bix, level);
+	if (!ipage)
+		return -ENOMEM;
+
+	if (this_wc->ofs) {
+		ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+		if (ret)
+			goto out;
+	} else if (!PageUptodate(ipage)) {
+		page_empty = 1;
+		logfs_read_empty(ipage);
+	}
+
+	child_wc.ofs = block_get_pointer(ipage, child_no);
+
+	if ((__force u8)level-1 > (__force u8)target_level)
+		ret = __logfs_write_rec(inode, page, &child_wc, bix,
+				target_level, SUBLEVEL(level));
+	else
+		ret = logfs_write_i0(inode, page, &child_wc);
+
+	if (ret)
+		goto out;
+
+	alloc_indirect_block(inode, ipage, page_empty);
+	block_set_pointer(ipage, child_no, child_wc.ofs);
+	/* FIXME: first condition seems superfluous */
+	if (child_wc.ofs || logfs_block(ipage)->partial)
+		this_wc->flags |= WF_WRITE;
+	/* the condition on this_wc->ofs ensures that we won't consume extra
+	 * space for indirect blocks in the future, which we cannot reserve */
+	if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
+		ret = logfs_write_i0(inode, ipage, this_wc);
+	else
+		logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
+out:
+	logfs_put_write_page(ipage);
+	return ret;
+}
+
+static int logfs_write_rec(struct inode *inode, struct page *page,
+		pgoff_t bix, level_t target_level, long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc = {
+		.ofs = li->li_data[INDIRECT_INDEX],
+		.flags = flags,
+	};
+	int ret;
+
+	alloc_inode_block(inode);
+
+	if (li->li_height > (__force u8)target_level)
+		ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
+				LEVEL(li->li_height));
+	else
+		ret = logfs_write_i0(inode, page, &wc);
+	if (ret)
+		return ret;
+
+	if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
+		li->li_data[INDIRECT_INDEX] = wc.ofs;
+		logfs_set_alias(inode->i_sb, li->li_block,
+				INDIRECT_INDEX + INODE_POINTER_OFS);
+	}
+	return ret;
+}
+
+void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	alloc_inode_block(inode);
+	logfs_inode(inode)->li_block->ta = ta;
+}
+
+void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	struct logfs_block *block = logfs_inode(inode)->li_block;
+
+	if (block && block->ta)
+		block->ta = NULL;
+}
+
+static int grow_inode(struct inode *inode, u64 bix, level_t level)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u8 height = (__force u8)level;
+	struct page *page;
+	struct write_control wc = {
+		.flags = WF_WRITE,
+	};
+	int err;
+
+	BUG_ON(height > 5 || li->li_height > 5);
+	while (height > li->li_height || bix >= maxbix(li->li_height)) {
+		page = logfs_get_write_page(inode, I0_BLOCKS + 1,
+				LEVEL(li->li_height + 1));
+		if (!page)
+			return -ENOMEM;
+		logfs_read_empty(page);
+		alloc_indirect_block(inode, page, 1);
+		block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
+		err = logfs_write_i0(inode, page, &wc);
+		logfs_put_write_page(page);
+		if (err)
+			return err;
+		li->li_data[INDIRECT_INDEX] = wc.ofs;
+		wc.ofs = 0;
+		li->li_height++;
+		logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
+	}
+	return 0;
+}
+
+static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	pgoff_t index = page->index;
+	u64 bix;
+	level_t level;
+	int err;
+
+	flags |= WF_WRITE | WF_DELETE;
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	logfs_unpack_index(index, &bix, &level);
+	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+		super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
+
+	if (index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+
+	bix = adjust_bix(bix, level);
+	err = grow_inode(inode, bix, level);
+	if (err)
+		return err;
+	return logfs_write_rec(inode, page, bix, level, flags);
+}
+
+int logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	logfs_get_wblocks(sb, page, flags & WF_LOCK);
+	ret = __logfs_write_buf(inode, page, flags);
+	logfs_put_wblocks(sb, page, flags & WF_LOCK);
+	return ret;
+}
+
+static int __logfs_delete(struct inode *inode, struct page *page)
+{
+	long flags = WF_DELETE;
+	int err;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	if (page->index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+	err = grow_inode(inode, page->index, 0);
+	if (err)
+		return err;
+	return logfs_write_rec(inode, page, page->index, 0, flags);
+}
+
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree)
+{
+	struct super_block *sb = inode->i_sb;
+	struct page *page;
+	int ret;
+
+	page = logfs_get_read_page(inode, index, 0);
+	if (!page)
+		return -ENOMEM;
+
+	logfs_get_wblocks(sb, page, 1);
+	ret = __logfs_delete(inode, page);
+	logfs_put_wblocks(sb, page, 1);
+
+	logfs_put_read_page(page);
+
+	return ret;
+}
+
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
+		gc_level_t gc_level, long flags)
+{
+	level_t level = shrink_level(gc_level);
+	struct page *page;
+	int err;
+
+	page = logfs_get_write_page(inode, bix, level);
+	if (!page)
+		return -ENOMEM;
+
+	err = logfs_segment_read(inode, page, ofs, bix, level);
+	if (!err) {
+		if (level != 0)
+			alloc_indirect_block(inode, page, 0);
+		err = logfs_write_buf(inode, page, flags);
+		if (!err && shrink_level(gc_level) == 0) {
+			/* Rewrite cannot mark the inode dirty but has to
+			 * write it immediately.
+			 * Q: Can't we just create an alias for the inode
+			 * instead?  And if not, why not?
+			 */
+			if (inode->i_ino == LOGFS_INO_MASTER)
+				logfs_write_anchor(inode->i_sb);
+			else {
+				err = __logfs_write_inode(inode, page, flags);
+			}
+		}
+	}
+	logfs_put_write_page(page);
+	return err;
+}
+
+static int truncate_data_block(struct inode *inode, struct page *page,
+		u64 ofs, struct logfs_shadow *shadow, u64 size)
+{
+	loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
+	u64 bix;
+	level_t level;
+	int err;
+
+	/* Does truncation happen within this page? */
+	if (size <= pageofs || size - pageofs >= PAGE_SIZE)
+		return 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	BUG_ON(level != 0);
+
+	err = logfs_segment_read(inode, page, ofs, bix, level);
+	if (err)
+		return err;
+
+	zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
+	return logfs_segment_write(inode, page, shadow);
+}
+
+static int logfs_truncate_i0(struct inode *inode, struct page *page,
+		struct write_control *wc, u64 size)
+{
+	struct logfs_shadow *shadow;
+	u64 bix;
+	level_t level;
+	int err = 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	BUG_ON(level != 0);
+	shadow = alloc_shadow(inode, bix, level, wc->ofs);
+
+	err = truncate_data_block(inode, page, wc->ofs, shadow, size);
+	if (err) {
+		free_shadow(inode, shadow);
+		return err;
+	}
+
+	logfs_segment_delete(inode, shadow);
+	set_iused(inode, shadow);
+	fill_shadow_tree(inode, page, shadow);
+	wc->ofs = shadow->new_ofs;
+	return 0;
+}
+
+static int logfs_truncate_direct(struct inode *inode, u64 size)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc;
+	struct page *page;
+	int e;
+	int err;
+
+	alloc_inode_block(inode);
+
+	for (e = I0_BLOCKS - 1; e >= 0; e--) {
+		if (size > (e+1) * LOGFS_BLOCKSIZE)
+			break;
+
+		wc.ofs = li->li_data[e];
+		if (!wc.ofs)
+			continue;
+
+		page = logfs_get_write_page(inode, e, 0);
+		if (!page)
+			return -ENOMEM;
+		err = logfs_segment_read(inode, page, wc.ofs, e, 0);
+		if (err) {
+			logfs_put_write_page(page);
+			return err;
+		}
+		err = logfs_truncate_i0(inode, page, &wc, size);
+		logfs_put_write_page(page);
+		if (err)
+			return err;
+
+		li->li_data[e] = wc.ofs;
+	}
+	return 0;
+}
+
+/* FIXME: these need to become per-sb once we support different blocksizes */
+static u64 __logfs_step[] = {
+	1,
+	I1_BLOCKS,
+	I2_BLOCKS,
+	I3_BLOCKS,
+};
+
+static u64 __logfs_start_index[] = {
+	I0_BLOCKS,
+	I1_BLOCKS,
+	I2_BLOCKS,
+	I3_BLOCKS
+};
+
+static inline u64 logfs_step(level_t level)
+{
+	return __logfs_step[(__force u8)level];
+}
+
+static inline u64 logfs_factor(u8 level)
+{
+	return __logfs_step[level] * LOGFS_BLOCKSIZE;
+}
+
+static inline u64 logfs_start_index(level_t level)
+{
+	return __logfs_start_index[(__force u8)level];
+}
+
+static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
+{
+	logfs_unpack_index(index, bix, level);
+	if (*bix <= logfs_start_index(SUBLEVEL(*level)))
+		*bix = 0;
+}
+
+static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
+		struct write_control *this_wc, u64 size)
+{
+	int truncate_happened = 0;
+	int e, err = 0;
+	u64 bix, child_bix, next_bix;
+	level_t level;
+	struct page *page;
+	struct write_control child_wc = { /* FIXME: flags */ };
+
+	logfs_unpack_raw_index(ipage->index, &bix, &level);
+	err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+	if (err)
+		return err;
+
+	for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
+		child_bix = bix + e * logfs_step(SUBLEVEL(level));
+		next_bix = child_bix + logfs_step(SUBLEVEL(level));
+		if (size > next_bix * LOGFS_BLOCKSIZE)
+			break;
+
+		child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
+		if (!child_wc.ofs)
+			continue;
+
+		page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
+		if (!page)
+			return -ENOMEM;
+
+		if ((__force u8)level > 1)
+			err = __logfs_truncate_rec(inode, page, &child_wc, size);
+		else
+			err = logfs_truncate_i0(inode, page, &child_wc, size);
+		logfs_put_write_page(page);
+		if (err)
+			return err;
+
+		truncate_happened = 1;
+		alloc_indirect_block(inode, ipage, 0);
+		block_set_pointer(ipage, e, child_wc.ofs);
+	}
+
+	if (!truncate_happened) {
+		printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
+		return 0;
+	}
+
+	this_wc->flags = WF_DELETE;
+	if (logfs_block(ipage)->partial)
+		this_wc->flags |= WF_WRITE;
+
+	return logfs_write_i0(inode, ipage, this_wc);
+}
+
+static int logfs_truncate_rec(struct inode *inode, u64 size)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc = {
+		.ofs = li->li_data[INDIRECT_INDEX],
+	};
+	struct page *page;
+	int err;
+
+	alloc_inode_block(inode);
+
+	if (!wc.ofs)
+		return 0;
+
+	page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
+	if (!page)
+		return -ENOMEM;
+
+	err = __logfs_truncate_rec(inode, page, &wc, size);
+	logfs_put_write_page(page);
+	if (err)
+		return err;
+
+	if (li->li_data[INDIRECT_INDEX] != wc.ofs)
+		li->li_data[INDIRECT_INDEX] = wc.ofs;
+	return 0;
+}
+
+static int __logfs_truncate(struct inode *inode, u64 size)
+{
+	int ret;
+
+	if (size >= logfs_factor(logfs_inode(inode)->li_height))
+		return 0;
+
+	ret = logfs_truncate_rec(inode, size);
+	if (ret)
+		return ret;
+
+	return logfs_truncate_direct(inode, size);
+}
+
+/*
+ * Truncate, by changing the segment file, can consume a fair amount
+ * of resources.  So back off from time to time and do some GC.
+ * 8 or 2048 blocks should be well within safety limits even if
+ * every single block resided in a different segment.
+ */
+#define TRUNCATE_STEP	(8 * 1024 * 1024)
+int logfs_truncate(struct inode *inode, u64 target)
+{
+	struct super_block *sb = inode->i_sb;
+	u64 size = i_size_read(inode);
+	int err = 0;
+
+	size = ALIGN(size, TRUNCATE_STEP);
+	while (size > target) {
+		if (size > TRUNCATE_STEP)
+			size -= TRUNCATE_STEP;
+		else
+			size = 0;
+		if (size < target)
+			size = target;
+
+		logfs_get_wblocks(sb, NULL, 1);
+		err = __logfs_truncate(inode, size);
+		if (!err)
+			err = __logfs_write_inode(inode, NULL, 0);
+		logfs_put_wblocks(sb, NULL, 1);
+	}
+
+	if (!err) {
+		err = inode_newsize_ok(inode, target);
+		if (err)
+			goto out;
+
+		truncate_setsize(inode, target);
+	}
+
+ out:
+	/* I don't trust error recovery yet. */
+	WARN_ON(err);
+	return err;
+}
+
+static void move_page_to_inode(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = logfs_block(page);
+
+	if (!block)
+		return;
+
+	log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
+			block->ino, block->bix, block->level);
+	BUG_ON(li->li_block);
+	block->ops = &inode_block_ops;
+	block->inode = inode;
+	li->li_block = block;
+
+	block->page = NULL;
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		page_cache_release(page);
+		set_page_private(page, 0);
+	}
+}
+
+static void move_inode_to_page(struct page *page, struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = li->li_block;
+
+	if (!block)
+		return;
+
+	log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
+			block->ino, block->bix, block->level);
+	BUG_ON(PagePrivate(page));
+	block->ops = &indirect_block_ops;
+	block->page = page;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		page_cache_get(page);
+		set_page_private(page, (unsigned long) block);
+	}
+
+	block->inode = NULL;
+	li->li_block = NULL;
+}
+
+int logfs_read_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *master_inode = super->s_master_inode;
+	struct page *page;
+	struct logfs_disk_inode *di;
+	u64 ino = inode->i_ino;
+
+	if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
+		return -ENODATA;
+	if (!logfs_exist_block(master_inode, ino))
+		return -ENODATA;
+
+	page = read_cache_page(master_inode->i_mapping, ino,
+			(filler_t *)logfs_readpage, NULL);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	di = kmap_atomic(page);
+	logfs_disk_to_inode(di, inode);
+	kunmap_atomic(di);
+	move_page_to_inode(inode, page);
+	page_cache_release(page);
+	return 0;
+}
+
+/* Caller must logfs_put_write_page(page); */
+static struct page *inode_to_page(struct inode *inode)
+{
+	struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
+	struct logfs_disk_inode *di;
+	struct page *page;
+
+	BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
+
+	page = logfs_get_write_page(master_inode, inode->i_ino, 0);
+	if (!page)
+		return NULL;
+
+	di = kmap_atomic(page);
+	logfs_inode_to_disk(inode, di);
+	kunmap_atomic(di);
+	move_inode_to_page(page, inode);
+	return page;
+}
+
+static int do_write_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct inode *master_inode = logfs_super(sb)->s_master_inode;
+	loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
+	struct page *page;
+	int err;
+
+	BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
+	/* FIXME: lock inode */
+
+	if (i_size_read(master_inode) < size)
+		i_size_write(master_inode, size);
+
+	/* TODO: Tell vfs this inode is clean now */
+
+	page = inode_to_page(inode);
+	if (!page)
+		return -ENOMEM;
+
+	/* FIXME: transaction is part of logfs_block now.  Is that enough? */
+	err = logfs_write_buf(master_inode, page, 0);
+	if (err)
+		move_page_to_inode(inode, page);
+
+	logfs_put_write_page(page);
+	return err;
+}
+
+static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
+		int write,
+		void (*change_se)(struct logfs_segment_entry *, long),
+		long arg)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	struct page *page;
+	struct logfs_segment_entry *se;
+	pgoff_t page_no;
+	int child_no;
+
+	page_no = segno >> (sb->s_blocksize_bits - 3);
+	child_no = segno & ((sb->s_blocksize >> 3) - 1);
+
+	inode = super->s_segfile_inode;
+	page = logfs_get_write_page(inode, page_no, 0);
+	BUG_ON(!page); /* FIXME: We need some reserve page for this case */
+	if (!PageUptodate(page))
+		logfs_read_block(inode, page, WRITE);
+
+	if (write)
+		alloc_indirect_block(inode, page, 0);
+	se = kmap_atomic(page);
+	change_se(se + child_no, arg);
+	if (write) {
+		logfs_set_alias(sb, logfs_block(page), child_no);
+		BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
+	}
+	kunmap_atomic(se);
+
+	logfs_put_write_page(page);
+}
+
+static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
+{
+	struct logfs_segment_entry *target = (void *)_target;
+
+	*target = *se;
+}
+
+void logfs_get_segment_entry(struct super_block *sb, u32 segno,
+		struct logfs_segment_entry *se)
+{
+	logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
+}
+
+static void __set_segment_used(struct logfs_segment_entry *se, long increment)
+{
+	u32 valid;
+
+	valid = be32_to_cpu(se->valid);
+	valid += increment;
+	se->valid = cpu_to_be32(valid);
+}
+
+void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno = ofs >> super->s_segshift;
+
+	if (!increment)
+		return;
+
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
+}
+
+static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
+{
+	se->ec_level = cpu_to_be32(ec_level);
+}
+
+void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
+		gc_level_t gc_level)
+{
+	u32 ec_level = ec << 4 | (__force u8)gc_level;
+
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
+}
+
+static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
+{
+	se->valid = cpu_to_be32(RESERVED);
+}
+
+void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
+{
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
+}
+
+static void __set_segment_unreserved(struct logfs_segment_entry *se,
+		long ec_level)
+{
+	se->valid = 0;
+	se->ec_level = cpu_to_be32(ec_level);
+}
+
+void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
+{
+	u32 ec_level = ec << 4;
+
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
+			ec_level);
+}
+
+int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
+{
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	logfs_get_wblocks(sb, page, flags & WF_LOCK);
+	ret = do_write_inode(inode);
+	logfs_put_wblocks(sb, page, flags & WF_LOCK);
+	return ret;
+}
+
+static int do_delete_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct inode *master_inode = logfs_super(sb)->s_master_inode;
+	struct page *page;
+	int ret;
+
+	page = logfs_get_write_page(master_inode, inode->i_ino, 0);
+	if (!page)
+		return -ENOMEM;
+
+	move_inode_to_page(page, inode);
+
+	logfs_get_wblocks(sb, page, 1);
+	ret = __logfs_delete(master_inode, page);
+	logfs_put_wblocks(sb, page, 1);
+
+	logfs_put_write_page(page);
+	return ret;
+}
+
+/*
+ * ZOMBIE inodes have already been deleted before and should remain dead,
+ * if it weren't for valid checking.  No need to kill them again here.
+ */
+void logfs_evict_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = li->li_block;
+	struct page *page;
+
+	if (!inode->i_nlink) {
+		if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+			li->li_flags |= LOGFS_IF_ZOMBIE;
+			if (i_size_read(inode) > 0)
+				logfs_truncate(inode, 0);
+			do_delete_inode(inode);
+		}
+	}
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+
+	/* Cheaper version of write_inode.  All changes are concealed in
+	 * aliases, which are moved back.  No write to the medium happens.
+	 */
+	/* Only deleted files may be dirty at this point */
+	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
+	if (!block)
+		return;
+	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
+		block->ops->free_block(inode->i_sb, block);
+		return;
+	}
+
+	page = inode_to_page(inode);
+	BUG_ON(!page); /* FIXME: Use emergency page */
+	logfs_put_write_page(page);
+}
+
+void btree_write_block(struct logfs_block *block)
+{
+	struct inode *inode;
+	struct page *page;
+	int err, cookie;
+
+	inode = logfs_safe_iget(block->sb, block->ino, &cookie);
+	page = logfs_get_write_page(inode, block->bix, block->level);
+
+	err = logfs_readpage_nolock(page);
+	BUG_ON(err);
+	BUG_ON(!PagePrivate(page));
+	BUG_ON(logfs_block(page) != block);
+	err = __logfs_write_buf(inode, page, 0);
+	BUG_ON(err);
+	BUG_ON(PagePrivate(page) || page->private);
+
+	logfs_put_write_page(page);
+	logfs_safe_iput(inode, cookie);
+}
+
+/**
+ * logfs_inode_write - write inode or dentry objects
+ *
+ * @inode:		parent inode (ifile or directory)
+ * @buf:		object to write (inode or dentry)
+ * @count:		object size
+ * @bix:		block index
+ * @flags:		write flags
+ * @shadow_tree:	shadow below this inode
+ *
+ * FIXME: All caller of this put a 200-300 byte variable on the stack,
+ * only to call here and do a memcpy from that stack variable.  A good
+ * example of wasted performance and stack space.
+ */
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+		loff_t bix, long flags, struct shadow_tree *shadow_tree)
+{
+	loff_t pos = bix << inode->i_sb->s_blocksize_bits;
+	int err;
+	struct page *page;
+	void *pagebuf;
+
+	BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
+	BUG_ON(count > LOGFS_BLOCKSIZE);
+	page = logfs_get_write_page(inode, bix, 0);
+	if (!page)
+		return -ENOMEM;
+
+	pagebuf = kmap_atomic(page);
+	memcpy(pagebuf, buf, count);
+	flush_dcache_page(page);
+	kunmap_atomic(pagebuf);
+
+	if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
+		i_size_write(inode, pos + LOGFS_BLOCKSIZE);
+
+	err = logfs_write_buf(inode, page, flags);
+	logfs_put_write_page(page);
+	return err;
+}
+
+int logfs_open_segfile(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+
+	inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	super->s_segfile_inode = inode;
+	return 0;
+}
+
+int logfs_init_rw(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int min_fill = 3 * super->s_no_blocks;
+
+	INIT_LIST_HEAD(&super->s_object_alias);
+	INIT_LIST_HEAD(&super->s_writeback_list);
+	mutex_init(&super->s_write_mutex);
+	super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
+			sizeof(struct logfs_block));
+	super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
+			sizeof(struct logfs_shadow));
+	return 0;
+}
+
+void logfs_cleanup_rw(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	logfs_mempool_destroy(super->s_block_pool);
+	logfs_mempool_destroy(super->s_shadow_pool);
+}
diff --git a/kernel/fs/logfs/segment.c b/kernel/fs/logfs/segment.c
new file mode 100644
index 000000000..7f9b096d8
--- /dev/null
+++ b/kernel/fs/logfs/segment.c
@@ -0,0 +1,961 @@
+/*
+ * fs/logfs/segment.c	- Handling the Object Store
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Object store or ostore makes up the complete device with exception of
+ * the superblock and journal areas.  Apart from its own metadata it stores
+ * three kinds of objects: inodes, dentries and blocks, both data and indirect.
+ */
+#include "logfs.h"
+#include <linux/slab.h>
+
+static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head32 *head = &super->s_reserved_segments;
+	int err;
+
+	err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
+	if (err)
+		return err;
+	logfs_super(sb)->s_bad_segments++;
+	/* FIXME: write to journal */
+	return 0;
+}
+
+int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	super->s_gec++;
+
+	return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
+			super->s_segsize, ensure_erase);
+}
+
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
+{
+	s32 ofs;
+
+	logfs_open_area(area, bytes);
+
+	ofs = area->a_used_bytes;
+	area->a_used_bytes += bytes;
+	BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
+
+	return dev_ofs(area->a_sb, area->a_segno, ofs);
+}
+
+static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
+		int use_filler)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = super->s_devops->readpage;
+	struct page *page;
+
+	BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+	if (use_filler)
+		page = read_cache_page(mapping, index, filler, sb);
+	else {
+		page = find_or_create_page(mapping, index, GFP_NOFS);
+		if (page)
+			unlock_page(page);
+	}
+	return page;
+}
+
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+		int use_filler)
+{
+	pgoff_t index = ofs >> PAGE_SHIFT;
+	struct page *page;
+	long offset = ofs & (PAGE_SIZE-1);
+	long copylen;
+
+	/* Only logfs_wbuf_recover may use len==0 */
+	BUG_ON(!len && !use_filler);
+	do {
+		copylen = min((ulong)len, PAGE_SIZE - offset);
+
+		page = get_mapping_page(area->a_sb, index, use_filler);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
+		memcpy(page_address(page) + offset, buf, copylen);
+
+		if (!PagePrivate(page)) {
+			SetPagePrivate(page);
+			page_cache_get(page);
+		}
+		page_cache_release(page);
+
+		buf += copylen;
+		len -= copylen;
+		offset = 0;
+		index++;
+	} while (len);
+	return 0;
+}
+
+static void pad_partial_page(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct page *page;
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	pgoff_t index = ofs >> PAGE_SHIFT;
+	long offset = ofs & (PAGE_SIZE-1);
+	u32 len = PAGE_SIZE - offset;
+
+	if (len % PAGE_SIZE) {
+		page = get_mapping_page(sb, index, 0);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		memset(page_address(page) + offset, 0xff, len);
+		if (!PagePrivate(page)) {
+			SetPagePrivate(page);
+			page_cache_get(page);
+		}
+		page_cache_release(page);
+	}
+}
+
+static void pad_full_pages(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	u32 len = super->s_segsize - area->a_used_bytes;
+	pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
+	pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (no_indizes) {
+		page = get_mapping_page(sb, index, 0);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
+		memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
+		if (!PagePrivate(page)) {
+			SetPagePrivate(page);
+			page_cache_get(page);
+		}
+		page_cache_release(page);
+		index++;
+		no_indizes--;
+	}
+}
+
+/*
+ * bdev_writeseg will write full pages.  Memset the tail to prevent data leaks.
+ * Also make sure we allocate (and memset) all pages for final writeout.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+	pad_partial_page(area);
+	if (final)
+		pad_full_pages(area);
+}
+
+/*
+ * We have to be careful with the alias tree.  Since lookup is done by bix,
+ * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
+ * indirect blocks.  So always use it through accessor functions.
+ */
+static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
+		level_t level)
+{
+	struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
+	pgoff_t index = logfs_pack_index(bix, level);
+
+	return btree_lookup128(head, ino, index);
+}
+
+static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, void *val)
+{
+	struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
+	pgoff_t index = logfs_pack_index(bix, level);
+
+	return btree_insert128(head, ino, index, val, GFP_NOFS);
+}
+
+static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
+		write_alias_t *write_one_alias)
+{
+	struct object_alias_item *item;
+	int err;
+
+	list_for_each_entry(item, &block->item_list, list) {
+		err = write_alias_journal(sb, block->ino, block->bix,
+				block->level, item->child_no, item->val);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static struct logfs_block_ops btree_block_ops = {
+	.write_block	= btree_write_block,
+	.free_block	= __free_block,
+	.write_alias	= btree_write_alias,
+};
+
+int logfs_load_object_aliases(struct super_block *sb,
+		struct logfs_obj_alias *oa, int count)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	struct object_alias_item *item;
+	u64 ino, bix;
+	level_t level;
+	int i, err;
+
+	super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
+	count /= sizeof(*oa);
+	for (i = 0; i < count; i++) {
+		item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
+		if (!item)
+			return -ENOMEM;
+		memset(item, 0, sizeof(*item));
+
+		super->s_no_object_aliases++;
+		item->val = oa[i].val;
+		item->child_no = be16_to_cpu(oa[i].child_no);
+
+		ino = be64_to_cpu(oa[i].ino);
+		bix = be64_to_cpu(oa[i].bix);
+		level = LEVEL(oa[i].level);
+
+		log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
+				ino, bix, level, item->child_no,
+				be64_to_cpu(item->val));
+		block = alias_tree_lookup(sb, ino, bix, level);
+		if (!block) {
+			block = __alloc_block(sb, ino, bix, level);
+			block->ops = &btree_block_ops;
+			err = alias_tree_insert(sb, ino, bix, level, block);
+			BUG_ON(err); /* mempool empty */
+		}
+		if (test_and_set_bit(item->child_no, block->alias_map)) {
+			printk(KERN_ERR"LogFS: Alias collision detected\n");
+			return -EIO;
+		}
+		list_move_tail(&block->alias_list, &super->s_object_alias);
+		list_add(&item->list, &block->item_list);
+	}
+	return 0;
+}
+
+static void kill_alias(void *_block, unsigned long ignore0,
+		u64 ignore1, u64 ignore2, size_t ignore3)
+{
+	struct logfs_block *block = _block;
+	struct super_block *sb = block->sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct object_alias_item *item;
+
+	while (!list_empty(&block->item_list)) {
+		item = list_entry(block->item_list.next, typeof(*item), list);
+		list_del(&item->list);
+		mempool_free(item, super->s_alias_pool);
+	}
+	block->ops->free_block(sb, block);
+}
+
+static int obj_type(struct inode *inode, level_t level)
+{
+	if (level == 0) {
+		if (S_ISDIR(inode->i_mode))
+			return OBJ_DENTRY;
+		if (inode->i_ino == LOGFS_INO_MASTER)
+			return OBJ_INODE;
+	}
+	return OBJ_BLOCK;
+}
+
+static int obj_len(struct super_block *sb, int obj_type)
+{
+	switch (obj_type) {
+	case OBJ_DENTRY:
+		return sizeof(struct logfs_disk_dentry);
+	case OBJ_INODE:
+		return sizeof(struct logfs_disk_inode);
+	case OBJ_BLOCK:
+		return sb->s_blocksize;
+	default:
+		BUG();
+	}
+}
+
+static int __logfs_segment_write(struct inode *inode, void *buf,
+		struct logfs_shadow *shadow, int type, int len, int compr)
+{
+	struct logfs_area *area;
+	struct super_block *sb = inode->i_sb;
+	s64 ofs;
+	struct logfs_object_header h;
+	int acc_len;
+
+	if (shadow->gc_level == 0)
+		acc_len = len;
+	else
+		acc_len = obj_len(sb, type);
+
+	area = get_area(sb, shadow->gc_level);
+	ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
+	LOGFS_BUG_ON(ofs <= 0, sb);
+	/*
+	 * Order is important.  logfs_get_free_bytes(), by modifying the
+	 * segment file, may modify the content of the very page we're about
+	 * to write now.  Which is fine, as long as the calculated crc and
+	 * written data still match.  So do the modifications _before_
+	 * calculating the crc.
+	 */
+
+	h.len	= cpu_to_be16(len);
+	h.type	= type;
+	h.compr	= compr;
+	h.ino	= cpu_to_be64(inode->i_ino);
+	h.bix	= cpu_to_be64(shadow->bix);
+	h.crc	= logfs_crc32(&h, sizeof(h) - 4, 4);
+	h.data_crc = logfs_crc32(buf, len, 0);
+
+	logfs_buf_write(area, ofs, &h, sizeof(h));
+	logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
+
+	shadow->new_ofs = ofs;
+	shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
+
+	return 0;
+}
+
+static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
+		struct logfs_shadow *shadow, int type, int len)
+{
+	struct super_block *sb = inode->i_sb;
+	void *compressor_buf = logfs_super(sb)->s_compressed_je;
+	ssize_t compr_len;
+	int ret;
+
+	mutex_lock(&logfs_super(sb)->s_journal_mutex);
+	compr_len = logfs_compress(buf, compressor_buf, len, len);
+
+	if (compr_len >= 0) {
+		ret = __logfs_segment_write(inode, compressor_buf, shadow,
+				type, compr_len, COMPR_ZLIB);
+	} else {
+		ret = __logfs_segment_write(inode, buf, shadow, type, len,
+				COMPR_NONE);
+	}
+	mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+	return ret;
+}
+
+/**
+ * logfs_segment_write - write data block to object store
+ * @inode:		inode containing data
+ *
+ * Returns an errno or zero.
+ */
+int logfs_segment_write(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	int do_compress, type, len;
+	int ret;
+	void *buf;
+
+	super->s_flags |= LOGFS_SB_FLAG_DIRTY;
+	BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+	do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
+	if (shadow->gc_level != 0) {
+		/* temporarily disable compression for indirect blocks */
+		do_compress = 0;
+	}
+
+	type = obj_type(inode, shrink_level(shadow->gc_level));
+	len = obj_len(sb, type);
+	buf = kmap(page);
+	if (do_compress)
+		ret = logfs_segment_write_compress(inode, buf, shadow, type,
+				len);
+	else
+		ret = __logfs_segment_write(inode, buf, shadow, type, len,
+				COMPR_NONE);
+	kunmap(page);
+
+	log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
+			shadow->ino, shadow->bix, shadow->gc_level,
+			shadow->old_ofs, shadow->new_ofs,
+			shadow->old_len, shadow->new_len);
+	/* this BUG_ON did catch a locking bug.  useful */
+	BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
+	return ret;
+}
+
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
+{
+	pgoff_t index = ofs >> PAGE_SHIFT;
+	struct page *page;
+	long offset = ofs & (PAGE_SIZE-1);
+	long copylen;
+
+	while (len) {
+		copylen = min((ulong)len, PAGE_SIZE - offset);
+
+		page = get_mapping_page(sb, index, 1);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		memcpy(buf, page_address(page) + offset, copylen);
+		page_cache_release(page);
+
+		buf += copylen;
+		len -= copylen;
+		offset = 0;
+		index++;
+	}
+	return 0;
+}
+
+/*
+ * The "position" of indirect blocks is ambiguous.  It can be the position
+ * of any data block somewhere behind this indirect block.  So we need to
+ * normalize the positions through logfs_block_mask() before comparing.
+ */
+static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
+{
+	return	(pos1 & logfs_block_mask(sb, level)) !=
+		(pos2 & logfs_block_mask(sb, level));
+}
+
+#if 0
+static int read_seg_header(struct super_block *sb, u64 ofs,
+		struct logfs_segment_header *sh)
+{
+	__be32 crc;
+	int err;
+
+	err = wbuf_read(sb, ofs, sizeof(*sh), sh);
+	if (err)
+		return err;
+	crc = logfs_crc32(sh, sizeof(*sh), 4);
+	if (crc != sh->crc) {
+		printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
+				"got %x\n", ofs, be32_to_cpu(sh->crc),
+				be32_to_cpu(crc));
+		return -EIO;
+	}
+	return 0;
+}
+#endif
+
+static int read_obj_header(struct super_block *sb, u64 ofs,
+		struct logfs_object_header *oh)
+{
+	__be32 crc;
+	int err;
+
+	err = wbuf_read(sb, ofs, sizeof(*oh), oh);
+	if (err)
+		return err;
+	crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
+	if (crc != oh->crc) {
+		printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
+				"got %x\n", ofs, be32_to_cpu(oh->crc),
+				be32_to_cpu(crc));
+		return -EIO;
+	}
+	return 0;
+}
+
+static void move_btree_to_page(struct inode *inode, struct page *page,
+		__be64 *data)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head128 *head = &super->s_object_alias_tree;
+	struct logfs_block *block;
+	struct object_alias_item *item, *next;
+
+	if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
+		return;
+
+	block = btree_remove128(head, inode->i_ino, page->index);
+	if (!block)
+		return;
+
+	log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
+			block->ino, block->bix, block->level);
+	list_for_each_entry_safe(item, next, &block->item_list, list) {
+		data[item->child_no] = item->val;
+		list_del(&item->list);
+		mempool_free(item, super->s_alias_pool);
+	}
+	block->page = page;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		page_cache_get(page);
+		set_page_private(page, (unsigned long) block);
+	}
+	block->ops = &indirect_block_ops;
+	initialize_block_counters(page, block, data, 0);
+}
+
+/*
+ * This silences a false, yet annoying gcc warning.  I hate it when my editor
+ * jumps into bitops.h each time I recompile this file.
+ * TODO: Complain to gcc folks about this and upgrade compiler.
+ */
+static unsigned long fnb(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
+{
+	return find_next_bit(addr, size, offset);
+}
+
+void move_page_to_btree(struct page *page)
+{
+	struct logfs_block *block = logfs_block(page);
+	struct super_block *sb = block->sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct object_alias_item *item;
+	unsigned long pos;
+	__be64 *child;
+	int err;
+
+	if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
+		block->ops->free_block(sb, block);
+		return;
+	}
+	log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
+			block->ino, block->bix, block->level);
+	super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_BLOCK_FACTOR)
+			break;
+
+		item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
+		BUG_ON(!item); /* mempool empty */
+		memset(item, 0, sizeof(*item));
+
+		child = kmap_atomic(page);
+		item->val = child[pos];
+		kunmap_atomic(child);
+		item->child_no = pos;
+		list_add(&item->list, &block->item_list);
+	}
+	block->page = NULL;
+
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		page_cache_release(page);
+		set_page_private(page, 0);
+	}
+	block->ops = &btree_block_ops;
+	err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
+			block);
+	BUG_ON(err); /* mempool empty */
+	ClearPageUptodate(page);
+}
+
+static int __logfs_segment_read(struct inode *inode, void *buf,
+		u64 ofs, u64 bix, level_t level)
+{
+	struct super_block *sb = inode->i_sb;
+	void *compressor_buf = logfs_super(sb)->s_compressed_je;
+	struct logfs_object_header oh;
+	__be32 crc;
+	u16 len;
+	int err, block_len;
+
+	block_len = obj_len(sb, obj_type(inode, level));
+	err = read_obj_header(sb, ofs, &oh);
+	if (err)
+		goto out_err;
+
+	err = -EIO;
+	if (be64_to_cpu(oh.ino) != inode->i_ino
+			|| check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
+		printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
+				"expected (%lx, %llx), got (%llx, %llx)\n",
+				ofs, inode->i_ino, bix,
+				be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
+		goto out_err;
+	}
+
+	len = be16_to_cpu(oh.len);
+
+	switch (oh.compr) {
+	case COMPR_NONE:
+		err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
+		if (err)
+			goto out_err;
+		crc = logfs_crc32(buf, len, 0);
+		if (crc != oh.data_crc) {
+			printk(KERN_ERR"LOGFS: uncompressed data crc error at "
+					"%llx: expected %x, got %x\n", ofs,
+					be32_to_cpu(oh.data_crc),
+					be32_to_cpu(crc));
+			goto out_err;
+		}
+		break;
+	case COMPR_ZLIB:
+		mutex_lock(&logfs_super(sb)->s_journal_mutex);
+		err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
+				compressor_buf);
+		if (err) {
+			mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+			goto out_err;
+		}
+		crc = logfs_crc32(compressor_buf, len, 0);
+		if (crc != oh.data_crc) {
+			printk(KERN_ERR"LOGFS: compressed data crc error at "
+					"%llx: expected %x, got %x\n", ofs,
+					be32_to_cpu(oh.data_crc),
+					be32_to_cpu(crc));
+			mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+			goto out_err;
+		}
+		err = logfs_uncompress(compressor_buf, buf, len, block_len);
+		mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+		if (err) {
+			printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
+			goto out_err;
+		}
+		break;
+	default:
+		LOGFS_BUG(sb);
+		err = -EIO;
+		goto out_err;
+	}
+	return 0;
+
+out_err:
+	logfs_set_ro(sb);
+	printk(KERN_ERR"LOGFS: device is read-only now\n");
+	LOGFS_BUG(sb);
+	return err;
+}
+
+/**
+ * logfs_segment_read - read data block from object store
+ * @inode:		inode containing data
+ * @buf:		data buffer
+ * @ofs:		physical data offset
+ * @bix:		block index
+ * @level:		block level
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int logfs_segment_read(struct inode *inode, struct page *page,
+		u64 ofs, u64 bix, level_t level)
+{
+	int err;
+	void *buf;
+
+	if (PageUptodate(page))
+		return 0;
+
+	ofs &= ~LOGFS_FULLY_POPULATED;
+
+	buf = kmap(page);
+	err = __logfs_segment_read(inode, buf, ofs, bix, level);
+	if (!err) {
+		move_btree_to_page(inode, page, buf);
+		SetPageUptodate(page);
+	}
+	kunmap(page);
+	log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
+			inode->i_ino, bix, level, ofs, err);
+	return err;
+}
+
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_object_header h;
+	u16 len;
+	int err;
+
+	super->s_flags |= LOGFS_SB_FLAG_DIRTY;
+	BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+	BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
+	if (!shadow->old_ofs)
+		return 0;
+
+	log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
+			shadow->ino, shadow->bix, shadow->gc_level,
+			shadow->old_ofs, shadow->new_ofs,
+			shadow->old_len, shadow->new_len);
+	err = read_obj_header(sb, shadow->old_ofs, &h);
+	LOGFS_BUG_ON(err, sb);
+	LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
+	LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
+				shrink_level(shadow->gc_level)), sb);
+
+	if (shadow->gc_level == 0)
+		len = be16_to_cpu(h.len);
+	else
+		len = obj_len(sb, h.type);
+	shadow->old_len = len + sizeof(h);
+	return 0;
+}
+
+void freeseg(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	struct page *page;
+	u64 ofs, start, end;
+
+	start = dev_ofs(sb, segno, 0);
+	end = dev_ofs(sb, segno + 1, 0);
+	for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
+		page = find_get_page(mapping, ofs >> PAGE_SHIFT);
+		if (!page)
+			continue;
+		if (PagePrivate(page)) {
+			ClearPagePrivate(page);
+			page_cache_release(page);
+		}
+		page_cache_release(page);
+	}
+}
+
+int logfs_open_area(struct logfs_area *area, size_t bytes)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	int err, closed = 0;
+
+	if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
+		return 0;
+
+	if (area->a_is_open) {
+		u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+		u32 len = super->s_segsize - area->a_written_bytes;
+
+		log_gc("logfs_close_area(%x)\n", area->a_segno);
+		pad_wbuf(area, 1);
+		super->s_devops->writeseg(area->a_sb, ofs, len);
+		freeseg(sb, area->a_segno);
+		closed = 1;
+	}
+
+	area->a_used_bytes = 0;
+	area->a_written_bytes = 0;
+again:
+	area->a_ops->get_free_segment(area);
+	area->a_ops->get_erase_count(area);
+
+	log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
+	err = area->a_ops->erase_segment(area);
+	if (err) {
+		printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
+				area->a_segno);
+		logfs_mark_segment_bad(sb, area->a_segno);
+		goto again;
+	}
+	area->a_is_open = 1;
+	return closed;
+}
+
+void logfs_sync_area(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+	u32 len = (area->a_used_bytes - area->a_written_bytes);
+
+	if (super->s_writesize)
+		len &= ~(super->s_writesize - 1);
+	if (len == 0)
+		return;
+	pad_wbuf(area, 0);
+	super->s_devops->writeseg(sb, ofs, len);
+	area->a_written_bytes += len;
+}
+
+void logfs_sync_segments(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	for_each_area(i)
+		logfs_sync_area(super->s_area[i]);
+}
+
+/*
+ * Pick a free segment to be used for this area.  Effectively takes a
+ * candidate from the free list (not really a candidate anymore).
+ */
+static void ostore_get_free_segment(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	if (super->s_free_list.count == 0) {
+		printk(KERN_ERR"LOGFS: ran out of free segments\n");
+		LOGFS_BUG(sb);
+	}
+
+	area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
+}
+
+static void ostore_get_erase_count(struct logfs_area *area)
+{
+	struct logfs_segment_entry se;
+	u32 ec_level;
+
+	logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
+	BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
+			se.valid == cpu_to_be32(RESERVED));
+
+	ec_level = be32_to_cpu(se.ec_level);
+	area->a_erase_count = (ec_level >> 4) + 1;
+}
+
+static int ostore_erase_segment(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_segment_header sh;
+	u64 ofs;
+	int err;
+
+	err = logfs_erase_segment(sb, area->a_segno, 0);
+	if (err)
+		return err;
+
+	sh.pad = 0;
+	sh.type = SEG_OSTORE;
+	sh.level = (__force u8)area->a_level;
+	sh.segno = cpu_to_be32(area->a_segno);
+	sh.ec = cpu_to_be32(area->a_erase_count);
+	sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
+	sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
+
+	logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
+			area->a_level);
+
+	ofs = dev_ofs(sb, area->a_segno, 0);
+	area->a_used_bytes = sizeof(sh);
+	logfs_buf_write(area, ofs, &sh, sizeof(sh));
+	return 0;
+}
+
+static const struct logfs_area_ops ostore_area_ops = {
+	.get_free_segment	= ostore_get_free_segment,
+	.get_erase_count	= ostore_get_erase_count,
+	.erase_segment		= ostore_erase_segment,
+};
+
+static void free_area(struct logfs_area *area)
+{
+	if (area)
+		freeseg(area->a_sb, area->a_segno);
+	kfree(area);
+}
+
+void free_areas(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	for_each_area(i)
+		free_area(super->s_area[i]);
+	free_area(super->s_journal_area);
+}
+
+static struct logfs_area *alloc_area(struct super_block *sb)
+{
+	struct logfs_area *area;
+
+	area = kzalloc(sizeof(*area), GFP_KERNEL);
+	if (!area)
+		return NULL;
+
+	area->a_sb = sb;
+	return area;
+}
+
+static void map_invalidatepage(struct page *page, unsigned int o,
+			       unsigned int l)
+{
+	return;
+}
+
+static int map_releasepage(struct page *page, gfp_t g)
+{
+	/* Don't release these pages */
+	return 0;
+}
+
+static const struct address_space_operations mapping_aops = {
+	.invalidatepage = map_invalidatepage,
+	.releasepage	= map_releasepage,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+int logfs_init_mapping(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping;
+	struct inode *inode;
+
+	inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	super->s_mapping_inode = inode;
+	mapping = inode->i_mapping;
+	mapping->a_ops = &mapping_aops;
+	/* Would it be possible to use __GFP_HIGHMEM as well? */
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	return 0;
+}
+
+int logfs_init_areas(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i = -1;
+
+	super->s_alias_pool = mempool_create_kmalloc_pool(600,
+			sizeof(struct object_alias_item));
+	if (!super->s_alias_pool)
+		return -ENOMEM;
+
+	super->s_journal_area = alloc_area(sb);
+	if (!super->s_journal_area)
+		goto err;
+
+	for_each_area(i) {
+		super->s_area[i] = alloc_area(sb);
+		if (!super->s_area[i])
+			goto err;
+		super->s_area[i]->a_level = GC_LEVEL(i);
+		super->s_area[i]->a_ops = &ostore_area_ops;
+	}
+	btree_init_mempool128(&super->s_object_alias_tree,
+			super->s_btree_pool);
+	return 0;
+
+err:
+	for (i--; i >= 0; i--)
+		free_area(super->s_area[i]);
+	free_area(super->s_journal_area);
+	logfs_mempool_destroy(super->s_alias_pool);
+	return -ENOMEM;
+}
+
+void logfs_cleanup_areas(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
+}
diff --git a/kernel/fs/logfs/super.c b/kernel/fs/logfs/super.c
new file mode 100644
index 000000000..54360293b
--- /dev/null
+++ b/kernel/fs/logfs/super.c
@@ -0,0 +1,653 @@
+/*
+ * fs/logfs/super.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Generally contains mount/umount code and also serves as a dump area for
+ * any functions that don't fit elsewhere and neither justify a file of their
+ * own.
+ */
+#include "logfs.h"
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/statfs.h>
+#include <linux/buffer_head.h>
+
+static DEFINE_MUTEX(emergency_mutex);
+static struct page *emergency_page;
+
+struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
+{
+	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+	struct page *page;
+	int err;
+
+	page = read_cache_page(mapping, index, filler, NULL);
+	if (page)
+		return page;
+
+	/* No more pages available, switch to emergency page */
+	printk(KERN_INFO"Logfs: Using emergency page\n");
+	mutex_lock(&emergency_mutex);
+	err = filler(NULL, emergency_page);
+	if (err) {
+		mutex_unlock(&emergency_mutex);
+		printk(KERN_EMERG"Logfs: Error reading emergency page\n");
+		return ERR_PTR(err);
+	}
+	return emergency_page;
+}
+
+void emergency_read_end(struct page *page)
+{
+	if (page == emergency_page)
+		mutex_unlock(&emergency_mutex);
+	else
+		page_cache_release(page);
+}
+
+static void dump_segfile(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_entry se;
+	u32 segno;
+
+	for (segno = 0; segno < super->s_no_segs; segno++) {
+		logfs_get_segment_entry(sb, segno, &se);
+		printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
+				be32_to_cpu(se.valid));
+		if (++segno < super->s_no_segs) {
+			logfs_get_segment_entry(sb, segno, &se);
+			printk(" %6x %8x", be32_to_cpu(se.ec_level),
+					be32_to_cpu(se.valid));
+		}
+		if (++segno < super->s_no_segs) {
+			logfs_get_segment_entry(sb, segno, &se);
+			printk(" %6x %8x", be32_to_cpu(se.ec_level),
+					be32_to_cpu(se.valid));
+		}
+		if (++segno < super->s_no_segs) {
+			logfs_get_segment_entry(sb, segno, &se);
+			printk(" %6x %8x", be32_to_cpu(se.ec_level),
+					be32_to_cpu(se.valid));
+		}
+		printk("\n");
+	}
+}
+
+/*
+ * logfs_crash_dump - dump debug information to device
+ *
+ * The LogFS superblock only occupies part of a segment.  This function will
+ * write as much debug information as it can gather into the spare space.
+ */
+void logfs_crash_dump(struct super_block *sb)
+{
+	dump_segfile(sb);
+}
+
+/*
+ * FIXME: There should be a reserve for root, similar to ext2.
+ */
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	stats->f_type		= LOGFS_MAGIC_U32;
+	stats->f_bsize		= sb->s_blocksize;
+	stats->f_blocks		= super->s_size >> LOGFS_BLOCK_BITS >> 3;
+	stats->f_bfree		= super->s_free_bytes >> sb->s_blocksize_bits;
+	stats->f_bavail		= super->s_free_bytes >> sb->s_blocksize_bits;
+	stats->f_files		= 0;
+	stats->f_ffree		= 0;
+	stats->f_namelen	= LOGFS_MAX_NAMELEN;
+	return 0;
+}
+
+static int logfs_sb_set(struct super_block *sb, void *_super)
+{
+	struct logfs_super *super = _super;
+
+	sb->s_fs_info = super;
+	sb->s_mtd = super->s_mtd;
+	sb->s_bdev = super->s_bdev;
+#ifdef CONFIG_BLOCK
+	if (sb->s_bdev)
+		sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+#endif
+#ifdef CONFIG_MTD
+	if (sb->s_mtd)
+		sb->s_bdi = sb->s_mtd->backing_dev_info;
+#endif
+	return 0;
+}
+
+static int logfs_sb_test(struct super_block *sb, void *_super)
+{
+	struct logfs_super *super = _super;
+	struct mtd_info *mtd = super->s_mtd;
+
+	if (mtd && sb->s_mtd == mtd)
+		return 1;
+	if (super->s_bdev && sb->s_bdev == super->s_bdev)
+		return 1;
+	return 0;
+}
+
+static void set_segment_header(struct logfs_segment_header *sh, u8 type,
+		u8 level, u32 segno, u32 ec)
+{
+	sh->pad = 0;
+	sh->type = type;
+	sh->level = level;
+	sh->segno = cpu_to_be32(segno);
+	sh->ec = cpu_to_be32(ec);
+	sh->gec = cpu_to_be64(segno);
+	sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
+}
+
+static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
+		u32 segno, u32 ec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header *sh = &ds->ds_sh;
+	int i;
+
+	memset(ds, 0, sizeof(*ds));
+	set_segment_header(sh, SEG_SUPER, 0, segno, ec);
+
+	ds->ds_ifile_levels	= super->s_ifile_levels;
+	ds->ds_iblock_levels	= super->s_iblock_levels;
+	ds->ds_data_levels	= super->s_data_levels; /* XXX: Remove */
+	ds->ds_segment_shift	= super->s_segshift;
+	ds->ds_block_shift	= sb->s_blocksize_bits;
+	ds->ds_write_shift	= super->s_writeshift;
+	ds->ds_filesystem_size	= cpu_to_be64(super->s_size);
+	ds->ds_segment_size	= cpu_to_be32(super->s_segsize);
+	ds->ds_bad_seg_reserve	= cpu_to_be32(super->s_bad_seg_reserve);
+	ds->ds_feature_incompat	= cpu_to_be64(super->s_feature_incompat);
+	ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
+	ds->ds_feature_compat	= cpu_to_be64(super->s_feature_compat);
+	ds->ds_feature_flags	= cpu_to_be64(super->s_feature_flags);
+	ds->ds_root_reserve	= cpu_to_be64(super->s_root_reserve);
+	ds->ds_speed_reserve	= cpu_to_be64(super->s_speed_reserve);
+	journal_for_each(i)
+		ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
+	ds->ds_magic		= cpu_to_be64(LOGFS_MAGIC);
+	ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
+			LOGFS_SEGMENT_HEADERSIZE + 12);
+}
+
+static int write_one_sb(struct super_block *sb,
+		struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_disk_super *ds;
+	struct logfs_segment_entry se;
+	struct page *page;
+	u64 ofs;
+	u32 ec, segno;
+	int err;
+
+	page = find_sb(sb, &ofs);
+	if (!page)
+		return -EIO;
+	ds = page_address(page);
+	segno = seg_no(sb, ofs);
+	logfs_get_segment_entry(sb, segno, &se);
+	ec = be32_to_cpu(se.ec_level) >> 4;
+	ec++;
+	logfs_set_segment_erased(sb, segno, ec, 0);
+	logfs_write_ds(sb, ds, segno, ec);
+	err = super->s_devops->write_sb(sb, page);
+	page_cache_release(page);
+	return err;
+}
+
+int logfs_write_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int err;
+
+	/* First superblock */
+	err = write_one_sb(sb, super->s_devops->find_first_sb);
+	if (err)
+		return err;
+
+	/* Last superblock */
+	err = write_one_sb(sb, super->s_devops->find_last_sb);
+	if (err)
+		return err;
+	return 0;
+}
+
+static int ds_cmp(const void *ds0, const void *ds1)
+{
+	size_t len = sizeof(struct logfs_disk_super);
+
+	/* We know the segment headers differ, so ignore them */
+	len -= LOGFS_SEGMENT_HEADERSIZE;
+	ds0 += LOGFS_SEGMENT_HEADERSIZE;
+	ds1 += LOGFS_SEGMENT_HEADERSIZE;
+	return memcmp(ds0, ds1, len);
+}
+
+static int logfs_recover_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_disk_super _ds0, *ds0 = &_ds0;
+	struct logfs_disk_super _ds1, *ds1 = &_ds1;
+	int err, valid0, valid1;
+
+	/* read first superblock */
+	err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
+	if (err)
+		return err;
+	/* read last superblock */
+	err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
+	if (err)
+		return err;
+	valid0 = logfs_check_ds(ds0) == 0;
+	valid1 = logfs_check_ds(ds1) == 0;
+
+	if (!valid0 && valid1) {
+		printk(KERN_INFO"First superblock is invalid - fixing.\n");
+		return write_one_sb(sb, super->s_devops->find_first_sb);
+	}
+	if (valid0 && !valid1) {
+		printk(KERN_INFO"Last superblock is invalid - fixing.\n");
+		return write_one_sb(sb, super->s_devops->find_last_sb);
+	}
+	if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
+		printk(KERN_INFO"Superblocks don't match - fixing.\n");
+		return logfs_write_sb(sb);
+	}
+	/* If neither is valid now, something's wrong.  Didn't we properly
+	 * check them before?!? */
+	BUG_ON(!valid0 && !valid1);
+	return 0;
+}
+
+static int logfs_make_writeable(struct super_block *sb)
+{
+	int err;
+
+	err = logfs_open_segfile(sb);
+	if (err)
+		return err;
+
+	/* Repair any broken superblock copies */
+	err = logfs_recover_sb(sb);
+	if (err)
+		return err;
+
+	/* Check areas for trailing unaccounted data */
+	err = logfs_check_areas(sb);
+	if (err)
+		return err;
+
+	/* Do one GC pass before any data gets dirtied */
+	logfs_gc_pass(sb);
+
+	/* after all initializations are done, replay the journal
+	 * for rw-mounts, if necessary */
+	err = logfs_replay_journal(sb);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int logfs_get_sb_final(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *rootdir;
+	int err;
+
+	/* root dir */
+	rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
+	if (IS_ERR(rootdir))
+		goto fail;
+
+	sb->s_root = d_make_root(rootdir);
+	if (!sb->s_root)
+		goto fail;
+
+	/* at that point we know that ->put_super() will be called */
+	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
+	if (!super->s_erase_page)
+		return -ENOMEM;
+	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
+
+	/* FIXME: check for read-only mounts */
+	err = logfs_make_writeable(sb);
+	if (err) {
+		__free_page(super->s_erase_page);
+		return err;
+	}
+
+	log_super("LogFS: Finished mounting\n");
+	return 0;
+
+fail:
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
+	return -EIO;
+}
+
+int logfs_check_ds(struct logfs_disk_super *ds)
+{
+	struct logfs_segment_header *sh = &ds->ds_sh;
+
+	if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
+		return -EINVAL;
+	if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
+		return -EINVAL;
+	if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
+				LOGFS_SEGMENT_HEADERSIZE + 12))
+		return -EINVAL;
+	return 0;
+}
+
+static struct page *find_super_block(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct page *first, *last;
+
+	first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
+	if (!first || IS_ERR(first))
+		return NULL;
+	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
+	if (!last || IS_ERR(last)) {
+		page_cache_release(first);
+		return NULL;
+	}
+
+	if (!logfs_check_ds(page_address(first))) {
+		page_cache_release(last);
+		return first;
+	}
+
+	/* First one didn't work, try the second superblock */
+	if (!logfs_check_ds(page_address(last))) {
+		page_cache_release(first);
+		return last;
+	}
+
+	/* Neither worked, sorry folks */
+	page_cache_release(first);
+	page_cache_release(last);
+	return NULL;
+}
+
+static int __logfs_read_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct page *page;
+	struct logfs_disk_super *ds;
+	int i;
+
+	page = find_super_block(sb);
+	if (!page)
+		return -EINVAL;
+
+	ds = page_address(page);
+	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
+	super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
+	super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
+	super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
+	super->s_segsize = 1 << ds->ds_segment_shift;
+	super->s_segmask = (1 << ds->ds_segment_shift) - 1;
+	super->s_segshift = ds->ds_segment_shift;
+	sb->s_blocksize = 1 << ds->ds_block_shift;
+	sb->s_blocksize_bits = ds->ds_block_shift;
+	super->s_writesize = 1 << ds->ds_write_shift;
+	super->s_writeshift = ds->ds_write_shift;
+	super->s_no_segs = super->s_size >> super->s_segshift;
+	super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
+	super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
+	super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
+	super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
+	super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
+
+	journal_for_each(i)
+		super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
+
+	super->s_ifile_levels = ds->ds_ifile_levels;
+	super->s_iblock_levels = ds->ds_iblock_levels;
+	super->s_data_levels = ds->ds_data_levels;
+	super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
+		+ super->s_data_levels;
+	page_cache_release(page);
+	return 0;
+}
+
+static int logfs_read_sb(struct super_block *sb, int read_only)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int ret;
+
+	super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
+	if (!super->s_btree_pool)
+		return -ENOMEM;
+
+	btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
+	btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
+	btree_init_mempool32(&super->s_shadow_tree.segment_map,
+			super->s_btree_pool);
+
+	ret = logfs_init_mapping(sb);
+	if (ret)
+		return ret;
+
+	ret = __logfs_read_sb(sb);
+	if (ret)
+		return ret;
+
+	if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
+		return -EIO;
+	if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
+			!read_only)
+		return -EIO;
+
+	ret = logfs_init_rw(sb);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_areas(sb);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_gc(sb);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_journal(sb);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static void logfs_kill_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	log_super("LogFS: Start unmounting\n");
+	/* Alias entries slow down mount, so evict as many as possible */
+	sync_filesystem(sb);
+	logfs_write_anchor(sb);
+	free_areas(sb);
+
+	/*
+	 * From this point on alias entries are simply dropped - and any
+	 * writes to the object store are considered bugs.
+	 */
+	log_super("LogFS: Now in shutdown\n");
+	generic_shutdown_super(sb);
+	super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
+
+	BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
+
+	logfs_cleanup_gc(sb);
+	logfs_cleanup_journal(sb);
+	logfs_cleanup_areas(sb);
+	logfs_cleanup_rw(sb);
+	if (super->s_erase_page)
+		__free_page(super->s_erase_page);
+	super->s_devops->put_device(super);
+	logfs_mempool_destroy(super->s_btree_pool);
+	logfs_mempool_destroy(super->s_alias_pool);
+	kfree(super);
+	log_super("LogFS: Finished unmounting\n");
+}
+
+static struct dentry *logfs_get_sb_device(struct logfs_super *super,
+		struct file_system_type *type, int flags)
+{
+	struct super_block *sb;
+	int err = -ENOMEM;
+	static int mount_count;
+
+	log_super("LogFS: Start mount %x\n", mount_count++);
+
+	err = -EINVAL;
+	sb = sget(type, logfs_sb_test, logfs_sb_set, flags | MS_NOATIME, super);
+	if (IS_ERR(sb)) {
+		super->s_devops->put_device(super);
+		kfree(super);
+		return ERR_CAST(sb);
+	}
+
+	if (sb->s_root) {
+		/* Device is already in use */
+		super->s_devops->put_device(super);
+		kfree(super);
+		return dget(sb->s_root);
+	}
+
+	/*
+	 * sb->s_maxbytes is limited to 8TB.  On 32bit systems, the page cache
+	 * only covers 16TB and the upper 8TB are used for indirect blocks.
+	 * On 64bit system we could bump up the limit, but that would make
+	 * the filesystem incompatible with 32bit systems.
+	 */
+	sb->s_maxbytes	= (1ull << 43) - 1;
+	sb->s_max_links = LOGFS_LINK_MAX;
+	sb->s_op	= &logfs_super_operations;
+
+	err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
+	if (err)
+		goto err1;
+
+	sb->s_flags |= MS_ACTIVE;
+	err = logfs_get_sb_final(sb);
+	if (err) {
+		deactivate_locked_super(sb);
+		return ERR_PTR(err);
+	}
+	return dget(sb->s_root);
+
+err1:
+	/* no ->s_root, no ->put_super() */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
+	deactivate_locked_super(sb);
+	return ERR_PTR(err);
+}
+
+static struct dentry *logfs_mount(struct file_system_type *type, int flags,
+		const char *devname, void *data)
+{
+	ulong mtdnr;
+	struct logfs_super *super;
+	int err;
+
+	super = kzalloc(sizeof(*super), GFP_KERNEL);
+	if (!super)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&super->s_dirop_mutex);
+	mutex_init(&super->s_object_alias_mutex);
+	INIT_LIST_HEAD(&super->s_freeing_list);
+
+	if (!devname)
+		err = logfs_get_sb_bdev(super, type, devname);
+	else if (strncmp(devname, "mtd", 3))
+		err = logfs_get_sb_bdev(super, type, devname);
+	else {
+		char *garbage;
+		mtdnr = simple_strtoul(devname+3, &garbage, 0);
+		if (*garbage)
+			err = -EINVAL;
+		else
+			err = logfs_get_sb_mtd(super, mtdnr);
+	}
+
+	if (err) {
+		kfree(super);
+		return ERR_PTR(err);
+	}
+
+	return logfs_get_sb_device(super, type, flags);
+}
+
+static struct file_system_type logfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "logfs",
+	.mount		= logfs_mount,
+	.kill_sb	= logfs_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+
+};
+MODULE_ALIAS_FS("logfs");
+
+static int __init logfs_init(void)
+{
+	int ret;
+
+	emergency_page = alloc_pages(GFP_KERNEL, 0);
+	if (!emergency_page)
+		return -ENOMEM;
+
+	ret = logfs_compr_init();
+	if (ret)
+		goto out1;
+
+	ret = logfs_init_inode_cache();
+	if (ret)
+		goto out2;
+
+	ret = register_filesystem(&logfs_fs_type);
+	if (!ret)
+		return 0;
+	logfs_destroy_inode_cache();
+out2:
+	logfs_compr_exit();
+out1:
+	__free_pages(emergency_page, 0);
+	return ret;
+}
+
+static void __exit logfs_exit(void)
+{
+	unregister_filesystem(&logfs_fs_type);
+	logfs_destroy_inode_cache();
+	logfs_compr_exit();
+	__free_pages(emergency_page, 0);
+}
+
+module_init(logfs_init);
+module_exit(logfs_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
+MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/kernel/fs/mbcache.c b/kernel/fs/mbcache.c
new file mode 100644
index 000000000..187477ded
--- /dev/null
+++ b/kernel/fs/mbcache.c
@@ -0,0 +1,858 @@
+/*
+ * linux/fs/mbcache.c
+ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+/*
+ * Filesystem Meta Information Block Cache (mbcache)
+ *
+ * The mbcache caches blocks of block devices that need to be located
+ * by their device/block number, as well as by other criteria (such
+ * as the block's contents).
+ *
+ * There can only be one cache entry in a cache per device and block number.
+ * Additional indexes need not be unique in this sense. The number of
+ * additional indexes (=other criteria) can be hardwired at compile time
+ * or specified at cache create time.
+ *
+ * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
+ * in the cache. A valid entry is in the main hash tables of the cache,
+ * and may also be in the lru list. An invalid entry is not in any hashes
+ * or lists.
+ *
+ * A valid cache entry is only in the lru list if no handles refer to it.
+ * Invalid cache entries will be freed when the last handle to the cache
+ * entry is released. Entries that cannot be freed immediately are put
+ * back on the lru list.
+ */
+
+/*
+ * Lock descriptions and usage:
+ *
+ * Each hash chain of both the block and index hash tables now contains
+ * a built-in lock used to serialize accesses to the hash chain.
+ *
+ * Accesses to global data structures mb_cache_list and mb_cache_lru_list
+ * are serialized via the global spinlock mb_cache_spinlock.
+ *
+ * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
+ * accesses to its local data, such as e_used and e_queued.
+ *
+ * Lock ordering:
+ *
+ * Each block hash chain's lock has the highest lock order, followed by an
+ * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
+ * lock), and mb_cach_spinlock, with the lowest order.  While holding
+ * either a block or index hash chain lock, a thread can acquire an
+ * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
+ *
+ * Synchronization:
+ *
+ * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
+ * index hash chian, it needs to lock the corresponding hash chain.  For each
+ * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
+ * prevent either any simultaneous release or free on the entry and also
+ * to serialize accesses to either the e_used or e_queued member of the entry.
+ *
+ * To avoid having a dangling reference to an already freed
+ * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
+ * block hash chain and also no longer being referenced, both e_used,
+ * and e_queued are 0's.  When an mb_cache_entry is explicitly freed it is
+ * first removed from a block hash chain.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/list_bl.h>
+#include <linux/mbcache.h>
+#include <linux/init.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/log2.h>
+
+#ifdef MB_CACHE_DEBUG
+# define mb_debug(f...) do { \
+		printk(KERN_DEBUG f); \
+		printk("\n"); \
+	} while (0)
+#define mb_assert(c) do { if (!(c)) \
+		printk(KERN_ERR "assertion " #c " failed\n"); \
+	} while(0)
+#else
+# define mb_debug(f...) do { } while(0)
+# define mb_assert(c) do { } while(0)
+#endif
+#define mb_error(f...) do { \
+		printk(KERN_ERR f); \
+		printk("\n"); \
+	} while(0)
+
+#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
+
+#define MB_CACHE_ENTRY_LOCK_BITS	ilog2(NR_BG_LOCKS)
+#define	MB_CACHE_ENTRY_LOCK_INDEX(ce)			\
+	(hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
+
+static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
+static struct blockgroup_lock *mb_cache_bg_lock;
+static struct kmem_cache *mb_cache_kmem_cache;
+
+MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL(mb_cache_create);
+EXPORT_SYMBOL(mb_cache_shrink);
+EXPORT_SYMBOL(mb_cache_destroy);
+EXPORT_SYMBOL(mb_cache_entry_alloc);
+EXPORT_SYMBOL(mb_cache_entry_insert);
+EXPORT_SYMBOL(mb_cache_entry_release);
+EXPORT_SYMBOL(mb_cache_entry_free);
+EXPORT_SYMBOL(mb_cache_entry_get);
+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
+EXPORT_SYMBOL(mb_cache_entry_find_first);
+EXPORT_SYMBOL(mb_cache_entry_find_next);
+#endif
+
+/*
+ * Global data: list of all mbcache's, lru list, and a spinlock for
+ * accessing cache data structures on SMP machines. The lru list is
+ * global across all mbcaches.
+ */
+
+static LIST_HEAD(mb_cache_list);
+static LIST_HEAD(mb_cache_lru_list);
+static DEFINE_SPINLOCK(mb_cache_spinlock);
+
+static inline void
+__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
+{
+	spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
+		MB_CACHE_ENTRY_LOCK_INDEX(ce)));
+}
+
+static inline void
+__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
+{
+	spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
+		MB_CACHE_ENTRY_LOCK_INDEX(ce)));
+}
+
+static inline int
+__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
+{
+	return !hlist_bl_unhashed(&ce->e_block_list);
+}
+
+
+static inline void
+__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
+{
+	if (__mb_cache_entry_is_block_hashed(ce))
+		hlist_bl_del_init(&ce->e_block_list);
+}
+
+static inline int
+__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
+{
+	return !hlist_bl_unhashed(&ce->e_index.o_list);
+}
+
+static inline void
+__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
+{
+	if (__mb_cache_entry_is_index_hashed(ce))
+		hlist_bl_del_init(&ce->e_index.o_list);
+}
+
+/*
+ * __mb_cache_entry_unhash_unlock()
+ *
+ * This function is called to unhash both the block and index hash
+ * chain.
+ * It assumes both the block and index hash chain is locked upon entry.
+ * It also unlock both hash chains both exit
+ */
+static inline void
+__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
+{
+	__mb_cache_entry_unhash_index(ce);
+	hlist_bl_unlock(ce->e_index_hash_p);
+	__mb_cache_entry_unhash_block(ce);
+	hlist_bl_unlock(ce->e_block_hash_p);
+}
+
+static void
+__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
+{
+	struct mb_cache *cache = ce->e_cache;
+
+	mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
+	kmem_cache_free(cache->c_entry_cache, ce);
+	atomic_dec(&cache->c_entry_count);
+}
+
+static void
+__mb_cache_entry_release(struct mb_cache_entry *ce)
+{
+	/* First lock the entry to serialize access to its local data. */
+	__spin_lock_mb_cache_entry(ce);
+	/* Wake up all processes queuing for this cache entry. */
+	if (ce->e_queued)
+		wake_up_all(&mb_cache_queue);
+	if (ce->e_used >= MB_CACHE_WRITER)
+		ce->e_used -= MB_CACHE_WRITER;
+	/*
+	 * Make sure that all cache entries on lru_list have
+	 * both e_used and e_qued of 0s.
+	 */
+	ce->e_used--;
+	if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
+		if (!__mb_cache_entry_is_block_hashed(ce)) {
+			__spin_unlock_mb_cache_entry(ce);
+			goto forget;
+		}
+		/*
+		 * Need access to lru list, first drop entry lock,
+		 * then reacquire the lock in the proper order.
+		 */
+		spin_lock(&mb_cache_spinlock);
+		if (list_empty(&ce->e_lru_list))
+			list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
+		spin_unlock(&mb_cache_spinlock);
+	}
+	__spin_unlock_mb_cache_entry(ce);
+	return;
+forget:
+	mb_assert(list_empty(&ce->e_lru_list));
+	__mb_cache_entry_forget(ce, GFP_KERNEL);
+}
+
+/*
+ * mb_cache_shrink_scan()  memory pressure callback
+ *
+ * This function is called by the kernel memory management when memory
+ * gets low.
+ *
+ * @shrink: (ignored)
+ * @sc: shrink_control passed from reclaim
+ *
+ * Returns the number of objects freed.
+ */
+static unsigned long
+mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	LIST_HEAD(free_list);
+	struct mb_cache_entry *entry, *tmp;
+	int nr_to_scan = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
+	unsigned long freed = 0;
+
+	mb_debug("trying to free %d entries", nr_to_scan);
+	spin_lock(&mb_cache_spinlock);
+	while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
+		struct mb_cache_entry *ce =
+			list_entry(mb_cache_lru_list.next,
+				struct mb_cache_entry, e_lru_list);
+		list_del_init(&ce->e_lru_list);
+		if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
+			continue;
+		spin_unlock(&mb_cache_spinlock);
+		/* Prevent any find or get operation on the entry */
+		hlist_bl_lock(ce->e_block_hash_p);
+		hlist_bl_lock(ce->e_index_hash_p);
+		/* Ignore if it is touched by a find/get */
+		if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
+			!list_empty(&ce->e_lru_list)) {
+			hlist_bl_unlock(ce->e_index_hash_p);
+			hlist_bl_unlock(ce->e_block_hash_p);
+			spin_lock(&mb_cache_spinlock);
+			continue;
+		}
+		__mb_cache_entry_unhash_unlock(ce);
+		list_add_tail(&ce->e_lru_list, &free_list);
+		spin_lock(&mb_cache_spinlock);
+	}
+	spin_unlock(&mb_cache_spinlock);
+
+	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
+		__mb_cache_entry_forget(entry, gfp_mask);
+		freed++;
+	}
+	return freed;
+}
+
+static unsigned long
+mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct mb_cache *cache;
+	unsigned long count = 0;
+
+	spin_lock(&mb_cache_spinlock);
+	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
+		mb_debug("cache %s (%d)", cache->c_name,
+			  atomic_read(&cache->c_entry_count));
+		count += atomic_read(&cache->c_entry_count);
+	}
+	spin_unlock(&mb_cache_spinlock);
+
+	return vfs_pressure_ratio(count);
+}
+
+static struct shrinker mb_cache_shrinker = {
+	.count_objects = mb_cache_shrink_count,
+	.scan_objects = mb_cache_shrink_scan,
+	.seeks = DEFAULT_SEEKS,
+};
+
+/*
+ * mb_cache_create()  create a new cache
+ *
+ * All entries in one cache are equal size. Cache entries may be from
+ * multiple devices. If this is the first mbcache created, registers
+ * the cache with kernel memory management. Returns NULL if no more
+ * memory was available.
+ *
+ * @name: name of the cache (informal)
+ * @bucket_bits: log2(number of hash buckets)
+ */
+struct mb_cache *
+mb_cache_create(const char *name, int bucket_bits)
+{
+	int n, bucket_count = 1 << bucket_bits;
+	struct mb_cache *cache = NULL;
+
+	if (!mb_cache_bg_lock) {
+		mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
+			GFP_KERNEL);
+		if (!mb_cache_bg_lock)
+			return NULL;
+		bgl_lock_init(mb_cache_bg_lock);
+	}
+
+	cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
+	if (!cache)
+		return NULL;
+	cache->c_name = name;
+	atomic_set(&cache->c_entry_count, 0);
+	cache->c_bucket_bits = bucket_bits;
+	cache->c_block_hash = kmalloc(bucket_count *
+		sizeof(struct hlist_bl_head), GFP_KERNEL);
+	if (!cache->c_block_hash)
+		goto fail;
+	for (n=0; n<bucket_count; n++)
+		INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
+	cache->c_index_hash = kmalloc(bucket_count *
+		sizeof(struct hlist_bl_head), GFP_KERNEL);
+	if (!cache->c_index_hash)
+		goto fail;
+	for (n=0; n<bucket_count; n++)
+		INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
+	if (!mb_cache_kmem_cache) {
+		mb_cache_kmem_cache = kmem_cache_create(name,
+			sizeof(struct mb_cache_entry), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+		if (!mb_cache_kmem_cache)
+			goto fail2;
+	}
+	cache->c_entry_cache = mb_cache_kmem_cache;
+
+	/*
+	 * Set an upper limit on the number of cache entries so that the hash
+	 * chains won't grow too long.
+	 */
+	cache->c_max_entries = bucket_count << 4;
+
+	spin_lock(&mb_cache_spinlock);
+	list_add(&cache->c_cache_list, &mb_cache_list);
+	spin_unlock(&mb_cache_spinlock);
+	return cache;
+
+fail2:
+	kfree(cache->c_index_hash);
+
+fail:
+	kfree(cache->c_block_hash);
+	kfree(cache);
+	return NULL;
+}
+
+
+/*
+ * mb_cache_shrink()
+ *
+ * Removes all cache entries of a device from the cache. All cache entries
+ * currently in use cannot be freed, and thus remain in the cache. All others
+ * are freed.
+ *
+ * @bdev: which device's cache entries to shrink
+ */
+void
+mb_cache_shrink(struct block_device *bdev)
+{
+	LIST_HEAD(free_list);
+	struct list_head *l;
+	struct mb_cache_entry *ce, *tmp;
+
+	l = &mb_cache_lru_list;
+	spin_lock(&mb_cache_spinlock);
+	while (!list_is_last(l, &mb_cache_lru_list)) {
+		l = l->next;
+		ce = list_entry(l, struct mb_cache_entry, e_lru_list);
+		if (ce->e_bdev == bdev) {
+			list_del_init(&ce->e_lru_list);
+			if (ce->e_used || ce->e_queued ||
+				atomic_read(&ce->e_refcnt))
+				continue;
+			spin_unlock(&mb_cache_spinlock);
+			/*
+			 * Prevent any find or get operation on the entry.
+			 */
+			hlist_bl_lock(ce->e_block_hash_p);
+			hlist_bl_lock(ce->e_index_hash_p);
+			/* Ignore if it is touched by a find/get */
+			if (ce->e_used || ce->e_queued ||
+				atomic_read(&ce->e_refcnt) ||
+				!list_empty(&ce->e_lru_list)) {
+				hlist_bl_unlock(ce->e_index_hash_p);
+				hlist_bl_unlock(ce->e_block_hash_p);
+				l = &mb_cache_lru_list;
+				spin_lock(&mb_cache_spinlock);
+				continue;
+			}
+			__mb_cache_entry_unhash_unlock(ce);
+			mb_assert(!(ce->e_used || ce->e_queued ||
+				atomic_read(&ce->e_refcnt)));
+			list_add_tail(&ce->e_lru_list, &free_list);
+			l = &mb_cache_lru_list;
+			spin_lock(&mb_cache_spinlock);
+		}
+	}
+	spin_unlock(&mb_cache_spinlock);
+
+	list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
+		__mb_cache_entry_forget(ce, GFP_KERNEL);
+	}
+}
+
+
+/*
+ * mb_cache_destroy()
+ *
+ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
+ * and then destroys it. If this was the last mbcache, un-registers the
+ * mbcache from kernel memory management.
+ */
+void
+mb_cache_destroy(struct mb_cache *cache)
+{
+	LIST_HEAD(free_list);
+	struct mb_cache_entry *ce, *tmp;
+
+	spin_lock(&mb_cache_spinlock);
+	list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
+		if (ce->e_cache == cache)
+			list_move_tail(&ce->e_lru_list, &free_list);
+	}
+	list_del(&cache->c_cache_list);
+	spin_unlock(&mb_cache_spinlock);
+
+	list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
+		list_del_init(&ce->e_lru_list);
+		/*
+		 * Prevent any find or get operation on the entry.
+		 */
+		hlist_bl_lock(ce->e_block_hash_p);
+		hlist_bl_lock(ce->e_index_hash_p);
+		mb_assert(!(ce->e_used || ce->e_queued ||
+			atomic_read(&ce->e_refcnt)));
+		__mb_cache_entry_unhash_unlock(ce);
+		__mb_cache_entry_forget(ce, GFP_KERNEL);
+	}
+
+	if (atomic_read(&cache->c_entry_count) > 0) {
+		mb_error("cache %s: %d orphaned entries",
+			  cache->c_name,
+			  atomic_read(&cache->c_entry_count));
+	}
+
+	if (list_empty(&mb_cache_list)) {
+		kmem_cache_destroy(mb_cache_kmem_cache);
+		mb_cache_kmem_cache = NULL;
+	}
+	kfree(cache->c_index_hash);
+	kfree(cache->c_block_hash);
+	kfree(cache);
+}
+
+/*
+ * mb_cache_entry_alloc()
+ *
+ * Allocates a new cache entry. The new entry will not be valid initially,
+ * and thus cannot be looked up yet. It should be filled with data, and
+ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
+ * if no more memory was available.
+ */
+struct mb_cache_entry *
+mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
+{
+	struct mb_cache_entry *ce;
+
+	if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
+		struct list_head *l;
+
+		l = &mb_cache_lru_list;
+		spin_lock(&mb_cache_spinlock);
+		while (!list_is_last(l, &mb_cache_lru_list)) {
+			l = l->next;
+			ce = list_entry(l, struct mb_cache_entry, e_lru_list);
+			if (ce->e_cache == cache) {
+				list_del_init(&ce->e_lru_list);
+				if (ce->e_used || ce->e_queued ||
+					atomic_read(&ce->e_refcnt))
+					continue;
+				spin_unlock(&mb_cache_spinlock);
+				/*
+				 * Prevent any find or get operation on the
+				 * entry.
+				 */
+				hlist_bl_lock(ce->e_block_hash_p);
+				hlist_bl_lock(ce->e_index_hash_p);
+				/* Ignore if it is touched by a find/get */
+				if (ce->e_used || ce->e_queued ||
+					atomic_read(&ce->e_refcnt) ||
+					!list_empty(&ce->e_lru_list)) {
+					hlist_bl_unlock(ce->e_index_hash_p);
+					hlist_bl_unlock(ce->e_block_hash_p);
+					l = &mb_cache_lru_list;
+					spin_lock(&mb_cache_spinlock);
+					continue;
+				}
+				mb_assert(list_empty(&ce->e_lru_list));
+				mb_assert(!(ce->e_used || ce->e_queued ||
+					atomic_read(&ce->e_refcnt)));
+				__mb_cache_entry_unhash_unlock(ce);
+				goto found;
+			}
+		}
+		spin_unlock(&mb_cache_spinlock);
+	}
+
+	ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+	if (!ce)
+		return NULL;
+	atomic_inc(&cache->c_entry_count);
+	INIT_LIST_HEAD(&ce->e_lru_list);
+	INIT_HLIST_BL_NODE(&ce->e_block_list);
+	INIT_HLIST_BL_NODE(&ce->e_index.o_list);
+	ce->e_cache = cache;
+	ce->e_queued = 0;
+	atomic_set(&ce->e_refcnt, 0);
+found:
+	ce->e_block_hash_p = &cache->c_block_hash[0];
+	ce->e_index_hash_p = &cache->c_index_hash[0];
+	ce->e_used = 1 + MB_CACHE_WRITER;
+	return ce;
+}
+
+
+/*
+ * mb_cache_entry_insert()
+ *
+ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
+ * the cache. After this, the cache entry can be looked up, but is not yet
+ * in the lru list as the caller still holds a handle to it. Returns 0 on
+ * success, or -EBUSY if a cache entry for that device + inode exists
+ * already (this may happen after a failed lookup, but when another process
+ * has inserted the same cache entry in the meantime).
+ *
+ * @bdev: device the cache entry belongs to
+ * @block: block number
+ * @key: lookup key
+ */
+int
+mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
+		      sector_t block, unsigned int key)
+{
+	struct mb_cache *cache = ce->e_cache;
+	unsigned int bucket;
+	struct hlist_bl_node *l;
+	struct hlist_bl_head *block_hash_p;
+	struct hlist_bl_head *index_hash_p;
+	struct mb_cache_entry *lce;
+
+	mb_assert(ce);
+	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 
+			   cache->c_bucket_bits);
+	block_hash_p = &cache->c_block_hash[bucket];
+	hlist_bl_lock(block_hash_p);
+	hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
+		if (lce->e_bdev == bdev && lce->e_block == block) {
+			hlist_bl_unlock(block_hash_p);
+			return -EBUSY;
+		}
+	}
+	mb_assert(!__mb_cache_entry_is_block_hashed(ce));
+	__mb_cache_entry_unhash_block(ce);
+	__mb_cache_entry_unhash_index(ce);
+	ce->e_bdev = bdev;
+	ce->e_block = block;
+	ce->e_block_hash_p = block_hash_p;
+	ce->e_index.o_key = key;
+	hlist_bl_add_head(&ce->e_block_list, block_hash_p);
+	hlist_bl_unlock(block_hash_p);
+	bucket = hash_long(key, cache->c_bucket_bits);
+	index_hash_p = &cache->c_index_hash[bucket];
+	hlist_bl_lock(index_hash_p);
+	ce->e_index_hash_p = index_hash_p;
+	hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
+	hlist_bl_unlock(index_hash_p);
+	return 0;
+}
+
+
+/*
+ * mb_cache_entry_release()
+ *
+ * Release a handle to a cache entry. When the last handle to a cache entry
+ * is released it is either freed (if it is invalid) or otherwise inserted
+ * in to the lru list.
+ */
+void
+mb_cache_entry_release(struct mb_cache_entry *ce)
+{
+	__mb_cache_entry_release(ce);
+}
+
+
+/*
+ * mb_cache_entry_free()
+ *
+ */
+void
+mb_cache_entry_free(struct mb_cache_entry *ce)
+{
+	mb_assert(ce);
+	mb_assert(list_empty(&ce->e_lru_list));
+	hlist_bl_lock(ce->e_index_hash_p);
+	__mb_cache_entry_unhash_index(ce);
+	hlist_bl_unlock(ce->e_index_hash_p);
+	hlist_bl_lock(ce->e_block_hash_p);
+	__mb_cache_entry_unhash_block(ce);
+	hlist_bl_unlock(ce->e_block_hash_p);
+	__mb_cache_entry_release(ce);
+}
+
+
+/*
+ * mb_cache_entry_get()
+ *
+ * Get a cache entry  by device / block number. (There can only be one entry
+ * in the cache per device and block.) Returns NULL if no such cache entry
+ * exists. The returned cache entry is locked for exclusive access ("single
+ * writer").
+ */
+struct mb_cache_entry *
+mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
+		   sector_t block)
+{
+	unsigned int bucket;
+	struct hlist_bl_node *l;
+	struct mb_cache_entry *ce;
+	struct hlist_bl_head *block_hash_p;
+
+	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
+			   cache->c_bucket_bits);
+	block_hash_p = &cache->c_block_hash[bucket];
+	/* First serialize access to the block corresponding hash chain. */
+	hlist_bl_lock(block_hash_p);
+	hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
+		mb_assert(ce->e_block_hash_p == block_hash_p);
+		if (ce->e_bdev == bdev && ce->e_block == block) {
+			/*
+			 * Prevent a free from removing the entry.
+			 */
+			atomic_inc(&ce->e_refcnt);
+			hlist_bl_unlock(block_hash_p);
+			__spin_lock_mb_cache_entry(ce);
+			atomic_dec(&ce->e_refcnt);
+			if (ce->e_used > 0) {
+				DEFINE_WAIT(wait);
+				while (ce->e_used > 0) {
+					ce->e_queued++;
+					prepare_to_wait(&mb_cache_queue, &wait,
+							TASK_UNINTERRUPTIBLE);
+					__spin_unlock_mb_cache_entry(ce);
+					schedule();
+					__spin_lock_mb_cache_entry(ce);
+					ce->e_queued--;
+				}
+				finish_wait(&mb_cache_queue, &wait);
+			}
+			ce->e_used += 1 + MB_CACHE_WRITER;
+			__spin_unlock_mb_cache_entry(ce);
+
+			if (!list_empty(&ce->e_lru_list)) {
+				spin_lock(&mb_cache_spinlock);
+				list_del_init(&ce->e_lru_list);
+				spin_unlock(&mb_cache_spinlock);
+			}
+			if (!__mb_cache_entry_is_block_hashed(ce)) {
+				__mb_cache_entry_release(ce);
+				return NULL;
+			}
+			return ce;
+		}
+	}
+	hlist_bl_unlock(block_hash_p);
+	return NULL;
+}
+
+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
+
+static struct mb_cache_entry *
+__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
+		      struct block_device *bdev, unsigned int key)
+{
+
+	/* The index hash chain is alredy acquire by caller. */
+	while (l != NULL) {
+		struct mb_cache_entry *ce =
+			hlist_bl_entry(l, struct mb_cache_entry,
+				e_index.o_list);
+		mb_assert(ce->e_index_hash_p == head);
+		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
+			/*
+			 * Prevent a free from removing the entry.
+			 */
+			atomic_inc(&ce->e_refcnt);
+			hlist_bl_unlock(head);
+			__spin_lock_mb_cache_entry(ce);
+			atomic_dec(&ce->e_refcnt);
+			ce->e_used++;
+			/* Incrementing before holding the lock gives readers
+			   priority over writers. */
+			if (ce->e_used >= MB_CACHE_WRITER) {
+				DEFINE_WAIT(wait);
+
+				while (ce->e_used >= MB_CACHE_WRITER) {
+					ce->e_queued++;
+					prepare_to_wait(&mb_cache_queue, &wait,
+							TASK_UNINTERRUPTIBLE);
+					__spin_unlock_mb_cache_entry(ce);
+					schedule();
+					__spin_lock_mb_cache_entry(ce);
+					ce->e_queued--;
+				}
+				finish_wait(&mb_cache_queue, &wait);
+			}
+			__spin_unlock_mb_cache_entry(ce);
+			if (!list_empty(&ce->e_lru_list)) {
+				spin_lock(&mb_cache_spinlock);
+				list_del_init(&ce->e_lru_list);
+				spin_unlock(&mb_cache_spinlock);
+			}
+			if (!__mb_cache_entry_is_block_hashed(ce)) {
+				__mb_cache_entry_release(ce);
+				return ERR_PTR(-EAGAIN);
+			}
+			return ce;
+		}
+		l = l->next;
+	}
+	hlist_bl_unlock(head);
+	return NULL;
+}
+
+
+/*
+ * mb_cache_entry_find_first()
+ *
+ * Find the first cache entry on a given device with a certain key in
+ * an additional index. Additional matches can be found with
+ * mb_cache_entry_find_next(). Returns NULL if no match was found. The
+ * returned cache entry is locked for shared access ("multiple readers").
+ *
+ * @cache: the cache to search
+ * @bdev: the device the cache entry should belong to
+ * @key: the key in the index
+ */
+struct mb_cache_entry *
+mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
+			  unsigned int key)
+{
+	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
+	struct hlist_bl_node *l;
+	struct mb_cache_entry *ce = NULL;
+	struct hlist_bl_head *index_hash_p;
+
+	index_hash_p = &cache->c_index_hash[bucket];
+	hlist_bl_lock(index_hash_p);
+	if (!hlist_bl_empty(index_hash_p)) {
+		l = hlist_bl_first(index_hash_p);
+		ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+	} else
+		hlist_bl_unlock(index_hash_p);
+	return ce;
+}
+
+
+/*
+ * mb_cache_entry_find_next()
+ *
+ * Find the next cache entry on a given device with a certain key in an
+ * additional index. Returns NULL if no match could be found. The previous
+ * entry is atomatically released, so that mb_cache_entry_find_next() can
+ * be called like this:
+ *
+ * entry = mb_cache_entry_find_first();
+ * while (entry) {
+ * 	...
+ *	entry = mb_cache_entry_find_next(entry, ...);
+ * }
+ *
+ * @prev: The previous match
+ * @bdev: the device the cache entry should belong to
+ * @key: the key in the index
+ */
+struct mb_cache_entry *
+mb_cache_entry_find_next(struct mb_cache_entry *prev,
+			 struct block_device *bdev, unsigned int key)
+{
+	struct mb_cache *cache = prev->e_cache;
+	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
+	struct hlist_bl_node *l;
+	struct mb_cache_entry *ce;
+	struct hlist_bl_head *index_hash_p;
+
+	index_hash_p = &cache->c_index_hash[bucket];
+	mb_assert(prev->e_index_hash_p == index_hash_p);
+	hlist_bl_lock(index_hash_p);
+	mb_assert(!hlist_bl_empty(index_hash_p));
+	l = prev->e_index.o_list.next;
+	ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+	__mb_cache_entry_release(prev);
+	return ce;
+}
+
+#endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
+
+static int __init init_mbcache(void)
+{
+	register_shrinker(&mb_cache_shrinker);
+	return 0;
+}
+
+static void __exit exit_mbcache(void)
+{
+	unregister_shrinker(&mb_cache_shrinker);
+}
+
+module_init(init_mbcache)
+module_exit(exit_mbcache)
+
diff --git a/kernel/fs/minix/Kconfig b/kernel/fs/minix/Kconfig
new file mode 100644
index 000000000..f2a0cfcef
--- /dev/null
+++ b/kernel/fs/minix/Kconfig
@@ -0,0 +1,25 @@
+config MINIX_FS
+	tristate "Minix file system support"
+	depends on BLOCK
+	help
+	  Minix is a simple operating system used in many classes about OS's.
+	  The minix file system (method to organize files on a hard disk
+	  partition or a floppy disk) was the original file system for Linux,
+	  but has been superseded by the second extended file system ext2fs.
+	  You don't want to use the minix file system on your hard disk
+	  because of certain built-in restrictions, but it is sometimes found
+	  on older Linux floppy disks.  This option will enlarge your kernel
+	  by about 28 KB. If unsure, say N.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called minix.  Note that the file system of your root
+	  partition (the one containing the directory /) cannot be compiled as
+	  a module.
+
+config MINIX_FS_NATIVE_ENDIAN
+	def_bool MINIX_FS
+	depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
+
+config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
+	def_bool MINIX_FS
+	depends on M68K && MMU
diff --git a/kernel/fs/minix/Makefile b/kernel/fs/minix/Makefile
new file mode 100644
index 000000000..3063015ab
--- /dev/null
+++ b/kernel/fs/minix/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux minix filesystem routines.
+#
+
+obj-$(CONFIG_MINIX_FS) += minix.o
+
+minix-objs := bitmap.o itree_v1.o itree_v2.o namei.o inode.o file.o dir.o
diff --git a/kernel/fs/minix/bitmap.c b/kernel/fs/minix/bitmap.c
new file mode 100644
index 000000000..742942a98
--- /dev/null
+++ b/kernel/fs/minix/bitmap.c
@@ -0,0 +1,272 @@
+/*
+ *  linux/fs/minix/bitmap.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * Modified for 680x0 by Hamish Macdonald
+ * Fixed for 680x0 by Andreas Schwab
+ */
+
+/* bitmap.c contains the code that handles the inode and block bitmaps */
+
+#include "minix.h"
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <linux/sched.h>
+
+static DEFINE_SPINLOCK(bitmap_lock);
+
+/*
+ * bitmap consists of blocks filled with 16bit words
+ * bit set == busy, bit clear == free
+ * endianness is a mess, but for counting zero bits it really doesn't matter...
+ */
+static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
+{
+	__u32 sum = 0;
+	unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8);
+
+	while (blocks--) {
+		unsigned words = blocksize / 2;
+		__u16 *p = (__u16 *)(*map++)->b_data;
+		while (words--)
+			sum += 16 - hweight16(*p++);
+	}
+
+	return sum;
+}
+
+void minix_free_block(struct inode *inode, unsigned long block)
+{
+	struct super_block *sb = inode->i_sb;
+	struct minix_sb_info *sbi = minix_sb(sb);
+	struct buffer_head *bh;
+	int k = sb->s_blocksize_bits + 3;
+	unsigned long bit, zone;
+
+	if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
+		printk("Trying to free block not in datazone\n");
+		return;
+	}
+	zone = block - sbi->s_firstdatazone + 1;
+	bit = zone & ((1<<k) - 1);
+	zone >>= k;
+	if (zone >= sbi->s_zmap_blocks) {
+		printk("minix_free_block: nonexistent bitmap buffer\n");
+		return;
+	}
+	bh = sbi->s_zmap[zone];
+	spin_lock(&bitmap_lock);
+	if (!minix_test_and_clear_bit(bit, bh->b_data))
+		printk("minix_free_block (%s:%lu): bit already cleared\n",
+		       sb->s_id, block);
+	spin_unlock(&bitmap_lock);
+	mark_buffer_dirty(bh);
+	return;
+}
+
+int minix_new_block(struct inode * inode)
+{
+	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+	int bits_per_zone = 8 * inode->i_sb->s_blocksize;
+	int i;
+
+	for (i = 0; i < sbi->s_zmap_blocks; i++) {
+		struct buffer_head *bh = sbi->s_zmap[i];
+		int j;
+
+		spin_lock(&bitmap_lock);
+		j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
+		if (j < bits_per_zone) {
+			minix_set_bit(j, bh->b_data);
+			spin_unlock(&bitmap_lock);
+			mark_buffer_dirty(bh);
+			j += i * bits_per_zone + sbi->s_firstdatazone-1;
+			if (j < sbi->s_firstdatazone || j >= sbi->s_nzones)
+				break;
+			return j;
+		}
+		spin_unlock(&bitmap_lock);
+	}
+	return 0;
+}
+
+unsigned long minix_count_free_blocks(struct super_block *sb)
+{
+	struct minix_sb_info *sbi = minix_sb(sb);
+	u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1;
+
+	return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
+		<< sbi->s_log_zone_size);
+}
+
+struct minix_inode *
+minix_V1_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh)
+{
+	int block;
+	struct minix_sb_info *sbi = minix_sb(sb);
+	struct minix_inode *p;
+
+	if (!ino || ino > sbi->s_ninodes) {
+		printk("Bad inode number on dev %s: %ld is out of range\n",
+		       sb->s_id, (long)ino);
+		return NULL;
+	}
+	ino--;
+	block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks +
+		 ino / MINIX_INODES_PER_BLOCK;
+	*bh = sb_bread(sb, block);
+	if (!*bh) {
+		printk("Unable to read inode block\n");
+		return NULL;
+	}
+	p = (void *)(*bh)->b_data;
+	return p + ino % MINIX_INODES_PER_BLOCK;
+}
+
+struct minix2_inode *
+minix_V2_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh)
+{
+	int block;
+	struct minix_sb_info *sbi = minix_sb(sb);
+	struct minix2_inode *p;
+	int minix2_inodes_per_block = sb->s_blocksize / sizeof(struct minix2_inode);
+
+	*bh = NULL;
+	if (!ino || ino > sbi->s_ninodes) {
+		printk("Bad inode number on dev %s: %ld is out of range\n",
+		       sb->s_id, (long)ino);
+		return NULL;
+	}
+	ino--;
+	block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks +
+		 ino / minix2_inodes_per_block;
+	*bh = sb_bread(sb, block);
+	if (!*bh) {
+		printk("Unable to read inode block\n");
+		return NULL;
+	}
+	p = (void *)(*bh)->b_data;
+	return p + ino % minix2_inodes_per_block;
+}
+
+/* Clear the link count and mode of a deleted inode on disk. */
+
+static void minix_clear_inode(struct inode *inode)
+{
+	struct buffer_head *bh = NULL;
+
+	if (INODE_VERSION(inode) == MINIX_V1) {
+		struct minix_inode *raw_inode;
+		raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
+		if (raw_inode) {
+			raw_inode->i_nlinks = 0;
+			raw_inode->i_mode = 0;
+		}
+	} else {
+		struct minix2_inode *raw_inode;
+		raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh);
+		if (raw_inode) {
+			raw_inode->i_nlinks = 0;
+			raw_inode->i_mode = 0;
+		}
+	}
+	if (bh) {
+		mark_buffer_dirty(bh);
+		brelse (bh);
+	}
+}
+
+void minix_free_inode(struct inode * inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+	struct buffer_head *bh;
+	int k = sb->s_blocksize_bits + 3;
+	unsigned long ino, bit;
+
+	ino = inode->i_ino;
+	if (ino < 1 || ino > sbi->s_ninodes) {
+		printk("minix_free_inode: inode 0 or nonexistent inode\n");
+		return;
+	}
+	bit = ino & ((1<<k) - 1);
+	ino >>= k;
+	if (ino >= sbi->s_imap_blocks) {
+		printk("minix_free_inode: nonexistent imap in superblock\n");
+		return;
+	}
+
+	minix_clear_inode(inode);	/* clear on-disk copy */
+
+	bh = sbi->s_imap[ino];
+	spin_lock(&bitmap_lock);
+	if (!minix_test_and_clear_bit(bit, bh->b_data))
+		printk("minix_free_inode: bit %lu already cleared\n", bit);
+	spin_unlock(&bitmap_lock);
+	mark_buffer_dirty(bh);
+}
+
+struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error)
+{
+	struct super_block *sb = dir->i_sb;
+	struct minix_sb_info *sbi = minix_sb(sb);
+	struct inode *inode = new_inode(sb);
+	struct buffer_head * bh;
+	int bits_per_zone = 8 * sb->s_blocksize;
+	unsigned long j;
+	int i;
+
+	if (!inode) {
+		*error = -ENOMEM;
+		return NULL;
+	}
+	j = bits_per_zone;
+	bh = NULL;
+	*error = -ENOSPC;
+	spin_lock(&bitmap_lock);
+	for (i = 0; i < sbi->s_imap_blocks; i++) {
+		bh = sbi->s_imap[i];
+		j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
+		if (j < bits_per_zone)
+			break;
+	}
+	if (!bh || j >= bits_per_zone) {
+		spin_unlock(&bitmap_lock);
+		iput(inode);
+		return NULL;
+	}
+	if (minix_test_and_set_bit(j, bh->b_data)) {	/* shouldn't happen */
+		spin_unlock(&bitmap_lock);
+		printk("minix_new_inode: bit already set\n");
+		iput(inode);
+		return NULL;
+	}
+	spin_unlock(&bitmap_lock);
+	mark_buffer_dirty(bh);
+	j += i * bits_per_zone;
+	if (!j || j > sbi->s_ninodes) {
+		iput(inode);
+		return NULL;
+	}
+	inode_init_owner(inode, dir, mode);
+	inode->i_ino = j;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_blocks = 0;
+	memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
+	insert_inode_hash(inode);
+	mark_inode_dirty(inode);
+
+	*error = 0;
+	return inode;
+}
+
+unsigned long minix_count_free_inodes(struct super_block *sb)
+{
+	struct minix_sb_info *sbi = minix_sb(sb);
+	u32 bits = sbi->s_ninodes + 1;
+
+	return count_free(sbi->s_imap, sb->s_blocksize, bits);
+}
diff --git a/kernel/fs/minix/dir.c b/kernel/fs/minix/dir.c
new file mode 100644
index 000000000..118e4e7bc
--- /dev/null
+++ b/kernel/fs/minix/dir.c
@@ -0,0 +1,473 @@
+/*
+ *  linux/fs/minix/dir.c
+ *
+ *  Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ *  minix directory handling functions
+ *
+ *  Updated to filesystem version 3 by Daniel Aragones
+ */
+
+#include "minix.h"
+#include <linux/buffer_head.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+
+typedef struct minix_dir_entry minix_dirent;
+typedef struct minix3_dir_entry minix3_dirent;
+
+static int minix_readdir(struct file *, struct dir_context *);
+
+const struct file_operations minix_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= minix_readdir,
+	.fsync		= generic_file_fsync,
+};
+
+static inline void dir_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned
+minix_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	unsigned last_byte = PAGE_CACHE_SIZE;
+
+	if (page_nr == (inode->i_size >> PAGE_CACHE_SHIFT))
+		last_byte = inode->i_size & (PAGE_CACHE_SIZE - 1);
+	return last_byte;
+}
+
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
+
+static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	int err = 0;
+	block_write_end(NULL, mapping, pos, len, len, page, NULL);
+
+	if (pos+len > dir->i_size) {
+		i_size_write(dir, pos+len);
+		mark_inode_dirty(dir);
+	}
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+	return err;
+}
+
+static struct page * dir_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+	if (!IS_ERR(page))
+		kmap(page);
+	return page;
+}
+
+static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
+{
+	return (void*)((char*)de + sbi->s_dirsize);
+}
+
+static int minix_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	struct minix_sb_info *sbi = minix_sb(sb);
+	unsigned chunk_size = sbi->s_dirsize;
+	unsigned long npages = dir_pages(inode);
+	unsigned long pos = ctx->pos;
+	unsigned offset;
+	unsigned long n;
+
+	ctx->pos = pos = ALIGN(pos, chunk_size);
+	if (pos >= inode->i_size)
+		return 0;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+	n = pos >> PAGE_CACHE_SHIFT;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *p, *kaddr, *limit;
+		struct page *page = dir_get_page(inode, n);
+
+		if (IS_ERR(page))
+			continue;
+		kaddr = (char *)page_address(page);
+		p = kaddr+offset;
+		limit = kaddr + minix_last_byte(inode, n) - chunk_size;
+		for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
+			const char *name;
+			__u32 inumber;
+			if (sbi->s_version == MINIX_V3) {
+				minix3_dirent *de3 = (minix3_dirent *)p;
+				name = de3->name;
+				inumber = de3->inode;
+	 		} else {
+				minix_dirent *de = (minix_dirent *)p;
+				name = de->name;
+				inumber = de->inode;
+			}
+			if (inumber) {
+				unsigned l = strnlen(name, sbi->s_namelen);
+				if (!dir_emit(ctx, name, l,
+					      inumber, DT_UNKNOWN)) {
+					dir_put_page(page);
+					return 0;
+				}
+			}
+			ctx->pos += chunk_size;
+		}
+		dir_put_page(page);
+	}
+	return 0;
+}
+
+static inline int namecompare(int len, int maxlen,
+	const char * name, const char * buffer)
+{
+	if (len < maxlen && buffer[len])
+		return 0;
+	return !memcmp(name, buffer, len);
+}
+
+/*
+ *	minix_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the cache buffer in which the entry was found, and the entry
+ * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * entry - you'll have to do that yourself if you want to.
+ */
+minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
+{
+	const char * name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct inode * dir = d_inode(dentry->d_parent);
+	struct super_block * sb = dir->i_sb;
+	struct minix_sb_info * sbi = minix_sb(sb);
+	unsigned long n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	char *p;
+
+	char *namx;
+	__u32 inumber;
+	*res_page = NULL;
+
+	for (n = 0; n < npages; n++) {
+		char *kaddr, *limit;
+
+		page = dir_get_page(dir, n);
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = (char*)page_address(page);
+		limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize;
+		for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
+			if (sbi->s_version == MINIX_V3) {
+				minix3_dirent *de3 = (minix3_dirent *)p;
+				namx = de3->name;
+				inumber = de3->inode;
+ 			} else {
+				minix_dirent *de = (minix_dirent *)p;
+				namx = de->name;
+				inumber = de->inode;
+			}
+			if (!inumber)
+				continue;
+			if (namecompare(namelen, sbi->s_namelen, name, namx))
+				goto found;
+		}
+		dir_put_page(page);
+	}
+	return NULL;
+
+found:
+	*res_page = page;
+	return (minix_dirent *)p;
+}
+
+int minix_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	const char * name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct super_block * sb = dir->i_sb;
+	struct minix_sb_info * sbi = minix_sb(sb);
+	struct page *page = NULL;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr, *p;
+	minix_dirent *de;
+	minix3_dirent *de3;
+	loff_t pos;
+	int err;
+	char *namx = NULL;
+	__u32 inumber;
+
+	/*
+	 * We take care of directory expansion in the same loop
+	 * This code plays outside i_size, so it locks the page
+	 * to protect that region.
+	 */
+	for (n = 0; n <= npages; n++) {
+		char *limit, *dir_end;
+
+		page = dir_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = (char*)page_address(page);
+		dir_end = kaddr + minix_last_byte(dir, n);
+		limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize;
+		for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
+			de = (minix_dirent *)p;
+			de3 = (minix3_dirent *)p;
+			if (sbi->s_version == MINIX_V3) {
+				namx = de3->name;
+				inumber = de3->inode;
+		 	} else {
+  				namx = de->name;
+				inumber = de->inode;
+			}
+			if (p == dir_end) {
+				/* We hit i_size */
+				if (sbi->s_version == MINIX_V3)
+					de3->inode = 0;
+		 		else
+					de->inode = 0;
+				goto got_it;
+			}
+			if (!inumber)
+				goto got_it;
+			err = -EEXIST;
+			if (namecompare(namelen, sbi->s_namelen, name, namx))
+				goto out_unlock;
+		}
+		unlock_page(page);
+		dir_put_page(page);
+	}
+	BUG();
+	return -EINVAL;
+
+got_it:
+	pos = page_offset(page) + p - (char *)page_address(page);
+	err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
+	if (err)
+		goto out_unlock;
+	memcpy (namx, name, namelen);
+	if (sbi->s_version == MINIX_V3) {
+		memset (namx + namelen, 0, sbi->s_dirsize - namelen - 4);
+		de3->inode = inode->i_ino;
+	} else {
+		memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2);
+		de->inode = inode->i_ino;
+	}
+	err = dir_commit_chunk(page, pos, sbi->s_dirsize);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+out_put:
+	dir_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	char *kaddr = page_address(page);
+	loff_t pos = page_offset(page) + (char*)de - kaddr;
+	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+	unsigned len = sbi->s_dirsize;
+	int err;
+
+	lock_page(page);
+	err = minix_prepare_chunk(page, pos, len);
+	if (err == 0) {
+		if (sbi->s_version == MINIX_V3)
+			((minix3_dirent *) de)->inode = 0;
+		else
+			de->inode = 0;
+		err = dir_commit_chunk(page, pos, len);
+	} else {
+		unlock_page(page);
+	}
+	dir_put_page(page);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+	return err;
+}
+
+int minix_make_empty(struct inode *inode, struct inode *dir)
+{
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
+	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+	char *kaddr;
+	int err;
+
+	if (!page)
+		return -ENOMEM;
+	err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+	kaddr = kmap_atomic(page);
+	memset(kaddr, 0, PAGE_CACHE_SIZE);
+
+	if (sbi->s_version == MINIX_V3) {
+		minix3_dirent *de3 = (minix3_dirent *)kaddr;
+
+		de3->inode = inode->i_ino;
+		strcpy(de3->name, ".");
+		de3 = minix_next_entry(de3, sbi);
+		de3->inode = dir->i_ino;
+		strcpy(de3->name, "..");
+	} else {
+		minix_dirent *de = (minix_dirent *)kaddr;
+
+		de->inode = inode->i_ino;
+		strcpy(de->name, ".");
+		de = minix_next_entry(de, sbi);
+		de->inode = dir->i_ino;
+		strcpy(de->name, "..");
+	}
+	kunmap_atomic(kaddr);
+
+	err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int minix_empty_dir(struct inode * inode)
+{
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+	char *name;
+	__u32 inumber;
+
+	for (i = 0; i < npages; i++) {
+		char *p, *kaddr, *limit;
+
+		page = dir_get_page(inode, i);
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = (char *)page_address(page);
+		limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize;
+		for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
+			if (sbi->s_version == MINIX_V3) {
+				minix3_dirent *de3 = (minix3_dirent *)p;
+				name = de3->name;
+				inumber = de3->inode;
+			} else {
+				minix_dirent *de = (minix_dirent *)p;
+				name = de->name;
+				inumber = de->inode;
+			}
+
+			if (inumber != 0) {
+				/* check for . and .. */
+				if (name[0] != '.')
+					goto not_empty;
+				if (!name[1]) {
+					if (inumber != inode->i_ino)
+						goto not_empty;
+				} else if (name[1] != '.')
+					goto not_empty;
+				else if (name[2])
+					goto not_empty;
+			}
+		}
+		dir_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	dir_put_page(page);
+	return 0;
+}
+
+/* Releases the page */
+void minix_set_link(struct minix_dir_entry *de, struct page *page,
+	struct inode *inode)
+{
+	struct inode *dir = page->mapping->host;
+	struct minix_sb_info *sbi = minix_sb(dir->i_sb);
+	loff_t pos = page_offset(page) +
+			(char *)de-(char*)page_address(page);
+	int err;
+
+	lock_page(page);
+
+	err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
+	if (err == 0) {
+		if (sbi->s_version == MINIX_V3)
+			((minix3_dirent *) de)->inode = inode->i_ino;
+		else
+			de->inode = inode->i_ino;
+		err = dir_commit_chunk(page, pos, sbi->s_dirsize);
+	} else {
+		unlock_page(page);
+	}
+	dir_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+}
+
+struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p)
+{
+	struct page *page = dir_get_page(dir, 0);
+	struct minix_sb_info *sbi = minix_sb(dir->i_sb);
+	struct minix_dir_entry *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = minix_next_entry(page_address(page), sbi);
+		*p = page;
+	}
+	return de;
+}
+
+ino_t minix_inode_by_name(struct dentry *dentry)
+{
+	struct page *page;
+	struct minix_dir_entry *de = minix_find_entry(dentry, &page);
+	ino_t res = 0;
+
+	if (de) {
+		struct address_space *mapping = page->mapping;
+		struct inode *inode = mapping->host;
+		struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+
+		if (sbi->s_version == MINIX_V3)
+			res = ((minix3_dirent *) de)->inode;
+		else
+			res = de->inode;
+		dir_put_page(page);
+	}
+	return res;
+}
diff --git a/kernel/fs/minix/file.c b/kernel/fs/minix/file.c
new file mode 100644
index 000000000..94f0eb9a6
--- /dev/null
+++ b/kernel/fs/minix/file.c
@@ -0,0 +1,51 @@
+/*
+ *  linux/fs/minix/file.c
+ *
+ *  Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ *  minix regular file handling primitives
+ */
+
+#include "minix.h"
+
+/*
+ * We have mostly NULLs here: the current defaults are OK for
+ * the minix filesystem.
+ */
+const struct file_operations minix_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.fsync		= generic_file_fsync,
+	.splice_read	= generic_file_splice_read,
+};
+
+static int minix_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+
+		truncate_setsize(inode, attr->ia_size);
+		minix_truncate(inode);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+const struct inode_operations minix_file_inode_operations = {
+	.setattr	= minix_setattr,
+	.getattr	= minix_getattr,
+};
diff --git a/kernel/fs/minix/inode.c b/kernel/fs/minix/inode.c
new file mode 100644
index 000000000..1182d1e26
--- /dev/null
+++ b/kernel/fs/minix/inode.c
@@ -0,0 +1,690 @@
+/*
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Copyright (C) 1996  Gertjan van Wingerde
+ *	Minix V2 fs support.
+ *
+ *  Modified for 680x0 by Andreas Schwab
+ *  Updated to filesystem version 3 by Daniel Aragones
+ */
+
+#include <linux/module.h>
+#include "minix.h"
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/vfs.h>
+#include <linux/writeback.h>
+
+static int minix_write_inode(struct inode *inode,
+		struct writeback_control *wbc);
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int minix_remount (struct super_block * sb, int * flags, char * data);
+
+static void minix_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		minix_truncate(inode);
+	}
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+	if (!inode->i_nlink)
+		minix_free_inode(inode);
+}
+
+static void minix_put_super(struct super_block *sb)
+{
+	int i;
+	struct minix_sb_info *sbi = minix_sb(sb);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		if (sbi->s_version != MINIX_V3)	 /* s_state is now out from V3 sb */
+			sbi->s_ms->s_state = sbi->s_mount_state;
+		mark_buffer_dirty(sbi->s_sbh);
+	}
+	for (i = 0; i < sbi->s_imap_blocks; i++)
+		brelse(sbi->s_imap[i]);
+	for (i = 0; i < sbi->s_zmap_blocks; i++)
+		brelse(sbi->s_zmap[i]);
+	brelse (sbi->s_sbh);
+	kfree(sbi->s_imap);
+	sb->s_fs_info = NULL;
+	kfree(sbi);
+}
+
+static struct kmem_cache * minix_inode_cachep;
+
+static struct inode *minix_alloc_inode(struct super_block *sb)
+{
+	struct minix_inode_info *ei;
+	ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void minix_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(minix_inode_cachep, minix_i(inode));
+}
+
+static void minix_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, minix_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	minix_inode_cachep = kmem_cache_create("minix_inode_cache",
+					     sizeof(struct minix_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (minix_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(minix_inode_cachep);
+}
+
+static const struct super_operations minix_sops = {
+	.alloc_inode	= minix_alloc_inode,
+	.destroy_inode	= minix_destroy_inode,
+	.write_inode	= minix_write_inode,
+	.evict_inode	= minix_evict_inode,
+	.put_super	= minix_put_super,
+	.statfs		= minix_statfs,
+	.remount_fs	= minix_remount,
+};
+
+static int minix_remount (struct super_block * sb, int * flags, char * data)
+{
+	struct minix_sb_info * sbi = minix_sb(sb);
+	struct minix_super_block * ms;
+
+	sync_filesystem(sb);
+	ms = sbi->s_ms;
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		return 0;
+	if (*flags & MS_RDONLY) {
+		if (ms->s_state & MINIX_VALID_FS ||
+		    !(sbi->s_mount_state & MINIX_VALID_FS))
+			return 0;
+		/* Mounting a rw partition read-only. */
+		if (sbi->s_version != MINIX_V3)
+			ms->s_state = sbi->s_mount_state;
+		mark_buffer_dirty(sbi->s_sbh);
+	} else {
+	  	/* Mount a partition which is read-only, read-write. */
+		if (sbi->s_version != MINIX_V3) {
+			sbi->s_mount_state = ms->s_state;
+			ms->s_state &= ~MINIX_VALID_FS;
+		} else {
+			sbi->s_mount_state = MINIX_VALID_FS;
+		}
+		mark_buffer_dirty(sbi->s_sbh);
+
+		if (!(sbi->s_mount_state & MINIX_VALID_FS))
+			printk("MINIX-fs warning: remounting unchecked fs, "
+				"running fsck is recommended\n");
+		else if ((sbi->s_mount_state & MINIX_ERROR_FS))
+			printk("MINIX-fs warning: remounting fs with errors, "
+				"running fsck is recommended\n");
+	}
+	return 0;
+}
+
+static int minix_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct buffer_head *bh;
+	struct buffer_head **map;
+	struct minix_super_block *ms;
+	struct minix3_super_block *m3s = NULL;
+	unsigned long i, block;
+	struct inode *root_inode;
+	struct minix_sb_info *sbi;
+	int ret = -EINVAL;
+
+	sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	s->s_fs_info = sbi;
+
+	BUILD_BUG_ON(32 != sizeof (struct minix_inode));
+	BUILD_BUG_ON(64 != sizeof(struct minix2_inode));
+
+	if (!sb_set_blocksize(s, BLOCK_SIZE))
+		goto out_bad_hblock;
+
+	if (!(bh = sb_bread(s, 1)))
+		goto out_bad_sb;
+
+	ms = (struct minix_super_block *) bh->b_data;
+	sbi->s_ms = ms;
+	sbi->s_sbh = bh;
+	sbi->s_mount_state = ms->s_state;
+	sbi->s_ninodes = ms->s_ninodes;
+	sbi->s_nzones = ms->s_nzones;
+	sbi->s_imap_blocks = ms->s_imap_blocks;
+	sbi->s_zmap_blocks = ms->s_zmap_blocks;
+	sbi->s_firstdatazone = ms->s_firstdatazone;
+	sbi->s_log_zone_size = ms->s_log_zone_size;
+	sbi->s_max_size = ms->s_max_size;
+	s->s_magic = ms->s_magic;
+	if (s->s_magic == MINIX_SUPER_MAGIC) {
+		sbi->s_version = MINIX_V1;
+		sbi->s_dirsize = 16;
+		sbi->s_namelen = 14;
+		s->s_max_links = MINIX_LINK_MAX;
+	} else if (s->s_magic == MINIX_SUPER_MAGIC2) {
+		sbi->s_version = MINIX_V1;
+		sbi->s_dirsize = 32;
+		sbi->s_namelen = 30;
+		s->s_max_links = MINIX_LINK_MAX;
+	} else if (s->s_magic == MINIX2_SUPER_MAGIC) {
+		sbi->s_version = MINIX_V2;
+		sbi->s_nzones = ms->s_zones;
+		sbi->s_dirsize = 16;
+		sbi->s_namelen = 14;
+		s->s_max_links = MINIX2_LINK_MAX;
+	} else if (s->s_magic == MINIX2_SUPER_MAGIC2) {
+		sbi->s_version = MINIX_V2;
+		sbi->s_nzones = ms->s_zones;
+		sbi->s_dirsize = 32;
+		sbi->s_namelen = 30;
+		s->s_max_links = MINIX2_LINK_MAX;
+	} else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) {
+		m3s = (struct minix3_super_block *) bh->b_data;
+		s->s_magic = m3s->s_magic;
+		sbi->s_imap_blocks = m3s->s_imap_blocks;
+		sbi->s_zmap_blocks = m3s->s_zmap_blocks;
+		sbi->s_firstdatazone = m3s->s_firstdatazone;
+		sbi->s_log_zone_size = m3s->s_log_zone_size;
+		sbi->s_max_size = m3s->s_max_size;
+		sbi->s_ninodes = m3s->s_ninodes;
+		sbi->s_nzones = m3s->s_zones;
+		sbi->s_dirsize = 64;
+		sbi->s_namelen = 60;
+		sbi->s_version = MINIX_V3;
+		sbi->s_mount_state = MINIX_VALID_FS;
+		sb_set_blocksize(s, m3s->s_blocksize);
+		s->s_max_links = MINIX2_LINK_MAX;
+	} else
+		goto out_no_fs;
+
+	/*
+	 * Allocate the buffer map to keep the superblock small.
+	 */
+	if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
+		goto out_illegal_sb;
+	i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
+	map = kzalloc(i, GFP_KERNEL);
+	if (!map)
+		goto out_no_map;
+	sbi->s_imap = &map[0];
+	sbi->s_zmap = &map[sbi->s_imap_blocks];
+
+	block=2;
+	for (i=0 ; i < sbi->s_imap_blocks ; i++) {
+		if (!(sbi->s_imap[i]=sb_bread(s, block)))
+			goto out_no_bitmap;
+		block++;
+	}
+	for (i=0 ; i < sbi->s_zmap_blocks ; i++) {
+		if (!(sbi->s_zmap[i]=sb_bread(s, block)))
+			goto out_no_bitmap;
+		block++;
+	}
+
+	minix_set_bit(0,sbi->s_imap[0]->b_data);
+	minix_set_bit(0,sbi->s_zmap[0]->b_data);
+
+	/* Apparently minix can create filesystems that allocate more blocks for
+	 * the bitmaps than needed.  We simply ignore that, but verify it didn't
+	 * create one with not enough blocks and bail out if so.
+	 */
+	block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
+	if (sbi->s_imap_blocks < block) {
+		printk("MINIX-fs: file system does not have enough "
+				"imap blocks allocated.  Refusing to mount.\n");
+		goto out_no_bitmap;
+	}
+
+	block = minix_blocks_needed(
+			(sbi->s_nzones - sbi->s_firstdatazone + 1),
+			s->s_blocksize);
+	if (sbi->s_zmap_blocks < block) {
+		printk("MINIX-fs: file system does not have enough "
+				"zmap blocks allocated.  Refusing to mount.\n");
+		goto out_no_bitmap;
+	}
+
+	/* set up enough so that it can read an inode */
+	s->s_op = &minix_sops;
+	root_inode = minix_iget(s, MINIX_ROOT_INO);
+	if (IS_ERR(root_inode)) {
+		ret = PTR_ERR(root_inode);
+		goto out_no_root;
+	}
+
+	ret = -ENOMEM;
+	s->s_root = d_make_root(root_inode);
+	if (!s->s_root)
+		goto out_no_root;
+
+	if (!(s->s_flags & MS_RDONLY)) {
+		if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
+			ms->s_state &= ~MINIX_VALID_FS;
+		mark_buffer_dirty(bh);
+	}
+	if (!(sbi->s_mount_state & MINIX_VALID_FS))
+		printk("MINIX-fs: mounting unchecked file system, "
+			"running fsck is recommended\n");
+	else if (sbi->s_mount_state & MINIX_ERROR_FS)
+		printk("MINIX-fs: mounting file system with errors, "
+			"running fsck is recommended\n");
+
+	return 0;
+
+out_no_root:
+	if (!silent)
+		printk("MINIX-fs: get root inode failed\n");
+	goto out_freemap;
+
+out_no_bitmap:
+	printk("MINIX-fs: bad superblock or unable to read bitmaps\n");
+out_freemap:
+	for (i = 0; i < sbi->s_imap_blocks; i++)
+		brelse(sbi->s_imap[i]);
+	for (i = 0; i < sbi->s_zmap_blocks; i++)
+		brelse(sbi->s_zmap[i]);
+	kfree(sbi->s_imap);
+	goto out_release;
+
+out_no_map:
+	ret = -ENOMEM;
+	if (!silent)
+		printk("MINIX-fs: can't allocate map\n");
+	goto out_release;
+
+out_illegal_sb:
+	if (!silent)
+		printk("MINIX-fs: bad superblock\n");
+	goto out_release;
+
+out_no_fs:
+	if (!silent)
+		printk("VFS: Can't find a Minix filesystem V1 | V2 | V3 "
+		       "on device %s.\n", s->s_id);
+out_release:
+	brelse(bh);
+	goto out;
+
+out_bad_hblock:
+	printk("MINIX-fs: blocksize too small for device\n");
+	goto out;
+
+out_bad_sb:
+	printk("MINIX-fs: unable to read superblock\n");
+out:
+	s->s_fs_info = NULL;
+	kfree(sbi);
+	return ret;
+}
+
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct minix_sb_info *sbi = minix_sb(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
+	buf->f_bfree = minix_count_free_blocks(sb);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = sbi->s_ninodes;
+	buf->f_ffree = minix_count_free_inodes(sb);
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+static int minix_get_block(struct inode *inode, sector_t block,
+		    struct buffer_head *bh_result, int create)
+{
+	if (INODE_VERSION(inode) == MINIX_V1)
+		return V1_minix_get_block(inode, block, bh_result, create);
+	else
+		return V2_minix_get_block(inode, block, bh_result, create);
+}
+
+static int minix_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, minix_get_block, wbc);
+}
+
+static int minix_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page,minix_get_block);
+}
+
+int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	return __block_write_begin(page, pos, len, minix_get_block);
+}
+
+static void minix_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		minix_truncate(inode);
+	}
+}
+
+static int minix_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				minix_get_block);
+	if (unlikely(ret))
+		minix_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t minix_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,minix_get_block);
+}
+
+static const struct address_space_operations minix_aops = {
+	.readpage = minix_readpage,
+	.writepage = minix_writepage,
+	.write_begin = minix_write_begin,
+	.write_end = generic_write_end,
+	.bmap = minix_bmap
+};
+
+static const struct inode_operations minix_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.getattr	= minix_getattr,
+};
+
+void minix_set_inode(struct inode *inode, dev_t rdev)
+{
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &minix_file_inode_operations;
+		inode->i_fop = &minix_file_operations;
+		inode->i_mapping->a_ops = &minix_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &minix_dir_inode_operations;
+		inode->i_fop = &minix_dir_operations;
+		inode->i_mapping->a_ops = &minix_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &minix_symlink_inode_operations;
+		inode->i_mapping->a_ops = &minix_aops;
+	} else
+		init_special_inode(inode, inode->i_mode, rdev);
+}
+
+/*
+ * The minix V1 function to read an inode.
+ */
+static struct inode *V1_minix_iget(struct inode *inode)
+{
+	struct buffer_head * bh;
+	struct minix_inode * raw_inode;
+	struct minix_inode_info *minix_inode = minix_i(inode);
+	int i;
+
+	raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
+	if (!raw_inode) {
+		iget_failed(inode);
+		return ERR_PTR(-EIO);
+	}
+	inode->i_mode = raw_inode->i_mode;
+	i_uid_write(inode, raw_inode->i_uid);
+	i_gid_write(inode, raw_inode->i_gid);
+	set_nlink(inode, raw_inode->i_nlinks);
+	inode->i_size = raw_inode->i_size;
+	inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+	inode->i_blocks = 0;
+	for (i = 0; i < 9; i++)
+		minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
+	minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
+	brelse(bh);
+	unlock_new_inode(inode);
+	return inode;
+}
+
+/*
+ * The minix V2 function to read an inode.
+ */
+static struct inode *V2_minix_iget(struct inode *inode)
+{
+	struct buffer_head * bh;
+	struct minix2_inode * raw_inode;
+	struct minix_inode_info *minix_inode = minix_i(inode);
+	int i;
+
+	raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh);
+	if (!raw_inode) {
+		iget_failed(inode);
+		return ERR_PTR(-EIO);
+	}
+	inode->i_mode = raw_inode->i_mode;
+	i_uid_write(inode, raw_inode->i_uid);
+	i_gid_write(inode, raw_inode->i_gid);
+	set_nlink(inode, raw_inode->i_nlinks);
+	inode->i_size = raw_inode->i_size;
+	inode->i_mtime.tv_sec = raw_inode->i_mtime;
+	inode->i_atime.tv_sec = raw_inode->i_atime;
+	inode->i_ctime.tv_sec = raw_inode->i_ctime;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+	inode->i_blocks = 0;
+	for (i = 0; i < 10; i++)
+		minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
+	minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
+	brelse(bh);
+	unlock_new_inode(inode);
+	return inode;
+}
+
+/*
+ * The global function to read an inode.
+ */
+struct inode *minix_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	if (INODE_VERSION(inode) == MINIX_V1)
+		return V1_minix_iget(inode);
+	else
+		return V2_minix_iget(inode);
+}
+
+/*
+ * The minix V1 function to synchronize an inode.
+ */
+static struct buffer_head * V1_minix_update_inode(struct inode * inode)
+{
+	struct buffer_head * bh;
+	struct minix_inode * raw_inode;
+	struct minix_inode_info *minix_inode = minix_i(inode);
+	int i;
+
+	raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
+	if (!raw_inode)
+		return NULL;
+	raw_inode->i_mode = inode->i_mode;
+	raw_inode->i_uid = fs_high2lowuid(i_uid_read(inode));
+	raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
+	raw_inode->i_nlinks = inode->i_nlink;
+	raw_inode->i_size = inode->i_size;
+	raw_inode->i_time = inode->i_mtime.tv_sec;
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
+	else for (i = 0; i < 9; i++)
+		raw_inode->i_zone[i] = minix_inode->u.i1_data[i];
+	mark_buffer_dirty(bh);
+	return bh;
+}
+
+/*
+ * The minix V2 function to synchronize an inode.
+ */
+static struct buffer_head * V2_minix_update_inode(struct inode * inode)
+{
+	struct buffer_head * bh;
+	struct minix2_inode * raw_inode;
+	struct minix_inode_info *minix_inode = minix_i(inode);
+	int i;
+
+	raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh);
+	if (!raw_inode)
+		return NULL;
+	raw_inode->i_mode = inode->i_mode;
+	raw_inode->i_uid = fs_high2lowuid(i_uid_read(inode));
+	raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
+	raw_inode->i_nlinks = inode->i_nlink;
+	raw_inode->i_size = inode->i_size;
+	raw_inode->i_mtime = inode->i_mtime.tv_sec;
+	raw_inode->i_atime = inode->i_atime.tv_sec;
+	raw_inode->i_ctime = inode->i_ctime.tv_sec;
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
+	else for (i = 0; i < 10; i++)
+		raw_inode->i_zone[i] = minix_inode->u.i2_data[i];
+	mark_buffer_dirty(bh);
+	return bh;
+}
+
+static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int err = 0;
+	struct buffer_head *bh;
+
+	if (INODE_VERSION(inode) == MINIX_V1)
+		bh = V1_minix_update_inode(inode);
+	else
+		bh = V2_minix_update_inode(inode);
+	if (!bh)
+		return -EIO;
+	if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			printk("IO error syncing minix inode [%s:%08lx]\n",
+				inode->i_sb->s_id, inode->i_ino);
+			err = -EIO;
+		}
+	}
+	brelse (bh);
+	return err;
+}
+
+int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct super_block *sb = dentry->d_sb;
+	generic_fillattr(d_inode(dentry), stat);
+	if (INODE_VERSION(d_inode(dentry)) == MINIX_V1)
+		stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
+	else
+		stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb);
+	stat->blksize = sb->s_blocksize;
+	return 0;
+}
+
+/*
+ * The function that is called for file truncation.
+ */
+void minix_truncate(struct inode * inode)
+{
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
+		return;
+	if (INODE_VERSION(inode) == MINIX_V1)
+		V1_minix_truncate(inode);
+	else
+		V2_minix_truncate(inode);
+}
+
+static struct dentry *minix_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
+}
+
+static struct file_system_type minix_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "minix",
+	.mount		= minix_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("minix");
+
+static int __init init_minix_fs(void)
+{
+	int err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&minix_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	return err;
+}
+
+static void __exit exit_minix_fs(void)
+{
+        unregister_filesystem(&minix_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_minix_fs)
+module_exit(exit_minix_fs)
+MODULE_LICENSE("GPL");
+
diff --git a/kernel/fs/minix/itree_common.c b/kernel/fs/minix/itree_common.c
new file mode 100644
index 000000000..a731cabf1
--- /dev/null
+++ b/kernel/fs/minix/itree_common.c
@@ -0,0 +1,364 @@
+/* Generic part */
+
+typedef struct {
+	block_t	*p;
+	block_t	key;
+	struct buffer_head *bh;
+} Indirect;
+
+static DEFINE_RWLOCK(pointers_lock);
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, block_t *v)
+{
+	p->key = *(p->p = v);
+	p->bh = bh;
+}
+
+static inline int verify_chain(Indirect *from, Indirect *to)
+{
+	while (from <= to && from->key == *from->p)
+		from++;
+	return (from > to);
+}
+
+static inline block_t *block_end(struct buffer_head *bh)
+{
+	return (block_t *)((char*)bh->b_data + bh->b_size);
+}
+
+static inline Indirect *get_branch(struct inode *inode,
+					int depth,
+					int *offsets,
+					Indirect chain[DEPTH],
+					int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	Indirect *p = chain;
+	struct buffer_head *bh;
+
+	*err = 0;
+	/* i_data is not going away, no lock needed */
+	add_chain (chain, NULL, i_data(inode) + *offsets);
+	if (!p->key)
+		goto no_block;
+	while (--depth) {
+		bh = sb_bread(sb, block_to_cpu(p->key));
+		if (!bh)
+			goto failure;
+		read_lock(&pointers_lock);
+		if (!verify_chain(chain, p))
+			goto changed;
+		add_chain(++p, bh, (block_t *)bh->b_data + *++offsets);
+		read_unlock(&pointers_lock);
+		if (!p->key)
+			goto no_block;
+	}
+	return NULL;
+
+changed:
+	read_unlock(&pointers_lock);
+	brelse(bh);
+	*err = -EAGAIN;
+	goto no_block;
+failure:
+	*err = -EIO;
+no_block:
+	return p;
+}
+
+static int alloc_branch(struct inode *inode,
+			     int num,
+			     int *offsets,
+			     Indirect *branch)
+{
+	int n = 0;
+	int i;
+	int parent = minix_new_block(inode);
+
+	branch[0].key = cpu_to_block(parent);
+	if (parent) for (n = 1; n < num; n++) {
+		struct buffer_head *bh;
+		/* Allocate the next block */
+		int nr = minix_new_block(inode);
+		if (!nr)
+			break;
+		branch[n].key = cpu_to_block(nr);
+		bh = sb_getblk(inode->i_sb, parent);
+		lock_buffer(bh);
+		memset(bh->b_data, 0, bh->b_size);
+		branch[n].bh = bh;
+		branch[n].p = (block_t*) bh->b_data + offsets[n];
+		*branch[n].p = branch[n].key;
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		mark_buffer_dirty_inode(bh, inode);
+		parent = nr;
+	}
+	if (n == num)
+		return 0;
+
+	/* Allocation failed, free what we already allocated */
+	for (i = 1; i < n; i++)
+		bforget(branch[i].bh);
+	for (i = 0; i < n; i++)
+		minix_free_block(inode, block_to_cpu(branch[i].key));
+	return -ENOSPC;
+}
+
+static inline int splice_branch(struct inode *inode,
+				     Indirect chain[DEPTH],
+				     Indirect *where,
+				     int num)
+{
+	int i;
+
+	write_lock(&pointers_lock);
+
+	/* Verify that place we are splicing to is still there and vacant */
+	if (!verify_chain(chain, where-1) || *where->p)
+		goto changed;
+
+	*where->p = where->key;
+
+	write_unlock(&pointers_lock);
+
+	/* We are done with atomic stuff, now do the rest of housekeeping */
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+
+	/* had we spliced it onto indirect block? */
+	if (where->bh)
+		mark_buffer_dirty_inode(where->bh, inode);
+
+	mark_inode_dirty(inode);
+	return 0;
+
+changed:
+	write_unlock(&pointers_lock);
+	for (i = 1; i < num; i++)
+		bforget(where[i].bh);
+	for (i = 0; i < num; i++)
+		minix_free_block(inode, block_to_cpu(where[i].key));
+	return -EAGAIN;
+}
+
+static inline int get_block(struct inode * inode, sector_t block,
+			struct buffer_head *bh, int create)
+{
+	int err = -EIO;
+	int offsets[DEPTH];
+	Indirect chain[DEPTH];
+	Indirect *partial;
+	int left;
+	int depth = block_to_path(inode, block, offsets);
+
+	if (depth == 0)
+		goto out;
+
+reread:
+	partial = get_branch(inode, depth, offsets, chain, &err);
+
+	/* Simplest case - block found, no allocation needed */
+	if (!partial) {
+got_it:
+		map_bh(bh, inode->i_sb, block_to_cpu(chain[depth-1].key));
+		/* Clean up and exit */
+		partial = chain+depth-1; /* the whole chain */
+		goto cleanup;
+	}
+
+	/* Next simple case - plain lookup or failed read of indirect block */
+	if (!create || err == -EIO) {
+cleanup:
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+out:
+		return err;
+	}
+
+	/*
+	 * Indirect block might be removed by truncate while we were
+	 * reading it. Handling of that case (forget what we've got and
+	 * reread) is taken out of the main path.
+	 */
+	if (err == -EAGAIN)
+		goto changed;
+
+	left = (chain + depth) - partial;
+	err = alloc_branch(inode, left, offsets+(partial-chain), partial);
+	if (err)
+		goto cleanup;
+
+	if (splice_branch(inode, chain, partial, left) < 0)
+		goto changed;
+
+	set_buffer_new(bh);
+	goto got_it;
+
+changed:
+	while (partial > chain) {
+		brelse(partial->bh);
+		partial--;
+	}
+	goto reread;
+}
+
+static inline int all_zeroes(block_t *p, block_t *q)
+{
+	while (p < q)
+		if (*p++)
+			return 0;
+	return 1;
+}
+
+static Indirect *find_shared(struct inode *inode,
+				int depth,
+				int offsets[DEPTH],
+				Indirect chain[DEPTH],
+				block_t *top)
+{
+	Indirect *partial, *p;
+	int k, err;
+
+	*top = 0;
+	for (k = depth; k > 1 && !offsets[k-1]; k--)
+		;
+	partial = get_branch(inode, k, offsets, chain, &err);
+
+	write_lock(&pointers_lock);
+	if (!partial)
+		partial = chain + k-1;
+	if (!partial->key && *partial->p) {
+		write_unlock(&pointers_lock);
+		goto no_top;
+	}
+	for (p=partial;p>chain && all_zeroes((block_t*)p->bh->b_data,p->p);p--)
+		;
+	if (p == chain + k - 1 && p > chain) {
+		p->p--;
+	} else {
+		*top = *p->p;
+		*p->p = 0;
+	}
+	write_unlock(&pointers_lock);
+
+	while(partial > p)
+	{
+		brelse(partial->bh);
+		partial--;
+	}
+no_top:
+	return partial;
+}
+
+static inline void free_data(struct inode *inode, block_t *p, block_t *q)
+{
+	unsigned long nr;
+
+	for ( ; p < q ; p++) {
+		nr = block_to_cpu(*p);
+		if (nr) {
+			*p = 0;
+			minix_free_block(inode, nr);
+		}
+	}
+}
+
+static void free_branches(struct inode *inode, block_t *p, block_t *q, int depth)
+{
+	struct buffer_head * bh;
+	unsigned long nr;
+
+	if (depth--) {
+		for ( ; p < q ; p++) {
+			nr = block_to_cpu(*p);
+			if (!nr)
+				continue;
+			*p = 0;
+			bh = sb_bread(inode->i_sb, nr);
+			if (!bh)
+				continue;
+			free_branches(inode, (block_t*)bh->b_data,
+				      block_end(bh), depth);
+			bforget(bh);
+			minix_free_block(inode, nr);
+			mark_inode_dirty(inode);
+		}
+	} else
+		free_data(inode, p, q);
+}
+
+static inline void truncate (struct inode * inode)
+{
+	struct super_block *sb = inode->i_sb;
+	block_t *idata = i_data(inode);
+	int offsets[DEPTH];
+	Indirect chain[DEPTH];
+	Indirect *partial;
+	block_t nr = 0;
+	int n;
+	int first_whole;
+	long iblock;
+
+	iblock = (inode->i_size + sb->s_blocksize -1) >> sb->s_blocksize_bits;
+	block_truncate_page(inode->i_mapping, inode->i_size, get_block);
+
+	n = block_to_path(inode, iblock, offsets);
+	if (!n)
+		return;
+
+	if (n == 1) {
+		free_data(inode, idata+offsets[0], idata + DIRECT);
+		first_whole = 0;
+		goto do_indirects;
+	}
+
+	first_whole = offsets[0] + 1 - DIRECT;
+	partial = find_shared(inode, n, offsets, chain, &nr);
+	if (nr) {
+		if (partial == chain)
+			mark_inode_dirty(inode);
+		else
+			mark_buffer_dirty_inode(partial->bh, inode);
+		free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
+	}
+	/* Clear the ends of indirect blocks on the shared branch */
+	while (partial > chain) {
+		free_branches(inode, partial->p + 1, block_end(partial->bh),
+				(chain+n-1) - partial);
+		mark_buffer_dirty_inode(partial->bh, inode);
+		brelse (partial->bh);
+		partial--;
+	}
+do_indirects:
+	/* Kill the remaining (whole) subtrees */
+	while (first_whole < DEPTH-1) {
+		nr = idata[DIRECT+first_whole];
+		if (nr) {
+			idata[DIRECT+first_whole] = 0;
+			mark_inode_dirty(inode);
+			free_branches(inode, &nr, &nr+1, first_whole+1);
+		}
+		first_whole++;
+	}
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+}
+
+static inline unsigned nblocks(loff_t size, struct super_block *sb)
+{
+	int k = sb->s_blocksize_bits - 10;
+	unsigned blocks, res, direct = DIRECT, i = DEPTH;
+	blocks = (size + sb->s_blocksize - 1) >> (BLOCK_SIZE_BITS + k);
+	res = blocks;
+	while (--i && blocks > direct) {
+		blocks -= direct;
+		blocks += sb->s_blocksize/sizeof(block_t) - 1;
+		blocks /= sb->s_blocksize/sizeof(block_t);
+		res += blocks;
+		direct = 1;
+	}
+	return res;
+}
diff --git a/kernel/fs/minix/itree_v1.c b/kernel/fs/minix/itree_v1.c
new file mode 100644
index 000000000..282e15ad8
--- /dev/null
+++ b/kernel/fs/minix/itree_v1.c
@@ -0,0 +1,67 @@
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include "minix.h"
+
+enum {DEPTH = 3, DIRECT = 7};	/* Only double indirect */
+
+typedef u16 block_t;	/* 16 bit, host order */
+
+static inline unsigned long block_to_cpu(block_t n)
+{
+	return n;
+}
+
+static inline block_t cpu_to_block(unsigned long n)
+{
+	return n;
+}
+
+static inline block_t *i_data(struct inode *inode)
+{
+	return (block_t *)minix_i(inode)->u.i1_data;
+}
+
+static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
+{
+	int n = 0;
+	char b[BDEVNAME_SIZE];
+
+	if (block < 0) {
+		printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
+			block, bdevname(inode->i_sb->s_bdev, b));
+	} else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) {
+		if (printk_ratelimit())
+			printk("MINIX-fs: block_to_path: "
+			       "block %ld too big on dev %s\n",
+				block, bdevname(inode->i_sb->s_bdev, b));
+	} else if (block < 7) {
+		offsets[n++] = block;
+	} else if ((block -= 7) < 512) {
+		offsets[n++] = 7;
+		offsets[n++] = block;
+	} else {
+		block -= 512;
+		offsets[n++] = 8;
+		offsets[n++] = block>>9;
+		offsets[n++] = block & 511;
+	}
+	return n;
+}
+
+#include "itree_common.c"
+
+int V1_minix_get_block(struct inode * inode, long block,
+			struct buffer_head *bh_result, int create)
+{
+	return get_block(inode, block, bh_result, create);
+}
+
+void V1_minix_truncate(struct inode * inode)
+{
+	truncate(inode);
+}
+
+unsigned V1_minix_blocks(loff_t size, struct super_block *sb)
+{
+	return nblocks(size, sb);
+}
diff --git a/kernel/fs/minix/itree_v2.c b/kernel/fs/minix/itree_v2.c
new file mode 100644
index 000000000..78e2d93e5
--- /dev/null
+++ b/kernel/fs/minix/itree_v2.c
@@ -0,0 +1,76 @@
+#include <linux/buffer_head.h>
+#include "minix.h"
+
+enum {DIRECT = 7, DEPTH = 4};	/* Have triple indirect */
+
+typedef u32 block_t;	/* 32 bit, host order */
+
+static inline unsigned long block_to_cpu(block_t n)
+{
+	return n;
+}
+
+static inline block_t cpu_to_block(unsigned long n)
+{
+	return n;
+}
+
+static inline block_t *i_data(struct inode *inode)
+{
+	return (block_t *)minix_i(inode)->u.i2_data;
+}
+
+#define DIRCOUNT 7
+#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
+
+static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
+{
+	int n = 0;
+	char b[BDEVNAME_SIZE];
+	struct super_block *sb = inode->i_sb;
+
+	if (block < 0) {
+		printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
+			block, bdevname(sb->s_bdev, b));
+	} else if ((u64)block * (u64)sb->s_blocksize >=
+			minix_sb(sb)->s_max_size) {
+		if (printk_ratelimit())
+			printk("MINIX-fs: block_to_path: "
+			       "block %ld too big on dev %s\n",
+				block, bdevname(sb->s_bdev, b));
+	} else if (block < DIRCOUNT) {
+		offsets[n++] = block;
+	} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
+		offsets[n++] = DIRCOUNT;
+		offsets[n++] = block;
+	} else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
+		offsets[n++] = DIRCOUNT + 1;
+		offsets[n++] = block / INDIRCOUNT(sb);
+		offsets[n++] = block % INDIRCOUNT(sb);
+	} else {
+		block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
+		offsets[n++] = DIRCOUNT + 2;
+		offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
+		offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
+		offsets[n++] = block % INDIRCOUNT(sb);
+	}
+	return n;
+}
+
+#include "itree_common.c"
+
+int V2_minix_get_block(struct inode * inode, long block,
+			struct buffer_head *bh_result, int create)
+{
+	return get_block(inode, block, bh_result, create);
+}
+
+void V2_minix_truncate(struct inode * inode)
+{
+	truncate(inode);
+}
+
+unsigned V2_minix_blocks(loff_t size, struct super_block *sb)
+{
+	return nblocks(size, sb);
+}
diff --git a/kernel/fs/minix/minix.h b/kernel/fs/minix/minix.h
new file mode 100644
index 000000000..1ebd11854
--- /dev/null
+++ b/kernel/fs/minix/minix.h
@@ -0,0 +1,169 @@
+#ifndef FS_MINIX_H
+#define FS_MINIX_H
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/minix_fs.h>
+
+#define INODE_VERSION(inode)	minix_sb(inode->i_sb)->s_version
+#define MINIX_V1		0x0001		/* original minix fs */
+#define MINIX_V2		0x0002		/* minix V2 fs */
+#define MINIX_V3		0x0003		/* minix V3 fs */
+
+/*
+ * minix fs inode data in memory
+ */
+struct minix_inode_info {
+	union {
+		__u16 i1_data[16];
+		__u32 i2_data[16];
+	} u;
+	struct inode vfs_inode;
+};
+
+/*
+ * minix super-block data in memory
+ */
+struct minix_sb_info {
+	unsigned long s_ninodes;
+	unsigned long s_nzones;
+	unsigned long s_imap_blocks;
+	unsigned long s_zmap_blocks;
+	unsigned long s_firstdatazone;
+	unsigned long s_log_zone_size;
+	unsigned long s_max_size;
+	int s_dirsize;
+	int s_namelen;
+	struct buffer_head ** s_imap;
+	struct buffer_head ** s_zmap;
+	struct buffer_head * s_sbh;
+	struct minix_super_block * s_ms;
+	unsigned short s_mount_state;
+	unsigned short s_version;
+};
+
+extern struct inode *minix_iget(struct super_block *, unsigned long);
+extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
+extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
+extern struct inode * minix_new_inode(const struct inode *, umode_t, int *);
+extern void minix_free_inode(struct inode * inode);
+extern unsigned long minix_count_free_inodes(struct super_block *sb);
+extern int minix_new_block(struct inode * inode);
+extern void minix_free_block(struct inode *inode, unsigned long block);
+extern unsigned long minix_count_free_blocks(struct super_block *sb);
+extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
+
+extern void V1_minix_truncate(struct inode *);
+extern void V2_minix_truncate(struct inode *);
+extern void minix_truncate(struct inode *);
+extern void minix_set_inode(struct inode *, dev_t);
+extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
+extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
+extern unsigned V1_minix_blocks(loff_t, struct super_block *);
+extern unsigned V2_minix_blocks(loff_t, struct super_block *);
+
+extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**);
+extern int minix_add_link(struct dentry*, struct inode*);
+extern int minix_delete_entry(struct minix_dir_entry*, struct page*);
+extern int minix_make_empty(struct inode*, struct inode*);
+extern int minix_empty_dir(struct inode*);
+extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
+extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
+extern ino_t minix_inode_by_name(struct dentry*);
+
+extern const struct inode_operations minix_file_inode_operations;
+extern const struct inode_operations minix_dir_inode_operations;
+extern const struct file_operations minix_file_operations;
+extern const struct file_operations minix_dir_operations;
+
+static inline struct minix_sb_info *minix_sb(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct minix_inode_info *minix_i(struct inode *inode)
+{
+	return list_entry(inode, struct minix_inode_info, vfs_inode);
+}
+
+static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
+{
+	return DIV_ROUND_UP(bits, blocksize * 8);
+}
+
+#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
+	defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
+
+#error Minix file system byte order broken
+
+#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN)
+
+/*
+ * big-endian 32 or 64 bit indexed bitmaps on big-endian system or
+ * little-endian bitmaps on little-endian system
+ */
+
+#define minix_test_and_set_bit(nr, addr)	\
+	__test_and_set_bit((nr), (unsigned long *)(addr))
+#define minix_set_bit(nr, addr)		\
+	__set_bit((nr), (unsigned long *)(addr))
+#define minix_test_and_clear_bit(nr, addr) \
+	__test_and_clear_bit((nr), (unsigned long *)(addr))
+#define minix_test_bit(nr, addr)		\
+	test_bit((nr), (unsigned long *)(addr))
+#define minix_find_first_zero_bit(addr, size) \
+	find_first_zero_bit((unsigned long *)(addr), (size))
+
+#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
+
+/*
+ * big-endian 16bit indexed bitmaps
+ */
+
+static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
+{
+	const unsigned short *p = vaddr, *addr = vaddr;
+	unsigned short num;
+
+	if (!size)
+		return 0;
+
+	size >>= 4;
+	while (*p++ == 0xffff) {
+		if (--size == 0)
+			return (p - addr) << 4;
+	}
+
+	num = *--p;
+	return ((p - addr) << 4) + ffz(num);
+}
+
+#define minix_test_and_set_bit(nr, addr)	\
+	__test_and_set_bit((nr) ^ 16, (unsigned long *)(addr))
+#define minix_set_bit(nr, addr)	\
+	__set_bit((nr) ^ 16, (unsigned long *)(addr))
+#define minix_test_and_clear_bit(nr, addr)	\
+	__test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr))
+
+static inline int minix_test_bit(int nr, const void *vaddr)
+{
+	const unsigned short *p = vaddr;
+	return (p[nr >> 4] & (1U << (nr & 15))) != 0;
+}
+
+#else
+
+/*
+ * little-endian bitmaps
+ */
+
+#define minix_test_and_set_bit	__test_and_set_bit_le
+#define minix_set_bit		__set_bit_le
+#define minix_test_and_clear_bit	__test_and_clear_bit_le
+#define minix_test_bit	test_bit_le
+#define minix_find_first_zero_bit	find_first_zero_bit_le
+
+#endif
+
+#endif /* FS_MINIX_H */
diff --git a/kernel/fs/minix/namei.c b/kernel/fs/minix/namei.c
new file mode 100644
index 000000000..a795a11e5
--- /dev/null
+++ b/kernel/fs/minix/namei.c
@@ -0,0 +1,270 @@
+/*
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include "minix.h"
+
+static int add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = minix_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
+}
+
+static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
+{
+	struct inode * inode = NULL;
+	ino_t ino;
+
+	if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = minix_inode_by_name(dentry);
+	if (ino) {
+		inode = minix_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	d_add(dentry, inode);
+	return NULL;
+}
+
+static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	int error;
+	struct inode *inode;
+
+	if (!old_valid_dev(rdev))
+		return -EINVAL;
+
+	inode = minix_new_inode(dir, mode, &error);
+
+	if (inode) {
+		minix_set_inode(inode, rdev);
+		mark_inode_dirty(inode);
+		error = add_nondir(dentry, inode);
+	}
+	return error;
+}
+
+static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int error;
+	struct inode *inode = minix_new_inode(dir, mode, &error);
+	if (inode) {
+		minix_set_inode(inode, 0);
+		mark_inode_dirty(inode);
+		d_tmpfile(dentry, inode);
+	}
+	return error;
+}
+
+static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool excl)
+{
+	return minix_mknod(dir, dentry, mode, 0);
+}
+
+static int minix_symlink(struct inode * dir, struct dentry *dentry,
+	  const char * symname)
+{
+	int err = -ENAMETOOLONG;
+	int i = strlen(symname)+1;
+	struct inode * inode;
+
+	if (i > dir->i_sb->s_blocksize)
+		goto out;
+
+	inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
+	if (!inode)
+		goto out;
+
+	minix_set_inode(inode, 0);
+	err = page_symlink(inode, symname, i);
+	if (err)
+		goto out_fail;
+
+	err = add_nondir(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	iput(inode);
+	goto out;
+}
+
+static int minix_link(struct dentry * old_dentry, struct inode * dir,
+	struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	inode_inc_link_count(inode);
+	ihold(inode);
+	return add_nondir(dentry, inode);
+}
+
+static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode * inode;
+	int err;
+
+	inode_inc_link_count(dir);
+
+	inode = minix_new_inode(dir, S_IFDIR | mode, &err);
+	if (!inode)
+		goto out_dir;
+
+	minix_set_inode(inode, 0);
+
+	inode_inc_link_count(inode);
+
+	err = minix_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = minix_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	d_instantiate(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	inode_dec_link_count(inode);
+	iput(inode);
+out_dir:
+	inode_dec_link_count(dir);
+	goto out;
+}
+
+static int minix_unlink(struct inode * dir, struct dentry *dentry)
+{
+	int err = -ENOENT;
+	struct inode * inode = d_inode(dentry);
+	struct page * page;
+	struct minix_dir_entry * de;
+
+	de = minix_find_entry(dentry, &page);
+	if (!de)
+		goto end_unlink;
+
+	err = minix_delete_entry(de, page);
+	if (err)
+		goto end_unlink;
+
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+end_unlink:
+	return err;
+}
+
+static int minix_rmdir(struct inode * dir, struct dentry *dentry)
+{
+	struct inode * inode = d_inode(dentry);
+	int err = -ENOTEMPTY;
+
+	if (minix_empty_dir(inode)) {
+		err = minix_unlink(dir, dentry);
+		if (!err) {
+			inode_dec_link_count(dir);
+			inode_dec_link_count(inode);
+		}
+	}
+	return err;
+}
+
+static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
+			   struct inode * new_dir, struct dentry *new_dentry)
+{
+	struct inode * old_inode = d_inode(old_dentry);
+	struct inode * new_inode = d_inode(new_dentry);
+	struct page * dir_page = NULL;
+	struct minix_dir_entry * dir_de = NULL;
+	struct page * old_page;
+	struct minix_dir_entry * old_de;
+	int err = -ENOENT;
+
+	old_de = minix_find_entry(old_dentry, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = minix_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page * new_page;
+		struct minix_dir_entry * new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !minix_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = minix_find_entry(new_dentry, &new_page);
+		if (!new_de)
+			goto out_dir;
+		minix_set_link(new_de, new_page, old_inode);
+		new_inode->i_ctime = CURRENT_TIME_SEC;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+	} else {
+		err = minix_add_link(new_dentry, old_inode);
+		if (err)
+			goto out_dir;
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	minix_delete_entry(old_de, old_page);
+	mark_inode_dirty(old_inode);
+
+	if (dir_de) {
+		minix_set_link(dir_de, dir_page, new_dir);
+		inode_dec_link_count(old_dir);
+	}
+	return 0;
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	return err;
+}
+
+/*
+ * directories can handle most operations...
+ */
+const struct inode_operations minix_dir_inode_operations = {
+	.create		= minix_create,
+	.lookup		= minix_lookup,
+	.link		= minix_link,
+	.unlink		= minix_unlink,
+	.symlink	= minix_symlink,
+	.mkdir		= minix_mkdir,
+	.rmdir		= minix_rmdir,
+	.mknod		= minix_mknod,
+	.rename		= minix_rename,
+	.getattr	= minix_getattr,
+	.tmpfile	= minix_tmpfile,
+};
diff --git a/kernel/fs/mount.h b/kernel/fs/mount.h
new file mode 100644
index 000000000..6a61c2b3e
--- /dev/null
+++ b/kernel/fs/mount.h
@@ -0,0 +1,140 @@
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/poll.h>
+#include <linux/ns_common.h>
+#include <linux/fs_pin.h>
+
+struct mnt_namespace {
+	atomic_t		count;
+	struct ns_common	ns;
+	struct mount *	root;
+	struct list_head	list;
+	struct user_namespace	*user_ns;
+	u64			seq;	/* Sequence number to prevent loops */
+	wait_queue_head_t poll;
+	u64 event;
+};
+
+struct mnt_pcp {
+	int mnt_count;
+	int mnt_writers;
+};
+
+struct mountpoint {
+	struct hlist_node m_hash;
+	struct dentry *m_dentry;
+	struct hlist_head m_list;
+	int m_count;
+};
+
+struct mount {
+	struct hlist_node mnt_hash;
+	struct mount *mnt_parent;
+	struct dentry *mnt_mountpoint;
+	struct vfsmount mnt;
+	union {
+		struct rcu_head mnt_rcu;
+		struct llist_node mnt_llist;
+	};
+#ifdef CONFIG_SMP
+	struct mnt_pcp __percpu *mnt_pcp;
+#else
+	int mnt_count;
+	int mnt_writers;
+#endif
+	struct list_head mnt_mounts;	/* list of children, anchored here */
+	struct list_head mnt_child;	/* and going through their mnt_child */
+	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
+	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
+	struct list_head mnt_list;
+	struct list_head mnt_expire;	/* link in fs-specific expiry list */
+	struct list_head mnt_share;	/* circular list of shared mounts */
+	struct list_head mnt_slave_list;/* list of slave mounts */
+	struct list_head mnt_slave;	/* slave list entry */
+	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
+	struct mnt_namespace *mnt_ns;	/* containing namespace */
+	struct mountpoint *mnt_mp;	/* where is it mounted */
+	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+#ifdef CONFIG_FSNOTIFY
+	struct hlist_head mnt_fsnotify_marks;
+	__u32 mnt_fsnotify_mask;
+#endif
+	int mnt_id;			/* mount identifier */
+	int mnt_group_id;		/* peer group identifier */
+	int mnt_expiry_mark;		/* true if marked for expiry */
+	struct hlist_head mnt_pins;
+	struct fs_pin mnt_umount;
+	struct dentry *mnt_ex_mountpoint;
+};
+
+#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
+
+static inline struct mount *real_mount(struct vfsmount *mnt)
+{
+	return container_of(mnt, struct mount, mnt);
+}
+
+static inline int mnt_has_parent(struct mount *mnt)
+{
+	return mnt != mnt->mnt_parent;
+}
+
+static inline int is_mounted(struct vfsmount *mnt)
+{
+	/* neither detached nor internal? */
+	return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
+}
+
+extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
+extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
+
+extern bool legitimize_mnt(struct vfsmount *, unsigned);
+
+extern void __detach_mounts(struct dentry *dentry);
+
+static inline void detach_mounts(struct dentry *dentry)
+{
+	if (!d_mountpoint(dentry))
+		return;
+	__detach_mounts(dentry);
+}
+
+static inline void get_mnt_ns(struct mnt_namespace *ns)
+{
+	atomic_inc(&ns->count);
+}
+
+extern seqlock_t mount_lock;
+
+static inline void lock_mount_hash(void)
+{
+	write_seqlock(&mount_lock);
+}
+
+static inline void unlock_mount_hash(void)
+{
+	write_sequnlock(&mount_lock);
+}
+
+struct proc_mounts {
+	struct seq_file m;
+	struct mnt_namespace *ns;
+	struct path root;
+	int (*show)(struct seq_file *, struct vfsmount *);
+	void *cached_mount;
+	u64 cached_event;
+	loff_t cached_index;
+};
+
+#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
+
+extern const struct seq_operations mounts_op;
+
+extern bool __is_local_mountpoint(struct dentry *dentry);
+static inline bool is_local_mountpoint(struct dentry *dentry)
+{
+	if (!d_mountpoint(dentry))
+		return false;
+
+	return __is_local_mountpoint(dentry);
+}
diff --git a/kernel/fs/mpage.c b/kernel/fs/mpage.c
new file mode 100644
index 000000000..3e79220ba
--- /dev/null
+++ b/kernel/fs/mpage.c
@@ -0,0 +1,717 @@
+/*
+ * fs/mpage.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * Contains functions related to preparing and submitting BIOs which contain
+ * multiple pagecache pages.
+ *
+ * 15May2002	Andrew Morton
+ *		Initial version
+ * 27Jun2002	axboe@suse.de
+ *		use bio_add_page() to build bio's just the right size
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/kdev_t.h>
+#include <linux/gfp.h>
+#include <linux/bio.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/highmem.h>
+#include <linux/prefetch.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/cleancache.h>
+#include "internal.h"
+
+/*
+ * I/O completion handler for multipage BIOs.
+ *
+ * The mpage code never puts partial pages into a BIO (except for end-of-file).
+ * If a page does not map to a contiguous run of blocks then it simply falls
+ * back to block_read_full_page().
+ *
+ * Why is this?  If a page's completion depends on a number of different BIOs
+ * which can complete in any order (or at the same time) then determining the
+ * status of that page is hard.  See end_buffer_async_read() for the details.
+ * There is no point in duplicating all that complexity.
+ */
+static void mpage_end_io(struct bio *bio, int err)
+{
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		struct page *page = bv->bv_page;
+		page_endio(page, bio_data_dir(bio), err);
+	}
+
+	bio_put(bio);
+}
+
+static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+{
+	bio->bi_end_io = mpage_end_io;
+	guard_bio_eod(rw, bio);
+	submit_bio(rw, bio);
+	return NULL;
+}
+
+static struct bio *
+mpage_alloc(struct block_device *bdev,
+		sector_t first_sector, int nr_vecs,
+		gfp_t gfp_flags)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (bio) {
+		bio->bi_bdev = bdev;
+		bio->bi_iter.bi_sector = first_sector;
+	}
+	return bio;
+}
+
+/*
+ * support function for mpage_readpages.  The fs supplied get_block might
+ * return an up to date buffer.  This is used to map that buffer into
+ * the page, which allows readpage to avoid triggering a duplicate call
+ * to get_block.
+ *
+ * The idea is to avoid adding buffers to pages that don't already have
+ * them.  So when the buffer is up to date and the page size == block size,
+ * this marks the page up to date instead of adding new buffers.
+ */
+static void 
+map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *page_bh, *head;
+	int block = 0;
+
+	if (!page_has_buffers(page)) {
+		/*
+		 * don't make any buffers if there is only one buffer on
+		 * the page and the page just needs to be set up to date
+		 */
+		if (inode->i_blkbits == PAGE_CACHE_SHIFT && 
+		    buffer_uptodate(bh)) {
+			SetPageUptodate(page);    
+			return;
+		}
+		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+	}
+	head = page_buffers(page);
+	page_bh = head;
+	do {
+		if (block == page_block) {
+			page_bh->b_state = bh->b_state;
+			page_bh->b_bdev = bh->b_bdev;
+			page_bh->b_blocknr = bh->b_blocknr;
+			break;
+		}
+		page_bh = page_bh->b_this_page;
+		block++;
+	} while (page_bh != head);
+}
+
+/*
+ * This is the worker routine which does all the work of mapping the disk
+ * blocks and constructs largest possible bios, submits them for IO if the
+ * blocks are not contiguous on the disk.
+ *
+ * We pass a buffer_head back and forth and use its buffer_mapped() flag to
+ * represent the validity of its disk mapping and to decide when to do the next
+ * get_block() call.
+ */
+static struct bio *
+do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
+		sector_t *last_block_in_bio, struct buffer_head *map_bh,
+		unsigned long *first_logical_block, get_block_t get_block)
+{
+	struct inode *inode = page->mapping->host;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	sector_t block_in_file;
+	sector_t last_block;
+	sector_t last_block_in_file;
+	sector_t blocks[MAX_BUF_PER_PAGE];
+	unsigned page_block;
+	unsigned first_hole = blocks_per_page;
+	struct block_device *bdev = NULL;
+	int length;
+	int fully_mapped = 1;
+	unsigned nblocks;
+	unsigned relative_block;
+
+	if (page_has_buffers(page))
+		goto confused;
+
+	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+	last_block = block_in_file + nr_pages * blocks_per_page;
+	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+	if (last_block > last_block_in_file)
+		last_block = last_block_in_file;
+	page_block = 0;
+
+	/*
+	 * Map blocks using the result from the previous get_blocks call first.
+	 */
+	nblocks = map_bh->b_size >> blkbits;
+	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
+			block_in_file < (*first_logical_block + nblocks)) {
+		unsigned map_offset = block_in_file - *first_logical_block;
+		unsigned last = nblocks - map_offset;
+
+		for (relative_block = 0; ; relative_block++) {
+			if (relative_block == last) {
+				clear_buffer_mapped(map_bh);
+				break;
+			}
+			if (page_block == blocks_per_page)
+				break;
+			blocks[page_block] = map_bh->b_blocknr + map_offset +
+						relative_block;
+			page_block++;
+			block_in_file++;
+		}
+		bdev = map_bh->b_bdev;
+	}
+
+	/*
+	 * Then do more get_blocks calls until we are done with this page.
+	 */
+	map_bh->b_page = page;
+	while (page_block < blocks_per_page) {
+		map_bh->b_state = 0;
+		map_bh->b_size = 0;
+
+		if (block_in_file < last_block) {
+			map_bh->b_size = (last_block-block_in_file) << blkbits;
+			if (get_block(inode, block_in_file, map_bh, 0))
+				goto confused;
+			*first_logical_block = block_in_file;
+		}
+
+		if (!buffer_mapped(map_bh)) {
+			fully_mapped = 0;
+			if (first_hole == blocks_per_page)
+				first_hole = page_block;
+			page_block++;
+			block_in_file++;
+			continue;
+		}
+
+		/* some filesystems will copy data into the page during
+		 * the get_block call, in which case we don't want to
+		 * read it again.  map_buffer_to_page copies the data
+		 * we just collected from get_block into the page's buffers
+		 * so readpage doesn't have to repeat the get_block call
+		 */
+		if (buffer_uptodate(map_bh)) {
+			map_buffer_to_page(page, map_bh, page_block);
+			goto confused;
+		}
+	
+		if (first_hole != blocks_per_page)
+			goto confused;		/* hole -> non-hole */
+
+		/* Contiguous blocks? */
+		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
+			goto confused;
+		nblocks = map_bh->b_size >> blkbits;
+		for (relative_block = 0; ; relative_block++) {
+			if (relative_block == nblocks) {
+				clear_buffer_mapped(map_bh);
+				break;
+			} else if (page_block == blocks_per_page)
+				break;
+			blocks[page_block] = map_bh->b_blocknr+relative_block;
+			page_block++;
+			block_in_file++;
+		}
+		bdev = map_bh->b_bdev;
+	}
+
+	if (first_hole != blocks_per_page) {
+		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
+		if (first_hole == 0) {
+			SetPageUptodate(page);
+			unlock_page(page);
+			goto out;
+		}
+	} else if (fully_mapped) {
+		SetPageMappedToDisk(page);
+	}
+
+	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
+	    cleancache_get_page(page) == 0) {
+		SetPageUptodate(page);
+		goto confused;
+	}
+
+	/*
+	 * This page will go to BIO.  Do we need to send this BIO off first?
+	 */
+	if (bio && (*last_block_in_bio != blocks[0] - 1))
+		bio = mpage_bio_submit(READ, bio);
+
+alloc_new:
+	if (bio == NULL) {
+		if (first_hole == blocks_per_page) {
+			if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
+								page))
+				goto out;
+		}
+		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+			  	min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
+				GFP_KERNEL);
+		if (bio == NULL)
+			goto confused;
+	}
+
+	length = first_hole << blkbits;
+	if (bio_add_page(bio, page, length, 0) < length) {
+		bio = mpage_bio_submit(READ, bio);
+		goto alloc_new;
+	}
+
+	relative_block = block_in_file - *first_logical_block;
+	nblocks = map_bh->b_size >> blkbits;
+	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
+	    (first_hole != blocks_per_page))
+		bio = mpage_bio_submit(READ, bio);
+	else
+		*last_block_in_bio = blocks[blocks_per_page - 1];
+out:
+	return bio;
+
+confused:
+	if (bio)
+		bio = mpage_bio_submit(READ, bio);
+	if (!PageUptodate(page))
+	        block_read_full_page(page, get_block);
+	else
+		unlock_page(page);
+	goto out;
+}
+
+/**
+ * mpage_readpages - populate an address space with some pages & start reads against them
+ * @mapping: the address_space
+ * @pages: The address of a list_head which contains the target pages.  These
+ *   pages have their ->index populated and are otherwise uninitialised.
+ *   The page at @pages->prev has the lowest file offset, and reads should be
+ *   issued in @pages->prev to @pages->next order.
+ * @nr_pages: The number of pages at *@pages
+ * @get_block: The filesystem's block mapper function.
+ *
+ * This function walks the pages and the blocks within each page, building and
+ * emitting large BIOs.
+ *
+ * If anything unusual happens, such as:
+ *
+ * - encountering a page which has buffers
+ * - encountering a page which has a non-hole after a hole
+ * - encountering a page with non-contiguous blocks
+ *
+ * then this code just gives up and calls the buffer_head-based read function.
+ * It does handle a page which has holes at the end - that is a common case:
+ * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
+ *
+ * BH_Boundary explanation:
+ *
+ * There is a problem.  The mpage read code assembles several pages, gets all
+ * their disk mappings, and then submits them all.  That's fine, but obtaining
+ * the disk mappings may require I/O.  Reads of indirect blocks, for example.
+ *
+ * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
+ * submitted in the following order:
+ * 	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
+ *
+ * because the indirect block has to be read to get the mappings of blocks
+ * 13,14,15,16.  Obviously, this impacts performance.
+ *
+ * So what we do it to allow the filesystem's get_block() function to set
+ * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
+ * after this one will require I/O against a block which is probably close to
+ * this one.  So you should push what I/O you have currently accumulated.
+ *
+ * This all causes the disk requests to be issued in the correct order.
+ */
+int
+mpage_readpages(struct address_space *mapping, struct list_head *pages,
+				unsigned nr_pages, get_block_t get_block)
+{
+	struct bio *bio = NULL;
+	unsigned page_idx;
+	sector_t last_block_in_bio = 0;
+	struct buffer_head map_bh;
+	unsigned long first_logical_block = 0;
+
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page = list_entry(pages->prev, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+		if (!add_to_page_cache_lru(page, mapping,
+					page->index, GFP_KERNEL)) {
+			bio = do_mpage_readpage(bio, page,
+					nr_pages - page_idx,
+					&last_block_in_bio, &map_bh,
+					&first_logical_block,
+					get_block);
+		}
+		page_cache_release(page);
+	}
+	BUG_ON(!list_empty(pages));
+	if (bio)
+		mpage_bio_submit(READ, bio);
+	return 0;
+}
+EXPORT_SYMBOL(mpage_readpages);
+
+/*
+ * This isn't called much at all
+ */
+int mpage_readpage(struct page *page, get_block_t get_block)
+{
+	struct bio *bio = NULL;
+	sector_t last_block_in_bio = 0;
+	struct buffer_head map_bh;
+	unsigned long first_logical_block = 0;
+
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
+	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
+			&map_bh, &first_logical_block, get_block);
+	if (bio)
+		mpage_bio_submit(READ, bio);
+	return 0;
+}
+EXPORT_SYMBOL(mpage_readpage);
+
+/*
+ * Writing is not so simple.
+ *
+ * If the page has buffers then they will be used for obtaining the disk
+ * mapping.  We only support pages which are fully mapped-and-dirty, with a
+ * special case for pages which are unmapped at the end: end-of-file.
+ *
+ * If the page has no buffers (preferred) then the page is mapped here.
+ *
+ * If all blocks are found to be contiguous then the page can go into the
+ * BIO.  Otherwise fall back to the mapping's writepage().
+ * 
+ * FIXME: This code wants an estimate of how many pages are still to be
+ * written, so it can intelligently allocate a suitably-sized BIO.  For now,
+ * just allocate full-size (16-page) BIOs.
+ */
+
+struct mpage_data {
+	struct bio *bio;
+	sector_t last_block_in_bio;
+	get_block_t *get_block;
+	unsigned use_writepage;
+};
+
+/*
+ * We have our BIO, so we can now mark the buffers clean.  Make
+ * sure to only clean buffers which we know we'll be writing.
+ */
+static void clean_buffers(struct page *page, unsigned first_unmapped)
+{
+	unsigned buffer_counter = 0;
+	struct buffer_head *bh, *head;
+	if (!page_has_buffers(page))
+		return;
+	head = page_buffers(page);
+	bh = head;
+
+	do {
+		if (buffer_counter++ == first_unmapped)
+			break;
+		clear_buffer_dirty(bh);
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	/*
+	 * we cannot drop the bh if the page is not uptodate or a concurrent
+	 * readpage would fail to serialize with the bh and it would read from
+	 * disk before we reach the platter.
+	 */
+	if (buffer_heads_over_limit && PageUptodate(page))
+		try_to_free_buffers(page);
+}
+
+static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+		      void *data)
+{
+	struct mpage_data *mpd = data;
+	struct bio *bio = mpd->bio;
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = page->mapping->host;
+	const unsigned blkbits = inode->i_blkbits;
+	unsigned long end_index;
+	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	sector_t last_block;
+	sector_t block_in_file;
+	sector_t blocks[MAX_BUF_PER_PAGE];
+	unsigned page_block;
+	unsigned first_unmapped = blocks_per_page;
+	struct block_device *bdev = NULL;
+	int boundary = 0;
+	sector_t boundary_block = 0;
+	struct block_device *boundary_bdev = NULL;
+	int length;
+	struct buffer_head map_bh;
+	loff_t i_size = i_size_read(inode);
+	int ret = 0;
+
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		/* If they're all mapped and dirty, do it */
+		page_block = 0;
+		do {
+			BUG_ON(buffer_locked(bh));
+			if (!buffer_mapped(bh)) {
+				/*
+				 * unmapped dirty buffers are created by
+				 * __set_page_dirty_buffers -> mmapped data
+				 */
+				if (buffer_dirty(bh))
+					goto confused;
+				if (first_unmapped == blocks_per_page)
+					first_unmapped = page_block;
+				continue;
+			}
+
+			if (first_unmapped != blocks_per_page)
+				goto confused;	/* hole -> non-hole */
+
+			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
+				goto confused;
+			if (page_block) {
+				if (bh->b_blocknr != blocks[page_block-1] + 1)
+					goto confused;
+			}
+			blocks[page_block++] = bh->b_blocknr;
+			boundary = buffer_boundary(bh);
+			if (boundary) {
+				boundary_block = bh->b_blocknr;
+				boundary_bdev = bh->b_bdev;
+			}
+			bdev = bh->b_bdev;
+		} while ((bh = bh->b_this_page) != head);
+
+		if (first_unmapped)
+			goto page_is_mapped;
+
+		/*
+		 * Page has buffers, but they are all unmapped. The page was
+		 * created by pagein or read over a hole which was handled by
+		 * block_read_full_page().  If this address_space is also
+		 * using mpage_readpages then this can rarely happen.
+		 */
+		goto confused;
+	}
+
+	/*
+	 * The page has no buffers: map it to disk
+	 */
+	BUG_ON(!PageUptodate(page));
+	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+	last_block = (i_size - 1) >> blkbits;
+	map_bh.b_page = page;
+	for (page_block = 0; page_block < blocks_per_page; ) {
+
+		map_bh.b_state = 0;
+		map_bh.b_size = 1 << blkbits;
+		if (mpd->get_block(inode, block_in_file, &map_bh, 1))
+			goto confused;
+		if (buffer_new(&map_bh))
+			unmap_underlying_metadata(map_bh.b_bdev,
+						map_bh.b_blocknr);
+		if (buffer_boundary(&map_bh)) {
+			boundary_block = map_bh.b_blocknr;
+			boundary_bdev = map_bh.b_bdev;
+		}
+		if (page_block) {
+			if (map_bh.b_blocknr != blocks[page_block-1] + 1)
+				goto confused;
+		}
+		blocks[page_block++] = map_bh.b_blocknr;
+		boundary = buffer_boundary(&map_bh);
+		bdev = map_bh.b_bdev;
+		if (block_in_file == last_block)
+			break;
+		block_in_file++;
+	}
+	BUG_ON(page_block == 0);
+
+	first_unmapped = page_block;
+
+page_is_mapped:
+	end_index = i_size >> PAGE_CACHE_SHIFT;
+	if (page->index >= end_index) {
+		/*
+		 * The page straddles i_size.  It must be zeroed out on each
+		 * and every writepage invocation because it may be mmapped.
+		 * "A file is mapped in multiples of the page size.  For a file
+		 * that is not a multiple of the page size, the remaining memory
+		 * is zeroed when mapped, and writes to that region are not
+		 * written out to the file."
+		 */
+		unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
+
+		if (page->index > end_index || !offset)
+			goto confused;
+		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	}
+
+	/*
+	 * This page will go to BIO.  Do we need to send this BIO off first?
+	 */
+	if (bio && mpd->last_block_in_bio != blocks[0] - 1)
+		bio = mpage_bio_submit(WRITE, bio);
+
+alloc_new:
+	if (bio == NULL) {
+		if (first_unmapped == blocks_per_page) {
+			if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
+								page, wbc)) {
+				clean_buffers(page, first_unmapped);
+				goto out;
+			}
+		}
+		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+				bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
+		if (bio == NULL)
+			goto confused;
+	}
+
+	/*
+	 * Must try to add the page before marking the buffer clean or
+	 * the confused fail path above (OOM) will be very confused when
+	 * it finds all bh marked clean (i.e. it will not write anything)
+	 */
+	length = first_unmapped << blkbits;
+	if (bio_add_page(bio, page, length, 0) < length) {
+		bio = mpage_bio_submit(WRITE, bio);
+		goto alloc_new;
+	}
+
+	clean_buffers(page, first_unmapped);
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
+	if (boundary || (first_unmapped != blocks_per_page)) {
+		bio = mpage_bio_submit(WRITE, bio);
+		if (boundary_block) {
+			write_boundary_block(boundary_bdev,
+					boundary_block, 1 << blkbits);
+		}
+	} else {
+		mpd->last_block_in_bio = blocks[blocks_per_page - 1];
+	}
+	goto out;
+
+confused:
+	if (bio)
+		bio = mpage_bio_submit(WRITE, bio);
+
+	if (mpd->use_writepage) {
+		ret = mapping->a_ops->writepage(page, wbc);
+	} else {
+		ret = -EAGAIN;
+		goto out;
+	}
+	/*
+	 * The caller has a ref on the inode, so *mapping is stable
+	 */
+	mapping_set_error(mapping, ret);
+out:
+	mpd->bio = bio;
+	return ret;
+}
+
+/**
+ * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *             If this is NULL then use a_ops->writepage.  Otherwise, go
+ *             direct-to-BIO.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * If a page is already under I/O, generic_writepages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+int
+mpage_writepages(struct address_space *mapping,
+		struct writeback_control *wbc, get_block_t get_block)
+{
+	struct blk_plug plug;
+	int ret;
+
+	blk_start_plug(&plug);
+
+	if (!get_block)
+		ret = generic_writepages(mapping, wbc);
+	else {
+		struct mpage_data mpd = {
+			.bio = NULL,
+			.last_block_in_bio = 0,
+			.get_block = get_block,
+			.use_writepage = 1,
+		};
+
+		ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
+		if (mpd.bio)
+			mpage_bio_submit(WRITE, mpd.bio);
+	}
+	blk_finish_plug(&plug);
+	return ret;
+}
+EXPORT_SYMBOL(mpage_writepages);
+
+int mpage_writepage(struct page *page, get_block_t get_block,
+	struct writeback_control *wbc)
+{
+	struct mpage_data mpd = {
+		.bio = NULL,
+		.last_block_in_bio = 0,
+		.get_block = get_block,
+		.use_writepage = 0,
+	};
+	int ret = __mpage_writepage(page, wbc, &mpd);
+	if (mpd.bio)
+		mpage_bio_submit(WRITE, mpd.bio);
+	return ret;
+}
+EXPORT_SYMBOL(mpage_writepage);
diff --git a/kernel/fs/namei.c b/kernel/fs/namei.c
new file mode 100644
index 000000000..fe30d3be4
--- /dev/null
+++ b/kernel/fs/namei.c
@@ -0,0 +1,4552 @@
+/*
+ *  linux/fs/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * Some corrections by tytso.
+ */
+
+/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
+ * lookup logic.
+ */
+/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/fsnotify.h>
+#include <linux/personality.h>
+#include <linux/security.h>
+#include <linux/ima.h>
+#include <linux/syscalls.h>
+#include <linux/mount.h>
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/device_cgroup.h>
+#include <linux/fs_struct.h>
+#include <linux/posix_acl.h>
+#include <linux/hash.h>
+#include <asm/uaccess.h>
+
+#include "internal.h"
+#include "mount.h"
+
+/* [Feb-1997 T. Schoebel-Theuer]
+ * Fundamental changes in the pathname lookup mechanisms (namei)
+ * were necessary because of omirr.  The reason is that omirr needs
+ * to know the _real_ pathname, not the user-supplied one, in case
+ * of symlinks (and also when transname replacements occur).
+ *
+ * The new code replaces the old recursive symlink resolution with
+ * an iterative one (in case of non-nested symlink chains).  It does
+ * this with calls to <fs>_follow_link().
+ * As a side effect, dir_namei(), _namei() and follow_link() are now 
+ * replaced with a single function lookup_dentry() that can handle all 
+ * the special cases of the former code.
+ *
+ * With the new dcache, the pathname is stored at each inode, at least as
+ * long as the refcount of the inode is positive.  As a side effect, the
+ * size of the dcache depends on the inode cache and thus is dynamic.
+ *
+ * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
+ * resolution to correspond with current state of the code.
+ *
+ * Note that the symlink resolution is not *completely* iterative.
+ * There is still a significant amount of tail- and mid- recursion in
+ * the algorithm.  Also, note that <fs>_readlink() is not used in
+ * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
+ * may return different results than <fs>_follow_link().  Many virtual
+ * filesystems (including /proc) exhibit this behavior.
+ */
+
+/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
+ * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
+ * and the name already exists in form of a symlink, try to create the new
+ * name indicated by the symlink. The old code always complained that the
+ * name already exists, due to not following the symlink even if its target
+ * is nonexistent.  The new semantics affects also mknod() and link() when
+ * the name is a symlink pointing to a non-existent name.
+ *
+ * I don't know which semantics is the right one, since I have no access
+ * to standards. But I found by trial that HP-UX 9.0 has the full "new"
+ * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
+ * "old" one. Personally, I think the new semantics is much more logical.
+ * Note that "ln old new" where "new" is a symlink pointing to a non-existing
+ * file does succeed in both HP-UX and SunOs, but not in Solaris
+ * and in the old Linux semantics.
+ */
+
+/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
+ * semantics.  See the comments in "open_namei" and "do_link" below.
+ *
+ * [10-Sep-98 Alan Modra] Another symlink change.
+ */
+
+/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
+ *	inside the path - always follow.
+ *	in the last component in creation/removal/renaming - never follow.
+ *	if LOOKUP_FOLLOW passed - follow.
+ *	if the pathname has trailing slashes - follow.
+ *	otherwise - don't follow.
+ * (applied in that order).
+ *
+ * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
+ * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
+ * During the 2.4 we need to fix the userland stuff depending on it -
+ * hopefully we will be able to get rid of that wart in 2.5. So far only
+ * XEmacs seems to be relying on it...
+ */
+/*
+ * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
+ * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
+ * any extra contention...
+ */
+
+/* In order to reduce some races, while at the same time doing additional
+ * checking and hopefully speeding things up, we copy filenames to the
+ * kernel data space before using them..
+ *
+ * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
+ * PATH_MAX includes the nul terminator --RR.
+ */
+
+#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
+
+struct filename *
+getname_flags(const char __user *filename, int flags, int *empty)
+{
+	struct filename *result;
+	char *kname;
+	int len;
+
+	result = audit_reusename(filename);
+	if (result)
+		return result;
+
+	result = __getname();
+	if (unlikely(!result))
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * First, try to embed the struct filename inside the names_cache
+	 * allocation
+	 */
+	kname = (char *)result->iname;
+	result->name = kname;
+
+	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
+	if (unlikely(len < 0)) {
+		__putname(result);
+		return ERR_PTR(len);
+	}
+
+	/*
+	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
+	 * separate struct filename so we can dedicate the entire
+	 * names_cache allocation for the pathname, and re-do the copy from
+	 * userland.
+	 */
+	if (unlikely(len == EMBEDDED_NAME_MAX)) {
+		const size_t size = offsetof(struct filename, iname[1]);
+		kname = (char *)result;
+
+		/*
+		 * size is chosen that way we to guarantee that
+		 * result->iname[0] is within the same object and that
+		 * kname can't be equal to result->iname, no matter what.
+		 */
+		result = kzalloc(size, GFP_KERNEL);
+		if (unlikely(!result)) {
+			__putname(kname);
+			return ERR_PTR(-ENOMEM);
+		}
+		result->name = kname;
+		len = strncpy_from_user(kname, filename, PATH_MAX);
+		if (unlikely(len < 0)) {
+			__putname(kname);
+			kfree(result);
+			return ERR_PTR(len);
+		}
+		if (unlikely(len == PATH_MAX)) {
+			__putname(kname);
+			kfree(result);
+			return ERR_PTR(-ENAMETOOLONG);
+		}
+	}
+
+	result->refcnt = 1;
+	/* The empty path is special. */
+	if (unlikely(!len)) {
+		if (empty)
+			*empty = 1;
+		if (!(flags & LOOKUP_EMPTY)) {
+			putname(result);
+			return ERR_PTR(-ENOENT);
+		}
+	}
+
+	result->uptr = filename;
+	result->aname = NULL;
+	audit_getname(result);
+	return result;
+}
+
+struct filename *
+getname(const char __user * filename)
+{
+	return getname_flags(filename, 0, NULL);
+}
+
+struct filename *
+getname_kernel(const char * filename)
+{
+	struct filename *result;
+	int len = strlen(filename) + 1;
+
+	result = __getname();
+	if (unlikely(!result))
+		return ERR_PTR(-ENOMEM);
+
+	if (len <= EMBEDDED_NAME_MAX) {
+		result->name = (char *)result->iname;
+	} else if (len <= PATH_MAX) {
+		struct filename *tmp;
+
+		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
+		if (unlikely(!tmp)) {
+			__putname(result);
+			return ERR_PTR(-ENOMEM);
+		}
+		tmp->name = (char *)result;
+		result = tmp;
+	} else {
+		__putname(result);
+		return ERR_PTR(-ENAMETOOLONG);
+	}
+	memcpy((char *)result->name, filename, len);
+	result->uptr = NULL;
+	result->aname = NULL;
+	result->refcnt = 1;
+	audit_getname(result);
+
+	return result;
+}
+
+void putname(struct filename *name)
+{
+	BUG_ON(name->refcnt <= 0);
+
+	if (--name->refcnt > 0)
+		return;
+
+	if (name->name != name->iname) {
+		__putname(name->name);
+		kfree(name);
+	} else
+		__putname(name);
+}
+
+static int check_acl(struct inode *inode, int mask)
+{
+#ifdef CONFIG_FS_POSIX_ACL
+	struct posix_acl *acl;
+
+	if (mask & MAY_NOT_BLOCK) {
+		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
+	        if (!acl)
+	                return -EAGAIN;
+		/* no ->get_acl() calls in RCU mode... */
+		if (acl == ACL_NOT_CACHED)
+			return -ECHILD;
+	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
+	}
+
+	acl = get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+	        int error = posix_acl_permission(inode, acl, mask);
+	        posix_acl_release(acl);
+	        return error;
+	}
+#endif
+
+	return -EAGAIN;
+}
+
+/*
+ * This does the basic permission checking
+ */
+static int acl_permission_check(struct inode *inode, int mask)
+{
+	unsigned int mode = inode->i_mode;
+
+	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
+		mode >>= 6;
+	else {
+		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
+			int error = check_acl(inode, mask);
+			if (error != -EAGAIN)
+				return error;
+		}
+
+		if (in_group_p(inode->i_gid))
+			mode >>= 3;
+	}
+
+	/*
+	 * If the DACs are ok we don't need any capability check.
+	 */
+	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+		return 0;
+	return -EACCES;
+}
+
+/**
+ * generic_permission -  check for access rights on a Posix-like filesystem
+ * @inode:	inode to check access rights for
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
+ *
+ * Used to check for read/write/execute permissions on a file.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things.
+ *
+ * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
+ * request cannot be satisfied (eg. requires blocking or too much complexity).
+ * It would then be called again in ref-walk mode.
+ */
+int generic_permission(struct inode *inode, int mask)
+{
+	int ret;
+
+	/*
+	 * Do the basic permission checks.
+	 */
+	ret = acl_permission_check(inode, mask);
+	if (ret != -EACCES)
+		return ret;
+
+	if (S_ISDIR(inode->i_mode)) {
+		/* DACs are overridable for directories */
+		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+			return 0;
+		if (!(mask & MAY_WRITE))
+			if (capable_wrt_inode_uidgid(inode,
+						     CAP_DAC_READ_SEARCH))
+				return 0;
+		return -EACCES;
+	}
+	/*
+	 * Read/write DACs are always overridable.
+	 * Executable DACs are overridable when there is
+	 * at least one exec bit set.
+	 */
+	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
+		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+			return 0;
+
+	/*
+	 * Searching includes executable on directories, else just read.
+	 */
+	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
+	if (mask == MAY_READ)
+		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
+			return 0;
+
+	return -EACCES;
+}
+EXPORT_SYMBOL(generic_permission);
+
+/*
+ * We _really_ want to just do "generic_permission()" without
+ * even looking at the inode->i_op values. So we keep a cache
+ * flag in inode->i_opflags, that says "this has not special
+ * permission function, use the fast case".
+ */
+static inline int do_inode_permission(struct inode *inode, int mask)
+{
+	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
+		if (likely(inode->i_op->permission))
+			return inode->i_op->permission(inode, mask);
+
+		/* This gets set once for the inode lifetime */
+		spin_lock(&inode->i_lock);
+		inode->i_opflags |= IOP_FASTPERM;
+		spin_unlock(&inode->i_lock);
+	}
+	return generic_permission(inode, mask);
+}
+
+/**
+ * __inode_permission - Check for access rights to a given inode
+ * @inode: Inode to check permission on
+ * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ *
+ * Check for read/write/execute permissions on an inode.
+ *
+ * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
+ *
+ * This does not check for a read-only file system.  You probably want
+ * inode_permission().
+ */
+int __inode_permission(struct inode *inode, int mask)
+{
+	int retval;
+
+	if (unlikely(mask & MAY_WRITE)) {
+		/*
+		 * Nobody gets write access to an immutable file.
+		 */
+		if (IS_IMMUTABLE(inode))
+			return -EACCES;
+	}
+
+	retval = do_inode_permission(inode, mask);
+	if (retval)
+		return retval;
+
+	retval = devcgroup_inode_permission(inode, mask);
+	if (retval)
+		return retval;
+
+	return security_inode_permission(inode, mask);
+}
+EXPORT_SYMBOL(__inode_permission);
+
+/**
+ * sb_permission - Check superblock-level permissions
+ * @sb: Superblock of inode to check permission on
+ * @inode: Inode to check permission on
+ * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ *
+ * Separate out file-system wide checks from inode-specific permission checks.
+ */
+static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
+{
+	if (unlikely(mask & MAY_WRITE)) {
+		umode_t mode = inode->i_mode;
+
+		/* Nobody gets write access to a read-only fs. */
+		if ((sb->s_flags & MS_RDONLY) &&
+		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+			return -EROFS;
+	}
+	return 0;
+}
+
+/**
+ * inode_permission - Check for access rights to a given inode
+ * @inode: Inode to check permission on
+ * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ *
+ * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
+ * this, letting us set arbitrary permissions for filesystem access without
+ * changing the "normal" UIDs which are used for other things.
+ *
+ * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
+ */
+int inode_permission(struct inode *inode, int mask)
+{
+	int retval;
+
+	retval = sb_permission(inode->i_sb, inode, mask);
+	if (retval)
+		return retval;
+	return __inode_permission(inode, mask);
+}
+EXPORT_SYMBOL(inode_permission);
+
+/**
+ * path_get - get a reference to a path
+ * @path: path to get the reference to
+ *
+ * Given a path increment the reference count to the dentry and the vfsmount.
+ */
+void path_get(const struct path *path)
+{
+	mntget(path->mnt);
+	dget(path->dentry);
+}
+EXPORT_SYMBOL(path_get);
+
+/**
+ * path_put - put a reference to a path
+ * @path: path to put the reference to
+ *
+ * Given a path decrement the reference count to the dentry and the vfsmount.
+ */
+void path_put(const struct path *path)
+{
+	dput(path->dentry);
+	mntput(path->mnt);
+}
+EXPORT_SYMBOL(path_put);
+
+struct nameidata {
+	struct path	path;
+	struct qstr	last;
+	struct path	root;
+	struct inode	*inode; /* path.dentry.d_inode */
+	unsigned int	flags;
+	unsigned	seq, m_seq;
+	int		last_type;
+	unsigned	depth;
+	struct file	*base;
+	char *saved_names[MAX_NESTED_LINKS + 1];
+};
+
+/*
+ * Path walking has 2 modes, rcu-walk and ref-walk (see
+ * Documentation/filesystems/path-lookup.txt).  In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * mode.  Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
+ */
+
+/**
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
+ * Returns: 0 on success, -ECHILD on failure
+ *
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
+ * @nd or NULL.  Must be called from rcu-walk context.
+ */
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
+{
+	struct fs_struct *fs = current->fs;
+	struct dentry *parent = nd->path.dentry;
+
+	BUG_ON(!(nd->flags & LOOKUP_RCU));
+
+	/*
+	 * After legitimizing the bastards, terminate_walk()
+	 * will do the right thing for non-RCU mode, and all our
+	 * subsequent exit cases should rcu_read_unlock()
+	 * before returning.  Do vfsmount first; if dentry
+	 * can't be legitimized, just set nd->path.dentry to NULL
+	 * and rely on dput(NULL) being a no-op.
+	 */
+	if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
+		return -ECHILD;
+	nd->flags &= ~LOOKUP_RCU;
+
+	if (!lockref_get_not_dead(&parent->d_lockref)) {
+		nd->path.dentry = NULL;	
+		goto out;
+	}
+
+	/*
+	 * For a negative lookup, the lookup sequence point is the parents
+	 * sequence point, and it only needs to revalidate the parent dentry.
+	 *
+	 * For a positive lookup, we need to move both the parent and the
+	 * dentry from the RCU domain to be properly refcounted. And the
+	 * sequence number in the dentry validates *both* dentry counters,
+	 * since we checked the sequence number of the parent after we got
+	 * the child sequence number. So we know the parent must still
+	 * be valid if the child sequence number is still valid.
+	 */
+	if (!dentry) {
+		if (read_seqcount_retry(&parent->d_seq, nd->seq))
+			goto out;
+		BUG_ON(nd->inode != parent->d_inode);
+	} else {
+		if (!lockref_get_not_dead(&dentry->d_lockref))
+			goto out;
+		if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+			goto drop_dentry;
+	}
+
+	/*
+	 * Sequence counts matched. Now make sure that the root is
+	 * still valid and get it if required.
+	 */
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		spin_lock(&fs->lock);
+		if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
+			goto unlock_and_drop_dentry;
+		path_get(&nd->root);
+		spin_unlock(&fs->lock);
+	}
+
+	rcu_read_unlock();
+	return 0;
+
+unlock_and_drop_dentry:
+	spin_unlock(&fs->lock);
+drop_dentry:
+	rcu_read_unlock();
+	dput(dentry);
+	goto drop_root_mnt;
+out:
+	rcu_read_unlock();
+drop_root_mnt:
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
+	return -ECHILD;
+}
+
+static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	return dentry->d_op->d_revalidate(dentry, flags);
+}
+
+/**
+ * complete_walk - successful completion of path walk
+ * @nd:  pointer nameidata
+ *
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it.  Return 0 on
+ * success, -error on failure.  In case of failure caller does not
+ * need to drop nd->path.
+ */
+static int complete_walk(struct nameidata *nd)
+{
+	struct dentry *dentry = nd->path.dentry;
+	int status;
+
+	if (nd->flags & LOOKUP_RCU) {
+		nd->flags &= ~LOOKUP_RCU;
+		if (!(nd->flags & LOOKUP_ROOT))
+			nd->root.mnt = NULL;
+
+		if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+			rcu_read_unlock();
+			return -ECHILD;
+		}
+		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
+			rcu_read_unlock();
+			mntput(nd->path.mnt);
+			return -ECHILD;
+		}
+		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
+			rcu_read_unlock();
+			dput(dentry);
+			mntput(nd->path.mnt);
+			return -ECHILD;
+		}
+		rcu_read_unlock();
+	}
+
+	if (likely(!(nd->flags & LOOKUP_JUMPED)))
+		return 0;
+
+	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
+		return 0;
+
+	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
+	if (status > 0)
+		return 0;
+
+	if (!status)
+		status = -ESTALE;
+
+	path_put(&nd->path);
+	return status;
+}
+
+static __always_inline void set_root(struct nameidata *nd)
+{
+	get_fs_root(current->fs, &nd->root);
+}
+
+static int link_path_walk(const char *, struct nameidata *);
+
+static __always_inline unsigned set_root_rcu(struct nameidata *nd)
+{
+	struct fs_struct *fs = current->fs;
+	unsigned seq, res;
+
+	do {
+		seq = read_seqcount_begin(&fs->seq);
+		nd->root = fs->root;
+		res = __read_seqcount_begin(&nd->root.dentry->d_seq);
+	} while (read_seqcount_retry(&fs->seq, seq));
+	return res;
+}
+
+static void path_put_conditional(struct path *path, struct nameidata *nd)
+{
+	dput(path->dentry);
+	if (path->mnt != nd->path.mnt)
+		mntput(path->mnt);
+}
+
+static inline void path_to_nameidata(const struct path *path,
+					struct nameidata *nd)
+{
+	if (!(nd->flags & LOOKUP_RCU)) {
+		dput(nd->path.dentry);
+		if (nd->path.mnt != path->mnt)
+			mntput(nd->path.mnt);
+	}
+	nd->path.mnt = path->mnt;
+	nd->path.dentry = path->dentry;
+}
+
+/*
+ * Helper to directly jump to a known parsed path from ->follow_link,
+ * caller must have taken a reference to path beforehand.
+ */
+void nd_jump_link(struct nameidata *nd, struct path *path)
+{
+	path_put(&nd->path);
+
+	nd->path = *path;
+	nd->inode = nd->path.dentry->d_inode;
+	nd->flags |= LOOKUP_JUMPED;
+}
+
+void nd_set_link(struct nameidata *nd, char *path)
+{
+	nd->saved_names[nd->depth] = path;
+}
+EXPORT_SYMBOL(nd_set_link);
+
+char *nd_get_link(struct nameidata *nd)
+{
+	return nd->saved_names[nd->depth];
+}
+EXPORT_SYMBOL(nd_get_link);
+
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+	struct inode *inode = link->dentry->d_inode;
+	if (inode->i_op->put_link)
+		inode->i_op->put_link(link->dentry, nd, cookie);
+	path_put(link);
+}
+
+int sysctl_protected_symlinks __read_mostly = 0;
+int sysctl_protected_hardlinks __read_mostly = 0;
+
+/**
+ * may_follow_link - Check symlink following for unsafe situations
+ * @link: The path of the symlink
+ * @nd: nameidata pathwalk data
+ *
+ * In the case of the sysctl_protected_symlinks sysctl being enabled,
+ * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
+ * in a sticky world-writable directory. This is to protect privileged
+ * processes from failing races against path names that may change out
+ * from under them by way of other users creating malicious symlinks.
+ * It will permit symlinks to be followed only when outside a sticky
+ * world-writable directory, or when the uid of the symlink and follower
+ * match, or when the directory owner matches the symlink's owner.
+ *
+ * Returns 0 if following the symlink is allowed, -ve on error.
+ */
+static inline int may_follow_link(struct path *link, struct nameidata *nd)
+{
+	const struct inode *inode;
+	const struct inode *parent;
+
+	if (!sysctl_protected_symlinks)
+		return 0;
+
+	/* Allowed if owner and follower match. */
+	inode = link->dentry->d_inode;
+	if (uid_eq(current_cred()->fsuid, inode->i_uid))
+		return 0;
+
+	/* Allowed if parent directory not sticky and world-writable. */
+	parent = nd->path.dentry->d_inode;
+	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
+		return 0;
+
+	/* Allowed if parent directory and link owner match. */
+	if (uid_eq(parent->i_uid, inode->i_uid))
+		return 0;
+
+	audit_log_link_denied("follow_link", link);
+	path_put_conditional(link, nd);
+	path_put(&nd->path);
+	return -EACCES;
+}
+
+/**
+ * safe_hardlink_source - Check for safe hardlink conditions
+ * @inode: the source inode to hardlink from
+ *
+ * Return false if at least one of the following conditions:
+ *    - inode is not a regular file
+ *    - inode is setuid
+ *    - inode is setgid and group-exec
+ *    - access failure for read and write
+ *
+ * Otherwise returns true.
+ */
+static bool safe_hardlink_source(struct inode *inode)
+{
+	umode_t mode = inode->i_mode;
+
+	/* Special files should not get pinned to the filesystem. */
+	if (!S_ISREG(mode))
+		return false;
+
+	/* Setuid files should not get pinned to the filesystem. */
+	if (mode & S_ISUID)
+		return false;
+
+	/* Executable setgid files should not get pinned to the filesystem. */
+	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
+		return false;
+
+	/* Hardlinking to unreadable or unwritable sources is dangerous. */
+	if (inode_permission(inode, MAY_READ | MAY_WRITE))
+		return false;
+
+	return true;
+}
+
+/**
+ * may_linkat - Check permissions for creating a hardlink
+ * @link: the source to hardlink from
+ *
+ * Block hardlink when all of:
+ *  - sysctl_protected_hardlinks enabled
+ *  - fsuid does not match inode
+ *  - hardlink source is unsafe (see safe_hardlink_source() above)
+ *  - not CAP_FOWNER
+ *
+ * Returns 0 if successful, -ve on error.
+ */
+static int may_linkat(struct path *link)
+{
+	const struct cred *cred;
+	struct inode *inode;
+
+	if (!sysctl_protected_hardlinks)
+		return 0;
+
+	cred = current_cred();
+	inode = link->dentry->d_inode;
+
+	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
+	 * otherwise, it must be a safe source.
+	 */
+	if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
+	    capable(CAP_FOWNER))
+		return 0;
+
+	audit_log_link_denied("linkat", link);
+	return -EPERM;
+}
+
+static __always_inline int
+follow_link(struct path *link, struct nameidata *nd, void **p)
+{
+	struct dentry *dentry = link->dentry;
+	int error;
+	char *s;
+
+	BUG_ON(nd->flags & LOOKUP_RCU);
+
+	if (link->mnt == nd->path.mnt)
+		mntget(link->mnt);
+
+	error = -ELOOP;
+	if (unlikely(current->total_link_count >= 40))
+		goto out_put_nd_path;
+
+	cond_resched();
+	current->total_link_count++;
+
+	touch_atime(link);
+	nd_set_link(nd, NULL);
+
+	error = security_inode_follow_link(link->dentry, nd);
+	if (error)
+		goto out_put_nd_path;
+
+	nd->last_type = LAST_BIND;
+	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
+	error = PTR_ERR(*p);
+	if (IS_ERR(*p))
+		goto out_put_nd_path;
+
+	error = 0;
+	s = nd_get_link(nd);
+	if (s) {
+		if (unlikely(IS_ERR(s))) {
+			path_put(&nd->path);
+			put_link(nd, link, *p);
+			return PTR_ERR(s);
+		}
+		if (*s == '/') {
+			if (!nd->root.mnt)
+				set_root(nd);
+			path_put(&nd->path);
+			nd->path = nd->root;
+			path_get(&nd->root);
+			nd->flags |= LOOKUP_JUMPED;
+		}
+		nd->inode = nd->path.dentry->d_inode;
+		error = link_path_walk(s, nd);
+		if (unlikely(error))
+			put_link(nd, link, *p);
+	}
+
+	return error;
+
+out_put_nd_path:
+	*p = NULL;
+	path_put(&nd->path);
+	path_put(link);
+	return error;
+}
+
+static int follow_up_rcu(struct path *path)
+{
+	struct mount *mnt = real_mount(path->mnt);
+	struct mount *parent;
+	struct dentry *mountpoint;
+
+	parent = mnt->mnt_parent;
+	if (&parent->mnt == path->mnt)
+		return 0;
+	mountpoint = mnt->mnt_mountpoint;
+	path->dentry = mountpoint;
+	path->mnt = &parent->mnt;
+	return 1;
+}
+
+/*
+ * follow_up - Find the mountpoint of path's vfsmount
+ *
+ * Given a path, find the mountpoint of its source file system.
+ * Replace @path with the path of the mountpoint in the parent mount.
+ * Up is towards /.
+ *
+ * Return 1 if we went up a level and 0 if we were already at the
+ * root.
+ */
+int follow_up(struct path *path)
+{
+	struct mount *mnt = real_mount(path->mnt);
+	struct mount *parent;
+	struct dentry *mountpoint;
+
+	read_seqlock_excl(&mount_lock);
+	parent = mnt->mnt_parent;
+	if (parent == mnt) {
+		read_sequnlock_excl(&mount_lock);
+		return 0;
+	}
+	mntget(&parent->mnt);
+	mountpoint = dget(mnt->mnt_mountpoint);
+	read_sequnlock_excl(&mount_lock);
+	dput(path->dentry);
+	path->dentry = mountpoint;
+	mntput(path->mnt);
+	path->mnt = &parent->mnt;
+	return 1;
+}
+EXPORT_SYMBOL(follow_up);
+
+/*
+ * Perform an automount
+ * - return -EISDIR to tell follow_managed() to stop and return the path we
+ *   were called with.
+ */
+static int follow_automount(struct path *path, unsigned flags,
+			    bool *need_mntput)
+{
+	struct vfsmount *mnt;
+	int err;
+
+	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
+		return -EREMOTE;
+
+	/* We don't want to mount if someone's just doing a stat -
+	 * unless they're stat'ing a directory and appended a '/' to
+	 * the name.
+	 *
+	 * We do, however, want to mount if someone wants to open or
+	 * create a file of any type under the mountpoint, wants to
+	 * traverse through the mountpoint or wants to open the
+	 * mounted directory.  Also, autofs may mark negative dentries
+	 * as being automount points.  These will need the attentions
+	 * of the daemon to instantiate them before they can be used.
+	 */
+	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+		     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+	    path->dentry->d_inode)
+		return -EISDIR;
+
+	current->total_link_count++;
+	if (current->total_link_count >= 40)
+		return -ELOOP;
+
+	mnt = path->dentry->d_op->d_automount(path);
+	if (IS_ERR(mnt)) {
+		/*
+		 * The filesystem is allowed to return -EISDIR here to indicate
+		 * it doesn't want to automount.  For instance, autofs would do
+		 * this so that its userspace daemon can mount on this dentry.
+		 *
+		 * However, we can only permit this if it's a terminal point in
+		 * the path being looked up; if it wasn't then the remainder of
+		 * the path is inaccessible and we should say so.
+		 */
+		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
+			return -EREMOTE;
+		return PTR_ERR(mnt);
+	}
+
+	if (!mnt) /* mount collision */
+		return 0;
+
+	if (!*need_mntput) {
+		/* lock_mount() may release path->mnt on error */
+		mntget(path->mnt);
+		*need_mntput = true;
+	}
+	err = finish_automount(mnt, path);
+
+	switch (err) {
+	case -EBUSY:
+		/* Someone else made a mount here whilst we were busy */
+		return 0;
+	case 0:
+		path_put(path);
+		path->mnt = mnt;
+		path->dentry = dget(mnt->mnt_root);
+		return 0;
+	default:
+		return err;
+	}
+
+}
+
+/*
+ * Handle a dentry that is managed in some way.
+ * - Flagged for transit management (autofs)
+ * - Flagged as mountpoint
+ * - Flagged as automount point
+ *
+ * This may only be called in refwalk mode.
+ *
+ * Serialization is taken care of in namespace.c
+ */
+static int follow_managed(struct path *path, unsigned flags)
+{
+	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
+	unsigned managed;
+	bool need_mntput = false;
+	int ret = 0;
+
+	/* Given that we're not holding a lock here, we retain the value in a
+	 * local variable for each dentry as we look at it so that we don't see
+	 * the components of that value change under us */
+	while (managed = ACCESS_ONCE(path->dentry->d_flags),
+	       managed &= DCACHE_MANAGED_DENTRY,
+	       unlikely(managed != 0)) {
+		/* Allow the filesystem to manage the transit without i_mutex
+		 * being held. */
+		if (managed & DCACHE_MANAGE_TRANSIT) {
+			BUG_ON(!path->dentry->d_op);
+			BUG_ON(!path->dentry->d_op->d_manage);
+			ret = path->dentry->d_op->d_manage(path->dentry, false);
+			if (ret < 0)
+				break;
+		}
+
+		/* Transit to a mounted filesystem. */
+		if (managed & DCACHE_MOUNTED) {
+			struct vfsmount *mounted = lookup_mnt(path);
+			if (mounted) {
+				dput(path->dentry);
+				if (need_mntput)
+					mntput(path->mnt);
+				path->mnt = mounted;
+				path->dentry = dget(mounted->mnt_root);
+				need_mntput = true;
+				continue;
+			}
+
+			/* Something is mounted on this dentry in another
+			 * namespace and/or whatever was mounted there in this
+			 * namespace got unmounted before lookup_mnt() could
+			 * get it */
+		}
+
+		/* Handle an automount point */
+		if (managed & DCACHE_NEED_AUTOMOUNT) {
+			ret = follow_automount(path, flags, &need_mntput);
+			if (ret < 0)
+				break;
+			continue;
+		}
+
+		/* We didn't change the current path point */
+		break;
+	}
+
+	if (need_mntput && path->mnt == mnt)
+		mntput(path->mnt);
+	if (ret == -EISDIR)
+		ret = 0;
+	return ret < 0 ? ret : need_mntput;
+}
+
+int follow_down_one(struct path *path)
+{
+	struct vfsmount *mounted;
+
+	mounted = lookup_mnt(path);
+	if (mounted) {
+		dput(path->dentry);
+		mntput(path->mnt);
+		path->mnt = mounted;
+		path->dentry = dget(mounted->mnt_root);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(follow_down_one);
+
+static inline int managed_dentry_rcu(struct dentry *dentry)
+{
+	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+		dentry->d_op->d_manage(dentry, true) : 0;
+}
+
+/*
+ * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
+ * we meet a managed dentry that would need blocking.
+ */
+static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
+			       struct inode **inode)
+{
+	for (;;) {
+		struct mount *mounted;
+		/*
+		 * Don't forget we might have a non-mountpoint managed dentry
+		 * that wants to block transit.
+		 */
+		switch (managed_dentry_rcu(path->dentry)) {
+		case -ECHILD:
+		default:
+			return false;
+		case -EISDIR:
+			return true;
+		case 0:
+			break;
+		}
+
+		if (!d_mountpoint(path->dentry))
+			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
+
+		mounted = __lookup_mnt(path->mnt, path->dentry);
+		if (!mounted)
+			break;
+		path->mnt = &mounted->mnt;
+		path->dentry = mounted->mnt.mnt_root;
+		nd->flags |= LOOKUP_JUMPED;
+		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+		/*
+		 * Update the inode too. We don't need to re-check the
+		 * dentry sequence number here after this d_inode read,
+		 * because a mount-point is always pinned.
+		 */
+		*inode = path->dentry->d_inode;
+	}
+	return !read_seqretry(&mount_lock, nd->m_seq) &&
+		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
+}
+
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
+	struct inode *inode = nd->inode;
+	if (!nd->root.mnt)
+		set_root_rcu(nd);
+
+	while (1) {
+		if (nd->path.dentry == nd->root.dentry &&
+		    nd->path.mnt == nd->root.mnt) {
+			break;
+		}
+		if (nd->path.dentry != nd->path.mnt->mnt_root) {
+			struct dentry *old = nd->path.dentry;
+			struct dentry *parent = old->d_parent;
+			unsigned seq;
+
+			inode = parent->d_inode;
+			seq = read_seqcount_begin(&parent->d_seq);
+			if (read_seqcount_retry(&old->d_seq, nd->seq))
+				goto failed;
+			nd->path.dentry = parent;
+			nd->seq = seq;
+			break;
+		}
+		if (!follow_up_rcu(&nd->path))
+			break;
+		inode = nd->path.dentry->d_inode;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+	}
+	while (d_mountpoint(nd->path.dentry)) {
+		struct mount *mounted;
+		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+		if (!mounted)
+			break;
+		nd->path.mnt = &mounted->mnt;
+		nd->path.dentry = mounted->mnt.mnt_root;
+		inode = nd->path.dentry->d_inode;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		if (read_seqretry(&mount_lock, nd->m_seq))
+			goto failed;
+	}
+	nd->inode = inode;
+	return 0;
+
+failed:
+	nd->flags &= ~LOOKUP_RCU;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
+	rcu_read_unlock();
+	return -ECHILD;
+}
+
+/*
+ * Follow down to the covering mount currently visible to userspace.  At each
+ * point, the filesystem owning that dentry may be queried as to whether the
+ * caller is permitted to proceed or not.
+ */
+int follow_down(struct path *path)
+{
+	unsigned managed;
+	int ret;
+
+	while (managed = ACCESS_ONCE(path->dentry->d_flags),
+	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
+		/* Allow the filesystem to manage the transit without i_mutex
+		 * being held.
+		 *
+		 * We indicate to the filesystem if someone is trying to mount
+		 * something here.  This gives autofs the chance to deny anyone
+		 * other than its daemon the right to mount on its
+		 * superstructure.
+		 *
+		 * The filesystem may sleep at this point.
+		 */
+		if (managed & DCACHE_MANAGE_TRANSIT) {
+			BUG_ON(!path->dentry->d_op);
+			BUG_ON(!path->dentry->d_op->d_manage);
+			ret = path->dentry->d_op->d_manage(
+				path->dentry, false);
+			if (ret < 0)
+				return ret == -EISDIR ? 0 : ret;
+		}
+
+		/* Transit to a mounted filesystem. */
+		if (managed & DCACHE_MOUNTED) {
+			struct vfsmount *mounted = lookup_mnt(path);
+			if (!mounted)
+				break;
+			dput(path->dentry);
+			mntput(path->mnt);
+			path->mnt = mounted;
+			path->dentry = dget(mounted->mnt_root);
+			continue;
+		}
+
+		/* Don't handle automount points here */
+		break;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(follow_down);
+
+/*
+ * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
+ */
+static void follow_mount(struct path *path)
+{
+	while (d_mountpoint(path->dentry)) {
+		struct vfsmount *mounted = lookup_mnt(path);
+		if (!mounted)
+			break;
+		dput(path->dentry);
+		mntput(path->mnt);
+		path->mnt = mounted;
+		path->dentry = dget(mounted->mnt_root);
+	}
+}
+
+static void follow_dotdot(struct nameidata *nd)
+{
+	if (!nd->root.mnt)
+		set_root(nd);
+
+	while(1) {
+		struct dentry *old = nd->path.dentry;
+
+		if (nd->path.dentry == nd->root.dentry &&
+		    nd->path.mnt == nd->root.mnt) {
+			break;
+		}
+		if (nd->path.dentry != nd->path.mnt->mnt_root) {
+			/* rare case of legitimate dget_parent()... */
+			nd->path.dentry = dget_parent(nd->path.dentry);
+			dput(old);
+			break;
+		}
+		if (!follow_up(&nd->path))
+			break;
+	}
+	follow_mount(&nd->path);
+	nd->inode = nd->path.dentry->d_inode;
+}
+
+/*
+ * This looks up the name in dcache, possibly revalidates the old dentry and
+ * allocates a new one if not found or not valid.  In the need_lookup argument
+ * returns whether i_op->lookup is necessary.
+ *
+ * dir->d_inode->i_mutex must be held
+ */
+static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
+				    unsigned int flags, bool *need_lookup)
+{
+	struct dentry *dentry;
+	int error;
+
+	*need_lookup = false;
+	dentry = d_lookup(dir, name);
+	if (dentry) {
+		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
+			error = d_revalidate(dentry, flags);
+			if (unlikely(error <= 0)) {
+				if (error < 0) {
+					dput(dentry);
+					return ERR_PTR(error);
+				} else {
+					d_invalidate(dentry);
+					dput(dentry);
+					dentry = NULL;
+				}
+			}
+		}
+	}
+
+	if (!dentry) {
+		dentry = d_alloc(dir, name);
+		if (unlikely(!dentry))
+			return ERR_PTR(-ENOMEM);
+
+		*need_lookup = true;
+	}
+	return dentry;
+}
+
+/*
+ * Call i_op->lookup on the dentry.  The dentry must be negative and
+ * unhashed.
+ *
+ * dir->d_inode->i_mutex must be held
+ */
+static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
+				  unsigned int flags)
+{
+	struct dentry *old;
+
+	/* Don't create child dentry for a dead directory. */
+	if (unlikely(IS_DEADDIR(dir))) {
+		dput(dentry);
+		return ERR_PTR(-ENOENT);
+	}
+
+	old = dir->i_op->lookup(dir, dentry, flags);
+	if (unlikely(old)) {
+		dput(dentry);
+		dentry = old;
+	}
+	return dentry;
+}
+
+static struct dentry *__lookup_hash(struct qstr *name,
+		struct dentry *base, unsigned int flags)
+{
+	bool need_lookup;
+	struct dentry *dentry;
+
+	dentry = lookup_dcache(name, base, flags, &need_lookup);
+	if (!need_lookup)
+		return dentry;
+
+	return lookup_real(base->d_inode, dentry, flags);
+}
+
+/*
+ *  It's more convoluted than I'd like it to be, but... it's still fairly
+ *  small and for now I'd prefer to have fast path as straight as possible.
+ *  It _is_ time-critical.
+ */
+static int lookup_fast(struct nameidata *nd,
+		       struct path *path, struct inode **inode)
+{
+	struct vfsmount *mnt = nd->path.mnt;
+	struct dentry *dentry, *parent = nd->path.dentry;
+	int need_reval = 1;
+	int status = 1;
+	int err;
+
+	/*
+	 * Rename seqlock is not required here because in the off chance
+	 * of a false negative due to a concurrent rename, we're going to
+	 * do the non-racy lookup, below.
+	 */
+	if (nd->flags & LOOKUP_RCU) {
+		unsigned seq;
+		bool negative;
+		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
+		if (!dentry)
+			goto unlazy;
+
+		/*
+		 * This sequence count validates that the inode matches
+		 * the dentry name information from lookup.
+		 */
+		*inode = dentry->d_inode;
+		negative = d_is_negative(dentry);
+		if (read_seqcount_retry(&dentry->d_seq, seq))
+			return -ECHILD;
+		if (negative)
+			return -ENOENT;
+
+		/*
+		 * This sequence count validates that the parent had no
+		 * changes while we did the lookup of the dentry above.
+		 *
+		 * The memory barrier in read_seqcount_begin of child is
+		 *  enough, we can use __read_seqcount_retry here.
+		 */
+		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
+			return -ECHILD;
+		nd->seq = seq;
+
+		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+			status = d_revalidate(dentry, nd->flags);
+			if (unlikely(status <= 0)) {
+				if (status != -ECHILD)
+					need_reval = 0;
+				goto unlazy;
+			}
+		}
+		path->mnt = mnt;
+		path->dentry = dentry;
+		if (likely(__follow_mount_rcu(nd, path, inode)))
+			return 0;
+unlazy:
+		if (unlazy_walk(nd, dentry))
+			return -ECHILD;
+	} else {
+		dentry = __d_lookup(parent, &nd->last);
+	}
+
+	if (unlikely(!dentry))
+		goto need_lookup;
+
+	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
+		status = d_revalidate(dentry, nd->flags);
+	if (unlikely(status <= 0)) {
+		if (status < 0) {
+			dput(dentry);
+			return status;
+		}
+		d_invalidate(dentry);
+		dput(dentry);
+		goto need_lookup;
+	}
+
+	if (unlikely(d_is_negative(dentry))) {
+		dput(dentry);
+		return -ENOENT;
+	}
+	path->mnt = mnt;
+	path->dentry = dentry;
+	err = follow_managed(path, nd->flags);
+	if (unlikely(err < 0)) {
+		path_put_conditional(path, nd);
+		return err;
+	}
+	if (err)
+		nd->flags |= LOOKUP_JUMPED;
+	*inode = path->dentry->d_inode;
+	return 0;
+
+need_lookup:
+	return 1;
+}
+
+/* Fast lookup failed, do it the slow way */
+static int lookup_slow(struct nameidata *nd, struct path *path)
+{
+	struct dentry *dentry, *parent;
+	int err;
+
+	parent = nd->path.dentry;
+	BUG_ON(nd->inode != parent->d_inode);
+
+	mutex_lock(&parent->d_inode->i_mutex);
+	dentry = __lookup_hash(&nd->last, parent, nd->flags);
+	mutex_unlock(&parent->d_inode->i_mutex);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	path->mnt = nd->path.mnt;
+	path->dentry = dentry;
+	err = follow_managed(path, nd->flags);
+	if (unlikely(err < 0)) {
+		path_put_conditional(path, nd);
+		return err;
+	}
+	if (err)
+		nd->flags |= LOOKUP_JUMPED;
+	return 0;
+}
+
+static inline int may_lookup(struct nameidata *nd)
+{
+	if (nd->flags & LOOKUP_RCU) {
+		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+		if (err != -ECHILD)
+			return err;
+		if (unlazy_walk(nd, NULL))
+			return -ECHILD;
+	}
+	return inode_permission(nd->inode, MAY_EXEC);
+}
+
+static inline int handle_dots(struct nameidata *nd, int type)
+{
+	if (type == LAST_DOTDOT) {
+		if (nd->flags & LOOKUP_RCU) {
+			if (follow_dotdot_rcu(nd))
+				return -ECHILD;
+		} else
+			follow_dotdot(nd);
+	}
+	return 0;
+}
+
+static void terminate_walk(struct nameidata *nd)
+{
+	if (!(nd->flags & LOOKUP_RCU)) {
+		path_put(&nd->path);
+	} else {
+		nd->flags &= ~LOOKUP_RCU;
+		if (!(nd->flags & LOOKUP_ROOT))
+			nd->root.mnt = NULL;
+		rcu_read_unlock();
+	}
+}
+
+/*
+ * Do we need to follow links? We _really_ want to be able
+ * to do this check without having to look at inode->i_op,
+ * so we keep a cache of "no, this doesn't need follow_link"
+ * for the common case.
+ */
+static inline int should_follow_link(struct dentry *dentry, int follow)
+{
+	return unlikely(d_is_symlink(dentry)) ? follow : 0;
+}
+
+static inline int walk_component(struct nameidata *nd, struct path *path,
+		int follow)
+{
+	struct inode *inode;
+	int err;
+	/*
+	 * "." and ".." are special - ".." especially so because it has
+	 * to be able to know about the current root directory and
+	 * parent relationships.
+	 */
+	if (unlikely(nd->last_type != LAST_NORM))
+		return handle_dots(nd, nd->last_type);
+	err = lookup_fast(nd, path, &inode);
+	if (unlikely(err)) {
+		if (err < 0)
+			goto out_err;
+
+		err = lookup_slow(nd, path);
+		if (err < 0)
+			goto out_err;
+
+		inode = path->dentry->d_inode;
+		err = -ENOENT;
+		if (d_is_negative(path->dentry))
+			goto out_path_put;
+	}
+
+	if (should_follow_link(path->dentry, follow)) {
+		if (nd->flags & LOOKUP_RCU) {
+			if (unlikely(nd->path.mnt != path->mnt ||
+				     unlazy_walk(nd, path->dentry))) {
+				err = -ECHILD;
+				goto out_err;
+			}
+		}
+		BUG_ON(inode != path->dentry->d_inode);
+		return 1;
+	}
+	path_to_nameidata(path, nd);
+	nd->inode = inode;
+	return 0;
+
+out_path_put:
+	path_to_nameidata(path, nd);
+out_err:
+	terminate_walk(nd);
+	return err;
+}
+
+/*
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+	int res;
+
+	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+		path_put_conditional(path, nd);
+		path_put(&nd->path);
+		return -ELOOP;
+	}
+	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+
+	nd->depth++;
+	current->link_count++;
+
+	do {
+		struct path link = *path;
+		void *cookie;
+
+		res = follow_link(&link, nd, &cookie);
+		if (res)
+			break;
+		res = walk_component(nd, path, LOOKUP_FOLLOW);
+		put_link(nd, &link, cookie);
+	} while (res > 0);
+
+	current->link_count--;
+	nd->depth--;
+	return res;
+}
+
+/*
+ * We can do the critical dentry name comparison and hashing
+ * operations one word at a time, but we are limited to:
+ *
+ * - Architectures with fast unaligned word accesses. We could
+ *   do a "get_unaligned()" if this helps and is sufficiently
+ *   fast.
+ *
+ * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
+ *   do not trap on the (extremely unlikely) case of a page
+ *   crossing operation.
+ *
+ * - Furthermore, we need an efficient 64-bit compile for the
+ *   64-bit case in order to generate the "number of bytes in
+ *   the final mask". Again, that could be replaced with a
+ *   efficient population count instruction or similar.
+ */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+#include <asm/word-at-a-time.h>
+
+#ifdef CONFIG_64BIT
+
+static inline unsigned int fold_hash(unsigned long hash)
+{
+	return hash_64(hash, 32);
+}
+
+#else	/* 32-bit case */
+
+#define fold_hash(x) (x)
+
+#endif
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+	unsigned long a, mask;
+	unsigned long hash = 0;
+
+	for (;;) {
+		a = load_unaligned_zeropad(name);
+		if (len < sizeof(unsigned long))
+			break;
+		hash += a;
+		hash *= 9;
+		name += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+		if (!len)
+			goto done;
+	}
+	mask = bytemask_from_count(len);
+	hash += mask & a;
+done:
+	return fold_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+/*
+ * Calculate the length and hash of the path component, and
+ * return the "hash_len" as the result.
+ */
+static inline u64 hash_name(const char *name)
+{
+	unsigned long a, b, adata, bdata, mask, hash, len;
+	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
+
+	hash = a = 0;
+	len = -sizeof(unsigned long);
+	do {
+		hash = (hash + a) * 9;
+		len += sizeof(unsigned long);
+		a = load_unaligned_zeropad(name+len);
+		b = a ^ REPEAT_BYTE('/');
+	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
+
+	adata = prep_zero_mask(a, adata, &constants);
+	bdata = prep_zero_mask(b, bdata, &constants);
+
+	mask = create_zero_mask(adata | bdata);
+
+	hash += a & zero_bytemask(mask);
+	len += find_zero(mask);
+	return hashlen_create(fold_hash(hash), len);
+}
+
+#else
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+	unsigned long hash = init_name_hash();
+	while (len--)
+		hash = partial_name_hash(*name++, hash);
+	return end_name_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+/*
+ * We know there's a real path component here of at least
+ * one character.
+ */
+static inline u64 hash_name(const char *name)
+{
+	unsigned long hash = init_name_hash();
+	unsigned long len = 0, c;
+
+	c = (unsigned char)*name;
+	do {
+		len++;
+		hash = partial_name_hash(c, hash);
+		c = (unsigned char)name[len];
+	} while (c && c != '/');
+	return hashlen_create(end_name_hash(hash), len);
+}
+
+#endif
+
+/*
+ * Name resolution.
+ * This is the basic name resolution function, turning a pathname into
+ * the final dentry. We expect 'base' to be positive and a directory.
+ *
+ * Returns 0 and nd will have valid dentry and mnt on success.
+ * Returns error and drops reference to input namei data on failure.
+ */
+static int link_path_walk(const char *name, struct nameidata *nd)
+{
+	struct path next;
+	int err;
+	
+	while (*name=='/')
+		name++;
+	if (!*name)
+		return 0;
+
+	/* At this point we know we have a real path component. */
+	for(;;) {
+		u64 hash_len;
+		int type;
+
+		err = may_lookup(nd);
+ 		if (err)
+			break;
+
+		hash_len = hash_name(name);
+
+		type = LAST_NORM;
+		if (name[0] == '.') switch (hashlen_len(hash_len)) {
+			case 2:
+				if (name[1] == '.') {
+					type = LAST_DOTDOT;
+					nd->flags |= LOOKUP_JUMPED;
+				}
+				break;
+			case 1:
+				type = LAST_DOT;
+		}
+		if (likely(type == LAST_NORM)) {
+			struct dentry *parent = nd->path.dentry;
+			nd->flags &= ~LOOKUP_JUMPED;
+			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+				struct qstr this = { { .hash_len = hash_len }, .name = name };
+				err = parent->d_op->d_hash(parent, &this);
+				if (err < 0)
+					break;
+				hash_len = this.hash_len;
+				name = this.name;
+			}
+		}
+
+		nd->last.hash_len = hash_len;
+		nd->last.name = name;
+		nd->last_type = type;
+
+		name += hashlen_len(hash_len);
+		if (!*name)
+			return 0;
+		/*
+		 * If it wasn't NUL, we know it was '/'. Skip that
+		 * slash, and continue until no more slashes.
+		 */
+		do {
+			name++;
+		} while (unlikely(*name == '/'));
+		if (!*name)
+			return 0;
+
+		err = walk_component(nd, &next, LOOKUP_FOLLOW);
+		if (err < 0)
+			return err;
+
+		if (err) {
+			err = nested_symlink(&next, nd);
+			if (err)
+				return err;
+		}
+		if (!d_can_lookup(nd->path.dentry)) {
+			err = -ENOTDIR; 
+			break;
+		}
+	}
+	terminate_walk(nd);
+	return err;
+}
+
+static int path_init(int dfd, const struct filename *name, unsigned int flags,
+		     struct nameidata *nd)
+{
+	int retval = 0;
+	const char *s = name->name;
+
+	nd->last_type = LAST_ROOT; /* if there are only slashes... */
+	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
+	nd->depth = 0;
+	nd->base = NULL;
+	if (flags & LOOKUP_ROOT) {
+		struct dentry *root = nd->root.dentry;
+		struct inode *inode = root->d_inode;
+		if (*s) {
+			if (!d_can_lookup(root))
+				return -ENOTDIR;
+			retval = inode_permission(inode, MAY_EXEC);
+			if (retval)
+				return retval;
+		}
+		nd->path = nd->root;
+		nd->inode = inode;
+		if (flags & LOOKUP_RCU) {
+			rcu_read_lock();
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			nd->m_seq = read_seqbegin(&mount_lock);
+		} else {
+			path_get(&nd->path);
+		}
+		goto done;
+	}
+
+	nd->root.mnt = NULL;
+
+	nd->m_seq = read_seqbegin(&mount_lock);
+	if (*s == '/') {
+		if (flags & LOOKUP_RCU) {
+			rcu_read_lock();
+			nd->seq = set_root_rcu(nd);
+		} else {
+			set_root(nd);
+			path_get(&nd->root);
+		}
+		nd->path = nd->root;
+	} else if (dfd == AT_FDCWD) {
+		if (flags & LOOKUP_RCU) {
+			struct fs_struct *fs = current->fs;
+			unsigned seq;
+
+			rcu_read_lock();
+
+			do {
+				seq = read_seqcount_begin(&fs->seq);
+				nd->path = fs->pwd;
+				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			} while (read_seqcount_retry(&fs->seq, seq));
+		} else {
+			get_fs_pwd(current->fs, &nd->path);
+		}
+	} else {
+		/* Caller must check execute permissions on the starting path component */
+		struct fd f = fdget_raw(dfd);
+		struct dentry *dentry;
+
+		if (!f.file)
+			return -EBADF;
+
+		dentry = f.file->f_path.dentry;
+
+		if (*s) {
+			if (!d_can_lookup(dentry)) {
+				fdput(f);
+				return -ENOTDIR;
+			}
+		}
+
+		nd->path = f.file->f_path;
+		if (flags & LOOKUP_RCU) {
+			if (f.flags & FDPUT_FPUT)
+				nd->base = f.file;
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			rcu_read_lock();
+		} else {
+			path_get(&nd->path);
+			fdput(f);
+		}
+	}
+
+	nd->inode = nd->path.dentry->d_inode;
+	if (!(flags & LOOKUP_RCU))
+		goto done;
+	if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
+		goto done;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
+	rcu_read_unlock();
+	return -ECHILD;
+done:
+	current->total_link_count = 0;
+	return link_path_walk(s, nd);
+}
+
+static void path_cleanup(struct nameidata *nd)
+{
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		path_put(&nd->root);
+		nd->root.mnt = NULL;
+	}
+	if (unlikely(nd->base))
+		fput(nd->base);
+}
+
+static inline int lookup_last(struct nameidata *nd, struct path *path)
+{
+	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
+		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+	nd->flags &= ~LOOKUP_PARENT;
+	return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);
+}
+
+/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+static int path_lookupat(int dfd, const struct filename *name,
+				unsigned int flags, struct nameidata *nd)
+{
+	struct path path;
+	int err;
+
+	/*
+	 * Path walking is largely split up into 2 different synchronisation
+	 * schemes, rcu-walk and ref-walk (explained in
+	 * Documentation/filesystems/path-lookup.txt). These share much of the
+	 * path walk code, but some things particularly setup, cleanup, and
+	 * following mounts are sufficiently divergent that functions are
+	 * duplicated. Typically there is a function foo(), and its RCU
+	 * analogue, foo_rcu().
+	 *
+	 * -ECHILD is the error number of choice (just to avoid clashes) that
+	 * is returned if some aspect of an rcu-walk fails. Such an error must
+	 * be handled by restarting a traditional ref-walk (which will always
+	 * be able to complete).
+	 */
+	err = path_init(dfd, name, flags, nd);
+	if (!err && !(flags & LOOKUP_PARENT)) {
+		err = lookup_last(nd, &path);
+		while (err > 0) {
+			void *cookie;
+			struct path link = path;
+			err = may_follow_link(&link, nd);
+			if (unlikely(err))
+				break;
+			nd->flags |= LOOKUP_PARENT;
+			err = follow_link(&link, nd, &cookie);
+			if (err)
+				break;
+			err = lookup_last(nd, &path);
+			put_link(nd, &link, cookie);
+		}
+	}
+
+	if (!err)
+		err = complete_walk(nd);
+
+	if (!err && nd->flags & LOOKUP_DIRECTORY) {
+		if (!d_can_lookup(nd->path.dentry)) {
+			path_put(&nd->path);
+			err = -ENOTDIR;
+		}
+	}
+
+	path_cleanup(nd);
+	return err;
+}
+
+static int filename_lookup(int dfd, struct filename *name,
+				unsigned int flags, struct nameidata *nd)
+{
+	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+	if (unlikely(retval == -ECHILD))
+		retval = path_lookupat(dfd, name, flags, nd);
+	if (unlikely(retval == -ESTALE))
+		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
+
+	if (likely(!retval))
+		audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
+	return retval;
+}
+
+/* does lookup, returns the object with parent locked */
+struct dentry *kern_path_locked(const char *name, struct path *path)
+{
+	struct filename *filename = getname_kernel(name);
+	struct nameidata nd;
+	struct dentry *d;
+	int err;
+
+	if (IS_ERR(filename))
+		return ERR_CAST(filename);
+
+	err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
+	if (err) {
+		d = ERR_PTR(err);
+		goto out;
+	}
+	if (nd.last_type != LAST_NORM) {
+		path_put(&nd.path);
+		d = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	d = __lookup_hash(&nd.last, nd.path.dentry, 0);
+	if (IS_ERR(d)) {
+		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+		path_put(&nd.path);
+		goto out;
+	}
+	*path = nd.path;
+out:
+	putname(filename);
+	return d;
+}
+
+int kern_path(const char *name, unsigned int flags, struct path *path)
+{
+	struct nameidata nd;
+	struct filename *filename = getname_kernel(name);
+	int res = PTR_ERR(filename);
+
+	if (!IS_ERR(filename)) {
+		res = filename_lookup(AT_FDCWD, filename, flags, &nd);
+		putname(filename);
+		if (!res)
+			*path = nd.path;
+	}
+	return res;
+}
+EXPORT_SYMBOL(kern_path);
+
+/**
+ * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
+ * @dentry:  pointer to dentry of the base directory
+ * @mnt: pointer to vfs mount of the base directory
+ * @name: pointer to file name
+ * @flags: lookup flags
+ * @path: pointer to struct path to fill
+ */
+int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
+		    const char *name, unsigned int flags,
+		    struct path *path)
+{
+	struct filename *filename = getname_kernel(name);
+	int err = PTR_ERR(filename);
+
+	BUG_ON(flags & LOOKUP_PARENT);
+
+	/* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */
+	if (!IS_ERR(filename)) {
+		struct nameidata nd;
+		nd.root.dentry = dentry;
+		nd.root.mnt = mnt;
+		err = filename_lookup(AT_FDCWD, filename,
+				      flags | LOOKUP_ROOT, &nd);
+		if (!err)
+			*path = nd.path;
+		putname(filename);
+	}
+	return err;
+}
+EXPORT_SYMBOL(vfs_path_lookup);
+
+/*
+ * Restricted form of lookup. Doesn't follow links, single-component only,
+ * needs parent already locked. Doesn't follow mounts.
+ * SMP-safe.
+ */
+static struct dentry *lookup_hash(struct nameidata *nd)
+{
+	return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
+}
+
+/**
+ * lookup_one_len - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ */
+struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+
+	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	return __lookup_hash(&this, base, 0);
+}
+EXPORT_SYMBOL(lookup_one_len);
+
+int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
+		 struct path *path, int *empty)
+{
+	struct nameidata nd;
+	struct filename *tmp = getname_flags(name, flags, empty);
+	int err = PTR_ERR(tmp);
+	if (!IS_ERR(tmp)) {
+
+		BUG_ON(flags & LOOKUP_PARENT);
+
+		err = filename_lookup(dfd, tmp, flags, &nd);
+		putname(tmp);
+		if (!err)
+			*path = nd.path;
+	}
+	return err;
+}
+
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+		 struct path *path)
+{
+	return user_path_at_empty(dfd, name, flags, path, NULL);
+}
+EXPORT_SYMBOL(user_path_at);
+
+/*
+ * NB: most callers don't do anything directly with the reference to the
+ *     to struct filename, but the nd->last pointer points into the name string
+ *     allocated by getname. So we must hold the reference to it until all
+ *     path-walking is complete.
+ */
+static struct filename *
+user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
+		 unsigned int flags)
+{
+	struct filename *s = getname(path);
+	int error;
+
+	/* only LOOKUP_REVAL is allowed in extra flags */
+	flags &= LOOKUP_REVAL;
+
+	if (IS_ERR(s))
+		return s;
+
+	error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
+	if (error) {
+		putname(s);
+		return ERR_PTR(error);
+	}
+
+	return s;
+}
+
+/**
+ * mountpoint_last - look up last component for umount
+ * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
+ * @path: pointer to container for result
+ *
+ * This is a special lookup_last function just for umount. In this case, we
+ * need to resolve the path without doing any revalidation.
+ *
+ * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
+ * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
+ * in almost all cases, this lookup will be served out of the dcache. The only
+ * cases where it won't are if nd->last refers to a symlink or the path is
+ * bogus and it doesn't exist.
+ *
+ * Returns:
+ * -error: if there was an error during lookup. This includes -ENOENT if the
+ *         lookup found a negative dentry. The nd->path reference will also be
+ *         put in this case.
+ *
+ * 0:      if we successfully resolved nd->path and found it to not to be a
+ *         symlink that needs to be followed. "path" will also be populated.
+ *         The nd->path reference will also be put.
+ *
+ * 1:      if we successfully resolved nd->last and found it to be a symlink
+ *         that needs to be followed. "path" will be populated with the path
+ *         to the link, and nd->path will *not* be put.
+ */
+static int
+mountpoint_last(struct nameidata *nd, struct path *path)
+{
+	int error = 0;
+	struct dentry *dentry;
+	struct dentry *dir = nd->path.dentry;
+
+	/* If we're in rcuwalk, drop out of it to handle last component */
+	if (nd->flags & LOOKUP_RCU) {
+		if (unlazy_walk(nd, NULL)) {
+			error = -ECHILD;
+			goto out;
+		}
+	}
+
+	nd->flags &= ~LOOKUP_PARENT;
+
+	if (unlikely(nd->last_type != LAST_NORM)) {
+		error = handle_dots(nd, nd->last_type);
+		if (error)
+			goto out;
+		dentry = dget(nd->path.dentry);
+		goto done;
+	}
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	dentry = d_lookup(dir, &nd->last);
+	if (!dentry) {
+		/*
+		 * No cached dentry. Mounted dentries are pinned in the cache,
+		 * so that means that this dentry is probably a symlink or the
+		 * path doesn't actually point to a mounted dentry.
+		 */
+		dentry = d_alloc(dir, &nd->last);
+		if (!dentry) {
+			error = -ENOMEM;
+			mutex_unlock(&dir->d_inode->i_mutex);
+			goto out;
+		}
+		dentry = lookup_real(dir->d_inode, dentry, nd->flags);
+		error = PTR_ERR(dentry);
+		if (IS_ERR(dentry)) {
+			mutex_unlock(&dir->d_inode->i_mutex);
+			goto out;
+		}
+	}
+	mutex_unlock(&dir->d_inode->i_mutex);
+
+done:
+	if (d_is_negative(dentry)) {
+		error = -ENOENT;
+		dput(dentry);
+		goto out;
+	}
+	path->dentry = dentry;
+	path->mnt = nd->path.mnt;
+	if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW))
+		return 1;
+	mntget(path->mnt);
+	follow_mount(path);
+	error = 0;
+out:
+	terminate_walk(nd);
+	return error;
+}
+
+/**
+ * path_mountpoint - look up a path to be umounted
+ * @dfd:	directory file descriptor to start walk from
+ * @name:	full pathname to walk
+ * @path:	pointer to container for result
+ * @flags:	lookup flags
+ *
+ * Look up the given name, but don't attempt to revalidate the last component.
+ * Returns 0 and "path" will be valid on success; Returns error otherwise.
+ */
+static int
+path_mountpoint(int dfd, const struct filename *name, struct path *path,
+		unsigned int flags)
+{
+	struct nameidata nd;
+	int err;
+
+	err = path_init(dfd, name, flags, &nd);
+	if (unlikely(err))
+		goto out;
+
+	err = mountpoint_last(&nd, path);
+	while (err > 0) {
+		void *cookie;
+		struct path link = *path;
+		err = may_follow_link(&link, &nd);
+		if (unlikely(err))
+			break;
+		nd.flags |= LOOKUP_PARENT;
+		err = follow_link(&link, &nd, &cookie);
+		if (err)
+			break;
+		err = mountpoint_last(&nd, path);
+		put_link(&nd, &link, cookie);
+	}
+out:
+	path_cleanup(&nd);
+	return err;
+}
+
+static int
+filename_mountpoint(int dfd, struct filename *name, struct path *path,
+			unsigned int flags)
+{
+	int error;
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+	error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU);
+	if (unlikely(error == -ECHILD))
+		error = path_mountpoint(dfd, name, path, flags);
+	if (unlikely(error == -ESTALE))
+		error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL);
+	if (likely(!error))
+		audit_inode(name, path->dentry, 0);
+	putname(name);
+	return error;
+}
+
+/**
+ * user_path_mountpoint_at - lookup a path from userland in order to umount it
+ * @dfd:	directory file descriptor
+ * @name:	pathname from userland
+ * @flags:	lookup flags
+ * @path:	pointer to container to hold result
+ *
+ * A umount is a special case for path walking. We're not actually interested
+ * in the inode in this situation, and ESTALE errors can be a problem. We
+ * simply want track down the dentry and vfsmount attached at the mountpoint
+ * and avoid revalidating the last component.
+ *
+ * Returns 0 and populates "path" on success.
+ */
+int
+user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
+			struct path *path)
+{
+	return filename_mountpoint(dfd, getname(name), path, flags);
+}
+
+int
+kern_path_mountpoint(int dfd, const char *name, struct path *path,
+			unsigned int flags)
+{
+	return filename_mountpoint(dfd, getname_kernel(name), path, flags);
+}
+EXPORT_SYMBOL(kern_path_mountpoint);
+
+int __check_sticky(struct inode *dir, struct inode *inode)
+{
+	kuid_t fsuid = current_fsuid();
+
+	if (uid_eq(inode->i_uid, fsuid))
+		return 0;
+	if (uid_eq(dir->i_uid, fsuid))
+		return 0;
+	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
+}
+EXPORT_SYMBOL(__check_sticky);
+
+/*
+ *	Check whether we can remove a link victim from directory dir, check
+ *  whether the type of victim is right.
+ *  1. We can't do it if dir is read-only (done in permission())
+ *  2. We should have write and exec permissions on dir
+ *  3. We can't remove anything from append-only dir
+ *  4. We can't do anything with immutable dir (done in permission())
+ *  5. If the sticky bit on dir is set we should either
+ *	a. be owner of dir, or
+ *	b. be owner of victim, or
+ *	c. have CAP_FOWNER capability
+ *  6. If the victim is append-only or immutable we can't do antyhing with
+ *     links pointing to it.
+ *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ *  9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *     nfs_async_unlink().
+ */
+static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+{
+	struct inode *inode = victim->d_inode;
+	int error;
+
+	if (d_is_negative(victim))
+		return -ENOENT;
+	BUG_ON(!inode);
+
+	BUG_ON(victim->d_parent->d_inode != dir);
+	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
+
+	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+	if (IS_APPEND(dir))
+		return -EPERM;
+
+	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
+	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+		return -EPERM;
+	if (isdir) {
+		if (!d_is_dir(victim))
+			return -ENOTDIR;
+		if (IS_ROOT(victim))
+			return -EBUSY;
+	} else if (d_is_dir(victim))
+		return -EISDIR;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+		return -EBUSY;
+	return 0;
+}
+
+/*	Check whether we can create an object with dentry child in directory
+ *  dir.
+ *  1. We can't do it if child already exists (open has special treatment for
+ *     this case, but since we are inlined it's OK)
+ *  2. We can't do it if dir is read-only (done in permission())
+ *  3. We should have write and exec permissions on dir
+ *  4. We can't do it if dir is immutable (done in permission())
+ */
+static inline int may_create(struct inode *dir, struct dentry *child)
+{
+	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
+	if (child->d_inode)
+		return -EEXIST;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * p1 and p2 should be directories on the same fs.
+ */
+struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+{
+	struct dentry *p;
+
+	if (p1 == p2) {
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+		return NULL;
+	}
+
+	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+
+	p = d_ancestor(p2, p1);
+	if (p) {
+		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
+		return p;
+	}
+
+	p = d_ancestor(p1, p2);
+	if (p) {
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+		return p;
+	}
+
+	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
+	return NULL;
+}
+EXPORT_SYMBOL(lock_rename);
+
+void unlock_rename(struct dentry *p1, struct dentry *p2)
+{
+	mutex_unlock(&p1->d_inode->i_mutex);
+	if (p1 != p2) {
+		mutex_unlock(&p2->d_inode->i_mutex);
+		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+	}
+}
+EXPORT_SYMBOL(unlock_rename);
+
+int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool want_excl)
+{
+	int error = may_create(dir, dentry);
+	if (error)
+		return error;
+
+	if (!dir->i_op->create)
+		return -EACCES;	/* shouldn't it be ENOSYS? */
+	mode &= S_IALLUGO;
+	mode |= S_IFREG;
+	error = security_inode_create(dir, dentry, mode);
+	if (error)
+		return error;
+	error = dir->i_op->create(dir, dentry, mode, want_excl);
+	if (!error)
+		fsnotify_create(dir, dentry);
+	return error;
+}
+EXPORT_SYMBOL(vfs_create);
+
+static int may_open(struct path *path, int acc_mode, int flag)
+{
+	struct dentry *dentry = path->dentry;
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	/* O_PATH? */
+	if (!acc_mode)
+		return 0;
+
+	if (!inode)
+		return -ENOENT;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFLNK:
+		return -ELOOP;
+	case S_IFDIR:
+		if (acc_mode & MAY_WRITE)
+			return -EISDIR;
+		break;
+	case S_IFBLK:
+	case S_IFCHR:
+		if (path->mnt->mnt_flags & MNT_NODEV)
+			return -EACCES;
+		/*FALLTHRU*/
+	case S_IFIFO:
+	case S_IFSOCK:
+		flag &= ~O_TRUNC;
+		break;
+	}
+
+	error = inode_permission(inode, acc_mode);
+	if (error)
+		return error;
+
+	/*
+	 * An append-only file must be opened in append mode for writing.
+	 */
+	if (IS_APPEND(inode)) {
+		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
+			return -EPERM;
+		if (flag & O_TRUNC)
+			return -EPERM;
+	}
+
+	/* O_NOATIME can only be set by the owner or superuser */
+	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
+		return -EPERM;
+
+	return 0;
+}
+
+static int handle_truncate(struct file *filp)
+{
+	struct path *path = &filp->f_path;
+	struct inode *inode = path->dentry->d_inode;
+	int error = get_write_access(inode);
+	if (error)
+		return error;
+	/*
+	 * Refuse to truncate files with mandatory locks held on them.
+	 */
+	error = locks_verify_locked(filp);
+	if (!error)
+		error = security_path_truncate(path);
+	if (!error) {
+		error = do_truncate(path->dentry, 0,
+				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
+				    filp);
+	}
+	put_write_access(inode);
+	return error;
+}
+
+static inline int open_to_namei_flags(int flag)
+{
+	if ((flag & O_ACCMODE) == 3)
+		flag--;
+	return flag;
+}
+
+static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
+{
+	int error = security_path_mknod(dir, dentry, mode, 0);
+	if (error)
+		return error;
+
+	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+
+	return security_inode_create(dir->dentry->d_inode, dentry, mode);
+}
+
+/*
+ * Attempt to atomically look up, create and open a file from a negative
+ * dentry.
+ *
+ * Returns 0 if successful.  The file will have been created and attached to
+ * @file by the filesystem calling finish_open().
+ *
+ * Returns 1 if the file was looked up only or didn't need creating.  The
+ * caller will need to perform the open themselves.  @path will have been
+ * updated to point to the new dentry.  This may be negative.
+ *
+ * Returns an error code otherwise.
+ */
+static int atomic_open(struct nameidata *nd, struct dentry *dentry,
+			struct path *path, struct file *file,
+			const struct open_flags *op,
+			bool got_write, bool need_lookup,
+			int *opened)
+{
+	struct inode *dir =  nd->path.dentry->d_inode;
+	unsigned open_flag = open_to_namei_flags(op->open_flag);
+	umode_t mode;
+	int error;
+	int acc_mode;
+	int create_error = 0;
+	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
+	bool excl;
+
+	BUG_ON(dentry->d_inode);
+
+	/* Don't create child dentry for a dead directory. */
+	if (unlikely(IS_DEADDIR(dir))) {
+		error = -ENOENT;
+		goto out;
+	}
+
+	mode = op->mode;
+	if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
+		mode &= ~current_umask();
+
+	excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
+	if (excl)
+		open_flag &= ~O_TRUNC;
+
+	/*
+	 * Checking write permission is tricky, bacuse we don't know if we are
+	 * going to actually need it: O_CREAT opens should work as long as the
+	 * file exists.  But checking existence breaks atomicity.  The trick is
+	 * to check access and if not granted clear O_CREAT from the flags.
+	 *
+	 * Another problem is returing the "right" error value (e.g. for an
+	 * O_EXCL open we want to return EEXIST not EROFS).
+	 */
+	if (((open_flag & (O_CREAT | O_TRUNC)) ||
+	    (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
+		if (!(open_flag & O_CREAT)) {
+			/*
+			 * No O_CREATE -> atomicity not a requirement -> fall
+			 * back to lookup + open
+			 */
+			goto no_open;
+		} else if (open_flag & (O_EXCL | O_TRUNC)) {
+			/* Fall back and fail with the right error */
+			create_error = -EROFS;
+			goto no_open;
+		} else {
+			/* No side effects, safe to clear O_CREAT */
+			create_error = -EROFS;
+			open_flag &= ~O_CREAT;
+		}
+	}
+
+	if (open_flag & O_CREAT) {
+		error = may_o_create(&nd->path, dentry, mode);
+		if (error) {
+			create_error = error;
+			if (open_flag & O_EXCL)
+				goto no_open;
+			open_flag &= ~O_CREAT;
+		}
+	}
+
+	if (nd->flags & LOOKUP_DIRECTORY)
+		open_flag |= O_DIRECTORY;
+
+	file->f_path.dentry = DENTRY_NOT_SET;
+	file->f_path.mnt = nd->path.mnt;
+	error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
+				      opened);
+	if (error < 0) {
+		if (create_error && error == -ENOENT)
+			error = create_error;
+		goto out;
+	}
+
+	if (error) {	/* returned 1, that is */
+		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
+			error = -EIO;
+			goto out;
+		}
+		if (file->f_path.dentry) {
+			dput(dentry);
+			dentry = file->f_path.dentry;
+		}
+		if (*opened & FILE_CREATED)
+			fsnotify_create(dir, dentry);
+		if (!dentry->d_inode) {
+			WARN_ON(*opened & FILE_CREATED);
+			if (create_error) {
+				error = create_error;
+				goto out;
+			}
+		} else {
+			if (excl && !(*opened & FILE_CREATED)) {
+				error = -EEXIST;
+				goto out;
+			}
+		}
+		goto looked_up;
+	}
+
+	/*
+	 * We didn't have the inode before the open, so check open permission
+	 * here.
+	 */
+	acc_mode = op->acc_mode;
+	if (*opened & FILE_CREATED) {
+		WARN_ON(!(open_flag & O_CREAT));
+		fsnotify_create(dir, dentry);
+		acc_mode = MAY_OPEN;
+	}
+	error = may_open(&file->f_path, acc_mode, open_flag);
+	if (error)
+		fput(file);
+
+out:
+	dput(dentry);
+	return error;
+
+no_open:
+	if (need_lookup) {
+		dentry = lookup_real(dir, dentry, nd->flags);
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
+
+		if (create_error) {
+			int open_flag = op->open_flag;
+
+			error = create_error;
+			if ((open_flag & O_EXCL)) {
+				if (!dentry->d_inode)
+					goto out;
+			} else if (!dentry->d_inode) {
+				goto out;
+			} else if ((open_flag & O_TRUNC) &&
+				   d_is_reg(dentry)) {
+				goto out;
+			}
+			/* will fail later, go on to get the right error */
+		}
+	}
+looked_up:
+	path->dentry = dentry;
+	path->mnt = nd->path.mnt;
+	return 1;
+}
+
+/*
+ * Look up and maybe create and open the last component.
+ *
+ * Must be called with i_mutex held on parent.
+ *
+ * Returns 0 if the file was successfully atomically created (if necessary) and
+ * opened.  In this case the file will be returned attached to @file.
+ *
+ * Returns 1 if the file was not completely opened at this time, though lookups
+ * and creations will have been performed and the dentry returned in @path will
+ * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
+ * specified then a negative dentry may be returned.
+ *
+ * An error code is returned otherwise.
+ *
+ * FILE_CREATE will be set in @*opened if the dentry was created and will be
+ * cleared otherwise prior to returning.
+ */
+static int lookup_open(struct nameidata *nd, struct path *path,
+			struct file *file,
+			const struct open_flags *op,
+			bool got_write, int *opened)
+{
+	struct dentry *dir = nd->path.dentry;
+	struct inode *dir_inode = dir->d_inode;
+	struct dentry *dentry;
+	int error;
+	bool need_lookup;
+
+	*opened &= ~FILE_CREATED;
+	dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	/* Cached positive dentry: will open in f_op->open */
+	if (!need_lookup && dentry->d_inode)
+		goto out_no_open;
+
+	if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
+		return atomic_open(nd, dentry, path, file, op, got_write,
+				   need_lookup, opened);
+	}
+
+	if (need_lookup) {
+		BUG_ON(dentry->d_inode);
+
+		dentry = lookup_real(dir_inode, dentry, nd->flags);
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
+	}
+
+	/* Negative dentry, just create the file */
+	if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
+		umode_t mode = op->mode;
+		if (!IS_POSIXACL(dir->d_inode))
+			mode &= ~current_umask();
+		/*
+		 * This write is needed to ensure that a
+		 * rw->ro transition does not occur between
+		 * the time when the file is created and when
+		 * a permanent write count is taken through
+		 * the 'struct file' in finish_open().
+		 */
+		if (!got_write) {
+			error = -EROFS;
+			goto out_dput;
+		}
+		*opened |= FILE_CREATED;
+		error = security_path_mknod(&nd->path, dentry, mode, 0);
+		if (error)
+			goto out_dput;
+		error = vfs_create(dir->d_inode, dentry, mode,
+				   nd->flags & LOOKUP_EXCL);
+		if (error)
+			goto out_dput;
+	}
+out_no_open:
+	path->dentry = dentry;
+	path->mnt = nd->path.mnt;
+	return 1;
+
+out_dput:
+	dput(dentry);
+	return error;
+}
+
+/*
+ * Handle the last step of open()
+ */
+static int do_last(struct nameidata *nd, struct path *path,
+		   struct file *file, const struct open_flags *op,
+		   int *opened, struct filename *name)
+{
+	struct dentry *dir = nd->path.dentry;
+	int open_flag = op->open_flag;
+	bool will_truncate = (open_flag & O_TRUNC) != 0;
+	bool got_write = false;
+	int acc_mode = op->acc_mode;
+	struct inode *inode;
+	bool symlink_ok = false;
+	struct path save_parent = { .dentry = NULL, .mnt = NULL };
+	bool retried = false;
+	int error;
+
+	nd->flags &= ~LOOKUP_PARENT;
+	nd->flags |= op->intent;
+
+	if (nd->last_type != LAST_NORM) {
+		error = handle_dots(nd, nd->last_type);
+		if (error)
+			return error;
+		goto finish_open;
+	}
+
+	if (!(open_flag & O_CREAT)) {
+		if (nd->last.name[nd->last.len])
+			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
+			symlink_ok = true;
+		/* we _can_ be in RCU mode here */
+		error = lookup_fast(nd, path, &inode);
+		if (likely(!error))
+			goto finish_lookup;
+
+		if (error < 0)
+			goto out;
+
+		BUG_ON(nd->inode != dir->d_inode);
+	} else {
+		/* create side of things */
+		/*
+		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
+		 * has been cleared when we got to the last component we are
+		 * about to look up
+		 */
+		error = complete_walk(nd);
+		if (error)
+			return error;
+
+		audit_inode(name, dir, LOOKUP_PARENT);
+		error = -EISDIR;
+		/* trailing slashes? */
+		if (nd->last.name[nd->last.len])
+			goto out;
+	}
+
+retry_lookup:
+	if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
+		error = mnt_want_write(nd->path.mnt);
+		if (!error)
+			got_write = true;
+		/*
+		 * do _not_ fail yet - we might not need that or fail with
+		 * a different error; let lookup_open() decide; we'll be
+		 * dropping this one anyway.
+		 */
+	}
+	mutex_lock(&dir->d_inode->i_mutex);
+	error = lookup_open(nd, path, file, op, got_write, opened);
+	mutex_unlock(&dir->d_inode->i_mutex);
+
+	if (error <= 0) {
+		if (error)
+			goto out;
+
+		if ((*opened & FILE_CREATED) ||
+		    !S_ISREG(file_inode(file)->i_mode))
+			will_truncate = false;
+
+		audit_inode(name, file->f_path.dentry, 0);
+		goto opened;
+	}
+
+	if (*opened & FILE_CREATED) {
+		/* Don't check for write permission, don't truncate */
+		open_flag &= ~O_TRUNC;
+		will_truncate = false;
+		acc_mode = MAY_OPEN;
+		path_to_nameidata(path, nd);
+		goto finish_open_created;
+	}
+
+	/*
+	 * create/update audit record if it already exists.
+	 */
+	if (d_is_positive(path->dentry))
+		audit_inode(name, path->dentry, 0);
+
+	/*
+	 * If atomic_open() acquired write access it is dropped now due to
+	 * possible mount and symlink following (this might be optimized away if
+	 * necessary...)
+	 */
+	if (got_write) {
+		mnt_drop_write(nd->path.mnt);
+		got_write = false;
+	}
+
+	error = -EEXIST;
+	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
+		goto exit_dput;
+
+	error = follow_managed(path, nd->flags);
+	if (error < 0)
+		goto exit_dput;
+
+	if (error)
+		nd->flags |= LOOKUP_JUMPED;
+
+	BUG_ON(nd->flags & LOOKUP_RCU);
+	inode = path->dentry->d_inode;
+	error = -ENOENT;
+	if (d_is_negative(path->dentry)) {
+		path_to_nameidata(path, nd);
+		goto out;
+	}
+finish_lookup:
+	/* we _can_ be in RCU mode here */
+	if (should_follow_link(path->dentry, !symlink_ok)) {
+		if (nd->flags & LOOKUP_RCU) {
+			if (unlikely(nd->path.mnt != path->mnt ||
+				     unlazy_walk(nd, path->dentry))) {
+				error = -ECHILD;
+				goto out;
+			}
+		}
+		BUG_ON(inode != path->dentry->d_inode);
+		return 1;
+	}
+
+	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
+		path_to_nameidata(path, nd);
+	} else {
+		save_parent.dentry = nd->path.dentry;
+		save_parent.mnt = mntget(path->mnt);
+		nd->path.dentry = path->dentry;
+
+	}
+	nd->inode = inode;
+	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
+finish_open:
+	error = complete_walk(nd);
+	if (error) {
+		path_put(&save_parent);
+		return error;
+	}
+	audit_inode(name, nd->path.dentry, 0);
+	error = -EISDIR;
+	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
+		goto out;
+	error = -ENOTDIR;
+	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
+		goto out;
+	if (!d_is_reg(nd->path.dentry))
+		will_truncate = false;
+
+	if (will_truncate) {
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto out;
+		got_write = true;
+	}
+finish_open_created:
+	error = may_open(&nd->path, acc_mode, open_flag);
+	if (error)
+		goto out;
+
+	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
+	error = vfs_open(&nd->path, file, current_cred());
+	if (!error) {
+		*opened |= FILE_OPENED;
+	} else {
+		if (error == -EOPENSTALE)
+			goto stale_open;
+		goto out;
+	}
+opened:
+	error = open_check_o_direct(file);
+	if (error)
+		goto exit_fput;
+	error = ima_file_check(file, op->acc_mode, *opened);
+	if (error)
+		goto exit_fput;
+
+	if (will_truncate) {
+		error = handle_truncate(file);
+		if (error)
+			goto exit_fput;
+	}
+out:
+	if (got_write)
+		mnt_drop_write(nd->path.mnt);
+	path_put(&save_parent);
+	terminate_walk(nd);
+	return error;
+
+exit_dput:
+	path_put_conditional(path, nd);
+	goto out;
+exit_fput:
+	fput(file);
+	goto out;
+
+stale_open:
+	/* If no saved parent or already retried then can't retry */
+	if (!save_parent.dentry || retried)
+		goto out;
+
+	BUG_ON(save_parent.dentry != dir);
+	path_put(&nd->path);
+	nd->path = save_parent;
+	nd->inode = dir->d_inode;
+	save_parent.mnt = NULL;
+	save_parent.dentry = NULL;
+	if (got_write) {
+		mnt_drop_write(nd->path.mnt);
+		got_write = false;
+	}
+	retried = true;
+	goto retry_lookup;
+}
+
+static int do_tmpfile(int dfd, struct filename *pathname,
+		struct nameidata *nd, int flags,
+		const struct open_flags *op,
+		struct file *file, int *opened)
+{
+	static const struct qstr name = QSTR_INIT("/", 1);
+	struct dentry *dentry, *child;
+	struct inode *dir;
+	int error = path_lookupat(dfd, pathname,
+				  flags | LOOKUP_DIRECTORY, nd);
+	if (unlikely(error))
+		return error;
+	error = mnt_want_write(nd->path.mnt);
+	if (unlikely(error))
+		goto out;
+	/* we want directory to be writable */
+	error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
+	if (error)
+		goto out2;
+	dentry = nd->path.dentry;
+	dir = dentry->d_inode;
+	if (!dir->i_op->tmpfile) {
+		error = -EOPNOTSUPP;
+		goto out2;
+	}
+	child = d_alloc(dentry, &name);
+	if (unlikely(!child)) {
+		error = -ENOMEM;
+		goto out2;
+	}
+	nd->flags &= ~LOOKUP_DIRECTORY;
+	nd->flags |= op->intent;
+	dput(nd->path.dentry);
+	nd->path.dentry = child;
+	error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
+	if (error)
+		goto out2;
+	audit_inode(pathname, nd->path.dentry, 0);
+	/* Don't check for other permissions, the inode was just created */
+	error = may_open(&nd->path, MAY_OPEN, op->open_flag);
+	if (error)
+		goto out2;
+	file->f_path.mnt = nd->path.mnt;
+	error = finish_open(file, nd->path.dentry, NULL, opened);
+	if (error)
+		goto out2;
+	error = open_check_o_direct(file);
+	if (error) {
+		fput(file);
+	} else if (!(op->open_flag & O_EXCL)) {
+		struct inode *inode = file_inode(file);
+		spin_lock(&inode->i_lock);
+		inode->i_state |= I_LINKABLE;
+		spin_unlock(&inode->i_lock);
+	}
+out2:
+	mnt_drop_write(nd->path.mnt);
+out:
+	path_put(&nd->path);
+	return error;
+}
+
+static struct file *path_openat(int dfd, struct filename *pathname,
+		struct nameidata *nd, const struct open_flags *op, int flags)
+{
+	struct file *file;
+	struct path path;
+	int opened = 0;
+	int error;
+
+	file = get_empty_filp();
+	if (IS_ERR(file))
+		return file;
+
+	file->f_flags = op->open_flag;
+
+	if (unlikely(file->f_flags & __O_TMPFILE)) {
+		error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
+		goto out2;
+	}
+
+	error = path_init(dfd, pathname, flags, nd);
+	if (unlikely(error))
+		goto out;
+
+	error = do_last(nd, &path, file, op, &opened, pathname);
+	while (unlikely(error > 0)) { /* trailing symlink */
+		struct path link = path;
+		void *cookie;
+		if (!(nd->flags & LOOKUP_FOLLOW)) {
+			path_put_conditional(&path, nd);
+			path_put(&nd->path);
+			error = -ELOOP;
+			break;
+		}
+		error = may_follow_link(&link, nd);
+		if (unlikely(error))
+			break;
+		nd->flags |= LOOKUP_PARENT;
+		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+		error = follow_link(&link, nd, &cookie);
+		if (unlikely(error))
+			break;
+		error = do_last(nd, &path, file, op, &opened, pathname);
+		put_link(nd, &link, cookie);
+	}
+out:
+	path_cleanup(nd);
+out2:
+	if (!(opened & FILE_OPENED)) {
+		BUG_ON(!error);
+		put_filp(file);
+	}
+	if (unlikely(error)) {
+		if (error == -EOPENSTALE) {
+			if (flags & LOOKUP_RCU)
+				error = -ECHILD;
+			else
+				error = -ESTALE;
+		}
+		file = ERR_PTR(error);
+	}
+	return file;
+}
+
+struct file *do_filp_open(int dfd, struct filename *pathname,
+		const struct open_flags *op)
+{
+	struct nameidata nd;
+	int flags = op->lookup_flags;
+	struct file *filp;
+
+	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
+	if (unlikely(filp == ERR_PTR(-ECHILD)))
+		filp = path_openat(dfd, pathname, &nd, op, flags);
+	if (unlikely(filp == ERR_PTR(-ESTALE)))
+		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+	return filp;
+}
+
+struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+		const char *name, const struct open_flags *op)
+{
+	struct nameidata nd;
+	struct file *file;
+	struct filename *filename;
+	int flags = op->lookup_flags | LOOKUP_ROOT;
+
+	nd.root.mnt = mnt;
+	nd.root.dentry = dentry;
+
+	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
+		return ERR_PTR(-ELOOP);
+
+	filename = getname_kernel(name);
+	if (unlikely(IS_ERR(filename)))
+		return ERR_CAST(filename);
+
+	file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
+	if (unlikely(file == ERR_PTR(-ECHILD)))
+		file = path_openat(-1, filename, &nd, op, flags);
+	if (unlikely(file == ERR_PTR(-ESTALE)))
+		file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
+	putname(filename);
+	return file;
+}
+
+static struct dentry *filename_create(int dfd, struct filename *name,
+				struct path *path, unsigned int lookup_flags)
+{
+	struct dentry *dentry = ERR_PTR(-EEXIST);
+	struct nameidata nd;
+	int err2;
+	int error;
+	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
+
+	/*
+	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
+	 * other flags passed in are ignored!
+	 */
+	lookup_flags &= LOOKUP_REVAL;
+
+	error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
+	if (error)
+		return ERR_PTR(error);
+
+	/*
+	 * Yucky last component or no last component at all?
+	 * (foo/., foo/.., /////)
+	 */
+	if (nd.last_type != LAST_NORM)
+		goto out;
+	nd.flags &= ~LOOKUP_PARENT;
+	nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+
+	/* don't fail immediately if it's r/o, at least try to report other errors */
+	err2 = mnt_want_write(nd.path.mnt);
+	/*
+	 * Do the final lookup.
+	 */
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_hash(&nd);
+	if (IS_ERR(dentry))
+		goto unlock;
+
+	error = -EEXIST;
+	if (d_is_positive(dentry))
+		goto fail;
+
+	/*
+	 * Special case - lookup gave negative, but... we had foo/bar/
+	 * From the vfs_mknod() POV we just have a negative dentry -
+	 * all is fine. Let's be bastards - you had / on the end, you've
+	 * been asking for (non-existent) directory. -ENOENT for you.
+	 */
+	if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
+		error = -ENOENT;
+		goto fail;
+	}
+	if (unlikely(err2)) {
+		error = err2;
+		goto fail;
+	}
+	*path = nd.path;
+	return dentry;
+fail:
+	dput(dentry);
+	dentry = ERR_PTR(error);
+unlock:
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+	if (!err2)
+		mnt_drop_write(nd.path.mnt);
+out:
+	path_put(&nd.path);
+	return dentry;
+}
+
+struct dentry *kern_path_create(int dfd, const char *pathname,
+				struct path *path, unsigned int lookup_flags)
+{
+	struct filename *filename = getname_kernel(pathname);
+	struct dentry *res;
+
+	if (IS_ERR(filename))
+		return ERR_CAST(filename);
+	res = filename_create(dfd, filename, path, lookup_flags);
+	putname(filename);
+	return res;
+}
+EXPORT_SYMBOL(kern_path_create);
+
+void done_path_create(struct path *path, struct dentry *dentry)
+{
+	dput(dentry);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	mnt_drop_write(path->mnt);
+	path_put(path);
+}
+EXPORT_SYMBOL(done_path_create);
+
+struct dentry *user_path_create(int dfd, const char __user *pathname,
+				struct path *path, unsigned int lookup_flags)
+{
+	struct filename *tmp = getname(pathname);
+	struct dentry *res;
+	if (IS_ERR(tmp))
+		return ERR_CAST(tmp);
+	res = filename_create(dfd, tmp, path, lookup_flags);
+	putname(tmp);
+	return res;
+}
+EXPORT_SYMBOL(user_path_create);
+
+int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	int error = may_create(dir, dentry);
+
+	if (error)
+		return error;
+
+	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+		return -EPERM;
+
+	if (!dir->i_op->mknod)
+		return -EPERM;
+
+	error = devcgroup_inode_mknod(mode, dev);
+	if (error)
+		return error;
+
+	error = security_inode_mknod(dir, dentry, mode, dev);
+	if (error)
+		return error;
+
+	error = dir->i_op->mknod(dir, dentry, mode, dev);
+	if (!error)
+		fsnotify_create(dir, dentry);
+	return error;
+}
+EXPORT_SYMBOL(vfs_mknod);
+
+static int may_mknod(umode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+	case 0: /* zero mode translates to S_IFREG */
+		return 0;
+	case S_IFDIR:
+		return -EPERM;
+	default:
+		return -EINVAL;
+	}
+}
+
+SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
+		unsigned, dev)
+{
+	struct dentry *dentry;
+	struct path path;
+	int error;
+	unsigned int lookup_flags = 0;
+
+	error = may_mknod(mode);
+	if (error)
+		return error;
+retry:
+	dentry = user_path_create(dfd, filename, &path, lookup_flags);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	if (!IS_POSIXACL(path.dentry->d_inode))
+		mode &= ~current_umask();
+	error = security_path_mknod(&path, dentry, mode, dev);
+	if (error)
+		goto out;
+	switch (mode & S_IFMT) {
+		case 0: case S_IFREG:
+			error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+			break;
+		case S_IFCHR: case S_IFBLK:
+			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
+					new_decode_dev(dev));
+			break;
+		case S_IFIFO: case S_IFSOCK:
+			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
+			break;
+	}
+out:
+	done_path_create(&path, dentry);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
+{
+	return sys_mknodat(AT_FDCWD, filename, mode, dev);
+}
+
+int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int error = may_create(dir, dentry);
+	unsigned max_links = dir->i_sb->s_max_links;
+
+	if (error)
+		return error;
+
+	if (!dir->i_op->mkdir)
+		return -EPERM;
+
+	mode &= (S_IRWXUGO|S_ISVTX);
+	error = security_inode_mkdir(dir, dentry, mode);
+	if (error)
+		return error;
+
+	if (max_links && dir->i_nlink >= max_links)
+		return -EMLINK;
+
+	error = dir->i_op->mkdir(dir, dentry, mode);
+	if (!error)
+		fsnotify_mkdir(dir, dentry);
+	return error;
+}
+EXPORT_SYMBOL(vfs_mkdir);
+
+SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
+{
+	struct dentry *dentry;
+	struct path path;
+	int error;
+	unsigned int lookup_flags = LOOKUP_DIRECTORY;
+
+retry:
+	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	if (!IS_POSIXACL(path.dentry->d_inode))
+		mode &= ~current_umask();
+	error = security_path_mkdir(&path, dentry, mode);
+	if (!error)
+		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+	done_path_create(&path, dentry);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
+{
+	return sys_mkdirat(AT_FDCWD, pathname, mode);
+}
+
+/*
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 1 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
+ *
+ * A low-level filesystem can, if it choses, legally
+ * do a
+ *
+ *	if (!d_unhashed(dentry))
+ *		return -EBUSY;
+ *
+ * if it cannot handle the case of removing a directory
+ * that is still in use by something else..
+ */
+void dentry_unhash(struct dentry *dentry)
+{
+	shrink_dcache_parent(dentry);
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_lockref.count == 1)
+		__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(dentry_unhash);
+
+int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int error = may_delete(dir, dentry, 1);
+
+	if (error)
+		return error;
+
+	if (!dir->i_op->rmdir)
+		return -EPERM;
+
+	dget(dentry);
+	mutex_lock(&dentry->d_inode->i_mutex);
+
+	error = -EBUSY;
+	if (is_local_mountpoint(dentry))
+		goto out;
+
+	error = security_inode_rmdir(dir, dentry);
+	if (error)
+		goto out;
+
+	shrink_dcache_parent(dentry);
+	error = dir->i_op->rmdir(dir, dentry);
+	if (error)
+		goto out;
+
+	dentry->d_inode->i_flags |= S_DEAD;
+	dont_mount(dentry);
+	detach_mounts(dentry);
+
+out:
+	mutex_unlock(&dentry->d_inode->i_mutex);
+	dput(dentry);
+	if (!error)
+		d_delete(dentry);
+	return error;
+}
+EXPORT_SYMBOL(vfs_rmdir);
+
+static long do_rmdir(int dfd, const char __user *pathname)
+{
+	int error = 0;
+	struct filename *name;
+	struct dentry *dentry;
+	struct nameidata nd;
+	unsigned int lookup_flags = 0;
+retry:
+	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	switch(nd.last_type) {
+	case LAST_DOTDOT:
+		error = -ENOTEMPTY;
+		goto exit1;
+	case LAST_DOT:
+		error = -EINVAL;
+		goto exit1;
+	case LAST_ROOT:
+		error = -EBUSY;
+		goto exit1;
+	}
+
+	nd.flags &= ~LOOKUP_PARENT;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto exit1;
+
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_hash(&nd);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto exit2;
+	if (!dentry->d_inode) {
+		error = -ENOENT;
+		goto exit3;
+	}
+	error = security_path_rmdir(&nd.path, dentry);
+	if (error)
+		goto exit3;
+	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+exit3:
+	dput(dentry);
+exit2:
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+	mnt_drop_write(nd.path.mnt);
+exit1:
+	path_put(&nd.path);
+	putname(name);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
+{
+	return do_rmdir(AT_FDCWD, pathname);
+}
+
+/**
+ * vfs_unlink - unlink a filesystem object
+ * @dir:	parent directory
+ * @dentry:	victim
+ * @delegated_inode: returns victim inode, if the inode is delegated.
+ *
+ * The caller must hold dir->i_mutex.
+ *
+ * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
+ * return a reference to the inode in delegated_inode.  The caller
+ * should then break the delegation on that inode and retry.  Because
+ * breaking a delegation may take a long time, the caller should drop
+ * dir->i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ */
+int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+{
+	struct inode *target = dentry->d_inode;
+	int error = may_delete(dir, dentry, 0);
+
+	if (error)
+		return error;
+
+	if (!dir->i_op->unlink)
+		return -EPERM;
+
+	mutex_lock(&target->i_mutex);
+	if (is_local_mountpoint(dentry))
+		error = -EBUSY;
+	else {
+		error = security_inode_unlink(dir, dentry);
+		if (!error) {
+			error = try_break_deleg(target, delegated_inode);
+			if (error)
+				goto out;
+			error = dir->i_op->unlink(dir, dentry);
+			if (!error) {
+				dont_mount(dentry);
+				detach_mounts(dentry);
+			}
+		}
+	}
+out:
+	mutex_unlock(&target->i_mutex);
+
+	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
+	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+		fsnotify_link_count(target);
+		d_delete(dentry);
+	}
+
+	return error;
+}
+EXPORT_SYMBOL(vfs_unlink);
+
+/*
+ * Make sure that the actual truncation of the file will occur outside its
+ * directory's i_mutex.  Truncate can take a long time if there is a lot of
+ * writeout happening, and we don't want to prevent access to the directory
+ * while waiting on the I/O.
+ */
+static long do_unlinkat(int dfd, const char __user *pathname)
+{
+	int error;
+	struct filename *name;
+	struct dentry *dentry;
+	struct nameidata nd;
+	struct inode *inode = NULL;
+	struct inode *delegated_inode = NULL;
+	unsigned int lookup_flags = 0;
+retry:
+	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	error = -EISDIR;
+	if (nd.last_type != LAST_NORM)
+		goto exit1;
+
+	nd.flags &= ~LOOKUP_PARENT;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto exit1;
+retry_deleg:
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_hash(&nd);
+	error = PTR_ERR(dentry);
+	if (!IS_ERR(dentry)) {
+		/* Why not before? Because we want correct error value */
+		if (nd.last.name[nd.last.len])
+			goto slashes;
+		inode = dentry->d_inode;
+		if (d_is_negative(dentry))
+			goto slashes;
+		ihold(inode);
+		error = security_path_unlink(&nd.path, dentry);
+		if (error)
+			goto exit2;
+		error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode);
+exit2:
+		dput(dentry);
+	}
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+	if (inode)
+		iput(inode);	/* truncate the inode here */
+	inode = NULL;
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+	mnt_drop_write(nd.path.mnt);
+exit1:
+	path_put(&nd.path);
+	putname(name);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		inode = NULL;
+		goto retry;
+	}
+	return error;
+
+slashes:
+	if (d_is_negative(dentry))
+		error = -ENOENT;
+	else if (d_is_dir(dentry))
+		error = -EISDIR;
+	else
+		error = -ENOTDIR;
+	goto exit2;
+}
+
+SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
+{
+	if ((flag & ~AT_REMOVEDIR) != 0)
+		return -EINVAL;
+
+	if (flag & AT_REMOVEDIR)
+		return do_rmdir(dfd, pathname);
+
+	return do_unlinkat(dfd, pathname);
+}
+
+SYSCALL_DEFINE1(unlink, const char __user *, pathname)
+{
+	return do_unlinkat(AT_FDCWD, pathname);
+}
+
+int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+{
+	int error = may_create(dir, dentry);
+
+	if (error)
+		return error;
+
+	if (!dir->i_op->symlink)
+		return -EPERM;
+
+	error = security_inode_symlink(dir, dentry, oldname);
+	if (error)
+		return error;
+
+	error = dir->i_op->symlink(dir, dentry, oldname);
+	if (!error)
+		fsnotify_create(dir, dentry);
+	return error;
+}
+EXPORT_SYMBOL(vfs_symlink);
+
+SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
+		int, newdfd, const char __user *, newname)
+{
+	int error;
+	struct filename *from;
+	struct dentry *dentry;
+	struct path path;
+	unsigned int lookup_flags = 0;
+
+	from = getname(oldname);
+	if (IS_ERR(from))
+		return PTR_ERR(from);
+retry:
+	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out_putname;
+
+	error = security_path_symlink(&path, dentry, from->name);
+	if (!error)
+		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+	done_path_create(&path, dentry);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+out_putname:
+	putname(from);
+	return error;
+}
+
+SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
+{
+	return sys_symlinkat(oldname, AT_FDCWD, newname);
+}
+
+/**
+ * vfs_link - create a new link
+ * @old_dentry:	object to be linked
+ * @dir:	new parent
+ * @new_dentry:	where to create the new link
+ * @delegated_inode: returns inode needing a delegation break
+ *
+ * The caller must hold dir->i_mutex
+ *
+ * If vfs_link discovers a delegation on the to-be-linked file in need
+ * of breaking, it will return -EWOULDBLOCK and return a reference to the
+ * inode in delegated_inode.  The caller should then break the delegation
+ * and retry.  Because breaking a delegation may take a long time, the
+ * caller should drop the i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ */
+int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+{
+	struct inode *inode = old_dentry->d_inode;
+	unsigned max_links = dir->i_sb->s_max_links;
+	int error;
+
+	if (!inode)
+		return -ENOENT;
+
+	error = may_create(dir, new_dentry);
+	if (error)
+		return error;
+
+	if (dir->i_sb != inode->i_sb)
+		return -EXDEV;
+
+	/*
+	 * A link to an append-only or immutable file cannot be created.
+	 */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+	if (!dir->i_op->link)
+		return -EPERM;
+	if (S_ISDIR(inode->i_mode))
+		return -EPERM;
+
+	error = security_inode_link(old_dentry, dir, new_dentry);
+	if (error)
+		return error;
+
+	mutex_lock(&inode->i_mutex);
+	/* Make sure we don't allow creating hardlink to an unlinked file */
+	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+		error =  -ENOENT;
+	else if (max_links && inode->i_nlink >= max_links)
+		error = -EMLINK;
+	else {
+		error = try_break_deleg(inode, delegated_inode);
+		if (!error)
+			error = dir->i_op->link(old_dentry, dir, new_dentry);
+	}
+
+	if (!error && (inode->i_state & I_LINKABLE)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_LINKABLE;
+		spin_unlock(&inode->i_lock);
+	}
+	mutex_unlock(&inode->i_mutex);
+	if (!error)
+		fsnotify_link(dir, inode, new_dentry);
+	return error;
+}
+EXPORT_SYMBOL(vfs_link);
+
+/*
+ * Hardlinks are often used in delicate situations.  We avoid
+ * security-related surprises by not following symlinks on the
+ * newname.  --KAB
+ *
+ * We don't follow them on the oldname either to be compatible
+ * with linux 2.0, and to avoid hard-linking to directories
+ * and other special files.  --ADM
+ */
+SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname, int, flags)
+{
+	struct dentry *new_dentry;
+	struct path old_path, new_path;
+	struct inode *delegated_inode = NULL;
+	int how = 0;
+	int error;
+
+	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+	/*
+	 * To use null names we require CAP_DAC_READ_SEARCH
+	 * This ensures that not everyone will be able to create
+	 * handlink using the passed filedescriptor.
+	 */
+	if (flags & AT_EMPTY_PATH) {
+		if (!capable(CAP_DAC_READ_SEARCH))
+			return -ENOENT;
+		how = LOOKUP_EMPTY;
+	}
+
+	if (flags & AT_SYMLINK_FOLLOW)
+		how |= LOOKUP_FOLLOW;
+retry:
+	error = user_path_at(olddfd, oldname, how, &old_path);
+	if (error)
+		return error;
+
+	new_dentry = user_path_create(newdfd, newname, &new_path,
+					(how & LOOKUP_REVAL));
+	error = PTR_ERR(new_dentry);
+	if (IS_ERR(new_dentry))
+		goto out;
+
+	error = -EXDEV;
+	if (old_path.mnt != new_path.mnt)
+		goto out_dput;
+	error = may_linkat(&old_path);
+	if (unlikely(error))
+		goto out_dput;
+	error = security_path_link(old_path.dentry, &new_path, new_dentry);
+	if (error)
+		goto out_dput;
+	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+out_dput:
+	done_path_create(&new_path, new_dentry);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error) {
+			path_put(&old_path);
+			goto retry;
+		}
+	}
+	if (retry_estale(error, how)) {
+		path_put(&old_path);
+		how |= LOOKUP_REVAL;
+		goto retry;
+	}
+out:
+	path_put(&old_path);
+
+	return error;
+}
+
+SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
+{
+	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+}
+
+/**
+ * vfs_rename - rename a filesystem object
+ * @old_dir:	parent of source
+ * @old_dentry:	source
+ * @new_dir:	parent of destination
+ * @new_dentry:	destination
+ * @delegated_inode: returns an inode needing a delegation break
+ * @flags:	rename flags
+ *
+ * The caller must hold multiple mutexes--see lock_rename()).
+ *
+ * If vfs_rename discovers a delegation in need of breaking at either
+ * the source or destination, it will return -EWOULDBLOCK and return a
+ * reference to the inode in delegated_inode.  The caller should then
+ * break the delegation and retry.  Because breaking a delegation may
+ * take a long time, the caller should drop all locks before doing
+ * so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ *
+ * The worst of all namespace operations - renaming directory. "Perverted"
+ * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
+ * Problems:
+ *	a) we can get into loop creation.
+ *	b) race potential - two innocent renames can create a loop together.
+ *	   That's where 4.4 screws up. Current fix: serialization on
+ *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
+ *	   story.
+ *	c) we have to lock _four_ objects - parents and victim (if it exists),
+ *	   and source (if it is not a directory).
+ *	   And that - after we got ->i_mutex on parents (until then we don't know
+ *	   whether the target exists).  Solution: try to be smart with locking
+ *	   order for inodes.  We rely on the fact that tree topology may change
+ *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
+ *	   move will be locked.  Thus we can rank directories by the tree
+ *	   (ancestors first) and rank all non-directories after them.
+ *	   That works since everybody except rename does "lock parent, lookup,
+ *	   lock child" and rename is under ->s_vfs_rename_mutex.
+ *	   HOWEVER, it relies on the assumption that any object with ->lookup()
+ *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
+ *	   we'd better make sure that there's no link(2) for them.
+ *	d) conversion from fhandle to dentry may come in the wrong moment - when
+ *	   we are removing the target. Solution: we will have to grab ->i_mutex
+ *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
+ *	   ->i_mutex on parents, which works but leads to some truly excessive
+ *	   locking].
+ */
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+	       struct inode *new_dir, struct dentry *new_dentry,
+	       struct inode **delegated_inode, unsigned int flags)
+{
+	int error;
+	bool is_dir = d_is_dir(old_dentry);
+	const unsigned char *old_name;
+	struct inode *source = old_dentry->d_inode;
+	struct inode *target = new_dentry->d_inode;
+	bool new_is_dir = false;
+	unsigned max_links = new_dir->i_sb->s_max_links;
+
+	if (source == target)
+		return 0;
+
+	error = may_delete(old_dir, old_dentry, is_dir);
+	if (error)
+		return error;
+
+	if (!target) {
+		error = may_create(new_dir, new_dentry);
+	} else {
+		new_is_dir = d_is_dir(new_dentry);
+
+		if (!(flags & RENAME_EXCHANGE))
+			error = may_delete(new_dir, new_dentry, is_dir);
+		else
+			error = may_delete(new_dir, new_dentry, new_is_dir);
+	}
+	if (error)
+		return error;
+
+	if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
+		return -EPERM;
+
+	if (flags && !old_dir->i_op->rename2)
+		return -EINVAL;
+
+	/*
+	 * If we are going to change the parent - check write permissions,
+	 * we'll need to flip '..'.
+	 */
+	if (new_dir != old_dir) {
+		if (is_dir) {
+			error = inode_permission(source, MAY_WRITE);
+			if (error)
+				return error;
+		}
+		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
+			error = inode_permission(target, MAY_WRITE);
+			if (error)
+				return error;
+		}
+	}
+
+	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
+				      flags);
+	if (error)
+		return error;
+
+	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
+	dget(new_dentry);
+	if (!is_dir || (flags & RENAME_EXCHANGE))
+		lock_two_nondirectories(source, target);
+	else if (target)
+		mutex_lock(&target->i_mutex);
+
+	error = -EBUSY;
+	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
+		goto out;
+
+	if (max_links && new_dir != old_dir) {
+		error = -EMLINK;
+		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
+			goto out;
+		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
+		    old_dir->i_nlink >= max_links)
+			goto out;
+	}
+	if (is_dir && !(flags & RENAME_EXCHANGE) && target)
+		shrink_dcache_parent(new_dentry);
+	if (!is_dir) {
+		error = try_break_deleg(source, delegated_inode);
+		if (error)
+			goto out;
+	}
+	if (target && !new_is_dir) {
+		error = try_break_deleg(target, delegated_inode);
+		if (error)
+			goto out;
+	}
+	if (!old_dir->i_op->rename2) {
+		error = old_dir->i_op->rename(old_dir, old_dentry,
+					      new_dir, new_dentry);
+	} else {
+		WARN_ON(old_dir->i_op->rename != NULL);
+		error = old_dir->i_op->rename2(old_dir, old_dentry,
+					       new_dir, new_dentry, flags);
+	}
+	if (error)
+		goto out;
+
+	if (!(flags & RENAME_EXCHANGE) && target) {
+		if (is_dir)
+			target->i_flags |= S_DEAD;
+		dont_mount(new_dentry);
+		detach_mounts(new_dentry);
+	}
+	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
+		if (!(flags & RENAME_EXCHANGE))
+			d_move(old_dentry, new_dentry);
+		else
+			d_exchange(old_dentry, new_dentry);
+	}
+out:
+	if (!is_dir || (flags & RENAME_EXCHANGE))
+		unlock_two_nondirectories(source, target);
+	else if (target)
+		mutex_unlock(&target->i_mutex);
+	dput(new_dentry);
+	if (!error) {
+		fsnotify_move(old_dir, new_dir, old_name, is_dir,
+			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
+		if (flags & RENAME_EXCHANGE) {
+			fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
+				      new_is_dir, NULL, new_dentry);
+		}
+	}
+	fsnotify_oldname_free(old_name);
+
+	return error;
+}
+EXPORT_SYMBOL(vfs_rename);
+
+SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname, unsigned int, flags)
+{
+	struct dentry *old_dir, *new_dir;
+	struct dentry *old_dentry, *new_dentry;
+	struct dentry *trap;
+	struct nameidata oldnd, newnd;
+	struct inode *delegated_inode = NULL;
+	struct filename *from;
+	struct filename *to;
+	unsigned int lookup_flags = 0;
+	bool should_retry = false;
+	int error;
+
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+		return -EINVAL;
+
+	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
+	    (flags & RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+		return -EPERM;
+
+retry:
+	from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
+	if (IS_ERR(from)) {
+		error = PTR_ERR(from);
+		goto exit;
+	}
+
+	to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
+	if (IS_ERR(to)) {
+		error = PTR_ERR(to);
+		goto exit1;
+	}
+
+	error = -EXDEV;
+	if (oldnd.path.mnt != newnd.path.mnt)
+		goto exit2;
+
+	old_dir = oldnd.path.dentry;
+	error = -EBUSY;
+	if (oldnd.last_type != LAST_NORM)
+		goto exit2;
+
+	new_dir = newnd.path.dentry;
+	if (flags & RENAME_NOREPLACE)
+		error = -EEXIST;
+	if (newnd.last_type != LAST_NORM)
+		goto exit2;
+
+	error = mnt_want_write(oldnd.path.mnt);
+	if (error)
+		goto exit2;
+
+	oldnd.flags &= ~LOOKUP_PARENT;
+	newnd.flags &= ~LOOKUP_PARENT;
+	if (!(flags & RENAME_EXCHANGE))
+		newnd.flags |= LOOKUP_RENAME_TARGET;
+
+retry_deleg:
+	trap = lock_rename(new_dir, old_dir);
+
+	old_dentry = lookup_hash(&oldnd);
+	error = PTR_ERR(old_dentry);
+	if (IS_ERR(old_dentry))
+		goto exit3;
+	/* source must exist */
+	error = -ENOENT;
+	if (d_is_negative(old_dentry))
+		goto exit4;
+	new_dentry = lookup_hash(&newnd);
+	error = PTR_ERR(new_dentry);
+	if (IS_ERR(new_dentry))
+		goto exit4;
+	error = -EEXIST;
+	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
+		goto exit5;
+	if (flags & RENAME_EXCHANGE) {
+		error = -ENOENT;
+		if (d_is_negative(new_dentry))
+			goto exit5;
+
+		if (!d_is_dir(new_dentry)) {
+			error = -ENOTDIR;
+			if (newnd.last.name[newnd.last.len])
+				goto exit5;
+		}
+	}
+	/* unless the source is a directory trailing slashes give -ENOTDIR */
+	if (!d_is_dir(old_dentry)) {
+		error = -ENOTDIR;
+		if (oldnd.last.name[oldnd.last.len])
+			goto exit5;
+		if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
+			goto exit5;
+	}
+	/* source should not be ancestor of target */
+	error = -EINVAL;
+	if (old_dentry == trap)
+		goto exit5;
+	/* target should not be an ancestor of source */
+	if (!(flags & RENAME_EXCHANGE))
+		error = -ENOTEMPTY;
+	if (new_dentry == trap)
+		goto exit5;
+
+	error = security_path_rename(&oldnd.path, old_dentry,
+				     &newnd.path, new_dentry, flags);
+	if (error)
+		goto exit5;
+	error = vfs_rename(old_dir->d_inode, old_dentry,
+			   new_dir->d_inode, new_dentry,
+			   &delegated_inode, flags);
+exit5:
+	dput(new_dentry);
+exit4:
+	dput(old_dentry);
+exit3:
+	unlock_rename(new_dir, old_dir);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+	mnt_drop_write(oldnd.path.mnt);
+exit2:
+	if (retry_estale(error, lookup_flags))
+		should_retry = true;
+	path_put(&newnd.path);
+	putname(to);
+exit1:
+	path_put(&oldnd.path);
+	putname(from);
+	if (should_retry) {
+		should_retry = false;
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+exit:
+	return error;
+}
+
+SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname)
+{
+	return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
+}
+
+SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
+{
+	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+}
+
+int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+{
+	int error = may_create(dir, dentry);
+	if (error)
+		return error;
+
+	if (!dir->i_op->mknod)
+		return -EPERM;
+
+	return dir->i_op->mknod(dir, dentry,
+				S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+}
+EXPORT_SYMBOL(vfs_whiteout);
+
+int readlink_copy(char __user *buffer, int buflen, const char *link)
+{
+	int len = PTR_ERR(link);
+	if (IS_ERR(link))
+		goto out;
+
+	len = strlen(link);
+	if (len > (unsigned) buflen)
+		len = buflen;
+	if (copy_to_user(buffer, link, len))
+		len = -EFAULT;
+out:
+	return len;
+}
+EXPORT_SYMBOL(readlink_copy);
+
+/*
+ * A helper for ->readlink().  This should be used *ONLY* for symlinks that
+ * have ->follow_link() touching nd only in nd_set_link().  Using (or not
+ * using) it for any given inode is up to filesystem.
+ */
+int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+	struct nameidata nd;
+	void *cookie;
+	int res;
+
+	nd.depth = 0;
+	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
+	if (IS_ERR(cookie))
+		return PTR_ERR(cookie);
+
+	res = readlink_copy(buffer, buflen, nd_get_link(&nd));
+	if (dentry->d_inode->i_op->put_link)
+		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+	return res;
+}
+EXPORT_SYMBOL(generic_readlink);
+
+/* get the link contents into pagecache */
+static char *page_getlink(struct dentry * dentry, struct page **ppage)
+{
+	char *kaddr;
+	struct page *page;
+	struct address_space *mapping = dentry->d_inode->i_mapping;
+	page = read_mapping_page(mapping, 0, NULL);
+	if (IS_ERR(page))
+		return (char*)page;
+	*ppage = page;
+	kaddr = kmap(page);
+	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+	return kaddr;
+}
+
+int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+	struct page *page = NULL;
+	int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
+	if (page) {
+		kunmap(page);
+		page_cache_release(page);
+	}
+	return res;
+}
+EXPORT_SYMBOL(page_readlink);
+
+void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *page = NULL;
+	nd_set_link(nd, page_getlink(dentry, &page));
+	return page;
+}
+EXPORT_SYMBOL(page_follow_link_light);
+
+void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+	struct page *page = cookie;
+
+	if (page) {
+		kunmap(page);
+		page_cache_release(page);
+	}
+}
+EXPORT_SYMBOL(page_put_link);
+
+/*
+ * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
+ */
+int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	void *fsdata;
+	int err;
+	char *kaddr;
+	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+	if (nofs)
+		flags |= AOP_FLAG_NOFS;
+
+retry:
+	err = pagecache_write_begin(NULL, mapping, 0, len-1,
+				flags, &page, &fsdata);
+	if (err)
+		goto fail;
+
+	kaddr = kmap_atomic(page);
+	memcpy(kaddr, symname, len-1);
+	kunmap_atomic(kaddr);
+
+	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+							page, fsdata);
+	if (err < 0)
+		goto fail;
+	if (err < len-1)
+		goto retry;
+
+	mark_inode_dirty(inode);
+	return 0;
+fail:
+	return err;
+}
+EXPORT_SYMBOL(__page_symlink);
+
+int page_symlink(struct inode *inode, const char *symname, int len)
+{
+	return __page_symlink(inode, symname, len,
+			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+}
+EXPORT_SYMBOL(page_symlink);
+
+const struct inode_operations page_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
+EXPORT_SYMBOL(page_symlink_inode_operations);
diff --git a/kernel/fs/namespace.c b/kernel/fs/namespace.c
new file mode 100644
index 000000000..c246b29e4
--- /dev/null
+++ b/kernel/fs/namespace.c
@@ -0,0 +1,3298 @@
+/*
+ *  linux/fs/namespace.c
+ *
+ * (C) Copyright Al Viro 2000, 2001
+ *	Released under GPL v2.
+ *
+ * Based on code from fs/super.c, copyright Linus Torvalds and others.
+ * Heavily rewritten.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/capability.h>
+#include <linux/mnt_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/namei.h>
+#include <linux/delay.h>
+#include <linux/security.h>
+#include <linux/idr.h>
+#include <linux/init.h>		/* init_rootfs */
+#include <linux/fs_struct.h>	/* get_fs_root et.al. */
+#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
+#include <linux/uaccess.h>
+#include <linux/proc_ns.h>
+#include <linux/magic.h>
+#include <linux/bootmem.h>
+#include <linux/task_work.h>
+#include "pnode.h"
+#include "internal.h"
+
+static unsigned int m_hash_mask __read_mostly;
+static unsigned int m_hash_shift __read_mostly;
+static unsigned int mp_hash_mask __read_mostly;
+static unsigned int mp_hash_shift __read_mostly;
+
+static __initdata unsigned long mhash_entries;
+static int __init set_mhash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	mhash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("mhash_entries=", set_mhash_entries);
+
+static __initdata unsigned long mphash_entries;
+static int __init set_mphash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	mphash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("mphash_entries=", set_mphash_entries);
+
+static u64 event;
+static DEFINE_IDA(mnt_id_ida);
+static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
+static int mnt_id_start = 0;
+static int mnt_group_start = 1;
+
+static struct hlist_head *mount_hashtable __read_mostly;
+static struct hlist_head *mountpoint_hashtable __read_mostly;
+static struct kmem_cache *mnt_cache __read_mostly;
+static DECLARE_RWSEM(namespace_sem);
+
+/* /sys/fs */
+struct kobject *fs_kobj;
+EXPORT_SYMBOL_GPL(fs_kobj);
+
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, ie. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
+
+static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
+{
+	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
+	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
+	tmp = tmp + (tmp >> m_hash_shift);
+	return &mount_hashtable[tmp & m_hash_mask];
+}
+
+static inline struct hlist_head *mp_hash(struct dentry *dentry)
+{
+	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
+	tmp = tmp + (tmp >> mp_hash_shift);
+	return &mountpoint_hashtable[tmp & mp_hash_mask];
+}
+
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
+static int mnt_alloc_id(struct mount *mnt)
+{
+	int res;
+
+retry:
+	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
+	spin_lock(&mnt_id_lock);
+	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
+	if (!res)
+		mnt_id_start = mnt->mnt_id + 1;
+	spin_unlock(&mnt_id_lock);
+	if (res == -EAGAIN)
+		goto retry;
+
+	return res;
+}
+
+static void mnt_free_id(struct mount *mnt)
+{
+	int id = mnt->mnt_id;
+	spin_lock(&mnt_id_lock);
+	ida_remove(&mnt_id_ida, id);
+	if (mnt_id_start > id)
+		mnt_id_start = id;
+	spin_unlock(&mnt_id_lock);
+}
+
+/*
+ * Allocate a new peer group ID
+ *
+ * mnt_group_ida is protected by namespace_sem
+ */
+static int mnt_alloc_group_id(struct mount *mnt)
+{
+	int res;
+
+	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
+		return -ENOMEM;
+
+	res = ida_get_new_above(&mnt_group_ida,
+				mnt_group_start,
+				&mnt->mnt_group_id);
+	if (!res)
+		mnt_group_start = mnt->mnt_group_id + 1;
+
+	return res;
+}
+
+/*
+ * Release a peer group ID
+ */
+void mnt_release_group_id(struct mount *mnt)
+{
+	int id = mnt->mnt_group_id;
+	ida_remove(&mnt_group_ida, id);
+	if (mnt_group_start > id)
+		mnt_group_start = id;
+	mnt->mnt_group_id = 0;
+}
+
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_add_count(struct mount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
+#else
+	preempt_disable();
+	mnt->mnt_count += n;
+	preempt_enable();
+#endif
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+unsigned int mnt_get_count(struct mount *mnt)
+{
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
+	}
+
+	return count;
+#else
+	return mnt->mnt_count;
+#endif
+}
+
+static void drop_mountpoint(struct fs_pin *p)
+{
+	struct mount *m = container_of(p, struct mount, mnt_umount);
+	dput(m->mnt_ex_mountpoint);
+	pin_remove(p);
+	mntput(&m->mnt);
+}
+
+static struct mount *alloc_vfsmnt(const char *name)
+{
+	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+	if (mnt) {
+		int err;
+
+		err = mnt_alloc_id(mnt);
+		if (err)
+			goto out_free_cache;
+
+		if (name) {
+			mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
+			if (!mnt->mnt_devname)
+				goto out_free_id;
+		}
+
+#ifdef CONFIG_SMP
+		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
+		if (!mnt->mnt_pcp)
+			goto out_free_devname;
+
+		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+#else
+		mnt->mnt_count = 1;
+		mnt->mnt_writers = 0;
+#endif
+
+		INIT_HLIST_NODE(&mnt->mnt_hash);
+		INIT_LIST_HEAD(&mnt->mnt_child);
+		INIT_LIST_HEAD(&mnt->mnt_mounts);
+		INIT_LIST_HEAD(&mnt->mnt_list);
+		INIT_LIST_HEAD(&mnt->mnt_expire);
+		INIT_LIST_HEAD(&mnt->mnt_share);
+		INIT_LIST_HEAD(&mnt->mnt_slave_list);
+		INIT_LIST_HEAD(&mnt->mnt_slave);
+		INIT_HLIST_NODE(&mnt->mnt_mp_list);
+#ifdef CONFIG_FSNOTIFY
+		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+#endif
+		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
+	}
+	return mnt;
+
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree_const(mnt->mnt_devname);
+#endif
+out_free_id:
+	mnt_free_id(mnt);
+out_free_cache:
+	kmem_cache_free(mnt_cache, mnt);
+	return NULL;
+}
+
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*.  This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	if (mnt->mnt_flags & MNT_READONLY)
+		return 1;
+	if (mnt->mnt_sb->s_flags & MS_RDONLY)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+static inline void mnt_inc_writers(struct mount *mnt)
+{
+#ifdef CONFIG_SMP
+	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
+#else
+	mnt->mnt_writers++;
+#endif
+}
+
+static inline void mnt_dec_writers(struct mount *mnt)
+{
+#ifdef CONFIG_SMP
+	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
+#else
+	mnt->mnt_writers--;
+#endif
+}
+
+static unsigned int mnt_get_writers(struct mount *mnt)
+{
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
+	}
+
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
+}
+
+static int mnt_is_readonly(struct vfsmount *mnt)
+{
+	if (mnt->mnt_sb->s_readonly_remount)
+		return 1;
+	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
+	smp_rmb();
+	return __mnt_is_readonly(mnt);
+}
+
+/*
+ * Most r/o & frozen checks on a fs are for operations that take discrete
+ * amounts of time, like a write() or unlink().  We must keep track of when
+ * those operations start (for permission checks) and when they end, so that we
+ * can determine when writes are able to occur to a filesystem.
+ */
+/**
+ * __mnt_want_write - get write access to a mount without freeze protection
+ * @m: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is about to be performed to
+ * it, and makes sure that writes are allowed (mnt it read-write) before
+ * returning success. This operation does not protect against filesystem being
+ * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * called. This is effectively a refcount.
+ */
+int __mnt_want_write(struct vfsmount *m)
+{
+	struct mount *mnt = real_mount(m);
+	int ret = 0;
+
+	preempt_disable();
+	mnt_inc_writers(mnt);
+	/*
+	 * The store to mnt_inc_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+		preempt_enable();
+		cpu_chill();
+		preempt_disable();
+	}
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
+	if (mnt_is_readonly(m)) {
+		mnt_dec_writers(mnt);
+		ret = -EROFS;
+	}
+	preempt_enable();
+
+	return ret;
+}
+
+/**
+ * mnt_want_write - get write access to a mount
+ * @m: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is about to be performed to
+ * it, and makes sure that writes are allowed (mount is read-write, filesystem
+ * is not frozen) before returning success.  When the write operation is
+ * finished, mnt_drop_write() must be called.  This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *m)
+{
+	int ret;
+
+	sb_start_write(m->mnt_sb);
+	ret = __mnt_want_write(m);
+	if (ret)
+		sb_end_write(m->mnt_sb);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+/**
+ * mnt_clone_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This is effectively like mnt_want_write, except
+ * it must only be used to take an extra write reference
+ * on a mountpoint that we already know has a write reference
+ * on it. This allows some optimisation.
+ *
+ * After finished, mnt_drop_write must be called as usual to
+ * drop the reference.
+ */
+int mnt_clone_write(struct vfsmount *mnt)
+{
+	/* superblock may be r/o */
+	if (__mnt_is_readonly(mnt))
+		return -EROFS;
+	preempt_disable();
+	mnt_inc_writers(real_mount(mnt));
+	preempt_enable();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mnt_clone_write);
+
+/**
+ * __mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like __mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int __mnt_want_write_file(struct file *file)
+{
+	if (!(file->f_mode & FMODE_WRITER))
+		return __mnt_want_write(file->f_path.mnt);
+	else
+		return mnt_clone_write(file->f_path.mnt);
+}
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int mnt_want_write_file(struct file *file)
+{
+	int ret;
+
+	sb_start_write(file->f_path.mnt->mnt_sb);
+	ret = __mnt_want_write_file(file);
+	if (ret)
+		sb_end_write(file->f_path.mnt->mnt_sb);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
+
+/**
+ * __mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done
+ * performing writes to it.  Must be matched with
+ * __mnt_want_write() call above.
+ */
+void __mnt_drop_write(struct vfsmount *mnt)
+{
+	preempt_disable();
+	mnt_dec_writers(real_mount(mnt));
+	preempt_enable();
+}
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done performing writes to it and
+ * also allows filesystem to be frozen again.  Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+	__mnt_drop_write(mnt);
+	sb_end_write(mnt->mnt_sb);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
+void __mnt_drop_write_file(struct file *file)
+{
+	__mnt_drop_write(file->f_path.mnt);
+}
+
+void mnt_drop_write_file(struct file *file)
+{
+	mnt_drop_write(file->f_path.mnt);
+}
+EXPORT_SYMBOL(mnt_drop_write_file);
+
+static int mnt_make_readonly(struct mount *mnt)
+{
+	int ret = 0;
+
+	lock_mount_hash();
+	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
+	/*
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
+	 */
+	smp_mb();
+
+	/*
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
+	 */
+	if (mnt_get_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
+		mnt->mnt.mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+	unlock_mount_hash();
+	return ret;
+}
+
+static void __mnt_unmake_readonly(struct mount *mnt)
+{
+	lock_mount_hash();
+	mnt->mnt.mnt_flags &= ~MNT_READONLY;
+	unlock_mount_hash();
+}
+
+int sb_prepare_remount_readonly(struct super_block *sb)
+{
+	struct mount *mnt;
+	int err = 0;
+
+	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
+	if (atomic_long_read(&sb->s_remove_count))
+		return -EBUSY;
+
+	lock_mount_hash();
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
+			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
+			smp_mb();
+			if (mnt_get_writers(mnt) > 0) {
+				err = -EBUSY;
+				break;
+			}
+		}
+	}
+	if (!err && atomic_long_read(&sb->s_remove_count))
+		err = -EBUSY;
+
+	if (!err) {
+		sb->s_readonly_remount = 1;
+		smp_wmb();
+	}
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
+			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+	}
+	unlock_mount_hash();
+
+	return err;
+}
+
+static void free_vfsmnt(struct mount *mnt)
+{
+	kfree_const(mnt->mnt_devname);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_pcp);
+#endif
+	kmem_cache_free(mnt_cache, mnt);
+}
+
+static void delayed_free_vfsmnt(struct rcu_head *head)
+{
+	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
+}
+
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+	struct mount *mnt;
+	if (read_seqretry(&mount_lock, seq))
+		return false;
+	if (bastard == NULL)
+		return true;
+	mnt = real_mount(bastard);
+	mnt_add_count(mnt, 1);
+	if (likely(!read_seqretry(&mount_lock, seq)))
+		return true;
+	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+		mnt_add_count(mnt, -1);
+		return false;
+	}
+	rcu_read_unlock();
+	mntput(bastard);
+	rcu_read_lock();
+	return false;
+}
+
+/*
+ * find the first mount at @dentry on vfsmount @mnt.
+ * call under rcu_read_lock()
+ */
+struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct hlist_head *head = m_hash(mnt, dentry);
+	struct mount *p;
+
+	hlist_for_each_entry_rcu(p, head, mnt_hash)
+		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
+			return p;
+	return NULL;
+}
+
+/*
+ * find the last mount at @dentry on vfsmount @mnt.
+ * mount_lock must be held.
+ */
+struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct mount *p, *res = NULL;
+	p = __lookup_mnt(mnt, dentry);
+	if (!p)
+		goto out;
+	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
+		res = p;
+	hlist_for_each_entry_continue(p, mnt_hash) {
+		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
+			break;
+		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
+			res = p;
+	}
+out:
+	return res;
+}
+
+/*
+ * lookup_mnt - Return the first child mount mounted at path
+ *
+ * "First" means first mounted chronologically.  If you create the
+ * following mounts:
+ *
+ * mount /dev/sda1 /mnt
+ * mount /dev/sda2 /mnt
+ * mount /dev/sda3 /mnt
+ *
+ * Then lookup_mnt() on the base /mnt dentry in the root mount will
+ * return successively the root dentry and vfsmount of /dev/sda1, then
+ * /dev/sda2, then /dev/sda3, then NULL.
+ *
+ * lookup_mnt takes a reference to the found vfsmount.
+ */
+struct vfsmount *lookup_mnt(struct path *path)
+{
+	struct mount *child_mnt;
+	struct vfsmount *m;
+	unsigned seq;
+
+	rcu_read_lock();
+	do {
+		seq = read_seqbegin(&mount_lock);
+		child_mnt = __lookup_mnt(path->mnt, path->dentry);
+		m = child_mnt ? &child_mnt->mnt : NULL;
+	} while (!legitimize_mnt(m, seq));
+	rcu_read_unlock();
+	return m;
+}
+
+/*
+ * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
+ *                         current mount namespace.
+ *
+ * The common case is dentries are not mountpoints at all and that
+ * test is handled inline.  For the slow case when we are actually
+ * dealing with a mountpoint of some kind, walk through all of the
+ * mounts in the current mount namespace and test to see if the dentry
+ * is a mountpoint.
+ *
+ * The mount_hashtable is not usable in the context because we
+ * need to identify all mounts that may be in the current mount
+ * namespace not just a mount that happens to have some specified
+ * parent mount.
+ */
+bool __is_local_mountpoint(struct dentry *dentry)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct mount *mnt;
+	bool is_covered = false;
+
+	if (!d_mountpoint(dentry))
+		goto out;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		is_covered = (mnt->mnt_mountpoint == dentry);
+		if (is_covered)
+			break;
+	}
+	up_read(&namespace_sem);
+out:
+	return is_covered;
+}
+
+static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
+{
+	struct hlist_head *chain = mp_hash(dentry);
+	struct mountpoint *mp;
+
+	hlist_for_each_entry(mp, chain, m_hash) {
+		if (mp->m_dentry == dentry) {
+			/* might be worth a WARN_ON() */
+			if (d_unlinked(dentry))
+				return ERR_PTR(-ENOENT);
+			mp->m_count++;
+			return mp;
+		}
+	}
+	return NULL;
+}
+
+static struct mountpoint *new_mountpoint(struct dentry *dentry)
+{
+	struct hlist_head *chain = mp_hash(dentry);
+	struct mountpoint *mp;
+	int ret;
+
+	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+	if (!mp)
+		return ERR_PTR(-ENOMEM);
+
+	ret = d_set_mounted(dentry);
+	if (ret) {
+		kfree(mp);
+		return ERR_PTR(ret);
+	}
+
+	mp->m_dentry = dentry;
+	mp->m_count = 1;
+	hlist_add_head(&mp->m_hash, chain);
+	INIT_HLIST_HEAD(&mp->m_list);
+	return mp;
+}
+
+static void put_mountpoint(struct mountpoint *mp)
+{
+	if (!--mp->m_count) {
+		struct dentry *dentry = mp->m_dentry;
+		BUG_ON(!hlist_empty(&mp->m_list));
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags &= ~DCACHE_MOUNTED;
+		spin_unlock(&dentry->d_lock);
+		hlist_del(&mp->m_hash);
+		kfree(mp);
+	}
+}
+
+static inline int check_mnt(struct mount *mnt)
+{
+	return mnt->mnt_ns == current->nsproxy->mnt_ns;
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void touch_mnt_namespace(struct mnt_namespace *ns)
+{
+	if (ns) {
+		ns->event = ++event;
+		wake_up_interruptible(&ns->poll);
+	}
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void __touch_mnt_namespace(struct mnt_namespace *ns)
+{
+	if (ns && ns->event != event) {
+		ns->event = event;
+		wake_up_interruptible(&ns->poll);
+	}
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void unhash_mnt(struct mount *mnt)
+{
+	mnt->mnt_parent = mnt;
+	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
+	list_del_init(&mnt->mnt_child);
+	hlist_del_init_rcu(&mnt->mnt_hash);
+	hlist_del_init(&mnt->mnt_mp_list);
+	put_mountpoint(mnt->mnt_mp);
+	mnt->mnt_mp = NULL;
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void detach_mnt(struct mount *mnt, struct path *old_path)
+{
+	old_path->dentry = mnt->mnt_mountpoint;
+	old_path->mnt = &mnt->mnt_parent->mnt;
+	unhash_mnt(mnt);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void umount_mnt(struct mount *mnt)
+{
+	/* old mountpoint will be dropped when we can do that */
+	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
+	unhash_mnt(mnt);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+void mnt_set_mountpoint(struct mount *mnt,
+			struct mountpoint *mp,
+			struct mount *child_mnt)
+{
+	mp->m_count++;
+	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
+	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
+	child_mnt->mnt_parent = mnt;
+	child_mnt->mnt_mp = mp;
+	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void attach_mnt(struct mount *mnt,
+			struct mount *parent,
+			struct mountpoint *mp)
+{
+	mnt_set_mountpoint(parent, mp, mnt);
+	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+}
+
+static void attach_shadowed(struct mount *mnt,
+			struct mount *parent,
+			struct mount *shadows)
+{
+	if (shadows) {
+		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
+		list_add(&mnt->mnt_child, &shadows->mnt_child);
+	} else {
+		hlist_add_head_rcu(&mnt->mnt_hash,
+				m_hash(&parent->mnt, mnt->mnt_mountpoint));
+		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	}
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void commit_tree(struct mount *mnt, struct mount *shadows)
+{
+	struct mount *parent = mnt->mnt_parent;
+	struct mount *m;
+	LIST_HEAD(head);
+	struct mnt_namespace *n = parent->mnt_ns;
+
+	BUG_ON(parent == mnt);
+
+	list_add_tail(&head, &mnt->mnt_list);
+	list_for_each_entry(m, &head, mnt_list)
+		m->mnt_ns = n;
+
+	list_splice(&head, n->list.prev);
+
+	attach_shadowed(mnt, parent, shadows);
+	touch_mnt_namespace(n);
+}
+
+static struct mount *next_mnt(struct mount *p, struct mount *root)
+{
+	struct list_head *next = p->mnt_mounts.next;
+	if (next == &p->mnt_mounts) {
+		while (1) {
+			if (p == root)
+				return NULL;
+			next = p->mnt_child.next;
+			if (next != &p->mnt_parent->mnt_mounts)
+				break;
+			p = p->mnt_parent;
+		}
+	}
+	return list_entry(next, struct mount, mnt_child);
+}
+
+static struct mount *skip_mnt_tree(struct mount *p)
+{
+	struct list_head *prev = p->mnt_mounts.prev;
+	while (prev != &p->mnt_mounts) {
+		p = list_entry(prev, struct mount, mnt_child);
+		prev = p->mnt_mounts.prev;
+	}
+	return p;
+}
+
+struct vfsmount *
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+{
+	struct mount *mnt;
+	struct dentry *root;
+
+	if (!type)
+		return ERR_PTR(-ENODEV);
+
+	mnt = alloc_vfsmnt(name);
+	if (!mnt)
+		return ERR_PTR(-ENOMEM);
+
+	if (flags & MS_KERNMOUNT)
+		mnt->mnt.mnt_flags = MNT_INTERNAL;
+
+	root = mount_fs(type, flags, name, data);
+	if (IS_ERR(root)) {
+		mnt_free_id(mnt);
+		free_vfsmnt(mnt);
+		return ERR_CAST(root);
+	}
+
+	mnt->mnt.mnt_root = root;
+	mnt->mnt.mnt_sb = root->d_sb;
+	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
+	mnt->mnt_parent = mnt;
+	lock_mount_hash();
+	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
+	unlock_mount_hash();
+	return &mnt->mnt;
+}
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
+static struct mount *clone_mnt(struct mount *old, struct dentry *root,
+					int flag)
+{
+	struct super_block *sb = old->mnt.mnt_sb;
+	struct mount *mnt;
+	int err;
+
+	mnt = alloc_vfsmnt(old->mnt_devname);
+	if (!mnt)
+		return ERR_PTR(-ENOMEM);
+
+	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
+		mnt->mnt_group_id = 0; /* not a peer of original */
+	else
+		mnt->mnt_group_id = old->mnt_group_id;
+
+	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
+		err = mnt_alloc_group_id(mnt);
+		if (err)
+			goto out_free;
+	}
+
+	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
+	/* Don't allow unprivileged users to change mount flags */
+	if (flag & CL_UNPRIVILEGED) {
+		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
+
+		if (mnt->mnt.mnt_flags & MNT_READONLY)
+			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+
+		if (mnt->mnt.mnt_flags & MNT_NODEV)
+			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
+
+		if (mnt->mnt.mnt_flags & MNT_NOSUID)
+			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
+
+		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
+			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
+	}
+
+	/* Don't allow unprivileged users to reveal what is under a mount */
+	if ((flag & CL_UNPRIVILEGED) &&
+	    (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
+		mnt->mnt.mnt_flags |= MNT_LOCKED;
+
+	atomic_inc(&sb->s_active);
+	mnt->mnt.mnt_sb = sb;
+	mnt->mnt.mnt_root = dget(root);
+	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
+	mnt->mnt_parent = mnt;
+	lock_mount_hash();
+	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
+	unlock_mount_hash();
+
+	if ((flag & CL_SLAVE) ||
+	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
+		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+		mnt->mnt_master = old;
+		CLEAR_MNT_SHARED(mnt);
+	} else if (!(flag & CL_PRIVATE)) {
+		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
+			list_add(&mnt->mnt_share, &old->mnt_share);
+		if (IS_MNT_SLAVE(old))
+			list_add(&mnt->mnt_slave, &old->mnt_slave);
+		mnt->mnt_master = old->mnt_master;
+	}
+	if (flag & CL_MAKE_SHARED)
+		set_mnt_shared(mnt);
+
+	/* stick the duplicate mount on the same expiry list
+	 * as the original if that was on one */
+	if (flag & CL_EXPIRE) {
+		if (!list_empty(&old->mnt_expire))
+			list_add(&mnt->mnt_expire, &old->mnt_expire);
+	}
+
+	return mnt;
+
+ out_free:
+	mnt_free_id(mnt);
+	free_vfsmnt(mnt);
+	return ERR_PTR(err);
+}
+
+static void cleanup_mnt(struct mount *mnt)
+{
+	/*
+	 * This probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair.  If this
+	 * happens, the filesystem was probably unable
+	 * to make r/w->r/o transitions.
+	 */
+	/*
+	 * The locking used to deal with mnt_count decrement provides barriers,
+	 * so mnt_get_writers() below is safe.
+	 */
+	WARN_ON(mnt_get_writers(mnt));
+	if (unlikely(mnt->mnt_pins.first))
+		mnt_pin_kill(mnt);
+	fsnotify_vfsmount_delete(&mnt->mnt);
+	dput(mnt->mnt.mnt_root);
+	deactivate_super(mnt->mnt.mnt_sb);
+	mnt_free_id(mnt);
+	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
+}
+
+static void __cleanup_mnt(struct rcu_head *head)
+{
+	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
+}
+
+static LLIST_HEAD(delayed_mntput_list);
+static void delayed_mntput(struct work_struct *unused)
+{
+	struct llist_node *node = llist_del_all(&delayed_mntput_list);
+	struct llist_node *next;
+
+	for (; node; node = next) {
+		next = llist_next(node);
+		cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
+	}
+}
+static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
+
+static void mntput_no_expire(struct mount *mnt)
+{
+	rcu_read_lock();
+	mnt_add_count(mnt, -1);
+	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+		rcu_read_unlock();
+		return;
+	}
+	lock_mount_hash();
+	if (mnt_get_count(mnt)) {
+		rcu_read_unlock();
+		unlock_mount_hash();
+		return;
+	}
+	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+		rcu_read_unlock();
+		unlock_mount_hash();
+		return;
+	}
+	mnt->mnt.mnt_flags |= MNT_DOOMED;
+	rcu_read_unlock();
+
+	list_del(&mnt->mnt_instance);
+
+	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
+		struct mount *p, *tmp;
+		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
+			umount_mnt(p);
+		}
+	}
+	unlock_mount_hash();
+
+	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
+		struct task_struct *task = current;
+		if (likely(!(task->flags & PF_KTHREAD))) {
+			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
+			if (!task_work_add(task, &mnt->mnt_rcu, true))
+				return;
+		}
+		if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
+			schedule_delayed_work(&delayed_mntput_work, 1);
+		return;
+	}
+	cleanup_mnt(mnt);
+}
+
+void mntput(struct vfsmount *mnt)
+{
+	if (mnt) {
+		struct mount *m = real_mount(mnt);
+		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+		if (unlikely(m->mnt_expiry_mark))
+			m->mnt_expiry_mark = 0;
+		mntput_no_expire(m);
+	}
+}
+EXPORT_SYMBOL(mntput);
+
+struct vfsmount *mntget(struct vfsmount *mnt)
+{
+	if (mnt)
+		mnt_add_count(real_mount(mnt), 1);
+	return mnt;
+}
+EXPORT_SYMBOL(mntget);
+
+struct vfsmount *mnt_clone_internal(struct path *path)
+{
+	struct mount *p;
+	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
+	if (IS_ERR(p))
+		return ERR_CAST(p);
+	p->mnt.mnt_flags |= MNT_INTERNAL;
+	return &p->mnt;
+}
+
+static inline void mangle(struct seq_file *m, const char *s)
+{
+	seq_escape(m, s, " \t\n\\");
+}
+
+/*
+ * Simple .show_options callback for filesystems which don't want to
+ * implement more complex mount option showing.
+ *
+ * See also save_mount_options().
+ */
+int generic_show_options(struct seq_file *m, struct dentry *root)
+{
+	const char *options;
+
+	rcu_read_lock();
+	options = rcu_dereference(root->d_sb->s_options);
+
+	if (options != NULL && options[0]) {
+		seq_putc(m, ',');
+		mangle(m, options);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+EXPORT_SYMBOL(generic_show_options);
+
+/*
+ * If filesystem uses generic_show_options(), this function should be
+ * called from the fill_super() callback.
+ *
+ * The .remount_fs callback usually needs to be handled in a special
+ * way, to make sure, that previous options are not overwritten if the
+ * remount fails.
+ *
+ * Also note, that if the filesystem's .remount_fs function doesn't
+ * reset all options to their default value, but changes only newly
+ * given options, then the displayed options will not reflect reality
+ * any more.
+ */
+void save_mount_options(struct super_block *sb, char *options)
+{
+	BUG_ON(sb->s_options);
+	rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
+}
+EXPORT_SYMBOL(save_mount_options);
+
+void replace_mount_options(struct super_block *sb, char *options)
+{
+	char *old = sb->s_options;
+	rcu_assign_pointer(sb->s_options, options);
+	if (old) {
+		synchronize_rcu();
+		kfree(old);
+	}
+}
+EXPORT_SYMBOL(replace_mount_options);
+
+#ifdef CONFIG_PROC_FS
+/* iterator; we want it to have access to namespace_sem, thus here... */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+	struct proc_mounts *p = proc_mounts(m);
+
+	down_read(&namespace_sem);
+	if (p->cached_event == p->ns->event) {
+		void *v = p->cached_mount;
+		if (*pos == p->cached_index)
+			return v;
+		if (*pos == p->cached_index + 1) {
+			v = seq_list_next(v, &p->ns->list, &p->cached_index);
+			return p->cached_mount = v;
+		}
+	}
+
+	p->cached_event = p->ns->event;
+	p->cached_mount = seq_list_start(&p->ns->list, *pos);
+	p->cached_index = *pos;
+	return p->cached_mount;
+}
+
+static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct proc_mounts *p = proc_mounts(m);
+
+	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
+	p->cached_index = *pos;
+	return p->cached_mount;
+}
+
+static void m_stop(struct seq_file *m, void *v)
+{
+	up_read(&namespace_sem);
+}
+
+static int m_show(struct seq_file *m, void *v)
+{
+	struct proc_mounts *p = proc_mounts(m);
+	struct mount *r = list_entry(v, struct mount, mnt_list);
+	return p->show(m, &r->mnt);
+}
+
+const struct seq_operations mounts_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= m_show,
+};
+#endif  /* CONFIG_PROC_FS */
+
+/**
+ * may_umount_tree - check if a mount tree is busy
+ * @mnt: root of mount tree
+ *
+ * This is called to check if a tree of mounts has any
+ * open files, pwds, chroots or sub mounts that are
+ * busy.
+ */
+int may_umount_tree(struct vfsmount *m)
+{
+	struct mount *mnt = real_mount(m);
+	int actual_refs = 0;
+	int minimum_refs = 0;
+	struct mount *p;
+	BUG_ON(!m);
+
+	/* write lock needed for mnt_get_count */
+	lock_mount_hash();
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		actual_refs += mnt_get_count(p);
+		minimum_refs += 2;
+	}
+	unlock_mount_hash();
+
+	if (actual_refs > minimum_refs)
+		return 0;
+
+	return 1;
+}
+
+EXPORT_SYMBOL(may_umount_tree);
+
+/**
+ * may_umount - check if a mount point is busy
+ * @mnt: root of mount
+ *
+ * This is called to check if a mount point has any
+ * open files, pwds, chroots or sub mounts. If the
+ * mount has sub mounts this will return busy
+ * regardless of whether the sub mounts are busy.
+ *
+ * Doesn't take quota and stuff into account. IOW, in some cases it will
+ * give false negatives. The main reason why it's here is that we need
+ * a non-destructive way to look for easily umountable filesystems.
+ */
+int may_umount(struct vfsmount *mnt)
+{
+	int ret = 1;
+	down_read(&namespace_sem);
+	lock_mount_hash();
+	if (propagate_mount_busy(real_mount(mnt), 2))
+		ret = 0;
+	unlock_mount_hash();
+	up_read(&namespace_sem);
+	return ret;
+}
+
+EXPORT_SYMBOL(may_umount);
+
+static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
+
+static void namespace_unlock(void)
+{
+	struct hlist_head head;
+
+	hlist_move_list(&unmounted, &head);
+
+	up_write(&namespace_sem);
+
+	if (likely(hlist_empty(&head)))
+		return;
+
+	synchronize_rcu();
+
+	group_pin_kill(&head);
+}
+
+static inline void namespace_lock(void)
+{
+	down_write(&namespace_sem);
+}
+
+enum umount_tree_flags {
+	UMOUNT_SYNC = 1,
+	UMOUNT_PROPAGATE = 2,
+	UMOUNT_CONNECTED = 4,
+};
+/*
+ * mount_lock must be held
+ * namespace_sem must be held for write
+ */
+static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
+{
+	LIST_HEAD(tmp_list);
+	struct mount *p;
+
+	if (how & UMOUNT_PROPAGATE)
+		propagate_mount_unlock(mnt);
+
+	/* Gather the mounts to umount */
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		p->mnt.mnt_flags |= MNT_UMOUNT;
+		list_move(&p->mnt_list, &tmp_list);
+	}
+
+	/* Hide the mounts from mnt_mounts */
+	list_for_each_entry(p, &tmp_list, mnt_list) {
+		list_del_init(&p->mnt_child);
+	}
+
+	/* Add propogated mounts to the tmp_list */
+	if (how & UMOUNT_PROPAGATE)
+		propagate_umount(&tmp_list);
+
+	while (!list_empty(&tmp_list)) {
+		bool disconnect;
+		p = list_first_entry(&tmp_list, struct mount, mnt_list);
+		list_del_init(&p->mnt_expire);
+		list_del_init(&p->mnt_list);
+		__touch_mnt_namespace(p->mnt_ns);
+		p->mnt_ns = NULL;
+		if (how & UMOUNT_SYNC)
+			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
+
+		disconnect = !(((how & UMOUNT_CONNECTED) &&
+				mnt_has_parent(p) &&
+				(p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) ||
+			       IS_MNT_LOCKED_AND_LAZY(p));
+
+		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
+				 disconnect ? &unmounted : NULL);
+		if (mnt_has_parent(p)) {
+			mnt_add_count(p->mnt_parent, -1);
+			if (!disconnect) {
+				/* Don't forget about p */
+				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
+			} else {
+				umount_mnt(p);
+			}
+		}
+		change_mnt_propagation(p, MS_PRIVATE);
+	}
+}
+
+static void shrink_submounts(struct mount *mnt);
+
+static int do_umount(struct mount *mnt, int flags)
+{
+	struct super_block *sb = mnt->mnt.mnt_sb;
+	int retval;
+
+	retval = security_sb_umount(&mnt->mnt, flags);
+	if (retval)
+		return retval;
+
+	/*
+	 * Allow userspace to request a mountpoint be expired rather than
+	 * unmounting unconditionally. Unmount only happens if:
+	 *  (1) the mark is already set (the mark is cleared by mntput())
+	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
+	 */
+	if (flags & MNT_EXPIRE) {
+		if (&mnt->mnt == current->fs->root.mnt ||
+		    flags & (MNT_FORCE | MNT_DETACH))
+			return -EINVAL;
+
+		/*
+		 * probably don't strictly need the lock here if we examined
+		 * all race cases, but it's a slowpath.
+		 */
+		lock_mount_hash();
+		if (mnt_get_count(mnt) != 2) {
+			unlock_mount_hash();
+			return -EBUSY;
+		}
+		unlock_mount_hash();
+
+		if (!xchg(&mnt->mnt_expiry_mark, 1))
+			return -EAGAIN;
+	}
+
+	/*
+	 * If we may have to abort operations to get out of this
+	 * mount, and they will themselves hold resources we must
+	 * allow the fs to do things. In the Unix tradition of
+	 * 'Gee thats tricky lets do it in userspace' the umount_begin
+	 * might fail to complete on the first run through as other tasks
+	 * must return, and the like. Thats for the mount program to worry
+	 * about for the moment.
+	 */
+
+	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
+		sb->s_op->umount_begin(sb);
+	}
+
+	/*
+	 * No sense to grab the lock for this test, but test itself looks
+	 * somewhat bogus. Suggestions for better replacement?
+	 * Ho-hum... In principle, we might treat that as umount + switch
+	 * to rootfs. GC would eventually take care of the old vfsmount.
+	 * Actually it makes sense, especially if rootfs would contain a
+	 * /reboot - static binary that would close all descriptors and
+	 * call reboot(9). Then init(8) could umount root and exec /reboot.
+	 */
+	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
+		/*
+		 * Special case for "unmounting" root ...
+		 * we just try to remount it readonly.
+		 */
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		down_write(&sb->s_umount);
+		if (!(sb->s_flags & MS_RDONLY))
+			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
+		up_write(&sb->s_umount);
+		return retval;
+	}
+
+	namespace_lock();
+	lock_mount_hash();
+	event++;
+
+	if (flags & MNT_DETACH) {
+		if (!list_empty(&mnt->mnt_list))
+			umount_tree(mnt, UMOUNT_PROPAGATE);
+		retval = 0;
+	} else {
+		shrink_submounts(mnt);
+		retval = -EBUSY;
+		if (!propagate_mount_busy(mnt, 2)) {
+			if (!list_empty(&mnt->mnt_list))
+				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
+			retval = 0;
+		}
+	}
+	unlock_mount_hash();
+	namespace_unlock();
+	return retval;
+}
+
+/*
+ * __detach_mounts - lazily unmount all mounts on the specified dentry
+ *
+ * During unlink, rmdir, and d_drop it is possible to loose the path
+ * to an existing mountpoint, and wind up leaking the mount.
+ * detach_mounts allows lazily unmounting those mounts instead of
+ * leaking them.
+ *
+ * The caller may hold dentry->d_inode->i_mutex.
+ */
+void __detach_mounts(struct dentry *dentry)
+{
+	struct mountpoint *mp;
+	struct mount *mnt;
+
+	namespace_lock();
+	mp = lookup_mountpoint(dentry);
+	if (IS_ERR_OR_NULL(mp))
+		goto out_unlock;
+
+	lock_mount_hash();
+	while (!hlist_empty(&mp->m_list)) {
+		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
+		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
+			struct mount *p, *tmp;
+			list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
+				hlist_add_head(&p->mnt_umount.s_list, &unmounted);
+				umount_mnt(p);
+			}
+		}
+		else umount_tree(mnt, UMOUNT_CONNECTED);
+	}
+	unlock_mount_hash();
+	put_mountpoint(mp);
+out_unlock:
+	namespace_unlock();
+}
+
+/* 
+ * Is the caller allowed to modify his namespace?
+ */
+static inline bool may_mount(void)
+{
+	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
+}
+
+/*
+ * Now umount can handle mount points as well as block devices.
+ * This is important for filesystems which use unnamed block devices.
+ *
+ * We now support a flag for forced unmount like the other 'big iron'
+ * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
+ */
+
+SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
+{
+	struct path path;
+	struct mount *mnt;
+	int retval;
+	int lookup_flags = 0;
+
+	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
+		return -EINVAL;
+
+	if (!may_mount())
+		return -EPERM;
+
+	if (!(flags & UMOUNT_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+
+	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
+	if (retval)
+		goto out;
+	mnt = real_mount(path.mnt);
+	retval = -EINVAL;
+	if (path.dentry != path.mnt->mnt_root)
+		goto dput_and_out;
+	if (!check_mnt(mnt))
+		goto dput_and_out;
+	if (mnt->mnt.mnt_flags & MNT_LOCKED)
+		goto dput_and_out;
+	retval = -EPERM;
+	if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+		goto dput_and_out;
+
+	retval = do_umount(mnt, flags);
+dput_and_out:
+	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
+	dput(path.dentry);
+	mntput_no_expire(mnt);
+out:
+	return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_OLDUMOUNT
+
+/*
+ *	The 2.0 compatible umount. No flags.
+ */
+SYSCALL_DEFINE1(oldumount, char __user *, name)
+{
+	return sys_umount(name, 0);
+}
+
+#endif
+
+static bool is_mnt_ns_file(struct dentry *dentry)
+{
+	/* Is this a proxy for a mount namespace? */
+	return dentry->d_op == &ns_dentry_operations &&
+	       dentry->d_fsdata == &mntns_operations;
+}
+
+struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+{
+	return container_of(ns, struct mnt_namespace, ns);
+}
+
+static bool mnt_ns_loop(struct dentry *dentry)
+{
+	/* Could bind mounting the mount namespace inode cause a
+	 * mount namespace loop?
+	 */
+	struct mnt_namespace *mnt_ns;
+	if (!is_mnt_ns_file(dentry))
+		return false;
+
+	mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
+	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
+}
+
+struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
+					int flag)
+{
+	struct mount *res, *p, *q, *r, *parent;
+
+	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
+		return ERR_PTR(-EINVAL);
+
+	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
+		return ERR_PTR(-EINVAL);
+
+	res = q = clone_mnt(mnt, dentry, flag);
+	if (IS_ERR(q))
+		return q;
+
+	q->mnt_mountpoint = mnt->mnt_mountpoint;
+
+	p = mnt;
+	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
+		struct mount *s;
+		if (!is_subdir(r->mnt_mountpoint, dentry))
+			continue;
+
+		for (s = r; s; s = next_mnt(s, r)) {
+			struct mount *t = NULL;
+			if (!(flag & CL_COPY_UNBINDABLE) &&
+			    IS_MNT_UNBINDABLE(s)) {
+				s = skip_mnt_tree(s);
+				continue;
+			}
+			if (!(flag & CL_COPY_MNT_NS_FILE) &&
+			    is_mnt_ns_file(s->mnt.mnt_root)) {
+				s = skip_mnt_tree(s);
+				continue;
+			}
+			while (p != s->mnt_parent) {
+				p = p->mnt_parent;
+				q = q->mnt_parent;
+			}
+			p = s;
+			parent = q;
+			q = clone_mnt(p, p->mnt.mnt_root, flag);
+			if (IS_ERR(q))
+				goto out;
+			lock_mount_hash();
+			list_add_tail(&q->mnt_list, &res->mnt_list);
+			mnt_set_mountpoint(parent, p->mnt_mp, q);
+			if (!list_empty(&parent->mnt_mounts)) {
+				t = list_last_entry(&parent->mnt_mounts,
+					struct mount, mnt_child);
+				if (t->mnt_mp != p->mnt_mp)
+					t = NULL;
+			}
+			attach_shadowed(q, parent, t);
+			unlock_mount_hash();
+		}
+	}
+	return res;
+out:
+	if (res) {
+		lock_mount_hash();
+		umount_tree(res, UMOUNT_SYNC);
+		unlock_mount_hash();
+	}
+	return q;
+}
+
+/* Caller should check returned pointer for errors */
+
+struct vfsmount *collect_mounts(struct path *path)
+{
+	struct mount *tree;
+	namespace_lock();
+	if (!check_mnt(real_mount(path->mnt)))
+		tree = ERR_PTR(-EINVAL);
+	else
+		tree = copy_tree(real_mount(path->mnt), path->dentry,
+				 CL_COPY_ALL | CL_PRIVATE);
+	namespace_unlock();
+	if (IS_ERR(tree))
+		return ERR_CAST(tree);
+	return &tree->mnt;
+}
+
+void drop_collected_mounts(struct vfsmount *mnt)
+{
+	namespace_lock();
+	lock_mount_hash();
+	umount_tree(real_mount(mnt), UMOUNT_SYNC);
+	unlock_mount_hash();
+	namespace_unlock();
+}
+
+/**
+ * clone_private_mount - create a private clone of a path
+ *
+ * This creates a new vfsmount, which will be the clone of @path.  The new will
+ * not be attached anywhere in the namespace and will be private (i.e. changes
+ * to the originating mount won't be propagated into this).
+ *
+ * Release with mntput().
+ */
+struct vfsmount *clone_private_mount(struct path *path)
+{
+	struct mount *old_mnt = real_mount(path->mnt);
+	struct mount *new_mnt;
+
+	if (IS_MNT_UNBINDABLE(old_mnt))
+		return ERR_PTR(-EINVAL);
+
+	down_read(&namespace_sem);
+	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+	up_read(&namespace_sem);
+	if (IS_ERR(new_mnt))
+		return ERR_CAST(new_mnt);
+
+	return &new_mnt->mnt;
+}
+EXPORT_SYMBOL_GPL(clone_private_mount);
+
+int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
+		   struct vfsmount *root)
+{
+	struct mount *mnt;
+	int res = f(root, arg);
+	if (res)
+		return res;
+	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
+		res = f(&mnt->mnt, arg);
+		if (res)
+			return res;
+	}
+	return 0;
+}
+
+static void cleanup_group_ids(struct mount *mnt, struct mount *end)
+{
+	struct mount *p;
+
+	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
+		if (p->mnt_group_id && !IS_MNT_SHARED(p))
+			mnt_release_group_id(p);
+	}
+}
+
+static int invent_group_ids(struct mount *mnt, bool recurse)
+{
+	struct mount *p;
+
+	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
+		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
+			int err = mnt_alloc_group_id(p);
+			if (err) {
+				cleanup_group_ids(mnt, p);
+				return err;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *  @source_mnt : mount tree to be attached
+ *  @nd         : place the mount tree @source_mnt is attached
+ *  @parent_nd  : if non-null, detach the source_mnt from its parent and
+ *  		   store the parent mount and mountpoint dentry.
+ *  		   (done when source_mnt is moved)
+ *
+ *  NOTE: in the table below explains the semantics when a source mount
+ *  of a given type is attached to a destination mount of a given type.
+ * ---------------------------------------------------------------------------
+ * |         BIND MOUNT OPERATION                                            |
+ * |**************************************************************************
+ * | source-->| shared        |       private  |       slave    | unbindable |
+ * | dest     |               |                |                |            |
+ * |   |      |               |                |                |            |
+ * |   v      |               |                |                |            |
+ * |**************************************************************************
+ * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
+ * |          |               |                |                |            |
+ * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
+ * ***************************************************************************
+ * A bind operation clones the source mount and mounts the clone on the
+ * destination mount.
+ *
+ * (++)  the cloned mount is propagated to all the mounts in the propagation
+ * 	 tree of the destination mount and the cloned mount is added to
+ * 	 the peer group of the source mount.
+ * (+)   the cloned mount is created under the destination mount and is marked
+ *       as shared. The cloned mount is added to the peer group of the source
+ *       mount.
+ * (+++) the mount is propagated to all the mounts in the propagation tree
+ *       of the destination mount and the cloned mount is made slave
+ *       of the same master as that of the source mount. The cloned mount
+ *       is marked as 'shared and slave'.
+ * (*)   the cloned mount is made a slave of the same master as that of the
+ * 	 source mount.
+ *
+ * ---------------------------------------------------------------------------
+ * |         		MOVE MOUNT OPERATION                                 |
+ * |**************************************************************************
+ * | source-->| shared        |       private  |       slave    | unbindable |
+ * | dest     |               |                |                |            |
+ * |   |      |               |                |                |            |
+ * |   v      |               |                |                |            |
+ * |**************************************************************************
+ * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
+ * |          |               |                |                |            |
+ * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
+ * ***************************************************************************
+ *
+ * (+)  the mount is moved to the destination. And is then propagated to
+ * 	all the mounts in the propagation tree of the destination mount.
+ * (+*)  the mount is moved to the destination.
+ * (+++)  the mount is moved to the destination and is then propagated to
+ * 	all the mounts belonging to the destination mount's propagation tree.
+ * 	the mount is marked as 'shared and slave'.
+ * (*)	the mount continues to be a slave at the new location.
+ *
+ * if the source mount is a tree, the operations explained above is
+ * applied to each mount in the tree.
+ * Must be called without spinlocks held, since this function can sleep
+ * in allocations.
+ */
+static int attach_recursive_mnt(struct mount *source_mnt,
+			struct mount *dest_mnt,
+			struct mountpoint *dest_mp,
+			struct path *parent_path)
+{
+	HLIST_HEAD(tree_list);
+	struct mount *child, *p;
+	struct hlist_node *n;
+	int err;
+
+	if (IS_MNT_SHARED(dest_mnt)) {
+		err = invent_group_ids(source_mnt, true);
+		if (err)
+			goto out;
+		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
+		lock_mount_hash();
+		if (err)
+			goto out_cleanup_ids;
+		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+			set_mnt_shared(p);
+	} else {
+		lock_mount_hash();
+	}
+	if (parent_path) {
+		detach_mnt(source_mnt, parent_path);
+		attach_mnt(source_mnt, dest_mnt, dest_mp);
+		touch_mnt_namespace(source_mnt->mnt_ns);
+	} else {
+		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
+		commit_tree(source_mnt, NULL);
+	}
+
+	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
+		struct mount *q;
+		hlist_del_init(&child->mnt_hash);
+		q = __lookup_mnt_last(&child->mnt_parent->mnt,
+				      child->mnt_mountpoint);
+		commit_tree(child, q);
+	}
+	unlock_mount_hash();
+
+	return 0;
+
+ out_cleanup_ids:
+	while (!hlist_empty(&tree_list)) {
+		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+		umount_tree(child, UMOUNT_SYNC);
+	}
+	unlock_mount_hash();
+	cleanup_group_ids(source_mnt, NULL);
+ out:
+	return err;
+}
+
+static struct mountpoint *lock_mount(struct path *path)
+{
+	struct vfsmount *mnt;
+	struct dentry *dentry = path->dentry;
+retry:
+	mutex_lock(&dentry->d_inode->i_mutex);
+	if (unlikely(cant_mount(dentry))) {
+		mutex_unlock(&dentry->d_inode->i_mutex);
+		return ERR_PTR(-ENOENT);
+	}
+	namespace_lock();
+	mnt = lookup_mnt(path);
+	if (likely(!mnt)) {
+		struct mountpoint *mp = lookup_mountpoint(dentry);
+		if (!mp)
+			mp = new_mountpoint(dentry);
+		if (IS_ERR(mp)) {
+			namespace_unlock();
+			mutex_unlock(&dentry->d_inode->i_mutex);
+			return mp;
+		}
+		return mp;
+	}
+	namespace_unlock();
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	path_put(path);
+	path->mnt = mnt;
+	dentry = path->dentry = dget(mnt->mnt_root);
+	goto retry;
+}
+
+static void unlock_mount(struct mountpoint *where)
+{
+	struct dentry *dentry = where->m_dentry;
+	put_mountpoint(where);
+	namespace_unlock();
+	mutex_unlock(&dentry->d_inode->i_mutex);
+}
+
+static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
+{
+	if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
+		return -EINVAL;
+
+	if (d_is_dir(mp->m_dentry) !=
+	      d_is_dir(mnt->mnt.mnt_root))
+		return -ENOTDIR;
+
+	return attach_recursive_mnt(mnt, p, mp, NULL);
+}
+
+/*
+ * Sanity check the flags to change_mnt_propagation.
+ */
+
+static int flags_to_propagation_type(int flags)
+{
+	int type = flags & ~(MS_REC | MS_SILENT);
+
+	/* Fail if any non-propagation flags are set */
+	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+		return 0;
+	/* Only one propagation flag should be set */
+	if (!is_power_of_2(type))
+		return 0;
+	return type;
+}
+
+/*
+ * recursively change the type of the mountpoint.
+ */
+static int do_change_type(struct path *path, int flag)
+{
+	struct mount *m;
+	struct mount *mnt = real_mount(path->mnt);
+	int recurse = flag & MS_REC;
+	int type;
+	int err = 0;
+
+	if (path->dentry != path->mnt->mnt_root)
+		return -EINVAL;
+
+	type = flags_to_propagation_type(flag);
+	if (!type)
+		return -EINVAL;
+
+	namespace_lock();
+	if (type == MS_SHARED) {
+		err = invent_group_ids(mnt, recurse);
+		if (err)
+			goto out_unlock;
+	}
+
+	lock_mount_hash();
+	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
+		change_mnt_propagation(m, type);
+	unlock_mount_hash();
+
+ out_unlock:
+	namespace_unlock();
+	return err;
+}
+
+static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+	struct mount *child;
+	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+		if (!is_subdir(child->mnt_mountpoint, dentry))
+			continue;
+
+		if (child->mnt.mnt_flags & MNT_LOCKED)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * do loopback mount.
+ */
+static int do_loopback(struct path *path, const char *old_name,
+				int recurse)
+{
+	struct path old_path;
+	struct mount *mnt = NULL, *old, *parent;
+	struct mountpoint *mp;
+	int err;
+	if (!old_name || !*old_name)
+		return -EINVAL;
+	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
+	if (err)
+		return err;
+
+	err = -EINVAL;
+	if (mnt_ns_loop(old_path.dentry))
+		goto out; 
+
+	mp = lock_mount(path);
+	err = PTR_ERR(mp);
+	if (IS_ERR(mp))
+		goto out;
+
+	old = real_mount(old_path.mnt);
+	parent = real_mount(path->mnt);
+
+	err = -EINVAL;
+	if (IS_MNT_UNBINDABLE(old))
+		goto out2;
+
+	if (!check_mnt(parent))
+		goto out2;
+
+	if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
+		goto out2;
+
+	if (!recurse && has_locked_children(old, old_path.dentry))
+		goto out2;
+
+	if (recurse)
+		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
+	else
+		mnt = clone_mnt(old, old_path.dentry, 0);
+
+	if (IS_ERR(mnt)) {
+		err = PTR_ERR(mnt);
+		goto out2;
+	}
+
+	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+
+	err = graft_tree(mnt, parent, mp);
+	if (err) {
+		lock_mount_hash();
+		umount_tree(mnt, UMOUNT_SYNC);
+		unlock_mount_hash();
+	}
+out2:
+	unlock_mount(mp);
+out:
+	path_put(&old_path);
+	return err;
+}
+
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+	int error = 0;
+	int readonly_request = 0;
+
+	if (ms_flags & MS_RDONLY)
+		readonly_request = 1;
+	if (readonly_request == __mnt_is_readonly(mnt))
+		return 0;
+
+	if (readonly_request)
+		error = mnt_make_readonly(real_mount(mnt));
+	else
+		__mnt_unmake_readonly(real_mount(mnt));
+	return error;
+}
+
+/*
+ * change filesystem flags. dir should be a physical root of filesystem.
+ * If you've mounted a non-root directory somewhere and want to do remount
+ * on it - tough luck.
+ */
+static int do_remount(struct path *path, int flags, int mnt_flags,
+		      void *data)
+{
+	int err;
+	struct super_block *sb = path->mnt->mnt_sb;
+	struct mount *mnt = real_mount(path->mnt);
+
+	if (!check_mnt(mnt))
+		return -EINVAL;
+
+	if (path->dentry != path->mnt->mnt_root)
+		return -EINVAL;
+
+	/* Don't allow changing of locked mnt flags.
+	 *
+	 * No locks need to be held here while testing the various
+	 * MNT_LOCK flags because those flags can never be cleared
+	 * once they are set.
+	 */
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
+	    !(mnt_flags & MNT_READONLY)) {
+		return -EPERM;
+	}
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
+	    !(mnt_flags & MNT_NODEV)) {
+		/* Was the nodev implicitly added in mount? */
+		if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
+		    !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+			mnt_flags |= MNT_NODEV;
+		} else {
+			return -EPERM;
+		}
+	}
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
+	    !(mnt_flags & MNT_NOSUID)) {
+		return -EPERM;
+	}
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
+	    !(mnt_flags & MNT_NOEXEC)) {
+		return -EPERM;
+	}
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
+	    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
+		return -EPERM;
+	}
+
+	err = security_sb_remount(sb, data);
+	if (err)
+		return err;
+
+	down_write(&sb->s_umount);
+	if (flags & MS_BIND)
+		err = change_mount_flags(path->mnt, flags);
+	else if (!capable(CAP_SYS_ADMIN))
+		err = -EPERM;
+	else
+		err = do_remount_sb(sb, flags, data, 0);
+	if (!err) {
+		lock_mount_hash();
+		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
+		mnt->mnt.mnt_flags = mnt_flags;
+		touch_mnt_namespace(mnt->mnt_ns);
+		unlock_mount_hash();
+	}
+	up_write(&sb->s_umount);
+	return err;
+}
+
+static inline int tree_contains_unbindable(struct mount *mnt)
+{
+	struct mount *p;
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		if (IS_MNT_UNBINDABLE(p))
+			return 1;
+	}
+	return 0;
+}
+
+static int do_move_mount(struct path *path, const char *old_name)
+{
+	struct path old_path, parent_path;
+	struct mount *p;
+	struct mount *old;
+	struct mountpoint *mp;
+	int err;
+	if (!old_name || !*old_name)
+		return -EINVAL;
+	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
+	if (err)
+		return err;
+
+	mp = lock_mount(path);
+	err = PTR_ERR(mp);
+	if (IS_ERR(mp))
+		goto out;
+
+	old = real_mount(old_path.mnt);
+	p = real_mount(path->mnt);
+
+	err = -EINVAL;
+	if (!check_mnt(p) || !check_mnt(old))
+		goto out1;
+
+	if (old->mnt.mnt_flags & MNT_LOCKED)
+		goto out1;
+
+	err = -EINVAL;
+	if (old_path.dentry != old_path.mnt->mnt_root)
+		goto out1;
+
+	if (!mnt_has_parent(old))
+		goto out1;
+
+	if (d_is_dir(path->dentry) !=
+	      d_is_dir(old_path.dentry))
+		goto out1;
+	/*
+	 * Don't move a mount residing in a shared parent.
+	 */
+	if (IS_MNT_SHARED(old->mnt_parent))
+		goto out1;
+	/*
+	 * Don't move a mount tree containing unbindable mounts to a destination
+	 * mount which is shared.
+	 */
+	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
+		goto out1;
+	err = -ELOOP;
+	for (; mnt_has_parent(p); p = p->mnt_parent)
+		if (p == old)
+			goto out1;
+
+	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
+	if (err)
+		goto out1;
+
+	/* if the mount is moved, it should no longer be expire
+	 * automatically */
+	list_del_init(&old->mnt_expire);
+out1:
+	unlock_mount(mp);
+out:
+	if (!err)
+		path_put(&parent_path);
+	path_put(&old_path);
+	return err;
+}
+
+static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
+{
+	int err;
+	const char *subtype = strchr(fstype, '.');
+	if (subtype) {
+		subtype++;
+		err = -EINVAL;
+		if (!subtype[0])
+			goto err;
+	} else
+		subtype = "";
+
+	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!mnt->mnt_sb->s_subtype)
+		goto err;
+	return mnt;
+
+ err:
+	mntput(mnt);
+	return ERR_PTR(err);
+}
+
+/*
+ * add a mount into a namespace's mount tree
+ */
+static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
+{
+	struct mountpoint *mp;
+	struct mount *parent;
+	int err;
+
+	mnt_flags &= ~MNT_INTERNAL_FLAGS;
+
+	mp = lock_mount(path);
+	if (IS_ERR(mp))
+		return PTR_ERR(mp);
+
+	parent = real_mount(path->mnt);
+	err = -EINVAL;
+	if (unlikely(!check_mnt(parent))) {
+		/* that's acceptable only for automounts done in private ns */
+		if (!(mnt_flags & MNT_SHRINKABLE))
+			goto unlock;
+		/* ... and for those we'd better have mountpoint still alive */
+		if (!parent->mnt_ns)
+			goto unlock;
+	}
+
+	/* Refuse the same filesystem on the same mount point */
+	err = -EBUSY;
+	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
+	    path->mnt->mnt_root == path->dentry)
+		goto unlock;
+
+	err = -EINVAL;
+	if (d_is_symlink(newmnt->mnt.mnt_root))
+		goto unlock;
+
+	newmnt->mnt.mnt_flags = mnt_flags;
+	err = graft_tree(newmnt, parent, mp);
+
+unlock:
+	unlock_mount(mp);
+	return err;
+}
+
+static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
+
+/*
+ * create a new mount for userspace and request it to be added into the
+ * namespace's tree
+ */
+static int do_new_mount(struct path *path, const char *fstype, int flags,
+			int mnt_flags, const char *name, void *data)
+{
+	struct file_system_type *type;
+	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+	struct vfsmount *mnt;
+	int err;
+
+	if (!fstype)
+		return -EINVAL;
+
+	type = get_fs_type(fstype);
+	if (!type)
+		return -ENODEV;
+
+	if (user_ns != &init_user_ns) {
+		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
+			put_filesystem(type);
+			return -EPERM;
+		}
+		/* Only in special cases allow devices from mounts
+		 * created outside the initial user namespace.
+		 */
+		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+			flags |= MS_NODEV;
+			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
+		}
+		if (type->fs_flags & FS_USERNS_VISIBLE) {
+			if (!fs_fully_visible(type, &mnt_flags))
+				return -EPERM;
+		}
+	}
+
+	mnt = vfs_kern_mount(type, flags, name, data);
+	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
+	    !mnt->mnt_sb->s_subtype)
+		mnt = fs_set_subtype(mnt, fstype);
+
+	put_filesystem(type);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	err = do_add_mount(real_mount(mnt), path, mnt_flags);
+	if (err)
+		mntput(mnt);
+	return err;
+}
+
+int finish_automount(struct vfsmount *m, struct path *path)
+{
+	struct mount *mnt = real_mount(m);
+	int err;
+	/* The new mount record should have at least 2 refs to prevent it being
+	 * expired before we get a chance to add it
+	 */
+	BUG_ON(mnt_get_count(mnt) < 2);
+
+	if (m->mnt_sb == path->mnt->mnt_sb &&
+	    m->mnt_root == path->dentry) {
+		err = -ELOOP;
+		goto fail;
+	}
+
+	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+	if (!err)
+		return 0;
+fail:
+	/* remove m from any expiration list it may be on */
+	if (!list_empty(&mnt->mnt_expire)) {
+		namespace_lock();
+		list_del_init(&mnt->mnt_expire);
+		namespace_unlock();
+	}
+	mntput(m);
+	mntput(m);
+	return err;
+}
+
+/**
+ * mnt_set_expiry - Put a mount on an expiration list
+ * @mnt: The mount to list.
+ * @expiry_list: The list to add the mount to.
+ */
+void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
+{
+	namespace_lock();
+
+	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
+
+	namespace_unlock();
+}
+EXPORT_SYMBOL(mnt_set_expiry);
+
+/*
+ * process a list of expirable mountpoints with the intent of discarding any
+ * mountpoints that aren't in use and haven't been touched since last we came
+ * here
+ */
+void mark_mounts_for_expiry(struct list_head *mounts)
+{
+	struct mount *mnt, *next;
+	LIST_HEAD(graveyard);
+
+	if (list_empty(mounts))
+		return;
+
+	namespace_lock();
+	lock_mount_hash();
+
+	/* extract from the expiration list every vfsmount that matches the
+	 * following criteria:
+	 * - only referenced by its parent vfsmount
+	 * - still marked for expiry (marked on the last call here; marks are
+	 *   cleared by mntput())
+	 */
+	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
+		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
+			propagate_mount_busy(mnt, 1))
+			continue;
+		list_move(&mnt->mnt_expire, &graveyard);
+	}
+	while (!list_empty(&graveyard)) {
+		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
+		touch_mnt_namespace(mnt->mnt_ns);
+		umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
+	}
+	unlock_mount_hash();
+	namespace_unlock();
+}
+
+EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+
+/*
+ * Ripoff of 'select_parent()'
+ *
+ * search the list of submounts for a given mountpoint, and move any
+ * shrinkable submounts to the 'graveyard' list.
+ */
+static int select_submounts(struct mount *parent, struct list_head *graveyard)
+{
+	struct mount *this_parent = parent;
+	struct list_head *next;
+	int found = 0;
+
+repeat:
+	next = this_parent->mnt_mounts.next;
+resume:
+	while (next != &this_parent->mnt_mounts) {
+		struct list_head *tmp = next;
+		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
+
+		next = tmp->next;
+		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
+			continue;
+		/*
+		 * Descend a level if the d_mounts list is non-empty.
+		 */
+		if (!list_empty(&mnt->mnt_mounts)) {
+			this_parent = mnt;
+			goto repeat;
+		}
+
+		if (!propagate_mount_busy(mnt, 1)) {
+			list_move_tail(&mnt->mnt_expire, graveyard);
+			found++;
+		}
+	}
+	/*
+	 * All done at this level ... ascend and resume the search
+	 */
+	if (this_parent != parent) {
+		next = this_parent->mnt_child.next;
+		this_parent = this_parent->mnt_parent;
+		goto resume;
+	}
+	return found;
+}
+
+/*
+ * process a list of expirable mountpoints with the intent of discarding any
+ * submounts of a specific parent mountpoint
+ *
+ * mount_lock must be held for write
+ */
+static void shrink_submounts(struct mount *mnt)
+{
+	LIST_HEAD(graveyard);
+	struct mount *m;
+
+	/* extract submounts of 'mountpoint' from the expiration list */
+	while (select_submounts(mnt, &graveyard)) {
+		while (!list_empty(&graveyard)) {
+			m = list_first_entry(&graveyard, struct mount,
+						mnt_expire);
+			touch_mnt_namespace(m->mnt_ns);
+			umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
+		}
+	}
+}
+
+/*
+ * Some copy_from_user() implementations do not return the exact number of
+ * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
+ * Note that this function differs from copy_from_user() in that it will oops
+ * on bad values of `to', rather than returning a short copy.
+ */
+static long exact_copy_from_user(void *to, const void __user * from,
+				 unsigned long n)
+{
+	char *t = to;
+	const char __user *f = from;
+	char c;
+
+	if (!access_ok(VERIFY_READ, from, n))
+		return n;
+
+	while (n) {
+		if (__get_user(c, f)) {
+			memset(t, 0, n);
+			break;
+		}
+		*t++ = c;
+		f++;
+		n--;
+	}
+	return n;
+}
+
+int copy_mount_options(const void __user * data, unsigned long *where)
+{
+	int i;
+	unsigned long page;
+	unsigned long size;
+
+	*where = 0;
+	if (!data)
+		return 0;
+
+	if (!(page = __get_free_page(GFP_KERNEL)))
+		return -ENOMEM;
+
+	/* We only care that *some* data at the address the user
+	 * gave us is valid.  Just in case, we'll zero
+	 * the remainder of the page.
+	 */
+	/* copy_from_user cannot cross TASK_SIZE ! */
+	size = TASK_SIZE - (unsigned long)data;
+	if (size > PAGE_SIZE)
+		size = PAGE_SIZE;
+
+	i = size - exact_copy_from_user((void *)page, data, size);
+	if (!i) {
+		free_page(page);
+		return -EFAULT;
+	}
+	if (i != PAGE_SIZE)
+		memset((char *)page + i, 0, PAGE_SIZE - i);
+	*where = page;
+	return 0;
+}
+
+char *copy_mount_string(const void __user *data)
+{
+	return data ? strndup_user(data, PAGE_SIZE) : NULL;
+}
+
+/*
+ * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
+ * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
+ *
+ * data is a (void *) that can point to any structure up to
+ * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
+ * information (or be NULL).
+ *
+ * Pre-0.97 versions of mount() didn't have a flags word.
+ * When the flags word was introduced its top half was required
+ * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
+ * Therefore, if this magic number is present, it carries no information
+ * and must be discarded.
+ */
+long do_mount(const char *dev_name, const char __user *dir_name,
+		const char *type_page, unsigned long flags, void *data_page)
+{
+	struct path path;
+	int retval = 0;
+	int mnt_flags = 0;
+
+	/* Discard magic */
+	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
+		flags &= ~MS_MGC_MSK;
+
+	/* Basic sanity checks */
+	if (data_page)
+		((char *)data_page)[PAGE_SIZE - 1] = 0;
+
+	/* ... and get the mountpoint */
+	retval = user_path(dir_name, &path);
+	if (retval)
+		return retval;
+
+	retval = security_sb_mount(dev_name, &path,
+				   type_page, flags, data_page);
+	if (!retval && !may_mount())
+		retval = -EPERM;
+	if (retval)
+		goto dput_out;
+
+	/* Default to relatime unless overriden */
+	if (!(flags & MS_NOATIME))
+		mnt_flags |= MNT_RELATIME;
+
+	/* Separate the per-mountpoint flags */
+	if (flags & MS_NOSUID)
+		mnt_flags |= MNT_NOSUID;
+	if (flags & MS_NODEV)
+		mnt_flags |= MNT_NODEV;
+	if (flags & MS_NOEXEC)
+		mnt_flags |= MNT_NOEXEC;
+	if (flags & MS_NOATIME)
+		mnt_flags |= MNT_NOATIME;
+	if (flags & MS_NODIRATIME)
+		mnt_flags |= MNT_NODIRATIME;
+	if (flags & MS_STRICTATIME)
+		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
+	if (flags & MS_RDONLY)
+		mnt_flags |= MNT_READONLY;
+
+	/* The default atime for remount is preservation */
+	if ((flags & MS_REMOUNT) &&
+	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
+		       MS_STRICTATIME)) == 0)) {
+		mnt_flags &= ~MNT_ATIME_MASK;
+		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
+	}
+
+	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
+		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
+		   MS_STRICTATIME);
+
+	if (flags & MS_REMOUNT)
+		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
+				    data_page);
+	else if (flags & MS_BIND)
+		retval = do_loopback(&path, dev_name, flags & MS_REC);
+	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+		retval = do_change_type(&path, flags);
+	else if (flags & MS_MOVE)
+		retval = do_move_mount(&path, dev_name);
+	else
+		retval = do_new_mount(&path, type_page, flags, mnt_flags,
+				      dev_name, data_page);
+dput_out:
+	path_put(&path);
+	return retval;
+}
+
+static void free_mnt_ns(struct mnt_namespace *ns)
+{
+	ns_free_inum(&ns->ns);
+	put_user_ns(ns->user_ns);
+	kfree(ns);
+}
+
+/*
+ * Assign a sequence number so we can detect when we attempt to bind
+ * mount a reference to an older mount namespace into the current
+ * mount namespace, preventing reference counting loops.  A 64bit
+ * number incrementing at 10Ghz will take 12,427 years to wrap which
+ * is effectively never, so we can ignore the possibility.
+ */
+static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
+
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
+{
+	struct mnt_namespace *new_ns;
+	int ret;
+
+	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+	if (!new_ns)
+		return ERR_PTR(-ENOMEM);
+	ret = ns_alloc_inum(&new_ns->ns);
+	if (ret) {
+		kfree(new_ns);
+		return ERR_PTR(ret);
+	}
+	new_ns->ns.ops = &mntns_operations;
+	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+	atomic_set(&new_ns->count, 1);
+	new_ns->root = NULL;
+	INIT_LIST_HEAD(&new_ns->list);
+	init_waitqueue_head(&new_ns->poll);
+	new_ns->event = 0;
+	new_ns->user_ns = get_user_ns(user_ns);
+	return new_ns;
+}
+
+struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
+		struct user_namespace *user_ns, struct fs_struct *new_fs)
+{
+	struct mnt_namespace *new_ns;
+	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
+	struct mount *p, *q;
+	struct mount *old;
+	struct mount *new;
+	int copy_flags;
+
+	BUG_ON(!ns);
+
+	if (likely(!(flags & CLONE_NEWNS))) {
+		get_mnt_ns(ns);
+		return ns;
+	}
+
+	old = ns->root;
+
+	new_ns = alloc_mnt_ns(user_ns);
+	if (IS_ERR(new_ns))
+		return new_ns;
+
+	namespace_lock();
+	/* First pass: copy the tree topology */
+	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
+	if (user_ns != ns->user_ns)
+		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
+	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
+	if (IS_ERR(new)) {
+		namespace_unlock();
+		free_mnt_ns(new_ns);
+		return ERR_CAST(new);
+	}
+	new_ns->root = new;
+	list_add_tail(&new_ns->list, &new->mnt_list);
+
+	/*
+	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
+	 * as belonging to new namespace.  We have already acquired a private
+	 * fs_struct, so tsk->fs->lock is not needed.
+	 */
+	p = old;
+	q = new;
+	while (p) {
+		q->mnt_ns = new_ns;
+		if (new_fs) {
+			if (&p->mnt == new_fs->root.mnt) {
+				new_fs->root.mnt = mntget(&q->mnt);
+				rootmnt = &p->mnt;
+			}
+			if (&p->mnt == new_fs->pwd.mnt) {
+				new_fs->pwd.mnt = mntget(&q->mnt);
+				pwdmnt = &p->mnt;
+			}
+		}
+		p = next_mnt(p, old);
+		q = next_mnt(q, new);
+		if (!q)
+			break;
+		while (p->mnt.mnt_root != q->mnt.mnt_root)
+			p = next_mnt(p, old);
+	}
+	namespace_unlock();
+
+	if (rootmnt)
+		mntput(rootmnt);
+	if (pwdmnt)
+		mntput(pwdmnt);
+
+	return new_ns;
+}
+
+/**
+ * create_mnt_ns - creates a private namespace and adds a root filesystem
+ * @mnt: pointer to the new root filesystem mountpoint
+ */
+static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
+{
+	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
+	if (!IS_ERR(new_ns)) {
+		struct mount *mnt = real_mount(m);
+		mnt->mnt_ns = new_ns;
+		new_ns->root = mnt;
+		list_add(&mnt->mnt_list, &new_ns->list);
+	} else {
+		mntput(m);
+	}
+	return new_ns;
+}
+
+struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
+{
+	struct mnt_namespace *ns;
+	struct super_block *s;
+	struct path path;
+	int err;
+
+	ns = create_mnt_ns(mnt);
+	if (IS_ERR(ns))
+		return ERR_CAST(ns);
+
+	err = vfs_path_lookup(mnt->mnt_root, mnt,
+			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+
+	put_mnt_ns(ns);
+
+	if (err)
+		return ERR_PTR(err);
+
+	/* trade a vfsmount reference for active sb one */
+	s = path.mnt->mnt_sb;
+	atomic_inc(&s->s_active);
+	mntput(path.mnt);
+	/* lock the sucker */
+	down_write(&s->s_umount);
+	/* ... and return the root of (sub)tree on it */
+	return path.dentry;
+}
+EXPORT_SYMBOL(mount_subtree);
+
+SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
+		char __user *, type, unsigned long, flags, void __user *, data)
+{
+	int ret;
+	char *kernel_type;
+	char *kernel_dev;
+	unsigned long data_page;
+
+	kernel_type = copy_mount_string(type);
+	ret = PTR_ERR(kernel_type);
+	if (IS_ERR(kernel_type))
+		goto out_type;
+
+	kernel_dev = copy_mount_string(dev_name);
+	ret = PTR_ERR(kernel_dev);
+	if (IS_ERR(kernel_dev))
+		goto out_dev;
+
+	ret = copy_mount_options(data, &data_page);
+	if (ret < 0)
+		goto out_data;
+
+	ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
+		(void *) data_page);
+
+	free_page(data_page);
+out_data:
+	kfree(kernel_dev);
+out_dev:
+	kfree(kernel_type);
+out_type:
+	return ret;
+}
+
+/*
+ * Return true if path is reachable from root
+ *
+ * namespace_sem or mount_lock is held
+ */
+bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
+			 const struct path *root)
+{
+	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
+		dentry = mnt->mnt_mountpoint;
+		mnt = mnt->mnt_parent;
+	}
+	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
+}
+
+int path_is_under(struct path *path1, struct path *path2)
+{
+	int res;
+	read_seqlock_excl(&mount_lock);
+	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
+	read_sequnlock_excl(&mount_lock);
+	return res;
+}
+EXPORT_SYMBOL(path_is_under);
+
+/*
+ * pivot_root Semantics:
+ * Moves the root file system of the current process to the directory put_old,
+ * makes new_root as the new root file system of the current process, and sets
+ * root/cwd of all processes which had them on the current root to new_root.
+ *
+ * Restrictions:
+ * The new_root and put_old must be directories, and  must not be on the
+ * same file  system as the current process root. The put_old  must  be
+ * underneath new_root,  i.e. adding a non-zero number of /.. to the string
+ * pointed to by put_old must yield the same directory as new_root. No other
+ * file system may be mounted on put_old. After all, new_root is a mountpoint.
+ *
+ * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
+ * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
+ * in this situation.
+ *
+ * Notes:
+ *  - we don't move root/cwd if they are not at the root (reason: if something
+ *    cared enough to change them, it's probably wrong to force them elsewhere)
+ *  - it's okay to pick a root that isn't the root of a file system, e.g.
+ *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
+ *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
+ *    first.
+ */
+SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
+		const char __user *, put_old)
+{
+	struct path new, old, parent_path, root_parent, root;
+	struct mount *new_mnt, *root_mnt, *old_mnt;
+	struct mountpoint *old_mp, *root_mp;
+	int error;
+
+	if (!may_mount())
+		return -EPERM;
+
+	error = user_path_dir(new_root, &new);
+	if (error)
+		goto out0;
+
+	error = user_path_dir(put_old, &old);
+	if (error)
+		goto out1;
+
+	error = security_sb_pivotroot(&old, &new);
+	if (error)
+		goto out2;
+
+	get_fs_root(current->fs, &root);
+	old_mp = lock_mount(&old);
+	error = PTR_ERR(old_mp);
+	if (IS_ERR(old_mp))
+		goto out3;
+
+	error = -EINVAL;
+	new_mnt = real_mount(new.mnt);
+	root_mnt = real_mount(root.mnt);
+	old_mnt = real_mount(old.mnt);
+	if (IS_MNT_SHARED(old_mnt) ||
+		IS_MNT_SHARED(new_mnt->mnt_parent) ||
+		IS_MNT_SHARED(root_mnt->mnt_parent))
+		goto out4;
+	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
+		goto out4;
+	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
+		goto out4;
+	error = -ENOENT;
+	if (d_unlinked(new.dentry))
+		goto out4;
+	error = -EBUSY;
+	if (new_mnt == root_mnt || old_mnt == root_mnt)
+		goto out4; /* loop, on the same file system  */
+	error = -EINVAL;
+	if (root.mnt->mnt_root != root.dentry)
+		goto out4; /* not a mountpoint */
+	if (!mnt_has_parent(root_mnt))
+		goto out4; /* not attached */
+	root_mp = root_mnt->mnt_mp;
+	if (new.mnt->mnt_root != new.dentry)
+		goto out4; /* not a mountpoint */
+	if (!mnt_has_parent(new_mnt))
+		goto out4; /* not attached */
+	/* make sure we can reach put_old from new_root */
+	if (!is_path_reachable(old_mnt, old.dentry, &new))
+		goto out4;
+	/* make certain new is below the root */
+	if (!is_path_reachable(new_mnt, new.dentry, &root))
+		goto out4;
+	root_mp->m_count++; /* pin it so it won't go away */
+	lock_mount_hash();
+	detach_mnt(new_mnt, &parent_path);
+	detach_mnt(root_mnt, &root_parent);
+	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
+		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
+		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+	}
+	/* mount old root on put_old */
+	attach_mnt(root_mnt, old_mnt, old_mp);
+	/* mount new_root on / */
+	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
+	touch_mnt_namespace(current->nsproxy->mnt_ns);
+	/* A moved mount should not expire automatically */
+	list_del_init(&new_mnt->mnt_expire);
+	unlock_mount_hash();
+	chroot_fs_refs(&root, &new);
+	put_mountpoint(root_mp);
+	error = 0;
+out4:
+	unlock_mount(old_mp);
+	if (!error) {
+		path_put(&root_parent);
+		path_put(&parent_path);
+	}
+out3:
+	path_put(&root);
+out2:
+	path_put(&old);
+out1:
+	path_put(&new);
+out0:
+	return error;
+}
+
+static void __init init_mount_tree(void)
+{
+	struct vfsmount *mnt;
+	struct mnt_namespace *ns;
+	struct path root;
+	struct file_system_type *type;
+
+	type = get_fs_type("rootfs");
+	if (!type)
+		panic("Can't find rootfs type");
+	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
+	put_filesystem(type);
+	if (IS_ERR(mnt))
+		panic("Can't create rootfs");
+
+	ns = create_mnt_ns(mnt);
+	if (IS_ERR(ns))
+		panic("Can't allocate initial namespace");
+
+	init_task.nsproxy->mnt_ns = ns;
+	get_mnt_ns(ns);
+
+	root.mnt = mnt;
+	root.dentry = mnt->mnt_root;
+	mnt->mnt_flags |= MNT_LOCKED;
+
+	set_fs_pwd(current->fs, &root);
+	set_fs_root(current->fs, &root);
+}
+
+void __init mnt_init(void)
+{
+	unsigned u;
+	int err;
+
+	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+	mount_hashtable = alloc_large_system_hash("Mount-cache",
+				sizeof(struct hlist_head),
+				mhash_entries, 19,
+				0,
+				&m_hash_shift, &m_hash_mask, 0, 0);
+	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
+				sizeof(struct hlist_head),
+				mphash_entries, 19,
+				0,
+				&mp_hash_shift, &mp_hash_mask, 0, 0);
+
+	if (!mount_hashtable || !mountpoint_hashtable)
+		panic("Failed to allocate mount hash table\n");
+
+	for (u = 0; u <= m_hash_mask; u++)
+		INIT_HLIST_HEAD(&mount_hashtable[u]);
+	for (u = 0; u <= mp_hash_mask; u++)
+		INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
+
+	kernfs_init();
+
+	err = sysfs_init();
+	if (err)
+		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
+			__func__, err);
+	fs_kobj = kobject_create_and_add("fs", NULL);
+	if (!fs_kobj)
+		printk(KERN_WARNING "%s: kobj create error\n", __func__);
+	init_rootfs();
+	init_mount_tree();
+}
+
+void put_mnt_ns(struct mnt_namespace *ns)
+{
+	if (!atomic_dec_and_test(&ns->count))
+		return;
+	drop_collected_mounts(&ns->root->mnt);
+	free_mnt_ns(ns);
+}
+
+struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+{
+	struct vfsmount *mnt;
+	mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
+	if (!IS_ERR(mnt)) {
+		/*
+		 * it is a longterm mount, don't release mnt until
+		 * we unmount before file sys is unregistered
+		*/
+		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
+	}
+	return mnt;
+}
+EXPORT_SYMBOL_GPL(kern_mount_data);
+
+void kern_unmount(struct vfsmount *mnt)
+{
+	/* release long term mount so mount point can be released */
+	if (!IS_ERR_OR_NULL(mnt)) {
+		real_mount(mnt)->mnt_ns = NULL;
+		synchronize_rcu();	/* yecchhh... */
+		mntput(mnt);
+	}
+}
+EXPORT_SYMBOL(kern_unmount);
+
+bool our_mnt(struct vfsmount *mnt)
+{
+	return check_mnt(real_mount(mnt));
+}
+
+bool current_chrooted(void)
+{
+	/* Does the current process have a non-standard root */
+	struct path ns_root;
+	struct path fs_root;
+	bool chrooted;
+
+	/* Find the namespace root */
+	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
+	ns_root.dentry = ns_root.mnt->mnt_root;
+	path_get(&ns_root);
+	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
+		;
+
+	get_fs_root(current->fs, &fs_root);
+
+	chrooted = !path_equal(&fs_root, &ns_root);
+
+	path_put(&fs_root);
+	path_put(&ns_root);
+
+	return chrooted;
+}
+
+static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	int new_flags = *new_mnt_flags;
+	struct mount *mnt;
+	bool visible = false;
+
+	if (unlikely(!ns))
+		return false;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		struct mount *child;
+		if (mnt->mnt.mnt_sb->s_type != type)
+			continue;
+
+		/* This mount is not fully visible if it's root directory
+		 * is not the root directory of the filesystem.
+		 */
+		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
+			continue;
+
+		/* Verify the mount flags are equal to or more permissive
+		 * than the proposed new mount.
+		 */
+		if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
+		    !(new_flags & MNT_READONLY))
+			continue;
+		if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
+		    !(new_flags & MNT_NODEV))
+			continue;
+		if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
+		    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
+			continue;
+
+		/* This mount is not fully visible if there are any
+		 * locked child mounts that cover anything except for
+		 * empty directories.
+		 */
+		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+			struct inode *inode = child->mnt_mountpoint->d_inode;
+			/* Only worry about locked mounts */
+			if (!(mnt->mnt.mnt_flags & MNT_LOCKED))
+				continue;
+			/* Is the directory permanetly empty? */
+			if (!is_empty_dir_inode(inode))
+				goto next;
+		}
+		/* Preserve the locked attributes */
+		*new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \
+							MNT_LOCK_NODEV    | \
+							MNT_LOCK_ATIME);
+		visible = true;
+		goto found;
+	next:	;
+	}
+found:
+	up_read(&namespace_sem);
+	return visible;
+}
+
+static struct ns_common *mntns_get(struct task_struct *task)
+{
+	struct ns_common *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	task_lock(task);
+	nsproxy = task->nsproxy;
+	if (nsproxy) {
+		ns = &nsproxy->mnt_ns->ns;
+		get_mnt_ns(to_mnt_ns(ns));
+	}
+	task_unlock(task);
+
+	return ns;
+}
+
+static void mntns_put(struct ns_common *ns)
+{
+	put_mnt_ns(to_mnt_ns(ns));
+}
+
+static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+	struct fs_struct *fs = current->fs;
+	struct mnt_namespace *mnt_ns = to_mnt_ns(ns);
+	struct path root;
+
+	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
+	    !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (fs->users != 1)
+		return -EINVAL;
+
+	get_mnt_ns(mnt_ns);
+	put_mnt_ns(nsproxy->mnt_ns);
+	nsproxy->mnt_ns = mnt_ns;
+
+	/* Find the root */
+	root.mnt    = &mnt_ns->root->mnt;
+	root.dentry = mnt_ns->root->mnt.mnt_root;
+	path_get(&root);
+	while(d_mountpoint(root.dentry) && follow_down_one(&root))
+		;
+
+	/* Update the pwd and root */
+	set_fs_pwd(fs, &root);
+	set_fs_root(fs, &root);
+
+	path_put(&root);
+	return 0;
+}
+
+const struct proc_ns_operations mntns_operations = {
+	.name		= "mnt",
+	.type		= CLONE_NEWNS,
+	.get		= mntns_get,
+	.put		= mntns_put,
+	.install	= mntns_install,
+};
diff --git a/kernel/fs/ncpfs/Kconfig b/kernel/fs/ncpfs/Kconfig
new file mode 100644
index 000000000..c931cf22a
--- /dev/null
+++ b/kernel/fs/ncpfs/Kconfig
@@ -0,0 +1,108 @@
+#
+# NCP Filesystem configuration
+#
+config NCP_FS
+	tristate "NCP file system support (to mount NetWare volumes)"
+	depends on IPX!=n || INET
+	help
+	  NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
+	  used by Novell NetWare clients to talk to file servers.  It is to
+	  IPX what NFS is to TCP/IP, if that helps.  Saying Y here allows you
+	  to mount NetWare file server volumes and to access them just like
+	  any other Unix directory.  For details, please read the file
+	  <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
+	  the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
+
+	  You do not have to say Y here if you want your Linux box to act as a
+	  file *server* for Novell NetWare clients.
+
+	  General information about how to connect Linux, Windows machines and
+	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+	  To compile this as a module, choose M here: the module will be called
+	  ncpfs.  Say N unless you are connected to a Novell network.
+
+config NCPFS_PACKET_SIGNING
+	bool "Packet signatures"
+	depends on NCP_FS
+	help
+	  NCP allows packets to be signed for stronger security. If you want
+	  security, say Y.  Normal users can leave it off.  To be able to use
+	  packet signing you must use ncpfs > 2.0.12.
+
+config NCPFS_IOCTL_LOCKING
+	bool "Proprietary file locking"
+	depends on NCP_FS
+	help
+	  Allows locking of records on remote volumes.  Say N unless you have
+	  special applications which are able to utilize this locking scheme.
+
+config NCPFS_STRONG
+	bool "Clear remove/delete inhibit when needed"
+	depends on NCP_FS
+	help
+	  Allows manipulation of files flagged as Delete or Rename Inhibit.
+	  To use this feature you must mount volumes with the ncpmount
+	  parameter "-s" (ncpfs-2.0.12 and newer).  Say Y unless you are not
+	  mounting volumes with -f 444.
+
+config NCPFS_NFS_NS
+	bool "Use NFS namespace if available"
+	depends on NCP_FS
+	help
+	  Allows you to utilize NFS namespace on NetWare servers.  It brings
+	  you case sensitive filenames.  Say Y.  You can disable it at
+	  mount-time with the `-N nfs' parameter of ncpmount.
+
+config NCPFS_OS2_NS
+	bool "Use LONG (OS/2) namespace if available"
+	depends on NCP_FS
+	help
+	  Allows you to utilize OS2/LONG namespace on NetWare servers.
+	  Filenames in this namespace are limited to 255 characters, they are
+	  case insensitive, and case in names is preserved.  Say Y.  You can
+	  disable it at mount time with the -N os2 parameter of ncpmount.
+
+config NCPFS_SMALLDOS
+	bool "Lowercase DOS filenames"
+	depends on NCP_FS
+	---help---
+	  If you say Y here, every filename on a NetWare server volume using
+	  the OS2/LONG namespace and created under DOS or on a volume using
+	  DOS namespace will be converted to lowercase characters.
+	  Saying N here will give you these filenames in uppercase.
+
+	  This is only a cosmetic option since the OS2/LONG namespace is case
+	  insensitive. The only major reason for this option is backward
+	  compatibility when moving from DOS to OS2/LONG namespace support.
+	  Long filenames (created by Win95) will not be affected.
+
+	  This option does not solve the problem that filenames appear
+	  differently under Linux and under Windows, since Windows does an
+	  additional conversions on the client side. You can achieve similar
+	  effects by saying Y to "Allow using of Native Language Support"
+	  below.
+
+config NCPFS_NLS
+	bool "Use Native Language Support"
+	depends on NCP_FS
+	select NLS
+	help
+	  Allows you to use codepages and I/O charsets for file name
+	  translation between the server file system and input/output. This
+	  may be useful, if you want to access the server with other operating
+	  systems, e.g. Windows 95. See also NLS for more Information.
+
+	  To select codepages and I/O charsets use ncpfs-2.2.0.13 or newer.
+
+config NCPFS_EXTRAS
+	bool "Enable symbolic links and execute flags"
+	depends on NCP_FS
+	help
+	  This enables the use of symbolic links and an execute permission
+	  bit on NCPFS. The file server need not have long name space or NFS
+	  name space loaded for these to work.
+
+	  To use the new attributes, it is recommended to use the flags
+	  '-f 600 -d 755' on the ncpmount command line.
+
diff --git a/kernel/fs/ncpfs/Makefile b/kernel/fs/ncpfs/Makefile
new file mode 100644
index 000000000..c66af563f
--- /dev/null
+++ b/kernel/fs/ncpfs/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for the linux ncp filesystem routines.
+#
+
+obj-$(CONFIG_NCP_FS) += ncpfs.o
+
+ncpfs-y      := dir.o file.o inode.o ioctl.o mmap.o ncplib_kernel.o sock.o \
+		ncpsign_kernel.o getopt.o
+
+ncpfs-$(CONFIG_NCPFS_EXTRAS)   += symlink.o
+ncpfs-$(CONFIG_NCPFS_NFS_NS)   += symlink.o
+
+# If you want debugging output, please uncomment the following line
+# ccflags-y := -DDEBUG_NCP=1
+
+CFLAGS_ncplib_kernel.o := -finline-functions
diff --git a/kernel/fs/ncpfs/dir.c b/kernel/fs/ncpfs/dir.c
new file mode 100644
index 000000000..80021c709
--- /dev/null
+++ b/kernel/fs/ncpfs/dir.c
@@ -0,0 +1,1237 @@
+/*
+ *  dir.c
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *  Modified for big endian by J.F. Chadima and David S. Miller
+ *  Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache
+ *  Modified 1998, 1999 Wolfram Pienkoss for NLS
+ *  Modified 1999 Wolfram Pienkoss for directory caching
+ *  Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info
+ *
+ */
+
+
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/namei.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+
+#include "ncp_fs.h"
+
+static void ncp_read_volume_list(struct file *, struct dir_context *,
+				struct ncp_cache_control *);
+static void ncp_do_readdir(struct file *, struct dir_context *,
+				struct ncp_cache_control *);
+
+static int ncp_readdir(struct file *, struct dir_context *);
+
+static int ncp_create(struct inode *, struct dentry *, umode_t, bool);
+static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int);
+static int ncp_unlink(struct inode *, struct dentry *);
+static int ncp_mkdir(struct inode *, struct dentry *, umode_t);
+static int ncp_rmdir(struct inode *, struct dentry *);
+static int ncp_rename(struct inode *, struct dentry *,
+	  	      struct inode *, struct dentry *);
+static int ncp_mknod(struct inode * dir, struct dentry *dentry,
+		     umode_t mode, dev_t rdev);
+#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
+extern int ncp_symlink(struct inode *, struct dentry *, const char *);
+#else
+#define ncp_symlink NULL
+#endif
+		      
+const struct file_operations ncp_dir_operations =
+{
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= ncp_readdir,
+	.unlocked_ioctl	= ncp_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ncp_compat_ioctl,
+#endif
+};
+
+const struct inode_operations ncp_dir_inode_operations =
+{
+	.create		= ncp_create,
+	.lookup		= ncp_lookup,
+	.unlink		= ncp_unlink,
+	.symlink	= ncp_symlink,
+	.mkdir		= ncp_mkdir,
+	.rmdir		= ncp_rmdir,
+	.mknod		= ncp_mknod,
+	.rename		= ncp_rename,
+	.setattr	= ncp_notify_change,
+};
+
+/*
+ * Dentry operations routines
+ */
+static int ncp_lookup_validate(struct dentry *, unsigned int);
+static int ncp_hash_dentry(const struct dentry *, struct qstr *);
+static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
+		unsigned int, const char *, const struct qstr *);
+static int ncp_delete_dentry(const struct dentry *);
+static void ncp_d_prune(struct dentry *dentry);
+
+const struct dentry_operations ncp_dentry_operations =
+{
+	.d_revalidate	= ncp_lookup_validate,
+	.d_hash		= ncp_hash_dentry,
+	.d_compare	= ncp_compare_dentry,
+	.d_delete	= ncp_delete_dentry,
+	.d_prune	= ncp_d_prune,
+};
+
+#define ncp_namespace(i)	(NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
+
+static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
+{
+#ifdef CONFIG_NCPFS_SMALLDOS
+	int ns = ncp_namespace(i);
+
+	if ((ns == NW_NS_DOS)
+#ifdef CONFIG_NCPFS_OS2_NS
+		|| ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
+#endif /* CONFIG_NCPFS_OS2_NS */
+	   )
+		return 0;
+#endif /* CONFIG_NCPFS_SMALLDOS */
+	return 1;
+}
+
+#define ncp_preserve_case(i)	(ncp_namespace(i) != NW_NS_DOS)
+
+static inline int ncp_case_sensitive(const struct inode *i)
+{
+#ifdef CONFIG_NCPFS_NFS_NS
+	return ncp_namespace(i) == NW_NS_NFS;
+#else
+	return 0;
+#endif /* CONFIG_NCPFS_NFS_NS */
+}
+
+/*
+ * Note: leave the hash unchanged if the directory
+ * is case-sensitive.
+ *
+ * Accessing the parent inode can be racy under RCU pathwalking.
+ * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
+ * the callers will handle races.
+ */
+static int 
+ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
+{
+	struct inode *inode = d_inode_rcu(dentry);
+
+	if (!inode)
+		return 0;
+
+	if (!ncp_case_sensitive(inode)) {
+		struct super_block *sb = dentry->d_sb;
+		struct nls_table *t;
+		unsigned long hash;
+		int i;
+
+		t = NCP_IO_TABLE(sb);
+		hash = init_name_hash();
+		for (i=0; i<this->len ; i++)
+			hash = partial_name_hash(ncp_tolower(t, this->name[i]),
+									hash);
+		this->hash = end_name_hash(hash);
+	}
+	return 0;
+}
+
+/*
+ * Accessing the parent inode can be racy under RCU pathwalking.
+ * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
+ * the callers will handle races.
+ */
+static int
+ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	struct inode *pinode;
+
+	if (len != name->len)
+		return 1;
+
+	pinode = d_inode_rcu(parent);
+	if (!pinode)
+		return 1;
+
+	if (ncp_case_sensitive(pinode))
+		return strncmp(str, name->name, len);
+
+	return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len);
+}
+
+/*
+ * This is the callback from dput() when d_count is going to 0.
+ * We use this to unhash dentries with bad inodes.
+ * Closing files can be safely postponed until iput() - it's done there anyway.
+ */
+static int
+ncp_delete_dentry(const struct dentry * dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (inode) {
+		if (is_bad_inode(inode))
+			return 1;
+	} else
+	{
+	/* N.B. Unhash negative dentries? */
+	}
+	return 0;
+}
+
+static inline int
+ncp_single_volume(struct ncp_server *server)
+{
+	return (server->m.mounted_vol[0] != '\0');
+}
+
+static inline int ncp_is_server_root(struct inode *inode)
+{
+	return !ncp_single_volume(NCP_SERVER(inode)) &&
+		is_root_inode(inode);
+}
+
+
+/*
+ * This is the callback when the dcache has a lookup hit.
+ */
+
+
+#ifdef CONFIG_NCPFS_STRONG
+/* try to delete a readonly file (NW R bit set) */
+
+static int
+ncp_force_unlink(struct inode *dir, struct dentry* dentry)
+{
+        int res=0x9c,res2;
+	struct nw_modify_dos_info info;
+	__le32 old_nwattr;
+	struct inode *inode;
+
+	memset(&info, 0, sizeof(info));
+	
+        /* remove the Read-Only flag on the NW server */
+	inode = d_inode(dentry);
+
+	old_nwattr = NCP_FINFO(inode)->nwattr;
+	info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT);
+	res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info);
+	if (res2)
+		goto leave_me;
+
+        /* now try again the delete operation */
+        res = ncp_del_file_or_subdir2(NCP_SERVER(dir), dentry);
+
+        if (res)  /* delete failed, set R bit again */
+        {
+		info.attributes = old_nwattr;
+		res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info);
+		if (res2)
+                        goto leave_me;
+        }
+leave_me:
+        return(res);
+}
+#endif	/* CONFIG_NCPFS_STRONG */
+
+#ifdef CONFIG_NCPFS_STRONG
+static int
+ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_name,
+                 struct inode *new_dir, struct dentry* new_dentry, char *_new_name)
+{
+	struct nw_modify_dos_info info;
+        int res=0x90,res2;
+	struct inode *old_inode = d_inode(old_dentry);
+	__le32 old_nwattr = NCP_FINFO(old_inode)->nwattr;
+	__le32 new_nwattr = 0; /* shut compiler warning */
+	int old_nwattr_changed = 0;
+	int new_nwattr_changed = 0;
+
+	memset(&info, 0, sizeof(info));
+	
+        /* remove the Read-Only flag on the NW server */
+
+	info.attributes = old_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT);
+	res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info);
+	if (!res2)
+		old_nwattr_changed = 1;
+	if (new_dentry && d_really_is_positive(new_dentry)) {
+		new_nwattr = NCP_FINFO(d_inode(new_dentry))->nwattr;
+		info.attributes = new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT);
+		res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info);
+		if (!res2)
+			new_nwattr_changed = 1;
+	}
+        /* now try again the rename operation */
+	/* but only if something really happened */
+	if (new_nwattr_changed || old_nwattr_changed) {
+	        res = ncp_ren_or_mov_file_or_subdir(NCP_SERVER(old_dir),
+        	                                    old_dir, _old_name,
+                	                            new_dir, _new_name);
+	} 
+	if (res)
+		goto leave_me;
+	/* file was successfully renamed, so:
+	   do not set attributes on old file - it no longer exists
+	   copy attributes from old file to new */
+	new_nwattr_changed = old_nwattr_changed;
+	new_nwattr = old_nwattr;
+	old_nwattr_changed = 0;
+	
+leave_me:;
+	if (old_nwattr_changed) {
+		info.attributes = old_nwattr;
+		res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info);
+		/* ignore errors */
+	}
+	if (new_nwattr_changed)	{
+		info.attributes = new_nwattr;
+		res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info);
+		/* ignore errors */
+	}
+        return(res);
+}
+#endif	/* CONFIG_NCPFS_STRONG */
+
+
+static int
+ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
+{
+	struct ncp_server *server;
+	struct dentry *parent;
+	struct inode *dir;
+	struct ncp_entry_info finfo;
+	int res, val = 0, len;
+	__u8 __name[NCP_MAXPATHLEN + 1];
+
+	if (dentry == dentry->d_sb->s_root)
+		return 1;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	parent = dget_parent(dentry);
+	dir = d_inode(parent);
+
+	if (d_really_is_negative(dentry))
+		goto finished;
+
+	server = NCP_SERVER(dir);
+
+	/*
+	 * Inspired by smbfs:
+	 * The default validation is based on dentry age:
+	 * We set the max age at mount time.  (But each
+	 * successful server lookup renews the timestamp.)
+	 */
+	val = NCP_TEST_AGE(server, dentry);
+	if (val)
+		goto finished;
+
+	ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n",
+		dentry, NCP_GET_AGE(dentry));
+
+	len = sizeof(__name);
+	if (ncp_is_server_root(dir)) {
+		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
+				 dentry->d_name.len, 1);
+		if (!res) {
+			res = ncp_lookup_volume(server, __name, &(finfo.i));
+			if (!res)
+				ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
+		}
+	} else {
+		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
+				 dentry->d_name.len, !ncp_preserve_case(dir));
+		if (!res)
+			res = ncp_obtain_info(server, dir, __name, &(finfo.i));
+	}
+	finfo.volume = finfo.i.volNumber;
+	ncp_dbg(2, "looked for %pd/%s, res=%d\n",
+		dentry->d_parent, __name, res);
+	/*
+	 * If we didn't find it, or if it has a different dirEntNum to
+	 * what we remember, it's not valid any more.
+	 */
+	if (!res) {
+		struct inode *inode = d_inode(dentry);
+
+		mutex_lock(&inode->i_mutex);
+		if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
+			ncp_new_dentry(dentry);
+			val=1;
+		} else
+			ncp_dbg(2, "found, but dirEntNum changed\n");
+
+		ncp_update_inode2(inode, &finfo);
+		mutex_unlock(&inode->i_mutex);
+	}
+
+finished:
+	ncp_dbg(2, "result=%d\n", val);
+	dput(parent);
+	return val;
+}
+
+static time_t ncp_obtain_mtime(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ncp_server *server = NCP_SERVER(inode);
+	struct nw_info_struct i;
+
+	if (!ncp_conn_valid(server) || ncp_is_server_root(inode))
+		return 0;
+
+	if (ncp_obtain_info(server, inode, NULL, &i))
+		return 0;
+
+	return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
+}
+
+static inline void
+ncp_invalidate_dircache_entries(struct dentry *parent)
+{
+	struct ncp_server *server = NCP_SERVER(d_inode(parent));
+	struct dentry *dentry;
+
+	spin_lock(&parent->d_lock);
+	list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
+		dentry->d_fsdata = NULL;
+		ncp_age_dentry(server, dentry);
+	}
+	spin_unlock(&parent->d_lock);
+}
+
+static int ncp_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = d_inode(dentry);
+	struct page *page = NULL;
+	struct ncp_server *server = NCP_SERVER(inode);
+	union  ncp_dir_cache *cache = NULL;
+	struct ncp_cache_control ctl;
+	int result, mtime_valid = 0;
+	time_t mtime = 0;
+
+	ctl.page  = NULL;
+	ctl.cache = NULL;
+
+	ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos);
+
+	result = -EIO;
+	/* Do not generate '.' and '..' when server is dead. */
+	if (!ncp_conn_valid(server))
+		goto out;
+
+	result = 0;
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+
+	page = grab_cache_page(&inode->i_data, 0);
+	if (!page)
+		goto read_really;
+
+	ctl.cache = cache = kmap(page);
+	ctl.head  = cache->head;
+
+	if (!PageUptodate(page) || !ctl.head.eof)
+		goto init_cache;
+
+	if (ctx->pos == 2) {
+		if (jiffies - ctl.head.time >= NCP_MAX_AGE(server))
+			goto init_cache;
+
+		mtime = ncp_obtain_mtime(dentry);
+		mtime_valid = 1;
+		if ((!mtime) || (mtime != ctl.head.mtime))
+			goto init_cache;
+	}
+
+	if (ctx->pos > ctl.head.end)
+		goto finished;
+
+	ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2);
+	ctl.ofs  = ctl.fpos / NCP_DIRCACHE_SIZE;
+	ctl.idx  = ctl.fpos % NCP_DIRCACHE_SIZE;
+
+	for (;;) {
+		if (ctl.ofs != 0) {
+			ctl.page = find_lock_page(&inode->i_data, ctl.ofs);
+			if (!ctl.page)
+				goto invalid_cache;
+			ctl.cache = kmap(ctl.page);
+			if (!PageUptodate(ctl.page))
+				goto invalid_cache;
+		}
+		while (ctl.idx < NCP_DIRCACHE_SIZE) {
+			struct dentry *dent;
+			bool over;
+
+			spin_lock(&dentry->d_lock);
+			if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) { 
+				spin_unlock(&dentry->d_lock);
+				goto invalid_cache;
+			}
+			dent = ctl.cache->dentry[ctl.idx];
+			if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) {
+				spin_unlock(&dentry->d_lock);
+				goto invalid_cache;
+			}
+			spin_unlock(&dentry->d_lock);
+			if (d_really_is_negative(dent)) {
+				dput(dent);
+				goto invalid_cache;
+			}
+			over = !dir_emit(ctx, dent->d_name.name,
+					dent->d_name.len,
+					d_inode(dent)->i_ino, DT_UNKNOWN);
+			dput(dent);
+			if (over)
+				goto finished;
+			ctx->pos += 1;
+			ctl.idx += 1;
+			if (ctx->pos > ctl.head.end)
+				goto finished;
+		}
+		if (ctl.page) {
+			kunmap(ctl.page);
+			SetPageUptodate(ctl.page);
+			unlock_page(ctl.page);
+			page_cache_release(ctl.page);
+			ctl.page = NULL;
+		}
+		ctl.idx  = 0;
+		ctl.ofs += 1;
+	}
+invalid_cache:
+	if (ctl.page) {
+		kunmap(ctl.page);
+		unlock_page(ctl.page);
+		page_cache_release(ctl.page);
+		ctl.page = NULL;
+	}
+	ctl.cache = cache;
+init_cache:
+	ncp_invalidate_dircache_entries(dentry);
+	if (!mtime_valid) {
+		mtime = ncp_obtain_mtime(dentry);
+		mtime_valid = 1;
+	}
+	ctl.head.mtime = mtime;
+	ctl.head.time = jiffies;
+	ctl.head.eof = 0;
+	ctl.fpos = 2;
+	ctl.ofs = 0;
+	ctl.idx = NCP_DIRCACHE_START;
+	ctl.filled = 0;
+	ctl.valid  = 1;
+read_really:
+	spin_lock(&dentry->d_lock);
+	NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE;
+	spin_unlock(&dentry->d_lock);
+	if (ncp_is_server_root(inode)) {
+		ncp_read_volume_list(file, ctx, &ctl);
+	} else {
+		ncp_do_readdir(file, ctx, &ctl);
+	}
+	ctl.head.end = ctl.fpos - 1;
+	ctl.head.eof = ctl.valid;
+finished:
+	if (ctl.page) {
+		kunmap(ctl.page);
+		SetPageUptodate(ctl.page);
+		unlock_page(ctl.page);
+		page_cache_release(ctl.page);
+	}
+	if (page) {
+		cache->head = ctl.head;
+		kunmap(page);
+		SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+out:
+	return result;
+}
+
+static void ncp_d_prune(struct dentry *dentry)
+{
+	if (!dentry->d_fsdata)	/* not referenced from page cache */
+		return;
+	NCP_FINFO(d_inode(dentry->d_parent))->flags &= ~NCPI_DIR_CACHE;
+}
+
+static int
+ncp_fill_cache(struct file *file, struct dir_context *ctx,
+		struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
+		int inval_childs)
+{
+	struct dentry *newdent, *dentry = file->f_path.dentry;
+	struct inode *dir = d_inode(dentry);
+	struct ncp_cache_control ctl = *ctrl;
+	struct qstr qname;
+	int valid = 0;
+	int hashed = 0;
+	ino_t ino = 0;
+	__u8 __name[NCP_MAXPATHLEN + 1];
+
+	qname.len = sizeof(__name);
+	if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
+			entry->i.entryName, entry->i.nameLen,
+			!ncp_preserve_entry_case(dir, entry->i.NSCreator)))
+		return 1; /* I'm not sure */
+
+	qname.name = __name;
+
+	newdent = d_hash_and_lookup(dentry, &qname);
+	if (unlikely(IS_ERR(newdent)))
+		goto end_advance;
+	if (!newdent) {
+		newdent = d_alloc(dentry, &qname);
+		if (!newdent)
+			goto end_advance;
+	} else {
+		hashed = 1;
+
+		/* If case sensitivity changed for this volume, all entries below this one
+		   should be thrown away.  This entry itself is not affected, as its case
+		   sensitivity is controlled by its own parent. */
+		if (inval_childs)
+			shrink_dcache_parent(newdent);
+
+		/*
+		 * NetWare's OS2 namespace is case preserving yet case
+		 * insensitive.  So we update dentry's name as received from
+		 * server. Parent dir's i_mutex is locked because we're in
+		 * readdir.
+		 */
+		dentry_update_name_case(newdent, &qname);
+	}
+
+	if (d_really_is_negative(newdent)) {
+		struct inode *inode;
+
+		entry->opened = 0;
+		entry->ino = iunique(dir->i_sb, 2);
+		inode = ncp_iget(dir->i_sb, entry);
+		if (inode) {
+			d_instantiate(newdent, inode);
+			if (!hashed)
+				d_rehash(newdent);
+		} else {
+			spin_lock(&dentry->d_lock);
+			NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+			spin_unlock(&dentry->d_lock);
+		}
+	} else {
+		struct inode *inode = d_inode(newdent);
+
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		ncp_update_inode2(inode, entry);
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	if (ctl.idx >= NCP_DIRCACHE_SIZE) {
+		if (ctl.page) {
+			kunmap(ctl.page);
+			SetPageUptodate(ctl.page);
+			unlock_page(ctl.page);
+			page_cache_release(ctl.page);
+		}
+		ctl.cache = NULL;
+		ctl.idx  -= NCP_DIRCACHE_SIZE;
+		ctl.ofs  += 1;
+		ctl.page  = grab_cache_page(&dir->i_data, ctl.ofs);
+		if (ctl.page)
+			ctl.cache = kmap(ctl.page);
+	}
+	if (ctl.cache) {
+		if (d_really_is_positive(newdent)) {
+			newdent->d_fsdata = newdent;
+			ctl.cache->dentry[ctl.idx] = newdent;
+			ino = d_inode(newdent)->i_ino;
+			ncp_new_dentry(newdent);
+		}
+ 		valid = 1;
+	}
+	dput(newdent);
+end_advance:
+	if (!valid)
+		ctl.valid = 0;
+	if (!ctl.filled && (ctl.fpos == ctx->pos)) {
+		if (!ino)
+			ino = iunique(dir->i_sb, 2);
+		ctl.filled = !dir_emit(ctx, qname.name, qname.len,
+				     ino, DT_UNKNOWN);
+		if (!ctl.filled)
+			ctx->pos += 1;
+	}
+	ctl.fpos += 1;
+	ctl.idx  += 1;
+	*ctrl = ctl;
+	return (ctl.valid || !ctl.filled);
+}
+
+static void
+ncp_read_volume_list(struct file *file, struct dir_context *ctx,
+			struct ncp_cache_control *ctl)
+{
+	struct inode *inode = file_inode(file);
+	struct ncp_server *server = NCP_SERVER(inode);
+	struct ncp_volume_info info;
+	struct ncp_entry_info entry;
+	int i;
+
+	ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos);
+
+	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
+		int inval_dentry;
+
+		if (ncp_get_volume_info_with_number(server, i, &info) != 0)
+			return;
+		if (!strlen(info.volume_name))
+			continue;
+
+		ncp_dbg(1, "found vol: %s\n", info.volume_name);
+
+		if (ncp_lookup_volume(server, info.volume_name,
+					&entry.i)) {
+			ncp_dbg(1, "could not lookup vol %s\n",
+				info.volume_name);
+			continue;
+		}
+		inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
+		entry.volume = entry.i.volNumber;
+		if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry))
+			return;
+	}
+}
+
+static void
+ncp_do_readdir(struct file *file, struct dir_context *ctx,
+						struct ncp_cache_control *ctl)
+{
+	struct inode *dir = file_inode(file);
+	struct ncp_server *server = NCP_SERVER(dir);
+	struct nw_search_sequence seq;
+	struct ncp_entry_info entry;
+	int err;
+	void* buf;
+	int more;
+	size_t bufsize;
+
+	ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos);
+	ncp_vdbg("init %pD, volnum=%d, dirent=%u\n",
+		 file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
+
+	err = ncp_initialize_search(server, dir, &seq);
+	if (err) {
+		ncp_dbg(1, "init failed, err=%d\n", err);
+		return;
+	}
+	/* We MUST NOT use server->buffer_size handshaked with server if we are
+	   using UDP, as for UDP server uses max. buffer size determined by
+	   MTU, and for TCP server uses hardwired value 65KB (== 66560 bytes). 
+	   So we use 128KB, just to be sure, as there is no way how to know
+	   this value in advance. */
+	bufsize = 131072;
+	buf = vmalloc(bufsize);
+	if (!buf)
+		return;
+	do {
+		int cnt;
+		char* rpl;
+		size_t rpls;
+
+		err = ncp_search_for_fileset(server, &seq, &more, &cnt, buf, bufsize, &rpl, &rpls);
+		if (err)		/* Error */
+			break;
+		if (!cnt)		/* prevent endless loop */
+			break;
+		while (cnt--) {
+			size_t onerpl;
+			
+			if (rpls < offsetof(struct nw_info_struct, entryName))
+				break;	/* short packet */
+			ncp_extract_file_info(rpl, &entry.i);
+			onerpl = offsetof(struct nw_info_struct, entryName) + entry.i.nameLen;
+			if (rpls < onerpl)
+				break;	/* short packet */
+			(void)ncp_obtain_nfs_info(server, &entry.i);
+			rpl += onerpl;
+			rpls -= onerpl;
+			entry.volume = entry.i.volNumber;
+			if (!ncp_fill_cache(file, ctx, ctl, &entry, 0))
+				break;
+		}
+	} while (more);
+	vfree(buf);
+	return;
+}
+
+int ncp_conn_logged_in(struct super_block *sb)
+{
+	struct ncp_server* server = NCP_SBP(sb);
+	int result;
+
+	if (ncp_single_volume(server)) {
+		int len;
+		struct dentry* dent;
+		__u32 volNumber;
+		__le32 dirEntNum;
+		__le32 DosDirNum;
+		__u8 __name[NCP_MAXPATHLEN + 1];
+
+		len = sizeof(__name);
+		result = ncp_io2vol(server, __name, &len, server->m.mounted_vol,
+				    strlen(server->m.mounted_vol), 1);
+		if (result)
+			goto out;
+		result = -ENOENT;
+		if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) {
+			ncp_vdbg("%s not found\n", server->m.mounted_vol);
+			goto out;
+		}
+		dent = sb->s_root;
+		if (dent) {
+			struct inode* ino = d_inode(dent);
+			if (ino) {
+				ncp_update_known_namespace(server, volNumber, NULL);
+				NCP_FINFO(ino)->volNumber = volNumber;
+				NCP_FINFO(ino)->dirEntNum = dirEntNum;
+				NCP_FINFO(ino)->DosDirNum = DosDirNum;
+				result = 0;
+			} else {
+				ncp_dbg(1, "d_inode(sb->s_root) == NULL!\n");
+			}
+		} else {
+			ncp_dbg(1, "sb->s_root == NULL!\n");
+		}
+	} else
+		result = 0;
+
+out:
+	return result;
+}
+
+static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	struct ncp_server *server = NCP_SERVER(dir);
+	struct inode *inode = NULL;
+	struct ncp_entry_info finfo;
+	int error, res, len;
+	__u8 __name[NCP_MAXPATHLEN + 1];
+
+	error = -EIO;
+	if (!ncp_conn_valid(server))
+		goto finished;
+
+	ncp_vdbg("server lookup for %pd2\n", dentry);
+
+	len = sizeof(__name);
+	if (ncp_is_server_root(dir)) {
+		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
+				 dentry->d_name.len, 1);
+		if (!res)
+			res = ncp_lookup_volume(server, __name, &(finfo.i));
+		if (!res)
+			ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
+	} else {
+		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
+				 dentry->d_name.len, !ncp_preserve_case(dir));
+		if (!res)
+			res = ncp_obtain_info(server, dir, __name, &(finfo.i));
+	}
+	ncp_vdbg("looked for %pd2, res=%d\n", dentry, res);
+	/*
+	 * If we didn't find an entry, make a negative dentry.
+	 */
+	if (res)
+		goto add_entry;
+
+	/*
+	 * Create an inode for the entry.
+	 */
+	finfo.opened = 0;
+	finfo.ino = iunique(dir->i_sb, 2);
+	finfo.volume = finfo.i.volNumber;
+	error = -EACCES;
+	inode = ncp_iget(dir->i_sb, &finfo);
+
+	if (inode) {
+		ncp_new_dentry(dentry);
+add_entry:
+		d_add(dentry, inode);
+		error = 0;
+	}
+
+finished:
+	ncp_vdbg("result=%d\n", error);
+	return ERR_PTR(error);
+}
+
+/*
+ * This code is common to create, mkdir, and mknod.
+ */
+static int ncp_instantiate(struct inode *dir, struct dentry *dentry,
+			struct ncp_entry_info *finfo)
+{
+	struct inode *inode;
+	int error = -EINVAL;
+
+	finfo->ino = iunique(dir->i_sb, 2);
+	inode = ncp_iget(dir->i_sb, finfo);
+	if (!inode)
+		goto out_close;
+	d_instantiate(dentry,inode);
+	error = 0;
+out:
+	return error;
+
+out_close:
+	ncp_vdbg("%pd2 failed, closing file\n", dentry);
+	ncp_close_file(NCP_SERVER(dir), finfo->file_handle);
+	goto out;
+}
+
+int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
+		   dev_t rdev, __le32 attributes)
+{
+	struct ncp_server *server = NCP_SERVER(dir);
+	struct ncp_entry_info finfo;
+	int error, result, len;
+	int opmode;
+	__u8 __name[NCP_MAXPATHLEN + 1];
+	
+	ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode);
+
+	ncp_age_dentry(server, dentry);
+	len = sizeof(__name);
+	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
+			   dentry->d_name.len, !ncp_preserve_case(dir));
+	if (error)
+		goto out;
+
+	error = -EACCES;
+	
+	if (S_ISREG(mode) && 
+	    (server->m.flags & NCP_MOUNT_EXTRAS) && 
+	    (mode & S_IXUGO))
+		attributes |= aSYSTEM | aSHARED;
+	
+	result = ncp_open_create_file_or_subdir(server, dir, __name,
+				OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE,
+				attributes, AR_READ | AR_WRITE, &finfo);
+	opmode = O_RDWR;
+	if (result) {
+		result = ncp_open_create_file_or_subdir(server, dir, __name,
+				OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE,
+				attributes, AR_WRITE, &finfo);
+		if (result) {
+			if (result == 0x87)
+				error = -ENAMETOOLONG;
+			else if (result < 0)
+				error = result;
+			ncp_dbg(1, "%pd2 failed\n", dentry);
+			goto out;
+		}
+		opmode = O_WRONLY;
+	}
+	finfo.access = opmode;
+	if (ncp_is_nfs_extras(server, finfo.volume)) {
+		finfo.i.nfs.mode = mode;
+		finfo.i.nfs.rdev = new_encode_dev(rdev);
+		if (ncp_modify_nfs_info(server, finfo.volume,
+					finfo.i.dirEntNum,
+					mode, new_encode_dev(rdev)) != 0)
+			goto out;
+	}
+
+	error = ncp_instantiate(dir, dentry, &finfo);
+out:
+	return error;
+}
+
+static int ncp_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool excl)
+{
+	return ncp_create_new(dir, dentry, mode, 0, 0);
+}
+
+static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct ncp_entry_info finfo;
+	struct ncp_server *server = NCP_SERVER(dir);
+	int error, len;
+	__u8 __name[NCP_MAXPATHLEN + 1];
+
+	ncp_dbg(1, "making %pd2\n", dentry);
+
+	ncp_age_dentry(server, dentry);
+	len = sizeof(__name);
+	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
+			   dentry->d_name.len, !ncp_preserve_case(dir));
+	if (error)
+		goto out;
+
+	error = ncp_open_create_file_or_subdir(server, dir, __name,
+					   OC_MODE_CREATE, aDIR,
+					   cpu_to_le16(0xffff),
+					   &finfo);
+	if (error == 0) {
+		if (ncp_is_nfs_extras(server, finfo.volume)) {
+			mode |= S_IFDIR;
+			finfo.i.nfs.mode = mode;
+			if (ncp_modify_nfs_info(server,
+						finfo.volume,
+						finfo.i.dirEntNum,
+						mode, 0) != 0)
+				goto out;
+		}
+		error = ncp_instantiate(dir, dentry, &finfo);
+	} else if (error > 0) {
+		error = -EACCES;
+	}
+out:
+	return error;
+}
+
+static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct ncp_server *server = NCP_SERVER(dir);
+	int error, result, len;
+	__u8 __name[NCP_MAXPATHLEN + 1];
+
+	ncp_dbg(1, "removing %pd2\n", dentry);
+
+	len = sizeof(__name);
+	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
+			   dentry->d_name.len, !ncp_preserve_case(dir));
+	if (error)
+		goto out;
+
+	result = ncp_del_file_or_subdir(server, dir, __name);
+	switch (result) {
+		case 0x00:
+			error = 0;
+			break;
+		case 0x85:	/* unauthorized to delete file */
+		case 0x8A:	/* unauthorized to delete file */
+			error = -EACCES;
+			break;
+		case 0x8F:
+		case 0x90:	/* read only */
+			error = -EPERM;
+			break;
+		case 0x9F:	/* in use by another client */
+			error = -EBUSY;
+			break;
+		case 0xA0:	/* directory not empty */
+			error = -ENOTEMPTY;
+			break;
+		case 0xFF:	/* someone deleted file */
+			error = -ENOENT;
+			break;
+		default:
+			error = result < 0 ? result : -EACCES;
+			break;
+       	}
+out:
+	return error;
+}
+
+static int ncp_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ncp_server *server;
+	int error;
+
+	server = NCP_SERVER(dir);
+	ncp_dbg(1, "unlinking %pd2\n", dentry);
+	
+	/*
+	 * Check whether to close the file ...
+	 */
+	if (inode) {
+		ncp_vdbg("closing file\n");
+		ncp_make_closed(inode);
+	}
+
+	error = ncp_del_file_or_subdir2(server, dentry);
+#ifdef CONFIG_NCPFS_STRONG
+	/* 9C is Invalid path.. It should be 8F, 90 - read only, but
+	   it is not :-( */
+	if ((error == 0x9C || error == 0x90) && server->m.flags & NCP_MOUNT_STRONG) { /* R/O */
+		error = ncp_force_unlink(dir, dentry);
+	}
+#endif
+	switch (error) {
+		case 0x00:
+			ncp_dbg(1, "removed %pd2\n", dentry);
+			break;
+		case 0x85:
+		case 0x8A:
+			error = -EACCES;
+			break;
+		case 0x8D:	/* some files in use */
+		case 0x8E:	/* all files in use */
+			error = -EBUSY;
+			break;
+		case 0x8F:	/* some read only */
+		case 0x90:	/* all read only */
+		case 0x9C:	/* !!! returned when in-use or read-only by NW4 */
+			error = -EPERM;
+			break;
+		case 0xFF:
+			error = -ENOENT;
+			break;
+		default:
+			error = error < 0 ? error : -EACCES;
+			break;
+	}
+	return error;
+}
+
+static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct ncp_server *server = NCP_SERVER(old_dir);
+	int error;
+	int old_len, new_len;
+	__u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1];
+
+	ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry);
+
+	ncp_age_dentry(server, old_dentry);
+	ncp_age_dentry(server, new_dentry);
+
+	old_len = sizeof(__old_name);
+	error = ncp_io2vol(server, __old_name, &old_len,
+			   old_dentry->d_name.name, old_dentry->d_name.len,
+			   !ncp_preserve_case(old_dir));
+	if (error)
+		goto out;
+
+	new_len = sizeof(__new_name);
+	error = ncp_io2vol(server, __new_name, &new_len,
+			   new_dentry->d_name.name, new_dentry->d_name.len,
+			   !ncp_preserve_case(new_dir));
+	if (error)
+		goto out;
+
+	error = ncp_ren_or_mov_file_or_subdir(server, old_dir, __old_name,
+						      new_dir, __new_name);
+#ifdef CONFIG_NCPFS_STRONG
+	if ((error == 0x90 || error == 0x8B || error == -EACCES) &&
+			server->m.flags & NCP_MOUNT_STRONG) {	/* RO */
+		error = ncp_force_rename(old_dir, old_dentry, __old_name,
+					 new_dir, new_dentry, __new_name);
+	}
+#endif
+	switch (error) {
+		case 0x00:
+			ncp_dbg(1, "renamed %pd -> %pd\n",
+				old_dentry, new_dentry);
+			break;
+		case 0x9E:
+			error = -ENAMETOOLONG;
+			break;
+		case 0xFF:
+			error = -ENOENT;
+			break;
+		default:
+			error = error < 0 ? error : -EACCES;
+			break;
+	}
+out:
+	return error;
+}
+
+static int ncp_mknod(struct inode * dir, struct dentry *dentry,
+		     umode_t mode, dev_t rdev)
+{
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+	if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
+		ncp_dbg(1, "mode = 0%ho\n", mode);
+		return ncp_create_new(dir, dentry, mode, rdev, 0);
+	}
+	return -EPERM; /* Strange, but true */
+}
+
+/* The following routines are taken directly from msdos-fs */
+
+/* Linear day numbers of the respective 1sts in non-leap years. */
+
+static int day_n[] =
+{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
+/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */
+
+static int utc2local(int time)
+{
+	return time - sys_tz.tz_minuteswest * 60;
+}
+
+static int local2utc(int time)
+{
+	return time + sys_tz.tz_minuteswest * 60;
+}
+
+/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
+int
+ncp_date_dos2unix(__le16 t, __le16 d)
+{
+	unsigned short time = le16_to_cpu(t), date = le16_to_cpu(d);
+	int month, year, secs;
+
+	/* first subtract and mask after that... Otherwise, if
+	   date == 0, bad things happen */
+	month = ((date >> 5) - 1) & 15;
+	year = date >> 9;
+	secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 +
+		86400 * ((date & 31) - 1 + day_n[month] + (year / 4) + 
+		year * 365 - ((year & 3) == 0 && month < 2 ? 1 : 0) + 3653);
+	/* days since 1.1.70 plus 80's leap day */
+	return local2utc(secs);
+}
+
+
+/* Convert linear UNIX date to a MS-DOS time/date pair. */
+void
+ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date)
+{
+	int day, year, nl_day, month;
+
+	unix_date = utc2local(unix_date);
+	*time = cpu_to_le16(
+		(unix_date % 60) / 2 + (((unix_date / 60) % 60) << 5) +
+		(((unix_date / 3600) % 24) << 11));
+	day = unix_date / 86400 - 3652;
+	year = day / 365;
+	if ((year + 3) / 4 + 365 * year > day)
+		year--;
+	day -= (year + 3) / 4 + 365 * year;
+	if (day == 59 && !(year & 3)) {
+		nl_day = day;
+		month = 2;
+	} else {
+		nl_day = (year & 3) || day <= 59 ? day : day - 1;
+		for (month = 1; month < 12; month++)
+			if (day_n[month] > nl_day)
+				break;
+	}
+	*date = cpu_to_le16(nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9));
+}
diff --git a/kernel/fs/ncpfs/file.c b/kernel/fs/ncpfs/file.c
new file mode 100644
index 000000000..011324ce9
--- /dev/null
+++ b/kernel/fs/ncpfs/file.c
@@ -0,0 +1,262 @@
+/*
+ *  file.c
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *  Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm/uaccess.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+
+#include "ncp_fs.h"
+
+static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	return filemap_write_and_wait_range(file->f_mapping, start, end);
+}
+
+/*
+ * Open a file with the specified read/write mode.
+ */
+int ncp_make_open(struct inode *inode, int right)
+{
+	int error;
+	int access;
+
+	error = -EINVAL;
+	if (!inode) {
+		pr_err("%s: got NULL inode\n", __func__);
+		goto out;
+	}
+
+	ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n",
+		atomic_read(&NCP_FINFO(inode)->opened), 
+		NCP_FINFO(inode)->volNumber, 
+		NCP_FINFO(inode)->dirEntNum);
+	error = -EACCES;
+	mutex_lock(&NCP_FINFO(inode)->open_mutex);
+	if (!atomic_read(&NCP_FINFO(inode)->opened)) {
+		struct ncp_entry_info finfo;
+		int result;
+
+		/* tries max. rights */
+		finfo.access = O_RDWR;
+		result = ncp_open_create_file_or_subdir(NCP_SERVER(inode),
+					inode, NULL, OC_MODE_OPEN,
+					0, AR_READ | AR_WRITE, &finfo);
+		if (!result)
+			goto update;
+		/* RDWR did not succeeded, try readonly or writeonly as requested */
+		switch (right) {
+			case O_RDONLY:
+				finfo.access = O_RDONLY;
+				result = ncp_open_create_file_or_subdir(NCP_SERVER(inode),
+					inode, NULL, OC_MODE_OPEN,
+					0, AR_READ, &finfo);
+				break;
+			case O_WRONLY:
+				finfo.access = O_WRONLY;
+				result = ncp_open_create_file_or_subdir(NCP_SERVER(inode),
+					inode, NULL, OC_MODE_OPEN,
+					0, AR_WRITE, &finfo);
+				break;
+		}
+		if (result) {
+			ncp_vdbg("failed, result=%d\n", result);
+			goto out_unlock;
+		}
+		/*
+		 * Update the inode information.
+		 */
+	update:
+		ncp_update_inode(inode, &finfo);
+		atomic_set(&NCP_FINFO(inode)->opened, 1);
+	}
+
+	access = NCP_FINFO(inode)->access;
+	ncp_vdbg("file open, access=%x\n", access);
+	if (access == right || access == O_RDWR) {
+		atomic_inc(&NCP_FINFO(inode)->opened);
+		error = 0;
+	}
+
+out_unlock:
+	mutex_unlock(&NCP_FINFO(inode)->open_mutex);
+out:
+	return error;
+}
+
+static ssize_t
+ncp_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	size_t already_read = 0;
+	off_t pos = iocb->ki_pos;
+	size_t bufsize;
+	int error;
+	void *freepage;
+	size_t freelen;
+
+	ncp_dbg(1, "enter %pD2\n", file);
+
+	if (!iov_iter_count(to))
+		return 0;
+	if (pos > inode->i_sb->s_maxbytes)
+		return 0;
+	iov_iter_truncate(to, inode->i_sb->s_maxbytes - pos);
+
+	error = ncp_make_open(inode, O_RDONLY);
+	if (error) {
+		ncp_dbg(1, "open failed, error=%d\n", error);
+		return error;
+	}
+
+	bufsize = NCP_SERVER(inode)->buffer_size;
+
+	error = -EIO;
+	freelen = ncp_read_bounce_size(bufsize);
+	freepage = vmalloc(freelen);
+	if (!freepage)
+		goto outrel;
+	error = 0;
+	/* First read in as much as possible for each bufsize. */
+	while (iov_iter_count(to)) {
+		int read_this_time;
+		size_t to_read = min_t(size_t,
+				     bufsize - (pos % bufsize),
+				     iov_iter_count(to));
+
+		error = ncp_read_bounce(NCP_SERVER(inode),
+			 	NCP_FINFO(inode)->file_handle,
+				pos, to_read, to, &read_this_time, 
+				freepage, freelen);
+		if (error) {
+			error = -EIO;	/* NW errno -> Linux errno */
+			break;
+		}
+		pos += read_this_time;
+		already_read += read_this_time;
+
+		if (read_this_time != to_read)
+			break;
+	}
+	vfree(freepage);
+
+	iocb->ki_pos = pos;
+
+	file_accessed(file);
+
+	ncp_dbg(1, "exit %pD2\n", file);
+outrel:
+	ncp_inode_close(inode);		
+	return already_read ? already_read : error;
+}
+
+static ssize_t
+ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	size_t already_written = 0;
+	size_t bufsize;
+	int errno;
+	void *bouncebuffer;
+	off_t pos;
+
+	ncp_dbg(1, "enter %pD2\n", file);
+	errno = generic_write_checks(iocb, from);
+	if (errno <= 0)
+		return errno;
+
+	errno = ncp_make_open(inode, O_WRONLY);
+	if (errno) {
+		ncp_dbg(1, "open failed, error=%d\n", errno);
+		return errno;
+	}
+	bufsize = NCP_SERVER(inode)->buffer_size;
+
+	errno = file_update_time(file);
+	if (errno)
+		goto outrel;
+
+	bouncebuffer = vmalloc(bufsize);
+	if (!bouncebuffer) {
+		errno = -EIO;	/* -ENOMEM */
+		goto outrel;
+	}
+	pos = iocb->ki_pos;
+	while (iov_iter_count(from)) {
+		int written_this_time;
+		size_t to_write = min_t(size_t,
+				      bufsize - (pos % bufsize),
+				      iov_iter_count(from));
+
+		if (copy_from_iter(bouncebuffer, to_write, from) != to_write) {
+			errno = -EFAULT;
+			break;
+		}
+		if (ncp_write_kernel(NCP_SERVER(inode), 
+		    NCP_FINFO(inode)->file_handle,
+		    pos, to_write, bouncebuffer, &written_this_time) != 0) {
+			errno = -EIO;
+			break;
+		}
+		pos += written_this_time;
+		already_written += written_this_time;
+
+		if (written_this_time != to_write)
+			break;
+	}
+	vfree(bouncebuffer);
+
+	iocb->ki_pos = pos;
+
+	if (pos > i_size_read(inode)) {
+		mutex_lock(&inode->i_mutex);
+		if (pos > i_size_read(inode))
+			i_size_write(inode, pos);
+		mutex_unlock(&inode->i_mutex);
+	}
+	ncp_dbg(1, "exit %pD2\n", file);
+outrel:
+	ncp_inode_close(inode);		
+	return already_written ? already_written : errno;
+}
+
+static int ncp_release(struct inode *inode, struct file *file) {
+	if (ncp_make_closed(inode)) {
+		ncp_dbg(1, "failed to close\n");
+	}
+	return 0;
+}
+
+const struct file_operations ncp_file_operations =
+{
+	.llseek		= generic_file_llseek,
+	.read_iter	= ncp_file_read_iter,
+	.write_iter	= ncp_file_write_iter,
+	.unlocked_ioctl	= ncp_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ncp_compat_ioctl,
+#endif
+	.mmap		= ncp_mmap,
+	.release	= ncp_release,
+	.fsync		= ncp_fsync,
+};
+
+const struct inode_operations ncp_file_inode_operations =
+{
+	.setattr	= ncp_notify_change,
+};
diff --git a/kernel/fs/ncpfs/getopt.c b/kernel/fs/ncpfs/getopt.c
new file mode 100644
index 000000000..344889cd1
--- /dev/null
+++ b/kernel/fs/ncpfs/getopt.c
@@ -0,0 +1,75 @@
+/*
+ * getopt.c
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/errno.h>
+
+#include "getopt.h"
+
+/**
+ *	ncp_getopt - option parser
+ *	@caller: name of the caller, for error messages
+ *	@options: the options string
+ *	@opts: an array of &struct option entries controlling parser operations
+ *	@optopt: output; will contain the current option
+ *	@optarg: output; will contain the value (if one exists)
+ *	@value: output; may be NULL; will be overwritten with the integer value
+ *		of the current argument.
+ *
+ *	Helper to parse options on the format used by mount ("a=b,c=d,e,f").
+ *	Returns opts->val if a matching entry in the 'opts' array is found,
+ *	0 when no more tokens are found, -1 if an error is encountered.
+ */
+int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts,
+	       char **optopt, char **optarg, unsigned long *value)
+{
+	char *token;
+	char *val;
+
+	do {
+		if ((token = strsep(options, ",")) == NULL)
+			return 0;
+	} while (*token == '\0');
+	if (optopt)
+		*optopt = token;
+
+	if ((val = strchr (token, '=')) != NULL) {
+		*val++ = 0;
+	}
+	*optarg = val;
+	for (; opts->name; opts++) {
+		if (!strcmp(opts->name, token)) {
+			if (!val) {
+				if (opts->has_arg & OPT_NOPARAM) {
+					return opts->val;
+				}
+				pr_info("%s: the %s option requires an argument\n",
+					caller, token);
+				return -EINVAL;
+			}
+			if (opts->has_arg & OPT_INT) {
+				int rc = kstrtoul(val, 0, value);
+
+				if (rc) {
+					pr_info("%s: invalid numeric value in %s=%s\n",
+						caller, token, val);
+					return rc;
+				}
+				return opts->val;
+			}
+			if (opts->has_arg & OPT_STRING) {
+				return opts->val;
+			}
+			pr_info("%s: unexpected argument %s to the %s option\n",
+				caller, val, token);
+			return -EINVAL;
+		}
+	}
+	pr_info("%s: Unrecognized mount option %s\n", caller, token);
+	return -EOPNOTSUPP;
+}
diff --git a/kernel/fs/ncpfs/getopt.h b/kernel/fs/ncpfs/getopt.h
new file mode 100644
index 000000000..cccc007dc
--- /dev/null
+++ b/kernel/fs/ncpfs/getopt.h
@@ -0,0 +1,16 @@
+#ifndef _LINUX_GETOPT_H
+#define _LINUX_GETOPT_H
+
+#define OPT_NOPARAM	1
+#define OPT_INT		2
+#define OPT_STRING	4
+struct ncp_option {
+	const char *name;
+	unsigned int has_arg;
+	int val;
+};
+
+extern int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts,
+		      char **optopt, char **optarg, unsigned long *value);
+
+#endif /* _LINUX_GETOPT_H */
diff --git a/kernel/fs/ncpfs/inode.c b/kernel/fs/ncpfs/inode.c
new file mode 100644
index 000000000..9605a2f63
--- /dev/null
+++ b/kernel/fs/ncpfs/inode.c
@@ -0,0 +1,1072 @@
+/*
+ *  inode.c
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *  Modified for big endian by J.F. Chadima and David S. Miller
+ *  Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache
+ *  Modified 1998 Wolfram Pienkoss for NLS
+ *  Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+
+#include <net/sock.h>
+
+#include "ncp_fs.h"
+#include "getopt.h"
+
+#define NCP_DEFAULT_FILE_MODE 0600
+#define NCP_DEFAULT_DIR_MODE 0700
+#define NCP_DEFAULT_TIME_OUT 10
+#define NCP_DEFAULT_RETRY_COUNT 20
+
+static void ncp_evict_inode(struct inode *);
+static void ncp_put_super(struct super_block *);
+static int  ncp_statfs(struct dentry *, struct kstatfs *);
+static int  ncp_show_options(struct seq_file *, struct dentry *);
+
+static struct kmem_cache * ncp_inode_cachep;
+
+static struct inode *ncp_alloc_inode(struct super_block *sb)
+{
+	struct ncp_inode_info *ei;
+	ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void ncp_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
+}
+
+static void ncp_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ncp_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
+
+	mutex_init(&ei->open_mutex);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int init_inodecache(void)
+{
+	ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
+					     sizeof(struct ncp_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (ncp_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ncp_inode_cachep);
+}
+
+static int ncp_remount(struct super_block *sb, int *flags, char* data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_NODIRATIME;
+	return 0;
+}
+
+static const struct super_operations ncp_sops =
+{
+	.alloc_inode	= ncp_alloc_inode,
+	.destroy_inode	= ncp_destroy_inode,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= ncp_evict_inode,
+	.put_super	= ncp_put_super,
+	.statfs		= ncp_statfs,
+	.remount_fs	= ncp_remount,
+	.show_options	= ncp_show_options,
+};
+
+/*
+ * Fill in the ncpfs-specific information in the inode.
+ */
+static void ncp_update_dirent(struct inode *inode, struct ncp_entry_info *nwinfo)
+{
+	NCP_FINFO(inode)->DosDirNum = nwinfo->i.DosDirNum;
+	NCP_FINFO(inode)->dirEntNum = nwinfo->i.dirEntNum;
+	NCP_FINFO(inode)->volNumber = nwinfo->volume;
+}
+
+void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
+{
+	ncp_update_dirent(inode, nwinfo);
+	NCP_FINFO(inode)->nwattr = nwinfo->i.attributes;
+	NCP_FINFO(inode)->access = nwinfo->access;
+	memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle,
+			sizeof(nwinfo->file_handle));
+	ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n",
+		nwinfo->i.entryName, NCP_FINFO(inode)->volNumber,
+		NCP_FINFO(inode)->dirEntNum);
+}
+
+static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
+{
+	/* NFS namespace mode overrides others if it's set. */
+	ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode);
+	if (nwi->nfs.mode) {
+		/* XXX Security? */
+		inode->i_mode = nwi->nfs.mode;
+	}
+
+	inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
+
+	inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
+	inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
+	inode->i_atime.tv_sec = ncp_date_dos2unix(0, nwi->lastAccessDate);
+	inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+}
+
+static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
+{
+	struct nw_info_struct *nwi = &nwinfo->i;
+	struct ncp_server *server = NCP_SERVER(inode);
+
+	if (nwi->attributes & aDIR) {
+		inode->i_mode = server->m.dir_mode;
+		/* for directories dataStreamSize seems to be some
+		   Object ID ??? */
+		i_size_write(inode, NCP_BLOCK_SIZE);
+	} else {
+		u32 size;
+
+		inode->i_mode = server->m.file_mode;
+		size = le32_to_cpu(nwi->dataStreamSize);
+		i_size_write(inode, size);
+#ifdef CONFIG_NCPFS_EXTRAS
+		if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) 
+		 && (nwi->attributes & aSHARED)) {
+			switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
+				case aHIDDEN:
+					if (server->m.flags & NCP_MOUNT_SYMLINKS) {
+						if (/* (size >= NCP_MIN_SYMLINK_SIZE)
+						 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
+							inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
+							NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
+							break;
+						}
+					}
+					/* FALLTHROUGH */
+				case 0:
+					if (server->m.flags & NCP_MOUNT_EXTRAS)
+						inode->i_mode |= S_IRUGO;
+					break;
+				case aSYSTEM:
+					if (server->m.flags & NCP_MOUNT_EXTRAS)
+						inode->i_mode |= (inode->i_mode >> 2) & S_IXUGO;
+					break;
+				/* case aSYSTEM|aHIDDEN: */
+				default:
+					/* reserved combination */
+					break;
+			}
+		}
+#endif
+	}
+	if (nwi->attributes & aRONLY) inode->i_mode &= ~S_IWUGO;
+}
+
+void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
+{
+	NCP_FINFO(inode)->flags = 0;
+	if (!atomic_read(&NCP_FINFO(inode)->opened)) {
+		NCP_FINFO(inode)->nwattr = nwinfo->i.attributes;
+		ncp_update_attrs(inode, nwinfo);
+	}
+
+	ncp_update_dates(inode, &nwinfo->i);
+	ncp_update_dirent(inode, nwinfo);
+}
+
+/*
+ * Fill in the inode based on the ncp_entry_info structure.  Used only for brand new inodes.
+ */
+static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
+{
+	struct ncp_server *server = NCP_SERVER(inode);
+
+	NCP_FINFO(inode)->flags = 0;
+	
+	ncp_update_attrs(inode, nwinfo);
+
+	ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode);
+
+	set_nlink(inode, 1);
+	inode->i_uid = server->m.uid;
+	inode->i_gid = server->m.gid;
+
+	ncp_update_dates(inode, &nwinfo->i);
+	ncp_update_inode(inode, nwinfo);
+}
+
+#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
+static const struct inode_operations ncp_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= ncp_notify_change,
+};
+#endif
+
+/*
+ * Get a new inode.
+ */
+struct inode * 
+ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
+{
+	struct inode *inode;
+
+	if (info == NULL) {
+		pr_err("%s: info is NULL\n", __func__);
+		return NULL;
+	}
+
+	inode = new_inode(sb);
+	if (inode) {
+		atomic_set(&NCP_FINFO(inode)->opened, info->opened);
+
+		inode->i_ino = info->ino;
+		ncp_set_attr(inode, info);
+		if (S_ISREG(inode->i_mode)) {
+			inode->i_op = &ncp_file_inode_operations;
+			inode->i_fop = &ncp_file_operations;
+		} else if (S_ISDIR(inode->i_mode)) {
+			inode->i_op = &ncp_dir_inode_operations;
+			inode->i_fop = &ncp_dir_operations;
+#ifdef CONFIG_NCPFS_NFS_NS
+		} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+			init_special_inode(inode, inode->i_mode,
+				new_decode_dev(info->i.nfs.rdev));
+#endif
+#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
+		} else if (S_ISLNK(inode->i_mode)) {
+			inode->i_op = &ncp_symlink_inode_operations;
+			inode->i_data.a_ops = &ncp_symlink_aops;
+#endif
+		} else {
+			make_bad_inode(inode);
+		}
+		insert_inode_hash(inode);
+	} else
+		pr_err("%s: iget failed!\n", __func__);
+	return inode;
+}
+
+static void
+ncp_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+
+	if (S_ISDIR(inode->i_mode)) {
+		ncp_dbg(2, "put directory %ld\n", inode->i_ino);
+	}
+
+	if (ncp_make_closed(inode) != 0) {
+		/* We can't do anything but complain. */
+		pr_err("%s: could not close\n", __func__);
+	}
+}
+
+static void ncp_stop_tasks(struct ncp_server *server) {
+	struct sock* sk = server->ncp_sock->sk;
+
+	lock_sock(sk);
+	sk->sk_error_report = server->error_report;
+	sk->sk_data_ready   = server->data_ready;
+	sk->sk_write_space  = server->write_space;
+	release_sock(sk);
+	del_timer_sync(&server->timeout_tm);
+
+	flush_work(&server->rcv.tq);
+	if (sk->sk_socket->type == SOCK_STREAM)
+		flush_work(&server->tx.tq);
+	else
+		flush_work(&server->timeout_tq);
+}
+
+static int  ncp_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct ncp_server *server = NCP_SBP(root->d_sb);
+	unsigned int tmp;
+
+	if (!uid_eq(server->m.uid, GLOBAL_ROOT_UID))
+		seq_printf(seq, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, server->m.uid));
+	if (!gid_eq(server->m.gid, GLOBAL_ROOT_GID))
+		seq_printf(seq, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, server->m.gid));
+	if (!uid_eq(server->m.mounted_uid, GLOBAL_ROOT_UID))
+		seq_printf(seq, ",owner=%u",
+			   from_kuid_munged(&init_user_ns, server->m.mounted_uid));
+	tmp = server->m.file_mode & S_IALLUGO;
+	if (tmp != NCP_DEFAULT_FILE_MODE)
+		seq_printf(seq, ",mode=0%o", tmp);
+	tmp = server->m.dir_mode & S_IALLUGO;
+	if (tmp != NCP_DEFAULT_DIR_MODE)
+		seq_printf(seq, ",dirmode=0%o", tmp);
+	if (server->m.time_out != NCP_DEFAULT_TIME_OUT * HZ / 100) {
+		tmp = server->m.time_out * 100 / HZ;
+		seq_printf(seq, ",timeout=%u", tmp);
+	}
+	if (server->m.retry_count != NCP_DEFAULT_RETRY_COUNT)
+		seq_printf(seq, ",retry=%u", server->m.retry_count);
+	if (server->m.flags != 0)
+		seq_printf(seq, ",flags=%lu", server->m.flags);
+	if (server->m.wdog_pid != NULL)
+		seq_printf(seq, ",wdogpid=%u", pid_vnr(server->m.wdog_pid));
+
+	return 0;
+}
+
+static const struct ncp_option ncp_opts[] = {
+	{ "uid",	OPT_INT,	'u' },
+	{ "gid",	OPT_INT,	'g' },
+	{ "owner",	OPT_INT,	'o' },
+	{ "mode",	OPT_INT,	'm' },
+	{ "dirmode",	OPT_INT,	'd' },
+	{ "timeout",	OPT_INT,	't' },
+	{ "retry",	OPT_INT,	'r' },
+	{ "flags",	OPT_INT,	'f' },
+	{ "wdogpid",	OPT_INT,	'w' },
+	{ "ncpfd",	OPT_INT,	'n' },
+	{ "infofd",	OPT_INT,	'i' },	/* v5 */
+	{ "version",	OPT_INT,	'v' },
+	{ NULL,		0,		0 } };
+
+static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) {
+	int optval;
+	char *optarg;
+	unsigned long optint;
+	int version = 0;
+	int ret;
+
+	data->flags = 0;
+	data->int_flags = 0;
+	data->mounted_uid = GLOBAL_ROOT_UID;
+	data->wdog_pid = NULL;
+	data->ncp_fd = ~0;
+	data->time_out = NCP_DEFAULT_TIME_OUT;
+	data->retry_count = NCP_DEFAULT_RETRY_COUNT;
+	data->uid = GLOBAL_ROOT_UID;
+	data->gid = GLOBAL_ROOT_GID;
+	data->file_mode = NCP_DEFAULT_FILE_MODE;
+	data->dir_mode = NCP_DEFAULT_DIR_MODE;
+	data->info_fd = -1;
+	data->mounted_vol[0] = 0;
+	
+	while ((optval = ncp_getopt("ncpfs", &options, ncp_opts, NULL, &optarg, &optint)) != 0) {
+		ret = optval;
+		if (ret < 0)
+			goto err;
+		switch (optval) {
+			case 'u':
+				data->uid = make_kuid(current_user_ns(), optint);
+				if (!uid_valid(data->uid)) {
+					ret = -EINVAL;
+					goto err;
+				}
+				break;
+			case 'g':
+				data->gid = make_kgid(current_user_ns(), optint);
+				if (!gid_valid(data->gid)) {
+					ret = -EINVAL;
+					goto err;
+				}
+				break;
+			case 'o':
+				data->mounted_uid = make_kuid(current_user_ns(), optint);
+				if (!uid_valid(data->mounted_uid)) {
+					ret = -EINVAL;
+					goto err;
+				}
+				break;
+			case 'm':
+				data->file_mode = optint;
+				break;
+			case 'd':
+				data->dir_mode = optint;
+				break;
+			case 't':
+				data->time_out = optint;
+				break;
+			case 'r':
+				data->retry_count = optint;
+				break;
+			case 'f':
+				data->flags = optint;
+				break;
+			case 'w':
+				data->wdog_pid = find_get_pid(optint);
+				break;
+			case 'n':
+				data->ncp_fd = optint;
+				break;
+			case 'i':
+				data->info_fd = optint;
+				break;
+			case 'v':
+				ret = -ECHRNG;
+				if (optint < NCP_MOUNT_VERSION_V4)
+					goto err;
+				if (optint > NCP_MOUNT_VERSION_V5)
+					goto err;
+				version = optint;
+				break;
+			
+		}
+	}
+	return 0;
+err:
+	put_pid(data->wdog_pid);
+	data->wdog_pid = NULL;
+	return ret;
+}
+
+static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
+{
+	struct ncp_mount_data_kernel data;
+	struct ncp_server *server;
+	struct inode *root_inode;
+	struct socket *sock;
+	int error;
+	int default_bufsize;
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
+	int options;
+#endif
+	struct ncp_entry_info finfo;
+
+	memset(&data, 0, sizeof(data));
+	server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
+	if (!server)
+		return -ENOMEM;
+	sb->s_fs_info = server;
+
+	error = -EFAULT;
+	if (raw_data == NULL)
+		goto out;
+	switch (*(int*)raw_data) {
+		case NCP_MOUNT_VERSION:
+			{
+				struct ncp_mount_data* md = (struct ncp_mount_data*)raw_data;
+
+				data.flags = md->flags;
+				data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE;
+				data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid);
+				data.wdog_pid = find_get_pid(md->wdog_pid);
+				data.ncp_fd = md->ncp_fd;
+				data.time_out = md->time_out;
+				data.retry_count = md->retry_count;
+				data.uid = make_kuid(current_user_ns(), md->uid);
+				data.gid = make_kgid(current_user_ns(), md->gid);
+				data.file_mode = md->file_mode;
+				data.dir_mode = md->dir_mode;
+				data.info_fd = -1;
+				memcpy(data.mounted_vol, md->mounted_vol,
+					NCP_VOLNAME_LEN+1);
+			}
+			break;
+		case NCP_MOUNT_VERSION_V4:
+			{
+				struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
+
+				data.flags = md->flags;
+				data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid);
+				data.wdog_pid = find_get_pid(md->wdog_pid);
+				data.ncp_fd = md->ncp_fd;
+				data.time_out = md->time_out;
+				data.retry_count = md->retry_count;
+				data.uid = make_kuid(current_user_ns(), md->uid);
+				data.gid = make_kgid(current_user_ns(), md->gid);
+				data.file_mode = md->file_mode;
+				data.dir_mode = md->dir_mode;
+				data.info_fd = -1;
+			}
+			break;
+		default:
+			error = -ECHRNG;
+			if (memcmp(raw_data, "vers", 4) == 0) {
+				error = ncp_parse_options(&data, raw_data);
+			}
+			if (error)
+				goto out;
+			break;
+	}
+	error = -EINVAL;
+	if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) ||
+	    !gid_valid(data.gid))
+		goto out;
+	sock = sockfd_lookup(data.ncp_fd, &error);
+	if (!sock)
+		goto out;
+
+	if (sock->type == SOCK_STREAM)
+		default_bufsize = 0xF000;
+	else
+		default_bufsize = 1024;
+
+	sb->s_flags |= MS_NODIRATIME;	/* probably even noatime */
+	sb->s_maxbytes = 0xFFFFFFFFU;
+	sb->s_blocksize = 1024;	/* Eh...  Is this correct? */
+	sb->s_blocksize_bits = 10;
+	sb->s_magic = NCP_SUPER_MAGIC;
+	sb->s_op = &ncp_sops;
+	sb->s_d_op = &ncp_dentry_operations;
+	sb->s_bdi = &server->bdi;
+
+	server = NCP_SBP(sb);
+	memset(server, 0, sizeof(*server));
+
+	error = bdi_setup_and_register(&server->bdi, "ncpfs");
+	if (error)
+		goto out_fput;
+
+	server->ncp_sock = sock;
+	
+	if (data.info_fd != -1) {
+		struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
+		if (!info_sock)
+			goto out_bdi;
+		server->info_sock = info_sock;
+		error = -EBADFD;
+		if (info_sock->type != SOCK_STREAM)
+			goto out_fput2;
+	}
+
+/*	server->lock = 0;	*/
+	mutex_init(&server->mutex);
+	server->packet = NULL;
+/*	server->buffer_size = 0;	*/
+/*	server->conn_status = 0;	*/
+/*	server->root_dentry = NULL;	*/
+/*	server->root_setuped = 0;	*/
+	mutex_init(&server->root_setup_lock);
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
+/*	server->sign_wanted = 0;	*/
+/*	server->sign_active = 0;	*/
+#endif
+	init_rwsem(&server->auth_rwsem);
+	server->auth.auth_type = NCP_AUTH_NONE;
+/*	server->auth.object_name_len = 0;	*/
+/*	server->auth.object_name = NULL;	*/
+/*	server->auth.object_type = 0;		*/
+/*	server->priv.len = 0;			*/
+/*	server->priv.data = NULL;		*/
+
+	server->m = data;
+	/* Although anything producing this is buggy, it happens
+	   now because of PATH_MAX changes.. */
+	if (server->m.time_out < 1) {
+		server->m.time_out = 10;
+		pr_info("You need to recompile your ncpfs utils..\n");
+	}
+	server->m.time_out = server->m.time_out * HZ / 100;
+	server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG;
+	server->m.dir_mode = (server->m.dir_mode & S_IRWXUGO) | S_IFDIR;
+
+#ifdef CONFIG_NCPFS_NLS
+	/* load the default NLS charsets */
+	server->nls_vol = load_nls_default();
+	server->nls_io = load_nls_default();
+#endif /* CONFIG_NCPFS_NLS */
+
+	atomic_set(&server->dentry_ttl, 0);	/* no caching */
+
+	INIT_LIST_HEAD(&server->tx.requests);
+	mutex_init(&server->rcv.creq_mutex);
+	server->tx.creq		= NULL;
+	server->rcv.creq	= NULL;
+
+	init_timer(&server->timeout_tm);
+#undef NCP_PACKET_SIZE
+#define NCP_PACKET_SIZE 131072
+	error = -ENOMEM;
+	server->packet_size = NCP_PACKET_SIZE;
+	server->packet = vmalloc(NCP_PACKET_SIZE);
+	if (server->packet == NULL)
+		goto out_nls;
+	server->txbuf = vmalloc(NCP_PACKET_SIZE);
+	if (server->txbuf == NULL)
+		goto out_packet;
+	server->rxbuf = vmalloc(NCP_PACKET_SIZE);
+	if (server->rxbuf == NULL)
+		goto out_txbuf;
+
+	lock_sock(sock->sk);
+	server->data_ready	= sock->sk->sk_data_ready;
+	server->write_space	= sock->sk->sk_write_space;
+	server->error_report	= sock->sk->sk_error_report;
+	sock->sk->sk_user_data	= server;
+	sock->sk->sk_data_ready	  = ncp_tcp_data_ready;
+	sock->sk->sk_error_report = ncp_tcp_error_report;
+	if (sock->type == SOCK_STREAM) {
+		server->rcv.ptr = (unsigned char*)&server->rcv.buf;
+		server->rcv.len = 10;
+		server->rcv.state = 0;
+		INIT_WORK(&server->rcv.tq, ncp_tcp_rcv_proc);
+		INIT_WORK(&server->tx.tq, ncp_tcp_tx_proc);
+		sock->sk->sk_write_space = ncp_tcp_write_space;
+	} else {
+		INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc);
+		INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc);
+		server->timeout_tm.data = (unsigned long)server;
+		server->timeout_tm.function = ncpdgram_timeout_call;
+	}
+	release_sock(sock->sk);
+
+	ncp_lock_server(server);
+	error = ncp_connect(server);
+	ncp_unlock_server(server);
+	if (error < 0)
+		goto out_rxbuf;
+	ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb));
+
+	error = -EMSGSIZE;	/* -EREMOTESIDEINCOMPATIBLE */
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
+	if (ncp_negotiate_size_and_options(server, default_bufsize,
+		NCP_DEFAULT_OPTIONS, &(server->buffer_size), &options) == 0)
+	{
+		if (options != NCP_DEFAULT_OPTIONS)
+		{
+			if (ncp_negotiate_size_and_options(server, 
+				default_bufsize,
+				options & 2, 
+				&(server->buffer_size), &options) != 0)
+				
+			{
+				goto out_disconnect;
+			}
+		}
+		ncp_lock_server(server);
+		if (options & 2)
+			server->sign_wanted = 1;
+		ncp_unlock_server(server);
+	}
+	else 
+#endif	/* CONFIG_NCPFS_PACKET_SIGNING */
+	if (ncp_negotiate_buffersize(server, default_bufsize,
+  				     &(server->buffer_size)) != 0)
+		goto out_disconnect;
+	ncp_dbg(1, "bufsize = %d\n", server->buffer_size);
+
+	memset(&finfo, 0, sizeof(finfo));
+	finfo.i.attributes	= aDIR;
+	finfo.i.dataStreamSize	= 0;	/* ignored */
+	finfo.i.dirEntNum	= 0;
+	finfo.i.DosDirNum	= 0;
+#ifdef CONFIG_NCPFS_SMALLDOS
+	finfo.i.NSCreator	= NW_NS_DOS;
+#endif
+	finfo.volume		= NCP_NUMBER_OF_VOLUMES;
+	/* set dates of mountpoint to Jan 1, 1986; 00:00 */
+	finfo.i.creationTime	= finfo.i.modifyTime
+				= cpu_to_le16(0x0000);
+	finfo.i.creationDate	= finfo.i.modifyDate
+				= finfo.i.lastAccessDate
+				= cpu_to_le16(0x0C21);
+	finfo.i.nameLen		= 0;
+	finfo.i.entryName[0]	= '\0';
+
+	finfo.opened		= 0;
+	finfo.ino		= 2;	/* tradition */
+
+	server->name_space[finfo.volume] = NW_NS_DOS;
+
+	error = -ENOMEM;
+        root_inode = ncp_iget(sb, &finfo);
+        if (!root_inode)
+		goto out_disconnect;
+	ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
+	sb->s_root = d_make_root(root_inode);
+        if (!sb->s_root)
+		goto out_disconnect;
+	return 0;
+
+out_disconnect:
+	ncp_lock_server(server);
+	ncp_disconnect(server);
+	ncp_unlock_server(server);
+out_rxbuf:
+	ncp_stop_tasks(server);
+	vfree(server->rxbuf);
+out_txbuf:
+	vfree(server->txbuf);
+out_packet:
+	vfree(server->packet);
+out_nls:
+#ifdef CONFIG_NCPFS_NLS
+	unload_nls(server->nls_io);
+	unload_nls(server->nls_vol);
+#endif
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);
+out_fput2:
+	if (server->info_sock)
+		sockfd_put(server->info_sock);
+out_bdi:
+	bdi_destroy(&server->bdi);
+out_fput:
+	sockfd_put(sock);
+out:
+	put_pid(data.wdog_pid);
+	sb->s_fs_info = NULL;
+	kfree(server);
+	return error;
+}
+
+static void delayed_free(struct rcu_head *p)
+{
+	struct ncp_server *server = container_of(p, struct ncp_server, rcu);
+#ifdef CONFIG_NCPFS_NLS
+	/* unload the NLS charsets */
+	unload_nls(server->nls_vol);
+	unload_nls(server->nls_io);
+#endif /* CONFIG_NCPFS_NLS */
+	kfree(server);
+}
+
+static void ncp_put_super(struct super_block *sb)
+{
+	struct ncp_server *server = NCP_SBP(sb);
+
+	ncp_lock_server(server);
+	ncp_disconnect(server);
+	ncp_unlock_server(server);
+
+	ncp_stop_tasks(server);
+
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);
+
+	if (server->info_sock)
+		sockfd_put(server->info_sock);
+	sockfd_put(server->ncp_sock);
+	kill_pid(server->m.wdog_pid, SIGTERM, 1);
+	put_pid(server->m.wdog_pid);
+
+	bdi_destroy(&server->bdi);
+	kfree(server->priv.data);
+	kfree(server->auth.object_name);
+	vfree(server->rxbuf);
+	vfree(server->txbuf);
+	vfree(server->packet);
+	call_rcu(&server->rcu, delayed_free);
+}
+
+static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct dentry* d;
+	struct inode* i;
+	struct ncp_inode_info* ni;
+	struct ncp_server* s;
+	struct ncp_volume_info vi;
+	struct super_block *sb = dentry->d_sb;
+	int err;
+	__u8 dh;
+	
+	d = sb->s_root;
+	if (!d) {
+		goto dflt;
+	}
+	i = d_inode(d);
+	if (!i) {
+		goto dflt;
+	}
+	ni = NCP_FINFO(i);
+	if (!ni) {
+		goto dflt;
+	}
+	s = NCP_SBP(sb);
+	if (!s) {
+		goto dflt;
+	}
+	if (!s->m.mounted_vol[0]) {
+		goto dflt;
+	}
+
+	err = ncp_dirhandle_alloc(s, ni->volNumber, ni->DosDirNum, &dh);
+	if (err) {
+		goto dflt;
+	}
+	err = ncp_get_directory_info(s, dh, &vi);
+	ncp_dirhandle_free(s, dh);
+	if (err) {
+		goto dflt;
+	}
+	buf->f_type = NCP_SUPER_MAGIC;
+	buf->f_bsize = vi.sectors_per_block * 512;
+	buf->f_blocks = vi.total_blocks;
+	buf->f_bfree = vi.free_blocks;
+	buf->f_bavail = vi.free_blocks;
+	buf->f_files = vi.total_dir_entries;
+	buf->f_ffree = vi.available_dir_entries;
+	buf->f_namelen = 12;
+	return 0;
+
+	/* We cannot say how much disk space is left on a mounted
+	   NetWare Server, because free space is distributed over
+	   volumes, and the current user might have disk quotas. So
+	   free space is not that simple to determine. Our decision
+	   here is to err conservatively. */
+
+dflt:;
+	buf->f_type = NCP_SUPER_MAGIC;
+	buf->f_bsize = NCP_BLOCK_SIZE;
+	buf->f_blocks = 0;
+	buf->f_bfree = 0;
+	buf->f_bavail = 0;
+	buf->f_namelen = 12;
+	return 0;
+}
+
+int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int result = 0;
+	__le32 info_mask;
+	struct nw_modify_dos_info info;
+	struct ncp_server *server;
+
+	result = -EIO;
+
+	server = NCP_SERVER(inode);
+	if (!server)	/* How this could happen? */
+		goto out;
+
+	result = -EPERM;
+	if (IS_DEADDIR(d_inode(dentry)))
+		goto out;
+
+	/* ageing the dentry to force validation */
+	ncp_age_dentry(server, dentry);
+
+	result = inode_change_ok(inode, attr);
+	if (result < 0)
+		goto out;
+
+	result = -EPERM;
+	if ((attr->ia_valid & ATTR_UID) && !uid_eq(attr->ia_uid, server->m.uid))
+		goto out;
+
+	if ((attr->ia_valid & ATTR_GID) && !gid_eq(attr->ia_gid, server->m.gid))
+		goto out;
+
+	if (((attr->ia_valid & ATTR_MODE) &&
+	     (attr->ia_mode &
+	      ~(S_IFREG | S_IFDIR | S_IRWXUGO))))
+		goto out;
+
+	info_mask = 0;
+	memset(&info, 0, sizeof(info));
+
+#if 1 
+        if ((attr->ia_valid & ATTR_MODE) != 0)
+        {
+		umode_t newmode = attr->ia_mode;
+
+		info_mask |= DM_ATTRIBUTES;
+
+                if (S_ISDIR(inode->i_mode)) {
+                	newmode &= server->m.dir_mode;
+		} else {
+#ifdef CONFIG_NCPFS_EXTRAS			
+			if (server->m.flags & NCP_MOUNT_EXTRAS) {
+				/* any non-default execute bit set */
+				if (newmode & ~server->m.file_mode & S_IXUGO)
+					info.attributes |= aSHARED | aSYSTEM;
+				/* read for group/world and not in default file_mode */
+				else if (newmode & ~server->m.file_mode & S_IRUGO)
+					info.attributes |= aSHARED;
+			} else
+#endif
+				newmode &= server->m.file_mode;			
+                }
+                if (newmode & S_IWUGO)
+                	info.attributes &= ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT);
+                else
+			info.attributes |=  (aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT);
+
+#ifdef CONFIG_NCPFS_NFS_NS
+		if (ncp_is_nfs_extras(server, NCP_FINFO(inode)->volNumber)) {
+			result = ncp_modify_nfs_info(server,
+						     NCP_FINFO(inode)->volNumber,
+						     NCP_FINFO(inode)->dirEntNum,
+						     attr->ia_mode, 0);
+			if (result != 0)
+				goto out;
+			info.attributes &= ~(aSHARED | aSYSTEM);
+			{
+				/* mark partial success */
+				struct iattr tmpattr;
+				
+				tmpattr.ia_valid = ATTR_MODE;
+				tmpattr.ia_mode = attr->ia_mode;
+
+				setattr_copy(inode, &tmpattr);
+				mark_inode_dirty(inode);
+			}
+		}
+#endif
+        }
+#endif
+
+	/* Do SIZE before attributes, otherwise mtime together with size does not work...
+	 */
+	if ((attr->ia_valid & ATTR_SIZE) != 0) {
+		int written;
+
+		ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size);
+
+		if ((result = ncp_make_open(inode, O_WRONLY)) < 0) {
+			result = -EACCES;
+			goto out;
+		}
+		ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle,
+			  attr->ia_size, 0, "", &written);
+
+		/* According to ndir, the changes only take effect after
+		   closing the file */
+		ncp_inode_close(inode);
+		result = ncp_make_closed(inode);
+		if (result)
+			goto out;
+
+		if (attr->ia_size != i_size_read(inode)) {
+			truncate_setsize(inode, attr->ia_size);
+			mark_inode_dirty(inode);
+		}
+	}
+	if ((attr->ia_valid & ATTR_CTIME) != 0) {
+		info_mask |= (DM_CREATE_TIME | DM_CREATE_DATE);
+		ncp_date_unix2dos(attr->ia_ctime.tv_sec,
+			     &info.creationTime, &info.creationDate);
+	}
+	if ((attr->ia_valid & ATTR_MTIME) != 0) {
+		info_mask |= (DM_MODIFY_TIME | DM_MODIFY_DATE);
+		ncp_date_unix2dos(attr->ia_mtime.tv_sec,
+				  &info.modifyTime, &info.modifyDate);
+	}
+	if ((attr->ia_valid & ATTR_ATIME) != 0) {
+		__le16 dummy;
+		info_mask |= (DM_LAST_ACCESS_DATE);
+		ncp_date_unix2dos(attr->ia_atime.tv_sec,
+				  &dummy, &info.lastAccessDate);
+	}
+	if (info_mask != 0) {
+		result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
+				      inode, info_mask, &info);
+		if (result != 0) {
+			if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
+				/* NetWare seems not to allow this. I
+				   do not know why. So, just tell the
+				   user everything went fine. This is
+				   a terrible hack, but I do not know
+				   how to do this correctly. */
+				result = 0;
+			} else
+				goto out;
+		}
+#ifdef CONFIG_NCPFS_STRONG		
+		if ((!result) && (info_mask & DM_ATTRIBUTES))
+			NCP_FINFO(inode)->nwattr = info.attributes;
+#endif
+	}
+	if (result)
+		goto out;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+out:
+	if (result > 0)
+		result = -EACCES;
+	return result;
+}
+
+static struct dentry *ncp_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, ncp_fill_super);
+}
+
+static struct file_system_type ncp_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ncpfs",
+	.mount		= ncp_mount,
+	.kill_sb	= kill_anon_super,
+	.fs_flags	= FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("ncpfs");
+
+static int __init init_ncp_fs(void)
+{
+	int err;
+	ncp_dbg(1, "called\n");
+
+	err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&ncp_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	return err;
+}
+
+static void __exit exit_ncp_fs(void)
+{
+	ncp_dbg(1, "called\n");
+	unregister_filesystem(&ncp_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_ncp_fs)
+module_exit(exit_ncp_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/ncpfs/ioctl.c b/kernel/fs/ncpfs/ioctl.c
new file mode 100644
index 000000000..79b113048
--- /dev/null
+++ b/kernel/fs/ncpfs/ioctl.c
@@ -0,0 +1,919 @@
+/*
+ *  ioctl.c
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *  Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache
+ *  Modified 1998, 1999 Wolfram Pienkoss for NLS
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/highuid.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+
+#include <asm/uaccess.h>
+
+#include "ncp_fs.h"
+
+/* maximum limit for ncp_objectname_ioctl */
+#define NCP_OBJECT_NAME_MAX_LEN	4096
+/* maximum limit for ncp_privatedata_ioctl */
+#define NCP_PRIVATE_DATA_MAX_LEN 8192
+/* maximum negotiable packet size */
+#define NCP_PACKET_SIZE_INTERNAL 65536
+
+static int
+ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
+		struct ncp_fs_info __user *arg)
+{
+	struct ncp_fs_info info;
+
+	if (copy_from_user(&info, arg, sizeof(info)))
+		return -EFAULT;
+
+	if (info.version != NCP_GET_FS_INFO_VERSION) {
+		ncp_dbg(1, "info.version invalid: %d\n", info.version);
+		return -EINVAL;
+	}
+	/* TODO: info.addr = server->m.serv_addr; */
+	SET_UID(info.mounted_uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid));
+	info.connection		= server->connection;
+	info.buffer_size	= server->buffer_size;
+	info.volume_number	= NCP_FINFO(inode)->volNumber;
+	info.directory_id	= NCP_FINFO(inode)->DosDirNum;
+
+	if (copy_to_user(arg, &info, sizeof(info)))
+		return -EFAULT;
+	return 0;
+}
+
+static int
+ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
+		   struct ncp_fs_info_v2 __user * arg)
+{
+	struct ncp_fs_info_v2 info2;
+
+	if (copy_from_user(&info2, arg, sizeof(info2)))
+		return -EFAULT;
+
+	if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
+		ncp_dbg(1, "info.version invalid: %d\n", info2.version);
+		return -EINVAL;
+	}
+	info2.mounted_uid   = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
+	info2.connection    = server->connection;
+	info2.buffer_size   = server->buffer_size;
+	info2.volume_number = NCP_FINFO(inode)->volNumber;
+	info2.directory_id  = NCP_FINFO(inode)->DosDirNum;
+	info2.dummy1 = info2.dummy2 = info2.dummy3 = 0;
+
+	if (copy_to_user(arg, &info2, sizeof(info2)))
+		return -EFAULT;
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_ncp_objectname_ioctl
+{
+	s32		auth_type;
+	u32		object_name_len;
+	compat_caddr_t	object_name;	/* a userspace data, in most cases user name */
+};
+
+struct compat_ncp_fs_info_v2 {
+	s32 version;
+	u32 mounted_uid;
+	u32 connection;
+	u32 buffer_size;
+
+	u32 volume_number;
+	u32 directory_id;
+
+	u32 dummy1;
+	u32 dummy2;
+	u32 dummy3;
+};
+
+struct compat_ncp_ioctl_request {
+	u32 function;
+	u32 size;
+	compat_caddr_t data;
+};
+
+struct compat_ncp_privatedata_ioctl
+{
+	u32		len;
+	compat_caddr_t	data;		/* ~1000 for NDS */
+};
+
+#define NCP_IOC_GET_FS_INFO_V2_32	_IOWR('n', 4, struct compat_ncp_fs_info_v2)
+#define NCP_IOC_NCPREQUEST_32		_IOR('n', 1, struct compat_ncp_ioctl_request)
+#define NCP_IOC_GETOBJECTNAME_32	_IOWR('n', 9, struct compat_ncp_objectname_ioctl)
+#define NCP_IOC_SETOBJECTNAME_32	_IOR('n', 9, struct compat_ncp_objectname_ioctl)
+#define NCP_IOC_GETPRIVATEDATA_32	_IOWR('n', 10, struct compat_ncp_privatedata_ioctl)
+#define NCP_IOC_SETPRIVATEDATA_32	_IOR('n', 10, struct compat_ncp_privatedata_ioctl)
+
+static int
+ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
+		   struct compat_ncp_fs_info_v2 __user * arg)
+{
+	struct compat_ncp_fs_info_v2 info2;
+
+	if (copy_from_user(&info2, arg, sizeof(info2)))
+		return -EFAULT;
+
+	if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
+		ncp_dbg(1, "info.version invalid: %d\n", info2.version);
+		return -EINVAL;
+	}
+	info2.mounted_uid   = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
+	info2.connection    = server->connection;
+	info2.buffer_size   = server->buffer_size;
+	info2.volume_number = NCP_FINFO(inode)->volNumber;
+	info2.directory_id  = NCP_FINFO(inode)->DosDirNum;
+	info2.dummy1 = info2.dummy2 = info2.dummy3 = 0;
+
+	if (copy_to_user(arg, &info2, sizeof(info2)))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+#define NCP_IOC_GETMOUNTUID16		_IOW('n', 2, u16)
+#define NCP_IOC_GETMOUNTUID32		_IOW('n', 2, u32)
+#define NCP_IOC_GETMOUNTUID64		_IOW('n', 2, u64)
+
+#ifdef CONFIG_NCPFS_NLS
+/* Here we are select the iocharset and the codepage for NLS.
+ * Thanks Petr Vandrovec for idea and many hints.
+ */
+static int
+ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
+{
+	struct ncp_nls_ioctl user;
+	struct nls_table *codepage;
+	struct nls_table *iocharset;
+	struct nls_table *oldset_io;
+	struct nls_table *oldset_cp;
+	int utf8;
+	int err;
+
+	if (copy_from_user(&user, arg, sizeof(user)))
+		return -EFAULT;
+
+	codepage = NULL;
+	user.codepage[NCP_IOCSNAME_LEN] = 0;
+	if (!user.codepage[0] || !strcmp(user.codepage, "default"))
+		codepage = load_nls_default();
+	else {
+		codepage = load_nls(user.codepage);
+		if (!codepage) {
+			return -EBADRQC;
+		}
+	}
+
+	iocharset = NULL;
+	user.iocharset[NCP_IOCSNAME_LEN] = 0;
+	if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
+		iocharset = load_nls_default();
+		utf8 = 0;
+	} else if (!strcmp(user.iocharset, "utf8")) {
+		iocharset = load_nls_default();
+		utf8 = 1;
+	} else {
+		iocharset = load_nls(user.iocharset);
+		if (!iocharset) {
+			unload_nls(codepage);
+			return -EBADRQC;
+		}
+		utf8 = 0;
+	}
+
+	mutex_lock(&server->root_setup_lock);
+	if (server->root_setuped) {
+		oldset_cp = codepage;
+		oldset_io = iocharset;
+		err = -EBUSY;
+	} else {
+		if (utf8)
+			NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		else
+			NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		oldset_cp = server->nls_vol;
+		server->nls_vol = codepage;
+		oldset_io = server->nls_io;
+		server->nls_io = iocharset;
+		err = 0;
+	}
+	mutex_unlock(&server->root_setup_lock);
+	unload_nls(oldset_cp);
+	unload_nls(oldset_io);
+
+	return err;
+}
+
+static int
+ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
+{
+	struct ncp_nls_ioctl user;
+	int len;
+
+	memset(&user, 0, sizeof(user));
+	mutex_lock(&server->root_setup_lock);
+	if (server->nls_vol && server->nls_vol->charset) {
+		len = strlen(server->nls_vol->charset);
+		if (len > NCP_IOCSNAME_LEN)
+			len = NCP_IOCSNAME_LEN;
+		strncpy(user.codepage, server->nls_vol->charset, len);
+		user.codepage[len] = 0;
+	}
+
+	if (NCP_IS_FLAG(server, NCP_FLAG_UTF8))
+		strcpy(user.iocharset, "utf8");
+	else if (server->nls_io && server->nls_io->charset) {
+		len = strlen(server->nls_io->charset);
+		if (len > NCP_IOCSNAME_LEN)
+			len = NCP_IOCSNAME_LEN;
+		strncpy(user.iocharset,	server->nls_io->charset, len);
+		user.iocharset[len] = 0;
+	}
+	mutex_unlock(&server->root_setup_lock);
+
+	if (copy_to_user(arg, &user, sizeof(user)))
+		return -EFAULT;
+	return 0;
+}
+#endif /* CONFIG_NCPFS_NLS */
+
+static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
+{
+	struct ncp_server *server = NCP_SERVER(inode);
+	int result;
+	struct ncp_ioctl_request request;
+	char* bouncebuffer;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_NCPREQUEST_32:
+#endif
+	case NCP_IOC_NCPREQUEST:
+#ifdef CONFIG_COMPAT
+		if (cmd == NCP_IOC_NCPREQUEST_32) {
+			struct compat_ncp_ioctl_request request32;
+			if (copy_from_user(&request32, argp, sizeof(request32)))
+				return -EFAULT;
+			request.function = request32.function;
+			request.size = request32.size;
+			request.data = compat_ptr(request32.data);
+		} else
+#endif
+		if (copy_from_user(&request, argp, sizeof(request)))
+			return -EFAULT;
+
+		if ((request.function > 255)
+		    || (request.size >
+		  NCP_PACKET_SIZE - sizeof(struct ncp_request_header))) {
+			return -EINVAL;
+		}
+		bouncebuffer = vmalloc(NCP_PACKET_SIZE_INTERNAL);
+		if (!bouncebuffer)
+			return -ENOMEM;
+		if (copy_from_user(bouncebuffer, request.data, request.size)) {
+			vfree(bouncebuffer);
+			return -EFAULT;
+		}
+		ncp_lock_server(server);
+
+		/* FIXME: We hack around in the server's structures
+		   here to be able to use ncp_request */
+
+		server->has_subfunction = 0;
+		server->current_size = request.size;
+		memcpy(server->packet, bouncebuffer, request.size);
+
+		result = ncp_request2(server, request.function,
+			bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
+		if (result < 0)
+			result = -EIO;
+		else
+			result = server->reply_size;
+		ncp_unlock_server(server);
+		ncp_dbg(1, "copy %d bytes\n", result);
+		if (result >= 0)
+			if (copy_to_user(request.data, bouncebuffer, result))
+				result = -EFAULT;
+		vfree(bouncebuffer);
+		return result;
+
+	case NCP_IOC_CONN_LOGGED_IN:
+
+		if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
+			return -EINVAL;
+		mutex_lock(&server->root_setup_lock);
+		if (server->root_setuped)
+			result = -EBUSY;
+		else {
+			result = ncp_conn_logged_in(inode->i_sb);
+			if (result == 0)
+				server->root_setuped = 1;
+		}
+		mutex_unlock(&server->root_setup_lock);
+		return result;
+
+	case NCP_IOC_GET_FS_INFO:
+		return ncp_get_fs_info(server, inode, argp);
+
+	case NCP_IOC_GET_FS_INFO_V2:
+		return ncp_get_fs_info_v2(server, inode, argp);
+
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_GET_FS_INFO_V2_32:
+		return ncp_get_compat_fs_info_v2(server, inode, argp);
+#endif
+	/* we have too many combinations of CONFIG_COMPAT,
+	 * CONFIG_64BIT and CONFIG_UID16, so just handle
+	 * any of the possible ioctls */
+	case NCP_IOC_GETMOUNTUID16:
+		{
+			u16 uid;
+
+			SET_UID(uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid));
+			if (put_user(uid, (u16 __user *)argp))
+				return -EFAULT;
+			return 0;
+		}
+	case NCP_IOC_GETMOUNTUID32:
+	{
+		uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
+		if (put_user(uid, (u32 __user *)argp))
+			return -EFAULT;
+		return 0;
+	}
+	case NCP_IOC_GETMOUNTUID64:
+	{
+		uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
+		if (put_user(uid, (u64 __user *)argp))
+			return -EFAULT;
+		return 0;
+	}
+	case NCP_IOC_GETROOT:
+		{
+			struct ncp_setroot_ioctl sr;
+
+			result = -EACCES;
+			mutex_lock(&server->root_setup_lock);
+			if (server->m.mounted_vol[0]) {
+				struct dentry* dentry = inode->i_sb->s_root;
+
+				if (dentry) {
+					struct inode* s_inode = d_inode(dentry);
+
+					if (s_inode) {
+						sr.volNumber = NCP_FINFO(s_inode)->volNumber;
+						sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
+						sr.namespace = server->name_space[sr.volNumber];
+						result = 0;
+					} else
+						ncp_dbg(1, "d_inode(s_root)==NULL\n");
+				} else
+					ncp_dbg(1, "s_root==NULL\n");
+			} else {
+				sr.volNumber = -1;
+				sr.namespace = 0;
+				sr.dirEntNum = 0;
+				result = 0;
+			}
+			mutex_unlock(&server->root_setup_lock);
+			if (!result && copy_to_user(argp, &sr, sizeof(sr)))
+				result = -EFAULT;
+			return result;
+		}
+
+	case NCP_IOC_SETROOT:
+		{
+			struct ncp_setroot_ioctl sr;
+			__u32 vnum;
+			__le32 de;
+			__le32 dosde;
+			struct dentry* dentry;
+
+			if (copy_from_user(&sr, argp, sizeof(sr)))
+				return -EFAULT;
+			mutex_lock(&server->root_setup_lock);
+			if (server->root_setuped)
+				result = -EBUSY;
+			else {
+				if (sr.volNumber < 0) {
+					server->m.mounted_vol[0] = 0;
+					vnum = NCP_NUMBER_OF_VOLUMES;
+					de = 0;
+					dosde = 0;
+					result = 0;
+				} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
+					result = -EINVAL;
+				} else if (ncp_mount_subdir(server, sr.volNumber,
+							sr.namespace, sr.dirEntNum,
+							&vnum, &de, &dosde)) {
+					result = -ENOENT;
+				} else
+					result = 0;
+
+				if (result == 0) {
+					dentry = inode->i_sb->s_root;
+					if (dentry) {
+						struct inode* s_inode = d_inode(dentry);
+
+						if (s_inode) {
+							NCP_FINFO(s_inode)->volNumber = vnum;
+							NCP_FINFO(s_inode)->dirEntNum = de;
+							NCP_FINFO(s_inode)->DosDirNum = dosde;
+							server->root_setuped = 1;
+						} else {
+							ncp_dbg(1, "d_inode(s_root)==NULL\n");
+							result = -EIO;
+						}
+					} else {
+						ncp_dbg(1, "s_root==NULL\n");
+						result = -EIO;
+					}
+				}
+			}
+			mutex_unlock(&server->root_setup_lock);
+
+			return result;
+		}
+
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
+	case NCP_IOC_SIGN_INIT:
+		{
+			struct ncp_sign_init sign;
+
+			if (argp)
+				if (copy_from_user(&sign, argp, sizeof(sign)))
+					return -EFAULT;
+			ncp_lock_server(server);
+			mutex_lock(&server->rcv.creq_mutex);
+			if (argp) {
+				if (server->sign_wanted) {
+					memcpy(server->sign_root,sign.sign_root,8);
+					memcpy(server->sign_last,sign.sign_last,16);
+					server->sign_active = 1;
+				}
+				/* ignore when signatures not wanted */
+			} else {
+				server->sign_active = 0;
+			}
+			mutex_unlock(&server->rcv.creq_mutex);
+			ncp_unlock_server(server);
+			return 0;
+		}
+
+        case NCP_IOC_SIGN_WANTED:
+		{
+			int state;
+
+			ncp_lock_server(server);
+			state = server->sign_wanted;
+			ncp_unlock_server(server);
+			if (put_user(state, (int __user *)argp))
+				return -EFAULT;
+			return 0;
+		}
+
+	case NCP_IOC_SET_SIGN_WANTED:
+		{
+			int newstate;
+
+			/* get only low 8 bits... */
+			if (get_user(newstate, (unsigned char __user *)argp))
+				return -EFAULT;
+			result = 0;
+			ncp_lock_server(server);
+			if (server->sign_active) {
+				/* cannot turn signatures OFF when active */
+				if (!newstate)
+					result = -EINVAL;
+			} else {
+				server->sign_wanted = newstate != 0;
+			}
+			ncp_unlock_server(server);
+			return result;
+		}
+
+#endif /* CONFIG_NCPFS_PACKET_SIGNING */
+
+#ifdef CONFIG_NCPFS_IOCTL_LOCKING
+	case NCP_IOC_LOCKUNLOCK:
+		{
+			struct ncp_lock_ioctl	 rqdata;
+
+			if (copy_from_user(&rqdata, argp, sizeof(rqdata)))
+				return -EFAULT;
+			if (rqdata.origin != 0)
+				return -EINVAL;
+			/* check for cmd */
+			switch (rqdata.cmd) {
+				case NCP_LOCK_EX:
+				case NCP_LOCK_SH:
+						if (rqdata.timeout == 0)
+							rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT;
+						else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT)
+							rqdata.timeout = NCP_LOCK_MAX_TIMEOUT;
+						break;
+				case NCP_LOCK_LOG:
+						rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT;	/* has no effect */
+				case NCP_LOCK_CLEAR:
+						break;
+				default:
+						return -EINVAL;
+			}
+			/* locking needs both read and write access */
+			if ((result = ncp_make_open(inode, O_RDWR)) != 0)
+			{
+				return result;
+			}
+			result = -EISDIR;
+			if (!S_ISREG(inode->i_mode))
+				goto outrel;
+			if (rqdata.cmd == NCP_LOCK_CLEAR)
+			{
+				result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
+							NCP_FINFO(inode)->file_handle,
+							rqdata.offset,
+							rqdata.length);
+				if (result > 0) result = 0;	/* no such lock */
+			}
+			else
+			{
+				int lockcmd;
+
+				switch (rqdata.cmd)
+				{
+					case NCP_LOCK_EX:  lockcmd=1; break;
+					case NCP_LOCK_SH:  lockcmd=3; break;
+					default:	   lockcmd=0; break;
+				}
+				result = ncp_LogPhysicalRecord(NCP_SERVER(inode),
+							NCP_FINFO(inode)->file_handle,
+							lockcmd,
+							rqdata.offset,
+							rqdata.length,
+							rqdata.timeout);
+				if (result > 0) result = -EAGAIN;
+			}
+outrel:
+			ncp_inode_close(inode);
+			return result;
+		}
+#endif	/* CONFIG_NCPFS_IOCTL_LOCKING */
+
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_GETOBJECTNAME_32:
+		{
+			struct compat_ncp_objectname_ioctl user;
+			size_t outl;
+
+			if (copy_from_user(&user, argp, sizeof(user)))
+				return -EFAULT;
+			down_read(&server->auth_rwsem);
+			user.auth_type = server->auth.auth_type;
+			outl = user.object_name_len;
+			user.object_name_len = server->auth.object_name_len;
+			if (outl > user.object_name_len)
+				outl = user.object_name_len;
+			result = 0;
+			if (outl) {
+				if (copy_to_user(compat_ptr(user.object_name),
+						 server->auth.object_name,
+						 outl))
+					result = -EFAULT;
+			}
+			up_read(&server->auth_rwsem);
+			if (!result && copy_to_user(argp, &user, sizeof(user)))
+				result = -EFAULT;
+			return result;
+		}
+#endif
+
+	case NCP_IOC_GETOBJECTNAME:
+		{
+			struct ncp_objectname_ioctl user;
+			size_t outl;
+
+			if (copy_from_user(&user, argp, sizeof(user)))
+				return -EFAULT;
+			down_read(&server->auth_rwsem);
+			user.auth_type = server->auth.auth_type;
+			outl = user.object_name_len;
+			user.object_name_len = server->auth.object_name_len;
+			if (outl > user.object_name_len)
+				outl = user.object_name_len;
+			result = 0;
+			if (outl) {
+				if (copy_to_user(user.object_name,
+						 server->auth.object_name,
+						 outl))
+					result = -EFAULT;
+			}
+			up_read(&server->auth_rwsem);
+			if (!result && copy_to_user(argp, &user, sizeof(user)))
+				result = -EFAULT;
+			return result;
+		}
+
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_SETOBJECTNAME_32:
+#endif
+	case NCP_IOC_SETOBJECTNAME:
+		{
+			struct ncp_objectname_ioctl user;
+			void* newname;
+			void* oldname;
+			size_t oldnamelen;
+			void* oldprivate;
+			size_t oldprivatelen;
+
+#ifdef CONFIG_COMPAT
+			if (cmd == NCP_IOC_SETOBJECTNAME_32) {
+				struct compat_ncp_objectname_ioctl user32;
+				if (copy_from_user(&user32, argp, sizeof(user32)))
+					return -EFAULT;
+				user.auth_type = user32.auth_type;
+				user.object_name_len = user32.object_name_len;
+				user.object_name = compat_ptr(user32.object_name);
+			} else
+#endif
+			if (copy_from_user(&user, argp, sizeof(user)))
+				return -EFAULT;
+
+			if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
+				return -ENOMEM;
+			if (user.object_name_len) {
+				newname = memdup_user(user.object_name,
+						      user.object_name_len);
+				if (IS_ERR(newname))
+					return PTR_ERR(newname);
+			} else {
+				newname = NULL;
+			}
+			down_write(&server->auth_rwsem);
+			oldname = server->auth.object_name;
+			oldnamelen = server->auth.object_name_len;
+			oldprivate = server->priv.data;
+			oldprivatelen = server->priv.len;
+			server->auth.auth_type = user.auth_type;
+			server->auth.object_name_len = user.object_name_len;
+			server->auth.object_name = newname;
+			server->priv.len = 0;
+			server->priv.data = NULL;
+			up_write(&server->auth_rwsem);
+			kfree(oldprivate);
+			kfree(oldname);
+			return 0;
+		}
+
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_GETPRIVATEDATA_32:
+#endif
+	case NCP_IOC_GETPRIVATEDATA:
+		{
+			struct ncp_privatedata_ioctl user;
+			size_t outl;
+
+#ifdef CONFIG_COMPAT
+			if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
+				struct compat_ncp_privatedata_ioctl user32;
+				if (copy_from_user(&user32, argp, sizeof(user32)))
+					return -EFAULT;
+				user.len = user32.len;
+				user.data = compat_ptr(user32.data);
+			} else
+#endif
+			if (copy_from_user(&user, argp, sizeof(user)))
+				return -EFAULT;
+
+			down_read(&server->auth_rwsem);
+			outl = user.len;
+			user.len = server->priv.len;
+			if (outl > user.len) outl = user.len;
+			result = 0;
+			if (outl) {
+				if (copy_to_user(user.data,
+						 server->priv.data,
+						 outl))
+					result = -EFAULT;
+			}
+			up_read(&server->auth_rwsem);
+			if (result)
+				return result;
+#ifdef CONFIG_COMPAT
+			if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
+				struct compat_ncp_privatedata_ioctl user32;
+				user32.len = user.len;
+				user32.data = (unsigned long) user.data;
+				if (copy_to_user(argp, &user32, sizeof(user32)))
+					return -EFAULT;
+			} else
+#endif
+			if (copy_to_user(argp, &user, sizeof(user)))
+				return -EFAULT;
+
+			return 0;
+		}
+
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_SETPRIVATEDATA_32:
+#endif
+	case NCP_IOC_SETPRIVATEDATA:
+		{
+			struct ncp_privatedata_ioctl user;
+			void* new;
+			void* old;
+			size_t oldlen;
+
+#ifdef CONFIG_COMPAT
+			if (cmd == NCP_IOC_SETPRIVATEDATA_32) {
+				struct compat_ncp_privatedata_ioctl user32;
+				if (copy_from_user(&user32, argp, sizeof(user32)))
+					return -EFAULT;
+				user.len = user32.len;
+				user.data = compat_ptr(user32.data);
+			} else
+#endif
+			if (copy_from_user(&user, argp, sizeof(user)))
+				return -EFAULT;
+
+			if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
+				return -ENOMEM;
+			if (user.len) {
+				new = memdup_user(user.data, user.len);
+				if (IS_ERR(new))
+					return PTR_ERR(new);
+			} else {
+				new = NULL;
+			}
+			down_write(&server->auth_rwsem);
+			old = server->priv.data;
+			oldlen = server->priv.len;
+			server->priv.len = user.len;
+			server->priv.data = new;
+			up_write(&server->auth_rwsem);
+			kfree(old);
+			return 0;
+		}
+
+#ifdef CONFIG_NCPFS_NLS
+	case NCP_IOC_SETCHARSETS:
+		return ncp_set_charsets(server, argp);
+
+	case NCP_IOC_GETCHARSETS:
+		return ncp_get_charsets(server, argp);
+
+#endif /* CONFIG_NCPFS_NLS */
+
+	case NCP_IOC_SETDENTRYTTL:
+		{
+			u_int32_t user;
+
+			if (copy_from_user(&user, argp, sizeof(user)))
+				return -EFAULT;
+			/* 20 secs at most... */
+			if (user > 20000)
+				return -EINVAL;
+			user = (user * HZ) / 1000;
+			atomic_set(&server->dentry_ttl, user);
+			return 0;
+		}
+
+	case NCP_IOC_GETDENTRYTTL:
+		{
+			u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
+			if (copy_to_user(argp, &user, sizeof(user)))
+				return -EFAULT;
+			return 0;
+		}
+
+	}
+	return -EINVAL;
+}
+
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct ncp_server *server = NCP_SERVER(inode);
+	kuid_t uid = current_uid();
+	int need_drop_write = 0;
+	long ret;
+
+	switch (cmd) {
+	case NCP_IOC_SETCHARSETS:
+	case NCP_IOC_CONN_LOGGED_IN:
+	case NCP_IOC_SETROOT:
+		if (!capable(CAP_SYS_ADMIN)) {
+			ret = -EPERM;
+			goto out;
+		}
+		break;
+	}
+	if (!uid_eq(server->m.mounted_uid, uid)) {
+		switch (cmd) {
+		/*
+		 * Only mount owner can issue these ioctls.  Information
+		 * necessary to authenticate to other NDS servers are
+		 * stored here.
+		 */
+		case NCP_IOC_GETOBJECTNAME:
+		case NCP_IOC_SETOBJECTNAME:
+		case NCP_IOC_GETPRIVATEDATA:
+		case NCP_IOC_SETPRIVATEDATA:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GETOBJECTNAME_32:
+		case NCP_IOC_SETOBJECTNAME_32:
+		case NCP_IOC_GETPRIVATEDATA_32:
+		case NCP_IOC_SETPRIVATEDATA_32:
+#endif
+			ret = -EACCES;
+			goto out;
+		/*
+		 * These require write access on the inode if user id
+		 * does not match.  Note that they do not write to the
+		 * file...  But old code did mnt_want_write, so I keep
+		 * it as is.  Of course not for mountpoint owner, as
+		 * that breaks read-only mounts altogether as ncpmount
+		 * needs working NCP_IOC_NCPREQUEST and
+		 * NCP_IOC_GET_FS_INFO.  Some of these codes (setdentryttl,
+		 * signinit, setsignwanted) should be probably restricted
+		 * to owner only, or even more to CAP_SYS_ADMIN).
+		 */
+		case NCP_IOC_GET_FS_INFO:
+		case NCP_IOC_GET_FS_INFO_V2:
+		case NCP_IOC_NCPREQUEST:
+		case NCP_IOC_SETDENTRYTTL:
+		case NCP_IOC_SIGN_INIT:
+		case NCP_IOC_LOCKUNLOCK:
+		case NCP_IOC_SET_SIGN_WANTED:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GET_FS_INFO_V2_32:
+		case NCP_IOC_NCPREQUEST_32:
+#endif
+			ret = mnt_want_write_file(filp);
+			if (ret)
+				goto out;
+			need_drop_write = 1;
+			ret = inode_permission(inode, MAY_WRITE);
+			if (ret)
+				goto outDropWrite;
+			break;
+		/*
+		 * Read access required.
+		 */
+		case NCP_IOC_GETMOUNTUID16:
+		case NCP_IOC_GETMOUNTUID32:
+		case NCP_IOC_GETMOUNTUID64:
+		case NCP_IOC_GETROOT:
+		case NCP_IOC_SIGN_WANTED:
+			ret = inode_permission(inode, MAY_READ);
+			if (ret)
+				goto out;
+			break;
+		/*
+		 * Anybody can read these.
+		 */
+		case NCP_IOC_GETCHARSETS:
+		case NCP_IOC_GETDENTRYTTL:
+		default:
+		/* Three codes below are protected by CAP_SYS_ADMIN above. */
+		case NCP_IOC_SETCHARSETS:
+		case NCP_IOC_CONN_LOGGED_IN:
+		case NCP_IOC_SETROOT:
+			break;
+		}
+	}
+	ret = __ncp_ioctl(inode, cmd, arg);
+outDropWrite:
+	if (need_drop_write)
+		mnt_drop_write_file(filp);
+out:
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long ret;
+
+	arg = (unsigned long) compat_ptr(arg);
+	ret = ncp_ioctl(file, cmd, arg);
+	return ret;
+}
+#endif
diff --git a/kernel/fs/ncpfs/mmap.c b/kernel/fs/ncpfs/mmap.c
new file mode 100644
index 000000000..33b873b25
--- /dev/null
+++ b/kernel/fs/ncpfs/mmap.c
@@ -0,0 +1,125 @@
+/*
+ *  mmap.c
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *  Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache
+ *
+ */
+
+#include <linux/stat.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/errno.h>
+#include <linux/mman.h>
+#include <linux/string.h>
+#include <linux/fcntl.h>
+#include <linux/memcontrol.h>
+
+#include <asm/uaccess.h>
+
+#include "ncp_fs.h"
+
+/*
+ * Fill in the supplied page for mmap
+ * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
+ * page?
+ */
+static int ncp_file_mmap_fault(struct vm_area_struct *area,
+					struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(area->vm_file);
+	char *pg_addr;
+	unsigned int already_read;
+	unsigned int count;
+	int bufsize;
+	int pos; /* XXX: loff_t ? */
+
+	/*
+	 * ncpfs has nothing against high pages as long
+	 * as recvmsg and memset works on it
+	 */
+	vmf->page = alloc_page(GFP_HIGHUSER);
+	if (!vmf->page)
+		return VM_FAULT_OOM;
+	pg_addr = kmap(vmf->page);
+	pos = vmf->pgoff << PAGE_SHIFT;
+
+	count = PAGE_SIZE;
+	/* what we can read in one go */
+	bufsize = NCP_SERVER(inode)->buffer_size;
+
+	already_read = 0;
+	if (ncp_make_open(inode, O_RDONLY) >= 0) {
+		while (already_read < count) {
+			int read_this_time;
+			int to_read;
+
+			to_read = bufsize - (pos % bufsize);
+
+			to_read = min_t(unsigned int, to_read, count - already_read);
+
+			if (ncp_read_kernel(NCP_SERVER(inode),
+				     NCP_FINFO(inode)->file_handle,
+				     pos, to_read,
+				     pg_addr + already_read,
+				     &read_this_time) != 0) {
+				read_this_time = 0;
+			}
+			pos += read_this_time;
+			already_read += read_this_time;
+
+			if (read_this_time < to_read) {
+				break;
+			}
+		}
+		ncp_inode_close(inode);
+
+	}
+
+	if (already_read < PAGE_SIZE)
+		memset(pg_addr + already_read, 0, PAGE_SIZE - already_read);
+	flush_dcache_page(vmf->page);
+	kunmap(vmf->page);
+
+	/*
+	 * If I understand ncp_read_kernel() properly, the above always
+	 * fetches from the network, here the analogue of disk.
+	 * -- nyc
+	 */
+	count_vm_event(PGMAJFAULT);
+	mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
+	return VM_FAULT_MAJOR;
+}
+
+static const struct vm_operations_struct ncp_file_mmap =
+{
+	.fault = ncp_file_mmap_fault,
+};
+
+
+/* This is used for a general mmap of a ncp file */
+int ncp_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *inode = file_inode(file);
+	
+	ncp_dbg(1, "called\n");
+
+	if (!ncp_conn_valid(NCP_SERVER(inode)))
+		return -EIO;
+
+	/* only PAGE_COW or read-only supported now */
+	if (vma->vm_flags & VM_SHARED)
+		return -EINVAL;
+	/* we do not support files bigger than 4GB... We eventually 
+	   supports just 4GB... */
+	if (vma_pages(vma) + vma->vm_pgoff
+	   > (1U << (32 - PAGE_SHIFT)))
+		return -EFBIG;
+
+	vma->vm_ops = &ncp_file_mmap;
+	file_accessed(file);
+	return 0;
+}
diff --git a/kernel/fs/ncpfs/ncp_fs.h b/kernel/fs/ncpfs/ncp_fs.h
new file mode 100644
index 000000000..b9f69e1b1
--- /dev/null
+++ b/kernel/fs/ncpfs/ncp_fs.h
@@ -0,0 +1,100 @@
+#include <linux/ncp_fs.h>
+#include "ncp_fs_i.h"
+#include "ncp_fs_sb.h"
+
+#undef NCPFS_PARANOIA
+#ifdef NCPFS_PARANOIA
+#define ncp_vdbg(fmt, ...)					\
+	pr_debug(fmt, ##__VA_ARGS__)
+#else
+#define ncp_vdbg(fmt, ...)					\
+do {								\
+	if (0)							\
+		pr_debug(fmt, ##__VA_ARGS__);			\
+} while (0)
+#endif
+
+#ifndef DEBUG_NCP
+#define DEBUG_NCP 0
+#endif
+
+#if DEBUG_NCP > 0 && !defined(DEBUG)
+#define DEBUG
+#endif
+
+#define ncp_dbg(level, fmt, ...)				\
+do {								\
+	if (level <= DEBUG_NCP)					\
+		pr_debug(fmt, ##__VA_ARGS__);			\
+} while (0)
+
+#define NCP_MAX_RPC_TIMEOUT (6*HZ)
+
+
+struct ncp_entry_info {
+	struct nw_info_struct	i;
+	ino_t			ino;
+	int			opened;
+	int			access;
+	unsigned int		volume;
+	__u8			file_handle[6];
+};
+
+static inline struct ncp_server *NCP_SBP(const struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+#define NCP_SERVER(inode)	NCP_SBP((inode)->i_sb)
+static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode)
+{
+	return container_of(inode, struct ncp_inode_info, vfs_inode);
+}
+
+/* linux/fs/ncpfs/inode.c */
+int ncp_notify_change(struct dentry *, struct iattr *);
+struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
+void ncp_update_inode(struct inode *, struct ncp_entry_info *);
+void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
+
+/* linux/fs/ncpfs/dir.c */
+extern const struct inode_operations ncp_dir_inode_operations;
+extern const struct file_operations ncp_dir_operations;
+extern const struct dentry_operations ncp_dentry_operations;
+int ncp_conn_logged_in(struct super_block *);
+int ncp_date_dos2unix(__le16 time, __le16 date);
+void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
+
+/* linux/fs/ncpfs/ioctl.c */
+long ncp_ioctl(struct file *, unsigned int, unsigned long);
+long ncp_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* linux/fs/ncpfs/sock.c */
+int ncp_request2(struct ncp_server *server, int function,
+	void* reply, int max_reply_size);
+static inline int ncp_request(struct ncp_server *server, int function) {
+	return ncp_request2(server, function, server->packet, server->packet_size);
+}
+int ncp_connect(struct ncp_server *server);
+int ncp_disconnect(struct ncp_server *server);
+void ncp_lock_server(struct ncp_server *server);
+void ncp_unlock_server(struct ncp_server *server);
+
+/* linux/fs/ncpfs/symlink.c */
+#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
+extern const struct address_space_operations ncp_symlink_aops;
+int ncp_symlink(struct inode*, struct dentry*, const char*);
+#endif
+
+/* linux/fs/ncpfs/file.c */
+extern const struct inode_operations ncp_file_inode_operations;
+extern const struct file_operations ncp_file_operations;
+int ncp_make_open(struct inode *, int);
+
+/* linux/fs/ncpfs/mmap.c */
+int ncp_mmap(struct file *, struct vm_area_struct *);
+
+/* linux/fs/ncpfs/ncplib_kernel.c */
+int ncp_make_closed(struct inode *);
+
+#include "ncplib_kernel.h"
diff --git a/kernel/fs/ncpfs/ncp_fs_i.h b/kernel/fs/ncpfs/ncp_fs_i.h
new file mode 100644
index 000000000..c4794504f
--- /dev/null
+++ b/kernel/fs/ncpfs/ncp_fs_i.h
@@ -0,0 +1,30 @@
+/*
+ *  ncp_fs_i.h
+ *
+ *  Copyright (C) 1995 Volker Lendecke
+ *
+ */
+
+#ifndef _LINUX_NCP_FS_I
+#define _LINUX_NCP_FS_I
+
+/*
+ * This is the ncpfs part of the inode structure. This must contain
+ * all the information we need to work with an inode after creation.
+ */
+struct ncp_inode_info {
+	__le32	dirEntNum;
+	__le32	DosDirNum;
+	__u8	volNumber;
+	__le32	nwattr;
+	struct mutex open_mutex;
+	atomic_t	opened;
+	int	access;
+	int	flags;
+#define NCPI_KLUDGE_SYMLINK	0x0001
+#define NCPI_DIR_CACHE		0x0002
+	__u8	file_handle[6];
+	struct inode vfs_inode;
+};
+
+#endif	/* _LINUX_NCP_FS_I */
diff --git a/kernel/fs/ncpfs/ncp_fs_sb.h b/kernel/fs/ncpfs/ncp_fs_sb.h
new file mode 100644
index 000000000..55e26fd80
--- /dev/null
+++ b/kernel/fs/ncpfs/ncp_fs_sb.h
@@ -0,0 +1,174 @@
+/*
+ *  ncp_fs_sb.h
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *
+ */
+
+#ifndef _NCP_FS_SB
+#define _NCP_FS_SB
+
+#include <linux/types.h>
+#include <linux/ncp_mount.h>
+#include <linux/net.h>
+#include <linux/mutex.h>
+#include <linux/backing-dev.h>
+#include <linux/workqueue.h>
+
+#define NCP_DEFAULT_OPTIONS 0		/* 2 for packet signatures */
+
+struct sock;
+
+struct ncp_mount_data_kernel {
+	unsigned long    flags;		/* NCP_MOUNT_* flags */
+	unsigned int	 int_flags;	/* internal flags */
+#define NCP_IMOUNT_LOGGEDIN_POSSIBLE	0x0001
+	kuid_t		 mounted_uid;	/* Who may umount() this filesystem? */
+	struct pid      *wdog_pid;	/* Who cares for our watchdog packets? */
+	unsigned int     ncp_fd;	/* The socket to the ncp port */
+	unsigned int     time_out;	/* How long should I wait after
+					   sending a NCP request? */
+	unsigned int     retry_count;	/* And how often should I retry? */
+	unsigned char	 mounted_vol[NCP_VOLNAME_LEN + 1];
+	kuid_t		 uid;
+	kgid_t		 gid;
+	umode_t		 file_mode;
+	umode_t		 dir_mode;
+	int		 info_fd;
+};
+
+struct ncp_server {
+	struct rcu_head rcu;
+	struct ncp_mount_data_kernel m;	/* Nearly all of the mount data is of
+					   interest for us later, so we store
+					   it completely. */
+
+	__u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
+
+	struct socket *ncp_sock;/* ncp socket */
+	struct socket *info_sock;
+
+	u8 sequence;
+	u8 task;
+	u16 connection;		/* Remote connection number */
+
+	u8 completion;		/* Status message from server */
+	u8 conn_status;		/* Bit 4 = 1 ==> Server going down, no
+				   requests allowed anymore.
+				   Bit 0 = 1 ==> Server is down. */
+
+	int buffer_size;	/* Negotiated bufsize */
+
+	int reply_size;		/* Size of last reply */
+
+	int packet_size;
+	unsigned char *packet;	/* Here we prepare requests and
+				   receive replies */
+	unsigned char *txbuf;	/* Storage for current request */
+	unsigned char *rxbuf;	/* Storage for reply to current request */
+
+	int lock;		/* To prevent mismatch in protocols. */
+	struct mutex mutex;
+
+	int current_size;	/* for packet preparation */
+	int has_subfunction;
+	int ncp_reply_size;
+
+	int root_setuped;
+	struct mutex root_setup_lock;
+
+	/* info for packet signing */
+	int sign_wanted;	/* 1=Server needs signed packets */
+	int sign_active;	/* 0=don't do signing, 1=do */
+	char sign_root[8];	/* generated from password and encr. key */
+	char sign_last[16];	
+
+	/* Authentication info: NDS or BINDERY, username */
+	struct {
+		int	auth_type;
+		size_t	object_name_len;
+		void*	object_name;
+		int	object_type;
+	} auth;
+	/* Password info */
+	struct {
+		size_t	len;
+		void*	data;
+	} priv;
+	struct rw_semaphore auth_rwsem;
+
+	/* nls info: codepage for volume and charset for I/O */
+	struct nls_table *nls_vol;
+	struct nls_table *nls_io;
+
+	/* maximum age in jiffies */
+	atomic_t dentry_ttl;
+
+	/* miscellaneous */
+	unsigned int flags;
+
+	spinlock_t requests_lock;	/* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
+
+	void (*data_ready)(struct sock* sk);
+	void (*error_report)(struct sock* sk);
+	void (*write_space)(struct sock* sk);	/* STREAM mode only */
+	struct {
+		struct work_struct tq;		/* STREAM/DGRAM: data/error ready */
+		struct ncp_request_reply* creq;	/* STREAM/DGRAM: awaiting reply from this request */
+		struct mutex creq_mutex;	/* DGRAM only: lock accesses to rcv.creq */
+
+		unsigned int state;		/* STREAM only: receiver state */
+		struct {
+			__u32 magic __packed;
+			__u32 len __packed;
+			__u16 type __packed;
+			__u16 p1 __packed;
+			__u16 p2 __packed;
+			__u16 p3 __packed;
+			__u16 type2 __packed;
+		} buf;				/* STREAM only: temporary buffer */
+		unsigned char* ptr;		/* STREAM only: pointer to data */
+		size_t len;			/* STREAM only: length of data to receive */
+	} rcv;
+	struct {
+		struct list_head requests;	/* STREAM only: queued requests */
+		struct work_struct tq;		/* STREAM only: transmitter ready */
+		struct ncp_request_reply* creq;	/* STREAM only: currently transmitted entry */
+	} tx;
+	struct timer_list timeout_tm;		/* DGRAM only: timeout timer */
+	struct work_struct timeout_tq;		/* DGRAM only: associated queue, we run timers from process context */
+	int timeout_last;			/* DGRAM only: current timeout length */
+	int timeout_retries;			/* DGRAM only: retries left */
+	struct {
+		size_t len;
+		__u8 data[128];
+	} unexpected_packet;
+	struct backing_dev_info bdi;
+};
+
+extern void ncp_tcp_rcv_proc(struct work_struct *work);
+extern void ncp_tcp_tx_proc(struct work_struct *work);
+extern void ncpdgram_rcv_proc(struct work_struct *work);
+extern void ncpdgram_timeout_proc(struct work_struct *work);
+extern void ncpdgram_timeout_call(unsigned long server);
+extern void ncp_tcp_data_ready(struct sock* sk);
+extern void ncp_tcp_write_space(struct sock* sk);
+extern void ncp_tcp_error_report(struct sock* sk);
+
+#define NCP_FLAG_UTF8	1
+
+#define NCP_CLR_FLAG(server, flag)	((server)->flags &= ~(flag))
+#define NCP_SET_FLAG(server, flag)	((server)->flags |= (flag))
+#define NCP_IS_FLAG(server, flag)	((server)->flags & (flag))
+
+static inline int ncp_conn_valid(struct ncp_server *server)
+{
+	return ((server->conn_status & 0x11) == 0);
+}
+
+static inline void ncp_invalidate_conn(struct ncp_server *server)
+{
+	server->conn_status |= 0x01;
+}
+
+#endif
diff --git a/kernel/fs/ncpfs/ncplib_kernel.c b/kernel/fs/ncpfs/ncplib_kernel.c
new file mode 100644
index 000000000..88dbbc9fc
--- /dev/null
+++ b/kernel/fs/ncpfs/ncplib_kernel.c
@@ -0,0 +1,1321 @@
+/*
+ *  ncplib_kernel.c
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *  Modified for big endian by J.F. Chadima and David S. Miller
+ *  Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache
+ *  Modified 1999 Wolfram Pienkoss for NLS
+ *  Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "ncp_fs.h"
+
+static inline void assert_server_locked(struct ncp_server *server)
+{
+	if (server->lock == 0) {
+		ncp_dbg(1, "server not locked!\n");
+	}
+}
+
+static void ncp_add_byte(struct ncp_server *server, __u8 x)
+{
+	assert_server_locked(server);
+	*(__u8 *) (&(server->packet[server->current_size])) = x;
+	server->current_size += 1;
+	return;
+}
+
+static void ncp_add_word(struct ncp_server *server, __le16 x)
+{
+	assert_server_locked(server);
+	put_unaligned(x, (__le16 *) (&(server->packet[server->current_size])));
+	server->current_size += 2;
+	return;
+}
+
+static void ncp_add_be16(struct ncp_server *server, __u16 x)
+{
+	assert_server_locked(server);
+	put_unaligned(cpu_to_be16(x), (__be16 *) (&(server->packet[server->current_size])));
+	server->current_size += 2;
+}
+
+static void ncp_add_dword(struct ncp_server *server, __le32 x)
+{
+	assert_server_locked(server);
+	put_unaligned(x, (__le32 *) (&(server->packet[server->current_size])));
+	server->current_size += 4;
+	return;
+}
+
+static void ncp_add_be32(struct ncp_server *server, __u32 x)
+{
+	assert_server_locked(server);
+	put_unaligned(cpu_to_be32(x), (__be32 *)(&(server->packet[server->current_size])));
+	server->current_size += 4;
+}
+
+static inline void ncp_add_dword_lh(struct ncp_server *server, __u32 x) {
+	ncp_add_dword(server, cpu_to_le32(x));
+}
+
+static void ncp_add_mem(struct ncp_server *server, const void *source, int size)
+{
+	assert_server_locked(server);
+	memcpy(&(server->packet[server->current_size]), source, size);
+	server->current_size += size;
+	return;
+}
+
+static void ncp_add_pstring(struct ncp_server *server, const char *s)
+{
+	int len = strlen(s);
+	assert_server_locked(server);
+	if (len > 255) {
+		ncp_dbg(1, "string too long: %s\n", s);
+		len = 255;
+	}
+	ncp_add_byte(server, len);
+	ncp_add_mem(server, s, len);
+	return;
+}
+
+static inline void ncp_init_request(struct ncp_server *server)
+{
+	ncp_lock_server(server);
+
+	server->current_size = sizeof(struct ncp_request_header);
+	server->has_subfunction = 0;
+}
+
+static inline void ncp_init_request_s(struct ncp_server *server, int subfunction)
+{
+	ncp_lock_server(server);
+	
+	server->current_size = sizeof(struct ncp_request_header) + 2;
+	ncp_add_byte(server, subfunction);
+
+	server->has_subfunction = 1;
+}
+
+static inline char *
+ncp_reply_data(struct ncp_server *server, int offset)
+{
+	return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
+}
+
+static inline u8 BVAL(const void *data)
+{
+	return *(const u8 *)data;
+}
+
+static u8 ncp_reply_byte(struct ncp_server *server, int offset)
+{
+	return *(const u8 *)ncp_reply_data(server, offset);
+}
+
+static inline u16 WVAL_LH(const void *data)
+{
+	return get_unaligned_le16(data);
+}
+
+static u16
+ncp_reply_le16(struct ncp_server *server, int offset)
+{
+	return get_unaligned_le16(ncp_reply_data(server, offset));
+}
+
+static u16
+ncp_reply_be16(struct ncp_server *server, int offset)
+{
+	return get_unaligned_be16(ncp_reply_data(server, offset));
+}
+
+static inline u32 DVAL_LH(const void *data)
+{
+	return get_unaligned_le32(data);
+}
+
+static __le32
+ncp_reply_dword(struct ncp_server *server, int offset)
+{
+	return get_unaligned((__le32 *)ncp_reply_data(server, offset));
+}
+
+static inline __u32 ncp_reply_dword_lh(struct ncp_server* server, int offset) {
+	return le32_to_cpu(ncp_reply_dword(server, offset));
+}
+
+int
+ncp_negotiate_buffersize(struct ncp_server *server, int size, int *target)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_be16(server, size);
+
+	if ((result = ncp_request(server, 33)) != 0) {
+		ncp_unlock_server(server);
+		return result;
+	}
+	*target = min_t(unsigned int, ncp_reply_be16(server, 0), size);
+
+	ncp_unlock_server(server);
+	return 0;
+}
+
+
+/* options: 
+ *	bit 0	ipx checksum
+ *	bit 1	packet signing
+ */
+int
+ncp_negotiate_size_and_options(struct ncp_server *server, 
+	int size, int options, int *ret_size, int *ret_options) {
+	int result;
+
+	/* there is minimum */
+	if (size < NCP_BLOCK_SIZE) size = NCP_BLOCK_SIZE;
+
+	ncp_init_request(server);
+	ncp_add_be16(server, size);
+	ncp_add_byte(server, options);
+	
+	if ((result = ncp_request(server, 0x61)) != 0)
+	{
+		ncp_unlock_server(server);
+		return result;
+	}
+
+	/* NCP over UDP returns 0 (!!!) */
+	result = ncp_reply_be16(server, 0);
+	if (result >= NCP_BLOCK_SIZE)
+		size = min(result, size);
+	*ret_size = size;
+	*ret_options = ncp_reply_byte(server, 4);
+
+	ncp_unlock_server(server);
+	return 0;
+}
+
+int ncp_get_volume_info_with_number(struct ncp_server* server,
+			     int n, struct ncp_volume_info* target) {
+	int result;
+	int len;
+
+	ncp_init_request_s(server, 44);
+	ncp_add_byte(server, n);
+
+	if ((result = ncp_request(server, 22)) != 0) {
+		goto out;
+	}
+	target->total_blocks = ncp_reply_dword_lh(server, 0);
+	target->free_blocks = ncp_reply_dword_lh(server, 4);
+	target->purgeable_blocks = ncp_reply_dword_lh(server, 8);
+	target->not_yet_purgeable_blocks = ncp_reply_dword_lh(server, 12);
+	target->total_dir_entries = ncp_reply_dword_lh(server, 16);
+	target->available_dir_entries = ncp_reply_dword_lh(server, 20);
+	target->sectors_per_block = ncp_reply_byte(server, 28);
+
+	memset(&(target->volume_name), 0, sizeof(target->volume_name));
+
+	result = -EIO;
+	len = ncp_reply_byte(server, 29);
+	if (len > NCP_VOLNAME_LEN) {
+		ncp_dbg(1, "volume name too long: %d\n", len);
+		goto out;
+	}
+	memcpy(&(target->volume_name), ncp_reply_data(server, 30), len);
+	result = 0;
+out:
+	ncp_unlock_server(server);
+	return result;
+}
+
+int ncp_get_directory_info(struct ncp_server* server, __u8 n, 
+			   struct ncp_volume_info* target) {
+	int result;
+	int len;
+
+	ncp_init_request_s(server, 45);
+	ncp_add_byte(server, n);
+
+	if ((result = ncp_request(server, 22)) != 0) {
+		goto out;
+	}
+	target->total_blocks = ncp_reply_dword_lh(server, 0);
+	target->free_blocks = ncp_reply_dword_lh(server, 4);
+	target->purgeable_blocks = 0;
+	target->not_yet_purgeable_blocks = 0;
+	target->total_dir_entries = ncp_reply_dword_lh(server, 8);
+	target->available_dir_entries = ncp_reply_dword_lh(server, 12);
+	target->sectors_per_block = ncp_reply_byte(server, 20);
+
+	memset(&(target->volume_name), 0, sizeof(target->volume_name));
+
+	result = -EIO;
+	len = ncp_reply_byte(server, 21);
+	if (len > NCP_VOLNAME_LEN) {
+		ncp_dbg(1, "volume name too long: %d\n", len);
+		goto out;
+	}
+	memcpy(&(target->volume_name), ncp_reply_data(server, 22), len);
+	result = 0;
+out:
+	ncp_unlock_server(server);
+	return result;
+}
+
+int
+ncp_close_file(struct ncp_server *server, const char *file_id)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 0);
+	ncp_add_mem(server, file_id, 6);
+
+	result = ncp_request(server, 66);
+	ncp_unlock_server(server);
+	return result;
+}
+
+int
+ncp_make_closed(struct inode *inode)
+{
+	int err;
+
+	err = 0;
+	mutex_lock(&NCP_FINFO(inode)->open_mutex);
+	if (atomic_read(&NCP_FINFO(inode)->opened) == 1) {
+		atomic_set(&NCP_FINFO(inode)->opened, 0);
+		err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle);
+
+		if (!err)
+			ncp_vdbg("volnum=%d, dirent=%u, error=%d\n",
+				 NCP_FINFO(inode)->volNumber,
+				 NCP_FINFO(inode)->dirEntNum, err);
+	}
+	mutex_unlock(&NCP_FINFO(inode)->open_mutex);
+	return err;
+}
+
+static void ncp_add_handle_path(struct ncp_server *server, __u8 vol_num,
+				__le32 dir_base, int have_dir_base, 
+				const char *path)
+{
+	ncp_add_byte(server, vol_num);
+	ncp_add_dword(server, dir_base);
+	if (have_dir_base != 0) {
+		ncp_add_byte(server, 1);	/* dir_base */
+	} else {
+		ncp_add_byte(server, 0xff);	/* no handle */
+	}
+	if (path != NULL) {
+		ncp_add_byte(server, 1);	/* 1 component */
+		ncp_add_pstring(server, path);
+	} else {
+		ncp_add_byte(server, 0);
+	}
+}
+
+int ncp_dirhandle_alloc(struct ncp_server* server, __u8 volnum, __le32 dirent,
+			__u8* dirhandle) {
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 12);		/* subfunction */
+	ncp_add_byte(server, NW_NS_DOS);
+	ncp_add_byte(server, 0);
+	ncp_add_word(server, 0);
+	ncp_add_handle_path(server, volnum, dirent, 1, NULL);
+	if ((result = ncp_request(server, 87)) == 0) {
+		*dirhandle = ncp_reply_byte(server, 0);
+	}
+	ncp_unlock_server(server);
+	return result;
+}
+
+int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
+	int result;
+	
+	ncp_init_request_s(server, 20);
+	ncp_add_byte(server, dirhandle);
+	result = ncp_request(server, 22);
+	ncp_unlock_server(server);
+	return result;
+}
+
+void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
+{
+	const __u8 *name_len;
+	const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
+
+	memcpy(target, structure, info_struct_size);
+	name_len = structure + info_struct_size;
+	target->nameLen = *name_len;
+	memcpy(target->entryName, name_len + 1, *name_len);
+	target->entryName[*name_len] = '\0';
+	target->volNumber = le32_to_cpu(target->volNumber);
+	return;
+}
+
+#ifdef CONFIG_NCPFS_NFS_NS
+static inline void ncp_extract_nfs_info(const unsigned char *structure,
+				 struct nw_nfs_info *target)
+{
+	target->mode = DVAL_LH(structure);
+	target->rdev = DVAL_LH(structure + 8);
+}
+#endif
+
+int ncp_obtain_nfs_info(struct ncp_server *server,
+		        struct nw_info_struct *target)
+
+{
+	int result = 0;
+#ifdef CONFIG_NCPFS_NFS_NS
+	__u32 volnum = target->volNumber;
+
+	if (ncp_is_nfs_extras(server, volnum)) {
+		ncp_init_request(server);
+		ncp_add_byte(server, 19);	/* subfunction */
+		ncp_add_byte(server, server->name_space[volnum]);
+		ncp_add_byte(server, NW_NS_NFS);
+		ncp_add_byte(server, 0);
+		ncp_add_byte(server, volnum);
+		ncp_add_dword(server, target->dirEntNum);
+		/* We must retrieve both nlinks and rdev, otherwise some server versions
+		   report zeroes instead of valid data */
+		ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV);
+
+		if ((result = ncp_request(server, 87)) == 0) {
+			ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs);
+			ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n",
+				target->entryName, target->nfs.mode,
+				target->nfs.rdev);
+		} else {
+			target->nfs.mode = 0;
+			target->nfs.rdev = 0;
+		}
+	        ncp_unlock_server(server);
+
+	} else
+#endif
+	{
+		target->nfs.mode = 0;
+		target->nfs.rdev = 0;
+	}
+	return result;
+}
+
+/*
+ * Returns information for a (one-component) name relative to
+ * the specified directory.
+ */
+int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
+			struct nw_info_struct *target)
+{
+	__u8  volnum = NCP_FINFO(dir)->volNumber;
+	__le32 dirent = NCP_FINFO(dir)->dirEntNum;
+	int result;
+
+	if (target == NULL) {
+		pr_err("%s: invalid call\n", __func__);
+		return -EINVAL;
+	}
+	ncp_init_request(server);
+	ncp_add_byte(server, 6);	/* subfunction */
+	ncp_add_byte(server, server->name_space[volnum]);
+	ncp_add_byte(server, server->name_space[volnum]); /* N.B. twice ?? */
+	ncp_add_word(server, cpu_to_le16(0x8006));	/* get all */
+	ncp_add_dword(server, RIM_ALL);
+	ncp_add_handle_path(server, volnum, dirent, 1, path);
+
+	if ((result = ncp_request(server, 87)) != 0)
+		goto out;
+	ncp_extract_file_info(ncp_reply_data(server, 0), target);
+	ncp_unlock_server(server);
+	
+	result = ncp_obtain_nfs_info(server, target);
+	return result;
+
+out:
+	ncp_unlock_server(server);
+	return result;
+}
+
+#ifdef CONFIG_NCPFS_NFS_NS
+static int
+ncp_obtain_DOS_dir_base(struct ncp_server *server,
+		__u8 ns, __u8 volnum, __le32 dirent,
+		const char *path, /* At most 1 component */
+		__le32 *DOS_dir_base)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 6); /* subfunction */
+	ncp_add_byte(server, ns);
+	ncp_add_byte(server, ns);
+	ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
+	ncp_add_dword(server, RIM_DIRECTORY);
+	ncp_add_handle_path(server, volnum, dirent, 1, path);
+
+	if ((result = ncp_request(server, 87)) == 0)
+	{
+	   	if (DOS_dir_base) *DOS_dir_base=ncp_reply_dword(server, 0x34);
+	}
+	ncp_unlock_server(server);
+	return result;
+}
+#endif /* CONFIG_NCPFS_NFS_NS */
+
+static inline int
+ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
+{
+#if defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS)
+	int result;
+	__u8 *namespace;
+	__u16 no_namespaces;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 24);	/* Subfunction: Get Name Spaces Loaded */
+	ncp_add_word(server, 0);
+	ncp_add_byte(server, volume);
+
+	if ((result = ncp_request(server, 87)) != 0) {
+		ncp_unlock_server(server);
+		return NW_NS_DOS; /* not result ?? */
+	}
+
+	result = NW_NS_DOS;
+	no_namespaces = ncp_reply_le16(server, 0);
+	namespace = ncp_reply_data(server, 2);
+
+	while (no_namespaces > 0) {
+		ncp_dbg(1, "found %d on %d\n", *namespace, volume);
+
+#ifdef CONFIG_NCPFS_NFS_NS
+		if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS)) 
+		{
+			result = NW_NS_NFS;
+			break;
+		}
+#endif	/* CONFIG_NCPFS_NFS_NS */
+#ifdef CONFIG_NCPFS_OS2_NS
+		if ((*namespace == NW_NS_OS2) && !(server->m.flags&NCP_MOUNT_NO_OS2))
+		{
+			result = NW_NS_OS2;
+		}
+#endif	/* CONFIG_NCPFS_OS2_NS */
+		namespace += 1;
+		no_namespaces -= 1;
+	}
+	ncp_unlock_server(server);
+	return result;
+#else	/* neither OS2 nor NFS - only DOS */
+	return NW_NS_DOS;
+#endif	/* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
+}
+
+int
+ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
+{
+	int ns = ncp_get_known_namespace(server, volume);
+
+	if (ret_ns)
+		*ret_ns = ns;
+
+	ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]);
+
+	if (server->name_space[volume] == ns)
+		return 0;
+	server->name_space[volume] = ns;
+	return 1;
+}
+
+static int
+ncp_ObtainSpecificDirBase(struct ncp_server *server,
+		__u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
+		const char *path, /* At most 1 component */
+		__le32 *dirEntNum, __le32 *DosDirNum)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 6); /* subfunction */
+	ncp_add_byte(server, nsSrc);
+	ncp_add_byte(server, nsDst);
+	ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
+	ncp_add_dword(server, RIM_ALL);
+	ncp_add_handle_path(server, vol_num, dir_base, 1, path);
+
+	if ((result = ncp_request(server, 87)) != 0)
+	{
+		ncp_unlock_server(server);
+		return result;
+	}
+
+	if (dirEntNum)
+		*dirEntNum = ncp_reply_dword(server, 0x30);
+	if (DosDirNum)
+		*DosDirNum = ncp_reply_dword(server, 0x34);
+	ncp_unlock_server(server);
+	return 0;
+}
+
+int
+ncp_mount_subdir(struct ncp_server *server,
+		 __u8 volNumber, __u8 srcNS, __le32 dirEntNum,
+		 __u32* volume, __le32* newDirEnt, __le32* newDosEnt)
+{
+	int dstNS;
+	int result;
+
+	ncp_update_known_namespace(server, volNumber, &dstNS);
+	if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 
+				      dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
+	{
+		return result;
+	}
+	*volume = volNumber;
+	server->m.mounted_vol[1] = 0;
+	server->m.mounted_vol[0] = 'X';
+	return 0;
+}
+
+int 
+ncp_get_volume_root(struct ncp_server *server,
+		    const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
+{
+	int result;
+
+	ncp_dbg(1, "looking up vol %s\n", volname);
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 22);	/* Subfunction: Generate dir handle */
+	ncp_add_byte(server, 0);	/* DOS namespace */
+	ncp_add_byte(server, 0);	/* reserved */
+	ncp_add_byte(server, 0);	/* reserved */
+	ncp_add_byte(server, 0);	/* reserved */
+
+	ncp_add_byte(server, 0);	/* faked volume number */
+	ncp_add_dword(server, 0);	/* faked dir_base */
+	ncp_add_byte(server, 0xff);	/* Don't have a dir_base */
+	ncp_add_byte(server, 1);	/* 1 path component */
+	ncp_add_pstring(server, volname);
+
+	if ((result = ncp_request(server, 87)) != 0) {
+		ncp_unlock_server(server);
+		return result;
+	}
+	*dirent = *dosdirent = ncp_reply_dword(server, 4);
+	*volume = ncp_reply_byte(server, 8);
+	ncp_unlock_server(server);
+	return 0;
+}
+
+int
+ncp_lookup_volume(struct ncp_server *server,
+		  const char *volname, struct nw_info_struct *target)
+{
+	int result;
+
+	memset(target, 0, sizeof(*target));
+	result = ncp_get_volume_root(server, volname,
+			&target->volNumber, &target->dirEntNum, &target->DosDirNum);
+	if (result) {
+		return result;
+	}
+	ncp_update_known_namespace(server, target->volNumber, NULL);
+	target->nameLen = strlen(volname);
+	memcpy(target->entryName, volname, target->nameLen+1);
+	target->attributes = aDIR;
+	/* set dates to Jan 1, 1986  00:00 */
+	target->creationTime = target->modifyTime = cpu_to_le16(0x0000);
+	target->creationDate = target->modifyDate = target->lastAccessDate = cpu_to_le16(0x0C21);
+	target->nfs.mode = 0;
+	return 0;
+}
+
+int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *server,
+					    struct inode *dir,
+					    const char *path,
+					    __le32 info_mask,
+					    const struct nw_modify_dos_info *info)
+{
+	__u8  volnum = NCP_FINFO(dir)->volNumber;
+	__le32 dirent = NCP_FINFO(dir)->dirEntNum;
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 7);	/* subfunction */
+	ncp_add_byte(server, server->name_space[volnum]);
+	ncp_add_byte(server, 0);	/* reserved */
+	ncp_add_word(server, cpu_to_le16(0x8006));	/* search attribs: all */
+
+	ncp_add_dword(server, info_mask);
+	ncp_add_mem(server, info, sizeof(*info));
+	ncp_add_handle_path(server, volnum, dirent, 1, path);
+
+	result = ncp_request(server, 87);
+	ncp_unlock_server(server);
+	return result;
+}
+
+int ncp_modify_file_or_subdir_dos_info(struct ncp_server *server,
+				       struct inode *dir,
+				       __le32 info_mask,
+				       const struct nw_modify_dos_info *info)
+{
+	return ncp_modify_file_or_subdir_dos_info_path(server, dir, NULL,
+		info_mask, info);
+}
+
+#ifdef CONFIG_NCPFS_NFS_NS
+int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
+			       __u32 mode, __u32 rdev)
+
+{
+	int result = 0;
+
+	ncp_init_request(server);
+	if (server->name_space[volnum] == NW_NS_NFS) {
+		ncp_add_byte(server, 25);	/* subfunction */
+		ncp_add_byte(server, server->name_space[volnum]);
+		ncp_add_byte(server, NW_NS_NFS);
+		ncp_add_byte(server, volnum);
+		ncp_add_dword(server, dirent);
+		/* we must always operate on both nlinks and rdev, otherwise
+		   rdev is not set */
+		ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV);
+		ncp_add_dword_lh(server, mode);
+		ncp_add_dword_lh(server, 1);	/* nlinks */
+		ncp_add_dword_lh(server, rdev);
+		result = ncp_request(server, 87);
+	}
+	ncp_unlock_server(server);
+	return result;
+}
+#endif
+
+
+static int
+ncp_DeleteNSEntry(struct ncp_server *server,
+		  __u8 have_dir_base, __u8 volnum, __le32 dirent,
+		  const char* name, __u8 ns, __le16 attr)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 8);	/* subfunction */
+	ncp_add_byte(server, ns);
+	ncp_add_byte(server, 0);	/* reserved */
+	ncp_add_word(server, attr);	/* search attribs: all */
+	ncp_add_handle_path(server, volnum, dirent, have_dir_base, name);
+
+	result = ncp_request(server, 87);
+	ncp_unlock_server(server);
+	return result;
+}
+
+int
+ncp_del_file_or_subdir2(struct ncp_server *server,
+			struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	__u8  volnum;
+	__le32 dirent;
+
+	if (!inode) {
+		return 0xFF;	/* Any error */
+	}
+	volnum = NCP_FINFO(inode)->volNumber;
+	dirent = NCP_FINFO(inode)->DosDirNum;
+	return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006));
+}
+
+int
+ncp_del_file_or_subdir(struct ncp_server *server,
+		       struct inode *dir, const char *name)
+{
+	__u8  volnum = NCP_FINFO(dir)->volNumber;
+	__le32 dirent = NCP_FINFO(dir)->dirEntNum;
+	int name_space;
+
+	name_space = server->name_space[volnum];
+#ifdef CONFIG_NCPFS_NFS_NS
+	if (name_space == NW_NS_NFS)
+ 	{
+ 		int result;
+ 
+		result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
+ 		if (result) return result;
+		name = NULL;
+		name_space = NW_NS_DOS;
+ 	}
+#endif	/* CONFIG_NCPFS_NFS_NS */
+	return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
+}
+
+static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
+{
+	__le16 *dest = (__le16 *) ret;
+	dest[1] = cpu_to_le16(v0);
+	dest[2] = cpu_to_le16(v1);
+	dest[0] = cpu_to_le16(v0 + 1);
+	return;
+}
+
+/* If both dir and name are NULL, then in target there's already a
+   looked-up entry that wants to be opened. */
+int ncp_open_create_file_or_subdir(struct ncp_server *server,
+				   struct inode *dir, const char *name,
+				   int open_create_mode,
+				   __le32 create_attributes,
+				   __le16 desired_acc_rights,
+				   struct ncp_entry_info *target)
+{
+	__le16 search_attribs = cpu_to_le16(0x0006);
+	__u8  volnum;
+	__le32 dirent;
+	int result;
+
+	volnum = NCP_FINFO(dir)->volNumber;
+	dirent = NCP_FINFO(dir)->dirEntNum;
+
+	if ((create_attributes & aDIR) != 0) {
+		search_attribs |= cpu_to_le16(0x8000);
+	}
+	ncp_init_request(server);
+	ncp_add_byte(server, 1);	/* subfunction */
+	ncp_add_byte(server, server->name_space[volnum]);
+	ncp_add_byte(server, open_create_mode);
+	ncp_add_word(server, search_attribs);
+	ncp_add_dword(server, RIM_ALL);
+	ncp_add_dword(server, create_attributes);
+	/* The desired acc rights seem to be the inherited rights mask
+	   for directories */
+	ncp_add_word(server, desired_acc_rights);
+	ncp_add_handle_path(server, volnum, dirent, 1, name);
+
+	if ((result = ncp_request(server, 87)) != 0)
+		goto out;
+	if (!(create_attributes & aDIR))
+		target->opened = 1;
+
+	/* in target there's a new finfo to fill */
+	ncp_extract_file_info(ncp_reply_data(server, 6), &(target->i));
+	target->volume = target->i.volNumber;
+	ConvertToNWfromDWORD(ncp_reply_le16(server, 0),
+			     ncp_reply_le16(server, 2),
+			     target->file_handle);
+	
+	ncp_unlock_server(server);
+
+	(void)ncp_obtain_nfs_info(server, &(target->i));
+	return 0;
+
+out:
+	ncp_unlock_server(server);
+	return result;
+}
+
+int
+ncp_initialize_search(struct ncp_server *server, struct inode *dir,
+			struct nw_search_sequence *target)
+{
+	__u8  volnum = NCP_FINFO(dir)->volNumber;
+	__le32 dirent = NCP_FINFO(dir)->dirEntNum;
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 2);	/* subfunction */
+	ncp_add_byte(server, server->name_space[volnum]);
+	ncp_add_byte(server, 0);	/* reserved */
+	ncp_add_handle_path(server, volnum, dirent, 1, NULL);
+
+	result = ncp_request(server, 87);
+	if (result)
+		goto out;
+	memcpy(target, ncp_reply_data(server, 0), sizeof(*target));
+
+out:
+	ncp_unlock_server(server);
+	return result;
+}
+
+int ncp_search_for_fileset(struct ncp_server *server,
+			   struct nw_search_sequence *seq,
+			   int* more,
+			   int* cnt,
+			   char* buffer,
+			   size_t bufsize,
+			   char** rbuf,
+			   size_t* rsize)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 20);
+	ncp_add_byte(server, server->name_space[seq->volNumber]);
+	ncp_add_byte(server, 0);		/* datastream */
+	ncp_add_word(server, cpu_to_le16(0x8006));
+	ncp_add_dword(server, RIM_ALL);
+	ncp_add_word(server, cpu_to_le16(32767));	/* max returned items */
+	ncp_add_mem(server, seq, 9);
+#ifdef CONFIG_NCPFS_NFS_NS
+	if (server->name_space[seq->volNumber] == NW_NS_NFS) {
+		ncp_add_byte(server, 0);	/* 0 byte pattern */
+	} else 
+#endif
+	{
+		ncp_add_byte(server, 2);	/* 2 byte pattern */
+		ncp_add_byte(server, 0xff);	/* following is a wildcard */
+		ncp_add_byte(server, '*');
+	}
+	result = ncp_request2(server, 87, buffer, bufsize);
+	if (result) {
+		ncp_unlock_server(server);
+		return result;
+	}
+	if (server->ncp_reply_size < 12) {
+		ncp_unlock_server(server);
+		return 0xFF;
+	}
+	*rsize = server->ncp_reply_size - 12;
+	ncp_unlock_server(server);
+	buffer = buffer + sizeof(struct ncp_reply_header);
+	*rbuf = buffer + 12;
+	*cnt = WVAL_LH(buffer + 10);
+	*more = BVAL(buffer + 9);
+	memcpy(seq, buffer, 9);
+	return 0;
+}
+
+static int
+ncp_RenameNSEntry(struct ncp_server *server,
+		  struct inode *old_dir, const char *old_name, __le16 old_type,
+		  struct inode *new_dir, const char *new_name)
+{
+	int result = -EINVAL;
+
+	if ((old_dir == NULL) || (old_name == NULL) ||
+	    (new_dir == NULL) || (new_name == NULL))
+		goto out;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 4);	/* subfunction */
+	ncp_add_byte(server, server->name_space[NCP_FINFO(old_dir)->volNumber]);
+	ncp_add_byte(server, 1);	/* rename flag */
+	ncp_add_word(server, old_type);	/* search attributes */
+
+	/* source Handle Path */
+	ncp_add_byte(server, NCP_FINFO(old_dir)->volNumber);
+	ncp_add_dword(server, NCP_FINFO(old_dir)->dirEntNum);
+	ncp_add_byte(server, 1);
+	ncp_add_byte(server, 1);	/* 1 source component */
+
+	/* dest Handle Path */
+	ncp_add_byte(server, NCP_FINFO(new_dir)->volNumber);
+	ncp_add_dword(server, NCP_FINFO(new_dir)->dirEntNum);
+	ncp_add_byte(server, 1);
+	ncp_add_byte(server, 1);	/* 1 destination component */
+
+	/* source path string */
+	ncp_add_pstring(server, old_name);
+	/* dest path string */
+	ncp_add_pstring(server, new_name);
+
+	result = ncp_request(server, 87);
+	ncp_unlock_server(server);
+out:
+	return result;
+}
+
+int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
+				struct inode *old_dir, const char *old_name,
+				struct inode *new_dir, const char *new_name)
+{
+        int result;
+        __le16 old_type = cpu_to_le16(0x06);
+
+/* If somebody can do it atomic, call me... vandrove@vc.cvut.cz */
+	result = ncp_RenameNSEntry(server, old_dir, old_name, old_type,
+	                                   new_dir, new_name);
+        if (result == 0xFF)	/* File Not Found, try directory */
+	{
+		old_type = cpu_to_le16(0x16);
+		result = ncp_RenameNSEntry(server, old_dir, old_name, old_type,
+						   new_dir, new_name);
+	}
+	if (result != 0x92) return result;	/* All except NO_FILES_RENAMED */
+	result = ncp_del_file_or_subdir(server, new_dir, new_name);
+	if (result != 0) return -EACCES;
+	result = ncp_RenameNSEntry(server, old_dir, old_name, old_type,
+					   new_dir, new_name);
+	return result;
+}
+	
+
+/* We have to transfer to/from user space */
+int
+ncp_read_kernel(struct ncp_server *server, const char *file_id,
+	     __u32 offset, __u16 to_read, char *target, int *bytes_read)
+{
+	const char *source;
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 0);
+	ncp_add_mem(server, file_id, 6);
+	ncp_add_be32(server, offset);
+	ncp_add_be16(server, to_read);
+
+	if ((result = ncp_request(server, 72)) != 0) {
+		goto out;
+	}
+	*bytes_read = ncp_reply_be16(server, 0);
+	source = ncp_reply_data(server, 2 + (offset & 1));
+
+	memcpy(target, source, *bytes_read);
+out:
+	ncp_unlock_server(server);
+	return result;
+}
+
+/* There is a problem... egrep and some other silly tools do:
+	x = mmap(NULL, MAP_PRIVATE, PROT_READ|PROT_WRITE, <ncpfs fd>, 32768);
+	read(<ncpfs fd>, x, 32768);
+   Now copying read result by copy_to_user causes pagefault. This pagefault
+   could not be handled because of server was locked due to read. So we have
+   to use temporary buffer. So ncp_unlock_server must be done before
+   copy_to_user (and for write, copy_from_user must be done before 
+   ncp_init_request... same applies for send raw packet ioctl). Because of
+   file is normally read in bigger chunks, caller provides kmalloced 
+   (vmalloced) chunk of memory with size >= to_read...
+ */
+int
+ncp_read_bounce(struct ncp_server *server, const char *file_id,
+	 __u32 offset, __u16 to_read, struct iov_iter *to,
+	 int *bytes_read, void *bounce, __u32 bufsize)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 0);
+	ncp_add_mem(server, file_id, 6);
+	ncp_add_be32(server, offset);
+	ncp_add_be16(server, to_read);
+	result = ncp_request2(server, 72, bounce, bufsize);
+	ncp_unlock_server(server);
+	if (!result) {
+		int len = get_unaligned_be16((char *)bounce +
+			  sizeof(struct ncp_reply_header));
+		result = -EIO;
+		if (len <= to_read) {
+			char* source;
+
+			source = (char*)bounce + 
+			         sizeof(struct ncp_reply_header) + 2 + 
+			         (offset & 1);
+			*bytes_read = len;
+			result = 0;
+			if (copy_to_iter(source, len, to) != len)
+				result = -EFAULT;
+		}
+	}
+	return result;
+}
+
+int
+ncp_write_kernel(struct ncp_server *server, const char *file_id,
+		 __u32 offset, __u16 to_write,
+		 const char *source, int *bytes_written)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 0);
+	ncp_add_mem(server, file_id, 6);
+	ncp_add_be32(server, offset);
+	ncp_add_be16(server, to_write);
+	ncp_add_mem(server, source, to_write);
+	
+	if ((result = ncp_request(server, 73)) == 0)
+		*bytes_written = to_write;
+	ncp_unlock_server(server);
+	return result;
+}
+
+#ifdef CONFIG_NCPFS_IOCTL_LOCKING
+int
+ncp_LogPhysicalRecord(struct ncp_server *server, const char *file_id,
+	  __u8 locktype, __u32 offset, __u32 length, __u16 timeout)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, locktype);
+	ncp_add_mem(server, file_id, 6);
+	ncp_add_be32(server, offset);
+	ncp_add_be32(server, length);
+	ncp_add_be16(server, timeout);
+
+	if ((result = ncp_request(server, 0x1A)) != 0)
+	{
+		ncp_unlock_server(server);
+		return result;
+	}
+	ncp_unlock_server(server);
+	return 0;
+}
+
+int
+ncp_ClearPhysicalRecord(struct ncp_server *server, const char *file_id,
+	  __u32 offset, __u32 length)
+{
+	int result;
+
+	ncp_init_request(server);
+	ncp_add_byte(server, 0);	/* who knows... lanalyzer says that */
+	ncp_add_mem(server, file_id, 6);
+	ncp_add_be32(server, offset);
+	ncp_add_be32(server, length);
+
+	if ((result = ncp_request(server, 0x1E)) != 0)
+	{
+		ncp_unlock_server(server);
+		return result;
+	}
+	ncp_unlock_server(server);
+	return 0;
+}
+#endif	/* CONFIG_NCPFS_IOCTL_LOCKING */
+
+#ifdef CONFIG_NCPFS_NLS
+/* This are the NLS conversion routines with inspirations and code parts
+ * from the vfat file system and hints from Petr Vandrovec.
+ */
+
+int
+ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen,
+		const unsigned char *iname, unsigned int ilen, int cc)
+{
+	struct nls_table *in = server->nls_io;
+	struct nls_table *out = server->nls_vol;
+	unsigned char *vname_start;
+	unsigned char *vname_end;
+	const unsigned char *iname_end;
+
+	iname_end = iname + ilen;
+	vname_start = vname;
+	vname_end = vname + *vlen - 1;
+
+	while (iname < iname_end) {
+		int chl;
+		wchar_t ec;
+
+		if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
+			int k;
+			unicode_t u;
+
+			k = utf8_to_utf32(iname, iname_end - iname, &u);
+			if (k < 0 || u > MAX_WCHAR_T)
+				return -EINVAL;
+			iname += k;
+			ec = u;
+		} else {
+			if (*iname == NCP_ESC) {
+				int k;
+
+				if (iname_end - iname < 5)
+					goto nospec;
+
+				ec = 0;
+				for (k = 1; k < 5; k++) {
+					unsigned char nc;
+
+					nc = iname[k] - '0';
+					if (nc >= 10) {
+						nc -= 'A' - '0' - 10;
+						if ((nc < 10) || (nc > 15)) {
+							goto nospec;
+						}
+					}
+					ec = (ec << 4) | nc;
+				}
+				iname += 5;
+			} else {
+nospec:;			
+				if ( (chl = in->char2uni(iname, iname_end - iname, &ec)) < 0)
+					return chl;
+				iname += chl;
+			}
+		}
+
+		/* unitoupper should be here! */
+
+		chl = out->uni2char(ec, vname, vname_end - vname);
+		if (chl < 0)
+			return chl;
+
+		/* this is wrong... */
+		if (cc) {
+			int chi;
+
+			for (chi = 0; chi < chl; chi++){
+				vname[chi] = ncp_toupper(out, vname[chi]);
+			}
+		}
+		vname += chl;
+	}
+
+	*vname = 0;
+	*vlen = vname - vname_start;
+	return 0;
+}
+
+int
+ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen,
+		const unsigned char *vname, unsigned int vlen, int cc)
+{
+	struct nls_table *in = server->nls_vol;
+	struct nls_table *out = server->nls_io;
+	const unsigned char *vname_end;
+	unsigned char *iname_start;
+	unsigned char *iname_end;
+	unsigned char *vname_cc;
+	int err;
+
+	vname_cc = NULL;
+
+	if (cc) {
+		int i;
+
+		/* this is wrong! */
+		vname_cc = kmalloc(vlen, GFP_KERNEL);
+		if (!vname_cc)
+			return -ENOMEM;
+		for (i = 0; i < vlen; i++)
+			vname_cc[i] = ncp_tolower(in, vname[i]);
+		vname = vname_cc;
+	}
+
+	iname_start = iname;
+	iname_end = iname + *ilen - 1;
+	vname_end = vname + vlen;
+
+	while (vname < vname_end) {
+		wchar_t ec;
+		int chl;
+
+		if ( (chl = in->char2uni(vname, vname_end - vname, &ec)) < 0) {
+			err = chl;
+			goto quit;
+		}
+		vname += chl;
+
+		/* unitolower should be here! */
+
+		if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
+			int k;
+
+			k = utf32_to_utf8(ec, iname, iname_end - iname);
+			if (k < 0) {
+				err = -ENAMETOOLONG;
+				goto quit;
+			}
+			iname += k;
+		} else {
+			if ( (chl = out->uni2char(ec, iname, iname_end - iname)) >= 0) {
+				iname += chl;
+			} else {
+				int k;
+
+				if (iname_end - iname < 5) {
+					err = -ENAMETOOLONG;
+					goto quit;
+				}
+				*iname = NCP_ESC;
+				for (k = 4; k > 0; k--) {
+					unsigned char v;
+					
+					v = (ec & 0xF) + '0';
+					if (v > '9') {
+						v += 'A' - '9' - 1;
+					}
+					iname[k] = v;
+					ec >>= 4;
+				}
+				iname += 5;
+			}
+		}
+	}
+
+	*iname = 0;
+	*ilen = iname - iname_start;
+	err = 0;
+quit:;
+	if (cc)
+		kfree(vname_cc);
+	return err;
+}
+
+#else
+
+int
+ncp__io2vol(unsigned char *vname, unsigned int *vlen,
+		const unsigned char *iname, unsigned int ilen, int cc)
+{
+	int i;
+
+	if (*vlen <= ilen)
+		return -ENAMETOOLONG;
+
+	if (cc)
+		for (i = 0; i < ilen; i++) {
+			*vname = toupper(*iname);
+			vname++;
+			iname++;
+		}
+	else {
+		memmove(vname, iname, ilen);
+		vname += ilen;
+	}
+
+	*vlen = ilen;
+	*vname = 0;
+	return 0;
+}
+
+int
+ncp__vol2io(unsigned char *iname, unsigned int *ilen,
+		const unsigned char *vname, unsigned int vlen, int cc)
+{
+	int i;
+
+	if (*ilen <= vlen)
+		return -ENAMETOOLONG;
+
+	if (cc)
+		for (i = 0; i < vlen; i++) {
+			*iname = tolower(*vname);
+			iname++;
+			vname++;
+		}
+	else {
+		memmove(iname, vname, vlen);
+		iname += vlen;
+	}
+
+	*ilen = vlen;
+	*iname = 0;
+	return 0;
+}
+
+#endif
diff --git a/kernel/fs/ncpfs/ncplib_kernel.h b/kernel/fs/ncpfs/ncplib_kernel.h
new file mode 100644
index 000000000..5233fbc17
--- /dev/null
+++ b/kernel/fs/ncpfs/ncplib_kernel.h
@@ -0,0 +1,214 @@
+/*
+ *  ncplib_kernel.h
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *  Modified for big endian by J.F. Chadima and David S. Miller
+ *  Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache
+ *  Modified 1998, 1999 Wolfram Pienkoss for NLS
+ *  Modified 1999 Wolfram Pienkoss for directory caching
+ *
+ */
+
+#ifndef _NCPLIB_H
+#define _NCPLIB_H
+
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/pagemap.h>
+
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <asm/string.h>
+
+#ifdef CONFIG_NCPFS_NLS
+#include <linux/nls.h>
+#else
+#include <linux/ctype.h>
+#endif /* CONFIG_NCPFS_NLS */
+
+#define NCP_MIN_SYMLINK_SIZE	8
+#define NCP_MAX_SYMLINK_SIZE	512
+
+#define NCP_BLOCK_SHIFT		9
+#define NCP_BLOCK_SIZE		(1 << (NCP_BLOCK_SHIFT))
+
+int ncp_negotiate_buffersize(struct ncp_server *, int, int *);
+int ncp_negotiate_size_and_options(struct ncp_server *server, int size,
+  			  int options, int *ret_size, int *ret_options);
+
+int ncp_get_volume_info_with_number(struct ncp_server* server, int n,
+				    struct ncp_volume_info *target);
+
+int ncp_get_directory_info(struct ncp_server* server, __u8 dirhandle,
+			   struct ncp_volume_info* target);
+
+int ncp_close_file(struct ncp_server *, const char *);
+static inline int ncp_read_bounce_size(__u32 size) {
+	return sizeof(struct ncp_reply_header) + 2 + 2 + size + 8;
+};
+int ncp_read_bounce(struct ncp_server *, const char *, __u32, __u16, 
+		struct iov_iter *, int *, void *bounce, __u32 bouncelen);
+int ncp_read_kernel(struct ncp_server *, const char *, __u32, __u16, 
+		char *, int *);
+int ncp_write_kernel(struct ncp_server *, const char *, __u32, __u16,
+		const char *, int *);
+
+static inline void ncp_inode_close(struct inode *inode) {
+	atomic_dec(&NCP_FINFO(inode)->opened);
+}
+
+void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
+int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
+		struct nw_info_struct *target);
+int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
+int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
+int ncp_get_volume_root(struct ncp_server *server, const char *volname,
+			__u32 *volume, __le32 *dirent, __le32 *dosdirent);
+int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
+int ncp_modify_file_or_subdir_dos_info(struct ncp_server *, struct inode *,
+	 __le32, const struct nw_modify_dos_info *info);
+int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *, struct inode *,
+	 const char* path, __le32, const struct nw_modify_dos_info *info);
+int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
+			__u32 mode, __u32 rdev);
+
+int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
+int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
+int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
+				int, __le32, __le16, struct ncp_entry_info *);
+
+int ncp_initialize_search(struct ncp_server *, struct inode *,
+		      struct nw_search_sequence *target);
+int ncp_search_for_fileset(struct ncp_server *server,
+			   struct nw_search_sequence *seq,
+			   int* more, int* cnt,
+			   char* buffer, size_t bufsize,
+			   char** rbuf, size_t* rsize);
+
+int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
+			      struct inode *, const char *, struct inode *, const char *);
+
+
+int
+ncp_LogPhysicalRecord(struct ncp_server *server,
+		      const char *file_id, __u8 locktype,
+		      __u32 offset, __u32 length, __u16 timeout);
+
+#ifdef CONFIG_NCPFS_IOCTL_LOCKING
+int
+ncp_ClearPhysicalRecord(struct ncp_server *server,
+			const char *file_id,
+			__u32 offset, __u32 length);
+#endif	/* CONFIG_NCPFS_IOCTL_LOCKING */
+
+int
+ncp_mount_subdir(struct ncp_server *, __u8, __u8, __le32,
+		 __u32* volume, __le32* dirent, __le32* dosdirent);
+int ncp_dirhandle_alloc(struct ncp_server *, __u8 vol, __le32 dirent, __u8 *dirhandle);
+int ncp_dirhandle_free(struct ncp_server *, __u8 dirhandle);
+
+int ncp_create_new(struct inode *dir, struct dentry *dentry,
+                          umode_t mode, dev_t rdev, __le32 attributes);
+
+static inline int ncp_is_nfs_extras(struct ncp_server* server, unsigned int volnum) {
+#ifdef CONFIG_NCPFS_NFS_NS
+	return (server->m.flags & NCP_MOUNT_NFS_EXTRAS) &&
+	       (server->name_space[volnum] == NW_NS_NFS);
+#else
+	return 0;
+#endif
+}
+
+#ifdef CONFIG_NCPFS_NLS
+
+int ncp__io2vol(struct ncp_server *, unsigned char *, unsigned int *,
+				const unsigned char *, unsigned int, int);
+int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *,
+				const unsigned char *, unsigned int, int);
+
+#define NCP_ESC			':'
+#define NCP_IO_TABLE(sb)	(NCP_SBP(sb)->nls_io)
+#define ncp_tolower(t, c)	nls_tolower(t, c)
+#define ncp_toupper(t, c)	nls_toupper(t, c)
+#define ncp_strnicmp(t, s1, s2, len) \
+	nls_strnicmp(t, s1, s2, len)
+#define ncp_io2vol(S,m,i,n,k,U)	ncp__io2vol(S,m,i,n,k,U)
+#define ncp_vol2io(S,m,i,n,k,U)	ncp__vol2io(S,m,i,n,k,U)
+
+#else
+
+int ncp__io2vol(unsigned char *, unsigned int *,
+				const unsigned char *, unsigned int, int);
+int ncp__vol2io(unsigned char *, unsigned int *,
+				const unsigned char *, unsigned int, int);
+
+#define NCP_IO_TABLE(sb)	NULL
+#define ncp_tolower(t, c)	tolower(c)
+#define ncp_toupper(t, c)	toupper(c)
+#define ncp_io2vol(S,m,i,n,k,U)	ncp__io2vol(m,i,n,k,U)
+#define ncp_vol2io(S,m,i,n,k,U)	ncp__vol2io(m,i,n,k,U)
+
+
+static inline int ncp_strnicmp(const struct nls_table *t,
+		const unsigned char *s1, const unsigned char *s2, int len)
+{
+	while (len--) {
+		if (tolower(*s1++) != tolower(*s2++))
+			return 1;
+	}
+
+	return 0;
+}
+
+#endif /* CONFIG_NCPFS_NLS */
+
+#define NCP_GET_AGE(dentry)	(jiffies - (dentry)->d_time)
+#define NCP_MAX_AGE(server)	atomic_read(&(server)->dentry_ttl)
+#define NCP_TEST_AGE(server,dentry)	(NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
+
+static inline void
+ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
+{
+	dentry->d_time = jiffies - NCP_MAX_AGE(server);
+}
+
+static inline void
+ncp_new_dentry(struct dentry* dentry)
+{
+	dentry->d_time = jiffies;
+}
+
+struct ncp_cache_head {
+	time_t		mtime;
+	unsigned long	time;	/* cache age */
+	unsigned long	end;	/* last valid fpos in cache */
+	int		eof;
+};
+
+#define NCP_DIRCACHE_SIZE	((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *)))
+union ncp_dir_cache {
+	struct ncp_cache_head	head;
+	struct dentry		*dentry[NCP_DIRCACHE_SIZE];
+};
+
+#define NCP_FIRSTCACHE_SIZE	((int)((NCP_DIRCACHE_SIZE * \
+	sizeof(struct dentry *) - sizeof(struct ncp_cache_head)) / \
+	sizeof(struct dentry *)))
+
+#define NCP_DIRCACHE_START	(NCP_DIRCACHE_SIZE - NCP_FIRSTCACHE_SIZE)
+
+struct ncp_cache_control {
+	struct	ncp_cache_head		head;
+	struct	page			*page;
+	union	ncp_dir_cache		*cache;
+	unsigned long			fpos, ofs;
+	int				filled, valid, idx;
+};
+
+#endif /* _NCPLIB_H */
diff --git a/kernel/fs/ncpfs/ncpsign_kernel.c b/kernel/fs/ncpfs/ncpsign_kernel.c
new file mode 100644
index 000000000..08907599d
--- /dev/null
+++ b/kernel/fs/ncpfs/ncpsign_kernel.c
@@ -0,0 +1,127 @@
+/*
+ *  ncpsign_kernel.c
+ *
+ *  Arne de Bruijn (arne@knoware.nl), 1997
+ *
+ */
+
+
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
+
+#include <linux/string.h>
+#include <linux/ncp.h>
+#include <linux/bitops.h>
+#include "ncp_fs.h"
+#include "ncpsign_kernel.h"
+
+/* i386: 32-bit, little endian, handles mis-alignment */
+#ifdef __i386__
+#define GET_LE32(p) (*(const int *)(p))
+#define PUT_LE32(p,v) { *(int *)(p)=v; }
+#else
+/* from include/ncplib.h */
+#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
+#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
+#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
+
+static inline __u16
+WVAL_LH(const __u8 * buf, int pos)
+{
+	return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
+}
+static inline __u32
+DVAL_LH(const __u8 * buf, int pos)
+{
+	return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
+}
+static inline void
+WSET_LH(__u8 * buf, int pos, __u16 val)
+{
+	BSET(buf, pos, val & 0xff);
+	BSET(buf, pos + 1, val >> 8);
+}
+static inline void
+DSET_LH(__u8 * buf, int pos, __u32 val)
+{
+	WSET_LH(buf, pos, val & 0xffff);
+	WSET_LH(buf, pos + 2, val >> 16);
+}
+
+#define GET_LE32(p) DVAL_LH(p,0)
+#define PUT_LE32(p,v) DSET_LH(p,0,v)
+#endif
+
+static void nwsign(char *r_data1, char *r_data2, char *outdata) {
+ int i;
+ unsigned int w0,w1,w2,w3;
+ static int rbit[4]={0, 2, 1, 3};
+#ifdef __i386__
+ unsigned int *data2=(unsigned int *)r_data2;
+#else
+ unsigned int data2[16];
+ for (i=0;i<16;i++)
+  data2[i]=GET_LE32(r_data2+(i<<2));
+#endif 
+ w0=GET_LE32(r_data1);
+ w1=GET_LE32(r_data1+4);
+ w2=GET_LE32(r_data1+8);
+ w3=GET_LE32(r_data1+12);
+ for (i=0;i<16;i+=4) {
+  w0=rol32(w0 + ((w1 & w2) | ((~w1) & w3)) + data2[i+0],3);
+  w3=rol32(w3 + ((w0 & w1) | ((~w0) & w2)) + data2[i+1],7);
+  w2=rol32(w2 + ((w3 & w0) | ((~w3) & w1)) + data2[i+2],11);
+  w1=rol32(w1 + ((w2 & w3) | ((~w2) & w0)) + data2[i+3],19);
+ }
+ for (i=0;i<4;i++) {
+  w0=rol32(w0 + (((w2 | w3) & w1) | (w2 & w3)) + 0x5a827999 + data2[i+0],3);
+  w3=rol32(w3 + (((w1 | w2) & w0) | (w1 & w2)) + 0x5a827999 + data2[i+4],5);
+  w2=rol32(w2 + (((w0 | w1) & w3) | (w0 & w1)) + 0x5a827999 + data2[i+8],9);
+  w1=rol32(w1 + (((w3 | w0) & w2) | (w3 & w0)) + 0x5a827999 + data2[i+12],13);
+ }
+ for (i=0;i<4;i++) {
+  w0=rol32(w0 + ((w1 ^ w2) ^ w3) + 0x6ed9eba1 + data2[rbit[i]+0],3);
+  w3=rol32(w3 + ((w0 ^ w1) ^ w2) + 0x6ed9eba1 + data2[rbit[i]+8],9);
+  w2=rol32(w2 + ((w3 ^ w0) ^ w1) + 0x6ed9eba1 + data2[rbit[i]+4],11);
+  w1=rol32(w1 + ((w2 ^ w3) ^ w0) + 0x6ed9eba1 + data2[rbit[i]+12],15);
+ }
+ PUT_LE32(outdata,(w0+GET_LE32(r_data1)) & 0xffffffff);
+ PUT_LE32(outdata+4,(w1+GET_LE32(r_data1+4)) & 0xffffffff);
+ PUT_LE32(outdata+8,(w2+GET_LE32(r_data1+8)) & 0xffffffff);
+ PUT_LE32(outdata+12,(w3+GET_LE32(r_data1+12)) & 0xffffffff);
+}
+
+/* Make a signature for the current packet and add it at the end of the */
+/* packet. */
+void __sign_packet(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, void *sign_buff) {
+	unsigned char data[64];
+
+	memcpy(data, server->sign_root, 8);
+	*(__u32*)(data + 8) = totalsize;
+	if (size < 52) {
+		memcpy(data + 12, packet, size);
+		memset(data + 12 + size, 0, 52 - size);
+	} else {
+		memcpy(data + 12, packet, 52);
+	}
+	nwsign(server->sign_last, data, server->sign_last);
+	memcpy(sign_buff, server->sign_last, 8);
+}
+
+int sign_verify_reply(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, const void *sign_buff) {
+	unsigned char data[64];
+	unsigned char hash[16];
+
+	memcpy(data, server->sign_root, 8);
+	*(__u32*)(data + 8) = totalsize;
+	if (size < 52) {
+		memcpy(data + 12, packet, size);
+		memset(data + 12 + size, 0, 52 - size);
+	} else {
+		memcpy(data + 12, packet, 52);
+	}
+	nwsign(server->sign_last, data, hash);
+	return memcmp(sign_buff, hash, 8);
+}
+
+#endif	/* CONFIG_NCPFS_PACKET_SIGNING */
+
diff --git a/kernel/fs/ncpfs/ncpsign_kernel.h b/kernel/fs/ncpfs/ncpsign_kernel.h
new file mode 100644
index 000000000..d9a1438bb
--- /dev/null
+++ b/kernel/fs/ncpfs/ncpsign_kernel.h
@@ -0,0 +1,26 @@
+/*
+ *  ncpsign_kernel.h
+ *
+ *  Arne de Bruijn (arne@knoware.nl), 1997
+ *
+ */
+ 
+#ifndef _NCPSIGN_KERNEL_H
+#define _NCPSIGN_KERNEL_H
+
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
+void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff);
+int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff);
+#endif
+
+static inline size_t sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff) {
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
+	if (server->sign_active) {
+		__sign_packet(server, data, size, totalsize, sign_buff);
+		return 8;
+	}
+#endif
+	return 0;
+}
+
+#endif
diff --git a/kernel/fs/ncpfs/sock.c b/kernel/fs/ncpfs/sock.c
new file mode 100644
index 000000000..471bc3d11
--- /dev/null
+++ b/kernel/fs/ncpfs/sock.c
@@ -0,0 +1,881 @@
+/*
+ *  linux/fs/ncpfs/sock.c
+ *
+ *  Copyright (C) 1992, 1993  Rick Sladkey
+ *
+ *  Modified 1995, 1996 by Volker Lendecke to be usable for ncp
+ *  Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <asm/uaccess.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <net/scm.h>
+#include <net/sock.h>
+#include <linux/ipx.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+
+#include "ncp_fs.h"
+
+#include "ncpsign_kernel.h"
+
+static int _recv(struct socket *sock, void *buf, int size, unsigned flags)
+{
+	struct msghdr msg = {NULL, };
+	struct kvec iov = {buf, size};
+	return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
+}
+
+static inline int do_send(struct socket *sock, struct kvec *vec, int count,
+			  int len, unsigned flags)
+{
+	struct msghdr msg = { .msg_flags = flags };
+	return kernel_sendmsg(sock, &msg, vec, count, len);
+}
+
+static int _send(struct socket *sock, const void *buff, int len)
+{
+	struct kvec vec;
+	vec.iov_base = (void *) buff;
+	vec.iov_len = len;
+	return do_send(sock, &vec, 1, len, 0);
+}
+
+struct ncp_request_reply {
+	struct list_head req;
+	wait_queue_head_t wq;
+	atomic_t refs;
+	unsigned char* reply_buf;
+	size_t datalen;
+	int result;
+	enum { RQ_DONE, RQ_INPROGRESS, RQ_QUEUED, RQ_IDLE, RQ_ABANDONED } status;
+	struct kvec* tx_ciov;
+	size_t tx_totallen;
+	size_t tx_iovlen;
+	struct kvec tx_iov[3];
+	u_int16_t tx_type;
+	u_int32_t sign[6];
+};
+
+static inline struct ncp_request_reply* ncp_alloc_req(void)
+{
+	struct ncp_request_reply *req;
+
+	req = kmalloc(sizeof(struct ncp_request_reply), GFP_KERNEL);
+	if (!req)
+		return NULL;
+
+	init_waitqueue_head(&req->wq);
+	atomic_set(&req->refs, (1));
+	req->status = RQ_IDLE;
+
+	return req;
+}
+
+static void ncp_req_get(struct ncp_request_reply *req)
+{
+	atomic_inc(&req->refs);
+}
+
+static void ncp_req_put(struct ncp_request_reply *req)
+{
+	if (atomic_dec_and_test(&req->refs))
+		kfree(req);
+}
+
+void ncp_tcp_data_ready(struct sock *sk)
+{
+	struct ncp_server *server = sk->sk_user_data;
+
+	server->data_ready(sk);
+	schedule_work(&server->rcv.tq);
+}
+
+void ncp_tcp_error_report(struct sock *sk)
+{
+	struct ncp_server *server = sk->sk_user_data;
+	
+	server->error_report(sk);
+	schedule_work(&server->rcv.tq);
+}
+
+void ncp_tcp_write_space(struct sock *sk)
+{
+	struct ncp_server *server = sk->sk_user_data;
+	
+	/* We do not need any locking: we first set tx.creq, and then we do sendmsg,
+	   not vice versa... */
+	server->write_space(sk);
+	if (server->tx.creq)
+		schedule_work(&server->tx.tq);
+}
+
+void ncpdgram_timeout_call(unsigned long v)
+{
+	struct ncp_server *server = (void*)v;
+	
+	schedule_work(&server->timeout_tq);
+}
+
+static inline void ncp_finish_request(struct ncp_server *server, struct ncp_request_reply *req, int result)
+{
+	req->result = result;
+	if (req->status != RQ_ABANDONED)
+		memcpy(req->reply_buf, server->rxbuf, req->datalen);
+	req->status = RQ_DONE;
+	wake_up_all(&req->wq);
+	ncp_req_put(req);
+}
+
+static void __abort_ncp_connection(struct ncp_server *server)
+{
+	struct ncp_request_reply *req;
+
+	ncp_invalidate_conn(server);
+	del_timer(&server->timeout_tm);
+	while (!list_empty(&server->tx.requests)) {
+		req = list_entry(server->tx.requests.next, struct ncp_request_reply, req);
+		
+		list_del_init(&req->req);
+		ncp_finish_request(server, req, -EIO);
+	}
+	req = server->rcv.creq;
+	if (req) {
+		server->rcv.creq = NULL;
+		ncp_finish_request(server, req, -EIO);
+		server->rcv.ptr = NULL;
+		server->rcv.state = 0;
+	}
+	req = server->tx.creq;
+	if (req) {
+		server->tx.creq = NULL;
+		ncp_finish_request(server, req, -EIO);
+	}
+}
+
+static inline int get_conn_number(struct ncp_reply_header *rp)
+{
+	return rp->conn_low | (rp->conn_high << 8);
+}
+
+static inline void __ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err)
+{
+	/* If req is done, we got signal, but we also received answer... */
+	switch (req->status) {
+		case RQ_IDLE:
+		case RQ_DONE:
+			break;
+		case RQ_QUEUED:
+			list_del_init(&req->req);
+			ncp_finish_request(server, req, err);
+			break;
+		case RQ_INPROGRESS:
+			req->status = RQ_ABANDONED;
+			break;
+		case RQ_ABANDONED:
+			break;
+	}
+}
+
+static inline void ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err)
+{
+	mutex_lock(&server->rcv.creq_mutex);
+	__ncp_abort_request(server, req, err);
+	mutex_unlock(&server->rcv.creq_mutex);
+}
+
+static inline void __ncptcp_abort(struct ncp_server *server)
+{
+	__abort_ncp_connection(server);
+}
+
+static int ncpdgram_send(struct socket *sock, struct ncp_request_reply *req)
+{
+	struct kvec vec[3];
+	/* sock_sendmsg updates iov pointers for us :-( */
+	memcpy(vec, req->tx_ciov, req->tx_iovlen * sizeof(vec[0]));
+	return do_send(sock, vec, req->tx_iovlen,
+		       req->tx_totallen, MSG_DONTWAIT);
+}
+
+static void __ncptcp_try_send(struct ncp_server *server)
+{
+	struct ncp_request_reply *rq;
+	struct kvec *iov;
+	struct kvec iovc[3];
+	int result;
+
+	rq = server->tx.creq;
+	if (!rq)
+		return;
+
+	/* sock_sendmsg updates iov pointers for us :-( */
+	memcpy(iovc, rq->tx_ciov, rq->tx_iovlen * sizeof(iov[0]));
+	result = do_send(server->ncp_sock, iovc, rq->tx_iovlen,
+			 rq->tx_totallen, MSG_NOSIGNAL | MSG_DONTWAIT);
+
+	if (result == -EAGAIN)
+		return;
+
+	if (result < 0) {
+		pr_err("tcp: Send failed: %d\n", result);
+		__ncp_abort_request(server, rq, result);
+		return;
+	}
+	if (result >= rq->tx_totallen) {
+		server->rcv.creq = rq;
+		server->tx.creq = NULL;
+		return;
+	}
+	rq->tx_totallen -= result;
+	iov = rq->tx_ciov;
+	while (iov->iov_len <= result) {
+		result -= iov->iov_len;
+		iov++;
+		rq->tx_iovlen--;
+	}
+	iov->iov_base += result;
+	iov->iov_len -= result;
+	rq->tx_ciov = iov;
+}
+
+static inline void ncp_init_header(struct ncp_server *server, struct ncp_request_reply *req, struct ncp_request_header *h)
+{
+	req->status = RQ_INPROGRESS;
+	h->conn_low = server->connection;
+	h->conn_high = server->connection >> 8;
+	h->sequence = ++server->sequence;
+}
+	
+static void ncpdgram_start_request(struct ncp_server *server, struct ncp_request_reply *req)
+{
+	size_t signlen;
+	struct ncp_request_header* h;
+	
+	req->tx_ciov = req->tx_iov + 1;
+
+	h = req->tx_iov[1].iov_base;
+	ncp_init_header(server, req, h);
+	signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, 
+			req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1,
+			cpu_to_le32(req->tx_totallen), req->sign);
+	if (signlen) {
+		req->tx_ciov[1].iov_base = req->sign;
+		req->tx_ciov[1].iov_len = signlen;
+		req->tx_iovlen += 1;
+		req->tx_totallen += signlen;
+	}
+	server->rcv.creq = req;
+	server->timeout_last = server->m.time_out;
+	server->timeout_retries = server->m.retry_count;
+	ncpdgram_send(server->ncp_sock, req);
+	mod_timer(&server->timeout_tm, jiffies + server->m.time_out);
+}
+
+#define NCP_TCP_XMIT_MAGIC	(0x446D6454)
+#define NCP_TCP_XMIT_VERSION	(1)
+#define NCP_TCP_RCVD_MAGIC	(0x744E6350)
+
+static void ncptcp_start_request(struct ncp_server *server, struct ncp_request_reply *req)
+{
+	size_t signlen;
+	struct ncp_request_header* h;
+
+	req->tx_ciov = req->tx_iov;
+	h = req->tx_iov[1].iov_base;
+	ncp_init_header(server, req, h);
+	signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1,
+			req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1,
+			cpu_to_be32(req->tx_totallen + 24), req->sign + 4) + 16;
+
+	req->sign[0] = htonl(NCP_TCP_XMIT_MAGIC);
+	req->sign[1] = htonl(req->tx_totallen + signlen);
+	req->sign[2] = htonl(NCP_TCP_XMIT_VERSION);
+	req->sign[3] = htonl(req->datalen + 8);
+	req->tx_iov[0].iov_base = req->sign;
+	req->tx_iov[0].iov_len = signlen;
+	req->tx_iovlen += 1;
+	req->tx_totallen += signlen;
+
+	server->tx.creq = req;
+	__ncptcp_try_send(server);
+}
+
+static inline void __ncp_start_request(struct ncp_server *server, struct ncp_request_reply *req)
+{
+	/* we copy the data so that we do not depend on the caller
+	   staying alive */
+	memcpy(server->txbuf, req->tx_iov[1].iov_base, req->tx_iov[1].iov_len);
+	req->tx_iov[1].iov_base = server->txbuf;
+
+	if (server->ncp_sock->type == SOCK_STREAM)
+		ncptcp_start_request(server, req);
+	else
+		ncpdgram_start_request(server, req);
+}
+
+static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *req)
+{
+	mutex_lock(&server->rcv.creq_mutex);
+	if (!ncp_conn_valid(server)) {
+		mutex_unlock(&server->rcv.creq_mutex);
+		pr_err("tcp: Server died\n");
+		return -EIO;
+	}
+	ncp_req_get(req);
+	if (server->tx.creq || server->rcv.creq) {
+		req->status = RQ_QUEUED;
+		list_add_tail(&req->req, &server->tx.requests);
+		mutex_unlock(&server->rcv.creq_mutex);
+		return 0;
+	}
+	__ncp_start_request(server, req);
+	mutex_unlock(&server->rcv.creq_mutex);
+	return 0;
+}
+
+static void __ncp_next_request(struct ncp_server *server)
+{
+	struct ncp_request_reply *req;
+
+	server->rcv.creq = NULL;
+	if (list_empty(&server->tx.requests)) {
+		return;
+	}
+	req = list_entry(server->tx.requests.next, struct ncp_request_reply, req);
+	list_del_init(&req->req);
+	__ncp_start_request(server, req);
+}
+
+static void info_server(struct ncp_server *server, unsigned int id, const void * data, size_t len)
+{
+	if (server->info_sock) {
+		struct kvec iov[2];
+		__be32 hdr[2];
+	
+		hdr[0] = cpu_to_be32(len + 8);
+		hdr[1] = cpu_to_be32(id);
+	
+		iov[0].iov_base = hdr;
+		iov[0].iov_len = 8;
+		iov[1].iov_base = (void *) data;
+		iov[1].iov_len = len;
+
+		do_send(server->info_sock, iov, 2, len + 8, MSG_NOSIGNAL);
+	}
+}
+
+void ncpdgram_rcv_proc(struct work_struct *work)
+{
+	struct ncp_server *server =
+		container_of(work, struct ncp_server, rcv.tq);
+	struct socket* sock;
+	
+	sock = server->ncp_sock;
+	
+	while (1) {
+		struct ncp_reply_header reply;
+		int result;
+
+		result = _recv(sock, &reply, sizeof(reply), MSG_PEEK | MSG_DONTWAIT);
+		if (result < 0) {
+			break;
+		}
+		if (result >= sizeof(reply)) {
+			struct ncp_request_reply *req;
+	
+			if (reply.type == NCP_WATCHDOG) {
+				unsigned char buf[10];
+
+				if (server->connection != get_conn_number(&reply)) {
+					goto drop;
+				}
+				result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT);
+				if (result < 0) {
+					ncp_dbg(1, "recv failed with %d\n", result);
+					continue;
+				}
+				if (result < 10) {
+					ncp_dbg(1, "too short (%u) watchdog packet\n", result);
+					continue;
+				}
+				if (buf[9] != '?') {
+					ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]);
+					continue;
+				}
+				buf[9] = 'Y';
+				_send(sock, buf, sizeof(buf));
+				continue;
+			}
+			if (reply.type != NCP_POSITIVE_ACK && reply.type != NCP_REPLY) {
+				result = _recv(sock, server->unexpected_packet.data, sizeof(server->unexpected_packet.data), MSG_DONTWAIT);
+				if (result < 0) {
+					continue;
+				}
+				info_server(server, 0, server->unexpected_packet.data, result);
+				continue;
+			}
+			mutex_lock(&server->rcv.creq_mutex);
+			req = server->rcv.creq;
+			if (req && (req->tx_type == NCP_ALLOC_SLOT_REQUEST || (server->sequence == reply.sequence && 
+					server->connection == get_conn_number(&reply)))) {
+				if (reply.type == NCP_POSITIVE_ACK) {
+					server->timeout_retries = server->m.retry_count;
+					server->timeout_last = NCP_MAX_RPC_TIMEOUT;
+					mod_timer(&server->timeout_tm, jiffies + NCP_MAX_RPC_TIMEOUT);
+				} else if (reply.type == NCP_REPLY) {
+					result = _recv(sock, server->rxbuf, req->datalen, MSG_DONTWAIT);
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
+					if (result >= 0 && server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) {
+						if (result < 8 + 8) {
+							result = -EIO;
+						} else {
+							unsigned int hdrl;
+							
+							result -= 8;
+							hdrl = sock->sk->sk_family == AF_INET ? 8 : 6;
+							if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) {
+								pr_info("Signature violation\n");
+								result = -EIO;
+							}
+						}
+					}
+#endif
+					del_timer(&server->timeout_tm);
+				     	server->rcv.creq = NULL;
+					ncp_finish_request(server, req, result);
+					__ncp_next_request(server);
+					mutex_unlock(&server->rcv.creq_mutex);
+					continue;
+				}
+			}
+			mutex_unlock(&server->rcv.creq_mutex);
+		}
+drop:;		
+		_recv(sock, &reply, sizeof(reply), MSG_DONTWAIT);
+	}
+}
+
+static void __ncpdgram_timeout_proc(struct ncp_server *server)
+{
+	/* If timer is pending, we are processing another request... */
+	if (!timer_pending(&server->timeout_tm)) {
+		struct ncp_request_reply* req;
+		
+		req = server->rcv.creq;
+		if (req) {
+			int timeout;
+			
+			if (server->m.flags & NCP_MOUNT_SOFT) {
+				if (server->timeout_retries-- == 0) {
+					__ncp_abort_request(server, req, -ETIMEDOUT);
+					return;
+				}
+			}
+			/* Ignore errors */
+			ncpdgram_send(server->ncp_sock, req);
+			timeout = server->timeout_last << 1;
+			if (timeout > NCP_MAX_RPC_TIMEOUT) {
+				timeout = NCP_MAX_RPC_TIMEOUT;
+			}
+			server->timeout_last = timeout;
+			mod_timer(&server->timeout_tm, jiffies + timeout);
+		}
+	}
+}
+
+void ncpdgram_timeout_proc(struct work_struct *work)
+{
+	struct ncp_server *server =
+		container_of(work, struct ncp_server, timeout_tq);
+	mutex_lock(&server->rcv.creq_mutex);
+	__ncpdgram_timeout_proc(server);
+	mutex_unlock(&server->rcv.creq_mutex);
+}
+
+static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len)
+{
+	int result;
+	
+	if (buffer) {
+		result = _recv(server->ncp_sock, buffer, len, MSG_DONTWAIT);
+	} else {
+		static unsigned char dummy[1024];
+			
+		if (len > sizeof(dummy)) {
+			len = sizeof(dummy);
+		}
+		result = _recv(server->ncp_sock, dummy, len, MSG_DONTWAIT);
+	}
+	if (result < 0) {
+		return result;
+	}
+	if (result > len) {
+		pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len);
+		return -EIO;			
+	}
+	return result;
+}	
+
+static int __ncptcp_rcv_proc(struct ncp_server *server)
+{
+	/* We have to check the result, so store the complete header */
+	while (1) {
+		int result;
+		struct ncp_request_reply *req;
+		int datalen;
+		int type;
+
+		while (server->rcv.len) {
+			result = do_tcp_rcv(server, server->rcv.ptr, server->rcv.len);
+			if (result == -EAGAIN) {
+				return 0;
+			}
+			if (result <= 0) {
+				req = server->rcv.creq;
+				if (req) {
+					__ncp_abort_request(server, req, -EIO);
+				} else {
+					__ncptcp_abort(server);
+				}
+				if (result < 0) {
+					pr_err("tcp: error in recvmsg: %d\n", result);
+				} else {
+					ncp_dbg(1, "tcp: EOF\n");
+				}
+				return -EIO;
+			}
+			if (server->rcv.ptr) {
+				server->rcv.ptr += result;
+			}
+			server->rcv.len -= result;
+		}
+		switch (server->rcv.state) {
+			case 0:
+				if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) {
+					pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
+					__ncptcp_abort(server);
+					return -EIO;
+				}
+				datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF;
+				if (datalen < 10) {
+					pr_err("tcp: Unexpected reply len %d\n", datalen);
+					__ncptcp_abort(server);
+					return -EIO;
+				}
+#ifdef CONFIG_NCPFS_PACKET_SIGNING				
+				if (server->sign_active) {
+					if (datalen < 18) {
+						pr_err("tcp: Unexpected reply len %d\n", datalen);
+						__ncptcp_abort(server);
+						return -EIO;
+					}
+					server->rcv.buf.len = datalen - 8;
+					server->rcv.ptr = (unsigned char*)&server->rcv.buf.p1;
+					server->rcv.len = 8;
+					server->rcv.state = 4;
+					break;
+				}
+#endif				
+				type = ntohs(server->rcv.buf.type);
+#ifdef CONFIG_NCPFS_PACKET_SIGNING				
+cont:;				
+#endif
+				if (type != NCP_REPLY) {
+					if (datalen - 8 <= sizeof(server->unexpected_packet.data)) {
+						*(__u16*)(server->unexpected_packet.data) = htons(type);
+						server->unexpected_packet.len = datalen - 8;
+
+						server->rcv.state = 5;
+						server->rcv.ptr = server->unexpected_packet.data + 2;
+						server->rcv.len = datalen - 10;
+						break;
+					}					
+					ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type);
+skipdata2:;
+					server->rcv.state = 2;
+skipdata:;
+					server->rcv.ptr = NULL;
+					server->rcv.len = datalen - 10;
+					break;
+				}
+				req = server->rcv.creq;
+				if (!req) {
+					ncp_dbg(1, "Reply without appropriate request\n");
+					goto skipdata2;
+				}
+				if (datalen > req->datalen + 8) {
+					pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
+					server->rcv.state = 3;
+					goto skipdata;
+				}
+				req->datalen = datalen - 8;
+				((struct ncp_reply_header*)server->rxbuf)->type = NCP_REPLY;
+				server->rcv.ptr = server->rxbuf + 2;
+				server->rcv.len = datalen - 10;
+				server->rcv.state = 1;
+				break;
+#ifdef CONFIG_NCPFS_PACKET_SIGNING				
+			case 4:
+				datalen = server->rcv.buf.len;
+				type = ntohs(server->rcv.buf.type2);
+				goto cont;
+#endif
+			case 1:
+				req = server->rcv.creq;
+				if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) {
+					if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) {
+						pr_err("tcp: Bad sequence number\n");
+						__ncp_abort_request(server, req, -EIO);
+						return -EIO;
+					}
+					if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) {
+						pr_err("tcp: Connection number mismatch\n");
+						__ncp_abort_request(server, req, -EIO);
+						return -EIO;
+					}
+				}
+#ifdef CONFIG_NCPFS_PACKET_SIGNING				
+				if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) {
+					if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) {
+						pr_err("tcp: Signature violation\n");
+						__ncp_abort_request(server, req, -EIO);
+						return -EIO;
+					}
+				}
+#endif				
+				ncp_finish_request(server, req, req->datalen);
+			nextreq:;
+				__ncp_next_request(server);
+			case 2:
+			next:;
+				server->rcv.ptr = (unsigned char*)&server->rcv.buf;
+				server->rcv.len = 10;
+				server->rcv.state = 0;
+				break;
+			case 3:
+				ncp_finish_request(server, server->rcv.creq, -EIO);
+				goto nextreq;
+			case 5:
+				info_server(server, 0, server->unexpected_packet.data, server->unexpected_packet.len);
+				goto next;
+		}
+	}
+}
+
+void ncp_tcp_rcv_proc(struct work_struct *work)
+{
+	struct ncp_server *server =
+		container_of(work, struct ncp_server, rcv.tq);
+
+	mutex_lock(&server->rcv.creq_mutex);
+	__ncptcp_rcv_proc(server);
+	mutex_unlock(&server->rcv.creq_mutex);
+}
+
+void ncp_tcp_tx_proc(struct work_struct *work)
+{
+	struct ncp_server *server =
+		container_of(work, struct ncp_server, tx.tq);
+	
+	mutex_lock(&server->rcv.creq_mutex);
+	__ncptcp_try_send(server);
+	mutex_unlock(&server->rcv.creq_mutex);
+}
+
+static int do_ncp_rpc_call(struct ncp_server *server, int size,
+		unsigned char* reply_buf, int max_reply_size)
+{
+	int result;
+	struct ncp_request_reply *req;
+
+	req = ncp_alloc_req();
+	if (!req)
+		return -ENOMEM;
+
+	req->reply_buf = reply_buf;
+	req->datalen = max_reply_size;
+	req->tx_iov[1].iov_base = server->packet;
+	req->tx_iov[1].iov_len = size;
+	req->tx_iovlen = 1;
+	req->tx_totallen = size;
+	req->tx_type = *(u_int16_t*)server->packet;
+
+	result = ncp_add_request(server, req);
+	if (result < 0)
+		goto out;
+
+	if (wait_event_interruptible(req->wq, req->status == RQ_DONE)) {
+		ncp_abort_request(server, req, -EINTR);
+		result = -EINTR;
+		goto out;
+	}
+
+	result = req->result;
+
+out:
+	ncp_req_put(req);
+
+	return result;
+}
+
+/*
+ * We need the server to be locked here, so check!
+ */
+
+static int ncp_do_request(struct ncp_server *server, int size,
+		void* reply, int max_reply_size)
+{
+	int result;
+
+	if (server->lock == 0) {
+		pr_err("Server not locked!\n");
+		return -EIO;
+	}
+	if (!ncp_conn_valid(server)) {
+		return -EIO;
+	}
+	{
+		sigset_t old_set;
+		unsigned long mask, flags;
+
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		old_set = current->blocked;
+		if (current->flags & PF_EXITING)
+			mask = 0;
+		else
+			mask = sigmask(SIGKILL);
+		if (server->m.flags & NCP_MOUNT_INTR) {
+			/* FIXME: This doesn't seem right at all.  So, like,
+			   we can't handle SIGINT and get whatever to stop?
+			   What if we've blocked it ourselves?  What about
+			   alarms?  Why, in fact, are we mucking with the
+			   sigmask at all? -- r~ */
+			if (current->sighand->action[SIGINT - 1].sa.sa_handler == SIG_DFL)
+				mask |= sigmask(SIGINT);
+			if (current->sighand->action[SIGQUIT - 1].sa.sa_handler == SIG_DFL)
+				mask |= sigmask(SIGQUIT);
+		}
+		siginitsetinv(&current->blocked, mask);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+		
+		result = do_ncp_rpc_call(server, size, reply, max_reply_size);
+
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		current->blocked = old_set;
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	}
+
+	ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result);
+
+	return result;
+}
+
+/* ncp_do_request assures that at least a complete reply header is
+ * received. It assumes that server->current_size contains the ncp
+ * request size
+ */
+int ncp_request2(struct ncp_server *server, int function, 
+		void* rpl, int size)
+{
+	struct ncp_request_header *h;
+	struct ncp_reply_header* reply = rpl;
+	int result;
+
+	h = (struct ncp_request_header *) (server->packet);
+	if (server->has_subfunction != 0) {
+		*(__u16 *) & (h->data[0]) = htons(server->current_size - sizeof(*h) - 2);
+	}
+	h->type = NCP_REQUEST;
+	/*
+	 * The server shouldn't know or care what task is making a
+	 * request, so we always use the same task number.
+	 */
+	h->task = 2; /* (current->pid) & 0xff; */
+	h->function = function;
+
+	result = ncp_do_request(server, server->current_size, reply, size);
+	if (result < 0) {
+		ncp_dbg(1, "ncp_request_error: %d\n", result);
+		goto out;
+	}
+	server->completion = reply->completion_code;
+	server->conn_status = reply->connection_state;
+	server->reply_size = result;
+	server->ncp_reply_size = result - sizeof(struct ncp_reply_header);
+
+	result = reply->completion_code;
+
+	if (result != 0)
+		ncp_vdbg("completion code=%x\n", result);
+out:
+	return result;
+}
+
+int ncp_connect(struct ncp_server *server)
+{
+	struct ncp_request_header *h;
+	int result;
+
+	server->connection = 0xFFFF;
+	server->sequence = 255;
+
+	h = (struct ncp_request_header *) (server->packet);
+	h->type = NCP_ALLOC_SLOT_REQUEST;
+	h->task		= 2; /* see above */
+	h->function	= 0;
+
+	result = ncp_do_request(server, sizeof(*h), server->packet, server->packet_size);
+	if (result < 0)
+		goto out;
+	server->connection = h->conn_low + (h->conn_high * 256);
+	result = 0;
+out:
+	return result;
+}
+
+int ncp_disconnect(struct ncp_server *server)
+{
+	struct ncp_request_header *h;
+
+	h = (struct ncp_request_header *) (server->packet);
+	h->type = NCP_DEALLOC_SLOT_REQUEST;
+	h->task		= 2; /* see above */
+	h->function	= 0;
+
+	return ncp_do_request(server, sizeof(*h), server->packet, server->packet_size);
+}
+
+void ncp_lock_server(struct ncp_server *server)
+{
+	mutex_lock(&server->mutex);
+	if (server->lock)
+		pr_warn("%s: was locked!\n", __func__);
+	server->lock = 1;
+}
+
+void ncp_unlock_server(struct ncp_server *server)
+{
+	if (!server->lock) {
+		pr_warn("%s: was not locked!\n", __func__);
+		return;
+	}
+	server->lock = 0;
+	mutex_unlock(&server->mutex);
+}
diff --git a/kernel/fs/ncpfs/symlink.c b/kernel/fs/ncpfs/symlink.c
new file mode 100644
index 000000000..421b6f91e
--- /dev/null
+++ b/kernel/fs/ncpfs/symlink.c
@@ -0,0 +1,181 @@
+/*
+ *  linux/fs/ncpfs/symlink.c
+ *
+ *  Code for allowing symbolic links on NCPFS (i.e. NetWare)
+ *  Symbolic links are not supported on native NetWare, so we use an
+ *  infrequently-used flag (Sh) and store a two-word magic header in
+ *  the file to make sure we don't accidentally use a non-link file
+ *  as a link.
+ *
+ *  When using the NFS namespace, we set the mode to indicate a symlink and
+ *  don't bother with the magic numbers.
+ *
+ *  from linux/fs/ext2/symlink.c
+ *
+ *  Copyright (C) 1998-99, Frank A. Vorstenbosch
+ *
+ *  ncpfs symlink handling code
+ *  NLS support (c) 1999 Petr Vandrovec
+ *  Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info
+ *
+ */
+
+
+#include <asm/uaccess.h>
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include "ncp_fs.h"
+
+/* these magic numbers must appear in the symlink file -- this makes it a bit
+   more resilient against the magic attributes being set on random files. */
+
+#define NCP_SYMLINK_MAGIC0	cpu_to_le32(0x6c6d7973)     /* "symlnk->" */
+#define NCP_SYMLINK_MAGIC1	cpu_to_le32(0x3e2d6b6e)
+
+/* ----- read a symbolic link ------------------------------------------ */
+
+static int ncp_symlink_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int error, length, len;
+	char *link, *rawlink;
+	char *buf = kmap(page);
+
+	error = -ENOMEM;
+	rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL);
+	if (!rawlink)
+		goto fail;
+
+	if (ncp_make_open(inode,O_RDONLY))
+		goto failEIO;
+
+	error=ncp_read_kernel(NCP_SERVER(inode),NCP_FINFO(inode)->file_handle,
+                         0,NCP_MAX_SYMLINK_SIZE,rawlink,&length);
+
+	ncp_inode_close(inode);
+	/* Close file handle if no other users... */
+	ncp_make_closed(inode);
+	if (error)
+		goto failEIO;
+
+	if (NCP_FINFO(inode)->flags & NCPI_KLUDGE_SYMLINK) {
+		if (length<NCP_MIN_SYMLINK_SIZE || 
+		    ((__le32 *)rawlink)[0]!=NCP_SYMLINK_MAGIC0 ||
+		    ((__le32 *)rawlink)[1]!=NCP_SYMLINK_MAGIC1)
+		    	goto failEIO;
+		link = rawlink + 8;
+		length -= 8;
+	} else {
+		link = rawlink;
+	}
+
+	len = NCP_MAX_SYMLINK_SIZE;
+	error = ncp_vol2io(NCP_SERVER(inode), buf, &len, link, length, 0);
+	kfree(rawlink);
+	if (error)
+		goto fail;
+	SetPageUptodate(page);
+	kunmap(page);
+	unlock_page(page);
+	return 0;
+
+failEIO:
+	error = -EIO;
+	kfree(rawlink);
+fail:
+	SetPageError(page);
+	kunmap(page);
+	unlock_page(page);
+	return error;
+}
+
+/*
+ * symlinks can't do much...
+ */
+const struct address_space_operations ncp_symlink_aops = {
+	.readpage	= ncp_symlink_readpage,
+};
+	
+/* ----- create a new symbolic link -------------------------------------- */
+ 
+int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
+	struct inode *inode;
+	char *rawlink;
+	int length, err, i, outlen;
+	int kludge;
+	umode_t mode;
+	__le32 attr;
+	unsigned int hdr;
+
+	ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname);
+
+	if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber))
+		kludge = 0;
+	else
+#ifdef CONFIG_NCPFS_EXTRAS
+	if (NCP_SERVER(dir)->m.flags & NCP_MOUNT_SYMLINKS)
+		kludge = 1;
+	else
+#endif
+	/* EPERM is returned by VFS if symlink procedure does not exist */
+		return -EPERM;
+  
+	rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL);
+	if (!rawlink)
+		return -ENOMEM;
+
+	if (kludge) {
+		mode = 0;
+		attr = aSHARED | aHIDDEN;
+		((__le32 *)rawlink)[0]=NCP_SYMLINK_MAGIC0;
+		((__le32 *)rawlink)[1]=NCP_SYMLINK_MAGIC1;
+		hdr = 8;
+	} else {
+		mode = S_IFLNK | S_IRWXUGO;
+		attr = 0;
+		hdr = 0;
+	}			
+
+	length = strlen(symname);
+	/* map to/from server charset, do not touch upper/lower case as
+	   symlink can point out of ncp filesystem */
+	outlen = NCP_MAX_SYMLINK_SIZE - hdr;
+	err = ncp_io2vol(NCP_SERVER(dir), rawlink + hdr, &outlen, symname, length, 0);
+	if (err)
+		goto failfree;
+
+	outlen += hdr;
+
+	err = -EIO;
+	if (ncp_create_new(dir,dentry,mode,0,attr)) {
+		goto failfree;
+	}
+
+	inode=d_inode(dentry);
+
+	if (ncp_make_open(inode, O_WRONLY))
+		goto failfree;
+
+	if (ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, 
+			     0, outlen, rawlink, &i) || i!=outlen) {
+		goto fail;
+	}
+
+	ncp_inode_close(inode);
+	ncp_make_closed(inode);
+	kfree(rawlink);
+	return 0;
+fail:;
+	ncp_inode_close(inode);
+	ncp_make_closed(inode);
+failfree:;
+	kfree(rawlink);
+	return err;
+}
+
+/* ----- EOF ----- */
diff --git a/kernel/fs/nfs/Kconfig b/kernel/fs/nfs/Kconfig
new file mode 100644
index 000000000..f31fd0dd9
--- /dev/null
+++ b/kernel/fs/nfs/Kconfig
@@ -0,0 +1,202 @@
+config NFS_FS
+	tristate "NFS client support"
+	depends on INET && FILE_LOCKING && MULTIUSER
+	select LOCKD
+	select SUNRPC
+	select NFS_ACL_SUPPORT if NFS_V3_ACL
+	help
+	  Choose Y here if you want to access files residing on other
+	  computers using Sun's Network File System protocol.  To compile
+	  this file system support as a module, choose M here: the module
+	  will be called nfs.
+
+	  To mount file systems exported by NFS servers, you also need to
+	  install the user space mount.nfs command which can be found in
+	  the Linux nfs-utils package, available from http://linux-nfs.org/.
+	  Information about using the mount command is available in the
+	  mount(8) man page.  More detail about the Linux NFS client
+	  implementation is available via the nfs(5) man page.
+
+	  Below you can choose which versions of the NFS protocol are
+	  available in the kernel to mount NFS servers.  Support for NFS
+	  version 2 (RFC 1094) is always available when NFS_FS is selected.
+
+	  To configure a system which mounts its root file system via NFS
+	  at boot time, say Y here, select "Kernel level IP
+	  autoconfiguration" in the NETWORK menu, and select "Root file
+	  system on NFS" below.  You cannot compile this file system as a
+	  module in this case.
+
+	  If unsure, say N.
+
+config NFS_V2
+	tristate "NFS client support for NFS version 2"
+	depends on NFS_FS
+	default y
+	help
+	  This option enables support for version 2 of the NFS protocol
+	  (RFC 1094) in the kernel's NFS client.
+
+	  If unsure, say Y.
+
+config NFS_V3
+	tristate "NFS client support for NFS version 3"
+	depends on NFS_FS
+	default y
+	help
+	  This option enables support for version 3 of the NFS protocol
+	  (RFC 1813) in the kernel's NFS client.
+
+	  If unsure, say Y.
+
+config NFS_V3_ACL
+	bool "NFS client support for the NFSv3 ACL protocol extension"
+	depends on NFS_V3
+	help
+	  Some NFS servers support an auxiliary NFSv3 ACL protocol that
+	  Sun added to Solaris but never became an official part of the
+	  NFS version 3 protocol.  This protocol extension allows
+	  applications on NFS clients to manipulate POSIX Access Control
+	  Lists on files residing on NFS servers.  NFS servers enforce
+	  ACLs on local files whether this protocol is available or not.
+
+	  Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+	  protocol extension and you want your NFS client to allow
+	  applications to access and modify ACLs on files on the server.
+
+	  Most NFS servers don't support the Solaris NFSv3 ACL protocol
+	  extension.  You can choose N here or specify the "noacl" mount
+	  option to prevent your NFS client from trying to use the NFSv3
+	  ACL protocol.
+
+	  If unsure, say N.
+
+config NFS_V4
+	tristate "NFS client support for NFS version 4"
+	depends on NFS_FS
+	select SUNRPC_GSS
+	select KEYS
+	help
+	  This option enables support for version 4 of the NFS protocol
+	  (RFC 3530) in the kernel's NFS client.
+
+	  To mount NFS servers using NFSv4, you also need to install user
+	  space programs which can be found in the Linux nfs-utils package,
+	  available from http://linux-nfs.org/.
+
+	  If unsure, say Y.
+
+config NFS_SWAP
+	bool "Provide swap over NFS support"
+	default n
+	depends on NFS_FS
+	select SUNRPC_SWAP
+	help
+	  This option enables swapon to work on files located on NFS mounts.
+
+config NFS_V4_1
+	bool "NFS client support for NFSv4.1"
+	depends on NFS_V4
+	select SUNRPC_BACKCHANNEL
+	help
+	  This option enables support for minor version 1 of the NFSv4 protocol
+	  (RFC 5661) in the kernel's NFS client.
+
+	  If unsure, say N.
+
+config NFS_V4_2
+	bool "NFS client support for NFSv4.2"
+	depends on NFS_V4_1
+	help
+	  This option enables support for minor version 2 of the NFSv4 protocol
+	  in the kernel's NFS client.
+
+	  If unsure, say N.
+
+config PNFS_FILE_LAYOUT
+	tristate
+	depends on NFS_V4_1
+	default NFS_V4
+
+config PNFS_BLOCK
+	tristate
+	depends on NFS_V4_1 && BLK_DEV_DM
+	default NFS_V4
+
+config PNFS_OBJLAYOUT
+	tristate
+	depends on NFS_V4_1 && SCSI_OSD_ULD
+	default NFS_V4
+
+config PNFS_FLEXFILE_LAYOUT
+	tristate
+	depends on NFS_V4_1 && NFS_V3
+	default m
+
+config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
+	string "NFSv4.1 Implementation ID Domain"
+	depends on NFS_V4_1
+	default "kernel.org"
+	help
+	  This option defines the domain portion of the implementation ID that
+	  may be sent in the NFS exchange_id operation.  The value must be in
+	  the format of a DNS domain name and should be set to the DNS domain
+	  name of the distribution.
+	  If the NFS client is unchanged from the upstream kernel, this
+	  option should be set to the default "kernel.org".
+
+config NFS_V4_1_MIGRATION
+	bool "NFSv4.1 client support for migration"
+	depends on NFS_V4_1
+	default n
+	help
+	  This option makes the NFS client advertise to NFSv4.1 servers that
+          it can support NFSv4 migration.
+
+          The NFSv4.1 pieces of the Linux NFSv4 migration implementation are
+          still experimental.  If you are not an NFSv4 developer, say N here.
+
+config NFS_V4_SECURITY_LABEL
+	bool
+	depends on NFS_V4_2 && SECURITY
+	default y
+
+config ROOT_NFS
+	bool "Root file system on NFS"
+	depends on NFS_FS=y && IP_PNP
+	help
+	  If you want your system to mount its root file system via NFS,
+	  choose Y here.  This is common practice for managing systems
+	  without local permanent storage.  For details, read
+	  <file:Documentation/filesystems/nfs/nfsroot.txt>.
+
+	  Most people say N here.
+
+config NFS_FSCACHE
+	bool "Provide NFS client caching support"
+	depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want NFS data to be cached locally on disc through
+	  the general filesystem cache manager
+
+config NFS_USE_LEGACY_DNS
+	bool "Use the legacy NFS DNS resolver"
+	depends on NFS_V4
+	help
+	  The kernel now provides a method for translating a host name into an
+	  IP address.  Select Y here if you would rather use your own DNS
+	  resolver script.
+
+	  If unsure, say N
+
+config NFS_USE_KERNEL_DNS
+	bool
+	depends on NFS_V4 && !NFS_USE_LEGACY_DNS
+	select DNS_RESOLVER
+	default y
+
+config NFS_DEBUG
+	bool
+	depends on NFS_FS && SUNRPC_DEBUG
+	select CRC32
+	default y
diff --git a/kernel/fs/nfs/Makefile b/kernel/fs/nfs/Makefile
new file mode 100644
index 000000000..866441795
--- /dev/null
+++ b/kernel/fs/nfs/Makefile
@@ -0,0 +1,36 @@
+#
+# Makefile for the Linux nfs filesystem routines.
+#
+
+obj-$(CONFIG_NFS_FS) += nfs.o
+
+CFLAGS_nfstrace.o += -I$(src)
+nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o \
+			   direct.o pagelist.o read.o symlink.o unlink.o \
+			   write.o namespace.o mount_clnt.o nfstrace.o
+nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
+nfs-$(CONFIG_SYSCTL)	+= sysctl.o
+nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+
+obj-$(CONFIG_NFS_V2) += nfsv2.o
+nfsv2-y := nfs2super.o proc.o nfs2xdr.o
+
+obj-$(CONFIG_NFS_V3) += nfsv3.o
+nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
+nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
+
+obj-$(CONFIG_NFS_V4) += nfsv4.o
+CFLAGS_nfs4trace.o += -I$(src)
+nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
+	  delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \
+	  nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \
+	  dns_resolve.o nfs4trace.o
+nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
+nfsv4-$(CONFIG_SYSCTL)	+= nfs4sysctl.o
+nfsv4-$(CONFIG_NFS_V4_1)	+= pnfs.o pnfs_dev.o pnfs_nfs.o
+nfsv4-$(CONFIG_NFS_V4_2)	+= nfs42proc.o
+
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
+obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/kernel/fs/nfs/blocklayout/Makefile b/kernel/fs/nfs/blocklayout/Makefile
new file mode 100644
index 000000000..3ca14c36d
--- /dev/null
+++ b/kernel/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/kernel/fs/nfs/blocklayout/blocklayout.c b/kernel/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 000000000..d2554fe14
--- /dev/null
+++ b/kernel/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,928 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/bio.h>		/* struct bio */
+#include <linux/prefetch.h>
+#include <linux/pagevec.h>
+
+#include "../pnfs.h"
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY	NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+static bool is_hole(struct pnfs_block_extent *be)
+{
+	switch (be->be_state) {
+	case PNFS_BLOCK_NONE_DATA:
+		return true;
+	case PNFS_BLOCK_INVALID_DATA:
+		return be->be_tag ? false : true;
+	default:
+		return false;
+	}
+}
+
+/* The data we are handed might be spread across several bios.  We need
+ * to track when the last one is finished.
+ */
+struct parallel_io {
+	struct kref refcnt;
+	void (*pnfs_callback) (void *data);
+	void *data;
+};
+
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+	struct parallel_io *rv;
+
+	rv  = kmalloc(sizeof(*rv), GFP_NOFS);
+	if (rv) {
+		rv->data = data;
+		kref_init(&rv->refcnt);
+	}
+	return rv;
+}
+
+static inline void get_parallel(struct parallel_io *p)
+{
+	kref_get(&p->refcnt);
+}
+
+static void destroy_parallel(struct kref *kref)
+{
+	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
+
+	dprintk("%s enter\n", __func__);
+	p->pnfs_callback(p->data);
+	kfree(p);
+}
+
+static inline void put_parallel(struct parallel_io *p)
+{
+	kref_put(&p->refcnt, destroy_parallel);
+}
+
+static struct bio *
+bl_submit_bio(int rw, struct bio *bio)
+{
+	if (bio) {
+		get_parallel(bio->bi_private);
+		dprintk("%s submitting %s bio %u@%llu\n", __func__,
+			rw == READ ? "read" : "write", bio->bi_iter.bi_size,
+			(unsigned long long)bio->bi_iter.bi_sector);
+		submit_bio(rw, bio);
+	}
+	return NULL;
+}
+
+static struct bio *
+bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+		void (*end_io)(struct bio *, int err), struct parallel_io *par)
+{
+	struct bio *bio;
+
+	npg = min(npg, BIO_MAX_PAGES);
+	bio = bio_alloc(GFP_NOIO, npg);
+	if (!bio && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (npg /= 2))
+			bio = bio_alloc(GFP_NOIO, npg);
+	}
+
+	if (bio) {
+		bio->bi_iter.bi_sector = disk_sector;
+		bio->bi_bdev = bdev;
+		bio->bi_end_io = end_io;
+		bio->bi_private = par;
+	}
+	return bio;
+}
+
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+		struct page *page, struct pnfs_block_dev_map *map,
+		struct pnfs_block_extent *be,
+		void (*end_io)(struct bio *, int err),
+		struct parallel_io *par, unsigned int offset, int *len)
+{
+	struct pnfs_block_dev *dev =
+		container_of(be->be_device, struct pnfs_block_dev, node);
+	u64 disk_addr, end;
+
+	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+		npg, rw, (unsigned long long)isect, offset, *len);
+
+	/* translate to device offset */
+	isect += be->be_v_offset;
+	isect -= be->be_f_offset;
+
+	/* translate to physical disk offset */
+	disk_addr = (u64)isect << SECTOR_SHIFT;
+	if (disk_addr < map->start || disk_addr >= map->start + map->len) {
+		if (!dev->map(dev, disk_addr, map))
+			return ERR_PTR(-EIO);
+		bio = bl_submit_bio(rw, bio);
+	}
+	disk_addr += map->disk_offset;
+	disk_addr -= map->start;
+
+	/* limit length to what the device mapping allows */
+	end = disk_addr + *len;
+	if (end >= map->start + map->len)
+		*len = map->start + map->len - disk_addr;
+
+retry:
+	if (!bio) {
+		bio = bl_alloc_init_bio(npg, map->bdev,
+				disk_addr >> SECTOR_SHIFT, end_io, par);
+		if (!bio)
+			return ERR_PTR(-ENOMEM);
+	}
+	if (bio_add_page(bio, page, *len, offset) < *len) {
+		bio = bl_submit_bio(rw, bio);
+		goto retry;
+	}
+	return bio;
+}
+
+static void bl_end_io_read(struct bio *bio, int err)
+{
+	struct parallel_io *par = bio->bi_private;
+
+	if (err) {
+		struct nfs_pgio_header *header = par->data;
+
+		if (!header->pnfs_error)
+			header->pnfs_error = -EIO;
+		pnfs_set_lo_fail(header->lseg);
+	}
+
+	bio_put(bio);
+	put_parallel(par);
+}
+
+static void bl_read_cleanup(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_pgio_header *hdr;
+	dprintk("%s enter\n", __func__);
+	task = container_of(work, struct rpc_task, u.tk_work);
+	hdr = container_of(task, struct nfs_pgio_header, task);
+	pnfs_ld_read_done(hdr);
+}
+
+static void
+bl_end_par_io_read(void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	hdr->task.tk_status = hdr->pnfs_error;
+	INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+	schedule_work(&hdr->task.u.tk_work);
+}
+
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_pgio_header *header)
+{
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
+	struct bio *bio = NULL;
+	struct pnfs_block_extent be;
+	sector_t isect, extent_length = 0;
+	struct parallel_io *par;
+	loff_t f_offset = header->args.offset;
+	size_t bytes_left = header->args.count;
+	unsigned int pg_offset, pg_len;
+	struct page **pages = header->args.pages;
+	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	const bool is_dio = (header->dreq != NULL);
+	struct blk_plug plug;
+	int i;
+
+	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
+		header->page_array.npages, f_offset,
+		(unsigned int)header->args.count);
+
+	par = alloc_parallel(header);
+	if (!par)
+		return PNFS_NOT_ATTEMPTED;
+	par->pnfs_callback = bl_end_par_io_read;
+
+	blk_start_plug(&plug);
+
+	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
+	/* Code assumes extents are page-aligned */
+	for (i = pg_index; i < header->page_array.npages; i++) {
+		if (extent_length <= 0) {
+			/* We've used up the previous extent */
+			bio = bl_submit_bio(READ, bio);
+
+			/* Get the next one */
+			if (!ext_tree_lookup(bl, isect, &be, false)) {
+				header->pnfs_error = -EIO;
+				goto out;
+			}
+			extent_length = be.be_length - (isect - be.be_f_offset);
+		}
+
+		pg_offset = f_offset & ~PAGE_CACHE_MASK;
+		if (is_dio) {
+			if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
+				pg_len = PAGE_CACHE_SIZE - pg_offset;
+			else
+				pg_len = bytes_left;
+		} else {
+			BUG_ON(pg_offset != 0);
+			pg_len = PAGE_CACHE_SIZE;
+		}
+
+		isect += (pg_offset >> SECTOR_SHIFT);
+		extent_length -= (pg_offset >> SECTOR_SHIFT);
+
+		if (is_hole(&be)) {
+			bio = bl_submit_bio(READ, bio);
+			/* Fill hole w/ zeroes w/o accessing device */
+			dprintk("%s Zeroing page for hole\n", __func__);
+			zero_user_segment(pages[i], pg_offset, pg_len);
+
+			/* invalidate map */
+			map.start = NFS4_MAX_UINT64;
+		} else {
+			bio = do_add_page_to_bio(bio,
+						 header->page_array.npages - i,
+						 READ,
+						 isect, pages[i], &map, &be,
+						 bl_end_io_read, par,
+						 pg_offset, &pg_len);
+			if (IS_ERR(bio)) {
+				header->pnfs_error = PTR_ERR(bio);
+				bio = NULL;
+				goto out;
+			}
+		}
+		isect += (pg_len >> SECTOR_SHIFT);
+		extent_length -= (pg_len >> SECTOR_SHIFT);
+		f_offset += pg_len;
+		bytes_left -= pg_len;
+	}
+	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
+		header->res.eof = 1;
+		header->res.count = header->inode->i_size - header->args.offset;
+	} else {
+		header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
+	}
+out:
+	bl_submit_bio(READ, bio);
+	blk_finish_plug(&plug);
+	put_parallel(par);
+	return PNFS_ATTEMPTED;
+}
+
+static void bl_end_io_write(struct bio *bio, int err)
+{
+	struct parallel_io *par = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct nfs_pgio_header *header = par->data;
+
+	if (!uptodate) {
+		if (!header->pnfs_error)
+			header->pnfs_error = -EIO;
+		pnfs_set_lo_fail(header->lseg);
+	}
+	bio_put(bio);
+	put_parallel(par);
+}
+
+/* Function scheduled for call during bl_end_par_io_write,
+ * it marks sectors as written and extends the commitlist.
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+	struct nfs_pgio_header *hdr =
+			container_of(task, struct nfs_pgio_header, task);
+
+	dprintk("%s enter\n", __func__);
+
+	if (likely(!hdr->pnfs_error)) {
+		struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+		u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+		u64 end = (hdr->args.offset + hdr->args.count +
+			PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
+
+		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+					(end - start) >> SECTOR_SHIFT);
+	}
+
+	pnfs_ld_write_done(hdr);
+}
+
+/* Called when last of bios associated with a bl_write_pagelist call finishes */
+static void bl_end_par_io_write(void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	hdr->task.tk_status = hdr->pnfs_error;
+	hdr->verf.committed = NFS_FILE_SYNC;
+	INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+	schedule_work(&hdr->task.u.tk_work);
+}
+
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
+{
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
+	struct bio *bio = NULL;
+	struct pnfs_block_extent be;
+	sector_t isect, extent_length = 0;
+	struct parallel_io *par = NULL;
+	loff_t offset = header->args.offset;
+	size_t count = header->args.count;
+	struct page **pages = header->args.pages;
+	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	unsigned int pg_len;
+	struct blk_plug plug;
+	int i;
+
+	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
+
+	/* At this point, header->page_aray is a (sequential) list of nfs_pages.
+	 * We want to write each, and if there is an error set pnfs_error
+	 * to have it redone using nfs.
+	 */
+	par = alloc_parallel(header);
+	if (!par)
+		return PNFS_NOT_ATTEMPTED;
+	par->pnfs_callback = bl_end_par_io_write;
+
+	blk_start_plug(&plug);
+
+	/* we always write out the whole page */
+	offset = offset & (loff_t)PAGE_CACHE_MASK;
+	isect = offset >> SECTOR_SHIFT;
+
+	for (i = pg_index; i < header->page_array.npages; i++) {
+		if (extent_length <= 0) {
+			/* We've used up the previous extent */
+			bio = bl_submit_bio(WRITE, bio);
+			/* Get the next one */
+			if (!ext_tree_lookup(bl, isect, &be, true)) {
+				header->pnfs_error = -EINVAL;
+				goto out;
+			}
+
+			extent_length = be.be_length - (isect - be.be_f_offset);
+		}
+
+		pg_len = PAGE_CACHE_SIZE;
+		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+					 WRITE, isect, pages[i], &map, &be,
+					 bl_end_io_write, par,
+					 0, &pg_len);
+		if (IS_ERR(bio)) {
+			header->pnfs_error = PTR_ERR(bio);
+			bio = NULL;
+			goto out;
+		}
+
+		offset += pg_len;
+		count -= pg_len;
+		isect += (pg_len >> SECTOR_SHIFT);
+		extent_length -= (pg_len >> SECTOR_SHIFT);
+	}
+
+	header->res.count = header->args.count;
+out:
+	bl_submit_bio(WRITE, bio);
+	blk_finish_plug(&plug);
+	put_parallel(par);
+	return PNFS_ATTEMPTED;
+}
+
+static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	int err;
+
+	dprintk("%s enter\n", __func__);
+
+	err = ext_tree_remove(bl, true, 0, LLONG_MAX);
+	WARN_ON(err);
+
+	kfree(bl);
+}
+
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+						   gfp_t gfp_flags)
+{
+	struct pnfs_block_layout *bl;
+
+	dprintk("%s enter\n", __func__);
+	bl = kzalloc(sizeof(*bl), gfp_flags);
+	if (!bl)
+		return NULL;
+
+	bl->bl_ext_rw = RB_ROOT;
+	bl->bl_ext_ro = RB_ROOT;
+	spin_lock_init(&bl->bl_ext_lock);
+
+	return &bl->bl_layout;
+}
+
+static void bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s enter\n", __func__);
+	kfree(lseg);
+}
+
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+	u32 mode;	/* R or RW */
+	u64 start;	/* Expected start of next non-COW extent */
+	u64 inval;	/* Start of INVAL coverage */
+	u64 cowread;	/* End of COW read coverage */
+};
+
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+			 struct layout_verification *lv)
+{
+	if (lv->mode == IOMODE_READ) {
+		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+		    be->be_state == PNFS_BLOCK_INVALID_DATA)
+			return -EIO;
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	}
+	/* lv->mode == IOMODE_RW */
+	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		if (lv->cowread > lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		lv->inval = lv->start;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+		if (be->be_f_offset > lv->start)
+			return -EIO;
+		if (be->be_f_offset < lv->inval)
+			return -EIO;
+		if (be->be_f_offset < lv->cowread)
+			return -EIO;
+		/* It looks like you might want to min this with lv->start,
+		 * but you really don't.
+		 */
+		lv->inval = lv->inval + be->be_length;
+		lv->cowread = be->be_f_offset + be->be_length;
+		return 0;
+	} else
+		return -EIO;
+}
+
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+	uint64_t s;
+
+	*rp = xdr_decode_hyper(*rp, &s);
+	if (s & 0x1ff) {
+		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+		return -1;
+	}
+	*sp = s >> SECTOR_SHIFT;
+	return 0;
+}
+
+static int
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+		struct layout_verification *lv, struct list_head *extents,
+		gfp_t gfp_mask)
+{
+	struct pnfs_block_extent *be;
+	struct nfs4_deviceid id;
+	int error;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+	if (!p)
+		return -EIO;
+
+	be = kzalloc(sizeof(*be), GFP_NOFS);
+	if (!be)
+		return -ENOMEM;
+
+	memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+	error = -EIO;
+	be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+						lo->plh_lc_cred, gfp_mask);
+	if (!be->be_device)
+		goto out_free_be;
+
+	/*
+	 * The next three values are read in as bytes, but stored in the
+	 * extent structure in 512-byte granularity.
+	 */
+	if (decode_sector_number(&p, &be->be_f_offset) < 0)
+		goto out_put_deviceid;
+	if (decode_sector_number(&p, &be->be_length) < 0)
+		goto out_put_deviceid;
+	if (decode_sector_number(&p, &be->be_v_offset) < 0)
+		goto out_put_deviceid;
+	be->be_state = be32_to_cpup(p++);
+
+	error = verify_extent(be, lv);
+	if (error) {
+		dprintk("%s: extent verification failed\n", __func__);
+		goto out_put_deviceid;
+	}
+
+	list_add_tail(&be->be_list, extents);
+	return 0;
+
+out_put_deviceid:
+	nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+	kfree(be);
+	return error;
+}
+
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+		gfp_t gfp_mask)
+{
+	struct layout_verification lv = {
+		.mode = lgr->range.iomode,
+		.start = lgr->range.offset >> SECTOR_SHIFT,
+		.inval = lgr->range.offset >> SECTOR_SHIFT,
+		.cowread = lgr->range.offset >> SECTOR_SHIFT,
+	};
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	struct pnfs_layout_segment *lseg;
+	struct xdr_buf buf;
+	struct xdr_stream xdr;
+	struct page *scratch;
+	int status, i;
+	uint32_t count;
+	__be32 *p;
+	LIST_HEAD(extents);
+
+	dprintk("---> %s\n", __func__);
+
+	lseg = kzalloc(sizeof(*lseg), gfp_mask);
+	if (!lseg)
+		return ERR_PTR(-ENOMEM);
+
+	status = -ENOMEM;
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf,
+			lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	status = -EIO;
+	p = xdr_inline_decode(&xdr, 4);
+	if (unlikely(!p))
+		goto out_free_scratch;
+
+	count = be32_to_cpup(p++);
+	dprintk("%s: number of extents %d\n", __func__, count);
+
+	/*
+	 * Decode individual extents, putting them in temporary staging area
+	 * until whole layout is decoded to make error recovery easier.
+	 */
+	for (i = 0; i < count; i++) {
+		status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+		if (status)
+			goto process_extents;
+	}
+
+	if (lgr->range.offset + lgr->range.length !=
+			lv.start << SECTOR_SHIFT) {
+		dprintk("%s Final length mismatch\n", __func__);
+		status = -EIO;
+		goto process_extents;
+	}
+
+	if (lv.start < lv.cowread) {
+		dprintk("%s Final uncovered COW extent\n", __func__);
+		status = -EIO;
+	}
+
+process_extents:
+	while (!list_empty(&extents)) {
+		struct pnfs_block_extent *be =
+			list_first_entry(&extents, struct pnfs_block_extent,
+					 be_list);
+		list_del(&be->be_list);
+
+		if (!status)
+			status = ext_tree_insert(bl, be);
+
+		if (status) {
+			nfs4_put_deviceid_node(be->be_device);
+			kfree(be);
+		}
+	}
+
+out_free_scratch:
+	__free_page(scratch);
+out:
+	dprintk("%s returns %d\n", __func__, status);
+	if (status) {
+		kfree(lseg);
+		return ERR_PTR(status);
+	}
+	return lseg;
+}
+
+static void
+bl_return_range(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_range *range)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	sector_t offset = range->offset >> SECTOR_SHIFT, end;
+
+	if (range->offset % 8) {
+		dprintk("%s: offset %lld not block size aligned\n",
+			__func__, range->offset);
+		return;
+	}
+
+	if (range->length != NFS4_MAX_UINT64) {
+		if (range->length % 8) {
+			dprintk("%s: length %lld not block size aligned\n",
+				__func__, range->length);
+			return;
+		}
+
+		end = offset + (range->length >> SECTOR_SHIFT);
+	} else {
+		end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
+	}
+
+	ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
+}
+
+static int
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
+{
+	return ext_tree_prepare_commit(arg);
+}
+
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+	ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
+}
+
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+	dprintk("%s enter\n", __func__);
+
+	if (server->pnfs_blksize == 0) {
+		dprintk("%s Server did not return blksize\n", __func__);
+		return -EINVAL;
+	}
+	if (server->pnfs_blksize > PAGE_SIZE) {
+		printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
+			__func__, server->pnfs_blksize);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+		struct nfs_page *req, unsigned int alignment)
+{
+	/*
+	 * Always accept buffered writes, higher layers take care of the
+	 * right alignment.
+	 */
+	if (pgio->pg_dreq == NULL)
+		return true;
+
+	if (!IS_ALIGNED(req->wb_offset, alignment))
+		return false;
+
+	if (IS_ALIGNED(req->wb_bytes, alignment))
+		return true;
+
+	if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+		/*
+		 * If the write goes up to the inode size, just write
+		 * the full page.  Data past the inode size is
+		 * guaranteed to be zeroed by the higher level client
+		 * code, and this behaviour is mandated by RFC 5663
+		 * section 2.3.2.
+		 */
+		return true;
+	}
+
+	return false;
+}
+
+static void
+bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
+		nfs_pageio_reset_read_mds(pgio);
+		return;
+	}
+
+	pnfs_generic_pg_init_read(pgio, req);
+}
+
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		struct nfs_page *req)
+{
+	if (!is_aligned_req(pgio, req, SECTOR_SIZE))
+		return 0;
+	return pnfs_generic_pg_test(pgio, prev, req);
+}
+
+/*
+ * Return the number of contiguous bytes for a given inode
+ * starting at page frame idx.
+ */
+static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t end;
+
+	/* Optimize common case that writes from 0 to end of file */
+	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+	if (end != inode->i_mapping->nrpages) {
+		rcu_read_lock();
+		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
+		rcu_read_unlock();
+	}
+
+	if (!end)
+		return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
+	else
+		return (end - idx) << PAGE_CACHE_SHIFT;
+}
+
+static void
+bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	u64 wb_size;
+
+	if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
+		nfs_pageio_reset_write_mds(pgio);
+		return;
+	}
+
+	if (pgio->pg_dreq == NULL)
+		wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+					      req->wb_index);
+	else
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+	pnfs_generic_pg_init_write(pgio, req, wb_size);
+}
+
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		 struct nfs_page *req)
+{
+	if (!is_aligned_req(pgio, req, PAGE_SIZE))
+		return 0;
+	return pnfs_generic_pg_test(pgio, prev, req);
+}
+
+static const struct nfs_pageio_ops bl_pg_read_ops = {
+	.pg_init = bl_pg_init_read,
+	.pg_test = bl_pg_test_read,
+	.pg_doio = pnfs_generic_pg_readpages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops bl_pg_write_ops = {
+	.pg_init = bl_pg_init_write,
+	.pg_test = bl_pg_test_write,
+	.pg_doio = pnfs_generic_pg_writepages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static struct pnfs_layoutdriver_type blocklayout_type = {
+	.id				= LAYOUT_BLOCK_VOLUME,
+	.name				= "LAYOUT_BLOCK_VOLUME",
+	.owner				= THIS_MODULE,
+	.flags				= PNFS_LAYOUTRET_ON_SETATTR |
+					  PNFS_READ_WHOLE_PAGE,
+	.read_pagelist			= bl_read_pagelist,
+	.write_pagelist			= bl_write_pagelist,
+	.alloc_layout_hdr		= bl_alloc_layout_hdr,
+	.free_layout_hdr		= bl_free_layout_hdr,
+	.alloc_lseg			= bl_alloc_lseg,
+	.free_lseg			= bl_free_lseg,
+	.return_range			= bl_return_range,
+	.prepare_layoutcommit		= bl_prepare_layoutcommit,
+	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
+	.set_layoutdriver		= bl_set_layoutdriver,
+	.alloc_deviceid_node		= bl_alloc_deviceid_node,
+	.free_deviceid_node		= bl_free_deviceid_node,
+	.pg_read_ops			= &bl_pg_read_ops,
+	.pg_write_ops			= &bl_pg_write_ops,
+	.sync				= pnfs_generic_sync,
+};
+
+static int __init nfs4blocklayout_init(void)
+{
+	int ret;
+
+	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+	ret = pnfs_register_layoutdriver(&blocklayout_type);
+	if (ret)
+		goto out;
+	ret = bl_init_pipefs();
+	if (ret)
+		goto out_unregister;
+	return 0;
+
+out_unregister:
+	pnfs_unregister_layoutdriver(&blocklayout_type);
+out:
+	return ret;
+}
+
+static void __exit nfs4blocklayout_exit(void)
+{
+	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+	       __func__);
+
+	bl_cleanup_pipefs();
+	pnfs_unregister_layoutdriver(&blocklayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-3");
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff --git a/kernel/fs/nfs/blocklayout/blocklayout.h b/kernel/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 000000000..92dca9e90
--- /dev/null
+++ b/kernel/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,204 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
+#define FS_NFS_NFS4BLOCKLAYOUT_H
+
+#include <linux/device-mapper.h>
+#include <linux/nfs_fs.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "../nfs4_fs.h"
+#include "../pnfs.h"
+#include "../netns.h"
+
+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+
+struct pnfs_block_dev;
+
+enum pnfs_block_volume_type {
+	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
+	PNFS_BLOCK_VOLUME_SLICE		= 1,
+	PNFS_BLOCK_VOLUME_CONCAT	= 2,
+	PNFS_BLOCK_VOLUME_STRIPE	= 3,
+};
+
+#define PNFS_BLOCK_MAX_UUIDS	4
+#define PNFS_BLOCK_MAX_DEVICES	64
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN	128
+
+
+struct pnfs_block_volume {
+	enum pnfs_block_volume_type	type;
+	union {
+		struct {
+			int		len;
+			int		nr_sigs;
+			struct {
+				u64		offset;
+				u32		sig_len;
+				u8		sig[PNFS_BLOCK_UUID_LEN];
+			} sigs[PNFS_BLOCK_MAX_UUIDS];
+		} simple;
+		struct {
+			u64		start;
+			u64		len;
+			u32		volume;
+		} slice;
+		struct {
+			u32		volumes_count;
+			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
+		} concat;
+		struct {
+			u64		chunk_size;
+			u32		volumes_count;
+			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
+		} stripe;
+	};
+};
+
+struct pnfs_block_dev_map {
+	sector_t			start;
+	sector_t			len;
+
+	sector_t			disk_offset;
+	struct block_device		*bdev;
+};
+
+struct pnfs_block_dev {
+	struct nfs4_deviceid_node	node;
+
+	u64				start;
+	u64				len;
+
+	u32				nr_children;
+	struct pnfs_block_dev		*children;
+	u64				chunk_size;
+
+	struct block_device		*bdev;
+	u64				disk_offset;
+
+	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+			struct pnfs_block_dev_map *map);
+};
+
+enum exstate4 {
+	PNFS_BLOCK_READWRITE_DATA	= 0,
+	PNFS_BLOCK_READ_DATA		= 1,
+	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
+	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
+};
+
+/* sector_t fields are all in 512-byte sectors */
+struct pnfs_block_extent {
+	union {
+		struct rb_node	be_node;
+		struct list_head be_list;
+	};
+	struct nfs4_deviceid_node *be_device;
+	sector_t	be_f_offset;	/* the starting offset in the file */
+	sector_t	be_length;	/* the size of the extent */
+	sector_t	be_v_offset;	/* the starting offset in the volume */
+	enum exstate4	be_state;	/* the state of this extent */
+#define EXTENT_WRITTEN		1
+#define EXTENT_COMMITTING	2
+	unsigned int	be_tag;
+};
+
+/* on the wire size of the extent */
+#define BL_EXTENT_SIZE	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
+
+struct pnfs_block_layout {
+	struct pnfs_layout_hdr	bl_layout;
+	struct rb_root		bl_ext_rw;
+	struct rb_root		bl_ext_ro;
+	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
+};
+
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+{
+	return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
+
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
+{
+	return BLK_LO2EXT(lseg->pls_layout);
+}
+
+struct bl_pipe_msg {
+	struct rpc_pipe_msg msg;
+	wait_queue_head_t *bl_wq;
+};
+
+struct bl_msg_hdr {
+	u8  type;
+	u16 totallen; /* length of entire message, including hdr itself */
+};
+
+#define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT                0x1 /* Mount--create devices*/
+#define BL_DEVICE_REQUEST_INIT         0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
+
+/* dev.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+		struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+		sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+		struct pnfs_block_extent *ret, bool rw);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
+void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
+
+/* rpc_pipefs.c */
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+		struct pnfs_block_volume *b, gfp_t gfp_mask);
+int __init bl_init_pipefs(void);
+void __exit bl_cleanup_pipefs(void);
+
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/kernel/fs/nfs/blocklayout/dev.c b/kernel/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000..e535599a0
--- /dev/null
+++ b/kernel/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+	if (dev->nr_children) {
+		int i;
+
+		for (i = 0; i < dev->nr_children; i++)
+			bl_free_device(&dev->children[i]);
+		kfree(dev->children);
+	} else {
+		if (dev->bdev)
+			blkdev_put(dev->bdev, FMODE_READ);
+	}
+}
+
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct pnfs_block_dev *dev =
+		container_of(d, struct pnfs_block_dev, node);
+
+	bl_free_device(dev);
+	kfree_rcu(dev, node.rcu);
+}
+
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int i;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (!p)
+		return -EIO;
+	b->type = be32_to_cpup(p++);
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->simple.nr_sigs = be32_to_cpup(p++);
+		if (!b->simple.nr_sigs) {
+			dprintk("no signature\n");
+			return -EIO;
+		}
+
+		b->simple.len = 4 + 4;
+		for (i = 0; i < b->simple.nr_sigs; i++) {
+			p = xdr_inline_decode(xdr, 8 + 4);
+			if (!p)
+				return -EIO;
+			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+
+			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+			if (!p)
+				return -EIO;
+			memcpy(&b->simple.sigs[i].sig, p,
+				b->simple.sigs[i].sig_len);
+
+			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+		}
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		p = xdr_inline_decode(xdr, 8 + 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->slice.start);
+		p = xdr_decode_hyper(p, &b->slice.len);
+		b->slice.volume = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->concat.volumes_count = be32_to_cpup(p++);
+
+		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->concat.volumes_count; i++)
+			b->concat.volumes[i] = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		p = xdr_inline_decode(xdr, 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+		b->stripe.volumes_count = be32_to_cpup(p++);
+
+		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->stripe.volumes_count; i++)
+			b->stripe.volumes[i] = be32_to_cpup(p++);
+		break;
+	default:
+		dprintk("unknown volume type!\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	map->start = dev->start;
+	map->len = dev->len;
+	map->disk_offset = dev->disk_offset;
+	map->bdev = dev->bdev;
+	return true;
+}
+
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	int i;
+
+	for (i = 0; i < dev->nr_children; i++) {
+		struct pnfs_block_dev *child = &dev->children[i];
+
+		if (child->start > offset ||
+		    child->start + child->len <= offset)
+			continue;
+
+		child->map(child, offset - child->start, map);
+		return true;
+	}
+
+	dprintk("%s: ran off loop!\n", __func__);
+	return false;
+}
+
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	struct pnfs_block_dev *child;
+	u64 chunk;
+	u32 chunk_idx;
+	u64 disk_offset;
+
+	chunk = div_u64(offset, dev->chunk_size);
+	div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+
+	if (chunk_idx > dev->nr_children) {
+		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+			__func__, chunk_idx, offset, dev->chunk_size);
+		/* error, should not happen */
+		return false;
+	}
+
+	/* truncate offset to the beginning of the stripe */
+	offset = chunk * dev->chunk_size;
+
+	/* disk offset of the stripe */
+	disk_offset = div_u64(offset, dev->nr_children);
+
+	child = &dev->children[chunk_idx];
+	child->map(child, disk_offset, map);
+
+	map->start += offset;
+	map->disk_offset += disk_offset;
+	map->len = dev->chunk_size;
+	return true;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	dev_t dev;
+
+	dev = bl_resolve_deviceid(server, v, gfp_mask);
+	if (!dev)
+		return -EIO;
+
+	d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (IS_ERR(d->bdev)) {
+		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
+		return PTR_ERR(d->bdev);
+	}
+
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+
+	printk(KERN_INFO "pNFS: using block device %s\n",
+		d->bdev->bd_disk->disk_name);
+	return 0;
+}
+
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	int ret;
+
+	ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
+	if (ret)
+		return ret;
+
+	d->disk_offset = v->slice.start;
+	d->len = v->slice.len;
+	return 0;
+}
+
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	d->children = kcalloc(v->concat.volumes_count,
+			sizeof(struct pnfs_block_dev), GFP_KERNEL);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->concat.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->concat.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		d->children[i].start += len;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->map = bl_map_concat;
+	return 0;
+}
+
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	d->children = kcalloc(v->stripe.volumes_count,
+			sizeof(struct pnfs_block_dev), GFP_KERNEL);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->stripe.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->stripe.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->chunk_size = v->stripe.chunk_size;
+	d->map = bl_map_stripe;
+	return 0;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	switch (volumes[idx].type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		return bl_parse_simple(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_SLICE:
+		return bl_parse_slice(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+	default:
+		dprintk("unsupported volume type: %d\n", volumes[idx].type);
+		return -EIO;
+	}
+}
+
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_mask)
+{
+	struct nfs4_deviceid_node *node = NULL;
+	struct pnfs_block_volume *volumes;
+	struct pnfs_block_dev *top;
+	struct xdr_stream xdr;
+	struct xdr_buf buf;
+	struct page *scratch;
+	int nr_volumes, ret, i;
+	__be32 *p;
+
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&xdr, sizeof(__be32));
+	if (!p)
+		goto out_free_scratch;
+	nr_volumes = be32_to_cpup(p++);
+
+	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+			  gfp_mask);
+	if (!volumes)
+		goto out_free_scratch;
+
+	for (i = 0; i < nr_volumes; i++) {
+		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+		if (ret < 0)
+			goto out_free_volumes;
+	}
+
+	top = kzalloc(sizeof(*top), gfp_mask);
+	if (!top)
+		goto out_free_volumes;
+
+	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+	if (ret) {
+		bl_free_device(top);
+		kfree(top);
+		goto out_free_volumes;
+	}
+
+	node = &top->node;
+	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+
+out_free_volumes:
+	kfree(volumes);
+out_free_scratch:
+	__free_page(scratch);
+out:
+	return node;
+}
diff --git a/kernel/fs/nfs/blocklayout/extent_tree.c b/kernel/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000..31d0b5e53
--- /dev/null
+++ b/kernel/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+
+#include <linux/vmalloc.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+	return rb_entry(node, struct pnfs_block_extent, be_node);
+}
+
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+	struct rb_node *node = rb_first(root);
+	return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+	struct rb_node *node = rb_prev(&be->be_node);
+	return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+	struct rb_node *node = rb_next(&be->be_node);
+	return node ? ext_node(node) : NULL;
+}
+
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+	return be->be_f_offset + be->be_length;
+}
+
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+	struct rb_node *node = root->rb_node;
+	struct pnfs_block_extent *be = NULL;
+
+	while (node) {
+		be = ext_node(node);
+		if (start < be->be_f_offset)
+			node = node->rb_left;
+		else if (start >= ext_f_end(be))
+			node = node->rb_right;
+		else
+			return be;
+	}
+
+	if (be) {
+		if (start < be->be_f_offset)
+			return be;
+
+		if (start >= ext_f_end(be))
+			return ext_tree_next(be);
+	}
+
+	return NULL;
+}
+
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+	if (be1->be_state != be2->be_state)
+		return false;
+	if (be1->be_device != be2->be_device)
+		return false;
+
+	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
+		return false;
+
+	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
+	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
+		return false;
+
+	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
+	    be1->be_tag != be2->be_tag)
+		return false;
+
+	return true;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *left = ext_tree_prev(be);
+
+	if (left && ext_can_merge(left, be)) {
+		left->be_length += be->be_length;
+		rb_erase(&be->be_node, root);
+		nfs4_put_deviceid_node(be->be_device);
+		kfree(be);
+		return left;
+	}
+
+	return be;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *right = ext_tree_next(be);
+
+	if (right && ext_can_merge(be, right)) {
+		be->be_length += right->be_length;
+		rb_erase(&right->be_node, root);
+		nfs4_put_deviceid_node(right->be_device);
+		kfree(right);
+	}
+
+	return be;
+}
+
+static void
+__ext_tree_insert(struct rb_root *root,
+		struct pnfs_block_extent *new, bool merge_ok)
+{
+	struct rb_node **p = &root->rb_node, *parent = NULL;
+	struct pnfs_block_extent *be;
+
+	while (*p) {
+		parent = *p;
+		be = ext_node(parent);
+
+		if (new->be_f_offset < be->be_f_offset) {
+			if (merge_ok && ext_can_merge(new, be)) {
+				be->be_f_offset = new->be_f_offset;
+				if (be->be_state != PNFS_BLOCK_NONE_DATA)
+					be->be_v_offset = new->be_v_offset;
+				be->be_length += new->be_length;
+				be = ext_try_to_merge_left(root, be);
+				goto free_new;
+			}
+			p = &(*p)->rb_left;
+		} else if (new->be_f_offset >= ext_f_end(be)) {
+			if (merge_ok && ext_can_merge(be, new)) {
+				be->be_length += new->be_length;
+				be = ext_try_to_merge_right(root, be);
+				goto free_new;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->be_node, parent, p);
+	rb_insert_color(&new->be_node, root);
+	return;
+free_new:
+	nfs4_put_deviceid_node(new->be_device);
+	kfree(new);
+}
+
+static int
+__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+{
+	struct pnfs_block_extent *be;
+	sector_t len1 = 0, len2 = 0;
+	sector_t orig_v_offset;
+	sector_t orig_len;
+
+	be = __ext_tree_search(root, start);
+	if (!be)
+		return 0;
+	if (be->be_f_offset >= end)
+		return 0;
+
+	orig_v_offset = be->be_v_offset;
+	orig_len = be->be_length;
+
+	if (start > be->be_f_offset)
+		len1 = start - be->be_f_offset;
+	if (ext_f_end(be) > end)
+		len2 = ext_f_end(be) - end;
+
+	if (len2 > 0) {
+		if (len1 > 0) {
+			struct pnfs_block_extent *new;
+
+			new = kzalloc(sizeof(*new), GFP_ATOMIC);
+			if (!new)
+				return -ENOMEM;
+
+			be->be_length = len1;
+
+			new->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+				new->be_v_offset =
+					orig_v_offset + orig_len - len2;
+			}
+			new->be_length = len2;
+			new->be_state = be->be_state;
+			new->be_tag = be->be_tag;
+			new->be_device = nfs4_get_deviceid(be->be_device);
+
+			__ext_tree_insert(root, new, true);
+		} else {
+			be->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+				be->be_v_offset =
+					orig_v_offset + orig_len - len2;
+			}
+			be->be_length = len2;
+		}
+	} else {
+		if (len1 > 0) {
+			be->be_length = len1;
+			be = ext_tree_next(be);
+		}
+
+		while (be && ext_f_end(be) <= end) {
+			struct pnfs_block_extent *next = ext_tree_next(be);
+
+			rb_erase(&be->be_node, root);
+			nfs4_put_deviceid_node(be->be_device);
+			kfree(be);
+			be = next;
+		}
+
+		if (be && be->be_f_offset < end) {
+			len1 = ext_f_end(be) - end;
+			be->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA)
+				be->be_v_offset += be->be_length - len1;
+			be->be_length = len1;
+		}
+	}
+
+	return 0;
+}
+
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+	struct pnfs_block_extent *be;
+	struct rb_root *root;
+	int err = 0;
+
+	switch (new->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+	case PNFS_BLOCK_INVALID_DATA:
+		root = &bl->bl_ext_rw;
+		break;
+	case PNFS_BLOCK_READ_DATA:
+	case PNFS_BLOCK_NONE_DATA:
+		root = &bl->bl_ext_ro;
+		break;
+	default:
+		dprintk("invalid extent type\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&bl->bl_ext_lock);
+retry:
+	be = __ext_tree_search(root, new->be_f_offset);
+	if (!be || be->be_f_offset >= ext_f_end(new)) {
+		__ext_tree_insert(root, new, true);
+	} else if (new->be_f_offset >= be->be_f_offset) {
+		if (ext_f_end(new) <= ext_f_end(be)) {
+			nfs4_put_deviceid_node(new->be_device);
+			kfree(new);
+		} else {
+			sector_t new_len = ext_f_end(new) - ext_f_end(be);
+			sector_t diff = new->be_length - new_len;
+
+			new->be_f_offset += diff;
+			new->be_v_offset += diff;
+			new->be_length = new_len;
+			goto retry;
+		}
+	} else if (ext_f_end(new) <= ext_f_end(be)) {
+		new->be_length = be->be_f_offset - new->be_f_offset;
+		__ext_tree_insert(root, new, true);
+	} else {
+		struct pnfs_block_extent *split;
+		sector_t new_len = ext_f_end(new) - ext_f_end(be);
+		sector_t diff = new->be_length - new_len;
+
+		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+		if (!split) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		split->be_length = be->be_f_offset - split->be_f_offset;
+		split->be_device = nfs4_get_deviceid(new->be_device);
+		__ext_tree_insert(root, split, true);
+
+		new->be_f_offset += diff;
+		new->be_v_offset += diff;
+		new->be_length = new_len;
+		goto retry;
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+		struct pnfs_block_extent *ret)
+{
+	struct rb_node *node;
+	struct pnfs_block_extent *be;
+
+	node = root->rb_node;
+	while (node) {
+		be = ext_node(node);
+		if (isect < be->be_f_offset)
+			node = node->rb_left;
+		else if (isect >= ext_f_end(be))
+			node = node->rb_right;
+		else {
+			*ret = *be;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+	    struct pnfs_block_extent *ret, bool rw)
+{
+	bool found = false;
+
+	spin_lock(&bl->bl_ext_lock);
+	if (!rw)
+		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+	if (!found)
+		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+	spin_unlock(&bl->bl_ext_lock);
+
+	return found;
+}
+
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+		sector_t start, sector_t end)
+{
+	int err, err2;
+
+	spin_lock(&bl->bl_ext_lock);
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (rw) {
+		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+		if (!err)
+			err = err2;
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	return err;
+}
+
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+		sector_t split)
+{
+	struct pnfs_block_extent *new;
+	sector_t orig_len = be->be_length;
+
+	new = kzalloc(sizeof(*new), GFP_ATOMIC);
+	if (!new)
+		return -ENOMEM;
+
+	be->be_length = split - be->be_f_offset;
+
+	new->be_f_offset = split;
+	if (be->be_state != PNFS_BLOCK_NONE_DATA)
+		new->be_v_offset = be->be_v_offset + be->be_length;
+	new->be_length = orig_len - be->be_length;
+	new->be_state = be->be_state;
+	new->be_tag = be->be_tag;
+	new->be_device = nfs4_get_deviceid(be->be_device);
+
+	__ext_tree_insert(root, new, false);
+	return 0;
+}
+
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len)
+{
+	struct rb_root *root = &bl->bl_ext_rw;
+	sector_t end = start + len;
+	struct pnfs_block_extent *be;
+	int err = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+	/*
+	 * First remove all COW extents or holes from written to range.
+	 */
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (err)
+		goto out;
+
+	/*
+	 * Then mark all invalid extents in the range as written to.
+	 */
+	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+		if (be->be_f_offset >= end)
+			break;
+
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+			continue;
+
+		if (be->be_f_offset < start) {
+			struct pnfs_block_extent *left = ext_tree_prev(be);
+
+			if (left && ext_can_merge(left, be)) {
+				sector_t diff = start - be->be_f_offset;
+
+				left->be_length += diff;
+
+				be->be_f_offset += diff;
+				be->be_v_offset += diff;
+				be->be_length -= diff;
+			} else {
+				err = ext_tree_split(root, be, start);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (ext_f_end(be) > end) {
+			struct pnfs_block_extent *right = ext_tree_next(be);
+
+			if (right && ext_can_merge(be, right)) {
+				sector_t diff = end - be->be_f_offset;
+
+				be->be_length -= diff;
+
+				right->be_f_offset -= diff;
+				right->be_v_offset -= diff;
+				right->be_length += diff;
+			} else {
+				err = ext_tree_split(root, be, end);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+			be->be_tag = EXTENT_WRITTEN;
+			be = ext_try_to_merge_left(root, be);
+			be = ext_try_to_merge_right(root, be);
+		}
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
+		size_t buffer_size)
+{
+	if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+		int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
+
+		for (i = 0; i < nr_pages; i++)
+			put_page(arg->layoutupdate_pages[i]);
+		kfree(arg->layoutupdate_pages);
+	} else {
+		put_page(arg->layoutupdate_page);
+	}
+}
+
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+		size_t buffer_size, size_t *count)
+{
+	struct pnfs_block_extent *be;
+	int ret = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_WRITTEN)
+			continue;
+
+		(*count)++;
+		if (*count * BL_EXTENT_SIZE > buffer_size) {
+			/* keep counting.. */
+			ret = -ENOSPC;
+			continue;
+		}
+
+		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+				NFS4_DEVICEID4_SIZE);
+		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, 0LL);
+		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+
+		be->be_tag = EXTENT_COMMITTING;
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	return ret;
+}
+
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	size_t count = 0, buffer_size = PAGE_SIZE;
+	__be32 *start_p;
+	int ret;
+
+	dprintk("%s enter\n", __func__);
+
+	arg->layoutupdate_page = alloc_page(GFP_NOFS);
+	if (!arg->layoutupdate_page)
+		return -ENOMEM;
+	start_p = page_address(arg->layoutupdate_page);
+	arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+	if (unlikely(ret)) {
+		ext_tree_free_commitdata(arg, buffer_size);
+
+		buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+		count = 0;
+
+		arg->layoutupdate_pages =
+			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+				sizeof(struct page *), GFP_NOFS);
+		if (!arg->layoutupdate_pages)
+			return -ENOMEM;
+
+		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
+		if (!start_p) {
+			kfree(arg->layoutupdate_pages);
+			return -ENOMEM;
+		}
+
+		goto retry;
+	}
+
+	*start_p = cpu_to_be32(count);
+	arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+
+	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+		__be32 *p = start_p;
+		int i = 0;
+
+		for (p = start_p;
+		     p < start_p + arg->layoutupdate_len;
+		     p += PAGE_SIZE) {
+			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+		}
+	}
+
+	dprintk("%s found %zu ranges\n", __func__, count);
+	return 0;
+}
+
+void
+ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	struct rb_root *root = &bl->bl_ext_rw;
+	struct pnfs_block_extent *be;
+
+	dprintk("%s status %d\n", __func__, status);
+
+	ext_tree_free_commitdata(arg, arg->layoutupdate_len);
+
+	spin_lock(&bl->bl_ext_lock);
+	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_COMMITTING)
+			continue;
+
+		if (status) {
+			/*
+			 * Mark as written and try again.
+			 *
+			 * XXX: some real error handling here wouldn't hurt..
+			 */
+			be->be_tag = EXTENT_WRITTEN;
+		} else {
+			be->be_state = PNFS_BLOCK_READWRITE_DATA;
+			be->be_tag = 0;
+		}
+
+		be = ext_try_to_merge_left(root, be);
+		be = ext_try_to_merge_right(root, be);
+	}
+	spin_unlock(&bl->bl_ext_lock);
+}
diff --git a/kernel/fs/nfs/blocklayout/rpc_pipefs.c b/kernel/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000..dbe5839cd
--- /dev/null
+++ b/kernel/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,288 @@
+/*
+ *  Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
+{
+	int i;
+
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(b->type);
+	*p++ = cpu_to_be32(b->simple.nr_sigs);
+	for (i = 0; i < b->simple.nr_sigs; i++) {
+		p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
+		p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
+					 b->simple.sigs[i].sig_len);
+	}
+}
+
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
+		gfp_t gfp_mask)
+{
+	struct net *net = server->nfs_client->cl_net;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct bl_dev_msg *reply = &nn->bl_mount_reply;
+	struct bl_pipe_msg bl_pipe_msg;
+	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+	struct bl_msg_hdr *bl_msg;
+	DECLARE_WAITQUEUE(wq, current);
+	dev_t dev = 0;
+	int rc;
+
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+
+	mutex_lock(&nn->bl_mutex);
+	bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+	b->simple.len += 4;	/* single volume */
+	if (b->simple.len > PAGE_SIZE)
+		goto out_unlock;
+
+	memset(msg, 0, sizeof(*msg));
+	msg->len = sizeof(*bl_msg) + b->simple.len;
+	msg->data = kzalloc(msg->len, gfp_mask);
+	if (!msg->data)
+		goto out_free_data;
+
+	bl_msg = msg->data;
+	bl_msg->type = BL_DEVICE_MOUNT,
+	bl_msg->totallen = b->simple.len;
+	nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	add_wait_queue(&nn->bl_wq, &wq);
+	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
+	if (rc < 0) {
+		remove_wait_queue(&nn->bl_wq, &wq);
+		goto out_free_data;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	remove_wait_queue(&nn->bl_wq, &wq);
+
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		printk(KERN_WARNING "%s failed to decode device: %d\n",
+			__func__, reply->status);
+		goto out_free_data;
+	}
+
+	dev = MKDEV(reply->major, reply->minor);
+out_free_data:
+	kfree(msg->data);
+out_unlock:
+	mutex_unlock(&nn->bl_mutex);
+	return dev;
+}
+
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+			 size_t mlen)
+{
+	struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
+					 nfs_net_id);
+
+	if (mlen != sizeof (struct bl_dev_msg))
+		return -EINVAL;
+
+	if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
+		return -EFAULT;
+
+	wake_up(&nn->bl_wq);
+
+	return mlen;
+}
+
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct bl_pipe_msg *bl_pipe_msg =
+		container_of(msg, struct bl_pipe_msg, msg);
+
+	if (msg->errno >= 0)
+		return;
+	wake_up(bl_pipe_msg->bl_wq);
+}
+
+static const struct rpc_pipe_ops bl_upcall_ops = {
+	.upcall		= rpc_pipe_generic_upcall,
+	.downcall	= bl_pipe_downcall,
+	.destroy_msg	= bl_pipe_destroy_msg,
+};
+
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+					    struct rpc_pipe *pipe)
+{
+	struct dentry *dir, *dentry;
+
+	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+	if (dir == NULL)
+		return ERR_PTR(-ENOENT);
+	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+	dput(dir);
+	return dentry;
+}
+
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+					  struct rpc_pipe *pipe)
+{
+	if (pipe->dentry)
+		rpc_unlink(pipe->dentry);
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+			   void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct net *net = sb->s_fs_info;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct dentry *dentry;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	if (nn->bl_device_pipe == NULL) {
+		module_put(THIS_MODULE);
+		return 0;
+	}
+
+	switch (event) {
+	case RPC_PIPEFS_MOUNT:
+		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
+		if (IS_ERR(dentry)) {
+			ret = PTR_ERR(dentry);
+			break;
+		}
+		nn->bl_device_pipe->dentry = dentry;
+		break;
+	case RPC_PIPEFS_UMOUNT:
+		if (nn->bl_device_pipe->dentry)
+			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
+	}
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static struct notifier_block nfs4blocklayout_block = {
+	.notifier_call = rpc_pipefs_event,
+};
+
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+						   struct rpc_pipe *pipe)
+{
+	struct super_block *pipefs_sb;
+	struct dentry *dentry;
+
+	pipefs_sb = rpc_get_sb_net(net);
+	if (!pipefs_sb)
+		return NULL;
+	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+	rpc_put_sb_net(net);
+	return dentry;
+}
+
+static void nfs4blocklayout_unregister_net(struct net *net,
+					   struct rpc_pipe *pipe)
+{
+	struct super_block *pipefs_sb;
+
+	pipefs_sb = rpc_get_sb_net(net);
+	if (pipefs_sb) {
+		nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+		rpc_put_sb_net(net);
+	}
+}
+
+static int nfs4blocklayout_net_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct dentry *dentry;
+
+	mutex_init(&nn->bl_mutex);
+	init_waitqueue_head(&nn->bl_wq);
+	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+	if (IS_ERR(nn->bl_device_pipe))
+		return PTR_ERR(nn->bl_device_pipe);
+	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+	if (IS_ERR(dentry)) {
+		rpc_destroy_pipe_data(nn->bl_device_pipe);
+		return PTR_ERR(dentry);
+	}
+	nn->bl_device_pipe->dentry = dentry;
+	return 0;
+}
+
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
+	rpc_destroy_pipe_data(nn->bl_device_pipe);
+	nn->bl_device_pipe = NULL;
+}
+
+static struct pernet_operations nfs4blocklayout_net_ops = {
+	.init = nfs4blocklayout_net_init,
+	.exit = nfs4blocklayout_net_exit,
+};
+
+int __init bl_init_pipefs(void)
+{
+	int ret;
+
+	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+	if (ret)
+		goto out;
+	ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
+	if (ret)
+		goto out_unregister_notifier;
+	return 0;
+
+out_unregister_notifier:
+	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+out:
+	return ret;
+}
+
+void __exit bl_cleanup_pipefs(void)
+{
+	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
diff --git a/kernel/fs/nfs/cache_lib.c b/kernel/fs/nfs/cache_lib.c
new file mode 100644
index 000000000..5f7b05372
--- /dev/null
+++ b/kernel/fs/nfs/cache_lib.c
@@ -0,0 +1,158 @@
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <net/net_namespace.h>
+
+#include "cache_lib.h"
+
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+				"/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+		sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+		"the cache upcall is assumed to have failed");
+
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+	static char *envp[] = { "HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+	char *argv[] = {
+		nfs_cache_getent_prog,
+		cd->name,
+		entry_name,
+		NULL
+	};
+	int ret = -EACCES;
+
+	if (nfs_cache_getent_prog[0] == '\0')
+		goto out;
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+	/*
+	 * Disable the upcall mechanism if we're getting an ENOENT or
+	 * EACCES error. The admin can re-enable it on the fly by using
+	 * sysfs to set the 'cache_getent' parameter once the problem
+	 * has been fixed.
+	 */
+	if (ret == -ENOENT || ret == -EACCES)
+		nfs_cache_getent_prog[0] = '\0';
+out:
+	return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Deferred request handling
+ */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+	if (atomic_dec_and_test(&dreq->count))
+		kfree(dreq);
+}
+
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+	complete_all(&dreq->completion);
+	nfs_cache_defer_req_put(dreq);
+}
+
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = container_of(req, struct nfs_cache_defer_req, req);
+	dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+	atomic_inc(&dreq->count);
+
+	return &dreq->deferred_req;
+}
+
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
+	if (dreq) {
+		init_completion(&dreq->completion);
+		atomic_set(&dreq->count, 1);
+		dreq->req.defer = nfs_dns_cache_defer;
+	}
+	return dreq;
+}
+
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+	if (wait_for_completion_timeout(&dreq->completion,
+			nfs_cache_getent_timeout * HZ) == 0)
+		return -ETIMEDOUT;
+	return 0;
+}
+
+int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
+{
+	int ret;
+	struct dentry *dir;
+
+	dir = rpc_d_lookup_sb(sb, "cache");
+	ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
+	dput(dir);
+	return ret;
+}
+
+int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
+{
+	struct super_block *pipefs_sb;
+	int ret = 0;
+
+	sunrpc_init_cache_detail(cd);
+	pipefs_sb = rpc_get_sb_net(net);
+	if (pipefs_sb) {
+		ret = nfs_cache_register_sb(pipefs_sb, cd);
+		rpc_put_sb_net(net);
+		if (ret)
+			sunrpc_destroy_cache_detail(cd);
+	}
+	return ret;
+}
+
+void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd)
+{
+	if (cd->u.pipefs.dir)
+		sunrpc_cache_unregister_pipefs(cd);
+}
+
+void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
+{
+	struct super_block *pipefs_sb;
+
+	pipefs_sb = rpc_get_sb_net(net);
+	if (pipefs_sb) {
+		nfs_cache_unregister_sb(pipefs_sb, cd);
+		rpc_put_sb_net(net);
+	}
+	sunrpc_destroy_cache_detail(cd);
+}
diff --git a/kernel/fs/nfs/cache_lib.h b/kernel/fs/nfs/cache_lib.h
new file mode 100644
index 000000000..4116d2c3f
--- /dev/null
+++ b/kernel/fs/nfs/cache_lib.h
@@ -0,0 +1,31 @@
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/atomic.h>
+
+/*
+ * Deferred request handling
+ */
+struct nfs_cache_defer_req {
+	struct cache_req req;
+	struct cache_deferred_req deferred_req;
+	struct completion completion;
+	atomic_t count;
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
+extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
+extern int nfs_cache_register_sb(struct super_block *sb,
+				 struct cache_detail *cd);
+extern void nfs_cache_unregister_sb(struct super_block *sb,
+				    struct cache_detail *cd);
diff --git a/kernel/fs/nfs/callback.c b/kernel/fs/nfs/callback.c
new file mode 100644
index 000000000..8d129bb73
--- /dev/null
+++ b/kernel/fs/nfs/callback.c
@@ -0,0 +1,499 @@
+/*
+ * linux/fs/nfs/callback.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback handling
+ */
+
+#include <linux/completion.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/nfs_fs.h>
+#include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include <net/inet_sock.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "internal.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+struct nfs_callback_data {
+	unsigned int users;
+	struct svc_serv *serv;
+	struct svc_rqst *rqst;
+	struct task_struct *task;
+};
+
+static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
+static DEFINE_MUTEX(nfs_callback_mutex);
+static struct svc_program nfs4_callback_program;
+
+static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+	int ret;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	ret = svc_create_xprt(serv, "tcp", net, PF_INET,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret <= 0)
+		goto out_err;
+	nn->nfs_callback_tcpport = ret;
+	dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
+			nn->nfs_callback_tcpport, PF_INET, net);
+
+	ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret > 0) {
+		nn->nfs_callback_tcpport6 = ret;
+		dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
+				nn->nfs_callback_tcpport6, PF_INET6, net);
+	} else if (ret != -EAFNOSUPPORT)
+		goto out_err;
+	return 0;
+
+out_err:
+	return (ret) ? ret : -ENOMEM;
+}
+
+/*
+ * This is the NFSv4 callback kernel thread.
+ */
+static int
+nfs4_callback_svc(void *vrqstp)
+{
+	int err;
+	struct svc_rqst *rqstp = vrqstp;
+
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		/*
+		 * Listen for a request on the socket
+		 */
+		err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
+		if (err == -EAGAIN || err == -EINTR)
+			continue;
+		svc_process(rqstp);
+	}
+	return 0;
+}
+
+/*
+ * Prepare to bring up the NFSv4 callback service
+ */
+static struct svc_rqst *
+nfs4_callback_up(struct svc_serv *serv)
+{
+	return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+	/*
+	 * Create an svc_sock for the back channel service that shares the
+	 * fore channel connection.
+	 * Returns the input port (0) and sets the svc_serv bc_xprt on success
+	 */
+	return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
+			      SVC_SOCK_ANONYMOUS);
+}
+
+/*
+ * The callback service for NFSv4.1 callbacks
+ */
+static int
+nfs41_callback_svc(void *vrqstp)
+{
+	struct svc_rqst *rqstp = vrqstp;
+	struct svc_serv *serv = rqstp->rq_server;
+	struct rpc_rqst *req;
+	int error;
+	DEFINE_WAIT(wq);
+
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		if (try_to_freeze())
+			continue;
+
+		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+		spin_lock_bh(&serv->sv_cb_lock);
+		if (!list_empty(&serv->sv_cb_list)) {
+			req = list_first_entry(&serv->sv_cb_list,
+					struct rpc_rqst, rq_bc_list);
+			list_del(&req->rq_bc_list);
+			spin_unlock_bh(&serv->sv_cb_lock);
+			finish_wait(&serv->sv_cb_waitq, &wq);
+			dprintk("Invoking bc_svc_process()\n");
+			error = bc_svc_process(serv, req, rqstp);
+			dprintk("bc_svc_process() returned w/ error code= %d\n",
+				error);
+		} else {
+			spin_unlock_bh(&serv->sv_cb_lock);
+			schedule();
+			finish_wait(&serv->sv_cb_waitq, &wq);
+		}
+		flush_signals(current);
+	}
+	return 0;
+}
+
+/*
+ * Bring up the NFSv4.1 callback service
+ */
+static struct svc_rqst *
+nfs41_callback_up(struct svc_serv *serv)
+{
+	struct svc_rqst *rqstp;
+
+	INIT_LIST_HEAD(&serv->sv_cb_list);
+	spin_lock_init(&serv->sv_cb_lock);
+	init_waitqueue_head(&serv->sv_cb_waitq);
+	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+	if (IS_ERR(rqstp)) {
+		svc_xprt_put(serv->sv_bc_xprt);
+		serv->sv_bc_xprt = NULL;
+	}
+	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
+	return rqstp;
+}
+
+static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
+		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+{
+	*rqstpp = nfs41_callback_up(serv);
+	*callback_svc = nfs41_callback_svc;
+}
+
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+		struct svc_serv *serv)
+{
+	if (minorversion)
+		/*
+		 * Save the svc_serv in the transport so that it can
+		 * be referenced when the session backchannel is initialized
+		 */
+		xprt->bc_serv = serv;
+}
+#else
+static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+	return 0;
+}
+
+static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
+		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+{
+	*rqstpp = ERR_PTR(-ENOTSUPP);
+	*callback_svc = ERR_PTR(-ENOTSUPP);
+}
+
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+		struct svc_serv *serv)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
+				  struct svc_serv *serv)
+{
+	struct svc_rqst *rqstp;
+	int (*callback_svc)(void *vrqstp);
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	int ret;
+
+	nfs_callback_bc_serv(minorversion, xprt, serv);
+
+	if (cb_info->task)
+		return 0;
+
+	switch (minorversion) {
+	case 0:
+		/* v4.0 callback setup */
+		rqstp = nfs4_callback_up(serv);
+		callback_svc = nfs4_callback_svc;
+		break;
+	default:
+		nfs_minorversion_callback_svc_setup(serv,
+				&rqstp, &callback_svc);
+	}
+
+	if (IS_ERR(rqstp))
+		return PTR_ERR(rqstp);
+
+	svc_sock_update_bufs(serv);
+
+	cb_info->serv = serv;
+	cb_info->rqst = rqstp;
+	cb_info->task = kthread_create(callback_svc, cb_info->rqst,
+				    "nfsv4.%u-svc", minorversion);
+	if (IS_ERR(cb_info->task)) {
+		ret = PTR_ERR(cb_info->task);
+		svc_exit_thread(cb_info->rqst);
+		cb_info->rqst = NULL;
+		cb_info->task = NULL;
+		return ret;
+	}
+	rqstp->rq_task = cb_info->task;
+	wake_up_process(cb_info->task);
+	dprintk("nfs_callback_up: service started\n");
+	return 0;
+}
+
+static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	if (--nn->cb_users[minorversion])
+		return;
+
+	dprintk("NFS: destroy per-net callback data; net=%p\n", net);
+	svc_shutdown_net(serv, net);
+}
+
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	int ret;
+
+	if (nn->cb_users[minorversion]++)
+		return 0;
+
+	dprintk("NFS: create per-net callback data; net=%p\n", net);
+
+	ret = svc_bind(serv, net);
+	if (ret < 0) {
+		printk(KERN_WARNING "NFS: bind callback service failed\n");
+		goto err_bind;
+	}
+
+	switch (minorversion) {
+		case 0:
+			ret = nfs4_callback_up_net(serv, net);
+			break;
+		case 1:
+		case 2:
+			ret = nfs41_callback_up_net(serv, net);
+			break;
+		default:
+			printk(KERN_ERR "NFS: unknown callback version: %d\n",
+					minorversion);
+			ret = -EINVAL;
+			break;
+	}
+
+	if (ret < 0) {
+		printk(KERN_ERR "NFS: callback service start failed\n");
+		goto err_socks;
+	}
+	return 0;
+
+err_socks:
+	svc_rpcb_cleanup(serv, net);
+err_bind:
+	dprintk("NFS: Couldn't create callback socket: err = %d; "
+			"net = %p\n", ret, net);
+	return ret;
+}
+
+static struct svc_serv *nfs_callback_create_svc(int minorversion)
+{
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	struct svc_serv *serv;
+
+	/*
+	 * Check whether we're already up and running.
+	 */
+	if (cb_info->task) {
+		/*
+		 * Note: increase service usage, because later in case of error
+		 * svc_destroy() will be called.
+		 */
+		svc_get(cb_info->serv);
+		return cb_info->serv;
+	}
+
+	/*
+	 * Sanity check: if there's no task,
+	 * we should be the first user ...
+	 */
+	if (cb_info->users)
+		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
+			cb_info->users);
+
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+	if (!serv) {
+		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	/* As there is only one thread we need to over-ride the
+	 * default maximum of 80 connections
+	 */
+	serv->sv_maxconn = 1024;
+	dprintk("nfs_callback_create_svc: service created\n");
+	return serv;
+}
+
+/*
+ * Bring up the callback thread if it is not already up.
+ */
+int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+{
+	struct svc_serv *serv;
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	int ret;
+	struct net *net = xprt->xprt_net;
+
+	mutex_lock(&nfs_callback_mutex);
+
+	serv = nfs_callback_create_svc(minorversion);
+	if (IS_ERR(serv)) {
+		ret = PTR_ERR(serv);
+		goto err_create;
+	}
+
+	ret = nfs_callback_up_net(minorversion, serv, net);
+	if (ret < 0)
+		goto err_net;
+
+	ret = nfs_callback_start_svc(minorversion, xprt, serv);
+	if (ret < 0)
+		goto err_start;
+
+	cb_info->users++;
+	/*
+	 * svc_create creates the svc_serv with sv_nrthreads == 1, and then
+	 * svc_prepare_thread increments that. So we need to call svc_destroy
+	 * on both success and failure so that the refcount is 1 when the
+	 * thread exits.
+	 */
+err_net:
+	svc_destroy(serv);
+err_create:
+	mutex_unlock(&nfs_callback_mutex);
+	return ret;
+
+err_start:
+	nfs_callback_down_net(minorversion, serv, net);
+	dprintk("NFS: Couldn't create server thread; err = %d\n", ret);
+	goto err_net;
+}
+
+/*
+ * Kill the callback thread if it's no longer being used.
+ */
+void nfs_callback_down(int minorversion, struct net *net)
+{
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+
+	mutex_lock(&nfs_callback_mutex);
+	nfs_callback_down_net(minorversion, cb_info->serv, net);
+	cb_info->users--;
+	if (cb_info->users == 0 && cb_info->task != NULL) {
+		kthread_stop(cb_info->task);
+		dprintk("nfs_callback_down: service stopped\n");
+		svc_exit_thread(cb_info->rqst);
+		dprintk("nfs_callback_down: service destroyed\n");
+		cb_info->serv = NULL;
+		cb_info->rqst = NULL;
+		cb_info->task = NULL;
+	}
+	mutex_unlock(&nfs_callback_mutex);
+}
+
+/* Boolean check of RPC_AUTH_GSS principal */
+int
+check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
+{
+	char *p = rqstp->rq_cred.cr_principal;
+
+	if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
+		return 1;
+
+	/* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
+	if (clp->cl_minorversion != 0)
+		return 0;
+	/*
+	 * It might just be a normal user principal, in which case
+	 * userspace won't bother to tell us the name at all.
+	 */
+	if (p == NULL)
+		return 0;
+
+	/*
+	 * Did we get the acceptor from userland during the SETCLIENID
+	 * negotiation?
+	 */
+	if (clp->cl_acceptor)
+		return !strcmp(p, clp->cl_acceptor);
+
+	/*
+	 * Otherwise try to verify it using the cl_hostname. Note that this
+	 * doesn't work if a non-canonical hostname was used in the devname.
+	 */
+
+	/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
+
+	if (memcmp(p, "nfs@", 4) != 0)
+		return 0;
+	p += 4;
+	if (strcmp(p, clp->cl_hostname) != 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * pg_authenticate method for nfsv4 callback threads.
+ *
+ * The authflavor has been negotiated, so an incorrect flavor is a server
+ * bug. Drop packets with incorrect authflavor.
+ *
+ * All other checking done after NFS decoding where the nfs_client can be
+ * found in nfs4_callback_compound
+ */
+static int nfs_callback_authenticate(struct svc_rqst *rqstp)
+{
+	switch (rqstp->rq_authop->flavour) {
+	case RPC_AUTH_NULL:
+		if (rqstp->rq_proc != CB_NULL)
+			return SVC_DROP;
+		break;
+	case RPC_AUTH_GSS:
+		/* No RPC_AUTH_GSS support yet in NFSv4.1 */
+		 if (svc_is_backchannel(rqstp))
+			return SVC_DROP;
+	}
+	return SVC_OK;
+}
+
+/*
+ * Define NFS4 callback program
+ */
+static struct svc_version *nfs4_callback_version[] = {
+	[1] = &nfs4_callback_version1,
+	[4] = &nfs4_callback_version4,
+};
+
+static struct svc_stat nfs4_callback_stats;
+
+static struct svc_program nfs4_callback_program = {
+	.pg_prog = NFS4_CALLBACK,			/* RPC service number */
+	.pg_nvers = ARRAY_SIZE(nfs4_callback_version),	/* Number of entries */
+	.pg_vers = nfs4_callback_version,		/* version table */
+	.pg_name = "NFSv4 callback",			/* service name */
+	.pg_class = "nfs",				/* authentication class */
+	.pg_stats = &nfs4_callback_stats,
+	.pg_authenticate = nfs_callback_authenticate,
+};
diff --git a/kernel/fs/nfs/callback.h b/kernel/fs/nfs/callback.h
new file mode 100644
index 000000000..84326e9fb
--- /dev/null
+++ b/kernel/fs/nfs/callback.h
@@ -0,0 +1,214 @@
+/*
+ * linux/fs/nfs/callback.h
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback definitions
+ */
+#ifndef __LINUX_FS_NFS_CALLBACK_H
+#define __LINUX_FS_NFS_CALLBACK_H
+#include <linux/sunrpc/svc.h>
+
+#define NFS4_CALLBACK 0x40000000
+#define NFS4_CALLBACK_XDRSIZE 2048
+#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
+
+enum nfs4_callback_procnum {
+	CB_NULL = 0,
+	CB_COMPOUND = 1,
+};
+
+enum nfs4_callback_opnum {
+	OP_CB_GETATTR = 3,
+	OP_CB_RECALL  = 4,
+/* Callback operations new to NFSv4.1 */
+	OP_CB_LAYOUTRECALL  = 5,
+	OP_CB_NOTIFY        = 6,
+	OP_CB_PUSH_DELEG    = 7,
+	OP_CB_RECALL_ANY    = 8,
+	OP_CB_RECALLABLE_OBJ_AVAIL = 9,
+	OP_CB_RECALL_SLOT   = 10,
+	OP_CB_SEQUENCE      = 11,
+	OP_CB_WANTS_CANCELLED = 12,
+	OP_CB_NOTIFY_LOCK   = 13,
+	OP_CB_NOTIFY_DEVICEID = 14,
+/* Callback operations new to NFSv4.2 */
+	OP_CB_OFFLOAD = 15,
+	OP_CB_ILLEGAL = 10044,
+};
+
+struct cb_process_state {
+	__be32			drc_status;
+	struct nfs_client	*clp;
+	u32			slotid;
+	u32			minorversion;
+	struct net		*net;
+};
+
+struct cb_compound_hdr_arg {
+	unsigned int taglen;
+	const char *tag;
+	unsigned int minorversion;
+	unsigned int cb_ident; /* v4.0 callback identifier */
+	unsigned nops;
+};
+
+struct cb_compound_hdr_res {
+	__be32 *status;
+	unsigned int taglen;
+	const char *tag;
+	__be32 *nops;
+};
+
+struct cb_getattrargs {
+	struct sockaddr *addr;
+	struct nfs_fh fh;
+	uint32_t bitmap[2];
+};
+
+struct cb_getattrres {
+	__be32 status;
+	uint32_t bitmap[2];
+	uint64_t size;
+	uint64_t change_attr;
+	struct timespec ctime;
+	struct timespec mtime;
+};
+
+struct cb_recallargs {
+	struct sockaddr *addr;
+	struct nfs_fh fh;
+	nfs4_stateid stateid;
+	uint32_t truncate;
+};
+
+#if defined(CONFIG_NFS_V4_1)
+
+struct referring_call {
+	uint32_t			rc_sequenceid;
+	uint32_t			rc_slotid;
+};
+
+struct referring_call_list {
+	struct nfs4_sessionid		rcl_sessionid;
+	uint32_t			rcl_nrefcalls;
+	struct referring_call 		*rcl_refcalls;
+};
+
+struct cb_sequenceargs {
+	struct sockaddr			*csa_addr;
+	struct nfs4_sessionid		csa_sessionid;
+	uint32_t			csa_sequenceid;
+	uint32_t			csa_slotid;
+	uint32_t			csa_highestslotid;
+	uint32_t			csa_cachethis;
+	uint32_t			csa_nrclists;
+	struct referring_call_list	*csa_rclists;
+};
+
+struct cb_sequenceres {
+	__be32				csr_status;
+	struct nfs4_sessionid		csr_sessionid;
+	uint32_t			csr_sequenceid;
+	uint32_t			csr_slotid;
+	uint32_t			csr_highestslotid;
+	uint32_t			csr_target_highestslotid;
+};
+
+extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
+				       struct cb_sequenceres *res,
+				       struct cb_process_state *cps);
+
+extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
+					     const nfs4_stateid *stateid);
+
+#define RCA4_TYPE_MASK_RDATA_DLG	0
+#define RCA4_TYPE_MASK_WDATA_DLG	1
+#define RCA4_TYPE_MASK_DIR_DLG         2
+#define RCA4_TYPE_MASK_FILE_LAYOUT     3
+#define RCA4_TYPE_MASK_BLK_LAYOUT      4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN  8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX  9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
+#define RCA4_TYPE_MASK_ALL 0xf31f
+
+struct cb_recallanyargs {
+	struct sockaddr	*craa_addr;
+	uint32_t	craa_objs_to_keep;
+	uint32_t	craa_type_mask;
+};
+
+extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
+					void *dummy,
+					struct cb_process_state *cps);
+
+struct cb_recallslotargs {
+	struct sockaddr	*crsa_addr;
+	uint32_t	crsa_target_highest_slotid;
+};
+extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
+					 void *dummy,
+					 struct cb_process_state *cps);
+
+struct cb_layoutrecallargs {
+	struct sockaddr		*cbl_addr;
+	uint32_t		cbl_recall_type;
+	uint32_t		cbl_layout_type;
+	uint32_t		cbl_layoutchanged;
+	union {
+		struct {
+			struct nfs_fh		cbl_fh;
+			struct pnfs_layout_range cbl_range;
+			nfs4_stateid		cbl_stateid;
+		};
+		struct nfs_fsid		cbl_fsid;
+	};
+};
+
+extern __be32 nfs4_callback_layoutrecall(
+	struct cb_layoutrecallargs *args,
+	void *dummy, struct cb_process_state *cps);
+
+struct cb_devicenotifyitem {
+	uint32_t		cbd_notify_type;
+	uint32_t		cbd_layout_type;
+	struct nfs4_deviceid	cbd_dev_id;
+	uint32_t		cbd_immediate;
+};
+
+struct cb_devicenotifyargs {
+	int				 ndevs;
+	struct cb_devicenotifyitem	 *devs;
+};
+
+extern __be32 nfs4_callback_devicenotify(
+	struct cb_devicenotifyargs *args,
+	void *dummy, struct cb_process_state *cps);
+
+#endif /* CONFIG_NFS_V4_1 */
+extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
+extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+				    struct cb_getattrres *res,
+				    struct cb_process_state *cps);
+extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+				   struct cb_process_state *cps);
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
+extern void nfs_callback_down(int minorversion, struct net *net);
+extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
+					    const nfs4_stateid *stateid);
+extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
+#endif /* CONFIG_NFS_V4 */
+/*
+ * nfs41: Callbacks are expected to not cause substantial latency,
+ * so we limit their concurrency to 1 by setting up the maximum number
+ * of slots for the backchannel.
+ */
+#define NFS41_BC_MIN_CALLBACKS 1
+#define NFS41_BC_MAX_CALLBACKS 1
+
+extern unsigned int nfs_callback_set_tcpport;
+extern unsigned short nfs_callback_tcpport;
+
+#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/kernel/fs/nfs/callback_proc.c b/kernel/fs/nfs/callback_proc.c
new file mode 100644
index 000000000..197806fb8
--- /dev/null
+++ b/kernel/fs/nfs/callback_proc.c
@@ -0,0 +1,554 @@
+/*
+ * linux/fs/nfs/callback_proc.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback procedures
+ */
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "internal.h"
+#include "pnfs.h"
+#include "nfs4session.h"
+#include "nfs4trace.h"
+
+#ifdef NFS_DEBUG
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+#endif
+
+__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+			     struct cb_getattrres *res,
+			     struct cb_process_state *cps)
+{
+	struct nfs_delegation *delegation;
+	struct nfs_inode *nfsi;
+	struct inode *inode;
+
+	res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+		goto out;
+
+	res->bitmap[0] = res->bitmap[1] = 0;
+	res->status = htonl(NFS4ERR_BADHANDLE);
+
+	dprintk_rcu("NFS: GETATTR callback request from %s\n",
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+	if (inode == NULL)
+		goto out;
+	nfsi = NFS_I(inode);
+	rcu_read_lock();
+	delegation = rcu_dereference(nfsi->delegation);
+	if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
+		goto out_iput;
+	res->size = i_size_read(inode);
+	res->change_attr = delegation->change_attr;
+	if (nfsi->nrequests != 0)
+		res->change_attr++;
+	res->ctime = inode->i_ctime;
+	res->mtime = inode->i_mtime;
+	res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
+		args->bitmap[0];
+	res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
+		args->bitmap[1];
+	res->status = 0;
+out_iput:
+	rcu_read_unlock();
+	iput(inode);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
+	return res->status;
+}
+
+__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+			    struct cb_process_state *cps)
+{
+	struct inode *inode;
+	__be32 res;
+	
+	res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+		goto out;
+
+	dprintk_rcu("NFS: RECALL callback request from %s\n",
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	res = htonl(NFS4ERR_BADHANDLE);
+	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+	if (inode == NULL)
+		goto out;
+	/* Set up a helper thread to actually return the delegation */
+	switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
+	case 0:
+		res = 0;
+		break;
+	case -ENOENT:
+		res = htonl(NFS4ERR_BAD_STATEID);
+		break;
+	default:
+		res = htonl(NFS4ERR_RESOURCE);
+	}
+	trace_nfs4_recall_delegation(inode, -ntohl(res));
+	iput(inode);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
+	return res;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Lookup a layout by filehandle.
+ *
+ * Note: gets a refcount on the layout hdr and on its respective inode.
+ * Caller must put the layout hdr and the inode.
+ *
+ * TODO: keep track of all layouts (and delegations) in a hash table
+ * hashed by filehandle.
+ */
+static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
+		struct nfs_fh *fh, nfs4_stateid *stateid)
+{
+	struct nfs_server *server;
+	struct inode *ino;
+	struct pnfs_layout_hdr *lo;
+
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry(lo, &server->layouts, plh_layouts) {
+			if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+				continue;
+			if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
+				continue;
+			ino = igrab(lo->plh_inode);
+			if (!ino)
+				break;
+			spin_lock(&ino->i_lock);
+			/* Is this layout in the process of being freed? */
+			if (NFS_I(ino)->layout != lo) {
+				spin_unlock(&ino->i_lock);
+				iput(ino);
+				break;
+			}
+			pnfs_get_layout_hdr(lo);
+			spin_unlock(&ino->i_lock);
+			return lo;
+		}
+	}
+
+	return NULL;
+}
+
+static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
+		struct nfs_fh *fh, nfs4_stateid *stateid)
+{
+	struct pnfs_layout_hdr *lo;
+
+	spin_lock(&clp->cl_lock);
+	rcu_read_lock();
+	lo = get_layout_by_fh_locked(clp, fh, stateid);
+	rcu_read_unlock();
+	spin_unlock(&clp->cl_lock);
+
+	return lo;
+}
+
+static u32 initiate_file_draining(struct nfs_client *clp,
+				  struct cb_layoutrecallargs *args)
+{
+	struct inode *ino;
+	struct pnfs_layout_hdr *lo;
+	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+	LIST_HEAD(free_me_list);
+
+	lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
+	if (!lo)
+		goto out;
+
+	ino = lo->plh_inode;
+
+	spin_lock(&ino->i_lock);
+	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+	spin_unlock(&ino->i_lock);
+
+	pnfs_layoutcommit_inode(ino, false);
+
+	spin_lock(&ino->i_lock);
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
+					&args->cbl_range)) {
+		rv = NFS4ERR_DELAY;
+		goto unlock;
+	}
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+			&args->cbl_range);
+	}
+unlock:
+	spin_unlock(&ino->i_lock);
+	pnfs_free_lseg_list(&free_me_list);
+	pnfs_put_layout_hdr(lo);
+	iput(ino);
+out:
+	return rv;
+}
+
+static u32 initiate_bulk_draining(struct nfs_client *clp,
+				  struct cb_layoutrecallargs *args)
+{
+	int stat;
+
+	if (args->cbl_recall_type == RETURN_FSID)
+		stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true);
+	else
+		stat = pnfs_destroy_layouts_byclid(clp, true);
+	if (stat != 0)
+		return NFS4ERR_DELAY;
+	return NFS4ERR_NOMATCHING_LAYOUT;
+}
+
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+				    struct cb_layoutrecallargs *args)
+{
+	u32 res;
+
+	dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+	if (args->cbl_recall_type == RETURN_FILE)
+		res = initiate_file_draining(clp, args);
+	else
+		res = initiate_bulk_draining(clp, args);
+	dprintk("%s returning %i\n", __func__, res);
+	return res;
+
+}
+
+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
+				  void *dummy, struct cb_process_state *cps)
+{
+	u32 res;
+
+	dprintk("%s: -->\n", __func__);
+
+	if (cps->clp)
+		res = do_callback_layoutrecall(cps->clp, args);
+	else
+		res = NFS4ERR_OP_NOT_IN_SESSION;
+
+	dprintk("%s: exit with status = %d\n", __func__, res);
+	return cpu_to_be32(res);
+}
+
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+	struct cb_layoutrecallargs args;
+
+	/* Pretend we got a CB_LAYOUTRECALL(ALL) */
+	memset(&args, 0, sizeof(args));
+	args.cbl_recall_type = RETURN_ALL;
+	/* FIXME we ignore errors, what should we do? */
+	do_callback_layoutrecall(clp, &args);
+}
+
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+				  void *dummy, struct cb_process_state *cps)
+{
+	int i;
+	__be32 res = 0;
+	struct nfs_client *clp = cps->clp;
+	struct nfs_server *server = NULL;
+
+	dprintk("%s: -->\n", __func__);
+
+	if (!clp) {
+		res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+		goto out;
+	}
+
+	for (i = 0; i < args->ndevs; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		if (!server ||
+		    server->pnfs_curr_ld->id != dev->cbd_layout_type) {
+			rcu_read_lock();
+			list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+				if (server->pnfs_curr_ld &&
+				    server->pnfs_curr_ld->id == dev->cbd_layout_type) {
+					rcu_read_unlock();
+					goto found;
+				}
+			rcu_read_unlock();
+			dprintk("%s: layout type %u not found\n",
+				__func__, dev->cbd_layout_type);
+			continue;
+		}
+
+	found:
+		nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+	}
+
+out:
+	kfree(args->devs);
+	dprintk("%s: exit with status = %u\n",
+		__func__, be32_to_cpu(res));
+	return res;
+}
+
+/*
+ * Validate the sequenceID sent by the server.
+ * Return success if the sequenceID is one more than what we last saw on
+ * this slot, accounting for wraparound.  Increments the slot's sequence.
+ *
+ * We don't yet implement a duplicate request cache, instead we set the
+ * back channel ca_maxresponsesize_cached to zero. This is OK for now
+ * since we only currently implement idempotent callbacks anyway.
+ *
+ * We have a single slot backchannel at this time, so we don't bother
+ * checking the used_slots bit array on the table.  The lower layer guarantees
+ * a single outstanding callback request at a time.
+ */
+static __be32
+validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
+{
+	struct nfs4_slot *slot;
+
+	dprintk("%s enter. slotid %u seqid %u\n",
+		__func__, args->csa_slotid, args->csa_sequenceid);
+
+	if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
+		return htonl(NFS4ERR_BADSLOT);
+
+	slot = tbl->slots + args->csa_slotid;
+	dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
+
+	/* Normal */
+	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
+		slot->seq_nr++;
+		goto out_ok;
+	}
+
+	/* Replay */
+	if (args->csa_sequenceid == slot->seq_nr) {
+		dprintk("%s seqid %u is a replay\n",
+			__func__, args->csa_sequenceid);
+		/* Signal process_op to set this error on next op */
+		if (args->csa_cachethis == 0)
+			return htonl(NFS4ERR_RETRY_UNCACHED_REP);
+
+		/* The ca_maxresponsesize_cached is 0 with no DRC */
+		else if (args->csa_cachethis == 1)
+			return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+	}
+
+	/* Wraparound */
+	if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
+		slot->seq_nr = 1;
+		goto out_ok;
+	}
+
+	/* Misordered request */
+	return htonl(NFS4ERR_SEQ_MISORDERED);
+out_ok:
+	tbl->highest_used_slotid = args->csa_slotid;
+	return htonl(NFS4_OK);
+}
+
+/*
+ * For each referring call triple, check the session's slot table for
+ * a match.  If the slot is in use and the sequence numbers match, the
+ * client is still waiting for a response to the original request.
+ */
+static bool referring_call_exists(struct nfs_client *clp,
+				  uint32_t nrclists,
+				  struct referring_call_list *rclists)
+{
+	bool status = 0;
+	int i, j;
+	struct nfs4_session *session;
+	struct nfs4_slot_table *tbl;
+	struct referring_call_list *rclist;
+	struct referring_call *ref;
+
+	/*
+	 * XXX When client trunking is implemented, this becomes
+	 * a session lookup from within the loop
+	 */
+	session = clp->cl_session;
+	tbl = &session->fc_slot_table;
+
+	for (i = 0; i < nrclists; i++) {
+		rclist = &rclists[i];
+		if (memcmp(session->sess_id.data,
+			   rclist->rcl_sessionid.data,
+			   NFS4_MAX_SESSIONID_LEN) != 0)
+			continue;
+
+		for (j = 0; j < rclist->rcl_nrefcalls; j++) {
+			ref = &rclist->rcl_refcalls[j];
+
+			dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
+				"slotid %u\n", __func__,
+				((u32 *)&rclist->rcl_sessionid.data)[0],
+				((u32 *)&rclist->rcl_sessionid.data)[1],
+				((u32 *)&rclist->rcl_sessionid.data)[2],
+				((u32 *)&rclist->rcl_sessionid.data)[3],
+				ref->rc_sequenceid, ref->rc_slotid);
+
+			spin_lock(&tbl->slot_tbl_lock);
+			status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
+				  tbl->slots[ref->rc_slotid].seq_nr ==
+					ref->rc_sequenceid);
+			spin_unlock(&tbl->slot_tbl_lock);
+			if (status)
+				goto out;
+		}
+	}
+
+out:
+	return status;
+}
+
+__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
+			      struct cb_sequenceres *res,
+			      struct cb_process_state *cps)
+{
+	struct nfs4_slot_table *tbl;
+	struct nfs_client *clp;
+	int i;
+	__be32 status = htonl(NFS4ERR_BADSESSION);
+
+	clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
+					 &args->csa_sessionid, cps->minorversion);
+	if (clp == NULL)
+		goto out;
+
+	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+		goto out;
+	tbl = &clp->cl_session->bc_slot_table;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	/* state manager is resetting the session */
+	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
+		spin_unlock(&tbl->slot_tbl_lock);
+		status = htonl(NFS4ERR_DELAY);
+		/* Return NFS4ERR_BADSESSION if we're draining the session
+		 * in order to reset it.
+		 */
+		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+			status = htonl(NFS4ERR_BADSESSION);
+		goto out;
+	}
+
+	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
+	spin_unlock(&tbl->slot_tbl_lock);
+	if (status)
+		goto out;
+
+	cps->slotid = args->csa_slotid;
+
+	/*
+	 * Check for pending referring calls.  If a match is found, a
+	 * related callback was received before the response to the original
+	 * call.
+	 */
+	if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out;
+	}
+
+	memcpy(&res->csr_sessionid, &args->csa_sessionid,
+	       sizeof(res->csr_sessionid));
+	res->csr_sequenceid = args->csa_sequenceid;
+	res->csr_slotid = args->csa_slotid;
+	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+
+out:
+	cps->clp = clp; /* put in nfs4_callback_compound */
+	for (i = 0; i < args->csa_nrclists; i++)
+		kfree(args->csa_rclists[i].rcl_refcalls);
+	kfree(args->csa_rclists);
+
+	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
+		cps->drc_status = status;
+		status = 0;
+	} else
+		res->csr_status = status;
+
+	trace_nfs4_cb_sequence(args, res, status);
+	dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
+		ntohl(status), ntohl(res->csr_status));
+	return status;
+}
+
+static bool
+validate_bitmap_values(unsigned long mask)
+{
+	return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
+}
+
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
+			       struct cb_process_state *cps)
+{
+	__be32 status;
+	fmode_t flags = 0;
+
+	status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+	if (!cps->clp) /* set in cb_sequence */
+		goto out;
+
+	dprintk_rcu("NFS: RECALL_ANY callback request from %s\n",
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	status = cpu_to_be32(NFS4ERR_INVAL);
+	if (!validate_bitmap_values(args->craa_type_mask))
+		goto out;
+
+	status = cpu_to_be32(NFS4_OK);
+	if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
+		     &args->craa_type_mask))
+		flags = FMODE_READ;
+	if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
+		     &args->craa_type_mask))
+		flags |= FMODE_WRITE;
+	if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
+		     &args->craa_type_mask))
+		pnfs_recall_all_layouts(cps->clp);
+	if (flags)
+		nfs_expire_unused_delegation_types(cps->clp, flags);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+/* Reduce the fore channel's max_slots to the target value */
+__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
+				struct cb_process_state *cps)
+{
+	struct nfs4_slot_table *fc_tbl;
+	__be32 status;
+
+	status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	if (!cps->clp) /* set in cb_sequence */
+		goto out;
+
+	dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n",
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+		args->crsa_target_highest_slotid);
+
+	fc_tbl = &cps->clp->cl_session->fc_slot_table;
+
+	status = htonl(NFS4_OK);
+
+	nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
+	nfs41_server_notify_target_slotid_update(cps->clp);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
diff --git a/kernel/fs/nfs/callback_xdr.c b/kernel/fs/nfs/callback_xdr.c
new file mode 100644
index 000000000..19ca95cdf
--- /dev/null
+++ b/kernel/fs/nfs/callback_xdr.c
@@ -0,0 +1,1033 @@
+/*
+ * linux/fs/nfs/callback_xdr.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback encode/decode procedures
+ */
+#include <linux/kernel.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "internal.h"
+#include "nfs4session.h"
+
+#define CB_OP_TAGLEN_MAXSZ	(512)
+#define CB_OP_HDR_RES_MAXSZ	(2 + CB_OP_TAGLEN_MAXSZ)
+#define CB_OP_GETATTR_BITMAP_MAXSZ	(4)
+#define CB_OP_GETATTR_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
+				CB_OP_GETATTR_BITMAP_MAXSZ + \
+				2 + 2 + 3 + 3)
+#define CB_OP_RECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+
+#if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
+					4 + 1 + 3)
+#define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_RECALLSLOT_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#endif /* CONFIG_NFS_V4_1 */
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+/* Internal error code */
+#define NFS4ERR_RESOURCE_HDR	11050
+
+typedef __be32 (*callback_process_op_t)(void *, void *,
+					struct cb_process_state *);
+typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
+typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
+
+
+struct callback_op {
+	callback_process_op_t process_op;
+	callback_decode_arg_t decode_args;
+	callback_encode_res_t encode_res;
+	long res_maxsize;
+};
+
+static struct callback_op callback_ops[];
+
+static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	return htonl(NFS4_OK);
+}
+
+static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_argsize_check(rqstp, p);
+}
+
+static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_ressize_check(rqstp, p);
+}
+
+static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, nbytes);
+	if (unlikely(p == NULL))
+		printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
+	return p;
+}
+
+static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
+{
+	__be32 *p;
+
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	*len = ntohl(*p);
+
+	if (*len != 0) {
+		p = read_buf(xdr, *len);
+		if (unlikely(p == NULL))
+			return htonl(NFS4ERR_RESOURCE);
+		*str = (const char *)p;
+	} else
+		*str = NULL;
+
+	return 0;
+}
+
+static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	fh->size = ntohl(*p);
+	if (fh->size > NFS4_FHSIZE)
+		return htonl(NFS4ERR_BADHANDLE);
+	p = read_buf(xdr, fh->size);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	memcpy(&fh->data[0], p, fh->size);
+	memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size);
+	return 0;
+}
+
+static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+	__be32 *p;
+	unsigned int attrlen;
+
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	attrlen = ntohl(*p);
+	p = read_buf(xdr, attrlen << 2);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	if (likely(attrlen > 0))
+		bitmap[0] = ntohl(*p++);
+	if (attrlen > 1)
+		bitmap[1] = ntohl(*p);
+	return 0;
+}
+
+static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	__be32 *p;
+
+	p = read_buf(xdr, NFS4_STATEID_SIZE);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	memcpy(stateid, p, NFS4_STATEID_SIZE);
+	return 0;
+}
+
+static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
+{
+	__be32 *p;
+	__be32 status;
+
+	status = decode_string(xdr, &hdr->taglen, &hdr->tag);
+	if (unlikely(status != 0))
+		return status;
+	/* We do not like overly long tags! */
+	if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
+		printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
+				__func__, hdr->taglen);
+		return htonl(NFS4ERR_RESOURCE);
+	}
+	p = read_buf(xdr, 12);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	hdr->minorversion = ntohl(*p++);
+	/* Check for minor version support */
+	if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
+		hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
+	} else {
+		pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
+			"illegal minor version %u!\n",
+			__func__, hdr->minorversion);
+		return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+	}
+	hdr->nops = ntohl(*p);
+	dprintk("%s: minorversion %d nops %d\n", __func__,
+		hdr->minorversion, hdr->nops);
+	return 0;
+}
+
+static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
+{
+	__be32 *p;
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE_HDR);
+	*op = ntohl(*p);
+	return 0;
+}
+
+static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
+{
+	__be32 status;
+
+	status = decode_fh(xdr, &args->fh);
+	if (unlikely(status != 0))
+		goto out;
+	args->addr = svc_addr(rqstp);
+	status = decode_bitmap(xdr, args->bitmap);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
+{
+	__be32 *p;
+	__be32 status;
+
+	args->addr = svc_addr(rqstp);
+	status = decode_stateid(xdr, &args->stateid);
+	if (unlikely(status != 0))
+		goto out;
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_RESOURCE);
+		goto out;
+	}
+	args->truncate = ntohl(*p);
+	status = decode_fh(xdr, &args->fh);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct cb_layoutrecallargs *args)
+{
+	__be32 *p;
+	__be32 status = 0;
+	uint32_t iomode;
+
+	args->cbl_addr = svc_addr(rqstp);
+	p = read_buf(xdr, 4 * sizeof(uint32_t));
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+
+	args->cbl_layout_type = ntohl(*p++);
+	/* Depite the spec's xdr, iomode really belongs in the FILE switch,
+	 * as it is unusable and ignored with the other types.
+	 */
+	iomode = ntohl(*p++);
+	args->cbl_layoutchanged = ntohl(*p++);
+	args->cbl_recall_type = ntohl(*p++);
+
+	if (args->cbl_recall_type == RETURN_FILE) {
+		args->cbl_range.iomode = iomode;
+		status = decode_fh(xdr, &args->cbl_fh);
+		if (unlikely(status != 0))
+			goto out;
+
+		p = read_buf(xdr, 2 * sizeof(uint64_t));
+		if (unlikely(p == NULL)) {
+			status = htonl(NFS4ERR_BADXDR);
+			goto out;
+		}
+		p = xdr_decode_hyper(p, &args->cbl_range.offset);
+		p = xdr_decode_hyper(p, &args->cbl_range.length);
+		status = decode_stateid(xdr, &args->cbl_stateid);
+		if (unlikely(status != 0))
+			goto out;
+	} else if (args->cbl_recall_type == RETURN_FSID) {
+		p = read_buf(xdr, 2 * sizeof(uint64_t));
+		if (unlikely(p == NULL)) {
+			status = htonl(NFS4ERR_BADXDR);
+			goto out;
+		}
+		p = xdr_decode_hyper(p, &args->cbl_fsid.major);
+		p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
+	} else if (args->cbl_recall_type != RETURN_ALL) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+	dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
+		__func__,
+		args->cbl_layout_type, iomode,
+		args->cbl_layoutchanged, args->cbl_recall_type);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct cb_devicenotifyargs *args)
+{
+	__be32 *p;
+	__be32 status = 0;
+	u32 tmp;
+	int n, i;
+	args->ndevs = 0;
+
+	/* Num of device notifications */
+	p = read_buf(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+	n = ntohl(*p++);
+	if (n <= 0)
+		goto out;
+	if (n > ULONG_MAX / sizeof(*args->devs)) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+
+	args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
+	if (!args->devs) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out;
+	}
+
+	/* Decode each dev notification */
+	for (i = 0; i < n; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+		if (unlikely(p == NULL)) {
+			status = htonl(NFS4ERR_BADXDR);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* bitmap size */
+		if (tmp != 1) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_notify_type = ntohl(*p++);
+		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* opaque size */
+		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_layout_type = ntohl(*p++);
+		memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+		if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
+			p = read_buf(xdr, sizeof(uint32_t));
+			if (unlikely(p == NULL)) {
+				status = htonl(NFS4ERR_BADXDR);
+				goto err;
+			}
+			dev->cbd_immediate = ntohl(*p++);
+		} else {
+			dev->cbd_immediate = 0;
+		}
+
+		args->ndevs++;
+
+		dprintk("%s: type %d layout 0x%x immediate %d\n",
+			__func__, dev->cbd_notify_type, dev->cbd_layout_type,
+			dev->cbd_immediate);
+	}
+out:
+	dprintk("%s: status %d ndevs %d\n",
+		__func__, ntohl(status), args->ndevs);
+	return status;
+err:
+	kfree(args->devs);
+	goto out;
+}
+
+static __be32 decode_sessionid(struct xdr_stream *xdr,
+				 struct nfs4_sessionid *sid)
+{
+	__be32 *p;
+	int len = NFS4_MAX_SESSIONID_LEN;
+
+	p = read_buf(xdr, len);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+
+	memcpy(sid->data, p, len);
+	return 0;
+}
+
+static __be32 decode_rc_list(struct xdr_stream *xdr,
+			       struct referring_call_list *rc_list)
+{
+	__be32 *p;
+	int i;
+	__be32 status;
+
+	status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
+	if (status)
+		goto out;
+
+	status = htonl(NFS4ERR_RESOURCE);
+	p = read_buf(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL))
+		goto out;
+
+	rc_list->rcl_nrefcalls = ntohl(*p++);
+	if (rc_list->rcl_nrefcalls) {
+		p = read_buf(xdr,
+			     rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
+		if (unlikely(p == NULL))
+			goto out;
+		rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls,
+						sizeof(*rc_list->rcl_refcalls),
+						GFP_KERNEL);
+		if (unlikely(rc_list->rcl_refcalls == NULL))
+			goto out;
+		for (i = 0; i < rc_list->rcl_nrefcalls; i++) {
+			rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++);
+			rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++);
+		}
+	}
+	status = 0;
+
+out:
+	return status;
+}
+
+static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					struct cb_sequenceargs *args)
+{
+	__be32 *p;
+	int i;
+	__be32 status;
+
+	status = decode_sessionid(xdr, &args->csa_sessionid);
+	if (status)
+		goto out;
+
+	status = htonl(NFS4ERR_RESOURCE);
+	p = read_buf(xdr, 5 * sizeof(uint32_t));
+	if (unlikely(p == NULL))
+		goto out;
+
+	args->csa_addr = svc_addr(rqstp);
+	args->csa_sequenceid = ntohl(*p++);
+	args->csa_slotid = ntohl(*p++);
+	args->csa_highestslotid = ntohl(*p++);
+	args->csa_cachethis = ntohl(*p++);
+	args->csa_nrclists = ntohl(*p++);
+	args->csa_rclists = NULL;
+	if (args->csa_nrclists) {
+		args->csa_rclists = kmalloc_array(args->csa_nrclists,
+						  sizeof(*args->csa_rclists),
+						  GFP_KERNEL);
+		if (unlikely(args->csa_rclists == NULL))
+			goto out;
+
+		for (i = 0; i < args->csa_nrclists; i++) {
+			status = decode_rc_list(xdr, &args->csa_rclists[i]);
+			if (status) {
+				args->csa_nrclists = i;
+				goto out_free;
+			}
+		}
+	}
+	status = 0;
+
+	dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u "
+		"highestslotid %u cachethis %d nrclists %u\n",
+		__func__,
+		((u32 *)&args->csa_sessionid)[0],
+		((u32 *)&args->csa_sessionid)[1],
+		((u32 *)&args->csa_sessionid)[2],
+		((u32 *)&args->csa_sessionid)[3],
+		args->csa_sequenceid, args->csa_slotid,
+		args->csa_highestslotid, args->csa_cachethis,
+		args->csa_nrclists);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+
+out_free:
+	for (i = 0; i < args->csa_nrclists; i++)
+		kfree(args->csa_rclists[i].rcl_refcalls);
+	kfree(args->csa_rclists);
+	goto out;
+}
+
+static __be32 decode_recallany_args(struct svc_rqst *rqstp,
+				      struct xdr_stream *xdr,
+				      struct cb_recallanyargs *args)
+{
+	uint32_t bitmap[2];
+	__be32 *p, status;
+
+	args->craa_addr = svc_addr(rqstp);
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+	args->craa_objs_to_keep = ntohl(*p++);
+	status = decode_bitmap(xdr, bitmap);
+	if (unlikely(status))
+		return status;
+	args->craa_type_mask = bitmap[0];
+
+	return 0;
+}
+
+static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					struct cb_recallslotargs *args)
+{
+	__be32 *p;
+
+	args->crsa_addr = svc_addr(rqstp);
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+	args->crsa_target_highest_slotid = ntohl(*p++);
+	return 0;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 + len);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	xdr_encode_opaque(p, str, len);
+	return 0;
+}
+
+#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
+#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
+static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep)
+{
+	__be32 bm[2];
+	__be32 *p;
+
+	bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
+	bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
+	if (bm[1] != 0) {
+		p = xdr_reserve_space(xdr, 16);
+		if (unlikely(p == NULL))
+			return htonl(NFS4ERR_RESOURCE);
+		*p++ = htonl(2);
+		*p++ = bm[0];
+		*p++ = bm[1];
+	} else if (bm[0] != 0) {
+		p = xdr_reserve_space(xdr, 12);
+		if (unlikely(p == NULL))
+			return htonl(NFS4ERR_RESOURCE);
+		*p++ = htonl(1);
+		*p++ = bm[0];
+	} else {
+		p = xdr_reserve_space(xdr, 8);
+		if (unlikely(p == NULL))
+			return htonl(NFS4ERR_RESOURCE);
+		*p++ = htonl(0);
+	}
+	*savep = p;
+	return 0;
+}
+
+static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
+{
+	__be32 *p;
+
+	if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
+		return 0;
+	p = xdr_reserve_space(xdr, 8);
+	if (unlikely(!p))
+		return htonl(NFS4ERR_RESOURCE);
+	p = xdr_encode_hyper(p, change);
+	return 0;
+}
+
+static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
+{
+	__be32 *p;
+
+	if (!(bitmap[0] & FATTR4_WORD0_SIZE))
+		return 0;
+	p = xdr_reserve_space(xdr, 8);
+	if (unlikely(!p))
+		return htonl(NFS4ERR_RESOURCE);
+	p = xdr_encode_hyper(p, size);
+	return 0;
+}
+
+static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 12);
+	if (unlikely(!p))
+		return htonl(NFS4ERR_RESOURCE);
+	p = xdr_encode_hyper(p, time->tv_sec);
+	*p = htonl(time->tv_nsec);
+	return 0;
+}
+
+static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
+{
+	if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
+		return 0;
+	return encode_attr_time(xdr,time);
+}
+
+static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
+{
+	if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY))
+		return 0;
+	return encode_attr_time(xdr,time);
+}
+
+static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
+{
+	__be32 status;
+
+	hdr->status = xdr_reserve_space(xdr, 4);
+	if (unlikely(hdr->status == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	status = encode_string(xdr, hdr->taglen, hdr->tag);
+	if (unlikely(status != 0))
+		return status;
+	hdr->nops = xdr_reserve_space(xdr, 4);
+	if (unlikely(hdr->nops == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	return 0;
+}
+
+static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
+{
+	__be32 *p;
+	
+	p = xdr_reserve_space(xdr, 8);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE_HDR);
+	*p++ = htonl(op);
+	*p = res;
+	return 0;
+}
+
+static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
+{
+	__be32 *savep = NULL;
+	__be32 status = res->status;
+	
+	if (unlikely(status != 0))
+		goto out;
+	status = encode_attr_bitmap(xdr, res->bitmap, &savep);
+	if (unlikely(status != 0))
+		goto out;
+	status = encode_attr_change(xdr, res->bitmap, res->change_attr);
+	if (unlikely(status != 0))
+		goto out;
+	status = encode_attr_size(xdr, res->bitmap, res->size);
+	if (unlikely(status != 0))
+		goto out;
+	status = encode_attr_ctime(xdr, res->bitmap, &res->ctime);
+	if (unlikely(status != 0))
+		goto out;
+	status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
+	*savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static __be32 encode_sessionid(struct xdr_stream *xdr,
+				 const struct nfs4_sessionid *sid)
+{
+	__be32 *p;
+	int len = NFS4_MAX_SESSIONID_LEN;
+
+	p = xdr_reserve_space(xdr, len);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+
+	memcpy(p, sid, len);
+	return 0;
+}
+
+static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       const struct cb_sequenceres *res)
+{
+	__be32 *p;
+	__be32 status = res->csr_status;
+
+	if (unlikely(status != 0))
+		goto out;
+
+	encode_sessionid(xdr, &res->csr_sessionid);
+
+	p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+
+	*p++ = htonl(res->csr_sequenceid);
+	*p++ = htonl(res->csr_slotid);
+	*p++ = htonl(res->csr_highestslotid);
+	*p++ = htonl(res->csr_target_highestslotid);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	if (op_nr == OP_CB_SEQUENCE) {
+		if (nop != 0)
+			return htonl(NFS4ERR_SEQUENCE_POS);
+	} else {
+		if (nop == 0)
+			return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	}
+
+	switch (op_nr) {
+	case OP_CB_GETATTR:
+	case OP_CB_RECALL:
+	case OP_CB_SEQUENCE:
+	case OP_CB_RECALL_ANY:
+	case OP_CB_RECALL_SLOT:
+	case OP_CB_LAYOUTRECALL:
+	case OP_CB_NOTIFY_DEVICEID:
+		*op = &callback_ops[op_nr];
+		break;
+
+	case OP_CB_NOTIFY:
+	case OP_CB_PUSH_DELEG:
+	case OP_CB_RECALLABLE_OBJ_AVAIL:
+	case OP_CB_WANTS_CANCELLED:
+	case OP_CB_NOTIFY_LOCK:
+		return htonl(NFS4ERR_NOTSUPP);
+
+	default:
+		return htonl(NFS4ERR_OP_ILLEGAL);
+	}
+
+	return htonl(NFS_OK);
+}
+
+static void nfs4_callback_free_slot(struct nfs4_session *session)
+{
+	struct nfs4_slot_table *tbl = &session->bc_slot_table;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	/*
+	 * Let the state manager know callback processing done.
+	 * A single slot, so highest used slotid is either 0 or -1
+	 */
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
+	nfs4_slot_tbl_drain_complete(tbl);
+	spin_unlock(&tbl->slot_tbl_lock);
+}
+
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
+{
+	if (cps->slotid != NFS4_NO_SLOT)
+		nfs4_callback_free_slot(cps->clp->cl_session);
+}
+
+#else /* CONFIG_NFS_V4_1 */
+
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+#ifdef CONFIG_NFS_V4_2
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	__be32 status = preprocess_nfs41_op(nop, op_nr, op);
+	if (status != htonl(NFS4ERR_OP_ILLEGAL))
+		return status;
+
+	if (op_nr == OP_CB_OFFLOAD)
+		return htonl(NFS4ERR_NOTSUPP);
+	return htonl(NFS4ERR_OP_ILLEGAL);
+}
+#else /* CONFIG_NFS_V4_2 */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+static __be32
+preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
+{
+	switch (op_nr) {
+	case OP_CB_GETATTR:
+	case OP_CB_RECALL:
+		*op = &callback_ops[op_nr];
+		break;
+	default:
+		return htonl(NFS4ERR_OP_ILLEGAL);
+	}
+
+	return htonl(NFS_OK);
+}
+
+static __be32 process_op(int nop, struct svc_rqst *rqstp,
+		struct xdr_stream *xdr_in, void *argp,
+		struct xdr_stream *xdr_out, void *resp,
+		struct cb_process_state *cps)
+{
+	struct callback_op *op = &callback_ops[0];
+	unsigned int op_nr;
+	__be32 status;
+	long maxlen;
+	__be32 res;
+
+	dprintk("%s: start\n", __func__);
+	status = decode_op_hdr(xdr_in, &op_nr);
+	if (unlikely(status))
+		return status;
+
+	dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
+		__func__, cps->minorversion, nop, op_nr);
+
+	switch (cps->minorversion) {
+	case 0:
+		status = preprocess_nfs4_op(op_nr, &op);
+		break;
+	case 1:
+		status = preprocess_nfs41_op(nop, op_nr, &op);
+		break;
+	case 2:
+		status = preprocess_nfs42_op(nop, op_nr, &op);
+		break;
+	default:
+		status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+	}
+
+	if (status == htonl(NFS4ERR_OP_ILLEGAL))
+		op_nr = OP_CB_ILLEGAL;
+	if (status)
+		goto encode_hdr;
+
+	if (cps->drc_status) {
+		status = cps->drc_status;
+		goto encode_hdr;
+	}
+
+	maxlen = xdr_out->end - xdr_out->p;
+	if (maxlen > 0 && maxlen < PAGE_SIZE) {
+		status = op->decode_args(rqstp, xdr_in, argp);
+		if (likely(status == 0))
+			status = op->process_op(argp, resp, cps);
+	} else
+		status = htonl(NFS4ERR_RESOURCE);
+
+encode_hdr:
+	res = encode_op_hdr(xdr_out, op_nr, status);
+	if (unlikely(res))
+		return res;
+	if (op->encode_res != NULL && status == 0)
+		status = op->encode_res(rqstp, xdr_out, resp);
+	dprintk("%s: done, status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+/*
+ * Decode, process and encode a COMPOUND
+ */
+static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	struct cb_compound_hdr_arg hdr_arg = { 0 };
+	struct cb_compound_hdr_res hdr_res = { NULL };
+	struct xdr_stream xdr_in, xdr_out;
+	__be32 *p, status;
+	struct cb_process_state cps = {
+		.drc_status = 0,
+		.clp = NULL,
+		.slotid = NFS4_NO_SLOT,
+		.net = SVC_NET(rqstp),
+	};
+	unsigned int nops = 0;
+
+	dprintk("%s: start\n", __func__);
+
+	xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
+
+	p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
+	xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
+
+	status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
+	if (status == __constant_htonl(NFS4ERR_RESOURCE))
+		return rpc_garbage_args;
+
+	if (hdr_arg.minorversion == 0) {
+		cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
+		if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
+			return rpc_drop_reply;
+	}
+
+	cps.minorversion = hdr_arg.minorversion;
+	hdr_res.taglen = hdr_arg.taglen;
+	hdr_res.tag = hdr_arg.tag;
+	if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
+		return rpc_system_err;
+
+	while (status == 0 && nops != hdr_arg.nops) {
+		status = process_op(nops, rqstp, &xdr_in,
+				    argp, &xdr_out, resp, &cps);
+		nops++;
+	}
+
+	/* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return
+	* resource error in cb_compound status without returning op */
+	if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
+		status = htonl(NFS4ERR_RESOURCE);
+		nops--;
+	}
+
+	*hdr_res.status = status;
+	*hdr_res.nops = htonl(nops);
+	nfs4_cb_free_slot(&cps);
+	nfs_put_client(cps.clp);
+	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
+	return rpc_success;
+}
+
+/*
+ * Define NFS4 callback COMPOUND ops.
+ */
+static struct callback_op callback_ops[] = {
+	[0] = {
+		.res_maxsize = CB_OP_HDR_RES_MAXSZ,
+	},
+	[OP_CB_GETATTR] = {
+		.process_op = (callback_process_op_t)nfs4_callback_getattr,
+		.decode_args = (callback_decode_arg_t)decode_getattr_args,
+		.encode_res = (callback_encode_res_t)encode_getattr_res,
+		.res_maxsize = CB_OP_GETATTR_RES_MAXSZ,
+	},
+	[OP_CB_RECALL] = {
+		.process_op = (callback_process_op_t)nfs4_callback_recall,
+		.decode_args = (callback_decode_arg_t)decode_recall_args,
+		.res_maxsize = CB_OP_RECALL_RES_MAXSZ,
+	},
+#if defined(CONFIG_NFS_V4_1)
+	[OP_CB_LAYOUTRECALL] = {
+		.process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
+		.decode_args =
+			(callback_decode_arg_t)decode_layoutrecall_args,
+		.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
+	},
+	[OP_CB_NOTIFY_DEVICEID] = {
+		.process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+		.decode_args =
+			(callback_decode_arg_t)decode_devicenotify_args,
+		.res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+	},
+	[OP_CB_SEQUENCE] = {
+		.process_op = (callback_process_op_t)nfs4_callback_sequence,
+		.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
+		.encode_res = (callback_encode_res_t)encode_cb_sequence_res,
+		.res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
+	},
+	[OP_CB_RECALL_ANY] = {
+		.process_op = (callback_process_op_t)nfs4_callback_recallany,
+		.decode_args = (callback_decode_arg_t)decode_recallany_args,
+		.res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
+	},
+	[OP_CB_RECALL_SLOT] = {
+		.process_op = (callback_process_op_t)nfs4_callback_recallslot,
+		.decode_args = (callback_decode_arg_t)decode_recallslot_args,
+		.res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
+	},
+#endif /* CONFIG_NFS_V4_1 */
+};
+
+/*
+ * Define NFS4 callback procedures
+ */
+static struct svc_procedure nfs4_callback_procedures1[] = {
+	[CB_NULL] = {
+		.pc_func = nfs4_callback_null,
+		.pc_decode = (kxdrproc_t)nfs4_decode_void,
+		.pc_encode = (kxdrproc_t)nfs4_encode_void,
+		.pc_xdrressize = 1,
+	},
+	[CB_COMPOUND] = {
+		.pc_func = nfs4_callback_compound,
+		.pc_encode = (kxdrproc_t)nfs4_encode_void,
+		.pc_argsize = 256,
+		.pc_ressize = 256,
+		.pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
+	}
+};
+
+struct svc_version nfs4_callback_version1 = {
+	.vs_vers = 1,
+	.vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+	.vs_proc = nfs4_callback_procedures1,
+	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+	.vs_dispatch = NULL,
+	.vs_hidden = 1,
+};
+
+struct svc_version nfs4_callback_version4 = {
+	.vs_vers = 4,
+	.vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+	.vs_proc = nfs4_callback_procedures1,
+	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+	.vs_dispatch = NULL,
+	.vs_hidden = 1,
+};
diff --git a/kernel/fs/nfs/client.c b/kernel/fs/nfs/client.c
new file mode 100644
index 000000000..892aefff3
--- /dev/null
+++ b/kernel/fs/nfs/client.c
@@ -0,0 +1,1472 @@
+/* client.c: NFS client sharing and management code
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <net/ipv6.h>
+#include <linux/nfs_xdr.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "pnfs.h"
+#include "nfs.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY		NFSDBG_CLIENT
+
+static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
+static DEFINE_SPINLOCK(nfs_version_lock);
+static DEFINE_MUTEX(nfs_version_mutex);
+static LIST_HEAD(nfs_versions);
+
+/*
+ * RPC cruft for NFS
+ */
+static const struct rpc_version *nfs_version[5] = {
+	[2] = NULL,
+	[3] = NULL,
+	[4] = NULL,
+};
+
+const struct rpc_program nfs_program = {
+	.name			= "nfs",
+	.number			= NFS_PROGRAM,
+	.nrvers			= ARRAY_SIZE(nfs_version),
+	.version		= nfs_version,
+	.stats			= &nfs_rpcstat,
+	.pipe_dir_name		= NFS_PIPE_DIRNAME,
+};
+
+struct rpc_stat nfs_rpcstat = {
+	.program		= &nfs_program
+};
+
+static struct nfs_subversion *find_nfs_version(unsigned int version)
+{
+	struct nfs_subversion *nfs;
+	spin_lock(&nfs_version_lock);
+
+	list_for_each_entry(nfs, &nfs_versions, list) {
+		if (nfs->rpc_ops->version == version) {
+			spin_unlock(&nfs_version_lock);
+			return nfs;
+		}
+	}
+
+	spin_unlock(&nfs_version_lock);
+	return ERR_PTR(-EPROTONOSUPPORT);
+}
+
+struct nfs_subversion *get_nfs_version(unsigned int version)
+{
+	struct nfs_subversion *nfs = find_nfs_version(version);
+
+	if (IS_ERR(nfs)) {
+		mutex_lock(&nfs_version_mutex);
+		request_module("nfsv%d", version);
+		nfs = find_nfs_version(version);
+		mutex_unlock(&nfs_version_mutex);
+	}
+
+	if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+		return ERR_PTR(-EAGAIN);
+	return nfs;
+}
+
+void put_nfs_version(struct nfs_subversion *nfs)
+{
+	module_put(nfs->owner);
+}
+
+void register_nfs_version(struct nfs_subversion *nfs)
+{
+	spin_lock(&nfs_version_lock);
+
+	list_add(&nfs->list, &nfs_versions);
+	nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers;
+
+	spin_unlock(&nfs_version_lock);
+}
+EXPORT_SYMBOL_GPL(register_nfs_version);
+
+void unregister_nfs_version(struct nfs_subversion *nfs)
+{
+	spin_lock(&nfs_version_lock);
+
+	nfs_version[nfs->rpc_ops->version] = NULL;
+	list_del(&nfs->list);
+
+	spin_unlock(&nfs_version_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_nfs_version);
+
+/*
+ * Allocate a shared client record
+ *
+ * Since these are allocated/deallocated very rarely, we don't
+ * bother putting them in a slab cache...
+ */
+struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
+{
+	struct nfs_client *clp;
+	struct rpc_cred *cred;
+	int err = -ENOMEM;
+
+	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
+		goto error_0;
+
+	clp->cl_nfs_mod = cl_init->nfs_mod;
+	if (!try_module_get(clp->cl_nfs_mod->owner))
+		goto error_dealloc;
+
+	clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
+
+	atomic_set(&clp->cl_count, 1);
+	clp->cl_cons_state = NFS_CS_INITING;
+
+	memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
+	clp->cl_addrlen = cl_init->addrlen;
+
+	if (cl_init->hostname) {
+		err = -ENOMEM;
+		clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
+		if (!clp->cl_hostname)
+			goto error_cleanup;
+	}
+
+	INIT_LIST_HEAD(&clp->cl_superblocks);
+	clp->cl_rpcclient = ERR_PTR(-EINVAL);
+
+	clp->cl_proto = cl_init->proto;
+	clp->cl_net = get_net(cl_init->net);
+
+	cred = rpc_lookup_machine_cred("*");
+	if (!IS_ERR(cred))
+		clp->cl_machine_cred = cred;
+	nfs_fscache_get_client_cookie(clp);
+
+	return clp;
+
+error_cleanup:
+	put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
+	kfree(clp);
+error_0:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_client);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+void nfs_cleanup_cb_ident_idr(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	idr_destroy(&nn->cb_ident_idr);
+}
+
+/* nfs_client_lock held */
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+	if (clp->cl_cb_ident)
+		idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
+}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+	rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+}
+
+#else
+void nfs_cleanup_cb_ident_idr(struct net *net)
+{
+}
+
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+}
+
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Destroy a shared client record
+ */
+void nfs_free_client(struct nfs_client *clp)
+{
+	dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
+
+	nfs_fscache_release_client_cookie(clp);
+
+	/* -EIO all pending I/O */
+	if (!IS_ERR(clp->cl_rpcclient))
+		rpc_shutdown_client(clp->cl_rpcclient);
+
+	if (clp->cl_machine_cred != NULL)
+		put_rpccred(clp->cl_machine_cred);
+
+	put_net(clp->cl_net);
+	put_nfs_version(clp->cl_nfs_mod);
+	kfree(clp->cl_hostname);
+	kfree(clp->cl_acceptor);
+	kfree(clp);
+
+	dprintk("<-- nfs_free_client()\n");
+}
+EXPORT_SYMBOL_GPL(nfs_free_client);
+
+/*
+ * Release a reference to a shared client record
+ */
+void nfs_put_client(struct nfs_client *clp)
+{
+	struct nfs_net *nn;
+
+	if (!clp)
+		return;
+
+	dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
+	nn = net_generic(clp->cl_net, nfs_net_id);
+
+	if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
+		list_del(&clp->cl_share_link);
+		nfs_cb_idr_remove_locked(clp);
+		spin_unlock(&nn->nfs_client_lock);
+
+		WARN_ON_ONCE(!list_empty(&clp->cl_superblocks));
+
+		clp->rpc_ops->free_client(clp);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs_put_client);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * Test if two ip6 socket addresses refer to the same socket by
+ * comparing relevant fields. The padding bytes specifically, are not
+ * compared. sin6_flowinfo is not compared because it only affects QoS
+ * and sin6_scope_id is only compared if the address is "link local"
+ * because "link local" addresses need only be unique to a specific
+ * link. Conversely, ordinary unicast addresses might have different
+ * sin6_scope_id.
+ *
+ * The caller should ensure both socket addresses are AF_INET6.
+ */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+	if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
+		return 0;
+	else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		return sin1->sin6_scope_id == sin2->sin6_scope_id;
+
+	return 1;
+}
+#else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Test if two ip4 socket addresses refer to the same socket, by
+ * comparing relevant fields. The padding bytes specifically, are
+ * not compared.
+ *
+ * The caller should ensure both socket addresses are AF_INET.
+ */
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
+		(sin1->sin6_port == sin2->sin6_port);
+}
+
+static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
+		(sin1->sin_port == sin2->sin_port);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, excluding the port number.
+ */
+int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+			      const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
+		return 0;
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, including the port number.
+ */
+static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
+			    const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
+		return 0;
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_cmp_ip4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_cmp_ip6(sa1, sa2);
+	}
+	return 0;
+}
+
+/*
+ * Find an nfs_client on the list that matches the initialisation data
+ * that is supplied.
+ */
+static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
+{
+	struct nfs_client *clp;
+	const struct sockaddr *sap = data->addr;
+	struct nfs_net *nn = net_generic(data->net, nfs_net_id);
+
+	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
+	        const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+		/* Don't match clients that failed to initialise properly */
+		if (clp->cl_cons_state < 0)
+			continue;
+
+		/* Different NFS versions cannot share the same nfs_client */
+		if (clp->rpc_ops != data->nfs_mod->rpc_ops)
+			continue;
+
+		if (clp->cl_proto != data->proto)
+			continue;
+		/* Match nfsv4 minorversion */
+		if (clp->cl_minorversion != data->minorversion)
+			continue;
+		/* Match the full socket address */
+		if (!nfs_sockaddr_cmp(sap, clap))
+			continue;
+
+		atomic_inc(&clp->cl_count);
+		return clp;
+	}
+	return NULL;
+}
+
+static bool nfs_client_init_is_complete(const struct nfs_client *clp)
+{
+	return clp->cl_cons_state <= NFS_CS_READY;
+}
+
+int nfs_wait_client_init_complete(const struct nfs_client *clp)
+{
+	return wait_event_killable(nfs_client_active_wq,
+			nfs_client_init_is_complete(clp));
+}
+EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete);
+
+/*
+ * Found an existing client.  Make sure it's ready before returning.
+ */
+static struct nfs_client *
+nfs_found_client(const struct nfs_client_initdata *cl_init,
+		 struct nfs_client *clp)
+{
+	int error;
+
+	error = nfs_wait_client_init_complete(clp);
+	if (error < 0) {
+		nfs_put_client(clp);
+		return ERR_PTR(-ERESTARTSYS);
+	}
+
+	if (clp->cl_cons_state < NFS_CS_READY) {
+		error = clp->cl_cons_state;
+		nfs_put_client(clp);
+		return ERR_PTR(error);
+	}
+
+	smp_rmb();
+
+	dprintk("<-- %s found nfs_client %p for %s\n",
+		__func__, clp, cl_init->hostname ?: "");
+	return clp;
+}
+
+/*
+ * Look up a client by IP address and protocol version
+ * - creates a new record if one doesn't yet exist
+ */
+struct nfs_client *
+nfs_get_client(const struct nfs_client_initdata *cl_init,
+	       const struct rpc_timeout *timeparms,
+	       const char *ip_addr,
+	       rpc_authflavor_t authflavour)
+{
+	struct nfs_client *clp, *new = NULL;
+	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
+	const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
+
+	if (cl_init->hostname == NULL) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	dprintk("--> nfs_get_client(%s,v%u)\n",
+		cl_init->hostname, rpc_ops->version);
+
+	/* see if the client already exists */
+	do {
+		spin_lock(&nn->nfs_client_lock);
+
+		clp = nfs_match_client(cl_init);
+		if (clp) {
+			spin_unlock(&nn->nfs_client_lock);
+			if (new)
+				new->rpc_ops->free_client(new);
+			return nfs_found_client(cl_init, clp);
+		}
+		if (new) {
+			list_add_tail(&new->cl_share_link,
+					&nn->nfs_client_list);
+			spin_unlock(&nn->nfs_client_lock);
+			new->cl_flags = cl_init->init_flags;
+			return rpc_ops->init_client(new, timeparms, ip_addr);
+		}
+
+		spin_unlock(&nn->nfs_client_lock);
+
+		new = rpc_ops->alloc_client(cl_init);
+	} while (!IS_ERR(new));
+
+	dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
+		cl_init->hostname, PTR_ERR(new));
+	return new;
+}
+EXPORT_SYMBOL_GPL(nfs_get_client);
+
+/*
+ * Mark a server as ready or failed
+ */
+void nfs_mark_client_ready(struct nfs_client *clp, int state)
+{
+	smp_wmb();
+	clp->cl_cons_state = state;
+	wake_up_all(&nfs_client_active_wq);
+}
+EXPORT_SYMBOL_GPL(nfs_mark_client_ready);
+
+/*
+ * Initialise the timeout values for a connection
+ */
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
+				    unsigned int timeo, unsigned int retrans)
+{
+	to->to_initval = timeo * HZ / 10;
+	to->to_retries = retrans;
+
+	switch (proto) {
+	case XPRT_TRANSPORT_TCP:
+	case XPRT_TRANSPORT_RDMA:
+		if (to->to_retries == 0)
+			to->to_retries = NFS_DEF_TCP_RETRANS;
+		if (to->to_initval == 0)
+			to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
+		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+			to->to_initval = NFS_MAX_TCP_TIMEOUT;
+		to->to_increment = to->to_initval;
+		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+		if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
+			to->to_maxval = NFS_MAX_TCP_TIMEOUT;
+		if (to->to_maxval < to->to_initval)
+			to->to_maxval = to->to_initval;
+		to->to_exponential = 0;
+		break;
+	case XPRT_TRANSPORT_UDP:
+		if (to->to_retries == 0)
+			to->to_retries = NFS_DEF_UDP_RETRANS;
+		if (!to->to_initval)
+			to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
+		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+			to->to_initval = NFS_MAX_UDP_TIMEOUT;
+		to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+		to->to_exponential = 1;
+		break;
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
+
+/*
+ * Create an RPC client handle
+ */
+int nfs_create_rpc_client(struct nfs_client *clp,
+			  const struct rpc_timeout *timeparms,
+			  rpc_authflavor_t flavor)
+{
+	struct rpc_clnt		*clnt = NULL;
+	struct rpc_create_args args = {
+		.net		= clp->cl_net,
+		.protocol	= clp->cl_proto,
+		.address	= (struct sockaddr *)&clp->cl_addr,
+		.addrsize	= clp->cl_addrlen,
+		.timeout	= timeparms,
+		.servername	= clp->cl_hostname,
+		.program	= &nfs_program,
+		.version	= clp->rpc_ops->version,
+		.authflavor	= flavor,
+	};
+
+	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+	if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;
+	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+	if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS;
+
+	if (!IS_ERR(clp->cl_rpcclient))
+		return 0;
+
+	clnt = rpc_create(&args);
+	if (IS_ERR(clnt)) {
+		dprintk("%s: cannot create RPC client. Error = %ld\n",
+				__func__, PTR_ERR(clnt));
+		return PTR_ERR(clnt);
+	}
+
+	clp->cl_rpcclient = clnt;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_create_rpc_client);
+
+/*
+ * Version 2 or 3 client destruction
+ */
+static void nfs_destroy_server(struct nfs_server *server)
+{
+	if (server->nlm_host)
+		nlmclnt_done(server->nlm_host);
+}
+
+/*
+ * Version 2 or 3 lockd setup
+ */
+static int nfs_start_lockd(struct nfs_server *server)
+{
+	struct nlm_host *host;
+	struct nfs_client *clp = server->nfs_client;
+	struct nlmclnt_initdata nlm_init = {
+		.hostname	= clp->cl_hostname,
+		.address	= (struct sockaddr *)&clp->cl_addr,
+		.addrlen	= clp->cl_addrlen,
+		.nfs_version	= clp->rpc_ops->version,
+		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?
+					1 : 0,
+		.net		= clp->cl_net,
+	};
+
+	if (nlm_init.nfs_version > 3)
+		return 0;
+	if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+			(server->flags & NFS_MOUNT_LOCAL_FCNTL))
+		return 0;
+
+	switch (clp->cl_proto) {
+		default:
+			nlm_init.protocol = IPPROTO_TCP;
+			break;
+		case XPRT_TRANSPORT_UDP:
+			nlm_init.protocol = IPPROTO_UDP;
+	}
+
+	host = nlmclnt_init(&nlm_init);
+	if (IS_ERR(host))
+		return PTR_ERR(host);
+
+	server->nlm_host = host;
+	server->destroy = nfs_destroy_server;
+	return 0;
+}
+
+/*
+ * Create a general RPC client
+ */
+int nfs_init_server_rpcclient(struct nfs_server *server,
+		const struct rpc_timeout *timeo,
+		rpc_authflavor_t pseudoflavour)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	server->client = rpc_clone_client_set_auth(clp->cl_rpcclient,
+							pseudoflavour);
+	if (IS_ERR(server->client)) {
+		dprintk("%s: couldn't create rpc_client!\n", __func__);
+		return PTR_ERR(server->client);
+	}
+
+	memcpy(&server->client->cl_timeout_default,
+			timeo,
+			sizeof(server->client->cl_timeout_default));
+	server->client->cl_timeout = &server->client->cl_timeout_default;
+	server->client->cl_softrtry = 0;
+	if (server->flags & NFS_MOUNT_SOFT)
+		server->client->cl_softrtry = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
+
+/**
+ * nfs_init_client - Initialise an NFS2 or NFS3 client
+ *
+ * @clp: nfs_client to initialise
+ * @timeparms: timeout parameters for underlying RPC transport
+ * @ip_addr: IP presentation address (not used)
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
+ */
+struct nfs_client *nfs_init_client(struct nfs_client *clp,
+		    const struct rpc_timeout *timeparms,
+		    const char *ip_addr)
+{
+	int error;
+
+	if (clp->cl_cons_state == NFS_CS_READY) {
+		/* the client is already initialised */
+		dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp);
+		return clp;
+	}
+
+	/*
+	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
+	 * - RFC 2623, sec 2.3.2
+	 */
+	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+	if (error < 0)
+		goto error;
+	nfs_mark_client_ready(clp, NFS_CS_READY);
+	return clp;
+
+error:
+	nfs_mark_client_ready(clp, error);
+	nfs_put_client(clp);
+	dprintk("<-- nfs_init_client() = xerror %d\n", error);
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(nfs_init_client);
+
+/*
+ * Create a version 2 or 3 client
+ */
+static int nfs_init_server(struct nfs_server *server,
+			   const struct nfs_parsed_mount_data *data,
+			   struct nfs_subversion *nfs_mod)
+{
+	struct nfs_client_initdata cl_init = {
+		.hostname = data->nfs_server.hostname,
+		.addr = (const struct sockaddr *)&data->nfs_server.address,
+		.addrlen = data->nfs_server.addrlen,
+		.nfs_mod = nfs_mod,
+		.proto = data->nfs_server.protocol,
+		.net = data->net,
+	};
+	struct rpc_timeout timeparms;
+	struct nfs_client *clp;
+	int error;
+
+	dprintk("--> nfs_init_server()\n");
+
+	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+			data->timeo, data->retrans);
+	if (data->flags & NFS_MOUNT_NORESVPORT)
+		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
+	/* Allocate or find a client reference we can use */
+	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
+	if (IS_ERR(clp)) {
+		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
+		return PTR_ERR(clp);
+	}
+
+	server->nfs_client = clp;
+
+	/* Initialise the client representation from the mount data */
+	server->flags = data->flags;
+	server->options = data->options;
+	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
+		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR;
+
+	if (data->rsize)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+
+	server->acregmin = data->acregmin * HZ;
+	server->acregmax = data->acregmax * HZ;
+	server->acdirmin = data->acdirmin * HZ;
+	server->acdirmax = data->acdirmax * HZ;
+
+	/* Start lockd here, before we might error out */
+	error = nfs_start_lockd(server);
+	if (error < 0)
+		goto error;
+
+	server->port = data->nfs_server.port;
+	server->auth_info = data->auth_info;
+
+	error = nfs_init_server_rpcclient(server, &timeparms,
+					  data->selected_flavor);
+	if (error < 0)
+		goto error;
+
+	/* Preserve the values of mount_server-related mount options */
+	if (data->mount_server.addrlen) {
+		memcpy(&server->mountd_address, &data->mount_server.address,
+			data->mount_server.addrlen);
+		server->mountd_addrlen = data->mount_server.addrlen;
+	}
+	server->mountd_version = data->mount_server.version;
+	server->mountd_port = data->mount_server.port;
+	server->mountd_protocol = data->mount_server.protocol;
+
+	server->namelen  = data->namlen;
+	dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);
+	return 0;
+
+error:
+	server->nfs_client = NULL;
+	nfs_put_client(clp);
+	dprintk("<-- nfs_init_server() = xerror %d\n", error);
+	return error;
+}
+
+/*
+ * Load up the server record from information gained in an fsinfo record
+ */
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+				  struct nfs_fh *mntfh,
+				  struct nfs_fsinfo *fsinfo)
+{
+	unsigned long max_rpc_payload;
+
+	/* Work out a lot of parameters */
+	if (server->rsize == 0)
+		server->rsize = nfs_block_size(fsinfo->rtpref, NULL);
+	if (server->wsize == 0)
+		server->wsize = nfs_block_size(fsinfo->wtpref, NULL);
+
+	if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
+		server->rsize = nfs_block_size(fsinfo->rtmax, NULL);
+	if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
+		server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
+
+	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
+	if (server->rsize > max_rpc_payload)
+		server->rsize = max_rpc_payload;
+	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+		server->rsize = NFS_MAX_FILE_IO_SIZE;
+	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	server->backing_dev_info.name = "nfs";
+	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+
+	if (server->wsize > max_rpc_payload)
+		server->wsize = max_rpc_payload;
+	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+		server->wsize = NFS_MAX_FILE_IO_SIZE;
+	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
+
+	server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
+	if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
+		server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
+	if (server->dtsize > server->rsize)
+		server->dtsize = server->rsize;
+
+	if (server->flags & NFS_MOUNT_NOAC) {
+		server->acregmin = server->acregmax = 0;
+		server->acdirmin = server->acdirmax = 0;
+	}
+
+	server->maxfilesize = fsinfo->maxfilesize;
+
+	server->time_delta = fsinfo->time_delta;
+
+	/* We're airborne Set socket buffersize */
+	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+}
+
+/*
+ * Probe filesystem information, including the FSID on v2/v3
+ */
+int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
+{
+	struct nfs_fsinfo fsinfo;
+	struct nfs_client *clp = server->nfs_client;
+	int error;
+
+	dprintk("--> nfs_probe_fsinfo()\n");
+
+	if (clp->rpc_ops->set_capabilities != NULL) {
+		error = clp->rpc_ops->set_capabilities(server, mntfh);
+		if (error < 0)
+			goto out_error;
+	}
+
+	fsinfo.fattr = fattr;
+	fsinfo.layouttype = 0;
+	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
+	if (error < 0)
+		goto out_error;
+
+	nfs_server_set_fsinfo(server, mntfh, &fsinfo);
+
+	/* Get some general file system info */
+	if (server->namelen == 0) {
+		struct nfs_pathconf pathinfo;
+
+		pathinfo.fattr = fattr;
+		nfs_fattr_init(fattr);
+
+		if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0)
+			server->namelen = pathinfo.max_namelen;
+	}
+
+	dprintk("<-- nfs_probe_fsinfo() = 0\n");
+	return 0;
+
+out_error:
+	dprintk("nfs_probe_fsinfo: error = %d\n", -error);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_probe_fsinfo);
+
+/*
+ * Copy useful information when duplicating a server record
+ */
+void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
+{
+	target->flags = source->flags;
+	target->rsize = source->rsize;
+	target->wsize = source->wsize;
+	target->acregmin = source->acregmin;
+	target->acregmax = source->acregmax;
+	target->acdirmin = source->acdirmin;
+	target->acdirmax = source->acdirmax;
+	target->caps = source->caps;
+	target->options = source->options;
+	target->auth_info = source->auth_info;
+}
+EXPORT_SYMBOL_GPL(nfs_server_copy_userdata);
+
+void nfs_server_insert_lists(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+	spin_lock(&nn->nfs_client_lock);
+	list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
+	list_add_tail(&server->master_link, &nn->nfs_volume_list);
+	clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+	spin_unlock(&nn->nfs_client_lock);
+
+}
+EXPORT_SYMBOL_GPL(nfs_server_insert_lists);
+
+void nfs_server_remove_lists(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs_net *nn;
+
+	if (clp == NULL)
+		return;
+	nn = net_generic(clp->cl_net, nfs_net_id);
+	spin_lock(&nn->nfs_client_lock);
+	list_del_rcu(&server->client_link);
+	if (list_empty(&clp->cl_superblocks))
+		set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+	list_del(&server->master_link);
+	spin_unlock(&nn->nfs_client_lock);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nfs_server_remove_lists);
+
+/*
+ * Allocate and initialise a server record
+ */
+struct nfs_server *nfs_alloc_server(void)
+{
+	struct nfs_server *server;
+
+	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (!server)
+		return NULL;
+
+	server->client = server->client_acl = ERR_PTR(-EINVAL);
+
+	/* Zero out the NFS state stuff */
+	INIT_LIST_HEAD(&server->client_link);
+	INIT_LIST_HEAD(&server->master_link);
+	INIT_LIST_HEAD(&server->delegations);
+	INIT_LIST_HEAD(&server->layouts);
+	INIT_LIST_HEAD(&server->state_owners_lru);
+
+	atomic_set(&server->active, 0);
+
+	server->io_stats = nfs_alloc_iostats();
+	if (!server->io_stats) {
+		kfree(server);
+		return NULL;
+	}
+
+	if (bdi_init(&server->backing_dev_info)) {
+		nfs_free_iostats(server->io_stats);
+		kfree(server);
+		return NULL;
+	}
+
+	ida_init(&server->openowner_id);
+	ida_init(&server->lockowner_id);
+	pnfs_init_server(server);
+
+	return server;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_server);
+
+/*
+ * Free up a server record
+ */
+void nfs_free_server(struct nfs_server *server)
+{
+	dprintk("--> nfs_free_server()\n");
+
+	nfs_server_remove_lists(server);
+
+	if (server->destroy != NULL)
+		server->destroy(server);
+
+	if (!IS_ERR(server->client_acl))
+		rpc_shutdown_client(server->client_acl);
+	if (!IS_ERR(server->client))
+		rpc_shutdown_client(server->client);
+
+	nfs_put_client(server->nfs_client);
+
+	ida_destroy(&server->lockowner_id);
+	ida_destroy(&server->openowner_id);
+	nfs_free_iostats(server->io_stats);
+	bdi_destroy(&server->backing_dev_info);
+	kfree(server);
+	nfs_release_automount_timer();
+	dprintk("<-- nfs_free_server()\n");
+}
+EXPORT_SYMBOL_GPL(nfs_free_server);
+
+/*
+ * Create a version 2 or 3 volume record
+ * - keyed on server and FSID
+ */
+struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
+				     struct nfs_subversion *nfs_mod)
+{
+	struct nfs_server *server;
+	struct nfs_fattr *fattr;
+	int error;
+
+	server = nfs_alloc_server();
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+
+	error = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto error;
+
+	/* Get a client representation */
+	error = nfs_init_server(server, mount_info->parsed, nfs_mod);
+	if (error < 0)
+		goto error;
+
+	/* Probe the root fh to retrieve its FSID */
+	error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr);
+	if (error < 0)
+		goto error;
+	if (server->nfs_client->rpc_ops->version == 3) {
+		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+			server->namelen = NFS3_MAXNAMLEN;
+		if (!(mount_info->parsed->flags & NFS_MOUNT_NORDIRPLUS))
+			server->caps |= NFS_CAP_READDIRPLUS;
+	} else {
+		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+			server->namelen = NFS2_MAXNAMLEN;
+	}
+
+	if (!(fattr->valid & NFS_ATTR_FATTR)) {
+		error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
+		if (error < 0) {
+			dprintk("nfs_create_server: getattr error = %d\n", -error);
+			goto error;
+		}
+	}
+	memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+	dprintk("Server FSID: %llx:%llx\n",
+		(unsigned long long) server->fsid.major,
+		(unsigned long long) server->fsid.minor);
+
+	nfs_server_insert_lists(server);
+	server->mount_time = jiffies;
+	nfs_free_fattr(fattr);
+	return server;
+
+error:
+	nfs_free_fattr(fattr);
+	nfs_free_server(server);
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(nfs_create_server);
+
+/*
+ * Clone an NFS2, NFS3 or NFS4 server record
+ */
+struct nfs_server *nfs_clone_server(struct nfs_server *source,
+				    struct nfs_fh *fh,
+				    struct nfs_fattr *fattr,
+				    rpc_authflavor_t flavor)
+{
+	struct nfs_server *server;
+	struct nfs_fattr *fattr_fsinfo;
+	int error;
+
+	dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
+		(unsigned long long) fattr->fsid.major,
+		(unsigned long long) fattr->fsid.minor);
+
+	server = nfs_alloc_server();
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+
+	error = -ENOMEM;
+	fattr_fsinfo = nfs_alloc_fattr();
+	if (fattr_fsinfo == NULL)
+		goto out_free_server;
+
+	/* Copy data from the source */
+	server->nfs_client = source->nfs_client;
+	server->destroy = source->destroy;
+	atomic_inc(&server->nfs_client->cl_count);
+	nfs_server_copy_userdata(server, source);
+
+	server->fsid = fattr->fsid;
+
+	error = nfs_init_server_rpcclient(server,
+			source->client->cl_timeout,
+			flavor);
+	if (error < 0)
+		goto out_free_server;
+
+	/* probe the filesystem info for this server filesystem */
+	error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
+	if (error < 0)
+		goto out_free_server;
+
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
+	dprintk("Cloned FSID: %llx:%llx\n",
+		(unsigned long long) server->fsid.major,
+		(unsigned long long) server->fsid.minor);
+
+	error = nfs_start_lockd(server);
+	if (error < 0)
+		goto out_free_server;
+
+	nfs_server_insert_lists(server);
+	server->mount_time = jiffies;
+
+	nfs_free_fattr(fattr_fsinfo);
+	dprintk("<-- nfs_clone_server() = %p\n", server);
+	return server;
+
+out_free_server:
+	nfs_free_fattr(fattr_fsinfo);
+	nfs_free_server(server);
+	dprintk("<-- nfs_clone_server() = error %d\n", error);
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(nfs_clone_server);
+
+void nfs_clients_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	INIT_LIST_HEAD(&nn->nfs_client_list);
+	INIT_LIST_HEAD(&nn->nfs_volume_list);
+#if IS_ENABLED(CONFIG_NFS_V4)
+	idr_init(&nn->cb_ident_idr);
+#endif
+	spin_lock_init(&nn->nfs_client_lock);
+	nn->boot_time = CURRENT_TIME;
+}
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry *proc_fs_nfs;
+
+static int nfs_server_list_open(struct inode *inode, struct file *file);
+static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
+static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
+static void nfs_server_list_stop(struct seq_file *p, void *v);
+static int nfs_server_list_show(struct seq_file *m, void *v);
+
+static const struct seq_operations nfs_server_list_ops = {
+	.start	= nfs_server_list_start,
+	.next	= nfs_server_list_next,
+	.stop	= nfs_server_list_stop,
+	.show	= nfs_server_list_show,
+};
+
+static const struct file_operations nfs_server_list_fops = {
+	.open		= nfs_server_list_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+	.owner		= THIS_MODULE,
+};
+
+static int nfs_volume_list_open(struct inode *inode, struct file *file);
+static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos);
+static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
+static void nfs_volume_list_stop(struct seq_file *p, void *v);
+static int nfs_volume_list_show(struct seq_file *m, void *v);
+
+static const struct seq_operations nfs_volume_list_ops = {
+	.start	= nfs_volume_list_start,
+	.next	= nfs_volume_list_next,
+	.stop	= nfs_volume_list_stop,
+	.show	= nfs_volume_list_show,
+};
+
+static const struct file_operations nfs_volume_list_fops = {
+	.open		= nfs_volume_list_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which
+ * we're dealing
+ */
+static int nfs_server_list_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &nfs_server_list_ops,
+			   sizeof(struct seq_net_private));
+}
+
+/*
+ * set up the iterator to start reading from the server list and return the first item
+ */
+static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
+{
+	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+	/* lock the list against modification */
+	spin_lock(&nn->nfs_client_lock);
+	return seq_list_start_head(&nn->nfs_client_list, *_pos);
+}
+
+/*
+ * move to next server
+ */
+static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+	return seq_list_next(v, &nn->nfs_client_list, pos);
+}
+
+/*
+ * clean up after reading from the transports list
+ */
+static void nfs_server_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
+{
+	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+	spin_unlock(&nn->nfs_client_lock);
+}
+
+/*
+ * display a header line followed by a load of call lines
+ */
+static int nfs_server_list_show(struct seq_file *m, void *v)
+{
+	struct nfs_client *clp;
+	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+	/* display header on line 1 */
+	if (v == &nn->nfs_client_list) {
+		seq_puts(m, "NV SERVER   PORT USE HOSTNAME\n");
+		return 0;
+	}
+
+	/* display one transport per line on subsequent lines */
+	clp = list_entry(v, struct nfs_client, cl_share_link);
+
+	/* Check if the client is initialized */
+	if (clp->cl_cons_state != NFS_CS_READY)
+		return 0;
+
+	rcu_read_lock();
+	seq_printf(m, "v%u %s %s %3d %s\n",
+		   clp->rpc_ops->version,
+		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
+		   atomic_read(&clp->cl_count),
+		   clp->cl_hostname);
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes
+ */
+static int nfs_volume_list_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &nfs_volume_list_ops,
+			   sizeof(struct seq_net_private));
+}
+
+/*
+ * set up the iterator to start reading from the volume list and return the first item
+ */
+static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
+{
+	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+	/* lock the list against modification */
+	spin_lock(&nn->nfs_client_lock);
+	return seq_list_start_head(&nn->nfs_volume_list, *_pos);
+}
+
+/*
+ * move to next volume
+ */
+static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+	return seq_list_next(v, &nn->nfs_volume_list, pos);
+}
+
+/*
+ * clean up after reading from the transports list
+ */
+static void nfs_volume_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
+{
+	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+	spin_unlock(&nn->nfs_client_lock);
+}
+
+/*
+ * display a header line followed by a load of call lines
+ */
+static int nfs_volume_list_show(struct seq_file *m, void *v)
+{
+	struct nfs_server *server;
+	struct nfs_client *clp;
+	char dev[8], fsid[17];
+	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+	/* display header on line 1 */
+	if (v == &nn->nfs_volume_list) {
+		seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");
+		return 0;
+	}
+	/* display one transport per line on subsequent lines */
+	server = list_entry(v, struct nfs_server, master_link);
+	clp = server->nfs_client;
+
+	snprintf(dev, 8, "%u:%u",
+		 MAJOR(server->s_dev), MINOR(server->s_dev));
+
+	snprintf(fsid, 17, "%llx:%llx",
+		 (unsigned long long) server->fsid.major,
+		 (unsigned long long) server->fsid.minor);
+
+	rcu_read_lock();
+	seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
+		   clp->rpc_ops->version,
+		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
+		   dev,
+		   fsid,
+		   nfs_server_fscache_state(server));
+	rcu_read_unlock();
+
+	return 0;
+}
+
+int nfs_fs_proc_net_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct proc_dir_entry *p;
+
+	nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
+	if (!nn->proc_nfsfs)
+		goto error_0;
+
+	/* a file of servers with which we're dealing */
+	p = proc_create("servers", S_IFREG|S_IRUGO,
+			nn->proc_nfsfs, &nfs_server_list_fops);
+	if (!p)
+		goto error_1;
+
+	/* a file of volumes that we have mounted */
+	p = proc_create("volumes", S_IFREG|S_IRUGO,
+			nn->proc_nfsfs, &nfs_volume_list_fops);
+	if (!p)
+		goto error_1;
+	return 0;
+
+error_1:
+	remove_proc_subtree("nfsfs", net->proc_net);
+error_0:
+	return -ENOMEM;
+}
+
+void nfs_fs_proc_net_exit(struct net *net)
+{
+	remove_proc_subtree("nfsfs", net->proc_net);
+}
+
+/*
+ * initialise the /proc/fs/nfsfs/ directory
+ */
+int __init nfs_fs_proc_init(void)
+{
+	struct proc_dir_entry *p;
+
+	proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL);
+	if (!proc_fs_nfs)
+		goto error_0;
+
+	/* a file of servers with which we're dealing */
+	p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
+	if (!p)
+		goto error_1;
+
+	/* a file of volumes that we have mounted */
+	p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
+	if (!p)
+		goto error_2;
+	return 0;
+
+error_2:
+	remove_proc_entry("servers", proc_fs_nfs);
+error_1:
+	remove_proc_entry("fs/nfsfs", NULL);
+error_0:
+	return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/nfsfs/ directory
+ */
+void nfs_fs_proc_exit(void)
+{
+	remove_proc_entry("volumes", proc_fs_nfs);
+	remove_proc_entry("servers", proc_fs_nfs);
+	remove_proc_entry("fs/nfsfs", NULL);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/fs/nfs/delegation.c b/kernel/fs/nfs/delegation.c
new file mode 100644
index 000000000..029d688a9
--- /dev/null
+++ b/kernel/fs/nfs/delegation.c
@@ -0,0 +1,902 @@
+/*
+ * linux/fs/nfs/delegation.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFS file delegation management
+ *
+ */
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "nfs4_fs.h"
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4trace.h"
+
+static void nfs_free_delegation(struct nfs_delegation *delegation)
+{
+	if (delegation->cred) {
+		put_rpccred(delegation->cred);
+		delegation->cred = NULL;
+	}
+	kfree_rcu(delegation, rcu);
+}
+
+/**
+ * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
+ * @delegation: delegation to process
+ *
+ */
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
+{
+	set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
+}
+
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
+{
+	struct nfs_delegation *delegation;
+	int ret = 0;
+
+	flags &= FMODE_READ|FMODE_WRITE;
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation != NULL && (delegation->type & flags) == flags &&
+	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+		if (mark)
+			nfs_mark_delegation_referenced(delegation);
+		ret = 1;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+/**
+ * nfs_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+	return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+	return nfs4_do_check_delegation(inode, flags, false);
+}
+
+static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+	struct inode *inode = state->inode;
+	struct file_lock *fl;
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct list_head *list;
+	int status = 0;
+
+	if (flctx == NULL)
+		goto out;
+
+	list = &flctx->flc_posix;
+	spin_lock(&flctx->flc_lock);
+restart:
+	list_for_each_entry(fl, list, fl_list) {
+		if (nfs_file_open_context(fl->fl_file) != ctx)
+			continue;
+		spin_unlock(&flctx->flc_lock);
+		status = nfs4_lock_delegation_recall(fl, state, stateid);
+		if (status < 0)
+			goto out;
+		spin_lock(&flctx->flc_lock);
+	}
+	if (list == &flctx->flc_posix) {
+		list = &flctx->flc_flock;
+		goto restart;
+	}
+	spin_unlock(&flctx->flc_lock);
+out:
+	return status;
+}
+
+static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_open_context *ctx;
+	struct nfs4_state_owner *sp;
+	struct nfs4_state *state;
+	unsigned int seq;
+	int err;
+
+again:
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(ctx, &nfsi->open_files, list) {
+		state = ctx->state;
+		if (state == NULL)
+			continue;
+		if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+			continue;
+		if (!nfs4_valid_open_stateid(state))
+			continue;
+		if (!nfs4_stateid_match(&state->stateid, stateid))
+			continue;
+		get_nfs_open_context(ctx);
+		spin_unlock(&inode->i_lock);
+		sp = state->owner;
+		/* Block nfs4_proc_unlck */
+		mutex_lock(&sp->so_delegreturn_mutex);
+		seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+		err = nfs4_open_delegation_recall(ctx, state, stateid);
+		if (!err)
+			err = nfs_delegation_claim_locks(ctx, state, stateid);
+		if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+			err = -EAGAIN;
+		mutex_unlock(&sp->so_delegreturn_mutex);
+		put_nfs_open_context(ctx);
+		if (err != 0)
+			return err;
+		goto again;
+	}
+	spin_unlock(&inode->i_lock);
+	return 0;
+}
+
+/**
+ * nfs_inode_reclaim_delegation - process a delegation reclaim request
+ * @inode: inode to process
+ * @cred: credential to use for request
+ * @res: new delegation state from server
+ *
+ */
+void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+				  struct nfs_openres *res)
+{
+	struct nfs_delegation *delegation;
+	struct rpc_cred *oldcred = NULL;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation != NULL) {
+		spin_lock(&delegation->lock);
+		if (delegation->inode != NULL) {
+			nfs4_stateid_copy(&delegation->stateid, &res->delegation);
+			delegation->type = res->delegation_type;
+			delegation->maxsize = res->maxsize;
+			oldcred = delegation->cred;
+			delegation->cred = get_rpccred(cred);
+			clear_bit(NFS_DELEGATION_NEED_RECLAIM,
+				  &delegation->flags);
+			spin_unlock(&delegation->lock);
+			rcu_read_unlock();
+			put_rpccred(oldcred);
+			trace_nfs4_reclaim_delegation(inode, res->delegation_type);
+		} else {
+			/* We appear to have raced with a delegation return. */
+			spin_unlock(&delegation->lock);
+			rcu_read_unlock();
+			nfs_inode_set_delegation(inode, cred, res);
+		}
+	} else {
+		rcu_read_unlock();
+	}
+}
+
+static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+	int res = 0;
+
+	if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+		res = nfs4_proc_delegreturn(inode,
+				delegation->cred,
+				&delegation->stateid,
+				issync);
+	nfs_free_delegation(delegation);
+	return res;
+}
+
+static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
+{
+	struct inode *inode = NULL;
+
+	spin_lock(&delegation->lock);
+	if (delegation->inode != NULL)
+		inode = igrab(delegation->inode);
+	spin_unlock(&delegation->lock);
+	return inode;
+}
+
+static struct nfs_delegation *
+nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
+{
+	struct nfs_delegation *ret = NULL;
+	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+
+	if (delegation == NULL)
+		goto out;
+	spin_lock(&delegation->lock);
+	if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		ret = delegation;
+	spin_unlock(&delegation->lock);
+out:
+	return ret;
+}
+
+static struct nfs_delegation *
+nfs_start_delegation_return(struct nfs_inode *nfsi)
+{
+	struct nfs_delegation *delegation;
+
+	rcu_read_lock();
+	delegation = nfs_start_delegation_return_locked(nfsi);
+	rcu_read_unlock();
+	return delegation;
+}
+
+static void
+nfs_abort_delegation_return(struct nfs_delegation *delegation,
+		struct nfs_client *clp)
+{
+
+	spin_lock(&delegation->lock);
+	clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+	set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+	spin_unlock(&delegation->lock);
+	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+}
+
+static struct nfs_delegation *
+nfs_detach_delegation_locked(struct nfs_inode *nfsi,
+		struct nfs_delegation *delegation,
+		struct nfs_client *clp)
+{
+	struct nfs_delegation *deleg_cur =
+		rcu_dereference_protected(nfsi->delegation,
+				lockdep_is_held(&clp->cl_lock));
+
+	if (deleg_cur == NULL || delegation != deleg_cur)
+		return NULL;
+
+	spin_lock(&delegation->lock);
+	set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+	list_del_rcu(&delegation->super_list);
+	delegation->inode = NULL;
+	rcu_assign_pointer(nfsi->delegation, NULL);
+	spin_unlock(&delegation->lock);
+	return delegation;
+}
+
+static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
+		struct nfs_delegation *delegation,
+		struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	spin_lock(&clp->cl_lock);
+	delegation = nfs_detach_delegation_locked(nfsi, delegation, clp);
+	spin_unlock(&clp->cl_lock);
+	return delegation;
+}
+
+static struct nfs_delegation *
+nfs_inode_detach_delegation(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_delegation *delegation;
+
+	delegation = nfs_start_delegation_return(nfsi);
+	if (delegation == NULL)
+		return NULL;
+	return nfs_detach_delegation(nfsi, delegation, server);
+}
+
+static void
+nfs_update_inplace_delegation(struct nfs_delegation *delegation,
+		const struct nfs_delegation *update)
+{
+	if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
+		delegation->stateid.seqid = update->stateid.seqid;
+		smp_wmb();
+		delegation->type = update->type;
+	}
+}
+
+/**
+ * nfs_inode_set_delegation - set up a delegation on an inode
+ * @inode: inode to which delegation applies
+ * @cred: cred to use for subsequent delegation processing
+ * @res: new delegation state from server
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_delegation *delegation, *old_delegation;
+	struct nfs_delegation *freeme = NULL;
+	int status = 0;
+
+	delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
+	if (delegation == NULL)
+		return -ENOMEM;
+	nfs4_stateid_copy(&delegation->stateid, &res->delegation);
+	delegation->type = res->delegation_type;
+	delegation->maxsize = res->maxsize;
+	delegation->change_attr = inode->i_version;
+	delegation->cred = get_rpccred(cred);
+	delegation->inode = inode;
+	delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+	spin_lock_init(&delegation->lock);
+
+	spin_lock(&clp->cl_lock);
+	old_delegation = rcu_dereference_protected(nfsi->delegation,
+					lockdep_is_held(&clp->cl_lock));
+	if (old_delegation != NULL) {
+		/* Is this an update of the existing delegation? */
+		if (nfs4_stateid_match_other(&old_delegation->stateid,
+					&delegation->stateid)) {
+			nfs_update_inplace_delegation(old_delegation,
+					delegation);
+			goto out;
+		}
+		/*
+		 * Deal with broken servers that hand out two
+		 * delegations for the same file.
+		 * Allow for upgrades to a WRITE delegation, but
+		 * nothing else.
+		 */
+		dfprintk(FILE, "%s: server %s handed out "
+				"a duplicate delegation!\n",
+				__func__, clp->cl_hostname);
+		if (delegation->type == old_delegation->type ||
+		    !(delegation->type & FMODE_WRITE)) {
+			freeme = delegation;
+			delegation = NULL;
+			goto out;
+		}
+		if (test_and_set_bit(NFS_DELEGATION_RETURNING,
+					&old_delegation->flags))
+			goto out;
+		freeme = nfs_detach_delegation_locked(nfsi,
+				old_delegation, clp);
+		if (freeme == NULL)
+			goto out;
+	}
+	list_add_tail_rcu(&delegation->super_list, &server->delegations);
+	rcu_assign_pointer(nfsi->delegation, delegation);
+	delegation = NULL;
+
+	/* Ensure we revalidate the attributes and page cache! */
+	spin_lock(&inode->i_lock);
+	nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
+	spin_unlock(&inode->i_lock);
+	trace_nfs4_set_delegation(inode, res->delegation_type);
+
+out:
+	spin_unlock(&clp->cl_lock);
+	if (delegation != NULL)
+		nfs_free_delegation(delegation);
+	if (freeme != NULL)
+		nfs_do_return_delegation(inode, freeme, 0);
+	return status;
+}
+
+/*
+ * Basic procedure for returning a delegation to the server
+ */
+static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int err = 0;
+
+	if (delegation == NULL)
+		return 0;
+	do {
+		if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+			break;
+		err = nfs_delegation_claim_opens(inode, &delegation->stateid);
+		if (!issync || err != -EAGAIN)
+			break;
+		/*
+		 * Guard against state recovery
+		 */
+		err = nfs4_wait_clnt_recover(clp);
+	} while (err == 0);
+
+	if (err) {
+		nfs_abort_delegation_return(delegation, clp);
+		goto out;
+	}
+	if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode)))
+		goto out;
+
+	err = nfs_do_return_delegation(inode, delegation, issync);
+out:
+	return err;
+}
+
+static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
+{
+	bool ret = false;
+
+	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		goto out;
+	if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+		ret = true;
+	if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) {
+		struct inode *inode;
+
+		spin_lock(&delegation->lock);
+		inode = delegation->inode;
+		if (inode && list_empty(&NFS_I(inode)->open_files))
+			ret = true;
+		spin_unlock(&delegation->lock);
+	}
+out:
+	return ret;
+}
+
+/**
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+	struct nfs_delegation *delegation;
+	struct nfs_server *server;
+	struct inode *inode;
+	int err = 0;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry_rcu(delegation, &server->delegations,
+								super_list) {
+			if (!nfs_delegation_need_return(delegation))
+				continue;
+			if (!nfs_sb_active(server->super))
+				continue;
+			inode = nfs_delegation_grab_inode(delegation);
+			if (inode == NULL) {
+				rcu_read_unlock();
+				nfs_sb_deactive(server->super);
+				goto restart;
+			}
+			delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+			rcu_read_unlock();
+
+			err = nfs_end_delegation_return(inode, delegation, 0);
+			iput(inode);
+			nfs_sb_deactive(server->super);
+			if (!err)
+				goto restart;
+			set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+			return err;
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+/**
+ * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
+ * @inode: inode to process
+ *
+ * Does not protect against delegation reclaims, therefore really only safe
+ * to be called from nfs4_clear_inode().
+ */
+void nfs_inode_return_delegation_noreclaim(struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+
+	delegation = nfs_inode_detach_delegation(inode);
+	if (delegation != NULL)
+		nfs_do_return_delegation(inode, delegation, 1);
+}
+
+/**
+ * nfs_inode_return_delegation - synchronously return a delegation
+ * @inode: inode to process
+ *
+ * This routine will always flush any dirty data to disk on the
+ * assumption that if we need to return the delegation, then
+ * we should stop caching.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_inode_return_delegation(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_delegation *delegation;
+	int err = 0;
+
+	nfs_wb_all(inode);
+	delegation = nfs_start_delegation_return(nfsi);
+	if (delegation != NULL)
+		err = nfs_end_delegation_return(inode, delegation, 1);
+	return err;
+}
+
+static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+		struct nfs_delegation *delegation)
+{
+	set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
+static void nfs_mark_return_delegation(struct nfs_server *server,
+		struct nfs_delegation *delegation)
+{
+	set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
+static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
+{
+	struct nfs_delegation *delegation;
+	bool ret = false;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+		nfs_mark_return_delegation(server, delegation);
+		ret = true;
+	}
+	return ret;
+}
+
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_server_mark_return_all_delegations(server);
+	rcu_read_unlock();
+}
+
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+		nfs4_schedule_state_manager(clp);
+}
+
+/**
+ * nfs_expire_all_delegations
+ * @clp: client to process
+ *
+ */
+void nfs_expire_all_delegations(struct nfs_client *clp)
+{
+	nfs_client_mark_return_all_delegations(clp);
+	nfs_delegation_run_state_manager(clp);
+}
+
+/**
+ * nfs_super_return_all_delegations - return delegations for one superblock
+ * @sb: sb to process
+ *
+ */
+void nfs_server_return_all_delegations(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	bool need_wait;
+
+	if (clp == NULL)
+		return;
+
+	rcu_read_lock();
+	need_wait = nfs_server_mark_return_all_delegations(server);
+	rcu_read_unlock();
+
+	if (need_wait) {
+		nfs4_schedule_state_manager(clp);
+		nfs4_wait_clnt_recover(clp);
+	}
+}
+
+static void nfs_mark_return_unused_delegation_types(struct nfs_server *server,
+						 fmode_t flags)
+{
+	struct nfs_delegation *delegation;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+		if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
+			continue;
+		if (delegation->type & flags)
+			nfs_mark_return_if_closed_delegation(server, delegation);
+	}
+}
+
+static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp,
+							fmode_t flags)
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_mark_return_unused_delegation_types(server, flags);
+	rcu_read_unlock();
+}
+
+static void nfs_revoke_delegation(struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation != NULL) {
+		set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+		nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+	}
+	rcu_read_unlock();
+}
+
+void nfs_remove_bad_delegation(struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+
+	nfs_revoke_delegation(inode);
+	delegation = nfs_inode_detach_delegation(inode);
+	if (delegation) {
+		nfs_inode_find_state_and_recover(inode, &delegation->stateid);
+		nfs_free_delegation(delegation);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
+
+/**
+ * nfs_expire_unused_delegation_types
+ * @clp: client to process
+ * @flags: delegation types to expire
+ *
+ */
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags)
+{
+	nfs_client_mark_return_unused_delegation_types(clp, flags);
+	nfs_delegation_run_state_manager(clp);
+}
+
+static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
+{
+	struct nfs_delegation *delegation;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
+			continue;
+		nfs_mark_return_if_closed_delegation(server, delegation);
+	}
+}
+
+/**
+ * nfs_expire_unreferenced_delegations - Eliminate unused delegations
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_mark_return_unreferenced_delegations(server);
+	rcu_read_unlock();
+
+	nfs_delegation_run_state_manager(clp);
+}
+
+/**
+ * nfs_async_inode_return_delegation - asynchronously return a delegation
+ * @inode: inode to process
+ * @stateid: state ID information
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_async_inode_return_delegation(struct inode *inode,
+				      const nfs4_stateid *stateid)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs_delegation *delegation;
+
+	filemap_flush(inode->i_mapping);
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation == NULL)
+		goto out_enoent;
+
+	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+		goto out_enoent;
+	nfs_mark_return_delegation(server, delegation);
+	rcu_read_unlock();
+
+	nfs_delegation_run_state_manager(clp);
+	return 0;
+out_enoent:
+	rcu_read_unlock();
+	return -ENOENT;
+}
+
+static struct inode *
+nfs_delegation_find_inode_server(struct nfs_server *server,
+				 const struct nfs_fh *fhandle)
+{
+	struct nfs_delegation *delegation;
+	struct inode *res = NULL;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+		spin_lock(&delegation->lock);
+		if (delegation->inode != NULL &&
+		    nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
+			res = igrab(delegation->inode);
+		}
+		spin_unlock(&delegation->lock);
+		if (res != NULL)
+			break;
+	}
+	return res;
+}
+
+/**
+ * nfs_delegation_find_inode - retrieve the inode associated with a delegation
+ * @clp: client state handle
+ * @fhandle: filehandle from a delegation recall
+ *
+ * Returns pointer to inode matching "fhandle," or NULL if a matching inode
+ * cannot be found.
+ */
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
+					const struct nfs_fh *fhandle)
+{
+	struct nfs_server *server;
+	struct inode *res = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		res = nfs_delegation_find_inode_server(server, fhandle);
+		if (res != NULL)
+			break;
+	}
+	rcu_read_unlock();
+	return res;
+}
+
+static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
+{
+	struct nfs_delegation *delegation;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+}
+
+/**
+ * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_delegation_mark_reclaim(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_delegation_mark_reclaim_server(server);
+	rcu_read_unlock();
+}
+
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+{
+	struct nfs_delegation *delegation;
+	struct nfs_server *server;
+	struct inode *inode;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry_rcu(delegation, &server->delegations,
+								super_list) {
+			if (test_bit(NFS_DELEGATION_RETURNING,
+						&delegation->flags))
+				continue;
+			if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
+						&delegation->flags) == 0)
+				continue;
+			if (!nfs_sb_active(server->super))
+				continue;
+			inode = nfs_delegation_grab_inode(delegation);
+			if (inode == NULL) {
+				rcu_read_unlock();
+				nfs_sb_deactive(server->super);
+				goto restart;
+			}
+			delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+			rcu_read_unlock();
+			if (delegation != NULL) {
+				delegation = nfs_detach_delegation(NFS_I(inode),
+					delegation, server);
+				if (delegation != NULL)
+					nfs_free_delegation(delegation);
+			}
+			iput(inode);
+			nfs_sb_deactive(server->super);
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * nfs_delegations_present - check for existence of delegations
+ * @clp: client state handle
+ *
+ * Returns one if there are any nfs_delegation structures attached
+ * to this nfs_client.
+ */
+int nfs_delegations_present(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+	int ret = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		if (!list_empty(&server->delegations)) {
+			ret = 1;
+			break;
+		}
+	rcu_read_unlock();
+	return ret;
+}
+
+/**
+ * nfs4_copy_delegation_stateid - Copy inode's state ID information
+ * @dst: stateid data structure to fill in
+ * @inode: inode to check
+ * @flags: delegation type requirement
+ *
+ * Returns "true" and fills in "dst->data" * if inode had a delegation,
+ * otherwise "false" is returned.
+ */
+bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
+		fmode_t flags)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_delegation *delegation;
+	bool ret;
+
+	flags &= FMODE_READ|FMODE_WRITE;
+	rcu_read_lock();
+	delegation = rcu_dereference(nfsi->delegation);
+	ret = (delegation != NULL && (delegation->type & flags) == flags);
+	if (ret) {
+		nfs4_stateid_copy(dst, &delegation->stateid);
+		nfs_mark_delegation_referenced(delegation);
+	}
+	rcu_read_unlock();
+	return ret;
+}
diff --git a/kernel/fs/nfs/delegation.h b/kernel/fs/nfs/delegation.h
new file mode 100644
index 000000000..e3c20a3cc
--- /dev/null
+++ b/kernel/fs/nfs/delegation.h
@@ -0,0 +1,73 @@
+/*
+ * linux/fs/nfs/delegation.h
+ *
+ * Copyright (c) Trond Myklebust
+ *
+ * Definitions pertaining to NFS delegated files
+ */
+#ifndef FS_NFS_DELEGATION_H
+#define FS_NFS_DELEGATION_H
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * NFSv4 delegation
+ */
+struct nfs_delegation {
+	struct list_head super_list;
+	struct rpc_cred *cred;
+	struct inode *inode;
+	nfs4_stateid stateid;
+	fmode_t type;
+	loff_t maxsize;
+	__u64 change_attr;
+	unsigned long flags;
+	spinlock_t lock;
+	struct rcu_head rcu;
+};
+
+enum {
+	NFS_DELEGATION_NEED_RECLAIM = 0,
+	NFS_DELEGATION_RETURN,
+	NFS_DELEGATION_RETURN_IF_CLOSED,
+	NFS_DELEGATION_REFERENCED,
+	NFS_DELEGATION_RETURNING,
+	NFS_DELEGATION_REVOKED,
+};
+
+int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
+void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
+int nfs4_inode_return_delegation(struct inode *inode);
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_inode_return_delegation_noreclaim(struct inode *inode);
+
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
+void nfs_server_return_all_delegations(struct nfs_server *);
+void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
+int nfs_client_return_marked_delegations(struct nfs_client *clp);
+int nfs_delegations_present(struct nfs_client *clp);
+void nfs_remove_bad_delegation(struct inode *inode);
+
+void nfs_delegation_mark_reclaim(struct nfs_client *clp);
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
+
+/* NFSv4 delegation-related procedures */
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
+bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
+
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
+int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+
+#endif
+
+static inline int nfs_have_delegated_attributes(struct inode *inode)
+{
+	return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) &&
+		!(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+}
+
+#endif
diff --git a/kernel/fs/nfs/dir.c b/kernel/fs/nfs/dir.c
new file mode 100644
index 000000000..b2c8b31b2
--- /dev/null
+++ b/kernel/fs/nfs/dir.c
@@ -0,0 +1,2520 @@
+/*
+ *  linux/fs/nfs/dir.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  nfs directory handling functions
+ *
+ * 10 Apr 1996	Added silly rename for unlink	--okir
+ * 28 Sep 1996	Improved directory cache --okir
+ * 23 Aug 1997  Claus Heine claus@momo.math.rwth-aachen.de 
+ *              Re-implemented silly rename for unlink, newly implemented
+ *              silly rename for nfs_rename() following the suggestions
+ *              of Olaf Kirch (okir) found in this file.
+ *              Following Linus comments on my original hack, this version
+ *              depends only on the dcache stuff and doesn't touch the inode
+ *              layer (iput() and friends).
+ *  6 Jun 1999	Cache readdir lookups in the page cache. -DaveM
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+#include <linux/sched.h>
+#include <linux/kmemleak.h>
+#include <linux/xattr.h>
+
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+
+#include "nfstrace.h"
+
+/* #define NFS_DEBUG_VERBOSE 1 */
+
+static int nfs_opendir(struct inode *, struct file *);
+static int nfs_closedir(struct inode *, struct file *);
+static int nfs_readdir(struct file *, struct dir_context *);
+static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
+static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static void nfs_readdir_clear_array(struct page*);
+
+const struct file_operations nfs_dir_operations = {
+	.llseek		= nfs_llseek_dir,
+	.read		= generic_read_dir,
+	.iterate	= nfs_readdir,
+	.open		= nfs_opendir,
+	.release	= nfs_closedir,
+	.fsync		= nfs_fsync_dir,
+};
+
+const struct address_space_operations nfs_dir_aops = {
+	.freepage = nfs_readdir_clear_array,
+};
+
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+	struct nfs_open_dir_context *ctx;
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (ctx != NULL) {
+		ctx->duped = 0;
+		ctx->attr_gencount = nfsi->attr_gencount;
+		ctx->dir_cookie = 0;
+		ctx->dup_cookie = 0;
+		ctx->cred = get_rpccred(cred);
+		spin_lock(&dir->i_lock);
+		list_add(&ctx->list, &nfsi->open_files);
+		spin_unlock(&dir->i_lock);
+		return ctx;
+	}
+	return  ERR_PTR(-ENOMEM);
+}
+
+static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
+{
+	spin_lock(&dir->i_lock);
+	list_del(&ctx->list);
+	spin_unlock(&dir->i_lock);
+	put_rpccred(ctx->cred);
+	kfree(ctx);
+}
+
+/*
+ * Open file
+ */
+static int
+nfs_opendir(struct inode *inode, struct file *filp)
+{
+	int res = 0;
+	struct nfs_open_dir_context *ctx;
+	struct rpc_cred *cred;
+
+	dfprintk(FILE, "NFS: open dir(%pD2)\n", filp);
+
+	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
+
+	cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return PTR_ERR(cred);
+	ctx = alloc_nfs_open_dir_context(inode, cred);
+	if (IS_ERR(ctx)) {
+		res = PTR_ERR(ctx);
+		goto out;
+	}
+	filp->private_data = ctx;
+	if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
+		/* This is a mountpoint, so d_revalidate will never
+		 * have been called, so we need to refresh the
+		 * inode (for close-open consistency) ourselves.
+		 */
+		__nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	}
+out:
+	put_rpccred(cred);
+	return res;
+}
+
+static int
+nfs_closedir(struct inode *inode, struct file *filp)
+{
+	put_nfs_open_dir_context(file_inode(filp), filp->private_data);
+	return 0;
+}
+
+struct nfs_cache_array_entry {
+	u64 cookie;
+	u64 ino;
+	struct qstr string;
+	unsigned char d_type;
+};
+
+struct nfs_cache_array {
+	int size;
+	int eof_index;
+	u64 last_cookie;
+	struct nfs_cache_array_entry array[0];
+};
+
+typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
+typedef struct {
+	struct file	*file;
+	struct page	*page;
+	struct dir_context *ctx;
+	unsigned long	page_index;
+	u64		*dir_cookie;
+	u64		last_cookie;
+	loff_t		current_index;
+	decode_dirent_t	decode;
+
+	unsigned long	timestamp;
+	unsigned long	gencount;
+	unsigned int	cache_entry_index;
+	unsigned int	plus:1;
+	unsigned int	eof:1;
+} nfs_readdir_descriptor_t;
+
+/*
+ * The caller is responsible for calling nfs_readdir_release_array(page)
+ */
+static
+struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
+{
+	void *ptr;
+	if (page == NULL)
+		return ERR_PTR(-EIO);
+	ptr = kmap(page);
+	if (ptr == NULL)
+		return ERR_PTR(-ENOMEM);
+	return ptr;
+}
+
+static
+void nfs_readdir_release_array(struct page *page)
+{
+	kunmap(page);
+}
+
+/*
+ * we are freeing strings created by nfs_add_to_readdir_array()
+ */
+static
+void nfs_readdir_clear_array(struct page *page)
+{
+	struct nfs_cache_array *array;
+	int i;
+
+	array = kmap_atomic(page);
+	for (i = 0; i < array->size; i++)
+		kfree(array->array[i].string.name);
+	kunmap_atomic(array);
+}
+
+/*
+ * the caller is responsible for freeing qstr.name
+ * when called by nfs_readdir_add_to_array, the strings will be freed in
+ * nfs_clear_readdir_array()
+ */
+static
+int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+{
+	string->len = len;
+	string->name = kmemdup(name, len, GFP_KERNEL);
+	if (string->name == NULL)
+		return -ENOMEM;
+	/*
+	 * Avoid a kmemleak false positive. The pointer to the name is stored
+	 * in a page cache page which kmemleak does not scan.
+	 */
+	kmemleak_not_leak(string->name);
+	string->hash = full_name_hash(name, len);
+	return 0;
+}
+
+static
+int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+{
+	struct nfs_cache_array *array = nfs_readdir_get_array(page);
+	struct nfs_cache_array_entry *cache_entry;
+	int ret;
+
+	if (IS_ERR(array))
+		return PTR_ERR(array);
+
+	cache_entry = &array->array[array->size];
+
+	/* Check that this entry lies within the page bounds */
+	ret = -ENOSPC;
+	if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
+		goto out;
+
+	cache_entry->cookie = entry->prev_cookie;
+	cache_entry->ino = entry->ino;
+	cache_entry->d_type = entry->d_type;
+	ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
+	if (ret)
+		goto out;
+	array->last_cookie = entry->cookie;
+	array->size++;
+	if (entry->eof != 0)
+		array->eof_index = array->size;
+out:
+	nfs_readdir_release_array(page);
+	return ret;
+}
+
+static
+int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+	loff_t diff = desc->ctx->pos - desc->current_index;
+	unsigned int index;
+
+	if (diff < 0)
+		goto out_eof;
+	if (diff >= array->size) {
+		if (array->eof_index >= 0)
+			goto out_eof;
+		return -EAGAIN;
+	}
+
+	index = (unsigned int)diff;
+	*desc->dir_cookie = array->array[index].cookie;
+	desc->cache_entry_index = index;
+	return 0;
+out_eof:
+	desc->eof = 1;
+	return -EBADCOOKIE;
+}
+
+static bool
+nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
+{
+	if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+		return false;
+	smp_rmb();
+	return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
+}
+
+static
+int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+	int i;
+	loff_t new_pos;
+	int status = -EAGAIN;
+
+	for (i = 0; i < array->size; i++) {
+		if (array->array[i].cookie == *desc->dir_cookie) {
+			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
+			struct nfs_open_dir_context *ctx = desc->file->private_data;
+
+			new_pos = desc->current_index + i;
+			if (ctx->attr_gencount != nfsi->attr_gencount ||
+			    !nfs_readdir_inode_mapping_valid(nfsi)) {
+				ctx->duped = 0;
+				ctx->attr_gencount = nfsi->attr_gencount;
+			} else if (new_pos < desc->ctx->pos) {
+				if (ctx->duped > 0
+				    && ctx->dup_cookie == *desc->dir_cookie) {
+					if (printk_ratelimit()) {
+						pr_notice("NFS: directory %pD2 contains a readdir loop."
+								"Please contact your server vendor.  "
+								"The file: %.*s has duplicate cookie %llu\n",
+								desc->file, array->array[i].string.len,
+								array->array[i].string.name, *desc->dir_cookie);
+					}
+					status = -ELOOP;
+					goto out;
+				}
+				ctx->dup_cookie = *desc->dir_cookie;
+				ctx->duped = -1;
+			}
+			desc->ctx->pos = new_pos;
+			desc->cache_entry_index = i;
+			return 0;
+		}
+	}
+	if (array->eof_index >= 0) {
+		status = -EBADCOOKIE;
+		if (*desc->dir_cookie == array->last_cookie)
+			desc->eof = 1;
+	}
+out:
+	return status;
+}
+
+static
+int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+{
+	struct nfs_cache_array *array;
+	int status;
+
+	array = nfs_readdir_get_array(desc->page);
+	if (IS_ERR(array)) {
+		status = PTR_ERR(array);
+		goto out;
+	}
+
+	if (*desc->dir_cookie == 0)
+		status = nfs_readdir_search_for_pos(array, desc);
+	else
+		status = nfs_readdir_search_for_cookie(array, desc);
+
+	if (status == -EAGAIN) {
+		desc->last_cookie = array->last_cookie;
+		desc->current_index += array->size;
+		desc->page_index++;
+	}
+	nfs_readdir_release_array(desc->page);
+out:
+	return status;
+}
+
+/* Fill a page with xdr information before transferring to the cache page */
+static
+int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
+			struct nfs_entry *entry, struct file *file, struct inode *inode)
+{
+	struct nfs_open_dir_context *ctx = file->private_data;
+	struct rpc_cred	*cred = ctx->cred;
+	unsigned long	timestamp, gencount;
+	int		error;
+
+ again:
+	timestamp = jiffies;
+	gencount = nfs_inc_attr_generation_counter();
+	error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
+					  NFS_SERVER(inode)->dtsize, desc->plus);
+	if (error < 0) {
+		/* We requested READDIRPLUS, but the server doesn't grok it */
+		if (error == -ENOTSUPP && desc->plus) {
+			NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
+			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+			desc->plus = 0;
+			goto again;
+		}
+		goto error;
+	}
+	desc->timestamp = timestamp;
+	desc->gencount = gencount;
+error:
+	return error;
+}
+
+static int xdr_decode(nfs_readdir_descriptor_t *desc,
+		      struct nfs_entry *entry, struct xdr_stream *xdr)
+{
+	int error;
+
+	error = desc->decode(xdr, entry, desc->plus);
+	if (error)
+		return error;
+	entry->fattr->time_start = desc->timestamp;
+	entry->fattr->gencount = desc->gencount;
+	return 0;
+}
+
+/* Match file and dirent using either filehandle or fileid
+ * Note: caller is responsible for checking the fsid
+ */
+static
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
+{
+	struct nfs_inode *nfsi;
+
+	if (d_really_is_negative(dentry))
+		return 0;
+
+	nfsi = NFS_I(d_inode(dentry));
+	if (entry->fattr->fileid == nfsi->fileid)
+		return 1;
+	if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
+		return 1;
+	return 0;
+}
+
+static
+bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
+		return false;
+	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
+		return true;
+	if (ctx->pos == 0)
+		return true;
+	return false;
+}
+
+/*
+ * This function is called by the lookup code to request the use of
+ * readdirplus to accelerate any future lookups in the same
+ * directory.
+ */
+static
+void nfs_advise_use_readdirplus(struct inode *dir)
+{
+	set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
+}
+
+/*
+ * This function is mainly for use by nfs_getattr().
+ *
+ * If this is an 'ls -l', we want to force use of readdirplus.
+ * Do this by checking if there is an active file descriptor
+ * and calling nfs_advise_use_readdirplus, then forcing a
+ * cache flush.
+ */
+void nfs_force_use_readdirplus(struct inode *dir)
+{
+	if (!list_empty(&NFS_I(dir)->open_files)) {
+		nfs_advise_use_readdirplus(dir);
+		nfs_zap_mapping(dir, dir->i_mapping);
+	}
+}
+
+static
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
+{
+	struct qstr filename = QSTR_INIT(entry->name, entry->len);
+	struct dentry *dentry;
+	struct dentry *alias;
+	struct inode *dir = d_inode(parent);
+	struct inode *inode;
+	int status;
+
+	if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID))
+		return;
+	if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
+		return;
+	if (filename.name[0] == '.') {
+		if (filename.len == 1)
+			return;
+		if (filename.len == 2 && filename.name[1] == '.')
+			return;
+	}
+	filename.hash = full_name_hash(filename.name, filename.len);
+
+	dentry = d_lookup(parent, &filename);
+	if (dentry != NULL) {
+		/* Is there a mountpoint here? If so, just exit */
+		if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
+					&entry->fattr->fsid))
+			goto out;
+		if (nfs_same_file(dentry, entry)) {
+			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+			status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
+			if (!status)
+				nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label);
+			goto out;
+		} else {
+			d_invalidate(dentry);
+			dput(dentry);
+		}
+	}
+
+	dentry = d_alloc(parent, &filename);
+	if (dentry == NULL)
+		return;
+
+	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
+	if (IS_ERR(inode))
+		goto out;
+
+	alias = d_splice_alias(inode, dentry);
+	if (IS_ERR(alias))
+		goto out;
+	else if (alias) {
+		nfs_set_verifier(alias, nfs_save_change_attribute(dir));
+		dput(alias);
+	} else
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+out:
+	dput(dentry);
+}
+
+/* Perform conversion from xdr to cache array */
+static
+int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+				struct page **xdr_pages, struct page *page, unsigned int buflen)
+{
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	struct nfs_cache_array *array;
+	unsigned int count = 0;
+	int status;
+
+	scratch = alloc_page(GFP_KERNEL);
+	if (scratch == NULL)
+		return -ENOMEM;
+
+	if (buflen == 0)
+		goto out_nopages;
+
+	xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	do {
+		status = xdr_decode(desc, entry, &stream);
+		if (status != 0) {
+			if (status == -EAGAIN)
+				status = 0;
+			break;
+		}
+
+		count++;
+
+		if (desc->plus != 0)
+			nfs_prime_dcache(desc->file->f_path.dentry, entry);
+
+		status = nfs_readdir_add_to_array(entry, page);
+		if (status != 0)
+			break;
+	} while (!entry->eof);
+
+out_nopages:
+	if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
+		array = nfs_readdir_get_array(page);
+		if (!IS_ERR(array)) {
+			array->eof_index = array->size;
+			status = 0;
+			nfs_readdir_release_array(page);
+		} else
+			status = PTR_ERR(array);
+	}
+
+	put_page(scratch);
+	return status;
+}
+
+static
+void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+{
+	unsigned int i;
+	for (i = 0; i < npages; i++)
+		put_page(pages[i]);
+}
+
+static
+void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+		unsigned int npages)
+{
+	nfs_readdir_free_pagearray(pages, npages);
+}
+
+/*
+ * nfs_readdir_large_page will allocate pages that must be freed with a call
+ * to nfs_readdir_free_large_page
+ */
+static
+int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+{
+	unsigned int i;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = alloc_page(GFP_KERNEL);
+		if (page == NULL)
+			goto out_freepages;
+		pages[i] = page;
+	}
+	return 0;
+
+out_freepages:
+	nfs_readdir_free_pagearray(pages, i);
+	return -ENOMEM;
+}
+
+static
+int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+{
+	struct page *pages[NFS_MAX_READDIR_PAGES];
+	void *pages_ptr = NULL;
+	struct nfs_entry entry;
+	struct file	*file = desc->file;
+	struct nfs_cache_array *array;
+	int status = -ENOMEM;
+	unsigned int array_size = ARRAY_SIZE(pages);
+
+	entry.prev_cookie = 0;
+	entry.cookie = desc->last_cookie;
+	entry.eof = 0;
+	entry.fh = nfs_alloc_fhandle();
+	entry.fattr = nfs_alloc_fattr();
+	entry.server = NFS_SERVER(inode);
+	if (entry.fh == NULL || entry.fattr == NULL)
+		goto out;
+
+	entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+	if (IS_ERR(entry.label)) {
+		status = PTR_ERR(entry.label);
+		goto out;
+	}
+
+	array = nfs_readdir_get_array(page);
+	if (IS_ERR(array)) {
+		status = PTR_ERR(array);
+		goto out_label_free;
+	}
+	memset(array, 0, sizeof(struct nfs_cache_array));
+	array->eof_index = -1;
+
+	status = nfs_readdir_large_page(pages, array_size);
+	if (status < 0)
+		goto out_release_array;
+	do {
+		unsigned int pglen;
+		status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
+
+		if (status < 0)
+			break;
+		pglen = status;
+		status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
+		if (status < 0) {
+			if (status == -ENOSPC)
+				status = 0;
+			break;
+		}
+	} while (array->eof_index < 0);
+
+	nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+out_release_array:
+	nfs_readdir_release_array(page);
+out_label_free:
+	nfs4_label_free(entry.label);
+out:
+	nfs_free_fattr(entry.fattr);
+	nfs_free_fhandle(entry.fh);
+	return status;
+}
+
+/*
+ * Now we cache directories properly, by converting xdr information
+ * to an array that can be used for lookups later.  This results in
+ * fewer cache pages, since we can store more information on each page.
+ * We only need to convert from xdr once so future lookups are much simpler
+ */
+static
+int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
+{
+	struct inode	*inode = file_inode(desc->file);
+	int ret;
+
+	ret = nfs_readdir_xdr_to_array(desc, page, inode);
+	if (ret < 0)
+		goto error;
+	SetPageUptodate(page);
+
+	if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
+		/* Should never happen */
+		nfs_zap_mapping(inode, inode->i_mapping);
+	}
+	unlock_page(page);
+	return 0;
+ error:
+	unlock_page(page);
+	return ret;
+}
+
+static
+void cache_page_release(nfs_readdir_descriptor_t *desc)
+{
+	if (!desc->page->mapping)
+		nfs_readdir_clear_array(desc->page);
+	page_cache_release(desc->page);
+	desc->page = NULL;
+}
+
+static
+struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
+{
+	return read_cache_page(file_inode(desc->file)->i_mapping,
+			desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+}
+
+/*
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
+ */
+static
+int find_cache_page(nfs_readdir_descriptor_t *desc)
+{
+	int res;
+
+	desc->page = get_cache_page(desc);
+	if (IS_ERR(desc->page))
+		return PTR_ERR(desc->page);
+
+	res = nfs_readdir_search_array(desc);
+	if (res != 0)
+		cache_page_release(desc);
+	return res;
+}
+
+/* Search for desc->dir_cookie from the beginning of the page cache */
+static inline
+int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
+{
+	int res;
+
+	if (desc->page_index == 0) {
+		desc->current_index = 0;
+		desc->last_cookie = 0;
+	}
+	do {
+		res = find_cache_page(desc);
+	} while (res == -EAGAIN);
+	return res;
+}
+
+/*
+ * Once we've found the start of the dirent within a page: fill 'er up...
+ */
+static 
+int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
+{
+	struct file	*file = desc->file;
+	int i = 0;
+	int res = 0;
+	struct nfs_cache_array *array = NULL;
+	struct nfs_open_dir_context *ctx = file->private_data;
+
+	array = nfs_readdir_get_array(desc->page);
+	if (IS_ERR(array)) {
+		res = PTR_ERR(array);
+		goto out;
+	}
+
+	for (i = desc->cache_entry_index; i < array->size; i++) {
+		struct nfs_cache_array_entry *ent;
+
+		ent = &array->array[i];
+		if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
+		    nfs_compat_user_ino64(ent->ino), ent->d_type)) {
+			desc->eof = 1;
+			break;
+		}
+		desc->ctx->pos++;
+		if (i < (array->size-1))
+			*desc->dir_cookie = array->array[i+1].cookie;
+		else
+			*desc->dir_cookie = array->last_cookie;
+		if (ctx->duped != 0)
+			ctx->duped = 1;
+	}
+	if (array->eof_index >= 0)
+		desc->eof = 1;
+
+	nfs_readdir_release_array(desc->page);
+out:
+	cache_page_release(desc);
+	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
+			(unsigned long long)*desc->dir_cookie, res);
+	return res;
+}
+
+/*
+ * If we cannot find a cookie in our cache, we suspect that this is
+ * because it points to a deleted file, so we ask the server to return
+ * whatever it thinks is the next entry. We then feed this to filldir.
+ * If all goes well, we should then be able to find our way round the
+ * cache on the next call to readdir_search_pagecache();
+ *
+ * NOTE: we cannot add the anonymous page to the pagecache because
+ *	 the data it contains might not be page aligned. Besides,
+ *	 we should already have a complete representation of the
+ *	 directory in the page cache by the time we get here.
+ */
+static inline
+int uncached_readdir(nfs_readdir_descriptor_t *desc)
+{
+	struct page	*page = NULL;
+	int		status;
+	struct inode *inode = file_inode(desc->file);
+	struct nfs_open_dir_context *ctx = desc->file->private_data;
+
+	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
+			(unsigned long long)*desc->dir_cookie);
+
+	page = alloc_page(GFP_HIGHUSER);
+	if (!page) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	desc->page_index = 0;
+	desc->last_cookie = *desc->dir_cookie;
+	desc->page = page;
+	ctx->duped = 0;
+
+	status = nfs_readdir_xdr_to_array(desc, page, inode);
+	if (status < 0)
+		goto out_release;
+
+	status = nfs_do_filldir(desc);
+
+ out:
+	dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
+			__func__, status);
+	return status;
+ out_release:
+	cache_page_release(desc);
+	goto out;
+}
+
+static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+
+	if (nfs_attribute_cache_expired(dir))
+		return true;
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+		return true;
+	return false;
+}
+
+/* The file offset position represents the dirent entry number.  A
+   last cookie cache takes care of the common case of reading the
+   whole directory.
+ */
+static int nfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry	*dentry = file->f_path.dentry;
+	struct inode	*inode = d_inode(dentry);
+	nfs_readdir_descriptor_t my_desc,
+			*desc = &my_desc;
+	struct nfs_open_dir_context *dir_ctx = file->private_data;
+	int res = 0;
+
+	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
+			file, (long long)ctx->pos);
+	nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
+
+	/*
+	 * ctx->pos points to the dirent entry number.
+	 * *desc->dir_cookie has the cookie for the next entry. We have
+	 * to either find the entry with the appropriate number or
+	 * revalidate the cookie.
+	 */
+	memset(desc, 0, sizeof(*desc));
+
+	desc->file = file;
+	desc->ctx = ctx;
+	desc->dir_cookie = &dir_ctx->dir_cookie;
+	desc->decode = NFS_PROTO(inode)->decode_dirent;
+	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
+
+	nfs_block_sillyrename(dentry);
+	if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
+		res = nfs_revalidate_mapping(inode, file->f_mapping);
+	if (res < 0)
+		goto out;
+
+	do {
+		res = readdir_search_pagecache(desc);
+
+		if (res == -EBADCOOKIE) {
+			res = 0;
+			/* This means either end of directory */
+			if (*desc->dir_cookie && desc->eof == 0) {
+				/* Or that the server has 'lost' a cookie */
+				res = uncached_readdir(desc);
+				if (res == 0)
+					continue;
+			}
+			break;
+		}
+		if (res == -ETOOSMALL && desc->plus) {
+			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+			nfs_zap_caches(inode);
+			desc->page_index = 0;
+			desc->plus = 0;
+			desc->eof = 0;
+			continue;
+		}
+		if (res < 0)
+			break;
+
+		res = nfs_do_filldir(desc);
+		if (res < 0)
+			break;
+	} while (!desc->eof);
+out:
+	nfs_unblock_sillyrename(dentry);
+	if (res > 0)
+		res = 0;
+	dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
+	return res;
+}
+
+static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
+{
+	struct inode *inode = file_inode(filp);
+	struct nfs_open_dir_context *dir_ctx = filp->private_data;
+
+	dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
+			filp, offset, whence);
+
+	mutex_lock(&inode->i_mutex);
+	switch (whence) {
+		case 1:
+			offset += filp->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			offset = -EINVAL;
+			goto out;
+	}
+	if (offset != filp->f_pos) {
+		filp->f_pos = offset;
+		dir_ctx->dir_cookie = 0;
+		dir_ctx->duped = 0;
+	}
+out:
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
+/*
+ * All directory operations under NFS are synchronous, so fsync()
+ * is a dummy operation.
+ */
+static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
+			 int datasync)
+{
+	struct inode *inode = file_inode(filp);
+
+	dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
+
+	mutex_lock(&inode->i_mutex);
+	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
+	mutex_unlock(&inode->i_mutex);
+	return 0;
+}
+
+/**
+ * nfs_force_lookup_revalidate - Mark the directory as having changed
+ * @dir - pointer to directory inode
+ *
+ * This forces the revalidation code in nfs_lookup_revalidate() to do a
+ * full lookup on all child dentries of 'dir' whenever a change occurs
+ * on the server that might have invalidated our dcache.
+ *
+ * The caller should be holding dir->i_lock
+ */
+void nfs_force_lookup_revalidate(struct inode *dir)
+{
+	NFS_I(dir)->cache_change_attribute++;
+}
+EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
+
+/*
+ * A check for whether or not the parent directory has changed.
+ * In the case it has, we assume that the dentries are untrustworthy
+ * and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
+ */
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+			      int rcu_walk)
+{
+	int ret;
+
+	if (IS_ROOT(dentry))
+		return 1;
+	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+		return 0;
+	if (!nfs_verify_change_attribute(dir, dentry->d_time))
+		return 0;
+	/* Revalidate nfsi->cache_change_attribute before we declare a match */
+	if (rcu_walk)
+		ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+	else
+		ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+	if (ret < 0)
+		return 0;
+	if (!nfs_verify_change_attribute(dir, dentry->d_time))
+		return 0;
+	return 1;
+}
+
+/*
+ * Use intent information to check whether or not we're going to do
+ * an O_EXCL create using this path component.
+ */
+static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
+{
+	if (NFS_PROTO(dir)->version == 2)
+		return 0;
+	return flags & LOOKUP_EXCL;
+}
+
+/*
+ * Inode and filehandle revalidation for lookups.
+ *
+ * We force revalidation in the cases where the VFS sets LOOKUP_REVAL,
+ * or if the intent information indicates that we're about to open this
+ * particular file and the "nocto" mount flag is not set.
+ *
+ */
+static
+int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	int ret;
+
+	if (IS_AUTOMOUNT(inode))
+		return 0;
+	/* VFS wants an on-the-wire revalidation */
+	if (flags & LOOKUP_REVAL)
+		goto out_force;
+	/* This is an open(2) */
+	if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) &&
+	    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+		goto out_force;
+out:
+	return (inode->i_nlink == 0) ? -ENOENT : 0;
+out_force:
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+	ret = __nfs_revalidate_inode(server, inode);
+	if (ret != 0)
+		return ret;
+	goto out;
+}
+
+/*
+ * We judge how long we want to trust negative
+ * dentries by looking at the parent inode mtime.
+ *
+ * If parent mtime has changed, we revalidate, else we wait for a
+ * period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
+ */
+static inline
+int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
+		       unsigned int flags)
+{
+	/* Don't revalidate a negative dentry if we're creating a new file */
+	if (flags & LOOKUP_CREATE)
+		return 0;
+	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
+		return 1;
+	return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
+}
+
+/*
+ * This is called every time the dcache has a lookup hit,
+ * and we should check whether we can really trust that
+ * lookup.
+ *
+ * NOTE! The hit can be a negative hit too, don't assume
+ * we have an inode!
+ *
+ * If the parent directory is seen to have changed, we throw out the
+ * cached dentry and do a new lookup.
+ */
+static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *dir;
+	struct inode *inode;
+	struct dentry *parent;
+	struct nfs_fh *fhandle = NULL;
+	struct nfs_fattr *fattr = NULL;
+	struct nfs4_label *label = NULL;
+	int error;
+
+	if (flags & LOOKUP_RCU) {
+		parent = ACCESS_ONCE(dentry->d_parent);
+		dir = d_inode_rcu(parent);
+		if (!dir)
+			return -ECHILD;
+	} else {
+		parent = dget_parent(dentry);
+		dir = d_inode(parent);
+	}
+	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
+	inode = d_inode(dentry);
+
+	if (!inode) {
+		if (nfs_neg_need_reval(dir, dentry, flags)) {
+			if (flags & LOOKUP_RCU)
+				return -ECHILD;
+			goto out_bad;
+		}
+		goto out_valid_noent;
+	}
+
+	if (is_bad_inode(inode)) {
+		if (flags & LOOKUP_RCU)
+			return -ECHILD;
+		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
+				__func__, dentry);
+		goto out_bad;
+	}
+
+	if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+		goto out_set_verifier;
+
+	/* Force a full look up iff the parent directory has changed */
+	if (!nfs_is_exclusive_create(dir, flags) &&
+	    nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+
+		if (nfs_lookup_verify_inode(inode, flags)) {
+			if (flags & LOOKUP_RCU)
+				return -ECHILD;
+			goto out_zap_parent;
+		}
+		goto out_valid;
+	}
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	if (NFS_STALE(inode))
+		goto out_bad;
+
+	error = -ENOMEM;
+	fhandle = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	if (fhandle == NULL || fattr == NULL)
+		goto out_error;
+
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+	if (IS_ERR(label))
+		goto out_error;
+
+	trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+	trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
+	if (error)
+		goto out_bad;
+	if (nfs_compare_fh(NFS_FH(inode), fhandle))
+		goto out_bad;
+	if ((error = nfs_refresh_inode(inode, fattr)) != 0)
+		goto out_bad;
+
+	nfs_setsecurity(inode, fattr, label);
+
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	nfs4_label_free(label);
+
+out_set_verifier:
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ out_valid:
+	/* Success: notify readdir to use READDIRPLUS */
+	nfs_advise_use_readdirplus(dir);
+ out_valid_noent:
+	if (flags & LOOKUP_RCU) {
+		if (parent != ACCESS_ONCE(dentry->d_parent))
+			return -ECHILD;
+	} else
+		dput(parent);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
+			__func__, dentry);
+	return 1;
+out_zap_parent:
+	nfs_zap_caches(dir);
+ out_bad:
+	WARN_ON(flags & LOOKUP_RCU);
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	nfs4_label_free(label);
+	nfs_mark_for_revalidate(dir);
+	if (inode && S_ISDIR(inode->i_mode)) {
+		/* Purge readdir caches. */
+		nfs_zap_caches(inode);
+		/*
+		 * We can't d_drop the root of a disconnected tree:
+		 * its d_hash is on the s_anon list and d_drop() would hide
+		 * it from shrink_dcache_for_unmount(), leading to busy
+		 * inodes on unmount and further oopses.
+		 */
+		if (IS_ROOT(dentry))
+			goto out_valid;
+	}
+	dput(parent);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
+			__func__, dentry);
+	return 0;
+out_error:
+	WARN_ON(flags & LOOKUP_RCU);
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	nfs4_label_free(label);
+	dput(parent);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
+			__func__, dentry, error);
+	return error;
+}
+
+/*
+ * A weaker form of d_revalidate for revalidating just the d_inode(dentry)
+ * when we don't really care about the dentry name. This is called when a
+ * pathwalk ends on a dentry that was not found via a normal lookup in the
+ * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals).
+ *
+ * In this situation, we just want to verify that the inode itself is OK
+ * since the dentry might have changed on the server.
+ */
+static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	int error;
+	struct inode *inode = d_inode(dentry);
+
+	/*
+	 * I believe we can only get a negative dentry here in the case of a
+	 * procfs-style symlink. Just assume it's correct for now, but we may
+	 * eventually need to do something more here.
+	 */
+	if (!inode) {
+		dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n",
+				__func__, dentry);
+		return 1;
+	}
+
+	if (is_bad_inode(inode)) {
+		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
+				__func__, dentry);
+		return 0;
+	}
+
+	error = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n",
+			__func__, inode->i_ino, error ? "invalid" : "valid");
+	return !error;
+}
+
+/*
+ * This is called from dput() when d_count is going to 0.
+ */
+static int nfs_dentry_delete(const struct dentry *dentry)
+{
+	dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n",
+		dentry, dentry->d_flags);
+
+	/* Unhash any dentry with a stale inode */
+	if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry)))
+		return 1;
+
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		/* Unhash it, so that ->d_iput() would be called */
+		return 1;
+	}
+	if (!(dentry->d_sb->s_flags & MS_ACTIVE)) {
+		/* Unhash it, so that ancestors of killed async unlink
+		 * files will be cleaned up during umount */
+		return 1;
+	}
+	return 0;
+
+}
+
+/* Ensure that we revalidate inode->i_nlink */
+static void nfs_drop_nlink(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	/* drop the inode if we're reasonably sure this is the last link */
+	if (inode->i_nlink == 1)
+		clear_nlink(inode);
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Called when the dentry loses inode.
+ * We use it to clean up silly-renamed files.
+ */
+static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode))
+		/* drop any readdir cache as it could easily be old */
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		nfs_complete_unlink(dentry, inode);
+		nfs_drop_nlink(inode);
+	}
+	iput(inode);
+}
+
+static void nfs_d_release(struct dentry *dentry)
+{
+	/* free cached devname value, if it survived that far */
+	if (unlikely(dentry->d_fsdata)) {
+		if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+			WARN_ON(1);
+		else
+			kfree(dentry->d_fsdata);
+	}
+}
+
+const struct dentry_operations nfs_dentry_operations = {
+	.d_revalidate	= nfs_lookup_revalidate,
+	.d_weak_revalidate	= nfs_weak_revalidate,
+	.d_delete	= nfs_dentry_delete,
+	.d_iput		= nfs_dentry_iput,
+	.d_automount	= nfs_d_automount,
+	.d_release	= nfs_d_release,
+};
+EXPORT_SYMBOL_GPL(nfs_dentry_operations);
+
+struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+{
+	struct dentry *res;
+	struct dentry *parent;
+	struct inode *inode = NULL;
+	struct nfs_fh *fhandle = NULL;
+	struct nfs_fattr *fattr = NULL;
+	struct nfs4_label *label = NULL;
+	int error;
+
+	dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
+	nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
+
+	res = ERR_PTR(-ENAMETOOLONG);
+	if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+		goto out;
+
+	/*
+	 * If we're doing an exclusive create, optimize away the lookup
+	 * but don't hash the dentry.
+	 */
+	if (nfs_is_exclusive_create(dir, flags)) {
+		d_instantiate(dentry, NULL);
+		res = NULL;
+		goto out;
+	}
+
+	res = ERR_PTR(-ENOMEM);
+	fhandle = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	if (fhandle == NULL || fattr == NULL)
+		goto out;
+
+	label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
+	if (IS_ERR(label))
+		goto out;
+
+	parent = dentry->d_parent;
+	/* Protect against concurrent sillydeletes */
+	trace_nfs_lookup_enter(dir, dentry, flags);
+	nfs_block_sillyrename(parent);
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+	if (error == -ENOENT)
+		goto no_entry;
+	if (error < 0) {
+		res = ERR_PTR(error);
+		goto out_unblock_sillyrename;
+	}
+	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
+	res = ERR_CAST(inode);
+	if (IS_ERR(res))
+		goto out_unblock_sillyrename;
+
+	/* Success: notify readdir to use READDIRPLUS */
+	nfs_advise_use_readdirplus(dir);
+
+no_entry:
+	res = d_splice_alias(inode, dentry);
+	if (res != NULL) {
+		if (IS_ERR(res))
+			goto out_unblock_sillyrename;
+		dentry = res;
+	}
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+out_unblock_sillyrename:
+	nfs_unblock_sillyrename(parent);
+	trace_nfs_lookup_exit(dir, dentry, flags, error);
+	nfs4_label_free(label);
+out:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	return res;
+}
+EXPORT_SYMBOL_GPL(nfs_lookup);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
+
+const struct dentry_operations nfs4_dentry_operations = {
+	.d_revalidate	= nfs4_lookup_revalidate,
+	.d_delete	= nfs_dentry_delete,
+	.d_iput		= nfs_dentry_iput,
+	.d_automount	= nfs_d_automount,
+	.d_release	= nfs_d_release,
+};
+EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
+
+static fmode_t flags_to_mode(int flags)
+{
+	fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
+	if ((flags & O_ACCMODE) != O_WRONLY)
+		res |= FMODE_READ;
+	if ((flags & O_ACCMODE) != O_RDONLY)
+		res |= FMODE_WRITE;
+	return res;
+}
+
+static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)
+{
+	return alloc_nfs_open_context(dentry, flags_to_mode(open_flags));
+}
+
+static int do_open(struct inode *inode, struct file *filp)
+{
+	nfs_fscache_open_file(inode, filp);
+	return 0;
+}
+
+static int nfs_finish_open(struct nfs_open_context *ctx,
+			   struct dentry *dentry,
+			   struct file *file, unsigned open_flags,
+			   int *opened)
+{
+	int err;
+
+	if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		*opened |= FILE_CREATED;
+
+	err = finish_open(file, dentry, do_open, opened);
+	if (err)
+		goto out;
+	nfs_file_set_open_context(file, ctx);
+
+out:
+	return err;
+}
+
+int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
+		    struct file *file, unsigned open_flags,
+		    umode_t mode, int *opened)
+{
+	struct nfs_open_context *ctx;
+	struct dentry *res;
+	struct iattr attr = { .ia_valid = ATTR_OPEN };
+	struct inode *inode;
+	unsigned int lookup_flags = 0;
+	int err;
+
+	/* Expect a negative dentry */
+	BUG_ON(d_inode(dentry));
+
+	dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
+
+	err = nfs_check_flags(open_flags);
+	if (err)
+		return err;
+
+	/* NFS only supports OPEN on regular files */
+	if ((open_flags & O_DIRECTORY)) {
+		if (!d_unhashed(dentry)) {
+			/*
+			 * Hashed negative dentry with O_DIRECTORY: dentry was
+			 * revalidated and is fine, no need to perform lookup
+			 * again
+			 */
+			return -ENOENT;
+		}
+		lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY;
+		goto no_open;
+	}
+
+	if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+		return -ENAMETOOLONG;
+
+	if (open_flags & O_CREAT) {
+		attr.ia_valid |= ATTR_MODE;
+		attr.ia_mode = mode & ~current_umask();
+	}
+	if (open_flags & O_TRUNC) {
+		attr.ia_valid |= ATTR_SIZE;
+		attr.ia_size = 0;
+	}
+
+	ctx = create_nfs_open_context(dentry, open_flags);
+	err = PTR_ERR(ctx);
+	if (IS_ERR(ctx))
+		goto out;
+
+	trace_nfs_atomic_open_enter(dir, ctx, open_flags);
+	nfs_block_sillyrename(dentry->d_parent);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened);
+	nfs_unblock_sillyrename(dentry->d_parent);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
+		put_nfs_open_context(ctx);
+		switch (err) {
+		case -ENOENT:
+			d_drop(dentry);
+			d_add(dentry, NULL);
+			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+			break;
+		case -EISDIR:
+		case -ENOTDIR:
+			goto no_open;
+		case -ELOOP:
+			if (!(open_flags & O_NOFOLLOW))
+				goto no_open;
+			break;
+			/* case -EINVAL: */
+		default:
+			break;
+		}
+		goto out;
+	}
+
+	err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
+	trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
+	put_nfs_open_context(ctx);
+out:
+	return err;
+
+no_open:
+	res = nfs_lookup(dir, dentry, lookup_flags);
+	err = PTR_ERR(res);
+	if (IS_ERR(res))
+		goto out;
+
+	return finish_no_open(file, res);
+}
+EXPORT_SYMBOL_GPL(nfs_atomic_open);
+
+static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode;
+	int ret = 0;
+
+	if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
+		goto no_open;
+	if (d_mountpoint(dentry))
+		goto no_open;
+	if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
+		goto no_open;
+
+	inode = d_inode(dentry);
+
+	/* We can't create new files in nfs_open_revalidate(), so we
+	 * optimize away revalidation of negative dentries.
+	 */
+	if (inode == NULL) {
+		struct dentry *parent;
+		struct inode *dir;
+
+		if (flags & LOOKUP_RCU) {
+			parent = ACCESS_ONCE(dentry->d_parent);
+			dir = d_inode_rcu(parent);
+			if (!dir)
+				return -ECHILD;
+		} else {
+			parent = dget_parent(dentry);
+			dir = d_inode(parent);
+		}
+		if (!nfs_neg_need_reval(dir, dentry, flags))
+			ret = 1;
+		else if (flags & LOOKUP_RCU)
+			ret = -ECHILD;
+		if (!(flags & LOOKUP_RCU))
+			dput(parent);
+		else if (parent != ACCESS_ONCE(dentry->d_parent))
+			return -ECHILD;
+		goto out;
+	}
+
+	/* NFS only supports OPEN on regular files */
+	if (!S_ISREG(inode->i_mode))
+		goto no_open;
+	/* We cannot do exclusive creation on a positive dentry */
+	if (flags & LOOKUP_EXCL)
+		goto no_open;
+
+	/* Let f_op->open() actually open (and revalidate) the file */
+	ret = 1;
+
+out:
+	return ret;
+
+no_open:
+	return nfs_lookup_revalidate(dentry, flags);
+}
+
+#endif /* CONFIG_NFSV4 */
+
+/*
+ * Code common to create, mkdir, and mknod.
+ */
+int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr,
+				struct nfs4_label *label)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *dir = d_inode(parent);
+	struct inode *inode;
+	int error = -EACCES;
+
+	d_drop(dentry);
+
+	/* We may have been initialized further down */
+	if (d_really_is_positive(dentry))
+		goto out;
+	if (fhandle->size == 0) {
+		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
+		if (error)
+			goto out_error;
+	}
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+	if (!(fattr->valid & NFS_ATTR_FATTR)) {
+		struct nfs_server *server = NFS_SB(dentry->d_sb);
+		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
+		if (error < 0)
+			goto out_error;
+	}
+	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
+	error = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_error;
+	d_add(dentry, inode);
+out:
+	dput(parent);
+	return 0;
+out_error:
+	nfs_mark_for_revalidate(dir);
+	dput(parent);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_instantiate);
+
+/*
+ * Following a failed create operation, we drop the dentry rather
+ * than retain a negative dentry. This avoids a problem in the event
+ * that the operation succeeded on the server, but an error in the
+ * reply path made it appear to have failed.
+ */
+int nfs_create(struct inode *dir, struct dentry *dentry,
+		umode_t mode, bool excl)
+{
+	struct iattr attr;
+	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
+	int error;
+
+	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
+
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
+
+	trace_nfs_create_enter(dir, dentry, open_flags);
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
+	trace_nfs_create_exit(dir, dentry, open_flags, error);
+	if (error != 0)
+		goto out_err;
+	return 0;
+out_err:
+	d_drop(dentry);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_create);
+
+/*
+ * See comments for nfs_proc_create regarding failed operations.
+ */
+int
+nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	struct iattr attr;
+	int status;
+
+	dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
+
+	trace_nfs_mknod_enter(dir, dentry);
+	status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
+	trace_nfs_mknod_exit(dir, dentry, status);
+	if (status != 0)
+		goto out_err;
+	return 0;
+out_err:
+	d_drop(dentry);
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_mknod);
+
+/*
+ * See comments for nfs_proc_create regarding failed operations.
+ */
+int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct iattr attr;
+	int error;
+
+	dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
+
+	attr.ia_valid = ATTR_MODE;
+	attr.ia_mode = mode | S_IFDIR;
+
+	trace_nfs_mkdir_enter(dir, dentry);
+	error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+	trace_nfs_mkdir_exit(dir, dentry, error);
+	if (error != 0)
+		goto out_err;
+	return 0;
+out_err:
+	d_drop(dentry);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_mkdir);
+
+static void nfs_dentry_handle_enoent(struct dentry *dentry)
+{
+	if (d_really_is_positive(dentry) && !d_unhashed(dentry))
+		d_delete(dentry);
+}
+
+int nfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int error;
+
+	dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
+
+	trace_nfs_rmdir_enter(dir, dentry);
+	if (d_really_is_positive(dentry)) {
+		nfs_wait_on_sillyrename(dentry);
+		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+		/* Ensure the VFS deletes this inode */
+		switch (error) {
+		case 0:
+			clear_nlink(d_inode(dentry));
+			break;
+		case -ENOENT:
+			nfs_dentry_handle_enoent(dentry);
+		}
+	} else
+		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+	trace_nfs_rmdir_exit(dir, dentry, error);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_rmdir);
+
+/*
+ * Remove a file after making sure there are no pending writes,
+ * and after checking that the file has only one user. 
+ *
+ * We invalidate the attribute cache and free the inode prior to the operation
+ * to avoid possible races if the server reuses the inode.
+ */
+static int nfs_safe_remove(struct dentry *dentry)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct inode *inode = d_inode(dentry);
+	int error = -EBUSY;
+		
+	dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry);
+
+	/* If the dentry was sillyrenamed, we simply call d_delete() */
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		error = 0;
+		goto out;
+	}
+
+	trace_nfs_remove_enter(dir, dentry);
+	if (inode != NULL) {
+		NFS_PROTO(inode)->return_delegation(inode);
+		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+		if (error == 0)
+			nfs_drop_nlink(inode);
+	} else
+		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+	if (error == -ENOENT)
+		nfs_dentry_handle_enoent(dentry);
+	trace_nfs_remove_exit(dir, dentry, error);
+out:
+	return error;
+}
+
+/*  We do silly rename. In case sillyrename() returns -EBUSY, the inode
+ *  belongs to an active ".nfs..." file and we return -EBUSY.
+ *
+ *  If sillyrename() returns 0, we do nothing, otherwise we unlink.
+ */
+int nfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int error;
+	int need_rehash = 0;
+
+	dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
+		dir->i_ino, dentry);
+
+	trace_nfs_unlink_enter(dir, dentry);
+	spin_lock(&dentry->d_lock);
+	if (d_count(dentry) > 1) {
+		spin_unlock(&dentry->d_lock);
+		/* Start asynchronous writeout of the inode */
+		write_inode_now(d_inode(dentry), 0);
+		error = nfs_sillyrename(dir, dentry);
+		goto out;
+	}
+	if (!d_unhashed(dentry)) {
+		__d_drop(dentry);
+		need_rehash = 1;
+	}
+	spin_unlock(&dentry->d_lock);
+	error = nfs_safe_remove(dentry);
+	if (!error || error == -ENOENT) {
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+	} else if (need_rehash)
+		d_rehash(dentry);
+out:
+	trace_nfs_unlink_exit(dir, dentry, error);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_unlink);
+
+/*
+ * To create a symbolic link, most file systems instantiate a new inode,
+ * add a page to it containing the path, then write it out to the disk
+ * using prepare_write/commit_write.
+ *
+ * Unfortunately the NFS client can't create the in-core inode first
+ * because it needs a file handle to create an in-core inode (see
+ * fs/nfs/inode.c:nfs_fhget).  We only have a file handle *after* the
+ * symlink request has completed on the server.
+ *
+ * So instead we allocate a raw page, copy the symname into it, then do
+ * the SYMLINK request with the page as the buffer.  If it succeeds, we
+ * now have a new file handle and can instantiate an in-core NFS inode
+ * and move the raw page into its mapping.
+ */
+int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	struct page *page;
+	char *kaddr;
+	struct iattr attr;
+	unsigned int pathlen = strlen(symname);
+	int error;
+
+	dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id,
+		dir->i_ino, dentry, symname);
+
+	if (pathlen > PAGE_SIZE)
+		return -ENAMETOOLONG;
+
+	attr.ia_mode = S_IFLNK | S_IRWXUGO;
+	attr.ia_valid = ATTR_MODE;
+
+	page = alloc_page(GFP_HIGHUSER);
+	if (!page)
+		return -ENOMEM;
+
+	kaddr = kmap_atomic(page);
+	memcpy(kaddr, symname, pathlen);
+	if (pathlen < PAGE_SIZE)
+		memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
+	kunmap_atomic(kaddr);
+
+	trace_nfs_symlink_enter(dir, dentry);
+	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+	trace_nfs_symlink_exit(dir, dentry, error);
+	if (error != 0) {
+		dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
+			dir->i_sb->s_id, dir->i_ino,
+			dentry, symname, error);
+		d_drop(dentry);
+		__free_page(page);
+		return error;
+	}
+
+	/*
+	 * No big deal if we can't add this page to the page cache here.
+	 * READLINK will get the missing page from the server if needed.
+	 */
+	if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0,
+							GFP_KERNEL)) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		/*
+		 * add_to_page_cache_lru() grabs an extra page refcount.
+		 * Drop it here to avoid leaking this page later.
+		 */
+		page_cache_release(page);
+	} else
+		__free_page(page);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_symlink);
+
+int
+nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+	int error;
+
+	dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n",
+		old_dentry, dentry);
+
+	trace_nfs_link_enter(inode, dir, dentry);
+	NFS_PROTO(inode)->return_delegation(inode);
+
+	d_drop(dentry);
+	error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
+	if (error == 0) {
+		ihold(inode);
+		d_add(dentry, inode);
+	}
+	trace_nfs_link_exit(inode, dir, dentry, error);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_link);
+
+/*
+ * RENAME
+ * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
+ * different file handle for the same inode after a rename (e.g. when
+ * moving to a different directory). A fail-safe method to do so would
+ * be to look up old_dir/old_name, create a link to new_dir/new_name and
+ * rename the old file using the sillyrename stuff. This way, the original
+ * file in old_dir will go away when the last process iput()s the inode.
+ *
+ * FIXED.
+ * 
+ * It actually works quite well. One needs to have the possibility for
+ * at least one ".nfs..." file in each directory the file ever gets
+ * moved or linked to which happens automagically with the new
+ * implementation that only depends on the dcache stuff instead of
+ * using the inode layer
+ *
+ * Unfortunately, things are a little more complicated than indicated
+ * above. For a cross-directory move, we want to make sure we can get
+ * rid of the old inode after the operation.  This means there must be
+ * no pending writes (if it's a file), and the use count must be 1.
+ * If these conditions are met, we can drop the dentries before doing
+ * the rename.
+ */
+int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct dentry *dentry = NULL, *rehash = NULL;
+	struct rpc_task *task;
+	int error = -EBUSY;
+
+	dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
+		 old_dentry, new_dentry,
+		 d_count(new_dentry));
+
+	trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);
+	/*
+	 * For non-directories, check whether the target is busy and if so,
+	 * make a copy of the dentry and then do a silly-rename. If the
+	 * silly-rename succeeds, the copied dentry is hashed and becomes
+	 * the new target.
+	 */
+	if (new_inode && !S_ISDIR(new_inode->i_mode)) {
+		/*
+		 * To prevent any new references to the target during the
+		 * rename, we unhash the dentry in advance.
+		 */
+		if (!d_unhashed(new_dentry)) {
+			d_drop(new_dentry);
+			rehash = new_dentry;
+		}
+
+		if (d_count(new_dentry) > 2) {
+			int err;
+
+			/* copy the target dentry's name */
+			dentry = d_alloc(new_dentry->d_parent,
+					 &new_dentry->d_name);
+			if (!dentry)
+				goto out;
+
+			/* silly-rename the existing target ... */
+			err = nfs_sillyrename(new_dir, new_dentry);
+			if (err)
+				goto out;
+
+			new_dentry = dentry;
+			rehash = NULL;
+			new_inode = NULL;
+		}
+	}
+
+	NFS_PROTO(old_inode)->return_delegation(old_inode);
+	if (new_inode != NULL)
+		NFS_PROTO(new_inode)->return_delegation(new_inode);
+
+	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+	if (IS_ERR(task)) {
+		error = PTR_ERR(task);
+		goto out;
+	}
+
+	error = rpc_wait_for_completion_task(task);
+	if (error == 0)
+		error = task->tk_status;
+	rpc_put_task(task);
+	nfs_mark_for_revalidate(old_inode);
+out:
+	if (rehash)
+		d_rehash(rehash);
+	trace_nfs_rename_exit(old_dir, old_dentry,
+			new_dir, new_dentry, error);
+	if (!error) {
+		if (new_inode != NULL)
+			nfs_drop_nlink(new_inode);
+		d_move(old_dentry, new_dentry);
+		nfs_set_verifier(new_dentry,
+					nfs_save_change_attribute(new_dir));
+	} else if (error == -ENOENT)
+		nfs_dentry_handle_enoent(old_dentry);
+
+	/* new dentry created? */
+	if (dentry)
+		dput(dentry);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_rename);
+
+static DEFINE_SPINLOCK(nfs_access_lru_lock);
+static LIST_HEAD(nfs_access_lru_list);
+static atomic_long_t nfs_access_nr_entries;
+
+static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
+static void nfs_access_free_entry(struct nfs_access_entry *entry)
+{
+	put_rpccred(entry->cred);
+	kfree_rcu(entry, rcu_head);
+	smp_mb__before_atomic();
+	atomic_long_dec(&nfs_access_nr_entries);
+	smp_mb__after_atomic();
+}
+
+static void nfs_access_free_list(struct list_head *head)
+{
+	struct nfs_access_entry *cache;
+
+	while (!list_empty(head)) {
+		cache = list_entry(head->next, struct nfs_access_entry, lru);
+		list_del(&cache->lru);
+		nfs_access_free_entry(cache);
+	}
+}
+
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
+{
+	LIST_HEAD(head);
+	struct nfs_inode *nfsi, *next;
+	struct nfs_access_entry *cache;
+	long freed = 0;
+
+	spin_lock(&nfs_access_lru_lock);
+	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
+		struct inode *inode;
+
+		if (nr_to_scan-- == 0)
+			break;
+		inode = &nfsi->vfs_inode;
+		spin_lock(&inode->i_lock);
+		if (list_empty(&nfsi->access_cache_entry_lru))
+			goto remove_lru_entry;
+		cache = list_entry(nfsi->access_cache_entry_lru.next,
+				struct nfs_access_entry, lru);
+		list_move(&cache->lru, &head);
+		rb_erase(&cache->rb_node, &nfsi->access_cache);
+		freed++;
+		if (!list_empty(&nfsi->access_cache_entry_lru))
+			list_move_tail(&nfsi->access_cache_inode_lru,
+					&nfs_access_lru_list);
+		else {
+remove_lru_entry:
+			list_del_init(&nfsi->access_cache_inode_lru);
+			smp_mb__before_atomic();
+			clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
+			smp_mb__after_atomic();
+		}
+		spin_unlock(&inode->i_lock);
+	}
+	spin_unlock(&nfs_access_lru_lock);
+	nfs_access_free_list(&head);
+	return freed;
+}
+
+unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	int nr_to_scan = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
+
+	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
+	return nfs_do_access_cache_scan(nr_to_scan);
+}
+
+
+unsigned long
+nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
+}
+
+static void
+nfs_access_cache_enforce_limit(void)
+{
+	long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+	unsigned long diff;
+	unsigned int nr_to_scan;
+
+	if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+		return;
+	nr_to_scan = 100;
+	diff = nr_entries - nfs_access_max_cachesize;
+	if (diff < nr_to_scan)
+		nr_to_scan = diff;
+	nfs_do_access_cache_scan(nr_to_scan);
+}
+
+static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
+{
+	struct rb_root *root_node = &nfsi->access_cache;
+	struct rb_node *n;
+	struct nfs_access_entry *entry;
+
+	/* Unhook entries from the cache */
+	while ((n = rb_first(root_node)) != NULL) {
+		entry = rb_entry(n, struct nfs_access_entry, rb_node);
+		rb_erase(n, root_node);
+		list_move(&entry->lru, head);
+	}
+	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
+}
+
+void nfs_access_zap_cache(struct inode *inode)
+{
+	LIST_HEAD(head);
+
+	if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
+		return;
+	/* Remove from global LRU init */
+	spin_lock(&nfs_access_lru_lock);
+	if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+		list_del_init(&NFS_I(inode)->access_cache_inode_lru);
+
+	spin_lock(&inode->i_lock);
+	__nfs_access_zap_cache(NFS_I(inode), &head);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&nfs_access_lru_lock);
+	nfs_access_free_list(&head);
+}
+EXPORT_SYMBOL_GPL(nfs_access_zap_cache);
+
+static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
+{
+	struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
+	struct nfs_access_entry *entry;
+
+	while (n != NULL) {
+		entry = rb_entry(n, struct nfs_access_entry, rb_node);
+
+		if (cred < entry->cred)
+			n = n->rb_left;
+		else if (cred > entry->cred)
+			n = n->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_access_entry *cache;
+	int err = -ENOENT;
+
+	spin_lock(&inode->i_lock);
+	if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+		goto out_zap;
+	cache = nfs_access_search_rbtree(inode, cred);
+	if (cache == NULL)
+		goto out;
+	if (!nfs_have_delegated_attributes(inode) &&
+	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+		goto out_stale;
+	res->jiffies = cache->jiffies;
+	res->cred = cache->cred;
+	res->mask = cache->mask;
+	list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
+	err = 0;
+out:
+	spin_unlock(&inode->i_lock);
+	return err;
+out_stale:
+	rb_erase(&cache->rb_node, &nfsi->access_cache);
+	list_del(&cache->lru);
+	spin_unlock(&inode->i_lock);
+	nfs_access_free_entry(cache);
+	return -ENOENT;
+out_zap:
+	spin_unlock(&inode->i_lock);
+	nfs_access_zap_cache(inode);
+	return -ENOENT;
+}
+
+static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+	/* Only check the most recently returned cache entry,
+	 * but do it without locking.
+	 */
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_access_entry *cache;
+	int err = -ECHILD;
+	struct list_head *lh;
+
+	rcu_read_lock();
+	if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+		goto out;
+	lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+	cache = list_entry(lh, struct nfs_access_entry, lru);
+	if (lh == &nfsi->access_cache_entry_lru ||
+	    cred != cache->cred)
+		cache = NULL;
+	if (cache == NULL)
+		goto out;
+	if (!nfs_have_delegated_attributes(inode) &&
+	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+		goto out;
+	res->jiffies = cache->jiffies;
+	res->cred = cache->cred;
+	res->mask = cache->mask;
+	err = 0;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct rb_root *root_node = &nfsi->access_cache;
+	struct rb_node **p = &root_node->rb_node;
+	struct rb_node *parent = NULL;
+	struct nfs_access_entry *entry;
+
+	spin_lock(&inode->i_lock);
+	while (*p != NULL) {
+		parent = *p;
+		entry = rb_entry(parent, struct nfs_access_entry, rb_node);
+
+		if (set->cred < entry->cred)
+			p = &parent->rb_left;
+		else if (set->cred > entry->cred)
+			p = &parent->rb_right;
+		else
+			goto found;
+	}
+	rb_link_node(&set->rb_node, parent, p);
+	rb_insert_color(&set->rb_node, root_node);
+	list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+	spin_unlock(&inode->i_lock);
+	return;
+found:
+	rb_replace_node(parent, &set->rb_node, root_node);
+	list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+	list_del(&entry->lru);
+	spin_unlock(&inode->i_lock);
+	nfs_access_free_entry(entry);
+}
+
+void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+{
+	struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
+	if (cache == NULL)
+		return;
+	RB_CLEAR_NODE(&cache->rb_node);
+	cache->jiffies = set->jiffies;
+	cache->cred = get_rpccred(set->cred);
+	cache->mask = set->mask;
+
+	/* The above field assignments must be visible
+	 * before this item appears on the lru.  We cannot easily
+	 * use rcu_assign_pointer, so just force the memory barrier.
+	 */
+	smp_wmb();
+	nfs_access_add_rbtree(inode, cache);
+
+	/* Update accounting */
+	smp_mb__before_atomic();
+	atomic_long_inc(&nfs_access_nr_entries);
+	smp_mb__after_atomic();
+
+	/* Add inode to global LRU list */
+	if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
+		spin_lock(&nfs_access_lru_lock);
+		if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+			list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
+					&nfs_access_lru_list);
+		spin_unlock(&nfs_access_lru_lock);
+	}
+	nfs_access_cache_enforce_limit();
+}
+EXPORT_SYMBOL_GPL(nfs_access_add_cache);
+
+void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
+{
+	entry->mask = 0;
+	if (access_result & NFS4_ACCESS_READ)
+		entry->mask |= MAY_READ;
+	if (access_result &
+	    (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
+		entry->mask |= MAY_WRITE;
+	if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
+		entry->mask |= MAY_EXEC;
+}
+EXPORT_SYMBOL_GPL(nfs_access_set_mask);
+
+static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
+{
+	struct nfs_access_entry cache;
+	int status;
+
+	trace_nfs_access_enter(inode);
+
+	status = nfs_access_get_cached_rcu(inode, cred, &cache);
+	if (status != 0)
+		status = nfs_access_get_cached(inode, cred, &cache);
+	if (status == 0)
+		goto out_cached;
+
+	status = -ECHILD;
+	if (mask & MAY_NOT_BLOCK)
+		goto out;
+
+	/* Be clever: ask server to check for all possible rights */
+	cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
+	cache.cred = cred;
+	cache.jiffies = jiffies;
+	status = NFS_PROTO(inode)->access(inode, &cache);
+	if (status != 0) {
+		if (status == -ESTALE) {
+			nfs_zap_caches(inode);
+			if (!S_ISDIR(inode->i_mode))
+				set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+		}
+		goto out;
+	}
+	nfs_access_add_cache(inode, &cache);
+out_cached:
+	if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0)
+		status = -EACCES;
+out:
+	trace_nfs_access_exit(inode, status);
+	return status;
+}
+
+static int nfs_open_permission_mask(int openflags)
+{
+	int mask = 0;
+
+	if (openflags & __FMODE_EXEC) {
+		/* ONLY check exec rights */
+		mask = MAY_EXEC;
+	} else {
+		if ((openflags & O_ACCMODE) != O_WRONLY)
+			mask |= MAY_READ;
+		if ((openflags & O_ACCMODE) != O_RDONLY)
+			mask |= MAY_WRITE;
+	}
+
+	return mask;
+}
+
+int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
+{
+	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
+}
+EXPORT_SYMBOL_GPL(nfs_may_open);
+
+int nfs_permission(struct inode *inode, int mask)
+{
+	struct rpc_cred *cred;
+	int res = 0;
+
+	nfs_inc_stats(inode, NFSIOS_VFSACCESS);
+
+	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+		goto out;
+	/* Is this sys_access() ? */
+	if (mask & (MAY_ACCESS | MAY_CHDIR))
+		goto force_lookup;
+
+	switch (inode->i_mode & S_IFMT) {
+		case S_IFLNK:
+			goto out;
+		case S_IFREG:
+			break;
+		case S_IFDIR:
+			/*
+			 * Optimize away all write operations, since the server
+			 * will check permissions when we perform the op.
+			 */
+			if ((mask & MAY_WRITE) && !(mask & MAY_READ))
+				goto out;
+	}
+
+force_lookup:
+	if (!NFS_PROTO(inode)->access)
+		goto out_notsup;
+
+	/* Always try fast lookups first */
+	rcu_read_lock();
+	cred = rpc_lookup_cred_nonblock();
+	if (!IS_ERR(cred))
+		res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
+	else
+		res = PTR_ERR(cred);
+	rcu_read_unlock();
+	if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
+		/* Fast lookup failed, try the slow way */
+		cred = rpc_lookup_cred();
+		if (!IS_ERR(cred)) {
+			res = nfs_do_access(inode, cred, mask);
+			put_rpccred(cred);
+		} else
+			res = PTR_ERR(cred);
+	}
+out:
+	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
+		res = -EACCES;
+
+	dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
+		inode->i_sb->s_id, inode->i_ino, mask, res);
+	return res;
+out_notsup:
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	if (res == 0)
+		res = generic_permission(inode, mask);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_permission);
+
+/*
+ * Local variables:
+ *  version-control: t
+ *  kept-new-versions: 5
+ * End:
+ */
diff --git a/kernel/fs/nfs/direct.c b/kernel/fs/nfs/direct.c
new file mode 100644
index 000000000..38678d9a5
--- /dev/null
+++ b/kernel/fs/nfs/direct.c
@@ -0,0 +1,1066 @@
+/*
+ * linux/fs/nfs/direct.c
+ *
+ * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
+ *
+ * High-performance uncached I/O for the Linux NFS client
+ *
+ * There are important applications whose performance or correctness
+ * depends on uncached access to file data.  Database clusters
+ * (multiple copies of the same instance running on separate hosts)
+ * implement their own cache coherency protocol that subsumes file
+ * system cache protocols.  Applications that process datasets
+ * considerably larger than the client's memory do not always benefit
+ * from a local cache.  A streaming video server, for instance, has no
+ * need to cache the contents of a file.
+ *
+ * When an application requests uncached I/O, all read and write requests
+ * are made directly to the server; data stored or fetched via these
+ * requests is not cached in the Linux page cache.  The client does not
+ * correct unaligned requests from applications.  All requested bytes are
+ * held on permanent storage before a direct write system call returns to
+ * an application.
+ *
+ * Solaris implements an uncached I/O facility called directio() that
+ * is used for backups and sequential I/O to very large files.  Solaris
+ * also supports uncaching whole NFS partitions with "-o forcedirectio,"
+ * an undocumented mount option.
+ *
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
+ * help from Andrew Morton.
+ *
+ * 18 Dec 2001	Initial implementation for 2.4  --cel
+ * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
+ * 08 Jun 2003	Port to 2.5 APIs  --cel
+ * 31 Mar 2004	Handle direct I/O without VFS support  --cel
+ * 15 Sep 2004	Parallel async reads  --cel
+ * 04 May 2005	support O_DIRECT with aio  --cel
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/module.h>
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <asm/uaccess.h>
+#include <linux/atomic.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+static struct kmem_cache *nfs_direct_cachep;
+
+/*
+ * This represents a set of asynchronous requests that we're waiting on
+ */
+struct nfs_direct_mirror {
+	ssize_t count;
+};
+
+struct nfs_direct_req {
+	struct kref		kref;		/* release manager */
+
+	/* I/O parameters */
+	struct nfs_open_context	*ctx;		/* file open context info */
+	struct nfs_lock_context *l_ctx;		/* Lock context info */
+	struct kiocb *		iocb;		/* controlling i/o request */
+	struct inode *		inode;		/* target file of i/o */
+
+	/* completion state */
+	atomic_t		io_count;	/* i/os we're waiting for */
+	spinlock_t		lock;		/* protect completion state */
+
+	struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
+	int			mirror_count;
+
+	ssize_t			count,		/* bytes actually processed */
+				bytes_left,	/* bytes left to be sent */
+				io_start,	/* start of IO */
+				error;		/* any reported error */
+	struct completion	completion;	/* wait for i/o completion */
+
+	/* commit state */
+	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */
+	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
+	struct work_struct	work;
+	int			flags;
+#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
+#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
+	struct nfs_writeverf	verf;		/* unstable write verifier */
+};
+
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
+static void nfs_direct_write_schedule_work(struct work_struct *work);
+
+static inline void get_dreq(struct nfs_direct_req *dreq)
+{
+	atomic_inc(&dreq->io_count);
+}
+
+static inline int put_dreq(struct nfs_direct_req *dreq)
+{
+	return atomic_dec_and_test(&dreq->io_count);
+}
+
+void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
+{
+	dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+}
+EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
+
+static void
+nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
+{
+	int i;
+	ssize_t count;
+
+	if (dreq->mirror_count == 1) {
+		dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
+		dreq->count += hdr->good_bytes;
+	} else {
+		/* mirrored writes */
+		count = dreq->mirrors[hdr->pgio_mirror_idx].count;
+		if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
+			count = hdr->io_start + hdr->good_bytes - dreq->io_start;
+			dreq->mirrors[hdr->pgio_mirror_idx].count = count;
+		}
+		/* update the dreq->count by finding the minimum agreed count from all
+		 * mirrors */
+		count = dreq->mirrors[0].count;
+
+		for (i = 1; i < dreq->mirror_count; i++)
+			count = min(count, dreq->mirrors[i].count);
+
+		dreq->count = count;
+	}
+}
+
+/*
+ * nfs_direct_select_verf - select the right verifier
+ * @dreq - direct request possibly spanning multiple servers
+ * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
+ * @commit_idx - commit bucket index for the DS
+ *
+ * returns the correct verifier to use given the role of the server
+ */
+static struct nfs_writeverf *
+nfs_direct_select_verf(struct nfs_direct_req *dreq,
+		       struct nfs_client *ds_clp,
+		       int commit_idx)
+{
+	struct nfs_writeverf *verfp = &dreq->verf;
+
+#ifdef CONFIG_NFS_V4_1
+	if (ds_clp) {
+		/* pNFS is in use, use the DS verf */
+		if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
+			verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
+		else
+			WARN_ON_ONCE(1);
+	}
+#endif
+	return verfp;
+}
+
+
+/*
+ * nfs_direct_set_hdr_verf - set the write/commit verifier
+ * @dreq - direct request possibly spanning multiple servers
+ * @hdr - pageio header to validate against previously seen verfs
+ *
+ * Set the server's (MDS or DS) "seen" verifier
+ */
+static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
+				    struct nfs_pgio_header *hdr)
+{
+	struct nfs_writeverf *verfp;
+
+	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
+	WARN_ON_ONCE(verfp->committed >= 0);
+	memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+	WARN_ON_ONCE(verfp->committed < 0);
+}
+
+/*
+ * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
+ * @dreq - direct request possibly spanning multiple servers
+ * @hdr - pageio header to validate against previously seen verf
+ *
+ * set the server's "seen" verf if not initialized.
+ * returns result of comparison between @hdr->verf and the "seen"
+ * verf of the server used by @hdr (DS or MDS)
+ */
+static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
+					  struct nfs_pgio_header *hdr)
+{
+	struct nfs_writeverf *verfp;
+
+	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
+	if (verfp->committed < 0) {
+		nfs_direct_set_hdr_verf(dreq, hdr);
+		return 0;
+	}
+	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+}
+
+/*
+ * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
+ * @dreq - direct request possibly spanning multiple servers
+ * @data - commit data to validate against previously seen verf
+ *
+ * returns result of comparison between @data->verf and the verf of
+ * the server used by @data (DS or MDS)
+ */
+static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
+					   struct nfs_commit_data *data)
+{
+	struct nfs_writeverf *verfp;
+
+	verfp = nfs_direct_select_verf(dreq, data->ds_clp,
+					 data->ds_commit_index);
+
+	/* verifier not set so always fail */
+	if (verfp->committed < 0)
+		return 1;
+
+	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
+}
+
+/**
+ * nfs_direct_IO - NFS address space operation for direct I/O
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * The presence of this routine in the address space ops vector means
+ * the NFS client supports direct I/O. However, for most direct IO, we
+ * shunt off direct read and write requests before the VFS gets them,
+ * so this method is only ever called for swap.
+ */
+ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+	/* we only support swap file calling nfs_direct_IO */
+	if (!IS_SWAPFILE(inode))
+		return 0;
+
+	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
+
+	if (iov_iter_rw(iter) == READ)
+		return nfs_file_direct_read(iocb, iter, pos);
+	return nfs_file_direct_write(iocb, iter);
+}
+
+static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
+{
+	unsigned int i;
+	for (i = 0; i < npages; i++)
+		page_cache_release(pages[i]);
+}
+
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+			      struct nfs_direct_req *dreq)
+{
+	cinfo->lock = &dreq->inode->i_lock;
+	cinfo->mds = &dreq->mds_cinfo;
+	cinfo->ds = &dreq->ds_cinfo;
+	cinfo->dreq = dreq;
+	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
+}
+
+static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
+					     struct nfs_pageio_descriptor *pgio,
+					     struct nfs_page *req)
+{
+	int mirror_count = 1;
+
+	if (pgio->pg_ops->pg_get_mirror_count)
+		mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+
+	dreq->mirror_count = mirror_count;
+}
+
+static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
+{
+	struct nfs_direct_req *dreq;
+
+	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
+	if (!dreq)
+		return NULL;
+
+	kref_init(&dreq->kref);
+	kref_get(&dreq->kref);
+	init_completion(&dreq->completion);
+	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
+	dreq->verf.committed = NFS_INVALID_STABLE_HOW;	/* not set yet */
+	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
+	dreq->mirror_count = 1;
+	spin_lock_init(&dreq->lock);
+
+	return dreq;
+}
+
+static void nfs_direct_req_free(struct kref *kref)
+{
+	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+
+	nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
+	if (dreq->l_ctx != NULL)
+		nfs_put_lock_context(dreq->l_ctx);
+	if (dreq->ctx != NULL)
+		put_nfs_open_context(dreq->ctx);
+	kmem_cache_free(nfs_direct_cachep, dreq);
+}
+
+static void nfs_direct_req_release(struct nfs_direct_req *dreq)
+{
+	kref_put(&dreq->kref, nfs_direct_req_free);
+}
+
+ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
+{
+	return dreq->bytes_left;
+}
+EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
+
+/*
+ * Collects and returns the final error value/byte-count.
+ */
+static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
+{
+	ssize_t result = -EIOCBQUEUED;
+
+	/* Async requests don't wait here */
+	if (dreq->iocb)
+		goto out;
+
+	result = wait_for_completion_killable(&dreq->completion);
+
+	if (!result)
+		result = dreq->error;
+	if (!result)
+		result = dreq->count;
+
+out:
+	return (ssize_t) result;
+}
+
+/*
+ * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
+ * the iocb is still valid here if this is a synchronous request.
+ */
+static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
+{
+	struct inode *inode = dreq->inode;
+
+	if (dreq->iocb && write) {
+		loff_t pos = dreq->iocb->ki_pos + dreq->count;
+
+		spin_lock(&inode->i_lock);
+		if (i_size_read(inode) < pos)
+			i_size_write(inode, pos);
+		spin_unlock(&inode->i_lock);
+	}
+
+	if (write)
+		nfs_zap_mapping(inode, inode->i_mapping);
+
+	inode_dio_end(inode);
+
+	if (dreq->iocb) {
+		long res = (long) dreq->error;
+		if (!res)
+			res = (long) dreq->count;
+		dreq->iocb->ki_complete(dreq->iocb, res, 0);
+	}
+
+	complete_all(&dreq->completion);
+
+	nfs_direct_req_release(dreq);
+}
+
+static void nfs_direct_readpage_release(struct nfs_page *req)
+{
+	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
+		d_inode(req->wb_context->dentry)->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
+		req->wb_bytes,
+		(long long)req_offset(req));
+	nfs_release_request(req);
+}
+
+static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
+{
+	unsigned long bytes = 0;
+	struct nfs_direct_req *dreq = hdr->dreq;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out_put;
+
+	spin_lock(&dreq->lock);
+	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
+		dreq->error = hdr->error;
+	else
+		nfs_direct_good_bytes(dreq, hdr);
+
+	spin_unlock(&dreq->lock);
+
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+		struct page *page = req->wb_page;
+
+		if (!PageCompound(page) && bytes < hdr->good_bytes)
+			set_page_dirty(page);
+		bytes += req->wb_bytes;
+		nfs_list_remove_request(req);
+		nfs_direct_readpage_release(req);
+	}
+out_put:
+	if (put_dreq(dreq))
+		nfs_direct_complete(dreq, false);
+	hdr->release(hdr);
+}
+
+static void nfs_read_sync_pgio_error(struct list_head *head)
+{
+	struct nfs_page *req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_release_request(req);
+	}
+}
+
+static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
+{
+	get_dreq(hdr->dreq);
+}
+
+static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
+	.error_cleanup = nfs_read_sync_pgio_error,
+	.init_hdr = nfs_direct_pgio_init,
+	.completion = nfs_direct_read_completion,
+};
+
+/*
+ * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
+ * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
+ * bail and stop sending more reads.  Read length accounting is
+ * handled automatically by nfs_direct_read_result().  Otherwise, if
+ * no requests have been sent, just return an error.
+ */
+
+static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+					      struct iov_iter *iter,
+					      loff_t pos)
+{
+	struct nfs_pageio_descriptor desc;
+	struct inode *inode = dreq->inode;
+	ssize_t result = -EINVAL;
+	size_t requested_bytes = 0;
+	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
+
+	nfs_pageio_init_read(&desc, dreq->inode, false,
+			     &nfs_direct_read_completion_ops);
+	get_dreq(dreq);
+	desc.pg_dreq = dreq;
+	inode_dio_begin(inode);
+
+	while (iov_iter_count(iter)) {
+		struct page **pagevec;
+		size_t bytes;
+		size_t pgbase;
+		unsigned npages, i;
+
+		result = iov_iter_get_pages_alloc(iter, &pagevec, 
+						  rsize, &pgbase);
+		if (result < 0)
+			break;
+	
+		bytes = result;
+		iov_iter_advance(iter, bytes);
+		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+		for (i = 0; i < npages; i++) {
+			struct nfs_page *req;
+			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+			/* XXX do we need to do the eof zeroing found in async_filler? */
+			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
+						 pgbase, req_len);
+			if (IS_ERR(req)) {
+				result = PTR_ERR(req);
+				break;
+			}
+			req->wb_index = pos >> PAGE_SHIFT;
+			req->wb_offset = pos & ~PAGE_MASK;
+			if (!nfs_pageio_add_request(&desc, req)) {
+				result = desc.pg_error;
+				nfs_release_request(req);
+				break;
+			}
+			pgbase = 0;
+			bytes -= req_len;
+			requested_bytes += req_len;
+			pos += req_len;
+			dreq->bytes_left -= req_len;
+		}
+		nfs_direct_release_pages(pagevec, npages);
+		kvfree(pagevec);
+		if (result < 0)
+			break;
+	}
+
+	nfs_pageio_complete(&desc);
+
+	/*
+	 * If no bytes were started, return the error, and let the
+	 * generic layer handle the completion.
+	 */
+	if (requested_bytes == 0) {
+		inode_dio_end(inode);
+		nfs_direct_req_release(dreq);
+		return result < 0 ? result : -EIO;
+	}
+
+	if (put_dreq(dreq))
+		nfs_direct_complete(dreq, false);
+	return 0;
+}
+
+/**
+ * nfs_file_direct_read - file direct read operation for NFS files
+ * @iocb: target I/O control block
+ * @iter: vector of user buffers into which to read data
+ * @pos: byte offset in file where reading starts
+ *
+ * We use this function for direct reads instead of calling
+ * generic_file_aio_read() in order to avoid gfar's check to see if
+ * the request starts before the end of the file.  For that check
+ * to work, we must generate a GETATTR before each direct read, and
+ * even then there is a window between the GETATTR and the subsequent
+ * READ where the file size could change.  Our preference is simply
+ * to do all reads the application wants, and the server will take
+ * care of managing the end of file boundary.
+ *
+ * This function also eliminates unnecessarily updating the file's
+ * atime locally, as the NFS server sets the file's atime, and this
+ * client must read the updated atime from the server back into its
+ * cache.
+ */
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+				loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nfs_direct_req *dreq;
+	struct nfs_lock_context *l_ctx;
+	ssize_t result = -EINVAL;
+	size_t count = iov_iter_count(iter);
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
+		file, count, (long long) pos);
+
+	result = 0;
+	if (!count)
+		goto out;
+
+	mutex_lock(&inode->i_mutex);
+	result = nfs_sync_mapping(mapping);
+	if (result)
+		goto out_unlock;
+
+	task_io_account_read(count);
+
+	result = -ENOMEM;
+	dreq = nfs_direct_req_alloc();
+	if (dreq == NULL)
+		goto out_unlock;
+
+	dreq->inode = inode;
+	dreq->bytes_left = count;
+	dreq->io_start = pos;
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (IS_ERR(l_ctx)) {
+		result = PTR_ERR(l_ctx);
+		goto out_release;
+	}
+	dreq->l_ctx = l_ctx;
+	if (!is_sync_kiocb(iocb))
+		dreq->iocb = iocb;
+
+	NFS_I(inode)->read_io += count;
+	result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (!result) {
+		result = nfs_direct_wait(dreq);
+		if (result > 0)
+			iocb->ki_pos = pos + result;
+	}
+
+	nfs_direct_req_release(dreq);
+	return result;
+
+out_release:
+	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+out:
+	return result;
+}
+
+static void
+nfs_direct_write_scan_commit_list(struct inode *inode,
+				  struct list_head *list,
+				  struct nfs_commit_info *cinfo)
+{
+	spin_lock(cinfo->lock);
+#ifdef CONFIG_NFS_V4_1
+	if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
+		NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
+#endif
+	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
+	spin_unlock(cinfo->lock);
+}
+
+static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+{
+	struct nfs_pageio_descriptor desc;
+	struct nfs_page *req, *tmp;
+	LIST_HEAD(reqs);
+	struct nfs_commit_info cinfo;
+	LIST_HEAD(failed);
+	int i;
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+
+	dreq->count = 0;
+	for (i = 0; i < dreq->mirror_count; i++)
+		dreq->mirrors[i].count = 0;
+	get_dreq(dreq);
+
+	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
+			      &nfs_direct_write_completion_ops);
+	desc.pg_dreq = dreq;
+
+	req = nfs_list_entry(reqs.next);
+	nfs_direct_setup_mirroring(dreq, &desc, req);
+
+	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+		if (!nfs_pageio_add_request(&desc, req)) {
+			nfs_list_remove_request(req);
+			nfs_list_add_request(req, &failed);
+			spin_lock(cinfo.lock);
+			dreq->flags = 0;
+			dreq->error = -EIO;
+			spin_unlock(cinfo.lock);
+		}
+		nfs_release_request(req);
+	}
+	nfs_pageio_complete(&desc);
+
+	while (!list_empty(&failed)) {
+		req = nfs_list_entry(failed.next);
+		nfs_list_remove_request(req);
+		nfs_unlock_and_release_request(req);
+	}
+
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, dreq->inode);
+}
+
+static void nfs_direct_commit_complete(struct nfs_commit_data *data)
+{
+	struct nfs_direct_req *dreq = data->dreq;
+	struct nfs_commit_info cinfo;
+	struct nfs_page *req;
+	int status = data->task.tk_status;
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	if (status < 0) {
+		dprintk("NFS: %5u commit failed with error %d.\n",
+			data->task.tk_pid, status);
+		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+	} else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
+		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
+		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+	}
+
+	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
+	while (!list_empty(&data->pages)) {
+		req = nfs_list_entry(data->pages.next);
+		nfs_list_remove_request(req);
+		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
+			/* Note the rewrite will go through mds */
+			nfs_mark_request_commit(req, NULL, &cinfo, 0);
+		} else
+			nfs_release_request(req);
+		nfs_unlock_and_release_request(req);
+	}
+
+	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
+		nfs_direct_write_complete(dreq, data->inode);
+}
+
+static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+{
+	/* There is no lock to clear */
+}
+
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
+	.completion = nfs_direct_commit_complete,
+	.error_cleanup = nfs_direct_error_cleanup,
+};
+
+static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+{
+	int res;
+	struct nfs_commit_info cinfo;
+	LIST_HEAD(mds_list);
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
+	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
+	if (res < 0) /* res == -ENOMEM */
+		nfs_direct_write_reschedule(dreq);
+}
+
+static void nfs_direct_write_schedule_work(struct work_struct *work)
+{
+	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
+	int flags = dreq->flags;
+
+	dreq->flags = 0;
+	switch (flags) {
+		case NFS_ODIRECT_DO_COMMIT:
+			nfs_direct_commit_schedule(dreq);
+			break;
+		case NFS_ODIRECT_RESCHED_WRITES:
+			nfs_direct_write_reschedule(dreq);
+			break;
+		default:
+			nfs_direct_complete(dreq, true);
+	}
+}
+
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+{
+	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
+}
+
+static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
+{
+	struct nfs_direct_req *dreq = hdr->dreq;
+	struct nfs_commit_info cinfo;
+	bool request_commit = false;
+	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out_put;
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+
+	spin_lock(&dreq->lock);
+
+	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+		dreq->flags = 0;
+		dreq->error = hdr->error;
+	}
+	if (dreq->error == 0) {
+		nfs_direct_good_bytes(dreq, hdr);
+		if (nfs_write_need_commit(hdr)) {
+			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
+				request_commit = true;
+			else if (dreq->flags == 0) {
+				nfs_direct_set_hdr_verf(dreq, hdr);
+				request_commit = true;
+				dreq->flags = NFS_ODIRECT_DO_COMMIT;
+			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
+				request_commit = true;
+				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
+					dreq->flags =
+						NFS_ODIRECT_RESCHED_WRITES;
+			}
+		}
+	}
+	spin_unlock(&dreq->lock);
+
+	while (!list_empty(&hdr->pages)) {
+
+		req = nfs_list_entry(hdr->pages.next);
+		nfs_list_remove_request(req);
+		if (request_commit) {
+			kref_get(&req->wb_kref);
+			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+				hdr->ds_commit_idx);
+		}
+		nfs_unlock_and_release_request(req);
+	}
+
+out_put:
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, hdr->inode);
+	hdr->release(hdr);
+}
+
+static void nfs_write_sync_pgio_error(struct list_head *head)
+{
+	struct nfs_page *req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_unlock_and_release_request(req);
+	}
+}
+
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
+	.error_cleanup = nfs_write_sync_pgio_error,
+	.init_hdr = nfs_direct_pgio_init,
+	.completion = nfs_direct_write_completion,
+};
+
+
+/*
+ * NB: Return the value of the first error return code.  Subsequent
+ *     errors after the first one are ignored.
+ */
+/*
+ * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+ * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
+ * bail and stop sending more writes.  Write length accounting is
+ * handled automatically by nfs_direct_write_result().  Otherwise, if
+ * no requests have been sent, just return an error.
+ */
+static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
+					       struct iov_iter *iter,
+					       loff_t pos)
+{
+	struct nfs_pageio_descriptor desc;
+	struct inode *inode = dreq->inode;
+	ssize_t result = 0;
+	size_t requested_bytes = 0;
+	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+
+	nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
+			      &nfs_direct_write_completion_ops);
+	desc.pg_dreq = dreq;
+	get_dreq(dreq);
+	inode_dio_begin(inode);
+
+	NFS_I(inode)->write_io += iov_iter_count(iter);
+	while (iov_iter_count(iter)) {
+		struct page **pagevec;
+		size_t bytes;
+		size_t pgbase;
+		unsigned npages, i;
+
+		result = iov_iter_get_pages_alloc(iter, &pagevec, 
+						  wsize, &pgbase);
+		if (result < 0)
+			break;
+
+		bytes = result;
+		iov_iter_advance(iter, bytes);
+		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+		for (i = 0; i < npages; i++) {
+			struct nfs_page *req;
+			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+
+			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
+						 pgbase, req_len);
+			if (IS_ERR(req)) {
+				result = PTR_ERR(req);
+				break;
+			}
+
+			nfs_direct_setup_mirroring(dreq, &desc, req);
+
+			nfs_lock_request(req);
+			req->wb_index = pos >> PAGE_SHIFT;
+			req->wb_offset = pos & ~PAGE_MASK;
+			if (!nfs_pageio_add_request(&desc, req)) {
+				result = desc.pg_error;
+				nfs_unlock_and_release_request(req);
+				break;
+			}
+			pgbase = 0;
+			bytes -= req_len;
+			requested_bytes += req_len;
+			pos += req_len;
+			dreq->bytes_left -= req_len;
+		}
+		nfs_direct_release_pages(pagevec, npages);
+		kvfree(pagevec);
+		if (result < 0)
+			break;
+	}
+	nfs_pageio_complete(&desc);
+
+	/*
+	 * If no bytes were started, return the error, and let the
+	 * generic layer handle the completion.
+	 */
+	if (requested_bytes == 0) {
+		inode_dio_end(inode);
+		nfs_direct_req_release(dreq);
+		return result < 0 ? result : -EIO;
+	}
+
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, dreq->inode);
+	return 0;
+}
+
+/**
+ * nfs_file_direct_write - file direct write operation for NFS files
+ * @iocb: target I/O control block
+ * @iter: vector of user buffers from which to write data
+ * @pos: byte offset in file where writing starts
+ *
+ * We use this function for direct writes instead of calling
+ * generic_file_aio_write() in order to avoid taking the inode
+ * semaphore and updating the i_size.  The NFS server will set
+ * the new i_size and this client must read the updated size
+ * back into its cache.  We let the server do generic write
+ * parameter checking and report problems.
+ *
+ * We eliminate local atime updates, see direct read above.
+ *
+ * We avoid unnecessary page cache invalidations for normal cached
+ * readers of this file.
+ *
+ * Note that O_APPEND is not supported for NFS direct writes, as there
+ * is no atomic O_APPEND write facility in the NFS protocol.
+ */
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+	ssize_t result = -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nfs_direct_req *dreq;
+	struct nfs_lock_context *l_ctx;
+	loff_t pos, end;
+
+	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
+		file, iov_iter_count(iter), (long long) iocb->ki_pos);
+
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES,
+		      iov_iter_count(iter));
+
+	pos = iocb->ki_pos;
+	end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT;
+
+	mutex_lock(&inode->i_mutex);
+
+	result = nfs_sync_mapping(mapping);
+	if (result)
+		goto out_unlock;
+
+	if (mapping->nrpages) {
+		result = invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_CACHE_SHIFT, end);
+		if (result)
+			goto out_unlock;
+	}
+
+	task_io_account_write(iov_iter_count(iter));
+
+	result = -ENOMEM;
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		goto out_unlock;
+
+	dreq->inode = inode;
+	dreq->bytes_left = iov_iter_count(iter);
+	dreq->io_start = pos;
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (IS_ERR(l_ctx)) {
+		result = PTR_ERR(l_ctx);
+		goto out_release;
+	}
+	dreq->l_ctx = l_ctx;
+	if (!is_sync_kiocb(iocb))
+		dreq->iocb = iocb;
+
+	result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT, end);
+	}
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (!result) {
+		result = nfs_direct_wait(dreq);
+		if (result > 0) {
+			struct inode *inode = mapping->host;
+
+			iocb->ki_pos = pos + result;
+			spin_lock(&inode->i_lock);
+			if (i_size_read(inode) < iocb->ki_pos)
+				i_size_write(inode, iocb->ki_pos);
+			spin_unlock(&inode->i_lock);
+			generic_write_sync(file, pos, result);
+		}
+	}
+	nfs_direct_req_release(dreq);
+	return result;
+
+out_release:
+	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	return result;
+}
+
+/**
+ * nfs_init_directcache - create a slab cache for nfs_direct_req structures
+ *
+ */
+int __init nfs_init_directcache(void)
+{
+	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
+						sizeof(struct nfs_direct_req),
+						0, (SLAB_RECLAIM_ACCOUNT|
+							SLAB_MEM_SPREAD),
+						NULL);
+	if (nfs_direct_cachep == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
+ *
+ */
+void nfs_destroy_directcache(void)
+{
+	kmem_cache_destroy(nfs_direct_cachep);
+}
diff --git a/kernel/fs/nfs/dns_resolve.c b/kernel/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000..d25f10fb4
--- /dev/null
+++ b/kernel/fs/nfs/dns_resolve.c
@@ -0,0 +1,470 @@
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+
+#include <linux/module.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/dns_resolver.h>
+#include "dns_resolve.h"
+
+ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
+		struct sockaddr *sa, size_t salen)
+{
+	ssize_t ret;
+	char *ip_addr = NULL;
+	int ip_len;
+
+	ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
+	if (ip_len > 0)
+		ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
+	else
+		ret = -ESRCH;
+	kfree(ip_addr);
+	return ret;
+}
+
+#else
+
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "dns_resolve.h"
+#include "cache_lib.h"
+#include "netns.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+struct nfs_dns_ent {
+	struct cache_head h;
+
+	char *hostname;
+	size_t namelen;
+
+	struct sockaddr_storage addr;
+	size_t addrlen;
+};
+
+
+static void nfs_dns_ent_update(struct cache_head *cnew,
+		struct cache_head *ckey)
+{
+	struct nfs_dns_ent *new;
+	struct nfs_dns_ent *key;
+
+	new = container_of(cnew, struct nfs_dns_ent, h);
+	key = container_of(ckey, struct nfs_dns_ent, h);
+
+	memcpy(&new->addr, &key->addr, key->addrlen);
+	new->addrlen = key->addrlen;
+}
+
+static void nfs_dns_ent_init(struct cache_head *cnew,
+		struct cache_head *ckey)
+{
+	struct nfs_dns_ent *new;
+	struct nfs_dns_ent *key;
+
+	new = container_of(cnew, struct nfs_dns_ent, h);
+	key = container_of(ckey, struct nfs_dns_ent, h);
+
+	kfree(new->hostname);
+	new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
+	if (new->hostname) {
+		new->namelen = key->namelen;
+		nfs_dns_ent_update(cnew, ckey);
+	} else {
+		new->namelen = 0;
+		new->addrlen = 0;
+	}
+}
+
+static void nfs_dns_ent_put(struct kref *ref)
+{
+	struct nfs_dns_ent *item;
+
+	item = container_of(ref, struct nfs_dns_ent, h.ref);
+	kfree(item->hostname);
+	kfree(item);
+}
+
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+	struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+	if (item != NULL) {
+		item->hostname = NULL;
+		item->namelen = 0;
+		item->addrlen = 0;
+		return &item->h;
+	}
+	return NULL;
+};
+
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+	return hash_str(key->hostname, NFS_DNS_HASHBITS);
+}
+
+static void nfs_dns_request(struct cache_detail *cd,
+		struct cache_head *ch,
+		char **bpp, int *blen)
+{
+	struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+	qword_add(bpp, blen, key->hostname);
+	(*bpp)[-1] = '\n';
+}
+
+static int nfs_dns_upcall(struct cache_detail *cd,
+		struct cache_head *ch)
+{
+	struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+	int ret;
+
+	ret = nfs_cache_upcall(cd, key->hostname);
+	if (ret)
+		ret = sunrpc_cache_pipe_upcall(cd, ch);
+	return ret;
+}
+
+static int nfs_dns_match(struct cache_head *ca,
+		struct cache_head *cb)
+{
+	struct nfs_dns_ent *a;
+	struct nfs_dns_ent *b;
+
+	a = container_of(ca, struct nfs_dns_ent, h);
+	b = container_of(cb, struct nfs_dns_ent, h);
+
+	if (a->namelen == 0 || a->namelen != b->namelen)
+		return 0;
+	return memcmp(a->hostname, b->hostname, a->namelen) == 0;
+}
+
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+		struct cache_head *h)
+{
+	struct nfs_dns_ent *item;
+	long ttl;
+
+	if (h == NULL) {
+		seq_puts(m, "# ip address      hostname        ttl\n");
+		return 0;
+	}
+	item = container_of(h, struct nfs_dns_ent, h);
+	ttl = item->h.expiry_time - seconds_since_boot();
+	if (ttl < 0)
+		ttl = 0;
+
+	if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
+		char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+		rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+		seq_printf(m, "%15s ", buf);
+	} else
+		seq_puts(m, "<none>          ");
+	seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+	return 0;
+}
+
+static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+		struct nfs_dns_ent *key)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_lookup(cd,
+			&key->h,
+			nfs_dns_hash(key));
+	if (!ch)
+		return NULL;
+	return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+		struct nfs_dns_ent *new,
+		struct nfs_dns_ent *key)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_update(cd,
+			&new->h, &key->h,
+			nfs_dns_hash(key));
+	if (!ch)
+		return NULL;
+	return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+	char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+	struct nfs_dns_ent key, *item;
+	unsigned int ttl;
+	ssize_t len;
+	int ret = -EINVAL;
+
+	if (buf[buflen-1] != '\n')
+		goto out;
+	buf[buflen-1] = '\0';
+
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+	key.addrlen = rpc_pton(cd->net, buf1, len,
+			(struct sockaddr *)&key.addr,
+			sizeof(key.addr));
+
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+
+	key.hostname = buf1;
+	key.namelen = len;
+	memset(&key.h, 0, sizeof(key.h));
+
+	if (get_uint(&buf, &ttl) < 0)
+		goto out;
+	if (ttl == 0)
+		goto out;
+	key.h.expiry_time = ttl + seconds_since_boot();
+
+	ret = -ENOMEM;
+	item = nfs_dns_lookup(cd, &key);
+	if (item == NULL)
+		goto out;
+
+	if (key.addrlen == 0)
+		set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+	item = nfs_dns_update(cd, &key, item);
+	if (item == NULL)
+		goto out;
+
+	ret = 0;
+	cache_put(&item->h, cd);
+out:
+	return ret;
+}
+
+static int do_cache_lookup(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item,
+		struct nfs_cache_defer_req *dreq)
+{
+	int ret = -ENOMEM;
+
+	*item = nfs_dns_lookup(cd, key);
+	if (*item) {
+		ret = cache_check(cd, &(*item)->h, &dreq->req);
+		if (ret)
+			*item = NULL;
+	}
+	return ret;
+}
+
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item)
+{
+	int ret = -ENOMEM;
+
+	*item = nfs_dns_lookup(cd, key);
+	if (!*item)
+		goto out_err;
+	ret = -ETIMEDOUT;
+	if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+			|| (*item)->h.expiry_time < seconds_since_boot()
+			|| cd->flush_time > (*item)->h.last_refresh)
+		goto out_put;
+	ret = -ENOENT;
+	if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+		goto out_put;
+	return 0;
+out_put:
+	cache_put(&(*item)->h, cd);
+out_err:
+	*item = NULL;
+	return ret;
+}
+
+static int do_cache_lookup_wait(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item)
+{
+	struct nfs_cache_defer_req *dreq;
+	int ret = -ENOMEM;
+
+	dreq = nfs_cache_defer_req_alloc();
+	if (!dreq)
+		goto out;
+	ret = do_cache_lookup(cd, key, item, dreq);
+	if (ret == -EAGAIN) {
+		ret = nfs_cache_wait_for_upcall(dreq);
+		if (!ret)
+			ret = do_cache_lookup_nowait(cd, key, item);
+	}
+	nfs_cache_defer_req_put(dreq);
+out:
+	return ret;
+}
+
+ssize_t nfs_dns_resolve_name(struct net *net, char *name,
+		size_t namelen, struct sockaddr *sa, size_t salen)
+{
+	struct nfs_dns_ent key = {
+		.hostname = name,
+		.namelen = namelen,
+	};
+	struct nfs_dns_ent *item = NULL;
+	ssize_t ret;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);
+	if (ret == 0) {
+		if (salen >= item->addrlen) {
+			memcpy(sa, &item->addr, item->addrlen);
+			ret = item->addrlen;
+		} else
+			ret = -EOVERFLOW;
+		cache_put(&item->h, nn->nfs_dns_resolve);
+	} else if (ret == -ENOENT)
+		ret = -ESRCH;
+	return ret;
+}
+
+static struct cache_detail nfs_dns_resolve_template = {
+	.owner		= THIS_MODULE,
+	.hash_size	= NFS_DNS_HASHTBL_SIZE,
+	.name		= "dns_resolve",
+	.cache_put	= nfs_dns_ent_put,
+	.cache_upcall	= nfs_dns_upcall,
+	.cache_request	= nfs_dns_request,
+	.cache_parse	= nfs_dns_parse,
+	.cache_show	= nfs_dns_show,
+	.match		= nfs_dns_match,
+	.init		= nfs_dns_ent_init,
+	.update		= nfs_dns_ent_update,
+	.alloc		= nfs_dns_ent_alloc,
+};
+
+
+int nfs_dns_resolver_cache_init(struct net *net)
+{
+	int err;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net);
+	if (IS_ERR(nn->nfs_dns_resolve))
+		return PTR_ERR(nn->nfs_dns_resolve);
+
+	err = nfs_cache_register_net(net, nn->nfs_dns_resolve);
+	if (err)
+		goto err_reg;
+	return 0;
+
+err_reg:
+	cache_destroy_net(nn->nfs_dns_resolve, net);
+	return err;
+}
+
+void nfs_dns_resolver_cache_destroy(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	nfs_cache_unregister_net(net, nn->nfs_dns_resolve);
+	cache_destroy_net(nn->nfs_dns_resolve, net);
+}
+
+static int nfs4_dns_net_init(struct net *net)
+{
+	return nfs_dns_resolver_cache_init(net);
+}
+
+static void nfs4_dns_net_exit(struct net *net)
+{
+	nfs_dns_resolver_cache_destroy(net);
+}
+
+static struct pernet_operations nfs4_dns_resolver_ops = {
+	.init = nfs4_dns_net_init,
+	.exit = nfs4_dns_net_exit,
+};
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+			   void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct net *net = sb->s_fs_info;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct cache_detail *cd = nn->nfs_dns_resolve;
+	int ret = 0;
+
+	if (cd == NULL)
+		return 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	switch (event) {
+	case RPC_PIPEFS_MOUNT:
+		ret = nfs_cache_register_sb(sb, cd);
+		break;
+	case RPC_PIPEFS_UMOUNT:
+		nfs_cache_unregister_sb(sb, cd);
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
+	}
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static struct notifier_block nfs_dns_resolver_block = {
+	.notifier_call	= rpc_pipefs_event,
+};
+
+int nfs_dns_resolver_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&nfs4_dns_resolver_ops);
+	if (err < 0)
+		goto out;
+	err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+	if (err < 0)
+		goto out1;
+	return 0;
+out1:
+	unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+out:
+	return err;
+}
+
+void nfs_dns_resolver_destroy(void)
+{
+	rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
+	unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+}
+#endif
diff --git a/kernel/fs/nfs/dns_resolve.h b/kernel/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000..2e4f596d2
--- /dev/null
+++ b/kernel/fs/nfs/dns_resolve.h
@@ -0,0 +1,36 @@
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN	(128)
+
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+static inline int nfs_dns_resolver_init(void)
+{
+	return 0;
+}
+
+static inline void nfs_dns_resolver_destroy(void)
+{}
+
+static inline int nfs_dns_resolver_cache_init(struct net *net)
+{
+	return 0;
+}
+
+static inline void nfs_dns_resolver_cache_destroy(struct net *net)
+{}
+#else
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+extern int nfs_dns_resolver_cache_init(struct net *net);
+extern void nfs_dns_resolver_cache_destroy(struct net *net);
+#endif
+
+extern ssize_t nfs_dns_resolve_name(struct net *net, char *name,
+		size_t namelen,	struct sockaddr *sa, size_t salen);
+
+#endif
diff --git a/kernel/fs/nfs/file.c b/kernel/fs/nfs/file.c
new file mode 100644
index 000000000..8b8d83a52
--- /dev/null
+++ b/kernel/fs/nfs/file.c
@@ -0,0 +1,947 @@
+/*
+ *  linux/fs/nfs/file.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  Changes Copyright (C) 1994 by Florian La Roche
+ *   - Do not copy data too often around in the kernel.
+ *   - In nfs_file_read the return value of kmalloc wasn't checked.
+ *   - Put in a better version of read look-ahead buffering. Original idea
+ *     and implementation by Wai S Kok elekokws@ee.nus.sg.
+ *
+ *  Expire cache on write to a file by Wai S Kok (Oct 1994).
+ *
+ *  Total rewrite of read side for new NFS buffer cache.. Linus.
+ *
+ *  nfs regular file handling functions
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
+
+#include <asm/uaccess.h>
+
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY		NFSDBG_FILE
+
+static const struct vm_operations_struct nfs_file_vm_ops;
+
+/* Hack for future NFS swap support */
+#ifndef IS_SWAPFILE
+# define IS_SWAPFILE(inode)	(0)
+#endif
+
+int nfs_check_flags(int flags)
+{
+	if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_check_flags);
+
+/*
+ * Open file
+ */
+static int
+nfs_file_open(struct inode *inode, struct file *filp)
+{
+	int res;
+
+	dprintk("NFS: open file(%pD2)\n", filp);
+
+	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
+	res = nfs_check_flags(filp->f_flags);
+	if (res)
+		return res;
+
+	res = nfs_open(inode, filp);
+	return res;
+}
+
+int
+nfs_file_release(struct inode *inode, struct file *filp)
+{
+	dprintk("NFS: release(%pD2)\n", filp);
+
+	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
+	return nfs_release(inode, filp);
+}
+EXPORT_SYMBOL_GPL(nfs_file_release);
+
+/**
+ * nfs_revalidate_size - Revalidate the file size
+ * @inode - pointer to inode struct
+ * @file - pointer to struct file
+ *
+ * Revalidates the file length. This is basically a wrapper around
+ * nfs_revalidate_inode() that takes into account the fact that we may
+ * have cached writes (in which case we don't care about the server's
+ * idea of what the file length is), or O_DIRECT (in which case we
+ * shouldn't trust the cache).
+ */
+static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (nfs_have_delegated_attributes(inode))
+		goto out_noreval;
+
+	if (filp->f_flags & O_DIRECT)
+		goto force_reval;
+	if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+		goto force_reval;
+	if (nfs_attribute_timeout(inode))
+		goto force_reval;
+out_noreval:
+	return 0;
+force_reval:
+	return __nfs_revalidate_inode(server, inode);
+}
+
+loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
+{
+	dprintk("NFS: llseek file(%pD2, %lld, %d)\n",
+			filp, offset, whence);
+
+	/*
+	 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
+	 * the cached file length
+	 */
+	if (whence != SEEK_SET && whence != SEEK_CUR) {
+		struct inode *inode = filp->f_mapping->host;
+
+		int retval = nfs_revalidate_file_size(inode, filp);
+		if (retval < 0)
+			return (loff_t)retval;
+	}
+
+	return generic_file_llseek(filp, offset, whence);
+}
+EXPORT_SYMBOL_GPL(nfs_file_llseek);
+
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+int
+nfs_file_flush(struct file *file, fl_owner_t id)
+{
+	struct inode	*inode = file_inode(file);
+
+	dprintk("NFS: flush(%pD2)\n", file);
+
+	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+	if ((file->f_mode & FMODE_WRITE) == 0)
+		return 0;
+
+	/*
+	 * If we're holding a write delegation, then just start the i/o
+	 * but don't wait for completion (or send a commit).
+	 */
+	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+		return filemap_fdatawrite(file->f_mapping);
+
+	/* Flush writes to the server and return any errors */
+	return vfs_fsync(file, 0);
+}
+EXPORT_SYMBOL_GPL(nfs_file_flush);
+
+ssize_t
+nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t result;
+
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return nfs_file_direct_read(iocb, to, iocb->ki_pos);
+
+	dprintk("NFS: read(%pD2, %zu@%lu)\n",
+		iocb->ki_filp,
+		iov_iter_count(to), (unsigned long) iocb->ki_pos);
+
+	result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
+	if (!result) {
+		result = generic_file_read_iter(iocb, to);
+		if (result > 0)
+			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+	}
+	return result;
+}
+EXPORT_SYMBOL_GPL(nfs_file_read);
+
+ssize_t
+nfs_file_splice_read(struct file *filp, loff_t *ppos,
+		     struct pipe_inode_info *pipe, size_t count,
+		     unsigned int flags)
+{
+	struct inode *inode = file_inode(filp);
+	ssize_t res;
+
+	dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
+		filp, (unsigned long) count, (unsigned long long) *ppos);
+
+	res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
+	if (!res) {
+		res = generic_file_splice_read(filp, ppos, pipe, count, flags);
+		if (res > 0)
+			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
+	}
+	return res;
+}
+EXPORT_SYMBOL_GPL(nfs_file_splice_read);
+
+int
+nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	struct inode *inode = file_inode(file);
+	int	status;
+
+	dprintk("NFS: mmap(%pD2)\n", file);
+
+	/* Note: generic_file_mmap() returns ENOSYS on nommu systems
+	 *       so we call that before revalidating the mapping
+	 */
+	status = generic_file_mmap(file, vma);
+	if (!status) {
+		vma->vm_ops = &nfs_file_vm_ops;
+		status = nfs_revalidate_mapping(inode, file->f_mapping);
+	}
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_file_mmap);
+
+/*
+ * Flush any dirty pages for this process, and check for write errors.
+ * The return status from this call provides a reliable indication of
+ * whether any write errors occurred for this process.
+ *
+ * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
+ * disk, but it retrieves and clears ctx->error after synching, despite
+ * the two being set at the same time in nfs_context_set_write_error().
+ * This is because the former is used to notify the _next_ call to
+ * nfs_file_write() that a write error occurred, and hence cause it to
+ * fall back to doing a synchronous write.
+ */
+int
+nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
+	struct inode *inode = file_inode(file);
+	int have_error, do_resend, status;
+	int ret = 0;
+
+	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
+
+	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
+	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
+	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	status = nfs_commit_inode(inode, FLUSH_SYNC);
+	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	if (have_error) {
+		ret = xchg(&ctx->error, 0);
+		if (ret)
+			goto out;
+	}
+	if (status < 0) {
+		ret = status;
+		goto out;
+	}
+	do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
+	if (do_resend)
+		ret = -EAGAIN;
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
+
+static int
+nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	int ret;
+	struct inode *inode = file_inode(file);
+
+	trace_nfs_fsync_enter(inode);
+
+	nfs_inode_dio_wait(inode);
+	do {
+		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+		if (ret != 0)
+			break;
+		mutex_lock(&inode->i_mutex);
+		ret = nfs_file_fsync_commit(file, start, end, datasync);
+		mutex_unlock(&inode->i_mutex);
+		/*
+		 * If nfs_file_fsync_commit detected a server reboot, then
+		 * resend all dirty pages that might have been covered by
+		 * the NFS_CONTEXT_RESEND_WRITES flag
+		 */
+		start = 0;
+		end = LLONG_MAX;
+	} while (ret == -EAGAIN);
+
+	trace_nfs_fsync_exit(inode, ret);
+	return ret;
+}
+
+/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * then a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer.  In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+static int nfs_want_read_modify_write(struct file *file, struct page *page,
+			loff_t pos, unsigned len)
+{
+	unsigned int pglen = nfs_page_length(page);
+	unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int end = offset + len;
+
+	if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
+		if (!PageUptodate(page))
+			return 1;
+		return 0;
+	}
+
+	if ((file->f_mode & FMODE_READ) &&	/* open for read? */
+	    !PageUptodate(page) &&		/* Uptodate? */
+	    !PagePrivate(page) &&		/* i/o request already? */
+	    pglen &&				/* valid bytes of file? */
+	    (end < pglen || offset))		/* replace all valid bytes? */
+		return 1;
+	return 0;
+}
+
+/*
+ * This does the "real" work of the write. We must allocate and lock the
+ * page to be sent back to the generic routine, which then copies the
+ * data from user space.
+ *
+ * If the writer ends up delaying the write, the writer needs to
+ * increment the page use counts until he is done with the page.
+ */
+static int nfs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct page *page;
+	int once_thru = 0;
+
+	dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
+		file, mapping->host->i_ino, len, (long long) pos);
+
+start:
+	/*
+	 * Prevent starvation issues if someone is doing a consistency
+	 * sync-to-disk
+	 */
+	ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+				 nfs_wait_bit_killable, TASK_KILLABLE);
+	if (ret)
+		return ret;
+	/*
+	 * Wait for O_DIRECT to complete
+	 */
+	nfs_inode_dio_wait(mapping->host);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	ret = nfs_flush_incompatible(file, page);
+	if (ret) {
+		unlock_page(page);
+		page_cache_release(page);
+	} else if (!once_thru &&
+		   nfs_want_read_modify_write(file, page, pos, len)) {
+		once_thru = 1;
+		ret = nfs_readpage(file, page);
+		page_cache_release(page);
+		if (!ret)
+			goto start;
+	}
+	return ret;
+}
+
+static int nfs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
+	int status;
+
+	dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
+		file, mapping->host->i_ino, len, (long long) pos);
+
+	/*
+	 * Zero any uninitialised parts of the page, and then mark the page
+	 * as up to date if it turns out that we're extending the file.
+	 */
+	if (!PageUptodate(page)) {
+		unsigned pglen = nfs_page_length(page);
+		unsigned end = offset + len;
+
+		if (pglen == 0) {
+			zero_user_segments(page, 0, offset,
+					end, PAGE_CACHE_SIZE);
+			SetPageUptodate(page);
+		} else if (end >= pglen) {
+			zero_user_segment(page, end, PAGE_CACHE_SIZE);
+			if (offset == 0)
+				SetPageUptodate(page);
+		} else
+			zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+	}
+
+	status = nfs_updatepage(file, page, offset, copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (status < 0)
+		return status;
+	NFS_I(mapping->host)->write_io += copied;
+
+	if (nfs_ctx_key_to_expire(ctx)) {
+		status = nfs_wb_all(mapping->host);
+		if (status < 0)
+			return status;
+	}
+
+	return copied;
+}
+
+/*
+ * Partially or wholly invalidate a page
+ * - Release the private state associated with a page if undergoing complete
+ *   page invalidation
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ */
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+				unsigned int length)
+{
+	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+		 page, offset, length);
+
+	if (offset != 0 || length < PAGE_CACHE_SIZE)
+		return;
+	/* Cancel any unstarted writes on this page */
+	nfs_wb_page_cancel(page_file_mapping(page)->host, page);
+
+	nfs_fscache_invalidate_page(page, page->mapping->host);
+}
+
+/*
+ * Attempt to release the private state associated with a page
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Return true (may release page) or false (may not)
+ */
+static int nfs_release_page(struct page *page, gfp_t gfp)
+{
+	struct address_space *mapping = page->mapping;
+
+	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
+
+	/* Always try to initiate a 'commit' if relevant, but only
+	 * wait for it if __GFP_WAIT is set.  Even then, only wait 1
+	 * second and only if the 'bdi' is not congested.
+	 * Waiting indefinitely can cause deadlocks when the NFS
+	 * server is on this machine, when a new TCP connection is
+	 * needed and in other rare cases.  There is no particular
+	 * need to wait extensively here.  A short wait has the
+	 * benefit that someone else can worry about the freezer.
+	 */
+	if (mapping) {
+		struct nfs_server *nfss = NFS_SERVER(mapping->host);
+		nfs_commit_inode(mapping->host, 0);
+		if ((gfp & __GFP_WAIT) &&
+		    !bdi_write_congested(&nfss->backing_dev_info)) {
+			wait_on_page_bit_killable_timeout(page, PG_private,
+							  HZ);
+			if (PagePrivate(page))
+				set_bdi_congested(&nfss->backing_dev_info,
+						  BLK_RW_ASYNC);
+		}
+	}
+	/* If PagePrivate() is set, then the page is not freeable */
+	if (PagePrivate(page))
+		return 0;
+	return nfs_fscache_release_page(page, gfp);
+}
+
+static void nfs_check_dirty_writeback(struct page *page,
+				bool *dirty, bool *writeback)
+{
+	struct nfs_inode *nfsi;
+	struct address_space *mapping = page_file_mapping(page);
+
+	if (!mapping || PageSwapCache(page))
+		return;
+
+	/*
+	 * Check if an unstable page is currently being committed and
+	 * if so, have the VM treat it as if the page is under writeback
+	 * so it will not block due to pages that will shortly be freeable.
+	 */
+	nfsi = NFS_I(mapping->host);
+	if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+		*writeback = true;
+		return;
+	}
+
+	/*
+	 * If PagePrivate() is set, then the page is not freeable and as the
+	 * inode is not being committed, it's not going to be cleaned in the
+	 * near future so treat it as dirty
+	 */
+	if (PagePrivate(page))
+		*dirty = true;
+}
+
+/*
+ * Attempt to clear the private state associated with a page when an error
+ * occurs that requires the cached contents of an inode to be written back or
+ * destroyed
+ * - Called if either PG_private or fscache is set on the page
+ * - Caller holds page lock
+ * - Return 0 if successful, -error otherwise
+ */
+static int nfs_launder_page(struct page *page)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
+		inode->i_ino, (long long)page_offset(page));
+
+	nfs_fscache_wait_on_page_write(nfsi, page);
+	return nfs_wb_page(inode, page);
+}
+
+#ifdef CONFIG_NFS_SWAP
+static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+						sector_t *span)
+{
+	int ret;
+	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
+	*span = sis->pages;
+
+	rcu_read_lock();
+	ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static void nfs_swap_deactivate(struct file *file)
+{
+	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
+	rcu_read_lock();
+	xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
+	rcu_read_unlock();
+}
+#endif
+
+const struct address_space_operations nfs_file_aops = {
+	.readpage = nfs_readpage,
+	.readpages = nfs_readpages,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	.writepage = nfs_writepage,
+	.writepages = nfs_writepages,
+	.write_begin = nfs_write_begin,
+	.write_end = nfs_write_end,
+	.invalidatepage = nfs_invalidate_page,
+	.releasepage = nfs_release_page,
+	.direct_IO = nfs_direct_IO,
+	.migratepage = nfs_migrate_page,
+	.launder_page = nfs_launder_page,
+	.is_dirty_writeback = nfs_check_dirty_writeback,
+	.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_NFS_SWAP
+	.swap_activate = nfs_swap_activate,
+	.swap_deactivate = nfs_swap_deactivate,
+#endif
+};
+
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ */
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct file *filp = vma->vm_file;
+	struct inode *inode = file_inode(filp);
+	unsigned pagelen;
+	int ret = VM_FAULT_NOPAGE;
+	struct address_space *mapping;
+
+	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n",
+		filp, filp->f_mapping->host->i_ino,
+		(long long)page_offset(page));
+
+	/* make sure the cache has finished storing the page */
+	nfs_fscache_wait_on_page_write(NFS_I(inode), page);
+
+	wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+
+	lock_page(page);
+	mapping = page_file_mapping(page);
+	if (mapping != inode->i_mapping)
+		goto out_unlock;
+
+	wait_on_page_writeback(page);
+
+	pagelen = nfs_page_length(page);
+	if (pagelen == 0)
+		goto out_unlock;
+
+	ret = VM_FAULT_LOCKED;
+	if (nfs_flush_incompatible(filp, page) == 0 &&
+	    nfs_updatepage(filp, page, 0, pagelen) == 0)
+		goto out;
+
+	ret = VM_FAULT_SIGBUS;
+out_unlock:
+	unlock_page(page);
+out:
+	return ret;
+}
+
+static const struct vm_operations_struct nfs_file_vm_ops = {
+	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
+	.page_mkwrite = nfs_vm_page_mkwrite,
+};
+
+static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+{
+	struct nfs_open_context *ctx;
+
+	if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
+		return 1;
+	ctx = nfs_file_open_context(filp);
+	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
+	    nfs_ctx_key_to_expire(ctx))
+		return 1;
+	return 0;
+}
+
+ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	unsigned long written = 0;
+	ssize_t result;
+	size_t count = iov_iter_count(from);
+
+	result = nfs_key_timeout_notify(file, inode);
+	if (result)
+		return result;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		result = generic_write_checks(iocb, from);
+		if (result <= 0)
+			return result;
+		return nfs_file_direct_write(iocb, from);
+	}
+
+	dprintk("NFS: write(%pD2, %zu@%Ld)\n",
+		file, count, (long long) iocb->ki_pos);
+
+	result = -EBUSY;
+	if (IS_SWAPFILE(inode))
+		goto out_swapfile;
+	/*
+	 * O_APPEND implies that we must revalidate the file length.
+	 */
+	if (iocb->ki_flags & IOCB_APPEND) {
+		result = nfs_revalidate_file_size(inode, file);
+		if (result)
+			goto out;
+	}
+
+	result = count;
+	if (!count)
+		goto out;
+
+	result = generic_file_write_iter(iocb, from);
+	if (result > 0)
+		written = result;
+
+	/* Return error values for O_DSYNC and IS_SYNC() */
+	if (result >= 0 && nfs_need_sync_write(file, inode)) {
+		int err = vfs_fsync(file, 0);
+		if (err < 0)
+			result = err;
+	}
+	if (result > 0)
+		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+out:
+	return result;
+
+out_swapfile:
+	printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
+	goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_file_write);
+
+static int
+do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int status = 0;
+	unsigned int saved_type = fl->fl_type;
+
+	/* Try local locking first */
+	posix_test_lock(filp, fl);
+	if (fl->fl_type != F_UNLCK) {
+		/* found a conflict */
+		goto out;
+	}
+	fl->fl_type = saved_type;
+
+	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+		goto out_noconflict;
+
+	if (is_local)
+		goto out_noconflict;
+
+	status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+out:
+	return status;
+out_noconflict:
+	fl->fl_type = F_UNLCK;
+	goto out;
+}
+
+static int do_vfs_lock(struct file *file, struct file_lock *fl)
+{
+	int res = 0;
+	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+		case FL_POSIX:
+			res = posix_lock_file_wait(file, fl);
+			break;
+		case FL_FLOCK:
+			res = flock_lock_file_wait(file, fl);
+			break;
+		default:
+			BUG();
+	}
+	return res;
+}
+
+static int
+do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct nfs_lock_context *l_ctx;
+	int status;
+
+	/*
+	 * Flush all pending writes before doing anything
+	 * with locks..
+	 */
+	vfs_fsync(filp, 0);
+
+	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
+	if (!IS_ERR(l_ctx)) {
+		status = nfs_iocounter_wait(&l_ctx->io_count);
+		nfs_put_lock_context(l_ctx);
+		if (status < 0)
+			return status;
+	}
+
+	/* NOTE: special case
+	 * 	If we're signalled while cleaning up locks on process exit, we
+	 * 	still need to complete the unlock.
+	 */
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (!is_local)
+		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+	else
+		status = do_vfs_lock(filp, fl);
+	return status;
+}
+
+static int
+is_time_granular(struct timespec *ts) {
+	return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
+}
+
+static int
+do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int status;
+
+	/*
+	 * Flush all pending writes before doing anything
+	 * with locks..
+	 */
+	status = nfs_sync_mapping(filp->f_mapping);
+	if (status != 0)
+		goto out;
+
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (!is_local)
+		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+	else
+		status = do_vfs_lock(filp, fl);
+	if (status < 0)
+		goto out;
+
+	/*
+	 * Revalidate the cache if the server has time stamps granular
+	 * enough to detect subsecond changes.  Otherwise, clear the
+	 * cache to prevent missing any changes.
+	 *
+	 * This makes locking act as a cache coherency point.
+	 */
+	nfs_sync_mapping(filp->f_mapping);
+	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
+		if (is_time_granular(&NFS_SERVER(inode)->time_delta))
+			__nfs_revalidate_inode(NFS_SERVER(inode), inode);
+		else
+			nfs_zap_caches(inode);
+	}
+out:
+	return status;
+}
+
+/*
+ * Lock a (portion of) a file
+ */
+int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int ret = -ENOLCK;
+	int is_local = 0;
+
+	dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n",
+			filp, fl->fl_type, fl->fl_flags,
+			(long long)fl->fl_start, (long long)fl->fl_end);
+
+	nfs_inc_stats(inode, NFSIOS_VFSLOCK);
+
+	/* No mandatory locks over NFS */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		goto out_err;
+
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
+		is_local = 1;
+
+	if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
+		ret = NFS_PROTO(inode)->lock_check_bounds(fl);
+		if (ret < 0)
+			goto out_err;
+	}
+
+	if (IS_GETLK(cmd))
+		ret = do_getlk(filp, cmd, fl, is_local);
+	else if (fl->fl_type == F_UNLCK)
+		ret = do_unlk(filp, cmd, fl, is_local);
+	else
+		ret = do_setlk(filp, cmd, fl, is_local);
+out_err:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_lock);
+
+/*
+ * Lock a (portion of) a file
+ */
+int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int is_local = 0;
+
+	dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n",
+			filp, fl->fl_type, fl->fl_flags);
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+
+	/*
+	 * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
+	 * any standard. In principle we might be able to support LOCK_MAND
+	 * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
+	 * NFS code is not set up for it.
+	 */
+	if (fl->fl_type & LOCK_MAND)
+		return -EINVAL;
+
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
+		is_local = 1;
+
+	/* We're simulating flock() locks using posix locks on the server */
+	if (fl->fl_type == F_UNLCK)
+		return do_unlk(filp, cmd, fl, is_local);
+	return do_setlk(filp, cmd, fl, is_local);
+}
+EXPORT_SYMBOL_GPL(nfs_flock);
+
+const struct file_operations nfs_file_operations = {
+	.llseek		= nfs_file_llseek,
+	.read_iter	= nfs_file_read,
+	.write_iter	= nfs_file_write,
+	.mmap		= nfs_file_mmap,
+	.open		= nfs_file_open,
+	.flush		= nfs_file_flush,
+	.release	= nfs_file_release,
+	.fsync		= nfs_file_fsync,
+	.lock		= nfs_lock,
+	.flock		= nfs_flock,
+	.splice_read	= nfs_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.check_flags	= nfs_check_flags,
+	.setlease	= simple_nosetlease,
+};
+EXPORT_SYMBOL_GPL(nfs_file_operations);
diff --git a/kernel/fs/nfs/filelayout/Makefile b/kernel/fs/nfs/filelayout/Makefile
new file mode 100644
index 000000000..8516cdffb
--- /dev/null
+++ b/kernel/fs/nfs/filelayout/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Files Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o
diff --git a/kernel/fs/nfs/filelayout/filelayout.c b/kernel/fs/nfs/filelayout/filelayout.c
new file mode 100644
index 000000000..a46bf6de9
--- /dev/null
+++ b/kernel/fs/nfs/filelayout/filelayout.c
@@ -0,0 +1,1162 @@
+/*
+ *  Module for the pnfs nfs4 file layout driver.
+ *  Defines all I/O and Policy interface operations, plus code
+ *  to register itself with the pNFS client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+
+#include <linux/sunrpc/metrics.h>
+
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "filelayout.h"
+#include "../nfs4trace.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
+#define FILELAYOUT_POLL_RETRY_MAX     (15*HZ)
+
+static loff_t
+filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
+			    loff_t offset)
+{
+	u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
+	u64 stripe_no;
+	u32 rem;
+
+	offset -= flseg->pattern_offset;
+	stripe_no = div_u64(offset, stripe_width);
+	div_u64_rem(offset, flseg->stripe_unit, &rem);
+
+	return stripe_no * flseg->stripe_unit + rem;
+}
+
+/* This function is used by the layout driver to calculate the
+ * offset of the file on the dserver based on whether the
+ * layout type is STRIPE_DENSE or STRIPE_SPARSE
+ */
+static loff_t
+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	switch (flseg->stripe_type) {
+	case STRIPE_SPARSE:
+		return offset;
+
+	case STRIPE_DENSE:
+		return filelayout_get_dense_offset(flseg, offset);
+	}
+
+	BUG();
+}
+
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
+{
+	struct rpc_task *task = &hdr->task;
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		dprintk("%s Reset task %5u for i/o through MDS "
+			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+			hdr->task.tk_pid,
+			hdr->inode->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(hdr->inode),
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
+
+		task->tk_status = pnfs_write_done_resend_to_mds(hdr);
+	}
+}
+
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
+{
+	struct rpc_task *task = &hdr->task;
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		dprintk("%s Reset task %5u for i/o through MDS "
+			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+			hdr->task.tk_pid,
+			hdr->inode->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(hdr->inode),
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
+
+		task->tk_status = pnfs_read_done_resend_to_mds(hdr);
+	}
+}
+
+static int filelayout_async_handle_error(struct rpc_task *task,
+					 struct nfs4_state *state,
+					 struct nfs_client *clp,
+					 struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_hdr *lo = lseg->pls_layout;
+	struct inode *inode = lo->plh_inode;
+	struct nfs_server *mds_server = NFS_SERVER(inode);
+	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+	struct nfs_client *mds_client = mds_server->nfs_client;
+	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+	if (task->tk_status >= 0)
+		return 0;
+
+	switch (task->tk_status) {
+	/* MDS state errors */
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_OPENMODE:
+		if (state == NULL)
+			break;
+		if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+			goto out_bad_stateid;
+		goto wait_on_recovery;
+	case -NFS4ERR_EXPIRED:
+		if (state != NULL) {
+			if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+				goto out_bad_stateid;
+		}
+		nfs4_schedule_lease_recovery(mds_client);
+		goto wait_on_recovery;
+	/* DS session errors */
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_DEADSESSION:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_SEQ_FALSE_RETRY:
+	case -NFS4ERR_SEQ_MISORDERED:
+		dprintk("%s ERROR %d, Reset session. Exchangeid "
+			"flags 0x%x\n", __func__, task->tk_status,
+			clp->cl_exchange_flags);
+		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+		break;
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+		rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+		break;
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		break;
+	/* Invalidate Layout errors */
+	case -NFS4ERR_PNFS_NO_LAYOUT:
+	case -ESTALE:           /* mapped NFS4ERR_STALE */
+	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
+	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
+	case -NFS4ERR_FHEXPIRED:
+	case -NFS4ERR_WRONG_TYPE:
+		dprintk("%s Invalid layout error %d\n", __func__,
+			task->tk_status);
+		/*
+		 * Destroy layout so new i/o will get a new layout.
+		 * Layout will not be destroyed until all current lseg
+		 * references are put. Mark layout as invalid to resend failed
+		 * i/o and all i/o waiting on the slot table to the MDS until
+		 * layout is destroyed and a new valid layout is obtained.
+		 */
+		pnfs_destroy_layout(NFS_I(inode));
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		goto reset;
+	/* RPC connection errors */
+	case -ECONNREFUSED:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -EIO:
+	case -ETIMEDOUT:
+	case -EPIPE:
+		dprintk("%s DS connection error %d\n", __func__,
+			task->tk_status);
+		nfs4_mark_deviceid_unavailable(devid);
+		pnfs_error_mark_layout_for_return(inode, lseg);
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		/* fall through */
+	default:
+reset:
+		dprintk("%s Retry through MDS. Error %d\n", __func__,
+			task->tk_status);
+		return -NFS4ERR_RESET_TO_MDS;
+	}
+out:
+	task->tk_status = 0;
+	return -EAGAIN;
+out_bad_stateid:
+	task->tk_status = -EIO;
+	return 0;
+wait_on_recovery:
+	rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
+	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
+		rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
+	goto out;
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int filelayout_read_done_cb(struct rpc_task *task,
+				struct nfs_pgio_header *hdr)
+{
+	int err;
+
+	trace_nfs4_pnfs_read(hdr, task->tk_status);
+	err = filelayout_async_handle_error(task, hdr->args.context->state,
+					    hdr->ds_clp, hdr->lseg);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		filelayout_reset_read(hdr);
+		return task->tk_status;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+/*
+ * We reference the rpc_cred of the first WRITE that triggers the need for
+ * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
+ * rfc5661 is not clear about which credential should be used.
+ */
+static void
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
+{
+
+	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
+	    hdr->res.verf->committed != NFS_DATA_SYNC)
+		return;
+
+	pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+			hdr->mds_offset + hdr->res.count);
+	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+}
+
+bool
+filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node)
+{
+	return filelayout_test_devid_invalid(node) ||
+		nfs4_test_deviceid_unavailable(node);
+}
+
+static bool
+filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg);
+
+	return filelayout_test_devid_unavailable(node);
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void filelayout_read_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+		rpc_exit(task, -EIO);
+		return;
+	}
+	if (filelayout_reset_to_mds(hdr->lseg)) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		filelayout_reset_read(hdr);
+		rpc_exit(task, 0);
+		return;
+	}
+	hdr->pgio_done_cb = filelayout_read_done_cb;
+
+	if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+			&hdr->args.seq_args,
+			&hdr->res.seq_res,
+			task))
+		return;
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, FMODE_READ) == -EIO)
+		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+	    task->tk_status == 0) {
+		nfs41_sequence_done(task, &hdr->res.seq_res);
+		return;
+	}
+
+	/* Note this may cause RPC to be resent */
+	hdr->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_read_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
+}
+
+static int filelayout_write_done_cb(struct rpc_task *task,
+				struct nfs_pgio_header *hdr)
+{
+	int err;
+
+	trace_nfs4_pnfs_write(hdr, task->tk_status);
+	err = filelayout_async_handle_error(task, hdr->args.context->state,
+					    hdr->ds_clp, hdr->lseg);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		filelayout_reset_write(hdr);
+		return task->tk_status;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	filelayout_set_layoutcommit(hdr);
+	return 0;
+}
+
+static int filelayout_commit_done_cb(struct rpc_task *task,
+				     struct nfs_commit_data *data)
+{
+	int err;
+
+	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
+	err = filelayout_async_handle_error(task, NULL, data->ds_clp,
+					    data->lseg);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		pnfs_generic_prepare_to_resend_writes(data);
+		return -EAGAIN;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	if (data->verf.committed == NFS_UNSTABLE)
+		pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+
+	return 0;
+}
+
+static void filelayout_write_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+		rpc_exit(task, -EIO);
+		return;
+	}
+	if (filelayout_reset_to_mds(hdr->lseg)) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		filelayout_reset_write(hdr);
+		rpc_exit(task, 0);
+		return;
+	}
+	if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+			&hdr->args.seq_args,
+			&hdr->res.seq_res,
+			task))
+		return;
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, FMODE_WRITE) == -EIO)
+		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+	    task->tk_status == 0) {
+		nfs41_sequence_done(task, &hdr->res.seq_res);
+		return;
+	}
+
+	/* Note this may cause RPC to be resent */
+	hdr->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_write_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
+}
+
+static void filelayout_commit_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *wdata = data;
+
+	nfs41_setup_sequence(wdata->ds_clp->cl_session,
+			&wdata->args.seq_args,
+			&wdata->res.seq_res,
+			task);
+}
+
+static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+
+	rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
+}
+
+static const struct rpc_call_ops filelayout_read_call_ops = {
+	.rpc_call_prepare = filelayout_read_prepare,
+	.rpc_call_done = filelayout_read_call_done,
+	.rpc_count_stats = filelayout_read_count_stats,
+	.rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops filelayout_write_call_ops = {
+	.rpc_call_prepare = filelayout_write_prepare,
+	.rpc_call_done = filelayout_write_call_done,
+	.rpc_count_stats = filelayout_write_count_stats,
+	.rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops filelayout_commit_call_ops = {
+	.rpc_call_prepare = filelayout_commit_prepare,
+	.rpc_call_done = pnfs_generic_write_commit_done,
+	.rpc_count_stats = filelayout_commit_count_stats,
+	.rpc_release = pnfs_generic_commit_release,
+};
+
+static enum pnfs_try_status
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+	struct pnfs_layout_segment *lseg = hdr->lseg;
+	struct nfs4_pnfs_ds *ds;
+	struct rpc_clnt *ds_clnt;
+	loff_t offset = hdr->args.offset;
+	u32 j, idx;
+	struct nfs_fh *fh;
+
+	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+		__func__, hdr->inode->i_ino,
+		hdr->args.pgbase, (size_t)hdr->args.count, offset);
+
+	/* Retrieve the correct rpc_client for the byte range */
+	j = nfs4_fl_calc_j_index(lseg, offset);
+	idx = nfs4_fl_calc_ds_index(lseg, j);
+	ds = nfs4_fl_prepare_ds(lseg, idx);
+	if (!ds)
+		return PNFS_NOT_ATTEMPTED;
+
+	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
+	if (IS_ERR(ds_clnt))
+		return PNFS_NOT_ATTEMPTED;
+
+	dprintk("%s USE DS: %s cl_count %d\n", __func__,
+		ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
+
+	/* No multipath support. Use first DS */
+	atomic_inc(&ds->ds_clp->cl_count);
+	hdr->ds_clp = ds->ds_clp;
+	hdr->ds_commit_idx = idx;
+	fh = nfs4_fl_select_ds_fh(lseg, j);
+	if (fh)
+		hdr->args.fh = fh;
+
+	hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+	hdr->mds_offset = offset;
+
+	/* Perform an asynchronous read to ds */
+	nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+			  NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
+			  0, RPC_TASK_SOFTCONN);
+	return PNFS_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
+{
+	struct pnfs_layout_segment *lseg = hdr->lseg;
+	struct nfs4_pnfs_ds *ds;
+	struct rpc_clnt *ds_clnt;
+	loff_t offset = hdr->args.offset;
+	u32 j, idx;
+	struct nfs_fh *fh;
+
+	/* Retrieve the correct rpc_client for the byte range */
+	j = nfs4_fl_calc_j_index(lseg, offset);
+	idx = nfs4_fl_calc_ds_index(lseg, j);
+	ds = nfs4_fl_prepare_ds(lseg, idx);
+	if (!ds)
+		return PNFS_NOT_ATTEMPTED;
+
+	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
+	if (IS_ERR(ds_clnt))
+		return PNFS_NOT_ATTEMPTED;
+
+	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
+		__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
+		offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
+
+	hdr->pgio_done_cb = filelayout_write_done_cb;
+	atomic_inc(&ds->ds_clp->cl_count);
+	hdr->ds_clp = ds->ds_clp;
+	hdr->ds_commit_idx = idx;
+	fh = nfs4_fl_select_ds_fh(lseg, j);
+	if (fh)
+		hdr->args.fh = fh;
+	hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+
+	/* Perform an asynchronous write */
+	nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+			  NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
+			  sync, RPC_TASK_SOFTCONN);
+	return PNFS_ATTEMPTED;
+}
+
+/*
+ * filelayout_check_layout()
+ *
+ * Make sure layout segment parameters are sane WRT the device.
+ * At this point no generic layer initialization of the lseg has occurred,
+ * and nothing has been added to the layout_hdr cache.
+ *
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+			struct nfs4_filelayout_segment *fl,
+			struct nfs4_layoutget_res *lgr,
+			struct nfs4_deviceid *id,
+			gfp_t gfp_flags)
+{
+	struct nfs4_deviceid_node *d;
+	struct nfs4_file_layout_dsaddr *dsaddr;
+	int status = -EINVAL;
+
+	dprintk("--> %s\n", __func__);
+
+	/* FIXME: remove this check when layout segment support is added */
+	if (lgr->range.offset != 0 ||
+	    lgr->range.length != NFS4_MAX_UINT64) {
+		dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
+			__func__);
+		goto out;
+	}
+
+	if (fl->pattern_offset > lgr->range.offset) {
+		dprintk("%s pattern_offset %lld too large\n",
+				__func__, fl->pattern_offset);
+		goto out;
+	}
+
+	if (!fl->stripe_unit) {
+		dprintk("%s Invalid stripe unit (%u)\n",
+			__func__, fl->stripe_unit);
+		goto out;
+	}
+
+	/* find and reference the deviceid */
+	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
+			lo->plh_lc_cred, gfp_flags);
+	if (d == NULL)
+		goto out;
+
+	dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+	/* Found deviceid is unavailable */
+	if (filelayout_test_devid_unavailable(&dsaddr->id_node))
+		goto out_put;
+
+	fl->dsaddr = dsaddr;
+
+	if (fl->first_stripe_index >= dsaddr->stripe_count) {
+		dprintk("%s Bad first_stripe_index %u\n",
+				__func__, fl->first_stripe_index);
+		goto out_put;
+	}
+
+	if ((fl->stripe_type == STRIPE_SPARSE &&
+	    fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
+	    (fl->stripe_type == STRIPE_DENSE &&
+	    fl->num_fh != dsaddr->stripe_count)) {
+		dprintk("%s num_fh %u not valid for given packing\n",
+			__func__, fl->num_fh);
+		goto out_put;
+	}
+
+	status = 0;
+out:
+	dprintk("--> %s returns %d\n", __func__, status);
+	return status;
+out_put:
+	nfs4_fl_put_deviceid(dsaddr);
+	goto out;
+}
+
+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+{
+	int i;
+
+	for (i = 0; i < fl->num_fh; i++) {
+		if (!fl->fh_array[i])
+			break;
+		kfree(fl->fh_array[i]);
+	}
+	kfree(fl->fh_array);
+	fl->fh_array = NULL;
+}
+
+static void
+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+	filelayout_free_fh_array(fl);
+	kfree(fl);
+}
+
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+			 struct nfs4_filelayout_segment *fl,
+			 struct nfs4_layoutget_res *lgr,
+			 struct nfs4_deviceid *id,
+			 gfp_t gfp_flags)
+{
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	__be32 *p;
+	uint32_t nfl_util;
+	int i;
+
+	dprintk("%s: set_layout_map Begin\n", __func__);
+
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		return -ENOMEM;
+
+	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
+	 * num_fh (4) */
+	p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20);
+	if (unlikely(!p))
+		goto out_err;
+
+	memcpy(id, p, sizeof(*id));
+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+	nfs4_print_deviceid(id);
+
+	nfl_util = be32_to_cpup(p++);
+	if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+		fl->commit_through_mds = 1;
+	if (nfl_util & NFL4_UFLG_DENSE)
+		fl->stripe_type = STRIPE_DENSE;
+	else
+		fl->stripe_type = STRIPE_SPARSE;
+	fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+
+	fl->first_stripe_index = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &fl->pattern_offset);
+	fl->num_fh = be32_to_cpup(p++);
+
+	dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+		__func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+		fl->pattern_offset);
+
+	/* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
+	 * Futher checking is done in filelayout_check_layout */
+	if (fl->num_fh >
+	    max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
+		goto out_err;
+
+	if (fl->num_fh > 0) {
+		fl->fh_array = kcalloc(fl->num_fh, sizeof(fl->fh_array[0]),
+				       gfp_flags);
+		if (!fl->fh_array)
+			goto out_err;
+	}
+
+	for (i = 0; i < fl->num_fh; i++) {
+		/* Do we want to use a mempool here? */
+		fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags);
+		if (!fl->fh_array[i])
+			goto out_err_free;
+
+		p = xdr_inline_decode(&stream, 4);
+		if (unlikely(!p))
+			goto out_err_free;
+		fl->fh_array[i]->size = be32_to_cpup(p++);
+		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
+			printk(KERN_ERR "NFS: Too big fh %d received %d\n",
+			       i, fl->fh_array[i]->size);
+			goto out_err_free;
+		}
+
+		p = xdr_inline_decode(&stream, fl->fh_array[i]->size);
+		if (unlikely(!p))
+			goto out_err_free;
+		memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
+		dprintk("DEBUG: %s: fh len %d\n", __func__,
+			fl->fh_array[i]->size);
+	}
+
+	__free_page(scratch);
+	return 0;
+
+out_err_free:
+	filelayout_free_fh_array(fl);
+out_err:
+	__free_page(scratch);
+	return -EIO;
+}
+
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+
+	dprintk("--> %s\n", __func__);
+	nfs4_fl_put_deviceid(fl->dsaddr);
+	/* This assumes a single RW lseg */
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		struct nfs4_filelayout *flo;
+
+		flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
+		flo->commit_info.nbuckets = 0;
+		kfree(flo->commit_info.buckets);
+		flo->commit_info.buckets = NULL;
+	}
+	_filelayout_free_lseg(fl);
+}
+
+static int
+filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
+			     struct nfs_commit_info *cinfo,
+			     gfp_t gfp_flags)
+{
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+	struct pnfs_commit_bucket *buckets;
+	int size, i;
+
+	if (fl->commit_through_mds)
+		return 0;
+
+	size = (fl->stripe_type == STRIPE_SPARSE) ?
+		fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+	if (cinfo->ds->nbuckets >= size) {
+		/* This assumes there is only one IOMODE_RW lseg.  What
+		 * we really want to do is have a layout_hdr level
+		 * dictionary of <multipath_list4, fh> keys, each
+		 * associated with a struct list_head, populated by calls
+		 * to filelayout_write_pagelist().
+		 * */
+		return 0;
+	}
+
+	buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
+			  gfp_flags);
+	if (!buckets)
+		return -ENOMEM;
+	for (i = 0; i < size; i++) {
+		INIT_LIST_HEAD(&buckets[i].written);
+		INIT_LIST_HEAD(&buckets[i].committing);
+		/* mark direct verifier as unset */
+		buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+	}
+
+	spin_lock(cinfo->lock);
+	if (cinfo->ds->nbuckets >= size)
+		goto out;
+	for (i = 0; i < cinfo->ds->nbuckets; i++) {
+		list_splice(&cinfo->ds->buckets[i].written,
+			    &buckets[i].written);
+		list_splice(&cinfo->ds->buckets[i].committing,
+			    &buckets[i].committing);
+		buckets[i].direct_verf.committed =
+			cinfo->ds->buckets[i].direct_verf.committed;
+		buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
+		buckets[i].clseg = cinfo->ds->buckets[i].clseg;
+	}
+	swap(cinfo->ds->buckets, buckets);
+	cinfo->ds->nbuckets = size;
+out:
+	spin_unlock(cinfo->lock);
+	kfree(buckets);
+	return 0;
+}
+
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+		      struct nfs4_layoutget_res *lgr,
+		      gfp_t gfp_flags)
+{
+	struct nfs4_filelayout_segment *fl;
+	int rc;
+	struct nfs4_deviceid id;
+
+	dprintk("--> %s\n", __func__);
+	fl = kzalloc(sizeof(*fl), gfp_flags);
+	if (!fl)
+		return NULL;
+
+	rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags);
+	if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) {
+		_filelayout_free_lseg(fl);
+		return NULL;
+	}
+	return &fl->generic_hdr;
+}
+
+/*
+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		   struct nfs_page *req)
+{
+	unsigned int size;
+	u64 p_stripe, r_stripe;
+	u32 stripe_offset;
+	u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+	u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+	/* calls nfs_generic_pg_test */
+	size = pnfs_generic_pg_test(pgio, prev, req);
+	if (!size)
+		return 0;
+
+	/* see if req and prev are in the same stripe */
+	if (prev) {
+		p_stripe = (u64)req_offset(prev) - segment_offset;
+		r_stripe = (u64)req_offset(req) - segment_offset;
+		do_div(p_stripe, stripe_unit);
+		do_div(r_stripe, stripe_unit);
+
+		if (p_stripe != r_stripe)
+			return 0;
+	}
+
+	/* calculate remaining bytes in the current stripe */
+	div_u64_rem((u64)req_offset(req) - segment_offset,
+			stripe_unit,
+			&stripe_offset);
+	WARN_ON_ONCE(stripe_offset > stripe_unit);
+	if (stripe_offset >= stripe_unit)
+		return 0;
+	return min(stripe_unit - (unsigned int)stripe_offset, size);
+}
+
+static void
+filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+			struct nfs_page *req)
+{
+	if (!pgio->pg_lseg)
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+					   req->wb_context,
+					   0,
+					   NFS4_MAX_UINT64,
+					   IOMODE_READ,
+					   GFP_KERNEL);
+	/* If no lseg, fall back to read through mds */
+	if (pgio->pg_lseg == NULL)
+		nfs_pageio_reset_read_mds(pgio);
+}
+
+static void
+filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			 struct nfs_page *req)
+{
+	struct nfs_commit_info cinfo;
+	int status;
+
+	if (!pgio->pg_lseg)
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+					   req->wb_context,
+					   0,
+					   NFS4_MAX_UINT64,
+					   IOMODE_RW,
+					   GFP_NOFS);
+	/* If no lseg, fall back to write through mds */
+	if (pgio->pg_lseg == NULL)
+		goto out_mds;
+	nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
+	status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
+	if (status < 0) {
+		pnfs_put_lseg(pgio->pg_lseg);
+		pgio->pg_lseg = NULL;
+		goto out_mds;
+	}
+	return;
+out_mds:
+	nfs_pageio_reset_write_mds(pgio);
+}
+
+static const struct nfs_pageio_ops filelayout_pg_read_ops = {
+	.pg_init = filelayout_pg_init_read,
+	.pg_test = filelayout_pg_test,
+	.pg_doio = pnfs_generic_pg_readpages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops filelayout_pg_write_ops = {
+	.pg_init = filelayout_pg_init_write,
+	.pg_test = filelayout_pg_test,
+	.pg_doio = pnfs_generic_pg_writepages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
+{
+	if (fl->stripe_type == STRIPE_SPARSE)
+		return nfs4_fl_calc_ds_index(&fl->generic_hdr, j);
+	else
+		return j;
+}
+
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+			       struct pnfs_layout_segment *lseg,
+			       struct nfs_commit_info *cinfo,
+			       u32 ds_commit_idx)
+
+{
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+	u32 i, j;
+
+	if (fl->commit_through_mds) {
+		nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+	} else {
+		/* Note that we are calling nfs4_fl_calc_j_index on each page
+		 * that ends up being committed to a data server.  An attractive
+		 * alternative is to add a field to nfs_write_data and nfs_page
+		 * to store the value calculated in filelayout_write_pagelist
+		 * and just use that here.
+		 */
+		j = nfs4_fl_calc_j_index(lseg, req_offset(req));
+		i = select_bucket_index(fl, j);
+		pnfs_layout_mark_request_commit(req, lseg, cinfo, i);
+	}
+}
+
+static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	if (flseg->stripe_type == STRIPE_SPARSE)
+		return i;
+	else
+		return nfs4_fl_calc_ds_index(lseg, i);
+}
+
+static struct nfs_fh *
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	if (flseg->stripe_type == STRIPE_SPARSE) {
+		if (flseg->num_fh == 1)
+			i = 0;
+		else if (flseg->num_fh == 0)
+			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+			return NULL;
+	}
+	return flseg->fh_array[i];
+}
+
+static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
+{
+	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs4_pnfs_ds *ds;
+	struct rpc_clnt *ds_clnt;
+	u32 idx;
+	struct nfs_fh *fh;
+
+	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+	ds = nfs4_fl_prepare_ds(lseg, idx);
+	if (!ds)
+		goto out_err;
+
+	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode);
+	if (IS_ERR(ds_clnt))
+		goto out_err;
+
+	dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
+		data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count));
+	data->commit_done_cb = filelayout_commit_done_cb;
+	atomic_inc(&ds->ds_clp->cl_count);
+	data->ds_clp = ds->ds_clp;
+	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+	if (fh)
+		data->args.fh = fh;
+	return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
+				   &filelayout_commit_call_ops, how,
+				   RPC_TASK_SOFTCONN);
+out_err:
+	pnfs_generic_prepare_to_resend_writes(data);
+	pnfs_generic_commit_release(data);
+	return -EAGAIN;
+}
+
+/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
+ *				   for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns a the head request if one is found, otherwise returns NULL.
+ */
+static struct nfs_page *
+filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
+{
+	struct nfs_page *freq, *t;
+	struct pnfs_commit_bucket *b;
+	int i;
+
+	/* Linearly search the commit lists for each bucket until a matching
+	 * request is found */
+	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+		list_for_each_entry_safe(freq, t, &b->written, wb_list) {
+			if (freq->wb_page == page)
+				return freq->wb_head;
+		}
+		list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
+			if (freq->wb_page == page)
+				return freq->wb_head;
+		}
+	}
+
+	return NULL;
+}
+
+static int
+filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+			   int how, struct nfs_commit_info *cinfo)
+{
+	return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+					    filelayout_initiate_commit);
+}
+
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+
+	dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+	if (!dsaddr)
+		return NULL;
+	return &dsaddr->id_node;
+}
+
+static void
+filelayout_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
+}
+
+static struct pnfs_layout_hdr *
+filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct nfs4_filelayout *flo;
+
+	flo = kzalloc(sizeof(*flo), gfp_flags);
+	return flo != NULL ? &flo->generic_hdr : NULL;
+}
+
+static void
+filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	kfree(FILELAYOUT_FROM_HDR(lo));
+}
+
+static struct pnfs_ds_commit_info *
+filelayout_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+	if (layout == NULL)
+		return NULL;
+	else
+		return &FILELAYOUT_FROM_HDR(layout)->commit_info;
+}
+
+static struct pnfs_layoutdriver_type filelayout_type = {
+	.id			= LAYOUT_NFSV4_1_FILES,
+	.name			= "LAYOUT_NFSV4_1_FILES",
+	.owner			= THIS_MODULE,
+	.alloc_layout_hdr	= filelayout_alloc_layout_hdr,
+	.free_layout_hdr	= filelayout_free_layout_hdr,
+	.alloc_lseg		= filelayout_alloc_lseg,
+	.free_lseg		= filelayout_free_lseg,
+	.pg_read_ops		= &filelayout_pg_read_ops,
+	.pg_write_ops		= &filelayout_pg_write_ops,
+	.get_ds_info		= &filelayout_get_ds_info,
+	.mark_request_commit	= filelayout_mark_request_commit,
+	.clear_request_commit	= pnfs_generic_clear_request_commit,
+	.scan_commit_lists	= pnfs_generic_scan_commit_lists,
+	.recover_commit_reqs	= pnfs_generic_recover_commit_reqs,
+	.search_commit_reqs	= filelayout_search_commit_reqs,
+	.commit_pagelist	= filelayout_commit_pagelist,
+	.read_pagelist		= filelayout_read_pagelist,
+	.write_pagelist		= filelayout_write_pagelist,
+	.alloc_deviceid_node	= filelayout_alloc_deviceid_node,
+	.free_deviceid_node	= filelayout_free_deviceid_node,
+	.sync			= pnfs_nfs_generic_sync,
+};
+
+static int __init nfs4filelayout_init(void)
+{
+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+	       __func__);
+	return pnfs_register_layoutdriver(&filelayout_type);
+}
+
+static void __exit nfs4filelayout_exit(void)
+{
+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
+	       __func__);
+	pnfs_unregister_layoutdriver(&filelayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-1");
+
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/kernel/fs/nfs/filelayout/filelayout.h b/kernel/fs/nfs/filelayout/filelayout.h
new file mode 100644
index 000000000..2896cb833
--- /dev/null
+++ b/kernel/fs/nfs/filelayout/filelayout.h
@@ -0,0 +1,117 @@
+/*
+ *  NFSv4 file layout driver data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include "../pnfs.h"
+
+/*
+ * Field testing shows we need to support up to 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
+
+enum stripetype4 {
+	STRIPE_SPARSE = 1,
+	STRIPE_DENSE = 2
+};
+
+struct nfs4_file_layout_dsaddr {
+	struct nfs4_deviceid_node	id_node;
+	u32				stripe_count;
+	u8				*stripe_indices;
+	u32				ds_num;
+	struct nfs4_pnfs_ds		*ds_list[1];
+};
+
+struct nfs4_filelayout_segment {
+	struct pnfs_layout_segment generic_hdr;
+	u32 stripe_type;
+	u32 commit_through_mds;
+	u32 stripe_unit;
+	u32 first_stripe_index;
+	u64 pattern_offset;
+	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+	unsigned int num_fh;
+	struct nfs_fh **fh_array;
+};
+
+struct nfs4_filelayout {
+	struct pnfs_layout_hdr generic_hdr;
+	struct pnfs_ds_commit_info commit_info;
+};
+
+static inline struct nfs4_filelayout *
+FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+	return container_of(lo, struct nfs4_filelayout, generic_hdr);
+}
+
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+	return container_of(lseg,
+			    struct nfs4_filelayout_segment,
+			    generic_hdr);
+}
+
+static inline struct nfs4_deviceid_node *
+FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
+{
+	return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
+}
+
+static inline bool
+filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
+{
+	return test_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+extern bool
+filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
+
+extern struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
+
+u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
+					u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+	struct pnfs_device *pdev, gfp_t gfp_flags);
+extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/kernel/fs/nfs/filelayout/filelayoutdev.c b/kernel/fs/nfs/filelayout/filelayoutdev.c
new file mode 100644
index 000000000..4946ef40b
--- /dev/null
+++ b/kernel/fs/nfs/filelayout/filelayoutdev.c
@@ -0,0 +1,299 @@
+/*
+ *  Device operations for the pnfs nfs4 file layout driver.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "filelayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+
+void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+	struct nfs4_pnfs_ds *ds;
+	int i;
+
+	nfs4_print_deviceid(&dsaddr->id_node.deviceid);
+
+	for (i = 0; i < dsaddr->ds_num; i++) {
+		ds = dsaddr->ds_list[i];
+		if (ds != NULL)
+			nfs4_pnfs_ds_put(ds);
+	}
+	kfree(dsaddr->stripe_indices);
+	kfree_rcu(dsaddr, id_node.rcu);
+}
+
+/* Decode opaque device data and return the result */
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_flags)
+{
+	int i;
+	u32 cnt, num;
+	u8 *indexp;
+	__be32 *p;
+	u8 *stripe_indices;
+	u8 max_stripe_index;
+	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	struct list_head dsaddrs;
+	struct nfs4_pnfs_ds_addr *da;
+
+	/* set up xdr stream */
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		goto out_err;
+
+	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	/* Get the stripe count (number of stripe index) */
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err_free_scratch;
+
+	cnt = be32_to_cpup(p);
+	dprintk("%s stripe count  %d\n", __func__, cnt);
+	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
+		       "supported maximum %d\n", __func__,
+			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+		goto out_err_free_scratch;
+	}
+
+	/* read stripe indices */
+	stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
+	if (!stripe_indices)
+		goto out_err_free_scratch;
+
+	p = xdr_inline_decode(&stream, cnt << 2);
+	if (unlikely(!p))
+		goto out_err_free_stripe_indices;
+
+	indexp = &stripe_indices[0];
+	max_stripe_index = 0;
+	for (i = 0; i < cnt; i++) {
+		*indexp = be32_to_cpup(p++);
+		max_stripe_index = max(max_stripe_index, *indexp);
+		indexp++;
+	}
+
+	/* Check the multipath list count */
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err_free_stripe_indices;
+
+	num = be32_to_cpup(p);
+	dprintk("%s ds_num %u\n", __func__, num);
+	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
+			"supported maximum %d\n", __func__,
+			num, NFS4_PNFS_MAX_MULTI_CNT);
+		goto out_err_free_stripe_indices;
+	}
+
+	/* validate stripe indices are all < num */
+	if (max_stripe_index >= num) {
+		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
+			__func__, max_stripe_index, num);
+		goto out_err_free_stripe_indices;
+	}
+
+	dsaddr = kzalloc(sizeof(*dsaddr) +
+			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+			gfp_flags);
+	if (!dsaddr)
+		goto out_err_free_stripe_indices;
+
+	dsaddr->stripe_count = cnt;
+	dsaddr->stripe_indices = stripe_indices;
+	stripe_indices = NULL;
+	dsaddr->ds_num = num;
+	nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
+
+	INIT_LIST_HEAD(&dsaddrs);
+
+	for (i = 0; i < dsaddr->ds_num; i++) {
+		int j;
+		u32 mp_count;
+
+		p = xdr_inline_decode(&stream, 4);
+		if (unlikely(!p))
+			goto out_err_free_deviceid;
+
+		mp_count = be32_to_cpup(p); /* multipath count */
+		for (j = 0; j < mp_count; j++) {
+			da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
+						    &stream, gfp_flags);
+			if (da)
+				list_add_tail(&da->da_node, &dsaddrs);
+		}
+		if (list_empty(&dsaddrs)) {
+			dprintk("%s: no suitable DS addresses found\n",
+				__func__);
+			goto out_err_free_deviceid;
+		}
+
+		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+		if (!dsaddr->ds_list[i])
+			goto out_err_drain_dsaddrs;
+
+		/* If DS was already in cache, free ds addrs */
+		while (!list_empty(&dsaddrs)) {
+			da = list_first_entry(&dsaddrs,
+					      struct nfs4_pnfs_ds_addr,
+					      da_node);
+			list_del_init(&da->da_node);
+			kfree(da->da_remotestr);
+			kfree(da);
+		}
+	}
+
+	__free_page(scratch);
+	return dsaddr;
+
+out_err_drain_dsaddrs:
+	while (!list_empty(&dsaddrs)) {
+		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
+				      da_node);
+		list_del_init(&da->da_node);
+		kfree(da->da_remotestr);
+		kfree(da);
+	}
+out_err_free_deviceid:
+	nfs4_fl_free_deviceid(dsaddr);
+	/* stripe_indicies was part of dsaddr */
+	goto out_err_free_scratch;
+out_err_free_stripe_indices:
+	kfree(stripe_indices);
+out_err_free_scratch:
+	__free_page(scratch);
+out_err:
+	dprintk("%s ERROR: returning NULL\n", __func__);
+	return NULL;
+}
+
+void
+nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+	nfs4_put_deviceid_node(&dsaddr->id_node);
+}
+
+/*
+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
+ * Then: ((res + fsi) % dsaddr->stripe_count)
+ */
+u32
+nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+	u64 tmp;
+
+	tmp = offset - flseg->pattern_offset;
+	do_div(tmp, flseg->stripe_unit);
+	tmp += flseg->first_stripe_index;
+	return do_div(tmp, flseg->dsaddr->stripe_count);
+}
+
+u32
+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
+{
+	return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+}
+
+struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+	u32 i;
+
+	if (flseg->stripe_type == STRIPE_SPARSE) {
+		if (flseg->num_fh == 1)
+			i = 0;
+		else if (flseg->num_fh == 0)
+			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+			return NULL;
+		else
+			i = nfs4_fl_calc_ds_index(lseg, j);
+	} else
+		i = j;
+	return flseg->fh_array[i];
+}
+
+/* Upon return, either ds is connected, or ds is NULL */
+struct nfs4_pnfs_ds *
+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
+	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+	struct nfs4_pnfs_ds *ret = ds;
+	struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
+
+	if (ds == NULL) {
+		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+			__func__, ds_idx);
+		pnfs_generic_mark_devid_invalid(devid);
+		goto out;
+	}
+	smp_rmb();
+	if (ds->ds_clp)
+		goto out_test_devid;
+
+	nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
+			     dataserver_retrans, 4,
+			     s->nfs_client->cl_minorversion,
+			     s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+
+out_test_devid:
+	if (filelayout_test_devid_unavailable(devid))
+		ret = NULL;
+out:
+	return ret;
+}
+
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
+			"retries a request before it attempts further "
+			" recovery  action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+			"NFSv4.1  client  waits for a response from a "
+			" data server before it retries an NFS request.");
diff --git a/kernel/fs/nfs/flexfilelayout/Makefile b/kernel/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 000000000..1d2c9f6bb
--- /dev/null
+++ b/kernel/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Flexfile Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
+nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/kernel/fs/nfs/flexfilelayout/flexfilelayout.c b/kernel/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 000000000..7d05089e5
--- /dev/null
+++ b/kernel/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,1535 @@
+/*
+ * Module for pnfs flexfile layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+
+#include <linux/sunrpc/metrics.h>
+
+#include "flexfilelayout.h"
+#include "../nfs4session.h"
+#include "../nfs4idmap.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "../nfs4trace.h"
+#include "../iostat.h"
+#include "../nfs.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+#define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
+
+static struct pnfs_layout_hdr *
+ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct nfs4_flexfile_layout *ffl;
+
+	ffl = kzalloc(sizeof(*ffl), gfp_flags);
+	if (ffl) {
+		INIT_LIST_HEAD(&ffl->error_list);
+		return &ffl->generic_hdr;
+	} else
+		return NULL;
+}
+
+static void
+ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct nfs4_ff_layout_ds_err *err, *n;
+
+	list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
+				 list) {
+		list_del(&err->list);
+		kfree(err);
+	}
+	kfree(FF_LAYOUT_FROM_HDR(lo));
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
+	if (unlikely(p == NULL))
+		return -ENOBUFS;
+	memcpy(stateid, p, NFS4_STATEID_SIZE);
+	dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
+		p[0], p[1], p[2], p[3]);
+	return 0;
+}
+
+static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
+	if (unlikely(!p))
+		return -ENOBUFS;
+	memcpy(devid, p, NFS4_DEVICEID4_SIZE);
+	nfs4_print_deviceid(devid);
+	return 0;
+}
+
+static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -ENOBUFS;
+	fh->size = be32_to_cpup(p++);
+	if (fh->size > sizeof(struct nfs_fh)) {
+		printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
+		       fh->size);
+		return -EOVERFLOW;
+	}
+	/* fh.data */
+	p = xdr_inline_decode(xdr, fh->size);
+	if (unlikely(!p))
+		return -ENOBUFS;
+	memcpy(&fh->data, p, fh->size);
+	dprintk("%s: fh len %d\n", __func__, fh->size);
+
+	return 0;
+}
+
+/*
+ * Currently only stringified uids and gids are accepted.
+ * I.e., kerberos is not supported to the DSes, so no pricipals.
+ *
+ * That means that one common function will suffice, but when
+ * principals are added, this should be split to accomodate
+ * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
+ */
+static int
+decode_name(struct xdr_stream *xdr, u32 *id)
+{
+	__be32 *p;
+	int len;
+
+	/* opaque_length(4)*/
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -ENOBUFS;
+	len = be32_to_cpup(p++);
+	if (len < 0)
+		return -EINVAL;
+
+	dprintk("%s: len %u\n", __func__, len);
+
+	/* opaque body */
+	p = xdr_inline_decode(xdr, len);
+	if (unlikely(!p))
+		return -ENOBUFS;
+
+	if (!nfs_map_string_to_numeric((char *)p, len, id))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
+{
+	int i;
+
+	if (fls->mirror_array) {
+		for (i = 0; i < fls->mirror_array_cnt; i++) {
+			/* normally mirror_ds is freed in
+			 * .free_deviceid_node but we still do it here
+			 * for .alloc_lseg error path */
+			if (fls->mirror_array[i]) {
+				kfree(fls->mirror_array[i]->fh_versions);
+				nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
+				kfree(fls->mirror_array[i]);
+			}
+		}
+		kfree(fls->mirror_array);
+		fls->mirror_array = NULL;
+	}
+}
+
+static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
+{
+	int ret = 0;
+
+	dprintk("--> %s\n", __func__);
+
+	/* FIXME: remove this check when layout segment support is added */
+	if (lgr->range.offset != 0 ||
+	    lgr->range.length != NFS4_MAX_UINT64) {
+		dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
+			__func__);
+		ret = -EINVAL;
+	}
+
+	dprintk("--> %s returns %d\n", __func__, ret);
+	return ret;
+}
+
+static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
+{
+	if (fls) {
+		ff_layout_free_mirror_array(fls);
+		kfree(fls);
+	}
+}
+
+static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
+{
+	struct nfs4_ff_layout_mirror *tmp;
+	int i, j;
+
+	for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
+		for (j = i + 1; j < fls->mirror_array_cnt; j++)
+			if (fls->mirror_array[i]->efficiency <
+			    fls->mirror_array[j]->efficiency) {
+				tmp = fls->mirror_array[i];
+				fls->mirror_array[i] = fls->mirror_array[j];
+				fls->mirror_array[j] = tmp;
+			}
+	}
+}
+
+static struct pnfs_layout_segment *
+ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
+		     struct nfs4_layoutget_res *lgr,
+		     gfp_t gfp_flags)
+{
+	struct pnfs_layout_segment *ret;
+	struct nfs4_ff_layout_segment *fls = NULL;
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	u64 stripe_unit;
+	u32 mirror_array_cnt;
+	__be32 *p;
+	int i, rc;
+
+	dprintk("--> %s\n", __func__);
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		return ERR_PTR(-ENOMEM);
+
+	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
+			      lgr->layoutp->len);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	/* stripe unit and mirror_array_cnt */
+	rc = -EIO;
+	p = xdr_inline_decode(&stream, 8 + 4);
+	if (!p)
+		goto out_err_free;
+
+	p = xdr_decode_hyper(p, &stripe_unit);
+	mirror_array_cnt = be32_to_cpup(p++);
+	dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
+		stripe_unit, mirror_array_cnt);
+
+	if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
+	    mirror_array_cnt == 0)
+		goto out_err_free;
+
+	rc = -ENOMEM;
+	fls = kzalloc(sizeof(*fls), gfp_flags);
+	if (!fls)
+		goto out_err_free;
+
+	fls->mirror_array_cnt = mirror_array_cnt;
+	fls->stripe_unit = stripe_unit;
+	fls->mirror_array = kcalloc(fls->mirror_array_cnt,
+				    sizeof(fls->mirror_array[0]), gfp_flags);
+	if (fls->mirror_array == NULL)
+		goto out_err_free;
+
+	for (i = 0; i < fls->mirror_array_cnt; i++) {
+		struct nfs4_deviceid devid;
+		struct nfs4_deviceid_node *idnode;
+		u32 ds_count;
+		u32 fh_count;
+		int j;
+
+		rc = -EIO;
+		p = xdr_inline_decode(&stream, 4);
+		if (!p)
+			goto out_err_free;
+		ds_count = be32_to_cpup(p);
+
+		/* FIXME: allow for striping? */
+		if (ds_count != 1)
+			goto out_err_free;
+
+		fls->mirror_array[i] =
+			kzalloc(sizeof(struct nfs4_ff_layout_mirror),
+				gfp_flags);
+		if (fls->mirror_array[i] == NULL) {
+			rc = -ENOMEM;
+			goto out_err_free;
+		}
+
+		spin_lock_init(&fls->mirror_array[i]->lock);
+		fls->mirror_array[i]->ds_count = ds_count;
+
+		/* deviceid */
+		rc = decode_deviceid(&stream, &devid);
+		if (rc)
+			goto out_err_free;
+
+		idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
+						&devid, lh->plh_lc_cred,
+						gfp_flags);
+		/*
+		 * upon success, mirror_ds is allocated by previous
+		 * getdeviceinfo, or newly by .alloc_deviceid_node
+		 * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure
+		 */
+		if (idnode)
+			fls->mirror_array[i]->mirror_ds =
+				FF_LAYOUT_MIRROR_DS(idnode);
+		else
+			goto out_err_free;
+
+		/* efficiency */
+		rc = -EIO;
+		p = xdr_inline_decode(&stream, 4);
+		if (!p)
+			goto out_err_free;
+		fls->mirror_array[i]->efficiency = be32_to_cpup(p);
+
+		/* stateid */
+		rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+		if (rc)
+			goto out_err_free;
+
+		/* fh */
+		p = xdr_inline_decode(&stream, 4);
+		if (!p)
+			goto out_err_free;
+		fh_count = be32_to_cpup(p);
+
+		fls->mirror_array[i]->fh_versions =
+			kzalloc(fh_count * sizeof(struct nfs_fh),
+				gfp_flags);
+		if (fls->mirror_array[i]->fh_versions == NULL) {
+			rc = -ENOMEM;
+			goto out_err_free;
+		}
+
+		for (j = 0; j < fh_count; j++) {
+			rc = decode_nfs_fh(&stream,
+					   &fls->mirror_array[i]->fh_versions[j]);
+			if (rc)
+				goto out_err_free;
+		}
+
+		fls->mirror_array[i]->fh_versions_cnt = fh_count;
+
+		/* user */
+		rc = decode_name(&stream, &fls->mirror_array[i]->uid);
+		if (rc)
+			goto out_err_free;
+
+		/* group */
+		rc = decode_name(&stream, &fls->mirror_array[i]->gid);
+		if (rc)
+			goto out_err_free;
+
+		dprintk("%s: uid %d gid %d\n", __func__,
+			fls->mirror_array[i]->uid,
+			fls->mirror_array[i]->gid);
+	}
+
+	ff_layout_sort_mirrors(fls);
+	rc = ff_layout_check_layout(lgr);
+	if (rc)
+		goto out_err_free;
+
+	ret = &fls->generic_hdr;
+	dprintk("<-- %s (success)\n", __func__);
+out_free_page:
+	__free_page(scratch);
+	return ret;
+out_err_free:
+	_ff_layout_free_lseg(fls);
+	ret = ERR_PTR(rc);
+	dprintk("<-- %s (%d)\n", __func__, rc);
+	goto out_free_page;
+}
+
+static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
+{
+	struct pnfs_layout_segment *lseg;
+
+	list_for_each_entry(lseg, &layout->plh_segs, pls_list)
+		if (lseg->pls_range.iomode == IOMODE_RW)
+			return true;
+
+	return false;
+}
+
+static void
+ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+	int i;
+
+	dprintk("--> %s\n", __func__);
+
+	for (i = 0; i < fls->mirror_array_cnt; i++) {
+		if (fls->mirror_array[i]) {
+			nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
+			fls->mirror_array[i]->mirror_ds = NULL;
+			if (fls->mirror_array[i]->cred) {
+				put_rpccred(fls->mirror_array[i]->cred);
+				fls->mirror_array[i]->cred = NULL;
+			}
+		}
+	}
+
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		struct nfs4_flexfile_layout *ffl;
+		struct inode *inode;
+
+		ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
+		inode = ffl->generic_hdr.plh_inode;
+		spin_lock(&inode->i_lock);
+		if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
+			ffl->commit_info.nbuckets = 0;
+			kfree(ffl->commit_info.buckets);
+			ffl->commit_info.buckets = NULL;
+		}
+		spin_unlock(&inode->i_lock);
+	}
+	_ff_layout_free_lseg(fls);
+}
+
+/* Return 1 until we have multiple lsegs support */
+static int
+ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
+{
+	return 1;
+}
+
+static int
+ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
+			    struct nfs_commit_info *cinfo,
+			    gfp_t gfp_flags)
+{
+	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+	struct pnfs_commit_bucket *buckets;
+	int size;
+
+	if (cinfo->ds->nbuckets != 0) {
+		/* This assumes there is only one RW lseg per file.
+		 * To support multiple lseg per file, we need to
+		 * change struct pnfs_commit_bucket to allow dynamic
+		 * increasing nbuckets.
+		 */
+		return 0;
+	}
+
+	size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
+
+	buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
+			  gfp_flags);
+	if (!buckets)
+		return -ENOMEM;
+	else {
+		int i;
+
+		spin_lock(cinfo->lock);
+		if (cinfo->ds->nbuckets != 0)
+			kfree(buckets);
+		else {
+			cinfo->ds->buckets = buckets;
+			cinfo->ds->nbuckets = size;
+			for (i = 0; i < size; i++) {
+				INIT_LIST_HEAD(&buckets[i].written);
+				INIT_LIST_HEAD(&buckets[i].committing);
+				/* mark direct verifier as unset */
+				buckets[i].direct_verf.committed =
+					NFS_INVALID_STABLE_HOW;
+			}
+		}
+		spin_unlock(cinfo->lock);
+		return 0;
+	}
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+				  int *best_idx)
+{
+	struct nfs4_ff_layout_segment *fls;
+	struct nfs4_pnfs_ds *ds;
+	int idx;
+
+	fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
+	/* mirrors are sorted by efficiency */
+	for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
+		ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+		if (ds) {
+			*best_idx = idx;
+			return ds;
+		}
+	}
+
+	return NULL;
+}
+
+static void
+ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+			struct nfs_page *req)
+{
+	struct nfs_pgio_mirror *pgm;
+	struct nfs4_ff_layout_mirror *mirror;
+	struct nfs4_pnfs_ds *ds;
+	int ds_idx;
+
+	/* Use full layout for now */
+	if (!pgio->pg_lseg)
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   req->wb_context,
+						   0,
+						   NFS4_MAX_UINT64,
+						   IOMODE_READ,
+						   GFP_KERNEL);
+	/* If no lseg, fall back to read through mds */
+	if (pgio->pg_lseg == NULL)
+		goto out_mds;
+
+	ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+	if (!ds)
+		goto out_mds;
+	mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
+
+	pgio->pg_mirror_idx = ds_idx;
+
+	/* read always uses only one mirror - idx 0 for pgio layer */
+	pgm = &pgio->pg_mirrors[0];
+	pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+
+	return;
+out_mds:
+	pnfs_put_lseg(pgio->pg_lseg);
+	pgio->pg_lseg = NULL;
+	nfs_pageio_reset_read_mds(pgio);
+}
+
+static void
+ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			struct nfs_page *req)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	struct nfs_pgio_mirror *pgm;
+	struct nfs_commit_info cinfo;
+	struct nfs4_pnfs_ds *ds;
+	int i;
+	int status;
+
+	if (!pgio->pg_lseg)
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   req->wb_context,
+						   0,
+						   NFS4_MAX_UINT64,
+						   IOMODE_RW,
+						   GFP_NOFS);
+	/* If no lseg, fall back to write through mds */
+	if (pgio->pg_lseg == NULL)
+		goto out_mds;
+
+	nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
+	status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
+	if (status < 0)
+		goto out_mds;
+
+	/* Use a direct mapping of ds_idx to pgio mirror_idx */
+	if (WARN_ON_ONCE(pgio->pg_mirror_count !=
+	    FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
+		goto out_mds;
+
+	for (i = 0; i < pgio->pg_mirror_count; i++) {
+		ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
+		if (!ds)
+			goto out_mds;
+		pgm = &pgio->pg_mirrors[i];
+		mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+		pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
+	}
+
+	return;
+
+out_mds:
+	pnfs_put_lseg(pgio->pg_lseg);
+	pgio->pg_lseg = NULL;
+	nfs_pageio_reset_write_mds(pgio);
+}
+
+static unsigned int
+ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
+				    struct nfs_page *req)
+{
+	if (!pgio->pg_lseg)
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   req->wb_context,
+						   0,
+						   NFS4_MAX_UINT64,
+						   IOMODE_RW,
+						   GFP_NOFS);
+	if (pgio->pg_lseg)
+		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
+
+	/* no lseg means that pnfs is not in use, so no mirroring here */
+	pnfs_put_lseg(pgio->pg_lseg);
+	pgio->pg_lseg = NULL;
+	nfs_pageio_reset_write_mds(pgio);
+	return 1;
+}
+
+static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
+	.pg_init = ff_layout_pg_init_read,
+	.pg_test = pnfs_generic_pg_test,
+	.pg_doio = pnfs_generic_pg_readpages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
+	.pg_init = ff_layout_pg_init_write,
+	.pg_test = pnfs_generic_pg_test,
+	.pg_doio = pnfs_generic_pg_writepages,
+	.pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
+{
+	struct rpc_task *task = &hdr->task;
+
+	pnfs_layoutcommit_inode(hdr->inode, false);
+
+	if (retry_pnfs) {
+		dprintk("%s Reset task %5u for i/o through pNFS "
+			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+			hdr->task.tk_pid,
+			hdr->inode->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(hdr->inode),
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
+
+		if (!hdr->dreq) {
+			struct nfs_open_context *ctx;
+
+			ctx = nfs_list_entry(hdr->pages.next)->wb_context;
+			set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
+			hdr->completion_ops->error_cleanup(&hdr->pages);
+		} else {
+			nfs_direct_set_resched_writes(hdr->dreq);
+			/* fake unstable write to let common nfs resend pages */
+			hdr->verf.committed = NFS_UNSTABLE;
+			hdr->good_bytes = 0;
+		}
+		return;
+	}
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		dprintk("%s Reset task %5u for i/o through MDS "
+			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+			hdr->task.tk_pid,
+			hdr->inode->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(hdr->inode),
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
+
+		task->tk_status = pnfs_write_done_resend_to_mds(hdr);
+	}
+}
+
+static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
+{
+	struct rpc_task *task = &hdr->task;
+
+	pnfs_layoutcommit_inode(hdr->inode, false);
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		dprintk("%s Reset task %5u for i/o through MDS "
+			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+			hdr->task.tk_pid,
+			hdr->inode->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(hdr->inode),
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
+
+		task->tk_status = pnfs_read_done_resend_to_mds(hdr);
+	}
+}
+
+static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+					   struct nfs4_state *state,
+					   struct nfs_client *clp,
+					   struct pnfs_layout_segment *lseg,
+					   int idx)
+{
+	struct pnfs_layout_hdr *lo = lseg->pls_layout;
+	struct inode *inode = lo->plh_inode;
+	struct nfs_server *mds_server = NFS_SERVER(inode);
+
+	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+	struct nfs_client *mds_client = mds_server->nfs_client;
+	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+	if (task->tk_status >= 0)
+		return 0;
+
+	switch (task->tk_status) {
+	/* MDS state errors */
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+		if (state == NULL)
+			break;
+		nfs_remove_bad_delegation(state->inode);
+	case -NFS4ERR_OPENMODE:
+		if (state == NULL)
+			break;
+		if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+			goto out_bad_stateid;
+		goto wait_on_recovery;
+	case -NFS4ERR_EXPIRED:
+		if (state != NULL) {
+			if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+				goto out_bad_stateid;
+		}
+		nfs4_schedule_lease_recovery(mds_client);
+		goto wait_on_recovery;
+	/* DS session errors */
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_DEADSESSION:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_SEQ_FALSE_RETRY:
+	case -NFS4ERR_SEQ_MISORDERED:
+		dprintk("%s ERROR %d, Reset session. Exchangeid "
+			"flags 0x%x\n", __func__, task->tk_status,
+			clp->cl_exchange_flags);
+		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+		break;
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+		rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
+		break;
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		break;
+	/* Invalidate Layout errors */
+	case -NFS4ERR_PNFS_NO_LAYOUT:
+	case -ESTALE:           /* mapped NFS4ERR_STALE */
+	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
+	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
+	case -NFS4ERR_FHEXPIRED:
+	case -NFS4ERR_WRONG_TYPE:
+		dprintk("%s Invalid layout error %d\n", __func__,
+			task->tk_status);
+		/*
+		 * Destroy layout so new i/o will get a new layout.
+		 * Layout will not be destroyed until all current lseg
+		 * references are put. Mark layout as invalid to resend failed
+		 * i/o and all i/o waiting on the slot table to the MDS until
+		 * layout is destroyed and a new valid layout is obtained.
+		 */
+		pnfs_destroy_layout(NFS_I(inode));
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		goto reset;
+	/* RPC connection errors */
+	case -ECONNREFUSED:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -EIO:
+	case -ETIMEDOUT:
+	case -EPIPE:
+		dprintk("%s DS connection error %d\n", __func__,
+			task->tk_status);
+		nfs4_mark_deviceid_unavailable(devid);
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		/* fall through */
+	default:
+		if (ff_layout_has_available_ds(lseg))
+			return -NFS4ERR_RESET_TO_PNFS;
+reset:
+		dprintk("%s Retry through MDS. Error %d\n", __func__,
+			task->tk_status);
+		return -NFS4ERR_RESET_TO_MDS;
+	}
+out:
+	task->tk_status = 0;
+	return -EAGAIN;
+out_bad_stateid:
+	task->tk_status = -EIO;
+	return 0;
+wait_on_recovery:
+	rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
+	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
+		rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
+	goto out;
+}
+
+/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
+static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+					   struct pnfs_layout_segment *lseg,
+					   int idx)
+{
+	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+	if (task->tk_status >= 0)
+		return 0;
+
+	if (task->tk_status != -EJUKEBOX) {
+		dprintk("%s DS connection error %d\n", __func__,
+			task->tk_status);
+		nfs4_mark_deviceid_unavailable(devid);
+		if (ff_layout_has_available_ds(lseg))
+			return -NFS4ERR_RESET_TO_PNFS;
+		else
+			return -NFS4ERR_RESET_TO_MDS;
+	}
+
+	if (task->tk_status == -EJUKEBOX)
+		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+	task->tk_status = 0;
+	rpc_restart_call(task);
+	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+	return -EAGAIN;
+}
+
+static int ff_layout_async_handle_error(struct rpc_task *task,
+					struct nfs4_state *state,
+					struct nfs_client *clp,
+					struct pnfs_layout_segment *lseg,
+					int idx)
+{
+	int vers = clp->cl_nfs_mod->rpc_vers->number;
+
+	switch (vers) {
+	case 3:
+		return ff_layout_async_handle_error_v3(task, lseg, idx);
+	case 4:
+		return ff_layout_async_handle_error_v4(task, state, clp,
+						       lseg, idx);
+	default:
+		/* should never happen */
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+}
+
+static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
+					int idx, u64 offset, u64 length,
+					u32 status, int opnum)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	int err;
+
+	mirror = FF_LAYOUT_COMP(lseg, idx);
+	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+				       mirror, offset, length, status, opnum,
+				       GFP_NOIO);
+	dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int ff_layout_read_done_cb(struct rpc_task *task,
+				struct nfs_pgio_header *hdr)
+{
+	struct inode *inode;
+	int err;
+
+	trace_nfs4_pnfs_read(hdr, task->tk_status);
+	if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
+		hdr->res.op_status = NFS4ERR_NXIO;
+	if (task->tk_status < 0 && hdr->res.op_status)
+		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+					    hdr->args.offset, hdr->args.count,
+					    hdr->res.op_status, OP_READ);
+	err = ff_layout_async_handle_error(task, hdr->args.context->state,
+					   hdr->ds_clp, hdr->lseg,
+					   hdr->pgio_mirror_idx);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_PNFS:
+		set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+			&hdr->lseg->pls_layout->plh_flags);
+		pnfs_read_resend_pnfs(hdr);
+		return task->tk_status;
+	case -NFS4ERR_RESET_TO_MDS:
+		inode = hdr->lseg->pls_layout->plh_inode;
+		pnfs_error_mark_layout_for_return(inode, hdr->lseg);
+		ff_layout_reset_read(hdr);
+		return task->tk_status;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+/*
+ * We reference the rpc_cred of the first WRITE that triggers the need for
+ * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
+ * rfc5661 is not clear about which credential should be used.
+ *
+ * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
+ * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
+ * we always send layoutcommit after DS writes.
+ */
+static void
+ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
+{
+	pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+			hdr->mds_offset + hdr->res.count);
+	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+}
+
+static bool
+ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
+{
+	/* No mirroring for now */
+	struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+	return ff_layout_test_devid_unavailable(node);
+}
+
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+					 struct nfs_pgio_header *hdr)
+{
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+		rpc_exit(task, -EIO);
+		return -EIO;
+	}
+	if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		if (ff_layout_has_available_ds(hdr->lseg))
+			pnfs_read_resend_pnfs(hdr);
+		else
+			ff_layout_reset_read(hdr);
+		rpc_exit(task, 0);
+		return -EAGAIN;
+	}
+	hdr->pgio_done_cb = ff_layout_read_done_cb;
+
+	return 0;
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (ff_layout_read_prepare_common(task, hdr))
+		return;
+
+	rpc_call_start(task);
+}
+
+static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
+				    struct nfs4_sequence_args *args,
+				    struct nfs4_sequence_res *res,
+				    struct rpc_task *task)
+{
+	if (ds_clp->cl_session)
+		return nfs41_setup_sequence(ds_clp->cl_session,
+					   args,
+					   res,
+					   task);
+	return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
+				   args,
+				   res,
+				   task);
+}
+
+static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (ff_layout_read_prepare_common(task, hdr))
+		return;
+
+	if (ff_layout_setup_sequence(hdr->ds_clp,
+				     &hdr->args.seq_args,
+				     &hdr->res.seq_res,
+				     task))
+		return;
+
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, FMODE_READ) == -EIO)
+		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void ff_layout_read_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+	    task->tk_status == 0) {
+		nfs4_sequence_done(task, &hdr->res.seq_res);
+		return;
+	}
+
+	/* Note this may cause RPC to be resent */
+	hdr->mds_ops->rpc_call_done(task, hdr);
+}
+
+static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	rpc_count_iostats_metrics(task,
+	    &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
+}
+
+static int ff_layout_write_done_cb(struct rpc_task *task,
+				struct nfs_pgio_header *hdr)
+{
+	struct inode *inode;
+	int err;
+
+	trace_nfs4_pnfs_write(hdr, task->tk_status);
+	if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
+		hdr->res.op_status = NFS4ERR_NXIO;
+	if (task->tk_status < 0 && hdr->res.op_status)
+		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+					    hdr->args.offset, hdr->args.count,
+					    hdr->res.op_status, OP_WRITE);
+	err = ff_layout_async_handle_error(task, hdr->args.context->state,
+					   hdr->ds_clp, hdr->lseg,
+					   hdr->pgio_mirror_idx);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_PNFS:
+	case -NFS4ERR_RESET_TO_MDS:
+		inode = hdr->lseg->pls_layout->plh_inode;
+		pnfs_error_mark_layout_for_return(inode, hdr->lseg);
+		if (err == -NFS4ERR_RESET_TO_PNFS) {
+			pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+			ff_layout_reset_write(hdr, true);
+		} else {
+			pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+			ff_layout_reset_write(hdr, false);
+		}
+		return task->tk_status;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	if (hdr->res.verf->committed == NFS_FILE_SYNC ||
+	    hdr->res.verf->committed == NFS_DATA_SYNC)
+		ff_layout_set_layoutcommit(hdr);
+
+	return 0;
+}
+
+static int ff_layout_commit_done_cb(struct rpc_task *task,
+				     struct nfs_commit_data *data)
+{
+	struct inode *inode;
+	int err;
+
+	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
+	if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
+		data->res.op_status = NFS4ERR_NXIO;
+	if (task->tk_status < 0 && data->res.op_status)
+		ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
+					    data->args.offset, data->args.count,
+					    data->res.op_status, OP_COMMIT);
+	err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
+					   data->lseg, data->ds_commit_index);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_PNFS:
+	case -NFS4ERR_RESET_TO_MDS:
+		inode = data->lseg->pls_layout->plh_inode;
+		pnfs_error_mark_layout_for_return(inode, data->lseg);
+		if (err == -NFS4ERR_RESET_TO_PNFS)
+			pnfs_set_retry_layoutget(data->lseg->pls_layout);
+		else
+			pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+		pnfs_generic_prepare_to_resend_writes(data);
+		return -EAGAIN;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	if (data->verf.committed == NFS_UNSTABLE)
+		pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+
+	return 0;
+}
+
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+					  struct nfs_pgio_header *hdr)
+{
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+		rpc_exit(task, -EIO);
+		return -EIO;
+	}
+
+	if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
+		bool retry_pnfs;
+
+		retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
+		dprintk("%s task %u reset io to %s\n", __func__,
+			task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
+		ff_layout_reset_write(hdr, retry_pnfs);
+		rpc_exit(task, 0);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (ff_layout_write_prepare_common(task, hdr))
+		return;
+
+	rpc_call_start(task);
+}
+
+static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (ff_layout_write_prepare_common(task, hdr))
+		return;
+
+	if (ff_layout_setup_sequence(hdr->ds_clp,
+				     &hdr->args.seq_args,
+				     &hdr->res.seq_res,
+				     task))
+		return;
+
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, FMODE_WRITE) == -EIO)
+		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void ff_layout_write_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+	    task->tk_status == 0) {
+		nfs4_sequence_done(task, &hdr->res.seq_res);
+		return;
+	}
+
+	/* Note this may cause RPC to be resent */
+	hdr->mds_ops->rpc_call_done(task, hdr);
+}
+
+static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	rpc_count_iostats_metrics(task,
+	    &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
+}
+
+static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
+{
+	rpc_call_start(task);
+}
+
+static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *wdata = data;
+
+	ff_layout_setup_sequence(wdata->ds_clp,
+				 &wdata->args.seq_args,
+				 &wdata->res.seq_res,
+				 task);
+}
+
+static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+
+	rpc_count_iostats_metrics(task,
+	    &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
+}
+
+static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
+	.rpc_call_prepare = ff_layout_read_prepare_v3,
+	.rpc_call_done = ff_layout_read_call_done,
+	.rpc_count_stats = ff_layout_read_count_stats,
+	.rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
+	.rpc_call_prepare = ff_layout_read_prepare_v4,
+	.rpc_call_done = ff_layout_read_call_done,
+	.rpc_count_stats = ff_layout_read_count_stats,
+	.rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
+	.rpc_call_prepare = ff_layout_write_prepare_v3,
+	.rpc_call_done = ff_layout_write_call_done,
+	.rpc_count_stats = ff_layout_write_count_stats,
+	.rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
+	.rpc_call_prepare = ff_layout_write_prepare_v4,
+	.rpc_call_done = ff_layout_write_call_done,
+	.rpc_count_stats = ff_layout_write_count_stats,
+	.rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
+	.rpc_call_prepare = ff_layout_commit_prepare_v3,
+	.rpc_call_done = pnfs_generic_write_commit_done,
+	.rpc_count_stats = ff_layout_commit_count_stats,
+	.rpc_release = pnfs_generic_commit_release,
+};
+
+static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
+	.rpc_call_prepare = ff_layout_commit_prepare_v4,
+	.rpc_call_done = pnfs_generic_write_commit_done,
+	.rpc_count_stats = ff_layout_commit_count_stats,
+	.rpc_release = pnfs_generic_commit_release,
+};
+
+static enum pnfs_try_status
+ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+	struct pnfs_layout_segment *lseg = hdr->lseg;
+	struct nfs4_pnfs_ds *ds;
+	struct rpc_clnt *ds_clnt;
+	struct rpc_cred *ds_cred;
+	loff_t offset = hdr->args.offset;
+	u32 idx = hdr->pgio_mirror_idx;
+	int vers;
+	struct nfs_fh *fh;
+
+	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+		__func__, hdr->inode->i_ino,
+		hdr->args.pgbase, (size_t)hdr->args.count, offset);
+
+	ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+	if (!ds)
+		goto out_failed;
+
+	ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+						   hdr->inode);
+	if (IS_ERR(ds_clnt))
+		goto out_failed;
+
+	ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+	if (IS_ERR(ds_cred))
+		goto out_failed;
+
+	vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+	dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
+		ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
+
+	atomic_inc(&ds->ds_clp->cl_count);
+	hdr->ds_clp = ds->ds_clp;
+	fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+	if (fh)
+		hdr->args.fh = fh;
+
+	/*
+	 * Note that if we ever decide to split across DSes,
+	 * then we may need to handle dense-like offsets.
+	 */
+	hdr->args.offset = offset;
+	hdr->mds_offset = offset;
+
+	/* Perform an asynchronous read to ds */
+	nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+			  vers == 3 ? &ff_layout_read_call_ops_v3 :
+				      &ff_layout_read_call_ops_v4,
+			  0, RPC_TASK_SOFTCONN);
+
+	return PNFS_ATTEMPTED;
+
+out_failed:
+	if (ff_layout_has_available_ds(lseg))
+		return PNFS_TRY_AGAIN;
+	return PNFS_NOT_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
+{
+	struct pnfs_layout_segment *lseg = hdr->lseg;
+	struct nfs4_pnfs_ds *ds;
+	struct rpc_clnt *ds_clnt;
+	struct rpc_cred *ds_cred;
+	loff_t offset = hdr->args.offset;
+	int vers;
+	struct nfs_fh *fh;
+	int idx = hdr->pgio_mirror_idx;
+
+	ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+	if (!ds)
+		return PNFS_NOT_ATTEMPTED;
+
+	ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+						   hdr->inode);
+	if (IS_ERR(ds_clnt))
+		return PNFS_NOT_ATTEMPTED;
+
+	ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+	if (IS_ERR(ds_cred))
+		return PNFS_NOT_ATTEMPTED;
+
+	vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
+		__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
+		offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
+		vers);
+
+	hdr->pgio_done_cb = ff_layout_write_done_cb;
+	atomic_inc(&ds->ds_clp->cl_count);
+	hdr->ds_clp = ds->ds_clp;
+	hdr->ds_commit_idx = idx;
+	fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+	if (fh)
+		hdr->args.fh = fh;
+
+	/*
+	 * Note that if we ever decide to split across DSes,
+	 * then we may need to handle dense-like offsets.
+	 */
+	hdr->args.offset = offset;
+
+	/* Perform an asynchronous write */
+	nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+			  vers == 3 ? &ff_layout_write_call_ops_v3 :
+				      &ff_layout_write_call_ops_v4,
+			  sync, RPC_TASK_SOFTCONN);
+	return PNFS_ATTEMPTED;
+}
+
+static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+	return i;
+}
+
+static struct nfs_fh *
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+	struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+
+	/* FIXME: Assume that there is only one NFS version available
+	 * for the DS.
+	 */
+	return &flseg->mirror_array[i]->fh_versions[0];
+}
+
+static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
+{
+	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs4_pnfs_ds *ds;
+	struct rpc_clnt *ds_clnt;
+	struct rpc_cred *ds_cred;
+	u32 idx;
+	int vers;
+	struct nfs_fh *fh;
+
+	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+	ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+	if (!ds)
+		goto out_err;
+
+	ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+						   data->inode);
+	if (IS_ERR(ds_clnt))
+		goto out_err;
+
+	ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
+	if (IS_ERR(ds_cred))
+		goto out_err;
+
+	vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+	dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
+		data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
+		vers);
+	data->commit_done_cb = ff_layout_commit_done_cb;
+	data->cred = ds_cred;
+	atomic_inc(&ds->ds_clp->cl_count);
+	data->ds_clp = ds->ds_clp;
+	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+	if (fh)
+		data->args.fh = fh;
+	return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+				   vers == 3 ? &ff_layout_commit_call_ops_v3 :
+					       &ff_layout_commit_call_ops_v4,
+				   how, RPC_TASK_SOFTCONN);
+out_err:
+	pnfs_generic_prepare_to_resend_writes(data);
+	pnfs_generic_commit_release(data);
+	return -EAGAIN;
+}
+
+static int
+ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+			   int how, struct nfs_commit_info *cinfo)
+{
+	return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+					    ff_layout_initiate_commit);
+}
+
+static struct pnfs_ds_commit_info *
+ff_layout_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+	if (layout == NULL)
+		return NULL;
+
+	return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
+}
+
+static void
+ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
+						  id_node));
+}
+
+static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
+				  struct xdr_stream *xdr,
+				  const struct nfs4_layoutreturn_args *args)
+{
+	struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
+	__be32 *start;
+	int count = 0, ret = 0;
+
+	start = xdr_reserve_space(xdr, 4);
+	if (unlikely(!start))
+		return -E2BIG;
+
+	/* This assume we always return _ALL_ layouts */
+	spin_lock(&hdr->plh_inode->i_lock);
+	ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
+	spin_unlock(&hdr->plh_inode->i_lock);
+
+	*start = cpu_to_be32(count);
+
+	return ret;
+}
+
+/* report nothing for now */
+static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_layoutreturn_args *args)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (likely(p))
+		*p = cpu_to_be32(0);
+}
+
+static struct nfs4_deviceid_node *
+ff_layout_alloc_deviceid_node(struct nfs_server *server,
+			      struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+	struct nfs4_ff_layout_ds *dsaddr;
+
+	dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
+	if (!dsaddr)
+		return NULL;
+	return &dsaddr->id_node;
+}
+
+static void
+ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
+			      struct xdr_stream *xdr,
+			      const struct nfs4_layoutreturn_args *args)
+{
+	struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+	__be32 *start;
+
+	dprintk("%s: Begin\n", __func__);
+	start = xdr_reserve_space(xdr, 4);
+	BUG_ON(!start);
+
+	if (ff_layout_encode_ioerr(flo, xdr, args))
+		goto out;
+
+	ff_layout_encode_iostats(flo, xdr, args);
+out:
+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
+	dprintk("%s: Return\n", __func__);
+}
+
+static struct pnfs_layoutdriver_type flexfilelayout_type = {
+	.id			= LAYOUT_FLEX_FILES,
+	.name			= "LAYOUT_FLEX_FILES",
+	.owner			= THIS_MODULE,
+	.alloc_layout_hdr	= ff_layout_alloc_layout_hdr,
+	.free_layout_hdr	= ff_layout_free_layout_hdr,
+	.alloc_lseg		= ff_layout_alloc_lseg,
+	.free_lseg		= ff_layout_free_lseg,
+	.pg_read_ops		= &ff_layout_pg_read_ops,
+	.pg_write_ops		= &ff_layout_pg_write_ops,
+	.get_ds_info		= ff_layout_get_ds_info,
+	.free_deviceid_node	= ff_layout_free_deviceid_node,
+	.mark_request_commit	= pnfs_layout_mark_request_commit,
+	.clear_request_commit	= pnfs_generic_clear_request_commit,
+	.scan_commit_lists	= pnfs_generic_scan_commit_lists,
+	.recover_commit_reqs	= pnfs_generic_recover_commit_reqs,
+	.commit_pagelist	= ff_layout_commit_pagelist,
+	.read_pagelist		= ff_layout_read_pagelist,
+	.write_pagelist		= ff_layout_write_pagelist,
+	.alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
+	.encode_layoutreturn    = ff_layout_encode_layoutreturn,
+	.sync			= pnfs_nfs_generic_sync,
+};
+
+static int __init nfs4flexfilelayout_init(void)
+{
+	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
+	       __func__);
+	return pnfs_register_layoutdriver(&flexfilelayout_type);
+}
+
+static void __exit nfs4flexfilelayout_exit(void)
+{
+	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
+	       __func__);
+	pnfs_unregister_layoutdriver(&flexfilelayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-4");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
+
+module_init(nfs4flexfilelayout_init);
+module_exit(nfs4flexfilelayout_exit);
diff --git a/kernel/fs/nfs/flexfilelayout/flexfilelayout.h b/kernel/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 000000000..070f20445
--- /dev/null
+++ b/kernel/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,155 @@
+/*
+ * NFSv4 flexfile layout driver data structures.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
+#define FS_NFS_NFS4FLEXFILELAYOUT_H
+
+#include "../pnfs.h"
+
+/* XXX: Let's filter out insanely large mirror count for now to avoid oom
+ * due to network error etc. */
+#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+
+struct nfs4_ff_ds_version {
+	u32				version;
+	u32				minor_version;
+	u32				rsize;
+	u32				wsize;
+	bool				tightly_coupled;
+};
+
+/* chained in global deviceid hlist */
+struct nfs4_ff_layout_ds {
+	struct nfs4_deviceid_node	id_node;
+	u32				ds_versions_cnt;
+	struct nfs4_ff_ds_version	*ds_versions;
+	struct nfs4_pnfs_ds		*ds;
+};
+
+struct nfs4_ff_layout_ds_err {
+	struct list_head		list; /* linked in mirror error_list */
+	u64				offset;
+	u64				length;
+	int				status;
+	enum nfs_opnum4			opnum;
+	nfs4_stateid			stateid;
+	struct nfs4_deviceid		deviceid;
+};
+
+struct nfs4_ff_layout_mirror {
+	u32				ds_count;
+	u32				efficiency;
+	struct nfs4_ff_layout_ds	*mirror_ds;
+	u32				fh_versions_cnt;
+	struct nfs_fh			*fh_versions;
+	nfs4_stateid			stateid;
+	struct nfs4_string		user_name;
+	struct nfs4_string		group_name;
+	u32				uid;
+	u32				gid;
+	struct rpc_cred			*cred;
+	spinlock_t			lock;
+};
+
+struct nfs4_ff_layout_segment {
+	struct pnfs_layout_segment	generic_hdr;
+	u64				stripe_unit;
+	u32				mirror_array_cnt;
+	struct nfs4_ff_layout_mirror	**mirror_array;
+};
+
+struct nfs4_flexfile_layout {
+	struct pnfs_layout_hdr generic_hdr;
+	struct pnfs_ds_commit_info commit_info;
+	struct list_head	error_list; /* nfs4_ff_layout_ds_err */
+};
+
+static inline struct nfs4_flexfile_layout *
+FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+	return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
+}
+
+static inline struct nfs4_ff_layout_segment *
+FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+	return container_of(lseg,
+			    struct nfs4_ff_layout_segment,
+			    generic_hdr);
+}
+
+static inline struct nfs4_deviceid_node *
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+{
+	if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
+	    FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
+	    FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
+		return NULL;
+	return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
+}
+
+static inline struct nfs4_ff_layout_ds *
+FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
+{
+	return container_of(node, struct nfs4_ff_layout_ds, id_node);
+}
+
+static inline struct nfs4_ff_layout_mirror *
+FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
+{
+	if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
+		return NULL;
+	return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
+}
+
+static inline u32
+FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
+{
+	return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
+}
+
+static inline bool
+ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
+{
+	return nfs4_test_deviceid_unavailable(node);
+}
+
+static inline int
+nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
+{
+	return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
+}
+
+struct nfs4_ff_layout_ds *
+nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+			    gfp_t gfp_flags);
+void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
+void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
+int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
+			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
+			     u64 length, int status, enum nfs_opnum4 opnum,
+			     gfp_t gfp_flags);
+int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
+			      struct xdr_stream *xdr, int *count,
+			      const struct pnfs_layout_range *range);
+struct nfs_fh *
+nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
+
+struct nfs4_pnfs_ds *
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+			  bool fail_return);
+
+struct rpc_clnt *
+nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
+				 u32 ds_idx,
+				 struct nfs_client *ds_clp,
+				 struct inode *inode);
+struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
+				       u32 ds_idx, struct rpc_cred *mdscred);
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 000000000..77a2d026a
--- /dev/null
+++ b/kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,552 @@
+/*
+ * Device operations for the pnfs nfs4 file layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/sunrpc/addr.h>
+
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "flexfilelayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+
+void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
+{
+	if (mirror_ds)
+		nfs4_put_deviceid_node(&mirror_ds->id_node);
+}
+
+void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
+{
+	nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
+	nfs4_pnfs_ds_put(mirror_ds->ds);
+	kfree_rcu(mirror_ds, id_node.rcu);
+}
+
+/* Decode opaque device data and construct new_ds using it */
+struct nfs4_ff_layout_ds *
+nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+			    gfp_t gfp_flags)
+{
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	struct list_head dsaddrs;
+	struct nfs4_pnfs_ds_addr *da;
+	struct nfs4_ff_layout_ds *new_ds = NULL;
+	struct nfs4_ff_ds_version *ds_versions = NULL;
+	u32 mp_count;
+	u32 version_count;
+	__be32 *p;
+	int i, ret = -ENOMEM;
+
+	/* set up xdr stream */
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		goto out_err;
+
+	new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
+	if (!new_ds)
+		goto out_scratch;
+
+	nfs4_init_deviceid_node(&new_ds->id_node,
+				server,
+				&pdev->dev_id);
+	INIT_LIST_HEAD(&dsaddrs);
+
+	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	/* multipath count */
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err_drain_dsaddrs;
+	mp_count = be32_to_cpup(p);
+	dprintk("%s: multipath ds count %d\n", __func__, mp_count);
+
+	for (i = 0; i < mp_count; i++) {
+		/* multipath ds */
+		da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
+					    &stream, gfp_flags);
+		if (da)
+			list_add_tail(&da->da_node, &dsaddrs);
+	}
+	if (list_empty(&dsaddrs)) {
+		dprintk("%s: no suitable DS addresses found\n",
+			__func__);
+		ret = -ENOMEDIUM;
+		goto out_err_drain_dsaddrs;
+	}
+
+	/* version count */
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err_drain_dsaddrs;
+	version_count = be32_to_cpup(p);
+	dprintk("%s: version count %d\n", __func__, version_count);
+
+	ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version),
+			      gfp_flags);
+	if (!ds_versions)
+		goto out_scratch;
+
+	for (i = 0; i < version_count; i++) {
+		/* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
+		 * tightly_coupled(4) */
+		p = xdr_inline_decode(&stream, 20);
+		if (unlikely(!p))
+			goto out_err_drain_dsaddrs;
+		ds_versions[i].version = be32_to_cpup(p++);
+		ds_versions[i].minor_version = be32_to_cpup(p++);
+		ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
+		ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
+		ds_versions[i].tightly_coupled = be32_to_cpup(p);
+
+		if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
+			ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
+		if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
+			ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
+
+		if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
+			dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
+				i, ds_versions[i].version,
+				ds_versions[i].minor_version);
+			ret = -EPROTONOSUPPORT;
+			goto out_err_drain_dsaddrs;
+		}
+
+		dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
+			__func__, i, ds_versions[i].version,
+			ds_versions[i].minor_version,
+			ds_versions[i].rsize,
+			ds_versions[i].wsize,
+			ds_versions[i].tightly_coupled);
+	}
+
+	new_ds->ds_versions = ds_versions;
+	new_ds->ds_versions_cnt = version_count;
+
+	new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+	if (!new_ds->ds)
+		goto out_err_drain_dsaddrs;
+
+	/* If DS was already in cache, free ds addrs */
+	while (!list_empty(&dsaddrs)) {
+		da = list_first_entry(&dsaddrs,
+				      struct nfs4_pnfs_ds_addr,
+				      da_node);
+		list_del_init(&da->da_node);
+		kfree(da->da_remotestr);
+		kfree(da);
+	}
+
+	__free_page(scratch);
+	return new_ds;
+
+out_err_drain_dsaddrs:
+	while (!list_empty(&dsaddrs)) {
+		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
+				      da_node);
+		list_del_init(&da->da_node);
+		kfree(da->da_remotestr);
+		kfree(da);
+	}
+
+	kfree(ds_versions);
+out_scratch:
+	__free_page(scratch);
+out_err:
+	kfree(new_ds);
+
+	dprintk("%s ERROR: returning %d\n", __func__, ret);
+	return NULL;
+}
+
+static u64
+end_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	end = start + len;
+	return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
+			    u64 offset, u64 length)
+{
+	u64 end;
+
+	end = max_t(u64, end_offset(err->offset, err->length),
+		    end_offset(offset, length));
+	err->offset = min_t(u64, err->offset, offset);
+	err->length = end - err->offset;
+}
+
+static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err,  u64 offset,
+			       u64 length, int status, enum nfs_opnum4 opnum,
+			       nfs4_stateid *stateid,
+			       struct nfs4_deviceid *deviceid)
+{
+	return err->status == status && err->opnum == opnum &&
+	       nfs4_stateid_match(&err->stateid, stateid) &&
+	       !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
+	       end_offset(err->offset, err->length) >= offset &&
+	       err->offset <= end_offset(offset, length);
+}
+
+static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
+			   struct nfs4_ff_layout_ds_err *new)
+{
+	if (!ds_error_can_merge(old, new->offset, new->length, new->status,
+				new->opnum, &new->stateid, &new->deviceid))
+		return false;
+
+	extend_ds_error(old, new->offset, new->length);
+	return true;
+}
+
+static bool
+ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
+			      struct nfs4_ff_layout_ds_err *dserr)
+{
+	struct nfs4_ff_layout_ds_err *err;
+
+	list_for_each_entry(err, &flo->error_list, list) {
+		if (merge_ds_error(err, dserr)) {
+			return true;
+		}
+	}
+
+	list_add(&dserr->list, &flo->error_list);
+	return false;
+}
+
+static bool
+ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
+			  u64 length, int status, enum nfs_opnum4 opnum,
+			  nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
+{
+	bool found = false;
+	struct nfs4_ff_layout_ds_err *err;
+
+	list_for_each_entry(err, &flo->error_list, list) {
+		if (ds_error_can_merge(err, offset, length, status, opnum,
+				       stateid, deviceid)) {
+			found = true;
+			extend_ds_error(err, offset, length);
+			break;
+		}
+	}
+
+	return found;
+}
+
+int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
+			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
+			     u64 length, int status, enum nfs_opnum4 opnum,
+			     gfp_t gfp_flags)
+{
+	struct nfs4_ff_layout_ds_err *dserr;
+	bool needfree;
+
+	if (status == 0)
+		return 0;
+
+	if (mirror->mirror_ds == NULL)
+		return -EINVAL;
+
+	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
+	if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
+				      &mirror->stateid,
+				      &mirror->mirror_ds->id_node.deviceid)) {
+		spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
+		return 0;
+	}
+	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
+	dserr = kmalloc(sizeof(*dserr), gfp_flags);
+	if (!dserr)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&dserr->list);
+	dserr->offset = offset;
+	dserr->length = length;
+	dserr->status = status;
+	dserr->opnum = opnum;
+	nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
+	memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
+	       NFS4_DEVICEID4_SIZE);
+
+	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
+	needfree = ff_layout_add_ds_error_locked(flo, dserr);
+	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
+	if (needfree)
+		kfree(dserr);
+
+	return 0;
+}
+
+/* currently we only support AUTH_NONE and AUTH_SYS */
+static rpc_authflavor_t
+nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
+{
+	if (mirror->uid == (u32)-1)
+		return RPC_AUTH_NULL;
+	return RPC_AUTH_UNIX;
+}
+
+/* fetch cred for NFSv3 DS */
+static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
+				      struct nfs4_pnfs_ds *ds)
+{
+	if (ds->ds_clp && !mirror->cred &&
+	    mirror->mirror_ds->ds_versions[0].version == 3) {
+		struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
+		struct rpc_cred *cred;
+		struct auth_cred acred = {
+			.uid = make_kuid(&init_user_ns, mirror->uid),
+			.gid = make_kgid(&init_user_ns, mirror->gid),
+		};
+
+		/* AUTH_NULL ignores acred */
+		cred = auth->au_ops->lookup_cred(auth, &acred, 0);
+		if (IS_ERR(cred)) {
+			dprintk("%s: lookup_cred failed with %ld\n",
+				__func__, PTR_ERR(cred));
+			return PTR_ERR(cred);
+		} else {
+			mirror->cred = cred;
+		}
+	}
+	return 0;
+}
+
+struct nfs_fh *
+nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
+{
+	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
+	struct nfs_fh *fh = NULL;
+	struct nfs4_deviceid_node *devid;
+
+	if (mirror == NULL || mirror->mirror_ds == NULL ||
+	    mirror->mirror_ds->ds == NULL) {
+		printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
+			__func__, mirror_idx);
+		if (mirror && mirror->mirror_ds) {
+			devid = &mirror->mirror_ds->id_node;
+			pnfs_generic_mark_devid_invalid(devid);
+		}
+		goto out;
+	}
+
+	/* FIXME: For now assume there is only 1 version available for the DS */
+	fh = &mirror->fh_versions[0];
+out:
+	return fh;
+}
+
+/* Upon return, either ds is connected, or ds is NULL */
+struct nfs4_pnfs_ds *
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+			  bool fail_return)
+{
+	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
+	struct nfs4_pnfs_ds *ds = NULL;
+	struct nfs4_deviceid_node *devid;
+	struct inode *ino = lseg->pls_layout->plh_inode;
+	struct nfs_server *s = NFS_SERVER(ino);
+	unsigned int max_payload;
+	rpc_authflavor_t flavor;
+
+	if (mirror == NULL || mirror->mirror_ds == NULL ||
+	    mirror->mirror_ds->ds == NULL) {
+		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+			__func__, ds_idx);
+		if (mirror && mirror->mirror_ds) {
+			devid = &mirror->mirror_ds->id_node;
+			pnfs_generic_mark_devid_invalid(devid);
+		}
+		goto out;
+	}
+
+	devid = &mirror->mirror_ds->id_node;
+	if (ff_layout_test_devid_unavailable(devid))
+		goto out;
+
+	ds = mirror->mirror_ds->ds;
+	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
+	smp_rmb();
+	if (ds->ds_clp)
+		goto out;
+
+	flavor = nfs4_ff_layout_choose_authflavor(mirror);
+
+	/* FIXME: For now we assume the server sent only one version of NFS
+	 * to use for the DS.
+	 */
+	nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
+			     dataserver_retrans,
+			     mirror->mirror_ds->ds_versions[0].version,
+			     mirror->mirror_ds->ds_versions[0].minor_version,
+			     flavor);
+
+	/* connect success, check rsize/wsize limit */
+	if (ds->ds_clp) {
+		max_payload =
+			nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
+				       NULL);
+		if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
+			mirror->mirror_ds->ds_versions[0].rsize = max_payload;
+		if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
+			mirror->mirror_ds->ds_versions[0].wsize = max_payload;
+	} else {
+		ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+					 mirror, lseg->pls_range.offset,
+					 lseg->pls_range.length, NFS4ERR_NXIO,
+					 OP_ILLEGAL, GFP_NOIO);
+		if (fail_return) {
+			pnfs_error_mark_layout_for_return(ino, lseg);
+			if (ff_layout_has_available_ds(lseg))
+				pnfs_set_retry_layoutget(lseg->pls_layout);
+			else
+				pnfs_clear_retry_layoutget(lseg->pls_layout);
+
+		} else {
+			if (ff_layout_has_available_ds(lseg))
+				set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+					&lseg->pls_layout->plh_flags);
+			else {
+				pnfs_error_mark_layout_for_return(ino, lseg);
+				pnfs_clear_retry_layoutget(lseg->pls_layout);
+			}
+		}
+	}
+
+	if (ff_layout_update_mirror_cred(mirror, ds))
+		ds = NULL;
+out:
+	return ds;
+}
+
+struct rpc_cred *
+ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
+		      struct rpc_cred *mdscred)
+{
+	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
+	struct rpc_cred *cred = ERR_PTR(-EINVAL);
+
+	if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
+		goto out;
+
+	if (mirror && mirror->cred)
+		cred = mirror->cred;
+	else
+		cred = mdscred;
+out:
+	return cred;
+}
+
+/**
+* Find or create a DS rpc client with th MDS server rpc client auth flavor
+* in the nfs_client cl_ds_clients list.
+*/
+struct rpc_clnt *
+nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
+				 struct nfs_client *ds_clp, struct inode *inode)
+{
+	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
+
+	switch (mirror->mirror_ds->ds_versions[0].version) {
+	case 3:
+		/* For NFSv3 DS, flavor is set when creating DS connections */
+		return ds_clp->cl_rpcclient;
+	case 4:
+		return nfs4_find_or_create_ds_client(ds_clp, inode);
+	default:
+		BUG();
+	}
+}
+
+static bool is_range_intersecting(u64 offset1, u64 length1,
+				  u64 offset2, u64 length2)
+{
+	u64 end1 = end_offset(offset1, length1);
+	u64 end2 = end_offset(offset2, length2);
+
+	return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
+	       (end2 == NFS4_MAX_UINT64 || end2 > offset1);
+}
+
+/* called with inode i_lock held */
+int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
+			      struct xdr_stream *xdr, int *count,
+			      const struct pnfs_layout_range *range)
+{
+	struct nfs4_ff_layout_ds_err *err, *n;
+	__be32 *p;
+
+	list_for_each_entry_safe(err, n, &flo->error_list, list) {
+		if (!is_range_intersecting(err->offset, err->length,
+					   range->offset, range->length))
+			continue;
+		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
+		 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
+		 */
+		p = xdr_reserve_space(xdr,
+				24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+		if (unlikely(!p))
+			return -ENOBUFS;
+		p = xdr_encode_hyper(p, err->offset);
+		p = xdr_encode_hyper(p, err->length);
+		p = xdr_encode_opaque_fixed(p, &err->stateid,
+					    NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, &err->deviceid,
+					    NFS4_DEVICEID4_SIZE);
+		*p++ = cpu_to_be32(err->status);
+		*p++ = cpu_to_be32(err->opnum);
+		*count += 1;
+		list_del(&err->list);
+		dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
+			__func__, err->offset, err->length, err->status,
+			err->opnum, *count);
+		kfree(err);
+	}
+
+	return 0;
+}
+
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	struct nfs4_deviceid_node *devid;
+	int idx;
+
+	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+		mirror = FF_LAYOUT_COMP(lseg, idx);
+		if (mirror && mirror->mirror_ds) {
+			devid = &mirror->mirror_ds->id_node;
+			if (!ff_layout_test_devid_unavailable(devid))
+				return true;
+		}
+	}
+
+	return false;
+}
+
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
+			"retries a request before it attempts further "
+			" recovery  action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+			"NFSv4.1  client  waits for a response from a "
+			" data server before it retries an NFS request.");
diff --git a/kernel/fs/nfs/fscache-index.c b/kernel/fs/nfs/fscache-index.c
new file mode 100644
index 000000000..777b05506
--- /dev/null
+++ b/kernel/fs/nfs/fscache-index.c
@@ -0,0 +1,336 @@
+/* NFS FS-Cache index structure definition
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+
+#include "internal.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY		NFSDBG_FSCACHE
+
+/*
+ * Define the NFS filesystem for FS-Cache.  Upon registration FS-Cache sticks
+ * the cookie for the top-level index object for NFS into here.  The top-level
+ * index can than have other cache objects inserted into it.
+ */
+struct fscache_netfs nfs_fscache_netfs = {
+	.name		= "nfs",
+	.version	= 0,
+};
+
+/*
+ * Register NFS for caching
+ */
+int nfs_fscache_register(void)
+{
+	return fscache_register_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Unregister NFS for caching
+ */
+void nfs_fscache_unregister(void)
+{
+	fscache_unregister_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Layout of the key for an NFS server cache object.
+ */
+struct nfs_server_key {
+	uint16_t	nfsversion;		/* NFS protocol version */
+	uint16_t	family;			/* address family */
+	uint16_t	port;			/* IP port */
+	union {
+		struct in_addr	ipv4_addr;	/* IPv4 address */
+		struct in6_addr ipv6_addr;	/* IPv6 address */
+	} addr[0];
+};
+
+/*
+ * Generate a key to describe a server in the main NFS index
+ * - We return the length of the key, or 0 if we can't generate one
+ */
+static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
+				   void *buffer, uint16_t bufmax)
+{
+	const struct nfs_client *clp = cookie_netfs_data;
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+	const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+	struct nfs_server_key *key = buffer;
+	uint16_t len = sizeof(struct nfs_server_key);
+
+	memset(key, 0, len);
+	key->nfsversion = clp->rpc_ops->version;
+	key->family = clp->cl_addr.ss_family;
+
+	switch (clp->cl_addr.ss_family) {
+	case AF_INET:
+		key->port = sin->sin_port;
+		key->addr[0].ipv4_addr = sin->sin_addr;
+		len += sizeof(key->addr[0].ipv4_addr);
+		break;
+
+	case AF_INET6:
+		key->port = sin6->sin6_port;
+		key->addr[0].ipv6_addr = sin6->sin6_addr;
+		len += sizeof(key->addr[0].ipv6_addr);
+		break;
+
+	default:
+		printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+		       clp->cl_addr.ss_family);
+		len = 0;
+		break;
+	}
+
+	return len;
+}
+
+/*
+ * Define the server object for FS-Cache.  This is used to describe a server
+ * object to fscache_acquire_cookie().  It is keyed by the NFS protocol and
+ * server address parameters.
+ */
+const struct fscache_cookie_def nfs_fscache_server_index_def = {
+	.name		= "NFS.server",
+	.type 		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= nfs_server_get_key,
+};
+
+/*
+ * Generate a key to describe a superblock key in the main NFS index
+ */
+static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
+				  void *buffer, uint16_t bufmax)
+{
+	const struct nfs_fscache_key *key;
+	const struct nfs_server *nfss = cookie_netfs_data;
+	uint16_t len;
+
+	key = nfss->fscache_key;
+	len = sizeof(key->key) + key->key.uniq_len;
+	if (len > bufmax) {
+		len = 0;
+	} else {
+		memcpy(buffer, &key->key, sizeof(key->key));
+		memcpy(buffer + sizeof(key->key),
+		       key->key.uniquifier, key->key.uniq_len);
+	}
+
+	return len;
+}
+
+/*
+ * Define the superblock object for FS-Cache.  This is used to describe a
+ * superblock object to fscache_acquire_cookie().  It is keyed by all the NFS
+ * parameters that might cause a separate superblock.
+ */
+const struct fscache_cookie_def nfs_fscache_super_index_def = {
+	.name		= "NFS.super",
+	.type 		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= nfs_super_get_key,
+};
+
+/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode.  This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ */
+struct nfs_fscache_inode_auxdata {
+	struct timespec	mtime;
+	struct timespec	ctime;
+	loff_t		size;
+	u64		change_attr;
+};
+
+/*
+ * Generate a key to describe an NFS inode in an NFS server's index
+ */
+static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
+					  void *buffer, uint16_t bufmax)
+{
+	const struct nfs_inode *nfsi = cookie_netfs_data;
+	uint16_t nsize;
+
+	/* use the inode's NFS filehandle as the key */
+	nsize = nfsi->fh.size;
+	memcpy(buffer, nfsi->fh.data, nsize);
+	return nsize;
+}
+
+/*
+ * Get certain file attributes from the netfs data
+ * - This function can be absent for an index
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ */
+static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
+				       uint64_t *size)
+{
+	const struct nfs_inode *nfsi = cookie_netfs_data;
+
+	*size = nfsi->vfs_inode.i_size;
+}
+
+/*
+ * Get the auxiliary data from netfs data
+ * - This function can be absent if the index carries no state data
+ * - Should store the auxiliary data in the buffer
+ * - Should return the amount of amount stored
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ */
+static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
+					  void *buffer, uint16_t bufmax)
+{
+	struct nfs_fscache_inode_auxdata auxdata;
+	const struct nfs_inode *nfsi = cookie_netfs_data;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.size = nfsi->vfs_inode.i_size;
+	auxdata.mtime = nfsi->vfs_inode.i_mtime;
+	auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+		auxdata.change_attr = nfsi->vfs_inode.i_version;
+
+	if (bufmax > sizeof(auxdata))
+		bufmax = sizeof(auxdata);
+
+	memcpy(buffer, &auxdata, bufmax);
+	return bufmax;
+}
+
+/*
+ * Consult the netfs about the state of an object
+ * - This function can be absent if the index carries no state data
+ * - The netfs data from the cookie being used as the target is
+ *   presented, as is the auxiliary data
+ */
+static
+enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
+						  const void *data,
+						  uint16_t datalen)
+{
+	struct nfs_fscache_inode_auxdata auxdata;
+	struct nfs_inode *nfsi = cookie_netfs_data;
+
+	if (datalen != sizeof(auxdata))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.size = nfsi->vfs_inode.i_size;
+	auxdata.mtime = nfsi->vfs_inode.i_mtime;
+	auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+		auxdata.change_attr = nfsi->vfs_inode.i_version;
+
+	if (memcmp(data, &auxdata, datalen) != 0)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * Indication from FS-Cache that the cookie is no longer cached
+ * - This function is called when the backing store currently caching a cookie
+ *   is removed
+ * - The netfs should use this to clean up any markers indicating cached pages
+ * - This is mandatory for any object that may have data
+ */
+static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+	struct nfs_inode *nfsi = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
+
+	for (;;) {
+		/* grab a bunch of pages to unmark */
+		nr_pages = pagevec_lookup(&pvec,
+					  nfsi->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+/*
+ * Get an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ *   context.
+ * - The read context is passed back to NFS in the event that a data read on the
+ *   cache fails with EIO - in which case the server must be contacted to
+ *   retrieve the data, which requires the read context for security.
+ */
+static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
+{
+	get_nfs_open_context(context);
+}
+
+/*
+ * Release an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ *   context.
+ */
+static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
+{
+	if (context)
+		put_nfs_open_context(context);
+}
+
+/*
+ * Define the inode object for FS-Cache.  This is used to describe an inode
+ * object to fscache_acquire_cookie().  It is keyed by the NFS file handle for
+ * an inode.
+ *
+ * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
+ * held in the cache auxiliary data for the data storage object with those in
+ * the inode struct in memory.
+ */
+const struct fscache_cookie_def nfs_fscache_inode_object_def = {
+	.name		= "NFS.fh",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= nfs_fscache_inode_get_key,
+	.get_attr	= nfs_fscache_inode_get_attr,
+	.get_aux	= nfs_fscache_inode_get_aux,
+	.check_aux	= nfs_fscache_inode_check_aux,
+	.now_uncached	= nfs_fscache_inode_now_uncached,
+	.get_context	= nfs_fh_get_context,
+	.put_context	= nfs_fh_put_context,
+};
diff --git a/kernel/fs/nfs/fscache.c b/kernel/fs/nfs/fscache.c
new file mode 100644
index 000000000..d63bea8bb
--- /dev/null
+++ b/kernel/fs/nfs/fscache.c
@@ -0,0 +1,439 @@
+/* NFS filesystem cache interface
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY		NFSDBG_FSCACHE
+
+static struct rb_root nfs_fscache_keys = RB_ROOT;
+static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
+
+/*
+ * Get the per-client index cookie for an NFS client if the appropriate mount
+ * flag was set
+ * - We always try and get an index cookie for the client, but get filehandle
+ *   cookies on a per-superblock basis, depending on the mount flags
+ */
+void nfs_fscache_get_client_cookie(struct nfs_client *clp)
+{
+	/* create a cache index for looking up filehandles */
+	clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
+					      &nfs_fscache_server_index_def,
+					      clp, true);
+	dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
+		 clp, clp->fscache);
+}
+
+/*
+ * Dispose of a per-client cookie
+ */
+void nfs_fscache_release_client_cookie(struct nfs_client *clp)
+{
+	dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
+		 clp, clp->fscache);
+
+	fscache_relinquish_cookie(clp->fscache, 0);
+	clp->fscache = NULL;
+}
+
+/*
+ * Get the cache cookie for an NFS superblock.  We have to handle
+ * uniquification here because the cache doesn't do it for us.
+ *
+ * The default uniquifier is just an empty string, but it may be overridden
+ * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
+ * superblock across an automount point of some nature.
+ */
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen)
+{
+	struct nfs_fscache_key *key, *xkey;
+	struct nfs_server *nfss = NFS_SB(sb);
+	struct rb_node **p, *parent;
+	int diff;
+
+	if (!uniq) {
+		uniq = "";
+		ulen = 1;
+	}
+
+	key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
+	if (!key)
+		return;
+
+	key->nfs_client = nfss->nfs_client;
+	key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
+	key->key.nfs_server.flags = nfss->flags;
+	key->key.nfs_server.rsize = nfss->rsize;
+	key->key.nfs_server.wsize = nfss->wsize;
+	key->key.nfs_server.acregmin = nfss->acregmin;
+	key->key.nfs_server.acregmax = nfss->acregmax;
+	key->key.nfs_server.acdirmin = nfss->acdirmin;
+	key->key.nfs_server.acdirmax = nfss->acdirmax;
+	key->key.nfs_server.fsid = nfss->fsid;
+	key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
+
+	key->key.uniq_len = ulen;
+	memcpy(key->key.uniquifier, uniq, ulen);
+
+	spin_lock(&nfs_fscache_keys_lock);
+	p = &nfs_fscache_keys.rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		xkey = rb_entry(parent, struct nfs_fscache_key, node);
+
+		if (key->nfs_client < xkey->nfs_client)
+			goto go_left;
+		if (key->nfs_client > xkey->nfs_client)
+			goto go_right;
+
+		diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
+		if (diff < 0)
+			goto go_left;
+		if (diff > 0)
+			goto go_right;
+
+		if (key->key.uniq_len == 0)
+			goto non_unique;
+		diff = memcmp(key->key.uniquifier,
+			      xkey->key.uniquifier,
+			      key->key.uniq_len);
+		if (diff < 0)
+			goto go_left;
+		if (diff > 0)
+			goto go_right;
+		goto non_unique;
+
+	go_left:
+		p = &(*p)->rb_left;
+		continue;
+	go_right:
+		p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&key->node, parent, p);
+	rb_insert_color(&key->node, &nfs_fscache_keys);
+	spin_unlock(&nfs_fscache_keys_lock);
+	nfss->fscache_key = key;
+
+	/* create a cache index for looking up filehandles */
+	nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
+					       &nfs_fscache_super_index_def,
+					       nfss, true);
+	dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
+		 nfss, nfss->fscache);
+	return;
+
+non_unique:
+	spin_unlock(&nfs_fscache_keys_lock);
+	kfree(key);
+	nfss->fscache_key = NULL;
+	nfss->fscache = NULL;
+	printk(KERN_WARNING "NFS:"
+	       " Cache request denied due to non-unique superblock keys\n");
+}
+
+/*
+ * release a per-superblock cookie
+ */
+void nfs_fscache_release_super_cookie(struct super_block *sb)
+{
+	struct nfs_server *nfss = NFS_SB(sb);
+
+	dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
+		 nfss, nfss->fscache);
+
+	fscache_relinquish_cookie(nfss->fscache, 0);
+	nfss->fscache = NULL;
+
+	if (nfss->fscache_key) {
+		spin_lock(&nfs_fscache_keys_lock);
+		rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
+		spin_unlock(&nfs_fscache_keys_lock);
+		kfree(nfss->fscache_key);
+		nfss->fscache_key = NULL;
+	}
+}
+
+/*
+ * Initialise the per-inode cache cookie pointer for an NFS inode.
+ */
+void nfs_fscache_init_inode(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	nfsi->fscache = NULL;
+	if (!S_ISREG(inode->i_mode))
+		return;
+	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+					       &nfs_fscache_inode_object_def,
+					       nfsi, false);
+}
+
+/*
+ * Release a per-inode cookie.
+ */
+void nfs_fscache_clear_inode(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+
+	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
+
+	fscache_relinquish_cookie(cookie, false);
+	nfsi->fscache = NULL;
+}
+
+static bool nfs_fscache_can_enable(void *data)
+{
+	struct inode *inode = data;
+
+	return !inode_is_open_for_write(inode);
+}
+
+/*
+ * Enable or disable caching for a file that is being opened as appropriate.
+ * The cookie is allocated when the inode is initialised, but is not enabled at
+ * that time.  Enablement is deferred to file-open time to avoid stat() and
+ * access() thrashing the cache.
+ *
+ * For now, with NFS, only regular files that are open read-only will be able
+ * to use the cache.
+ *
+ * We enable the cache for an inode if we open it read-only and it isn't
+ * currently open for writing.  We disable the cache if the inode is open
+ * write-only.
+ *
+ * The caller uses the file struct to pin i_writecount on the inode before
+ * calling us when a file is opened for writing, so we can make use of that.
+ *
+ * Note that this may be invoked multiple times in parallel by parallel
+ * nfs_open() functions.
+ */
+void nfs_fscache_open_file(struct inode *inode, struct file *filp)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+
+	if (!fscache_cookie_valid(cookie))
+		return;
+
+	if (inode_is_open_for_write(inode)) {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
+		clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
+		fscache_disable_cookie(cookie, true);
+		fscache_uncache_all_inode_pages(cookie, inode);
+	} else {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
+		fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode);
+		if (fscache_cookie_enabled(cookie))
+			set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
+
+/*
+ * Release the caching state associated with a page, if the page isn't busy
+ * interacting with the cache.
+ * - Returns true (can release page) or false (page busy).
+ */
+int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	if (PageFsCache(page)) {
+		struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host);
+
+		BUG_ON(!cookie);
+		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
+			 cookie, page, NFS_I(page->mapping->host));
+
+		if (!fscache_maybe_release_page(cookie, page, gfp))
+			return 0;
+
+		nfs_inc_fscache_stats(page->mapping->host,
+				      NFSIOS_FSCACHE_PAGES_UNCACHED);
+	}
+
+	return 1;
+}
+
+/*
+ * Release the caching state associated with a page if undergoing complete page
+ * invalidation.
+ */
+void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+
+	BUG_ON(!cookie);
+
+	dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
+		 cookie, page, NFS_I(inode));
+
+	fscache_wait_on_page_write(cookie, page);
+
+	BUG_ON(!PageLocked(page));
+	fscache_uncache_page(cookie, page);
+	nfs_inc_fscache_stats(page->mapping->host,
+			      NFSIOS_FSCACHE_PAGES_UNCACHED);
+}
+
+/*
+ * Handle completion of a page being read from the cache.
+ * - Called in process (keventd) context.
+ */
+static void nfs_readpage_from_fscache_complete(struct page *page,
+					       void *context,
+					       int error)
+{
+	dfprintk(FSCACHE,
+		 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
+		 page, context, error);
+
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error) {
+		SetPageUptodate(page);
+		unlock_page(page);
+	} else {
+		error = nfs_readpage_async(context, page->mapping->host, page);
+		if (error)
+			unlock_page(page);
+	}
+}
+
+/*
+ * Retrieve a page from fscache
+ */
+int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+				struct inode *inode, struct page *page)
+{
+	int ret;
+
+	dfprintk(FSCACHE,
+		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
+		 nfs_i_fscache(inode), page, page->index, page->flags, inode);
+
+	ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),
+					 page,
+					 nfs_readpage_from_fscache_complete,
+					 ctx,
+					 GFP_KERNEL);
+
+	switch (ret) {
+	case 0: /* read BIO submitted (page in fscache) */
+		dfprintk(FSCACHE,
+			 "NFS:    readpage_from_fscache: BIO submitted\n");
+		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
+		return ret;
+
+	case -ENOBUFS: /* inode not in cache */
+	case -ENODATA: /* page not in cache */
+		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
+		dfprintk(FSCACHE,
+			 "NFS:    readpage_from_fscache %d\n", ret);
+		return 1;
+
+	default:
+		dfprintk(FSCACHE, "NFS:    readpage_from_fscache %d\n", ret);
+		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
+	}
+	return ret;
+}
+
+/*
+ * Retrieve a set of pages from fscache
+ */
+int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+				 struct inode *inode,
+				 struct address_space *mapping,
+				 struct list_head *pages,
+				 unsigned *nr_pages)
+{
+	unsigned npages = *nr_pages;
+	int ret;
+
+	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
+		 nfs_i_fscache(inode), npages, inode);
+
+	ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),
+					  mapping, pages, nr_pages,
+					  nfs_readpage_from_fscache_complete,
+					  ctx,
+					  mapping_gfp_mask(mapping));
+	if (*nr_pages < npages)
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
+				      npages);
+	if (*nr_pages > 0)
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
+				      *nr_pages);
+
+	switch (ret) {
+	case 0: /* read submitted to the cache for all pages */
+		BUG_ON(!list_empty(pages));
+		BUG_ON(*nr_pages != 0);
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: submitted\n");
+
+		return ret;
+
+	case -ENOBUFS: /* some pages aren't cached and can't be */
+	case -ENODATA: /* some pages aren't cached */
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
+		return 1;
+
+	default:
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: ret  %d\n", ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Store a newly fetched page in fscache
+ * - PG_fscache must be set on the page
+ */
+void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
+{
+	int ret;
+
+	dfprintk(FSCACHE,
+		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
+		 nfs_i_fscache(inode), page, page->index, page->flags, sync);
+
+	ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);
+	dfprintk(FSCACHE,
+		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
+		 page, page->index, page->flags, ret);
+
+	if (ret != 0) {
+		fscache_uncache_page(nfs_i_fscache(inode), page);
+		nfs_inc_fscache_stats(inode,
+				      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
+		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
+	} else {
+		nfs_inc_fscache_stats(inode,
+				      NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
+	}
+}
diff --git a/kernel/fs/nfs/fscache.h b/kernel/fs/nfs/fscache.h
new file mode 100644
index 000000000..d7fe3e799
--- /dev/null
+++ b/kernel/fs/nfs/fscache.h
@@ -0,0 +1,229 @@
+/* NFS filesystem cache interface definitions
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _NFS_FSCACHE_H
+#define _NFS_FSCACHE_H
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/fscache.h>
+
+#ifdef CONFIG_NFS_FSCACHE
+
+/*
+ * set of NFS FS-Cache objects that form a superblock key
+ */
+struct nfs_fscache_key {
+	struct rb_node		node;
+	struct nfs_client	*nfs_client;	/* the server */
+
+	/* the elements of the unique key - as used by nfs_compare_super() and
+	 * nfs_compare_mount_options() to distinguish superblocks */
+	struct {
+		struct {
+			unsigned long	s_flags;	/* various flags
+							 * (& NFS_MS_MASK) */
+		} super;
+
+		struct {
+			struct nfs_fsid fsid;
+			int		flags;
+			unsigned int	rsize;		/* read size */
+			unsigned int	wsize;		/* write size */
+			unsigned int	acregmin;	/* attr cache timeouts */
+			unsigned int	acregmax;
+			unsigned int	acdirmin;
+			unsigned int	acdirmax;
+		} nfs_server;
+
+		struct {
+			rpc_authflavor_t au_flavor;
+		} rpc_auth;
+
+		/* uniquifier - can be used if nfs_server.flags includes
+		 * NFS_MOUNT_UNSHARED  */
+		u8 uniq_len;
+		char uniquifier[0];
+	} key;
+};
+
+/*
+ * fscache-index.c
+ */
+extern struct fscache_netfs nfs_fscache_netfs;
+extern const struct fscache_cookie_def nfs_fscache_server_index_def;
+extern const struct fscache_cookie_def nfs_fscache_super_index_def;
+extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
+
+extern int nfs_fscache_register(void);
+extern void nfs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void nfs_fscache_get_client_cookie(struct nfs_client *);
+extern void nfs_fscache_release_client_cookie(struct nfs_client *);
+
+extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
+extern void nfs_fscache_release_super_cookie(struct super_block *);
+
+extern void nfs_fscache_init_inode(struct inode *);
+extern void nfs_fscache_clear_inode(struct inode *);
+extern void nfs_fscache_open_file(struct inode *, struct file *);
+
+extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
+extern int nfs_fscache_release_page(struct page *, gfp_t);
+
+extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
+				       struct inode *, struct page *);
+extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
+					struct inode *, struct address_space *,
+					struct list_head *, unsigned *);
+extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
+
+/*
+ * wait for a page to complete writing to the cache
+ */
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+						  struct page *page)
+{
+	if (PageFsCache(page))
+		fscache_wait_on_page_write(nfsi->fscache, page);
+}
+
+/*
+ * release the caching state associated with a page if undergoing complete page
+ * invalidation
+ */
+static inline void nfs_fscache_invalidate_page(struct page *page,
+					       struct inode *inode)
+{
+	if (PageFsCache(page))
+		__nfs_fscache_invalidate_page(page, inode);
+}
+
+/*
+ * Retrieve a page from an inode data storage object.
+ */
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+					    struct inode *inode,
+					    struct page *page)
+{
+	if (NFS_I(inode)->fscache)
+		return __nfs_readpage_from_fscache(ctx, inode, page);
+	return -ENOBUFS;
+}
+
+/*
+ * Retrieve a set of pages from an inode data storage object.
+ */
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+					     struct inode *inode,
+					     struct address_space *mapping,
+					     struct list_head *pages,
+					     unsigned *nr_pages)
+{
+	if (NFS_I(inode)->fscache)
+		return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
+						    nr_pages);
+	return -ENOBUFS;
+}
+
+/*
+ * Store a page newly fetched from the server in an inode data storage object
+ * in the cache.
+ */
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+					   struct page *page,
+					   int sync)
+{
+	if (PageFsCache(page))
+		__nfs_readpage_to_fscache(inode, page, sync);
+}
+
+/*
+ * Invalidate the contents of fscache for this inode.  This will not sleep.
+ */
+static inline void nfs_fscache_invalidate(struct inode *inode)
+{
+	fscache_invalidate(NFS_I(inode)->fscache);
+}
+
+/*
+ * Wait for an object to finish being invalidated.
+ */
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
+{
+	fscache_wait_on_invalidate(NFS_I(inode)->fscache);
+}
+
+/*
+ * indicate the client caching state as readable text
+ */
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+	if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
+		return "yes";
+	return "no ";
+}
+
+#else /* CONFIG_NFS_FSCACHE */
+static inline int nfs_fscache_register(void) { return 0; }
+static inline void nfs_fscache_unregister(void) {}
+
+static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
+static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
+
+static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
+
+static inline void nfs_fscache_init_inode(struct inode *inode) {}
+static inline void nfs_fscache_clear_inode(struct inode *inode) {}
+static inline void nfs_fscache_open_file(struct inode *inode,
+					 struct file *filp) {}
+
+static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	return 1; /* True: may release page */
+}
+static inline void nfs_fscache_invalidate_page(struct page *page,
+					       struct inode *inode) {}
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+						  struct page *page) {}
+
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+					    struct inode *inode,
+					    struct page *page)
+{
+	return -ENOBUFS;
+}
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+					     struct inode *inode,
+					     struct address_space *mapping,
+					     struct list_head *pages,
+					     unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+					   struct page *page, int sync) {}
+
+
+static inline void nfs_fscache_invalidate(struct inode *inode) {}
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {}
+
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+	return "no ";
+}
+
+#endif /* CONFIG_NFS_FSCACHE */
+#endif /* _NFS_FSCACHE_H */
diff --git a/kernel/fs/nfs/getroot.c b/kernel/fs/nfs/getroot.c
new file mode 100644
index 000000000..a608ffd28
--- /dev/null
+++ b/kernel/fs/nfs/getroot.c
@@ -0,0 +1,133 @@
+/* getroot.c: get the root dentry for an NFS mount
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+
+#include <asm/uaccess.h>
+
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_CLIENT
+
+/*
+ * Set the superblock root dentry.
+ * Note that this function frees the inode in case of error.
+ */
+static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode)
+{
+	/* The mntroot acts as the dummy root dentry for this superblock */
+	if (sb->s_root == NULL) {
+		sb->s_root = d_make_root(inode);
+		if (sb->s_root == NULL)
+			return -ENOMEM;
+		ihold(inode);
+		/*
+		 * Ensure that this dentry is invisible to d_find_alias().
+		 * Otherwise, it may be spliced into the tree by
+		 * d_splice_alias if a parent directory from the same
+		 * filesystem gets mounted at a later time.
+		 * This again causes shrink_dcache_for_umount_subtree() to
+		 * Oops, since the test for IS_ROOT() will fail.
+		 */
+		spin_lock(&d_inode(sb->s_root)->i_lock);
+		spin_lock(&sb->s_root->d_lock);
+		hlist_del_init(&sb->s_root->d_u.d_alias);
+		spin_unlock(&sb->s_root->d_lock);
+		spin_unlock(&d_inode(sb->s_root)->i_lock);
+	}
+	return 0;
+}
+
+/*
+ * get an NFS2/NFS3 root dentry from the root filehandle
+ */
+struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
+			    const char *devname)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_fsinfo fsinfo;
+	struct dentry *ret;
+	struct inode *inode;
+	void *name = kstrdup(devname, GFP_KERNEL);
+	int error;
+
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+
+	/* get the actual root for this mount */
+	fsinfo.fattr = nfs_alloc_fattr();
+	if (fsinfo.fattr == NULL) {
+		kfree(name);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
+	if (error < 0) {
+		dprintk("nfs_get_root: getattr error = %d\n", -error);
+		ret = ERR_PTR(error);
+		goto out;
+	}
+
+	inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
+	if (IS_ERR(inode)) {
+		dprintk("nfs_get_root: get root inode failed\n");
+		ret = ERR_CAST(inode);
+		goto out;
+	}
+
+	error = nfs_superblock_set_dummy_root(sb, inode);
+	if (error != 0) {
+		ret = ERR_PTR(error);
+		goto out;
+	}
+
+	/* root dentries normally start off anonymous and get spliced in later
+	 * if the dentry tree reaches them; however if the dentry already
+	 * exists, we'll pick it up at this point and use it as the root
+	 */
+	ret = d_obtain_root(inode);
+	if (IS_ERR(ret)) {
+		dprintk("nfs_get_root: get root dentry failed\n");
+		goto out;
+	}
+
+	security_d_instantiate(ret, inode);
+	spin_lock(&ret->d_lock);
+	if (IS_ROOT(ret) && !ret->d_fsdata &&
+	    !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
+		ret->d_fsdata = name;
+		name = NULL;
+	}
+	spin_unlock(&ret->d_lock);
+out:
+	kfree(name);
+	nfs_free_fattr(fsinfo.fattr);
+	return ret;
+}
diff --git a/kernel/fs/nfs/inode.c b/kernel/fs/nfs/inode.c
new file mode 100644
index 000000000..f734562c6
--- /dev/null
+++ b/kernel/fs/nfs/inode.c
@@ -0,0 +1,2066 @@
+/*
+ *  linux/fs/nfs/inode.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  nfs inode and superblock handling functions
+ *
+ *  Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
+ *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ *  J.S.Peatfield@damtp.cam.ac.uk
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/freezer.h>
+
+#include <asm/uaccess.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "pnfs.h"
+#include "nfs.h"
+#include "netns.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+#define NFS_64_BIT_INODE_NUMBERS_ENABLED	1
+
+/* Default is to see 64-bit inode numbers */
+static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+
+static void nfs_invalidate_inode(struct inode *);
+static int nfs_update_inode(struct inode *, struct nfs_fattr *);
+
+static struct kmem_cache * nfs_inode_cachep;
+
+static inline unsigned long
+nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
+{
+	return nfs_fileid_to_ino_t(fattr->fileid);
+}
+
+/**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+int nfs_wait_bit_killable(struct wait_bit_key *key)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	freezable_schedule_unsafe();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
+
+/**
+ * nfs_compat_user_ino64 - returns the user-visible inode number
+ * @fileid: 64-bit fileid
+ *
+ * This function returns a 32-bit inode number if the boot parameter
+ * nfs.enable_ino64 is zero.
+ */
+u64 nfs_compat_user_ino64(u64 fileid)
+{
+#ifdef CONFIG_COMPAT
+	compat_ulong_t ino;
+#else	
+	unsigned long ino;
+#endif
+
+	if (enable_ino64)
+		return fileid;
+	ino = fileid;
+	if (sizeof(ino) < sizeof(fileid))
+		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
+	return ino;
+}
+
+int nfs_drop_inode(struct inode *inode)
+{
+	return NFS_STALE(inode) || generic_drop_inode(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_drop_inode);
+
+void nfs_clear_inode(struct inode *inode)
+{
+	/*
+	 * The following should never happen...
+	 */
+	WARN_ON_ONCE(nfs_have_writebacks(inode));
+	WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
+	nfs_zap_acl_cache(inode);
+	nfs_access_zap_cache(inode);
+	nfs_fscache_clear_inode(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_clear_inode);
+
+void nfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	nfs_clear_inode(inode);
+}
+
+int nfs_sync_inode(struct inode *inode)
+{
+	nfs_inode_dio_wait(inode);
+	return nfs_wb_all(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_sync_inode);
+
+/**
+ * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ */
+int nfs_sync_mapping(struct address_space *mapping)
+{
+	int ret = 0;
+
+	if (mapping->nrpages != 0) {
+		unmap_mapping_range(mapping, 0, 0, 0);
+		ret = nfs_wb_all(mapping->host);
+	}
+	return ret;
+}
+
+static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (inode->i_mapping->nrpages == 0)
+		flags &= ~NFS_INO_INVALID_DATA;
+	nfsi->cache_validity |= flags;
+	if (flags & NFS_INO_INVALID_DATA)
+		nfs_fscache_invalidate(inode);
+}
+
+/*
+ * Invalidate the local caches
+ */
+static void nfs_zap_caches_locked(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int mode = inode->i_mode;
+
+	nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
+
+	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+	nfsi->attrtimeo_timestamp = jiffies;
+
+	memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
+	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
+		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
+					| NFS_INO_INVALID_DATA
+					| NFS_INO_INVALID_ACCESS
+					| NFS_INO_INVALID_ACL
+					| NFS_INO_REVAL_PAGECACHE);
+	} else
+		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
+					| NFS_INO_INVALID_ACCESS
+					| NFS_INO_INVALID_ACL
+					| NFS_INO_REVAL_PAGECACHE);
+	nfs_zap_label_cache_locked(nfsi);
+}
+
+void nfs_zap_caches(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	nfs_zap_caches_locked(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
+{
+	if (mapping->nrpages != 0) {
+		spin_lock(&inode->i_lock);
+		nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+		spin_unlock(&inode->i_lock);
+	}
+}
+
+void nfs_zap_acl_cache(struct inode *inode)
+{
+	void (*clear_acl_cache)(struct inode *);
+
+	clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache;
+	if (clear_acl_cache != NULL)
+		clear_acl_cache(inode);
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL;
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);
+
+void nfs_invalidate_atime(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
+
+/*
+ * Invalidate, but do not unhash, the inode.
+ * NB: must be called with inode->i_lock held!
+ */
+static void nfs_invalidate_inode(struct inode *inode)
+{
+	set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+	nfs_zap_caches_locked(inode);
+}
+
+struct nfs_find_desc {
+	struct nfs_fh		*fh;
+	struct nfs_fattr	*fattr;
+};
+
+/*
+ * In NFSv3 we can have 64bit inode numbers. In order to support
+ * this, and re-exported directories (also seen in NFSv2)
+ * we are forced to allow 2 different inodes to have the same
+ * i_ino.
+ */
+static int
+nfs_find_actor(struct inode *inode, void *opaque)
+{
+	struct nfs_find_desc	*desc = (struct nfs_find_desc *)opaque;
+	struct nfs_fh		*fh = desc->fh;
+	struct nfs_fattr	*fattr = desc->fattr;
+
+	if (NFS_FILEID(inode) != fattr->fileid)
+		return 0;
+	if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode))
+		return 0;
+	if (nfs_compare_fh(NFS_FH(inode), fh))
+		return 0;
+	if (is_bad_inode(inode) || NFS_STALE(inode))
+		return 0;
+	return 1;
+}
+
+static int
+nfs_init_locked(struct inode *inode, void *opaque)
+{
+	struct nfs_find_desc	*desc = (struct nfs_find_desc *)opaque;
+	struct nfs_fattr	*fattr = desc->fattr;
+
+	set_nfs_fileid(inode, fattr->fileid);
+	nfs_copy_fh(NFS_FH(inode), desc->fh);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static void nfs_clear_label_invalid(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL;
+	spin_unlock(&inode->i_lock);
+}
+
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+					struct nfs4_label *label)
+{
+	int error;
+
+	if (label == NULL)
+		return;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
+		error = security_inode_notifysecctx(inode, label->label,
+				label->len);
+		if (error)
+			printk(KERN_ERR "%s() %s %d "
+					"security_inode_notifysecctx() %d\n",
+					__func__,
+					(char *)label->label,
+					label->len, error);
+		nfs_clear_label_invalid(inode);
+	}
+}
+
+struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
+{
+	struct nfs4_label *label = NULL;
+	int minor_version = server->nfs_client->cl_minorversion;
+
+	if (minor_version < 2)
+		return label;
+
+	if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+		return label;
+
+	label = kzalloc(sizeof(struct nfs4_label), flags);
+	if (label == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	label->label = kzalloc(NFS4_MAXLABELLEN, flags);
+	if (label->label == NULL) {
+		kfree(label);
+		return ERR_PTR(-ENOMEM);
+	}
+	label->len = NFS4_MAXLABELLEN;
+
+	return label;
+}
+EXPORT_SYMBOL_GPL(nfs4_label_alloc);
+#else
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+					struct nfs4_label *label)
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(nfs_setsecurity);
+
+/*
+ * This is our front-end to iget that looks up inodes by file handle
+ * instead of inode number.
+ */
+struct inode *
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	struct nfs_find_desc desc = {
+		.fh	= fh,
+		.fattr	= fattr
+	};
+	struct inode *inode = ERR_PTR(-ENOENT);
+	unsigned long hash;
+
+	nfs_attr_check_mountpoint(sb, fattr);
+
+	if (nfs_attr_use_mounted_on_fileid(fattr))
+		fattr->fileid = fattr->mounted_on_fileid;
+	else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
+		goto out_no_inode;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
+		goto out_no_inode;
+
+	hash = nfs_fattr_to_ino_t(fattr);
+
+	inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc);
+	if (inode == NULL) {
+		inode = ERR_PTR(-ENOMEM);
+		goto out_no_inode;
+	}
+
+	if (inode->i_state & I_NEW) {
+		struct nfs_inode *nfsi = NFS_I(inode);
+		unsigned long now = jiffies;
+
+		/* We set i_ino for the few things that still rely on it,
+		 * such as stat(2) */
+		inode->i_ino = hash;
+
+		/* We can't support update_atime(), since the server will reset it */
+		inode->i_flags |= S_NOATIME|S_NOCMTIME;
+		inode->i_mode = fattr->mode;
+		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+				&& nfs_server_capable(inode, NFS_CAP_MODE))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		/* Why so? Because we want revalidate for devices/FIFOs, and
+		 * that's precisely what we have in nfs_file_inode_operations.
+		 */
+		inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
+		if (S_ISREG(inode->i_mode)) {
+			inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
+			inode->i_data.a_ops = &nfs_file_aops;
+		} else if (S_ISDIR(inode->i_mode)) {
+			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
+			inode->i_fop = &nfs_dir_operations;
+			inode->i_data.a_ops = &nfs_dir_aops;
+			/* Deal with crossing mountpoints */
+			if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
+					fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+					inode->i_op = &nfs_referral_inode_operations;
+				else
+					inode->i_op = &nfs_mountpoint_inode_operations;
+				inode->i_fop = NULL;
+				inode->i_flags |= S_AUTOMOUNT;
+			}
+		} else if (S_ISLNK(inode->i_mode))
+			inode->i_op = &nfs_symlink_inode_operations;
+		else
+			init_special_inode(inode, inode->i_mode, fattr->rdev);
+
+		memset(&inode->i_atime, 0, sizeof(inode->i_atime));
+		memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+		memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+		inode->i_version = 0;
+		inode->i_size = 0;
+		clear_nlink(inode);
+		inode->i_uid = make_kuid(&init_user_ns, -2);
+		inode->i_gid = make_kgid(&init_user_ns, -2);
+		inode->i_blocks = 0;
+		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+		nfsi->write_io = 0;
+		nfsi->read_io = 0;
+
+		nfsi->read_cache_jiffies = fattr->time_start;
+		nfsi->attr_gencount = fattr->gencount;
+		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+			inode->i_atime = fattr->atime;
+		else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+			inode->i_mtime = fattr->mtime;
+		else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+			inode->i_ctime = fattr->ctime;
+		else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+			inode->i_version = fattr->change_attr;
+		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+			inode->i_size = nfs_size_to_loff_t(fattr->size);
+		else
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_PAGECACHE);
+		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+			set_nlink(inode, fattr->nlink);
+		else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+			inode->i_uid = fattr->uid;
+		else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+			inode->i_gid = fattr->gid;
+		else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+			inode->i_blocks = fattr->du.nfs2.blocks;
+		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+			/*
+			 * report the blocks in 512byte units
+			 */
+			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+		}
+
+		nfs_setsecurity(inode, fattr, label);
+
+		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+		nfsi->attrtimeo_timestamp = now;
+		nfsi->access_cache = RB_ROOT;
+
+		nfs_fscache_init_inode(inode);
+
+		unlock_new_inode(inode);
+	} else
+		nfs_refresh_inode(inode, fattr);
+	dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n",
+		inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(inode),
+		nfs_display_fhandle_hash(fh),
+		atomic_read(&inode->i_count));
+
+out:
+	return inode;
+
+out_no_inode:
+	dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode));
+	goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_fhget);
+
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
+
+int
+nfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct nfs_fattr *fattr;
+	int error = -ENOMEM;
+
+	nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
+
+	/* skip mode change if it's just for clearing setuid/setgid */
+	if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+		attr->ia_valid &= ~ATTR_MODE;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		loff_t i_size;
+
+		BUG_ON(!S_ISREG(inode->i_mode));
+
+		i_size = i_size_read(inode);
+		if (attr->ia_size == i_size)
+			attr->ia_valid &= ~ATTR_SIZE;
+		else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
+			return -ETXTBSY;
+	}
+
+	/* Optimization: if the end result is no change, don't RPC */
+	attr->ia_valid &= NFS_VALID_ATTRS;
+	if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+		return 0;
+
+	trace_nfs_setattr_enter(inode);
+
+	/* Write all dirty data */
+	if (S_ISREG(inode->i_mode))
+		nfs_sync_inode(inode);
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto out;
+	/*
+	 * Return any delegations if we're going to change ACLs
+	 */
+	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
+		NFS_PROTO(inode)->return_delegation(inode);
+	error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
+	if (error == 0)
+		error = nfs_refresh_inode(inode, fattr);
+	nfs_free_fattr(fattr);
+out:
+	trace_nfs_setattr_exit(inode, error);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_setattr);
+
+/**
+ * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * This is a copy of the common vmtruncate, but with the locking
+ * corrected to take into account the fact that NFS requires
+ * inode->i_size to be updated under the inode->i_lock.
+ * Note: must be called with inode->i_lock held!
+ */
+static int nfs_vmtruncate(struct inode * inode, loff_t offset)
+{
+	int err;
+
+	err = inode_newsize_ok(inode, offset);
+	if (err)
+		goto out;
+
+	i_size_write(inode, offset);
+	/* Optimisation */
+	if (offset == 0)
+		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+
+	spin_unlock(&inode->i_lock);
+	truncate_pagecache(inode, offset);
+	spin_lock(&inode->i_lock);
+out:
+	return err;
+}
+
+/**
+ * nfs_setattr_update_inode - Update inode metadata after a setattr call.
+ * @inode: pointer to struct inode
+ * @attr: pointer to struct iattr
+ *
+ * Note: we do this in the *proc.c in order to ensure that
+ *       it works for things like exclusive creates too.
+ */
+void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
+		struct nfs_fattr *fattr)
+{
+	/* Barrier: bump the attribute generation count. */
+	nfs_fattr_set_barrier(fattr);
+
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->attr_gencount = fattr->gencount;
+	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
+		if ((attr->ia_valid & ATTR_MODE) != 0) {
+			int mode = attr->ia_mode & S_IALLUGO;
+			mode |= inode->i_mode & ~S_IALLUGO;
+			inode->i_mode = mode;
+		}
+		if ((attr->ia_valid & ATTR_UID) != 0)
+			inode->i_uid = attr->ia_uid;
+		if ((attr->ia_valid & ATTR_GID) != 0)
+			inode->i_gid = attr->ia_gid;
+		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL);
+	}
+	if ((attr->ia_valid & ATTR_SIZE) != 0) {
+		nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
+		nfs_vmtruncate(inode, attr->ia_size);
+	}
+	nfs_update_inode(inode, fattr);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
+
+static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	parent = dget_parent(dentry);
+	nfs_force_use_readdirplus(d_inode(parent));
+	dput(parent);
+}
+
+static bool nfs_need_revalidate_inode(struct inode *inode)
+{
+	if (NFS_I(inode)->cache_validity &
+			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+		return true;
+	if (nfs_attribute_cache_expired(inode))
+		return true;
+	return false;
+}
+
+int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
+	int err = 0;
+
+	trace_nfs_getattr_enter(inode);
+	/* Flush out writes to the server in order to update c/mtime.  */
+	if (S_ISREG(inode->i_mode)) {
+		mutex_lock(&inode->i_mutex);
+		err = nfs_sync_inode(inode);
+		mutex_unlock(&inode->i_mutex);
+		if (err)
+			goto out;
+	}
+
+	/*
+	 * We may force a getattr if the user cares about atime.
+	 *
+	 * Note that we only have to check the vfsmount flags here:
+	 *  - NFS always sets S_NOATIME by so checking it would give a
+	 *    bogus result
+	 *  - NFS never sets MS_NOATIME or MS_NODIRATIME so there is
+	 *    no point in checking those.
+	 */
+ 	if ((mnt->mnt_flags & MNT_NOATIME) ||
+ 	    ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
+		need_atime = 0;
+
+	if (need_atime || nfs_need_revalidate_inode(inode)) {
+		struct nfs_server *server = NFS_SERVER(inode);
+
+		if (server->caps & NFS_CAP_READDIRPLUS)
+			nfs_request_parent_use_readdirplus(dentry);
+		err = __nfs_revalidate_inode(server, inode);
+	}
+	if (!err) {
+		generic_fillattr(inode, stat);
+		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+	}
+out:
+	trace_nfs_getattr_exit(inode, err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs_getattr);
+
+static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
+{
+	atomic_set(&l_ctx->count, 1);
+	l_ctx->lockowner.l_owner = current->files;
+	l_ctx->lockowner.l_pid = current->tgid;
+	INIT_LIST_HEAD(&l_ctx->list);
+	nfs_iocounter_init(&l_ctx->io_count);
+}
+
+static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
+{
+	struct nfs_lock_context *head = &ctx->lock_context;
+	struct nfs_lock_context *pos = head;
+
+	do {
+		if (pos->lockowner.l_owner != current->files)
+			continue;
+		if (pos->lockowner.l_pid != current->tgid)
+			continue;
+		atomic_inc(&pos->count);
+		return pos;
+	} while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head);
+	return NULL;
+}
+
+struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
+{
+	struct nfs_lock_context *res, *new = NULL;
+	struct inode *inode = d_inode(ctx->dentry);
+
+	spin_lock(&inode->i_lock);
+	res = __nfs_find_lock_context(ctx);
+	if (res == NULL) {
+		spin_unlock(&inode->i_lock);
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		if (new == NULL)
+			return ERR_PTR(-ENOMEM);
+		nfs_init_lock_context(new);
+		spin_lock(&inode->i_lock);
+		res = __nfs_find_lock_context(ctx);
+		if (res == NULL) {
+			list_add_tail(&new->list, &ctx->lock_context.list);
+			new->open_context = ctx;
+			res = new;
+			new = NULL;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	kfree(new);
+	return res;
+}
+EXPORT_SYMBOL_GPL(nfs_get_lock_context);
+
+void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
+{
+	struct nfs_open_context *ctx = l_ctx->open_context;
+	struct inode *inode = d_inode(ctx->dentry);
+
+	if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
+		return;
+	list_del(&l_ctx->list);
+	spin_unlock(&inode->i_lock);
+	kfree(l_ctx);
+}
+EXPORT_SYMBOL_GPL(nfs_put_lock_context);
+
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * always ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	struct inode *inode;
+	struct nfs_server *server;
+
+	if (!(ctx->mode & FMODE_WRITE))
+		return;
+	if (!is_sync)
+		return;
+	inode = d_inode(ctx->dentry);
+	if (!list_empty(&NFS_I(inode)->open_files))
+		return;
+	server = NFS_SERVER(inode);
+	if (server->flags & NFS_MOUNT_NOCTO)
+		return;
+	nfs_revalidate_inode(server, inode);
+}
+EXPORT_SYMBOL_GPL(nfs_close_context);
+
+struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode)
+{
+	struct nfs_open_context *ctx;
+	struct rpc_cred *cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return ERR_CAST(cred);
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx) {
+		put_rpccred(cred);
+		return ERR_PTR(-ENOMEM);
+	}
+	nfs_sb_active(dentry->d_sb);
+	ctx->dentry = dget(dentry);
+	ctx->cred = cred;
+	ctx->state = NULL;
+	ctx->mode = f_mode;
+	ctx->flags = 0;
+	ctx->error = 0;
+	nfs_init_lock_context(&ctx->lock_context);
+	ctx->lock_context.open_context = ctx;
+	INIT_LIST_HEAD(&ctx->list);
+	ctx->mdsthreshold = NULL;
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(alloc_nfs_open_context);
+
+struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
+{
+	if (ctx != NULL)
+		atomic_inc(&ctx->lock_context.count);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(get_nfs_open_context);
+
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
+{
+	struct inode *inode = d_inode(ctx->dentry);
+	struct super_block *sb = ctx->dentry->d_sb;
+
+	if (!list_empty(&ctx->list)) {
+		if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+			return;
+		list_del(&ctx->list);
+		spin_unlock(&inode->i_lock);
+	} else if (!atomic_dec_and_test(&ctx->lock_context.count))
+		return;
+	if (inode != NULL)
+		NFS_PROTO(inode)->close_context(ctx, is_sync);
+	if (ctx->cred != NULL)
+		put_rpccred(ctx->cred);
+	dput(ctx->dentry);
+	nfs_sb_deactive(sb);
+	kfree(ctx->mdsthreshold);
+	kfree(ctx);
+}
+
+void put_nfs_open_context(struct nfs_open_context *ctx)
+{
+	__put_nfs_open_context(ctx, 0);
+}
+EXPORT_SYMBOL_GPL(put_nfs_open_context);
+
+/*
+ * Ensure that mmap has a recent RPC credential for use when writing out
+ * shared pages
+ */
+void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
+{
+	struct inode *inode = d_inode(ctx->dentry);
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	spin_lock(&inode->i_lock);
+	list_add(&ctx->list, &nfsi->open_files);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
+
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+	filp->private_data = get_nfs_open_context(ctx);
+	if (list_empty(&ctx->list))
+		nfs_inode_attach_open_context(ctx);
+}
+EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
+
+/*
+ * Given an inode, search for an open context with the desired characteristics
+ */
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_open_context *pos, *ctx = NULL;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(pos, &nfsi->open_files, list) {
+		if (cred != NULL && pos->cred != cred)
+			continue;
+		if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
+			continue;
+		ctx = get_nfs_open_context(pos);
+		break;
+	}
+	spin_unlock(&inode->i_lock);
+	return ctx;
+}
+
+static void nfs_file_clear_open_context(struct file *filp)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(filp);
+
+	if (ctx) {
+		struct inode *inode = d_inode(ctx->dentry);
+
+		filp->private_data = NULL;
+		spin_lock(&inode->i_lock);
+		list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
+		spin_unlock(&inode->i_lock);
+		__put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
+	}
+}
+
+/*
+ * These allocate and release file read/write context information.
+ */
+int nfs_open(struct inode *inode, struct file *filp)
+{
+	struct nfs_open_context *ctx;
+
+	ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	nfs_file_set_open_context(filp, ctx);
+	put_nfs_open_context(ctx);
+	nfs_fscache_open_file(inode, filp);
+	return 0;
+}
+
+int nfs_release(struct inode *inode, struct file *filp)
+{
+	nfs_file_clear_open_context(filp);
+	return 0;
+}
+
+/*
+ * This function is called whenever some part of NFS notices that
+ * the cached attributes have to be refreshed.
+ */
+int
+__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+{
+	int		 status = -ESTALE;
+	struct nfs4_label *label = NULL;
+	struct nfs_fattr *fattr = NULL;
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n",
+		inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode));
+
+	trace_nfs_revalidate_inode_enter(inode);
+
+	if (is_bad_inode(inode))
+		goto out;
+	if (NFS_STALE(inode))
+		goto out;
+
+	status = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto out;
+
+	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
+
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(label)) {
+		status = PTR_ERR(label);
+		goto out;
+	}
+
+	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
+	if (status != 0) {
+		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",
+			 inode->i_sb->s_id,
+			 (unsigned long long)NFS_FILEID(inode), status);
+		if (status == -ESTALE) {
+			nfs_zap_caches(inode);
+			if (!S_ISDIR(inode->i_mode))
+				set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+		}
+		goto err_out;
+	}
+
+	status = nfs_refresh_inode(inode, fattr);
+	if (status) {
+		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",
+			 inode->i_sb->s_id,
+			 (unsigned long long)NFS_FILEID(inode), status);
+		goto err_out;
+	}
+
+	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
+		nfs_zap_acl_cache(inode);
+
+	nfs_setsecurity(inode, fattr, label);
+
+	dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",
+		inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(inode));
+
+err_out:
+	nfs4_label_free(label);
+out:
+	nfs_free_fattr(fattr);
+	trace_nfs_revalidate_inode_exit(inode, status);
+	return status;
+}
+
+int nfs_attribute_timeout(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+}
+
+int nfs_attribute_cache_expired(struct inode *inode)
+{
+	if (nfs_have_delegated_attributes(inode))
+		return 0;
+	return nfs_attribute_timeout(inode);
+}
+
+/**
+ * nfs_revalidate_inode - Revalidate the inode attributes
+ * @server - pointer to nfs_server struct
+ * @inode - pointer to inode struct
+ *
+ * Updates inode attribute information by retrieving the data from the server.
+ */
+int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+{
+	if (!nfs_need_revalidate_inode(inode))
+		return NFS_STALE(inode) ? -ESTALE : 0;
+	return __nfs_revalidate_inode(server, inode);
+}
+EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
+
+int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
+{
+	if (!(NFS_I(inode)->cache_validity &
+			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+			&& !nfs_attribute_cache_expired(inode))
+		return NFS_STALE(inode) ? -ESTALE : 0;
+	return -ECHILD;
+}
+
+static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret;
+
+	if (mapping->nrpages != 0) {
+		if (S_ISREG(inode->i_mode)) {
+			unmap_mapping_range(mapping, 0, 0, 0);
+			ret = nfs_sync_mapping(mapping);
+			if (ret < 0)
+				return ret;
+		}
+		ret = invalidate_inode_pages2(mapping);
+		if (ret < 0)
+			return ret;
+	}
+	if (S_ISDIR(inode->i_mode)) {
+		spin_lock(&inode->i_lock);
+		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+		spin_unlock(&inode->i_lock);
+	}
+	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
+	nfs_fscache_wait_on_invalidate(inode);
+
+	dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n",
+			inode->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(inode));
+	return 0;
+}
+
+static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+{
+	if (nfs_have_delegated_attributes(inode))
+		return false;
+	return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE)
+		|| nfs_attribute_timeout(inode)
+		|| NFS_STALE(inode);
+}
+
+/**
+ * __nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ * @may_lock - take inode->i_mutex?
+ */
+static int __nfs_revalidate_mapping(struct inode *inode,
+		struct address_space *mapping,
+		bool may_lock)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long *bitlock = &nfsi->flags;
+	int ret = 0;
+
+	/* swapfiles are not supposed to be shared. */
+	if (IS_SWAPFILE(inode))
+		goto out;
+
+	if (nfs_mapping_need_revalidate_inode(inode)) {
+		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+		if (ret < 0)
+			goto out;
+	}
+
+	/*
+	 * We must clear NFS_INO_INVALID_DATA first to ensure that
+	 * invalidations that come in while we're shooting down the mappings
+	 * are respected. But, that leaves a race window where one revalidator
+	 * can clear the flag, and then another checks it before the mapping
+	 * gets invalidated. Fix that by serializing access to this part of
+	 * the function.
+	 *
+	 * At the same time, we need to allow other tasks to see whether we
+	 * might be in the middle of invalidating the pages, so we only set
+	 * the bit lock here if it looks like we're going to be doing that.
+	 */
+	for (;;) {
+		ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
+					 nfs_wait_bit_killable, TASK_KILLABLE);
+		if (ret)
+			goto out;
+		spin_lock(&inode->i_lock);
+		if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+			break;
+		spin_unlock(&inode->i_lock);
+		goto out;
+	}
+
+	set_bit(NFS_INO_INVALIDATING, bitlock);
+	smp_wmb();
+	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	spin_unlock(&inode->i_lock);
+	trace_nfs_invalidate_mapping_enter(inode);
+	if (may_lock) {
+		mutex_lock(&inode->i_mutex);
+		ret = nfs_invalidate_mapping(inode, mapping);
+		mutex_unlock(&inode->i_mutex);
+	} else
+		ret = nfs_invalidate_mapping(inode, mapping);
+	trace_nfs_invalidate_mapping_exit(inode, ret);
+
+	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+	smp_mb__after_atomic();
+	wake_up_bit(bitlock, NFS_INO_INVALIDATING);
+out:
+	return ret;
+}
+
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+	return __nfs_revalidate_mapping(inode, mapping, false);
+}
+
+/**
+ * nfs_revalidate_mapping_protected - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ *
+ * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
+ * while invalidating the mapping.
+ */
+int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
+{
+	return __nfs_revalidate_mapping(inode, mapping, true);
+}
+
+static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long ret = 0;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+			&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+			&& inode->i_version == fattr->pre_change_attr) {
+		inode->i_version = fattr->change_attr;
+		if (S_ISDIR(inode->i_mode))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+		ret |= NFS_INO_INVALID_ATTR;
+	}
+	/* If we have atomic WCC data, we may update some attributes */
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
+			&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
+			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
+		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+		ret |= NFS_INO_INVALID_ATTR;
+	}
+
+	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+			&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
+			&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+		if (S_ISDIR(inode->i_mode))
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+		ret |= NFS_INO_INVALID_ATTR;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
+			&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+			&& nfsi->nrequests == 0) {
+		i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+		ret |= NFS_INO_INVALID_ATTR;
+	}
+
+	return ret;
+}
+
+/**
+ * nfs_check_inode_attributes - verify consistency of the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * Verifies the attribute cache. If we have just changed the attributes,
+ * so that fattr carries weak cache consistency data, then it may
+ * also update the ctime/mtime/change_attribute.
+ */
+static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	loff_t cur_size, new_isize;
+	unsigned long invalid = 0;
+
+
+	if (nfs_have_delegated_attributes(inode))
+		return 0;
+	/* Has the inode gone and changed behind our back? */
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+		return -EIO;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+		return -EIO;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+			inode->i_version != fattr->change_attr)
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+
+	/* Verify a few of the more important attributes */
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
+		invalid |= NFS_INO_INVALID_ATTR;
+
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		cur_size = i_size_read(inode);
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		if (cur_size != new_isize && nfsi->nrequests == 0)
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	}
+
+	/* Have any file permissions changed? */
+	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+
+	/* Has the link count changed? */
+	if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
+		invalid |= NFS_INO_INVALID_ATTR;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
+		invalid |= NFS_INO_INVALID_ATIME;
+
+	if (invalid != 0)
+		nfs_set_cache_invalid(inode, invalid);
+
+	nfsi->read_cache_jiffies = fattr->time_start;
+	return 0;
+}
+
+static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
+		return 0;
+	return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
+}
+
+static atomic_long_t nfs_attr_generation_counter;
+
+static unsigned long nfs_read_attr_generation_counter(void)
+{
+	return atomic_long_read(&nfs_attr_generation_counter);
+}
+
+unsigned long nfs_inc_attr_generation_counter(void)
+{
+	return atomic_long_inc_return(&nfs_attr_generation_counter);
+}
+EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter);
+
+void nfs_fattr_init(struct nfs_fattr *fattr)
+{
+	fattr->valid = 0;
+	fattr->time_start = jiffies;
+	fattr->gencount = nfs_inc_attr_generation_counter();
+	fattr->owner_name = NULL;
+	fattr->group_name = NULL;
+}
+EXPORT_SYMBOL_GPL(nfs_fattr_init);
+
+/**
+ * nfs_fattr_set_barrier
+ * @fattr: attributes
+ *
+ * Used to set a barrier after an attribute was updated. This
+ * barrier ensures that older attributes from RPC calls that may
+ * have raced with our update cannot clobber these new values.
+ * Note that you are still responsible for ensuring that other
+ * operations which change the attribute on the server do not
+ * collide.
+ */
+void nfs_fattr_set_barrier(struct nfs_fattr *fattr)
+{
+	fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
+struct nfs_fattr *nfs_alloc_fattr(void)
+{
+	struct nfs_fattr *fattr;
+
+	fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
+	if (fattr != NULL)
+		nfs_fattr_init(fattr);
+	return fattr;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fattr);
+
+struct nfs_fh *nfs_alloc_fhandle(void)
+{
+	struct nfs_fh *fh;
+
+	fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
+	if (fh != NULL)
+		fh->size = 0;
+	return fh;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fhandle);
+
+#ifdef NFS_DEBUG
+/*
+ * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle
+ *                             in the same way that wireshark does
+ *
+ * @fh: file handle
+ *
+ * For debugging only.
+ */
+u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
+{
+	/* wireshark uses 32-bit AUTODIN crc and does a bitwise
+	 * not on the result */
+	return nfs_fhandle_hash(fh);
+}
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash);
+
+/*
+ * _nfs_display_fhandle - display an NFS file handle on the console
+ *
+ * @fh: file handle to display
+ * @caption: display caption
+ *
+ * For debugging only.
+ */
+void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
+{
+	unsigned short i;
+
+	if (fh == NULL || fh->size == 0) {
+		printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh);
+		return;
+	}
+
+	printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n",
+	       caption, fh, fh->size, _nfs_display_fhandle_hash(fh));
+	for (i = 0; i < fh->size; i += 16) {
+		__be32 *pos = (__be32 *)&fh->data[i];
+
+		switch ((fh->size - i - 1) >> 2) {
+		case 0:
+			printk(KERN_DEFAULT " %08x\n",
+				be32_to_cpup(pos));
+			break;
+		case 1:
+			printk(KERN_DEFAULT " %08x %08x\n",
+				be32_to_cpup(pos), be32_to_cpup(pos + 1));
+			break;
+		case 2:
+			printk(KERN_DEFAULT " %08x %08x %08x\n",
+				be32_to_cpup(pos), be32_to_cpup(pos + 1),
+				be32_to_cpup(pos + 2));
+			break;
+		default:
+			printk(KERN_DEFAULT " %08x %08x %08x %08x\n",
+				be32_to_cpup(pos), be32_to_cpup(pos + 1),
+				be32_to_cpup(pos + 2), be32_to_cpup(pos + 3));
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
+#endif
+
+/**
+ * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * @inode - pointer to inode
+ * @fattr - attributes
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * To do so, the function first assumes that a more recent ctime means
+ * that the attributes in fattr are newer, however it also attempt to
+ * catch the case where ctime either didn't change, or went backwards
+ * (if someone reset the clock on the server) by looking at whether
+ * or not this RPC call was started after the inode was last updated.
+ * Note also the check for wraparound of 'attr_gencount'
+ *
+ * The function returns 'true' if it thinks the attributes in 'fattr' are
+ * more recent than the ones cached in the inode.
+ *
+ */
+static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	const struct nfs_inode *nfsi = NFS_I(inode);
+
+	return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
+		nfs_ctime_need_update(inode, fattr) ||
+		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
+}
+
+/*
+ * Don't trust the change_attribute, mtime, ctime or size if
+ * a pnfs LAYOUTCOMMIT is outstanding
+ */
+static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
+		struct nfs_fattr *fattr)
+{
+	if (pnfs_layoutcommit_outstanding(inode))
+		fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
+				NFS_ATTR_FATTR_MTIME |
+				NFS_ATTR_FATTR_CTIME |
+				NFS_ATTR_FATTR_SIZE);
+}
+
+static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int ret;
+
+	trace_nfs_refresh_inode_enter(inode);
+
+	nfs_inode_attrs_handle_layoutcommit(inode, fattr);
+
+	if (nfs_inode_attrs_need_update(inode, fattr))
+		ret = nfs_update_inode(inode, fattr);
+	else
+		ret = nfs_check_inode_attributes(inode, fattr);
+
+	trace_nfs_refresh_inode_exit(inode, ret);
+	return ret;
+}
+
+/**
+ * nfs_refresh_inode - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * Check that an RPC call that returned attributes has not overlapped with
+ * other recent updates of the inode metadata, then decide whether it is
+ * safe to do a full update of the inode attributes, or whether just to
+ * call nfs_check_inode_attributes.
+ */
+int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int status;
+
+	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+		return 0;
+	spin_lock(&inode->i_lock);
+	status = nfs_refresh_inode_locked(inode, fattr);
+	spin_unlock(&inode->i_lock);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_refresh_inode);
+
+static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+
+	if (S_ISDIR(inode->i_mode))
+		invalid |= NFS_INO_INVALID_DATA;
+	nfs_set_cache_invalid(inode, invalid);
+	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+		return 0;
+	return nfs_refresh_inode_locked(inode, fattr);
+}
+
+/**
+ * nfs_post_op_update_inode - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it.
+ *
+ * NB: if the server didn't return any post op attributes, this
+ * function will force the retrieval of attributes before the next
+ * NFS request.  Thus it should be used only for operations that
+ * are expected to change one or more attributes, to avoid
+ * unnecessary NFS requests and trips through nfs_update_inode().
+ */
+int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int status;
+
+	spin_lock(&inode->i_lock);
+	nfs_fattr_set_barrier(fattr);
+	status = nfs_post_op_update_inode_locked(inode, fattr);
+	spin_unlock(&inode->i_lock);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
+
+/**
+ * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int status;
+
+	/* Don't do a WCC update if these attributes are already stale */
+	if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
+			!nfs_inode_attrs_need_update(inode, fattr)) {
+		fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+				| NFS_ATTR_FATTR_PRESIZE
+				| NFS_ATTR_FATTR_PREMTIME
+				| NFS_ATTR_FATTR_PRECTIME);
+		goto out_noforce;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
+		fattr->pre_change_attr = inode->i_version;
+		fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
+		memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+		fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
+		memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+		fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
+		fattr->pre_size = i_size_read(inode);
+		fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
+	}
+out_noforce:
+	status = nfs_post_op_update_inode_locked(inode, fattr);
+	return status;
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int status;
+
+	spin_lock(&inode->i_lock);
+	nfs_fattr_set_barrier(fattr);
+	status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
+	spin_unlock(&inode->i_lock);
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
+
+
+static inline bool nfs_fileid_valid(struct nfs_inode *nfsi,
+				    struct nfs_fattr *fattr)
+{
+	bool ret1 = true, ret2 = true;
+
+	if (fattr->valid & NFS_ATTR_FATTR_FILEID)
+		ret1 = (nfsi->fileid == fattr->fileid);
+	if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+		ret2 = (nfsi->fileid == fattr->mounted_on_fileid);
+	return ret1 || ret2;
+}
+
+/*
+ * Many nfs protocol calls return the new file attributes after
+ * an operation.  Here we update the inode to reflect the state
+ * of the server's inode.
+ *
+ * This is a bit tricky because we have to make sure all dirty pages
+ * have been sent off to the server before calling invalidate_inode_pages.
+ * To make sure no other process adds more write requests while we try
+ * our best to flush them, we make them sleep during the attribute refresh.
+ *
+ * A very similar scenario holds for the dir cache.
+ */
+static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	struct nfs_server *server;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	loff_t cur_isize, new_isize;
+	unsigned long invalid = 0;
+	unsigned long now = jiffies;
+	unsigned long save_cache_validity;
+
+	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
+			__func__, inode->i_sb->s_id, inode->i_ino,
+			nfs_display_fhandle_hash(NFS_FH(inode)),
+			atomic_read(&inode->i_count), fattr->valid);
+
+	if (!nfs_fileid_valid(nfsi, fattr)) {
+		printk(KERN_ERR "NFS: server %s error: fileid changed\n"
+			"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
+			NFS_SERVER(inode)->nfs_client->cl_hostname,
+			inode->i_sb->s_id, (long long)nfsi->fileid,
+			(long long)fattr->fileid);
+		goto out_err;
+	}
+
+	/*
+	 * Make sure the inode's type hasn't changed.
+	 */
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+		/*
+		* Big trouble! The inode has become a different object.
+		*/
+		printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n",
+				__func__, inode->i_ino, inode->i_mode, fattr->mode);
+		goto out_err;
+	}
+
+	server = NFS_SERVER(inode);
+	/* Update the fsid? */
+	if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+			!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
+			!IS_AUTOMOUNT(inode))
+		server->fsid = fattr->fsid;
+
+	/*
+	 * Update the read time so we don't revalidate too often.
+	 */
+	nfsi->read_cache_jiffies = fattr->time_start;
+
+	save_cache_validity = nfsi->cache_validity;
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+			| NFS_INO_INVALID_ATIME
+			| NFS_INO_REVAL_FORCED
+			| NFS_INO_REVAL_PAGECACHE);
+
+	/* Do atomic weak cache consistency updates */
+	invalid |= nfs_wcc_update_inode(inode, fattr);
+
+	/* More cache consistency checks */
+	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+		if (inode->i_version != fattr->change_attr) {
+			dprintk("NFS: change_attr change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
+			invalid |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_PAGECACHE;
+			if (S_ISDIR(inode->i_mode))
+				nfs_force_lookup_revalidate(inode);
+			inode->i_version = fattr->change_attr;
+		}
+	} else if (server->caps & NFS_CAP_CHANGE_ATTR)
+		nfsi->cache_validity |= save_cache_validity;
+
+	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
+		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+	} else if (server->caps & NFS_CAP_MTIME)
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_FORCED);
+
+	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
+		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+	} else if (server->caps & NFS_CAP_CTIME)
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_FORCED);
+
+	/* Check if our cached file size is stale */
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		cur_isize = i_size_read(inode);
+		if (new_isize != cur_isize) {
+			/* Do we perhaps have any outstanding writes, or has
+			 * the file grown beyond our last write? */
+			if ((nfsi->nrequests == 0) || new_isize > cur_isize) {
+				i_size_write(inode, new_isize);
+				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+				invalid &= ~NFS_INO_REVAL_PAGECACHE;
+			}
+			dprintk("NFS: isize change on server for file %s/%ld "
+					"(%Ld to %Ld)\n",
+					inode->i_sb->s_id,
+					inode->i_ino,
+					(long long)cur_isize,
+					(long long)new_isize);
+		}
+	} else
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_PAGECACHE
+				| NFS_INO_REVAL_FORCED);
+
+
+	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+		memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+	else if (server->caps & NFS_CAP_ATIME)
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATIME
+				| NFS_INO_REVAL_FORCED);
+
+	if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+		if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+			umode_t newmode = inode->i_mode & S_IFMT;
+			newmode |= fattr->mode & S_IALLUGO;
+			inode->i_mode = newmode;
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		}
+	} else if (server->caps & NFS_CAP_MODE)
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
+
+	if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+		if (!uid_eq(inode->i_uid, fattr->uid)) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_uid = fattr->uid;
+		}
+	} else if (server->caps & NFS_CAP_OWNER)
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
+
+	if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+		if (!gid_eq(inode->i_gid, fattr->gid)) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_gid = fattr->gid;
+		}
+	} else if (server->caps & NFS_CAP_OWNER_GROUP)
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
+
+	if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+		if (inode->i_nlink != fattr->nlink) {
+			invalid |= NFS_INO_INVALID_ATTR;
+			if (S_ISDIR(inode->i_mode))
+				invalid |= NFS_INO_INVALID_DATA;
+			set_nlink(inode, fattr->nlink);
+		}
+	} else if (server->caps & NFS_CAP_NLINK)
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_FORCED);
+
+	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+		/*
+		 * report the blocks in 512byte units
+		 */
+		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+ 	}
+	if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+		inode->i_blocks = fattr->du.nfs2.blocks;
+
+	/* Update attrtimeo value if we're out of the unstable period */
+	if (invalid & NFS_INO_INVALID_ATTR) {
+		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
+		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+		nfsi->attrtimeo_timestamp = now;
+		/* Set barrier to be more recent than all outstanding updates */
+		nfsi->attr_gencount = nfs_inc_attr_generation_counter();
+	} else {
+		if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+			if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
+				nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+			nfsi->attrtimeo_timestamp = now;
+		}
+		/* Set the barrier to be more recent than this fattr */
+		if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+			nfsi->attr_gencount = fattr->gencount;
+	}
+	invalid &= ~NFS_INO_INVALID_ATTR;
+	/* Don't invalidate the data if we were to blame */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+				|| S_ISLNK(inode->i_mode)))
+		invalid &= ~NFS_INO_INVALID_DATA;
+	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) ||
+			(save_cache_validity & NFS_INO_REVAL_FORCED))
+		nfs_set_cache_invalid(inode, invalid);
+
+	return 0;
+ out_err:
+	/*
+	 * No need to worry about unhashing the dentry, as the
+	 * lookup validation will know that the inode is bad.
+	 * (But we fall through to invalidate the caches.)
+	 */
+	nfs_invalidate_inode(inode);
+	return -ESTALE;
+}
+
+struct inode *nfs_alloc_inode(struct super_block *sb)
+{
+	struct nfs_inode *nfsi;
+	nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+	if (!nfsi)
+		return NULL;
+	nfsi->flags = 0UL;
+	nfsi->cache_validity = 0UL;
+#if IS_ENABLED(CONFIG_NFS_V4)
+	nfsi->nfs4_acl = NULL;
+#endif /* CONFIG_NFS_V4 */
+	return &nfsi->vfs_inode;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_inode);
+
+static void nfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
+}
+
+void nfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, nfs_i_callback);
+}
+EXPORT_SYMBOL_GPL(nfs_destroy_inode);
+
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#if IS_ENABLED(CONFIG_NFS_V4)
+	INIT_LIST_HEAD(&nfsi->open_states);
+	nfsi->delegation = NULL;
+	init_rwsem(&nfsi->rwsem);
+	nfsi->layout = NULL;
+#endif
+}
+
+static void init_once(void *foo)
+{
+	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
+
+	inode_init_once(&nfsi->vfs_inode);
+	INIT_LIST_HEAD(&nfsi->open_files);
+	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
+	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
+	INIT_LIST_HEAD(&nfsi->commit_info.list);
+	nfsi->nrequests = 0;
+	nfsi->commit_info.ncommit = 0;
+	atomic_set(&nfsi->commit_info.rpcs_out, 0);
+	atomic_set(&nfsi->silly_count, 1);
+	INIT_HLIST_HEAD(&nfsi->silly_list);
+	init_waitqueue_head(&nfsi->waitqueue);
+	nfs4_init_once(nfsi);
+}
+
+static int __init nfs_init_inodecache(void)
+{
+	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
+					     sizeof(struct nfs_inode),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (nfs_inode_cachep == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void nfs_destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(nfs_inode_cachep);
+}
+
+struct workqueue_struct *nfsiod_workqueue;
+EXPORT_SYMBOL_GPL(nfsiod_workqueue);
+
+/*
+ * start up the nfsiod workqueue
+ */
+static int nfsiod_start(void)
+{
+	struct workqueue_struct *wq;
+	dprintk("RPC:       creating workqueue nfsiod\n");
+	wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
+	if (wq == NULL)
+		return -ENOMEM;
+	nfsiod_workqueue = wq;
+	return 0;
+}
+
+/*
+ * Destroy the nfsiod workqueue
+ */
+static void nfsiod_stop(void)
+{
+	struct workqueue_struct *wq;
+
+	wq = nfsiod_workqueue;
+	if (wq == NULL)
+		return;
+	nfsiod_workqueue = NULL;
+	destroy_workqueue(wq);
+}
+
+int nfs_net_id;
+EXPORT_SYMBOL_GPL(nfs_net_id);
+
+static int nfs_net_init(struct net *net)
+{
+	nfs_clients_init(net);
+	return nfs_fs_proc_net_init(net);
+}
+
+static void nfs_net_exit(struct net *net)
+{
+	nfs_fs_proc_net_exit(net);
+	nfs_cleanup_cb_ident_idr(net);
+}
+
+static struct pernet_operations nfs_net_ops = {
+	.init = nfs_net_init,
+	.exit = nfs_net_exit,
+	.id   = &nfs_net_id,
+	.size = sizeof(struct nfs_net),
+};
+
+/*
+ * Initialize NFS
+ */
+static int __init init_nfs_fs(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&nfs_net_ops);
+	if (err < 0)
+		goto out9;
+
+	err = nfs_fscache_register();
+	if (err < 0)
+		goto out8;
+
+	err = nfsiod_start();
+	if (err)
+		goto out7;
+
+	err = nfs_fs_proc_init();
+	if (err)
+		goto out6;
+
+	err = nfs_init_nfspagecache();
+	if (err)
+		goto out5;
+
+	err = nfs_init_inodecache();
+	if (err)
+		goto out4;
+
+	err = nfs_init_readpagecache();
+	if (err)
+		goto out3;
+
+	err = nfs_init_writepagecache();
+	if (err)
+		goto out2;
+
+	err = nfs_init_directcache();
+	if (err)
+		goto out1;
+
+#ifdef CONFIG_PROC_FS
+	rpc_proc_register(&init_net, &nfs_rpcstat);
+#endif
+	if ((err = register_nfs_fs()) != 0)
+		goto out0;
+
+	return 0;
+out0:
+#ifdef CONFIG_PROC_FS
+	rpc_proc_unregister(&init_net, "nfs");
+#endif
+	nfs_destroy_directcache();
+out1:
+	nfs_destroy_writepagecache();
+out2:
+	nfs_destroy_readpagecache();
+out3:
+	nfs_destroy_inodecache();
+out4:
+	nfs_destroy_nfspagecache();
+out5:
+	nfs_fs_proc_exit();
+out6:
+	nfsiod_stop();
+out7:
+	nfs_fscache_unregister();
+out8:
+	unregister_pernet_subsys(&nfs_net_ops);
+out9:
+	return err;
+}
+
+static void __exit exit_nfs_fs(void)
+{
+	nfs_destroy_directcache();
+	nfs_destroy_writepagecache();
+	nfs_destroy_readpagecache();
+	nfs_destroy_inodecache();
+	nfs_destroy_nfspagecache();
+	nfs_fscache_unregister();
+	unregister_pernet_subsys(&nfs_net_ops);
+#ifdef CONFIG_PROC_FS
+	rpc_proc_unregister(&init_net, "nfs");
+#endif
+	unregister_nfs_fs();
+	nfs_fs_proc_exit();
+	nfsiod_stop();
+}
+
+/* Not quite true; I just maintain it */
+MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_LICENSE("GPL");
+module_param(enable_ino64, bool, 0644);
+
+module_init(init_nfs_fs)
+module_exit(exit_nfs_fs)
diff --git a/kernel/fs/nfs/internal.h b/kernel/fs/nfs/internal.h
new file mode 100644
index 000000000..9e6475bc5
--- /dev/null
+++ b/kernel/fs/nfs/internal.h
@@ -0,0 +1,683 @@
+/*
+ * NFS internal definitions
+ */
+
+#include "nfs4_fs.h"
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/crc32.h>
+#include <linux/nfs_page.h>
+
+#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
+
+struct nfs_string;
+
+/* Maximum number of readahead requests
+ * FIXME: this should really be a sysctl so that users may tune it to suit
+ *        their needs. People that do NFS over a slow network, might for
+ *        instance want to reduce it to something closer to 1 for improved
+ *        interactive response.
+ */
+#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
+
+static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
+{
+	if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
+		fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
+}
+
+static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
+{
+	if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) ||
+	    (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
+	     ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
+		return 0;
+	return 1;
+}
+
+struct nfs_clone_mount {
+	const struct super_block *sb;
+	const struct dentry *dentry;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
+	char *hostname;
+	char *mnt_path;
+	struct sockaddr *addr;
+	size_t addrlen;
+	rpc_authflavor_t authflavor;
+};
+
+/*
+ * Note: RFC 1813 doesn't limit the number of auth flavors that
+ * a server can return, so make something up.
+ */
+#define NFS_MAX_SECFLAVORS	(12)
+
+/*
+ * Value used if the user did not specify a port value.
+ */
+#define NFS_UNSPEC_PORT		(-1)
+
+/*
+ * Maximum number of pages that readdir can use for creating
+ * a vmapped array of pages.
+ */
+#define NFS_MAX_READDIR_PAGES 8
+
+struct nfs_client_initdata {
+	unsigned long init_flags;
+	const char *hostname;
+	const struct sockaddr *addr;
+	size_t addrlen;
+	struct nfs_subversion *nfs_mod;
+	int proto;
+	u32 minorversion;
+	struct net *net;
+};
+
+/*
+ * In-kernel mount arguments
+ */
+struct nfs_parsed_mount_data {
+	int			flags;
+	unsigned int		rsize, wsize;
+	unsigned int		timeo, retrans;
+	unsigned int		acregmin, acregmax,
+				acdirmin, acdirmax;
+	unsigned int		namlen;
+	unsigned int		options;
+	unsigned int		bsize;
+	struct nfs_auth_info	auth_info;
+	rpc_authflavor_t	selected_flavor;
+	char			*client_address;
+	unsigned int		version;
+	unsigned int		minorversion;
+	char			*fscache_uniq;
+	bool			need_mount;
+
+	struct {
+		struct sockaddr_storage	address;
+		size_t			addrlen;
+		char			*hostname;
+		u32			version;
+		int			port;
+		unsigned short		protocol;
+	} mount_server;
+
+	struct {
+		struct sockaddr_storage	address;
+		size_t			addrlen;
+		char			*hostname;
+		char			*export_path;
+		int			port;
+		unsigned short		protocol;
+	} nfs_server;
+
+	struct security_mnt_opts lsm_opts;
+	struct net		*net;
+};
+
+/* mount_clnt.c */
+struct nfs_mount_request {
+	struct sockaddr		*sap;
+	size_t			salen;
+	char			*hostname;
+	char			*dirpath;
+	u32			version;
+	unsigned short		protocol;
+	struct nfs_fh		*fh;
+	int			noresvport;
+	unsigned int		*auth_flav_len;
+	rpc_authflavor_t	*auth_flavs;
+	struct net		*net;
+};
+
+struct nfs_mount_info {
+	void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+	int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
+	struct nfs_parsed_mount_data *parsed;
+	struct nfs_clone_mount *cloned;
+	struct nfs_fh *mntfh;
+};
+
+extern int nfs_mount(struct nfs_mount_request *info);
+extern void nfs_umount(const struct nfs_mount_request *info);
+
+/* client.c */
+extern const struct rpc_program nfs_program;
+extern void nfs_clients_init(struct net *net);
+extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
+int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t);
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
+				  const struct rpc_timeout *, const char *,
+				  rpc_authflavor_t);
+int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
+void nfs_server_insert_lists(struct nfs_server *);
+void nfs_server_remove_lists(struct nfs_server *);
+void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);
+int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
+		rpc_authflavor_t);
+struct nfs_server *nfs_alloc_server(void);
+void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *);
+
+extern void nfs_cleanup_cb_ident_idr(struct net *);
+extern void nfs_put_client(struct nfs_client *);
+extern void nfs_free_client(struct nfs_client *);
+extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
+extern struct nfs_client *
+nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
+				struct nfs4_sessionid *, u32);
+extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
+					struct nfs_subversion *);
+extern struct nfs_server *nfs4_create_server(
+					struct nfs_mount_info *,
+					struct nfs_subversion *);
+extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
+						      struct nfs_fh *);
+extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
+					struct sockaddr *sap, size_t salen,
+					struct net *net);
+extern void nfs_free_server(struct nfs_server *server);
+extern struct nfs_server *nfs_clone_server(struct nfs_server *,
+					   struct nfs_fh *,
+					   struct nfs_fattr *,
+					   rpc_authflavor_t);
+extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
+extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+					     const struct sockaddr *ds_addr,
+					     int ds_addrlen, int ds_proto,
+					     unsigned int ds_timeo,
+					     unsigned int ds_retrans,
+					     u32 minor_version,
+					     rpc_authflavor_t au_flavor);
+extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
+						struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+			const struct sockaddr *ds_addr, int ds_addrlen,
+			int ds_proto, unsigned int ds_timeo,
+			unsigned int ds_retrans, rpc_authflavor_t au_flavor);
+#ifdef CONFIG_PROC_FS
+extern int __init nfs_fs_proc_init(void);
+extern void nfs_fs_proc_exit(void);
+extern int nfs_fs_proc_net_init(struct net *net);
+extern void nfs_fs_proc_net_exit(struct net *net);
+#else
+static inline int nfs_fs_proc_net_init(struct net *net)
+{
+	return 0;
+}
+static inline void nfs_fs_proc_net_exit(struct net *net)
+{
+}
+static inline int nfs_fs_proc_init(void)
+{
+	return 0;
+}
+static inline void nfs_fs_proc_exit(void)
+{
+}
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
+#endif
+
+/* callback_xdr.c */
+extern struct svc_version nfs4_callback_version1;
+extern struct svc_version nfs4_callback_version4;
+
+struct nfs_pageio_descriptor;
+/* pagelist.c */
+extern int __init nfs_init_nfspagecache(void);
+extern void nfs_destroy_nfspagecache(void);
+extern int __init nfs_init_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
+extern int __init nfs_init_writepagecache(void);
+extern void nfs_destroy_writepagecache(void);
+
+extern int __init nfs_init_directcache(void);
+extern void nfs_destroy_directcache(void);
+extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+			      struct nfs_pgio_header *hdr,
+			      void (*release)(struct nfs_pgio_header *hdr));
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
+int nfs_iocounter_wait(struct nfs_io_counter *c);
+
+extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
+void nfs_pgio_header_free(struct nfs_pgio_header *);
+void nfs_pgio_data_destroy(struct nfs_pgio_header *);
+int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+		      const struct rpc_call_ops *call_ops, int how, int flags);
+void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
+
+static inline void nfs_iocounter_init(struct nfs_io_counter *c)
+{
+	c->flags = 0;
+	atomic_set(&c->io_count, 0);
+}
+
+static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
+{
+	WARN_ON_ONCE(desc->pg_mirror_count < 1);
+	return desc->pg_mirror_count > 1;
+}
+
+/* nfs2xdr.c */
+extern struct rpc_procinfo nfs_procedures[];
+extern int nfs2_decode_dirent(struct xdr_stream *,
+				struct nfs_entry *, int);
+
+/* nfs3xdr.c */
+extern struct rpc_procinfo nfs3_procedures[];
+extern int nfs3_decode_dirent(struct xdr_stream *,
+				struct nfs_entry *, int);
+
+/* nfs4xdr.c */
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern int nfs4_decode_dirent(struct xdr_stream *,
+				struct nfs_entry *, int);
+#endif
+#ifdef CONFIG_NFS_V4_1
+extern const u32 nfs41_maxread_overhead;
+extern const u32 nfs41_maxwrite_overhead;
+extern const u32 nfs41_maxgetdevinfo_overhead;
+#endif
+
+/* nfs4proc.c */
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern struct rpc_procinfo nfs4_procedures[];
+#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
+static inline void nfs4_label_free(struct nfs4_label *label)
+{
+	if (label) {
+		kfree(label->label);
+		kfree(label);
+	}
+	return;
+}
+
+static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
+{
+	if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL))
+		nfsi->cache_validity |= NFS_INO_INVALID_LABEL;
+}
+#else
+static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
+static inline void nfs4_label_free(void *label) {}
+static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
+{
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
+			   const struct rpc_timeout *timeparms,
+			   const char *ip_addr);
+
+/* dir.c */
+extern void nfs_force_use_readdirplus(struct inode *dir);
+extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc);
+extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc);
+struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
+int nfs_create(struct inode *, struct dentry *, umode_t, bool);
+int nfs_mkdir(struct inode *, struct dentry *, umode_t);
+int nfs_rmdir(struct inode *, struct dentry *);
+int nfs_unlink(struct inode *, struct dentry *);
+int nfs_symlink(struct inode *, struct dentry *, const char *);
+int nfs_link(struct dentry *, struct inode *, struct dentry *);
+int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
+
+/* file.c */
+int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
+loff_t nfs_file_llseek(struct file *, loff_t, int);
+int nfs_file_flush(struct file *, fl_owner_t);
+ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
+ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
+			     size_t, unsigned int);
+int nfs_file_mmap(struct file *, struct vm_area_struct *);
+ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
+int nfs_file_release(struct inode *, struct file *);
+int nfs_lock(struct file *, int, struct file_lock *);
+int nfs_flock(struct file *, int, struct file_lock *);
+int nfs_check_flags(int);
+
+/* inode.c */
+extern struct workqueue_struct *nfsiod_workqueue;
+extern struct inode *nfs_alloc_inode(struct super_block *sb);
+extern void nfs_destroy_inode(struct inode *);
+extern int nfs_write_inode(struct inode *, struct writeback_control *);
+extern int nfs_drop_inode(struct inode *);
+extern void nfs_clear_inode(struct inode *);
+extern void nfs_evict_inode(struct inode *);
+void nfs_zap_acl_cache(struct inode *inode);
+extern int nfs_wait_bit_killable(struct wait_bit_key *key);
+
+/* super.c */
+extern const struct super_operations nfs_sops;
+extern struct file_system_type nfs_fs_type;
+extern struct file_system_type nfs_xdev_fs_type;
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern struct file_system_type nfs4_xdev_fs_type;
+extern struct file_system_type nfs4_referral_fs_type;
+#endif
+bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
+struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *,
+			struct nfs_subversion *);
+void nfs_initialise_sb(struct super_block *);
+int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *);
+int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *);
+struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *,
+				   struct nfs_mount_info *, struct nfs_subversion *);
+struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *);
+struct dentry * nfs_xdev_mount_common(struct file_system_type *, int,
+		const char *, struct nfs_mount_info *);
+void nfs_kill_super(struct super_block *);
+void nfs_fill_super(struct super_block *, struct nfs_mount_info *);
+
+extern struct rpc_stat nfs_rpcstat;
+
+extern int __init register_nfs_fs(void);
+extern void __exit unregister_nfs_fs(void);
+extern bool nfs_sb_active(struct super_block *sb);
+extern void nfs_sb_deactive(struct super_block *sb);
+
+/* namespace.c */
+#define NFS_PATH_CANONICAL 1
+extern char *nfs_path(char **p, struct dentry *dentry,
+		      char *buffer, ssize_t buflen, unsigned flags);
+extern struct vfsmount *nfs_d_automount(struct path *path);
+struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *,
+			      struct nfs_fh *, struct nfs_fattr *);
+struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *,
+				 struct nfs_fattr *, rpc_authflavor_t);
+
+/* getroot.c */
+extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
+				   const char *);
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
+				    const char *);
+
+extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool);
+#endif
+
+struct nfs_pgio_completion_ops;
+/* read.c */
+extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+			struct inode *inode, bool force_mds,
+			const struct nfs_pgio_completion_ops *compl_ops);
+extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
+
+/* super.c */
+void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
+void nfs_umount_begin(struct super_block *);
+int  nfs_statfs(struct dentry *, struct kstatfs *);
+int  nfs_show_options(struct seq_file *, struct dentry *);
+int  nfs_show_devname(struct seq_file *, struct dentry *);
+int  nfs_show_path(struct seq_file *, struct dentry *);
+int  nfs_show_stats(struct seq_file *, struct dentry *);
+int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
+
+/* write.c */
+extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+			struct inode *inode, int ioflags, bool force_mds,
+			const struct nfs_pgio_completion_ops *compl_ops);
+extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_commit_free(struct nfs_commit_data *p);
+extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_initiate_commit(struct rpc_clnt *clnt,
+			       struct nfs_commit_data *data,
+			       const struct nfs_rpc_ops *nfs_ops,
+			       const struct rpc_call_ops *call_ops,
+			       int how, int flags);
+extern void nfs_init_commit(struct nfs_commit_data *data,
+			    struct list_head *head,
+			    struct pnfs_layout_segment *lseg,
+			    struct nfs_commit_info *cinfo);
+int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+			 struct nfs_commit_info *cinfo, int max);
+unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);
+int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		    struct nfs_commit_info *cinfo);
+void nfs_mark_request_commit(struct nfs_page *req,
+			     struct pnfs_layout_segment *lseg,
+			     struct nfs_commit_info *cinfo,
+			     u32 ds_commit_idx);
+int nfs_write_need_commit(struct nfs_pgio_header *);
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+			    int how, struct nfs_commit_info *cinfo);
+void nfs_retry_commit(struct list_head *page_list,
+		      struct pnfs_layout_segment *lseg,
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx);
+void nfs_commitdata_release(struct nfs_commit_data *data);
+void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+				 struct nfs_commit_info *cinfo);
+void nfs_request_remove_commit_list(struct nfs_page *req,
+				    struct nfs_commit_info *cinfo);
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq);
+int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
+
+#ifdef CONFIG_MIGRATION
+extern int nfs_migrate_page(struct address_space *,
+		struct page *, struct page *, enum migrate_mode);
+#else
+#define nfs_migrate_page NULL
+#endif
+
+/* unlink.c */
+extern struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+		 struct dentry *old_dentry, struct dentry *new_dentry,
+		 void (*complete)(struct rpc_task *, struct nfs_renamedata *));
+extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
+
+/* direct.c */
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+			      struct nfs_direct_req *dreq);
+static inline void nfs_inode_dio_wait(struct inode *inode)
+{
+	inode_dio_wait(inode);
+}
+extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
+
+/* nfs4proc.c */
+extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
+extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
+			    const struct rpc_timeout *timeparms,
+			    const char *ip_addr);
+extern int nfs40_walk_client_list(struct nfs_client *clp,
+				struct nfs_client **result,
+				struct rpc_cred *cred);
+extern int nfs41_walk_client_list(struct nfs_client *clp,
+				struct nfs_client **result,
+				struct rpc_cred *cred);
+
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+	inode = igrab(inode);
+	if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+	if (inode != NULL) {
+		struct super_block *sb = inode->i_sb;
+
+		iput(inode);
+		nfs_sb_deactive(sb);
+	}
+}
+
+/*
+ * Determine the device name as a string
+ */
+static inline char *nfs_devname(struct dentry *dentry,
+				char *buffer, ssize_t buflen)
+{
+	char *dummy;
+	return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL);
+}
+
+/*
+ * Determine the actual block size (and log2 thereof)
+ */
+static inline
+unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
+{
+	/* make sure blocksize is a power of two */
+	if ((bsize & (bsize - 1)) || nrbitsp) {
+		unsigned char	nrbits;
+
+		for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
+			;
+		bsize = 1 << nrbits;
+		if (nrbitsp)
+			*nrbitsp = nrbits;
+	}
+
+	return bsize;
+}
+
+/*
+ * Calculate the number of 512byte blocks used.
+ */
+static inline blkcnt_t nfs_calc_block_size(u64 tsize)
+{
+	blkcnt_t used = (tsize + 511) >> 9;
+	return (used > ULONG_MAX) ? ULONG_MAX : used;
+}
+
+/*
+ * Compute and set NFS server blocksize
+ */
+static inline
+unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
+{
+	if (bsize < NFS_MIN_FILE_IO_SIZE)
+		bsize = NFS_DEF_FILE_IO_SIZE;
+	else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+		bsize = NFS_MAX_FILE_IO_SIZE;
+
+	return nfs_block_bits(bsize, nrbitsp);
+}
+
+/*
+ * Determine the maximum file size for a superblock
+ */
+static inline
+void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+	sb->s_maxbytes = (loff_t)maxfilesize;
+	if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
+/*
+ * Record the page as unstable and mark its inode as dirty.
+ */
+static inline
+void nfs_mark_page_unstable(struct page *page)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+
+	inc_zone_page_state(page, NR_UNSTABLE_NFS);
+	inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+	 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+}
+
+/*
+ * Determine the number of bytes of data the page contains
+ */
+static inline
+unsigned int nfs_page_length(struct page *page)
+{
+	loff_t i_size = i_size_read(page_file_mapping(page)->host);
+
+	if (i_size > 0) {
+		pgoff_t page_index = page_file_index(page);
+		pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+		if (page_index < end_index)
+			return PAGE_CACHE_SIZE;
+		if (page_index == end_index)
+			return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+	}
+	return 0;
+}
+
+/*
+ * Convert a umode to a dirent->d_type
+ */
+static inline
+unsigned char nfs_umode_to_dtype(umode_t mode)
+{
+	return (mode >> 12) & 15;
+}
+
+/*
+ * Determine the number of pages in an array of length 'len' and
+ * with a base offset of 'base'
+ */
+static inline
+unsigned int nfs_page_array_len(unsigned int base, size_t len)
+{
+	return ((unsigned long)len + (unsigned long)base +
+		PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
+/*
+ * Convert a struct timespec into a 64-bit change attribute
+ *
+ * This does approximately the same thing as timespec_to_ns(),
+ * but for calculation efficiency, we multiply the seconds by
+ * 1024*1024*1024.
+ */
+static inline
+u64 nfs_timespec_to_change_attr(const struct timespec *ts)
+{
+	return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
+}
+
+#ifdef CONFIG_CRC32
+/**
+ * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
+ * @fh - pointer to filehandle
+ *
+ * returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
+{
+	return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
+}
+#else
+static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
+{
+	return 0;
+}
+#endif
diff --git a/kernel/fs/nfs/iostat.h b/kernel/fs/nfs/iostat.h
new file mode 100644
index 000000000..0cb806fbd
--- /dev/null
+++ b/kernel/fs/nfs/iostat.h
@@ -0,0 +1,76 @@
+/*
+ *  linux/fs/nfs/iostat.h
+ *
+ *  Declarations for NFS client per-mount statistics
+ *
+ *  Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
+ *
+ */
+
+#ifndef _NFS_IOSTAT
+#define _NFS_IOSTAT
+
+#include <linux/percpu.h>
+#include <linux/cache.h>
+#include <linux/nfs_iostat.h>
+
+struct nfs_iostats {
+	unsigned long long	bytes[__NFSIOS_BYTESMAX];
+#ifdef CONFIG_NFS_FSCACHE
+	unsigned long long	fscache[__NFSIOS_FSCACHEMAX];
+#endif
+	unsigned long		events[__NFSIOS_COUNTSMAX];
+} ____cacheline_aligned;
+
+static inline void nfs_inc_server_stats(const struct nfs_server *server,
+					enum nfs_stat_eventcounters stat)
+{
+	this_cpu_inc(server->io_stats->events[stat]);
+}
+
+static inline void nfs_inc_stats(const struct inode *inode,
+				 enum nfs_stat_eventcounters stat)
+{
+	nfs_inc_server_stats(NFS_SERVER(inode), stat);
+}
+
+static inline void nfs_add_server_stats(const struct nfs_server *server,
+					enum nfs_stat_bytecounters stat,
+					long addend)
+{
+	this_cpu_add(server->io_stats->bytes[stat], addend);
+}
+
+static inline void nfs_add_stats(const struct inode *inode,
+				 enum nfs_stat_bytecounters stat,
+				 long addend)
+{
+	nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
+}
+
+#ifdef CONFIG_NFS_FSCACHE
+static inline void nfs_add_fscache_stats(struct inode *inode,
+					 enum nfs_stat_fscachecounters stat,
+					 long addend)
+{
+	this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
+}
+static inline void nfs_inc_fscache_stats(struct inode *inode,
+					 enum nfs_stat_fscachecounters stat)
+{
+	this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]);
+}
+#endif
+
+static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
+{
+	return alloc_percpu(struct nfs_iostats);
+}
+
+static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
+{
+	if (stats != NULL)
+		free_percpu(stats);
+}
+
+#endif /* _NFS_IOSTAT */
diff --git a/kernel/fs/nfs/mount_clnt.c b/kernel/fs/nfs/mount_clnt.c
new file mode 100644
index 000000000..99a45283b
--- /dev/null
+++ b/kernel/fs/nfs/mount_clnt.c
@@ -0,0 +1,535 @@
+/*
+ * In-kernel MOUNT protocol client
+ *
+ * Copyright (C) 1997, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+
+#ifdef NFS_DEBUG
+# define NFSDBG_FACILITY	NFSDBG_MOUNT
+#endif
+
+/*
+ * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
+ */
+#define MNTPATHLEN		(1024)
+
+/*
+ * XDR data type sizes
+ */
+#define encode_dirpath_sz	(1 + XDR_QUADLEN(MNTPATHLEN))
+#define MNT_status_sz		(1)
+#define MNT_fhs_status_sz	(1)
+#define MNT_fhandle_sz		XDR_QUADLEN(NFS2_FHSIZE)
+#define MNT_fhandle3_sz		(1 + XDR_QUADLEN(NFS3_FHSIZE))
+#define MNT_authflav3_sz	(1 + NFS_MAX_SECFLAVORS)
+
+/*
+ * XDR argument and result sizes
+ */
+#define MNT_enc_dirpath_sz	encode_dirpath_sz
+#define MNT_dec_mountres_sz	(MNT_status_sz + MNT_fhandle_sz)
+#define MNT_dec_mountres3_sz	(MNT_status_sz + MNT_fhandle_sz + \
+				 MNT_authflav3_sz)
+
+/*
+ * Defined by RFC 1094, section A.5
+ */
+enum {
+	MOUNTPROC_NULL		= 0,
+	MOUNTPROC_MNT		= 1,
+	MOUNTPROC_DUMP		= 2,
+	MOUNTPROC_UMNT		= 3,
+	MOUNTPROC_UMNTALL	= 4,
+	MOUNTPROC_EXPORT	= 5,
+};
+
+/*
+ * Defined by RFC 1813, section 5.2
+ */
+enum {
+	MOUNTPROC3_NULL		= 0,
+	MOUNTPROC3_MNT		= 1,
+	MOUNTPROC3_DUMP		= 2,
+	MOUNTPROC3_UMNT		= 3,
+	MOUNTPROC3_UMNTALL	= 4,
+	MOUNTPROC3_EXPORT	= 5,
+};
+
+static const struct rpc_program mnt_program;
+
+/*
+ * Defined by OpenGroup XNFS Version 3W, chapter 8
+ */
+enum mountstat {
+	MNT_OK			= 0,
+	MNT_EPERM		= 1,
+	MNT_ENOENT		= 2,
+	MNT_EACCES		= 13,
+	MNT_EINVAL		= 22,
+};
+
+static struct {
+	u32 status;
+	int errno;
+} mnt_errtbl[] = {
+	{ .status = MNT_OK,			.errno = 0,		},
+	{ .status = MNT_EPERM,			.errno = -EPERM,	},
+	{ .status = MNT_ENOENT,			.errno = -ENOENT,	},
+	{ .status = MNT_EACCES,			.errno = -EACCES,	},
+	{ .status = MNT_EINVAL,			.errno = -EINVAL,	},
+};
+
+/*
+ * Defined by RFC 1813, section 5.1.5
+ */
+enum mountstat3 {
+	MNT3_OK			= 0,		/* no error */
+	MNT3ERR_PERM		= 1,		/* Not owner */
+	MNT3ERR_NOENT		= 2,		/* No such file or directory */
+	MNT3ERR_IO		= 5,		/* I/O error */
+	MNT3ERR_ACCES		= 13,		/* Permission denied */
+	MNT3ERR_NOTDIR		= 20,		/* Not a directory */
+	MNT3ERR_INVAL		= 22,		/* Invalid argument */
+	MNT3ERR_NAMETOOLONG	= 63,		/* Filename too long */
+	MNT3ERR_NOTSUPP		= 10004,	/* Operation not supported */
+	MNT3ERR_SERVERFAULT	= 10006,	/* A failure on the server */
+};
+
+static struct {
+	u32 status;
+	int errno;
+} mnt3_errtbl[] = {
+	{ .status = MNT3_OK,			.errno = 0,		},
+	{ .status = MNT3ERR_PERM,		.errno = -EPERM,	},
+	{ .status = MNT3ERR_NOENT,		.errno = -ENOENT,	},
+	{ .status = MNT3ERR_IO,			.errno = -EIO,		},
+	{ .status = MNT3ERR_ACCES,		.errno = -EACCES,	},
+	{ .status = MNT3ERR_NOTDIR,		.errno = -ENOTDIR,	},
+	{ .status = MNT3ERR_INVAL,		.errno = -EINVAL,	},
+	{ .status = MNT3ERR_NAMETOOLONG,	.errno = -ENAMETOOLONG,	},
+	{ .status = MNT3ERR_NOTSUPP,		.errno = -ENOTSUPP,	},
+	{ .status = MNT3ERR_SERVERFAULT,	.errno = -EREMOTEIO,	},
+};
+
+struct mountres {
+	int errno;
+	struct nfs_fh *fh;
+	unsigned int *auth_count;
+	rpc_authflavor_t *auth_flavors;
+};
+
+struct mnt_fhstatus {
+	u32 status;
+	struct nfs_fh *fh;
+};
+
+/**
+ * nfs_mount - Obtain an NFS file handle for the given host and path
+ * @info: pointer to mount request arguments
+ *
+ * Uses default timeout parameters specified by underlying transport. On
+ * successful return, the auth_flavs list and auth_flav_len will be populated
+ * with the list from the server or a faked-up list if the server didn't
+ * provide one.
+ */
+int nfs_mount(struct nfs_mount_request *info)
+{
+	struct mountres	result = {
+		.fh		= info->fh,
+		.auth_count	= info->auth_flav_len,
+		.auth_flavors	= info->auth_flavs,
+	};
+	struct rpc_message msg	= {
+		.rpc_argp	= info->dirpath,
+		.rpc_resp	= &result,
+	};
+	struct rpc_create_args args = {
+		.net		= info->net,
+		.protocol	= info->protocol,
+		.address	= info->sap,
+		.addrsize	= info->salen,
+		.servername	= info->hostname,
+		.program	= &mnt_program,
+		.version	= info->version,
+		.authflavor	= RPC_AUTH_UNIX,
+	};
+	struct rpc_clnt		*mnt_clnt;
+	int			status;
+
+	dprintk("NFS: sending MNT request for %s:%s\n",
+		(info->hostname ? info->hostname : "server"),
+			info->dirpath);
+
+	if (strlen(info->dirpath) > MNTPATHLEN)
+		return -ENAMETOOLONG;
+
+	if (info->noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+	mnt_clnt = rpc_create(&args);
+	if (IS_ERR(mnt_clnt))
+		goto out_clnt_err;
+
+	if (info->version == NFS_MNT3_VERSION)
+		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
+	else
+		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
+
+	status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT);
+	rpc_shutdown_client(mnt_clnt);
+
+	if (status < 0)
+		goto out_call_err;
+	if (result.errno != 0)
+		goto out_mnt_err;
+
+	dprintk("NFS: MNT request succeeded\n");
+	status = 0;
+
+	/*
+	 * If the server didn't provide a flavor list, allow the
+	 * client to try any flavor.
+	 */
+	if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
+		dprintk("NFS: Faking up auth_flavs list\n");
+		info->auth_flavs[0] = RPC_AUTH_NULL;
+		*info->auth_flav_len = 1;
+	}
+out:
+	return status;
+
+out_clnt_err:
+	status = PTR_ERR(mnt_clnt);
+	dprintk("NFS: failed to create MNT RPC client, status=%d\n", status);
+	goto out;
+
+out_call_err:
+	dprintk("NFS: MNT request failed, status=%d\n", status);
+	goto out;
+
+out_mnt_err:
+	dprintk("NFS: MNT server returned result %d\n", result.errno);
+	status = result.errno;
+	goto out;
+}
+
+/**
+ * nfs_umount - Notify a server that we have unmounted this export
+ * @info: pointer to umount request arguments
+ *
+ * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
+ * use UDP.
+ */
+void nfs_umount(const struct nfs_mount_request *info)
+{
+	static const struct rpc_timeout nfs_umnt_timeout = {
+		.to_initval = 1 * HZ,
+		.to_maxval = 3 * HZ,
+		.to_retries = 2,
+	};
+	struct rpc_create_args args = {
+		.net		= info->net,
+		.protocol	= IPPROTO_UDP,
+		.address	= info->sap,
+		.addrsize	= info->salen,
+		.timeout	= &nfs_umnt_timeout,
+		.servername	= info->hostname,
+		.program	= &mnt_program,
+		.version	= info->version,
+		.authflavor	= RPC_AUTH_UNIX,
+		.flags		= RPC_CLNT_CREATE_NOPING,
+	};
+	struct rpc_message msg	= {
+		.rpc_argp	= info->dirpath,
+	};
+	struct rpc_clnt *clnt;
+	int status;
+
+	if (strlen(info->dirpath) > MNTPATHLEN)
+		return;
+
+	if (info->noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+	clnt = rpc_create(&args);
+	if (IS_ERR(clnt))
+		goto out_clnt_err;
+
+	dprintk("NFS: sending UMNT request for %s:%s\n",
+		(info->hostname ? info->hostname : "server"), info->dirpath);
+
+	if (info->version == NFS_MNT3_VERSION)
+		msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
+	else
+		msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
+
+	status = rpc_call_sync(clnt, &msg, 0);
+	rpc_shutdown_client(clnt);
+
+	if (unlikely(status < 0))
+		goto out_call_err;
+
+	return;
+
+out_clnt_err:
+	dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
+			PTR_ERR(clnt));
+	return;
+
+out_call_err:
+	dprintk("NFS: UMNT request failed, status=%d\n", status);
+}
+
+/*
+ * XDR encode/decode functions for MOUNT
+ */
+
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+{
+	const u32 pathname_len = strlen(pathname);
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 + pathname_len);
+	xdr_encode_opaque(p, pathname, pathname_len);
+}
+
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const char *dirpath)
+{
+	encode_mntdirpath(xdr, dirpath);
+}
+
+/*
+ * RFC 1094: "A non-zero status indicates some sort of error.  In this
+ * case, the status is a UNIX error number."  This can be problematic
+ * if the server and client use different errno values for the same
+ * error.
+ *
+ * However, the OpenGroup XNFS spec provides a simple mapping that is
+ * independent of local errno values on the server and the client.
+ */
+static int decode_status(struct xdr_stream *xdr, struct mountres *res)
+{
+	unsigned int i;
+	u32 status;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+	status = be32_to_cpup(p);
+
+	for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
+		if (mnt_errtbl[i].status == status) {
+			res->errno = mnt_errtbl[i].errno;
+			return 0;
+		}
+	}
+
+	dprintk("NFS: unrecognized MNT status code: %u\n", status);
+	res->errno = -EACCES;
+	return 0;
+}
+
+static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
+{
+	struct nfs_fh *fh = res->fh;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	fh->size = NFS2_FHSIZE;
+	memcpy(fh->data, p, NFS2_FHSIZE);
+	return 0;
+}
+
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct mountres *res)
+{
+	int status;
+
+	status = decode_status(xdr, res);
+	if (unlikely(status != 0 || res->errno != 0))
+		return status;
+	return decode_fhandle(xdr, res);
+}
+
+static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
+{
+	unsigned int i;
+	u32 status;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+	status = be32_to_cpup(p);
+
+	for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
+		if (mnt3_errtbl[i].status == status) {
+			res->errno = mnt3_errtbl[i].errno;
+			return 0;
+		}
+	}
+
+	dprintk("NFS: unrecognized MNT3 status code: %u\n", status);
+	res->errno = -EACCES;
+	return 0;
+}
+
+static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
+{
+	struct nfs_fh *fh = res->fh;
+	u32 size;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	size = be32_to_cpup(p);
+	if (size > NFS3_FHSIZE || size == 0)
+		return -EIO;
+
+	p = xdr_inline_decode(xdr, size);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	fh->size = size;
+	memcpy(fh->data, p, size);
+	return 0;
+}
+
+static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
+{
+	rpc_authflavor_t *flavors = res->auth_flavors;
+	unsigned int *count = res->auth_count;
+	u32 entries, i;
+	__be32 *p;
+
+	if (*count == 0)
+		return 0;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+	entries = be32_to_cpup(p);
+	dprintk("NFS: received %u auth flavors\n", entries);
+	if (entries > NFS_MAX_SECFLAVORS)
+		entries = NFS_MAX_SECFLAVORS;
+
+	p = xdr_inline_decode(xdr, 4 * entries);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	if (entries > *count)
+		entries = *count;
+
+	for (i = 0; i < entries; i++) {
+		flavors[i] = be32_to_cpup(p++);
+		dprintk("NFS:   auth flavor[%u]: %d\n", i, flavors[i]);
+	}
+	*count = i;
+
+	return 0;
+}
+
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 struct mountres *res)
+{
+	int status;
+
+	status = decode_fhs_status(xdr, res);
+	if (unlikely(status != 0 || res->errno != 0))
+		return status;
+	status = decode_fhandle3(xdr, res);
+	if (unlikely(status != 0)) {
+		res->errno = -EBADHANDLE;
+		return 0;
+	}
+	return decode_auth_flavors(xdr, res);
+}
+
+static struct rpc_procinfo mnt_procedures[] = {
+	[MOUNTPROC_MNT] = {
+		.p_proc		= MOUNTPROC_MNT,
+		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,
+		.p_decode	= (kxdrdproc_t)mnt_xdr_dec_mountres,
+		.p_arglen	= MNT_enc_dirpath_sz,
+		.p_replen	= MNT_dec_mountres_sz,
+		.p_statidx	= MOUNTPROC_MNT,
+		.p_name		= "MOUNT",
+	},
+	[MOUNTPROC_UMNT] = {
+		.p_proc		= MOUNTPROC_UMNT,
+		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,
+		.p_arglen	= MNT_enc_dirpath_sz,
+		.p_statidx	= MOUNTPROC_UMNT,
+		.p_name		= "UMOUNT",
+	},
+};
+
+static struct rpc_procinfo mnt3_procedures[] = {
+	[MOUNTPROC3_MNT] = {
+		.p_proc		= MOUNTPROC3_MNT,
+		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,
+		.p_decode	= (kxdrdproc_t)mnt_xdr_dec_mountres3,
+		.p_arglen	= MNT_enc_dirpath_sz,
+		.p_replen	= MNT_dec_mountres3_sz,
+		.p_statidx	= MOUNTPROC3_MNT,
+		.p_name		= "MOUNT",
+	},
+	[MOUNTPROC3_UMNT] = {
+		.p_proc		= MOUNTPROC3_UMNT,
+		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,
+		.p_arglen	= MNT_enc_dirpath_sz,
+		.p_statidx	= MOUNTPROC3_UMNT,
+		.p_name		= "UMOUNT",
+	},
+};
+
+
+static const struct rpc_version mnt_version1 = {
+	.number		= 1,
+	.nrprocs	= ARRAY_SIZE(mnt_procedures),
+	.procs		= mnt_procedures,
+};
+
+static const struct rpc_version mnt_version3 = {
+	.number		= 3,
+	.nrprocs	= ARRAY_SIZE(mnt3_procedures),
+	.procs		= mnt3_procedures,
+};
+
+static const struct rpc_version *mnt_version[] = {
+	NULL,
+	&mnt_version1,
+	NULL,
+	&mnt_version3,
+};
+
+static struct rpc_stat mnt_stats;
+
+static const struct rpc_program mnt_program = {
+	.name		= "mount",
+	.number		= NFS_MNT_PROGRAM,
+	.nrvers		= ARRAY_SIZE(mnt_version),
+	.version	= mnt_version,
+	.stats		= &mnt_stats,
+};
diff --git a/kernel/fs/nfs/namespace.c b/kernel/fs/nfs/namespace.c
new file mode 100644
index 000000000..c8162c660
--- /dev/null
+++ b/kernel/fs/nfs/namespace.c
@@ -0,0 +1,289 @@
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
+ *
+ * NFS namespace
+ */
+
+#include <linux/module.h>
+#include <linux/dcache.h>
+#include <linux/gfp.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include <linux/sunrpc/gss_api.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+static void nfs_expire_automounts(struct work_struct *work);
+
+static LIST_HEAD(nfs_automount_list);
+static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @base - used to return pointer to the end of devname part of path
+ * @dentry - pointer to dentry
+ * @buffer - result buffer
+ * @buflen - length of buffer
+ * @flags - options (see below)
+ *
+ * Helper function for constructing the server pathname
+ * by arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition
+ * and in generating /proc/mounts and friends.
+ *
+ * Supported flags:
+ * NFS_PATH_CANONICAL: ensure there is exactly one slash after
+ *		       the original device (export) name
+ *		       (if unset, the original name is returned verbatim)
+ */
+char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen,
+	       unsigned flags)
+{
+	char *end;
+	int namelen;
+	unsigned seq;
+	const char *base;
+
+rename_retry:
+	end = buffer+buflen;
+	*--end = '\0';
+	buflen--;
+
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
+	while (1) {
+		spin_lock(&dentry->d_lock);
+		if (IS_ROOT(dentry))
+			break;
+		namelen = dentry->d_name.len;
+		buflen -= namelen + 1;
+		if (buflen < 0)
+			goto Elong_unlock;
+		end -= namelen;
+		memcpy(end, dentry->d_name.name, namelen);
+		*--end = '/';
+		spin_unlock(&dentry->d_lock);
+		dentry = dentry->d_parent;
+	}
+	if (read_seqretry(&rename_lock, seq)) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
+		goto rename_retry;
+	}
+	if ((flags & NFS_PATH_CANONICAL) && *end != '/') {
+		if (--buflen < 0) {
+			spin_unlock(&dentry->d_lock);
+			rcu_read_unlock();
+			goto Elong;
+		}
+		*--end = '/';
+	}
+	*p = end;
+	base = dentry->d_fsdata;
+	if (!base) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
+		WARN_ON(1);
+		return end;
+	}
+	namelen = strlen(base);
+	if (flags & NFS_PATH_CANONICAL) {
+		/* Strip off excess slashes in base string */
+		while (namelen > 0 && base[namelen - 1] == '/')
+			namelen--;
+	}
+	buflen -= namelen;
+	if (buflen < 0) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
+		goto Elong;
+	}
+	end -= namelen;
+	memcpy(end, base, namelen);
+	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
+	return end;
+Elong_unlock:
+	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+EXPORT_SYMBOL_GPL(nfs_path);
+
+/*
+ * nfs_d_automount - Handle crossing a mountpoint on the server
+ * @path - The mountpoint
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ */
+struct vfsmount *nfs_d_automount(struct path *path)
+{
+	struct vfsmount *mnt;
+	struct nfs_server *server = NFS_SERVER(d_inode(path->dentry));
+	struct nfs_fh *fh = NULL;
+	struct nfs_fattr *fattr = NULL;
+
+	dprintk("--> nfs_d_automount()\n");
+
+	mnt = ERR_PTR(-ESTALE);
+	if (IS_ROOT(path->dentry))
+		goto out_nofree;
+
+	mnt = ERR_PTR(-ENOMEM);
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	if (fh == NULL || fattr == NULL)
+		goto out;
+
+	dprintk("%s: enter\n", __func__);
+
+	mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr);
+	if (IS_ERR(mnt))
+		goto out;
+
+	dprintk("%s: done, success\n", __func__);
+	mntget(mnt); /* prevent immediate expiration */
+	mnt_set_expiry(mnt, &nfs_automount_list);
+	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+
+out:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
+out_nofree:
+	if (IS_ERR(mnt))
+		dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt));
+	else
+		dprintk("<-- %s() = %p\n", __func__, mnt);
+	return mnt;
+}
+
+static int
+nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	if (NFS_FH(d_inode(dentry))->size != 0)
+		return nfs_getattr(mnt, dentry, stat);
+	generic_fillattr(d_inode(dentry), stat);
+	return 0;
+}
+
+static int
+nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	if (NFS_FH(d_inode(dentry))->size != 0)
+		return nfs_setattr(dentry, attr);
+	return -EACCES;
+}
+
+const struct inode_operations nfs_mountpoint_inode_operations = {
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+};
+
+const struct inode_operations nfs_referral_inode_operations = {
+	.getattr	= nfs_namespace_getattr,
+	.setattr	= nfs_namespace_setattr,
+};
+
+static void nfs_expire_automounts(struct work_struct *work)
+{
+	struct list_head *list = &nfs_automount_list;
+
+	mark_mounts_for_expiry(list);
+	if (!list_empty(list))
+		schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+}
+
+void nfs_release_automount_timer(void)
+{
+	if (list_empty(&nfs_automount_list))
+		cancel_delayed_work(&nfs_automount_task);
+}
+
+/*
+ * Clone a mountpoint of the appropriate type
+ */
+static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
+					   const char *devname,
+					   struct nfs_clone_mount *mountdata)
+{
+	return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @dentry - parent directory
+ * @fh - filehandle for new root dentry
+ * @fattr - attributes for new root inode
+ * @authflavor - security flavor to use when performing the mount
+ *
+ */
+struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
+				 struct nfs_fattr *fattr, rpc_authflavor_t authflavor)
+{
+	struct nfs_clone_mount mountdata = {
+		.sb = dentry->d_sb,
+		.dentry = dentry,
+		.fh = fh,
+		.fattr = fattr,
+		.authflavor = authflavor,
+	};
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *devname;
+
+	dprintk("--> nfs_do_submount()\n");
+
+	dprintk("%s: submounting on %pd2\n", __func__,
+			dentry);
+	if (page == NULL)
+		goto out;
+	devname = nfs_devname(dentry, page, PAGE_SIZE);
+	mnt = (struct vfsmount *)devname;
+	if (IS_ERR(devname))
+		goto free_page;
+	mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
+free_page:
+	free_page((unsigned long)page);
+out:
+	dprintk("%s: done\n", __func__);
+
+	dprintk("<-- nfs_do_submount() = %p\n", mnt);
+	return mnt;
+}
+EXPORT_SYMBOL_GPL(nfs_do_submount);
+
+struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
+			      struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	int err;
+	struct dentry *parent = dget_parent(dentry);
+
+	/* Look it up again to get its attributes */
+	err = server->nfs_client->rpc_ops->lookup(d_inode(parent), &dentry->d_name, fh, fattr, NULL);
+	dput(parent);
+	if (err != 0)
+		return ERR_PTR(err);
+
+	return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor);
+}
+EXPORT_SYMBOL_GPL(nfs_submount);
diff --git a/kernel/fs/nfs/netns.h b/kernel/fs/nfs/netns.h
new file mode 100644
index 000000000..f0e06e4ac
--- /dev/null
+++ b/kernel/fs/nfs/netns.h
@@ -0,0 +1,40 @@
+/*
+ * NFS-private data for each "struct net".  Accessed with net_generic().
+ */
+
+#ifndef __NFS_NETNS_H__
+#define __NFS_NETNS_H__
+
+#include <linux/nfs4.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+struct bl_dev_msg {
+	int32_t status;
+	uint32_t major, minor;
+};
+
+struct nfs_net {
+	struct cache_detail *nfs_dns_resolve;
+	struct rpc_pipe *bl_device_pipe;
+	struct bl_dev_msg bl_mount_reply;
+	wait_queue_head_t bl_wq;
+	struct mutex bl_mutex;
+	struct list_head nfs_client_list;
+	struct list_head nfs_volume_list;
+#if IS_ENABLED(CONFIG_NFS_V4)
+	struct idr cb_ident_idr; /* Protected by nfs_client_lock */
+	unsigned short nfs_callback_tcpport;
+	unsigned short nfs_callback_tcpport6;
+	int cb_users[NFS4_MAX_MINOR_VERSION + 1];
+#endif
+	spinlock_t nfs_client_lock;
+	struct timespec boot_time;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *proc_nfsfs;
+#endif
+};
+
+extern int nfs_net_id;
+
+#endif
diff --git a/kernel/fs/nfs/nfs.h b/kernel/fs/nfs/nfs.h
new file mode 100644
index 000000000..43679df56
--- /dev/null
+++ b/kernel/fs/nfs/nfs.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ *
+ * Function and structures exported by the NFS module
+ * for use by NFS version-specific modules.
+ */
+#ifndef __LINUX_INTERNAL_NFS_H
+#define __LINUX_INTERNAL_NFS_H
+
+#include <linux/fs.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs_xdr.h>
+
+struct nfs_subversion {
+	struct module *owner;	/* THIS_MODULE pointer */
+	struct file_system_type *nfs_fs;	/* NFS filesystem type */
+	const struct rpc_version *rpc_vers;	/* NFS version information */
+	const struct nfs_rpc_ops *rpc_ops;	/* NFS operations */
+	const struct super_operations *sops;	/* NFS Super operations */
+	const struct xattr_handler **xattr;	/* NFS xattr handlers */
+	struct list_head list;		/* List of NFS versions */
+};
+
+struct nfs_subversion *get_nfs_version(unsigned int);
+void put_nfs_version(struct nfs_subversion *);
+void register_nfs_version(struct nfs_subversion *);
+void unregister_nfs_version(struct nfs_subversion *);
+
+#endif /* __LINUX_INTERNAL_NFS_H */
diff --git a/kernel/fs/nfs/nfs2super.c b/kernel/fs/nfs/nfs2super.c
new file mode 100644
index 000000000..0a9782c91
--- /dev/null
+++ b/kernel/fs/nfs/nfs2super.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "nfs.h"
+
+static struct nfs_subversion nfs_v2 = {
+	.owner = THIS_MODULE,
+	.nfs_fs   = &nfs_fs_type,
+	.rpc_vers = &nfs_version2,
+	.rpc_ops  = &nfs_v2_clientops,
+	.sops     = &nfs_sops,
+};
+
+static int __init init_nfs_v2(void)
+{
+	register_nfs_version(&nfs_v2);
+	return 0;
+}
+
+static void __exit exit_nfs_v2(void)
+{
+	unregister_nfs_version(&nfs_v2);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v2);
+module_exit(exit_nfs_v2);
diff --git a/kernel/fs/nfs/nfs2xdr.c b/kernel/fs/nfs/nfs2xdr.c
new file mode 100644
index 000000000..b4e03ed85
--- /dev/null
+++ b/kernel/fs/nfs/nfs2xdr.c
@@ -0,0 +1,1151 @@
+/*
+ * linux/fs/nfs/nfs2xdr.c
+ *
+ * XDR functions to encode/decode NFS RPC arguments and results.
+ *
+ * Copyright (C) 1992, 1993, 1994  Rick Sladkey
+ * Copyright (C) 1996 Olaf Kirch
+ * 04 Aug 1998  Ion Badulescu <ionut@cs.columbia.edu>
+ * 		FIFO's need special handling in NFSv2
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO		EIO
+
+/*
+ * Declare the space requirements for NFS arguments and replies as
+ * number of 32bit-words
+ */
+#define NFS_fhandle_sz		(8)
+#define NFS_sattr_sz		(8)
+#define NFS_filename_sz		(1+(NFS2_MAXNAMLEN>>2))
+#define NFS_path_sz		(1+(NFS2_MAXPATHLEN>>2))
+#define NFS_fattr_sz		(17)
+#define NFS_info_sz		(5)
+#define NFS_entry_sz		(NFS_filename_sz+3)
+
+#define NFS_diropargs_sz	(NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_removeargs_sz	(NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_sattrargs_sz	(NFS_fhandle_sz+NFS_sattr_sz)
+#define NFS_readlinkargs_sz	(NFS_fhandle_sz)
+#define NFS_readargs_sz		(NFS_fhandle_sz+3)
+#define NFS_writeargs_sz	(NFS_fhandle_sz+4)
+#define NFS_createargs_sz	(NFS_diropargs_sz+NFS_sattr_sz)
+#define NFS_renameargs_sz	(NFS_diropargs_sz+NFS_diropargs_sz)
+#define NFS_linkargs_sz		(NFS_fhandle_sz+NFS_diropargs_sz)
+#define NFS_symlinkargs_sz	(NFS_diropargs_sz+1+NFS_sattr_sz)
+#define NFS_readdirargs_sz	(NFS_fhandle_sz+2)
+
+#define NFS_attrstat_sz		(1+NFS_fattr_sz)
+#define NFS_diropres_sz		(1+NFS_fhandle_sz+NFS_fattr_sz)
+#define NFS_readlinkres_sz	(2)
+#define NFS_readres_sz		(1+NFS_fattr_sz+1)
+#define NFS_writeres_sz         (NFS_attrstat_sz)
+#define NFS_stat_sz		(1)
+#define NFS_readdirres_sz	(1)
+#define NFS_statfsres_sz	(1+NFS_info_sz)
+
+static int nfs_stat_to_errno(enum nfs_stat);
+
+/*
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
+ */
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
+{
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
+	unsigned int replen;
+
+	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
+}
+
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+/*
+ *	typedef opaque	nfsdata<>;
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
+{
+	u32 recvd, count;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	recvd = xdr_read_pages(xdr, count);
+	if (unlikely(count > recvd))
+		goto out_cheating;
+out:
+	result->eof = 0;	/* NFSv2 does not pass EOF flag on the wire. */
+	result->count = count;
+	return count;
+out_cheating:
+	dprintk("NFS: server cheating in read result: "
+		"count %u > recvd %u\n", count, recvd);
+	count = recvd;
+	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	enum stat {
+ *		NFS_OK = 0,
+ *		NFSERR_PERM = 1,
+ *		NFSERR_NOENT = 2,
+ *		NFSERR_IO = 5,
+ *		NFSERR_NXIO = 6,
+ *		NFSERR_ACCES = 13,
+ *		NFSERR_EXIST = 17,
+ *		NFSERR_NODEV = 19,
+ *		NFSERR_NOTDIR = 20,
+ *		NFSERR_ISDIR = 21,
+ *		NFSERR_FBIG = 27,
+ *		NFSERR_NOSPC = 28,
+ *		NFSERR_ROFS = 30,
+ *		NFSERR_NAMETOOLONG = 63,
+ *		NFSERR_NOTEMPTY = 66,
+ *		NFSERR_DQUOT = 69,
+ *		NFSERR_STALE = 70,
+ *		NFSERR_WFLUSH = 99
+ *	};
+ */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*status = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.2.  ftype
+ *
+ *	enum ftype {
+ *		NFNON = 0,
+ *		NFREG = 1,
+ *		NFDIR = 2,
+ *		NFBLK = 3,
+ *		NFCHR = 4,
+ *		NFLNK = 5
+ *	};
+ *
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
+{
+	*type = be32_to_cpup(p++);
+	if (unlikely(*type > NF2FIFO))
+		*type = NFBAD;
+	return p;
+}
+
+/*
+ * 2.3.3.  fhandle
+ *
+ *	typedef opaque fhandle[FHSIZE];
+ */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS2_FHSIZE);
+	memcpy(p, fh->data, NFS2_FHSIZE);
+}
+
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	fh->size = NFS2_FHSIZE;
+	memcpy(fh->data, p, NFS2_FHSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.4.  timeval
+ *
+ *	struct timeval {
+ *		unsigned int seconds;
+ *		unsigned int useconds;
+ *	};
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	if (timep->tv_nsec != 0)
+		*p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
+	else
+		*p++ = cpu_to_be32(0);
+	return p;
+}
+
+/*
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time".  It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly.  See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+					      const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	*p++ = cpu_to_be32(1000000);
+	return p;
+}
+
+static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
+{
+	timep->tv_sec = be32_to_cpup(p++);
+	timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
+	return p;
+}
+
+/*
+ * 2.3.5.  fattr
+ *
+ *	struct fattr {
+ *		ftype		type;
+ *		unsigned int	mode;
+ *		unsigned int	nlink;
+ *		unsigned int	uid;
+ *		unsigned int	gid;
+ *		unsigned int	size;
+ *		unsigned int	blocksize;
+ *		unsigned int	rdev;
+ *		unsigned int	blocks;
+ *		unsigned int	fsid;
+ *		unsigned int	fileid;
+ *		timeval		atime;
+ *		timeval		mtime;
+ *		timeval		ctime;
+ *	};
+ *
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	u32 rdev, type;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
+	fattr->valid |= NFS_ATTR_FATTR_V2;
+
+	p = xdr_decode_ftype(p, &type);
+
+	fattr->mode = be32_to_cpup(p++);
+	fattr->nlink = be32_to_cpup(p++);
+	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
+	if (!uid_valid(fattr->uid))
+		goto out_uid;
+	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
+	if (!gid_valid(fattr->gid))
+		goto out_gid;
+		
+	fattr->size = be32_to_cpup(p++);
+	fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+
+	rdev = be32_to_cpup(p++);
+	fattr->rdev = new_decode_dev(rdev);
+	if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
+		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
+		fattr->rdev = 0;
+	}
+
+	fattr->du.nfs2.blocks = be32_to_cpup(p++);
+	fattr->fsid.major = be32_to_cpup(p++);
+	fattr->fsid.minor = 0;
+	fattr->fileid = be32_to_cpup(p++);
+
+	p = xdr_decode_time(p, &fattr->atime);
+	p = xdr_decode_time(p, &fattr->mtime);
+	xdr_decode_time(p, &fattr->ctime);
+	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+
+	return 0;
+out_uid:
+	dprintk("NFS: returned invalid uid\n");
+	return -EINVAL;
+out_gid:
+	dprintk("NFS: returned invalid gid\n");
+	return -EINVAL;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.6.  sattr
+ *
+ *	struct sattr {
+ *		unsigned int	mode;
+ *		unsigned int	uid;
+ *		unsigned int	gid;
+ *		unsigned int	size;
+ *		timeval		atime;
+ *		timeval		mtime;
+ *	};
+ */
+
+#define NFS2_SATTR_NOT_SET	(0xffffffff)
+
+static __be32 *xdr_time_not_set(__be32 *p)
+{
+	*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	return p;
+}
+
+static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
+
+	if (attr->ia_valid & ATTR_MODE)
+		*p++ = cpu_to_be32(attr->ia_mode);
+	else
+		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	if (attr->ia_valid & ATTR_UID)
+		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));
+	else
+		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	if (attr->ia_valid & ATTR_GID)
+		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));
+	else
+		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	if (attr->ia_valid & ATTR_SIZE)
+		*p++ = cpu_to_be32((u32)attr->ia_size);
+	else
+		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+
+	if (attr->ia_valid & ATTR_ATIME_SET)
+		p = xdr_encode_time(p, &attr->ia_atime);
+	else if (attr->ia_valid & ATTR_ATIME)
+		p = xdr_encode_current_server_time(p, &attr->ia_atime);
+	else
+		p = xdr_time_not_set(p);
+	if (attr->ia_valid & ATTR_MTIME_SET)
+		xdr_encode_time(p, &attr->ia_mtime);
+	else if (attr->ia_valid & ATTR_MTIME)
+		xdr_encode_current_server_time(p, &attr->ia_mtime);
+	else
+		xdr_time_not_set(p);
+}
+
+/*
+ * 2.3.7.  filename
+ *
+ *	typedef string filename<MAXNAMLEN>;
+ */
+static void encode_filename(struct xdr_stream *xdr,
+			    const char *name, u32 length)
+{
+	__be32 *p;
+
+	WARN_ON_ONCE(length > NFS2_MAXNAMLEN);
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, name, length);
+}
+
+static int decode_filename_inline(struct xdr_stream *xdr,
+				  const char **name, u32 *length)
+{
+	__be32 *p;
+	u32 count;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (count > NFS3_MAXNAMLEN)
+		goto out_nametoolong;
+	p = xdr_inline_decode(xdr, count);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*name = (const char *)p;
+	*length = count;
+	return 0;
+out_nametoolong:
+	dprintk("NFS: returned filename too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.8.  path
+ *
+ *	typedef string path<MAXPATHLEN>;
+ */
+static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
+
+static int decode_path(struct xdr_stream *xdr)
+{
+	u32 length, recvd;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p);
+	if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
+		goto out_size;
+	recvd = xdr_read_pages(xdr, length);
+	if (unlikely(length > recvd))
+		goto out_cheating;
+	xdr_terminate_string(xdr->buf, length);
+	return 0;
+out_size:
+	dprintk("NFS: returned pathname too long: %u\n", length);
+	return -ENAMETOOLONG;
+out_cheating:
+	dprintk("NFS: server cheating in pathname result: "
+		"length %u > received %u\n", length, recvd);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.9.  attrstat
+ *
+ *	union attrstat switch (stat status) {
+ *	case NFS_OK:
+ *		fattr attributes;
+ *	default:
+ *		void;
+ *	};
+ */
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
+			   __u32 *op_status)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (op_status)
+		*op_status = status;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_fattr(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
+
+/*
+ * 2.3.10.  diropargs
+ *
+ *	struct diropargs {
+ *		fhandle  dir;
+ *		filename name;
+ *	};
+ */
+static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
+			     const char *name, u32 length)
+{
+	encode_fhandle(xdr, fh);
+	encode_filename(xdr, name, length);
+}
+
+/*
+ * 2.3.11.  diropres
+ *
+ *	union diropres switch (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			fhandle file;
+ *			fattr   attributes;
+ *		} diropok;
+ *	default:
+ *		void;
+ *	};
+ */
+static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
+{
+	int error;
+
+	error = decode_fhandle(xdr, result->fh);
+	if (unlikely(error))
+		goto out;
+	error = decode_fattr(xdr, result->fattr);
+out:
+	return error;
+}
+
+static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_diropok(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
+
+
+/*
+ * NFSv2 XDR encode functions
+ *
+ * NFSv2 argument types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ */
+
+static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 const struct nfs_fh *fh)
+{
+	encode_fhandle(xdr, fh);
+}
+
+/*
+ * 2.2.3.  sattrargs
+ *
+ *	struct sattrargs {
+ *		fhandle file;
+ *		sattr attributes;
+ *	};
+ */
+static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_sattrargs *args)
+{
+	encode_fhandle(xdr, args->fh);
+	encode_sattr(xdr, args->sattr);
+}
+
+static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_diropargs *args)
+{
+	encode_diropargs(xdr, args->fh, args->name, args->len);
+}
+
+static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs_readlinkargs *args)
+{
+	encode_fhandle(xdr, args->fh);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+					args->pglen, NFS_readlinkres_sz);
+}
+
+/*
+ * 2.2.7.  readargs
+ *
+ *	struct readargs {
+ *		fhandle file;
+ *		unsigned offset;
+ *		unsigned count;
+ *		unsigned totalcount;
+ *	};
+ */
+static void encode_readargs(struct xdr_stream *xdr,
+			    const struct nfs_pgio_args *args)
+{
+	u32 offset = args->offset;
+	u32 count = args->count;
+	__be32 *p;
+
+	encode_fhandle(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(count);
+	*p = cpu_to_be32(count);
+}
+
+static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nfs_pgio_args *args)
+{
+	encode_readargs(xdr, args);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+					args->count, NFS_readres_sz);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
+}
+
+/*
+ * 2.2.9.  writeargs
+ *
+ *	struct writeargs {
+ *		fhandle file;
+ *		unsigned beginoffset;
+ *		unsigned offset;
+ *		unsigned totalcount;
+ *		nfsdata data;
+ *	};
+ */
+static void encode_writeargs(struct xdr_stream *xdr,
+			     const struct nfs_pgio_args *args)
+{
+	u32 offset = args->offset;
+	u32 count = args->count;
+	__be32 *p;
+
+	encode_fhandle(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(count);
+
+	/* nfsdata */
+	*p = cpu_to_be32(count);
+	xdr_write_pages(xdr, args->pages, args->pgbase, count);
+}
+
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_pgio_args *args)
+{
+	encode_writeargs(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 2.2.10.  createargs
+ *
+ *	struct createargs {
+ *		diropargs where;
+ *		sattr attributes;
+ *	};
+ */
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_createargs *args)
+{
+	encode_diropargs(xdr, args->fh, args->name, args->len);
+	encode_sattr(xdr, args->sattr);
+}
+
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_removeargs *args)
+{
+	encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
+}
+
+/*
+ * 2.2.12.  renameargs
+ *
+ *	struct renameargs {
+ *		diropargs from;
+ *		diropargs to;
+ *	};
+ */
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_renameargs *args)
+{
+	const struct qstr *old = args->old_name;
+	const struct qstr *new = args->new_name;
+
+	encode_diropargs(xdr, args->old_dir, old->name, old->len);
+	encode_diropargs(xdr, args->new_dir, new->name, new->len);
+}
+
+/*
+ * 2.2.13.  linkargs
+ *
+ *	struct linkargs {
+ *		fhandle from;
+ *		diropargs to;
+ *	};
+ */
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nfs_linkargs *args)
+{
+	encode_fhandle(xdr, args->fromfh);
+	encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
+}
+
+/*
+ * 2.2.14.  symlinkargs
+ *
+ *	struct symlinkargs {
+ *		diropargs from;
+ *		path to;
+ *		sattr attributes;
+ *	};
+ */
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_symlinkargs *args)
+{
+	encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
+	encode_path(xdr, args->pages, args->pathlen);
+	encode_sattr(xdr, args->sattr);
+}
+
+/*
+ * 2.2.17.  readdirargs
+ *
+ *	struct readdirargs {
+ *		fhandle dir;
+ *		nfscookie cookie;
+ *		unsigned count;
+ *	};
+ */
+static void encode_readdirargs(struct xdr_stream *xdr,
+			       const struct nfs_readdirargs *args)
+{
+	__be32 *p;
+
+	encode_fhandle(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 4 + 4);
+	*p++ = cpu_to_be32(args->cookie);
+	*p = cpu_to_be32(args->count);
+}
+
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_readdirargs *args)
+{
+	encode_readdirargs(xdr, args);
+	prepare_reply_buffer(req, args->pages, 0,
+					args->count, NFS_readdirres_sz);
+}
+
+/*
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ */
+
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     void *__unused)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
+
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_fattr *result)
+{
+	return decode_attrstat(xdr, result, NULL);
+}
+
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_diropok *result)
+{
+	return decode_diropres(xdr, result);
+}
+
+/*
+ * 2.2.6.  readlinkres
+ *
+ *	union readlinkres switch (stat status) {
+ *	case NFS_OK:
+ *		path data;
+ *	default:
+ *		void;
+ *	};
+ */
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
+				    struct xdr_stream *xdr, void *__unused)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_path(xdr);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
+
+/*
+ * 2.2.7.  readres
+ *
+ *	union readres switch (stat status) {
+ *	case NFS_OK:
+ *		fattr attributes;
+ *		nfsdata data;
+ *	default:
+ *		void;
+ *	};
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_pgio_res *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	result->op_status = status;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_fattr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_nfsdata(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
+
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_pgio_res *result)
+{
+	/* All NFSv2 writes are "file sync" writes */
+	result->verf->committed = NFS_FILE_SYNC;
+	return decode_attrstat(xdr, result->fattr, &result->op_status);
+}
+
+/**
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ *                      the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17.  entry
+ *
+ *	struct entry {
+ *		unsigned	fileid;
+ *		filename	name;
+ *		nfscookie	cookie;
+ *		entry		*nextentry;
+ *	};
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+		       int plus)
+{
+	__be32 *p;
+	int error;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p++ == xdr_zero) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(p == NULL))
+			goto out_overflow;
+		if (*p++ == xdr_zero)
+			return -EAGAIN;
+		entry->eof = 1;
+		return -EBADCOOKIE;
+	}
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	entry->ino = be32_to_cpup(p);
+
+	error = decode_filename_inline(xdr, &entry->name, &entry->len);
+	if (unlikely(error))
+		return error;
+
+	/*
+	 * The type (size and byte order) of nfscookie isn't defined in
+	 * RFC 1094.  This implementation assumes that it's an XDR uint32.
+	 */
+	entry->prev_cookie = entry->cookie;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	entry->cookie = be32_to_cpup(p);
+
+	entry->d_type = DT_UNKNOWN;
+
+	return 0;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EAGAIN;
+}
+
+/*
+ * 2.2.17.  readdirres
+ *
+ *	union readdirres switch (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			entry *entries;
+ *			bool eof;
+ *		} readdirok;
+ *	default:
+ *		void;
+ *	};
+ *
+ * Read the directory contents into the page cache, but don't
+ * touch them.  The actual decoding is done by nfs2_decode_dirent()
+ * during subsequent nfs_readdir() calls.
+ */
+static int decode_readdirok(struct xdr_stream *xdr)
+{
+	return xdr_read_pages(xdr, xdr->buf->page_len);
+}
+
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
+				   struct xdr_stream *xdr, void *__unused)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_readdirok(xdr);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
+
+/*
+ * 2.2.18.  statfsres
+ *
+ *	union statfsres (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			unsigned tsize;
+ *			unsigned bsize;
+ *			unsigned blocks;
+ *			unsigned bfree;
+ *			unsigned bavail;
+ *		} info;
+ *	default:
+ *		void;
+ *	};
+ */
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS_info_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	result->tsize  = be32_to_cpup(p++);
+	result->bsize  = be32_to_cpup(p++);
+	result->blocks = be32_to_cpup(p++);
+	result->bfree  = be32_to_cpup(p++);
+	result->bavail = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs2_fsstat *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_info(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
+
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static const struct {
+	int stat;
+	int errno;
+} nfs_errtbl[] = {
+	{ NFS_OK,		0		},
+	{ NFSERR_PERM,		-EPERM		},
+	{ NFSERR_NOENT,		-ENOENT		},
+	{ NFSERR_IO,		-errno_NFSERR_IO},
+	{ NFSERR_NXIO,		-ENXIO		},
+/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
+	{ NFSERR_ACCES,		-EACCES		},
+	{ NFSERR_EXIST,		-EEXIST		},
+	{ NFSERR_XDEV,		-EXDEV		},
+	{ NFSERR_NODEV,		-ENODEV		},
+	{ NFSERR_NOTDIR,	-ENOTDIR	},
+	{ NFSERR_ISDIR,		-EISDIR		},
+	{ NFSERR_INVAL,		-EINVAL		},
+	{ NFSERR_FBIG,		-EFBIG		},
+	{ NFSERR_NOSPC,		-ENOSPC		},
+	{ NFSERR_ROFS,		-EROFS		},
+	{ NFSERR_MLINK,		-EMLINK		},
+	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFSERR_DQUOT,		-EDQUOT		},
+	{ NFSERR_STALE,		-ESTALE		},
+	{ NFSERR_REMOTE,	-EREMOTE	},
+#ifdef EWFLUSH
+	{ NFSERR_WFLUSH,	-EWFLUSH	},
+#endif
+	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
+	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
+	{ NFSERR_BADTYPE,	-EBADTYPE	},
+	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
+	{ -1,			-EIO		}
+};
+
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  This function is used jointly by NFSv2 and NFSv3.
+ */
+static int nfs_stat_to_errno(enum nfs_stat status)
+{
+	int i;
+
+	for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+		if (nfs_errtbl[i].stat == (int)status)
+			return nfs_errtbl[i].errno;
+	}
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+	return nfs_errtbl[i].errno;
+}
+
+#define PROC(proc, argtype, restype, timer)				\
+[NFSPROC_##proc] = {							\
+	.p_proc	    =  NFSPROC_##proc,					\
+	.p_encode   =  (kxdreproc_t)nfs2_xdr_enc_##argtype,		\
+	.p_decode   =  (kxdrdproc_t)nfs2_xdr_dec_##restype,		\
+	.p_arglen   =  NFS_##argtype##_sz,				\
+	.p_replen   =  NFS_##restype##_sz,				\
+	.p_timer    =  timer,						\
+	.p_statidx  =  NFSPROC_##proc,					\
+	.p_name     =  #proc,						\
+	}
+struct rpc_procinfo	nfs_procedures[] = {
+	PROC(GETATTR,	fhandle,	attrstat,	1),
+	PROC(SETATTR,	sattrargs,	attrstat,	0),
+	PROC(LOOKUP,	diropargs,	diropres,	2),
+	PROC(READLINK,	readlinkargs,	readlinkres,	3),
+	PROC(READ,	readargs,	readres,	3),
+	PROC(WRITE,	writeargs,	writeres,	4),
+	PROC(CREATE,	createargs,	diropres,	0),
+	PROC(REMOVE,	removeargs,	stat,		0),
+	PROC(RENAME,	renameargs,	stat,		0),
+	PROC(LINK,	linkargs,	stat,		0),
+	PROC(SYMLINK,	symlinkargs,	stat,		0),
+	PROC(MKDIR,	createargs,	diropres,	0),
+	PROC(RMDIR,	diropargs,	stat,		0),
+	PROC(READDIR,	readdirargs,	readdirres,	3),
+	PROC(STATFS,	fhandle,	statfsres,	0),
+};
+
+const struct rpc_version nfs_version2 = {
+	.number			= 2,
+	.nrprocs		= ARRAY_SIZE(nfs_procedures),
+	.procs			= nfs_procedures
+};
diff --git a/kernel/fs/nfs/nfs3_fs.h b/kernel/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000..e134d6548
--- /dev/null
+++ b/kernel/fs/nfs/nfs3_fs.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/*
+ * nfs3acl.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern const struct xattr_handler *nfs3_xattr_handlers[];
+#else
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl)
+{
+	return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+				     struct nfs_fattr *, rpc_authflavor_t);
+
+/* nfs3super.c */
+extern struct nfs_subversion nfs_v3;
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/kernel/fs/nfs/nfs3acl.c b/kernel/fs/nfs/nfs3acl.c
new file mode 100644
index 000000000..1ebe2fc7c
--- /dev/null
+++ b/kernel/fs/nfs/nfs3acl.c
@@ -0,0 +1,296 @@
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/nfsacl.h>
+
+#include "internal.h"
+#include "nfs3_fs.h"
+
+#define NFSDBG_FACILITY	NFSDBG_PROC
+
+struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct page *pages[NFSACL_MAXPAGES] = { };
+	struct nfs3_getaclargs args = {
+		.fh = NFS_FH(inode),
+		/* The xdr layer may allocate pages here. */
+		.pages = pages,
+	};
+	struct nfs3_getaclres res = {
+		NULL,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+	};
+	int status, count;
+
+	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	status = nfs_revalidate_inode(server, inode);
+	if (status < 0)
+		return ERR_PTR(status);
+
+	/*
+	 * Only get the access acl when explicitly requested: We don't
+	 * need it for access decisions, and only some applications use
+	 * it. Applications which request the access acl first are not
+	 * penalized from this optimization.
+	 */
+	if (type == ACL_TYPE_ACCESS)
+		args.mask |= NFS_ACLCNT|NFS_ACL;
+	if (S_ISDIR(inode->i_mode))
+		args.mask |= NFS_DFACLCNT|NFS_DFACL;
+	if (args.mask == 0)
+		return NULL;
+
+	dprintk("NFS call getacl\n");
+	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	status = rpc_call_sync(server->client_acl, &msg, 0);
+	dprintk("NFS reply getacl: %d\n", status);
+
+	/* pages may have been allocated at the xdr layer. */
+	for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
+		__free_page(args.pages[count]);
+
+	switch (status) {
+		case 0:
+			status = nfs_refresh_inode(inode, res.fattr);
+			break;
+		case -EPFNOSUPPORT:
+		case -EPROTONOSUPPORT:
+			dprintk("NFS_V3_ACL extension not supported; disabling\n");
+			server->caps &= ~NFS_CAP_ACLS;
+		case -ENOTSUPP:
+			status = -EOPNOTSUPP;
+		default:
+			goto getout;
+	}
+	if ((args.mask & res.mask) != args.mask) {
+		status = -EIO;
+		goto getout;
+	}
+
+	if (res.acl_access != NULL) {
+		if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) ||
+		    res.acl_access->a_count == 0) {
+			posix_acl_release(res.acl_access);
+			res.acl_access = NULL;
+		}
+	}
+
+	if (res.mask & NFS_ACL)
+		set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access);
+	else
+		forget_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (res.mask & NFS_DFACL)
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default);
+	else
+		forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+
+	nfs_free_fattr(res.fattr);
+	if (type == ACL_TYPE_ACCESS) {
+		posix_acl_release(res.acl_default);
+		return res.acl_access;
+	} else {
+		posix_acl_release(res.acl_access);
+		return res.acl_default;
+	}
+
+getout:
+	posix_acl_release(res.acl_access);
+	posix_acl_release(res.acl_default);
+	nfs_free_fattr(res.fattr);
+	return ERR_PTR(status);
+}
+
+static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr *fattr;
+	struct page *pages[NFSACL_MAXPAGES];
+	struct nfs3_setaclargs args = {
+		.inode = inode,
+		.mask = NFS_ACL,
+		.acl_access = acl,
+		.pages = pages,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= &args,
+		.rpc_resp	= &fattr,
+	};
+	int status = 0;
+
+	if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
+		goto out;
+
+	status = -EOPNOTSUPP;
+	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+		goto out;
+
+	/* We are doing this here because XDR marshalling does not
+	 * return any results, it BUGs. */
+	status = -ENOSPC;
+	if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
+		goto out;
+	if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES)
+		goto out;
+	if (S_ISDIR(inode->i_mode)) {
+		args.mask |= NFS_DFACL;
+		args.acl_default = dfacl;
+		args.len = nfsacl_size(acl, dfacl);
+	} else
+		args.len = nfsacl_size(acl, NULL);
+
+	if (args.len > NFS_ACL_INLINE_BUFSIZE) {
+		unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
+
+		status = -ENOMEM;
+		do {
+			args.pages[args.npages] = alloc_page(GFP_KERNEL);
+			if (args.pages[args.npages] == NULL)
+				goto out_freepages;
+			args.npages++;
+		} while (args.npages < npages);
+	}
+
+	dprintk("NFS call setacl\n");
+	status = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto out_freepages;
+
+	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+	msg.rpc_resp = fattr;
+	status = rpc_call_sync(server->client_acl, &msg, 0);
+	nfs_access_zap_cache(inode);
+	nfs_zap_acl_cache(inode);
+	dprintk("NFS reply setacl: %d\n", status);
+
+	switch (status) {
+		case 0:
+			status = nfs_refresh_inode(inode, fattr);
+			set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+			set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);
+			break;
+		case -EPFNOSUPPORT:
+		case -EPROTONOSUPPORT:
+			dprintk("NFS_V3_ACL SETACL RPC not supported"
+					"(will not retry)\n");
+			server->caps &= ~NFS_CAP_ACLS;
+		case -ENOTSUPP:
+			status = -EOPNOTSUPP;
+	}
+	nfs_free_fattr(fattr);
+out_freepages:
+	while (args.npages != 0) {
+		args.npages--;
+		__free_page(args.pages[args.npages]);
+	}
+out:
+	return status;
+}
+
+int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl)
+{
+	int ret;
+	ret = __nfs3_proc_setacls(inode, acl, dfacl);
+	return (ret == -EOPNOTSUPP) ? 0 : ret;
+
+}
+
+int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct posix_acl *alloc = NULL, *dfacl = NULL;
+	int status;
+
+	if (S_ISDIR(inode->i_mode)) {
+		switch(type) {
+		case ACL_TYPE_ACCESS:
+			alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT);
+			if (IS_ERR(alloc))
+				goto fail;
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			dfacl = acl;
+			alloc = acl = get_acl(inode, ACL_TYPE_ACCESS);
+			if (IS_ERR(alloc))
+				goto fail;
+			break;
+		}
+	}
+
+	if (acl == NULL) {
+		alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		if (IS_ERR(alloc))
+			goto fail;
+	}
+	status = __nfs3_proc_setacls(inode, acl, dfacl);
+	posix_acl_release(alloc);
+	return status;
+
+fail:
+	return PTR_ERR(alloc);
+}
+
+const struct xattr_handler *nfs3_xattr_handlers[] = {
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+	NULL,
+};
+
+static int
+nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
+		size_t size, ssize_t *result)
+{
+	struct posix_acl *acl;
+	char *p = data + *result;
+
+	acl = get_acl(inode, type);
+	if (IS_ERR_OR_NULL(acl))
+		return 0;
+
+	posix_acl_release(acl);
+
+	*result += strlen(name);
+	*result += 1;
+	if (!size)
+		return 0;
+	if (*result > size)
+		return -ERANGE;
+
+	strcpy(p, name);
+	return 0;
+}
+
+ssize_t
+nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+	ssize_t result = 0;
+	int error;
+
+	error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
+			POSIX_ACL_XATTR_ACCESS, data, size, &result);
+	if (error)
+		return error;
+
+	error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
+			POSIX_ACL_XATTR_DEFAULT, data, size, &result);
+	if (error)
+		return error;
+	return result;
+}
diff --git a/kernel/fs/nfs/nfs3client.c b/kernel/fs/nfs/nfs3client.c
new file mode 100644
index 000000000..9e9fa347a
--- /dev/null
+++ b/kernel/fs/nfs/nfs3client.c
@@ -0,0 +1,107 @@
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
+#include "internal.h"
+#include "nfs3_fs.h"
+
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
+static const struct rpc_version *nfsacl_version[] = {
+	[3]			= &nfsacl_version3,
+};
+
+const struct rpc_program nfsacl_program = {
+	.name			= "nfsacl",
+	.number			= NFS_ACL_PROGRAM,
+	.nrvers			= ARRAY_SIZE(nfsacl_version),
+	.version		= nfsacl_version,
+	.stats			= &nfsacl_rpcstat,
+};
+
+/*
+ * Initialise an NFSv3 ACL client connection
+ */
+static void nfs_init_server_aclclient(struct nfs_server *server)
+{
+	if (server->flags & NFS_MOUNT_NOACL)
+		goto out_noacl;
+
+	server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
+	if (IS_ERR(server->client_acl))
+		goto out_noacl;
+
+	/* No errors! Assume that Sun nfsacls are supported */
+	server->caps |= NFS_CAP_ACLS;
+	return;
+
+out_noacl:
+	server->caps &= ~NFS_CAP_ACLS;
+}
+#else
+static inline void nfs_init_server_aclclient(struct nfs_server *server)
+{
+	server->flags &= ~NFS_MOUNT_NOACL;
+	server->caps &= ~NFS_CAP_ACLS;
+}
+#endif
+
+struct nfs_server *nfs3_create_server(struct nfs_mount_info *mount_info,
+				      struct nfs_subversion *nfs_mod)
+{
+	struct nfs_server *server = nfs_create_server(mount_info, nfs_mod);
+	/* Create a client RPC handle for the NFS v3 ACL management interface */
+	if (!IS_ERR(server))
+		nfs_init_server_aclclient(server);
+	return server;
+}
+
+struct nfs_server *nfs3_clone_server(struct nfs_server *source,
+				     struct nfs_fh *fh,
+				     struct nfs_fattr *fattr,
+				     rpc_authflavor_t flavor)
+{
+	struct nfs_server *server = nfs_clone_server(source, fh, fattr, flavor);
+	if (!IS_ERR(server) && !IS_ERR(source->client_acl))
+		nfs_init_server_aclclient(server);
+	return server;
+}
+
+/*
+ * Set up a pNFS Data Server client over NFSv3.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		rpc_authflavor_t au_flavor)
+{
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.nfs_mod = &nfs_v3,
+		.proto = ds_proto,
+		.net = mds_clp->cl_net,
+	};
+	struct rpc_timeout ds_timeout;
+	struct nfs_client *clp;
+	char buf[INET6_ADDRSTRLEN + 1];
+
+	/* fake a hostname because lockd wants it */
+	if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = buf;
+
+	/* Use the MDS nfs_client cl_ipaddr. */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+			     au_flavor);
+
+	return clp;
+}
+EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/kernel/fs/nfs/nfs3proc.c b/kernel/fs/nfs/nfs3proc.c
new file mode 100644
index 000000000..cb28cceef
--- /dev/null
+++ b/kernel/fs/nfs/nfs3proc.c
@@ -0,0 +1,974 @@
+/*
+ *  linux/fs/nfs/nfs3proc.c
+ *
+ *  Client-side NFSv3 procedures stubs.
+ *
+ *  Copyright (C) 1997, Olaf Kirch
+ */
+
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/lockd/bind.h>
+#include <linux/nfs_mount.h>
+#include <linux/freezer.h>
+#include <linux/xattr.h>
+
+#include "iostat.h"
+#include "internal.h"
+#include "nfs3_fs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PROC
+
+/* A wrapper to handle the EJUKEBOX error messages */
+static int
+nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
+{
+	int res;
+	do {
+		res = rpc_call_sync(clnt, msg, flags);
+		if (res != -EJUKEBOX)
+			break;
+		freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
+		res = -ERESTARTSYS;
+	} while (!fatal_signal_pending(current));
+	return res;
+}
+
+#define rpc_call_sync(clnt, msg, flags)	nfs3_rpc_wrapper(clnt, msg, flags)
+
+static int
+nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
+{
+	if (task->tk_status != -EJUKEBOX)
+		return 0;
+	if (task->tk_status == -EJUKEBOX)
+		nfs_inc_stats(inode, NFSIOS_DELAY);
+	task->tk_status = 0;
+	rpc_restart_call(task);
+	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+	return 1;
+}
+
+static int
+do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
+		 struct nfs_fsinfo *info)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSINFO],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info,
+	};
+	int	status;
+
+	dprintk("%s: call  fsinfo\n", __func__);
+	nfs_fattr_init(info->fattr);
+	status = rpc_call_sync(client, &msg, 0);
+	dprintk("%s: reply fsinfo: %d\n", __func__, status);
+	if (status == 0 && !(info->fattr->valid & NFS_ATTR_FATTR)) {
+		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+		msg.rpc_resp = info->fattr;
+		status = rpc_call_sync(client, &msg, 0);
+		dprintk("%s: reply getattr: %d\n", __func__, status);
+	}
+	return status;
+}
+
+/*
+ * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb
+ */
+static int
+nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		   struct nfs_fsinfo *info)
+{
+	int	status;
+
+	status = do_proc_get_root(server->client, fhandle, info);
+	if (status && server->nfs_client->cl_rpcclient != server->client)
+		status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info);
+	return status;
+}
+
+/*
+ * One function for each procedure in the NFS protocol.
+ */
+static int
+nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= fattr,
+	};
+	int	status;
+
+	dprintk("NFS call  getattr\n");
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply getattr: %d\n", status);
+	return status;
+}
+
+static int
+nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+			struct iattr *sattr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct nfs3_sattrargs	arg = {
+		.fh		= NFS_FH(inode),
+		.sattr		= sattr,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_SETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= fattr,
+	};
+	int	status;
+
+	dprintk("NFS call  setattr\n");
+	if (sattr->ia_valid & ATTR_FILE)
+		msg.rpc_cred = nfs_file_cred(sattr->ia_file);
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	if (status == 0)
+		nfs_setattr_update_inode(inode, sattr, fattr);
+	dprintk("NFS reply setattr: %d\n", status);
+	return status;
+}
+
+static int
+nfs3_proc_lookup(struct inode *dir, struct qstr *name,
+		 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		 struct nfs4_label *label)
+{
+	struct nfs3_diropargs	arg = {
+		.fh		= NFS_FH(dir),
+		.name		= name->name,
+		.len		= name->len
+	};
+	struct nfs3_diropres	res = {
+		.fh		= fhandle,
+		.fattr		= fattr
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_LOOKUP],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	int			status;
+
+	dprintk("NFS call  lookup %s\n", name->name);
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		return -ENOMEM;
+
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_refresh_inode(dir, res.dir_attr);
+	if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
+		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+		msg.rpc_argp = fhandle;
+		msg.rpc_resp = fattr;
+		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	}
+	nfs_free_fattr(res.dir_attr);
+	dprintk("NFS reply lookup: %d\n", status);
+	return status;
+}
+
+static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+{
+	struct nfs3_accessargs	arg = {
+		.fh		= NFS_FH(inode),
+	};
+	struct nfs3_accessres	res;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_ACCESS],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+		.rpc_cred	= entry->cred,
+	};
+	int mode = entry->mask;
+	int status = -ENOMEM;
+
+	dprintk("NFS call  access\n");
+
+	if (mode & MAY_READ)
+		arg.access |= NFS3_ACCESS_READ;
+	if (S_ISDIR(inode->i_mode)) {
+		if (mode & MAY_WRITE)
+			arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE;
+		if (mode & MAY_EXEC)
+			arg.access |= NFS3_ACCESS_LOOKUP;
+	} else {
+		if (mode & MAY_WRITE)
+			arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND;
+		if (mode & MAY_EXEC)
+			arg.access |= NFS3_ACCESS_EXECUTE;
+	}
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out;
+
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	nfs_refresh_inode(inode, res.fattr);
+	if (status == 0) {
+		entry->mask = 0;
+		if (res.access & NFS3_ACCESS_READ)
+			entry->mask |= MAY_READ;
+		if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE))
+			entry->mask |= MAY_WRITE;
+		if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
+			entry->mask |= MAY_EXEC;
+	}
+	nfs_free_fattr(res.fattr);
+out:
+	dprintk("NFS reply access: %d\n", status);
+	return status;
+}
+
+static int nfs3_proc_readlink(struct inode *inode, struct page *page,
+		unsigned int pgbase, unsigned int pglen)
+{
+	struct nfs_fattr	*fattr;
+	struct nfs3_readlinkargs args = {
+		.fh		= NFS_FH(inode),
+		.pgbase		= pgbase,
+		.pglen		= pglen,
+		.pages		= &page
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_READLINK],
+		.rpc_argp	= &args,
+	};
+	int status = -ENOMEM;
+
+	dprintk("NFS call  readlink\n");
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto out;
+	msg.rpc_resp = fattr;
+
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	nfs_refresh_inode(inode, fattr);
+	nfs_free_fattr(fattr);
+out:
+	dprintk("NFS reply readlink: %d\n", status);
+	return status;
+}
+
+struct nfs3_createdata {
+	struct rpc_message msg;
+	union {
+		struct nfs3_createargs create;
+		struct nfs3_mkdirargs mkdir;
+		struct nfs3_symlinkargs symlink;
+		struct nfs3_mknodargs mknod;
+	} arg;
+	struct nfs3_diropres res;
+	struct nfs_fh fh;
+	struct nfs_fattr fattr;
+	struct nfs_fattr dir_attr;
+};
+
+static struct nfs3_createdata *nfs3_alloc_createdata(void)
+{
+	struct nfs3_createdata *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data != NULL) {
+		data->msg.rpc_argp = &data->arg;
+		data->msg.rpc_resp = &data->res;
+		data->res.fh = &data->fh;
+		data->res.fattr = &data->fattr;
+		data->res.dir_attr = &data->dir_attr;
+		nfs_fattr_init(data->res.fattr);
+		nfs_fattr_init(data->res.dir_attr);
+	}
+	return data;
+}
+
+static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+{
+	int status;
+
+	status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+	nfs_post_op_update_inode(dir, data->res.dir_attr);
+	if (status == 0)
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+	return status;
+}
+
+static void nfs3_free_createdata(struct nfs3_createdata *data)
+{
+	kfree(data);
+}
+
+/*
+ * Create a regular file.
+ */
+static int
+nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+		 int flags)
+{
+	struct posix_acl *default_acl, *acl;
+	struct nfs3_createdata *data;
+	int status = -ENOMEM;
+
+	dprintk("NFS call  create %pd\n", dentry);
+
+	data = nfs3_alloc_createdata();
+	if (data == NULL)
+		goto out;
+
+	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
+	data->arg.create.fh = NFS_FH(dir);
+	data->arg.create.name = dentry->d_name.name;
+	data->arg.create.len = dentry->d_name.len;
+	data->arg.create.sattr = sattr;
+
+	data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
+	if (flags & O_EXCL) {
+		data->arg.create.createmode  = NFS3_CREATE_EXCLUSIVE;
+		data->arg.create.verifier[0] = cpu_to_be32(jiffies);
+		data->arg.create.verifier[1] = cpu_to_be32(current->pid);
+	}
+
+	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+	if (status)
+		goto out;
+
+	for (;;) {
+		status = nfs3_do_create(dir, dentry, data);
+
+		if (status != -ENOTSUPP)
+			break;
+		/* If the server doesn't support the exclusive creation
+		 * semantics, try again with simple 'guarded' mode. */
+		switch (data->arg.create.createmode) {
+			case NFS3_CREATE_EXCLUSIVE:
+				data->arg.create.createmode = NFS3_CREATE_GUARDED;
+				break;
+
+			case NFS3_CREATE_GUARDED:
+				data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
+				break;
+
+			case NFS3_CREATE_UNCHECKED:
+				goto out;
+		}
+		nfs_fattr_init(data->res.dir_attr);
+		nfs_fattr_init(data->res.fattr);
+	}
+
+	if (status != 0)
+		goto out_release_acls;
+
+	/* When we created the file with exclusive semantics, make
+	 * sure we set the attributes afterwards. */
+	if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
+		dprintk("NFS call  setattr (post-create)\n");
+
+		if (!(sattr->ia_valid & ATTR_ATIME_SET))
+			sattr->ia_valid |= ATTR_ATIME;
+		if (!(sattr->ia_valid & ATTR_MTIME_SET))
+			sattr->ia_valid |= ATTR_MTIME;
+
+		/* Note: we could use a guarded setattr here, but I'm
+		 * not sure this buys us anything (and I'd have
+		 * to revamp the NFSv3 XDR code) */
+		status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
+		nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
+		dprintk("NFS reply setattr (post-create): %d\n", status);
+		if (status != 0)
+			goto out_release_acls;
+	}
+
+	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+
+out_release_acls:
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
+out:
+	nfs3_free_createdata(data);
+	dprintk("NFS reply create: %d\n", status);
+	return status;
+}
+
+static int
+nfs3_proc_remove(struct inode *dir, struct qstr *name)
+{
+	struct nfs_removeargs arg = {
+		.fh = NFS_FH(dir),
+		.name = *name,
+	};
+	struct nfs_removeres res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE],
+		.rpc_argp = &arg,
+		.rpc_resp = &res,
+	};
+	int status = -ENOMEM;
+
+	dprintk("NFS call  remove %s\n", name->name);
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		goto out;
+
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_post_op_update_inode(dir, res.dir_attr);
+	nfs_free_fattr(res.dir_attr);
+out:
+	dprintk("NFS reply remove: %d\n", status);
+	return status;
+}
+
+static void
+nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
+}
+
+static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+	rpc_call_start(task);
+}
+
+static int
+nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+	struct nfs_removeres *res;
+	if (nfs3_async_handle_jukebox(task, dir))
+		return 0;
+	res = task->tk_msg.rpc_resp;
+	nfs_post_op_update_inode(dir, res->dir_attr);
+	return 1;
+}
+
+static void
+nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
+}
+
+static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	rpc_call_start(task);
+}
+
+static int
+nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		      struct inode *new_dir)
+{
+	struct nfs_renameres *res;
+
+	if (nfs3_async_handle_jukebox(task, old_dir))
+		return 0;
+	res = task->tk_msg.rpc_resp;
+
+	nfs_post_op_update_inode(old_dir, res->old_fattr);
+	nfs_post_op_update_inode(new_dir, res->new_fattr);
+	return 1;
+}
+
+static int
+nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+{
+	struct nfs3_linkargs	arg = {
+		.fromfh		= NFS_FH(inode),
+		.tofh		= NFS_FH(dir),
+		.toname		= name->name,
+		.tolen		= name->len
+	};
+	struct nfs3_linkres	res;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_LINK],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	int status = -ENOMEM;
+
+	dprintk("NFS call  link %s\n", name->name);
+	res.fattr = nfs_alloc_fattr();
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.fattr == NULL || res.dir_attr == NULL)
+		goto out;
+
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	nfs_post_op_update_inode(dir, res.dir_attr);
+	nfs_post_op_update_inode(inode, res.fattr);
+out:
+	nfs_free_fattr(res.dir_attr);
+	nfs_free_fattr(res.fattr);
+	dprintk("NFS reply link: %d\n", status);
+	return status;
+}
+
+static int
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+		  unsigned int len, struct iattr *sattr)
+{
+	struct nfs3_createdata *data;
+	int status = -ENOMEM;
+
+	if (len > NFS3_MAXPATHLEN)
+		return -ENAMETOOLONG;
+
+	dprintk("NFS call  symlink %pd\n", dentry);
+
+	data = nfs3_alloc_createdata();
+	if (data == NULL)
+		goto out;
+	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
+	data->arg.symlink.fromfh = NFS_FH(dir);
+	data->arg.symlink.fromname = dentry->d_name.name;
+	data->arg.symlink.fromlen = dentry->d_name.len;
+	data->arg.symlink.pages = &page;
+	data->arg.symlink.pathlen = len;
+	data->arg.symlink.sattr = sattr;
+
+	status = nfs3_do_create(dir, dentry, data);
+
+	nfs3_free_createdata(data);
+out:
+	dprintk("NFS reply symlink: %d\n", status);
+	return status;
+}
+
+static int
+nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
+{
+	struct posix_acl *default_acl, *acl;
+	struct nfs3_createdata *data;
+	int status = -ENOMEM;
+
+	dprintk("NFS call  mkdir %pd\n", dentry);
+
+	data = nfs3_alloc_createdata();
+	if (data == NULL)
+		goto out;
+
+	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+	if (status)
+		goto out;
+
+	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
+	data->arg.mkdir.fh = NFS_FH(dir);
+	data->arg.mkdir.name = dentry->d_name.name;
+	data->arg.mkdir.len = dentry->d_name.len;
+	data->arg.mkdir.sattr = sattr;
+
+	status = nfs3_do_create(dir, dentry, data);
+	if (status != 0)
+		goto out_release_acls;
+
+	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+
+out_release_acls:
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
+out:
+	nfs3_free_createdata(data);
+	dprintk("NFS reply mkdir: %d\n", status);
+	return status;
+}
+
+static int
+nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
+{
+	struct nfs_fattr	*dir_attr;
+	struct nfs3_diropargs	arg = {
+		.fh		= NFS_FH(dir),
+		.name		= name->name,
+		.len		= name->len
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_RMDIR],
+		.rpc_argp	= &arg,
+	};
+	int status = -ENOMEM;
+
+	dprintk("NFS call  rmdir %s\n", name->name);
+	dir_attr = nfs_alloc_fattr();
+	if (dir_attr == NULL)
+		goto out;
+
+	msg.rpc_resp = dir_attr;
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_post_op_update_inode(dir, dir_attr);
+	nfs_free_fattr(dir_attr);
+out:
+	dprintk("NFS reply rmdir: %d\n", status);
+	return status;
+}
+
+/*
+ * The READDIR implementation is somewhat hackish - we pass the user buffer
+ * to the encode function, which installs it in the receive iovec.
+ * The decode function itself doesn't perform any decoding, it just makes
+ * sure the reply is syntactically correct.
+ *
+ * Also note that this implementation handles both plain readdir and
+ * readdirplus.
+ */
+static int
+nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+		  u64 cookie, struct page **pages, unsigned int count, int plus)
+{
+	struct inode		*dir = d_inode(dentry);
+	__be32			*verf = NFS_I(dir)->cookieverf;
+	struct nfs3_readdirargs	arg = {
+		.fh		= NFS_FH(dir),
+		.cookie		= cookie,
+		.verf		= {verf[0], verf[1]},
+		.plus		= plus,
+		.count		= count,
+		.pages		= pages
+	};
+	struct nfs3_readdirres	res = {
+		.verf		= verf,
+		.plus		= plus
+	};
+	struct rpc_message	msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_READDIR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred
+	};
+	int status = -ENOMEM;
+
+	if (plus)
+		msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
+
+	dprintk("NFS call  readdir%s %d\n",
+			plus? "plus" : "", (unsigned int) cookie);
+
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		goto out;
+
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+
+	nfs_invalidate_atime(dir);
+	nfs_refresh_inode(dir, res.dir_attr);
+
+	nfs_free_fattr(res.dir_attr);
+out:
+	dprintk("NFS reply readdir%s: %d\n",
+			plus? "plus" : "", status);
+	return status;
+}
+
+static int
+nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+		dev_t rdev)
+{
+	struct posix_acl *default_acl, *acl;
+	struct nfs3_createdata *data;
+	int status = -ENOMEM;
+
+	dprintk("NFS call  mknod %pd %u:%u\n", dentry,
+			MAJOR(rdev), MINOR(rdev));
+
+	data = nfs3_alloc_createdata();
+	if (data == NULL)
+		goto out;
+
+	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+	if (status)
+		goto out;
+
+	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
+	data->arg.mknod.fh = NFS_FH(dir);
+	data->arg.mknod.name = dentry->d_name.name;
+	data->arg.mknod.len = dentry->d_name.len;
+	data->arg.mknod.sattr = sattr;
+	data->arg.mknod.rdev = rdev;
+
+	switch (sattr->ia_mode & S_IFMT) {
+	case S_IFBLK:
+		data->arg.mknod.type = NF3BLK;
+		break;
+	case S_IFCHR:
+		data->arg.mknod.type = NF3CHR;
+		break;
+	case S_IFIFO:
+		data->arg.mknod.type = NF3FIFO;
+		break;
+	case S_IFSOCK:
+		data->arg.mknod.type = NF3SOCK;
+		break;
+	default:
+		status = -EINVAL;
+		goto out;
+	}
+
+	status = nfs3_do_create(dir, dentry, data);
+	if (status != 0)
+		goto out_release_acls;
+
+	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+
+out_release_acls:
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
+out:
+	nfs3_free_createdata(data);
+	dprintk("NFS reply mknod: %d\n", status);
+	return status;
+}
+
+static int
+nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+		 struct nfs_fsstat *stat)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSSTAT],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= stat,
+	};
+	int	status;
+
+	dprintk("NFS call  fsstat\n");
+	nfs_fattr_init(stat->fattr);
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply fsstat: %d\n", status);
+	return status;
+}
+
+static int
+do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
+		 struct nfs_fsinfo *info)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSINFO],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info,
+	};
+	int	status;
+
+	dprintk("NFS call  fsinfo\n");
+	nfs_fattr_init(info->fattr);
+	status = rpc_call_sync(client, &msg, 0);
+	dprintk("NFS reply fsinfo: %d\n", status);
+	return status;
+}
+
+/*
+ * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
+ * nfs_create_server
+ */
+static int
+nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+		   struct nfs_fsinfo *info)
+{
+	int	status;
+
+	status = do_proc_fsinfo(server->client, fhandle, info);
+	if (status && server->nfs_client->cl_rpcclient != server->client)
+		status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info);
+	return status;
+}
+
+static int
+nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+		   struct nfs_pathconf *info)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_PATHCONF],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info,
+	};
+	int	status;
+
+	dprintk("NFS call  pathconf\n");
+	nfs_fattr_init(info->fattr);
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply pathconf: %d\n", status);
+	return status;
+}
+
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
+	if (nfs3_async_handle_jukebox(task, inode))
+		return -EAGAIN;
+
+	nfs_invalidate_atime(inode);
+	nfs_refresh_inode(inode, &hdr->fattr);
+	return 0;
+}
+
+static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
+				 struct rpc_message *msg)
+{
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
+}
+
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
+				      struct nfs_pgio_header *hdr)
+{
+	rpc_call_start(task);
+	return 0;
+}
+
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
+	if (nfs3_async_handle_jukebox(task, inode))
+		return -EAGAIN;
+	if (task->tk_status >= 0)
+		nfs_writeback_update_inode(hdr);
+	return 0;
+}
+
+static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
+				  struct rpc_message *msg)
+{
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
+}
+
+static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	rpc_call_start(task);
+}
+
+static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	if (data->commit_done_cb != NULL)
+		return data->commit_done_cb(task, data);
+
+	if (nfs3_async_handle_jukebox(task, data->inode))
+		return -EAGAIN;
+	nfs_refresh_inode(data->inode, data->res.fattr);
+	return 0;
+}
+
+static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
+{
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
+}
+
+static int
+nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(filp);
+
+	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
+}
+
+static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
+{
+	return 0;
+}
+
+static int nfs3_return_delegation(struct inode *inode)
+{
+	nfs_wb_all(inode);
+	return 0;
+}
+
+static const struct inode_operations nfs3_dir_inode_operations = {
+	.create		= nfs_create,
+	.lookup		= nfs_lookup,
+	.link		= nfs_link,
+	.unlink		= nfs_unlink,
+	.symlink	= nfs_symlink,
+	.mkdir		= nfs_mkdir,
+	.rmdir		= nfs_rmdir,
+	.mknod		= nfs_mknod,
+	.rename		= nfs_rename,
+	.permission	= nfs_permission,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+#ifdef CONFIG_NFS_V3_ACL
+	.listxattr	= nfs3_listxattr,
+	.getxattr	= generic_getxattr,
+	.setxattr	= generic_setxattr,
+	.removexattr	= generic_removexattr,
+	.get_acl	= nfs3_get_acl,
+	.set_acl	= nfs3_set_acl,
+#endif
+};
+
+static const struct inode_operations nfs3_file_inode_operations = {
+	.permission	= nfs_permission,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+#ifdef CONFIG_NFS_V3_ACL
+	.listxattr	= nfs3_listxattr,
+	.getxattr	= generic_getxattr,
+	.setxattr	= generic_setxattr,
+	.removexattr	= generic_removexattr,
+	.get_acl	= nfs3_get_acl,
+	.set_acl	= nfs3_set_acl,
+#endif
+};
+
+const struct nfs_rpc_ops nfs_v3_clientops = {
+	.version	= 3,			/* protocol version */
+	.dentry_ops	= &nfs_dentry_operations,
+	.dir_inode_ops	= &nfs3_dir_inode_operations,
+	.file_inode_ops	= &nfs3_file_inode_operations,
+	.file_ops	= &nfs_file_operations,
+	.getroot	= nfs3_proc_get_root,
+	.submount	= nfs_submount,
+	.try_mount	= nfs_try_mount,
+	.getattr	= nfs3_proc_getattr,
+	.setattr	= nfs3_proc_setattr,
+	.lookup		= nfs3_proc_lookup,
+	.access		= nfs3_proc_access,
+	.readlink	= nfs3_proc_readlink,
+	.create		= nfs3_proc_create,
+	.remove		= nfs3_proc_remove,
+	.unlink_setup	= nfs3_proc_unlink_setup,
+	.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
+	.unlink_done	= nfs3_proc_unlink_done,
+	.rename_setup	= nfs3_proc_rename_setup,
+	.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
+	.rename_done	= nfs3_proc_rename_done,
+	.link		= nfs3_proc_link,
+	.symlink	= nfs3_proc_symlink,
+	.mkdir		= nfs3_proc_mkdir,
+	.rmdir		= nfs3_proc_rmdir,
+	.readdir	= nfs3_proc_readdir,
+	.mknod		= nfs3_proc_mknod,
+	.statfs		= nfs3_proc_statfs,
+	.fsinfo		= nfs3_proc_fsinfo,
+	.pathconf	= nfs3_proc_pathconf,
+	.decode_dirent	= nfs3_decode_dirent,
+	.pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,
+	.read_setup	= nfs3_proc_read_setup,
+	.read_done	= nfs3_read_done,
+	.write_setup	= nfs3_proc_write_setup,
+	.write_done	= nfs3_write_done,
+	.commit_setup	= nfs3_proc_commit_setup,
+	.commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
+	.commit_done	= nfs3_commit_done,
+	.lock		= nfs3_proc_lock,
+	.clear_acl_cache = forget_all_cached_acls,
+	.close_context	= nfs_close_context,
+	.have_delegation = nfs3_have_delegation,
+	.return_delegation = nfs3_return_delegation,
+	.alloc_client	= nfs_alloc_client,
+	.init_client	= nfs_init_client,
+	.free_client	= nfs_free_client,
+	.create_server	= nfs3_create_server,
+	.clone_server	= nfs3_clone_server,
+};
diff --git a/kernel/fs/nfs/nfs3super.c b/kernel/fs/nfs/nfs3super.c
new file mode 100644
index 000000000..5c4394e46
--- /dev/null
+++ b/kernel/fs/nfs/nfs3super.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "nfs3_fs.h"
+#include "nfs.h"
+
+struct nfs_subversion nfs_v3 = {
+	.owner = THIS_MODULE,
+	.nfs_fs   = &nfs_fs_type,
+	.rpc_vers = &nfs_version3,
+	.rpc_ops  = &nfs_v3_clientops,
+	.sops     = &nfs_sops,
+#ifdef CONFIG_NFS_V3_ACL
+	.xattr    = nfs3_xattr_handlers,
+#endif
+};
+
+static int __init init_nfs_v3(void)
+{
+	register_nfs_version(&nfs_v3);
+	return 0;
+}
+
+static void __exit exit_nfs_v3(void)
+{
+	unregister_nfs_version(&nfs_v3);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v3);
+module_exit(exit_nfs_v3);
diff --git a/kernel/fs/nfs/nfs3xdr.c b/kernel/fs/nfs/nfs3xdr.c
new file mode 100644
index 000000000..53852a4bd
--- /dev/null
+++ b/kernel/fs/nfs/nfs3xdr.c
@@ -0,0 +1,2564 @@
+/*
+ * linux/fs/nfs/nfs3xdr.c
+ *
+ * XDR functions to encode/decode NFSv3 RPC arguments and results.
+ *
+ * Copyright (C) 1996, 1997 Olaf Kirch
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfsacl.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO		EIO
+
+/*
+ * Declare the space requirements for NFS arguments and replies as
+ * number of 32bit-words
+ */
+#define NFS3_fhandle_sz		(1+16)
+#define NFS3_fh_sz		(NFS3_fhandle_sz)	/* shorthand */
+#define NFS3_sattr_sz		(15)
+#define NFS3_filename_sz	(1+(NFS3_MAXNAMLEN>>2))
+#define NFS3_path_sz		(1+(NFS3_MAXPATHLEN>>2))
+#define NFS3_fattr_sz		(21)
+#define NFS3_cookieverf_sz	(NFS3_COOKIEVERFSIZE>>2)
+#define NFS3_wcc_attr_sz	(6)
+#define NFS3_pre_op_attr_sz	(1+NFS3_wcc_attr_sz)
+#define NFS3_post_op_attr_sz	(1+NFS3_fattr_sz)
+#define NFS3_wcc_data_sz	(NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
+#define NFS3_diropargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
+
+#define NFS3_getattrargs_sz	(NFS3_fh_sz)
+#define NFS3_setattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3)
+#define NFS3_lookupargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_accessargs_sz	(NFS3_fh_sz+1)
+#define NFS3_readlinkargs_sz	(NFS3_fh_sz)
+#define NFS3_readargs_sz	(NFS3_fh_sz+3)
+#define NFS3_writeargs_sz	(NFS3_fh_sz+5)
+#define NFS3_createargs_sz	(NFS3_diropargs_sz+NFS3_sattr_sz)
+#define NFS3_mkdirargs_sz	(NFS3_diropargs_sz+NFS3_sattr_sz)
+#define NFS3_symlinkargs_sz	(NFS3_diropargs_sz+1+NFS3_sattr_sz)
+#define NFS3_mknodargs_sz	(NFS3_diropargs_sz+2+NFS3_sattr_sz)
+#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_renameargs_sz	(NFS3_diropargs_sz+NFS3_diropargs_sz)
+#define NFS3_linkargs_sz		(NFS3_fh_sz+NFS3_diropargs_sz)
+#define NFS3_readdirargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+3)
+#define NFS3_readdirplusargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+4)
+#define NFS3_commitargs_sz	(NFS3_fh_sz+3)
+
+#define NFS3_getattrres_sz	(1+NFS3_fattr_sz)
+#define NFS3_setattrres_sz	(1+NFS3_wcc_data_sz)
+#define NFS3_removeres_sz	(NFS3_setattrres_sz)
+#define NFS3_lookupres_sz	(1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
+#define NFS3_accessres_sz	(1+NFS3_post_op_attr_sz+1)
+#define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1)
+#define NFS3_readres_sz		(1+NFS3_post_op_attr_sz+3)
+#define NFS3_writeres_sz	(1+NFS3_wcc_data_sz+4)
+#define NFS3_createres_sz	(1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
+#define NFS3_renameres_sz	(1+(2 * NFS3_wcc_data_sz))
+#define NFS3_linkres_sz		(1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
+#define NFS3_readdirres_sz	(1+NFS3_post_op_attr_sz+2)
+#define NFS3_fsstatres_sz	(1+NFS3_post_op_attr_sz+13)
+#define NFS3_fsinfores_sz	(1+NFS3_post_op_attr_sz+12)
+#define NFS3_pathconfres_sz	(1+NFS3_post_op_attr_sz+6)
+#define NFS3_commitres_sz	(1+NFS3_wcc_data_sz+2)
+
+#define ACL3_getaclargs_sz	(NFS3_fh_sz+1)
+#define ACL3_setaclargs_sz	(NFS3_fh_sz+1+ \
+				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+#define ACL3_getaclres_sz	(1+NFS3_post_op_attr_sz+1+ \
+				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+#define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz)
+
+static int nfs3_stat_to_errno(enum nfs_stat);
+
+/*
+ * Map file type to S_IFMT bits
+ */
+static const umode_t nfs_type2fmt[] = {
+	[NF3BAD] = 0,
+	[NF3REG] = S_IFREG,
+	[NF3DIR] = S_IFDIR,
+	[NF3BLK] = S_IFBLK,
+	[NF3CHR] = S_IFCHR,
+	[NF3LNK] = S_IFLNK,
+	[NF3SOCK] = S_IFSOCK,
+	[NF3FIFO] = S_IFIFO,
+};
+
+/*
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
+ */
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
+{
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
+	unsigned int replen;
+
+	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
+}
+
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+
+/*
+ * Encode/decode NFSv3 basic data types
+ *
+ * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+static void encode_uint32(struct xdr_stream *xdr, u32 value)
+{
+	__be32 *p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(value);
+}
+
+static int decode_uint32(struct xdr_stream *xdr, u32 *value)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*value = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_uint64(struct xdr_stream *xdr, u64 *value)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	xdr_decode_hyper(p, value);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * fileid3
+ *
+ *	typedef uint64 fileid3;
+ */
+static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
+{
+	return xdr_decode_hyper(p, fileid);
+}
+
+static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
+{
+	return decode_uint64(xdr, fileid);
+}
+
+/*
+ * filename3
+ *
+ *	typedef string filename3<>;
+ */
+static void encode_filename3(struct xdr_stream *xdr,
+			     const char *name, u32 length)
+{
+	__be32 *p;
+
+	WARN_ON_ONCE(length > NFS3_MAXNAMLEN);
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, name, length);
+}
+
+static int decode_inline_filename3(struct xdr_stream *xdr,
+				   const char **name, u32 *length)
+{
+	__be32 *p;
+	u32 count;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (count > NFS3_MAXNAMLEN)
+		goto out_nametoolong;
+	p = xdr_inline_decode(xdr, count);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*name = (const char *)p;
+	*length = count;
+	return 0;
+
+out_nametoolong:
+	dprintk("NFS: returned filename too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * nfspath3
+ *
+ *	typedef string nfspath3<>;
+ */
+static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
+			    const u32 length)
+{
+	encode_uint32(xdr, length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
+
+static int decode_nfspath3(struct xdr_stream *xdr)
+{
+	u32 recvd, count;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
+		goto out_nametoolong;
+	recvd = xdr_read_pages(xdr, count);
+	if (unlikely(count > recvd))
+		goto out_cheating;
+	xdr_terminate_string(xdr->buf, count);
+	return 0;
+
+out_nametoolong:
+	dprintk("NFS: returned pathname too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_cheating:
+	dprintk("NFS: server cheating in pathname result: "
+		"count %u > recvd %u\n", count, recvd);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * cookie3
+ *
+ *	typedef uint64 cookie3
+ */
+static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
+{
+	return xdr_encode_hyper(p, cookie);
+}
+
+static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
+{
+	return decode_uint64(xdr, cookie);
+}
+
+/*
+ * cookieverf3
+ *
+ *	typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
+ */
+static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
+{
+	memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
+	return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
+}
+
+static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * createverf3
+ *
+ *	typedef opaque createverf3[NFS3_CREATEVERFSIZE];
+ */
+static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
+	memcpy(p, verifier, NFS3_CREATEVERFSIZE);
+}
+
+static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *verifier)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(verifier->data, p, NFS3_WRITEVERFSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * size3
+ *
+ *	typedef uint64 size3;
+ */
+static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
+{
+	return xdr_decode_hyper(p, size);
+}
+
+/*
+ * nfsstat3
+ *
+ *	enum nfsstat3 {
+ *		NFS3_OK = 0,
+ *		...
+ *	}
+ */
+#define NFS3_OK		NFS_OK
+
+static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*status = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * ftype3
+ *
+ *	enum ftype3 {
+ *		NF3REG	= 1,
+ *		NF3DIR	= 2,
+ *		NF3BLK	= 3,
+ *		NF3CHR	= 4,
+ *		NF3LNK	= 5,
+ *		NF3SOCK	= 6,
+ *		NF3FIFO	= 7
+ *	};
+ */
+static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
+{
+	encode_uint32(xdr, type);
+}
+
+static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
+{
+	u32 type;
+
+	type = be32_to_cpup(p++);
+	if (type > NF3FIFO)
+		type = NF3NON;
+	*mode = nfs_type2fmt[type];
+	return p;
+}
+
+/*
+ * specdata3
+ *
+ *     struct specdata3 {
+ *             uint32  specdata1;
+ *             uint32  specdata2;
+ *     };
+ */
+static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(MAJOR(rdev));
+	*p = cpu_to_be32(MINOR(rdev));
+}
+
+static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
+{
+	unsigned int major, minor;
+
+	major = be32_to_cpup(p++);
+	minor = be32_to_cpup(p++);
+	*rdev = MKDEV(major, minor);
+	if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
+		*rdev = 0;
+	return p;
+}
+
+/*
+ * nfs_fh3
+ *
+ *	struct nfs_fh3 {
+ *		opaque       data<NFS3_FHSIZE>;
+ *	};
+ */
+static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	WARN_ON_ONCE(fh->size > NFS3_FHSIZE);
+	p = xdr_reserve_space(xdr, 4 + fh->size);
+	xdr_encode_opaque(p, fh->data, fh->size);
+}
+
+static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p++);
+	if (unlikely(length > NFS3_FHSIZE))
+		goto out_toobig;
+	p = xdr_inline_decode(xdr, length);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	fh->size = length;
+	memcpy(fh->data, p, length);
+	return 0;
+out_toobig:
+	dprintk("NFS: file handle size (%u) too big\n", length);
+	return -E2BIG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static void zero_nfs_fh3(struct nfs_fh *fh)
+{
+	memset(fh, 0, sizeof(*fh));
+}
+
+/*
+ * nfstime3
+ *
+ *	struct nfstime3 {
+ *		uint32	seconds;
+ *		uint32	nseconds;
+ *	};
+ */
+static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	*p++ = cpu_to_be32(timep->tv_nsec);
+	return p;
+}
+
+static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
+{
+	timep->tv_sec = be32_to_cpup(p++);
+	timep->tv_nsec = be32_to_cpup(p++);
+	return p;
+}
+
+/*
+ * sattr3
+ *
+ *	enum time_how {
+ *		DONT_CHANGE		= 0,
+ *		SET_TO_SERVER_TIME	= 1,
+ *		SET_TO_CLIENT_TIME	= 2
+ *	};
+ *
+ *	union set_mode3 switch (bool set_it) {
+ *	case TRUE:
+ *		mode3	mode;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_uid3 switch (bool set_it) {
+ *	case TRUE:
+ *		uid3	uid;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_gid3 switch (bool set_it) {
+ *	case TRUE:
+ *		gid3	gid;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_size3 switch (bool set_it) {
+ *	case TRUE:
+ *		size3	size;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_atime switch (time_how set_it) {
+ *	case SET_TO_CLIENT_TIME:
+ *		nfstime3	atime;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_mtime switch (time_how set_it) {
+ *	case SET_TO_CLIENT_TIME:
+ *		nfstime3  mtime;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct sattr3 {
+ *		set_mode3	mode;
+ *		set_uid3	uid;
+ *		set_gid3	gid;
+ *		set_size3	size;
+ *		set_atime	atime;
+ *		set_mtime	mtime;
+ *	};
+ */
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
+{
+	u32 nbytes;
+	__be32 *p;
+
+	/*
+	 * In order to make only a single xdr_reserve_space() call,
+	 * pre-compute the total number of bytes to be reserved.
+	 * Six boolean values, one for each set_foo field, are always
+	 * present in the encoded result, so start there.
+	 */
+	nbytes = 6 * 4;
+	if (attr->ia_valid & ATTR_MODE)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_UID)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_GID)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_SIZE)
+		nbytes += 8;
+	if (attr->ia_valid & ATTR_ATIME_SET)
+		nbytes += 8;
+	if (attr->ia_valid & ATTR_MTIME_SET)
+		nbytes += 8;
+	p = xdr_reserve_space(xdr, nbytes);
+
+	if (attr->ia_valid & ATTR_MODE) {
+		*p++ = xdr_one;
+		*p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
+	} else
+		*p++ = xdr_zero;
+
+	if (attr->ia_valid & ATTR_UID) {
+		*p++ = xdr_one;
+		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));
+	} else
+		*p++ = xdr_zero;
+
+	if (attr->ia_valid & ATTR_GID) {
+		*p++ = xdr_one;
+		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));
+	} else
+		*p++ = xdr_zero;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		*p++ = xdr_one;
+		p = xdr_encode_hyper(p, (u64)attr->ia_size);
+	} else
+		*p++ = xdr_zero;
+
+	if (attr->ia_valid & ATTR_ATIME_SET) {
+		*p++ = xdr_two;
+		p = xdr_encode_nfstime3(p, &attr->ia_atime);
+	} else if (attr->ia_valid & ATTR_ATIME) {
+		*p++ = xdr_one;
+	} else
+		*p++ = xdr_zero;
+
+	if (attr->ia_valid & ATTR_MTIME_SET) {
+		*p++ = xdr_two;
+		xdr_encode_nfstime3(p, &attr->ia_mtime);
+	} else if (attr->ia_valid & ATTR_MTIME) {
+		*p = xdr_one;
+	} else
+		*p = xdr_zero;
+}
+
+/*
+ * fattr3
+ *
+ *	struct fattr3 {
+ *		ftype3		type;
+ *		mode3		mode;
+ *		uint32		nlink;
+ *		uid3		uid;
+ *		gid3		gid;
+ *		size3		size;
+ *		size3		used;
+ *		specdata3	rdev;
+ *		uint64		fsid;
+ *		fileid3		fileid;
+ *		nfstime3	atime;
+ *		nfstime3	mtime;
+ *		nfstime3	ctime;
+ *	};
+ */
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	umode_t fmode;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
+	p = xdr_decode_ftype3(p, &fmode);
+
+	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
+	fattr->nlink = be32_to_cpup(p++);
+	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
+	if (!uid_valid(fattr->uid))
+		goto out_uid;
+	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
+	if (!gid_valid(fattr->gid))
+		goto out_gid;
+
+	p = xdr_decode_size3(p, &fattr->size);
+	p = xdr_decode_size3(p, &fattr->du.nfs3.used);
+	p = xdr_decode_specdata3(p, &fattr->rdev);
+
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
+
+	p = xdr_decode_fileid3(p, &fattr->fileid);
+	p = xdr_decode_nfstime3(p, &fattr->atime);
+	p = xdr_decode_nfstime3(p, &fattr->mtime);
+	xdr_decode_nfstime3(p, &fattr->ctime);
+	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+
+	fattr->valid |= NFS_ATTR_FATTR_V3;
+	return 0;
+out_uid:
+	dprintk("NFS: returned invalid uid\n");
+	return -EINVAL;
+out_gid:
+	dprintk("NFS: returned invalid gid\n");
+	return -EINVAL;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * post_op_attr
+ *
+ *	union post_op_attr switch (bool attributes_follow) {
+ *	case TRUE:
+ *		fattr3	attributes;
+ *	case FALSE:
+ *		void;
+ *	};
+ */
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_fattr3(xdr, fattr);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * wcc_attr
+ *	struct wcc_attr {
+ *		size3		size;
+ *		nfstime3	mtime;
+ *		nfstime3	ctime;
+ *	};
+ */
+static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
+	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+		| NFS_ATTR_FATTR_PRECHANGE
+		| NFS_ATTR_FATTR_PREMTIME
+		| NFS_ATTR_FATTR_PRECTIME;
+
+	p = xdr_decode_size3(p, &fattr->pre_size);
+	p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
+	xdr_decode_nfstime3(p, &fattr->pre_ctime);
+	fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * pre_op_attr
+ *	union pre_op_attr switch (bool attributes_follow) {
+ *	case TRUE:
+ *		wcc_attr	attributes;
+ *	case FALSE:
+ *		void;
+ *	};
+ *
+ * wcc_data
+ *
+ *	struct wcc_data {
+ *		pre_op_attr	before;
+ *		post_op_attr	after;
+ *	};
+ */
+static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_wcc_attr(xdr, fattr);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	int error;
+
+	error = decode_pre_op_attr(xdr, fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, fattr);
+out:
+	return error;
+}
+
+/*
+ * post_op_fh3
+ *
+ *	union post_op_fh3 switch (bool handle_follows) {
+ *	case TRUE:
+ *		nfs_fh3  handle;
+ *	case FALSE:
+ *		void;
+ *	};
+ */
+static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_nfs_fh3(xdr, fh);
+	zero_nfs_fh3(fh);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * diropargs3
+ *
+ *	struct diropargs3 {
+ *		nfs_fh3		dir;
+ *		filename3	name;
+ *	};
+ */
+static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
+			      const char *name, u32 length)
+{
+	encode_nfs_fh3(xdr, fh);
+	encode_filename3(xdr, name, length);
+}
+
+
+/*
+ * NFSv3 XDR encode functions
+ *
+ * NFSv3 argument types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ */
+
+/*
+ * 3.3.1  GETATTR3args
+ *
+ *	struct GETATTR3args {
+ *		nfs_fh3  object;
+ *	};
+ */
+static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs_fh *fh)
+{
+	encode_nfs_fh3(xdr, fh);
+}
+
+/*
+ * 3.3.2  SETATTR3args
+ *
+ *	union sattrguard3 switch (bool check) {
+ *	case TRUE:
+ *		nfstime3  obj_ctime;
+ *	case FALSE:
+ *		void;
+ *	};
+ *
+ *	struct SETATTR3args {
+ *		nfs_fh3		object;
+ *		sattr3		new_attributes;
+ *		sattrguard3	guard;
+ *	};
+ */
+static void encode_sattrguard3(struct xdr_stream *xdr,
+			       const struct nfs3_sattrargs *args)
+{
+	__be32 *p;
+
+	if (args->guard) {
+		p = xdr_reserve_space(xdr, 4 + 8);
+		*p++ = xdr_one;
+		xdr_encode_nfstime3(p, &args->guardtime);
+	} else {
+		p = xdr_reserve_space(xdr, 4);
+		*p = xdr_zero;
+	}
+}
+
+static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_sattrargs *args)
+{
+	encode_nfs_fh3(xdr, args->fh);
+	encode_sattr3(xdr, args->sattr);
+	encode_sattrguard3(xdr, args);
+}
+
+/*
+ * 3.3.3  LOOKUP3args
+ *
+ *	struct LOOKUP3args {
+ *		diropargs3  what;
+ *	};
+ */
+static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_diropargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+}
+
+/*
+ * 3.3.4  ACCESS3args
+ *
+ *	struct ACCESS3args {
+ *		nfs_fh3		object;
+ *		uint32		access;
+ *	};
+ */
+static void encode_access3args(struct xdr_stream *xdr,
+			       const struct nfs3_accessargs *args)
+{
+	encode_nfs_fh3(xdr, args->fh);
+	encode_uint32(xdr, args->access);
+}
+
+static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_accessargs *args)
+{
+	encode_access3args(xdr, args);
+}
+
+/*
+ * 3.3.5  READLINK3args
+ *
+ *	struct READLINK3args {
+ *		nfs_fh3	symlink;
+ *	};
+ */
+static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       const struct nfs3_readlinkargs *args)
+{
+	encode_nfs_fh3(xdr, args->fh);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+					args->pglen, NFS3_readlinkres_sz);
+}
+
+/*
+ * 3.3.6  READ3args
+ *
+ *	struct READ3args {
+ *		nfs_fh3		file;
+ *		offset3		offset;
+ *		count3		count;
+ *	};
+ */
+static void encode_read3args(struct xdr_stream *xdr,
+			     const struct nfs_pgio_args *args)
+{
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 8 + 4);
+	p = xdr_encode_hyper(p, args->offset);
+	*p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_pgio_args *args)
+{
+	encode_read3args(xdr, args);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+					args->count, NFS3_readres_sz);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
+}
+
+/*
+ * 3.3.7  WRITE3args
+ *
+ *	enum stable_how {
+ *		UNSTABLE  = 0,
+ *		DATA_SYNC = 1,
+ *		FILE_SYNC = 2
+ *	};
+ *
+ *	struct WRITE3args {
+ *		nfs_fh3		file;
+ *		offset3		offset;
+ *		count3		count;
+ *		stable_how	stable;
+ *		opaque		data<>;
+ *	};
+ */
+static void encode_write3args(struct xdr_stream *xdr,
+			      const struct nfs_pgio_args *args)
+{
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
+	p = xdr_encode_hyper(p, args->offset);
+	*p++ = cpu_to_be32(args->count);
+	*p++ = cpu_to_be32(args->stable);
+	*p = cpu_to_be32(args->count);
+	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
+
+static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_pgio_args *args)
+{
+	encode_write3args(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 3.3.8  CREATE3args
+ *
+ *	enum createmode3 {
+ *		UNCHECKED = 0,
+ *		GUARDED   = 1,
+ *		EXCLUSIVE = 2
+ *	};
+ *
+ *	union createhow3 switch (createmode3 mode) {
+ *	case UNCHECKED:
+ *	case GUARDED:
+ *		sattr3       obj_attributes;
+ *	case EXCLUSIVE:
+ *		createverf3  verf;
+ *	};
+ *
+ *	struct CREATE3args {
+ *		diropargs3	where;
+ *		createhow3	how;
+ *	};
+ */
+static void encode_createhow3(struct xdr_stream *xdr,
+			      const struct nfs3_createargs *args)
+{
+	encode_uint32(xdr, args->createmode);
+	switch (args->createmode) {
+	case NFS3_CREATE_UNCHECKED:
+	case NFS3_CREATE_GUARDED:
+		encode_sattr3(xdr, args->sattr);
+		break;
+	case NFS3_CREATE_EXCLUSIVE:
+		encode_createverf3(xdr, args->verifier);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_createargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_createhow3(xdr, args);
+}
+
+/*
+ * 3.3.9  MKDIR3args
+ *
+ *	struct MKDIR3args {
+ *		diropargs3	where;
+ *		sattr3		attributes;
+ *	};
+ */
+static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs3_mkdirargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_sattr3(xdr, args->sattr);
+}
+
+/*
+ * 3.3.10  SYMLINK3args
+ *
+ *	struct symlinkdata3 {
+ *		sattr3		symlink_attributes;
+ *		nfspath3	symlink_data;
+ *	};
+ *
+ *	struct SYMLINK3args {
+ *		diropargs3	where;
+ *		symlinkdata3	symlink;
+ *	};
+ */
+static void encode_symlinkdata3(struct xdr_stream *xdr,
+				const struct nfs3_symlinkargs *args)
+{
+	encode_sattr3(xdr, args->sattr);
+	encode_nfspath3(xdr, args->pages, args->pathlen);
+}
+
+static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_symlinkargs *args)
+{
+	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
+	encode_symlinkdata3(xdr, args);
+}
+
+/*
+ * 3.3.11  MKNOD3args
+ *
+ *	struct devicedata3 {
+ *		sattr3		dev_attributes;
+ *		specdata3	spec;
+ *	};
+ *
+ *	union mknoddata3 switch (ftype3 type) {
+ *	case NF3CHR:
+ *	case NF3BLK:
+ *		devicedata3	device;
+ *	case NF3SOCK:
+ *	case NF3FIFO:
+ *		sattr3		pipe_attributes;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct MKNOD3args {
+ *		diropargs3	where;
+ *		mknoddata3	what;
+ *	};
+ */
+static void encode_devicedata3(struct xdr_stream *xdr,
+			       const struct nfs3_mknodargs *args)
+{
+	encode_sattr3(xdr, args->sattr);
+	encode_specdata3(xdr, args->rdev);
+}
+
+static void encode_mknoddata3(struct xdr_stream *xdr,
+			      const struct nfs3_mknodargs *args)
+{
+	encode_ftype3(xdr, args->type);
+	switch (args->type) {
+	case NF3CHR:
+	case NF3BLK:
+		encode_devicedata3(xdr, args);
+		break;
+	case NF3SOCK:
+	case NF3FIFO:
+		encode_sattr3(xdr, args->sattr);
+		break;
+	case NF3REG:
+	case NF3DIR:
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs3_mknodargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_mknoddata3(xdr, args);
+}
+
+/*
+ * 3.3.12  REMOVE3args
+ *
+ *	struct REMOVE3args {
+ *		diropargs3  object;
+ *	};
+ */
+static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_removeargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
+}
+
+/*
+ * 3.3.14  RENAME3args
+ *
+ *	struct RENAME3args {
+ *		diropargs3	from;
+ *		diropargs3	to;
+ *	};
+ */
+static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_renameargs *args)
+{
+	const struct qstr *old = args->old_name;
+	const struct qstr *new = args->new_name;
+
+	encode_diropargs3(xdr, args->old_dir, old->name, old->len);
+	encode_diropargs3(xdr, args->new_dir, new->name, new->len);
+}
+
+/*
+ * 3.3.15  LINK3args
+ *
+ *	struct LINK3args {
+ *		nfs_fh3		file;
+ *		diropargs3	link;
+ *	};
+ */
+static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs3_linkargs *args)
+{
+	encode_nfs_fh3(xdr, args->fromfh);
+	encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
+}
+
+/*
+ * 3.3.16  READDIR3args
+ *
+ *	struct READDIR3args {
+ *		nfs_fh3		dir;
+ *		cookie3		cookie;
+ *		cookieverf3	cookieverf;
+ *		count3		count;
+ *	};
+ */
+static void encode_readdir3args(struct xdr_stream *xdr,
+				const struct nfs3_readdirargs *args)
+{
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
+	p = xdr_encode_cookie3(p, args->cookie);
+	p = xdr_encode_cookieverf3(p, args->verf);
+	*p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_readdirargs *args)
+{
+	encode_readdir3args(xdr, args);
+	prepare_reply_buffer(req, args->pages, 0,
+				args->count, NFS3_readdirres_sz);
+}
+
+/*
+ * 3.3.17  READDIRPLUS3args
+ *
+ *	struct READDIRPLUS3args {
+ *		nfs_fh3		dir;
+ *		cookie3		cookie;
+ *		cookieverf3	cookieverf;
+ *		count3		dircount;
+ *		count3		maxcount;
+ *	};
+ */
+static void encode_readdirplus3args(struct xdr_stream *xdr,
+				    const struct nfs3_readdirargs *args)
+{
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
+	p = xdr_encode_cookie3(p, args->cookie);
+	p = xdr_encode_cookieverf3(p, args->verf);
+
+	/*
+	 * readdirplus: need dircount + buffer size.
+	 * We just make sure we make dircount big enough
+	 */
+	*p++ = cpu_to_be32(args->count >> 3);
+
+	*p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
+					  struct xdr_stream *xdr,
+					  const struct nfs3_readdirargs *args)
+{
+	encode_readdirplus3args(xdr, args);
+	prepare_reply_buffer(req, args->pages, 0,
+				args->count, NFS3_readdirres_sz);
+}
+
+/*
+ * 3.3.21  COMMIT3args
+ *
+ *	struct COMMIT3args {
+ *		nfs_fh3		file;
+ *		offset3		offset;
+ *		count3		count;
+ *	};
+ */
+static void encode_commit3args(struct xdr_stream *xdr,
+			       const struct nfs_commitargs *args)
+{
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 8 + 4);
+	p = xdr_encode_hyper(p, args->offset);
+	*p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_commitargs *args)
+{
+	encode_commit3args(xdr, args);
+}
+
+#ifdef CONFIG_NFS_V3_ACL
+
+static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_getaclargs *args)
+{
+	encode_nfs_fh3(xdr, args->fh);
+	encode_uint32(xdr, args->mask);
+	if (args->mask & (NFS_ACL | NFS_DFACL))
+		prepare_reply_buffer(req, args->pages, 0,
+					NFSACL_MAXPAGES << PAGE_SHIFT,
+					ACL3_getaclres_sz);
+}
+
+static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_setaclargs *args)
+{
+	unsigned int base;
+	int error;
+
+	encode_nfs_fh3(xdr, NFS_FH(args->inode));
+	encode_uint32(xdr, args->mask);
+
+	base = req->rq_slen;
+	if (args->npages != 0)
+		xdr_write_pages(xdr, args->pages, 0, args->len);
+	else
+		xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
+
+	error = nfsacl_encode(xdr->buf, base, args->inode,
+			    (args->mask & NFS_ACL) ?
+			    args->acl_access : NULL, 1, 0);
+	/* FIXME: this is just broken */
+	BUG_ON(error < 0);
+	error = nfsacl_encode(xdr->buf, base + error, args->inode,
+			    (args->mask & NFS_DFACL) ?
+			    args->acl_default : NULL, 1,
+			    NFS_ACL_DEFAULT);
+	BUG_ON(error < 0);
+}
+
+#endif  /* CONFIG_NFS_V3_ACL */
+
+/*
+ * NFSv3 XDR decode functions
+ *
+ * NFSv3 result types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ */
+
+/*
+ * 3.3.1  GETATTR3res
+ *
+ *	struct GETATTR3resok {
+ *		fattr3		obj_attributes;
+ *	};
+ *
+ *	union GETATTR3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		GETATTR3resok  resok;
+ *	default:
+ *		void;
+ *	};
+ */
+static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs_fattr *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_default;
+	error = decode_fattr3(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.2  SETATTR3res
+ *
+ *	struct SETATTR3resok {
+ *		wcc_data  obj_wcc;
+ *	};
+ *
+ *	struct SETATTR3resfail {
+ *		wcc_data  obj_wcc;
+ *	};
+ *
+ *	union SETATTR3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		SETATTR3resok   resok;
+ *	default:
+ *		SETATTR3resfail resfail;
+ *	};
+ */
+static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs_fattr *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_wcc_data(xdr, result);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_status;
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.3  LOOKUP3res
+ *
+ *	struct LOOKUP3resok {
+ *		nfs_fh3		object;
+ *		post_op_attr	obj_attributes;
+ *		post_op_attr	dir_attributes;
+ *	};
+ *
+ *	struct LOOKUP3resfail {
+ *		post_op_attr	dir_attributes;
+ *	};
+ *
+ *	union LOOKUP3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		LOOKUP3resok	resok;
+ *	default:
+ *		LOOKUP3resfail	resfail;
+ *	};
+ */
+static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs3_diropres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_default;
+	error = decode_nfs_fh3(xdr, result->fh);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->dir_attr);
+out:
+	return error;
+out_default:
+	error = decode_post_op_attr(xdr, result->dir_attr);
+	if (unlikely(error))
+		goto out;
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.4  ACCESS3res
+ *
+ *	struct ACCESS3resok {
+ *		post_op_attr	obj_attributes;
+ *		uint32		access;
+ *	};
+ *
+ *	struct ACCESS3resfail {
+ *		post_op_attr	obj_attributes;
+ *	};
+ *
+ *	union ACCESS3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		ACCESS3resok	resok;
+ *	default:
+ *		ACCESS3resfail	resfail;
+ *	};
+ */
+static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs3_accessres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_default;
+	error = decode_uint32(xdr, &result->access);
+out:
+	return error;
+out_default:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.5  READLINK3res
+ *
+ *	struct READLINK3resok {
+ *		post_op_attr	symlink_attributes;
+ *		nfspath3	data;
+ *	};
+ *
+ *	struct READLINK3resfail {
+ *		post_op_attr	symlink_attributes;
+ *	};
+ *
+ *	union READLINK3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		READLINK3resok	resok;
+ *	default:
+ *		READLINK3resfail resfail;
+ *	};
+ */
+static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs_fattr *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_default;
+	error = decode_nfspath3(xdr);
+out:
+	return error;
+out_default:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.6  READ3res
+ *
+ *	struct READ3resok {
+ *		post_op_attr	file_attributes;
+ *		count3		count;
+ *		bool		eof;
+ *		opaque		data<>;
+ *	};
+ *
+ *	struct READ3resfail {
+ *		post_op_attr	file_attributes;
+ *	};
+ *
+ *	union READ3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		READ3resok	resok;
+ *	default:
+ *		READ3resfail	resfail;
+ *	};
+ */
+static int decode_read3resok(struct xdr_stream *xdr,
+			     struct nfs_pgio_res *result)
+{
+	u32 eof, count, ocount, recvd;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 + 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p++);
+	eof = be32_to_cpup(p++);
+	ocount = be32_to_cpup(p++);
+	if (unlikely(ocount != count))
+		goto out_mismatch;
+	recvd = xdr_read_pages(xdr, count);
+	if (unlikely(count > recvd))
+		goto out_cheating;
+out:
+	result->eof = eof;
+	result->count = count;
+	return count;
+out_mismatch:
+	dprintk("NFS: READ count doesn't match length of opaque: "
+		"count %u != ocount %u\n", count, ocount);
+	return -EIO;
+out_cheating:
+	dprintk("NFS: server cheating in read result: "
+		"count %u > recvd %u\n", count, recvd);
+	count = recvd;
+	eof = 0;
+	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_pgio_res *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	result->op_status = status;
+	if (status != NFS3_OK)
+		goto out_status;
+	error = decode_read3resok(xdr, result);
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.7  WRITE3res
+ *
+ *	enum stable_how {
+ *		UNSTABLE  = 0,
+ *		DATA_SYNC = 1,
+ *		FILE_SYNC = 2
+ *	};
+ *
+ *	struct WRITE3resok {
+ *		wcc_data	file_wcc;
+ *		count3		count;
+ *		stable_how	committed;
+ *		writeverf3	verf;
+ *	};
+ *
+ *	struct WRITE3resfail {
+ *		wcc_data	file_wcc;
+ *	};
+ *
+ *	union WRITE3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		WRITE3resok	resok;
+ *	default:
+ *		WRITE3resfail	resfail;
+ *	};
+ */
+static int decode_write3resok(struct xdr_stream *xdr,
+			      struct nfs_pgio_res *result)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	result->count = be32_to_cpup(p++);
+	result->verf->committed = be32_to_cpup(p++);
+	if (unlikely(result->verf->committed > NFS_FILE_SYNC))
+		goto out_badvalue;
+	if (decode_writeverf3(xdr, &result->verf->verifier))
+		goto out_eio;
+	return result->count;
+out_badvalue:
+	dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+out_eio:
+	return -EIO;
+}
+
+static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs_pgio_res *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_wcc_data(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	result->op_status = status;
+	if (status != NFS3_OK)
+		goto out_status;
+	error = decode_write3resok(xdr, result);
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.8  CREATE3res
+ *
+ *	struct CREATE3resok {
+ *		post_op_fh3	obj;
+ *		post_op_attr	obj_attributes;
+ *		wcc_data	dir_wcc;
+ *	};
+ *
+ *	struct CREATE3resfail {
+ *		wcc_data	dir_wcc;
+ *	};
+ *
+ *	union CREATE3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		CREATE3resok	resok;
+ *	default:
+ *		CREATE3resfail	resfail;
+ *	};
+ */
+static int decode_create3resok(struct xdr_stream *xdr,
+			       struct nfs3_diropres *result)
+{
+	int error;
+
+	error = decode_post_op_fh3(xdr, result->fh);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	/* The server isn't required to return a file handle.
+	 * If it didn't, force the client to perform a LOOKUP
+	 * to determine the correct file handle and attribute
+	 * values for the new object. */
+	if (result->fh->size == 0)
+		result->fattr->valid = 0;
+	error = decode_wcc_data(xdr, result->dir_attr);
+out:
+	return error;
+}
+
+static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs3_diropres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_default;
+	error = decode_create3resok(xdr, result);
+out:
+	return error;
+out_default:
+	error = decode_wcc_data(xdr, result->dir_attr);
+	if (unlikely(error))
+		goto out;
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.12  REMOVE3res
+ *
+ *	struct REMOVE3resok {
+ *		wcc_data    dir_wcc;
+ *	};
+ *
+ *	struct REMOVE3resfail {
+ *		wcc_data    dir_wcc;
+ *	};
+ *
+ *	union REMOVE3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		REMOVE3resok   resok;
+ *	default:
+ *		REMOVE3resfail resfail;
+ *	};
+ */
+static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs_removeres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_wcc_data(xdr, result->dir_attr);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_status;
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.14  RENAME3res
+ *
+ *	struct RENAME3resok {
+ *		wcc_data	fromdir_wcc;
+ *		wcc_data	todir_wcc;
+ *	};
+ *
+ *	struct RENAME3resfail {
+ *		wcc_data	fromdir_wcc;
+ *		wcc_data	todir_wcc;
+ *	};
+ *
+ *	union RENAME3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		RENAME3resok   resok;
+ *	default:
+ *		RENAME3resfail resfail;
+ *	};
+ */
+static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs_renameres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_wcc_data(xdr, result->old_fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_wcc_data(xdr, result->new_fattr);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_status;
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.15  LINK3res
+ *
+ *	struct LINK3resok {
+ *		post_op_attr	file_attributes;
+ *		wcc_data	linkdir_wcc;
+ *	};
+ *
+ *	struct LINK3resfail {
+ *		post_op_attr	file_attributes;
+ *		wcc_data	linkdir_wcc;
+ *	};
+ *
+ *	union LINK3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		LINK3resok	resok;
+ *	default:
+ *		LINK3resfail	resfail;
+ *	};
+ */
+static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs3_linkres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_wcc_data(xdr, result->dir_attr);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_status;
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/**
+ * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
+ *			the local page cache
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 3.3.16  entry3
+ *
+ *	struct entry3 {
+ *		fileid3		fileid;
+ *		filename3	name;
+ *		cookie3		cookie;
+ *		fhandle3	filehandle;
+ *		post_op_attr3	attributes;
+ *		entry3		*nextentry;
+ *	};
+ *
+ * 3.3.17  entryplus3
+ *	struct entryplus3 {
+ *		fileid3		fileid;
+ *		filename3	name;
+ *		cookie3		cookie;
+ *		post_op_attr	name_attributes;
+ *		post_op_fh3	name_handle;
+ *		entryplus3	*nextentry;
+ *	};
+ */
+int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+		       int plus)
+{
+	struct nfs_entry old = *entry;
+	__be32 *p;
+	int error;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p == xdr_zero) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(p == NULL))
+			goto out_overflow;
+		if (*p == xdr_zero)
+			return -EAGAIN;
+		entry->eof = 1;
+		return -EBADCOOKIE;
+	}
+
+	error = decode_fileid3(xdr, &entry->ino);
+	if (unlikely(error))
+		return error;
+
+	error = decode_inline_filename3(xdr, &entry->name, &entry->len);
+	if (unlikely(error))
+		return error;
+
+	entry->prev_cookie = entry->cookie;
+	error = decode_cookie3(xdr, &entry->cookie);
+	if (unlikely(error))
+		return error;
+
+	entry->d_type = DT_UNKNOWN;
+
+	if (plus) {
+		entry->fattr->valid = 0;
+		error = decode_post_op_attr(xdr, entry->fattr);
+		if (unlikely(error))
+			return error;
+		if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
+			entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+
+		if (entry->fattr->fileid != entry->ino) {
+			entry->fattr->mounted_on_fileid = entry->ino;
+			entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+		}
+
+		/* In fact, a post_op_fh3: */
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(p == NULL))
+			goto out_overflow;
+		if (*p != xdr_zero) {
+			error = decode_nfs_fh3(xdr, entry->fh);
+			if (unlikely(error)) {
+				if (error == -E2BIG)
+					goto out_truncated;
+				return error;
+			}
+		} else
+			zero_nfs_fh3(entry->fh);
+	}
+
+	return 0;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EAGAIN;
+out_truncated:
+	dprintk("NFS: directory entry contains invalid file handle\n");
+	*entry = old;
+	return -EAGAIN;
+}
+
+/*
+ * 3.3.16  READDIR3res
+ *
+ *	struct dirlist3 {
+ *		entry3		*entries;
+ *		bool		eof;
+ *	};
+ *
+ *	struct READDIR3resok {
+ *		post_op_attr	dir_attributes;
+ *		cookieverf3	cookieverf;
+ *		dirlist3	reply;
+ *	};
+ *
+ *	struct READDIR3resfail {
+ *		post_op_attr	dir_attributes;
+ *	};
+ *
+ *	union READDIR3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		READDIR3resok	resok;
+ *	default:
+ *		READDIR3resfail	resfail;
+ *	};
+ *
+ * Read the directory contents into the page cache, but otherwise
+ * don't touch them.  The actual decoding is done by nfs3_decode_entry()
+ * during subsequent nfs_readdir() calls.
+ */
+static int decode_dirlist3(struct xdr_stream *xdr)
+{
+	return xdr_read_pages(xdr, xdr->buf->page_len);
+}
+
+static int decode_readdir3resok(struct xdr_stream *xdr,
+				struct nfs3_readdirres *result)
+{
+	int error;
+
+	error = decode_post_op_attr(xdr, result->dir_attr);
+	if (unlikely(error))
+		goto out;
+	/* XXX: do we need to check if result->verf != NULL ? */
+	error = decode_cookieverf3(xdr, result->verf);
+	if (unlikely(error))
+		goto out;
+	error = decode_dirlist3(xdr);
+out:
+	return error;
+}
+
+static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs3_readdirres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_default;
+	error = decode_readdir3resok(xdr, result);
+out:
+	return error;
+out_default:
+	error = decode_post_op_attr(xdr, result->dir_attr);
+	if (unlikely(error))
+		goto out;
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.18  FSSTAT3res
+ *
+ *	struct FSSTAT3resok {
+ *		post_op_attr	obj_attributes;
+ *		size3		tbytes;
+ *		size3		fbytes;
+ *		size3		abytes;
+ *		size3		tfiles;
+ *		size3		ffiles;
+ *		size3		afiles;
+ *		uint32		invarsec;
+ *	};
+ *
+ *	struct FSSTAT3resfail {
+ *		post_op_attr	obj_attributes;
+ *	};
+ *
+ *	union FSSTAT3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		FSSTAT3resok	resok;
+ *	default:
+ *		FSSTAT3resfail	resfail;
+ *	};
+ */
+static int decode_fsstat3resok(struct xdr_stream *xdr,
+			       struct nfs_fsstat *result)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8 * 6 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	p = xdr_decode_size3(p, &result->tbytes);
+	p = xdr_decode_size3(p, &result->fbytes);
+	p = xdr_decode_size3(p, &result->abytes);
+	p = xdr_decode_size3(p, &result->tfiles);
+	p = xdr_decode_size3(p, &result->ffiles);
+	xdr_decode_size3(p, &result->afiles);
+	/* ignore invarsec */
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs_fsstat *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_status;
+	error = decode_fsstat3resok(xdr, result);
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.19  FSINFO3res
+ *
+ *	struct FSINFO3resok {
+ *		post_op_attr	obj_attributes;
+ *		uint32		rtmax;
+ *		uint32		rtpref;
+ *		uint32		rtmult;
+ *		uint32		wtmax;
+ *		uint32		wtpref;
+ *		uint32		wtmult;
+ *		uint32		dtpref;
+ *		size3		maxfilesize;
+ *		nfstime3	time_delta;
+ *		uint32		properties;
+ *	};
+ *
+ *	struct FSINFO3resfail {
+ *		post_op_attr	obj_attributes;
+ *	};
+ *
+ *	union FSINFO3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		FSINFO3resok	resok;
+ *	default:
+ *		FSINFO3resfail	resfail;
+ *	};
+ */
+static int decode_fsinfo3resok(struct xdr_stream *xdr,
+			       struct nfs_fsinfo *result)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	result->rtmax  = be32_to_cpup(p++);
+	result->rtpref = be32_to_cpup(p++);
+	result->rtmult = be32_to_cpup(p++);
+	result->wtmax  = be32_to_cpup(p++);
+	result->wtpref = be32_to_cpup(p++);
+	result->wtmult = be32_to_cpup(p++);
+	result->dtpref = be32_to_cpup(p++);
+	p = xdr_decode_size3(p, &result->maxfilesize);
+	xdr_decode_nfstime3(p, &result->time_delta);
+
+	/* ignore properties */
+	result->lease_time = 0;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs_fsinfo *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_status;
+	error = decode_fsinfo3resok(xdr, result);
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.20  PATHCONF3res
+ *
+ *	struct PATHCONF3resok {
+ *		post_op_attr	obj_attributes;
+ *		uint32		linkmax;
+ *		uint32		name_max;
+ *		bool		no_trunc;
+ *		bool		chown_restricted;
+ *		bool		case_insensitive;
+ *		bool		case_preserving;
+ *	};
+ *
+ *	struct PATHCONF3resfail {
+ *		post_op_attr	obj_attributes;
+ *	};
+ *
+ *	union PATHCONF3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		PATHCONF3resok	resok;
+ *	default:
+ *		PATHCONF3resfail resfail;
+ *	};
+ */
+static int decode_pathconf3resok(struct xdr_stream *xdr,
+				 struct nfs_pathconf *result)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 * 6);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	result->max_link = be32_to_cpup(p++);
+	result->max_namelen = be32_to_cpup(p);
+	/* ignore remaining fields */
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs_pathconf *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_status;
+	error = decode_pathconf3resok(xdr, result);
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.21  COMMIT3res
+ *
+ *	struct COMMIT3resok {
+ *		wcc_data	file_wcc;
+ *		writeverf3	verf;
+ *	};
+ *
+ *	struct COMMIT3resfail {
+ *		wcc_data	file_wcc;
+ *	};
+ *
+ *	union COMMIT3res switch (nfsstat3 status) {
+ *	case NFS3_OK:
+ *		COMMIT3resok	resok;
+ *	default:
+ *		COMMIT3resfail	resfail;
+ *	};
+ */
+static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs_commitres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	error = decode_wcc_data(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	result->op_status = status;
+	if (status != NFS3_OK)
+		goto out_status;
+	error = decode_writeverf3(xdr, &result->verf->verifier);
+out:
+	return error;
+out_status:
+	return nfs3_stat_to_errno(status);
+}
+
+#ifdef CONFIG_NFS_V3_ACL
+
+static inline int decode_getacl3resok(struct xdr_stream *xdr,
+				      struct nfs3_getaclres *result)
+{
+	struct posix_acl **acl;
+	unsigned int *aclcnt;
+	size_t hdrlen;
+	int error;
+
+	error = decode_post_op_attr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_uint32(xdr, &result->mask);
+	if (unlikely(error))
+		goto out;
+	error = -EINVAL;
+	if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+		goto out;
+
+	hdrlen = xdr_stream_pos(xdr);
+
+	acl = NULL;
+	if (result->mask & NFS_ACL)
+		acl = &result->acl_access;
+	aclcnt = NULL;
+	if (result->mask & NFS_ACLCNT)
+		aclcnt = &result->acl_access_count;
+	error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
+	if (unlikely(error <= 0))
+		goto out;
+
+	acl = NULL;
+	if (result->mask & NFS_DFACL)
+		acl = &result->acl_default;
+	aclcnt = NULL;
+	if (result->mask & NFS_DFACLCNT)
+		aclcnt = &result->acl_default_count;
+	error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
+	if (unlikely(error <= 0))
+		return error;
+	error = 0;
+out:
+	return error;
+}
+
+static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs3_getaclres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_default;
+	error = decode_getacl3resok(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs3_stat_to_errno(status);
+}
+
+static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs_fattr *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS3_OK)
+		goto out_default;
+	error = decode_post_op_attr(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs3_stat_to_errno(status);
+}
+
+#endif  /* CONFIG_NFS_V3_ACL */
+
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static const struct {
+	int stat;
+	int errno;
+} nfs_errtbl[] = {
+	{ NFS_OK,		0		},
+	{ NFSERR_PERM,		-EPERM		},
+	{ NFSERR_NOENT,		-ENOENT		},
+	{ NFSERR_IO,		-errno_NFSERR_IO},
+	{ NFSERR_NXIO,		-ENXIO		},
+/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
+	{ NFSERR_ACCES,		-EACCES		},
+	{ NFSERR_EXIST,		-EEXIST		},
+	{ NFSERR_XDEV,		-EXDEV		},
+	{ NFSERR_NODEV,		-ENODEV		},
+	{ NFSERR_NOTDIR,	-ENOTDIR	},
+	{ NFSERR_ISDIR,		-EISDIR		},
+	{ NFSERR_INVAL,		-EINVAL		},
+	{ NFSERR_FBIG,		-EFBIG		},
+	{ NFSERR_NOSPC,		-ENOSPC		},
+	{ NFSERR_ROFS,		-EROFS		},
+	{ NFSERR_MLINK,		-EMLINK		},
+	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFSERR_DQUOT,		-EDQUOT		},
+	{ NFSERR_STALE,		-ESTALE		},
+	{ NFSERR_REMOTE,	-EREMOTE	},
+#ifdef EWFLUSH
+	{ NFSERR_WFLUSH,	-EWFLUSH	},
+#endif
+	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
+	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
+	{ NFSERR_BADTYPE,	-EBADTYPE	},
+	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
+	{ -1,			-EIO		}
+};
+
+/**
+ * nfs3_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  This function is used jointly by NFSv2 and NFSv3.
+ */
+static int nfs3_stat_to_errno(enum nfs_stat status)
+{
+	int i;
+
+	for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+		if (nfs_errtbl[i].stat == (int)status)
+			return nfs_errtbl[i].errno;
+	}
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+	return nfs_errtbl[i].errno;
+}
+
+
+#define PROC(proc, argtype, restype, timer)				\
+[NFS3PROC_##proc] = {							\
+	.p_proc      = NFS3PROC_##proc,					\
+	.p_encode    = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args,	\
+	.p_decode    = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res,	\
+	.p_arglen    = NFS3_##argtype##args_sz,				\
+	.p_replen    = NFS3_##restype##res_sz,				\
+	.p_timer     = timer,						\
+	.p_statidx   = NFS3PROC_##proc,					\
+	.p_name      = #proc,						\
+	}
+
+struct rpc_procinfo	nfs3_procedures[] = {
+	PROC(GETATTR,		getattr,	getattr,	1),
+	PROC(SETATTR,		setattr,	setattr,	0),
+	PROC(LOOKUP,		lookup,		lookup,		2),
+	PROC(ACCESS,		access,		access,		1),
+	PROC(READLINK,		readlink,	readlink,	3),
+	PROC(READ,		read,		read,		3),
+	PROC(WRITE,		write,		write,		4),
+	PROC(CREATE,		create,		create,		0),
+	PROC(MKDIR,		mkdir,		create,		0),
+	PROC(SYMLINK,		symlink,	create,		0),
+	PROC(MKNOD,		mknod,		create,		0),
+	PROC(REMOVE,		remove,		remove,		0),
+	PROC(RMDIR,		lookup,		setattr,	0),
+	PROC(RENAME,		rename,		rename,		0),
+	PROC(LINK,		link,		link,		0),
+	PROC(READDIR,		readdir,	readdir,	3),
+	PROC(READDIRPLUS,	readdirplus,	readdir,	3),
+	PROC(FSSTAT,		getattr,	fsstat,		0),
+	PROC(FSINFO,		getattr,	fsinfo,		0),
+	PROC(PATHCONF,		getattr,	pathconf,	0),
+	PROC(COMMIT,		commit,		commit,		5),
+};
+
+const struct rpc_version nfs_version3 = {
+	.number			= 3,
+	.nrprocs		= ARRAY_SIZE(nfs3_procedures),
+	.procs			= nfs3_procedures
+};
+
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_procinfo	nfs3_acl_procedures[] = {
+	[ACLPROC3_GETACL] = {
+		.p_proc = ACLPROC3_GETACL,
+		.p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
+		.p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
+		.p_arglen = ACL3_getaclargs_sz,
+		.p_replen = ACL3_getaclres_sz,
+		.p_timer = 1,
+		.p_name = "GETACL",
+	},
+	[ACLPROC3_SETACL] = {
+		.p_proc = ACLPROC3_SETACL,
+		.p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
+		.p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
+		.p_arglen = ACL3_setaclargs_sz,
+		.p_replen = ACL3_setaclres_sz,
+		.p_timer = 0,
+		.p_name = "SETACL",
+	},
+};
+
+const struct rpc_version nfsacl_version3 = {
+	.number			= 3,
+	.nrprocs		= sizeof(nfs3_acl_procedures)/
+				  sizeof(nfs3_acl_procedures[0]),
+	.procs			= nfs3_acl_procedures,
+};
+#endif  /* CONFIG_NFS_V3_ACL */
diff --git a/kernel/fs/nfs/nfs42.h b/kernel/fs/nfs/nfs42.h
new file mode 100644
index 000000000..7afb8947d
--- /dev/null
+++ b/kernel/fs/nfs/nfs42.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_2_H
+#define __LINUX_FS_NFS_NFS4_2_H
+
+/* nfs4.2proc.c */
+int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
+loff_t nfs42_proc_llseek(struct file *, loff_t, int);
+
+/* nfs4.2xdr.h */
+extern struct rpc_procinfo nfs4_2_procedures[];
+
+#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/kernel/fs/nfs/nfs42proc.c b/kernel/fs/nfs/nfs42proc.c
new file mode 100644
index 000000000..3a9e75235
--- /dev/null
+++ b/kernel/fs/nfs/nfs42proc.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#include <linux/fs.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_xdr.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "nfs42.h"
+
+static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
+				fmode_t fmode)
+{
+	struct nfs_open_context *open;
+	struct nfs_lock_context *lock;
+	int ret;
+
+	open = get_nfs_open_context(nfs_file_open_context(file));
+	lock = nfs_get_lock_context(open);
+	if (IS_ERR(lock)) {
+		put_nfs_open_context(open);
+		return PTR_ERR(lock);
+	}
+
+	ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
+
+	nfs_put_lock_context(lock);
+	put_nfs_open_context(open);
+	return ret;
+}
+
+static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
+				 loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(filep);
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs42_falloc_args args = {
+		.falloc_fh	= NFS_FH(inode),
+		.falloc_offset	= offset,
+		.falloc_length	= len,
+		.falloc_bitmask	= server->cache_consistency_bitmask,
+	};
+	struct nfs42_falloc_res res = {
+		.falloc_server	= server,
+	};
+	int status;
+
+	msg->rpc_argp = &args;
+	msg->rpc_resp = &res;
+
+	status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
+	if (status)
+		return status;
+
+	res.falloc_fattr = nfs_alloc_fattr();
+	if (!res.falloc_fattr)
+		return -ENOMEM;
+
+	status = nfs4_call_sync(server->client, server, msg,
+				&args.seq_args, &res.seq_res, 0);
+	if (status == 0)
+		status = nfs_post_op_update_inode(inode, res.falloc_fattr);
+
+	kfree(res.falloc_fattr);
+	return status;
+}
+
+static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
+				loff_t offset, loff_t len)
+{
+	struct nfs_server *server = NFS_SERVER(file_inode(filep));
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = _nfs42_proc_fallocate(msg, filep, offset, len);
+		if (err == -ENOTSUPP)
+			return -EOPNOTSUPP;
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+
+	return err;
+}
+
+int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
+	};
+	struct inode *inode = file_inode(filep);
+	int err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&inode->i_mutex);
+
+	err = nfs42_proc_fallocate(&msg, filep, offset, len);
+	if (err == -EOPNOTSUPP)
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+
+	mutex_unlock(&inode->i_mutex);
+	return err;
+}
+
+int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE],
+	};
+	struct inode *inode = file_inode(filep);
+	int err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
+		return -EOPNOTSUPP;
+
+	nfs_wb_all(inode);
+	mutex_lock(&inode->i_mutex);
+
+	err = nfs42_proc_fallocate(&msg, filep, offset, len);
+	if (err == 0)
+		truncate_pagecache_range(inode, offset, (offset + len) -1);
+	if (err == -EOPNOTSUPP)
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+
+	mutex_unlock(&inode->i_mutex);
+	return err;
+}
+
+loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+{
+	struct inode *inode = file_inode(filep);
+	struct nfs42_seek_args args = {
+		.sa_fh		= NFS_FH(inode),
+		.sa_offset	= offset,
+		.sa_what	= (whence == SEEK_HOLE) ?
+					NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
+	};
+	struct nfs42_seek_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct nfs_server *server = NFS_SERVER(inode);
+	int status;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SEEK))
+		return -ENOTSUPP;
+
+	status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
+	if (status)
+		return status;
+
+	nfs_wb_all(inode);
+	status = nfs4_call_sync(server->client, server, &msg,
+				&args.seq_args, &res.seq_res, 0);
+	if (status == -ENOTSUPP)
+		server->caps &= ~NFS_CAP_SEEK;
+	if (status)
+		return status;
+
+	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
+}
diff --git a/kernel/fs/nfs/nfs42xdr.c b/kernel/fs/nfs/nfs42xdr.c
new file mode 100644
index 000000000..1a25b2724
--- /dev/null
+++ b/kernel/fs/nfs/nfs42xdr.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
+#define __LINUX_FS_NFS_NFS4_2XDR_H
+
+#define encode_fallocate_maxsz		(encode_stateid_maxsz + \
+					 2 /* offset */ + \
+					 2 /* length */)
+#define encode_allocate_maxsz		(op_encode_hdr_maxsz + \
+					 encode_fallocate_maxsz)
+#define decode_allocate_maxsz		(op_decode_hdr_maxsz)
+#define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
+					 encode_fallocate_maxsz)
+#define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
+#define encode_seek_maxsz		(op_encode_hdr_maxsz + \
+					 encode_stateid_maxsz + \
+					 2 /* offset */ + \
+					 1 /* whence */)
+#define decode_seek_maxsz		(op_decode_hdr_maxsz + \
+					 1 /* eof */ + \
+					 1 /* whence */ + \
+					 2 /* offset */ + \
+					 2 /* length */)
+
+#define NFS4_enc_allocate_sz		(compound_encode_hdr_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_allocate_maxsz + \
+					 encode_getattr_maxsz)
+#define NFS4_dec_allocate_sz		(compound_decode_hdr_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_allocate_maxsz + \
+					 decode_getattr_maxsz)
+#define NFS4_enc_deallocate_sz		(compound_encode_hdr_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_deallocate_maxsz + \
+					 encode_getattr_maxsz)
+#define NFS4_dec_deallocate_sz		(compound_decode_hdr_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_deallocate_maxsz + \
+					 decode_getattr_maxsz)
+#define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_seek_maxsz)
+#define NFS4_dec_seek_sz		(compound_decode_hdr_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_seek_maxsz)
+
+
+static void encode_fallocate(struct xdr_stream *xdr,
+			     struct nfs42_falloc_args *args)
+{
+	encode_nfs4_stateid(xdr, &args->falloc_stateid);
+	encode_uint64(xdr, args->falloc_offset);
+	encode_uint64(xdr, args->falloc_length);
+}
+
+static void encode_allocate(struct xdr_stream *xdr,
+			    struct nfs42_falloc_args *args,
+			    struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_ALLOCATE, decode_allocate_maxsz, hdr);
+	encode_fallocate(xdr, args);
+}
+
+static void encode_deallocate(struct xdr_stream *xdr,
+			      struct nfs42_falloc_args *args,
+			      struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_DEALLOCATE, decode_deallocate_maxsz, hdr);
+	encode_fallocate(xdr, args);
+}
+
+static void encode_seek(struct xdr_stream *xdr,
+			struct nfs42_seek_args *args,
+			struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_SEEK, decode_seek_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->sa_stateid);
+	encode_uint64(xdr, args->sa_offset);
+	encode_uint32(xdr, args->sa_what);
+}
+
+/*
+ * Encode ALLOCATE request
+ */
+static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  struct nfs42_falloc_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->falloc_fh, &hdr);
+	encode_allocate(xdr, args, &hdr);
+	encode_getfattr(xdr, args->falloc_bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode DEALLOCATE request
+ */
+static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs42_falloc_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->falloc_fh, &hdr);
+	encode_deallocate(xdr, args, &hdr);
+	encode_getfattr(xdr, args->falloc_bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode SEEK request
+ */
+static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
+			      struct xdr_stream *xdr,
+			      struct nfs42_seek_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->sa_fh, &hdr);
+	encode_seek(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
+{
+	return decode_op_hdr(xdr, OP_ALLOCATE);
+}
+
+static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
+{
+	return decode_op_hdr(xdr, OP_DEALLOCATE);
+}
+
+static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
+{
+	int status;
+	__be32 *p;
+
+	status = decode_op_hdr(xdr, OP_SEEK);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4 + 8);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	res->sr_eof = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &res->sr_offset);
+	return 0;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * Decode ALLOCATE request
+ */
+static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
+				 struct nfs42_falloc_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_allocate(xdr, res);
+	if (status)
+		goto out;
+	decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
+out:
+	return status;
+}
+
+/*
+ * Decode DEALLOCATE request
+ */
+static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
+				   struct xdr_stream *xdr,
+				   struct nfs42_falloc_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_deallocate(xdr, res);
+	if (status)
+		goto out;
+	decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
+out:
+	return status;
+}
+
+/*
+ * Decode SEEK request
+ */
+static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
+			     struct xdr_stream *xdr,
+			     struct nfs42_seek_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_seek(xdr, res);
+out:
+	return status;
+}
+#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/kernel/fs/nfs/nfs4_fs.h b/kernel/fs/nfs/nfs4_fs.h
new file mode 100644
index 000000000..fdef424b0
--- /dev/null
+++ b/kernel/fs/nfs/nfs4_fs.h
@@ -0,0 +1,529 @@
+/*
+ * linux/fs/nfs/nfs4_fs.h
+ *
+ * Copyright (C) 2005 Trond Myklebust
+ *
+ * NFSv4-specific filesystem definitions and declarations
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_FS_H
+#define __LINUX_FS_NFS_NFS4_FS_H
+
+#if defined(CONFIG_NFS_V4_2)
+#define NFS4_MAX_MINOR_VERSION 2
+#elif defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MINOR_VERSION 1
+#else
+#define NFS4_MAX_MINOR_VERSION 0
+#endif
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+#define NFS4_MAX_LOOP_ON_RECOVER (10)
+
+#include <linux/seqlock.h>
+
+struct idmap;
+
+enum nfs4_client_state {
+	NFS4CLNT_MANAGER_RUNNING  = 0,
+	NFS4CLNT_CHECK_LEASE,
+	NFS4CLNT_LEASE_EXPIRED,
+	NFS4CLNT_RECLAIM_REBOOT,
+	NFS4CLNT_RECLAIM_NOGRACE,
+	NFS4CLNT_DELEGRETURN,
+	NFS4CLNT_SESSION_RESET,
+	NFS4CLNT_LEASE_CONFIRM,
+	NFS4CLNT_SERVER_SCOPE_MISMATCH,
+	NFS4CLNT_PURGE_STATE,
+	NFS4CLNT_BIND_CONN_TO_SESSION,
+	NFS4CLNT_MOVED,
+	NFS4CLNT_LEASE_MOVED,
+};
+
+#define NFS4_RENEW_TIMEOUT		0x01
+#define NFS4_RENEW_DELEGATION_CB	0x02
+
+struct nfs_seqid_counter;
+struct nfs4_minor_version_ops {
+	u32	minor_version;
+	unsigned init_caps;
+
+	int	(*init_client)(struct nfs_client *);
+	void	(*shutdown_client)(struct nfs_client *);
+	bool	(*match_stateid)(const nfs4_stateid *,
+			const nfs4_stateid *);
+	int	(*find_root_sec)(struct nfs_server *, struct nfs_fh *,
+			struct nfs_fsinfo *);
+	void	(*free_lock_state)(struct nfs_server *,
+			struct nfs4_lock_state *);
+	struct nfs_seqid *
+		(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+	const struct rpc_call_ops *call_sync_ops;
+	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
+	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
+	const struct nfs4_state_maintenance_ops *state_renewal_ops;
+	const struct nfs4_mig_recovery_ops *mig_recovery_ops;
+};
+
+#define NFS_SEQID_CONFIRMED 1
+struct nfs_seqid_counter {
+	ktime_t create_time;
+	int owner_id;
+	int flags;
+	u32 counter;
+	spinlock_t lock;		/* Protects the list */
+	struct list_head list;		/* Defines sequence of RPC calls */
+	struct rpc_wait_queue	wait;	/* RPC call delay queue */
+};
+
+struct nfs_seqid {
+	struct nfs_seqid_counter *sequence;
+	struct list_head list;
+	struct rpc_task *task;
+};
+
+static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status)
+{
+	if (seqid_mutating_err(-status))
+		seqid->flags |= NFS_SEQID_CONFIRMED;
+}
+
+/*
+ * NFS4 state_owners and lock_owners are simply labels for ordered
+ * sequences of RPC calls. Their sole purpose is to provide once-only
+ * semantics by allowing the server to identify replayed requests.
+ */
+struct nfs4_state_owner {
+	struct nfs_server    *so_server;
+	struct list_head     so_lru;
+	unsigned long        so_expires;
+	struct rb_node	     so_server_node;
+
+	struct rpc_cred	     *so_cred;	 /* Associated cred */
+
+	spinlock_t	     so_lock;
+	atomic_t	     so_count;
+	unsigned long	     so_flags;
+	struct list_head     so_states;
+	struct nfs_seqid_counter so_seqid;
+	seqcount_t	     so_reclaim_seqcount;
+	struct mutex	     so_delegreturn_mutex;
+};
+
+enum {
+	NFS_OWNER_RECLAIM_REBOOT,
+	NFS_OWNER_RECLAIM_NOGRACE
+};
+
+#define NFS_LOCK_NEW		0
+#define NFS_LOCK_RECLAIM	1
+#define NFS_LOCK_EXPIRED	2
+
+/*
+ * struct nfs4_state maintains the client-side state for a given
+ * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
+ *
+ * OPEN:
+ * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server,
+ * we need to know how many files are open for reading or writing on a
+ * given inode. This information too is stored here.
+ *
+ * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
+ */
+
+struct nfs4_lock_state {
+	struct list_head	ls_locks;	/* Other lock stateids */
+	struct nfs4_state *	ls_state;	/* Pointer to open state */
+#define NFS_LOCK_INITIALIZED 0
+#define NFS_LOCK_LOST        1
+	unsigned long		ls_flags;
+	struct nfs_seqid_counter	ls_seqid;
+	nfs4_stateid		ls_stateid;
+	atomic_t		ls_count;
+	fl_owner_t		ls_owner;
+};
+
+/* bits for nfs4_state->flags */
+enum {
+	LK_STATE_IN_USE,
+	NFS_DELEGATED_STATE,		/* Current stateid is delegation */
+	NFS_OPEN_STATE,			/* OPEN stateid is set */
+	NFS_O_RDONLY_STATE,		/* OPEN stateid has read-only state */
+	NFS_O_WRONLY_STATE,		/* OPEN stateid has write-only state */
+	NFS_O_RDWR_STATE,		/* OPEN stateid has read/write state */
+	NFS_STATE_RECLAIM_REBOOT,	/* OPEN stateid server rebooted */
+	NFS_STATE_RECLAIM_NOGRACE,	/* OPEN stateid needs to recover state */
+	NFS_STATE_POSIX_LOCKS,		/* Posix locks are supported */
+	NFS_STATE_RECOVERY_FAILED,	/* OPEN stateid state recovery failed */
+};
+
+struct nfs4_state {
+	struct list_head open_states;	/* List of states for the same state_owner */
+	struct list_head inode_states;	/* List of states for the same inode */
+	struct list_head lock_states;	/* List of subservient lock stateids */
+
+	struct nfs4_state_owner *owner;	/* Pointer to the open owner */
+	struct inode *inode;		/* Pointer to the inode */
+
+	unsigned long flags;		/* Do we hold any locks? */
+	spinlock_t state_lock;		/* Protects the lock_states list */
+
+	seqlock_t seqlock;		/* Protects the stateid/open_stateid */
+	nfs4_stateid stateid;		/* Current stateid: may be delegation */
+	nfs4_stateid open_stateid;	/* OPEN stateid */
+
+	/* The following 3 fields are protected by owner->so_lock */
+	unsigned int n_rdonly;		/* Number of read-only references */
+	unsigned int n_wronly;		/* Number of write-only references */
+	unsigned int n_rdwr;		/* Number of read/write references */
+	fmode_t state;			/* State on the server (R,W, or RW) */
+	atomic_t count;
+};
+
+
+struct nfs4_exception {
+	long timeout;
+	int retry;
+	struct nfs4_state *state;
+	struct inode *inode;
+};
+
+struct nfs4_state_recovery_ops {
+	int owner_flag_bit;
+	int state_flag_bit;
+	int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
+	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
+	int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
+	int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
+	int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
+		struct rpc_cred *);
+};
+
+struct nfs4_state_maintenance_ops {
+	int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
+	struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
+	int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
+};
+
+struct nfs4_mig_recovery_ops {
+	int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
+		struct page *, struct rpc_cred *);
+	int (*fsid_present)(struct inode *, struct rpc_cred *);
+};
+
+extern const struct dentry_operations nfs4_dentry_operations;
+
+/* dir.c */
+int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
+		    unsigned, umode_t, int *);
+
+/* super.c */
+extern struct file_system_type nfs4_fs_type;
+
+/* nfs4namespace.c */
+struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *);
+struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
+			       struct nfs_fh *, struct nfs_fattr *);
+int nfs4_replace_transport(struct nfs_server *server,
+				const struct nfs4_fs_locations *locations);
+
+/* nfs4proc.c */
+extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
+extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
+			  struct rpc_message *, struct nfs4_sequence_args *,
+			  struct nfs4_sequence_res *, int);
+extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
+extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
+extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
+extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);
+extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
+extern int nfs4_destroy_clientid(struct nfs_client *clp);
+extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
+extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
+extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
+				  struct nfs4_fs_locations *, struct page *);
+extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
+		struct page *page, struct rpc_cred *);
+extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);
+extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
+			    struct nfs_fh *, struct nfs_fattr *);
+extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
+extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+		const struct nfs_open_context *ctx,
+		const struct nfs_lock_context *l_ctx,
+		fmode_t fmode);
+
+#if defined(CONFIG_NFS_V4_1)
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
+{
+	return server->nfs_client->cl_session;
+}
+
+extern int nfs41_setup_sequence(struct nfs4_session *session,
+		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+		struct rpc_task *task);
+extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
+extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
+extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
+		struct nfs_fsinfo *fsinfo);
+extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
+				  bool sync);
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+	return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
+		EXCHGID4_FLAG_USE_PNFS_DS;
+}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+	return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
+}
+
+static inline bool
+_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
+		    struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+	struct rpc_cred *newcred = NULL;
+	rpc_authflavor_t flavor;
+
+	if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
+		spin_lock(&clp->cl_lock);
+		if (clp->cl_machine_cred != NULL)
+			/* don't call get_rpccred on the machine cred -
+			 * a reference will be held for life of clp */
+			newcred = clp->cl_machine_cred;
+		spin_unlock(&clp->cl_lock);
+		msg->rpc_cred = newcred;
+
+		flavor = clp->cl_rpcclient->cl_auth->au_flavor;
+		WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I &&
+			     flavor != RPC_AUTH_GSS_KRB5P);
+		*clntp = clp->cl_rpcclient;
+
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Function responsible for determining if an rpc_message should use the
+ * machine cred under SP4_MACH_CRED and if so switching the credential and
+ * authflavor (using the nfs_client's rpc_clnt which will be krb5i/p).
+ * Should be called before rpc_call_sync/rpc_call_async.
+ */
+static inline void
+nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
+		   struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+	_nfs4_state_protect(clp, sp4_mode, clntp, msg);
+}
+
+/*
+ * Special wrapper to nfs4_state_protect for write.
+ * If WRITE can use machine cred but COMMIT cannot, make sure all writes
+ * that use machine cred use NFS_FILE_SYNC.
+ */
+static inline void
+nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
+			 struct rpc_message *msg, struct nfs_pgio_header *hdr)
+{
+	if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
+	    !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
+		hdr->args.stable = NFS_FILE_SYNC;
+}
+#else /* CONFIG_NFS_v4_1 */
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
+{
+	return NULL;
+}
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+	return false;
+}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+	return false;
+}
+
+static inline void
+nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
+		   struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+}
+
+static inline void
+nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
+			 struct rpc_message *msg, struct nfs_pgio_header *hdr)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
+
+extern const u32 nfs4_fattr_bitmap[3];
+extern const u32 nfs4_statfs_bitmap[3];
+extern const u32 nfs4_pathconf_bitmap[3];
+extern const u32 nfs4_fsinfo_bitmap[3];
+extern const u32 nfs4_fs_locations_bitmap[3];
+
+void nfs40_shutdown_client(struct nfs_client *);
+void nfs41_shutdown_client(struct nfs_client *);
+int nfs40_init_client(struct nfs_client *);
+int nfs41_init_client(struct nfs_client *);
+void nfs4_free_client(struct nfs_client *);
+
+struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *);
+
+/* nfs4renewd.c */
+extern void nfs4_schedule_state_renewal(struct nfs_client *);
+extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
+extern void nfs4_kill_renewd(struct nfs_client *);
+extern void nfs4_renew_state(struct work_struct *);
+
+/* nfs4state.c */
+struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
+struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
+int nfs4_discover_server_trunking(struct nfs_client *clp,
+			struct nfs_client **);
+int nfs40_discover_server_trunking(struct nfs_client *clp,
+			struct nfs_client **, struct rpc_cred *);
+#if defined(CONFIG_NFS_V4_1)
+int nfs41_discover_server_trunking(struct nfs_client *clp,
+			struct nfs_client **, struct rpc_cred *);
+extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
+extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
+extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
+
+#else
+static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);
+extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+extern void nfs4_purge_state_owners(struct nfs_server *);
+extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
+extern void nfs4_put_open_state(struct nfs4_state *);
+extern void nfs4_close_state(struct nfs4_state *, fmode_t);
+extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
+extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
+extern void nfs_inode_find_state_and_recover(struct inode *inode,
+		const nfs4_stateid *stateid);
+extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);
+extern void nfs4_schedule_lease_recovery(struct nfs_client *);
+extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
+extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
+extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
+extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
+extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_server_scope(struct nfs_client *,
+				      struct nfs41_server_scope **);
+extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
+extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
+extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
+		fmode_t, const struct nfs_lockowner *);
+
+extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
+extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
+extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_release_seqid(struct nfs_seqid *seqid);
+extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
+				struct nfs4_sequence_args *args,
+				struct nfs4_sequence_res *res,
+				struct rpc_task *task);
+extern int nfs4_sequence_done(struct rpc_task *task,
+			      struct nfs4_sequence_res *res);
+
+extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
+
+extern const nfs4_stateid zero_stateid;
+
+/* nfs4super.c */
+struct nfs_mount_info;
+extern struct nfs_subversion nfs_v4;
+struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *);
+extern bool nfs4_disable_idmapping;
+extern unsigned short max_session_slots;
+extern unsigned short send_implementation_id;
+extern bool recover_lost_locks;
+
+#define NFS4_CLIENT_ID_UNIQ_LEN		(64)
+extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
+
+/* nfs4sysctl.c */
+#ifdef CONFIG_SYSCTL
+int nfs4_register_sysctl(void);
+void nfs4_unregister_sysctl(void);
+#else
+static inline int nfs4_register_sysctl(void)
+{
+	return 0;
+}
+
+static inline void nfs4_unregister_sysctl(void)
+{
+}
+#endif
+
+/* nfs4xdr.c */
+extern struct rpc_procinfo nfs4_procedures[];
+
+struct nfs4_mount_data;
+
+/* callback_xdr.c */
+extern struct svc_version nfs4_callback_version1;
+extern struct svc_version nfs4_callback_version4;
+
+static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+}
+
+static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+	return memcmp(dst, src, sizeof(*dst)) == 0;
+}
+
+static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+	return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0;
+}
+
+static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+	return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
+}
+
+static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
+{
+	return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
+}
+
+#else
+
+#define nfs4_close_state(a, b) do { } while (0)
+#define nfs4_close_sync(a, b) do { } while (0)
+#define nfs4_state_protect(a, b, c, d) do { } while (0)
+#define nfs4_state_protect_write(a, b, c, d) do { } while (0)
+
+#endif /* CONFIG_NFS_V4 */
+#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/kernel/fs/nfs/nfs4client.c b/kernel/fs/nfs/nfs4client.c
new file mode 100644
index 000000000..e42be52a8
--- /dev/null
+++ b/kernel/fs/nfs/nfs4client.c
@@ -0,0 +1,1232 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include "internal.h"
+#include "callback.h"
+#include "delegation.h"
+#include "nfs4session.h"
+#include "nfs4idmap.h"
+#include "pnfs.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY		NFSDBG_CLIENT
+
+/*
+ * Get a unique NFSv4.0 callback identifier which will be used
+ * by the V4.0 callback service to lookup the nfs_client struct
+ */
+static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
+{
+	int ret = 0;
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+	if (clp->rpc_ops->version != 4 || minorversion != 0)
+		return ret;
+	idr_preload(GFP_KERNEL);
+	spin_lock(&nn->nfs_client_lock);
+	ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT);
+	if (ret >= 0)
+		clp->cl_cb_ident = ret;
+	spin_unlock(&nn->nfs_client_lock);
+	idr_preload_end();
+	return ret < 0 ? ret : 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/**
+ * Per auth flavor data server rpc clients
+ */
+struct nfs4_ds_server {
+	struct list_head	list;   /* ds_clp->cl_ds_clients */
+	struct rpc_clnt		*rpc_clnt;
+};
+
+/**
+ * Common lookup case for DS I/O
+ */
+static struct nfs4_ds_server *
+nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
+{
+	struct nfs4_ds_server *dss;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(dss, &ds_clp->cl_ds_clients, list) {
+		if (dss->rpc_clnt->cl_auth->au_flavor != flavor)
+			continue;
+		goto out;
+	}
+	dss = NULL;
+out:
+	rcu_read_unlock();
+	return dss;
+}
+
+static struct nfs4_ds_server *
+nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor,
+			   struct nfs4_ds_server *new)
+{
+	struct nfs4_ds_server *dss;
+
+	spin_lock(&ds_clp->cl_lock);
+	list_for_each_entry(dss, &ds_clp->cl_ds_clients, list) {
+		if (dss->rpc_clnt->cl_auth->au_flavor != flavor)
+			continue;
+		goto out;
+	}
+	if (new)
+		list_add_rcu(&new->list, &ds_clp->cl_ds_clients);
+	dss = new;
+out:
+	spin_unlock(&ds_clp->cl_lock); /* need some lock to protect list */
+	return dss;
+}
+
+static struct nfs4_ds_server *
+nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
+{
+	struct nfs4_ds_server *dss;
+
+	dss = kmalloc(sizeof(*dss), GFP_NOFS);
+	if (dss == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	dss->rpc_clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor);
+	if (IS_ERR(dss->rpc_clnt)) {
+		int err = PTR_ERR(dss->rpc_clnt);
+		kfree (dss);
+		return ERR_PTR(err);
+	}
+	INIT_LIST_HEAD(&dss->list);
+
+	return dss;
+}
+
+static void
+nfs4_free_ds_server(struct nfs4_ds_server *dss)
+{
+	rpc_release_client(dss->rpc_clnt);
+	kfree(dss);
+}
+
+/**
+* Find or create a DS rpc client with th MDS server rpc client auth flavor
+* in the nfs_client cl_ds_clients list.
+*/
+struct rpc_clnt *
+nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
+{
+	struct nfs4_ds_server *dss, *new;
+	rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor;
+
+	dss = nfs4_find_ds_client(ds_clp, flavor);
+	if (dss != NULL)
+		goto out;
+	new = nfs4_alloc_ds_server(ds_clp, flavor);
+	if (IS_ERR(new))
+		return ERR_CAST(new);
+	dss = nfs4_add_ds_client(ds_clp, flavor, new);
+	if (dss != new)
+		nfs4_free_ds_server(new);
+out:
+	return dss->rpc_clnt;
+}
+EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client);
+
+static void
+nfs4_shutdown_ds_clients(struct nfs_client *clp)
+{
+	struct nfs4_ds_server *dss;
+	LIST_HEAD(shutdown_list);
+
+	while (!list_empty(&clp->cl_ds_clients)) {
+		dss = list_entry(clp->cl_ds_clients.next,
+					struct nfs4_ds_server, list);
+		list_del(&dss->list);
+		rpc_shutdown_client(dss->rpc_clnt);
+		kfree (dss);
+	}
+}
+
+void nfs41_shutdown_client(struct nfs_client *clp)
+{
+	if (nfs4_has_session(clp)) {
+		nfs4_shutdown_ds_clients(clp);
+		nfs4_destroy_session(clp->cl_session);
+		nfs4_destroy_clientid(clp);
+	}
+
+}
+#endif	/* CONFIG_NFS_V4_1 */
+
+void nfs40_shutdown_client(struct nfs_client *clp)
+{
+	if (clp->cl_slot_tbl) {
+		nfs4_shutdown_slot_table(clp->cl_slot_tbl);
+		kfree(clp->cl_slot_tbl);
+	}
+}
+
+struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
+{
+	int err;
+	struct nfs_client *clp = nfs_alloc_client(cl_init);
+	if (IS_ERR(clp))
+		return clp;
+
+	err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
+	if (err)
+		goto error;
+
+	if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	spin_lock_init(&clp->cl_lock);
+	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
+	INIT_LIST_HEAD(&clp->cl_ds_clients);
+	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
+	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
+	clp->cl_minorversion = cl_init->minorversion;
+	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
+	clp->cl_mig_gen = 1;
+	return clp;
+
+error:
+	nfs_free_client(clp);
+	return ERR_PTR(err);
+}
+
+/*
+ * Destroy the NFS4 callback service
+ */
+static void nfs4_destroy_callback(struct nfs_client *clp)
+{
+	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
+		nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net);
+}
+
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
+		nfs4_kill_renewd(clp);
+	clp->cl_mvops->shutdown_client(clp);
+	nfs4_destroy_callback(clp);
+	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
+		nfs_idmap_delete(clp);
+
+	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
+	kfree(clp->cl_serverowner);
+	kfree(clp->cl_serverscope);
+	kfree(clp->cl_implid);
+	kfree(clp->cl_owner_id);
+}
+
+void nfs4_free_client(struct nfs_client *clp)
+{
+	nfs4_shutdown_client(clp);
+	nfs_free_client(clp);
+}
+
+/*
+ * Initialize the NFS4 callback service
+ */
+static int nfs4_init_callback(struct nfs_client *clp)
+{
+	struct rpc_xprt *xprt;
+	int error;
+
+	xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt);
+
+	if (nfs4_has_session(clp)) {
+		error = xprt_setup_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
+		if (error < 0)
+			return error;
+	}
+
+	error = nfs_callback_up(clp->cl_mvops->minor_version, xprt);
+	if (error < 0) {
+		dprintk("%s: failed to start callback. Error = %d\n",
+			__func__, error);
+		return error;
+	}
+	__set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
+
+	return 0;
+}
+
+/**
+ * nfs40_init_client - nfs_client initialization tasks for NFSv4.0
+ * @clp - nfs_client to initialize
+ *
+ * Returns zero on success, or a negative errno if some error occurred.
+ */
+int nfs40_init_client(struct nfs_client *clp)
+{
+	struct nfs4_slot_table *tbl;
+	int ret;
+
+	tbl = kzalloc(sizeof(*tbl), GFP_NOFS);
+	if (tbl == NULL)
+		return -ENOMEM;
+
+	ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
+					"NFSv4.0 transport Slot table");
+	if (ret) {
+		kfree(tbl);
+		return ret;
+	}
+
+	clp->cl_slot_tbl = tbl;
+	return 0;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/**
+ * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
+ * @clp - nfs_client to initialize
+ *
+ * Returns zero on success, or a negative errno if some error occurred.
+ */
+int nfs41_init_client(struct nfs_client *clp)
+{
+	struct nfs4_session *session = NULL;
+
+	/*
+	 * Create the session and mark it expired.
+	 * When a SEQUENCE operation encounters the expired session
+	 * it will do session recovery to initialize it.
+	 */
+	session = nfs4_alloc_session(clp);
+	if (!session)
+		return -ENOMEM;
+
+	clp->cl_session = session;
+
+	/*
+	 * The create session reply races with the server back
+	 * channel probe. Mark the client NFS_CS_SESSION_INITING
+	 * so that the client back channel can find the
+	 * nfs_client struct
+	 */
+	nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
+	return 0;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/*
+ * Initialize the minor version specific parts of an NFS4 client record
+ */
+static int nfs4_init_client_minor_version(struct nfs_client *clp)
+{
+	int ret;
+
+	ret = clp->cl_mvops->init_client(clp);
+	if (ret)
+		return ret;
+	return nfs4_init_callback(clp);
+}
+
+/**
+ * nfs4_init_client - Initialise an NFS4 client record
+ *
+ * @clp: nfs_client to initialise
+ * @timeparms: timeout parameters for underlying RPC transport
+ * @ip_addr: callback IP address in presentation format
+ * @authflavor: authentication flavor for underlying RPC transport
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
+ */
+struct nfs_client *nfs4_init_client(struct nfs_client *clp,
+				    const struct rpc_timeout *timeparms,
+				    const char *ip_addr)
+{
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct nfs_client *old;
+	int error;
+
+	if (clp->cl_cons_state == NFS_CS_READY) {
+		/* the client is initialised already */
+		dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp);
+		return clp;
+	}
+
+	/* Check NFS protocol revision and initialize RPC op vector */
+	clp->rpc_ops = &nfs_v4_clientops;
+
+	if (clp->cl_minorversion != 0)
+		__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
+	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+	__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
+
+	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
+	if (error == -EINVAL)
+		error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+	if (error < 0)
+		goto error;
+
+	/* If no clientaddr= option was specified, find a usable cb address */
+	if (ip_addr == NULL) {
+		struct sockaddr_storage cb_addr;
+		struct sockaddr *sap = (struct sockaddr *)&cb_addr;
+
+		error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
+		if (error < 0)
+			goto error;
+		error = rpc_ntop(sap, buf, sizeof(buf));
+		if (error < 0)
+			goto error;
+		ip_addr = (const char *)buf;
+	}
+	strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+
+	error = nfs_idmap_new(clp);
+	if (error < 0) {
+		dprintk("%s: failed to create idmapper. Error = %d\n",
+			__func__, error);
+		goto error;
+	}
+	__set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
+
+	error = nfs4_init_client_minor_version(clp);
+	if (error < 0)
+		goto error;
+
+	if (!nfs4_has_session(clp))
+		nfs_mark_client_ready(clp, NFS_CS_READY);
+
+	error = nfs4_discover_server_trunking(clp, &old);
+	if (error < 0)
+		goto error;
+
+	if (clp != old)
+		clp->cl_preserve_clid = true;
+	nfs_put_client(clp);
+	return old;
+
+error:
+	nfs_mark_client_ready(clp, error);
+	nfs_put_client(clp);
+	dprintk("<-- nfs4_init_client() = xerror %d\n", error);
+	return ERR_PTR(error);
+}
+
+/*
+ * SETCLIENTID just did a callback update with the callback ident in
+ * "drop," but server trunking discovery claims "drop" and "keep" are
+ * actually the same server.  Swap the callback IDs so that "keep"
+ * will continue to use the callback ident the server now knows about,
+ * and so that "keep"'s original callback ident is destroyed when
+ * "drop" is freed.
+ */
+static void nfs4_swap_callback_idents(struct nfs_client *keep,
+				      struct nfs_client *drop)
+{
+	struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id);
+	unsigned int save = keep->cl_cb_ident;
+
+	if (keep->cl_cb_ident == drop->cl_cb_ident)
+		return;
+
+	dprintk("%s: keeping callback ident %u and dropping ident %u\n",
+		__func__, keep->cl_cb_ident, drop->cl_cb_ident);
+
+	spin_lock(&nn->nfs_client_lock);
+
+	idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident);
+	keep->cl_cb_ident = drop->cl_cb_ident;
+
+	idr_replace(&nn->cb_ident_idr, drop, save);
+	drop->cl_cb_ident = save;
+
+	spin_unlock(&nn->nfs_client_lock);
+}
+
+static bool nfs4_match_client_owner_id(const struct nfs_client *clp1,
+		const struct nfs_client *clp2)
+{
+	if (clp1->cl_owner_id == NULL || clp2->cl_owner_id == NULL)
+		return true;
+	return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0;
+}
+
+/**
+ * nfs40_walk_client_list - Find server that recognizes a client ID
+ *
+ * @new: nfs_client with client ID to test
+ * @result: OUT: found nfs_client, or new
+ * @cred: credential to use for trunking test
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in "result."
+ *
+ * NB: nfs40_walk_client_list() relies on the new nfs_client being
+ *     the last nfs_client on the list.
+ */
+int nfs40_walk_client_list(struct nfs_client *new,
+			   struct nfs_client **result,
+			   struct rpc_cred *cred)
+{
+	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
+	struct nfs_client *pos, *prev = NULL;
+	struct nfs4_setclientid_res clid = {
+		.clientid	= new->cl_clientid,
+		.confirm	= new->cl_confirm,
+	};
+	int status = -NFS4ERR_STALE_CLIENTID;
+
+	spin_lock(&nn->nfs_client_lock);
+	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+		if (pos->rpc_ops != new->rpc_ops)
+			continue;
+
+		if (pos->cl_minorversion != new->cl_minorversion)
+			continue;
+
+		/* If "pos" isn't marked ready, we can't trust the
+		 * remaining fields in "pos" */
+		if (pos->cl_cons_state > NFS_CS_READY) {
+			atomic_inc(&pos->cl_count);
+			spin_unlock(&nn->nfs_client_lock);
+
+			nfs_put_client(prev);
+			prev = pos;
+
+			status = nfs_wait_client_init_complete(pos);
+			if (status < 0)
+				goto out;
+			status = -NFS4ERR_STALE_CLIENTID;
+			spin_lock(&nn->nfs_client_lock);
+		}
+		if (pos->cl_cons_state != NFS_CS_READY)
+			continue;
+
+		if (pos->cl_clientid != new->cl_clientid)
+			continue;
+
+		if (!nfs4_match_client_owner_id(pos, new))
+			continue;
+
+		atomic_inc(&pos->cl_count);
+		spin_unlock(&nn->nfs_client_lock);
+
+		nfs_put_client(prev);
+		prev = pos;
+
+		status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
+		switch (status) {
+		case -NFS4ERR_STALE_CLIENTID:
+			break;
+		case 0:
+			nfs4_swap_callback_idents(pos, new);
+
+			prev = NULL;
+			*result = pos;
+			dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
+				__func__, pos, atomic_read(&pos->cl_count));
+			goto out;
+		case -ERESTARTSYS:
+		case -ETIMEDOUT:
+			/* The callback path may have been inadvertently
+			 * changed. Schedule recovery!
+			 */
+			nfs4_schedule_path_down_recovery(pos);
+		default:
+			goto out;
+		}
+
+		spin_lock(&nn->nfs_client_lock);
+	}
+	spin_unlock(&nn->nfs_client_lock);
+
+	/* No match found. The server lost our clientid */
+out:
+	nfs_put_client(prev);
+	dprintk("NFS: <-- %s status = %d\n", __func__, status);
+	return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Returns true if the client IDs match
+ */
+static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
+{
+	if (a->cl_clientid != b->cl_clientid) {
+		dprintk("NFS: --> %s client ID %llx does not match %llx\n",
+			__func__, a->cl_clientid, b->cl_clientid);
+		return false;
+	}
+	dprintk("NFS: --> %s client ID %llx matches %llx\n",
+		__func__, a->cl_clientid, b->cl_clientid);
+	return true;
+}
+
+/*
+ * Returns true if the server major ids match
+ */
+static bool
+nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
+{
+	struct nfs41_server_owner *o1 = a->cl_serverowner;
+	struct nfs41_server_owner *o2 = b->cl_serverowner;
+
+	if (o1->major_id_sz != o2->major_id_sz)
+		goto out_major_mismatch;
+	if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
+		goto out_major_mismatch;
+
+	dprintk("NFS: --> %s server owners match\n", __func__);
+	return true;
+
+out_major_mismatch:
+	dprintk("NFS: --> %s server owner major IDs do not match\n",
+		__func__);
+	return false;
+}
+
+/**
+ * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
+ *
+ * @new: nfs_client with client ID to test
+ * @result: OUT: found nfs_client, or new
+ * @cred: credential to use for trunking test
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in "result."
+ *
+ * NB: nfs41_walk_client_list() relies on the new nfs_client being
+ *     the last nfs_client on the list.
+ */
+int nfs41_walk_client_list(struct nfs_client *new,
+			   struct nfs_client **result,
+			   struct rpc_cred *cred)
+{
+	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
+	struct nfs_client *pos, *prev = NULL;
+	int status = -NFS4ERR_STALE_CLIENTID;
+
+	spin_lock(&nn->nfs_client_lock);
+	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+		if (pos == new)
+			goto found;
+
+		if (pos->rpc_ops != new->rpc_ops)
+			continue;
+
+		if (pos->cl_minorversion != new->cl_minorversion)
+			continue;
+
+		/* If "pos" isn't marked ready, we can't trust the
+		 * remaining fields in "pos", especially the client
+		 * ID and serverowner fields.  Wait for CREATE_SESSION
+		 * to finish. */
+		if (pos->cl_cons_state > NFS_CS_READY) {
+			atomic_inc(&pos->cl_count);
+			spin_unlock(&nn->nfs_client_lock);
+
+			nfs_put_client(prev);
+			prev = pos;
+
+			status = nfs_wait_client_init_complete(pos);
+			spin_lock(&nn->nfs_client_lock);
+			if (status < 0)
+				break;
+			status = -NFS4ERR_STALE_CLIENTID;
+		}
+		if (pos->cl_cons_state != NFS_CS_READY)
+			continue;
+
+		if (!nfs4_match_clientids(pos, new))
+			continue;
+
+		/*
+		 * Note that session trunking is just a special subcase of
+		 * client id trunking. In either case, we want to fall back
+		 * to using the existing nfs_client.
+		 */
+		if (!nfs4_check_clientid_trunking(pos, new))
+			continue;
+
+		/* Unlike NFSv4.0, we know that NFSv4.1 always uses the
+		 * uniform string, however someone might switch the
+		 * uniquifier string on us.
+		 */
+		if (!nfs4_match_client_owner_id(pos, new))
+			continue;
+found:
+		atomic_inc(&pos->cl_count);
+		*result = pos;
+		status = 0;
+		dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
+			__func__, pos, atomic_read(&pos->cl_count));
+		break;
+	}
+
+	/* No matching nfs_client found. */
+	spin_unlock(&nn->nfs_client_lock);
+	dprintk("NFS: <-- %s status = %d\n", __func__, status);
+	nfs_put_client(prev);
+	return status;
+}
+#endif	/* CONFIG_NFS_V4_1 */
+
+static void nfs4_destroy_server(struct nfs_server *server)
+{
+	nfs_server_return_all_delegations(server);
+	unset_pnfs_layoutdriver(server);
+	nfs4_purge_state_owners(server);
+}
+
+/*
+ * NFSv4.0 callback thread helper
+ *
+ * Find a client by callback identifier
+ */
+struct nfs_client *
+nfs4_find_client_ident(struct net *net, int cb_ident)
+{
+	struct nfs_client *clp;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	spin_lock(&nn->nfs_client_lock);
+	clp = idr_find(&nn->cb_ident_idr, cb_ident);
+	if (clp)
+		atomic_inc(&clp->cl_count);
+	spin_unlock(&nn->nfs_client_lock);
+	return clp;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/* Common match routine for v4.0 and v4.1 callback services */
+static bool nfs4_cb_match_client(const struct sockaddr *addr,
+		struct nfs_client *clp, u32 minorversion)
+{
+	struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+
+	/* Don't match clients that failed to initialise */
+	if (!(clp->cl_cons_state == NFS_CS_READY ||
+	    clp->cl_cons_state == NFS_CS_SESSION_INITING))
+		return false;
+
+	smp_rmb();
+
+	/* Match the version and minorversion */
+	if (clp->rpc_ops->version != 4 ||
+	    clp->cl_minorversion != minorversion)
+		return false;
+
+	/* Match only the IP address, not the port number */
+	if (!nfs_sockaddr_match_ipaddr(addr, clap))
+		return false;
+
+	return true;
+}
+
+/*
+ * NFSv4.1 callback thread helper
+ * For CB_COMPOUND calls, find a client by IP address, protocol version,
+ * minorversion, and sessionID
+ *
+ * Returns NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
+			   struct nfs4_sessionid *sid, u32 minorversion)
+{
+	struct nfs_client *clp;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	spin_lock(&nn->nfs_client_lock);
+	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
+		if (nfs4_cb_match_client(addr, clp, minorversion) == false)
+			continue;
+
+		if (!nfs4_has_session(clp))
+			continue;
+
+		/* Match sessionid*/
+		if (memcmp(clp->cl_session->sess_id.data,
+		    sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
+			continue;
+
+		atomic_inc(&clp->cl_count);
+		spin_unlock(&nn->nfs_client_lock);
+		return clp;
+	}
+	spin_unlock(&nn->nfs_client_lock);
+	return NULL;
+}
+
+#else /* CONFIG_NFS_V4_1 */
+
+struct nfs_client *
+nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
+			   struct nfs4_sessionid *sid, u32 minorversion)
+{
+	return NULL;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Set up an NFS4 client
+ */
+static int nfs4_set_client(struct nfs_server *server,
+		const char *hostname,
+		const struct sockaddr *addr,
+		const size_t addrlen,
+		const char *ip_addr,
+		rpc_authflavor_t authflavour,
+		int proto, const struct rpc_timeout *timeparms,
+		u32 minorversion, struct net *net)
+{
+	struct nfs_client_initdata cl_init = {
+		.hostname = hostname,
+		.addr = addr,
+		.addrlen = addrlen,
+		.nfs_mod = &nfs_v4,
+		.proto = proto,
+		.minorversion = minorversion,
+		.net = net,
+	};
+	struct nfs_client *clp;
+	int error;
+
+	dprintk("--> nfs4_set_client()\n");
+
+	if (server->flags & NFS_MOUNT_NORESVPORT)
+		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+	if (server->options & NFS_OPTION_MIGRATION)
+		set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
+
+	/* Allocate or find a client reference we can use */
+	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
+	if (IS_ERR(clp)) {
+		error = PTR_ERR(clp);
+		goto error;
+	}
+
+	/*
+	 * Query for the lease time on clientid setup or renewal
+	 *
+	 * Note that this will be set on nfs_clients that were created
+	 * only for the DS role and did not set this bit, but now will
+	 * serve a dual role.
+	 */
+	set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
+
+	server->nfs_client = clp;
+	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
+	return 0;
+error:
+	dprintk("<-- nfs4_set_client() = xerror %d\n", error);
+	return error;
+}
+
+/*
+ * Set up a pNFS Data Server client.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		u32 minor_version, rpc_authflavor_t au_flavor)
+{
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.nfs_mod = &nfs_v4,
+		.proto = ds_proto,
+		.minorversion = minor_version,
+		.net = mds_clp->cl_net,
+	};
+	struct rpc_timeout ds_timeout;
+	struct nfs_client *clp;
+	char buf[INET6_ADDRSTRLEN + 1];
+
+	if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = buf;
+
+	/*
+	 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
+	 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
+	 * (section 13.1 RFC 5661).
+	 */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+			     au_flavor);
+
+	dprintk("<-- %s %p\n", __func__, clp);
+	return clp;
+}
+EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
+
+/*
+ * Session has been established, and the client marked ready.
+ * Set the mount rsize and wsize with negotiated fore channel
+ * attributes which will be bound checked in nfs_server_set_fsinfo.
+ */
+static void nfs4_session_set_rwsize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_1
+	struct nfs4_session *sess;
+	u32 server_resp_sz;
+	u32 server_rqst_sz;
+
+	if (!nfs4_has_session(server->nfs_client))
+		return;
+	sess = server->nfs_client->cl_session;
+	server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead;
+	server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead;
+
+	if (server->rsize > server_resp_sz)
+		server->rsize = server_resp_sz;
+	if (server->wsize > server_rqst_sz)
+		server->wsize = server_rqst_sz;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+static int nfs4_server_common_setup(struct nfs_server *server,
+		struct nfs_fh *mntfh, bool auth_probe)
+{
+	struct nfs_fattr *fattr;
+	int error;
+
+	/* data servers support only a subset of NFSv4.1 */
+	if (is_ds_only_client(server->nfs_client))
+		return -EPROTONOSUPPORT;
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		return -ENOMEM;
+
+	/* We must ensure the session is initialised first */
+	error = nfs4_init_session(server->nfs_client);
+	if (error < 0)
+		goto out;
+
+	/* Set the basic capabilities */
+	server->caps |= server->nfs_client->cl_mvops->init_caps;
+	if (server->flags & NFS_MOUNT_NORDIRPLUS)
+			server->caps &= ~NFS_CAP_READDIRPLUS;
+	/*
+	 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+	 * authentication.
+	 */
+	if (nfs4_disable_idmapping &&
+			server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
+		server->caps |= NFS_CAP_UIDGID_NOMAP;
+
+
+	/* Probe the root fh to retrieve its FSID and filehandle */
+	error = nfs4_get_rootfh(server, mntfh, auth_probe);
+	if (error < 0)
+		goto out;
+
+	dprintk("Server FSID: %llx:%llx\n",
+			(unsigned long long) server->fsid.major,
+			(unsigned long long) server->fsid.minor);
+	nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
+
+	nfs4_session_set_rwsize(server);
+
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
+	if (error < 0)
+		goto out;
+
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
+	nfs_server_insert_lists(server);
+	server->mount_time = jiffies;
+	server->destroy = nfs4_destroy_server;
+out:
+	nfs_free_fattr(fattr);
+	return error;
+}
+
+/*
+ * Create a version 4 volume record
+ */
+static int nfs4_init_server(struct nfs_server *server,
+		struct nfs_parsed_mount_data *data)
+{
+	struct rpc_timeout timeparms;
+	int error;
+
+	dprintk("--> nfs4_init_server()\n");
+
+	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+			data->timeo, data->retrans);
+
+	/* Initialise the client representation from the mount data */
+	server->flags = data->flags;
+	server->options = data->options;
+	server->auth_info = data->auth_info;
+
+	/* Use the first specified auth flavor. If this flavor isn't
+	 * allowed by the server, use the SECINFO path to try the
+	 * other specified flavors */
+	if (data->auth_info.flavor_len >= 1)
+		data->selected_flavor = data->auth_info.flavors[0];
+	else
+		data->selected_flavor = RPC_AUTH_UNIX;
+
+	/* Get a client record */
+	error = nfs4_set_client(server,
+			data->nfs_server.hostname,
+			(const struct sockaddr *)&data->nfs_server.address,
+			data->nfs_server.addrlen,
+			data->client_address,
+			data->selected_flavor,
+			data->nfs_server.protocol,
+			&timeparms,
+			data->minorversion,
+			data->net);
+	if (error < 0)
+		goto error;
+
+	if (data->rsize)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+
+	server->acregmin = data->acregmin * HZ;
+	server->acregmax = data->acregmax * HZ;
+	server->acdirmin = data->acdirmin * HZ;
+	server->acdirmax = data->acdirmax * HZ;
+
+	server->port = data->nfs_server.port;
+
+	error = nfs_init_server_rpcclient(server, &timeparms,
+					  data->selected_flavor);
+
+error:
+	/* Done */
+	dprintk("<-- nfs4_init_server() = %d\n", error);
+	return error;
+}
+
+/*
+ * Create a version 4 volume record
+ * - keyed on server and FSID
+ */
+/*struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
+				      struct nfs_fh *mntfh)*/
+struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
+				      struct nfs_subversion *nfs_mod)
+{
+	struct nfs_server *server;
+	bool auth_probe;
+	int error;
+
+	dprintk("--> nfs4_create_server()\n");
+
+	server = nfs_alloc_server();
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+
+	auth_probe = mount_info->parsed->auth_info.flavor_len < 1;
+
+	/* set up the general RPC client */
+	error = nfs4_init_server(server, mount_info->parsed);
+	if (error < 0)
+		goto error;
+
+	error = nfs4_server_common_setup(server, mount_info->mntfh, auth_probe);
+	if (error < 0)
+		goto error;
+
+	dprintk("<-- nfs4_create_server() = %p\n", server);
+	return server;
+
+error:
+	nfs_free_server(server);
+	dprintk("<-- nfs4_create_server() = error %d\n", error);
+	return ERR_PTR(error);
+}
+
+/*
+ * Create an NFS4 referral server record
+ */
+struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
+					       struct nfs_fh *mntfh)
+{
+	struct nfs_client *parent_client;
+	struct nfs_server *server, *parent_server;
+	bool auth_probe;
+	int error;
+
+	dprintk("--> nfs4_create_referral_server()\n");
+
+	server = nfs_alloc_server();
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+
+	parent_server = NFS_SB(data->sb);
+	parent_client = parent_server->nfs_client;
+
+	/* Initialise the client representation from the parent server */
+	nfs_server_copy_userdata(server, parent_server);
+
+	/* Get a client representation.
+	 * Note: NFSv4 always uses TCP, */
+	error = nfs4_set_client(server, data->hostname,
+				data->addr,
+				data->addrlen,
+				parent_client->cl_ipaddr,
+				data->authflavor,
+				rpc_protocol(parent_server->client),
+				parent_server->client->cl_timeout,
+				parent_client->cl_mvops->minor_version,
+				parent_client->cl_net);
+	if (error < 0)
+		goto error;
+
+	error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
+	if (error < 0)
+		goto error;
+
+	auth_probe = parent_server->auth_info.flavor_len < 1;
+
+	error = nfs4_server_common_setup(server, mntfh, auth_probe);
+	if (error < 0)
+		goto error;
+
+	dprintk("<-- nfs_create_referral_server() = %p\n", server);
+	return server;
+
+error:
+	nfs_free_server(server);
+	dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
+	return ERR_PTR(error);
+}
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Returns zero if probe succeeded and retrieved FSID matches the FSID
+ * we have cached.
+ */
+static int nfs_probe_destination(struct nfs_server *server)
+{
+	struct inode *inode = d_inode(server->super->s_root);
+	struct nfs_fattr *fattr;
+	int error;
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		return -ENOMEM;
+
+	/* Sanity: the probe won't work if the destination server
+	 * does not recognize the migrated FH. */
+	error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr);
+
+	nfs_free_fattr(fattr);
+	return error;
+}
+
+/**
+ * nfs4_update_server - Move an nfs_server to a different nfs_client
+ *
+ * @server: represents FSID to be moved
+ * @hostname: new end-point's hostname
+ * @sap: new end-point's socket address
+ * @salen: size of "sap"
+ * @net: net namespace
+ *
+ * The nfs_server must be quiescent before this function is invoked.
+ * Either its session is drained (NFSv4.1+), or its transport is
+ * plugged and drained (NFSv4.0).
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_update_server(struct nfs_server *server, const char *hostname,
+		       struct sockaddr *sap, size_t salen, struct net *net)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct xprt_create xargs = {
+		.ident		= clp->cl_proto,
+		.net		= net,
+		.dstaddr	= sap,
+		.addrlen	= salen,
+		.servername	= hostname,
+	};
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct sockaddr_storage address;
+	struct sockaddr *localaddr = (struct sockaddr *)&address;
+	int error;
+
+	dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			hostname);
+
+	error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
+	if (error != 0) {
+		dprintk("<-- %s(): rpc_switch_client_transport returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	error = rpc_localaddr(clnt, localaddr, sizeof(address));
+	if (error != 0) {
+		dprintk("<-- %s(): rpc_localaddr returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	error = -EAFNOSUPPORT;
+	if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) {
+		dprintk("<-- %s(): rpc_ntop returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	nfs_server_remove_lists(server);
+	error = nfs4_set_client(server, hostname, sap, salen, buf,
+				clp->cl_rpcclient->cl_auth->au_flavor,
+				clp->cl_proto, clnt->cl_timeout,
+				clp->cl_minorversion, net);
+	nfs_put_client(clp);
+	if (error != 0) {
+		nfs_server_insert_lists(server);
+		dprintk("<-- %s(): nfs4_set_client returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	if (server->nfs_client->cl_hostname == NULL)
+		server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+	nfs_server_insert_lists(server);
+
+	error = nfs_probe_destination(server);
+	if (error < 0)
+		goto out;
+
+	dprintk("<-- %s() succeeded\n", __func__);
+
+out:
+	return error;
+}
diff --git a/kernel/fs/nfs/nfs4file.c b/kernel/fs/nfs/nfs4file.c
new file mode 100644
index 000000000..f58c17b3b
--- /dev/null
+++ b/kernel/fs/nfs/nfs4file.c
@@ -0,0 +1,189 @@
+/*
+ *  linux/fs/nfs/file.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ */
+#include <linux/fs.h>
+#include <linux/falloc.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif
+
+#define NFSDBG_FACILITY		NFSDBG_FILE
+
+static int
+nfs4_file_open(struct inode *inode, struct file *filp)
+{
+	struct nfs_open_context *ctx;
+	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *parent = NULL;
+	struct inode *dir;
+	unsigned openflags = filp->f_flags;
+	struct iattr attr;
+	int opened = 0;
+	int err;
+
+	/*
+	 * If no cached dentry exists or if it's negative, NFSv4 handled the
+	 * opens in ->lookup() or ->create().
+	 *
+	 * We only get this far for a cached positive dentry.  We skipped
+	 * revalidation, so handle it here by dropping the dentry and returning
+	 * -EOPENSTALE.  The VFS will retry the lookup/create/open.
+	 */
+
+	dprintk("NFS: open file(%pd2)\n", dentry);
+
+	if ((openflags & O_ACCMODE) == 3)
+		openflags--;
+
+	/* We can't create new files here */
+	openflags &= ~(O_CREAT|O_EXCL);
+
+	parent = dget_parent(dentry);
+	dir = d_inode(parent);
+
+	ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+	err = PTR_ERR(ctx);
+	if (IS_ERR(ctx))
+		goto out;
+
+	attr.ia_valid = ATTR_OPEN;
+	if (openflags & O_TRUNC) {
+		attr.ia_valid |= ATTR_SIZE;
+		attr.ia_size = 0;
+		nfs_sync_inode(inode);
+	}
+
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		switch (err) {
+		case -EPERM:
+		case -EACCES:
+		case -EDQUOT:
+		case -ENOSPC:
+		case -EROFS:
+			goto out_put_ctx;
+		default:
+			goto out_drop;
+		}
+	}
+	if (inode != d_inode(dentry))
+		goto out_drop;
+
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+	nfs_file_set_open_context(filp, ctx);
+	nfs_fscache_open_file(inode, filp);
+	err = 0;
+
+out_put_ctx:
+	put_nfs_open_context(ctx);
+out:
+	dput(parent);
+	return err;
+
+out_drop:
+	d_drop(dentry);
+	err = -EOPENSTALE;
+	goto out_put_ctx;
+}
+
+static int
+nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	int ret;
+	struct inode *inode = file_inode(file);
+
+	trace_nfs_fsync_enter(inode);
+
+	nfs_inode_dio_wait(inode);
+	do {
+		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+		if (ret != 0)
+			break;
+		mutex_lock(&inode->i_mutex);
+		ret = nfs_file_fsync_commit(file, start, end, datasync);
+		if (!ret)
+			ret = pnfs_sync_inode(inode, !!datasync);
+		mutex_unlock(&inode->i_mutex);
+		/*
+		 * If nfs_file_fsync_commit detected a server reboot, then
+		 * resend all dirty pages that might have been covered by
+		 * the NFS_CONTEXT_RESEND_WRITES flag
+		 */
+		start = 0;
+		end = LLONG_MAX;
+	} while (ret == -EAGAIN);
+
+	trace_nfs_fsync_exit(inode, ret);
+	return ret;
+}
+
+#ifdef CONFIG_NFS_V4_2
+static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
+{
+	loff_t ret;
+
+	switch (whence) {
+	case SEEK_HOLE:
+	case SEEK_DATA:
+		ret = nfs42_proc_llseek(filep, offset, whence);
+		if (ret != -ENOTSUPP)
+			return ret;
+	default:
+		return nfs_file_llseek(filep, offset, whence);
+	}
+}
+
+static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(filep);
+	long ret;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)))
+		return -EOPNOTSUPP;
+
+	ret = inode_newsize_ok(inode, offset + len);
+	if (ret < 0)
+		return ret;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return nfs42_proc_deallocate(filep, offset, len);
+	return nfs42_proc_allocate(filep, offset, len);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+const struct file_operations nfs4_file_operations = {
+#ifdef CONFIG_NFS_V4_2
+	.llseek		= nfs4_file_llseek,
+#else
+	.llseek		= nfs_file_llseek,
+#endif
+	.read_iter	= nfs_file_read,
+	.write_iter	= nfs_file_write,
+	.mmap		= nfs_file_mmap,
+	.open		= nfs4_file_open,
+	.flush		= nfs_file_flush,
+	.release	= nfs_file_release,
+	.fsync		= nfs4_file_fsync,
+	.lock		= nfs_lock,
+	.flock		= nfs_flock,
+	.splice_read	= nfs_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+#ifdef CONFIG_NFS_V4_2
+	.fallocate	= nfs42_fallocate,
+#endif /* CONFIG_NFS_V4_2 */
+	.check_flags	= nfs_check_flags,
+	.setlease	= simple_nosetlease,
+};
diff --git a/kernel/fs/nfs/nfs4getroot.c b/kernel/fs/nfs/nfs4getroot.c
new file mode 100644
index 000000000..c0b3a16b4
--- /dev/null
+++ b/kernel/fs/nfs/nfs4getroot.c
@@ -0,0 +1,50 @@
+/*
+* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+* Written by David Howells (dhowells@redhat.com)
+*/
+
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_CLIENT
+
+int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe)
+{
+	struct nfs_fsinfo fsinfo;
+	int ret = -ENOMEM;
+
+	dprintk("--> nfs4_get_rootfh()\n");
+
+	fsinfo.fattr = nfs_alloc_fattr();
+	if (fsinfo.fattr == NULL)
+		goto out;
+
+	/* Start by getting the root filehandle from the server */
+	ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe);
+	if (ret < 0) {
+		dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
+		goto out;
+	}
+
+	if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE)
+			|| !S_ISDIR(fsinfo.fattr->mode)) {
+		printk(KERN_ERR "nfs4_get_rootfh:"
+		       " getroot encountered non-directory\n");
+		ret = -ENOTDIR;
+		goto out;
+	}
+
+	if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+		printk(KERN_ERR "nfs4_get_rootfh:"
+		       " getroot obtained referral\n");
+		ret = -EREMOTE;
+		goto out;
+	}
+
+	memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
+out:
+	nfs_free_fattr(fsinfo.fattr);
+	dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
+	return ret;
+}
diff --git a/kernel/fs/nfs/nfs4idmap.c b/kernel/fs/nfs/nfs4idmap.c
new file mode 100644
index 000000000..2e1737c40
--- /dev/null
+++ b/kernel/fs/nfs/nfs4idmap.c
@@ -0,0 +1,792 @@
+/*
+ * fs/nfs/idmap.c
+ *
+ *  UID and GID to name mapping for clients.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/types.h>
+#include <linux/parser.h>
+#include <linux/fs.h>
+#include <net/net_namespace.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <linux/module.h>
+
+#include "internal.h"
+#include "netns.h"
+#include "nfs4idmap.h"
+#include "nfs4trace.h"
+
+#define NFS_UINT_MAXLEN 11
+
+static const struct cred *id_resolver_cache;
+static struct key_type key_type_id_resolver_legacy;
+
+struct idmap_legacy_upcalldata {
+	struct rpc_pipe_msg pipe_msg;
+	struct idmap_msg idmap_msg;
+	struct key_construction	*key_cons;
+	struct idmap *idmap;
+};
+
+struct idmap {
+	struct rpc_pipe_dir_object idmap_pdo;
+	struct rpc_pipe		*idmap_pipe;
+	struct idmap_legacy_upcalldata *idmap_upcall_data;
+	struct mutex		idmap_mutex;
+};
+
+/**
+ * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
+ * @fattr: fully initialised struct nfs_fattr
+ * @owner_name: owner name string cache
+ * @group_name: group name string cache
+ */
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+		struct nfs4_string *owner_name,
+		struct nfs4_string *group_name)
+{
+	fattr->owner_name = owner_name;
+	fattr->group_name = group_name;
+}
+
+static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
+{
+	fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
+	kfree(fattr->owner_name->data);
+}
+
+static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
+{
+	fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
+	kfree(fattr->group_name->data);
+}
+
+static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	struct nfs4_string *owner = fattr->owner_name;
+	kuid_t uid;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
+		return false;
+	if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) {
+		fattr->uid = uid;
+		fattr->valid |= NFS_ATTR_FATTR_OWNER;
+	}
+	return true;
+}
+
+static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	struct nfs4_string *group = fattr->group_name;
+	kgid_t gid;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
+		return false;
+	if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) {
+		fattr->gid = gid;
+		fattr->valid |= NFS_ATTR_FATTR_GROUP;
+	}
+	return true;
+}
+
+/**
+ * nfs_fattr_free_names - free up the NFSv4 owner and group strings
+ * @fattr: a fully initialised nfs_fattr structure
+ */
+void nfs_fattr_free_names(struct nfs_fattr *fattr)
+{
+	if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)
+		nfs_fattr_free_owner_name(fattr);
+	if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)
+		nfs_fattr_free_group_name(fattr);
+}
+
+/**
+ * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free
+ * @server: pointer to the filesystem nfs_server structure
+ * @fattr: a fully initialised nfs_fattr structure
+ *
+ * This helper maps the cached NFSv4 owner/group strings in fattr into
+ * their numeric uid/gid equivalents, and then frees the cached strings.
+ */
+void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	if (nfs_fattr_map_owner_name(server, fattr))
+		nfs_fattr_free_owner_name(fattr);
+	if (nfs_fattr_map_group_name(server, fattr))
+		nfs_fattr_free_group_name(fattr);
+}
+
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+{
+	unsigned long val;
+	char buf[16];
+
+	if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
+		return 0;
+	memcpy(buf, name, namelen);
+	buf[namelen] = '\0';
+	if (kstrtoul(buf, 0, &val) != 0)
+		return 0;
+	*res = val;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
+
+static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
+{
+	return snprintf(buf, buflen, "%u", id);
+}
+
+static struct key_type key_type_id_resolver = {
+	.name		= "id_resolver",
+	.preparse	= user_preparse,
+	.free_preparse	= user_free_preparse,
+	.instantiate	= generic_key_instantiate,
+	.revoke		= user_revoke,
+	.destroy	= user_destroy,
+	.describe	= user_describe,
+	.read		= user_read,
+};
+
+static int nfs_idmap_init_keyring(void)
+{
+	struct cred *cred;
+	struct key *keyring;
+	int ret = 0;
+
+	printk(KERN_NOTICE "NFS: Registering the %s key type\n",
+		key_type_id_resolver.name);
+
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	keyring = keyring_alloc(".id_resolver",
+				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				KEY_USR_VIEW | KEY_USR_READ,
+				KEY_ALLOC_NOT_IN_QUOTA, NULL);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto failed_put_cred;
+	}
+
+	ret = register_key_type(&key_type_id_resolver);
+	if (ret < 0)
+		goto failed_put_key;
+
+	ret = register_key_type(&key_type_id_resolver_legacy);
+	if (ret < 0)
+		goto failed_reg_legacy;
+
+	set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+	cred->thread_keyring = keyring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	id_resolver_cache = cred;
+	return 0;
+
+failed_reg_legacy:
+	unregister_key_type(&key_type_id_resolver);
+failed_put_key:
+	key_put(keyring);
+failed_put_cred:
+	put_cred(cred);
+	return ret;
+}
+
+static void nfs_idmap_quit_keyring(void)
+{
+	key_revoke(id_resolver_cache->thread_keyring);
+	unregister_key_type(&key_type_id_resolver);
+	unregister_key_type(&key_type_id_resolver_legacy);
+	put_cred(id_resolver_cache);
+}
+
+/*
+ * Assemble the description to pass to request_key()
+ * This function will allocate a new string and update dest to point
+ * at it.  The caller is responsible for freeing dest.
+ *
+ * On error 0 is returned.  Otherwise, the length of dest is returned.
+ */
+static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
+				const char *type, size_t typelen, char **desc)
+{
+	char *cp;
+	size_t desclen = typelen + namelen + 2;
+
+	*desc = kmalloc(desclen, GFP_KERNEL);
+	if (!*desc)
+		return -ENOMEM;
+
+	cp = *desc;
+	memcpy(cp, type, typelen);
+	cp += typelen;
+	*cp++ = ':';
+
+	memcpy(cp, name, namelen);
+	cp += namelen;
+	*cp = '\0';
+	return desclen;
+}
+
+static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
+					 const char *type, struct idmap *idmap)
+{
+	char *desc;
+	struct key *rkey;
+	ssize_t ret;
+
+	ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
+	if (ret <= 0)
+		return ERR_PTR(ret);
+
+	rkey = request_key(&key_type_id_resolver, desc, "");
+	if (IS_ERR(rkey)) {
+		mutex_lock(&idmap->idmap_mutex);
+		rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
+						desc, "", 0, idmap);
+		mutex_unlock(&idmap->idmap_mutex);
+	}
+	if (!IS_ERR(rkey))
+		set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
+
+	kfree(desc);
+	return rkey;
+}
+
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+				 const char *type, void *data,
+				 size_t data_size, struct idmap *idmap)
+{
+	const struct cred *saved_cred;
+	struct key *rkey;
+	struct user_key_payload *payload;
+	ssize_t ret;
+
+	saved_cred = override_creds(id_resolver_cache);
+	rkey = nfs_idmap_request_key(name, namelen, type, idmap);
+	revert_creds(saved_cred);
+
+	if (IS_ERR(rkey)) {
+		ret = PTR_ERR(rkey);
+		goto out;
+	}
+
+	rcu_read_lock();
+	rkey->perm |= KEY_USR_VIEW;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto out_up;
+
+	payload = rcu_dereference(rkey->payload.rcudata);
+	if (IS_ERR_OR_NULL(payload)) {
+		ret = PTR_ERR(payload);
+		goto out_up;
+	}
+
+	ret = payload->datalen;
+	if (ret > 0 && ret <= data_size)
+		memcpy(data, payload->data, ret);
+	else
+		ret = -EINVAL;
+
+out_up:
+	rcu_read_unlock();
+	key_put(rkey);
+out:
+	return ret;
+}
+
+/* ID -> Name */
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
+				     size_t buflen, struct idmap *idmap)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	int id_len;
+	ssize_t ret;
+
+	id_len = snprintf(id_str, sizeof(id_str), "%u", id);
+	ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap);
+	if (ret < 0)
+		return -EINVAL;
+	return ret;
+}
+
+/* Name -> ID */
+static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type,
+			       __u32 *id, struct idmap *idmap)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	long id_long;
+	ssize_t data_size;
+	int ret = 0;
+
+	data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap);
+	if (data_size <= 0) {
+		ret = -EINVAL;
+	} else {
+		ret = kstrtol(id_str, 10, &id_long);
+		*id = (__u32)id_long;
+	}
+	return ret;
+}
+
+/* idmap classic begins here */
+
+enum {
+	Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err
+};
+
+static const match_table_t nfs_idmap_tokens = {
+	{ Opt_find_uid, "uid:%s" },
+	{ Opt_find_gid, "gid:%s" },
+	{ Opt_find_user, "user:%s" },
+	{ Opt_find_group, "group:%s" },
+	{ Opt_find_err, NULL }
+};
+
+static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
+static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
+				   size_t);
+static void idmap_release_pipe(struct inode *);
+static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
+
+static const struct rpc_pipe_ops idmap_upcall_ops = {
+	.upcall		= rpc_pipe_generic_upcall,
+	.downcall	= idmap_pipe_downcall,
+	.release_pipe	= idmap_release_pipe,
+	.destroy_msg	= idmap_pipe_destroy_msg,
+};
+
+static struct key_type key_type_id_resolver_legacy = {
+	.name		= "id_legacy",
+	.preparse	= user_preparse,
+	.free_preparse	= user_free_preparse,
+	.instantiate	= generic_key_instantiate,
+	.revoke		= user_revoke,
+	.destroy	= user_destroy,
+	.describe	= user_describe,
+	.read		= user_read,
+	.request_key	= nfs_idmap_legacy_upcall,
+};
+
+static void nfs_idmap_pipe_destroy(struct dentry *dir,
+		struct rpc_pipe_dir_object *pdo)
+{
+	struct idmap *idmap = pdo->pdo_data;
+	struct rpc_pipe *pipe = idmap->idmap_pipe;
+
+	if (pipe->dentry) {
+		rpc_unlink(pipe->dentry);
+		pipe->dentry = NULL;
+	}
+}
+
+static int nfs_idmap_pipe_create(struct dentry *dir,
+		struct rpc_pipe_dir_object *pdo)
+{
+	struct idmap *idmap = pdo->pdo_data;
+	struct rpc_pipe *pipe = idmap->idmap_pipe;
+	struct dentry *dentry;
+
+	dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	pipe->dentry = dentry;
+	return 0;
+}
+
+static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = {
+	.create = nfs_idmap_pipe_create,
+	.destroy = nfs_idmap_pipe_destroy,
+};
+
+int
+nfs_idmap_new(struct nfs_client *clp)
+{
+	struct idmap *idmap;
+	struct rpc_pipe *pipe;
+	int error;
+
+	idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
+	if (idmap == NULL)
+		return -ENOMEM;
+
+	rpc_init_pipe_dir_object(&idmap->idmap_pdo,
+			&nfs_idmap_pipe_dir_object_ops,
+			idmap);
+
+	pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
+	if (IS_ERR(pipe)) {
+		error = PTR_ERR(pipe);
+		goto err;
+	}
+	idmap->idmap_pipe = pipe;
+	mutex_init(&idmap->idmap_mutex);
+
+	error = rpc_add_pipe_dir_object(clp->cl_net,
+			&clp->cl_rpcclient->cl_pipedir_objects,
+			&idmap->idmap_pdo);
+	if (error)
+		goto err_destroy_pipe;
+
+	clp->cl_idmap = idmap;
+	return 0;
+err_destroy_pipe:
+	rpc_destroy_pipe_data(idmap->idmap_pipe);
+err:
+	kfree(idmap);
+	return error;
+}
+
+void
+nfs_idmap_delete(struct nfs_client *clp)
+{
+	struct idmap *idmap = clp->cl_idmap;
+
+	if (!idmap)
+		return;
+	clp->cl_idmap = NULL;
+	rpc_remove_pipe_dir_object(clp->cl_net,
+			&clp->cl_rpcclient->cl_pipedir_objects,
+			&idmap->idmap_pdo);
+	rpc_destroy_pipe_data(idmap->idmap_pipe);
+	kfree(idmap);
+}
+
+int nfs_idmap_init(void)
+{
+	int ret;
+	ret = nfs_idmap_init_keyring();
+	if (ret != 0)
+		goto out;
+out:
+	return ret;
+}
+
+void nfs_idmap_quit(void)
+{
+	nfs_idmap_quit_keyring();
+}
+
+static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
+				     struct idmap_msg *im,
+				     struct rpc_pipe_msg *msg)
+{
+	substring_t substr;
+	int token, ret;
+
+	im->im_type = IDMAP_TYPE_GROUP;
+	token = match_token(desc, nfs_idmap_tokens, &substr);
+
+	switch (token) {
+	case Opt_find_uid:
+		im->im_type = IDMAP_TYPE_USER;
+	case Opt_find_gid:
+		im->im_conv = IDMAP_CONV_NAMETOID;
+		ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ);
+		break;
+
+	case Opt_find_user:
+		im->im_type = IDMAP_TYPE_USER;
+	case Opt_find_group:
+		im->im_conv = IDMAP_CONV_IDTONAME;
+		ret = match_int(&substr, &im->im_id);
+		break;
+
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	msg->data = im;
+	msg->len  = sizeof(struct idmap_msg);
+
+out:
+	return ret;
+}
+
+static bool
+nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
+		struct idmap_legacy_upcalldata *data)
+{
+	if (idmap->idmap_upcall_data != NULL) {
+		WARN_ON_ONCE(1);
+		return false;
+	}
+	idmap->idmap_upcall_data = data;
+	return true;
+}
+
+static void
+nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
+{
+	struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
+
+	kfree(idmap->idmap_upcall_data);
+	idmap->idmap_upcall_data = NULL;
+	complete_request_key(cons, ret);
+}
+
+static void
+nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
+{
+	if (idmap->idmap_upcall_data != NULL)
+		nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+}
+
+static int nfs_idmap_legacy_upcall(struct key_construction *cons,
+				   const char *op,
+				   void *aux)
+{
+	struct idmap_legacy_upcalldata *data;
+	struct rpc_pipe_msg *msg;
+	struct idmap_msg *im;
+	struct idmap *idmap = (struct idmap *)aux;
+	struct key *key = cons->key;
+	int ret = -ENOMEM;
+
+	/* msg and im are freed in idmap_pipe_destroy_msg */
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out1;
+
+	msg = &data->pipe_msg;
+	im = &data->idmap_msg;
+	data->idmap = idmap;
+	data->key_cons = cons;
+
+	ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
+	if (ret < 0)
+		goto out2;
+
+	ret = -EAGAIN;
+	if (!nfs_idmap_prepare_pipe_upcall(idmap, data))
+		goto out2;
+
+	ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
+	if (ret < 0)
+		nfs_idmap_abort_pipe_upcall(idmap, ret);
+
+	return ret;
+out2:
+	kfree(data);
+out1:
+	complete_request_key(cons, ret);
+	return ret;
+}
+
+static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen)
+{
+	return key_instantiate_and_link(key, data, datalen,
+					id_resolver_cache->thread_keyring,
+					authkey);
+}
+
+static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,
+		struct idmap_msg *upcall,
+		struct key *key, struct key *authkey)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	size_t len;
+	int ret = -ENOKEY;
+
+	/* ret = -ENOKEY */
+	if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv)
+		goto out;
+	switch (im->im_conv) {
+	case IDMAP_CONV_NAMETOID:
+		if (strcmp(upcall->im_name, im->im_name) != 0)
+			break;
+		/* Note: here we store the NUL terminator too */
+		len = sprintf(id_str, "%d", im->im_id) + 1;
+		ret = nfs_idmap_instantiate(key, authkey, id_str, len);
+		break;
+	case IDMAP_CONV_IDTONAME:
+		if (upcall->im_id != im->im_id)
+			break;
+		len = strlen(im->im_name);
+		ret = nfs_idmap_instantiate(key, authkey, im->im_name, len);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+out:
+	return ret;
+}
+
+static ssize_t
+idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+	struct rpc_inode *rpci = RPC_I(file_inode(filp));
+	struct idmap *idmap = (struct idmap *)rpci->private;
+	struct key_construction *cons;
+	struct idmap_msg im;
+	size_t namelen_in;
+	int ret = -ENOKEY;
+
+	/* If instantiation is successful, anyone waiting for key construction
+	 * will have been woken up and someone else may now have used
+	 * idmap_key_cons - so after this point we may no longer touch it.
+	 */
+	if (idmap->idmap_upcall_data == NULL)
+		goto out_noupcall;
+
+	cons = idmap->idmap_upcall_data->key_cons;
+
+	if (mlen != sizeof(im)) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	if (copy_from_user(&im, src, mlen) != 0) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (!(im.im_status & IDMAP_STATUS_SUCCESS)) {
+		ret = -ENOKEY;
+		goto out;
+	}
+
+	namelen_in = strnlen(im.im_name, IDMAP_NAMESZ);
+	if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
+		ret = -EINVAL;
+		goto out;
+}
+
+	ret = nfs_idmap_read_and_verify_message(&im,
+			&idmap->idmap_upcall_data->idmap_msg,
+			cons->key, cons->authkey);
+	if (ret >= 0) {
+		key_set_timeout(cons->key, nfs_idmap_cache_timeout);
+		ret = mlen;
+	}
+
+out:
+	nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+out_noupcall:
+	return ret;
+}
+
+static void
+idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct idmap_legacy_upcalldata *data = container_of(msg,
+			struct idmap_legacy_upcalldata,
+			pipe_msg);
+	struct idmap *idmap = data->idmap;
+
+	if (msg->errno)
+		nfs_idmap_abort_pipe_upcall(idmap, msg->errno);
+}
+
+static void
+idmap_release_pipe(struct inode *inode)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	struct idmap *idmap = (struct idmap *)rpci->private;
+
+	nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
+}
+
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
+{
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	__u32 id = -1;
+	int ret = 0;
+
+	if (!nfs_map_string_to_numeric(name, namelen, &id))
+		ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap);
+	if (ret == 0) {
+		*uid = make_kuid(&init_user_ns, id);
+		if (!uid_valid(*uid))
+			ret = -ERANGE;
+	}
+	trace_nfs4_map_name_to_uid(name, namelen, id, ret);
+	return ret;
+}
+
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid)
+{
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	__u32 id = -1;
+	int ret = 0;
+
+	if (!nfs_map_string_to_numeric(name, namelen, &id))
+		ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap);
+	if (ret == 0) {
+		*gid = make_kgid(&init_user_ns, id);
+		if (!gid_valid(*gid))
+			ret = -ERANGE;
+	}
+	trace_nfs4_map_group_to_gid(name, namelen, id, ret);
+	return ret;
+}
+
+int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen)
+{
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	int ret = -EINVAL;
+	__u32 id;
+
+	id = from_kuid(&init_user_ns, uid);
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(id, buf, buflen);
+	trace_nfs4_map_uid_to_name(buf, ret, id, ret);
+	return ret;
+}
+int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)
+{
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	int ret = -EINVAL;
+	__u32 id;
+
+	id = from_kgid(&init_user_ns, gid);
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(id, buf, buflen);
+	trace_nfs4_map_gid_to_group(buf, ret, id, ret);
+	return ret;
+}
diff --git a/kernel/fs/nfs/nfs4idmap.h b/kernel/fs/nfs/nfs4idmap.h
new file mode 100644
index 000000000..de44d7330
--- /dev/null
+++ b/kernel/fs/nfs/nfs4idmap.h
@@ -0,0 +1,68 @@
+/*
+ * fs/nfs/nfs4idmap.h
+ *
+ *  UID and GID to name mapping for clients.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef NFS_IDMAP_H
+#define NFS_IDMAP_H
+
+#include <linux/uidgid.h>
+#include <uapi/linux/nfs_idmap.h>
+
+
+/* Forward declaration to make this header independent of others */
+struct nfs_client;
+struct nfs_server;
+struct nfs_fattr;
+struct nfs4_string;
+
+int nfs_idmap_init(void);
+void nfs_idmap_quit(void);
+int nfs_idmap_new(struct nfs_client *);
+void nfs_idmap_delete(struct nfs_client *);
+
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+		struct nfs4_string *owner_name,
+		struct nfs4_string *group_name);
+void nfs_fattr_free_names(struct nfs_fattr *);
+void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *);
+
+int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *);
+int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *);
+int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t);
+int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t);
+
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res);
+
+extern unsigned int nfs_idmap_cache_timeout;
+#endif /* NFS_IDMAP_H */
diff --git a/kernel/fs/nfs/nfs4namespace.c b/kernel/fs/nfs/nfs4namespace.c
new file mode 100644
index 000000000..f59267237
--- /dev/null
+++ b/kernel/fs/nfs/nfs4namespace.c
@@ -0,0 +1,523 @@
+/*
+ * linux/fs/nfs/nfs4namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
+ *
+ * NFSv4 namespace
+ */
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "dns_resolve.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+/*
+ * Convert the NFSv4 pathname components into a standard posix path.
+ *
+ * Note that the resulting string will be placed at the end of the buffer
+ */
+static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
+					 char *buffer, ssize_t buflen)
+{
+	char *end = buffer + buflen;
+	int n;
+
+	*--end = '\0';
+	buflen--;
+
+	n = pathname->ncomponents;
+	while (--n >= 0) {
+		const struct nfs4_string *component = &pathname->components[n];
+		buflen -= component->len + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= component->len;
+		memcpy(end, component->data, component->len);
+		*--end = '/';
+	}
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+/*
+ * return the path component of "<server>:<path>"
+ *  nfspath - the "<server>:<path>" string
+ *  end - one past the last char that could contain "<server>:"
+ * returns NULL on failure
+ */
+static char *nfs_path_component(const char *nfspath, const char *end)
+{
+	char *p;
+
+	if (*nfspath == '[') {
+		/* parse [] escaped IPv6 addrs */
+		p = strchr(nfspath, ']');
+		if (p != NULL && ++p < end && *p == ':')
+			return p + 1;
+	} else {
+		/* otherwise split on first colon */
+		p = strchr(nfspath, ':');
+		if (p != NULL && p < end)
+			return p + 1;
+	}
+	return NULL;
+}
+
+/*
+ * Determine the mount path as a string
+ */
+static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+	char *limit;
+	char *path = nfs_path(&limit, dentry, buffer, buflen,
+			      NFS_PATH_CANONICAL);
+	if (!IS_ERR(path)) {
+		char *path_component = nfs_path_component(path, limit);
+		if (path_component)
+			return path_component;
+	}
+	return path;
+}
+
+/*
+ * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
+ * believe to be the server path to this dentry
+ */
+static int nfs4_validate_fspath(struct dentry *dentry,
+				const struct nfs4_fs_locations *locations,
+				char *page, char *page2)
+{
+	const char *path, *fs_path;
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
+	if (IS_ERR(fs_path))
+		return PTR_ERR(fs_path);
+
+	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
+		dprintk("%s: path %s does not begin with fsroot %s\n",
+			__func__, path, fs_path);
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+static size_t nfs_parse_server_name(char *string, size_t len,
+		struct sockaddr *sa, size_t salen, struct net *net)
+{
+	ssize_t ret;
+
+	ret = rpc_pton(net, string, len, sa, salen);
+	if (ret == 0) {
+		ret = nfs_dns_resolve_name(net, string, len, sa, salen);
+		if (ret < 0)
+			ret = 0;
+	}
+	return ret;
+}
+
+/**
+ * nfs_find_best_sec - Find a security mechanism supported locally
+ * @server: NFS server struct
+ * @flavors: List of security tuples returned by SECINFO procedure
+ *
+ * Return an rpc client that uses the first security mechanism in
+ * "flavors" that is locally supported.  The "flavors" array
+ * is searched in the order returned from the server, per RFC 3530
+ * recommendation and each flavor is checked for membership in the
+ * sec= mount option list if it exists.
+ *
+ * Return -EPERM if no matching flavor is found in the array.
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ *
+ */
+static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt,
+					  struct nfs_server *server,
+					  struct nfs4_secinfo_flavors *flavors)
+{
+	rpc_authflavor_t pflavor;
+	struct nfs4_secinfo4 *secinfo;
+	unsigned int i;
+
+	for (i = 0; i < flavors->num_flavors; i++) {
+		secinfo = &flavors->flavors[i];
+
+		switch (secinfo->flavor) {
+		case RPC_AUTH_NULL:
+		case RPC_AUTH_UNIX:
+		case RPC_AUTH_GSS:
+			pflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+							&secinfo->flavor_info);
+			/* does the pseudoflavor match a sec= mount opt? */
+			if (pflavor != RPC_AUTH_MAXFLAVOR &&
+			    nfs_auth_info_match(&server->auth_info, pflavor)) {
+				struct rpc_clnt *new;
+				struct rpc_cred *cred;
+
+				/* Cloning creates an rpc_auth for the flavor */
+				new = rpc_clone_client_set_auth(clnt, pflavor);
+				if (IS_ERR(new))
+					continue;
+				/**
+				* Check that the user actually can use the
+				* flavor. This is mostly for RPC_AUTH_GSS
+				* where cr_init obtains a gss context
+				*/
+				cred = rpcauth_lookupcred(new->cl_auth, 0);
+				if (IS_ERR(cred)) {
+					rpc_shutdown_client(new);
+					continue;
+				}
+				put_rpccred(cred);
+				return new;
+			}
+		}
+	}
+	return ERR_PTR(-EPERM);
+}
+
+/**
+ * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup,
+ * return an rpc_clnt that uses the best available security flavor with
+ * respect to the secinfo flavor list and the sec= mount options.
+ *
+ * @clnt: RPC client to clone
+ * @inode: directory inode
+ * @name: lookup name
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ */
+struct rpc_clnt *
+nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode,
+					struct qstr *name)
+{
+	struct page *page;
+	struct nfs4_secinfo_flavors *flavors;
+	struct rpc_clnt *new;
+	int err;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	flavors = page_address(page);
+
+	err = nfs4_proc_secinfo(inode, name, flavors);
+	if (err < 0) {
+		new = ERR_PTR(err);
+		goto out;
+	}
+
+	new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors);
+
+out:
+	put_page(page);
+	return new;
+}
+
+static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
+				     char *page, char *page2,
+				     const struct nfs4_fs_location *location)
+{
+	const size_t addr_bufsize = sizeof(struct sockaddr_storage);
+	struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client);
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	char *mnt_path;
+	unsigned int maxbuflen;
+	unsigned int s;
+
+	mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+	if (IS_ERR(mnt_path))
+		return ERR_CAST(mnt_path);
+	mountdata->mnt_path = mnt_path;
+	maxbuflen = mnt_path - 1 - page2;
+
+	mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
+	if (mountdata->addr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	for (s = 0; s < location->nservers; s++) {
+		const struct nfs4_string *buf = &location->servers[s];
+
+		if (buf->len <= 0 || buf->len >= maxbuflen)
+			continue;
+
+		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
+			continue;
+
+		mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
+				mountdata->addr, addr_bufsize, net);
+		if (mountdata->addrlen == 0)
+			continue;
+
+		rpc_set_port(mountdata->addr, NFS_PORT);
+
+		memcpy(page2, buf->data, buf->len);
+		page2[buf->len] = '\0';
+		mountdata->hostname = page2;
+
+		snprintf(page, PAGE_SIZE, "%s:%s",
+				mountdata->hostname,
+				mountdata->mnt_path);
+
+		mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+		if (!IS_ERR(mnt))
+			break;
+	}
+	kfree(mountdata->addr);
+	return mnt;
+}
+
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @dentry - parent directory
+ * @locations - array of NFSv4 server location information
+ *
+ */
+static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
+					    const struct nfs4_fs_locations *locations)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct nfs_clone_mount mountdata = {
+		.sb = dentry->d_sb,
+		.dentry = dentry,
+		.authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor,
+	};
+	char *page = NULL, *page2 = NULL;
+	int loc, error;
+
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	dprintk("%s: referral at %pd2\n", __func__, dentry);
+
+	page = (char *) __get_free_page(GFP_USER);
+	if (!page)
+		goto out;
+
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (!page2)
+		goto out;
+
+	/* Ensure fs path is a prefix of current dentry path */
+	error = nfs4_validate_fspath(dentry, locations, page, page2);
+	if (error < 0) {
+		mnt = ERR_PTR(error);
+		goto out;
+	}
+
+	for (loc = 0; loc < locations->nlocations; loc++) {
+		const struct nfs4_fs_location *location = &locations->locations[loc];
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0)
+			continue;
+
+		mnt = try_location(&mountdata, page, page2, location);
+		if (!IS_ERR(mnt))
+			break;
+	}
+
+out:
+	free_page((unsigned long) page);
+	free_page((unsigned long) page2);
+	dprintk("%s: done\n", __func__);
+	return mnt;
+}
+
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @dentry - dentry of referral
+ *
+ */
+static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+	struct dentry *parent;
+	struct nfs4_fs_locations *fs_locations = NULL;
+	struct page *page;
+	int err;
+
+	/* BUG_ON(IS_ROOT(dentry)); */
+	dprintk("%s: enter\n", __func__);
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+
+	fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (fs_locations == NULL)
+		goto out_free;
+
+	/* Get locations */
+	mnt = ERR_PTR(-ENOENT);
+
+	parent = dget_parent(dentry);
+	dprintk("%s: getting locations for %pd2\n",
+		__func__, dentry);
+
+	err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page);
+	dput(parent);
+	if (err != 0 ||
+	    fs_locations->nlocations <= 0 ||
+	    fs_locations->fs_path.ncomponents <= 0)
+		goto out_free;
+
+	mnt = nfs_follow_referral(dentry, fs_locations);
+out_free:
+	__free_page(page);
+	kfree(fs_locations);
+out:
+	dprintk("%s: done\n", __func__);
+	return mnt;
+}
+
+struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
+			       struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	rpc_authflavor_t flavor = server->client->cl_auth->au_flavor;
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *dir = d_inode(parent);
+	struct qstr *name = &dentry->d_name;
+	struct rpc_clnt *client;
+	struct vfsmount *mnt;
+
+	/* Look it up again to get its attributes and sec flavor */
+	client = nfs4_proc_lookup_mountpoint(dir, name, fh, fattr);
+	dput(parent);
+	if (IS_ERR(client))
+		return ERR_CAST(client);
+
+	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+		mnt = nfs_do_refmount(client, dentry);
+		goto out;
+	}
+
+	if (client->cl_auth->au_flavor != flavor)
+		flavor = client->cl_auth->au_flavor;
+	mnt = nfs_do_submount(dentry, fh, fattr, flavor);
+out:
+	rpc_shutdown_client(client);
+	return mnt;
+}
+
+/*
+ * Try one location from the fs_locations array.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+static int nfs4_try_replacing_one_location(struct nfs_server *server,
+		char *page, char *page2,
+		const struct nfs4_fs_location *location)
+{
+	const size_t addr_bufsize = sizeof(struct sockaddr_storage);
+	struct net *net = rpc_net_ns(server->client);
+	struct sockaddr *sap;
+	unsigned int s;
+	size_t salen;
+	int error;
+
+	sap = kmalloc(addr_bufsize, GFP_KERNEL);
+	if (sap == NULL)
+		return -ENOMEM;
+
+	error = -ENOENT;
+	for (s = 0; s < location->nservers; s++) {
+		const struct nfs4_string *buf = &location->servers[s];
+		char *hostname;
+
+		if (buf->len <= 0 || buf->len > PAGE_SIZE)
+			continue;
+
+		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL)
+			continue;
+
+		salen = nfs_parse_server_name(buf->data, buf->len,
+						sap, addr_bufsize, net);
+		if (salen == 0)
+			continue;
+		rpc_set_port(sap, NFS_PORT);
+
+		error = -ENOMEM;
+		hostname = kstrndup(buf->data, buf->len, GFP_KERNEL);
+		if (hostname == NULL)
+			break;
+
+		error = nfs4_update_server(server, hostname, sap, salen, net);
+		kfree(hostname);
+		if (error == 0)
+			break;
+	}
+
+	kfree(sap);
+	return error;
+}
+
+/**
+ * nfs4_replace_transport - set up transport to destination server
+ *
+ * @server: export being migrated
+ * @locations: fs_locations array
+ *
+ * Returns zero on success, or a negative errno value.
+ *
+ * The client tries all the entries in the "locations" array, in the
+ * order returned by the server, until one works or the end of the
+ * array is reached.
+ */
+int nfs4_replace_transport(struct nfs_server *server,
+			   const struct nfs4_fs_locations *locations)
+{
+	char *page = NULL, *page2 = NULL;
+	int loc, error;
+
+	error = -ENOENT;
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	error = -ENOMEM;
+	page = (char *) __get_free_page(GFP_USER);
+	if (!page)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (!page2)
+		goto out;
+
+	for (loc = 0; loc < locations->nlocations; loc++) {
+		const struct nfs4_fs_location *location =
+						&locations->locations[loc];
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0)
+			continue;
+
+		error = nfs4_try_replacing_one_location(server, page,
+							page2, location);
+		if (error == 0)
+			break;
+	}
+
+out:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+	return error;
+}
diff --git a/kernel/fs/nfs/nfs4proc.c b/kernel/fs/nfs/nfs4proc.c
new file mode 100644
index 000000000..55e1e3af2
--- /dev/null
+++ b/kernel/fs/nfs/nfs4proc.c
@@ -0,0 +1,8687 @@
+/*
+ *  fs/nfs/nfs4proc.c
+ *
+ *  Client-side procedure declarations for NFSv4.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/string.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/nfs_mount.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/xattr.h>
+#include <linux/utsname.h>
+#include <linux/freezer.h>
+
+#include "nfs4_fs.h"
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "callback.h"
+#include "pnfs.h"
+#include "netns.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "fscache.h"
+
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PROC
+
+#define NFS4_POLL_RETRY_MIN	(HZ/10)
+#define NFS4_POLL_RETRY_MAX	(15*HZ)
+
+struct nfs4_opendata;
+static int _nfs4_proc_open(struct nfs4_opendata *data);
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
+static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
+static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
+static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+			    struct nfs_fattr *fattr, struct iattr *sattr,
+			    struct nfs4_state *state, struct nfs4_label *ilabel,
+			    struct nfs4_label *olabel);
+#ifdef CONFIG_NFS_V4_1
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
+		struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
+		struct rpc_cred *);
+#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+	struct iattr *sattr, struct nfs4_label *label)
+{
+	int err;
+
+	if (label == NULL)
+		return NULL;
+
+	if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
+		return NULL;
+
+	err = security_dentry_init_security(dentry, sattr->ia_mode,
+				&dentry->d_name, (void **)&label->label, &label->len);
+	if (err == 0)
+		return label;
+
+	return NULL;
+}
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+	if (label)
+		security_release_secctx(label->label, label->len);
+}
+static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+	if (label)
+		return server->attr_bitmask;
+
+	return server->attr_bitmask_nl;
+}
+#else
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+	struct iattr *sattr, struct nfs4_label *l)
+{ return NULL; }
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{ return; }
+static inline u32 *
+nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{ return server->attr_bitmask; }
+#endif
+
+/* Prevent leaks of NFSv4 errors into userland */
+static int nfs4_map_errors(int err)
+{
+	if (err >= -1000)
+		return err;
+	switch (err) {
+	case -NFS4ERR_RESOURCE:
+	case -NFS4ERR_LAYOUTTRYLATER:
+	case -NFS4ERR_RECALLCONFLICT:
+		return -EREMOTEIO;
+	case -NFS4ERR_WRONGSEC:
+	case -NFS4ERR_WRONG_CRED:
+		return -EPERM;
+	case -NFS4ERR_BADOWNER:
+	case -NFS4ERR_BADNAME:
+		return -EINVAL;
+	case -NFS4ERR_SHARE_DENIED:
+		return -EACCES;
+	case -NFS4ERR_MINOR_VERS_MISMATCH:
+		return -EPROTONOSUPPORT;
+	case -NFS4ERR_FILE_OPEN:
+		return -EBUSY;
+	default:
+		dprintk("%s could not handle NFSv4 error %d\n",
+				__func__, -err);
+		break;
+	}
+	return -EIO;
+}
+
+/*
+ * This is our standard bitmap for GETATTR requests.
+ */
+const u32 nfs4_fattr_bitmap[3] = {
+	FATTR4_WORD0_TYPE
+	| FATTR4_WORD0_CHANGE
+	| FATTR4_WORD0_SIZE
+	| FATTR4_WORD0_FSID
+	| FATTR4_WORD0_FILEID,
+	FATTR4_WORD1_MODE
+	| FATTR4_WORD1_NUMLINKS
+	| FATTR4_WORD1_OWNER
+	| FATTR4_WORD1_OWNER_GROUP
+	| FATTR4_WORD1_RAWDEV
+	| FATTR4_WORD1_SPACE_USED
+	| FATTR4_WORD1_TIME_ACCESS
+	| FATTR4_WORD1_TIME_METADATA
+	| FATTR4_WORD1_TIME_MODIFY
+	| FATTR4_WORD1_MOUNTED_ON_FILEID,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	FATTR4_WORD2_SECURITY_LABEL
+#endif
+};
+
+static const u32 nfs4_pnfs_open_bitmap[3] = {
+	FATTR4_WORD0_TYPE
+	| FATTR4_WORD0_CHANGE
+	| FATTR4_WORD0_SIZE
+	| FATTR4_WORD0_FSID
+	| FATTR4_WORD0_FILEID,
+	FATTR4_WORD1_MODE
+	| FATTR4_WORD1_NUMLINKS
+	| FATTR4_WORD1_OWNER
+	| FATTR4_WORD1_OWNER_GROUP
+	| FATTR4_WORD1_RAWDEV
+	| FATTR4_WORD1_SPACE_USED
+	| FATTR4_WORD1_TIME_ACCESS
+	| FATTR4_WORD1_TIME_METADATA
+	| FATTR4_WORD1_TIME_MODIFY,
+	FATTR4_WORD2_MDSTHRESHOLD
+};
+
+static const u32 nfs4_open_noattr_bitmap[3] = {
+	FATTR4_WORD0_TYPE
+	| FATTR4_WORD0_CHANGE
+	| FATTR4_WORD0_FILEID,
+};
+
+const u32 nfs4_statfs_bitmap[3] = {
+	FATTR4_WORD0_FILES_AVAIL
+	| FATTR4_WORD0_FILES_FREE
+	| FATTR4_WORD0_FILES_TOTAL,
+	FATTR4_WORD1_SPACE_AVAIL
+	| FATTR4_WORD1_SPACE_FREE
+	| FATTR4_WORD1_SPACE_TOTAL
+};
+
+const u32 nfs4_pathconf_bitmap[3] = {
+	FATTR4_WORD0_MAXLINK
+	| FATTR4_WORD0_MAXNAME,
+	0
+};
+
+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
+			| FATTR4_WORD0_MAXREAD
+			| FATTR4_WORD0_MAXWRITE
+			| FATTR4_WORD0_LEASE_TIME,
+			FATTR4_WORD1_TIME_DELTA
+			| FATTR4_WORD1_FS_LAYOUT_TYPES,
+			FATTR4_WORD2_LAYOUT_BLKSIZE
+};
+
+const u32 nfs4_fs_locations_bitmap[3] = {
+	FATTR4_WORD0_TYPE
+	| FATTR4_WORD0_CHANGE
+	| FATTR4_WORD0_SIZE
+	| FATTR4_WORD0_FSID
+	| FATTR4_WORD0_FILEID
+	| FATTR4_WORD0_FS_LOCATIONS,
+	FATTR4_WORD1_MODE
+	| FATTR4_WORD1_NUMLINKS
+	| FATTR4_WORD1_OWNER
+	| FATTR4_WORD1_OWNER_GROUP
+	| FATTR4_WORD1_RAWDEV
+	| FATTR4_WORD1_SPACE_USED
+	| FATTR4_WORD1_TIME_ACCESS
+	| FATTR4_WORD1_TIME_METADATA
+	| FATTR4_WORD1_TIME_MODIFY
+	| FATTR4_WORD1_MOUNTED_ON_FILEID,
+};
+
+static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
+		struct nfs4_readdir_arg *readdir)
+{
+	__be32 *start, *p;
+
+	if (cookie > 2) {
+		readdir->cookie = cookie;
+		memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier));
+		return;
+	}
+
+	readdir->cookie = 0;
+	memset(&readdir->verifier, 0, sizeof(readdir->verifier));
+	if (cookie == 2)
+		return;
+	
+	/*
+	 * NFSv4 servers do not return entries for '.' and '..'
+	 * Therefore, we fake these entries here.  We let '.'
+	 * have cookie 0 and '..' have cookie 1.  Note that
+	 * when talking to the server, we always send cookie 0
+	 * instead of 1 or 2.
+	 */
+	start = p = kmap_atomic(*readdir->pages);
+	
+	if (cookie == 0) {
+		*p++ = xdr_one;                                  /* next */
+		*p++ = xdr_zero;                   /* cookie, first word */
+		*p++ = xdr_one;                   /* cookie, second word */
+		*p++ = xdr_one;                             /* entry len */
+		memcpy(p, ".\0\0\0", 4);                        /* entry */
+		p++;
+		*p++ = xdr_one;                         /* bitmap length */
+		*p++ = htonl(FATTR4_WORD0_FILEID);             /* bitmap */
+		*p++ = htonl(8);              /* attribute buffer length */
+		p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry)));
+	}
+	
+	*p++ = xdr_one;                                  /* next */
+	*p++ = xdr_zero;                   /* cookie, first word */
+	*p++ = xdr_two;                   /* cookie, second word */
+	*p++ = xdr_two;                             /* entry len */
+	memcpy(p, "..\0\0", 4);                         /* entry */
+	p++;
+	*p++ = xdr_one;                         /* bitmap length */
+	*p++ = htonl(FATTR4_WORD0_FILEID);             /* bitmap */
+	*p++ = htonl(8);              /* attribute buffer length */
+	p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent)));
+
+	readdir->pgbase = (char *)p - (char *)start;
+	readdir->count -= readdir->pgbase;
+	kunmap_atomic(start);
+}
+
+static long nfs4_update_delay(long *timeout)
+{
+	long ret;
+	if (!timeout)
+		return NFS4_POLL_RETRY_MAX;
+	if (*timeout <= 0)
+		*timeout = NFS4_POLL_RETRY_MIN;
+	if (*timeout > NFS4_POLL_RETRY_MAX)
+		*timeout = NFS4_POLL_RETRY_MAX;
+	ret = *timeout;
+	*timeout <<= 1;
+	return ret;
+}
+
+static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
+{
+	int res = 0;
+
+	might_sleep();
+
+	freezable_schedule_timeout_killable_unsafe(
+		nfs4_update_delay(timeout));
+	if (fatal_signal_pending(current))
+		res = -ERESTARTSYS;
+	return res;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state *state = exception->state;
+	struct inode *inode = exception->inode;
+	int ret = errorcode;
+
+	exception->retry = 0;
+	switch(errorcode) {
+		case 0:
+			return 0;
+		case -NFS4ERR_OPENMODE:
+			if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
+				nfs4_inode_return_delegation(inode);
+				exception->retry = 1;
+				return 0;
+			}
+			if (state == NULL)
+				break;
+			ret = nfs4_schedule_stateid_recovery(server, state);
+			if (ret < 0)
+				break;
+			goto wait_on_recovery;
+		case -NFS4ERR_DELEG_REVOKED:
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
+			if (state == NULL)
+				break;
+			ret = nfs4_schedule_stateid_recovery(server, state);
+			if (ret < 0)
+				break;
+			goto wait_on_recovery;
+		case -NFS4ERR_EXPIRED:
+			if (state != NULL) {
+				ret = nfs4_schedule_stateid_recovery(server, state);
+				if (ret < 0)
+					break;
+			}
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_STALE_CLIENTID:
+			nfs4_schedule_lease_recovery(clp);
+			goto wait_on_recovery;
+		case -NFS4ERR_MOVED:
+			ret = nfs4_schedule_migration_recovery(server);
+			if (ret < 0)
+				break;
+			goto wait_on_recovery;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(clp);
+			goto wait_on_recovery;
+#if defined(CONFIG_NFS_V4_1)
+		case -NFS4ERR_BADSESSION:
+		case -NFS4ERR_BADSLOT:
+		case -NFS4ERR_BAD_HIGH_SLOT:
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		case -NFS4ERR_DEADSESSION:
+		case -NFS4ERR_SEQ_FALSE_RETRY:
+		case -NFS4ERR_SEQ_MISORDERED:
+			dprintk("%s ERROR: %d Reset session\n", __func__,
+				errorcode);
+			nfs4_schedule_session_recovery(clp->cl_session, errorcode);
+			goto wait_on_recovery;
+#endif /* defined(CONFIG_NFS_V4_1) */
+		case -NFS4ERR_FILE_OPEN:
+			if (exception->timeout > HZ) {
+				/* We have retried a decent amount, time to
+				 * fail
+				 */
+				ret = -EBUSY;
+				break;
+			}
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+			ret = nfs4_delay(server->client, &exception->timeout);
+			if (ret != 0)
+				break;
+		case -NFS4ERR_RETRY_UNCACHED_REP:
+		case -NFS4ERR_OLD_STATEID:
+			exception->retry = 1;
+			break;
+		case -NFS4ERR_BADOWNER:
+			/* The following works around a Linux server bug! */
+		case -NFS4ERR_BADNAME:
+			if (server->caps & NFS_CAP_UIDGID_NOMAP) {
+				server->caps &= ~NFS_CAP_UIDGID_NOMAP;
+				exception->retry = 1;
+				printk(KERN_WARNING "NFS: v4 server %s "
+						"does not accept raw "
+						"uid/gids. "
+						"Reenabling the idmapper.\n",
+						server->nfs_client->cl_hostname);
+			}
+	}
+	/* We failed to handle the error */
+	return nfs4_map_errors(ret);
+wait_on_recovery:
+	ret = nfs4_wait_clnt_recover(clp);
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		return -EIO;
+	if (ret == 0)
+		exception->retry = 1;
+	return ret;
+}
+
+/*
+ * Return 'true' if 'clp' is using an rpc_client that is integrity protected
+ * or 'false' otherwise.
+ */
+static bool _nfs4_is_integrity_protected(struct nfs_client *clp)
+{
+	rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor;
+
+	if (flavor == RPC_AUTH_GSS_KRB5I ||
+	    flavor == RPC_AUTH_GSS_KRB5P)
+		return true;
+
+	return false;
+}
+
+static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
+{
+	spin_lock(&clp->cl_lock);
+	if (time_before(clp->cl_last_renewal,timestamp))
+		clp->cl_last_renewal = timestamp;
+	spin_unlock(&clp->cl_lock);
+}
+
+static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
+{
+	do_renew_lease(server->nfs_client, timestamp);
+}
+
+struct nfs4_call_sync_data {
+	const struct nfs_server *seq_server;
+	struct nfs4_sequence_args *seq_args;
+	struct nfs4_sequence_res *seq_res;
+};
+
+static void nfs4_init_sequence(struct nfs4_sequence_args *args,
+			       struct nfs4_sequence_res *res, int cache_reply)
+{
+	args->sa_slot = NULL;
+	args->sa_cache_this = cache_reply;
+	args->sa_privileged = 0;
+
+	res->sr_slot = NULL;
+}
+
+static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
+{
+	args->sa_privileged = 1;
+}
+
+int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
+			 struct nfs4_sequence_args *args,
+			 struct nfs4_sequence_res *res,
+			 struct rpc_task *task)
+{
+	struct nfs4_slot *slot;
+
+	/* slot already allocated? */
+	if (res->sr_slot != NULL)
+		goto out_start;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+		goto out_sleep;
+
+	slot = nfs4_alloc_slot(tbl);
+	if (IS_ERR(slot)) {
+		if (slot == ERR_PTR(-ENOMEM))
+			task->tk_timeout = HZ >> 2;
+		goto out_sleep;
+	}
+	spin_unlock(&tbl->slot_tbl_lock);
+
+	args->sa_slot = slot;
+	res->sr_slot = slot;
+
+out_start:
+	rpc_call_start(task);
+	return 0;
+
+out_sleep:
+	if (args->sa_privileged)
+		rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
+				NULL, RPC_PRIORITY_PRIVILEGED);
+	else
+		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+	spin_unlock(&tbl->slot_tbl_lock);
+	return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
+
+static int nfs40_sequence_done(struct rpc_task *task,
+			       struct nfs4_sequence_res *res)
+{
+	struct nfs4_slot *slot = res->sr_slot;
+	struct nfs4_slot_table *tbl;
+
+	if (slot == NULL)
+		goto out;
+
+	tbl = slot->table;
+	spin_lock(&tbl->slot_tbl_lock);
+	if (!nfs41_wake_and_assign_slot(tbl, slot))
+		nfs4_free_slot(tbl, slot);
+	spin_unlock(&tbl->slot_tbl_lock);
+
+	res->sr_slot = NULL;
+out:
+	return 1;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+	struct nfs4_session *session;
+	struct nfs4_slot_table *tbl;
+	struct nfs4_slot *slot = res->sr_slot;
+	bool send_new_highest_used_slotid = false;
+
+	tbl = slot->table;
+	session = tbl->session;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	/* Be nice to the server: try to ensure that the last transmitted
+	 * value for highest_user_slotid <= target_highest_slotid
+	 */
+	if (tbl->highest_used_slotid > tbl->target_highest_slotid)
+		send_new_highest_used_slotid = true;
+
+	if (nfs41_wake_and_assign_slot(tbl, slot)) {
+		send_new_highest_used_slotid = false;
+		goto out_unlock;
+	}
+	nfs4_free_slot(tbl, slot);
+
+	if (tbl->highest_used_slotid != NFS4_NO_SLOT)
+		send_new_highest_used_slotid = false;
+out_unlock:
+	spin_unlock(&tbl->slot_tbl_lock);
+	res->sr_slot = NULL;
+	if (send_new_highest_used_slotid)
+		nfs41_server_notify_highest_slotid_update(session->clp);
+}
+
+int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	struct nfs4_session *session;
+	struct nfs4_slot *slot = res->sr_slot;
+	struct nfs_client *clp;
+	bool interrupted = false;
+	int ret = 1;
+
+	if (slot == NULL)
+		goto out_noaction;
+	/* don't increment the sequence number if the task wasn't sent */
+	if (!RPC_WAS_SENT(task))
+		goto out;
+
+	session = slot->table->session;
+
+	if (slot->interrupted) {
+		slot->interrupted = 0;
+		interrupted = true;
+	}
+
+	trace_nfs4_sequence_done(session, res);
+	/* Check the SEQUENCE operation status */
+	switch (res->sr_status) {
+	case 0:
+		/* Update the slot's sequence and clientid lease timer */
+		++slot->seq_nr;
+		clp = session->clp;
+		do_renew_lease(clp, res->sr_timestamp);
+		/* Check sequence flags */
+		if (res->sr_status_flags != 0)
+			nfs4_schedule_lease_recovery(clp);
+		nfs41_update_target_slotid(slot->table, slot, res);
+		break;
+	case 1:
+		/*
+		 * sr_status remains 1 if an RPC level error occurred.
+		 * The server may or may not have processed the sequence
+		 * operation..
+		 * Mark the slot as having hosted an interrupted RPC call.
+		 */
+		slot->interrupted = 1;
+		goto out;
+	case -NFS4ERR_DELAY:
+		/* The server detected a resend of the RPC call and
+		 * returned NFS4ERR_DELAY as per Section 2.10.6.2
+		 * of RFC5661.
+		 */
+		dprintk("%s: slot=%u seq=%u: Operation in progress\n",
+			__func__,
+			slot->slot_nr,
+			slot->seq_nr);
+		goto out_retry;
+	case -NFS4ERR_BADSLOT:
+		/*
+		 * The slot id we used was probably retired. Try again
+		 * using a different slot id.
+		 */
+		goto retry_nowait;
+	case -NFS4ERR_SEQ_MISORDERED:
+		/*
+		 * Was the last operation on this sequence interrupted?
+		 * If so, retry after bumping the sequence number.
+		 */
+		if (interrupted) {
+			++slot->seq_nr;
+			goto retry_nowait;
+		}
+		/*
+		 * Could this slot have been previously retired?
+		 * If so, then the server may be expecting seq_nr = 1!
+		 */
+		if (slot->seq_nr != 1) {
+			slot->seq_nr = 1;
+			goto retry_nowait;
+		}
+		break;
+	case -NFS4ERR_SEQ_FALSE_RETRY:
+		++slot->seq_nr;
+		goto retry_nowait;
+	default:
+		/* Just update the slot sequence no. */
+		++slot->seq_nr;
+	}
+out:
+	/* The session may be reset by one of the error handlers. */
+	dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
+	nfs41_sequence_free_slot(res);
+out_noaction:
+	return ret;
+retry_nowait:
+	if (rpc_restart_call_prepare(task)) {
+		task->tk_status = 0;
+		ret = 0;
+	}
+	goto out;
+out_retry:
+	if (!rpc_restart_call(task))
+		goto out;
+	rpc_delay(task, NFS4_POLL_RETRY_MAX);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs41_sequence_done);
+
+int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	if (res->sr_slot == NULL)
+		return 1;
+	if (!res->sr_slot->table->session)
+		return nfs40_sequence_done(task, res);
+	return nfs41_sequence_done(task, res);
+}
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
+
+int nfs41_setup_sequence(struct nfs4_session *session,
+				struct nfs4_sequence_args *args,
+				struct nfs4_sequence_res *res,
+				struct rpc_task *task)
+{
+	struct nfs4_slot *slot;
+	struct nfs4_slot_table *tbl;
+
+	dprintk("--> %s\n", __func__);
+	/* slot already allocated? */
+	if (res->sr_slot != NULL)
+		goto out_success;
+
+	tbl = &session->fc_slot_table;
+
+	task->tk_timeout = 0;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) &&
+	    !args->sa_privileged) {
+		/* The state manager will wait until the slot table is empty */
+		dprintk("%s session is draining\n", __func__);
+		goto out_sleep;
+	}
+
+	slot = nfs4_alloc_slot(tbl);
+	if (IS_ERR(slot)) {
+		/* If out of memory, try again in 1/4 second */
+		if (slot == ERR_PTR(-ENOMEM))
+			task->tk_timeout = HZ >> 2;
+		dprintk("<-- %s: no free slots\n", __func__);
+		goto out_sleep;
+	}
+	spin_unlock(&tbl->slot_tbl_lock);
+
+	args->sa_slot = slot;
+
+	dprintk("<-- %s slotid=%u seqid=%u\n", __func__,
+			slot->slot_nr, slot->seq_nr);
+
+	res->sr_slot = slot;
+	res->sr_timestamp = jiffies;
+	res->sr_status_flags = 0;
+	/*
+	 * sr_status is only set in decode_sequence, and so will remain
+	 * set to 1 if an rpc level failure occurs.
+	 */
+	res->sr_status = 1;
+	trace_nfs4_setup_sequence(session, args);
+out_success:
+	rpc_call_start(task);
+	return 0;
+out_sleep:
+	/* Privileged tasks are queued with top priority */
+	if (args->sa_privileged)
+		rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
+				NULL, RPC_PRIORITY_PRIVILEGED);
+	else
+		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+	spin_unlock(&tbl->slot_tbl_lock);
+	return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
+
+static int nfs4_setup_sequence(const struct nfs_server *server,
+			       struct nfs4_sequence_args *args,
+			       struct nfs4_sequence_res *res,
+			       struct rpc_task *task)
+{
+	struct nfs4_session *session = nfs4_get_session(server);
+	int ret = 0;
+
+	if (!session)
+		return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+					    args, res, task);
+
+	dprintk("--> %s clp %p session %p sr_slot %u\n",
+		__func__, session->clp, session, res->sr_slot ?
+			res->sr_slot->slot_nr : NFS4_NO_SLOT);
+
+	ret = nfs41_setup_sequence(session, args, res, task);
+
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
+}
+
+static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_call_sync_data *data = calldata;
+	struct nfs4_session *session = nfs4_get_session(data->seq_server);
+
+	dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
+
+	nfs41_setup_sequence(session, data->seq_args, data->seq_res, task);
+}
+
+static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_call_sync_data *data = calldata;
+
+	nfs41_sequence_done(task, data->seq_res);
+}
+
+static const struct rpc_call_ops nfs41_call_sync_ops = {
+	.rpc_call_prepare = nfs41_call_sync_prepare,
+	.rpc_call_done = nfs41_call_sync_done,
+};
+
+#else	/* !CONFIG_NFS_V4_1 */
+
+static int nfs4_setup_sequence(const struct nfs_server *server,
+			       struct nfs4_sequence_args *args,
+			       struct nfs4_sequence_res *res,
+			       struct rpc_task *task)
+{
+	return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+				    args, res, task);
+}
+
+int nfs4_sequence_done(struct rpc_task *task,
+		       struct nfs4_sequence_res *res)
+{
+	return nfs40_sequence_done(task, res);
+}
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
+
+#endif	/* !CONFIG_NFS_V4_1 */
+
+static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_call_sync_data *data = calldata;
+	nfs4_setup_sequence(data->seq_server,
+				data->seq_args, data->seq_res, task);
+}
+
+static void nfs40_call_sync_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_call_sync_data *data = calldata;
+	nfs4_sequence_done(task, data->seq_res);
+}
+
+static const struct rpc_call_ops nfs40_call_sync_ops = {
+	.rpc_call_prepare = nfs40_call_sync_prepare,
+	.rpc_call_done = nfs40_call_sync_done,
+};
+
+static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
+				   struct nfs_server *server,
+				   struct rpc_message *msg,
+				   struct nfs4_sequence_args *args,
+				   struct nfs4_sequence_res *res)
+{
+	int ret;
+	struct rpc_task *task;
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_call_sync_data data = {
+		.seq_server = server,
+		.seq_args = args,
+		.seq_res = res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = clnt,
+		.rpc_message = msg,
+		.callback_ops = clp->cl_mvops->call_sync_ops,
+		.callback_data = &data
+	};
+
+	task = rpc_run_task(&task_setup);
+	if (IS_ERR(task))
+		ret = PTR_ERR(task);
+	else {
+		ret = task->tk_status;
+		rpc_put_task(task);
+	}
+	return ret;
+}
+
+int nfs4_call_sync(struct rpc_clnt *clnt,
+		   struct nfs_server *server,
+		   struct rpc_message *msg,
+		   struct nfs4_sequence_args *args,
+		   struct nfs4_sequence_res *res,
+		   int cache_reply)
+{
+	nfs4_init_sequence(args, res, cache_reply);
+	return nfs4_call_sync_sequence(clnt, server, msg, args, res);
+}
+
+static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+
+	spin_lock(&dir->i_lock);
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+	if (!cinfo->atomic || cinfo->before != dir->i_version)
+		nfs_force_lookup_revalidate(dir);
+	dir->i_version = cinfo->after;
+	nfsi->attr_gencount = nfs_inc_attr_generation_counter();
+	nfs_fscache_invalidate(dir);
+	spin_unlock(&dir->i_lock);
+}
+
+struct nfs4_opendata {
+	struct kref kref;
+	struct nfs_openargs o_arg;
+	struct nfs_openres o_res;
+	struct nfs_open_confirmargs c_arg;
+	struct nfs_open_confirmres c_res;
+	struct nfs4_string owner_name;
+	struct nfs4_string group_name;
+	struct nfs_fattr f_attr;
+	struct nfs4_label *f_label;
+	struct dentry *dir;
+	struct dentry *dentry;
+	struct nfs4_state_owner *owner;
+	struct nfs4_state *state;
+	struct iattr attrs;
+	unsigned long timestamp;
+	unsigned int rpc_done : 1;
+	unsigned int file_created : 1;
+	unsigned int is_recover : 1;
+	int rpc_status;
+	int cancelled;
+};
+
+static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
+		int err, struct nfs4_exception *exception)
+{
+	if (err != -EINVAL)
+		return false;
+	if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+		return false;
+	server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1;
+	exception->retry = 1;
+	return true;
+}
+
+static u32
+nfs4_map_atomic_open_share(struct nfs_server *server,
+		fmode_t fmode, int openflags)
+{
+	u32 res = 0;
+
+	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
+	case FMODE_READ:
+		res = NFS4_SHARE_ACCESS_READ;
+		break;
+	case FMODE_WRITE:
+		res = NFS4_SHARE_ACCESS_WRITE;
+		break;
+	case FMODE_READ|FMODE_WRITE:
+		res = NFS4_SHARE_ACCESS_BOTH;
+	}
+	if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+		goto out;
+	/* Want no delegation if we're using O_DIRECT */
+	if (openflags & O_DIRECT)
+		res |= NFS4_SHARE_WANT_NO_DELEG;
+out:
+	return res;
+}
+
+static enum open_claim_type4
+nfs4_map_atomic_open_claim(struct nfs_server *server,
+		enum open_claim_type4 claim)
+{
+	if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+		return claim;
+	switch (claim) {
+	default:
+		return claim;
+	case NFS4_OPEN_CLAIM_FH:
+		return NFS4_OPEN_CLAIM_NULL;
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+		return NFS4_OPEN_CLAIM_DELEGATE_CUR;
+	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+		return NFS4_OPEN_CLAIM_DELEGATE_PREV;
+	}
+}
+
+static void nfs4_init_opendata_res(struct nfs4_opendata *p)
+{
+	p->o_res.f_attr = &p->f_attr;
+	p->o_res.f_label = p->f_label;
+	p->o_res.seqid = p->o_arg.seqid;
+	p->c_res.seqid = p->c_arg.seqid;
+	p->o_res.server = p->o_arg.server;
+	p->o_res.access_request = p->o_arg.access;
+	nfs_fattr_init(&p->f_attr);
+	nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
+}
+
+static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
+		struct nfs4_state_owner *sp, fmode_t fmode, int flags,
+		const struct iattr *attrs,
+		struct nfs4_label *label,
+		enum open_claim_type4 claim,
+		gfp_t gfp_mask)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *dir = d_inode(parent);
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+	struct nfs4_opendata *p;
+
+	p = kzalloc(sizeof(*p), gfp_mask);
+	if (p == NULL)
+		goto err;
+
+	p->f_label = nfs4_label_alloc(server, gfp_mask);
+	if (IS_ERR(p->f_label))
+		goto err_free_p;
+
+	alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+	p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
+	if (IS_ERR(p->o_arg.seqid))
+		goto err_free_label;
+	nfs_sb_active(dentry->d_sb);
+	p->dentry = dget(dentry);
+	p->dir = parent;
+	p->owner = sp;
+	atomic_inc(&sp->so_count);
+	p->o_arg.open_flags = flags;
+	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+	p->o_arg.share_access = nfs4_map_atomic_open_share(server,
+			fmode, flags);
+	/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
+	 * will return permission denied for all bits until close */
+	if (!(flags & O_EXCL)) {
+		/* ask server to check for all possible rights as results
+		 * are cached */
+		p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
+				  NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE;
+	}
+	p->o_arg.clientid = server->nfs_client->cl_clientid;
+	p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
+	p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
+	p->o_arg.name = &dentry->d_name;
+	p->o_arg.server = server;
+	p->o_arg.bitmask = nfs4_bitmask(server, label);
+	p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
+	p->o_arg.label = label;
+	p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
+	switch (p->o_arg.claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+		p->o_arg.fh = NFS_FH(dir);
+		break;
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+	case NFS4_OPEN_CLAIM_FH:
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+		p->o_arg.fh = NFS_FH(d_inode(dentry));
+	}
+	if (attrs != NULL && attrs->ia_valid != 0) {
+		__u32 verf[2];
+
+		p->o_arg.u.attrs = &p->attrs;
+		memcpy(&p->attrs, attrs, sizeof(p->attrs));
+
+		verf[0] = jiffies;
+		verf[1] = current->pid;
+		memcpy(p->o_arg.u.verifier.data, verf,
+				sizeof(p->o_arg.u.verifier.data));
+	}
+	p->c_arg.fh = &p->o_res.fh;
+	p->c_arg.stateid = &p->o_res.stateid;
+	p->c_arg.seqid = p->o_arg.seqid;
+	nfs4_init_opendata_res(p);
+	kref_init(&p->kref);
+	return p;
+
+err_free_label:
+	nfs4_label_free(p->f_label);
+err_free_p:
+	kfree(p);
+err:
+	dput(parent);
+	return NULL;
+}
+
+static void nfs4_opendata_free(struct kref *kref)
+{
+	struct nfs4_opendata *p = container_of(kref,
+			struct nfs4_opendata, kref);
+	struct super_block *sb = p->dentry->d_sb;
+
+	nfs_free_seqid(p->o_arg.seqid);
+	if (p->state != NULL)
+		nfs4_put_open_state(p->state);
+	nfs4_put_state_owner(p->owner);
+
+	nfs4_label_free(p->f_label);
+
+	dput(p->dir);
+	dput(p->dentry);
+	nfs_sb_deactive(sb);
+	nfs_fattr_free_names(&p->f_attr);
+	kfree(p->f_attr.mdsthreshold);
+	kfree(p);
+}
+
+static void nfs4_opendata_put(struct nfs4_opendata *p)
+{
+	if (p != NULL)
+		kref_put(&p->kref, nfs4_opendata_free);
+}
+
+static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
+{
+	int ret;
+
+	ret = rpc_wait_for_completion_task(task);
+	return ret;
+}
+
+static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
+{
+	int ret = 0;
+
+	if (open_mode & (O_EXCL|O_TRUNC))
+		goto out;
+	switch (mode & (FMODE_READ|FMODE_WRITE)) {
+		case FMODE_READ:
+			ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
+				&& state->n_rdonly != 0;
+			break;
+		case FMODE_WRITE:
+			ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0
+				&& state->n_wronly != 0;
+			break;
+		case FMODE_READ|FMODE_WRITE:
+			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0
+				&& state->n_rdwr != 0;
+	}
+out:
+	return ret;
+}
+
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
+{
+	if (delegation == NULL)
+		return 0;
+	if ((delegation->type & fmode) != fmode)
+		return 0;
+	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		return 0;
+	nfs_mark_delegation_referenced(delegation);
+	return 1;
+}
+
+static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
+{
+	switch (fmode) {
+		case FMODE_WRITE:
+			state->n_wronly++;
+			break;
+		case FMODE_READ:
+			state->n_rdonly++;
+			break;
+		case FMODE_READ|FMODE_WRITE:
+			state->n_rdwr++;
+	}
+	nfs4_state_set_mode_locked(state, state->state | fmode);
+}
+
+static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
+{
+	struct nfs_client *clp = state->owner->so_server->nfs_client;
+	bool need_recover = false;
+
+	if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly)
+		need_recover = true;
+	if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly)
+		need_recover = true;
+	if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr)
+		need_recover = true;
+	if (need_recover)
+		nfs4_state_mark_reclaim_nograce(clp, state);
+}
+
+static bool nfs_need_update_open_stateid(struct nfs4_state *state,
+		nfs4_stateid *stateid)
+{
+	if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
+		return true;
+	if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+		nfs_test_and_clear_all_open_stateid(state);
+		return true;
+	}
+	if (nfs4_stateid_is_newer(stateid, &state->open_stateid))
+		return true;
+	return false;
+}
+
+static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
+{
+	if (state->n_wronly)
+		set_bit(NFS_O_WRONLY_STATE, &state->flags);
+	if (state->n_rdonly)
+		set_bit(NFS_O_RDONLY_STATE, &state->flags);
+	if (state->n_rdwr)
+		set_bit(NFS_O_RDWR_STATE, &state->flags);
+}
+
+static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+		nfs4_stateid *stateid, fmode_t fmode)
+{
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
+	case FMODE_WRITE:
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+		break;
+	case FMODE_READ:
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+		break;
+	case 0:
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+		clear_bit(NFS_OPEN_STATE, &state->flags);
+	}
+	if (stateid == NULL)
+		return;
+	/* Handle races with OPEN */
+	if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
+	    !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+		nfs_resync_open_stateid_locked(state);
+		return;
+	}
+	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+		nfs4_stateid_copy(&state->stateid, stateid);
+	nfs4_stateid_copy(&state->open_stateid, stateid);
+}
+
+static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+{
+	write_seqlock(&state->seqlock);
+	nfs_clear_open_stateid_locked(state, stateid, fmode);
+	write_sequnlock(&state->seqlock);
+	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+}
+
+static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+{
+	switch (fmode) {
+		case FMODE_READ:
+			set_bit(NFS_O_RDONLY_STATE, &state->flags);
+			break;
+		case FMODE_WRITE:
+			set_bit(NFS_O_WRONLY_STATE, &state->flags);
+			break;
+		case FMODE_READ|FMODE_WRITE:
+			set_bit(NFS_O_RDWR_STATE, &state->flags);
+	}
+	if (!nfs_need_update_open_stateid(state, stateid))
+		return;
+	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+		nfs4_stateid_copy(&state->stateid, stateid);
+	nfs4_stateid_copy(&state->open_stateid, stateid);
+}
+
+static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
+{
+	/*
+	 * Protect the call to nfs4_state_set_mode_locked and
+	 * serialise the stateid update
+	 */
+	write_seqlock(&state->seqlock);
+	if (deleg_stateid != NULL) {
+		nfs4_stateid_copy(&state->stateid, deleg_stateid);
+		set_bit(NFS_DELEGATED_STATE, &state->flags);
+	}
+	if (open_stateid != NULL)
+		nfs_set_open_stateid_locked(state, open_stateid, fmode);
+	write_sequnlock(&state->seqlock);
+	spin_lock(&state->owner->so_lock);
+	update_open_stateflags(state, fmode);
+	spin_unlock(&state->owner->so_lock);
+}
+
+static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+{
+	struct nfs_inode *nfsi = NFS_I(state->inode);
+	struct nfs_delegation *deleg_cur;
+	int ret = 0;
+
+	fmode &= (FMODE_READ|FMODE_WRITE);
+
+	rcu_read_lock();
+	deleg_cur = rcu_dereference(nfsi->delegation);
+	if (deleg_cur == NULL)
+		goto no_delegation;
+
+	spin_lock(&deleg_cur->lock);
+	if (rcu_dereference(nfsi->delegation) != deleg_cur ||
+	   test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||
+	    (deleg_cur->type & fmode) != fmode)
+		goto no_delegation_unlock;
+
+	if (delegation == NULL)
+		delegation = &deleg_cur->stateid;
+	else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation))
+		goto no_delegation_unlock;
+
+	nfs_mark_delegation_referenced(deleg_cur);
+	__update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+	ret = 1;
+no_delegation_unlock:
+	spin_unlock(&deleg_cur->lock);
+no_delegation:
+	rcu_read_unlock();
+
+	if (!ret && open_stateid != NULL) {
+		__update_open_stateid(state, open_stateid, NULL, fmode);
+		ret = 1;
+	}
+	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+
+	return ret;
+}
+
+static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
+		const nfs4_stateid *stateid)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool ret = false;
+
+	spin_lock(&state->state_lock);
+	if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid))
+		goto out_noupdate;
+	if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid))
+		goto out_noupdate;
+	nfs4_stateid_copy(&lsp->ls_stateid, stateid);
+	ret = true;
+out_noupdate:
+	spin_unlock(&state->state_lock);
+	return ret;
+}
+
+static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
+{
+	struct nfs_delegation *delegation;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation == NULL || (delegation->type & fmode) == fmode) {
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
+	nfs4_inode_return_delegation(inode);
+}
+
+static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
+{
+	struct nfs4_state *state = opendata->state;
+	struct nfs_inode *nfsi = NFS_I(state->inode);
+	struct nfs_delegation *delegation;
+	int open_mode = opendata->o_arg.open_flags;
+	fmode_t fmode = opendata->o_arg.fmode;
+	nfs4_stateid stateid;
+	int ret = -EAGAIN;
+
+	for (;;) {
+		spin_lock(&state->owner->so_lock);
+		if (can_open_cached(state, fmode, open_mode)) {
+			update_open_stateflags(state, fmode);
+			spin_unlock(&state->owner->so_lock);
+			goto out_return_state;
+		}
+		spin_unlock(&state->owner->so_lock);
+		rcu_read_lock();
+		delegation = rcu_dereference(nfsi->delegation);
+		if (!can_open_delegated(delegation, fmode)) {
+			rcu_read_unlock();
+			break;
+		}
+		/* Save the delegation */
+		nfs4_stateid_copy(&stateid, &delegation->stateid);
+		rcu_read_unlock();
+		nfs_release_seqid(opendata->o_arg.seqid);
+		if (!opendata->is_recover) {
+			ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
+			if (ret != 0)
+				goto out;
+		}
+		ret = -EAGAIN;
+
+		/* Try to update the stateid using the delegation */
+		if (update_open_stateid(state, NULL, &stateid, fmode))
+			goto out_return_state;
+	}
+out:
+	return ERR_PTR(ret);
+out_return_state:
+	atomic_inc(&state->count);
+	return state;
+}
+
+static void
+nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
+{
+	struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client;
+	struct nfs_delegation *delegation;
+	int delegation_flags = 0;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+	if (delegation)
+		delegation_flags = delegation->flags;
+	rcu_read_unlock();
+	if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
+		pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
+				   "returning a delegation for "
+				   "OPEN(CLAIM_DELEGATE_CUR)\n",
+				   clp->cl_hostname);
+	} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+		nfs_inode_set_delegation(state->inode,
+					 data->owner->so_cred,
+					 &data->o_res);
+	else
+		nfs_inode_reclaim_delegation(state->inode,
+					     data->owner->so_cred,
+					     &data->o_res);
+}
+
+/*
+ * Check the inode attributes against the CLAIM_PREVIOUS returned attributes
+ * and update the nfs4_state.
+ */
+static struct nfs4_state *
+_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
+{
+	struct inode *inode = data->state->inode;
+	struct nfs4_state *state = data->state;
+	int ret;
+
+	if (!data->rpc_done) {
+		if (data->rpc_status) {
+			ret = data->rpc_status;
+			goto err;
+		}
+		/* cached opens have already been processed */
+		goto update;
+	}
+
+	ret = nfs_refresh_inode(inode, &data->f_attr);
+	if (ret)
+		goto err;
+
+	if (data->o_res.delegation_type != 0)
+		nfs4_opendata_check_deleg(data, state);
+update:
+	update_open_stateid(state, &data->o_res.stateid, NULL,
+			    data->o_arg.fmode);
+	atomic_inc(&state->count);
+
+	return state;
+err:
+	return ERR_PTR(ret);
+
+}
+
+static struct nfs4_state *
+_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+	struct inode *inode;
+	struct nfs4_state *state = NULL;
+	int ret;
+
+	if (!data->rpc_done) {
+		state = nfs4_try_open_cached(data);
+		goto out;
+	}
+
+	ret = -EAGAIN;
+	if (!(data->f_attr.valid & NFS_ATTR_FATTR))
+		goto err;
+	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
+	ret = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto err;
+	ret = -ENOMEM;
+	state = nfs4_get_open_state(inode, data->owner);
+	if (state == NULL)
+		goto err_put_inode;
+	if (data->o_res.delegation_type != 0)
+		nfs4_opendata_check_deleg(data, state);
+	update_open_stateid(state, &data->o_res.stateid, NULL,
+			data->o_arg.fmode);
+	iput(inode);
+out:
+	nfs_release_seqid(data->o_arg.seqid);
+	return state;
+err_put_inode:
+	iput(inode);
+err:
+	return ERR_PTR(ret);
+}
+
+static struct nfs4_state *
+nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
+		return _nfs4_opendata_reclaim_to_nfs4_state(data);
+	return _nfs4_opendata_to_nfs4_state(data);
+}
+
+static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
+{
+	struct nfs_inode *nfsi = NFS_I(state->inode);
+	struct nfs_open_context *ctx;
+
+	spin_lock(&state->inode->i_lock);
+	list_for_each_entry(ctx, &nfsi->open_files, list) {
+		if (ctx->state != state)
+			continue;
+		get_nfs_open_context(ctx);
+		spin_unlock(&state->inode->i_lock);
+		return ctx;
+	}
+	spin_unlock(&state->inode->i_lock);
+	return ERR_PTR(-ENOENT);
+}
+
+static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
+		struct nfs4_state *state, enum open_claim_type4 claim)
+{
+	struct nfs4_opendata *opendata;
+
+	opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
+			NULL, NULL, claim, GFP_NOFS);
+	if (opendata == NULL)
+		return ERR_PTR(-ENOMEM);
+	opendata->state = state;
+	atomic_inc(&state->count);
+	return opendata;
+}
+
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
+{
+	struct nfs4_state *newstate;
+	int ret;
+
+	opendata->o_arg.open_flags = 0;
+	opendata->o_arg.fmode = fmode;
+	opendata->o_arg.share_access = nfs4_map_atomic_open_share(
+			NFS_SB(opendata->dentry->d_sb),
+			fmode, 0);
+	memset(&opendata->o_res, 0, sizeof(opendata->o_res));
+	memset(&opendata->c_res, 0, sizeof(opendata->c_res));
+	nfs4_init_opendata_res(opendata);
+	ret = _nfs4_recover_proc_open(opendata);
+	if (ret != 0)
+		return ret; 
+	newstate = nfs4_opendata_to_nfs4_state(opendata);
+	if (IS_ERR(newstate))
+		return PTR_ERR(newstate);
+	nfs4_close_state(newstate, fmode);
+	*res = newstate;
+	return 0;
+}
+
+static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state)
+{
+	struct nfs4_state *newstate;
+	int ret;
+
+	/* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+	clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+	/* memory barrier prior to reading state->n_* */
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+	clear_bit(NFS_OPEN_STATE, &state->flags);
+	smp_rmb();
+	if (state->n_rdwr != 0) {
+		ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
+		if (ret != 0)
+			return ret;
+		if (newstate != state)
+			return -ESTALE;
+	}
+	if (state->n_wronly != 0) {
+		ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
+		if (ret != 0)
+			return ret;
+		if (newstate != state)
+			return -ESTALE;
+	}
+	if (state->n_rdonly != 0) {
+		ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
+		if (ret != 0)
+			return ret;
+		if (newstate != state)
+			return -ESTALE;
+	}
+	/*
+	 * We may have performed cached opens for all three recoveries.
+	 * Check if we need to update the current stateid.
+	 */
+	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 &&
+	    !nfs4_stateid_match(&state->stateid, &state->open_stateid)) {
+		write_seqlock(&state->seqlock);
+		if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+			nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+		write_sequnlock(&state->seqlock);
+	}
+	return 0;
+}
+
+/*
+ * OPEN_RECLAIM:
+ * 	reclaim state on the server after a reboot.
+ */
+static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+	struct nfs_delegation *delegation;
+	struct nfs4_opendata *opendata;
+	fmode_t delegation_type = 0;
+	int status;
+
+	opendata = nfs4_open_recoverdata_alloc(ctx, state,
+			NFS4_OPEN_CLAIM_PREVIOUS);
+	if (IS_ERR(opendata))
+		return PTR_ERR(opendata);
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+	if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
+		delegation_type = delegation->type;
+	rcu_read_unlock();
+	opendata->o_arg.u.delegation_type = delegation_type;
+	status = nfs4_open_recover(opendata, state);
+	nfs4_opendata_put(opendata);
+	return status;
+}
+
+static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_do_open_reclaim(ctx, state);
+		trace_nfs4_open_reclaim(ctx, 0, err);
+		if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+			continue;
+		if (err != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	struct nfs_open_context *ctx;
+	int ret;
+
+	ctx = nfs4_state_find_open_context(state);
+	if (IS_ERR(ctx))
+		return -EAGAIN;
+	ret = nfs4_do_open_reclaim(ctx, state);
+	put_nfs_open_context(ctx);
+	return ret;
+}
+
+static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err)
+{
+	switch (err) {
+		default:
+			printk(KERN_ERR "NFS: %s: unhandled error "
+					"%d.\n", __func__, err);
+		case 0:
+		case -ENOENT:
+		case -ESTALE:
+			break;
+		case -NFS4ERR_BADSESSION:
+		case -NFS4ERR_BADSLOT:
+		case -NFS4ERR_BAD_HIGH_SLOT:
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		case -NFS4ERR_DEADSESSION:
+			set_bit(NFS_DELEGATED_STATE, &state->flags);
+			nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
+			return -EAGAIN;
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_STALE_STATEID:
+			set_bit(NFS_DELEGATED_STATE, &state->flags);
+		case -NFS4ERR_EXPIRED:
+			/* Don't recall a delegation if it was lost */
+			nfs4_schedule_lease_recovery(server->nfs_client);
+			return -EAGAIN;
+		case -NFS4ERR_MOVED:
+			nfs4_schedule_migration_recovery(server);
+			return -EAGAIN;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(server->nfs_client);
+			return -EAGAIN;
+		case -NFS4ERR_DELEG_REVOKED:
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OPENMODE:
+			nfs_inode_find_state_and_recover(state->inode,
+					stateid);
+			nfs4_schedule_stateid_recovery(server, state);
+			return -EAGAIN;
+		case -NFS4ERR_DELAY:
+		case -NFS4ERR_GRACE:
+			set_bit(NFS_DELEGATED_STATE, &state->flags);
+			ssleep(1);
+			return -EAGAIN;
+		case -ENOMEM:
+		case -NFS4ERR_DENIED:
+			/* kill_proc(fl->fl_pid, SIGLOST, 1); */
+			return 0;
+	}
+	return err;
+}
+
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs4_opendata *opendata;
+	int err;
+
+	opendata = nfs4_open_recoverdata_alloc(ctx, state,
+			NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+	if (IS_ERR(opendata))
+		return PTR_ERR(opendata);
+	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
+	err = nfs4_open_recover(opendata, state);
+	nfs4_opendata_put(opendata);
+	return nfs4_handle_delegation_recall_error(server, state, stateid, err);
+}
+
+static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+
+	nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl,
+			     &data->c_arg.seq_args, &data->c_res.seq_res, task);
+}
+
+static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+
+	nfs40_sequence_done(task, &data->c_res.seq_res);
+
+	data->rpc_status = task->tk_status;
+	if (data->rpc_status == 0) {
+		nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);
+		nfs_confirm_seqid(&data->owner->so_seqid, 0);
+		renew_lease(data->o_res.server, data->timestamp);
+		data->rpc_done = 1;
+	}
+}
+
+static void nfs4_open_confirm_release(void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+	struct nfs4_state *state = NULL;
+
+	/* If this request hasn't been cancelled, do nothing */
+	if (data->cancelled == 0)
+		goto out_free;
+	/* In case of error, no cleanup! */
+	if (!data->rpc_done)
+		goto out_free;
+	state = nfs4_opendata_to_nfs4_state(data);
+	if (!IS_ERR(state))
+		nfs4_close_state(state, data->o_arg.fmode);
+out_free:
+	nfs4_opendata_put(data);
+}
+
+static const struct rpc_call_ops nfs4_open_confirm_ops = {
+	.rpc_call_prepare = nfs4_open_confirm_prepare,
+	.rpc_call_done = nfs4_open_confirm_done,
+	.rpc_release = nfs4_open_confirm_release,
+};
+
+/*
+ * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
+{
+	struct nfs_server *server = NFS_SERVER(d_inode(data->dir));
+	struct rpc_task *task;
+	struct  rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+		.rpc_argp = &data->c_arg,
+		.rpc_resp = &data->c_res,
+		.rpc_cred = data->owner->so_cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_open_confirm_ops,
+		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status;
+
+	nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1);
+	kref_get(&data->kref);
+	data->rpc_done = 0;
+	data->rpc_status = 0;
+	data->timestamp = jiffies;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status != 0) {
+		data->cancelled = 1;
+		smp_wmb();
+	} else
+		status = data->rpc_status;
+	rpc_put_task(task);
+	return status;
+}
+
+static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+	struct nfs4_state_owner *sp = data->owner;
+	struct nfs_client *clp = sp->so_server->nfs_client;
+
+	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
+		goto out_wait;
+	/*
+	 * Check if we still need to send an OPEN call, or if we can use
+	 * a delegation instead.
+	 */
+	if (data->state != NULL) {
+		struct nfs_delegation *delegation;
+
+		if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
+			goto out_no_action;
+		rcu_read_lock();
+		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
+		if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
+		    data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
+		    can_open_delegated(delegation, data->o_arg.fmode))
+			goto unlock_no_action;
+		rcu_read_unlock();
+	}
+	/* Update client id. */
+	data->o_arg.clientid = clp->cl_clientid;
+	switch (data->o_arg.claim) {
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+		data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
+	case NFS4_OPEN_CLAIM_FH:
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
+	}
+	data->timestamp = jiffies;
+	if (nfs4_setup_sequence(data->o_arg.server,
+				&data->o_arg.seq_args,
+				&data->o_res.seq_res,
+				task) != 0)
+		nfs_release_seqid(data->o_arg.seqid);
+
+	/* Set the create mode (note dependency on the session type) */
+	data->o_arg.createmode = NFS4_CREATE_UNCHECKED;
+	if (data->o_arg.open_flags & O_EXCL) {
+		data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
+		if (nfs4_has_persistent_session(clp))
+			data->o_arg.createmode = NFS4_CREATE_GUARDED;
+		else if (clp->cl_mvops->minor_version > 0)
+			data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
+	}
+	return;
+unlock_no_action:
+	rcu_read_unlock();
+out_no_action:
+	task->tk_action = NULL;
+out_wait:
+	nfs4_sequence_done(task, &data->o_res.seq_res);
+}
+
+static void nfs4_open_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+
+	data->rpc_status = task->tk_status;
+
+	if (!nfs4_sequence_done(task, &data->o_res.seq_res))
+		return;
+
+	if (task->tk_status == 0) {
+		if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) {
+			switch (data->o_res.f_attr->mode & S_IFMT) {
+			case S_IFREG:
+				break;
+			case S_IFLNK:
+				data->rpc_status = -ELOOP;
+				break;
+			case S_IFDIR:
+				data->rpc_status = -EISDIR;
+				break;
+			default:
+				data->rpc_status = -ENOTDIR;
+			}
+		}
+		renew_lease(data->o_res.server, data->timestamp);
+		if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM))
+			nfs_confirm_seqid(&data->owner->so_seqid, 0);
+	}
+	data->rpc_done = 1;
+}
+
+static void nfs4_open_release(void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+	struct nfs4_state *state = NULL;
+
+	/* If this request hasn't been cancelled, do nothing */
+	if (data->cancelled == 0)
+		goto out_free;
+	/* In case of error, no cleanup! */
+	if (data->rpc_status != 0 || !data->rpc_done)
+		goto out_free;
+	/* In case we need an open_confirm, no cleanup! */
+	if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)
+		goto out_free;
+	state = nfs4_opendata_to_nfs4_state(data);
+	if (!IS_ERR(state))
+		nfs4_close_state(state, data->o_arg.fmode);
+out_free:
+	nfs4_opendata_put(data);
+}
+
+static const struct rpc_call_ops nfs4_open_ops = {
+	.rpc_call_prepare = nfs4_open_prepare,
+	.rpc_call_done = nfs4_open_done,
+	.rpc_release = nfs4_open_release,
+};
+
+static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
+{
+	struct inode *dir = d_inode(data->dir);
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_openargs *o_arg = &data->o_arg;
+	struct nfs_openres *o_res = &data->o_res;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
+		.rpc_argp = o_arg,
+		.rpc_resp = o_res,
+		.rpc_cred = data->owner->so_cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_open_ops,
+		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status;
+
+	nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);
+	kref_get(&data->kref);
+	data->rpc_done = 0;
+	data->rpc_status = 0;
+	data->cancelled = 0;
+	data->is_recover = 0;
+	if (isrecover) {
+		nfs4_set_sequence_privileged(&o_arg->seq_args);
+		data->is_recover = 1;
+	}
+	task = rpc_run_task(&task_setup_data);
+        if (IS_ERR(task))
+                return PTR_ERR(task);
+        status = nfs4_wait_for_completion_rpc_task(task);
+        if (status != 0) {
+                data->cancelled = 1;
+                smp_wmb();
+        } else
+                status = data->rpc_status;
+        rpc_put_task(task);
+
+	return status;
+}
+
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
+{
+	struct inode *dir = d_inode(data->dir);
+	struct nfs_openres *o_res = &data->o_res;
+        int status;
+
+	status = nfs4_run_open_task(data, 1);
+	if (status != 0 || !data->rpc_done)
+		return status;
+
+	nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
+
+	if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
+		status = _nfs4_proc_open_confirm(data);
+		if (status != 0)
+			return status;
+	}
+
+	return status;
+}
+
+/*
+ * Additional permission checks in order to distinguish between an
+ * open for read, and an open for execute. This works around the
+ * fact that NFSv4 OPEN treats read and execute permissions as being
+ * the same.
+ * Note that in the non-execute case, we want to turn off permission
+ * checking if we just created a new file (POSIX open() semantics).
+ */
+static int nfs4_opendata_access(struct rpc_cred *cred,
+				struct nfs4_opendata *opendata,
+				struct nfs4_state *state, fmode_t fmode,
+				int openflags)
+{
+	struct nfs_access_entry cache;
+	u32 mask;
+
+	/* access call failed or for some reason the server doesn't
+	 * support any access modes -- defer access call until later */
+	if (opendata->o_res.access_supported == 0)
+		return 0;
+
+	mask = 0;
+	/*
+	 * Use openflags to check for exec, because fmode won't
+	 * always have FMODE_EXEC set when file open for exec.
+	 */
+	if (openflags & __FMODE_EXEC) {
+		/* ONLY check for exec rights */
+		mask = MAY_EXEC;
+	} else if ((fmode & FMODE_READ) && !opendata->file_created)
+		mask = MAY_READ;
+
+	cache.cred = cred;
+	cache.jiffies = jiffies;
+	nfs_access_set_mask(&cache, opendata->o_res.access_result);
+	nfs_access_add_cache(state->inode, &cache);
+
+	if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0)
+		return 0;
+
+	/* even though OPEN succeeded, access is denied. Close the file */
+	nfs4_close_state(state, fmode);
+	return -EACCES;
+}
+
+/*
+ * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open(struct nfs4_opendata *data)
+{
+	struct inode *dir = d_inode(data->dir);
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_openargs *o_arg = &data->o_arg;
+	struct nfs_openres *o_res = &data->o_res;
+	int status;
+
+	status = nfs4_run_open_task(data, 0);
+	if (!data->rpc_done)
+		return status;
+	if (status != 0) {
+		if (status == -NFS4ERR_BADNAME &&
+				!(o_arg->open_flags & O_CREAT))
+			return -ENOENT;
+		return status;
+	}
+
+	nfs_fattr_map_and_free_names(server, &data->f_attr);
+
+	if (o_arg->open_flags & O_CREAT) {
+		update_changeattr(dir, &o_res->cinfo);
+		if (o_arg->open_flags & O_EXCL)
+			data->file_created = 1;
+		else if (o_res->cinfo.before != o_res->cinfo.after)
+			data->file_created = 1;
+	}
+	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
+		server->caps &= ~NFS_CAP_POSIX_LOCK;
+	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
+		status = _nfs4_proc_open_confirm(data);
+		if (status != 0)
+			return status;
+	}
+	if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
+		nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
+	return 0;
+}
+
+static int nfs4_recover_expired_lease(struct nfs_server *server)
+{
+	return nfs4_client_recover_expired_lease(server->nfs_client);
+}
+
+/*
+ * OPEN_EXPIRED:
+ * 	reclaim state on the server after a network partition.
+ * 	Assumes caller holds the appropriate lock
+ */
+static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+	struct nfs4_opendata *opendata;
+	int ret;
+
+	opendata = nfs4_open_recoverdata_alloc(ctx, state,
+			NFS4_OPEN_CLAIM_FH);
+	if (IS_ERR(opendata))
+		return PTR_ERR(opendata);
+	ret = nfs4_open_recover(opendata, state);
+	if (ret == -ESTALE)
+		d_drop(ctx->dentry);
+	nfs4_opendata_put(opendata);
+	return ret;
+}
+
+static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = _nfs4_open_expired(ctx, state);
+		trace_nfs4_open_expired(ctx, 0, err);
+		if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+			continue;
+		switch (err) {
+		default:
+			goto out;
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+			nfs4_handle_exception(server, err, &exception);
+			err = 0;
+		}
+	} while (exception.retry);
+out:
+	return err;
+}
+
+static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	struct nfs_open_context *ctx;
+	int ret;
+
+	ctx = nfs4_state_find_open_context(state);
+	if (IS_ERR(ctx))
+		return -EAGAIN;
+	ret = nfs4_do_open_expired(ctx, state);
+	put_nfs_open_context(ctx);
+	return ret;
+}
+
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+	nfs_remove_bad_delegation(state->inode);
+	write_seqlock(&state->seqlock);
+	nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+	write_sequnlock(&state->seqlock);
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+	if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+		nfs_finish_clear_delegation_stateid(state);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	/* NFSv4.0 doesn't allow for delegation recovery on open expire */
+	nfs40_clear_delegation_stateid(state);
+	return nfs4_open_expired(sp, state);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	nfs4_stateid stateid;
+	struct nfs_delegation *delegation;
+	struct rpc_cred *cred;
+	int status;
+
+	/* Get the delegation credential for use by test/free_stateid */
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+	if (delegation == NULL) {
+		rcu_read_unlock();
+		return;
+	}
+
+	nfs4_stateid_copy(&stateid, &delegation->stateid);
+	cred = get_rpccred(delegation->cred);
+	rcu_read_unlock();
+	status = nfs41_test_stateid(server, &stateid, cred);
+	trace_nfs4_test_delegation_stateid(state, NULL, status);
+
+	if (status != NFS_OK) {
+		/* Free the stateid unless the server explicitly
+		 * informs us the stateid is unrecognized. */
+		if (status != -NFS4ERR_BAD_STATEID)
+			nfs41_free_stateid(server, &stateid, cred);
+		nfs_finish_clear_delegation_stateid(state);
+	}
+
+	put_rpccred(cred);
+}
+
+/**
+ * nfs41_check_open_stateid - possibly free an open stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_open_stateid(struct nfs4_state *state)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	nfs4_stateid *stateid = &state->open_stateid;
+	struct rpc_cred *cred = state->owner->so_cred;
+	int status;
+
+	/* If a state reset has been done, test_stateid is unneeded */
+	if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) &&
+	    (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) &&
+	    (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
+		return -NFS4ERR_BAD_STATEID;
+
+	status = nfs41_test_stateid(server, stateid, cred);
+	trace_nfs4_test_open_stateid(state, NULL, status);
+	if (status != NFS_OK) {
+		/* Free the stateid unless the server explicitly
+		 * informs us the stateid is unrecognized. */
+		if (status != -NFS4ERR_BAD_STATEID)
+			nfs41_free_stateid(server, stateid, cred);
+
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+		clear_bit(NFS_O_RDWR_STATE, &state->flags);
+		clear_bit(NFS_OPEN_STATE, &state->flags);
+	}
+	return status;
+}
+
+static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	int status;
+
+	nfs41_check_delegation_stateid(state);
+	status = nfs41_check_open_stateid(state);
+	if (status != NFS_OK)
+		status = nfs4_open_expired(sp, state);
+	return status;
+}
+#endif
+
+/*
+ * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*
+ * fields corresponding to attributes that were used to store the verifier.
+ * Make sure we clobber those fields in the later setattr call
+ */
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+{
+	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+	    !(sattr->ia_valid & ATTR_ATIME_SET))
+		sattr->ia_valid |= ATTR_ATIME;
+
+	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+	    !(sattr->ia_valid & ATTR_MTIME_SET))
+		sattr->ia_valid |= ATTR_MTIME;
+}
+
+static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
+		fmode_t fmode,
+		int flags,
+		struct nfs_open_context *ctx)
+{
+	struct nfs4_state_owner *sp = opendata->owner;
+	struct nfs_server *server = sp->so_server;
+	struct dentry *dentry;
+	struct nfs4_state *state;
+	unsigned int seq;
+	int ret;
+
+	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+
+	ret = _nfs4_proc_open(opendata);
+	if (ret != 0)
+		goto out;
+
+	state = nfs4_opendata_to_nfs4_state(opendata);
+	ret = PTR_ERR(state);
+	if (IS_ERR(state))
+		goto out;
+	if (server->caps & NFS_CAP_POSIX_LOCK)
+		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+
+	dentry = opendata->dentry;
+	if (d_really_is_negative(dentry)) {
+		/* FIXME: Is this d_drop() ever needed? */
+		d_drop(dentry);
+		dentry = d_add_unique(dentry, igrab(state->inode));
+		if (dentry == NULL) {
+			dentry = opendata->dentry;
+		} else if (dentry != ctx->dentry) {
+			dput(ctx->dentry);
+			ctx->dentry = dget(dentry);
+		}
+		nfs_set_verifier(dentry,
+				nfs_save_change_attribute(d_inode(opendata->dir)));
+	}
+
+	ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
+	if (ret != 0)
+		goto out;
+
+	ctx->state = state;
+	if (d_inode(dentry) == state->inode) {
+		nfs_inode_attach_open_context(ctx);
+		if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+			nfs4_schedule_stateid_recovery(server, state);
+	}
+out:
+	return ret;
+}
+
+/*
+ * Returns a referenced nfs4_state
+ */
+static int _nfs4_do_open(struct inode *dir,
+			struct nfs_open_context *ctx,
+			int flags,
+			struct iattr *sattr,
+			struct nfs4_label *label,
+			int *opened)
+{
+	struct nfs4_state_owner  *sp;
+	struct nfs4_state     *state = NULL;
+	struct nfs_server       *server = NFS_SERVER(dir);
+	struct nfs4_opendata *opendata;
+	struct dentry *dentry = ctx->dentry;
+	struct rpc_cred *cred = ctx->cred;
+	struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+	fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+	enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+	struct nfs4_label *olabel = NULL;
+	int status;
+
+	/* Protect against reboot recovery conflicts */
+	status = -ENOMEM;
+	sp = nfs4_get_state_owner(server, cred, GFP_KERNEL);
+	if (sp == NULL) {
+		dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
+		goto out_err;
+	}
+	status = nfs4_recover_expired_lease(server);
+	if (status != 0)
+		goto err_put_state_owner;
+	if (d_really_is_positive(dentry))
+		nfs4_return_incompatible_delegation(d_inode(dentry), fmode);
+	status = -ENOMEM;
+	if (d_really_is_positive(dentry))
+		claim = NFS4_OPEN_CLAIM_FH;
+	opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
+			label, claim, GFP_KERNEL);
+	if (opendata == NULL)
+		goto err_put_state_owner;
+
+	if (label) {
+		olabel = nfs4_label_alloc(server, GFP_KERNEL);
+		if (IS_ERR(olabel)) {
+			status = PTR_ERR(olabel);
+			goto err_opendata_put;
+		}
+	}
+
+	if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+		if (!opendata->f_attr.mdsthreshold) {
+			opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
+			if (!opendata->f_attr.mdsthreshold)
+				goto err_free_label;
+		}
+		opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
+	}
+	if (d_really_is_positive(dentry))
+		opendata->state = nfs4_get_open_state(d_inode(dentry), sp);
+
+	status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
+	if (status != 0)
+		goto err_free_label;
+	state = ctx->state;
+
+	if ((opendata->o_arg.open_flags & O_EXCL) &&
+	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
+		nfs4_exclusive_attrset(opendata, sattr);
+
+		nfs_fattr_init(opendata->o_res.f_attr);
+		status = nfs4_do_setattr(state->inode, cred,
+				opendata->o_res.f_attr, sattr,
+				state, label, olabel);
+		if (status == 0) {
+			nfs_setattr_update_inode(state->inode, sattr,
+					opendata->o_res.f_attr);
+			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+		}
+	}
+	if (opendata->file_created)
+		*opened |= FILE_CREATED;
+
+	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
+		*ctx_th = opendata->f_attr.mdsthreshold;
+		opendata->f_attr.mdsthreshold = NULL;
+	}
+
+	nfs4_label_free(olabel);
+
+	nfs4_opendata_put(opendata);
+	nfs4_put_state_owner(sp);
+	return 0;
+err_free_label:
+	nfs4_label_free(olabel);
+err_opendata_put:
+	nfs4_opendata_put(opendata);
+err_put_state_owner:
+	nfs4_put_state_owner(sp);
+out_err:
+	return status;
+}
+
+
+static struct nfs4_state *nfs4_do_open(struct inode *dir,
+					struct nfs_open_context *ctx,
+					int flags,
+					struct iattr *sattr,
+					struct nfs4_label *label,
+					int *opened)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs4_exception exception = { };
+	struct nfs4_state *res;
+	int status;
+
+	do {
+		status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened);
+		res = ctx->state;
+		trace_nfs4_open_file(ctx, flags, status);
+		if (status == 0)
+			break;
+		/* NOTE: BAD_SEQID means the server and client disagree about the
+		 * book-keeping w.r.t. state-changing operations
+		 * (OPEN/CLOSE/LOCK/LOCKU...)
+		 * It is actually a sign of a bug on the client or on the server.
+		 *
+		 * If we receive a BAD_SEQID error in the particular case of
+		 * doing an OPEN, we assume that nfs_increment_open_seqid() will
+		 * have unhashed the old state_owner for us, and that we can
+		 * therefore safely retry using a new one. We should still warn
+		 * the user though...
+		 */
+		if (status == -NFS4ERR_BAD_SEQID) {
+			pr_warn_ratelimited("NFS: v4 server %s "
+					" returned a bad sequence-id error!\n",
+					NFS_SERVER(dir)->nfs_client->cl_hostname);
+			exception.retry = 1;
+			continue;
+		}
+		/*
+		 * BAD_STATEID on OPEN means that the server cancelled our
+		 * state before it received the OPEN_CONFIRM.
+		 * Recover by retrying the request as per the discussion
+		 * on Page 181 of RFC3530.
+		 */
+		if (status == -NFS4ERR_BAD_STATEID) {
+			exception.retry = 1;
+			continue;
+		}
+		if (status == -EAGAIN) {
+			/* We must have found a delegation */
+			exception.retry = 1;
+			continue;
+		}
+		if (nfs4_clear_cap_atomic_open_v1(server, status, &exception))
+			continue;
+		res = ERR_PTR(nfs4_handle_exception(server,
+					status, &exception));
+	} while (exception.retry);
+	return res;
+}
+
+static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+			    struct nfs_fattr *fattr, struct iattr *sattr,
+			    struct nfs4_state *state, struct nfs4_label *ilabel,
+			    struct nfs4_label *olabel)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+        struct nfs_setattrargs  arg = {
+                .fh             = NFS_FH(inode),
+                .iap            = sattr,
+		.server		= server,
+		.bitmask = server->attr_bitmask,
+		.label		= ilabel,
+        };
+        struct nfs_setattrres  res = {
+		.fattr		= fattr,
+		.label		= olabel,
+		.server		= server,
+        };
+        struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+        };
+	unsigned long timestamp = jiffies;
+	fmode_t fmode;
+	bool truncate;
+	int status;
+
+	arg.bitmask = nfs4_bitmask(server, ilabel);
+	if (ilabel)
+		arg.bitmask = nfs4_bitmask(server, olabel);
+
+	nfs_fattr_init(fattr);
+
+	/* Servers should only apply open mode checks for file size changes */
+	truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
+	fmode = truncate ? FMODE_WRITE : FMODE_READ;
+
+	if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
+		/* Use that stateid */
+	} else if (truncate && state != NULL) {
+		struct nfs_lockowner lockowner = {
+			.l_owner = current->files,
+			.l_pid = current->tgid,
+		};
+		if (!nfs4_valid_open_stateid(state))
+			return -EBADF;
+		if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
+				&lockowner) == -EIO)
+			return -EBADF;
+	} else
+		nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+	if (status == 0 && state != NULL)
+		renew_lease(server, timestamp);
+	return status;
+}
+
+static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+			   struct nfs_fattr *fattr, struct iattr *sattr,
+			   struct nfs4_state *state, struct nfs4_label *ilabel,
+			   struct nfs4_label *olabel)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_exception exception = {
+		.state = state,
+		.inode = inode,
+	};
+	int err;
+	do {
+		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
+		trace_nfs4_setattr(inode, err);
+		switch (err) {
+		case -NFS4ERR_OPENMODE:
+			if (!(sattr->ia_valid & ATTR_SIZE)) {
+				pr_warn_once("NFSv4: server %s is incorrectly "
+						"applying open mode checks to "
+						"a SETATTR that is not "
+						"changing file size.\n",
+						server->nfs_client->cl_hostname);
+			}
+			if (state && !(state->state & FMODE_WRITE)) {
+				err = -EBADF;
+				if (sattr->ia_valid & ATTR_OPEN)
+					err = -EACCES;
+				goto out;
+			}
+		}
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+out:
+	return err;
+}
+
+struct nfs4_closedata {
+	struct inode *inode;
+	struct nfs4_state *state;
+	struct nfs_closeargs arg;
+	struct nfs_closeres res;
+	struct nfs_fattr fattr;
+	unsigned long timestamp;
+	bool roc;
+	u32 roc_barrier;
+};
+
+static void nfs4_free_closedata(void *data)
+{
+	struct nfs4_closedata *calldata = data;
+	struct nfs4_state_owner *sp = calldata->state->owner;
+	struct super_block *sb = calldata->state->inode->i_sb;
+
+	if (calldata->roc)
+		pnfs_roc_release(calldata->state->inode);
+	nfs4_put_open_state(calldata->state);
+	nfs_free_seqid(calldata->arg.seqid);
+	nfs4_put_state_owner(sp);
+	nfs_sb_deactive(sb);
+	kfree(calldata);
+}
+
+static void nfs4_close_done(struct rpc_task *task, void *data)
+{
+	struct nfs4_closedata *calldata = data;
+	struct nfs4_state *state = calldata->state;
+	struct nfs_server *server = NFS_SERVER(calldata->inode);
+	nfs4_stateid *res_stateid = NULL;
+
+	dprintk("%s: begin!\n", __func__);
+	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
+		return;
+	trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
+        /* hmm. we are done with the inode, and in the process of freeing
+	 * the state_owner. we keep this around to process errors
+	 */
+	switch (task->tk_status) {
+		case 0:
+			res_stateid = &calldata->res.stateid;
+			if (calldata->arg.fmode == 0 && calldata->roc)
+				pnfs_roc_set_barrier(state->inode,
+						     calldata->roc_barrier);
+			renew_lease(server, calldata->timestamp);
+			break;
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_OLD_STATEID:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_EXPIRED:
+			if (!nfs4_stateid_match(&calldata->arg.stateid,
+						&state->open_stateid)) {
+				rpc_restart_call_prepare(task);
+				goto out_release;
+			}
+			if (calldata->arg.fmode == 0)
+				break;
+		default:
+			if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
+				rpc_restart_call_prepare(task);
+				goto out_release;
+			}
+	}
+	nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
+out_release:
+	nfs_release_seqid(calldata->arg.seqid);
+	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
+	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
+}
+
+static void nfs4_close_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_closedata *calldata = data;
+	struct nfs4_state *state = calldata->state;
+	struct inode *inode = calldata->inode;
+	bool is_rdonly, is_wronly, is_rdwr;
+	int call_close = 0;
+
+	dprintk("%s: begin!\n", __func__);
+	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
+		goto out_wait;
+
+	task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+	spin_lock(&state->owner->so_lock);
+	is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
+	is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
+	is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
+	nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid);
+	/* Calculate the change in open mode */
+	calldata->arg.fmode = 0;
+	if (state->n_rdwr == 0) {
+		if (state->n_rdonly == 0)
+			call_close |= is_rdonly;
+		else if (is_rdonly)
+			calldata->arg.fmode |= FMODE_READ;
+		if (state->n_wronly == 0)
+			call_close |= is_wronly;
+		else if (is_wronly)
+			calldata->arg.fmode |= FMODE_WRITE;
+	} else if (is_rdwr)
+		calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+	if (calldata->arg.fmode == 0)
+		call_close |= is_rdwr;
+
+	if (!nfs4_valid_open_stateid(state))
+		call_close = 0;
+	spin_unlock(&state->owner->so_lock);
+
+	if (!call_close) {
+		/* Note: exit _without_ calling nfs4_close_done */
+		goto out_no_action;
+	}
+
+	if (calldata->arg.fmode == 0) {
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+		if (calldata->roc &&
+		    pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
+			nfs_release_seqid(calldata->arg.seqid);
+			goto out_wait;
+		    }
+	}
+	calldata->arg.share_access =
+		nfs4_map_atomic_open_share(NFS_SERVER(inode),
+				calldata->arg.fmode, 0);
+
+	nfs_fattr_init(calldata->res.fattr);
+	calldata->timestamp = jiffies;
+	if (nfs4_setup_sequence(NFS_SERVER(inode),
+				&calldata->arg.seq_args,
+				&calldata->res.seq_res,
+				task) != 0)
+		nfs_release_seqid(calldata->arg.seqid);
+	dprintk("%s: done!\n", __func__);
+	return;
+out_no_action:
+	task->tk_action = NULL;
+out_wait:
+	nfs4_sequence_done(task, &calldata->res.seq_res);
+}
+
+static const struct rpc_call_ops nfs4_close_ops = {
+	.rpc_call_prepare = nfs4_close_prepare,
+	.rpc_call_done = nfs4_close_done,
+	.rpc_release = nfs4_free_closedata,
+};
+
+static bool nfs4_roc(struct inode *inode)
+{
+	if (!nfs_have_layout(inode))
+		return false;
+	return pnfs_roc(inode);
+}
+
+/* 
+ * It is possible for data to be read/written from a mem-mapped file 
+ * after the sys_close call (which hits the vfs layer as a flush).
+ * This means that we can't safely call nfsv4 close on a file until 
+ * the inode is cleared. This in turn means that we are not good
+ * NFSv4 citizens - we do not indicate to the server to update the file's 
+ * share state even when we are done with one of the three share 
+ * stateid's in the inode.
+ *
+ * NOTE: Caller must be holding the sp->so_owner semaphore!
+ */
+int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+	struct nfs4_closedata *calldata;
+	struct nfs4_state_owner *sp = state->owner;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
+		.rpc_cred = state->owner->so_cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_close_ops,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status = -ENOMEM;
+
+	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
+		&task_setup_data.rpc_client, &msg);
+
+	calldata = kzalloc(sizeof(*calldata), gfp_mask);
+	if (calldata == NULL)
+		goto out;
+	nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);
+	calldata->inode = state->inode;
+	calldata->state = state;
+	calldata->arg.fh = NFS_FH(state->inode);
+	/* Serialization for the sequence id */
+	alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+	calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
+	if (IS_ERR(calldata->arg.seqid))
+		goto out_free_calldata;
+	calldata->arg.fmode = 0;
+	calldata->arg.bitmask = server->cache_consistency_bitmask;
+	calldata->res.fattr = &calldata->fattr;
+	calldata->res.seqid = calldata->arg.seqid;
+	calldata->res.server = server;
+	calldata->roc = nfs4_roc(state->inode);
+	nfs_sb_active(calldata->inode->i_sb);
+
+	msg.rpc_argp = &calldata->arg;
+	msg.rpc_resp = &calldata->res;
+	task_setup_data.callback_data = calldata;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	status = 0;
+	if (wait)
+		status = rpc_wait_for_completion_task(task);
+	rpc_put_task(task);
+	return status;
+out_free_calldata:
+	kfree(calldata);
+out:
+	nfs4_put_open_state(state);
+	nfs4_put_state_owner(sp);
+	return status;
+}
+
+static struct inode *
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
+		int open_flags, struct iattr *attr, int *opened)
+{
+	struct nfs4_state *state;
+	struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+
+	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
+
+	/* Protect against concurrent sillydeletes */
+	state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened);
+
+	nfs4_label_release_security(label);
+
+	if (IS_ERR(state))
+		return ERR_CAST(state);
+	return state->inode;
+}
+
+static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	if (ctx->state == NULL)
+		return;
+	if (is_sync)
+		nfs4_close_sync(ctx->state, ctx->mode);
+	else
+		nfs4_close_state(ctx->state, ctx->mode);
+}
+
+#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
+#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL)
+
+static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+{
+	struct nfs4_server_caps_arg args = {
+		.fhandle = fhandle,
+	};
+	struct nfs4_server_caps_res res = {};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+	if (status == 0) {
+		/* Sanity check the server answers */
+		switch (server->nfs_client->cl_minorversion) {
+		case 0:
+			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
+			res.attr_bitmask[2] = 0;
+			break;
+		case 1:
+			res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK;
+			break;
+		case 2:
+			res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
+		}
+		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
+		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
+				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
+				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
+				NFS_CAP_CTIME|NFS_CAP_MTIME|
+				NFS_CAP_SECURITY_LABEL);
+		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
+				res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
+			server->caps |= NFS_CAP_ACLS;
+		if (res.has_links != 0)
+			server->caps |= NFS_CAP_HARDLINKS;
+		if (res.has_symlinks != 0)
+			server->caps |= NFS_CAP_SYMLINKS;
+		if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
+			server->caps |= NFS_CAP_FILEID;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
+			server->caps |= NFS_CAP_MODE;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
+			server->caps |= NFS_CAP_NLINK;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
+			server->caps |= NFS_CAP_OWNER;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
+			server->caps |= NFS_CAP_OWNER_GROUP;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
+			server->caps |= NFS_CAP_ATIME;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
+			server->caps |= NFS_CAP_CTIME;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
+			server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+		if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+			server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+		memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+				sizeof(server->attr_bitmask));
+		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		server->cache_consistency_bitmask[2] = 0;
+		server->acl_bitmask = res.acl_bitmask;
+		server->fh_expire_type = res.fh_expire_type;
+	}
+
+	return status;
+}
+
+int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = nfs4_handle_exception(server,
+				_nfs4_server_capabilities(server, fhandle),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info)
+{
+	u32 bitmask[3];
+	struct nfs4_lookup_root_arg args = {
+		.bitmask = bitmask,
+	};
+	struct nfs4_lookup_res res = {
+		.server = server,
+		.fattr = info->fattr,
+		.fh = fhandle,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP_ROOT],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	bitmask[0] = nfs4_fattr_bitmap[0];
+	bitmask[1] = nfs4_fattr_bitmap[1];
+	/*
+	 * Process the label in the upcoming getfattr
+	 */
+	bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
+	nfs_fattr_init(info->fattr);
+	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_lookup_root(server, fhandle, info);
+		trace_nfs4_lookup_root(server, fhandle, info->fattr, err);
+		switch (err) {
+		case 0:
+		case -NFS4ERR_WRONGSEC:
+			goto out;
+		default:
+			err = nfs4_handle_exception(server, err, &exception);
+		}
+	} while (exception.retry);
+out:
+	return err;
+}
+
+static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fsinfo *info, rpc_authflavor_t flavor)
+{
+	struct rpc_auth_create_args auth_args = {
+		.pseudoflavor = flavor,
+	};
+	struct rpc_auth *auth;
+	int ret;
+
+	auth = rpcauth_create(&auth_args, server->client);
+	if (IS_ERR(auth)) {
+		ret = -EACCES;
+		goto out;
+	}
+	ret = nfs4_lookup_root(server, fhandle, info);
+out:
+	return ret;
+}
+
+/*
+ * Retry pseudoroot lookup with various security flavors.  We do this when:
+ *
+ *   NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC
+ *   NFSv4.1: the server does not support the SECINFO_NO_NAME operation
+ *
+ * Returns zero on success, or a negative NFS4ERR value, or a
+ * negative errno value.
+ */
+static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+			      struct nfs_fsinfo *info)
+{
+	/* Per 3530bis 15.33.5 */
+	static const rpc_authflavor_t flav_array[] = {
+		RPC_AUTH_GSS_KRB5P,
+		RPC_AUTH_GSS_KRB5I,
+		RPC_AUTH_GSS_KRB5,
+		RPC_AUTH_UNIX,			/* courtesy */
+		RPC_AUTH_NULL,
+	};
+	int status = -EPERM;
+	size_t i;
+
+	if (server->auth_info.flavor_len > 0) {
+		/* try each flavor specified by user */
+		for (i = 0; i < server->auth_info.flavor_len; i++) {
+			status = nfs4_lookup_root_sec(server, fhandle, info,
+						server->auth_info.flavors[i]);
+			if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+				continue;
+			break;
+		}
+	} else {
+		/* no flavors specified by user, try default list */
+		for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
+			status = nfs4_lookup_root_sec(server, fhandle, info,
+						      flav_array[i]);
+			if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+				continue;
+			break;
+		}
+	}
+
+	/*
+	 * -EACCESS could mean that the user doesn't have correct permissions
+	 * to access the mount.  It could also mean that we tried to mount
+	 * with a gss auth flavor, but rpc.gssd isn't running.  Either way,
+	 * existing mount programs don't handle -EACCES very well so it should
+	 * be mapped to -EPERM instead.
+	 */
+	if (status == -EACCES)
+		status = -EPERM;
+	return status;
+}
+
+static int nfs4_do_find_root_sec(struct nfs_server *server,
+		struct nfs_fh *fhandle, struct nfs_fsinfo *info)
+{
+	int mv = server->nfs_client->cl_minorversion;
+	return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
+}
+
+/**
+ * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
+ * @server: initialized nfs_server handle
+ * @fhandle: we fill in the pseudo-fs root file handle
+ * @info: we fill in an FSINFO struct
+ * @auth_probe: probe the auth flavours
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
+			 struct nfs_fsinfo *info,
+			 bool auth_probe)
+{
+	int status = 0;
+
+	if (!auth_probe)
+		status = nfs4_lookup_root(server, fhandle, info);
+
+	if (auth_probe || status == NFS4ERR_WRONGSEC)
+		status = nfs4_do_find_root_sec(server, fhandle, info);
+
+	if (status == 0)
+		status = nfs4_server_capabilities(server, fhandle);
+	if (status == 0)
+		status = nfs4_do_fsinfo(server, fhandle, info);
+
+	return nfs4_map_errors(status);
+}
+
+static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
+			      struct nfs_fsinfo *info)
+{
+	int error;
+	struct nfs_fattr *fattr = info->fattr;
+	struct nfs4_label *label = NULL;
+
+	error = nfs4_server_capabilities(server, mntfh);
+	if (error < 0) {
+		dprintk("nfs4_get_root: getcaps error = %d\n", -error);
+		return error;
+	}
+
+	label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(label))
+		return PTR_ERR(label);
+
+	error = nfs4_proc_getattr(server, mntfh, fattr, label);
+	if (error < 0) {
+		dprintk("nfs4_get_root: getattr error = %d\n", -error);
+		goto err_free_label;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_FSID &&
+	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+err_free_label:
+	nfs4_label_free(label);
+
+	return error;
+}
+
+/*
+ * Get locations and (maybe) other attributes of a referral.
+ * Note that we'll actually follow the referral later when
+ * we detect fsid mismatch in inode revalidation
+ */
+static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
+			     const struct qstr *name, struct nfs_fattr *fattr,
+			     struct nfs_fh *fhandle)
+{
+	int status = -ENOMEM;
+	struct page *page = NULL;
+	struct nfs4_fs_locations *locations = NULL;
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (locations == NULL)
+		goto out;
+
+	status = nfs4_proc_fs_locations(client, dir, name, locations, page);
+	if (status != 0)
+		goto out;
+
+	/*
+	 * If the fsid didn't change, this is a migration event, not a
+	 * referral.  Cause us to drop into the exception handler, which
+	 * will kick off migration recovery.
+	 */
+	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+		dprintk("%s: server did not return a different fsid for"
+			" a referral at %s\n", __func__, name->name);
+		status = -NFS4ERR_MOVED;
+		goto out;
+	}
+	/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
+	nfs_fixup_referral_attributes(&locations->fattr);
+
+	/* replace the lookup nfs_fattr with the locations nfs_fattr */
+	memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
+	memset(fhandle, 0, sizeof(struct nfs_fh));
+out:
+	if (page)
+		__free_page(page);
+	kfree(locations);
+	return status;
+}
+
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	struct nfs4_getattr_arg args = {
+		.fh = fhandle,
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_getattr_res res = {
+		.fattr = fattr,
+		.label = label,
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	args.bitmask = nfs4_bitmask(server, label);
+
+	nfs_fattr_init(fattr);
+	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_proc_getattr(server, fhandle, fattr, label);
+		trace_nfs4_getattr(server, fhandle, fattr, err);
+		err = nfs4_handle_exception(server, err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+/* 
+ * The file is not closed if it is opened due to the a request to change
+ * the size of the file. The open call will not be needed once the
+ * VFS layer lookup-intents are implemented.
+ *
+ * Close is called when the inode is destroyed.
+ * If we haven't opened the file for O_WRONLY, we
+ * need to in the size_change case to obtain a stateid.
+ *
+ * Got race?
+ * Because OPEN is always done by name in nfsv4, it is
+ * possible that we opened a different file by the same
+ * name.  We can recognize this race condition, but we
+ * can't do anything about it besides returning an error.
+ *
+ * This will be fixed with VFS changes (lookup-intent).
+ */
+static int
+nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+		  struct iattr *sattr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct rpc_cred *cred = NULL;
+	struct nfs4_state *state = NULL;
+	struct nfs4_label *label = NULL;
+	int status;
+
+	if (pnfs_ld_layoutret_on_setattr(inode) &&
+	    sattr->ia_valid & ATTR_SIZE &&
+	    sattr->ia_size < i_size_read(inode))
+		pnfs_commit_and_return_layout(inode);
+
+	nfs_fattr_init(fattr);
+	
+	/* Deal with open(O_TRUNC) */
+	if (sattr->ia_valid & ATTR_OPEN)
+		sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME);
+
+	/* Optimization: if the end result is no change, don't RPC */
+	if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+		return 0;
+
+	/* Search for an existing open(O_WRITE) file */
+	if (sattr->ia_valid & ATTR_FILE) {
+		struct nfs_open_context *ctx;
+
+		ctx = nfs_file_open_context(sattr->ia_file);
+		if (ctx) {
+			cred = ctx->cred;
+			state = ctx->state;
+		}
+	}
+
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(label))
+		return PTR_ERR(label);
+
+	status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+	if (status == 0) {
+		nfs_setattr_update_inode(inode, sattr, fattr);
+		nfs_setsecurity(inode, fattr, label);
+	}
+	nfs4_label_free(label);
+	return status;
+}
+
+static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
+		const struct qstr *name, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	int		       status;
+	struct nfs4_lookup_arg args = {
+		.bitmask = server->attr_bitmask,
+		.dir_fh = NFS_FH(dir),
+		.name = name,
+	};
+	struct nfs4_lookup_res res = {
+		.server = server,
+		.fattr = fattr,
+		.label = label,
+		.fh = fhandle,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	args.bitmask = nfs4_bitmask(server, label);
+
+	nfs_fattr_init(fattr);
+
+	dprintk("NFS call  lookup %s\n", name->name);
+	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
+	dprintk("NFS reply lookup: %d\n", status);
+	return status;
+}
+
+static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
+{
+	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+		NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_MOUNTPOINT;
+	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	fattr->nlink = 2;
+}
+
+static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
+				   struct qstr *name, struct nfs_fh *fhandle,
+				   struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	struct nfs4_exception exception = { };
+	struct rpc_clnt *client = *clnt;
+	int err;
+	do {
+		err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
+		trace_nfs4_lookup(dir, name, err);
+		switch (err) {
+		case -NFS4ERR_BADNAME:
+			err = -ENOENT;
+			goto out;
+		case -NFS4ERR_MOVED:
+			err = nfs4_get_referral(client, dir, name, fattr, fhandle);
+			goto out;
+		case -NFS4ERR_WRONGSEC:
+			err = -EPERM;
+			if (client != *clnt)
+				goto out;
+			client = nfs4_negotiate_security(client, dir, name);
+			if (IS_ERR(client))
+				return PTR_ERR(client);
+
+			exception.retry = 1;
+			break;
+		default:
+			err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
+		}
+	} while (exception.retry);
+
+out:
+	if (err == 0)
+		*clnt = client;
+	else if (client != *clnt)
+		rpc_shutdown_client(client);
+
+	return err;
+}
+
+static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
+			    struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+			    struct nfs4_label *label)
+{
+	int status;
+	struct rpc_clnt *client = NFS_CLIENT(dir);
+
+	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
+	if (client != NFS_CLIENT(dir)) {
+		rpc_shutdown_client(client);
+		nfs_fixup_secinfo_attributes(fattr);
+	}
+	return status;
+}
+
+struct rpc_clnt *
+nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
+			    struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+{
+	struct rpc_clnt *client = NFS_CLIENT(dir);
+	int status;
+
+	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
+	if (status < 0)
+		return ERR_PTR(status);
+	return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
+}
+
+static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_accessargs args = {
+		.fh = NFS_FH(inode),
+		.bitmask = server->cache_consistency_bitmask,
+	};
+	struct nfs4_accessres res = {
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = entry->cred,
+	};
+	int mode = entry->mask;
+	int status = 0;
+
+	/*
+	 * Determine which access bits we want to ask for...
+	 */
+	if (mode & MAY_READ)
+		args.access |= NFS4_ACCESS_READ;
+	if (S_ISDIR(inode->i_mode)) {
+		if (mode & MAY_WRITE)
+			args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE;
+		if (mode & MAY_EXEC)
+			args.access |= NFS4_ACCESS_LOOKUP;
+	} else {
+		if (mode & MAY_WRITE)
+			args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND;
+		if (mode & MAY_EXEC)
+			args.access |= NFS4_ACCESS_EXECUTE;
+	}
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		return -ENOMEM;
+
+	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+	if (!status) {
+		nfs_access_set_mask(entry, res.access);
+		nfs_refresh_inode(inode, res.fattr);
+	}
+	nfs_free_fattr(res.fattr);
+	return status;
+}
+
+static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_proc_access(inode, entry);
+		trace_nfs4_access(inode, err);
+		err = nfs4_handle_exception(NFS_SERVER(inode), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+/*
+ * TODO: For the time being, we don't try to get any attributes
+ * along with any of the zero-copy operations READ, READDIR,
+ * READLINK, WRITE.
+ *
+ * In the case of the first three, we want to put the GETATTR
+ * after the read-type operation -- this is because it is hard
+ * to predict the length of a GETATTR response in v4, and thus
+ * align the READ data correctly.  This means that the GETATTR
+ * may end up partially falling into the page cache, and we should
+ * shift it into the 'tail' of the xdr_buf before processing.
+ * To do this efficiently, we need to know the total length
+ * of data received, which doesn't seem to be available outside
+ * of the RPC layer.
+ *
+ * In the case of WRITE, we also want to put the GETATTR after
+ * the operation -- in this case because we want to make sure
+ * we get the post-operation mtime and size.
+ *
+ * Both of these changes to the XDR layer would in fact be quite
+ * minor, but I decided to leave them for a subsequent patch.
+ */
+static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
+		unsigned int pgbase, unsigned int pglen)
+{
+	struct nfs4_readlink args = {
+		.fh       = NFS_FH(inode),
+		.pgbase	  = pgbase,
+		.pglen    = pglen,
+		.pages    = &page,
+	};
+	struct nfs4_readlink_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_proc_readlink(struct inode *inode, struct page *page,
+		unsigned int pgbase, unsigned int pglen)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_proc_readlink(inode, page, pgbase, pglen);
+		trace_nfs4_readlink(inode, err);
+		err = nfs4_handle_exception(NFS_SERVER(inode), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+/*
+ * This is just for mknod.  open(O_CREAT) will always do ->open_context().
+ */
+static int
+nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+		 int flags)
+{
+	struct nfs4_label l, *ilabel = NULL;
+	struct nfs_open_context *ctx;
+	struct nfs4_state *state;
+	int opened = 0;
+	int status = 0;
+
+	ctx = alloc_nfs_open_context(dentry, FMODE_READ);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+	sattr->ia_mode &= ~current_umask();
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
+	if (IS_ERR(state)) {
+		status = PTR_ERR(state);
+		goto out;
+	}
+out:
+	nfs4_label_release_security(ilabel);
+	put_nfs_open_context(ctx);
+	return status;
+}
+
+static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_removeargs args = {
+		.fh = NFS_FH(dir),
+		.name = *name,
+	};
+	struct nfs_removeres res = {
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
+	if (status == 0)
+		update_changeattr(dir, &res.cinfo);
+	return status;
+}
+
+static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_proc_remove(dir, name);
+		trace_nfs4_remove(dir, name, err);
+		err = nfs4_handle_exception(NFS_SERVER(dir), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_removeargs *args = msg->rpc_argp;
+	struct nfs_removeres *res = msg->rpc_resp;
+
+	res->server = server;
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
+	nfs4_init_sequence(&args->seq_args, &res->seq_res, 1);
+
+	nfs_fattr_init(res->dir_attr);
+}
+
+static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+	nfs4_setup_sequence(NFS_SERVER(data->dir),
+			&data->args.seq_args,
+			&data->res.seq_res,
+			task);
+}
+
+static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+	struct nfs_unlinkdata *data = task->tk_calldata;
+	struct nfs_removeres *res = &data->res;
+
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
+	if (nfs4_async_handle_error(task, res->server, NULL,
+				    &data->timeout) == -EAGAIN)
+		return 0;
+	update_changeattr(dir, &res->cinfo);
+	return 1;
+}
+
+static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_renameargs *arg = msg->rpc_argp;
+	struct nfs_renameres *res = msg->rpc_resp;
+
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
+	res->server = server;
+	nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1);
+}
+
+static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	nfs4_setup_sequence(NFS_SERVER(data->old_dir),
+			&data->args.seq_args,
+			&data->res.seq_res,
+			task);
+}
+
+static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+				 struct inode *new_dir)
+{
+	struct nfs_renamedata *data = task->tk_calldata;
+	struct nfs_renameres *res = &data->res;
+
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
+	if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
+		return 0;
+
+	update_changeattr(old_dir, &res->old_cinfo);
+	update_changeattr(new_dir, &res->new_cinfo);
+	return 1;
+}
+
+static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_link_arg arg = {
+		.fh     = NFS_FH(inode),
+		.dir_fh = NFS_FH(dir),
+		.name   = name,
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_link_res res = {
+		.server = server,
+		.label = NULL,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
+		.rpc_argp = &arg,
+		.rpc_resp = &res,
+	};
+	int status = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out;
+
+	res.label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(res.label)) {
+		status = PTR_ERR(res.label);
+		goto out;
+	}
+	arg.bitmask = nfs4_bitmask(server, res.label);
+
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+	if (!status) {
+		update_changeattr(dir, &res.cinfo);
+		status = nfs_post_op_update_inode(inode, res.fattr);
+		if (!status)
+			nfs_setsecurity(inode, res.fattr, res.label);
+	}
+
+
+	nfs4_label_free(res.label);
+
+out:
+	nfs_free_fattr(res.fattr);
+	return status;
+}
+
+static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = nfs4_handle_exception(NFS_SERVER(inode),
+				_nfs4_proc_link(inode, dir, name),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+struct nfs4_createdata {
+	struct rpc_message msg;
+	struct nfs4_create_arg arg;
+	struct nfs4_create_res res;
+	struct nfs_fh fh;
+	struct nfs_fattr fattr;
+	struct nfs4_label *label;
+};
+
+static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
+		struct qstr *name, struct iattr *sattr, u32 ftype)
+{
+	struct nfs4_createdata *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data != NULL) {
+		struct nfs_server *server = NFS_SERVER(dir);
+
+		data->label = nfs4_label_alloc(server, GFP_KERNEL);
+		if (IS_ERR(data->label))
+			goto out_free;
+
+		data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
+		data->msg.rpc_argp = &data->arg;
+		data->msg.rpc_resp = &data->res;
+		data->arg.dir_fh = NFS_FH(dir);
+		data->arg.server = server;
+		data->arg.name = name;
+		data->arg.attrs = sattr;
+		data->arg.ftype = ftype;
+		data->arg.bitmask = nfs4_bitmask(server, data->label);
+		data->res.server = server;
+		data->res.fh = &data->fh;
+		data->res.fattr = &data->fattr;
+		data->res.label = data->label;
+		nfs_fattr_init(data->res.fattr);
+	}
+	return data;
+out_free:
+	kfree(data);
+	return NULL;
+}
+
+static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
+{
+	int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
+				    &data->arg.seq_args, &data->res.seq_res, 1);
+	if (status == 0) {
+		update_changeattr(dir, &data->res.dir_cinfo);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
+	}
+	return status;
+}
+
+static void nfs4_free_createdata(struct nfs4_createdata *data)
+{
+	nfs4_label_free(data->label);
+	kfree(data);
+}
+
+static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
+		struct page *page, unsigned int len, struct iattr *sattr,
+		struct nfs4_label *label)
+{
+	struct nfs4_createdata *data;
+	int status = -ENAMETOOLONG;
+
+	if (len > NFS4_MAXPATHLEN)
+		goto out;
+
+	status = -ENOMEM;
+	data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
+	if (data == NULL)
+		goto out;
+
+	data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
+	data->arg.u.symlink.pages = &page;
+	data->arg.u.symlink.len = len;
+	data->arg.label = label;
+	
+	status = nfs4_do_create(dir, dentry, data);
+
+	nfs4_free_createdata(data);
+out:
+	return status;
+}
+
+static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
+		struct page *page, unsigned int len, struct iattr *sattr)
+{
+	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
+	int err;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+	do {
+		err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
+		trace_nfs4_symlink(dir, &dentry->d_name, err);
+		err = nfs4_handle_exception(NFS_SERVER(dir), err,
+				&exception);
+	} while (exception.retry);
+
+	nfs4_label_release_security(label);
+	return err;
+}
+
+static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+		struct iattr *sattr, struct nfs4_label *label)
+{
+	struct nfs4_createdata *data;
+	int status = -ENOMEM;
+
+	data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
+	if (data == NULL)
+		goto out;
+
+	data->arg.label = label;
+	status = nfs4_do_create(dir, dentry, data);
+
+	nfs4_free_createdata(data);
+out:
+	return status;
+}
+
+static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+		struct iattr *sattr)
+{
+	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
+	int err;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+	sattr->ia_mode &= ~current_umask();
+	do {
+		err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+		trace_nfs4_mkdir(dir, &dentry->d_name, err);
+		err = nfs4_handle_exception(NFS_SERVER(dir), err,
+				&exception);
+	} while (exception.retry);
+	nfs4_label_release_security(label);
+
+	return err;
+}
+
+static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+		u64 cookie, struct page **pages, unsigned int count, int plus)
+{
+	struct inode		*dir = d_inode(dentry);
+	struct nfs4_readdir_arg args = {
+		.fh = NFS_FH(dir),
+		.pages = pages,
+		.pgbase = 0,
+		.count = count,
+		.bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask,
+		.plus = plus,
+	};
+	struct nfs4_readdir_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	int			status;
+
+	dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__,
+			dentry,
+			(unsigned long long)cookie);
+	nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);
+	res.pgbase = args.pgbase;
+	status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
+	if (status >= 0) {
+		memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE);
+		status += args.pgbase;
+	}
+
+	nfs_invalidate_atime(dir);
+
+	dprintk("%s: returns %d\n", __func__, status);
+	return status;
+}
+
+static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+		u64 cookie, struct page **pages, unsigned int count, int plus)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_proc_readdir(dentry, cred, cookie,
+				pages, count, plus);
+		trace_nfs4_readdir(d_inode(dentry), err);
+		err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
+		struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
+{
+	struct nfs4_createdata *data;
+	int mode = sattr->ia_mode;
+	int status = -ENOMEM;
+
+	data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
+	if (data == NULL)
+		goto out;
+
+	if (S_ISFIFO(mode))
+		data->arg.ftype = NF4FIFO;
+	else if (S_ISBLK(mode)) {
+		data->arg.ftype = NF4BLK;
+		data->arg.u.device.specdata1 = MAJOR(rdev);
+		data->arg.u.device.specdata2 = MINOR(rdev);
+	}
+	else if (S_ISCHR(mode)) {
+		data->arg.ftype = NF4CHR;
+		data->arg.u.device.specdata1 = MAJOR(rdev);
+		data->arg.u.device.specdata2 = MINOR(rdev);
+	} else if (!S_ISSOCK(mode)) {
+		status = -EINVAL;
+		goto out_free;
+	}
+
+	data->arg.label = label;
+	status = nfs4_do_create(dir, dentry, data);
+out_free:
+	nfs4_free_createdata(data);
+out:
+	return status;
+}
+
+static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
+		struct iattr *sattr, dev_t rdev)
+{
+	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
+	int err;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+	sattr->ia_mode &= ~current_umask();
+	do {
+		err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev);
+		trace_nfs4_mknod(dir, &dentry->d_name, err);
+		err = nfs4_handle_exception(NFS_SERVER(dir), err,
+				&exception);
+	} while (exception.retry);
+
+	nfs4_label_release_security(label);
+
+	return err;
+}
+
+static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+		 struct nfs_fsstat *fsstat)
+{
+	struct nfs4_statfs_arg args = {
+		.fh = fhandle,
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_statfs_res res = {
+		.fsstat = fsstat,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	nfs_fattr_init(fsstat->fattr);
+	return  nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = nfs4_handle_exception(server,
+				_nfs4_proc_statfs(server, fhandle, fsstat),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *fsinfo)
+{
+	struct nfs4_fsinfo_arg args = {
+		.fh = fhandle,
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_fsinfo_res res = {
+		.fsinfo = fsinfo,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
+{
+	struct nfs4_exception exception = { };
+	unsigned long now = jiffies;
+	int err;
+
+	do {
+		err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
+		trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
+		if (err == 0) {
+			struct nfs_client *clp = server->nfs_client;
+
+			spin_lock(&clp->cl_lock);
+			clp->cl_lease_time = fsinfo->lease_time * HZ;
+			clp->cl_last_renewal = now;
+			spin_unlock(&clp->cl_lock);
+			break;
+		}
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
+{
+	int error;
+
+	nfs_fattr_init(fsinfo->fattr);
+	error = nfs4_do_fsinfo(server, fhandle, fsinfo);
+	if (error == 0) {
+		/* block layout checks this! */
+		server->pnfs_blksize = fsinfo->blksize;
+		set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+	}
+
+	return error;
+}
+
+static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_pathconf *pathconf)
+{
+	struct nfs4_pathconf_arg args = {
+		.fh = fhandle,
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_pathconf_res res = {
+		.pathconf = pathconf,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	/* None of the pathconf attributes are mandatory to implement */
+	if ((args.bitmask[0] & nfs4_pathconf_bitmap[0]) == 0) {
+		memset(pathconf, 0, sizeof(*pathconf));
+		return 0;
+	}
+
+	nfs_fattr_init(pathconf->fattr);
+	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_pathconf *pathconf)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = nfs4_handle_exception(server,
+				_nfs4_proc_pathconf(server, fhandle, pathconf),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+		const struct nfs_open_context *ctx,
+		const struct nfs_lock_context *l_ctx,
+		fmode_t fmode)
+{
+	const struct nfs_lockowner *lockowner = NULL;
+
+	if (l_ctx != NULL)
+		lockowner = &l_ctx->lockowner;
+	return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner);
+}
+EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
+
+static bool nfs4_stateid_is_current(nfs4_stateid *stateid,
+		const struct nfs_open_context *ctx,
+		const struct nfs_lock_context *l_ctx,
+		fmode_t fmode)
+{
+	nfs4_stateid current_stateid;
+
+	/* If the current stateid represents a lost lock, then exit */
+	if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO)
+		return true;
+	return nfs4_stateid_match(stateid, &current_stateid);
+}
+
+static bool nfs4_error_stateid_expired(int err)
+{
+	switch (err) {
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_OPENMODE:
+	case -NFS4ERR_EXPIRED:
+		return true;
+	}
+	return false;
+}
+
+void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
+{
+	nfs_invalidate_atime(hdr->inode);
+}
+
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
+
+	trace_nfs4_read(hdr, task->tk_status);
+	if (nfs4_async_handle_error(task, server,
+				    hdr->args.context->state,
+				    NULL) == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	__nfs4_read_done_cb(hdr);
+	if (task->tk_status > 0)
+		renew_lease(server, hdr->timestamp);
+	return 0;
+}
+
+static bool nfs4_read_stateid_changed(struct rpc_task *task,
+		struct nfs_pgio_args *args)
+{
+
+	if (!nfs4_error_stateid_expired(task->tk_status) ||
+		nfs4_stateid_is_current(&args->stateid,
+				args->context,
+				args->lock_context,
+				FMODE_READ))
+		return false;
+	rpc_restart_call_prepare(task);
+	return true;
+}
+
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
+		return -EAGAIN;
+	if (nfs4_read_stateid_changed(task, &hdr->args))
+		return -EAGAIN;
+	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+				    nfs4_read_done_cb(task, hdr);
+}
+
+static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
+				 struct rpc_message *msg)
+{
+	hdr->timestamp   = jiffies;
+	hdr->pgio_done_cb = nfs4_read_done_cb;
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
+}
+
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
+				      struct nfs_pgio_header *hdr)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
+			&hdr->args.seq_args,
+			&hdr->res.seq_res,
+			task))
+		return 0;
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+				hdr->args.lock_context,
+				hdr->rw_ops->rw_mode) == -EIO)
+		return -EIO;
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
+		return -EIO;
+	return 0;
+}
+
+static int nfs4_write_done_cb(struct rpc_task *task,
+			      struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+
+	trace_nfs4_write(hdr, task->tk_status);
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+				    hdr->args.context->state,
+				    NULL) == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+	if (task->tk_status >= 0) {
+		renew_lease(NFS_SERVER(inode), hdr->timestamp);
+		nfs_writeback_update_inode(hdr);
+	}
+	return 0;
+}
+
+static bool nfs4_write_stateid_changed(struct rpc_task *task,
+		struct nfs_pgio_args *args)
+{
+
+	if (!nfs4_error_stateid_expired(task->tk_status) ||
+		nfs4_stateid_is_current(&args->stateid,
+				args->context,
+				args->lock_context,
+				FMODE_WRITE))
+		return false;
+	rpc_restart_call_prepare(task);
+	return true;
+}
+
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
+		return -EAGAIN;
+	if (nfs4_write_stateid_changed(task, &hdr->args))
+		return -EAGAIN;
+	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+		nfs4_write_done_cb(task, hdr);
+}
+
+static
+bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
+{
+	/* Don't request attributes for pNFS or O_DIRECT writes */
+	if (hdr->ds_clp != NULL || hdr->dreq != NULL)
+		return false;
+	/* Otherwise, request attributes if and only if we don't hold
+	 * a delegation
+	 */
+	return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
+}
+
+static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
+				  struct rpc_message *msg)
+{
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
+
+	if (!nfs4_write_need_cache_consistency_data(hdr)) {
+		hdr->args.bitmask = NULL;
+		hdr->res.fattr = NULL;
+	} else
+		hdr->args.bitmask = server->cache_consistency_bitmask;
+
+	if (!hdr->pgio_done_cb)
+		hdr->pgio_done_cb = nfs4_write_done_cb;
+	hdr->res.server = server;
+	hdr->timestamp   = jiffies;
+
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
+	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
+}
+
+static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	nfs4_setup_sequence(NFS_SERVER(data->inode),
+			&data->args.seq_args,
+			&data->res.seq_res,
+			task);
+}
+
+static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	struct inode *inode = data->inode;
+
+	trace_nfs4_commit(data, task->tk_status);
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+				    NULL, NULL) == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
+	return data->commit_done_cb(task, data);
+}
+
+static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
+{
+	struct nfs_server *server = NFS_SERVER(data->inode);
+
+	if (data->commit_done_cb == NULL)
+		data->commit_done_cb = nfs4_commit_done_cb;
+	data->res.server = server;
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+}
+
+struct nfs4_renewdata {
+	struct nfs_client	*client;
+	unsigned long		timestamp;
+};
+
+/*
+ * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
+ * standalone procedure for queueing an asynchronous RENEW.
+ */
+static void nfs4_renew_release(void *calldata)
+{
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
+
+	if (atomic_read(&clp->cl_count) > 1)
+		nfs4_schedule_state_renewal(clp);
+	nfs_put_client(clp);
+	kfree(data);
+}
+
+static void nfs4_renew_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
+	unsigned long timestamp = data->timestamp;
+
+	trace_nfs4_renew_async(clp, task->tk_status);
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_LEASE_MOVED:
+		nfs4_schedule_lease_moved_recovery(clp);
+		break;
+	default:
+		/* Unless we're shutting down, schedule state recovery! */
+		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
+			return;
+		if (task->tk_status != NFS4ERR_CB_PATH_DOWN) {
+			nfs4_schedule_lease_recovery(clp);
+			return;
+		}
+		nfs4_schedule_path_down_recovery(clp);
+	}
+	do_renew_lease(clp, timestamp);
+}
+
+static const struct rpc_call_ops nfs4_renew_ops = {
+	.rpc_call_done = nfs4_renew_done,
+	.rpc_release = nfs4_renew_release,
+};
+
+static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW],
+		.rpc_argp	= clp,
+		.rpc_cred	= cred,
+	};
+	struct nfs4_renewdata *data;
+
+	if (renew_flags == 0)
+		return 0;
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		return -EIO;
+	data = kmalloc(sizeof(*data), GFP_NOFS);
+	if (data == NULL)
+		return -ENOMEM;
+	data->client = clp;
+	data->timestamp = jiffies;
+	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT,
+			&nfs4_renew_ops, data);
+}
+
+static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW],
+		.rpc_argp	= clp,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	if (status < 0)
+		return status;
+	do_renew_lease(clp, now);
+	return 0;
+}
+
+static inline int nfs4_server_supports_acls(struct nfs_server *server)
+{
+	return server->caps & NFS_CAP_ACLS;
+}
+
+/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
+ * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_SIZE) bytes on
+ * the stack.
+ */
+#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
+
+static int buf_to_pages_noslab(const void *buf, size_t buflen,
+		struct page **pages, unsigned int *pgbase)
+{
+	struct page *newpage, **spages;
+	int rc = 0;
+	size_t len;
+	spages = pages;
+
+	do {
+		len = min_t(size_t, PAGE_SIZE, buflen);
+		newpage = alloc_page(GFP_KERNEL);
+
+		if (newpage == NULL)
+			goto unwind;
+		memcpy(page_address(newpage), buf, len);
+                buf += len;
+                buflen -= len;
+		*pages++ = newpage;
+		rc++;
+	} while (buflen != 0);
+
+	return rc;
+
+unwind:
+	for(; rc > 0; rc--)
+		__free_page(spages[rc-1]);
+	return -ENOMEM;
+}
+
+struct nfs4_cached_acl {
+	int cached;
+	size_t len;
+	char data[0];
+};
+
+static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	spin_lock(&inode->i_lock);
+	kfree(nfsi->nfs4_acl);
+	nfsi->nfs4_acl = acl;
+	spin_unlock(&inode->i_lock);
+}
+
+static void nfs4_zap_acl_attr(struct inode *inode)
+{
+	nfs4_set_cached_acl(inode, NULL);
+}
+
+static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs4_cached_acl *acl;
+	int ret = -ENOENT;
+
+	spin_lock(&inode->i_lock);
+	acl = nfsi->nfs4_acl;
+	if (acl == NULL)
+		goto out;
+	if (buf == NULL) /* user is just asking for length */
+		goto out_len;
+	if (acl->cached == 0)
+		goto out;
+	ret = -ERANGE; /* see getxattr(2) man page */
+	if (acl->len > buflen)
+		goto out;
+	memcpy(buf, acl->data, acl->len);
+out_len:
+	ret = acl->len;
+out:
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len)
+{
+	struct nfs4_cached_acl *acl;
+	size_t buflen = sizeof(*acl) + acl_len;
+
+	if (buflen <= PAGE_SIZE) {
+		acl = kmalloc(buflen, GFP_KERNEL);
+		if (acl == NULL)
+			goto out;
+		acl->cached = 1;
+		_copy_from_pages(acl->data, pages, pgbase, acl_len);
+	} else {
+		acl = kmalloc(sizeof(*acl), GFP_KERNEL);
+		if (acl == NULL)
+			goto out;
+		acl->cached = 0;
+	}
+	acl->len = acl_len;
+out:
+	nfs4_set_cached_acl(inode, acl);
+}
+
+/*
+ * The getxattr API returns the required buffer length when called with a
+ * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
+ * the required buf.  On a NULL buf, we send a page of data to the server
+ * guessing that the ACL request can be serviced by a page. If so, we cache
+ * up to the page of ACL data, and the 2nd call to getxattr is serviced by
+ * the cache. If not so, we throw away the page, and cache the required
+ * length. The next getxattr call will then produce another round trip to
+ * the server, this time with the input buf of the required size.
+ */
+static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
+{
+	struct page *pages[NFS4ACL_MAXPAGES] = {NULL, };
+	struct nfs_getaclargs args = {
+		.fh = NFS_FH(inode),
+		.acl_pages = pages,
+		.acl_len = buflen,
+	};
+	struct nfs_getaclres res = {
+		.acl_len = buflen,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
+	int ret = -ENOMEM, i;
+
+	/* As long as we're doing a round trip to the server anyway,
+	 * let's be prepared for a page of acl data. */
+	if (npages == 0)
+		npages = 1;
+	if (npages > ARRAY_SIZE(pages))
+		return -ERANGE;
+
+	for (i = 0; i < npages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free;
+	}
+
+	/* for decoding across pages */
+	res.acl_scratch = alloc_page(GFP_KERNEL);
+	if (!res.acl_scratch)
+		goto out_free;
+
+	args.acl_len = npages * PAGE_SIZE;
+	args.acl_pgbase = 0;
+
+	dprintk("%s  buf %p buflen %zu npages %d args.acl_len %zu\n",
+		__func__, buf, buflen, npages, args.acl_len);
+	ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
+			     &msg, &args.seq_args, &res.seq_res, 0);
+	if (ret)
+		goto out_free;
+
+	/* Handle the case where the passed-in buffer is too short */
+	if (res.acl_flags & NFS4_ACL_TRUNC) {
+		/* Did the user only issue a request for the acl length? */
+		if (buf == NULL)
+			goto out_ok;
+		ret = -ERANGE;
+		goto out_free;
+	}
+	nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
+	if (buf) {
+		if (res.acl_len > buflen) {
+			ret = -ERANGE;
+			goto out_free;
+		}
+		_copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
+	}
+out_ok:
+	ret = res.acl_len;
+out_free:
+	for (i = 0; i < npages; i++)
+		if (pages[i])
+			__free_page(pages[i]);
+	if (res.acl_scratch)
+		__free_page(res.acl_scratch);
+	return ret;
+}
+
+static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
+{
+	struct nfs4_exception exception = { };
+	ssize_t ret;
+	do {
+		ret = __nfs4_get_acl_uncached(inode, buf, buflen);
+		trace_nfs4_get_acl(inode, ret);
+		if (ret >= 0)
+			break;
+		ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception);
+	} while (exception.retry);
+	return ret;
+}
+
+static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	int ret;
+
+	if (!nfs4_server_supports_acls(server))
+		return -EOPNOTSUPP;
+	ret = nfs_revalidate_inode(server, inode);
+	if (ret < 0)
+		return ret;
+	if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+		nfs_zap_acl_cache(inode);
+	ret = nfs4_read_cached_acl(inode, buf, buflen);
+	if (ret != -ENOENT)
+		/* -ENOENT is returned if there is no ACL or if there is an ACL
+		 * but no cached acl data, just the acl length */
+		return ret;
+	return nfs4_get_acl_uncached(inode, buf, buflen);
+}
+
+static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct page *pages[NFS4ACL_MAXPAGES];
+	struct nfs_setaclargs arg = {
+		.fh		= NFS_FH(inode),
+		.acl_pages	= pages,
+		.acl_len	= buflen,
+	};
+	struct nfs_setaclres res;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_SETACL],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
+	int ret, i;
+
+	if (!nfs4_server_supports_acls(server))
+		return -EOPNOTSUPP;
+	if (npages > ARRAY_SIZE(pages))
+		return -ERANGE;
+	i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+	if (i < 0)
+		return i;
+	nfs4_inode_return_delegation(inode);
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+
+	/*
+	 * Free each page after tx, so the only ref left is
+	 * held by the network stack
+	 */
+	for (; i > 0; i--)
+		put_page(pages[i-1]);
+
+	/*
+	 * Acl update can result in inode attribute update.
+	 * so mark the attribute cache invalid.
+	 */
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
+	spin_unlock(&inode->i_lock);
+	nfs_access_zap_cache(inode);
+	nfs_zap_acl_cache(inode);
+	return ret;
+}
+
+static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = __nfs4_proc_set_acl(inode, buf, buflen);
+		trace_nfs4_set_acl(inode, err);
+		err = nfs4_handle_exception(NFS_SERVER(inode), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+					size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
+	struct nfs4_label label = {0, 0, buflen, buf};
+
+	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs4_getattr_arg arg = {
+		.fh		= NFS_FH(inode),
+		.bitmask	= bitmask,
+	};
+	struct nfs4_getattr_res res = {
+		.fattr		= &fattr,
+		.label		= &label,
+		.server		= server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	int ret;
+
+	nfs_fattr_init(&fattr);
+
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0);
+	if (ret)
+		return ret;
+	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
+		return -ENOENT;
+	if (buflen < label.len)
+		return -ERANGE;
+	return 0;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+					size_t buflen)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	do {
+		err = _nfs4_get_security_label(inode, buf, buflen);
+		trace_nfs4_get_security_label(inode, err);
+		err = nfs4_handle_exception(NFS_SERVER(inode), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+		struct nfs4_label *ilabel,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *olabel)
+{
+
+	struct iattr sattr = {0};
+	struct nfs_server *server = NFS_SERVER(inode);
+	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs_setattrargs arg = {
+		.fh             = NFS_FH(inode),
+		.iap            = &sattr,
+		.server		= server,
+		.bitmask	= bitmask,
+		.label		= ilabel,
+	};
+	struct nfs_setattrres res = {
+		.fattr		= fattr,
+		.label		= olabel,
+		.server		= server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+		.rpc_argp       = &arg,
+		.rpc_resp       = &res,
+	};
+	int status;
+
+	nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+	if (status)
+		dprintk("%s failed: %d\n", __func__, status);
+
+	return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+		struct nfs4_label *ilabel,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *olabel)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = _nfs4_do_set_security_label(inode, ilabel,
+				fattr, olabel);
+		trace_nfs4_set_security_label(inode, err);
+		err = nfs4_handle_exception(NFS_SERVER(inode), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int
+nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+{
+	struct nfs4_label ilabel, *olabel = NULL;
+	struct nfs_fattr fattr;
+	struct rpc_cred *cred;
+	struct inode *inode = d_inode(dentry);
+	int status;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	nfs_fattr_init(&fattr);
+
+	ilabel.pi = 0;
+	ilabel.lfs = 0;
+	ilabel.label = (char *)buf;
+	ilabel.len = buflen;
+
+	cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return PTR_ERR(cred);
+
+	olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(olabel)) {
+		status = -PTR_ERR(olabel);
+		goto out;
+	}
+
+	status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+	if (status == 0)
+		nfs_setsecurity(inode, &fattr, olabel);
+
+	nfs4_label_free(olabel);
+out:
+	put_rpccred(cred);
+	return status;
+}
+#endif	/* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
+static int
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+			struct nfs4_state *state, long *timeout)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	if (task->tk_status >= 0)
+		return 0;
+	switch(task->tk_status) {
+		case -NFS4ERR_DELEG_REVOKED:
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OPENMODE:
+			if (state == NULL)
+				break;
+			if (nfs4_schedule_stateid_recovery(server, state) < 0)
+				goto recovery_failed;
+			goto wait_on_recovery;
+		case -NFS4ERR_EXPIRED:
+			if (state != NULL) {
+				if (nfs4_schedule_stateid_recovery(server, state) < 0)
+					goto recovery_failed;
+			}
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_STALE_CLIENTID:
+			nfs4_schedule_lease_recovery(clp);
+			goto wait_on_recovery;
+		case -NFS4ERR_MOVED:
+			if (nfs4_schedule_migration_recovery(server) < 0)
+				goto recovery_failed;
+			goto wait_on_recovery;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(clp);
+			goto wait_on_recovery;
+#if defined(CONFIG_NFS_V4_1)
+		case -NFS4ERR_BADSESSION:
+		case -NFS4ERR_BADSLOT:
+		case -NFS4ERR_BAD_HIGH_SLOT:
+		case -NFS4ERR_DEADSESSION:
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		case -NFS4ERR_SEQ_FALSE_RETRY:
+		case -NFS4ERR_SEQ_MISORDERED:
+			dprintk("%s ERROR %d, Reset session\n", __func__,
+				task->tk_status);
+			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+			goto wait_on_recovery;
+#endif /* CONFIG_NFS_V4_1 */
+		case -NFS4ERR_DELAY:
+			nfs_inc_server_stats(server, NFSIOS_DELAY);
+			rpc_delay(task, nfs4_update_delay(timeout));
+			goto restart_call;
+		case -NFS4ERR_GRACE:
+			rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		case -NFS4ERR_RETRY_UNCACHED_REP:
+		case -NFS4ERR_OLD_STATEID:
+			goto restart_call;
+	}
+	task->tk_status = nfs4_map_errors(task->tk_status);
+	return 0;
+recovery_failed:
+	task->tk_status = -EIO;
+	return 0;
+wait_on_recovery:
+	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		goto recovery_failed;
+restart_call:
+	task->tk_status = 0;
+	return -EAGAIN;
+}
+
+static void nfs4_init_boot_verifier(const struct nfs_client *clp,
+				    nfs4_verifier *bootverf)
+{
+	__be32 verf[2];
+
+	if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+		/* An impossible timestamp guarantees this value
+		 * will never match a generated boot time. */
+		verf[0] = 0;
+		verf[1] = cpu_to_be32(NSEC_PER_SEC + 1);
+	} else {
+		struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+		verf[0] = cpu_to_be32(nn->boot_time.tv_sec);
+		verf[1] = cpu_to_be32(nn->boot_time.tv_nsec);
+	}
+	memcpy(bootverf->data, verf, sizeof(bootverf->data));
+}
+
+static unsigned int
+nfs4_init_nonuniform_client_string(struct nfs_client *clp,
+				   char *buf, size_t len)
+{
+	unsigned int result;
+
+	if (clp->cl_owner_id != NULL)
+		return strlcpy(buf, clp->cl_owner_id, len);
+
+	rcu_read_lock();
+	result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
+				clp->cl_ipaddr,
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_ADDR),
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_PROTO));
+	rcu_read_unlock();
+	clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
+	return result;
+}
+
+static unsigned int
+nfs4_init_uniform_client_string(struct nfs_client *clp,
+				char *buf, size_t len)
+{
+	const char *nodename = clp->cl_rpcclient->cl_nodename;
+	unsigned int result;
+
+	if (clp->cl_owner_id != NULL)
+		return strlcpy(buf, clp->cl_owner_id, len);
+
+	if (nfs4_client_id_uniquifier[0] != '\0')
+		result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
+				clp->rpc_ops->version,
+				clp->cl_minorversion,
+				nfs4_client_id_uniquifier,
+				nodename);
+	else
+		result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
+				clp->rpc_ops->version, clp->cl_minorversion,
+				nodename);
+	clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
+	return result;
+}
+
+/*
+ * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback
+ * services.  Advertise one based on the address family of the
+ * clientaddr.
+ */
+static unsigned int
+nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
+{
+	if (strchr(clp->cl_ipaddr, ':') != NULL)
+		return scnprintf(buf, len, "tcp6");
+	else
+		return scnprintf(buf, len, "tcp");
+}
+
+static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_setclientid *sc = calldata;
+
+	if (task->tk_status == 0)
+		sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
+}
+
+static const struct rpc_call_ops nfs4_setclientid_ops = {
+	.rpc_call_done = nfs4_setclientid_done,
+};
+
+/**
+ * nfs4_proc_setclientid - Negotiate client ID
+ * @clp: state data structure
+ * @program: RPC program for NFSv4 callback service
+ * @port: IP port number for NFS4 callback service
+ * @cred: RPC credential to use for this call
+ * @res: where to place the result
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ */
+int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
+		unsigned short port, struct rpc_cred *cred,
+		struct nfs4_setclientid_res *res)
+{
+	nfs4_verifier sc_verifier;
+	struct nfs4_setclientid setclientid = {
+		.sc_verifier = &sc_verifier,
+		.sc_prog = program,
+		.sc_cb_ident = clp->cl_cb_ident,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
+		.rpc_argp = &setclientid,
+		.rpc_resp = res,
+		.rpc_cred = cred,
+	};
+	struct rpc_task *task;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_setclientid_ops,
+		.callback_data = &setclientid,
+		.flags = RPC_TASK_TIMEOUT,
+	};
+	int status;
+
+	/* nfs_client_id4 */
+	nfs4_init_boot_verifier(clp, &sc_verifier);
+	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
+		setclientid.sc_name_len =
+				nfs4_init_uniform_client_string(clp,
+						setclientid.sc_name,
+						sizeof(setclientid.sc_name));
+	else
+		setclientid.sc_name_len =
+				nfs4_init_nonuniform_client_string(clp,
+						setclientid.sc_name,
+						sizeof(setclientid.sc_name));
+	/* cb_client4 */
+	setclientid.sc_netid_len =
+				nfs4_init_callback_netid(clp,
+						setclientid.sc_netid,
+						sizeof(setclientid.sc_netid));
+	setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
+				sizeof(setclientid.sc_uaddr), "%s.%u.%u",
+				clp->cl_ipaddr, port >> 8, port & 255);
+
+	dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
+		clp->cl_rpcclient->cl_auth->au_ops->au_name,
+		setclientid.sc_name_len, setclientid.sc_name);
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task)) {
+		status = PTR_ERR(task);
+		goto out;
+	}
+	status = task->tk_status;
+	if (setclientid.sc_cred) {
+		clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
+		put_rpccred(setclientid.sc_cred);
+	}
+	rpc_put_task(task);
+out:
+	trace_nfs4_setclientid(clp, status);
+	dprintk("NFS reply setclientid: %d\n", status);
+	return status;
+}
+
+/**
+ * nfs4_proc_setclientid_confirm - Confirm client ID
+ * @clp: state data structure
+ * @res: result of a previous SETCLIENTID
+ * @cred: RPC credential to use for this call
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ */
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+		struct nfs4_setclientid_res *arg,
+		struct rpc_cred *cred)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
+		.rpc_argp = arg,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	dprintk("NFS call  setclientid_confirm auth=%s, (client ID %llx)\n",
+		clp->cl_rpcclient->cl_auth->au_ops->au_name,
+		clp->cl_clientid);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	trace_nfs4_setclientid_confirm(clp, status);
+	dprintk("NFS reply setclientid_confirm: %d\n", status);
+	return status;
+}
+
+struct nfs4_delegreturndata {
+	struct nfs4_delegreturnargs args;
+	struct nfs4_delegreturnres res;
+	struct nfs_fh fh;
+	nfs4_stateid stateid;
+	unsigned long timestamp;
+	struct nfs_fattr fattr;
+	int rpc_status;
+	struct inode *inode;
+	bool roc;
+	u32 roc_barrier;
+};
+
+static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_delegreturndata *data = calldata;
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
+
+	trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
+	switch (task->tk_status) {
+	case 0:
+		renew_lease(data->res.server, data->timestamp);
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_EXPIRED:
+		task->tk_status = 0;
+		if (data->roc)
+			pnfs_roc_set_barrier(data->inode, data->roc_barrier);
+		break;
+	default:
+		if (nfs4_async_handle_error(task, data->res.server,
+					    NULL, NULL) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
+			return;
+		}
+	}
+	data->rpc_status = task->tk_status;
+}
+
+static void nfs4_delegreturn_release(void *calldata)
+{
+	struct nfs4_delegreturndata *data = calldata;
+	struct inode *inode = data->inode;
+
+	if (inode) {
+		if (data->roc)
+			pnfs_roc_release(inode);
+		nfs_iput_and_deactive(inode);
+	}
+	kfree(calldata);
+}
+
+static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_delegreturndata *d_data;
+
+	d_data = (struct nfs4_delegreturndata *)data;
+
+	if (d_data->roc &&
+	    pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
+		return;
+
+	nfs4_setup_sequence(d_data->res.server,
+			&d_data->args.seq_args,
+			&d_data->res.seq_res,
+			task);
+}
+
+static const struct rpc_call_ops nfs4_delegreturn_ops = {
+	.rpc_call_prepare = nfs4_delegreturn_prepare,
+	.rpc_call_done = nfs4_delegreturn_done,
+	.rpc_release = nfs4_delegreturn_release,
+};
+
+static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
+{
+	struct nfs4_delegreturndata *data;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_delegreturn_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status = 0;
+
+	data = kzalloc(sizeof(*data), GFP_NOFS);
+	if (data == NULL)
+		return -ENOMEM;
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+	data->args.fhandle = &data->fh;
+	data->args.stateid = &data->stateid;
+	data->args.bitmask = server->cache_consistency_bitmask;
+	nfs_copy_fh(&data->fh, NFS_FH(inode));
+	nfs4_stateid_copy(&data->stateid, stateid);
+	data->res.fattr = &data->fattr;
+	data->res.server = server;
+	nfs_fattr_init(data->res.fattr);
+	data->timestamp = jiffies;
+	data->rpc_status = 0;
+	data->inode = nfs_igrab_and_active(inode);
+	if (data->inode)
+		data->roc = nfs4_roc(inode);
+
+	task_setup_data.callback_data = data;
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	if (!issync)
+		goto out;
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status != 0)
+		goto out;
+	status = data->rpc_status;
+	if (status == 0)
+		nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+	else
+		nfs_refresh_inode(inode, &data->fattr);
+out:
+	rpc_put_task(task);
+	return status;
+}
+
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
+		trace_nfs4_delegreturn(inode, err);
+		switch (err) {
+			case -NFS4ERR_STALE_STATEID:
+			case -NFS4ERR_EXPIRED:
+			case 0:
+				return 0;
+		}
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+	return err;
+}
+
+#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+
+/* 
+ * sleep, with exponential backoff, and retry the LOCK operation. 
+ */
+static unsigned long
+nfs4_set_lock_task_retry(unsigned long timeout)
+{
+	freezable_schedule_timeout_killable_unsafe(timeout);
+	timeout <<= 1;
+	if (timeout > NFS4_LOCK_MAXTIMEOUT)
+		return NFS4_LOCK_MAXTIMEOUT;
+	return timeout;
+}
+
+static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct inode *inode = state->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs_lockt_args arg = {
+		.fh = NFS_FH(inode),
+		.fl = request,
+	};
+	struct nfs_lockt_res res = {
+		.denied = request,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_LOCKT],
+		.rpc_argp       = &arg,
+		.rpc_resp       = &res,
+		.rpc_cred	= state->owner->so_cred,
+	};
+	struct nfs4_lock_state *lsp;
+	int status;
+
+	arg.lock_owner.clientid = clp->cl_clientid;
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
+		goto out;
+	lsp = request->fl_u.nfs4_fl.owner;
+	arg.lock_owner.id = lsp->ls_seqid.owner_id;
+	arg.lock_owner.s_dev = server->s_dev;
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+	switch (status) {
+		case 0:
+			request->fl_type = F_UNLCK;
+			break;
+		case -NFS4ERR_DENIED:
+			status = 0;
+	}
+	request->fl_ops->fl_release_private(request);
+	request->fl_ops = NULL;
+out:
+	return status;
+}
+
+static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = _nfs4_proc_getlk(state, cmd, request);
+		trace_nfs4_get_lock(request, state, cmd, err);
+		err = nfs4_handle_exception(NFS_SERVER(state->inode), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int do_vfs_lock(struct file *file, struct file_lock *fl)
+{
+	int res = 0;
+	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+		case FL_POSIX:
+			res = posix_lock_file_wait(file, fl);
+			break;
+		case FL_FLOCK:
+			res = flock_lock_file_wait(file, fl);
+			break;
+		default:
+			BUG();
+	}
+	return res;
+}
+
+struct nfs4_unlockdata {
+	struct nfs_locku_args arg;
+	struct nfs_locku_res res;
+	struct nfs4_lock_state *lsp;
+	struct nfs_open_context *ctx;
+	struct file_lock fl;
+	const struct nfs_server *server;
+	unsigned long timestamp;
+};
+
+static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
+		struct nfs_open_context *ctx,
+		struct nfs4_lock_state *lsp,
+		struct nfs_seqid *seqid)
+{
+	struct nfs4_unlockdata *p;
+	struct inode *inode = lsp->ls_state->inode;
+
+	p = kzalloc(sizeof(*p), GFP_NOFS);
+	if (p == NULL)
+		return NULL;
+	p->arg.fh = NFS_FH(inode);
+	p->arg.fl = &p->fl;
+	p->arg.seqid = seqid;
+	p->res.seqid = seqid;
+	p->lsp = lsp;
+	atomic_inc(&lsp->ls_count);
+	/* Ensure we don't close file until we're done freeing locks! */
+	p->ctx = get_nfs_open_context(ctx);
+	memcpy(&p->fl, fl, sizeof(p->fl));
+	p->server = NFS_SERVER(inode);
+	return p;
+}
+
+static void nfs4_locku_release_calldata(void *data)
+{
+	struct nfs4_unlockdata *calldata = data;
+	nfs_free_seqid(calldata->arg.seqid);
+	nfs4_put_lock_state(calldata->lsp);
+	put_nfs_open_context(calldata->ctx);
+	kfree(calldata);
+}
+
+static void nfs4_locku_done(struct rpc_task *task, void *data)
+{
+	struct nfs4_unlockdata *calldata = data;
+
+	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
+		return;
+	switch (task->tk_status) {
+		case 0:
+			renew_lease(calldata->server, calldata->timestamp);
+			do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
+			if (nfs4_update_lock_stateid(calldata->lsp,
+					&calldata->res.stateid))
+				break;
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OLD_STATEID:
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_EXPIRED:
+			if (!nfs4_stateid_match(&calldata->arg.stateid,
+						&calldata->lsp->ls_stateid))
+				rpc_restart_call_prepare(task);
+			break;
+		default:
+			if (nfs4_async_handle_error(task, calldata->server,
+						    NULL, NULL) == -EAGAIN)
+				rpc_restart_call_prepare(task);
+	}
+	nfs_release_seqid(calldata->arg.seqid);
+}
+
+static void nfs4_locku_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_unlockdata *calldata = data;
+
+	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
+		goto out_wait;
+	nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
+	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
+		/* Note: exit _without_ running nfs4_locku_done */
+		goto out_no_action;
+	}
+	calldata->timestamp = jiffies;
+	if (nfs4_setup_sequence(calldata->server,
+				&calldata->arg.seq_args,
+				&calldata->res.seq_res,
+				task) != 0)
+		nfs_release_seqid(calldata->arg.seqid);
+	return;
+out_no_action:
+	task->tk_action = NULL;
+out_wait:
+	nfs4_sequence_done(task, &calldata->res.seq_res);
+}
+
+static const struct rpc_call_ops nfs4_locku_ops = {
+	.rpc_call_prepare = nfs4_locku_prepare,
+	.rpc_call_done = nfs4_locku_done,
+	.rpc_release = nfs4_locku_release_calldata,
+};
+
+static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
+		struct nfs_open_context *ctx,
+		struct nfs4_lock_state *lsp,
+		struct nfs_seqid *seqid)
+{
+	struct nfs4_unlockdata *data;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
+		.rpc_cred = ctx->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(lsp->ls_state->inode),
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_locku_ops,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+
+	nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
+		NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg);
+
+	/* Ensure this is an unlock - when canceling a lock, the
+	 * canceled lock is passed in, and it won't be an unlock.
+	 */
+	fl->fl_type = F_UNLCK;
+
+	data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
+	if (data == NULL) {
+		nfs_free_seqid(seqid);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+	task_setup_data.callback_data = data;
+	return rpc_run_task(&task_setup_data);
+}
+
+static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct inode *inode = state->inode;
+	struct nfs4_state_owner *sp = state->owner;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_seqid *seqid;
+	struct nfs4_lock_state *lsp;
+	struct rpc_task *task;
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+	int status = 0;
+	unsigned char fl_flags = request->fl_flags;
+
+	status = nfs4_set_lock_state(state, request);
+	/* Unlock _before_ we do the RPC call */
+	request->fl_flags |= FL_EXISTS;
+	/* Exclude nfs_delegation_claim_locks() */
+	mutex_lock(&sp->so_delegreturn_mutex);
+	/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
+	down_read(&nfsi->rwsem);
+	if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
+		up_read(&nfsi->rwsem);
+		mutex_unlock(&sp->so_delegreturn_mutex);
+		goto out;
+	}
+	up_read(&nfsi->rwsem);
+	mutex_unlock(&sp->so_delegreturn_mutex);
+	if (status != 0)
+		goto out;
+	/* Is this a delegated lock? */
+	lsp = request->fl_u.nfs4_fl.owner;
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
+		goto out;
+	alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
+	seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
+	status = -ENOMEM;
+	if (IS_ERR(seqid))
+		goto out;
+	task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
+	status = PTR_ERR(task);
+	if (IS_ERR(task))
+		goto out;
+	status = nfs4_wait_for_completion_rpc_task(task);
+	rpc_put_task(task);
+out:
+	request->fl_flags = fl_flags;
+	trace_nfs4_unlock(request, state, F_SETLK, status);
+	return status;
+}
+
+struct nfs4_lockdata {
+	struct nfs_lock_args arg;
+	struct nfs_lock_res res;
+	struct nfs4_lock_state *lsp;
+	struct nfs_open_context *ctx;
+	struct file_lock fl;
+	unsigned long timestamp;
+	int rpc_status;
+	int cancelled;
+	struct nfs_server *server;
+};
+
+static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
+		struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
+		gfp_t gfp_mask)
+{
+	struct nfs4_lockdata *p;
+	struct inode *inode = lsp->ls_state->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+
+	p = kzalloc(sizeof(*p), gfp_mask);
+	if (p == NULL)
+		return NULL;
+
+	p->arg.fh = NFS_FH(inode);
+	p->arg.fl = &p->fl;
+	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
+	if (IS_ERR(p->arg.open_seqid))
+		goto out_free;
+	alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+	p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
+	if (IS_ERR(p->arg.lock_seqid))
+		goto out_free_seqid;
+	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
+	p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
+	p->arg.lock_owner.s_dev = server->s_dev;
+	p->res.lock_seqid = p->arg.lock_seqid;
+	p->lsp = lsp;
+	p->server = server;
+	atomic_inc(&lsp->ls_count);
+	p->ctx = get_nfs_open_context(ctx);
+	get_file(fl->fl_file);
+	memcpy(&p->fl, fl, sizeof(p->fl));
+	return p;
+out_free_seqid:
+	nfs_free_seqid(p->arg.open_seqid);
+out_free:
+	kfree(p);
+	return NULL;
+}
+
+static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_lockdata *data = calldata;
+	struct nfs4_state *state = data->lsp->ls_state;
+
+	dprintk("%s: begin!\n", __func__);
+	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
+		goto out_wait;
+	/* Do we need to do an open_to_lock_owner? */
+	if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
+		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
+			goto out_release_lock_seqid;
+		}
+		nfs4_stateid_copy(&data->arg.open_stateid,
+				&state->open_stateid);
+		data->arg.new_lock_owner = 1;
+		data->res.open_seqid = data->arg.open_seqid;
+	} else {
+		data->arg.new_lock_owner = 0;
+		nfs4_stateid_copy(&data->arg.lock_stateid,
+				&data->lsp->ls_stateid);
+	}
+	if (!nfs4_valid_open_stateid(state)) {
+		data->rpc_status = -EBADF;
+		task->tk_action = NULL;
+		goto out_release_open_seqid;
+	}
+	data->timestamp = jiffies;
+	if (nfs4_setup_sequence(data->server,
+				&data->arg.seq_args,
+				&data->res.seq_res,
+				task) == 0)
+		return;
+out_release_open_seqid:
+	nfs_release_seqid(data->arg.open_seqid);
+out_release_lock_seqid:
+	nfs_release_seqid(data->arg.lock_seqid);
+out_wait:
+	nfs4_sequence_done(task, &data->res.seq_res);
+	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
+}
+
+static void nfs4_lock_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_lockdata *data = calldata;
+	struct nfs4_lock_state *lsp = data->lsp;
+
+	dprintk("%s: begin!\n", __func__);
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
+
+	data->rpc_status = task->tk_status;
+	switch (task->tk_status) {
+	case 0:
+		renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
+				data->timestamp);
+		if (data->arg.new_lock) {
+			data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+			if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
+				rpc_restart_call_prepare(task);
+				break;
+			}
+		}
+		if (data->arg.new_lock_owner != 0) {
+			nfs_confirm_seqid(&lsp->ls_seqid, 0);
+			nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
+			set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+		} else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
+			rpc_restart_call_prepare(task);
+		break;
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_EXPIRED:
+		if (data->arg.new_lock_owner != 0) {
+			if (!nfs4_stateid_match(&data->arg.open_stateid,
+						&lsp->ls_state->open_stateid))
+				rpc_restart_call_prepare(task);
+		} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
+						&lsp->ls_stateid))
+				rpc_restart_call_prepare(task);
+	}
+	dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
+}
+
+static void nfs4_lock_release(void *calldata)
+{
+	struct nfs4_lockdata *data = calldata;
+
+	dprintk("%s: begin!\n", __func__);
+	nfs_free_seqid(data->arg.open_seqid);
+	if (data->cancelled != 0) {
+		struct rpc_task *task;
+		task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
+				data->arg.lock_seqid);
+		if (!IS_ERR(task))
+			rpc_put_task_async(task);
+		dprintk("%s: cancelling lock!\n", __func__);
+	} else
+		nfs_free_seqid(data->arg.lock_seqid);
+	nfs4_put_lock_state(data->lsp);
+	put_nfs_open_context(data->ctx);
+	fput(data->fl.fl_file);
+	kfree(data);
+	dprintk("%s: done!\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_lock_ops = {
+	.rpc_call_prepare = nfs4_lock_prepare,
+	.rpc_call_done = nfs4_lock_done,
+	.rpc_release = nfs4_lock_release,
+};
+
+static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
+{
+	switch (error) {
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+		if (new_lock_owner != 0 ||
+		   test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0)
+			nfs4_schedule_stateid_recovery(server, lsp->ls_state);
+		break;
+	case -NFS4ERR_STALE_STATEID:
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+	case -NFS4ERR_EXPIRED:
+		nfs4_schedule_lease_recovery(server->nfs_client);
+	};
+}
+
+static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type)
+{
+	struct nfs4_lockdata *data;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
+		.rpc_cred = state->owner->so_cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(state->inode),
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_lock_ops,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int ret;
+
+	dprintk("%s: begin!\n", __func__);
+	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
+			fl->fl_u.nfs4_fl.owner,
+			recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
+	if (data == NULL)
+		return -ENOMEM;
+	if (IS_SETLKW(cmd))
+		data->arg.block = 1;
+	nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+	task_setup_data.callback_data = data;
+	if (recovery_type > NFS_LOCK_NEW) {
+		if (recovery_type == NFS_LOCK_RECLAIM)
+			data->arg.reclaim = NFS_LOCK_RECLAIM;
+		nfs4_set_sequence_privileged(&data->arg.seq_args);
+	} else
+		data->arg.new_lock = 1;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	ret = nfs4_wait_for_completion_rpc_task(task);
+	if (ret == 0) {
+		ret = data->rpc_status;
+		if (ret)
+			nfs4_handle_setlk_error(data->server, data->lsp,
+					data->arg.new_lock_owner, ret);
+	} else
+		data->cancelled = 1;
+	rpc_put_task(task);
+	dprintk("%s: done, ret = %d!\n", __func__, ret);
+	return ret;
+}
+
+static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs4_exception exception = {
+		.inode = state->inode,
+	};
+	int err;
+
+	do {
+		/* Cache the lock if possible... */
+		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
+			return 0;
+		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
+		trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
+		if (err != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs4_exception exception = {
+		.inode = state->inode,
+	};
+	int err;
+
+	err = nfs4_set_lock_state(state, request);
+	if (err != 0)
+		return err;
+	if (!recover_lost_locks) {
+		set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
+		return 0;
+	}
+	do {
+		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
+			return 0;
+		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
+		trace_nfs4_lock_expired(request, state, F_SETLK, err);
+		switch (err) {
+		default:
+			goto out;
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+			nfs4_handle_exception(server, err, &exception);
+			err = 0;
+		}
+	} while (exception.retry);
+out:
+	return err;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/**
+ * nfs41_check_expired_locks - possibly free a lock stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_expired_locks(struct nfs4_state *state)
+{
+	int status, ret = -NFS4ERR_BAD_STATEID;
+	struct nfs4_lock_state *lsp;
+	struct nfs_server *server = NFS_SERVER(state->inode);
+
+	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+			status = nfs41_test_stateid(server,
+					&lsp->ls_stateid,
+					cred);
+			trace_nfs4_test_lock_stateid(state, lsp, status);
+			if (status != NFS_OK) {
+				/* Free the stateid unless the server
+				 * informs us the stateid is unrecognized. */
+				if (status != -NFS4ERR_BAD_STATEID)
+					nfs41_free_stateid(server,
+							&lsp->ls_stateid,
+							cred);
+				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+				ret = status;
+			}
+		}
+	};
+
+	return ret;
+}
+
+static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+	int status = NFS_OK;
+
+	if (test_bit(LK_STATE_IN_USE, &state->flags))
+		status = nfs41_check_expired_locks(state);
+	if (status != NFS_OK)
+		status = nfs4_lock_expired(state, request);
+	return status;
+}
+#endif
+
+static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct nfs_inode *nfsi = NFS_I(state->inode);
+	unsigned char fl_flags = request->fl_flags;
+	int status = -ENOLCK;
+
+	if ((fl_flags & FL_POSIX) &&
+			!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+		goto out;
+	/* Is this a delegated open? */
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
+		goto out;
+	request->fl_flags |= FL_ACCESS;
+	status = do_vfs_lock(request->fl_file, request);
+	if (status < 0)
+		goto out;
+	down_read(&nfsi->rwsem);
+	if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+		/* Yes: cache locks! */
+		/* ...but avoid races with delegation recall... */
+		request->fl_flags = fl_flags & ~FL_SLEEP;
+		status = do_vfs_lock(request->fl_file, request);
+		up_read(&nfsi->rwsem);
+		goto out;
+	}
+	up_read(&nfsi->rwsem);
+	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
+out:
+	request->fl_flags = fl_flags;
+	return status;
+}
+
+static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct nfs4_exception exception = {
+		.state = state,
+		.inode = state->inode,
+	};
+	int err;
+
+	do {
+		err = _nfs4_proc_setlk(state, cmd, request);
+		trace_nfs4_set_lock(request, state, cmd, err);
+		if (err == -NFS4ERR_DENIED)
+			err = -EAGAIN;
+		err = nfs4_handle_exception(NFS_SERVER(state->inode),
+				err, &exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int
+nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
+{
+	struct nfs_open_context *ctx;
+	struct nfs4_state *state;
+	unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
+	int status;
+
+	/* verify open state */
+	ctx = nfs_file_open_context(filp);
+	state = ctx->state;
+
+	if (request->fl_start < 0 || request->fl_end < 0)
+		return -EINVAL;
+
+	if (IS_GETLK(cmd)) {
+		if (state != NULL)
+			return nfs4_proc_getlk(state, F_GETLK, request);
+		return 0;
+	}
+
+	if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
+		return -EINVAL;
+
+	if (request->fl_type == F_UNLCK) {
+		if (state != NULL)
+			return nfs4_proc_unlck(state, cmd, request);
+		return 0;
+	}
+
+	if (state == NULL)
+		return -ENOLCK;
+	/*
+	 * Don't rely on the VFS having checked the file open mode,
+	 * since it won't do this for flock() locks.
+	 */
+	switch (request->fl_type) {
+	case F_RDLCK:
+		if (!(filp->f_mode & FMODE_READ))
+			return -EBADF;
+		break;
+	case F_WRLCK:
+		if (!(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+	}
+
+	do {
+		status = nfs4_proc_setlk(state, cmd, request);
+		if ((status != -EAGAIN) || IS_SETLK(cmd))
+			break;
+		timeout = nfs4_set_lock_task_retry(timeout);
+		status = -ERESTARTSYS;
+		if (signalled())
+			break;
+	} while(status < 0);
+	return status;
+}
+
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	int err;
+
+	err = nfs4_set_lock_state(state, fl);
+	if (err != 0)
+		return err;
+	err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+	return nfs4_handle_delegation_recall_error(server, state, stateid, err);
+}
+
+struct nfs_release_lockowner_data {
+	struct nfs4_lock_state *lsp;
+	struct nfs_server *server;
+	struct nfs_release_lockowner_args args;
+	struct nfs_release_lockowner_res res;
+	unsigned long timestamp;
+};
+
+static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_release_lockowner_data *data = calldata;
+	struct nfs_server *server = data->server;
+	nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+			     &data->args.seq_args, &data->res.seq_res, task);
+	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+	data->timestamp = jiffies;
+}
+
+static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_release_lockowner_data *data = calldata;
+	struct nfs_server *server = data->server;
+
+	nfs40_sequence_done(task, &data->res.seq_res);
+
+	switch (task->tk_status) {
+	case 0:
+		renew_lease(server, data->timestamp);
+		break;
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_EXPIRED:
+		nfs4_schedule_lease_recovery(server->nfs_client);
+		break;
+	case -NFS4ERR_LEASE_MOVED:
+	case -NFS4ERR_DELAY:
+		if (nfs4_async_handle_error(task, server,
+					    NULL, NULL) == -EAGAIN)
+			rpc_restart_call_prepare(task);
+	}
+}
+
+static void nfs4_release_lockowner_release(void *calldata)
+{
+	struct nfs_release_lockowner_data *data = calldata;
+	nfs4_free_lock_state(data->server, data->lsp);
+	kfree(calldata);
+}
+
+static const struct rpc_call_ops nfs4_release_lockowner_ops = {
+	.rpc_call_prepare = nfs4_release_lockowner_prepare,
+	.rpc_call_done = nfs4_release_lockowner_done,
+	.rpc_release = nfs4_release_lockowner_release,
+};
+
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+	struct nfs_release_lockowner_data *data;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
+	};
+
+	if (server->nfs_client->cl_mvops->minor_version != 0)
+		return;
+
+	data = kmalloc(sizeof(*data), GFP_NOFS);
+	if (!data)
+		return;
+	data->lsp = lsp;
+	data->server = server;
+	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+	data->args.lock_owner.id = lsp->ls_seqid.owner_id;
+	data->args.lock_owner.s_dev = server->s_dev;
+
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
+}
+
+#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
+
+static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
+				   const void *buf, size_t buflen,
+				   int flags, int type)
+{
+	if (strcmp(key, "") != 0)
+		return -EINVAL;
+
+	return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
+}
+
+static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
+				   void *buf, size_t buflen, int type)
+{
+	if (strcmp(key, "") != 0)
+		return -EINVAL;
+
+	return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
+}
+
+static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
+				       size_t list_len, const char *name,
+				       size_t name_len, int type)
+{
+	size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
+
+	if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))))
+		return 0;
+
+	if (list && len <= list_len)
+		memcpy(list, XATTR_NAME_NFSV4_ACL, len);
+	return len;
+}
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline int nfs4_server_supports_labels(struct nfs_server *server)
+{
+	return server->caps & NFS_CAP_SECURITY_LABEL;
+}
+
+static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
+				   const void *buf, size_t buflen,
+				   int flags, int type)
+{
+	if (security_ismaclabel(key))
+		return nfs4_set_security_label(dentry, buf, buflen);
+
+	return -EOPNOTSUPP;
+}
+
+static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
+				   void *buf, size_t buflen, int type)
+{
+	if (security_ismaclabel(key))
+		return nfs4_get_security_label(d_inode(dentry), buf, buflen);
+	return -EOPNOTSUPP;
+}
+
+static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
+				       size_t list_len, const char *name,
+				       size_t name_len, int type)
+{
+	size_t len = 0;
+
+	if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
+		len = security_inode_listsecurity(d_inode(dentry), NULL, 0);
+		if (list && len <= list_len)
+			security_inode_listsecurity(d_inode(dentry), list, len);
+	}
+	return len;
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list	= nfs4_xattr_list_nfs4_label,
+	.get	= nfs4_xattr_get_nfs4_label,
+	.set	= nfs4_xattr_set_nfs4_label,
+};
+#endif
+
+
+/*
+ * nfs_fhget will use either the mounted_on_fileid or the fileid
+ */
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+	if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) ||
+	       (fattr->valid & NFS_ATTR_FATTR_FILEID)) &&
+	      (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+	      (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)))
+		return;
+
+	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+		NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL;
+	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	fattr->nlink = 2;
+}
+
+static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
+				   const struct qstr *name,
+				   struct nfs4_fs_locations *fs_locations,
+				   struct page *page)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	u32 bitmask[3] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.dir_fh = NFS_FH(dir),
+		.name = name,
+		.page = page,
+		.bitmask = bitmask,
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations = fs_locations,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	dprintk("%s: start\n", __func__);
+
+	/* Ask for the fileid of the absent filesystem if mounted_on_fileid
+	 * is not supported */
+	if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+		bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+	else
+		bitmask[0] |= FATTR4_WORD0_FILEID;
+
+	nfs_fattr_init(&fs_locations->fattr);
+	fs_locations->server = server;
+	fs_locations->nlocations = 0;
+	status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0);
+	dprintk("%s: returned status = %d\n", __func__, status);
+	return status;
+}
+
+int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
+			   const struct qstr *name,
+			   struct nfs4_fs_locations *fs_locations,
+			   struct page *page)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs4_proc_fs_locations(client, dir, name,
+				fs_locations, page);
+		trace_nfs4_get_fs_locations(dir, name, err);
+		err = nfs4_handle_exception(NFS_SERVER(dir), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery.  The server can stop returning
+ * NFS4ERR_LEASE_MOVED to this client.  A RENEW operation is
+ * appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_get_locations(struct inode *inode,
+				     struct nfs4_fs_locations *locations,
+				     struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.clientid	= server->nfs_client->cl_clientid,
+		.fh		= NFS_FH(inode),
+		.page		= page,
+		.bitmask	= bitmask,
+		.migration	= 1,		/* skip LOOKUP */
+		.renew		= 1,		/* append RENEW */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations	= locations,
+		.migration	= 1,
+		.renew		= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+					&args.seq_args, &res.seq_res);
+	if (status)
+		return status;
+
+	renew_lease(server, now);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery.  The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID
+ * performing this operation is identified in the SEQUENCE
+ * operation in this compound.
+ *
+ * When the client supports GETATTR(fs_locations_info), it can
+ * be plumbed in here.
+ */
+static int _nfs41_proc_get_locations(struct inode *inode,
+				     struct nfs4_fs_locations *locations,
+				     struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.fh		= NFS_FH(inode),
+		.page		= page,
+		.bitmask	= bitmask,
+		.migration	= 1,		/* skip LOOKUP */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations	= locations,
+		.migration	= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status;
+
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+					&args.seq_args, &res.seq_res);
+	if (status == NFS4_OK &&
+	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		status = -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_get_locations - discover locations for a migrated FSID
+ * @inode: inode on FSID that is migrating
+ * @locations: result of query
+ * @page: buffer
+ * @cred: credential to use for this operation
+ *
+ * Returns NFS4_OK on success, a negative NFS4ERR status code if the
+ * operation failed, or a negative errno if a local error occurred.
+ *
+ * On success, "locations" is filled in, but if the server has
+ * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not
+ * asserted.
+ *
+ * -NFS4ERR_LEASE_MOVED is returned if the server still has leases
+ * from this client that require migration recovery.
+ */
+int nfs4_proc_get_locations(struct inode *inode,
+			    struct nfs4_fs_locations *locations,
+			    struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+					clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = { };
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	do {
+		status = ops->get_locations(inode, locations, page, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	return status;
+}
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery.  The server can stop
+ * returning NFS4ERR_LEASE_MOVED to this client.  A RENEW operation
+ * is appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct nfs4_fsid_present_arg args = {
+		.fh		= NFS_FH(inode),
+		.clientid	= clp->cl_clientid,
+		.renew		= 1,		/* append RENEW */
+	};
+	struct nfs4_fsid_present_res res = {
+		.renew		= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	res.fh = nfs_alloc_fhandle();
+	if (res.fh == NULL)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+						&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status)
+		return status;
+
+	do_renew_lease(clp, now);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery.  The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID performing
+ * this operation is identified in the SEQUENCE operation in this
+ * compound.
+ */
+static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	struct nfs4_fsid_present_arg args = {
+		.fh		= NFS_FH(inode),
+	};
+	struct nfs4_fsid_present_res res = {
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status;
+
+	res.fh = nfs_alloc_fhandle();
+	if (res.fh == NULL)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+						&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status == NFS4_OK &&
+	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		status = -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_fsid_present - Is this FSID present or absent on server?
+ * @inode: inode on FSID to check
+ * @cred: credential to use for this operation
+ *
+ * Server indicates whether the FSID is present, moved, or not
+ * recognized.  This operation is necessary to clear a LEASE_MOVED
+ * condition for this client ID.
+ *
+ * Returns NFS4_OK if the FSID is present on this server,
+ * -NFS4ERR_MOVED if the FSID is no longer present, a negative
+ *  NFS4ERR code if some error occurred on the server, or a
+ *  negative errno if a local failure occurred.
+ */
+int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+					clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = { };
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	do {
+		status = ops->fsid_present(inode, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	return status;
+}
+
+/**
+ * If 'use_integrity' is true and the state managment nfs_client
+ * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
+ * and the machine credential as per RFC3530bis and RFC5661 Security
+ * Considerations sections. Otherwise, just use the user cred with the
+ * filesystem's rpc_client.
+ */
+static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity)
+{
+	int status;
+	struct nfs4_secinfo_arg args = {
+		.dir_fh = NFS_FH(dir),
+		.name   = name,
+	};
+	struct nfs4_secinfo_res res = {
+		.flavors     = flavors,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+	struct rpc_cred *cred = NULL;
+
+	if (use_integrity) {
+		clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
+		cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client);
+		msg.rpc_cred = cred;
+	}
+
+	dprintk("NFS call  secinfo %s\n", name->name);
+
+	nfs4_state_protect(NFS_SERVER(dir)->nfs_client,
+		NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+
+	status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
+				&res.seq_res, 0);
+	dprintk("NFS reply  secinfo: %d\n", status);
+
+	if (cred)
+		put_rpccred(cred);
+
+	return status;
+}
+
+int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
+		      struct nfs4_secinfo_flavors *flavors)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = -NFS4ERR_WRONGSEC;
+
+		/* try to use integrity protection with machine cred */
+		if (_nfs4_is_integrity_protected(NFS_SERVER(dir)->nfs_client))
+			err = _nfs4_proc_secinfo(dir, name, flavors, true);
+
+		/*
+		 * if unable to use integrity protection, or SECINFO with
+		 * integrity protection returns NFS4ERR_WRONGSEC (which is
+		 * disallowed by spec, but exists in deployed servers) use
+		 * the current filesystem's rpc_client and the user cred.
+		 */
+		if (err == -NFS4ERR_WRONGSEC)
+			err = _nfs4_proc_secinfo(dir, name, flavors, false);
+
+		trace_nfs4_secinfo(dir, name, err);
+		err = nfs4_handle_exception(NFS_SERVER(dir), err,
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Check the exchange flags returned by the server for invalid flags, having
+ * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or
+ * DS flags set.
+ */
+static int nfs4_check_cl_exchange_flags(u32 flags)
+{
+	if (flags & ~EXCHGID4_FLAG_MASK_R)
+		goto out_inval;
+	if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
+	    (flags & EXCHGID4_FLAG_USE_NON_PNFS))
+		goto out_inval;
+	if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
+		goto out_inval;
+	return NFS_OK;
+out_inval:
+	return -NFS4ERR_INVAL;
+}
+
+static bool
+nfs41_same_server_scope(struct nfs41_server_scope *a,
+			struct nfs41_server_scope *b)
+{
+	if (a->server_scope_sz == b->server_scope_sz &&
+	    memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0)
+		return true;
+
+	return false;
+}
+
+/*
+ * nfs4_proc_bind_conn_to_session()
+ *
+ * The 4.1 client currently uses the same TCP connection for the
+ * fore and backchannel.
+ */
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	int status;
+	struct nfs41_bind_conn_to_session_args args = {
+		.client = clp,
+		.dir = NFS4_CDFC4_FORE_OR_BOTH,
+	};
+	struct nfs41_bind_conn_to_session_res res;
+	struct rpc_message msg = {
+		.rpc_proc =
+			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+
+	dprintk("--> %s\n", __func__);
+
+	nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
+	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+		args.dir = NFS4_CDFC4_FORE;
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	trace_nfs4_bind_conn_to_session(clp, status);
+	if (status == 0) {
+		if (memcmp(res.sessionid.data,
+		    clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
+			dprintk("NFS: %s: Session ID mismatch\n", __func__);
+			status = -EIO;
+			goto out;
+		}
+		if ((res.dir & args.dir) != res.dir || res.dir == 0) {
+			dprintk("NFS: %s: Unexpected direction from server\n",
+				__func__);
+			status = -EIO;
+			goto out;
+		}
+		if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
+			dprintk("NFS: %s: Server returned RDMA mode = true\n",
+				__func__);
+			status = -EIO;
+			goto out;
+		}
+	}
+out:
+	dprintk("<-- %s status= %d\n", __func__, status);
+	return status;
+}
+
+/*
+ * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
+ * and operations we'd like to see to enable certain features in the allow map
+ */
+static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
+	.how = SP4_MACH_CRED,
+	.enforce.u.words = {
+		[1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+		      1 << (OP_EXCHANGE_ID - 32) |
+		      1 << (OP_CREATE_SESSION - 32) |
+		      1 << (OP_DESTROY_SESSION - 32) |
+		      1 << (OP_DESTROY_CLIENTID - 32)
+	},
+	.allow.u.words = {
+		[0] = 1 << (OP_CLOSE) |
+		      1 << (OP_LOCKU) |
+		      1 << (OP_COMMIT),
+		[1] = 1 << (OP_SECINFO - 32) |
+		      1 << (OP_SECINFO_NO_NAME - 32) |
+		      1 << (OP_TEST_STATEID - 32) |
+		      1 << (OP_FREE_STATEID - 32) |
+		      1 << (OP_WRITE - 32)
+	}
+};
+
+/*
+ * Select the state protection mode for client `clp' given the server results
+ * from exchange_id in `sp'.
+ *
+ * Returns 0 on success, negative errno otherwise.
+ */
+static int nfs4_sp4_select_mode(struct nfs_client *clp,
+				 struct nfs41_state_protection *sp)
+{
+	static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = {
+		[1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+		      1 << (OP_EXCHANGE_ID - 32) |
+		      1 << (OP_CREATE_SESSION - 32) |
+		      1 << (OP_DESTROY_SESSION - 32) |
+		      1 << (OP_DESTROY_CLIENTID - 32)
+	};
+	unsigned int i;
+
+	if (sp->how == SP4_MACH_CRED) {
+		/* Print state protect result */
+		dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n");
+		for (i = 0; i <= LAST_NFS4_OP; i++) {
+			if (test_bit(i, sp->enforce.u.longs))
+				dfprintk(MOUNT, "  enforce op %d\n", i);
+			if (test_bit(i, sp->allow.u.longs))
+				dfprintk(MOUNT, "  allow op %d\n", i);
+		}
+
+		/* make sure nothing is on enforce list that isn't supported */
+		for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) {
+			if (sp->enforce.u.words[i] & ~supported_enforce[i]) {
+				dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
+				return -EINVAL;
+			}
+		}
+
+		/*
+		 * Minimal mode - state operations are allowed to use machine
+		 * credential.  Note this already happens by default, so the
+		 * client doesn't have to do anything more than the negotiation.
+		 *
+		 * NOTE: we don't care if EXCHANGE_ID is in the list -
+		 *       we're already using the machine cred for exchange_id
+		 *       and will never use a different cred.
+		 */
+		if (test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) &&
+		    test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) &&
+		    test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) &&
+		    test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) {
+			dfprintk(MOUNT, "sp4_mach_cred:\n");
+			dfprintk(MOUNT, "  minimal mode enabled\n");
+			set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags);
+		} else {
+			dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
+			return -EINVAL;
+		}
+
+		if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+		    test_bit(OP_LOCKU, sp->allow.u.longs)) {
+			dfprintk(MOUNT, "  cleanup mode enabled\n");
+			set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
+		}
+
+		if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
+		    test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
+			dfprintk(MOUNT, "  secinfo mode enabled\n");
+			set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags);
+		}
+
+		if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) &&
+		    test_bit(OP_FREE_STATEID, sp->allow.u.longs)) {
+			dfprintk(MOUNT, "  stateid mode enabled\n");
+			set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags);
+		}
+
+		if (test_bit(OP_WRITE, sp->allow.u.longs)) {
+			dfprintk(MOUNT, "  write mode enabled\n");
+			set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags);
+		}
+
+		if (test_bit(OP_COMMIT, sp->allow.u.longs)) {
+			dfprintk(MOUNT, "  commit mode enabled\n");
+			set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * _nfs4_proc_exchange_id()
+ *
+ * Wrapper for EXCHANGE_ID operation.
+ */
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+	u32 sp4_how)
+{
+	nfs4_verifier verifier;
+	struct nfs41_exchange_id_args args = {
+		.verifier = &verifier,
+		.client = clp,
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			 EXCHGID4_FLAG_BIND_PRINC_STATEID |
+			 EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+#else
+		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			 EXCHGID4_FLAG_BIND_PRINC_STATEID,
+#endif
+	};
+	struct nfs41_exchange_id_res res = {
+		0
+	};
+	int status;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+
+	nfs4_init_boot_verifier(clp, &verifier);
+	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
+							sizeof(args.id));
+	dprintk("NFS call  exchange_id auth=%s, '%.*s'\n",
+		clp->cl_rpcclient->cl_auth->au_ops->au_name,
+		args.id_len, args.id);
+
+	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+					GFP_NOFS);
+	if (unlikely(res.server_owner == NULL)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+					GFP_NOFS);
+	if (unlikely(res.server_scope == NULL)) {
+		status = -ENOMEM;
+		goto out_server_owner;
+	}
+
+	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+	if (unlikely(res.impl_id == NULL)) {
+		status = -ENOMEM;
+		goto out_server_scope;
+	}
+
+	switch (sp4_how) {
+	case SP4_NONE:
+		args.state_protect.how = SP4_NONE;
+		break;
+
+	case SP4_MACH_CRED:
+		args.state_protect = nfs4_sp4_mach_cred_request;
+		break;
+
+	default:
+		/* unsupported! */
+		WARN_ON_ONCE(1);
+		status = -EINVAL;
+		goto out_server_scope;
+	}
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	trace_nfs4_exchange_id(clp, status);
+	if (status == 0)
+		status = nfs4_check_cl_exchange_flags(res.flags);
+
+	if (status == 0)
+		status = nfs4_sp4_select_mode(clp, &res.state_protect);
+
+	if (status == 0) {
+		clp->cl_clientid = res.clientid;
+		clp->cl_exchange_flags = res.flags;
+		/* Client ID is not confirmed */
+		if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+			clear_bit(NFS4_SESSION_ESTABLISHED,
+					&clp->cl_session->session_state);
+			clp->cl_seqid = res.seqid;
+		}
+
+		kfree(clp->cl_serverowner);
+		clp->cl_serverowner = res.server_owner;
+		res.server_owner = NULL;
+
+		/* use the most recent implementation id */
+		kfree(clp->cl_implid);
+		clp->cl_implid = res.impl_id;
+
+		if (clp->cl_serverscope != NULL &&
+		    !nfs41_same_server_scope(clp->cl_serverscope,
+					     res.server_scope)) {
+			dprintk("%s: server_scope mismatch detected\n",
+				__func__);
+			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+			kfree(clp->cl_serverscope);
+			clp->cl_serverscope = NULL;
+		}
+
+		if (clp->cl_serverscope == NULL) {
+			clp->cl_serverscope = res.server_scope;
+			goto out;
+		}
+	} else
+		kfree(res.impl_id);
+
+out_server_owner:
+	kfree(res.server_owner);
+out_server_scope:
+	kfree(res.server_scope);
+out:
+	if (clp->cl_implid != NULL)
+		dprintk("NFS reply exchange_id: Server Implementation ID: "
+			"domain: %s, name: %s, date: %llu,%u\n",
+			clp->cl_implid->domain, clp->cl_implid->name,
+			clp->cl_implid->date.seconds,
+			clp->cl_implid->date.nseconds);
+	dprintk("NFS reply exchange_id: %d\n", status);
+	return status;
+}
+
+/*
+ * nfs4_proc_exchange_id()
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ *
+ * Since the clientid has expired, all compounds using sessions
+ * associated with the stale clientid will be returning
+ * NFS4ERR_BADSESSION in the sequence operation, and will therefore
+ * be in some phase of session reset.
+ *
+ * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used.
+ */
+int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor;
+	int status;
+
+	/* try SP4_MACH_CRED if krb5i/p	*/
+	if (authflavor == RPC_AUTH_GSS_KRB5I ||
+	    authflavor == RPC_AUTH_GSS_KRB5P) {
+		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
+		if (!status)
+			return 0;
+	}
+
+	/* try SP4_NONE */
+	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+}
+
+static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
+		struct rpc_cred *cred)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID],
+		.rpc_argp = clp,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	trace_nfs4_destroy_clientid(clp, status);
+	if (status)
+		dprintk("NFS: Got error %d from the server %s on "
+			"DESTROY_CLIENTID.", status, clp->cl_hostname);
+	return status;
+}
+
+static int nfs4_proc_destroy_clientid(struct nfs_client *clp,
+		struct rpc_cred *cred)
+{
+	unsigned int loop;
+	int ret;
+
+	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
+		ret = _nfs4_proc_destroy_clientid(clp, cred);
+		switch (ret) {
+		case -NFS4ERR_DELAY:
+		case -NFS4ERR_CLIENTID_BUSY:
+			ssleep(1);
+			break;
+		default:
+			return ret;
+		}
+	}
+	return 0;
+}
+
+int nfs4_destroy_clientid(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int ret = 0;
+
+	if (clp->cl_mvops->minor_version < 1)
+		goto out;
+	if (clp->cl_exchange_flags == 0)
+		goto out;
+	if (clp->cl_preserve_clid)
+		goto out;
+	cred = nfs4_get_clid_cred(clp);
+	ret = nfs4_proc_destroy_clientid(clp, cred);
+	if (cred)
+		put_rpccred(cred);
+	switch (ret) {
+	case 0:
+	case -NFS4ERR_STALE_CLIENTID:
+		clp->cl_exchange_flags = 0;
+	}
+out:
+	return ret;
+}
+
+struct nfs4_get_lease_time_data {
+	struct nfs4_get_lease_time_args *args;
+	struct nfs4_get_lease_time_res *res;
+	struct nfs_client *clp;
+};
+
+static void nfs4_get_lease_time_prepare(struct rpc_task *task,
+					void *calldata)
+{
+	struct nfs4_get_lease_time_data *data =
+			(struct nfs4_get_lease_time_data *)calldata;
+
+	dprintk("--> %s\n", __func__);
+	/* just setup sequence, do not trigger session recovery
+	   since we're invoked within one */
+	nfs41_setup_sequence(data->clp->cl_session,
+			&data->args->la_seq_args,
+			&data->res->lr_seq_res,
+			task);
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * Called from nfs4_state_manager thread for session setup, so don't recover
+ * from sequence operation or clientid errors.
+ */
+static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_get_lease_time_data *data =
+			(struct nfs4_get_lease_time_data *)calldata;
+
+	dprintk("--> %s\n", __func__);
+	if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
+		return;
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+		dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
+		rpc_delay(task, NFS4_POLL_RETRY_MIN);
+		task->tk_status = 0;
+		/* fall through */
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		rpc_restart_call_prepare(task);
+		return;
+	}
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_get_lease_time_ops = {
+	.rpc_call_prepare = nfs4_get_lease_time_prepare,
+	.rpc_call_done = nfs4_get_lease_time_done,
+};
+
+int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
+{
+	struct rpc_task *task;
+	struct nfs4_get_lease_time_args args;
+	struct nfs4_get_lease_time_res res = {
+		.lr_fsinfo = fsinfo,
+	};
+	struct nfs4_get_lease_time_data data = {
+		.args = &args,
+		.res = &res,
+		.clp = clp,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_get_lease_time_ops,
+		.callback_data = &data,
+		.flags = RPC_TASK_TIMEOUT,
+	};
+	int status;
+
+	nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
+	nfs4_set_sequence_privileged(&args.la_seq_args);
+	dprintk("--> %s\n", __func__);
+	task = rpc_run_task(&task_setup);
+
+	if (IS_ERR(task))
+		status = PTR_ERR(task);
+	else {
+		status = task->tk_status;
+		rpc_put_task(task);
+	}
+	dprintk("<-- %s return %d\n", __func__, status);
+
+	return status;
+}
+
+/*
+ * Initialize the values to be used by the client in CREATE_SESSION
+ * If nfs4_init_session set the fore channel request and response sizes,
+ * use them.
+ *
+ * Set the back channel max_resp_sz_cached to zero to force the client to
+ * always set csa_cachethis to FALSE because the current implementation
+ * of the back channel DRC only supports caching the CB_SEQUENCE operation.
+ */
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+{
+	unsigned int max_rqst_sz, max_resp_sz;
+
+	max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
+	max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
+
+	/* Fore channel attributes */
+	args->fc_attrs.max_rqst_sz = max_rqst_sz;
+	args->fc_attrs.max_resp_sz = max_resp_sz;
+	args->fc_attrs.max_ops = NFS4_MAX_OPS;
+	args->fc_attrs.max_reqs = max_session_slots;
+
+	dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
+		"max_ops=%u max_reqs=%u\n",
+		__func__,
+		args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
+		args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
+
+	/* Back channel attributes */
+	args->bc_attrs.max_rqst_sz = PAGE_SIZE;
+	args->bc_attrs.max_resp_sz = PAGE_SIZE;
+	args->bc_attrs.max_resp_sz_cached = 0;
+	args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
+	args->bc_attrs.max_reqs = 1;
+
+	dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
+		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+		__func__,
+		args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz,
+		args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops,
+		args->bc_attrs.max_reqs);
+}
+
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
+{
+	struct nfs4_channel_attrs *sent = &args->fc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->fc_attrs;
+
+	if (rcvd->max_resp_sz > sent->max_resp_sz)
+		return -EINVAL;
+	/*
+	 * Our requested max_ops is the minimum we need; we're not
+	 * prepared to break up compounds into smaller pieces than that.
+	 * So, no point even trying to continue if the server won't
+	 * cooperate:
+	 */
+	if (rcvd->max_ops < sent->max_ops)
+		return -EINVAL;
+	if (rcvd->max_reqs == 0)
+		return -EINVAL;
+	if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE)
+		rcvd->max_reqs = NFS4_MAX_SLOT_TABLE;
+	return 0;
+}
+
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
+{
+	struct nfs4_channel_attrs *sent = &args->bc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->bc_attrs;
+
+	if (!(res->flags & SESSION4_BACK_CHAN))
+		goto out;
+	if (rcvd->max_rqst_sz > sent->max_rqst_sz)
+		return -EINVAL;
+	if (rcvd->max_resp_sz < sent->max_resp_sz)
+		return -EINVAL;
+	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
+		return -EINVAL;
+	/* These would render the backchannel useless: */
+	if (rcvd->max_ops != sent->max_ops)
+		return -EINVAL;
+	if (rcvd->max_reqs != sent->max_reqs)
+		return -EINVAL;
+out:
+	return 0;
+}
+
+static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
+				     struct nfs41_create_session_res *res)
+{
+	int ret;
+
+	ret = nfs4_verify_fore_channel_attrs(args, res);
+	if (ret)
+		return ret;
+	return nfs4_verify_back_channel_attrs(args, res);
+}
+
+static void nfs4_update_session(struct nfs4_session *session,
+		struct nfs41_create_session_res *res)
+{
+	nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+	/* Mark client id and session as being confirmed */
+	session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+	set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state);
+	session->flags = res->flags;
+	memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
+	if (res->flags & SESSION4_BACK_CHAN)
+		memcpy(&session->bc_attrs, &res->bc_attrs,
+				sizeof(session->bc_attrs));
+}
+
+static int _nfs4_proc_create_session(struct nfs_client *clp,
+		struct rpc_cred *cred)
+{
+	struct nfs4_session *session = clp->cl_session;
+	struct nfs41_create_session_args args = {
+		.client = clp,
+		.clientid = clp->cl_clientid,
+		.seqid = clp->cl_seqid,
+		.cb_program = NFS4_CALLBACK,
+	};
+	struct nfs41_create_session_res res;
+
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	nfs4_init_channel_attrs(&args);
+	args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
+
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	trace_nfs4_create_session(clp, status);
+
+	if (!status) {
+		/* Verify the session's negotiated channel_attrs values */
+		status = nfs4_verify_channel_attrs(&args, &res);
+		/* Increment the clientid slot sequence id */
+		if (clp->cl_seqid == res.seqid)
+			clp->cl_seqid++;
+		if (status)
+			goto out;
+		nfs4_update_session(session, &res);
+	}
+out:
+	return status;
+}
+
+/*
+ * Issues a CREATE_SESSION operation to the server.
+ * It is the responsibility of the caller to verify the session is
+ * expired before calling this routine.
+ */
+int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	int status;
+	unsigned *ptr;
+	struct nfs4_session *session = clp->cl_session;
+
+	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
+
+	status = _nfs4_proc_create_session(clp, cred);
+	if (status)
+		goto out;
+
+	/* Init or reset the session slot tables */
+	status = nfs4_setup_session_slot_tables(session);
+	dprintk("slot table setup returned %d\n", status);
+	if (status)
+		goto out;
+
+	ptr = (unsigned *)&session->sess_id.data[0];
+	dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
+		clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
+out:
+	dprintk("<-- %s\n", __func__);
+	return status;
+}
+
+/*
+ * Issue the over-the-wire RPC DESTROY_SESSION.
+ * The caller must serialize access to this routine.
+ */
+int nfs4_proc_destroy_session(struct nfs4_session *session,
+		struct rpc_cred *cred)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION],
+		.rpc_argp = session,
+		.rpc_cred = cred,
+	};
+	int status = 0;
+
+	dprintk("--> nfs4_proc_destroy_session\n");
+
+	/* session is still being setup */
+	if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
+		return 0;
+
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	trace_nfs4_destroy_session(session->clp, status);
+
+	if (status)
+		dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
+			"Session has been destroyed regardless...\n", status);
+
+	dprintk("<-- nfs4_proc_destroy_session\n");
+	return status;
+}
+
+/*
+ * Renew the cl_session lease.
+ */
+struct nfs4_sequence_data {
+	struct nfs_client *clp;
+	struct nfs4_sequence_args args;
+	struct nfs4_sequence_res res;
+};
+
+static void nfs41_sequence_release(void *data)
+{
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
+
+	if (atomic_read(&clp->cl_count) > 1)
+		nfs4_schedule_state_renewal(clp);
+	nfs_put_client(clp);
+	kfree(calldata);
+}
+
+static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	default:
+		nfs4_schedule_lease_recovery(clp);
+	}
+	return 0;
+}
+
+static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
+
+	if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
+		return;
+
+	trace_nfs4_sequence(clp, task->tk_status);
+	if (task->tk_status < 0) {
+		dprintk("%s ERROR %d\n", __func__, task->tk_status);
+		if (atomic_read(&clp->cl_count) == 1)
+			goto out;
+
+		if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
+			return;
+		}
+	}
+	dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
+out:
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
+	struct nfs4_sequence_args *args;
+	struct nfs4_sequence_res *res;
+
+	args = task->tk_msg.rpc_argp;
+	res = task->tk_msg.rpc_resp;
+
+	nfs41_setup_sequence(clp->cl_session, args, res, task);
+}
+
+static const struct rpc_call_ops nfs41_sequence_ops = {
+	.rpc_call_done = nfs41_sequence_call_done,
+	.rpc_call_prepare = nfs41_sequence_prepare,
+	.rpc_release = nfs41_sequence_release,
+};
+
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
+		struct rpc_cred *cred,
+		bool is_privileged)
+{
+	struct nfs4_sequence_data *calldata;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs41_sequence_ops,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+	};
+
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		return ERR_PTR(-EIO);
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+	if (calldata == NULL) {
+		nfs_put_client(clp);
+		return ERR_PTR(-ENOMEM);
+	}
+	nfs4_init_sequence(&calldata->args, &calldata->res, 0);
+	if (is_privileged)
+		nfs4_set_sequence_privileged(&calldata->args);
+	msg.rpc_argp = &calldata->args;
+	msg.rpc_resp = &calldata->res;
+	calldata->clp = clp;
+	task_setup_data.callback_data = calldata;
+
+	return rpc_run_task(&task_setup_data);
+}
+
+static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
+{
+	struct rpc_task *task;
+	int ret = 0;
+
+	if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
+		return -EAGAIN;
+	task = _nfs41_proc_sequence(clp, cred, false);
+	if (IS_ERR(task))
+		ret = PTR_ERR(task);
+	else
+		rpc_put_task_async(task);
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
+}
+
+static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct rpc_task *task;
+	int ret;
+
+	task = _nfs41_proc_sequence(clp, cred, true);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	ret = rpc_wait_for_completion_task(task);
+	if (!ret) {
+		struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
+
+		if (task->tk_status == 0)
+			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+		ret = task->tk_status;
+	}
+	rpc_put_task(task);
+out:
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
+}
+
+struct nfs4_reclaim_complete_data {
+	struct nfs_client *clp;
+	struct nfs41_reclaim_complete_args arg;
+	struct nfs41_reclaim_complete_res res;
+};
+
+static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_reclaim_complete_data *calldata = data;
+
+	nfs41_setup_sequence(calldata->clp->cl_session,
+			&calldata->arg.seq_args,
+			&calldata->res.seq_res,
+			task);
+}
+
+static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case 0:
+	case -NFS4ERR_COMPLETE_ALREADY:
+	case -NFS4ERR_WRONG_CRED: /* What to do here? */
+		break;
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		/* fall through */
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		return -EAGAIN;
+	default:
+		nfs4_schedule_lease_recovery(clp);
+	}
+	return 0;
+}
+
+static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
+{
+	struct nfs4_reclaim_complete_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
+	struct nfs4_sequence_res *res = &calldata->res.seq_res;
+
+	dprintk("--> %s\n", __func__);
+	if (!nfs41_sequence_done(task, res))
+		return;
+
+	trace_nfs4_reclaim_complete(clp, task->tk_status);
+	if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return;
+	}
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_free_reclaim_complete_data(void *data)
+{
+	struct nfs4_reclaim_complete_data *calldata = data;
+
+	kfree(calldata);
+}
+
+static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
+	.rpc_call_prepare = nfs4_reclaim_complete_prepare,
+	.rpc_call_done = nfs4_reclaim_complete_done,
+	.rpc_release = nfs4_free_reclaim_complete_data,
+};
+
+/*
+ * Issue a global reclaim complete.
+ */
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
+		struct rpc_cred *cred)
+{
+	struct nfs4_reclaim_complete_data *calldata;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_reclaim_complete_call_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status = -ENOMEM;
+
+	dprintk("--> %s\n", __func__);
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+	if (calldata == NULL)
+		goto out;
+	calldata->clp = clp;
+	calldata->arg.one_fs = 0;
+
+	nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
+	nfs4_set_sequence_privileged(&calldata->arg.seq_args);
+	msg.rpc_argp = &calldata->arg;
+	msg.rpc_resp = &calldata->res;
+	task_setup_data.callback_data = calldata;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task)) {
+		status = PTR_ERR(task);
+		goto out;
+	}
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status == 0)
+		status = task->tk_status;
+	rpc_put_task(task);
+	return 0;
+out:
+	dprintk("<-- %s status=%d\n", __func__, status);
+	return status;
+}
+
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	struct nfs4_session *session = nfs4_get_session(server);
+
+	dprintk("--> %s\n", __func__);
+	/* Note the is a race here, where a CB_LAYOUTRECALL can come in
+	 * right now covering the LAYOUTGET we are about to send.
+	 * However, that is not so catastrophic, and there seems
+	 * to be no way to prevent it completely.
+	 */
+	if (nfs41_setup_sequence(session, &lgp->args.seq_args,
+				&lgp->res.seq_res, task))
+		return;
+	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+					  NFS_I(lgp->args.inode)->layout,
+					  &lgp->args.range,
+					  lgp->args.ctx->state)) {
+		rpc_exit(task, NFS4_OK);
+	}
+}
+
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct inode *inode = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct pnfs_layout_hdr *lo;
+	struct nfs4_state *state = NULL;
+	unsigned long timeo, now, giveup;
+
+	dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
+
+	if (!nfs41_sequence_done(task, &lgp->res.seq_res))
+		goto out;
+
+	switch (task->tk_status) {
+	case 0:
+		goto out;
+	/*
+	 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
+	 * (or clients) writing to the same RAID stripe
+	 */
+	case -NFS4ERR_LAYOUTTRYLATER:
+	/*
+	 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
+	 * existing layout before getting a new one).
+	 */
+	case -NFS4ERR_RECALLCONFLICT:
+		timeo = rpc_get_timeout(task->tk_client);
+		giveup = lgp->args.timestamp + timeo;
+		now = jiffies;
+		if (time_after(giveup, now)) {
+			unsigned long delay;
+
+			/* Delay for:
+			 * - Not less then NFS4_POLL_RETRY_MIN.
+			 * - One last time a jiffie before we give up
+			 * - exponential backoff (time_now minus start_attempt)
+			 */
+			delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
+				    min((giveup - now - 1),
+					now - lgp->args.timestamp));
+
+			dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
+				__func__, delay);
+			rpc_delay(task, delay);
+			task->tk_status = 0;
+			rpc_restart_call_prepare(task);
+			goto out; /* Do not call nfs4_async_handle_error() */
+		}
+		break;
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_BAD_STATEID:
+		spin_lock(&inode->i_lock);
+		lo = NFS_I(inode)->layout;
+		if (!lo || list_empty(&lo->plh_segs)) {
+			spin_unlock(&inode->i_lock);
+			/* If the open stateid was bad, then recover it. */
+			state = lgp->args.ctx->state;
+		} else {
+			LIST_HEAD(head);
+
+			/*
+			 * Mark the bad layout state as invalid, then retry
+			 * with the current stateid.
+			 */
+			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+			spin_unlock(&inode->i_lock);
+			pnfs_free_lseg_list(&head);
+	
+			task->tk_status = 0;
+			rpc_restart_call_prepare(task);
+		}
+	}
+	if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
+		rpc_restart_call_prepare(task);
+out:
+	dprintk("<-- %s\n", __func__);
+}
+
+static size_t max_response_pages(struct nfs_server *server)
+{
+	u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	return nfs_page_array_len(0, max_resp_sz);
+}
+
+static void nfs4_free_pages(struct page **pages, size_t size)
+{
+	int i;
+
+	if (!pages)
+		return;
+
+	for (i = 0; i < size; i++) {
+		if (!pages[i])
+			break;
+		__free_page(pages[i]);
+	}
+	kfree(pages);
+}
+
+static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
+{
+	struct page **pages;
+	int i;
+
+	pages = kcalloc(size, sizeof(struct page *), gfp_flags);
+	if (!pages) {
+		dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
+		return NULL;
+	}
+
+	for (i = 0; i < size; i++) {
+		pages[i] = alloc_page(gfp_flags);
+		if (!pages[i]) {
+			dprintk("%s: failed to allocate page\n", __func__);
+			nfs4_free_pages(pages, size);
+			return NULL;
+		}
+	}
+
+	return pages;
+}
+
+static void nfs4_layoutget_release(void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct inode *inode = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	size_t max_pages = max_response_pages(server);
+
+	dprintk("--> %s\n", __func__);
+	nfs4_free_pages(lgp->args.layout.pages, max_pages);
+	pnfs_put_layout_hdr(NFS_I(inode)->layout);
+	put_nfs_open_context(lgp->args.ctx);
+	kfree(calldata);
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+	.rpc_call_prepare = nfs4_layoutget_prepare,
+	.rpc_call_done = nfs4_layoutget_done,
+	.rpc_release = nfs4_layoutget_release,
+};
+
+struct pnfs_layout_segment *
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
+{
+	struct inode *inode = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	size_t max_pages = max_response_pages(server);
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+		.rpc_argp = &lgp->args,
+		.rpc_resp = &lgp->res,
+		.rpc_cred = lgp->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutget_call_ops,
+		.callback_data = lgp,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct pnfs_layout_segment *lseg = NULL;
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+
+	/* nfs4_layoutget_release calls pnfs_put_layout_hdr */
+	pnfs_get_layout_hdr(NFS_I(inode)->layout);
+
+	lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
+	if (!lgp->args.layout.pages) {
+		nfs4_layoutget_release(lgp);
+		return ERR_PTR(-ENOMEM);
+	}
+	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
+	lgp->args.timestamp = jiffies;
+
+	lgp->res.layoutp = &lgp->args.layout;
+	lgp->res.seq_res.sr_slot = NULL;
+	nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return ERR_CAST(task);
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status == 0)
+		status = task->tk_status;
+	trace_nfs4_layoutget(lgp->args.ctx,
+			&lgp->args.range,
+			&lgp->res.range,
+			status);
+	/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
+	if (status == 0 && lgp->res.layoutp->len)
+		lseg = pnfs_layout_process(lgp);
+	rpc_put_task(task);
+	dprintk("<-- %s status=%d\n", __func__, status);
+	if (status)
+		return ERR_PTR(status);
+	return lseg;
+}
+
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+
+	dprintk("--> %s\n", __func__);
+	nfs41_setup_sequence(lrp->clp->cl_session,
+			&lrp->args.seq_args,
+			&lrp->res.seq_res,
+			task);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+	struct nfs_server *server;
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs41_sequence_done(task, &lrp->res.seq_res))
+		return;
+
+	server = NFS_SERVER(lrp->args.inode);
+	switch (task->tk_status) {
+	default:
+		task->tk_status = 0;
+	case 0:
+		break;
+	case -NFS4ERR_DELAY:
+		if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
+			break;
+		rpc_restart_call_prepare(task);
+		return;
+	}
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+	struct pnfs_layout_hdr *lo = lrp->args.layout;
+
+	dprintk("--> %s\n", __func__);
+	spin_lock(&lo->plh_inode->i_lock);
+	if (lrp->res.lrs_present)
+		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+	pnfs_clear_layoutreturn_waitbit(lo);
+	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
+	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
+	lo->plh_block_lgets--;
+	spin_unlock(&lo->plh_inode->i_lock);
+	pnfs_put_layout_hdr(lrp->args.layout);
+	nfs_iput_and_deactive(lrp->inode);
+	kfree(calldata);
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+	.rpc_call_prepare = nfs4_layoutreturn_prepare,
+	.rpc_call_done = nfs4_layoutreturn_done,
+	.rpc_release = nfs4_layoutreturn_release,
+};
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
+{
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+		.rpc_argp = &lrp->args,
+		.rpc_resp = &lrp->res,
+		.rpc_cred = lrp->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_SERVER(lrp->args.inode)->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutreturn_call_ops,
+		.callback_data = lrp,
+	};
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+	if (!sync) {
+		lrp->inode = nfs_igrab_and_active(lrp->args.inode);
+		if (!lrp->inode) {
+			nfs4_layoutreturn_release(lrp);
+			return -EAGAIN;
+		}
+		task_setup_data.flags |= RPC_TASK_ASYNC;
+	}
+	nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	if (sync)
+		status = task->tk_status;
+	trace_nfs4_layoutreturn(lrp->args.inode, status);
+	dprintk("<-- %s status=%d\n", __func__, status);
+	rpc_put_task(task);
+	return status;
+}
+
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server,
+		struct pnfs_device *pdev,
+		struct rpc_cred *cred)
+{
+	struct nfs4_getdeviceinfo_args args = {
+		.pdev = pdev,
+		.notify_types = NOTIFY_DEVICEID4_CHANGE |
+			NOTIFY_DEVICEID4_DELETE,
+	};
+	struct nfs4_getdeviceinfo_res res = {
+		.pdev = pdev,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	dprintk("--> %s\n", __func__);
+	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+	if (res.notification & ~args.notify_types)
+		dprintk("%s: unsupported notification\n", __func__);
+	if (res.notification != args.notify_types)
+		pdev->nocache = 1;
+
+	dprintk("<-- %s status=%d\n", __func__, status);
+
+	return status;
+}
+
+int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+		struct pnfs_device *pdev,
+		struct rpc_cred *cred)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = nfs4_handle_exception(server,
+					_nfs4_proc_getdeviceinfo(server, pdev, cred),
+					&exception);
+	} while (exception.retry);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
+static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutcommit_data *data = calldata;
+	struct nfs_server *server = NFS_SERVER(data->args.inode);
+	struct nfs4_session *session = nfs4_get_session(server);
+
+	nfs41_setup_sequence(session,
+			&data->args.seq_args,
+			&data->res.seq_res,
+			task);
+}
+
+static void
+nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutcommit_data *data = calldata;
+	struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+	if (!nfs41_sequence_done(task, &data->res.seq_res))
+		return;
+
+	switch (task->tk_status) { /* Just ignore these failures */
+	case -NFS4ERR_DELEG_REVOKED: /* layout was recalled */
+	case -NFS4ERR_BADIOMODE:     /* no IOMODE_RW layout for range */
+	case -NFS4ERR_BADLAYOUT:     /* no layout */
+	case -NFS4ERR_GRACE:	    /* loca_recalim always false */
+		task->tk_status = 0;
+	case 0:
+		break;
+	default:
+		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
+			return;
+		}
+	}
+}
+
+static void nfs4_layoutcommit_release(void *calldata)
+{
+	struct nfs4_layoutcommit_data *data = calldata;
+
+	pnfs_cleanup_layoutcommit(data);
+	nfs_post_op_update_inode_force_wcc(data->args.inode,
+					   data->res.fattr);
+	put_rpccred(data->cred);
+	nfs_iput_and_deactive(data->inode);
+	kfree(data);
+}
+
+static const struct rpc_call_ops nfs4_layoutcommit_ops = {
+	.rpc_call_prepare = nfs4_layoutcommit_prepare,
+	.rpc_call_done = nfs4_layoutcommit_done,
+	.rpc_release = nfs4_layoutcommit_release,
+};
+
+int
+nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = data->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.task = &data->task,
+		.rpc_client = NFS_CLIENT(data->args.inode),
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutcommit_ops,
+		.callback_data = data,
+	};
+	struct rpc_task *task;
+	int status = 0;
+
+	dprintk("NFS: %4d initiating layoutcommit call. sync %d "
+		"lbw: %llu inode %lu\n",
+		data->task.tk_pid, sync,
+		data->args.lastbytewritten,
+		data->args.inode->i_ino);
+
+	if (!sync) {
+		data->inode = nfs_igrab_and_active(data->args.inode);
+		if (data->inode == NULL) {
+			nfs4_layoutcommit_release(data);
+			return -EAGAIN;
+		}
+		task_setup_data.flags = RPC_TASK_ASYNC;
+	}
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	if (sync)
+		status = task->tk_status;
+	trace_nfs4_layoutcommit(data->args.inode, status);
+	dprintk("%s: status %d\n", __func__, status);
+	rpc_put_task(task);
+	return status;
+}
+
+/**
+ * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
+ * possible) as per RFC3530bis and RFC5661 Security Considerations sections
+ */
+static int
+_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
+		    struct nfs_fsinfo *info,
+		    struct nfs4_secinfo_flavors *flavors, bool use_integrity)
+{
+	struct nfs41_secinfo_no_name_args args = {
+		.style = SECINFO_STYLE_CURRENT_FH,
+	};
+	struct nfs4_secinfo_res res = {
+		.flavors = flavors,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct rpc_clnt *clnt = server->client;
+	struct rpc_cred *cred = NULL;
+	int status;
+
+	if (use_integrity) {
+		clnt = server->nfs_client->cl_rpcclient;
+		cred = nfs4_get_clid_cred(server->nfs_client);
+		msg.rpc_cred = cred;
+	}
+
+	dprintk("--> %s\n", __func__);
+	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
+				&res.seq_res, 0);
+	dprintk("<-- %s status=%d\n", __func__, status);
+
+	if (cred)
+		put_rpccred(cred);
+
+	return status;
+}
+
+static int
+nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
+			   struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		/* first try using integrity protection */
+		err = -NFS4ERR_WRONGSEC;
+
+		/* try to use integrity protection with machine cred */
+		if (_nfs4_is_integrity_protected(server->nfs_client))
+			err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+							  flavors, true);
+
+		/*
+		 * if unable to use integrity protection, or SECINFO with
+		 * integrity protection returns NFS4ERR_WRONGSEC (which is
+		 * disallowed by spec, but exists in deployed servers) use
+		 * the current filesystem's rpc_client and the user cred.
+		 */
+		if (err == -NFS4ERR_WRONGSEC)
+			err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+							  flavors, false);
+
+		switch (err) {
+		case 0:
+		case -NFS4ERR_WRONGSEC:
+		case -ENOTSUPP:
+			goto out;
+		default:
+			err = nfs4_handle_exception(server, err, &exception);
+		}
+	} while (exception.retry);
+out:
+	return err;
+}
+
+static int
+nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+		    struct nfs_fsinfo *info)
+{
+	int err;
+	struct page *page;
+	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
+	struct nfs4_secinfo_flavors *flavors;
+	struct nfs4_secinfo4 *secinfo;
+	int i;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	flavors = page_address(page);
+	err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
+
+	/*
+	 * Fall back on "guess and check" method if
+	 * the server doesn't support SECINFO_NO_NAME
+	 */
+	if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {
+		err = nfs4_find_root_sec(server, fhandle, info);
+		goto out_freepage;
+	}
+	if (err)
+		goto out_freepage;
+
+	for (i = 0; i < flavors->num_flavors; i++) {
+		secinfo = &flavors->flavors[i];
+
+		switch (secinfo->flavor) {
+		case RPC_AUTH_NULL:
+		case RPC_AUTH_UNIX:
+		case RPC_AUTH_GSS:
+			flavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+					&secinfo->flavor_info);
+			break;
+		default:
+			flavor = RPC_AUTH_MAXFLAVOR;
+			break;
+		}
+
+		if (!nfs_auth_info_match(&server->auth_info, flavor))
+			flavor = RPC_AUTH_MAXFLAVOR;
+
+		if (flavor != RPC_AUTH_MAXFLAVOR) {
+			err = nfs4_lookup_root_sec(server, fhandle,
+						   info, flavor);
+			if (!err)
+				break;
+		}
+	}
+
+	if (flavor == RPC_AUTH_MAXFLAVOR)
+		err = -EPERM;
+
+out_freepage:
+	put_page(page);
+	if (err == -EACCES)
+		return -EPERM;
+out:
+	return err;
+}
+
+static int _nfs41_test_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	int status;
+	struct nfs41_test_stateid_args args = {
+		.stateid = stateid,
+	};
+	struct nfs41_test_stateid_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	struct rpc_clnt *rpc_client = server->client;
+
+	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
+		&rpc_client, &msg);
+
+	dprintk("NFS call  test_stateid %p\n", stateid);
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(rpc_client, server, &msg,
+			&args.seq_args, &res.seq_res);
+	if (status != NFS_OK) {
+		dprintk("NFS reply test_stateid: failed, %d\n", status);
+		return status;
+	}
+	dprintk("NFS reply test_stateid: succeeded, %d\n", -res.status);
+	return -res.status;
+}
+
+/**
+ * nfs41_test_stateid - perform a TEST_STATEID operation
+ *
+ * @server: server / transport on which to perform the operation
+ * @stateid: state ID to test
+ * @cred: credential
+ *
+ * Returns NFS_OK if the server recognizes that "stateid" is valid.
+ * Otherwise a negative NFS4ERR value is returned if the operation
+ * failed or the state ID is not currently valid.
+ */
+static int nfs41_test_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = _nfs41_test_stateid(server, stateid, cred);
+		if (err != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+	return err;
+}
+
+struct nfs_free_stateid_data {
+	struct nfs_server *server;
+	struct nfs41_free_stateid_args args;
+	struct nfs41_free_stateid_res res;
+};
+
+static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_free_stateid_data *data = calldata;
+	nfs41_setup_sequence(nfs4_get_session(data->server),
+			&data->args.seq_args,
+			&data->res.seq_res,
+			task);
+}
+
+static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_free_stateid_data *data = calldata;
+
+	nfs41_sequence_done(task, &data->res.seq_res);
+
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+		if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
+			rpc_restart_call_prepare(task);
+	}
+}
+
+static void nfs41_free_stateid_release(void *calldata)
+{
+	kfree(calldata);
+}
+
+static const struct rpc_call_ops nfs41_free_stateid_ops = {
+	.rpc_call_prepare = nfs41_free_stateid_prepare,
+	.rpc_call_done = nfs41_free_stateid_done,
+	.rpc_release = nfs41_free_stateid_release,
+};
+
+static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred,
+		bool privileged)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs41_free_stateid_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct nfs_free_stateid_data *data;
+
+	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
+		&task_setup.rpc_client, &msg);
+
+	dprintk("NFS call  free_stateid %p\n", stateid);
+	data = kmalloc(sizeof(*data), GFP_NOFS);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+	data->server = server;
+	nfs4_stateid_copy(&data->args.stateid, stateid);
+
+	task_setup.callback_data = data;
+
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	if (privileged)
+		nfs4_set_sequence_privileged(&data->args.seq_args);
+
+	return rpc_run_task(&task_setup);
+}
+
+/**
+ * nfs41_free_stateid - perform a FREE_STATEID operation
+ *
+ * @server: server / transport on which to perform the operation
+ * @stateid: state ID to release
+ * @cred: credential
+ *
+ * Returns NFS_OK if the server freed "stateid".  Otherwise a
+ * negative NFS4ERR value is returned.
+ */
+static int nfs41_free_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	struct rpc_task *task;
+	int ret;
+
+	task = _nfs41_free_stateid(server, stateid, cred, true);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	ret = rpc_wait_for_completion_task(task);
+	if (!ret)
+		ret = task->tk_status;
+	rpc_put_task(task);
+	return ret;
+}
+
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+	struct rpc_task *task;
+	struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+	task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+	nfs4_free_lock_state(server, lsp);
+	if (IS_ERR(task))
+		return;
+	rpc_put_task(task);
+}
+
+static bool nfs41_match_stateid(const nfs4_stateid *s1,
+		const nfs4_stateid *s2)
+{
+	if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
+		return false;
+
+	if (s1->seqid == s2->seqid)
+		return true;
+	if (s1->seqid == 0 || s2->seqid == 0)
+		return true;
+
+	return false;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+static bool nfs4_match_stateid(const nfs4_stateid *s1,
+		const nfs4_stateid *s2)
+{
+	return nfs4_stateid_match(s1, s2);
+}
+
+
+static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
+	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,
+	.recover_open	= nfs4_open_reclaim,
+	.recover_lock	= nfs4_lock_reclaim,
+	.establish_clid = nfs4_init_clientid,
+	.detect_trunking = nfs40_discover_server_trunking,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
+	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,
+	.recover_open	= nfs4_open_reclaim,
+	.recover_lock	= nfs4_lock_reclaim,
+	.establish_clid = nfs41_init_clientid,
+	.reclaim_complete = nfs41_proc_reclaim_complete,
+	.detect_trunking = nfs41_discover_server_trunking,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
+static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
+	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,
+	.recover_open	= nfs40_open_expired,
+	.recover_lock	= nfs4_lock_expired,
+	.establish_clid = nfs4_init_clientid,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
+	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,
+	.recover_open	= nfs41_open_expired,
+	.recover_lock	= nfs41_lock_expired,
+	.establish_clid = nfs41_init_clientid,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
+static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
+	.sched_state_renewal = nfs4_proc_async_renew,
+	.get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
+	.renew_lease = nfs4_proc_renew,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
+	.sched_state_renewal = nfs41_proc_async_sequence,
+	.get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
+	.renew_lease = nfs4_proc_sequence,
+};
+#endif
+
+static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+	.get_locations = _nfs40_proc_get_locations,
+	.fsid_present = _nfs40_proc_fsid_present,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+	.get_locations = _nfs41_proc_get_locations,
+	.fsid_present = _nfs41_proc_fsid_present,
+};
+#endif	/* CONFIG_NFS_V4_1 */
+
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
+	.minor_version = 0,
+	.init_caps = NFS_CAP_READDIRPLUS
+		| NFS_CAP_ATOMIC_OPEN
+		| NFS_CAP_CHANGE_ATTR
+		| NFS_CAP_POSIX_LOCK,
+	.init_client = nfs40_init_client,
+	.shutdown_client = nfs40_shutdown_client,
+	.match_stateid = nfs4_match_stateid,
+	.find_root_sec = nfs4_find_root_sec,
+	.free_lock_state = nfs4_release_lockowner,
+	.alloc_seqid = nfs_alloc_seqid,
+	.call_sync_ops = &nfs40_call_sync_ops,
+	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
+	.state_renewal_ops = &nfs40_state_renewal_ops,
+	.mig_recovery_ops = &nfs40_mig_recovery_ops,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static struct nfs_seqid *
+nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
+{
+	return NULL;
+}
+
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
+	.minor_version = 1,
+	.init_caps = NFS_CAP_READDIRPLUS
+		| NFS_CAP_ATOMIC_OPEN
+		| NFS_CAP_CHANGE_ATTR
+		| NFS_CAP_POSIX_LOCK
+		| NFS_CAP_STATEID_NFSV41
+		| NFS_CAP_ATOMIC_OPEN_V1,
+	.init_client = nfs41_init_client,
+	.shutdown_client = nfs41_shutdown_client,
+	.match_stateid = nfs41_match_stateid,
+	.find_root_sec = nfs41_find_root_sec,
+	.free_lock_state = nfs41_free_lock_state,
+	.alloc_seqid = nfs_alloc_no_seqid,
+	.call_sync_ops = &nfs41_call_sync_ops,
+	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+	.state_renewal_ops = &nfs41_state_renewal_ops,
+	.mig_recovery_ops = &nfs41_mig_recovery_ops,
+};
+#endif
+
+#if defined(CONFIG_NFS_V4_2)
+static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
+	.minor_version = 2,
+	.init_caps = NFS_CAP_READDIRPLUS
+		| NFS_CAP_ATOMIC_OPEN
+		| NFS_CAP_CHANGE_ATTR
+		| NFS_CAP_POSIX_LOCK
+		| NFS_CAP_STATEID_NFSV41
+		| NFS_CAP_ATOMIC_OPEN_V1
+		| NFS_CAP_ALLOCATE
+		| NFS_CAP_DEALLOCATE
+		| NFS_CAP_SEEK,
+	.init_client = nfs41_init_client,
+	.shutdown_client = nfs41_shutdown_client,
+	.match_stateid = nfs41_match_stateid,
+	.find_root_sec = nfs41_find_root_sec,
+	.free_lock_state = nfs41_free_lock_state,
+	.call_sync_ops = &nfs41_call_sync_ops,
+	.alloc_seqid = nfs_alloc_no_seqid,
+	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+	.state_renewal_ops = &nfs41_state_renewal_ops,
+};
+#endif
+
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
+	[0] = &nfs_v4_0_minor_ops,
+#if defined(CONFIG_NFS_V4_1)
+	[1] = &nfs_v4_1_minor_ops,
+#endif
+#if defined(CONFIG_NFS_V4_2)
+	[2] = &nfs_v4_2_minor_ops,
+#endif
+};
+
+static const struct inode_operations nfs4_dir_inode_operations = {
+	.create		= nfs_create,
+	.lookup		= nfs_lookup,
+	.atomic_open	= nfs_atomic_open,
+	.link		= nfs_link,
+	.unlink		= nfs_unlink,
+	.symlink	= nfs_symlink,
+	.mkdir		= nfs_mkdir,
+	.rmdir		= nfs_rmdir,
+	.mknod		= nfs_mknod,
+	.rename		= nfs_rename,
+	.permission	= nfs_permission,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+	.getxattr	= generic_getxattr,
+	.setxattr	= generic_setxattr,
+	.listxattr	= generic_listxattr,
+	.removexattr	= generic_removexattr,
+};
+
+static const struct inode_operations nfs4_file_inode_operations = {
+	.permission	= nfs_permission,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+	.getxattr	= generic_getxattr,
+	.setxattr	= generic_setxattr,
+	.listxattr	= generic_listxattr,
+	.removexattr	= generic_removexattr,
+};
+
+const struct nfs_rpc_ops nfs_v4_clientops = {
+	.version	= 4,			/* protocol version */
+	.dentry_ops	= &nfs4_dentry_operations,
+	.dir_inode_ops	= &nfs4_dir_inode_operations,
+	.file_inode_ops	= &nfs4_file_inode_operations,
+	.file_ops	= &nfs4_file_operations,
+	.getroot	= nfs4_proc_get_root,
+	.submount	= nfs4_submount,
+	.try_mount	= nfs4_try_mount,
+	.getattr	= nfs4_proc_getattr,
+	.setattr	= nfs4_proc_setattr,
+	.lookup		= nfs4_proc_lookup,
+	.access		= nfs4_proc_access,
+	.readlink	= nfs4_proc_readlink,
+	.create		= nfs4_proc_create,
+	.remove		= nfs4_proc_remove,
+	.unlink_setup	= nfs4_proc_unlink_setup,
+	.unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
+	.unlink_done	= nfs4_proc_unlink_done,
+	.rename_setup	= nfs4_proc_rename_setup,
+	.rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
+	.rename_done	= nfs4_proc_rename_done,
+	.link		= nfs4_proc_link,
+	.symlink	= nfs4_proc_symlink,
+	.mkdir		= nfs4_proc_mkdir,
+	.rmdir		= nfs4_proc_remove,
+	.readdir	= nfs4_proc_readdir,
+	.mknod		= nfs4_proc_mknod,
+	.statfs		= nfs4_proc_statfs,
+	.fsinfo		= nfs4_proc_fsinfo,
+	.pathconf	= nfs4_proc_pathconf,
+	.set_capabilities = nfs4_server_capabilities,
+	.decode_dirent	= nfs4_decode_dirent,
+	.pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare,
+	.read_setup	= nfs4_proc_read_setup,
+	.read_done	= nfs4_read_done,
+	.write_setup	= nfs4_proc_write_setup,
+	.write_done	= nfs4_write_done,
+	.commit_setup	= nfs4_proc_commit_setup,
+	.commit_rpc_prepare = nfs4_proc_commit_rpc_prepare,
+	.commit_done	= nfs4_commit_done,
+	.lock		= nfs4_proc_lock,
+	.clear_acl_cache = nfs4_zap_acl_attr,
+	.close_context  = nfs4_close_context,
+	.open_context	= nfs4_atomic_open,
+	.have_delegation = nfs4_have_delegation,
+	.return_delegation = nfs4_inode_return_delegation,
+	.alloc_client	= nfs4_alloc_client,
+	.init_client	= nfs4_init_client,
+	.free_client	= nfs4_free_client,
+	.create_server	= nfs4_create_server,
+	.clone_server	= nfs_clone_server,
+};
+
+static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
+	.prefix	= XATTR_NAME_NFSV4_ACL,
+	.list	= nfs4_xattr_list_nfs4_acl,
+	.get	= nfs4_xattr_get_nfs4_acl,
+	.set	= nfs4_xattr_set_nfs4_acl,
+};
+
+const struct xattr_handler *nfs4_xattr_handlers[] = {
+	&nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	&nfs4_xattr_nfs4_label_handler,
+#endif
+	NULL
+};
+
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/kernel/fs/nfs/nfs4renewd.c b/kernel/fs/nfs/nfs4renewd.c
new file mode 100644
index 000000000..e1ba58c3d
--- /dev/null
+++ b/kernel/fs/nfs/nfs4renewd.c
@@ -0,0 +1,143 @@
+/*
+ *  fs/nfs/nfs4renewd.c
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Implementation of the NFSv4 "renew daemon", which wakes up periodically to
+ * send a RENEW, to keep state alive on the server.  The daemon is implemented
+ * as an rpc_task, not a real kernel thread, so it always runs in rpciod's
+ * context.  There is one renewd per nfs_server.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "delegation.h"
+
+#define NFSDBG_FACILITY		NFSDBG_STATE
+
+void
+nfs4_renew_state(struct work_struct *work)
+{
+	const struct nfs4_state_maintenance_ops *ops;
+	struct nfs_client *clp =
+		container_of(work, struct nfs_client, cl_renewd.work);
+	struct rpc_cred *cred;
+	long lease;
+	unsigned long last, now;
+	unsigned renew_flags = 0;
+
+	ops = clp->cl_mvops->state_renewal_ops;
+	dprintk("%s: start\n", __func__);
+
+	if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
+		goto out;
+
+	spin_lock(&clp->cl_lock);
+	lease = clp->cl_lease_time;
+	last = clp->cl_last_renewal;
+	now = jiffies;
+	/* Are we close to a lease timeout? */
+	if (time_after(now, last + lease/3))
+		renew_flags |= NFS4_RENEW_TIMEOUT;
+	if (nfs_delegations_present(clp))
+		renew_flags |= NFS4_RENEW_DELEGATION_CB;
+
+	if (renew_flags != 0) {
+		cred = ops->get_state_renewal_cred_locked(clp);
+		spin_unlock(&clp->cl_lock);
+		if (cred == NULL) {
+			if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
+				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+				goto out;
+			}
+			nfs_expire_all_delegations(clp);
+		} else {
+			int ret;
+
+			/* Queue an asynchronous RENEW. */
+			ret = ops->sched_state_renewal(clp, cred, renew_flags);
+			put_rpccred(cred);
+			switch (ret) {
+			default:
+				goto out_exp;
+			case -EAGAIN:
+			case -ENOMEM:
+				break;
+			}
+		}
+	} else {
+		dprintk("%s: failed to call renewd. Reason: lease not expired \n",
+				__func__);
+		spin_unlock(&clp->cl_lock);
+	}
+	nfs4_schedule_state_renewal(clp);
+out_exp:
+	nfs_expire_unreferenced_delegations(clp);
+out:
+	dprintk("%s: done\n", __func__);
+}
+
+void
+nfs4_schedule_state_renewal(struct nfs_client *clp)
+{
+	long timeout;
+
+	spin_lock(&clp->cl_lock);
+	timeout = (2 * clp->cl_lease_time) / 3 + (long)clp->cl_last_renewal
+		- (long)jiffies;
+	if (timeout < 5 * HZ)
+		timeout = 5 * HZ;
+	dprintk("%s: requeueing work. Lease period = %ld\n",
+			__func__, (timeout + HZ - 1) / HZ);
+	mod_delayed_work(system_wq, &clp->cl_renewd, timeout);
+	set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
+	spin_unlock(&clp->cl_lock);
+}
+
+void
+nfs4_kill_renewd(struct nfs_client *clp)
+{
+	cancel_delayed_work_sync(&clp->cl_renewd);
+}
+
+/*
+ * Local variables:
+ *   c-basic-offset: 8
+ * End:
+ */
diff --git a/kernel/fs/nfs/nfs4session.c b/kernel/fs/nfs/nfs4session.c
new file mode 100644
index 000000000..e23366eff
--- /dev/null
+++ b/kernel/fs/nfs/nfs4session.c
@@ -0,0 +1,565 @@
+/*
+ * fs/nfs/nfs4session.c
+ *
+ * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/module.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "callback.h"
+
+#define NFSDBG_FACILITY		NFSDBG_STATE
+
+static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
+{
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
+	spin_lock_init(&tbl->slot_tbl_lock);
+	rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
+	init_completion(&tbl->complete);
+}
+
+/*
+ * nfs4_shrink_slot_table - free retired slots from the slot table
+ */
+static void nfs4_shrink_slot_table(struct nfs4_slot_table  *tbl, u32 newsize)
+{
+	struct nfs4_slot **p;
+	if (newsize >= tbl->max_slots)
+		return;
+
+	p = &tbl->slots;
+	while (newsize--)
+		p = &(*p)->next;
+	while (*p) {
+		struct nfs4_slot *slot = *p;
+
+		*p = slot->next;
+		kfree(slot);
+		tbl->max_slots--;
+	}
+}
+
+/**
+ * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
+ * @tbl - controlling slot table
+ *
+ */
+void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
+{
+	if (nfs4_slot_tbl_draining(tbl))
+		complete(&tbl->complete);
+}
+
+/*
+ * nfs4_free_slot - free a slot and efficiently update slot table.
+ *
+ * freeing a slot is trivially done by clearing its respective bit
+ * in the bitmap.
+ * If the freed slotid equals highest_used_slotid we want to update it
+ * so that the server would be able to size down the slot table if needed,
+ * otherwise we know that the highest_used_slotid is still in use.
+ * When updating highest_used_slotid there may be "holes" in the bitmap
+ * so we need to scan down from highest_used_slotid to 0 looking for the now
+ * highest slotid in use.
+ * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
+ *
+ * Must be called while holding tbl->slot_tbl_lock
+ */
+void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
+{
+	u32 slotid = slot->slot_nr;
+
+	/* clear used bit in bitmap */
+	__clear_bit(slotid, tbl->used_slots);
+
+	/* update highest_used_slotid when it is freed */
+	if (slotid == tbl->highest_used_slotid) {
+		u32 new_max = find_last_bit(tbl->used_slots, slotid);
+		if (new_max < slotid)
+			tbl->highest_used_slotid = new_max;
+		else {
+			tbl->highest_used_slotid = NFS4_NO_SLOT;
+			nfs4_slot_tbl_drain_complete(tbl);
+		}
+	}
+	dprintk("%s: slotid %u highest_used_slotid %u\n", __func__,
+		slotid, tbl->highest_used_slotid);
+}
+
+static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table  *tbl,
+		u32 slotid, u32 seq_init, gfp_t gfp_mask)
+{
+	struct nfs4_slot *slot;
+
+	slot = kzalloc(sizeof(*slot), gfp_mask);
+	if (slot) {
+		slot->table = tbl;
+		slot->slot_nr = slotid;
+		slot->seq_nr = seq_init;
+	}
+	return slot;
+}
+
+static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table  *tbl,
+		u32 slotid, u32 seq_init, gfp_t gfp_mask)
+{
+	struct nfs4_slot **p, *slot;
+
+	p = &tbl->slots;
+	for (;;) {
+		if (*p == NULL) {
+			*p = nfs4_new_slot(tbl, tbl->max_slots,
+					seq_init, gfp_mask);
+			if (*p == NULL)
+				break;
+			tbl->max_slots++;
+		}
+		slot = *p;
+		if (slot->slot_nr == slotid)
+			return slot;
+		p = &slot->next;
+	}
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * nfs4_alloc_slot - efficiently look for a free slot
+ *
+ * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap.
+ * If found, we mark the slot as used, update the highest_used_slotid,
+ * and respectively set up the sequence operation args.
+ *
+ * Note: must be called with under the slot_tbl_lock.
+ */
+struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
+{
+	struct nfs4_slot *ret = ERR_PTR(-EBUSY);
+	u32 slotid;
+
+	dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
+		__func__, tbl->used_slots[0], tbl->highest_used_slotid,
+		tbl->max_slotid + 1);
+	slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
+	if (slotid > tbl->max_slotid)
+		goto out;
+	ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+	if (IS_ERR(ret))
+		goto out;
+	__set_bit(slotid, tbl->used_slots);
+	if (slotid > tbl->highest_used_slotid ||
+			tbl->highest_used_slotid == NFS4_NO_SLOT)
+		tbl->highest_used_slotid = slotid;
+	ret->generation = tbl->generation;
+
+out:
+	dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
+		__func__, tbl->used_slots[0], tbl->highest_used_slotid,
+		!IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
+	return ret;
+}
+
+static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl,
+		 u32 max_reqs, u32 ivalue)
+{
+	if (max_reqs <= tbl->max_slots)
+		return 0;
+	if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS)))
+		return 0;
+	return -ENOMEM;
+}
+
+static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
+		u32 server_highest_slotid,
+		u32 ivalue)
+{
+	struct nfs4_slot **p;
+
+	nfs4_shrink_slot_table(tbl, server_highest_slotid + 1);
+	p = &tbl->slots;
+	while (*p) {
+		(*p)->seq_nr = ivalue;
+		(*p)->interrupted = 0;
+		p = &(*p)->next;
+	}
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
+	tbl->target_highest_slotid = server_highest_slotid;
+	tbl->server_highest_slotid = server_highest_slotid;
+	tbl->d_target_highest_slotid = 0;
+	tbl->d2_target_highest_slotid = 0;
+	tbl->max_slotid = server_highest_slotid;
+}
+
+/*
+ * (re)Initialise a slot table
+ */
+static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
+		u32 max_reqs, u32 ivalue)
+{
+	int ret;
+
+	dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__,
+		max_reqs, tbl->max_slots);
+
+	if (max_reqs > NFS4_MAX_SLOT_TABLE)
+		max_reqs = NFS4_MAX_SLOT_TABLE;
+
+	ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue);
+	if (ret)
+		goto out;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue);
+	spin_unlock(&tbl->slot_tbl_lock);
+
+	dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__,
+		tbl, tbl->slots, tbl->max_slots);
+out:
+	dprintk("<-- %s: return %d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * nfs4_release_slot_table - release all slot table entries
+ */
+static void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+{
+	nfs4_shrink_slot_table(tbl, 0);
+}
+
+/**
+ * nfs4_shutdown_slot_table - release resources attached to a slot table
+ * @tbl: slot table to shut down
+ *
+ */
+void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl)
+{
+	nfs4_release_slot_table(tbl);
+	rpc_destroy_wait_queue(&tbl->slot_tbl_waitq);
+}
+
+/**
+ * nfs4_setup_slot_table - prepare a stand-alone slot table for use
+ * @tbl: slot table to set up
+ * @max_reqs: maximum number of requests allowed
+ * @queue: name to give RPC wait queue
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs,
+		const char *queue)
+{
+	nfs4_init_slot_table(tbl, queue);
+	return nfs4_realloc_slot_table(tbl, max_reqs, 0);
+}
+
+static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
+{
+	struct nfs4_sequence_args *args = task->tk_msg.rpc_argp;
+	struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
+	struct nfs4_slot *slot = pslot;
+	struct nfs4_slot_table *tbl = slot->table;
+
+	if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+		return false;
+	slot->generation = tbl->generation;
+	args->sa_slot = slot;
+	res->sr_timestamp = jiffies;
+	res->sr_slot = slot;
+	res->sr_status_flags = 0;
+	res->sr_status = 1;
+	return true;
+}
+
+static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *slot)
+{
+	if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot))
+		return true;
+	return false;
+}
+
+bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *slot)
+{
+	if (slot->slot_nr > tbl->max_slotid)
+		return false;
+	return __nfs41_wake_and_assign_slot(tbl, slot);
+}
+
+static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl)
+{
+	struct nfs4_slot *slot = nfs4_alloc_slot(tbl);
+	if (!IS_ERR(slot)) {
+		bool ret = __nfs41_wake_and_assign_slot(tbl, slot);
+		if (ret)
+			return ret;
+		nfs4_free_slot(tbl, slot);
+	}
+	return false;
+}
+
+void nfs41_wake_slot_table(struct nfs4_slot_table *tbl)
+{
+	for (;;) {
+		if (!nfs41_try_wake_next_slot_table_entry(tbl))
+			break;
+	}
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl,
+		u32 target_highest_slotid)
+{
+	u32 max_slotid;
+
+	max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid);
+	if (max_slotid > tbl->server_highest_slotid)
+		max_slotid = tbl->server_highest_slotid;
+	if (max_slotid > tbl->target_highest_slotid)
+		max_slotid = tbl->target_highest_slotid;
+	tbl->max_slotid = max_slotid;
+	nfs41_wake_slot_table(tbl);
+}
+
+/* Update the client's idea of target_highest_slotid */
+static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl,
+		u32 target_highest_slotid)
+{
+	if (tbl->target_highest_slotid == target_highest_slotid)
+		return;
+	tbl->target_highest_slotid = target_highest_slotid;
+	tbl->generation++;
+}
+
+void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
+		u32 target_highest_slotid)
+{
+	spin_lock(&tbl->slot_tbl_lock);
+	nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
+	tbl->d_target_highest_slotid = 0;
+	tbl->d2_target_highest_slotid = 0;
+	nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
+	spin_unlock(&tbl->slot_tbl_lock);
+}
+
+static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl,
+		u32 highest_slotid)
+{
+	if (tbl->server_highest_slotid == highest_slotid)
+		return;
+	if (tbl->highest_used_slotid > highest_slotid)
+		return;
+	/* Deallocate slots */
+	nfs4_shrink_slot_table(tbl, highest_slotid + 1);
+	tbl->server_highest_slotid = highest_slotid;
+}
+
+static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2)
+{
+	s1 -= s2;
+	if (s1 == 0)
+		return 0;
+	if (s1 < 0)
+		return (s1 - 1) >> 1;
+	return (s1 + 1) >> 1;
+}
+
+static int nfs41_sign_s32(s32 s1)
+{
+	if (s1 > 0)
+		return 1;
+	if (s1 < 0)
+		return -1;
+	return 0;
+}
+
+static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2)
+{
+	if (!s1 || !s2)
+		return true;
+	return nfs41_sign_s32(s1) == nfs41_sign_s32(s2);
+}
+
+/* Try to eliminate outliers by checking for sharp changes in the
+ * derivatives and second derivatives
+ */
+static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl,
+		u32 new_target)
+{
+	s32 d_target, d2_target;
+	bool ret = true;
+
+	d_target = nfs41_derivative_target_slotid(new_target,
+			tbl->target_highest_slotid);
+	d2_target = nfs41_derivative_target_slotid(d_target,
+			tbl->d_target_highest_slotid);
+	/* Is first derivative same sign? */
+	if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid))
+		ret = false;
+	/* Is second derivative same sign? */
+	if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid))
+		ret = false;
+	tbl->d_target_highest_slotid = d_target;
+	tbl->d2_target_highest_slotid = d2_target;
+	return ret;
+}
+
+void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *slot,
+		struct nfs4_sequence_res *res)
+{
+	spin_lock(&tbl->slot_tbl_lock);
+	if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid))
+		nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid);
+	if (tbl->generation == slot->generation)
+		nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid);
+	nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid);
+	spin_unlock(&tbl->slot_tbl_lock);
+}
+
+static void nfs4_release_session_slot_tables(struct nfs4_session *session)
+{
+	nfs4_release_slot_table(&session->fc_slot_table);
+	nfs4_release_slot_table(&session->bc_slot_table);
+}
+
+/*
+ * Initialize or reset the forechannel and backchannel tables
+ */
+int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
+{
+	struct nfs4_slot_table *tbl;
+	int status;
+
+	dprintk("--> %s\n", __func__);
+	/* Fore channel */
+	tbl = &ses->fc_slot_table;
+	tbl->session = ses;
+	status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+	if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */
+		return status;
+	/* Back channel */
+	tbl = &ses->bc_slot_table;
+	tbl->session = ses;
+	status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+	if (status && tbl->slots == NULL)
+		/* Fore and back channel share a connection so get
+		 * both slot tables or neither */
+		nfs4_release_session_slot_tables(ses);
+	return status;
+}
+
+struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
+{
+	struct nfs4_session *session;
+
+	session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
+	if (!session)
+		return NULL;
+
+	nfs4_init_slot_table(&session->fc_slot_table, "ForeChannel Slot table");
+	nfs4_init_slot_table(&session->bc_slot_table, "BackChannel Slot table");
+	session->session_state = 1<<NFS4_SESSION_INITING;
+
+	session->clp = clp;
+	return session;
+}
+
+static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+{
+	nfs4_shutdown_slot_table(&session->fc_slot_table);
+	nfs4_shutdown_slot_table(&session->bc_slot_table);
+}
+
+void nfs4_destroy_session(struct nfs4_session *session)
+{
+	struct rpc_xprt *xprt;
+	struct rpc_cred *cred;
+
+	cred = nfs4_get_clid_cred(session->clp);
+	nfs4_proc_destroy_session(session, cred);
+	if (cred)
+		put_rpccred(cred);
+
+	rcu_read_lock();
+	xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
+	rcu_read_unlock();
+	dprintk("%s Destroy backchannel for xprt %p\n",
+		__func__, xprt);
+	xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
+	nfs4_destroy_session_slot_tables(session);
+	kfree(session);
+}
+
+/*
+ * With sessions, the client is not marked ready until after a
+ * successful EXCHANGE_ID and CREATE_SESSION.
+ *
+ * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
+ * other versions of NFS can be tried.
+ */
+static int nfs41_check_session_ready(struct nfs_client *clp)
+{
+	int ret;
+	
+	if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
+		ret = nfs4_client_recover_expired_lease(clp);
+		if (ret)
+			return ret;
+	}
+	if (clp->cl_cons_state < NFS_CS_READY)
+		return -EPROTONOSUPPORT;
+	smp_rmb();
+	return 0;
+}
+
+int nfs4_init_session(struct nfs_client *clp)
+{
+	if (!nfs4_has_session(clp))
+		return 0;
+
+	clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
+	return nfs41_check_session_ready(clp);
+}
+
+int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
+{
+	struct nfs4_session *session = clp->cl_session;
+	int ret;
+
+	spin_lock(&clp->cl_lock);
+	if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
+		/*
+		 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the
+		 * DS lease to be equal to the MDS lease.
+		 */
+		clp->cl_lease_time = lease_time;
+		clp->cl_last_renewal = jiffies;
+	}
+	spin_unlock(&clp->cl_lock);
+
+	ret = nfs41_check_session_ready(clp);
+	if (ret)
+		return ret;
+	/* Test for the DS role */
+	if (!is_ds_client(clp))
+		return -ENODEV;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
+
+#endif	/* defined(CONFIG_NFS_V4_1) */
diff --git a/kernel/fs/nfs/nfs4session.h b/kernel/fs/nfs/nfs4session.h
new file mode 100644
index 000000000..e3ea2c532
--- /dev/null
+++ b/kernel/fs/nfs/nfs4session.h
@@ -0,0 +1,160 @@
+/*
+ * fs/nfs/nfs4session.h
+ *
+ * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ */
+#ifndef __LINUX_FS_NFS_NFS4SESSION_H
+#define __LINUX_FS_NFS_NFS4SESSION_H
+
+/* maximum number of slots to use */
+#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
+#define NFS4_MAX_SLOT_TABLE (1024U)
+#define NFS4_NO_SLOT ((u32)-1)
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+/* Sessions slot seqid */
+struct nfs4_slot {
+	struct nfs4_slot_table	*table;
+	struct nfs4_slot	*next;
+	unsigned long		generation;
+	u32			slot_nr;
+	u32		 	seq_nr;
+	unsigned int		interrupted : 1;
+};
+
+/* Sessions */
+enum nfs4_slot_tbl_state {
+	NFS4_SLOT_TBL_DRAINING,
+};
+
+#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
+struct nfs4_slot_table {
+	struct nfs4_session *session;		/* Parent session */
+	struct nfs4_slot *slots;		/* seqid per slot */
+	unsigned long   used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
+	spinlock_t	slot_tbl_lock;
+	struct rpc_wait_queue	slot_tbl_waitq;	/* allocators may wait here */
+	u32		max_slots;		/* # slots in table */
+	u32		max_slotid;		/* Max allowed slotid value */
+	u32		highest_used_slotid;	/* sent to server on each SEQ.
+						 * op for dynamic resizing */
+	u32		target_highest_slotid;	/* Server max_slot target */
+	u32		server_highest_slotid;	/* Server highest slotid */
+	s32		d_target_highest_slotid; /* Derivative */
+	s32		d2_target_highest_slotid; /* 2nd derivative */
+	unsigned long	generation;		/* Generation counter for
+						   target_highest_slotid */
+	struct completion complete;
+	unsigned long	slot_tbl_state;
+};
+
+/*
+ * Session related parameters
+ */
+struct nfs4_session {
+	struct nfs4_sessionid		sess_id;
+	u32				flags;
+	unsigned long			session_state;
+	u32				hash_alg;
+	u32				ssv_len;
+
+	/* The fore and back channel */
+	struct nfs4_channel_attrs	fc_attrs;
+	struct nfs4_slot_table		fc_slot_table;
+	struct nfs4_channel_attrs	bc_attrs;
+	struct nfs4_slot_table		bc_slot_table;
+	struct nfs_client		*clp;
+};
+
+enum nfs4_session_state {
+	NFS4_SESSION_INITING,
+	NFS4_SESSION_ESTABLISHED,
+};
+
+extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
+		unsigned int max_reqs, const char *queue);
+extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
+extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
+extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
+extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
+bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *slot);
+void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
+
+static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
+{
+	return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
+		u32 target_highest_slotid);
+extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *slot,
+		struct nfs4_sequence_res *res);
+
+extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
+
+extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
+extern void nfs4_destroy_session(struct nfs4_session *session);
+extern int nfs4_init_session(struct nfs_client *clp);
+extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
+
+/*
+ * Determine if sessions are in use.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+	if (clp->cl_session)
+		return 1;
+	return 0;
+}
+
+static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
+{
+	if (nfs4_has_session(clp))
+		return (clp->cl_session->flags & SESSION4_PERSIST);
+	return 0;
+}
+
+static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
+		const struct nfs4_sessionid *src)
+{
+	memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
+}
+
+#ifdef CONFIG_CRC32
+/*
+ * nfs_session_id_hash - calculate the crc32 hash for the session id
+ * @session - pointer to session
+ */
+#define nfs_session_id_hash(sess_id) \
+	(~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
+#else
+#define nfs_session_id_hash(session) (0)
+#endif
+#else /* defined(CONFIG_NFS_V4_1) */
+
+static inline int nfs4_init_session(struct nfs_client *clp)
+{
+	return 0;
+}
+
+/*
+ * Determine if sessions are in use.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+	return 0;
+}
+
+static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
+{
+	return 0;
+}
+
+#endif /* defined(CONFIG_NFS_V4_1) */
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+#endif /* __LINUX_FS_NFS_NFS4SESSION_H */
diff --git a/kernel/fs/nfs/nfs4state.c b/kernel/fs/nfs/nfs4state.c
new file mode 100644
index 000000000..2782cfca2
--- /dev/null
+++ b/kernel/fs/nfs/nfs4state.c
@@ -0,0 +1,2455 @@
+/*
+ *  fs/nfs/nfs4state.c
+ *
+ *  Client-side XDR for NFSv4.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Implementation of the NFSv4 state model.  For the time being,
+ * this is minimal, but will be made much more complex in a
+ * subsequent patch.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/ratelimit.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/jiffies.h>
+
+#include <linux/sunrpc/clnt.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY		NFSDBG_STATE
+
+#define OPENOWNER_POOL_SIZE	8
+
+const nfs4_stateid zero_stateid;
+static DEFINE_MUTEX(nfs_clid_init_mutex);
+
+int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct nfs4_setclientid_res clid = {
+		.clientid = clp->cl_clientid,
+		.confirm = clp->cl_confirm,
+	};
+	unsigned short port;
+	int status;
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+	if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
+		goto do_confirm;
+	port = nn->nfs_callback_tcpport;
+	if (clp->cl_addr.ss_family == AF_INET6)
+		port = nn->nfs_callback_tcpport6;
+
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+	if (status != 0)
+		goto out;
+	clp->cl_clientid = clid.clientid;
+	clp->cl_confirm = clid.confirm;
+	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+do_confirm:
+	status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
+	if (status != 0)
+		goto out;
+	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+	nfs4_schedule_state_renewal(clp);
+out:
+	return status;
+}
+
+/**
+ * nfs40_discover_server_trunking - Detect server IP address trunking (mv0)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs40_discover_server_trunking(struct nfs_client *clp,
+				   struct nfs_client **result,
+				   struct rpc_cred *cred)
+{
+	struct nfs4_setclientid_res clid = {
+		.clientid = clp->cl_clientid,
+		.confirm = clp->cl_confirm,
+	};
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+	unsigned short port;
+	int status;
+
+	port = nn->nfs_callback_tcpport;
+	if (clp->cl_addr.ss_family == AF_INET6)
+		port = nn->nfs_callback_tcpport6;
+
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+	if (status != 0)
+		goto out;
+	clp->cl_clientid = clid.clientid;
+	clp->cl_confirm = clid.confirm;
+
+	status = nfs40_walk_client_list(clp, result, cred);
+	if (status == 0) {
+		/* Sustain the lease, even if it's empty.  If the clientid4
+		 * goes stale it's of no use for trunking discovery. */
+		nfs4_schedule_state_renewal(*result);
+	}
+out:
+	return status;
+}
+
+struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
+{
+	struct rpc_cred *cred = NULL;
+
+	if (clp->cl_machine_cred != NULL)
+		cred = get_rpccred(clp->cl_machine_cred);
+	return cred;
+}
+
+static void nfs4_root_machine_cred(struct nfs_client *clp)
+{
+	struct rpc_cred *cred, *new;
+
+	new = rpc_lookup_machine_cred(NULL);
+	spin_lock(&clp->cl_lock);
+	cred = clp->cl_machine_cred;
+	clp->cl_machine_cred = new;
+	spin_unlock(&clp->cl_lock);
+	if (cred != NULL)
+		put_rpccred(cred);
+}
+
+static struct rpc_cred *
+nfs4_get_renew_cred_server_locked(struct nfs_server *server)
+{
+	struct rpc_cred *cred = NULL;
+	struct nfs4_state_owner *sp;
+	struct rb_node *pos;
+
+	for (pos = rb_first(&server->state_owners);
+	     pos != NULL;
+	     pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+		if (list_empty(&sp->so_states))
+			continue;
+		cred = get_rpccred(sp->so_cred);
+		break;
+	}
+	return cred;
+}
+
+/**
+ * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ * Caller must hold clp->cl_lock.
+ */
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+{
+	struct rpc_cred *cred = NULL;
+	struct nfs_server *server;
+
+	/* Use machine credentials if available */
+	cred = nfs4_get_machine_cred_locked(clp);
+	if (cred != NULL)
+		goto out;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		cred = nfs4_get_renew_cred_server_locked(server);
+		if (cred != NULL)
+			break;
+	}
+	rcu_read_unlock();
+
+out:
+	return cred;
+}
+
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
+{
+	if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
+		spin_lock(&tbl->slot_tbl_lock);
+		nfs41_wake_slot_table(tbl);
+		spin_unlock(&tbl->slot_tbl_lock);
+	}
+}
+
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+	struct nfs4_session *ses = clp->cl_session;
+
+	if (clp->cl_slot_tbl) {
+		nfs4_end_drain_slot_table(clp->cl_slot_tbl);
+		return;
+	}
+
+	if (ses != NULL) {
+		nfs4_end_drain_slot_table(&ses->bc_slot_table);
+		nfs4_end_drain_slot_table(&ses->fc_slot_table);
+	}
+}
+
+static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
+{
+	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
+	spin_lock(&tbl->slot_tbl_lock);
+	if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
+		reinit_completion(&tbl->complete);
+		spin_unlock(&tbl->slot_tbl_lock);
+		return wait_for_completion_interruptible(&tbl->complete);
+	}
+	spin_unlock(&tbl->slot_tbl_lock);
+	return 0;
+}
+
+static int nfs4_begin_drain_session(struct nfs_client *clp)
+{
+	struct nfs4_session *ses = clp->cl_session;
+	int ret = 0;
+
+	if (clp->cl_slot_tbl)
+		return nfs4_drain_slot_tbl(clp->cl_slot_tbl);
+
+	/* back channel */
+	ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
+	if (ret)
+		return ret;
+	/* fore channel */
+	return nfs4_drain_slot_tbl(&ses->fc_slot_table);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static int nfs41_setup_state_renewal(struct nfs_client *clp)
+{
+	int status;
+	struct nfs_fsinfo fsinfo;
+
+	if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+		nfs4_schedule_state_renewal(clp);
+		return 0;
+	}
+
+	status = nfs4_proc_get_lease_time(clp, &fsinfo);
+	if (status == 0) {
+		/* Update lease time and schedule renewal */
+		spin_lock(&clp->cl_lock);
+		clp->cl_lease_time = fsinfo.lease_time * HZ;
+		clp->cl_last_renewal = jiffies;
+		spin_unlock(&clp->cl_lock);
+
+		nfs4_schedule_state_renewal(clp);
+	}
+
+	return status;
+}
+
+static void nfs41_finish_session_reset(struct nfs_client *clp)
+{
+	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+	/* create_session negotiated new slot table */
+	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	nfs41_setup_state_renewal(clp);
+}
+
+int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	int status;
+
+	if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
+		goto do_confirm;
+	nfs4_begin_drain_session(clp);
+	status = nfs4_proc_exchange_id(clp, cred);
+	if (status != 0)
+		goto out;
+	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+do_confirm:
+	status = nfs4_proc_create_session(clp, cred);
+	if (status != 0)
+		goto out;
+	nfs41_finish_session_reset(clp);
+	nfs_mark_client_ready(clp, NFS_CS_READY);
+out:
+	return status;
+}
+
+/**
+ * nfs41_discover_server_trunking - Detect server IP address trunking (mv1)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status.
+ * If NFS4_OK is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs41_discover_server_trunking(struct nfs_client *clp,
+				   struct nfs_client **result,
+				   struct rpc_cred *cred)
+{
+	int status;
+
+	status = nfs4_proc_exchange_id(clp, cred);
+	if (status != NFS4_OK)
+		return status;
+
+	status = nfs41_walk_client_list(clp, result, cred);
+	if (status < 0)
+		return status;
+	if (clp != *result)
+		return 0;
+
+	/* Purge state if the client id was established in a prior instance */
+	if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R)
+		set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+	else
+		set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+	status = nfs_wait_client_init_complete(clp);
+	if (status < 0)
+		nfs_put_client(clp);
+	return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_get_clid_cred - Acquire credential for a setclientid operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ */
+struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+
+	spin_lock(&clp->cl_lock);
+	cred = nfs4_get_machine_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	return cred;
+}
+
+static struct nfs4_state_owner *
+nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
+{
+	struct rb_node **p = &server->state_owners.rb_node,
+		       *parent = NULL;
+	struct nfs4_state_owner *sp;
+
+	while (*p != NULL) {
+		parent = *p;
+		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+
+		if (cred < sp->so_cred)
+			p = &parent->rb_left;
+		else if (cred > sp->so_cred)
+			p = &parent->rb_right;
+		else {
+			if (!list_empty(&sp->so_lru))
+				list_del_init(&sp->so_lru);
+			atomic_inc(&sp->so_count);
+			return sp;
+		}
+	}
+	return NULL;
+}
+
+static struct nfs4_state_owner *
+nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
+{
+	struct nfs_server *server = new->so_server;
+	struct rb_node **p = &server->state_owners.rb_node,
+		       *parent = NULL;
+	struct nfs4_state_owner *sp;
+	int err;
+
+	while (*p != NULL) {
+		parent = *p;
+		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+
+		if (new->so_cred < sp->so_cred)
+			p = &parent->rb_left;
+		else if (new->so_cred > sp->so_cred)
+			p = &parent->rb_right;
+		else {
+			if (!list_empty(&sp->so_lru))
+				list_del_init(&sp->so_lru);
+			atomic_inc(&sp->so_count);
+			return sp;
+		}
+	}
+	err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id);
+	if (err)
+		return ERR_PTR(err);
+	rb_link_node(&new->so_server_node, parent, p);
+	rb_insert_color(&new->so_server_node, &server->state_owners);
+	return new;
+}
+
+static void
+nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
+{
+	struct nfs_server *server = sp->so_server;
+
+	if (!RB_EMPTY_NODE(&sp->so_server_node))
+		rb_erase(&sp->so_server_node, &server->state_owners);
+	ida_remove(&server->openowner_id, sp->so_seqid.owner_id);
+}
+
+static void
+nfs4_init_seqid_counter(struct nfs_seqid_counter *sc)
+{
+	sc->create_time = ktime_get();
+	sc->flags = 0;
+	sc->counter = 0;
+	spin_lock_init(&sc->lock);
+	INIT_LIST_HEAD(&sc->list);
+	rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue");
+}
+
+static void
+nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc)
+{
+	rpc_destroy_wait_queue(&sc->wait);
+}
+
+/*
+ * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to
+ * create a new state_owner.
+ *
+ */
+static struct nfs4_state_owner *
+nfs4_alloc_state_owner(struct nfs_server *server,
+		struct rpc_cred *cred,
+		gfp_t gfp_flags)
+{
+	struct nfs4_state_owner *sp;
+
+	sp = kzalloc(sizeof(*sp), gfp_flags);
+	if (!sp)
+		return NULL;
+	sp->so_server = server;
+	sp->so_cred = get_rpccred(cred);
+	spin_lock_init(&sp->so_lock);
+	INIT_LIST_HEAD(&sp->so_states);
+	nfs4_init_seqid_counter(&sp->so_seqid);
+	atomic_set(&sp->so_count, 1);
+	INIT_LIST_HEAD(&sp->so_lru);
+	seqcount_init(&sp->so_reclaim_seqcount);
+	mutex_init(&sp->so_delegreturn_mutex);
+	return sp;
+}
+
+static void
+nfs4_drop_state_owner(struct nfs4_state_owner *sp)
+{
+	struct rb_node *rb_node = &sp->so_server_node;
+
+	if (!RB_EMPTY_NODE(rb_node)) {
+		struct nfs_server *server = sp->so_server;
+		struct nfs_client *clp = server->nfs_client;
+
+		spin_lock(&clp->cl_lock);
+		if (!RB_EMPTY_NODE(rb_node)) {
+			rb_erase(rb_node, &server->state_owners);
+			RB_CLEAR_NODE(rb_node);
+		}
+		spin_unlock(&clp->cl_lock);
+	}
+}
+
+static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
+{
+	nfs4_destroy_seqid_counter(&sp->so_seqid);
+	put_rpccred(sp->so_cred);
+	kfree(sp);
+}
+
+static void nfs4_gc_state_owners(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state_owner *sp, *tmp;
+	unsigned long time_min, time_max;
+	LIST_HEAD(doomed);
+
+	spin_lock(&clp->cl_lock);
+	time_max = jiffies;
+	time_min = (long)time_max - (long)clp->cl_lease_time;
+	list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+		/* NB: LRU is sorted so that oldest is at the head */
+		if (time_in_range(sp->so_expires, time_min, time_max))
+			break;
+		list_move(&sp->so_lru, &doomed);
+		nfs4_remove_state_owner_locked(sp);
+	}
+	spin_unlock(&clp->cl_lock);
+
+	list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+		list_del(&sp->so_lru);
+		nfs4_free_state_owner(sp);
+	}
+}
+
+/**
+ * nfs4_get_state_owner - Look up a state owner given a credential
+ * @server: nfs_server to search
+ * @cred: RPC credential to match
+ *
+ * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
+ */
+struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
+					      struct rpc_cred *cred,
+					      gfp_t gfp_flags)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state_owner *sp, *new;
+
+	spin_lock(&clp->cl_lock);
+	sp = nfs4_find_state_owner_locked(server, cred);
+	spin_unlock(&clp->cl_lock);
+	if (sp != NULL)
+		goto out;
+	new = nfs4_alloc_state_owner(server, cred, gfp_flags);
+	if (new == NULL)
+		goto out;
+	do {
+		if (ida_pre_get(&server->openowner_id, gfp_flags) == 0)
+			break;
+		spin_lock(&clp->cl_lock);
+		sp = nfs4_insert_state_owner_locked(new);
+		spin_unlock(&clp->cl_lock);
+	} while (sp == ERR_PTR(-EAGAIN));
+	if (sp != new)
+		nfs4_free_state_owner(new);
+out:
+	nfs4_gc_state_owners(server);
+	return sp;
+}
+
+/**
+ * nfs4_put_state_owner - Release a nfs4_state_owner
+ * @sp: state owner data to release
+ *
+ * Note that we keep released state owners on an LRU
+ * list.
+ * This caches valid state owners so that they can be
+ * reused, to avoid the OPEN_CONFIRM on minor version 0.
+ * It also pins the uniquifier of dropped state owners for
+ * a while, to ensure that those state owner names are
+ * never reused.
+ */
+void nfs4_put_state_owner(struct nfs4_state_owner *sp)
+{
+	struct nfs_server *server = sp->so_server;
+	struct nfs_client *clp = server->nfs_client;
+
+	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
+		return;
+
+	sp->so_expires = jiffies;
+	list_add_tail(&sp->so_lru, &server->state_owners_lru);
+	spin_unlock(&clp->cl_lock);
+}
+
+/**
+ * nfs4_purge_state_owners - Release all cached state owners
+ * @server: nfs_server with cached state owners to release
+ *
+ * Called at umount time.  Remaining state owners will be on
+ * the LRU with ref count of zero.
+ */
+void nfs4_purge_state_owners(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state_owner *sp, *tmp;
+	LIST_HEAD(doomed);
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+		list_move(&sp->so_lru, &doomed);
+		nfs4_remove_state_owner_locked(sp);
+	}
+	spin_unlock(&clp->cl_lock);
+
+	list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+		list_del(&sp->so_lru);
+		nfs4_free_state_owner(sp);
+	}
+}
+
+static struct nfs4_state *
+nfs4_alloc_open_state(void)
+{
+	struct nfs4_state *state;
+
+	state = kzalloc(sizeof(*state), GFP_NOFS);
+	if (!state)
+		return NULL;
+	atomic_set(&state->count, 1);
+	INIT_LIST_HEAD(&state->lock_states);
+	spin_lock_init(&state->state_lock);
+	seqlock_init(&state->seqlock);
+	return state;
+}
+
+void
+nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
+{
+	if (state->state == fmode)
+		return;
+	/* NB! List reordering - see the reclaim code for why.  */
+	if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
+		if (fmode & FMODE_WRITE)
+			list_move(&state->open_states, &state->owner->so_states);
+		else
+			list_move_tail(&state->open_states, &state->owner->so_states);
+	}
+	state->state = fmode;
+}
+
+static struct nfs4_state *
+__nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs4_state *state;
+
+	list_for_each_entry(state, &nfsi->open_states, inode_states) {
+		if (state->owner != owner)
+			continue;
+		if (!nfs4_valid_open_stateid(state))
+			continue;
+		if (atomic_inc_not_zero(&state->count))
+			return state;
+	}
+	return NULL;
+}
+
+static void
+nfs4_free_open_state(struct nfs4_state *state)
+{
+	kfree(state);
+}
+
+struct nfs4_state *
+nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
+{
+	struct nfs4_state *state, *new;
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	spin_lock(&inode->i_lock);
+	state = __nfs4_find_state_byowner(inode, owner);
+	spin_unlock(&inode->i_lock);
+	if (state)
+		goto out;
+	new = nfs4_alloc_open_state();
+	spin_lock(&owner->so_lock);
+	spin_lock(&inode->i_lock);
+	state = __nfs4_find_state_byowner(inode, owner);
+	if (state == NULL && new != NULL) {
+		state = new;
+		state->owner = owner;
+		atomic_inc(&owner->so_count);
+		list_add(&state->inode_states, &nfsi->open_states);
+		ihold(inode);
+		state->inode = inode;
+		spin_unlock(&inode->i_lock);
+		/* Note: The reclaim code dictates that we add stateless
+		 * and read-only stateids to the end of the list */
+		list_add_tail(&state->open_states, &owner->so_states);
+		spin_unlock(&owner->so_lock);
+	} else {
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&owner->so_lock);
+		if (new)
+			nfs4_free_open_state(new);
+	}
+out:
+	return state;
+}
+
+void nfs4_put_open_state(struct nfs4_state *state)
+{
+	struct inode *inode = state->inode;
+	struct nfs4_state_owner *owner = state->owner;
+
+	if (!atomic_dec_and_lock(&state->count, &owner->so_lock))
+		return;
+	spin_lock(&inode->i_lock);
+	list_del(&state->inode_states);
+	list_del(&state->open_states);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&owner->so_lock);
+	iput(inode);
+	nfs4_free_open_state(state);
+	nfs4_put_state_owner(owner);
+}
+
+/*
+ * Close the current file.
+ */
+static void __nfs4_close(struct nfs4_state *state,
+		fmode_t fmode, gfp_t gfp_mask, int wait)
+{
+	struct nfs4_state_owner *owner = state->owner;
+	int call_close = 0;
+	fmode_t newstate;
+
+	atomic_inc(&owner->so_count);
+	/* Protect against nfs4_find_state() */
+	spin_lock(&owner->so_lock);
+	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
+		case FMODE_READ:
+			state->n_rdonly--;
+			break;
+		case FMODE_WRITE:
+			state->n_wronly--;
+			break;
+		case FMODE_READ|FMODE_WRITE:
+			state->n_rdwr--;
+	}
+	newstate = FMODE_READ|FMODE_WRITE;
+	if (state->n_rdwr == 0) {
+		if (state->n_rdonly == 0) {
+			newstate &= ~FMODE_READ;
+			call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+		}
+		if (state->n_wronly == 0) {
+			newstate &= ~FMODE_WRITE;
+			call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+		}
+		if (newstate == 0)
+			clear_bit(NFS_DELEGATED_STATE, &state->flags);
+	}
+	nfs4_state_set_mode_locked(state, newstate);
+	spin_unlock(&owner->so_lock);
+
+	if (!call_close) {
+		nfs4_put_open_state(state);
+		nfs4_put_state_owner(owner);
+	} else
+		nfs4_do_close(state, gfp_mask, wait);
+}
+
+void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
+{
+	__nfs4_close(state, fmode, GFP_NOFS, 0);
+}
+
+void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
+{
+	__nfs4_close(state, fmode, GFP_KERNEL, 1);
+}
+
+/*
+ * Search the state->lock_states for an existing lock_owner
+ * that is compatible with current->files
+ */
+static struct nfs4_lock_state *
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+{
+	struct nfs4_lock_state *pos;
+	list_for_each_entry(pos, &state->lock_states, ls_locks) {
+		if (pos->ls_owner != fl_owner)
+			continue;
+		atomic_inc(&pos->ls_count);
+		return pos;
+	}
+	return NULL;
+}
+
+/*
+ * Return a compatible lock_state. If no initialized lock_state structure
+ * exists, return an uninitialized one.
+ *
+ */
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+{
+	struct nfs4_lock_state *lsp;
+	struct nfs_server *server = state->owner->so_server;
+
+	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
+	if (lsp == NULL)
+		return NULL;
+	nfs4_init_seqid_counter(&lsp->ls_seqid);
+	atomic_set(&lsp->ls_count, 1);
+	lsp->ls_state = state;
+	lsp->ls_owner = fl_owner;
+	lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
+	if (lsp->ls_seqid.owner_id < 0)
+		goto out_free;
+	INIT_LIST_HEAD(&lsp->ls_locks);
+	return lsp;
+out_free:
+	kfree(lsp);
+	return NULL;
+}
+
+void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+	ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id);
+	nfs4_destroy_seqid_counter(&lsp->ls_seqid);
+	kfree(lsp);
+}
+
+/*
+ * Return a compatible lock_state. If no initialized lock_state structure
+ * exists, return an uninitialized one.
+ *
+ */
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+{
+	struct nfs4_lock_state *lsp, *new = NULL;
+	
+	for(;;) {
+		spin_lock(&state->state_lock);
+		lsp = __nfs4_find_lock_state(state, owner);
+		if (lsp != NULL)
+			break;
+		if (new != NULL) {
+			list_add(&new->ls_locks, &state->lock_states);
+			set_bit(LK_STATE_IN_USE, &state->flags);
+			lsp = new;
+			new = NULL;
+			break;
+		}
+		spin_unlock(&state->state_lock);
+		new = nfs4_alloc_lock_state(state, owner);
+		if (new == NULL)
+			return NULL;
+	}
+	spin_unlock(&state->state_lock);
+	if (new != NULL)
+		nfs4_free_lock_state(state->owner->so_server, new);
+	return lsp;
+}
+
+/*
+ * Release reference to lock_state, and free it if we see that
+ * it is no longer in use
+ */
+void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
+{
+	struct nfs_server *server;
+	struct nfs4_state *state;
+
+	if (lsp == NULL)
+		return;
+	state = lsp->ls_state;
+	if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock))
+		return;
+	list_del(&lsp->ls_locks);
+	if (list_empty(&state->lock_states))
+		clear_bit(LK_STATE_IN_USE, &state->flags);
+	spin_unlock(&state->state_lock);
+	server = state->owner->so_server;
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+		struct nfs_client *clp = server->nfs_client;
+
+		clp->cl_mvops->free_lock_state(server, lsp);
+	} else
+		nfs4_free_lock_state(server, lsp);
+}
+
+static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
+{
+	struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner;
+
+	dst->fl_u.nfs4_fl.owner = lsp;
+	atomic_inc(&lsp->ls_count);
+}
+
+static void nfs4_fl_release_lock(struct file_lock *fl)
+{
+	nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
+}
+
+static const struct file_lock_operations nfs4_fl_lock_ops = {
+	.fl_copy_lock = nfs4_fl_copy_lock,
+	.fl_release_private = nfs4_fl_release_lock,
+};
+
+int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
+{
+	struct nfs4_lock_state *lsp;
+
+	if (fl->fl_ops != NULL)
+		return 0;
+	lsp = nfs4_get_lock_state(state, fl->fl_owner);
+	if (lsp == NULL)
+		return -ENOMEM;
+	fl->fl_u.nfs4_fl.owner = lsp;
+	fl->fl_ops = &nfs4_fl_lock_ops;
+	return 0;
+}
+
+static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
+		struct nfs4_state *state,
+		const struct nfs_lockowner *lockowner)
+{
+	struct nfs4_lock_state *lsp;
+	fl_owner_t fl_owner;
+	int ret = -ENOENT;
+
+
+	if (lockowner == NULL)
+		goto out;
+
+	if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
+		goto out;
+
+	fl_owner = lockowner->l_owner;
+	spin_lock(&state->state_lock);
+	lsp = __nfs4_find_lock_state(state, fl_owner);
+	if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+		ret = -EIO;
+	else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
+		nfs4_stateid_copy(dst, &lsp->ls_stateid);
+		ret = 0;
+	}
+	spin_unlock(&state->state_lock);
+	nfs4_put_lock_state(lsp);
+out:
+	return ret;
+}
+
+static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+{
+	const nfs4_stateid *src;
+	int seq;
+
+	do {
+		src = &zero_stateid;
+		seq = read_seqbegin(&state->seqlock);
+		if (test_bit(NFS_OPEN_STATE, &state->flags))
+			src = &state->open_stateid;
+		nfs4_stateid_copy(dst, src);
+	} while (read_seqretry(&state->seqlock, seq));
+}
+
+/*
+ * Byte-range lock aware utility to initialize the stateid of read/write
+ * requests.
+ */
+int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
+		fmode_t fmode, const struct nfs_lockowner *lockowner)
+{
+	int ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+	if (ret == -EIO)
+		/* A lost lock - don't even consider delegations */
+		goto out;
+	/* returns true if delegation stateid found and copied */
+	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) {
+		ret = 0;
+		goto out;
+	}
+	if (ret != -ENOENT)
+		/* nfs4_copy_delegation_stateid() didn't over-write
+		 * dst, so it still has the lock stateid which we now
+		 * choose to use.
+		 */
+		goto out;
+	nfs4_copy_open_stateid(dst, state);
+	ret = 0;
+out:
+	if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
+		dst->seqid = 0;
+	return ret;
+}
+
+struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
+{
+	struct nfs_seqid *new;
+
+	new = kmalloc(sizeof(*new), gfp_mask);
+	if (new == NULL)
+		return ERR_PTR(-ENOMEM);
+	new->sequence = counter;
+	INIT_LIST_HEAD(&new->list);
+	new->task = NULL;
+	return new;
+}
+
+void nfs_release_seqid(struct nfs_seqid *seqid)
+{
+	struct nfs_seqid_counter *sequence;
+
+	if (seqid == NULL || list_empty(&seqid->list))
+		return;
+	sequence = seqid->sequence;
+	spin_lock(&sequence->lock);
+	list_del_init(&seqid->list);
+	if (!list_empty(&sequence->list)) {
+		struct nfs_seqid *next;
+
+		next = list_first_entry(&sequence->list,
+				struct nfs_seqid, list);
+		rpc_wake_up_queued_task(&sequence->wait, next->task);
+	}
+	spin_unlock(&sequence->lock);
+}
+
+void nfs_free_seqid(struct nfs_seqid *seqid)
+{
+	nfs_release_seqid(seqid);
+	kfree(seqid);
+}
+
+/*
+ * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
+ * failed with a seqid incrementing error -
+ * see comments nfs4.h:seqid_mutating_error()
+ */
+static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
+{
+	switch (status) {
+		case 0:
+			break;
+		case -NFS4ERR_BAD_SEQID:
+			if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)
+				return;
+			pr_warn_ratelimited("NFS: v4 server returned a bad"
+					" sequence-id error on an"
+					" unconfirmed sequence %p!\n",
+					seqid->sequence);
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_BADXDR:
+		case -NFS4ERR_RESOURCE:
+		case -NFS4ERR_NOFILEHANDLE:
+			/* Non-seqid mutating errors */
+			return;
+	};
+	/*
+	 * Note: no locking needed as we are guaranteed to be first
+	 * on the sequence list
+	 */
+	seqid->sequence->counter++;
+}
+
+void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
+{
+	struct nfs4_state_owner *sp;
+
+	if (seqid == NULL)
+		return;
+
+	sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
+	if (status == -NFS4ERR_BAD_SEQID)
+		nfs4_drop_state_owner(sp);
+	if (!nfs4_has_session(sp->so_server->nfs_client))
+		nfs_increment_seqid(status, seqid);
+}
+
+/*
+ * Increment the seqid if the LOCK/LOCKU succeeded, or
+ * failed with a seqid incrementing error -
+ * see comments nfs4.h:seqid_mutating_error()
+ */
+void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
+{
+	if (seqid != NULL)
+		nfs_increment_seqid(status, seqid);
+}
+
+int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
+{
+	struct nfs_seqid_counter *sequence;
+	int status = 0;
+
+	if (seqid == NULL)
+		goto out;
+	sequence = seqid->sequence;
+	spin_lock(&sequence->lock);
+	seqid->task = task;
+	if (list_empty(&seqid->list))
+		list_add_tail(&seqid->list, &sequence->list);
+	if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
+		goto unlock;
+	rpc_sleep_on(&sequence->wait, task, NULL);
+	status = -EAGAIN;
+unlock:
+	spin_unlock(&sequence->lock);
+out:
+	return status;
+}
+
+static int nfs4_run_state_manager(void *);
+
+static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
+{
+	smp_mb__before_atomic();
+	clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
+	smp_mb__after_atomic();
+	wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
+	rpc_wake_up(&clp->cl_rpcwaitq);
+}
+
+/*
+ * Schedule the nfs_client asynchronous state management routine
+ */
+void nfs4_schedule_state_manager(struct nfs_client *clp)
+{
+	struct task_struct *task;
+	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+
+	if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+		return;
+	__module_get(THIS_MODULE);
+	atomic_inc(&clp->cl_count);
+
+	/* The rcu_read_lock() is not strictly necessary, as the state
+	 * manager is the only thread that ever changes the rpc_xprt
+	 * after it's initialized.  At this point, we're single threaded. */
+	rcu_read_lock();
+	snprintf(buf, sizeof(buf), "%s-manager",
+			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+	rcu_read_unlock();
+	task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
+	if (IS_ERR(task)) {
+		printk(KERN_ERR "%s: kthread_run: %ld\n",
+			__func__, PTR_ERR(task));
+		nfs4_clear_state_manager_bit(clp);
+		nfs_put_client(clp);
+		module_put(THIS_MODULE);
+	}
+}
+
+/*
+ * Schedule a lease recovery attempt
+ */
+void nfs4_schedule_lease_recovery(struct nfs_client *clp)
+{
+	if (!clp)
+		return;
+	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+	dprintk("%s: scheduling lease recovery for server %s\n", __func__,
+			clp->cl_hostname);
+	nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
+
+/**
+ * nfs4_schedule_migration_recovery - trigger migration recovery
+ *
+ * @server: FSID that is migrating
+ *
+ * Returns zero if recovery has started, otherwise a negative NFS4ERR
+ * value is returned.
+ */
+int nfs4_schedule_migration_recovery(const struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	if (server->fh_expire_type != NFS4_FH_PERSISTENT) {
+		pr_err("NFS: volatile file handles not supported (server %s)\n",
+				clp->cl_hostname);
+		return -NFS4ERR_IO;
+	}
+
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		return -NFS4ERR_IO;
+
+	dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n",
+			__func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			clp->cl_hostname);
+
+	set_bit(NFS_MIG_IN_TRANSITION,
+			&((struct nfs_server *)server)->mig_status);
+	set_bit(NFS4CLNT_MOVED, &clp->cl_state);
+
+	nfs4_schedule_state_manager(clp);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
+
+/**
+ * nfs4_schedule_lease_moved_recovery - start lease-moved recovery
+ *
+ * @clp: server to check for moved leases
+ *
+ */
+void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp)
+{
+	dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n",
+		__func__, clp->cl_clientid, clp->cl_hostname);
+
+	set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery);
+
+int nfs4_wait_clnt_recover(struct nfs_client *clp)
+{
+	int res;
+
+	might_sleep();
+
+	atomic_inc(&clp->cl_count);
+	res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
+				 nfs_wait_bit_killable, TASK_KILLABLE);
+	if (res)
+		goto out;
+	if (clp->cl_cons_state < 0)
+		res = clp->cl_cons_state;
+out:
+	nfs_put_client(clp);
+	return res;
+}
+
+int nfs4_client_recover_expired_lease(struct nfs_client *clp)
+{
+	unsigned int loop;
+	int ret;
+
+	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
+		ret = nfs4_wait_clnt_recover(clp);
+		if (ret != 0)
+			break;
+		if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
+		    !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
+			break;
+		nfs4_schedule_state_manager(clp);
+		ret = -EIO;
+	}
+	return ret;
+}
+
+/*
+ * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
+ * @clp: client to process
+ *
+ * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a
+ * resend of the SETCLIENTID and hence re-establish the
+ * callback channel. Then return all existing delegations.
+ */
+static void nfs40_handle_cb_pathdown(struct nfs_client *clp)
+{
+	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+	nfs_expire_all_delegations(clp);
+	dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__,
+			clp->cl_hostname);
+}
+
+void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
+{
+	nfs40_handle_cb_pathdown(clp);
+	nfs4_schedule_state_manager(clp);
+}
+
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+{
+
+	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+	/* Don't recover state that expired before the reboot */
+	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
+		clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+		return 0;
+	}
+	set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags);
+	set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+	return 1;
+}
+
+int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+{
+	set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+	clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+	set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
+	set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+	return 1;
+}
+
+int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	if (!nfs4_valid_open_stateid(state))
+		return -EBADF;
+	nfs4_state_mark_reclaim_nograce(clp, state);
+	dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
+			clp->cl_hostname);
+	nfs4_schedule_state_manager(clp);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
+
+void nfs_inode_find_state_and_recover(struct inode *inode,
+		const nfs4_stateid *stateid)
+{
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_open_context *ctx;
+	struct nfs4_state *state;
+	bool found = false;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(ctx, &nfsi->open_files, list) {
+		state = ctx->state;
+		if (state == NULL)
+			continue;
+		if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+			continue;
+		if (!nfs4_stateid_match(&state->stateid, stateid))
+			continue;
+		nfs4_state_mark_reclaim_nograce(clp, state);
+		found = true;
+	}
+	spin_unlock(&inode->i_lock);
+	if (found)
+		nfs4_schedule_state_manager(clp);
+}
+
+static void nfs4_state_mark_open_context_bad(struct nfs4_state *state)
+{
+	struct inode *inode = state->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_open_context *ctx;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(ctx, &nfsi->open_files, list) {
+		if (ctx->state != state)
+			continue;
+		set_bit(NFS_CONTEXT_BAD, &ctx->flags);
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
+{
+	set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags);
+	nfs4_state_mark_open_context_bad(state);
+}
+
+
+static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
+{
+	struct inode *inode = state->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct file_lock *fl;
+	int status = 0;
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct list_head *list;
+
+	if (flctx == NULL)
+		return 0;
+
+	list = &flctx->flc_posix;
+
+	/* Guard against delegation returns and new lock/unlock calls */
+	down_write(&nfsi->rwsem);
+	spin_lock(&flctx->flc_lock);
+restart:
+	list_for_each_entry(fl, list, fl_list) {
+		if (nfs_file_open_context(fl->fl_file)->state != state)
+			continue;
+		spin_unlock(&flctx->flc_lock);
+		status = ops->recover_lock(state, fl);
+		switch (status) {
+		case 0:
+			break;
+		case -ESTALE:
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_EXPIRED:
+		case -NFS4ERR_NO_GRACE:
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_BADSESSION:
+		case -NFS4ERR_BADSLOT:
+		case -NFS4ERR_BAD_HIGH_SLOT:
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+			goto out;
+		default:
+			pr_err("NFS: %s: unhandled error %d\n",
+					__func__, status);
+		case -ENOMEM:
+		case -NFS4ERR_DENIED:
+		case -NFS4ERR_RECLAIM_BAD:
+		case -NFS4ERR_RECLAIM_CONFLICT:
+			/* kill_proc(fl->fl_pid, SIGLOST, 1); */
+			status = 0;
+		}
+		spin_lock(&flctx->flc_lock);
+	}
+	if (list == &flctx->flc_posix) {
+		list = &flctx->flc_flock;
+		goto restart;
+	}
+	spin_unlock(&flctx->flc_lock);
+out:
+	up_write(&nfsi->rwsem);
+	return status;
+}
+
+static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
+{
+	struct nfs4_state *state;
+	struct nfs4_lock_state *lock;
+	int status = 0;
+
+	/* Note: we rely on the sp->so_states list being ordered 
+	 * so that we always reclaim open(O_RDWR) and/or open(O_WRITE)
+	 * states first.
+	 * This is needed to ensure that the server won't give us any
+	 * read delegations that we have to return if, say, we are
+	 * recovering after a network partition or a reboot from a
+	 * server that doesn't support a grace period.
+	 */
+	spin_lock(&sp->so_lock);
+	raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
+restart:
+	list_for_each_entry(state, &sp->so_states, open_states) {
+		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
+			continue;
+		if (!nfs4_valid_open_stateid(state))
+			continue;
+		if (state->state == 0)
+			continue;
+		atomic_inc(&state->count);
+		spin_unlock(&sp->so_lock);
+		status = ops->recover_open(sp, state);
+		if (status >= 0) {
+			status = nfs4_reclaim_locks(state, ops);
+			if (status >= 0) {
+				if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+					spin_lock(&state->state_lock);
+					list_for_each_entry(lock, &state->lock_states, ls_locks) {
+						if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
+							pr_warn_ratelimited("NFS: "
+									    "%s: Lock reclaim "
+									    "failed!\n", __func__);
+					}
+					spin_unlock(&state->state_lock);
+				}
+				nfs4_put_open_state(state);
+				spin_lock(&sp->so_lock);
+				goto restart;
+			}
+		}
+		switch (status) {
+			default:
+				printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+					__func__, status);
+			case -ENOENT:
+			case -ENOMEM:
+			case -ESTALE:
+				/* Open state on this file cannot be recovered */
+				nfs4_state_mark_recovery_failed(state, status);
+				break;
+			case -EAGAIN:
+				ssleep(1);
+			case -NFS4ERR_ADMIN_REVOKED:
+			case -NFS4ERR_STALE_STATEID:
+			case -NFS4ERR_BAD_STATEID:
+			case -NFS4ERR_RECLAIM_BAD:
+			case -NFS4ERR_RECLAIM_CONFLICT:
+				nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+				break;
+			case -NFS4ERR_EXPIRED:
+			case -NFS4ERR_NO_GRACE:
+				nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+			case -NFS4ERR_STALE_CLIENTID:
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+				goto out_err;
+		}
+		nfs4_put_open_state(state);
+		spin_lock(&sp->so_lock);
+		goto restart;
+	}
+	raw_write_seqcount_end(&sp->so_reclaim_seqcount);
+	spin_unlock(&sp->so_lock);
+	return 0;
+out_err:
+	nfs4_put_open_state(state);
+	spin_lock(&sp->so_lock);
+	raw_write_seqcount_end(&sp->so_reclaim_seqcount);
+	spin_unlock(&sp->so_lock);
+	return status;
+}
+
+static void nfs4_clear_open_state(struct nfs4_state *state)
+{
+	struct nfs4_lock_state *lock;
+
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+	clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+	clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	spin_lock(&state->state_lock);
+	list_for_each_entry(lock, &state->lock_states, ls_locks) {
+		lock->ls_seqid.flags = 0;
+		clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags);
+	}
+	spin_unlock(&state->state_lock);
+}
+
+static void nfs4_reset_seqids(struct nfs_server *server,
+	int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state_owner *sp;
+	struct rb_node *pos;
+	struct nfs4_state *state;
+
+	spin_lock(&clp->cl_lock);
+	for (pos = rb_first(&server->state_owners);
+	     pos != NULL;
+	     pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+		sp->so_seqid.flags = 0;
+		spin_lock(&sp->so_lock);
+		list_for_each_entry(state, &sp->so_states, open_states) {
+			if (mark_reclaim(clp, state))
+				nfs4_clear_open_state(state);
+		}
+		spin_unlock(&sp->so_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+}
+
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
+	int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs4_reset_seqids(server, mark_reclaim);
+	rcu_read_unlock();
+}
+
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
+{
+	/* Mark all delegations for reclaim */
+	nfs_delegation_mark_reclaim(clp);
+	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
+}
+
+static void nfs4_reclaim_complete(struct nfs_client *clp,
+				 const struct nfs4_state_recovery_ops *ops,
+				 struct rpc_cred *cred)
+{
+	/* Notify the server we're done reclaiming our state */
+	if (ops->reclaim_complete)
+		(void)ops->reclaim_complete(clp, cred);
+}
+
+static void nfs4_clear_reclaim_server(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state_owner *sp;
+	struct rb_node *pos;
+	struct nfs4_state *state;
+
+	spin_lock(&clp->cl_lock);
+	for (pos = rb_first(&server->state_owners);
+	     pos != NULL;
+	     pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+		spin_lock(&sp->so_lock);
+		list_for_each_entry(state, &sp->so_states, open_states) {
+			if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
+						&state->flags))
+				continue;
+			nfs4_state_mark_reclaim_nograce(clp, state);
+		}
+		spin_unlock(&sp->so_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+}
+
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+
+	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+		return 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs4_clear_reclaim_server(server);
+	rcu_read_unlock();
+
+	nfs_delegation_reap_unclaimed(clp);
+	return 1;
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+	const struct nfs4_state_recovery_ops *ops;
+	struct rpc_cred *cred;
+
+	if (!nfs4_state_clear_reclaim_reboot(clp))
+		return;
+	ops = clp->cl_mvops->reboot_recovery_ops;
+	cred = nfs4_get_clid_cred(clp);
+	nfs4_reclaim_complete(clp, ops, cred);
+	put_rpccred(cred);
+}
+
+static void nfs_delegation_clear_all(struct nfs_client *clp)
+{
+	nfs_delegation_mark_reclaim(clp);
+	nfs_delegation_reap_unclaimed(clp);
+}
+
+static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
+{
+	nfs_delegation_clear_all(clp);
+	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+}
+
+static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
+{
+	switch (error) {
+		case 0:
+			break;
+		case -NFS4ERR_CB_PATH_DOWN:
+			nfs40_handle_cb_pathdown(clp);
+			break;
+		case -NFS4ERR_NO_GRACE:
+			nfs4_state_end_reclaim_reboot(clp);
+			break;
+		case -NFS4ERR_STALE_CLIENTID:
+			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+			nfs4_state_clear_reclaim_reboot(clp);
+			nfs4_state_start_reclaim_reboot(clp);
+			break;
+		case -NFS4ERR_EXPIRED:
+			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+			nfs4_state_start_reclaim_nograce(clp);
+			break;
+		case -NFS4ERR_BADSESSION:
+		case -NFS4ERR_BADSLOT:
+		case -NFS4ERR_BAD_HIGH_SLOT:
+		case -NFS4ERR_DEADSESSION:
+		case -NFS4ERR_SEQ_FALSE_RETRY:
+		case -NFS4ERR_SEQ_MISORDERED:
+			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+			/* Zero session reset errors */
+			break;
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+			set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+			break;
+		default:
+			dprintk("%s: failed to handle error %d for server %s\n",
+					__func__, error, clp->cl_hostname);
+			return error;
+	}
+	dprintk("%s: handled error %d for server %s\n", __func__, error,
+			clp->cl_hostname);
+	return 0;
+}
+
+static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
+{
+	struct nfs4_state_owner *sp;
+	struct nfs_server *server;
+	struct rb_node *pos;
+	int status = 0;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		nfs4_purge_state_owners(server);
+		spin_lock(&clp->cl_lock);
+		for (pos = rb_first(&server->state_owners);
+		     pos != NULL;
+		     pos = rb_next(pos)) {
+			sp = rb_entry(pos,
+				struct nfs4_state_owner, so_server_node);
+			if (!test_and_clear_bit(ops->owner_flag_bit,
+							&sp->so_flags))
+				continue;
+			atomic_inc(&sp->so_count);
+			spin_unlock(&clp->cl_lock);
+			rcu_read_unlock();
+
+			status = nfs4_reclaim_open_state(sp, ops);
+			if (status < 0) {
+				set_bit(ops->owner_flag_bit, &sp->so_flags);
+				nfs4_put_state_owner(sp);
+				status = nfs4_recovery_handle_error(clp, status);
+				return (status != 0) ? status : -EAGAIN;
+			}
+
+			nfs4_put_state_owner(sp);
+			goto restart;
+		}
+		spin_unlock(&clp->cl_lock);
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+static int nfs4_check_lease(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	const struct nfs4_state_maintenance_ops *ops =
+		clp->cl_mvops->state_renewal_ops;
+	int status;
+
+	/* Is the client already known to have an expired lease? */
+	if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+		return 0;
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL) {
+		cred = nfs4_get_clid_cred(clp);
+		status = -ENOKEY;
+		if (cred == NULL)
+			goto out;
+	}
+	status = ops->renew_lease(clp, cred);
+	put_rpccred(cred);
+	if (status == -ETIMEDOUT) {
+		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+		return 0;
+	}
+out:
+	return nfs4_recovery_handle_error(clp, status);
+}
+
+/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors
+ * and for recoverable errors on EXCHANGE_ID for v4.1
+ */
+static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
+{
+	switch (status) {
+	case -NFS4ERR_SEQ_MISORDERED:
+		if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state))
+			return -ESERVERFAULT;
+		/* Lease confirmation error: retry after purging the lease */
+		ssleep(1);
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		break;
+	case -NFS4ERR_STALE_CLIENTID:
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		nfs4_state_start_reclaim_reboot(clp);
+		break;
+	case -NFS4ERR_CLID_INUSE:
+		pr_err("NFS: Server %s reports our clientid is in use\n",
+			clp->cl_hostname);
+		nfs_mark_client_ready(clp, -EPERM);
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		return -EPERM;
+	case -EACCES:
+	case -NFS4ERR_DELAY:
+	case -ETIMEDOUT:
+	case -EAGAIN:
+		ssleep(1);
+		break;
+
+	case -NFS4ERR_MINOR_VERS_MISMATCH:
+		if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
+			nfs_mark_client_ready(clp, -EPROTONOSUPPORT);
+		dprintk("%s: exit with error %d for server %s\n",
+				__func__, -EPROTONOSUPPORT, clp->cl_hostname);
+		return -EPROTONOSUPPORT;
+	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+				 * in nfs4_exchange_id */
+	default:
+		dprintk("%s: exit with error %d for server %s\n", __func__,
+				status, clp->cl_hostname);
+		return status;
+	}
+	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+	dprintk("%s: handled error %d for server %s\n", __func__, status,
+			clp->cl_hostname);
+	return 0;
+}
+
+static int nfs4_establish_lease(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	const struct nfs4_state_recovery_ops *ops =
+		clp->cl_mvops->reboot_recovery_ops;
+	int status;
+
+	cred = nfs4_get_clid_cred(clp);
+	if (cred == NULL)
+		return -ENOENT;
+	status = ops->establish_clid(clp, cred);
+	put_rpccred(cred);
+	if (status != 0)
+		return status;
+	pnfs_destroy_all_layouts(clp);
+	return 0;
+}
+
+/*
+ * Returns zero or a negative errno.  NFS4ERR values are converted
+ * to local errno values.
+ */
+static int nfs4_reclaim_lease(struct nfs_client *clp)
+{
+	int status;
+
+	status = nfs4_establish_lease(clp);
+	if (status < 0)
+		return nfs4_handle_reclaim_lease_error(clp, status);
+	if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state))
+		nfs4_state_start_reclaim_nograce(clp);
+	if (!test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+		set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+	clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+	clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+	return 0;
+}
+
+static int nfs4_purge_lease(struct nfs_client *clp)
+{
+	int status;
+
+	status = nfs4_establish_lease(clp);
+	if (status < 0)
+		return nfs4_handle_reclaim_lease_error(clp, status);
+	clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+	nfs4_state_start_reclaim_nograce(clp);
+	return 0;
+}
+
+/*
+ * Try remote migration of one FSID from a source server to a
+ * destination server.  The source server provides a list of
+ * potential destinations.
+ *
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_fs_locations *locations = NULL;
+	struct inode *inode;
+	struct page *page;
+	int status, result;
+
+	dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			clp->cl_hostname);
+
+	result = 0;
+	page = alloc_page(GFP_KERNEL);
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (page == NULL || locations == NULL) {
+		dprintk("<-- %s: no memory\n", __func__);
+		goto out;
+	}
+
+	inode = d_inode(server->super->s_root);
+	result = nfs4_proc_get_locations(inode, locations, page, cred);
+	if (result) {
+		dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
+			__func__, result);
+		goto out;
+	}
+
+	result = -NFS4ERR_NXIO;
+	if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+		dprintk("<-- %s: No fs_locations data, migration skipped\n",
+			__func__);
+		goto out;
+	}
+
+	nfs4_begin_drain_session(clp);
+
+	status = nfs4_replace_transport(server, locations);
+	if (status != 0) {
+		dprintk("<-- %s: failed to replace transport: %d\n",
+			__func__, status);
+		goto out;
+	}
+
+	result = 0;
+	dprintk("<-- %s: migration succeeded\n", __func__);
+
+out:
+	if (page != NULL)
+		__free_page(page);
+	kfree(locations);
+	if (result) {
+		pr_err("NFS: migration recovery failed (server %s)\n",
+				clp->cl_hostname);
+		set_bit(NFS_MIG_FAILED, &server->mig_status);
+	}
+	return result;
+}
+
+/*
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_handle_migration(struct nfs_client *clp)
+{
+	const struct nfs4_state_maintenance_ops *ops =
+				clp->cl_mvops->state_renewal_ops;
+	struct nfs_server *server;
+	struct rpc_cred *cred;
+
+	dprintk("%s: migration reported on \"%s\"\n", __func__,
+			clp->cl_hostname);
+
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL)
+		return -NFS4ERR_NOENT;
+
+	clp->cl_mig_gen++;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		int status;
+
+		if (server->mig_gen == clp->cl_mig_gen)
+			continue;
+		server->mig_gen = clp->cl_mig_gen;
+
+		if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION,
+						&server->mig_status))
+			continue;
+
+		rcu_read_unlock();
+		status = nfs4_try_migration(server, cred);
+		if (status < 0) {
+			put_rpccred(cred);
+			return status;
+		}
+		goto restart;
+	}
+	rcu_read_unlock();
+	put_rpccred(cred);
+	return 0;
+}
+
+/*
+ * Test each nfs_server on the clp's cl_superblocks list to see
+ * if it's moved to another server.  Stop when the server no longer
+ * returns NFS4ERR_LEASE_MOVED.
+ */
+static int nfs4_handle_lease_moved(struct nfs_client *clp)
+{
+	const struct nfs4_state_maintenance_ops *ops =
+				clp->cl_mvops->state_renewal_ops;
+	struct nfs_server *server;
+	struct rpc_cred *cred;
+
+	dprintk("%s: lease moved reported on \"%s\"\n", __func__,
+			clp->cl_hostname);
+
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL)
+		return -NFS4ERR_NOENT;
+
+	clp->cl_mig_gen++;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		struct inode *inode;
+		int status;
+
+		if (server->mig_gen == clp->cl_mig_gen)
+			continue;
+		server->mig_gen = clp->cl_mig_gen;
+
+		rcu_read_unlock();
+
+		inode = d_inode(server->super->s_root);
+		status = nfs4_proc_fsid_present(inode, cred);
+		if (status != -NFS4ERR_MOVED)
+			goto restart;	/* wasn't this one */
+		if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED)
+			goto restart;	/* there are more */
+		goto out;
+	}
+	rcu_read_unlock();
+
+out:
+	put_rpccred(cred);
+	return 0;
+}
+
+/**
+ * nfs4_discover_server_trunking - Detect server IP address trunking
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ *
+ * Returns zero or a negative errno.  If zero is returned,
+ * an nfs_client pointer is planted in "result".
+ *
+ * Note: since we are invoked in process context, and
+ * not from inside the state manager, we cannot use
+ * nfs4_handle_reclaim_lease_error().
+ */
+int nfs4_discover_server_trunking(struct nfs_client *clp,
+				  struct nfs_client **result)
+{
+	const struct nfs4_state_recovery_ops *ops =
+				clp->cl_mvops->reboot_recovery_ops;
+	struct rpc_clnt *clnt;
+	struct rpc_cred *cred;
+	int i, status;
+
+	dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
+
+	clnt = clp->cl_rpcclient;
+	i = 0;
+
+	mutex_lock(&nfs_clid_init_mutex);
+again:
+	status  = -ENOENT;
+	cred = nfs4_get_clid_cred(clp);
+	if (cred == NULL)
+		goto out_unlock;
+
+	status = ops->detect_trunking(clp, result, cred);
+	put_rpccred(cred);
+	switch (status) {
+	case 0:
+		break;
+	case -ETIMEDOUT:
+		if (clnt->cl_softrtry)
+			break;
+	case -NFS4ERR_DELAY:
+	case -EAGAIN:
+		ssleep(1);
+	case -NFS4ERR_STALE_CLIENTID:
+		dprintk("NFS: %s after status %d, retrying\n",
+			__func__, status);
+		goto again;
+	case -EACCES:
+		if (i++ == 0) {
+			nfs4_root_machine_cred(clp);
+			goto again;
+		}
+		if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX)
+			break;
+	case -NFS4ERR_CLID_INUSE:
+	case -NFS4ERR_WRONGSEC:
+		/* No point in retrying if we already used RPC_AUTH_UNIX */
+		if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) {
+			status = -EPERM;
+			break;
+		}
+		clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX);
+		if (IS_ERR(clnt)) {
+			status = PTR_ERR(clnt);
+			break;
+		}
+		/* Note: this is safe because we haven't yet marked the
+		 * client as ready, so we are the only user of
+		 * clp->cl_rpcclient
+		 */
+		clnt = xchg(&clp->cl_rpcclient, clnt);
+		rpc_shutdown_client(clnt);
+		clnt = clp->cl_rpcclient;
+		goto again;
+
+	case -NFS4ERR_MINOR_VERS_MISMATCH:
+		status = -EPROTONOSUPPORT;
+		break;
+
+	case -EKEYEXPIRED:
+	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+				 * in nfs4_exchange_id */
+		status = -EKEYEXPIRED;
+		break;
+	default:
+		pr_warn("NFS: %s unhandled error %d. Exiting with error EIO\n",
+				__func__, status);
+		status = -EIO;
+	}
+
+out_unlock:
+	mutex_unlock(&nfs_clid_init_mutex);
+	dprintk("NFS: %s: status = %d\n", __func__, status);
+	return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
+{
+	struct nfs_client *clp = session->clp;
+
+	switch (err) {
+	default:
+		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+		break;
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	}
+	nfs4_schedule_lease_recovery(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
+
+static void nfs41_ping_server(struct nfs_client *clp)
+{
+	/* Use CHECK_LEASE to ping the server with a SEQUENCE */
+	set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+}
+
+void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
+{
+	nfs41_ping_server(clp);
+}
+
+void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
+{
+	nfs41_ping_server(clp);
+}
+
+static void nfs4_reset_all_state(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
+		set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		nfs4_state_start_reclaim_nograce(clp);
+		dprintk("%s: scheduling reset of all state for server %s!\n",
+				__func__, clp->cl_hostname);
+		nfs4_schedule_state_manager(clp);
+	}
+}
+
+static void nfs41_handle_server_reboot(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
+		nfs4_state_start_reclaim_reboot(clp);
+		dprintk("%s: server %s rebooted!\n", __func__,
+				clp->cl_hostname);
+		nfs4_schedule_state_manager(clp);
+	}
+}
+
+static void nfs41_handle_state_revoked(struct nfs_client *clp)
+{
+	nfs4_reset_all_state(clp);
+	dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
+}
+
+static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
+{
+	/* This will need to handle layouts too */
+	nfs_expire_all_delegations(clp);
+	dprintk("%s: Recallable state revoked on server %s!\n", __func__,
+			clp->cl_hostname);
+}
+
+static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
+{
+	nfs_expire_all_delegations(clp);
+	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
+		nfs4_schedule_state_manager(clp);
+	dprintk("%s: server %s declared a backchannel fault\n", __func__,
+			clp->cl_hostname);
+}
+
+static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+		&clp->cl_state) == 0)
+		nfs4_schedule_state_manager(clp);
+}
+
+void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
+{
+	if (!flags)
+		return;
+
+	dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
+		__func__, clp->cl_hostname, clp->cl_clientid, flags);
+
+	if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
+		nfs41_handle_server_reboot(clp);
+	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
+			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
+			    SEQ4_STATUS_ADMIN_STATE_REVOKED))
+		nfs41_handle_state_revoked(clp);
+	if (flags & SEQ4_STATUS_LEASE_MOVED)
+		nfs4_schedule_lease_moved_recovery(clp);
+	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
+		nfs41_handle_recallable_state_revoked(clp);
+	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
+		nfs41_handle_backchannel_fault(clp);
+	else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+				SEQ4_STATUS_CB_PATH_DOWN_SESSION))
+		nfs41_handle_cb_path_down(clp);
+}
+
+static int nfs4_reset_session(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int status;
+
+	if (!nfs4_has_session(clp))
+		return 0;
+	nfs4_begin_drain_session(clp);
+	cred = nfs4_get_clid_cred(clp);
+	status = nfs4_proc_destroy_session(clp->cl_session, cred);
+	switch (status) {
+	case 0:
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_DEADSESSION:
+		break;
+	case -NFS4ERR_BACK_CHAN_BUSY:
+	case -NFS4ERR_DELAY:
+		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+		status = 0;
+		ssleep(1);
+		goto out;
+	default:
+		status = nfs4_recovery_handle_error(clp, status);
+		goto out;
+	}
+
+	memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
+	status = nfs4_proc_create_session(clp, cred);
+	if (status) {
+		dprintk("%s: session reset failed with status %d for server %s!\n",
+			__func__, status, clp->cl_hostname);
+		status = nfs4_handle_reclaim_lease_error(clp, status);
+		goto out;
+	}
+	nfs41_finish_session_reset(clp);
+	dprintk("%s: session reset was successful for server %s!\n",
+			__func__, clp->cl_hostname);
+out:
+	if (cred)
+		put_rpccred(cred);
+	return status;
+}
+
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int ret;
+
+	if (!nfs4_has_session(clp))
+		return 0;
+	nfs4_begin_drain_session(clp);
+	cred = nfs4_get_clid_cred(clp);
+	ret = nfs4_proc_bind_conn_to_session(clp, cred);
+	if (cred)
+		put_rpccred(cred);
+	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	switch (ret) {
+	case 0:
+		dprintk("%s: bind_conn_to_session was successful for server %s!\n",
+			__func__, clp->cl_hostname);
+		break;
+	case -NFS4ERR_DELAY:
+		ssleep(1);
+		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+		break;
+	default:
+		return nfs4_recovery_handle_error(clp, ret);
+	}
+	return 0;
+}
+#else /* CONFIG_NFS_V4_1 */
+static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
+
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static void nfs4_state_manager(struct nfs_client *clp)
+{
+	int status = 0;
+	const char *section = "", *section_sep = "";
+
+	/* Ensure exclusive access to NFSv4 state */
+	do {
+		if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+			section = "purge state";
+			status = nfs4_purge_lease(clp);
+			if (status < 0)
+				goto out_error;
+			continue;
+		}
+
+		if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
+			section = "lease expired";
+			/* We're going to have to re-establish a clientid */
+			status = nfs4_reclaim_lease(clp);
+			if (status < 0)
+				goto out_error;
+			continue;
+		}
+
+		/* Initialize or reset the session */
+		if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) {
+			section = "reset session";
+			status = nfs4_reset_session(clp);
+			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+				continue;
+			if (status < 0)
+				goto out_error;
+		}
+
+		/* Send BIND_CONN_TO_SESSION */
+		if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+				&clp->cl_state)) {
+			section = "bind conn to session";
+			status = nfs4_bind_conn_to_session(clp);
+			if (status < 0)
+				goto out_error;
+			continue;
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
+			section = "check lease";
+			status = nfs4_check_lease(clp);
+			if (status < 0)
+				goto out_error;
+			continue;
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
+			section = "migration";
+			status = nfs4_handle_migration(clp);
+			if (status < 0)
+				goto out_error;
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) {
+			section = "lease moved";
+			status = nfs4_handle_lease_moved(clp);
+			if (status < 0)
+				goto out_error;
+		}
+
+		/* First recover reboot state... */
+		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
+			section = "reclaim reboot";
+			status = nfs4_do_reclaim(clp,
+				clp->cl_mvops->reboot_recovery_ops);
+			if (status == -EAGAIN)
+				continue;
+			if (status < 0)
+				goto out_error;
+			nfs4_state_end_reclaim_reboot(clp);
+		}
+
+		/* Now recover expired state... */
+		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
+			section = "reclaim nograce";
+			status = nfs4_do_reclaim(clp,
+				clp->cl_mvops->nograce_recovery_ops);
+			if (status == -EAGAIN)
+				continue;
+			if (status < 0)
+				goto out_error;
+		}
+
+		nfs4_end_drain_session(clp);
+		if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
+			nfs_client_return_marked_delegations(clp);
+			continue;
+		}
+
+		nfs4_clear_state_manager_bit(clp);
+		/* Did we race with an attempt to give us more work? */
+		if (clp->cl_state == 0)
+			break;
+		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+			break;
+	} while (atomic_read(&clp->cl_count) > 1);
+	return;
+out_error:
+	if (strlen(section))
+		section_sep = ": ";
+	pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
+			" with error %d\n", section_sep, section,
+			clp->cl_hostname, -status);
+	ssleep(1);
+	nfs4_end_drain_session(clp);
+	nfs4_clear_state_manager_bit(clp);
+}
+
+static int nfs4_run_state_manager(void *ptr)
+{
+	struct nfs_client *clp = ptr;
+
+	allow_signal(SIGKILL);
+	nfs4_state_manager(clp);
+	nfs_put_client(clp);
+	module_put_and_exit(0);
+	return 0;
+}
+
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/kernel/fs/nfs/nfs4super.c b/kernel/fs/nfs/nfs4super.c
new file mode 100644
index 000000000..6fb7cb6b3
--- /dev/null
+++ b/kernel/fs/nfs/nfs4super.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2012 Bryan Schumaker <bjschuma@netapp.com>
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/nfs4_mount.h>
+#include <linux/nfs_fs.h>
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "nfs4idmap.h"
+#include "dns_resolve.h"
+#include "pnfs.h"
+#include "nfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc);
+static void nfs4_evict_inode(struct inode *inode);
+static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+
+static struct file_system_type nfs4_remote_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.mount		= nfs4_remote_mount,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+
+static struct file_system_type nfs4_remote_referral_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.mount		= nfs4_remote_referral_mount,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+
+struct file_system_type nfs4_referral_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.mount		= nfs4_referral_mount,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+
+static const struct super_operations nfs4_sops = {
+	.alloc_inode	= nfs_alloc_inode,
+	.destroy_inode	= nfs_destroy_inode,
+	.write_inode	= nfs4_write_inode,
+	.drop_inode	= nfs_drop_inode,
+	.statfs		= nfs_statfs,
+	.evict_inode	= nfs4_evict_inode,
+	.umount_begin	= nfs_umount_begin,
+	.show_options	= nfs_show_options,
+	.show_devname	= nfs_show_devname,
+	.show_path	= nfs_show_path,
+	.show_stats	= nfs_show_stats,
+	.remount_fs	= nfs_remount,
+};
+
+struct nfs_subversion nfs_v4 = {
+	.owner = THIS_MODULE,
+	.nfs_fs   = &nfs4_fs_type,
+	.rpc_vers = &nfs_version4,
+	.rpc_ops  = &nfs_v4_clientops,
+	.sops     = &nfs4_sops,
+	.xattr    = nfs4_xattr_handlers,
+};
+
+static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret = nfs_write_inode(inode, wbc);
+
+	if (ret == 0)
+		ret = pnfs_layoutcommit_inode(inode,
+				wbc->sync_mode == WB_SYNC_ALL);
+	return ret;
+}
+
+/*
+ * Clean out any remaining NFSv4 state that might be left over due
+ * to open() calls that passed nfs_atomic_lookup, but failed to call
+ * nfs_open().
+ */
+static void nfs4_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	/* If we are holding a delegation, return it! */
+	nfs_inode_return_delegation_noreclaim(inode);
+	/* Note that above delegreturn would trigger pnfs return-on-close */
+	pnfs_return_layout(inode);
+	pnfs_destroy_layout(NFS_I(inode));
+	/* First call standard NFS clear_inode() code */
+	nfs_clear_inode(inode);
+}
+
+/*
+ * Get the superblock for the NFS4 root partition
+ */
+static struct dentry *
+nfs4_remote_mount(struct file_system_type *fs_type, int flags,
+		  const char *dev_name, void *info)
+{
+	struct nfs_mount_info *mount_info = info;
+	struct nfs_server *server;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
+
+	mount_info->set_security = nfs_set_sb_security;
+
+	/* Get a volume representation */
+	server = nfs4_create_server(mount_info, &nfs_v4);
+	if (IS_ERR(server)) {
+		mntroot = ERR_CAST(server);
+		goto out;
+	}
+
+	mntroot = nfs_fs_mount_common(server, flags, dev_name, mount_info, &nfs_v4);
+
+out:
+	return mntroot;
+}
+
+static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
+		int flags, void *data, const char *hostname)
+{
+	struct vfsmount *root_mnt;
+	char *root_devname;
+	size_t len;
+
+	len = strlen(hostname) + 5;
+	root_devname = kmalloc(len, GFP_KERNEL);
+	if (root_devname == NULL)
+		return ERR_PTR(-ENOMEM);
+	/* Does hostname needs to be enclosed in brackets? */
+	if (strchr(hostname, ':'))
+		snprintf(root_devname, len, "[%s]:/", hostname);
+	else
+		snprintf(root_devname, len, "%s:/", hostname);
+	root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data);
+	kfree(root_devname);
+	return root_mnt;
+}
+
+struct nfs_referral_count {
+	struct list_head list;
+	const struct task_struct *task;
+	unsigned int referral_count;
+};
+
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+	struct nfs_referral_count *p;
+
+	list_for_each_entry(p, &nfs_referral_count_list, list) {
+		if (p->task == current)
+			return p;
+	}
+	return NULL;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+static int nfs_referral_loop_protect(void)
+{
+	struct nfs_referral_count *p, *new;
+	int ret = -ENOMEM;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		goto out;
+	new->task = current;
+	new->referral_count = 1;
+
+	ret = 0;
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	if (p != NULL) {
+		if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
+			ret = -ELOOP;
+		else
+			p->referral_count++;
+	} else {
+		list_add(&new->list, &nfs_referral_count_list);
+		new = NULL;
+	}
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(new);
+out:
+	return ret;
+}
+
+static void nfs_referral_loop_unprotect(void)
+{
+	struct nfs_referral_count *p;
+
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	p->referral_count--;
+	if (p->referral_count == 0)
+		list_del(&p->list);
+	else
+		p = NULL;
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(p);
+}
+
+static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
+		const char *export_path)
+{
+	struct dentry *dentry;
+	int err;
+
+	if (IS_ERR(root_mnt))
+		return ERR_CAST(root_mnt);
+
+	err = nfs_referral_loop_protect();
+	if (err) {
+		mntput(root_mnt);
+		return ERR_PTR(err);
+	}
+
+	dentry = mount_subtree(root_mnt, export_path);
+	nfs_referral_loop_unprotect();
+
+	return dentry;
+}
+
+struct dentry *nfs4_try_mount(int flags, const char *dev_name,
+			      struct nfs_mount_info *mount_info,
+			      struct nfs_subversion *nfs_mod)
+{
+	char *export_path;
+	struct vfsmount *root_mnt;
+	struct dentry *res;
+	struct nfs_parsed_mount_data *data = mount_info->parsed;
+
+	dfprintk(MOUNT, "--> nfs4_try_mount()\n");
+
+	export_path = data->nfs_server.export_path;
+	data->nfs_server.export_path = "/";
+	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info,
+			data->nfs_server.hostname);
+	data->nfs_server.export_path = export_path;
+
+	res = nfs_follow_remote_path(root_mnt, export_path);
+
+	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n",
+		 PTR_ERR_OR_ZERO(res),
+		 IS_ERR(res) ? " [error]" : "");
+	return res;
+}
+
+static struct dentry *
+nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
+			   const char *dev_name, void *raw_data)
+{
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs_fill_super,
+		.set_security = nfs_clone_sb_security,
+		.cloned = raw_data,
+	};
+	struct nfs_server *server;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
+
+	dprintk("--> nfs4_referral_get_sb()\n");
+
+	mount_info.mntfh = nfs_alloc_fhandle();
+	if (mount_info.cloned == NULL || mount_info.mntfh == NULL)
+		goto out;
+
+	/* create a new volume representation */
+	server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh);
+	if (IS_ERR(server)) {
+		mntroot = ERR_CAST(server);
+		goto out;
+	}
+
+	mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, &nfs_v4);
+out:
+	nfs_free_fhandle(mount_info.mntfh);
+	return mntroot;
+}
+
+/*
+ * Create an NFS4 server record on referral traversal
+ */
+static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	char *export_path;
+	struct vfsmount *root_mnt;
+	struct dentry *res;
+
+	dprintk("--> nfs4_referral_mount()\n");
+
+	export_path = data->mnt_path;
+	data->mnt_path = "/";
+
+	root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type,
+			flags, data, data->hostname);
+	data->mnt_path = export_path;
+
+	res = nfs_follow_remote_path(root_mnt, export_path);
+	dprintk("<-- nfs4_referral_mount() = %d%s\n",
+		PTR_ERR_OR_ZERO(res),
+		IS_ERR(res) ? " [error]" : "");
+	return res;
+}
+
+
+static int __init init_nfs_v4(void)
+{
+	int err;
+
+	err = nfs_dns_resolver_init();
+	if (err)
+		goto out;
+
+	err = nfs_idmap_init();
+	if (err)
+		goto out1;
+
+	err = nfs4_register_sysctl();
+	if (err)
+		goto out2;
+
+	register_nfs_version(&nfs_v4);
+	return 0;
+out2:
+	nfs_idmap_quit();
+out1:
+	nfs_dns_resolver_destroy();
+out:
+	return err;
+}
+
+static void __exit exit_nfs_v4(void)
+{
+	/* Not called in the _init(), conditionally loaded */
+	nfs4_pnfs_v3_ds_connect_unload();
+
+	unregister_nfs_version(&nfs_v4);
+	nfs4_unregister_sysctl();
+	nfs_idmap_quit();
+	nfs_dns_resolver_destroy();
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v4);
+module_exit(exit_nfs_v4);
diff --git a/kernel/fs/nfs/nfs4sysctl.c b/kernel/fs/nfs/nfs4sysctl.c
new file mode 100644
index 000000000..0fbd3ab1b
--- /dev/null
+++ b/kernel/fs/nfs/nfs4sysctl.c
@@ -0,0 +1,69 @@
+/*
+ * linux/fs/nfs/nfs4sysctl.c
+ *
+ * Sysctl interface to NFS v4 parameters
+ *
+ * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/sysctl.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "nfs4idmap.h"
+#include "callback.h"
+
+static const int nfs_set_port_min = 0;
+static const int nfs_set_port_max = 65535;
+static struct ctl_table_header *nfs4_callback_sysctl_table;
+
+static struct ctl_table nfs4_cb_sysctls[] = {
+	{
+		.procname = "nfs_callback_tcpport",
+		.data = &nfs_callback_set_tcpport,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = (int *)&nfs_set_port_min,
+		.extra2 = (int *)&nfs_set_port_max,
+	},
+	{
+		.procname = "idmap_cache_timeout",
+		.data = &nfs_idmap_cache_timeout,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_jiffies,
+	},
+	{ }
+};
+
+static struct ctl_table nfs4_cb_sysctl_dir[] = {
+	{
+		.procname = "nfs",
+		.mode = 0555,
+		.child = nfs4_cb_sysctls,
+	},
+	{ }
+};
+
+static struct ctl_table nfs4_cb_sysctl_root[] = {
+	{
+		.procname = "fs",
+		.mode = 0555,
+		.child = nfs4_cb_sysctl_dir,
+	},
+	{ }
+};
+
+int nfs4_register_sysctl(void)
+{
+	nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root);
+	if (nfs4_callback_sysctl_table == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void nfs4_unregister_sysctl(void)
+{
+	unregister_sysctl_table(nfs4_callback_sysctl_table);
+	nfs4_callback_sysctl_table = NULL;
+}
diff --git a/kernel/fs/nfs/nfs4trace.c b/kernel/fs/nfs/nfs4trace.c
new file mode 100644
index 000000000..d774335cc
--- /dev/null
+++ b/kernel/fs/nfs/nfs4trace.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "callback.h"
+
+#define CREATE_TRACE_POINTS
+#include "nfs4trace.h"
+
+#ifdef CONFIG_NFS_V4_1
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds);
+#endif
diff --git a/kernel/fs/nfs/nfs4trace.h b/kernel/fs/nfs/nfs4trace.h
new file mode 100644
index 000000000..470af1a78
--- /dev/null
+++ b/kernel/fs/nfs/nfs4trace.h
@@ -0,0 +1,1148 @@
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs4
+
+#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS4_H
+
+#include <linux/tracepoint.h>
+
+#define show_nfsv4_errors(error) \
+	__print_symbolic(error, \
+		{ NFS4_OK, "OK" }, \
+		/* Mapped by nfs4_stat_to_errno() */ \
+		{ -EPERM, "EPERM" }, \
+		{ -ENOENT, "ENOENT" }, \
+		{ -EIO, "EIO" }, \
+		{ -ENXIO, "ENXIO" }, \
+		{ -EACCES, "EACCES" }, \
+		{ -EEXIST, "EEXIST" }, \
+		{ -EXDEV, "EXDEV" }, \
+		{ -ENOTDIR, "ENOTDIR" }, \
+		{ -EISDIR, "EISDIR" }, \
+		{ -EFBIG, "EFBIG" }, \
+		{ -ENOSPC, "ENOSPC" }, \
+		{ -EROFS, "EROFS" }, \
+		{ -EMLINK, "EMLINK" }, \
+		{ -ENAMETOOLONG, "ENAMETOOLONG" }, \
+		{ -ENOTEMPTY, "ENOTEMPTY" }, \
+		{ -EDQUOT, "EDQUOT" }, \
+		{ -ESTALE, "ESTALE" }, \
+		{ -EBADHANDLE, "EBADHANDLE" }, \
+		{ -EBADCOOKIE, "EBADCOOKIE" }, \
+		{ -ENOTSUPP, "ENOTSUPP" }, \
+		{ -ETOOSMALL, "ETOOSMALL" }, \
+		{ -EREMOTEIO, "EREMOTEIO" }, \
+		{ -EBADTYPE, "EBADTYPE" }, \
+		{ -EAGAIN, "EAGAIN" }, \
+		{ -ELOOP, "ELOOP" }, \
+		{ -EOPNOTSUPP, "EOPNOTSUPP" }, \
+		{ -EDEADLK, "EDEADLK" }, \
+		/* RPC errors */ \
+		{ -ENOMEM, "ENOMEM" }, \
+		{ -EKEYEXPIRED, "EKEYEXPIRED" }, \
+		{ -ETIMEDOUT, "ETIMEDOUT" }, \
+		{ -ERESTARTSYS, "ERESTARTSYS" }, \
+		{ -ECONNREFUSED, "ECONNREFUSED" }, \
+		{ -ECONNRESET, "ECONNRESET" }, \
+		{ -ENETUNREACH, "ENETUNREACH" }, \
+		{ -EHOSTUNREACH, "EHOSTUNREACH" }, \
+		{ -EHOSTDOWN, "EHOSTDOWN" }, \
+		{ -EPIPE, "EPIPE" }, \
+		{ -EPFNOSUPPORT, "EPFNOSUPPORT" }, \
+		{ -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
+		/* NFSv4 native errors */ \
+		{ -NFS4ERR_ACCESS, "ACCESS" }, \
+		{ -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
+		{ -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
+		{ -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
+		{ -NFS4ERR_BADCHAR, "BADCHAR" }, \
+		{ -NFS4ERR_BADHANDLE, "BADHANDLE" }, \
+		{ -NFS4ERR_BADIOMODE, "BADIOMODE" }, \
+		{ -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
+		{ -NFS4ERR_BADLABEL, "BADLABEL" }, \
+		{ -NFS4ERR_BADNAME, "BADNAME" }, \
+		{ -NFS4ERR_BADOWNER, "BADOWNER" }, \
+		{ -NFS4ERR_BADSESSION, "BADSESSION" }, \
+		{ -NFS4ERR_BADSLOT, "BADSLOT" }, \
+		{ -NFS4ERR_BADTYPE, "BADTYPE" }, \
+		{ -NFS4ERR_BADXDR, "BADXDR" }, \
+		{ -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
+		{ -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
+		{ -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
+		{ -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
+		{ -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
+		{ -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
+		{ -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+		{ -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
+		{ -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
+		{ -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
+		{ -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
+			"CONN_NOT_BOUND_TO_SESSION" }, \
+		{ -NFS4ERR_DEADLOCK, "DEADLOCK" }, \
+		{ -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
+		{ -NFS4ERR_DELAY, "DELAY" }, \
+		{ -NFS4ERR_DELEG_ALREADY_WANTED, \
+			"DELEG_ALREADY_WANTED" }, \
+		{ -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
+		{ -NFS4ERR_DENIED, "DENIED" }, \
+		{ -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
+		{ -NFS4ERR_DQUOT, "DQUOT" }, \
+		{ -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
+		{ -NFS4ERR_EXIST, "EXIST" }, \
+		{ -NFS4ERR_EXPIRED, "EXPIRED" }, \
+		{ -NFS4ERR_FBIG, "FBIG" }, \
+		{ -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
+		{ -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
+		{ -NFS4ERR_GRACE, "GRACE" }, \
+		{ -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
+		{ -NFS4ERR_INVAL, "INVAL" }, \
+		{ -NFS4ERR_IO, "IO" }, \
+		{ -NFS4ERR_ISDIR, "ISDIR" }, \
+		{ -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
+		{ -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
+		{ -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
+		{ -NFS4ERR_LOCKED, "LOCKED" }, \
+		{ -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
+		{ -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
+		{ -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
+		{ -NFS4ERR_MLINK, "MLINK" }, \
+		{ -NFS4ERR_MOVED, "MOVED" }, \
+		{ -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
+		{ -NFS4ERR_NOENT, "NOENT" }, \
+		{ -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
+		{ -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
+		{ -NFS4ERR_NOSPC, "NOSPC" }, \
+		{ -NFS4ERR_NOTDIR, "NOTDIR" }, \
+		{ -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
+		{ -NFS4ERR_NOTSUPP, "NOTSUPP" }, \
+		{ -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
+		{ -NFS4ERR_NOT_SAME, "NOT_SAME" }, \
+		{ -NFS4ERR_NO_GRACE, "NO_GRACE" }, \
+		{ -NFS4ERR_NXIO, "NXIO" }, \
+		{ -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
+		{ -NFS4ERR_OPENMODE, "OPENMODE" }, \
+		{ -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
+		{ -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
+		{ -NFS4ERR_PERM, "PERM" }, \
+		{ -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
+		{ -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
+		{ -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
+		{ -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
+		{ -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
+		{ -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
+		{ -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
+		{ -NFS4ERR_REP_TOO_BIG_TO_CACHE, \
+			"REP_TOO_BIG_TO_CACHE" }, \
+		{ -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
+		{ -NFS4ERR_RESOURCE, "RESOURCE" }, \
+		{ -NFS4ERR_RESTOREFH, "RESTOREFH" }, \
+		{ -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
+		{ -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
+		{ -NFS4ERR_ROFS, "ROFS" }, \
+		{ -NFS4ERR_SAME, "SAME" }, \
+		{ -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
+		{ -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
+		{ -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
+		{ -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
+		{ -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
+		{ -NFS4ERR_STALE, "STALE" }, \
+		{ -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
+		{ -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
+		{ -NFS4ERR_SYMLINK, "SYMLINK" }, \
+		{ -NFS4ERR_TOOSMALL, "TOOSMALL" }, \
+		{ -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
+		{ -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
+		{ -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
+		{ -NFS4ERR_WRONGSEC, "WRONGSEC" }, \
+		{ -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
+		{ -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
+		{ -NFS4ERR_XDEV, "XDEV" })
+
+#define show_open_flags(flags) \
+	__print_flags(flags, "|", \
+		{ O_CREAT, "O_CREAT" }, \
+		{ O_EXCL, "O_EXCL" }, \
+		{ O_TRUNC, "O_TRUNC" }, \
+		{ O_DIRECT, "O_DIRECT" })
+
+#define show_fmode_flags(mode) \
+	__print_flags(mode, "|", \
+		{ ((__force unsigned long)FMODE_READ), "READ" }, \
+		{ ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
+		{ ((__force unsigned long)FMODE_EXEC), "EXEC" })
+
+#define show_nfs_fattr_flags(valid) \
+	__print_flags((unsigned long)valid, "|", \
+		{ NFS_ATTR_FATTR_TYPE, "TYPE" }, \
+		{ NFS_ATTR_FATTR_MODE, "MODE" }, \
+		{ NFS_ATTR_FATTR_NLINK, "NLINK" }, \
+		{ NFS_ATTR_FATTR_OWNER, "OWNER" }, \
+		{ NFS_ATTR_FATTR_GROUP, "GROUP" }, \
+		{ NFS_ATTR_FATTR_RDEV, "RDEV" }, \
+		{ NFS_ATTR_FATTR_SIZE, "SIZE" }, \
+		{ NFS_ATTR_FATTR_FSID, "FSID" }, \
+		{ NFS_ATTR_FATTR_FILEID, "FILEID" }, \
+		{ NFS_ATTR_FATTR_ATIME, "ATIME" }, \
+		{ NFS_ATTR_FATTR_MTIME, "MTIME" }, \
+		{ NFS_ATTR_FATTR_CTIME, "CTIME" }, \
+		{ NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \
+		{ NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \
+		{ NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" })
+
+DECLARE_EVENT_CLASS(nfs4_clientid_event,
+		TP_PROTO(
+			const struct nfs_client *clp,
+			int error
+		),
+
+		TP_ARGS(clp, error),
+
+		TP_STRUCT__entry(
+			__string(dstaddr,
+				rpc_peeraddr2str(clp->cl_rpcclient,
+					RPC_DISPLAY_ADDR))
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			__entry->error = error;
+			__assign_str(dstaddr,
+				rpc_peeraddr2str(clp->cl_rpcclient,
+						RPC_DISPLAY_ADDR));
+		),
+
+		TP_printk(
+			"error=%d (%s) dstaddr=%s",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			__get_str(dstaddr)
+		)
+);
+#define DEFINE_NFS4_CLIENTID_EVENT(name) \
+	DEFINE_EVENT(nfs4_clientid_event, name,	 \
+			TP_PROTO( \
+				const struct nfs_client *clp, \
+				int error \
+			), \
+			TP_ARGS(clp, error))
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
+
+TRACE_EVENT(nfs4_setup_sequence,
+		TP_PROTO(
+			const struct nfs4_session *session,
+			const struct nfs4_sequence_args *args
+		),
+		TP_ARGS(session, args),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, session)
+			__field(unsigned int, slot_nr)
+			__field(unsigned int, seq_nr)
+			__field(unsigned int, highest_used_slotid)
+		),
+
+		TP_fast_assign(
+			const struct nfs4_slot *sa_slot = args->sa_slot;
+			__entry->session = nfs_session_id_hash(&session->sess_id);
+			__entry->slot_nr = sa_slot->slot_nr;
+			__entry->seq_nr = sa_slot->seq_nr;
+			__entry->highest_used_slotid =
+					sa_slot->table->highest_used_slotid;
+		),
+		TP_printk(
+			"session=0x%08x slot_nr=%u seq_nr=%u "
+			"highest_used_slotid=%u",
+			__entry->session,
+			__entry->slot_nr,
+			__entry->seq_nr,
+			__entry->highest_used_slotid
+		)
+);
+
+#define show_nfs4_sequence_status_flags(status) \
+	__print_flags((unsigned long)status, "|", \
+		{ SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+		{ SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \
+			"CB_GSS_CONTEXTS_EXPIRING" }, \
+		{ SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \
+			"CB_GSS_CONTEXTS_EXPIRED" }, \
+		{ SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \
+			"EXPIRED_ALL_STATE_REVOKED" }, \
+		{ SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \
+			"EXPIRED_SOME_STATE_REVOKED" }, \
+		{ SEQ4_STATUS_ADMIN_STATE_REVOKED, \
+			"ADMIN_STATE_REVOKED" }, \
+		{ SEQ4_STATUS_RECALLABLE_STATE_REVOKED,	 \
+			"RECALLABLE_STATE_REVOKED" }, \
+		{ SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \
+		{ SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \
+			"RESTART_RECLAIM_NEEDED" }, \
+		{ SEQ4_STATUS_CB_PATH_DOWN_SESSION, \
+			"CB_PATH_DOWN_SESSION" }, \
+		{ SEQ4_STATUS_BACKCHANNEL_FAULT, \
+			"BACKCHANNEL_FAULT" })
+
+TRACE_EVENT(nfs4_sequence_done,
+		TP_PROTO(
+			const struct nfs4_session *session,
+			const struct nfs4_sequence_res *res
+		),
+		TP_ARGS(session, res),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, session)
+			__field(unsigned int, slot_nr)
+			__field(unsigned int, seq_nr)
+			__field(unsigned int, highest_slotid)
+			__field(unsigned int, target_highest_slotid)
+			__field(unsigned int, status_flags)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			const struct nfs4_slot *sr_slot = res->sr_slot;
+			__entry->session = nfs_session_id_hash(&session->sess_id);
+			__entry->slot_nr = sr_slot->slot_nr;
+			__entry->seq_nr = sr_slot->seq_nr;
+			__entry->highest_slotid = res->sr_highest_slotid;
+			__entry->target_highest_slotid =
+					res->sr_target_highest_slotid;
+			__entry->error = res->sr_status;
+		),
+		TP_printk(
+			"error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+			"highest_slotid=%u target_highest_slotid=%u "
+			"status_flags=%u (%s)",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			__entry->session,
+			__entry->slot_nr,
+			__entry->seq_nr,
+			__entry->highest_slotid,
+			__entry->target_highest_slotid,
+			__entry->status_flags,
+			show_nfs4_sequence_status_flags(__entry->status_flags)
+		)
+);
+
+struct cb_sequenceargs;
+struct cb_sequenceres;
+
+TRACE_EVENT(nfs4_cb_sequence,
+		TP_PROTO(
+			const struct cb_sequenceargs *args,
+			const struct cb_sequenceres *res,
+			__be32 status
+		),
+		TP_ARGS(args, res, status),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, session)
+			__field(unsigned int, slot_nr)
+			__field(unsigned int, seq_nr)
+			__field(unsigned int, highest_slotid)
+			__field(unsigned int, cachethis)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			__entry->session = nfs_session_id_hash(&args->csa_sessionid);
+			__entry->slot_nr = args->csa_slotid;
+			__entry->seq_nr = args->csa_sequenceid;
+			__entry->highest_slotid = args->csa_highestslotid;
+			__entry->cachethis = args->csa_cachethis;
+			__entry->error = -be32_to_cpu(status);
+		),
+
+		TP_printk(
+			"error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+			"highest_slotid=%u",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			__entry->session,
+			__entry->slot_nr,
+			__entry->seq_nr,
+			__entry->highest_slotid
+		)
+);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_open_event,
+		TP_PROTO(
+			const struct nfs_open_context *ctx,
+			int flags,
+			int error
+		),
+
+		TP_ARGS(ctx, flags, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(unsigned int, flags)
+			__field(unsigned int, fmode)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(u64, dir)
+			__string(name, ctx->dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			const struct nfs4_state *state = ctx->state;
+			const struct inode *inode = NULL;
+
+			__entry->error = error;
+			__entry->flags = flags;
+			__entry->fmode = (__force unsigned int)ctx->mode;
+			__entry->dev = ctx->dentry->d_sb->s_dev;
+			if (!IS_ERR(state))
+				inode = state->inode;
+			if (inode != NULL) {
+				__entry->fileid = NFS_FILEID(inode);
+				__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			} else {
+				__entry->fileid = 0;
+				__entry->fhandle = 0;
+			}
+			__entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent));
+			__assign_str(name, ctx->dentry->d_name.name);
+		),
+
+		TP_printk(
+			"error=%d (%s) flags=%d (%s) fmode=%s "
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"name=%02x:%02x:%llu/%s",
+			 __entry->error,
+			 show_nfsv4_errors(__entry->error),
+			 __entry->flags,
+			 show_open_flags(__entry->flags),
+			 show_fmode_flags(__entry->fmode),
+			 MAJOR(__entry->dev), MINOR(__entry->dev),
+			 (unsigned long long)__entry->fileid,
+			 __entry->fhandle,
+			 MAJOR(__entry->dev), MINOR(__entry->dev),
+			 (unsigned long long)__entry->dir,
+			 __get_str(name)
+		)
+);
+
+#define DEFINE_NFS4_OPEN_EVENT(name) \
+	DEFINE_EVENT(nfs4_open_event, name, \
+			TP_PROTO( \
+				const struct nfs_open_context *ctx, \
+				int flags, \
+				int error \
+			), \
+			TP_ARGS(ctx, flags, error))
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
+
+TRACE_EVENT(nfs4_close,
+		TP_PROTO(
+			const struct nfs4_state *state,
+			const struct nfs_closeargs *args,
+			const struct nfs_closeres *res,
+			int error
+		),
+
+		TP_ARGS(state, args, res, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(unsigned int, fmode)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = state->inode;
+
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->fmode = (__force unsigned int)state->state;
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
+			"fhandle=0x%08x",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			__entry->fmode ?  show_fmode_flags(__entry->fmode) :
+					  "closed",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle
+		)
+);
+
+#define show_lock_cmd(type) \
+	__print_symbolic((int)type, \
+		{ F_GETLK, "GETLK" }, \
+		{ F_SETLK, "SETLK" }, \
+		{ F_SETLKW, "SETLKW" })
+#define show_lock_type(type) \
+	__print_symbolic((int)type, \
+		{ F_RDLCK, "RDLCK" }, \
+		{ F_WRLCK, "WRLCK" }, \
+		{ F_UNLCK, "UNLCK" })
+
+DECLARE_EVENT_CLASS(nfs4_lock_event,
+		TP_PROTO(
+			const struct file_lock *request,
+			const struct nfs4_state *state,
+			int cmd,
+			int error
+		),
+
+		TP_ARGS(request, state, cmd, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(int, cmd)
+			__field(char, type)
+			__field(loff_t, start)
+			__field(loff_t, end)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = state->inode;
+
+			__entry->error = error;
+			__entry->cmd = cmd;
+			__entry->type = request->fl_type;
+			__entry->start = request->fl_start;
+			__entry->end = request->fl_end;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+		),
+
+		TP_printk(
+			"error=%d (%s) cmd=%s:%s range=%lld:%lld "
+			"fileid=%02x:%02x:%llu fhandle=0x%08x",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			show_lock_cmd(__entry->cmd),
+			show_lock_type(__entry->type),
+			(long long)__entry->start,
+			(long long)__entry->end,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle
+		)
+);
+
+#define DEFINE_NFS4_LOCK_EVENT(name) \
+	DEFINE_EVENT(nfs4_lock_event, name, \
+			TP_PROTO( \
+				const struct file_lock *request, \
+				const struct nfs4_state *state, \
+				int cmd, \
+				int error \
+			), \
+			TP_ARGS(request, state, cmd, error))
+DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
+DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
+DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
+DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
+DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
+
+DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
+		TP_PROTO(
+			const struct inode *inode,
+			fmode_t fmode
+		),
+
+		TP_ARGS(inode, fmode),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(unsigned int, fmode)
+		),
+
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->fmode = (__force unsigned int)fmode;
+		),
+
+		TP_printk(
+			"fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x",
+			show_fmode_flags(__entry->fmode),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle
+		)
+);
+#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \
+	DEFINE_EVENT(nfs4_set_delegation_event, name, \
+			TP_PROTO( \
+				const struct inode *inode, \
+				fmode_t fmode \
+			), \
+			TP_ARGS(inode, fmode))
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation);
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation);
+
+TRACE_EVENT(nfs4_delegreturn_exit,
+		TP_PROTO(
+			const struct nfs4_delegreturnargs *args,
+			const struct nfs4_delegreturnres *res,
+			int error
+		),
+
+		TP_ARGS(args, res, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			__entry->dev = res->server->s_dev;
+			__entry->fhandle = nfs_fhandle_hash(args->fhandle);
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			__entry->fhandle
+		)
+);
+
+#ifdef CONFIG_NFS_V4_1
+DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
+		TP_PROTO(
+			const struct nfs4_state *state,
+			const struct nfs4_lock_state *lsp,
+			int error
+		),
+
+		TP_ARGS(state, lsp, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = state->inode;
+
+			__entry->error = error;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle
+		)
+);
+
+#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \
+	DEFINE_EVENT(nfs4_test_stateid_event, name, \
+			TP_PROTO( \
+				const struct nfs4_state *state, \
+				const struct nfs4_lock_state *lsp, \
+				int error \
+			), \
+			TP_ARGS(state, lsp, error))
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid);
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid);
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_lookup_event,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct qstr *name,
+			int error
+		),
+
+		TP_ARGS(dir, name, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(int, error)
+			__field(u64, dir)
+			__string(name, name->name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->error = error;
+			__assign_str(name, name->name);
+		),
+
+		TP_printk(
+			"error=%d (%s) name=%02x:%02x:%llu/%s",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+#define DEFINE_NFS4_LOOKUP_EVENT(name) \
+	DEFINE_EVENT(nfs4_lookup_event, name, \
+			TP_PROTO( \
+				const struct inode *dir, \
+				const struct qstr *name, \
+				int error \
+			), \
+			TP_ARGS(dir, name, error))
+
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo);
+
+TRACE_EVENT(nfs4_rename,
+		TP_PROTO(
+			const struct inode *olddir,
+			const struct qstr *oldname,
+			const struct inode *newdir,
+			const struct qstr *newname,
+			int error
+		),
+
+		TP_ARGS(olddir, oldname, newdir, newname, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(int, error)
+			__field(u64, olddir)
+			__string(oldname, oldname->name)
+			__field(u64, newdir)
+			__string(newname, newname->name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = olddir->i_sb->s_dev;
+			__entry->olddir = NFS_FILEID(olddir);
+			__entry->newdir = NFS_FILEID(newdir);
+			__entry->error = error;
+			__assign_str(oldname, oldname->name);
+			__assign_str(newname, newname->name);
+		),
+
+		TP_printk(
+			"error=%d (%s) oldname=%02x:%02x:%llu/%s "
+			"newname=%02x:%02x:%llu/%s",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->olddir,
+			__get_str(oldname),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->newdir,
+			__get_str(newname)
+		)
+);
+
+DECLARE_EVENT_CLASS(nfs4_inode_event,
+		TP_PROTO(
+			const struct inode *inode,
+			int error
+		),
+
+		TP_ARGS(inode, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle
+		)
+);
+
+#define DEFINE_NFS4_INODE_EVENT(name) \
+	DEFINE_EVENT(nfs4_inode_event, name, \
+			TP_PROTO( \
+				const struct inode *inode, \
+				int error \
+			), \
+			TP_ARGS(inode, error))
+
+DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_EVENT(nfs4_access);
+DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
+DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
+DEFINE_NFS4_INODE_EVENT(nfs4_get_acl);
+DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
+DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
+DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
+
+DECLARE_EVENT_CLASS(nfs4_getattr_event,
+		TP_PROTO(
+			const struct nfs_server *server,
+			const struct nfs_fh *fhandle,
+			const struct nfs_fattr *fattr,
+			int error
+		),
+
+		TP_ARGS(server, fhandle, fattr, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(unsigned int, valid)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			__entry->dev = server->s_dev;
+			__entry->valid = fattr->valid;
+			__entry->fhandle = nfs_fhandle_hash(fhandle);
+			__entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? fattr->fileid : 0;
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"valid=%s",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			show_nfs_fattr_flags(__entry->valid)
+		)
+);
+
+#define DEFINE_NFS4_GETATTR_EVENT(name) \
+	DEFINE_EVENT(nfs4_getattr_event, name, \
+			TP_PROTO( \
+				const struct nfs_server *server, \
+				const struct nfs_fh *fhandle, \
+				const struct nfs_fattr *fattr, \
+				int error \
+			), \
+			TP_ARGS(server, fhandle, fattr, error))
+DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
+DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
+DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
+
+DECLARE_EVENT_CLASS(nfs4_idmap_event,
+		TP_PROTO(
+			const char *name,
+			int len,
+			u32 id,
+			int error
+		),
+
+		TP_ARGS(name, len, id, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(u32, id)
+			__dynamic_array(char, name, len > 0 ? len + 1 : 1)
+		),
+
+		TP_fast_assign(
+			if (len < 0)
+				len = 0;
+			__entry->error = error < 0 ? error : 0;
+			__entry->id = id;
+			memcpy(__get_dynamic_array(name), name, len);
+			((char *)__get_dynamic_array(name))[len] = 0;
+		),
+
+		TP_printk(
+			"error=%d id=%u name=%s",
+			__entry->error,
+			__entry->id,
+			__get_str(name)
+		)
+);
+#define DEFINE_NFS4_IDMAP_EVENT(name) \
+	DEFINE_EVENT(nfs4_idmap_event, name, \
+			TP_PROTO( \
+				const char *name, \
+				int len, \
+				u32 id, \
+				int error \
+			), \
+			TP_ARGS(name, len, id, error))
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
+
+DECLARE_EVENT_CLASS(nfs4_read_event,
+		TP_PROTO(
+			const struct nfs_pgio_header *hdr,
+			int error
+		),
+
+		TP_ARGS(hdr, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(loff_t, offset)
+			__field(size_t, count)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = hdr->inode;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->offset = hdr->args.offset;
+			__entry->count = hdr->args.count;
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"offset=%lld count=%zu",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			(long long)__entry->offset,
+			__entry->count
+		)
+);
+#define DEFINE_NFS4_READ_EVENT(name) \
+	DEFINE_EVENT(nfs4_read_event, name, \
+			TP_PROTO( \
+				const struct nfs_pgio_header *hdr, \
+				int error \
+			), \
+			TP_ARGS(hdr, error))
+DEFINE_NFS4_READ_EVENT(nfs4_read);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_write_event,
+		TP_PROTO(
+			const struct nfs_pgio_header *hdr,
+			int error
+		),
+
+		TP_ARGS(hdr, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(loff_t, offset)
+			__field(size_t, count)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = hdr->inode;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->offset = hdr->args.offset;
+			__entry->count = hdr->args.count;
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"offset=%lld count=%zu",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			(long long)__entry->offset,
+			__entry->count
+		)
+);
+
+#define DEFINE_NFS4_WRITE_EVENT(name) \
+	DEFINE_EVENT(nfs4_write_event, name, \
+			TP_PROTO( \
+				const struct nfs_pgio_header *hdr, \
+				int error \
+			), \
+			TP_ARGS(hdr, error))
+DEFINE_NFS4_WRITE_EVENT(nfs4_write);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_commit_event,
+		TP_PROTO(
+			const struct nfs_commit_data *data,
+			int error
+		),
+
+		TP_ARGS(data, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(loff_t, offset)
+			__field(size_t, count)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = data->inode;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->offset = data->args.offset;
+			__entry->count = data->args.count;
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"offset=%lld count=%zu",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			(long long)__entry->offset,
+			__entry->count
+		)
+);
+#define DEFINE_NFS4_COMMIT_EVENT(name) \
+	DEFINE_EVENT(nfs4_commit_event, name, \
+			TP_PROTO( \
+				const struct nfs_commit_data *data, \
+				int error \
+			), \
+			TP_ARGS(data, error))
+DEFINE_NFS4_COMMIT_EVENT(nfs4_commit);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds);
+
+#define show_pnfs_iomode(iomode) \
+	__print_symbolic(iomode, \
+		{ IOMODE_READ, "READ" }, \
+		{ IOMODE_RW, "RW" }, \
+		{ IOMODE_ANY, "ANY" })
+
+TRACE_EVENT(nfs4_layoutget,
+		TP_PROTO(
+			const struct nfs_open_context *ctx,
+			const struct pnfs_layout_range *args,
+			const struct pnfs_layout_range *res,
+			int error
+		),
+
+		TP_ARGS(ctx, args, res, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(u32, iomode)
+			__field(u64, offset)
+			__field(u64, count)
+			__field(int, error)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = d_inode(ctx->dentry);
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->iomode = args->iomode;
+			__entry->offset = args->offset;
+			__entry->count = args->length;
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"iomode=%s offset=%llu count=%llu",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			show_pnfs_iomode(__entry->iomode),
+			(unsigned long long)__entry->offset,
+			(unsigned long long)__entry->count
+		)
+);
+
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#endif /* _TRACE_NFS4_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE nfs4trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/fs/nfs/nfs4xdr.c b/kernel/fs/nfs/nfs4xdr.c
new file mode 100644
index 000000000..0aea97841
--- /dev/null
+++ b/kernel/fs/nfs/nfs4xdr.c
@@ -0,0 +1,7443 @@
+/*
+ *  fs/nfs/nfs4xdr.c
+ *
+ *  Client-side XDR for NFSv4.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY		NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO		EIO
+
+static int nfs4_stat_to_errno(int);
+
+/* NFSv4 COMPOUND tags are only wanted for debugging purposes */
+#ifdef DEBUG
+#define NFS4_MAXTAGLEN		20
+#else
+#define NFS4_MAXTAGLEN		0
+#endif
+
+/* lock,open owner id:
+ * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT  >> 2)
+ */
+#define open_owner_id_maxsz	(1 + 2 + 1 + 1 + 2)
+#define lock_owner_id_maxsz	(1 + 1 + 4)
+#define decode_lockowner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define compound_encode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
+#define compound_decode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
+#define op_encode_hdr_maxsz	(1)
+#define op_decode_hdr_maxsz	(2)
+#define encode_stateid_maxsz	(XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_stateid_maxsz	(XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define encode_verifier_maxsz	(XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define decode_verifier_maxsz	(XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define encode_putfh_maxsz	(op_encode_hdr_maxsz + 1 + \
+				(NFS4_FHSIZE >> 2))
+#define decode_putfh_maxsz	(op_decode_hdr_maxsz)
+#define encode_putrootfh_maxsz	(op_encode_hdr_maxsz)
+#define decode_putrootfh_maxsz	(op_decode_hdr_maxsz)
+#define encode_getfh_maxsz      (op_encode_hdr_maxsz)
+#define decode_getfh_maxsz      (op_decode_hdr_maxsz + 1 + \
+				((3+NFS4_FHSIZE) >> 2))
+#define nfs4_fattr_bitmap_maxsz 4
+#define encode_getattr_maxsz    (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+#define nfs4_name_maxsz		(1 + ((3 + NFS4_MAXNAMLEN) >> 2))
+#define nfs4_path_maxsz		(1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+#define nfs4_owner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define nfs4_group_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define	nfs4_label_maxsz	(4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#else
+#define	nfs4_label_maxsz	0
+#endif
+/* We support only one layout type per file system */
+#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
+/* This is based on getfattr, which uses the most attributes: */
+#define nfs4_fattr_value_maxsz	(1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
+				3 + 3 + 3 + nfs4_owner_maxsz + \
+				nfs4_group_maxsz + nfs4_label_maxsz + \
+				 decode_mdsthreshold_maxsz))
+#define nfs4_fattr_maxsz	(nfs4_fattr_bitmap_maxsz + \
+				nfs4_fattr_value_maxsz)
+#define decode_getattr_maxsz    (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
+#define encode_attrs_maxsz	(nfs4_fattr_bitmap_maxsz + \
+				 1 + 2 + 1 + \
+				nfs4_owner_maxsz + \
+				nfs4_group_maxsz + \
+				nfs4_label_maxsz + \
+				4 + 4)
+#define encode_savefh_maxsz     (op_encode_hdr_maxsz)
+#define decode_savefh_maxsz     (op_decode_hdr_maxsz)
+#define encode_restorefh_maxsz  (op_encode_hdr_maxsz)
+#define decode_restorefh_maxsz  (op_decode_hdr_maxsz)
+#define encode_fsinfo_maxsz	(encode_getattr_maxsz)
+/* The 5 accounts for the PNFS attributes, and assumes that at most three
+ * layout types will be returned.
+ */
+#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + \
+				 nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
+#define encode_renew_maxsz	(op_encode_hdr_maxsz + 3)
+#define decode_renew_maxsz	(op_decode_hdr_maxsz)
+#define encode_setclientid_maxsz \
+				(op_encode_hdr_maxsz + \
+				XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+				XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
+				1 /* sc_prog */ + \
+				1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+				1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
+				1) /* sc_cb_ident */
+#define decode_setclientid_maxsz \
+				(op_decode_hdr_maxsz + \
+				2 /* clientid */ + \
+				XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+				1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+				1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN))
+#define encode_setclientid_confirm_maxsz \
+				(op_encode_hdr_maxsz + \
+				3 + (NFS4_VERIFIER_SIZE >> 2))
+#define decode_setclientid_confirm_maxsz \
+				(op_decode_hdr_maxsz)
+#define encode_lookup_maxsz	(op_encode_hdr_maxsz + nfs4_name_maxsz)
+#define decode_lookup_maxsz	(op_decode_hdr_maxsz)
+#define encode_share_access_maxsz \
+				(2)
+#define encode_createmode_maxsz	(1 + encode_attrs_maxsz + encode_verifier_maxsz)
+#define encode_opentype_maxsz	(1 + encode_createmode_maxsz)
+#define encode_claim_null_maxsz	(1 + nfs4_name_maxsz)
+#define encode_open_maxsz	(op_encode_hdr_maxsz + \
+				2 + encode_share_access_maxsz + 2 + \
+				open_owner_id_maxsz + \
+				encode_opentype_maxsz + \
+				encode_claim_null_maxsz)
+#define decode_ace_maxsz	(3 + nfs4_owner_maxsz)
+#define decode_delegation_maxsz	(1 + decode_stateid_maxsz + 1 + \
+				decode_ace_maxsz)
+#define decode_change_info_maxsz	(5)
+#define decode_open_maxsz	(op_decode_hdr_maxsz + \
+				decode_stateid_maxsz + \
+				decode_change_info_maxsz + 1 + \
+				nfs4_fattr_bitmap_maxsz + \
+				decode_delegation_maxsz)
+#define encode_open_confirm_maxsz \
+				(op_encode_hdr_maxsz + \
+				 encode_stateid_maxsz + 1)
+#define decode_open_confirm_maxsz \
+				(op_decode_hdr_maxsz + \
+				 decode_stateid_maxsz)
+#define encode_open_downgrade_maxsz \
+				(op_encode_hdr_maxsz + \
+				 encode_stateid_maxsz + 1 + \
+				 encode_share_access_maxsz)
+#define decode_open_downgrade_maxsz \
+				(op_decode_hdr_maxsz + \
+				 decode_stateid_maxsz)
+#define encode_close_maxsz	(op_encode_hdr_maxsz + \
+				 1 + encode_stateid_maxsz)
+#define decode_close_maxsz	(op_decode_hdr_maxsz + \
+				 decode_stateid_maxsz)
+#define encode_setattr_maxsz	(op_encode_hdr_maxsz + \
+				 encode_stateid_maxsz + \
+				 encode_attrs_maxsz)
+#define decode_setattr_maxsz	(op_decode_hdr_maxsz + \
+				 nfs4_fattr_bitmap_maxsz)
+#define encode_read_maxsz	(op_encode_hdr_maxsz + \
+				 encode_stateid_maxsz + 3)
+#define decode_read_maxsz	(op_decode_hdr_maxsz + 2)
+#define encode_readdir_maxsz	(op_encode_hdr_maxsz + \
+				 2 + encode_verifier_maxsz + 5 + \
+				nfs4_label_maxsz)
+#define decode_readdir_maxsz	(op_decode_hdr_maxsz + \
+				 decode_verifier_maxsz)
+#define encode_readlink_maxsz	(op_encode_hdr_maxsz)
+#define decode_readlink_maxsz	(op_decode_hdr_maxsz + 1)
+#define encode_write_maxsz	(op_encode_hdr_maxsz + \
+				 encode_stateid_maxsz + 4)
+#define decode_write_maxsz	(op_decode_hdr_maxsz + \
+				 2 + decode_verifier_maxsz)
+#define encode_commit_maxsz	(op_encode_hdr_maxsz + 3)
+#define decode_commit_maxsz	(op_decode_hdr_maxsz + \
+				 decode_verifier_maxsz)
+#define encode_remove_maxsz	(op_encode_hdr_maxsz + \
+				nfs4_name_maxsz)
+#define decode_remove_maxsz	(op_decode_hdr_maxsz + \
+				 decode_change_info_maxsz)
+#define encode_rename_maxsz	(op_encode_hdr_maxsz + \
+				2 * nfs4_name_maxsz)
+#define decode_rename_maxsz	(op_decode_hdr_maxsz + \
+				 decode_change_info_maxsz + \
+				 decode_change_info_maxsz)
+#define encode_link_maxsz	(op_encode_hdr_maxsz + \
+				nfs4_name_maxsz)
+#define decode_link_maxsz	(op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_lockowner_maxsz	(7)
+#define encode_lock_maxsz	(op_encode_hdr_maxsz + \
+				 7 + \
+				 1 + encode_stateid_maxsz + 1 + \
+				 encode_lockowner_maxsz)
+#define decode_lock_denied_maxsz \
+				(8 + decode_lockowner_maxsz)
+#define decode_lock_maxsz	(op_decode_hdr_maxsz + \
+				 decode_lock_denied_maxsz)
+#define encode_lockt_maxsz	(op_encode_hdr_maxsz + 5 + \
+				encode_lockowner_maxsz)
+#define decode_lockt_maxsz	(op_decode_hdr_maxsz + \
+				 decode_lock_denied_maxsz)
+#define encode_locku_maxsz	(op_encode_hdr_maxsz + 3 + \
+				 encode_stateid_maxsz + \
+				 4)
+#define decode_locku_maxsz	(op_decode_hdr_maxsz + \
+				 decode_stateid_maxsz)
+#define encode_release_lockowner_maxsz \
+				(op_encode_hdr_maxsz + \
+				 encode_lockowner_maxsz)
+#define decode_release_lockowner_maxsz \
+				(op_decode_hdr_maxsz)
+#define encode_access_maxsz	(op_encode_hdr_maxsz + 1)
+#define decode_access_maxsz	(op_decode_hdr_maxsz + 2)
+#define encode_symlink_maxsz	(op_encode_hdr_maxsz + \
+				1 + nfs4_name_maxsz + \
+				1 + \
+				nfs4_fattr_maxsz)
+#define decode_symlink_maxsz	(op_decode_hdr_maxsz + 8)
+#define encode_create_maxsz	(op_encode_hdr_maxsz + \
+				1 + 2 + nfs4_name_maxsz + \
+				encode_attrs_maxsz)
+#define decode_create_maxsz	(op_decode_hdr_maxsz + \
+				decode_change_info_maxsz + \
+				nfs4_fattr_bitmap_maxsz)
+#define encode_statfs_maxsz	(encode_getattr_maxsz)
+#define decode_statfs_maxsz	(decode_getattr_maxsz)
+#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4)
+#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
+#define encode_getacl_maxsz	(encode_getattr_maxsz)
+#define decode_getacl_maxsz	(op_decode_hdr_maxsz + \
+				 nfs4_fattr_bitmap_maxsz + 1)
+#define encode_setacl_maxsz	(op_encode_hdr_maxsz + \
+				 encode_stateid_maxsz + 3)
+#define decode_setacl_maxsz	(decode_setattr_maxsz)
+#define encode_fs_locations_maxsz \
+				(encode_getattr_maxsz)
+#define decode_fs_locations_maxsz \
+				(0)
+#define encode_secinfo_maxsz	(op_encode_hdr_maxsz + nfs4_name_maxsz)
+#define decode_secinfo_maxsz	(op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
+
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MACHINE_NAME_LEN (64)
+#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \
+			 sizeof(utsname()->version) + sizeof(utsname()->machine) + 8)
+
+#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
+				encode_verifier_maxsz + \
+				1 /* co_ownerid.len */ + \
+				XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
+				1 /* flags */ + \
+				1 /* spa_how */ + \
+				/* max is SP4_MACH_CRED (for now) */ + \
+				1 + NFS4_OP_MAP_NUM_WORDS + \
+				1 + NFS4_OP_MAP_NUM_WORDS + \
+				1 /* implementation id array of size 1 */ + \
+				1 /* nii_domain */ + \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+				1 /* nii_name */ + \
+				XDR_QUADLEN(IMPL_NAME_LIMIT) + \
+				3 /* nii_date */)
+#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
+				2 /* eir_clientid */ + \
+				1 /* eir_sequenceid */ + \
+				1 /* eir_flags */ + \
+				1 /* spr_how */ + \
+				  /* max is SP4_MACH_CRED (for now) */ + \
+				1 + NFS4_OP_MAP_NUM_WORDS + \
+				1 + NFS4_OP_MAP_NUM_WORDS + \
+				2 /* eir_server_owner.so_minor_id */ + \
+				/* eir_server_owner.so_major_id<> */ \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+				/* eir_server_scope<> */ \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+				1 /* eir_server_impl_id array length */ + \
+				1 /* nii_domain */ + \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+				1 /* nii_name */ + \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+				3 /* nii_date */)
+#define encode_channel_attrs_maxsz  (6 + 1 /* ca_rdma_ird.len (0) */)
+#define decode_channel_attrs_maxsz  (6 + \
+				     1 /* ca_rdma_ird.len */ + \
+				     1 /* ca_rdma_ird */)
+#define encode_create_session_maxsz  (op_encode_hdr_maxsz + \
+				     2 /* csa_clientid */ + \
+				     1 /* csa_sequence */ + \
+				     1 /* csa_flags */ + \
+				     encode_channel_attrs_maxsz + \
+				     encode_channel_attrs_maxsz + \
+				     1 /* csa_cb_program */ + \
+				     1 /* csa_sec_parms.len (1) */ + \
+				     1 /* cb_secflavor (AUTH_SYS) */ + \
+				     1 /* stamp */ + \
+				     1 /* machinename.len */ + \
+				     XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \
+				     1 /* uid */ + \
+				     1 /* gid */ + \
+				     1 /* gids.len (0) */)
+#define decode_create_session_maxsz  (op_decode_hdr_maxsz +	\
+				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				     1 /* csr_sequence */ + \
+				     1 /* csr_flags */ + \
+				     decode_channel_attrs_maxsz + \
+				     decode_channel_attrs_maxsz)
+#define encode_bind_conn_to_session_maxsz  (op_encode_hdr_maxsz + \
+				     /* bctsa_sessid */ \
+				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				     1 /* bctsa_dir */ + \
+				     1 /* bctsa_use_conn_in_rdma_mode */)
+#define decode_bind_conn_to_session_maxsz  (op_decode_hdr_maxsz +	\
+				     /* bctsr_sessid */ \
+				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				     1 /* bctsr_dir */ + \
+				     1 /* bctsr_use_conn_in_rdma_mode */)
+#define encode_destroy_session_maxsz    (op_encode_hdr_maxsz + 4)
+#define decode_destroy_session_maxsz    (op_decode_hdr_maxsz)
+#define encode_destroy_clientid_maxsz   (op_encode_hdr_maxsz + 2)
+#define decode_destroy_clientid_maxsz   (op_decode_hdr_maxsz)
+#define encode_sequence_maxsz	(op_encode_hdr_maxsz + \
+				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
+#define decode_sequence_maxsz	(op_decode_hdr_maxsz + \
+				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
+#define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
+#define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+				XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+				1 /* layout type */ + \
+				1 /* maxcount */ + \
+				1 /* bitmap size */ + \
+				1 /* notification bitmap length */ + \
+				1 /* notification bitmap, word 0 */)
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+				1 /* layout type */ + \
+				1 /* opaque devaddr4 length */ + \
+				  /* devaddr4 payload is read into page */ \
+				1 /* notification bitmap length */ + \
+				1 /* notification bitmap, word 0 */)
+#define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
+				encode_stateid_maxsz)
+#define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
+				decode_stateid_maxsz + \
+				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
+#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz +          \
+				2 /* offset */ + \
+				2 /* length */ + \
+				1 /* reclaim */ + \
+				encode_stateid_maxsz + \
+				1 /* new offset (true) */ + \
+				2 /* last byte written */ + \
+				1 /* nt_timechanged (false) */ + \
+				1 /* layoutupdate4 layout type */ + \
+				1 /* layoutupdate4 opaqueue len */)
+				  /* the actual content of layoutupdate4 should
+				     be allocated by drivers and spliced in
+				     using xdr_write_pages */
+#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+				encode_stateid_maxsz + \
+				1 /* FIXME: opaque lrf_body always empty at the moment */)
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+				1 + decode_stateid_maxsz)
+#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
+#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz
+#define encode_test_stateid_maxsz	(op_encode_hdr_maxsz + 2 + \
+					 XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_test_stateid_maxsz	(op_decode_hdr_maxsz + 2 + 1)
+#define encode_free_stateid_maxsz	(op_encode_hdr_maxsz + 1 + \
+					 XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_free_stateid_maxsz	(op_decode_hdr_maxsz)
+#else /* CONFIG_NFS_V4_1 */
+#define encode_sequence_maxsz	0
+#define decode_sequence_maxsz	0
+#endif /* CONFIG_NFS_V4_1 */
+
+#define NFS4_enc_compound_sz	(1024)  /* XXX: large enough? */
+#define NFS4_dec_compound_sz	(1024)  /* XXX: large enough? */
+#define NFS4_enc_read_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_read_maxsz)
+#define NFS4_dec_read_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_read_maxsz)
+#define NFS4_enc_readlink_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_readlink_maxsz)
+#define NFS4_dec_readlink_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_readlink_maxsz)
+#define NFS4_enc_readdir_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_readdir_maxsz)
+#define NFS4_dec_readdir_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_readdir_maxsz)
+#define NFS4_enc_write_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_write_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_write_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_write_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_commit_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_commit_maxsz)
+#define NFS4_dec_commit_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_commit_maxsz)
+#define NFS4_enc_open_sz        (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_open_maxsz + \
+				encode_access_maxsz + \
+				encode_getfh_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_open_sz        (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_open_maxsz + \
+				decode_access_maxsz + \
+				decode_getfh_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_open_confirm_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_open_confirm_maxsz)
+#define NFS4_dec_open_confirm_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_open_confirm_maxsz)
+#define NFS4_enc_open_noattr_sz	(compound_encode_hdr_maxsz + \
+					encode_sequence_maxsz + \
+					encode_putfh_maxsz + \
+					encode_open_maxsz + \
+					encode_access_maxsz + \
+					encode_getattr_maxsz)
+#define NFS4_dec_open_noattr_sz	(compound_decode_hdr_maxsz + \
+					decode_sequence_maxsz + \
+					decode_putfh_maxsz + \
+					decode_open_maxsz + \
+					decode_access_maxsz + \
+					decode_getattr_maxsz)
+#define NFS4_enc_open_downgrade_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_sequence_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_open_downgrade_maxsz + \
+				 encode_getattr_maxsz)
+#define NFS4_dec_open_downgrade_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_open_downgrade_maxsz + \
+				 decode_getattr_maxsz)
+#define NFS4_enc_close_sz	(compound_encode_hdr_maxsz + \
+				 encode_sequence_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_close_maxsz + \
+				 encode_getattr_maxsz)
+#define NFS4_dec_close_sz	(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_close_maxsz + \
+				 decode_getattr_maxsz)
+#define NFS4_enc_setattr_sz	(compound_encode_hdr_maxsz + \
+				 encode_sequence_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_setattr_maxsz + \
+				 encode_getattr_maxsz)
+#define NFS4_dec_setattr_sz	(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_setattr_maxsz + \
+				 decode_getattr_maxsz)
+#define NFS4_enc_fsinfo_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_fsinfo_maxsz)
+#define NFS4_dec_fsinfo_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_fsinfo_maxsz)
+#define NFS4_enc_renew_sz	(compound_encode_hdr_maxsz + \
+				encode_renew_maxsz)
+#define NFS4_dec_renew_sz	(compound_decode_hdr_maxsz + \
+				decode_renew_maxsz)
+#define NFS4_enc_setclientid_sz	(compound_encode_hdr_maxsz + \
+				encode_setclientid_maxsz)
+#define NFS4_dec_setclientid_sz	(compound_decode_hdr_maxsz + \
+				decode_setclientid_maxsz)
+#define NFS4_enc_setclientid_confirm_sz \
+				(compound_encode_hdr_maxsz + \
+				encode_setclientid_confirm_maxsz)
+#define NFS4_dec_setclientid_confirm_sz \
+				(compound_decode_hdr_maxsz + \
+				decode_setclientid_confirm_maxsz)
+#define NFS4_enc_lock_sz        (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_lock_maxsz)
+#define NFS4_dec_lock_sz        (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_lock_maxsz)
+#define NFS4_enc_lockt_sz       (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_lockt_maxsz)
+#define NFS4_dec_lockt_sz       (compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_lockt_maxsz)
+#define NFS4_enc_locku_sz       (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_locku_maxsz)
+#define NFS4_dec_locku_sz       (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_locku_maxsz)
+#define NFS4_enc_release_lockowner_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_lockowner_maxsz)
+#define NFS4_dec_release_lockowner_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_lockowner_maxsz)
+#define NFS4_enc_access_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_access_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_access_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_access_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_getattr_maxsz + \
+				encode_renew_maxsz)
+#define NFS4_dec_getattr_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_getattr_maxsz + \
+				decode_renew_maxsz)
+#define NFS4_enc_lookup_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_lookup_maxsz + \
+				encode_getattr_maxsz + \
+				encode_getfh_maxsz)
+#define NFS4_dec_lookup_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_lookup_maxsz + \
+				decode_getattr_maxsz + \
+				decode_getfh_maxsz)
+#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putrootfh_maxsz + \
+				encode_getattr_maxsz + \
+				encode_getfh_maxsz)
+#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putrootfh_maxsz + \
+				decode_getattr_maxsz + \
+				decode_getfh_maxsz)
+#define NFS4_enc_remove_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_remove_maxsz)
+#define NFS4_dec_remove_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_remove_maxsz)
+#define NFS4_enc_rename_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_savefh_maxsz + \
+				encode_putfh_maxsz + \
+				encode_rename_maxsz)
+#define NFS4_dec_rename_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_savefh_maxsz + \
+				decode_putfh_maxsz + \
+				decode_rename_maxsz)
+#define NFS4_enc_link_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_savefh_maxsz + \
+				encode_putfh_maxsz + \
+				encode_link_maxsz + \
+				encode_restorefh_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_link_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_savefh_maxsz + \
+				decode_putfh_maxsz + \
+				decode_link_maxsz + \
+				decode_restorefh_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_symlink_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_symlink_maxsz + \
+				encode_getattr_maxsz + \
+				encode_getfh_maxsz)
+#define NFS4_dec_symlink_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_symlink_maxsz + \
+				decode_getattr_maxsz + \
+				decode_getfh_maxsz)
+#define NFS4_enc_create_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_create_maxsz + \
+				encode_getfh_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_create_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_create_maxsz + \
+				decode_getfh_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_pathconf_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_pathconf_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_statfs_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_statfs_maxsz)
+#define NFS4_dec_statfs_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_statfs_maxsz)
+#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_delegreturn_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_delegreturn_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_delegreturn_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_getacl_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_getacl_maxsz)
+#define NFS4_dec_getacl_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_getacl_maxsz)
+#define NFS4_enc_setacl_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_setacl_maxsz)
+#define NFS4_dec_setacl_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_setacl_maxsz)
+#define NFS4_enc_fs_locations_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_sequence_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_lookup_maxsz + \
+				 encode_fs_locations_maxsz + \
+				 encode_renew_maxsz)
+#define NFS4_dec_fs_locations_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_lookup_maxsz + \
+				 decode_fs_locations_maxsz + \
+				 decode_renew_maxsz)
+#define NFS4_enc_secinfo_sz 	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_secinfo_maxsz)
+#define NFS4_dec_secinfo_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_secinfo_maxsz)
+#define NFS4_enc_fsid_present_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_sequence_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_getfh_maxsz + \
+				 encode_renew_maxsz)
+#define NFS4_dec_fsid_present_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_getfh_maxsz + \
+				 decode_renew_maxsz)
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_enc_bind_conn_to_session_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_bind_conn_to_session_maxsz)
+#define NFS4_dec_bind_conn_to_session_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_bind_conn_to_session_maxsz)
+#define NFS4_enc_exchange_id_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_exchange_id_maxsz)
+#define NFS4_dec_exchange_id_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_exchange_id_maxsz)
+#define NFS4_enc_create_session_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_create_session_maxsz)
+#define NFS4_dec_create_session_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_create_session_maxsz)
+#define NFS4_enc_destroy_session_sz	(compound_encode_hdr_maxsz + \
+					 encode_destroy_session_maxsz)
+#define NFS4_dec_destroy_session_sz	(compound_decode_hdr_maxsz + \
+					 decode_destroy_session_maxsz)
+#define NFS4_enc_destroy_clientid_sz	(compound_encode_hdr_maxsz + \
+					 encode_destroy_clientid_maxsz)
+#define NFS4_dec_destroy_clientid_sz	(compound_decode_hdr_maxsz + \
+					 decode_destroy_clientid_maxsz)
+#define NFS4_enc_sequence_sz \
+				(compound_decode_hdr_maxsz + \
+				 encode_sequence_maxsz)
+#define NFS4_dec_sequence_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz)
+#define NFS4_enc_get_lease_time_sz	(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putrootfh_maxsz + \
+					 encode_fsinfo_maxsz)
+#define NFS4_dec_get_lease_time_sz	(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putrootfh_maxsz + \
+					 decode_fsinfo_maxsz)
+#define NFS4_enc_reclaim_complete_sz	(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_reclaim_complete_maxsz)
+#define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
+				encode_sequence_maxsz +\
+				encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz +    \
+				decode_sequence_maxsz + \
+				decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz +        \
+				encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz +        \
+				decode_layoutget_maxsz)
+#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz +\
+				encode_putfh_maxsz + \
+				encode_layoutcommit_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_layoutcommit_maxsz + \
+				decode_getattr_maxsz)
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_layoutreturn_maxsz)
+#define NFS4_enc_secinfo_no_name_sz	(compound_encode_hdr_maxsz + \
+					encode_sequence_maxsz + \
+					encode_putrootfh_maxsz +\
+					encode_secinfo_no_name_maxsz)
+#define NFS4_dec_secinfo_no_name_sz	(compound_decode_hdr_maxsz + \
+					decode_sequence_maxsz + \
+					decode_putrootfh_maxsz + \
+					decode_secinfo_no_name_maxsz)
+#define NFS4_enc_test_stateid_sz	(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_test_stateid_maxsz)
+#define NFS4_dec_test_stateid_sz	(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_test_stateid_maxsz)
+#define NFS4_enc_free_stateid_sz	(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_free_stateid_maxsz)
+#define NFS4_dec_free_stateid_sz	(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_free_stateid_maxsz)
+
+const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+				      compound_encode_hdr_maxsz +
+				      encode_sequence_maxsz +
+				      encode_putfh_maxsz +
+				      encode_getattr_maxsz) *
+				     XDR_UNIT);
+
+const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+				     compound_decode_hdr_maxsz +
+				     decode_sequence_maxsz +
+				     decode_putfh_maxsz) *
+				    XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+					   compound_decode_hdr_maxsz +
+					   decode_sequence_maxsz) *
+					  XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
+#endif /* CONFIG_NFS_V4_1 */
+
+static const umode_t nfs_type2fmt[] = {
+	[NF4BAD] = 0,
+	[NF4REG] = S_IFREG,
+	[NF4DIR] = S_IFDIR,
+	[NF4BLK] = S_IFBLK,
+	[NF4CHR] = S_IFCHR,
+	[NF4LNK] = S_IFLNK,
+	[NF4SOCK] = S_IFSOCK,
+	[NF4FIFO] = S_IFIFO,
+	[NF4ATTRDIR] = 0,
+	[NF4NAMEDATTR] = 0,
+};
+
+struct compound_hdr {
+	int32_t		status;
+	uint32_t	nops;
+	__be32 *	nops_p;
+	uint32_t	taglen;
+	char *		tag;
+	uint32_t	replen;		/* expected reply words */
+	u32		minorversion;
+};
+
+static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *p = xdr_reserve_space(xdr, nbytes);
+	BUG_ON(!p);
+	return p;
+}
+
+static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, len);
+	xdr_encode_opaque_fixed(p, buf, len);
+}
+
+static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4 + len);
+	xdr_encode_opaque(p, str, len);
+}
+
+static void encode_uint32(struct xdr_stream *xdr, u32 n)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(n);
+}
+
+static void encode_uint64(struct xdr_stream *xdr, u64 n)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 8);
+	xdr_encode_hyper(p, n);
+}
+
+static void encode_nfs4_seqid(struct xdr_stream *xdr,
+		const struct nfs_seqid *seqid)
+{
+	if (seqid != NULL)
+		encode_uint32(xdr, seqid->sequence->counter);
+	else
+		encode_uint32(xdr, 0);
+}
+
+static void encode_compound_hdr(struct xdr_stream *xdr,
+				struct rpc_rqst *req,
+				struct compound_hdr *hdr)
+{
+	__be32 *p;
+	struct rpc_auth *auth = req->rq_cred->cr_auth;
+
+	/* initialize running count of expected bytes in reply.
+	 * NOTE: the replied tag SHOULD be the same is the one sent,
+	 * but this is not required as a MUST for the server to do so. */
+	hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
+
+	WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
+	encode_string(xdr, hdr->taglen, hdr->tag);
+	p = reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(hdr->minorversion);
+	hdr->nops_p = p;
+	*p = cpu_to_be32(hdr->nops);
+}
+
+static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
+		uint32_t replen,
+		struct compound_hdr *hdr)
+{
+	encode_uint32(xdr, op);
+	hdr->nops++;
+	hdr->replen += replen;
+}
+
+static void encode_nops(struct compound_hdr *hdr)
+{
+	WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS);
+	*hdr->nops_p = htonl(hdr->nops);
+}
+
+static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+{
+	encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
+}
+
+static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
+{
+	encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
+}
+
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+				const struct nfs4_label *label,
+				const struct nfs_server *server)
+{
+	char owner_name[IDMAP_NAMESZ];
+	char owner_group[IDMAP_NAMESZ];
+	int owner_namelen = 0;
+	int owner_grouplen = 0;
+	__be32 *p;
+	unsigned i;
+	uint32_t len = 0;
+	uint32_t bmval_len;
+	uint32_t bmval[3] = { 0 };
+
+	/*
+	 * We reserve enough space to write the entire attribute buffer at once.
+	 * In the worst-case, this would be
+	 * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+	 * = 40 bytes, plus any contribution from variable-length fields
+	 *            such as owner/group.
+	 */
+	if (iap->ia_valid & ATTR_SIZE) {
+		bmval[0] |= FATTR4_WORD0_SIZE;
+		len += 8;
+	}
+	if (iap->ia_valid & ATTR_MODE) {
+		bmval[1] |= FATTR4_WORD1_MODE;
+		len += 4;
+	}
+	if (iap->ia_valid & ATTR_UID) {
+		owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
+		if (owner_namelen < 0) {
+			dprintk("nfs: couldn't resolve uid %d to string\n",
+					from_kuid(&init_user_ns, iap->ia_uid));
+			/* XXX */
+			strcpy(owner_name, "nobody");
+			owner_namelen = sizeof("nobody") - 1;
+			/* goto out; */
+		}
+		bmval[1] |= FATTR4_WORD1_OWNER;
+		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
+	}
+	if (iap->ia_valid & ATTR_GID) {
+		owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
+		if (owner_grouplen < 0) {
+			dprintk("nfs: couldn't resolve gid %d to string\n",
+					from_kgid(&init_user_ns, iap->ia_gid));
+			strcpy(owner_group, "nobody");
+			owner_grouplen = sizeof("nobody") - 1;
+			/* goto out; */
+		}
+		bmval[1] |= FATTR4_WORD1_OWNER_GROUP;
+		len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
+	}
+	if (iap->ia_valid & ATTR_ATIME_SET) {
+		bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+		len += 16;
+	} else if (iap->ia_valid & ATTR_ATIME) {
+		bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+		len += 4;
+	}
+	if (iap->ia_valid & ATTR_MTIME_SET) {
+		bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+		len += 16;
+	} else if (iap->ia_valid & ATTR_MTIME) {
+		bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+		len += 4;
+	}
+	if (label) {
+		len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
+		bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
+	}
+
+	if (bmval[2] != 0)
+		bmval_len = 3;
+	else if (bmval[1] != 0)
+		bmval_len = 2;
+	else
+		bmval_len = 1;
+
+	p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len);
+
+	*p++ = cpu_to_be32(bmval_len);
+	for (i = 0; i < bmval_len; i++)
+		*p++ = cpu_to_be32(bmval[i]);
+	*p++ = cpu_to_be32(len);
+
+	if (bmval[0] & FATTR4_WORD0_SIZE)
+		p = xdr_encode_hyper(p, iap->ia_size);
+	if (bmval[1] & FATTR4_WORD1_MODE)
+		*p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
+	if (bmval[1] & FATTR4_WORD1_OWNER)
+		p = xdr_encode_opaque(p, owner_name, owner_namelen);
+	if (bmval[1] & FATTR4_WORD1_OWNER_GROUP)
+		p = xdr_encode_opaque(p, owner_group, owner_grouplen);
+	if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+		if (iap->ia_valid & ATTR_ATIME_SET) {
+			*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+			p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
+			*p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
+		} else
+			*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
+	}
+	if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+		if (iap->ia_valid & ATTR_MTIME_SET) {
+			*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+			p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
+			*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
+		} else
+			*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
+	}
+	if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+		*p++ = cpu_to_be32(label->lfs);
+		*p++ = cpu_to_be32(label->pi);
+		*p++ = cpu_to_be32(label->len);
+		p = xdr_encode_opaque_fixed(p, label->label, label->len);
+	}
+
+/* out: */
+}
+
+static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr);
+	encode_uint32(xdr, access);
+}
+
+static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
+	encode_nfs4_seqid(xdr, arg->seqid);
+	encode_nfs4_stateid(xdr, &arg->stateid);
+}
+
+static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
+	p = reserve_space(xdr, 12);
+	p = xdr_encode_hyper(p, args->offset);
+	*p = cpu_to_be32(args->count);
+}
+
+static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr);
+	encode_uint32(xdr, create->ftype);
+
+	switch (create->ftype) {
+	case NF4LNK:
+		p = reserve_space(xdr, 4);
+		*p = cpu_to_be32(create->u.symlink.len);
+		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+		break;
+
+	case NF4BLK: case NF4CHR:
+		p = reserve_space(xdr, 8);
+		*p++ = cpu_to_be32(create->u.device.specdata1);
+		*p = cpu_to_be32(create->u.device.specdata2);
+		break;
+
+	default:
+		break;
+	}
+
+	encode_string(xdr, create->name->len, create->name->name);
+	encode_attrs(xdr, create->attrs, create->label, create->server);
+}
+
+static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
+	p = reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(1);
+	*p = cpu_to_be32(bitmap);
+}
+
+static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
+	p = reserve_space(xdr, 12);
+	*p++ = cpu_to_be32(2);
+	*p++ = cpu_to_be32(bm0);
+	*p = cpu_to_be32(bm1);
+}
+
+static void
+encode_getattr_three(struct xdr_stream *xdr,
+		     uint32_t bm0, uint32_t bm1, uint32_t bm2,
+		     struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
+	if (bm2) {
+		p = reserve_space(xdr, 16);
+		*p++ = cpu_to_be32(3);
+		*p++ = cpu_to_be32(bm0);
+		*p++ = cpu_to_be32(bm1);
+		*p = cpu_to_be32(bm2);
+	} else if (bm1) {
+		p = reserve_space(xdr, 12);
+		*p++ = cpu_to_be32(2);
+		*p++ = cpu_to_be32(bm0);
+		*p = cpu_to_be32(bm1);
+	} else {
+		p = reserve_space(xdr, 8);
+		*p++ = cpu_to_be32(1);
+		*p = cpu_to_be32(bm0);
+	}
+}
+
+static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+	encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+			   bitmask[1] & nfs4_fattr_bitmap[1],
+			   bitmask[2] & nfs4_fattr_bitmap[2],
+			   hdr);
+}
+
+static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
+				 const u32 *open_bitmap,
+				 struct compound_hdr *hdr)
+{
+	encode_getattr_three(xdr,
+			     bitmask[0] & open_bitmap[0],
+			     bitmask[1] & open_bitmap[1],
+			     bitmask[2] & open_bitmap[2],
+			     hdr);
+}
+
+static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+	encode_getattr_three(xdr,
+			     bitmask[0] & nfs4_fsinfo_bitmap[0],
+			     bitmask[1] & nfs4_fsinfo_bitmap[1],
+			     bitmask[2] & nfs4_fsinfo_bitmap[2],
+			     hdr);
+}
+
+static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+	encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0],
+			   bitmask[1] & nfs4_fs_locations_bitmap[1], hdr);
+}
+
+static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr);
+}
+
+static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr);
+	encode_string(xdr, name->len, name->name);
+}
+
+static inline int nfs4_lock_type(struct file_lock *fl, int block)
+{
+	if (fl->fl_type == F_RDLCK)
+		return block ? NFS4_READW_LT : NFS4_READ_LT;
+	return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
+}
+
+static inline uint64_t nfs4_lock_length(struct file_lock *fl)
+{
+	if (fl->fl_end == OFFSET_MAX)
+		return ~(uint64_t)0;
+	return fl->fl_end - fl->fl_start + 1;
+}
+
+static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 32);
+	p = xdr_encode_hyper(p, lowner->clientid);
+	*p++ = cpu_to_be32(20);
+	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+	*p++ = cpu_to_be32(lowner->s_dev);
+	xdr_encode_hyper(p, lowner->id);
+}
+
+/*
+ * opcode,type,reclaim,offset,length,new_lock_owner = 32
+ * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
+ */
+static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr);
+	p = reserve_space(xdr, 28);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
+	*p++ = cpu_to_be32(args->reclaim);
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+	*p = cpu_to_be32(args->new_lock_owner);
+	if (args->new_lock_owner){
+		encode_nfs4_seqid(xdr, args->open_seqid);
+		encode_nfs4_stateid(xdr, &args->open_stateid);
+		encode_nfs4_seqid(xdr, args->lock_seqid);
+		encode_lockowner(xdr, &args->lock_owner);
+	}
+	else {
+		encode_nfs4_stateid(xdr, &args->lock_stateid);
+		encode_nfs4_seqid(xdr, args->lock_seqid);
+	}
+}
+
+static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr);
+	p = reserve_space(xdr, 20);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+	encode_lockowner(xdr, &args->lock_owner);
+}
+
+static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
+	encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
+	encode_nfs4_seqid(xdr, args->seqid);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	p = reserve_space(xdr, 16);
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+}
+
+static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr);
+	encode_lockowner(xdr, lowner);
+}
+
+static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr);
+	encode_string(xdr, name->len, name->name);
+}
+
+static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(share_access);
+	*p = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
+}
+
+static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+	__be32 *p;
+ /*
+ * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
+ * owner 4 = 32
+ */
+	encode_nfs4_seqid(xdr, arg->seqid);
+	encode_share_access(xdr, arg->share_access);
+	p = reserve_space(xdr, 36);
+	p = xdr_encode_hyper(p, arg->clientid);
+	*p++ = cpu_to_be32(24);
+	p = xdr_encode_opaque_fixed(p, "open id:", 8);
+	*p++ = cpu_to_be32(arg->server->s_dev);
+	*p++ = cpu_to_be32(arg->id.uniquifier);
+	xdr_encode_hyper(p, arg->id.create_time);
+}
+
+static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+	struct iattr dummy;
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	switch(arg->createmode) {
+	case NFS4_CREATE_UNCHECKED:
+		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		break;
+	case NFS4_CREATE_GUARDED:
+		*p = cpu_to_be32(NFS4_CREATE_GUARDED);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		break;
+	case NFS4_CREATE_EXCLUSIVE:
+		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+		encode_nfs4_verifier(xdr, &arg->u.verifier);
+		break;
+	case NFS4_CREATE_EXCLUSIVE4_1:
+		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
+		encode_nfs4_verifier(xdr, &arg->u.verifier);
+		dummy.ia_valid = 0;
+		encode_attrs(xdr, &dummy, arg->label, arg->server);
+	}
+}
+
+static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	switch (arg->open_flags & O_CREAT) {
+	case 0:
+		*p = cpu_to_be32(NFS4_OPEN_NOCREATE);
+		break;
+	default:
+		*p = cpu_to_be32(NFS4_OPEN_CREATE);
+		encode_createmode(xdr, arg);
+	}
+}
+
+static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	switch (delegation_type) {
+	case 0:
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
+		break;
+	case FMODE_READ:
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
+		break;
+	case FMODE_WRITE|FMODE_READ:
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
+	encode_string(xdr, name->len, name->name);
+}
+
+static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
+	encode_delegation_type(xdr, type);
+}
+
+static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+	encode_nfs4_stateid(xdr, stateid);
+	encode_string(xdr, name->len, name->name);
+}
+
+static inline void encode_claim_fh(struct xdr_stream *xdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_FH);
+}
+
+static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+	encode_nfs4_stateid(xdr, stateid);
+}
+
+static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);
+	encode_openhdr(xdr, arg);
+	encode_opentype(xdr, arg);
+	switch (arg->claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+		encode_claim_null(xdr, arg->name);
+		break;
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+		encode_claim_previous(xdr, arg->u.delegation_type);
+		break;
+	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+		encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
+		break;
+	case NFS4_OPEN_CLAIM_FH:
+		encode_claim_fh(xdr);
+		break;
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+		encode_claim_delegate_cur_fh(xdr, &arg->u.delegation);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr);
+	encode_nfs4_stateid(xdr, arg->stateid);
+	encode_nfs4_seqid(xdr, arg->seqid);
+}
+
+static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &arg->stateid);
+	encode_nfs4_seqid(xdr, arg->seqid);
+	encode_share_access(xdr, arg->share_access);
+}
+
+static void
+encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr);
+	encode_string(xdr, fh->size, fh->data);
+}
+
+static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
+}
+
+static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+			struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->stateid);
+
+	p = reserve_space(xdr, 12);
+	p = xdr_encode_hyper(p, args->offset);
+	*p = cpu_to_be32(args->count);
+}
+
+static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
+{
+	uint32_t attrs[3] = {
+		FATTR4_WORD0_RDATTR_ERROR,
+		FATTR4_WORD1_MOUNTED_ON_FILEID,
+	};
+	uint32_t dircount = readdir->count >> 1;
+	__be32 *p, verf[2];
+	uint32_t attrlen = 0;
+	unsigned int i;
+
+	if (readdir->plus) {
+		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+			FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID;
+		attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+		dircount >>= 1;
+	}
+	/* Use mounted_on_fileid only if the server supports it */
+	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
+		attrs[0] |= FATTR4_WORD0_FILEID;
+	for (i = 0; i < ARRAY_SIZE(attrs); i++) {
+		attrs[i] &= readdir->bitmask[i];
+		if (attrs[i] != 0)
+			attrlen = i+1;
+	}
+
+	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
+	encode_uint64(xdr, readdir->cookie);
+	encode_nfs4_verifier(xdr, &readdir->verifier);
+	p = reserve_space(xdr, 12 + (attrlen << 2));
+	*p++ = cpu_to_be32(dircount);
+	*p++ = cpu_to_be32(readdir->count);
+	*p++ = cpu_to_be32(attrlen);
+	for (i = 0; i < attrlen; i++)
+		*p++ = cpu_to_be32(attrs[i]);
+	memcpy(verf, readdir->verifier.data, sizeof(verf));
+
+	dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
+			__func__,
+			(unsigned long long)readdir->cookie,
+			verf[0], verf[1],
+			attrs[0] & readdir->bitmask[0],
+			attrs[1] & readdir->bitmask[1],
+			attrs[2] & readdir->bitmask[2]);
+}
+
+static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr);
+}
+
+static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr);
+	encode_string(xdr, name->len, name->name);
+}
+
+static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr);
+	encode_string(xdr, oldname->len, oldname->name);
+	encode_string(xdr, newname->len, newname->name);
+}
+
+static void encode_renew(struct xdr_stream *xdr, clientid4 clid,
+			 struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr);
+	encode_uint64(xdr, clid);
+}
+
+static void
+encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);
+}
+
+static void
+encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &zero_stateid);
+	p = reserve_space(xdr, 2*4);
+	*p++ = cpu_to_be32(1);
+	*p = cpu_to_be32(FATTR4_WORD0_ACL);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(arg->acl_len);
+	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
+}
+
+static void
+encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr);
+}
+
+static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &arg->stateid);
+	encode_attrs(xdr, arg->iap, arg->label, server);
+}
+
+static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
+	encode_nfs4_verifier(xdr, setclientid->sc_verifier);
+
+	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(setclientid->sc_prog);
+	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
+	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(setclientid->sc_cb_ident);
+}
+
+static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM,
+			decode_setclientid_confirm_maxsz, hdr);
+	encode_uint64(xdr, arg->clientid);
+	encode_nfs4_verifier(xdr, &arg->confirm);
+}
+
+static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+			 struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->stateid);
+
+	p = reserve_space(xdr, 16);
+	p = xdr_encode_hyper(p, args->offset);
+	*p++ = cpu_to_be32(args->stable);
+	*p = cpu_to_be32(args->count);
+
+	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
+
+static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr);
+	encode_nfs4_stateid(xdr, stateid);
+}
+
+static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr);
+	encode_string(xdr, name->len, name->name);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/* NFSv4.1 operations */
+static void encode_bind_conn_to_session(struct xdr_stream *xdr,
+				   struct nfs41_bind_conn_to_session_args *args,
+				   struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
+		decode_bind_conn_to_session_maxsz, hdr);
+	encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(args->dir);
+	*p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0);
+}
+
+static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
+{
+	unsigned int i;
+	encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS);
+	for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++)
+		encode_uint32(xdr, op_map->u.words[i]);
+}
+
+static void encode_exchange_id(struct xdr_stream *xdr,
+			       struct nfs41_exchange_id_args *args,
+			       struct compound_hdr *hdr)
+{
+	__be32 *p;
+	char impl_name[IMPL_NAME_LIMIT];
+	int len = 0;
+
+	encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
+	encode_nfs4_verifier(xdr, args->verifier);
+
+	encode_string(xdr, args->id_len, args->id);
+
+	encode_uint32(xdr, args->flags);
+	encode_uint32(xdr, args->state_protect.how);
+
+	switch (args->state_protect.how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		encode_op_map(xdr, &args->state_protect.enforce);
+		encode_op_map(xdr, &args->state_protect.allow);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	if (send_implementation_id &&
+	    sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
+	    sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
+		<= sizeof(impl_name) + 1)
+		len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
+			       utsname()->sysname, utsname()->release,
+			       utsname()->version, utsname()->machine);
+
+	if (len > 0) {
+		encode_uint32(xdr, 1);	/* implementation id array length=1 */
+
+		encode_string(xdr,
+			sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1,
+			CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN);
+		encode_string(xdr, len, impl_name);
+		/* just send zeros for nii_date - the date is in nii_name */
+		p = reserve_space(xdr, 12);
+		p = xdr_encode_hyper(p, 0);
+		*p = cpu_to_be32(0);
+	} else
+		encode_uint32(xdr, 0);	/* implementation id array length=0 */
+}
+
+static void encode_create_session(struct xdr_stream *xdr,
+				  struct nfs41_create_session_args *args,
+				  struct compound_hdr *hdr)
+{
+	__be32 *p;
+	struct nfs_client *clp = args->client;
+	struct rpc_clnt *clnt = clp->cl_rpcclient;
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+	u32 max_resp_sz_cached;
+
+	/*
+	 * Assumes OPEN is the biggest non-idempotent compound.
+	 * 2 is the verifier.
+	 */
+	max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
+			      RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
+
+	encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
+	p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
+	p = xdr_encode_hyper(p, args->clientid);
+	*p++ = cpu_to_be32(args->seqid);			/*Sequence id */
+	*p++ = cpu_to_be32(args->flags);			/*flags */
+
+	/* Fore Channel */
+	*p++ = cpu_to_be32(0);				/* header padding size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz);	/* max resp size */
+	*p++ = cpu_to_be32(max_resp_sz_cached);		/* Max resp sz cached */
+	*p++ = cpu_to_be32(args->fc_attrs.max_ops);	/* max operations */
+	*p++ = cpu_to_be32(args->fc_attrs.max_reqs);	/* max requests */
+	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
+
+	/* Back Channel */
+	*p++ = cpu_to_be32(0);				/* header padding size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz);	/* max req size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz);	/* max resp size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	*p++ = cpu_to_be32(args->bc_attrs.max_ops);	/* max operations */
+	*p++ = cpu_to_be32(args->bc_attrs.max_reqs);	/* max requests */
+	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
+
+	*p++ = cpu_to_be32(args->cb_program);		/* cb_program */
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
+
+	/* authsys_parms rfc1831 */
+	*p++ = cpu_to_be32(nn->boot_time.tv_nsec);	/* stamp */
+	p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
+	*p++ = cpu_to_be32(0);				/* UID */
+	*p++ = cpu_to_be32(0);				/* GID */
+	*p = cpu_to_be32(0);				/* No more gids */
+}
+
+static void encode_destroy_session(struct xdr_stream *xdr,
+				   struct nfs4_session *session,
+				   struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr);
+	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+}
+
+static void encode_destroy_clientid(struct xdr_stream *xdr,
+				   uint64_t clientid,
+				   struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr);
+	encode_uint64(xdr, clientid);
+}
+
+static void encode_reclaim_complete(struct xdr_stream *xdr,
+				    struct nfs41_reclaim_complete_args *args,
+				    struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr);
+	encode_uint32(xdr, args->one_fs);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static void encode_sequence(struct xdr_stream *xdr,
+			    const struct nfs4_sequence_args *args,
+			    struct compound_hdr *hdr)
+{
+#if defined(CONFIG_NFS_V4_1)
+	struct nfs4_session *session;
+	struct nfs4_slot_table *tp;
+	struct nfs4_slot *slot = args->sa_slot;
+	__be32 *p;
+
+	tp = slot->table;
+	session = tp->session;
+	if (!session)
+		return;
+
+	encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
+
+	/*
+	 * Sessionid + seqid + slotid + max slotid + cache_this
+	 */
+	dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d "
+		"max_slotid=%d cache_this=%d\n",
+		__func__,
+		((u32 *)session->sess_id.data)[0],
+		((u32 *)session->sess_id.data)[1],
+		((u32 *)session->sess_id.data)[2],
+		((u32 *)session->sess_id.data)[3],
+		slot->seq_nr, slot->slot_nr,
+		tp->highest_used_slotid, args->sa_cache_this);
+	p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
+	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	*p++ = cpu_to_be32(slot->seq_nr);
+	*p++ = cpu_to_be32(slot->slot_nr);
+	*p++ = cpu_to_be32(tp->highest_used_slotid);
+	*p = cpu_to_be32(args->sa_cache_this);
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+#ifdef CONFIG_NFS_V4_1
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+		     const struct nfs4_getdeviceinfo_args *args,
+		     struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
+	p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
+	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+				    NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(args->pdev->layout_type);
+	*p++ = cpu_to_be32(args->pdev->maxcount);	/* gdia_maxcount */
+
+	p = reserve_space(xdr, 4 + 4);
+	*p++ = cpu_to_be32(1);			/* bitmap length */
+	*p++ = cpu_to_be32(args->notify_types);
+}
+
+static void
+encode_layoutget(struct xdr_stream *xdr,
+		      const struct nfs4_layoutget_args *args,
+		      struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr);
+	p = reserve_space(xdr, 36);
+	*p++ = cpu_to_be32(0);     /* Signal layout available */
+	*p++ = cpu_to_be32(args->type);
+	*p++ = cpu_to_be32(args->range.iomode);
+	p = xdr_encode_hyper(p, args->range.offset);
+	p = xdr_encode_hyper(p, args->range.length);
+	p = xdr_encode_hyper(p, args->minlength);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	encode_uint32(xdr, args->maxcount);
+
+	dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+		__func__,
+		args->type,
+		args->range.iomode,
+		(unsigned long)args->range.offset,
+		(unsigned long)args->range.length,
+		args->maxcount);
+}
+
+static int
+encode_layoutcommit(struct xdr_stream *xdr,
+		    struct inode *inode,
+		    struct nfs4_layoutcommit_args *args,
+		    struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
+		NFS_SERVER(args->inode)->pnfs_curr_ld->id);
+
+	encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr);
+	p = reserve_space(xdr, 20);
+	/* Only whole file layouts */
+	p = xdr_encode_hyper(p, 0); /* offset */
+	p = xdr_encode_hyper(p, args->lastbytewritten + 1);	/* length */
+	*p = cpu_to_be32(0); /* reclaim */
+	encode_nfs4_stateid(xdr, &args->stateid);
+	p = reserve_space(xdr, 20);
+	*p++ = cpu_to_be32(1); /* newoffset = TRUE */
+	p = xdr_encode_hyper(p, args->lastbytewritten);
+	*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
+	*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
+
+	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
+		NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
+			NFS_I(inode)->layout, xdr, args);
+	} else {
+		encode_uint32(xdr, args->layoutupdate_len);
+		if (args->layoutupdate_pages) {
+			xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+					args->layoutupdate_len);
+		}
+	}
+
+	return 0;
+}
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+		    const struct nfs4_layoutreturn_args *args,
+		    struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
+	p = reserve_space(xdr, 16);
+	*p++ = cpu_to_be32(0);		/* reclaim. always 0 for now */
+	*p++ = cpu_to_be32(args->layout_type);
+	*p++ = cpu_to_be32(args->range.iomode);
+	*p = cpu_to_be32(RETURN_FILE);
+	p = reserve_space(xdr, 16);
+	p = xdr_encode_hyper(p, args->range.offset);
+	p = xdr_encode_hyper(p, args->range.length);
+	spin_lock(&args->inode->i_lock);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	spin_unlock(&args->inode->i_lock);
+	if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
+		NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
+			NFS_I(args->inode)->layout, xdr, args);
+	} else
+		encode_uint32(xdr, 0);
+}
+
+static int
+encode_secinfo_no_name(struct xdr_stream *xdr,
+		       const struct nfs41_secinfo_no_name_args *args,
+		       struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr);
+	encode_uint32(xdr, args->style);
+	return 0;
+}
+
+static void encode_test_stateid(struct xdr_stream *xdr,
+				struct nfs41_test_stateid_args *args,
+				struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr);
+	encode_uint32(xdr, 1);
+	encode_nfs4_stateid(xdr, args->stateid);
+}
+
+static void encode_free_stateid(struct xdr_stream *xdr,
+				struct nfs41_free_stateid_args *args,
+				struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->stateid);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * END OF "GENERIC" ENCODE ROUTINES.
+ */
+
+static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
+{
+#if defined(CONFIG_NFS_V4_1)
+	struct nfs4_session *session = args->sa_slot->table->session;
+	if (session)
+		return session->clp->cl_mvops->minor_version;
+#endif /* CONFIG_NFS_V4_1 */
+	return 0;
+}
+
+/*
+ * Encode an ACCESS request
+ */
+static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_accessargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_access(xdr, args->access, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode LOOKUP request
+ */
+static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_lookup_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_lookup(xdr, args->name, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode LOOKUP_ROOT request
+ */
+static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_lookup_root_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode REMOVE request
+ */
+static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs_removeargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_remove(xdr, &args->name, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode RENAME request
+ */
+static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs_renameargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->old_dir, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->new_dir, &hdr);
+	encode_rename(xdr, args->old_name, args->new_name, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode LINK request
+ */
+static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     const struct nfs4_link_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_link(xdr, args->name, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode CREATE request
+ */
+static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_create_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_create(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode SYMLINK request
+ */
+static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_create_arg *args)
+{
+	nfs4_xdr_enc_create(req, xdr, args);
+}
+
+/*
+ * Encode GETATTR request
+ */
+static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_getattr_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a CLOSE request
+ */
+static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_closeargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_close(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode an OPEN request
+ */
+static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_openargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	if (args->access)
+		encode_access(xdr, args->access, &hdr);
+	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode an OPEN_CONFIRM request
+ */
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs_open_confirmargs *args)
+{
+	struct compound_hdr hdr = {
+		.nops   = 0,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_confirm(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode an OPEN request with no attributes.
+ */
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs_openargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open(xdr, args, &hdr);
+	if (args->access)
+		encode_access(xdr, args->access, &hdr);
+	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode an OPEN_DOWNGRADE request
+ */
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs_closeargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_downgrade(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a LOCK request
+ */
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_lock_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lock(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a LOCKT request
+ */
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_lockt_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lockt(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a LOCKU request
+ */
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_locku_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_locku(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+					   struct xdr_stream *xdr,
+					struct nfs_release_lockowner_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = 0,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_release_lockowner(xdr, &args->lock_owner, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a READLINK request
+ */
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_readlink *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readlink(xdr, args, req, &hdr);
+
+	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
+			args->pgbase, args->pglen);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a READDIR request
+ */
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_readdir_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readdir(xdr, args, req, &hdr);
+
+	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
+			 args->pgbase, args->count);
+	dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
+			__func__, hdr.replen << 2, args->pages,
+			args->pgbase, args->count);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a READ request
+ */
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_pgio_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_read(xdr, args, &hdr);
+
+	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
+			 args->pages, args->pgbase, args->count);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode an SETATTR request
+ */
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_setattrargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setattr(xdr, args, args->server, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a GETACL request
+ */
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_getaclargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+	uint32_t replen;
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	replen = hdr.replen + op_decode_hdr_maxsz + 1;
+	encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
+
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
+		args->acl_pages, args->acl_pgbase, args->acl_len);
+
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode a WRITE request
+ */
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_pgio_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_write(xdr, args, &hdr);
+	req->rq_snd_buf.flags |= XDRBUF_WRITE;
+	if (args->bitmask)
+		encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ *  a COMMIT request
+ */
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_commitargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_commit(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * FSINFO request
+ */
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs4_fsinfo_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_fsinfo(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a PATHCONF request
+ */
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_pathconf_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
+			   &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a STATFS request
+ */
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_statfs_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
+			   args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * GETATTR_BITMAP request
+ */
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_server_caps_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
+			   FATTR4_WORD0_FH_EXPIRE_TYPE|
+			   FATTR4_WORD0_LINK_SUPPORT|
+			   FATTR4_WORD0_SYMLINK_SUPPORT|
+			   FATTR4_WORD0_ACLSUPPORT, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a RENEW request
+ */
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_client *clp)
+{
+	struct compound_hdr hdr = {
+		.nops	= 0,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_renew(xdr, clp->cl_clientid, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a SETCLIENTID request
+ */
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_setclientid *sc)
+{
+	struct compound_hdr hdr = {
+		.nops	= 0,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid(xdr, sc, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a SETCLIENTID_CONFIRM request
+ */
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+					     struct xdr_stream *xdr,
+					     struct nfs4_setclientid_res *arg)
+{
+	struct compound_hdr hdr = {
+		.nops	= 0,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid_confirm(xdr, arg, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * DELEGRETURN request
+ */
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_delegreturnargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_delegreturn(xdr, args->stateid, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode FS_LOCATIONS request
+ */
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fs_locations_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+	uint32_t replen;
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	if (args->migration) {
+		encode_putfh(xdr, args->fh, &hdr);
+		replen = hdr.replen;
+		encode_fs_locations(xdr, args->bitmask, &hdr);
+		if (args->renew)
+			encode_renew(xdr, args->clientid, &hdr);
+	} else {
+		encode_putfh(xdr, args->dir_fh, &hdr);
+		encode_lookup(xdr, args->name, &hdr);
+		replen = hdr.replen;
+		encode_fs_locations(xdr, args->bitmask, &hdr);
+	}
+
+	/* Set up reply kvec to capture returned fs_locations array. */
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
+			0, PAGE_SIZE);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode SECINFO request
+ */
+static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct nfs4_secinfo_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_secinfo(xdr, args->name, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode FSID_PRESENT request
+ */
+static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fsid_present_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getfh(xdr, &hdr);
+	if (args->renew)
+		encode_renew(xdr, args->clientid, &hdr);
+	encode_nops(&hdr);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * BIND_CONN_TO_SESSION request
+ */
+static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct nfs41_bind_conn_to_session_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_bind_conn_to_session(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * EXCHANGE_ID request
+ */
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs41_exchange_id_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_exchange_id(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a CREATE_SESSION request
+ */
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs41_create_session_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_create_session(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a DESTROY_SESSION request
+ */
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
+					 struct xdr_stream *xdr,
+					 struct nfs4_session *session)
+{
+	struct compound_hdr hdr = {
+		.minorversion = session->clp->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_destroy_session(xdr, session, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a DESTROY_CLIENTID request
+ */
+static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req,
+					 struct xdr_stream *xdr,
+					 struct nfs_client *clp)
+{
+	struct compound_hdr hdr = {
+		.minorversion = clp->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_destroy_clientid(xdr, clp->cl_clientid, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a SEQUENCE request
+ */
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs4_sequence_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a GET_LEASE_TIME request
+ */
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs4_get_lease_time_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
+	};
+	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->la_seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * a RECLAIM_COMPLETE request
+ */
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
+					  struct xdr_stream *xdr,
+				struct nfs41_reclaim_complete_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args)
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_reclaim_complete(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode GETDEVICEINFO request
+ */
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       struct nfs4_getdeviceinfo_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_getdeviceinfo(xdr, args, &hdr);
+
+	/* set up reply kvec. Subtract notification bitmap max size (2)
+	 * so that notification bitmap is put in xdr_buf tail */
+	xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
+			 args->pdev->pages, args->pdev->pgbase,
+			 args->pdev->pglen);
+
+	encode_nops(&hdr);
+}
+
+/*
+ *  Encode LAYOUTGET request
+ */
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs4_layoutget_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutget(xdr, args, &hdr);
+
+	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
+	    args->layout.pages, 0, args->layout.pglen);
+
+	encode_nops(&hdr);
+}
+
+/*
+ *  Encode LAYOUTCOMMIT request
+ */
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_layoutcommit_args *args)
+{
+	struct nfs4_layoutcommit_data *data =
+		container_of(args, struct nfs4_layoutcommit_data, args);
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutcommit(xdr, data->args.inode, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTRETURN request
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_layoutreturn_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutreturn(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Encode SECINFO_NO_NAME request
+ */
+static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs41_secinfo_no_name_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_secinfo_no_name(xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ *  Encode TEST_STATEID request
+ */
+static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs41_test_stateid_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_test_stateid(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ *  Encode FREE_STATEID request
+ */
+static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs41_free_stateid_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_free_stateid(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	*len = be32_to_cpup(p);
+	p = xdr_inline_decode(xdr, *len);
+	if (unlikely(!p))
+		goto out_overflow;
+	*string = (char *)p;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	hdr->status = be32_to_cpup(p++);
+	hdr->taglen = be32_to_cpup(p);
+
+	p = xdr_inline_decode(xdr, hdr->taglen + 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	hdr->tag = (char *)p;
+	p += XDR_QUADLEN(hdr->taglen);
+	hdr->nops = be32_to_cpup(p);
+	if (unlikely(hdr->nops < 1))
+		return nfs4_stat_to_errno(hdr->status);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
+		int *nfs_retval)
+{
+	__be32 *p;
+	uint32_t opnum;
+	int32_t nfserr;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	opnum = be32_to_cpup(p++);
+	if (unlikely(opnum != expected))
+		goto out_bad_operation;
+	nfserr = be32_to_cpup(p);
+	if (nfserr == NFS_OK)
+		*nfs_retval = 0;
+	else
+		*nfs_retval = nfs4_stat_to_errno(nfserr);
+	return true;
+out_bad_operation:
+	dprintk("nfs: Server returned operation"
+		" %d but we issued a request for %d\n",
+			opnum, expected);
+	*nfs_retval = -EREMOTEIO;
+	return false;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	*nfs_retval = -EIO;
+	return false;
+}
+
+static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+{
+	int retval;
+
+	__decode_op_hdr(xdr, expected, &retval);
+	return retval;
+}
+
+/* Dummy routine */
+static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
+{
+	__be32 *p;
+	unsigned int strlen;
+	char *str;
+
+	p = xdr_inline_decode(xdr, 12);
+	if (likely(p))
+		return decode_opaque_inline(xdr, &strlen, &str);
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+	uint32_t bmlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	bmlen = be32_to_cpup(p);
+
+	bitmap[0] = bitmap[1] = bitmap[2] = 0;
+	p = xdr_inline_decode(xdr, (bmlen << 2));
+	if (unlikely(!p))
+		goto out_overflow;
+	if (bmlen > 0) {
+		bitmap[0] = be32_to_cpup(p++);
+		if (bmlen > 1) {
+			bitmap[1] = be32_to_cpup(p++);
+			if (bmlen > 2)
+				bitmap[2] = be32_to_cpup(p);
+		}
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	*attrlen = be32_to_cpup(p);
+	*savep = xdr_stream_pos(xdr);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
+{
+	if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
+		int ret;
+		ret = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(ret < 0))
+			return ret;
+		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
+	} else
+		bitmask[0] = bitmask[1] = bitmask[2] = 0;
+	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+		bitmask[0], bitmask[1], bitmask[2]);
+	return 0;
+}
+
+static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
+{
+	__be32 *p;
+	int ret = 0;
+
+	*type = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*type = be32_to_cpup(p);
+		if (*type < NF4REG || *type > NF4NAMEDATTR) {
+			dprintk("%s: bad type %d\n", __func__, *type);
+			return -EIO;
+		}
+		bitmap[0] &= ~FATTR4_WORD0_TYPE;
+		ret = NFS_ATTR_FATTR_TYPE;
+	}
+	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
+				      uint32_t *bitmap, uint32_t *type)
+{
+	__be32 *p;
+
+	*type = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*type = be32_to_cpup(p);
+		bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
+	}
+	dprintk("%s: expire type=0x%x\n", __func__, *type);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
+{
+	__be32 *p;
+	int ret = 0;
+
+	*change = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, change);
+		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
+		ret = NFS_ATTR_FATTR_CHANGE;
+	}
+	dprintk("%s: change attribute=%Lu\n", __func__,
+			(unsigned long long)*change);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
+{
+	__be32 *p;
+	int ret = 0;
+
+	*size = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, size);
+		bitmap[0] &= ~FATTR4_WORD0_SIZE;
+		ret = NFS_ATTR_FATTR_SIZE;
+	}
+	dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *p;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*res = be32_to_cpup(p);
+		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
+	}
+	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *p;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*res = be32_to_cpup(p);
+		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
+	}
+	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
+{
+	__be32 *p;
+	int ret = 0;
+
+	fsid->major = 0;
+	fsid->minor = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
+		p = xdr_inline_decode(xdr, 16);
+		if (unlikely(!p))
+			goto out_overflow;
+		p = xdr_decode_hyper(p, &fsid->major);
+		xdr_decode_hyper(p, &fsid->minor);
+		bitmap[0] &= ~FATTR4_WORD0_FSID;
+		ret = NFS_ATTR_FATTR_FSID;
+	}
+	dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
+			(unsigned long long)fsid->major,
+			(unsigned long long)fsid->minor);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *p;
+
+	*res = 60;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*res = be32_to_cpup(p);
+		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
+	}
+	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)
+{
+	__be32 *p;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+		*res = -be32_to_cpup(p);
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+	__be32 *p;
+	int len;
+
+	if (fh != NULL)
+		memset(fh, 0, sizeof(*fh));
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		if (len > NFS4_FHSIZE)
+			return -EIO;
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (fh != NULL) {
+			memcpy(fh->data, p, len);
+			fh->size = len;
+		}
+		bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *p;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*res = be32_to_cpup(p);
+		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
+	}
+	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+	__be32 *p;
+	int ret = 0;
+
+	*fileid = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, fileid);
+		bitmap[0] &= ~FATTR4_WORD0_FILEID;
+		ret = NFS_ATTR_FATTR_FILEID;
+	}
+	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+	__be32 *p;
+	int ret = 0;
+
+	*fileid = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, fileid);
+		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+		ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+	}
+	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
+	}
+	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
+	}
+	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
+	}
+	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
+{
+	u32 n;
+	__be32 *p;
+	int status = 0;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	n = be32_to_cpup(p);
+	if (n == 0)
+		goto root_path;
+	dprintk("pathname4: ");
+	if (n > NFS4_PATHNAME_MAXCOMPONENTS) {
+		dprintk("cannot parse %d components in path\n", n);
+		goto out_eio;
+	}
+	for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) {
+		struct nfs4_string *component = &path->components[path->ncomponents];
+		status = decode_opaque_inline(xdr, &component->len, &component->data);
+		if (unlikely(status != 0))
+			goto out_eio;
+		ifdebug (XDR)
+			pr_cont("%s%.*s ",
+				(path->ncomponents != n ? "/ " : ""),
+				component->len, component->data);
+	}
+out:
+	return status;
+root_path:
+/* a root pathname is sent as a zero component4 */
+	path->ncomponents = 1;
+	path->components[0].len=0;
+	path->components[0].data=NULL;
+	dprintk("pathname4: /\n");
+	goto out;
+out_eio:
+	dprintk(" status %d", status);
+	status = -EIO;
+	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
+{
+	int n;
+	__be32 *p;
+	int status = -EIO;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
+		goto out;
+	status = 0;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
+		goto out;
+	status = -EIO;
+	/* Ignore borken servers that return unrequested attrs */
+	if (unlikely(res == NULL))
+		goto out;
+	dprintk("%s: fsroot:\n", __func__);
+	status = decode_pathname(xdr, &res->fs_path);
+	if (unlikely(status != 0))
+		goto out;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	n = be32_to_cpup(p);
+	if (n <= 0)
+		goto out_eio;
+	for (res->nlocations = 0; res->nlocations < n; res->nlocations++) {
+		u32 m;
+		struct nfs4_fs_location *loc;
+
+		if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES)
+			break;
+		loc = &res->locations[res->nlocations];
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		m = be32_to_cpup(p);
+
+		dprintk("%s: servers:\n", __func__);
+		for (loc->nservers = 0; loc->nservers < m; loc->nservers++) {
+			struct nfs4_string *server;
+
+			if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) {
+				unsigned int i;
+				dprintk("%s: using first %u of %u servers "
+					"returned for location %u\n",
+						__func__,
+						NFS4_FS_LOCATION_MAXSERVERS,
+						m, res->nlocations);
+				for (i = loc->nservers; i < m; i++) {
+					unsigned int len;
+					char *data;
+					status = decode_opaque_inline(xdr, &len, &data);
+					if (unlikely(status != 0))
+						goto out_eio;
+				}
+				break;
+			}
+			server = &loc->servers[loc->nservers];
+			status = decode_opaque_inline(xdr, &server->len, &server->data);
+			if (unlikely(status != 0))
+				goto out_eio;
+			dprintk("%s ", server->data);
+		}
+		status = decode_pathname(xdr, &loc->rootpath);
+		if (unlikely(status != 0))
+			goto out_eio;
+	}
+	if (res->nlocations != 0)
+		status = NFS_ATTR_FATTR_V4_LOCATIONS;
+out:
+	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+out_eio:
+	status = -EIO;
+	goto out;
+}
+
+static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
+	}
+	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
+{
+	__be32 *p;
+	int status = 0;
+
+	*maxlink = 1;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*maxlink = be32_to_cpup(p);
+		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
+	}
+	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
+{
+	__be32 *p;
+	int status = 0;
+
+	*maxname = 1024;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*maxname = be32_to_cpup(p);
+		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
+	}
+	dprintk("%s: maxname=%u\n", __func__, *maxname);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 1024;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXREAD - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
+		uint64_t maxread;
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, &maxread);
+		if (maxread > 0x7FFFFFFF)
+			maxread = 0x7FFFFFFF;
+		*res = (uint32_t)maxread;
+		bitmap[0] &= ~FATTR4_WORD0_MAXREAD;
+	}
+	dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 1024;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXWRITE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
+		uint64_t maxwrite;
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, &maxwrite);
+		if (maxwrite > 0x7FFFFFFF)
+			maxwrite = 0x7FFFFFFF;
+		*res = (uint32_t)maxwrite;
+		bitmap[0] &= ~FATTR4_WORD0_MAXWRITE;
+	}
+	dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
+{
+	uint32_t tmp;
+	__be32 *p;
+	int ret = 0;
+
+	*mode = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		tmp = be32_to_cpup(p);
+		*mode = tmp & ~S_IFMT;
+		bitmap[1] &= ~FATTR4_WORD1_MODE;
+		ret = NFS_ATTR_FATTR_MODE;
+	}
+	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
+{
+	__be32 *p;
+	int ret = 0;
+
+	*nlink = 1;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*nlink = be32_to_cpup(p);
+		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
+		ret = NFS_ATTR_FATTR_NLINK;
+	}
+	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
+		const struct nfs_server *server, kuid_t *uid,
+		struct nfs4_string *owner_name)
+{
+	uint32_t len;
+	__be32 *p;
+	int ret = 0;
+
+	*uid = make_kuid(&init_user_ns, -2);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (owner_name != NULL) {
+			owner_name->data = kmemdup(p, len, GFP_NOWAIT);
+			if (owner_name->data != NULL) {
+				owner_name->len = len;
+				ret = NFS_ATTR_FATTR_OWNER_NAME;
+			}
+		} else if (len < XDR_MAX_NETOBJ) {
+			if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
+				ret = NFS_ATTR_FATTR_OWNER;
+			else
+				dprintk("%s: nfs_map_name_to_uid failed!\n",
+						__func__);
+		} else
+			dprintk("%s: name too long (%u)!\n",
+					__func__, len);
+		bitmap[1] &= ~FATTR4_WORD1_OWNER;
+	}
+	dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid));
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
+		const struct nfs_server *server, kgid_t *gid,
+		struct nfs4_string *group_name)
+{
+	uint32_t len;
+	__be32 *p;
+	int ret = 0;
+
+	*gid = make_kgid(&init_user_ns, -2);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (group_name != NULL) {
+			group_name->data = kmemdup(p, len, GFP_NOWAIT);
+			if (group_name->data != NULL) {
+				group_name->len = len;
+				ret = NFS_ATTR_FATTR_GROUP_NAME;
+			}
+		} else if (len < XDR_MAX_NETOBJ) {
+			if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
+				ret = NFS_ATTR_FATTR_GROUP;
+			else
+				dprintk("%s: nfs_map_group_to_gid failed!\n",
+						__func__);
+		} else
+			dprintk("%s: name too long (%u)!\n",
+					__func__, len);
+		bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
+	}
+	dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid));
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
+{
+	uint32_t major = 0, minor = 0;
+	__be32 *p;
+	int ret = 0;
+
+	*rdev = MKDEV(0,0);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
+		dev_t tmp;
+
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		major = be32_to_cpup(p++);
+		minor = be32_to_cpup(p);
+		tmp = MKDEV(major, minor);
+		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
+			*rdev = tmp;
+		bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
+		ret = NFS_ATTR_FATTR_RDEV;
+	}
+	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
+	}
+	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
+	}
+	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *p;
+	int status = 0;
+
+	*res = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
+	}
+	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
+{
+	__be32 *p;
+	int ret = 0;
+
+	*used = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, used);
+		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
+		ret = NFS_ATTR_FATTR_SPACE_USED;
+	}
+	dprintk("%s: space used=%Lu\n", __func__,
+			(unsigned long long)*used);
+	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
+{
+	__be32 *p;
+	uint64_t sec;
+	uint32_t nsec;
+
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &sec);
+	nsec = be32_to_cpup(p);
+	time->tv_sec = (time_t)sec;
+	time->tv_nsec = (long)nsec;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_ACCESS - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
+		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_ATIME;
+		bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+	}
+	dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
+	return status;
+}
+
+static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_METADATA - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
+		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_CTIME;
+		bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
+	}
+	dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
+	return status;
+}
+
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+				  struct timespec *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
+		status = decode_attr_time(xdr, time);
+		bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+	}
+	dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
+		(long)time->tv_nsec);
+	return status;
+}
+
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+					struct nfs4_label *label)
+{
+	uint32_t pi = 0;
+	uint32_t lfs = 0;
+	__u32 len;
+	__be32 *p;
+	int status = 0;
+
+	if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+		return -EIO;
+	if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		lfs = be32_to_cpup(p++);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		pi = be32_to_cpup(p++);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p++);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (len < NFS4_MAXLABELLEN) {
+			if (label) {
+				memcpy(label->label, p, len);
+				label->len = len;
+				label->pi = pi;
+				label->lfs = lfs;
+				status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+			}
+			bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+		} else
+			printk(KERN_WARNING "%s: label too long (%u)!\n",
+					__func__, len);
+	}
+	if (label && label->label)
+		dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
+			(char *)label->label, label->len, label->pi, label->lfs);
+	return status;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_MODIFY - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
+		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_MTIME;
+		bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
+	}
+	dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
+	return status;
+}
+
+static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen)
+{
+	unsigned int attrwords = XDR_QUADLEN(attrlen);
+	unsigned int nwords = (xdr_stream_pos(xdr) - savep) >> 2;
+
+	if (unlikely(attrwords != nwords)) {
+		dprintk("%s: server returned incorrect attribute length: "
+			"%u %c %u\n",
+				__func__,
+				attrwords << 2,
+				(attrwords < nwords) ? '<' : '>',
+				nwords << 2);
+		return -EIO;
+	}
+	return 0;
+}
+
+static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
+	cinfo->atomic = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &cinfo->before);
+	xdr_decode_hyper(p, &cinfo->after);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
+{
+	__be32 *p;
+	uint32_t supp, acc;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_ACCESS);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	supp = be32_to_cpup(p++);
+	acc = be32_to_cpup(p);
+	*supported = supp;
+	*access = acc;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, len);
+	if (likely(p)) {
+		memcpy(buf, p, len);
+		return 0;
+	}
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
+}
+
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_CLOSE);
+	if (status != -EIO)
+		nfs_increment_open_seqid(status, res->seqid);
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
+}
+
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+	return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);
+}
+
+static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifier *verifier)
+{
+	return decode_opaque_fixed(xdr, verifier->data, NFS4_VERIFIER_SIZE);
+}
+
+static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_COMMIT);
+	if (!status)
+		status = decode_write_verifier(xdr, &res->verf->verifier);
+	return status;
+}
+
+static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+	__be32 *p;
+	uint32_t bmlen;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_CREATE);
+	if (status)
+		return status;
+	if ((status = decode_change_info(xdr, cinfo)))
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	bmlen = be32_to_cpup(p);
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
+{
+	unsigned int savep;
+	uint32_t attrlen, bitmap[3] = {0};
+	int status;
+
+	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_fh_expire_type(xdr, bitmap,
+						 &res->fh_expire_type)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
+		goto xdr_error;
+	status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+	dprintk("%s: xdr returned %d!\n", __func__, -status);
+	return status;
+}
+
+static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
+{
+	unsigned int savep;
+	uint32_t attrlen, bitmap[3] = {0};
+	int status;
+
+	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+		goto xdr_error;
+
+	if ((status = decode_attr_files_avail(xdr, bitmap, &fsstat->afiles)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_files_free(xdr, bitmap, &fsstat->ffiles)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_space_total(xdr, bitmap, &fsstat->tbytes)) != 0)
+		goto xdr_error;
+
+	status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+	dprintk("%s: xdr returned %d!\n", __func__, -status);
+	return status;
+}
+
+static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
+{
+	unsigned int savep;
+	uint32_t attrlen, bitmap[3] = {0};
+	int status;
+
+	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+		goto xdr_error;
+
+	if ((status = decode_attr_maxlink(xdr, bitmap, &pathconf->max_link)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_maxname(xdr, bitmap, &pathconf->max_namelen)) != 0)
+		goto xdr_error;
+
+	status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+	dprintk("%s: xdr returned %d!\n", __func__, -status);
+	return status;
+}
+
+static int decode_threshold_hint(struct xdr_stream *xdr,
+				  uint32_t *bitmap,
+				  uint64_t *res,
+				  uint32_t hint_bit)
+{
+	__be32 *p;
+
+	*res = 0;
+	if (likely(bitmap[0] & hint_bit)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_first_threshold_item4(struct xdr_stream *xdr,
+					struct nfs4_threshold *res)
+{
+	__be32 *p;
+	unsigned int savep;
+	uint32_t bitmap[3] = {0,}, attrlen;
+	int status;
+
+	/* layout type */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+	res->l_type = be32_to_cpup(p);
+
+	/* thi_hintset bitmap */
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	/* thi_hintlist length */
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
+		goto xdr_error;
+	/* thi_hintlist */
+	status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz,
+				       THRESHOLD_RD_IO);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz,
+				       THRESHOLD_WR_IO);
+	if (status < 0)
+		goto xdr_error;
+
+	status = verify_attr_len(xdr, savep, attrlen);
+	res->bm = bitmap[0];
+
+	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+		 __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz,
+		res->wr_io_sz);
+xdr_error:
+	dprintk("%s ret=%d!\n", __func__, status);
+	return status;
+}
+
+/*
+ * Thresholds on pNFS direct I/O vrs MDS I/O
+ */
+static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
+				    uint32_t *bitmap,
+				    struct nfs4_threshold *res)
+{
+	__be32 *p;
+	int status = 0;
+	uint32_t num;
+
+	if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U)))
+		return -EIO;
+	if (bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+		/* Did the server return an unrequested attribute? */
+		if (unlikely(res == NULL))
+			return -EREMOTEIO;
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		num = be32_to_cpup(p);
+		if (num == 0)
+			return 0;
+		if (num > 1)
+			printk(KERN_INFO "%s: Warning: Multiple pNFS layout "
+				"drivers per filesystem not supported\n",
+				__func__);
+
+		status = decode_first_threshold_item4(xdr, res);
+		bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD;
+	}
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_fattr *fattr, struct nfs_fh *fh,
+		struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
+		const struct nfs_server *server)
+{
+	int status;
+	umode_t fmode = 0;
+	uint32_t type;
+	int32_t err;
+
+	status = decode_attr_type(xdr, bitmap, &type);
+	if (status < 0)
+		goto xdr_error;
+	fattr->mode = 0;
+	if (status != 0) {
+		fattr->mode |= nfs_type2fmt[type];
+		fattr->valid |= status;
+	}
+
+	status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_size(xdr, bitmap, &fattr->size);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	err = 0;
+	status = decode_attr_error(xdr, bitmap, &err);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_filehandle(xdr, bitmap, fh);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_fs_locations(xdr, bitmap, fs_loc);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_mode(xdr, bitmap, &fmode);
+	if (status < 0)
+		goto xdr_error;
+	if (status != 0) {
+		fattr->mode |= fmode;
+		fattr->valid |= status;
+	}
+
+	status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
+	status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
+	if (status < 0)
+		goto xdr_error;
+
+	if (label) {
+		status = decode_attr_security_label(xdr, bitmap, label);
+		if (status < 0)
+			goto xdr_error;
+		fattr->valid |= status;
+	}
+
+xdr_error:
+	dprintk("%s: xdr returned %d\n", __func__, -status);
+	return status;
+}
+
+static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
+		struct nfs4_label *label, const struct nfs_server *server)
+{
+	unsigned int savep;
+	uint32_t attrlen,
+		 bitmap[3] = {0};
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETATTR);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+					label, server);
+	if (status < 0)
+		goto xdr_error;
+
+	status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+	dprintk("%s: xdr returned %d\n", __func__, -status);
+	return status;
+}
+
+static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct nfs4_label *label, const struct nfs_server *server)
+{
+	return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
+}
+
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		const struct nfs_server *server)
+{
+	return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
+}
+
+/*
+ * Decode potentially multiple layout types. Currently we only support
+ * one layout driver per file system.
+ */
+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
+					 uint32_t *layouttype)
+{
+	__be32 *p;
+	int num;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	num = be32_to_cpup(p);
+
+	/* pNFS is not supported by the underlying file system */
+	if (num == 0) {
+		*layouttype = 0;
+		return 0;
+	}
+	if (num > 1)
+		printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout "
+			"drivers per filesystem not supported\n", __func__);
+
+	/* Decode and set first layout type, move xdr->p past unused types */
+	p = xdr_inline_decode(xdr, num * 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	*layouttype = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * The type of file system exported.
+ * Note we must ensure that layouttype is set in any non-error case.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+				uint32_t *layouttype)
+{
+	int status = 0;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+		return -EIO;
+	if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+		status = decode_first_pnfs_layout_type(xdr, layouttype);
+		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+	} else
+		*layouttype = 0;
+	return status;
+}
+
+/*
+ * The prefered block size for layout directed io
+ */
+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+				      uint32_t *res)
+{
+	__be32 *p;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+	*res = 0;
+	if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p)) {
+			print_overflow_msg(__func__, xdr);
+			return -EIO;
+		}
+		*res = be32_to_cpup(p);
+		bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
+	}
+	return 0;
+}
+
+static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
+{
+	unsigned int savep;
+	uint32_t attrlen, bitmap[3];
+	int status;
+
+	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+		goto xdr_error;
+
+	fsinfo->rtmult = fsinfo->wtmult = 512;	/* ??? */
+
+	if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0)
+		goto xdr_error;
+	fsinfo->rtpref = fsinfo->dtpref = fsinfo->rtmax;
+	if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
+		goto xdr_error;
+	fsinfo->wtpref = fsinfo->wtmax;
+	status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+	if (status != 0)
+		goto xdr_error;
+	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+	if (status != 0)
+		goto xdr_error;
+	status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
+	if (status)
+		goto xdr_error;
+
+	status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+	dprintk("%s: xdr returned %d!\n", __func__, -status);
+	return status;
+}
+
+static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+	uint32_t len;
+	int status;
+
+	/* Zero handle first to allow comparisons */
+	memset(fh, 0, sizeof(*fh));
+
+	status = decode_op_hdr(xdr, OP_GETFH);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
+	if (len > NFS4_FHSIZE)
+		return -EIO;
+	fh->size = len;
+	p = xdr_inline_decode(xdr, len);
+	if (unlikely(!p))
+		goto out_overflow;
+	memcpy(fh->data, p, len);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_LINK);
+	if (status)
+		return status;
+	return decode_change_info(xdr, cinfo);
+}
+
+/*
+ * We create the owner, so we know a proper owner.id length is 4.
+ */
+static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
+{
+	uint64_t offset, length, clientid;
+	__be32 *p;
+	uint32_t namelen, type;
+
+	p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
+	p = xdr_decode_hyper(p, &length);
+	type = be32_to_cpup(p++); /* 4 byte read */
+	if (fl != NULL) { /* manipulate file lock */
+		fl->fl_start = (loff_t)offset;
+		fl->fl_end = fl->fl_start + (loff_t)length - 1;
+		if (length == ~(uint64_t)0)
+			fl->fl_end = OFFSET_MAX;
+		fl->fl_type = F_WRLCK;
+		if (type & 1)
+			fl->fl_type = F_RDLCK;
+		fl->fl_pid = 0;
+	}
+	p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
+	namelen = be32_to_cpup(p); /* read 4 bytes */  /* have read all 32 bytes now */
+	p = xdr_inline_decode(xdr, namelen); /* variable size field */
+	if (likely(p))
+		return -NFS4ERR_DENIED;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_LOCK);
+	if (status == -EIO)
+		goto out;
+	if (status == 0) {
+		status = decode_stateid(xdr, &res->stateid);
+		if (unlikely(status))
+			goto out;
+	} else if (status == -NFS4ERR_DENIED)
+		status = decode_lock_denied(xdr, NULL);
+	if (res->open_seqid != NULL)
+		nfs_increment_open_seqid(status, res->open_seqid);
+	nfs_increment_lock_seqid(status, res->lock_seqid);
+out:
+	return status;
+}
+
+static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
+{
+	int status;
+	status = decode_op_hdr(xdr, OP_LOCKT);
+	if (status == -NFS4ERR_DENIED)
+		return decode_lock_denied(xdr, res->denied);
+	return status;
+}
+
+static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_LOCKU);
+	if (status != -EIO)
+		nfs_increment_lock_seqid(status, res->seqid);
+	if (status == 0)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
+}
+
+static int decode_release_lockowner(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
+}
+
+static int decode_lookup(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_LOOKUP);
+}
+
+/* This is too sick! */
+static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
+{
+	__be32 *p;
+	uint32_t limit_type, nblocks, blocksize;
+
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
+	limit_type = be32_to_cpup(p++);
+	switch (limit_type) {
+	case 1:
+		xdr_decode_hyper(p, maxsize);
+		break;
+	case 2:
+		nblocks = be32_to_cpup(p++);
+		blocksize = be32_to_cpup(p);
+		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_rw_delegation(struct xdr_stream *xdr,
+		uint32_t delegation_type,
+		struct nfs_openres *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_stateid(xdr, &res->delegation);
+	if (unlikely(status))
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->do_recall = be32_to_cpup(p);
+
+	switch (delegation_type) {
+	case NFS4_OPEN_DELEGATE_READ:
+		res->delegation_type = FMODE_READ;
+		break;
+	case NFS4_OPEN_DELEGATE_WRITE:
+		res->delegation_type = FMODE_WRITE|FMODE_READ;
+		if (decode_space_limit(xdr, &res->maxsize) < 0)
+				return -EIO;
+	}
+	return decode_ace(xdr, NULL, res->server->nfs_client);
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+	__be32 *p;
+	uint32_t why_no_delegation;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	why_no_delegation = be32_to_cpup(p);
+	switch (why_no_delegation) {
+		case WND4_CONTENTION:
+		case WND4_RESOURCE:
+			xdr_inline_decode(xdr, 4);
+			/* Ignore for now */
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+	__be32 *p;
+	uint32_t delegation_type;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	delegation_type = be32_to_cpup(p);
+	res->delegation_type = 0;
+	switch (delegation_type) {
+	case NFS4_OPEN_DELEGATE_NONE:
+		return 0;
+	case NFS4_OPEN_DELEGATE_READ:
+	case NFS4_OPEN_DELEGATE_WRITE:
+		return decode_rw_delegation(xdr, delegation_type, res);
+	case NFS4_OPEN_DELEGATE_NONE_EXT:
+		return decode_no_delegation(xdr, res);
+	}
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+	__be32 *p;
+	uint32_t savewords, bmlen, i;
+	int status;
+
+	if (!__decode_op_hdr(xdr, OP_OPEN, &status))
+		return status;
+	nfs_increment_open_seqid(status, res->seqid);
+	if (status)
+		return status;
+	status = decode_stateid(xdr, &res->stateid);
+	if (unlikely(status))
+		return status;
+
+	decode_change_info(xdr, &res->cinfo);
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->rflags = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
+	if (bmlen > 10)
+		goto xdr_error;
+
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (unlikely(!p))
+		goto out_overflow;
+	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
+	for (i = 0; i < savewords; ++i)
+		res->attrset[i] = be32_to_cpup(p++);
+	for (; i < NFS4_BITMAP_SIZE; i++)
+		res->attrset[i] = 0;
+
+	return decode_delegation(xdr, res);
+xdr_error:
+	dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
+	if (status != -EIO)
+		nfs_increment_open_seqid(status, res->seqid);
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
+}
+
+static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
+	if (status != -EIO)
+		nfs_increment_open_seqid(status, res->seqid);
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
+}
+
+static int decode_putfh(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_PUTFH);
+}
+
+static int decode_putrootfh(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_PUTROOTFH);
+}
+
+static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
+		       struct nfs_pgio_res *res)
+{
+	__be32 *p;
+	uint32_t count, eof, recvd;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_READ);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	eof = be32_to_cpup(p++);
+	count = be32_to_cpup(p);
+	recvd = xdr_read_pages(xdr, count);
+	if (count > recvd) {
+		dprintk("NFS: server cheating in read reply: "
+				"count %u > recvd %u\n", count, recvd);
+		count = recvd;
+		eof = 0;
+	}
+	res->eof = eof;
+	res->count = count;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
+{
+	int		status;
+	__be32		verf[2];
+
+	status = decode_op_hdr(xdr, OP_READDIR);
+	if (!status)
+		status = decode_verifier(xdr, readdir->verifier.data);
+	if (unlikely(status))
+		return status;
+	memcpy(verf, readdir->verifier.data, sizeof(verf));
+	dprintk("%s: verifier = %08x:%08x\n",
+			__func__, verf[0], verf[1]);
+	return xdr_read_pages(xdr, xdr->buf->page_len);
+}
+
+static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
+{
+	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+	u32 len, recvd;
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_READLINK);
+	if (status)
+		return status;
+
+	/* Convert length of symlink */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
+	if (len >= rcvbuf->page_len || len <= 0) {
+		dprintk("nfs: server returned giant symlink!\n");
+		return -ENAMETOOLONG;
+	}
+	recvd = xdr_read_pages(xdr, len);
+	if (recvd < len) {
+		dprintk("NFS: server cheating in readlink reply: "
+				"count %u > recvd %u\n", len, recvd);
+		return -EIO;
+	}
+	/*
+	 * The XDR encode routine has set things up so that
+	 * the link text will be copied directly into the
+	 * buffer.  We just have to do overflow-checking,
+	 * and and null-terminate the text (the VFS expects
+	 * null-termination).
+	 */
+	xdr_terminate_string(rcvbuf, len);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_REMOVE);
+	if (status)
+		goto out;
+	status = decode_change_info(xdr, cinfo);
+out:
+	return status;
+}
+
+static int decode_rename(struct xdr_stream *xdr, struct nfs4_change_info *old_cinfo,
+	      struct nfs4_change_info *new_cinfo)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_RENAME);
+	if (status)
+		goto out;
+	if ((status = decode_change_info(xdr, old_cinfo)))
+		goto out;
+	status = decode_change_info(xdr, new_cinfo);
+out:
+	return status;
+}
+
+static int decode_renew(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_RENEW);
+}
+
+static int
+decode_restorefh(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_RESTOREFH);
+}
+
+static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
+			 struct nfs_getaclres *res)
+{
+	unsigned int savep;
+	uint32_t attrlen,
+		 bitmap[3] = {0};
+	int status;
+	unsigned int pg_offset;
+
+	res->acl_len = 0;
+	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+		goto out;
+
+	xdr_enter_page(xdr, xdr->buf->page_len);
+
+	/* Calculate the offset of the page data */
+	pg_offset = xdr->buf->head[0].iov_len;
+
+	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+		goto out;
+	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+		goto out;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
+
+		/* The bitmap (xdr len + bitmaps) and the attr xdr len words
+		 * are stored with the acl data to handle the problem of
+		 * variable length bitmaps.*/
+		res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset;
+		res->acl_len = attrlen;
+
+		/* Check for receive buffer overflow */
+		if (res->acl_len > (xdr->nwords << 2) ||
+		    res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
+			res->acl_flags |= NFS4_ACL_TRUNC;
+			dprintk("NFS: acl reply: attrlen %u > page_len %u\n",
+					attrlen, xdr->nwords << 2);
+		}
+	} else
+		status = -EOPNOTSUPP;
+
+out:
+	return status;
+}
+
+static int
+decode_savefh(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_SAVEFH);
+}
+
+static int decode_setattr(struct xdr_stream *xdr)
+{
+	__be32 *p;
+	uint32_t bmlen;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_SETATTR);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	bmlen = be32_to_cpup(p);
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
+{
+	__be32 *p;
+	uint32_t opnum;
+	int32_t nfserr;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	opnum = be32_to_cpup(p++);
+	if (opnum != OP_SETCLIENTID) {
+		dprintk("nfs: decode_setclientid: Server returned operation"
+			" %d\n", opnum);
+		return -EIO;
+	}
+	nfserr = be32_to_cpup(p);
+	if (nfserr == NFS_OK) {
+		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+		if (unlikely(!p))
+			goto out_overflow;
+		p = xdr_decode_hyper(p, &res->clientid);
+		memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
+	} else if (nfserr == NFSERR_CLID_INUSE) {
+		uint32_t len;
+
+		/* skip netid string */
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+
+		/* skip uaddr string */
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		return -NFSERR_CLID_INUSE;
+	} else
+		return nfs4_stat_to_errno(nfserr);
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_setclientid_confirm(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM);
+}
+
+static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_WRITE);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->count = be32_to_cpup(p++);
+	res->verf->committed = be32_to_cpup(p++);
+	return decode_write_verifier(xdr, &res->verf->verifier);
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_delegreturn(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_DELEGRETURN);
+}
+
+static int decode_secinfo_gss(struct xdr_stream *xdr,
+			      struct nfs4_secinfo4 *flavor)
+{
+	u32 oid_len;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	oid_len = be32_to_cpup(p);
+	if (oid_len > GSS_OID_MAX_LEN)
+		goto out_err;
+
+	p = xdr_inline_decode(xdr, oid_len);
+	if (unlikely(!p))
+		goto out_overflow;
+	memcpy(flavor->flavor_info.oid.data, p, oid_len);
+	flavor->flavor_info.oid.len = oid_len;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	flavor->flavor_info.qop = be32_to_cpup(p++);
+	flavor->flavor_info.service = be32_to_cpup(p);
+
+	return 0;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+out_err:
+	return -EINVAL;
+}
+
+static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+{
+	struct nfs4_secinfo4 *sec_flavor;
+	unsigned int i, num_flavors;
+	int status;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	res->flavors->num_flavors = 0;
+	num_flavors = be32_to_cpup(p);
+
+	for (i = 0; i < num_flavors; i++) {
+		sec_flavor = &res->flavors->flavors[i];
+		if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE)
+			break;
+
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		sec_flavor->flavor = be32_to_cpup(p);
+
+		if (sec_flavor->flavor == RPC_AUTH_GSS) {
+			status = decode_secinfo_gss(xdr, sec_flavor);
+			if (status)
+				goto out;
+		}
+		res->flavors->num_flavors++;
+	}
+
+	status = 0;
+out:
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+{
+	int status = decode_op_hdr(xdr, OP_SECINFO);
+	if (status)
+		return status;
+	return decode_secinfo_common(xdr, res);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+{
+	int status = decode_op_hdr(xdr, OP_SECINFO_NO_NAME);
+	if (status)
+		return status;
+	return decode_secinfo_common(xdr, res);
+}
+
+static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
+{
+	__be32 *p;
+	uint32_t bitmap_words;
+	unsigned int i;
+
+	p = xdr_inline_decode(xdr, 4);
+	bitmap_words = be32_to_cpup(p++);
+	if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
+		return -EIO;
+	p = xdr_inline_decode(xdr, 4 * bitmap_words);
+	for (i = 0; i < bitmap_words; i++)
+		op_map->u.words[i] = be32_to_cpup(p++);
+
+	return 0;
+}
+
+static int decode_exchange_id(struct xdr_stream *xdr,
+			      struct nfs41_exchange_id_res *res)
+{
+	__be32 *p;
+	uint32_t dummy;
+	char *dummy_str;
+	int status;
+	uint32_t impl_id_count;
+
+	status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	xdr_decode_hyper(p, &res->clientid);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->seqid = be32_to_cpup(p++);
+	res->flags = be32_to_cpup(p++);
+
+	res->state_protect.how = be32_to_cpup(p);
+	switch (res->state_protect.how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		status = decode_op_map(xdr, &res->state_protect.enforce);
+		if (status)
+			return status;
+		status = decode_op_map(xdr, &res->state_protect.allow);
+		if (status)
+			return status;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+
+	/* server_owner4.so_minor_id */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->server_owner->minor_id);
+
+	/* server_owner4.so_major_id */
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
+	if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+		return -EIO;
+	memcpy(res->server_owner->major_id, dummy_str, dummy);
+	res->server_owner->major_id_sz = dummy;
+
+	/* server_scope4 */
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
+	if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+		return -EIO;
+	memcpy(res->server_scope->server_scope, dummy_str, dummy);
+	res->server_scope->server_scope_sz = dummy;
+
+	/* Implementation Id */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	impl_id_count = be32_to_cpup(p++);
+
+	if (impl_id_count) {
+		/* nii_domain */
+		status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+		if (unlikely(status))
+			return status;
+		if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+			return -EIO;
+		memcpy(res->impl_id->domain, dummy_str, dummy);
+
+		/* nii_name */
+		status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+		if (unlikely(status))
+			return status;
+		if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+			return -EIO;
+		memcpy(res->impl_id->name, dummy_str, dummy);
+
+		/* nii_date */
+		p = xdr_inline_decode(xdr, 12);
+		if (unlikely(!p))
+			goto out_overflow;
+		p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
+		res->impl_id->date.nseconds = be32_to_cpup(p);
+
+		/* if there's more than one entry, ignore the rest */
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_chan_attrs(struct xdr_stream *xdr,
+			     struct nfs4_channel_attrs *attrs)
+{
+	__be32 *p;
+	u32 nr_attrs, val;
+
+	p = xdr_inline_decode(xdr, 28);
+	if (unlikely(!p))
+		goto out_overflow;
+	val = be32_to_cpup(p++);	/* headerpadsz */
+	if (val)
+		return -EINVAL;		/* no support for header padding yet */
+	attrs->max_rqst_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz_cached = be32_to_cpup(p++);
+	attrs->max_ops = be32_to_cpup(p++);
+	attrs->max_reqs = be32_to_cpup(p++);
+	nr_attrs = be32_to_cpup(p);
+	if (unlikely(nr_attrs > 1)) {
+		printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs "
+			"count %u\n", __func__, nr_attrs);
+		return -EINVAL;
+	}
+	if (nr_attrs == 1) {
+		p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+		if (unlikely(!p))
+			goto out_overflow;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+{
+	return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
+}
+
+static int decode_bind_conn_to_session(struct xdr_stream *xdr,
+				struct nfs41_bind_conn_to_session_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
+	if (!status)
+		status = decode_sessionid(xdr, &res->sessionid);
+	if (unlikely(status))
+		return status;
+
+	/* dir flags, rdma mode bool */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	res->dir = be32_to_cpup(p++);
+	if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
+		return -EIO;
+	if (be32_to_cpup(p) == 0)
+		res->use_conn_in_rdma_mode = false;
+	else
+		res->use_conn_in_rdma_mode = true;
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_create_session(struct xdr_stream *xdr,
+				 struct nfs41_create_session_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
+	if (!status)
+		status = decode_sessionid(xdr, &res->sessionid);
+	if (unlikely(status))
+		return status;
+
+	/* seqid, flags */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->seqid = be32_to_cpup(p++);
+	res->flags = be32_to_cpup(p);
+
+	/* Channel attributes */
+	status = decode_chan_attrs(xdr, &res->fc_attrs);
+	if (!status)
+		status = decode_chan_attrs(xdr, &res->bc_attrs);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
+{
+	return decode_op_hdr(xdr, OP_DESTROY_SESSION);
+}
+
+static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy)
+{
+	return decode_op_hdr(xdr, OP_DESTROY_CLIENTID);
+}
+
+static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)
+{
+	return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static int decode_sequence(struct xdr_stream *xdr,
+			   struct nfs4_sequence_res *res,
+			   struct rpc_rqst *rqstp)
+{
+#if defined(CONFIG_NFS_V4_1)
+	struct nfs4_session *session;
+	struct nfs4_sessionid id;
+	u32 dummy;
+	int status;
+	__be32 *p;
+
+	if (res->sr_slot == NULL)
+		return 0;
+	if (!res->sr_slot->table->session)
+		return 0;
+
+	status = decode_op_hdr(xdr, OP_SEQUENCE);
+	if (!status)
+		status = decode_sessionid(xdr, &id);
+	if (unlikely(status))
+		goto out_err;
+
+	/*
+	 * If the server returns different values for sessionID, slotID or
+	 * sequence number, the server is looney tunes.
+	 */
+	status = -EREMOTEIO;
+	session = res->sr_slot->table->session;
+
+	if (memcmp(id.data, session->sess_id.data,
+		   NFS4_MAX_SESSIONID_LEN)) {
+		dprintk("%s Invalid session id\n", __func__);
+		goto out_err;
+	}
+
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	/* seqid */
+	dummy = be32_to_cpup(p++);
+	if (dummy != res->sr_slot->seq_nr) {
+		dprintk("%s Invalid sequence number\n", __func__);
+		goto out_err;
+	}
+	/* slot id */
+	dummy = be32_to_cpup(p++);
+	if (dummy != res->sr_slot->slot_nr) {
+		dprintk("%s Invalid slot id\n", __func__);
+		goto out_err;
+	}
+	/* highest slot id */
+	res->sr_highest_slotid = be32_to_cpup(p++);
+	/* target highest slot id */
+	res->sr_target_highest_slotid = be32_to_cpup(p++);
+	/* result flags */
+	res->sr_status_flags = be32_to_cpup(p);
+	status = 0;
+out_err:
+	res->sr_status = status;
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	status = -EIO;
+	goto out_err;
+#else  /* CONFIG_NFS_V4_1 */
+	return 0;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+				struct nfs4_getdeviceinfo_res *res)
+{
+	struct pnfs_device *pdev = res->pdev;
+	__be32 *p;
+	uint32_t len, type;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+	if (status) {
+		if (status == -ETOOSMALL) {
+			p = xdr_inline_decode(xdr, 4);
+			if (unlikely(!p))
+				goto out_overflow;
+			pdev->mincount = be32_to_cpup(p);
+			dprintk("%s: Min count too small. mincnt = %u\n",
+				__func__, pdev->mincount);
+		}
+		return status;
+	}
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	type = be32_to_cpup(p++);
+	if (type != pdev->layout_type) {
+		dprintk("%s: layout mismatch req: %u pdev: %u\n",
+			__func__, pdev->layout_type, type);
+		return -EINVAL;
+	}
+	/*
+	 * Get the length of the opaque device_addr4. xdr_read_pages places
+	 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+	 * and places the remaining xdr data in xdr_buf->tail
+	 */
+	pdev->mincount = be32_to_cpup(p);
+	if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
+		goto out_overflow;
+
+	/* Parse notification bitmap, verifying that it is zero. */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
+	if (len) {
+		uint32_t i;
+
+		p = xdr_inline_decode(xdr, 4 * len);
+		if (unlikely(!p))
+			goto out_overflow;
+
+		res->notification = be32_to_cpup(p++);
+		for (i = 1; i < len; i++) {
+			if (be32_to_cpup(p++)) {
+				dprintk("%s: unsupported notification\n",
+					__func__);
+				return -EIO;
+			}
+		}
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+			    struct nfs4_layoutget_res *res)
+{
+	__be32 *p;
+	int status;
+	u32 layout_count;
+	u32 recvd;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTGET);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->return_on_close = be32_to_cpup(p);
+	decode_stateid(xdr, &res->stateid);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	layout_count = be32_to_cpup(p);
+	if (!layout_count) {
+		dprintk("%s: server responded with empty layout array\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	p = xdr_inline_decode(xdr, 28);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->range.offset);
+	p = xdr_decode_hyper(p, &res->range.length);
+	res->range.iomode = be32_to_cpup(p++);
+	res->type = be32_to_cpup(p++);
+	res->layoutp->len = be32_to_cpup(p);
+
+	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+		__func__,
+		(unsigned long)res->range.offset,
+		(unsigned long)res->range.length,
+		res->range.iomode,
+		res->type,
+		res->layoutp->len);
+
+	recvd = xdr_read_pages(xdr, res->layoutp->len);
+	if (res->layoutp->len > recvd) {
+		dprintk("NFS: server cheating in layoutget reply: "
+				"layout len %u > recvd %u\n",
+				res->layoutp->len, recvd);
+		return -EINVAL;
+	}
+
+	if (layout_count > 1) {
+		/* We only handle a length one array at the moment.  Any
+		 * further entries are just ignored.  Note that this means
+		 * the client may see a response that is less than the
+		 * minimum it requested.
+		 */
+		dprintk("%s: server responded with %d layouts, dropping tail\n",
+			__func__, layout_count);
+	}
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutreturn(struct xdr_stream *xdr,
+			       struct nfs4_layoutreturn_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->lrs_present = be32_to_cpup(p);
+	if (res->lrs_present)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutcommit(struct xdr_stream *xdr,
+			       struct rpc_rqst *req,
+			       struct nfs4_layoutcommit_res *res)
+{
+	__be32 *p;
+	__u32 sizechanged;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+	res->status = status;
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	sizechanged = be32_to_cpup(p);
+
+	if (sizechanged) {
+		/* throw away new size */
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_test_stateid(struct xdr_stream *xdr,
+			       struct nfs41_test_stateid_res *res)
+{
+	__be32 *p;
+	int status;
+	int num_res;
+
+	status = decode_op_hdr(xdr, OP_TEST_STATEID);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	num_res = be32_to_cpup(p++);
+	if (num_res != 1)
+		goto out;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->status = be32_to_cpup(p++);
+
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+out:
+	return -EIO;
+}
+
+static int decode_free_stateid(struct xdr_stream *xdr,
+			       struct nfs41_free_stateid_res *res)
+{
+	res->status = decode_op_hdr(xdr, OP_FREE_STATEID);
+	return res->status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * END OF "GENERIC" DECODE ROUTINES.
+ */
+
+/*
+ * Decode OPEN_DOWNGRADE response
+ */
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs_closeres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_open_downgrade(xdr, res);
+	if (status != 0)
+		goto out;
+	decode_getfattr(xdr, res->fattr, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode ACCESS response
+ */
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_accessres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status != 0)
+		goto out;
+	status = decode_access(xdr, &res->supported, &res->access);
+	if (status != 0)
+		goto out;
+	decode_getfattr(xdr, res->fattr, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode LOOKUP response
+ */
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_lookup_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_lookup(xdr);
+	if (status)
+		goto out;
+	status = decode_getfh(xdr, res->fh);
+	if (status)
+		goto out;
+	status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode LOOKUP_ROOT response
+ */
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_lookup_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putrootfh(xdr);
+	if (status)
+		goto out;
+	status = decode_getfh(xdr, res->fh);
+	if (status == 0)
+		status = decode_getfattr_label(xdr, res->fattr,
+						res->label, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode REMOVE response
+ */
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_removeres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_remove(xdr, &res->cinfo);
+out:
+	return status;
+}
+
+/*
+ * Decode RENAME response
+ */
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_renameres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_savefh(xdr);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+out:
+	return status;
+}
+
+/*
+ * Decode LINK response
+ */
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs4_link_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_savefh(xdr);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_link(xdr, &res->cinfo);
+	if (status)
+		goto out;
+	/*
+	 * Note order: OP_LINK leaves the directory as the current
+	 *             filehandle.
+	 */
+	status = decode_restorefh(xdr);
+	if (status)
+		goto out;
+	decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode CREATE response
+ */
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_create_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_create(xdr, &res->dir_cinfo);
+	if (status)
+		goto out;
+	status = decode_getfh(xdr, res->fh);
+	if (status)
+		goto out;
+	decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode SYMLINK response
+ */
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_create_res *res)
+{
+	return nfs4_xdr_dec_create(rqstp, xdr, res);
+}
+
+/*
+ * Decode GETATTR response
+ */
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_getattr_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+out:
+	return status;
+}
+
+/*
+ * Encode an SETACL request
+ */
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_setaclargs *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setacl(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
+ * Decode SETACL response
+ */
+static int
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		    struct nfs_setaclres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_setattr(xdr);
+out:
+	return status;
+}
+
+/*
+ * Decode GETACL response
+ */
+static int
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		    struct nfs_getaclres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	if (res->acl_scratch != NULL) {
+		void *p = page_address(res->acl_scratch);
+		xdr_set_scratch_buffer(xdr, p, PAGE_SIZE);
+	}
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_getacl(xdr, rqstp, res);
+
+out:
+	return status;
+}
+
+/*
+ * Decode CLOSE response
+ */
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_closeres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_close(xdr, res);
+	if (status != 0)
+		goto out;
+	/*
+	 * Note: Server may do delete on close for this file
+	 * 	in which case the getattr call will fail with
+	 * 	an ESTALE error. Shouldn't be a problem,
+	 * 	though, since fattr->valid will remain unset.
+	 */
+	decode_getfattr(xdr, res->fattr, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode OPEN response
+ */
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_openres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_open(xdr, res);
+	if (status)
+		goto out;
+	status = decode_getfh(xdr, &res->fh);
+	if (status)
+		goto out;
+	if (res->access_request)
+		decode_access(xdr, &res->access_supported, &res->access_result);
+	decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode OPEN_CONFIRM response
+ */
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs_open_confirmres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_open_confirm(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode OPEN response
+ */
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs_openres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_open(xdr, res);
+	if (status)
+		goto out;
+	if (res->access_request)
+		decode_access(xdr, &res->access_supported, &res->access_result);
+	decode_getfattr(xdr, res->f_attr, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode SETATTR response
+ */
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nfs_setattrres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_setattr(xdr);
+	if (status)
+		goto out;
+	decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode LOCK response
+ */
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_lock_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_lock(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode LOCKT response
+ */
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_lockt_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_lockt(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode LOCKU response
+ */
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_locku_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_locku(xdr, res);
+out:
+	return status;
+}
+
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+					  struct xdr_stream *xdr, void *dummy)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_release_lockowner(xdr);
+	return status;
+}
+
+/*
+ * Decode READLINK response
+ */
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
+				 struct nfs4_readlink_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_readlink(xdr, rqstp);
+out:
+	return status;
+}
+
+/*
+ * Decode READDIR response
+ */
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_readdir_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_readdir(xdr, rqstp, res);
+out:
+	return status;
+}
+
+/*
+ * Decode Read response
+ */
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_pgio_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_read(xdr, rqstp, res);
+	if (!status)
+		status = res->count;
+out:
+	return status;
+}
+
+/*
+ * Decode WRITE response
+ */
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_pgio_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_write(xdr, res);
+	if (status)
+		goto out;
+	if (res->fattr)
+		decode_getfattr(xdr, res->fattr, res->server);
+	if (!status)
+		status = res->count;
+out:
+	return status;
+}
+
+/*
+ * Decode COMMIT response
+ */
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_commitres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_commit(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode FSINFO response
+ */
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs4_fsinfo_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, req);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_fsinfo(xdr, res->fsinfo);
+	return status;
+}
+
+/*
+ * Decode PATHCONF response
+ */
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs4_pathconf_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, req);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_pathconf(xdr, res->pathconf);
+	return status;
+}
+
+/*
+ * Decode STATFS response
+ */
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs4_statfs_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, req);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_statfs(xdr, res->fsstat);
+	return status;
+}
+
+/*
+ * Decode GETATTR_BITMAP response
+ */
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs4_server_caps_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, req);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_server_caps(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode RENEW response
+ */
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      void *__unused)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_renew(xdr);
+	return status;
+}
+
+/*
+ * Decode SETCLIENTID response
+ */
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs4_setclientid_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_setclientid(xdr, res);
+	return status;
+}
+
+/*
+ * Decode SETCLIENTID_CONFIRM response
+ */
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
+					    struct xdr_stream *xdr)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_setclientid_confirm(xdr);
+	return status;
+}
+
+/*
+ * Decode DELEGRETURN response
+ */
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_delegreturnres *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status != 0)
+		goto out;
+	status = decode_getfattr(xdr, res->fattr, res->server);
+	if (status != 0)
+		goto out;
+	status = decode_delegreturn(xdr);
+out:
+	return status;
+}
+
+/*
+ * Decode FS_LOCATIONS response
+ */
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_fs_locations_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, req);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	if (res->migration) {
+		xdr_enter_page(xdr, PAGE_SIZE);
+		status = decode_getfattr_generic(xdr,
+					&res->fs_locations->fattr,
+					 NULL, res->fs_locations,
+					 NULL, res->fs_locations->server);
+		if (status)
+			goto out;
+		if (res->renew)
+			status = decode_renew(xdr);
+	} else {
+		status = decode_lookup(xdr);
+		if (status)
+			goto out;
+		xdr_enter_page(xdr, PAGE_SIZE);
+		status = decode_getfattr_generic(xdr,
+					&res->fs_locations->fattr,
+					 NULL, res->fs_locations,
+					 NULL, res->fs_locations->server);
+	}
+out:
+	return status;
+}
+
+/*
+ * Decode SECINFO response
+ */
+static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nfs4_secinfo_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_secinfo(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode FSID_PRESENT response
+ */
+static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs4_fsid_present_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_getfh(xdr, res->fh);
+	if (status)
+		goto out;
+	if (res->renew)
+		status = decode_renew(xdr);
+out:
+	return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode BIND_CONN_TO_SESSION response
+ */
+static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_bind_conn_to_session(xdr, res);
+	return status;
+}
+
+/*
+ * Decode EXCHANGE_ID response
+ */
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_exchange_id(xdr, res);
+	return status;
+}
+
+/*
+ * Decode CREATE_SESSION response
+ */
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs41_create_session_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_create_session(xdr, res);
+	return status;
+}
+
+/*
+ * Decode DESTROY_SESSION response
+ */
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_destroy_session(xdr, res);
+	return status;
+}
+
+/*
+ * Decode DESTROY_CLIENTID response
+ */
+static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_destroy_clientid(xdr, res);
+	return status;
+}
+
+/*
+ * Decode SEQUENCE response
+ */
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
+				 struct nfs4_sequence_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, res, rqstp);
+	return status;
+}
+
+/*
+ * Decode GET_LEASE_TIME response
+ */
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs4_get_lease_time_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
+	if (!status)
+		status = decode_putrootfh(xdr);
+	if (!status)
+		status = decode_fsinfo(xdr, res->lr_fsinfo);
+	return status;
+}
+
+/*
+ * Decode RECLAIM_COMPLETE response
+ */
+static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
+					 struct xdr_stream *xdr,
+					 struct nfs41_reclaim_complete_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_reclaim_complete(xdr, NULL);
+	return status;
+}
+
+/*
+ * Decode GETDEVINFO response
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
+				      struct xdr_stream *xdr,
+				      struct nfs4_getdeviceinfo_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status != 0)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status != 0)
+		goto out;
+	status = decode_getdeviceinfo(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode LAYOUTGET response
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  struct nfs4_layoutget_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_layoutget(xdr, rqstp, res);
+out:
+	return status;
+}
+
+/*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs4_layoutreturn_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_layoutreturn(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode LAYOUTCOMMIT response
+ */
+static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs4_layoutcommit_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_layoutcommit(xdr, rqstp, res);
+	if (status)
+		goto out;
+	decode_getfattr(xdr, res->fattr, res->server);
+out:
+	return status;
+}
+
+/*
+ * Decode SECINFO_NO_NAME response
+ */
+static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					struct nfs4_secinfo_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putrootfh(xdr);
+	if (status)
+		goto out;
+	status = decode_secinfo_no_name(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode TEST_STATEID response
+ */
+static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs41_test_stateid_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_test_stateid(xdr, res);
+out:
+	return status;
+}
+
+/*
+ * Decode FREE_STATEID response
+ */
+static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs41_free_stateid_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_free_stateid(xdr, res);
+out:
+	return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
+ *                      the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ */
+int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+		       int plus)
+{
+	unsigned int savep;
+	uint32_t bitmap[3] = {0};
+	uint32_t len;
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (*p == xdr_zero) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (*p == xdr_zero)
+			return -EAGAIN;
+		entry->eof = 1;
+		return -EBADCOOKIE;
+	}
+
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
+	entry->prev_cookie = entry->cookie;
+	p = xdr_decode_hyper(p, &entry->cookie);
+	entry->len = be32_to_cpup(p);
+
+	p = xdr_inline_decode(xdr, entry->len);
+	if (unlikely(!p))
+		goto out_overflow;
+	entry->name = (const char *) p;
+
+	/*
+	 * In case the server doesn't return an inode number,
+	 * we fake one here.  (We don't use inode number 0,
+	 * since glibc seems to choke on it...)
+	 */
+	entry->ino = 1;
+	entry->fattr->valid = 0;
+
+	if (decode_attr_bitmap(xdr, bitmap) < 0)
+		goto out_overflow;
+
+	if (decode_attr_length(xdr, &len, &savep) < 0)
+		goto out_overflow;
+
+	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
+			NULL, entry->label, entry->server) < 0)
+		goto out_overflow;
+	if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+		entry->ino = entry->fattr->mounted_on_fileid;
+	else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
+		entry->ino = entry->fattr->fileid;
+
+	entry->d_type = DT_UNKNOWN;
+	if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
+		entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+
+	return 0;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EAGAIN;
+}
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static struct {
+	int stat;
+	int errno;
+} nfs_errtbl[] = {
+	{ NFS4_OK,		0		},
+	{ NFS4ERR_PERM,		-EPERM		},
+	{ NFS4ERR_NOENT,	-ENOENT		},
+	{ NFS4ERR_IO,		-errno_NFSERR_IO},
+	{ NFS4ERR_NXIO,		-ENXIO		},
+	{ NFS4ERR_ACCESS,	-EACCES		},
+	{ NFS4ERR_EXIST,	-EEXIST		},
+	{ NFS4ERR_XDEV,		-EXDEV		},
+	{ NFS4ERR_NOTDIR,	-ENOTDIR	},
+	{ NFS4ERR_ISDIR,	-EISDIR		},
+	{ NFS4ERR_INVAL,	-EINVAL		},
+	{ NFS4ERR_FBIG,		-EFBIG		},
+	{ NFS4ERR_NOSPC,	-ENOSPC		},
+	{ NFS4ERR_ROFS,		-EROFS		},
+	{ NFS4ERR_MLINK,	-EMLINK		},
+	{ NFS4ERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFS4ERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFS4ERR_DQUOT,	-EDQUOT		},
+	{ NFS4ERR_STALE,	-ESTALE		},
+	{ NFS4ERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFS4ERR_SERVERFAULT,	-EREMOTEIO	},
+	{ NFS4ERR_BADTYPE,	-EBADTYPE	},
+	{ NFS4ERR_LOCKED,	-EAGAIN		},
+	{ NFS4ERR_SYMLINK,	-ELOOP		},
+	{ NFS4ERR_OP_ILLEGAL,	-EOPNOTSUPP	},
+	{ NFS4ERR_DEADLOCK,	-EDEADLK	},
+	{ -1,			-EIO		}
+};
+
+/*
+ * Convert an NFS error code to a local one.
+ * This one is used jointly by NFSv2 and NFSv3.
+ */
+static int
+nfs4_stat_to_errno(int stat)
+{
+	int i;
+	for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+		if (nfs_errtbl[i].stat == stat)
+			return nfs_errtbl[i].errno;
+	}
+	if (stat <= 10000 || stat > 10100) {
+		/* The server is looney tunes. */
+		return -EREMOTEIO;
+	}
+	/* If we cannot translate the error, the recovery routines should
+	 * handle it.
+	 * Note: remaining NFSv4 error codes have values > 10000, so should
+	 * not conflict with native Linux error codes.
+	 */
+	return -stat;
+}
+
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42xdr.c"
+#endif /* CONFIG_NFS_V4_2 */
+
+#define PROC(proc, argtype, restype)				\
+[NFSPROC4_CLNT_##proc] = {					\
+	.p_proc   = NFSPROC4_COMPOUND,				\
+	.p_encode = (kxdreproc_t)nfs4_xdr_##argtype,		\
+	.p_decode = (kxdrdproc_t)nfs4_xdr_##restype,		\
+	.p_arglen = NFS4_##argtype##_sz,			\
+	.p_replen = NFS4_##restype##_sz,			\
+	.p_statidx = NFSPROC4_CLNT_##proc,			\
+	.p_name   = #proc,					\
+}
+
+#define STUB(proc)		\
+[NFSPROC4_CLNT_##proc] = {	\
+	.p_name = #proc,	\
+}
+
+struct rpc_procinfo	nfs4_procedures[] = {
+	PROC(READ,		enc_read,		dec_read),
+	PROC(WRITE,		enc_write,		dec_write),
+	PROC(COMMIT,		enc_commit,		dec_commit),
+	PROC(OPEN,		enc_open,		dec_open),
+	PROC(OPEN_CONFIRM,	enc_open_confirm,	dec_open_confirm),
+	PROC(OPEN_NOATTR,	enc_open_noattr,	dec_open_noattr),
+	PROC(OPEN_DOWNGRADE,	enc_open_downgrade,	dec_open_downgrade),
+	PROC(CLOSE,		enc_close,		dec_close),
+	PROC(SETATTR,		enc_setattr,		dec_setattr),
+	PROC(FSINFO,		enc_fsinfo,		dec_fsinfo),
+	PROC(RENEW,		enc_renew,		dec_renew),
+	PROC(SETCLIENTID,	enc_setclientid,	dec_setclientid),
+	PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
+	PROC(LOCK,		enc_lock,		dec_lock),
+	PROC(LOCKT,		enc_lockt,		dec_lockt),
+	PROC(LOCKU,		enc_locku,		dec_locku),
+	PROC(ACCESS,		enc_access,		dec_access),
+	PROC(GETATTR,		enc_getattr,		dec_getattr),
+	PROC(LOOKUP,		enc_lookup,		dec_lookup),
+	PROC(LOOKUP_ROOT,	enc_lookup_root,	dec_lookup_root),
+	PROC(REMOVE,		enc_remove,		dec_remove),
+	PROC(RENAME,		enc_rename,		dec_rename),
+	PROC(LINK,		enc_link,		dec_link),
+	PROC(SYMLINK,		enc_symlink,		dec_symlink),
+	PROC(CREATE,		enc_create,		dec_create),
+	PROC(PATHCONF,		enc_pathconf,		dec_pathconf),
+	PROC(STATFS,		enc_statfs,		dec_statfs),
+	PROC(READLINK,		enc_readlink,		dec_readlink),
+	PROC(READDIR,		enc_readdir,		dec_readdir),
+	PROC(SERVER_CAPS,	enc_server_caps,	dec_server_caps),
+	PROC(DELEGRETURN,	enc_delegreturn,	dec_delegreturn),
+	PROC(GETACL,		enc_getacl,		dec_getacl),
+	PROC(SETACL,		enc_setacl,		dec_setacl),
+	PROC(FS_LOCATIONS,	enc_fs_locations,	dec_fs_locations),
+	PROC(RELEASE_LOCKOWNER,	enc_release_lockowner,	dec_release_lockowner),
+	PROC(SECINFO,		enc_secinfo,		dec_secinfo),
+	PROC(FSID_PRESENT,	enc_fsid_present,	dec_fsid_present),
+#if defined(CONFIG_NFS_V4_1)
+	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
+	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
+	PROC(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session),
+	PROC(SEQUENCE,		enc_sequence,		dec_sequence),
+	PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
+	PROC(RECLAIM_COMPLETE,	enc_reclaim_complete,	dec_reclaim_complete),
+	PROC(GETDEVICEINFO,	enc_getdeviceinfo,	dec_getdeviceinfo),
+	PROC(LAYOUTGET,		enc_layoutget,		dec_layoutget),
+	PROC(LAYOUTCOMMIT,	enc_layoutcommit,	dec_layoutcommit),
+	PROC(LAYOUTRETURN,	enc_layoutreturn,	dec_layoutreturn),
+	PROC(SECINFO_NO_NAME,	enc_secinfo_no_name,	dec_secinfo_no_name),
+	PROC(TEST_STATEID,	enc_test_stateid,	dec_test_stateid),
+	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid),
+	STUB(GETDEVICELIST),
+	PROC(BIND_CONN_TO_SESSION,
+			enc_bind_conn_to_session, dec_bind_conn_to_session),
+	PROC(DESTROY_CLIENTID,	enc_destroy_clientid,	dec_destroy_clientid),
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+	PROC(SEEK,		enc_seek,		dec_seek),
+	PROC(ALLOCATE,		enc_allocate,		dec_allocate),
+	PROC(DEALLOCATE,	enc_deallocate,		dec_deallocate),
+#endif /* CONFIG_NFS_V4_2 */
+};
+
+const struct rpc_version nfs_version4 = {
+	.number			= 4,
+	.nrprocs		= ARRAY_SIZE(nfs4_procedures),
+	.procs			= nfs4_procedures
+};
+
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/kernel/fs/nfs/nfsroot.c b/kernel/fs/nfs/nfsroot.c
new file mode 100644
index 000000000..9bc9f04fb
--- /dev/null
+++ b/kernel/fs/nfs/nfsroot.c
@@ -0,0 +1,309 @@
+/*
+ *  Copyright (C) 1995, 1996  Gero Kuhlmann <gero@gkminix.han.de>
+ *
+ *  Allow an NFS filesystem to be mounted as root. The way this works is:
+ *     (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
+ *     (2) Construct the device string and the options string using DHCP
+ *         option 17 and/or kernel command line options.
+ *     (3) When mount_root() sets up the root file system, pass these strings
+ *         to the NFS client's regular mount interface via sys_mount().
+ *
+ *
+ *	Changes:
+ *
+ *	Alan Cox	:	Removed get_address name clash with FPU.
+ *	Alan Cox	:	Reformatted a bit.
+ *	Gero Kuhlmann	:	Code cleanup
+ *	Michael Rausch  :	Fixed recognition of an incoming RARP answer.
+ *	Martin Mares	: (2.0)	Auto-configuration via BOOTP supported.
+ *	Martin Mares	:	Manual selection of interface & BOOTP/RARP.
+ *	Martin Mares	:	Using network routes instead of host routes,
+ *				allowing the default configuration to be used
+ *				for normal operation of the host.
+ *	Martin Mares	:	Randomized timer with exponential backoff
+ *				installed to minimize network congestion.
+ *	Martin Mares	:	Code cleanup.
+ *	Martin Mares	: (2.1)	BOOTP and RARP made configuration options.
+ *	Martin Mares	:	Server hostname generation fixed.
+ *	Gerd Knorr	:	Fixed wired inode handling
+ *	Martin Mares	: (2.2)	"0.0.0.0" addresses from command line ignored.
+ *	Martin Mares	:	RARP replies not tested for server address.
+ *	Gero Kuhlmann	: (2.3) Some bug fixes and code cleanup again (please
+ *				send me your new patches _before_ bothering
+ *				Linus so that I don' always have to cleanup
+ *				_afterwards_ - thanks)
+ *	Gero Kuhlmann	:	Last changes of Martin Mares undone.
+ *	Gero Kuhlmann	: 	RARP replies are tested for specified server
+ *				again. However, it's now possible to have
+ *				different RARP and NFS servers.
+ *	Gero Kuhlmann	:	"0.0.0.0" addresses from command line are
+ *				now mapped to INADDR_NONE.
+ *	Gero Kuhlmann	:	Fixed a bug which prevented BOOTP path name
+ *				from being used (thanks to Leo Spiekman)
+ *	Andy Walker	:	Allow to specify the NFS server in nfs_root
+ *				without giving a path name
+ *	Swen Thümmler	:	Allow to specify the NFS options in nfs_root
+ *				without giving a path name. Fix BOOTP request
+ *				for domainname (domainname is NIS domain, not
+ *				DNS domain!). Skip dummy devices for BOOTP.
+ *	Jacek Zapala	:	Fixed a bug which prevented server-ip address
+ *				from nfsroot parameter from being used.
+ *	Olaf Kirch	:	Adapted to new NFS code.
+ *	Jakub Jelinek	:	Free used code segment.
+ *	Marko Kohtala	:	Fixed some bugs.
+ *	Martin Mares	:	Debug message cleanup
+ *	Martin Mares	:	Changed to use the new generic IP layer autoconfig
+ *				code. BOOTP and RARP moved there.
+ *	Martin Mares	:	Default path now contains host name instead of
+ *				host IP address (but host name defaults to IP
+ *				address anyway).
+ *	Martin Mares	:	Use root_server_addr appropriately during setup.
+ *	Martin Mares	:	Rewrote parameter parsing, now hopefully giving
+ *				correct overriding.
+ *	Trond Myklebust :	Add in preliminary support for NFSv3 and TCP.
+ *				Fix bug in root_nfs_addr(). nfs_data.namlen
+ *				is NOT for the length of the hostname.
+ *	Hua Qin		:	Support for mounting root file system via
+ *				NFS over TCP.
+ *	Fabian Frederick:	Option parser rebuilt (using parser lib)
+ *	Chuck Lever	:	Use super.c's text-based mount option parsing
+ *	Chuck Lever	:	Add "nfsrootdebug".
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/nfs.h>
+#include <linux/nfs_fs.h>
+#include <linux/utsname.h>
+#include <linux/root_dev.h>
+#include <net/ipconfig.h>
+
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_ROOT
+
+/* Default path we try to mount. "%s" gets replaced by our IP address */
+#define NFS_ROOT		"/tftpboot/%s"
+
+/* Default NFSROOT mount options. */
+#define NFS_DEF_OPTIONS		"vers=2,udp,rsize=4096,wsize=4096"
+
+/* Parameters passed from the kernel command line */
+static char nfs_root_parms[256] __initdata = "";
+
+/* Text-based mount options passed to super.c */
+static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
+
+/* Address of NFS server */
+static __be32 servaddr __initdata = htonl(INADDR_NONE);
+
+/* Name of directory to mount */
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
+
+/* server:export path string passed to super.c */
+static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
+
+#ifdef NFS_DEBUG
+/*
+ * When the "nfsrootdebug" kernel command line option is specified,
+ * enable debugging messages for NFSROOT.
+ */
+static int __init nfs_root_debug(char *__unused)
+{
+	nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
+	return 1;
+}
+
+__setup("nfsrootdebug", nfs_root_debug);
+#endif
+
+/*
+ *  Parse NFS server and directory information passed on the kernel
+ *  command line.
+ *
+ *  nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ *
+ *  If there is a "%s" token in the <root-dir> string, it is replaced
+ *  by the ASCII-representation of the client's IP address.
+ */
+static int __init nfs_root_setup(char *line)
+{
+	ROOT_DEV = Root_NFS;
+
+	if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
+		strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
+	} else {
+		size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
+		if (n >= sizeof(nfs_root_parms))
+			line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
+		sprintf(nfs_root_parms, NFS_ROOT, line);
+	}
+
+	/*
+	 * Extract the IP address of the NFS server containing our
+	 * root file system, if one was specified.
+	 *
+	 * Note: root_nfs_parse_addr() removes the server-ip from
+	 *	 nfs_root_parms, if it exists.
+	 */
+	root_server_addr = root_nfs_parse_addr(nfs_root_parms);
+
+	return 1;
+}
+
+__setup("nfsroot=", nfs_root_setup);
+
+static int __init root_nfs_copy(char *dest, const char *src,
+				     const size_t destlen)
+{
+	if (strlcpy(dest, src, destlen) > destlen)
+		return -1;
+	return 0;
+}
+
+static int __init root_nfs_cat(char *dest, const char *src,
+			       const size_t destlen)
+{
+	size_t len = strlen(dest);
+
+	if (len && dest[len - 1] != ',')
+		if (strlcat(dest, ",", destlen) > destlen)
+			return -1;
+
+	if (strlcat(dest, src, destlen) > destlen)
+		return -1;
+	return 0;
+}
+
+/*
+ * Parse out root export path and mount options from
+ * passed-in string @incoming.
+ *
+ * Copy the export path into @exppath.
+ */
+static int __init root_nfs_parse_options(char *incoming, char *exppath,
+					 const size_t exppathlen)
+{
+	char *p;
+
+	/*
+	 * Set the NFS remote path
+	 */
+	p = strsep(&incoming, ",");
+	if (*p != '\0' && strcmp(p, "default") != 0)
+		if (root_nfs_copy(exppath, p, exppathlen))
+			return -1;
+
+	/*
+	 * @incoming now points to the rest of the string; if it
+	 * contains something, append it to our root options buffer
+	 */
+	if (incoming != NULL && *incoming != '\0')
+		if (root_nfs_cat(nfs_root_options, incoming,
+						sizeof(nfs_root_options)))
+			return -1;
+	return 0;
+}
+
+/*
+ *  Decode the export directory path name and NFS options from
+ *  the kernel command line.  This has to be done late in order to
+ *  use a dynamically acquired client IP address for the remote
+ *  root directory path.
+ *
+ *  Returns zero if successful; otherwise -1 is returned.
+ */
+static int __init root_nfs_data(char *cmdline)
+{
+	char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
+	int len, retval = -1;
+	char *tmp = NULL;
+	const size_t tmplen = sizeof(nfs_export_path);
+
+	tmp = kzalloc(tmplen, GFP_KERNEL);
+	if (tmp == NULL)
+		goto out_nomem;
+	strcpy(tmp, NFS_ROOT);
+
+	if (root_server_path[0] != '\0') {
+		dprintk("Root-NFS: DHCPv4 option 17: %s\n",
+			root_server_path);
+		if (root_nfs_parse_options(root_server_path, tmp, tmplen))
+			goto out_optionstoolong;
+	}
+
+	if (cmdline[0] != '\0') {
+		dprintk("Root-NFS: nfsroot=%s\n", cmdline);
+		if (root_nfs_parse_options(cmdline, tmp, tmplen))
+			goto out_optionstoolong;
+	}
+
+	/*
+	 * Append mandatory options for nfsroot so they override
+	 * what has come before
+	 */
+	snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
+			&servaddr);
+	if (root_nfs_cat(nfs_root_options, mand_options,
+						sizeof(nfs_root_options)))
+		goto out_optionstoolong;
+
+	/*
+	 * Set up nfs_root_device.  For NFS mounts, this looks like
+	 *
+	 *	server:/path
+	 *
+	 * At this point, utsname()->nodename contains our local
+	 * IP address or hostname, set by ipconfig.  If "%s" exists
+	 * in tmp, substitute the nodename, then shovel the whole
+	 * mess into nfs_root_device.
+	 */
+	len = snprintf(nfs_export_path, sizeof(nfs_export_path),
+				tmp, utsname()->nodename);
+	if (len >= (int)sizeof(nfs_export_path))
+		goto out_devnametoolong;
+	len = snprintf(nfs_root_device, sizeof(nfs_root_device),
+				"%pI4:%s", &servaddr, nfs_export_path);
+	if (len >= (int)sizeof(nfs_root_device))
+		goto out_devnametoolong;
+
+	retval = 0;
+
+out:
+	kfree(tmp);
+	return retval;
+out_nomem:
+	printk(KERN_ERR "Root-NFS: could not allocate memory\n");
+	goto out;
+out_optionstoolong:
+	printk(KERN_ERR "Root-NFS: mount options string too long\n");
+	goto out;
+out_devnametoolong:
+	printk(KERN_ERR "Root-NFS: root device name too long.\n");
+	goto out;
+}
+
+/**
+ * nfs_root_data - Return prepared 'data' for NFSROOT mount
+ * @root_device: OUT: address of string containing NFSROOT device
+ * @root_data: OUT: address of string containing NFSROOT mount options
+ *
+ * Returns zero and sets @root_device and @root_data if successful,
+ * otherwise -1 is returned.
+ */
+int __init nfs_root_data(char **root_device, char **root_data)
+{
+	servaddr = root_server_addr;
+	if (servaddr == htonl(INADDR_NONE)) {
+		printk(KERN_ERR "Root-NFS: no NFS server address\n");
+		return -1;
+	}
+
+	if (root_nfs_data(nfs_root_parms) < 0)
+		return -1;
+
+	*root_device = nfs_root_device;
+	*root_data = nfs_root_options;
+	return 0;
+}
diff --git a/kernel/fs/nfs/nfstrace.c b/kernel/fs/nfs/nfstrace.c
new file mode 100644
index 000000000..c74f7af23
--- /dev/null
+++ b/kernel/fs/nfs/nfstrace.c
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/nfs_fs.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include "nfstrace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
diff --git a/kernel/fs/nfs/nfstrace.h b/kernel/fs/nfs/nfstrace.h
new file mode 100644
index 000000000..59f838cdc
--- /dev/null
+++ b/kernel/fs/nfs/nfstrace.h
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs
+
+#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS_H
+
+#include <linux/tracepoint.h>
+
+#define nfs_show_file_type(ftype) \
+	__print_symbolic(ftype, \
+			{ DT_UNKNOWN, "UNKNOWN" }, \
+			{ DT_FIFO, "FIFO" }, \
+			{ DT_CHR, "CHR" }, \
+			{ DT_DIR, "DIR" }, \
+			{ DT_BLK, "BLK" }, \
+			{ DT_REG, "REG" }, \
+			{ DT_LNK, "LNK" }, \
+			{ DT_SOCK, "SOCK" }, \
+			{ DT_WHT, "WHT" })
+
+#define nfs_show_cache_validity(v) \
+	__print_flags(v, "|", \
+			{ NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \
+			{ NFS_INO_INVALID_DATA, "INVALID_DATA" }, \
+			{ NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
+			{ NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
+			{ NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
+			{ NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
+			{ NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
+			{ NFS_INO_INVALID_LABEL, "INVALID_LABEL" })
+
+#define nfs_show_nfsi_flags(v) \
+	__print_flags(v, "|", \
+			{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
+			{ 1 << NFS_INO_STALE, "STALE" }, \
+			{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
+			{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
+			{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
+			{ 1 << NFS_INO_COMMIT, "COMMIT" }, \
+			{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
+			{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
+
+DECLARE_EVENT_CLASS(nfs_inode_event,
+		TP_PROTO(
+			const struct inode *inode
+		),
+
+		TP_ARGS(inode),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(u64, version)
+		),
+
+		TP_fast_assign(
+			const struct nfs_inode *nfsi = NFS_I(inode);
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = nfsi->fileid;
+			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+			__entry->version = inode->i_version;
+		),
+
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			(unsigned long long)__entry->version
+		)
+);
+
+DECLARE_EVENT_CLASS(nfs_inode_event_done,
+		TP_PROTO(
+			const struct inode *inode,
+			int error
+		),
+
+		TP_ARGS(inode, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(unsigned char, type)
+			__field(u64, fileid)
+			__field(u64, version)
+			__field(loff_t, size)
+			__field(unsigned long, nfsi_flags)
+			__field(unsigned long, cache_validity)
+		),
+
+		TP_fast_assign(
+			const struct nfs_inode *nfsi = NFS_I(inode);
+			__entry->error = error;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = nfsi->fileid;
+			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+			__entry->type = nfs_umode_to_dtype(inode->i_mode);
+			__entry->version = inode->i_version;
+			__entry->size = i_size_read(inode);
+			__entry->nfsi_flags = nfsi->flags;
+			__entry->cache_validity = nfsi->cache_validity;
+		),
+
+		TP_printk(
+			"error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"type=%u (%s) version=%llu size=%lld "
+			"cache_validity=%lu (%s) nfs_flags=%ld (%s)",
+			__entry->error,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			__entry->type,
+			nfs_show_file_type(__entry->type),
+			(unsigned long long)__entry->version,
+			(long long)__entry->size,
+			__entry->cache_validity,
+			nfs_show_cache_validity(__entry->cache_validity),
+			__entry->nfsi_flags,
+			nfs_show_nfsi_flags(__entry->nfsi_flags)
+		)
+);
+
+#define DEFINE_NFS_INODE_EVENT(name) \
+	DEFINE_EVENT(nfs_inode_event, name, \
+			TP_PROTO( \
+				const struct inode *inode \
+			), \
+			TP_ARGS(inode))
+#define DEFINE_NFS_INODE_EVENT_DONE(name) \
+	DEFINE_EVENT(nfs_inode_event_done, name, \
+			TP_PROTO( \
+				const struct inode *inode, \
+				int error \
+			), \
+			TP_ARGS(inode, error))
+DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit);
+DEFINE_NFS_INODE_EVENT(nfs_getattr_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit);
+DEFINE_NFS_INODE_EVENT(nfs_setattr_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit);
+DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit);
+DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
+DEFINE_NFS_INODE_EVENT(nfs_access_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit);
+
+#define show_lookup_flags(flags) \
+	__print_flags((unsigned long)flags, "|", \
+			{ LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
+			{ LOOKUP_DIRECTORY, "DIRECTORY" }, \
+			{ LOOKUP_OPEN, "OPEN" }, \
+			{ LOOKUP_CREATE, "CREATE" }, \
+			{ LOOKUP_EXCL, "EXCL" })
+
+DECLARE_EVENT_CLASS(nfs_lookup_event,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct dentry *dentry,
+			unsigned int flags
+		),
+
+		TP_ARGS(dir, dentry, flags),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, flags)
+			__field(dev_t, dev)
+			__field(u64, dir)
+			__string(name, dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->flags = flags;
+			__assign_str(name, dentry->d_name.name);
+		),
+
+		TP_printk(
+			"flags=%u (%s) name=%02x:%02x:%llu/%s",
+			__entry->flags,
+			show_lookup_flags(__entry->flags),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+#define DEFINE_NFS_LOOKUP_EVENT(name) \
+	DEFINE_EVENT(nfs_lookup_event, name, \
+			TP_PROTO( \
+				const struct inode *dir, \
+				const struct dentry *dentry, \
+				unsigned int flags \
+			), \
+			TP_ARGS(dir, dentry, flags))
+
+DECLARE_EVENT_CLASS(nfs_lookup_event_done,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct dentry *dentry,
+			unsigned int flags,
+			int error
+		),
+
+		TP_ARGS(dir, dentry, flags, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(unsigned int, flags)
+			__field(dev_t, dev)
+			__field(u64, dir)
+			__string(name, dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->error = error;
+			__entry->flags = flags;
+			__assign_str(name, dentry->d_name.name);
+		),
+
+		TP_printk(
+			"error=%d flags=%u (%s) name=%02x:%02x:%llu/%s",
+			__entry->error,
+			__entry->flags,
+			show_lookup_flags(__entry->flags),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \
+	DEFINE_EVENT(nfs_lookup_event_done, name, \
+			TP_PROTO( \
+				const struct inode *dir, \
+				const struct dentry *dentry, \
+				unsigned int flags, \
+				int error \
+			), \
+			TP_ARGS(dir, dentry, flags, error))
+
+DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
+DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
+
+#define show_open_flags(flags) \
+	__print_flags((unsigned long)flags, "|", \
+		{ O_CREAT, "O_CREAT" }, \
+		{ O_EXCL, "O_EXCL" }, \
+		{ O_TRUNC, "O_TRUNC" }, \
+		{ O_APPEND, "O_APPEND" }, \
+		{ O_DSYNC, "O_DSYNC" }, \
+		{ O_DIRECT, "O_DIRECT" }, \
+		{ O_DIRECTORY, "O_DIRECTORY" })
+
+#define show_fmode_flags(mode) \
+	__print_flags(mode, "|", \
+		{ ((__force unsigned long)FMODE_READ), "READ" }, \
+		{ ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
+		{ ((__force unsigned long)FMODE_EXEC), "EXEC" })
+
+TRACE_EVENT(nfs_atomic_open_enter,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct nfs_open_context *ctx,
+			unsigned int flags
+		),
+
+		TP_ARGS(dir, ctx, flags),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, flags)
+			__field(unsigned int, fmode)
+			__field(dev_t, dev)
+			__field(u64, dir)
+			__string(name, ctx->dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->flags = flags;
+			__entry->fmode = (__force unsigned int)ctx->mode;
+			__assign_str(name, ctx->dentry->d_name.name);
+		),
+
+		TP_printk(
+			"flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s",
+			__entry->flags,
+			show_open_flags(__entry->flags),
+			show_fmode_flags(__entry->fmode),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+TRACE_EVENT(nfs_atomic_open_exit,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct nfs_open_context *ctx,
+			unsigned int flags,
+			int error
+		),
+
+		TP_ARGS(dir, ctx, flags, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(unsigned int, flags)
+			__field(unsigned int, fmode)
+			__field(dev_t, dev)
+			__field(u64, dir)
+			__string(name, ctx->dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->error = error;
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->flags = flags;
+			__entry->fmode = (__force unsigned int)ctx->mode;
+			__assign_str(name, ctx->dentry->d_name.name);
+		),
+
+		TP_printk(
+			"error=%d flags=%u (%s) fmode=%s "
+			"name=%02x:%02x:%llu/%s",
+			__entry->error,
+			__entry->flags,
+			show_open_flags(__entry->flags),
+			show_fmode_flags(__entry->fmode),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+TRACE_EVENT(nfs_create_enter,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct dentry *dentry,
+			unsigned int flags
+		),
+
+		TP_ARGS(dir, dentry, flags),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, flags)
+			__field(dev_t, dev)
+			__field(u64, dir)
+			__string(name, dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->flags = flags;
+			__assign_str(name, dentry->d_name.name);
+		),
+
+		TP_printk(
+			"flags=%u (%s) name=%02x:%02x:%llu/%s",
+			__entry->flags,
+			show_open_flags(__entry->flags),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+TRACE_EVENT(nfs_create_exit,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct dentry *dentry,
+			unsigned int flags,
+			int error
+		),
+
+		TP_ARGS(dir, dentry, flags, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(unsigned int, flags)
+			__field(dev_t, dev)
+			__field(u64, dir)
+			__string(name, dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->error = error;
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->flags = flags;
+			__assign_str(name, dentry->d_name.name);
+		),
+
+		TP_printk(
+			"error=%d flags=%u (%s) name=%02x:%02x:%llu/%s",
+			__entry->error,
+			__entry->flags,
+			show_open_flags(__entry->flags),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+DECLARE_EVENT_CLASS(nfs_directory_event,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct dentry *dentry
+		),
+
+		TP_ARGS(dir, dentry),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u64, dir)
+			__string(name, dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__assign_str(name, dentry->d_name.name);
+		),
+
+		TP_printk(
+			"name=%02x:%02x:%llu/%s",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+#define DEFINE_NFS_DIRECTORY_EVENT(name) \
+	DEFINE_EVENT(nfs_directory_event, name, \
+			TP_PROTO( \
+				const struct inode *dir, \
+				const struct dentry *dentry \
+			), \
+			TP_ARGS(dir, dentry))
+
+DECLARE_EVENT_CLASS(nfs_directory_event_done,
+		TP_PROTO(
+			const struct inode *dir,
+			const struct dentry *dentry,
+			int error
+		),
+
+		TP_ARGS(dir, dentry, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(dev_t, dev)
+			__field(u64, dir)
+			__string(name, dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->error = error;
+			__assign_str(name, dentry->d_name.name);
+		),
+
+		TP_printk(
+			"error=%d name=%02x:%02x:%llu/%s",
+			__entry->error,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \
+	DEFINE_EVENT(nfs_directory_event_done, name, \
+			TP_PROTO( \
+				const struct inode *dir, \
+				const struct dentry *dentry, \
+				int error \
+			), \
+			TP_ARGS(dir, dentry, error))
+
+DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit);
+
+TRACE_EVENT(nfs_link_enter,
+		TP_PROTO(
+			const struct inode *inode,
+			const struct inode *dir,
+			const struct dentry *dentry
+		),
+
+		TP_ARGS(inode, dir, dentry),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u64, fileid)
+			__field(u64, dir)
+			__string(name, dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->dir = NFS_FILEID(dir);
+			__assign_str(name, dentry->d_name.name);
+		),
+
+		TP_printk(
+			"fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			__entry->fileid,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+TRACE_EVENT(nfs_link_exit,
+		TP_PROTO(
+			const struct inode *inode,
+			const struct inode *dir,
+			const struct dentry *dentry,
+			int error
+		),
+
+		TP_ARGS(inode, dir, dentry, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(dev_t, dev)
+			__field(u64, fileid)
+			__field(u64, dir)
+			__string(name, dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->dir = NFS_FILEID(dir);
+			__entry->error = error;
+			__assign_str(name, dentry->d_name.name);
+		),
+
+		TP_printk(
+			"error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+			__entry->error,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			__entry->fileid,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+
+DECLARE_EVENT_CLASS(nfs_rename_event,
+		TP_PROTO(
+			const struct inode *old_dir,
+			const struct dentry *old_dentry,
+			const struct inode *new_dir,
+			const struct dentry *new_dentry
+		),
+
+		TP_ARGS(old_dir, old_dentry, new_dir, new_dentry),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u64, old_dir)
+			__field(u64, new_dir)
+			__string(old_name, old_dentry->d_name.name)
+			__string(new_name, new_dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = old_dir->i_sb->s_dev;
+			__entry->old_dir = NFS_FILEID(old_dir);
+			__entry->new_dir = NFS_FILEID(new_dir);
+			__assign_str(old_name, old_dentry->d_name.name);
+			__assign_str(new_name, new_dentry->d_name.name);
+		),
+
+		TP_printk(
+			"old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->old_dir,
+			__get_str(old_name),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->new_dir,
+			__get_str(new_name)
+		)
+);
+#define DEFINE_NFS_RENAME_EVENT(name) \
+	DEFINE_EVENT(nfs_rename_event, name, \
+			TP_PROTO( \
+				const struct inode *old_dir, \
+				const struct dentry *old_dentry, \
+				const struct inode *new_dir, \
+				const struct dentry *new_dentry \
+			), \
+			TP_ARGS(old_dir, old_dentry, new_dir, new_dentry))
+
+DECLARE_EVENT_CLASS(nfs_rename_event_done,
+		TP_PROTO(
+			const struct inode *old_dir,
+			const struct dentry *old_dentry,
+			const struct inode *new_dir,
+			const struct dentry *new_dentry,
+			int error
+		),
+
+		TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(int, error)
+			__field(u64, old_dir)
+			__string(old_name, old_dentry->d_name.name)
+			__field(u64, new_dir)
+			__string(new_name, new_dentry->d_name.name)
+		),
+
+		TP_fast_assign(
+			__entry->dev = old_dir->i_sb->s_dev;
+			__entry->old_dir = NFS_FILEID(old_dir);
+			__entry->new_dir = NFS_FILEID(new_dir);
+			__entry->error = error;
+			__assign_str(old_name, old_dentry->d_name.name);
+			__assign_str(new_name, new_dentry->d_name.name);
+		),
+
+		TP_printk(
+			"error=%d old_name=%02x:%02x:%llu/%s "
+			"new_name=%02x:%02x:%llu/%s",
+			__entry->error,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->old_dir,
+			__get_str(old_name),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->new_dir,
+			__get_str(new_name)
+		)
+);
+#define DEFINE_NFS_RENAME_EVENT_DONE(name) \
+	DEFINE_EVENT(nfs_rename_event_done, name, \
+			TP_PROTO( \
+				const struct inode *old_dir, \
+				const struct dentry *old_dentry, \
+				const struct inode *new_dir, \
+				const struct dentry *new_dentry, \
+				int error \
+			), \
+			TP_ARGS(old_dir, old_dentry, new_dir, \
+				new_dentry, error))
+
+DEFINE_NFS_RENAME_EVENT(nfs_rename_enter);
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit);
+
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename);
+
+TRACE_EVENT(nfs_sillyrename_unlink,
+		TP_PROTO(
+			const struct nfs_unlinkdata *data,
+			int error
+		),
+
+		TP_ARGS(data, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(int, error)
+			__field(u64, dir)
+			__dynamic_array(char, name, data->args.name.len + 1)
+		),
+
+		TP_fast_assign(
+			struct inode *dir = data->dir;
+			size_t len = data->args.name.len;
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->dir = NFS_FILEID(dir);
+			__entry->error = error;
+			memcpy(__get_dynamic_array(name),
+				data->args.name.name, len);
+			((char *)__get_dynamic_array(name))[len] = 0;
+		),
+
+		TP_printk(
+			"error=%d name=%02x:%02x:%llu/%s",
+			__entry->error,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->dir,
+			__get_str(name)
+		)
+);
+#endif /* _TRACE_NFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE nfstrace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/fs/nfs/objlayout/Kbuild b/kernel/fs/nfs/objlayout/Kbuild
new file mode 100644
index 000000000..ed30ea072
--- /dev/null
+++ b/kernel/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Objects Layout Driver kernel module
+#
+objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/kernel/fs/nfs/objlayout/objio_osd.c b/kernel/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 000000000..5aaed3635
--- /dev/null
+++ b/kernel/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,678 @@
+/*
+ *  pNFS Objects layout implementation over open-osd initiator library
+ *
+ *  Copyright (C) 2009 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <ooo@electrozaur.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <scsi/osd_ore.h>
+
+#include "objlayout.h"
+#include "../internal.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+struct objio_dev_ent {
+	struct nfs4_deviceid_node id_node;
+	struct ore_dev od;
+};
+
+static void
+objio_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
+
+	dprintk("%s: free od=%p\n", __func__, de->od.od);
+	osduld_put_device(de->od.od);
+	kfree_rcu(d, rcu);
+}
+
+struct objio_segment {
+	struct pnfs_layout_segment lseg;
+
+	struct ore_layout layout;
+	struct ore_components oc;
+};
+
+static inline struct objio_segment *
+OBJIO_LSEG(struct pnfs_layout_segment *lseg)
+{
+	return container_of(lseg, struct objio_segment, lseg);
+}
+
+struct objio_state {
+	/* Generic layer */
+	struct objlayout_io_res oir;
+
+	bool sync;
+	/*FIXME: Support for extra_bytes at ore_get_rw_state() */
+	struct ore_io_state *ios;
+};
+
+/* Send and wait for a get_device_info of devices in the layout,
+   then look them up with the osd_initiator library */
+struct nfs4_deviceid_node *
+objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+			gfp_t gfp_flags)
+{
+	struct pnfs_osd_deviceaddr *deviceaddr;
+	struct objio_dev_ent *ode = NULL;
+	struct osd_dev *od;
+	struct osd_dev_info odi;
+	bool retry_flag = true;
+	__be32 *p;
+	int err;
+
+	deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
+	if (!deviceaddr)
+		return NULL;
+
+	p = page_address(pdev->pages[0]);
+	pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
+
+	odi.systemid_len = deviceaddr->oda_systemid.len;
+	if (odi.systemid_len > sizeof(odi.systemid)) {
+		dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
+			__func__, sizeof(odi.systemid));
+		err = -EINVAL;
+		goto out;
+	} else if (odi.systemid_len)
+		memcpy(odi.systemid, deviceaddr->oda_systemid.data,
+		       odi.systemid_len);
+	odi.osdname_len	 = deviceaddr->oda_osdname.len;
+	odi.osdname	 = (u8 *)deviceaddr->oda_osdname.data;
+
+	if (!odi.osdname_len && !odi.systemid_len) {
+		dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
+			__func__);
+		err = -ENODEV;
+		goto out;
+	}
+
+retry_lookup:
+	od = osduld_info_lookup(&odi);
+	if (unlikely(IS_ERR(od))) {
+		err = PTR_ERR(od);
+		dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+		if (err == -ENODEV && retry_flag) {
+			err = objlayout_autologin(deviceaddr);
+			if (likely(!err)) {
+				retry_flag = false;
+				goto retry_lookup;
+			}
+		}
+		goto out;
+	}
+
+	dprintk("Adding new dev_id(%llx:%llx)\n",
+		_DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
+
+	ode = kzalloc(sizeof(*ode), gfp_flags);
+	if (!ode) {
+		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+		goto out;
+	}
+
+	nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
+	kfree(deviceaddr);
+
+	ode->od.od = od;
+	return &ode->id_node;
+
+out:
+	kfree(deviceaddr);
+	return NULL;
+}
+
+static void copy_single_comp(struct ore_components *oc, unsigned c,
+			     struct pnfs_osd_object_cred *src_comp)
+{
+	struct ore_comp *ocomp = &oc->comps[c];
+
+	WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
+	WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
+
+	ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
+	ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
+
+	memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
+}
+
+static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
+		       struct objio_segment **pseg)
+{
+/*	This is the in memory structure of the objio_segment
+ *
+ *	struct __alloc_objio_segment {
+ *		struct objio_segment olseg;
+ *		struct ore_dev *ods[numdevs];
+ *		struct ore_comp	comps[numdevs];
+ *	} *aolseg;
+ *	NOTE: The code as above compiles and runs perfectly. It is elegant,
+ *	type safe and compact. At some Past time Linus has decided he does not
+ *	like variable length arrays, For the sake of this principal we uglify
+ *	the code as below.
+ */
+	struct objio_segment *lseg;
+	size_t lseg_size = sizeof(*lseg) +
+			numdevs * sizeof(lseg->oc.ods[0]) +
+			numdevs * sizeof(*lseg->oc.comps);
+
+	lseg = kzalloc(lseg_size, gfp_flags);
+	if (unlikely(!lseg)) {
+		dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
+			numdevs, lseg_size);
+		return -ENOMEM;
+	}
+
+	lseg->oc.numdevs = numdevs;
+	lseg->oc.single_comp = EC_MULTPLE_COMPS;
+	lseg->oc.ods = (void *)(lseg + 1);
+	lseg->oc.comps = (void *)(lseg->oc.ods + numdevs);
+
+	*pseg = lseg;
+	return 0;
+}
+
+int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+	struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_layout_range *range,
+	struct xdr_stream *xdr,
+	gfp_t gfp_flags)
+{
+	struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
+	struct objio_segment *objio_seg;
+	struct pnfs_osd_xdr_decode_layout_iter iter;
+	struct pnfs_osd_layout layout;
+	struct pnfs_osd_object_cred src_comp;
+	unsigned cur_comp;
+	int err;
+
+	err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
+	if (unlikely(err))
+		return err;
+
+	err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
+	if (unlikely(err))
+		return err;
+
+	objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
+	objio_seg->layout.group_width = layout.olo_map.odm_group_width;
+	objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
+	objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+	objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
+
+	err = ore_verify_layout(layout.olo_map.odm_num_comps,
+					  &objio_seg->layout);
+	if (unlikely(err))
+		goto err;
+
+	objio_seg->oc.first_dev = layout.olo_comps_index;
+	cur_comp = 0;
+	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+		struct nfs4_deviceid_node *d;
+		struct objio_dev_ent *ode;
+
+		copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
+
+		d = nfs4_find_get_deviceid(server,
+				&src_comp.oc_object_id.oid_device_id,
+				pnfslay->plh_lc_cred, gfp_flags);
+		if (!d) {
+			err = -ENXIO;
+			goto err;
+		}
+
+		ode = container_of(d, struct objio_dev_ent, id_node);
+		objio_seg->oc.ods[cur_comp++] = &ode->od;
+	}
+	/* pnfs_osd_xdr_decode_layout_comp returns false on error */
+	if (unlikely(err))
+		goto err;
+
+	*outp = &objio_seg->lseg;
+	return 0;
+
+err:
+	kfree(objio_seg);
+	dprintk("%s: Error: return %d\n", __func__, err);
+	*outp = NULL;
+	return err;
+}
+
+void objio_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	int i;
+	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+
+	for (i = 0; i < objio_seg->oc.numdevs; i++) {
+		struct ore_dev *od = objio_seg->oc.ods[i];
+		struct objio_dev_ent *ode;
+
+		if (!od)
+			break;
+		ode = container_of(od, typeof(*ode), od);
+		nfs4_put_deviceid_node(&ode->id_node);
+	}
+	kfree(objio_seg);
+}
+
+static int
+objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
+	struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
+	loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
+	struct objio_state **outp)
+{
+	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+	struct ore_io_state *ios;
+	int ret;
+	struct __alloc_objio_state {
+		struct objio_state objios;
+		struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
+	} *aos;
+
+	aos = kzalloc(sizeof(*aos), gfp_flags);
+	if (unlikely(!aos))
+		return -ENOMEM;
+
+	objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
+			aos->ioerrs, rpcdata, pnfs_layout_type);
+
+	ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
+			       offset, count, &ios);
+	if (unlikely(ret)) {
+		kfree(aos);
+		return ret;
+	}
+
+	ios->pages = pages;
+	ios->pgbase = pgbase;
+	ios->private = aos;
+	BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
+
+	aos->objios.sync = 0;
+	aos->objios.ios = ios;
+	*outp = &aos->objios;
+	return 0;
+}
+
+void objio_free_result(struct objlayout_io_res *oir)
+{
+	struct objio_state *objios = container_of(oir, struct objio_state, oir);
+
+	ore_put_io_state(objios->ios);
+	kfree(objios);
+}
+
+static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+	switch (oep) {
+	case OSD_ERR_PRI_NO_ERROR:
+		return (enum pnfs_osd_errno)0;
+
+	case OSD_ERR_PRI_CLEAR_PAGES:
+		BUG_ON(1);
+		return 0;
+
+	case OSD_ERR_PRI_RESOURCE:
+		return PNFS_OSD_ERR_RESOURCE;
+	case OSD_ERR_PRI_BAD_CRED:
+		return PNFS_OSD_ERR_BAD_CRED;
+	case OSD_ERR_PRI_NO_ACCESS:
+		return PNFS_OSD_ERR_NO_ACCESS;
+	case OSD_ERR_PRI_UNREACHABLE:
+		return PNFS_OSD_ERR_UNREACHABLE;
+	case OSD_ERR_PRI_NOT_FOUND:
+		return PNFS_OSD_ERR_NOT_FOUND;
+	case OSD_ERR_PRI_NO_SPACE:
+		return PNFS_OSD_ERR_NO_SPACE;
+	default:
+		WARN_ON(1);
+		/* fallthrough */
+	case OSD_ERR_PRI_EIO:
+		return PNFS_OSD_ERR_EIO;
+	}
+}
+
+static void __on_dev_error(struct ore_io_state *ios,
+	struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
+	u64 dev_offset, u64  dev_len)
+{
+	struct objio_state *objios = ios->private;
+	struct pnfs_osd_objid pooid;
+	struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
+	/* FIXME: what to do with more-then-one-group layouts. We need to
+	 * translate from ore_io_state index to oc->comps index
+	 */
+	unsigned comp = dev_index;
+
+	pooid.oid_device_id = ode->id_node.deviceid;
+	pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
+	pooid.oid_object_id = ios->oc->comps[comp].obj.id;
+
+	objlayout_io_set_result(&objios->oir, comp,
+				&pooid, osd_pri_2_pnfs_err(oep),
+				dev_offset, dev_len, !ios->reading);
+}
+
+/*
+ * read
+ */
+static void _read_done(struct ore_io_state *ios, void *private)
+{
+	struct objio_state *objios = private;
+	ssize_t status;
+	int ret = ore_check_io(ios, &__on_dev_error);
+
+	/* FIXME: _io_free(ios) can we dealocate the libosd resources; */
+
+	if (likely(!ret))
+		status = ios->length;
+	else
+		status = ret;
+
+	objlayout_read_done(&objios->oir, status, objios->sync);
+}
+
+int objio_read_pagelist(struct nfs_pgio_header *hdr)
+{
+	struct objio_state *objios;
+	int ret;
+
+	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
+			hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+			hdr->args.offset, hdr->args.count, hdr,
+			GFP_KERNEL, &objios);
+	if (unlikely(ret))
+		return ret;
+
+	objios->ios->done = _read_done;
+	dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
+		hdr->args.offset, hdr->args.count);
+	ret = ore_read(objios->ios);
+	if (unlikely(ret))
+		objio_free_result(&objios->oir);
+	return ret;
+}
+
+/*
+ * write
+ */
+static void _write_done(struct ore_io_state *ios, void *private)
+{
+	struct objio_state *objios = private;
+	ssize_t status;
+	int ret = ore_check_io(ios, &__on_dev_error);
+
+	/* FIXME: _io_free(ios) can we dealocate the libosd resources; */
+
+	if (likely(!ret)) {
+		/* FIXME: should be based on the OSD's persistence model
+		 * See OSD2r05 Section 4.13 Data persistence model */
+		objios->oir.committed = NFS_FILE_SYNC;
+		status = ios->length;
+	} else {
+		status = ret;
+	}
+
+	objlayout_write_done(&objios->oir, status, objios->sync);
+}
+
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+	struct objio_state *objios = priv;
+	struct nfs_pgio_header *hdr = objios->oir.rpcdata;
+	struct address_space *mapping = hdr->inode->i_mapping;
+	pgoff_t index = offset / PAGE_SIZE;
+	struct page *page;
+	loff_t i_size = i_size_read(hdr->inode);
+
+	if (offset >= i_size) {
+		*uptodate = true;
+		dprintk("%s: g_zero_page index=0x%lx\n", __func__, index);
+		return ZERO_PAGE(0);
+	}
+
+	page = find_get_page(mapping, index);
+	if (!page) {
+		page = find_or_create_page(mapping, index, GFP_NOFS);
+		if (unlikely(!page)) {
+			dprintk("%s: grab_cache_page Failed index=0x%lx\n",
+				__func__, index);
+			return NULL;
+		}
+		unlock_page(page);
+	}
+	if (PageDirty(page) || PageWriteback(page))
+		*uptodate = true;
+	else
+		*uptodate = PageUptodate(page);
+	dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
+	return page;
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+	dprintk("%s: index=0x%lx\n", __func__,
+		(page == ZERO_PAGE(0)) ? -1UL : page->index);
+	if (ZERO_PAGE(0) != page)
+		page_cache_release(page);
+	return;
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+	.get_page = &__r4w_get_page,
+	.put_page = &__r4w_put_page,
+};
+
+int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
+{
+	struct objio_state *objios;
+	int ret;
+
+	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
+			hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+			hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
+			&objios);
+	if (unlikely(ret))
+		return ret;
+
+	objios->sync = 0 != (how & FLUSH_SYNC);
+	objios->ios->r4w = &_r4w_op;
+
+	if (!objios->sync)
+		objios->ios->done = _write_done;
+
+	dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
+		hdr->args.offset, hdr->args.count);
+	ret = ore_write(objios->ios);
+	if (unlikely(ret)) {
+		objio_free_result(&objios->oir);
+		return ret;
+	}
+
+	if (objios->sync)
+		_write_done(objios->ios, objios);
+
+	return 0;
+}
+
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
+			  struct nfs_page *prev, struct nfs_page *req)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
+	unsigned int size;
+
+	size = pnfs_generic_pg_test(pgio, prev, req);
+
+	if (!size || mirror->pg_count + req->wb_bytes >
+	    (unsigned long)pgio->pg_layout_private)
+		return 0;
+
+	return min(size, req->wb_bytes);
+}
+
+static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	pnfs_generic_pg_init_read(pgio, req);
+	if (unlikely(pgio->pg_lseg == NULL))
+		return; /* Not pNFS */
+
+	pgio->pg_layout_private = (void *)
+				OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
+}
+
+static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
+				   unsigned long *stripe_end)
+{
+	u32 stripe_off;
+	unsigned stripe_size;
+
+	if (layout->raid_algorithm == PNFS_OSD_RAID_0)
+		return true;
+
+	stripe_size = layout->stripe_unit *
+				(layout->group_width - layout->parity);
+
+	div_u64_rem(offset, stripe_size, &stripe_off);
+	if (!stripe_off)
+		return true;
+
+	*stripe_end = stripe_size - stripe_off;
+	return false;
+}
+
+static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	unsigned long stripe_end = 0;
+	u64 wb_size;
+
+	if (pgio->pg_dreq == NULL)
+		wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
+	else
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+	pnfs_generic_pg_init_write(pgio, req, wb_size);
+	if (unlikely(pgio->pg_lseg == NULL))
+		return; /* Not pNFS */
+
+	if (req->wb_offset ||
+	    !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE,
+			       &OBJIO_LSEG(pgio->pg_lseg)->layout,
+			       &stripe_end)) {
+		pgio->pg_layout_private = (void *)stripe_end;
+	} else {
+		pgio->pg_layout_private = (void *)
+				OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
+	}
+}
+
+static const struct nfs_pageio_ops objio_pg_read_ops = {
+	.pg_init = objio_init_read,
+	.pg_test = objio_pg_test,
+	.pg_doio = pnfs_generic_pg_readpages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops objio_pg_write_ops = {
+	.pg_init = objio_init_write,
+	.pg_test = objio_pg_test,
+	.pg_doio = pnfs_generic_pg_writepages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static struct pnfs_layoutdriver_type objlayout_type = {
+	.id = LAYOUT_OSD2_OBJECTS,
+	.name = "LAYOUT_OSD2_OBJECTS",
+	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
+				   PNFS_LAYOUTRET_ON_ERROR,
+
+	.max_deviceinfo_size	 = PAGE_SIZE,
+	.owner		       	 = THIS_MODULE,
+	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
+	.free_layout_hdr         = objlayout_free_layout_hdr,
+
+	.alloc_lseg              = objlayout_alloc_lseg,
+	.free_lseg               = objlayout_free_lseg,
+
+	.read_pagelist           = objlayout_read_pagelist,
+	.write_pagelist          = objlayout_write_pagelist,
+	.pg_read_ops             = &objio_pg_read_ops,
+	.pg_write_ops            = &objio_pg_write_ops,
+
+	.sync			 = pnfs_generic_sync,
+
+	.free_deviceid_node	 = objio_free_deviceid_node,
+
+	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
+	.encode_layoutreturn     = objlayout_encode_layoutreturn,
+};
+
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
+MODULE_LICENSE("GPL");
+
+static int __init
+objlayout_init(void)
+{
+	int ret = pnfs_register_layoutdriver(&objlayout_type);
+
+	if (ret)
+		printk(KERN_INFO
+			"NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+			__func__, ret);
+	else
+		printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",
+			__func__);
+	return ret;
+}
+
+static void __exit
+objlayout_exit(void)
+{
+	pnfs_unregister_layoutdriver(&objlayout_type);
+	printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",
+	       __func__);
+}
+
+MODULE_ALIAS("nfs-layouttype4-2");
+
+module_init(objlayout_init);
+module_exit(objlayout_exit);
diff --git a/kernel/fs/nfs/objlayout/objlayout.c b/kernel/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 000000000..919efd4a1
--- /dev/null
+++ b/kernel/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,706 @@
+/*
+ *  pNFS Objects layout driver high level definitions
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <ooo@electrozaur.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kmod.h>
+#include <linux/moduleparam.h>
+#include <linux/ratelimit.h>
+#include <scsi/osd_initiator.h>
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Create a objlayout layout structure for the given inode and return it.
+ */
+struct pnfs_layout_hdr *
+objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct objlayout *objlay;
+
+	objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
+	if (!objlay)
+		return NULL;
+	spin_lock_init(&objlay->lock);
+	INIT_LIST_HEAD(&objlay->err_list);
+	dprintk("%s: Return %p\n", __func__, objlay);
+	return &objlay->pnfs_layout;
+}
+
+/*
+ * Free an objlayout layout structure
+ */
+void
+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct objlayout *objlay = OBJLAYOUT(lo);
+
+	dprintk("%s: objlay %p\n", __func__, objlay);
+
+	WARN_ON(!list_empty(&objlay->err_list));
+	kfree(objlay);
+}
+
+/*
+ * Unmarshall layout and store it in pnfslay.
+ */
+struct pnfs_layout_segment *
+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
+		     struct nfs4_layoutget_res *lgr,
+		     gfp_t gfp_flags)
+{
+	int status = -ENOMEM;
+	struct xdr_stream stream;
+	struct xdr_buf buf = {
+		.pages =  lgr->layoutp->pages,
+		.page_len =  lgr->layoutp->len,
+		.buflen =  lgr->layoutp->len,
+		.len = lgr->layoutp->len,
+	};
+	struct page *scratch;
+	struct pnfs_layout_segment *lseg;
+
+	dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
+
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		goto err_nofree;
+
+	xdr_init_decode(&stream, &buf, NULL);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
+	if (unlikely(status)) {
+		dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
+			status);
+		goto err;
+	}
+
+	__free_page(scratch);
+
+	dprintk("%s: Return %p\n", __func__, lseg);
+	return lseg;
+
+err:
+	__free_page(scratch);
+err_nofree:
+	dprintk("%s: Err Return=>%d\n", __func__, status);
+	return ERR_PTR(status);
+}
+
+/*
+ * Free a layout segement
+ */
+void
+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+
+	if (unlikely(!lseg))
+		return;
+
+	objio_free_lseg(lseg);
+}
+
+/*
+ * I/O Operations
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	end = start + len;
+	return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
+			   struct page ***p_pages, unsigned *p_pgbase,
+			   u64 offset, unsigned long count)
+{
+	u64 lseg_end_offset;
+
+	BUG_ON(offset < lseg->pls_range.offset);
+	lseg_end_offset = end_offset(lseg->pls_range.offset,
+				     lseg->pls_range.length);
+	BUG_ON(offset >= lseg_end_offset);
+	WARN_ON(offset + count > lseg_end_offset);
+
+	if (*p_pgbase > PAGE_SIZE) {
+		dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
+		*p_pages += *p_pgbase >> PAGE_SHIFT;
+		*p_pgbase &= ~PAGE_MASK;
+	}
+}
+
+/*
+ * I/O done common code
+ */
+static void
+objlayout_iodone(struct objlayout_io_res *oir)
+{
+	if (likely(oir->status >= 0)) {
+		objio_free_result(oir);
+	} else {
+		struct objlayout *objlay = oir->objlay;
+
+		spin_lock(&objlay->lock);
+		objlay->delta_space_valid = OBJ_DSU_INVALID;
+		list_add(&objlay->err_list, &oir->err_list);
+		spin_unlock(&objlay->lock);
+	}
+}
+
+/*
+ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
+ *
+ * The @index component IO failed (error returned from target). Register
+ * the error for later reporting at layout-return.
+ */
+void
+objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
+			struct pnfs_osd_objid *pooid, int osd_error,
+			u64 offset, u64 length, bool is_write)
+{
+	struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
+
+	BUG_ON(index >= oir->num_comps);
+	if (osd_error) {
+		ioerr->oer_component = *pooid;
+		ioerr->oer_comp_offset = offset;
+		ioerr->oer_comp_length = length;
+		ioerr->oer_iswrite = is_write;
+		ioerr->oer_errno = osd_error;
+
+		dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
+			"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
+			__func__, index, ioerr->oer_errno,
+			ioerr->oer_iswrite,
+			_DEVID_LO(&ioerr->oer_component.oid_device_id),
+			_DEVID_HI(&ioerr->oer_component.oid_device_id),
+			ioerr->oer_component.oid_partition_id,
+			ioerr->oer_component.oid_object_id,
+			ioerr->oer_comp_offset,
+			ioerr->oer_comp_length);
+	} else {
+		/* User need not call if no error is reported */
+		ioerr->oer_errno = 0;
+	}
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_pgio_header *hdr;
+
+	dprintk("%s enter\n", __func__);
+	task = container_of(work, struct rpc_task, u.tk_work);
+	hdr = container_of(task, struct nfs_pgio_header, task);
+
+	pnfs_ld_read_done(hdr);
+}
+
+void
+objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
+{
+	struct nfs_pgio_header *hdr = oir->rpcdata;
+
+	oir->status = hdr->task.tk_status = status;
+	if (status >= 0)
+		hdr->res.count = status;
+	else
+		hdr->pnfs_error = status;
+	objlayout_iodone(oir);
+	/* must not use oir after this point */
+
+	dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
+		status, hdr->res.eof, sync);
+
+	if (sync)
+		pnfs_ld_read_done(hdr);
+	else {
+		INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
+		schedule_work(&hdr->task.u.tk_work);
+	}
+}
+
+/*
+ * Perform sync or async reads.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+	loff_t offset = hdr->args.offset;
+	size_t count = hdr->args.count;
+	int err;
+	loff_t eof;
+
+	eof = i_size_read(inode);
+	if (unlikely(offset + count > eof)) {
+		if (offset >= eof) {
+			err = 0;
+			hdr->res.count = 0;
+			hdr->res.eof = 1;
+			/*FIXME: do we need to call pnfs_ld_read_done() */
+			goto out;
+		}
+		count = eof - offset;
+	}
+
+	hdr->res.eof = (offset + count) >= eof;
+	_fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+			      &hdr->args.pgbase,
+			      hdr->args.offset, hdr->args.count);
+
+	dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
+		__func__, inode->i_ino, offset, count, hdr->res.eof);
+
+	err = objio_read_pagelist(hdr);
+ out:
+	if (unlikely(err)) {
+		hdr->pnfs_error = err;
+		dprintk("%s: Returned Error %d\n", __func__, err);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	return PNFS_ATTEMPTED;
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_pgio_header *hdr;
+
+	dprintk("%s enter\n", __func__);
+	task = container_of(work, struct rpc_task, u.tk_work);
+	hdr = container_of(task, struct nfs_pgio_header, task);
+
+	pnfs_ld_write_done(hdr);
+}
+
+void
+objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
+{
+	struct nfs_pgio_header *hdr = oir->rpcdata;
+
+	oir->status = hdr->task.tk_status = status;
+	if (status >= 0) {
+		hdr->res.count = status;
+		hdr->verf.committed = oir->committed;
+	} else {
+		hdr->pnfs_error = status;
+	}
+	objlayout_iodone(oir);
+	/* must not use oir after this point */
+
+	dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
+		status, hdr->verf.committed, sync);
+
+	if (sync)
+		pnfs_ld_write_done(hdr);
+	else {
+		INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
+		schedule_work(&hdr->task.u.tk_work);
+	}
+}
+
+/*
+ * Perform sync or async writes.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
+{
+	int err;
+
+	_fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+			      &hdr->args.pgbase,
+			      hdr->args.offset, hdr->args.count);
+
+	err = objio_write_pagelist(hdr, how);
+	if (unlikely(err)) {
+		hdr->pnfs_error = err;
+		dprintk("%s: Returned Error %d\n", __func__, err);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	return PNFS_ATTEMPTED;
+}
+
+void
+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
+			      struct xdr_stream *xdr,
+			      const struct nfs4_layoutcommit_args *args)
+{
+	struct objlayout *objlay = OBJLAYOUT(pnfslay);
+	struct pnfs_osd_layoutupdate lou;
+	__be32 *start;
+
+	dprintk("%s: Begin\n", __func__);
+
+	spin_lock(&objlay->lock);
+	lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
+	lou.dsu_delta = objlay->delta_space_used;
+	objlay->delta_space_used = 0;
+	objlay->delta_space_valid = OBJ_DSU_INIT;
+	lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
+	spin_unlock(&objlay->lock);
+
+	start = xdr_reserve_space(xdr, 4);
+
+	BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+
+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
+
+	dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
+		lou.dsu_delta, lou.olu_ioerr_flag);
+}
+
+static int
+err_prio(u32 oer_errno)
+{
+	switch (oer_errno) {
+	case 0:
+		return 0;
+
+	case PNFS_OSD_ERR_RESOURCE:
+		return OSD_ERR_PRI_RESOURCE;
+	case PNFS_OSD_ERR_BAD_CRED:
+		return OSD_ERR_PRI_BAD_CRED;
+	case PNFS_OSD_ERR_NO_ACCESS:
+		return OSD_ERR_PRI_NO_ACCESS;
+	case PNFS_OSD_ERR_UNREACHABLE:
+		return OSD_ERR_PRI_UNREACHABLE;
+	case PNFS_OSD_ERR_NOT_FOUND:
+		return OSD_ERR_PRI_NOT_FOUND;
+	case PNFS_OSD_ERR_NO_SPACE:
+		return OSD_ERR_PRI_NO_SPACE;
+	default:
+		WARN_ON(1);
+		/* fallthrough */
+	case PNFS_OSD_ERR_EIO:
+		return OSD_ERR_PRI_EIO;
+	}
+}
+
+static void
+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
+	    const struct pnfs_osd_ioerr *src_err)
+{
+	u64 dest_end, src_end;
+
+	if (!dest_err->oer_errno) {
+		*dest_err = *src_err;
+		/* accumulated device must be blank */
+		memset(&dest_err->oer_component.oid_device_id, 0,
+			sizeof(dest_err->oer_component.oid_device_id));
+
+		return;
+	}
+
+	if (dest_err->oer_component.oid_partition_id !=
+				src_err->oer_component.oid_partition_id)
+		dest_err->oer_component.oid_partition_id = 0;
+
+	if (dest_err->oer_component.oid_object_id !=
+				src_err->oer_component.oid_object_id)
+		dest_err->oer_component.oid_object_id = 0;
+
+	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
+		dest_err->oer_comp_offset = src_err->oer_comp_offset;
+
+	dest_end = end_offset(dest_err->oer_comp_offset,
+			      dest_err->oer_comp_length);
+	src_end =  end_offset(src_err->oer_comp_offset,
+			      src_err->oer_comp_length);
+	if (dest_end < src_end)
+		dest_end = src_end;
+
+	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+
+	if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
+	    (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
+			dest_err->oer_errno = src_err->oer_errno;
+	} else if (src_err->oer_iswrite) {
+		dest_err->oer_iswrite = true;
+		dest_err->oer_errno = src_err->oer_errno;
+	}
+}
+
+static void
+encode_accumulated_error(struct objlayout *objlay, __be32 *p)
+{
+	struct objlayout_io_res *oir, *tmp;
+	struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
+
+	list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
+		unsigned i;
+
+		for (i = 0; i < oir->num_comps; i++) {
+			struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
+
+			if (!ioerr->oer_errno)
+				continue;
+
+			printk(KERN_ERR "NFS: %s: err[%d]: errno=%d "
+				"is_write=%d dev(%llx:%llx) par=0x%llx "
+				"obj=0x%llx offset=0x%llx length=0x%llx\n",
+				__func__, i, ioerr->oer_errno,
+				ioerr->oer_iswrite,
+				_DEVID_LO(&ioerr->oer_component.oid_device_id),
+				_DEVID_HI(&ioerr->oer_component.oid_device_id),
+				ioerr->oer_component.oid_partition_id,
+				ioerr->oer_component.oid_object_id,
+				ioerr->oer_comp_offset,
+				ioerr->oer_comp_length);
+
+			merge_ioerr(&accumulated_err, ioerr);
+		}
+		list_del(&oir->err_list);
+		objio_free_result(oir);
+	}
+
+	pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
+}
+
+void
+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
+			      struct xdr_stream *xdr,
+			      const struct nfs4_layoutreturn_args *args)
+{
+	struct objlayout *objlay = OBJLAYOUT(pnfslay);
+	struct objlayout_io_res *oir, *tmp;
+	__be32 *start;
+
+	dprintk("%s: Begin\n", __func__);
+	start = xdr_reserve_space(xdr, 4);
+	BUG_ON(!start);
+
+	spin_lock(&objlay->lock);
+
+	list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
+		__be32 *last_xdr = NULL, *p;
+		unsigned i;
+		int res = 0;
+
+		for (i = 0; i < oir->num_comps; i++) {
+			struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
+
+			if (!ioerr->oer_errno)
+				continue;
+
+			dprintk("%s: err[%d]: errno=%d is_write=%d "
+				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
+				"offset=0x%llx length=0x%llx\n",
+				__func__, i, ioerr->oer_errno,
+				ioerr->oer_iswrite,
+				_DEVID_LO(&ioerr->oer_component.oid_device_id),
+				_DEVID_HI(&ioerr->oer_component.oid_device_id),
+				ioerr->oer_component.oid_partition_id,
+				ioerr->oer_component.oid_object_id,
+				ioerr->oer_comp_offset,
+				ioerr->oer_comp_length);
+
+			p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
+			if (unlikely(!p)) {
+				res = -E2BIG;
+				break; /* accumulated_error */
+			}
+
+			last_xdr = p;
+			pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
+		}
+
+		/* TODO: use xdr_write_pages */
+		if (unlikely(res)) {
+			/* no space for even one error descriptor */
+			BUG_ON(!last_xdr);
+
+			/* we've encountered a situation with lots and lots of
+			 * errors and no space to encode them all. Use the last
+			 * available slot to report the union of all the
+			 * remaining errors.
+			 */
+			encode_accumulated_error(objlay, last_xdr);
+			goto loop_done;
+		}
+		list_del(&oir->err_list);
+		objio_free_result(oir);
+	}
+loop_done:
+	spin_unlock(&objlay->lock);
+
+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
+	dprintk("%s: Return\n", __func__);
+}
+
+enum {
+	OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
+	OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
+	OSD_LOGIN_UPCALL_PATHLEN  = 256
+};
+
+static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login";
+
+module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog),
+		    0600);
+MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program");
+
+struct __auto_login {
+	char uri[OBJLAYOUT_MAX_URI_LEN];
+	char osdname[OBJLAYOUT_MAX_OSDNAME_LEN];
+	char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN];
+};
+
+static int __objlayout_upcall(struct __auto_login *login)
+{
+	static char *envp[] = { "HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+	char *argv[8];
+	int ret;
+
+	if (unlikely(!osd_login_prog[0])) {
+		dprintk("%s: osd_login_prog is disabled\n", __func__);
+		return -EACCES;
+	}
+
+	dprintk("%s uri: %s\n", __func__, login->uri);
+	dprintk("%s osdname %s\n", __func__, login->osdname);
+	dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex);
+
+	argv[0] = (char *)osd_login_prog;
+	argv[1] = "-u";
+	argv[2] = login->uri;
+	argv[3] = "-o";
+	argv[4] = login->osdname;
+	argv[5] = "-s";
+	argv[6] = login->systemid_hex;
+	argv[7] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	/*
+	 * Disable the upcall mechanism if we're getting an ENOENT or
+	 * EACCES error. The admin can re-enable it on the fly by using
+	 * sysfs to set the objlayoutdriver.osd_login_prog module parameter once
+	 * the problem has been fixed.
+	 */
+	if (ret == -ENOENT || ret == -EACCES) {
+		printk(KERN_ERR "PNFS-OBJ: %s was not found please set "
+			"objlayoutdriver.osd_login_prog kernel parameter!\n",
+			osd_login_prog);
+		osd_login_prog[0] = '\0';
+	}
+	dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret);
+
+	return ret;
+}
+
+/* Assume dest is all zeros */
+static void __copy_nfsS_and_zero_terminate(struct nfs4_string s,
+					   char *dest, int max_len,
+					   const char *var_name)
+{
+	if (!s.len)
+		return;
+
+	if (s.len >= max_len) {
+		pr_warn_ratelimited(
+			"objlayout_autologin: %s: s.len(%d) >= max_len(%d)",
+			var_name, s.len, max_len);
+		s.len = max_len - 1; /* space for null terminator */
+	}
+
+	memcpy(dest, s.data, s.len);
+}
+
+/* Assume sysid is all zeros */
+static void _sysid_2_hex(struct nfs4_string s,
+		  char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN])
+{
+	int i;
+	char *cur;
+
+	if (!s.len)
+		return;
+
+	if (s.len != OSD_SYSTEMID_LEN) {
+		pr_warn_ratelimited(
+		    "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN",
+		    s.len);
+		if (s.len > OSD_SYSTEMID_LEN)
+			s.len = OSD_SYSTEMID_LEN;
+	}
+
+	cur = sysid;
+	for (i = 0; i < s.len; i++)
+		cur = hex_byte_pack(cur, s.data[i]);
+}
+
+int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+	int rc;
+	struct __auto_login login;
+
+	if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len)
+		return -ENODEV;
+
+	memset(&login, 0, sizeof(login));
+	__copy_nfsS_and_zero_terminate(
+		deviceaddr->oda_targetaddr.ota_netaddr.r_addr,
+		login.uri, sizeof(login.uri), "URI");
+
+	__copy_nfsS_and_zero_terminate(
+		deviceaddr->oda_osdname,
+		login.osdname, sizeof(login.osdname), "OSDNAME");
+
+	_sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex);
+
+	rc = __objlayout_upcall(&login);
+	if (rc > 0) /* script returns positive values */
+		rc = -ENODEV;
+
+	return rc;
+}
diff --git a/kernel/fs/nfs/objlayout/objlayout.h b/kernel/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 000000000..2641dbad3
--- /dev/null
+++ b/kernel/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,184 @@
+/*
+ *  Data types and function declerations for interfacing with the
+ *  pNFS standard object layout driver.
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <ooo@electrozaur.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OBJLAYOUT_H
+#define _OBJLAYOUT_H
+
+#include <linux/nfs_fs.h>
+#include <linux/pnfs_osd_xdr.h>
+#include "../pnfs.h"
+
+/*
+ * per-inode layout
+ */
+struct objlayout {
+	struct pnfs_layout_hdr pnfs_layout;
+
+	 /* for layout_commit */
+	enum osd_delta_space_valid_enum {
+		OBJ_DSU_INIT = 0,
+		OBJ_DSU_VALID,
+		OBJ_DSU_INVALID,
+	} delta_space_valid;
+	s64 delta_space_used;  /* consumed by write ops */
+
+	 /* for layout_return */
+	spinlock_t lock;
+	struct list_head err_list;
+};
+
+static inline struct objlayout *
+OBJLAYOUT(struct pnfs_layout_hdr *lo)
+{
+	return container_of(lo, struct objlayout, pnfs_layout);
+}
+
+/*
+ * per-I/O operation state
+ * embedded in objects provider io_state data structure
+ */
+struct objlayout_io_res {
+	struct objlayout *objlay;
+
+	void *rpcdata;
+	int status;             /* res */
+	int committed;          /* res */
+
+	/* Error reporting (layout_return) */
+	struct list_head err_list;
+	unsigned num_comps;
+	/* Pointer to array of error descriptors of size num_comps.
+	 * It should contain as many entries as devices in the osd_layout
+	 * that participate in the I/O. It is up to the io_engine to allocate
+	 * needed space and set num_comps.
+	 */
+	struct pnfs_osd_ioerr *ioerrs;
+};
+
+static inline
+void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
+			struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
+			struct pnfs_layout_hdr *pnfs_layout_type)
+{
+	oir->objlay = OBJLAYOUT(pnfs_layout_type);
+	oir->rpcdata = rpcdata;
+	INIT_LIST_HEAD(&oir->err_list);
+	oir->num_comps = num_comps;
+	oir->ioerrs = ioerrs;
+}
+
+/*
+ * Raid engine I/O API
+ */
+extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+	struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_layout_range *range,
+	struct xdr_stream *xdr,
+	gfp_t gfp_flags);
+extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
+
+/* objio_free_result will free these @oir structs received from
+ * objlayout_{read,write}_done
+ */
+extern void objio_free_result(struct objlayout_io_res *oir);
+
+extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
+extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
+
+/*
+ * callback API
+ */
+extern void objlayout_io_set_result(struct objlayout_io_res *oir,
+			unsigned index, struct pnfs_osd_objid *pooid,
+			int osd_error, u64 offset, u64 length, bool is_write);
+
+static inline void
+objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
+{
+	/* If one of the I/Os errored out and the delta_space_used was
+	 * invalid we render the complete report as invalid. Protocol mandate
+	 * the DSU be accurate or not reported.
+	 */
+	spin_lock(&objlay->lock);
+	if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
+		objlay->delta_space_valid = OBJ_DSU_VALID;
+		objlay->delta_space_used += space_used;
+	}
+	spin_unlock(&objlay->lock);
+}
+
+extern void objlayout_read_done(struct objlayout_io_res *oir,
+				ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_res *oir,
+				 ssize_t status, bool sync);
+
+/*
+ * exported generic objects function vectors
+ */
+
+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
+
+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
+	struct pnfs_layout_hdr *,
+	struct nfs4_layoutget_res *,
+	gfp_t gfp_flags);
+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
+
+extern enum pnfs_try_status objlayout_read_pagelist(
+	struct nfs_pgio_header *);
+
+extern enum pnfs_try_status objlayout_write_pagelist(
+	struct nfs_pgio_header *,
+	int how);
+
+extern void objlayout_encode_layoutcommit(
+	struct pnfs_layout_hdr *,
+	struct xdr_stream *,
+	const struct nfs4_layoutcommit_args *);
+
+extern void objlayout_encode_layoutreturn(
+	struct pnfs_layout_hdr *,
+	struct xdr_stream *,
+	const struct nfs4_layoutreturn_args *);
+
+extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr);
+
+#endif /* _OBJLAYOUT_H */
diff --git a/kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 000000000..f093c7ec9
--- /dev/null
+++ b/kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,415 @@
+/*
+ *  Object-Based pNFS Layout XDR layer
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <ooo@electrozaur.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/pnfs_osd_xdr.h>
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+/*
+ * The following implementation is based on RFC5664
+ */
+
+/*
+ * struct pnfs_osd_objid {
+ *	struct nfs4_deviceid	oid_device_id;
+ *	u64			oid_partition_id;
+ *	u64			oid_object_id;
+ * }; // xdr size 32 bytes
+ */
+static __be32 *
+_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+	p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
+				    sizeof(objid->oid_device_id.data));
+
+	p = xdr_decode_hyper(p, &objid->oid_partition_id);
+	p = xdr_decode_hyper(p, &objid->oid_object_id);
+	return p;
+}
+/*
+ * struct pnfs_osd_opaque_cred {
+ *	u32 cred_len;
+ *	void *cred;
+ * }; // xdr size [variable]
+ * The return pointers are from the xdr buffer
+ */
+static int
+_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
+			    struct xdr_stream *xdr)
+{
+	__be32 *p = xdr_inline_decode(xdr, 1);
+
+	if (!p)
+		return -EINVAL;
+
+	opaque_cred->cred_len = be32_to_cpu(*p++);
+
+	p = xdr_inline_decode(xdr, opaque_cred->cred_len);
+	if (!p)
+		return -EINVAL;
+
+	opaque_cred->cred = p;
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_object_cred {
+ *	struct pnfs_osd_objid		oc_object_id;
+ *	u32				oc_osd_version;
+ *	u32				oc_cap_key_sec;
+ *	struct pnfs_osd_opaque_cred	oc_cap_key
+ *	struct pnfs_osd_opaque_cred	oc_cap;
+ * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
+ */
+static int
+_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
+			    struct xdr_stream *xdr)
+{
+	__be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
+	int ret;
+
+	if (!p)
+		return -EIO;
+
+	p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+	comp->oc_osd_version = be32_to_cpup(p++);
+	comp->oc_cap_key_sec = be32_to_cpup(p);
+
+	ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
+	if (unlikely(ret))
+		return ret;
+
+	ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
+	return ret;
+}
+
+/*
+ * struct pnfs_osd_data_map {
+ *	u32	odm_num_comps;
+ *	u64	odm_stripe_unit;
+ *	u32	odm_group_width;
+ *	u32	odm_group_depth;
+ *	u32	odm_mirror_cnt;
+ *	u32	odm_raid_algorithm;
+ * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
+ */
+static inline int
+_osd_data_map_xdr_sz(void)
+{
+	return 4 + 8 + 4 + 4 + 4 + 4;
+}
+
+static __be32 *
+_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
+{
+	data_map->odm_num_comps = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
+	data_map->odm_group_width = be32_to_cpup(p++);
+	data_map->odm_group_depth = be32_to_cpup(p++);
+	data_map->odm_mirror_cnt = be32_to_cpup(p++);
+	data_map->odm_raid_algorithm = be32_to_cpup(p++);
+	dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
+		"odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
+		__func__,
+		data_map->odm_num_comps,
+		(unsigned long long)data_map->odm_stripe_unit,
+		data_map->odm_group_width,
+		data_map->odm_group_depth,
+		data_map->odm_mirror_cnt,
+		data_map->odm_raid_algorithm);
+	return p;
+}
+
+int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
+	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
+{
+	__be32 *p;
+
+	memset(iter, 0, sizeof(*iter));
+
+	p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
+	if (unlikely(!p))
+		return -EINVAL;
+
+	p = _osd_xdr_decode_data_map(p, &layout->olo_map);
+	layout->olo_comps_index = be32_to_cpup(p++);
+	layout->olo_num_comps = be32_to_cpup(p++);
+	dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
+		layout->olo_comps_index, layout->olo_num_comps);
+
+	iter->total_comps = layout->olo_num_comps;
+	return 0;
+}
+
+bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
+	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
+	int *err)
+{
+	BUG_ON(iter->decoded_comps > iter->total_comps);
+	if (iter->decoded_comps == iter->total_comps)
+		return false;
+
+	*err = _osd_xdr_decode_object_cred(comp, xdr);
+	if (unlikely(*err)) {
+		dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
+			"total_comps=%d\n", __func__, *err,
+			iter->decoded_comps, iter->total_comps);
+		return false; /* stop the loop */
+	}
+	dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
+		"key_len=%u cap_len=%u\n",
+		__func__,
+		_DEVID_LO(&comp->oc_object_id.oid_device_id),
+		_DEVID_HI(&comp->oc_object_id.oid_device_id),
+		comp->oc_object_id.oid_partition_id,
+		comp->oc_object_id.oid_object_id,
+		comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
+
+	iter->decoded_comps++;
+	return true;
+}
+
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, all
+ *       variable strings fields are left inside the rpc buffer and are only
+ *       pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ *       should not be freed while the returned information is in use.
+ */
+/*
+ *struct nfs4_string {
+ *	unsigned int len;
+ *	char *data;
+ *}; // size [variable]
+ * NOTE: Returned string points to inside the XDR buffer
+ */
+static __be32 *
+__read_u8_opaque(__be32 *p, struct nfs4_string *str)
+{
+	str->len = be32_to_cpup(p++);
+	str->data = (char *)p;
+
+	p += XDR_QUADLEN(str->len);
+	return p;
+}
+
+/*
+ * struct pnfs_osd_targetid {
+ *	u32			oti_type;
+ *	struct nfs4_string	oti_scsi_device_id;
+ * };// size 4 + [variable]
+ */
+static __be32 *
+__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
+{
+	u32 oti_type;
+
+	oti_type = be32_to_cpup(p++);
+	targetid->oti_type = oti_type;
+
+	switch (oti_type) {
+	case OBJ_TARGET_SCSI_NAME:
+	case OBJ_TARGET_SCSI_DEVICE_ID:
+		p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
+	}
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_net_addr {
+ *	struct nfs4_string	r_netid;
+ *	struct nfs4_string	r_addr;
+ * };
+ */
+static __be32 *
+__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
+{
+	p = __read_u8_opaque(p, &netaddr->r_netid);
+	p = __read_u8_opaque(p, &netaddr->r_addr);
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_targetaddr {
+ *	u32				ota_available;
+ *	struct pnfs_osd_net_addr	ota_netaddr;
+ * };
+ */
+static __be32 *
+__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
+{
+	u32 ota_available;
+
+	ota_available = be32_to_cpup(p++);
+	targetaddr->ota_available = ota_available;
+
+	if (ota_available)
+		p = __read_net_addr(p, &targetaddr->ota_netaddr);
+
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_deviceaddr {
+ *	struct pnfs_osd_targetid	oda_targetid;
+ *	struct pnfs_osd_targetaddr	oda_targetaddr;
+ *	u8				oda_lun[8];
+ *	struct nfs4_string		oda_systemid;
+ *	struct pnfs_osd_object_cred	oda_root_obj_cred;
+ *	struct nfs4_string		oda_osdname;
+ * };
+ */
+
+/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
+ * not have an xdr_stream
+ */
+static __be32 *
+__read_opaque_cred(__be32 *p,
+			      struct pnfs_osd_opaque_cred *opaque_cred)
+{
+	opaque_cred->cred_len = be32_to_cpu(*p++);
+	opaque_cred->cred = p;
+	return p + XDR_QUADLEN(opaque_cred->cred_len);
+}
+
+static __be32 *
+__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
+{
+	p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+	comp->oc_osd_version = be32_to_cpup(p++);
+	comp->oc_cap_key_sec = be32_to_cpup(p++);
+
+	p = __read_opaque_cred(p, &comp->oc_cap_key);
+	p = __read_opaque_cred(p, &comp->oc_cap);
+	return p;
+}
+
+void pnfs_osd_xdr_decode_deviceaddr(
+	struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
+{
+	p = __read_targetid(p, &deviceaddr->oda_targetid);
+
+	p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
+
+	p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
+				    sizeof(deviceaddr->oda_lun));
+
+	p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
+
+	p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
+
+	p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
+
+	/* libosd likes this terminated in dbg. It's last, so no problems */
+	deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
+}
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ *	u32	dsu_valid;
+ *	s64	dsu_delta;
+ *	u32	olu_ioerr_flag;
+ * }; xdr size 4 + 8 + 4
+ */
+int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+				 struct pnfs_osd_layoutupdate *lou)
+{
+	__be32 *p = xdr_reserve_space(xdr,  4 + 8 + 4);
+
+	if (!p)
+		return -E2BIG;
+
+	*p++ = cpu_to_be32(lou->dsu_valid);
+	if (lou->dsu_valid)
+		p = xdr_encode_hyper(p, lou->dsu_delta);
+	*p++ = cpu_to_be32(lou->olu_ioerr_flag);
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_objid {
+ *	struct nfs4_deviceid	oid_device_id;
+ *	u64			oid_partition_id;
+ *	u64			oid_object_id;
+ * }; // xdr size 32 bytes
+ */
+static inline __be32 *
+pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
+{
+	p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
+				    sizeof(object_id->oid_device_id.data));
+	p = xdr_encode_hyper(p, object_id->oid_partition_id);
+	p = xdr_encode_hyper(p, object_id->oid_object_id);
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ *	struct pnfs_osd_objid	oer_component;
+ *	u64			oer_comp_offset;
+ *	u64			oer_comp_length;
+ *	u32			oer_iswrite;
+ *	u32			oer_errno;
+ * }; // xdr size 32 + 24 bytes
+ */
+void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
+{
+	p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
+	p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
+	p = xdr_encode_hyper(p, ioerr->oer_comp_length);
+	*p++ = cpu_to_be32(ioerr->oer_iswrite);
+	*p   = cpu_to_be32(ioerr->oer_errno);
+}
+
+__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 32 + 24);
+	if (unlikely(!p))
+		dprintk("%s: out of xdr space\n", __func__);
+
+	return p;
+}
diff --git a/kernel/fs/nfs/pagelist.c b/kernel/fs/nfs/pagelist.c
new file mode 100644
index 000000000..282b39369
--- /dev/null
+++ b/kernel/fs/nfs/pagelist.c
@@ -0,0 +1,1309 @@
+/*
+ * linux/fs/nfs/pagelist.c
+ *
+ * A set of helper functions for managing NFS read and write requests.
+ * The main purpose of these routines is to provide support for the
+ * coalescing of several requests into a single RPC call.
+ *
+ * Copyright 2000, 2001 (c) Trond Myklebust <trond.myklebust@fys.uio.no>
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_page.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/export.h>
+
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PAGECACHE
+
+static struct kmem_cache *nfs_page_cachep;
+static const struct rpc_call_ops nfs_pgio_common_ops;
+
+static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
+{
+	p->npages = pagecount;
+	if (pagecount <= ARRAY_SIZE(p->page_array))
+		p->pagevec = p->page_array;
+	else {
+		p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+		if (!p->pagevec)
+			p->npages = 0;
+	}
+	return p->pagevec != NULL;
+}
+
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
+{
+	return nfs_pgio_has_mirroring(desc) ?
+		&desc->pg_mirrors[desc->pg_mirror_idx] :
+		&desc->pg_mirrors[0];
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
+
+void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+		       struct nfs_pgio_header *hdr,
+		       void (*release)(struct nfs_pgio_header *hdr))
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+	hdr->req = nfs_list_entry(mirror->pg_list.next);
+	hdr->inode = desc->pg_inode;
+	hdr->cred = hdr->req->wb_context->cred;
+	hdr->io_start = req_offset(hdr->req);
+	hdr->good_bytes = mirror->pg_count;
+	hdr->dreq = desc->pg_dreq;
+	hdr->layout_private = desc->pg_layout_private;
+	hdr->release = release;
+	hdr->completion_ops = desc->pg_completion_ops;
+	if (hdr->completion_ops->init_hdr)
+		hdr->completion_ops->init_hdr(hdr);
+
+	hdr->pgio_mirror_idx = desc->pg_mirror_idx;
+}
+EXPORT_SYMBOL_GPL(nfs_pgheader_init);
+
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
+{
+	spin_lock(&hdr->lock);
+	if (pos < hdr->io_start + hdr->good_bytes) {
+		set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+		clear_bit(NFS_IOHDR_EOF, &hdr->flags);
+		hdr->good_bytes = pos - hdr->io_start;
+		hdr->error = error;
+	}
+	spin_unlock(&hdr->lock);
+}
+
+static inline struct nfs_page *
+nfs_page_alloc(void)
+{
+	struct nfs_page	*p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO);
+	if (p)
+		INIT_LIST_HEAD(&p->wb_list);
+	return p;
+}
+
+static inline void
+nfs_page_free(struct nfs_page *p)
+{
+	kmem_cache_free(nfs_page_cachep, p);
+}
+
+static void
+nfs_iocounter_inc(struct nfs_io_counter *c)
+{
+	atomic_inc(&c->io_count);
+}
+
+static void
+nfs_iocounter_dec(struct nfs_io_counter *c)
+{
+	if (atomic_dec_and_test(&c->io_count)) {
+		clear_bit(NFS_IO_INPROGRESS, &c->flags);
+		smp_mb__after_atomic();
+		wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
+	}
+}
+
+static int
+__nfs_iocounter_wait(struct nfs_io_counter *c)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
+	DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
+	int ret = 0;
+
+	do {
+		prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
+		set_bit(NFS_IO_INPROGRESS, &c->flags);
+		if (atomic_read(&c->io_count) == 0)
+			break;
+		ret = nfs_wait_bit_killable(&q.key);
+	} while (atomic_read(&c->io_count) != 0 && !ret);
+	finish_wait(wq, &q.wait);
+	return ret;
+}
+
+/**
+ * nfs_iocounter_wait - wait for i/o to complete
+ * @c: nfs_io_counter to use
+ *
+ * returns -ERESTARTSYS if interrupted by a fatal signal.
+ * Otherwise returns 0 once the io_count hits 0.
+ */
+int
+nfs_iocounter_wait(struct nfs_io_counter *c)
+{
+	if (atomic_read(&c->io_count) == 0)
+		return 0;
+	return __nfs_iocounter_wait(c);
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req - request in group that is to be locked
+ * @nonblock - if true don't block waiting for lock
+ *
+ * this lock must be held if modifying the page group list
+ *
+ * return 0 on success, < 0 on error: -EDELAY if nonblocking or the
+ * result from wait_on_bit_lock
+ *
+ * NOTE: calling with nonblock=false should always have set the
+ *       lock bit (see fs/buffer.c and other uses of wait_on_bit_lock
+ *       with TASK_UNINTERRUPTIBLE), so there is no need to check the result.
+ */
+int
+nfs_page_group_lock(struct nfs_page *req, bool nonblock)
+{
+	struct nfs_page *head = req->wb_head;
+
+	WARN_ON_ONCE(head != head->wb_head);
+
+	if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
+		return 0;
+
+	if (!nonblock)
+		return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+				TASK_UNINTERRUPTIBLE);
+
+	return -EAGAIN;
+}
+
+/*
+ * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
+ * @req - a request in the group
+ *
+ * This is a blocking call to wait for the group lock to be cleared.
+ */
+void
+nfs_page_group_lock_wait(struct nfs_page *req)
+{
+	struct nfs_page *head = req->wb_head;
+
+	WARN_ON_ONCE(head != head->wb_head);
+
+	wait_on_bit(&head->wb_flags, PG_HEADLOCK,
+		TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req - request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+	struct nfs_page *head = req->wb_head;
+
+	WARN_ON_ONCE(head != head->wb_head);
+
+	smp_mb__before_atomic();
+	clear_bit(PG_HEADLOCK, &head->wb_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_sync_on_bit_locked
+ *
+ * must be called with page group lock held
+ */
+static bool
+nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+{
+	struct nfs_page *head = req->wb_head;
+	struct nfs_page *tmp;
+
+	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+	WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+
+	tmp = req->wb_this_page;
+	while (tmp != req) {
+		if (!test_bit(bit, &tmp->wb_flags))
+			return false;
+		tmp = tmp->wb_this_page;
+	}
+
+	/* true! reset all bits */
+	tmp = req;
+	do {
+		clear_bit(bit, &tmp->wb_flags);
+		tmp = tmp->wb_this_page;
+	} while (tmp != req);
+
+	return true;
+}
+
+/*
+ * nfs_page_group_sync_on_bit - set bit on current request, but only
+ *   return true if the bit is set for all requests in page group
+ * @req - request in page group
+ * @bit - PG_* bit that is used to sync page group
+ */
+bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+{
+	bool ret;
+
+	nfs_page_group_lock(req, false);
+	ret = nfs_page_group_sync_on_bit_locked(req, bit);
+	nfs_page_group_unlock(req);
+
+	return ret;
+}
+
+/*
+ * nfs_page_group_init - Initialize the page group linkage for @req
+ * @req - a new nfs request
+ * @prev - the previous request in page group, or NULL if @req is the first
+ *         or only request in the group (the head).
+ */
+static inline void
+nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+{
+	struct inode *inode;
+	WARN_ON_ONCE(prev == req);
+
+	if (!prev) {
+		/* a head request */
+		req->wb_head = req;
+		req->wb_this_page = req;
+	} else {
+		/* a subrequest */
+		WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
+		WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
+		req->wb_head = prev->wb_head;
+		req->wb_this_page = prev->wb_this_page;
+		prev->wb_this_page = req;
+
+		/* All subrequests take a ref on the head request until
+		 * nfs_page_group_destroy is called */
+		kref_get(&req->wb_head->wb_kref);
+
+		/* grab extra ref and bump the request count if head request
+		 * has extra ref from the write/commit path to handle handoff
+		 * between write and commit lists. */
+		if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) {
+			inode = page_file_mapping(req->wb_page)->host;
+			set_bit(PG_INODE_REF, &req->wb_flags);
+			kref_get(&req->wb_kref);
+			spin_lock(&inode->i_lock);
+			NFS_I(inode)->nrequests++;
+			spin_unlock(&inode->i_lock);
+		}
+	}
+}
+
+/*
+ * nfs_page_group_destroy - sync the destruction of page groups
+ * @req - request that no longer needs the page group
+ *
+ * releases the page group reference from each member once all
+ * members have called this function.
+ */
+static void
+nfs_page_group_destroy(struct kref *kref)
+{
+	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+	struct nfs_page *tmp, *next;
+
+	/* subrequests must release the ref on the head request */
+	if (req->wb_head != req)
+		nfs_release_request(req->wb_head);
+
+	if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
+		return;
+
+	tmp = req;
+	do {
+		next = tmp->wb_this_page;
+		/* unlink and free */
+		tmp->wb_this_page = tmp;
+		tmp->wb_head = tmp;
+		nfs_free_request(tmp);
+		tmp = next;
+	} while (tmp != req);
+}
+
+/**
+ * nfs_create_request - Create an NFS read/write request.
+ * @ctx: open context to use
+ * @page: page to write
+ * @last: last nfs request created for this page group or NULL if head
+ * @offset: starting offset within the page for the write
+ * @count: number of bytes to read/write
+ *
+ * The page must be locked by the caller. This makes sure we never
+ * create two different requests for the same page.
+ * User should ensure it is safe to sleep in this function.
+ */
+struct nfs_page *
+nfs_create_request(struct nfs_open_context *ctx, struct page *page,
+		   struct nfs_page *last, unsigned int offset,
+		   unsigned int count)
+{
+	struct nfs_page		*req;
+	struct nfs_lock_context *l_ctx;
+
+	if (test_bit(NFS_CONTEXT_BAD, &ctx->flags))
+		return ERR_PTR(-EBADF);
+	/* try to allocate the request struct */
+	req = nfs_page_alloc();
+	if (req == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* get lock context early so we can deal with alloc failures */
+	l_ctx = nfs_get_lock_context(ctx);
+	if (IS_ERR(l_ctx)) {
+		nfs_page_free(req);
+		return ERR_CAST(l_ctx);
+	}
+	req->wb_lock_context = l_ctx;
+	nfs_iocounter_inc(&l_ctx->io_count);
+
+	/* Initialize the request struct. Initially, we assume a
+	 * long write-back delay. This will be adjusted in
+	 * update_nfs_request below if the region is not locked. */
+	req->wb_page    = page;
+	req->wb_index	= page_file_index(page);
+	page_cache_get(page);
+	req->wb_offset  = offset;
+	req->wb_pgbase	= offset;
+	req->wb_bytes   = count;
+	req->wb_context = get_nfs_open_context(ctx);
+	kref_init(&req->wb_kref);
+	nfs_page_group_init(req, last);
+	return req;
+}
+
+/**
+ * nfs_unlock_request - Unlock request and wake up sleepers.
+ * @req:
+ */
+void nfs_unlock_request(struct nfs_page *req)
+{
+	if (!NFS_WBACK_BUSY(req)) {
+		printk(KERN_ERR "NFS: Invalid unlock attempted\n");
+		BUG();
+	}
+	smp_mb__before_atomic();
+	clear_bit(PG_BUSY, &req->wb_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&req->wb_flags, PG_BUSY);
+}
+
+/**
+ * nfs_unlock_and_release_request - Unlock request and release the nfs_page
+ * @req:
+ */
+void nfs_unlock_and_release_request(struct nfs_page *req)
+{
+	nfs_unlock_request(req);
+	nfs_release_request(req);
+}
+
+/*
+ * nfs_clear_request - Free up all resources allocated to the request
+ * @req:
+ *
+ * Release page and open context resources associated with a read/write
+ * request after it has completed.
+ */
+static void nfs_clear_request(struct nfs_page *req)
+{
+	struct page *page = req->wb_page;
+	struct nfs_open_context *ctx = req->wb_context;
+	struct nfs_lock_context *l_ctx = req->wb_lock_context;
+
+	if (page != NULL) {
+		page_cache_release(page);
+		req->wb_page = NULL;
+	}
+	if (l_ctx != NULL) {
+		nfs_iocounter_dec(&l_ctx->io_count);
+		nfs_put_lock_context(l_ctx);
+		req->wb_lock_context = NULL;
+	}
+	if (ctx != NULL) {
+		put_nfs_open_context(ctx);
+		req->wb_context = NULL;
+	}
+}
+
+/**
+ * nfs_release_request - Release the count on an NFS read/write request
+ * @req: request to release
+ *
+ * Note: Should never be called with the spinlock held!
+ */
+void nfs_free_request(struct nfs_page *req)
+{
+	WARN_ON_ONCE(req->wb_this_page != req);
+
+	/* extra debug: make sure no sync bits are still set */
+	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+	WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
+	WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
+	WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
+	WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
+
+	/* Release struct file and open context */
+	nfs_clear_request(req);
+	nfs_page_free(req);
+}
+
+void nfs_release_request(struct nfs_page *req)
+{
+	kref_put(&req->wb_kref, nfs_page_group_destroy);
+}
+
+/**
+ * nfs_wait_on_request - Wait for a request to complete.
+ * @req: request to wait upon.
+ *
+ * Interruptible by fatal signals only.
+ * The user is responsible for holding a count on the request.
+ */
+int
+nfs_wait_on_request(struct nfs_page *req)
+{
+	return wait_on_bit_io(&req->wb_flags, PG_BUSY,
+			      TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * nfs_generic_pg_test - determine if requests can be coalesced
+ * @desc: pointer to descriptor
+ * @prev: previous request in desc, or NULL
+ * @req: this request
+ *
+ * Returns zero if @req can be coalesced into @desc, otherwise it returns
+ * the size of the request.
+ */
+size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
+			   struct nfs_page *prev, struct nfs_page *req)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+	if (mirror->pg_count > mirror->pg_bsize) {
+		/* should never happen */
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	/*
+	 * Limit the request size so that we can still allocate a page array
+	 * for it without upsetting the slab allocator.
+	 */
+	if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+			sizeof(struct page) > PAGE_SIZE)
+		return 0;
+
+	return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
+}
+EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
+
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
+{
+	struct nfs_pgio_header *hdr = ops->rw_alloc_header();
+
+	if (hdr) {
+		INIT_LIST_HEAD(&hdr->pages);
+		spin_lock_init(&hdr->lock);
+		hdr->rw_ops = ops;
+	}
+	return hdr;
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
+
+/*
+ * nfs_pgio_header_free - Free a read or write header
+ * @hdr: The header to free
+ */
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
+{
+	hdr->rw_ops->rw_free_header(hdr);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
+
+/**
+ * nfs_pgio_data_destroy - make @hdr suitable for reuse
+ *
+ * Frees memory and releases refs from nfs_generic_pgio, so that it may
+ * be called again.
+ *
+ * @hdr: A header that has had nfs_generic_pgio called
+ */
+void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
+{
+	if (hdr->args.context)
+		put_nfs_open_context(hdr->args.context);
+	if (hdr->page_array.pagevec != hdr->page_array.page_array)
+		kfree(hdr->page_array.pagevec);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
+
+/**
+ * nfs_pgio_rpcsetup - Set up arguments for a pageio call
+ * @hdr: The pageio hdr
+ * @count: Number of bytes to read
+ * @offset: Initial offset
+ * @how: How to commit data (writes only)
+ * @cinfo: Commit information for the call (writes only)
+ */
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
+			      unsigned int count, unsigned int offset,
+			      int how, struct nfs_commit_info *cinfo)
+{
+	struct nfs_page *req = hdr->req;
+
+	/* Set up the RPC argument and reply structs
+	 * NB: take care not to mess about with hdr->commit et al. */
+
+	hdr->args.fh     = NFS_FH(hdr->inode);
+	hdr->args.offset = req_offset(req) + offset;
+	/* pnfs_set_layoutcommit needs this */
+	hdr->mds_offset = hdr->args.offset;
+	hdr->args.pgbase = req->wb_pgbase + offset;
+	hdr->args.pages  = hdr->page_array.pagevec;
+	hdr->args.count  = count;
+	hdr->args.context = get_nfs_open_context(req->wb_context);
+	hdr->args.lock_context = req->wb_lock_context;
+	hdr->args.stable  = NFS_UNSTABLE;
+	switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
+	case 0:
+		break;
+	case FLUSH_COND_STABLE:
+		if (nfs_reqs_to_commit(cinfo))
+			break;
+	default:
+		hdr->args.stable = NFS_FILE_SYNC;
+	}
+
+	hdr->res.fattr   = &hdr->fattr;
+	hdr->res.count   = count;
+	hdr->res.eof     = 0;
+	hdr->res.verf    = &hdr->verf;
+	nfs_fattr_init(&hdr->fattr);
+}
+
+/**
+ * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
+ * @task: The current task
+ * @calldata: pageio header to prepare
+ */
+static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_pgio_header *hdr = calldata;
+	int err;
+	err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
+	if (err)
+		rpc_exit(task, err);
+}
+
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+		      const struct rpc_call_ops *call_ops, int how, int flags)
+{
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_argp = &hdr->args,
+		.rpc_resp = &hdr->res,
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clnt,
+		.task = &hdr->task,
+		.rpc_message = &msg,
+		.callback_ops = call_ops,
+		.callback_data = hdr,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC | flags,
+	};
+	int ret = 0;
+
+	hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
+
+	dprintk("NFS: %5u initiated pgio call "
+		"(req %s/%llu, %u bytes @ offset %llu)\n",
+		hdr->task.tk_pid,
+		hdr->inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(hdr->inode),
+		hdr->args.count,
+		(unsigned long long)hdr->args.offset);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
+	rpc_put_task(task);
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
+
+/**
+ * nfs_pgio_error - Clean up from a pageio error
+ * @desc: IO descriptor
+ * @hdr: pageio header
+ */
+static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
+			  struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_mirror *mirror;
+	u32 midx;
+
+	set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	nfs_pgio_data_destroy(hdr);
+	hdr->completion_ops->completion(hdr);
+	/* TODO: Make sure it's right to clean up all mirrors here
+	 *       and not just hdr->pgio_mirror_idx */
+	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+		mirror = &desc->pg_mirrors[midx];
+		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+	}
+	return -ENOMEM;
+}
+
+/**
+ * nfs_pgio_release - Release pageio data
+ * @calldata: The pageio header to release
+ */
+static void nfs_pgio_release(void *calldata)
+{
+	struct nfs_pgio_header *hdr = calldata;
+	if (hdr->rw_ops->rw_release)
+		hdr->rw_ops->rw_release(hdr);
+	nfs_pgio_data_destroy(hdr);
+	hdr->completion_ops->completion(hdr);
+}
+
+static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
+				   unsigned int bsize)
+{
+	INIT_LIST_HEAD(&mirror->pg_list);
+	mirror->pg_bytes_written = 0;
+	mirror->pg_count = 0;
+	mirror->pg_bsize = bsize;
+	mirror->pg_base = 0;
+	mirror->pg_recoalesce = 0;
+}
+
+/**
+ * nfs_pageio_init - initialise a page io descriptor
+ * @desc: pointer to descriptor
+ * @inode: pointer to inode
+ * @doio: pointer to io function
+ * @bsize: io block size
+ * @io_flags: extra parameters for the io function
+ */
+void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
+		     struct inode *inode,
+		     const struct nfs_pageio_ops *pg_ops,
+		     const struct nfs_pgio_completion_ops *compl_ops,
+		     const struct nfs_rw_ops *rw_ops,
+		     size_t bsize,
+		     int io_flags)
+{
+	struct nfs_pgio_mirror *new;
+	int i;
+
+	desc->pg_moreio = 0;
+	desc->pg_inode = inode;
+	desc->pg_ops = pg_ops;
+	desc->pg_completion_ops = compl_ops;
+	desc->pg_rw_ops = rw_ops;
+	desc->pg_ioflags = io_flags;
+	desc->pg_error = 0;
+	desc->pg_lseg = NULL;
+	desc->pg_dreq = NULL;
+	desc->pg_layout_private = NULL;
+	desc->pg_bsize = bsize;
+
+	desc->pg_mirror_count = 1;
+	desc->pg_mirror_idx = 0;
+
+	if (pg_ops->pg_get_mirror_count) {
+		/* until we have a request, we don't have an lseg and no
+		 * idea how many mirrors there will be */
+		new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
+			      sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
+		desc->pg_mirrors_dynamic = new;
+		desc->pg_mirrors = new;
+
+		for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
+			nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
+	} else {
+		desc->pg_mirrors_dynamic = NULL;
+		desc->pg_mirrors = desc->pg_mirrors_static;
+		nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_init);
+
+/**
+ * nfs_pgio_result - Basic pageio error handling
+ * @task: The task that ran
+ * @calldata: Pageio header to check
+ */
+static void nfs_pgio_result(struct rpc_task *task, void *calldata)
+{
+	struct nfs_pgio_header *hdr = calldata;
+	struct inode *inode = hdr->inode;
+
+	dprintk("NFS: %s: %5u, (status %d)\n", __func__,
+		task->tk_pid, task->tk_status);
+
+	if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
+		return;
+	if (task->tk_status < 0)
+		nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
+	else
+		hdr->rw_ops->rw_result(task, hdr);
+}
+
+/*
+ * Create an RPC task for the given read or write request and kick it.
+ * The page must have been locked by the caller.
+ *
+ * It may happen that the page we're passed is not marked dirty.
+ * This is the case if nfs_updatepage detects a conflicting request
+ * that has been written but not committed.
+ */
+int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
+		     struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+	struct nfs_page		*req;
+	struct page		**pages,
+				*last_page;
+	struct list_head *head = &mirror->pg_list;
+	struct nfs_commit_info cinfo;
+	unsigned int pagecount, pageused;
+
+	pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
+	if (!nfs_pgarray_set(&hdr->page_array, pagecount))
+		return nfs_pgio_error(desc, hdr);
+
+	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
+	pages = hdr->page_array.pagevec;
+	last_page = NULL;
+	pageused = 0;
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_list_add_request(req, &hdr->pages);
+
+		if (!last_page || last_page != req->wb_page) {
+			pageused++;
+			if (pageused > pagecount)
+				break;
+			*pages++ = last_page = req->wb_page;
+		}
+	}
+	if (WARN_ON_ONCE(pageused != pagecount))
+		return nfs_pgio_error(desc, hdr);
+
+	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
+	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
+		desc->pg_ioflags &= ~FLUSH_COND_STABLE;
+
+	/* Set up the argument struct */
+	nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo);
+	desc->pg_rpc_callops = &nfs_pgio_common_ops;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_generic_pgio);
+
+static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_mirror *mirror;
+	struct nfs_pgio_header *hdr;
+	int ret;
+
+	mirror = nfs_pgio_current_mirror(desc);
+
+	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	if (!hdr) {
+		/* TODO: make sure this is right with mirroring - or
+		 *       should it back out all mirrors? */
+		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+		return -ENOMEM;
+	}
+	nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
+	ret = nfs_generic_pgio(desc, hdr);
+	if (ret == 0)
+		ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
+					hdr,
+					hdr->cred,
+					NFS_PROTO(hdr->inode),
+					desc->pg_rpc_callops,
+					desc->pg_ioflags, 0);
+	return ret;
+}
+
+/*
+ * nfs_pageio_setup_mirroring - determine if mirroring is to be used
+ *				by calling the pg_get_mirror_count op
+ */
+static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
+				       struct nfs_page *req)
+{
+	int mirror_count = 1;
+
+	if (!pgio->pg_ops->pg_get_mirror_count)
+		return 0;
+
+	mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+
+	if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
+		return -EINVAL;
+
+	pgio->pg_mirror_count = mirror_count;
+
+	return 0;
+}
+
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+	pgio->pg_mirror_count = 1;
+	pgio->pg_mirror_idx = 0;
+}
+
+static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+	pgio->pg_mirror_count = 1;
+	pgio->pg_mirror_idx = 0;
+	pgio->pg_mirrors = pgio->pg_mirrors_static;
+	kfree(pgio->pg_mirrors_dynamic);
+	pgio->pg_mirrors_dynamic = NULL;
+}
+
+static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+		const struct nfs_open_context *ctx2)
+{
+	return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+}
+
+static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
+		const struct nfs_lock_context *l2)
+{
+	return l1->lockowner.l_owner == l2->lockowner.l_owner
+		&& l1->lockowner.l_pid == l2->lockowner.l_pid;
+}
+
+/**
+ * nfs_can_coalesce_requests - test two requests for compatibility
+ * @prev: pointer to nfs_page
+ * @req: pointer to nfs_page
+ *
+ * The nfs_page structures 'prev' and 'req' are compared to ensure that the
+ * page data area they describe is contiguous, and that their RPC
+ * credentials, NFSv4 open state, and lockowners are the same.
+ *
+ * Return 'true' if this is the case, else return 'false'.
+ */
+static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+				      struct nfs_page *req,
+				      struct nfs_pageio_descriptor *pgio)
+{
+	size_t size;
+	struct file_lock_context *flctx;
+
+	if (prev) {
+		if (!nfs_match_open_context(req->wb_context, prev->wb_context))
+			return false;
+		flctx = d_inode(req->wb_context->dentry)->i_flctx;
+		if (flctx != NULL &&
+		    !(list_empty_careful(&flctx->flc_posix) &&
+		      list_empty_careful(&flctx->flc_flock)) &&
+		    !nfs_match_lock_context(req->wb_lock_context,
+					    prev->wb_lock_context))
+			return false;
+		if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+			return false;
+		if (req->wb_page == prev->wb_page) {
+			if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
+				return false;
+		} else {
+			if (req->wb_pgbase != 0 ||
+			    prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+				return false;
+		}
+	}
+	size = pgio->pg_ops->pg_test(pgio, prev, req);
+	WARN_ON_ONCE(size > req->wb_bytes);
+	if (size && size < req->wb_bytes)
+		req->wb_bytes = size;
+	return size > 0;
+}
+
+/**
+ * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * Returns true if the request 'req' was successfully coalesced into the
+ * existing list of pages 'desc'.
+ */
+static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+				     struct nfs_page *req)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+	struct nfs_page *prev = NULL;
+
+	if (mirror->pg_count != 0) {
+		prev = nfs_list_entry(mirror->pg_list.prev);
+	} else {
+		if (desc->pg_ops->pg_init)
+			desc->pg_ops->pg_init(desc, req);
+		mirror->pg_base = req->wb_pgbase;
+	}
+	if (!nfs_can_coalesce_requests(prev, req, desc))
+		return 0;
+	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &mirror->pg_list);
+	mirror->pg_count += req->wb_bytes;
+	return 1;
+}
+
+/*
+ * Helper for nfs_pageio_add_request and nfs_pageio_complete
+ */
+static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+	if (!list_empty(&mirror->pg_list)) {
+		int error = desc->pg_ops->pg_doio(desc);
+		if (error < 0)
+			desc->pg_error = error;
+		else
+			mirror->pg_bytes_written += mirror->pg_count;
+	}
+	if (list_empty(&mirror->pg_list)) {
+		mirror->pg_count = 0;
+		mirror->pg_base = 0;
+	}
+}
+
+/**
+ * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * This may split a request into subrequests which are all part of the
+ * same page group.
+ *
+ * Returns true if the request 'req' was successfully coalesced into the
+ * existing list of pages 'desc'.
+ */
+static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+			   struct nfs_page *req)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+	struct nfs_page *subreq;
+	unsigned int bytes_left = 0;
+	unsigned int offset, pgbase;
+
+	nfs_page_group_lock(req, false);
+
+	subreq = req;
+	bytes_left = subreq->wb_bytes;
+	offset = subreq->wb_offset;
+	pgbase = subreq->wb_pgbase;
+
+	do {
+		if (!nfs_pageio_do_add_request(desc, subreq)) {
+			/* make sure pg_test call(s) did nothing */
+			WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
+			WARN_ON_ONCE(subreq->wb_offset != offset);
+			WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
+
+			nfs_page_group_unlock(req);
+			desc->pg_moreio = 1;
+			nfs_pageio_doio(desc);
+			if (desc->pg_error < 0)
+				return 0;
+			if (mirror->pg_recoalesce)
+				return 0;
+			/* retry add_request for this subreq */
+			nfs_page_group_lock(req, false);
+			continue;
+		}
+
+		/* check for buggy pg_test call(s) */
+		WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
+		WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
+		WARN_ON_ONCE(subreq->wb_bytes == 0);
+
+		bytes_left -= subreq->wb_bytes;
+		offset += subreq->wb_bytes;
+		pgbase += subreq->wb_bytes;
+
+		if (bytes_left) {
+			subreq = nfs_create_request(req->wb_context,
+					req->wb_page,
+					subreq, pgbase, bytes_left);
+			if (IS_ERR(subreq))
+				goto err_ptr;
+			nfs_lock_request(subreq);
+			subreq->wb_offset  = offset;
+			subreq->wb_index = req->wb_index;
+		}
+	} while (bytes_left > 0);
+
+	nfs_page_group_unlock(req);
+	return 1;
+err_ptr:
+	desc->pg_error = PTR_ERR(subreq);
+	nfs_page_group_unlock(req);
+	return 0;
+}
+
+static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+	LIST_HEAD(head);
+
+	do {
+		list_splice_init(&mirror->pg_list, &head);
+		mirror->pg_bytes_written -= mirror->pg_count;
+		mirror->pg_count = 0;
+		mirror->pg_base = 0;
+		mirror->pg_recoalesce = 0;
+
+		desc->pg_moreio = 0;
+
+		while (!list_empty(&head)) {
+			struct nfs_page *req;
+
+			req = list_first_entry(&head, struct nfs_page, wb_list);
+			nfs_list_remove_request(req);
+			if (__nfs_pageio_add_request(desc, req))
+				continue;
+			if (desc->pg_error < 0)
+				return 0;
+			break;
+		}
+	} while (mirror->pg_recoalesce);
+	return 1;
+}
+
+static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
+		struct nfs_page *req)
+{
+	int ret;
+
+	do {
+		ret = __nfs_pageio_add_request(desc, req);
+		if (ret)
+			break;
+		if (desc->pg_error < 0)
+			break;
+		ret = nfs_do_recoalesce(desc);
+	} while (ret);
+
+	return ret;
+}
+
+int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+			   struct nfs_page *req)
+{
+	u32 midx;
+	unsigned int pgbase, offset, bytes;
+	struct nfs_page *dupreq, *lastreq;
+
+	pgbase = req->wb_pgbase;
+	offset = req->wb_offset;
+	bytes = req->wb_bytes;
+
+	nfs_pageio_setup_mirroring(desc, req);
+
+	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+		if (midx) {
+			nfs_page_group_lock(req, false);
+
+			/* find the last request */
+			for (lastreq = req->wb_head;
+			     lastreq->wb_this_page != req->wb_head;
+			     lastreq = lastreq->wb_this_page)
+				;
+
+			dupreq = nfs_create_request(req->wb_context,
+					req->wb_page, lastreq, pgbase, bytes);
+
+			if (IS_ERR(dupreq)) {
+				nfs_page_group_unlock(req);
+				return 0;
+			}
+
+			nfs_lock_request(dupreq);
+			nfs_page_group_unlock(req);
+			dupreq->wb_offset = offset;
+			dupreq->wb_index = req->wb_index;
+		} else
+			dupreq = req;
+
+		if (nfs_pgio_has_mirroring(desc))
+			desc->pg_mirror_idx = midx;
+		if (!nfs_pageio_add_request_mirror(desc, dupreq))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
+ *				nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ */
+static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
+				       u32 mirror_idx)
+{
+	struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
+	u32 restore_idx = desc->pg_mirror_idx;
+
+	if (nfs_pgio_has_mirroring(desc))
+		desc->pg_mirror_idx = mirror_idx;
+	for (;;) {
+		nfs_pageio_doio(desc);
+		if (!mirror->pg_recoalesce)
+			break;
+		if (!nfs_do_recoalesce(desc))
+			break;
+	}
+	desc->pg_mirror_idx = restore_idx;
+}
+
+/*
+ * nfs_pageio_resend - Transfer requests to new descriptor and resend
+ * @hdr - the pgio header to move request from
+ * @desc - the pageio descriptor to add requests to
+ *
+ * Try to move each request (nfs_page) from @hdr to @desc then attempt
+ * to send them.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
+		      struct nfs_pgio_header *hdr)
+{
+	LIST_HEAD(failed);
+
+	desc->pg_dreq = hdr->dreq;
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+		nfs_list_remove_request(req);
+		if (!nfs_pageio_add_request(desc, req))
+			nfs_list_add_request(req, &failed);
+	}
+	nfs_pageio_complete(desc);
+	if (!list_empty(&failed)) {
+		list_move(&failed, &hdr->pages);
+		return -EIO;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_resend);
+
+/**
+ * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ */
+void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
+{
+	u32 midx;
+
+	for (midx = 0; midx < desc->pg_mirror_count; midx++)
+		nfs_pageio_complete_mirror(desc, midx);
+
+	if (desc->pg_ops->pg_cleanup)
+		desc->pg_ops->pg_cleanup(desc);
+	nfs_pageio_cleanup_mirroring(desc);
+}
+
+/**
+ * nfs_pageio_cond_complete - Conditional I/O completion
+ * @desc: pointer to io descriptor
+ * @index: page index
+ *
+ * It is important to ensure that processes don't try to take locks
+ * on non-contiguous ranges of pages as that might deadlock. This
+ * function should be called before attempting to wait on a locked
+ * nfs_page. It will complete the I/O if the page index 'index'
+ * is not contiguous with the existing list of pages in 'desc'.
+ */
+void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
+{
+	struct nfs_pgio_mirror *mirror;
+	struct nfs_page *prev;
+	u32 midx;
+
+	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+		mirror = &desc->pg_mirrors[midx];
+		if (!list_empty(&mirror->pg_list)) {
+			prev = nfs_list_entry(mirror->pg_list.prev);
+			if (index != prev->wb_index + 1)
+				nfs_pageio_complete_mirror(desc, midx);
+		}
+	}
+}
+
+int __init nfs_init_nfspagecache(void)
+{
+	nfs_page_cachep = kmem_cache_create("nfs_page",
+					    sizeof(struct nfs_page),
+					    0, SLAB_HWCACHE_ALIGN,
+					    NULL);
+	if (nfs_page_cachep == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void nfs_destroy_nfspagecache(void)
+{
+	kmem_cache_destroy(nfs_page_cachep);
+}
+
+static const struct rpc_call_ops nfs_pgio_common_ops = {
+	.rpc_call_prepare = nfs_pgio_prepare,
+	.rpc_call_done = nfs_pgio_result,
+	.rpc_release = nfs_pgio_release,
+};
+
+const struct nfs_pageio_ops nfs_pgio_rw_ops = {
+	.pg_test = nfs_generic_pg_test,
+	.pg_doio = nfs_generic_pg_pgios,
+};
diff --git a/kernel/fs/nfs/pnfs.c b/kernel/fs/nfs/pnfs.c
new file mode 100644
index 000000000..230606243
--- /dev/null
+++ b/kernel/fs/nfs/pnfs.c
@@ -0,0 +1,2249 @@
+/*
+ *  pNFS functions to call and manage layout drivers.
+ *
+ *  Copyright (c) 2002 [year of first publication]
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+#include "internal.h"
+#include "pnfs.h"
+#include "iostat.h"
+#include "nfs4trace.h"
+#include "delegation.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS
+#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
+
+/* Locking:
+ *
+ * pnfs_spinlock:
+ *      protects pnfs_modules_tbl.
+ */
+static DEFINE_SPINLOCK(pnfs_spinlock);
+
+/*
+ * pnfs_modules_tbl holds all pnfs modules
+ */
+static LIST_HEAD(pnfs_modules_tbl);
+
+static int
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+		       enum pnfs_iomode iomode, bool sync);
+
+/* Return the registered pnfs layout driver module matching given id */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver_locked(u32 id)
+{
+	struct pnfs_layoutdriver_type *local;
+
+	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
+		if (local->id == id)
+			goto out;
+	local = NULL;
+out:
+	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
+	return local;
+}
+
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver(u32 id)
+{
+	struct pnfs_layoutdriver_type *local;
+
+	spin_lock(&pnfs_spinlock);
+	local = find_pnfs_driver_locked(id);
+	if (local != NULL && !try_module_get(local->owner)) {
+		dprintk("%s: Could not grab reference on module\n", __func__);
+		local = NULL;
+	}
+	spin_unlock(&pnfs_spinlock);
+	return local;
+}
+
+void
+unset_pnfs_layoutdriver(struct nfs_server *nfss)
+{
+	if (nfss->pnfs_curr_ld) {
+		if (nfss->pnfs_curr_ld->clear_layoutdriver)
+			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+		/* Decrement the MDS count. Purge the deviceid cache if zero */
+		if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
+			nfs4_deviceid_purge_client(nfss->nfs_client);
+		module_put(nfss->pnfs_curr_ld->owner);
+	}
+	nfss->pnfs_curr_ld = NULL;
+}
+
+/*
+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
+ * Currently only one pNFS layout driver per filesystem is supported.
+ *
+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ */
+void
+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
+		      u32 id)
+{
+	struct pnfs_layoutdriver_type *ld_type = NULL;
+
+	if (id == 0)
+		goto out_no_driver;
+	if (!(server->nfs_client->cl_exchange_flags &
+		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
+		printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
+			__func__, id, server->nfs_client->cl_exchange_flags);
+		goto out_no_driver;
+	}
+	ld_type = find_pnfs_driver(id);
+	if (!ld_type) {
+		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+		ld_type = find_pnfs_driver(id);
+		if (!ld_type) {
+			dprintk("%s: No pNFS module found for %u.\n",
+				__func__, id);
+			goto out_no_driver;
+		}
+	}
+	server->pnfs_curr_ld = ld_type;
+	if (ld_type->set_layoutdriver
+	    && ld_type->set_layoutdriver(server, mntfh)) {
+		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
+			"driver %u.\n", __func__, id);
+		module_put(ld_type->owner);
+		goto out_no_driver;
+	}
+	/* Bump the MDS count */
+	atomic_inc(&server->nfs_client->cl_mds_count);
+
+	dprintk("%s: pNFS module for %u set\n", __func__, id);
+	return;
+
+out_no_driver:
+	dprintk("%s: Using NFSv4 I/O\n", __func__);
+	server->pnfs_curr_ld = NULL;
+}
+
+int
+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+	int status = -EINVAL;
+	struct pnfs_layoutdriver_type *tmp;
+
+	if (ld_type->id == 0) {
+		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
+		return status;
+	}
+	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+		printk(KERN_ERR "NFS: %s Layout driver must provide "
+		       "alloc_lseg and free_lseg.\n", __func__);
+		return status;
+	}
+
+	spin_lock(&pnfs_spinlock);
+	tmp = find_pnfs_driver_locked(ld_type->id);
+	if (!tmp) {
+		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
+		status = 0;
+		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
+			ld_type->name);
+	} else {
+		printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
+			__func__, ld_type->id);
+	}
+	spin_unlock(&pnfs_spinlock);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+
+void
+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
+	spin_lock(&pnfs_spinlock);
+	list_del(&ld_type->pnfs_tblid);
+	spin_unlock(&pnfs_spinlock);
+}
+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
+
+/*
+ * pNFS client layout cache
+ */
+
+/* Need to hold i_lock if caller does not already hold reference */
+void
+pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	atomic_inc(&lo->plh_refcount);
+}
+
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+	return ld->alloc_layout_hdr(ino, gfp_flags);
+}
+
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
+	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
+
+	if (!list_empty(&lo->plh_layouts)) {
+		struct nfs_client *clp = server->nfs_client;
+
+		spin_lock(&clp->cl_lock);
+		list_del_init(&lo->plh_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
+	put_rpccred(lo->plh_lc_cred);
+	return ld->free_layout_hdr(lo);
+}
+
+static void
+pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
+	dprintk("%s: freeing layout cache %p\n", __func__, lo);
+	nfsi->layout = NULL;
+	/* Reset MDS Threshold I/O counters */
+	nfsi->write_io = 0;
+	nfsi->read_io = 0;
+}
+
+void
+pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct inode *inode = lo->plh_inode;
+
+	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+		if (!list_empty(&lo->plh_segs))
+			WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
+		pnfs_detach_layout_hdr(lo);
+		spin_unlock(&inode->i_lock);
+		pnfs_free_layout_hdr(lo);
+	}
+}
+
+static int
+pnfs_iomode_to_fail_bit(u32 iomode)
+{
+	return iomode == IOMODE_RW ?
+		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
+}
+
+static void
+pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+	lo->plh_retry_timestamp = jiffies;
+	if (!test_and_set_bit(fail_bit, &lo->plh_flags))
+		atomic_inc(&lo->plh_refcount);
+}
+
+static void
+pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+	if (test_and_clear_bit(fail_bit, &lo->plh_flags))
+		atomic_dec(&lo->plh_refcount);
+}
+
+static void
+pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+	struct inode *inode = lo->plh_inode;
+	struct pnfs_layout_range range = {
+		.iomode = iomode,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	LIST_HEAD(head);
+
+	spin_lock(&inode->i_lock);
+	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+	pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
+	spin_unlock(&inode->i_lock);
+	pnfs_free_lseg_list(&head);
+	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
+			iomode == IOMODE_RW ?  "RW" : "READ");
+}
+
+static bool
+pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+	unsigned long start, end;
+	int fail_bit = pnfs_iomode_to_fail_bit(iomode);
+
+	if (test_bit(fail_bit, &lo->plh_flags) == 0)
+		return false;
+	end = jiffies;
+	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
+	if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
+		/* It is time to retry the failed layoutgets */
+		pnfs_layout_clear_fail_bit(lo, fail_bit);
+		return false;
+	}
+	return true;
+}
+
+static void
+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+{
+	INIT_LIST_HEAD(&lseg->pls_list);
+	INIT_LIST_HEAD(&lseg->pls_lc_list);
+	atomic_set(&lseg->pls_refcount, 1);
+	smp_mb();
+	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+	lseg->pls_layout = lo;
+}
+
+static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct inode *ino = lseg->pls_layout->plh_inode;
+
+	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+}
+
+static void
+pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = lo->plh_inode;
+
+	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+	list_del_init(&lseg->pls_list);
+	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
+	atomic_dec(&lo->plh_refcount);
+	if (list_empty(&lo->plh_segs))
+		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
+}
+
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
+			struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_segment *s;
+
+	if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+		return false;
+
+	list_for_each_entry(s, &lo->plh_segs, pls_list)
+		if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
+			return false;
+
+	return true;
+}
+
+static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
+		struct pnfs_layout_hdr *lo, struct inode *inode)
+{
+	lo = lseg->pls_layout;
+	inode = lo->plh_inode;
+
+	spin_lock(&inode->i_lock);
+	if (pnfs_layout_need_return(lo, lseg)) {
+		nfs4_stateid stateid;
+		enum pnfs_iomode iomode;
+
+		stateid = lo->plh_stateid;
+		iomode = lo->plh_return_iomode;
+		/* decreased in pnfs_send_layoutreturn() */
+		lo->plh_block_lgets++;
+		lo->plh_return_iomode = 0;
+		spin_unlock(&inode->i_lock);
+		pnfs_get_layout_hdr(lo);
+
+		/* Send an async layoutreturn so we dont deadlock */
+		pnfs_send_layoutreturn(lo, stateid, iomode, false);
+	} else
+		spin_unlock(&inode->i_lock);
+}
+
+void
+pnfs_put_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_hdr *lo;
+	struct inode *inode;
+
+	if (!lseg)
+		return;
+
+	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+		atomic_read(&lseg->pls_refcount),
+		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+
+	/* Handle the case where refcount != 1 */
+	if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
+		return;
+
+	lo = lseg->pls_layout;
+	inode = lo->plh_inode;
+	/* Do we need a layoutreturn? */
+	if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+		pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
+
+	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+		pnfs_get_layout_hdr(lo);
+		pnfs_layout_remove_lseg(lo, lseg);
+		spin_unlock(&inode->i_lock);
+		pnfs_free_lseg(lseg);
+		pnfs_put_layout_hdr(lo);
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg);
+
+static void pnfs_free_lseg_async_work(struct work_struct *work)
+{
+	struct pnfs_layout_segment *lseg;
+	struct pnfs_layout_hdr *lo;
+
+	lseg = container_of(work, struct pnfs_layout_segment, pls_work);
+	lo = lseg->pls_layout;
+
+	pnfs_free_lseg(lseg);
+	pnfs_put_layout_hdr(lo);
+}
+
+static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
+{
+	INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
+	schedule_work(&lseg->pls_work);
+}
+
+void
+pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
+{
+	if (!lseg)
+		return;
+
+	assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
+
+	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+		atomic_read(&lseg->pls_refcount),
+		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+	if (atomic_dec_and_test(&lseg->pls_refcount)) {
+		struct pnfs_layout_hdr *lo = lseg->pls_layout;
+		pnfs_get_layout_hdr(lo);
+		pnfs_layout_remove_lseg(lo, lseg);
+		pnfs_free_lseg_async(lseg);
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
+
+static u64
+end_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	end = start + len;
+	return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/*
+ * is l2 fully contained in l1?
+ *   start1                             end1
+ *   [----------------------------------)
+ *           start2           end2
+ *           [----------------)
+ */
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+		 const struct pnfs_layout_range *l2)
+{
+	u64 start1 = l1->offset;
+	u64 end1 = end_offset(start1, l1->length);
+	u64 start2 = l2->offset;
+	u64 end2 = end_offset(start2, l2->length);
+
+	return (start1 <= start2) && (end1 >= end2);
+}
+
+/*
+ * is l1 and l2 intersecting?
+ *   start1                             end1
+ *   [----------------------------------)
+ *                              start2           end2
+ *                              [----------------)
+ */
+static bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+		    const struct pnfs_layout_range *l2)
+{
+	u64 start1 = l1->offset;
+	u64 end1 = end_offset(start1, l1->length);
+	u64 start2 = l2->offset;
+	u64 end2 = end_offset(start2, l2->length);
+
+	return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
+	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
+}
+
+static bool
+should_free_lseg(const struct pnfs_layout_range *lseg_range,
+		 const struct pnfs_layout_range *recall_range)
+{
+	return (recall_range->iomode == IOMODE_ANY ||
+		lseg_range->iomode == recall_range->iomode) &&
+	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
+}
+
+static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
+		struct list_head *tmp_list)
+{
+	if (!atomic_dec_and_test(&lseg->pls_refcount))
+		return false;
+	pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
+	list_add(&lseg->pls_list, tmp_list);
+	return true;
+}
+
+/* Returns 1 if lseg is removed from list, 0 otherwise */
+static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+			     struct list_head *tmp_list)
+{
+	int rv = 0;
+
+	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+		/* Remove the reference keeping the lseg in the
+		 * list.  It will now be removed when all
+		 * outstanding io is finished.
+		 */
+		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+			atomic_read(&lseg->pls_refcount));
+		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
+			rv = 1;
+	}
+	return rv;
+}
+
+/* Returns count of number of matching invalid lsegs remaining in list
+ * after call.
+ */
+int
+pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+			    struct list_head *tmp_list,
+			    struct pnfs_layout_range *recall_range)
+{
+	struct pnfs_layout_segment *lseg, *next;
+	int invalid = 0, removed = 0;
+
+	dprintk("%s:Begin lo %p\n", __func__, lo);
+
+	if (list_empty(&lo->plh_segs))
+		return 0;
+	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+		if (!recall_range ||
+		    should_free_lseg(&lseg->pls_range, recall_range)) {
+			dprintk("%s: freeing lseg %p iomode %d "
+				"offset %llu length %llu\n", __func__,
+				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
+				lseg->pls_range.length);
+			invalid++;
+			removed += mark_lseg_invalid(lseg, tmp_list);
+		}
+	dprintk("%s:Return %i\n", __func__, invalid - removed);
+	return invalid - removed;
+}
+
+/* note free_me must contain lsegs from a single layout_hdr */
+void
+pnfs_free_lseg_list(struct list_head *free_me)
+{
+	struct pnfs_layout_segment *lseg, *tmp;
+
+	if (list_empty(free_me))
+		return;
+
+	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
+		list_del(&lseg->pls_list);
+		pnfs_free_lseg(lseg);
+	}
+}
+
+void
+pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+	struct pnfs_layout_hdr *lo;
+	LIST_HEAD(tmp_list);
+
+	spin_lock(&nfsi->vfs_inode.i_lock);
+	lo = nfsi->layout;
+	if (lo) {
+		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
+		pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+		pnfs_get_layout_hdr(lo);
+		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
+		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
+		pnfs_clear_retry_layoutget(lo);
+		spin_unlock(&nfsi->vfs_inode.i_lock);
+		pnfs_free_lseg_list(&tmp_list);
+		pnfs_put_layout_hdr(lo);
+	} else
+		spin_unlock(&nfsi->vfs_inode.i_lock);
+}
+EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
+
+static bool
+pnfs_layout_add_bulk_destroy_list(struct inode *inode,
+		struct list_head *layout_list)
+{
+	struct pnfs_layout_hdr *lo;
+	bool ret = false;
+
+	spin_lock(&inode->i_lock);
+	lo = NFS_I(inode)->layout;
+	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
+		pnfs_get_layout_hdr(lo);
+		list_add(&lo->plh_bulk_destroy, layout_list);
+		ret = true;
+	}
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+/* Caller must hold rcu_read_lock and clp->cl_lock */
+static int
+pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
+		struct nfs_server *server,
+		struct list_head *layout_list)
+{
+	struct pnfs_layout_hdr *lo, *next;
+	struct inode *inode;
+
+	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
+		inode = igrab(lo->plh_inode);
+		if (inode == NULL)
+			continue;
+		list_del_init(&lo->plh_layouts);
+		if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
+			continue;
+		rcu_read_unlock();
+		spin_unlock(&clp->cl_lock);
+		iput(inode);
+		spin_lock(&clp->cl_lock);
+		rcu_read_lock();
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+static int
+pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
+		bool is_bulk_recall)
+{
+	struct pnfs_layout_hdr *lo;
+	struct inode *inode;
+	struct pnfs_layout_range range = {
+		.iomode = IOMODE_ANY,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	LIST_HEAD(lseg_list);
+	int ret = 0;
+
+	while (!list_empty(layout_list)) {
+		lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
+				plh_bulk_destroy);
+		dprintk("%s freeing layout for inode %lu\n", __func__,
+			lo->plh_inode->i_ino);
+		inode = lo->plh_inode;
+
+		pnfs_layoutcommit_inode(inode, false);
+
+		spin_lock(&inode->i_lock);
+		list_del_init(&lo->plh_bulk_destroy);
+		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
+		if (is_bulk_recall)
+			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+		if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
+			ret = -EAGAIN;
+		spin_unlock(&inode->i_lock);
+		pnfs_free_lseg_list(&lseg_list);
+		pnfs_put_layout_hdr(lo);
+		iput(inode);
+	}
+	return ret;
+}
+
+int
+pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+		struct nfs_fsid *fsid,
+		bool is_recall)
+{
+	struct nfs_server *server;
+	LIST_HEAD(layout_list);
+
+	spin_lock(&clp->cl_lock);
+	rcu_read_lock();
+restart:
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
+			continue;
+		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+				server,
+				&layout_list) != 0)
+			goto restart;
+	}
+	rcu_read_unlock();
+	spin_unlock(&clp->cl_lock);
+
+	if (list_empty(&layout_list))
+		return 0;
+	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+int
+pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+		bool is_recall)
+{
+	struct nfs_server *server;
+	LIST_HEAD(layout_list);
+
+	spin_lock(&clp->cl_lock);
+	rcu_read_lock();
+restart:
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+					server,
+					&layout_list) != 0)
+			goto restart;
+	}
+	rcu_read_unlock();
+	spin_unlock(&clp->cl_lock);
+
+	if (list_empty(&layout_list))
+		return 0;
+	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+/*
+ * Called by the state manger to remove all layouts established under an
+ * expired lease.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+	nfs4_deviceid_mark_client_invalid(clp);
+	nfs4_deviceid_purge_client(clp);
+
+	pnfs_destroy_layouts_byclid(clp, false);
+}
+
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+	return (s32)(s1 - s2) > 0;
+}
+
+/* update lo->plh_stateid with new if is more recent */
+void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+			bool update_barrier)
+{
+	u32 oldseq, newseq, new_barrier;
+	int empty = list_empty(&lo->plh_segs);
+
+	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+	newseq = be32_to_cpu(new->seqid);
+	if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
+		nfs4_stateid_copy(&lo->plh_stateid, new);
+		if (update_barrier) {
+			new_barrier = be32_to_cpu(new->seqid);
+		} else {
+			/* Because of wraparound, we want to keep the barrier
+			 * "close" to the current seqids.
+			 */
+			new_barrier = newseq - atomic_read(&lo->plh_outstanding);
+		}
+		if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+			lo->plh_barrier = new_barrier;
+	}
+}
+
+static bool
+pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
+		const nfs4_stateid *stateid)
+{
+	u32 seqid = be32_to_cpu(stateid->seqid);
+
+	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
+}
+
+static bool
+pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
+		      struct pnfs_layout_range *range)
+{
+	return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
+		(lo->plh_return_iomode == IOMODE_ANY ||
+		 lo->plh_return_iomode == range->iomode);
+}
+
+/* lget is set to 1 if called from inside send_layoutget call chain */
+static bool
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
+			struct pnfs_layout_range *range, int lget)
+{
+	return lo->plh_block_lgets ||
+		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+		(list_empty(&lo->plh_segs) &&
+		 (atomic_read(&lo->plh_outstanding) > lget)) ||
+		pnfs_layout_returning(lo, range);
+}
+
+int
+pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+			      struct pnfs_layout_range *range,
+			      struct nfs4_state *open_state)
+{
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+	spin_lock(&lo->plh_inode->i_lock);
+	if (pnfs_layoutgets_blocked(lo, range, 1)) {
+		status = -EAGAIN;
+	} else if (!nfs4_valid_open_stateid(open_state)) {
+		status = -EBADF;
+	} else if (list_empty(&lo->plh_segs) ||
+		   test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+		int seq;
+
+		do {
+			seq = read_seqbegin(&open_state->seqlock);
+			nfs4_stateid_copy(dst, &open_state->stateid);
+		} while (read_seqretry(&open_state->seqlock, seq));
+	} else
+		nfs4_stateid_copy(dst, &lo->plh_stateid);
+	spin_unlock(&lo->plh_inode->i_lock);
+	dprintk("<-- %s\n", __func__);
+	return status;
+}
+
+/*
+* Get layout from server.
+*    for now, assume that whole file layouts are requested.
+*    arg->offset: 0
+*    arg->length: all ones
+*/
+static struct pnfs_layout_segment *
+send_layoutget(struct pnfs_layout_hdr *lo,
+	   struct nfs_open_context *ctx,
+	   struct pnfs_layout_range *range,
+	   gfp_t gfp_flags)
+{
+	struct inode *ino = lo->plh_inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+	struct nfs4_layoutget *lgp;
+	struct pnfs_layout_segment *lseg;
+
+	dprintk("--> %s\n", __func__);
+
+	lgp = kzalloc(sizeof(*lgp), gfp_flags);
+	if (lgp == NULL)
+		return NULL;
+
+	lgp->args.minlength = PAGE_CACHE_SIZE;
+	if (lgp->args.minlength > range->length)
+		lgp->args.minlength = range->length;
+	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+	lgp->args.range = *range;
+	lgp->args.type = server->pnfs_curr_ld->id;
+	lgp->args.inode = ino;
+	lgp->args.ctx = get_nfs_open_context(ctx);
+	lgp->gfp_flags = gfp_flags;
+	lgp->cred = lo->plh_lc_cred;
+
+	/* Synchronously retrieve layout information from server and
+	 * store in lseg.
+	 */
+	lseg = nfs4_proc_layoutget(lgp, gfp_flags);
+	if (IS_ERR(lseg)) {
+		switch (PTR_ERR(lseg)) {
+		case -ENOMEM:
+		case -ERESTARTSYS:
+			break;
+		default:
+			/* remember that LAYOUTGET failed and suspend trying */
+			pnfs_layout_io_set_failed(lo, range->iomode);
+		}
+		return NULL;
+	} else
+		pnfs_layout_clear_fail_bit(lo,
+				pnfs_iomode_to_fail_bit(range->iomode));
+
+	return lseg;
+}
+
+static void pnfs_clear_layoutcommit(struct inode *inode,
+		struct list_head *head)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct pnfs_layout_segment *lseg, *tmp;
+
+	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+		return;
+	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
+		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+			continue;
+		pnfs_lseg_dec_and_remove_zero(lseg, head);
+	}
+}
+
+void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
+{
+	clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+}
+
+static int
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+		       enum pnfs_iomode iomode, bool sync)
+{
+	struct inode *ino = lo->plh_inode;
+	struct nfs4_layoutreturn *lrp;
+	int status = 0;
+
+	lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
+	if (unlikely(lrp == NULL)) {
+		status = -ENOMEM;
+		spin_lock(&ino->i_lock);
+		lo->plh_block_lgets--;
+		pnfs_clear_layoutreturn_waitbit(lo);
+		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
+		spin_unlock(&ino->i_lock);
+		pnfs_put_layout_hdr(lo);
+		goto out;
+	}
+
+	lrp->args.stateid = stateid;
+	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
+	lrp->args.inode = ino;
+	lrp->args.range.iomode = iomode;
+	lrp->args.range.offset = 0;
+	lrp->args.range.length = NFS4_MAX_UINT64;
+	lrp->args.layout = lo;
+	lrp->clp = NFS_SERVER(ino)->nfs_client;
+	lrp->cred = lo->plh_lc_cred;
+
+	status = nfs4_proc_layoutreturn(lrp, sync);
+out:
+	dprintk("<-- %s status: %d\n", __func__, status);
+	return status;
+}
+
+/*
+ * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
+ * when the layout segment list is empty.
+ *
+ * Note that a pnfs_layout_hdr can exist with an empty layout segment
+ * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
+ * deviceid is marked invalid.
+ */
+int
+_pnfs_return_layout(struct inode *ino)
+{
+	struct pnfs_layout_hdr *lo = NULL;
+	struct nfs_inode *nfsi = NFS_I(ino);
+	LIST_HEAD(tmp_list);
+	nfs4_stateid stateid;
+	int status = 0, empty;
+
+	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
+
+	spin_lock(&ino->i_lock);
+	lo = nfsi->layout;
+	if (!lo) {
+		spin_unlock(&ino->i_lock);
+		dprintk("NFS: %s no layout to return\n", __func__);
+		goto out;
+	}
+	stateid = nfsi->layout->plh_stateid;
+	/* Reference matched in nfs4_layoutreturn_release */
+	pnfs_get_layout_hdr(lo);
+	empty = list_empty(&lo->plh_segs);
+	pnfs_clear_layoutcommit(ino, &tmp_list);
+	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		struct pnfs_layout_range range = {
+			.iomode		= IOMODE_ANY,
+			.offset		= 0,
+			.length		= NFS4_MAX_UINT64,
+		};
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+	}
+
+	/* Don't send a LAYOUTRETURN if list was initially empty */
+	if (empty) {
+		spin_unlock(&ino->i_lock);
+		pnfs_put_layout_hdr(lo);
+		dprintk("NFS: %s no layout segments to return\n", __func__);
+		goto out;
+	}
+
+	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+	lo->plh_block_lgets++;
+	spin_unlock(&ino->i_lock);
+	pnfs_free_lseg_list(&tmp_list);
+
+	status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+out:
+	dprintk("<-- %s status: %d\n", __func__, status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(_pnfs_return_layout);
+
+int
+pnfs_commit_and_return_layout(struct inode *inode)
+{
+	struct pnfs_layout_hdr *lo;
+	int ret;
+
+	spin_lock(&inode->i_lock);
+	lo = NFS_I(inode)->layout;
+	if (lo == NULL) {
+		spin_unlock(&inode->i_lock);
+		return 0;
+	}
+	pnfs_get_layout_hdr(lo);
+	/* Block new layoutgets and read/write to ds */
+	lo->plh_block_lgets++;
+	spin_unlock(&inode->i_lock);
+	filemap_fdatawait(inode->i_mapping);
+	ret = pnfs_layoutcommit_inode(inode, true);
+	if (ret == 0)
+		ret = _pnfs_return_layout(inode);
+	spin_lock(&inode->i_lock);
+	lo->plh_block_lgets--;
+	spin_unlock(&inode->i_lock);
+	pnfs_put_layout_hdr(lo);
+	return ret;
+}
+
+bool pnfs_roc(struct inode *ino)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct nfs_open_context *ctx;
+	struct nfs4_state *state;
+	struct pnfs_layout_hdr *lo;
+	struct pnfs_layout_segment *lseg, *tmp;
+	nfs4_stateid stateid;
+	LIST_HEAD(tmp_list);
+	bool found = false, layoutreturn = false;
+
+	spin_lock(&ino->i_lock);
+	lo = nfsi->layout;
+	if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
+	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+		goto out_noroc;
+
+	/* Don't return layout if we hold a delegation */
+	if (nfs4_check_delegation(ino, FMODE_READ))
+		goto out_noroc;
+
+	list_for_each_entry(ctx, &nfsi->open_files, list) {
+		state = ctx->state;
+		/* Don't return layout if there is open file state */
+		if (state != NULL && state->state != 0)
+			goto out_noroc;
+	}
+
+	pnfs_clear_retry_layoutget(lo);
+	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+			mark_lseg_invalid(lseg, &tmp_list);
+			found = true;
+		}
+	if (!found)
+		goto out_noroc;
+	lo->plh_block_lgets++;
+	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
+	spin_unlock(&ino->i_lock);
+	pnfs_free_lseg_list(&tmp_list);
+	pnfs_layoutcommit_inode(ino, true);
+	return true;
+
+out_noroc:
+	if (lo) {
+		stateid = lo->plh_stateid;
+		layoutreturn =
+			test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+					   &lo->plh_flags);
+		if (layoutreturn) {
+			lo->plh_block_lgets++;
+			pnfs_get_layout_hdr(lo);
+		}
+	}
+	spin_unlock(&ino->i_lock);
+	if (layoutreturn) {
+		pnfs_layoutcommit_inode(ino, true);
+		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+	}
+	return false;
+}
+
+void pnfs_roc_release(struct inode *ino)
+{
+	struct pnfs_layout_hdr *lo;
+
+	spin_lock(&ino->i_lock);
+	lo = NFS_I(ino)->layout;
+	lo->plh_block_lgets--;
+	if (atomic_dec_and_test(&lo->plh_refcount)) {
+		pnfs_detach_layout_hdr(lo);
+		spin_unlock(&ino->i_lock);
+		pnfs_free_layout_hdr(lo);
+	} else
+		spin_unlock(&ino->i_lock);
+}
+
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+	struct pnfs_layout_hdr *lo;
+
+	spin_lock(&ino->i_lock);
+	lo = NFS_I(ino)->layout;
+	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
+		lo->plh_barrier = barrier;
+	spin_unlock(&ino->i_lock);
+}
+
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_hdr *lo;
+	struct pnfs_layout_segment *lseg;
+	nfs4_stateid stateid;
+	u32 current_seqid;
+	bool found = false, layoutreturn = false;
+
+	spin_lock(&ino->i_lock);
+	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
+		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+			rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+			found = true;
+			goto out;
+		}
+	lo = nfsi->layout;
+	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
+
+	/* Since close does not return a layout stateid for use as
+	 * a barrier, we choose the worst-case barrier.
+	 */
+	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+out:
+	if (!found) {
+		stateid = lo->plh_stateid;
+		layoutreturn =
+			test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+					   &lo->plh_flags);
+		if (layoutreturn) {
+			lo->plh_block_lgets++;
+			pnfs_get_layout_hdr(lo);
+		}
+	}
+	spin_unlock(&ino->i_lock);
+	if (layoutreturn) {
+		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
+	}
+	return found;
+}
+
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ */
+static s64
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+	   const struct pnfs_layout_range *l2)
+{
+	s64 d;
+
+	/* high offset > low offset */
+	d = l1->offset - l2->offset;
+	if (d)
+		return d;
+
+	/* short length > long length */
+	d = l2->length - l1->length;
+	if (d)
+		return d;
+
+	/* read > read/write */
+	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
+}
+
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_segment *lp;
+
+	dprintk("%s:Begin\n", __func__);
+
+	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
+		if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
+			continue;
+		list_add_tail(&lseg->pls_list, &lp->pls_list);
+		dprintk("%s: inserted lseg %p "
+			"iomode %d offset %llu length %llu before "
+			"lp %p iomode %d offset %llu length %llu\n",
+			__func__, lseg, lseg->pls_range.iomode,
+			lseg->pls_range.offset, lseg->pls_range.length,
+			lp, lp->pls_range.iomode, lp->pls_range.offset,
+			lp->pls_range.length);
+		goto out;
+	}
+	list_add_tail(&lseg->pls_list, &lo->plh_segs);
+	dprintk("%s: inserted lseg %p "
+		"iomode %d offset %llu length %llu at tail\n",
+		__func__, lseg, lseg->pls_range.iomode,
+		lseg->pls_range.offset, lseg->pls_range.length);
+out:
+	pnfs_get_layout_hdr(lo);
+
+	dprintk("%s:Return\n", __func__);
+}
+
+static struct pnfs_layout_hdr *
+alloc_init_layout_hdr(struct inode *ino,
+		      struct nfs_open_context *ctx,
+		      gfp_t gfp_flags)
+{
+	struct pnfs_layout_hdr *lo;
+
+	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
+	if (!lo)
+		return NULL;
+	atomic_set(&lo->plh_refcount, 1);
+	INIT_LIST_HEAD(&lo->plh_layouts);
+	INIT_LIST_HEAD(&lo->plh_segs);
+	INIT_LIST_HEAD(&lo->plh_bulk_destroy);
+	lo->plh_inode = ino;
+	lo->plh_lc_cred = get_rpccred(ctx->cred);
+	return lo;
+}
+
+static struct pnfs_layout_hdr *
+pnfs_find_alloc_layout(struct inode *ino,
+		       struct nfs_open_context *ctx,
+		       gfp_t gfp_flags)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_hdr *new = NULL;
+
+	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+
+	if (nfsi->layout != NULL)
+		goto out_existing;
+	spin_unlock(&ino->i_lock);
+	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
+	spin_lock(&ino->i_lock);
+
+	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
+		nfsi->layout = new;
+		return new;
+	} else if (new != NULL)
+		pnfs_free_layout_hdr(new);
+out_existing:
+	pnfs_get_layout_hdr(nfsi->layout);
+	return nfsi->layout;
+}
+
+/*
+ * iomode matching rules:
+ * iomode	lseg	match
+ * -----	-----	-----
+ * ANY		READ	true
+ * ANY		RW	true
+ * RW		READ	false
+ * RW		RW	true
+ * READ		READ	true
+ * READ		RW	true
+ */
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+		 const struct pnfs_layout_range *range)
+{
+	struct pnfs_layout_range range1;
+
+	if ((range->iomode == IOMODE_RW &&
+	     ls_range->iomode != IOMODE_RW) ||
+	    !pnfs_lseg_range_intersecting(ls_range, range))
+		return 0;
+
+	/* range1 covers only the first byte in the range */
+	range1 = *range;
+	range1.length = 1;
+	return pnfs_lseg_range_contained(ls_range, &range1);
+}
+
+/*
+ * lookup range in layout
+ */
+static struct pnfs_layout_segment *
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_range *range)
+{
+	struct pnfs_layout_segment *lseg, *ret = NULL;
+
+	dprintk("%s:Begin\n", __func__);
+
+	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+		    !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
+		    pnfs_lseg_range_match(&lseg->pls_range, range)) {
+			ret = pnfs_get_lseg(lseg);
+			break;
+		}
+		if (lseg->pls_range.offset > range->offset)
+			break;
+	}
+
+	dprintk("%s:Return lseg %p ref %d\n",
+		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
+	return ret;
+}
+
+/*
+ * Use mdsthreshold hints set at each OPEN to determine if I/O should go
+ * to the MDS or over pNFS
+ *
+ * The nfs_inode read_io and write_io fields are cumulative counters reset
+ * when there are no layout segments. Note that in pnfs_update_layout iomode
+ * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
+ * WRITE request.
+ *
+ * A return of true means use MDS I/O.
+ *
+ * From rfc 5661:
+ * If a file's size is smaller than the file size threshold, data accesses
+ * SHOULD be sent to the metadata server.  If an I/O request has a length that
+ * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
+ * server.  If both file size and I/O size are provided, the client SHOULD
+ * reach or exceed  both thresholds before sending its read or write
+ * requests to the data server.
+ */
+static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
+				     struct inode *ino, int iomode)
+{
+	struct nfs4_threshold *t = ctx->mdsthreshold;
+	struct nfs_inode *nfsi = NFS_I(ino);
+	loff_t fsize = i_size_read(ino);
+	bool size = false, size_set = false, io = false, io_set = false, ret = false;
+
+	if (t == NULL)
+		return ret;
+
+	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
+
+	switch (iomode) {
+	case IOMODE_READ:
+		if (t->bm & THRESHOLD_RD) {
+			dprintk("%s fsize %llu\n", __func__, fsize);
+			size_set = true;
+			if (fsize < t->rd_sz)
+				size = true;
+		}
+		if (t->bm & THRESHOLD_RD_IO) {
+			dprintk("%s nfsi->read_io %llu\n", __func__,
+				nfsi->read_io);
+			io_set = true;
+			if (nfsi->read_io < t->rd_io_sz)
+				io = true;
+		}
+		break;
+	case IOMODE_RW:
+		if (t->bm & THRESHOLD_WR) {
+			dprintk("%s fsize %llu\n", __func__, fsize);
+			size_set = true;
+			if (fsize < t->wr_sz)
+				size = true;
+		}
+		if (t->bm & THRESHOLD_WR_IO) {
+			dprintk("%s nfsi->write_io %llu\n", __func__,
+				nfsi->write_io);
+			io_set = true;
+			if (nfsi->write_io < t->wr_io_sz)
+				io = true;
+		}
+		break;
+	}
+	if (size_set && io_set) {
+		if (size && io)
+			ret = true;
+	} else if (size || io)
+		ret = true;
+
+	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
+	return ret;
+}
+
+/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
+static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
+{
+	if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
+		return 1;
+	return nfs_wait_bit_killable(key);
+}
+
+static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+	/*
+	 * send layoutcommit as it can hold up layoutreturn due to lseg
+	 * reference
+	 */
+	pnfs_layoutcommit_inode(lo->plh_inode, false);
+	return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
+				   pnfs_layoutget_retry_bit_wait,
+				   TASK_UNINTERRUPTIBLE);
+}
+
+static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
+{
+	unsigned long *bitlock = &lo->plh_flags;
+
+	clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
+	smp_mb__after_atomic();
+	wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
+}
+
+/*
+ * Layout segment is retreived from the server if not cached.
+ * The appropriate layout segment is referenced and returned to the caller.
+ */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino,
+		   struct nfs_open_context *ctx,
+		   loff_t pos,
+		   u64 count,
+		   enum pnfs_iomode iomode,
+		   gfp_t gfp_flags)
+{
+	struct pnfs_layout_range arg = {
+		.iomode = iomode,
+		.offset = pos,
+		.length = count,
+	};
+	unsigned pg_offset;
+	struct nfs_server *server = NFS_SERVER(ino);
+	struct nfs_client *clp = server->nfs_client;
+	struct pnfs_layout_hdr *lo;
+	struct pnfs_layout_segment *lseg = NULL;
+	bool first;
+
+	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+		goto out;
+
+	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+		goto out;
+
+lookup_again:
+	first = false;
+	spin_lock(&ino->i_lock);
+	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
+	if (lo == NULL) {
+		spin_unlock(&ino->i_lock);
+		goto out;
+	}
+
+	/* Do we even need to bother with this? */
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+		dprintk("%s matches recall, use MDS\n", __func__);
+		goto out_unlock;
+	}
+
+	/* if LAYOUTGET already failed once we don't try again */
+	if (pnfs_layout_io_test_failed(lo, iomode) &&
+	    !pnfs_should_retry_layoutget(lo))
+		goto out_unlock;
+
+	first = list_empty(&lo->plh_segs);
+	if (first) {
+		/* The first layoutget for the file. Need to serialize per
+		 * RFC 5661 Errata 3208.
+		 */
+		if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
+				     &lo->plh_flags)) {
+			spin_unlock(&ino->i_lock);
+			wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
+				    TASK_UNINTERRUPTIBLE);
+			pnfs_put_layout_hdr(lo);
+			goto lookup_again;
+		}
+	} else {
+		/* Check to see if the layout for the given range
+		 * already exists
+		 */
+		lseg = pnfs_find_lseg(lo, &arg);
+		if (lseg)
+			goto out_unlock;
+	}
+
+	/*
+	 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
+	 * for LAYOUTRETURN even if first is true.
+	 */
+	if (!lseg && pnfs_should_retry_layoutget(lo) &&
+	    test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+		spin_unlock(&ino->i_lock);
+		dprintk("%s wait for layoutreturn\n", __func__);
+		if (pnfs_prepare_to_retry_layoutget(lo)) {
+			if (first)
+				pnfs_clear_first_layoutget(lo);
+			pnfs_put_layout_hdr(lo);
+			dprintk("%s retrying\n", __func__);
+			goto lookup_again;
+		}
+		goto out_put_layout_hdr;
+	}
+
+	if (pnfs_layoutgets_blocked(lo, &arg, 0))
+		goto out_unlock;
+	atomic_inc(&lo->plh_outstanding);
+	spin_unlock(&ino->i_lock);
+
+	if (list_empty(&lo->plh_layouts)) {
+		/* The lo must be on the clp list if there is any
+		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
+		 */
+		spin_lock(&clp->cl_lock);
+		if (list_empty(&lo->plh_layouts))
+			list_add_tail(&lo->plh_layouts, &server->layouts);
+		spin_unlock(&clp->cl_lock);
+	}
+
+	pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+	if (pg_offset) {
+		arg.offset -= pg_offset;
+		arg.length += pg_offset;
+	}
+	if (arg.length != NFS4_MAX_UINT64)
+		arg.length = PAGE_CACHE_ALIGN(arg.length);
+
+	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
+	pnfs_clear_retry_layoutget(lo);
+	atomic_dec(&lo->plh_outstanding);
+out_put_layout_hdr:
+	if (first)
+		pnfs_clear_first_layoutget(lo);
+	pnfs_put_layout_hdr(lo);
+out:
+	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
+			"(%s, offset: %llu, length: %llu)\n",
+			__func__, ino->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(ino),
+			lseg == NULL ? "not found" : "found",
+			iomode==IOMODE_RW ?  "read/write" : "read-only",
+			(unsigned long long)pos,
+			(unsigned long long)count);
+	return lseg;
+out_unlock:
+	spin_unlock(&ino->i_lock);
+	goto out_put_layout_hdr;
+}
+EXPORT_SYMBOL_GPL(pnfs_update_layout);
+
+struct pnfs_layout_segment *
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+	struct nfs4_layoutget_res *res = &lgp->res;
+	struct pnfs_layout_segment *lseg;
+	struct inode *ino = lo->plh_inode;
+	LIST_HEAD(free_me);
+	int status = 0;
+
+	/* Inject layout blob into I/O device driver */
+	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
+	if (!lseg || IS_ERR(lseg)) {
+		if (!lseg)
+			status = -ENOMEM;
+		else
+			status = PTR_ERR(lseg);
+		dprintk("%s: Could not allocate layout: error %d\n",
+		       __func__, status);
+		goto out;
+	}
+
+	init_lseg(lo, lseg);
+	lseg->pls_range = res->range;
+
+	spin_lock(&ino->i_lock);
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+		dprintk("%s forget reply due to recall\n", __func__);
+		goto out_forget_reply;
+	}
+
+	if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
+		dprintk("%s forget reply due to state\n", __func__);
+		goto out_forget_reply;
+	}
+
+	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+		/* existing state ID, make sure the sequence number matches. */
+		if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+			dprintk("%s forget reply due to sequence\n", __func__);
+			goto out_forget_reply;
+		}
+		pnfs_set_layout_stateid(lo, &res->stateid, false);
+	} else {
+		/*
+		 * We got an entirely new state ID.  Mark all segments for the
+		 * inode invalid, and don't bother validating the stateid
+		 * sequence number.
+		 */
+		pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+
+		nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
+		lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+	}
+
+	clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+
+	pnfs_get_lseg(lseg);
+	pnfs_layout_insert_lseg(lo, lseg);
+
+	if (res->return_on_close) {
+		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
+	}
+
+	spin_unlock(&ino->i_lock);
+	pnfs_free_lseg_list(&free_me);
+	return lseg;
+out:
+	return ERR_PTR(status);
+
+out_forget_reply:
+	spin_unlock(&ino->i_lock);
+	lseg->pls_layout = lo;
+	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+	goto out;
+}
+
+static void
+pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+				struct list_head *tmp_list,
+				struct pnfs_layout_range *return_range)
+{
+	struct pnfs_layout_segment *lseg, *next;
+
+	dprintk("%s:Begin lo %p\n", __func__, lo);
+
+	if (list_empty(&lo->plh_segs))
+		return;
+
+	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+		if (should_free_lseg(&lseg->pls_range, return_range)) {
+			dprintk("%s: marking lseg %p iomode %d "
+				"offset %llu length %llu\n", __func__,
+				lseg, lseg->pls_range.iomode,
+				lseg->pls_range.offset,
+				lseg->pls_range.length);
+			set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+			mark_lseg_invalid(lseg, tmp_list);
+		}
+}
+
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+				       struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
+	int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
+	struct pnfs_layout_range range = {
+		.iomode = lseg->pls_range.iomode,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	LIST_HEAD(free_me);
+
+	spin_lock(&inode->i_lock);
+	/* set failure bit so that pnfs path will be retried later */
+	pnfs_layout_set_fail_bit(lo, iomode);
+	set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+	if (lo->plh_return_iomode == 0)
+		lo->plh_return_iomode = range.iomode;
+	else if (lo->plh_return_iomode != range.iomode)
+		lo->plh_return_iomode = IOMODE_ANY;
+	/*
+	 * mark all matching lsegs so that we are sure to have no live
+	 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
+	 * for how it works.
+	 */
+	pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
+	spin_unlock(&inode->i_lock);
+	pnfs_free_lseg_list(&free_me);
+}
+EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
+
+void
+pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	u64 rd_size = req->wb_bytes;
+
+	if (pgio->pg_lseg == NULL) {
+		if (pgio->pg_dreq == NULL)
+			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
+		else
+			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   req->wb_context,
+						   req_offset(req),
+						   rd_size,
+						   IOMODE_READ,
+						   GFP_KERNEL);
+	}
+	/* If no lseg, fall back to read through mds */
+	if (pgio->pg_lseg == NULL)
+		nfs_pageio_reset_read_mds(pgio);
+
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
+
+void
+pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			   struct nfs_page *req, u64 wb_size)
+{
+	if (pgio->pg_lseg == NULL)
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   req->wb_context,
+						   req_offset(req),
+						   wb_size,
+						   IOMODE_RW,
+						   GFP_NOFS);
+	/* If no lseg, fall back to write through mds */
+	if (pgio->pg_lseg == NULL)
+		nfs_pageio_reset_write_mds(pgio);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
+
+void
+pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
+{
+	if (desc->pg_lseg) {
+		pnfs_put_lseg(desc->pg_lseg);
+		desc->pg_lseg = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
+
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+size_t
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+		     struct nfs_page *prev, struct nfs_page *req)
+{
+	unsigned int size;
+	u64 seg_end, req_start, seg_left;
+
+	size = nfs_generic_pg_test(pgio, prev, req);
+	if (!size)
+		return 0;
+
+	/*
+	 * 'size' contains the number of bytes left in the current page (up
+	 * to the original size asked for in @req->wb_bytes).
+	 *
+	 * Calculate how many bytes are left in the layout segment
+	 * and if there are less bytes than 'size', return that instead.
+	 *
+	 * Please also note that 'end_offset' is actually the offset of the
+	 * first byte that lies outside the pnfs_layout_range. FIXME?
+	 *
+	 */
+	if (pgio->pg_lseg) {
+		seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
+				     pgio->pg_lseg->pls_range.length);
+		req_start = req_offset(req);
+		WARN_ON_ONCE(req_start >= seg_end);
+		/* start of request is past the last byte of this segment */
+		if (req_start >= seg_end) {
+			/* reference the new lseg */
+			if (pgio->pg_ops->pg_cleanup)
+				pgio->pg_ops->pg_cleanup(pgio);
+			if (pgio->pg_ops->pg_init)
+				pgio->pg_ops->pg_init(pgio, req);
+			return 0;
+		}
+
+		/* adjust 'size' iff there are fewer bytes left in the
+		 * segment than what nfs_generic_pg_test returned */
+		seg_left = seg_end - req_start;
+		if (seg_left < size)
+			size = (unsigned int)seg_left;
+	}
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
+
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
+{
+	struct nfs_pageio_descriptor pgio;
+
+	/* Resend all requests through the MDS */
+	nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
+			      hdr->completion_ops);
+	return nfs_pageio_resend(&pgio, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
+
+static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
+{
+
+	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
+	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
+	    PNFS_LAYOUTRET_ON_ERROR) {
+		pnfs_return_layout(hdr->inode);
+	}
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
+}
+
+/*
+ * Called by non rpc-based layout drivers
+ */
+void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
+{
+	trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+	if (!hdr->pnfs_error) {
+		pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+				hdr->mds_offset + hdr->res.count);
+		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
+	} else
+		pnfs_ld_handle_write_error(hdr);
+	hdr->mds_ops->rpc_release(hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
+
+static void
+pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
+		struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
+		nfs_pageio_reset_write_mds(desc);
+		mirror->pg_recoalesce = 1;
+	}
+	nfs_pgio_data_destroy(hdr);
+}
+
+static enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
+			const struct rpc_call_ops *call_ops,
+			struct pnfs_layout_segment *lseg,
+			int how)
+{
+	struct inode *inode = hdr->inode;
+	enum pnfs_try_status trypnfs;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+
+	hdr->mds_ops = call_ops;
+
+	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
+		inode->i_ino, hdr->args.count, hdr->args.offset, how);
+	trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
+	if (trypnfs != PNFS_NOT_ATTEMPTED)
+		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+	return trypnfs;
+}
+
+static void
+pnfs_do_write(struct nfs_pageio_descriptor *desc,
+	      struct nfs_pgio_header *hdr, int how)
+{
+	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
+	enum pnfs_try_status trypnfs;
+
+	trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
+	if (trypnfs == PNFS_NOT_ATTEMPTED)
+		pnfs_write_through_mds(desc, hdr);
+}
+
+static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
+{
+	pnfs_put_lseg(hdr->lseg);
+	nfs_pgio_header_free(hdr);
+}
+
+int
+pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+	struct nfs_pgio_header *hdr;
+	int ret;
+
+	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	if (!hdr) {
+		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+		return -ENOMEM;
+	}
+	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
+
+	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
+	ret = nfs_generic_pgio(desc, hdr);
+	if (!ret)
+		pnfs_do_write(desc, hdr, desc->pg_ioflags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
+
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
+{
+	struct nfs_pageio_descriptor pgio;
+
+	/* Resend all requests through the MDS */
+	nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
+	return nfs_pageio_resend(&pgio, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
+
+static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
+{
+	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
+	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
+	    PNFS_LAYOUTRET_ON_ERROR) {
+		pnfs_return_layout(hdr->inode);
+	}
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
+}
+
+/*
+ * Called by non rpc-based layout drivers
+ */
+void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
+{
+	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+	if (likely(!hdr->pnfs_error)) {
+		__nfs4_read_done_cb(hdr);
+		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
+	} else
+		pnfs_ld_handle_read_error(hdr);
+	hdr->mds_ops->rpc_release(hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
+
+static void
+pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
+		struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
+		nfs_pageio_reset_read_mds(desc);
+		mirror->pg_recoalesce = 1;
+	}
+	nfs_pgio_data_destroy(hdr);
+}
+
+/*
+ * Call the appropriate parallel I/O subsystem read function.
+ */
+static enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
+		       const struct rpc_call_ops *call_ops,
+		       struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = hdr->inode;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	enum pnfs_try_status trypnfs;
+
+	hdr->mds_ops = call_ops;
+
+	dprintk("%s: Reading ino:%lu %u@%llu\n",
+		__func__, inode->i_ino, hdr->args.count, hdr->args.offset);
+
+	trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
+	if (trypnfs != PNFS_NOT_ATTEMPTED)
+		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+	return trypnfs;
+}
+
+/* Resend all requests through pnfs. */
+int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
+{
+	struct nfs_pageio_descriptor pgio;
+
+	nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
+	return nfs_pageio_resend(&pgio, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
+
+static void
+pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
+{
+	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
+	enum pnfs_try_status trypnfs;
+	int err = 0;
+
+	trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
+	if (trypnfs == PNFS_TRY_AGAIN)
+		err = pnfs_read_resend_pnfs(hdr);
+	if (trypnfs == PNFS_NOT_ATTEMPTED || err)
+		pnfs_read_through_mds(desc, hdr);
+}
+
+static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
+{
+	pnfs_put_lseg(hdr->lseg);
+	nfs_pgio_header_free(hdr);
+}
+
+int
+pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+	struct nfs_pgio_header *hdr;
+	int ret;
+
+	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	if (!hdr) {
+		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+		return -ENOMEM;
+	}
+	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
+	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
+	ret = nfs_generic_pgio(desc, hdr);
+	if (!ret)
+		pnfs_do_read(desc, hdr);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
+
+static void pnfs_clear_layoutcommitting(struct inode *inode)
+{
+	unsigned long *bitlock = &NFS_I(inode)->flags;
+
+	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+	smp_mb__after_atomic();
+	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+}
+
+/*
+ * There can be multiple RW segments.
+ */
+static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
+{
+	struct pnfs_layout_segment *lseg;
+
+	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
+		if (lseg->pls_range.iomode == IOMODE_RW &&
+		    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+			list_add(&lseg->pls_lc_list, listp);
+	}
+}
+
+static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
+{
+	struct pnfs_layout_segment *lseg, *tmp;
+
+	/* Matched by references in pnfs_set_layoutcommit */
+	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
+		list_del_init(&lseg->pls_lc_list);
+		pnfs_put_lseg(lseg);
+	}
+
+	pnfs_clear_layoutcommitting(inode);
+}
+
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
+}
+EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
+
+void
+pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
+		loff_t end_pos)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	bool mark_as_dirty = false;
+
+	spin_lock(&inode->i_lock);
+	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+		nfsi->layout->plh_lwb = end_pos;
+		mark_as_dirty = true;
+		dprintk("%s: Set layoutcommit for inode %lu ",
+			__func__, inode->i_ino);
+	} else if (end_pos > nfsi->layout->plh_lwb)
+		nfsi->layout->plh_lwb = end_pos;
+	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
+		/* references matched in nfs4_layoutcommit_release */
+		pnfs_get_lseg(lseg);
+	}
+	spin_unlock(&inode->i_lock);
+	dprintk("%s: lseg %p end_pos %llu\n",
+		__func__, lseg, nfsi->layout->plh_lwb);
+
+	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+	if (mark_as_dirty)
+		mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
+{
+	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
+	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
+		nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
+	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
+}
+
+/*
+ * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
+ * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
+ * data to disk to allow the server to recover the data if it crashes.
+ * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
+ * is off, and a COMMIT is sent to a data server, or
+ * if WRITEs to a data server return NFS_DATA_SYNC.
+ */
+int
+pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	struct nfs4_layoutcommit_data *data;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	loff_t end_pos;
+	int status;
+
+	if (!pnfs_layoutcommit_outstanding(inode))
+		return 0;
+
+	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
+
+	status = -EAGAIN;
+	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
+		if (!sync)
+			goto out;
+		status = wait_on_bit_lock_action(&nfsi->flags,
+				NFS_INO_LAYOUTCOMMITTING,
+				nfs_wait_bit_killable,
+				TASK_KILLABLE);
+		if (status)
+			goto out;
+	}
+
+	status = -ENOMEM;
+	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+	data = kzalloc(sizeof(*data), GFP_NOFS);
+	if (!data)
+		goto clear_layoutcommitting;
+
+	status = 0;
+	spin_lock(&inode->i_lock);
+	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+		goto out_unlock;
+
+	INIT_LIST_HEAD(&data->lseg_list);
+	pnfs_list_write_lseg(inode, &data->lseg_list);
+
+	end_pos = nfsi->layout->plh_lwb;
+
+	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
+	spin_unlock(&inode->i_lock);
+
+	data->args.inode = inode;
+	data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
+	nfs_fattr_init(&data->fattr);
+	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
+	data->res.fattr = &data->fattr;
+	data->args.lastbytewritten = end_pos - 1;
+	data->res.server = NFS_SERVER(inode);
+
+	if (ld->prepare_layoutcommit) {
+		status = ld->prepare_layoutcommit(&data->args);
+		if (status) {
+			spin_lock(&inode->i_lock);
+			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+			if (end_pos > nfsi->layout->plh_lwb)
+				nfsi->layout->plh_lwb = end_pos;
+			spin_unlock(&inode->i_lock);
+			put_rpccred(data->cred);
+			goto clear_layoutcommitting;
+		}
+	}
+
+
+	status = nfs4_proc_layoutcommit(data, sync);
+out:
+	if (status)
+		mark_inode_dirty_sync(inode);
+	dprintk("<-- %s status %d\n", __func__, status);
+	return status;
+out_unlock:
+	spin_unlock(&inode->i_lock);
+	kfree(data);
+clear_layoutcommitting:
+	pnfs_clear_layoutcommitting(inode);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
+
+int
+pnfs_generic_sync(struct inode *inode, bool datasync)
+{
+	return pnfs_layoutcommit_inode(inode, true);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_sync);
+
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+	struct nfs4_threshold *thp;
+
+	thp = kzalloc(sizeof(*thp), GFP_NOFS);
+	if (!thp) {
+		dprintk("%s mdsthreshold allocation failed\n", __func__);
+		return NULL;
+	}
+	return thp;
+}
diff --git a/kernel/fs/nfs/pnfs.h b/kernel/fs/nfs/pnfs.h
new file mode 100644
index 000000000..1e6308f82
--- /dev/null
+++ b/kernel/fs/nfs/pnfs.h
@@ -0,0 +1,692 @@
+/*
+ *  pNFS client data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#ifndef FS_NFS_PNFS_H
+#define FS_NFS_PNFS_H
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/workqueue.h>
+
+enum {
+	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */
+	NFS_LSEG_ROC,		/* roc bit received from server */
+	NFS_LSEG_LAYOUTCOMMIT,	/* layoutcommit bit set for layoutcommit */
+	NFS_LSEG_LAYOUTRETURN,	/* layoutreturn bit set for layoutreturn */
+};
+
+/* Individual ip address */
+struct nfs4_pnfs_ds_addr {
+	struct sockaddr_storage	da_addr;
+	size_t			da_addrlen;
+	struct list_head	da_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+	char			*da_remotestr;	/* human readable addr+port */
+};
+
+struct nfs4_pnfs_ds {
+	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+	char			*ds_remotestr;	/* comma sep list of addrs */
+	struct list_head	ds_addrs;
+	struct nfs_client	*ds_clp;
+	atomic_t		ds_count;
+	unsigned long		ds_state;
+#define NFS4DS_CONNECTING	0	/* ds is establishing connection */
+};
+
+struct pnfs_layout_segment {
+	struct list_head pls_list;
+	struct list_head pls_lc_list;
+	struct pnfs_layout_range pls_range;
+	atomic_t pls_refcount;
+	unsigned long pls_flags;
+	struct pnfs_layout_hdr *pls_layout;
+	struct work_struct pls_work;
+};
+
+enum pnfs_try_status {
+	PNFS_ATTEMPTED     = 0,
+	PNFS_NOT_ATTEMPTED = 1,
+	PNFS_TRY_AGAIN     = 2,
+};
+
+#ifdef CONFIG_NFS_V4_1
+
+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+
+/*
+ * Default data server connection timeout and retrans vaules.
+ * Set by module parameters dataserver_timeo and dataserver_retrans.
+ */
+#define NFS4_DEF_DS_TIMEO   600 /* in tenths of a second */
+#define NFS4_DEF_DS_RETRANS 5
+
+/* error codes for internal use */
+#define NFS4ERR_RESET_TO_MDS   12001
+#define NFS4ERR_RESET_TO_PNFS  12002
+
+enum {
+	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
+	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
+	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
+	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
+	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
+	NFS_LAYOUT_RETURN_BEFORE_CLOSE,	/* Return this layout before close */
+	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
+	NFS_LAYOUT_FIRST_LAYOUTGET,	/* Serialize first layoutget */
+	NFS_LAYOUT_RETRY_LAYOUTGET,	/* Retry layoutget */
+};
+
+enum layoutdriver_policy_flags {
+	/* Should the pNFS client commit and return the layout upon truncate to
+	 * a smaller size */
+	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
+	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,
+	PNFS_READ_WHOLE_PAGE		= 1 << 2,
+};
+
+struct nfs4_deviceid_node;
+
+/* Per-layout driver specific registration structure */
+struct pnfs_layoutdriver_type {
+	struct list_head pnfs_tblid;
+	const u32 id;
+	const char *name;
+	struct module *owner;
+	unsigned flags;
+	unsigned max_deviceinfo_size;
+
+	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
+	int (*clear_layoutdriver) (struct nfs_server *);
+
+	struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
+	void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
+	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
+	void (*free_lseg) (struct pnfs_layout_segment *lseg);
+
+	void (*return_range) (struct pnfs_layout_hdr *lo,
+			      struct pnfs_layout_range *range);
+
+	/* test for nfs page cache coalescing */
+	const struct nfs_pageio_ops *pg_read_ops;
+	const struct nfs_pageio_ops *pg_write_ops;
+
+	struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
+	void (*mark_request_commit) (struct nfs_page *req,
+				     struct pnfs_layout_segment *lseg,
+				     struct nfs_commit_info *cinfo,
+				     u32 ds_commit_idx);
+	void (*clear_request_commit) (struct nfs_page *req,
+				      struct nfs_commit_info *cinfo);
+	int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+				  int max);
+	void (*recover_commit_reqs) (struct list_head *list,
+				     struct nfs_commit_info *cinfo);
+	struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+						struct page *page);
+	int (*commit_pagelist)(struct inode *inode,
+			       struct list_head *mds_pages,
+			       int how,
+			       struct nfs_commit_info *cinfo);
+
+	int (*sync)(struct inode *inode, bool datasync);
+
+	/*
+	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
+	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
+	 */
+	enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+	enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
+
+	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+	struct nfs4_deviceid_node * (*alloc_deviceid_node)
+			(struct nfs_server *server, struct pnfs_device *pdev,
+			gfp_t gfp_flags);
+
+	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_layoutreturn_args *args);
+
+	void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
+	int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+	void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_layoutcommit_args *args);
+};
+
+struct pnfs_layout_hdr {
+	atomic_t		plh_refcount;
+	struct list_head	plh_layouts;   /* other client layouts */
+	struct list_head	plh_bulk_destroy;
+	struct list_head	plh_segs;      /* layout segments list */
+	nfs4_stateid		plh_stateid;
+	atomic_t		plh_outstanding; /* number of RPCs out */
+	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
+	u32			plh_barrier; /* ignore lower seqids */
+	unsigned long		plh_retry_timestamp;
+	unsigned long		plh_flags;
+	enum pnfs_iomode	plh_return_iomode;
+	loff_t			plh_lwb; /* last write byte for layoutcommit */
+	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */
+	struct inode		*plh_inode;
+};
+
+struct pnfs_device {
+	struct nfs4_deviceid dev_id;
+	unsigned int  layout_type;
+	unsigned int  mincount;
+	unsigned int  maxcount;	/* gdia_maxcount */
+	struct page **pages;
+	unsigned int  pgbase;
+	unsigned int  pglen;	/* reply buffer length */
+	unsigned char nocache : 1;/* May not be cached */
+};
+
+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+
+struct pnfs_devicelist {
+	unsigned int		eof;
+	unsigned int		num_devs;
+	struct nfs4_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
+};
+
+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+
+/* nfs4proc.c */
+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+				   struct pnfs_device *dev,
+				   struct rpc_cred *cred);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
+
+/* pnfs.c */
+void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
+
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
+void unset_pnfs_layoutdriver(struct nfs_server *);
+void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
+int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
+void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			        struct nfs_page *req, u64 wb_size);
+void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
+int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
+size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+			    struct nfs_page *prev, struct nfs_page *req);
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
+struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
+void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_all_layouts(struct nfs_client *);
+int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+		struct nfs_fsid *fsid,
+		bool is_recall);
+int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+		bool is_recall);
+void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+			     const nfs4_stateid *new,
+			     bool update_barrier);
+int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
+				  struct pnfs_layout_hdr *lo,
+				  struct pnfs_layout_range *range,
+				  struct nfs4_state *open_state);
+int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+				struct list_head *tmp_list,
+				struct pnfs_layout_range *recall_range);
+bool pnfs_roc(struct inode *ino);
+void pnfs_roc_release(struct inode *ino);
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
+void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
+int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int pnfs_generic_sync(struct inode *inode, bool datasync);
+int pnfs_nfs_generic_sync(struct inode *inode, bool datasync);
+int _pnfs_return_layout(struct inode *);
+int pnfs_commit_and_return_layout(struct inode *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
+int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
+struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
+					       struct nfs_open_context *ctx,
+					       loff_t pos,
+					       u64 count,
+					       enum pnfs_iomode iomode,
+					       gfp_t gfp_flags);
+void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
+
+void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+				       struct pnfs_layout_segment *lseg);
+
+/* nfs4_deviceid_flags */
+enum {
+	NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */
+	NFS_DEVICEID_UNAVAILABLE,	/* device temporarily unavailable */
+	NFS_DEVICEID_NOCACHE,		/* device may not be cached */
+};
+
+/* pnfs_dev.c */
+struct nfs4_deviceid_node {
+	struct hlist_node		node;
+	struct hlist_node		tmpnode;
+	const struct pnfs_layoutdriver_type *ld;
+	const struct nfs_client		*nfs_client;
+	unsigned long 			flags;
+	unsigned long			timestamp_unavailable;
+	struct nfs4_deviceid		deviceid;
+	struct rcu_head			rcu;
+	atomic_t			ref;
+};
+
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		gfp_t gfp_mask);
+void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
+			     const struct nfs4_deviceid *);
+bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
+bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
+void nfs4_deviceid_purge_client(const struct nfs_client *);
+
+/* pnfs_nfs.c */
+void pnfs_generic_clear_request_commit(struct nfs_page *req,
+				       struct nfs_commit_info *cinfo);
+void pnfs_generic_commit_release(void *calldata);
+void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
+void pnfs_generic_rw_release(void *data);
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+				      struct nfs_commit_info *cinfo);
+int pnfs_generic_commit_pagelist(struct inode *inode,
+				 struct list_head *mds_pages,
+				 int how,
+				 struct nfs_commit_info *cinfo,
+				 int (*initiate_commit)(struct nfs_commit_data *data,
+							int how));
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
+void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
+void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
+struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
+				      gfp_t gfp_flags);
+void nfs4_pnfs_v3_ds_connect_unload(void);
+void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
+			  struct nfs4_deviceid_node *devid, unsigned int timeo,
+			  unsigned int retrans, u32 version, u32 minor_version,
+			  rpc_authflavor_t au_flavor);
+struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
+						 struct xdr_stream *xdr,
+						 gfp_t gfp_flags);
+void pnfs_layout_mark_request_commit(struct nfs_page *req,
+				     struct pnfs_layout_segment *lseg,
+				     struct nfs_commit_info *cinfo,
+				     u32 ds_commit_idx);
+
+static inline bool nfs_have_layout(struct inode *inode)
+{
+	return NFS_I(inode)->layout != NULL;
+}
+
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+	atomic_inc(&d->ref);
+	return d;
+}
+
+static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+	if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
+		atomic_inc(&lo->plh_refcount);
+}
+
+static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+	if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
+		atomic_dec(&lo->plh_refcount);
+		/* wake up waiters for LAYOUTRETURN as that is not needed */
+		wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+	}
+}
+
+static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+	return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
+}
+
+static inline struct pnfs_layout_segment *
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
+{
+	if (lseg) {
+		atomic_inc(&lseg->pls_refcount);
+		smp_mb__after_atomic();
+	}
+	return lseg;
+}
+
+/* Return true if a layout driver is being used for this mountpoint */
+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
+	return nfss->pnfs_curr_ld != NULL;
+}
+
+static inline int
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+		 struct nfs_commit_info *cinfo)
+{
+	if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0)
+		return PNFS_NOT_ATTEMPTED;
+	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo);
+}
+
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->get_ds_info == NULL)
+		return NULL;
+	return ld->get_ds_info(inode);
+}
+
+static inline void
+pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
+{
+	set_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+	struct inode *inode = d_inode(req->wb_context->dentry);
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (lseg == NULL || ld->mark_request_commit == NULL)
+		return false;
+	ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+	return true;
+}
+
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
+{
+	struct inode *inode = d_inode(req->wb_context->dentry);
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->clear_request_commit == NULL)
+		return false;
+	ld->clear_request_commit(req, cinfo);
+	return true;
+}
+
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+		       int max)
+{
+	if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
+		return 0;
+	else
+		return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
+}
+
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+			struct page *page)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->search_commit_reqs == NULL)
+		return NULL;
+	return ld->search_commit_reqs(cinfo, page);
+}
+
+/* Should the pNFS client commit and return the layout upon a setattr */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+		return false;
+	return NFS_SERVER(inode)->pnfs_curr_ld->flags &
+		PNFS_LAYOUTRET_ON_SETATTR;
+}
+
+static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+		return false;
+	return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+}
+
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+		return 0;
+	return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync);
+}
+
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 ||
+		test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0;
+}
+
+static inline int pnfs_return_layout(struct inode *ino)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct nfs_server *nfss = NFS_SERVER(ino);
+
+	if (pnfs_enabled_sb(nfss) && nfsi->layout)
+		return _pnfs_return_layout(ino);
+
+	return 0;
+}
+
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+		   struct nfs_server *nfss)
+{
+	return (dst && src && src->bm != 0 && nfss->pnfs_curr_ld &&
+					nfss->pnfs_curr_ld->id == src->l_type);
+}
+
+#ifdef NFS_DEBUG
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+#else
+static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
+{
+}
+#endif /* NFS_DEBUG */
+#else  /* CONFIG_NFS_V4_1 */
+
+static inline bool nfs_have_layout(struct inode *inode)
+{
+	return false;
+}
+
+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+}
+
+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+}
+
+static inline struct pnfs_layout_segment *
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
+{
+	return NULL;
+}
+
+static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
+static inline int pnfs_return_layout(struct inode *ino)
+{
+	return 0;
+}
+
+static inline int pnfs_commit_and_return_layout(struct inode *inode)
+{
+	return 0;
+}
+
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+	return false;
+}
+
+static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	return false;
+}
+
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+	return 0;
+}
+
+static inline bool
+pnfs_roc(struct inode *ino)
+{
+	return false;
+}
+
+static inline void
+pnfs_roc_release(struct inode *ino)
+{
+}
+
+static inline void
+pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+}
+
+static inline bool
+pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+{
+	return false;
+}
+
+static inline void set_pnfs_layoutdriver(struct nfs_server *s,
+					 const struct nfs_fh *mntfh, u32 id)
+{
+}
+
+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
+{
+}
+
+static inline int
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+		 struct nfs_commit_info *cinfo)
+{
+	return PNFS_NOT_ATTEMPTED;
+}
+
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+	return NULL;
+}
+
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+	return false;
+}
+
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
+{
+	return false;
+}
+
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+		       int max)
+{
+	return 0;
+}
+
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+			struct page *page)
+{
+	return NULL;
+}
+
+static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+{
+	return 0;
+}
+
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+		   struct nfs_server *nfss)
+{
+	return false;
+}
+
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+	return false;
+}
+
+
+static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+	return NULL;
+}
+
+static inline void nfs4_pnfs_v3_ds_connect_unload(void)
+{
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#endif /* FS_NFS_PNFS_H */
diff --git a/kernel/fs/nfs/pnfs_dev.c b/kernel/fs/nfs/pnfs_dev.c
new file mode 100644
index 000000000..2961fcd7a
--- /dev/null
+++ b/kernel/fs/nfs/pnfs_dev.c
@@ -0,0 +1,365 @@
+/*
+ *  Device operations for the pnfs client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS
+
+/*
+ * Device ID RCU cache. A device ID is unique per server and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS	5
+#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ)
+
+static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+
+#ifdef NFS_DEBUG
+void
+nfs4_print_deviceid(const struct nfs4_deviceid *id)
+{
+	u32 *p = (u32 *)id;
+
+	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+		p[0], p[1], p[2], p[3]);
+}
+EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+#endif
+
+static inline u32
+nfs4_deviceid_hash(const struct nfs4_deviceid *id)
+{
+	unsigned char *cptr = (unsigned char *)id->data;
+	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+	u32 x = 0;
+
+	while (nbytes--) {
+		x *= 37;
+		x += *cptr++;
+	}
+	return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+static struct nfs4_deviceid_node *
+_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
+		 const struct nfs_client *clp, const struct nfs4_deviceid *id,
+		 long hash)
+{
+	struct nfs4_deviceid_node *d;
+
+	hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
+		if (d->ld == ld && d->nfs_client == clp &&
+		    !memcmp(&d->deviceid, id, sizeof(*id))) {
+			if (atomic_read(&d->ref))
+				return d;
+			else
+				continue;
+		}
+	return NULL;
+}
+
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+		const struct nfs4_deviceid *dev_id,
+		struct rpc_cred *cred, gfp_t gfp_flags)
+{
+	struct nfs4_deviceid_node *d = NULL;
+	struct pnfs_device *pdev = NULL;
+	struct page **pages = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	int rc, i;
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	if (server->pnfs_curr_ld->max_deviceinfo_size &&
+	    server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+		max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+	max_pages = nfs_page_array_len(0, max_resp_sz);
+	dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+		__func__, server, max_resp_sz, max_pages);
+
+	pdev = kzalloc(sizeof(*pdev), gfp_flags);
+	if (!pdev)
+		return NULL;
+
+	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+	if (!pages)
+		goto out_free_pdev;
+
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(gfp_flags);
+		if (!pages[i])
+			goto out_free_pages;
+	}
+
+	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+	pdev->layout_type = server->pnfs_curr_ld->id;
+	pdev->pages = pages;
+	pdev->pgbase = 0;
+	pdev->pglen = max_resp_sz;
+	pdev->mincount = 0;
+	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free_pages;
+
+	/*
+	 * Found new device, need to decode it and then add it to the
+	 * list of known devices for this mountpoint.
+	 */
+	d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+			gfp_flags);
+	if (d && pdev->nocache)
+		set_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+
+out_free_pages:
+	for (i = 0; i < max_pages; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+out_free_pdev:
+	kfree(pdev);
+	dprintk("<-- %s d %p\n", __func__, d);
+	return d;
+}
+
+/*
+ * Lookup a deviceid in cache and get a reference count on it if found
+ *
+ * @clp nfs_client associated with deviceid
+ * @id deviceid to look up
+ */
+static struct nfs4_deviceid_node *
+__nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, long hash)
+{
+	struct nfs4_deviceid_node *d;
+
+	rcu_read_lock();
+	d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+			hash);
+	if (d != NULL && !atomic_inc_not_zero(&d->ref))
+		d = NULL;
+	rcu_read_unlock();
+	return d;
+}
+
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		gfp_t gfp_mask)
+{
+	long hash = nfs4_deviceid_hash(id);
+	struct nfs4_deviceid_node *d, *new;
+
+	d = __nfs4_find_get_deviceid(server, id, hash);
+	if (d)
+		return d;
+
+	new = nfs4_get_device_info(server, id, cred, gfp_mask);
+	if (!new)
+		return new;
+
+	spin_lock(&nfs4_deviceid_lock);
+	d = __nfs4_find_get_deviceid(server, id, hash);
+	if (d) {
+		spin_unlock(&nfs4_deviceid_lock);
+		server->pnfs_curr_ld->free_deviceid_node(new);
+		return d;
+	}
+	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+	atomic_inc(&new->ref);
+	spin_unlock(&nfs4_deviceid_lock);
+
+	return new;
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+
+/*
+ * Remove a deviceid from cache
+ *
+ * @clp nfs_client associated with deviceid
+ * @id the deviceid to unhash
+ *
+ * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
+ */
+void
+nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
+			 const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+	struct nfs4_deviceid_node *d;
+
+	spin_lock(&nfs4_deviceid_lock);
+	rcu_read_lock();
+	d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+	rcu_read_unlock();
+	if (!d) {
+		spin_unlock(&nfs4_deviceid_lock);
+		return;
+	}
+	hlist_del_init_rcu(&d->node);
+	clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+	spin_unlock(&nfs4_deviceid_lock);
+
+	/* balance the initial ref set in pnfs_insert_deviceid */
+	nfs4_put_deviceid_node(d);
+}
+EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
+			const struct nfs4_deviceid *id)
+{
+	INIT_HLIST_NODE(&d->node);
+	INIT_HLIST_NODE(&d->tmpnode);
+	d->ld = server->pnfs_curr_ld;
+	d->nfs_client = server->nfs_client;
+	d->flags = 0;
+	d->deviceid = *id;
+	atomic_set(&d->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
+
+/*
+ * Dereference a deviceid node and delete it when its reference count drops
+ * to zero.
+ *
+ * @d deviceid node to put
+ *
+ * return true iff the node was deleted
+ * Note that since the test for d->ref == 0 is sufficient to establish
+ * that the node is no longer hashed in the global device id cache.
+ */
+bool
+nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) {
+		if (atomic_add_unless(&d->ref, -1, 2))
+			return false;
+		nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid);
+	}
+	if (!atomic_dec_and_test(&d->ref))
+		return false;
+	d->ld->free_deviceid_node(d);
+	return true;
+}
+EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
+
+void
+nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+	node->timestamp_unavailable = jiffies;
+	set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
+
+bool
+nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+		unsigned long start, end;
+
+		end = jiffies;
+		start = end - PNFS_DEVICE_RETRY_TIMEOUT;
+		if (time_in_range(node->timestamp_unavailable, start, end))
+			return true;
+		clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable);
+
+static void
+_deviceid_purge_client(const struct nfs_client *clp, long hash)
+{
+	struct nfs4_deviceid_node *d;
+	HLIST_HEAD(tmp);
+
+	spin_lock(&nfs4_deviceid_lock);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
+		if (d->nfs_client == clp && atomic_read(&d->ref)) {
+			hlist_del_init_rcu(&d->node);
+			hlist_add_head(&d->tmpnode, &tmp);
+			clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+		}
+	rcu_read_unlock();
+	spin_unlock(&nfs4_deviceid_lock);
+
+	if (hlist_empty(&tmp))
+		return;
+
+	while (!hlist_empty(&tmp)) {
+		d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode);
+		hlist_del(&d->tmpnode);
+		nfs4_put_deviceid_node(d);
+	}
+}
+
+void
+nfs4_deviceid_purge_client(const struct nfs_client *clp)
+{
+	long h;
+
+	if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+		return;
+	for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
+		_deviceid_purge_client(clp, h);
+}
+
+/*
+ * Stop use of all deviceids associated with an nfs_client
+ */
+void
+nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
+{
+	struct nfs4_deviceid_node *d;
+	int i;
+
+	rcu_read_lock();
+	for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){
+		hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node)
+			if (d->nfs_client == clp)
+				set_bit(NFS_DEVICEID_INVALID, &d->flags);
+	}
+	rcu_read_unlock();
+}
diff --git a/kernel/fs/nfs/pnfs_nfs.c b/kernel/fs/nfs/pnfs_nfs.c
new file mode 100644
index 000000000..f37e25b63
--- /dev/null
+++ b/kernel/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,880 @@
+/*
+ * Common NFS I/O  operations for the pnfs file based
+ * layout drivers.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tom Haynes <loghyr@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/module.h>
+
+#include "nfs4session.h"
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS
+
+void pnfs_generic_rw_release(void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	nfs_put_client(hdr->ds_clp);
+	hdr->mds_ops->rpc_release(data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
+
+/* Fake up some data that will cause nfs_commit_release to retry the writes. */
+void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
+{
+	struct nfs_page *first = nfs_list_entry(data->pages.next);
+
+	data->task.tk_status = 0;
+	memcpy(&data->verf.verifier, &first->wb_verf,
+	       sizeof(data->verf.verifier));
+	data->verf.verifier.data[0]++; /* ensure verifier mismatch */
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
+
+void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *wdata = data;
+
+	/* Note this may cause RPC to be resent */
+	wdata->mds_ops->rpc_call_done(task, data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
+
+void pnfs_generic_commit_release(void *calldata)
+{
+	struct nfs_commit_data *data = calldata;
+
+	data->completion_ops->completion(data);
+	pnfs_put_lseg(data->lseg);
+	nfs_put_client(data->ds_clp);
+	nfs_commitdata_release(data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
+
+/* The generic layer is about to remove the req from the commit list.
+ * If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this must be called holding the inode (/cinfo) lock
+ */
+void
+pnfs_generic_clear_request_commit(struct nfs_page *req,
+				  struct nfs_commit_info *cinfo)
+{
+	struct pnfs_layout_segment *freeme = NULL;
+
+	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
+		goto out;
+	cinfo->ds->nwritten--;
+	if (list_is_singular(&req->wb_list)) {
+		struct pnfs_commit_bucket *bucket;
+
+		bucket = list_first_entry(&req->wb_list,
+					  struct pnfs_commit_bucket,
+					  written);
+		freeme = bucket->wlseg;
+		bucket->wlseg = NULL;
+	}
+out:
+	nfs_request_remove_commit_list(req, cinfo);
+	pnfs_put_lseg_locked(freeme);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
+
+static int
+pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
+				  struct nfs_commit_info *cinfo, int max)
+{
+	struct nfs_page *req, *tmp;
+	int ret = 0;
+
+	list_for_each_entry_safe(req, tmp, src, wb_list) {
+		if (!nfs_lock_request(req))
+			continue;
+		kref_get(&req->wb_kref);
+		if (cond_resched_lock(cinfo->lock))
+			list_safe_reset_next(req, tmp, wb_list);
+		nfs_request_remove_commit_list(req, cinfo);
+		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+		nfs_list_add_request(req, dst);
+		ret++;
+		if ((ret == max) && !cinfo->dreq)
+			break;
+	}
+	return ret;
+}
+
+static int
+pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+				 struct nfs_commit_info *cinfo,
+				 int max)
+{
+	struct list_head *src = &bucket->written;
+	struct list_head *dst = &bucket->committing;
+	int ret;
+
+	lockdep_assert_held(cinfo->lock);
+	ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
+	if (ret) {
+		cinfo->ds->nwritten -= ret;
+		cinfo->ds->ncommitting += ret;
+		bucket->clseg = bucket->wlseg;
+		if (list_empty(src))
+			bucket->wlseg = NULL;
+		else
+			pnfs_get_lseg(bucket->clseg);
+	}
+	return ret;
+}
+
+/* Move reqs from written to committing lists, returning count
+ * of number moved.
+ */
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
+				   int max)
+{
+	int i, rv = 0, cnt;
+
+	lockdep_assert_held(cinfo->lock);
+	for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
+		cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
+						       cinfo, max);
+		max -= cnt;
+		rv += cnt;
+	}
+	return rv;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
+
+/* Pull everything off the committing lists and dump into @dst.  */
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+				      struct nfs_commit_info *cinfo)
+{
+	struct pnfs_commit_bucket *b;
+	struct pnfs_layout_segment *freeme;
+	int i;
+
+	lockdep_assert_held(cinfo->lock);
+restart:
+	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+		if (pnfs_generic_transfer_commit_list(&b->written, dst,
+						      cinfo, 0)) {
+			freeme = b->wlseg;
+			b->wlseg = NULL;
+			spin_unlock(cinfo->lock);
+			pnfs_put_lseg(freeme);
+			spin_lock(cinfo->lock);
+			goto restart;
+		}
+	}
+	cinfo->ds->nwritten = 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
+
+static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
+{
+	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+	struct pnfs_commit_bucket *bucket;
+	struct pnfs_layout_segment *freeme;
+	int i;
+
+	for (i = idx; i < fl_cinfo->nbuckets; i++) {
+		bucket = &fl_cinfo->buckets[i];
+		if (list_empty(&bucket->committing))
+			continue;
+		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
+		spin_lock(cinfo->lock);
+		freeme = bucket->clseg;
+		bucket->clseg = NULL;
+		spin_unlock(cinfo->lock);
+		pnfs_put_lseg(freeme);
+	}
+}
+
+static unsigned int
+pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
+			      struct list_head *list)
+{
+	struct pnfs_ds_commit_info *fl_cinfo;
+	struct pnfs_commit_bucket *bucket;
+	struct nfs_commit_data *data;
+	int i;
+	unsigned int nreq = 0;
+
+	fl_cinfo = cinfo->ds;
+	bucket = fl_cinfo->buckets;
+	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
+		if (list_empty(&bucket->committing))
+			continue;
+		data = nfs_commitdata_alloc();
+		if (!data)
+			break;
+		data->ds_commit_index = i;
+		spin_lock(cinfo->lock);
+		data->lseg = bucket->clseg;
+		bucket->clseg = NULL;
+		spin_unlock(cinfo->lock);
+		list_add(&data->pages, list);
+		nreq++;
+	}
+
+	/* Clean up on error */
+	pnfs_generic_retry_commit(cinfo, i);
+	return nreq;
+}
+
+/* This follows nfs_commit_list pretty closely */
+int
+pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+			     int how, struct nfs_commit_info *cinfo,
+			     int (*initiate_commit)(struct nfs_commit_data *data,
+						    int how))
+{
+	struct nfs_commit_data *data, *tmp;
+	LIST_HEAD(list);
+	unsigned int nreq = 0;
+
+	if (!list_empty(mds_pages)) {
+		data = nfs_commitdata_alloc();
+		if (data != NULL) {
+			data->lseg = NULL;
+			list_add(&data->pages, &list);
+			nreq++;
+		} else {
+			nfs_retry_commit(mds_pages, NULL, cinfo, 0);
+			pnfs_generic_retry_commit(cinfo, 0);
+			cinfo->completion_ops->error_cleanup(NFS_I(inode));
+			return -ENOMEM;
+		}
+	}
+
+	nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
+
+	if (nreq == 0) {
+		cinfo->completion_ops->error_cleanup(NFS_I(inode));
+		goto out;
+	}
+
+	atomic_add(nreq, &cinfo->mds->rpcs_out);
+
+	list_for_each_entry_safe(data, tmp, &list, pages) {
+		list_del_init(&data->pages);
+		if (!data->lseg) {
+			nfs_init_commit(data, mds_pages, NULL, cinfo);
+			nfs_initiate_commit(NFS_CLIENT(inode), data,
+					    NFS_PROTO(data->inode),
+					    data->mds_ops, how, 0);
+		} else {
+			struct pnfs_commit_bucket *buckets;
+
+			buckets = cinfo->ds->buckets;
+			nfs_init_commit(data,
+					&buckets[data->ds_commit_index].committing,
+					data->lseg,
+					cinfo);
+			initiate_commit(data, how);
+		}
+	}
+out:
+	cinfo->ds->ncommitting = 0;
+	return PNFS_ATTEMPTED;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
+
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ *   - set to 1 on allocation
+ *   - incremented when a device id maps a data server already in the cache.
+ *   - decremented when deviceid is removed from the cache.
+ */
+static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+
+/* Debug routines */
+static void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+	if (ds == NULL) {
+		printk(KERN_WARNING "%s NULL device\n", __func__);
+		return;
+	}
+	printk(KERN_WARNING "        ds %s\n"
+		"        ref count %d\n"
+		"        client %p\n"
+		"        cl_exchange_flags %x\n",
+		ds->ds_remotestr,
+		atomic_read(&ds->ds_count), ds->ds_clp,
+		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+
+static bool
+same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
+{
+	struct sockaddr_in *a, *b;
+	struct sockaddr_in6 *a6, *b6;
+
+	if (addr1->sa_family != addr2->sa_family)
+		return false;
+
+	switch (addr1->sa_family) {
+	case AF_INET:
+		a = (struct sockaddr_in *)addr1;
+		b = (struct sockaddr_in *)addr2;
+
+		if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
+		    a->sin_port == b->sin_port)
+			return true;
+		break;
+
+	case AF_INET6:
+		a6 = (struct sockaddr_in6 *)addr1;
+		b6 = (struct sockaddr_in6 *)addr2;
+
+		/* LINKLOCAL addresses must have matching scope_id */
+		if (ipv6_addr_src_scope(&a6->sin6_addr) ==
+		    IPV6_ADDR_SCOPE_LINKLOCAL &&
+		    a6->sin6_scope_id != b6->sin6_scope_id)
+			return false;
+
+		if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
+		    a6->sin6_port == b6->sin6_port)
+			return true;
+		break;
+
+	default:
+		dprintk("%s: unhandled address family: %u\n",
+			__func__, addr1->sa_family);
+		return false;
+	}
+
+	return false;
+}
+
+static bool
+_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
+			       const struct list_head *dsaddrs2)
+{
+	struct nfs4_pnfs_ds_addr *da1, *da2;
+
+	/* step through both lists, comparing as we go */
+	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
+	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
+	     da1 != NULL && da2 != NULL;
+	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
+	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
+		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
+				   (struct sockaddr *)&da2->da_addr))
+			return false;
+	}
+	if (da1 == NULL && da2 == NULL)
+		return true;
+
+	return false;
+}
+
+/*
+ * Lookup DS by addresses.  nfs4_ds_cache_lock is held
+ */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(const struct list_head *dsaddrs)
+{
+	struct nfs4_pnfs_ds *ds;
+
+	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+			return ds;
+	return NULL;
+}
+
+static void destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+	struct nfs4_pnfs_ds_addr *da;
+
+	dprintk("--> %s\n", __func__);
+	ifdebug(FACILITY)
+		print_ds(ds);
+
+	nfs_put_client(ds->ds_clp);
+
+	while (!list_empty(&ds->ds_addrs)) {
+		da = list_first_entry(&ds->ds_addrs,
+				      struct nfs4_pnfs_ds_addr,
+				      da_node);
+		list_del_init(&da->da_node);
+		kfree(da->da_remotestr);
+		kfree(da);
+	}
+
+	kfree(ds->ds_remotestr);
+	kfree(ds);
+}
+
+void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
+{
+	if (atomic_dec_and_lock(&ds->ds_count,
+				&nfs4_ds_cache_lock)) {
+		list_del_init(&ds->ds_node);
+		spin_unlock(&nfs4_ds_cache_lock);
+		destroy_ds(ds);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
+
+/*
+ * Create a string with a human readable address and port to avoid
+ * complicated setup around many dprinks.
+ */
+static char *
+nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
+{
+	struct nfs4_pnfs_ds_addr *da;
+	char *remotestr;
+	size_t len;
+	char *p;
+
+	len = 3;        /* '{', '}' and eol */
+	list_for_each_entry(da, dsaddrs, da_node) {
+		len += strlen(da->da_remotestr) + 1;    /* string plus comma */
+	}
+
+	remotestr = kzalloc(len, gfp_flags);
+	if (!remotestr)
+		return NULL;
+
+	p = remotestr;
+	*(p++) = '{';
+	len--;
+	list_for_each_entry(da, dsaddrs, da_node) {
+		size_t ll = strlen(da->da_remotestr);
+
+		if (ll > len)
+			goto out_err;
+
+		memcpy(p, da->da_remotestr, ll);
+		p += ll;
+		len -= ll;
+
+		if (len < 1)
+			goto out_err;
+		(*p++) = ',';
+		len--;
+	}
+	if (len < 2)
+		goto out_err;
+	*(p++) = '}';
+	*p = '\0';
+	return remotestr;
+out_err:
+	kfree(remotestr);
+	return NULL;
+}
+
+/*
+ * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if
+ * uncached and return cached struct nfs4_pnfs_ds.
+ */
+struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
+{
+	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
+	char *remotestr;
+
+	if (list_empty(dsaddrs)) {
+		dprintk("%s: no addresses defined\n", __func__);
+		goto out;
+	}
+
+	ds = kzalloc(sizeof(*ds), gfp_flags);
+	if (!ds)
+		goto out;
+
+	/* this is only used for debugging, so it's ok if its NULL */
+	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
+
+	spin_lock(&nfs4_ds_cache_lock);
+	tmp_ds = _data_server_lookup_locked(dsaddrs);
+	if (tmp_ds == NULL) {
+		INIT_LIST_HEAD(&ds->ds_addrs);
+		list_splice_init(dsaddrs, &ds->ds_addrs);
+		ds->ds_remotestr = remotestr;
+		atomic_set(&ds->ds_count, 1);
+		INIT_LIST_HEAD(&ds->ds_node);
+		ds->ds_clp = NULL;
+		list_add(&ds->ds_node, &nfs4_data_server_cache);
+		dprintk("%s add new data server %s\n", __func__,
+			ds->ds_remotestr);
+	} else {
+		kfree(remotestr);
+		kfree(ds);
+		atomic_inc(&tmp_ds->ds_count);
+		dprintk("%s data server %s found, inc'ed ds_count to %d\n",
+			__func__, tmp_ds->ds_remotestr,
+			atomic_read(&tmp_ds->ds_count));
+		ds = tmp_ds;
+	}
+	spin_unlock(&nfs4_ds_cache_lock);
+out:
+	return ds;
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
+
+static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+{
+	might_sleep();
+	wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
+			TASK_KILLABLE);
+}
+
+static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
+{
+	smp_mb__before_atomic();
+	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
+	smp_mb__after_atomic();
+	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
+}
+
+static struct nfs_client *(*get_v3_ds_connect)(
+			struct nfs_client *mds_clp,
+			const struct sockaddr *ds_addr,
+			int ds_addrlen,
+			int ds_proto,
+			unsigned int ds_timeo,
+			unsigned int ds_retrans,
+			rpc_authflavor_t au_flavor);
+
+static bool load_v3_ds_connect(void)
+{
+	if (!get_v3_ds_connect) {
+		get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
+		WARN_ON_ONCE(!get_v3_ds_connect);
+	}
+
+	return(get_v3_ds_connect != NULL);
+}
+
+void nfs4_pnfs_v3_ds_connect_unload(void)
+{
+	if (get_v3_ds_connect) {
+		symbol_put(nfs3_set_ds_client);
+		get_v3_ds_connect = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
+
+static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
+				 struct nfs4_pnfs_ds *ds,
+				 unsigned int timeo,
+				 unsigned int retrans,
+				 rpc_authflavor_t au_flavor)
+{
+	struct nfs_client *clp = ERR_PTR(-EIO);
+	struct nfs4_pnfs_ds_addr *da;
+	int status = 0;
+
+	dprintk("--> %s DS %s au_flavor %d\n", __func__,
+		ds->ds_remotestr, au_flavor);
+
+	if (!load_v3_ds_connect())
+		goto out;
+
+	list_for_each_entry(da, &ds->ds_addrs, da_node) {
+		dprintk("%s: DS %s: trying address %s\n",
+			__func__, ds->ds_remotestr, da->da_remotestr);
+
+		clp = get_v3_ds_connect(mds_srv->nfs_client,
+					(struct sockaddr *)&da->da_addr,
+					da->da_addrlen, IPPROTO_TCP,
+					timeo, retrans, au_flavor);
+		if (!IS_ERR(clp))
+			break;
+	}
+
+	if (IS_ERR(clp)) {
+		status = PTR_ERR(clp);
+		goto out;
+	}
+
+	smp_wmb();
+	ds->ds_clp = clp;
+	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+out:
+	return status;
+}
+
+static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
+				 struct nfs4_pnfs_ds *ds,
+				 unsigned int timeo,
+				 unsigned int retrans,
+				 u32 minor_version,
+				 rpc_authflavor_t au_flavor)
+{
+	struct nfs_client *clp = ERR_PTR(-EIO);
+	struct nfs4_pnfs_ds_addr *da;
+	int status = 0;
+
+	dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
+		au_flavor);
+
+	list_for_each_entry(da, &ds->ds_addrs, da_node) {
+		dprintk("%s: DS %s: trying address %s\n",
+			__func__, ds->ds_remotestr, da->da_remotestr);
+
+		clp = nfs4_set_ds_client(mds_srv->nfs_client,
+					(struct sockaddr *)&da->da_addr,
+					da->da_addrlen, IPPROTO_TCP,
+					timeo, retrans, minor_version,
+					au_flavor);
+		if (!IS_ERR(clp))
+			break;
+	}
+
+	if (IS_ERR(clp)) {
+		status = PTR_ERR(clp);
+		goto out;
+	}
+
+	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
+	if (status)
+		goto out_put;
+
+	smp_wmb();
+	ds->ds_clp = clp;
+	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+out:
+	return status;
+out_put:
+	nfs_put_client(clp);
+	goto out;
+}
+
+/*
+ * Create an rpc connection to the nfs4_pnfs_ds data server.
+ * Currently only supports IPv4 and IPv6 addresses.
+ * If connection fails, make devid unavailable.
+ */
+void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
+			  struct nfs4_deviceid_node *devid, unsigned int timeo,
+			  unsigned int retrans, u32 version,
+			  u32 minor_version, rpc_authflavor_t au_flavor)
+{
+	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
+		int err = 0;
+
+		if (version == 3) {
+			err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
+						       retrans, au_flavor);
+		} else if (version == 4) {
+			err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
+						       retrans, minor_version,
+						       au_flavor);
+		} else {
+			dprintk("%s: unsupported DS version %d\n", __func__,
+				version);
+			err = -EPROTONOSUPPORT;
+		}
+
+		if (err)
+			nfs4_mark_deviceid_unavailable(devid);
+		nfs4_clear_ds_conn_bit(ds);
+	} else {
+		nfs4_wait_ds_connect(ds);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
+
+/*
+ * Currently only supports ipv4, ipv6 and one multi-path address.
+ */
+struct nfs4_pnfs_ds_addr *
+nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
+{
+	struct nfs4_pnfs_ds_addr *da = NULL;
+	char *buf, *portstr;
+	__be16 port;
+	int nlen, rlen;
+	int tmp[2];
+	__be32 *p;
+	char *netid, *match_netid;
+	size_t len, match_netid_len;
+	char *startsep = "";
+	char *endsep = "";
+
+
+	/* r_netid */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_err;
+	nlen = be32_to_cpup(p++);
+
+	p = xdr_inline_decode(xdr, nlen);
+	if (unlikely(!p))
+		goto out_err;
+
+	netid = kmalloc(nlen+1, gfp_flags);
+	if (unlikely(!netid))
+		goto out_err;
+
+	netid[nlen] = '\0';
+	memcpy(netid, p, nlen);
+
+	/* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_free_netid;
+	rlen = be32_to_cpup(p);
+
+	p = xdr_inline_decode(xdr, rlen);
+	if (unlikely(!p))
+		goto out_free_netid;
+
+	/* port is ".ABC.DEF", 8 chars max */
+	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
+		dprintk("%s: Invalid address, length %d\n", __func__,
+			rlen);
+		goto out_free_netid;
+	}
+	buf = kmalloc(rlen + 1, gfp_flags);
+	if (!buf) {
+		dprintk("%s: Not enough memory\n", __func__);
+		goto out_free_netid;
+	}
+	buf[rlen] = '\0';
+	memcpy(buf, p, rlen);
+
+	/* replace port '.' with '-' */
+	portstr = strrchr(buf, '.');
+	if (!portstr) {
+		dprintk("%s: Failed finding expected dot in port\n",
+			__func__);
+		goto out_free_buf;
+	}
+	*portstr = '-';
+
+	/* find '.' between address and port */
+	portstr = strrchr(buf, '.');
+	if (!portstr) {
+		dprintk("%s: Failed finding expected dot between address and "
+			"port\n", __func__);
+		goto out_free_buf;
+	}
+	*portstr = '\0';
+
+	da = kzalloc(sizeof(*da), gfp_flags);
+	if (unlikely(!da))
+		goto out_free_buf;
+
+	INIT_LIST_HEAD(&da->da_node);
+
+	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
+		      sizeof(da->da_addr))) {
+		dprintk("%s: error parsing address %s\n", __func__, buf);
+		goto out_free_da;
+	}
+
+	portstr++;
+	sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
+	port = htons((tmp[0] << 8) | (tmp[1]));
+
+	switch (da->da_addr.ss_family) {
+	case AF_INET:
+		((struct sockaddr_in *)&da->da_addr)->sin_port = port;
+		da->da_addrlen = sizeof(struct sockaddr_in);
+		match_netid = "tcp";
+		match_netid_len = 3;
+		break;
+
+	case AF_INET6:
+		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
+		da->da_addrlen = sizeof(struct sockaddr_in6);
+		match_netid = "tcp6";
+		match_netid_len = 4;
+		startsep = "[";
+		endsep = "]";
+		break;
+
+	default:
+		dprintk("%s: unsupported address family: %u\n",
+			__func__, da->da_addr.ss_family);
+		goto out_free_da;
+	}
+
+	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
+		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
+			__func__, netid, match_netid);
+		goto out_free_da;
+	}
+
+	/* save human readable address */
+	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
+	da->da_remotestr = kzalloc(len, gfp_flags);
+
+	/* NULL is ok, only used for dprintk */
+	if (da->da_remotestr)
+		snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
+			 buf, endsep, ntohs(port));
+
+	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
+	kfree(buf);
+	kfree(netid);
+	return da;
+
+out_free_da:
+	kfree(da);
+out_free_buf:
+	dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
+	kfree(buf);
+out_free_netid:
+	kfree(netid);
+out_err:
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
+
+void
+pnfs_layout_mark_request_commit(struct nfs_page *req,
+				struct pnfs_layout_segment *lseg,
+				struct nfs_commit_info *cinfo,
+				u32 ds_commit_idx)
+{
+	struct list_head *list;
+	struct pnfs_commit_bucket *buckets;
+
+	spin_lock(cinfo->lock);
+	buckets = cinfo->ds->buckets;
+	list = &buckets[ds_commit_idx].written;
+	if (list_empty(list)) {
+		/* Non-empty buckets hold a reference on the lseg.  That ref
+		 * is normally transferred to the COMMIT call and released
+		 * there.  It could also be released if the last req is pulled
+		 * off due to a rewrite, in which case it will be done in
+		 * pnfs_common_clear_request_commit
+		 */
+		WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
+		buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
+	}
+	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+	cinfo->ds->nwritten++;
+	spin_unlock(cinfo->lock);
+
+	nfs_request_add_commit_list(req, list, cinfo);
+}
+EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
+
+int
+pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
+{
+	if (datasync)
+		return 0;
+	return pnfs_layoutcommit_inode(inode, true);
+}
+EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync);
+
diff --git a/kernel/fs/nfs/proc.c b/kernel/fs/nfs/proc.c
new file mode 100644
index 000000000..b417bbcd9
--- /dev/null
+++ b/kernel/fs/nfs/proc.c
@@ -0,0 +1,749 @@
+/*
+ *  linux/fs/nfs/proc.c
+ *
+ *  Copyright (C) 1992, 1993, 1994  Rick Sladkey
+ *
+ *  OS-independent nfs remote procedure call functions
+ *
+ *  Tuned by Alan Cox <A.Cox@swansea.ac.uk> for >3K buffers
+ *  so at last we can have decent(ish) throughput off a 
+ *  Sun server.
+ *
+ *  Coding optimized and cleaned up by Florian La Roche.
+ *  Note: Error returns are optimized for NFS_OK, which isn't translated via
+ *  nfs_stat_to_errno(), but happens to be already the right return code.
+ *
+ *  Also, the code currently doesn't check the size of the packet, when
+ *  it decodes the packet.
+ *
+ *  Feel free to fix it and mail me the diffs if it worries you.
+ *
+ *  Completely rewritten to support the new RPC call interface;
+ *  rewrote and moved the entire XDR stuff to xdr.c
+ *  --Olaf Kirch June 1996
+ *
+ *  The code below initializes all auto variables explicitly, otherwise
+ *  it will fail to work as a module (gcc generates a memset call for an
+ *  incomplete struct).
+ */
+
+#include <linux/types.h>
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/lockd/bind.h>
+#include <linux/freezer.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PROC
+
+/*
+ * Bare-bones access to getattr: this is for nfs_read_super.
+ */
+static int
+nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		  struct nfs_fsinfo *info)
+{
+	struct nfs_fattr *fattr = info->fattr;
+	struct nfs2_fsstat fsinfo;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= fattr,
+	};
+	int status;
+
+	dprintk("%s: call getattr\n", __func__);
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(server->client, &msg, 0);
+	/* Retry with default authentication if different */
+	if (status && server->nfs_client->cl_rpcclient != server->client)
+		status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+	dprintk("%s: reply getattr: %d\n", __func__, status);
+	if (status)
+		return status;
+	dprintk("%s: call statfs\n", __func__);
+	msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
+	msg.rpc_resp = &fsinfo;
+	status = rpc_call_sync(server->client, &msg, 0);
+	/* Retry with default authentication if different */
+	if (status && server->nfs_client->cl_rpcclient != server->client)
+		status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+	dprintk("%s: reply statfs: %d\n", __func__, status);
+	if (status)
+		return status;
+	info->rtmax  = NFS_MAXDATA;
+	info->rtpref = fsinfo.tsize;
+	info->rtmult = fsinfo.bsize;
+	info->wtmax  = NFS_MAXDATA;
+	info->wtpref = fsinfo.tsize;
+	info->wtmult = fsinfo.bsize;
+	info->dtpref = fsinfo.tsize;
+	info->maxfilesize = 0x7FFFFFFF;
+	info->lease_time = 0;
+	return 0;
+}
+
+/*
+ * One function for each procedure in the NFS protocol.
+ */
+static int
+nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= fattr,
+	};
+	int	status;
+
+	dprintk("NFS call  getattr\n");
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply getattr: %d\n", status);
+	return status;
+}
+
+static int
+nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+		 struct iattr *sattr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct nfs_sattrargs	arg = { 
+		.fh	= NFS_FH(inode),
+		.sattr	= sattr
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_SETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= fattr,
+	};
+	int	status;
+
+	/* Mask out the non-modebit related stuff from attr->ia_mode */
+	sattr->ia_mode &= S_IALLUGO;
+
+	dprintk("NFS call  setattr\n");
+	if (sattr->ia_valid & ATTR_FILE)
+		msg.rpc_cred = nfs_file_cred(sattr->ia_file);
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	if (status == 0)
+		nfs_setattr_update_inode(inode, sattr, fattr);
+	dprintk("NFS reply setattr: %d\n", status);
+	return status;
+}
+
+static int
+nfs_proc_lookup(struct inode *dir, struct qstr *name,
+		struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		struct nfs4_label *label)
+{
+	struct nfs_diropargs	arg = {
+		.fh		= NFS_FH(dir),
+		.name		= name->name,
+		.len		= name->len
+	};
+	struct nfs_diropok	res = {
+		.fh		= fhandle,
+		.fattr		= fattr
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_LOOKUP],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	int			status;
+
+	dprintk("NFS call  lookup %s\n", name->name);
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	dprintk("NFS reply lookup: %d\n", status);
+	return status;
+}
+
+static int nfs_proc_readlink(struct inode *inode, struct page *page,
+		unsigned int pgbase, unsigned int pglen)
+{
+	struct nfs_readlinkargs	args = {
+		.fh		= NFS_FH(inode),
+		.pgbase		= pgbase,
+		.pglen		= pglen,
+		.pages		= &page
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_READLINK],
+		.rpc_argp	= &args,
+	};
+	int			status;
+
+	dprintk("NFS call  readlink\n");
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	dprintk("NFS reply readlink: %d\n", status);
+	return status;
+}
+
+struct nfs_createdata {
+	struct nfs_createargs arg;
+	struct nfs_diropok res;
+	struct nfs_fh fhandle;
+	struct nfs_fattr fattr;
+};
+
+static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
+		struct dentry *dentry, struct iattr *sattr)
+{
+	struct nfs_createdata *data;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+	if (data != NULL) {
+		data->arg.fh = NFS_FH(dir);
+		data->arg.name = dentry->d_name.name;
+		data->arg.len = dentry->d_name.len;
+		data->arg.sattr = sattr;
+		nfs_fattr_init(&data->fattr);
+		data->fhandle.size = 0;
+		data->res.fh = &data->fhandle;
+		data->res.fattr = &data->fattr;
+	}
+	return data;
+};
+
+static void nfs_free_createdata(const struct nfs_createdata *data)
+{
+	kfree(data);
+}
+
+static int
+nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+		int flags)
+{
+	struct nfs_createdata *data;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_CREATE],
+	};
+	int status = -ENOMEM;
+
+	dprintk("NFS call  create %pd\n", dentry);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_mark_for_revalidate(dir);
+	if (status == 0)
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+	nfs_free_createdata(data);
+out:
+	dprintk("NFS reply create: %d\n", status);
+	return status;
+}
+
+/*
+ * In NFSv2, mknod is grafted onto the create call.
+ */
+static int
+nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+	       dev_t rdev)
+{
+	struct nfs_createdata *data;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_CREATE],
+	};
+	umode_t mode;
+	int status = -ENOMEM;
+
+	dprintk("NFS call  mknod %pd\n", dentry);
+
+	mode = sattr->ia_mode;
+	if (S_ISFIFO(mode)) {
+		sattr->ia_mode = (mode & ~S_IFMT) | S_IFCHR;
+		sattr->ia_valid &= ~ATTR_SIZE;
+	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
+		sattr->ia_valid |= ATTR_SIZE;
+		sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
+	}
+
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_mark_for_revalidate(dir);
+
+	if (status == -EINVAL && S_ISFIFO(mode)) {
+		sattr->ia_mode = mode;
+		nfs_fattr_init(data->res.fattr);
+		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	}
+	if (status == 0)
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+	nfs_free_createdata(data);
+out:
+	dprintk("NFS reply mknod: %d\n", status);
+	return status;
+}
+  
+static int
+nfs_proc_remove(struct inode *dir, struct qstr *name)
+{
+	struct nfs_removeargs arg = {
+		.fh = NFS_FH(dir),
+		.name = *name,
+	};
+	struct rpc_message msg = { 
+		.rpc_proc = &nfs_procedures[NFSPROC_REMOVE],
+		.rpc_argp = &arg,
+	};
+	int			status;
+
+	dprintk("NFS call  remove %s\n", name->name);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_mark_for_revalidate(dir);
+
+	dprintk("NFS reply remove: %d\n", status);
+	return status;
+}
+
+static void
+nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];
+}
+
+static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+	rpc_call_start(task);
+}
+
+static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+	nfs_mark_for_revalidate(dir);
+	return 1;
+}
+
+static void
+nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
+}
+
+static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	rpc_call_start(task);
+}
+
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		     struct inode *new_dir)
+{
+	nfs_mark_for_revalidate(old_dir);
+	nfs_mark_for_revalidate(new_dir);
+	return 1;
+}
+
+static int
+nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+{
+	struct nfs_linkargs	arg = {
+		.fromfh		= NFS_FH(inode),
+		.tofh		= NFS_FH(dir),
+		.toname		= name->name,
+		.tolen		= name->len
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_LINK],
+		.rpc_argp	= &arg,
+	};
+	int			status;
+
+	dprintk("NFS call  link %s\n", name->name);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	nfs_mark_for_revalidate(inode);
+	nfs_mark_for_revalidate(dir);
+	dprintk("NFS reply link: %d\n", status);
+	return status;
+}
+
+static int
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+		 unsigned int len, struct iattr *sattr)
+{
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
+	struct nfs_symlinkargs	arg = {
+		.fromfh		= NFS_FH(dir),
+		.fromname	= dentry->d_name.name,
+		.fromlen	= dentry->d_name.len,
+		.pages		= &page,
+		.pathlen	= len,
+		.sattr		= sattr
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_SYMLINK],
+		.rpc_argp	= &arg,
+	};
+	int status = -ENAMETOOLONG;
+
+	dprintk("NFS call  symlink %pd\n", dentry);
+
+	if (len > NFS2_MAXPATHLEN)
+		goto out;
+
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	status = -ENOMEM;
+	if (fh == NULL || fattr == NULL)
+		goto out_free;
+
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_mark_for_revalidate(dir);
+
+	/*
+	 * V2 SYMLINK requests don't return any attributes.  Setting the
+	 * filehandle size to zero indicates to nfs_instantiate that it
+	 * should fill in the data with a LOOKUP call on the wire.
+	 */
+	if (status == 0)
+		status = nfs_instantiate(dentry, fh, fattr, NULL);
+
+out_free:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
+out:
+	dprintk("NFS reply symlink: %d\n", status);
+	return status;
+}
+
+static int
+nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
+{
+	struct nfs_createdata *data;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_MKDIR],
+	};
+	int status = -ENOMEM;
+
+	dprintk("NFS call  mkdir %pd\n", dentry);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_mark_for_revalidate(dir);
+	if (status == 0)
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+	nfs_free_createdata(data);
+out:
+	dprintk("NFS reply mkdir: %d\n", status);
+	return status;
+}
+
+static int
+nfs_proc_rmdir(struct inode *dir, struct qstr *name)
+{
+	struct nfs_diropargs	arg = {
+		.fh		= NFS_FH(dir),
+		.name		= name->name,
+		.len		= name->len
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_RMDIR],
+		.rpc_argp	= &arg,
+	};
+	int			status;
+
+	dprintk("NFS call  rmdir %s\n", name->name);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_mark_for_revalidate(dir);
+	dprintk("NFS reply rmdir: %d\n", status);
+	return status;
+}
+
+/*
+ * The READDIR implementation is somewhat hackish - we pass a temporary
+ * buffer to the encode function, which installs it in the receive
+ * the receive iovec. The decode function just parses the reply to make
+ * sure it is syntactically correct; the entries itself are decoded
+ * from nfs_readdir by calling the decode_entry function directly.
+ */
+static int
+nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+		 u64 cookie, struct page **pages, unsigned int count, int plus)
+{
+	struct inode		*dir = d_inode(dentry);
+	struct nfs_readdirargs	arg = {
+		.fh		= NFS_FH(dir),
+		.cookie		= cookie,
+		.count		= count,
+		.pages		= pages,
+	};
+	struct rpc_message	msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
+		.rpc_argp	= &arg,
+		.rpc_cred	= cred,
+	};
+	int			status;
+
+	dprintk("NFS call  readdir %d\n", (unsigned int)cookie);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+
+	nfs_invalidate_atime(dir);
+
+	dprintk("NFS reply readdir: %d\n", status);
+	return status;
+}
+
+static int
+nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+			struct nfs_fsstat *stat)
+{
+	struct nfs2_fsstat fsinfo;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_STATFS],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= &fsinfo,
+	};
+	int	status;
+
+	dprintk("NFS call  statfs\n");
+	nfs_fattr_init(stat->fattr);
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply statfs: %d\n", status);
+	if (status)
+		goto out;
+	stat->tbytes = (u64)fsinfo.blocks * fsinfo.bsize;
+	stat->fbytes = (u64)fsinfo.bfree  * fsinfo.bsize;
+	stat->abytes = (u64)fsinfo.bavail * fsinfo.bsize;
+	stat->tfiles = 0;
+	stat->ffiles = 0;
+	stat->afiles = 0;
+out:
+	return status;
+}
+
+static int
+nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+			struct nfs_fsinfo *info)
+{
+	struct nfs2_fsstat fsinfo;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_STATFS],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= &fsinfo,
+	};
+	int	status;
+
+	dprintk("NFS call  fsinfo\n");
+	nfs_fattr_init(info->fattr);
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply fsinfo: %d\n", status);
+	if (status)
+		goto out;
+	info->rtmax  = NFS_MAXDATA;
+	info->rtpref = fsinfo.tsize;
+	info->rtmult = fsinfo.bsize;
+	info->wtmax  = NFS_MAXDATA;
+	info->wtpref = fsinfo.tsize;
+	info->wtmult = fsinfo.bsize;
+	info->dtpref = fsinfo.tsize;
+	info->maxfilesize = 0x7FFFFFFF;
+	info->lease_time = 0;
+out:
+	return status;
+}
+
+static int
+nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+		  struct nfs_pathconf *info)
+{
+	info->max_link = 0;
+	info->max_namelen = NFS2_MAXNAMLEN;
+	return 0;
+}
+
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+
+	nfs_invalidate_atime(inode);
+	if (task->tk_status >= 0) {
+		nfs_refresh_inode(inode, hdr->res.fattr);
+		/* Emulate the eof flag, which isn't normally needed in NFSv2
+		 * as it is guaranteed to always return the file attributes
+		 */
+		if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+			hdr->res.eof = 1;
+	}
+	return 0;
+}
+
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
+				struct rpc_message *msg)
+{
+	msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
+}
+
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
+				     struct nfs_pgio_header *hdr)
+{
+	rpc_call_start(task);
+	return 0;
+}
+
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	if (task->tk_status >= 0)
+		nfs_writeback_update_inode(hdr);
+	return 0;
+}
+
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
+				 struct rpc_message *msg)
+{
+	/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
+	hdr->args.stable = NFS_FILE_SYNC;
+	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
+}
+
+static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	BUG();
+}
+
+static void
+nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
+{
+	BUG();
+}
+
+static int
+nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(filp);
+
+	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
+}
+
+/* Helper functions for NFS lock bounds checking */
+#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
+static int nfs_lock_check_bounds(const struct file_lock *fl)
+{
+	__s32 start, end;
+
+	start = (__s32)fl->fl_start;
+	if ((loff_t)start != fl->fl_start)
+		goto out_einval;
+
+	if (fl->fl_end != OFFSET_MAX) {
+		end = (__s32)fl->fl_end;
+		if ((loff_t)end != fl->fl_end)
+			goto out_einval;
+	} else
+		end = NFS_LOCK32_OFFSET_MAX;
+
+	if (start < 0 || start > end)
+		goto out_einval;
+	return 0;
+out_einval:
+	return -EINVAL;
+}
+
+static int nfs_have_delegation(struct inode *inode, fmode_t flags)
+{
+	return 0;
+}
+
+static int nfs_return_delegation(struct inode *inode)
+{
+	nfs_wb_all(inode);
+	return 0;
+}
+
+static const struct inode_operations nfs_dir_inode_operations = {
+	.create		= nfs_create,
+	.lookup		= nfs_lookup,
+	.link		= nfs_link,
+	.unlink		= nfs_unlink,
+	.symlink	= nfs_symlink,
+	.mkdir		= nfs_mkdir,
+	.rmdir		= nfs_rmdir,
+	.mknod		= nfs_mknod,
+	.rename		= nfs_rename,
+	.permission	= nfs_permission,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+};
+
+static const struct inode_operations nfs_file_inode_operations = {
+	.permission	= nfs_permission,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+};
+
+const struct nfs_rpc_ops nfs_v2_clientops = {
+	.version	= 2,		       /* protocol version */
+	.dentry_ops	= &nfs_dentry_operations,
+	.dir_inode_ops	= &nfs_dir_inode_operations,
+	.file_inode_ops	= &nfs_file_inode_operations,
+	.file_ops	= &nfs_file_operations,
+	.getroot	= nfs_proc_get_root,
+	.submount	= nfs_submount,
+	.try_mount	= nfs_try_mount,
+	.getattr	= nfs_proc_getattr,
+	.setattr	= nfs_proc_setattr,
+	.lookup		= nfs_proc_lookup,
+	.access		= NULL,		       /* access */
+	.readlink	= nfs_proc_readlink,
+	.create		= nfs_proc_create,
+	.remove		= nfs_proc_remove,
+	.unlink_setup	= nfs_proc_unlink_setup,
+	.unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
+	.unlink_done	= nfs_proc_unlink_done,
+	.rename_setup	= nfs_proc_rename_setup,
+	.rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
+	.rename_done	= nfs_proc_rename_done,
+	.link		= nfs_proc_link,
+	.symlink	= nfs_proc_symlink,
+	.mkdir		= nfs_proc_mkdir,
+	.rmdir		= nfs_proc_rmdir,
+	.readdir	= nfs_proc_readdir,
+	.mknod		= nfs_proc_mknod,
+	.statfs		= nfs_proc_statfs,
+	.fsinfo		= nfs_proc_fsinfo,
+	.pathconf	= nfs_proc_pathconf,
+	.decode_dirent	= nfs2_decode_dirent,
+	.pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare,
+	.read_setup	= nfs_proc_read_setup,
+	.read_done	= nfs_read_done,
+	.write_setup	= nfs_proc_write_setup,
+	.write_done	= nfs_write_done,
+	.commit_setup	= nfs_proc_commit_setup,
+	.commit_rpc_prepare = nfs_proc_commit_rpc_prepare,
+	.lock		= nfs_proc_lock,
+	.lock_check_bounds = nfs_lock_check_bounds,
+	.close_context	= nfs_close_context,
+	.have_delegation = nfs_have_delegation,
+	.return_delegation = nfs_return_delegation,
+	.alloc_client	= nfs_alloc_client,
+	.init_client	= nfs_init_client,
+	.free_client	= nfs_free_client,
+	.create_server	= nfs_create_server,
+	.clone_server	= nfs_clone_server,
+};
diff --git a/kernel/fs/nfs/read.c b/kernel/fs/nfs/read.c
new file mode 100644
index 000000000..ae0ff7a11
--- /dev/null
+++ b/kernel/fs/nfs/read.c
@@ -0,0 +1,445 @@
+/*
+ * linux/fs/nfs/read.c
+ *
+ * Block I/O for NFS
+ *
+ * Partial copy of Linus' read cache modifications to fs/nfs/file.c
+ * modified for async RPC by okir@monad.swb.de
+ */
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PAGECACHE
+
+static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+static const struct nfs_rw_ops nfs_rw_read_ops;
+
+static struct kmem_cache *nfs_rdata_cachep;
+
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
+{
+	return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+}
+
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
+{
+	kmem_cache_free(nfs_rdata_cachep, rhdr);
+}
+
+static
+int nfs_return_empty_page(struct page *page)
+{
+	zero_user(page, 0, PAGE_CACHE_SIZE);
+	SetPageUptodate(page);
+	unlock_page(page);
+	return 0;
+}
+
+void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+			      struct inode *inode, bool force_mds,
+			      const struct nfs_pgio_completion_ops *compl_ops)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+	if (server->pnfs_curr_ld && !force_mds)
+		pg_ops = server->pnfs_curr_ld->pg_read_ops;
+#endif
+	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops,
+			server->rsize, 0);
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
+
+void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
+{
+	struct nfs_pgio_mirror *mirror;
+
+	pgio->pg_ops = &nfs_pgio_rw_ops;
+
+	/* read path should never have more than one mirror */
+	WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+	mirror = &pgio->pg_mirrors[0];
+	mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+
+int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+		       struct page *page)
+{
+	struct nfs_page	*new;
+	unsigned int len;
+	struct nfs_pageio_descriptor pgio;
+	struct nfs_pgio_mirror *pgm;
+
+	len = nfs_page_length(page);
+	if (len == 0)
+		return nfs_return_empty_page(page);
+	new = nfs_create_request(ctx, page, NULL, 0, len);
+	if (IS_ERR(new)) {
+		unlock_page(page);
+		return PTR_ERR(new);
+	}
+	if (len < PAGE_CACHE_SIZE)
+		zero_user_segment(page, len, PAGE_CACHE_SIZE);
+
+	nfs_pageio_init_read(&pgio, inode, false,
+			     &nfs_async_read_completion_ops);
+	nfs_pageio_add_request(&pgio, new);
+	nfs_pageio_complete(&pgio);
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+	pgm = &pgio.pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+
+	return 0;
+}
+
+static void nfs_readpage_release(struct nfs_page *req)
+{
+	struct inode *inode = d_inode(req->wb_context->dentry);
+
+	dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+		(long long)req_offset(req));
+
+	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+		if (PageUptodate(req->wb_page))
+			nfs_readpage_to_fscache(inode, req->wb_page, 0);
+
+		unlock_page(req->wb_page);
+	}
+	nfs_release_request(req);
+}
+
+static void nfs_page_group_set_uptodate(struct nfs_page *req)
+{
+	if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
+		SetPageUptodate(req->wb_page);
+}
+
+static void nfs_read_completion(struct nfs_pgio_header *hdr)
+{
+	unsigned long bytes = 0;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out;
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+		struct page *page = req->wb_page;
+		unsigned long start = req->wb_pgbase;
+		unsigned long end = req->wb_pgbase + req->wb_bytes;
+
+		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+			/* note: regions of the page not covered by a
+			 * request are zeroed in nfs_readpage_async /
+			 * readpage_async_filler */
+			if (bytes > hdr->good_bytes) {
+				/* nothing in this request was good, so zero
+				 * the full extent of the request */
+				zero_user_segment(page, start, end);
+
+			} else if (hdr->good_bytes - bytes < req->wb_bytes) {
+				/* part of this request has good bytes, but
+				 * not all. zero the bad bytes */
+				start += hdr->good_bytes - bytes;
+				WARN_ON(start < req->wb_pgbase);
+				zero_user_segment(page, start, end);
+			}
+		}
+		bytes += req->wb_bytes;
+		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+			if (bytes <= hdr->good_bytes)
+				nfs_page_group_set_uptodate(req);
+		} else
+			nfs_page_group_set_uptodate(req);
+		nfs_list_remove_request(req);
+		nfs_readpage_release(req);
+	}
+out:
+	hdr->release(hdr);
+}
+
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
+			      struct rpc_message *msg,
+			      const struct nfs_rpc_ops *rpc_ops,
+			      struct rpc_task_setup *task_setup_data, int how)
+{
+	struct inode *inode = hdr->inode;
+	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
+
+	task_setup_data->flags |= swap_flags;
+	rpc_ops->read_setup(hdr, msg);
+}
+
+static void
+nfs_async_read_error(struct list_head *head)
+{
+	struct nfs_page	*req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_readpage_release(req);
+	}
+}
+
+static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+	.error_cleanup = nfs_async_read_error,
+	.completion = nfs_read_completion,
+};
+
+/*
+ * This is the callback from RPC telling us whether a reply was
+ * received or some error occurred (timeout or socket shutdown).
+ */
+static int nfs_readpage_done(struct rpc_task *task,
+			     struct nfs_pgio_header *hdr,
+			     struct inode *inode)
+{
+	int status = NFS_PROTO(inode)->read_done(task, hdr);
+	if (status != 0)
+		return status;
+
+	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
+
+	if (task->tk_status == -ESTALE) {
+		set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+		nfs_mark_for_revalidate(inode);
+	}
+	return 0;
+}
+
+static void nfs_readpage_retry(struct rpc_task *task,
+			       struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_args *argp = &hdr->args;
+	struct nfs_pgio_res  *resp = &hdr->res;
+
+	/* This is a short read! */
+	nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
+	/* Has the server at least made some progress? */
+	if (resp->count == 0) {
+		nfs_set_pgio_error(hdr, -EIO, argp->offset);
+		return;
+	}
+	/* Yes, so retry the read at the end of the hdr */
+	hdr->mds_offset += resp->count;
+	argp->offset += resp->count;
+	argp->pgbase += resp->count;
+	argp->count -= resp->count;
+	rpc_restart_call_prepare(task);
+}
+
+static void nfs_readpage_result(struct rpc_task *task,
+				struct nfs_pgio_header *hdr)
+{
+	if (hdr->res.eof) {
+		loff_t bound;
+
+		bound = hdr->args.offset + hdr->res.count;
+		spin_lock(&hdr->lock);
+		if (bound < hdr->io_start + hdr->good_bytes) {
+			set_bit(NFS_IOHDR_EOF, &hdr->flags);
+			clear_bit(NFS_IOHDR_ERROR, &hdr->flags);
+			hdr->good_bytes = bound - hdr->io_start;
+		}
+		spin_unlock(&hdr->lock);
+	} else if (hdr->res.count != hdr->args.count)
+		nfs_readpage_retry(task, hdr);
+}
+
+/*
+ * Read a page over NFS.
+ * We read the page synchronously in the following case:
+ *  -	The error flag is set for this page. This happens only when a
+ *	previous async read operation failed.
+ */
+int nfs_readpage(struct file *file, struct page *page)
+{
+	struct nfs_open_context *ctx;
+	struct inode *inode = page_file_mapping(page)->host;
+	int		error;
+
+	dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
+		page, PAGE_CACHE_SIZE, page_file_index(page));
+	nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
+	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
+
+	/*
+	 * Try to flush any pending writes to the file..
+	 *
+	 * NOTE! Because we own the page lock, there cannot
+	 * be any new pending writes generated at this point
+	 * for this page (other pages can be written to).
+	 */
+	error = nfs_wb_page(inode, page);
+	if (error)
+		goto out_unlock;
+	if (PageUptodate(page))
+		goto out_unlock;
+
+	error = -ESTALE;
+	if (NFS_STALE(inode))
+		goto out_unlock;
+
+	if (file == NULL) {
+		error = -EBADF;
+		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+		if (ctx == NULL)
+			goto out_unlock;
+	} else
+		ctx = get_nfs_open_context(nfs_file_open_context(file));
+
+	if (!IS_SYNC(inode)) {
+		error = nfs_readpage_from_fscache(ctx, inode, page);
+		if (error == 0)
+			goto out;
+	}
+
+	error = nfs_readpage_async(ctx, inode, page);
+
+out:
+	put_nfs_open_context(ctx);
+	return error;
+out_unlock:
+	unlock_page(page);
+	return error;
+}
+
+struct nfs_readdesc {
+	struct nfs_pageio_descriptor *pgio;
+	struct nfs_open_context *ctx;
+};
+
+static int
+readpage_async_filler(void *data, struct page *page)
+{
+	struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
+	struct nfs_page *new;
+	unsigned int len;
+	int error;
+
+	len = nfs_page_length(page);
+	if (len == 0)
+		return nfs_return_empty_page(page);
+
+	new = nfs_create_request(desc->ctx, page, NULL, 0, len);
+	if (IS_ERR(new))
+		goto out_error;
+
+	if (len < PAGE_CACHE_SIZE)
+		zero_user_segment(page, len, PAGE_CACHE_SIZE);
+	if (!nfs_pageio_add_request(desc->pgio, new)) {
+		error = desc->pgio->pg_error;
+		goto out_unlock;
+	}
+	return 0;
+out_error:
+	error = PTR_ERR(new);
+out_unlock:
+	unlock_page(page);
+	return error;
+}
+
+int nfs_readpages(struct file *filp, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	struct nfs_pageio_descriptor pgio;
+	struct nfs_pgio_mirror *pgm;
+	struct nfs_readdesc desc = {
+		.pgio = &pgio,
+	};
+	struct inode *inode = mapping->host;
+	unsigned long npages;
+	int ret = -ESTALE;
+
+	dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
+			inode->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(inode),
+			nr_pages);
+	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
+
+	if (NFS_STALE(inode))
+		goto out;
+
+	if (filp == NULL) {
+		desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+		if (desc.ctx == NULL)
+			return -EBADF;
+	} else
+		desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
+
+	/* attempt to read as many of the pages as possible from the cache
+	 * - this returns -ENOBUFS immediately if the cookie is negative
+	 */
+	ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
+					 pages, &nr_pages);
+	if (ret == 0)
+		goto read_complete; /* all pages were read */
+
+	nfs_pageio_init_read(&pgio, inode, false,
+			     &nfs_async_read_completion_ops);
+
+	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+	nfs_pageio_complete(&pgio);
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+	pgm = &pgio.pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+	npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
+		 PAGE_CACHE_SHIFT;
+	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+read_complete:
+	put_nfs_open_context(desc.ctx);
+out:
+	return ret;
+}
+
+int __init nfs_init_readpagecache(void)
+{
+	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
+					     sizeof(struct nfs_pgio_header),
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL);
+	if (nfs_rdata_cachep == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void nfs_destroy_readpagecache(void)
+{
+	kmem_cache_destroy(nfs_rdata_cachep);
+}
+
+static const struct nfs_rw_ops nfs_rw_read_ops = {
+	.rw_mode		= FMODE_READ,
+	.rw_alloc_header	= nfs_readhdr_alloc,
+	.rw_free_header		= nfs_readhdr_free,
+	.rw_done		= nfs_readpage_done,
+	.rw_result		= nfs_readpage_result,
+	.rw_initiate		= nfs_initiate_read,
+};
diff --git a/kernel/fs/nfs/super.c b/kernel/fs/nfs/super.c
new file mode 100644
index 000000000..f175b833b
--- /dev/null
+++ b/kernel/fs/nfs/super.c
@@ -0,0 +1,2877 @@
+/*
+ *  linux/fs/nfs/super.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  nfs superblock handling functions
+ *
+ *  Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
+ *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ *  J.S.Peatfield@damtp.cam.ac.uk
+ *
+ *  Split from inode.c by David Howells <dhowells@redhat.com>
+ *
+ * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a
+ *   particular server are held in the same superblock
+ * - NFS superblocks can have several effective roots to the dentry tree
+ * - directory type roots are spliced into the tree when a path from one root reaches the root
+ *   of another (see nfs_lookup())
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/nfs_xdr.h>
+#include <linux/magic.h>
+#include <linux/parser.h>
+#include <linux/nsproxy.h>
+#include <linux/rcupdate.h>
+
+#include <asm/uaccess.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "nfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+#define NFS_TEXT_DATA		1
+
+#if IS_ENABLED(CONFIG_NFS_V3)
+#define NFS_DEFAULT_VERSION 3
+#else
+#define NFS_DEFAULT_VERSION 2
+#endif
+
+enum {
+	/* Mount options that take no arguments */
+	Opt_soft, Opt_hard,
+	Opt_posix, Opt_noposix,
+	Opt_cto, Opt_nocto,
+	Opt_ac, Opt_noac,
+	Opt_lock, Opt_nolock,
+	Opt_udp, Opt_tcp, Opt_rdma,
+	Opt_acl, Opt_noacl,
+	Opt_rdirplus, Opt_nordirplus,
+	Opt_sharecache, Opt_nosharecache,
+	Opt_resvport, Opt_noresvport,
+	Opt_fscache, Opt_nofscache,
+	Opt_migration, Opt_nomigration,
+
+	/* Mount options that take integer arguments */
+	Opt_port,
+	Opt_rsize, Opt_wsize, Opt_bsize,
+	Opt_timeo, Opt_retrans,
+	Opt_acregmin, Opt_acregmax,
+	Opt_acdirmin, Opt_acdirmax,
+	Opt_actimeo,
+	Opt_namelen,
+	Opt_mountport,
+	Opt_mountvers,
+	Opt_minorversion,
+
+	/* Mount options that take string arguments */
+	Opt_nfsvers,
+	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
+	Opt_addr, Opt_mountaddr, Opt_clientaddr,
+	Opt_lookupcache,
+	Opt_fscache_uniq,
+	Opt_local_lock,
+
+	/* Special mount options */
+	Opt_userspace, Opt_deprecated, Opt_sloppy,
+
+	Opt_err
+};
+
+static const match_table_t nfs_mount_option_tokens = {
+	{ Opt_userspace, "bg" },
+	{ Opt_userspace, "fg" },
+	{ Opt_userspace, "retry=%s" },
+
+	{ Opt_sloppy, "sloppy" },
+
+	{ Opt_soft, "soft" },
+	{ Opt_hard, "hard" },
+	{ Opt_deprecated, "intr" },
+	{ Opt_deprecated, "nointr" },
+	{ Opt_posix, "posix" },
+	{ Opt_noposix, "noposix" },
+	{ Opt_cto, "cto" },
+	{ Opt_nocto, "nocto" },
+	{ Opt_ac, "ac" },
+	{ Opt_noac, "noac" },
+	{ Opt_lock, "lock" },
+	{ Opt_nolock, "nolock" },
+	{ Opt_udp, "udp" },
+	{ Opt_tcp, "tcp" },
+	{ Opt_rdma, "rdma" },
+	{ Opt_acl, "acl" },
+	{ Opt_noacl, "noacl" },
+	{ Opt_rdirplus, "rdirplus" },
+	{ Opt_nordirplus, "nordirplus" },
+	{ Opt_sharecache, "sharecache" },
+	{ Opt_nosharecache, "nosharecache" },
+	{ Opt_resvport, "resvport" },
+	{ Opt_noresvport, "noresvport" },
+	{ Opt_fscache, "fsc" },
+	{ Opt_nofscache, "nofsc" },
+	{ Opt_migration, "migration" },
+	{ Opt_nomigration, "nomigration" },
+
+	{ Opt_port, "port=%s" },
+	{ Opt_rsize, "rsize=%s" },
+	{ Opt_wsize, "wsize=%s" },
+	{ Opt_bsize, "bsize=%s" },
+	{ Opt_timeo, "timeo=%s" },
+	{ Opt_retrans, "retrans=%s" },
+	{ Opt_acregmin, "acregmin=%s" },
+	{ Opt_acregmax, "acregmax=%s" },
+	{ Opt_acdirmin, "acdirmin=%s" },
+	{ Opt_acdirmax, "acdirmax=%s" },
+	{ Opt_actimeo, "actimeo=%s" },
+	{ Opt_namelen, "namlen=%s" },
+	{ Opt_mountport, "mountport=%s" },
+	{ Opt_mountvers, "mountvers=%s" },
+	{ Opt_minorversion, "minorversion=%s" },
+
+	{ Opt_nfsvers, "nfsvers=%s" },
+	{ Opt_nfsvers, "vers=%s" },
+
+	{ Opt_sec, "sec=%s" },
+	{ Opt_proto, "proto=%s" },
+	{ Opt_mountproto, "mountproto=%s" },
+	{ Opt_addr, "addr=%s" },
+	{ Opt_clientaddr, "clientaddr=%s" },
+	{ Opt_mounthost, "mounthost=%s" },
+	{ Opt_mountaddr, "mountaddr=%s" },
+
+	{ Opt_lookupcache, "lookupcache=%s" },
+	{ Opt_fscache_uniq, "fsc=%s" },
+	{ Opt_local_lock, "local_lock=%s" },
+
+	/* The following needs to be listed after all other options */
+	{ Opt_nfsvers, "v%s" },
+
+	{ Opt_err, NULL }
+};
+
+enum {
+	Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
+
+	Opt_xprt_err
+};
+
+static const match_table_t nfs_xprt_protocol_tokens = {
+	{ Opt_xprt_udp, "udp" },
+	{ Opt_xprt_udp6, "udp6" },
+	{ Opt_xprt_tcp, "tcp" },
+	{ Opt_xprt_tcp6, "tcp6" },
+	{ Opt_xprt_rdma, "rdma" },
+
+	{ Opt_xprt_err, NULL }
+};
+
+enum {
+	Opt_sec_none, Opt_sec_sys,
+	Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p,
+	Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp,
+	Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp,
+
+	Opt_sec_err
+};
+
+static const match_table_t nfs_secflavor_tokens = {
+	{ Opt_sec_none, "none" },
+	{ Opt_sec_none, "null" },
+	{ Opt_sec_sys, "sys" },
+
+	{ Opt_sec_krb5, "krb5" },
+	{ Opt_sec_krb5i, "krb5i" },
+	{ Opt_sec_krb5p, "krb5p" },
+
+	{ Opt_sec_lkey, "lkey" },
+	{ Opt_sec_lkeyi, "lkeyi" },
+	{ Opt_sec_lkeyp, "lkeyp" },
+
+	{ Opt_sec_spkm, "spkm3" },
+	{ Opt_sec_spkmi, "spkm3i" },
+	{ Opt_sec_spkmp, "spkm3p" },
+
+	{ Opt_sec_err, NULL }
+};
+
+enum {
+	Opt_lookupcache_all, Opt_lookupcache_positive,
+	Opt_lookupcache_none,
+
+	Opt_lookupcache_err
+};
+
+static match_table_t nfs_lookupcache_tokens = {
+	{ Opt_lookupcache_all, "all" },
+	{ Opt_lookupcache_positive, "pos" },
+	{ Opt_lookupcache_positive, "positive" },
+	{ Opt_lookupcache_none, "none" },
+
+	{ Opt_lookupcache_err, NULL }
+};
+
+enum {
+	Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
+	Opt_local_lock_none,
+
+	Opt_local_lock_err
+};
+
+static match_table_t nfs_local_lock_tokens = {
+	{ Opt_local_lock_all, "all" },
+	{ Opt_local_lock_flock, "flock" },
+	{ Opt_local_lock_posix, "posix" },
+	{ Opt_local_lock_none, "none" },
+
+	{ Opt_local_lock_err, NULL }
+};
+
+enum {
+	Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
+	Opt_vers_4_1, Opt_vers_4_2,
+
+	Opt_vers_err
+};
+
+static match_table_t nfs_vers_tokens = {
+	{ Opt_vers_2, "2" },
+	{ Opt_vers_3, "3" },
+	{ Opt_vers_4, "4" },
+	{ Opt_vers_4_0, "4.0" },
+	{ Opt_vers_4_1, "4.1" },
+	{ Opt_vers_4_2, "4.2" },
+
+	{ Opt_vers_err, NULL }
+};
+
+static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
+
+struct file_system_type nfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.mount		= nfs_fs_mount,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("nfs");
+EXPORT_SYMBOL_GPL(nfs_fs_type);
+
+struct file_system_type nfs_xdev_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.mount		= nfs_xdev_mount,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+
+const struct super_operations nfs_sops = {
+	.alloc_inode	= nfs_alloc_inode,
+	.destroy_inode	= nfs_destroy_inode,
+	.write_inode	= nfs_write_inode,
+	.drop_inode	= nfs_drop_inode,
+	.statfs		= nfs_statfs,
+	.evict_inode	= nfs_evict_inode,
+	.umount_begin	= nfs_umount_begin,
+	.show_options	= nfs_show_options,
+	.show_devname	= nfs_show_devname,
+	.show_path	= nfs_show_path,
+	.show_stats	= nfs_show_stats,
+	.remount_fs	= nfs_remount,
+};
+EXPORT_SYMBOL_GPL(nfs_sops);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *);
+static int nfs4_validate_mount_data(void *options,
+	struct nfs_parsed_mount_data *args, const char *dev_name);
+
+struct file_system_type nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.mount		= nfs_fs_mount,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("nfs4");
+MODULE_ALIAS("nfs4");
+EXPORT_SYMBOL_GPL(nfs4_fs_type);
+
+static int __init register_nfs4_fs(void)
+{
+	return register_filesystem(&nfs4_fs_type);
+}
+
+static void unregister_nfs4_fs(void)
+{
+	unregister_filesystem(&nfs4_fs_type);
+}
+#else
+static int __init register_nfs4_fs(void)
+{
+	return 0;
+}
+
+static void unregister_nfs4_fs(void)
+{
+}
+#endif
+
+static struct shrinker acl_shrinker = {
+	.count_objects	= nfs_access_cache_count,
+	.scan_objects	= nfs_access_cache_scan,
+	.seeks		= DEFAULT_SEEKS,
+};
+
+/*
+ * Register the NFS filesystems
+ */
+int __init register_nfs_fs(void)
+{
+	int ret;
+
+        ret = register_filesystem(&nfs_fs_type);
+	if (ret < 0)
+		goto error_0;
+
+	ret = register_nfs4_fs();
+	if (ret < 0)
+		goto error_1;
+
+	ret = nfs_register_sysctl();
+	if (ret < 0)
+		goto error_2;
+	register_shrinker(&acl_shrinker);
+	return 0;
+
+error_2:
+	unregister_nfs4_fs();
+error_1:
+	unregister_filesystem(&nfs_fs_type);
+error_0:
+	return ret;
+}
+
+/*
+ * Unregister the NFS filesystems
+ */
+void __exit unregister_nfs_fs(void)
+{
+	unregister_shrinker(&acl_shrinker);
+	nfs_unregister_sysctl();
+	unregister_nfs4_fs();
+	unregister_filesystem(&nfs_fs_type);
+}
+
+bool nfs_sb_active(struct super_block *sb)
+{
+	struct nfs_server *server = NFS_SB(sb);
+
+	if (!atomic_inc_not_zero(&sb->s_active))
+		return false;
+	if (atomic_inc_return(&server->active) != 1)
+		atomic_dec(&sb->s_active);
+	return true;
+}
+EXPORT_SYMBOL_GPL(nfs_sb_active);
+
+void nfs_sb_deactive(struct super_block *sb)
+{
+	struct nfs_server *server = NFS_SB(sb);
+
+	if (atomic_dec_and_test(&server->active))
+		deactivate_super(sb);
+}
+EXPORT_SYMBOL_GPL(nfs_sb_deactive);
+
+/*
+ * Deliver file system statistics to userspace
+ */
+int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct nfs_server *server = NFS_SB(dentry->d_sb);
+	unsigned char blockbits;
+	unsigned long blockres;
+	struct nfs_fh *fh = NFS_FH(d_inode(dentry));
+	struct nfs_fsstat res;
+	int error = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out_err;
+
+	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+	if (unlikely(error == -ESTALE)) {
+		struct dentry *pd_dentry;
+
+		pd_dentry = dget_parent(dentry);
+		if (pd_dentry != NULL) {
+			nfs_zap_caches(d_inode(pd_dentry));
+			dput(pd_dentry);
+		}
+	}
+	nfs_free_fattr(res.fattr);
+	if (error < 0)
+		goto out_err;
+
+	buf->f_type = NFS_SUPER_MAGIC;
+
+	/*
+	 * Current versions of glibc do not correctly handle the
+	 * case where f_frsize != f_bsize.  Eventually we want to
+	 * report the value of wtmult in this field.
+	 */
+	buf->f_frsize = dentry->d_sb->s_blocksize;
+
+	/*
+	 * On most *nix systems, f_blocks, f_bfree, and f_bavail
+	 * are reported in units of f_frsize.  Linux hasn't had
+	 * an f_frsize field in its statfs struct until recently,
+	 * thus historically Linux's sys_statfs reports these
+	 * fields in units of f_bsize.
+	 */
+	buf->f_bsize = dentry->d_sb->s_blocksize;
+	blockbits = dentry->d_sb->s_blocksize_bits;
+	blockres = (1 << blockbits) - 1;
+	buf->f_blocks = (res.tbytes + blockres) >> blockbits;
+	buf->f_bfree = (res.fbytes + blockres) >> blockbits;
+	buf->f_bavail = (res.abytes + blockres) >> blockbits;
+
+	buf->f_files = res.tfiles;
+	buf->f_ffree = res.afiles;
+
+	buf->f_namelen = server->namelen;
+
+	return 0;
+
+ out_err:
+	dprintk("%s: statfs error = %d\n", __func__, -error);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_statfs);
+
+/*
+ * Map the security flavour number to a name
+ */
+static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
+{
+	static const struct {
+		rpc_authflavor_t flavour;
+		const char *str;
+	} sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = {
+		/* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */
+		{ RPC_AUTH_NULL, "null" },
+		{ RPC_AUTH_UNIX, "sys" },
+		{ RPC_AUTH_GSS_KRB5, "krb5" },
+		{ RPC_AUTH_GSS_KRB5I, "krb5i" },
+		{ RPC_AUTH_GSS_KRB5P, "krb5p" },
+		{ RPC_AUTH_GSS_LKEY, "lkey" },
+		{ RPC_AUTH_GSS_LKEYI, "lkeyi" },
+		{ RPC_AUTH_GSS_LKEYP, "lkeyp" },
+		{ RPC_AUTH_GSS_SPKM, "spkm" },
+		{ RPC_AUTH_GSS_SPKMI, "spkmi" },
+		{ RPC_AUTH_GSS_SPKMP, "spkmp" },
+		{ UINT_MAX, "unknown" }
+	};
+	int i;
+
+	for (i = 0; sec_flavours[i].flavour != UINT_MAX; i++) {
+		if (sec_flavours[i].flavour == flavour)
+			break;
+	}
+	return sec_flavours[i].str;
+}
+
+static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss,
+				  int showdefaults)
+{
+	struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address;
+
+	seq_printf(m, ",mountproto=");
+	switch (sap->sa_family) {
+	case AF_INET:
+		switch (nfss->mountd_protocol) {
+		case IPPROTO_UDP:
+			seq_printf(m, RPCBIND_NETID_UDP);
+			break;
+		case IPPROTO_TCP:
+			seq_printf(m, RPCBIND_NETID_TCP);
+			break;
+		default:
+			if (showdefaults)
+				seq_printf(m, "auto");
+		}
+		break;
+	case AF_INET6:
+		switch (nfss->mountd_protocol) {
+		case IPPROTO_UDP:
+			seq_printf(m, RPCBIND_NETID_UDP6);
+			break;
+		case IPPROTO_TCP:
+			seq_printf(m, RPCBIND_NETID_TCP6);
+			break;
+		default:
+			if (showdefaults)
+				seq_printf(m, "auto");
+		}
+		break;
+	default:
+		if (showdefaults)
+			seq_printf(m, "auto");
+	}
+}
+
+static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
+				    int showdefaults)
+{
+	struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
+
+	if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
+		return;
+
+	switch (sap->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+		seq_printf(m, ",mountaddr=%pI4", &sin->sin_addr.s_addr);
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+		seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr);
+		break;
+	}
+	default:
+		if (showdefaults)
+			seq_printf(m, ",mountaddr=unspecified");
+	}
+
+	if (nfss->mountd_version || showdefaults)
+		seq_printf(m, ",mountvers=%u", nfss->mountd_version);
+	if ((nfss->mountd_port &&
+		nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
+		showdefaults)
+		seq_printf(m, ",mountport=%u", nfss->mountd_port);
+
+	nfs_show_mountd_netid(m, nfss, showdefaults);
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+				    int showdefaults)
+{
+	struct nfs_client *clp = nfss->nfs_client;
+
+	seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
+}
+#else
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+				    int showdefaults)
+{
+}
+#endif
+
+static void nfs_show_nfs_version(struct seq_file *m,
+		unsigned int version,
+		unsigned int minorversion)
+{
+	seq_printf(m, ",vers=%u", version);
+	if (version == 4)
+		seq_printf(m, ".%u", minorversion);
+}
+
+/*
+ * Describe the mount options in force on this server representation
+ */
+static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
+				   int showdefaults)
+{
+	static const struct proc_nfs_info {
+		int flag;
+		const char *str;
+		const char *nostr;
+	} nfs_info[] = {
+		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
+		{ NFS_MOUNT_POSIX, ",posix", "" },
+		{ NFS_MOUNT_NOCTO, ",nocto", "" },
+		{ NFS_MOUNT_NOAC, ",noac", "" },
+		{ NFS_MOUNT_NONLM, ",nolock", "" },
+		{ NFS_MOUNT_NOACL, ",noacl", "" },
+		{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
+		{ NFS_MOUNT_UNSHARED, ",nosharecache", "" },
+		{ NFS_MOUNT_NORESVPORT, ",noresvport", "" },
+		{ 0, NULL, NULL }
+	};
+	const struct proc_nfs_info *nfs_infop;
+	struct nfs_client *clp = nfss->nfs_client;
+	u32 version = clp->rpc_ops->version;
+	int local_flock, local_fcntl;
+
+	nfs_show_nfs_version(m, version, clp->cl_minorversion);
+	seq_printf(m, ",rsize=%u", nfss->rsize);
+	seq_printf(m, ",wsize=%u", nfss->wsize);
+	if (nfss->bsize != 0)
+		seq_printf(m, ",bsize=%u", nfss->bsize);
+	seq_printf(m, ",namlen=%u", nfss->namelen);
+	if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
+		seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
+	if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
+		seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
+	if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
+		seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
+	if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
+		seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
+	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
+		if (nfss->flags & nfs_infop->flag)
+			seq_puts(m, nfs_infop->str);
+		else
+			seq_puts(m, nfs_infop->nostr);
+	}
+	rcu_read_lock();
+	seq_printf(m, ",proto=%s",
+		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
+	rcu_read_unlock();
+	if (version == 4) {
+		if (nfss->port != NFS_PORT)
+			seq_printf(m, ",port=%u", nfss->port);
+	} else
+		if (nfss->port)
+			seq_printf(m, ",port=%u", nfss->port);
+
+	seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ);
+	seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries);
+	seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
+
+	if (version != 4)
+		nfs_show_mountd_options(m, nfss, showdefaults);
+	else
+		nfs_show_nfsv4_options(m, nfss, showdefaults);
+
+	if (nfss->options & NFS_OPTION_FSCACHE)
+		seq_printf(m, ",fsc");
+
+	if (nfss->options & NFS_OPTION_MIGRATION)
+		seq_printf(m, ",migration");
+
+	if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+		if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+			seq_printf(m, ",lookupcache=none");
+		else
+			seq_printf(m, ",lookupcache=pos");
+	}
+
+	local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+	local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+
+	if (!local_flock && !local_fcntl)
+		seq_printf(m, ",local_lock=none");
+	else if (local_flock && local_fcntl)
+		seq_printf(m, ",local_lock=all");
+	else if (local_flock)
+		seq_printf(m, ",local_lock=flock");
+	else
+		seq_printf(m, ",local_lock=posix");
+}
+
+/*
+ * Describe the mount options on this VFS mountpoint
+ */
+int nfs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct nfs_server *nfss = NFS_SB(root->d_sb);
+
+	nfs_show_mount_options(m, nfss, 0);
+
+	rcu_read_lock();
+	seq_printf(m, ",addr=%s",
+			rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
+							RPC_DISPLAY_ADDR));
+	rcu_read_unlock();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_options);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+#ifdef CONFIG_NFS_V4_1
+static void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+	if (nfs4_has_session(server->nfs_client))
+		seq_printf(m, ",sessions");
+}
+#else
+static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+	seq_printf(m, ",pnfs=");
+	if (server->pnfs_curr_ld)
+		seq_printf(m, "%s", server->pnfs_curr_ld->name);
+	else
+		seq_printf(m, "not configured");
+}
+
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+	if (nfss->nfs_client && nfss->nfs_client->cl_implid) {
+		struct nfs41_impl_id *impl_id = nfss->nfs_client->cl_implid;
+		seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s',"
+			   "date='%llu,%u'",
+			   impl_id->name, impl_id->domain,
+			   impl_id->date.seconds, impl_id->date.nseconds);
+	}
+}
+#else
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+}
+#endif
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+}
+#endif
+
+int nfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+	char *page = (char *) __get_free_page(GFP_KERNEL);
+	char *devname, *dummy;
+	int err = 0;
+	if (!page)
+		return -ENOMEM;
+	devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0);
+	if (IS_ERR(devname))
+		err = PTR_ERR(devname);
+	else
+		seq_escape(m, devname, " \t\n\\");
+	free_page((unsigned long)page);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs_show_devname);
+
+int nfs_show_path(struct seq_file *m, struct dentry *dentry)
+{
+	seq_puts(m, "/");
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_path);
+
+/*
+ * Present statistical information for this VFS mountpoint
+ */
+int nfs_show_stats(struct seq_file *m, struct dentry *root)
+{
+	int i, cpu;
+	struct nfs_server *nfss = NFS_SB(root->d_sb);
+	struct rpc_auth *auth = nfss->client->cl_auth;
+	struct nfs_iostats totals = { };
+
+	seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
+
+	/*
+	 * Display all mount option settings
+	 */
+	seq_printf(m, "\n\topts:\t");
+	seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw");
+	seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
+	seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : "");
+	seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
+	nfs_show_mount_options(m, nfss, 1);
+
+	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
+
+	show_implementation_id(m, nfss);
+
+	seq_printf(m, "\n\tcaps:\t");
+	seq_printf(m, "caps=0x%x", nfss->caps);
+	seq_printf(m, ",wtmult=%u", nfss->wtmult);
+	seq_printf(m, ",dtsize=%u", nfss->dtsize);
+	seq_printf(m, ",bsize=%u", nfss->bsize);
+	seq_printf(m, ",namlen=%u", nfss->namelen);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+	if (nfss->nfs_client->rpc_ops->version == 4) {
+		seq_printf(m, "\n\tnfsv4:\t");
+		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+		seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
+		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+		show_sessions(m, nfss);
+		show_pnfs(m, nfss);
+	}
+#endif
+
+	/*
+	 * Display security flavor in effect for this mount
+	 */
+	seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor);
+	if (auth->au_flavor)
+		seq_printf(m, ",pseudoflavor=%u", auth->au_flavor);
+
+	/*
+	 * Display superblock I/O counters
+	 */
+	for_each_possible_cpu(cpu) {
+		struct nfs_iostats *stats;
+
+		preempt_disable();
+		stats = per_cpu_ptr(nfss->io_stats, cpu);
+
+		for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+			totals.events[i] += stats->events[i];
+		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+			totals.bytes[i] += stats->bytes[i];
+#ifdef CONFIG_NFS_FSCACHE
+		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+			totals.fscache[i] += stats->fscache[i];
+#endif
+
+		preempt_enable();
+	}
+
+	seq_printf(m, "\n\tevents:\t");
+	for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+		seq_printf(m, "%lu ", totals.events[i]);
+	seq_printf(m, "\n\tbytes:\t");
+	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+		seq_printf(m, "%Lu ", totals.bytes[i]);
+#ifdef CONFIG_NFS_FSCACHE
+	if (nfss->options & NFS_OPTION_FSCACHE) {
+		seq_printf(m, "\n\tfsc:\t");
+		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+			seq_printf(m, "%Lu ", totals.bytes[i]);
+	}
+#endif
+	seq_printf(m, "\n");
+
+	rpc_print_iostats(m, nfss->client);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_stats);
+
+/*
+ * Begin unmount by attempting to remove all automounted mountpoints we added
+ * in response to xdev traversals and referrals
+ */
+void nfs_umount_begin(struct super_block *sb)
+{
+	struct nfs_server *server;
+	struct rpc_clnt *rpc;
+
+	server = NFS_SB(sb);
+	/* -EIO all pending I/O */
+	rpc = server->client_acl;
+	if (!IS_ERR(rpc))
+		rpc_killall_tasks(rpc);
+	rpc = server->client;
+	if (!IS_ERR(rpc))
+		rpc_killall_tasks(rpc);
+}
+EXPORT_SYMBOL_GPL(nfs_umount_begin);
+
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
+{
+	struct nfs_parsed_mount_data *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data) {
+		data->acregmin		= NFS_DEF_ACREGMIN;
+		data->acregmax		= NFS_DEF_ACREGMAX;
+		data->acdirmin		= NFS_DEF_ACDIRMIN;
+		data->acdirmax		= NFS_DEF_ACDIRMAX;
+		data->mount_server.port	= NFS_UNSPEC_PORT;
+		data->nfs_server.port	= NFS_UNSPEC_PORT;
+		data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+		data->selected_flavor	= RPC_AUTH_MAXFLAVOR;
+		data->minorversion	= 0;
+		data->need_mount	= true;
+		data->net		= current->nsproxy->net_ns;
+		security_init_mnt_opts(&data->lsm_opts);
+	}
+	return data;
+}
+
+static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data)
+{
+	if (data) {
+		kfree(data->client_address);
+		kfree(data->mount_server.hostname);
+		kfree(data->nfs_server.export_path);
+		kfree(data->nfs_server.hostname);
+		kfree(data->fscache_uniq);
+		security_free_mnt_opts(&data->lsm_opts);
+		kfree(data);
+	}
+}
+
+/*
+ * Sanity-check a server address provided by the mount command.
+ *
+ * Address family must be initialized, and address must not be
+ * the ANY address for that family.
+ */
+static int nfs_verify_server_address(struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sa = (struct sockaddr_in *)addr;
+		return sa->sin_addr.s_addr != htonl(INADDR_ANY);
+	}
+	case AF_INET6: {
+		struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr;
+		return !ipv6_addr_any(sa);
+	}
+	}
+
+	dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
+	return 0;
+}
+
+/*
+ * Select between a default port value and a user-specified port value.
+ * If a zero value is set, then autobind will be used.
+ */
+static void nfs_set_port(struct sockaddr *sap, int *port,
+				 const unsigned short default_port)
+{
+	if (*port == NFS_UNSPEC_PORT)
+		*port = default_port;
+
+	rpc_set_port(sap, *port);
+}
+
+/*
+ * Sanity check the NFS transport protocol.
+ *
+ */
+static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+	switch (mnt->nfs_server.protocol) {
+	case XPRT_TRANSPORT_UDP:
+	case XPRT_TRANSPORT_TCP:
+	case XPRT_TRANSPORT_RDMA:
+		break;
+	default:
+		mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+	}
+}
+
+/*
+ * For text based NFSv2/v3 mounts, the mount protocol transport default
+ * settings should depend upon the specified NFS transport.
+ */
+static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+	nfs_validate_transport_protocol(mnt);
+
+	if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP ||
+	    mnt->mount_server.protocol == XPRT_TRANSPORT_TCP)
+			return;
+	switch (mnt->nfs_server.protocol) {
+	case XPRT_TRANSPORT_UDP:
+		mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
+		break;
+	case XPRT_TRANSPORT_TCP:
+	case XPRT_TRANSPORT_RDMA:
+		mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
+	}
+}
+
+/*
+ * Add 'flavor' to 'auth_info' if not already present.
+ * Returns true if 'flavor' ends up in the list, false otherwise
+ */
+static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
+			      rpc_authflavor_t flavor)
+{
+	unsigned int i;
+	unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
+
+	/* make sure this flavor isn't already in the list */
+	for (i = 0; i < auth_info->flavor_len; i++) {
+		if (flavor == auth_info->flavors[i])
+			return true;
+	}
+
+	if (auth_info->flavor_len + 1 >= max_flavor_len) {
+		dfprintk(MOUNT, "NFS: too many sec= flavors\n");
+		return false;
+	}
+
+	auth_info->flavors[auth_info->flavor_len++] = flavor;
+	return true;
+}
+
+/*
+ * Return true if 'match' is in auth_info or auth_info is empty.
+ * Return false otherwise.
+ */
+bool nfs_auth_info_match(const struct nfs_auth_info *auth_info,
+			 rpc_authflavor_t match)
+{
+	int i;
+
+	if (!auth_info->flavor_len)
+		return true;
+
+	for (i = 0; i < auth_info->flavor_len; i++) {
+		if (auth_info->flavors[i] == match)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(nfs_auth_info_match);
+
+/*
+ * Parse the value of the 'sec=' option.
+ */
+static int nfs_parse_security_flavors(char *value,
+				      struct nfs_parsed_mount_data *mnt)
+{
+	substring_t args[MAX_OPT_ARGS];
+	rpc_authflavor_t pseudoflavor;
+	char *p;
+
+	dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
+
+	while ((p = strsep(&value, ":")) != NULL) {
+		switch (match_token(p, nfs_secflavor_tokens, args)) {
+		case Opt_sec_none:
+			pseudoflavor = RPC_AUTH_NULL;
+			break;
+		case Opt_sec_sys:
+			pseudoflavor = RPC_AUTH_UNIX;
+			break;
+		case Opt_sec_krb5:
+			pseudoflavor = RPC_AUTH_GSS_KRB5;
+			break;
+		case Opt_sec_krb5i:
+			pseudoflavor = RPC_AUTH_GSS_KRB5I;
+			break;
+		case Opt_sec_krb5p:
+			pseudoflavor = RPC_AUTH_GSS_KRB5P;
+			break;
+		case Opt_sec_lkey:
+			pseudoflavor = RPC_AUTH_GSS_LKEY;
+			break;
+		case Opt_sec_lkeyi:
+			pseudoflavor = RPC_AUTH_GSS_LKEYI;
+			break;
+		case Opt_sec_lkeyp:
+			pseudoflavor = RPC_AUTH_GSS_LKEYP;
+			break;
+		case Opt_sec_spkm:
+			pseudoflavor = RPC_AUTH_GSS_SPKM;
+			break;
+		case Opt_sec_spkmi:
+			pseudoflavor = RPC_AUTH_GSS_SPKMI;
+			break;
+		case Opt_sec_spkmp:
+			pseudoflavor = RPC_AUTH_GSS_SPKMP;
+			break;
+		default:
+			dfprintk(MOUNT,
+				 "NFS: sec= option '%s' not recognized\n", p);
+			return 0;
+		}
+
+		if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int nfs_parse_version_string(char *string,
+		struct nfs_parsed_mount_data *mnt,
+		substring_t *args)
+{
+	mnt->flags &= ~NFS_MOUNT_VER3;
+	switch (match_token(string, nfs_vers_tokens, args)) {
+	case Opt_vers_2:
+		mnt->version = 2;
+		break;
+	case Opt_vers_3:
+		mnt->flags |= NFS_MOUNT_VER3;
+		mnt->version = 3;
+		break;
+	case Opt_vers_4:
+		/* Backward compatibility option. In future,
+		 * the mount program should always supply
+		 * a NFSv4 minor version number.
+		 */
+		mnt->version = 4;
+		break;
+	case Opt_vers_4_0:
+		mnt->version = 4;
+		mnt->minorversion = 0;
+		break;
+	case Opt_vers_4_1:
+		mnt->version = 4;
+		mnt->minorversion = 1;
+		break;
+	case Opt_vers_4_2:
+		mnt->version = 4;
+		mnt->minorversion = 2;
+		break;
+	default:
+		return 0;
+	}
+	return 1;
+}
+
+static int nfs_get_option_str(substring_t args[], char **option)
+{
+	kfree(*option);
+	*option = match_strdup(args);
+	return !*option;
+}
+
+static int nfs_get_option_ul(substring_t args[], unsigned long *option)
+{
+	int rc;
+	char *string;
+
+	string = match_strdup(args);
+	if (string == NULL)
+		return -ENOMEM;
+	rc = kstrtoul(string, 10, option);
+	kfree(string);
+
+	return rc;
+}
+
+/*
+ * Error-check and convert a string of mount options from user space into
+ * a data structure.  The whole mount string is processed; bad options are
+ * skipped as they are encountered.  If there were no errors, return 1;
+ * otherwise return 0 (zero).
+ */
+static int nfs_parse_mount_options(char *raw,
+				   struct nfs_parsed_mount_data *mnt)
+{
+	char *p, *string, *secdata;
+	int rc, sloppy = 0, invalid_option = 0;
+	unsigned short protofamily = AF_UNSPEC;
+	unsigned short mountfamily = AF_UNSPEC;
+
+	if (!raw) {
+		dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
+		return 1;
+	}
+	dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw);
+
+	secdata = alloc_secdata();
+	if (!secdata)
+		goto out_nomem;
+
+	rc = security_sb_copy_data(raw, secdata);
+	if (rc)
+		goto out_security_failure;
+
+	rc = security_sb_parse_opts_str(secdata, &mnt->lsm_opts);
+	if (rc)
+		goto out_security_failure;
+
+	free_secdata(secdata);
+
+	while ((p = strsep(&raw, ",")) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		unsigned long option;
+		int token;
+
+		if (!*p)
+			continue;
+
+		dfprintk(MOUNT, "NFS:   parsing nfs mount option '%s'\n", p);
+
+		token = match_token(p, nfs_mount_option_tokens, args);
+		switch (token) {
+
+		/*
+		 * boolean options:  foo/nofoo
+		 */
+		case Opt_soft:
+			mnt->flags |= NFS_MOUNT_SOFT;
+			break;
+		case Opt_hard:
+			mnt->flags &= ~NFS_MOUNT_SOFT;
+			break;
+		case Opt_posix:
+			mnt->flags |= NFS_MOUNT_POSIX;
+			break;
+		case Opt_noposix:
+			mnt->flags &= ~NFS_MOUNT_POSIX;
+			break;
+		case Opt_cto:
+			mnt->flags &= ~NFS_MOUNT_NOCTO;
+			break;
+		case Opt_nocto:
+			mnt->flags |= NFS_MOUNT_NOCTO;
+			break;
+		case Opt_ac:
+			mnt->flags &= ~NFS_MOUNT_NOAC;
+			break;
+		case Opt_noac:
+			mnt->flags |= NFS_MOUNT_NOAC;
+			break;
+		case Opt_lock:
+			mnt->flags &= ~NFS_MOUNT_NONLM;
+			mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+					NFS_MOUNT_LOCAL_FCNTL);
+			break;
+		case Opt_nolock:
+			mnt->flags |= NFS_MOUNT_NONLM;
+			mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+				       NFS_MOUNT_LOCAL_FCNTL);
+			break;
+		case Opt_udp:
+			mnt->flags &= ~NFS_MOUNT_TCP;
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+			break;
+		case Opt_tcp:
+			mnt->flags |= NFS_MOUNT_TCP;
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+			break;
+		case Opt_rdma:
+			mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+			xprt_load_transport(p);
+			break;
+		case Opt_acl:
+			mnt->flags &= ~NFS_MOUNT_NOACL;
+			break;
+		case Opt_noacl:
+			mnt->flags |= NFS_MOUNT_NOACL;
+			break;
+		case Opt_rdirplus:
+			mnt->flags &= ~NFS_MOUNT_NORDIRPLUS;
+			break;
+		case Opt_nordirplus:
+			mnt->flags |= NFS_MOUNT_NORDIRPLUS;
+			break;
+		case Opt_sharecache:
+			mnt->flags &= ~NFS_MOUNT_UNSHARED;
+			break;
+		case Opt_nosharecache:
+			mnt->flags |= NFS_MOUNT_UNSHARED;
+			break;
+		case Opt_resvport:
+			mnt->flags &= ~NFS_MOUNT_NORESVPORT;
+			break;
+		case Opt_noresvport:
+			mnt->flags |= NFS_MOUNT_NORESVPORT;
+			break;
+		case Opt_fscache:
+			mnt->options |= NFS_OPTION_FSCACHE;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = NULL;
+			break;
+		case Opt_nofscache:
+			mnt->options &= ~NFS_OPTION_FSCACHE;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = NULL;
+			break;
+		case Opt_migration:
+			mnt->options |= NFS_OPTION_MIGRATION;
+			break;
+		case Opt_nomigration:
+			mnt->options &= NFS_OPTION_MIGRATION;
+			break;
+
+		/*
+		 * options that take numeric values
+		 */
+		case Opt_port:
+			if (nfs_get_option_ul(args, &option) ||
+			    option > USHRT_MAX)
+				goto out_invalid_value;
+			mnt->nfs_server.port = option;
+			break;
+		case Opt_rsize:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->rsize = option;
+			break;
+		case Opt_wsize:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->wsize = option;
+			break;
+		case Opt_bsize:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->bsize = option;
+			break;
+		case Opt_timeo:
+			if (nfs_get_option_ul(args, &option) || option == 0)
+				goto out_invalid_value;
+			mnt->timeo = option;
+			break;
+		case Opt_retrans:
+			if (nfs_get_option_ul(args, &option) || option == 0)
+				goto out_invalid_value;
+			mnt->retrans = option;
+			break;
+		case Opt_acregmin:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->acregmin = option;
+			break;
+		case Opt_acregmax:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->acregmax = option;
+			break;
+		case Opt_acdirmin:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->acdirmin = option;
+			break;
+		case Opt_acdirmax:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->acdirmax = option;
+			break;
+		case Opt_actimeo:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->acregmin = mnt->acregmax =
+			mnt->acdirmin = mnt->acdirmax = option;
+			break;
+		case Opt_namelen:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			mnt->namlen = option;
+			break;
+		case Opt_mountport:
+			if (nfs_get_option_ul(args, &option) ||
+			    option > USHRT_MAX)
+				goto out_invalid_value;
+			mnt->mount_server.port = option;
+			break;
+		case Opt_mountvers:
+			if (nfs_get_option_ul(args, &option) ||
+			    option < NFS_MNT_VERSION ||
+			    option > NFS_MNT3_VERSION)
+				goto out_invalid_value;
+			mnt->mount_server.version = option;
+			break;
+		case Opt_minorversion:
+			if (nfs_get_option_ul(args, &option))
+				goto out_invalid_value;
+			if (option > NFS4_MAX_MINOR_VERSION)
+				goto out_invalid_value;
+			mnt->minorversion = option;
+			break;
+
+		/*
+		 * options that take text values
+		 */
+		case Opt_nfsvers:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = nfs_parse_version_string(string, mnt, args);
+			kfree(string);
+			if (!rc)
+				goto out_invalid_value;
+			break;
+		case Opt_sec:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = nfs_parse_security_flavors(string, mnt);
+			kfree(string);
+			if (!rc) {
+				dfprintk(MOUNT, "NFS:   unrecognized "
+						"security flavor\n");
+				return 0;
+			}
+			break;
+		case Opt_proto:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string,
+					    nfs_xprt_protocol_tokens, args);
+
+			protofamily = AF_INET;
+			switch (token) {
+			case Opt_xprt_udp6:
+				protofamily = AF_INET6;
+			case Opt_xprt_udp:
+				mnt->flags &= ~NFS_MOUNT_TCP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+				break;
+			case Opt_xprt_tcp6:
+				protofamily = AF_INET6;
+			case Opt_xprt_tcp:
+				mnt->flags |= NFS_MOUNT_TCP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+				break;
+			case Opt_xprt_rdma:
+				/* vector side protocols to TCP */
+				mnt->flags |= NFS_MOUNT_TCP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+				xprt_load_transport(string);
+				break;
+			default:
+				dfprintk(MOUNT, "NFS:   unrecognized "
+						"transport protocol\n");
+				kfree(string);
+				return 0;
+			}
+			kfree(string);
+			break;
+		case Opt_mountproto:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string,
+					    nfs_xprt_protocol_tokens, args);
+			kfree(string);
+
+			mountfamily = AF_INET;
+			switch (token) {
+			case Opt_xprt_udp6:
+				mountfamily = AF_INET6;
+			case Opt_xprt_udp:
+				mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
+				break;
+			case Opt_xprt_tcp6:
+				mountfamily = AF_INET6;
+			case Opt_xprt_tcp:
+				mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
+				break;
+			case Opt_xprt_rdma: /* not used for side protocols */
+			default:
+				dfprintk(MOUNT, "NFS:   unrecognized "
+						"transport protocol\n");
+				return 0;
+			}
+			break;
+		case Opt_addr:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			mnt->nfs_server.addrlen =
+				rpc_pton(mnt->net, string, strlen(string),
+					(struct sockaddr *)
+					&mnt->nfs_server.address,
+					sizeof(mnt->nfs_server.address));
+			kfree(string);
+			if (mnt->nfs_server.addrlen == 0)
+				goto out_invalid_address;
+			break;
+		case Opt_clientaddr:
+			if (nfs_get_option_str(args, &mnt->client_address))
+				goto out_nomem;
+			break;
+		case Opt_mounthost:
+			if (nfs_get_option_str(args,
+					       &mnt->mount_server.hostname))
+				goto out_nomem;
+			break;
+		case Opt_mountaddr:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			mnt->mount_server.addrlen =
+				rpc_pton(mnt->net, string, strlen(string),
+					(struct sockaddr *)
+					&mnt->mount_server.address,
+					sizeof(mnt->mount_server.address));
+			kfree(string);
+			if (mnt->mount_server.addrlen == 0)
+				goto out_invalid_address;
+			break;
+		case Opt_lookupcache:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string,
+					nfs_lookupcache_tokens, args);
+			kfree(string);
+			switch (token) {
+				case Opt_lookupcache_all:
+					mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
+					break;
+				case Opt_lookupcache_positive:
+					mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
+					mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
+					break;
+				case Opt_lookupcache_none:
+					mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
+					break;
+				default:
+					dfprintk(MOUNT, "NFS:   invalid "
+							"lookupcache argument\n");
+					return 0;
+			};
+			break;
+		case Opt_fscache_uniq:
+			if (nfs_get_option_str(args, &mnt->fscache_uniq))
+				goto out_nomem;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
+		case Opt_local_lock:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string, nfs_local_lock_tokens,
+					args);
+			kfree(string);
+			switch (token) {
+			case Opt_local_lock_all:
+				mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+					       NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			case Opt_local_lock_flock:
+				mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
+				break;
+			case Opt_local_lock_posix:
+				mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
+				break;
+			case Opt_local_lock_none:
+				mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+						NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			default:
+				dfprintk(MOUNT, "NFS:	invalid	"
+						"local_lock argument\n");
+				return 0;
+			};
+			break;
+
+		/*
+		 * Special options
+		 */
+		case Opt_sloppy:
+			sloppy = 1;
+			dfprintk(MOUNT, "NFS:   relaxing parsing rules\n");
+			break;
+		case Opt_userspace:
+		case Opt_deprecated:
+			dfprintk(MOUNT, "NFS:   ignoring mount option "
+					"'%s'\n", p);
+			break;
+
+		default:
+			invalid_option = 1;
+			dfprintk(MOUNT, "NFS:   unrecognized mount option "
+					"'%s'\n", p);
+		}
+	}
+
+	if (!sloppy && invalid_option)
+		return 0;
+
+	if (mnt->minorversion && mnt->version != 4)
+		goto out_minorversion_mismatch;
+
+	if (mnt->options & NFS_OPTION_MIGRATION &&
+	    (mnt->version != 4 || mnt->minorversion != 0))
+		goto out_migration_misuse;
+
+	/*
+	 * verify that any proto=/mountproto= options match the address
+	 * families in the addr=/mountaddr= options.
+	 */
+	if (protofamily != AF_UNSPEC &&
+	    protofamily != mnt->nfs_server.address.ss_family)
+		goto out_proto_mismatch;
+
+	if (mountfamily != AF_UNSPEC) {
+		if (mnt->mount_server.addrlen) {
+			if (mountfamily != mnt->mount_server.address.ss_family)
+				goto out_mountproto_mismatch;
+		} else {
+			if (mountfamily != mnt->nfs_server.address.ss_family)
+				goto out_mountproto_mismatch;
+		}
+	}
+
+	return 1;
+
+out_mountproto_mismatch:
+	printk(KERN_INFO "NFS: mount server address does not match mountproto= "
+			 "option\n");
+	return 0;
+out_proto_mismatch:
+	printk(KERN_INFO "NFS: server address does not match proto= option\n");
+	return 0;
+out_invalid_address:
+	printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
+	return 0;
+out_invalid_value:
+	printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
+	return 0;
+out_minorversion_mismatch:
+	printk(KERN_INFO "NFS: mount option vers=%u does not support "
+			 "minorversion=%u\n", mnt->version, mnt->minorversion);
+	return 0;
+out_migration_misuse:
+	printk(KERN_INFO
+		"NFS: 'migration' not supported for this NFS version\n");
+	return 0;
+out_nomem:
+	printk(KERN_INFO "NFS: not enough memory to parse option\n");
+	return 0;
+out_security_failure:
+	free_secdata(secdata);
+	printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
+	return 0;
+}
+
+/*
+ * Ensure that a specified authtype in args->auth_info is supported by
+ * the server. Returns 0 and sets args->selected_flavor if it's ok, and
+ * -EACCES if not.
+ */
+static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
+			rpc_authflavor_t *server_authlist, unsigned int count)
+{
+	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
+	unsigned int i;
+
+	/*
+	 * If the sec= mount option is used, the specified flavor or AUTH_NULL
+	 * must be in the list returned by the server.
+	 *
+	 * AUTH_NULL has a special meaning when it's in the server list - it
+	 * means that the server will ignore the rpc creds, so any flavor
+	 * can be used.
+	 */
+	for (i = 0; i < count; i++) {
+		flavor = server_authlist[i];
+
+		if (nfs_auth_info_match(&args->auth_info, flavor) ||
+		    flavor == RPC_AUTH_NULL)
+			goto out;
+	}
+
+	dfprintk(MOUNT,
+		 "NFS: specified auth flavors not supported by server\n");
+	return -EACCES;
+
+out:
+	args->selected_flavor = flavor;
+	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor);
+	return 0;
+}
+
+/*
+ * Use the remote server's MOUNT service to request the NFS file handle
+ * corresponding to the provided path.
+ */
+static int nfs_request_mount(struct nfs_parsed_mount_data *args,
+			     struct nfs_fh *root_fh,
+			     rpc_authflavor_t *server_authlist,
+			     unsigned int *server_authlist_len)
+{
+	struct nfs_mount_request request = {
+		.sap		= (struct sockaddr *)
+						&args->mount_server.address,
+		.dirpath	= args->nfs_server.export_path,
+		.protocol	= args->mount_server.protocol,
+		.fh		= root_fh,
+		.noresvport	= args->flags & NFS_MOUNT_NORESVPORT,
+		.auth_flav_len	= server_authlist_len,
+		.auth_flavs	= server_authlist,
+		.net		= args->net,
+	};
+	int status;
+
+	if (args->mount_server.version == 0) {
+		switch (args->version) {
+			default:
+				args->mount_server.version = NFS_MNT3_VERSION;
+				break;
+			case 2:
+				args->mount_server.version = NFS_MNT_VERSION;
+		}
+	}
+	request.version = args->mount_server.version;
+
+	if (args->mount_server.hostname)
+		request.hostname = args->mount_server.hostname;
+	else
+		request.hostname = args->nfs_server.hostname;
+
+	/*
+	 * Construct the mount server's address.
+	 */
+	if (args->mount_server.address.ss_family == AF_UNSPEC) {
+		memcpy(request.sap, &args->nfs_server.address,
+		       args->nfs_server.addrlen);
+		args->mount_server.addrlen = args->nfs_server.addrlen;
+	}
+	request.salen = args->mount_server.addrlen;
+	nfs_set_port(request.sap, &args->mount_server.port, 0);
+
+	/*
+	 * Now ask the mount server to map our export path
+	 * to a file handle.
+	 */
+	status = nfs_mount(&request);
+	if (status != 0) {
+		dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
+				request.hostname, status);
+		return status;
+	}
+
+	return 0;
+}
+
+static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
+					struct nfs_subversion *nfs_mod)
+{
+	int status;
+	unsigned int i;
+	bool tried_auth_unix = false;
+	bool auth_null_in_list = false;
+	struct nfs_server *server = ERR_PTR(-EACCES);
+	struct nfs_parsed_mount_data *args = mount_info->parsed;
+	rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
+	unsigned int authlist_len = ARRAY_SIZE(authlist);
+
+	status = nfs_request_mount(args, mount_info->mntfh, authlist,
+					&authlist_len);
+	if (status)
+		return ERR_PTR(status);
+
+	/*
+	 * Was a sec= authflavor specified in the options? First, verify
+	 * whether the server supports it, and then just try to use it if so.
+	 */
+	if (args->auth_info.flavor_len > 0) {
+		status = nfs_verify_authflavors(args, authlist, authlist_len);
+		dfprintk(MOUNT, "NFS: using auth flavor %u\n",
+			 args->selected_flavor);
+		if (status)
+			return ERR_PTR(status);
+		return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+	}
+
+	/*
+	 * No sec= option was provided. RFC 2623, section 2.7 suggests we
+	 * SHOULD prefer the flavor listed first. However, some servers list
+	 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
+	 */
+	for (i = 0; i < authlist_len; ++i) {
+		rpc_authflavor_t flavor;
+		struct rpcsec_gss_info info;
+
+		flavor = authlist[i];
+		switch (flavor) {
+		case RPC_AUTH_UNIX:
+			tried_auth_unix = true;
+			break;
+		case RPC_AUTH_NULL:
+			auth_null_in_list = true;
+			continue;
+		default:
+			if (rpcauth_get_gssinfo(flavor, &info) != 0)
+				continue;
+			/* Fallthrough */
+		}
+		dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
+		args->selected_flavor = flavor;
+		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+		if (!IS_ERR(server))
+			return server;
+	}
+
+	/*
+	 * Nothing we tried so far worked. At this point, give up if we've
+	 * already tried AUTH_UNIX or if the server's list doesn't contain
+	 * AUTH_NULL
+	 */
+	if (tried_auth_unix || !auth_null_in_list)
+		return server;
+
+	/* Last chance! Try AUTH_UNIX */
+	dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
+	args->selected_flavor = RPC_AUTH_UNIX;
+	return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+}
+
+struct dentry *nfs_try_mount(int flags, const char *dev_name,
+			     struct nfs_mount_info *mount_info,
+			     struct nfs_subversion *nfs_mod)
+{
+	struct nfs_server *server;
+
+	if (mount_info->parsed->need_mount)
+		server = nfs_try_mount_request(mount_info, nfs_mod);
+	else
+		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+
+	if (IS_ERR(server))
+		return ERR_CAST(server);
+
+	return nfs_fs_mount_common(server, flags, dev_name, mount_info, nfs_mod);
+}
+EXPORT_SYMBOL_GPL(nfs_try_mount);
+
+/*
+ * Split "dev_name" into "hostname:export_path".
+ *
+ * The leftmost colon demarks the split between the server's hostname
+ * and the export path.  If the hostname starts with a left square
+ * bracket, then it may contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_devname(const char *dev_name,
+			     char **hostname, size_t maxnamlen,
+			     char **export_path, size_t maxpathlen)
+{
+	size_t len;
+	char *end;
+
+	/* Is the host name protected with square brakcets? */
+	if (*dev_name == '[') {
+		end = strchr(++dev_name, ']');
+		if (end == NULL || end[1] != ':')
+			goto out_bad_devname;
+
+		len = end - dev_name;
+		end++;
+	} else {
+		char *comma;
+
+		end = strchr(dev_name, ':');
+		if (end == NULL)
+			goto out_bad_devname;
+		len = end - dev_name;
+
+		/* kill possible hostname list: not supported */
+		comma = strchr(dev_name, ',');
+		if (comma != NULL && comma < end)
+			*comma = 0;
+	}
+
+	if (len > maxnamlen)
+		goto out_hostname;
+
+	/* N.B. caller will free nfs_server.hostname in all cases */
+	*hostname = kstrndup(dev_name, len, GFP_KERNEL);
+	if (*hostname == NULL)
+		goto out_nomem;
+	len = strlen(++end);
+	if (len > maxpathlen)
+		goto out_path;
+	*export_path = kstrndup(end, len, GFP_KERNEL);
+	if (!*export_path)
+		goto out_nomem;
+
+	dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
+	return 0;
+
+out_bad_devname:
+	dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+	return -EINVAL;
+
+out_nomem:
+	dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+	return -ENOMEM;
+
+out_hostname:
+	dfprintk(MOUNT, "NFS: server hostname too long\n");
+	return -ENAMETOOLONG;
+
+out_path:
+	dfprintk(MOUNT, "NFS: export pathname too long\n");
+	return -ENAMETOOLONG;
+}
+
+/*
+ * Validate the NFS2/NFS3 mount data
+ * - fills in the mount root filehandle
+ *
+ * For option strings, user space handles the following behaviors:
+ *
+ * + DNS: mapping server host name to IP address ("addr=" option)
+ *
+ * + failure mode: how to behave if a mount request can't be handled
+ *   immediately ("fg/bg" option)
+ *
+ * + retry: how often to retry a mount request ("retry=" option)
+ *
+ * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
+ *   mountproto=tcp after mountproto=udp, and so on
+ */
+static int nfs23_validate_mount_data(void *options,
+				     struct nfs_parsed_mount_data *args,
+				     struct nfs_fh *mntfh,
+				     const char *dev_name)
+{
+	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+	int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
+
+	if (data == NULL)
+		goto out_no_data;
+
+	args->version = NFS_DEFAULT_VERSION;
+	switch (data->version) {
+	case 1:
+		data->namlen = 0;
+	case 2:
+		data->bsize = 0;
+	case 3:
+		if (data->flags & NFS_MOUNT_VER3)
+			goto out_no_v3;
+		data->root.size = NFS2_FHSIZE;
+		memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
+		/* Turn off security negotiation */
+		extra_flags |= NFS_MOUNT_SECFLAVOUR;
+	case 4:
+		if (data->flags & NFS_MOUNT_SECFLAVOUR)
+			goto out_no_sec;
+	case 5:
+		memset(data->context, 0, sizeof(data->context));
+	case 6:
+		if (data->flags & NFS_MOUNT_VER3) {
+			if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
+				goto out_invalid_fh;
+			mntfh->size = data->root.size;
+			args->version = 3;
+		} else {
+			mntfh->size = NFS2_FHSIZE;
+			args->version = 2;
+		}
+
+
+		memcpy(mntfh->data, data->root.data, mntfh->size);
+		if (mntfh->size < sizeof(mntfh->data))
+			memset(mntfh->data + mntfh->size, 0,
+			       sizeof(mntfh->data) - mntfh->size);
+
+		/*
+		 * Translate to nfs_parsed_mount_data, which nfs_fill_super
+		 * can deal with.
+		 */
+		args->flags		= data->flags & NFS_MOUNT_FLAGMASK;
+		args->flags		|= extra_flags;
+		args->rsize		= data->rsize;
+		args->wsize		= data->wsize;
+		args->timeo		= data->timeo;
+		args->retrans		= data->retrans;
+		args->acregmin		= data->acregmin;
+		args->acregmax		= data->acregmax;
+		args->acdirmin		= data->acdirmin;
+		args->acdirmax		= data->acdirmax;
+		args->need_mount	= false;
+
+		memcpy(sap, &data->addr, sizeof(data->addr));
+		args->nfs_server.addrlen = sizeof(data->addr);
+		args->nfs_server.port = ntohs(data->addr.sin_port);
+		if (!nfs_verify_server_address(sap))
+			goto out_no_address;
+
+		if (!(data->flags & NFS_MOUNT_TCP))
+			args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+		/* N.B. caller will free nfs_server.hostname in all cases */
+		args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
+		args->namlen		= data->namlen;
+		args->bsize		= data->bsize;
+
+		if (data->flags & NFS_MOUNT_SECFLAVOUR)
+			args->selected_flavor = data->pseudoflavor;
+		else
+			args->selected_flavor = RPC_AUTH_UNIX;
+		if (!args->nfs_server.hostname)
+			goto out_nomem;
+
+		if (!(data->flags & NFS_MOUNT_NONLM))
+			args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+					 NFS_MOUNT_LOCAL_FCNTL);
+		else
+			args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+					NFS_MOUNT_LOCAL_FCNTL);
+		/*
+		 * The legacy version 6 binary mount data from userspace has a
+		 * field used only to transport selinux information into the
+		 * the kernel.  To continue to support that functionality we
+		 * have a touch of selinux knowledge here in the NFS code. The
+		 * userspace code converted context=blah to just blah so we are
+		 * converting back to the full string selinux understands.
+		 */
+		if (data->context[0]){
+#ifdef CONFIG_SECURITY_SELINUX
+			int rc;
+			char *opts_str = kmalloc(sizeof(data->context) + 8, GFP_KERNEL);
+			if (!opts_str)
+				return -ENOMEM;
+			strcpy(opts_str, "context=");
+			data->context[NFS_MAX_CONTEXT_LEN] = '\0';
+			strcat(opts_str, &data->context[0]);
+			rc = security_sb_parse_opts_str(opts_str, &args->lsm_opts);
+			kfree(opts_str);
+			if (rc)
+				return rc;
+#else
+			return -EINVAL;
+#endif
+		}
+
+		break;
+	default:
+		return NFS_TEXT_DATA;
+	}
+
+	return 0;
+
+out_no_data:
+	dfprintk(MOUNT, "NFS: mount program didn't pass any mount data\n");
+	return -EINVAL;
+
+out_no_v3:
+	dfprintk(MOUNT, "NFS: nfs_mount_data version %d does not support v3\n",
+		 data->version);
+	return -EINVAL;
+
+out_no_sec:
+	dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
+	return -EINVAL;
+
+out_nomem:
+	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
+	return -ENOMEM;
+
+out_no_address:
+	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
+	return -EINVAL;
+
+out_invalid_fh:
+	dfprintk(MOUNT, "NFS: invalid root filehandle\n");
+	return -EINVAL;
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static int nfs_validate_mount_data(struct file_system_type *fs_type,
+				   void *options,
+				   struct nfs_parsed_mount_data *args,
+				   struct nfs_fh *mntfh,
+				   const char *dev_name)
+{
+	if (fs_type == &nfs_fs_type)
+		return nfs23_validate_mount_data(options, args, mntfh, dev_name);
+	return nfs4_validate_mount_data(options, args, dev_name);
+}
+#else
+static int nfs_validate_mount_data(struct file_system_type *fs_type,
+				   void *options,
+				   struct nfs_parsed_mount_data *args,
+				   struct nfs_fh *mntfh,
+				   const char *dev_name)
+{
+	return nfs23_validate_mount_data(options, args, mntfh, dev_name);
+}
+#endif
+
+static int nfs_validate_text_mount_data(void *options,
+					struct nfs_parsed_mount_data *args,
+					const char *dev_name)
+{
+	int port = 0;
+	int max_namelen = PAGE_SIZE;
+	int max_pathlen = NFS_MAXPATHLEN;
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+
+	if (nfs_parse_mount_options((char *)options, args) == 0)
+		return -EINVAL;
+
+	if (!nfs_verify_server_address(sap))
+		goto out_no_address;
+
+	if (args->version == 4) {
+#if IS_ENABLED(CONFIG_NFS_V4)
+		port = NFS_PORT;
+		max_namelen = NFS4_MAXNAMLEN;
+		max_pathlen = NFS4_MAXPATHLEN;
+		nfs_validate_transport_protocol(args);
+		if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+			goto out_invalid_transport_udp;
+		nfs4_validate_mount_flags(args);
+#else
+		goto out_v4_not_compiled;
+#endif /* CONFIG_NFS_V4 */
+	} else
+		nfs_set_mount_transport_protocol(args);
+
+	nfs_set_port(sap, &args->nfs_server.port, port);
+
+	return nfs_parse_devname(dev_name,
+				   &args->nfs_server.hostname,
+				   max_namelen,
+				   &args->nfs_server.export_path,
+				   max_pathlen);
+
+#if !IS_ENABLED(CONFIG_NFS_V4)
+out_v4_not_compiled:
+	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
+	return -EPROTONOSUPPORT;
+#else
+out_invalid_transport_udp:
+	dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n");
+	return -EINVAL;
+#endif /* !CONFIG_NFS_V4 */
+
+out_no_address:
+	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
+	return -EINVAL;
+}
+
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+		| NFS_MOUNT_SECURE \
+		| NFS_MOUNT_TCP \
+		| NFS_MOUNT_VER3 \
+		| NFS_MOUNT_KERBEROS \
+		| NFS_MOUNT_NONLM \
+		| NFS_MOUNT_BROKEN_SUID \
+		| NFS_MOUNT_STRICTLOCK \
+		| NFS_MOUNT_LEGACY_INTERFACE)
+
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+		~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
+static int
+nfs_compare_remount_data(struct nfs_server *nfss,
+			 struct nfs_parsed_mount_data *data)
+{
+	if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
+	    data->rsize != nfss->rsize ||
+	    data->wsize != nfss->wsize ||
+	    data->version != nfss->nfs_client->rpc_ops->version ||
+	    data->minorversion != nfss->nfs_client->cl_minorversion ||
+	    data->retrans != nfss->client->cl_timeout->to_retries ||
+	    !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) ||
+	    data->acregmin != nfss->acregmin / HZ ||
+	    data->acregmax != nfss->acregmax / HZ ||
+	    data->acdirmin != nfss->acdirmin / HZ ||
+	    data->acdirmax != nfss->acdirmax / HZ ||
+	    data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+	    data->nfs_server.port != nfss->port ||
+	    data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
+	    !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
+			  (struct sockaddr *)&nfss->nfs_client->cl_addr))
+		return -EINVAL;
+
+	return 0;
+}
+
+int
+nfs_remount(struct super_block *sb, int *flags, char *raw_data)
+{
+	int error;
+	struct nfs_server *nfss = sb->s_fs_info;
+	struct nfs_parsed_mount_data *data;
+	struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data;
+	struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
+	u32 nfsvers = nfss->nfs_client->rpc_ops->version;
+
+	sync_filesystem(sb);
+
+	/*
+	 * Userspace mount programs that send binary options generally send
+	 * them populated with default values. We have no way to know which
+	 * ones were explicitly specified. Fall back to legacy behavior and
+	 * just return success.
+	 */
+	if ((nfsvers == 4 && (!options4 || options4->version == 1)) ||
+	    (nfsvers <= 3 && (!options || (options->version >= 1 &&
+					   options->version <= 6))))
+		return 0;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+
+	/* fill out struct with values from existing mount */
+	data->flags = nfss->flags;
+	data->rsize = nfss->rsize;
+	data->wsize = nfss->wsize;
+	data->retrans = nfss->client->cl_timeout->to_retries;
+	data->selected_flavor = nfss->client->cl_auth->au_flavor;
+	data->acregmin = nfss->acregmin / HZ;
+	data->acregmax = nfss->acregmax / HZ;
+	data->acdirmin = nfss->acdirmin / HZ;
+	data->acdirmax = nfss->acdirmax / HZ;
+	data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+	data->nfs_server.port = nfss->port;
+	data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+	data->version = nfsvers;
+	data->minorversion = nfss->nfs_client->cl_minorversion;
+	data->net = current->nsproxy->net_ns;
+	memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+		data->nfs_server.addrlen);
+
+	/* overwrite those values with any that were specified */
+	error = -EINVAL;
+	if (!nfs_parse_mount_options((char *)options, data))
+		goto out;
+
+	/*
+	 * noac is a special case. It implies -o sync, but that's not
+	 * necessarily reflected in the mtab options. do_remount_sb
+	 * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the
+	 * remount options, so we have to explicitly reset it.
+	 */
+	if (data->flags & NFS_MOUNT_NOAC)
+		*flags |= MS_SYNCHRONOUS;
+
+	/* compare new mount options with old ones */
+	error = nfs_compare_remount_data(nfss, data);
+out:
+	kfree(data);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_remount);
+
+/*
+ * Initialise the common bits of the superblock
+ */
+inline void nfs_initialise_sb(struct super_block *sb)
+{
+	struct nfs_server *server = NFS_SB(sb);
+
+	sb->s_magic = NFS_SUPER_MAGIC;
+
+	/* We probably want something more informative here */
+	snprintf(sb->s_id, sizeof(sb->s_id),
+		 "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+	if (sb->s_blocksize == 0)
+		sb->s_blocksize = nfs_block_bits(server->wsize,
+						 &sb->s_blocksize_bits);
+
+	sb->s_bdi = &server->backing_dev_info;
+
+	nfs_super_set_maxbytes(sb, server->maxfilesize);
+}
+
+/*
+ * Finish setting up an NFS2/3 superblock
+ */
+void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+{
+	struct nfs_parsed_mount_data *data = mount_info->parsed;
+	struct nfs_server *server = NFS_SB(sb);
+
+	sb->s_blocksize_bits = 0;
+	sb->s_blocksize = 0;
+	sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr;
+	sb->s_op = server->nfs_client->cl_nfs_mod->sops;
+	if (data && data->bsize)
+		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
+
+	if (server->nfs_client->rpc_ops->version != 2) {
+		/* The VFS shouldn't apply the umask to mode bits. We will do
+		 * so ourselves when necessary.
+		 */
+		sb->s_flags |= MS_POSIXACL;
+		sb->s_time_gran = 1;
+	}
+
+ 	nfs_initialise_sb(sb);
+}
+EXPORT_SYMBOL_GPL(nfs_fill_super);
+
+/*
+ * Finish setting up a cloned NFS2/3/4 superblock
+ */
+void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+{
+	const struct super_block *old_sb = mount_info->cloned->sb;
+	struct nfs_server *server = NFS_SB(sb);
+
+	sb->s_blocksize_bits = old_sb->s_blocksize_bits;
+	sb->s_blocksize = old_sb->s_blocksize;
+	sb->s_maxbytes = old_sb->s_maxbytes;
+	sb->s_xattr = old_sb->s_xattr;
+	sb->s_op = old_sb->s_op;
+	sb->s_time_gran = 1;
+
+	if (server->nfs_client->rpc_ops->version != 2) {
+		/* The VFS shouldn't apply the umask to mode bits. We will do
+		 * so ourselves when necessary.
+		 */
+		sb->s_flags |= MS_POSIXACL;
+	}
+
+ 	nfs_initialise_sb(sb);
+}
+
+static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
+{
+	const struct nfs_server *a = s->s_fs_info;
+	const struct rpc_clnt *clnt_a = a->client;
+	const struct rpc_clnt *clnt_b = b->client;
+
+	if ((s->s_flags & NFS_MS_MASK) != (flags & NFS_MS_MASK))
+		goto Ebusy;
+	if (a->nfs_client != b->nfs_client)
+		goto Ebusy;
+	if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK)
+		goto Ebusy;
+	if (a->wsize != b->wsize)
+		goto Ebusy;
+	if (a->rsize != b->rsize)
+		goto Ebusy;
+	if (a->acregmin != b->acregmin)
+		goto Ebusy;
+	if (a->acregmax != b->acregmax)
+		goto Ebusy;
+	if (a->acdirmin != b->acdirmin)
+		goto Ebusy;
+	if (a->acdirmax != b->acdirmax)
+		goto Ebusy;
+	if (b->auth_info.flavor_len > 0 &&
+	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
+		goto Ebusy;
+	return 1;
+Ebusy:
+	return 0;
+}
+
+struct nfs_sb_mountdata {
+	struct nfs_server *server;
+	int mntflags;
+};
+
+static int nfs_set_super(struct super_block *s, void *data)
+{
+	struct nfs_sb_mountdata *sb_mntdata = data;
+	struct nfs_server *server = sb_mntdata->server;
+	int ret;
+
+	s->s_flags = sb_mntdata->mntflags;
+	s->s_fs_info = server;
+	s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
+	ret = set_anon_super(s, server);
+	if (ret == 0)
+		server->s_dev = s->s_dev;
+	return ret;
+}
+
+static int nfs_compare_super_address(struct nfs_server *server1,
+				     struct nfs_server *server2)
+{
+	struct sockaddr *sap1, *sap2;
+
+	sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
+	sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
+
+	if (sap1->sa_family != sap2->sa_family)
+		return 0;
+
+	switch (sap1->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin1 = (struct sockaddr_in *)sap1;
+		struct sockaddr_in *sin2 = (struct sockaddr_in *)sap2;
+		if (sin1->sin_addr.s_addr != sin2->sin_addr.s_addr)
+			return 0;
+		if (sin1->sin_port != sin2->sin_port)
+			return 0;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin1 = (struct sockaddr_in6 *)sap1;
+		struct sockaddr_in6 *sin2 = (struct sockaddr_in6 *)sap2;
+		if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
+			return 0;
+		if (sin1->sin6_port != sin2->sin6_port)
+			return 0;
+		break;
+	}
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static int nfs_compare_super(struct super_block *sb, void *data)
+{
+	struct nfs_sb_mountdata *sb_mntdata = data;
+	struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb);
+	int mntflags = sb_mntdata->mntflags;
+
+	if (!nfs_compare_super_address(old, server))
+		return 0;
+	/* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
+	if (old->flags & NFS_MOUNT_UNSHARED)
+		return 0;
+	if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
+		return 0;
+	return nfs_compare_mount_options(sb, server, mntflags);
+}
+
+#ifdef CONFIG_NFS_FSCACHE
+static void nfs_get_cache_cookie(struct super_block *sb,
+				 struct nfs_parsed_mount_data *parsed,
+				 struct nfs_clone_mount *cloned)
+{
+	struct nfs_server *nfss = NFS_SB(sb);
+	char *uniq = NULL;
+	int ulen = 0;
+
+	nfss->fscache_key = NULL;
+	nfss->fscache = NULL;
+
+	if (parsed) {
+		if (!(parsed->options & NFS_OPTION_FSCACHE))
+			return;
+		if (parsed->fscache_uniq) {
+			uniq = parsed->fscache_uniq;
+			ulen = strlen(parsed->fscache_uniq);
+		}
+	} else if (cloned) {
+		struct nfs_server *mnt_s = NFS_SB(cloned->sb);
+		if (!(mnt_s->options & NFS_OPTION_FSCACHE))
+			return;
+		if (mnt_s->fscache_key) {
+			uniq = mnt_s->fscache_key->key.uniquifier;
+			ulen = mnt_s->fscache_key->key.uniq_len;
+		};
+	} else
+		return;
+
+	nfs_fscache_get_super_cookie(sb, uniq, ulen);
+}
+#else
+static void nfs_get_cache_cookie(struct super_block *sb,
+				 struct nfs_parsed_mount_data *parsed,
+				 struct nfs_clone_mount *cloned)
+{
+}
+#endif
+
+static int nfs_bdi_register(struct nfs_server *server)
+{
+	return bdi_register_dev(&server->backing_dev_info, server->s_dev);
+}
+
+int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
+			struct nfs_mount_info *mount_info)
+{
+	int error;
+	unsigned long kflags = 0, kflags_out = 0;
+	if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+		kflags |= SECURITY_LSM_NATIVE_LABELS;
+
+	error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
+						kflags, &kflags_out);
+	if (error)
+		goto err;
+
+	if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+		!(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+		NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+err:
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_set_sb_security);
+
+int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
+			  struct nfs_mount_info *mount_info)
+{
+	/* clone any lsm security options from the parent to the new sb */
+	if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops)
+		return -ESTALE;
+	return security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
+}
+EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
+
+struct dentry *nfs_fs_mount_common(struct nfs_server *server,
+				   int flags, const char *dev_name,
+				   struct nfs_mount_info *mount_info,
+				   struct nfs_subversion *nfs_mod)
+{
+	struct super_block *s;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
+	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
+	struct nfs_sb_mountdata sb_mntdata = {
+		.mntflags = flags,
+		.server = server,
+	};
+	int error;
+
+	if (server->flags & NFS_MOUNT_UNSHARED)
+		compare_super = NULL;
+
+	/* -o noac implies -o sync */
+	if (server->flags & NFS_MOUNT_NOAC)
+		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
+	if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL)
+		if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS)
+			sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
+	/* Get a superblock - note that we may end up sharing one that already exists */
+	s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata);
+	if (IS_ERR(s)) {
+		mntroot = ERR_CAST(s);
+		goto out_err_nosb;
+	}
+
+	if (s->s_fs_info != server) {
+		nfs_free_server(server);
+		server = NULL;
+	} else {
+		error = nfs_bdi_register(server);
+		if (error) {
+			mntroot = ERR_PTR(error);
+			goto error_splat_super;
+		}
+		server->super = s;
+	}
+
+	if (!s->s_root) {
+		/* initial superblock/root creation */
+		mount_info->fill_super(s, mount_info);
+		nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
+	}
+
+	mntroot = nfs_get_root(s, mount_info->mntfh, dev_name);
+	if (IS_ERR(mntroot))
+		goto error_splat_super;
+
+	error = mount_info->set_security(s, mntroot, mount_info);
+	if (error)
+		goto error_splat_root;
+
+	s->s_flags |= MS_ACTIVE;
+
+out:
+	return mntroot;
+
+out_err_nosb:
+	nfs_free_server(server);
+	goto out;
+
+error_splat_root:
+	dput(mntroot);
+	mntroot = ERR_PTR(error);
+error_splat_super:
+	deactivate_locked_super(s);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_fs_mount_common);
+
+struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs_fill_super,
+		.set_security = nfs_set_sb_security,
+	};
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
+	struct nfs_subversion *nfs_mod;
+	int error;
+
+	mount_info.parsed = nfs_alloc_parsed_mount_data();
+	mount_info.mntfh = nfs_alloc_fhandle();
+	if (mount_info.parsed == NULL || mount_info.mntfh == NULL)
+		goto out;
+
+	/* Validate the mount data */
+	error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name);
+	if (error == NFS_TEXT_DATA)
+		error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name);
+	if (error < 0) {
+		mntroot = ERR_PTR(error);
+		goto out;
+	}
+
+	nfs_mod = get_nfs_version(mount_info.parsed->version);
+	if (IS_ERR(nfs_mod)) {
+		mntroot = ERR_CAST(nfs_mod);
+		goto out;
+	}
+
+	mntroot = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod);
+
+	put_nfs_version(nfs_mod);
+out:
+	nfs_free_parsed_mount_data(mount_info.parsed);
+	nfs_free_fhandle(mount_info.mntfh);
+	return mntroot;
+}
+EXPORT_SYMBOL_GPL(nfs_fs_mount);
+
+/*
+ * Destroy an NFS2/3 superblock
+ */
+void nfs_kill_super(struct super_block *s)
+{
+	struct nfs_server *server = NFS_SB(s);
+	dev_t dev = s->s_dev;
+
+	generic_shutdown_super(s);
+
+	nfs_fscache_release_super_cookie(s);
+
+	nfs_free_server(server);
+	free_anon_bdev(dev);
+}
+EXPORT_SYMBOL_GPL(nfs_kill_super);
+
+/*
+ * Clone an NFS2/3/4 server record on xdev traversal (FSID-change)
+ */
+static struct dentry *
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs_clone_super,
+		.set_security = nfs_clone_sb_security,
+		.cloned = data,
+	};
+	struct nfs_server *server;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
+	struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod;
+
+	dprintk("--> nfs_xdev_mount()\n");
+
+	mount_info.mntfh = mount_info.cloned->fh;
+
+	/* create a new volume representation */
+	server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
+
+	if (IS_ERR(server))
+		mntroot = ERR_CAST(server);
+	else
+		mntroot = nfs_fs_mount_common(server, flags,
+				dev_name, &mount_info, nfs_mod);
+
+	dprintk("<-- nfs_xdev_mount() = %ld\n",
+			IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L);
+	return mntroot;
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
+{
+	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
+			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
+}
+
+/*
+ * Validate NFSv4 mount options
+ */
+static int nfs4_validate_mount_data(void *options,
+				    struct nfs_parsed_mount_data *args,
+				    const char *dev_name)
+{
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
+	char *c;
+
+	if (data == NULL)
+		goto out_no_data;
+
+	args->version = 4;
+
+	switch (data->version) {
+	case 1:
+		if (data->host_addrlen > sizeof(args->nfs_server.address))
+			goto out_no_address;
+		if (data->host_addrlen == 0)
+			goto out_no_address;
+		args->nfs_server.addrlen = data->host_addrlen;
+		if (copy_from_user(sap, data->host_addr, data->host_addrlen))
+			return -EFAULT;
+		if (!nfs_verify_server_address(sap))
+			goto out_no_address;
+		args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+
+		if (data->auth_flavourlen) {
+			rpc_authflavor_t pseudoflavor;
+			if (data->auth_flavourlen > 1)
+				goto out_inval_auth;
+			if (copy_from_user(&pseudoflavor,
+					   data->auth_flavours,
+					   sizeof(pseudoflavor)))
+				return -EFAULT;
+			args->selected_flavor = pseudoflavor;
+		} else
+			args->selected_flavor = RPC_AUTH_UNIX;
+
+		c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
+		if (IS_ERR(c))
+			return PTR_ERR(c);
+		args->nfs_server.hostname = c;
+
+		c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
+		if (IS_ERR(c))
+			return PTR_ERR(c);
+		args->nfs_server.export_path = c;
+		dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+
+		c = strndup_user(data->client_addr.data, 16);
+		if (IS_ERR(c))
+			return PTR_ERR(c);
+		args->client_address = c;
+
+		/*
+		 * Translate to nfs_parsed_mount_data, which nfs4_fill_super
+		 * can deal with.
+		 */
+
+		args->flags	= data->flags & NFS4_MOUNT_FLAGMASK;
+		args->rsize	= data->rsize;
+		args->wsize	= data->wsize;
+		args->timeo	= data->timeo;
+		args->retrans	= data->retrans;
+		args->acregmin	= data->acregmin;
+		args->acregmax	= data->acregmax;
+		args->acdirmin	= data->acdirmin;
+		args->acdirmax	= data->acdirmax;
+		args->nfs_server.protocol = data->proto;
+		nfs_validate_transport_protocol(args);
+		if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+			goto out_invalid_transport_udp;
+
+		break;
+	default:
+		return NFS_TEXT_DATA;
+	}
+
+	return 0;
+
+out_no_data:
+	dfprintk(MOUNT, "NFS4: mount program didn't pass any mount data\n");
+	return -EINVAL;
+
+out_inval_auth:
+	dfprintk(MOUNT, "NFS4: Invalid number of RPC auth flavours %d\n",
+		 data->auth_flavourlen);
+	return -EINVAL;
+
+out_no_address:
+	dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
+	return -EINVAL;
+
+out_invalid_transport_udp:
+	dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n");
+	return -EINVAL;
+}
+
+/*
+ * NFS v4 module parameters need to stay in the
+ * NFS client for backwards compatibility
+ */
+unsigned int nfs_callback_set_tcpport;
+unsigned short nfs_callback_tcpport;
+/* Default cache timeout is 10 minutes */
+unsigned int nfs_idmap_cache_timeout = 600;
+/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
+bool nfs4_disable_idmapping = true;
+unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+unsigned short send_implementation_id = 1;
+char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
+bool recover_lost_locks = false;
+
+EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
+EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
+EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
+EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
+EXPORT_SYMBOL_GPL(max_session_slots);
+EXPORT_SYMBOL_GPL(send_implementation_id);
+EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
+EXPORT_SYMBOL_GPL(recover_lost_locks);
+
+#define NFS_CALLBACK_MAXPORTNR (65535U)
+
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
+{
+	unsigned long num;
+	int ret;
+
+	if (!val)
+		return -EINVAL;
+	ret = kstrtoul(val, 0, &num);
+	if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
+		return -EINVAL;
+	*((unsigned int *)kp->arg) = num;
+	return 0;
+}
+static struct kernel_param_ops param_ops_portnr = {
+	.set = param_set_portnr,
+	.get = param_get_uint,
+};
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
+module_param(nfs_idmap_cache_timeout, int, 0644);
+module_param(nfs4_disable_idmapping, bool, 0644);
+module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
+			NFS4_CLIENT_ID_UNIQ_LEN, 0600);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+		"Turn off NFSv4 idmapping when using 'sec=sys'");
+module_param(max_session_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
+		"requests the client will negotiate");
+module_param(send_implementation_id, ushort, 0644);
+MODULE_PARM_DESC(send_implementation_id,
+		"Send implementation ID with NFSv4.1 exchange_id");
+MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
+
+module_param(recover_lost_locks, bool, 0644);
+MODULE_PARM_DESC(recover_lost_locks,
+		 "If the server reports that a lock might be lost, "
+		 "try to recover it risking data corruption.");
+
+
+#endif /* CONFIG_NFS_V4 */
diff --git a/kernel/fs/nfs/symlink.c b/kernel/fs/nfs/symlink.c
new file mode 100644
index 000000000..2d5620065
--- /dev/null
+++ b/kernel/fs/nfs/symlink.c
@@ -0,0 +1,78 @@
+/*
+ *  linux/fs/nfs/symlink.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  Optimization changes Copyright (C) 1994 Florian La Roche
+ *
+ *  Jun 7 1999, cache symlink lookups in the page cache.  -DaveM
+ *
+ *  nfs symlink handling code
+ */
+
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/namei.h>
+
+/* Symlink caching in the page cache is even more simplistic
+ * and straight-forward than readdir caching.
+ */
+
+static int nfs_symlink_filler(struct inode *inode, struct page *page)
+{
+	int error;
+
+	error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE);
+	if (error < 0)
+		goto error;
+	SetPageUptodate(page);
+	unlock_page(page);
+	return 0;
+
+error:
+	SetPageError(page);
+	unlock_page(page);
+	return -EIO;
+}
+
+static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = d_inode(dentry);
+	struct page *page;
+	void *err;
+
+	err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
+	if (err)
+		goto read_failed;
+	page = read_cache_page(&inode->i_data, 0,
+				(filler_t *)nfs_symlink_filler, inode);
+	if (IS_ERR(page)) {
+		err = page;
+		goto read_failed;
+	}
+	nd_set_link(nd, kmap(page));
+	return page;
+
+read_failed:
+	nd_set_link(nd, err);
+	return NULL;
+}
+
+/*
+ * symlinks can't do much...
+ */
+const struct inode_operations nfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= nfs_follow_link,
+	.put_link	= page_put_link,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+};
diff --git a/kernel/fs/nfs/sysctl.c b/kernel/fs/nfs/sysctl.c
new file mode 100644
index 000000000..bb6ed810f
--- /dev/null
+++ b/kernel/fs/nfs/sysctl.c
@@ -0,0 +1,64 @@
+/*
+ * linux/fs/nfs/sysctl.c
+ *
+ * Sysctl interface to NFS parameters
+ */
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+
+static struct ctl_table_header *nfs_callback_sysctl_table;
+
+static struct ctl_table nfs_cb_sysctls[] = {
+	{
+		.procname	= "nfs_mountpoint_timeout",
+		.data		= &nfs_mountpoint_expiry_timeout,
+		.maxlen		= sizeof(nfs_mountpoint_expiry_timeout),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nfs_congestion_kb",
+		.data		= &nfs_congestion_kb,
+		.maxlen		= sizeof(nfs_congestion_kb),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static struct ctl_table nfs_cb_sysctl_dir[] = {
+	{
+		.procname = "nfs",
+		.mode = 0555,
+		.child = nfs_cb_sysctls,
+	},
+	{ }
+};
+
+static struct ctl_table nfs_cb_sysctl_root[] = {
+	{
+		.procname = "fs",
+		.mode = 0555,
+		.child = nfs_cb_sysctl_dir,
+	},
+	{ }
+};
+
+int nfs_register_sysctl(void)
+{
+	nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root);
+	if (nfs_callback_sysctl_table == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void nfs_unregister_sysctl(void)
+{
+	unregister_sysctl_table(nfs_callback_sysctl_table);
+	nfs_callback_sysctl_table = NULL;
+}
diff --git a/kernel/fs/nfs/unlink.c b/kernel/fs/nfs/unlink.c
new file mode 100644
index 000000000..fa538b2ba
--- /dev/null
+++ b/kernel/fs/nfs/unlink.c
@@ -0,0 +1,604 @@
+/*
+ *  linux/fs/nfs/unlink.c
+ *
+ * nfs sillydelete handling
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dcache.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"
+
+#include "nfstrace.h"
+
+/**
+ * nfs_free_unlinkdata - release data from a sillydelete operation.
+ * @data: pointer to unlink structure.
+ */
+static void
+nfs_free_unlinkdata(struct nfs_unlinkdata *data)
+{
+	iput(data->dir);
+	put_rpccred(data->cred);
+	kfree(data->args.name.name);
+	kfree(data);
+}
+
+#define NAME_ALLOC_LEN(len)	((len+16) & ~15)
+/**
+ * nfs_copy_dname - copy dentry name to data structure
+ * @dentry: pointer to dentry
+ * @data: nfs_unlinkdata
+ */
+static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data)
+{
+	char		*str;
+	int		len = dentry->d_name.len;
+
+	str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+	data->args.name.len = len;
+	data->args.name.name = str;
+	return 0;
+}
+
+static void nfs_free_dname(struct nfs_unlinkdata *data)
+{
+	kfree(data->args.name.name);
+	data->args.name.name = NULL;
+	data->args.name.len = 0;
+}
+
+static void nfs_dec_sillycount(struct inode *dir)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+	if (atomic_dec_return(&nfsi->silly_count) == 1)
+		wake_up(&nfsi->waitqueue);
+}
+
+/**
+ * nfs_async_unlink_done - Sillydelete post-processing
+ * @task: rpc_task of the sillydelete
+ *
+ * Do the directory attribute update.
+ */
+static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_unlinkdata *data = calldata;
+	struct inode *dir = data->dir;
+
+	trace_nfs_sillyrename_unlink(data, task->tk_status);
+	if (!NFS_PROTO(dir)->unlink_done(task, dir))
+		rpc_restart_call_prepare(task);
+}
+
+/**
+ * nfs_async_unlink_release - Release the sillydelete data.
+ * @task: rpc_task of the sillydelete
+ *
+ * We need to call nfs_put_unlinkdata as a 'tk_release' task since the
+ * rpc_task would be freed too.
+ */
+static void nfs_async_unlink_release(void *calldata)
+{
+	struct nfs_unlinkdata	*data = calldata;
+	struct super_block *sb = data->dir->i_sb;
+
+	nfs_dec_sillycount(data->dir);
+	nfs_free_unlinkdata(data);
+	nfs_sb_deactive(sb);
+}
+
+static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_unlinkdata *data = calldata;
+	NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);
+}
+
+static const struct rpc_call_ops nfs_unlink_ops = {
+	.rpc_call_done = nfs_async_unlink_done,
+	.rpc_release = nfs_async_unlink_release,
+	.rpc_call_prepare = nfs_unlink_prepare,
+};
+
+static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
+{
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = data->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_message = &msg,
+		.callback_ops = &nfs_unlink_ops,
+		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+	struct dentry *alias;
+
+	alias = d_lookup(parent, &data->args.name);
+	if (alias != NULL) {
+		int ret;
+		void *devname_garbage = NULL;
+
+		/*
+		 * Hey, we raced with lookup... See if we need to transfer
+		 * the sillyrename information to the aliased dentry.
+		 */
+		nfs_free_dname(data);
+		ret = nfs_copy_dname(alias, data);
+		spin_lock(&alias->d_lock);
+		if (ret == 0 && d_really_is_positive(alias) &&
+		    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
+			devname_garbage = alias->d_fsdata;
+			alias->d_fsdata = data;
+			alias->d_flags |= DCACHE_NFSFS_RENAMED;
+			ret = 1;
+		} else
+			ret = 0;
+		spin_unlock(&alias->d_lock);
+		nfs_dec_sillycount(dir);
+		dput(alias);
+		/*
+		 * If we'd displaced old cached devname, free it.  At that
+		 * point dentry is definitely not a root, so we won't need
+		 * that anymore.
+		 */
+		kfree(devname_garbage);
+		return ret;
+	}
+	data->dir = igrab(dir);
+	if (!data->dir) {
+		nfs_dec_sillycount(dir);
+		return 0;
+	}
+	nfs_sb_active(dir->i_sb);
+	data->args.fh = NFS_FH(dir);
+	nfs_fattr_init(data->res.dir_attr);
+
+	NFS_PROTO(dir)->unlink_setup(&msg, dir);
+
+	task_setup_data.rpc_client = NFS_CLIENT(dir);
+	task = rpc_run_task(&task_setup_data);
+	if (!IS_ERR(task))
+		rpc_put_task_async(task);
+	return 1;
+}
+
+static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
+{
+	struct dentry *parent;
+	struct inode *dir;
+	int ret = 0;
+
+
+	parent = dget_parent(dentry);
+	if (parent == NULL)
+		goto out_free;
+	dir = d_inode(parent);
+	/* Non-exclusive lock protects against concurrent lookup() calls */
+	spin_lock(&dir->i_lock);
+	if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
+		/* Deferred delete */
+		hlist_add_head(&data->list, &NFS_I(dir)->silly_list);
+		spin_unlock(&dir->i_lock);
+		ret = 1;
+		goto out_dput;
+	}
+	spin_unlock(&dir->i_lock);
+	ret = nfs_do_call_unlink(parent, dir, data);
+out_dput:
+	dput(parent);
+out_free:
+	return ret;
+}
+
+void nfs_wait_on_sillyrename(struct dentry *dentry)
+{
+	struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
+
+	wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1);
+}
+
+void nfs_block_sillyrename(struct dentry *dentry)
+{
+	struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
+
+	wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1);
+}
+
+void nfs_unblock_sillyrename(struct dentry *dentry)
+{
+	struct inode *dir = d_inode(dentry);
+	struct nfs_inode *nfsi = NFS_I(dir);
+	struct nfs_unlinkdata *data;
+
+	atomic_inc(&nfsi->silly_count);
+	spin_lock(&dir->i_lock);
+	while (!hlist_empty(&nfsi->silly_list)) {
+		if (!atomic_inc_not_zero(&nfsi->silly_count))
+			break;
+		data = hlist_entry(nfsi->silly_list.first, struct nfs_unlinkdata, list);
+		hlist_del(&data->list);
+		spin_unlock(&dir->i_lock);
+		if (nfs_do_call_unlink(dentry, dir, data) == 0)
+			nfs_free_unlinkdata(data);
+		spin_lock(&dir->i_lock);
+	}
+	spin_unlock(&dir->i_lock);
+}
+
+/**
+ * nfs_async_unlink - asynchronous unlinking of a file
+ * @dir: parent directory of dentry
+ * @dentry: dentry to unlink
+ */
+static int
+nfs_async_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct nfs_unlinkdata *data;
+	int status = -ENOMEM;
+	void *devname_garbage = NULL;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		goto out;
+
+	data->cred = rpc_lookup_cred();
+	if (IS_ERR(data->cred)) {
+		status = PTR_ERR(data->cred);
+		goto out_free;
+	}
+	data->res.dir_attr = &data->dir_attr;
+
+	status = -EBUSY;
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		goto out_unlock;
+	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
+	devname_garbage = dentry->d_fsdata;
+	dentry->d_fsdata = data;
+	spin_unlock(&dentry->d_lock);
+	/*
+	 * If we'd displaced old cached devname, free it.  At that
+	 * point dentry is definitely not a root, so we won't need
+	 * that anymore.
+	 */
+	kfree(devname_garbage);
+	return 0;
+out_unlock:
+	spin_unlock(&dentry->d_lock);
+	put_rpccred(data->cred);
+out_free:
+	kfree(data);
+out:
+	return status;
+}
+
+/**
+ * nfs_complete_unlink - Initialize completion of the sillydelete
+ * @dentry: dentry to delete
+ * @inode: inode
+ *
+ * Since we're most likely to be called by dentry_iput(), we
+ * only use the dentry to find the sillydelete. We then copy the name
+ * into the qstr.
+ */
+void
+nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
+{
+	struct nfs_unlinkdata	*data = NULL;
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+		data = dentry->d_fsdata;
+		dentry->d_fsdata = NULL;
+	}
+	spin_unlock(&dentry->d_lock);
+
+	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
+		nfs_free_unlinkdata(data);
+}
+
+/* Cancel a queued async unlink. Called when a sillyrename run fails. */
+static void
+nfs_cancel_async_unlink(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		struct nfs_unlinkdata *data = dentry->d_fsdata;
+
+		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+		dentry->d_fsdata = NULL;
+		spin_unlock(&dentry->d_lock);
+		nfs_free_unlinkdata(data);
+		return;
+	}
+	spin_unlock(&dentry->d_lock);
+}
+
+/**
+ * nfs_async_rename_done - Sillyrename post-processing
+ * @task: rpc_task of the sillyrename
+ * @calldata: nfs_renamedata for the sillyrename
+ *
+ * Do the directory attribute updates and the d_move
+ */
+static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_renamedata *data = calldata;
+	struct inode *old_dir = data->old_dir;
+	struct inode *new_dir = data->new_dir;
+	struct dentry *old_dentry = data->old_dentry;
+
+	trace_nfs_sillyrename_rename(old_dir, old_dentry,
+			new_dir, data->new_dentry, task->tk_status);
+	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
+		rpc_restart_call_prepare(task);
+		return;
+	}
+
+	if (data->complete)
+		data->complete(task, data);
+}
+
+/**
+ * nfs_async_rename_release - Release the sillyrename data.
+ * @calldata: the struct nfs_renamedata to be released
+ */
+static void nfs_async_rename_release(void *calldata)
+{
+	struct nfs_renamedata	*data = calldata;
+	struct super_block *sb = data->old_dir->i_sb;
+
+	if (d_really_is_positive(data->old_dentry))
+		nfs_mark_for_revalidate(d_inode(data->old_dentry));
+
+	dput(data->old_dentry);
+	dput(data->new_dentry);
+	iput(data->old_dir);
+	iput(data->new_dir);
+	nfs_sb_deactive(sb);
+	put_rpccred(data->cred);
+	kfree(data);
+}
+
+static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_renamedata *data = calldata;
+	NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data);
+}
+
+static const struct rpc_call_ops nfs_rename_ops = {
+	.rpc_call_done = nfs_async_rename_done,
+	.rpc_release = nfs_async_rename_release,
+	.rpc_call_prepare = nfs_rename_prepare,
+};
+
+/**
+ * nfs_async_rename - perform an asynchronous rename operation
+ * @old_dir: directory that currently holds the dentry to be renamed
+ * @new_dir: target directory for the rename
+ * @old_dentry: original dentry to be renamed
+ * @new_dentry: dentry to which the old_dentry should be renamed
+ *
+ * It's expected that valid references to the dentries and inodes are held
+ */
+struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+		 struct dentry *old_dentry, struct dentry *new_dentry,
+		 void (*complete)(struct rpc_task *, struct nfs_renamedata *))
+{
+	struct nfs_renamedata *data;
+	struct rpc_message msg = { };
+	struct rpc_task_setup task_setup_data = {
+		.rpc_message = &msg,
+		.callback_ops = &nfs_rename_ops,
+		.workqueue = nfsiod_workqueue,
+		.rpc_client = NFS_CLIENT(old_dir),
+		.flags = RPC_TASK_ASYNC,
+	};
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return ERR_PTR(-ENOMEM);
+	task_setup_data.callback_data = data;
+
+	data->cred = rpc_lookup_cred();
+	if (IS_ERR(data->cred)) {
+		struct rpc_task *task = ERR_CAST(data->cred);
+		kfree(data);
+		return task;
+	}
+
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	msg.rpc_cred = data->cred;
+
+	/* set up nfs_renamedata */
+	data->old_dir = old_dir;
+	ihold(old_dir);
+	data->new_dir = new_dir;
+	ihold(new_dir);
+	data->old_dentry = dget(old_dentry);
+	data->new_dentry = dget(new_dentry);
+	nfs_fattr_init(&data->old_fattr);
+	nfs_fattr_init(&data->new_fattr);
+	data->complete = complete;
+
+	/* set up nfs_renameargs */
+	data->args.old_dir = NFS_FH(old_dir);
+	data->args.old_name = &old_dentry->d_name;
+	data->args.new_dir = NFS_FH(new_dir);
+	data->args.new_name = &new_dentry->d_name;
+
+	/* set up nfs_renameres */
+	data->res.old_fattr = &data->old_fattr;
+	data->res.new_fattr = &data->new_fattr;
+
+	nfs_sb_active(old_dir->i_sb);
+
+	NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
+
+	return rpc_run_task(&task_setup_data);
+}
+
+/*
+ * Perform tasks needed when a sillyrename is done such as cancelling the
+ * queued async unlink if it failed.
+ */
+static void
+nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	struct dentry *dentry = data->old_dentry;
+
+	if (task->tk_status != 0) {
+		nfs_cancel_async_unlink(dentry);
+		return;
+	}
+
+	/*
+	 * vfs_unlink and the like do not issue this when a file is
+	 * sillyrenamed, so do it here.
+	 */
+	fsnotify_nameremove(dentry, 0);
+}
+
+#define SILLYNAME_PREFIX ".nfs"
+#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
+#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
+#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1)
+#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \
+		SILLYNAME_FILEID_LEN + \
+		SILLYNAME_COUNTER_LEN)
+
+/**
+ * nfs_sillyrename - Perform a silly-rename of a dentry
+ * @dir: inode of directory that contains dentry
+ * @dentry: dentry to be sillyrenamed
+ *
+ * NFSv2/3 is stateless and the server doesn't know when the client is
+ * holding a file open. To prevent application problems when a file is
+ * unlinked while it's still open, the client performs a "silly-rename".
+ * That is, it renames the file to a hidden file in the same directory,
+ * and only performs the unlink once the last reference to it is put.
+ *
+ * The final cleanup is done during dentry_iput.
+ *
+ * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server
+ * could take responsibility for keeping open files referenced.  The server
+ * would also need to ensure that opened-but-deleted files were kept over
+ * reboots.  However, we may not assume a server does so.  (RFC 5661
+ * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can
+ * use to advertise that it does this; some day we may take advantage of
+ * it.))
+ */
+int
+nfs_sillyrename(struct inode *dir, struct dentry *dentry)
+{
+	static unsigned int sillycounter;
+	unsigned char silly[SILLYNAME_LEN + 1];
+	unsigned long long fileid;
+	struct dentry *sdentry;
+	struct rpc_task *task;
+	int            error = -EBUSY;
+
+	dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n",
+		dentry, d_count(dentry));
+	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
+
+	/*
+	 * We don't allow a dentry to be silly-renamed twice.
+	 */
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		goto out;
+
+	fileid = NFS_FILEID(d_inode(dentry));
+
+	/* Return delegation in anticipation of the rename */
+	NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry));
+
+	sdentry = NULL;
+	do {
+		int slen;
+		dput(sdentry);
+		sillycounter++;
+		slen = scnprintf(silly, sizeof(silly),
+				SILLYNAME_PREFIX "%0*llx%0*x",
+				SILLYNAME_FILEID_LEN, fileid,
+				SILLYNAME_COUNTER_LEN, sillycounter);
+
+		dfprintk(VFS, "NFS: trying to rename %pd to %s\n",
+				dentry, silly);
+
+		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+		/*
+		 * N.B. Better to return EBUSY here ... it could be
+		 * dangerous to delete the file while it's in use.
+		 */
+		if (IS_ERR(sdentry))
+			goto out;
+	} while (d_inode(sdentry) != NULL); /* need negative lookup */
+
+	/* queue unlink first. Can't do this from rpc_release as it
+	 * has to allocate memory
+	 */
+	error = nfs_async_unlink(dir, dentry);
+	if (error)
+		goto out_dput;
+
+	/* populate unlinkdata with the right dname */
+	error = nfs_copy_dname(sdentry,
+				(struct nfs_unlinkdata *)dentry->d_fsdata);
+	if (error) {
+		nfs_cancel_async_unlink(dentry);
+		goto out_dput;
+	}
+
+	/* run the rename task, undo unlink if it fails */
+	task = nfs_async_rename(dir, dir, dentry, sdentry,
+					nfs_complete_sillyrename);
+	if (IS_ERR(task)) {
+		error = -EBUSY;
+		nfs_cancel_async_unlink(dentry);
+		goto out_dput;
+	}
+
+	/* wait for the RPC task to complete, unless a SIGKILL intervenes */
+	error = rpc_wait_for_completion_task(task);
+	if (error == 0)
+		error = task->tk_status;
+	switch (error) {
+	case 0:
+		/* The rename succeeded */
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+		d_move(dentry, sdentry);
+		break;
+	case -ERESTARTSYS:
+		/* The result of the rename is unknown. Play it safe by
+		 * forcing a new lookup */
+		d_drop(dentry);
+		d_drop(sdentry);
+	}
+	rpc_put_task(task);
+out_dput:
+	dput(sdentry);
+out:
+	return error;
+}
diff --git a/kernel/fs/nfs/write.c b/kernel/fs/nfs/write.c
new file mode 100644
index 000000000..dfc19f157
--- /dev/null
+++ b/kernel/fs/nfs/write.c
@@ -0,0 +1,2019 @@
+/*
+ * linux/fs/nfs/write.c
+ *
+ * Write file data over NFS.
+ *
+ * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/migrate.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs_page.h>
+#include <linux/backing-dev.h>
+#include <linux/export.h>
+
+#include <asm/uaccess.h>
+
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "nfs4_fs.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PAGECACHE
+
+#define MIN_POOL_WRITE		(32)
+#define MIN_POOL_COMMIT		(4)
+
+/*
+ * Local function declarations
+ */
+static void nfs_redirty_request(struct nfs_page *req);
+static const struct rpc_call_ops nfs_commit_ops;
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
+static const struct nfs_rw_ops nfs_rw_write_ops;
+static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page);
+
+static struct kmem_cache *nfs_wdata_cachep;
+static mempool_t *nfs_wdata_mempool;
+static struct kmem_cache *nfs_cdata_cachep;
+static mempool_t *nfs_commit_mempool;
+
+struct nfs_commit_data *nfs_commitdata_alloc(void)
+{
+	struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
+
+	if (p) {
+		memset(p, 0, sizeof(*p));
+		INIT_LIST_HEAD(&p->pages);
+	}
+	return p;
+}
+EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
+
+void nfs_commit_free(struct nfs_commit_data *p)
+{
+	mempool_free(p, nfs_commit_mempool);
+}
+EXPORT_SYMBOL_GPL(nfs_commit_free);
+
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
+{
+	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+
+	if (p)
+		memset(p, 0, sizeof(*p));
+	return p;
+}
+
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
+{
+	mempool_free(hdr, nfs_wdata_mempool);
+}
+
+static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
+{
+	ctx->error = error;
+	smp_wmb();
+	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+}
+
+/*
+ * nfs_page_find_head_request_locked - find head request associated with @page
+ *
+ * must be called while holding the inode lock.
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
+{
+	struct nfs_page *req = NULL;
+
+	if (PagePrivate(page))
+		req = (struct nfs_page *)page_private(page);
+	else if (unlikely(PageSwapCache(page)))
+		req = nfs_page_search_commits_for_head_request_locked(nfsi,
+			page);
+
+	if (req) {
+		WARN_ON_ONCE(req->wb_head != req);
+		kref_get(&req->wb_kref);
+	}
+
+	return req;
+}
+
+/*
+ * nfs_page_find_head_request - find head request associated with @page
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
+static struct nfs_page *nfs_page_find_head_request(struct page *page)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+	struct nfs_page *req = NULL;
+
+	spin_lock(&inode->i_lock);
+	req = nfs_page_find_head_request_locked(NFS_I(inode), page);
+	spin_unlock(&inode->i_lock);
+	return req;
+}
+
+/* Adjust the file length if we're writing beyond the end */
+static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+	loff_t end, i_size;
+	pgoff_t end_index;
+
+	spin_lock(&inode->i_lock);
+	i_size = i_size_read(inode);
+	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+	if (i_size > 0 && page_file_index(page) < end_index)
+		goto out;
+	end = page_file_offset(page) + ((loff_t)offset+count);
+	if (i_size >= end)
+		goto out;
+	i_size_write(inode, end);
+	nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+out:
+	spin_unlock(&inode->i_lock);
+}
+
+/* A writeback failed: mark the page as bad, and invalidate the page cache */
+static void nfs_set_pageerror(struct page *page)
+{
+	nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
+}
+
+/*
+ * nfs_page_group_search_locked
+ * @head - head request of page group
+ * @page_offset - offset into page
+ *
+ * Search page group with head @head to find a request that contains the
+ * page offset @page_offset.
+ *
+ * Returns a pointer to the first matching nfs request, or NULL if no
+ * match is found.
+ *
+ * Must be called with the page group lock held
+ */
+static struct nfs_page *
+nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
+{
+	struct nfs_page *req;
+
+	WARN_ON_ONCE(head != head->wb_head);
+	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
+
+	req = head;
+	do {
+		if (page_offset >= req->wb_pgbase &&
+		    page_offset < (req->wb_pgbase + req->wb_bytes))
+			return req;
+
+		req = req->wb_this_page;
+	} while (req != head);
+
+	return NULL;
+}
+
+/*
+ * nfs_page_group_covers_page
+ * @head - head request of page group
+ *
+ * Return true if the page group with head @head covers the whole page,
+ * returns false otherwise
+ */
+static bool nfs_page_group_covers_page(struct nfs_page *req)
+{
+	struct nfs_page *tmp;
+	unsigned int pos = 0;
+	unsigned int len = nfs_page_length(req->wb_page);
+
+	nfs_page_group_lock(req, false);
+
+	do {
+		tmp = nfs_page_group_search_locked(req->wb_head, pos);
+		if (tmp) {
+			/* no way this should happen */
+			WARN_ON_ONCE(tmp->wb_pgbase != pos);
+			pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
+		}
+	} while (tmp && pos < len);
+
+	nfs_page_group_unlock(req);
+	WARN_ON_ONCE(pos > len);
+	return pos == len;
+}
+
+/* We can set the PG_uptodate flag if we see that a write request
+ * covers the full page.
+ */
+static void nfs_mark_uptodate(struct nfs_page *req)
+{
+	if (PageUptodate(req->wb_page))
+		return;
+	if (!nfs_page_group_covers_page(req))
+		return;
+	SetPageUptodate(req->wb_page);
+}
+
+static int wb_priority(struct writeback_control *wbc)
+{
+	int ret = 0;
+	if (wbc->for_reclaim)
+		return FLUSH_HIGHPRI | FLUSH_STABLE;
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ret = FLUSH_COND_STABLE;
+	if (wbc->for_kupdate || wbc->for_background)
+		ret |= FLUSH_LOWPRI;
+	return ret;
+}
+
+/*
+ * NFS congestion control
+ */
+
+int nfs_congestion_kb;
+
+#define NFS_CONGESTION_ON_THRESH 	(nfs_congestion_kb >> (PAGE_SHIFT-10))
+#define NFS_CONGESTION_OFF_THRESH	\
+	(NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+
+static void nfs_set_page_writeback(struct page *page)
+{
+	struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
+	int ret = test_set_page_writeback(page);
+
+	WARN_ON_ONCE(ret != 0);
+
+	if (atomic_long_inc_return(&nfss->writeback) >
+			NFS_CONGESTION_ON_THRESH) {
+		set_bdi_congested(&nfss->backing_dev_info,
+					BLK_RW_ASYNC);
+	}
+}
+
+static void nfs_end_page_writeback(struct nfs_page *req)
+{
+	struct inode *inode = page_file_mapping(req->wb_page)->host;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+
+	if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
+		return;
+
+	end_page_writeback(req->wb_page);
+	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+}
+
+
+/* nfs_page_group_clear_bits
+ *   @req - an nfs request
+ * clears all page group related bits from @req
+ */
+static void
+nfs_page_group_clear_bits(struct nfs_page *req)
+{
+	clear_bit(PG_TEARDOWN, &req->wb_flags);
+	clear_bit(PG_UNLOCKPAGE, &req->wb_flags);
+	clear_bit(PG_UPTODATE, &req->wb_flags);
+	clear_bit(PG_WB_END, &req->wb_flags);
+	clear_bit(PG_REMOVE, &req->wb_flags);
+}
+
+
+/*
+ * nfs_unroll_locks_and_wait -  unlock all newly locked reqs and wait on @req
+ *
+ * this is a helper function for nfs_lock_and_join_requests
+ *
+ * @inode - inode associated with request page group, must be holding inode lock
+ * @head  - head request of page group, must be holding head lock
+ * @req   - request that couldn't lock and needs to wait on the req bit lock
+ * @nonblock - if true, don't actually wait
+ *
+ * NOTE: this must be called holding page_group bit lock and inode spin lock
+ *       and BOTH will be released before returning.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+static int
+nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
+			  struct nfs_page *req, bool nonblock)
+	__releases(&inode->i_lock)
+{
+	struct nfs_page *tmp;
+	int ret;
+
+	/* relinquish all the locks successfully grabbed this run */
+	for (tmp = head ; tmp != req; tmp = tmp->wb_this_page)
+		nfs_unlock_request(tmp);
+
+	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+
+	/* grab a ref on the request that will be waited on */
+	kref_get(&req->wb_kref);
+
+	nfs_page_group_unlock(head);
+	spin_unlock(&inode->i_lock);
+
+	/* release ref from nfs_page_find_head_request_locked */
+	nfs_release_request(head);
+
+	if (!nonblock)
+		ret = nfs_wait_on_request(req);
+	else
+		ret = -EAGAIN;
+	nfs_release_request(req);
+
+	return ret;
+}
+
+/*
+ * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
+ *
+ * @destroy_list - request list (using wb_this_page) terminated by @old_head
+ * @old_head - the old head of the list
+ *
+ * All subrequests must be locked and removed from all lists, so at this point
+ * they are only "active" in this function, and possibly in nfs_wait_on_request
+ * with a reference held by some other context.
+ */
+static void
+nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
+				 struct nfs_page *old_head)
+{
+	while (destroy_list) {
+		struct nfs_page *subreq = destroy_list;
+
+		destroy_list = (subreq->wb_this_page == old_head) ?
+				   NULL : subreq->wb_this_page;
+
+		WARN_ON_ONCE(old_head != subreq->wb_head);
+
+		/* make sure old group is not used */
+		subreq->wb_head = subreq;
+		subreq->wb_this_page = subreq;
+
+		/* subreq is now totally disconnected from page group or any
+		 * write / commit lists. last chance to wake any waiters */
+		nfs_unlock_request(subreq);
+
+		if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+			/* release ref on old head request */
+			nfs_release_request(old_head);
+
+			nfs_page_group_clear_bits(subreq);
+
+			/* release the PG_INODE_REF reference */
+			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags))
+				nfs_release_request(subreq);
+			else
+				WARN_ON_ONCE(1);
+		} else {
+			WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags));
+			/* zombie requests have already released the last
+			 * reference and were waiting on the rest of the
+			 * group to complete. Since it's no longer part of a
+			 * group, simply free the request */
+			nfs_page_group_clear_bits(subreq);
+			nfs_free_request(subreq);
+		}
+	}
+}
+
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req and return
+ *                              a locked reference, cancelling any pending
+ *                              operations for this page.
+ *
+ * @page - the page used to lookup the "page group" of nfs_page structures
+ * @nonblock - if true, don't block waiting for request locks
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group.  All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @page, or a ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *
+nfs_lock_and_join_requests(struct page *page, bool nonblock)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+	struct nfs_page *head, *subreq;
+	struct nfs_page *destroy_list = NULL;
+	unsigned int total_bytes;
+	int ret;
+
+try_again:
+	total_bytes = 0;
+
+	WARN_ON_ONCE(destroy_list);
+
+	spin_lock(&inode->i_lock);
+
+	/*
+	 * A reference is taken only on the head request which acts as a
+	 * reference to the whole page group - the group will not be destroyed
+	 * until the head reference is released.
+	 */
+	head = nfs_page_find_head_request_locked(NFS_I(inode), page);
+
+	if (!head) {
+		spin_unlock(&inode->i_lock);
+		return NULL;
+	}
+
+	/* holding inode lock, so always make a non-blocking call to try the
+	 * page group lock */
+	ret = nfs_page_group_lock(head, true);
+	if (ret < 0) {
+		spin_unlock(&inode->i_lock);
+
+		if (!nonblock && ret == -EAGAIN) {
+			nfs_page_group_lock_wait(head);
+			nfs_release_request(head);
+			goto try_again;
+		}
+
+		nfs_release_request(head);
+		return ERR_PTR(ret);
+	}
+
+	/* lock each request in the page group */
+	subreq = head;
+	do {
+		/*
+		 * Subrequests are always contiguous, non overlapping
+		 * and in order - but may be repeated (mirrored writes).
+		 */
+		if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
+			/* keep track of how many bytes this group covers */
+			total_bytes += subreq->wb_bytes;
+		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
+			    ((subreq->wb_offset + subreq->wb_bytes) >
+			     (head->wb_offset + total_bytes)))) {
+			nfs_page_group_unlock(head);
+			spin_unlock(&inode->i_lock);
+			return ERR_PTR(-EIO);
+		}
+
+		if (!nfs_lock_request(subreq)) {
+			/* releases page group bit lock and
+			 * inode spin lock and all references */
+			ret = nfs_unroll_locks_and_wait(inode, head,
+				subreq, nonblock);
+
+			if (ret == 0)
+				goto try_again;
+
+			return ERR_PTR(ret);
+		}
+
+		subreq = subreq->wb_this_page;
+	} while (subreq != head);
+
+	/* Now that all requests are locked, make sure they aren't on any list.
+	 * Commit list removal accounting is done after locks are dropped */
+	subreq = head;
+	do {
+		nfs_clear_request_commit(subreq);
+		subreq = subreq->wb_this_page;
+	} while (subreq != head);
+
+	/* unlink subrequests from head, destroy them later */
+	if (head->wb_this_page != head) {
+		/* destroy list will be terminated by head */
+		destroy_list = head->wb_this_page;
+		head->wb_this_page = head;
+
+		/* change head request to cover whole range that
+		 * the former page group covered */
+		head->wb_bytes = total_bytes;
+	}
+
+	/*
+	 * prepare head request to be added to new pgio descriptor
+	 */
+	nfs_page_group_clear_bits(head);
+
+	/*
+	 * some part of the group was still on the inode list - otherwise
+	 * the group wouldn't be involved in async write.
+	 * grab a reference for the head request, iff it needs one.
+	 */
+	if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags))
+		kref_get(&head->wb_kref);
+
+	nfs_page_group_unlock(head);
+
+	/* drop lock to clean uprequests on destroy list */
+	spin_unlock(&inode->i_lock);
+
+	nfs_destroy_unlinked_subrequests(destroy_list, head);
+
+	/* still holds ref on head from nfs_page_find_head_request_locked
+	 * and still has lock on head from lock loop */
+	return head;
+}
+
+/*
+ * Find an associated nfs write request, and prepare to flush it out
+ * May return an error if the user signalled nfs_wait_on_request().
+ */
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+				struct page *page, bool nonblock)
+{
+	struct nfs_page *req;
+	int ret = 0;
+
+	req = nfs_lock_and_join_requests(page, nonblock);
+	if (!req)
+		goto out;
+	ret = PTR_ERR(req);
+	if (IS_ERR(req))
+		goto out;
+
+	nfs_set_page_writeback(page);
+	WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
+
+	ret = 0;
+	if (!nfs_pageio_add_request(pgio, req)) {
+		nfs_redirty_request(req);
+		ret = pgio->pg_error;
+	}
+out:
+	return ret;
+}
+
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+	int ret;
+
+	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
+	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
+
+	nfs_pageio_cond_complete(pgio, page_file_index(page));
+	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+	if (ret == -EAGAIN) {
+		redirty_page_for_writepage(wbc, page);
+		ret = 0;
+	}
+	return ret;
+}
+
+/*
+ * Write an mmapped page to the server.
+ */
+static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+{
+	struct nfs_pageio_descriptor pgio;
+	int err;
+
+	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+				false, &nfs_async_write_completion_ops);
+	err = nfs_do_writepage(page, wbc, &pgio);
+	nfs_pageio_complete(&pgio);
+	if (err < 0)
+		return err;
+	if (pgio.pg_error < 0)
+		return pgio.pg_error;
+	return 0;
+}
+
+int nfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+
+	ret = nfs_writepage_locked(page, wbc);
+	unlock_page(page);
+	return ret;
+}
+
+static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
+{
+	int ret;
+
+	ret = nfs_do_writepage(page, wbc, data);
+	unlock_page(page);
+	return ret;
+}
+
+int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	unsigned long *bitlock = &NFS_I(inode)->flags;
+	struct nfs_pageio_descriptor pgio;
+	int err;
+
+	/* Stop dirtying of new pages while we sync */
+	err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+	if (err)
+		goto out_err;
+
+	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
+
+	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
+				&nfs_async_write_completion_ops);
+	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
+	nfs_pageio_complete(&pgio);
+
+	clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
+	smp_mb__after_atomic();
+	wake_up_bit(bitlock, NFS_INO_FLUSHING);
+
+	if (err < 0)
+		goto out_err;
+	err = pgio.pg_error;
+	if (err < 0)
+		goto out_err;
+	return 0;
+out_err:
+	return err;
+}
+
+/*
+ * Insert a write request into an inode
+ */
+static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	WARN_ON_ONCE(req->wb_this_page != req);
+
+	/* Lock the request! */
+	nfs_lock_request(req);
+
+	spin_lock(&inode->i_lock);
+	if (!nfsi->nrequests &&
+	    NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+		inode->i_version++;
+	/*
+	 * Swap-space should not get truncated. Hence no need to plug the race
+	 * with invalidate/truncate.
+	 */
+	if (likely(!PageSwapCache(req->wb_page))) {
+		set_bit(PG_MAPPED, &req->wb_flags);
+		SetPagePrivate(req->wb_page);
+		set_page_private(req->wb_page, (unsigned long)req);
+	}
+	nfsi->nrequests++;
+	/* this a head request for a page group - mark it as having an
+	 * extra reference so sub groups can follow suit.
+	 * This flag also informs pgio layer when to bump nrequests when
+	 * adding subrequests. */
+	WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));
+	kref_get(&req->wb_kref);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Remove a write request from an inode
+ */
+static void nfs_inode_remove_request(struct nfs_page *req)
+{
+	struct inode *inode = d_inode(req->wb_context->dentry);
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_page *head;
+
+	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+		head = req->wb_head;
+
+		spin_lock(&inode->i_lock);
+		if (likely(!PageSwapCache(head->wb_page))) {
+			set_page_private(head->wb_page, 0);
+			ClearPagePrivate(head->wb_page);
+			smp_mb__after_atomic();
+			wake_up_page(head->wb_page, PG_private);
+			clear_bit(PG_MAPPED, &head->wb_flags);
+		}
+		nfsi->nrequests--;
+		spin_unlock(&inode->i_lock);
+	} else {
+		spin_lock(&inode->i_lock);
+		nfsi->nrequests--;
+		spin_unlock(&inode->i_lock);
+	}
+
+	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+		nfs_release_request(req);
+}
+
+static void
+nfs_mark_request_dirty(struct nfs_page *req)
+{
+	__set_page_dirty_nobuffers(req->wb_page);
+}
+
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page)
+{
+	struct nfs_page *freq, *t;
+	struct nfs_commit_info cinfo;
+	struct inode *inode = &nfsi->vfs_inode;
+
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+
+	/* search through pnfs commit lists */
+	freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+	if (freq)
+		return freq->wb_head;
+
+	/* Linearly search the commit list for the correct request */
+	list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+		if (freq->wb_page == page)
+			return freq->wb_head;
+	}
+
+	return NULL;
+}
+
+/**
+ * nfs_request_add_commit_list - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must _not_ hold the cinfo->lock, but must be
+ * holding the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+			    struct nfs_commit_info *cinfo)
+{
+	set_bit(PG_CLEAN, &(req)->wb_flags);
+	spin_lock(cinfo->lock);
+	nfs_list_add_request(req, dst);
+	cinfo->mds->ncommit++;
+	spin_unlock(cinfo->lock);
+	if (!cinfo->dreq)
+		nfs_mark_page_unstable(req->wb_page);
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
+
+/**
+ * nfs_request_remove_commit_list - Remove request from a commit list
+ * @req: pointer to a nfs_page
+ * @cinfo: holds list lock and accounting info
+ *
+ * This clears the PG_CLEAN bit, and updates the cinfo's count of
+ * number of outstanding requests requiring a commit
+ * It does not update the MM page stats.
+ *
+ * The caller _must_ hold the cinfo->lock and the nfs_page lock.
+ */
+void
+nfs_request_remove_commit_list(struct nfs_page *req,
+			       struct nfs_commit_info *cinfo)
+{
+	if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
+		return;
+	nfs_list_remove_request(req);
+	cinfo->mds->ncommit--;
+}
+EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
+
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode)
+{
+	cinfo->lock = &inode->i_lock;
+	cinfo->mds = &NFS_I(inode)->commit_info;
+	cinfo->ds = pnfs_get_ds_info(inode);
+	cinfo->dreq = NULL;
+	cinfo->completion_ops = &nfs_commit_completion_ops;
+}
+
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq)
+{
+	if (dreq)
+		nfs_init_cinfo_from_dreq(cinfo, dreq);
+	else
+		nfs_init_cinfo_from_inode(cinfo, inode);
+}
+EXPORT_SYMBOL_GPL(nfs_init_cinfo);
+
+/*
+ * Add a request to the inode's commit list.
+ */
+void
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+	if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
+		return;
+	nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+}
+
+static void
+nfs_clear_page_commit(struct page *page)
+{
+	dec_zone_page_state(page, NR_UNSTABLE_NFS);
+	dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+}
+
+/* Called holding inode (/cinfo) lock */
+static void
+nfs_clear_request_commit(struct nfs_page *req)
+{
+	if (test_bit(PG_CLEAN, &req->wb_flags)) {
+		struct inode *inode = d_inode(req->wb_context->dentry);
+		struct nfs_commit_info cinfo;
+
+		nfs_init_cinfo_from_inode(&cinfo, inode);
+		if (!pnfs_clear_request_commit(req, &cinfo)) {
+			nfs_request_remove_commit_list(req, &cinfo);
+		}
+		nfs_clear_page_commit(req->wb_page);
+	}
+}
+
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
+{
+	if (hdr->verf.committed == NFS_DATA_SYNC)
+		return hdr->lseg == NULL;
+	return hdr->verf.committed != NFS_FILE_SYNC;
+}
+
+static void nfs_write_completion(struct nfs_pgio_header *hdr)
+{
+	struct nfs_commit_info cinfo;
+	unsigned long bytes = 0;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out;
+	nfs_init_cinfo_from_inode(&cinfo, hdr->inode);
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+		bytes += req->wb_bytes;
+		nfs_list_remove_request(req);
+		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
+		    (hdr->good_bytes < bytes)) {
+			nfs_set_pageerror(req->wb_page);
+			nfs_context_set_write_error(req->wb_context, hdr->error);
+			goto remove_req;
+		}
+		if (nfs_write_need_commit(hdr)) {
+			memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
+			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+				hdr->pgio_mirror_idx);
+			goto next;
+		}
+remove_req:
+		nfs_inode_remove_request(req);
+next:
+		nfs_unlock_request(req);
+		nfs_end_page_writeback(req);
+		nfs_release_request(req);
+	}
+out:
+	hdr->release(hdr);
+}
+
+unsigned long
+nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
+{
+	return cinfo->mds->ncommit;
+}
+
+/* cinfo->lock held by caller */
+int
+nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+		     struct nfs_commit_info *cinfo, int max)
+{
+	struct nfs_page *req, *tmp;
+	int ret = 0;
+
+	list_for_each_entry_safe(req, tmp, src, wb_list) {
+		if (!nfs_lock_request(req))
+			continue;
+		kref_get(&req->wb_kref);
+		if (cond_resched_lock(cinfo->lock))
+			list_safe_reset_next(req, tmp, wb_list);
+		nfs_request_remove_commit_list(req, cinfo);
+		nfs_list_add_request(req, dst);
+		ret++;
+		if ((ret == max) && !cinfo->dreq)
+			break;
+	}
+	return ret;
+}
+
+/*
+ * nfs_scan_commit - Scan an inode for commit requests
+ * @inode: NFS inode to scan
+ * @dst: mds destination list
+ * @cinfo: mds and ds lists of reqs ready to commit
+ *
+ * Moves requests from the inode's 'commit' request list.
+ * The requests are *not* checked to ensure that they form a contiguous set.
+ */
+int
+nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		struct nfs_commit_info *cinfo)
+{
+	int ret = 0;
+
+	spin_lock(cinfo->lock);
+	if (cinfo->mds->ncommit > 0) {
+		const int max = INT_MAX;
+
+		ret = nfs_scan_commit_list(&cinfo->mds->list, dst,
+					   cinfo, max);
+		ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
+	}
+	spin_unlock(cinfo->lock);
+	return ret;
+}
+
+/*
+ * Search for an existing write request, and attempt to update
+ * it to reflect a new dirty region on a given page.
+ *
+ * If the attempt fails, then the existing request is flushed out
+ * to disk.
+ */
+static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
+		struct page *page,
+		unsigned int offset,
+		unsigned int bytes)
+{
+	struct nfs_page *req;
+	unsigned int rqend;
+	unsigned int end;
+	int error;
+
+	if (!PagePrivate(page))
+		return NULL;
+
+	end = offset + bytes;
+	spin_lock(&inode->i_lock);
+
+	for (;;) {
+		req = nfs_page_find_head_request_locked(NFS_I(inode), page);
+		if (req == NULL)
+			goto out_unlock;
+
+		/* should be handled by nfs_flush_incompatible */
+		WARN_ON_ONCE(req->wb_head != req);
+		WARN_ON_ONCE(req->wb_this_page != req);
+
+		rqend = req->wb_offset + req->wb_bytes;
+		/*
+		 * Tell the caller to flush out the request if
+		 * the offsets are non-contiguous.
+		 * Note: nfs_flush_incompatible() will already
+		 * have flushed out requests having wrong owners.
+		 */
+		if (offset > rqend
+		    || end < req->wb_offset)
+			goto out_flushme;
+
+		if (nfs_lock_request(req))
+			break;
+
+		/* The request is locked, so wait and then retry */
+		spin_unlock(&inode->i_lock);
+		error = nfs_wait_on_request(req);
+		nfs_release_request(req);
+		if (error != 0)
+			goto out_err;
+		spin_lock(&inode->i_lock);
+	}
+
+	/* Okay, the request matches. Update the region */
+	if (offset < req->wb_offset) {
+		req->wb_offset = offset;
+		req->wb_pgbase = offset;
+	}
+	if (end > rqend)
+		req->wb_bytes = end - req->wb_offset;
+	else
+		req->wb_bytes = rqend - req->wb_offset;
+out_unlock:
+	if (req)
+		nfs_clear_request_commit(req);
+	spin_unlock(&inode->i_lock);
+	return req;
+out_flushme:
+	spin_unlock(&inode->i_lock);
+	nfs_release_request(req);
+	error = nfs_wb_page(inode, page);
+out_err:
+	return ERR_PTR(error);
+}
+
+/*
+ * Try to update an existing write request, or create one if there is none.
+ *
+ * Note: Should always be called with the Page Lock held to prevent races
+ * if we have to add a new request. Also assumes that the caller has
+ * already called nfs_flush_incompatible() if necessary.
+ */
+static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
+		struct page *page, unsigned int offset, unsigned int bytes)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+	struct nfs_page	*req;
+
+	req = nfs_try_to_update_request(inode, page, offset, bytes);
+	if (req != NULL)
+		goto out;
+	req = nfs_create_request(ctx, page, NULL, offset, bytes);
+	if (IS_ERR(req))
+		goto out;
+	nfs_inode_add_request(inode, req);
+out:
+	return req;
+}
+
+static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+		unsigned int offset, unsigned int count)
+{
+	struct nfs_page	*req;
+
+	req = nfs_setup_write_request(ctx, page, offset, count);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	/* Update file length */
+	nfs_grow_file(page, offset, count);
+	nfs_mark_uptodate(req);
+	nfs_mark_request_dirty(req);
+	nfs_unlock_and_release_request(req);
+	return 0;
+}
+
+int nfs_flush_incompatible(struct file *file, struct page *page)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
+	struct nfs_lock_context *l_ctx;
+	struct file_lock_context *flctx = file_inode(file)->i_flctx;
+	struct nfs_page	*req;
+	int do_flush, status;
+	/*
+	 * Look for a request corresponding to this page. If there
+	 * is one, and it belongs to another file, we flush it out
+	 * before we try to copy anything into the page. Do this
+	 * due to the lack of an ACCESS-type call in NFSv2.
+	 * Also do the same if we find a request from an existing
+	 * dropped page.
+	 */
+	do {
+		req = nfs_page_find_head_request(page);
+		if (req == NULL)
+			return 0;
+		l_ctx = req->wb_lock_context;
+		do_flush = req->wb_page != page || req->wb_context != ctx;
+		/* for now, flush if more than 1 request in page_group */
+		do_flush |= req->wb_this_page != req;
+		if (l_ctx && flctx &&
+		    !(list_empty_careful(&flctx->flc_posix) &&
+		      list_empty_careful(&flctx->flc_flock))) {
+			do_flush |= l_ctx->lockowner.l_owner != current->files
+				|| l_ctx->lockowner.l_pid != current->tgid;
+		}
+		nfs_release_request(req);
+		if (!do_flush)
+			return 0;
+		status = nfs_wb_page(page_file_mapping(page)->host, page);
+	} while (status == 0);
+	return status;
+}
+
+/*
+ * Avoid buffered writes when a open context credential's key would
+ * expire soon.
+ *
+ * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL.
+ *
+ * Return 0 and set a credential flag which triggers the inode to flush
+ * and performs  NFS_FILE_SYNC writes if the key will expired within
+ * RPC_KEY_EXPIRE_TIMEO.
+ */
+int
+nfs_key_timeout_notify(struct file *filp, struct inode *inode)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(filp);
+	struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+
+	return rpcauth_key_timeout_notify(auth, ctx->cred);
+}
+
+/*
+ * Test if the open context credential key is marked to expire soon.
+ */
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
+{
+	return rpcauth_cred_key_to_expire(ctx->cred);
+}
+
+/*
+ * If the page cache is marked as unsafe or invalid, then we can't rely on
+ * the PageUptodate() flag. In this case, we will need to turn off
+ * write optimisations that depend on the page contents being correct.
+ */
+static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (nfs_have_delegated_attributes(inode))
+		goto out;
+	if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+		return false;
+	smp_rmb();
+	if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
+		return false;
+out:
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+		return false;
+	return PageUptodate(page) != 0;
+}
+
+static bool
+is_whole_file_wrlock(struct file_lock *fl)
+{
+	return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
+			fl->fl_type == F_WRLCK;
+}
+
+/* If we know the page is up to date, and we're not using byte range locks (or
+ * if we have the whole file locked for writing), it may be more efficient to
+ * extend the write to cover the entire page in order to avoid fragmentation
+ * inefficiencies.
+ *
+ * If the file is opened for synchronous writes then we can just skip the rest
+ * of the checks.
+ */
+static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
+{
+	int ret;
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct file_lock *fl;
+
+	if (file->f_flags & O_DSYNC)
+		return 0;
+	if (!nfs_write_pageuptodate(page, inode))
+		return 0;
+	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+		return 1;
+	if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
+		       list_empty_careful(&flctx->flc_posix)))
+		return 0;
+
+	/* Check to see if there are whole file write locks */
+	ret = 0;
+	spin_lock(&flctx->flc_lock);
+	if (!list_empty(&flctx->flc_posix)) {
+		fl = list_first_entry(&flctx->flc_posix, struct file_lock,
+					fl_list);
+		if (is_whole_file_wrlock(fl))
+			ret = 1;
+	} else if (!list_empty(&flctx->flc_flock)) {
+		fl = list_first_entry(&flctx->flc_flock, struct file_lock,
+					fl_list);
+		if (fl->fl_type == F_WRLCK)
+			ret = 1;
+	}
+	spin_unlock(&flctx->flc_lock);
+	return ret;
+}
+
+/*
+ * Update and possibly write a cached page of an NFS file.
+ *
+ * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
+ * things with a page scheduled for an RPC call (e.g. invalidate it).
+ */
+int nfs_updatepage(struct file *file, struct page *page,
+		unsigned int offset, unsigned int count)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
+	struct inode	*inode = page_file_mapping(page)->host;
+	int		status = 0;
+
+	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
+
+	dprintk("NFS:       nfs_updatepage(%pD2 %d@%lld)\n",
+		file, count, (long long)(page_file_offset(page) + offset));
+
+	if (nfs_can_extend_write(file, page, inode)) {
+		count = max(count + offset, nfs_page_length(page));
+		offset = 0;
+	}
+
+	status = nfs_writepage_setup(ctx, page, offset, count);
+	if (status < 0)
+		nfs_set_pageerror(page);
+	else
+		__set_page_dirty_nobuffers(page);
+
+	dprintk("NFS:       nfs_updatepage returns %d (isize %lld)\n",
+			status, (long long)i_size_read(inode));
+	return status;
+}
+
+static int flush_task_priority(int how)
+{
+	switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
+		case FLUSH_HIGHPRI:
+			return RPC_PRIORITY_HIGH;
+		case FLUSH_LOWPRI:
+			return RPC_PRIORITY_LOW;
+	}
+	return RPC_PRIORITY_NORMAL;
+}
+
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+			       struct rpc_message *msg,
+			       const struct nfs_rpc_ops *rpc_ops,
+			       struct rpc_task_setup *task_setup_data, int how)
+{
+	int priority = flush_task_priority(how);
+
+	task_setup_data->priority = priority;
+	rpc_ops->write_setup(hdr, msg);
+
+	nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
+				 &task_setup_data->rpc_client, msg, hdr);
+}
+
+/* If a nfs_flush_* function fails, it should remove reqs from @head and
+ * call this on each, which will prepare them to be retried on next
+ * writeback using standard nfs.
+ */
+static void nfs_redirty_request(struct nfs_page *req)
+{
+	nfs_mark_request_dirty(req);
+	nfs_unlock_request(req);
+	nfs_end_page_writeback(req);
+	nfs_release_request(req);
+}
+
+static void nfs_async_write_error(struct list_head *head)
+{
+	struct nfs_page	*req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_redirty_request(req);
+	}
+}
+
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
+	.error_cleanup = nfs_async_write_error,
+	.completion = nfs_write_completion,
+};
+
+void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+			       struct inode *inode, int ioflags, bool force_mds,
+			       const struct nfs_pgio_completion_ops *compl_ops)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+	if (server->pnfs_curr_ld && !force_mds)
+		pg_ops = server->pnfs_curr_ld->pg_write_ops;
+#endif
+	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
+			server->wsize, ioflags);
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
+
+void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
+{
+	struct nfs_pgio_mirror *mirror;
+
+	pgio->pg_ops = &nfs_pgio_rw_ops;
+
+	nfs_pageio_stop_mirroring(pgio);
+
+	mirror = &pgio->pg_mirrors[0];
+	mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
+
+
+void nfs_commit_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_commit_data *data = calldata;
+
+	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
+}
+
+static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
+{
+	/* do nothing! */
+}
+
+/*
+ * Special version of should_remove_suid() that ignores capabilities.
+ */
+static int nfs_should_remove_suid(const struct inode *inode)
+{
+	umode_t mode = inode->i_mode;
+	int kill = 0;
+
+	/* suid always must be killed */
+	if (unlikely(mode & S_ISUID))
+		kill = ATTR_KILL_SUID;
+
+	/*
+	 * sgid without any exec bits is just a mandatory locking mark; leave
+	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
+	 */
+	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+		kill |= ATTR_KILL_SGID;
+
+	if (unlikely(kill && S_ISREG(mode)))
+		return kill;
+
+	return 0;
+}
+
+static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
+		struct nfs_fattr *fattr)
+{
+	struct nfs_pgio_args *argp = &hdr->args;
+	struct nfs_pgio_res *resp = &hdr->res;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+		return;
+	if (argp->offset + resp->count != fattr->size)
+		return;
+	if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode))
+		return;
+	/* Set attribute barrier */
+	nfs_fattr_set_barrier(fattr);
+}
+
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
+{
+	struct nfs_fattr *fattr = hdr->res.fattr;
+	struct inode *inode = hdr->inode;
+
+	if (fattr == NULL)
+		return;
+	spin_lock(&inode->i_lock);
+	nfs_writeback_check_extend(hdr, fattr);
+	nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_writeback_update_inode);
+
+/*
+ * This function is called when the WRITE call is complete.
+ */
+static int nfs_writeback_done(struct rpc_task *task,
+			      struct nfs_pgio_header *hdr,
+			      struct inode *inode)
+{
+	int status;
+
+	/*
+	 * ->write_done will attempt to use post-op attributes to detect
+	 * conflicting writes by other clients.  A strict interpretation
+	 * of close-to-open would allow us to continue caching even if
+	 * another writer had changed the file, but some applications
+	 * depend on tighter cache coherency when writing.
+	 */
+	status = NFS_PROTO(inode)->write_done(task, hdr);
+	if (status != 0)
+		return status;
+	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
+
+	if (hdr->res.verf->committed < hdr->args.stable &&
+	    task->tk_status >= 0) {
+		/* We tried a write call, but the server did not
+		 * commit data to stable storage even though we
+		 * requested it.
+		 * Note: There is a known bug in Tru64 < 5.0 in which
+		 *	 the server reports NFS_DATA_SYNC, but performs
+		 *	 NFS_FILE_SYNC. We therefore implement this checking
+		 *	 as a dprintk() in order to avoid filling syslog.
+		 */
+		static unsigned long    complain;
+
+		/* Note this will print the MDS for a DS write */
+		if (time_before(complain, jiffies)) {
+			dprintk("NFS:       faulty NFS server %s:"
+				" (committed = %d) != (stable = %d)\n",
+				NFS_SERVER(inode)->nfs_client->cl_hostname,
+				hdr->res.verf->committed, hdr->args.stable);
+			complain = jiffies + 300 * HZ;
+		}
+	}
+
+	/* Deal with the suid/sgid bit corner case */
+	if (nfs_should_remove_suid(inode))
+		nfs_mark_for_revalidate(inode);
+	return 0;
+}
+
+/*
+ * This function is called when the WRITE call is complete.
+ */
+static void nfs_writeback_result(struct rpc_task *task,
+				 struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_args	*argp = &hdr->args;
+	struct nfs_pgio_res	*resp = &hdr->res;
+
+	if (resp->count < argp->count) {
+		static unsigned long    complain;
+
+		/* This a short write! */
+		nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
+
+		/* Has the server at least made some progress? */
+		if (resp->count == 0) {
+			if (time_before(complain, jiffies)) {
+				printk(KERN_WARNING
+				       "NFS: Server wrote zero bytes, expected %u.\n",
+				       argp->count);
+				complain = jiffies + 300 * HZ;
+			}
+			nfs_set_pgio_error(hdr, -EIO, argp->offset);
+			task->tk_status = -EIO;
+			return;
+		}
+		/* Was this an NFSv2 write or an NFSv3 stable write? */
+		if (resp->verf->committed != NFS_UNSTABLE) {
+			/* Resend from where the server left off */
+			hdr->mds_offset += resp->count;
+			argp->offset += resp->count;
+			argp->pgbase += resp->count;
+			argp->count -= resp->count;
+		} else {
+			/* Resend as a stable write in order to avoid
+			 * headaches in the case of a server crash.
+			 */
+			argp->stable = NFS_FILE_SYNC;
+		}
+		rpc_restart_call_prepare(task);
+	}
+}
+
+
+static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+{
+	int ret;
+
+	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
+		return 1;
+	if (!may_wait)
+		return 0;
+	ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_COMMIT,
+				nfs_wait_bit_killable,
+				TASK_KILLABLE);
+	return (ret < 0) ? ret : 1;
+}
+
+static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+{
+	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+}
+
+void nfs_commitdata_release(struct nfs_commit_data *data)
+{
+	put_nfs_open_context(data->context);
+	nfs_commit_free(data);
+}
+EXPORT_SYMBOL_GPL(nfs_commitdata_release);
+
+int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
+			const struct nfs_rpc_ops *nfs_ops,
+			const struct rpc_call_ops *call_ops,
+			int how, int flags)
+{
+	struct rpc_task *task;
+	int priority = flush_task_priority(how);
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = data->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.task = &data->task,
+		.rpc_client = clnt,
+		.rpc_message = &msg,
+		.callback_ops = call_ops,
+		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC | flags,
+		.priority = priority,
+	};
+	/* Set up the initial task struct.  */
+	nfs_ops->commit_setup(data, &msg);
+
+	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
+	nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
+		NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	if (how & FLUSH_SYNC)
+		rpc_wait_for_completion_task(task);
+	rpc_put_task(task);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_commit);
+
+static loff_t nfs_get_lwb(struct list_head *head)
+{
+	loff_t lwb = 0;
+	struct nfs_page *req;
+
+	list_for_each_entry(req, head, wb_list)
+		if (lwb < (req_offset(req) + req->wb_bytes))
+			lwb = req_offset(req) + req->wb_bytes;
+
+	return lwb;
+}
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+void nfs_init_commit(struct nfs_commit_data *data,
+		     struct list_head *head,
+		     struct pnfs_layout_segment *lseg,
+		     struct nfs_commit_info *cinfo)
+{
+	struct nfs_page *first = nfs_list_entry(head->next);
+	struct inode *inode = d_inode(first->wb_context->dentry);
+
+	/* Set up the RPC argument and reply structs
+	 * NB: take care not to mess about with data->commit et al. */
+
+	list_splice_init(head, &data->pages);
+
+	data->inode	  = inode;
+	data->cred	  = first->wb_context->cred;
+	data->lseg	  = lseg; /* reference transferred */
+	/* only set lwb for pnfs commit */
+	if (lseg)
+		data->lwb = nfs_get_lwb(&data->pages);
+	data->mds_ops     = &nfs_commit_ops;
+	data->completion_ops = cinfo->completion_ops;
+	data->dreq	  = cinfo->dreq;
+
+	data->args.fh     = NFS_FH(data->inode);
+	/* Note: we always request a commit of the entire inode */
+	data->args.offset = 0;
+	data->args.count  = 0;
+	data->context     = get_nfs_open_context(first->wb_context);
+	data->res.fattr   = &data->fattr;
+	data->res.verf    = &data->verf;
+	nfs_fattr_init(&data->fattr);
+}
+EXPORT_SYMBOL_GPL(nfs_init_commit);
+
+void nfs_retry_commit(struct list_head *page_list,
+		      struct pnfs_layout_segment *lseg,
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx)
+{
+	struct nfs_page *req;
+
+	while (!list_empty(page_list)) {
+		req = nfs_list_entry(page_list->next);
+		nfs_list_remove_request(req);
+		nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+		if (!cinfo->dreq)
+			nfs_clear_page_commit(req->wb_page);
+		nfs_unlock_and_release_request(req);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs_retry_commit);
+
+/*
+ * Commit dirty pages
+ */
+static int
+nfs_commit_list(struct inode *inode, struct list_head *head, int how,
+		struct nfs_commit_info *cinfo)
+{
+	struct nfs_commit_data	*data;
+
+	data = nfs_commitdata_alloc();
+
+	if (!data)
+		goto out_bad;
+
+	/* Set up the argument struct */
+	nfs_init_commit(data, head, NULL, cinfo);
+	atomic_inc(&cinfo->mds->rpcs_out);
+	return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
+				   data->mds_ops, how, 0);
+ out_bad:
+	nfs_retry_commit(head, NULL, cinfo, 0);
+	cinfo->completion_ops->error_cleanup(NFS_I(inode));
+	return -ENOMEM;
+}
+
+/*
+ * COMMIT call returned
+ */
+static void nfs_commit_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_commit_data	*data = calldata;
+
+        dprintk("NFS: %5u nfs_commit_done (status %d)\n",
+                                task->tk_pid, task->tk_status);
+
+	/* Call the NFS version-specific code */
+	NFS_PROTO(data->inode)->commit_done(task, data);
+}
+
+static void nfs_commit_release_pages(struct nfs_commit_data *data)
+{
+	struct nfs_page	*req;
+	int status = data->task.tk_status;
+	struct nfs_commit_info cinfo;
+	struct nfs_server *nfss;
+
+	while (!list_empty(&data->pages)) {
+		req = nfs_list_entry(data->pages.next);
+		nfs_list_remove_request(req);
+		nfs_clear_page_commit(req->wb_page);
+
+		dprintk("NFS:       commit (%s/%llu %d@%lld)",
+			req->wb_context->dentry->d_sb->s_id,
+			(unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
+			req->wb_bytes,
+			(long long)req_offset(req));
+		if (status < 0) {
+			nfs_context_set_write_error(req->wb_context, status);
+			nfs_inode_remove_request(req);
+			dprintk(", error = %d\n", status);
+			goto next;
+		}
+
+		/* Okay, COMMIT succeeded, apparently. Check the verifier
+		 * returned by the server against all stored verfs. */
+		if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) {
+			/* We have a match */
+			nfs_inode_remove_request(req);
+			dprintk(" OK\n");
+			goto next;
+		}
+		/* We have a mismatch. Write the page again */
+		dprintk(" mismatch\n");
+		nfs_mark_request_dirty(req);
+		set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
+	next:
+		nfs_unlock_and_release_request(req);
+	}
+	nfss = NFS_SERVER(data->inode);
+	if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
+	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
+	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
+		nfs_commit_clear_lock(NFS_I(data->inode));
+}
+
+static void nfs_commit_release(void *calldata)
+{
+	struct nfs_commit_data *data = calldata;
+
+	data->completion_ops->completion(data);
+	nfs_commitdata_release(calldata);
+}
+
+static const struct rpc_call_ops nfs_commit_ops = {
+	.rpc_call_prepare = nfs_commit_prepare,
+	.rpc_call_done = nfs_commit_done,
+	.rpc_release = nfs_commit_release,
+};
+
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
+	.completion = nfs_commit_release_pages,
+	.error_cleanup = nfs_commit_clear_lock,
+};
+
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+			    int how, struct nfs_commit_info *cinfo)
+{
+	int status;
+
+	status = pnfs_commit_list(inode, head, how, cinfo);
+	if (status == PNFS_NOT_ATTEMPTED)
+		status = nfs_commit_list(inode, head, how, cinfo);
+	return status;
+}
+
+int nfs_commit_inode(struct inode *inode, int how)
+{
+	LIST_HEAD(head);
+	struct nfs_commit_info cinfo;
+	int may_wait = how & FLUSH_SYNC;
+	int res;
+
+	res = nfs_commit_set_lock(NFS_I(inode), may_wait);
+	if (res <= 0)
+		goto out_mark_dirty;
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+	res = nfs_scan_commit(inode, &head, &cinfo);
+	if (res) {
+		int error;
+
+		error = nfs_generic_commit_list(inode, &head, how, &cinfo);
+		if (error < 0)
+			return error;
+		if (!may_wait)
+			goto out_mark_dirty;
+		error = wait_on_bit_action(&NFS_I(inode)->flags,
+				NFS_INO_COMMIT,
+				nfs_wait_bit_killable,
+				TASK_KILLABLE);
+		if (error < 0)
+			return error;
+	} else
+		nfs_commit_clear_lock(NFS_I(inode));
+	return res;
+	/* Note: If we exit without ensuring that the commit is complete,
+	 * we must mark the inode as dirty. Otherwise, future calls to
+	 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
+	 * that the data is on the disk.
+	 */
+out_mark_dirty:
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	return res;
+}
+
+static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int flags = FLUSH_SYNC;
+	int ret = 0;
+
+	/* no commits means nothing needs to be done */
+	if (!nfsi->commit_info.ncommit)
+		return ret;
+
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+		/* Don't commit yet if this is a non-blocking flush and there
+		 * are a lot of outstanding writes for this mapping.
+		 */
+		if (nfsi->commit_info.ncommit <= (nfsi->nrequests >> 1))
+			goto out_mark_dirty;
+
+		/* don't wait for the COMMIT response */
+		flags = 0;
+	}
+
+	ret = nfs_commit_inode(inode, flags);
+	if (ret >= 0) {
+		if (wbc->sync_mode == WB_SYNC_NONE) {
+			if (ret < wbc->nr_to_write)
+				wbc->nr_to_write -= ret;
+			else
+				wbc->nr_to_write = 0;
+		}
+		return 0;
+	}
+out_mark_dirty:
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	return ret;
+}
+
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return nfs_commit_unstable_pages(inode, wbc);
+}
+EXPORT_SYMBOL_GPL(nfs_write_inode);
+
+/*
+ * flush the inode to disk.
+ */
+int nfs_wb_all(struct inode *inode)
+{
+	int ret;
+
+	trace_nfs_writeback_inode_enter(inode);
+
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret)
+		goto out;
+	ret = nfs_commit_inode(inode, FLUSH_SYNC);
+	if (ret < 0)
+		goto out;
+	pnfs_sync_inode(inode, true);
+	ret = 0;
+
+out:
+	trace_nfs_writeback_inode_exit(inode, ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_wb_all);
+
+int nfs_wb_page_cancel(struct inode *inode, struct page *page)
+{
+	struct nfs_page *req;
+	int ret = 0;
+
+	wait_on_page_writeback(page);
+
+	/* blocking call to cancel all requests and join to a single (head)
+	 * request */
+	req = nfs_lock_and_join_requests(page, false);
+
+	if (IS_ERR(req)) {
+		ret = PTR_ERR(req);
+	} else if (req) {
+		/* all requests from this page have been cancelled by
+		 * nfs_lock_and_join_requests, so just remove the head
+		 * request from the inode / page_private pointer and
+		 * release it */
+		nfs_inode_remove_request(req);
+		nfs_unlock_and_release_request(req);
+	}
+
+	return ret;
+}
+
+/*
+ * Write back all requests on one page - we do this before reading it.
+ */
+int nfs_wb_page(struct inode *inode, struct page *page)
+{
+	loff_t range_start = page_file_offset(page);
+	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0,
+		.range_start = range_start,
+		.range_end = range_end,
+	};
+	int ret;
+
+	trace_nfs_writeback_page_enter(inode);
+
+	for (;;) {
+		wait_on_page_writeback(page);
+		if (clear_page_dirty_for_io(page)) {
+			ret = nfs_writepage_locked(page, &wbc);
+			if (ret < 0)
+				goto out_error;
+			continue;
+		}
+		ret = 0;
+		if (!PagePrivate(page))
+			break;
+		ret = nfs_commit_inode(inode, FLUSH_SYNC);
+		if (ret < 0)
+			goto out_error;
+	}
+out_error:
+	trace_nfs_writeback_page_exit(inode, ret);
+	return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode)
+{
+	/*
+	 * If PagePrivate is set, then the page is currently associated with
+	 * an in-progress read or write request. Don't try to migrate it.
+	 *
+	 * FIXME: we could do this in principle, but we'll need a way to ensure
+	 *        that we can safely release the inode reference while holding
+	 *        the page lock.
+	 */
+	if (PagePrivate(page))
+		return -EBUSY;
+
+	if (!nfs_fscache_release_page(page, GFP_KERNEL))
+		return -EBUSY;
+
+	return migrate_page(mapping, newpage, page, mode);
+}
+#endif
+
+int __init nfs_init_writepagecache(void)
+{
+	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
+					     sizeof(struct nfs_pgio_header),
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL);
+	if (nfs_wdata_cachep == NULL)
+		return -ENOMEM;
+
+	nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
+						     nfs_wdata_cachep);
+	if (nfs_wdata_mempool == NULL)
+		goto out_destroy_write_cache;
+
+	nfs_cdata_cachep = kmem_cache_create("nfs_commit_data",
+					     sizeof(struct nfs_commit_data),
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL);
+	if (nfs_cdata_cachep == NULL)
+		goto out_destroy_write_mempool;
+
+	nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
+						      nfs_cdata_cachep);
+	if (nfs_commit_mempool == NULL)
+		goto out_destroy_commit_cache;
+
+	/*
+	 * NFS congestion size, scale with available memory.
+	 *
+	 *  64MB:    8192k
+	 * 128MB:   11585k
+	 * 256MB:   16384k
+	 * 512MB:   23170k
+	 *   1GB:   32768k
+	 *   2GB:   46340k
+	 *   4GB:   65536k
+	 *   8GB:   92681k
+	 *  16GB:  131072k
+	 *
+	 * This allows larger machines to have larger/more transfers.
+	 * Limit the default to 256M
+	 */
+	nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	if (nfs_congestion_kb > 256*1024)
+		nfs_congestion_kb = 256*1024;
+
+	return 0;
+
+out_destroy_commit_cache:
+	kmem_cache_destroy(nfs_cdata_cachep);
+out_destroy_write_mempool:
+	mempool_destroy(nfs_wdata_mempool);
+out_destroy_write_cache:
+	kmem_cache_destroy(nfs_wdata_cachep);
+	return -ENOMEM;
+}
+
+void nfs_destroy_writepagecache(void)
+{
+	mempool_destroy(nfs_commit_mempool);
+	kmem_cache_destroy(nfs_cdata_cachep);
+	mempool_destroy(nfs_wdata_mempool);
+	kmem_cache_destroy(nfs_wdata_cachep);
+}
+
+static const struct nfs_rw_ops nfs_rw_write_ops = {
+	.rw_mode		= FMODE_WRITE,
+	.rw_alloc_header	= nfs_writehdr_alloc,
+	.rw_free_header		= nfs_writehdr_free,
+	.rw_release		= nfs_writeback_release_common,
+	.rw_done		= nfs_writeback_done,
+	.rw_result		= nfs_writeback_result,
+	.rw_initiate		= nfs_initiate_write,
+};
diff --git a/kernel/fs/nfs_common/Makefile b/kernel/fs/nfs_common/Makefile
new file mode 100644
index 000000000..d153ca3ea
--- /dev/null
+++ b/kernel/fs/nfs_common/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for Linux filesystem routines that are shared by client and server.
+#
+
+obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
+nfs_acl-objs := nfsacl.o
+
+obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/kernel/fs/nfs_common/grace.c b/kernel/fs/nfs_common/grace.c
new file mode 100644
index 000000000..ae6e58ea4
--- /dev/null
+++ b/kernel/fs/nfs_common/grace.c
@@ -0,0 +1,113 @@
+/*
+ * Common code for control of lockd and nfsv4 grace periods.
+ *
+ * Transplanted from lockd code
+ */
+
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/fs.h>
+
+static int grace_net_id;
+static DEFINE_SPINLOCK(grace_lock);
+
+/**
+ * locks_start_grace
+ * @net: net namespace that this lock manager belongs to
+ * @lm: who this grace period is for
+ *
+ * A grace period is a period during which locks should not be given
+ * out.  Currently grace periods are only enforced by the two lock
+ * managers (lockd and nfsd), using the locks_in_grace() function to
+ * check when they are in a grace period.
+ *
+ * This function is called to start a grace period.
+ */
+void
+locks_start_grace(struct net *net, struct lock_manager *lm)
+{
+	struct list_head *grace_list = net_generic(net, grace_net_id);
+
+	spin_lock(&grace_lock);
+	list_add(&lm->list, grace_list);
+	spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_start_grace);
+
+/**
+ * locks_end_grace
+ * @net: net namespace that this lock manager belongs to
+ * @lm: who this grace period is for
+ *
+ * Call this function to state that the given lock manager is ready to
+ * resume regular locking.  The grace period will not end until all lock
+ * managers that called locks_start_grace() also call locks_end_grace().
+ * Note that callers count on it being safe to call this more than once,
+ * and the second call should be a no-op.
+ */
+void
+locks_end_grace(struct lock_manager *lm)
+{
+	spin_lock(&grace_lock);
+	list_del_init(&lm->list);
+	spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_end_grace);
+
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+int
+locks_in_grace(struct net *net)
+{
+	struct list_head *grace_list = net_generic(net, grace_net_id);
+
+	return !list_empty(grace_list);
+}
+EXPORT_SYMBOL_GPL(locks_in_grace);
+
+static int __net_init
+grace_init_net(struct net *net)
+{
+	struct list_head *grace_list = net_generic(net, grace_net_id);
+
+	INIT_LIST_HEAD(grace_list);
+	return 0;
+}
+
+static void __net_exit
+grace_exit_net(struct net *net)
+{
+	struct list_head *grace_list = net_generic(net, grace_net_id);
+
+	BUG_ON(!list_empty(grace_list));
+}
+
+static struct pernet_operations grace_net_ops = {
+	.init = grace_init_net,
+	.exit = grace_exit_net,
+	.id   = &grace_net_id,
+	.size = sizeof(struct list_head),
+};
+
+static int __init
+init_grace(void)
+{
+	return register_pernet_subsys(&grace_net_ops);
+}
+
+static void __exit
+exit_grace(void)
+{
+	unregister_pernet_subsys(&grace_net_ops);
+}
+
+MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
+MODULE_LICENSE("GPL");
+module_init(init_grace)
+module_exit(exit_grace)
diff --git a/kernel/fs/nfs_common/nfsacl.c b/kernel/fs/nfs_common/nfsacl.c
new file mode 100644
index 000000000..538f14293
--- /dev/null
+++ b/kernel/fs/nfs_common/nfsacl.c
@@ -0,0 +1,296 @@
+/*
+ * fs/nfs_common/nfsacl.c
+ *
+ *  Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+/*
+ * The Solaris nfsacl protocol represents some ACLs slightly differently
+ * than POSIX 1003.1e draft 17 does (and we do):
+ *
+ *  - Minimal ACLs always have an ACL_MASK entry, so they have
+ *    four instead of three entries.
+ *  - The ACL_MASK entry in such minimal ACLs always has the same
+ *    permissions as the ACL_GROUP_OBJ entry. (In extended ACLs
+ *    the ACL_MASK and ACL_GROUP_OBJ entries may differ.)
+ *  - The identifier fields of the ACL_USER_OBJ and ACL_GROUP_OBJ
+ *    entries contain the identifiers of the owner and owning group.
+ *    (In POSIX ACLs we always set them to ACL_UNDEFINED_ID).
+ *  - ACL entries in the kernel are kept sorted in ascending order
+ *    of (e_tag, e_id). Solaris ACLs are unsorted.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/nfsacl.h>
+#include <linux/nfs3.h>
+#include <linux/sort.h>
+
+MODULE_LICENSE("GPL");
+
+struct nfsacl_encode_desc {
+	struct xdr_array2_desc desc;
+	unsigned int count;
+	struct posix_acl *acl;
+	int typeflag;
+	kuid_t uid;
+	kgid_t gid;
+};
+
+struct nfsacl_simple_acl {
+	struct posix_acl acl;
+	struct posix_acl_entry ace[4];
+};
+
+static int
+xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
+{
+	struct nfsacl_encode_desc *nfsacl_desc =
+		(struct nfsacl_encode_desc *) desc;
+	__be32 *p = elem;
+
+	struct posix_acl_entry *entry =
+		&nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
+
+	*p++ = htonl(entry->e_tag | nfsacl_desc->typeflag);
+	switch(entry->e_tag) {
+		case ACL_USER_OBJ:
+			*p++ = htonl(from_kuid(&init_user_ns, nfsacl_desc->uid));
+			break;
+		case ACL_GROUP_OBJ:
+			*p++ = htonl(from_kgid(&init_user_ns, nfsacl_desc->gid));
+			break;
+		case ACL_USER:
+			*p++ = htonl(from_kuid(&init_user_ns, entry->e_uid));
+			break;
+		case ACL_GROUP:
+			*p++ = htonl(from_kgid(&init_user_ns, entry->e_gid));
+			break;
+		default:  /* Solaris depends on that! */
+			*p++ = 0;
+			break;
+	}
+	*p++ = htonl(entry->e_perm & S_IRWXO);
+	return 0;
+}
+
+/**
+ * nfsacl_encode - Encode an NFSv3 ACL
+ *
+ * @buf: destination xdr_buf to contain XDR encoded ACL
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @inode: inode of file whose ACL this is
+ * @acl: posix_acl to encode
+ * @encode_entries: whether to encode ACEs as well
+ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
+ *
+ * Returns size of encoded ACL in bytes or a negative errno value.
+ */
+int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+		  struct posix_acl *acl, int encode_entries, int typeflag)
+{
+	int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
+	struct nfsacl_encode_desc nfsacl_desc = {
+		.desc = {
+			.elem_size = 12,
+			.array_len = encode_entries ? entries : 0,
+			.xcode = xdr_nfsace_encode,
+		},
+		.acl = acl,
+		.typeflag = typeflag,
+		.uid = inode->i_uid,
+		.gid = inode->i_gid,
+	};
+	struct nfsacl_simple_acl aclbuf;
+	int err;
+
+	if (entries > NFS_ACL_MAX_ENTRIES ||
+	    xdr_encode_word(buf, base, entries))
+		return -EINVAL;
+	if (encode_entries && acl && acl->a_count == 3) {
+		struct posix_acl *acl2 = &aclbuf.acl;
+
+		/* Avoid the use of posix_acl_alloc().  nfsacl_encode() is
+		 * invoked in contexts where a memory allocation failure is
+		 * fatal.  Fortunately this fake ACL is small enough to
+		 * construct on the stack. */
+		posix_acl_init(acl2, 4);
+
+		/* Insert entries in canonical order: other orders seem
+		 to confuse Solaris VxFS. */
+		acl2->a_entries[0] = acl->a_entries[0];  /* ACL_USER_OBJ */
+		acl2->a_entries[1] = acl->a_entries[1];  /* ACL_GROUP_OBJ */
+		acl2->a_entries[2] = acl->a_entries[1];  /* ACL_MASK */
+		acl2->a_entries[2].e_tag = ACL_MASK;
+		acl2->a_entries[3] = acl->a_entries[2];  /* ACL_OTHER */
+		nfsacl_desc.acl = acl2;
+	}
+	err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
+	if (!err)
+		err = 8 + nfsacl_desc.desc.elem_size *
+			  nfsacl_desc.desc.array_len;
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfsacl_encode);
+
+struct nfsacl_decode_desc {
+	struct xdr_array2_desc desc;
+	unsigned int count;
+	struct posix_acl *acl;
+};
+
+static int
+xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
+{
+	struct nfsacl_decode_desc *nfsacl_desc =
+		(struct nfsacl_decode_desc *) desc;
+	__be32 *p = elem;
+	struct posix_acl_entry *entry;
+	unsigned int id;
+
+	if (!nfsacl_desc->acl) {
+		if (desc->array_len > NFS_ACL_MAX_ENTRIES)
+			return -EINVAL;
+		nfsacl_desc->acl = posix_acl_alloc(desc->array_len, GFP_KERNEL);
+		if (!nfsacl_desc->acl)
+			return -ENOMEM;
+		nfsacl_desc->count = 0;
+	}
+
+	entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
+	entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT;
+	id = ntohl(*p++);
+	entry->e_perm = ntohl(*p++);
+
+	switch(entry->e_tag) {
+		case ACL_USER:
+			entry->e_uid = make_kuid(&init_user_ns, id);
+			if (!uid_valid(entry->e_uid))
+				return -EINVAL;
+			break;
+		case ACL_GROUP:
+			entry->e_gid = make_kgid(&init_user_ns, id);
+			if (!gid_valid(entry->e_gid))
+				return -EINVAL;
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_OTHER:
+			if (entry->e_perm & ~S_IRWXO)
+				return -EINVAL;
+			break;
+		case ACL_MASK:
+			/* Solaris sometimes sets additional bits in the mask */
+			entry->e_perm &= S_IRWXO;
+			break;
+		default:
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+cmp_acl_entry(const void *x, const void *y)
+{
+	const struct posix_acl_entry *a = x, *b = y;
+
+	if (a->e_tag != b->e_tag)
+		return a->e_tag - b->e_tag;
+	else if ((a->e_tag == ACL_USER) && uid_gt(a->e_uid, b->e_uid))
+		return 1;
+	else if ((a->e_tag == ACL_USER) && uid_lt(a->e_uid, b->e_uid))
+		return -1;
+	else if ((a->e_tag == ACL_GROUP) && gid_gt(a->e_gid, b->e_gid))
+		return 1;
+	else if ((a->e_tag == ACL_GROUP) && gid_lt(a->e_gid, b->e_gid))
+		return -1;
+	else
+		return 0;
+}
+
+/*
+ * Convert from a Solaris ACL to a POSIX 1003.1e draft 17 ACL.
+ */
+static int
+posix_acl_from_nfsacl(struct posix_acl *acl)
+{
+	struct posix_acl_entry *pa, *pe,
+	       *group_obj = NULL, *mask = NULL;
+
+	if (!acl)
+		return 0;
+
+	sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry),
+	     cmp_acl_entry, NULL);
+
+	/* Find the ACL_GROUP_OBJ and ACL_MASK entries. */
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+		switch(pa->e_tag) {
+			case ACL_USER_OBJ:
+				break;
+			case ACL_GROUP_OBJ:
+				group_obj = pa;
+				break;
+			case ACL_MASK:
+				mask = pa;
+				/* fall through */
+			case ACL_OTHER:
+				break;
+		}
+	}
+	if (acl->a_count == 4 && group_obj && mask &&
+	    mask->e_perm == group_obj->e_perm) {
+		/* remove bogus ACL_MASK entry */
+		memmove(mask, mask+1, (3 - (mask - acl->a_entries)) *
+				      sizeof(struct posix_acl_entry));
+		acl->a_count = 3;
+	}
+	return 0;
+}
+
+/**
+ * nfsacl_decode - Decode an NFSv3 ACL
+ *
+ * @buf: xdr_buf containing XDR'd ACL data to decode
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @aclcnt: count of ACEs in decoded posix_acl
+ * @pacl: buffer in which to place decoded posix_acl
+ *
+ * Returns the length of the decoded ACL in bytes, or a negative errno value.
+ */
+int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+		  struct posix_acl **pacl)
+{
+	struct nfsacl_decode_desc nfsacl_desc = {
+		.desc = {
+			.elem_size = 12,
+			.xcode = pacl ? xdr_nfsace_decode : NULL,
+		},
+	};
+	u32 entries;
+	int err;
+
+	if (xdr_decode_word(buf, base, &entries) ||
+	    entries > NFS_ACL_MAX_ENTRIES)
+		return -EINVAL;
+	nfsacl_desc.desc.array_maxlen = entries;
+	err = xdr_decode_array2(buf, base + 4, &nfsacl_desc.desc);
+	if (err)
+		return err;
+	if (pacl) {
+		if (entries != nfsacl_desc.desc.array_len ||
+		    posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) {
+			posix_acl_release(nfsacl_desc.acl);
+			return -EINVAL;
+		}
+		*pacl = nfsacl_desc.acl;
+	}
+	if (aclcnt)
+		*aclcnt = entries;
+	return 8 + nfsacl_desc.desc.elem_size *
+		   nfsacl_desc.desc.array_len;
+}
+EXPORT_SYMBOL_GPL(nfsacl_decode);
diff --git a/kernel/fs/nfsd/Kconfig b/kernel/fs/nfsd/Kconfig
new file mode 100644
index 000000000..a0b77fc1b
--- /dev/null
+++ b/kernel/fs/nfsd/Kconfig
@@ -0,0 +1,117 @@
+config NFSD
+	tristate "NFS server support"
+	depends on INET
+	depends on FILE_LOCKING
+	select LOCKD
+	select SUNRPC
+	select EXPORTFS
+	select NFS_ACL_SUPPORT if NFSD_V2_ACL
+	depends on MULTIUSER
+	help
+	  Choose Y here if you want to allow other computers to access
+	  files residing on this system using Sun's Network File System
+	  protocol.  To compile the NFS server support as a module,
+	  choose M here: the module will be called nfsd.
+
+	  You may choose to use a user-space NFS server instead, in which
+	  case you can choose N here.
+
+	  To export local file systems using NFS, you also need to install
+	  user space programs which can be found in the Linux nfs-utils
+	  package, available from http://linux-nfs.org/.  More detail about
+	  the Linux NFS server implementation is available via the
+	  exports(5) man page.
+
+	  Below you can choose which versions of the NFS protocol are
+	  available to clients mounting the NFS server on this system.
+	  Support for NFS version 2 (RFC 1094) is always available when
+	  CONFIG_NFSD is selected.
+
+	  If unsure, say N.
+
+config NFSD_V2_ACL
+	bool
+	depends on NFSD
+
+config NFSD_V3
+	bool "NFS server support for NFS version 3"
+	depends on NFSD
+	help
+	  This option enables support in your system's NFS server for
+	  version 3 of the NFS protocol (RFC 1813).
+
+	  If unsure, say Y.
+
+config NFSD_V3_ACL
+	bool "NFS server support for the NFSv3 ACL protocol extension"
+	depends on NFSD_V3
+	select NFSD_V2_ACL
+	help
+	  Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
+	  never became an official part of the NFS version 3 protocol.
+	  This protocol extension allows applications on NFS clients to
+	  manipulate POSIX Access Control Lists on files residing on NFS
+	  servers.  NFS servers enforce POSIX ACLs on local files whether
+	  this protocol is available or not.
+
+	  This option enables support in your system's NFS server for the
+	  NFSv3 ACL protocol extension allowing NFS clients to manipulate
+	  POSIX ACLs on files exported by your system's NFS server.  NFS
+	  clients which support the Solaris NFSv3 ACL protocol can then
+	  access and modify ACLs on your NFS server.
+
+	  To store ACLs on your NFS server, you also need to enable ACL-
+	  related CONFIG options for your local file systems of choice.
+
+	  If unsure, say N.
+
+config NFSD_V4
+	bool "NFS server support for NFS version 4"
+	depends on NFSD && PROC_FS
+	select NFSD_V3
+	select FS_POSIX_ACL
+	select SUNRPC_GSS
+	select CRYPTO
+	select GRACE_PERIOD
+	help
+	  This option enables support in your system's NFS server for
+	  version 4 of the NFS protocol (RFC 3530).
+
+	  To export files using NFSv4, you need to install additional user
+	  space programs which can be found in the Linux nfs-utils package,
+	  available from http://linux-nfs.org/.
+
+	  If unsure, say N.
+
+config NFSD_PNFS
+	bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+	depends on NFSD_V4
+	help
+	  This option enables support for the parallel NFS features of the
+	  minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
+	  server.
+
+	  If unsure, say N.
+
+config NFSD_V4_SECURITY_LABEL
+	bool "Provide Security Label support for NFSv4 server"
+	depends on NFSD_V4 && SECURITY
+	help
+
+	Say Y here if you want enable fine-grained security label attribute
+	support for NFS version 4.  Security labels allow security modules like
+	SELinux and Smack to label files to facilitate enforcement of their policies.
+	Without this an NFSv4 mount will have the same label on each file.
+
+	If you do not wish to enable fine-grained security labels SELinux or
+	Smack policies on NFSv4 files, say N.
+
+config NFSD_FAULT_INJECTION
+	bool "NFS server manual fault injection"
+	depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS
+	help
+	  This option enables support for manually injecting faults
+	  into the NFS server.  This is intended to be used for
+	  testing error recovery on the NFS client.
+
+	  If unsure, say N.
diff --git a/kernel/fs/nfsd/Makefile b/kernel/fs/nfsd/Makefile
new file mode 100644
index 000000000..9a6028e12
--- /dev/null
+++ b/kernel/fs/nfsd/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for the Linux nfs server
+#
+
+ccflags-y += -I$(src)			# needed for trace events
+
+obj-$(CONFIG_NFSD)	+= nfsd.o
+
+# this one should be compiled first, as the tracing macros can easily blow up
+nfsd-y			+= trace.o
+
+nfsd-y 			+= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+			   export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
+nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
+nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
+nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
+nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
+			   nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/kernel/fs/nfsd/acl.h b/kernel/fs/nfsd/acl.h
new file mode 100644
index 000000000..4cd7c69a6
--- /dev/null
+++ b/kernel/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
+/*
+ *  Common NFSv4 ACL handling definitions.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFS4_ACL_H
+#define LINUX_NFS4_ACL_H
+
+struct nfs4_acl;
+struct svc_fh;
+struct svc_rqst;
+
+/*
+ * Maximum ACL we'll accept from a client; chosen (somewhat
+ * arbitrarily) so that kmalloc'ing the ACL shouldn't require a
+ * high-order allocation.  This allows 204 ACEs on x86_64:
+ */
+#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
+			/ sizeof(struct nfs4_ace))
+
+int nfs4_acl_bytes(int entries);
+int nfs4_acl_get_whotype(char *, u32);
+__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
+
+int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+		struct nfs4_acl **acl);
+__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct nfs4_acl *acl);
+
+#endif /* LINUX_NFS4_ACL_H */
diff --git a/kernel/fs/nfsd/auth.c b/kernel/fs/nfsd/auth.c
new file mode 100644
index 000000000..9d46a0bdd
--- /dev/null
+++ b/kernel/fs/nfsd/auth.c
@@ -0,0 +1,90 @@
+/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
+
+#include <linux/sched.h>
+#include "nfsd.h"
+#include "auth.h"
+
+int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
+			return f->flags;
+	}
+	return exp->ex_flags;
+
+}
+
+int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	struct group_info *rqgi;
+	struct group_info *gi;
+	struct cred *new;
+	int i;
+	int flags = nfsexp_flags(rqstp, exp);
+
+	validate_process_creds();
+
+	/* discard any old override before preparing the new set */
+	revert_creds(get_cred(current_real_cred()));
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	new->fsuid = rqstp->rq_cred.cr_uid;
+	new->fsgid = rqstp->rq_cred.cr_gid;
+
+	rqgi = rqstp->rq_cred.cr_group_info;
+
+	if (flags & NFSEXP_ALLSQUASH) {
+		new->fsuid = exp->ex_anon_uid;
+		new->fsgid = exp->ex_anon_gid;
+		gi = groups_alloc(0);
+		if (!gi)
+			goto oom;
+	} else if (flags & NFSEXP_ROOTSQUASH) {
+		if (uid_eq(new->fsuid, GLOBAL_ROOT_UID))
+			new->fsuid = exp->ex_anon_uid;
+		if (gid_eq(new->fsgid, GLOBAL_ROOT_GID))
+			new->fsgid = exp->ex_anon_gid;
+
+		gi = groups_alloc(rqgi->ngroups);
+		if (!gi)
+			goto oom;
+
+		for (i = 0; i < rqgi->ngroups; i++) {
+			if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i)))
+				GROUP_AT(gi, i) = exp->ex_anon_gid;
+			else
+				GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
+		}
+	} else {
+		gi = get_group_info(rqgi);
+	}
+
+	if (uid_eq(new->fsuid, INVALID_UID))
+		new->fsuid = exp->ex_anon_uid;
+	if (gid_eq(new->fsgid, INVALID_GID))
+		new->fsgid = exp->ex_anon_gid;
+
+	set_groups(new, gi);
+	put_group_info(gi);
+
+	if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
+		new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
+	else
+		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
+							new->cap_permitted);
+	validate_process_creds();
+	put_cred(override_creds(new));
+	put_cred(new);
+	validate_process_creds();
+	return 0;
+
+oom:
+	abort_creds(new);
+	return -ENOMEM;
+}
+
diff --git a/kernel/fs/nfsd/auth.h b/kernel/fs/nfsd/auth.h
new file mode 100644
index 000000000..53325a12b
--- /dev/null
+++ b/kernel/fs/nfsd/auth.h
@@ -0,0 +1,16 @@
+/*
+ * nfsd-specific authentication stuff.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_AUTH_H
+#define LINUX_NFSD_AUTH_H
+
+/*
+ * Set the current process's fsuid/fsgid etc to those of the NFS
+ * client user
+ */
+int nfsd_setuser(struct svc_rqst *, struct svc_export *);
+
+#endif /* LINUX_NFSD_AUTH_H */
diff --git a/kernel/fs/nfsd/blocklayout.c b/kernel/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000..cdefaa331
--- /dev/null
+++ b/kernel/fs/nfsd/blocklayout.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/exportfs.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+
+#include <linux/nfsd/debug.h>
+
+#include "blocklayoutxdr.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_PNFS
+
+
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *dev;
+	struct pnfs_block_volume *b;
+
+	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	gdp->gd_device = dev;
+
+	dev->nr_volumes = 1;
+	b = &dev->volumes[0];
+
+	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
+	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
+			&b->simple.offset);
+}
+
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	if (sb->s_bdev != sb->s_bdev->bd_contains)
+		return nfserr_inval;
+	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
+}
+
+static __be32
+nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+		struct nfsd4_layoutget *args)
+{
+	struct nfsd4_layout_seg *seg = &args->lg_seg;
+	struct super_block *sb = inode->i_sb;
+	u32 block_size = (1 << inode->i_blkbits);
+	struct pnfs_block_extent *bex;
+	struct iomap iomap;
+	u32 device_generation = 0;
+	int error;
+
+	/*
+	 * We do not attempt to support I/O smaller than the fs block size,
+	 * or not aligned to it.
+	 */
+	if (args->lg_minlength < block_size) {
+		dprintk("pnfsd: I/O too small\n");
+		goto out_layoutunavailable;
+	}
+	if (seg->offset & (block_size - 1)) {
+		dprintk("pnfsd: I/O misaligned\n");
+		goto out_layoutunavailable;
+	}
+
+	/*
+	 * Some clients barf on non-zero block numbers for NONE or INVALID
+	 * layouts, so make sure to zero the whole structure.
+	 */
+	error = -ENOMEM;
+	bex = kzalloc(sizeof(*bex), GFP_KERNEL);
+	if (!bex)
+		goto out_error;
+	args->lg_content = bex;
+
+	error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
+					    &iomap, seg->iomode != IOMODE_READ,
+					    &device_generation);
+	if (error) {
+		if (error == -ENXIO)
+			goto out_layoutunavailable;
+		goto out_error;
+	}
+
+	if (iomap.length < args->lg_minlength) {
+		dprintk("pnfsd: extent smaller than minlength\n");
+		goto out_layoutunavailable;
+	}
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		if (seg->iomode == IOMODE_READ)
+			bex->es = PNFS_BLOCK_READ_DATA;
+		else
+			bex->es = PNFS_BLOCK_READWRITE_DATA;
+		bex->soff = (iomap.blkno << 9);
+		break;
+	case IOMAP_UNWRITTEN:
+		if (seg->iomode & IOMODE_RW) {
+			/*
+			 * Crack monkey special case from section 2.3.1.
+			 */
+			if (args->lg_minlength == 0) {
+				dprintk("pnfsd: no soup for you!\n");
+				goto out_layoutunavailable;
+			}
+
+			bex->es = PNFS_BLOCK_INVALID_DATA;
+			bex->soff = (iomap.blkno << 9);
+			break;
+		}
+		/*FALLTHRU*/
+	case IOMAP_HOLE:
+		if (seg->iomode == IOMODE_READ) {
+			bex->es = PNFS_BLOCK_NONE_DATA;
+			break;
+		}
+		/*FALLTHRU*/
+	case IOMAP_DELALLOC:
+	default:
+		WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
+		goto out_layoutunavailable;
+	}
+
+	error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
+	if (error)
+		goto out_error;
+	bex->foff = iomap.offset;
+	bex->len = iomap.length;
+
+	seg->offset = iomap.offset;
+	seg->length = iomap.length;
+
+	dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
+	return 0;
+
+out_error:
+	seg->length = 0;
+	return nfserrno(error);
+out_layoutunavailable:
+	seg->length = 0;
+	return nfserr_layoutunavailable;
+}
+
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	loff_t new_size = lcp->lc_last_wr + 1;
+	struct iattr iattr = { .ia_valid = 0 };
+	struct iomap *iomaps;
+	int nr_iomaps;
+	int error;
+
+	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+	if (nr_iomaps < 0)
+		return nfserrno(nr_iomaps);
+
+	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
+	    timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+		lcp->lc_mtime = current_fs_time(inode->i_sb);
+	iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
+	iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
+
+	if (new_size > i_size_read(inode)) {
+		iattr.ia_valid |= ATTR_SIZE;
+		iattr.ia_size = new_size;
+	}
+
+	error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
+			nr_iomaps, &iattr);
+	kfree(iomaps);
+	return nfserrno(error);
+}
+
+const struct nfsd4_layout_ops bl_layout_ops = {
+	/*
+	 * Pretend that we send notification to the client.  This is a blatant
+	 * lie to force recent Linux clients to cache our device IDs.
+	 * We rarely ever change the device ID, so the harm of leaking deviceids
+	 * for a while isn't too bad.  Unfortunately RFC5661 is a complete mess
+	 * in this regard, but I filed errata 4119 for this a while ago, and
+	 * hopefully the Linux client will eventually start caching deviceids
+	 * without this again.
+	 */
+	.notify_types		=
+			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+	.proc_getdeviceinfo	= nfsd4_block_proc_getdeviceinfo,
+	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
+	.proc_layoutget		= nfsd4_block_proc_layoutget,
+	.encode_layoutget	= nfsd4_block_encode_layoutget,
+	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
+};
diff --git a/kernel/fs/nfsd/blocklayoutxdr.c b/kernel/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000..9aa2796da
--- /dev/null
+++ b/kernel/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "blocklayoutxdr.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_PNFS
+
+
+__be32
+nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+		struct nfsd4_layoutget *lgp)
+{
+	struct pnfs_block_extent *b = lgp->lg_content;
+	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, sizeof(__be32) + len);
+	if (!p)
+		return nfserr_toosmall;
+
+	*p++ = cpu_to_be32(len);
+	*p++ = cpu_to_be32(1);		/* we always return a single extent */
+
+	p = xdr_encode_opaque_fixed(p, &b->vol_id,
+			sizeof(struct nfsd4_deviceid));
+	p = xdr_encode_hyper(p, b->foff);
+	p = xdr_encode_hyper(p, b->len);
+	p = xdr_encode_hyper(p, b->soff);
+	*p++ = cpu_to_be32(b->es);
+	return 0;
+}
+
+static int
+nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int len;
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		len = 4 + 4 + 8 + 4 + b->simple.sig_len;
+		p = xdr_reserve_space(xdr, len);
+		if (!p)
+			return -ETOOSMALL;
+
+		*p++ = cpu_to_be32(b->type);
+		*p++ = cpu_to_be32(1);	/* single signature */
+		p = xdr_encode_hyper(p, b->simple.offset);
+		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
+	return len;
+}
+
+__be32
+nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *dev = gdp->gd_device;
+	int len = sizeof(__be32), ret, i;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, len + sizeof(__be32));
+	if (!p)
+		return nfserr_resource;
+
+	for (i = 0; i < dev->nr_volumes; i++) {
+		ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
+		if (ret < 0)
+			return nfserrno(ret);
+		len += ret;
+	}
+
+	/*
+	 * Fill in the overall length and number of volumes at the beginning
+	 * of the layout.
+	 */
+	*p++ = cpu_to_be32(len);
+	*p++ = cpu_to_be32(dev->nr_volumes);
+	return 0;
+}
+
+int
+nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size)
+{
+	struct iomap *iomaps;
+	u32 nr_iomaps, expected, i;
+
+	if (len < sizeof(u32)) {
+		dprintk("%s: extent array too small: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	nr_iomaps = be32_to_cpup(p++);
+	expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+	if (len != expected) {
+		dprintk("%s: extent array size mismatch: %u/%u\n",
+			__func__, len, expected);
+		return -EINVAL;
+	}
+
+	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+	if (!iomaps) {
+		dprintk("%s: failed to allocate extent array\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_iomaps; i++) {
+		struct pnfs_block_extent bex;
+
+		memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
+		p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+
+		p = xdr_decode_hyper(p, &bex.foff);
+		if (bex.foff & (block_size - 1)) {
+			dprintk("%s: unaligned offset 0x%llx\n",
+				__func__, bex.foff);
+			goto fail;
+		}
+		p = xdr_decode_hyper(p, &bex.len);
+		if (bex.len & (block_size - 1)) {
+			dprintk("%s: unaligned length 0x%llx\n",
+				__func__, bex.foff);
+			goto fail;
+		}
+		p = xdr_decode_hyper(p, &bex.soff);
+		if (bex.soff & (block_size - 1)) {
+			dprintk("%s: unaligned disk offset 0x%llx\n",
+				__func__, bex.soff);
+			goto fail;
+		}
+		bex.es = be32_to_cpup(p++);
+		if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
+			dprintk("%s: incorrect extent state %d\n",
+				__func__, bex.es);
+			goto fail;
+		}
+
+		iomaps[i].offset = bex.foff;
+		iomaps[i].length = bex.len;
+	}
+
+	*iomapp = iomaps;
+	return nr_iomaps;
+fail:
+	kfree(iomaps);
+	return -EINVAL;
+}
diff --git a/kernel/fs/nfsd/blocklayoutxdr.h b/kernel/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000..fdc79037c
--- /dev/null
+++ b/kernel/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
+#ifndef _NFSD_BLOCKLAYOUTXDR_H
+#define _NFSD_BLOCKLAYOUTXDR_H 1
+
+#include <linux/blkdev.h>
+#include "xdr4.h"
+
+struct iomap;
+struct xdr_stream;
+
+enum pnfs_block_extent_state {
+	PNFS_BLOCK_READWRITE_DATA	= 0,
+	PNFS_BLOCK_READ_DATA		= 1,
+	PNFS_BLOCK_INVALID_DATA		= 2,
+	PNFS_BLOCK_NONE_DATA		= 3,
+};
+
+struct pnfs_block_extent {
+	struct nfsd4_deviceid		vol_id;
+	u64				foff;
+	u64				len;
+	u64				soff;
+	enum pnfs_block_extent_state	es;
+};
+#define NFS4_BLOCK_EXTENT_SIZE		44
+
+enum pnfs_block_volume_type {
+	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
+	PNFS_BLOCK_VOLUME_SLICE		= 1,
+	PNFS_BLOCK_VOLUME_CONCAT	= 2,
+	PNFS_BLOCK_VOLUME_STRIPE	= 3,
+};
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN	128
+
+struct pnfs_block_volume {
+	enum pnfs_block_volume_type	type;
+	union {
+		struct {
+			u64		offset;
+			u32		sig_len;
+			u8		sig[PNFS_BLOCK_UUID_LEN];
+		} simple;
+	};
+};
+
+struct pnfs_block_deviceaddr {
+	u32				nr_volumes;
+	struct pnfs_block_volume	volumes[];
+};
+
+__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+		struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+		struct nfsd4_layoutget *lgp);
+int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size);
+
+#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/kernel/fs/nfsd/cache.h b/kernel/fs/nfsd/cache.h
new file mode 100644
index 000000000..dd96a3830
--- /dev/null
+++ b/kernel/fs/nfsd/cache.h
@@ -0,0 +1,86 @@
+/*
+ * Request reply cache. This was heavily inspired by the
+ * implementation in 4.3BSD/4.4BSD.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef NFSCACHE_H
+#define NFSCACHE_H
+
+#include <linux/sunrpc/svc.h>
+
+/*
+ * Representation of a reply cache entry.
+ *
+ * Note that we use a sockaddr_in6 to hold the address instead of the more
+ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage
+ * is much larger than a sockaddr_in6.
+ */
+struct svc_cacherep {
+	struct list_head	c_lru;
+
+	unsigned char		c_state,	/* unused, inprog, done */
+				c_type,		/* status, buffer */
+				c_secure : 1;	/* req came from port < 1024 */
+	struct sockaddr_in6	c_addr;
+	__be32			c_xid;
+	u32			c_prot;
+	u32			c_proc;
+	u32			c_vers;
+	unsigned int		c_len;
+	__wsum			c_csum;
+	unsigned long		c_timestamp;
+	union {
+		struct kvec	u_vec;
+		__be32		u_status;
+	}			c_u;
+};
+
+#define c_replvec		c_u.u_vec
+#define c_replstat		c_u.u_status
+
+/* cache entry states */
+enum {
+	RC_UNUSED,
+	RC_INPROG,
+	RC_DONE
+};
+
+/* return values */
+enum {
+	RC_DROPIT,
+	RC_REPLY,
+	RC_DOIT
+};
+
+/*
+ * Cache types.
+ * We may want to add more types one day, e.g. for diropres and
+ * attrstat replies. Using cache entries with fixed length instead
+ * of buffer pointers may be more efficient.
+ */
+enum {
+	RC_NOCACHE,
+	RC_REPLSTAT,
+	RC_REPLBUFF,
+};
+
+/*
+ * If requests are retransmitted within this interval, they're dropped.
+ */
+#define RC_DELAY		(HZ/5)
+
+/* Cache entries expire after this time period */
+#define RC_EXPIRE		(120 * HZ)
+
+/* Checksum this amount of the request */
+#define RC_CSUMLEN		(256U)
+
+int	nfsd_reply_cache_init(void);
+void	nfsd_reply_cache_shutdown(void);
+int	nfsd_cache_lookup(struct svc_rqst *);
+void	nfsd_cache_update(struct svc_rqst *, int, __be32 *);
+int	nfsd_reply_cache_stats_open(struct inode *, struct file *);
+
+#endif /* NFSCACHE_H */
diff --git a/kernel/fs/nfsd/current_stateid.h b/kernel/fs/nfsd/current_stateid.h
new file mode 100644
index 000000000..412355120
--- /dev/null
+++ b/kernel/fs/nfsd/current_stateid.h
@@ -0,0 +1,28 @@
+#ifndef _NFSD4_CURRENT_STATE_H
+#define _NFSD4_CURRENT_STATE_H
+
+#include "state.h"
+#include "xdr4.h"
+
+extern void clear_current_stateid(struct nfsd4_compound_state *cstate);
+/*
+ * functions to set current state id
+ */
+extern void nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *);
+extern void nfsd4_set_openstateid(struct nfsd4_compound_state *, struct nfsd4_open *);
+extern void nfsd4_set_lockstateid(struct nfsd4_compound_state *, struct nfsd4_lock *);
+extern void nfsd4_set_closestateid(struct nfsd4_compound_state *, struct nfsd4_close *);
+
+/*
+ * functions to consume current state id
+ */
+extern void nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *);
+extern void nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *, struct nfsd4_delegreturn *);
+extern void nfsd4_get_freestateid(struct nfsd4_compound_state *, struct nfsd4_free_stateid *);
+extern void nfsd4_get_setattrstateid(struct nfsd4_compound_state *, struct nfsd4_setattr *);
+extern void nfsd4_get_closestateid(struct nfsd4_compound_state *, struct nfsd4_close *);
+extern void nfsd4_get_lockustateid(struct nfsd4_compound_state *, struct nfsd4_locku *);
+extern void nfsd4_get_readstateid(struct nfsd4_compound_state *, struct nfsd4_read *);
+extern void nfsd4_get_writestateid(struct nfsd4_compound_state *, struct nfsd4_write *);
+
+#endif   /* _NFSD4_CURRENT_STATE_H */
diff --git a/kernel/fs/nfsd/export.c b/kernel/fs/nfsd/export.c
new file mode 100644
index 000000000..f79521a59
--- /dev/null
+++ b/kernel/fs/nfsd/export.c
@@ -0,0 +1,1345 @@
+/*
+ * NFS exporting and validation.
+ *
+ * We maintain a list of clients, each of which has a list of
+ * exports. To export an fs to a given client, you first have
+ * to create the client entry with NFSCTL_ADDCLIENT, which
+ * creates a client control block and adds it to the hash
+ * table. Then, you call NFSCTL_EXPORT for each fs.
+ *
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
+ */
+
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/module.h>
+#include <linux/exportfs.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#include "nfsd.h"
+#include "nfsfh.h"
+#include "netns.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_EXPORT
+
+/*
+ * We have two caches.
+ * One maps client+vfsmnt+dentry to export options - the export map
+ * The other maps client+filehandle-fragment to export options. - the expkey map
+ *
+ * The export options are actually stored in the first map, and the
+ * second map contains a reference to the entry in the first map.
+ */
+
+#define	EXPKEY_HASHBITS		8
+#define	EXPKEY_HASHMAX		(1 << EXPKEY_HASHBITS)
+#define	EXPKEY_HASHMASK		(EXPKEY_HASHMAX -1)
+
+static void expkey_put(struct kref *ref)
+{
+	struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref);
+
+	if (test_bit(CACHE_VALID, &key->h.flags) &&
+	    !test_bit(CACHE_NEGATIVE, &key->h.flags))
+		path_put(&key->ek_path);
+	auth_domain_put(key->ek_client);
+	kfree(key);
+}
+
+static void expkey_request(struct cache_detail *cd,
+			   struct cache_head *h,
+			   char **bpp, int *blen)
+{
+	/* client fsidtype \xfsid */
+	struct svc_expkey *ek = container_of(h, struct svc_expkey, h);
+	char type[5];
+
+	qword_add(bpp, blen, ek->ek_client->name);
+	snprintf(type, 5, "%d", ek->ek_fsidtype);
+	qword_add(bpp, blen, type);
+	qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype));
+	(*bpp)[-1] = '\n';
+}
+
+static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
+					    struct svc_expkey *old);
+static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *);
+
+static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+	/* client fsidtype fsid expiry [path] */
+	char *buf;
+	int len;
+	struct auth_domain *dom = NULL;
+	int err;
+	int fsidtype;
+	char *ep;
+	struct svc_expkey key;
+	struct svc_expkey *ek = NULL;
+
+	if (mesg[mlen - 1] != '\n')
+		return -EINVAL;
+	mesg[mlen-1] = 0;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!buf)
+		goto out;
+
+	err = -EINVAL;
+	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+		goto out;
+
+	err = -ENOENT;
+	dom = auth_domain_find(buf);
+	if (!dom)
+		goto out;
+	dprintk("found domain %s\n", buf);
+
+	err = -EINVAL;
+	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+		goto out;
+	fsidtype = simple_strtoul(buf, &ep, 10);
+	if (*ep)
+		goto out;
+	dprintk("found fsidtype %d\n", fsidtype);
+	if (key_len(fsidtype)==0) /* invalid type */
+		goto out;
+	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+		goto out;
+	dprintk("found fsid length %d\n", len);
+	if (len != key_len(fsidtype))
+		goto out;
+
+	/* OK, we seem to have a valid key */
+	key.h.flags = 0;
+	key.h.expiry_time = get_expiry(&mesg);
+	if (key.h.expiry_time == 0)
+		goto out;
+
+	key.ek_client = dom;	
+	key.ek_fsidtype = fsidtype;
+	memcpy(key.ek_fsid, buf, len);
+
+	ek = svc_expkey_lookup(cd, &key);
+	err = -ENOMEM;
+	if (!ek)
+		goto out;
+
+	/* now we want a pathname, or empty meaning NEGATIVE  */
+	err = -EINVAL;
+	len = qword_get(&mesg, buf, PAGE_SIZE);
+	if (len < 0)
+		goto out;
+	dprintk("Path seems to be <%s>\n", buf);
+	err = 0;
+	if (len == 0) {
+		set_bit(CACHE_NEGATIVE, &key.h.flags);
+		ek = svc_expkey_update(cd, &key, ek);
+		if (!ek)
+			err = -ENOMEM;
+	} else {
+		err = kern_path(buf, 0, &key.ek_path);
+		if (err)
+			goto out;
+
+		dprintk("Found the path %s\n", buf);
+
+		ek = svc_expkey_update(cd, &key, ek);
+		if (!ek)
+			err = -ENOMEM;
+		path_put(&key.ek_path);
+	}
+	cache_flush();
+ out:
+	if (ek)
+		cache_put(&ek->h, cd);
+	if (dom)
+		auth_domain_put(dom);
+	kfree(buf);
+	return err;
+}
+
+static int expkey_show(struct seq_file *m,
+		       struct cache_detail *cd,
+		       struct cache_head *h)
+{
+	struct svc_expkey *ek ;
+	int i;
+
+	if (h ==NULL) {
+		seq_puts(m, "#domain fsidtype fsid [path]\n");
+		return 0;
+	}
+	ek = container_of(h, struct svc_expkey, h);
+	seq_printf(m, "%s %d 0x", ek->ek_client->name,
+		   ek->ek_fsidtype);
+	for (i=0; i < key_len(ek->ek_fsidtype)/4; i++)
+		seq_printf(m, "%08x", ek->ek_fsid[i]);
+	if (test_bit(CACHE_VALID, &h->flags) && 
+	    !test_bit(CACHE_NEGATIVE, &h->flags)) {
+		seq_printf(m, " ");
+		seq_path(m, &ek->ek_path, "\\ \t\n");
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static inline int expkey_match (struct cache_head *a, struct cache_head *b)
+{
+	struct svc_expkey *orig = container_of(a, struct svc_expkey, h);
+	struct svc_expkey *new = container_of(b, struct svc_expkey, h);
+
+	if (orig->ek_fsidtype != new->ek_fsidtype ||
+	    orig->ek_client != new->ek_client ||
+	    memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0)
+		return 0;
+	return 1;
+}
+
+static inline void expkey_init(struct cache_head *cnew,
+				   struct cache_head *citem)
+{
+	struct svc_expkey *new = container_of(cnew, struct svc_expkey, h);
+	struct svc_expkey *item = container_of(citem, struct svc_expkey, h);
+
+	kref_get(&item->ek_client->ref);
+	new->ek_client = item->ek_client;
+	new->ek_fsidtype = item->ek_fsidtype;
+
+	memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid));
+}
+
+static inline void expkey_update(struct cache_head *cnew,
+				   struct cache_head *citem)
+{
+	struct svc_expkey *new = container_of(cnew, struct svc_expkey, h);
+	struct svc_expkey *item = container_of(citem, struct svc_expkey, h);
+
+	new->ek_path = item->ek_path;
+	path_get(&item->ek_path);
+}
+
+static struct cache_head *expkey_alloc(void)
+{
+	struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL);
+	if (i)
+		return &i->h;
+	else
+		return NULL;
+}
+
+static struct cache_detail svc_expkey_cache_template = {
+	.owner		= THIS_MODULE,
+	.hash_size	= EXPKEY_HASHMAX,
+	.name		= "nfsd.fh",
+	.cache_put	= expkey_put,
+	.cache_request	= expkey_request,
+	.cache_parse	= expkey_parse,
+	.cache_show	= expkey_show,
+	.match		= expkey_match,
+	.init		= expkey_init,
+	.update       	= expkey_update,
+	.alloc		= expkey_alloc,
+};
+
+static int
+svc_expkey_hash(struct svc_expkey *item)
+{
+	int hash = item->ek_fsidtype;
+	char * cp = (char*)item->ek_fsid;
+	int len = key_len(item->ek_fsidtype);
+
+	hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
+	hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
+	hash &= EXPKEY_HASHMASK;
+	return hash;
+}
+
+static struct svc_expkey *
+svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item)
+{
+	struct cache_head *ch;
+	int hash = svc_expkey_hash(item);
+
+	ch = sunrpc_cache_lookup(cd, &item->h, hash);
+	if (ch)
+		return container_of(ch, struct svc_expkey, h);
+	else
+		return NULL;
+}
+
+static struct svc_expkey *
+svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
+		  struct svc_expkey *old)
+{
+	struct cache_head *ch;
+	int hash = svc_expkey_hash(new);
+
+	ch = sunrpc_cache_update(cd, &new->h, &old->h, hash);
+	if (ch)
+		return container_of(ch, struct svc_expkey, h);
+	else
+		return NULL;
+}
+
+
+#define	EXPORT_HASHBITS		8
+#define	EXPORT_HASHMAX		(1<< EXPORT_HASHBITS)
+
+static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
+{
+	struct nfsd4_fs_location *locations = fsloc->locations;
+	int i;
+
+	if (!locations)
+		return;
+
+	for (i = 0; i < fsloc->locations_count; i++) {
+		kfree(locations[i].path);
+		kfree(locations[i].hosts);
+	}
+
+	kfree(locations);
+	fsloc->locations = NULL;
+}
+
+static void svc_export_put(struct kref *ref)
+{
+	struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
+	path_put(&exp->ex_path);
+	auth_domain_put(exp->ex_client);
+	nfsd4_fslocs_free(&exp->ex_fslocs);
+	kfree(exp->ex_uuid);
+	kfree(exp);
+}
+
+static void svc_export_request(struct cache_detail *cd,
+			       struct cache_head *h,
+			       char **bpp, int *blen)
+{
+	/*  client path */
+	struct svc_export *exp = container_of(h, struct svc_export, h);
+	char *pth;
+
+	qword_add(bpp, blen, exp->ex_client->name);
+	pth = d_path(&exp->ex_path, *bpp, *blen);
+	if (IS_ERR(pth)) {
+		/* is this correct? */
+		(*bpp)[0] = '\n';
+		return;
+	}
+	qword_add(bpp, blen, pth);
+	(*bpp)[-1] = '\n';
+}
+
+static struct svc_export *svc_export_update(struct svc_export *new,
+					    struct svc_export *old);
+static struct svc_export *svc_export_lookup(struct svc_export *);
+
+static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
+{
+
+	/*
+	 * We currently export only dirs, regular files, and (for v4
+	 * pseudoroot) symlinks.
+	 */
+	if (!S_ISDIR(inode->i_mode) &&
+	    !S_ISLNK(inode->i_mode) &&
+	    !S_ISREG(inode->i_mode))
+		return -ENOTDIR;
+
+	/*
+	 * Mountd should never pass down a writeable V4ROOT export, but,
+	 * just to make sure:
+	 */
+	if (*flags & NFSEXP_V4ROOT)
+		*flags |= NFSEXP_READONLY;
+
+	/* There are two requirements on a filesystem to be exportable.
+	 * 1:  We must be able to identify the filesystem from a number.
+	 *       either a device number (so FS_REQUIRES_DEV needed)
+	 *       or an FSID number (so NFSEXP_FSID or ->uuid is needed).
+	 * 2:  We must be able to find an inode from a filehandle.
+	 *       This means that s_export_op must be set.
+	 */
+	if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
+	    !(*flags & NFSEXP_FSID) &&
+	    uuid == NULL) {
+		dprintk("exp_export: export of non-dev fs without fsid\n");
+		return -EINVAL;
+	}
+
+	if (!inode->i_sb->s_export_op ||
+	    !inode->i_sb->s_export_op->fh_to_dentry) {
+		dprintk("exp_export: export of invalid fs type.\n");
+		return -EINVAL;
+	}
+
+	return 0;
+
+}
+
+#ifdef CONFIG_NFSD_V4
+
+static int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc)
+{
+	int len;
+	int migrated, i, err;
+
+	/* more than one fsloc */
+	if (fsloc->locations)
+		return -EINVAL;
+
+	/* listsize */
+	err = get_uint(mesg, &fsloc->locations_count);
+	if (err)
+		return err;
+	if (fsloc->locations_count > MAX_FS_LOCATIONS)
+		return -EINVAL;
+	if (fsloc->locations_count == 0)
+		return 0;
+
+	fsloc->locations = kzalloc(fsloc->locations_count
+			* sizeof(struct nfsd4_fs_location), GFP_KERNEL);
+	if (!fsloc->locations)
+		return -ENOMEM;
+	for (i=0; i < fsloc->locations_count; i++) {
+		/* colon separated host list */
+		err = -EINVAL;
+		len = qword_get(mesg, buf, PAGE_SIZE);
+		if (len <= 0)
+			goto out_free_all;
+		err = -ENOMEM;
+		fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL);
+		if (!fsloc->locations[i].hosts)
+			goto out_free_all;
+		err = -EINVAL;
+		/* slash separated path component list */
+		len = qword_get(mesg, buf, PAGE_SIZE);
+		if (len <= 0)
+			goto out_free_all;
+		err = -ENOMEM;
+		fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL);
+		if (!fsloc->locations[i].path)
+			goto out_free_all;
+	}
+	/* migrated */
+	err = get_int(mesg, &migrated);
+	if (err)
+		goto out_free_all;
+	err = -EINVAL;
+	if (migrated < 0 || migrated > 1)
+		goto out_free_all;
+	fsloc->migrated = migrated;
+	return 0;
+out_free_all:
+	nfsd4_fslocs_free(fsloc);
+	return err;
+}
+
+static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
+{
+	struct exp_flavor_info *f;
+	u32 listsize;
+	int err;
+
+	/* more than one secinfo */
+	if (exp->ex_nflavors)
+		return -EINVAL;
+
+	err = get_uint(mesg, &listsize);
+	if (err)
+		return err;
+	if (listsize > MAX_SECINFO_LIST)
+		return -EINVAL;
+
+	for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) {
+		err = get_uint(mesg, &f->pseudoflavor);
+		if (err)
+			return err;
+		/*
+		 * XXX: It would be nice to also check whether this
+		 * pseudoflavor is supported, so we can discover the
+		 * problem at export time instead of when a client fails
+		 * to authenticate.
+		 */
+		err = get_uint(mesg, &f->flags);
+		if (err)
+			return err;
+		/* Only some flags are allowed to differ between flavors: */
+		if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags))
+			return -EINVAL;
+	}
+	exp->ex_nflavors = listsize;
+	return 0;
+}
+
+#else /* CONFIG_NFSD_V4 */
+static inline int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;}
+static inline int
+secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
+#endif
+
+static inline int
+uuid_parse(char **mesg, char *buf, unsigned char **puuid)
+{
+	int len;
+
+	/* more than one uuid */
+	if (*puuid)
+		return -EINVAL;
+
+	/* expect a 16 byte uuid encoded as \xXXXX... */
+	len = qword_get(mesg, buf, PAGE_SIZE);
+	if (len != EX_UUID_LEN)
+		return -EINVAL;
+
+	*puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL);
+	if (*puuid == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+	/* client path expiry [flags anonuid anongid fsid] */
+	char *buf;
+	int len;
+	int err;
+	struct auth_domain *dom = NULL;
+	struct svc_export exp = {}, *expp;
+	int an_int;
+
+	if (mesg[mlen-1] != '\n')
+		return -EINVAL;
+	mesg[mlen-1] = 0;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* client */
+	err = -EINVAL;
+	len = qword_get(&mesg, buf, PAGE_SIZE);
+	if (len <= 0)
+		goto out;
+
+	err = -ENOENT;
+	dom = auth_domain_find(buf);
+	if (!dom)
+		goto out;
+
+	/* path */
+	err = -EINVAL;
+	if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+		goto out1;
+
+	err = kern_path(buf, 0, &exp.ex_path);
+	if (err)
+		goto out1;
+
+	exp.ex_client = dom;
+	exp.cd = cd;
+	exp.ex_devid_map = NULL;
+
+	/* expiry */
+	err = -EINVAL;
+	exp.h.expiry_time = get_expiry(&mesg);
+	if (exp.h.expiry_time == 0)
+		goto out3;
+
+	/* flags */
+	err = get_int(&mesg, &an_int);
+	if (err == -ENOENT) {
+		err = 0;
+		set_bit(CACHE_NEGATIVE, &exp.h.flags);
+	} else {
+		if (err || an_int < 0)
+			goto out3;
+		exp.ex_flags= an_int;
+	
+		/* anon uid */
+		err = get_int(&mesg, &an_int);
+		if (err)
+			goto out3;
+		exp.ex_anon_uid= make_kuid(&init_user_ns, an_int);
+
+		/* anon gid */
+		err = get_int(&mesg, &an_int);
+		if (err)
+			goto out3;
+		exp.ex_anon_gid= make_kgid(&init_user_ns, an_int);
+
+		/* fsid */
+		err = get_int(&mesg, &an_int);
+		if (err)
+			goto out3;
+		exp.ex_fsid = an_int;
+
+		while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
+			if (strcmp(buf, "fsloc") == 0)
+				err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
+			else if (strcmp(buf, "uuid") == 0)
+				err = uuid_parse(&mesg, buf, &exp.ex_uuid);
+			else if (strcmp(buf, "secinfo") == 0)
+				err = secinfo_parse(&mesg, buf, &exp);
+			else
+				/* quietly ignore unknown words and anything
+				 * following. Newer user-space can try to set
+				 * new values, then see what the result was.
+				 */
+				break;
+			if (err)
+				goto out4;
+		}
+
+		err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags,
+				   exp.ex_uuid);
+		if (err)
+			goto out4;
+		/*
+		 * No point caching this if it would immediately expire.
+		 * Also, this protects exportfs's dummy export from the
+		 * anon_uid/anon_gid checks:
+		 */
+		if (exp.h.expiry_time < seconds_since_boot())
+			goto out4;
+		/*
+		 * For some reason exportfs has been passing down an
+		 * invalid (-1) uid & gid on the "dummy" export which it
+		 * uses to test export support.  To make sure exportfs
+		 * sees errors from check_export we therefore need to
+		 * delay these checks till after check_export:
+		 */
+		err = -EINVAL;
+		if (!uid_valid(exp.ex_anon_uid))
+			goto out4;
+		if (!gid_valid(exp.ex_anon_gid))
+			goto out4;
+		err = 0;
+
+		nfsd4_setup_layout_type(&exp);
+	}
+
+	expp = svc_export_lookup(&exp);
+	if (expp)
+		expp = svc_export_update(&exp, expp);
+	else
+		err = -ENOMEM;
+	cache_flush();
+	if (expp == NULL)
+		err = -ENOMEM;
+	else
+		exp_put(expp);
+out4:
+	nfsd4_fslocs_free(&exp.ex_fslocs);
+	kfree(exp.ex_uuid);
+out3:
+	path_put(&exp.ex_path);
+out1:
+	auth_domain_put(dom);
+out:
+	kfree(buf);
+	return err;
+}
+
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);
+static void show_secinfo(struct seq_file *m, struct svc_export *exp);
+
+static int svc_export_show(struct seq_file *m,
+			   struct cache_detail *cd,
+			   struct cache_head *h)
+{
+	struct svc_export *exp ;
+
+	if (h ==NULL) {
+		seq_puts(m, "#path domain(flags)\n");
+		return 0;
+	}
+	exp = container_of(h, struct svc_export, h);
+	seq_path(m, &exp->ex_path, " \t\n\\");
+	seq_putc(m, '\t');
+	seq_escape(m, exp->ex_client->name, " \t\n\\");
+	seq_putc(m, '(');
+	if (test_bit(CACHE_VALID, &h->flags) && 
+	    !test_bit(CACHE_NEGATIVE, &h->flags)) {
+		exp_flags(m, exp->ex_flags, exp->ex_fsid,
+			  exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
+		if (exp->ex_uuid) {
+			int i;
+			seq_puts(m, ",uuid=");
+			for (i = 0; i < EX_UUID_LEN; i++) {
+				if ((i&3) == 0 && i)
+					seq_putc(m, ':');
+				seq_printf(m, "%02x", exp->ex_uuid[i]);
+			}
+		}
+		show_secinfo(m, exp);
+	}
+	seq_puts(m, ")\n");
+	return 0;
+}
+static int svc_export_match(struct cache_head *a, struct cache_head *b)
+{
+	struct svc_export *orig = container_of(a, struct svc_export, h);
+	struct svc_export *new = container_of(b, struct svc_export, h);
+	return orig->ex_client == new->ex_client &&
+		path_equal(&orig->ex_path, &new->ex_path);
+}
+
+static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
+{
+	struct svc_export *new = container_of(cnew, struct svc_export, h);
+	struct svc_export *item = container_of(citem, struct svc_export, h);
+
+	kref_get(&item->ex_client->ref);
+	new->ex_client = item->ex_client;
+	new->ex_path = item->ex_path;
+	path_get(&item->ex_path);
+	new->ex_fslocs.locations = NULL;
+	new->ex_fslocs.locations_count = 0;
+	new->ex_fslocs.migrated = 0;
+	new->ex_layout_type = 0;
+	new->ex_uuid = NULL;
+	new->cd = item->cd;
+}
+
+static void export_update(struct cache_head *cnew, struct cache_head *citem)
+{
+	struct svc_export *new = container_of(cnew, struct svc_export, h);
+	struct svc_export *item = container_of(citem, struct svc_export, h);
+	int i;
+
+	new->ex_flags = item->ex_flags;
+	new->ex_anon_uid = item->ex_anon_uid;
+	new->ex_anon_gid = item->ex_anon_gid;
+	new->ex_fsid = item->ex_fsid;
+	new->ex_devid_map = item->ex_devid_map;
+	item->ex_devid_map = NULL;
+	new->ex_uuid = item->ex_uuid;
+	item->ex_uuid = NULL;
+	new->ex_fslocs.locations = item->ex_fslocs.locations;
+	item->ex_fslocs.locations = NULL;
+	new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
+	item->ex_fslocs.locations_count = 0;
+	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
+	item->ex_fslocs.migrated = 0;
+	new->ex_layout_type = item->ex_layout_type;
+	new->ex_nflavors = item->ex_nflavors;
+	for (i = 0; i < MAX_SECINFO_LIST; i++) {
+		new->ex_flavors[i] = item->ex_flavors[i];
+	}
+}
+
+static struct cache_head *svc_export_alloc(void)
+{
+	struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL);
+	if (i)
+		return &i->h;
+	else
+		return NULL;
+}
+
+static struct cache_detail svc_export_cache_template = {
+	.owner		= THIS_MODULE,
+	.hash_size	= EXPORT_HASHMAX,
+	.name		= "nfsd.export",
+	.cache_put	= svc_export_put,
+	.cache_request	= svc_export_request,
+	.cache_parse	= svc_export_parse,
+	.cache_show	= svc_export_show,
+	.match		= svc_export_match,
+	.init		= svc_export_init,
+	.update		= export_update,
+	.alloc		= svc_export_alloc,
+};
+
+static int
+svc_export_hash(struct svc_export *exp)
+{
+	int hash;
+
+	hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
+	hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
+	hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
+	return hash;
+}
+
+static struct svc_export *
+svc_export_lookup(struct svc_export *exp)
+{
+	struct cache_head *ch;
+	int hash = svc_export_hash(exp);
+
+	ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash);
+	if (ch)
+		return container_of(ch, struct svc_export, h);
+	else
+		return NULL;
+}
+
+static struct svc_export *
+svc_export_update(struct svc_export *new, struct svc_export *old)
+{
+	struct cache_head *ch;
+	int hash = svc_export_hash(old);
+
+	ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash);
+	if (ch)
+		return container_of(ch, struct svc_export, h);
+	else
+		return NULL;
+}
+
+
+static struct svc_expkey *
+exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type,
+	     u32 *fsidv, struct cache_req *reqp)
+{
+	struct svc_expkey key, *ek;
+	int err;
+	
+	if (!clp)
+		return ERR_PTR(-ENOENT);
+
+	key.ek_client = clp;
+	key.ek_fsidtype = fsid_type;
+	memcpy(key.ek_fsid, fsidv, key_len(fsid_type));
+
+	ek = svc_expkey_lookup(cd, &key);
+	if (ek == NULL)
+		return ERR_PTR(-ENOMEM);
+	err = cache_check(cd, &ek->h, reqp);
+	if (err)
+		return ERR_PTR(err);
+	return ek;
+}
+
+static struct svc_export *
+exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp,
+		const struct path *path, struct cache_req *reqp)
+{
+	struct svc_export *exp, key;
+	int err;
+
+	if (!clp)
+		return ERR_PTR(-ENOENT);
+
+	key.ex_client = clp;
+	key.ex_path = *path;
+	key.cd = cd;
+
+	exp = svc_export_lookup(&key);
+	if (exp == NULL)
+		return ERR_PTR(-ENOMEM);
+	err = cache_check(cd, &exp->h, reqp);
+	if (err)
+		return ERR_PTR(err);
+	return exp;
+}
+
+/*
+ * Find the export entry for a given dentry.
+ */
+static struct svc_export *
+exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path)
+{
+	struct dentry *saved = dget(path->dentry);
+	struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL);
+
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+		struct dentry *parent = dget_parent(path->dentry);
+		dput(path->dentry);
+		path->dentry = parent;
+		exp = exp_get_by_name(cd, clp, path, NULL);
+	}
+	dput(path->dentry);
+	path->dentry = saved;
+	return exp;
+}
+
+
+
+/*
+ * Obtain the root fh on behalf of a client.
+ * This could be done in user space, but I feel that it adds some safety
+ * since its harder to fool a kernel module than a user space program.
+ */
+int
+exp_rootfh(struct net *net, struct auth_domain *clp, char *name,
+	   struct knfsd_fh *f, int maxsize)
+{
+	struct svc_export	*exp;
+	struct path		path;
+	struct inode		*inode;
+	struct svc_fh		fh;
+	int			err;
+	struct nfsd_net		*nn = net_generic(net, nfsd_net_id);
+	struct cache_detail	*cd = nn->svc_export_cache;
+
+	err = -EPERM;
+	/* NB: we probably ought to check that it's NUL-terminated */
+	if (kern_path(name, 0, &path)) {
+		printk("nfsd: exp_rootfh path not found %s", name);
+		return err;
+	}
+	inode = d_inode(path.dentry);
+
+	dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
+		 name, path.dentry, clp->name,
+		 inode->i_sb->s_id, inode->i_ino);
+	exp = exp_parent(cd, clp, &path);
+	if (IS_ERR(exp)) {
+		err = PTR_ERR(exp);
+		goto out;
+	}
+
+	/*
+	 * fh must be initialized before calling fh_compose
+	 */
+	fh_init(&fh, maxsize);
+	if (fh_compose(&fh, exp, path.dentry, NULL))
+		err = -EINVAL;
+	else
+		err = 0;
+	memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh));
+	fh_put(&fh);
+	exp_put(exp);
+out:
+	path_put(&path);
+	return err;
+}
+
+static struct svc_export *exp_find(struct cache_detail *cd,
+				   struct auth_domain *clp, int fsid_type,
+				   u32 *fsidv, struct cache_req *reqp)
+{
+	struct svc_export *exp;
+	struct nfsd_net *nn = net_generic(cd->net, nfsd_net_id);
+	struct svc_expkey *ek = exp_find_key(nn->svc_expkey_cache, clp, fsid_type, fsidv, reqp);
+	if (IS_ERR(ek))
+		return ERR_CAST(ek);
+
+	exp = exp_get_by_name(cd, clp, &ek->ek_path, reqp);
+	cache_put(&ek->h, nn->svc_expkey_cache);
+
+	if (IS_ERR(exp))
+		return ERR_CAST(exp);
+	return exp;
+}
+
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	/* legacy gss-only clients are always OK: */
+	if (exp->ex_client == rqstp->rq_gssclient)
+		return 0;
+	/* ip-address based client; check sec= export option: */
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
+			return 0;
+	}
+	/* defaults in absence of sec= options: */
+	if (exp->ex_nflavors == 0) {
+		if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL ||
+		    rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
+			return 0;
+	}
+	return nfserr_wrongsec;
+}
+
+/*
+ * Uses rq_client and rq_gssclient to find an export; uses rq_client (an
+ * auth_unix client) if it's available and has secinfo information;
+ * otherwise, will try to use rq_gssclient.
+ *
+ * Called from functions that handle requests; functions that do work on
+ * behalf of mountd are passed a single client name to use, and should
+ * use exp_get_by_name() or exp_find().
+ */
+struct svc_export *
+rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
+{
+	struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct cache_detail *cd = nn->svc_export_cache;
+
+	if (rqstp->rq_client == NULL)
+		goto gss;
+
+	/* First try the auth_unix client: */
+	exp = exp_get_by_name(cd, rqstp->rq_client, path, &rqstp->rq_chandle);
+	if (PTR_ERR(exp) == -ENOENT)
+		goto gss;
+	if (IS_ERR(exp))
+		return exp;
+	/* If it has secinfo, assume there are no gss/... clients */
+	if (exp->ex_nflavors > 0)
+		return exp;
+gss:
+	/* Otherwise, try falling back on gss client */
+	if (rqstp->rq_gssclient == NULL)
+		return exp;
+	gssexp = exp_get_by_name(cd, rqstp->rq_gssclient, path, &rqstp->rq_chandle);
+	if (PTR_ERR(gssexp) == -ENOENT)
+		return exp;
+	if (!IS_ERR(exp))
+		exp_put(exp);
+	return gssexp;
+}
+
+struct svc_export *
+rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
+{
+	struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct cache_detail *cd = nn->svc_export_cache;
+
+	if (rqstp->rq_client == NULL)
+		goto gss;
+
+	/* First try the auth_unix client: */
+	exp = exp_find(cd, rqstp->rq_client, fsid_type,
+		       fsidv, &rqstp->rq_chandle);
+	if (PTR_ERR(exp) == -ENOENT)
+		goto gss;
+	if (IS_ERR(exp))
+		return exp;
+	/* If it has secinfo, assume there are no gss/... clients */
+	if (exp->ex_nflavors > 0)
+		return exp;
+gss:
+	/* Otherwise, try falling back on gss client */
+	if (rqstp->rq_gssclient == NULL)
+		return exp;
+	gssexp = exp_find(cd, rqstp->rq_gssclient, fsid_type, fsidv,
+						&rqstp->rq_chandle);
+	if (PTR_ERR(gssexp) == -ENOENT)
+		return exp;
+	if (!IS_ERR(exp))
+		exp_put(exp);
+	return gssexp;
+}
+
+struct svc_export *
+rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
+{
+	struct dentry *saved = dget(path->dentry);
+	struct svc_export *exp = rqst_exp_get_by_name(rqstp, path);
+
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+		struct dentry *parent = dget_parent(path->dentry);
+		dput(path->dentry);
+		path->dentry = parent;
+		exp = rqst_exp_get_by_name(rqstp, path);
+	}
+	dput(path->dentry);
+	path->dentry = saved;
+	return exp;
+}
+
+struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
+{
+	u32 fsidv[2];
+
+	mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
+
+	return rqst_exp_find(rqstp, FSID_NUM, fsidv);
+}
+
+/*
+ * Called when we need the filehandle for the root of the pseudofs,
+ * for a given NFSv4 client.   The root is defined to be the
+ * export point with fsid==0
+ */
+__be32
+exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
+{
+	struct svc_export *exp;
+	__be32 rv;
+
+	exp = rqst_find_fsidzero_export(rqstp);
+	if (IS_ERR(exp))
+		return nfserrno(PTR_ERR(exp));
+	rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
+	exp_put(exp);
+	return rv;
+}
+
+/* Iterator */
+
+static void *e_start(struct seq_file *m, loff_t *pos)
+	__acquires(((struct cache_detail *)m->private)->hash_lock)
+{
+	loff_t n = *pos;
+	unsigned hash, export;
+	struct cache_head *ch;
+	struct cache_detail *cd = m->private;
+	struct cache_head **export_table = cd->hash_table;
+
+	read_lock(&cd->hash_lock);
+	if (!n--)
+		return SEQ_START_TOKEN;
+	hash = n >> 32;
+	export = n & ((1LL<<32) - 1);
+
+	
+	for (ch=export_table[hash]; ch; ch=ch->next)
+		if (!export--)
+			return ch;
+	n &= ~((1LL<<32) - 1);
+	do {
+		hash++;
+		n += 1LL<<32;
+	} while(hash < EXPORT_HASHMAX && export_table[hash]==NULL);
+	if (hash >= EXPORT_HASHMAX)
+		return NULL;
+	*pos = n+1;
+	return export_table[hash];
+}
+
+static void *e_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct cache_head *ch = p;
+	int hash = (*pos >> 32);
+	struct cache_detail *cd = m->private;
+	struct cache_head **export_table = cd->hash_table;
+
+	if (p == SEQ_START_TOKEN)
+		hash = 0;
+	else if (ch->next == NULL) {
+		hash++;
+		*pos += 1LL<<32;
+	} else {
+		++*pos;
+		return ch->next;
+	}
+	*pos &= ~((1LL<<32) - 1);
+	while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) {
+		hash++;
+		*pos += 1LL<<32;
+	}
+	if (hash >= EXPORT_HASHMAX)
+		return NULL;
+	++*pos;
+	return export_table[hash];
+}
+
+static void e_stop(struct seq_file *m, void *p)
+	__releases(((struct cache_detail *)m->private)->hash_lock)
+{
+	struct cache_detail *cd = m->private;
+
+	read_unlock(&cd->hash_lock);
+}
+
+static struct flags {
+	int flag;
+	char *name[2];
+} expflags[] = {
+	{ NFSEXP_READONLY, {"ro", "rw"}},
+	{ NFSEXP_INSECURE_PORT, {"insecure", ""}},
+	{ NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}},
+	{ NFSEXP_ALLSQUASH, {"all_squash", ""}},
+	{ NFSEXP_ASYNC, {"async", "sync"}},
+	{ NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}},
+	{ NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}},
+	{ NFSEXP_NOHIDE, {"nohide", ""}},
+	{ NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
+	{ NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
+	{ NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
+	{ NFSEXP_V4ROOT, {"v4root", ""}},
+	{ NFSEXP_PNFS, {"pnfs", ""}},
+	{ 0, {"", ""}}
+};
+
+static void show_expflags(struct seq_file *m, int flags, int mask)
+{
+	struct flags *flg;
+	int state, first = 0;
+
+	for (flg = expflags; flg->flag; flg++) {
+		if (flg->flag & ~mask)
+			continue;
+		state = (flg->flag & flags) ? 0 : 1;
+		if (*flg->name[state])
+			seq_printf(m, "%s%s", first++?",":"", flg->name[state]);
+	}
+}
+
+static void show_secinfo_flags(struct seq_file *m, int flags)
+{
+	seq_printf(m, ",");
+	show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
+}
+
+static bool secinfo_flags_equal(int f, int g)
+{
+	f &= NFSEXP_SECINFO_FLAGS;
+	g &= NFSEXP_SECINFO_FLAGS;
+	return f == g;
+}
+
+static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
+{
+	int flags;
+
+	flags = (*fp)->flags;
+	seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
+	(*fp)++;
+	while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
+		seq_printf(m, ":%d", (*fp)->pseudoflavor);
+		(*fp)++;
+	}
+	return flags;
+}
+
+static void show_secinfo(struct seq_file *m, struct svc_export *exp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+	int flags;
+
+	if (exp->ex_nflavors == 0)
+		return;
+	f = exp->ex_flavors;
+	flags = show_secinfo_run(m, &f, end);
+	if (!secinfo_flags_equal(flags, exp->ex_flags))
+		show_secinfo_flags(m, flags);
+	while (f != end) {
+		flags = show_secinfo_run(m, &f, end);
+		show_secinfo_flags(m, flags);
+	}
+}
+
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc)
+{
+	show_expflags(m, flag, NFSEXP_ALLFLAGS);
+	if (flag & NFSEXP_FSID)
+		seq_printf(m, ",fsid=%d", fsid);
+	if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) &&
+	    !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2)))
+		seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu));
+	if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) &&
+	    !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2)))
+		seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong));
+	if (fsloc && fsloc->locations_count > 0) {
+		char *loctype = (fsloc->migrated) ? "refer" : "replicas";
+		int i;
+
+		seq_printf(m, ",%s=", loctype);
+		seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
+		seq_putc(m, '@');
+		seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
+		for (i = 1; i < fsloc->locations_count; i++) {
+			seq_putc(m, ';');
+			seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\");
+			seq_putc(m, '@');
+			seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\");
+		}
+	}
+}
+
+static int e_show(struct seq_file *m, void *p)
+{
+	struct cache_head *cp = p;
+	struct svc_export *exp = container_of(cp, struct svc_export, h);
+	struct cache_detail *cd = m->private;
+
+	if (p == SEQ_START_TOKEN) {
+		seq_puts(m, "# Version 1.1\n");
+		seq_puts(m, "# Path Client(Flags) # IPs\n");
+		return 0;
+	}
+
+	exp_get(exp);
+	if (cache_check(cd, &exp->h, NULL))
+		return 0;
+	exp_put(exp);
+	return svc_export_show(m, cd, cp);
+}
+
+const struct seq_operations nfs_exports_op = {
+	.start	= e_start,
+	.next	= e_next,
+	.stop	= e_stop,
+	.show	= e_show,
+};
+
+/*
+ * Initialize the exports module.
+ */
+int
+nfsd_export_init(struct net *net)
+{
+	int rv;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	dprintk("nfsd: initializing export module (net: %p).\n", net);
+
+	nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net);
+	if (IS_ERR(nn->svc_export_cache))
+		return PTR_ERR(nn->svc_export_cache);
+	rv = cache_register_net(nn->svc_export_cache, net);
+	if (rv)
+		goto destroy_export_cache;
+
+	nn->svc_expkey_cache = cache_create_net(&svc_expkey_cache_template, net);
+	if (IS_ERR(nn->svc_expkey_cache)) {
+		rv = PTR_ERR(nn->svc_expkey_cache);
+		goto unregister_export_cache;
+	}
+	rv = cache_register_net(nn->svc_expkey_cache, net);
+	if (rv)
+		goto destroy_expkey_cache;
+	return 0;
+
+destroy_expkey_cache:
+	cache_destroy_net(nn->svc_expkey_cache, net);
+unregister_export_cache:
+	cache_unregister_net(nn->svc_export_cache, net);
+destroy_export_cache:
+	cache_destroy_net(nn->svc_export_cache, net);
+	return rv;
+}
+
+/*
+ * Flush exports table - called when last nfsd thread is killed
+ */
+void
+nfsd_export_flush(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	cache_purge(nn->svc_expkey_cache);
+	cache_purge(nn->svc_export_cache);
+}
+
+/*
+ * Shutdown the exports module.
+ */
+void
+nfsd_export_shutdown(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	dprintk("nfsd: shutting down export module (net: %p).\n", net);
+
+	cache_unregister_net(nn->svc_expkey_cache, net);
+	cache_unregister_net(nn->svc_export_cache, net);
+	cache_destroy_net(nn->svc_expkey_cache, net);
+	cache_destroy_net(nn->svc_export_cache, net);
+	svcauth_unix_purge(net);
+
+	dprintk("nfsd: export shutdown complete (net: %p).\n", net);
+}
diff --git a/kernel/fs/nfsd/export.h b/kernel/fs/nfsd/export.h
new file mode 100644
index 000000000..1f52bfcc4
--- /dev/null
+++ b/kernel/fs/nfsd/export.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef NFSD_EXPORT_H
+#define NFSD_EXPORT_H
+
+#include <linux/sunrpc/cache.h>
+#include <uapi/linux/nfsd/export.h>
+
+struct knfsd_fh;
+struct svc_fh;
+struct svc_rqst;
+
+/*
+ * FS Locations
+ */
+
+#define MAX_FS_LOCATIONS	128
+
+struct nfsd4_fs_location {
+	char *hosts; /* colon separated list of hosts */
+	char *path;  /* slash separated list of path components */
+};
+
+struct nfsd4_fs_locations {
+	uint32_t locations_count;
+	struct nfsd4_fs_location *locations;
+/* If we're not actually serving this data ourselves (only providing a
+ * list of replicas that do serve it) then we set "migrated": */
+	int migrated;
+};
+
+/*
+ * We keep an array of pseudoflavors with the export, in order from most
+ * to least preferred.  For the foreseeable future, we don't expect more
+ * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3,
+ * spkm3i, and spkm3p (and using all 8 at once should be rare).
+ */
+#define MAX_SECINFO_LIST	8
+#define EX_UUID_LEN		16
+
+struct exp_flavor_info {
+	u32	pseudoflavor;
+	u32	flags;
+};
+
+struct svc_export {
+	struct cache_head	h;
+	struct auth_domain *	ex_client;
+	int			ex_flags;
+	struct path		ex_path;
+	kuid_t			ex_anon_uid;
+	kgid_t			ex_anon_gid;
+	int			ex_fsid;
+	unsigned char *		ex_uuid; /* 16 byte fsid */
+	struct nfsd4_fs_locations ex_fslocs;
+	uint32_t		ex_nflavors;
+	struct exp_flavor_info	ex_flavors[MAX_SECINFO_LIST];
+	enum pnfs_layouttype	ex_layout_type;
+	struct nfsd4_deviceid_map *ex_devid_map;
+	struct cache_detail	*cd;
+};
+
+/* an "export key" (expkey) maps a filehandlefragement to an
+ * svc_export for a given client.  There can be several per export,
+ * for the different fsid types.
+ */
+struct svc_expkey {
+	struct cache_head	h;
+
+	struct auth_domain *	ek_client;
+	int			ek_fsidtype;
+	u32			ek_fsid[6];
+
+	struct path		ek_path;
+};
+
+#define EX_ISSYNC(exp)		(!((exp)->ex_flags & NFSEXP_ASYNC))
+#define EX_NOHIDE(exp)		((exp)->ex_flags & NFSEXP_NOHIDE)
+#define EX_WGATHER(exp)		((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
+
+int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp);
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
+
+/*
+ * Function declarations
+ */
+int			nfsd_export_init(struct net *);
+void			nfsd_export_shutdown(struct net *);
+void			nfsd_export_flush(struct net *);
+struct svc_export *	rqst_exp_get_by_name(struct svc_rqst *,
+					     struct path *);
+struct svc_export *	rqst_exp_parent(struct svc_rqst *,
+					struct path *);
+struct svc_export *	rqst_find_fsidzero_export(struct svc_rqst *);
+int			exp_rootfh(struct net *, struct auth_domain *,
+					char *path, struct knfsd_fh *, int maxsize);
+__be32			exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
+__be32			nfserrno(int errno);
+
+static inline void exp_put(struct svc_export *exp)
+{
+	cache_put(&exp->h, exp->cd);
+}
+
+static inline struct svc_export *exp_get(struct svc_export *exp)
+{
+	cache_get(&exp->h);
+	return exp;
+}
+struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
+
+#endif /* NFSD_EXPORT_H */
diff --git a/kernel/fs/nfsd/fault_inject.c b/kernel/fs/nfsd/fault_inject.c
new file mode 100644
index 000000000..c16bf5af6
--- /dev/null
+++ b/kernel/fs/nfsd/fault_inject.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
+ *
+ * Uses debugfs to create fault injection points for client testing
+ */
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/sunrpc/addr.h>
+#include <asm/uaccess.h>
+
+#include "state.h"
+#include "netns.h"
+
+struct nfsd_fault_inject_op {
+	char *file;
+	u64 (*get)(void);
+	u64 (*set_val)(u64);
+	u64 (*set_clnt)(struct sockaddr_storage *, size_t);
+};
+
+static struct dentry *debug_dir;
+
+static ssize_t fault_inject_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	static u64 val;
+	char read_buf[25];
+	size_t size;
+	loff_t pos = *ppos;
+	struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
+
+	if (!pos)
+		val = op->get();
+	size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
+
+	return simple_read_from_buffer(buf, len, ppos, read_buf, size);
+}
+
+static ssize_t fault_inject_write(struct file *file, const char __user *buf,
+				  size_t len, loff_t *ppos)
+{
+	char write_buf[INET6_ADDRSTRLEN];
+	size_t size = min(sizeof(write_buf) - 1, len);
+	struct net *net = current->nsproxy->net_ns;
+	struct sockaddr_storage sa;
+	struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
+	u64 val;
+	char *nl;
+
+	if (copy_from_user(write_buf, buf, size))
+		return -EFAULT;
+	write_buf[size] = '\0';
+
+	/* Deal with any embedded newlines in the string */
+	nl = strchr(write_buf, '\n');
+	if (nl) {
+		size = nl - write_buf;
+		*nl = '\0';
+	}
+
+	size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
+	if (size > 0) {
+		val = op->set_clnt(&sa, size);
+		if (val)
+			pr_info("NFSD [%s]: Client %s had %llu state object(s)\n",
+				op->file, write_buf, val);
+	} else {
+		val = simple_strtoll(write_buf, NULL, 0);
+		if (val == 0)
+			pr_info("NFSD Fault Injection: %s (all)", op->file);
+		else
+			pr_info("NFSD Fault Injection: %s (n = %llu)",
+				op->file, val);
+		val = op->set_val(val);
+		pr_info("NFSD: %s: found %llu", op->file, val);
+	}
+	return len; /* on success, claim we got the whole input */
+}
+
+static const struct file_operations fops_nfsd = {
+	.owner   = THIS_MODULE,
+	.read    = fault_inject_read,
+	.write   = fault_inject_write,
+};
+
+void nfsd_fault_inject_cleanup(void)
+{
+	debugfs_remove_recursive(debug_dir);
+}
+
+static struct nfsd_fault_inject_op inject_ops[] = {
+	{
+		.file     = "forget_clients",
+		.get	  = nfsd_inject_print_clients,
+		.set_val  = nfsd_inject_forget_clients,
+		.set_clnt = nfsd_inject_forget_client,
+	},
+	{
+		.file     = "forget_locks",
+		.get	  = nfsd_inject_print_locks,
+		.set_val  = nfsd_inject_forget_locks,
+		.set_clnt = nfsd_inject_forget_client_locks,
+	},
+	{
+		.file     = "forget_openowners",
+		.get	  = nfsd_inject_print_openowners,
+		.set_val  = nfsd_inject_forget_openowners,
+		.set_clnt = nfsd_inject_forget_client_openowners,
+	},
+	{
+		.file     = "forget_delegations",
+		.get	  = nfsd_inject_print_delegations,
+		.set_val  = nfsd_inject_forget_delegations,
+		.set_clnt = nfsd_inject_forget_client_delegations,
+	},
+	{
+		.file     = "recall_delegations",
+		.get	  = nfsd_inject_print_delegations,
+		.set_val  = nfsd_inject_recall_delegations,
+		.set_clnt = nfsd_inject_recall_client_delegations,
+	},
+};
+
+#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op))
+
+int nfsd_fault_inject_init(void)
+{
+	unsigned int i;
+	struct nfsd_fault_inject_op *op;
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+
+	debug_dir = debugfs_create_dir("nfsd", NULL);
+	if (!debug_dir)
+		goto fail;
+
+	for (i = 0; i < NUM_INJECT_OPS; i++) {
+		op = &inject_ops[i];
+		if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd))
+			goto fail;
+	}
+	return 0;
+
+fail:
+	nfsd_fault_inject_cleanup();
+	return -ENOMEM;
+}
diff --git a/kernel/fs/nfsd/idmap.h b/kernel/fs/nfsd/idmap.h
new file mode 100644
index 000000000..a3f349000
--- /dev/null
+++ b/kernel/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
+/*
+ *  Mapping of UID to name and vice versa.
+ *
+ *  Copyright (c) 2002, 2003 The Regents of the University of
+ *  Michigan.  All rights reserved.
+> *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFSD_IDMAP_H
+#define LINUX_NFSD_IDMAP_H
+
+#include <linux/in.h>
+#include <linux/sunrpc/svc.h>
+
+/* XXX from linux/nfs_idmap.h */
+#define IDMAP_NAMESZ 128
+
+#ifdef CONFIG_NFSD_V4
+int nfsd_idmap_init(struct net *);
+void nfsd_idmap_shutdown(struct net *);
+#else
+static inline int nfsd_idmap_init(struct net *net)
+{
+	return 0;
+}
+static inline void nfsd_idmap_shutdown(struct net *net)
+{
+}
+#endif
+
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *);
+__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t);
+__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t);
+
+#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/kernel/fs/nfsd/lockd.c b/kernel/fs/nfsd/lockd.c
new file mode 100644
index 000000000..77e7a5cca
--- /dev/null
+++ b/kernel/fs/nfsd/lockd.c
@@ -0,0 +1,77 @@
+/*
+ * This file contains all the stubs needed when communicating with lockd.
+ * This level of indirection is necessary so we can run nfsd+lockd without
+ * requiring the nfs client to be compiled in/loaded, and vice versa.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/file.h>
+#include <linux/lockd/bind.h>
+#include "nfsd.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_LOCKD
+
+#ifdef CONFIG_LOCKD_V4
+#define nlm_stale_fh	nlm4_stale_fh
+#define nlm_failed	nlm4_failed
+#else
+#define nlm_stale_fh	nlm_lck_denied_nolocks
+#define nlm_failed	nlm_lck_denied_nolocks
+#endif
+/*
+ * Note: we hold the dentry use count while the file is open.
+ */
+static __be32
+nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
+{
+	__be32		nfserr;
+	struct svc_fh	fh;
+
+	/* must initialize before using! but maxsize doesn't matter */
+	fh_init(&fh,0);
+	fh.fh_handle.fh_size = f->size;
+	memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
+	fh.fh_export = NULL;
+
+	nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
+	fh_put(&fh);
+ 	/* We return nlm error codes as nlm doesn't know
+	 * about nfsd, but nfsd does know about nlm..
+	 */
+	switch (nfserr) {
+	case nfs_ok:
+		return 0;
+	case nfserr_dropit:
+		return nlm_drop_reply;
+	case nfserr_stale:
+		return nlm_stale_fh;
+	default:
+		return nlm_failed;
+	}
+}
+
+static void
+nlm_fclose(struct file *filp)
+{
+	fput(filp);
+}
+
+static struct nlmsvc_binding	nfsd_nlm_ops = {
+	.fopen		= nlm_fopen,		/* open file for locking */
+	.fclose		= nlm_fclose,		/* close file */
+};
+
+void
+nfsd_lockd_init(void)
+{
+	dprintk("nfsd: initializing lockd\n");
+	nlmsvc_ops = &nfsd_nlm_ops;
+}
+
+void
+nfsd_lockd_shutdown(void)
+{
+	nlmsvc_ops = NULL;
+}
diff --git a/kernel/fs/nfsd/netns.h b/kernel/fs/nfsd/netns.h
new file mode 100644
index 000000000..ea6749a32
--- /dev/null
+++ b/kernel/fs/nfsd/netns.h
@@ -0,0 +1,121 @@
+/*
+ * per net namespace data structures for nfsd
+ *
+ * Copyright (C) 2012, Jeff Layton <jlayton@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __NFSD_NETNS_H__
+#define __NFSD_NETNS_H__
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/* Hash tables for nfs4_clientid state */
+#define CLIENT_HASH_BITS                 4
+#define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
+#define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1)
+
+#define SESSION_HASH_SIZE	512
+
+struct cld_net;
+struct nfsd4_client_tracking_ops;
+
+/*
+ * Represents a nfsd "container". With respect to nfsv4 state tracking, the
+ * fields of interest are the *_id_hashtbls and the *_name_tree. These track
+ * the nfs4_client objects by either short or long form clientid.
+ *
+ * Each nfsd_net runs a nfs4_laundromat workqueue job when necessary to clean
+ * up expired clients and delegations within the container.
+ */
+struct nfsd_net {
+	struct cld_net *cld_net;
+
+	struct cache_detail *svc_expkey_cache;
+	struct cache_detail *svc_export_cache;
+
+	struct cache_detail *idtoname_cache;
+	struct cache_detail *nametoid_cache;
+
+	struct lock_manager nfsd4_manager;
+	bool grace_ended;
+	time_t boot_time;
+
+	/*
+	 * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
+	 * used in reboot/reset lease grace period processing
+	 *
+	 * conf_id_hashtbl[], and conf_name_tree hold confirmed
+	 * setclientid_confirmed info.
+	 *
+	 * unconf_str_hastbl[] and unconf_name_tree hold unconfirmed
+	 * setclientid info.
+	 */
+	struct list_head *reclaim_str_hashtbl;
+	int reclaim_str_hashtbl_size;
+	struct list_head *conf_id_hashtbl;
+	struct rb_root conf_name_tree;
+	struct list_head *unconf_id_hashtbl;
+	struct rb_root unconf_name_tree;
+	struct list_head *sessionid_hashtbl;
+	/*
+	 * client_lru holds client queue ordered by nfs4_client.cl_time
+	 * for lease renewal.
+	 *
+	 * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
+	 * for last close replay.
+	 *
+	 * All of the above fields are protected by the client_mutex.
+	 */
+	struct list_head client_lru;
+	struct list_head close_lru;
+	struct list_head del_recall_lru;
+
+	struct delayed_work laundromat_work;
+
+	/* client_lock protects the client lru list and session hash table */
+	spinlock_t client_lock;
+
+	struct file *rec_file;
+	bool in_grace;
+	struct nfsd4_client_tracking_ops *client_tracking_ops;
+
+	time_t nfsd4_lease;
+	time_t nfsd4_grace;
+
+	bool nfsd_net_up;
+	bool lockd_up;
+
+	/* Time of server startup */
+	struct timeval nfssvc_boot;
+
+	/*
+	 * Max number of connections this nfsd container will allow. Defaults
+	 * to '0' which is means that it bases this on the number of threads.
+	 */
+	unsigned int max_connections;
+
+	u32 clientid_counter;
+
+	struct svc_serv *nfsd_serv;
+};
+
+/* Simple check to find out if a given net was properly initialized */
+#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
+
+extern int nfsd_net_id;
+#endif /* __NFSD_NETNS_H__ */
diff --git a/kernel/fs/nfsd/nfs2acl.c b/kernel/fs/nfsd/nfs2acl.c
new file mode 100644
index 000000000..d54701f6d
--- /dev/null
+++ b/kernel/fs/nfsd/nfs2acl.c
@@ -0,0 +1,382 @@
+/*
+ * Process version 2 NFSACL requests.
+ *
+ * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+#include "nfsd.h"
+/* FIXME: nfsacl.h is a broken header */
+#include <linux/nfsacl.h>
+#include <linux/gfp.h>
+#include "cache.h"
+#include "xdr3.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_PROC
+#define RETURN_STATUS(st)	{ resp->status = (st); return (st); }
+
+/*
+ * NULL call.
+ */
+static __be32
+nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	return nfs_ok;
+}
+
+/*
+ * Get the Access and/or Default ACL of a file.
+ */
+static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
+		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
+{
+	struct posix_acl *acl;
+	struct inode *inode;
+	svc_fh *fh;
+	__be32 nfserr = 0;
+
+	dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
+
+	fh = fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
+		RETURN_STATUS(nfserr);
+
+	inode = d_inode(fh->fh_dentry);
+
+	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+		RETURN_STATUS(nfserr_inval);
+	resp->mask = argp->mask;
+
+	nfserr = fh_getattr(fh, &resp->stat);
+	if (nfserr)
+		goto fail;
+
+	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
+		acl = get_acl(inode, ACL_TYPE_ACCESS);
+		if (acl == NULL) {
+			/* Solaris returns the inode's minimum ACL. */
+			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		}
+		if (IS_ERR(acl)) {
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_access = acl;
+	}
+	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
+		/* Check how Solaris handles requests for the Default ACL
+		   of a non-directory! */
+		acl = get_acl(inode, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl)) {
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_default = acl;
+	}
+
+	/* resp->acl_{access,default} are released in nfssvc_release_getacl. */
+	RETURN_STATUS(0);
+
+fail:
+	posix_acl_release(resp->acl_access);
+	posix_acl_release(resp->acl_default);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Set the Access and/or Default ACL of a file.
+ */
+static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
+		struct nfsd3_setaclargs *argp,
+		struct nfsd_attrstat *resp)
+{
+	struct inode *inode;
+	svc_fh *fh;
+	__be32 nfserr = 0;
+	int error;
+
+	dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
+
+	fh = fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+	if (nfserr)
+		goto out;
+
+	inode = d_inode(fh->fh_dentry);
+	if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
+		error = -EOPNOTSUPP;
+		goto out_errno;
+	}
+
+	error = fh_want_write(fh);
+	if (error)
+		goto out_errno;
+
+	error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+	if (error)
+		goto out_drop_write;
+	error = inode->i_op->set_acl(inode, argp->acl_default,
+				     ACL_TYPE_DEFAULT);
+	if (error)
+		goto out_drop_write;
+
+	fh_drop_write(fh);
+
+	nfserr = fh_getattr(fh, &resp->stat);
+
+out:
+	/* argp->acl_{access,default} may have been allocated in
+	   nfssvc_decode_setaclargs. */
+	posix_acl_release(argp->acl_access);
+	posix_acl_release(argp->acl_default);
+	return nfserr;
+out_drop_write:
+	fh_drop_write(fh);
+out_errno:
+	nfserr = nfserrno(error);
+	goto out;
+}
+
+/*
+ * Check file attributes
+ */
+static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
+		struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)
+{
+	__be32 nfserr;
+	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
+		return nfserr;
+	nfserr = fh_getattr(&resp->fh, &resp->stat);
+	return nfserr;
+}
+
+/*
+ * Check file access
+ */
+static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
+		struct nfsd3_accessres *resp)
+{
+	__be32 nfserr;
+
+	dprintk("nfsd: ACCESS(2acl)   %s 0x%x\n",
+			SVCFH_fmt(&argp->fh),
+			argp->access);
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->access = argp->access;
+	nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+	if (nfserr)
+		return nfserr;
+	nfserr = fh_getattr(&resp->fh, &resp->stat);
+	return nfserr;
+}
+
+/*
+ * XDR decode functions
+ */
+static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_getaclargs *argp)
+{
+	p = nfs2svc_decode_fh(p, &argp->fh);
+	if (!p)
+		return 0;
+	argp->mask = ntohl(*p); p++;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+
+static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_setaclargs *argp)
+{
+	struct kvec *head = rqstp->rq_arg.head;
+	unsigned int base;
+	int n;
+
+	p = nfs2svc_decode_fh(p, &argp->fh);
+	if (!p)
+		return 0;
+	argp->mask = ntohl(*p++);
+	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
+	    !xdr_argsize_check(rqstp, p))
+		return 0;
+
+	base = (char *)p - (char *)head->iov_base;
+	n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
+			  (argp->mask & NFS_ACL) ?
+			  &argp->acl_access : NULL);
+	if (n > 0)
+		n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
+				  (argp->mask & NFS_DFACL) ?
+				  &argp->acl_default : NULL);
+	return (n > 0);
+}
+
+static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd_fhandle *argp)
+{
+	p = nfs2svc_decode_fh(p, &argp->fh);
+	if (!p)
+		return 0;
+	return xdr_argsize_check(rqstp, p);
+}
+
+static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_accessargs *argp)
+{
+	p = nfs2svc_decode_fh(p, &argp->fh);
+	if (!p)
+		return 0;
+	argp->access = ntohl(*p++);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * XDR encode functions
+ */
+
+/*
+ * There must be an encoding function for void results so svc_process
+ * will work properly.
+ */
+static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* GETACL */
+static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_getaclres *resp)
+{
+	struct dentry *dentry = resp->fh.fh_dentry;
+	struct inode *inode;
+	struct kvec *head = rqstp->rq_res.head;
+	unsigned int base;
+	int n;
+	int w;
+
+	/*
+	 * Since this is version 2, the check for nfserr in
+	 * nfsd_dispatch actually ensures the following cannot happen.
+	 * However, it seems fragile to depend on that.
+	 */
+	if (dentry == NULL || d_really_is_negative(dentry))
+		return 0;
+	inode = d_inode(dentry);
+
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+	*p++ = htonl(resp->mask);
+	if (!xdr_ressize_check(rqstp, p))
+		return 0;
+	base = (char *)p - (char *)head->iov_base;
+
+	rqstp->rq_res.page_len = w = nfsacl_size(
+		(resp->mask & NFS_ACL)   ? resp->acl_access  : NULL,
+		(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
+	while (w > 0) {
+		if (!*(rqstp->rq_next_page++))
+			return 0;
+		w -= PAGE_SIZE;
+	}
+
+	n = nfsacl_encode(&rqstp->rq_res, base, inode,
+			  resp->acl_access,
+			  resp->mask & NFS_ACL, 0);
+	if (n > 0)
+		n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
+				  resp->acl_default,
+				  resp->mask & NFS_DFACL,
+				  NFS_ACL_DEFAULT);
+	if (n <= 0)
+		return 0;
+	return 1;
+}
+
+static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd_attrstat *resp)
+{
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* ACCESS */
+static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_accessres *resp)
+{
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+	*p++ = htonl(resp->access);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * XDR release functions
+ */
+static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_getaclres *resp)
+{
+	fh_put(&resp->fh);
+	posix_acl_release(resp->acl_access);
+	posix_acl_release(resp->acl_default);
+	return 1;
+}
+
+static int nfsaclsvc_release_attrstat(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd_attrstat *resp)
+{
+	fh_put(&resp->fh);
+	return 1;
+}
+
+static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p,
+               struct nfsd3_accessres *resp)
+{
+       fh_put(&resp->fh);
+       return 1;
+}
+
+#define nfsaclsvc_decode_voidargs	NULL
+#define nfsaclsvc_release_void		NULL
+#define nfsd3_fhandleargs	nfsd_fhandle
+#define nfsd3_attrstatres	nfsd_attrstat
+#define nfsd3_voidres		nfsd3_voidargs
+struct nfsd3_voidargs { int dummy; };
+
+#define PROC(name, argt, rest, relt, cache, respsize)	\
+ { (svc_procfunc) nfsacld_proc_##name,		\
+   (kxdrproc_t) nfsaclsvc_decode_##argt##args,	\
+   (kxdrproc_t) nfsaclsvc_encode_##rest##res,	\
+   (kxdrproc_t) nfsaclsvc_release_##relt,		\
+   sizeof(struct nfsd3_##argt##args),		\
+   sizeof(struct nfsd3_##rest##res),		\
+   0,						\
+   cache,					\
+   respsize,					\
+ }
+
+#define ST 1		/* status*/
+#define AT 21		/* attributes */
+#define pAT (1+AT)	/* post attributes - conditional */
+#define ACL (1+NFS_ACL_MAX_ENTRIES*3)  /* Access Control List */
+
+static struct svc_procedure		nfsd_acl_procedures2[] = {
+  PROC(null,	void,		void,		void,	  RC_NOCACHE, ST),
+  PROC(getacl,	getacl,		getacl,		getacl,	  RC_NOCACHE, ST+1+2*(1+ACL)),
+  PROC(setacl,	setacl,		attrstat,	attrstat, RC_NOCACHE, ST+AT),
+  PROC(getattr, fhandle,	attrstat,	attrstat, RC_NOCACHE, ST+AT),
+  PROC(access,	access,		access,		access,   RC_NOCACHE, ST+AT+1),
+};
+
+struct svc_version	nfsd_acl_version2 = {
+		.vs_vers	= 2,
+		.vs_nproc	= 5,
+		.vs_proc	= nfsd_acl_procedures2,
+		.vs_dispatch	= nfsd_dispatch,
+		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+		.vs_hidden	= 0,
+};
diff --git a/kernel/fs/nfsd/nfs3acl.c b/kernel/fs/nfsd/nfs3acl.c
new file mode 100644
index 000000000..882b1a14b
--- /dev/null
+++ b/kernel/fs/nfsd/nfs3acl.c
@@ -0,0 +1,273 @@
+/*
+ * Process version 3 NFSACL requests.
+ *
+ * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+#include "nfsd.h"
+/* FIXME: nfsacl.h is a broken header */
+#include <linux/nfsacl.h>
+#include <linux/gfp.h>
+#include "cache.h"
+#include "xdr3.h"
+#include "vfs.h"
+
+#define RETURN_STATUS(st)	{ resp->status = (st); return (st); }
+
+/*
+ * NULL call.
+ */
+static __be32
+nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	return nfs_ok;
+}
+
+/*
+ * Get the Access and/or Default ACL of a file.
+ */
+static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
+		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
+{
+	struct posix_acl *acl;
+	struct inode *inode;
+	svc_fh *fh;
+	__be32 nfserr = 0;
+
+	fh = fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
+		RETURN_STATUS(nfserr);
+
+	inode = d_inode(fh->fh_dentry);
+
+	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+		RETURN_STATUS(nfserr_inval);
+	resp->mask = argp->mask;
+
+	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
+		acl = get_acl(inode, ACL_TYPE_ACCESS);
+		if (acl == NULL) {
+			/* Solaris returns the inode's minimum ACL. */
+			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		}
+		if (IS_ERR(acl)) {
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_access = acl;
+	}
+	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
+		/* Check how Solaris handles requests for the Default ACL
+		   of a non-directory! */
+		acl = get_acl(inode, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl)) {
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_default = acl;
+	}
+
+	/* resp->acl_{access,default} are released in nfs3svc_release_getacl. */
+	RETURN_STATUS(0);
+
+fail:
+	posix_acl_release(resp->acl_access);
+	posix_acl_release(resp->acl_default);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Set the Access and/or Default ACL of a file.
+ */
+static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
+		struct nfsd3_setaclargs *argp,
+		struct nfsd3_attrstat *resp)
+{
+	struct inode *inode;
+	svc_fh *fh;
+	__be32 nfserr = 0;
+	int error;
+
+	fh = fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+	if (nfserr)
+		goto out;
+
+	inode = d_inode(fh->fh_dentry);
+	if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
+		error = -EOPNOTSUPP;
+		goto out_errno;
+	}
+
+	error = fh_want_write(fh);
+	if (error)
+		goto out_errno;
+
+	error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+	if (error)
+		goto out_drop_write;
+	error = inode->i_op->set_acl(inode, argp->acl_default,
+				     ACL_TYPE_DEFAULT);
+
+out_drop_write:
+	fh_drop_write(fh);
+out_errno:
+	nfserr = nfserrno(error);
+out:
+	/* argp->acl_{access,default} may have been allocated in
+	   nfs3svc_decode_setaclargs. */
+	posix_acl_release(argp->acl_access);
+	posix_acl_release(argp->acl_default);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * XDR decode functions
+ */
+static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_getaclargs *args)
+{
+	p = nfs3svc_decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	args->mask = ntohl(*p); p++;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+
+static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_setaclargs *args)
+{
+	struct kvec *head = rqstp->rq_arg.head;
+	unsigned int base;
+	int n;
+
+	p = nfs3svc_decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	args->mask = ntohl(*p++);
+	if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
+	    !xdr_argsize_check(rqstp, p))
+		return 0;
+
+	base = (char *)p - (char *)head->iov_base;
+	n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
+			  (args->mask & NFS_ACL) ?
+			  &args->acl_access : NULL);
+	if (n > 0)
+		n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
+				  (args->mask & NFS_DFACL) ?
+				  &args->acl_default : NULL);
+	return (n > 0);
+}
+
+/*
+ * XDR encode functions
+ */
+
+/* GETACL */
+static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_getaclres *resp)
+{
+	struct dentry *dentry = resp->fh.fh_dentry;
+
+	p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
+	if (resp->status == 0 && dentry && d_really_is_positive(dentry)) {
+		struct inode *inode = d_inode(dentry);
+		struct kvec *head = rqstp->rq_res.head;
+		unsigned int base;
+		int n;
+		int w;
+
+		*p++ = htonl(resp->mask);
+		if (!xdr_ressize_check(rqstp, p))
+			return 0;
+		base = (char *)p - (char *)head->iov_base;
+
+		rqstp->rq_res.page_len = w = nfsacl_size(
+			(resp->mask & NFS_ACL)   ? resp->acl_access  : NULL,
+			(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
+		while (w > 0) {
+			if (!*(rqstp->rq_next_page++))
+				return 0;
+			w -= PAGE_SIZE;
+		}
+
+		n = nfsacl_encode(&rqstp->rq_res, base, inode,
+				  resp->acl_access,
+				  resp->mask & NFS_ACL, 0);
+		if (n > 0)
+			n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
+					  resp->acl_default,
+					  resp->mask & NFS_DFACL,
+					  NFS_ACL_DEFAULT);
+		if (n <= 0)
+			return 0;
+	} else
+		if (!xdr_ressize_check(rqstp, p))
+			return 0;
+
+	return 1;
+}
+
+/* SETACL */
+static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_attrstat *resp)
+{
+	p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * XDR release functions
+ */
+static int nfs3svc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
+		struct nfsd3_getaclres *resp)
+{
+	fh_put(&resp->fh);
+	posix_acl_release(resp->acl_access);
+	posix_acl_release(resp->acl_default);
+	return 1;
+}
+
+#define nfs3svc_decode_voidargs		NULL
+#define nfs3svc_release_void		NULL
+#define nfsd3_setaclres			nfsd3_attrstat
+#define nfsd3_voidres			nfsd3_voidargs
+struct nfsd3_voidargs { int dummy; };
+
+#define PROC(name, argt, rest, relt, cache, respsize)	\
+ { (svc_procfunc) nfsd3_proc_##name,		\
+   (kxdrproc_t) nfs3svc_decode_##argt##args,	\
+   (kxdrproc_t) nfs3svc_encode_##rest##res,	\
+   (kxdrproc_t) nfs3svc_release_##relt,		\
+   sizeof(struct nfsd3_##argt##args),		\
+   sizeof(struct nfsd3_##rest##res),		\
+   0,						\
+   cache,					\
+   respsize,					\
+ }
+
+#define ST 1		/* status*/
+#define AT 21		/* attributes */
+#define pAT (1+AT)	/* post attributes - conditional */
+#define ACL (1+NFS_ACL_MAX_ENTRIES*3)  /* Access Control List */
+
+static struct svc_procedure		nfsd_acl_procedures3[] = {
+  PROC(null,	void,		void,		void,	  RC_NOCACHE, ST),
+  PROC(getacl,	getacl,		getacl,		getacl,	  RC_NOCACHE, ST+1+2*(1+ACL)),
+  PROC(setacl,	setacl,		setacl,		fhandle,  RC_NOCACHE, ST+pAT),
+};
+
+struct svc_version	nfsd_acl_version3 = {
+		.vs_vers	= 3,
+		.vs_nproc	= 3,
+		.vs_proc	= nfsd_acl_procedures3,
+		.vs_dispatch	= nfsd_dispatch,
+		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+		.vs_hidden	= 0,
+};
+
diff --git a/kernel/fs/nfsd/nfs3proc.c b/kernel/fs/nfsd/nfs3proc.c
new file mode 100644
index 000000000..7b755b7f7
--- /dev/null
+++ b/kernel/fs/nfsd/nfs3proc.c
@@ -0,0 +1,891 @@
+/*
+ * Process version 3 NFS requests.
+ *
+ * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/fs.h>
+#include <linux/ext2_fs.h>
+#include <linux/magic.h>
+
+#include "cache.h"
+#include "xdr3.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_PROC
+
+#define RETURN_STATUS(st)	{ resp->status = (st); return (st); }
+
+static int	nfs3_ftypes[] = {
+	0,			/* NF3NON */
+	S_IFREG,		/* NF3REG */
+	S_IFDIR,		/* NF3DIR */
+	S_IFBLK,		/* NF3BLK */
+	S_IFCHR,		/* NF3CHR */
+	S_IFLNK,		/* NF3LNK */
+	S_IFSOCK,		/* NF3SOCK */
+	S_IFIFO,		/* NF3FIFO */
+};
+
+/*
+ * NULL call.
+ */
+static __be32
+nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	return nfs_ok;
+}
+
+/*
+ * Get a file's attributes
+ */
+static __be32
+nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
+					   struct nfsd3_attrstat *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: GETATTR(3)  %s\n",
+		SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+	if (nfserr)
+		RETURN_STATUS(nfserr);
+
+	nfserr = fh_getattr(&resp->fh, &resp->stat);
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Set a file's attributes
+ */
+static __be32
+nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
+					   struct nfsd3_attrstat  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: SETATTR(3)  %s\n",
+				SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,
+			      argp->check_guard, argp->guardtime);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Look up a path name component
+ */
+static __be32
+nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
+					  struct nfsd3_diropres  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: LOOKUP(3)   %s %.*s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				argp->name);
+
+	fh_copy(&resp->dirfh, &argp->fh);
+	fh_init(&resp->fh, NFS3_FHSIZE);
+
+	nfserr = nfsd_lookup(rqstp, &resp->dirfh,
+				    argp->name,
+				    argp->len,
+				    &resp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Check file access
+ */
+static __be32
+nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
+					  struct nfsd3_accessres *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: ACCESS(3)   %s 0x%x\n",
+				SVCFH_fmt(&argp->fh),
+				argp->access);
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->access = argp->access;
+	nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Read a symlink.
+ */
+static __be32
+nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
+					   struct nfsd3_readlinkres *resp)
+{
+	__be32 nfserr;
+
+	dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
+
+	/* Read the symlink. */
+	fh_copy(&resp->fh, &argp->fh);
+	resp->len = NFS3_MAXPATHLEN;
+	nfserr = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Read a portion of a file.
+ */
+static __be32
+nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
+				        struct nfsd3_readres  *resp)
+{
+	__be32	nfserr;
+	u32	max_blocksize = svc_max_payload(rqstp);
+
+	dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
+				SVCFH_fmt(&argp->fh),
+				(unsigned long) argp->count,
+				(unsigned long long) argp->offset);
+
+	/* Obtain buffer pointer for payload.
+	 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
+	 * + 1 (xdr opaque byte count) = 26
+	 */
+	resp->count = min(argp->count, max_blocksize);
+	svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = nfsd_read(rqstp, &resp->fh,
+				  argp->offset,
+			   	  rqstp->rq_vec, argp->vlen,
+				  &resp->count);
+	if (nfserr == 0) {
+		struct inode	*inode = d_inode(resp->fh.fh_dentry);
+
+		resp->eof = (argp->offset + resp->count) >= inode->i_size;
+	}
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Write data to a file
+ */
+static __be32
+nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
+					 struct nfsd3_writeres  *resp)
+{
+	__be32	nfserr;
+	unsigned long cnt = argp->len;
+
+	dprintk("nfsd: WRITE(3)    %s %d bytes at %Lu%s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				(unsigned long long) argp->offset,
+				argp->stable? " stable" : "");
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->committed = argp->stable;
+	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
+				   argp->offset,
+				   rqstp->rq_vec, argp->vlen,
+				   &cnt,
+				   &resp->committed);
+	resp->count = cnt;
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * With NFSv3, CREATE processing is a lot easier than with NFSv2.
+ * At least in theory; we'll see how it fares in practice when the
+ * first reports about SunOS compatibility problems start to pour in...
+ */
+static __be32
+nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
+					  struct nfsd3_diropres   *resp)
+{
+	svc_fh		*dirfhp, *newfhp = NULL;
+	struct iattr	*attr;
+	__be32		nfserr;
+
+	dprintk("nfsd: CREATE(3)   %s %.*s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				argp->name);
+
+	dirfhp = fh_copy(&resp->dirfh, &argp->fh);
+	newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
+	attr   = &argp->attrs;
+
+	/* Unfudge the mode bits */
+	attr->ia_mode &= ~S_IFMT;
+	if (!(attr->ia_valid & ATTR_MODE)) { 
+		attr->ia_valid |= ATTR_MODE;
+		attr->ia_mode = S_IFREG;
+	} else {
+		attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG;
+	}
+
+	/* Now create the file and set attributes */
+	nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
+				attr, newfhp,
+				argp->createmode, (u32 *)argp->verf, NULL, NULL);
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Make directory. This operation is not idempotent.
+ */
+static __be32
+nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
+					 struct nfsd3_diropres   *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: MKDIR(3)    %s %.*s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				argp->name);
+
+	argp->attrs.ia_valid &= ~ATTR_SIZE;
+	fh_copy(&resp->dirfh, &argp->fh);
+	fh_init(&resp->fh, NFS3_FHSIZE);
+	nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
+				    &argp->attrs, S_IFDIR, 0, &resp->fh);
+	fh_unlock(&resp->dirfh);
+	RETURN_STATUS(nfserr);
+}
+
+static __be32
+nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
+					   struct nfsd3_diropres    *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: SYMLINK(3)  %s %.*s -> %.*s\n",
+				SVCFH_fmt(&argp->ffh),
+				argp->flen, argp->fname,
+				argp->tlen, argp->tname);
+
+	fh_copy(&resp->dirfh, &argp->ffh);
+	fh_init(&resp->fh, NFS3_FHSIZE);
+	nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen,
+						   argp->tname, &resp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Make socket/fifo/device.
+ */
+static __be32
+nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
+					 struct nfsd3_diropres  *resp)
+{
+	__be32	nfserr;
+	int type;
+	dev_t	rdev = 0;
+
+	dprintk("nfsd: MKNOD(3)    %s %.*s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				argp->name);
+
+	fh_copy(&resp->dirfh, &argp->fh);
+	fh_init(&resp->fh, NFS3_FHSIZE);
+
+	if (argp->ftype == 0 || argp->ftype >= NF3BAD)
+		RETURN_STATUS(nfserr_inval);
+	if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) {
+		rdev = MKDEV(argp->major, argp->minor);
+		if (MAJOR(rdev) != argp->major ||
+		    MINOR(rdev) != argp->minor)
+			RETURN_STATUS(nfserr_inval);
+	} else
+		if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO)
+			RETURN_STATUS(nfserr_inval);
+
+	type = nfs3_ftypes[argp->ftype];
+	nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
+				    &argp->attrs, type, rdev, &resp->fh);
+	fh_unlock(&resp->dirfh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Remove file/fifo/socket etc.
+ */
+static __be32
+nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
+					  struct nfsd3_attrstat  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: REMOVE(3)   %s %.*s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				argp->name);
+
+	/* Unlink. -S_IFDIR means file must not be a directory */
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
+	fh_unlock(&resp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Remove a directory
+ */
+static __be32
+nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
+					 struct nfsd3_attrstat  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: RMDIR(3)    %s %.*s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				argp->name);
+
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
+	fh_unlock(&resp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+static __be32
+nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
+					  struct nfsd3_renameres  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: RENAME(3)   %s %.*s ->\n",
+				SVCFH_fmt(&argp->ffh),
+				argp->flen,
+				argp->fname);
+	dprintk("nfsd: -> %s %.*s\n",
+				SVCFH_fmt(&argp->tfh),
+				argp->tlen,
+				argp->tname);
+
+	fh_copy(&resp->ffh, &argp->ffh);
+	fh_copy(&resp->tfh, &argp->tfh);
+	nfserr = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen,
+				    &resp->tfh, argp->tname, argp->tlen);
+	RETURN_STATUS(nfserr);
+}
+
+static __be32
+nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
+					struct nfsd3_linkres  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: LINK(3)     %s ->\n",
+				SVCFH_fmt(&argp->ffh));
+	dprintk("nfsd:   -> %s %.*s\n",
+				SVCFH_fmt(&argp->tfh),
+				argp->tlen,
+				argp->tname);
+
+	fh_copy(&resp->fh,  &argp->ffh);
+	fh_copy(&resp->tfh, &argp->tfh);
+	nfserr = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen,
+				  &resp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Read a portion of a directory.
+ */
+static __be32
+nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
+					   struct nfsd3_readdirres  *resp)
+{
+	__be32		nfserr;
+	int		count;
+
+	dprintk("nfsd: READDIR(3)  %s %d bytes at %d\n",
+				SVCFH_fmt(&argp->fh),
+				argp->count, (u32) argp->cookie);
+
+	/* Make sure we've room for the NULL ptr & eof flag, and shrink to
+	 * client read size */
+	count = (argp->count >> 2) - 2;
+
+	/* Read directory and encode entries on the fly */
+	fh_copy(&resp->fh, &argp->fh);
+
+	resp->buflen = count;
+	resp->common.err = nfs_ok;
+	resp->buffer = argp->buffer;
+	resp->rqstp = rqstp;
+	nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, 
+					&resp->common, nfs3svc_encode_entry);
+	memcpy(resp->verf, argp->verf, 8);
+	resp->count = resp->buffer - argp->buffer;
+	if (resp->offset)
+		xdr_encode_hyper(resp->offset, argp->cookie);
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Read a portion of a directory, including file handles and attrs.
+ * For now, we choose to ignore the dircount parameter.
+ */
+static __be32
+nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
+					       struct nfsd3_readdirres  *resp)
+{
+	__be32	nfserr;
+	int	count = 0;
+	loff_t	offset;
+	struct page **p;
+	caddr_t	page_addr = NULL;
+
+	dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
+				SVCFH_fmt(&argp->fh),
+				argp->count, (u32) argp->cookie);
+
+	/* Convert byte count to number of words (i.e. >> 2),
+	 * and reserve room for the NULL ptr & eof flag (-2 words) */
+	resp->count = (argp->count >> 2) - 2;
+
+	/* Read directory and encode entries on the fly */
+	fh_copy(&resp->fh, &argp->fh);
+
+	resp->common.err = nfs_ok;
+	resp->buffer = argp->buffer;
+	resp->buflen = resp->count;
+	resp->rqstp = rqstp;
+	offset = argp->cookie;
+
+	nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
+	if (nfserr)
+		RETURN_STATUS(nfserr);
+
+	if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS)
+		RETURN_STATUS(nfserr_notsupp);
+
+	nfserr = nfsd_readdir(rqstp, &resp->fh,
+				     &offset,
+				     &resp->common,
+				     nfs3svc_encode_entry_plus);
+	memcpy(resp->verf, argp->verf, 8);
+	for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+		page_addr = page_address(*p);
+
+		if (((caddr_t)resp->buffer >= page_addr) &&
+		    ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+			count += (caddr_t)resp->buffer - page_addr;
+			break;
+		}
+		count += PAGE_SIZE;
+	}
+	resp->count = count >> 2;
+	if (resp->offset) {
+		if (unlikely(resp->offset1)) {
+			/* we ended up with offset on a page boundary */
+			*resp->offset = htonl(offset >> 32);
+			*resp->offset1 = htonl(offset & 0xffffffff);
+			resp->offset1 = NULL;
+		} else {
+			xdr_encode_hyper(resp->offset, offset);
+		}
+	}
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Get file system stats
+ */
+static __be32
+nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
+					   struct nfsd3_fsstatres *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: FSSTAT(3)   %s\n",
+				SVCFH_fmt(&argp->fh));
+
+	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
+	fh_put(&argp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Get file system info
+ */
+static __be32
+nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
+					   struct nfsd3_fsinfores *resp)
+{
+	__be32	nfserr;
+	u32	max_blocksize = svc_max_payload(rqstp);
+
+	dprintk("nfsd: FSINFO(3)   %s\n",
+				SVCFH_fmt(&argp->fh));
+
+	resp->f_rtmax  = max_blocksize;
+	resp->f_rtpref = max_blocksize;
+	resp->f_rtmult = PAGE_SIZE;
+	resp->f_wtmax  = max_blocksize;
+	resp->f_wtpref = max_blocksize;
+	resp->f_wtmult = PAGE_SIZE;
+	resp->f_dtpref = PAGE_SIZE;
+	resp->f_maxfilesize = ~(u32) 0;
+	resp->f_properties = NFS3_FSF_DEFAULT;
+
+	nfserr = fh_verify(rqstp, &argp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+
+	/* Check special features of the file system. May request
+	 * different read/write sizes for file systems known to have
+	 * problems with large blocks */
+	if (nfserr == 0) {
+		struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb;
+
+		/* Note that we don't care for remote fs's here */
+		if (sb->s_magic == MSDOS_SUPER_MAGIC) {
+			resp->f_properties = NFS3_FSF_BILLYBOY;
+		}
+		resp->f_maxfilesize = sb->s_maxbytes;
+	}
+
+	fh_put(&argp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Get pathconf info for the specified file
+ */
+static __be32
+nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle      *argp,
+					     struct nfsd3_pathconfres *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: PATHCONF(3) %s\n",
+				SVCFH_fmt(&argp->fh));
+
+	/* Set default pathconf */
+	resp->p_link_max = 255;		/* at least */
+	resp->p_name_max = 255;		/* at least */
+	resp->p_no_trunc = 0;
+	resp->p_chown_restricted = 1;
+	resp->p_case_insensitive = 0;
+	resp->p_case_preserving = 1;
+
+	nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
+
+	if (nfserr == 0) {
+		struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb;
+
+		/* Note that we don't care for remote fs's here */
+		switch (sb->s_magic) {
+		case EXT2_SUPER_MAGIC:
+			resp->p_link_max = EXT2_LINK_MAX;
+			resp->p_name_max = EXT2_NAME_LEN;
+			break;
+		case MSDOS_SUPER_MAGIC:
+			resp->p_case_insensitive = 1;
+			resp->p_case_preserving  = 0;
+			break;
+		}
+	}
+
+	fh_put(&argp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+
+/*
+ * Commit a file (range) to stable storage.
+ */
+static __be32
+nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp,
+					   struct nfsd3_commitres  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: COMMIT(3)   %s %u@%Lu\n",
+				SVCFH_fmt(&argp->fh),
+				argp->count,
+				(unsigned long long) argp->offset);
+
+	if (argp->offset > NFS_OFFSET_MAX)
+		RETURN_STATUS(nfserr_inval);
+
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count);
+
+	RETURN_STATUS(nfserr);
+}
+
+
+/*
+ * NFSv3 Server procedures.
+ * Only the results of non-idempotent operations are cached.
+ */
+#define nfs3svc_decode_fhandleargs	nfs3svc_decode_fhandle
+#define nfs3svc_encode_attrstatres	nfs3svc_encode_attrstat
+#define nfs3svc_encode_wccstatres	nfs3svc_encode_wccstat
+#define nfsd3_mkdirargs			nfsd3_createargs
+#define nfsd3_readdirplusargs		nfsd3_readdirargs
+#define nfsd3_fhandleargs		nfsd_fhandle
+#define nfsd3_fhandleres		nfsd3_attrstat
+#define nfsd3_attrstatres		nfsd3_attrstat
+#define nfsd3_wccstatres		nfsd3_attrstat
+#define nfsd3_createres			nfsd3_diropres
+#define nfsd3_voidres			nfsd3_voidargs
+struct nfsd3_voidargs { int dummy; };
+
+#define PROC(name, argt, rest, relt, cache, respsize)	\
+ { (svc_procfunc) nfsd3_proc_##name,		\
+   (kxdrproc_t) nfs3svc_decode_##argt##args,	\
+   (kxdrproc_t) nfs3svc_encode_##rest##res,	\
+   (kxdrproc_t) nfs3svc_release_##relt,		\
+   sizeof(struct nfsd3_##argt##args),		\
+   sizeof(struct nfsd3_##rest##res),		\
+   0,						\
+   cache,					\
+   respsize,					\
+ }
+
+#define ST 1		/* status*/
+#define FH 17		/* filehandle with length */
+#define AT 21		/* attributes */
+#define pAT (1+AT)	/* post attributes - conditional */
+#define WC (7+pAT)	/* WCC attributes */
+
+static struct svc_procedure		nfsd_procedures3[22] = {
+	[NFS3PROC_NULL] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_null,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_voidres,
+		.pc_argsize = sizeof(struct nfsd3_voidargs),
+		.pc_ressize = sizeof(struct nfsd3_voidres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST,
+	},
+	[NFS3PROC_GETATTR] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_getattr,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_attrstatres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+		.pc_ressize = sizeof(struct nfsd3_attrstatres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+AT,
+	},
+	[NFS3PROC_SETATTR] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_setattr,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_sattrargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_sattrargs),
+		.pc_ressize = sizeof(struct nfsd3_wccstatres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+WC,
+	},
+	[NFS3PROC_LOOKUP] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_lookup,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_diropres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+		.pc_argsize = sizeof(struct nfsd3_diropargs),
+		.pc_ressize = sizeof(struct nfsd3_diropres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+FH+pAT+pAT,
+	},
+	[NFS3PROC_ACCESS] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_access,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_accessargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_accessres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_accessargs),
+		.pc_ressize = sizeof(struct nfsd3_accessres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+pAT+1,
+	},
+	[NFS3PROC_READLINK] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_readlink,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_readlinkargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_readlinkres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_readlinkargs),
+		.pc_ressize = sizeof(struct nfsd3_readlinkres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
+	},
+	[NFS3PROC_READ] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_read,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_readargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_readres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_readargs),
+		.pc_ressize = sizeof(struct nfsd3_readres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
+	},
+	[NFS3PROC_WRITE] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_write,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_writeargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_writeres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_writeargs),
+		.pc_ressize = sizeof(struct nfsd3_writeres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+WC+4,
+	},
+	[NFS3PROC_CREATE] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_create,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_createargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+		.pc_argsize = sizeof(struct nfsd3_createargs),
+		.pc_ressize = sizeof(struct nfsd3_createres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+(1+FH+pAT)+WC,
+	},
+	[NFS3PROC_MKDIR] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_mkdir,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_mkdirargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+		.pc_argsize = sizeof(struct nfsd3_mkdirargs),
+		.pc_ressize = sizeof(struct nfsd3_createres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+(1+FH+pAT)+WC,
+	},
+	[NFS3PROC_SYMLINK] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_symlink,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_symlinkargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+		.pc_argsize = sizeof(struct nfsd3_symlinkargs),
+		.pc_ressize = sizeof(struct nfsd3_createres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+(1+FH+pAT)+WC,
+	},
+	[NFS3PROC_MKNOD] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_mknod,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_mknodargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+		.pc_argsize = sizeof(struct nfsd3_mknodargs),
+		.pc_ressize = sizeof(struct nfsd3_createres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+(1+FH+pAT)+WC,
+	},
+	[NFS3PROC_REMOVE] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_remove,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_diropargs),
+		.pc_ressize = sizeof(struct nfsd3_wccstatres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+WC,
+	},
+	[NFS3PROC_RMDIR] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_rmdir,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_diropargs),
+		.pc_ressize = sizeof(struct nfsd3_wccstatres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+WC,
+	},
+	[NFS3PROC_RENAME] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_rename,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_renameargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_renameres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+		.pc_argsize = sizeof(struct nfsd3_renameargs),
+		.pc_ressize = sizeof(struct nfsd3_renameres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+WC+WC,
+	},
+	[NFS3PROC_LINK] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_link,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_linkargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_linkres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+		.pc_argsize = sizeof(struct nfsd3_linkargs),
+		.pc_ressize = sizeof(struct nfsd3_linkres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+pAT+WC,
+	},
+	[NFS3PROC_READDIR] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_readdir,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_readdirargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_readdirargs),
+		.pc_ressize = sizeof(struct nfsd3_readdirres),
+		.pc_cachetype = RC_NOCACHE,
+	},
+	[NFS3PROC_READDIRPLUS] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_readdirplus,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_readdirplusargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_readdirplusargs),
+		.pc_ressize = sizeof(struct nfsd3_readdirres),
+		.pc_cachetype = RC_NOCACHE,
+	},
+	[NFS3PROC_FSSTAT] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_fsstat,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_fsstatres,
+		.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+		.pc_ressize = sizeof(struct nfsd3_fsstatres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+pAT+2*6+1,
+	},
+	[NFS3PROC_FSINFO] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_fsinfo,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_fsinfores,
+		.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+		.pc_ressize = sizeof(struct nfsd3_fsinfores),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+pAT+12,
+	},
+	[NFS3PROC_PATHCONF] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_pathconf,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_pathconfres,
+		.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+		.pc_ressize = sizeof(struct nfsd3_pathconfres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+pAT+6,
+	},
+	[NFS3PROC_COMMIT] = {
+		.pc_func = (svc_procfunc) nfsd3_proc_commit,
+		.pc_decode = (kxdrproc_t) nfs3svc_decode_commitargs,
+		.pc_encode = (kxdrproc_t) nfs3svc_encode_commitres,
+		.pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd3_commitargs),
+		.pc_ressize = sizeof(struct nfsd3_commitres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+WC+2,
+	},
+};
+
+struct svc_version	nfsd_version3 = {
+		.vs_vers	= 3,
+		.vs_nproc	= 22,
+		.vs_proc	= nfsd_procedures3,
+		.vs_dispatch	= nfsd_dispatch,
+		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+};
diff --git a/kernel/fs/nfsd/nfs3xdr.c b/kernel/fs/nfsd/nfs3xdr.c
new file mode 100644
index 000000000..e4b2b4322
--- /dev/null
+++ b/kernel/fs/nfsd/nfs3xdr.c
@@ -0,0 +1,1114 @@
+/*
+ * XDR support for nfsd/protocol version 3.
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ *
+ * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()!
+ */
+
+#include <linux/namei.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include "xdr3.h"
+#include "auth.h"
+#include "netns.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_XDR
+
+
+/*
+ * Mapping of S_IF* types to NFS file types
+ */
+static u32	nfs3_ftypes[] = {
+	NF3NON,  NF3FIFO, NF3CHR, NF3BAD,
+	NF3DIR,  NF3BAD,  NF3BLK, NF3BAD,
+	NF3REG,  NF3BAD,  NF3LNK, NF3BAD,
+	NF3SOCK, NF3BAD,  NF3LNK, NF3BAD,
+};
+
+/*
+ * XDR functions for basic NFS types
+ */
+static __be32 *
+encode_time3(__be32 *p, struct timespec *time)
+{
+	*p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec);
+	return p;
+}
+
+static __be32 *
+decode_time3(__be32 *p, struct timespec *time)
+{
+	time->tv_sec = ntohl(*p++);
+	time->tv_nsec = ntohl(*p++);
+	return p;
+}
+
+static __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	unsigned int size;
+	fh_init(fhp, NFS3_FHSIZE);
+	size = ntohl(*p++);
+	if (size > NFS3_FHSIZE)
+		return NULL;
+
+	memcpy(&fhp->fh_handle.fh_base, p, size);
+	fhp->fh_handle.fh_size = size;
+	return p + XDR_QUADLEN(size);
+}
+
+/* Helper function for NFSv3 ACL code */
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	return decode_fh(p, fhp);
+}
+
+static __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	unsigned int size = fhp->fh_handle.fh_size;
+	*p++ = htonl(size);
+	if (size) p[XDR_QUADLEN(size)-1]=0;
+	memcpy(p, &fhp->fh_handle.fh_base, size);
+	return p + XDR_QUADLEN(size);
+}
+
+/*
+ * Decode a file name and make sure that the path contains
+ * no slashes or null bytes.
+ */
+static __be32 *
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
+{
+	char		*name;
+	unsigned int	i;
+
+	if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) {
+		for (i = 0, name = *namp; i < *lenp; i++, name++) {
+			if (*name == '\0' || *name == '/')
+				return NULL;
+		}
+	}
+
+	return p;
+}
+
+static __be32 *
+decode_sattr3(__be32 *p, struct iattr *iap)
+{
+	u32	tmp;
+
+	iap->ia_valid = 0;
+
+	if (*p++) {
+		iap->ia_valid |= ATTR_MODE;
+		iap->ia_mode = ntohl(*p++);
+	}
+	if (*p++) {
+		iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++));
+		if (uid_valid(iap->ia_uid))
+			iap->ia_valid |= ATTR_UID;
+	}
+	if (*p++) {
+		iap->ia_gid = make_kgid(&init_user_ns, ntohl(*p++));
+		if (gid_valid(iap->ia_gid))
+			iap->ia_valid |= ATTR_GID;
+	}
+	if (*p++) {
+		u64	newsize;
+
+		iap->ia_valid |= ATTR_SIZE;
+		p = xdr_decode_hyper(p, &newsize);
+		iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX);
+	}
+	if ((tmp = ntohl(*p++)) == 1) {	/* set to server time */
+		iap->ia_valid |= ATTR_ATIME;
+	} else if (tmp == 2) {		/* set to client time */
+		iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
+		iap->ia_atime.tv_sec = ntohl(*p++);
+		iap->ia_atime.tv_nsec = ntohl(*p++);
+	}
+	if ((tmp = ntohl(*p++)) == 1) {	/* set to server time */
+		iap->ia_valid |= ATTR_MTIME;
+	} else if (tmp == 2) {		/* set to client time */
+		iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
+		iap->ia_mtime.tv_sec = ntohl(*p++);
+		iap->ia_mtime.tv_nsec = ntohl(*p++);
+	}
+	return p;
+}
+
+static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp)
+{
+	u64 f;
+	switch(fsid_source(fhp)) {
+	default:
+	case FSIDSOURCE_DEV:
+		p = xdr_encode_hyper(p, (u64)huge_encode_dev
+				     (d_inode(fhp->fh_dentry)->i_sb->s_dev));
+		break;
+	case FSIDSOURCE_FSID:
+		p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
+		break;
+	case FSIDSOURCE_UUID:
+		f = ((u64*)fhp->fh_export->ex_uuid)[0];
+		f ^= ((u64*)fhp->fh_export->ex_uuid)[1];
+		p = xdr_encode_hyper(p, f);
+		break;
+	}
+	return p;
+}
+
+static __be32 *
+encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
+	      struct kstat *stat)
+{
+	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
+	*p++ = htonl((u32) (stat->mode & S_IALLUGO));
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
+	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
+	if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
+		p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
+	} else {
+		p = xdr_encode_hyper(p, (u64) stat->size);
+	}
+	p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
+	*p++ = htonl((u32) MAJOR(stat->rdev));
+	*p++ = htonl((u32) MINOR(stat->rdev));
+	p = encode_fsid(p, fhp);
+	p = xdr_encode_hyper(p, stat->ino);
+	p = encode_time3(p, &stat->atime);
+	p = encode_time3(p, &stat->mtime);
+	p = encode_time3(p, &stat->ctime);
+
+	return p;
+}
+
+static __be32 *
+encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+{
+	/* Attributes to follow */
+	*p++ = xdr_one;
+	return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr);
+}
+
+/*
+ * Encode post-operation attributes.
+ * The inode may be NULL if the call failed because of a stale file
+ * handle. In this case, no attributes are returned.
+ */
+static __be32 *
+encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+{
+	struct dentry *dentry = fhp->fh_dentry;
+	if (dentry && d_really_is_positive(dentry)) {
+	        __be32 err;
+		struct kstat stat;
+
+		err = fh_getattr(fhp, &stat);
+		if (!err) {
+			*p++ = xdr_one;		/* attributes follow */
+			lease_get_mtime(d_inode(dentry), &stat.mtime);
+			return encode_fattr3(rqstp, p, fhp, &stat);
+		}
+	}
+	*p++ = xdr_zero;
+	return p;
+}
+
+/* Helper for NFSv3 ACLs */
+__be32 *
+nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+{
+	return encode_post_op_attr(rqstp, p, fhp);
+}
+
+/*
+ * Enocde weak cache consistency data
+ */
+static __be32 *
+encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+{
+	struct dentry	*dentry = fhp->fh_dentry;
+
+	if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) {
+		if (fhp->fh_pre_saved) {
+			*p++ = xdr_one;
+			p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size);
+			p = encode_time3(p, &fhp->fh_pre_mtime);
+			p = encode_time3(p, &fhp->fh_pre_ctime);
+		} else {
+			*p++ = xdr_zero;
+		}
+		return encode_saved_post_attr(rqstp, p, fhp);
+	}
+	/* no pre- or post-attrs */
+	*p++ = xdr_zero;
+	return encode_post_op_attr(rqstp, p, fhp);
+}
+
+/*
+ * Fill in the post_op attr for the wcc data
+ */
+void fill_post_wcc(struct svc_fh *fhp)
+{
+	__be32 err;
+
+	if (fhp->fh_post_saved)
+		printk("nfsd: inode locked twice during operation.\n");
+
+	err = fh_getattr(fhp, &fhp->fh_post_attr);
+	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
+	if (err) {
+		fhp->fh_post_saved = 0;
+		/* Grab the ctime anyway - set_change_info might use it */
+		fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
+	} else
+		fhp->fh_post_saved = 1;
+}
+
+/*
+ * XDR decode functions
+ */
+int
+nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_sattrargs *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	p = decode_sattr3(p, &args->attrs);
+
+	if ((args->check_guard = ntohl(*p++)) != 0) { 
+		struct timespec time; 
+		p = decode_time3(p, &time);
+		args->guardtime = time.tv_sec;
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_diropargs *args)
+{
+	if (!(p = decode_fh(p, &args->fh))
+	 || !(p = decode_filename(p, &args->name, &args->len)))
+		return 0;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_accessargs *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	args->access = ntohl(*p++);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_readargs *args)
+{
+	unsigned int len;
+	int v;
+	u32 max_blocksize = svc_max_payload(rqstp);
+
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	p = xdr_decode_hyper(p, &args->offset);
+
+	args->count = ntohl(*p++);
+	len = min(args->count, max_blocksize);
+
+	/* set up the kvec */
+	v=0;
+	while (len > 0) {
+		struct page *p = *(rqstp->rq_next_page++);
+
+		rqstp->rq_vec[v].iov_base = page_address(p);
+		rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
+		len -= rqstp->rq_vec[v].iov_len;
+		v++;
+	}
+	args->vlen = v;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_writeargs *args)
+{
+	unsigned int len, v, hdr, dlen;
+	u32 max_blocksize = svc_max_payload(rqstp);
+
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	p = xdr_decode_hyper(p, &args->offset);
+
+	args->count = ntohl(*p++);
+	args->stable = ntohl(*p++);
+	len = args->len = ntohl(*p++);
+	/*
+	 * The count must equal the amount of data passed.
+	 */
+	if (args->count != args->len)
+		return 0;
+
+	/*
+	 * Check to make sure that we got the right number of
+	 * bytes.
+	 */
+	hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
+	dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
+		- hdr;
+	/*
+	 * Round the length of the data which was specified up to
+	 * the next multiple of XDR units and then compare that
+	 * against the length which was actually received.
+	 * Note that when RPCSEC/GSS (for example) is used, the
+	 * data buffer can be padded so dlen might be larger
+	 * than required.  It must never be smaller.
+	 */
+	if (dlen < XDR_QUADLEN(len)*4)
+		return 0;
+
+	if (args->count > max_blocksize) {
+		args->count = max_blocksize;
+		len = args->len = max_blocksize;
+	}
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+	v = 0;
+	while (len > rqstp->rq_vec[v].iov_len) {
+		len -= rqstp->rq_vec[v].iov_len;
+		v++;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
+	}
+	rqstp->rq_vec[v].iov_len = len;
+	args->vlen = v + 1;
+	return 1;
+}
+
+int
+nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_createargs *args)
+{
+	if (!(p = decode_fh(p, &args->fh))
+	 || !(p = decode_filename(p, &args->name, &args->len)))
+		return 0;
+
+	switch (args->createmode = ntohl(*p++)) {
+	case NFS3_CREATE_UNCHECKED:
+	case NFS3_CREATE_GUARDED:
+		p = decode_sattr3(p, &args->attrs);
+		break;
+	case NFS3_CREATE_EXCLUSIVE:
+		args->verf = p;
+		p += 2;
+		break;
+	default:
+		return 0;
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+int
+nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_createargs *args)
+{
+	if (!(p = decode_fh(p, &args->fh)) ||
+	    !(p = decode_filename(p, &args->name, &args->len)))
+		return 0;
+	p = decode_sattr3(p, &args->attrs);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_symlinkargs *args)
+{
+	unsigned int len, avail;
+	char *old, *new;
+	struct kvec *vec;
+
+	if (!(p = decode_fh(p, &args->ffh)) ||
+	    !(p = decode_filename(p, &args->fname, &args->flen))
+		)
+		return 0;
+	p = decode_sattr3(p, &args->attrs);
+
+	/* now decode the pathname, which might be larger than the first page.
+	 * As we have to check for nul's anyway, we copy it into a new page
+	 * This page appears in the rq_res.pages list, but as pages_len is always
+	 * 0, it won't get in the way
+	 */
+	len = ntohl(*p++);
+	if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
+		return 0;
+	args->tname = new = page_address(*(rqstp->rq_next_page++));
+	args->tlen = len;
+	/* first copy and check from the first page */
+	old = (char*)p;
+	vec = &rqstp->rq_arg.head[0];
+	avail = vec->iov_len - (old - (char*)vec->iov_base);
+	while (len && avail && *old) {
+		*new++ = *old++;
+		len--;
+		avail--;
+	}
+	/* now copy next page if there is one */
+	if (len && !avail && rqstp->rq_arg.page_len) {
+		avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE);
+		old = page_address(rqstp->rq_arg.pages[0]);
+	}
+	while (len && avail && *old) {
+		*new++ = *old++;
+		len--;
+		avail--;
+	}
+	*new = '\0';
+	if (len)
+		return 0;
+
+	return 1;
+}
+
+int
+nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_mknodargs *args)
+{
+	if (!(p = decode_fh(p, &args->fh))
+	 || !(p = decode_filename(p, &args->name, &args->len)))
+		return 0;
+
+	args->ftype = ntohl(*p++);
+
+	if (args->ftype == NF3BLK  || args->ftype == NF3CHR
+	 || args->ftype == NF3SOCK || args->ftype == NF3FIFO)
+		p = decode_sattr3(p, &args->attrs);
+
+	if (args->ftype == NF3BLK || args->ftype == NF3CHR) {
+		args->major = ntohl(*p++);
+		args->minor = ntohl(*p++);
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_renameargs *args)
+{
+	if (!(p = decode_fh(p, &args->ffh))
+	 || !(p = decode_filename(p, &args->fname, &args->flen))
+	 || !(p = decode_fh(p, &args->tfh))
+	 || !(p = decode_filename(p, &args->tname, &args->tlen)))
+		return 0;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_readlinkargs *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	args->buffer = page_address(*(rqstp->rq_next_page++));
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_linkargs *args)
+{
+	if (!(p = decode_fh(p, &args->ffh))
+	 || !(p = decode_fh(p, &args->tfh))
+	 || !(p = decode_filename(p, &args->tname, &args->tlen)))
+		return 0;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_readdirargs *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	p = xdr_decode_hyper(p, &args->cookie);
+	args->verf   = p; p += 2;
+	args->dircount = ~0;
+	args->count  = ntohl(*p++);
+	args->count  = min_t(u32, args->count, PAGE_SIZE);
+	args->buffer = page_address(*(rqstp->rq_next_page++));
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_readdirargs *args)
+{
+	int len;
+	u32 max_blocksize = svc_max_payload(rqstp);
+
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	p = xdr_decode_hyper(p, &args->cookie);
+	args->verf     = p; p += 2;
+	args->dircount = ntohl(*p++);
+	args->count    = ntohl(*p++);
+
+	len = args->count = min(args->count, max_blocksize);
+	while (len > 0) {
+		struct page *p = *(rqstp->rq_next_page++);
+		if (!args->buffer)
+			args->buffer = page_address(p);
+		len -= PAGE_SIZE;
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_commitargs *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	p = xdr_decode_hyper(p, &args->offset);
+	args->count = ntohl(*p++);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * XDR encode functions
+ */
+/*
+ * There must be an encoding function for void results so svc_process
+ * will work properly.
+ */
+int
+nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* GETATTR */
+int
+nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_attrstat *resp)
+{
+	if (resp->status == 0) {
+		lease_get_mtime(d_inode(resp->fh.fh_dentry),
+				&resp->stat.mtime);
+		p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
+	}
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* SETATTR, REMOVE, RMDIR */
+int
+nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_attrstat *resp)
+{
+	p = encode_wcc_data(rqstp, p, &resp->fh);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* LOOKUP */
+int
+nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_diropres *resp)
+{
+	if (resp->status == 0) {
+		p = encode_fh(p, &resp->fh);
+		p = encode_post_op_attr(rqstp, p, &resp->fh);
+	}
+	p = encode_post_op_attr(rqstp, p, &resp->dirfh);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* ACCESS */
+int
+nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_accessres *resp)
+{
+	p = encode_post_op_attr(rqstp, p, &resp->fh);
+	if (resp->status == 0)
+		*p++ = htonl(resp->access);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* READLINK */
+int
+nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_readlinkres *resp)
+{
+	p = encode_post_op_attr(rqstp, p, &resp->fh);
+	if (resp->status == 0) {
+		*p++ = htonl(resp->len);
+		xdr_ressize_check(rqstp, p);
+		rqstp->rq_res.page_len = resp->len;
+		if (resp->len & 3) {
+			/* need to pad the tail */
+			rqstp->rq_res.tail[0].iov_base = p;
+			*p = 0;
+			rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
+		}
+		return 1;
+	} else
+		return xdr_ressize_check(rqstp, p);
+}
+
+/* READ */
+int
+nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_readres *resp)
+{
+	p = encode_post_op_attr(rqstp, p, &resp->fh);
+	if (resp->status == 0) {
+		*p++ = htonl(resp->count);
+		*p++ = htonl(resp->eof);
+		*p++ = htonl(resp->count);	/* xdr opaque count */
+		xdr_ressize_check(rqstp, p);
+		/* now update rqstp->rq_res to reflect data as well */
+		rqstp->rq_res.page_len = resp->count;
+		if (resp->count & 3) {
+			/* need to pad the tail */
+			rqstp->rq_res.tail[0].iov_base = p;
+			*p = 0;
+			rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
+		}
+		return 1;
+	} else
+		return xdr_ressize_check(rqstp, p);
+}
+
+/* WRITE */
+int
+nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_writeres *resp)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	p = encode_wcc_data(rqstp, p, &resp->fh);
+	if (resp->status == 0) {
+		*p++ = htonl(resp->count);
+		*p++ = htonl(resp->committed);
+		*p++ = htonl(nn->nfssvc_boot.tv_sec);
+		*p++ = htonl(nn->nfssvc_boot.tv_usec);
+	}
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* CREATE, MKDIR, SYMLINK, MKNOD */
+int
+nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_diropres *resp)
+{
+	if (resp->status == 0) {
+		*p++ = xdr_one;
+		p = encode_fh(p, &resp->fh);
+		p = encode_post_op_attr(rqstp, p, &resp->fh);
+	}
+	p = encode_wcc_data(rqstp, p, &resp->dirfh);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* RENAME */
+int
+nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_renameres *resp)
+{
+	p = encode_wcc_data(rqstp, p, &resp->ffh);
+	p = encode_wcc_data(rqstp, p, &resp->tfh);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* LINK */
+int
+nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_linkres *resp)
+{
+	p = encode_post_op_attr(rqstp, p, &resp->fh);
+	p = encode_wcc_data(rqstp, p, &resp->tfh);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* READDIR */
+int
+nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_readdirres *resp)
+{
+	p = encode_post_op_attr(rqstp, p, &resp->fh);
+
+	if (resp->status == 0) {
+		/* stupid readdir cookie */
+		memcpy(p, resp->verf, 8); p += 2;
+		xdr_ressize_check(rqstp, p);
+		if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE)
+			return 1; /*No room for trailer */
+		rqstp->rq_res.page_len = (resp->count) << 2;
+
+		/* add the 'tail' to the end of the 'head' page - page 0. */
+		rqstp->rq_res.tail[0].iov_base = p;
+		*p++ = 0;		/* no more entries */
+		*p++ = htonl(resp->common.err == nfserr_eof);
+		rqstp->rq_res.tail[0].iov_len = 2<<2;
+		return 1;
+	} else
+		return xdr_ressize_check(rqstp, p);
+}
+
+static __be32 *
+encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
+	     int namlen, u64 ino)
+{
+	*p++ = xdr_one;				 /* mark entry present */
+	p    = xdr_encode_hyper(p, ino);	 /* file id */
+	p    = xdr_encode_array(p, name, namlen);/* name length & name */
+
+	cd->offset = p;				/* remember pointer */
+	p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */
+
+	return p;
+}
+
+static __be32
+compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
+		const char *name, int namlen)
+{
+	struct svc_export	*exp;
+	struct dentry		*dparent, *dchild;
+	__be32 rv = nfserr_noent;
+
+	dparent = cd->fh.fh_dentry;
+	exp  = cd->fh.fh_export;
+
+	if (isdotent(name, namlen)) {
+		if (namlen == 2) {
+			dchild = dget_parent(dparent);
+			/* filesystem root - cannot return filehandle for ".." */
+			if (dchild == dparent)
+				goto out;
+		} else
+			dchild = dget(dparent);
+	} else
+		dchild = lookup_one_len(name, dparent, namlen);
+	if (IS_ERR(dchild))
+		return rv;
+	if (d_mountpoint(dchild))
+		goto out;
+	if (d_really_is_negative(dchild))
+		goto out;
+	rv = fh_compose(fhp, exp, dchild, &cd->fh);
+out:
+	dput(dchild);
+	return rv;
+}
+
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+{
+	struct svc_fh	*fh = &cd->scratch;
+	__be32 err;
+
+	fh_init(fh, NFS3_FHSIZE);
+	err = compose_entry_fh(cd, fh, name, namlen);
+	if (err) {
+		*p++ = 0;
+		*p++ = 0;
+		goto out;
+	}
+	p = encode_post_op_attr(cd->rqstp, p, fh);
+	*p++ = xdr_one;			/* yes, a file handle follows */
+	p = encode_fh(p, fh);
+out:
+	fh_put(fh);
+	return p;
+}
+
+/*
+ * Encode a directory entry. This one works for both normal readdir
+ * and readdirplus.
+ * The normal readdir reply requires 2 (fileid) + 1 (stringlen)
+ * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen.
+ * 
+ * The readdirplus baggage is 1+21 words for post_op_attr, plus the
+ * file handle.
+ */
+
+#define NFS3_ENTRY_BAGGAGE	(2 + 1 + 2 + 1)
+#define NFS3_ENTRYPLUS_BAGGAGE	(1 + 21 + 1 + (NFS3_FHSIZE >> 2))
+static int
+encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
+	     loff_t offset, u64 ino, unsigned int d_type, int plus)
+{
+	struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
+		       					common);
+	__be32		*p = cd->buffer;
+	caddr_t		curr_page_addr = NULL;
+	struct page **	page;
+	int		slen;		/* string (name) length */
+	int		elen;		/* estimated entry length in words */
+	int		num_entry_words = 0;	/* actual number of words */
+
+	if (cd->offset) {
+		u64 offset64 = offset;
+
+		if (unlikely(cd->offset1)) {
+			/* we ended up with offset on a page boundary */
+			*cd->offset = htonl(offset64 >> 32);
+			*cd->offset1 = htonl(offset64 & 0xffffffff);
+			cd->offset1 = NULL;
+		} else {
+			xdr_encode_hyper(cd->offset, offset64);
+		}
+	}
+
+	/*
+	dprintk("encode_entry(%.*s @%ld%s)\n",
+		namlen, name, (long) offset, plus? " plus" : "");
+	 */
+
+	/* truncate filename if too long */
+	namlen = min(namlen, NFS3_MAXNAMLEN);
+
+	slen = XDR_QUADLEN(namlen);
+	elen = slen + NFS3_ENTRY_BAGGAGE
+		+ (plus? NFS3_ENTRYPLUS_BAGGAGE : 0);
+
+	if (cd->buflen < elen) {
+		cd->common.err = nfserr_toosmall;
+		return -EINVAL;
+	}
+
+	/* determine which page in rq_respages[] we are currently filling */
+	for (page = cd->rqstp->rq_respages + 1;
+				page < cd->rqstp->rq_next_page; page++) {
+		curr_page_addr = page_address(*page);
+
+		if (((caddr_t)cd->buffer >= curr_page_addr) &&
+		    ((caddr_t)cd->buffer <  curr_page_addr + PAGE_SIZE))
+			break;
+	}
+
+	if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) {
+		/* encode entry in current page */
+
+		p = encode_entry_baggage(cd, p, name, namlen, ino);
+
+		if (plus)
+			p = encode_entryplus_baggage(cd, p, name, namlen);
+		num_entry_words = p - cd->buffer;
+	} else if (*(page+1) != NULL) {
+		/* temporarily encode entry into next page, then move back to
+		 * current and next page in rq_respages[] */
+		__be32 *p1, *tmp;
+		int len1, len2;
+
+		/* grab next page for temporary storage of entry */
+		p1 = tmp = page_address(*(page+1));
+
+		p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
+
+		if (plus)
+			p1 = encode_entryplus_baggage(cd, p1, name, namlen);
+
+		/* determine entry word length and lengths to go in pages */
+		num_entry_words = p1 - tmp;
+		len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer;
+		if ((num_entry_words << 2) < len1) {
+			/* the actual number of words in the entry is less
+			 * than elen and can still fit in the current page
+			 */
+			memmove(p, tmp, num_entry_words << 2);
+			p += num_entry_words;
+
+			/* update offset */
+			cd->offset = cd->buffer + (cd->offset - tmp);
+		} else {
+			unsigned int offset_r = (cd->offset - tmp) << 2;
+
+			/* update pointer to offset location.
+			 * This is a 64bit quantity, so we need to
+			 * deal with 3 cases:
+			 *  -	entirely in first page
+			 *  -	entirely in second page
+			 *  -	4 bytes in each page
+			 */
+			if (offset_r + 8 <= len1) {
+				cd->offset = p + (cd->offset - tmp);
+			} else if (offset_r >= len1) {
+				cd->offset -= len1 >> 2;
+			} else {
+				/* sitting on the fence */
+				BUG_ON(offset_r != len1 - 4);
+				cd->offset = p + (cd->offset - tmp);
+				cd->offset1 = tmp;
+			}
+
+			len2 = (num_entry_words << 2) - len1;
+
+			/* move from temp page to current and next pages */
+			memmove(p, tmp, len1);
+			memmove(tmp, (caddr_t)tmp+len1, len2);
+
+			p = tmp + (len2 >> 2);
+		}
+	}
+	else {
+		cd->common.err = nfserr_toosmall;
+		return -EINVAL;
+	}
+
+	cd->buflen -= num_entry_words;
+	cd->buffer = p;
+	cd->common.err = nfs_ok;
+	return 0;
+
+}
+
+int
+nfs3svc_encode_entry(void *cd, const char *name,
+		     int namlen, loff_t offset, u64 ino, unsigned int d_type)
+{
+	return encode_entry(cd, name, namlen, offset, ino, d_type, 0);
+}
+
+int
+nfs3svc_encode_entry_plus(void *cd, const char *name,
+			  int namlen, loff_t offset, u64 ino,
+			  unsigned int d_type)
+{
+	return encode_entry(cd, name, namlen, offset, ino, d_type, 1);
+}
+
+/* FSSTAT */
+int
+nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_fsstatres *resp)
+{
+	struct kstatfs	*s = &resp->stats;
+	u64		bs = s->f_bsize;
+
+	*p++ = xdr_zero;	/* no post_op_attr */
+
+	if (resp->status == 0) {
+		p = xdr_encode_hyper(p, bs * s->f_blocks);	/* total bytes */
+		p = xdr_encode_hyper(p, bs * s->f_bfree);	/* free bytes */
+		p = xdr_encode_hyper(p, bs * s->f_bavail);	/* user available bytes */
+		p = xdr_encode_hyper(p, s->f_files);	/* total inodes */
+		p = xdr_encode_hyper(p, s->f_ffree);	/* free inodes */
+		p = xdr_encode_hyper(p, s->f_ffree);	/* user available inodes */
+		*p++ = htonl(resp->invarsec);	/* mean unchanged time */
+	}
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* FSINFO */
+int
+nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_fsinfores *resp)
+{
+	*p++ = xdr_zero;	/* no post_op_attr */
+
+	if (resp->status == 0) {
+		*p++ = htonl(resp->f_rtmax);
+		*p++ = htonl(resp->f_rtpref);
+		*p++ = htonl(resp->f_rtmult);
+		*p++ = htonl(resp->f_wtmax);
+		*p++ = htonl(resp->f_wtpref);
+		*p++ = htonl(resp->f_wtmult);
+		*p++ = htonl(resp->f_dtpref);
+		p = xdr_encode_hyper(p, resp->f_maxfilesize);
+		*p++ = xdr_one;
+		*p++ = xdr_zero;
+		*p++ = htonl(resp->f_properties);
+	}
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* PATHCONF */
+int
+nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_pathconfres *resp)
+{
+	*p++ = xdr_zero;	/* no post_op_attr */
+
+	if (resp->status == 0) {
+		*p++ = htonl(resp->p_link_max);
+		*p++ = htonl(resp->p_name_max);
+		*p++ = htonl(resp->p_no_trunc);
+		*p++ = htonl(resp->p_chown_restricted);
+		*p++ = htonl(resp->p_case_insensitive);
+		*p++ = htonl(resp->p_case_preserving);
+	}
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* COMMIT */
+int
+nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_commitres *resp)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	p = encode_wcc_data(rqstp, p, &resp->fh);
+	/* Write verifier */
+	if (resp->status == 0) {
+		*p++ = htonl(nn->nfssvc_boot.tv_sec);
+		*p++ = htonl(nn->nfssvc_boot.tv_usec);
+	}
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * XDR release functions
+ */
+int
+nfs3svc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_attrstat *resp)
+{
+	fh_put(&resp->fh);
+	return 1;
+}
+
+int
+nfs3svc_release_fhandle2(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd3_fhandle_pair *resp)
+{
+	fh_put(&resp->fh1);
+	fh_put(&resp->fh2);
+	return 1;
+}
diff --git a/kernel/fs/nfsd/nfs4acl.c b/kernel/fs/nfsd/nfs4acl.c
new file mode 100644
index 000000000..67242bf7c
--- /dev/null
+++ b/kernel/fs/nfsd/nfs4acl.c
@@ -0,0 +1,896 @@
+/*
+ *  Common NFSv4 ACL handling code.
+ *
+ *  Copyright (c) 2002, 2003 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *  Jeff Sedlak <jsedlak@umich.edu>
+ *  J. Bruce Fields <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <linux/nfs_fs.h>
+#include "nfsfh.h"
+#include "nfsd.h"
+#include "acl.h"
+#include "vfs.h"
+
+#define NFS4_ACL_TYPE_DEFAULT	0x01
+#define NFS4_ACL_DIR		0x02
+#define NFS4_ACL_OWNER		0x04
+
+/* mode bit translations: */
+#define NFS4_READ_MODE (NFS4_ACE_READ_DATA)
+#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_APPEND_DATA)
+#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE
+#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE)
+#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)
+
+/* We don't support these bits; insist they be neither allowed nor denied */
+#define NFS4_MASK_UNSUPP (NFS4_ACE_DELETE | NFS4_ACE_WRITE_OWNER \
+		| NFS4_ACE_READ_NAMED_ATTRS | NFS4_ACE_WRITE_NAMED_ATTRS)
+
+/* flags used to simulate posix default ACLs */
+#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
+		| NFS4_ACE_DIRECTORY_INHERIT_ACE)
+
+#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \
+		| NFS4_ACE_INHERIT_ONLY_ACE \
+		| NFS4_ACE_IDENTIFIER_GROUP)
+
+#define MASK_EQUAL(mask1, mask2) \
+	( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
+
+static u32
+mask_from_posix(unsigned short perm, unsigned int flags)
+{
+	int mask = NFS4_ANYONE_MODE;
+
+	if (flags & NFS4_ACL_OWNER)
+		mask |= NFS4_OWNER_MODE;
+	if (perm & ACL_READ)
+		mask |= NFS4_READ_MODE;
+	if (perm & ACL_WRITE)
+		mask |= NFS4_WRITE_MODE;
+	if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR))
+		mask |= NFS4_ACE_DELETE_CHILD;
+	if (perm & ACL_EXECUTE)
+		mask |= NFS4_EXECUTE_MODE;
+	return mask;
+}
+
+static u32
+deny_mask_from_posix(unsigned short perm, u32 flags)
+{
+	u32 mask = 0;
+
+	if (perm & ACL_READ)
+		mask |= NFS4_READ_MODE;
+	if (perm & ACL_WRITE)
+		mask |= NFS4_WRITE_MODE;
+	if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR))
+		mask |= NFS4_ACE_DELETE_CHILD;
+	if (perm & ACL_EXECUTE)
+		mask |= NFS4_EXECUTE_MODE;
+	return mask;
+}
+
+/* XXX: modify functions to return NFS errors; they're only ever
+ * used by nfs code, after all.... */
+
+/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the
+ * side of being more restrictive, so the mode bit mapping below is
+ * pessimistic.  An optimistic version would be needed to handle DENY's,
+ * but we espect to coalesce all ALLOWs and DENYs before mapping to mode
+ * bits. */
+
+static void
+low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
+{
+	u32 write_mode = NFS4_WRITE_MODE;
+
+	if (flags & NFS4_ACL_DIR)
+		write_mode |= NFS4_ACE_DELETE_CHILD;
+	*mode = 0;
+	if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE)
+		*mode |= ACL_READ;
+	if ((perm & write_mode) == write_mode)
+		*mode |= ACL_WRITE;
+	if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE)
+		*mode |= ACL_EXECUTE;
+}
+
+struct ace_container {
+	struct nfs4_ace  *ace;
+	struct list_head  ace_l;
+};
+
+static short ace2type(struct nfs4_ace *);
+static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
+				unsigned int);
+
+int
+nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+		struct nfs4_acl **acl)
+{
+	struct inode *inode = d_inode(dentry);
+	int error = 0;
+	struct posix_acl *pacl = NULL, *dpacl = NULL;
+	unsigned int flags = 0;
+	int size = 0;
+
+	pacl = get_acl(inode, ACL_TYPE_ACCESS);
+	if (!pacl)
+		pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+
+	if (IS_ERR(pacl))
+		return PTR_ERR(pacl);
+
+	/* allocate for worst case: one (deny, allow) pair each: */
+	size += 2 * pacl->a_count;
+
+	if (S_ISDIR(inode->i_mode)) {
+		flags = NFS4_ACL_DIR;
+		dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+		if (IS_ERR(dpacl)) {
+			error = PTR_ERR(dpacl);
+			goto rel_pacl;
+		}
+
+		if (dpacl)
+			size += 2 * dpacl->a_count;
+	}
+
+	*acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL);
+	if (*acl == NULL) {
+		error = -ENOMEM;
+		goto out;
+	}
+	(*acl)->naces = 0;
+
+	_posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
+
+	if (dpacl)
+		_posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT);
+
+out:
+	posix_acl_release(dpacl);
+rel_pacl:
+	posix_acl_release(pacl);
+	return error;
+}
+
+struct posix_acl_summary {
+	unsigned short owner;
+	unsigned short users;
+	unsigned short group;
+	unsigned short groups;
+	unsigned short other;
+	unsigned short mask;
+};
+
+static void
+summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas)
+{
+	struct posix_acl_entry *pa, *pe;
+
+	/*
+	 * Only pas.users and pas.groups need initialization; previous
+	 * posix_acl_valid() calls ensure that the other fields will be
+	 * initialized in the following loop.  But, just to placate gcc:
+	 */
+	memset(pas, 0, sizeof(*pas));
+	pas->mask = 07;
+
+	pe = acl->a_entries + acl->a_count;
+
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+		switch (pa->e_tag) {
+			case ACL_USER_OBJ:
+				pas->owner = pa->e_perm;
+				break;
+			case ACL_GROUP_OBJ:
+				pas->group = pa->e_perm;
+				break;
+			case ACL_USER:
+				pas->users |= pa->e_perm;
+				break;
+			case ACL_GROUP:
+				pas->groups |= pa->e_perm;
+				break;
+			case ACL_OTHER:
+				pas->other = pa->e_perm;
+				break;
+			case ACL_MASK:
+				pas->mask = pa->e_perm;
+				break;
+		}
+	}
+	/* We'll only care about effective permissions: */
+	pas->users &= pas->mask;
+	pas->group &= pas->mask;
+	pas->groups &= pas->mask;
+}
+
+/* We assume the acl has been verified with posix_acl_valid. */
+static void
+_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
+						unsigned int flags)
+{
+	struct posix_acl_entry *pa, *group_owner_entry;
+	struct nfs4_ace *ace;
+	struct posix_acl_summary pas;
+	unsigned short deny;
+	int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ?
+		NFS4_INHERITANCE_FLAGS | NFS4_ACE_INHERIT_ONLY_ACE : 0);
+
+	BUG_ON(pacl->a_count < 3);
+	summarize_posix_acl(pacl, &pas);
+
+	pa = pacl->a_entries;
+	ace = acl->aces + acl->naces;
+
+	/* We could deny everything not granted by the owner: */
+	deny = ~pas.owner;
+	/*
+	 * but it is equivalent (and simpler) to deny only what is not
+	 * granted by later entries:
+	 */
+	deny &= pas.users | pas.group | pas.groups | pas.other;
+	if (deny) {
+		ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
+		ace->flag = eflag;
+		ace->access_mask = deny_mask_from_posix(deny, flags);
+		ace->whotype = NFS4_ACL_WHO_OWNER;
+		ace++;
+		acl->naces++;
+	}
+
+	ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+	ace->flag = eflag;
+	ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER);
+	ace->whotype = NFS4_ACL_WHO_OWNER;
+	ace++;
+	acl->naces++;
+	pa++;
+
+	while (pa->e_tag == ACL_USER) {
+		deny = ~(pa->e_perm & pas.mask);
+		deny &= pas.groups | pas.group | pas.other;
+		if (deny) {
+			ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
+			ace->flag = eflag;
+			ace->access_mask = deny_mask_from_posix(deny, flags);
+			ace->whotype = NFS4_ACL_WHO_NAMED;
+			ace->who_uid = pa->e_uid;
+			ace++;
+			acl->naces++;
+		}
+		ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+		ace->flag = eflag;
+		ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
+						   flags);
+		ace->whotype = NFS4_ACL_WHO_NAMED;
+		ace->who_uid = pa->e_uid;
+		ace++;
+		acl->naces++;
+		pa++;
+	}
+
+	/* In the case of groups, we apply allow ACEs first, then deny ACEs,
+	 * since a user can be in more than one group.  */
+
+	/* allow ACEs */
+
+	group_owner_entry = pa;
+
+	ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+	ace->flag = eflag;
+	ace->access_mask = mask_from_posix(pas.group, flags);
+	ace->whotype = NFS4_ACL_WHO_GROUP;
+	ace++;
+	acl->naces++;
+	pa++;
+
+	while (pa->e_tag == ACL_GROUP) {
+		ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+		ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
+		ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
+						   flags);
+		ace->whotype = NFS4_ACL_WHO_NAMED;
+		ace->who_gid = pa->e_gid;
+		ace++;
+		acl->naces++;
+		pa++;
+	}
+
+	/* deny ACEs */
+
+	pa = group_owner_entry;
+
+	deny = ~pas.group & pas.other;
+	if (deny) {
+		ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
+		ace->flag = eflag;
+		ace->access_mask = deny_mask_from_posix(deny, flags);
+		ace->whotype = NFS4_ACL_WHO_GROUP;
+		ace++;
+		acl->naces++;
+	}
+	pa++;
+
+	while (pa->e_tag == ACL_GROUP) {
+		deny = ~(pa->e_perm & pas.mask);
+		deny &= pas.other;
+		if (deny) {
+			ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
+			ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
+			ace->access_mask = deny_mask_from_posix(deny, flags);
+			ace->whotype = NFS4_ACL_WHO_NAMED;
+			ace->who_gid = pa->e_gid;
+			ace++;
+			acl->naces++;
+		}
+		pa++;
+	}
+
+	if (pa->e_tag == ACL_MASK)
+		pa++;
+	ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+	ace->flag = eflag;
+	ace->access_mask = mask_from_posix(pa->e_perm, flags);
+	ace->whotype = NFS4_ACL_WHO_EVERYONE;
+	acl->naces++;
+}
+
+static bool
+pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2)
+{
+	if (pace1->e_tag != pace2->e_tag)
+		return pace1->e_tag > pace2->e_tag;
+	if (pace1->e_tag == ACL_USER)
+		return uid_gt(pace1->e_uid, pace2->e_uid);
+	if (pace1->e_tag == ACL_GROUP)
+		return gid_gt(pace1->e_gid, pace2->e_gid);
+	return false;
+}
+
+static void
+sort_pacl_range(struct posix_acl *pacl, int start, int end) {
+	int sorted = 0, i;
+	struct posix_acl_entry tmp;
+
+	/* We just do a bubble sort; easy to do in place, and we're not
+	 * expecting acl's to be long enough to justify anything more. */
+	while (!sorted) {
+		sorted = 1;
+		for (i = start; i < end; i++) {
+			if (pace_gt(&pacl->a_entries[i],
+				    &pacl->a_entries[i+1])) {
+				sorted = 0;
+				tmp = pacl->a_entries[i];
+				pacl->a_entries[i] = pacl->a_entries[i+1];
+				pacl->a_entries[i+1] = tmp;
+			}
+		}
+	}
+}
+
+static void
+sort_pacl(struct posix_acl *pacl)
+{
+	/* posix_acl_valid requires that users and groups be in order
+	 * by uid/gid. */
+	int i, j;
+
+	/* no users or groups */
+	if (!pacl || pacl->a_count <= 4)
+		return;
+
+	i = 1;
+	while (pacl->a_entries[i].e_tag == ACL_USER)
+		i++;
+	sort_pacl_range(pacl, 1, i-1);
+
+	BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ);
+	j = ++i;
+	while (pacl->a_entries[j].e_tag == ACL_GROUP)
+		j++;
+	sort_pacl_range(pacl, i, j-1);
+	return;
+}
+
+/*
+ * While processing the NFSv4 ACE, this maintains bitmasks representing
+ * which permission bits have been allowed and which denied to a given
+ * entity: */
+struct posix_ace_state {
+	u32 allow;
+	u32 deny;
+};
+
+struct posix_user_ace_state {
+	union {
+		kuid_t uid;
+		kgid_t gid;
+	};
+	struct posix_ace_state perms;
+};
+
+struct posix_ace_state_array {
+	int n;
+	struct posix_user_ace_state aces[];
+};
+
+/*
+ * While processing the NFSv4 ACE, this maintains the partial permissions
+ * calculated so far: */
+
+struct posix_acl_state {
+	int empty;
+	struct posix_ace_state owner;
+	struct posix_ace_state group;
+	struct posix_ace_state other;
+	struct posix_ace_state everyone;
+	struct posix_ace_state mask; /* Deny unused in this case */
+	struct posix_ace_state_array *users;
+	struct posix_ace_state_array *groups;
+};
+
+static int
+init_state(struct posix_acl_state *state, int cnt)
+{
+	int alloc;
+
+	memset(state, 0, sizeof(struct posix_acl_state));
+	state->empty = 1;
+	/*
+	 * In the worst case, each individual acl could be for a distinct
+	 * named user or group, but we don't no which, so we allocate
+	 * enough space for either:
+	 */
+	alloc = sizeof(struct posix_ace_state_array)
+		+ cnt*sizeof(struct posix_user_ace_state);
+	state->users = kzalloc(alloc, GFP_KERNEL);
+	if (!state->users)
+		return -ENOMEM;
+	state->groups = kzalloc(alloc, GFP_KERNEL);
+	if (!state->groups) {
+		kfree(state->users);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void
+free_state(struct posix_acl_state *state) {
+	kfree(state->users);
+	kfree(state->groups);
+}
+
+static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate)
+{
+	state->mask.allow |= astate->allow;
+}
+
+static struct posix_acl *
+posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
+{
+	struct posix_acl_entry *pace;
+	struct posix_acl *pacl;
+	int nace;
+	int i;
+
+	/*
+	 * ACLs with no ACEs are treated differently in the inheritable
+	 * and effective cases: when there are no inheritable ACEs,
+	 * calls ->set_acl with a NULL ACL structure.
+	 */
+	if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
+		return NULL;
+
+	/*
+	 * When there are no effective ACEs, the following will end
+	 * up setting a 3-element effective posix ACL with all
+	 * permissions zero.
+	 */
+	if (!state->users->n && !state->groups->n)
+		nace = 3;
+	else /* Note we also include a MASK ACE in this case: */
+		nace = 4 + state->users->n + state->groups->n;
+	pacl = posix_acl_alloc(nace, GFP_KERNEL);
+	if (!pacl)
+		return ERR_PTR(-ENOMEM);
+
+	pace = pacl->a_entries;
+	pace->e_tag = ACL_USER_OBJ;
+	low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
+
+	for (i=0; i < state->users->n; i++) {
+		pace++;
+		pace->e_tag = ACL_USER;
+		low_mode_from_nfs4(state->users->aces[i].perms.allow,
+					&pace->e_perm, flags);
+		pace->e_uid = state->users->aces[i].uid;
+		add_to_mask(state, &state->users->aces[i].perms);
+	}
+
+	pace++;
+	pace->e_tag = ACL_GROUP_OBJ;
+	low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
+	add_to_mask(state, &state->group);
+
+	for (i=0; i < state->groups->n; i++) {
+		pace++;
+		pace->e_tag = ACL_GROUP;
+		low_mode_from_nfs4(state->groups->aces[i].perms.allow,
+					&pace->e_perm, flags);
+		pace->e_gid = state->groups->aces[i].gid;
+		add_to_mask(state, &state->groups->aces[i].perms);
+	}
+
+	if (state->users->n || state->groups->n) {
+		pace++;
+		pace->e_tag = ACL_MASK;
+		low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
+	}
+
+	pace++;
+	pace->e_tag = ACL_OTHER;
+	low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
+
+	return pacl;
+}
+
+static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
+{
+	/* Allow all bits in the mask not already denied: */
+	astate->allow |= mask & ~astate->deny;
+}
+
+static inline void deny_bits(struct posix_ace_state *astate, u32 mask)
+{
+	/* Deny all bits in the mask not already allowed: */
+	astate->deny |= mask & ~astate->allow;
+}
+
+static int find_uid(struct posix_acl_state *state, kuid_t uid)
+{
+	struct posix_ace_state_array *a = state->users;
+	int i;
+
+	for (i = 0; i < a->n; i++)
+		if (uid_eq(a->aces[i].uid, uid))
+			return i;
+	/* Not found: */
+	a->n++;
+	a->aces[i].uid = uid;
+	a->aces[i].perms.allow = state->everyone.allow;
+	a->aces[i].perms.deny  = state->everyone.deny;
+
+	return i;
+}
+
+static int find_gid(struct posix_acl_state *state, kgid_t gid)
+{
+	struct posix_ace_state_array *a = state->groups;
+	int i;
+
+	for (i = 0; i < a->n; i++)
+		if (gid_eq(a->aces[i].gid, gid))
+			return i;
+	/* Not found: */
+	a->n++;
+	a->aces[i].gid = gid;
+	a->aces[i].perms.allow = state->everyone.allow;
+	a->aces[i].perms.deny  = state->everyone.deny;
+
+	return i;
+}
+
+static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)
+{
+	int i;
+
+	for (i=0; i < a->n; i++)
+		deny_bits(&a->aces[i].perms, mask);
+}
+
+static void allow_bits_array(struct posix_ace_state_array *a, u32 mask)
+{
+	int i;
+
+	for (i=0; i < a->n; i++)
+		allow_bits(&a->aces[i].perms, mask);
+}
+
+static void process_one_v4_ace(struct posix_acl_state *state,
+				struct nfs4_ace *ace)
+{
+	u32 mask = ace->access_mask;
+	int i;
+
+	state->empty = 0;
+
+	switch (ace2type(ace)) {
+	case ACL_USER_OBJ:
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->owner, mask);
+		} else {
+			deny_bits(&state->owner, mask);
+		}
+		break;
+	case ACL_USER:
+		i = find_uid(state, ace->who_uid);
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->users->aces[i].perms, mask);
+		} else {
+			deny_bits(&state->users->aces[i].perms, mask);
+			mask = state->users->aces[i].perms.deny;
+			deny_bits(&state->owner, mask);
+		}
+		break;
+	case ACL_GROUP_OBJ:
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->group, mask);
+		} else {
+			deny_bits(&state->group, mask);
+			mask = state->group.deny;
+			deny_bits(&state->owner, mask);
+			deny_bits(&state->everyone, mask);
+			deny_bits_array(state->users, mask);
+			deny_bits_array(state->groups, mask);
+		}
+		break;
+	case ACL_GROUP:
+		i = find_gid(state, ace->who_gid);
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->groups->aces[i].perms, mask);
+		} else {
+			deny_bits(&state->groups->aces[i].perms, mask);
+			mask = state->groups->aces[i].perms.deny;
+			deny_bits(&state->owner, mask);
+			deny_bits(&state->group, mask);
+			deny_bits(&state->everyone, mask);
+			deny_bits_array(state->users, mask);
+			deny_bits_array(state->groups, mask);
+		}
+		break;
+	case ACL_OTHER:
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->owner, mask);
+			allow_bits(&state->group, mask);
+			allow_bits(&state->other, mask);
+			allow_bits(&state->everyone, mask);
+			allow_bits_array(state->users, mask);
+			allow_bits_array(state->groups, mask);
+		} else {
+			deny_bits(&state->owner, mask);
+			deny_bits(&state->group, mask);
+			deny_bits(&state->other, mask);
+			deny_bits(&state->everyone, mask);
+			deny_bits_array(state->users, mask);
+			deny_bits_array(state->groups, mask);
+		}
+	}
+}
+
+static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl,
+		struct posix_acl **pacl, struct posix_acl **dpacl,
+		unsigned int flags)
+{
+	struct posix_acl_state effective_acl_state, default_acl_state;
+	struct nfs4_ace *ace;
+	int ret;
+
+	ret = init_state(&effective_acl_state, acl->naces);
+	if (ret)
+		return ret;
+	ret = init_state(&default_acl_state, acl->naces);
+	if (ret)
+		goto out_estate;
+	ret = -EINVAL;
+	for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
+		if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
+		    ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
+			goto out_dstate;
+		if (ace->flag & ~NFS4_SUPPORTED_FLAGS)
+			goto out_dstate;
+		if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) {
+			process_one_v4_ace(&effective_acl_state, ace);
+			continue;
+		}
+		if (!(flags & NFS4_ACL_DIR))
+			goto out_dstate;
+		/*
+		 * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT
+		 * is set, we're effectively turning on the other.  That's OK,
+		 * according to rfc 3530.
+		 */
+		process_one_v4_ace(&default_acl_state, ace);
+
+		if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE))
+			process_one_v4_ace(&effective_acl_state, ace);
+	}
+	*pacl = posix_state_to_acl(&effective_acl_state, flags);
+	if (IS_ERR(*pacl)) {
+		ret = PTR_ERR(*pacl);
+		*pacl = NULL;
+		goto out_dstate;
+	}
+	*dpacl = posix_state_to_acl(&default_acl_state,
+						flags | NFS4_ACL_TYPE_DEFAULT);
+	if (IS_ERR(*dpacl)) {
+		ret = PTR_ERR(*dpacl);
+		*dpacl = NULL;
+		posix_acl_release(*pacl);
+		*pacl = NULL;
+		goto out_dstate;
+	}
+	sort_pacl(*pacl);
+	sort_pacl(*dpacl);
+	ret = 0;
+out_dstate:
+	free_state(&default_acl_state);
+out_estate:
+	free_state(&effective_acl_state);
+	return ret;
+}
+
+__be32
+nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct nfs4_acl *acl)
+{
+	__be32 error;
+	int host_error;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct posix_acl *pacl = NULL, *dpacl = NULL;
+	unsigned int flags = 0;
+
+	/* Get inode */
+	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
+	if (error)
+		return error;
+
+	dentry = fhp->fh_dentry;
+	inode = d_inode(dentry);
+
+	if (!inode->i_op->set_acl || !IS_POSIXACL(inode))
+		return nfserr_attrnotsupp;
+
+	if (S_ISDIR(inode->i_mode))
+		flags = NFS4_ACL_DIR;
+
+	host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+	if (host_error == -EINVAL)
+		return nfserr_attrnotsupp;
+	if (host_error < 0)
+		goto out_nfserr;
+
+	host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS);
+	if (host_error < 0)
+		goto out_release;
+
+	if (S_ISDIR(inode->i_mode)) {
+		host_error = inode->i_op->set_acl(inode, dpacl,
+						  ACL_TYPE_DEFAULT);
+	}
+
+out_release:
+	posix_acl_release(pacl);
+	posix_acl_release(dpacl);
+out_nfserr:
+	if (host_error == -EOPNOTSUPP)
+		return nfserr_attrnotsupp;
+	else
+		return nfserrno(host_error);
+}
+
+
+static short
+ace2type(struct nfs4_ace *ace)
+{
+	switch (ace->whotype) {
+		case NFS4_ACL_WHO_NAMED:
+			return (ace->flag & NFS4_ACE_IDENTIFIER_GROUP ?
+					ACL_GROUP : ACL_USER);
+		case NFS4_ACL_WHO_OWNER:
+			return ACL_USER_OBJ;
+		case NFS4_ACL_WHO_GROUP:
+			return ACL_GROUP_OBJ;
+		case NFS4_ACL_WHO_EVERYONE:
+			return ACL_OTHER;
+	}
+	BUG();
+	return -1;
+}
+
+/*
+ * return the size of the struct nfs4_acl required to represent an acl
+ * with @entries entries.
+ */
+int nfs4_acl_bytes(int entries)
+{
+	return sizeof(struct nfs4_acl) + entries * sizeof(struct nfs4_ace);
+}
+
+static struct {
+	char *string;
+	int   stringlen;
+	int type;
+} s2t_map[] = {
+	{
+		.string    = "OWNER@",
+		.stringlen = sizeof("OWNER@") - 1,
+		.type      = NFS4_ACL_WHO_OWNER,
+	},
+	{
+		.string    = "GROUP@",
+		.stringlen = sizeof("GROUP@") - 1,
+		.type      = NFS4_ACL_WHO_GROUP,
+	},
+	{
+		.string    = "EVERYONE@",
+		.stringlen = sizeof("EVERYONE@") - 1,
+		.type      = NFS4_ACL_WHO_EVERYONE,
+	},
+};
+
+int
+nfs4_acl_get_whotype(char *p, u32 len)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(s2t_map); i++) {
+		if (s2t_map[i].stringlen == len &&
+				0 == memcmp(s2t_map[i].string, p, len))
+			return s2t_map[i].type;
+	}
+	return NFS4_ACL_WHO_NAMED;
+}
+
+__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who)
+{
+	__be32 *p;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(s2t_map); i++) {
+		if (s2t_map[i].type != who)
+			continue;
+		p = xdr_reserve_space(xdr, s2t_map[i].stringlen + 4);
+		if (!p)
+			return nfserr_resource;
+		p = xdr_encode_opaque(p, s2t_map[i].string,
+					s2t_map[i].stringlen);
+		return 0;
+	}
+	WARN_ON_ONCE(1);
+	return nfserr_serverfault;
+}
diff --git a/kernel/fs/nfsd/nfs4callback.c b/kernel/fs/nfsd/nfs4callback.c
new file mode 100644
index 000000000..5694cfb7a
--- /dev/null
+++ b/kernel/fs/nfsd/nfs4callback.c
@@ -0,0 +1,1099 @@
+/*
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/slab.h>
+#include "nfsd.h"
+#include "state.h"
+#include "netns.h"
+#include "xdr4cb.h"
+
+#define NFSDDBG_FACILITY                NFSDDBG_PROC
+
+static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+
+#define NFSPROC4_CB_NULL 0
+#define NFSPROC4_CB_COMPOUND 1
+
+/* Index of predefined Linux callback client operations */
+
+struct nfs4_cb_compound_hdr {
+	/* args */
+	u32		ident;	/* minorversion 0 only */
+	u32		nops;
+	__be32		*nops_p;
+	u32		minorversion;
+	/* res */
+	int		status;
+};
+
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+static __be32 *xdr_encode_empty_array(__be32 *p)
+{
+	*p++ = xdr_zero;
+	return p;
+}
+
+/*
+ * Encode/decode NFSv4 CB basic data types
+ *
+ * Basic NFSv4 callback data types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section
+ * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
+ * 1 Protocol"
+ */
+
+/*
+ *	nfs_cb_opnum4
+ *
+ *	enum nfs_cb_opnum4 {
+ *		OP_CB_GETATTR		= 3,
+ *		  ...
+ *	};
+ */
+enum nfs_cb_opnum4 {
+	OP_CB_GETATTR			= 3,
+	OP_CB_RECALL			= 4,
+	OP_CB_LAYOUTRECALL		= 5,
+	OP_CB_NOTIFY			= 6,
+	OP_CB_PUSH_DELEG		= 7,
+	OP_CB_RECALL_ANY		= 8,
+	OP_CB_RECALLABLE_OBJ_AVAIL	= 9,
+	OP_CB_RECALL_SLOT		= 10,
+	OP_CB_SEQUENCE			= 11,
+	OP_CB_WANTS_CANCELLED		= 12,
+	OP_CB_NOTIFY_LOCK		= 13,
+	OP_CB_NOTIFY_DEVICEID		= 14,
+	OP_CB_ILLEGAL			= 10044
+};
+
+static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(op);
+}
+
+/*
+ * nfs_fh4
+ *
+ *	typedef opaque nfs_fh4<NFS4_FHSIZE>;
+ */
+static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
+{
+	u32 length = fh->fh_size;
+	__be32 *p;
+
+	BUG_ON(length > NFS4_FHSIZE);
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, &fh->fh_base, length);
+}
+
+/*
+ * stateid4
+ *
+ *	struct stateid4 {
+ *		uint32_t	seqid;
+ *		opaque		other[12];
+ *	};
+ */
+static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
+	*p++ = cpu_to_be32(sid->si_generation);
+	xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
+}
+
+/*
+ * sessionid4
+ *
+ *	typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
+ */
+static void encode_sessionid4(struct xdr_stream *xdr,
+			      const struct nfsd4_session *session)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
+	xdr_encode_opaque_fixed(p, session->se_sessionid.data,
+					NFS4_MAX_SESSIONID_LEN);
+}
+
+/*
+ * nfsstat4
+ */
+static const struct {
+	int stat;
+	int errno;
+} nfs_cb_errtbl[] = {
+	{ NFS4_OK,		0		},
+	{ NFS4ERR_PERM,		-EPERM		},
+	{ NFS4ERR_NOENT,	-ENOENT		},
+	{ NFS4ERR_IO,		-EIO		},
+	{ NFS4ERR_NXIO,		-ENXIO		},
+	{ NFS4ERR_ACCESS,	-EACCES		},
+	{ NFS4ERR_EXIST,	-EEXIST		},
+	{ NFS4ERR_XDEV,		-EXDEV		},
+	{ NFS4ERR_NOTDIR,	-ENOTDIR	},
+	{ NFS4ERR_ISDIR,	-EISDIR		},
+	{ NFS4ERR_INVAL,	-EINVAL		},
+	{ NFS4ERR_FBIG,		-EFBIG		},
+	{ NFS4ERR_NOSPC,	-ENOSPC		},
+	{ NFS4ERR_ROFS,		-EROFS		},
+	{ NFS4ERR_MLINK,	-EMLINK		},
+	{ NFS4ERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFS4ERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFS4ERR_DQUOT,	-EDQUOT		},
+	{ NFS4ERR_STALE,	-ESTALE		},
+	{ NFS4ERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFS4ERR_SERVERFAULT,	-ESERVERFAULT	},
+	{ NFS4ERR_BADTYPE,	-EBADTYPE	},
+	{ NFS4ERR_LOCKED,	-EAGAIN		},
+	{ NFS4ERR_RESOURCE,	-EREMOTEIO	},
+	{ NFS4ERR_SYMLINK,	-ELOOP		},
+	{ NFS4ERR_OP_ILLEGAL,	-EOPNOTSUPP	},
+	{ NFS4ERR_DEADLOCK,	-EDEADLK	},
+	{ -1,			-EIO		}
+};
+
+/*
+ * If we cannot translate the error, the recovery routines should
+ * handle it.
+ *
+ * Note: remaining NFSv4 error codes have values > 10000, so should
+ * not conflict with native Linux error codes.
+ */
+static int nfs_cb_stat_to_errno(int status)
+{
+	int i;
+
+	for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
+		if (nfs_cb_errtbl[i].stat == status)
+			return nfs_cb_errtbl[i].errno;
+	}
+
+	dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
+	return -status;
+}
+
+static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
+			       int *status)
+{
+	__be32 *p;
+	u32 op;
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	op = be32_to_cpup(p++);
+	if (unlikely(op != expected))
+		goto out_unexpected;
+	*status = nfs_cb_stat_to_errno(be32_to_cpup(p));
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+out_unexpected:
+	dprintk("NFSD: Callback server returned operation %d but "
+		"we issued a request for %d\n", op, expected);
+	return -EIO;
+}
+
+/*
+ * CB_COMPOUND4args
+ *
+ *	struct CB_COMPOUND4args {
+ *		utf8str_cs	tag;
+ *		uint32_t	minorversion;
+ *		uint32_t	callback_ident;
+ *		nfs_cb_argop4	argarray<>;
+ *	};
+*/
+static void encode_cb_compound4args(struct xdr_stream *xdr,
+				    struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 * p;
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+	p = xdr_encode_empty_array(p);		/* empty tag */
+	*p++ = cpu_to_be32(hdr->minorversion);
+	*p++ = cpu_to_be32(hdr->ident);
+
+	hdr->nops_p = p;
+	*p = cpu_to_be32(hdr->nops);		/* argarray element count */
+}
+
+/*
+ * Update argarray element count
+ */
+static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
+{
+	BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
+	*hdr->nops_p = cpu_to_be32(hdr->nops);
+}
+
+/*
+ * CB_COMPOUND4res
+ *
+ *	struct CB_COMPOUND4res {
+ *		nfsstat4	status;
+ *		utf8str_cs	tag;
+ *		nfs_cb_resop4	resarray<>;
+ *	};
+ */
+static int decode_cb_compound4res(struct xdr_stream *xdr,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	hdr->status = be32_to_cpup(p++);
+	/* Ignore the tag */
+	length = be32_to_cpup(p++);
+	p = xdr_inline_decode(xdr, length + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	hdr->nops = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * CB_RECALL4args
+ *
+ *	struct CB_RECALL4args {
+ *		stateid4	stateid;
+ *		bool		truncate;
+ *		nfs_fh4		fh;
+ *	};
+ */
+static void encode_cb_recall4args(struct xdr_stream *xdr,
+				  const struct nfs4_delegation *dp,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
+	encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
+
+	p = xdr_reserve_space(xdr, 4);
+	*p++ = xdr_zero;			/* truncate */
+
+	encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle);
+
+	hdr->nops++;
+}
+
+/*
+ * CB_SEQUENCE4args
+ *
+ *	struct CB_SEQUENCE4args {
+ *		sessionid4		csa_sessionid;
+ *		sequenceid4		csa_sequenceid;
+ *		slotid4			csa_slotid;
+ *		slotid4			csa_highest_slotid;
+ *		bool			csa_cachethis;
+ *		referring_call_list4	csa_referring_call_lists<>;
+ *	};
+ */
+static void encode_cb_sequence4args(struct xdr_stream *xdr,
+				    const struct nfsd4_callback *cb,
+				    struct nfs4_cb_compound_hdr *hdr)
+{
+	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
+	__be32 *p;
+
+	if (hdr->minorversion == 0)
+		return;
+
+	encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
+	encode_sessionid4(xdr, session);
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
+	*p++ = cpu_to_be32(session->se_cb_seq_nr);	/* csa_sequenceid */
+	*p++ = xdr_zero;			/* csa_slotid */
+	*p++ = xdr_zero;			/* csa_highest_slotid */
+	*p++ = xdr_zero;			/* csa_cachethis */
+	xdr_encode_empty_array(p);		/* csa_referring_call_lists */
+
+	hdr->nops++;
+}
+
+/*
+ * CB_SEQUENCE4resok
+ *
+ *	struct CB_SEQUENCE4resok {
+ *		sessionid4	csr_sessionid;
+ *		sequenceid4	csr_sequenceid;
+ *		slotid4		csr_slotid;
+ *		slotid4		csr_highest_slotid;
+ *		slotid4		csr_target_highest_slotid;
+ *	};
+ *
+ *	union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
+ *	case NFS4_OK:
+ *		CB_SEQUENCE4resok	csr_resok4;
+ *	default:
+ *		void;
+ *	};
+ *
+ * Our current back channel implmentation supports a single backchannel
+ * with a single slot.
+ */
+static int decode_cb_sequence4resok(struct xdr_stream *xdr,
+				    struct nfsd4_callback *cb)
+{
+	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
+	struct nfs4_sessionid id;
+	int status;
+	__be32 *p;
+	u32 dummy;
+
+	status = -ESERVERFAULT;
+
+	/*
+	 * If the server returns different values for sessionID, slotID or
+	 * sequence number, the server is looney tunes.
+	 */
+	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
+	if (memcmp(id.data, session->se_sessionid.data,
+					NFS4_MAX_SESSIONID_LEN) != 0) {
+		dprintk("NFS: %s Invalid session id\n", __func__);
+		goto out;
+	}
+	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
+
+	dummy = be32_to_cpup(p++);
+	if (dummy != session->se_cb_seq_nr) {
+		dprintk("NFS: %s Invalid sequence number\n", __func__);
+		goto out;
+	}
+
+	dummy = be32_to_cpup(p++);
+	if (dummy != 0) {
+		dprintk("NFS: %s Invalid slotid\n", __func__);
+		goto out;
+	}
+
+	/*
+	 * FIXME: process highest slotid and target highest slotid
+	 */
+	status = 0;
+out:
+	if (status)
+		nfsd4_mark_cb_fault(cb->cb_clp, status);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_cb_sequence4res(struct xdr_stream *xdr,
+				  struct nfsd4_callback *cb)
+{
+	int status;
+
+	if (cb->cb_minorversion == 0)
+		return 0;
+
+	status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status);
+	if (unlikely(status || cb->cb_status))
+		return status;
+
+	return decode_cb_sequence4resok(xdr, cb);
+}
+
+/*
+ * NFSv4.0 and NFSv4.1 XDR encode functions
+ *
+ * NFSv4.0 callback argument types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section 20
+ * of RFC 5661:  "Network File System (NFS) Version 4 Minor Version 1
+ * Protocol".
+ */
+
+/*
+ * NB: Without this zero space reservation, callbacks over krb5p fail
+ */
+static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 void *__unused)
+{
+	xdr_reserve_space(xdr, 0);
+}
+
+/*
+ * 20.2. Operation 4: CB_RECALL - Recall a Delegation
+ */
+static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
+				   const struct nfsd4_callback *cb)
+{
+	const struct nfs4_delegation *dp = cb_to_delegation(cb);
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = cb->cb_clp->cl_cb_ident,
+		.minorversion = cb->cb_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_recall4args(xdr, dp, &hdr);
+	encode_cb_nops(&hdr);
+}
+
+
+/*
+ * NFSv4.0 and NFSv4.1 XDR decode functions
+ *
+ * NFSv4.0 callback result types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section 20
+ * of RFC 5661:  "Network File System (NFS) Version 4 Minor Version 1
+ * Protocol".
+ */
+
+static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
+				void *__unused)
+{
+	return 0;
+}
+
+/*
+ * 20.2. Operation 4: CB_RECALL - Recall a Delegation
+ */
+static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_compound_hdr hdr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		return status;
+
+	if (cb != NULL) {
+		status = decode_cb_sequence4res(xdr, cb);
+		if (unlikely(status || cb->cb_status))
+			return status;
+	}
+
+	return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
+}
+
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ *	struct layoutrecall_file4 {
+ *		nfs_fh4         lor_fh;
+ *		offset4         lor_offset;
+ *		length4         lor_length;
+ *		stateid4        lor_stateid;
+ *	};
+ *
+ *	union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ *	case LAYOUTRECALL4_FILE:
+ *		layoutrecall_file4 lor_layout;
+ *	case LAYOUTRECALL4_FSID:
+ *		fsid4              lor_fsid;
+ *	case LAYOUTRECALL4_ALL:
+ *		void;
+ *	};
+ *
+ *	struct CB_LAYOUTRECALL4args {
+ *		layouttype4             clora_type;
+ *		layoutiomode4           clora_iomode;
+ *		bool                    clora_changed;
+ *		layoutrecall4           clora_recall;
+ *	};
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+				  const struct nfs4_layout_stateid *ls,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p;
+
+	BUG_ON(hdr->minorversion == 0);
+
+	p = xdr_reserve_space(xdr, 5 * 4);
+	*p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
+	*p++ = cpu_to_be32(ls->ls_layout_type);
+	*p++ = cpu_to_be32(IOMODE_ANY);
+	*p++ = cpu_to_be32(1);
+	*p = cpu_to_be32(RETURN_FILE);
+
+	encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
+
+	p = xdr_reserve_space(xdr, 2 * 8);
+	p = xdr_encode_hyper(p, 0);
+	xdr_encode_hyper(p, NFS4_MAX_UINT64);
+
+	encode_stateid4(xdr, &ls->ls_recall_sid);
+
+	hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfsd4_callback *cb)
+{
+	const struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = 0,
+		.minorversion = cb->cb_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_layout4args(xdr, ls, &hdr);
+	encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_compound_hdr hdr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		return status;
+
+	if (cb) {
+		status = decode_cb_sequence4res(xdr, cb);
+		if (unlikely(status || cb->cb_status))
+			return status;
+	}
+	return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status);
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+/*
+ * RPC procedure tables
+ */
+#define PROC(proc, call, argtype, restype)				\
+[NFSPROC4_CLNT_##proc] = {						\
+	.p_proc    = NFSPROC4_CB_##call,				\
+	.p_encode  = (kxdreproc_t)nfs4_xdr_enc_##argtype,		\
+	.p_decode  = (kxdrdproc_t)nfs4_xdr_dec_##restype,		\
+	.p_arglen  = NFS4_enc_##argtype##_sz,				\
+	.p_replen  = NFS4_dec_##restype##_sz,				\
+	.p_statidx = NFSPROC4_CB_##call,				\
+	.p_name    = #proc,						\
+}
+
+static struct rpc_procinfo nfs4_cb_procedures[] = {
+	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
+	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
+#ifdef CONFIG_NFSD_PNFS
+	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
+#endif
+};
+
+static struct rpc_version nfs_cb_version4 = {
+/*
+ * Note on the callback rpc program version number: despite language in rfc
+ * 5661 section 18.36.3 requiring servers to use 4 in this field, the
+ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
+ * in practice that appears to be what implementations use.  The section
+ * 18.36.3 language is expected to be fixed in an erratum.
+ */
+	.number			= 1,
+	.nrprocs		= ARRAY_SIZE(nfs4_cb_procedures),
+	.procs			= nfs4_cb_procedures
+};
+
+static const struct rpc_version *nfs_cb_version[] = {
+	&nfs_cb_version4,
+};
+
+static const struct rpc_program cb_program;
+
+static struct rpc_stat cb_stats = {
+	.program		= &cb_program
+};
+
+#define NFS4_CALLBACK 0x40000000
+static const struct rpc_program cb_program = {
+	.name			= "nfs4_cb",
+	.number			= NFS4_CALLBACK,
+	.nrvers			= ARRAY_SIZE(nfs_cb_version),
+	.version		= nfs_cb_version,
+	.stats			= &cb_stats,
+	.pipe_dir_name		= "nfsd4_cb",
+};
+
+static int max_cb_time(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	return max(nn->nfsd4_lease/10, (time_t)1) * HZ;
+}
+
+static struct rpc_cred *callback_cred;
+
+int set_callback_cred(void)
+{
+	if (callback_cred)
+		return 0;
+	callback_cred = rpc_lookup_machine_cred("nfs");
+	if (!callback_cred)
+		return -ENOMEM;
+	return 0;
+}
+
+static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
+{
+	if (clp->cl_minorversion == 0) {
+		return get_rpccred(callback_cred);
+	} else {
+		struct rpc_auth *auth = client->cl_auth;
+		struct auth_cred acred = {};
+
+		acred.uid = ses->se_cb_sec.uid;
+		acred.gid = ses->se_cb_sec.gid;
+		return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0);
+	}
+}
+
+static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args)
+{
+	struct rpc_xprt *xprt;
+
+	if (args->protocol != XPRT_TRANSPORT_BC_TCP)
+		return rpc_create(args);
+
+	xprt = args->bc_xprt->xpt_bc_xprt;
+	if (xprt) {
+		xprt_get(xprt);
+		return rpc_create_xprt(args, xprt);
+	}
+
+	return rpc_create(args);
+}
+
+static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
+{
+	int maxtime = max_cb_time(clp->net);
+	struct rpc_timeout	timeparms = {
+		.to_initval	= maxtime,
+		.to_retries	= 0,
+		.to_maxval	= maxtime,
+	};
+	struct rpc_create_args args = {
+		.net		= clp->net,
+		.address	= (struct sockaddr *) &conn->cb_addr,
+		.addrsize	= conn->cb_addrlen,
+		.saddress	= (struct sockaddr *) &conn->cb_saddr,
+		.timeout	= &timeparms,
+		.program	= &cb_program,
+		.version	= 0,
+		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
+	};
+	struct rpc_clnt *client;
+	struct rpc_cred *cred;
+
+	if (clp->cl_minorversion == 0) {
+		if (!clp->cl_cred.cr_principal &&
+				(clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
+			return -EINVAL;
+		args.client_name = clp->cl_cred.cr_principal;
+		args.prognumber	= conn->cb_prog;
+		args.protocol = XPRT_TRANSPORT_TCP;
+		args.authflavor = clp->cl_cred.cr_flavor;
+		clp->cl_cb_ident = conn->cb_ident;
+	} else {
+		if (!conn->cb_xprt)
+			return -EINVAL;
+		clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
+		clp->cl_cb_session = ses;
+		args.bc_xprt = conn->cb_xprt;
+		args.prognumber = clp->cl_cb_session->se_cb_prog;
+		args.protocol = conn->cb_xprt->xpt_class->xcl_ident |
+				XPRT_TRANSPORT_BC;
+		args.authflavor = ses->se_cb_sec.flavor;
+	}
+	/* Create RPC client */
+	client = create_backchannel_client(&args);
+	if (IS_ERR(client)) {
+		dprintk("NFSD: couldn't create callback client: %ld\n",
+			PTR_ERR(client));
+		return PTR_ERR(client);
+	}
+	cred = get_backchannel_cred(clp, client, ses);
+	if (IS_ERR(cred)) {
+		rpc_shutdown_client(client);
+		return PTR_ERR(cred);
+	}
+	clp->cl_cb_client = client;
+	clp->cl_cb_cred = cred;
+	return 0;
+}
+
+static void warn_no_callback_path(struct nfs4_client *clp, int reason)
+{
+	dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
+		(int)clp->cl_name.len, clp->cl_name.data, reason);
+}
+
+static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
+{
+	clp->cl_cb_state = NFSD4_CB_DOWN;
+	warn_no_callback_path(clp, reason);
+}
+
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+{
+	clp->cl_cb_state = NFSD4_CB_FAULT;
+	warn_no_callback_path(clp, reason);
+}
+
+static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
+
+	if (task->tk_status)
+		nfsd4_mark_cb_down(clp, task->tk_status);
+	else
+		clp->cl_cb_state = NFSD4_CB_UP;
+}
+
+static const struct rpc_call_ops nfsd4_cb_probe_ops = {
+	/* XXX: release method to ensure we set the cb channel down if
+	 * necessary on early failure? */
+	.rpc_call_done = nfsd4_cb_probe_done,
+};
+
+static struct workqueue_struct *callback_wq;
+
+/*
+ * Poke the callback thread to process any updates to the callback
+ * parameters, and send a null probe.
+ */
+void nfsd4_probe_callback(struct nfs4_client *clp)
+{
+	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+	set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
+	nfsd4_run_cb(&clp->cl_cb_null);
+}
+
+void nfsd4_probe_callback_sync(struct nfs4_client *clp)
+{
+	nfsd4_probe_callback(clp);
+	flush_workqueue(callback_wq);
+}
+
+void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+{
+	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+	spin_lock(&clp->cl_lock);
+	memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
+	spin_unlock(&clp->cl_lock);
+}
+
+/*
+ * There's currently a single callback channel slot.
+ * If the slot is available, then mark it busy.  Otherwise, set the
+ * thread for sleeping on the callback RPC wait queue.
+ */
+static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
+{
+	if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
+		/* Race breaker */
+		if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+			dprintk("%s slot is busy\n", __func__);
+			return false;
+		}
+		rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
+	}
+	return true;
+}
+
+/*
+ * TODO: cb_sequence should support referring call lists, cachethis, multiple
+ * slots, and mark callback channel down on communication errors.
+ */
+static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_client *clp = cb->cb_clp;
+	u32 minorversion = clp->cl_minorversion;
+
+	cb->cb_minorversion = minorversion;
+	if (minorversion) {
+		if (!nfsd41_cb_get_slot(clp, task))
+			return;
+	}
+	rpc_call_start(task);
+}
+
+static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+{
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_client *clp = cb->cb_clp;
+
+	dprintk("%s: minorversion=%d\n", __func__,
+		clp->cl_minorversion);
+
+	if (clp->cl_minorversion) {
+		/* No need for lock, access serialized in nfsd4_cb_prepare */
+		if (!task->tk_status)
+			++clp->cl_cb_session->se_cb_seq_nr;
+		clear_bit(0, &clp->cl_cb_slot_busy);
+		rpc_wake_up_next(&clp->cl_cb_waitq);
+		dprintk("%s: freed slot, new seqid=%d\n", __func__,
+			clp->cl_cb_session->se_cb_seq_nr);
+	}
+
+	/*
+	 * If the backchannel connection was shut down while this
+	 * task was queued, we need to resubmit it after setting up
+	 * a new backchannel connection.
+	 *
+	 * Note that if we lost our callback connection permanently
+	 * the submission code will error out, so we don't need to
+	 * handle that case here.
+	 */
+	if (task->tk_flags & RPC_TASK_KILLED) {
+		task->tk_status = 0;
+		cb->cb_need_restart = true;
+		return;
+	}
+
+	if (cb->cb_status) {
+		WARN_ON_ONCE(task->tk_status);
+		task->tk_status = cb->cb_status;
+	}
+
+	switch (cb->cb_ops->done(cb, task)) {
+	case 0:
+		task->tk_status = 0;
+		rpc_restart_call_prepare(task);
+		return;
+	case 1:
+		break;
+	case -1:
+		/* Network partition? */
+		nfsd4_mark_cb_down(clp, task->tk_status);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void nfsd4_cb_release(void *calldata)
+{
+	struct nfsd4_callback *cb = calldata;
+
+	if (cb->cb_need_restart)
+		nfsd4_run_cb(cb);
+	else
+		cb->cb_ops->release(cb);
+
+}
+
+static const struct rpc_call_ops nfsd4_cb_ops = {
+	.rpc_call_prepare = nfsd4_cb_prepare,
+	.rpc_call_done = nfsd4_cb_done,
+	.rpc_release = nfsd4_cb_release,
+};
+
+int nfsd4_create_callback_queue(void)
+{
+	callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
+	if (!callback_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void nfsd4_destroy_callback_queue(void)
+{
+	destroy_workqueue(callback_wq);
+}
+
+/* must be called under the state lock */
+void nfsd4_shutdown_callback(struct nfs4_client *clp)
+{
+	set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
+	/*
+	 * Note this won't actually result in a null callback;
+	 * instead, nfsd4_run_cb_null() will detect the killed
+	 * client, destroy the rpc client, and stop:
+	 */
+	nfsd4_run_cb(&clp->cl_cb_null);
+	flush_workqueue(callback_wq);
+}
+
+/* requires cl_lock: */
+static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
+{
+	struct nfsd4_session *s;
+	struct nfsd4_conn *c;
+
+	list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
+		list_for_each_entry(c, &s->se_conns, cn_persession) {
+			if (c->cn_flags & NFS4_CDFC4_BACK)
+				return c;
+		}
+	}
+	return NULL;
+}
+
+static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_conn conn;
+	struct nfs4_client *clp = cb->cb_clp;
+	struct nfsd4_session *ses = NULL;
+	struct nfsd4_conn *c;
+	int err;
+
+	/*
+	 * This is either an update, or the client dying; in either case,
+	 * kill the old client:
+	 */
+	if (clp->cl_cb_client) {
+		rpc_shutdown_client(clp->cl_cb_client);
+		clp->cl_cb_client = NULL;
+		put_rpccred(clp->cl_cb_cred);
+		clp->cl_cb_cred = NULL;
+	}
+	if (clp->cl_cb_conn.cb_xprt) {
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+		clp->cl_cb_conn.cb_xprt = NULL;
+	}
+	if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
+		return;
+	spin_lock(&clp->cl_lock);
+	/*
+	 * Only serialized callback code is allowed to clear these
+	 * flags; main nfsd code can only set them:
+	 */
+	BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
+	clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
+	memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+	c = __nfsd4_find_backchannel(clp);
+	if (c) {
+		svc_xprt_get(c->cn_xprt);
+		conn.cb_xprt = c->cn_xprt;
+		ses = c->cn_session;
+	}
+	spin_unlock(&clp->cl_lock);
+
+	err = setup_callback_client(clp, &conn, ses);
+	if (err) {
+		nfsd4_mark_cb_down(clp, err);
+		return;
+	}
+}
+
+static void
+nfsd4_run_cb_work(struct work_struct *work)
+{
+	struct nfsd4_callback *cb =
+		container_of(work, struct nfsd4_callback, cb_work);
+	struct nfs4_client *clp = cb->cb_clp;
+	struct rpc_clnt *clnt;
+
+	if (cb->cb_need_restart) {
+		cb->cb_need_restart = false;
+	} else {
+		if (cb->cb_ops && cb->cb_ops->prepare)
+			cb->cb_ops->prepare(cb);
+	}
+
+	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
+		nfsd4_process_cb_update(cb);
+
+	clnt = clp->cl_cb_client;
+	if (!clnt) {
+		/* Callback channel broken, or client killed; give up: */
+		if (cb->cb_ops && cb->cb_ops->release)
+			cb->cb_ops->release(cb);
+		return;
+	}
+
+	/*
+	 * Don't send probe messages for 4.1 or later.
+	 */
+	if (!cb->cb_ops && clp->cl_minorversion) {
+		clp->cl_cb_state = NFSD4_CB_UP;
+		return;
+	}
+
+	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
+	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+			cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
+}
+
+void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
+{
+	cb->cb_clp = clp;
+	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
+	cb->cb_msg.rpc_argp = cb;
+	cb->cb_msg.rpc_resp = cb;
+	cb->cb_ops = ops;
+	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
+	cb->cb_status = 0;
+	cb->cb_need_restart = false;
+}
+
+void nfsd4_run_cb(struct nfsd4_callback *cb)
+{
+	queue_work(callback_wq, &cb->cb_work);
+}
diff --git a/kernel/fs/nfsd/nfs4idmap.c b/kernel/fs/nfsd/nfs4idmap.c
new file mode 100644
index 000000000..e1b3d3d47
--- /dev/null
+++ b/kernel/fs/nfsd/nfs4idmap.c
@@ -0,0 +1,666 @@
+/*
+ *  Mapping of UID/GIDs to name and vice versa.
+ *
+ *  Copyright (c) 2002, 2003 The Regents of the University of
+ *  Michigan.  All rights reserved.
+ *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <net/net_namespace.h>
+#include "idmap.h"
+#include "nfsd.h"
+#include "netns.h"
+
+/*
+ * Turn off idmapping when using AUTH_SYS.
+ */
+static bool nfs4_disable_idmapping = true;
+module_param(nfs4_disable_idmapping, bool, 0644);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+		"Turn off server's NFSv4 idmapping when using 'sec=sys'");
+
+/*
+ * Cache entry
+ */
+
+/*
+ * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on
+ * that.
+ */
+
+#define IDMAP_TYPE_USER  0
+#define IDMAP_TYPE_GROUP 1
+
+struct ent {
+	struct cache_head h;
+	int               type;		       /* User / Group */
+	u32               id;
+	char              name[IDMAP_NAMESZ];
+	char              authname[IDMAP_NAMESZ];
+};
+
+/* Common entry handling */
+
+#define ENT_HASHBITS          8
+#define ENT_HASHMAX           (1 << ENT_HASHBITS)
+
+static void
+ent_init(struct cache_head *cnew, struct cache_head *citm)
+{
+	struct ent *new = container_of(cnew, struct ent, h);
+	struct ent *itm = container_of(citm, struct ent, h);
+
+	new->id = itm->id;
+	new->type = itm->type;
+
+	strlcpy(new->name, itm->name, sizeof(new->name));
+	strlcpy(new->authname, itm->authname, sizeof(new->name));
+}
+
+static void
+ent_put(struct kref *ref)
+{
+	struct ent *map = container_of(ref, struct ent, h.ref);
+	kfree(map);
+}
+
+static struct cache_head *
+ent_alloc(void)
+{
+	struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL);
+	if (e)
+		return &e->h;
+	else
+		return NULL;
+}
+
+/*
+ * ID -> Name cache
+ */
+
+static uint32_t
+idtoname_hash(struct ent *ent)
+{
+	uint32_t hash;
+
+	hash = hash_str(ent->authname, ENT_HASHBITS);
+	hash = hash_long(hash ^ ent->id, ENT_HASHBITS);
+
+	/* Flip LSB for user/group */
+	if (ent->type == IDMAP_TYPE_GROUP)
+		hash ^= 1;
+
+	return hash;
+}
+
+static void
+idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
+    int *blen)
+{
+ 	struct ent *ent = container_of(ch, struct ent, h);
+	char idstr[11];
+
+	qword_add(bpp, blen, ent->authname);
+	snprintf(idstr, sizeof(idstr), "%u", ent->id);
+	qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user");
+	qword_add(bpp, blen, idstr);
+
+	(*bpp)[-1] = '\n';
+}
+
+static int
+idtoname_match(struct cache_head *ca, struct cache_head *cb)
+{
+	struct ent *a = container_of(ca, struct ent, h);
+	struct ent *b = container_of(cb, struct ent, h);
+
+	return (a->id == b->id && a->type == b->type &&
+	    strcmp(a->authname, b->authname) == 0);
+}
+
+static int
+idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
+{
+	struct ent *ent;
+
+	if (h == NULL) {
+		seq_puts(m, "#domain type id [name]\n");
+		return 0;
+	}
+	ent = container_of(h, struct ent, h);
+	seq_printf(m, "%s %s %u", ent->authname,
+			ent->type == IDMAP_TYPE_GROUP ? "group" : "user",
+			ent->id);
+	if (test_bit(CACHE_VALID, &h->flags))
+		seq_printf(m, " %s", ent->name);
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static void
+warn_no_idmapd(struct cache_detail *detail, int has_died)
+{
+	printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n",
+			has_died ? "died" : "not been started");
+}
+
+
+static int         idtoname_parse(struct cache_detail *, char *, int);
+static struct ent *idtoname_lookup(struct cache_detail *, struct ent *);
+static struct ent *idtoname_update(struct cache_detail *, struct ent *,
+				   struct ent *);
+
+static struct cache_detail idtoname_cache_template = {
+	.owner		= THIS_MODULE,
+	.hash_size	= ENT_HASHMAX,
+	.name		= "nfs4.idtoname",
+	.cache_put	= ent_put,
+	.cache_request	= idtoname_request,
+	.cache_parse	= idtoname_parse,
+	.cache_show	= idtoname_show,
+	.warn_no_listener = warn_no_idmapd,
+	.match		= idtoname_match,
+	.init		= ent_init,
+	.update		= ent_init,
+	.alloc		= ent_alloc,
+};
+
+static int
+idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+	struct ent ent, *res;
+	char *buf1, *bp;
+	int len;
+	int error = -EINVAL;
+
+	if (buf[buflen - 1] != '\n')
+		return (-EINVAL);
+	buf[buflen - 1]= '\0';
+
+	buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (buf1 == NULL)
+		return (-ENOMEM);
+
+	memset(&ent, 0, sizeof(ent));
+
+	/* Authentication name */
+	len = qword_get(&buf, buf1, PAGE_SIZE);
+	if (len <= 0 || len >= IDMAP_NAMESZ)
+		goto out;
+	memcpy(ent.authname, buf1, sizeof(ent.authname));
+
+	/* Type */
+	if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+		goto out;
+	ent.type = strcmp(buf1, "user") == 0 ?
+		IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
+
+	/* ID */
+	if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+		goto out;
+	ent.id = simple_strtoul(buf1, &bp, 10);
+	if (bp == buf1)
+		goto out;
+
+	/* expiry */
+	ent.h.expiry_time = get_expiry(&buf);
+	if (ent.h.expiry_time == 0)
+		goto out;
+
+	error = -ENOMEM;
+	res = idtoname_lookup(cd, &ent);
+	if (!res)
+		goto out;
+
+	/* Name */
+	error = -EINVAL;
+	len = qword_get(&buf, buf1, PAGE_SIZE);
+	if (len < 0 || len >= IDMAP_NAMESZ)
+		goto out;
+	if (len == 0)
+		set_bit(CACHE_NEGATIVE, &ent.h.flags);
+	else
+		memcpy(ent.name, buf1, sizeof(ent.name));
+	error = -ENOMEM;
+	res = idtoname_update(cd, &ent, res);
+	if (res == NULL)
+		goto out;
+
+	cache_put(&res->h, cd);
+	error = 0;
+out:
+	kfree(buf1);
+	return error;
+}
+
+static struct ent *
+idtoname_lookup(struct cache_detail *cd, struct ent *item)
+{
+	struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
+						    idtoname_hash(item));
+	if (ch)
+		return container_of(ch, struct ent, h);
+	else
+		return NULL;
+}
+
+static struct ent *
+idtoname_update(struct cache_detail *cd, struct ent *new, struct ent *old)
+{
+	struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h,
+						    idtoname_hash(new));
+	if (ch)
+		return container_of(ch, struct ent, h);
+	else
+		return NULL;
+}
+
+
+/*
+ * Name -> ID cache
+ */
+
+static inline int
+nametoid_hash(struct ent *ent)
+{
+	return hash_str(ent->name, ENT_HASHBITS);
+}
+
+static void
+nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
+    int *blen)
+{
+ 	struct ent *ent = container_of(ch, struct ent, h);
+
+	qword_add(bpp, blen, ent->authname);
+	qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user");
+	qword_add(bpp, blen, ent->name);
+
+	(*bpp)[-1] = '\n';
+}
+
+static int
+nametoid_match(struct cache_head *ca, struct cache_head *cb)
+{
+	struct ent *a = container_of(ca, struct ent, h);
+	struct ent *b = container_of(cb, struct ent, h);
+
+	return (a->type == b->type && strcmp(a->name, b->name) == 0 &&
+	    strcmp(a->authname, b->authname) == 0);
+}
+
+static int
+nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
+{
+	struct ent *ent;
+
+	if (h == NULL) {
+		seq_puts(m, "#domain type name [id]\n");
+		return 0;
+	}
+	ent = container_of(h, struct ent, h);
+	seq_printf(m, "%s %s %s", ent->authname,
+			ent->type == IDMAP_TYPE_GROUP ? "group" : "user",
+			ent->name);
+	if (test_bit(CACHE_VALID, &h->flags))
+		seq_printf(m, " %u", ent->id);
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static struct ent *nametoid_lookup(struct cache_detail *, struct ent *);
+static struct ent *nametoid_update(struct cache_detail *, struct ent *,
+				   struct ent *);
+static int         nametoid_parse(struct cache_detail *, char *, int);
+
+static struct cache_detail nametoid_cache_template = {
+	.owner		= THIS_MODULE,
+	.hash_size	= ENT_HASHMAX,
+	.name		= "nfs4.nametoid",
+	.cache_put	= ent_put,
+	.cache_request	= nametoid_request,
+	.cache_parse	= nametoid_parse,
+	.cache_show	= nametoid_show,
+	.warn_no_listener = warn_no_idmapd,
+	.match		= nametoid_match,
+	.init		= ent_init,
+	.update		= ent_init,
+	.alloc		= ent_alloc,
+};
+
+static int
+nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+	struct ent ent, *res;
+	char *buf1;
+	int len, error = -EINVAL;
+
+	if (buf[buflen - 1] != '\n')
+		return (-EINVAL);
+	buf[buflen - 1]= '\0';
+
+	buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (buf1 == NULL)
+		return (-ENOMEM);
+
+	memset(&ent, 0, sizeof(ent));
+
+	/* Authentication name */
+	len = qword_get(&buf, buf1, PAGE_SIZE);
+	if (len <= 0 || len >= IDMAP_NAMESZ)
+		goto out;
+	memcpy(ent.authname, buf1, sizeof(ent.authname));
+
+	/* Type */
+	if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+		goto out;
+	ent.type = strcmp(buf1, "user") == 0 ?
+		IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
+
+	/* Name */
+	len = qword_get(&buf, buf1, PAGE_SIZE);
+	if (len <= 0 || len >= IDMAP_NAMESZ)
+		goto out;
+	memcpy(ent.name, buf1, sizeof(ent.name));
+
+	/* expiry */
+	ent.h.expiry_time = get_expiry(&buf);
+	if (ent.h.expiry_time == 0)
+		goto out;
+
+	/* ID */
+	error = get_int(&buf, &ent.id);
+	if (error == -EINVAL)
+		goto out;
+	if (error == -ENOENT)
+		set_bit(CACHE_NEGATIVE, &ent.h.flags);
+
+	error = -ENOMEM;
+	res = nametoid_lookup(cd, &ent);
+	if (res == NULL)
+		goto out;
+	res = nametoid_update(cd, &ent, res);
+	if (res == NULL)
+		goto out;
+
+	cache_put(&res->h, cd);
+	error = 0;
+out:
+	kfree(buf1);
+	return (error);
+}
+
+
+static struct ent *
+nametoid_lookup(struct cache_detail *cd, struct ent *item)
+{
+	struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
+						    nametoid_hash(item));
+	if (ch)
+		return container_of(ch, struct ent, h);
+	else
+		return NULL;
+}
+
+static struct ent *
+nametoid_update(struct cache_detail *cd, struct ent *new, struct ent *old)
+{
+	struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h,
+						    nametoid_hash(new));
+	if (ch)
+		return container_of(ch, struct ent, h);
+	else
+		return NULL;
+}
+
+/*
+ * Exported API
+ */
+
+int
+nfsd_idmap_init(struct net *net)
+{
+	int rv;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	nn->idtoname_cache = cache_create_net(&idtoname_cache_template, net);
+	if (IS_ERR(nn->idtoname_cache))
+		return PTR_ERR(nn->idtoname_cache);
+	rv = cache_register_net(nn->idtoname_cache, net);
+	if (rv)
+		goto destroy_idtoname_cache;
+	nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net);
+	if (IS_ERR(nn->nametoid_cache)) {
+		rv = PTR_ERR(nn->nametoid_cache);
+		goto unregister_idtoname_cache;
+	}
+	rv = cache_register_net(nn->nametoid_cache, net);
+	if (rv)
+		goto destroy_nametoid_cache;
+	return 0;
+
+destroy_nametoid_cache:
+	cache_destroy_net(nn->nametoid_cache, net);
+unregister_idtoname_cache:
+	cache_unregister_net(nn->idtoname_cache, net);
+destroy_idtoname_cache:
+	cache_destroy_net(nn->idtoname_cache, net);
+	return rv;
+}
+
+void
+nfsd_idmap_shutdown(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	cache_unregister_net(nn->idtoname_cache, net);
+	cache_unregister_net(nn->nametoid_cache, net);
+	cache_destroy_net(nn->idtoname_cache, net);
+	cache_destroy_net(nn->nametoid_cache, net);
+}
+
+static int
+idmap_lookup(struct svc_rqst *rqstp,
+		struct ent *(*lookup_fn)(struct cache_detail *, struct ent *),
+		struct ent *key, struct cache_detail *detail, struct ent **item)
+{
+	int ret;
+
+	*item = lookup_fn(detail, key);
+	if (!*item)
+		return -ENOMEM;
+ retry:
+	ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
+
+	if (ret == -ETIMEDOUT) {
+		struct ent *prev_item = *item;
+		*item = lookup_fn(detail, key);
+		if (*item != prev_item)
+			goto retry;
+		cache_put(&(*item)->h, detail);
+	}
+	return ret;
+}
+
+static char *
+rqst_authname(struct svc_rqst *rqstp)
+{
+	struct auth_domain *clp;
+
+	clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client;
+	return clp->name;
+}
+
+static __be32
+idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
+		u32 *id)
+{
+	struct ent *item, key = {
+		.type = type,
+	};
+	int ret;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	if (namelen + 1 > sizeof(key.name))
+		return nfserr_badowner;
+	memcpy(key.name, name, namelen);
+	key.name[namelen] = '\0';
+	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+	ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item);
+	if (ret == -ENOENT)
+		return nfserr_badowner;
+	if (ret)
+		return nfserrno(ret);
+	*id = item->id;
+	cache_put(&item->h, nn->nametoid_cache);
+	return 0;
+}
+
+static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id)
+{
+	char buf[11];
+	int len;
+	__be32 *p;
+
+	len = sprintf(buf, "%u", id);
+	p = xdr_reserve_space(xdr, len + 4);
+	if (!p)
+		return nfserr_resource;
+	p = xdr_encode_opaque(p, buf, len);
+	return 0;
+}
+
+static __be32 idmap_id_to_name(struct xdr_stream *xdr,
+			       struct svc_rqst *rqstp, int type, u32 id)
+{
+	struct ent *item, key = {
+		.id = id,
+		.type = type,
+	};
+	__be32 *p;
+	int ret;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+	ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
+	if (ret == -ENOENT)
+		return encode_ascii_id(xdr, id);
+	if (ret)
+		return nfserrno(ret);
+	ret = strlen(item->name);
+	WARN_ON_ONCE(ret > IDMAP_NAMESZ);
+	p = xdr_reserve_space(xdr, ret + 4);
+	if (!p)
+		return nfserr_resource;
+	p = xdr_encode_opaque(p, item->name, ret);
+	cache_put(&item->h, nn->idtoname_cache);
+	return 0;
+}
+
+static bool
+numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)
+{
+	int ret;
+	char buf[11];
+
+	if (namelen + 1 > sizeof(buf))
+		/* too long to represent a 32-bit id: */
+		return false;
+	/* Just to make sure it's null-terminated: */
+	memcpy(buf, name, namelen);
+	buf[namelen] = '\0';
+	ret = kstrtouint(buf, 10, id);
+	return ret == 0;
+}
+
+static __be32
+do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)
+{
+	if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
+		if (numeric_name_to_id(rqstp, type, name, namelen, id))
+			return 0;
+		/*
+		 * otherwise, fall through and try idmapping, for
+		 * backwards compatibility with clients sending names:
+		 */
+	return idmap_name_to_id(rqstp, type, name, namelen, id);
+}
+
+static __be32 encode_name_from_id(struct xdr_stream *xdr,
+				  struct svc_rqst *rqstp, int type, u32 id)
+{
+	if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
+		return encode_ascii_id(xdr, id);
+	return idmap_id_to_name(xdr, rqstp, type, id);
+}
+
+__be32
+nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
+		kuid_t *uid)
+{
+	__be32 status;
+	u32 id = -1;
+	status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
+	*uid = make_kuid(&init_user_ns, id);
+	if (!uid_valid(*uid))
+		status = nfserr_badowner;
+	return status;
+}
+
+__be32
+nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
+		kgid_t *gid)
+{
+	__be32 status;
+	u32 id = -1;
+	status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
+	*gid = make_kgid(&init_user_ns, id);
+	if (!gid_valid(*gid))
+		status = nfserr_badowner;
+	return status;
+}
+
+__be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+			 kuid_t uid)
+{
+	u32 id = from_kuid(&init_user_ns, uid);
+	return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id);
+}
+
+__be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+			  kgid_t gid)
+{
+	u32 id = from_kgid(&init_user_ns, gid);
+	return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id);
+}
diff --git a/kernel/fs/nfsd/nfs4layouts.c b/kernel/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000..6904213a4
--- /dev/null
+++ b/kernel/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/kmod.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/addr.h>
+
+#include "pnfs.h"
+#include "netns.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY                NFSDDBG_PNFS
+
+struct nfs4_layout {
+	struct list_head		lo_perstate;
+	struct nfs4_layout_stateid	*lo_state;
+	struct nfsd4_layout_seg		lo_seg;
+};
+
+static struct kmem_cache *nfs4_layout_cache;
+static struct kmem_cache *nfs4_layout_stateid_cache;
+
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+
+const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
+	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
+};
+
+/* pNFS device ID to export fsid mapping */
+#define DEVID_HASH_BITS	8
+#define DEVID_HASH_SIZE	(1 << DEVID_HASH_BITS)
+#define DEVID_HASH_MASK	(DEVID_HASH_SIZE - 1)
+static u64 nfsd_devid_seq = 1;
+static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfsd_devid_lock);
+
+static inline u32 devid_hashfn(u64 idx)
+{
+	return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
+}
+
+static void
+nfsd4_alloc_devid_map(const struct svc_fh *fhp)
+{
+	const struct knfsd_fh *fh = &fhp->fh_handle;
+	size_t fsid_len = key_len(fh->fh_fsid_type);
+	struct nfsd4_deviceid_map *map, *old;
+	int i;
+
+	map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
+	if (!map)
+		return;
+
+	map->fsid_type = fh->fh_fsid_type;
+	memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+
+	spin_lock(&nfsd_devid_lock);
+	if (fhp->fh_export->ex_devid_map)
+		goto out_unlock;
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++) {
+		list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
+			if (old->fsid_type != fh->fh_fsid_type)
+				continue;
+			if (memcmp(old->fsid, fh->fh_fsid,
+					key_len(old->fsid_type)))
+				continue;
+
+			fhp->fh_export->ex_devid_map = old;
+			goto out_unlock;
+		}
+	}
+
+	map->idx = nfsd_devid_seq++;
+	list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
+	fhp->fh_export->ex_devid_map = map;
+	map = NULL;
+
+out_unlock:
+	spin_unlock(&nfsd_devid_lock);
+	kfree(map);
+}
+
+struct nfsd4_deviceid_map *
+nfsd4_find_devid_map(int idx)
+{
+	struct nfsd4_deviceid_map *map, *ret = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
+		if (map->idx == idx)
+			ret = map;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+int
+nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+		u32 device_generation)
+{
+	if (!fhp->fh_export->ex_devid_map) {
+		nfsd4_alloc_devid_map(fhp);
+		if (!fhp->fh_export->ex_devid_map)
+			return -ENOMEM;
+	}
+
+	id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
+	id->generation = device_generation;
+	id->pad = 0;
+	return 0;
+}
+
+void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+	struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+
+	if (!(exp->ex_flags & NFSEXP_PNFS))
+		return;
+
+	if (sb->s_export_op->get_uuid &&
+	    sb->s_export_op->map_blocks &&
+	    sb->s_export_op->commit_blocks)
+		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+}
+
+static void
+nfsd4_free_layout_stateid(struct nfs4_stid *stid)
+{
+	struct nfs4_layout_stateid *ls = layoutstateid(stid);
+	struct nfs4_client *clp = ls->ls_stid.sc_client;
+	struct nfs4_file *fp = ls->ls_stid.sc_file;
+
+	trace_layoutstate_free(&ls->ls_stid.sc_stateid);
+
+	spin_lock(&clp->cl_lock);
+	list_del_init(&ls->ls_perclnt);
+	spin_unlock(&clp->cl_lock);
+
+	spin_lock(&fp->fi_lock);
+	list_del_init(&ls->ls_perfile);
+	spin_unlock(&fp->fi_lock);
+
+	vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
+	fput(ls->ls_file);
+
+	if (ls->ls_recalled)
+		atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
+
+	kmem_cache_free(nfs4_layout_stateid_cache, ls);
+}
+
+static int
+nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
+{
+	struct file_lock *fl;
+	int status;
+
+	fl = locks_alloc_lock();
+	if (!fl)
+		return -ENOMEM;
+	locks_init_lock(fl);
+	fl->fl_lmops = &nfsd4_layouts_lm_ops;
+	fl->fl_flags = FL_LAYOUT;
+	fl->fl_type = F_RDLCK;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_owner = ls;
+	fl->fl_pid = current->tgid;
+	fl->fl_file = ls->ls_file;
+
+	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+	if (status) {
+		locks_free_lock(fl);
+		return status;
+	}
+	BUG_ON(fl != NULL);
+	return 0;
+}
+
+static struct nfs4_layout_stateid *
+nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
+		struct nfs4_stid *parent, u32 layout_type)
+{
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_file *fp = parent->sc_file;
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_stid *stp;
+
+	stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
+	if (!stp)
+		return NULL;
+	stp->sc_free = nfsd4_free_layout_stateid;
+	get_nfs4_file(fp);
+	stp->sc_file = fp;
+
+	ls = layoutstateid(stp);
+	INIT_LIST_HEAD(&ls->ls_perclnt);
+	INIT_LIST_HEAD(&ls->ls_perfile);
+	spin_lock_init(&ls->ls_lock);
+	INIT_LIST_HEAD(&ls->ls_layouts);
+	ls->ls_layout_type = layout_type;
+	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
+			NFSPROC4_CLNT_CB_LAYOUT);
+
+	if (parent->sc_type == NFS4_DELEG_STID)
+		ls->ls_file = get_file(fp->fi_deleg_file);
+	else
+		ls->ls_file = find_any_file(fp);
+	BUG_ON(!ls->ls_file);
+
+	if (nfsd4_layout_setlease(ls)) {
+		put_nfs4_file(fp);
+		kmem_cache_free(nfs4_layout_stateid_cache, ls);
+		return NULL;
+	}
+
+	spin_lock(&clp->cl_lock);
+	stp->sc_type = NFS4_LAYOUT_STID;
+	list_add(&ls->ls_perclnt, &clp->cl_lo_states);
+	spin_unlock(&clp->cl_lock);
+
+	spin_lock(&fp->fi_lock);
+	list_add(&ls->ls_perfile, &fp->fi_lo_states);
+	spin_unlock(&fp->fi_lock);
+
+	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
+	return ls;
+}
+
+__be32
+nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, stateid_t *stateid,
+		bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
+{
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_stid *stid;
+	unsigned char typemask = NFS4_LAYOUT_STID;
+	__be32 status;
+
+	if (create)
+		typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+
+	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+			net_generic(SVC_NET(rqstp), nfsd_net_id));
+	if (status)
+		goto out;
+
+	if (!fh_match(&cstate->current_fh.fh_handle,
+		      &stid->sc_file->fi_fhandle)) {
+		status = nfserr_bad_stateid;
+		goto out_put_stid;
+	}
+
+	if (stid->sc_type != NFS4_LAYOUT_STID) {
+		ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
+		nfs4_put_stid(stid);
+
+		status = nfserr_jukebox;
+		if (!ls)
+			goto out;
+	} else {
+		ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
+
+		status = nfserr_bad_stateid;
+		if (stateid->si_generation > stid->sc_stateid.si_generation)
+			goto out_put_stid;
+		if (layout_type != ls->ls_layout_type)
+			goto out_put_stid;
+	}
+
+	*lsp = ls;
+	return 0;
+
+out_put_stid:
+	nfs4_put_stid(stid);
+out:
+	return status;
+}
+
+static void
+nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
+{
+	spin_lock(&ls->ls_lock);
+	if (ls->ls_recalled)
+		goto out_unlock;
+
+	ls->ls_recalled = true;
+	atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
+	if (list_empty(&ls->ls_layouts))
+		goto out_unlock;
+
+	trace_layout_recall(&ls->ls_stid.sc_stateid);
+
+	atomic_inc(&ls->ls_stid.sc_count);
+	update_stateid(&ls->ls_stid.sc_stateid);
+	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	nfsd4_run_cb(&ls->ls_recall);
+
+out_unlock:
+	spin_unlock(&ls->ls_lock);
+}
+
+static inline u64
+layout_end(struct nfsd4_layout_seg *seg)
+{
+	u64 end = seg->offset + seg->length;
+	return end >= seg->offset ? end : NFS4_MAX_UINT64;
+}
+
+static void
+layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
+{
+	if (end == NFS4_MAX_UINT64)
+		lo->length = NFS4_MAX_UINT64;
+	else
+		lo->length = end - lo->offset;
+}
+
+static bool
+layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
+{
+	if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
+		return false;
+	if (layout_end(&lo->lo_seg) <= s->offset)
+		return false;
+	if (layout_end(s) <= lo->lo_seg.offset)
+		return false;
+	return true;
+}
+
+static bool
+layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
+{
+	if (lo->iomode != new->iomode)
+		return false;
+	if (layout_end(new) < lo->offset)
+		return false;
+	if (layout_end(lo) < new->offset)
+		return false;
+
+	lo->offset = min(lo->offset, new->offset);
+	layout_update_len(lo, max(layout_end(lo), layout_end(new)));
+	return true;
+}
+
+static __be32
+nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
+{
+	struct nfs4_file *fp = ls->ls_stid.sc_file;
+	struct nfs4_layout_stateid *l, *n;
+	__be32 nfserr = nfs_ok;
+
+	assert_spin_locked(&fp->fi_lock);
+
+	list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
+		if (l != ls) {
+			nfsd4_recall_file_layout(l);
+			nfserr = nfserr_recallconflict;
+		}
+	}
+
+	return nfserr;
+}
+
+__be32
+nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
+{
+	struct nfsd4_layout_seg *seg = &lgp->lg_seg;
+	struct nfs4_file *fp = ls->ls_stid.sc_file;
+	struct nfs4_layout *lp, *new = NULL;
+	__be32 nfserr;
+
+	spin_lock(&fp->fi_lock);
+	nfserr = nfsd4_recall_conflict(ls);
+	if (nfserr)
+		goto out;
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+		if (layouts_try_merge(&lp->lo_seg, seg))
+			goto done;
+	}
+	spin_unlock(&ls->ls_lock);
+	spin_unlock(&fp->fi_lock);
+
+	new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
+	if (!new)
+		return nfserr_jukebox;
+	memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+	new->lo_state = ls;
+
+	spin_lock(&fp->fi_lock);
+	nfserr = nfsd4_recall_conflict(ls);
+	if (nfserr)
+		goto out;
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+		if (layouts_try_merge(&lp->lo_seg, seg))
+			goto done;
+	}
+
+	atomic_inc(&ls->ls_stid.sc_count);
+	list_add_tail(&new->lo_perstate, &ls->ls_layouts);
+	new = NULL;
+done:
+	update_stateid(&ls->ls_stid.sc_stateid);
+	memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	spin_unlock(&ls->ls_lock);
+out:
+	spin_unlock(&fp->fi_lock);
+	if (new)
+		kmem_cache_free(nfs4_layout_cache, new);
+	return nfserr;
+}
+
+static void
+nfsd4_free_layouts(struct list_head *reaplist)
+{
+	while (!list_empty(reaplist)) {
+		struct nfs4_layout *lp = list_first_entry(reaplist,
+				struct nfs4_layout, lo_perstate);
+
+		list_del(&lp->lo_perstate);
+		nfs4_put_stid(&lp->lo_state->ls_stid);
+		kmem_cache_free(nfs4_layout_cache, lp);
+	}
+}
+
+static void
+nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
+		struct list_head *reaplist)
+{
+	struct nfsd4_layout_seg *lo = &lp->lo_seg;
+	u64 end = layout_end(lo);
+
+	if (seg->offset <= lo->offset) {
+		if (layout_end(seg) >= end) {
+			list_move_tail(&lp->lo_perstate, reaplist);
+			return;
+		}
+		lo->offset = layout_end(seg);
+	} else {
+		/* retain the whole layout segment on a split. */
+		if (layout_end(seg) < end) {
+			dprintk("%s: split not supported\n", __func__);
+			return;
+		}
+		end = seg->offset;
+	}
+
+	layout_update_len(lo, end);
+}
+
+__be32
+nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_layout *lp, *n;
+	LIST_HEAD(reaplist);
+	__be32 nfserr;
+	int found = 0;
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
+						false, lrp->lr_layout_type,
+						&ls);
+	if (nfserr) {
+		trace_layout_return_lookup_fail(&lrp->lr_sid);
+		return nfserr;
+	}
+
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
+		if (layouts_overlapping(lp, &lrp->lr_seg)) {
+			nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
+			found++;
+		}
+	}
+	if (!list_empty(&ls->ls_layouts)) {
+		if (found) {
+			update_stateid(&ls->ls_stid.sc_stateid);
+			memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
+				sizeof(stateid_t));
+		}
+		lrp->lrs_present = 1;
+	} else {
+		trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
+		nfs4_unhash_stid(&ls->ls_stid);
+		lrp->lrs_present = 0;
+	}
+	spin_unlock(&ls->ls_lock);
+
+	nfs4_put_stid(&ls->ls_stid);
+	nfsd4_free_layouts(&reaplist);
+	return nfs_ok;
+}
+
+__be32
+nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_layout *lp, *t;
+	LIST_HEAD(reaplist);
+
+	lrp->lrs_present = 0;
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+		if (ls->ls_layout_type != lrp->lr_layout_type)
+			continue;
+
+		if (lrp->lr_return_type == RETURN_FSID &&
+		    !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
+				   &cstate->current_fh.fh_handle))
+			continue;
+
+		spin_lock(&ls->ls_lock);
+		list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
+			if (lrp->lr_seg.iomode == IOMODE_ANY ||
+			    lrp->lr_seg.iomode == lp->lo_seg.iomode)
+				list_move_tail(&lp->lo_perstate, &reaplist);
+		}
+		spin_unlock(&ls->ls_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+
+	nfsd4_free_layouts(&reaplist);
+	return 0;
+}
+
+static void
+nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
+		struct list_head *reaplist)
+{
+	spin_lock(&ls->ls_lock);
+	list_splice_init(&ls->ls_layouts, reaplist);
+	spin_unlock(&ls->ls_lock);
+}
+
+void
+nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	LIST_HEAD(reaplist);
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
+		nfsd4_return_all_layouts(ls, &reaplist);
+	spin_unlock(&clp->cl_lock);
+
+	nfsd4_free_layouts(&reaplist);
+}
+
+void
+nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	LIST_HEAD(reaplist);
+
+	spin_lock(&fp->fi_lock);
+	list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
+		if (ls->ls_stid.sc_client == clp)
+			nfsd4_return_all_layouts(ls, &reaplist);
+	}
+	spin_unlock(&fp->fi_lock);
+
+	nfsd4_free_layouts(&reaplist);
+}
+
+static void
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+{
+	struct nfs4_client *clp = ls->ls_stid.sc_client;
+	char addr_str[INET6_ADDRSTRLEN];
+	static char *envp[] = {
+		"HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+	char *argv[8];
+	int error;
+
+	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
+
+	trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
+
+	printk(KERN_WARNING
+		"nfsd: client %s failed to respond to layout recall. "
+		"  Fencing..\n", addr_str);
+
+	argv[0] = "/sbin/nfsd-recall-failed";
+	argv[1] = addr_str;
+	argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
+	argv[3] = NULL;
+
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (error) {
+		printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
+			addr_str, error);
+	}
+}
+
+static int
+nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+	struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	LIST_HEAD(reaplist);
+
+	switch (task->tk_status) {
+	case 0:
+		return 1;
+	case -NFS4ERR_NOMATCHING_LAYOUT:
+		trace_layout_recall_done(&ls->ls_stid.sc_stateid);
+		task->tk_status = 0;
+		return 1;
+	case -NFS4ERR_DELAY:
+		/* Poll the client until it's done with the layout */
+		/* FIXME: cap number of retries.
+		 * The pnfs standard states that we need to only expire
+		 * the client after at-least "lease time" .eg lease-time * 2
+		 * when failing to communicate a recall
+		 */
+		rpc_delay(task, HZ/100); /* 10 mili-seconds */
+		return 0;
+	default:
+		/*
+		 * Unknown error or non-responding client, we'll need to fence.
+		 */
+		nfsd4_cb_layout_fail(ls);
+		return -1;
+	}
+}
+
+static void
+nfsd4_cb_layout_release(struct nfsd4_callback *cb)
+{
+	struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	LIST_HEAD(reaplist);
+
+	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
+
+	nfsd4_return_all_layouts(ls, &reaplist);
+	nfsd4_free_layouts(&reaplist);
+	nfs4_put_stid(&ls->ls_stid);
+}
+
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+	.done		= nfsd4_cb_layout_done,
+	.release	= nfsd4_cb_layout_release,
+};
+
+static bool
+nfsd4_layout_lm_break(struct file_lock *fl)
+{
+	/*
+	 * We don't want the locks code to timeout the lease for us;
+	 * we'll remove it ourself if a layout isn't returned
+	 * in time:
+	 */
+	fl->fl_break_time = 0;
+	nfsd4_recall_file_layout(fl->fl_owner);
+	return false;
+}
+
+static int
+nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+		struct list_head *dispose)
+{
+	BUG_ON(!(arg & F_UNLCK));
+	return lease_modify(onlist, arg, dispose);
+}
+
+static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+	.lm_break	= nfsd4_layout_lm_break,
+	.lm_change	= nfsd4_layout_lm_change,
+};
+
+int
+nfsd4_init_pnfs(void)
+{
+	int i;
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nfsd_devid_hash[i]);
+
+	nfs4_layout_cache = kmem_cache_create("nfs4_layout",
+			sizeof(struct nfs4_layout), 0, 0, NULL);
+	if (!nfs4_layout_cache)
+		return -ENOMEM;
+
+	nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
+			sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+	if (!nfs4_layout_stateid_cache) {
+		kmem_cache_destroy(nfs4_layout_cache);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void
+nfsd4_exit_pnfs(void)
+{
+	int i;
+
+	kmem_cache_destroy(nfs4_layout_cache);
+	kmem_cache_destroy(nfs4_layout_stateid_cache);
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++) {
+		struct nfsd4_deviceid_map *map, *n;
+
+		list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
+			kfree(map);
+	}
+}
diff --git a/kernel/fs/nfsd/nfs4proc.c b/kernel/fs/nfsd/nfs4proc.c
new file mode 100644
index 000000000..864e2003e
--- /dev/null
+++ b/kernel/fs/nfsd/nfs4proc.c
@@ -0,0 +1,2368 @@
+/*
+ *  Server-side procedures for NFSv4.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/file.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+
+#include "idmap.h"
+#include "cache.h"
+#include "xdr4.h"
+#include "vfs.h"
+#include "current_stateid.h"
+#include "netns.h"
+#include "acl.h"
+#include "pnfs.h"
+#include "trace.h"
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{
+	struct inode *inode = d_inode(resfh->fh_dentry);
+	int status;
+
+	mutex_lock(&inode->i_mutex);
+	status = security_inode_setsecctx(resfh->fh_dentry,
+		label->data, label->len);
+	mutex_unlock(&inode->i_mutex);
+
+	if (status)
+		/*
+		 * XXX: We should really fail the whole open, but we may
+		 * already have created a new file, so it may be too
+		 * late.  For now this seems the least of evils:
+		 */
+		bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+	return;
+}
+#else
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{ }
+#endif
+
+#define NFSDDBG_FACILITY		NFSDDBG_PROC
+
+static u32 nfsd_attrmask[] = {
+	NFSD_WRITEABLE_ATTRS_WORD0,
+	NFSD_WRITEABLE_ATTRS_WORD1,
+	NFSD_WRITEABLE_ATTRS_WORD2
+};
+
+static u32 nfsd41_ex_attrmask[] = {
+	NFSD_SUPPATTR_EXCLCREAT_WORD0,
+	NFSD_SUPPATTR_EXCLCREAT_WORD1,
+	NFSD_SUPPATTR_EXCLCREAT_WORD2
+};
+
+static __be32
+check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		   u32 *bmval, u32 *writable)
+{
+	struct dentry *dentry = cstate->current_fh.fh_dentry;
+
+	/*
+	 * Check about attributes are supported by the NFSv4 server or not.
+	 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP.
+	 */
+	if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) ||
+	    (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) ||
+	    (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
+		return nfserr_attrnotsupp;
+
+	/*
+	 * Check FATTR4_WORD0_ACL can be supported
+	 * in current environment or not.
+	 */
+	if (bmval[0] & FATTR4_WORD0_ACL) {
+		if (!IS_POSIXACL(d_inode(dentry)))
+			return nfserr_attrnotsupp;
+	}
+
+	/*
+	 * According to spec, read-only attributes return ERR_INVAL.
+	 */
+	if (writable) {
+		if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
+		    (bmval[2] & ~writable[2]))
+			return nfserr_inval;
+	}
+
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_check_open_attributes(struct svc_rqst *rqstp,
+	struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+{
+	__be32 status = nfs_ok;
+
+	if (open->op_create == NFS4_OPEN_CREATE) {
+		if (open->op_createmode == NFS4_CREATE_UNCHECKED
+		    || open->op_createmode == NFS4_CREATE_GUARDED)
+			status = check_attr_support(rqstp, cstate,
+					open->op_bmval, nfsd_attrmask);
+		else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1)
+			status = check_attr_support(rqstp, cstate,
+					open->op_bmval, nfsd41_ex_attrmask);
+	}
+
+	return status;
+}
+
+static int
+is_create_with_attrs(struct nfsd4_open *open)
+{
+	return open->op_create == NFS4_OPEN_CREATE
+		&& (open->op_createmode == NFS4_CREATE_UNCHECKED
+		    || open->op_createmode == NFS4_CREATE_GUARDED
+		    || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
+}
+
+/*
+ * if error occurs when setting the acl, just clear the acl bit
+ * in the returned attr bitmap.
+ */
+static void
+do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct nfs4_acl *acl, u32 *bmval)
+{
+	__be32 status;
+
+	status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
+	if (status)
+		/*
+		 * We should probably fail the whole open at this point,
+		 * but we've already created the file, so it's too late;
+		 * So this seems the least of evils:
+		 */
+		bmval[0] &= ~FATTR4_WORD0_ACL;
+}
+
+static inline void
+fh_dup2(struct svc_fh *dst, struct svc_fh *src)
+{
+	fh_put(dst);
+	dget(src->fh_dentry);
+	if (src->fh_export)
+		exp_get(src->fh_export);
+	*dst = *src;
+}
+
+static __be32
+do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
+{
+	__be32 status;
+
+	if (open->op_truncate &&
+		!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
+		return nfserr_inval;
+
+	accmode |= NFSD_MAY_READ_IF_EXEC;
+
+	if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
+		accmode |= NFSD_MAY_READ;
+	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+		accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC);
+	if (open->op_share_deny & NFS4_SHARE_DENY_READ)
+		accmode |= NFSD_MAY_WRITE;
+
+	status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
+
+	return status;
+}
+
+static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+{
+	umode_t mode = d_inode(fh->fh_dentry)->i_mode;
+
+	if (S_ISREG(mode))
+		return nfs_ok;
+	if (S_ISDIR(mode))
+		return nfserr_isdir;
+	/*
+	 * Using err_symlink as our catch-all case may look odd; but
+	 * there's no other obvious error for this case in 4.0, and we
+	 * happen to know that it will cause the linux v4 client to do
+	 * the right thing on attempts to open something other than a
+	 * regular file.
+	 */
+	return nfserr_symlink;
+}
+
+static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh)
+{
+	if (nfsd4_has_session(cstate))
+		return;
+	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
+			&resfh->fh_handle);
+}
+
+static __be32
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	int accmode;
+	__be32 status;
+
+	*resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
+	if (!*resfh)
+		return nfserr_jukebox;
+	fh_init(*resfh, NFS4_FHSIZE);
+	open->op_truncate = 0;
+
+	if (open->op_create) {
+		/* FIXME: check session persistence and pnfs flags.
+		 * The nfsv4.1 spec requires the following semantics:
+		 *
+		 * Persistent   | pNFS   | Server REQUIRED | Client Allowed
+		 * Reply Cache  | server |                 |
+		 * -------------+--------+-----------------+--------------------
+		 * no           | no     | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 *              |        |                 | (SHOULD)
+		 *              |        | and EXCLUSIVE4  | or EXCLUSIVE4
+		 *              |        |                 | (SHOULD NOT)
+		 * no           | yes    | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 * yes          | no     | GUARDED4        | GUARDED4
+		 * yes          | yes    | GUARDED4        | GUARDED4
+		 */
+
+		/*
+		 * Note: create modes (UNCHECKED,GUARDED...) are the same
+		 * in NFSv4 as in v3 except EXCLUSIVE4_1.
+		 */
+		status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
+					open->op_fname.len, &open->op_iattr,
+					*resfh, open->op_createmode,
+					(u32 *)open->op_verf.data,
+					&open->op_truncate, &open->op_created);
+
+		if (!status && open->op_label.len)
+			nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
+
+		/*
+		 * Following rfc 3530 14.2.16, use the returned bitmask
+		 * to indicate which attributes we used to store the
+		 * verifier:
+		 */
+		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
+			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
+							FATTR4_WORD1_TIME_MODIFY);
+	} else
+		/*
+		 * Note this may exit with the parent still locked.
+		 * We will hold the lock until nfsd4_open's final
+		 * lookup, to prevent renames or unlinks until we've had
+		 * a chance to an acquire a delegation if appropriate.
+		 */
+		status = nfsd_lookup(rqstp, current_fh,
+				     open->op_fname.data, open->op_fname.len, *resfh);
+	if (status)
+		goto out;
+	status = nfsd_check_obj_isreg(*resfh);
+	if (status)
+		goto out;
+
+	if (is_create_with_attrs(open) && open->op_acl != NULL)
+		do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval);
+
+	nfsd4_set_open_owner_reply_cache(cstate, open, *resfh);
+	accmode = NFSD_MAY_NOP;
+	if (open->op_created ||
+			open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
+		accmode |= NFSD_MAY_OWNER_OVERRIDE;
+	status = do_open_permission(rqstp, *resfh, open, accmode);
+	set_change_info(&open->op_cinfo, current_fh);
+out:
+	return status;
+}
+
+static __be32
+do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	__be32 status;
+	int accmode = 0;
+
+	/* We don't know the target directory, and therefore can not
+	* set the change info
+	*/
+
+	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
+
+	nfsd4_set_open_owner_reply_cache(cstate, open, current_fh);
+
+	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
+		(open->op_iattr.ia_size == 0);
+	/*
+	 * In the delegation case, the client is telling us about an
+	 * open that it *already* performed locally, some time ago.  We
+	 * should let it succeed now if possible.
+	 *
+	 * In the case of a CLAIM_FH open, on the other hand, the client
+	 * may be counting on us to enforce permissions (the Linux 4.1
+	 * client uses this for normal opens, for example).
+	 */
+	if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
+		accmode = NFSD_MAY_OWNER_OVERRIDE;
+
+	status = do_open_permission(rqstp, current_fh, open, accmode);
+
+	return status;
+}
+
+static void
+copy_clientid(clientid_t *clid, struct nfsd4_session *session)
+{
+	struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)session->se_sessionid.data;
+
+	clid->cl_boot = sid->clientid.cl_boot;
+	clid->cl_id = sid->clientid.cl_id;
+}
+
+static __be32
+nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	   struct nfsd4_open *open)
+{
+	__be32 status;
+	struct svc_fh *resfh = NULL;
+	struct nfsd4_compoundres *resp;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
+		(int)open->op_fname.len, open->op_fname.data,
+		open->op_openowner);
+
+	/* This check required by spec. */
+	if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
+		return nfserr_inval;
+
+	open->op_created = 0;
+	/*
+	 * RFC5661 18.51.3
+	 * Before RECLAIM_COMPLETE done, server should deny new lock
+	 */
+	if (nfsd4_has_session(cstate) &&
+	    !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
+		      &cstate->session->se_client->cl_flags) &&
+	    open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+		return nfserr_grace;
+
+	if (nfsd4_has_session(cstate))
+		copy_clientid(&open->op_clientid, cstate->session);
+
+	/* check seqid for replay. set nfs4_owner */
+	resp = rqstp->rq_resp;
+	status = nfsd4_process_open1(&resp->cstate, open, nn);
+	if (status == nfserr_replay_me) {
+		struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
+		fh_put(&cstate->current_fh);
+		fh_copy_shallow(&cstate->current_fh.fh_handle,
+				&rp->rp_openfh);
+		status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+		if (status)
+			dprintk("nfsd4_open: replay failed"
+				" restoring previous filehandle\n");
+		else
+			status = nfserr_replay_me;
+	}
+	if (status)
+		goto out;
+	if (open->op_xdr_error) {
+		status = open->op_xdr_error;
+		goto out;
+	}
+
+	status = nfsd4_check_open_attributes(rqstp, cstate, open);
+	if (status)
+		goto out;
+
+	/* Openowner is now set, so sequence id will get bumped.  Now we need
+	 * these checks before we do any creates: */
+	status = nfserr_grace;
+	if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+		goto out;
+	status = nfserr_no_grace;
+	if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+		goto out;
+
+	switch (open->op_claim_type) {
+		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+		case NFS4_OPEN_CLAIM_NULL:
+			status = do_open_lookup(rqstp, cstate, open, &resfh);
+			if (status)
+				goto out;
+			break;
+		case NFS4_OPEN_CLAIM_PREVIOUS:
+			status = nfs4_check_open_reclaim(&open->op_clientid,
+							 cstate, nn);
+			if (status)
+				goto out;
+			open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+		case NFS4_OPEN_CLAIM_FH:
+		case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+			status = do_open_fhandle(rqstp, cstate, open);
+			if (status)
+				goto out;
+			resfh = &cstate->current_fh;
+			break;
+		case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+             	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+			dprintk("NFSD: unsupported OPEN claim type %d\n",
+				open->op_claim_type);
+			status = nfserr_notsupp;
+			goto out;
+		default:
+			dprintk("NFSD: Invalid OPEN claim type %d\n",
+				open->op_claim_type);
+			status = nfserr_inval;
+			goto out;
+	}
+	/*
+	 * nfsd4_process_open2() does the actual opening of the file.  If
+	 * successful, it (1) truncates the file if open->op_truncate was
+	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+	 */
+	status = nfsd4_process_open2(rqstp, resfh, open);
+	WARN(status && open->op_created,
+	     "nfsd4_process_open2 failed to open newly-created file! status=%u\n",
+	     be32_to_cpu(status));
+out:
+	if (resfh && resfh != &cstate->current_fh) {
+		fh_dup2(&cstate->current_fh, resfh);
+		fh_put(resfh);
+		kfree(resfh);
+	}
+	nfsd4_cleanup_open_state(cstate, open);
+	nfsd4_bump_seqid(cstate, status);
+	return status;
+}
+
+/*
+ * OPEN is the only seqid-mutating operation whose decoding can fail
+ * with a seqid-mutating error (specifically, decoding of user names in
+ * the attributes).  Therefore we have to do some processing to look up
+ * the stateowner so that we can bump the seqid.
+ */
+static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op)
+{
+	struct nfsd4_open *open = (struct nfsd4_open *)&op->u;
+
+	if (!seqid_mutating_err(ntohl(op->status)))
+		return op->status;
+	if (nfsd4_has_session(cstate))
+		return op->status;
+	open->op_xdr_error = op->status;
+	return nfsd4_open(rqstp, cstate, open);
+}
+
+/*
+ * filehandle-manipulating ops.
+ */
+static __be32
+nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    struct svc_fh **getfh)
+{
+	if (!cstate->current_fh.fh_dentry)
+		return nfserr_nofilehandle;
+
+	*getfh = &cstate->current_fh;
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    struct nfsd4_putfh *putfh)
+{
+	fh_put(&cstate->current_fh);
+	cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
+	memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
+	       putfh->pf_fhlen);
+	return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
+}
+
+static __be32
+nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		void *arg)
+{
+	__be32 status;
+
+	fh_put(&cstate->current_fh);
+	status = exp_pseudoroot(rqstp, &cstate->current_fh);
+	return status;
+}
+
+static __be32
+nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		void *arg)
+{
+	if (!cstate->save_fh.fh_dentry)
+		return nfserr_restorefh;
+
+	fh_dup2(&cstate->current_fh, &cstate->save_fh);
+	if (HAS_STATE_ID(cstate, SAVED_STATE_ID_FLAG)) {
+		memcpy(&cstate->current_stateid, &cstate->save_stateid, sizeof(stateid_t));
+		SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG);
+	}
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     void *arg)
+{
+	if (!cstate->current_fh.fh_dentry)
+		return nfserr_nofilehandle;
+
+	fh_dup2(&cstate->save_fh, &cstate->current_fh);
+	if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) {
+		memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t));
+		SET_STATE_ID(cstate, SAVED_STATE_ID_FLAG);
+	}
+	return nfs_ok;
+}
+
+/*
+ * misc nfsv4 ops
+ */
+static __be32
+nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     struct nfsd4_access *access)
+{
+	if (access->ac_req_access & ~NFS3_ACCESS_FULL)
+		return nfserr_inval;
+
+	access->ac_resp_access = access->ac_req_access;
+	return nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access,
+			   &access->ac_supported);
+}
+
+static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
+{
+	__be32 verf[2];
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	/*
+	 * This is opaque to client, so no need to byte-swap. Use
+	 * __force to keep sparse happy
+	 */
+	verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+	verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec;
+	memcpy(verifier->data, verf, sizeof(verifier->data));
+}
+
+static __be32
+nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     struct nfsd4_commit *commit)
+{
+	gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp));
+	return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+			     commit->co_count);
+}
+
+static __be32
+nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     struct nfsd4_create *create)
+{
+	struct svc_fh resfh;
+	__be32 status;
+	dev_t rdev;
+
+	fh_init(&resfh, NFS4_FHSIZE);
+
+	status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
+			   NFSD_MAY_CREATE);
+	if (status)
+		return status;
+
+	status = check_attr_support(rqstp, cstate, create->cr_bmval,
+				    nfsd_attrmask);
+	if (status)
+		return status;
+
+	switch (create->cr_type) {
+	case NF4LNK:
+		status = nfsd_symlink(rqstp, &cstate->current_fh,
+				      create->cr_name, create->cr_namelen,
+				      create->cr_data, &resfh);
+		break;
+
+	case NF4BLK:
+		rdev = MKDEV(create->cr_specdata1, create->cr_specdata2);
+		if (MAJOR(rdev) != create->cr_specdata1 ||
+		    MINOR(rdev) != create->cr_specdata2)
+			return nfserr_inval;
+		status = nfsd_create(rqstp, &cstate->current_fh,
+				     create->cr_name, create->cr_namelen,
+				     &create->cr_iattr, S_IFBLK, rdev, &resfh);
+		break;
+
+	case NF4CHR:
+		rdev = MKDEV(create->cr_specdata1, create->cr_specdata2);
+		if (MAJOR(rdev) != create->cr_specdata1 ||
+		    MINOR(rdev) != create->cr_specdata2)
+			return nfserr_inval;
+		status = nfsd_create(rqstp, &cstate->current_fh,
+				     create->cr_name, create->cr_namelen,
+				     &create->cr_iattr,S_IFCHR, rdev, &resfh);
+		break;
+
+	case NF4SOCK:
+		status = nfsd_create(rqstp, &cstate->current_fh,
+				     create->cr_name, create->cr_namelen,
+				     &create->cr_iattr, S_IFSOCK, 0, &resfh);
+		break;
+
+	case NF4FIFO:
+		status = nfsd_create(rqstp, &cstate->current_fh,
+				     create->cr_name, create->cr_namelen,
+				     &create->cr_iattr, S_IFIFO, 0, &resfh);
+		break;
+
+	case NF4DIR:
+		create->cr_iattr.ia_valid &= ~ATTR_SIZE;
+		status = nfsd_create(rqstp, &cstate->current_fh,
+				     create->cr_name, create->cr_namelen,
+				     &create->cr_iattr, S_IFDIR, 0, &resfh);
+		break;
+
+	default:
+		status = nfserr_badtype;
+	}
+
+	if (status)
+		goto out;
+
+	if (create->cr_label.len)
+		nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
+
+	if (create->cr_acl != NULL)
+		do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
+				create->cr_bmval);
+
+	fh_unlock(&cstate->current_fh);
+	set_change_info(&create->cr_cinfo, &cstate->current_fh);
+	fh_dup2(&cstate->current_fh, &resfh);
+out:
+	fh_put(&resfh);
+	return status;
+}
+
+static __be32
+nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_getattr *getattr)
+{
+	__be32 status;
+
+	status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+	if (status)
+		return status;
+
+	if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
+		return nfserr_inval;
+
+	getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
+
+	getattr->ga_fhp = &cstate->current_fh;
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	   struct nfsd4_link *link)
+{
+	__be32 status = nfserr_nofilehandle;
+
+	if (!cstate->save_fh.fh_dentry)
+		return status;
+	status = nfsd_link(rqstp, &cstate->current_fh,
+			   link->li_name, link->li_namelen, &cstate->save_fh);
+	if (!status)
+		set_change_info(&link->li_cinfo, &cstate->current_fh);
+	return status;
+}
+
+static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
+{
+	struct svc_fh tmp_fh;
+	__be32 ret;
+
+	fh_init(&tmp_fh, NFS4_FHSIZE);
+	ret = exp_pseudoroot(rqstp, &tmp_fh);
+	if (ret)
+		return ret;
+	if (tmp_fh.fh_dentry == fh->fh_dentry) {
+		fh_put(&tmp_fh);
+		return nfserr_noent;
+	}
+	fh_put(&tmp_fh);
+	return nfsd_lookup(rqstp, fh, "..", 2, fh);
+}
+
+static __be32
+nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      void *arg)
+{
+	return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
+}
+
+static __be32
+nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     struct nfsd4_lookup *lookup)
+{
+	return nfsd_lookup(rqstp, &cstate->current_fh,
+			   lookup->lo_name, lookup->lo_len,
+			   &cstate->current_fh);
+}
+
+static __be32
+nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	   struct nfsd4_read *read)
+{
+	__be32 status;
+
+	/* no need to check permission - this will be done in nfsd_read() */
+
+	read->rd_filp = NULL;
+	if (read->rd_offset >= OFFSET_MAX)
+		return nfserr_inval;
+
+	/*
+	 * If we do a zero copy read, then a client will see read data
+	 * that reflects the state of the file *after* performing the
+	 * following compound.
+	 *
+	 * To ensure proper ordering, we therefore turn off zero copy if
+	 * the client wants us to do more in this compound:
+	 */
+	if (!nfsd4_last_compound_op(rqstp))
+		clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+
+	/* check stateid */
+	if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
+						 cstate, &read->rd_stateid,
+						 RD_STATE, &read->rd_filp))) {
+		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
+		goto out;
+	}
+	status = nfs_ok;
+out:
+	read->rd_rqstp = rqstp;
+	read->rd_fhp = &cstate->current_fh;
+	return status;
+}
+
+static __be32
+nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_readdir *readdir)
+{
+	u64 cookie = readdir->rd_cookie;
+	static const nfs4_verifier zeroverf;
+
+	/* no need to check permission - this will be done in nfsd_readdir() */
+
+	if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
+		return nfserr_inval;
+
+	readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
+
+	if ((cookie == 1) || (cookie == 2) ||
+	    (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
+		return nfserr_bad_cookie;
+
+	readdir->rd_rqstp = rqstp;
+	readdir->rd_fhp = &cstate->current_fh;
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_readlink(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	       struct nfsd4_readlink *readlink)
+{
+	readlink->rl_rqstp = rqstp;
+	readlink->rl_fhp = &cstate->current_fh;
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     struct nfsd4_remove *remove)
+{
+	__be32 status;
+
+	if (locks_in_grace(SVC_NET(rqstp)))
+		return nfserr_grace;
+	status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
+			     remove->rm_name, remove->rm_namelen);
+	if (!status) {
+		fh_unlock(&cstate->current_fh);
+		set_change_info(&remove->rm_cinfo, &cstate->current_fh);
+	}
+	return status;
+}
+
+static __be32
+nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     struct nfsd4_rename *rename)
+{
+	__be32 status = nfserr_nofilehandle;
+
+	if (!cstate->save_fh.fh_dentry)
+		return status;
+	if (locks_in_grace(SVC_NET(rqstp)) &&
+		!(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK))
+		return nfserr_grace;
+	status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
+			     rename->rn_snamelen, &cstate->current_fh,
+			     rename->rn_tname, rename->rn_tnamelen);
+	if (status)
+		return status;
+	set_change_info(&rename->rn_sinfo, &cstate->current_fh);
+	set_change_info(&rename->rn_tinfo, &cstate->save_fh);
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_secinfo *secinfo)
+{
+	struct svc_fh resfh;
+	struct svc_export *exp;
+	struct dentry *dentry;
+	__be32 err;
+
+	fh_init(&resfh, NFS4_FHSIZE);
+	err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
+	if (err)
+		return err;
+	err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
+				    secinfo->si_name, secinfo->si_namelen,
+				    &exp, &dentry);
+	if (err)
+		return err;
+	if (d_really_is_negative(dentry)) {
+		exp_put(exp);
+		err = nfserr_noent;
+	} else
+		secinfo->si_exp = exp;
+	dput(dentry);
+	if (cstate->minorversion)
+		/* See rfc 5661 section 2.6.3.1.1.8 */
+		fh_put(&cstate->current_fh);
+	return err;
+}
+
+static __be32
+nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_secinfo_no_name *sin)
+{
+	__be32 err;
+
+	switch (sin->sin_style) {
+	case NFS4_SECINFO_STYLE4_CURRENT_FH:
+		break;
+	case NFS4_SECINFO_STYLE4_PARENT:
+		err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
+		if (err)
+			return err;
+		break;
+	default:
+		return nfserr_inval;
+	}
+
+	sin->sin_exp = exp_get(cstate->current_fh.fh_export);
+	fh_put(&cstate->current_fh);
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_setattr *setattr)
+{
+	__be32 status = nfs_ok;
+	int err;
+
+	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
+		status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+			&setattr->sa_stateid, WR_STATE, NULL);
+		if (status) {
+			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
+			return status;
+		}
+	}
+	err = fh_want_write(&cstate->current_fh);
+	if (err)
+		return nfserrno(err);
+	status = nfs_ok;
+
+	status = check_attr_support(rqstp, cstate, setattr->sa_bmval,
+				    nfsd_attrmask);
+	if (status)
+		goto out;
+
+	if (setattr->sa_acl != NULL)
+		status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
+					    setattr->sa_acl);
+	if (status)
+		goto out;
+	if (setattr->sa_label.len)
+		status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
+				&setattr->sa_label);
+	if (status)
+		goto out;
+	status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
+				0, (time_t)0);
+out:
+	fh_drop_write(&cstate->current_fh);
+	return status;
+}
+
+static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write)
+{
+        int i = 1;
+        int buflen = write->wr_buflen;
+
+        vec[0].iov_base = write->wr_head.iov_base;
+        vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len);
+        buflen -= vec[0].iov_len;
+
+        while (buflen) {
+                vec[i].iov_base = page_address(write->wr_pagelist[i - 1]);
+                vec[i].iov_len = min_t(int, PAGE_SIZE, buflen);
+                buflen -= vec[i].iov_len;
+                i++;
+        }
+        return i;
+}
+
+static __be32
+nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    struct nfsd4_write *write)
+{
+	stateid_t *stateid = &write->wr_stateid;
+	struct file *filp = NULL;
+	__be32 status = nfs_ok;
+	unsigned long cnt;
+	int nvecs;
+
+	/* no need to check permission - this will be done in nfsd_write() */
+
+	if (write->wr_offset >= OFFSET_MAX)
+		return nfserr_inval;
+
+	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
+					cstate, stateid, WR_STATE, &filp);
+	if (status) {
+		dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
+		return status;
+	}
+
+	cnt = write->wr_buflen;
+	write->wr_how_written = write->wr_stable_how;
+	gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp));
+
+	nvecs = fill_in_write_vector(rqstp->rq_vec, write);
+	WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
+
+	status =  nfsd_write(rqstp, &cstate->current_fh, filp,
+			     write->wr_offset, rqstp->rq_vec, nvecs,
+			     &cnt, &write->wr_how_written);
+	if (filp)
+		fput(filp);
+
+	write->wr_bytes_written = cnt;
+
+	return status;
+}
+
+static __be32
+nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_fallocate *fallocate, int flags)
+{
+	__be32 status = nfserr_notsupp;
+	struct file *file;
+
+	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+					    &fallocate->falloc_stateid,
+					    WR_STATE, &file);
+	if (status != nfs_ok) {
+		dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
+		return status;
+	}
+	if (!file)
+		return nfserr_bad_stateid;
+
+	status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file,
+				     fallocate->falloc_offset,
+				     fallocate->falloc_length,
+				     flags);
+	fput(file);
+	return status;
+}
+
+static __be32
+nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	       struct nfsd4_fallocate *fallocate)
+{
+	return nfsd4_fallocate(rqstp, cstate, fallocate, 0);
+}
+
+static __be32
+nfsd4_deallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		 struct nfsd4_fallocate *fallocate)
+{
+	return nfsd4_fallocate(rqstp, cstate, fallocate,
+			       FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE);
+}
+
+static __be32
+nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_seek *seek)
+{
+	int whence;
+	__be32 status;
+	struct file *file;
+
+	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+					    &seek->seek_stateid,
+					    RD_STATE, &file);
+	if (status) {
+		dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
+		return status;
+	}
+	if (!file)
+		return nfserr_bad_stateid;
+
+	switch (seek->seek_whence) {
+	case NFS4_CONTENT_DATA:
+		whence = SEEK_DATA;
+		break;
+	case NFS4_CONTENT_HOLE:
+		whence = SEEK_HOLE;
+		break;
+	default:
+		status = nfserr_union_notsupp;
+		goto out;
+	}
+
+	/*
+	 * Note:  This call does change file->f_pos, but nothing in NFSD
+	 *        should ever file->f_pos.
+	 */
+	seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
+	if (seek->seek_pos < 0)
+		status = nfserrno(seek->seek_pos);
+	else if (seek->seek_pos >= i_size_read(file_inode(file)))
+		seek->seek_eof = true;
+
+out:
+	fput(file);
+	return status;
+}
+
+/* This routine never returns NFS_OK!  If there are no other errors, it
+ * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
+ * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
+ * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK.
+ */
+static __be32
+_nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     struct nfsd4_verify *verify)
+{
+	__be32 *buf, *p;
+	int count;
+	__be32 status;
+
+	status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+	if (status)
+		return status;
+
+	status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL);
+	if (status)
+		return status;
+
+	if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
+	    || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
+		return nfserr_inval;
+	if (verify->ve_attrlen & 3)
+		return nfserr_inval;
+
+	/* count in words:
+	 *   bitmap_len(1) + bitmap(2) + attr_len(1) = 4
+	 */
+	count = 4 + (verify->ve_attrlen >> 2);
+	buf = kmalloc(count << 2, GFP_KERNEL);
+	if (!buf)
+		return nfserr_jukebox;
+
+	p = buf;
+	status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh,
+				    cstate->current_fh.fh_export,
+				    cstate->current_fh.fh_dentry,
+				    verify->ve_bmval,
+				    rqstp, 0);
+	/*
+	 * If nfsd4_encode_fattr() ran out of space, assume that's because
+	 * the attributes are longer (hence different) than those given:
+	 */
+	if (status == nfserr_resource)
+		status = nfserr_not_same;
+	if (status)
+		goto out_kfree;
+
+	/* skip bitmap */
+	p = buf + 1 + ntohl(buf[0]);
+	status = nfserr_not_same;
+	if (ntohl(*p++) != verify->ve_attrlen)
+		goto out_kfree;
+	if (!memcmp(p, verify->ve_attrval, verify->ve_attrlen))
+		status = nfserr_same;
+
+out_kfree:
+	kfree(buf);
+	return status;
+}
+
+static __be32
+nfsd4_nverify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_verify *verify)
+{
+	__be32 status;
+
+	status = _nfsd4_verify(rqstp, cstate, verify);
+	return status == nfserr_not_same ? nfs_ok : status;
+}
+
+static __be32
+nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     struct nfsd4_verify *verify)
+{
+	__be32 status;
+
+	status = _nfsd4_verify(rqstp, cstate, verify);
+	return status == nfserr_same ? nfs_ok : status;
+}
+
+#ifdef CONFIG_NFSD_PNFS
+static const struct nfsd4_layout_ops *
+nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
+{
+	if (!exp->ex_layout_type) {
+		dprintk("%s: export does not support pNFS\n", __func__);
+		return NULL;
+	}
+
+	if (exp->ex_layout_type != layout_type) {
+		dprintk("%s: layout type %d not supported\n",
+			__func__, layout_type);
+		return NULL;
+	}
+
+	return nfsd4_layout_ops[layout_type];
+}
+
+static __be32
+nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	const struct nfsd4_layout_ops *ops;
+	struct nfsd4_deviceid_map *map;
+	struct svc_export *exp;
+	__be32 nfserr;
+
+	dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
+	       __func__,
+	       gdp->gd_layout_type,
+	       gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
+	       gdp->gd_maxcount);
+
+	map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
+	if (!map) {
+		dprintk("%s: couldn't find device ID to export mapping!\n",
+			__func__);
+		return nfserr_noent;
+	}
+
+	exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
+	if (IS_ERR(exp)) {
+		dprintk("%s: could not find device id\n", __func__);
+		return nfserr_noent;
+	}
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
+	if (!ops)
+		goto out;
+
+	nfserr = nfs_ok;
+	if (gdp->gd_maxcount != 0)
+		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+
+	gdp->gd_notify_types &= ops->notify_types;
+out:
+	exp_put(exp);
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutget(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutget *lgp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	const struct nfsd4_layout_ops *ops;
+	struct nfs4_layout_stateid *ls;
+	__be32 nfserr;
+	int accmode;
+
+	switch (lgp->lg_seg.iomode) {
+	case IOMODE_READ:
+		accmode = NFSD_MAY_READ;
+		break;
+	case IOMODE_RW:
+		accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
+		break;
+	default:
+		dprintk("%s: invalid iomode %d\n",
+			__func__, lgp->lg_seg.iomode);
+		nfserr = nfserr_badiomode;
+		goto out;
+	}
+
+	nfserr = fh_verify(rqstp, current_fh, 0, accmode);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
+	if (!ops)
+		goto out;
+
+	/*
+	 * Verify minlength and range as per RFC5661:
+	 *  o  If loga_length is less than loga_minlength,
+	 *     the metadata server MUST return NFS4ERR_INVAL.
+	 *  o  If the sum of loga_offset and loga_minlength exceeds
+	 *     NFS4_UINT64_MAX, and loga_minlength is not
+	 *     NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
+	 *  o  If the sum of loga_offset and loga_length exceeds
+	 *     NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
+	 *     the error NFS4ERR_INVAL MUST result.
+	 */
+	nfserr = nfserr_inval;
+	if (lgp->lg_seg.length < lgp->lg_minlength ||
+	    (lgp->lg_minlength != NFS4_MAX_UINT64 &&
+	     lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
+	    (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
+	     lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
+		goto out;
+	if (lgp->lg_seg.length == 0)
+		goto out;
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
+						true, lgp->lg_layout_type, &ls);
+	if (nfserr) {
+		trace_layout_get_lookup_fail(&lgp->lg_sid);
+		goto out;
+	}
+
+	nfserr = nfserr_recallconflict;
+	if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
+		goto out_put_stid;
+
+	nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry),
+				     current_fh, lgp);
+	if (nfserr)
+		goto out_put_stid;
+
+	nfserr = nfsd4_insert_layout(lgp, ls);
+
+out_put_stid:
+	nfs4_put_stid(&ls->ls_stid);
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutcommit(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutcommit *lcp)
+{
+	const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
+	struct svc_fh *current_fh = &cstate->current_fh;
+	const struct nfsd4_layout_ops *ops;
+	loff_t new_size = lcp->lc_last_wr + 1;
+	struct inode *inode;
+	struct nfs4_layout_stateid *ls;
+	__be32 nfserr;
+
+	nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
+	if (!ops)
+		goto out;
+	inode = d_inode(current_fh->fh_dentry);
+
+	nfserr = nfserr_inval;
+	if (new_size <= seg->offset) {
+		dprintk("pnfsd: last write before layout segment\n");
+		goto out;
+	}
+	if (new_size > seg->offset + seg->length) {
+		dprintk("pnfsd: last write beyond layout segment\n");
+		goto out;
+	}
+	if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
+		dprintk("pnfsd: layoutcommit beyond EOF\n");
+		goto out;
+	}
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
+						false, lcp->lc_layout_type,
+						&ls);
+	if (nfserr) {
+		trace_layout_commit_lookup_fail(&lcp->lc_sid);
+		/* fixup error code as per RFC5661 */
+		if (nfserr == nfserr_bad_stateid)
+			nfserr = nfserr_badlayout;
+		goto out;
+	}
+
+	nfserr = ops->proc_layoutcommit(inode, lcp);
+	if (nfserr)
+		goto out_put_stid;
+
+	if (new_size > i_size_read(inode)) {
+		lcp->lc_size_chg = 1;
+		lcp->lc_newsize = new_size;
+	} else {
+		lcp->lc_size_chg = 0;
+	}
+
+out_put_stid:
+	nfs4_put_stid(&ls->ls_stid);
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutreturn(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	__be32 nfserr;
+
+	nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
+		goto out;
+
+	switch (lrp->lr_seg.iomode) {
+	case IOMODE_READ:
+	case IOMODE_RW:
+	case IOMODE_ANY:
+		break;
+	default:
+		dprintk("%s: invalid iomode %d\n", __func__,
+			lrp->lr_seg.iomode);
+		nfserr = nfserr_inval;
+		goto out;
+	}
+
+	switch (lrp->lr_return_type) {
+	case RETURN_FILE:
+		nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
+		break;
+	case RETURN_FSID:
+	case RETURN_ALL:
+		nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
+		break;
+	default:
+		dprintk("%s: invalid return_type %d\n", __func__,
+			lrp->lr_return_type);
+		nfserr = nfserr_inval;
+		break;
+	}
+out:
+	return nfserr;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+/*
+ * NULL call.
+ */
+static __be32
+nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	return nfs_ok;
+}
+
+static inline void nfsd4_increment_op_stats(u32 opnum)
+{
+	if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
+		nfsdstats.nfs4_opcount[opnum]++;
+}
+
+typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
+			      void *);
+typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op);
+typedef void(*stateid_setter)(struct nfsd4_compound_state *, void *);
+typedef void(*stateid_getter)(struct nfsd4_compound_state *, void *);
+
+enum nfsd4_op_flags {
+	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
+	ALLOWED_ON_ABSENT_FS = 1 << 1,	/* ops processed on absent fs */
+	ALLOWED_AS_FIRST_OP = 1 << 2,	/* ops reqired first in compound */
+	/* For rfc 5661 section 2.6.3.1.1: */
+	OP_HANDLES_WRONGSEC = 1 << 3,
+	OP_IS_PUTFH_LIKE = 1 << 4,
+	/*
+	 * These are the ops whose result size we estimate before
+	 * encoding, to avoid performing an op then not being able to
+	 * respond or cache a response.  This includes writes and setattrs
+	 * as well as the operations usually called "nonidempotent":
+	 */
+	OP_MODIFIES_SOMETHING = 1 << 5,
+	/*
+	 * Cache compounds containing these ops in the xid-based drc:
+	 * We use the DRC for compounds containing non-idempotent
+	 * operations, *except* those that are 4.1-specific (since
+	 * sessions provide their own EOS), and except for stateful
+	 * operations other than setclientid and setclientid_confirm
+	 * (since sequence numbers provide EOS for open, lock, etc in
+	 * the v4.0 case).
+	 */
+	OP_CACHEME = 1 << 6,
+	/*
+	 * These are ops which clear current state id.
+	 */
+	OP_CLEAR_STATEID = 1 << 7,
+};
+
+struct nfsd4_operation {
+	nfsd4op_func op_func;
+	u32 op_flags;
+	char *op_name;
+	/* Try to get response size before operation */
+	nfsd4op_rsize op_rsize_bop;
+	stateid_getter op_get_currentstateid;
+	stateid_setter op_set_currentstateid;
+};
+
+static struct nfsd4_operation nfsd4_ops[];
+
+static const char *nfsd4_op_name(unsigned opnum);
+
+/*
+ * Enforce NFSv4.1 COMPOUND ordering rules:
+ *
+ * Also note, enforced elsewhere:
+ *	- SEQUENCE other than as first op results in
+ *	  NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
+ *	- BIND_CONN_TO_SESSION must be the only op in its compound.
+ *	  (Enforced in nfsd4_bind_conn_to_session().)
+ *	- DESTROY_SESSION must be the final operation in a compound, if
+ *	  sessionid's in SEQUENCE and DESTROY_SESSION are the same.
+ *	  (Enforced in nfsd4_destroy_session().)
+ */
+static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
+{
+	struct nfsd4_op *op = &args->ops[0];
+
+	/* These ordering requirements don't apply to NFSv4.0: */
+	if (args->minorversion == 0)
+		return nfs_ok;
+	/* This is weird, but OK, not our problem: */
+	if (args->opcnt == 0)
+		return nfs_ok;
+	if (op->status == nfserr_op_illegal)
+		return nfs_ok;
+	if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
+		return nfserr_op_not_in_session;
+	if (op->opnum == OP_SEQUENCE)
+		return nfs_ok;
+	if (args->opcnt != 1)
+		return nfserr_not_only_op;
+	return nfs_ok;
+}
+
+static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
+{
+	return &nfsd4_ops[op->opnum];
+}
+
+bool nfsd4_cache_this_op(struct nfsd4_op *op)
+{
+	if (op->opnum == OP_ILLEGAL)
+		return false;
+	return OPDESC(op)->op_flags & OP_CACHEME;
+}
+
+static bool need_wrongsec_check(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+	struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
+	struct nfsd4_op *next = &argp->ops[resp->opcnt];
+	struct nfsd4_operation *thisd;
+	struct nfsd4_operation *nextd;
+
+	thisd = OPDESC(this);
+	/*
+	 * Most ops check wronsec on our own; only the putfh-like ops
+	 * have special rules.
+	 */
+	if (!(thisd->op_flags & OP_IS_PUTFH_LIKE))
+		return false;
+	/*
+	 * rfc 5661 2.6.3.1.1.6: don't bother erroring out a
+	 * put-filehandle operation if we're not going to use the
+	 * result:
+	 */
+	if (argp->opcnt == resp->opcnt)
+		return false;
+	if (next->opnum == OP_ILLEGAL)
+		return false;
+	nextd = OPDESC(next);
+	/*
+	 * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
+	 * errors themselves as necessary; others should check for them
+	 * now:
+	 */
+	return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
+}
+
+static void svcxdr_init_encode(struct svc_rqst *rqstp,
+			       struct nfsd4_compoundres *resp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_buf *buf = &rqstp->rq_res;
+	struct kvec *head = buf->head;
+
+	xdr->buf = buf;
+	xdr->iov = head;
+	xdr->p   = head->iov_base + head->iov_len;
+	xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
+	/* Tail and page_len should be zero at this point: */
+	buf->len = buf->head[0].iov_len;
+	xdr->scratch.iov_len = 0;
+	xdr->page_ptr = buf->pages - 1;
+	buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages)
+		- rqstp->rq_auth_slack;
+}
+
+/*
+ * COMPOUND call.
+ */
+static __be32
+nfsd4_proc_compound(struct svc_rqst *rqstp,
+		    struct nfsd4_compoundargs *args,
+		    struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_op	*op;
+	struct nfsd4_operation *opdesc;
+	struct nfsd4_compound_state *cstate = &resp->cstate;
+	struct svc_fh *current_fh = &cstate->current_fh;
+	struct svc_fh *save_fh = &cstate->save_fh;
+	__be32		status;
+
+	svcxdr_init_encode(rqstp, resp);
+	resp->tagp = resp->xdr.p;
+	/* reserve space for: taglen, tag, and opcnt */
+	xdr_reserve_space(&resp->xdr, 8 + args->taglen);
+	resp->taglen = args->taglen;
+	resp->tag = args->tag;
+	resp->rqstp = rqstp;
+	cstate->minorversion = args->minorversion;
+	fh_init(current_fh, NFS4_FHSIZE);
+	fh_init(save_fh, NFS4_FHSIZE);
+	/*
+	 * Don't use the deferral mechanism for NFSv4; compounds make it
+	 * too hard to avoid non-idempotency problems.
+	 */
+	clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+
+	/*
+	 * According to RFC3010, this takes precedence over all other errors.
+	 */
+	status = nfserr_minor_vers_mismatch;
+	if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
+		goto out;
+
+	status = nfs41_check_op_ordering(args);
+	if (status) {
+		op = &args->ops[0];
+		op->status = status;
+		goto encode_op;
+	}
+
+	while (!status && resp->opcnt < args->opcnt) {
+		op = &args->ops[resp->opcnt++];
+
+		dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
+			resp->opcnt, args->opcnt, op->opnum,
+			nfsd4_op_name(op->opnum));
+		/*
+		 * The XDR decode routines may have pre-set op->status;
+		 * for example, if there is a miscellaneous XDR error
+		 * it will be set to nfserr_bad_xdr.
+		 */
+		if (op->status) {
+			if (op->opnum == OP_OPEN)
+				op->status = nfsd4_open_omfg(rqstp, cstate, op);
+			goto encode_op;
+		}
+
+		opdesc = OPDESC(op);
+
+		if (!current_fh->fh_dentry) {
+			if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
+				op->status = nfserr_nofilehandle;
+				goto encode_op;
+			}
+		} else if (current_fh->fh_export->ex_fslocs.migrated &&
+			  !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) {
+			op->status = nfserr_moved;
+			goto encode_op;
+		}
+
+		fh_clear_wcc(current_fh);
+
+		/* If op is non-idempotent */
+		if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
+			/*
+			 * Don't execute this op if we couldn't encode a
+			 * succesful reply:
+			 */
+			u32 plen = opdesc->op_rsize_bop(rqstp, op);
+			/*
+			 * Plus if there's another operation, make sure
+			 * we'll have space to at least encode an error:
+			 */
+			if (resp->opcnt < args->opcnt)
+				plen += COMPOUND_ERR_SLACK_SPACE;
+			op->status = nfsd4_check_resp_size(resp, plen);
+		}
+
+		if (op->status)
+			goto encode_op;
+
+		if (opdesc->op_get_currentstateid)
+			opdesc->op_get_currentstateid(cstate, &op->u);
+		op->status = opdesc->op_func(rqstp, cstate, &op->u);
+
+		if (!op->status) {
+			if (opdesc->op_set_currentstateid)
+				opdesc->op_set_currentstateid(cstate, &op->u);
+
+			if (opdesc->op_flags & OP_CLEAR_STATEID)
+				clear_current_stateid(cstate);
+
+			if (need_wrongsec_check(rqstp))
+				op->status = check_nfsd_access(current_fh->fh_export, rqstp);
+		}
+
+encode_op:
+		/* Only from SEQUENCE */
+		if (cstate->status == nfserr_replay_cache) {
+			dprintk("%s NFS4.1 replay from cache\n", __func__);
+			status = op->status;
+			goto out;
+		}
+		if (op->status == nfserr_replay_me) {
+			op->replay = &cstate->replay_owner->so_replay;
+			nfsd4_encode_replay(&resp->xdr, op);
+			status = op->status = op->replay->rp_status;
+		} else {
+			nfsd4_encode_operation(resp, op);
+			status = op->status;
+		}
+
+		dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
+			args->ops, args->opcnt, resp->opcnt, op->opnum,
+			be32_to_cpu(status));
+
+		nfsd4_cstate_clear_replay(cstate);
+		/* XXX Ugh, we need to get rid of this kind of special case: */
+		if (op->opnum == OP_READ && op->u.read.rd_filp)
+			fput(op->u.read.rd_filp);
+
+		nfsd4_increment_op_stats(op->opnum);
+	}
+
+	cstate->status = status;
+	fh_put(current_fh);
+	fh_put(save_fh);
+	BUG_ON(cstate->replay_owner);
+out:
+	/* Reset deferral mechanism for RPC deferrals */
+	set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+	dprintk("nfsv4 compound returned %d\n", ntohl(status));
+	return status;
+}
+
+#define op_encode_hdr_size		(2)
+#define op_encode_stateid_maxsz		(XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define op_encode_verifier_maxsz	(XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define op_encode_change_info_maxsz	(5)
+#define nfs4_fattr_bitmap_maxsz		(4)
+
+/* We'll fall back on returning no lockowner if run out of space: */
+#define op_encode_lockowner_maxsz	(0)
+#define op_encode_lock_denied_maxsz	(8 + op_encode_lockowner_maxsz)
+
+#define nfs4_owner_maxsz		(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+
+#define op_encode_ace_maxsz		(3 + nfs4_owner_maxsz)
+#define op_encode_delegation_maxsz	(1 + op_encode_stateid_maxsz + 1 + \
+					 op_encode_ace_maxsz)
+
+#define op_encode_channel_attrs_maxsz	(6 + 1 + 1)
+
+static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
+}
+
+static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz
+		+ nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+/*
+ * Note since this is an idempotent operation we won't insist on failing
+ * the op prematurely if the estimate is too large.  We may turn off splice
+ * reads unnecessarily.
+ */
+static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
+				      struct nfsd4_op *op)
+{
+	u32 *bmap = op->u.getattr.ga_bmval;
+	u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2];
+	u32 ret = 0;
+
+	if (bmap0 & FATTR4_WORD0_ACL)
+		return svc_max_payload(rqstp);
+	if (bmap0 & FATTR4_WORD0_FS_LOCATIONS)
+		return svc_max_payload(rqstp);
+
+	if (bmap1 & FATTR4_WORD1_OWNER) {
+		ret += IDMAP_NAMESZ + 4;
+		bmap1 &= ~FATTR4_WORD1_OWNER;
+	}
+	if (bmap1 & FATTR4_WORD1_OWNER_GROUP) {
+		ret += IDMAP_NAMESZ + 4;
+		bmap1 &= ~FATTR4_WORD1_OWNER_GROUP;
+	}
+	if (bmap0 & FATTR4_WORD0_FILEHANDLE) {
+		ret += NFS4_FHSIZE + 4;
+		bmap0 &= ~FATTR4_WORD0_FILEHANDLE;
+	}
+	if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) {
+		ret += NFS4_MAXLABELLEN + 12;
+		bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+	}
+	/*
+	 * Largest of remaining attributes are 16 bytes (e.g.,
+	 * supported_attributes)
+	 */
+	ret += 16 * (hweight32(bmap0) + hweight32(bmap1) + hweight32(bmap2));
+	/* bitmask, length */
+	ret += 20;
+	return ret;
+}
+
+static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_stateid_maxsz
+		+ op_encode_change_info_maxsz + 1
+		+ nfs4_fattr_bitmap_maxsz
+		+ op_encode_delegation_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = 0, rlen = 0;
+
+	maxcount = svc_max_payload(rqstp);
+	rlen = min(op->u.read.rd_length, maxcount);
+
+	return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = 0, rlen = 0;
+
+	maxcount = svc_max_payload(rqstp);
+	rlen = min(op->u.readdir.rd_maxcount, maxcount);
+
+	return (op_encode_hdr_size + op_encode_verifier_maxsz +
+		XDR_QUADLEN(rlen)) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz
+		+ op_encode_change_info_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
+				       struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size
+		+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
+								sizeof(__be32);
+}
+
+static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
+		1 + 1 + /* eir_flags, spr_how */\
+		4 + /* spo_must_enforce & _allow with bitmap */\
+		2 + /*eir_server_owner.so_minor_id */\
+		/* eir_server_owner.so_major_id<> */\
+		XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+		/* eir_server_scope<> */\
+		XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+		1 + /* eir_server_impl_id array length */\
+		0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + \
+		XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
+		2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + \
+		XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
+		2 + /* csr_sequence, csr_flags */\
+		op_encode_channel_attrs_maxsz + \
+		op_encode_channel_attrs_maxsz) * sizeof(__be32);
+}
+
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * At this stage we don't really know what layout driver will handle the request,
+ * so we need to define an arbitrary upper bound here.
+ */
+#define MAX_LAYOUT_SIZE		128
+static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* logr_return_on_close */ +
+		op_encode_stateid_maxsz +
+		1 /* nr of layouts */ +
+		MAX_LAYOUT_SIZE) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* locr_newsize */ +
+		2 /* ns_size */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* lrs_stateid */ +
+		op_encode_stateid_maxsz) * sizeof(__be32);
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+static struct nfsd4_operation nfsd4_ops[] = {
+	[OP_ACCESS] = {
+		.op_func = (nfsd4op_func)nfsd4_access,
+		.op_name = "OP_ACCESS",
+	},
+	[OP_CLOSE] = {
+		.op_func = (nfsd4op_func)nfsd4_close,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_CLOSE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_closestateid,
+		.op_set_currentstateid = (stateid_setter)nfsd4_set_closestateid,
+	},
+	[OP_COMMIT] = {
+		.op_func = (nfsd4op_func)nfsd4_commit,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_COMMIT",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize,
+	},
+	[OP_CREATE] = {
+		.op_func = (nfsd4op_func)nfsd4_create,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME | OP_CLEAR_STATEID,
+		.op_name = "OP_CREATE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize,
+	},
+	[OP_DELEGRETURN] = {
+		.op_func = (nfsd4op_func)nfsd4_delegreturn,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_DELEGRETURN",
+		.op_rsize_bop = nfsd4_only_status_rsize,
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_delegreturnstateid,
+	},
+	[OP_GETATTR] = {
+		.op_func = (nfsd4op_func)nfsd4_getattr,
+		.op_flags = ALLOWED_ON_ABSENT_FS,
+		.op_rsize_bop = nfsd4_getattr_rsize,
+		.op_name = "OP_GETATTR",
+	},
+	[OP_GETFH] = {
+		.op_func = (nfsd4op_func)nfsd4_getfh,
+		.op_name = "OP_GETFH",
+	},
+	[OP_LINK] = {
+		.op_func = (nfsd4op_func)nfsd4_link,
+		.op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING
+				| OP_CACHEME,
+		.op_name = "OP_LINK",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize,
+	},
+	[OP_LOCK] = {
+		.op_func = (nfsd4op_func)nfsd4_lock,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LOCK",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
+		.op_set_currentstateid = (stateid_setter)nfsd4_set_lockstateid,
+	},
+	[OP_LOCKT] = {
+		.op_func = (nfsd4op_func)nfsd4_lockt,
+		.op_name = "OP_LOCKT",
+	},
+	[OP_LOCKU] = {
+		.op_func = (nfsd4op_func)nfsd4_locku,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LOCKU",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_lockustateid,
+	},
+	[OP_LOOKUP] = {
+		.op_func = (nfsd4op_func)nfsd4_lookup,
+		.op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
+		.op_name = "OP_LOOKUP",
+	},
+	[OP_LOOKUPP] = {
+		.op_func = (nfsd4op_func)nfsd4_lookupp,
+		.op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
+		.op_name = "OP_LOOKUPP",
+	},
+	[OP_NVERIFY] = {
+		.op_func = (nfsd4op_func)nfsd4_nverify,
+		.op_name = "OP_NVERIFY",
+	},
+	[OP_OPEN] = {
+		.op_func = (nfsd4op_func)nfsd4_open,
+		.op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
+		.op_name = "OP_OPEN",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize,
+		.op_set_currentstateid = (stateid_setter)nfsd4_set_openstateid,
+	},
+	[OP_OPEN_CONFIRM] = {
+		.op_func = (nfsd4op_func)nfsd4_open_confirm,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_OPEN_CONFIRM",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
+	},
+	[OP_OPEN_DOWNGRADE] = {
+		.op_func = (nfsd4op_func)nfsd4_open_downgrade,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_OPEN_DOWNGRADE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_opendowngradestateid,
+		.op_set_currentstateid = (stateid_setter)nfsd4_set_opendowngradestateid,
+	},
+	[OP_PUTFH] = {
+		.op_func = (nfsd4op_func)nfsd4_putfh,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
+		.op_name = "OP_PUTFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_PUTPUBFH] = {
+		.op_func = (nfsd4op_func)nfsd4_putrootfh,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
+		.op_name = "OP_PUTPUBFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_PUTROOTFH] = {
+		.op_func = (nfsd4op_func)nfsd4_putrootfh,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
+		.op_name = "OP_PUTROOTFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_READ] = {
+		.op_func = (nfsd4op_func)nfsd4_read,
+		.op_name = "OP_READ",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid,
+	},
+	[OP_READDIR] = {
+		.op_func = (nfsd4op_func)nfsd4_readdir,
+		.op_name = "OP_READDIR",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
+	},
+	[OP_READLINK] = {
+		.op_func = (nfsd4op_func)nfsd4_readlink,
+		.op_name = "OP_READLINK",
+	},
+	[OP_REMOVE] = {
+		.op_func = (nfsd4op_func)nfsd4_remove,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_REMOVE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize,
+	},
+	[OP_RENAME] = {
+		.op_func = (nfsd4op_func)nfsd4_rename,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_RENAME",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize,
+	},
+	[OP_RENEW] = {
+		.op_func = (nfsd4op_func)nfsd4_renew,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING,
+		.op_name = "OP_RENEW",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+
+	},
+	[OP_RESTOREFH] = {
+		.op_func = (nfsd4op_func)nfsd4_restorefh,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
+		.op_name = "OP_RESTOREFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_SAVEFH] = {
+		.op_func = (nfsd4op_func)nfsd4_savefh,
+		.op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
+		.op_name = "OP_SAVEFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_SECINFO] = {
+		.op_func = (nfsd4op_func)nfsd4_secinfo,
+		.op_flags = OP_HANDLES_WRONGSEC,
+		.op_name = "OP_SECINFO",
+	},
+	[OP_SETATTR] = {
+		.op_func = (nfsd4op_func)nfsd4_setattr,
+		.op_name = "OP_SETATTR",
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize,
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_setattrstateid,
+	},
+	[OP_SETCLIENTID] = {
+		.op_func = (nfsd4op_func)nfsd4_setclientid,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_SETCLIENTID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize,
+	},
+	[OP_SETCLIENTID_CONFIRM] = {
+		.op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_SETCLIENTID_CONFIRM",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_VERIFY] = {
+		.op_func = (nfsd4op_func)nfsd4_verify,
+		.op_name = "OP_VERIFY",
+	},
+	[OP_WRITE] = {
+		.op_func = (nfsd4op_func)nfsd4_write,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_WRITE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
+	},
+	[OP_RELEASE_LOCKOWNER] = {
+		.op_func = (nfsd4op_func)nfsd4_release_lockowner,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING,
+		.op_name = "OP_RELEASE_LOCKOWNER",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+
+	/* NFSv4.1 operations */
+	[OP_EXCHANGE_ID] = {
+		.op_func = (nfsd4op_func)nfsd4_exchange_id,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
+		.op_name = "OP_EXCHANGE_ID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
+	},
+	[OP_BACKCHANNEL_CTL] = {
+		.op_func = (nfsd4op_func)nfsd4_backchannel_ctl,
+		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+		.op_name = "OP_BACKCHANNEL_CTL",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_BIND_CONN_TO_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
+		.op_name = "OP_BIND_CONN_TO_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize,
+	},
+	[OP_CREATE_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_create_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
+		.op_name = "OP_CREATE_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize,
+	},
+	[OP_DESTROY_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_destroy_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
+		.op_name = "OP_DESTROY_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_SEQUENCE] = {
+		.op_func = (nfsd4op_func)nfsd4_sequence,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_SEQUENCE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_sequence_rsize,
+	},
+	[OP_DESTROY_CLIENTID] = {
+		.op_func = (nfsd4op_func)nfsd4_destroy_clientid,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
+		.op_name = "OP_DESTROY_CLIENTID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_RECLAIM_COMPLETE] = {
+		.op_func = (nfsd4op_func)nfsd4_reclaim_complete,
+		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+		.op_name = "OP_RECLAIM_COMPLETE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_SECINFO_NO_NAME] = {
+		.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+		.op_flags = OP_HANDLES_WRONGSEC,
+		.op_name = "OP_SECINFO_NO_NAME",
+	},
+	[OP_TEST_STATEID] = {
+		.op_func = (nfsd4op_func)nfsd4_test_stateid,
+		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_name = "OP_TEST_STATEID",
+	},
+	[OP_FREE_STATEID] = {
+		.op_func = (nfsd4op_func)nfsd4_free_stateid,
+		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+		.op_name = "OP_FREE_STATEID",
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO] = {
+		.op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
+		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_name = "OP_GETDEVICEINFO",
+	},
+	[OP_LAYOUTGET] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutget,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTGET",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
+	},
+	[OP_LAYOUTCOMMIT] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutcommit,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTCOMMIT",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
+	},
+	[OP_LAYOUTRETURN] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutreturn,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTRETURN",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
+	},
+#endif /* CONFIG_NFSD_PNFS */
+
+	/* NFSv4.2 operations */
+	[OP_ALLOCATE] = {
+		.op_func = (nfsd4op_func)nfsd4_allocate,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_ALLOCATE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_DEALLOCATE] = {
+		.op_func = (nfsd4op_func)nfsd4_deallocate,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_DEALLOCATE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
+	[OP_SEEK] = {
+		.op_func = (nfsd4op_func)nfsd4_seek,
+		.op_name = "OP_SEEK",
+	},
+};
+
+int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	struct nfsd4_operation *opdesc;
+	nfsd4op_rsize estimator;
+
+	if (op->opnum == OP_ILLEGAL)
+		return op_encode_hdr_size * sizeof(__be32);
+	opdesc = OPDESC(op);
+	estimator = opdesc->op_rsize_bop;
+	return estimator ? estimator(rqstp, op) : PAGE_SIZE;
+}
+
+void warn_on_nonidempotent_op(struct nfsd4_op *op)
+{
+	if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) {
+		pr_err("unable to encode reply to nonidempotent op %d (%s)\n",
+			op->opnum, nfsd4_op_name(op->opnum));
+		WARN_ON_ONCE(1);
+	}
+}
+
+static const char *nfsd4_op_name(unsigned opnum)
+{
+	if (opnum < ARRAY_SIZE(nfsd4_ops))
+		return nfsd4_ops[opnum].op_name;
+	return "unknown_operation";
+}
+
+#define nfsd4_voidres			nfsd4_voidargs
+struct nfsd4_voidargs { int dummy; };
+
+static struct svc_procedure		nfsd_procedures4[2] = {
+	[NFSPROC4_NULL] = {
+		.pc_func = (svc_procfunc) nfsd4_proc_null,
+		.pc_encode = (kxdrproc_t) nfs4svc_encode_voidres,
+		.pc_argsize = sizeof(struct nfsd4_voidargs),
+		.pc_ressize = sizeof(struct nfsd4_voidres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = 1,
+	},
+	[NFSPROC4_COMPOUND] = {
+		.pc_func = (svc_procfunc) nfsd4_proc_compound,
+		.pc_decode = (kxdrproc_t) nfs4svc_decode_compoundargs,
+		.pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres,
+		.pc_argsize = sizeof(struct nfsd4_compoundargs),
+		.pc_ressize = sizeof(struct nfsd4_compoundres),
+		.pc_release = nfsd4_release_compoundargs,
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = NFSD_BUFSIZE/4,
+	},
+};
+
+struct svc_version	nfsd_version4 = {
+		.vs_vers	= 4,
+		.vs_nproc	= 2,
+		.vs_proc	= nfsd_procedures4,
+		.vs_dispatch	= nfsd_dispatch,
+		.vs_xdrsize	= NFS4_SVC_XDRSIZE,
+		.vs_rpcb_optnl	= 1,
+};
+
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/kernel/fs/nfsd/nfs4recover.c b/kernel/fs/nfsd/nfs4recover.c
new file mode 100644
index 000000000..d88ea7b9a
--- /dev/null
+++ b/kernel/fs/nfsd/nfs4recover.c
@@ -0,0 +1,1551 @@
+/*
+*  Copyright (c) 2004 The Regents of the University of Michigan.
+*  Copyright (c) 2012 Jeff Layton <jlayton@redhat.com>
+*  All rights reserved.
+*
+*  Andy Adamson <andros@citi.umich.edu>
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions
+*  are met:
+*
+*  1. Redistributions of source code must retain the above copyright
+*     notice, this list of conditions and the following disclaimer.
+*  2. Redistributions in binary form must reproduce the above copyright
+*     notice, this list of conditions and the following disclaimer in the
+*     documentation and/or other materials provided with the distribution.
+*  3. Neither the name of the University nor the names of its
+*     contributors may be used to endorse or promote products derived
+*     from this software without specific prior written permission.
+*
+*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+*  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+*  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+*  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+*  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+*  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+*  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+*  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/crypto.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfsd/cld.h>
+
+#include "nfsd.h"
+#include "state.h"
+#include "vfs.h"
+#include "netns.h"
+
+#define NFSDDBG_FACILITY                NFSDDBG_PROC
+
+/* Declarations */
+struct nfsd4_client_tracking_ops {
+	int (*init)(struct net *);
+	void (*exit)(struct net *);
+	void (*create)(struct nfs4_client *);
+	void (*remove)(struct nfs4_client *);
+	int (*check)(struct nfs4_client *);
+	void (*grace_done)(struct nfsd_net *);
+};
+
+/* Globals */
+static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
+
+static int
+nfs4_save_creds(const struct cred **original_creds)
+{
+	struct cred *new;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	new->fsuid = GLOBAL_ROOT_UID;
+	new->fsgid = GLOBAL_ROOT_GID;
+	*original_creds = override_creds(new);
+	put_cred(new);
+	return 0;
+}
+
+static void
+nfs4_reset_creds(const struct cred *original)
+{
+	revert_creds(original);
+}
+
+static void
+md5_to_hex(char *out, char *md5)
+{
+	int i;
+
+	for (i=0; i<16; i++) {
+		unsigned char c = md5[i];
+
+		*out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1);
+		*out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1);
+	}
+	*out = '\0';
+}
+
+static int
+nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
+{
+	struct xdr_netobj cksum;
+	struct hash_desc desc;
+	struct scatterlist sg;
+	int status;
+
+	dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
+			clname->len, clname->data);
+	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(desc.tfm)) {
+		status = PTR_ERR(desc.tfm);
+		goto out_no_tfm;
+	}
+
+	cksum.len = crypto_hash_digestsize(desc.tfm);
+	cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+	if (cksum.data == NULL) {
+		status = -ENOMEM;
+ 		goto out;
+	}
+
+	sg_init_one(&sg, clname->data, clname->len);
+
+	status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
+	if (status)
+		goto out;
+
+	md5_to_hex(dname, cksum.data);
+
+	status = 0;
+out:
+	kfree(cksum.data);
+	crypto_free_hash(desc.tfm);
+out_no_tfm:
+	return status;
+}
+
+/*
+ * If we had an error generating the recdir name for the legacy tracker
+ * then warn the admin. If the error doesn't appear to be transient,
+ * then disable recovery tracking.
+ */
+static void
+legacy_recdir_name_error(struct nfs4_client *clp, int error)
+{
+	printk(KERN_ERR "NFSD: unable to generate recoverydir "
+			"name (%d).\n", error);
+
+	/*
+	 * if the algorithm just doesn't exist, then disable the recovery
+	 * tracker altogether. The crypto libs will generally return this if
+	 * FIPS is enabled as well.
+	 */
+	if (error == -ENOENT) {
+		printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
+			"Reboot recovery will not function correctly!\n");
+		nfsd4_client_tracking_exit(clp->net);
+	}
+}
+
+static void
+nfsd4_create_clid_dir(struct nfs4_client *clp)
+{
+	const struct cred *original_cred;
+	char dname[HEXDIR_LEN];
+	struct dentry *dir, *dentry;
+	struct nfs4_client_reclaim *crp;
+	int status;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+	if (!nn->rec_file)
+		return;
+
+	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+	if (status)
+		return legacy_recdir_name_error(clp, status);
+
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		return;
+
+	status = mnt_want_write_file(nn->rec_file);
+	if (status)
+		goto out_creds;
+
+	dir = nn->rec_file->f_path.dentry;
+	/* lock the parent */
+	mutex_lock(&d_inode(dir)->i_mutex);
+
+	dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
+	if (IS_ERR(dentry)) {
+		status = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+	if (d_really_is_positive(dentry))
+		/*
+		 * In the 4.1 case, where we're called from
+		 * reclaim_complete(), records from the previous reboot
+		 * may still be left, so this is OK.
+		 *
+		 * In the 4.0 case, we should never get here; but we may
+		 * as well be forgiving and just succeed silently.
+		 */
+		goto out_put;
+	status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU);
+out_put:
+	dput(dentry);
+out_unlock:
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	if (status == 0) {
+		if (nn->in_grace) {
+			crp = nfs4_client_to_reclaim(dname, nn);
+			if (crp)
+				crp->cr_clp = clp;
+		}
+		vfs_fsync(nn->rec_file, 0);
+	} else {
+		printk(KERN_ERR "NFSD: failed to write recovery record"
+				" (err %d); please check that %s exists"
+				" and is writeable", status,
+				user_recovery_dirname);
+	}
+	mnt_drop_write_file(nn->rec_file);
+out_creds:
+	nfs4_reset_creds(original_cred);
+}
+
+typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *);
+
+struct name_list {
+	char name[HEXDIR_LEN];
+	struct list_head list;
+};
+
+struct nfs4_dir_ctx {
+	struct dir_context ctx;
+	struct list_head names;
+};
+
+static int
+nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
+		loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct nfs4_dir_ctx *ctx =
+		container_of(__ctx, struct nfs4_dir_ctx, ctx);
+	struct name_list *entry;
+
+	if (namlen != HEXDIR_LEN - 1)
+		return 0;
+	entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+	memcpy(entry->name, name, HEXDIR_LEN - 1);
+	entry->name[HEXDIR_LEN - 1] = '\0';
+	list_add(&entry->list, &ctx->names);
+	return 0;
+}
+
+static int
+nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
+{
+	const struct cred *original_cred;
+	struct dentry *dir = nn->rec_file->f_path.dentry;
+	struct nfs4_dir_ctx ctx = {
+		.ctx.actor = nfsd4_build_namelist,
+		.names = LIST_HEAD_INIT(ctx.names)
+	};
+	int status;
+
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		return status;
+
+	status = vfs_llseek(nn->rec_file, 0, SEEK_SET);
+	if (status < 0) {
+		nfs4_reset_creds(original_cred);
+		return status;
+	}
+
+	status = iterate_dir(nn->rec_file, &ctx.ctx);
+	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+	while (!list_empty(&ctx.names)) {
+		struct name_list *entry;
+		entry = list_entry(ctx.names.next, struct name_list, list);
+		if (!status) {
+			struct dentry *dentry;
+			dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+			if (IS_ERR(dentry)) {
+				status = PTR_ERR(dentry);
+				break;
+			}
+			status = f(dir, dentry, nn);
+			dput(dentry);
+		}
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	nfs4_reset_creds(original_cred);
+	return status;
+}
+
+static int
+nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
+{
+	struct dentry *dir, *dentry;
+	int status;
+
+	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
+
+	dir = nn->rec_file->f_path.dentry;
+	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, dir, namlen);
+	if (IS_ERR(dentry)) {
+		status = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+	status = -ENOENT;
+	if (d_really_is_negative(dentry))
+		goto out;
+	status = vfs_rmdir(d_inode(dir), dentry);
+out:
+	dput(dentry);
+out_unlock:
+	mutex_unlock(&d_inode(dir)->i_mutex);
+	return status;
+}
+
+static void
+nfsd4_remove_clid_dir(struct nfs4_client *clp)
+{
+	const struct cred *original_cred;
+	struct nfs4_client_reclaim *crp;
+	char dname[HEXDIR_LEN];
+	int status;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+	if (status)
+		return legacy_recdir_name_error(clp, status);
+
+	status = mnt_want_write_file(nn->rec_file);
+	if (status)
+		goto out;
+	clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		goto out_drop_write;
+
+	status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
+	nfs4_reset_creds(original_cred);
+	if (status == 0) {
+		vfs_fsync(nn->rec_file, 0);
+		if (nn->in_grace) {
+			/* remove reclaim record */
+			crp = nfsd4_find_reclaim_client(dname, nn);
+			if (crp)
+				nfs4_remove_reclaim_record(crp, nn);
+		}
+	}
+out_drop_write:
+	mnt_drop_write_file(nn->rec_file);
+out:
+	if (status)
+		printk("NFSD: Failed to remove expired client state directory"
+				" %.*s\n", HEXDIR_LEN, dname);
+}
+
+static int
+purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
+{
+	int status;
+
+	if (nfs4_has_reclaimed_state(child->d_name.name, nn))
+		return 0;
+
+	status = vfs_rmdir(d_inode(parent), child);
+	if (status)
+		printk("failed to remove client recovery directory %pd\n",
+				child);
+	/* Keep trying, success or failure: */
+	return 0;
+}
+
+static void
+nfsd4_recdir_purge_old(struct nfsd_net *nn)
+{
+	int status;
+
+	nn->in_grace = false;
+	if (!nn->rec_file)
+		return;
+	status = mnt_want_write_file(nn->rec_file);
+	if (status)
+		goto out;
+	status = nfsd4_list_rec_dir(purge_old, nn);
+	if (status == 0)
+		vfs_fsync(nn->rec_file, 0);
+	mnt_drop_write_file(nn->rec_file);
+out:
+	nfs4_release_reclaim(nn);
+	if (status)
+		printk("nfsd4: failed to purge old clients from recovery"
+			" directory %pD\n", nn->rec_file);
+}
+
+static int
+load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
+{
+	if (child->d_name.len != HEXDIR_LEN - 1) {
+		printk("nfsd4: illegal name %pd in recovery directory\n",
+				child);
+		/* Keep trying; maybe the others are OK: */
+		return 0;
+	}
+	nfs4_client_to_reclaim(child->d_name.name, nn);
+	return 0;
+}
+
+static int
+nfsd4_recdir_load(struct net *net) {
+	int status;
+	struct nfsd_net *nn =  net_generic(net, nfsd_net_id);
+
+	if (!nn->rec_file)
+		return 0;
+
+	status = nfsd4_list_rec_dir(load_recdir, nn);
+	if (status)
+		printk("nfsd4: failed loading clients from recovery"
+			" directory %pD\n", nn->rec_file);
+	return status;
+}
+
+/*
+ * Hold reference to the recovery directory.
+ */
+
+static int
+nfsd4_init_recdir(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	const struct cred *original_cred;
+	int status;
+
+	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
+			user_recovery_dirname);
+
+	BUG_ON(nn->rec_file);
+
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0) {
+		printk("NFSD: Unable to change credentials to find recovery"
+		       " directory: error %d\n",
+		       status);
+		return status;
+	}
+
+	nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
+	if (IS_ERR(nn->rec_file)) {
+		printk("NFSD: unable to find recovery directory %s\n",
+				user_recovery_dirname);
+		status = PTR_ERR(nn->rec_file);
+		nn->rec_file = NULL;
+	}
+
+	nfs4_reset_creds(original_cred);
+	if (!status)
+		nn->in_grace = true;
+	return status;
+}
+
+static void
+nfsd4_shutdown_recdir(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (!nn->rec_file)
+		return;
+	fput(nn->rec_file);
+	nn->rec_file = NULL;
+}
+
+static int
+nfs4_legacy_state_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int i;
+
+	nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) *
+					  CLIENT_HASH_SIZE, GFP_KERNEL);
+	if (!nn->reclaim_str_hashtbl)
+		return -ENOMEM;
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
+	nn->reclaim_str_hashtbl_size = 0;
+
+	return 0;
+}
+
+static void
+nfs4_legacy_state_shutdown(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	kfree(nn->reclaim_str_hashtbl);
+}
+
+static int
+nfsd4_load_reboot_recovery_data(struct net *net)
+{
+	int status;
+
+	status = nfsd4_init_recdir(net);
+	if (status)
+		return status;
+
+	status = nfsd4_recdir_load(net);
+	if (status)
+		nfsd4_shutdown_recdir(net);
+
+	return status;
+}
+
+static int
+nfsd4_legacy_tracking_init(struct net *net)
+{
+	int status;
+
+	/* XXX: The legacy code won't work in a container */
+	if (net != &init_net) {
+		WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
+			"tracking in a container!\n");
+		return -EINVAL;
+	}
+
+	status = nfs4_legacy_state_init(net);
+	if (status)
+		return status;
+
+	status = nfsd4_load_reboot_recovery_data(net);
+	if (status)
+		goto err;
+	return 0;
+
+err:
+	nfs4_legacy_state_shutdown(net);
+	return status;
+}
+
+static void
+nfsd4_legacy_tracking_exit(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	nfs4_release_reclaim(nn);
+	nfsd4_shutdown_recdir(net);
+	nfs4_legacy_state_shutdown(net);
+}
+
+/*
+ * Change the NFSv4 recovery directory to recdir.
+ */
+int
+nfs4_reset_recoverydir(char *recdir)
+{
+	int status;
+	struct path path;
+
+	status = kern_path(recdir, LOOKUP_FOLLOW, &path);
+	if (status)
+		return status;
+	status = -ENOTDIR;
+	if (d_is_dir(path.dentry)) {
+		strcpy(user_recovery_dirname, recdir);
+		status = 0;
+	}
+	path_put(&path);
+	return status;
+}
+
+char *
+nfs4_recoverydir(void)
+{
+	return user_recovery_dirname;
+}
+
+static int
+nfsd4_check_legacy_client(struct nfs4_client *clp)
+{
+	int status;
+	char dname[HEXDIR_LEN];
+	struct nfs4_client_reclaim *crp;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+	if (status) {
+		legacy_recdir_name_error(clp, status);
+		return status;
+	}
+
+	/* look for it in the reclaim hashtable otherwise */
+	crp = nfsd4_find_reclaim_client(dname, nn);
+	if (crp) {
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+		crp->cr_clp = clp;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
+	.init		= nfsd4_legacy_tracking_init,
+	.exit		= nfsd4_legacy_tracking_exit,
+	.create		= nfsd4_create_clid_dir,
+	.remove		= nfsd4_remove_clid_dir,
+	.check		= nfsd4_check_legacy_client,
+	.grace_done	= nfsd4_recdir_purge_old,
+};
+
+/* Globals */
+#define NFSD_PIPE_DIR		"nfsd"
+#define NFSD_CLD_PIPE		"cld"
+
+/* per-net-ns structure for holding cld upcall info */
+struct cld_net {
+	struct rpc_pipe		*cn_pipe;
+	spinlock_t		 cn_lock;
+	struct list_head	 cn_list;
+	unsigned int		 cn_xid;
+};
+
+struct cld_upcall {
+	struct list_head	 cu_list;
+	struct cld_net		*cu_net;
+	struct task_struct	*cu_task;
+	struct cld_msg		 cu_msg;
+};
+
+static int
+__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+{
+	int ret;
+	struct rpc_pipe_msg msg;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.data = cmsg;
+	msg.len = sizeof(*cmsg);
+
+	/*
+	 * Set task state before we queue the upcall. That prevents
+	 * wake_up_process in the downcall from racing with schedule.
+	 */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	ret = rpc_queue_upcall(pipe, &msg);
+	if (ret < 0) {
+		set_current_state(TASK_RUNNING);
+		goto out;
+	}
+
+	schedule();
+
+	if (msg.errno < 0)
+		ret = msg.errno;
+out:
+	return ret;
+}
+
+static int
+cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+{
+	int ret;
+
+	/*
+	 * -EAGAIN occurs when pipe is closed and reopened while there are
+	 *  upcalls queued.
+	 */
+	do {
+		ret = __cld_pipe_upcall(pipe, cmsg);
+	} while (ret == -EAGAIN);
+
+	return ret;
+}
+
+static ssize_t
+cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+	struct cld_upcall *tmp, *cup;
+	struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
+	uint32_t xid;
+	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
+						nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+
+	if (mlen != sizeof(*cmsg)) {
+		dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
+			sizeof(*cmsg));
+		return -EINVAL;
+	}
+
+	/* copy just the xid so we can try to find that */
+	if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) {
+		dprintk("%s: error when copying xid from userspace", __func__);
+		return -EFAULT;
+	}
+
+	/* walk the list and find corresponding xid */
+	cup = NULL;
+	spin_lock(&cn->cn_lock);
+	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
+		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
+			cup = tmp;
+			list_del_init(&cup->cu_list);
+			break;
+		}
+	}
+	spin_unlock(&cn->cn_lock);
+
+	/* couldn't find upcall? */
+	if (!cup) {
+		dprintk("%s: couldn't find upcall -- xid=%u\n", __func__, xid);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
+		return -EFAULT;
+
+	wake_up_process(cup->cu_task);
+	return mlen;
+}
+
+static void
+cld_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct cld_msg *cmsg = msg->data;
+	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall,
+						 cu_msg);
+
+	/* errno >= 0 means we got a downcall */
+	if (msg->errno >= 0)
+		return;
+
+	wake_up_process(cup->cu_task);
+}
+
+static const struct rpc_pipe_ops cld_upcall_ops = {
+	.upcall		= rpc_pipe_generic_upcall,
+	.downcall	= cld_pipe_downcall,
+	.destroy_msg	= cld_pipe_destroy_msg,
+};
+
+static struct dentry *
+nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe)
+{
+	struct dentry *dir, *dentry;
+
+	dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR);
+	if (dir == NULL)
+		return ERR_PTR(-ENOENT);
+	dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe);
+	dput(dir);
+	return dentry;
+}
+
+static void
+nfsd4_cld_unregister_sb(struct rpc_pipe *pipe)
+{
+	if (pipe->dentry)
+		rpc_unlink(pipe->dentry);
+}
+
+static struct dentry *
+nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe)
+{
+	struct super_block *sb;
+	struct dentry *dentry;
+
+	sb = rpc_get_sb_net(net);
+	if (!sb)
+		return NULL;
+	dentry = nfsd4_cld_register_sb(sb, pipe);
+	rpc_put_sb_net(net);
+	return dentry;
+}
+
+static void
+nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe)
+{
+	struct super_block *sb;
+
+	sb = rpc_get_sb_net(net);
+	if (sb) {
+		nfsd4_cld_unregister_sb(pipe);
+		rpc_put_sb_net(net);
+	}
+}
+
+/* Initialize rpc_pipefs pipe for communication with client tracking daemon */
+static int
+nfsd4_init_cld_pipe(struct net *net)
+{
+	int ret;
+	struct dentry *dentry;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct cld_net *cn;
+
+	if (nn->cld_net)
+		return 0;
+
+	cn = kzalloc(sizeof(*cn), GFP_KERNEL);
+	if (!cn) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	cn->cn_pipe = rpc_mkpipe_data(&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
+	if (IS_ERR(cn->cn_pipe)) {
+		ret = PTR_ERR(cn->cn_pipe);
+		goto err;
+	}
+	spin_lock_init(&cn->cn_lock);
+	INIT_LIST_HEAD(&cn->cn_list);
+
+	dentry = nfsd4_cld_register_net(net, cn->cn_pipe);
+	if (IS_ERR(dentry)) {
+		ret = PTR_ERR(dentry);
+		goto err_destroy_data;
+	}
+
+	cn->cn_pipe->dentry = dentry;
+	nn->cld_net = cn;
+	return 0;
+
+err_destroy_data:
+	rpc_destroy_pipe_data(cn->cn_pipe);
+err:
+	kfree(cn);
+	printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n",
+			ret);
+	return ret;
+}
+
+static void
+nfsd4_remove_cld_pipe(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+
+	nfsd4_cld_unregister_net(net, cn->cn_pipe);
+	rpc_destroy_pipe_data(cn->cn_pipe);
+	kfree(nn->cld_net);
+	nn->cld_net = NULL;
+}
+
+static struct cld_upcall *
+alloc_cld_upcall(struct cld_net *cn)
+{
+	struct cld_upcall *new, *tmp;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return new;
+
+	/* FIXME: hard cap on number in flight? */
+restart_search:
+	spin_lock(&cn->cn_lock);
+	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
+		if (tmp->cu_msg.cm_xid == cn->cn_xid) {
+			cn->cn_xid++;
+			spin_unlock(&cn->cn_lock);
+			goto restart_search;
+		}
+	}
+	new->cu_task = current;
+	new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
+	put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid);
+	new->cu_net = cn;
+	list_add(&new->cu_list, &cn->cn_list);
+	spin_unlock(&cn->cn_lock);
+
+	dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid);
+
+	return new;
+}
+
+static void
+free_cld_upcall(struct cld_upcall *victim)
+{
+	struct cld_net *cn = victim->cu_net;
+
+	spin_lock(&cn->cn_lock);
+	list_del(&victim->cu_list);
+	spin_unlock(&cn->cn_lock);
+	kfree(victim);
+}
+
+/* Ask daemon to create a new record */
+static void
+nfsd4_cld_create(struct nfs4_client *clp)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+
+	/* Don't upcall if it's already stored */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	cup = alloc_cld_upcall(cn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cup->cu_msg.cm_cmd = Cld_Create;
+	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+			clp->cl_name.len);
+
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	if (!ret) {
+		ret = cup->cu_msg.cm_status;
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		printk(KERN_ERR "NFSD: Unable to create client "
+				"record on stable storage: %d\n", ret);
+}
+
+/* Ask daemon to create a new record */
+static void
+nfsd4_cld_remove(struct nfs4_client *clp)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+
+	/* Don't upcall if it's already removed */
+	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	cup = alloc_cld_upcall(cn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cup->cu_msg.cm_cmd = Cld_Remove;
+	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+			clp->cl_name.len);
+
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	if (!ret) {
+		ret = cup->cu_msg.cm_status;
+		clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		printk(KERN_ERR "NFSD: Unable to remove client "
+				"record from stable storage: %d\n", ret);
+}
+
+/* Check for presence of a record, and update its timestamp */
+static int
+nfsd4_cld_check(struct nfs4_client *clp)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+
+	/* Don't upcall if one was already stored during this grace pd */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	cup = alloc_cld_upcall(cn);
+	if (!cup) {
+		printk(KERN_ERR "NFSD: Unable to check client record on "
+				"stable storage: %d\n", -ENOMEM);
+		return -ENOMEM;
+	}
+
+	cup->cu_msg.cm_cmd = Cld_Check;
+	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+			clp->cl_name.len);
+
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	if (!ret) {
+		ret = cup->cu_msg.cm_status;
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+
+	free_cld_upcall(cup);
+	return ret;
+}
+
+static void
+nfsd4_cld_grace_done(struct nfsd_net *nn)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct cld_net *cn = nn->cld_net;
+
+	cup = alloc_cld_upcall(cn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cup->cu_msg.cm_cmd = Cld_GraceDone;
+	cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	if (!ret)
+		ret = cup->cu_msg.cm_status;
+
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
+}
+
+static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+	.init		= nfsd4_init_cld_pipe,
+	.exit		= nfsd4_remove_cld_pipe,
+	.create		= nfsd4_cld_create,
+	.remove		= nfsd4_cld_remove,
+	.check		= nfsd4_cld_check,
+	.grace_done	= nfsd4_cld_grace_done,
+};
+
+/* upcall via usermodehelper */
+static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
+module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
+			S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program");
+
+static bool cltrack_legacy_disable;
+module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_legacy_disable,
+		"Disable legacy recoverydir conversion. Default: false");
+
+#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
+#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
+#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
+
+static char *
+nfsd4_cltrack_legacy_topdir(void)
+{
+	int copied;
+	size_t len;
+	char *result;
+
+	if (cltrack_legacy_disable)
+		return NULL;
+
+	len = strlen(LEGACY_TOPDIR_ENV_PREFIX) +
+		strlen(nfs4_recoverydir()) + 1;
+
+	result = kmalloc(len, GFP_KERNEL);
+	if (!result)
+		return result;
+
+	copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s",
+				nfs4_recoverydir());
+	if (copied >= len) {
+		/* just return nothing if output was truncated */
+		kfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+static char *
+nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
+{
+	int copied;
+	size_t len;
+	char *result;
+
+	if (cltrack_legacy_disable)
+		return NULL;
+
+	/* +1 is for '/' between "topdir" and "recdir" */
+	len = strlen(LEGACY_RECDIR_ENV_PREFIX) +
+		strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN;
+
+	result = kmalloc(len, GFP_KERNEL);
+	if (!result)
+		return result;
+
+	copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/",
+				nfs4_recoverydir());
+	if (copied > (len - HEXDIR_LEN)) {
+		/* just return nothing if output will be truncated */
+		kfree(result);
+		return NULL;
+	}
+
+	copied = nfs4_make_rec_clidname(result + copied, name);
+	if (copied) {
+		kfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+static char *
+nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
+{
+	int copied;
+	size_t len;
+	char *result;
+
+	/* prefix + Y/N character + terminating NULL */
+	len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1;
+
+	result = kmalloc(len, GFP_KERNEL);
+	if (!result)
+		return result;
+
+	copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c",
+				clp->cl_minorversion ? 'Y' : 'N');
+	if (copied >= len) {
+		/* just return nothing if output was truncated */
+		kfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+static char *
+nfsd4_cltrack_grace_start(time_t grace_start)
+{
+	int copied;
+	size_t len;
+	char *result;
+
+	/* prefix + max width of int64_t string + terminating NULL */
+	len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
+
+	result = kmalloc(len, GFP_KERNEL);
+	if (!result)
+		return result;
+
+	copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
+				grace_start);
+	if (copied >= len) {
+		/* just return nothing if output was truncated */
+		kfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+static int
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
+{
+	char *envp[3];
+	char *argv[4];
+	int ret;
+
+	if (unlikely(!cltrack_prog[0])) {
+		dprintk("%s: cltrack_prog is disabled\n", __func__);
+		return -EACCES;
+	}
+
+	dprintk("%s: cmd: %s\n", __func__, cmd);
+	dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
+	dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
+	dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
+
+	envp[0] = env0;
+	envp[1] = env1;
+	envp[2] = NULL;
+
+	argv[0] = (char *)cltrack_prog;
+	argv[1] = cmd;
+	argv[2] = arg;
+	argv[3] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	/*
+	 * Disable the upcall mechanism if we're getting an ENOENT or EACCES
+	 * error. The admin can re-enable it on the fly by using sysfs
+	 * once the problem has been fixed.
+	 */
+	if (ret == -ENOENT || ret == -EACCES) {
+		dprintk("NFSD: %s was not found or isn't executable (%d). "
+			"Setting cltrack_prog to blank string!",
+			cltrack_prog, ret);
+		cltrack_prog[0] = '\0';
+	}
+	dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret);
+
+	return ret;
+}
+
+static char *
+bin_to_hex_dup(const unsigned char *src, int srclen)
+{
+	int i;
+	char *buf, *hex;
+
+	/* +1 for terminating NULL */
+	buf = kmalloc((srclen * 2) + 1, GFP_KERNEL);
+	if (!buf)
+		return buf;
+
+	hex = buf;
+	for (i = 0; i < srclen; i++) {
+		sprintf(hex, "%2.2x", *src++);
+		hex += 2;
+	}
+	return buf;
+}
+
+static int
+nfsd4_umh_cltrack_init(struct net *net)
+{
+	int ret;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
+	/* XXX: The usermode helper s not working in container yet. */
+	if (net != &init_net) {
+		WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
+			"tracking in a container!\n");
+		return -EINVAL;
+	}
+
+	ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
+	kfree(grace_start);
+	return ret;
+}
+
+static void
+nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
+{
+	wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
+			 TASK_UNINTERRUPTIBLE);
+}
+
+static void
+nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
+{
+	smp_mb__before_atomic();
+	clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
+}
+
+static void
+nfsd4_umh_cltrack_create(struct nfs4_client *clp)
+{
+	char *hexid, *has_session, *grace_start;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	/*
+	 * With v4.0 clients, there's little difference in outcome between a
+	 * create and check operation, and we can end up calling into this
+	 * function multiple times per client (once for each openowner). So,
+	 * for v4.0 clients skip upcalling once the client has been recorded
+	 * on stable storage.
+	 *
+	 * For v4.1+ clients, the outcome of the two operations is different,
+	 * so we must ensure that we upcall for the create operation. v4.1+
+	 * clients call this on RECLAIM_COMPLETE though, so we should only end
+	 * up doing a single create upcall per client.
+	 */
+	if (clp->cl_minorversion == 0 &&
+	    test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!hexid) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return;
+	}
+
+	has_session = nfsd4_cltrack_client_has_session(clp);
+	grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
+	nfsd4_cltrack_upcall_lock(clp);
+	if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start))
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	nfsd4_cltrack_upcall_unlock(clp);
+
+	kfree(has_session);
+	kfree(grace_start);
+	kfree(hexid);
+}
+
+static void
+nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
+{
+	char *hexid;
+
+	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!hexid) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return;
+	}
+
+	nfsd4_cltrack_upcall_lock(clp);
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
+	    nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
+		clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	nfsd4_cltrack_upcall_unlock(clp);
+
+	kfree(hexid);
+}
+
+static int
+nfsd4_umh_cltrack_check(struct nfs4_client *clp)
+{
+	int ret;
+	char *hexid, *has_session, *legacy;
+
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!hexid) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return -ENOMEM;
+	}
+
+	has_session = nfsd4_cltrack_client_has_session(clp);
+	legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
+
+	nfsd4_cltrack_upcall_lock(clp);
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
+		ret = 0;
+	} else {
+		ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy);
+		if (ret == 0)
+			set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+	nfsd4_cltrack_upcall_unlock(clp);
+	kfree(has_session);
+	kfree(legacy);
+	kfree(hexid);
+
+	return ret;
+}
+
+static void
+nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
+{
+	char *legacy;
+	char timestr[22]; /* FIXME: better way to determine max size? */
+
+	sprintf(timestr, "%ld", nn->boot_time);
+	legacy = nfsd4_cltrack_legacy_topdir();
+	nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
+	kfree(legacy);
+}
+
+static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+	.init		= nfsd4_umh_cltrack_init,
+	.exit		= NULL,
+	.create		= nfsd4_umh_cltrack_create,
+	.remove		= nfsd4_umh_cltrack_remove,
+	.check		= nfsd4_umh_cltrack_check,
+	.grace_done	= nfsd4_umh_cltrack_grace_done,
+};
+
+int
+nfsd4_client_tracking_init(struct net *net)
+{
+	int status;
+	struct path path;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	/* just run the init if it the method is already decided */
+	if (nn->client_tracking_ops)
+		goto do_init;
+
+	/*
+	 * First, try a UMH upcall. It should succeed or fail quickly, so
+	 * there's little harm in trying that first.
+	 */
+	nn->client_tracking_ops = &nfsd4_umh_tracking_ops;
+	status = nn->client_tracking_ops->init(net);
+	if (!status)
+		return status;
+
+	/*
+	 * See if the recoverydir exists and is a directory. If it is,
+	 * then use the legacy ops.
+	 */
+	nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
+	status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+	if (!status) {
+		status = d_is_dir(path.dentry);
+		path_put(&path);
+		if (status)
+			goto do_init;
+	}
+
+	/* Finally, try to use nfsdcld */
+	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
+			"removed in 3.10. Please transition to using "
+			"nfsdcltrack.\n");
+do_init:
+	status = nn->client_tracking_ops->init(net);
+	if (status) {
+		printk(KERN_WARNING "NFSD: Unable to initialize client "
+				    "recovery tracking! (%d)\n", status);
+		nn->client_tracking_ops = NULL;
+	}
+	return status;
+}
+
+void
+nfsd4_client_tracking_exit(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (nn->client_tracking_ops) {
+		if (nn->client_tracking_ops->exit)
+			nn->client_tracking_ops->exit(net);
+		nn->client_tracking_ops = NULL;
+	}
+}
+
+void
+nfsd4_client_record_create(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (nn->client_tracking_ops)
+		nn->client_tracking_ops->create(clp);
+}
+
+void
+nfsd4_client_record_remove(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (nn->client_tracking_ops)
+		nn->client_tracking_ops->remove(clp);
+}
+
+int
+nfsd4_client_record_check(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (nn->client_tracking_ops)
+		return nn->client_tracking_ops->check(clp);
+
+	return -EOPNOTSUPP;
+}
+
+void
+nfsd4_record_grace_done(struct nfsd_net *nn)
+{
+	if (nn->client_tracking_ops)
+		nn->client_tracking_ops->grace_done(nn);
+}
+
+static int
+rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct net *net = sb->s_fs_info;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	struct dentry *dentry;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	if (!cn) {
+		module_put(THIS_MODULE);
+		return 0;
+	}
+
+	switch (event) {
+	case RPC_PIPEFS_MOUNT:
+		dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe);
+		if (IS_ERR(dentry)) {
+			ret = PTR_ERR(dentry);
+			break;
+		}
+		cn->cn_pipe->dentry = dentry;
+		break;
+	case RPC_PIPEFS_UMOUNT:
+		if (cn->cn_pipe->dentry)
+			nfsd4_cld_unregister_sb(cn->cn_pipe);
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
+	}
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static struct notifier_block nfsd4_cld_block = {
+	.notifier_call = rpc_pipefs_event,
+};
+
+int
+register_cld_notifier(void)
+{
+	return rpc_pipefs_notifier_register(&nfsd4_cld_block);
+}
+
+void
+unregister_cld_notifier(void)
+{
+	rpc_pipefs_notifier_unregister(&nfsd4_cld_block);
+}
diff --git a/kernel/fs/nfsd/nfs4state.c b/kernel/fs/nfsd/nfs4state.c
new file mode 100644
index 000000000..039f9c8a9
--- /dev/null
+++ b/kernel/fs/nfsd/nfs4state.c
@@ -0,0 +1,6716 @@
+/*
+*  Copyright (c) 2001 The Regents of the University of Michigan.
+*  All rights reserved.
+*
+*  Kendrick Smith <kmsmith@umich.edu>
+*  Andy Adamson <kandros@umich.edu>
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions
+*  are met:
+*
+*  1. Redistributions of source code must retain the above copyright
+*     notice, this list of conditions and the following disclaimer.
+*  2. Redistributions in binary form must reproduce the above copyright
+*     notice, this list of conditions and the following disclaimer in the
+*     documentation and/or other materials provided with the distribution.
+*  3. Neither the name of the University nor the names of its
+*     contributors may be used to endorse or promote products derived
+*     from this software without specific prior written permission.
+*
+*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+*  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+*  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+*  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+*  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+*  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+*  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+*  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/jhash.h>
+#include "xdr4.h"
+#include "xdr4cb.h"
+#include "vfs.h"
+#include "current_stateid.h"
+
+#include "netns.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY                NFSDDBG_PROC
+
+#define all_ones {{~0,~0},~0}
+static const stateid_t one_stateid = {
+	.si_generation = ~0,
+	.si_opaque = all_ones,
+};
+static const stateid_t zero_stateid = {
+	/* all fields zero */
+};
+static const stateid_t currentstateid = {
+	.si_generation = 1,
+};
+
+static u64 current_sessionid = 1;
+
+#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t)))
+#define ONE_STATEID(stateid)  (!memcmp((stateid), &one_stateid, sizeof(stateid_t)))
+#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t)))
+
+/* forward declarations */
+static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
+static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
+
+/* Locking: */
+
+/*
+ * Currently used for the del_recall_lru and file hash table.  In an
+ * effort to decrease the scope of the client_mutex, this spinlock may
+ * eventually cover more:
+ */
+static DEFINE_SPINLOCK(state_lock);
+
+/*
+ * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for
+ * the refcount on the open stateid to drop.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(close_wq);
+
+static struct kmem_cache *openowner_slab;
+static struct kmem_cache *lockowner_slab;
+static struct kmem_cache *file_slab;
+static struct kmem_cache *stateid_slab;
+static struct kmem_cache *deleg_slab;
+static struct kmem_cache *odstate_slab;
+
+static void free_session(struct nfsd4_session *);
+
+static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+
+static bool is_session_dead(struct nfsd4_session *ses)
+{
+	return ses->se_flags & NFS4_SESSION_DEAD;
+}
+
+static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
+{
+	if (atomic_read(&ses->se_ref) > ref_held_by_me)
+		return nfserr_jukebox;
+	ses->se_flags |= NFS4_SESSION_DEAD;
+	return nfs_ok;
+}
+
+static bool is_client_expired(struct nfs4_client *clp)
+{
+	return clp->cl_time == 0;
+}
+
+static __be32 get_client_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	if (is_client_expired(clp))
+		return nfserr_expired;
+	atomic_inc(&clp->cl_refcount);
+	return nfs_ok;
+}
+
+/* must be called under the client_lock */
+static inline void
+renew_client_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (is_client_expired(clp)) {
+		WARN_ON(1);
+		printk("%s: client (clientid %08x/%08x) already expired\n",
+			__func__,
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+		return;
+	}
+
+	dprintk("renewing client (clientid %08x/%08x)\n",
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+	list_move_tail(&clp->cl_lru, &nn->client_lru);
+	clp->cl_time = get_seconds();
+}
+
+static void put_client_renew_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	if (!atomic_dec_and_test(&clp->cl_refcount))
+		return;
+	if (!is_client_expired(clp))
+		renew_client_locked(clp);
+}
+
+static void put_client_renew(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
+		return;
+	if (!is_client_expired(clp))
+		renew_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+}
+
+static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
+{
+	__be32 status;
+
+	if (is_session_dead(ses))
+		return nfserr_badsession;
+	status = get_client_locked(ses->se_client);
+	if (status)
+		return status;
+	atomic_inc(&ses->se_ref);
+	return nfs_ok;
+}
+
+static void nfsd4_put_session_locked(struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
+		free_session(ses);
+	put_client_renew_locked(clp);
+}
+
+static void nfsd4_put_session(struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
+	nfsd4_put_session_locked(ses);
+	spin_unlock(&nn->client_lock);
+}
+
+static inline struct nfs4_stateowner *
+nfs4_get_stateowner(struct nfs4_stateowner *sop)
+{
+	atomic_inc(&sop->so_count);
+	return sop;
+}
+
+static int
+same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner)
+{
+	return (sop->so_owner.len == owner->len) &&
+		0 == memcmp(sop->so_owner.data, owner->data, owner->len);
+}
+
+static struct nfs4_openowner *
+find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open,
+			struct nfs4_client *clp)
+{
+	struct nfs4_stateowner *so;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval],
+			    so_strhash) {
+		if (!so->so_is_open_owner)
+			continue;
+		if (same_owner_str(so, &open->op_owner))
+			return openowner(nfs4_get_stateowner(so));
+	}
+	return NULL;
+}
+
+static struct nfs4_openowner *
+find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
+			struct nfs4_client *clp)
+{
+	struct nfs4_openowner *oo;
+
+	spin_lock(&clp->cl_lock);
+	oo = find_openstateowner_str_locked(hashval, open, clp);
+	spin_unlock(&clp->cl_lock);
+	return oo;
+}
+
+static inline u32
+opaque_hashval(const void *ptr, int nbytes)
+{
+	unsigned char *cptr = (unsigned char *) ptr;
+
+	u32 x = 0;
+	while (nbytes--) {
+		x *= 37;
+		x += *cptr++;
+	}
+	return x;
+}
+
+static void nfsd4_free_file_rcu(struct rcu_head *rcu)
+{
+	struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu);
+
+	kmem_cache_free(file_slab, fp);
+}
+
+void
+put_nfs4_file(struct nfs4_file *fi)
+{
+	might_lock(&state_lock);
+
+	if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
+		hlist_del_rcu(&fi->fi_hash);
+		spin_unlock(&state_lock);
+		WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
+		WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
+		call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
+	}
+}
+
+static struct file *
+__nfs4_get_fd(struct nfs4_file *f, int oflag)
+{
+	if (f->fi_fds[oflag])
+		return get_file(f->fi_fds[oflag]);
+	return NULL;
+}
+
+static struct file *
+find_writeable_file_locked(struct nfs4_file *f)
+{
+	struct file *ret;
+
+	lockdep_assert_held(&f->fi_lock);
+
+	ret = __nfs4_get_fd(f, O_WRONLY);
+	if (!ret)
+		ret = __nfs4_get_fd(f, O_RDWR);
+	return ret;
+}
+
+static struct file *
+find_writeable_file(struct nfs4_file *f)
+{
+	struct file *ret;
+
+	spin_lock(&f->fi_lock);
+	ret = find_writeable_file_locked(f);
+	spin_unlock(&f->fi_lock);
+
+	return ret;
+}
+
+static struct file *find_readable_file_locked(struct nfs4_file *f)
+{
+	struct file *ret;
+
+	lockdep_assert_held(&f->fi_lock);
+
+	ret = __nfs4_get_fd(f, O_RDONLY);
+	if (!ret)
+		ret = __nfs4_get_fd(f, O_RDWR);
+	return ret;
+}
+
+static struct file *
+find_readable_file(struct nfs4_file *f)
+{
+	struct file *ret;
+
+	spin_lock(&f->fi_lock);
+	ret = find_readable_file_locked(f);
+	spin_unlock(&f->fi_lock);
+
+	return ret;
+}
+
+struct file *
+find_any_file(struct nfs4_file *f)
+{
+	struct file *ret;
+
+	spin_lock(&f->fi_lock);
+	ret = __nfs4_get_fd(f, O_RDWR);
+	if (!ret) {
+		ret = __nfs4_get_fd(f, O_WRONLY);
+		if (!ret)
+			ret = __nfs4_get_fd(f, O_RDONLY);
+	}
+	spin_unlock(&f->fi_lock);
+	return ret;
+}
+
+static atomic_long_t num_delegations;
+unsigned long max_delegations;
+
+/*
+ * Open owner state (share locks)
+ */
+
+/* hash tables for lock and open owners */
+#define OWNER_HASH_BITS              8
+#define OWNER_HASH_SIZE             (1 << OWNER_HASH_BITS)
+#define OWNER_HASH_MASK             (OWNER_HASH_SIZE - 1)
+
+static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
+{
+	unsigned int ret;
+
+	ret = opaque_hashval(ownername->data, ownername->len);
+	return ret & OWNER_HASH_MASK;
+}
+
+/* hash table for nfs4_file */
+#define FILE_HASH_BITS                   8
+#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
+
+static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh)
+{
+	return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0);
+}
+
+static unsigned int file_hashval(struct knfsd_fh *fh)
+{
+	return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
+}
+
+static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+
+static void
+__nfs4_file_get_access(struct nfs4_file *fp, u32 access)
+{
+	lockdep_assert_held(&fp->fi_lock);
+
+	if (access & NFS4_SHARE_ACCESS_WRITE)
+		atomic_inc(&fp->fi_access[O_WRONLY]);
+	if (access & NFS4_SHARE_ACCESS_READ)
+		atomic_inc(&fp->fi_access[O_RDONLY]);
+}
+
+static __be32
+nfs4_file_get_access(struct nfs4_file *fp, u32 access)
+{
+	lockdep_assert_held(&fp->fi_lock);
+
+	/* Does this access mode make sense? */
+	if (access & ~NFS4_SHARE_ACCESS_BOTH)
+		return nfserr_inval;
+
+	/* Does it conflict with a deny mode already set? */
+	if ((access & fp->fi_share_deny) != 0)
+		return nfserr_share_denied;
+
+	__nfs4_file_get_access(fp, access);
+	return nfs_ok;
+}
+
+static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny)
+{
+	/* Common case is that there is no deny mode. */
+	if (deny) {
+		/* Does this deny mode make sense? */
+		if (deny & ~NFS4_SHARE_DENY_BOTH)
+			return nfserr_inval;
+
+		if ((deny & NFS4_SHARE_DENY_READ) &&
+		    atomic_read(&fp->fi_access[O_RDONLY]))
+			return nfserr_share_denied;
+
+		if ((deny & NFS4_SHARE_DENY_WRITE) &&
+		    atomic_read(&fp->fi_access[O_WRONLY]))
+			return nfserr_share_denied;
+	}
+	return nfs_ok;
+}
+
+static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+{
+	might_lock(&fp->fi_lock);
+
+	if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) {
+		struct file *f1 = NULL;
+		struct file *f2 = NULL;
+
+		swap(f1, fp->fi_fds[oflag]);
+		if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
+			swap(f2, fp->fi_fds[O_RDWR]);
+		spin_unlock(&fp->fi_lock);
+		if (f1)
+			fput(f1);
+		if (f2)
+			fput(f2);
+	}
+}
+
+static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
+{
+	WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH);
+
+	if (access & NFS4_SHARE_ACCESS_WRITE)
+		__nfs4_file_put_access(fp, O_WRONLY);
+	if (access & NFS4_SHARE_ACCESS_READ)
+		__nfs4_file_put_access(fp, O_RDONLY);
+}
+
+/*
+ * Allocate a new open/delegation state counter. This is needed for
+ * pNFS for proper return on close semantics.
+ *
+ * Note that we only allocate it for pNFS-enabled exports, otherwise
+ * all pointers to struct nfs4_clnt_odstate are always NULL.
+ */
+static struct nfs4_clnt_odstate *
+alloc_clnt_odstate(struct nfs4_client *clp)
+{
+	struct nfs4_clnt_odstate *co;
+
+	co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL);
+	if (co) {
+		co->co_client = clp;
+		atomic_set(&co->co_odcount, 1);
+	}
+	return co;
+}
+
+static void
+hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co)
+{
+	struct nfs4_file *fp = co->co_file;
+
+	lockdep_assert_held(&fp->fi_lock);
+	list_add(&co->co_perfile, &fp->fi_clnt_odstate);
+}
+
+static inline void
+get_clnt_odstate(struct nfs4_clnt_odstate *co)
+{
+	if (co)
+		atomic_inc(&co->co_odcount);
+}
+
+static void
+put_clnt_odstate(struct nfs4_clnt_odstate *co)
+{
+	struct nfs4_file *fp;
+
+	if (!co)
+		return;
+
+	fp = co->co_file;
+	if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) {
+		list_del(&co->co_perfile);
+		spin_unlock(&fp->fi_lock);
+
+		nfsd4_return_all_file_layouts(co->co_client, fp);
+		kmem_cache_free(odstate_slab, co);
+	}
+}
+
+static struct nfs4_clnt_odstate *
+find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new)
+{
+	struct nfs4_clnt_odstate *co;
+	struct nfs4_client *cl;
+
+	if (!new)
+		return NULL;
+
+	cl = new->co_client;
+
+	spin_lock(&fp->fi_lock);
+	list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) {
+		if (co->co_client == cl) {
+			get_clnt_odstate(co);
+			goto out;
+		}
+	}
+	co = new;
+	co->co_file = fp;
+	hash_clnt_odstate_locked(new);
+out:
+	spin_unlock(&fp->fi_lock);
+	return co;
+}
+
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+					 struct kmem_cache *slab)
+{
+	struct nfs4_stid *stid;
+	int new_id;
+
+	stid = kmem_cache_zalloc(slab, GFP_KERNEL);
+	if (!stid)
+		return NULL;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&cl->cl_lock);
+	new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT);
+	spin_unlock(&cl->cl_lock);
+	idr_preload_end();
+	if (new_id < 0)
+		goto out_free;
+	stid->sc_client = cl;
+	stid->sc_stateid.si_opaque.so_id = new_id;
+	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
+	/* Will be incremented before return to client: */
+	atomic_set(&stid->sc_count, 1);
+
+	/*
+	 * It shouldn't be a problem to reuse an opaque stateid value.
+	 * I don't think it is for 4.1.  But with 4.0 I worry that, for
+	 * example, a stray write retransmission could be accepted by
+	 * the server when it should have been rejected.  Therefore,
+	 * adopt a trick from the sctp code to attempt to maximize the
+	 * amount of time until an id is reused, by ensuring they always
+	 * "increase" (mod INT_MAX):
+	 */
+	return stid;
+out_free:
+	kmem_cache_free(slab, stid);
+	return NULL;
+}
+
+static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
+{
+	struct nfs4_stid *stid;
+	struct nfs4_ol_stateid *stp;
+
+	stid = nfs4_alloc_stid(clp, stateid_slab);
+	if (!stid)
+		return NULL;
+
+	stp = openlockstateid(stid);
+	stp->st_stid.sc_free = nfs4_free_ol_stateid;
+	return stp;
+}
+
+static void nfs4_free_deleg(struct nfs4_stid *stid)
+{
+	kmem_cache_free(deleg_slab, stid);
+	atomic_long_dec(&num_delegations);
+}
+
+/*
+ * When we recall a delegation, we should be careful not to hand it
+ * out again straight away.
+ * To ensure this we keep a pair of bloom filters ('new' and 'old')
+ * in which the filehandles of recalled delegations are "stored".
+ * If a filehandle appear in either filter, a delegation is blocked.
+ * When a delegation is recalled, the filehandle is stored in the "new"
+ * filter.
+ * Every 30 seconds we swap the filters and clear the "new" one,
+ * unless both are empty of course.
+ *
+ * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
+ * low 3 bytes as hash-table indices.
+ *
+ * 'blocked_delegations_lock', which is always taken in block_delegations(),
+ * is used to manage concurrent access.  Testing does not need the lock
+ * except when swapping the two filters.
+ */
+static DEFINE_SPINLOCK(blocked_delegations_lock);
+static struct bloom_pair {
+	int	entries, old_entries;
+	time_t	swap_time;
+	int	new; /* index into 'set' */
+	DECLARE_BITMAP(set[2], 256);
+} blocked_delegations;
+
+static int delegation_blocked(struct knfsd_fh *fh)
+{
+	u32 hash;
+	struct bloom_pair *bd = &blocked_delegations;
+
+	if (bd->entries == 0)
+		return 0;
+	if (seconds_since_boot() - bd->swap_time > 30) {
+		spin_lock(&blocked_delegations_lock);
+		if (seconds_since_boot() - bd->swap_time > 30) {
+			bd->entries -= bd->old_entries;
+			bd->old_entries = bd->entries;
+			memset(bd->set[bd->new], 0,
+			       sizeof(bd->set[0]));
+			bd->new = 1-bd->new;
+			bd->swap_time = seconds_since_boot();
+		}
+		spin_unlock(&blocked_delegations_lock);
+	}
+	hash = jhash(&fh->fh_base, fh->fh_size, 0);
+	if (test_bit(hash&255, bd->set[0]) &&
+	    test_bit((hash>>8)&255, bd->set[0]) &&
+	    test_bit((hash>>16)&255, bd->set[0]))
+		return 1;
+
+	if (test_bit(hash&255, bd->set[1]) &&
+	    test_bit((hash>>8)&255, bd->set[1]) &&
+	    test_bit((hash>>16)&255, bd->set[1]))
+		return 1;
+
+	return 0;
+}
+
+static void block_delegations(struct knfsd_fh *fh)
+{
+	u32 hash;
+	struct bloom_pair *bd = &blocked_delegations;
+
+	hash = jhash(&fh->fh_base, fh->fh_size, 0);
+
+	spin_lock(&blocked_delegations_lock);
+	__set_bit(hash&255, bd->set[bd->new]);
+	__set_bit((hash>>8)&255, bd->set[bd->new]);
+	__set_bit((hash>>16)&255, bd->set[bd->new]);
+	if (bd->entries == 0)
+		bd->swap_time = seconds_since_boot();
+	bd->entries += 1;
+	spin_unlock(&blocked_delegations_lock);
+}
+
+static struct nfs4_delegation *
+alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
+		 struct nfs4_clnt_odstate *odstate)
+{
+	struct nfs4_delegation *dp;
+	long n;
+
+	dprintk("NFSD alloc_init_deleg\n");
+	n = atomic_long_inc_return(&num_delegations);
+	if (n < 0 || n > max_delegations)
+		goto out_dec;
+	if (delegation_blocked(&current_fh->fh_handle))
+		goto out_dec;
+	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
+	if (dp == NULL)
+		goto out_dec;
+
+	dp->dl_stid.sc_free = nfs4_free_deleg;
+	/*
+	 * delegation seqid's are never incremented.  The 4.1 special
+	 * meaning of seqid 0 isn't meaningful, really, but let's avoid
+	 * 0 anyway just for consistency and use 1:
+	 */
+	dp->dl_stid.sc_stateid.si_generation = 1;
+	INIT_LIST_HEAD(&dp->dl_perfile);
+	INIT_LIST_HEAD(&dp->dl_perclnt);
+	INIT_LIST_HEAD(&dp->dl_recall_lru);
+	dp->dl_clnt_odstate = odstate;
+	get_clnt_odstate(odstate);
+	dp->dl_type = NFS4_OPEN_DELEGATE_READ;
+	dp->dl_retries = 1;
+	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
+		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+	return dp;
+out_dec:
+	atomic_long_dec(&num_delegations);
+	return NULL;
+}
+
+void
+nfs4_put_stid(struct nfs4_stid *s)
+{
+	struct nfs4_file *fp = s->sc_file;
+	struct nfs4_client *clp = s->sc_client;
+
+	might_lock(&clp->cl_lock);
+
+	if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) {
+		wake_up_all(&close_wq);
+		return;
+	}
+	idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+	spin_unlock(&clp->cl_lock);
+	s->sc_free(s);
+	if (fp)
+		put_nfs4_file(fp);
+}
+
+static void nfs4_put_deleg_lease(struct nfs4_file *fp)
+{
+	struct file *filp = NULL;
+
+	spin_lock(&fp->fi_lock);
+	if (fp->fi_deleg_file && --fp->fi_delegees == 0)
+		swap(filp, fp->fi_deleg_file);
+	spin_unlock(&fp->fi_lock);
+
+	if (filp) {
+		vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
+		fput(filp);
+	}
+}
+
+void nfs4_unhash_stid(struct nfs4_stid *s)
+{
+	s->sc_type = 0;
+}
+
+static void
+hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
+{
+	lockdep_assert_held(&state_lock);
+	lockdep_assert_held(&fp->fi_lock);
+
+	atomic_inc(&dp->dl_stid.sc_count);
+	dp->dl_stid.sc_type = NFS4_DELEG_STID;
+	list_add(&dp->dl_perfile, &fp->fi_delegations);
+	list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+}
+
+static void
+unhash_delegation_locked(struct nfs4_delegation *dp)
+{
+	struct nfs4_file *fp = dp->dl_stid.sc_file;
+
+	lockdep_assert_held(&state_lock);
+
+	dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
+	/* Ensure that deleg break won't try to requeue it */
+	++dp->dl_time;
+	spin_lock(&fp->fi_lock);
+	list_del_init(&dp->dl_perclnt);
+	list_del_init(&dp->dl_recall_lru);
+	list_del_init(&dp->dl_perfile);
+	spin_unlock(&fp->fi_lock);
+}
+
+static void destroy_delegation(struct nfs4_delegation *dp)
+{
+	spin_lock(&state_lock);
+	unhash_delegation_locked(dp);
+	spin_unlock(&state_lock);
+	put_clnt_odstate(dp->dl_clnt_odstate);
+	nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+	nfs4_put_stid(&dp->dl_stid);
+}
+
+static void revoke_delegation(struct nfs4_delegation *dp)
+{
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+	WARN_ON(!list_empty(&dp->dl_recall_lru));
+
+	put_clnt_odstate(dp->dl_clnt_odstate);
+	nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+
+	if (clp->cl_minorversion == 0)
+		nfs4_put_stid(&dp->dl_stid);
+	else {
+		dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+		spin_lock(&clp->cl_lock);
+		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+		spin_unlock(&clp->cl_lock);
+	}
+}
+
+/* 
+ * SETCLIENTID state 
+ */
+
+static unsigned int clientid_hashval(u32 id)
+{
+	return id & CLIENT_HASH_MASK;
+}
+
+static unsigned int clientstr_hashval(const char *name)
+{
+	return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
+}
+
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used.  This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempt to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ *	OPEN allow read, deny write
+ *	OPEN allow both, deny none
+ *	DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
+static unsigned int
+bmap_to_share_mode(unsigned long bmap) {
+	int i;
+	unsigned int access = 0;
+
+	for (i = 1; i < 4; i++) {
+		if (test_bit(i, &bmap))
+			access |= i;
+	}
+	return access;
+}
+
+/* set share access for a given stateid */
+static inline void
+set_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+	unsigned char mask = 1 << access;
+
+	WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+	stp->st_access_bmap |= mask;
+}
+
+/* clear share access for a given stateid */
+static inline void
+clear_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+	unsigned char mask = 1 << access;
+
+	WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+	stp->st_access_bmap &= ~mask;
+}
+
+/* test whether a given stateid has access */
+static inline bool
+test_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+	unsigned char mask = 1 << access;
+
+	return (bool)(stp->st_access_bmap & mask);
+}
+
+/* set share deny for a given stateid */
+static inline void
+set_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	unsigned char mask = 1 << deny;
+
+	WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+	stp->st_deny_bmap |= mask;
+}
+
+/* clear share deny for a given stateid */
+static inline void
+clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	unsigned char mask = 1 << deny;
+
+	WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+	stp->st_deny_bmap &= ~mask;
+}
+
+/* test whether a given stateid is denying specific access */
+static inline bool
+test_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	unsigned char mask = 1 << deny;
+
+	return (bool)(stp->st_deny_bmap & mask);
+}
+
+static int nfs4_access_to_omode(u32 access)
+{
+	switch (access & NFS4_SHARE_ACCESS_BOTH) {
+	case NFS4_SHARE_ACCESS_READ:
+		return O_RDONLY;
+	case NFS4_SHARE_ACCESS_WRITE:
+		return O_WRONLY;
+	case NFS4_SHARE_ACCESS_BOTH:
+		return O_RDWR;
+	}
+	WARN_ON_ONCE(1);
+	return O_RDONLY;
+}
+
+/*
+ * A stateid that had a deny mode associated with it is being released
+ * or downgraded. Recalculate the deny mode on the file.
+ */
+static void
+recalculate_deny_mode(struct nfs4_file *fp)
+{
+	struct nfs4_ol_stateid *stp;
+
+	spin_lock(&fp->fi_lock);
+	fp->fi_share_deny = 0;
+	list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
+		fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
+	spin_unlock(&fp->fi_lock);
+}
+
+static void
+reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	int i;
+	bool change = false;
+
+	for (i = 1; i < 4; i++) {
+		if ((i & deny) != i) {
+			change = true;
+			clear_deny(i, stp);
+		}
+	}
+
+	/* Recalculate per-file deny mode if there was a change */
+	if (change)
+		recalculate_deny_mode(stp->st_stid.sc_file);
+}
+
+/* release all access and file references for a given stateid */
+static void
+release_all_access(struct nfs4_ol_stateid *stp)
+{
+	int i;
+	struct nfs4_file *fp = stp->st_stid.sc_file;
+
+	if (fp && stp->st_deny_bmap != 0)
+		recalculate_deny_mode(fp);
+
+	for (i = 1; i < 4; i++) {
+		if (test_access(i, stp))
+			nfs4_file_put_access(stp->st_stid.sc_file, i);
+		clear_access(i, stp);
+	}
+}
+
+static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_client *clp = sop->so_client;
+
+	might_lock(&clp->cl_lock);
+
+	if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock))
+		return;
+	sop->so_ops->so_unhash(sop);
+	spin_unlock(&clp->cl_lock);
+	kfree(sop->so_owner.data);
+	sop->so_ops->so_free(sop);
+}
+
+static void unhash_ol_stateid(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_file *fp = stp->st_stid.sc_file;
+
+	lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock);
+
+	spin_lock(&fp->fi_lock);
+	list_del(&stp->st_perfile);
+	spin_unlock(&fp->fi_lock);
+	list_del(&stp->st_perstateowner);
+}
+
+static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
+{
+	struct nfs4_ol_stateid *stp = openlockstateid(stid);
+
+	put_clnt_odstate(stp->st_clnt_odstate);
+	release_all_access(stp);
+	if (stp->st_stateowner)
+		nfs4_put_stateowner(stp->st_stateowner);
+	kmem_cache_free(stateid_slab, stid);
+}
+
+static void nfs4_free_lock_stateid(struct nfs4_stid *stid)
+{
+	struct nfs4_ol_stateid *stp = openlockstateid(stid);
+	struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
+	struct file *file;
+
+	file = find_any_file(stp->st_stid.sc_file);
+	if (file)
+		filp_close(file, (fl_owner_t)lo);
+	nfs4_free_ol_stateid(stid);
+}
+
+/*
+ * Put the persistent reference to an already unhashed generic stateid, while
+ * holding the cl_lock. If it's the last reference, then put it onto the
+ * reaplist for later destruction.
+ */
+static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
+				       struct list_head *reaplist)
+{
+	struct nfs4_stid *s = &stp->st_stid;
+	struct nfs4_client *clp = s->sc_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	WARN_ON_ONCE(!list_empty(&stp->st_locks));
+
+	if (!atomic_dec_and_test(&s->sc_count)) {
+		wake_up_all(&close_wq);
+		return;
+	}
+
+	idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+	list_add(&stp->st_locks, reaplist);
+}
+
+static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
+
+	lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
+
+	list_del_init(&stp->st_locks);
+	unhash_ol_stateid(stp);
+	nfs4_unhash_stid(&stp->st_stid);
+}
+
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
+
+	spin_lock(&oo->oo_owner.so_client->cl_lock);
+	unhash_lock_stateid(stp);
+	spin_unlock(&oo->oo_owner.so_client->cl_lock);
+	nfs4_put_stid(&stp->st_stid);
+}
+
+static void unhash_lockowner_locked(struct nfs4_lockowner *lo)
+{
+	struct nfs4_client *clp = lo->lo_owner.so_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_del_init(&lo->lo_owner.so_strhash);
+}
+
+/*
+ * Free a list of generic stateids that were collected earlier after being
+ * fully unhashed.
+ */
+static void
+free_ol_stateid_reaplist(struct list_head *reaplist)
+{
+	struct nfs4_ol_stateid *stp;
+	struct nfs4_file *fp;
+
+	might_sleep();
+
+	while (!list_empty(reaplist)) {
+		stp = list_first_entry(reaplist, struct nfs4_ol_stateid,
+				       st_locks);
+		list_del(&stp->st_locks);
+		fp = stp->st_stid.sc_file;
+		stp->st_stid.sc_free(&stp->st_stid);
+		if (fp)
+			put_nfs4_file(fp);
+	}
+}
+
+static void release_lockowner(struct nfs4_lockowner *lo)
+{
+	struct nfs4_client *clp = lo->lo_owner.so_client;
+	struct nfs4_ol_stateid *stp;
+	struct list_head reaplist;
+
+	INIT_LIST_HEAD(&reaplist);
+
+	spin_lock(&clp->cl_lock);
+	unhash_lockowner_locked(lo);
+	while (!list_empty(&lo->lo_owner.so_stateids)) {
+		stp = list_first_entry(&lo->lo_owner.so_stateids,
+				struct nfs4_ol_stateid, st_perstateowner);
+		unhash_lock_stateid(stp);
+		put_ol_stateid_locked(stp, &reaplist);
+	}
+	spin_unlock(&clp->cl_lock);
+	free_ol_stateid_reaplist(&reaplist);
+	nfs4_put_stateowner(&lo->lo_owner);
+}
+
+static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
+				       struct list_head *reaplist)
+{
+	struct nfs4_ol_stateid *stp;
+
+	while (!list_empty(&open_stp->st_locks)) {
+		stp = list_entry(open_stp->st_locks.next,
+				struct nfs4_ol_stateid, st_locks);
+		unhash_lock_stateid(stp);
+		put_ol_stateid_locked(stp, reaplist);
+	}
+}
+
+static void unhash_open_stateid(struct nfs4_ol_stateid *stp,
+				struct list_head *reaplist)
+{
+	lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
+
+	unhash_ol_stateid(stp);
+	release_open_stateid_locks(stp, reaplist);
+}
+
+static void release_open_stateid(struct nfs4_ol_stateid *stp)
+{
+	LIST_HEAD(reaplist);
+
+	spin_lock(&stp->st_stid.sc_client->cl_lock);
+	unhash_open_stateid(stp, &reaplist);
+	put_ol_stateid_locked(stp, &reaplist);
+	spin_unlock(&stp->st_stid.sc_client->cl_lock);
+	free_ol_stateid_reaplist(&reaplist);
+}
+
+static void unhash_openowner_locked(struct nfs4_openowner *oo)
+{
+	struct nfs4_client *clp = oo->oo_owner.so_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_del_init(&oo->oo_owner.so_strhash);
+	list_del_init(&oo->oo_perclient);
+}
+
+static void release_last_closed_stateid(struct nfs4_openowner *oo)
+{
+	struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net,
+					  nfsd_net_id);
+	struct nfs4_ol_stateid *s;
+
+	spin_lock(&nn->client_lock);
+	s = oo->oo_last_closed_stid;
+	if (s) {
+		list_del_init(&oo->oo_close_lru);
+		oo->oo_last_closed_stid = NULL;
+	}
+	spin_unlock(&nn->client_lock);
+	if (s)
+		nfs4_put_stid(&s->st_stid);
+}
+
+static void release_openowner(struct nfs4_openowner *oo)
+{
+	struct nfs4_ol_stateid *stp;
+	struct nfs4_client *clp = oo->oo_owner.so_client;
+	struct list_head reaplist;
+
+	INIT_LIST_HEAD(&reaplist);
+
+	spin_lock(&clp->cl_lock);
+	unhash_openowner_locked(oo);
+	while (!list_empty(&oo->oo_owner.so_stateids)) {
+		stp = list_first_entry(&oo->oo_owner.so_stateids,
+				struct nfs4_ol_stateid, st_perstateowner);
+		unhash_open_stateid(stp, &reaplist);
+		put_ol_stateid_locked(stp, &reaplist);
+	}
+	spin_unlock(&clp->cl_lock);
+	free_ol_stateid_reaplist(&reaplist);
+	release_last_closed_stateid(oo);
+	nfs4_put_stateowner(&oo->oo_owner);
+}
+
+static inline int
+hash_sessionid(struct nfs4_sessionid *sessionid)
+{
+	struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
+
+	return sid->sequence % SESSION_HASH_SIZE;
+}
+
+#ifdef CONFIG_SUNRPC_DEBUG
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+	u32 *ptr = (u32 *)(&sessionid->data[0]);
+	dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
+}
+#else
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+}
+#endif
+
+/*
+ * Bump the seqid on cstate->replay_owner, and clear replay_owner if it
+ * won't be used for replay.
+ */
+void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
+{
+	struct nfs4_stateowner *so = cstate->replay_owner;
+
+	if (nfserr == nfserr_replay_me)
+		return;
+
+	if (!seqid_mutating_err(ntohl(nfserr))) {
+		nfsd4_cstate_clear_replay(cstate);
+		return;
+	}
+	if (!so)
+		return;
+	if (so->so_is_open_owner)
+		release_last_closed_stateid(openowner(so));
+	so->so_seqid++;
+	return;
+}
+
+static void
+gen_sessionid(struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd4_sessionid *sid;
+
+	sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
+	sid->clientid = clp->cl_clientid;
+	sid->sequence = current_sessionid++;
+	sid->reserved = 0;
+}
+
+/*
+ * The protocol defines ca_maxresponssize_cached to include the size of
+ * the rpc header, but all we need to cache is the data starting after
+ * the end of the initial SEQUENCE operation--the rest we regenerate
+ * each time.  Therefore we can advertise a ca_maxresponssize_cached
+ * value that is the number of bytes in our cache plus a few additional
+ * bytes.  In order to stay on the safe side, and not promise more than
+ * we can cache, those additional bytes must be the minimum possible: 24
+ * bytes of rpc header (xid through accept state, with AUTH_NULL
+ * verifier), 12 for the compound header (with zero-length tag), and 44
+ * for the SEQUENCE op response:
+ */
+#define NFSD_MIN_HDR_SEQ_SZ  (24 + 12 + 44)
+
+static void
+free_session_slots(struct nfsd4_session *ses)
+{
+	int i;
+
+	for (i = 0; i < ses->se_fchannel.maxreqs; i++)
+		kfree(ses->se_slots[i]);
+}
+
+/*
+ * We don't actually need to cache the rpc and session headers, so we
+ * can allocate a little less for each slot:
+ */
+static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca)
+{
+	u32 size;
+
+	if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ)
+		size = 0;
+	else
+		size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+	return size + sizeof(struct nfsd4_slot);
+}
+
+/*
+ * XXX: If we run out of reserved DRC memory we could (up to a point)
+ * re-negotiate active sessions and reduce their slot usage to make
+ * room for new connections. For now we just fail the create session.
+ */
+static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
+{
+	u32 slotsize = slot_bytes(ca);
+	u32 num = ca->maxreqs;
+	int avail;
+
+	spin_lock(&nfsd_drc_lock);
+	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
+		    nfsd_drc_max_mem - nfsd_drc_mem_used);
+	num = min_t(int, num, avail / slotsize);
+	nfsd_drc_mem_used += num * slotsize;
+	spin_unlock(&nfsd_drc_lock);
+
+	return num;
+}
+
+static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca)
+{
+	int slotsize = slot_bytes(ca);
+
+	spin_lock(&nfsd_drc_lock);
+	nfsd_drc_mem_used -= slotsize * ca->maxreqs;
+	spin_unlock(&nfsd_drc_lock);
+}
+
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
+					   struct nfsd4_channel_attrs *battrs)
+{
+	int numslots = fattrs->maxreqs;
+	int slotsize = slot_bytes(fattrs);
+	struct nfsd4_session *new;
+	int mem, i;
+
+	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
+			+ sizeof(struct nfsd4_session) > PAGE_SIZE);
+	mem = numslots * sizeof(struct nfsd4_slot *);
+
+	new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
+	if (!new)
+		return NULL;
+	/* allocate each struct nfsd4_slot and data cache in one piece */
+	for (i = 0; i < numslots; i++) {
+		new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL);
+		if (!new->se_slots[i])
+			goto out_free;
+	}
+
+	memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
+	memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs));
+
+	return new;
+out_free:
+	while (i--)
+		kfree(new->se_slots[i]);
+	kfree(new);
+	return NULL;
+}
+
+static void free_conn(struct nfsd4_conn *c)
+{
+	svc_xprt_put(c->cn_xprt);
+	kfree(c);
+}
+
+static void nfsd4_conn_lost(struct svc_xpt_user *u)
+{
+	struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
+	struct nfs4_client *clp = c->cn_session->se_client;
+
+	spin_lock(&clp->cl_lock);
+	if (!list_empty(&c->cn_persession)) {
+		list_del(&c->cn_persession);
+		free_conn(c);
+	}
+	nfsd4_probe_callback(clp);
+	spin_unlock(&clp->cl_lock);
+}
+
+static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
+{
+	struct nfsd4_conn *conn;
+
+	conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
+	if (!conn)
+		return NULL;
+	svc_xprt_get(rqstp->rq_xprt);
+	conn->cn_xprt = rqstp->rq_xprt;
+	conn->cn_flags = flags;
+	INIT_LIST_HEAD(&conn->cn_xpt_user.list);
+	return conn;
+}
+
+static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	conn->cn_session = ses;
+	list_add(&conn->cn_persession, &ses->se_conns);
+}
+
+static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+
+	spin_lock(&clp->cl_lock);
+	__nfsd4_hash_conn(conn, ses);
+	spin_unlock(&clp->cl_lock);
+}
+
+static int nfsd4_register_conn(struct nfsd4_conn *conn)
+{
+	conn->cn_xpt_user.callback = nfsd4_conn_lost;
+	return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
+}
+
+static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	int ret;
+
+	nfsd4_hash_conn(conn, ses);
+	ret = nfsd4_register_conn(conn);
+	if (ret)
+		/* oops; xprt is already down: */
+		nfsd4_conn_lost(&conn->cn_xpt_user);
+	/* We may have gained or lost a callback channel: */
+	nfsd4_probe_callback_sync(ses->se_client);
+}
+
+static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses)
+{
+	u32 dir = NFS4_CDFC4_FORE;
+
+	if (cses->flags & SESSION4_BACK_CHAN)
+		dir |= NFS4_CDFC4_BACK;
+	return alloc_conn(rqstp, dir);
+}
+
+/* must be called under client_lock */
+static void nfsd4_del_conns(struct nfsd4_session *s)
+{
+	struct nfs4_client *clp = s->se_client;
+	struct nfsd4_conn *c;
+
+	spin_lock(&clp->cl_lock);
+	while (!list_empty(&s->se_conns)) {
+		c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
+		list_del_init(&c->cn_persession);
+		spin_unlock(&clp->cl_lock);
+
+		unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
+		free_conn(c);
+
+		spin_lock(&clp->cl_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+}
+
+static void __free_session(struct nfsd4_session *ses)
+{
+	free_session_slots(ses);
+	kfree(ses);
+}
+
+static void free_session(struct nfsd4_session *ses)
+{
+	nfsd4_del_conns(ses);
+	nfsd4_put_drc_mem(&ses->se_fchannel);
+	__free_session(ses);
+}
+
+static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+{
+	int idx;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	new->se_client = clp;
+	gen_sessionid(new);
+
+	INIT_LIST_HEAD(&new->se_conns);
+
+	new->se_cb_seq_nr = 1;
+	new->se_flags = cses->flags;
+	new->se_cb_prog = cses->callback_prog;
+	new->se_cb_sec = cses->cb_sec;
+	atomic_set(&new->se_ref, 0);
+	idx = hash_sessionid(&new->se_sessionid);
+	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
+	spin_lock(&clp->cl_lock);
+	list_add(&new->se_perclnt, &clp->cl_sessions);
+	spin_unlock(&clp->cl_lock);
+
+	{
+		struct sockaddr *sa = svc_addr(rqstp);
+		/*
+		 * This is a little silly; with sessions there's no real
+		 * use for the callback address.  Use the peer address
+		 * as a reasonable default for now, but consider fixing
+		 * the rpc client not to require an address in the
+		 * future:
+		 */
+		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
+		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+	}
+}
+
+/* caller must hold client_lock */
+static struct nfsd4_session *
+__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
+{
+	struct nfsd4_session *elem;
+	int idx;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	dump_sessionid(__func__, sessionid);
+	idx = hash_sessionid(sessionid);
+	/* Search in the appropriate list */
+	list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) {
+		if (!memcmp(elem->se_sessionid.data, sessionid->data,
+			    NFS4_MAX_SESSIONID_LEN)) {
+			return elem;
+		}
+	}
+
+	dprintk("%s: session not found\n", __func__);
+	return NULL;
+}
+
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net,
+		__be32 *ret)
+{
+	struct nfsd4_session *session;
+	__be32 status = nfserr_badsession;
+
+	session = __find_in_sessionid_hashtbl(sessionid, net);
+	if (!session)
+		goto out;
+	status = nfsd4_get_session_locked(session);
+	if (status)
+		session = NULL;
+out:
+	*ret = status;
+	return session;
+}
+
+/* caller must hold client_lock */
+static void
+unhash_session(struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	list_del(&ses->se_hash);
+	spin_lock(&ses->se_client->cl_lock);
+	list_del(&ses->se_perclnt);
+	spin_unlock(&ses->se_client->cl_lock);
+}
+
+/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
+static int
+STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
+{
+	/*
+	 * We're assuming the clid was not given out from a boot
+	 * precisely 2^32 (about 136 years) before this one.  That seems
+	 * a safe assumption:
+	 */
+	if (clid->cl_boot == (u32)nn->boot_time)
+		return 0;
+	dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
+		clid->cl_boot, clid->cl_id, nn->boot_time);
+	return 1;
+}
+
+/* 
+ * XXX Should we use a slab cache ?
+ * This type of memory management is somewhat inefficient, but we use it
+ * anyway since SETCLIENTID is not a common operation.
+ */
+static struct nfs4_client *alloc_client(struct xdr_netobj name)
+{
+	struct nfs4_client *clp;
+	int i;
+
+	clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
+	if (clp == NULL)
+		return NULL;
+	clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
+	if (clp->cl_name.data == NULL)
+		goto err_no_name;
+	clp->cl_ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
+			OWNER_HASH_SIZE, GFP_KERNEL);
+	if (!clp->cl_ownerstr_hashtbl)
+		goto err_no_hashtbl;
+	for (i = 0; i < OWNER_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]);
+	clp->cl_name.len = name.len;
+	INIT_LIST_HEAD(&clp->cl_sessions);
+	idr_init(&clp->cl_stateids);
+	atomic_set(&clp->cl_refcount, 0);
+	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+	INIT_LIST_HEAD(&clp->cl_idhash);
+	INIT_LIST_HEAD(&clp->cl_openowners);
+	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_lru);
+	INIT_LIST_HEAD(&clp->cl_revoked);
+#ifdef CONFIG_NFSD_PNFS
+	INIT_LIST_HEAD(&clp->cl_lo_states);
+#endif
+	spin_lock_init(&clp->cl_lock);
+	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
+	return clp;
+err_no_hashtbl:
+	kfree(clp->cl_name.data);
+err_no_name:
+	kfree(clp);
+	return NULL;
+}
+
+static void
+free_client(struct nfs4_client *clp)
+{
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				se_perclnt);
+		list_del(&ses->se_perclnt);
+		WARN_ON_ONCE(atomic_read(&ses->se_ref));
+		free_session(ses);
+	}
+	rpc_destroy_wait_queue(&clp->cl_cb_waitq);
+	free_svc_cred(&clp->cl_cred);
+	kfree(clp->cl_ownerstr_hashtbl);
+	kfree(clp->cl_name.data);
+	idr_destroy(&clp->cl_stateids);
+	kfree(clp);
+}
+
+/* must be called under the client_lock */
+static void
+unhash_client_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct nfsd4_session *ses;
+
+	lockdep_assert_held(&nn->client_lock);
+
+	/* Mark the client as expired! */
+	clp->cl_time = 0;
+	/* Make it invisible */
+	if (!list_empty(&clp->cl_idhash)) {
+		list_del_init(&clp->cl_idhash);
+		if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+			rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
+		else
+			rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+	}
+	list_del_init(&clp->cl_lru);
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
+		list_del_init(&ses->se_hash);
+	spin_unlock(&clp->cl_lock);
+}
+
+static void
+unhash_client(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
+	unhash_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+}
+
+static __be32 mark_client_expired_locked(struct nfs4_client *clp)
+{
+	if (atomic_read(&clp->cl_refcount))
+		return nfserr_jukebox;
+	unhash_client_locked(clp);
+	return nfs_ok;
+}
+
+static void
+__destroy_client(struct nfs4_client *clp)
+{
+	struct nfs4_openowner *oo;
+	struct nfs4_delegation *dp;
+	struct list_head reaplist;
+
+	INIT_LIST_HEAD(&reaplist);
+	spin_lock(&state_lock);
+	while (!list_empty(&clp->cl_delegations)) {
+		dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
+		unhash_delegation_locked(dp);
+		list_add(&dp->dl_recall_lru, &reaplist);
+	}
+	spin_unlock(&state_lock);
+	while (!list_empty(&reaplist)) {
+		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
+		list_del_init(&dp->dl_recall_lru);
+		put_clnt_odstate(dp->dl_clnt_odstate);
+		nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+		nfs4_put_stid(&dp->dl_stid);
+	}
+	while (!list_empty(&clp->cl_revoked)) {
+		dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru);
+		list_del_init(&dp->dl_recall_lru);
+		nfs4_put_stid(&dp->dl_stid);
+	}
+	while (!list_empty(&clp->cl_openowners)) {
+		oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
+		nfs4_get_stateowner(&oo->oo_owner);
+		release_openowner(oo);
+	}
+	nfsd4_return_all_client_layouts(clp);
+	nfsd4_shutdown_callback(clp);
+	if (clp->cl_cb_conn.cb_xprt)
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+	free_client(clp);
+}
+
+static void
+destroy_client(struct nfs4_client *clp)
+{
+	unhash_client(clp);
+	__destroy_client(clp);
+}
+
+static void expire_client(struct nfs4_client *clp)
+{
+	unhash_client(clp);
+	nfsd4_client_record_remove(clp);
+	__destroy_client(clp);
+}
+
+static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
+{
+	memcpy(target->cl_verifier.data, source->data,
+			sizeof(target->cl_verifier.data));
+}
+
+static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
+{
+	target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; 
+	target->cl_clientid.cl_id = source->cl_clientid.cl_id; 
+}
+
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
+	if (source->cr_principal) {
+		target->cr_principal =
+				kstrdup(source->cr_principal, GFP_KERNEL);
+		if (target->cr_principal == NULL)
+			return -ENOMEM;
+	} else
+		target->cr_principal = NULL;
+	target->cr_flavor = source->cr_flavor;
+	target->cr_uid = source->cr_uid;
+	target->cr_gid = source->cr_gid;
+	target->cr_group_info = source->cr_group_info;
+	get_group_info(target->cr_group_info);
+	target->cr_gss_mech = source->cr_gss_mech;
+	if (source->cr_gss_mech)
+		gss_mech_get(source->cr_gss_mech);
+	return 0;
+}
+
+static int
+compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
+{
+	if (o1->len < o2->len)
+		return -1;
+	if (o1->len > o2->len)
+		return 1;
+	return memcmp(o1->data, o2->data, o1->len);
+}
+
+static int same_name(const char *n1, const char *n2)
+{
+	return 0 == memcmp(n1, n2, HEXDIR_LEN);
+}
+
+static int
+same_verf(nfs4_verifier *v1, nfs4_verifier *v2)
+{
+	return 0 == memcmp(v1->data, v2->data, sizeof(v1->data));
+}
+
+static int
+same_clid(clientid_t *cl1, clientid_t *cl2)
+{
+	return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id);
+}
+
+static bool groups_equal(struct group_info *g1, struct group_info *g2)
+{
+	int i;
+
+	if (g1->ngroups != g2->ngroups)
+		return false;
+	for (i=0; i<g1->ngroups; i++)
+		if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i)))
+			return false;
+	return true;
+}
+
+/*
+ * RFC 3530 language requires clid_inuse be returned when the
+ * "principal" associated with a requests differs from that previously
+ * used.  We use uid, gid's, and gss principal string as our best
+ * approximation.  We also don't want to allow non-gss use of a client
+ * established using gss: in theory cr_principal should catch that
+ * change, but in practice cr_principal can be null even in the gss case
+ * since gssd doesn't always pass down a principal string.
+ */
+static bool is_gss_cred(struct svc_cred *cr)
+{
+	/* Is cr_flavor one of the gss "pseudoflavors"?: */
+	return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR);
+}
+
+
+static bool
+same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
+{
+	if ((is_gss_cred(cr1) != is_gss_cred(cr2))
+		|| (!uid_eq(cr1->cr_uid, cr2->cr_uid))
+		|| (!gid_eq(cr1->cr_gid, cr2->cr_gid))
+		|| !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
+		return false;
+	if (cr1->cr_principal == cr2->cr_principal)
+		return true;
+	if (!cr1->cr_principal || !cr2->cr_principal)
+		return false;
+	return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
+}
+
+static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
+{
+	struct svc_cred *cr = &rqstp->rq_cred;
+	u32 service;
+
+	if (!cr->cr_gss_mech)
+		return false;
+	service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
+	return service == RPC_GSS_SVC_INTEGRITY ||
+	       service == RPC_GSS_SVC_PRIVACY;
+}
+
+static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
+{
+	struct svc_cred *cr = &rqstp->rq_cred;
+
+	if (!cl->cl_mach_cred)
+		return true;
+	if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
+		return false;
+	if (!svc_rqst_integrity_protected(rqstp))
+		return false;
+	if (!cr->cr_principal)
+		return false;
+	return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
+}
+
+static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn)
+{
+	__be32 verf[2];
+
+	/*
+	 * This is opaque to client, so no need to byte-swap. Use
+	 * __force to keep sparse happy
+	 */
+	verf[0] = (__force __be32)get_seconds();
+	verf[1] = (__force __be32)nn->clientid_counter;
+	memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
+}
+
+static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
+{
+	clp->cl_clientid.cl_boot = nn->boot_time;
+	clp->cl_clientid.cl_id = nn->clientid_counter++;
+	gen_confirm(clp, nn);
+}
+
+static struct nfs4_stid *
+find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
+{
+	struct nfs4_stid *ret;
+
+	ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+	if (!ret || !ret->sc_type)
+		return NULL;
+	return ret;
+}
+
+static struct nfs4_stid *
+find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+{
+	struct nfs4_stid *s;
+
+	spin_lock(&cl->cl_lock);
+	s = find_stateid_locked(cl, t);
+	if (s != NULL) {
+		if (typemask & s->sc_type)
+			atomic_inc(&s->sc_count);
+		else
+			s = NULL;
+	}
+	spin_unlock(&cl->cl_lock);
+	return s;
+}
+
+static struct nfs4_client *create_client(struct xdr_netobj name,
+		struct svc_rqst *rqstp, nfs4_verifier *verf)
+{
+	struct nfs4_client *clp;
+	struct sockaddr *sa = svc_addr(rqstp);
+	int ret;
+	struct net *net = SVC_NET(rqstp);
+
+	clp = alloc_client(name);
+	if (clp == NULL)
+		return NULL;
+
+	ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
+	if (ret) {
+		free_client(clp);
+		return NULL;
+	}
+	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
+	clp->cl_time = get_seconds();
+	clear_bit(0, &clp->cl_cb_slot_busy);
+	copy_verf(clp, verf);
+	rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
+	clp->cl_cb_session = NULL;
+	clp->net = net;
+	return clp;
+}
+
+static void
+add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct nfs4_client *clp;
+
+	while (*new) {
+		clp = rb_entry(*new, struct nfs4_client, cl_namenode);
+		parent = *new;
+
+		if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&new_clp->cl_namenode, parent, new);
+	rb_insert_color(&new_clp->cl_namenode, root);
+}
+
+static struct nfs4_client *
+find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
+{
+	int cmp;
+	struct rb_node *node = root->rb_node;
+	struct nfs4_client *clp;
+
+	while (node) {
+		clp = rb_entry(node, struct nfs4_client, cl_namenode);
+		cmp = compare_blob(&clp->cl_name, name);
+		if (cmp > 0)
+			node = node->rb_left;
+		else if (cmp < 0)
+			node = node->rb_right;
+		else
+			return clp;
+	}
+	return NULL;
+}
+
+static void
+add_to_unconfirmed(struct nfs4_client *clp)
+{
+	unsigned int idhashval;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+	add_clp_to_name_tree(clp, &nn->unconf_name_tree);
+	idhashval = clientid_hashval(clp->cl_clientid.cl_id);
+	list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
+	renew_client_locked(clp);
+}
+
+static void
+move_to_confirmed(struct nfs4_client *clp)
+{
+	unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
+	list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
+	rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+	add_clp_to_name_tree(clp, &nn->conf_name_tree);
+	set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+	renew_client_locked(clp);
+}
+
+static struct nfs4_client *
+find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
+{
+	struct nfs4_client *clp;
+	unsigned int idhashval = clientid_hashval(clid->cl_id);
+
+	list_for_each_entry(clp, &tbl[idhashval], cl_idhash) {
+		if (same_clid(&clp->cl_clientid, clid)) {
+			if ((bool)clp->cl_minorversion != sessions)
+				return NULL;
+			renew_client_locked(clp);
+			return clp;
+		}
+	}
+	return NULL;
+}
+
+static struct nfs4_client *
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+{
+	struct list_head *tbl = nn->conf_id_hashtbl;
+
+	lockdep_assert_held(&nn->client_lock);
+	return find_client_in_id_table(tbl, clid, sessions);
+}
+
+static struct nfs4_client *
+find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+{
+	struct list_head *tbl = nn->unconf_id_hashtbl;
+
+	lockdep_assert_held(&nn->client_lock);
+	return find_client_in_id_table(tbl, clid, sessions);
+}
+
+static bool clp_used_exchangeid(struct nfs4_client *clp)
+{
+	return clp->cl_exchange_flags != 0;
+} 
+
+static struct nfs4_client *
+find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
+{
+	lockdep_assert_held(&nn->client_lock);
+	return find_clp_in_name_tree(name, &nn->conf_name_tree);
+}
+
+static struct nfs4_client *
+find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
+{
+	lockdep_assert_held(&nn->client_lock);
+	return find_clp_in_name_tree(name, &nn->unconf_name_tree);
+}
+
+static void
+gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
+{
+	struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
+	struct sockaddr	*sa = svc_addr(rqstp);
+	u32 scopeid = rpc_get_scope_id(sa);
+	unsigned short expected_family;
+
+	/* Currently, we only support tcp and tcp6 for the callback channel */
+	if (se->se_callback_netid_len == 3 &&
+	    !memcmp(se->se_callback_netid_val, "tcp", 3))
+		expected_family = AF_INET;
+	else if (se->se_callback_netid_len == 4 &&
+		 !memcmp(se->se_callback_netid_val, "tcp6", 4))
+		expected_family = AF_INET6;
+	else
+		goto out_err;
+
+	conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val,
+					    se->se_callback_addr_len,
+					    (struct sockaddr *)&conn->cb_addr,
+					    sizeof(conn->cb_addr));
+
+	if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
+		goto out_err;
+
+	if (conn->cb_addr.ss_family == AF_INET6)
+		((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
+
+	conn->cb_prog = se->se_callback_prog;
+	conn->cb_ident = se->se_callback_ident;
+	memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen);
+	return;
+out_err:
+	conn->cb_addr.ss_family = AF_UNSPEC;
+	conn->cb_addrlen = 0;
+	dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
+		"will not receive delegations\n",
+		clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
+
+	return;
+}
+
+/*
+ * Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
+ */
+static void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+{
+	struct xdr_buf *buf = resp->xdr.buf;
+	struct nfsd4_slot *slot = resp->cstate.slot;
+	unsigned int base;
+
+	dprintk("--> %s slot %p\n", __func__, slot);
+
+	slot->sl_opcnt = resp->opcnt;
+	slot->sl_status = resp->cstate.status;
+
+	slot->sl_flags |= NFSD4_SLOT_INITIALIZED;
+	if (nfsd4_not_cached(resp)) {
+		slot->sl_datalen = 0;
+		return;
+	}
+	base = resp->cstate.data_offset;
+	slot->sl_datalen = buf->len - base;
+	if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
+		WARN("%s: sessions DRC could not cache compound\n", __func__);
+	return;
+}
+
+/*
+ * Encode the replay sequence operation from the slot values.
+ * If cachethis is FALSE encode the uncached rep error on the next
+ * operation which sets resp->p and increments resp->opcnt for
+ * nfs4svc_encode_compoundres.
+ *
+ */
+static __be32
+nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
+			  struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_op *op;
+	struct nfsd4_slot *slot = resp->cstate.slot;
+
+	/* Encode the replayed sequence operation */
+	op = &args->ops[resp->opcnt - 1];
+	nfsd4_encode_operation(resp, op);
+
+	/* Return nfserr_retry_uncached_rep in next operation. */
+	if (args->opcnt > 1 && !(slot->sl_flags & NFSD4_SLOT_CACHETHIS)) {
+		op = &args->ops[resp->opcnt++];
+		op->status = nfserr_retry_uncached_rep;
+		nfsd4_encode_operation(resp, op);
+	}
+	return op->status;
+}
+
+/*
+ * The sequence operation is not cached because we can use the slot and
+ * session values.
+ */
+static __be32
+nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+			 struct nfsd4_sequence *seq)
+{
+	struct nfsd4_slot *slot = resp->cstate.slot;
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+	__be32 status;
+
+	dprintk("--> %s slot %p\n", __func__, slot);
+
+	status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
+	if (status)
+		return status;
+
+	p = xdr_reserve_space(xdr, slot->sl_datalen);
+	if (!p) {
+		WARN_ON_ONCE(1);
+		return nfserr_serverfault;
+	}
+	xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen);
+	xdr_commit_encode(xdr);
+
+	resp->opcnt = slot->sl_opcnt;
+	return slot->sl_status;
+}
+
+/*
+ * Set the exchange_id flags returned by the server.
+ */
+static void
+nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+{
+#ifdef CONFIG_NFSD_PNFS
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
+#else
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+#endif
+
+	/* Referrals are supported, Migration is not. */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+
+	/* set the wire flags to return to client. */
+	clid->flags = new->cl_exchange_flags;
+}
+
+static bool client_has_state(struct nfs4_client *clp)
+{
+	/*
+	 * Note clp->cl_openowners check isn't quite right: there's no
+	 * need to count owners without stateid's.
+	 *
+	 * Also note we should probably be using this in 4.0 case too.
+	 */
+	return !list_empty(&clp->cl_openowners)
+		|| !list_empty(&clp->cl_delegations)
+		|| !list_empty(&clp->cl_sessions);
+}
+
+__be32
+nfsd4_exchange_id(struct svc_rqst *rqstp,
+		  struct nfsd4_compound_state *cstate,
+		  struct nfsd4_exchange_id *exid)
+{
+	struct nfs4_client *conf, *new;
+	struct nfs4_client *unconf = NULL;
+	__be32 status;
+	char			addr_str[INET6_ADDRSTRLEN];
+	nfs4_verifier		verf = exid->verifier;
+	struct sockaddr		*sa = svc_addr(rqstp);
+	bool	update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
+	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	rpc_ntop(sa, addr_str, sizeof(addr_str));
+	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+		"ip_addr=%s flags %x, spa_how %d\n",
+		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
+		addr_str, exid->flags, exid->spa_how);
+
+	if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
+		return nfserr_inval;
+
+	switch (exid->spa_how) {
+	case SP4_MACH_CRED:
+		if (!svc_rqst_integrity_protected(rqstp))
+			return nfserr_inval;
+	case SP4_NONE:
+		break;
+	default:				/* checked by xdr code */
+		WARN_ON_ONCE(1);
+	case SP4_SSV:
+		return nfserr_encr_alg_unsupp;
+	}
+
+	new = create_client(exid->clname, rqstp, &verf);
+	if (new == NULL)
+		return nfserr_jukebox;
+
+	/* Cases below refer to rfc 5661 section 18.35.4: */
+	spin_lock(&nn->client_lock);
+	conf = find_confirmed_client_by_name(&exid->clname, nn);
+	if (conf) {
+		bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
+		bool verfs_match = same_verf(&verf, &conf->cl_verifier);
+
+		if (update) {
+			if (!clp_used_exchangeid(conf)) { /* buggy client */
+				status = nfserr_inval;
+				goto out;
+			}
+			if (!mach_creds_match(conf, rqstp)) {
+				status = nfserr_wrong_cred;
+				goto out;
+			}
+			if (!creds_match) { /* case 9 */
+				status = nfserr_perm;
+				goto out;
+			}
+			if (!verfs_match) { /* case 8 */
+				status = nfserr_not_same;
+				goto out;
+			}
+			/* case 6 */
+			exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+			goto out_copy;
+		}
+		if (!creds_match) { /* case 3 */
+			if (client_has_state(conf)) {
+				status = nfserr_clid_inuse;
+				goto out;
+			}
+			goto out_new;
+		}
+		if (verfs_match) { /* case 2 */
+			conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+			goto out_copy;
+		}
+		/* case 5, client reboot */
+		conf = NULL;
+		goto out_new;
+	}
+
+	if (update) { /* case 7 */
+		status = nfserr_noent;
+		goto out;
+	}
+
+	unconf  = find_unconfirmed_client_by_name(&exid->clname, nn);
+	if (unconf) /* case 4, possible retry or client restart */
+		unhash_client_locked(unconf);
+
+	/* case 1 (normal case) */
+out_new:
+	if (conf) {
+		status = mark_client_expired_locked(conf);
+		if (status)
+			goto out;
+	}
+	new->cl_minorversion = cstate->minorversion;
+	new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
+
+	gen_clid(new, nn);
+	add_to_unconfirmed(new);
+	swap(new, conf);
+out_copy:
+	exid->clientid.cl_boot = conf->cl_clientid.cl_boot;
+	exid->clientid.cl_id = conf->cl_clientid.cl_id;
+
+	exid->seqid = conf->cl_cs_slot.sl_seqid + 1;
+	nfsd4_set_ex_flags(conf, exid);
+
+	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
+		conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags);
+	status = nfs_ok;
+
+out:
+	spin_unlock(&nn->client_lock);
+	if (new)
+		expire_client(new);
+	if (unconf)
+		expire_client(unconf);
+	return status;
+}
+
+static __be32
+check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
+{
+	dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
+		slot_seqid);
+
+	/* The slot is in use, and no response has been sent. */
+	if (slot_inuse) {
+		if (seqid == slot_seqid)
+			return nfserr_jukebox;
+		else
+			return nfserr_seq_misordered;
+	}
+	/* Note unsigned 32-bit arithmetic handles wraparound: */
+	if (likely(seqid == slot_seqid + 1))
+		return nfs_ok;
+	if (seqid == slot_seqid)
+		return nfserr_replay_cache;
+	return nfserr_seq_misordered;
+}
+
+/*
+ * Cache the create session result into the create session single DRC
+ * slot cache by saving the xdr structure. sl_seqid has been set.
+ * Do this for solo or embedded create session operations.
+ */
+static void
+nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
+			   struct nfsd4_clid_slot *slot, __be32 nfserr)
+{
+	slot->sl_status = nfserr;
+	memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
+}
+
+static __be32
+nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
+			    struct nfsd4_clid_slot *slot)
+{
+	memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses));
+	return slot->sl_status;
+}
+
+#define NFSD_MIN_REQ_HDR_SEQ_SZ	((\
+			2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \
+			1 +	/* MIN tag is length with zero, only length */ \
+			3 +	/* version, opcount, opcode */ \
+			XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				/* seqid, slotID, slotID, cache */ \
+			4 ) * sizeof(__be32))
+
+#define NFSD_MIN_RESP_HDR_SEQ_SZ ((\
+			2 +	/* verifier: AUTH_NULL, length 0 */\
+			1 +	/* status */ \
+			1 +	/* MIN tag is length with zero, only length */ \
+			3 +	/* opcount, opcode, opstatus*/ \
+			XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				/* seqid, slotID, slotID, slotID, status */ \
+			5 ) * sizeof(__be32))
+
+static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
+{
+	u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
+
+	if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ)
+		return nfserr_toosmall;
+	if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ)
+		return nfserr_toosmall;
+	ca->headerpadsz = 0;
+	ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
+	ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
+	ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+	ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
+			NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
+	ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
+	/*
+	 * Note decreasing slot size below client's request may make it
+	 * difficult for client to function correctly, whereas
+	 * decreasing the number of slots will (just?) affect
+	 * performance.  When short on memory we therefore prefer to
+	 * decrease number of slots instead of their size.  Clients that
+	 * request larger slots than they need will get poor results:
+	 */
+	ca->maxreqs = nfsd4_get_drc_mem(ca);
+	if (!ca->maxreqs)
+		return nfserr_jukebox;
+
+	return nfs_ok;
+}
+
+#define NFSD_CB_MAX_REQ_SZ	((NFS4_enc_cb_recall_sz + \
+				 RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32))
+#define NFSD_CB_MAX_RESP_SZ	((NFS4_dec_cb_recall_sz + \
+				 RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32))
+
+static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
+{
+	ca->headerpadsz = 0;
+
+	/*
+	 * These RPC_MAX_HEADER macros are overkill, especially since we
+	 * don't even do gss on the backchannel yet.  But this is still
+	 * less than 1k.  Tighten up this estimate in the unlikely event
+	 * it turns out to be a problem for some client:
+	 */
+	if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
+		return nfserr_toosmall;
+	if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
+		return nfserr_toosmall;
+	ca->maxresp_cached = 0;
+	if (ca->maxops < 2)
+		return nfserr_toosmall;
+
+	return nfs_ok;
+}
+
+static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
+{
+	switch (cbs->flavor) {
+	case RPC_AUTH_NULL:
+	case RPC_AUTH_UNIX:
+		return nfs_ok;
+	default:
+		/*
+		 * GSS case: the spec doesn't allow us to return this
+		 * error.  But it also doesn't allow us not to support
+		 * GSS.
+		 * I'd rather this fail hard than return some error the
+		 * client might think it can already handle:
+		 */
+		return nfserr_encr_alg_unsupp;
+	}
+}
+
+__be32
+nfsd4_create_session(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_create_session *cr_ses)
+{
+	struct sockaddr *sa = svc_addr(rqstp);
+	struct nfs4_client *conf, *unconf;
+	struct nfs4_client *old = NULL;
+	struct nfsd4_session *new;
+	struct nfsd4_conn *conn;
+	struct nfsd4_clid_slot *cs_slot = NULL;
+	__be32 status = 0;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
+		return nfserr_inval;
+	status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
+	if (status)
+		return status;
+	status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
+	if (status)
+		return status;
+	status = check_backchannel_attrs(&cr_ses->back_channel);
+	if (status)
+		goto out_release_drc_mem;
+	status = nfserr_jukebox;
+	new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
+	if (!new)
+		goto out_release_drc_mem;
+	conn = alloc_conn_from_crses(rqstp, cr_ses);
+	if (!conn)
+		goto out_free_session;
+
+	spin_lock(&nn->client_lock);
+	unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
+	conf = find_confirmed_client(&cr_ses->clientid, true, nn);
+	WARN_ON_ONCE(conf && unconf);
+
+	if (conf) {
+		status = nfserr_wrong_cred;
+		if (!mach_creds_match(conf, rqstp))
+			goto out_free_conn;
+		cs_slot = &conf->cl_cs_slot;
+		status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
+		if (status == nfserr_replay_cache) {
+			status = nfsd4_replay_create_session(cr_ses, cs_slot);
+			goto out_free_conn;
+		} else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
+			status = nfserr_seq_misordered;
+			goto out_free_conn;
+		}
+	} else if (unconf) {
+		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
+			status = nfserr_clid_inuse;
+			goto out_free_conn;
+		}
+		status = nfserr_wrong_cred;
+		if (!mach_creds_match(unconf, rqstp))
+			goto out_free_conn;
+		cs_slot = &unconf->cl_cs_slot;
+		status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
+		if (status) {
+			/* an unconfirmed replay returns misordered */
+			status = nfserr_seq_misordered;
+			goto out_free_conn;
+		}
+		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
+		if (old) {
+			status = mark_client_expired_locked(old);
+			if (status) {
+				old = NULL;
+				goto out_free_conn;
+			}
+		}
+		move_to_confirmed(unconf);
+		conf = unconf;
+	} else {
+		status = nfserr_stale_clientid;
+		goto out_free_conn;
+	}
+	status = nfs_ok;
+	/*
+	 * We do not support RDMA or persistent sessions
+	 */
+	cr_ses->flags &= ~SESSION4_PERSIST;
+	cr_ses->flags &= ~SESSION4_RDMA;
+
+	init_session(rqstp, new, conf, cr_ses);
+	nfsd4_get_session_locked(new);
+
+	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+	cs_slot->sl_seqid++;
+	cr_ses->seqid = cs_slot->sl_seqid;
+
+	/* cache solo and embedded create sessions under the client_lock */
+	nfsd4_cache_create_session(cr_ses, cs_slot, status);
+	spin_unlock(&nn->client_lock);
+	/* init connection and backchannel */
+	nfsd4_init_conn(rqstp, conn, new);
+	nfsd4_put_session(new);
+	if (old)
+		expire_client(old);
+	return status;
+out_free_conn:
+	spin_unlock(&nn->client_lock);
+	free_conn(conn);
+	if (old)
+		expire_client(old);
+out_free_session:
+	__free_session(new);
+out_release_drc_mem:
+	nfsd4_put_drc_mem(&cr_ses->fore_channel);
+	return status;
+}
+
+static __be32 nfsd4_map_bcts_dir(u32 *dir)
+{
+	switch (*dir) {
+	case NFS4_CDFC4_FORE:
+	case NFS4_CDFC4_BACK:
+		return nfs_ok;
+	case NFS4_CDFC4_FORE_OR_BOTH:
+	case NFS4_CDFC4_BACK_OR_BOTH:
+		*dir = NFS4_CDFC4_BOTH;
+		return nfs_ok;
+	};
+	return nfserr_inval;
+}
+
+__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc)
+{
+	struct nfsd4_session *session = cstate->session;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	__be32 status;
+
+	status = nfsd4_check_cb_sec(&bc->bc_cb_sec);
+	if (status)
+		return status;
+	spin_lock(&nn->client_lock);
+	session->se_cb_prog = bc->bc_cb_program;
+	session->se_cb_sec = bc->bc_cb_sec;
+	spin_unlock(&nn->client_lock);
+
+	nfsd4_probe_callback(session->se_client);
+
+	return nfs_ok;
+}
+
+__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_bind_conn_to_session *bcts)
+{
+	__be32 status;
+	struct nfsd4_conn *conn;
+	struct nfsd4_session *session;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (!nfsd4_last_compound_op(rqstp))
+		return nfserr_not_only_op;
+	spin_lock(&nn->client_lock);
+	session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status);
+	spin_unlock(&nn->client_lock);
+	if (!session)
+		goto out_no_session;
+	status = nfserr_wrong_cred;
+	if (!mach_creds_match(session->se_client, rqstp))
+		goto out;
+	status = nfsd4_map_bcts_dir(&bcts->dir);
+	if (status)
+		goto out;
+	conn = alloc_conn(rqstp, bcts->dir);
+	status = nfserr_jukebox;
+	if (!conn)
+		goto out;
+	nfsd4_init_conn(rqstp, conn, session);
+	status = nfs_ok;
+out:
+	nfsd4_put_session(session);
+out_no_session:
+	return status;
+}
+
+static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
+{
+	if (!session)
+		return 0;
+	return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
+}
+
+__be32
+nfsd4_destroy_session(struct svc_rqst *r,
+		      struct nfsd4_compound_state *cstate,
+		      struct nfsd4_destroy_session *sessionid)
+{
+	struct nfsd4_session *ses;
+	__be32 status;
+	int ref_held_by_me = 0;
+	struct net *net = SVC_NET(r);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	status = nfserr_not_only_op;
+	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
+		if (!nfsd4_last_compound_op(r))
+			goto out;
+		ref_held_by_me++;
+	}
+	dump_sessionid(__func__, &sessionid->sessionid);
+	spin_lock(&nn->client_lock);
+	ses = find_in_sessionid_hashtbl(&sessionid->sessionid, net, &status);
+	if (!ses)
+		goto out_client_lock;
+	status = nfserr_wrong_cred;
+	if (!mach_creds_match(ses->se_client, r))
+		goto out_put_session;
+	status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
+	if (status)
+		goto out_put_session;
+	unhash_session(ses);
+	spin_unlock(&nn->client_lock);
+
+	nfsd4_probe_callback_sync(ses->se_client);
+
+	spin_lock(&nn->client_lock);
+	status = nfs_ok;
+out_put_session:
+	nfsd4_put_session_locked(ses);
+out_client_lock:
+	spin_unlock(&nn->client_lock);
+out:
+	return status;
+}
+
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
+{
+	struct nfsd4_conn *c;
+
+	list_for_each_entry(c, &s->se_conns, cn_persession) {
+		if (c->cn_xprt == xpt) {
+			return c;
+		}
+	}
+	return NULL;
+}
+
+static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd4_conn *c;
+	__be32 status = nfs_ok;
+	int ret;
+
+	spin_lock(&clp->cl_lock);
+	c = __nfsd4_find_conn(new->cn_xprt, ses);
+	if (c)
+		goto out_free;
+	status = nfserr_conn_not_bound_to_session;
+	if (clp->cl_mach_cred)
+		goto out_free;
+	__nfsd4_hash_conn(new, ses);
+	spin_unlock(&clp->cl_lock);
+	ret = nfsd4_register_conn(new);
+	if (ret)
+		/* oops; xprt is already down: */
+		nfsd4_conn_lost(&new->cn_xpt_user);
+	return nfs_ok;
+out_free:
+	spin_unlock(&clp->cl_lock);
+	free_conn(new);
+	return status;
+}
+
+static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
+{
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+
+	return args->opcnt > session->se_fchannel.maxops;
+}
+
+static bool nfsd4_request_too_big(struct svc_rqst *rqstp,
+				  struct nfsd4_session *session)
+{
+	struct xdr_buf *xb = &rqstp->rq_arg;
+
+	return xb->len > session->se_fchannel.maxreq_sz;
+}
+
+__be32
+nfsd4_sequence(struct svc_rqst *rqstp,
+	       struct nfsd4_compound_state *cstate,
+	       struct nfsd4_sequence *seq)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct xdr_stream *xdr = &resp->xdr;
+	struct nfsd4_session *session;
+	struct nfs4_client *clp;
+	struct nfsd4_slot *slot;
+	struct nfsd4_conn *conn;
+	__be32 status;
+	int buflen;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (resp->opcnt != 1)
+		return nfserr_sequence_pos;
+
+	/*
+	 * Will be either used or freed by nfsd4_sequence_check_conn
+	 * below.
+	 */
+	conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
+	if (!conn)
+		return nfserr_jukebox;
+
+	spin_lock(&nn->client_lock);
+	session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status);
+	if (!session)
+		goto out_no_session;
+	clp = session->se_client;
+
+	status = nfserr_too_many_ops;
+	if (nfsd4_session_too_many_ops(rqstp, session))
+		goto out_put_session;
+
+	status = nfserr_req_too_big;
+	if (nfsd4_request_too_big(rqstp, session))
+		goto out_put_session;
+
+	status = nfserr_badslot;
+	if (seq->slotid >= session->se_fchannel.maxreqs)
+		goto out_put_session;
+
+	slot = session->se_slots[seq->slotid];
+	dprintk("%s: slotid %d\n", __func__, seq->slotid);
+
+	/* We do not negotiate the number of slots yet, so set the
+	 * maxslots to the session maxreqs which is used to encode
+	 * sr_highest_slotid and the sr_target_slot id to maxslots */
+	seq->maxslots = session->se_fchannel.maxreqs;
+
+	status = check_slot_seqid(seq->seqid, slot->sl_seqid,
+					slot->sl_flags & NFSD4_SLOT_INUSE);
+	if (status == nfserr_replay_cache) {
+		status = nfserr_seq_misordered;
+		if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED))
+			goto out_put_session;
+		cstate->slot = slot;
+		cstate->session = session;
+		cstate->clp = clp;
+		/* Return the cached reply status and set cstate->status
+		 * for nfsd4_proc_compound processing */
+		status = nfsd4_replay_cache_entry(resp, seq);
+		cstate->status = nfserr_replay_cache;
+		goto out;
+	}
+	if (status)
+		goto out_put_session;
+
+	status = nfsd4_sequence_check_conn(conn, session);
+	conn = NULL;
+	if (status)
+		goto out_put_session;
+
+	buflen = (seq->cachethis) ?
+			session->se_fchannel.maxresp_cached :
+			session->se_fchannel.maxresp_sz;
+	status = (seq->cachethis) ? nfserr_rep_too_big_to_cache :
+				    nfserr_rep_too_big;
+	if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack))
+		goto out_put_session;
+	svc_reserve(rqstp, buflen);
+
+	status = nfs_ok;
+	/* Success! bump slot seqid */
+	slot->sl_seqid = seq->seqid;
+	slot->sl_flags |= NFSD4_SLOT_INUSE;
+	if (seq->cachethis)
+		slot->sl_flags |= NFSD4_SLOT_CACHETHIS;
+	else
+		slot->sl_flags &= ~NFSD4_SLOT_CACHETHIS;
+
+	cstate->slot = slot;
+	cstate->session = session;
+	cstate->clp = clp;
+
+out:
+	switch (clp->cl_cb_state) {
+	case NFSD4_CB_DOWN:
+		seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+		break;
+	case NFSD4_CB_FAULT:
+		seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+		break;
+	default:
+		seq->status_flags = 0;
+	}
+	if (!list_empty(&clp->cl_revoked))
+		seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+out_no_session:
+	if (conn)
+		free_conn(conn);
+	spin_unlock(&nn->client_lock);
+	return status;
+out_put_session:
+	nfsd4_put_session_locked(session);
+	goto out_no_session;
+}
+
+void
+nfsd4_sequence_done(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_compound_state *cs = &resp->cstate;
+
+	if (nfsd4_has_session(cs)) {
+		if (cs->status != nfserr_replay_cache) {
+			nfsd4_store_cache_entry(resp);
+			cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
+		}
+		/* Drop session reference that was taken in nfsd4_sequence() */
+		nfsd4_put_session(cs->session);
+	} else if (cs->clp)
+		put_client_renew(cs->clp);
+}
+
+__be32
+nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
+{
+	struct nfs4_client *conf, *unconf;
+	struct nfs4_client *clp = NULL;
+	__be32 status = 0;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
+	unconf = find_unconfirmed_client(&dc->clientid, true, nn);
+	conf = find_confirmed_client(&dc->clientid, true, nn);
+	WARN_ON_ONCE(conf && unconf);
+
+	if (conf) {
+		if (client_has_state(conf)) {
+			status = nfserr_clientid_busy;
+			goto out;
+		}
+		status = mark_client_expired_locked(conf);
+		if (status)
+			goto out;
+		clp = conf;
+	} else if (unconf)
+		clp = unconf;
+	else {
+		status = nfserr_stale_clientid;
+		goto out;
+	}
+	if (!mach_creds_match(clp, rqstp)) {
+		clp = NULL;
+		status = nfserr_wrong_cred;
+		goto out;
+	}
+	unhash_client_locked(clp);
+out:
+	spin_unlock(&nn->client_lock);
+	if (clp)
+		expire_client(clp);
+	return status;
+}
+
+__be32
+nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
+{
+	__be32 status = 0;
+
+	if (rc->rca_one_fs) {
+		if (!cstate->current_fh.fh_dentry)
+			return nfserr_nofilehandle;
+		/*
+		 * We don't take advantage of the rca_one_fs case.
+		 * That's OK, it's optional, we can safely ignore it.
+		 */
+		 return nfs_ok;
+	}
+
+	status = nfserr_complete_already;
+	if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
+			     &cstate->session->se_client->cl_flags))
+		goto out;
+
+	status = nfserr_stale_clientid;
+	if (is_client_expired(cstate->session->se_client))
+		/*
+		 * The following error isn't really legal.
+		 * But we only get here if the client just explicitly
+		 * destroyed the client.  Surely it no longer cares what
+		 * error it gets back on an operation for the dead
+		 * client.
+		 */
+		goto out;
+
+	status = nfs_ok;
+	nfsd4_client_record_create(cstate->session->se_client);
+out:
+	return status;
+}
+
+__be32
+nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		  struct nfsd4_setclientid *setclid)
+{
+	struct xdr_netobj 	clname = setclid->se_name;
+	nfs4_verifier		clverifier = setclid->se_verf;
+	struct nfs4_client	*conf, *new;
+	struct nfs4_client	*unconf = NULL;
+	__be32 			status;
+	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	new = create_client(clname, rqstp, &clverifier);
+	if (new == NULL)
+		return nfserr_jukebox;
+	/* Cases below refer to rfc 3530 section 14.2.33: */
+	spin_lock(&nn->client_lock);
+	conf = find_confirmed_client_by_name(&clname, nn);
+	if (conf) {
+		/* case 0: */
+		status = nfserr_clid_inuse;
+		if (clp_used_exchangeid(conf))
+			goto out;
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			char addr_str[INET6_ADDRSTRLEN];
+			rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
+				 sizeof(addr_str));
+			dprintk("NFSD: setclientid: string in use by client "
+				"at %s\n", addr_str);
+			goto out;
+		}
+	}
+	unconf = find_unconfirmed_client_by_name(&clname, nn);
+	if (unconf)
+		unhash_client_locked(unconf);
+	if (conf && same_verf(&conf->cl_verifier, &clverifier))
+		/* case 1: probable callback update */
+		copy_clid(new, conf);
+	else /* case 4 (new client) or cases 2, 3 (client reboot): */
+		gen_clid(new, nn);
+	new->cl_minorversion = 0;
+	gen_callback(new, setclid, rqstp);
+	add_to_unconfirmed(new);
+	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
+	setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
+	memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
+	new = NULL;
+	status = nfs_ok;
+out:
+	spin_unlock(&nn->client_lock);
+	if (new)
+		free_client(new);
+	if (unconf)
+		expire_client(unconf);
+	return status;
+}
+
+
+__be32
+nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
+			 struct nfsd4_compound_state *cstate,
+			 struct nfsd4_setclientid_confirm *setclientid_confirm)
+{
+	struct nfs4_client *conf, *unconf;
+	struct nfs4_client *old = NULL;
+	nfs4_verifier confirm = setclientid_confirm->sc_confirm; 
+	clientid_t * clid = &setclientid_confirm->sc_clientid;
+	__be32 status;
+	struct nfsd_net	*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	if (STALE_CLIENTID(clid, nn))
+		return nfserr_stale_clientid;
+
+	spin_lock(&nn->client_lock);
+	conf = find_confirmed_client(clid, false, nn);
+	unconf = find_unconfirmed_client(clid, false, nn);
+	/*
+	 * We try hard to give out unique clientid's, so if we get an
+	 * attempt to confirm the same clientid with a different cred,
+	 * there's a bug somewhere.  Let's charitably assume it's our
+	 * bug.
+	 */
+	status = nfserr_serverfault;
+	if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
+		goto out;
+	if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
+		goto out;
+	/* cases below refer to rfc 3530 section 14.2.34: */
+	if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
+		if (conf && !unconf) /* case 2: probable retransmit */
+			status = nfs_ok;
+		else /* case 4: client hasn't noticed we rebooted yet? */
+			status = nfserr_stale_clientid;
+		goto out;
+	}
+	status = nfs_ok;
+	if (conf) { /* case 1: callback update */
+		old = unconf;
+		unhash_client_locked(old);
+		nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+	} else { /* case 3: normal case; new or rebooted client */
+		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
+		if (old) {
+			status = mark_client_expired_locked(old);
+			if (status) {
+				old = NULL;
+				goto out;
+			}
+		}
+		move_to_confirmed(unconf);
+		conf = unconf;
+	}
+	get_client_locked(conf);
+	spin_unlock(&nn->client_lock);
+	nfsd4_probe_callback(conf);
+	spin_lock(&nn->client_lock);
+	put_client_renew_locked(conf);
+out:
+	spin_unlock(&nn->client_lock);
+	if (old)
+		expire_client(old);
+	return status;
+}
+
+static struct nfs4_file *nfsd4_alloc_file(void)
+{
+	return kmem_cache_alloc(file_slab, GFP_KERNEL);
+}
+
+/* OPEN Share state helper functions */
+static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
+				struct nfs4_file *fp)
+{
+	lockdep_assert_held(&state_lock);
+
+	atomic_set(&fp->fi_ref, 1);
+	spin_lock_init(&fp->fi_lock);
+	INIT_LIST_HEAD(&fp->fi_stateids);
+	INIT_LIST_HEAD(&fp->fi_delegations);
+	INIT_LIST_HEAD(&fp->fi_clnt_odstate);
+	fh_copy_shallow(&fp->fi_fhandle, fh);
+	fp->fi_deleg_file = NULL;
+	fp->fi_had_conflict = false;
+	fp->fi_share_deny = 0;
+	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+	memset(fp->fi_access, 0, sizeof(fp->fi_access));
+#ifdef CONFIG_NFSD_PNFS
+	INIT_LIST_HEAD(&fp->fi_lo_states);
+	atomic_set(&fp->fi_lo_recalls, 0);
+#endif
+	hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
+}
+
+void
+nfsd4_free_slabs(void)
+{
+	kmem_cache_destroy(odstate_slab);
+	kmem_cache_destroy(openowner_slab);
+	kmem_cache_destroy(lockowner_slab);
+	kmem_cache_destroy(file_slab);
+	kmem_cache_destroy(stateid_slab);
+	kmem_cache_destroy(deleg_slab);
+}
+
+int
+nfsd4_init_slabs(void)
+{
+	openowner_slab = kmem_cache_create("nfsd4_openowners",
+			sizeof(struct nfs4_openowner), 0, 0, NULL);
+	if (openowner_slab == NULL)
+		goto out;
+	lockowner_slab = kmem_cache_create("nfsd4_lockowners",
+			sizeof(struct nfs4_lockowner), 0, 0, NULL);
+	if (lockowner_slab == NULL)
+		goto out_free_openowner_slab;
+	file_slab = kmem_cache_create("nfsd4_files",
+			sizeof(struct nfs4_file), 0, 0, NULL);
+	if (file_slab == NULL)
+		goto out_free_lockowner_slab;
+	stateid_slab = kmem_cache_create("nfsd4_stateids",
+			sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
+	if (stateid_slab == NULL)
+		goto out_free_file_slab;
+	deleg_slab = kmem_cache_create("nfsd4_delegations",
+			sizeof(struct nfs4_delegation), 0, 0, NULL);
+	if (deleg_slab == NULL)
+		goto out_free_stateid_slab;
+	odstate_slab = kmem_cache_create("nfsd4_odstate",
+			sizeof(struct nfs4_clnt_odstate), 0, 0, NULL);
+	if (odstate_slab == NULL)
+		goto out_free_deleg_slab;
+	return 0;
+
+out_free_deleg_slab:
+	kmem_cache_destroy(deleg_slab);
+out_free_stateid_slab:
+	kmem_cache_destroy(stateid_slab);
+out_free_file_slab:
+	kmem_cache_destroy(file_slab);
+out_free_lockowner_slab:
+	kmem_cache_destroy(lockowner_slab);
+out_free_openowner_slab:
+	kmem_cache_destroy(openowner_slab);
+out:
+	dprintk("nfsd4: out of memory while initializing nfsv4\n");
+	return -ENOMEM;
+}
+
+static void init_nfs4_replay(struct nfs4_replay *rp)
+{
+	rp->rp_status = nfserr_serverfault;
+	rp->rp_buflen = 0;
+	rp->rp_buf = rp->rp_ibuf;
+	mutex_init(&rp->rp_mutex);
+}
+
+static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
+		struct nfs4_stateowner *so)
+{
+	if (!nfsd4_has_session(cstate)) {
+		mutex_lock(&so->so_replay.rp_mutex);
+		cstate->replay_owner = nfs4_get_stateowner(so);
+	}
+}
+
+void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
+{
+	struct nfs4_stateowner *so = cstate->replay_owner;
+
+	if (so != NULL) {
+		cstate->replay_owner = NULL;
+		mutex_unlock(&so->so_replay.rp_mutex);
+		nfs4_put_stateowner(so);
+	}
+}
+
+static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
+{
+	struct nfs4_stateowner *sop;
+
+	sop = kmem_cache_alloc(slab, GFP_KERNEL);
+	if (!sop)
+		return NULL;
+
+	sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL);
+	if (!sop->so_owner.data) {
+		kmem_cache_free(slab, sop);
+		return NULL;
+	}
+	sop->so_owner.len = owner->len;
+
+	INIT_LIST_HEAD(&sop->so_stateids);
+	sop->so_client = clp;
+	init_nfs4_replay(&sop->so_replay);
+	atomic_set(&sop->so_count, 1);
+	return sop;
+}
+
+static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
+{
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_add(&oo->oo_owner.so_strhash,
+		 &clp->cl_ownerstr_hashtbl[strhashval]);
+	list_add(&oo->oo_perclient, &clp->cl_openowners);
+}
+
+static void nfs4_unhash_openowner(struct nfs4_stateowner *so)
+{
+	unhash_openowner_locked(openowner(so));
+}
+
+static void nfs4_free_openowner(struct nfs4_stateowner *so)
+{
+	struct nfs4_openowner *oo = openowner(so);
+
+	kmem_cache_free(openowner_slab, oo);
+}
+
+static const struct nfs4_stateowner_operations openowner_ops = {
+	.so_unhash =	nfs4_unhash_openowner,
+	.so_free =	nfs4_free_openowner,
+};
+
+static struct nfs4_openowner *
+alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
+			   struct nfsd4_compound_state *cstate)
+{
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_openowner *oo, *ret;
+
+	oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
+	if (!oo)
+		return NULL;
+	oo->oo_owner.so_ops = &openowner_ops;
+	oo->oo_owner.so_is_open_owner = 1;
+	oo->oo_owner.so_seqid = open->op_seqid;
+	oo->oo_flags = 0;
+	if (nfsd4_has_session(cstate))
+		oo->oo_flags |= NFS4_OO_CONFIRMED;
+	oo->oo_time = 0;
+	oo->oo_last_closed_stid = NULL;
+	INIT_LIST_HEAD(&oo->oo_close_lru);
+	spin_lock(&clp->cl_lock);
+	ret = find_openstateowner_str_locked(strhashval, open, clp);
+	if (ret == NULL) {
+		hash_openowner(oo, clp, strhashval);
+		ret = oo;
+	} else
+		nfs4_free_openowner(&oo->oo_owner);
+	spin_unlock(&clp->cl_lock);
+	return ret;
+}
+
+static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+	struct nfs4_openowner *oo = open->op_openowner;
+
+	atomic_inc(&stp->st_stid.sc_count);
+	stp->st_stid.sc_type = NFS4_OPEN_STID;
+	INIT_LIST_HEAD(&stp->st_locks);
+	stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
+	get_nfs4_file(fp);
+	stp->st_stid.sc_file = fp;
+	stp->st_access_bmap = 0;
+	stp->st_deny_bmap = 0;
+	stp->st_openstp = NULL;
+	spin_lock(&oo->oo_owner.so_client->cl_lock);
+	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
+	spin_lock(&fp->fi_lock);
+	list_add(&stp->st_perfile, &fp->fi_stateids);
+	spin_unlock(&fp->fi_lock);
+	spin_unlock(&oo->oo_owner.so_client->cl_lock);
+}
+
+/*
+ * In the 4.0 case we need to keep the owners around a little while to handle
+ * CLOSE replay. We still do need to release any file access that is held by
+ * them before returning however.
+ */
+static void
+move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
+{
+	struct nfs4_ol_stateid *last;
+	struct nfs4_openowner *oo = openowner(s->st_stateowner);
+	struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net,
+						nfsd_net_id);
+
+	dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
+
+	/*
+	 * We know that we hold one reference via nfsd4_close, and another
+	 * "persistent" reference for the client. If the refcount is higher
+	 * than 2, then there are still calls in progress that are using this
+	 * stateid. We can't put the sc_file reference until they are finished.
+	 * Wait for the refcount to drop to 2. Since it has been unhashed,
+	 * there should be no danger of the refcount going back up again at
+	 * this point.
+	 */
+	wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2);
+
+	release_all_access(s);
+	if (s->st_stid.sc_file) {
+		put_nfs4_file(s->st_stid.sc_file);
+		s->st_stid.sc_file = NULL;
+	}
+
+	spin_lock(&nn->client_lock);
+	last = oo->oo_last_closed_stid;
+	oo->oo_last_closed_stid = s;
+	list_move_tail(&oo->oo_close_lru, &nn->close_lru);
+	oo->oo_time = get_seconds();
+	spin_unlock(&nn->client_lock);
+	if (last)
+		nfs4_put_stid(&last->st_stid);
+}
+
+/* search file_hashtbl[] for file */
+static struct nfs4_file *
+find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
+{
+	struct nfs4_file *fp;
+
+	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
+		if (fh_match(&fp->fi_fhandle, fh)) {
+			if (atomic_inc_not_zero(&fp->fi_ref))
+				return fp;
+		}
+	}
+	return NULL;
+}
+
+struct nfs4_file *
+find_file(struct knfsd_fh *fh)
+{
+	struct nfs4_file *fp;
+	unsigned int hashval = file_hashval(fh);
+
+	rcu_read_lock();
+	fp = find_file_locked(fh, hashval);
+	rcu_read_unlock();
+	return fp;
+}
+
+static struct nfs4_file *
+find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
+{
+	struct nfs4_file *fp;
+	unsigned int hashval = file_hashval(fh);
+
+	rcu_read_lock();
+	fp = find_file_locked(fh, hashval);
+	rcu_read_unlock();
+	if (fp)
+		return fp;
+
+	spin_lock(&state_lock);
+	fp = find_file_locked(fh, hashval);
+	if (likely(fp == NULL)) {
+		nfsd4_init_file(fh, hashval, new);
+		fp = new;
+	}
+	spin_unlock(&state_lock);
+
+	return fp;
+}
+
+/*
+ * Called to check deny when READ with all zero stateid or
+ * WRITE with all zero or all one stateid
+ */
+static __be32
+nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
+{
+	struct nfs4_file *fp;
+	__be32 ret = nfs_ok;
+
+	fp = find_file(&current_fh->fh_handle);
+	if (!fp)
+		return ret;
+	/* Check for conflicting share reservations */
+	spin_lock(&fp->fi_lock);
+	if (fp->fi_share_deny & deny_type)
+		ret = nfserr_locked;
+	spin_unlock(&fp->fi_lock);
+	put_nfs4_file(fp);
+	return ret;
+}
+
+static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
+{
+	struct nfs4_delegation *dp = cb_to_delegation(cb);
+	struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
+					  nfsd_net_id);
+
+	block_delegations(&dp->dl_stid.sc_file->fi_fhandle);
+
+	/*
+	 * We can't do this in nfsd_break_deleg_cb because it is
+	 * already holding inode->i_lock.
+	 *
+	 * If the dl_time != 0, then we know that it has already been
+	 * queued for a lease break. Don't queue it again.
+	 */
+	spin_lock(&state_lock);
+	if (dp->dl_time == 0) {
+		dp->dl_time = get_seconds();
+		list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
+	}
+	spin_unlock(&state_lock);
+}
+
+static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
+		struct rpc_task *task)
+{
+	struct nfs4_delegation *dp = cb_to_delegation(cb);
+
+	switch (task->tk_status) {
+	case 0:
+		return 1;
+	case -EBADHANDLE:
+	case -NFS4ERR_BAD_STATEID:
+		/*
+		 * Race: client probably got cb_recall before open reply
+		 * granting delegation.
+		 */
+		if (dp->dl_retries--) {
+			rpc_delay(task, 2 * HZ);
+			return 0;
+		}
+		/*FALLTHRU*/
+	default:
+		return -1;
+	}
+}
+
+static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
+{
+	struct nfs4_delegation *dp = cb_to_delegation(cb);
+
+	nfs4_put_stid(&dp->dl_stid);
+}
+
+static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+	.prepare	= nfsd4_cb_recall_prepare,
+	.done		= nfsd4_cb_recall_done,
+	.release	= nfsd4_cb_recall_release,
+};
+
+static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
+{
+	/*
+	 * We're assuming the state code never drops its reference
+	 * without first removing the lease.  Since we're in this lease
+	 * callback (and since the lease code is serialized by the kernel
+	 * lock) we know the server hasn't removed the lease yet, we know
+	 * it's safe to take a reference.
+	 */
+	atomic_inc(&dp->dl_stid.sc_count);
+	nfsd4_run_cb(&dp->dl_recall);
+}
+
+/* Called from break_lease() with i_lock held. */
+static bool
+nfsd_break_deleg_cb(struct file_lock *fl)
+{
+	bool ret = false;
+	struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
+	struct nfs4_delegation *dp;
+
+	if (!fp) {
+		WARN(1, "(%p)->fl_owner NULL\n", fl);
+		return ret;
+	}
+	if (fp->fi_had_conflict) {
+		WARN(1, "duplicate break on %p\n", fp);
+		return ret;
+	}
+	/*
+	 * We don't want the locks code to timeout the lease for us;
+	 * we'll remove it ourself if a delegation isn't returned
+	 * in time:
+	 */
+	fl->fl_break_time = 0;
+
+	spin_lock(&fp->fi_lock);
+	fp->fi_had_conflict = true;
+	/*
+	 * If there are no delegations on the list, then return true
+	 * so that the lease code will go ahead and delete it.
+	 */
+	if (list_empty(&fp->fi_delegations))
+		ret = true;
+	else
+		list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
+			nfsd_break_one_deleg(dp);
+	spin_unlock(&fp->fi_lock);
+	return ret;
+}
+
+static int
+nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
+		     struct list_head *dispose)
+{
+	if (arg & F_UNLCK)
+		return lease_modify(onlist, arg, dispose);
+	else
+		return -EAGAIN;
+}
+
+static const struct lock_manager_operations nfsd_lease_mng_ops = {
+	.lm_break = nfsd_break_deleg_cb,
+	.lm_change = nfsd_change_deleg_cb,
+};
+
+static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid)
+{
+	if (nfsd4_has_session(cstate))
+		return nfs_ok;
+	if (seqid == so->so_seqid - 1)
+		return nfserr_replay_me;
+	if (seqid == so->so_seqid)
+		return nfs_ok;
+	return nfserr_bad_seqid;
+}
+
+static __be32 lookup_clientid(clientid_t *clid,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd_net *nn)
+{
+	struct nfs4_client *found;
+
+	if (cstate->clp) {
+		found = cstate->clp;
+		if (!same_clid(&found->cl_clientid, clid))
+			return nfserr_stale_clientid;
+		return nfs_ok;
+	}
+
+	if (STALE_CLIENTID(clid, nn))
+		return nfserr_stale_clientid;
+
+	/*
+	 * For v4.1+ we get the client in the SEQUENCE op. If we don't have one
+	 * cached already then we know this is for is for v4.0 and "sessions"
+	 * will be false.
+	 */
+	WARN_ON_ONCE(cstate->session);
+	spin_lock(&nn->client_lock);
+	found = find_confirmed_client(clid, false, nn);
+	if (!found) {
+		spin_unlock(&nn->client_lock);
+		return nfserr_expired;
+	}
+	atomic_inc(&found->cl_refcount);
+	spin_unlock(&nn->client_lock);
+
+	/* Cache the nfs4_client in cstate! */
+	cstate->clp = found;
+	return nfs_ok;
+}
+
+__be32
+nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+		    struct nfsd4_open *open, struct nfsd_net *nn)
+{
+	clientid_t *clientid = &open->op_clientid;
+	struct nfs4_client *clp = NULL;
+	unsigned int strhashval;
+	struct nfs4_openowner *oo = NULL;
+	__be32 status;
+
+	if (STALE_CLIENTID(&open->op_clientid, nn))
+		return nfserr_stale_clientid;
+	/*
+	 * In case we need it later, after we've already created the
+	 * file and don't want to risk a further failure:
+	 */
+	open->op_file = nfsd4_alloc_file();
+	if (open->op_file == NULL)
+		return nfserr_jukebox;
+
+	status = lookup_clientid(clientid, cstate, nn);
+	if (status)
+		return status;
+	clp = cstate->clp;
+
+	strhashval = ownerstr_hashval(&open->op_owner);
+	oo = find_openstateowner_str(strhashval, open, clp);
+	open->op_openowner = oo;
+	if (!oo) {
+		goto new_owner;
+	}
+	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+		/* Replace unconfirmed owners without checking for replay. */
+		release_openowner(oo);
+		open->op_openowner = NULL;
+		goto new_owner;
+	}
+	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
+	if (status)
+		return status;
+	goto alloc_stateid;
+new_owner:
+	oo = alloc_init_open_stateowner(strhashval, open, cstate);
+	if (oo == NULL)
+		return nfserr_jukebox;
+	open->op_openowner = oo;
+alloc_stateid:
+	open->op_stp = nfs4_alloc_open_stateid(clp);
+	if (!open->op_stp)
+		return nfserr_jukebox;
+
+	if (nfsd4_has_session(cstate) &&
+	    (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) {
+		open->op_odstate = alloc_clnt_odstate(clp);
+		if (!open->op_odstate)
+			return nfserr_jukebox;
+	}
+
+	return nfs_ok;
+}
+
+static inline __be32
+nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
+{
+	if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ))
+		return nfserr_openmode;
+	else
+		return nfs_ok;
+}
+
+static int share_access_to_flags(u32 share_access)
+{
+	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+}
+
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
+{
+	struct nfs4_stid *ret;
+
+	ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID);
+	if (!ret)
+		return NULL;
+	return delegstateid(ret);
+}
+
+static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
+{
+	return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+	       open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH;
+}
+
+static __be32
+nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
+		struct nfs4_delegation **dp)
+{
+	int flags;
+	__be32 status = nfserr_bad_stateid;
+	struct nfs4_delegation *deleg;
+
+	deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
+	if (deleg == NULL)
+		goto out;
+	flags = share_access_to_flags(open->op_share_access);
+	status = nfs4_check_delegmode(deleg, flags);
+	if (status) {
+		nfs4_put_stid(&deleg->dl_stid);
+		goto out;
+	}
+	*dp = deleg;
+out:
+	if (!nfsd4_is_deleg_cur(open))
+		return nfs_ok;
+	if (status)
+		return status;
+	open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+	return nfs_ok;
+}
+
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+	struct nfs4_ol_stateid *local, *ret = NULL;
+	struct nfs4_openowner *oo = open->op_openowner;
+
+	spin_lock(&fp->fi_lock);
+	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
+		/* ignore lock owners */
+		if (local->st_stateowner->so_is_open_owner == 0)
+			continue;
+		if (local->st_stateowner == &oo->oo_owner) {
+			ret = local;
+			atomic_inc(&ret->st_stid.sc_count);
+			break;
+		}
+	}
+	spin_unlock(&fp->fi_lock);
+	return ret;
+}
+
+static inline int nfs4_access_to_access(u32 nfs4_access)
+{
+	int flags = 0;
+
+	if (nfs4_access & NFS4_SHARE_ACCESS_READ)
+		flags |= NFSD_MAY_READ;
+	if (nfs4_access & NFS4_SHARE_ACCESS_WRITE)
+		flags |= NFSD_MAY_WRITE;
+	return flags;
+}
+
+static inline __be32
+nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
+		struct nfsd4_open *open)
+{
+	struct iattr iattr = {
+		.ia_valid = ATTR_SIZE,
+		.ia_size = 0,
+	};
+	if (!open->op_truncate)
+		return 0;
+	if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
+		return nfserr_inval;
+	return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
+}
+
+static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
+		struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
+		struct nfsd4_open *open)
+{
+	struct file *filp = NULL;
+	__be32 status;
+	int oflag = nfs4_access_to_omode(open->op_share_access);
+	int access = nfs4_access_to_access(open->op_share_access);
+	unsigned char old_access_bmap, old_deny_bmap;
+
+	spin_lock(&fp->fi_lock);
+
+	/*
+	 * Are we trying to set a deny mode that would conflict with
+	 * current access?
+	 */
+	status = nfs4_file_check_deny(fp, open->op_share_deny);
+	if (status != nfs_ok) {
+		spin_unlock(&fp->fi_lock);
+		goto out;
+	}
+
+	/* set access to the file */
+	status = nfs4_file_get_access(fp, open->op_share_access);
+	if (status != nfs_ok) {
+		spin_unlock(&fp->fi_lock);
+		goto out;
+	}
+
+	/* Set access bits in stateid */
+	old_access_bmap = stp->st_access_bmap;
+	set_access(open->op_share_access, stp);
+
+	/* Set new deny mask */
+	old_deny_bmap = stp->st_deny_bmap;
+	set_deny(open->op_share_deny, stp);
+	fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+
+	if (!fp->fi_fds[oflag]) {
+		spin_unlock(&fp->fi_lock);
+		status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp);
+		if (status)
+			goto out_put_access;
+		spin_lock(&fp->fi_lock);
+		if (!fp->fi_fds[oflag]) {
+			fp->fi_fds[oflag] = filp;
+			filp = NULL;
+		}
+	}
+	spin_unlock(&fp->fi_lock);
+	if (filp)
+		fput(filp);
+
+	status = nfsd4_truncate(rqstp, cur_fh, open);
+	if (status)
+		goto out_put_access;
+out:
+	return status;
+out_put_access:
+	stp->st_access_bmap = old_access_bmap;
+	nfs4_file_put_access(fp, open->op_share_access);
+	reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp);
+	goto out;
+}
+
+static __be32
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
+{
+	__be32 status;
+	unsigned char old_deny_bmap;
+
+	if (!test_access(open->op_share_access, stp))
+		return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
+
+	/* test and set deny mode */
+	spin_lock(&fp->fi_lock);
+	status = nfs4_file_check_deny(fp, open->op_share_deny);
+	if (status == nfs_ok) {
+		old_deny_bmap = stp->st_deny_bmap;
+		set_deny(open->op_share_deny, stp);
+		fp->fi_share_deny |=
+				(open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+	}
+	spin_unlock(&fp->fi_lock);
+
+	if (status != nfs_ok)
+		return status;
+
+	status = nfsd4_truncate(rqstp, cur_fh, open);
+	if (status != nfs_ok)
+		reset_union_bmap_deny(old_deny_bmap, stp);
+	return status;
+}
+
+static void
+nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session)
+{
+	open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+}
+
+/* Should we give out recallable state?: */
+static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
+{
+	if (clp->cl_cb_state == NFSD4_CB_UP)
+		return true;
+	/*
+	 * In the sessions case, since we don't have to establish a
+	 * separate connection for callbacks, we assume it's OK
+	 * until we hear otherwise:
+	 */
+	return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
+}
+
+static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
+{
+	struct file_lock *fl;
+
+	fl = locks_alloc_lock();
+	if (!fl)
+		return NULL;
+	fl->fl_lmops = &nfsd_lease_mng_ops;
+	fl->fl_flags = FL_DELEG;
+	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_owner = (fl_owner_t)fp;
+	fl->fl_pid = current->tgid;
+	return fl;
+}
+
+static int nfs4_setlease(struct nfs4_delegation *dp)
+{
+	struct nfs4_file *fp = dp->dl_stid.sc_file;
+	struct file_lock *fl, *ret;
+	struct file *filp;
+	int status = 0;
+
+	fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ);
+	if (!fl)
+		return -ENOMEM;
+	filp = find_readable_file(fp);
+	if (!filp) {
+		/* We should always have a readable file here */
+		WARN_ON_ONCE(1);
+		return -EBADF;
+	}
+	fl->fl_file = filp;
+	ret = fl;
+	status = vfs_setlease(filp, fl->fl_type, &fl, NULL);
+	if (fl)
+		locks_free_lock(fl);
+	if (status)
+		goto out_fput;
+	spin_lock(&state_lock);
+	spin_lock(&fp->fi_lock);
+	/* Did the lease get broken before we took the lock? */
+	status = -EAGAIN;
+	if (fp->fi_had_conflict)
+		goto out_unlock;
+	/* Race breaker */
+	if (fp->fi_deleg_file) {
+		status = 0;
+		++fp->fi_delegees;
+		hash_delegation_locked(dp, fp);
+		goto out_unlock;
+	}
+	fp->fi_deleg_file = filp;
+	fp->fi_delegees = 1;
+	hash_delegation_locked(dp, fp);
+	spin_unlock(&fp->fi_lock);
+	spin_unlock(&state_lock);
+	return 0;
+out_unlock:
+	spin_unlock(&fp->fi_lock);
+	spin_unlock(&state_lock);
+out_fput:
+	fput(filp);
+	return status;
+}
+
+static struct nfs4_delegation *
+nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
+		    struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
+{
+	int status;
+	struct nfs4_delegation *dp;
+
+	if (fp->fi_had_conflict)
+		return ERR_PTR(-EAGAIN);
+
+	dp = alloc_init_deleg(clp, fh, odstate);
+	if (!dp)
+		return ERR_PTR(-ENOMEM);
+
+	get_nfs4_file(fp);
+	spin_lock(&state_lock);
+	spin_lock(&fp->fi_lock);
+	dp->dl_stid.sc_file = fp;
+	if (!fp->fi_deleg_file) {
+		spin_unlock(&fp->fi_lock);
+		spin_unlock(&state_lock);
+		status = nfs4_setlease(dp);
+		goto out;
+	}
+	if (fp->fi_had_conflict) {
+		status = -EAGAIN;
+		goto out_unlock;
+	}
+	++fp->fi_delegees;
+	hash_delegation_locked(dp, fp);
+	status = 0;
+out_unlock:
+	spin_unlock(&fp->fi_lock);
+	spin_unlock(&state_lock);
+out:
+	if (status) {
+		put_clnt_odstate(dp->dl_clnt_odstate);
+		nfs4_put_stid(&dp->dl_stid);
+		return ERR_PTR(status);
+	}
+	return dp;
+}
+
+static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
+{
+	open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+	if (status == -EAGAIN)
+		open->op_why_no_deleg = WND4_CONTENTION;
+	else {
+		open->op_why_no_deleg = WND4_RESOURCE;
+		switch (open->op_deleg_want) {
+		case NFS4_SHARE_WANT_READ_DELEG:
+		case NFS4_SHARE_WANT_WRITE_DELEG:
+		case NFS4_SHARE_WANT_ANY_DELEG:
+			break;
+		case NFS4_SHARE_WANT_CANCEL:
+			open->op_why_no_deleg = WND4_CANCELLED;
+			break;
+		case NFS4_SHARE_WANT_NO_DELEG:
+			WARN_ON_ONCE(1);
+		}
+	}
+}
+
+/*
+ * Attempt to hand out a delegation.
+ *
+ * Note we don't support write delegations, and won't until the vfs has
+ * proper support for them.
+ */
+static void
+nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
+			struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_delegation *dp;
+	struct nfs4_openowner *oo = openowner(stp->st_stateowner);
+	struct nfs4_client *clp = stp->st_stid.sc_client;
+	int cb_up;
+	int status = 0;
+
+	cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
+	open->op_recall = 0;
+	switch (open->op_claim_type) {
+		case NFS4_OPEN_CLAIM_PREVIOUS:
+			if (!cb_up)
+				open->op_recall = 1;
+			if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
+				goto out_no_deleg;
+			break;
+		case NFS4_OPEN_CLAIM_NULL:
+		case NFS4_OPEN_CLAIM_FH:
+			/*
+			 * Let's not give out any delegations till everyone's
+			 * had the chance to reclaim theirs....
+			 */
+			if (locks_in_grace(clp->net))
+				goto out_no_deleg;
+			if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
+				goto out_no_deleg;
+			/*
+			 * Also, if the file was opened for write or
+			 * create, there's a good chance the client's
+			 * about to write to it, resulting in an
+			 * immediate recall (since we don't support
+			 * write delegations):
+			 */
+			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+				goto out_no_deleg;
+			if (open->op_create == NFS4_OPEN_CREATE)
+				goto out_no_deleg;
+			break;
+		default:
+			goto out_no_deleg;
+	}
+	dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate);
+	if (IS_ERR(dp))
+		goto out_no_deleg;
+
+	memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
+
+	dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
+		STATEID_VAL(&dp->dl_stid.sc_stateid));
+	open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
+	nfs4_put_stid(&dp->dl_stid);
+	return;
+out_no_deleg:
+	open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
+	if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
+	    open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
+		dprintk("NFSD: WARNING: refusing delegation reclaim\n");
+		open->op_recall = 1;
+	}
+
+	/* 4.1 client asking for a delegation? */
+	if (open->op_deleg_want)
+		nfsd4_open_deleg_none_ext(open, status);
+	return;
+}
+
+static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
+					struct nfs4_delegation *dp)
+{
+	if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG &&
+	    dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) {
+		open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+		open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE;
+	} else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG &&
+		   dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) {
+		open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+		open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE;
+	}
+	/* Otherwise the client must be confused wanting a delegation
+	 * it already has, therefore we don't return
+	 * NFS4_OPEN_DELEGATE_NONE_EXT and reason.
+	 */
+}
+
+__be32
+nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
+	struct nfs4_file *fp = NULL;
+	struct nfs4_ol_stateid *stp = NULL;
+	struct nfs4_delegation *dp = NULL;
+	__be32 status;
+
+	/*
+	 * Lookup file; if found, lookup stateid and check open request,
+	 * and check for delegations in the process of being recalled.
+	 * If not found, create the nfs4_file struct
+	 */
+	fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
+	if (fp != open->op_file) {
+		status = nfs4_check_deleg(cl, open, &dp);
+		if (status)
+			goto out;
+		stp = nfsd4_find_existing_open(fp, open);
+	} else {
+		open->op_file = NULL;
+		status = nfserr_bad_stateid;
+		if (nfsd4_is_deleg_cur(open))
+			goto out;
+	}
+
+	/*
+	 * OPEN the file, or upgrade an existing OPEN.
+	 * If truncate fails, the OPEN fails.
+	 */
+	if (stp) {
+		/* Stateid was found, this is an OPEN upgrade */
+		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
+		if (status)
+			goto out;
+	} else {
+		stp = open->op_stp;
+		open->op_stp = NULL;
+		init_open_stateid(stp, fp, open);
+		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
+		if (status) {
+			release_open_stateid(stp);
+			goto out;
+		}
+
+		stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp,
+							open->op_odstate);
+		if (stp->st_clnt_odstate == open->op_odstate)
+			open->op_odstate = NULL;
+	}
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+
+	if (nfsd4_has_session(&resp->cstate)) {
+		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
+			open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+			open->op_why_no_deleg = WND4_NOT_WANTED;
+			goto nodeleg;
+		}
+	}
+
+	/*
+	* Attempt to hand out a delegation. No error return, because the
+	* OPEN succeeds even if we fail.
+	*/
+	nfs4_open_delegation(current_fh, open, stp);
+nodeleg:
+	status = nfs_ok;
+
+	dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
+		STATEID_VAL(&stp->st_stid.sc_stateid));
+out:
+	/* 4.1 client trying to upgrade/downgrade delegation? */
+	if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp &&
+	    open->op_deleg_want)
+		nfsd4_deleg_xgrade_none_ext(open, dp);
+
+	if (fp)
+		put_nfs4_file(fp);
+	if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+		nfs4_set_claim_prev(open, nfsd4_has_session(&resp->cstate));
+	/*
+	* To finish the open response, we just need to set the rflags.
+	*/
+	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
+	if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
+	    !nfsd4_has_session(&resp->cstate))
+		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
+	if (dp)
+		nfs4_put_stid(&dp->dl_stid);
+	if (stp)
+		nfs4_put_stid(&stp->st_stid);
+
+	return status;
+}
+
+void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
+			      struct nfsd4_open *open)
+{
+	if (open->op_openowner) {
+		struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
+
+		nfsd4_cstate_assign_replay(cstate, so);
+		nfs4_put_stateowner(so);
+	}
+	if (open->op_file)
+		kmem_cache_free(file_slab, open->op_file);
+	if (open->op_stp)
+		nfs4_put_stid(&open->op_stp->st_stid);
+	if (open->op_odstate)
+		kmem_cache_free(odstate_slab, open->op_odstate);
+}
+
+__be32
+nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    clientid_t *clid)
+{
+	struct nfs4_client *clp;
+	__be32 status;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	dprintk("process_renew(%08x/%08x): starting\n", 
+			clid->cl_boot, clid->cl_id);
+	status = lookup_clientid(clid, cstate, nn);
+	if (status)
+		goto out;
+	clp = cstate->clp;
+	status = nfserr_cb_path_down;
+	if (!list_empty(&clp->cl_delegations)
+			&& clp->cl_cb_state != NFSD4_CB_UP)
+		goto out;
+	status = nfs_ok;
+out:
+	return status;
+}
+
+void
+nfsd4_end_grace(struct nfsd_net *nn)
+{
+	/* do nothing if grace period already ended */
+	if (nn->grace_ended)
+		return;
+
+	dprintk("NFSD: end of grace period\n");
+	nn->grace_ended = true;
+	/*
+	 * If the server goes down again right now, an NFSv4
+	 * client will still be allowed to reclaim after it comes back up,
+	 * even if it hasn't yet had a chance to reclaim state this time.
+	 *
+	 */
+	nfsd4_record_grace_done(nn);
+	/*
+	 * At this point, NFSv4 clients can still reclaim.  But if the
+	 * server crashes, any that have not yet reclaimed will be out
+	 * of luck on the next boot.
+	 *
+	 * (NFSv4.1+ clients are considered to have reclaimed once they
+	 * call RECLAIM_COMPLETE.  NFSv4.0 clients are considered to
+	 * have reclaimed after their first OPEN.)
+	 */
+	locks_end_grace(&nn->nfsd4_manager);
+	/*
+	 * At this point, and once lockd and/or any other containers
+	 * exit their grace period, further reclaims will fail and
+	 * regular locking can resume.
+	 */
+}
+
+static time_t
+nfs4_laundromat(struct nfsd_net *nn)
+{
+	struct nfs4_client *clp;
+	struct nfs4_openowner *oo;
+	struct nfs4_delegation *dp;
+	struct nfs4_ol_stateid *stp;
+	struct list_head *pos, *next, reaplist;
+	time_t cutoff = get_seconds() - nn->nfsd4_lease;
+	time_t t, new_timeo = nn->nfsd4_lease;
+
+	dprintk("NFSD: laundromat service - starting\n");
+	nfsd4_end_grace(nn);
+	INIT_LIST_HEAD(&reaplist);
+	spin_lock(&nn->client_lock);
+	list_for_each_safe(pos, next, &nn->client_lru) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
+		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
+			t = clp->cl_time - cutoff;
+			new_timeo = min(new_timeo, t);
+			break;
+		}
+		if (mark_client_expired_locked(clp)) {
+			dprintk("NFSD: client in use (clientid %08x)\n",
+				clp->cl_clientid.cl_id);
+			continue;
+		}
+		list_add(&clp->cl_lru, &reaplist);
+	}
+	spin_unlock(&nn->client_lock);
+	list_for_each_safe(pos, next, &reaplist) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
+		dprintk("NFSD: purging unused client (clientid %08x)\n",
+			clp->cl_clientid.cl_id);
+		list_del_init(&clp->cl_lru);
+		expire_client(clp);
+	}
+	spin_lock(&state_lock);
+	list_for_each_safe(pos, next, &nn->del_recall_lru) {
+		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+		if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
+			continue;
+		if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
+			t = dp->dl_time - cutoff;
+			new_timeo = min(new_timeo, t);
+			break;
+		}
+		unhash_delegation_locked(dp);
+		list_add(&dp->dl_recall_lru, &reaplist);
+	}
+	spin_unlock(&state_lock);
+	while (!list_empty(&reaplist)) {
+		dp = list_first_entry(&reaplist, struct nfs4_delegation,
+					dl_recall_lru);
+		list_del_init(&dp->dl_recall_lru);
+		revoke_delegation(dp);
+	}
+
+	spin_lock(&nn->client_lock);
+	while (!list_empty(&nn->close_lru)) {
+		oo = list_first_entry(&nn->close_lru, struct nfs4_openowner,
+					oo_close_lru);
+		if (time_after((unsigned long)oo->oo_time,
+			       (unsigned long)cutoff)) {
+			t = oo->oo_time - cutoff;
+			new_timeo = min(new_timeo, t);
+			break;
+		}
+		list_del_init(&oo->oo_close_lru);
+		stp = oo->oo_last_closed_stid;
+		oo->oo_last_closed_stid = NULL;
+		spin_unlock(&nn->client_lock);
+		nfs4_put_stid(&stp->st_stid);
+		spin_lock(&nn->client_lock);
+	}
+	spin_unlock(&nn->client_lock);
+
+	new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
+	return new_timeo;
+}
+
+static struct workqueue_struct *laundry_wq;
+static void laundromat_main(struct work_struct *);
+
+static void
+laundromat_main(struct work_struct *laundry)
+{
+	time_t t;
+	struct delayed_work *dwork = container_of(laundry, struct delayed_work,
+						  work);
+	struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
+					   laundromat_work);
+
+	t = nfs4_laundromat(nn);
+	dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t);
+	queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
+}
+
+static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
+{
+	if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
+		return nfserr_bad_stateid;
+	return nfs_ok;
+}
+
+static inline int
+access_permit_read(struct nfs4_ol_stateid *stp)
+{
+	return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
+		test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
+		test_access(NFS4_SHARE_ACCESS_WRITE, stp);
+}
+
+static inline int
+access_permit_write(struct nfs4_ol_stateid *stp)
+{
+	return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
+		test_access(NFS4_SHARE_ACCESS_BOTH, stp);
+}
+
+static
+__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
+{
+        __be32 status = nfserr_openmode;
+
+	/* For lock stateid's, we test the parent open, not the lock: */
+	if (stp->st_openstp)
+		stp = stp->st_openstp;
+	if ((flags & WR_STATE) && !access_permit_write(stp))
+                goto out;
+	if ((flags & RD_STATE) && !access_permit_read(stp))
+                goto out;
+	status = nfs_ok;
+out:
+	return status;
+}
+
+static inline __be32
+check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, int flags)
+{
+	if (ONE_STATEID(stateid) && (flags & RD_STATE))
+		return nfs_ok;
+	else if (locks_in_grace(net)) {
+		/* Answer in remaining cases depends on existence of
+		 * conflicting state; so we must wait out the grace period. */
+		return nfserr_grace;
+	} else if (flags & WR_STATE)
+		return nfs4_share_conflict(current_fh,
+				NFS4_SHARE_DENY_WRITE);
+	else /* (flags & RD_STATE) && ZERO_STATEID(stateid) */
+		return nfs4_share_conflict(current_fh,
+				NFS4_SHARE_DENY_READ);
+}
+
+/*
+ * Allow READ/WRITE during grace period on recovered state only for files
+ * that are not able to provide mandatory locking.
+ */
+static inline int
+grace_disallows_io(struct net *net, struct inode *inode)
+{
+	return locks_in_grace(net) && mandatory_lock(inode);
+}
+
+/* Returns true iff a is later than b: */
+static bool stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+	return (s32)(a->si_generation - b->si_generation) > 0;
+}
+
+static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
+{
+	/*
+	 * When sessions are used the stateid generation number is ignored
+	 * when it is zero.
+	 */
+	if (has_session && in->si_generation == 0)
+		return nfs_ok;
+
+	if (in->si_generation == ref->si_generation)
+		return nfs_ok;
+
+	/* If the client sends us a stateid from the future, it's buggy: */
+	if (stateid_generation_after(in, ref))
+		return nfserr_bad_stateid;
+	/*
+	 * However, we could see a stateid from the past, even from a
+	 * non-buggy client.  For example, if the client sends a lock
+	 * while some IO is outstanding, the lock may bump si_generation
+	 * while the IO is still in flight.  The client could avoid that
+	 * situation by waiting for responses on all the IO requests,
+	 * but better performance may result in retrying IO that
+	 * receives an old_stateid error if requests are rarely
+	 * reordered in flight:
+	 */
+	return nfserr_old_stateid;
+}
+
+static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols)
+{
+	if (ols->st_stateowner->so_is_open_owner &&
+	    !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+		return nfserr_bad_stateid;
+	return nfs_ok;
+}
+
+static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
+{
+	struct nfs4_stid *s;
+	__be32 status = nfserr_bad_stateid;
+
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		return status;
+	/* Client debugging aid. */
+	if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) {
+		char addr_str[INET6_ADDRSTRLEN];
+		rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str,
+				 sizeof(addr_str));
+		pr_warn_ratelimited("NFSD: client %s testing state ID "
+					"with incorrect client ID\n", addr_str);
+		return status;
+	}
+	spin_lock(&cl->cl_lock);
+	s = find_stateid_locked(cl, stateid);
+	if (!s)
+		goto out_unlock;
+	status = check_stateid_generation(stateid, &s->sc_stateid, 1);
+	if (status)
+		goto out_unlock;
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
+		status = nfs_ok;
+		break;
+	case NFS4_REVOKED_DELEG_STID:
+		status = nfserr_deleg_revoked;
+		break;
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		status = nfsd4_check_openowner_confirmed(openlockstateid(s));
+		break;
+	default:
+		printk("unknown stateid type %x\n", s->sc_type);
+		/* Fallthrough */
+	case NFS4_CLOSED_STID:
+	case NFS4_CLOSED_DELEG_STID:
+		status = nfserr_bad_stateid;
+	}
+out_unlock:
+	spin_unlock(&cl->cl_lock);
+	return status;
+}
+
+__be32
+nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+		     stateid_t *stateid, unsigned char typemask,
+		     struct nfs4_stid **s, struct nfsd_net *nn)
+{
+	__be32 status;
+
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		return nfserr_bad_stateid;
+	status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn);
+	if (status == nfserr_stale_clientid) {
+		if (cstate->session)
+			return nfserr_bad_stateid;
+		return nfserr_stale_stateid;
+	}
+	if (status)
+		return status;
+	*s = find_stateid_by_type(cstate->clp, stateid, typemask);
+	if (!*s)
+		return nfserr_bad_stateid;
+	return nfs_ok;
+}
+
+/*
+* Checks for stateid operations
+*/
+__be32
+nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
+			   stateid_t *stateid, int flags, struct file **filpp)
+{
+	struct nfs4_stid *s;
+	struct nfs4_ol_stateid *stp = NULL;
+	struct nfs4_delegation *dp = NULL;
+	struct svc_fh *current_fh = &cstate->current_fh;
+	struct inode *ino = d_inode(current_fh->fh_dentry);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct file *file = NULL;
+	__be32 status;
+
+	if (filpp)
+		*filpp = NULL;
+
+	if (grace_disallows_io(net, ino))
+		return nfserr_grace;
+
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		return check_special_stateids(net, current_fh, stateid, flags);
+
+	status = nfsd4_lookup_stateid(cstate, stateid,
+				NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
+				&s, nn);
+	if (status)
+		return status;
+	status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
+	if (status)
+		goto out;
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
+		dp = delegstateid(s);
+		status = nfs4_check_delegmode(dp, flags);
+		if (status)
+			goto out;
+		if (filpp) {
+			file = dp->dl_stid.sc_file->fi_deleg_file;
+			if (!file) {
+				WARN_ON_ONCE(1);
+				status = nfserr_serverfault;
+				goto out;
+			}
+			get_file(file);
+		}
+		break;
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		stp = openlockstateid(s);
+		status = nfs4_check_fh(current_fh, stp);
+		if (status)
+			goto out;
+		status = nfsd4_check_openowner_confirmed(stp);
+		if (status)
+			goto out;
+		status = nfs4_check_openmode(stp, flags);
+		if (status)
+			goto out;
+		if (filpp) {
+			struct nfs4_file *fp = stp->st_stid.sc_file;
+
+			if (flags & RD_STATE)
+				file = find_readable_file(fp);
+			else
+				file = find_writeable_file(fp);
+		}
+		break;
+	default:
+		status = nfserr_bad_stateid;
+		goto out;
+	}
+	status = nfs_ok;
+	if (file)
+		*filpp = file;
+out:
+	nfs4_put_stid(s);
+	return status;
+}
+
+/*
+ * Test if the stateid is valid
+ */
+__be32
+nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		   struct nfsd4_test_stateid *test_stateid)
+{
+	struct nfsd4_test_stateid_id *stateid;
+	struct nfs4_client *cl = cstate->session->se_client;
+
+	list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list)
+		stateid->ts_id_status =
+			nfsd4_validate_stateid(cl, &stateid->ts_id_stateid);
+
+	return nfs_ok;
+}
+
+__be32
+nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		   struct nfsd4_free_stateid *free_stateid)
+{
+	stateid_t *stateid = &free_stateid->fr_stateid;
+	struct nfs4_stid *s;
+	struct nfs4_delegation *dp;
+	struct nfs4_ol_stateid *stp;
+	struct nfs4_client *cl = cstate->session->se_client;
+	__be32 ret = nfserr_bad_stateid;
+
+	spin_lock(&cl->cl_lock);
+	s = find_stateid_locked(cl, stateid);
+	if (!s)
+		goto out_unlock;
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
+		ret = nfserr_locks_held;
+		break;
+	case NFS4_OPEN_STID:
+		ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+		if (ret)
+			break;
+		ret = nfserr_locks_held;
+		break;
+	case NFS4_LOCK_STID:
+		ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+		if (ret)
+			break;
+		stp = openlockstateid(s);
+		ret = nfserr_locks_held;
+		if (check_for_locks(stp->st_stid.sc_file,
+				    lockowner(stp->st_stateowner)))
+			break;
+		unhash_lock_stateid(stp);
+		spin_unlock(&cl->cl_lock);
+		nfs4_put_stid(s);
+		ret = nfs_ok;
+		goto out;
+	case NFS4_REVOKED_DELEG_STID:
+		dp = delegstateid(s);
+		list_del_init(&dp->dl_recall_lru);
+		spin_unlock(&cl->cl_lock);
+		nfs4_put_stid(s);
+		ret = nfs_ok;
+		goto out;
+	/* Default falls through and returns nfserr_bad_stateid */
+	}
+out_unlock:
+	spin_unlock(&cl->cl_lock);
+out:
+	return ret;
+}
+
+static inline int
+setlkflg (int type)
+{
+	return (type == NFS4_READW_LT || type == NFS4_READ_LT) ?
+		RD_STATE : WR_STATE;
+}
+
+static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	struct nfs4_stateowner *sop = stp->st_stateowner;
+	__be32 status;
+
+	status = nfsd4_check_seqid(cstate, sop, seqid);
+	if (status)
+		return status;
+	if (stp->st_stid.sc_type == NFS4_CLOSED_STID
+		|| stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID)
+		/*
+		 * "Closed" stateid's exist *only* to return
+		 * nfserr_replay_me from the previous step, and
+		 * revoked delegations are kept only for free_stateid.
+		 */
+		return nfserr_bad_stateid;
+	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
+	if (status)
+		return status;
+	return nfs4_check_fh(current_fh, stp);
+}
+
+/* 
+ * Checks for sequence id mutating operations. 
+ */
+static __be32
+nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+			 stateid_t *stateid, char typemask,
+			 struct nfs4_ol_stateid **stpp,
+			 struct nfsd_net *nn)
+{
+	__be32 status;
+	struct nfs4_stid *s;
+	struct nfs4_ol_stateid *stp = NULL;
+
+	dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
+		seqid, STATEID_VAL(stateid));
+
+	*stpp = NULL;
+	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
+	if (status)
+		return status;
+	stp = openlockstateid(s);
+	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
+
+	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
+	if (!status)
+		*stpp = stp;
+	else
+		nfs4_put_stid(&stp->st_stid);
+	return status;
+}
+
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+						 stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn)
+{
+	__be32 status;
+	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *stp;
+
+	status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
+						NFS4_OPEN_STID, &stp, nn);
+	if (status)
+		return status;
+	oo = openowner(stp->st_stateowner);
+	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+		nfs4_put_stid(&stp->st_stid);
+		return nfserr_bad_stateid;
+	}
+	*stpp = stp;
+	return nfs_ok;
+}
+
+__be32
+nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		   struct nfsd4_open_confirm *oc)
+{
+	__be32 status;
+	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *stp;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	dprintk("NFSD: nfsd4_open_confirm on file %pd\n",
+			cstate->current_fh.fh_dentry);
+
+	status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0);
+	if (status)
+		return status;
+
+	status = nfs4_preprocess_seqid_op(cstate,
+					oc->oc_seqid, &oc->oc_req_stateid,
+					NFS4_OPEN_STID, &stp, nn);
+	if (status)
+		goto out;
+	oo = openowner(stp->st_stateowner);
+	status = nfserr_bad_stateid;
+	if (oo->oo_flags & NFS4_OO_CONFIRMED)
+		goto put_stateid;
+	oo->oo_flags |= NFS4_OO_CONFIRMED;
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
+		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
+
+	nfsd4_client_record_create(oo->oo_owner.so_client);
+	status = nfs_ok;
+put_stateid:
+	nfs4_put_stid(&stp->st_stid);
+out:
+	nfsd4_bump_seqid(cstate, status);
+	return status;
+}
+
+static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
+{
+	if (!test_access(access, stp))
+		return;
+	nfs4_file_put_access(stp->st_stid.sc_file, access);
+	clear_access(access, stp);
+}
+
+static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
+{
+	switch (to_access) {
+	case NFS4_SHARE_ACCESS_READ:
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE);
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+		break;
+	case NFS4_SHARE_ACCESS_WRITE:
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ);
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+		break;
+	case NFS4_SHARE_ACCESS_BOTH:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+__be32
+nfsd4_open_downgrade(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_open_downgrade *od)
+{
+	__be32 status;
+	struct nfs4_ol_stateid *stp;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	dprintk("NFSD: nfsd4_open_downgrade on file %pd\n", 
+			cstate->current_fh.fh_dentry);
+
+	/* We don't yet support WANT bits: */
+	if (od->od_deleg_want)
+		dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__,
+			od->od_deleg_want);
+
+	status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
+					&od->od_stateid, &stp, nn);
+	if (status)
+		goto out; 
+	status = nfserr_inval;
+	if (!test_access(od->od_share_access, stp)) {
+		dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n",
+			stp->st_access_bmap, od->od_share_access);
+		goto put_stateid;
+	}
+	if (!test_deny(od->od_share_deny, stp)) {
+		dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n",
+			stp->st_deny_bmap, od->od_share_deny);
+		goto put_stateid;
+	}
+	nfs4_stateid_downgrade(stp, od->od_share_access);
+
+	reset_union_bmap_deny(od->od_share_deny, stp);
+
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	status = nfs_ok;
+put_stateid:
+	nfs4_put_stid(&stp->st_stid);
+out:
+	nfsd4_bump_seqid(cstate, status);
+	return status;
+}
+
+static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+{
+	struct nfs4_client *clp = s->st_stid.sc_client;
+	LIST_HEAD(reaplist);
+
+	s->st_stid.sc_type = NFS4_CLOSED_STID;
+	spin_lock(&clp->cl_lock);
+	unhash_open_stateid(s, &reaplist);
+
+	if (clp->cl_minorversion) {
+		put_ol_stateid_locked(s, &reaplist);
+		spin_unlock(&clp->cl_lock);
+		free_ol_stateid_reaplist(&reaplist);
+	} else {
+		spin_unlock(&clp->cl_lock);
+		free_ol_stateid_reaplist(&reaplist);
+		move_to_close_lru(s, clp->net);
+	}
+}
+
+/*
+ * nfs4_unlock_state() called after encode
+ */
+__be32
+nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    struct nfsd4_close *close)
+{
+	__be32 status;
+	struct nfs4_ol_stateid *stp;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	dprintk("NFSD: nfsd4_close on file %pd\n", 
+			cstate->current_fh.fh_dentry);
+
+	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
+					&close->cl_stateid,
+					NFS4_OPEN_STID|NFS4_CLOSED_STID,
+					&stp, nn);
+	nfsd4_bump_seqid(cstate, status);
+	if (status)
+		goto out; 
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+
+	nfsd4_close_open_stateid(stp);
+
+	/* put reference from nfs4_preprocess_seqid_op */
+	nfs4_put_stid(&stp->st_stid);
+out:
+	return status;
+}
+
+__be32
+nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		  struct nfsd4_delegreturn *dr)
+{
+	struct nfs4_delegation *dp;
+	stateid_t *stateid = &dr->dr_stateid;
+	struct nfs4_stid *s;
+	__be32 status;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
+		return status;
+
+	status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn);
+	if (status)
+		goto out;
+	dp = delegstateid(s);
+	status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
+	if (status)
+		goto put_stateid;
+
+	destroy_delegation(dp);
+put_stateid:
+	nfs4_put_stid(&dp->dl_stid);
+out:
+	return status;
+}
+
+
+#define LOFF_OVERFLOW(start, len)      ((u64)(len) > ~(u64)(start))
+
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	end = start + len;
+	return end >= start ? end: NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	WARN_ON_ONCE(!len);
+	end = start + len;
+	return end > start ? end - 1: NFS4_MAX_UINT64;
+}
+
+/*
+ * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
+ * we can't properly handle lock requests that go beyond the (2^63 - 1)-th
+ * byte, because of sign extension problems.  Since NFSv4 calls for 64-bit
+ * locking, this prevents us from being completely protocol-compliant.  The
+ * real solution to this problem is to start using unsigned file offsets in
+ * the VFS, but this is a very deep change!
+ */
+static inline void
+nfs4_transform_lock_offset(struct file_lock *lock)
+{
+	if (lock->fl_start < 0)
+		lock->fl_start = OFFSET_MAX;
+	if (lock->fl_end < 0)
+		lock->fl_end = OFFSET_MAX;
+}
+
+static fl_owner_t
+nfsd4_fl_get_owner(fl_owner_t owner)
+{
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
+
+	nfs4_get_stateowner(&lo->lo_owner);
+	return owner;
+}
+
+static void
+nfsd4_fl_put_owner(fl_owner_t owner)
+{
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
+
+	if (lo)
+		nfs4_put_stateowner(&lo->lo_owner);
+}
+
+static const struct lock_manager_operations nfsd_posix_mng_ops  = {
+	.lm_get_owner = nfsd4_fl_get_owner,
+	.lm_put_owner = nfsd4_fl_put_owner,
+};
+
+static inline void
+nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
+{
+	struct nfs4_lockowner *lo;
+
+	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
+		lo = (struct nfs4_lockowner *) fl->fl_owner;
+		deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data,
+					lo->lo_owner.so_owner.len, GFP_KERNEL);
+		if (!deny->ld_owner.data)
+			/* We just don't care that much */
+			goto nevermind;
+		deny->ld_owner.len = lo->lo_owner.so_owner.len;
+		deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
+	} else {
+nevermind:
+		deny->ld_owner.len = 0;
+		deny->ld_owner.data = NULL;
+		deny->ld_clientid.cl_boot = 0;
+		deny->ld_clientid.cl_id = 0;
+	}
+	deny->ld_start = fl->fl_start;
+	deny->ld_length = NFS4_MAX_UINT64;
+	if (fl->fl_end != NFS4_MAX_UINT64)
+		deny->ld_length = fl->fl_end - fl->fl_start + 1;        
+	deny->ld_type = NFS4_READ_LT;
+	if (fl->fl_type != F_RDLCK)
+		deny->ld_type = NFS4_WRITE_LT;
+}
+
+static struct nfs4_lockowner *
+find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner,
+		struct nfs4_client *clp)
+{
+	unsigned int strhashval = ownerstr_hashval(owner);
+	struct nfs4_stateowner *so;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval],
+			    so_strhash) {
+		if (so->so_is_open_owner)
+			continue;
+		if (same_owner_str(so, owner))
+			return lockowner(nfs4_get_stateowner(so));
+	}
+	return NULL;
+}
+
+static struct nfs4_lockowner *
+find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner,
+		struct nfs4_client *clp)
+{
+	struct nfs4_lockowner *lo;
+
+	spin_lock(&clp->cl_lock);
+	lo = find_lockowner_str_locked(clid, owner, clp);
+	spin_unlock(&clp->cl_lock);
+	return lo;
+}
+
+static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop)
+{
+	unhash_lockowner_locked(lockowner(sop));
+}
+
+static void nfs4_free_lockowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_lockowner *lo = lockowner(sop);
+
+	kmem_cache_free(lockowner_slab, lo);
+}
+
+static const struct nfs4_stateowner_operations lockowner_ops = {
+	.so_unhash =	nfs4_unhash_lockowner,
+	.so_free =	nfs4_free_lockowner,
+};
+
+/*
+ * Alloc a lock owner structure.
+ * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 
+ * occurred. 
+ *
+ * strhashval = ownerstr_hashval
+ */
+static struct nfs4_lockowner *
+alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
+			   struct nfs4_ol_stateid *open_stp,
+			   struct nfsd4_lock *lock)
+{
+	struct nfs4_lockowner *lo, *ret;
+
+	lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
+	if (!lo)
+		return NULL;
+	INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
+	lo->lo_owner.so_is_open_owner = 0;
+	lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
+	lo->lo_owner.so_ops = &lockowner_ops;
+	spin_lock(&clp->cl_lock);
+	ret = find_lockowner_str_locked(&clp->cl_clientid,
+			&lock->lk_new_owner, clp);
+	if (ret == NULL) {
+		list_add(&lo->lo_owner.so_strhash,
+			 &clp->cl_ownerstr_hashtbl[strhashval]);
+		ret = lo;
+	} else
+		nfs4_free_lockowner(&lo->lo_owner);
+	spin_unlock(&clp->cl_lock);
+	return ret;
+}
+
+static void
+init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
+		  struct nfs4_file *fp, struct inode *inode,
+		  struct nfs4_ol_stateid *open_stp)
+{
+	struct nfs4_client *clp = lo->lo_owner.so_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	atomic_inc(&stp->st_stid.sc_count);
+	stp->st_stid.sc_type = NFS4_LOCK_STID;
+	stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
+	get_nfs4_file(fp);
+	stp->st_stid.sc_file = fp;
+	stp->st_stid.sc_free = nfs4_free_lock_stateid;
+	stp->st_access_bmap = 0;
+	stp->st_deny_bmap = open_stp->st_deny_bmap;
+	stp->st_openstp = open_stp;
+	list_add(&stp->st_locks, &open_stp->st_locks);
+	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+	spin_lock(&fp->fi_lock);
+	list_add(&stp->st_perfile, &fp->fi_stateids);
+	spin_unlock(&fp->fi_lock);
+}
+
+static struct nfs4_ol_stateid *
+find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp)
+{
+	struct nfs4_ol_stateid *lst;
+	struct nfs4_client *clp = lo->lo_owner.so_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
+		if (lst->st_stid.sc_file == fp) {
+			atomic_inc(&lst->st_stid.sc_count);
+			return lst;
+		}
+	}
+	return NULL;
+}
+
+static struct nfs4_ol_stateid *
+find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
+			    struct inode *inode, struct nfs4_ol_stateid *ost,
+			    bool *new)
+{
+	struct nfs4_stid *ns = NULL;
+	struct nfs4_ol_stateid *lst;
+	struct nfs4_openowner *oo = openowner(ost->st_stateowner);
+	struct nfs4_client *clp = oo->oo_owner.so_client;
+
+	spin_lock(&clp->cl_lock);
+	lst = find_lock_stateid(lo, fi);
+	if (lst == NULL) {
+		spin_unlock(&clp->cl_lock);
+		ns = nfs4_alloc_stid(clp, stateid_slab);
+		if (ns == NULL)
+			return NULL;
+
+		spin_lock(&clp->cl_lock);
+		lst = find_lock_stateid(lo, fi);
+		if (likely(!lst)) {
+			lst = openlockstateid(ns);
+			init_lock_stateid(lst, lo, fi, inode, ost);
+			ns = NULL;
+			*new = true;
+		}
+	}
+	spin_unlock(&clp->cl_lock);
+	if (ns)
+		nfs4_put_stid(ns);
+	return lst;
+}
+
+static int
+check_lock_length(u64 offset, u64 length)
+{
+	return ((length == 0)  || ((length != NFS4_MAX_UINT64) &&
+	     LOFF_OVERFLOW(offset, length)));
+}
+
+static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
+{
+	struct nfs4_file *fp = lock_stp->st_stid.sc_file;
+
+	lockdep_assert_held(&fp->fi_lock);
+
+	if (test_access(access, lock_stp))
+		return;
+	__nfs4_file_get_access(fp, access);
+	set_access(access, lock_stp);
+}
+
+static __be32
+lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
+			    struct nfs4_ol_stateid *ost,
+			    struct nfsd4_lock *lock,
+			    struct nfs4_ol_stateid **lst, bool *new)
+{
+	__be32 status;
+	struct nfs4_file *fi = ost->st_stid.sc_file;
+	struct nfs4_openowner *oo = openowner(ost->st_stateowner);
+	struct nfs4_client *cl = oo->oo_owner.so_client;
+	struct inode *inode = d_inode(cstate->current_fh.fh_dentry);
+	struct nfs4_lockowner *lo;
+	unsigned int strhashval;
+
+	lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl);
+	if (!lo) {
+		strhashval = ownerstr_hashval(&lock->v.new.owner);
+		lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
+		if (lo == NULL)
+			return nfserr_jukebox;
+	} else {
+		/* with an existing lockowner, seqids must be the same */
+		status = nfserr_bad_seqid;
+		if (!cstate->minorversion &&
+		    lock->lk_new_lock_seqid != lo->lo_owner.so_seqid)
+			goto out;
+	}
+
+	*lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
+	if (*lst == NULL) {
+		status = nfserr_jukebox;
+		goto out;
+	}
+	status = nfs_ok;
+out:
+	nfs4_put_stateowner(&lo->lo_owner);
+	return status;
+}
+
+/*
+ *  LOCK operation 
+ */
+__be32
+nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	   struct nfsd4_lock *lock)
+{
+	struct nfs4_openowner *open_sop = NULL;
+	struct nfs4_lockowner *lock_sop = NULL;
+	struct nfs4_ol_stateid *lock_stp = NULL;
+	struct nfs4_ol_stateid *open_stp = NULL;
+	struct nfs4_file *fp;
+	struct file *filp = NULL;
+	struct file_lock *file_lock = NULL;
+	struct file_lock *conflock = NULL;
+	__be32 status = 0;
+	int lkflg;
+	int err;
+	bool new = false;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
+		(long long) lock->lk_offset,
+		(long long) lock->lk_length);
+
+	if (check_lock_length(lock->lk_offset, lock->lk_length))
+		 return nfserr_inval;
+
+	if ((status = fh_verify(rqstp, &cstate->current_fh,
+				S_IFREG, NFSD_MAY_LOCK))) {
+		dprintk("NFSD: nfsd4_lock: permission denied!\n");
+		return status;
+	}
+
+	if (lock->lk_is_new) {
+		if (nfsd4_has_session(cstate))
+			/* See rfc 5661 18.10.3: given clientid is ignored: */
+			memcpy(&lock->v.new.clientid,
+				&cstate->session->se_client->cl_clientid,
+				sizeof(clientid_t));
+
+		status = nfserr_stale_clientid;
+		if (STALE_CLIENTID(&lock->lk_new_clientid, nn))
+			goto out;
+
+		/* validate and update open stateid and open seqid */
+		status = nfs4_preprocess_confirmed_seqid_op(cstate,
+				        lock->lk_new_open_seqid,
+		                        &lock->lk_new_open_stateid,
+					&open_stp, nn);
+		if (status)
+			goto out;
+		open_sop = openowner(open_stp->st_stateowner);
+		status = nfserr_bad_stateid;
+		if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+						&lock->v.new.clientid))
+			goto out;
+		status = lookup_or_create_lock_state(cstate, open_stp, lock,
+							&lock_stp, &new);
+	} else {
+		status = nfs4_preprocess_seqid_op(cstate,
+				       lock->lk_old_lock_seqid,
+				       &lock->lk_old_lock_stateid,
+				       NFS4_LOCK_STID, &lock_stp, nn);
+	}
+	if (status)
+		goto out;
+	lock_sop = lockowner(lock_stp->st_stateowner);
+
+	lkflg = setlkflg(lock->lk_type);
+	status = nfs4_check_openmode(lock_stp, lkflg);
+	if (status)
+		goto out;
+
+	status = nfserr_grace;
+	if (locks_in_grace(net) && !lock->lk_reclaim)
+		goto out;
+	status = nfserr_no_grace;
+	if (!locks_in_grace(net) && lock->lk_reclaim)
+		goto out;
+
+	file_lock = locks_alloc_lock();
+	if (!file_lock) {
+		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
+		status = nfserr_jukebox;
+		goto out;
+	}
+
+	fp = lock_stp->st_stid.sc_file;
+	switch (lock->lk_type) {
+		case NFS4_READ_LT:
+		case NFS4_READW_LT:
+			spin_lock(&fp->fi_lock);
+			filp = find_readable_file_locked(fp);
+			if (filp)
+				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
+			spin_unlock(&fp->fi_lock);
+			file_lock->fl_type = F_RDLCK;
+			break;
+		case NFS4_WRITE_LT:
+		case NFS4_WRITEW_LT:
+			spin_lock(&fp->fi_lock);
+			filp = find_writeable_file_locked(fp);
+			if (filp)
+				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
+			spin_unlock(&fp->fi_lock);
+			file_lock->fl_type = F_WRLCK;
+			break;
+		default:
+			status = nfserr_inval;
+		goto out;
+	}
+	if (!filp) {
+		status = nfserr_openmode;
+		goto out;
+	}
+
+	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
+	file_lock->fl_pid = current->tgid;
+	file_lock->fl_file = filp;
+	file_lock->fl_flags = FL_POSIX;
+	file_lock->fl_lmops = &nfsd_posix_mng_ops;
+	file_lock->fl_start = lock->lk_offset;
+	file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
+	nfs4_transform_lock_offset(file_lock);
+
+	conflock = locks_alloc_lock();
+	if (!conflock) {
+		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
+		status = nfserr_jukebox;
+		goto out;
+	}
+
+	err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
+	switch (-err) {
+	case 0: /* success! */
+		update_stateid(&lock_stp->st_stid.sc_stateid);
+		memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, 
+				sizeof(stateid_t));
+		status = 0;
+		break;
+	case (EAGAIN):		/* conflock holds conflicting lock */
+		status = nfserr_denied;
+		dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
+		nfs4_set_lock_denied(conflock, &lock->lk_denied);
+		break;
+	case (EDEADLK):
+		status = nfserr_deadlock;
+		break;
+	default:
+		dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err);
+		status = nfserrno(err);
+		break;
+	}
+out:
+	if (filp)
+		fput(filp);
+	if (lock_stp) {
+		/* Bump seqid manually if the 4.0 replay owner is openowner */
+		if (cstate->replay_owner &&
+		    cstate->replay_owner != &lock_sop->lo_owner &&
+		    seqid_mutating_err(ntohl(status)))
+			lock_sop->lo_owner.so_seqid++;
+
+		/*
+		 * If this is a new, never-before-used stateid, and we are
+		 * returning an error, then just go ahead and release it.
+		 */
+		if (status && new)
+			release_lock_stateid(lock_stp);
+
+		nfs4_put_stid(&lock_stp->st_stid);
+	}
+	if (open_stp)
+		nfs4_put_stid(&open_stp->st_stid);
+	nfsd4_bump_seqid(cstate, status);
+	if (file_lock)
+		locks_free_lock(file_lock);
+	if (conflock)
+		locks_free_lock(conflock);
+	return status;
+}
+
+/*
+ * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
+ * so we do a temporary open here just to get an open file to pass to
+ * vfs_test_lock.  (Arguably perhaps test_lock should be done with an
+ * inode operation.)
+ */
+static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
+{
+	struct file *file;
+	__be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	if (!err) {
+		err = nfserrno(vfs_test_lock(file, lock));
+		nfsd_close(file);
+	}
+	return err;
+}
+
+/*
+ * LOCKT operation
+ */
+__be32
+nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    struct nfsd4_lockt *lockt)
+{
+	struct file_lock *file_lock = NULL;
+	struct nfs4_lockowner *lo = NULL;
+	__be32 status;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	if (locks_in_grace(SVC_NET(rqstp)))
+		return nfserr_grace;
+
+	if (check_lock_length(lockt->lt_offset, lockt->lt_length))
+		 return nfserr_inval;
+
+	if (!nfsd4_has_session(cstate)) {
+		status = lookup_clientid(&lockt->lt_clientid, cstate, nn);
+		if (status)
+			goto out;
+	}
+
+	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
+		goto out;
+
+	file_lock = locks_alloc_lock();
+	if (!file_lock) {
+		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
+		status = nfserr_jukebox;
+		goto out;
+	}
+
+	switch (lockt->lt_type) {
+		case NFS4_READ_LT:
+		case NFS4_READW_LT:
+			file_lock->fl_type = F_RDLCK;
+		break;
+		case NFS4_WRITE_LT:
+		case NFS4_WRITEW_LT:
+			file_lock->fl_type = F_WRLCK;
+		break;
+		default:
+			dprintk("NFSD: nfs4_lockt: bad lock type!\n");
+			status = nfserr_inval;
+		goto out;
+	}
+
+	lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner,
+				cstate->clp);
+	if (lo)
+		file_lock->fl_owner = (fl_owner_t)lo;
+	file_lock->fl_pid = current->tgid;
+	file_lock->fl_flags = FL_POSIX;
+
+	file_lock->fl_start = lockt->lt_offset;
+	file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
+
+	nfs4_transform_lock_offset(file_lock);
+
+	status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock);
+	if (status)
+		goto out;
+
+	if (file_lock->fl_type != F_UNLCK) {
+		status = nfserr_denied;
+		nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
+	}
+out:
+	if (lo)
+		nfs4_put_stateowner(&lo->lo_owner);
+	if (file_lock)
+		locks_free_lock(file_lock);
+	return status;
+}
+
+__be32
+nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    struct nfsd4_locku *locku)
+{
+	struct nfs4_ol_stateid *stp;
+	struct file *filp = NULL;
+	struct file_lock *file_lock = NULL;
+	__be32 status;
+	int err;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
+		(long long) locku->lu_offset,
+		(long long) locku->lu_length);
+
+	if (check_lock_length(locku->lu_offset, locku->lu_length))
+		 return nfserr_inval;
+
+	status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
+					&locku->lu_stateid, NFS4_LOCK_STID,
+					&stp, nn);
+	if (status)
+		goto out;
+	filp = find_any_file(stp->st_stid.sc_file);
+	if (!filp) {
+		status = nfserr_lock_range;
+		goto put_stateid;
+	}
+	file_lock = locks_alloc_lock();
+	if (!file_lock) {
+		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
+		status = nfserr_jukebox;
+		goto fput;
+	}
+
+	file_lock->fl_type = F_UNLCK;
+	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
+	file_lock->fl_pid = current->tgid;
+	file_lock->fl_file = filp;
+	file_lock->fl_flags = FL_POSIX;
+	file_lock->fl_lmops = &nfsd_posix_mng_ops;
+	file_lock->fl_start = locku->lu_offset;
+
+	file_lock->fl_end = last_byte_offset(locku->lu_offset,
+						locku->lu_length);
+	nfs4_transform_lock_offset(file_lock);
+
+	err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
+	if (err) {
+		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
+		goto out_nfserr;
+	}
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+fput:
+	fput(filp);
+put_stateid:
+	nfs4_put_stid(&stp->st_stid);
+out:
+	nfsd4_bump_seqid(cstate, status);
+	if (file_lock)
+		locks_free_lock(file_lock);
+	return status;
+
+out_nfserr:
+	status = nfserrno(err);
+	goto fput;
+}
+
+/*
+ * returns
+ * 	true:  locks held by lockowner
+ * 	false: no locks held by lockowner
+ */
+static bool
+check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
+{
+	struct file_lock *fl;
+	int status = false;
+	struct file *filp = find_any_file(fp);
+	struct inode *inode;
+	struct file_lock_context *flctx;
+
+	if (!filp) {
+		/* Any valid lock stateid should have some sort of access */
+		WARN_ON_ONCE(1);
+		return status;
+	}
+
+	inode = file_inode(filp);
+	flctx = inode->i_flctx;
+
+	if (flctx && !list_empty_careful(&flctx->flc_posix)) {
+		spin_lock(&flctx->flc_lock);
+		list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+			if (fl->fl_owner == (fl_owner_t)lowner) {
+				status = true;
+				break;
+			}
+		}
+		spin_unlock(&flctx->flc_lock);
+	}
+	fput(filp);
+	return status;
+}
+
+__be32
+nfsd4_release_lockowner(struct svc_rqst *rqstp,
+			struct nfsd4_compound_state *cstate,
+			struct nfsd4_release_lockowner *rlockowner)
+{
+	clientid_t *clid = &rlockowner->rl_clientid;
+	struct nfs4_stateowner *sop;
+	struct nfs4_lockowner *lo = NULL;
+	struct nfs4_ol_stateid *stp;
+	struct xdr_netobj *owner = &rlockowner->rl_owner;
+	unsigned int hashval = ownerstr_hashval(owner);
+	__be32 status;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct nfs4_client *clp;
+
+	dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
+		clid->cl_boot, clid->cl_id);
+
+	status = lookup_clientid(clid, cstate, nn);
+	if (status)
+		return status;
+
+	clp = cstate->clp;
+	/* Find the matching lock stateowner */
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
+			    so_strhash) {
+
+		if (sop->so_is_open_owner || !same_owner_str(sop, owner))
+			continue;
+
+		/* see if there are still any locks associated with it */
+		lo = lockowner(sop);
+		list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) {
+			if (check_for_locks(stp->st_stid.sc_file, lo)) {
+				status = nfserr_locks_held;
+				spin_unlock(&clp->cl_lock);
+				return status;
+			}
+		}
+
+		nfs4_get_stateowner(sop);
+		break;
+	}
+	spin_unlock(&clp->cl_lock);
+	if (lo)
+		release_lockowner(lo);
+	return status;
+}
+
+static inline struct nfs4_client_reclaim *
+alloc_reclaim(void)
+{
+	return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
+}
+
+bool
+nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
+{
+	struct nfs4_client_reclaim *crp;
+
+	crp = nfsd4_find_reclaim_client(name, nn);
+	return (crp && crp->cr_clp);
+}
+
+/*
+ * failure => all reset bets are off, nfserr_no_grace...
+ */
+struct nfs4_client_reclaim *
+nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)
+{
+	unsigned int strhashval;
+	struct nfs4_client_reclaim *crp;
+
+	dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
+	crp = alloc_reclaim();
+	if (crp) {
+		strhashval = clientstr_hashval(name);
+		INIT_LIST_HEAD(&crp->cr_strhash);
+		list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
+		memcpy(crp->cr_recdir, name, HEXDIR_LEN);
+		crp->cr_clp = NULL;
+		nn->reclaim_str_hashtbl_size++;
+	}
+	return crp;
+}
+
+void
+nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
+{
+	list_del(&crp->cr_strhash);
+	kfree(crp);
+	nn->reclaim_str_hashtbl_size--;
+}
+
+void
+nfs4_release_reclaim(struct nfsd_net *nn)
+{
+	struct nfs4_client_reclaim *crp = NULL;
+	int i;
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+		while (!list_empty(&nn->reclaim_str_hashtbl[i])) {
+			crp = list_entry(nn->reclaim_str_hashtbl[i].next,
+			                struct nfs4_client_reclaim, cr_strhash);
+			nfs4_remove_reclaim_record(crp, nn);
+		}
+	}
+	WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);
+}
+
+/*
+ * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
+struct nfs4_client_reclaim *
+nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
+{
+	unsigned int strhashval;
+	struct nfs4_client_reclaim *crp = NULL;
+
+	dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir);
+
+	strhashval = clientstr_hashval(recdir);
+	list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
+		if (same_name(crp->cr_recdir, recdir)) {
+			return crp;
+		}
+	}
+	return NULL;
+}
+
+/*
+* Called from OPEN. Look for clientid in reclaim list.
+*/
+__be32
+nfs4_check_open_reclaim(clientid_t *clid,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd_net *nn)
+{
+	__be32 status;
+
+	/* find clientid in conf_id_hashtbl */
+	status = lookup_clientid(clid, cstate, nn);
+	if (status)
+		return nfserr_reclaim_bad;
+
+	if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
+		return nfserr_no_grace;
+
+	if (nfsd4_client_record_check(cstate->clp))
+		return nfserr_reclaim_bad;
+
+	return nfs_ok;
+}
+
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+static inline void
+put_client(struct nfs4_client *clp)
+{
+	atomic_dec(&clp->cl_refcount);
+}
+
+static struct nfs4_client *
+nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
+{
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+					  nfsd_net_id);
+
+	if (!nfsd_netns_ready(nn))
+		return NULL;
+
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
+			return clp;
+	}
+	return NULL;
+}
+
+u64
+nfsd_inject_print_clients(void)
+{
+	struct nfs4_client *clp;
+	u64 count = 0;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+					  nfsd_net_id);
+	char buf[INET6_ADDRSTRLEN];
+
+	if (!nfsd_netns_ready(nn))
+		return 0;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+		pr_info("NFS Client: %s\n", buf);
+		++count;
+	}
+	spin_unlock(&nn->client_lock);
+
+	return count;
+}
+
+u64
+nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size)
+{
+	u64 count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+					  nfsd_net_id);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	clp = nfsd_find_client(addr, addr_size);
+	if (clp) {
+		if (mark_client_expired_locked(clp) == nfs_ok)
+			++count;
+		else
+			clp = NULL;
+	}
+	spin_unlock(&nn->client_lock);
+
+	if (clp)
+		expire_client(clp);
+
+	return count;
+}
+
+u64
+nfsd_inject_forget_clients(u64 max)
+{
+	u64 count = 0;
+	struct nfs4_client *clp, *next;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
+		if (mark_client_expired_locked(clp) == nfs_ok) {
+			list_add(&clp->cl_lru, &reaplist);
+			if (max != 0 && ++count >= max)
+				break;
+		}
+	}
+	spin_unlock(&nn->client_lock);
+
+	list_for_each_entry_safe(clp, next, &reaplist, cl_lru)
+		expire_client(clp);
+
+	return count;
+}
+
+static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
+			     const char *type)
+{
+	char buf[INET6_ADDRSTRLEN];
+	rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+	printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
+}
+
+static void
+nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst,
+			     struct list_head *collect)
+{
+	struct nfs4_client *clp = lst->st_stid.sc_client;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+					  nfsd_net_id);
+
+	if (!collect)
+		return;
+
+	lockdep_assert_held(&nn->client_lock);
+	atomic_inc(&clp->cl_refcount);
+	list_add(&lst->st_locks, collect);
+}
+
+static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
+				    struct list_head *collect,
+				    void (*func)(struct nfs4_ol_stateid *))
+{
+	struct nfs4_openowner *oop;
+	struct nfs4_ol_stateid *stp, *st_next;
+	struct nfs4_ol_stateid *lst, *lst_next;
+	u64 count = 0;
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
+		list_for_each_entry_safe(stp, st_next,
+				&oop->oo_owner.so_stateids, st_perstateowner) {
+			list_for_each_entry_safe(lst, lst_next,
+					&stp->st_locks, st_locks) {
+				if (func) {
+					func(lst);
+					nfsd_inject_add_lock_to_list(lst,
+								collect);
+				}
+				++count;
+				/*
+				 * Despite the fact that these functions deal
+				 * with 64-bit integers for "count", we must
+				 * ensure that it doesn't blow up the
+				 * clp->cl_refcount. Throw a warning if we
+				 * start to approach INT_MAX here.
+				 */
+				WARN_ON_ONCE(count == (INT_MAX / 2));
+				if (count == max)
+					goto out;
+			}
+		}
+	}
+out:
+	spin_unlock(&clp->cl_lock);
+
+	return count;
+}
+
+static u64
+nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect,
+			  u64 max)
+{
+	return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid);
+}
+
+static u64
+nfsd_print_client_locks(struct nfs4_client *clp)
+{
+	u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL);
+	nfsd_print_count(clp, count, "locked files");
+	return count;
+}
+
+u64
+nfsd_inject_print_locks(void)
+{
+	struct nfs4_client *clp;
+	u64 count = 0;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+
+	if (!nfsd_netns_ready(nn))
+		return 0;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(clp, &nn->client_lru, cl_lru)
+		count += nfsd_print_client_locks(clp);
+	spin_unlock(&nn->client_lock);
+
+	return count;
+}
+
+static void
+nfsd_reap_locks(struct list_head *reaplist)
+{
+	struct nfs4_client *clp;
+	struct nfs4_ol_stateid *stp, *next;
+
+	list_for_each_entry_safe(stp, next, reaplist, st_locks) {
+		list_del_init(&stp->st_locks);
+		clp = stp->st_stid.sc_client;
+		nfs4_put_stid(&stp->st_stid);
+		put_client(clp);
+	}
+}
+
+u64
+nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size)
+{
+	unsigned int count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	clp = nfsd_find_client(addr, addr_size);
+	if (clp)
+		count = nfsd_collect_client_locks(clp, &reaplist, 0);
+	spin_unlock(&nn->client_lock);
+	nfsd_reap_locks(&reaplist);
+	return count;
+}
+
+u64
+nfsd_inject_forget_locks(u64 max)
+{
+	u64 count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		count += nfsd_collect_client_locks(clp, &reaplist, max - count);
+		if (max != 0 && count >= max)
+			break;
+	}
+	spin_unlock(&nn->client_lock);
+	nfsd_reap_locks(&reaplist);
+	return count;
+}
+
+static u64
+nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max,
+			      struct list_head *collect,
+			      void (*func)(struct nfs4_openowner *))
+{
+	struct nfs4_openowner *oop, *next;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	u64 count = 0;
+
+	lockdep_assert_held(&nn->client_lock);
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
+		if (func) {
+			func(oop);
+			if (collect) {
+				atomic_inc(&clp->cl_refcount);
+				list_add(&oop->oo_perclient, collect);
+			}
+		}
+		++count;
+		/*
+		 * Despite the fact that these functions deal with
+		 * 64-bit integers for "count", we must ensure that
+		 * it doesn't blow up the clp->cl_refcount. Throw a
+		 * warning if we start to approach INT_MAX here.
+		 */
+		WARN_ON_ONCE(count == (INT_MAX / 2));
+		if (count == max)
+			break;
+	}
+	spin_unlock(&clp->cl_lock);
+
+	return count;
+}
+
+static u64
+nfsd_print_client_openowners(struct nfs4_client *clp)
+{
+	u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL);
+
+	nfsd_print_count(clp, count, "openowners");
+	return count;
+}
+
+static u64
+nfsd_collect_client_openowners(struct nfs4_client *clp,
+			       struct list_head *collect, u64 max)
+{
+	return nfsd_foreach_client_openowner(clp, max, collect,
+						unhash_openowner_locked);
+}
+
+u64
+nfsd_inject_print_openowners(void)
+{
+	struct nfs4_client *clp;
+	u64 count = 0;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+
+	if (!nfsd_netns_ready(nn))
+		return 0;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(clp, &nn->client_lru, cl_lru)
+		count += nfsd_print_client_openowners(clp);
+	spin_unlock(&nn->client_lock);
+
+	return count;
+}
+
+static void
+nfsd_reap_openowners(struct list_head *reaplist)
+{
+	struct nfs4_client *clp;
+	struct nfs4_openowner *oop, *next;
+
+	list_for_each_entry_safe(oop, next, reaplist, oo_perclient) {
+		list_del_init(&oop->oo_perclient);
+		clp = oop->oo_owner.so_client;
+		release_openowner(oop);
+		put_client(clp);
+	}
+}
+
+u64
+nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr,
+				     size_t addr_size)
+{
+	unsigned int count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	clp = nfsd_find_client(addr, addr_size);
+	if (clp)
+		count = nfsd_collect_client_openowners(clp, &reaplist, 0);
+	spin_unlock(&nn->client_lock);
+	nfsd_reap_openowners(&reaplist);
+	return count;
+}
+
+u64
+nfsd_inject_forget_openowners(u64 max)
+{
+	u64 count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		count += nfsd_collect_client_openowners(clp, &reaplist,
+							max - count);
+		if (max != 0 && count >= max)
+			break;
+	}
+	spin_unlock(&nn->client_lock);
+	nfsd_reap_openowners(&reaplist);
+	return count;
+}
+
+static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
+				     struct list_head *victims)
+{
+	struct nfs4_delegation *dp, *next;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	u64 count = 0;
+
+	lockdep_assert_held(&nn->client_lock);
+
+	spin_lock(&state_lock);
+	list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
+		if (victims) {
+			/*
+			 * It's not safe to mess with delegations that have a
+			 * non-zero dl_time. They might have already been broken
+			 * and could be processed by the laundromat outside of
+			 * the state_lock. Just leave them be.
+			 */
+			if (dp->dl_time != 0)
+				continue;
+
+			atomic_inc(&clp->cl_refcount);
+			unhash_delegation_locked(dp);
+			list_add(&dp->dl_recall_lru, victims);
+		}
+		++count;
+		/*
+		 * Despite the fact that these functions deal with
+		 * 64-bit integers for "count", we must ensure that
+		 * it doesn't blow up the clp->cl_refcount. Throw a
+		 * warning if we start to approach INT_MAX here.
+		 */
+		WARN_ON_ONCE(count == (INT_MAX / 2));
+		if (count == max)
+			break;
+	}
+	spin_unlock(&state_lock);
+	return count;
+}
+
+static u64
+nfsd_print_client_delegations(struct nfs4_client *clp)
+{
+	u64 count = nfsd_find_all_delegations(clp, 0, NULL);
+
+	nfsd_print_count(clp, count, "delegations");
+	return count;
+}
+
+u64
+nfsd_inject_print_delegations(void)
+{
+	struct nfs4_client *clp;
+	u64 count = 0;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+
+	if (!nfsd_netns_ready(nn))
+		return 0;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(clp, &nn->client_lru, cl_lru)
+		count += nfsd_print_client_delegations(clp);
+	spin_unlock(&nn->client_lock);
+
+	return count;
+}
+
+static void
+nfsd_forget_delegations(struct list_head *reaplist)
+{
+	struct nfs4_client *clp;
+	struct nfs4_delegation *dp, *next;
+
+	list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
+		list_del_init(&dp->dl_recall_lru);
+		clp = dp->dl_stid.sc_client;
+		revoke_delegation(dp);
+		put_client(clp);
+	}
+}
+
+u64
+nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr,
+				      size_t addr_size)
+{
+	u64 count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	clp = nfsd_find_client(addr, addr_size);
+	if (clp)
+		count = nfsd_find_all_delegations(clp, 0, &reaplist);
+	spin_unlock(&nn->client_lock);
+
+	nfsd_forget_delegations(&reaplist);
+	return count;
+}
+
+u64
+nfsd_inject_forget_delegations(u64 max)
+{
+	u64 count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		count += nfsd_find_all_delegations(clp, max - count, &reaplist);
+		if (max != 0 && count >= max)
+			break;
+	}
+	spin_unlock(&nn->client_lock);
+	nfsd_forget_delegations(&reaplist);
+	return count;
+}
+
+static void
+nfsd_recall_delegations(struct list_head *reaplist)
+{
+	struct nfs4_client *clp;
+	struct nfs4_delegation *dp, *next;
+
+	list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
+		list_del_init(&dp->dl_recall_lru);
+		clp = dp->dl_stid.sc_client;
+		/*
+		 * We skipped all entries that had a zero dl_time before,
+		 * so we can now reset the dl_time back to 0. If a delegation
+		 * break comes in now, then it won't make any difference since
+		 * we're recalling it either way.
+		 */
+		spin_lock(&state_lock);
+		dp->dl_time = 0;
+		spin_unlock(&state_lock);
+		nfsd_break_one_deleg(dp);
+		put_client(clp);
+	}
+}
+
+u64
+nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr,
+				      size_t addr_size)
+{
+	u64 count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	clp = nfsd_find_client(addr, addr_size);
+	if (clp)
+		count = nfsd_find_all_delegations(clp, 0, &reaplist);
+	spin_unlock(&nn->client_lock);
+
+	nfsd_recall_delegations(&reaplist);
+	return count;
+}
+
+u64
+nfsd_inject_recall_delegations(u64 max)
+{
+	u64 count = 0;
+	struct nfs4_client *clp, *next;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+						nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
+		count += nfsd_find_all_delegations(clp, max - count, &reaplist);
+		if (max != 0 && ++count >= max)
+			break;
+	}
+	spin_unlock(&nn->client_lock);
+	nfsd_recall_delegations(&reaplist);
+	return count;
+}
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
+/*
+ * Since the lifetime of a delegation isn't limited to that of an open, a
+ * client may quite reasonably hang on to a delegation as long as it has
+ * the inode cached.  This becomes an obvious problem the first time a
+ * client's inode cache approaches the size of the server's total memory.
+ *
+ * For now we avoid this problem by imposing a hard limit on the number
+ * of delegations, which varies according to the server's memory size.
+ */
+static void
+set_max_delegations(void)
+{
+	/*
+	 * Allow at most 4 delegations per megabyte of RAM.  Quick
+	 * estimates suggest that in the worst case (where every delegation
+	 * is for a different inode), a delegation could take about 1.5K,
+	 * giving a worst case usage of about 6% of memory.
+	 */
+	max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
+}
+
+static int nfs4_state_create_net(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int i;
+
+	nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) *
+			CLIENT_HASH_SIZE, GFP_KERNEL);
+	if (!nn->conf_id_hashtbl)
+		goto err;
+	nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) *
+			CLIENT_HASH_SIZE, GFP_KERNEL);
+	if (!nn->unconf_id_hashtbl)
+		goto err_unconf_id;
+	nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) *
+			SESSION_HASH_SIZE, GFP_KERNEL);
+	if (!nn->sessionid_hashtbl)
+		goto err_sessionid;
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+		INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
+		INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
+	}
+	for (i = 0; i < SESSION_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
+	nn->conf_name_tree = RB_ROOT;
+	nn->unconf_name_tree = RB_ROOT;
+	INIT_LIST_HEAD(&nn->client_lru);
+	INIT_LIST_HEAD(&nn->close_lru);
+	INIT_LIST_HEAD(&nn->del_recall_lru);
+	spin_lock_init(&nn->client_lock);
+
+	INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+	get_net(net);
+
+	return 0;
+
+err_sessionid:
+	kfree(nn->unconf_id_hashtbl);
+err_unconf_id:
+	kfree(nn->conf_id_hashtbl);
+err:
+	return -ENOMEM;
+}
+
+static void
+nfs4_state_destroy_net(struct net *net)
+{
+	int i;
+	struct nfs4_client *clp = NULL;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+		while (!list_empty(&nn->conf_id_hashtbl[i])) {
+			clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
+			destroy_client(clp);
+		}
+	}
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+		while (!list_empty(&nn->unconf_id_hashtbl[i])) {
+			clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
+			destroy_client(clp);
+		}
+	}
+
+	kfree(nn->sessionid_hashtbl);
+	kfree(nn->unconf_id_hashtbl);
+	kfree(nn->conf_id_hashtbl);
+	put_net(net);
+}
+
+int
+nfs4_state_start_net(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int ret;
+
+	ret = nfs4_state_create_net(net);
+	if (ret)
+		return ret;
+	nn->boot_time = get_seconds();
+	nn->grace_ended = false;
+	locks_start_grace(net, &nn->nfsd4_manager);
+	nfsd4_client_tracking_init(net);
+	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
+	       nn->nfsd4_grace, net);
+	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
+	return 0;
+}
+
+/* initialization to perform when the nfsd service is started: */
+
+int
+nfs4_state_start(void)
+{
+	int ret;
+
+	ret = set_callback_cred();
+	if (ret)
+		return -ENOMEM;
+	laundry_wq = create_singlethread_workqueue("nfsd4");
+	if (laundry_wq == NULL) {
+		ret = -ENOMEM;
+		goto out_recovery;
+	}
+	ret = nfsd4_create_callback_queue();
+	if (ret)
+		goto out_free_laundry;
+
+	set_max_delegations();
+
+	return 0;
+
+out_free_laundry:
+	destroy_workqueue(laundry_wq);
+out_recovery:
+	return ret;
+}
+
+void
+nfs4_state_shutdown_net(struct net *net)
+{
+	struct nfs4_delegation *dp = NULL;
+	struct list_head *pos, *next, reaplist;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	cancel_delayed_work_sync(&nn->laundromat_work);
+	locks_end_grace(&nn->nfsd4_manager);
+
+	INIT_LIST_HEAD(&reaplist);
+	spin_lock(&state_lock);
+	list_for_each_safe(pos, next, &nn->del_recall_lru) {
+		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+		unhash_delegation_locked(dp);
+		list_add(&dp->dl_recall_lru, &reaplist);
+	}
+	spin_unlock(&state_lock);
+	list_for_each_safe(pos, next, &reaplist) {
+		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+		list_del_init(&dp->dl_recall_lru);
+		put_clnt_odstate(dp->dl_clnt_odstate);
+		nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+		nfs4_put_stid(&dp->dl_stid);
+	}
+
+	nfsd4_client_tracking_exit(net);
+	nfs4_state_destroy_net(net);
+}
+
+void
+nfs4_state_shutdown(void)
+{
+	destroy_workqueue(laundry_wq);
+	nfsd4_destroy_callback_queue();
+}
+
+static void
+get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid)
+{
+	if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG) && CURRENT_STATEID(stateid))
+		memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t));
+}
+
+static void
+put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid)
+{
+	if (cstate->minorversion) {
+		memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t));
+		SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG);
+	}
+}
+
+void
+clear_current_stateid(struct nfsd4_compound_state *cstate)
+{
+	CLEAR_STATE_ID(cstate, CURRENT_STATE_ID_FLAG);
+}
+
+/*
+ * functions to set current state id
+ */
+void
+nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *odp)
+{
+	put_stateid(cstate, &odp->od_stateid);
+}
+
+void
+nfsd4_set_openstateid(struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+{
+	put_stateid(cstate, &open->op_stateid);
+}
+
+void
+nfsd4_set_closestateid(struct nfsd4_compound_state *cstate, struct nfsd4_close *close)
+{
+	put_stateid(cstate, &close->cl_stateid);
+}
+
+void
+nfsd4_set_lockstateid(struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock)
+{
+	put_stateid(cstate, &lock->lk_resp_stateid);
+}
+
+/*
+ * functions to consume current state id
+ */
+
+void
+nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *odp)
+{
+	get_stateid(cstate, &odp->od_stateid);
+}
+
+void
+nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *cstate, struct nfsd4_delegreturn *drp)
+{
+	get_stateid(cstate, &drp->dr_stateid);
+}
+
+void
+nfsd4_get_freestateid(struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *fsp)
+{
+	get_stateid(cstate, &fsp->fr_stateid);
+}
+
+void
+nfsd4_get_setattrstateid(struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr)
+{
+	get_stateid(cstate, &setattr->sa_stateid);
+}
+
+void
+nfsd4_get_closestateid(struct nfsd4_compound_state *cstate, struct nfsd4_close *close)
+{
+	get_stateid(cstate, &close->cl_stateid);
+}
+
+void
+nfsd4_get_lockustateid(struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku)
+{
+	get_stateid(cstate, &locku->lu_stateid);
+}
+
+void
+nfsd4_get_readstateid(struct nfsd4_compound_state *cstate, struct nfsd4_read *read)
+{
+	get_stateid(cstate, &read->rd_stateid);
+}
+
+void
+nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, struct nfsd4_write *write)
+{
+	get_stateid(cstate, &write->wr_stateid);
+}
diff --git a/kernel/fs/nfsd/nfs4xdr.c b/kernel/fs/nfsd/nfs4xdr.c
new file mode 100644
index 000000000..158badf94
--- /dev/null
+++ b/kernel/fs/nfsd/nfs4xdr.c
@@ -0,0 +1,4453 @@
+/*
+ *  Server-side XDR for NFSv4
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/utsname.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/svcauth_gss.h>
+
+#include "idmap.h"
+#include "acl.h"
+#include "xdr4.h"
+#include "vfs.h"
+#include "state.h"
+#include "cache.h"
+#include "netns.h"
+#include "pnfs.h"
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+#endif
+
+
+#define NFSDDBG_FACILITY		NFSDDBG_XDR
+
+/*
+ * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing
+ * directory in order to indicate to the client that a filesystem boundary is present
+ * We use a fixed fsid for a referral
+ */
+#define NFS4_REFERRAL_FSID_MAJOR	0x8000000ULL
+#define NFS4_REFERRAL_FSID_MINOR	0x8000000ULL
+
+static __be32
+check_filename(char *str, int len)
+{
+	int i;
+
+	if (len == 0)
+		return nfserr_inval;
+	if (isdotent(str, len))
+		return nfserr_badname;
+	for (i = 0; i < len; i++)
+		if (str[i] == '/')
+			return nfserr_badname;
+	return 0;
+}
+
+#define DECODE_HEAD				\
+	__be32 *p;				\
+	__be32 status
+#define DECODE_TAIL				\
+	status = 0;				\
+out:						\
+	return status;				\
+xdr_error:					\
+	dprintk("NFSD: xdr error (%s:%d)\n",	\
+			__FILE__, __LINE__);	\
+	status = nfserr_bad_xdr;		\
+	goto out
+
+#define READMEM(x,nbytes) do {			\
+	x = (char *)p;				\
+	p += XDR_QUADLEN(nbytes);		\
+} while (0)
+#define SAVEMEM(x,nbytes) do {			\
+	if (!(x = (p==argp->tmp || p == argp->tmpp) ? \
+ 		savemem(argp, p, nbytes) :	\
+ 		(char *)p)) {			\
+		dprintk("NFSD: xdr error (%s:%d)\n", \
+				__FILE__, __LINE__); \
+		goto xdr_error;			\
+		}				\
+	p += XDR_QUADLEN(nbytes);		\
+} while (0)
+#define COPYMEM(x,nbytes) do {			\
+	memcpy((x), p, nbytes);			\
+	p += XDR_QUADLEN(nbytes);		\
+} while (0)
+
+/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */
+#define READ_BUF(nbytes)  do {			\
+	if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) {	\
+		p = argp->p;			\
+		argp->p += XDR_QUADLEN(nbytes);	\
+	} else if (!(p = read_buf(argp, nbytes))) { \
+		dprintk("NFSD: xdr error (%s:%d)\n", \
+				__FILE__, __LINE__); \
+		goto xdr_error;			\
+	}					\
+} while (0)
+
+static void next_decode_page(struct nfsd4_compoundargs *argp)
+{
+	argp->p = page_address(argp->pagelist[0]);
+	argp->pagelist++;
+	if (argp->pagelen < PAGE_SIZE) {
+		argp->end = argp->p + (argp->pagelen>>2);
+		argp->pagelen = 0;
+	} else {
+		argp->end = argp->p + (PAGE_SIZE>>2);
+		argp->pagelen -= PAGE_SIZE;
+	}
+}
+
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
+{
+	/* We want more bytes than seem to be available.
+	 * Maybe we need a new page, maybe we have just run out
+	 */
+	unsigned int avail = (char *)argp->end - (char *)argp->p;
+	__be32 *p;
+	if (avail + argp->pagelen < nbytes)
+		return NULL;
+	if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */
+		return NULL;
+	/* ok, we can do it with the current plus the next page */
+	if (nbytes <= sizeof(argp->tmp))
+		p = argp->tmp;
+	else {
+		kfree(argp->tmpp);
+		p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL);
+		if (!p)
+			return NULL;
+		
+	}
+	/*
+	 * The following memcpy is safe because read_buf is always
+	 * called with nbytes > avail, and the two cases above both
+	 * guarantee p points to at least nbytes bytes.
+	 */
+	memcpy(p, argp->p, avail);
+	next_decode_page(argp);
+	memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
+	argp->p += XDR_QUADLEN(nbytes - avail);
+	return p;
+}
+
+static int zero_clientid(clientid_t *clid)
+{
+	return (clid->cl_boot == 0) && (clid->cl_id == 0);
+}
+
+/**
+ * svcxdr_tmpalloc - allocate memory to be freed after compound processing
+ * @argp: NFSv4 compound argument structure
+ * @p: pointer to be freed (with kfree())
+ *
+ * Marks @p to be freed when processing the compound operation
+ * described in @argp finishes.
+ */
+static void *
+svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
+{
+	struct svcxdr_tmpbuf *tb;
+
+	tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL);
+	if (!tb)
+		return NULL;
+	tb->next = argp->to_free;
+	argp->to_free = tb;
+	return tb->buf;
+}
+
+/*
+ * For xdr strings that need to be passed to other kernel api's
+ * as null-terminated strings.
+ *
+ * Note null-terminating in place usually isn't safe since the
+ * buffer might end on a page boundary.
+ */
+static char *
+svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
+{
+	char *p = svcxdr_tmpalloc(argp, len + 1);
+
+	if (!p)
+		return NULL;
+	memcpy(p, buf, len);
+	p[len] = '\0';
+	return p;
+}
+
+/**
+ * savemem - duplicate a chunk of memory for later processing
+ * @argp: NFSv4 compound argument structure to be freed with
+ * @p: pointer to be duplicated
+ * @nbytes: length to be duplicated
+ *
+ * Returns a pointer to a copy of @nbytes bytes of memory at @p
+ * that are preserved until processing of the NFSv4 compound
+ * operation described by @argp finishes.
+ */
+static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
+{
+	void *ret;
+
+	ret = svcxdr_tmpalloc(argp, nbytes);
+	if (!ret)
+		return NULL;
+	memcpy(ret, p, nbytes);
+	return ret;
+}
+
+/*
+ * We require the high 32 bits of 'seconds' to be 0, and
+ * we ignore all 32 bits of 'nseconds'.
+ */
+static __be32
+nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
+{
+	DECODE_HEAD;
+	u64 sec;
+
+	READ_BUF(12);
+	p = xdr_decode_hyper(p, &sec);
+	tv->tv_sec = sec;
+	tv->tv_nsec = be32_to_cpup(p++);
+	if (tv->tv_nsec >= (u32)1000000000)
+		return nfserr_inval;
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
+{
+	u32 bmlen;
+	DECODE_HEAD;
+
+	bmval[0] = 0;
+	bmval[1] = 0;
+	bmval[2] = 0;
+
+	READ_BUF(4);
+	bmlen = be32_to_cpup(p++);
+	if (bmlen > 1000)
+		goto xdr_error;
+
+	READ_BUF(bmlen << 2);
+	if (bmlen > 0)
+		bmval[0] = be32_to_cpup(p++);
+	if (bmlen > 1)
+		bmval[1] = be32_to_cpup(p++);
+	if (bmlen > 2)
+		bmval[2] = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
+		   struct iattr *iattr, struct nfs4_acl **acl,
+		   struct xdr_netobj *label)
+{
+	int expected_len, len = 0;
+	u32 dummy32;
+	char *buf;
+
+	DECODE_HEAD;
+	iattr->ia_valid = 0;
+	if ((status = nfsd4_decode_bitmap(argp, bmval)))
+		return status;
+
+	READ_BUF(4);
+	expected_len = be32_to_cpup(p++);
+
+	if (bmval[0] & FATTR4_WORD0_SIZE) {
+		READ_BUF(8);
+		len += 8;
+		p = xdr_decode_hyper(p, &iattr->ia_size);
+		iattr->ia_valid |= ATTR_SIZE;
+	}
+	if (bmval[0] & FATTR4_WORD0_ACL) {
+		u32 nace;
+		struct nfs4_ace *ace;
+
+		READ_BUF(4); len += 4;
+		nace = be32_to_cpup(p++);
+
+		if (nace > NFS4_ACL_MAX)
+			return nfserr_fbig;
+
+		*acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
+		if (*acl == NULL)
+			return nfserr_jukebox;
+
+		(*acl)->naces = nace;
+		for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
+			READ_BUF(16); len += 16;
+			ace->type = be32_to_cpup(p++);
+			ace->flag = be32_to_cpup(p++);
+			ace->access_mask = be32_to_cpup(p++);
+			dummy32 = be32_to_cpup(p++);
+			READ_BUF(dummy32);
+			len += XDR_QUADLEN(dummy32) << 2;
+			READMEM(buf, dummy32);
+			ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
+			status = nfs_ok;
+			if (ace->whotype != NFS4_ACL_WHO_NAMED)
+				;
+			else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+				status = nfsd_map_name_to_gid(argp->rqstp,
+						buf, dummy32, &ace->who_gid);
+			else
+				status = nfsd_map_name_to_uid(argp->rqstp,
+						buf, dummy32, &ace->who_uid);
+			if (status)
+				return status;
+		}
+	} else
+		*acl = NULL;
+	if (bmval[1] & FATTR4_WORD1_MODE) {
+		READ_BUF(4);
+		len += 4;
+		iattr->ia_mode = be32_to_cpup(p++);
+		iattr->ia_mode &= (S_IFMT | S_IALLUGO);
+		iattr->ia_valid |= ATTR_MODE;
+	}
+	if (bmval[1] & FATTR4_WORD1_OWNER) {
+		READ_BUF(4);
+		len += 4;
+		dummy32 = be32_to_cpup(p++);
+		READ_BUF(dummy32);
+		len += (XDR_QUADLEN(dummy32) << 2);
+		READMEM(buf, dummy32);
+		if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
+			return status;
+		iattr->ia_valid |= ATTR_UID;
+	}
+	if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
+		READ_BUF(4);
+		len += 4;
+		dummy32 = be32_to_cpup(p++);
+		READ_BUF(dummy32);
+		len += (XDR_QUADLEN(dummy32) << 2);
+		READMEM(buf, dummy32);
+		if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
+			return status;
+		iattr->ia_valid |= ATTR_GID;
+	}
+	if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+		READ_BUF(4);
+		len += 4;
+		dummy32 = be32_to_cpup(p++);
+		switch (dummy32) {
+		case NFS4_SET_TO_CLIENT_TIME:
+			len += 12;
+			status = nfsd4_decode_time(argp, &iattr->ia_atime);
+			if (status)
+				return status;
+			iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
+			break;
+		case NFS4_SET_TO_SERVER_TIME:
+			iattr->ia_valid |= ATTR_ATIME;
+			break;
+		default:
+			goto xdr_error;
+		}
+	}
+	if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+		READ_BUF(4);
+		len += 4;
+		dummy32 = be32_to_cpup(p++);
+		switch (dummy32) {
+		case NFS4_SET_TO_CLIENT_TIME:
+			len += 12;
+			status = nfsd4_decode_time(argp, &iattr->ia_mtime);
+			if (status)
+				return status;
+			iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
+			break;
+		case NFS4_SET_TO_SERVER_TIME:
+			iattr->ia_valid |= ATTR_MTIME;
+			break;
+		default:
+			goto xdr_error;
+		}
+	}
+
+	label->len = 0;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+		READ_BUF(4);
+		len += 4;
+		dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */
+		READ_BUF(4);
+		len += 4;
+		dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */
+		READ_BUF(4);
+		len += 4;
+		dummy32 = be32_to_cpup(p++);
+		READ_BUF(dummy32);
+		if (dummy32 > NFS4_MAXLABELLEN)
+			return nfserr_badlabel;
+		len += (XDR_QUADLEN(dummy32) << 2);
+		READMEM(buf, dummy32);
+		label->len = dummy32;
+		label->data = svcxdr_dupstr(argp, buf, dummy32);
+		if (!label->data)
+			return nfserr_jukebox;
+	}
+#endif
+
+	if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
+	    || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
+	    || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
+		READ_BUF(expected_len - len);
+	else if (len != expected_len)
+		goto xdr_error;
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
+{
+	DECODE_HEAD;
+
+	READ_BUF(sizeof(stateid_t));
+	sid->si_generation = be32_to_cpup(p++);
+	COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	access->ac_req_access = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
+{
+	DECODE_HEAD;
+	u32 dummy, uid, gid;
+	char *machine_name;
+	int i;
+	int nr_secflavs;
+
+	/* callback_sec_params4 */
+	READ_BUF(4);
+	nr_secflavs = be32_to_cpup(p++);
+	if (nr_secflavs)
+		cbs->flavor = (u32)(-1);
+	else
+		/* Is this legal? Be generous, take it to mean AUTH_NONE: */
+		cbs->flavor = 0;
+	for (i = 0; i < nr_secflavs; ++i) {
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		switch (dummy) {
+		case RPC_AUTH_NULL:
+			/* Nothing to read */
+			if (cbs->flavor == (u32)(-1))
+				cbs->flavor = RPC_AUTH_NULL;
+			break;
+		case RPC_AUTH_UNIX:
+			READ_BUF(8);
+			/* stamp */
+			dummy = be32_to_cpup(p++);
+
+			/* machine name */
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			SAVEMEM(machine_name, dummy);
+
+			/* uid, gid */
+			READ_BUF(8);
+			uid = be32_to_cpup(p++);
+			gid = be32_to_cpup(p++);
+
+			/* more gids */
+			READ_BUF(4);
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy * 4);
+			if (cbs->flavor == (u32)(-1)) {
+				kuid_t kuid = make_kuid(&init_user_ns, uid);
+				kgid_t kgid = make_kgid(&init_user_ns, gid);
+				if (uid_valid(kuid) && gid_valid(kgid)) {
+					cbs->uid = kuid;
+					cbs->gid = kgid;
+					cbs->flavor = RPC_AUTH_UNIX;
+				} else {
+					dprintk("RPC_AUTH_UNIX with invalid"
+						"uid or gid ignoring!\n");
+				}
+			}
+			break;
+		case RPC_AUTH_GSS:
+			dprintk("RPC_AUTH_GSS callback secflavor "
+				"not supported!\n");
+			READ_BUF(8);
+			/* gcbp_service */
+			dummy = be32_to_cpup(p++);
+			/* gcbp_handle_from_server */
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			/* gcbp_handle_from_client */
+			READ_BUF(4);
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			break;
+		default:
+			dprintk("Illegal callback secflavor\n");
+			return nfserr_inval;
+		}
+	}
+	DECODE_TAIL;
+}
+
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	bc->bc_cb_program = be32_to_cpup(p++);
+	nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
+
+	DECODE_TAIL;
+}
+
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+{
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
+	COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	bcts->dir = be32_to_cpup(p++);
+	/* XXX: skipping ctsa_use_conn_in_rdma_mode.  Perhaps Tom Tucker
+	 * could help us figure out we should be using it. */
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	close->cl_seqid = be32_to_cpup(p++);
+	return nfsd4_decode_stateid(argp, &close->cl_stateid);
+
+	DECODE_TAIL;
+}
+
+
+static __be32
+nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
+{
+	DECODE_HEAD;
+
+	READ_BUF(12);
+	p = xdr_decode_hyper(p, &commit->co_offset);
+	commit->co_count = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	create->cr_type = be32_to_cpup(p++);
+	switch (create->cr_type) {
+	case NF4LNK:
+		READ_BUF(4);
+		create->cr_datalen = be32_to_cpup(p++);
+		READ_BUF(create->cr_datalen);
+		create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen);
+		if (!create->cr_data)
+			return nfserr_jukebox;
+		break;
+	case NF4BLK:
+	case NF4CHR:
+		READ_BUF(8);
+		create->cr_specdata1 = be32_to_cpup(p++);
+		create->cr_specdata2 = be32_to_cpup(p++);
+		break;
+	case NF4SOCK:
+	case NF4FIFO:
+	case NF4DIR:
+	default:
+		break;
+	}
+
+	READ_BUF(4);
+	create->cr_namelen = be32_to_cpup(p++);
+	READ_BUF(create->cr_namelen);
+	SAVEMEM(create->cr_name, create->cr_namelen);
+	if ((status = check_filename(create->cr_name, create->cr_namelen)))
+		return status;
+
+	status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
+				    &create->cr_acl, &create->cr_label);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+static inline __be32
+nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
+{
+	return nfsd4_decode_stateid(argp, &dr->dr_stateid);
+}
+
+static inline __be32
+nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
+{
+	return nfsd4_decode_bitmap(argp, getattr->ga_bmval);
+}
+
+static __be32
+nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	link->li_namelen = be32_to_cpup(p++);
+	READ_BUF(link->li_namelen);
+	SAVEMEM(link->li_name, link->li_namelen);
+	if ((status = check_filename(link->li_name, link->li_namelen)))
+		return status;
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
+{
+	DECODE_HEAD;
+
+	/*
+	* type, reclaim(boolean), offset, length, new_lock_owner(boolean)
+	*/
+	READ_BUF(28);
+	lock->lk_type = be32_to_cpup(p++);
+	if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
+		goto xdr_error;
+	lock->lk_reclaim = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &lock->lk_offset);
+	p = xdr_decode_hyper(p, &lock->lk_length);
+	lock->lk_is_new = be32_to_cpup(p++);
+
+	if (lock->lk_is_new) {
+		READ_BUF(4);
+		lock->lk_new_open_seqid = be32_to_cpup(p++);
+		status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
+		if (status)
+			return status;
+		READ_BUF(8 + sizeof(clientid_t));
+		lock->lk_new_lock_seqid = be32_to_cpup(p++);
+		COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
+		lock->lk_new_owner.len = be32_to_cpup(p++);
+		READ_BUF(lock->lk_new_owner.len);
+		READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
+	} else {
+		status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
+		if (status)
+			return status;
+		READ_BUF(4);
+		lock->lk_old_lock_seqid = be32_to_cpup(p++);
+	}
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
+{
+	DECODE_HEAD;
+		        
+	READ_BUF(32);
+	lockt->lt_type = be32_to_cpup(p++);
+	if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
+		goto xdr_error;
+	p = xdr_decode_hyper(p, &lockt->lt_offset);
+	p = xdr_decode_hyper(p, &lockt->lt_length);
+	COPYMEM(&lockt->lt_clientid, 8);
+	lockt->lt_owner.len = be32_to_cpup(p++);
+	READ_BUF(lockt->lt_owner.len);
+	READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
+{
+	DECODE_HEAD;
+
+	READ_BUF(8);
+	locku->lu_type = be32_to_cpup(p++);
+	if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
+		goto xdr_error;
+	locku->lu_seqid = be32_to_cpup(p++);
+	status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
+	if (status)
+		return status;
+	READ_BUF(16);
+	p = xdr_decode_hyper(p, &locku->lu_offset);
+	p = xdr_decode_hyper(p, &locku->lu_length);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	lookup->lo_len = be32_to_cpup(p++);
+	READ_BUF(lookup->lo_len);
+	SAVEMEM(lookup->lo_name, lookup->lo_len);
+	if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
+		return status;
+
+	DECODE_TAIL;
+}
+
+static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when)
+{
+	__be32 *p;
+	u32 w;
+
+	READ_BUF(4);
+	w = be32_to_cpup(p++);
+	*share_access = w & NFS4_SHARE_ACCESS_MASK;
+	*deleg_want = w & NFS4_SHARE_WANT_MASK;
+	if (deleg_when)
+		*deleg_when = w & NFS4_SHARE_WHEN_MASK;
+
+	switch (w & NFS4_SHARE_ACCESS_MASK) {
+	case NFS4_SHARE_ACCESS_READ:
+	case NFS4_SHARE_ACCESS_WRITE:
+	case NFS4_SHARE_ACCESS_BOTH:
+		break;
+	default:
+		return nfserr_bad_xdr;
+	}
+	w &= ~NFS4_SHARE_ACCESS_MASK;
+	if (!w)
+		return nfs_ok;
+	if (!argp->minorversion)
+		return nfserr_bad_xdr;
+	switch (w & NFS4_SHARE_WANT_MASK) {
+	case NFS4_SHARE_WANT_NO_PREFERENCE:
+	case NFS4_SHARE_WANT_READ_DELEG:
+	case NFS4_SHARE_WANT_WRITE_DELEG:
+	case NFS4_SHARE_WANT_ANY_DELEG:
+	case NFS4_SHARE_WANT_NO_DELEG:
+	case NFS4_SHARE_WANT_CANCEL:
+		break;
+	default:
+		return nfserr_bad_xdr;
+	}
+	w &= ~NFS4_SHARE_WANT_MASK;
+	if (!w)
+		return nfs_ok;
+
+	if (!deleg_when)	/* open_downgrade */
+		return nfserr_inval;
+	switch (w) {
+	case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
+	case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED:
+	case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL |
+	      NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED):
+		return nfs_ok;
+	}
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
+static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
+{
+	__be32 *p;
+
+	READ_BUF(4);
+	*x = be32_to_cpup(p++);
+	/* Note: unlinke access bits, deny bits may be zero. */
+	if (*x & ~NFS4_SHARE_DENY_BOTH)
+		return nfserr_bad_xdr;
+	return nfs_ok;
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
+static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
+{
+	__be32 *p;
+
+	READ_BUF(4);
+	o->len = be32_to_cpup(p++);
+
+	if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
+		return nfserr_bad_xdr;
+
+	READ_BUF(o->len);
+	SAVEMEM(o->data, o->len);
+	return nfs_ok;
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
+static __be32
+nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
+{
+	DECODE_HEAD;
+	u32 dummy;
+
+	memset(open->op_bmval, 0, sizeof(open->op_bmval));
+	open->op_iattr.ia_valid = 0;
+	open->op_openowner = NULL;
+
+	open->op_xdr_error = 0;
+	/* seqid, share_access, share_deny, clientid, ownerlen */
+	READ_BUF(4);
+	open->op_seqid = be32_to_cpup(p++);
+	/* decode, yet ignore deleg_when until supported */
+	status = nfsd4_decode_share_access(argp, &open->op_share_access,
+					   &open->op_deleg_want, &dummy);
+	if (status)
+		goto xdr_error;
+	status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
+	if (status)
+		goto xdr_error;
+	READ_BUF(sizeof(clientid_t));
+	COPYMEM(&open->op_clientid, sizeof(clientid_t));
+	status = nfsd4_decode_opaque(argp, &open->op_owner);
+	if (status)
+		goto xdr_error;
+	READ_BUF(4);
+	open->op_create = be32_to_cpup(p++);
+	switch (open->op_create) {
+	case NFS4_OPEN_NOCREATE:
+		break;
+	case NFS4_OPEN_CREATE:
+		READ_BUF(4);
+		open->op_createmode = be32_to_cpup(p++);
+		switch (open->op_createmode) {
+		case NFS4_CREATE_UNCHECKED:
+		case NFS4_CREATE_GUARDED:
+			status = nfsd4_decode_fattr(argp, open->op_bmval,
+				&open->op_iattr, &open->op_acl, &open->op_label);
+			if (status)
+				goto out;
+			break;
+		case NFS4_CREATE_EXCLUSIVE:
+			READ_BUF(NFS4_VERIFIER_SIZE);
+			COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
+			break;
+		case NFS4_CREATE_EXCLUSIVE4_1:
+			if (argp->minorversion < 1)
+				goto xdr_error;
+			READ_BUF(NFS4_VERIFIER_SIZE);
+			COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
+			status = nfsd4_decode_fattr(argp, open->op_bmval,
+				&open->op_iattr, &open->op_acl, &open->op_label);
+			if (status)
+				goto out;
+			break;
+		default:
+			goto xdr_error;
+		}
+		break;
+	default:
+		goto xdr_error;
+	}
+
+	/* open_claim */
+	READ_BUF(4);
+	open->op_claim_type = be32_to_cpup(p++);
+	switch (open->op_claim_type) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+		READ_BUF(4);
+		open->op_fname.len = be32_to_cpup(p++);
+		READ_BUF(open->op_fname.len);
+		SAVEMEM(open->op_fname.data, open->op_fname.len);
+		if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
+			return status;
+		break;
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+		READ_BUF(4);
+		open->op_delegate_type = be32_to_cpup(p++);
+		break;
+	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+		status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+		if (status)
+			return status;
+		READ_BUF(4);
+		open->op_fname.len = be32_to_cpup(p++);
+		READ_BUF(open->op_fname.len);
+		SAVEMEM(open->op_fname.data, open->op_fname.len);
+		if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
+			return status;
+		break;
+	case NFS4_OPEN_CLAIM_FH:
+	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+		if (argp->minorversion < 1)
+			goto xdr_error;
+		/* void */
+		break;
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+		if (argp->minorversion < 1)
+			goto xdr_error;
+		status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+		if (status)
+			return status;
+		break;
+	default:
+		goto xdr_error;
+	}
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
+	status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
+	if (status)
+		return status;
+	READ_BUF(4);
+	open_conf->oc_seqid = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
+{
+	DECODE_HEAD;
+		    
+	status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
+	if (status)
+		return status;
+	READ_BUF(4);
+	open_down->od_seqid = be32_to_cpup(p++);
+	status = nfsd4_decode_share_access(argp, &open_down->od_share_access,
+					   &open_down->od_deleg_want, NULL);
+	if (status)
+		return status;
+	status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
+	if (status)
+		return status;
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	putfh->pf_fhlen = be32_to_cpup(p++);
+	if (putfh->pf_fhlen > NFS4_FHSIZE)
+		goto xdr_error;
+	READ_BUF(putfh->pf_fhlen);
+	SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
+{
+	if (argp->minorversion == 0)
+		return nfs_ok;
+	return nfserr_notsupp;
+}
+
+static __be32
+nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
+{
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &read->rd_stateid);
+	if (status)
+		return status;
+	READ_BUF(12);
+	p = xdr_decode_hyper(p, &read->rd_offset);
+	read->rd_length = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
+{
+	DECODE_HEAD;
+
+	READ_BUF(24);
+	p = xdr_decode_hyper(p, &readdir->rd_cookie);
+	COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data));
+	readdir->rd_dircount = be32_to_cpup(p++);
+	readdir->rd_maxcount = be32_to_cpup(p++);
+	if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval)))
+		goto out;
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	remove->rm_namelen = be32_to_cpup(p++);
+	READ_BUF(remove->rm_namelen);
+	SAVEMEM(remove->rm_name, remove->rm_namelen);
+	if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
+		return status;
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	rename->rn_snamelen = be32_to_cpup(p++);
+	READ_BUF(rename->rn_snamelen + 4);
+	SAVEMEM(rename->rn_sname, rename->rn_snamelen);
+	rename->rn_tnamelen = be32_to_cpup(p++);
+	READ_BUF(rename->rn_tnamelen);
+	SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
+	if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
+		return status;
+	if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))
+		return status;
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
+	READ_BUF(sizeof(clientid_t));
+	COPYMEM(clientid, sizeof(clientid_t));
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
+		     struct nfsd4_secinfo *secinfo)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	secinfo->si_namelen = be32_to_cpup(p++);
+	READ_BUF(secinfo->si_namelen);
+	SAVEMEM(secinfo->si_name, secinfo->si_namelen);
+	status = check_filename(secinfo->si_name, secinfo->si_namelen);
+	if (status)
+		return status;
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
+		     struct nfsd4_secinfo_no_name *sin)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	sin->sin_style = be32_to_cpup(p++);
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
+{
+	__be32 status;
+
+	status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
+	if (status)
+		return status;
+	return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
+				  &setattr->sa_acl, &setattr->sa_label);
+}
+
+static __be32
+nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
+	READ_BUF(NFS4_VERIFIER_SIZE);
+	COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE);
+
+	status = nfsd4_decode_opaque(argp, &setclientid->se_name);
+	if (status)
+		return nfserr_bad_xdr;
+	READ_BUF(8);
+	setclientid->se_callback_prog = be32_to_cpup(p++);
+	setclientid->se_callback_netid_len = be32_to_cpup(p++);
+
+	READ_BUF(setclientid->se_callback_netid_len + 4);
+	SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
+	setclientid->se_callback_addr_len = be32_to_cpup(p++);
+
+	READ_BUF(setclientid->se_callback_addr_len + 4);
+	SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
+	setclientid->se_callback_ident = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
+	READ_BUF(8 + NFS4_VERIFIER_SIZE);
+	COPYMEM(&scd_c->sc_clientid, 8);
+	COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE);
+
+	DECODE_TAIL;
+}
+
+/* Also used for NVERIFY */
+static __be32
+nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
+{
+	DECODE_HEAD;
+
+	if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
+		goto out;
+
+	/* For convenience's sake, we compare raw xdr'd attributes in
+	 * nfsd4_proc_verify */
+
+	READ_BUF(4);
+	verify->ve_attrlen = be32_to_cpup(p++);
+	READ_BUF(verify->ve_attrlen);
+	SAVEMEM(verify->ve_attrval, verify->ve_attrlen);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
+{
+	int avail;
+	int len;
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &write->wr_stateid);
+	if (status)
+		return status;
+	READ_BUF(16);
+	p = xdr_decode_hyper(p, &write->wr_offset);
+	write->wr_stable_how = be32_to_cpup(p++);
+	if (write->wr_stable_how > 2)
+		goto xdr_error;
+	write->wr_buflen = be32_to_cpup(p++);
+
+	/* Sorry .. no magic macros for this.. *
+	 * READ_BUF(write->wr_buflen);
+	 * SAVEMEM(write->wr_buf, write->wr_buflen);
+	 */
+	avail = (char*)argp->end - (char*)argp->p;
+	if (avail + argp->pagelen < write->wr_buflen) {
+		dprintk("NFSD: xdr error (%s:%d)\n",
+				__FILE__, __LINE__);
+		goto xdr_error;
+	}
+	write->wr_head.iov_base = p;
+	write->wr_head.iov_len = avail;
+	write->wr_pagelist = argp->pagelist;
+
+	len = XDR_QUADLEN(write->wr_buflen) << 2;
+	if (len >= avail) {
+		int pages;
+
+		len -= avail;
+
+		pages = len >> PAGE_SHIFT;
+		argp->pagelist += pages;
+		argp->pagelen -= pages * PAGE_SIZE;
+		len -= pages * PAGE_SIZE;
+
+		argp->p = (__be32 *)page_address(argp->pagelist[0]);
+		argp->pagelist++;
+		argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
+	}
+	argp->p += XDR_QUADLEN(len);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
+	READ_BUF(12);
+	COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t));
+	rlockowner->rl_owner.len = be32_to_cpup(p++);
+	READ_BUF(rlockowner->rl_owner.len);
+	READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
+
+	if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
+		return nfserr_inval;
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
+			 struct nfsd4_exchange_id *exid)
+{
+	int dummy, tmp;
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_VERIFIER_SIZE);
+	COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
+
+	status = nfsd4_decode_opaque(argp, &exid->clname);
+	if (status)
+		return nfserr_bad_xdr;
+
+	READ_BUF(4);
+	exid->flags = be32_to_cpup(p++);
+
+	/* Ignore state_protect4_a */
+	READ_BUF(4);
+	exid->spa_how = be32_to_cpup(p++);
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		/* spo_must_enforce */
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* spo_must_allow */
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		READ_BUF(dummy * 4);
+		p += dummy;
+		break;
+	case SP4_SSV:
+		/* ssp_ops */
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* ssp_hash_algs<> */
+		READ_BUF(4);
+		tmp = be32_to_cpup(p++);
+		while (tmp--) {
+			READ_BUF(4);
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+		}
+
+		/* ssp_encr_algs<> */
+		READ_BUF(4);
+		tmp = be32_to_cpup(p++);
+		while (tmp--) {
+			READ_BUF(4);
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+		}
+
+		/* ssp_window and ssp_num_gss_handles */
+		READ_BUF(8);
+		dummy = be32_to_cpup(p++);
+		dummy = be32_to_cpup(p++);
+		break;
+	default:
+		goto xdr_error;
+	}
+
+	/* Ignore Implementation ID */
+	READ_BUF(4);    /* nfs_impl_id4 array length */
+	dummy = be32_to_cpup(p++);
+
+	if (dummy > 1)
+		goto xdr_error;
+
+	if (dummy == 1) {
+		/* nii_domain */
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* nii_name */
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* nii_date */
+		READ_BUF(12);
+		p += 3;
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
+			    struct nfsd4_create_session *sess)
+{
+	DECODE_HEAD;
+	u32 dummy;
+
+	READ_BUF(16);
+	COPYMEM(&sess->clientid, 8);
+	sess->seqid = be32_to_cpup(p++);
+	sess->flags = be32_to_cpup(p++);
+
+	/* Fore channel attrs */
+	READ_BUF(28);
+	dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+	sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
+	sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
+	sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
+	sess->fore_channel.maxops = be32_to_cpup(p++);
+	sess->fore_channel.maxreqs = be32_to_cpup(p++);
+	sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++);
+	if (sess->fore_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		sess->fore_channel.rdma_attrs = be32_to_cpup(p++);
+	} else if (sess->fore_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many fore channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	/* Back channel attrs */
+	READ_BUF(28);
+	dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+	sess->back_channel.maxreq_sz = be32_to_cpup(p++);
+	sess->back_channel.maxresp_sz = be32_to_cpup(p++);
+	sess->back_channel.maxresp_cached = be32_to_cpup(p++);
+	sess->back_channel.maxops = be32_to_cpup(p++);
+	sess->back_channel.maxreqs = be32_to_cpup(p++);
+	sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++);
+	if (sess->back_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		sess->back_channel.rdma_attrs = be32_to_cpup(p++);
+	} else if (sess->back_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many back channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	READ_BUF(4);
+	sess->callback_prog = be32_to_cpup(p++);
+	nfsd4_decode_cb_sec(argp, &sess->cb_sec);
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
+			     struct nfsd4_destroy_session *destroy_session)
+{
+	DECODE_HEAD;
+	READ_BUF(NFS4_MAX_SESSIONID_LEN);
+	COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
+			  struct nfsd4_free_stateid *free_stateid)
+{
+	DECODE_HEAD;
+
+	READ_BUF(sizeof(stateid_t));
+	free_stateid->fr_stateid.si_generation = be32_to_cpup(p++);
+	COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t));
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
+		      struct nfsd4_sequence *seq)
+{
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+	COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	seq->seqid = be32_to_cpup(p++);
+	seq->slotid = be32_to_cpup(p++);
+	seq->maxslots = be32_to_cpup(p++);
+	seq->cachethis = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
+{
+	int i;
+	__be32 *p, status;
+	struct nfsd4_test_stateid_id *stateid;
+
+	READ_BUF(4);
+	test_stateid->ts_num_ids = ntohl(*p++);
+
+	INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
+
+	for (i = 0; i < test_stateid->ts_num_ids; i++) {
+		stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
+		if (!stateid) {
+			status = nfserrno(-ENOMEM);
+			goto out;
+		}
+
+		INIT_LIST_HEAD(&stateid->ts_id_list);
+		list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
+
+		status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid);
+		if (status)
+			goto out;
+	}
+
+	status = 0;
+out:
+	return status;
+xdr_error:
+	dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__);
+	status = nfserr_bad_xdr;
+	goto out;
+}
+
+static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(8);
+	COPYMEM(&dc->clientid, 8);
+
+	DECODE_TAIL;
+}
+
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	rc->rca_one_fs = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
+		struct nfsd4_getdeviceinfo *gdev)
+{
+	DECODE_HEAD;
+	u32 num, i;
+
+	READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
+	COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
+	gdev->gd_layout_type = be32_to_cpup(p++);
+	gdev->gd_maxcount = be32_to_cpup(p++);
+	num = be32_to_cpup(p++);
+	if (num) {
+		READ_BUF(4 * num);
+		gdev->gd_notify_types = be32_to_cpup(p++);
+		for (i = 1; i < num; i++) {
+			if (be32_to_cpup(p++)) {
+				status = nfserr_inval;
+				goto out;
+			}
+		}
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutget *lgp)
+{
+	DECODE_HEAD;
+
+	READ_BUF(36);
+	lgp->lg_signal = be32_to_cpup(p++);
+	lgp->lg_layout_type = be32_to_cpup(p++);
+	lgp->lg_seg.iomode = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
+	p = xdr_decode_hyper(p, &lgp->lg_seg.length);
+	p = xdr_decode_hyper(p, &lgp->lg_minlength);
+
+	status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
+	if (status)
+		return status;
+
+	READ_BUF(4);
+	lgp->lg_maxcount = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutcommit *lcp)
+{
+	DECODE_HEAD;
+	u32 timechange;
+
+	READ_BUF(20);
+	p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
+	p = xdr_decode_hyper(p, &lcp->lc_seg.length);
+	lcp->lc_reclaim = be32_to_cpup(p++);
+
+	status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
+	if (status)
+		return status;
+
+	READ_BUF(4);
+	lcp->lc_newoffset = be32_to_cpup(p++);
+	if (lcp->lc_newoffset) {
+		READ_BUF(8);
+		p = xdr_decode_hyper(p, &lcp->lc_last_wr);
+	} else
+		lcp->lc_last_wr = 0;
+	READ_BUF(4);
+	timechange = be32_to_cpup(p++);
+	if (timechange) {
+		status = nfsd4_decode_time(argp, &lcp->lc_mtime);
+		if (status)
+			return status;
+	} else {
+		lcp->lc_mtime.tv_nsec = UTIME_NOW;
+	}
+	READ_BUF(8);
+	lcp->lc_layout_type = be32_to_cpup(p++);
+
+	/*
+	 * Save the layout update in XDR format and let the layout driver deal
+	 * with it later.
+	 */
+	lcp->lc_up_len = be32_to_cpup(p++);
+	if (lcp->lc_up_len > 0) {
+		READ_BUF(lcp->lc_up_len);
+		READMEM(lcp->lc_up_layout, lcp->lc_up_len);
+	}
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutreturn *lrp)
+{
+	DECODE_HEAD;
+
+	READ_BUF(16);
+	lrp->lr_reclaim = be32_to_cpup(p++);
+	lrp->lr_layout_type = be32_to_cpup(p++);
+	lrp->lr_seg.iomode = be32_to_cpup(p++);
+	lrp->lr_return_type = be32_to_cpup(p++);
+	if (lrp->lr_return_type == RETURN_FILE) {
+		READ_BUF(16);
+		p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
+		p = xdr_decode_hyper(p, &lrp->lr_seg.length);
+
+		status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
+		if (status)
+			return status;
+
+		READ_BUF(4);
+		lrp->lrf_body_len = be32_to_cpup(p++);
+		if (lrp->lrf_body_len > 0) {
+			READ_BUF(lrp->lrf_body_len);
+			READMEM(lrp->lrf_body, lrp->lrf_body_len);
+		}
+	} else {
+		lrp->lr_seg.offset = 0;
+		lrp->lr_seg.length = NFS4_MAX_UINT64;
+	}
+
+	DECODE_TAIL;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+static __be32
+nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
+		       struct nfsd4_fallocate *fallocate)
+{
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(16);
+	p = xdr_decode_hyper(p, &fallocate->falloc_offset);
+	xdr_decode_hyper(p, &fallocate->falloc_length);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+{
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(8 + 4);
+	p = xdr_decode_hyper(p, &seek->seek_offset);
+	seek->seek_whence = be32_to_cpup(p);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+{
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
+{
+	return nfserr_notsupp;
+}
+
+typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
+
+static nfsd4_dec nfsd4_dec_ops[] = {
+	[OP_ACCESS]		= (nfsd4_dec)nfsd4_decode_access,
+	[OP_CLOSE]		= (nfsd4_dec)nfsd4_decode_close,
+	[OP_COMMIT]		= (nfsd4_dec)nfsd4_decode_commit,
+	[OP_CREATE]		= (nfsd4_dec)nfsd4_decode_create,
+	[OP_DELEGPURGE]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DELEGRETURN]	= (nfsd4_dec)nfsd4_decode_delegreturn,
+	[OP_GETATTR]		= (nfsd4_dec)nfsd4_decode_getattr,
+	[OP_GETFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_LINK]		= (nfsd4_dec)nfsd4_decode_link,
+	[OP_LOCK]		= (nfsd4_dec)nfsd4_decode_lock,
+	[OP_LOCKT]		= (nfsd4_dec)nfsd4_decode_lockt,
+	[OP_LOCKU]		= (nfsd4_dec)nfsd4_decode_locku,
+	[OP_LOOKUP]		= (nfsd4_dec)nfsd4_decode_lookup,
+	[OP_LOOKUPP]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_NVERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
+	[OP_OPEN]		= (nfsd4_dec)nfsd4_decode_open,
+	[OP_OPENATTR]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OPEN_CONFIRM]	= (nfsd4_dec)nfsd4_decode_open_confirm,
+	[OP_OPEN_DOWNGRADE]	= (nfsd4_dec)nfsd4_decode_open_downgrade,
+	[OP_PUTFH]		= (nfsd4_dec)nfsd4_decode_putfh,
+	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_putpubfh,
+	[OP_PUTROOTFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_READ]		= (nfsd4_dec)nfsd4_decode_read,
+	[OP_READDIR]		= (nfsd4_dec)nfsd4_decode_readdir,
+	[OP_READLINK]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_REMOVE]		= (nfsd4_dec)nfsd4_decode_remove,
+	[OP_RENAME]		= (nfsd4_dec)nfsd4_decode_rename,
+	[OP_RENEW]		= (nfsd4_dec)nfsd4_decode_renew,
+	[OP_RESTOREFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_SAVEFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_SECINFO]		= (nfsd4_dec)nfsd4_decode_secinfo,
+	[OP_SETATTR]		= (nfsd4_dec)nfsd4_decode_setattr,
+	[OP_SETCLIENTID]	= (nfsd4_dec)nfsd4_decode_setclientid,
+	[OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm,
+	[OP_VERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
+	[OP_WRITE]		= (nfsd4_dec)nfsd4_decode_write,
+	[OP_RELEASE_LOCKOWNER]	= (nfsd4_dec)nfsd4_decode_release_lockowner,
+
+	/* new operations for NFSv4.1 */
+	[OP_BACKCHANNEL_CTL]	= (nfsd4_dec)nfsd4_decode_backchannel_ctl,
+	[OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
+	[OP_EXCHANGE_ID]	= (nfsd4_dec)nfsd4_decode_exchange_id,
+	[OP_CREATE_SESSION]	= (nfsd4_dec)nfsd4_decode_create_session,
+	[OP_DESTROY_SESSION]	= (nfsd4_dec)nfsd4_decode_destroy_session,
+	[OP_FREE_STATEID]	= (nfsd4_dec)nfsd4_decode_free_stateid,
+	[OP_GET_DIR_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_getdeviceinfo,
+	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_layoutcommit,
+	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_layoutget,
+	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_layoutreturn,
+#else
+	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_notsupp,
+#endif
+	[OP_SECINFO_NO_NAME]	= (nfsd4_dec)nfsd4_decode_secinfo_no_name,
+	[OP_SEQUENCE]		= (nfsd4_dec)nfsd4_decode_sequence,
+	[OP_SET_SSV]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_test_stateid,
+	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_destroy_clientid,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
+
+	/* new operations for NFSv4.2 */
+	[OP_ALLOCATE]		= (nfsd4_dec)nfsd4_decode_fallocate,
+	[OP_COPY]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DEALLOCATE]		= (nfsd4_dec)nfsd4_decode_fallocate,
+	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTERROR]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTSTATS]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OFFLOAD_CANCEL]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
+	[OP_WRITE_SAME]		= (nfsd4_dec)nfsd4_decode_notsupp,
+};
+
+static inline bool
+nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
+{
+	if (op->opnum < FIRST_NFS4_OP)
+		return false;
+	else if (argp->minorversion == 0 && op->opnum > LAST_NFS40_OP)
+		return false;
+	else if (argp->minorversion == 1 && op->opnum > LAST_NFS41_OP)
+		return false;
+	else if (argp->minorversion == 2 && op->opnum > LAST_NFS42_OP)
+		return false;
+	return true;
+}
+
+static __be32
+nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
+{
+	DECODE_HEAD;
+	struct nfsd4_op *op;
+	bool cachethis = false;
+	int auth_slack= argp->rqstp->rq_auth_slack;
+	int max_reply = auth_slack + 8; /* opcnt, status */
+	int readcount = 0;
+	int readbytes = 0;
+	int i;
+
+	READ_BUF(4);
+	argp->taglen = be32_to_cpup(p++);
+	READ_BUF(argp->taglen + 8);
+	SAVEMEM(argp->tag, argp->taglen);
+	argp->minorversion = be32_to_cpup(p++);
+	argp->opcnt = be32_to_cpup(p++);
+	max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
+
+	if (argp->taglen > NFSD4_MAX_TAGLEN)
+		goto xdr_error;
+	if (argp->opcnt > 100)
+		goto xdr_error;
+
+	if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
+		argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
+		if (!argp->ops) {
+			argp->ops = argp->iops;
+			dprintk("nfsd: couldn't allocate room for COMPOUND\n");
+			goto xdr_error;
+		}
+	}
+
+	if (argp->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+		argp->opcnt = 0;
+
+	for (i = 0; i < argp->opcnt; i++) {
+		op = &argp->ops[i];
+		op->replay = NULL;
+
+		READ_BUF(4);
+		op->opnum = be32_to_cpup(p++);
+
+		if (nfsd4_opnum_in_range(argp, op))
+			op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
+		else {
+			op->opnum = OP_ILLEGAL;
+			op->status = nfserr_op_illegal;
+		}
+		/*
+		 * We'll try to cache the result in the DRC if any one
+		 * op in the compound wants to be cached:
+		 */
+		cachethis |= nfsd4_cache_this_op(op);
+
+		if (op->opnum == OP_READ) {
+			readcount++;
+			readbytes += nfsd4_max_reply(argp->rqstp, op);
+		} else
+			max_reply += nfsd4_max_reply(argp->rqstp, op);
+		/*
+		 * OP_LOCK may return a conflicting lock.  (Special case
+		 * because it will just skip encoding this if it runs
+		 * out of xdr buffer space, and it is the only operation
+		 * that behaves this way.)
+		 */
+		if (op->opnum == OP_LOCK)
+			max_reply += NFS4_OPAQUE_LIMIT;
+
+		if (op->status) {
+			argp->opcnt = i+1;
+			break;
+		}
+	}
+	/* Sessions make the DRC unnecessary: */
+	if (argp->minorversion)
+		cachethis = false;
+	svc_reserve(argp->rqstp, max_reply + readbytes);
+	argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
+
+	if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
+		clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+
+	DECODE_TAIL;
+}
+
+static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode)
+{
+	if (IS_I_VERSION(inode)) {
+		p = xdr_encode_hyper(p, inode->i_version);
+	} else {
+		*p++ = cpu_to_be32(stat->ctime.tv_sec);
+		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
+	}
+	return p;
+}
+
+static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
+{
+	*p++ = cpu_to_be32(c->atomic);
+	if (c->change_supported) {
+		p = xdr_encode_hyper(p, c->before_change);
+		p = xdr_encode_hyper(p, c->after_change);
+	} else {
+		*p++ = cpu_to_be32(c->before_ctime_sec);
+		*p++ = cpu_to_be32(c->before_ctime_nsec);
+		*p++ = cpu_to_be32(c->after_ctime_sec);
+		*p++ = cpu_to_be32(c->after_ctime_nsec);
+	}
+	return p;
+}
+
+/* Encode as an array of strings the string given with components
+ * separated @sep, escaped with esc_enter and esc_exit.
+ */
+static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
+					  char *components, char esc_enter,
+					  char esc_exit)
+{
+	__be32 *p;
+	__be32 pathlen;
+	int pathlen_offset;
+	int strlen, count=0;
+	char *str, *end, *next;
+
+	dprintk("nfsd4_encode_components(%s)\n", components);
+
+	pathlen_offset = xdr->buf->len;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	p++; /* We will fill this in with @count later */
+
+	end = str = components;
+	while (*end) {
+		bool found_esc = false;
+
+		/* try to parse as esc_start, ..., esc_end, sep */
+		if (*str == esc_enter) {
+			for (; *end && (*end != esc_exit); end++)
+				/* find esc_exit or end of string */;
+			next = end + 1;
+			if (*end && (!*next || *next == sep)) {
+				str++;
+				found_esc = true;
+			}
+		}
+
+		if (!found_esc)
+			for (; *end && (*end != sep); end++)
+				/* find sep or end of string */;
+
+		strlen = end - str;
+		if (strlen) {
+			p = xdr_reserve_space(xdr, strlen + 4);
+			if (!p)
+				return nfserr_resource;
+			p = xdr_encode_opaque(p, str, strlen);
+			count++;
+		}
+		else
+			end++;
+		if (found_esc)
+			end = next;
+
+		str = end;
+	}
+	pathlen = htonl(count);
+	write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4);
+	return 0;
+}
+
+/* Encode as an array of strings the string given with components
+ * separated @sep.
+ */
+static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep,
+				      char *components)
+{
+	return nfsd4_encode_components_esc(xdr, sep, components, 0, 0);
+}
+
+/*
+ * encode a location element of a fs_locations structure
+ */
+static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
+					struct nfsd4_fs_location *location)
+{
+	__be32 status;
+
+	status = nfsd4_encode_components_esc(xdr, ':', location->hosts,
+						'[', ']');
+	if (status)
+		return status;
+	status = nfsd4_encode_components(xdr, '/', location->path);
+	if (status)
+		return status;
+	return 0;
+}
+
+/*
+ * Encode a path in RFC3530 'pathname4' format
+ */
+static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
+				const struct path *root,
+				const struct path *path)
+{
+	struct path cur = *path;
+	__be32 *p;
+	struct dentry **components = NULL;
+	unsigned int ncomponents = 0;
+	__be32 err = nfserr_jukebox;
+
+	dprintk("nfsd4_encode_components(");
+
+	path_get(&cur);
+	/* First walk the path up to the nfsd root, and store the
+	 * dentries/path components in an array.
+	 */
+	for (;;) {
+		if (path_equal(&cur, root))
+			break;
+		if (cur.dentry == cur.mnt->mnt_root) {
+			if (follow_up(&cur))
+				continue;
+			goto out_free;
+		}
+		if ((ncomponents & 15) == 0) {
+			struct dentry **new;
+			new = krealloc(components,
+					sizeof(*new) * (ncomponents + 16),
+					GFP_KERNEL);
+			if (!new)
+				goto out_free;
+			components = new;
+		}
+		components[ncomponents++] = cur.dentry;
+		cur.dentry = dget_parent(cur.dentry);
+	}
+	err = nfserr_resource;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto out_free;
+	*p++ = cpu_to_be32(ncomponents);
+
+	while (ncomponents) {
+		struct dentry *dentry = components[ncomponents - 1];
+		unsigned int len;
+
+		spin_lock(&dentry->d_lock);
+		len = dentry->d_name.len;
+		p = xdr_reserve_space(xdr, len + 4);
+		if (!p) {
+			spin_unlock(&dentry->d_lock);
+			goto out_free;
+		}
+		p = xdr_encode_opaque(p, dentry->d_name.name, len);
+		dprintk("/%pd", dentry);
+		spin_unlock(&dentry->d_lock);
+		dput(dentry);
+		ncomponents--;
+	}
+
+	err = 0;
+out_free:
+	dprintk(")\n");
+	while (ncomponents)
+		dput(components[--ncomponents]);
+	kfree(components);
+	path_put(&cur);
+	return err;
+}
+
+static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr,
+			struct svc_rqst *rqstp, const struct path *path)
+{
+	struct svc_export *exp_ps;
+	__be32 res;
+
+	exp_ps = rqst_find_fsidzero_export(rqstp);
+	if (IS_ERR(exp_ps))
+		return nfserrno(PTR_ERR(exp_ps));
+	res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path);
+	exp_put(exp_ps);
+	return res;
+}
+
+/*
+ *  encode a fs_locations structure
+ */
+static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr,
+			struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	__be32 status;
+	int i;
+	__be32 *p;
+	struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
+
+	status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path);
+	if (status)
+		return status;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(fslocs->locations_count);
+	for (i=0; i<fslocs->locations_count; i++) {
+		status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]);
+		if (status)
+			return status;
+	}
+	return 0;
+}
+
+static u32 nfs4_file_type(umode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFIFO:	return NF4FIFO;
+	case S_IFCHR:	return NF4CHR;
+	case S_IFDIR:	return NF4DIR;
+	case S_IFBLK:	return NF4BLK;
+	case S_IFLNK:	return NF4LNK;
+	case S_IFREG:	return NF4REG;
+	case S_IFSOCK:	return NF4SOCK;
+	default:	return NF4BAD;
+	};
+}
+
+static inline __be32
+nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+		     struct nfs4_ace *ace)
+{
+	if (ace->whotype != NFS4_ACL_WHO_NAMED)
+		return nfs4_acl_write_who(xdr, ace->whotype);
+	else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+		return nfsd4_encode_group(xdr, rqstp, ace->who_gid);
+	else
+		return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
+}
+
+#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
+			      FATTR4_WORD0_RDATTR_ERROR)
+#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static inline __be32
+nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+			    void *context, int len)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, len + 4 + 4 + 4);
+	if (!p)
+		return nfserr_resource;
+
+	/*
+	 * For now we use a 0 here to indicate the null translation; in
+	 * the future we may place a call to translation code here.
+	 */
+	*p++ = cpu_to_be32(0); /* lfs */
+	*p++ = cpu_to_be32(0); /* pi */
+	p = xdr_encode_opaque(p, context, len);
+	return 0;
+}
+#else
+static inline __be32
+nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+			    void *context, int len)
+{ return 0; }
+#endif
+
+static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
+{
+	/* As per referral draft:  */
+	if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS ||
+	    *bmval1 & ~WORD1_ABSENT_FS_ATTRS) {
+		if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR ||
+	            *bmval0 & FATTR4_WORD0_FS_LOCATIONS)
+			*rdattr_err = NFSERR_MOVED;
+		else
+			return nfserr_moved;
+	}
+	*bmval0 &= WORD0_ABSENT_FS_ATTRS;
+	*bmval1 &= WORD1_ABSENT_FS_ATTRS;
+	return 0;
+}
+
+
+static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+{
+	struct path path = exp->ex_path;
+	int err;
+
+	path_get(&path);
+	while (follow_up(&path)) {
+		if (path.dentry != path.mnt->mnt_root)
+			break;
+	}
+	err = vfs_getattr(&path, stat);
+	path_put(&path);
+	return err;
+}
+
+/*
+ * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
+ * ourselves.
+ */
+static __be32
+nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+		struct svc_export *exp,
+		struct dentry *dentry, u32 *bmval,
+		struct svc_rqst *rqstp, int ignore_crossmnt)
+{
+	u32 bmval0 = bmval[0];
+	u32 bmval1 = bmval[1];
+	u32 bmval2 = bmval[2];
+	struct kstat stat;
+	struct svc_fh *tempfh = NULL;
+	struct kstatfs statfs;
+	__be32 *p;
+	int starting_len = xdr->buf->len;
+	int attrlen_offset;
+	__be32 attrlen;
+	u32 dummy;
+	u64 dummy64;
+	u32 rdattr_err = 0;
+	__be32 status;
+	int err;
+	int aclsupport = 0;
+	struct nfs4_acl *acl = NULL;
+	void *context = NULL;
+	int contextlen;
+	bool contextsupport = false;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	u32 minorversion = resp->cstate.minorversion;
+	struct path path = {
+		.mnt	= exp->ex_path.mnt,
+		.dentry	= dentry,
+	};
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
+	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
+	BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
+	BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
+
+	if (exp->ex_fslocs.migrated) {
+		BUG_ON(bmval[2]);
+		status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
+		if (status)
+			goto out;
+	}
+
+	err = vfs_getattr(&path, &stat);
+	if (err)
+		goto out_nfserr;
+	if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+			FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
+	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+		       FATTR4_WORD1_SPACE_TOTAL))) {
+		err = vfs_statfs(&path, &statfs);
+		if (err)
+			goto out_nfserr;
+	}
+	if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
+		tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
+		status = nfserr_jukebox;
+		if (!tempfh)
+			goto out;
+		fh_init(tempfh, NFS4_FHSIZE);
+		status = fh_compose(tempfh, exp, dentry, NULL);
+		if (status)
+			goto out;
+		fhp = tempfh;
+	}
+	if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
+			| FATTR4_WORD0_SUPPORTED_ATTRS)) {
+		err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
+		aclsupport = (err == 0);
+		if (bmval0 & FATTR4_WORD0_ACL) {
+			if (err == -EOPNOTSUPP)
+				bmval0 &= ~FATTR4_WORD0_ACL;
+			else if (err == -EINVAL) {
+				status = nfserr_attrnotsupp;
+				goto out;
+			} else if (err != 0)
+				goto out_nfserr;
+		}
+	}
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+			bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+		err = security_inode_getsecctx(d_inode(dentry),
+						&context, &contextlen);
+		contextsupport = (err == 0);
+		if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+			if (err == -EOPNOTSUPP)
+				bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+			else if (err)
+				goto out_nfserr;
+		}
+	}
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+
+	if (bmval2) {
+		p = xdr_reserve_space(xdr, 16);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(3);
+		*p++ = cpu_to_be32(bmval0);
+		*p++ = cpu_to_be32(bmval1);
+		*p++ = cpu_to_be32(bmval2);
+	} else if (bmval1) {
+		p = xdr_reserve_space(xdr, 12);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(2);
+		*p++ = cpu_to_be32(bmval0);
+		*p++ = cpu_to_be32(bmval1);
+	} else {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(1);
+		*p++ = cpu_to_be32(bmval0);
+	}
+
+	attrlen_offset = xdr->buf->len;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto out_resource;
+	p++;                /* to be backfilled later */
+
+	if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+		u32 word0 = nfsd_suppattrs0(minorversion);
+		u32 word1 = nfsd_suppattrs1(minorversion);
+		u32 word2 = nfsd_suppattrs2(minorversion);
+
+		if (!aclsupport)
+			word0 &= ~FATTR4_WORD0_ACL;
+		if (!contextsupport)
+			word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+		if (!word2) {
+			p = xdr_reserve_space(xdr, 12);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(2);
+			*p++ = cpu_to_be32(word0);
+			*p++ = cpu_to_be32(word1);
+		} else {
+			p = xdr_reserve_space(xdr, 16);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(3);
+			*p++ = cpu_to_be32(word0);
+			*p++ = cpu_to_be32(word1);
+			*p++ = cpu_to_be32(word2);
+		}
+	}
+	if (bmval0 & FATTR4_WORD0_TYPE) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		dummy = nfs4_file_type(stat.mode);
+		if (dummy == NF4BAD) {
+			status = nfserr_serverfault;
+			goto out;
+		}
+		*p++ = cpu_to_be32(dummy);
+	}
+	if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
+			*p++ = cpu_to_be32(NFS4_FH_PERSISTENT);
+		else
+			*p++ = cpu_to_be32(NFS4_FH_PERSISTENT|
+						NFS4_FH_VOL_RENAME);
+	}
+	if (bmval0 & FATTR4_WORD0_CHANGE) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = encode_change(p, &stat, d_inode(dentry));
+	}
+	if (bmval0 & FATTR4_WORD0_SIZE) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, stat.size);
+	}
+	if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(1);
+	}
+	if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(1);
+	}
+	if (bmval0 & FATTR4_WORD0_NAMED_ATTR) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(0);
+	}
+	if (bmval0 & FATTR4_WORD0_FSID) {
+		p = xdr_reserve_space(xdr, 16);
+		if (!p)
+			goto out_resource;
+		if (exp->ex_fslocs.migrated) {
+			p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
+			p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
+		} else switch(fsid_source(fhp)) {
+		case FSIDSOURCE_FSID:
+			p = xdr_encode_hyper(p, (u64)exp->ex_fsid);
+			p = xdr_encode_hyper(p, (u64)0);
+			break;
+		case FSIDSOURCE_DEV:
+			*p++ = cpu_to_be32(0);
+			*p++ = cpu_to_be32(MAJOR(stat.dev));
+			*p++ = cpu_to_be32(0);
+			*p++ = cpu_to_be32(MINOR(stat.dev));
+			break;
+		case FSIDSOURCE_UUID:
+			p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
+								EX_UUID_LEN);
+			break;
+		}
+	}
+	if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(0);
+	}
+	if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(nn->nfsd4_lease);
+	}
+	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(rdattr_err);
+	}
+	if (bmval0 & FATTR4_WORD0_ACL) {
+		struct nfs4_ace *ace;
+
+		if (acl == NULL) {
+			p = xdr_reserve_space(xdr, 4);
+			if (!p)
+				goto out_resource;
+
+			*p++ = cpu_to_be32(0);
+			goto out_acl;
+		}
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(acl->naces);
+
+		for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
+			p = xdr_reserve_space(xdr, 4*3);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(ace->type);
+			*p++ = cpu_to_be32(ace->flag);
+			*p++ = cpu_to_be32(ace->access_mask &
+							NFS4_ACE_MASK_ALL);
+			status = nfsd4_encode_aclname(xdr, rqstp, ace);
+			if (status)
+				goto out;
+		}
+	}
+out_acl:
+	if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(aclsupport ?
+			ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
+	}
+	if (bmval0 & FATTR4_WORD0_CANSETTIME) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(1);
+	}
+	if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(0);
+	}
+	if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(1);
+	}
+	if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(1);
+	}
+	if (bmval0 & FATTR4_WORD0_FILEHANDLE) {
+		p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
+					fhp->fh_handle.fh_size);
+	}
+	if (bmval0 & FATTR4_WORD0_FILEID) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, stat.ino);
+	}
+	if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
+	}
+	if (bmval0 & FATTR4_WORD0_FILES_FREE) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
+	}
+	if (bmval0 & FATTR4_WORD0_FILES_TOTAL) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, (u64) statfs.f_files);
+	}
+	if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
+		status = nfsd4_encode_fs_locations(xdr, rqstp, exp);
+		if (status)
+			goto out;
+	}
+	if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(1);
+	}
+	if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes);
+	}
+	if (bmval0 & FATTR4_WORD0_MAXLINK) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(255);
+	}
+	if (bmval0 & FATTR4_WORD0_MAXNAME) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(statfs.f_namelen);
+	}
+	if (bmval0 & FATTR4_WORD0_MAXREAD) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
+	}
+	if (bmval0 & FATTR4_WORD0_MAXWRITE) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
+	}
+	if (bmval1 & FATTR4_WORD1_MODE) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(stat.mode & S_IALLUGO);
+	}
+	if (bmval1 & FATTR4_WORD1_NO_TRUNC) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(1);
+	}
+	if (bmval1 & FATTR4_WORD1_NUMLINKS) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(stat.nlink);
+	}
+	if (bmval1 & FATTR4_WORD1_OWNER) {
+		status = nfsd4_encode_user(xdr, rqstp, stat.uid);
+		if (status)
+			goto out;
+	}
+	if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
+		status = nfsd4_encode_group(xdr, rqstp, stat.gid);
+		if (status)
+			goto out;
+	}
+	if (bmval1 & FATTR4_WORD1_RAWDEV) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32((u32) MAJOR(stat.rdev));
+		*p++ = cpu_to_be32((u32) MINOR(stat.rdev));
+	}
+	if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize;
+		p = xdr_encode_hyper(p, dummy64);
+	}
+	if (bmval1 & FATTR4_WORD1_SPACE_FREE) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize;
+		p = xdr_encode_hyper(p, dummy64);
+	}
+	if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize;
+		p = xdr_encode_hyper(p, dummy64);
+	}
+	if (bmval1 & FATTR4_WORD1_SPACE_USED) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			goto out_resource;
+		dummy64 = (u64)stat.blocks << 9;
+		p = xdr_encode_hyper(p, dummy64);
+	}
+	if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
+		p = xdr_reserve_space(xdr, 12);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec);
+		*p++ = cpu_to_be32(stat.atime.tv_nsec);
+	}
+	if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
+		p = xdr_reserve_space(xdr, 12);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(1);
+		*p++ = cpu_to_be32(0);
+	}
+	if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
+		p = xdr_reserve_space(xdr, 12);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec);
+		*p++ = cpu_to_be32(stat.ctime.tv_nsec);
+	}
+	if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
+		p = xdr_reserve_space(xdr, 12);
+		if (!p)
+			goto out_resource;
+		p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
+		*p++ = cpu_to_be32(stat.mtime.tv_nsec);
+	}
+	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+                	goto out_resource;
+		/*
+		 * Get parent's attributes if not ignoring crossmount
+		 * and this is the root of a cross-mounted filesystem.
+		 */
+		if (ignore_crossmnt == 0 &&
+		    dentry == exp->ex_path.mnt->mnt_root)
+			get_parent_attributes(exp, &stat);
+		p = xdr_encode_hyper(p, stat.ino);
+	}
+#ifdef CONFIG_NFSD_PNFS
+	if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
+	    (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
+		if (exp->ex_layout_type) {
+			p = xdr_reserve_space(xdr, 8);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(1);
+			*p++ = cpu_to_be32(exp->ex_layout_type);
+		} else {
+			p = xdr_reserve_space(xdr, 4);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(0);
+		}
+	}
+
+	if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(stat.blksize);
+	}
+#endif /* CONFIG_NFSD_PNFS */
+	if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+		status = nfsd4_encode_security_label(xdr, rqstp, context,
+								contextlen);
+		if (status)
+			goto out;
+	}
+	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+		p = xdr_reserve_space(xdr, 16);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(3);
+		*p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
+		*p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
+		*p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
+	}
+
+	attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
+	write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
+	status = nfs_ok;
+
+out:
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	if (context)
+		security_release_secctx(context, contextlen);
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+	kfree(acl);
+	if (tempfh) {
+		fh_put(tempfh);
+		kfree(tempfh);
+	}
+	if (status)
+		xdr_truncate_encode(xdr, starting_len);
+	return status;
+out_nfserr:
+	status = nfserrno(err);
+	goto out;
+out_resource:
+	status = nfserr_resource;
+	goto out;
+}
+
+static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
+				struct xdr_buf *buf, __be32 *p, int bytes)
+{
+	xdr->scratch.iov_len = 0;
+	memset(buf, 0, sizeof(struct xdr_buf));
+	buf->head[0].iov_base = p;
+	buf->head[0].iov_len = 0;
+	buf->len = 0;
+	xdr->buf = buf;
+	xdr->iov = buf->head;
+	xdr->p = p;
+	xdr->end = (void *)p + bytes;
+	buf->buflen = bytes;
+}
+
+__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
+			struct svc_fh *fhp, struct svc_export *exp,
+			struct dentry *dentry, u32 *bmval,
+			struct svc_rqst *rqstp, int ignore_crossmnt)
+{
+	struct xdr_buf dummy;
+	struct xdr_stream xdr;
+	__be32 ret;
+
+	svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2);
+	ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp,
+							ignore_crossmnt);
+	*p = xdr.p;
+	return ret;
+}
+
+static inline int attributes_need_mount(u32 *bmval)
+{
+	if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME))
+		return 1;
+	if (bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID)
+		return 1;
+	return 0;
+}
+
+static __be32
+nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
+			const char *name, int namlen)
+{
+	struct svc_export *exp = cd->rd_fhp->fh_export;
+	struct dentry *dentry;
+	__be32 nfserr;
+	int ignore_crossmnt = 0;
+
+	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	if (IS_ERR(dentry))
+		return nfserrno(PTR_ERR(dentry));
+	if (d_really_is_negative(dentry)) {
+		/*
+		 * nfsd_buffered_readdir drops the i_mutex between
+		 * readdir and calling this callback, leaving a window
+		 * where this directory entry could have gone away.
+		 */
+		dput(dentry);
+		return nfserr_noent;
+	}
+
+	exp_get(exp);
+	/*
+	 * In the case of a mountpoint, the client may be asking for
+	 * attributes that are only properties of the underlying filesystem
+	 * as opposed to the cross-mounted file system. In such a case,
+	 * we will not follow the cross mount and will fill the attribtutes
+	 * directly from the mountpoint dentry.
+	 */
+	if (nfsd_mountpoint(dentry, exp)) {
+		int err;
+
+		if (!(exp->ex_flags & NFSEXP_V4ROOT)
+				&& !attributes_need_mount(cd->rd_bmval)) {
+			ignore_crossmnt = 1;
+			goto out_encode;
+		}
+		/*
+		 * Why the heck aren't we just using nfsd_lookup??
+		 * Different "."/".." handling?  Something else?
+		 * At least, add a comment here to explain....
+		 */
+		err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp);
+		if (err) {
+			nfserr = nfserrno(err);
+			goto out_put;
+		}
+		nfserr = check_nfsd_access(exp, cd->rd_rqstp);
+		if (nfserr)
+			goto out_put;
+
+	}
+out_encode:
+	nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval,
+					cd->rd_rqstp, ignore_crossmnt);
+out_put:
+	dput(dentry);
+	exp_put(exp);
+	return nfserr;
+}
+
+static __be32 *
+nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 20);
+	if (!p)
+		return NULL;
+	*p++ = htonl(2);
+	*p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
+	*p++ = htonl(0);			 /* bmval1 */
+
+	*p++ = htonl(4);     /* attribute length */
+	*p++ = nfserr;       /* no htonl */
+	return p;
+}
+
+static int
+nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
+		    loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct readdir_cd *ccd = ccdv;
+	struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
+	struct xdr_stream *xdr = cd->xdr;
+	int start_offset = xdr->buf->len;
+	int cookie_offset;
+	u32 name_and_cookie;
+	int entry_bytes;
+	__be32 nfserr = nfserr_toosmall;
+	__be64 wire_offset;
+	__be32 *p;
+
+	/* In nfsv4, "." and ".." never make it onto the wire.. */
+	if (name && isdotent(name, namlen)) {
+		cd->common.err = nfs_ok;
+		return 0;
+	}
+
+	if (cd->cookie_offset) {
+		wire_offset = cpu_to_be64(offset);
+		write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset,
+							&wire_offset, 8);
+	}
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto fail;
+	*p++ = xdr_one;                             /* mark entry present */
+	cookie_offset = xdr->buf->len;
+	p = xdr_reserve_space(xdr, 3*4 + namlen);
+	if (!p)
+		goto fail;
+	p = xdr_encode_hyper(p, NFS_OFFSET_MAX);    /* offset of next entry */
+	p = xdr_encode_array(p, name, namlen);      /* name length & name */
+
+	nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
+	switch (nfserr) {
+	case nfs_ok:
+		break;
+	case nfserr_resource:
+		nfserr = nfserr_toosmall;
+		goto fail;
+	case nfserr_noent:
+		xdr_truncate_encode(xdr, start_offset);
+		goto skip_entry;
+	default:
+		/*
+		 * If the client requested the RDATTR_ERROR attribute,
+		 * we stuff the error code into this attribute
+		 * and continue.  If this attribute was not requested,
+		 * then in accordance with the spec, we fail the
+		 * entire READDIR operation(!)
+		 */
+		if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
+			goto fail;
+		p = nfsd4_encode_rdattr_error(xdr, nfserr);
+		if (p == NULL) {
+			nfserr = nfserr_toosmall;
+			goto fail;
+		}
+	}
+	nfserr = nfserr_toosmall;
+	entry_bytes = xdr->buf->len - start_offset;
+	if (entry_bytes > cd->rd_maxcount)
+		goto fail;
+	cd->rd_maxcount -= entry_bytes;
+	/*
+	 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
+	 * let's always let through the first entry, at least:
+	 */
+	if (!cd->rd_dircount)
+		goto fail;
+	name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
+	if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+		goto fail;
+	cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+
+	cd->cookie_offset = cookie_offset;
+skip_entry:
+	cd->common.err = nfs_ok;
+	return 0;
+fail:
+	xdr_truncate_encode(xdr, start_offset);
+	cd->common.err = nfserr;
+	return -EINVAL;
+}
+
+static __be32
+nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, sizeof(stateid_t));
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(sid->si_generation);
+	p = xdr_encode_opaque_fixed(p, &sid->si_opaque,
+					sizeof(stateid_opaque_t));
+	return 0;
+}
+
+static __be32
+nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			return nfserr_resource;
+		*p++ = cpu_to_be32(access->ac_supported);
+		*p++ = cpu_to_be32(access->ac_resp_access);
+	}
+	return nfserr;
+}
+
+static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
+		if (!p)
+			return nfserr_resource;
+		p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
+						NFS4_MAX_SESSIONID_LEN);
+		*p++ = cpu_to_be32(bcts->dir);
+		/* Sorry, we do not yet support RDMA over 4.1: */
+		*p++ = cpu_to_be32(0);
+	}
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	if (!nfserr)
+		nfserr = nfsd4_encode_stateid(xdr, &close->cl_stateid);
+
+	return nfserr;
+}
+
+
+static __be32
+nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+		if (!p)
+			return nfserr_resource;
+		p = xdr_encode_opaque_fixed(p, commit->co_verf.data,
+						NFS4_VERIFIER_SIZE);
+	}
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, 32);
+		if (!p)
+			return nfserr_resource;
+		p = encode_cinfo(p, &create->cr_cinfo);
+		*p++ = cpu_to_be32(2);
+		*p++ = cpu_to_be32(create->cr_bmval[0]);
+		*p++ = cpu_to_be32(create->cr_bmval[1]);
+	}
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
+{
+	struct svc_fh *fhp = getattr->ga_fhp;
+	struct xdr_stream *xdr = &resp->xdr;
+
+	if (nfserr)
+		return nfserr;
+
+	nfserr = nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
+				    getattr->ga_bmval,
+				    resp->rqstp, 0);
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	struct svc_fh *fhp = *fhpp;
+	unsigned int len;
+	__be32 *p;
+
+	if (!nfserr) {
+		len = fhp->fh_handle.fh_size;
+		p = xdr_reserve_space(xdr, len + 4);
+		if (!p)
+			return nfserr_resource;
+		p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
+	}
+	return nfserr;
+}
+
+/*
+* Including all fields other than the name, a LOCK4denied structure requires
+*   8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes.
+*/
+static __be32
+nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
+{
+	struct xdr_netobj *conf = &ld->ld_owner;
+	__be32 *p;
+
+again:
+	p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len));
+	if (!p) {
+		/*
+		 * Don't fail to return the result just because we can't
+		 * return the conflicting open:
+		 */
+		if (conf->len) {
+			kfree(conf->data);
+			conf->len = 0;
+			conf->data = NULL;
+			goto again;
+		}
+		return nfserr_resource;
+	}
+	p = xdr_encode_hyper(p, ld->ld_start);
+	p = xdr_encode_hyper(p, ld->ld_length);
+	*p++ = cpu_to_be32(ld->ld_type);
+	if (conf->len) {
+		p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8);
+		p = xdr_encode_opaque(p, conf->data, conf->len);
+		kfree(conf->data);
+	}  else {  /* non - nfsv4 lock in conflict, no clientid nor owner */
+		p = xdr_encode_hyper(p, (u64)0); /* clientid */
+		*p++ = cpu_to_be32(0); /* length of owner name */
+	}
+	return nfserr_denied;
+}
+
+static __be32
+nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	if (!nfserr)
+		nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
+	else if (nfserr == nfserr_denied)
+		nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied);
+
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	if (nfserr == nfserr_denied)
+		nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	if (!nfserr)
+		nfserr = nfsd4_encode_stateid(xdr, &locku->lu_stateid);
+
+	return nfserr;
+}
+
+
+static __be32
+nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, 20);
+		if (!p)
+			return nfserr_resource;
+		p = encode_cinfo(p, &link->li_cinfo);
+	}
+	return nfserr;
+}
+
+
+static __be32
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		goto out;
+
+	nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
+	if (nfserr)
+		goto out;
+	p = xdr_reserve_space(xdr, 40);
+	if (!p)
+		return nfserr_resource;
+	p = encode_cinfo(p, &open->op_cinfo);
+	*p++ = cpu_to_be32(open->op_rflags);
+	*p++ = cpu_to_be32(2);
+	*p++ = cpu_to_be32(open->op_bmval[0]);
+	*p++ = cpu_to_be32(open->op_bmval[1]);
+	*p++ = cpu_to_be32(open->op_delegate_type);
+
+	switch (open->op_delegate_type) {
+	case NFS4_OPEN_DELEGATE_NONE:
+		break;
+	case NFS4_OPEN_DELEGATE_READ:
+		nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
+		if (nfserr)
+			return nfserr;
+		p = xdr_reserve_space(xdr, 20);
+		if (!p)
+			return nfserr_resource;
+		*p++ = cpu_to_be32(open->op_recall);
+
+		/*
+		 * TODO: ACE's in delegations
+		 */
+		*p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(0);   /* XXX: is NULL principal ok? */
+		break;
+	case NFS4_OPEN_DELEGATE_WRITE:
+		nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
+		if (nfserr)
+			return nfserr;
+		p = xdr_reserve_space(xdr, 32);
+		if (!p)
+			return nfserr_resource;
+		*p++ = cpu_to_be32(0);
+
+		/*
+		 * TODO: space_limit's in delegations
+		 */
+		*p++ = cpu_to_be32(NFS4_LIMIT_SIZE);
+		*p++ = cpu_to_be32(~(u32)0);
+		*p++ = cpu_to_be32(~(u32)0);
+
+		/*
+		 * TODO: ACE's in delegations
+		 */
+		*p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(0);   /* XXX: is NULL principal ok? */
+		break;
+	case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
+		switch (open->op_why_no_deleg) {
+		case WND4_CONTENTION:
+		case WND4_RESOURCE:
+			p = xdr_reserve_space(xdr, 8);
+			if (!p)
+				return nfserr_resource;
+			*p++ = cpu_to_be32(open->op_why_no_deleg);
+			/* deleg signaling not supported yet: */
+			*p++ = cpu_to_be32(0);
+			break;
+		default:
+			p = xdr_reserve_space(xdr, 4);
+			if (!p)
+				return nfserr_resource;
+			*p++ = cpu_to_be32(open->op_why_no_deleg);
+		}
+		break;
+	default:
+		BUG();
+	}
+	/* XXX save filehandle here */
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	if (!nfserr)
+		nfserr = nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
+
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	if (!nfserr)
+		nfserr = nfsd4_encode_stateid(xdr, &od->od_stateid);
+
+	return nfserr;
+}
+
+static __be32 nfsd4_encode_splice_read(
+				struct nfsd4_compoundres *resp,
+				struct nfsd4_read *read,
+				struct file *file, unsigned long maxcount)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_buf *buf = xdr->buf;
+	u32 eof;
+	int space_left;
+	__be32 nfserr;
+	__be32 *p = xdr->p - 2;
+
+	/* Make sure there will be room for padding if needed */
+	if (xdr->end - xdr->p < 1)
+		return nfserr_resource;
+
+	nfserr = nfsd_splice_read(read->rd_rqstp, file,
+				  read->rd_offset, &maxcount);
+	if (nfserr) {
+		/*
+		 * nfsd_splice_actor may have already messed with the
+		 * page length; reset it so as not to confuse
+		 * xdr_truncate_encode:
+		 */
+		buf->page_len = 0;
+		return nfserr;
+	}
+
+	eof = (read->rd_offset + maxcount >=
+	       d_inode(read->rd_fhp->fh_dentry)->i_size);
+
+	*(p++) = htonl(eof);
+	*(p++) = htonl(maxcount);
+
+	buf->page_len = maxcount;
+	buf->len += maxcount;
+	xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+							/ PAGE_SIZE;
+
+	/* Use rest of head for padding and remaining ops: */
+	buf->tail[0].iov_base = xdr->p;
+	buf->tail[0].iov_len = 0;
+	xdr->iov = buf->tail;
+	if (maxcount&3) {
+		int pad = 4 - (maxcount&3);
+
+		*(xdr->p++) = 0;
+
+		buf->tail[0].iov_base += maxcount&3;
+		buf->tail[0].iov_len = pad;
+		buf->len += pad;
+	}
+
+	space_left = min_t(int, (void *)xdr->end - (void *)xdr->p,
+				buf->buflen - buf->len);
+	buf->buflen = buf->len + space_left;
+	xdr->end = (__be32 *)((void *)xdr->end + space_left);
+
+	return 0;
+}
+
+static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
+				 struct nfsd4_read *read,
+				 struct file *file, unsigned long maxcount)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	u32 eof;
+	int v;
+	int starting_len = xdr->buf->len - 8;
+	long len;
+	int thislen;
+	__be32 nfserr;
+	__be32 tmp;
+	__be32 *p;
+	u32 zzz = 0;
+	int pad;
+
+	len = maxcount;
+	v = 0;
+
+	thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
+	p = xdr_reserve_space(xdr, (thislen+3)&~3);
+	WARN_ON_ONCE(!p);
+	resp->rqstp->rq_vec[v].iov_base = p;
+	resp->rqstp->rq_vec[v].iov_len = thislen;
+	v++;
+	len -= thislen;
+
+	while (len) {
+		thislen = min_t(long, len, PAGE_SIZE);
+		p = xdr_reserve_space(xdr, (thislen+3)&~3);
+		WARN_ON_ONCE(!p);
+		resp->rqstp->rq_vec[v].iov_base = p;
+		resp->rqstp->rq_vec[v].iov_len = thislen;
+		v++;
+		len -= thislen;
+	}
+	read->rd_vlen = v;
+
+	nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
+			read->rd_vlen, &maxcount);
+	if (nfserr)
+		return nfserr;
+	xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
+
+	eof = (read->rd_offset + maxcount >=
+	       d_inode(read->rd_fhp->fh_dentry)->i_size);
+
+	tmp = htonl(eof);
+	write_bytes_to_xdr_buf(xdr->buf, starting_len    , &tmp, 4);
+	tmp = htonl(maxcount);
+	write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+
+	pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
+	write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
+								&zzz, pad);
+	return 0;
+
+}
+
+static __be32
+nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_read *read)
+{
+	unsigned long maxcount;
+	struct xdr_stream *xdr = &resp->xdr;
+	struct file *file = read->rd_filp;
+	struct svc_fh *fhp = read->rd_fhp;
+	int starting_len = xdr->buf->len;
+	struct raparms *ra;
+	__be32 *p;
+	__be32 err;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
+	if (!p) {
+		WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
+		return nfserr_resource;
+	}
+	if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
+		WARN_ON_ONCE(1);
+		return nfserr_resource;
+	}
+	xdr_commit_encode(xdr);
+
+	maxcount = svc_max_payload(resp->rqstp);
+	maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
+	maxcount = min_t(unsigned long, maxcount, read->rd_length);
+
+	if (read->rd_filp)
+		err = nfsd_permission(resp->rqstp, fhp->fh_export,
+				fhp->fh_dentry,
+				NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
+	else
+		err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
+						&file, &ra);
+	if (err)
+		goto err_truncate;
+
+	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
+		err = nfsd4_encode_splice_read(resp, read, file, maxcount);
+	else
+		err = nfsd4_encode_readv(resp, read, file, maxcount);
+
+	if (!read->rd_filp)
+		nfsd_put_tmp_read_open(file, ra);
+
+err_truncate:
+	if (err)
+		xdr_truncate_encode(xdr, starting_len);
+	return err;
+}
+
+static __be32
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
+{
+	int maxcount;
+	__be32 wire_count;
+	int zero = 0;
+	struct xdr_stream *xdr = &resp->xdr;
+	int length_offset = xdr->buf->len;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	maxcount = PAGE_SIZE;
+
+	p = xdr_reserve_space(xdr, maxcount);
+	if (!p)
+		return nfserr_resource;
+	/*
+	 * XXX: By default, the ->readlink() VFS op will truncate symlinks
+	 * if they would overflow the buffer.  Is this kosher in NFSv4?  If
+	 * not, one easy fix is: if ->readlink() precisely fills the buffer,
+	 * assume that truncation occurred, and return NFS4ERR_RESOURCE.
+	 */
+	nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp,
+						(char *)p, &maxcount);
+	if (nfserr == nfserr_isdir)
+		nfserr = nfserr_inval;
+	if (nfserr) {
+		xdr_truncate_encode(xdr, length_offset);
+		return nfserr;
+	}
+
+	wire_count = htonl(maxcount);
+	write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4);
+	xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4));
+	if (maxcount & 3)
+		write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
+						&zero, 4 - (maxcount&3));
+	return 0;
+}
+
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
+{
+	int maxcount;
+	int bytes_left;
+	loff_t offset;
+	__be64 wire_offset;
+	struct xdr_stream *xdr = &resp->xdr;
+	int starting_len = xdr->buf->len;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+	if (!p)
+		return nfserr_resource;
+
+	/* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
+	*p++ = cpu_to_be32(0);
+	*p++ = cpu_to_be32(0);
+	resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p)
+				- (char *)resp->xdr.buf->head[0].iov_base;
+
+	/*
+	 * Number of bytes left for directory entries allowing for the
+	 * final 8 bytes of the readdir and a following failed op:
+	 */
+	bytes_left = xdr->buf->buflen - xdr->buf->len
+			- COMPOUND_ERR_SLACK_SPACE - 8;
+	if (bytes_left < 0) {
+		nfserr = nfserr_resource;
+		goto err_no_verf;
+	}
+	maxcount = min_t(u32, readdir->rd_maxcount, INT_MAX);
+	/*
+	 * Note the rfc defines rd_maxcount as the size of the
+	 * READDIR4resok structure, which includes the verifier above
+	 * and the 8 bytes encoded at the end of this function:
+	 */
+	if (maxcount < 16) {
+		nfserr = nfserr_toosmall;
+		goto err_no_verf;
+	}
+	maxcount = min_t(int, maxcount-16, bytes_left);
+
+	/* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+	if (!readdir->rd_dircount)
+		readdir->rd_dircount = INT_MAX;
+
+	readdir->xdr = xdr;
+	readdir->rd_maxcount = maxcount;
+	readdir->common.err = 0;
+	readdir->cookie_offset = 0;
+
+	offset = readdir->rd_cookie;
+	nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
+			      &offset,
+			      &readdir->common, nfsd4_encode_dirent);
+	if (nfserr == nfs_ok &&
+	    readdir->common.err == nfserr_toosmall &&
+	    xdr->buf->len == starting_len + 8) {
+		/* nothing encoded; which limit did we hit?: */
+		if (maxcount - 16 < bytes_left)
+			/* It was the fault of rd_maxcount: */
+			nfserr = nfserr_toosmall;
+		else
+			/* We ran out of buffer space: */
+			nfserr = nfserr_resource;
+	}
+	if (nfserr)
+		goto err_no_verf;
+
+	if (readdir->cookie_offset) {
+		wire_offset = cpu_to_be64(offset);
+		write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset,
+							&wire_offset, 8);
+	}
+
+	p = xdr_reserve_space(xdr, 8);
+	if (!p) {
+		WARN_ON_ONCE(1);
+		goto err_no_verf;
+	}
+	*p++ = 0;	/* no more entries */
+	*p++ = htonl(readdir->common.err == nfserr_eof);
+
+	return 0;
+err_no_verf:
+	xdr_truncate_encode(xdr, starting_len);
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, 20);
+		if (!p)
+			return nfserr_resource;
+		p = encode_cinfo(p, &remove->rm_cinfo);
+	}
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, 40);
+		if (!p)
+			return nfserr_resource;
+		p = encode_cinfo(p, &rename->rn_sinfo);
+		p = encode_cinfo(p, &rename->rn_tinfo);
+	}
+	return nfserr;
+}
+
+static __be32
+nfsd4_do_encode_secinfo(struct xdr_stream *xdr,
+			 __be32 nfserr, struct svc_export *exp)
+{
+	u32 i, nflavs, supported;
+	struct exp_flavor_info *flavs;
+	struct exp_flavor_info def_flavs[2];
+	__be32 *p, *flavorsp;
+	static bool report = true;
+
+	if (nfserr)
+		goto out;
+	nfserr = nfserr_resource;
+	if (exp->ex_nflavors) {
+		flavs = exp->ex_flavors;
+		nflavs = exp->ex_nflavors;
+	} else { /* Handling of some defaults in absence of real secinfo: */
+		flavs = def_flavs;
+		if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) {
+			nflavs = 2;
+			flavs[0].pseudoflavor = RPC_AUTH_UNIX;
+			flavs[1].pseudoflavor = RPC_AUTH_NULL;
+		} else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) {
+			nflavs = 1;
+			flavs[0].pseudoflavor
+					= svcauth_gss_flavor(exp->ex_client);
+		} else {
+			nflavs = 1;
+			flavs[0].pseudoflavor
+					= exp->ex_client->flavour->flavour;
+		}
+	}
+
+	supported = 0;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto out;
+	flavorsp = p++;		/* to be backfilled later */
+
+	for (i = 0; i < nflavs; i++) {
+		rpc_authflavor_t pf = flavs[i].pseudoflavor;
+		struct rpcsec_gss_info info;
+
+		if (rpcauth_get_gssinfo(pf, &info) == 0) {
+			supported++;
+			p = xdr_reserve_space(xdr, 4 + 4 +
+					      XDR_LEN(info.oid.len) + 4 + 4);
+			if (!p)
+				goto out;
+			*p++ = cpu_to_be32(RPC_AUTH_GSS);
+			p = xdr_encode_opaque(p,  info.oid.data, info.oid.len);
+			*p++ = cpu_to_be32(info.qop);
+			*p++ = cpu_to_be32(info.service);
+		} else if (pf < RPC_AUTH_MAXFLAVOR) {
+			supported++;
+			p = xdr_reserve_space(xdr, 4);
+			if (!p)
+				goto out;
+			*p++ = cpu_to_be32(pf);
+		} else {
+			if (report)
+				pr_warn("NFS: SECINFO: security flavor %u "
+					"is not supported\n", pf);
+		}
+	}
+
+	if (nflavs != supported)
+		report = false;
+	*flavorsp = htonl(supported);
+	nfserr = 0;
+out:
+	if (exp)
+		exp_put(exp);
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+		     struct nfsd4_secinfo *secinfo)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->si_exp);
+}
+
+static __be32
+nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
+		     struct nfsd4_secinfo_no_name *secinfo)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->sin_exp);
+}
+
+/*
+ * The SETATTR encode routine is special -- it always encodes a bitmap,
+ * regardless of the error status.
+ */
+static __be32
+nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 16);
+	if (!p)
+		return nfserr_resource;
+	if (nfserr) {
+		*p++ = cpu_to_be32(3);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(0);
+	}
+	else {
+		*p++ = cpu_to_be32(3);
+		*p++ = cpu_to_be32(setattr->sa_bmval[0]);
+		*p++ = cpu_to_be32(setattr->sa_bmval[1]);
+		*p++ = cpu_to_be32(setattr->sa_bmval[2]);
+	}
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE);
+		if (!p)
+			return nfserr_resource;
+		p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8);
+		p = xdr_encode_opaque_fixed(p, &scd->se_confirm,
+						NFS4_VERIFIER_SIZE);
+	}
+	else if (nfserr == nfserr_clid_inuse) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			return nfserr_resource;
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(0);
+	}
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, 16);
+		if (!p)
+			return nfserr_resource;
+		*p++ = cpu_to_be32(write->wr_bytes_written);
+		*p++ = cpu_to_be32(write->wr_how_written);
+		p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
+							NFS4_VERIFIER_SIZE);
+	}
+	return nfserr;
+}
+
+static const u32 nfs4_minimal_spo_must_enforce[2] = {
+	[1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+	      1 << (OP_EXCHANGE_ID - 32) |
+	      1 << (OP_CREATE_SESSION - 32) |
+	      1 << (OP_DESTROY_SESSION - 32) |
+	      1 << (OP_DESTROY_CLIENTID - 32)
+};
+
+static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+			 struct nfsd4_exchange_id *exid)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+	char *major_id;
+	char *server_scope;
+	int major_id_sz;
+	int server_scope_sz;
+	uint64_t minor_id = 0;
+
+	if (nfserr)
+		return nfserr;
+
+	major_id = utsname()->nodename;
+	major_id_sz = strlen(major_id);
+	server_scope = utsname()->nodename;
+	server_scope_sz = strlen(server_scope);
+
+	p = xdr_reserve_space(xdr,
+		8 /* eir_clientid */ +
+		4 /* eir_sequenceid */ +
+		4 /* eir_flags */ +
+		4 /* spr_how */);
+	if (!p)
+		return nfserr_resource;
+
+	p = xdr_encode_opaque_fixed(p, &exid->clientid, 8);
+	*p++ = cpu_to_be32(exid->seqid);
+	*p++ = cpu_to_be32(exid->flags);
+
+	*p++ = cpu_to_be32(exid->spa_how);
+
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		/* spo_must_enforce, spo_must_allow */
+		p = xdr_reserve_space(xdr, 16);
+		if (!p)
+			return nfserr_resource;
+
+		/* spo_must_enforce bitmap: */
+		*p++ = cpu_to_be32(2);
+		*p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]);
+		*p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]);
+		/* empty spo_must_allow bitmap: */
+		*p++ = cpu_to_be32(0);
+
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
+
+	p = xdr_reserve_space(xdr,
+		8 /* so_minor_id */ +
+		4 /* so_major_id.len */ +
+		(XDR_QUADLEN(major_id_sz) * 4) +
+		4 /* eir_server_scope.len */ +
+		(XDR_QUADLEN(server_scope_sz) * 4) +
+		4 /* eir_server_impl_id.count (0) */);
+	if (!p)
+		return nfserr_resource;
+
+	/* The server_owner struct */
+	p = xdr_encode_hyper(p, minor_id);      /* Minor id */
+	/* major id */
+	p = xdr_encode_opaque(p, major_id, major_id_sz);
+
+	/* Server scope */
+	p = xdr_encode_opaque(p, server_scope, server_scope_sz);
+
+	/* Implementation id */
+	*p++ = cpu_to_be32(0);	/* zero length nfs_impl_id4 array */
+	return 0;
+}
+
+static __be32
+nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
+			    struct nfsd4_create_session *sess)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 24);
+	if (!p)
+		return nfserr_resource;
+	p = xdr_encode_opaque_fixed(p, sess->sessionid.data,
+					NFS4_MAX_SESSIONID_LEN);
+	*p++ = cpu_to_be32(sess->seqid);
+	*p++ = cpu_to_be32(sess->flags);
+
+	p = xdr_reserve_space(xdr, 28);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(0); /* headerpadsz */
+	*p++ = cpu_to_be32(sess->fore_channel.maxreq_sz);
+	*p++ = cpu_to_be32(sess->fore_channel.maxresp_sz);
+	*p++ = cpu_to_be32(sess->fore_channel.maxresp_cached);
+	*p++ = cpu_to_be32(sess->fore_channel.maxops);
+	*p++ = cpu_to_be32(sess->fore_channel.maxreqs);
+	*p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs);
+
+	if (sess->fore_channel.nr_rdma_attrs) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			return nfserr_resource;
+		*p++ = cpu_to_be32(sess->fore_channel.rdma_attrs);
+	}
+
+	p = xdr_reserve_space(xdr, 28);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(0); /* headerpadsz */
+	*p++ = cpu_to_be32(sess->back_channel.maxreq_sz);
+	*p++ = cpu_to_be32(sess->back_channel.maxresp_sz);
+	*p++ = cpu_to_be32(sess->back_channel.maxresp_cached);
+	*p++ = cpu_to_be32(sess->back_channel.maxops);
+	*p++ = cpu_to_be32(sess->back_channel.maxreqs);
+	*p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs);
+
+	if (sess->back_channel.nr_rdma_attrs) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			return nfserr_resource;
+		*p++ = cpu_to_be32(sess->back_channel.rdma_attrs);
+	}
+	return 0;
+}
+
+static __be32
+nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
+		      struct nfsd4_sequence *seq)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
+	if (!p)
+		return nfserr_resource;
+	p = xdr_encode_opaque_fixed(p, seq->sessionid.data,
+					NFS4_MAX_SESSIONID_LEN);
+	*p++ = cpu_to_be32(seq->seqid);
+	*p++ = cpu_to_be32(seq->slotid);
+	/* Note slotid's are numbered from zero: */
+	*p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */
+	*p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */
+	*p++ = cpu_to_be32(seq->status_flags);
+
+	resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */
+	return 0;
+}
+
+static __be32
+nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
+			  struct nfsd4_test_stateid *test_stateid)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	struct nfsd4_test_stateid_id *stateid, *next;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids));
+	if (!p)
+		return nfserr_resource;
+	*p++ = htonl(test_stateid->ts_num_ids);
+
+	list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
+		*p++ = stateid->ts_id_status;
+	}
+
+	return nfserr;
+}
+
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_getdeviceinfo *gdev)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	const struct nfsd4_layout_ops *ops =
+		nfsd4_layout_ops[gdev->gd_layout_type];
+	u32 starting_len = xdr->buf->len, needed_len;
+	__be32 *p;
+
+	dprintk("%s: err %d\n", __func__, nfserr);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_resource;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto out;
+
+	*p++ = cpu_to_be32(gdev->gd_layout_type);
+
+	/* If maxcount is 0 then just update notifications */
+	if (gdev->gd_maxcount != 0) {
+		nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+		if (nfserr) {
+			/*
+			 * We don't bother to burden the layout drivers with
+			 * enforcing gd_maxcount, just tell the client to
+			 * come back with a bigger buffer if it's not enough.
+			 */
+			if (xdr->buf->len + 4 > gdev->gd_maxcount)
+				goto toosmall;
+			goto out;
+		}
+	}
+
+	nfserr = nfserr_resource;
+	if (gdev->gd_notify_types) {
+		p = xdr_reserve_space(xdr, 4 + 4);
+		if (!p)
+			goto out;
+		*p++ = cpu_to_be32(1);			/* bitmap length */
+		*p++ = cpu_to_be32(gdev->gd_notify_types);
+	} else {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out;
+		*p++ = 0;
+	}
+
+	nfserr = 0;
+out:
+	kfree(gdev->gd_device);
+	dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
+	return nfserr;
+
+toosmall:
+	dprintk("%s: maxcount too small\n", __func__);
+	needed_len = xdr->buf->len + 4 /* notifications */;
+	xdr_truncate_encode(xdr, starting_len);
+	p = xdr_reserve_space(xdr, 4);
+	if (!p) {
+		nfserr = nfserr_resource;
+	} else {
+		*p++ = cpu_to_be32(needed_len);
+		nfserr = nfserr_toosmall;
+	}
+	goto out;
+}
+
+static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutget *lgp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	const struct nfsd4_layout_ops *ops =
+		nfsd4_layout_ops[lgp->lg_layout_type];
+	__be32 *p;
+
+	dprintk("%s: err %d\n", __func__, nfserr);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_resource;
+	p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
+	if (!p)
+		goto out;
+
+	*p++ = cpu_to_be32(1);	/* we always set return-on-close */
+	*p++ = cpu_to_be32(lgp->lg_sid.si_generation);
+	p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
+				    sizeof(stateid_opaque_t));
+
+	*p++ = cpu_to_be32(1);	/* we always return a single layout */
+	p = xdr_encode_hyper(p, lgp->lg_seg.offset);
+	p = xdr_encode_hyper(p, lgp->lg_seg.length);
+	*p++ = cpu_to_be32(lgp->lg_seg.iomode);
+	*p++ = cpu_to_be32(lgp->lg_layout_type);
+
+	nfserr = ops->encode_layoutget(xdr, lgp);
+out:
+	kfree(lgp->lg_content);
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+			  struct nfsd4_layoutcommit *lcp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(lcp->lc_size_chg);
+	if (lcp->lc_size_chg) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			return nfserr_resource;
+		p = xdr_encode_hyper(p, lcp->lc_newsize);
+	}
+
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(lrp->lrs_present);
+	if (lrp->lrs_present)
+		return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+	return nfs_ok;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+static __be32
+nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_seek *seek)
+{
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(&resp->xdr, 4 + 8);
+	*p++ = cpu_to_be32(seek->seek_eof);
+	p = xdr_encode_hyper(p, seek->seek_pos);
+
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+{
+	return nfserr;
+}
+
+typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+
+/*
+ * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
+ * since we don't need to filter out obsolete ops as this is
+ * done in the decoding phase.
+ */
+static nfsd4_enc nfsd4_enc_ops[] = {
+	[OP_ACCESS]		= (nfsd4_enc)nfsd4_encode_access,
+	[OP_CLOSE]		= (nfsd4_enc)nfsd4_encode_close,
+	[OP_COMMIT]		= (nfsd4_enc)nfsd4_encode_commit,
+	[OP_CREATE]		= (nfsd4_enc)nfsd4_encode_create,
+	[OP_DELEGPURGE]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_DELEGRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GETATTR]		= (nfsd4_enc)nfsd4_encode_getattr,
+	[OP_GETFH]		= (nfsd4_enc)nfsd4_encode_getfh,
+	[OP_LINK]		= (nfsd4_enc)nfsd4_encode_link,
+	[OP_LOCK]		= (nfsd4_enc)nfsd4_encode_lock,
+	[OP_LOCKT]		= (nfsd4_enc)nfsd4_encode_lockt,
+	[OP_LOCKU]		= (nfsd4_enc)nfsd4_encode_locku,
+	[OP_LOOKUP]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LOOKUPP]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_NVERIFY]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_OPEN]		= (nfsd4_enc)nfsd4_encode_open,
+	[OP_OPENATTR]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_OPEN_CONFIRM]	= (nfsd4_enc)nfsd4_encode_open_confirm,
+	[OP_OPEN_DOWNGRADE]	= (nfsd4_enc)nfsd4_encode_open_downgrade,
+	[OP_PUTFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_PUTPUBFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_PUTROOTFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_READ]		= (nfsd4_enc)nfsd4_encode_read,
+	[OP_READDIR]		= (nfsd4_enc)nfsd4_encode_readdir,
+	[OP_READLINK]		= (nfsd4_enc)nfsd4_encode_readlink,
+	[OP_REMOVE]		= (nfsd4_enc)nfsd4_encode_remove,
+	[OP_RENAME]		= (nfsd4_enc)nfsd4_encode_rename,
+	[OP_RENEW]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_RESTOREFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SAVEFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SECINFO]		= (nfsd4_enc)nfsd4_encode_secinfo,
+	[OP_SETATTR]		= (nfsd4_enc)nfsd4_encode_setattr,
+	[OP_SETCLIENTID]	= (nfsd4_enc)nfsd4_encode_setclientid,
+	[OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop,
+	[OP_VERIFY]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_WRITE]		= (nfsd4_enc)nfsd4_encode_write,
+	[OP_RELEASE_LOCKOWNER]	= (nfsd4_enc)nfsd4_encode_noop,
+
+	/* NFSv4.1 operations */
+	[OP_BACKCHANNEL_CTL]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
+	[OP_EXCHANGE_ID]	= (nfsd4_enc)nfsd4_encode_exchange_id,
+	[OP_CREATE_SESSION]	= (nfsd4_enc)nfsd4_encode_create_session,
+	[OP_DESTROY_SESSION]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_FREE_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GET_DIR_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_getdeviceinfo,
+	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_layoutcommit,
+	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_layoutget,
+	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_layoutreturn,
+#else
+	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
+#endif
+	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_secinfo_no_name,
+	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
+	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_TEST_STATEID]	= (nfsd4_enc)nfsd4_encode_test_stateid,
+	[OP_WANT_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_DESTROY_CLIENTID]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_enc)nfsd4_encode_noop,
+
+	/* NFSv4.2 operations */
+	[OP_ALLOCATE]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_COPY]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_DEALLOCATE]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTERROR]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTSTATS]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_OFFLOAD_CANCEL]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
+	[OP_WRITE_SAME]		= (nfsd4_enc)nfsd4_encode_noop,
+};
+
+/*
+ * Calculate whether we still have space to encode repsize bytes.
+ * There are two considerations:
+ *     - For NFS versions >=4.1, the size of the reply must stay within
+ *       session limits
+ *     - For all NFS versions, we must stay within limited preallocated
+ *       buffer space.
+ *
+ * This is called before the operation is processed, so can only provide
+ * an upper estimate.  For some nonidempotent operations (such as
+ * getattr), it's not necessarily a problem if that estimate is wrong,
+ * as we can fail it after processing without significant side effects.
+ */
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize)
+{
+	struct xdr_buf *buf = &resp->rqstp->rq_res;
+	struct nfsd4_slot *slot = resp->cstate.slot;
+
+	if (buf->len + respsize <= buf->buflen)
+		return nfs_ok;
+	if (!nfsd4_has_session(&resp->cstate))
+		return nfserr_resource;
+	if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) {
+		WARN_ON_ONCE(1);
+		return nfserr_rep_too_big_to_cache;
+	}
+	return nfserr_rep_too_big;
+}
+
+void
+nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	struct nfs4_stateowner *so = resp->cstate.replay_owner;
+	struct svc_rqst *rqstp = resp->rqstp;
+	int post_err_offset;
+	nfsd4_enc encoder;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 8);
+	if (!p) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+	*p++ = cpu_to_be32(op->opnum);
+	post_err_offset = xdr->buf->len;
+
+	if (op->opnum == OP_ILLEGAL)
+		goto status;
+	BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
+	       !nfsd4_enc_ops[op->opnum]);
+	encoder = nfsd4_enc_ops[op->opnum];
+	op->status = encoder(resp, op->status, &op->u);
+	xdr_commit_encode(xdr);
+
+	/* nfsd4_check_resp_size guarantees enough room for error status */
+	if (!op->status) {
+		int space_needed = 0;
+		if (!nfsd4_last_compound_op(rqstp))
+			space_needed = COMPOUND_ERR_SLACK_SPACE;
+		op->status = nfsd4_check_resp_size(resp, space_needed);
+	}
+	if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) {
+		struct nfsd4_slot *slot = resp->cstate.slot;
+
+		if (slot->sl_flags & NFSD4_SLOT_CACHETHIS)
+			op->status = nfserr_rep_too_big_to_cache;
+		else
+			op->status = nfserr_rep_too_big;
+	}
+	if (op->status == nfserr_resource ||
+	    op->status == nfserr_rep_too_big ||
+	    op->status == nfserr_rep_too_big_to_cache) {
+		/*
+		 * The operation may have already been encoded or
+		 * partially encoded.  No op returns anything additional
+		 * in the case of one of these three errors, so we can
+		 * just truncate back to after the status.  But it's a
+		 * bug if we had to do this on a non-idempotent op:
+		 */
+		warn_on_nonidempotent_op(op);
+		xdr_truncate_encode(xdr, post_err_offset);
+	}
+	if (so) {
+		int len = xdr->buf->len - post_err_offset;
+
+		so->so_replay.rp_status = op->status;
+		so->so_replay.rp_buflen = len;
+		read_bytes_from_xdr_buf(xdr->buf, post_err_offset,
+						so->so_replay.rp_buf, len);
+	}
+status:
+	/* Note that op->status is already in network byte order: */
+	write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4);
+}
+
+/* 
+ * Encode the reply stored in the stateowner reply cache 
+ * 
+ * XDR note: do not encode rp->rp_buflen: the buffer contains the
+ * previously sent already encoded operation.
+ */
+void
+nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
+{
+	__be32 *p;
+	struct nfs4_replay *rp = op->replay;
+
+	BUG_ON(!rp);
+
+	p = xdr_reserve_space(xdr, 8 + rp->rp_buflen);
+	if (!p) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+	*p++ = cpu_to_be32(op->opnum);
+	*p++ = rp->rp_status;  /* already xdr'ed */
+
+	p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);
+}
+
+int
+nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+        return xdr_ressize_check(rqstp, p);
+}
+
+int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
+{
+	struct svc_rqst *rqstp = rq;
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+
+	if (args->ops != args->iops) {
+		kfree(args->ops);
+		args->ops = args->iops;
+	}
+	kfree(args->tmpp);
+	args->tmpp = NULL;
+	while (args->to_free) {
+		struct svcxdr_tmpbuf *tb = args->to_free;
+		args->to_free = tb->next;
+		kfree(tb);
+	}
+	return 1;
+}
+
+int
+nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
+{
+	if (rqstp->rq_arg.head[0].iov_len % 4) {
+		/* client is nuts */
+		dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)",
+			__func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid));
+		return 0;
+	}
+	args->p = p;
+	args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
+	args->pagelist = rqstp->rq_arg.pages;
+	args->pagelen = rqstp->rq_arg.page_len;
+	args->tmpp = NULL;
+	args->to_free = NULL;
+	args->ops = args->iops;
+	args->rqstp = rqstp;
+
+	return !nfsd4_decode_compound(args);
+}
+
+int
+nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundres *resp)
+{
+	/*
+	 * All that remains is to write the tag and operation count...
+	 */
+	struct xdr_buf *buf = resp->xdr.buf;
+
+	WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
+				 buf->tail[0].iov_len);
+
+	rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+
+	p = resp->tagp;
+	*p++ = htonl(resp->taglen);
+	memcpy(p, resp->tag, resp->taglen);
+	p += XDR_QUADLEN(resp->taglen);
+	*p++ = htonl(resp->opcnt);
+
+	nfsd4_sequence_done(resp);
+	return 1;
+}
+
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/kernel/fs/nfsd/nfscache.c b/kernel/fs/nfsd/nfscache.c
new file mode 100644
index 000000000..46ec934f5
--- /dev/null
+++ b/kernel/fs/nfsd/nfscache.c
@@ -0,0 +1,637 @@
+/*
+ * Request reply cache. This is currently a global cache, but this may
+ * change in the future and be a per-client cache.
+ *
+ * This code is heavily inspired by the 44BSD implementation, although
+ * it does things a bit differently.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/slab.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/hash.h>
+#include <net/checksum.h>
+
+#include "nfsd.h"
+#include "cache.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_REPCACHE
+
+/*
+ * We use this value to determine the number of hash buckets from the max
+ * cache size, the idea being that when the cache is at its maximum number
+ * of entries, then this should be the average number of entries per bucket.
+ */
+#define TARGET_BUCKET_SIZE	64
+
+struct nfsd_drc_bucket {
+	struct list_head lru_head;
+	spinlock_t cache_lock;
+};
+
+static struct nfsd_drc_bucket	*drc_hashtbl;
+static struct kmem_cache	*drc_slab;
+
+/* max number of entries allowed in the cache */
+static unsigned int		max_drc_entries;
+
+/* number of significant bits in the hash value */
+static unsigned int		maskbits;
+static unsigned int		drc_hashsize;
+
+/*
+ * Stats and other tracking of on the duplicate reply cache. All of these and
+ * the "rc" fields in nfsdstats are protected by the cache_lock
+ */
+
+/* total number of entries */
+static atomic_t			num_drc_entries;
+
+/* cache misses due only to checksum comparison failures */
+static unsigned int		payload_misses;
+
+/* amount of memory (in bytes) currently consumed by the DRC */
+static unsigned int		drc_mem_usage;
+
+/* longest hash chain seen */
+static unsigned int		longest_chain;
+
+/* size of cache when we saw the longest hash chain */
+static unsigned int		longest_chain_cachesize;
+
+static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
+static void	cache_cleaner_func(struct work_struct *unused);
+static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc);
+static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc);
+
+static struct shrinker nfsd_reply_cache_shrinker = {
+	.scan_objects = nfsd_reply_cache_scan,
+	.count_objects = nfsd_reply_cache_count,
+	.seeks	= 1,
+};
+
+/*
+ * locking for the reply cache:
+ * A cache entry is "single use" if c_state == RC_INPROG
+ * Otherwise, it when accessing _prev or _next, the lock must be held.
+ */
+static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
+
+/*
+ * Put a cap on the size of the DRC based on the amount of available
+ * low memory in the machine.
+ *
+ *  64MB:    8192
+ * 128MB:   11585
+ * 256MB:   16384
+ * 512MB:   23170
+ *   1GB:   32768
+ *   2GB:   46340
+ *   4GB:   65536
+ *   8GB:   92681
+ *  16GB:  131072
+ *
+ * ...with a hard cap of 256k entries. In the worst case, each entry will be
+ * ~1k, so the above numbers should give a rough max of the amount of memory
+ * used in k.
+ */
+static unsigned int
+nfsd_cache_size_limit(void)
+{
+	unsigned int limit;
+	unsigned long low_pages = totalram_pages - totalhigh_pages;
+
+	limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
+	return min_t(unsigned int, limit, 256*1024);
+}
+
+/*
+ * Compute the number of hash buckets we need. Divide the max cachesize by
+ * the "target" max bucket size, and round up to next power of two.
+ */
+static unsigned int
+nfsd_hashsize(unsigned int limit)
+{
+	return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
+}
+
+static u32
+nfsd_cache_hash(__be32 xid)
+{
+	return hash_32(be32_to_cpu(xid), maskbits);
+}
+
+static struct svc_cacherep *
+nfsd_reply_cache_alloc(void)
+{
+	struct svc_cacherep	*rp;
+
+	rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
+	if (rp) {
+		rp->c_state = RC_UNUSED;
+		rp->c_type = RC_NOCACHE;
+		INIT_LIST_HEAD(&rp->c_lru);
+	}
+	return rp;
+}
+
+static void
+nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
+{
+	if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
+		drc_mem_usage -= rp->c_replvec.iov_len;
+		kfree(rp->c_replvec.iov_base);
+	}
+	list_del(&rp->c_lru);
+	atomic_dec(&num_drc_entries);
+	drc_mem_usage -= sizeof(*rp);
+	kmem_cache_free(drc_slab, rp);
+}
+
+static void
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
+{
+	spin_lock(&b->cache_lock);
+	nfsd_reply_cache_free_locked(rp);
+	spin_unlock(&b->cache_lock);
+}
+
+int nfsd_reply_cache_init(void)
+{
+	unsigned int hashsize;
+	unsigned int i;
+	int status = 0;
+
+	max_drc_entries = nfsd_cache_size_limit();
+	atomic_set(&num_drc_entries, 0);
+	hashsize = nfsd_hashsize(max_drc_entries);
+	maskbits = ilog2(hashsize);
+
+	status = register_shrinker(&nfsd_reply_cache_shrinker);
+	if (status)
+		return status;
+
+	drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
+					0, 0, NULL);
+	if (!drc_slab)
+		goto out_nomem;
+
+	drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
+	if (!drc_hashtbl)
+		goto out_nomem;
+	for (i = 0; i < hashsize; i++) {
+		INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
+		spin_lock_init(&drc_hashtbl[i].cache_lock);
+	}
+	drc_hashsize = hashsize;
+
+	return 0;
+out_nomem:
+	printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
+	nfsd_reply_cache_shutdown();
+	return -ENOMEM;
+}
+
+void nfsd_reply_cache_shutdown(void)
+{
+	struct svc_cacherep	*rp;
+	unsigned int i;
+
+	unregister_shrinker(&nfsd_reply_cache_shrinker);
+	cancel_delayed_work_sync(&cache_cleaner);
+
+	for (i = 0; i < drc_hashsize; i++) {
+		struct list_head *head = &drc_hashtbl[i].lru_head;
+		while (!list_empty(head)) {
+			rp = list_first_entry(head, struct svc_cacherep, c_lru);
+			nfsd_reply_cache_free_locked(rp);
+		}
+	}
+
+	kfree (drc_hashtbl);
+	drc_hashtbl = NULL;
+	drc_hashsize = 0;
+
+	if (drc_slab) {
+		kmem_cache_destroy(drc_slab);
+		drc_slab = NULL;
+	}
+}
+
+/*
+ * Move cache entry to end of LRU list, and queue the cleaner to run if it's
+ * not already scheduled.
+ */
+static void
+lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
+{
+	rp->c_timestamp = jiffies;
+	list_move_tail(&rp->c_lru, &b->lru_head);
+	schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
+}
+
+static long
+prune_bucket(struct nfsd_drc_bucket *b)
+{
+	struct svc_cacherep *rp, *tmp;
+	long freed = 0;
+
+	list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
+		/*
+		 * Don't free entries attached to calls that are still
+		 * in-progress, but do keep scanning the list.
+		 */
+		if (rp->c_state == RC_INPROG)
+			continue;
+		if (atomic_read(&num_drc_entries) <= max_drc_entries &&
+		    time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
+			break;
+		nfsd_reply_cache_free_locked(rp);
+		freed++;
+	}
+	return freed;
+}
+
+/*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+ */
+static long
+prune_cache_entries(void)
+{
+	unsigned int i;
+	long freed = 0;
+	bool cancel = true;
+
+	for (i = 0; i < drc_hashsize; i++) {
+		struct nfsd_drc_bucket *b = &drc_hashtbl[i];
+
+		if (list_empty(&b->lru_head))
+			continue;
+		spin_lock(&b->cache_lock);
+		freed += prune_bucket(b);
+		if (!list_empty(&b->lru_head))
+			cancel = false;
+		spin_unlock(&b->cache_lock);
+	}
+
+	/*
+	 * Conditionally rearm the job to run in RC_EXPIRE since we just
+	 * ran the pruner.
+	 */
+	if (!cancel)
+		mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
+	return freed;
+}
+
+static void
+cache_cleaner_func(struct work_struct *unused)
+{
+	prune_cache_entries();
+}
+
+static unsigned long
+nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return atomic_read(&num_drc_entries);
+}
+
+static unsigned long
+nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return prune_cache_entries();
+}
+/*
+ * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+ */
+static __wsum
+nfsd_cache_csum(struct svc_rqst *rqstp)
+{
+	int idx;
+	unsigned int base;
+	__wsum csum;
+	struct xdr_buf *buf = &rqstp->rq_arg;
+	const unsigned char *p = buf->head[0].iov_base;
+	size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
+				RC_CSUMLEN);
+	size_t len = min(buf->head[0].iov_len, csum_len);
+
+	/* rq_arg.head first */
+	csum = csum_partial(p, len, 0);
+	csum_len -= len;
+
+	/* Continue into page array */
+	idx = buf->page_base / PAGE_SIZE;
+	base = buf->page_base & ~PAGE_MASK;
+	while (csum_len) {
+		p = page_address(buf->pages[idx]) + base;
+		len = min_t(size_t, PAGE_SIZE - base, csum_len);
+		csum = csum_partial(p, len, csum);
+		csum_len -= len;
+		base = 0;
+		++idx;
+	}
+	return csum;
+}
+
+static bool
+nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
+{
+	/* Check RPC XID first */
+	if (rqstp->rq_xid != rp->c_xid)
+		return false;
+	/* compare checksum of NFS data */
+	if (csum != rp->c_csum) {
+		++payload_misses;
+		return false;
+	}
+
+	/* Other discriminators */
+	if (rqstp->rq_proc != rp->c_proc ||
+	    rqstp->rq_prot != rp->c_prot ||
+	    rqstp->rq_vers != rp->c_vers ||
+	    rqstp->rq_arg.len != rp->c_len ||
+	    !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
+	    rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+		return false;
+
+	return true;
+}
+
+/*
+ * Search the request hash for an entry that matches the given rqstp.
+ * Must be called with cache_lock held. Returns the found entry or
+ * NULL on failure.
+ */
+static struct svc_cacherep *
+nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
+		__wsum csum)
+{
+	struct svc_cacherep	*rp, *ret = NULL;
+	struct list_head 	*rh = &b->lru_head;
+	unsigned int		entries = 0;
+
+	list_for_each_entry(rp, rh, c_lru) {
+		++entries;
+		if (nfsd_cache_match(rqstp, csum, rp)) {
+			ret = rp;
+			break;
+		}
+	}
+
+	/* tally hash chain length stats */
+	if (entries > longest_chain) {
+		longest_chain = entries;
+		longest_chain_cachesize = atomic_read(&num_drc_entries);
+	} else if (entries == longest_chain) {
+		/* prefer to keep the smallest cachesize possible here */
+		longest_chain_cachesize = min_t(unsigned int,
+				longest_chain_cachesize,
+				atomic_read(&num_drc_entries));
+	}
+
+	return ret;
+}
+
+/*
+ * Try to find an entry matching the current call in the cache. When none
+ * is found, we try to grab the oldest expired entry off the LRU list. If
+ * a suitable one isn't there, then drop the cache_lock and allocate a
+ * new one, then search again in case one got inserted while this thread
+ * didn't hold the lock.
+ */
+int
+nfsd_cache_lookup(struct svc_rqst *rqstp)
+{
+	struct svc_cacherep	*rp, *found;
+	__be32			xid = rqstp->rq_xid;
+	u32			proto =  rqstp->rq_prot,
+				vers = rqstp->rq_vers,
+				proc = rqstp->rq_proc;
+	__wsum			csum;
+	u32 hash = nfsd_cache_hash(xid);
+	struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
+	unsigned long		age;
+	int type = rqstp->rq_cachetype;
+	int rtn = RC_DOIT;
+
+	rqstp->rq_cacherep = NULL;
+	if (type == RC_NOCACHE) {
+		nfsdstats.rcnocache++;
+		return rtn;
+	}
+
+	csum = nfsd_cache_csum(rqstp);
+
+	/*
+	 * Since the common case is a cache miss followed by an insert,
+	 * preallocate an entry.
+	 */
+	rp = nfsd_reply_cache_alloc();
+	spin_lock(&b->cache_lock);
+	if (likely(rp)) {
+		atomic_inc(&num_drc_entries);
+		drc_mem_usage += sizeof(*rp);
+	}
+
+	/* go ahead and prune the cache */
+	prune_bucket(b);
+
+	found = nfsd_cache_search(b, rqstp, csum);
+	if (found) {
+		if (likely(rp))
+			nfsd_reply_cache_free_locked(rp);
+		rp = found;
+		goto found_entry;
+	}
+
+	if (!rp) {
+		dprintk("nfsd: unable to allocate DRC entry!\n");
+		goto out;
+	}
+
+	nfsdstats.rcmisses++;
+	rqstp->rq_cacherep = rp;
+	rp->c_state = RC_INPROG;
+	rp->c_xid = xid;
+	rp->c_proc = proc;
+	rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
+	rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
+	rp->c_prot = proto;
+	rp->c_vers = vers;
+	rp->c_len = rqstp->rq_arg.len;
+	rp->c_csum = csum;
+
+	lru_put_end(b, rp);
+
+	/* release any buffer */
+	if (rp->c_type == RC_REPLBUFF) {
+		drc_mem_usage -= rp->c_replvec.iov_len;
+		kfree(rp->c_replvec.iov_base);
+		rp->c_replvec.iov_base = NULL;
+	}
+	rp->c_type = RC_NOCACHE;
+ out:
+	spin_unlock(&b->cache_lock);
+	return rtn;
+
+found_entry:
+	nfsdstats.rchits++;
+	/* We found a matching entry which is either in progress or done. */
+	age = jiffies - rp->c_timestamp;
+	lru_put_end(b, rp);
+
+	rtn = RC_DROPIT;
+	/* Request being processed or excessive rexmits */
+	if (rp->c_state == RC_INPROG || age < RC_DELAY)
+		goto out;
+
+	/* From the hall of fame of impractical attacks:
+	 * Is this a user who tries to snoop on the cache? */
+	rtn = RC_DOIT;
+	if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure)
+		goto out;
+
+	/* Compose RPC reply header */
+	switch (rp->c_type) {
+	case RC_NOCACHE:
+		break;
+	case RC_REPLSTAT:
+		svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat);
+		rtn = RC_REPLY;
+		break;
+	case RC_REPLBUFF:
+		if (!nfsd_cache_append(rqstp, &rp->c_replvec))
+			goto out;	/* should not happen */
+		rtn = RC_REPLY;
+		break;
+	default:
+		printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
+		nfsd_reply_cache_free_locked(rp);
+	}
+
+	goto out;
+}
+
+/*
+ * Update a cache entry. This is called from nfsd_dispatch when
+ * the procedure has been executed and the complete reply is in
+ * rqstp->rq_res.
+ *
+ * We're copying around data here rather than swapping buffers because
+ * the toplevel loop requires max-sized buffers, which would be a waste
+ * of memory for a cache with a max reply size of 100 bytes (diropokres).
+ *
+ * If we should start to use different types of cache entries tailored
+ * specifically for attrstat and fh's, we may save even more space.
+ *
+ * Also note that a cachetype of RC_NOCACHE can legally be passed when
+ * nfsd failed to encode a reply that otherwise would have been cached.
+ * In this case, nfsd_cache_update is called with statp == NULL.
+ */
+void
+nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+{
+	struct svc_cacherep *rp = rqstp->rq_cacherep;
+	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
+	u32		hash;
+	struct nfsd_drc_bucket *b;
+	int		len;
+	size_t		bufsize = 0;
+
+	if (!rp)
+		return;
+
+	hash = nfsd_cache_hash(rp->c_xid);
+	b = &drc_hashtbl[hash];
+
+	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
+	len >>= 2;
+
+	/* Don't cache excessive amounts of data and XDR failures */
+	if (!statp || len > (256 >> 2)) {
+		nfsd_reply_cache_free(b, rp);
+		return;
+	}
+
+	switch (cachetype) {
+	case RC_REPLSTAT:
+		if (len != 1)
+			printk("nfsd: RC_REPLSTAT/reply len %d!\n",len);
+		rp->c_replstat = *statp;
+		break;
+	case RC_REPLBUFF:
+		cachv = &rp->c_replvec;
+		bufsize = len << 2;
+		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
+		if (!cachv->iov_base) {
+			nfsd_reply_cache_free(b, rp);
+			return;
+		}
+		cachv->iov_len = bufsize;
+		memcpy(cachv->iov_base, statp, bufsize);
+		break;
+	case RC_NOCACHE:
+		nfsd_reply_cache_free(b, rp);
+		return;
+	}
+	spin_lock(&b->cache_lock);
+	drc_mem_usage += bufsize;
+	lru_put_end(b, rp);
+	rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags);
+	rp->c_type = cachetype;
+	rp->c_state = RC_DONE;
+	spin_unlock(&b->cache_lock);
+	return;
+}
+
+/*
+ * Copy cached reply to current reply buffer. Should always fit.
+ * FIXME as reply is in a page, we should just attach the page, and
+ * keep a refcount....
+ */
+static int
+nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
+{
+	struct kvec	*vec = &rqstp->rq_res.head[0];
+
+	if (vec->iov_len + data->iov_len > PAGE_SIZE) {
+		printk(KERN_WARNING "nfsd: cached reply too large (%Zd).\n",
+				data->iov_len);
+		return 0;
+	}
+	memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
+	vec->iov_len += data->iov_len;
+	return 1;
+}
+
+/*
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "max entries:           %u\n", max_drc_entries);
+	seq_printf(m, "num entries:           %u\n",
+			atomic_read(&num_drc_entries));
+	seq_printf(m, "hash buckets:          %u\n", 1 << maskbits);
+	seq_printf(m, "mem usage:             %u\n", drc_mem_usage);
+	seq_printf(m, "cache hits:            %u\n", nfsdstats.rchits);
+	seq_printf(m, "cache misses:          %u\n", nfsdstats.rcmisses);
+	seq_printf(m, "not cached:            %u\n", nfsdstats.rcnocache);
+	seq_printf(m, "payload misses:        %u\n", payload_misses);
+	seq_printf(m, "longest chain len:     %u\n", longest_chain);
+	seq_printf(m, "cachesize at longest:  %u\n", longest_chain_cachesize);
+	return 0;
+}
+
+int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, nfsd_reply_cache_stats_show, NULL);
+}
diff --git a/kernel/fs/nfsd/nfsctl.c b/kernel/fs/nfsd/nfsctl.c
new file mode 100644
index 000000000..9690cb4dd
--- /dev/null
+++ b/kernel/fs/nfsd/nfsctl.c
@@ -0,0 +1,1318 @@
+/*
+ * Syscall interface to knfsd.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/ctype.h>
+
+#include <linux/sunrpc/svcsock.h>
+#include <linux/lockd/lockd.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/gss_krb5_enctypes.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/module.h>
+
+#include "idmap.h"
+#include "nfsd.h"
+#include "cache.h"
+#include "state.h"
+#include "netns.h"
+#include "pnfs.h"
+
+/*
+ *	We have a single directory with several nodes in it.
+ */
+enum {
+	NFSD_Root = 1,
+	NFSD_List,
+	NFSD_Export_features,
+	NFSD_Fh,
+	NFSD_FO_UnlockIP,
+	NFSD_FO_UnlockFS,
+	NFSD_Threads,
+	NFSD_Pool_Threads,
+	NFSD_Pool_Stats,
+	NFSD_Reply_Cache_Stats,
+	NFSD_Versions,
+	NFSD_Ports,
+	NFSD_MaxBlkSize,
+	NFSD_MaxConnections,
+	NFSD_SupportedEnctypes,
+	/*
+	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
+	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
+	 */
+#ifdef CONFIG_NFSD_V4
+	NFSD_Leasetime,
+	NFSD_Gracetime,
+	NFSD_RecoveryDir,
+	NFSD_V4EndGrace,
+#endif
+};
+
+/*
+ * write() for these nodes.
+ */
+static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
+static ssize_t write_threads(struct file *file, char *buf, size_t size);
+static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
+static ssize_t write_versions(struct file *file, char *buf, size_t size);
+static ssize_t write_ports(struct file *file, char *buf, size_t size);
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
+static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
+#ifdef CONFIG_NFSD_V4
+static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
+#endif
+
+static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+	[NFSD_Fh] = write_filehandle,
+	[NFSD_FO_UnlockIP] = write_unlock_ip,
+	[NFSD_FO_UnlockFS] = write_unlock_fs,
+	[NFSD_Threads] = write_threads,
+	[NFSD_Pool_Threads] = write_pool_threads,
+	[NFSD_Versions] = write_versions,
+	[NFSD_Ports] = write_ports,
+	[NFSD_MaxBlkSize] = write_maxblksize,
+	[NFSD_MaxConnections] = write_maxconn,
+#ifdef CONFIG_NFSD_V4
+	[NFSD_Leasetime] = write_leasetime,
+	[NFSD_Gracetime] = write_gracetime,
+	[NFSD_RecoveryDir] = write_recoverydir,
+	[NFSD_V4EndGrace] = write_v4_end_grace,
+#endif
+};
+
+static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
+{
+	ino_t ino =  file_inode(file)->i_ino;
+	char *data;
+	ssize_t rv;
+
+	if (ino >= ARRAY_SIZE(write_op) || !write_op[ino])
+		return -EINVAL;
+
+	data = simple_transaction_get(file, buf, size);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	rv =  write_op[ino](file, data, size);
+	if (rv >= 0) {
+		simple_transaction_set(file, rv);
+		rv = size;
+	}
+	return rv;
+}
+
+static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
+{
+	if (! file->private_data) {
+		/* An attempt to read a transaction file without writing
+		 * causes a 0-byte write so that the file can return
+		 * state information
+		 */
+		ssize_t rv = nfsctl_transaction_write(file, buf, 0, pos);
+		if (rv < 0)
+			return rv;
+	}
+	return simple_transaction_read(file, buf, size, pos);
+}
+
+static const struct file_operations transaction_ops = {
+	.write		= nfsctl_transaction_write,
+	.read		= nfsctl_transaction_read,
+	.release	= simple_transaction_release,
+	.llseek		= default_llseek,
+};
+
+static int exports_net_open(struct net *net, struct file *file)
+{
+	int err;
+	struct seq_file *seq;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	err = seq_open(file, &nfs_exports_op);
+	if (err)
+		return err;
+
+	seq = file->private_data;
+	seq->private = nn->svc_export_cache;
+	return 0;
+}
+
+static int exports_proc_open(struct inode *inode, struct file *file)
+{
+	return exports_net_open(current->nsproxy->net_ns, file);
+}
+
+static const struct file_operations exports_proc_operations = {
+	.open		= exports_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+	.owner		= THIS_MODULE,
+};
+
+static int exports_nfsd_open(struct inode *inode, struct file *file)
+{
+	return exports_net_open(inode->i_sb->s_fs_info, file);
+}
+
+static const struct file_operations exports_nfsd_operations = {
+	.open		= exports_nfsd_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+	.owner		= THIS_MODULE,
+};
+
+static int export_features_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS);
+	return 0;
+}
+
+static int export_features_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, export_features_show, NULL);
+}
+
+static const struct file_operations export_features_operations = {
+	.open		= export_features_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
+static int supported_enctypes_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, KRB5_SUPPORTED_ENCTYPES);
+	return 0;
+}
+
+static int supported_enctypes_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, supported_enctypes_show, NULL);
+}
+
+static const struct file_operations supported_enctypes_ops = {
+	.open		= supported_enctypes_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
+
+static const struct file_operations pool_stats_operations = {
+	.open		= nfsd_pool_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= nfsd_pool_stats_release,
+	.owner		= THIS_MODULE,
+};
+
+static struct file_operations reply_cache_stats_operations = {
+	.open		= nfsd_reply_cache_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/*----------------------------------------------------------------------------*/
+/*
+ * payload - write methods
+ */
+
+static inline struct net *netns(struct file *file)
+{
+	return file_inode(file)->i_sb->s_fs_info;
+}
+
+/**
+ * write_unlock_ip - Release all locks used by a client
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:	'\n'-terminated C string containing a
+ *				presentation format IP address
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ */
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
+{
+	struct sockaddr_storage address;
+	struct sockaddr *sap = (struct sockaddr *)&address;
+	size_t salen = sizeof(address);
+	char *fo_path;
+	struct net *net = netns(file);
+
+	/* sanity check */
+	if (size == 0)
+		return -EINVAL;
+
+	if (buf[size-1] != '\n')
+		return -EINVAL;
+
+	fo_path = buf;
+	if (qword_get(&buf, fo_path, size) < 0)
+		return -EINVAL;
+
+	if (rpc_pton(net, fo_path, size, sap, salen) == 0)
+		return -EINVAL;
+
+	return nlmsvc_unlock_all_by_ip(sap);
+}
+
+/**
+ * write_unlock_fs - Release all locks on a local file system
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:	'\n'-terminated C string containing the
+ *				absolute pathname of a local file system
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ */
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
+{
+	struct path path;
+	char *fo_path;
+	int error;
+
+	/* sanity check */
+	if (size == 0)
+		return -EINVAL;
+
+	if (buf[size-1] != '\n')
+		return -EINVAL;
+
+	fo_path = buf;
+	if (qword_get(&buf, fo_path, size) < 0)
+		return -EINVAL;
+
+	error = kern_path(fo_path, 0, &path);
+	if (error)
+		return error;
+
+	/*
+	 * XXX: Needs better sanity checking.  Otherwise we could end up
+	 * releasing locks on the wrong file system.
+	 *
+	 * For example:
+	 * 1.  Does the path refer to a directory?
+	 * 2.  Is that directory a mount point, or
+	 * 3.  Is that directory the root of an exported file system?
+	 */
+	error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb);
+
+	path_put(&path);
+	return error;
+}
+
+/**
+ * write_filehandle - Get a variable-length NFS file handle by path
+ *
+ * On input, the buffer contains a '\n'-terminated C string comprised of
+ * three alphanumeric words separated by whitespace.  The string may
+ * contain escape sequences.
+ *
+ * Input:
+ *			buf:
+ *				domain:		client domain name
+ *				path:		export pathname
+ *				maxsize:	numeric maximum size of
+ *						@buf
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing a ASCII hex text version
+ *			of the NFS file handle;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is negative errno value
+ */
+static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
+{
+	char *dname, *path;
+	int uninitialized_var(maxsize);
+	char *mesg = buf;
+	int len;
+	struct auth_domain *dom;
+	struct knfsd_fh fh;
+
+	if (size == 0)
+		return -EINVAL;
+
+	if (buf[size-1] != '\n')
+		return -EINVAL;
+	buf[size-1] = 0;
+
+	dname = mesg;
+	len = qword_get(&mesg, dname, size);
+	if (len <= 0)
+		return -EINVAL;
+	
+	path = dname+len+1;
+	len = qword_get(&mesg, path, size);
+	if (len <= 0)
+		return -EINVAL;
+
+	len = get_int(&mesg, &maxsize);
+	if (len)
+		return len;
+
+	if (maxsize < NFS_FHSIZE)
+		return -EINVAL;
+	maxsize = min(maxsize, NFS3_FHSIZE);
+
+	if (qword_get(&mesg, mesg, size)>0)
+		return -EINVAL;
+
+	/* we have all the words, they are in buf.. */
+	dom = unix_domain_find(dname);
+	if (!dom)
+		return -ENOMEM;
+
+	len = exp_rootfh(netns(file), dom, path, &fh,  maxsize);
+	auth_domain_put(dom);
+	if (len)
+		return len;
+	
+	mesg = buf;
+	len = SIMPLE_TRANSACTION_LIMIT;
+	qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
+	mesg[-1] = '\n';
+	return mesg - buf;	
+}
+
+/**
+ * write_threads - Start NFSD, or report the current number of running threads
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the
+ *					number of NFSD threads to start
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_threads(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	int rv;
+	struct net *net = netns(file);
+
+	if (size > 0) {
+		int newthreads;
+		rv = get_int(&mesg, &newthreads);
+		if (rv)
+			return rv;
+		if (newthreads < 0)
+			return -EINVAL;
+		rv = nfsd_svc(newthreads, net);
+		if (rv < 0)
+			return rv;
+	} else
+		rv = nfsd_nrthreads(net);
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
+}
+
+/**
+ * write_pool_threads - Set or report the current number of threads per pool
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing whitespace-
+ * 					separated unsigned integer values
+ *					representing the number of NFSD
+ *					threads to start in each pool
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing integer values representing the
+ *			number of NFSD threads in each pool;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
+{
+	/* if size > 0, look for an array of number of threads per node
+	 * and apply them  then write out number of threads per node as reply
+	 */
+	char *mesg = buf;
+	int i;
+	int rv;
+	int len;
+	int npools;
+	int *nthreads;
+	struct net *net = netns(file);
+
+	mutex_lock(&nfsd_mutex);
+	npools = nfsd_nrpools(net);
+	if (npools == 0) {
+		/*
+		 * NFS is shut down.  The admin can start it by
+		 * writing to the threads file but NOT the pool_threads
+		 * file, sorry.  Report zero threads.
+		 */
+		mutex_unlock(&nfsd_mutex);
+		strcpy(buf, "0\n");
+		return strlen(buf);
+	}
+
+	nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL);
+	rv = -ENOMEM;
+	if (nthreads == NULL)
+		goto out_free;
+
+	if (size > 0) {
+		for (i = 0; i < npools; i++) {
+			rv = get_int(&mesg, &nthreads[i]);
+			if (rv == -ENOENT)
+				break;		/* fewer numbers than pools */
+			if (rv)
+				goto out_free;	/* syntax error */
+			rv = -EINVAL;
+			if (nthreads[i] < 0)
+				goto out_free;
+		}
+		rv = nfsd_set_nrthreads(i, nthreads, net);
+		if (rv)
+			goto out_free;
+	}
+
+	rv = nfsd_get_nrthreads(npools, nthreads, net);
+	if (rv)
+		goto out_free;
+
+	mesg = buf;
+	size = SIMPLE_TRANSACTION_LIMIT;
+	for (i = 0; i < npools && size > 0; i++) {
+		snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' '));
+		len = strlen(mesg);
+		size -= len;
+		mesg += len;
+	}
+	rv = mesg - buf;
+out_free:
+	kfree(nthreads);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+static ssize_t __write_versions(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	char *vers, *minorp, sign;
+	int len, num, remaining;
+	unsigned minor;
+	ssize_t tlen = 0;
+	char *sep;
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+
+	if (size>0) {
+		if (nn->nfsd_serv)
+			/* Cannot change versions without updating
+			 * nn->nfsd_serv->sv_xdrsize, and reallocing
+			 * rq_argp and rq_resp
+			 */
+			return -EBUSY;
+		if (buf[size-1] != '\n')
+			return -EINVAL;
+		buf[size-1] = 0;
+
+		vers = mesg;
+		len = qword_get(&mesg, vers, size);
+		if (len <= 0) return -EINVAL;
+		do {
+			sign = *vers;
+			if (sign == '+' || sign == '-')
+				num = simple_strtol((vers+1), &minorp, 0);
+			else
+				num = simple_strtol(vers, &minorp, 0);
+			if (*minorp == '.') {
+				if (num != 4)
+					return -EINVAL;
+				minor = simple_strtoul(minorp+1, NULL, 0);
+				if (minor == 0)
+					return -EINVAL;
+				if (nfsd_minorversion(minor, sign == '-' ?
+						     NFSD_CLEAR : NFSD_SET) < 0)
+					return -EINVAL;
+				goto next;
+			}
+			switch(num) {
+			case 2:
+			case 3:
+			case 4:
+				nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET);
+				break;
+			default:
+				return -EINVAL;
+			}
+		next:
+			vers += len + 1;
+		} while ((len = qword_get(&mesg, vers, size)) > 0);
+		/* If all get turned off, turn them back on, as
+		 * having no versions is BAD
+		 */
+		nfsd_reset_versions();
+	}
+
+	/* Now write current state into reply buffer */
+	len = 0;
+	sep = "";
+	remaining = SIMPLE_TRANSACTION_LIMIT;
+	for (num=2 ; num <= 4 ; num++)
+		if (nfsd_vers(num, NFSD_AVAIL)) {
+			len = snprintf(buf, remaining, "%s%c%d", sep,
+				       nfsd_vers(num, NFSD_TEST)?'+':'-',
+				       num);
+			sep = " ";
+
+			if (len >= remaining)
+				break;
+			remaining -= len;
+			buf += len;
+			tlen += len;
+		}
+	if (nfsd_vers(4, NFSD_AVAIL))
+		for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION;
+		     minor++) {
+			len = snprintf(buf, remaining, " %c4.%u",
+					(nfsd_vers(4, NFSD_TEST) &&
+					 nfsd_minorversion(minor, NFSD_TEST)) ?
+						'+' : '-',
+					minor);
+
+			if (len >= remaining)
+				break;
+			remaining -= len;
+			buf += len;
+			tlen += len;
+		}
+
+	len = snprintf(buf, remaining, "\n");
+	if (len >= remaining)
+		return -EINVAL;
+	return tlen + len;
+}
+
+/**
+ * write_versions - Set or report the available NFS protocol versions
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing positive or negative integer
+ *			values representing the current status of each
+ *			protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing whitespace-
+ * 					separated positive or negative
+ * 					integer values representing NFS
+ * 					protocol versions to enable ("+n")
+ * 					or disable ("-n")
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	status of zero or more protocol versions has
+ *			been updated; passed-in buffer filled with
+ *			'\n'-terminated C string containing positive
+ *			or negative integer values representing the
+ *			current status of each protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_versions(struct file *file, char *buf, size_t size)
+{
+	ssize_t rv;
+
+	mutex_lock(&nfsd_mutex);
+	rv = __write_versions(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+/*
+ * Zero-length write.  Return a list of NFSD's current listener
+ * transports.
+ */
+static ssize_t __write_ports_names(char *buf, struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (nn->nfsd_serv == NULL)
+		return 0;
+	return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
+}
+
+/*
+ * A single 'fd' number was written, in which case it must be for
+ * a socket of a supported family/protocol, and we use it as an
+ * nfsd listener.
+ */
+static ssize_t __write_ports_addfd(char *buf, struct net *net)
+{
+	char *mesg = buf;
+	int fd, err;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	err = get_int(&mesg, &fd);
+	if (err != 0 || fd < 0)
+		return -EINVAL;
+
+	if (svc_alien_sock(net, fd)) {
+		printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__);
+		return -EINVAL;
+	}
+
+	err = nfsd_create_serv(net);
+	if (err != 0)
+		return err;
+
+	err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
+	if (err < 0) {
+		nfsd_destroy(net);
+		return err;
+	}
+
+	/* Decrease the count, but don't shut down the service */
+	nn->nfsd_serv->sv_nrthreads--;
+	return err;
+}
+
+/*
+ * A transport listener is added by writing it's transport name and
+ * a port number.
+ */
+static ssize_t __write_ports_addxprt(char *buf, struct net *net)
+{
+	char transport[16];
+	struct svc_xprt *xprt;
+	int port, err;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (sscanf(buf, "%15s %5u", transport, &port) != 2)
+		return -EINVAL;
+
+	if (port < 1 || port > USHRT_MAX)
+		return -EINVAL;
+
+	err = nfsd_create_serv(net);
+	if (err != 0)
+		return err;
+
+	err = svc_create_xprt(nn->nfsd_serv, transport, net,
+				PF_INET, port, SVC_SOCK_ANONYMOUS);
+	if (err < 0)
+		goto out_err;
+
+	err = svc_create_xprt(nn->nfsd_serv, transport, net,
+				PF_INET6, port, SVC_SOCK_ANONYMOUS);
+	if (err < 0 && err != -EAFNOSUPPORT)
+		goto out_close;
+
+	/* Decrease the count, but don't shut down the service */
+	nn->nfsd_serv->sv_nrthreads--;
+	return 0;
+out_close:
+	xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
+	if (xprt != NULL) {
+		svc_close_xprt(xprt);
+		svc_xprt_put(xprt);
+	}
+out_err:
+	nfsd_destroy(net);
+	return err;
+}
+
+static ssize_t __write_ports(struct file *file, char *buf, size_t size,
+			     struct net *net)
+{
+	if (size == 0)
+		return __write_ports_names(buf, net);
+
+	if (isdigit(buf[0]))
+		return __write_ports_addfd(buf, net);
+
+	if (isalpha(buf[0]))
+		return __write_ports_addxprt(buf, net);
+
+	return -EINVAL;
+}
+
+/**
+ * write_ports - Pass a socket file descriptor or transport name to listen on
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with a '\n'-terminated C
+ *			string containing a whitespace-separated list of
+ *			named NFSD listeners;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing a bound
+ *					but unconnected socket that is to be
+ *					used as an NFSD listener; listen(3)
+ *					must be called for a SOCK_STREAM
+ *					socket, otherwise it is ignored
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique alphanumeric name of
+ *			the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a transport
+ *					name and an unsigned integer value
+ *					representing the port to listen on,
+ *					separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service is started
+ *	On error:	return code is a negative errno value
+ */
+static ssize_t write_ports(struct file *file, char *buf, size_t size)
+{
+	ssize_t rv;
+
+	mutex_lock(&nfsd_mutex);
+	rv = __write_ports(file, buf, size, netns(file));
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+
+int nfsd_max_blksize;
+
+/**
+ * write_maxblksize - Set or report the current NFS blksize
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing an unsigned
+ * 					integer value representing the new
+ * 					NFS blksize
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of the current NFS blksize
+ *			setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+
+	if (size > 0) {
+		int bsize;
+		int rv = get_int(&mesg, &bsize);
+		if (rv)
+			return rv;
+		/* force bsize into allowed range and
+		 * required alignment.
+		 */
+		bsize = max_t(int, bsize, 1024);
+		bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE);
+		bsize &= ~(1024-1);
+		mutex_lock(&nfsd_mutex);
+		if (nn->nfsd_serv) {
+			mutex_unlock(&nfsd_mutex);
+			return -EBUSY;
+		}
+		nfsd_max_blksize = bsize;
+		mutex_unlock(&nfsd_mutex);
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n",
+							nfsd_max_blksize);
+}
+
+/**
+ * write_maxconn - Set or report the current max number of connections
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing an unsigned
+ * 					integer value representing the new
+ * 					number of max connections
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of max_connections setting
+ *			for this net namespace;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+	unsigned int maxconn = nn->max_connections;
+
+	if (size > 0) {
+		int rv = get_uint(&mesg, &maxconn);
+
+		if (rv)
+			return rv;
+		nn->max_connections = maxconn;
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn);
+}
+
+#ifdef CONFIG_NFSD_V4
+static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
+				  time_t *time, struct nfsd_net *nn)
+{
+	char *mesg = buf;
+	int rv, i;
+
+	if (size > 0) {
+		if (nn->nfsd_serv)
+			return -EBUSY;
+		rv = get_int(&mesg, &i);
+		if (rv)
+			return rv;
+		/*
+		 * Some sanity checking.  We don't have a reason for
+		 * these particular numbers, but problems with the
+		 * extremes are:
+		 *	- Too short: the briefest network outage may
+		 *	  cause clients to lose all their locks.  Also,
+		 *	  the frequent polling may be wasteful.
+		 *	- Too long: do you really want reboot recovery
+		 *	  to take more than an hour?  Or to make other
+		 *	  clients wait an hour before being able to
+		 *	  revoke a dead client's locks?
+		 */
+		if (i < 10 || i > 3600)
+			return -EINVAL;
+		*time = i;
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
+}
+
+static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
+				time_t *time, struct nfsd_net *nn)
+{
+	ssize_t rv;
+
+	mutex_lock(&nfsd_mutex);
+	rv = __nfsd4_write_time(file, buf, size, time, nn);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+/**
+ * write_leasetime - Set or report the current NFSv4 lease time
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the new
+ *					NFSv4 lease expiry time
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing unsigned integer value of the
+ *			current lease expiry time;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
+{
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+	return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
+}
+
+/**
+ * write_gracetime - Set or report current NFSv4 grace period time
+ *
+ * As above, but sets the time of the NFSv4 grace period.
+ *
+ * Note this should never be set to less than the *previous*
+ * lease-period time, but we don't try to enforce this.  (In the common
+ * case (a new boot), we don't know what the previous lease time was
+ * anyway.)
+ */
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
+{
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+	return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
+}
+
+static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
+				   struct nfsd_net *nn)
+{
+	char *mesg = buf;
+	char *recdir;
+	int len, status;
+
+	if (size > 0) {
+		if (nn->nfsd_serv)
+			return -EBUSY;
+		if (size > PATH_MAX || buf[size-1] != '\n')
+			return -EINVAL;
+		buf[size-1] = 0;
+
+		recdir = mesg;
+		len = qword_get(&mesg, recdir, size);
+		if (len <= 0)
+			return -EINVAL;
+
+		status = nfs4_reset_recoverydir(recdir);
+		if (status)
+			return status;
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
+							nfs4_recoverydir());
+}
+
+/**
+ * write_recoverydir - Set or report the pathname of the recovery directory
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing the pathname
+ *					of the directory on a local file
+ *					system containing permanent NFSv4
+ *					recovery data
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing the current recovery pathname setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+{
+	ssize_t rv;
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+
+	mutex_lock(&nfsd_mutex);
+	rv = __write_recoverydir(file, buf, size, nn);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+/**
+ * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * OR
+ *
+ * Input:
+ * 			buf:		any value
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *			passed-in buffer filled with "Y" or "N" with a newline
+ *			and NULL-terminated C string. This indicates whether
+ *			the grace period has ended in the current net
+ *			namespace. Return code is the size in bytes of the
+ *			string. Writing a string that starts with 'Y', 'y', or
+ *			'1' to the file will end the grace period for nfsd's v4
+ *			lock manager.
+ */
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
+{
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+
+	if (size > 0) {
+		switch(buf[0]) {
+		case 'Y':
+		case 'y':
+		case '1':
+			nfsd4_end_grace(nn);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
+			 nn->grace_ended ? 'Y' : 'N');
+}
+
+#endif
+
+/*----------------------------------------------------------------------------*/
+/*
+ *	populating the filesystem.
+ */
+
+static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
+{
+	static struct tree_descr nfsd_files[] = {
+		[NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
+		[NFSD_Export_features] = {"export_features",
+					&export_features_operations, S_IRUGO},
+		[NFSD_FO_UnlockIP] = {"unlock_ip",
+					&transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_FO_UnlockFS] = {"unlock_filesystem",
+					&transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
+		[NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
+		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
+#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
+		[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
+#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
+#ifdef CONFIG_NFSD_V4
+		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
+#endif
+		/* last one */ {""}
+	};
+	struct net *net = data;
+	int ret;
+
+	ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
+	if (ret)
+		return ret;
+	sb->s_fs_info = get_net(net);
+	return 0;
+}
+
+static struct dentry *nfsd_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+}
+
+static void nfsd_umount(struct super_block *sb)
+{
+	struct net *net = sb->s_fs_info;
+
+	kill_litter_super(sb);
+	put_net(net);
+}
+
+static struct file_system_type nfsd_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfsd",
+	.mount		= nfsd_mount,
+	.kill_sb	= nfsd_umount,
+};
+MODULE_ALIAS_FS("nfsd");
+
+#ifdef CONFIG_PROC_FS
+static int create_proc_exports_entry(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_mkdir("fs/nfs", NULL);
+	if (!entry)
+		return -ENOMEM;
+	entry = proc_create("exports", 0, entry,
+				 &exports_proc_operations);
+	if (!entry) {
+		remove_proc_entry("fs/nfs", NULL);
+		return -ENOMEM;
+	}
+	return 0;
+}
+#else /* CONFIG_PROC_FS */
+static int create_proc_exports_entry(void)
+{
+	return 0;
+}
+#endif
+
+int nfsd_net_id;
+
+static __net_init int nfsd_init_net(struct net *net)
+{
+	int retval;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	retval = nfsd_export_init(net);
+	if (retval)
+		goto out_export_error;
+	retval = nfsd_idmap_init(net);
+	if (retval)
+		goto out_idmap_error;
+	nn->nfsd4_lease = 90;	/* default lease time */
+	nn->nfsd4_grace = 90;
+	return 0;
+
+out_idmap_error:
+	nfsd_export_shutdown(net);
+out_export_error:
+	return retval;
+}
+
+static __net_exit void nfsd_exit_net(struct net *net)
+{
+	nfsd_idmap_shutdown(net);
+	nfsd_export_shutdown(net);
+}
+
+static struct pernet_operations nfsd_net_ops = {
+	.init = nfsd_init_net,
+	.exit = nfsd_exit_net,
+	.id   = &nfsd_net_id,
+	.size = sizeof(struct nfsd_net),
+};
+
+static int __init init_nfsd(void)
+{
+	int retval;
+	printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
+
+	retval = register_pernet_subsys(&nfsd_net_ops);
+	if (retval < 0)
+		return retval;
+	retval = register_cld_notifier();
+	if (retval)
+		goto out_unregister_pernet;
+	retval = nfsd4_init_slabs();
+	if (retval)
+		goto out_unregister_notifier;
+	retval = nfsd4_init_pnfs();
+	if (retval)
+		goto out_free_slabs;
+	retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+	if (retval)
+		goto out_exit_pnfs;
+	nfsd_stat_init();	/* Statistics */
+	retval = nfsd_reply_cache_init();
+	if (retval)
+		goto out_free_stat;
+	nfsd_lockd_init();	/* lockd->nfsd callbacks */
+	retval = create_proc_exports_entry();
+	if (retval)
+		goto out_free_lockd;
+	retval = register_filesystem(&nfsd_fs_type);
+	if (retval)
+		goto out_free_all;
+	return 0;
+out_free_all:
+	remove_proc_entry("fs/nfs/exports", NULL);
+	remove_proc_entry("fs/nfs", NULL);
+out_free_lockd:
+	nfsd_lockd_shutdown();
+	nfsd_reply_cache_shutdown();
+out_free_stat:
+	nfsd_stat_shutdown();
+	nfsd_fault_inject_cleanup();
+out_exit_pnfs:
+	nfsd4_exit_pnfs();
+out_free_slabs:
+	nfsd4_free_slabs();
+out_unregister_notifier:
+	unregister_cld_notifier();
+out_unregister_pernet:
+	unregister_pernet_subsys(&nfsd_net_ops);
+	return retval;
+}
+
+static void __exit exit_nfsd(void)
+{
+	nfsd_reply_cache_shutdown();
+	remove_proc_entry("fs/nfs/exports", NULL);
+	remove_proc_entry("fs/nfs", NULL);
+	nfsd_stat_shutdown();
+	nfsd_lockd_shutdown();
+	nfsd4_free_slabs();
+	nfsd4_exit_pnfs();
+	nfsd_fault_inject_cleanup();
+	unregister_filesystem(&nfsd_fs_type);
+	unregister_cld_notifier();
+	unregister_pernet_subsys(&nfsd_net_ops);
+}
+
+MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_LICENSE("GPL");
+module_init(init_nfsd)
+module_exit(exit_nfsd)
diff --git a/kernel/fs/nfsd/nfsd.h b/kernel/fs/nfsd/nfsd.h
new file mode 100644
index 000000000..cf9805238
--- /dev/null
+++ b/kernel/fs/nfsd/nfsd.h
@@ -0,0 +1,422 @@
+/*
+ * Hodge-podge collection of knfsd-related stuff.
+ * I will sort this out later.
+ *
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_NFSD_H
+#define LINUX_NFSD_NFSD_H
+
+#include <linux/types.h>
+#include <linux/mount.h>
+
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/msg_prot.h>
+
+#include <uapi/linux/nfsd/debug.h>
+
+#include "stats.h"
+#include "export.h"
+
+#undef ifdebug
+#ifdef CONFIG_SUNRPC_DEBUG
+# define ifdebug(flag)		if (nfsd_debug & NFSDDBG_##flag)
+#else
+# define ifdebug(flag)		if (0)
+#endif
+
+/*
+ * nfsd version
+ */
+#define NFSD_SUPPORTED_MINOR_VERSION	2
+/*
+ * Maximum blocksizes supported by daemon under various circumstances.
+ */
+#define NFSSVC_MAXBLKSIZE       RPCSVC_MAXPAYLOAD
+/* NFSv2 is limited by the protocol specification, see RFC 1094 */
+#define NFSSVC_MAXBLKSIZE_V2    (8*1024)
+
+
+/*
+ * Largest number of bytes we need to allocate for an NFS
+ * call or reply.  Used to control buffer sizes.  We use
+ * the length of v3 WRITE, READDIR and READDIR replies
+ * which are an RPC header, up to 26 XDR units of reply
+ * data, and some page data.
+ *
+ * Note that accuracy here doesn't matter too much as the
+ * size is rounded up to a page size when allocating space.
+ */
+#define NFSD_BUFSIZE            ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
+
+struct readdir_cd {
+	__be32			err;	/* 0, nfserr, or nfserr_eof */
+};
+
+
+extern struct svc_program	nfsd_program;
+extern struct svc_version	nfsd_version2, nfsd_version3,
+				nfsd_version4;
+extern struct mutex		nfsd_mutex;
+extern spinlock_t		nfsd_drc_lock;
+extern unsigned long		nfsd_drc_max_mem;
+extern unsigned long		nfsd_drc_mem_used;
+
+extern const struct seq_operations nfs_exports_op;
+
+/*
+ * Function prototypes.
+ */
+int		nfsd_svc(int nrservs, struct net *net);
+int		nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
+
+int		nfsd_nrthreads(struct net *);
+int		nfsd_nrpools(struct net *);
+int		nfsd_get_nrthreads(int n, int *, struct net *);
+int		nfsd_set_nrthreads(int n, int *, struct net *);
+int		nfsd_pool_stats_open(struct inode *, struct file *);
+int		nfsd_pool_stats_release(struct inode *, struct file *);
+
+void		nfsd_destroy(struct net *net);
+
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+#ifdef CONFIG_NFSD_V2_ACL
+extern struct svc_version nfsd_acl_version2;
+#else
+#define nfsd_acl_version2 NULL
+#endif
+#ifdef CONFIG_NFSD_V3_ACL
+extern struct svc_version nfsd_acl_version3;
+#else
+#define nfsd_acl_version3 NULL
+#endif
+#endif
+
+enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
+int nfsd_vers(int vers, enum vers_op change);
+int nfsd_minorversion(u32 minorversion, enum vers_op change);
+void nfsd_reset_versions(void);
+int nfsd_create_serv(struct net *net);
+
+extern int nfsd_max_blksize;
+
+static inline int nfsd_v4client(struct svc_rqst *rq)
+{
+	return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
+}
+
+/* 
+ * NFSv4 State
+ */
+#ifdef CONFIG_NFSD_V4
+extern unsigned long max_delegations;
+int nfsd4_init_slabs(void);
+void nfsd4_free_slabs(void);
+int nfs4_state_start(void);
+int nfs4_state_start_net(struct net *net);
+void nfs4_state_shutdown(void);
+void nfs4_state_shutdown_net(struct net *net);
+void nfs4_reset_lease(time_t leasetime);
+int nfs4_reset_recoverydir(char *recdir);
+char * nfs4_recoverydir(void);
+#else
+static inline int nfsd4_init_slabs(void) { return 0; }
+static inline void nfsd4_free_slabs(void) { }
+static inline int nfs4_state_start(void) { return 0; }
+static inline int nfs4_state_start_net(struct net *net) { return 0; }
+static inline void nfs4_state_shutdown(void) { }
+static inline void nfs4_state_shutdown_net(struct net *net) { }
+static inline void nfs4_reset_lease(time_t leasetime) { }
+static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
+static inline char * nfs4_recoverydir(void) {return NULL; }
+#endif
+
+/*
+ * lockd binding
+ */
+void		nfsd_lockd_init(void);
+void		nfsd_lockd_shutdown(void);
+
+
+/*
+ * These macros provide pre-xdr'ed values for faster operation.
+ */
+#define	nfs_ok			cpu_to_be32(NFS_OK)
+#define	nfserr_perm		cpu_to_be32(NFSERR_PERM)
+#define	nfserr_noent		cpu_to_be32(NFSERR_NOENT)
+#define	nfserr_io		cpu_to_be32(NFSERR_IO)
+#define	nfserr_nxio		cpu_to_be32(NFSERR_NXIO)
+#define	nfserr_eagain		cpu_to_be32(NFSERR_EAGAIN)
+#define	nfserr_acces		cpu_to_be32(NFSERR_ACCES)
+#define	nfserr_exist		cpu_to_be32(NFSERR_EXIST)
+#define	nfserr_xdev		cpu_to_be32(NFSERR_XDEV)
+#define	nfserr_nodev		cpu_to_be32(NFSERR_NODEV)
+#define	nfserr_notdir		cpu_to_be32(NFSERR_NOTDIR)
+#define	nfserr_isdir		cpu_to_be32(NFSERR_ISDIR)
+#define	nfserr_inval		cpu_to_be32(NFSERR_INVAL)
+#define	nfserr_fbig		cpu_to_be32(NFSERR_FBIG)
+#define	nfserr_nospc		cpu_to_be32(NFSERR_NOSPC)
+#define	nfserr_rofs		cpu_to_be32(NFSERR_ROFS)
+#define	nfserr_mlink		cpu_to_be32(NFSERR_MLINK)
+#define	nfserr_opnotsupp	cpu_to_be32(NFSERR_OPNOTSUPP)
+#define	nfserr_nametoolong	cpu_to_be32(NFSERR_NAMETOOLONG)
+#define	nfserr_notempty		cpu_to_be32(NFSERR_NOTEMPTY)
+#define	nfserr_dquot		cpu_to_be32(NFSERR_DQUOT)
+#define	nfserr_stale		cpu_to_be32(NFSERR_STALE)
+#define	nfserr_remote		cpu_to_be32(NFSERR_REMOTE)
+#define	nfserr_wflush		cpu_to_be32(NFSERR_WFLUSH)
+#define	nfserr_badhandle	cpu_to_be32(NFSERR_BADHANDLE)
+#define	nfserr_notsync		cpu_to_be32(NFSERR_NOT_SYNC)
+#define	nfserr_badcookie	cpu_to_be32(NFSERR_BAD_COOKIE)
+#define	nfserr_notsupp		cpu_to_be32(NFSERR_NOTSUPP)
+#define	nfserr_toosmall		cpu_to_be32(NFSERR_TOOSMALL)
+#define	nfserr_serverfault	cpu_to_be32(NFSERR_SERVERFAULT)
+#define	nfserr_badtype		cpu_to_be32(NFSERR_BADTYPE)
+#define	nfserr_jukebox		cpu_to_be32(NFSERR_JUKEBOX)
+#define	nfserr_denied		cpu_to_be32(NFSERR_DENIED)
+#define	nfserr_deadlock		cpu_to_be32(NFSERR_DEADLOCK)
+#define nfserr_expired          cpu_to_be32(NFSERR_EXPIRED)
+#define	nfserr_bad_cookie	cpu_to_be32(NFSERR_BAD_COOKIE)
+#define	nfserr_same		cpu_to_be32(NFSERR_SAME)
+#define	nfserr_clid_inuse	cpu_to_be32(NFSERR_CLID_INUSE)
+#define	nfserr_stale_clientid	cpu_to_be32(NFSERR_STALE_CLIENTID)
+#define	nfserr_resource		cpu_to_be32(NFSERR_RESOURCE)
+#define	nfserr_moved		cpu_to_be32(NFSERR_MOVED)
+#define	nfserr_nofilehandle	cpu_to_be32(NFSERR_NOFILEHANDLE)
+#define	nfserr_minor_vers_mismatch	cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH)
+#define nfserr_share_denied	cpu_to_be32(NFSERR_SHARE_DENIED)
+#define nfserr_stale_stateid	cpu_to_be32(NFSERR_STALE_STATEID)
+#define nfserr_old_stateid	cpu_to_be32(NFSERR_OLD_STATEID)
+#define nfserr_bad_stateid	cpu_to_be32(NFSERR_BAD_STATEID)
+#define nfserr_bad_seqid	cpu_to_be32(NFSERR_BAD_SEQID)
+#define	nfserr_symlink		cpu_to_be32(NFSERR_SYMLINK)
+#define	nfserr_not_same		cpu_to_be32(NFSERR_NOT_SAME)
+#define nfserr_lock_range	cpu_to_be32(NFSERR_LOCK_RANGE)
+#define	nfserr_restorefh	cpu_to_be32(NFSERR_RESTOREFH)
+#define	nfserr_attrnotsupp	cpu_to_be32(NFSERR_ATTRNOTSUPP)
+#define	nfserr_bad_xdr		cpu_to_be32(NFSERR_BAD_XDR)
+#define	nfserr_openmode		cpu_to_be32(NFSERR_OPENMODE)
+#define	nfserr_badowner		cpu_to_be32(NFSERR_BADOWNER)
+#define	nfserr_locks_held	cpu_to_be32(NFSERR_LOCKS_HELD)
+#define	nfserr_op_illegal	cpu_to_be32(NFSERR_OP_ILLEGAL)
+#define	nfserr_grace		cpu_to_be32(NFSERR_GRACE)
+#define	nfserr_no_grace		cpu_to_be32(NFSERR_NO_GRACE)
+#define	nfserr_reclaim_bad	cpu_to_be32(NFSERR_RECLAIM_BAD)
+#define	nfserr_badname		cpu_to_be32(NFSERR_BADNAME)
+#define	nfserr_cb_path_down	cpu_to_be32(NFSERR_CB_PATH_DOWN)
+#define	nfserr_locked		cpu_to_be32(NFSERR_LOCKED)
+#define	nfserr_wrongsec		cpu_to_be32(NFSERR_WRONGSEC)
+#define nfserr_badiomode		cpu_to_be32(NFS4ERR_BADIOMODE)
+#define nfserr_badlayout		cpu_to_be32(NFS4ERR_BADLAYOUT)
+#define nfserr_bad_session_digest	cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST)
+#define nfserr_badsession		cpu_to_be32(NFS4ERR_BADSESSION)
+#define nfserr_badslot			cpu_to_be32(NFS4ERR_BADSLOT)
+#define nfserr_complete_already		cpu_to_be32(NFS4ERR_COMPLETE_ALREADY)
+#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
+#define nfserr_deleg_already_wanted	cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED)
+#define nfserr_back_chan_busy		cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY)
+#define nfserr_layouttrylater		cpu_to_be32(NFS4ERR_LAYOUTTRYLATER)
+#define nfserr_layoutunavailable	cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE)
+#define nfserr_nomatching_layout	cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT)
+#define nfserr_recallconflict		cpu_to_be32(NFS4ERR_RECALLCONFLICT)
+#define nfserr_unknown_layouttype	cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE)
+#define nfserr_seq_misordered		cpu_to_be32(NFS4ERR_SEQ_MISORDERED)
+#define nfserr_sequence_pos		cpu_to_be32(NFS4ERR_SEQUENCE_POS)
+#define nfserr_req_too_big		cpu_to_be32(NFS4ERR_REQ_TOO_BIG)
+#define nfserr_rep_too_big		cpu_to_be32(NFS4ERR_REP_TOO_BIG)
+#define nfserr_rep_too_big_to_cache	cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE)
+#define nfserr_retry_uncached_rep	cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP)
+#define nfserr_unsafe_compound		cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND)
+#define nfserr_too_many_ops		cpu_to_be32(NFS4ERR_TOO_MANY_OPS)
+#define nfserr_op_not_in_session	cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION)
+#define nfserr_hash_alg_unsupp		cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP)
+#define nfserr_clientid_busy		cpu_to_be32(NFS4ERR_CLIENTID_BUSY)
+#define nfserr_pnfs_io_hole		cpu_to_be32(NFS4ERR_PNFS_IO_HOLE)
+#define nfserr_seq_false_retry		cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY)
+#define nfserr_bad_high_slot		cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT)
+#define nfserr_deadsession		cpu_to_be32(NFS4ERR_DEADSESSION)
+#define nfserr_encr_alg_unsupp		cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP)
+#define nfserr_pnfs_no_layout		cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT)
+#define nfserr_not_only_op		cpu_to_be32(NFS4ERR_NOT_ONLY_OP)
+#define nfserr_wrong_cred		cpu_to_be32(NFS4ERR_WRONG_CRED)
+#define nfserr_wrong_type		cpu_to_be32(NFS4ERR_WRONG_TYPE)
+#define nfserr_dirdeleg_unavail		cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL)
+#define nfserr_reject_deleg		cpu_to_be32(NFS4ERR_REJECT_DELEG)
+#define nfserr_returnconflict		cpu_to_be32(NFS4ERR_RETURNCONFLICT)
+#define nfserr_deleg_revoked		cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define nfserr_partner_notsupp		cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define nfserr_partner_no_auth		cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define nfserr_union_notsupp		cpu_to_be32(NFS4ERR_UNION_NOTSUPP)
+#define nfserr_offload_denied		cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define nfserr_wrong_lfs		cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define nfserr_badlabel		cpu_to_be32(NFS4ERR_BADLABEL)
+
+/* error codes for internal use */
+/* if a request fails due to kmalloc failure, it gets dropped.
+ *  Client should resend eventually
+ */
+#define	nfserr_dropit		cpu_to_be32(30000)
+/* end-of-file indicator in readdir */
+#define	nfserr_eof		cpu_to_be32(30001)
+/* replay detected */
+#define	nfserr_replay_me	cpu_to_be32(11001)
+/* nfs41 replay detected */
+#define	nfserr_replay_cache	cpu_to_be32(11002)
+
+/* Check for dir entries '.' and '..' */
+#define isdotent(n, l)	(l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
+
+#ifdef CONFIG_NFSD_V4
+
+/* before processing a COMPOUND operation, we have to check that there
+ * is enough space in the buffer for XDR encode to succeed.  otherwise,
+ * we might process an operation with side effects, and be unable to
+ * tell the client that the operation succeeded.
+ *
+ * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space
+ * needed to encode an "ordinary" _successful_ operation.  (GETATTR,
+ * READ, READDIR, and READLINK have their own buffer checks.)  if we
+ * fall below this level, we fail the next operation with NFS4ERR_RESOURCE.
+ *
+ * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space
+ * needed to encode an operation which has failed with NFS4ERR_RESOURCE.
+ * care is taken to ensure that we never fall below this level for any
+ * reason.
+ */
+#define	COMPOUND_SLACK_SPACE		140    /* OP_GETFH */
+#define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
+
+#define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
+
+/*
+ * The following attributes are currently not supported by the NFSv4 server:
+ *    ARCHIVE       (deprecated anyway)
+ *    HIDDEN        (unlikely to be supported any time soon)
+ *    MIMETYPE      (unlikely to be supported any time soon)
+ *    QUOTA_*       (will be supported in a forthcoming patch)
+ *    SYSTEM        (unlikely to be supported any time soon)
+ *    TIME_BACKUP   (unlikely to be supported any time soon)
+ *    TIME_CREATE   (unlikely to be supported any time soon)
+ */
+#define NFSD4_SUPPORTED_ATTRS_WORD0                                                         \
+(FATTR4_WORD0_SUPPORTED_ATTRS   | FATTR4_WORD0_TYPE         | FATTR4_WORD0_FH_EXPIRE_TYPE   \
+ | FATTR4_WORD0_CHANGE          | FATTR4_WORD0_SIZE         | FATTR4_WORD0_LINK_SUPPORT     \
+ | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR   | FATTR4_WORD0_FSID             \
+ | FATTR4_WORD0_UNIQUE_HANDLES  | FATTR4_WORD0_LEASE_TIME   | FATTR4_WORD0_RDATTR_ERROR     \
+ | FATTR4_WORD0_ACLSUPPORT      | FATTR4_WORD0_CANSETTIME   | FATTR4_WORD0_CASE_INSENSITIVE \
+ | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED                             \
+ | FATTR4_WORD0_FILEHANDLE      | FATTR4_WORD0_FILEID       | FATTR4_WORD0_FILES_AVAIL      \
+ | FATTR4_WORD0_FILES_FREE      | FATTR4_WORD0_FILES_TOTAL  | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS      \
+ | FATTR4_WORD0_MAXFILESIZE     | FATTR4_WORD0_MAXLINK      | FATTR4_WORD0_MAXNAME          \
+ | FATTR4_WORD0_MAXREAD         | FATTR4_WORD0_MAXWRITE     | FATTR4_WORD0_ACL)
+
+#define NFSD4_SUPPORTED_ATTRS_WORD1                                                         \
+(FATTR4_WORD1_MODE              | FATTR4_WORD1_NO_TRUNC     | FATTR4_WORD1_NUMLINKS         \
+ | FATTR4_WORD1_OWNER	        | FATTR4_WORD1_OWNER_GROUP  | FATTR4_WORD1_RAWDEV           \
+ | FATTR4_WORD1_SPACE_AVAIL     | FATTR4_WORD1_SPACE_FREE   | FATTR4_WORD1_SPACE_TOTAL      \
+ | FATTR4_WORD1_SPACE_USED      | FATTR4_WORD1_TIME_ACCESS  | FATTR4_WORD1_TIME_ACCESS_SET  \
+ | FATTR4_WORD1_TIME_DELTA   | FATTR4_WORD1_TIME_METADATA    \
+ | FATTR4_WORD1_TIME_MODIFY     | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
+
+#define NFSD4_SUPPORTED_ATTRS_WORD2 0
+
+/* 4.1 */
+#ifdef CONFIG_NFSD_PNFS
+#define PNFSD_SUPPORTED_ATTRS_WORD1	FATTR4_WORD1_FS_LAYOUT_TYPES
+#define PNFSD_SUPPORTED_ATTRS_WORD2 \
+(FATTR4_WORD2_LAYOUT_BLKSIZE	| FATTR4_WORD2_LAYOUT_TYPES)
+#else
+#define PNFSD_SUPPORTED_ATTRS_WORD1	0
+#define PNFSD_SUPPORTED_ATTRS_WORD2	0
+#endif /* CONFIG_NFSD_PNFS */
+
+#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
+	NFSD4_SUPPORTED_ATTRS_WORD0
+
+#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
+	(NFSD4_SUPPORTED_ATTRS_WORD1	| PNFSD_SUPPORTED_ATTRS_WORD1)
+
+#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
+	(NFSD4_SUPPORTED_ATTRS_WORD2	| PNFSD_SUPPORTED_ATTRS_WORD2 | \
+	 FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+
+/* 4.2 */
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD4_2_SECURITY_ATTRS		FATTR4_WORD2_SECURITY_LABEL
+#else
+#define NFSD4_2_SECURITY_ATTRS		0
+#endif
+
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+	(NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
+	NFSD4_2_SECURITY_ATTRS)
+
+static inline u32 nfsd_suppattrs0(u32 minorversion)
+{
+	return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
+			    : NFSD4_SUPPORTED_ATTRS_WORD0;
+}
+
+static inline u32 nfsd_suppattrs1(u32 minorversion)
+{
+	return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1
+			    : NFSD4_SUPPORTED_ATTRS_WORD1;
+}
+
+static inline u32 nfsd_suppattrs2(u32 minorversion)
+{
+	switch (minorversion) {
+	default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
+	case 1:  return NFSD4_1_SUPPORTED_ATTRS_WORD2;
+	case 0:  return NFSD4_SUPPORTED_ATTRS_WORD2;
+	}
+}
+
+/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
+#define NFSD_WRITEONLY_ATTRS_WORD1 \
+	(FATTR4_WORD1_TIME_ACCESS_SET   | FATTR4_WORD1_TIME_MODIFY_SET)
+
+/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
+#define NFSD_WRITEABLE_ATTRS_WORD0 \
+	(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
+#define NFSD_WRITEABLE_ATTRS_WORD1 \
+	(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+	| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
+#else
+#define NFSD_WRITEABLE_ATTRS_WORD2 0
+#endif
+
+#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
+	NFSD_WRITEABLE_ATTRS_WORD0
+/*
+ * we currently store the exclusive create verifier in the v_{a,m}time
+ * attributes so the client can't set these at create time using EXCLUSIVE4_1
+ */
+#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \
+	(NFSD_WRITEABLE_ATTRS_WORD1 & \
+	 ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET))
+#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
+	NFSD_WRITEABLE_ATTRS_WORD2
+
+extern int nfsd4_is_junction(struct dentry *dentry);
+extern int register_cld_notifier(void);
+extern void unregister_cld_notifier(void);
+#else /* CONFIG_NFSD_V4 */
+static inline int nfsd4_is_junction(struct dentry *dentry)
+{
+	return 0;
+}
+
+#define register_cld_notifier() 0
+#define unregister_cld_notifier() do { } while(0)
+
+#endif /* CONFIG_NFSD_V4 */
+
+#endif /* LINUX_NFSD_NFSD_H */
diff --git a/kernel/fs/nfsd/nfsfh.c b/kernel/fs/nfsd/nfsfh.c
new file mode 100644
index 000000000..350041a40
--- /dev/null
+++ b/kernel/fs/nfsd/nfsfh.c
@@ -0,0 +1,692 @@
+/*
+ * NFS server file handle treatment.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ * Portions Copyright (C) 1999 G. Allen Morris III <gam3@acm.org>
+ * Extensive rewrite by Neil Brown <neilb@cse.unsw.edu.au> Southern-Spring 1999
+ * ... and again Southern-Winter 2001 to support export_operations
+ */
+
+#include <linux/exportfs.h>
+
+#include <linux/sunrpc/svcauth_gss.h>
+#include "nfsd.h"
+#include "vfs.h"
+#include "auth.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_FH
+
+
+/*
+ * our acceptability function.
+ * if NOSUBTREECHECK, accept anything
+ * if not, require that we can walk up to exp->ex_dentry
+ * doing some checks on the 'x' bits
+ */
+static int nfsd_acceptable(void *expv, struct dentry *dentry)
+{
+	struct svc_export *exp = expv;
+	int rv;
+	struct dentry *tdentry;
+	struct dentry *parent;
+
+	if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
+		return 1;
+
+	tdentry = dget(dentry);
+	while (tdentry != exp->ex_path.dentry && !IS_ROOT(tdentry)) {
+		/* make sure parents give x permission to user */
+		int err;
+		parent = dget_parent(tdentry);
+		err = inode_permission(d_inode(parent), MAY_EXEC);
+		if (err < 0) {
+			dput(parent);
+			break;
+		}
+		dput(tdentry);
+		tdentry = parent;
+	}
+	if (tdentry != exp->ex_path.dentry)
+		dprintk("nfsd_acceptable failed at %p %pd\n", tdentry, tdentry);
+	rv = (tdentry == exp->ex_path.dentry);
+	dput(tdentry);
+	return rv;
+}
+
+/* Type check. The correct error return for type mismatches does not seem to be
+ * generally agreed upon. SunOS seems to use EISDIR if file isn't S_IFREG; a
+ * comment in the NFSv3 spec says this is incorrect (implementation notes for
+ * the write call).
+ */
+static inline __be32
+nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested)
+{
+	mode &= S_IFMT;
+
+	if (requested == 0) /* the caller doesn't care */
+		return nfs_ok;
+	if (mode == requested)
+		return nfs_ok;
+	/*
+	 * v4 has an error more specific than err_notdir which we should
+	 * return in preference to err_notdir:
+	 */
+	if (rqstp->rq_vers == 4 && mode == S_IFLNK)
+		return nfserr_symlink;
+	if (requested == S_IFDIR)
+		return nfserr_notdir;
+	if (mode == S_IFDIR)
+		return nfserr_isdir;
+	return nfserr_inval;
+}
+
+static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
+					  struct svc_export *exp)
+{
+	int flags = nfsexp_flags(rqstp, exp);
+
+	/* Check if the request originated from a secure port. */
+	if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) {
+		RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+		dprintk("nfsd: request from insecure port %s!\n",
+		        svc_print_addr(rqstp, buf, sizeof(buf)));
+		return nfserr_perm;
+	}
+
+	/* Set user creds for this exportpoint */
+	return nfserrno(nfsd_setuser(rqstp, exp));
+}
+
+static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
+	struct dentry *dentry, struct svc_export *exp)
+{
+	if (!(exp->ex_flags & NFSEXP_V4ROOT))
+		return nfs_ok;
+	/*
+	 * v2/v3 clients have no need for the V4ROOT export--they use
+	 * the mount protocl instead; also, further V4ROOT checks may be
+	 * in v4-specific code, in which case v2/v3 clients could bypass
+	 * them.
+	 */
+	if (!nfsd_v4client(rqstp))
+		return nfserr_stale;
+	/*
+	 * We're exposing only the directories and symlinks that have to be
+	 * traversed on the way to real exports:
+	 */
+	if (unlikely(!d_is_dir(dentry) &&
+		     !d_is_symlink(dentry)))
+		return nfserr_stale;
+	/*
+	 * A pseudoroot export gives permission to access only one
+	 * single directory; the kernel has to make another upcall
+	 * before granting access to anything else under it:
+	 */
+	if (unlikely(dentry != exp->ex_path.dentry))
+		return nfserr_stale;
+	return nfs_ok;
+}
+
+/*
+ * Use the given filehandle to look up the corresponding export and
+ * dentry.  On success, the results are used to set fh_export and
+ * fh_dentry.
+ */
+static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+{
+	struct knfsd_fh	*fh = &fhp->fh_handle;
+	struct fid *fid = NULL, sfid;
+	struct svc_export *exp;
+	struct dentry *dentry;
+	int fileid_type;
+	int data_left = fh->fh_size/4;
+	__be32 error;
+
+	error = nfserr_stale;
+	if (rqstp->rq_vers > 2)
+		error = nfserr_badhandle;
+	if (rqstp->rq_vers == 4 && fh->fh_size == 0)
+		return nfserr_nofilehandle;
+
+	if (fh->fh_version == 1) {
+		int len;
+
+		if (--data_left < 0)
+			return error;
+		if (fh->fh_auth_type != 0)
+			return error;
+		len = key_len(fh->fh_fsid_type) / 4;
+		if (len == 0)
+			return error;
+		if  (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+			/* deprecated, convert to type 3 */
+			len = key_len(FSID_ENCODE_DEV)/4;
+			fh->fh_fsid_type = FSID_ENCODE_DEV;
+			/*
+			 * struct knfsd_fh uses host-endian fields, which are
+			 * sometimes used to hold net-endian values. This
+			 * confuses sparse, so we must use __force here to
+			 * keep it from complaining.
+			 */
+			fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+							ntohl((__force __be32)fh->fh_fsid[1])));
+			fh->fh_fsid[1] = fh->fh_fsid[2];
+		}
+		data_left -= len;
+		if (data_left < 0)
+			return error;
+		exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+		fid = (struct fid *)(fh->fh_fsid + len);
+	} else {
+		__u32 tfh[2];
+		dev_t xdev;
+		ino_t xino;
+
+		if (fh->fh_size != NFS_FHSIZE)
+			return error;
+		/* assume old filehandle format */
+		xdev = old_decode_dev(fh->ofh_xdev);
+		xino = u32_to_ino_t(fh->ofh_xino);
+		mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
+		exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
+	}
+
+	error = nfserr_stale;
+	if (PTR_ERR(exp) == -ENOENT)
+		return error;
+
+	if (IS_ERR(exp))
+		return nfserrno(PTR_ERR(exp));
+
+	if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
+		/* Elevate privileges so that the lack of 'r' or 'x'
+		 * permission on some parent directory will
+		 * not stop exportfs_decode_fh from being able
+		 * to reconnect a directory into the dentry cache.
+		 * The same problem can affect "SUBTREECHECK" exports,
+		 * but as nfsd_acceptable depends on correct
+		 * access control settings being in effect, we cannot
+		 * fix that case easily.
+		 */
+		struct cred *new = prepare_creds();
+		if (!new) {
+			error =  nfserrno(-ENOMEM);
+			goto out;
+		}
+		new->cap_effective =
+			cap_raise_nfsd_set(new->cap_effective,
+					   new->cap_permitted);
+		put_cred(override_creds(new));
+		put_cred(new);
+	} else {
+		error = nfsd_setuser_and_check_port(rqstp, exp);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * Look up the dentry using the NFS file handle.
+	 */
+	error = nfserr_stale;
+	if (rqstp->rq_vers > 2)
+		error = nfserr_badhandle;
+
+	if (fh->fh_version != 1) {
+		sfid.i32.ino = fh->ofh_ino;
+		sfid.i32.gen = fh->ofh_generation;
+		sfid.i32.parent_ino = fh->ofh_dirino;
+		fid = &sfid;
+		data_left = 3;
+		if (fh->ofh_dirino == 0)
+			fileid_type = FILEID_INO32_GEN;
+		else
+			fileid_type = FILEID_INO32_GEN_PARENT;
+	} else
+		fileid_type = fh->fh_fileid_type;
+
+	if (fileid_type == FILEID_ROOT)
+		dentry = dget(exp->ex_path.dentry);
+	else {
+		dentry = exportfs_decode_fh(exp->ex_path.mnt, fid,
+				data_left, fileid_type,
+				nfsd_acceptable, exp);
+	}
+	if (dentry == NULL)
+		goto out;
+	if (IS_ERR(dentry)) {
+		if (PTR_ERR(dentry) != -EINVAL)
+			error = nfserrno(PTR_ERR(dentry));
+		goto out;
+	}
+
+	if (d_is_dir(dentry) &&
+			(dentry->d_flags & DCACHE_DISCONNECTED)) {
+		printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n",
+				dentry);
+	}
+
+	fhp->fh_dentry = dentry;
+	fhp->fh_export = exp;
+	return 0;
+out:
+	exp_put(exp);
+	return error;
+}
+
+/**
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
+ *
+ * fh_verify() may be called multiple times on a given filehandle, for
+ * example, when processing an NFSv4 compound.  The first call will look
+ * up a dentry using the on-the-wire filehandle.  Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
+ *
+ * @type specifies the type of object expected using one of the S_IF*
+ * constants defined in include/linux/stat.h.  The caller may use zero
+ * to indicate that it doesn't care, or a negative integer to indicate
+ * that it expects something not of the given type.
+ *
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * include/linux/nfsd/nfsd.h.
+ */
+__be32
+fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
+{
+	struct svc_export *exp;
+	struct dentry	*dentry;
+	__be32		error;
+
+	dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
+
+	if (!fhp->fh_dentry) {
+		error = nfsd_set_fh_dentry(rqstp, fhp);
+		if (error)
+			goto out;
+	}
+	dentry = fhp->fh_dentry;
+	exp = fhp->fh_export;
+	/*
+	 * We still have to do all these permission checks, even when
+	 * fh_dentry is already set:
+	 * 	- fh_verify may be called multiple times with different
+	 * 	  "access" arguments (e.g. nfsd_proc_create calls
+	 * 	  fh_verify(...,NFSD_MAY_EXEC) first, then later (in
+	 * 	  nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE).
+	 *	- in the NFSv4 case, the filehandle may have been filled
+	 *	  in by fh_compose, and given a dentry, but further
+	 *	  compound operations performed with that filehandle
+	 *	  still need permissions checks.  In the worst case, a
+	 *	  mountpoint crossing may have changed the export
+	 *	  options, and we may now need to use a different uid
+	 *	  (for example, if different id-squashing options are in
+	 *	  effect on the new filesystem).
+	 */
+	error = check_pseudo_root(rqstp, dentry, exp);
+	if (error)
+		goto out;
+
+	error = nfsd_setuser_and_check_port(rqstp, exp);
+	if (error)
+		goto out;
+
+	error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type);
+	if (error)
+		goto out;
+
+	/*
+	 * pseudoflavor restrictions are not enforced on NLM,
+	 * which clients virtually always use auth_sys for,
+	 * even while using RPCSEC_GSS for NFS.
+	 */
+	if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS)
+		goto skip_pseudoflavor_check;
+	/*
+	 * Clients may expect to be able to use auth_sys during mount,
+	 * even if they use gss for everything else; see section 2.3.2
+	 * of rfc 2623.
+	 */
+	if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
+			&& exp->ex_path.dentry == dentry)
+		goto skip_pseudoflavor_check;
+
+	error = check_nfsd_access(exp, rqstp);
+	if (error)
+		goto out;
+
+skip_pseudoflavor_check:
+	/* Finally, check access permissions. */
+	error = nfsd_permission(rqstp, exp, dentry, access);
+
+	if (error) {
+		dprintk("fh_verify: %pd2 permission failure, "
+			"acc=%x, error=%d\n",
+			dentry,
+			access, ntohl(error));
+	}
+out:
+	if (error == nfserr_stale)
+		nfsdstats.fh_stale++;
+	return error;
+}
+
+
+/*
+ * Compose a file handle for an NFS reply.
+ *
+ * Note that when first composed, the dentry may not yet have
+ * an inode.  In this case a call to fh_update should be made
+ * before the fh goes out on the wire ...
+ */
+static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
+		struct dentry *dentry)
+{
+	if (dentry != exp->ex_path.dentry) {
+		struct fid *fid = (struct fid *)
+			(fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1);
+		int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
+		int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK);
+
+		fhp->fh_handle.fh_fileid_type =
+			exportfs_encode_fh(dentry, fid, &maxsize, subtreecheck);
+		fhp->fh_handle.fh_size += maxsize * 4;
+	} else {
+		fhp->fh_handle.fh_fileid_type = FILEID_ROOT;
+	}
+}
+
+/*
+ * for composing old style file handles
+ */
+static inline void _fh_update_old(struct dentry *dentry,
+				  struct svc_export *exp,
+				  struct knfsd_fh *fh)
+{
+	fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino);
+	fh->ofh_generation = d_inode(dentry)->i_generation;
+	if (d_is_dir(dentry) ||
+	    (exp->ex_flags & NFSEXP_NOSUBTREECHECK))
+		fh->ofh_dirino = 0;
+}
+
+static bool is_root_export(struct svc_export *exp)
+{
+	return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
+}
+
+static struct super_block *exp_sb(struct svc_export *exp)
+{
+	return d_inode(exp->ex_path.dentry)->i_sb;
+}
+
+static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
+{
+	switch (fsid_type) {
+	case FSID_DEV:
+		if (!old_valid_dev(exp_sb(exp)->s_dev))
+			return 0;
+		/* FALL THROUGH */
+	case FSID_MAJOR_MINOR:
+	case FSID_ENCODE_DEV:
+		return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV;
+	case FSID_NUM:
+		return exp->ex_flags & NFSEXP_FSID;
+	case FSID_UUID8:
+	case FSID_UUID16:
+		if (!is_root_export(exp))
+			return 0;
+		/* fall through */
+	case FSID_UUID4_INUM:
+	case FSID_UUID16_INUM:
+		return exp->ex_uuid != NULL;
+	}
+	return 1;
+}
+
+
+static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh)
+{
+	u8 version;
+	u8 fsid_type;
+retry:
+	version = 1;
+	if (ref_fh && ref_fh->fh_export == exp) {
+		version = ref_fh->fh_handle.fh_version;
+		fsid_type = ref_fh->fh_handle.fh_fsid_type;
+
+		ref_fh = NULL;
+
+		switch (version) {
+		case 0xca:
+			fsid_type = FSID_DEV;
+			break;
+		case 1:
+			break;
+		default:
+			goto retry;
+		}
+
+		/*
+		 * As the fsid -> filesystem mapping was guided by
+		 * user-space, there is no guarantee that the filesystem
+		 * actually supports that fsid type. If it doesn't we
+		 * loop around again without ref_fh set.
+		 */
+		if (!fsid_type_ok_for_exp(fsid_type, exp))
+			goto retry;
+	} else if (exp->ex_flags & NFSEXP_FSID) {
+		fsid_type = FSID_NUM;
+	} else if (exp->ex_uuid) {
+		if (fhp->fh_maxsize >= 64) {
+			if (is_root_export(exp))
+				fsid_type = FSID_UUID16;
+			else
+				fsid_type = FSID_UUID16_INUM;
+		} else {
+			if (is_root_export(exp))
+				fsid_type = FSID_UUID8;
+			else
+				fsid_type = FSID_UUID4_INUM;
+		}
+	} else if (!old_valid_dev(exp_sb(exp)->s_dev))
+		/* for newer device numbers, we must use a newer fsid format */
+		fsid_type = FSID_ENCODE_DEV;
+	else
+		fsid_type = FSID_DEV;
+	fhp->fh_handle.fh_version = version;
+	if (version)
+		fhp->fh_handle.fh_fsid_type = fsid_type;
+}
+
+__be32
+fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+	   struct svc_fh *ref_fh)
+{
+	/* ref_fh is a reference file handle.
+	 * if it is non-null and for the same filesystem, then we should compose
+	 * a filehandle which is of the same version, where possible.
+	 * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
+	 * Then create a 32byte filehandle using nfs_fhbase_old
+	 *
+	 */
+
+	struct inode * inode = d_inode(dentry);
+	dev_t ex_dev = exp_sb(exp)->s_dev;
+
+	dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
+		MAJOR(ex_dev), MINOR(ex_dev),
+		(long) d_inode(exp->ex_path.dentry)->i_ino,
+		dentry,
+		(inode ? inode->i_ino : 0));
+
+	/* Choose filehandle version and fsid type based on
+	 * the reference filehandle (if it is in the same export)
+	 * or the export options.
+	 */
+	 set_version_and_fsid_type(fhp, exp, ref_fh);
+
+	if (ref_fh == fhp)
+		fh_put(ref_fh);
+
+	if (fhp->fh_locked || fhp->fh_dentry) {
+		printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n",
+		       dentry);
+	}
+	if (fhp->fh_maxsize < NFS_FHSIZE)
+		printk(KERN_ERR "fh_compose: called with maxsize %d! %pd2\n",
+		       fhp->fh_maxsize,
+		       dentry);
+
+	fhp->fh_dentry = dget(dentry); /* our internal copy */
+	fhp->fh_export = exp_get(exp);
+
+	if (fhp->fh_handle.fh_version == 0xca) {
+		/* old style filehandle please */
+		memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
+		fhp->fh_handle.fh_size = NFS_FHSIZE;
+		fhp->fh_handle.ofh_dcookie = 0xfeebbaca;
+		fhp->fh_handle.ofh_dev =  old_encode_dev(ex_dev);
+		fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
+		fhp->fh_handle.ofh_xino =
+			ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
+		fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
+		if (inode)
+			_fh_update_old(dentry, exp, &fhp->fh_handle);
+	} else {
+		fhp->fh_handle.fh_size =
+			key_len(fhp->fh_handle.fh_fsid_type) + 4;
+		fhp->fh_handle.fh_auth_type = 0;
+
+		mk_fsid(fhp->fh_handle.fh_fsid_type,
+			fhp->fh_handle.fh_fsid,
+			ex_dev,
+			d_inode(exp->ex_path.dentry)->i_ino,
+			exp->ex_fsid, exp->ex_uuid);
+
+		if (inode)
+			_fh_update(fhp, exp, dentry);
+		if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
+			fh_put(fhp);
+			return nfserr_opnotsupp;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Update file handle information after changing a dentry.
+ * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create
+ */
+__be32
+fh_update(struct svc_fh *fhp)
+{
+	struct dentry *dentry;
+
+	if (!fhp->fh_dentry)
+		goto out_bad;
+
+	dentry = fhp->fh_dentry;
+	if (d_really_is_negative(dentry))
+		goto out_negative;
+	if (fhp->fh_handle.fh_version != 1) {
+		_fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
+	} else {
+		if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
+			return 0;
+
+		_fh_update(fhp, fhp->fh_export, dentry);
+		if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
+			return nfserr_opnotsupp;
+	}
+	return 0;
+out_bad:
+	printk(KERN_ERR "fh_update: fh not verified!\n");
+	return nfserr_serverfault;
+out_negative:
+	printk(KERN_ERR "fh_update: %pd2 still negative!\n",
+		dentry);
+	return nfserr_serverfault;
+}
+
+/*
+ * Release a file handle.
+ */
+void
+fh_put(struct svc_fh *fhp)
+{
+	struct dentry * dentry = fhp->fh_dentry;
+	struct svc_export * exp = fhp->fh_export;
+	if (dentry) {
+		fh_unlock(fhp);
+		fhp->fh_dentry = NULL;
+		dput(dentry);
+#ifdef CONFIG_NFSD_V3
+		fhp->fh_pre_saved = 0;
+		fhp->fh_post_saved = 0;
+#endif
+	}
+	fh_drop_write(fhp);
+	if (exp) {
+		exp_put(exp);
+		fhp->fh_export = NULL;
+	}
+	return;
+}
+
+/*
+ * Shorthand for dprintk()'s
+ */
+char * SVCFH_fmt(struct svc_fh *fhp)
+{
+	struct knfsd_fh *fh = &fhp->fh_handle;
+
+	static char buf[80];
+	sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x",
+		fh->fh_size,
+		fh->fh_base.fh_pad[0],
+		fh->fh_base.fh_pad[1],
+		fh->fh_base.fh_pad[2],
+		fh->fh_base.fh_pad[3],
+		fh->fh_base.fh_pad[4],
+		fh->fh_base.fh_pad[5]);
+	return buf;
+}
+
+enum fsid_source fsid_source(struct svc_fh *fhp)
+{
+	if (fhp->fh_handle.fh_version != 1)
+		return FSIDSOURCE_DEV;
+	switch(fhp->fh_handle.fh_fsid_type) {
+	case FSID_DEV:
+	case FSID_ENCODE_DEV:
+	case FSID_MAJOR_MINOR:
+		if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV)
+			return FSIDSOURCE_DEV;
+		break;
+	case FSID_NUM:
+		if (fhp->fh_export->ex_flags & NFSEXP_FSID)
+			return FSIDSOURCE_FSID;
+		break;
+	default:
+		break;
+	}
+	/* either a UUID type filehandle, or the filehandle doesn't
+	 * match the export.
+	 */
+	if (fhp->fh_export->ex_flags & NFSEXP_FSID)
+		return FSIDSOURCE_FSID;
+	if (fhp->fh_export->ex_uuid)
+		return FSIDSOURCE_UUID;
+	return FSIDSOURCE_DEV;
+}
diff --git a/kernel/fs/nfsd/nfsfh.h b/kernel/fs/nfsd/nfsfh.h
new file mode 100644
index 000000000..1e90dad49
--- /dev/null
+++ b/kernel/fs/nfsd/nfsfh.h
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ *
+ * This file describes the layout of the file handles as passed
+ * over the wire.
+ */
+#ifndef _LINUX_NFSD_NFSFH_H
+#define _LINUX_NFSD_NFSFH_H
+
+#include <linux/sunrpc/svc.h>
+#include <uapi/linux/nfsd/nfsfh.h>
+
+static inline __u32 ino_t_to_u32(ino_t ino)
+{
+	return (__u32) ino;
+}
+
+static inline ino_t u32_to_ino_t(__u32 uino)
+{
+	return (ino_t) uino;
+}
+
+/*
+ * This is the internal representation of an NFS handle used in knfsd.
+ * pre_mtime/post_version will be used to support wcc_attr's in NFSv3.
+ */
+typedef struct svc_fh {
+	struct knfsd_fh		fh_handle;	/* FH data */
+	struct dentry *		fh_dentry;	/* validated dentry */
+	struct svc_export *	fh_export;	/* export pointer */
+	int			fh_maxsize;	/* max size for fh_handle */
+
+	unsigned char		fh_locked;	/* inode locked by us */
+	unsigned char		fh_want_write;	/* remount protection taken */
+
+#ifdef CONFIG_NFSD_V3
+	unsigned char		fh_post_saved;	/* post-op attrs saved */
+	unsigned char		fh_pre_saved;	/* pre-op attrs saved */
+
+	/* Pre-op attributes saved during fh_lock */
+	__u64			fh_pre_size;	/* size before operation */
+	struct timespec		fh_pre_mtime;	/* mtime before oper */
+	struct timespec		fh_pre_ctime;	/* ctime before oper */
+	/*
+	 * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode)
+	 *  to find out if it is valid.
+	 */
+	u64			fh_pre_change;
+
+	/* Post-op attributes saved in fh_unlock */
+	struct kstat		fh_post_attr;	/* full attrs after operation */
+	u64			fh_post_change; /* nfsv4 change; see above */
+#endif /* CONFIG_NFSD_V3 */
+
+} svc_fh;
+
+enum nfsd_fsid {
+	FSID_DEV = 0,
+	FSID_NUM,
+	FSID_MAJOR_MINOR,
+	FSID_ENCODE_DEV,
+	FSID_UUID4_INUM,
+	FSID_UUID8,
+	FSID_UUID16,
+	FSID_UUID16_INUM,
+};
+
+enum fsid_source {
+	FSIDSOURCE_DEV,
+	FSIDSOURCE_FSID,
+	FSIDSOURCE_UUID,
+};
+extern enum fsid_source fsid_source(struct svc_fh *fhp);
+
+
+/*
+ * This might look a little large to "inline" but in all calls except
+ * one, 'vers' is constant so moste of the function disappears.
+ *
+ * In some cases the values are considered to be host endian and in
+ * others, net endian. fsidv is always considered to be u32 as the
+ * callers don't know which it will be. So we must use __force to keep
+ * sparse from complaining. Since these values are opaque to the
+ * client, that shouldn't be a problem.
+ */
+static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
+			   u32 fsid, unsigned char *uuid)
+{
+	u32 *up;
+	switch(vers) {
+	case FSID_DEV:
+		fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) |
+				 MINOR(dev));
+		fsidv[1] = ino_t_to_u32(ino);
+		break;
+	case FSID_NUM:
+		fsidv[0] = fsid;
+		break;
+	case FSID_MAJOR_MINOR:
+		fsidv[0] = (__force __u32)htonl(MAJOR(dev));
+		fsidv[1] = (__force __u32)htonl(MINOR(dev));
+		fsidv[2] = ino_t_to_u32(ino);
+		break;
+
+	case FSID_ENCODE_DEV:
+		fsidv[0] = new_encode_dev(dev);
+		fsidv[1] = ino_t_to_u32(ino);
+		break;
+
+	case FSID_UUID4_INUM:
+		/* 4 byte fsid and inode number */
+		up = (u32*)uuid;
+		fsidv[0] = ino_t_to_u32(ino);
+		fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3];
+		break;
+
+	case FSID_UUID8:
+		/* 8 byte fsid  */
+		up = (u32*)uuid;
+		fsidv[0] = up[0] ^ up[2];
+		fsidv[1] = up[1] ^ up[3];
+		break;
+
+	case FSID_UUID16:
+		/* 16 byte fsid - NFSv3+ only */
+		memcpy(fsidv, uuid, 16);
+		break;
+
+	case FSID_UUID16_INUM:
+		/* 8 byte inode and 16 byte fsid */
+		*(u64*)fsidv = (u64)ino;
+		memcpy(fsidv+2, uuid, 16);
+		break;
+	default: BUG();
+	}
+}
+
+static inline int key_len(int type)
+{
+	switch(type) {
+	case FSID_DEV:		return 8;
+	case FSID_NUM: 		return 4;
+	case FSID_MAJOR_MINOR:	return 12;
+	case FSID_ENCODE_DEV:	return 8;
+	case FSID_UUID4_INUM:	return 8;
+	case FSID_UUID8:	return 8;
+	case FSID_UUID16:	return 16;
+	case FSID_UUID16_INUM:	return 24;
+	default: return 0;
+	}
+}
+
+/*
+ * Shorthand for dprintk()'s
+ */
+extern char * SVCFH_fmt(struct svc_fh *fhp);
+
+/*
+ * Function prototypes
+ */
+__be32	fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int);
+__be32	fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
+__be32	fh_update(struct svc_fh *);
+void	fh_put(struct svc_fh *);
+
+static __inline__ struct svc_fh *
+fh_copy(struct svc_fh *dst, struct svc_fh *src)
+{
+	WARN_ON(src->fh_dentry || src->fh_locked);
+			
+	*dst = *src;
+	return dst;
+}
+
+static inline void
+fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
+{
+	dst->fh_size = src->fh_size;
+	memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
+}
+
+static __inline__ struct svc_fh *
+fh_init(struct svc_fh *fhp, int maxsize)
+{
+	memset(fhp, 0, sizeof(*fhp));
+	fhp->fh_maxsize = maxsize;
+	return fhp;
+}
+
+static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+	if (fh1->fh_size != fh2->fh_size)
+		return false;
+	if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
+		return false;
+	return true;
+}
+
+static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+	if (fh1->fh_fsid_type != fh2->fh_fsid_type)
+		return false;
+	if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
+		return false;
+	return true;
+}
+
+#ifdef CONFIG_NFSD_V3
+/*
+ * The wcc data stored in current_fh should be cleared
+ * between compound ops.
+ */
+static inline void
+fh_clear_wcc(struct svc_fh *fhp)
+{
+	fhp->fh_post_saved = 0;
+	fhp->fh_pre_saved = 0;
+}
+
+/*
+ * Fill in the pre_op attr for the wcc data
+ */
+static inline void
+fill_pre_wcc(struct svc_fh *fhp)
+{
+	struct inode    *inode;
+
+	inode = d_inode(fhp->fh_dentry);
+	if (!fhp->fh_pre_saved) {
+		fhp->fh_pre_mtime = inode->i_mtime;
+		fhp->fh_pre_ctime = inode->i_ctime;
+		fhp->fh_pre_size  = inode->i_size;
+		fhp->fh_pre_change = inode->i_version;
+		fhp->fh_pre_saved = 1;
+	}
+}
+
+extern void fill_post_wcc(struct svc_fh *);
+#else
+#define fh_clear_wcc(ignored)
+#define fill_pre_wcc(ignored)
+#define fill_post_wcc(notused)
+#endif /* CONFIG_NFSD_V3 */
+
+
+/*
+ * Lock a file handle/inode
+ * NOTE: both fh_lock and fh_unlock are done "by hand" in
+ * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once
+ * so, any changes here should be reflected there.
+ */
+
+static inline void
+fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
+{
+	struct dentry	*dentry = fhp->fh_dentry;
+	struct inode	*inode;
+
+	BUG_ON(!dentry);
+
+	if (fhp->fh_locked) {
+		printk(KERN_WARNING "fh_lock: %pd2 already locked!\n",
+			dentry);
+		return;
+	}
+
+	inode = d_inode(dentry);
+	mutex_lock_nested(&inode->i_mutex, subclass);
+	fill_pre_wcc(fhp);
+	fhp->fh_locked = 1;
+}
+
+static inline void
+fh_lock(struct svc_fh *fhp)
+{
+	fh_lock_nested(fhp, I_MUTEX_NORMAL);
+}
+
+/*
+ * Unlock a file handle/inode
+ */
+static inline void
+fh_unlock(struct svc_fh *fhp)
+{
+	if (fhp->fh_locked) {
+		fill_post_wcc(fhp);
+		mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
+		fhp->fh_locked = 0;
+	}
+}
+
+#endif /* _LINUX_NFSD_NFSFH_H */
diff --git a/kernel/fs/nfsd/nfsproc.c b/kernel/fs/nfsd/nfsproc.c
new file mode 100644
index 000000000..aecbcd34d
--- /dev/null
+++ b/kernel/fs/nfsd/nfsproc.c
@@ -0,0 +1,759 @@
+/*
+ * Process version 2 NFS requests.
+ *
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/namei.h>
+
+#include "cache.h"
+#include "xdr.h"
+#include "vfs.h"
+
+typedef struct svc_rqst	svc_rqst;
+typedef struct svc_buf	svc_buf;
+
+#define NFSDDBG_FACILITY		NFSDDBG_PROC
+
+
+static __be32
+nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	return nfs_ok;
+}
+
+static __be32
+nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)
+{
+	if (err) return err;
+	return fh_getattr(&resp->fh, &resp->stat);
+}
+static __be32
+nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)
+{
+	if (err) return err;
+	return fh_getattr(&resp->fh, &resp->stat);
+}
+/*
+ * Get a file's attributes
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
+					  struct nfsd_attrstat *resp)
+{
+	__be32 nfserr;
+	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+	return nfsd_return_attrs(nfserr, resp);
+}
+
+/*
+ * Set a file's attributes
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
+					  struct nfsd_attrstat  *resp)
+{
+	__be32 nfserr;
+	dprintk("nfsd: SETATTR  %s, valid=%x, size=%ld\n",
+		SVCFH_fmt(&argp->fh),
+		argp->attrs.ia_valid, (long) argp->attrs.ia_size);
+
+	fh_copy(&resp->fh, &argp->fh);
+	nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0);
+	return nfsd_return_attrs(nfserr, resp);
+}
+
+/*
+ * Look up a path name component
+ * Note: the dentry in the resp->fh may be negative if the file
+ * doesn't exist yet.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
+					 struct nfsd_diropres  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: LOOKUP   %s %.*s\n",
+		SVCFH_fmt(&argp->fh), argp->len, argp->name);
+
+	fh_init(&resp->fh, NFS_FHSIZE);
+	nfserr = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len,
+				 &resp->fh);
+
+	fh_put(&argp->fh);
+	return nfsd_return_dirop(nfserr, resp);
+}
+
+/*
+ * Read a symlink.
+ */
+static __be32
+nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
+					   struct nfsd_readlinkres *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
+
+	/* Read the symlink. */
+	resp->len = NFS_MAXPATHLEN;
+	nfserr = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len);
+
+	fh_put(&argp->fh);
+	return nfserr;
+}
+
+/*
+ * Read a portion of a file.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
+				       struct nfsd_readres  *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: READ    %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->count, argp->offset);
+
+	/* Obtain buffer pointer for payload. 19 is 1 word for
+	 * status, 17 words for fattr, and 1 word for the byte count.
+	 */
+
+	if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
+		char buf[RPC_MAX_ADDRBUFLEN];
+		printk(KERN_NOTICE
+			"oversized read request from %s (%d bytes)\n",
+				svc_print_addr(rqstp, buf, sizeof(buf)),
+				argp->count);
+		argp->count = NFSSVC_MAXBLKSIZE_V2;
+	}
+	svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
+
+	resp->count = argp->count;
+	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
+				  argp->offset,
+			   	  rqstp->rq_vec, argp->vlen,
+				  &resp->count);
+
+	if (nfserr) return nfserr;
+	return fh_getattr(&resp->fh, &resp->stat);
+}
+
+/*
+ * Write data to a file
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
+					struct nfsd_attrstat  *resp)
+{
+	__be32	nfserr;
+	int	stable = 1;
+	unsigned long cnt = argp->len;
+
+	dprintk("nfsd: WRITE    %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->len, argp->offset);
+
+	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
+				   argp->offset,
+				   rqstp->rq_vec, argp->vlen,
+			           &cnt,
+				   &stable);
+	return nfsd_return_attrs(nfserr, resp);
+}
+
+/*
+ * CREATE processing is complicated. The keyword here is `overloaded.'
+ * The parent directory is kept locked between the check for existence
+ * and the actual create() call in compliance with VFS protocols.
+ * N.B. After this call _both_ argp->fh and resp->fh need an fh_put
+ */
+static __be32
+nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
+					 struct nfsd_diropres   *resp)
+{
+	svc_fh		*dirfhp = &argp->fh;
+	svc_fh		*newfhp = &resp->fh;
+	struct iattr	*attr = &argp->attrs;
+	struct inode	*inode;
+	struct dentry	*dchild;
+	int		type, mode;
+	__be32		nfserr;
+	int		hosterr;
+	dev_t		rdev = 0, wanted = new_decode_dev(attr->ia_size);
+
+	dprintk("nfsd: CREATE   %s %.*s\n",
+		SVCFH_fmt(dirfhp), argp->len, argp->name);
+
+	/* First verify the parent file handle */
+	nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
+	if (nfserr)
+		goto done; /* must fh_put dirfhp even on error */
+
+	/* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
+
+	nfserr = nfserr_acces;
+	if (!argp->len)
+		goto done;
+	nfserr = nfserr_exist;
+	if (isdotent(argp->name, argp->len))
+		goto done;
+	hosterr = fh_want_write(dirfhp);
+	if (hosterr) {
+		nfserr = nfserrno(hosterr);
+		goto done;
+	}
+
+	fh_lock_nested(dirfhp, I_MUTEX_PARENT);
+	dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
+	if (IS_ERR(dchild)) {
+		nfserr = nfserrno(PTR_ERR(dchild));
+		goto out_unlock;
+	}
+	fh_init(newfhp, NFS_FHSIZE);
+	nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp);
+	if (!nfserr && d_really_is_negative(dchild))
+		nfserr = nfserr_noent;
+	dput(dchild);
+	if (nfserr) {
+		if (nfserr != nfserr_noent)
+			goto out_unlock;
+		/*
+		 * If the new file handle wasn't verified, we can't tell
+		 * whether the file exists or not. Time to bail ...
+		 */
+		nfserr = nfserr_acces;
+		if (!newfhp->fh_dentry) {
+			printk(KERN_WARNING 
+				"nfsd_proc_create: file handle not verified\n");
+			goto out_unlock;
+		}
+	}
+
+	inode = d_inode(newfhp->fh_dentry);
+
+	/* Unfudge the mode bits */
+	if (attr->ia_valid & ATTR_MODE) {
+		type = attr->ia_mode & S_IFMT;
+		mode = attr->ia_mode & ~S_IFMT;
+		if (!type) {
+			/* no type, so if target exists, assume same as that,
+			 * else assume a file */
+			if (inode) {
+				type = inode->i_mode & S_IFMT;
+				switch(type) {
+				case S_IFCHR:
+				case S_IFBLK:
+					/* reserve rdev for later checking */
+					rdev = inode->i_rdev;
+					attr->ia_valid |= ATTR_SIZE;
+
+					/* FALLTHROUGH */
+				case S_IFIFO:
+					/* this is probably a permission check..
+					 * at least IRIX implements perm checking on
+					 *   echo thing > device-special-file-or-pipe
+					 * by doing a CREATE with type==0
+					 */
+					nfserr = nfsd_permission(rqstp,
+								 newfhp->fh_export,
+								 newfhp->fh_dentry,
+								 NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS);
+					if (nfserr && nfserr != nfserr_rofs)
+						goto out_unlock;
+				}
+			} else
+				type = S_IFREG;
+		}
+	} else if (inode) {
+		type = inode->i_mode & S_IFMT;
+		mode = inode->i_mode & ~S_IFMT;
+	} else {
+		type = S_IFREG;
+		mode = 0;	/* ??? */
+	}
+
+	attr->ia_valid |= ATTR_MODE;
+	attr->ia_mode = mode;
+
+	/* Special treatment for non-regular files according to the
+	 * gospel of sun micro
+	 */
+	if (type != S_IFREG) {
+		if (type != S_IFBLK && type != S_IFCHR) {
+			rdev = 0;
+		} else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) {
+			/* If you think you've seen the worst, grok this. */
+			type = S_IFIFO;
+		} else {
+			/* Okay, char or block special */
+			if (!rdev)
+				rdev = wanted;
+		}
+
+		/* we've used the SIZE information, so discard it */
+		attr->ia_valid &= ~ATTR_SIZE;
+
+		/* Make sure the type and device matches */
+		nfserr = nfserr_exist;
+		if (inode && type != (inode->i_mode & S_IFMT))
+			goto out_unlock;
+	}
+
+	nfserr = 0;
+	if (!inode) {
+		/* File doesn't exist. Create it and set attrs */
+		nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len,
+					attr, type, rdev, newfhp);
+	} else if (type == S_IFREG) {
+		dprintk("nfsd:   existing %s, valid=%x, size=%ld\n",
+			argp->name, attr->ia_valid, (long) attr->ia_size);
+		/* File already exists. We ignore all attributes except
+		 * size, so that creat() behaves exactly like
+		 * open(..., O_CREAT|O_TRUNC|O_WRONLY).
+		 */
+		attr->ia_valid &= ATTR_SIZE;
+		if (attr->ia_valid)
+			nfserr = nfsd_setattr(rqstp, newfhp, attr, 0, (time_t)0);
+	}
+
+out_unlock:
+	/* We don't really need to unlock, as fh_put does it. */
+	fh_unlock(dirfhp);
+	fh_drop_write(dirfhp);
+done:
+	fh_put(dirfhp);
+	return nfsd_return_dirop(nfserr, resp);
+}
+
+static __be32
+nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
+					 void		       *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: REMOVE   %s %.*s\n", SVCFH_fmt(&argp->fh),
+		argp->len, argp->name);
+
+	/* Unlink. -SIFDIR means file must not be a directory */
+	nfserr = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len);
+	fh_put(&argp->fh);
+	return nfserr;
+}
+
+static __be32
+nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
+				  	 void		        *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: RENAME   %s %.*s -> \n",
+		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
+	dprintk("nfsd:        ->  %s %.*s\n",
+		SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname);
+
+	nfserr = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen,
+				    &argp->tfh, argp->tname, argp->tlen);
+	fh_put(&argp->ffh);
+	fh_put(&argp->tfh);
+	return nfserr;
+}
+
+static __be32
+nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
+				void			    *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: LINK     %s ->\n",
+		SVCFH_fmt(&argp->ffh));
+	dprintk("nfsd:    %s %.*s\n",
+		SVCFH_fmt(&argp->tfh),
+		argp->tlen,
+		argp->tname);
+
+	nfserr = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen,
+				  &argp->ffh);
+	fh_put(&argp->ffh);
+	fh_put(&argp->tfh);
+	return nfserr;
+}
+
+static __be32
+nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
+				          void			  *resp)
+{
+	struct svc_fh	newfh;
+	__be32		nfserr;
+
+	dprintk("nfsd: SYMLINK  %s %.*s -> %.*s\n",
+		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
+		argp->tlen, argp->tname);
+
+	fh_init(&newfh, NFS_FHSIZE);
+	/*
+	 * Crazy hack: the request fits in a page, and already-decoded
+	 * attributes follow argp->tname, so it's safe to just write a
+	 * null to ensure it's null-terminated:
+	 */
+	argp->tname[argp->tlen] = '\0';
+	nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
+						 argp->tname, &newfh);
+
+	fh_put(&argp->ffh);
+	fh_put(&newfh);
+	return nfserr;
+}
+
+/*
+ * Make directory. This operation is not idempotent.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
+					struct nfsd_diropres   *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: MKDIR    %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
+
+	if (resp->fh.fh_dentry) {
+		printk(KERN_WARNING
+			"nfsd_proc_mkdir: response already verified??\n");
+	}
+
+	argp->attrs.ia_valid &= ~ATTR_SIZE;
+	fh_init(&resp->fh, NFS_FHSIZE);
+	nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len,
+				    &argp->attrs, S_IFDIR, 0, &resp->fh);
+	fh_put(&argp->fh);
+	return nfsd_return_dirop(nfserr, resp);
+}
+
+/*
+ * Remove a directory
+ */
+static __be32
+nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
+				 	void		      *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: RMDIR    %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
+
+	nfserr = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len);
+	fh_put(&argp->fh);
+	return nfserr;
+}
+
+/*
+ * Read a portion of a directory.
+ */
+static __be32
+nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
+					  struct nfsd_readdirres  *resp)
+{
+	int		count;
+	__be32		nfserr;
+	loff_t		offset;
+
+	dprintk("nfsd: READDIR  %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),		
+		argp->count, argp->cookie);
+
+	/* Shrink to the client read size */
+	count = (argp->count >> 2) - 2;
+
+	/* Make sure we've room for the NULL ptr & eof flag */
+	count -= 2;
+	if (count < 0)
+		count = 0;
+
+	resp->buffer = argp->buffer;
+	resp->offset = NULL;
+	resp->buflen = count;
+	resp->common.err = nfs_ok;
+	/* Read directory and encode entries on the fly */
+	offset = argp->cookie;
+	nfserr = nfsd_readdir(rqstp, &argp->fh, &offset, 
+			      &resp->common, nfssvc_encode_entry);
+
+	resp->count = resp->buffer - argp->buffer;
+	if (resp->offset)
+		*resp->offset = htonl(offset);
+
+	fh_put(&argp->fh);
+	return nfserr;
+}
+
+/*
+ * Get file system info
+ */
+static __be32
+nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle   *argp,
+					  struct nfsd_statfsres *resp)
+{
+	__be32	nfserr;
+
+	dprintk("nfsd: STATFS   %s\n", SVCFH_fmt(&argp->fh));
+
+	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
+			NFSD_MAY_BYPASS_GSS_ON_ROOT);
+	fh_put(&argp->fh);
+	return nfserr;
+}
+
+/*
+ * NFSv2 Server procedures.
+ * Only the results of non-idempotent operations are cached.
+ */
+struct nfsd_void { int dummy; };
+
+#define ST 1		/* status */
+#define FH 8		/* filehandle */
+#define	AT 18		/* attributes */
+
+static struct svc_procedure		nfsd_procedures2[18] = {
+	[NFSPROC_NULL] = {
+		.pc_func = (svc_procfunc) nfsd_proc_null,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_void,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_void,
+		.pc_argsize = sizeof(struct nfsd_void),
+		.pc_ressize = sizeof(struct nfsd_void),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST,
+	},
+	[NFSPROC_GETATTR] = {
+		.pc_func = (svc_procfunc) nfsd_proc_getattr,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_fhandle,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
+		.pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd_fhandle),
+		.pc_ressize = sizeof(struct nfsd_attrstat),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+AT,
+	},
+	[NFSPROC_SETATTR] = {
+		.pc_func = (svc_procfunc) nfsd_proc_setattr,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_sattrargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
+		.pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd_sattrargs),
+		.pc_ressize = sizeof(struct nfsd_attrstat),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+AT,
+	},
+	[NFSPROC_ROOT] = {
+		.pc_decode = (kxdrproc_t) nfssvc_decode_void,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_void,
+		.pc_argsize = sizeof(struct nfsd_void),
+		.pc_ressize = sizeof(struct nfsd_void),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST,
+	},
+	[NFSPROC_LOOKUP] = {
+		.pc_func = (svc_procfunc) nfsd_proc_lookup,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
+		.pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd_diropargs),
+		.pc_ressize = sizeof(struct nfsd_diropres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+FH+AT,
+	},
+	[NFSPROC_READLINK] = {
+		.pc_func = (svc_procfunc) nfsd_proc_readlink,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_readlinkargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_readlinkres,
+		.pc_argsize = sizeof(struct nfsd_readlinkargs),
+		.pc_ressize = sizeof(struct nfsd_readlinkres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
+	},
+	[NFSPROC_READ] = {
+		.pc_func = (svc_procfunc) nfsd_proc_read,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_readargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_readres,
+		.pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd_readargs),
+		.pc_ressize = sizeof(struct nfsd_readres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+	},
+	[NFSPROC_WRITECACHE] = {
+		.pc_decode = (kxdrproc_t) nfssvc_decode_void,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_void,
+		.pc_argsize = sizeof(struct nfsd_void),
+		.pc_ressize = sizeof(struct nfsd_void),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST,
+	},
+	[NFSPROC_WRITE] = {
+		.pc_func = (svc_procfunc) nfsd_proc_write,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_writeargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
+		.pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd_writeargs),
+		.pc_ressize = sizeof(struct nfsd_attrstat),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+AT,
+	},
+	[NFSPROC_CREATE] = {
+		.pc_func = (svc_procfunc) nfsd_proc_create,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_createargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
+		.pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd_createargs),
+		.pc_ressize = sizeof(struct nfsd_diropres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+FH+AT,
+	},
+	[NFSPROC_REMOVE] = {
+		.pc_func = (svc_procfunc) nfsd_proc_remove,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_void,
+		.pc_argsize = sizeof(struct nfsd_diropargs),
+		.pc_ressize = sizeof(struct nfsd_void),
+		.pc_cachetype = RC_REPLSTAT,
+		.pc_xdrressize = ST,
+	},
+	[NFSPROC_RENAME] = {
+		.pc_func = (svc_procfunc) nfsd_proc_rename,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_renameargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_void,
+		.pc_argsize = sizeof(struct nfsd_renameargs),
+		.pc_ressize = sizeof(struct nfsd_void),
+		.pc_cachetype = RC_REPLSTAT,
+		.pc_xdrressize = ST,
+	},
+	[NFSPROC_LINK] = {
+		.pc_func = (svc_procfunc) nfsd_proc_link,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_linkargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_void,
+		.pc_argsize = sizeof(struct nfsd_linkargs),
+		.pc_ressize = sizeof(struct nfsd_void),
+		.pc_cachetype = RC_REPLSTAT,
+		.pc_xdrressize = ST,
+	},
+	[NFSPROC_SYMLINK] = {
+		.pc_func = (svc_procfunc) nfsd_proc_symlink,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_symlinkargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_void,
+		.pc_argsize = sizeof(struct nfsd_symlinkargs),
+		.pc_ressize = sizeof(struct nfsd_void),
+		.pc_cachetype = RC_REPLSTAT,
+		.pc_xdrressize = ST,
+	},
+	[NFSPROC_MKDIR] = {
+		.pc_func = (svc_procfunc) nfsd_proc_mkdir,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_createargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
+		.pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+		.pc_argsize = sizeof(struct nfsd_createargs),
+		.pc_ressize = sizeof(struct nfsd_diropres),
+		.pc_cachetype = RC_REPLBUFF,
+		.pc_xdrressize = ST+FH+AT,
+	},
+	[NFSPROC_RMDIR] = {
+		.pc_func = (svc_procfunc) nfsd_proc_rmdir,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_void,
+		.pc_argsize = sizeof(struct nfsd_diropargs),
+		.pc_ressize = sizeof(struct nfsd_void),
+		.pc_cachetype = RC_REPLSTAT,
+		.pc_xdrressize = ST,
+	},
+	[NFSPROC_READDIR] = {
+		.pc_func = (svc_procfunc) nfsd_proc_readdir,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_readdirargs,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_readdirres,
+		.pc_argsize = sizeof(struct nfsd_readdirargs),
+		.pc_ressize = sizeof(struct nfsd_readdirres),
+		.pc_cachetype = RC_NOCACHE,
+	},
+	[NFSPROC_STATFS] = {
+		.pc_func = (svc_procfunc) nfsd_proc_statfs,
+		.pc_decode = (kxdrproc_t) nfssvc_decode_fhandle,
+		.pc_encode = (kxdrproc_t) nfssvc_encode_statfsres,
+		.pc_argsize = sizeof(struct nfsd_fhandle),
+		.pc_ressize = sizeof(struct nfsd_statfsres),
+		.pc_cachetype = RC_NOCACHE,
+		.pc_xdrressize = ST+5,
+	},
+};
+
+
+struct svc_version	nfsd_version2 = {
+		.vs_vers	= 2,
+		.vs_nproc	= 18,
+		.vs_proc	= nfsd_procedures2,
+		.vs_dispatch	= nfsd_dispatch,
+		.vs_xdrsize	= NFS2_SVC_XDRSIZE,
+};
+
+/*
+ * Map errnos to NFS errnos.
+ */
+__be32
+nfserrno (int errno)
+{
+	static struct {
+		__be32	nfserr;
+		int	syserr;
+	} nfs_errtbl[] = {
+		{ nfs_ok, 0 },
+		{ nfserr_perm, -EPERM },
+		{ nfserr_noent, -ENOENT },
+		{ nfserr_io, -EIO },
+		{ nfserr_nxio, -ENXIO },
+		{ nfserr_fbig, -E2BIG },
+		{ nfserr_acces, -EACCES },
+		{ nfserr_exist, -EEXIST },
+		{ nfserr_xdev, -EXDEV },
+		{ nfserr_mlink, -EMLINK },
+		{ nfserr_nodev, -ENODEV },
+		{ nfserr_notdir, -ENOTDIR },
+		{ nfserr_isdir, -EISDIR },
+		{ nfserr_inval, -EINVAL },
+		{ nfserr_fbig, -EFBIG },
+		{ nfserr_nospc, -ENOSPC },
+		{ nfserr_rofs, -EROFS },
+		{ nfserr_mlink, -EMLINK },
+		{ nfserr_nametoolong, -ENAMETOOLONG },
+		{ nfserr_notempty, -ENOTEMPTY },
+#ifdef EDQUOT
+		{ nfserr_dquot, -EDQUOT },
+#endif
+		{ nfserr_stale, -ESTALE },
+		{ nfserr_jukebox, -ETIMEDOUT },
+		{ nfserr_jukebox, -ERESTARTSYS },
+		{ nfserr_jukebox, -EAGAIN },
+		{ nfserr_jukebox, -EWOULDBLOCK },
+		{ nfserr_jukebox, -ENOMEM },
+		{ nfserr_io, -ETXTBSY },
+		{ nfserr_notsupp, -EOPNOTSUPP },
+		{ nfserr_toosmall, -ETOOSMALL },
+		{ nfserr_serverfault, -ESERVERFAULT },
+		{ nfserr_serverfault, -ENFILE },
+	};
+	int	i;
+
+	for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
+		if (nfs_errtbl[i].syserr == errno)
+			return nfs_errtbl[i].nfserr;
+	}
+	WARN(1, "nfsd: non-standard errno: %d\n", errno);
+	return nfserr_io;
+}
+
diff --git a/kernel/fs/nfsd/nfssvc.c b/kernel/fs/nfsd/nfssvc.c
new file mode 100644
index 000000000..9277cc91c
--- /dev/null
+++ b/kernel/fs/nfsd/nfssvc.c
@@ -0,0 +1,752 @@
+/*
+ * Central processing for nfsd.
+ *
+ * Authors:	Olaf Kirch (okir@monad.swb.de)
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/sched.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
+#include <linux/fs_struct.h>
+#include <linux/swap.h>
+
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/lockd/bind.h>
+#include <linux/nfsacl.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include "nfsd.h"
+#include "cache.h"
+#include "vfs.h"
+#include "netns.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_SVC
+
+extern struct svc_program	nfsd_program;
+static int			nfsd(void *vrqstp);
+
+/*
+ * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
+ * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
+ * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
+ *
+ * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
+ * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
+ * of nfsd threads must exist and each must listed in ->sp_all_threads in each
+ * entry of ->sv_pools[].
+ *
+ * Transitions of the thread count between zero and non-zero are of particular
+ * interest since the svc_serv needs to be created and initialized at that
+ * point, or freed.
+ *
+ * Finally, the nfsd_mutex also protects some of the global variables that are
+ * accessed when nfsd starts and that are settable via the write_* routines in
+ * nfsctl.c. In particular:
+ *
+ *	user_recovery_dirname
+ *	user_lease_time
+ *	nfsd_versions
+ */
+DEFINE_MUTEX(nfsd_mutex);
+
+/*
+ * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
+ * nfsd_drc_max_pages limits the total amount of memory available for
+ * version 4.1 DRC caches.
+ * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
+ */
+spinlock_t	nfsd_drc_lock;
+unsigned long	nfsd_drc_max_mem;
+unsigned long	nfsd_drc_mem_used;
+
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+static struct svc_stat	nfsd_acl_svcstats;
+static struct svc_version *	nfsd_acl_version[] = {
+	[2] = &nfsd_acl_version2,
+	[3] = &nfsd_acl_version3,
+};
+
+#define NFSD_ACL_MINVERS            2
+#define NFSD_ACL_NRVERS		ARRAY_SIZE(nfsd_acl_version)
+static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS];
+
+static struct svc_program	nfsd_acl_program = {
+	.pg_prog		= NFS_ACL_PROGRAM,
+	.pg_nvers		= NFSD_ACL_NRVERS,
+	.pg_vers		= nfsd_acl_versions,
+	.pg_name		= "nfsacl",
+	.pg_class		= "nfsd",
+	.pg_stats		= &nfsd_acl_svcstats,
+	.pg_authenticate	= &svc_set_client,
+};
+
+static struct svc_stat	nfsd_acl_svcstats = {
+	.program	= &nfsd_acl_program,
+};
+#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
+
+static struct svc_version *	nfsd_version[] = {
+	[2] = &nfsd_version2,
+#if defined(CONFIG_NFSD_V3)
+	[3] = &nfsd_version3,
+#endif
+#if defined(CONFIG_NFSD_V4)
+	[4] = &nfsd_version4,
+#endif
+};
+
+#define NFSD_MINVERS    	2
+#define NFSD_NRVERS		ARRAY_SIZE(nfsd_version)
+static struct svc_version *nfsd_versions[NFSD_NRVERS];
+
+struct svc_program		nfsd_program = {
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+	.pg_next		= &nfsd_acl_program,
+#endif
+	.pg_prog		= NFS_PROGRAM,		/* program number */
+	.pg_nvers		= NFSD_NRVERS,		/* nr of entries in nfsd_version */
+	.pg_vers		= nfsd_versions,	/* version table */
+	.pg_name		= "nfsd",		/* program name */
+	.pg_class		= "nfsd",		/* authentication class */
+	.pg_stats		= &nfsd_svcstats,	/* version table */
+	.pg_authenticate	= &svc_set_client,	/* export authentication */
+
+};
+
+static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
+	[0] = 1,
+	[1] = 1,
+	[2] = 1,
+};
+
+int nfsd_vers(int vers, enum vers_op change)
+{
+	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
+		return 0;
+	switch(change) {
+	case NFSD_SET:
+		nfsd_versions[vers] = nfsd_version[vers];
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+		if (vers < NFSD_ACL_NRVERS)
+			nfsd_acl_versions[vers] = nfsd_acl_version[vers];
+#endif
+		break;
+	case NFSD_CLEAR:
+		nfsd_versions[vers] = NULL;
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+		if (vers < NFSD_ACL_NRVERS)
+			nfsd_acl_versions[vers] = NULL;
+#endif
+		break;
+	case NFSD_TEST:
+		return nfsd_versions[vers] != NULL;
+	case NFSD_AVAIL:
+		return nfsd_version[vers] != NULL;
+	}
+	return 0;
+}
+
+int nfsd_minorversion(u32 minorversion, enum vers_op change)
+{
+	if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+		return -1;
+	switch(change) {
+	case NFSD_SET:
+		nfsd_supported_minorversions[minorversion] = true;
+		break;
+	case NFSD_CLEAR:
+		nfsd_supported_minorversions[minorversion] = false;
+		break;
+	case NFSD_TEST:
+		return nfsd_supported_minorversions[minorversion];
+	case NFSD_AVAIL:
+		return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
+	}
+	return 0;
+}
+
+/*
+ * Maximum number of nfsd processes
+ */
+#define	NFSD_MAXSERVS		8192
+
+int nfsd_nrthreads(struct net *net)
+{
+	int rv = 0;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	mutex_lock(&nfsd_mutex);
+	if (nn->nfsd_serv)
+		rv = nn->nfsd_serv->sv_nrthreads;
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+static int nfsd_init_socks(struct net *net)
+{
+	int error;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (!list_empty(&nn->nfsd_serv->sv_permsocks))
+		return 0;
+
+	error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	return 0;
+}
+
+static int nfsd_users = 0;
+
+static int nfsd_startup_generic(int nrservs)
+{
+	int ret;
+
+	if (nfsd_users++)
+		return 0;
+
+	/*
+	 * Readahead param cache - will no-op if it already exists.
+	 * (Note therefore results will be suboptimal if number of
+	 * threads is modified after nfsd start.)
+	 */
+	ret = nfsd_racache_init(2*nrservs);
+	if (ret)
+		goto dec_users;
+
+	ret = nfs4_state_start();
+	if (ret)
+		goto out_racache;
+	return 0;
+
+out_racache:
+	nfsd_racache_shutdown();
+dec_users:
+	nfsd_users--;
+	return ret;
+}
+
+static void nfsd_shutdown_generic(void)
+{
+	if (--nfsd_users)
+		return;
+
+	nfs4_state_shutdown();
+	nfsd_racache_shutdown();
+}
+
+static bool nfsd_needs_lockd(void)
+{
+#if defined(CONFIG_NFSD_V3)
+	return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL);
+#else
+	return (nfsd_versions[2] != NULL);
+#endif
+}
+
+static int nfsd_startup_net(int nrservs, struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int ret;
+
+	if (nn->nfsd_net_up)
+		return 0;
+
+	ret = nfsd_startup_generic(nrservs);
+	if (ret)
+		return ret;
+	ret = nfsd_init_socks(net);
+	if (ret)
+		goto out_socks;
+
+	if (nfsd_needs_lockd() && !nn->lockd_up) {
+		ret = lockd_up(net);
+		if (ret)
+			goto out_socks;
+		nn->lockd_up = 1;
+	}
+
+	ret = nfs4_state_start_net(net);
+	if (ret)
+		goto out_lockd;
+
+	nn->nfsd_net_up = true;
+	return 0;
+
+out_lockd:
+	if (nn->lockd_up) {
+		lockd_down(net);
+		nn->lockd_up = 0;
+	}
+out_socks:
+	nfsd_shutdown_generic();
+	return ret;
+}
+
+static void nfsd_shutdown_net(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	nfs4_state_shutdown_net(net);
+	if (nn->lockd_up) {
+		lockd_down(net);
+		nn->lockd_up = 0;
+	}
+	nn->nfsd_net_up = false;
+	nfsd_shutdown_generic();
+}
+
+static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	/*
+	 * write_ports can create the server without actually starting
+	 * any threads--if we get shut down before any threads are
+	 * started, then nfsd_last_thread will be run before any of this
+	 * other initialization has been done.
+	 */
+	if (!nn->nfsd_net_up)
+		return;
+	nfsd_shutdown_net(net);
+
+	svc_rpcb_cleanup(serv, net);
+
+	printk(KERN_WARNING "nfsd: last server has exited, flushing export "
+			    "cache\n");
+	nfsd_export_flush(net);
+}
+
+void nfsd_reset_versions(void)
+{
+	int found_one = 0;
+	int i;
+
+	for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) {
+		if (nfsd_program.pg_vers[i])
+			found_one = 1;
+	}
+
+	if (!found_one) {
+		for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++)
+			nfsd_program.pg_vers[i] = nfsd_version[i];
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+		for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++)
+			nfsd_acl_program.pg_vers[i] =
+				nfsd_acl_version[i];
+#endif
+	}
+}
+
+/*
+ * Each session guarantees a negotiated per slot memory cache for replies
+ * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated
+ * NFSv4.1 server might want to use more memory for a DRC than a machine
+ * with mutiple services.
+ *
+ * Impose a hard limit on the number of pages for the DRC which varies
+ * according to the machines free pages. This is of course only a default.
+ *
+ * For now this is a #defined shift which could be under admin control
+ * in the future.
+ */
+static void set_max_drc(void)
+{
+	#define NFSD_DRC_SIZE_SHIFT	10
+	nfsd_drc_max_mem = (nr_free_buffer_pages()
+					>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
+	nfsd_drc_mem_used = 0;
+	spin_lock_init(&nfsd_drc_lock);
+	dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
+}
+
+static int nfsd_get_default_max_blksize(void)
+{
+	struct sysinfo i;
+	unsigned long long target;
+	unsigned long ret;
+
+	si_meminfo(&i);
+	target = (i.totalram - i.totalhigh) << PAGE_SHIFT;
+	/*
+	 * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig
+	 * machines, but only uses 32K on 128M machines.  Bottom out at
+	 * 8K on 32M and smaller.  Of course, this is only a default.
+	 */
+	target >>= 12;
+
+	ret = NFSSVC_MAXBLKSIZE;
+	while (ret > target && ret >= 8*1024*2)
+		ret /= 2;
+	return ret;
+}
+
+int nfsd_create_serv(struct net *net)
+{
+	int error;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	WARN_ON(!mutex_is_locked(&nfsd_mutex));
+	if (nn->nfsd_serv) {
+		svc_get(nn->nfsd_serv);
+		return 0;
+	}
+	if (nfsd_max_blksize == 0)
+		nfsd_max_blksize = nfsd_get_default_max_blksize();
+	nfsd_reset_versions();
+	nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+				      nfsd_last_thread, nfsd, THIS_MODULE);
+	if (nn->nfsd_serv == NULL)
+		return -ENOMEM;
+
+	nn->nfsd_serv->sv_maxconn = nn->max_connections;
+	error = svc_bind(nn->nfsd_serv, net);
+	if (error < 0) {
+		svc_destroy(nn->nfsd_serv);
+		return error;
+	}
+
+	set_max_drc();
+	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */
+	return 0;
+}
+
+int nfsd_nrpools(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (nn->nfsd_serv == NULL)
+		return 0;
+	else
+		return nn->nfsd_serv->sv_nrpools;
+}
+
+int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
+{
+	int i = 0;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (nn->nfsd_serv != NULL) {
+		for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
+			nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
+	}
+
+	return 0;
+}
+
+void nfsd_destroy(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int destroy = (nn->nfsd_serv->sv_nrthreads == 1);
+
+	if (destroy)
+		svc_shutdown_net(nn->nfsd_serv, net);
+	svc_destroy(nn->nfsd_serv);
+	if (destroy)
+		nn->nfsd_serv = NULL;
+}
+
+int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
+{
+	int i = 0;
+	int tot = 0;
+	int err = 0;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
+	if (nn->nfsd_serv == NULL || n <= 0)
+		return 0;
+
+	if (n > nn->nfsd_serv->sv_nrpools)
+		n = nn->nfsd_serv->sv_nrpools;
+
+	/* enforce a global maximum number of threads */
+	tot = 0;
+	for (i = 0; i < n; i++) {
+		nthreads[i] = min(nthreads[i], NFSD_MAXSERVS);
+		tot += nthreads[i];
+	}
+	if (tot > NFSD_MAXSERVS) {
+		/* total too large: scale down requested numbers */
+		for (i = 0; i < n && tot > 0; i++) {
+		    	int new = nthreads[i] * NFSD_MAXSERVS / tot;
+			tot -= (nthreads[i] - new);
+			nthreads[i] = new;
+		}
+		for (i = 0; i < n && tot > 0; i++) {
+			nthreads[i]--;
+			tot--;
+		}
+	}
+
+	/*
+	 * There must always be a thread in pool 0; the admin
+	 * can't shut down NFS completely using pool_threads.
+	 */
+	if (nthreads[0] == 0)
+		nthreads[0] = 1;
+
+	/* apply the new numbers */
+	svc_get(nn->nfsd_serv);
+	for (i = 0; i < n; i++) {
+		err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
+				    	  nthreads[i]);
+		if (err)
+			break;
+	}
+	nfsd_destroy(net);
+	return err;
+}
+
+/*
+ * Adjust the number of threads and return the new number of threads.
+ * This is also the function that starts the server if necessary, if
+ * this is the first time nrservs is nonzero.
+ */
+int
+nfsd_svc(int nrservs, struct net *net)
+{
+	int	error;
+	bool	nfsd_up_before;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	mutex_lock(&nfsd_mutex);
+	dprintk("nfsd: creating service\n");
+
+	nrservs = max(nrservs, 0);
+	nrservs = min(nrservs, NFSD_MAXSERVS);
+	error = 0;
+
+	if (nrservs == 0 && nn->nfsd_serv == NULL)
+		goto out;
+
+	error = nfsd_create_serv(net);
+	if (error)
+		goto out;
+
+	nfsd_up_before = nn->nfsd_net_up;
+
+	error = nfsd_startup_net(nrservs, net);
+	if (error)
+		goto out_destroy;
+	error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
+	if (error)
+		goto out_shutdown;
+	/* We are holding a reference to nn->nfsd_serv which
+	 * we don't want to count in the return value,
+	 * so subtract 1
+	 */
+	error = nn->nfsd_serv->sv_nrthreads - 1;
+out_shutdown:
+	if (error < 0 && !nfsd_up_before)
+		nfsd_shutdown_net(net);
+out_destroy:
+	nfsd_destroy(net);		/* Release server */
+out:
+	mutex_unlock(&nfsd_mutex);
+	return error;
+}
+
+
+/*
+ * This is the NFS server kernel thread
+ */
+static int
+nfsd(void *vrqstp)
+{
+	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
+	struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
+	struct net *net = perm_sock->xpt_net;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int err;
+
+	/* Lock module and set up kernel thread */
+	mutex_lock(&nfsd_mutex);
+
+	/* At this point, the thread shares current->fs
+	 * with the init process. We need to create files with a
+	 * umask of 0 instead of init's umask. */
+	if (unshare_fs_struct() < 0) {
+		printk("Unable to start nfsd thread: out of memory\n");
+		goto out;
+	}
+
+	current->fs->umask = 0;
+
+	/*
+	 * thread is spawned with all signals set to SIG_IGN, re-enable
+	 * the ones that will bring down the thread
+	 */
+	allow_signal(SIGKILL);
+	allow_signal(SIGHUP);
+	allow_signal(SIGINT);
+	allow_signal(SIGQUIT);
+
+	nfsdstats.th_cnt++;
+	mutex_unlock(&nfsd_mutex);
+
+	set_freezable();
+
+	/*
+	 * The main request loop
+	 */
+	for (;;) {
+		/* Update sv_maxconn if it has changed */
+		rqstp->rq_server->sv_maxconn = nn->max_connections;
+
+		/*
+		 * Find a socket with data available and call its
+		 * recvfrom routine.
+		 */
+		while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
+			;
+		if (err == -EINTR)
+			break;
+		validate_process_creds();
+		svc_process(rqstp);
+		validate_process_creds();
+	}
+
+	/* Clear signals before calling svc_exit_thread() */
+	flush_signals(current);
+
+	mutex_lock(&nfsd_mutex);
+	nfsdstats.th_cnt --;
+
+out:
+	rqstp->rq_server = NULL;
+
+	/* Release the thread */
+	svc_exit_thread(rqstp);
+
+	nfsd_destroy(net);
+
+	/* Release module */
+	mutex_unlock(&nfsd_mutex);
+	module_put_and_exit(0);
+	return 0;
+}
+
+static __be32 map_new_errors(u32 vers, __be32 nfserr)
+{
+	if (nfserr == nfserr_jukebox && vers == 2)
+		return nfserr_dropit;
+	if (nfserr == nfserr_wrongsec && vers < 4)
+		return nfserr_acces;
+	return nfserr;
+}
+
+int
+nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+{
+	struct svc_procedure	*proc;
+	kxdrproc_t		xdr;
+	__be32			nfserr;
+	__be32			*nfserrp;
+
+	dprintk("nfsd_dispatch: vers %d proc %d\n",
+				rqstp->rq_vers, rqstp->rq_proc);
+	proc = rqstp->rq_procinfo;
+
+	/*
+	 * Give the xdr decoder a chance to change this if it wants
+	 * (necessary in the NFSv4.0 compound case)
+	 */
+	rqstp->rq_cachetype = proc->pc_cachetype;
+	/* Decode arguments */
+	xdr = proc->pc_decode;
+	if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base,
+			rqstp->rq_argp)) {
+		dprintk("nfsd: failed to decode arguments!\n");
+		*statp = rpc_garbage_args;
+		return 1;
+	}
+
+	/* Check whether we have this call in the cache. */
+	switch (nfsd_cache_lookup(rqstp)) {
+	case RC_DROPIT:
+		return 0;
+	case RC_REPLY:
+		return 1;
+	case RC_DOIT:;
+		/* do it */
+	}
+
+	/* need to grab the location to store the status, as
+	 * nfsv4 does some encoding while processing 
+	 */
+	nfserrp = rqstp->rq_res.head[0].iov_base
+		+ rqstp->rq_res.head[0].iov_len;
+	rqstp->rq_res.head[0].iov_len += sizeof(__be32);
+
+	/* Now call the procedure handler, and encode NFS status. */
+	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
+	nfserr = map_new_errors(rqstp->rq_vers, nfserr);
+	if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+		dprintk("nfsd: Dropping request; may be revisited later\n");
+		nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+		return 0;
+	}
+
+	if (rqstp->rq_proc != 0)
+		*nfserrp++ = nfserr;
+
+	/* Encode result.
+	 * For NFSv2, additional info is never returned in case of an error.
+	 */
+	if (!(nfserr && rqstp->rq_vers == 2)) {
+		xdr = proc->pc_encode;
+		if (xdr && !xdr(rqstp, nfserrp,
+				rqstp->rq_resp)) {
+			/* Failed to encode result. Release cache entry */
+			dprintk("nfsd: failed to encode result!\n");
+			nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+			*statp = rpc_system_err;
+			return 1;
+		}
+	}
+
+	/* Store reply in cache. */
+	nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
+	return 1;
+}
+
+int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
+
+	mutex_lock(&nfsd_mutex);
+	if (nn->nfsd_serv == NULL) {
+		mutex_unlock(&nfsd_mutex);
+		return -ENODEV;
+	}
+	/* bump up the psudo refcount while traversing */
+	svc_get(nn->nfsd_serv);
+	ret = svc_pool_stats_open(nn->nfsd_serv, file);
+	mutex_unlock(&nfsd_mutex);
+	return ret;
+}
+
+int nfsd_pool_stats_release(struct inode *inode, struct file *file)
+{
+	int ret = seq_release(inode, file);
+	struct net *net = inode->i_sb->s_fs_info;
+
+	mutex_lock(&nfsd_mutex);
+	/* this function really, really should have been called svc_put() */
+	nfsd_destroy(net);
+	mutex_unlock(&nfsd_mutex);
+	return ret;
+}
diff --git a/kernel/fs/nfsd/nfsxdr.c b/kernel/fs/nfsd/nfsxdr.c
new file mode 100644
index 000000000..79d964aa8
--- /dev/null
+++ b/kernel/fs/nfsd/nfsxdr.c
@@ -0,0 +1,550 @@
+/*
+ * XDR support for nfsd
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include "vfs.h"
+#include "xdr.h"
+#include "auth.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_XDR
+
+/*
+ * Mapping of S_IF* types to NFS file types
+ */
+static u32	nfs_ftypes[] = {
+	NFNON,  NFCHR,  NFCHR, NFBAD,
+	NFDIR,  NFBAD,  NFBLK, NFBAD,
+	NFREG,  NFBAD,  NFLNK, NFBAD,
+	NFSOCK, NFBAD,  NFLNK, NFBAD,
+};
+
+
+/*
+ * XDR functions for basic NFS types
+ */
+static __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	fh_init(fhp, NFS_FHSIZE);
+	memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
+	fhp->fh_handle.fh_size = NFS_FHSIZE;
+
+	/* FIXME: Look up export pointer here and verify
+	 * Sun Secure RPC if requested */
+	return p + (NFS_FHSIZE >> 2);
+}
+
+/* Helper function for NFSv2 ACL code */
+__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	return decode_fh(p, fhp);
+}
+
+static __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
+	return p + (NFS_FHSIZE>> 2);
+}
+
+/*
+ * Decode a file name and make sure that the path contains
+ * no slashes or null bytes.
+ */
+static __be32 *
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
+{
+	char		*name;
+	unsigned int	i;
+
+	if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
+		for (i = 0, name = *namp; i < *lenp; i++, name++) {
+			if (*name == '\0' || *name == '/')
+				return NULL;
+		}
+	}
+
+	return p;
+}
+
+static __be32 *
+decode_pathname(__be32 *p, char **namp, unsigned int *lenp)
+{
+	char		*name;
+	unsigned int	i;
+
+	if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) {
+		for (i = 0, name = *namp; i < *lenp; i++, name++) {
+			if (*name == '\0')
+				return NULL;
+		}
+	}
+
+	return p;
+}
+
+static __be32 *
+decode_sattr(__be32 *p, struct iattr *iap)
+{
+	u32	tmp, tmp1;
+
+	iap->ia_valid = 0;
+
+	/* Sun client bug compatibility check: some sun clients seem to
+	 * put 0xffff in the mode field when they mean 0xffffffff.
+	 * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah.
+	 */
+	if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) {
+		iap->ia_valid |= ATTR_MODE;
+		iap->ia_mode = tmp;
+	}
+	if ((tmp = ntohl(*p++)) != (u32)-1) {
+		iap->ia_uid = make_kuid(&init_user_ns, tmp);
+		if (uid_valid(iap->ia_uid))
+			iap->ia_valid |= ATTR_UID;
+	}
+	if ((tmp = ntohl(*p++)) != (u32)-1) {
+		iap->ia_gid = make_kgid(&init_user_ns, tmp);
+		if (gid_valid(iap->ia_gid))
+			iap->ia_valid |= ATTR_GID;
+	}
+	if ((tmp = ntohl(*p++)) != (u32)-1) {
+		iap->ia_valid |= ATTR_SIZE;
+		iap->ia_size = tmp;
+	}
+	tmp  = ntohl(*p++); tmp1 = ntohl(*p++);
+	if (tmp != (u32)-1 && tmp1 != (u32)-1) {
+		iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
+		iap->ia_atime.tv_sec = tmp;
+		iap->ia_atime.tv_nsec = tmp1 * 1000; 
+	}
+	tmp  = ntohl(*p++); tmp1 = ntohl(*p++);
+	if (tmp != (u32)-1 && tmp1 != (u32)-1) {
+		iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
+		iap->ia_mtime.tv_sec = tmp;
+		iap->ia_mtime.tv_nsec = tmp1 * 1000; 
+		/*
+		 * Passing the invalid value useconds=1000000 for mtime
+		 * is a Sun convention for "set both mtime and atime to
+		 * current server time".  It's needed to make permissions
+		 * checks for the "touch" program across v2 mounts to
+		 * Solaris and Irix boxes work correctly. See description of
+		 * sattr in section 6.1 of "NFS Illustrated" by
+		 * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
+		 */
+		if (tmp1 == 1000000)
+			iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET);
+	}
+	return p;
+}
+
+static __be32 *
+encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
+	     struct kstat *stat)
+{
+	struct dentry	*dentry = fhp->fh_dentry;
+	int type;
+	struct timespec time;
+	u32 f;
+
+	type = (stat->mode & S_IFMT);
+
+	*p++ = htonl(nfs_ftypes[type >> 12]);
+	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
+	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
+
+	if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
+		*p++ = htonl(NFS_MAXPATHLEN);
+	} else {
+		*p++ = htonl((u32) stat->size);
+	}
+	*p++ = htonl((u32) stat->blksize);
+	if (S_ISCHR(type) || S_ISBLK(type))
+		*p++ = htonl(new_encode_dev(stat->rdev));
+	else
+		*p++ = htonl(0xffffffff);
+	*p++ = htonl((u32) stat->blocks);
+	switch (fsid_source(fhp)) {
+	default:
+	case FSIDSOURCE_DEV:
+		*p++ = htonl(new_encode_dev(stat->dev));
+		break;
+	case FSIDSOURCE_FSID:
+		*p++ = htonl((u32) fhp->fh_export->ex_fsid);
+		break;
+	case FSIDSOURCE_UUID:
+		f = ((u32*)fhp->fh_export->ex_uuid)[0];
+		f ^= ((u32*)fhp->fh_export->ex_uuid)[1];
+		f ^= ((u32*)fhp->fh_export->ex_uuid)[2];
+		f ^= ((u32*)fhp->fh_export->ex_uuid)[3];
+		*p++ = htonl(f);
+		break;
+	}
+	*p++ = htonl((u32) stat->ino);
+	*p++ = htonl((u32) stat->atime.tv_sec);
+	*p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
+	lease_get_mtime(d_inode(dentry), &time); 
+	*p++ = htonl((u32) time.tv_sec);
+	*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); 
+	*p++ = htonl((u32) stat->ctime.tv_sec);
+	*p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
+
+	return p;
+}
+
+/* Helper function for NFSv2 ACL code */
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat)
+{
+	return encode_fattr(rqstp, p, fhp, stat);
+}
+
+/*
+ * XDR decode functions
+ */
+int
+nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_sattrargs *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	p = decode_sattr(p, &args->attrs);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_diropargs *args)
+{
+	if (!(p = decode_fh(p, &args->fh))
+	 || !(p = decode_filename(p, &args->name, &args->len)))
+		return 0;
+
+	 return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_readargs *args)
+{
+	unsigned int len;
+	int v;
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+
+	args->offset    = ntohl(*p++);
+	len = args->count     = ntohl(*p++);
+	p++; /* totalcount - unused */
+
+	len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
+
+	/* set up somewhere to store response.
+	 * We take pages, put them on reslist and include in iovec
+	 */
+	v=0;
+	while (len > 0) {
+		struct page *p = *(rqstp->rq_next_page++);
+
+		rqstp->rq_vec[v].iov_base = page_address(p);
+		rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
+		len -= rqstp->rq_vec[v].iov_len;
+		v++;
+	}
+	args->vlen = v;
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_writeargs *args)
+{
+	unsigned int len, hdr, dlen;
+	int v;
+
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+
+	p++;				/* beginoffset */
+	args->offset = ntohl(*p++);	/* offset */
+	p++;				/* totalcount */
+	len = args->len = ntohl(*p++);
+	/*
+	 * The protocol specifies a maximum of 8192 bytes.
+	 */
+	if (len > NFSSVC_MAXBLKSIZE_V2)
+		return 0;
+
+	/*
+	 * Check to make sure that we got the right number of
+	 * bytes.
+	 */
+	hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
+	dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
+		- hdr;
+
+	/*
+	 * Round the length of the data which was specified up to
+	 * the next multiple of XDR units and then compare that
+	 * against the length which was actually received.
+	 * Note that when RPCSEC/GSS (for example) is used, the
+	 * data buffer can be padded so dlen might be larger
+	 * than required.  It must never be smaller.
+	 */
+	if (dlen < XDR_QUADLEN(len)*4)
+		return 0;
+
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+	v = 0;
+	while (len > rqstp->rq_vec[v].iov_len) {
+		len -= rqstp->rq_vec[v].iov_len;
+		v++;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
+	}
+	rqstp->rq_vec[v].iov_len = len;
+	args->vlen = v + 1;
+	return 1;
+}
+
+int
+nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_createargs *args)
+{
+	if (   !(p = decode_fh(p, &args->fh))
+	    || !(p = decode_filename(p, &args->name, &args->len)))
+		return 0;
+	p = decode_sattr(p, &args->attrs);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_renameargs *args)
+{
+	if (!(p = decode_fh(p, &args->ffh))
+	 || !(p = decode_filename(p, &args->fname, &args->flen))
+	 || !(p = decode_fh(p, &args->tfh))
+	 || !(p = decode_filename(p, &args->tname, &args->tlen)))
+		return 0;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	args->buffer = page_address(*(rqstp->rq_next_page++));
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_linkargs *args)
+{
+	if (!(p = decode_fh(p, &args->ffh))
+	 || !(p = decode_fh(p, &args->tfh))
+	 || !(p = decode_filename(p, &args->tname, &args->tlen)))
+		return 0;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_symlinkargs *args)
+{
+	if (   !(p = decode_fh(p, &args->ffh))
+	    || !(p = decode_filename(p, &args->fname, &args->flen))
+	    || !(p = decode_pathname(p, &args->tname, &args->tlen)))
+		return 0;
+	p = decode_sattr(p, &args->attrs);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+int
+nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_readdirargs *args)
+{
+	p = decode_fh(p, &args->fh);
+	if (!p)
+		return 0;
+	args->cookie = ntohl(*p++);
+	args->count  = ntohl(*p++);
+	args->count  = min_t(u32, args->count, PAGE_SIZE);
+	args->buffer = page_address(*(rqstp->rq_next_page++));
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * XDR encode functions
+ */
+int
+nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_attrstat *resp)
+{
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_diropres *resp)
+{
+	p = encode_fh(p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_readlinkres *resp)
+{
+	*p++ = htonl(resp->len);
+	xdr_ressize_check(rqstp, p);
+	rqstp->rq_res.page_len = resp->len;
+	if (resp->len & 3) {
+		/* need to pad the tail */
+		rqstp->rq_res.tail[0].iov_base = p;
+		*p = 0;
+		rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
+	}
+	return 1;
+}
+
+int
+nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_readres *resp)
+{
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+	*p++ = htonl(resp->count);
+	xdr_ressize_check(rqstp, p);
+
+	/* now update rqstp->rq_res to reflect data as well */
+	rqstp->rq_res.page_len = resp->count;
+	if (resp->count & 3) {
+		/* need to pad the tail */
+		rqstp->rq_res.tail[0].iov_base = p;
+		*p = 0;
+		rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
+	}
+	return 1;
+}
+
+int
+nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_readdirres *resp)
+{
+	xdr_ressize_check(rqstp, p);
+	p = resp->buffer;
+	*p++ = 0;			/* no more entries */
+	*p++ = htonl((resp->common.err == nfserr_eof));
+	rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1;
+
+	return 1;
+}
+
+int
+nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_statfsres *resp)
+{
+	struct kstatfs	*stat = &resp->stats;
+
+	*p++ = htonl(NFSSVC_MAXBLKSIZE_V2);	/* max transfer size */
+	*p++ = htonl(stat->f_bsize);
+	*p++ = htonl(stat->f_blocks);
+	*p++ = htonl(stat->f_bfree);
+	*p++ = htonl(stat->f_bavail);
+	return xdr_ressize_check(rqstp, p);
+}
+
+int
+nfssvc_encode_entry(void *ccdv, const char *name,
+		    int namlen, loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct readdir_cd *ccd = ccdv;
+	struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common);
+	__be32	*p = cd->buffer;
+	int	buflen, slen;
+
+	/*
+	dprintk("nfsd: entry(%.*s off %ld ino %ld)\n",
+			namlen, name, offset, ino);
+	 */
+
+	if (offset > ~((u32) 0)) {
+		cd->common.err = nfserr_fbig;
+		return -EINVAL;
+	}
+	if (cd->offset)
+		*cd->offset = htonl(offset);
+
+	/* truncate filename */
+	namlen = min(namlen, NFS2_MAXNAMLEN);
+	slen = XDR_QUADLEN(namlen);
+
+	if ((buflen = cd->buflen - slen - 4) < 0) {
+		cd->common.err = nfserr_toosmall;
+		return -EINVAL;
+	}
+	if (ino > ~((u32) 0)) {
+		cd->common.err = nfserr_fbig;
+		return -EINVAL;
+	}
+	*p++ = xdr_one;				/* mark entry present */
+	*p++ = htonl((u32) ino);		/* file id */
+	p    = xdr_encode_array(p, name, namlen);/* name length & name */
+	cd->offset = p;			/* remember pointer */
+	*p++ = htonl(~0U);		/* offset of next entry */
+
+	cd->buflen = buflen;
+	cd->buffer = p;
+	cd->common.err = nfs_ok;
+	return 0;
+}
+
+/*
+ * XDR release functions
+ */
+int
+nfssvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
+					struct nfsd_fhandle *resp)
+{
+	fh_put(&resp->fh);
+	return 1;
+}
diff --git a/kernel/fs/nfsd/pnfs.h b/kernel/fs/nfsd/pnfs.h
new file mode 100644
index 000000000..d4c445367
--- /dev/null
+++ b/kernel/fs/nfsd/pnfs.h
@@ -0,0 +1,86 @@
+#ifndef _FS_NFSD_PNFS_H
+#define _FS_NFSD_PNFS_H 1
+
+#ifdef CONFIG_NFSD_V4
+#include <linux/exportfs.h>
+#include <linux/nfsd/export.h>
+
+#include "state.h"
+#include "xdr4.h"
+
+struct xdr_stream;
+
+struct nfsd4_deviceid_map {
+	struct list_head	hash;
+	u64			idx;
+	int			fsid_type;
+	u32			fsid[];
+};
+
+struct nfsd4_layout_ops {
+	u32		notify_types;
+
+	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+			struct nfsd4_getdeviceinfo *gdevp);
+	__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
+			struct nfsd4_getdeviceinfo *gdevp);
+
+	__be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
+			struct nfsd4_layoutget *lgp);
+	__be32 (*encode_layoutget)(struct xdr_stream *,
+			struct nfsd4_layoutget *lgp);
+
+	__be32 (*proc_layoutcommit)(struct inode *inode,
+			struct nfsd4_layoutcommit *lcp);
+};
+
+extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+extern const struct nfsd4_layout_ops bl_layout_ops;
+
+__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, stateid_t *stateid,
+		bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
+__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
+		struct nfs4_layout_stateid *ls);
+__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp);
+__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp);
+int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+		u32 device_generation);
+struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
+#endif /* CONFIG_NFSD_V4 */
+
+#ifdef CONFIG_NFSD_PNFS
+void nfsd4_setup_layout_type(struct svc_export *exp);
+void nfsd4_return_all_client_layouts(struct nfs4_client *);
+void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+		struct nfs4_file *fp);
+int nfsd4_init_pnfs(void);
+void nfsd4_exit_pnfs(void);
+#else
+struct nfs4_client;
+struct nfs4_file;
+
+static inline void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+}
+
+static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+}
+static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+		struct nfs4_file *fp)
+{
+}
+static inline void nfsd4_exit_pnfs(void)
+{
+}
+static inline int nfsd4_init_pnfs(void)
+{
+	return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _FS_NFSD_PNFS_H */
diff --git a/kernel/fs/nfsd/state.h b/kernel/fs/nfsd/state.h
new file mode 100644
index 000000000..dbc4f85a5
--- /dev/null
+++ b/kernel/fs/nfsd/state.h
@@ -0,0 +1,662 @@
+/*
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson <andros@umich.edu>
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _NFSD4_STATE_H
+#define _NFSD4_STATE_H
+
+#include <linux/idr.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include "nfsfh.h"
+
+typedef struct {
+	u32             cl_boot;
+	u32             cl_id;
+} clientid_t;
+
+typedef struct {
+	clientid_t	so_clid;
+	u32		so_id;
+} stateid_opaque_t;
+
+typedef struct {
+	u32                     si_generation;
+	stateid_opaque_t        si_opaque;
+} stateid_t;
+
+#define STATEID_FMT	"(%08x/%08x/%08x/%08x)"
+#define STATEID_VAL(s) \
+	(s)->si_opaque.so_clid.cl_boot, \
+	(s)->si_opaque.so_clid.cl_id, \
+	(s)->si_opaque.so_id, \
+	(s)->si_generation
+
+struct nfsd4_callback {
+	struct nfs4_client *cb_clp;
+	u32 cb_minorversion;
+	struct rpc_message cb_msg;
+	struct nfsd4_callback_ops *cb_ops;
+	struct work_struct cb_work;
+	int cb_status;
+	bool cb_need_restart;
+};
+
+struct nfsd4_callback_ops {
+	void (*prepare)(struct nfsd4_callback *);
+	int (*done)(struct nfsd4_callback *, struct rpc_task *);
+	void (*release)(struct nfsd4_callback *);
+};
+
+/*
+ * A core object that represents a "common" stateid. These are generally
+ * embedded within the different (more specific) stateid objects and contain
+ * fields that are of general use to any stateid.
+ */
+struct nfs4_stid {
+	atomic_t sc_count;
+#define NFS4_OPEN_STID 1
+#define NFS4_LOCK_STID 2
+#define NFS4_DELEG_STID 4
+/* For an open stateid kept around *only* to process close replays: */
+#define NFS4_CLOSED_STID 8
+/* For a deleg stateid kept around only to process free_stateid's: */
+#define NFS4_REVOKED_DELEG_STID 16
+#define NFS4_CLOSED_DELEG_STID 32
+#define NFS4_LAYOUT_STID 64
+	unsigned char sc_type;
+	stateid_t sc_stateid;
+	struct nfs4_client *sc_client;
+	struct nfs4_file *sc_file;
+	void (*sc_free)(struct nfs4_stid *);
+};
+
+/*
+ * Represents a delegation stateid. The nfs4_client holds references to these
+ * and they are put when it is being destroyed or when the delegation is
+ * returned by the client:
+ *
+ * o 1 reference as long as a delegation is still in force (taken when it's
+ *   alloc'd, put when it's returned or revoked)
+ *
+ * o 1 reference as long as a recall rpc is in progress (taken when the lease
+ *   is broken, put when the rpc exits)
+ *
+ * o 1 more ephemeral reference for each nfsd thread currently doing something
+ *   with that delegation without holding the cl_lock
+ *
+ * If the server attempts to recall a delegation and the client doesn't do so
+ * before a timeout, the server may also revoke the delegation. In that case,
+ * the object will either be destroyed (v4.0) or moved to a per-client list of
+ * revoked delegations (v4.1+).
+ *
+ * This object is a superset of the nfs4_stid.
+ */
+struct nfs4_delegation {
+	struct nfs4_stid	dl_stid; /* must be first field */
+	struct list_head	dl_perfile;
+	struct list_head	dl_perclnt;
+	struct list_head	dl_recall_lru;  /* delegation recalled */
+	struct nfs4_clnt_odstate *dl_clnt_odstate;
+	u32			dl_type;
+	time_t			dl_time;
+/* For recall: */
+	int			dl_retries;
+	struct nfsd4_callback	dl_recall;
+};
+
+#define cb_to_delegation(cb) \
+	container_of(cb, struct nfs4_delegation, dl_recall)
+
+/* client delegation callback info */
+struct nfs4_cb_conn {
+	/* SETCLIENTID info */
+	struct sockaddr_storage	cb_addr;
+	struct sockaddr_storage	cb_saddr;
+	size_t			cb_addrlen;
+	u32                     cb_prog; /* used only in 4.0 case;
+					    per-session otherwise */
+	u32                     cb_ident;	/* minorversion 0 only */
+	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
+};
+
+static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
+{
+	return container_of(s, struct nfs4_delegation, dl_stid);
+}
+
+/* Maximum number of slots per session. 160 is useful for long haul TCP */
+#define NFSD_MAX_SLOTS_PER_SESSION     160
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND	16
+/* Maximum  session per slot cache size */
+#define NFSD_SLOT_CACHE_SIZE		2048
+/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
+#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION	32
+#define NFSD_MAX_MEM_PER_SESSION  \
+		(NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)
+
+struct nfsd4_slot {
+	u32	sl_seqid;
+	__be32	sl_status;
+	u32	sl_datalen;
+	u16	sl_opcnt;
+#define NFSD4_SLOT_INUSE	(1 << 0)
+#define NFSD4_SLOT_CACHETHIS	(1 << 1)
+#define NFSD4_SLOT_INITIALIZED	(1 << 2)
+	u8	sl_flags;
+	char	sl_data[];
+};
+
+struct nfsd4_channel_attrs {
+	u32		headerpadsz;
+	u32		maxreq_sz;
+	u32		maxresp_sz;
+	u32		maxresp_cached;
+	u32		maxops;
+	u32		maxreqs;
+	u32		nr_rdma_attrs;
+	u32		rdma_attrs;
+};
+
+struct nfsd4_cb_sec {
+	u32	flavor; /* (u32)(-1) used to mean "no valid flavor" */
+	kuid_t	uid;
+	kgid_t	gid;
+};
+
+struct nfsd4_create_session {
+	clientid_t			clientid;
+	struct nfs4_sessionid		sessionid;
+	u32				seqid;
+	u32				flags;
+	struct nfsd4_channel_attrs	fore_channel;
+	struct nfsd4_channel_attrs	back_channel;
+	u32				callback_prog;
+	struct nfsd4_cb_sec		cb_sec;
+};
+
+struct nfsd4_backchannel_ctl {
+	u32	bc_cb_program;
+	struct nfsd4_cb_sec		bc_cb_sec;
+};
+
+struct nfsd4_bind_conn_to_session {
+	struct nfs4_sessionid		sessionid;
+	u32				dir;
+};
+
+/* The single slot clientid cache structure */
+struct nfsd4_clid_slot {
+	u32				sl_seqid;
+	__be32				sl_status;
+	struct nfsd4_create_session	sl_cr_ses;
+};
+
+struct nfsd4_conn {
+	struct list_head cn_persession;
+	struct svc_xprt *cn_xprt;
+	struct svc_xpt_user cn_xpt_user;
+	struct nfsd4_session *cn_session;
+/* CDFC4_FORE, CDFC4_BACK: */
+	unsigned char cn_flags;
+};
+
+/*
+ * Representation of a v4.1+ session. These are refcounted in a similar fashion
+ * to the nfs4_client. References are only taken when the server is actively
+ * working on the object (primarily during the processing of compounds).
+ */
+struct nfsd4_session {
+	atomic_t		se_ref;
+	struct list_head	se_hash;	/* hash by sessionid */
+	struct list_head	se_perclnt;
+/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */
+#define NFS4_SESSION_DEAD	0x010
+	u32			se_flags;
+	struct nfs4_client	*se_client;
+	struct nfs4_sessionid	se_sessionid;
+	struct nfsd4_channel_attrs se_fchannel;
+	struct nfsd4_channel_attrs se_bchannel;
+	struct nfsd4_cb_sec	se_cb_sec;
+	struct list_head	se_conns;
+	u32			se_cb_prog;
+	u32			se_cb_seq_nr;
+	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
+};
+
+/* formatted contents of nfs4_sessionid */
+struct nfsd4_sessionid {
+	clientid_t	clientid;
+	u32		sequence;
+	u32		reserved;
+};
+
+#define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
+
+/*
+ * struct nfs4_client - one per client.  Clientids live here.
+ *
+ * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
+ * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped.
+ * Each nfsd_net_ns object contains a set of these and they are tracked via
+ * short and long form clientid. They are hashed and searched for under the
+ * per-nfsd_net client_lock spinlock.
+ *
+ * References to it are only held during the processing of compounds, and in
+ * certain other operations. In their "resting state" they have a refcount of
+ * 0. If they are not renewed within a lease period, they become eligible for
+ * destruction by the laundromat.
+ *
+ * These objects can also be destroyed prematurely by the fault injection code,
+ * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates.
+ * Care is taken *not* to do this however when the objects have an elevated
+ * refcount.
+ *
+ * o Each nfs4_client is hashed by clientid
+ *
+ * o Each nfs4_clients is also hashed by name (the opaque quantity initially
+ *   sent by the client to identify itself).
+ * 	  
+ * o cl_perclient list is used to ensure no dangling stateowner references
+ *   when we expire the nfs4_client
+ */
+struct nfs4_client {
+	struct list_head	cl_idhash; 	/* hash by cl_clientid.id */
+	struct rb_node		cl_namenode;	/* link into by-name trees */
+	struct list_head	*cl_ownerstr_hashtbl;
+	struct list_head	cl_openowners;
+	struct idr		cl_stateids;	/* stateid lookup */
+	struct list_head	cl_delegations;
+	struct list_head	cl_revoked;	/* unacknowledged, revoked 4.1 state */
+	struct list_head        cl_lru;         /* tail queue */
+#ifdef CONFIG_NFSD_PNFS
+	struct list_head	cl_lo_states;	/* outstanding layout states */
+#endif
+	struct xdr_netobj	cl_name; 	/* id generated by client */
+	nfs4_verifier		cl_verifier; 	/* generated by client */
+	time_t                  cl_time;        /* time of last lease renewal */
+	struct sockaddr_storage	cl_addr; 	/* client ipaddress */
+	bool			cl_mach_cred;	/* SP4_MACH_CRED in force */
+	struct svc_cred		cl_cred; 	/* setclientid principal */
+	clientid_t		cl_clientid;	/* generated by server */
+	nfs4_verifier		cl_confirm;	/* generated by server */
+	u32			cl_minorversion;
+
+	/* for v4.0 and v4.1 callbacks: */
+	struct nfs4_cb_conn	cl_cb_conn;
+#define NFSD4_CLIENT_CB_UPDATE		(0)
+#define NFSD4_CLIENT_CB_KILL		(1)
+#define NFSD4_CLIENT_STABLE		(2)	/* client on stable storage */
+#define NFSD4_CLIENT_RECLAIM_COMPLETE	(3)	/* reclaim_complete done */
+#define NFSD4_CLIENT_CONFIRMED		(4)	/* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK	(5)	/* upcall serialization */
+#define NFSD4_CLIENT_CB_FLAG_MASK	(1 << NFSD4_CLIENT_CB_UPDATE | \
+					 1 << NFSD4_CLIENT_CB_KILL)
+	unsigned long		cl_flags;
+	struct rpc_cred		*cl_cb_cred;
+	struct rpc_clnt		*cl_cb_client;
+	u32			cl_cb_ident;
+#define NFSD4_CB_UP		0
+#define NFSD4_CB_UNKNOWN	1
+#define NFSD4_CB_DOWN		2
+#define NFSD4_CB_FAULT		3
+	int			cl_cb_state;
+	struct nfsd4_callback	cl_cb_null;
+	struct nfsd4_session	*cl_cb_session;
+
+	/* for all client information that callback code might need: */
+	spinlock_t		cl_lock;
+
+	/* for nfs41 */
+	struct list_head	cl_sessions;
+	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
+	u32			cl_exchange_flags;
+	/* number of rpc's in progress over an associated session: */
+	atomic_t		cl_refcount;
+
+	/* for nfs41 callbacks */
+	/* We currently support a single back channel with a single slot */
+	unsigned long		cl_cb_slot_busy;
+	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
+						/* wait here for slots */
+	struct net		*net;
+};
+
+/* struct nfs4_client_reset
+ * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
+ * upon lease reset, or from upcall to state_daemon (to read in state
+ * from non-volitile storage) upon reboot.
+ */
+struct nfs4_client_reclaim {
+	struct list_head	cr_strhash;	/* hash by cr_name */
+	struct nfs4_client	*cr_clp;	/* pointer to associated clp */
+	char			cr_recdir[HEXDIR_LEN]; /* recover dir */
+};
+
+static inline void
+update_stateid(stateid_t *stateid)
+{
+	stateid->si_generation++;
+	/* Wraparound recommendation from 3530bis-13 9.1.3.2: */
+	if (stateid->si_generation == 0)
+		stateid->si_generation = 1;
+}
+
+/* A reasonable value for REPLAY_ISIZE was estimated as follows:  
+ * The OPEN response, typically the largest, requires 
+ *   4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) +  8(verifier) + 
+ *   4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + 
+ *   20(deleg. space limit) + ~32(deleg. ace) = 112 bytes 
+ */
+
+#define NFSD4_REPLAY_ISIZE       112 
+
+/*
+ * Replay buffer, where the result of the last seqid-mutating operation 
+ * is cached. 
+ */
+struct nfs4_replay {
+	__be32			rp_status;
+	unsigned int		rp_buflen;
+	char			*rp_buf;
+	struct knfsd_fh		rp_openfh;
+	struct mutex		rp_mutex;
+	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
+};
+
+struct nfs4_stateowner;
+
+struct nfs4_stateowner_operations {
+	void (*so_unhash)(struct nfs4_stateowner *);
+	void (*so_free)(struct nfs4_stateowner *);
+};
+
+/*
+ * A core object that represents either an open or lock owner. The object and
+ * lock owner objects have one of these embedded within them. Refcounts and
+ * other fields common to both owner types are contained within these
+ * structures.
+ */
+struct nfs4_stateowner {
+	struct list_head			so_strhash;
+	struct list_head			so_stateids;
+	struct nfs4_client			*so_client;
+	const struct nfs4_stateowner_operations	*so_ops;
+	/* after increment in nfsd4_bump_seqid, represents the next
+	 * sequence id expected from the client: */
+	atomic_t				so_count;
+	u32					so_seqid;
+	struct xdr_netobj			so_owner; /* open owner name */
+	struct nfs4_replay			so_replay;
+	bool					so_is_open_owner;
+};
+
+/*
+ * When a file is opened, the client provides an open state owner opaque string
+ * that indicates the "owner" of that open. These objects are refcounted.
+ * References to it are held by each open state associated with it. This object
+ * is a superset of the nfs4_stateowner struct.
+ */
+struct nfs4_openowner {
+	struct nfs4_stateowner	oo_owner; /* must be first field */
+	struct list_head        oo_perclient;
+	/*
+	 * We keep around openowners a little while after last close,
+	 * which saves clients from having to confirm, and allows us to
+	 * handle close replays if they come soon enough.  The close_lru
+	 * is a list of such openowners, to be reaped by the laundromat
+	 * thread eventually if they remain unused:
+	 */
+	struct list_head	oo_close_lru;
+	struct nfs4_ol_stateid *oo_last_closed_stid;
+	time_t			oo_time; /* time of placement on so_close_lru */
+#define NFS4_OO_CONFIRMED   1
+	unsigned char		oo_flags;
+};
+
+/*
+ * Represents a generic "lockowner". Similar to an openowner. References to it
+ * are held by the lock stateids that are created on its behalf. This object is
+ * a superset of the nfs4_stateowner struct (or would be if it needed any extra
+ * fields).
+ */
+struct nfs4_lockowner {
+	struct nfs4_stateowner	lo_owner; /* must be first element */
+};
+
+static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
+{
+	return container_of(so, struct nfs4_openowner, oo_owner);
+}
+
+static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
+{
+	return container_of(so, struct nfs4_lockowner, lo_owner);
+}
+
+/*
+ * Per-client state indicating no. of opens and outstanding delegations
+ * on a file from a particular client.'od' stands for 'open & delegation'
+ */
+struct nfs4_clnt_odstate {
+	struct nfs4_client	*co_client;
+	struct nfs4_file	*co_file;
+	struct list_head	co_perfile;
+	atomic_t		co_odcount;
+};
+
+/*
+ * nfs4_file: a file opened by some number of (open) nfs4_stateowners.
+ *
+ * These objects are global. nfsd keeps one instance of a nfs4_file per
+ * filehandle (though it may keep multiple file descriptors for each). Each
+ * inode can have multiple filehandles associated with it, so there is
+ * (potentially) a many to one relationship between this struct and struct
+ * inode.
+ *
+ * These are hashed by filehandle in the file_hashtbl, which is protected by
+ * the global state_lock spinlock.
+ */
+struct nfs4_file {
+	atomic_t		fi_ref;
+	spinlock_t		fi_lock;
+	struct hlist_node       fi_hash;	/* hash on fi_fhandle */
+	struct list_head        fi_stateids;
+	union {
+		struct list_head	fi_delegations;
+		struct rcu_head		fi_rcu;
+	};
+	struct list_head	fi_clnt_odstate;
+	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
+	struct file *		fi_fds[3];
+	/*
+	 * Each open or lock stateid contributes 0-4 to the counts
+	 * below depending on which bits are set in st_access_bitmap:
+	 *     1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set
+	 *   + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set
+	 *   + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
+	 */
+	atomic_t		fi_access[2];
+	u32			fi_share_deny;
+	struct file		*fi_deleg_file;
+	int			fi_delegees;
+	struct knfsd_fh		fi_fhandle;
+	bool			fi_had_conflict;
+#ifdef CONFIG_NFSD_PNFS
+	struct list_head	fi_lo_states;
+	atomic_t		fi_lo_recalls;
+#endif
+};
+
+/*
+ * A generic struct representing either a open or lock stateid. The nfs4_client
+ * holds a reference to each of these objects, and they in turn hold a
+ * reference to their respective stateowners. The client's reference is
+ * released in response to a close or unlock (depending on whether it's an open
+ * or lock stateid) or when the client is being destroyed.
+ *
+ * In the case of v4.0 open stateids, these objects are preserved for a little
+ * while after close in order to handle CLOSE replays. Those are eventually
+ * reclaimed via a LRU scheme by the laundromat.
+ *
+ * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock".
+ * Better suggestions welcome.
+ */
+struct nfs4_ol_stateid {
+	struct nfs4_stid    st_stid; /* must be first field */
+	struct list_head              st_perfile;
+	struct list_head              st_perstateowner;
+	struct list_head              st_locks;
+	struct nfs4_stateowner      * st_stateowner;
+	struct nfs4_clnt_odstate    * st_clnt_odstate;
+	unsigned char                 st_access_bmap;
+	unsigned char                 st_deny_bmap;
+	struct nfs4_ol_stateid         * st_openstp;
+};
+
+static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
+{
+	return container_of(s, struct nfs4_ol_stateid, st_stid);
+}
+
+struct nfs4_layout_stateid {
+	struct nfs4_stid		ls_stid;
+	struct list_head		ls_perclnt;
+	struct list_head		ls_perfile;
+	spinlock_t			ls_lock;
+	struct list_head		ls_layouts;
+	u32				ls_layout_type;
+	struct file			*ls_file;
+	struct nfsd4_callback		ls_recall;
+	stateid_t			ls_recall_sid;
+	bool				ls_recalled;
+};
+
+static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
+{
+	return container_of(s, struct nfs4_layout_stateid, ls_stid);
+}
+
+/* flags for preprocess_seqid_op() */
+#define RD_STATE	        0x00000010
+#define WR_STATE	        0x00000020
+
+enum nfsd4_cb_op {
+	NFSPROC4_CLNT_CB_NULL = 0,
+	NFSPROC4_CLNT_CB_RECALL,
+	NFSPROC4_CLNT_CB_LAYOUT,
+	NFSPROC4_CLNT_CB_SEQUENCE,
+};
+
+
+struct nfsd4_compound_state;
+struct nfsd_net;
+
+extern __be32 nfs4_preprocess_stateid_op(struct net *net,
+		struct nfsd4_compound_state *cstate,
+		stateid_t *stateid, int flags, struct file **filp);
+__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+		     stateid_t *stateid, unsigned char typemask,
+		     struct nfs4_stid **s, struct nfsd_net *nn);
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+		struct kmem_cache *slab);
+void nfs4_unhash_stid(struct nfs4_stid *s);
+void nfs4_put_stid(struct nfs4_stid *s);
+void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
+extern void nfs4_release_reclaim(struct nfsd_net *);
+extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
+							struct nfsd_net *nn);
+extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
+		struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
+extern int set_callback_cred(void);
+extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
+extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+extern void nfsd4_run_cb(struct nfsd4_callback *cb);
+extern int nfsd4_create_callback_queue(void);
+extern void nfsd4_destroy_callback_queue(void);
+extern void nfsd4_shutdown_callback(struct nfs4_client *);
+extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
+extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
+							struct nfsd_net *nn);
+extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
+
+struct nfs4_file *find_file(struct knfsd_fh *fh);
+void put_nfs4_file(struct nfs4_file *fi);
+static inline void get_nfs4_file(struct nfs4_file *fi)
+{
+	atomic_inc(&fi->fi_ref);
+}
+struct file *find_any_file(struct nfs4_file *f);
+
+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
+
+/* nfs4recover operations */
+extern int nfsd4_client_tracking_init(struct net *net);
+extern void nfsd4_client_tracking_exit(struct net *net);
+extern void nfsd4_client_record_create(struct nfs4_client *clp);
+extern void nfsd4_client_record_remove(struct nfs4_client *clp);
+extern int nfsd4_client_record_check(struct nfs4_client *clp);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);
+
+/* nfs fault injection functions */
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+int nfsd_fault_inject_init(void);
+void nfsd_fault_inject_cleanup(void);
+
+u64 nfsd_inject_print_clients(void);
+u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_clients(u64);
+
+u64 nfsd_inject_print_locks(void);
+u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_locks(u64);
+
+u64 nfsd_inject_print_openowners(void);
+u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_openowners(u64);
+
+u64 nfsd_inject_print_delegations(void);
+u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_delegations(u64);
+u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_recall_delegations(u64);
+#else /* CONFIG_NFSD_FAULT_INJECTION */
+static inline int nfsd_fault_inject_init(void) { return 0; }
+static inline void nfsd_fault_inject_cleanup(void) {}
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
+#endif   /* NFSD4_STATE_H */
diff --git a/kernel/fs/nfsd/stats.c b/kernel/fs/nfsd/stats.c
new file mode 100644
index 000000000..cd90878a7
--- /dev/null
+++ b/kernel/fs/nfsd/stats.c
@@ -0,0 +1,104 @@
+/*
+ * procfs-based user access to knfsd statistics
+ *
+ * /proc/net/rpc/nfsd
+ *
+ * Format:
+ *	rc <hits> <misses> <nocache>
+ *			Statistsics for the reply cache
+ *	fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache>
+ *			statistics for filehandle lookup
+ *	io <bytes-read> <bytes-written>
+ *			statistics for IO throughput
+ *	th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%> 
+ *			time (seconds) when nfsd thread usage above thresholds
+ *			and number of times that all threads were in use
+ *	ra cache-size  <10%  <20%  <30% ... <100% not-found
+ *			number of times that read-ahead entry was found that deep in
+ *			the cache.
+ *	plus generic RPC stats (see net/sunrpc/stats.c)
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/sunrpc/stats.h>
+#include <net/net_namespace.h>
+
+#include "nfsd.h"
+
+struct nfsd_stats	nfsdstats;
+struct svc_stat		nfsd_svcstats = {
+	.program	= &nfsd_program,
+};
+
+static int nfsd_proc_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n",
+		      nfsdstats.rchits,
+		      nfsdstats.rcmisses,
+		      nfsdstats.rcnocache,
+		      nfsdstats.fh_stale,
+		      nfsdstats.fh_lookup,
+		      nfsdstats.fh_anon,
+		      nfsdstats.fh_nocache_dir,
+		      nfsdstats.fh_nocache_nondir,
+		      nfsdstats.io_read,
+		      nfsdstats.io_write);
+	/* thread usage: */
+	seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt);
+	for (i=0; i<10; i++) {
+		unsigned int jifs = nfsdstats.th_usage[i];
+		unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ;
+		seq_printf(seq, " %u.%03u", sec, msec);
+	}
+
+	/* newline and ra-cache */
+	seq_printf(seq, "\nra %u", nfsdstats.ra_size);
+	for (i=0; i<11; i++)
+		seq_printf(seq, " %u", nfsdstats.ra_depth[i]);
+	seq_putc(seq, '\n');
+	
+	/* show my rpc info */
+	svc_seq_show(seq, &nfsd_svcstats);
+
+#ifdef CONFIG_NFSD_V4
+	/* Show count for individual nfsv4 operations */
+	/* Writing operation numbers 0 1 2 also for maintaining uniformity */
+	seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
+	for (i = 0; i <= LAST_NFS4_OP; i++)
+		seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]);
+
+	seq_putc(seq, '\n');
+#endif
+
+	return 0;
+}
+
+static int nfsd_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, nfsd_proc_show, NULL);
+}
+
+static const struct file_operations nfsd_proc_fops = {
+	.owner = THIS_MODULE,
+	.open = nfsd_proc_open,
+	.read  = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+void
+nfsd_stat_init(void)
+{
+	svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops);
+}
+
+void
+nfsd_stat_shutdown(void)
+{
+	svc_proc_unregister(&init_net, "nfsd");
+}
diff --git a/kernel/fs/nfsd/stats.h b/kernel/fs/nfsd/stats.h
new file mode 100644
index 000000000..a5c944b77
--- /dev/null
+++ b/kernel/fs/nfsd/stats.h
@@ -0,0 +1,43 @@
+/*
+ * Statistics for NFS server.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef _NFSD_STATS_H
+#define _NFSD_STATS_H
+
+#include <uapi/linux/nfsd/stats.h>
+
+
+struct nfsd_stats {
+	unsigned int	rchits;		/* repcache hits */
+	unsigned int	rcmisses;	/* repcache hits */
+	unsigned int	rcnocache;	/* uncached reqs */
+	unsigned int	fh_stale;	/* FH stale error */
+	unsigned int	fh_lookup;	/* dentry cached */
+	unsigned int	fh_anon;	/* anon file dentry returned */
+	unsigned int	fh_nocache_dir;	/* filehandle not found in dcache */
+	unsigned int	fh_nocache_nondir;	/* filehandle not found in dcache */
+	unsigned int	io_read;	/* bytes returned to read requests */
+	unsigned int	io_write;	/* bytes passed in write requests */
+	unsigned int	th_cnt;		/* number of available threads */
+	unsigned int	th_usage[10];	/* number of ticks during which n perdeciles
+					 * of available threads were in use */
+	unsigned int	th_fullcnt;	/* number of times last free thread was used */
+	unsigned int	ra_size;	/* size of ra cache */
+	unsigned int	ra_depth[11];	/* number of times ra entry was found that deep
+					 * in the cache (10percentiles). [10] = not found */
+#ifdef CONFIG_NFSD_V4
+	unsigned int	nfs4_opcount[LAST_NFS4_OP + 1];	/* count of individual nfsv4 operations */
+#endif
+
+};
+
+
+extern struct nfsd_stats	nfsdstats;
+extern struct svc_stat		nfsd_svcstats;
+
+void	nfsd_stat_init(void);
+void	nfsd_stat_shutdown(void);
+
+#endif /* _NFSD_STATS_H */
diff --git a/kernel/fs/nfsd/trace.c b/kernel/fs/nfsd/trace.c
new file mode 100644
index 000000000..82f890705
--- /dev/null
+++ b/kernel/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
+
+#include "state.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/kernel/fs/nfsd/trace.h b/kernel/fs/nfsd/trace.h
new file mode 100644
index 000000000..c668520c3
--- /dev/null
+++ b/kernel/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfsd
+
+#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NFSD_TRACE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(nfsd_stateid_class,
+	TP_PROTO(stateid_t *stp),
+	TP_ARGS(stp),
+	TP_STRUCT__entry(
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, si_id)
+		__field(u32, si_generation)
+	),
+	TP_fast_assign(
+		__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+		__entry->cl_id = stp->si_opaque.so_clid.cl_id;
+		__entry->si_id = stp->si_opaque.so_id;
+		__entry->si_generation = stp->si_generation;
+	),
+	TP_printk("client %08x:%08x stateid %08x:%08x",
+		__entry->cl_boot,
+		__entry->cl_id,
+		__entry->si_id,
+		__entry->si_generation)
+)
+
+#define DEFINE_STATEID_EVENT(name) \
+DEFINE_EVENT(nfsd_stateid_class, name, \
+	TP_PROTO(stateid_t *stp), \
+	TP_ARGS(stp))
+DEFINE_STATEID_EVENT(layoutstate_alloc);
+DEFINE_STATEID_EVENT(layoutstate_unhash);
+DEFINE_STATEID_EVENT(layoutstate_free);
+DEFINE_STATEID_EVENT(layout_get_lookup_fail);
+DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
+DEFINE_STATEID_EVENT(layout_return_lookup_fail);
+DEFINE_STATEID_EVENT(layout_recall);
+DEFINE_STATEID_EVENT(layout_recall_done);
+DEFINE_STATEID_EVENT(layout_recall_fail);
+DEFINE_STATEID_EVENT(layout_recall_release);
+
+#endif /* _NFSD_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/kernel/fs/nfsd/vfs.c b/kernel/fs/nfsd/vfs.c
new file mode 100644
index 000000000..84d770be0
--- /dev/null
+++ b/kernel/fs/nfsd/vfs.c
@@ -0,0 +1,2154 @@
+/*
+ * File operations used by nfsd. Some of these have been ripped from
+ * other parts of the kernel because they weren't exported, others
+ * are partial duplicates with added or changed functionality.
+ *
+ * Note that several functions dget() the dentry upon which they want
+ * to act, most notably those that create directory entries. Response
+ * dentry's are dput()'d if necessary in the release callback.
+ * So if you notice code paths that apparently fail to dput() the
+ * dentry, don't worry--they have been taken care of.
+ *
+ * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de>
+ * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
+ */
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/splice.h>
+#include <linux/falloc.h>
+#include <linux/fcntl.h>
+#include <linux/namei.h>
+#include <linux/delay.h>
+#include <linux/fsnotify.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+#include <linux/jhash.h>
+#include <linux/ima.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/exportfs.h>
+#include <linux/writeback.h>
+#include <linux/security.h>
+
+#ifdef CONFIG_NFSD_V3
+#include "xdr3.h"
+#endif /* CONFIG_NFSD_V3 */
+
+#ifdef CONFIG_NFSD_V4
+#include "acl.h"
+#include "idmap.h"
+#endif /* CONFIG_NFSD_V4 */
+
+#include "nfsd.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_FILEOP
+
+
+/*
+ * This is a cache of readahead params that help us choose the proper
+ * readahead strategy. Initially, we set all readahead parameters to 0
+ * and let the VFS handle things.
+ * If you increase the number of cached files very much, you'll need to
+ * add a hash table here.
+ */
+struct raparms {
+	struct raparms		*p_next;
+	unsigned int		p_count;
+	ino_t			p_ino;
+	dev_t			p_dev;
+	int			p_set;
+	struct file_ra_state	p_ra;
+	unsigned int		p_hindex;
+};
+
+struct raparm_hbucket {
+	struct raparms		*pb_head;
+	spinlock_t		pb_lock;
+} ____cacheline_aligned_in_smp;
+
+#define RAPARM_HASH_BITS	4
+#define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
+#define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
+static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
+
+/* 
+ * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
+ * a mount point.
+ * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged,
+ *  or nfs_ok having possibly changed *dpp and *expp
+ */
+int
+nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 
+		        struct svc_export **expp)
+{
+	struct svc_export *exp = *expp, *exp2 = NULL;
+	struct dentry *dentry = *dpp;
+	struct path path = {.mnt = mntget(exp->ex_path.mnt),
+			    .dentry = dget(dentry)};
+	int err = 0;
+
+	err = follow_down(&path);
+	if (err < 0)
+		goto out;
+
+	exp2 = rqst_exp_get_by_name(rqstp, &path);
+	if (IS_ERR(exp2)) {
+		err = PTR_ERR(exp2);
+		/*
+		 * We normally allow NFS clients to continue
+		 * "underneath" a mountpoint that is not exported.
+		 * The exception is V4ROOT, where no traversal is ever
+		 * allowed without an explicit export of the new
+		 * directory.
+		 */
+		if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT))
+			err = 0;
+		path_put(&path);
+		goto out;
+	}
+	if (nfsd_v4client(rqstp) ||
+		(exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
+		/* successfully crossed mount point */
+		/*
+		 * This is subtle: path.dentry is *not* on path.mnt
+		 * at this point.  The only reason we are safe is that
+		 * original mnt is pinned down by exp, so we should
+		 * put path *before* putting exp
+		 */
+		*dpp = path.dentry;
+		path.dentry = dentry;
+		*expp = exp2;
+		exp2 = exp;
+	}
+	path_put(&path);
+	exp_put(exp2);
+out:
+	return err;
+}
+
+static void follow_to_parent(struct path *path)
+{
+	struct dentry *dp;
+
+	while (path->dentry == path->mnt->mnt_root && follow_up(path))
+		;
+	dp = dget_parent(path->dentry);
+	dput(path->dentry);
+	path->dentry = dp;
+}
+
+static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
+{
+	struct svc_export *exp2;
+	struct path path = {.mnt = mntget((*exp)->ex_path.mnt),
+			    .dentry = dget(dparent)};
+
+	follow_to_parent(&path);
+
+	exp2 = rqst_exp_parent(rqstp, &path);
+	if (PTR_ERR(exp2) == -ENOENT) {
+		*dentryp = dget(dparent);
+	} else if (IS_ERR(exp2)) {
+		path_put(&path);
+		return PTR_ERR(exp2);
+	} else {
+		*dentryp = dget(path.dentry);
+		exp_put(*exp);
+		*exp = exp2;
+	}
+	path_put(&path);
+	return 0;
+}
+
+/*
+ * For nfsd purposes, we treat V4ROOT exports as though there was an
+ * export at *every* directory.
+ */
+int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
+{
+	if (d_mountpoint(dentry))
+		return 1;
+	if (nfsd4_is_junction(dentry))
+		return 1;
+	if (!(exp->ex_flags & NFSEXP_V4ROOT))
+		return 0;
+	return d_inode(dentry) != NULL;
+}
+
+__be32
+nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		   const char *name, unsigned int len,
+		   struct svc_export **exp_ret, struct dentry **dentry_ret)
+{
+	struct svc_export	*exp;
+	struct dentry		*dparent;
+	struct dentry		*dentry;
+	int			host_err;
+
+	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
+
+	dparent = fhp->fh_dentry;
+	exp = exp_get(fhp->fh_export);
+
+	/* Lookup the name, but don't follow links */
+	if (isdotent(name, len)) {
+		if (len==1)
+			dentry = dget(dparent);
+		else if (dparent != exp->ex_path.dentry)
+			dentry = dget_parent(dparent);
+		else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp))
+			dentry = dget(dparent); /* .. == . just like at / */
+		else {
+			/* checking mountpoint crossing is very different when stepping up */
+			host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry);
+			if (host_err)
+				goto out_nfserr;
+		}
+	} else {
+		/*
+		 * In the nfsd4_open() case, this may be held across
+		 * subsequent open and delegation acquisition which may
+		 * need to take the child's i_mutex:
+		 */
+		fh_lock_nested(fhp, I_MUTEX_PARENT);
+		dentry = lookup_one_len(name, dparent, len);
+		host_err = PTR_ERR(dentry);
+		if (IS_ERR(dentry))
+			goto out_nfserr;
+		/*
+		 * check if we have crossed a mount point ...
+		 */
+		if (nfsd_mountpoint(dentry, exp)) {
+			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+				dput(dentry);
+				goto out_nfserr;
+			}
+		}
+	}
+	*dentry_ret = dentry;
+	*exp_ret = exp;
+	return 0;
+
+out_nfserr:
+	exp_put(exp);
+	return nfserrno(host_err);
+}
+
+/*
+ * Look up one component of a pathname.
+ * N.B. After this call _both_ fhp and resfh need an fh_put
+ *
+ * If the lookup would cross a mountpoint, and the mounted filesystem
+ * is exported to the client with NFSEXP_NOHIDE, then the lookup is
+ * accepted as it stands and the mounted directory is
+ * returned. Otherwise the covered directory is returned.
+ * NOTE: this mountpoint crossing is not supported properly by all
+ *   clients and is explicitly disallowed for NFSv3
+ *      NeilBrown <neilb@cse.unsw.edu.au>
+ */
+__be32
+nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
+				unsigned int len, struct svc_fh *resfh)
+{
+	struct svc_export	*exp;
+	struct dentry		*dentry;
+	__be32 err;
+
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+	if (err)
+		return err;
+	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
+	if (err)
+		return err;
+	err = check_nfsd_access(exp, rqstp);
+	if (err)
+		goto out;
+	/*
+	 * Note: we compose the file handle now, but as the
+	 * dentry may be negative, it may need to be updated.
+	 */
+	err = fh_compose(resfh, exp, dentry, fhp);
+	if (!err && d_really_is_negative(dentry))
+		err = nfserr_noent;
+out:
+	dput(dentry);
+	exp_put(exp);
+	return err;
+}
+
+/*
+ * Commit metadata changes to stable storage.
+ */
+static int
+commit_metadata(struct svc_fh *fhp)
+{
+	struct inode *inode = d_inode(fhp->fh_dentry);
+	const struct export_operations *export_ops = inode->i_sb->s_export_op;
+
+	if (!EX_ISSYNC(fhp->fh_export))
+		return 0;
+
+	if (export_ops->commit_metadata)
+		return export_ops->commit_metadata(inode);
+	return sync_inode_metadata(inode, 1);
+}
+
+/*
+ * Go over the attributes and take care of the small differences between
+ * NFS semantics and what Linux expects.
+ */
+static void
+nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
+{
+	/*
+	 * NFSv2 does not differentiate between "set-[ac]time-to-now"
+	 * which only requires access, and "set-[ac]time-to-X" which
+	 * requires ownership.
+	 * So if it looks like it might be "set both to the same time which
+	 * is close to now", and if inode_change_ok fails, then we
+	 * convert to "set to now" instead of "set to explicit time"
+	 *
+	 * We only call inode_change_ok as the last test as technically
+	 * it is not an interface that we should be using.
+	 */
+#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
+#define	MAX_TOUCH_TIME_ERROR (30*60)
+	if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET &&
+	    iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) {
+		/*
+		 * Looks probable.
+		 *
+		 * Now just make sure time is in the right ballpark.
+		 * Solaris, at least, doesn't seem to care what the time
+		 * request is.  We require it be within 30 minutes of now.
+		 */
+		time_t delta = iap->ia_atime.tv_sec - get_seconds();
+		if (delta < 0)
+			delta = -delta;
+		if (delta < MAX_TOUCH_TIME_ERROR &&
+		    inode_change_ok(inode, iap) != 0) {
+			/*
+			 * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
+			 * This will cause notify_change to set these times
+			 * to "now"
+			 */
+			iap->ia_valid &= ~BOTH_TIME_SET;
+		}
+	}
+
+	/* sanitize the mode change */
+	if (iap->ia_valid & ATTR_MODE) {
+		iap->ia_mode &= S_IALLUGO;
+		iap->ia_mode |= (inode->i_mode & ~S_IALLUGO);
+	}
+
+	/* Revoke setuid/setgid on chown */
+	if (!S_ISDIR(inode->i_mode) &&
+	    ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) {
+		iap->ia_valid |= ATTR_KILL_PRIV;
+		if (iap->ia_valid & ATTR_MODE) {
+			/* we're setting mode too, just clear the s*id bits */
+			iap->ia_mode &= ~S_ISUID;
+			if (iap->ia_mode & S_IXGRP)
+				iap->ia_mode &= ~S_ISGID;
+		} else {
+			/* set ATTR_KILL_* bits and let VFS handle it */
+			iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
+		}
+	}
+}
+
+static __be32
+nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct iattr *iap)
+{
+	struct inode *inode = d_inode(fhp->fh_dentry);
+	int host_err;
+
+	if (iap->ia_size < inode->i_size) {
+		__be32 err;
+
+		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+				NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
+		if (err)
+			return err;
+	}
+
+	host_err = get_write_access(inode);
+	if (host_err)
+		goto out_nfserrno;
+
+	host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+	if (host_err)
+		goto out_put_write_access;
+	return 0;
+
+out_put_write_access:
+	put_write_access(inode);
+out_nfserrno:
+	return nfserrno(host_err);
+}
+
+/*
+ * Set various file attributes.  After this call fhp needs an fh_put.
+ */
+__be32
+nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+	     int check_guard, time_t guardtime)
+{
+	struct dentry	*dentry;
+	struct inode	*inode;
+	int		accmode = NFSD_MAY_SATTR;
+	umode_t		ftype = 0;
+	__be32		err;
+	int		host_err;
+	bool		get_write_count;
+	int		size_change = 0;
+
+	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
+		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
+	if (iap->ia_valid & ATTR_SIZE)
+		ftype = S_IFREG;
+
+	/* Callers that do fh_verify should do the fh_want_write: */
+	get_write_count = !fhp->fh_dentry;
+
+	/* Get inode */
+	err = fh_verify(rqstp, fhp, ftype, accmode);
+	if (err)
+		goto out;
+	if (get_write_count) {
+		host_err = fh_want_write(fhp);
+		if (host_err)
+			return nfserrno(host_err);
+	}
+
+	dentry = fhp->fh_dentry;
+	inode = d_inode(dentry);
+
+	/* Ignore any mode updates on symlinks */
+	if (S_ISLNK(inode->i_mode))
+		iap->ia_valid &= ~ATTR_MODE;
+
+	if (!iap->ia_valid)
+		goto out;
+
+	nfsd_sanitize_attrs(inode, iap);
+
+	/*
+	 * The size case is special, it changes the file in addition to the
+	 * attributes.
+	 */
+	if (iap->ia_valid & ATTR_SIZE) {
+		err = nfsd_get_write_access(rqstp, fhp, iap);
+		if (err)
+			goto out;
+		size_change = 1;
+
+		/*
+		 * RFC5661, Section 18.30.4:
+		 *   Changing the size of a file with SETATTR indirectly
+		 *   changes the time_modify and change attributes.
+		 *
+		 * (and similar for the older RFCs)
+		 */
+		if (iap->ia_size != i_size_read(inode))
+			iap->ia_valid |= ATTR_MTIME;
+	}
+
+	iap->ia_valid |= ATTR_CTIME;
+
+	if (check_guard && guardtime != inode->i_ctime.tv_sec) {
+		err = nfserr_notsync;
+		goto out_put_write_access;
+	}
+
+	fh_lock(fhp);
+	host_err = notify_change(dentry, iap, NULL);
+	fh_unlock(fhp);
+	err = nfserrno(host_err);
+
+out_put_write_access:
+	if (size_change)
+		put_write_access(inode);
+	if (!err)
+		err = nfserrno(commit_metadata(fhp));
+out:
+	return err;
+}
+
+#if defined(CONFIG_NFSD_V4)
+/*
+ * NFS junction information is stored in an extended attribute.
+ */
+#define NFSD_JUNCTION_XATTR_NAME	XATTR_TRUSTED_PREFIX "junction.nfs"
+
+/**
+ * nfsd4_is_junction - Test if an object could be an NFS junction
+ *
+ * @dentry: object to test
+ *
+ * Returns 1 if "dentry" appears to contain NFS junction information.
+ * Otherwise 0 is returned.
+ */
+int nfsd4_is_junction(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (inode == NULL)
+		return 0;
+	if (inode->i_mode & S_IXUGO)
+		return 0;
+	if (!(inode->i_mode & S_ISVTX))
+		return 0;
+	if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
+		return 0;
+	return 1;
+}
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct xdr_netobj *label)
+{
+	__be32 error;
+	int host_error;
+	struct dentry *dentry;
+
+	error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+	if (error)
+		return error;
+
+	dentry = fhp->fh_dentry;
+
+	mutex_lock(&d_inode(dentry)->i_mutex);
+	host_error = security_inode_setsecctx(dentry, label->data, label->len);
+	mutex_unlock(&d_inode(dentry)->i_mutex);
+	return nfserrno(host_error);
+}
+#else
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct xdr_netobj *label)
+{
+	return nfserr_notsupp;
+}
+#endif
+
+__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
+			   struct file *file, loff_t offset, loff_t len,
+			   int flags)
+{
+	__be32 err;
+	int error;
+
+	if (!S_ISREG(file_inode(file)->i_mode))
+		return nfserr_inval;
+
+	err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE);
+	if (err)
+		return err;
+
+	error = vfs_fallocate(file, flags, offset, len);
+	if (!error)
+		error = commit_metadata(fhp);
+
+	return nfserrno(error);
+}
+#endif /* defined(CONFIG_NFSD_V4) */
+
+#ifdef CONFIG_NFSD_V3
+/*
+ * Check server access rights to a file system object
+ */
+struct accessmap {
+	u32		access;
+	int		how;
+};
+static struct accessmap	nfs3_regaccess[] = {
+    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
+    {	NFS3_ACCESS_EXECUTE,	NFSD_MAY_EXEC			},
+    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_WRITE|NFSD_MAY_TRUNC	},
+    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_WRITE			},
+
+    {	0,			0				}
+};
+
+static struct accessmap	nfs3_diraccess[] = {
+    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
+    {	NFS3_ACCESS_LOOKUP,	NFSD_MAY_EXEC			},
+    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC},
+    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_EXEC|NFSD_MAY_WRITE	},
+    {	NFS3_ACCESS_DELETE,	NFSD_MAY_REMOVE			},
+
+    {	0,			0				}
+};
+
+static struct accessmap	nfs3_anyaccess[] = {
+	/* Some clients - Solaris 2.6 at least, make an access call
+	 * to the server to check for access for things like /dev/null
+	 * (which really, the server doesn't care about).  So
+	 * We provide simple access checking for them, looking
+	 * mainly at mode bits, and we make sure to ignore read-only
+	 * filesystem checks
+	 */
+    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
+    {	NFS3_ACCESS_EXECUTE,	NFSD_MAY_EXEC			},
+    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS	},
+    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS	},
+
+    {	0,			0				}
+};
+
+__be32
+nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
+{
+	struct accessmap	*map;
+	struct svc_export	*export;
+	struct dentry		*dentry;
+	u32			query, result = 0, sresult = 0;
+	__be32			error;
+
+	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+	if (error)
+		goto out;
+
+	export = fhp->fh_export;
+	dentry = fhp->fh_dentry;
+
+	if (d_is_reg(dentry))
+		map = nfs3_regaccess;
+	else if (d_is_dir(dentry))
+		map = nfs3_diraccess;
+	else
+		map = nfs3_anyaccess;
+
+
+	query = *access;
+	for  (; map->access; map++) {
+		if (map->access & query) {
+			__be32 err2;
+
+			sresult |= map->access;
+
+			err2 = nfsd_permission(rqstp, export, dentry, map->how);
+			switch (err2) {
+			case nfs_ok:
+				result |= map->access;
+				break;
+				
+			/* the following error codes just mean the access was not allowed,
+			 * rather than an error occurred */
+			case nfserr_rofs:
+			case nfserr_acces:
+			case nfserr_perm:
+				/* simply don't "or" in the access bit. */
+				break;
+			default:
+				error = err2;
+				goto out;
+			}
+		}
+	}
+	*access = result;
+	if (supported)
+		*supported = sresult;
+
+ out:
+	return error;
+}
+#endif /* CONFIG_NFSD_V3 */
+
+static int nfsd_open_break_lease(struct inode *inode, int access)
+{
+	unsigned int mode;
+
+	if (access & NFSD_MAY_NOT_BREAK_LEASE)
+		return 0;
+	mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY;
+	return break_lease(inode, mode | O_NONBLOCK);
+}
+
+/*
+ * Open an existing file or directory.
+ * The may_flags argument indicates the type of open (read/write/lock)
+ * and additional flags.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32
+nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+			int may_flags, struct file **filp)
+{
+	struct path	path;
+	struct inode	*inode;
+	struct file	*file;
+	int		flags = O_RDONLY|O_LARGEFILE;
+	__be32		err;
+	int		host_err = 0;
+
+	validate_process_creds();
+
+	/*
+	 * If we get here, then the client has already done an "open",
+	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
+	 * in case a chmod has now revoked permission.
+	 *
+	 * Arguably we should also allow the owner override for
+	 * directories, but we never have and it doesn't seem to have
+	 * caused anyone a problem.  If we were to change this, note
+	 * also that our filldir callbacks would need a variant of
+	 * lookup_one_len that doesn't check permissions.
+	 */
+	if (type == S_IFREG)
+		may_flags |= NFSD_MAY_OWNER_OVERRIDE;
+	err = fh_verify(rqstp, fhp, type, may_flags);
+	if (err)
+		goto out;
+
+	path.mnt = fhp->fh_export->ex_path.mnt;
+	path.dentry = fhp->fh_dentry;
+	inode = d_inode(path.dentry);
+
+	/* Disallow write access to files with the append-only bit set
+	 * or any access when mandatory locking enabled
+	 */
+	err = nfserr_perm;
+	if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
+		goto out;
+	/*
+	 * We must ignore files (but only files) which might have mandatory
+	 * locks on them because there is no way to know if the accesser has
+	 * the lock.
+	 */
+	if (S_ISREG((inode)->i_mode) && mandatory_lock(inode))
+		goto out;
+
+	if (!inode->i_fop)
+		goto out;
+
+	host_err = nfsd_open_break_lease(inode, may_flags);
+	if (host_err) /* NOMEM or WOULDBLOCK */
+		goto out_nfserr;
+
+	if (may_flags & NFSD_MAY_WRITE) {
+		if (may_flags & NFSD_MAY_READ)
+			flags = O_RDWR|O_LARGEFILE;
+		else
+			flags = O_WRONLY|O_LARGEFILE;
+	}
+
+	file = dentry_open(&path, flags, current_cred());
+	if (IS_ERR(file)) {
+		host_err = PTR_ERR(file);
+		goto out_nfserr;
+	}
+
+	host_err = ima_file_check(file, may_flags, 0);
+	if (host_err) {
+		nfsd_close(file);
+		goto out_nfserr;
+	}
+
+	if (may_flags & NFSD_MAY_64BIT_COOKIE)
+		file->f_mode |= FMODE_64BITHASH;
+	else
+		file->f_mode |= FMODE_32BITHASH;
+
+	*filp = file;
+out_nfserr:
+	err = nfserrno(host_err);
+out:
+	validate_process_creds();
+	return err;
+}
+
+/*
+ * Close a file.
+ */
+void
+nfsd_close(struct file *filp)
+{
+	fput(filp);
+}
+
+/*
+ * Obtain the readahead parameters for the file
+ * specified by (dev, ino).
+ */
+
+static inline struct raparms *
+nfsd_get_raparms(dev_t dev, ino_t ino)
+{
+	struct raparms	*ra, **rap, **frap = NULL;
+	int depth = 0;
+	unsigned int hash;
+	struct raparm_hbucket *rab;
+
+	hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
+	rab = &raparm_hash[hash];
+
+	spin_lock(&rab->pb_lock);
+	for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
+		if (ra->p_ino == ino && ra->p_dev == dev)
+			goto found;
+		depth++;
+		if (ra->p_count == 0)
+			frap = rap;
+	}
+	depth = nfsdstats.ra_size;
+	if (!frap) {	
+		spin_unlock(&rab->pb_lock);
+		return NULL;
+	}
+	rap = frap;
+	ra = *frap;
+	ra->p_dev = dev;
+	ra->p_ino = ino;
+	ra->p_set = 0;
+	ra->p_hindex = hash;
+found:
+	if (rap != &rab->pb_head) {
+		*rap = ra->p_next;
+		ra->p_next   = rab->pb_head;
+		rab->pb_head = ra;
+	}
+	ra->p_count++;
+	nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
+	spin_unlock(&rab->pb_lock);
+	return ra;
+}
+
+/*
+ * Grab and keep cached pages associated with a file in the svc_rqst
+ * so that they can be passed to the network sendmsg/sendpage routines
+ * directly. They will be released after the sending has completed.
+ */
+static int
+nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+		  struct splice_desc *sd)
+{
+	struct svc_rqst *rqstp = sd->u.data;
+	struct page **pp = rqstp->rq_next_page;
+	struct page *page = buf->page;
+	size_t size;
+
+	size = sd->len;
+
+	if (rqstp->rq_res.page_len == 0) {
+		get_page(page);
+		put_page(*rqstp->rq_next_page);
+		*(rqstp->rq_next_page++) = page;
+		rqstp->rq_res.page_base = buf->offset;
+		rqstp->rq_res.page_len = size;
+	} else if (page != pp[-1]) {
+		get_page(page);
+		if (*rqstp->rq_next_page)
+			put_page(*rqstp->rq_next_page);
+		*(rqstp->rq_next_page++) = page;
+		rqstp->rq_res.page_len += size;
+	} else
+		rqstp->rq_res.page_len += size;
+
+	return size;
+}
+
+static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
+				    struct splice_desc *sd)
+{
+	return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
+}
+
+static __be32
+nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
+{
+	if (host_err >= 0) {
+		nfsdstats.io_read += host_err;
+		*count = host_err;
+		fsnotify_access(file);
+		return 0;
+	} else 
+		return nfserrno(host_err);
+}
+
+__be32 nfsd_splice_read(struct svc_rqst *rqstp,
+		     struct file *file, loff_t offset, unsigned long *count)
+{
+	struct splice_desc sd = {
+		.len		= 0,
+		.total_len	= *count,
+		.pos		= offset,
+		.u.data		= rqstp,
+	};
+	int host_err;
+
+	rqstp->rq_next_page = rqstp->rq_respages + 1;
+	host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
+	return nfsd_finish_read(file, count, host_err);
+}
+
+__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
+		unsigned long *count)
+{
+	mm_segment_t oldfs;
+	int host_err;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+	set_fs(oldfs);
+	return nfsd_finish_read(file, count, host_err);
+}
+
+static __be32
+nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file,
+	      loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
+		return nfsd_splice_read(rqstp, file, offset, count);
+	else
+		return nfsd_readv(file, offset, vec, vlen, count);
+}
+
+/*
+ * Gathered writes: If another process is currently writing to the file,
+ * there's a high chance this is another nfsd (triggered by a bulk write
+ * from a client's biod). Rather than syncing the file with each write
+ * request, we sleep for 10 msec.
+ *
+ * I don't know if this roughly approximates C. Juszak's idea of
+ * gathered writes, but it's a nice and simple solution (IMHO), and it
+ * seems to work:-)
+ *
+ * Note: we do this only in the NFSv2 case, since v3 and higher have a
+ * better tool (separate unstable writes and commits) for solving this
+ * problem.
+ */
+static int wait_for_concurrent_writes(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	static ino_t last_ino;
+	static dev_t last_dev;
+	int err = 0;
+
+	if (atomic_read(&inode->i_writecount) > 1
+	    || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
+		dprintk("nfsd: write defer %d\n", task_pid_nr(current));
+		msleep(10);
+		dprintk("nfsd: write resume %d\n", task_pid_nr(current));
+	}
+
+	if (inode->i_state & I_DIRTY) {
+		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
+		err = vfs_fsync(file, 0);
+	}
+	last_ino = inode->i_ino;
+	last_dev = inode->i_sb->s_dev;
+	return err;
+}
+
+static __be32
+nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
+				loff_t offset, struct kvec *vec, int vlen,
+				unsigned long *cnt, int *stablep)
+{
+	struct svc_export	*exp;
+	struct inode		*inode;
+	mm_segment_t		oldfs;
+	__be32			err = 0;
+	int			host_err;
+	int			stable = *stablep;
+	int			use_wgather;
+	loff_t			pos = offset;
+	loff_t			end = LLONG_MAX;
+	unsigned int		pflags = current->flags;
+
+	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
+		/*
+		 * We want less throttling in balance_dirty_pages()
+		 * and shrink_inactive_list() so that nfs to
+		 * localhost doesn't cause nfsd to lock up due to all
+		 * the client's dirty pages or its congested queue.
+		 */
+		current->flags |= PF_LESS_THROTTLE;
+
+	inode = file_inode(file);
+	exp   = fhp->fh_export;
+
+	use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
+
+	if (!EX_ISSYNC(exp))
+		stable = 0;
+
+	/* Write the data. */
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos);
+	set_fs(oldfs);
+	if (host_err < 0)
+		goto out_nfserr;
+	*cnt = host_err;
+	nfsdstats.io_write += host_err;
+	fsnotify_modify(file);
+
+	if (stable) {
+		if (use_wgather) {
+			host_err = wait_for_concurrent_writes(file);
+		} else {
+			if (*cnt)
+				end = offset + *cnt - 1;
+			host_err = vfs_fsync_range(file, offset, end, 0);
+		}
+	}
+
+out_nfserr:
+	dprintk("nfsd: write complete host_err=%d\n", host_err);
+	if (host_err >= 0)
+		err = 0;
+	else
+		err = nfserrno(host_err);
+	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
+		tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+	return err;
+}
+
+__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct file **file, struct raparms **ra)
+{
+	struct inode *inode;
+	__be32 err;
+
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file);
+	if (err)
+		return err;
+
+	inode = file_inode(*file);
+
+	/* Get readahead parameters */
+	*ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+
+	if (*ra && (*ra)->p_set)
+		(*file)->f_ra = (*ra)->p_ra;
+	return nfs_ok;
+}
+
+void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra)
+{
+	/* Write back readahead params */
+	if (ra) {
+		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+		spin_lock(&rab->pb_lock);
+		ra->p_ra = file->f_ra;
+		ra->p_set = 1;
+		ra->p_count--;
+		spin_unlock(&rab->pb_lock);
+	}
+	nfsd_close(file);
+}
+
+/*
+ * Read data from a file. count must contain the requested read count
+ * on entry. On return, *count contains the number of bytes actually read.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+	loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+	struct file *file;
+	struct raparms	*ra;
+	__be32 err;
+
+	err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra);
+	if (err)
+		return err;
+
+	err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+
+	nfsd_put_tmp_read_open(file, ra);
+
+	return err;
+}
+
+/*
+ * Write data to a file.
+ * The stable flag requests synchronous writes.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32
+nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
+		loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
+		int *stablep)
+{
+	__be32			err = 0;
+
+	if (file) {
+		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+				NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
+		if (err)
+			goto out;
+		err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
+				stablep);
+	} else {
+		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+		if (err)
+			goto out;
+
+		if (cnt)
+			err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
+					     cnt, stablep);
+		nfsd_close(file);
+	}
+out:
+	return err;
+}
+
+#ifdef CONFIG_NFSD_V3
+/*
+ * Commit all pending writes to stable storage.
+ *
+ * Note: we only guarantee that data that lies within the range specified
+ * by the 'offset' and 'count' parameters will be synced.
+ *
+ * Unfortunately we cannot lock the file to make sure we return full WCC
+ * data to the client, as locking happens lower down in the filesystem.
+ */
+__be32
+nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
+               loff_t offset, unsigned long count)
+{
+	struct file	*file;
+	loff_t		end = LLONG_MAX;
+	__be32		err = nfserr_inval;
+
+	if (offset < 0)
+		goto out;
+	if (count != 0) {
+		end = offset + (loff_t)count - 1;
+		if (end < offset)
+			goto out;
+	}
+
+	err = nfsd_open(rqstp, fhp, S_IFREG,
+			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
+	if (err)
+		goto out;
+	if (EX_ISSYNC(fhp->fh_export)) {
+		int err2 = vfs_fsync_range(file, offset, end, 0);
+
+		if (err2 != -EINVAL)
+			err = nfserrno(err2);
+		else
+			err = nfserr_notsupp;
+	}
+
+	nfsd_close(file);
+out:
+	return err;
+}
+#endif /* CONFIG_NFSD_V3 */
+
+static __be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
+			struct iattr *iap)
+{
+	/*
+	 * Mode has already been set earlier in create:
+	 */
+	iap->ia_valid &= ~ATTR_MODE;
+	/*
+	 * Setting uid/gid works only for root.  Irix appears to
+	 * send along the gid on create when it tries to implement
+	 * setgid directories via NFS:
+	 */
+	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
+		iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
+	if (iap->ia_valid)
+		return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+	/* Callers expect file metadata to be committed here */
+	return nfserrno(commit_metadata(resfhp));
+}
+
+/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
+ * setting size to 0 may fail for some specific file systems by the permission
+ * checking which requires WRITE permission but the mode is 000.
+ * we ignore the resizing(to 0) on the just new created file, since the size is
+ * 0 after file created.
+ *
+ * call this only after vfs_create() is called.
+ * */
+static void
+nfsd_check_ignore_resizing(struct iattr *iap)
+{
+	if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
+		iap->ia_valid &= ~ATTR_SIZE;
+}
+
+/*
+ * Create a file (regular, directory, device, fifo); UNIX sockets 
+ * not yet implemented.
+ * If the response fh has been verified, the parent directory should
+ * already be locked. Note that the parent directory is left locked.
+ *
+ * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
+ */
+__be32
+nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		char *fname, int flen, struct iattr *iap,
+		int type, dev_t rdev, struct svc_fh *resfhp)
+{
+	struct dentry	*dentry, *dchild = NULL;
+	struct inode	*dirp;
+	__be32		err;
+	__be32		err2;
+	int		host_err;
+
+	err = nfserr_perm;
+	if (!flen)
+		goto out;
+	err = nfserr_exist;
+	if (isdotent(fname, flen))
+		goto out;
+
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+	if (err)
+		goto out;
+
+	dentry = fhp->fh_dentry;
+	dirp = d_inode(dentry);
+
+	err = nfserr_notdir;
+	if (!dirp->i_op->lookup)
+		goto out;
+	/*
+	 * Check whether the response file handle has been verified yet.
+	 * If it has, the parent directory should already be locked.
+	 */
+	if (!resfhp->fh_dentry) {
+		host_err = fh_want_write(fhp);
+		if (host_err)
+			goto out_nfserr;
+
+		/* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
+		fh_lock_nested(fhp, I_MUTEX_PARENT);
+		dchild = lookup_one_len(fname, dentry, flen);
+		host_err = PTR_ERR(dchild);
+		if (IS_ERR(dchild))
+			goto out_nfserr;
+		err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+		if (err)
+			goto out;
+	} else {
+		/* called from nfsd_proc_create */
+		dchild = dget(resfhp->fh_dentry);
+		if (!fhp->fh_locked) {
+			/* not actually possible */
+			printk(KERN_ERR
+				"nfsd_create: parent %pd2 not locked!\n",
+				dentry);
+			err = nfserr_io;
+			goto out;
+		}
+	}
+	/*
+	 * Make sure the child dentry is still negative ...
+	 */
+	err = nfserr_exist;
+	if (d_really_is_positive(dchild)) {
+		dprintk("nfsd_create: dentry %pd/%pd not negative!\n",
+			dentry, dchild);
+		goto out; 
+	}
+
+	if (!(iap->ia_valid & ATTR_MODE))
+		iap->ia_mode = 0;
+	iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
+
+	err = nfserr_inval;
+	if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) {
+		printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
+		       type);
+		goto out;
+	}
+
+	/*
+	 * Get the dir op function pointer.
+	 */
+	err = 0;
+	host_err = 0;
+	switch (type) {
+	case S_IFREG:
+		host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+		if (!host_err)
+			nfsd_check_ignore_resizing(iap);
+		break;
+	case S_IFDIR:
+		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+		break;
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+		break;
+	}
+	if (host_err < 0)
+		goto out_nfserr;
+
+	err = nfsd_create_setattr(rqstp, resfhp, iap);
+
+	/*
+	 * nfsd_create_setattr already committed the child.  Transactional
+	 * filesystems had a chance to commit changes for both parent and
+	 * child * simultaneously making the following commit_metadata a
+	 * noop.
+	 */
+	err2 = nfserrno(commit_metadata(fhp));
+	if (err2)
+		err = err2;
+	/*
+	 * Update the file handle to get the new inode info.
+	 */
+	if (!err)
+		err = fh_update(resfhp);
+out:
+	if (dchild && !IS_ERR(dchild))
+		dput(dchild);
+	return err;
+
+out_nfserr:
+	err = nfserrno(host_err);
+	goto out;
+}
+
+#ifdef CONFIG_NFSD_V3
+
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+	return createmode == NFS3_CREATE_EXCLUSIVE
+	       || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
+/*
+ * NFSv3 and NFSv4 version of nfsd_create
+ */
+__be32
+do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		char *fname, int flen, struct iattr *iap,
+		struct svc_fh *resfhp, int createmode, u32 *verifier,
+	        bool *truncp, bool *created)
+{
+	struct dentry	*dentry, *dchild = NULL;
+	struct inode	*dirp;
+	__be32		err;
+	int		host_err;
+	__u32		v_mtime=0, v_atime=0;
+
+	err = nfserr_perm;
+	if (!flen)
+		goto out;
+	err = nfserr_exist;
+	if (isdotent(fname, flen))
+		goto out;
+	if (!(iap->ia_valid & ATTR_MODE))
+		iap->ia_mode = 0;
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+	if (err)
+		goto out;
+
+	dentry = fhp->fh_dentry;
+	dirp = d_inode(dentry);
+
+	/* Get all the sanity checks out of the way before
+	 * we lock the parent. */
+	err = nfserr_notdir;
+	if (!dirp->i_op->lookup)
+		goto out;
+
+	host_err = fh_want_write(fhp);
+	if (host_err)
+		goto out_nfserr;
+
+	fh_lock_nested(fhp, I_MUTEX_PARENT);
+
+	/*
+	 * Compose the response file handle.
+	 */
+	dchild = lookup_one_len(fname, dentry, flen);
+	host_err = PTR_ERR(dchild);
+	if (IS_ERR(dchild))
+		goto out_nfserr;
+
+	/* If file doesn't exist, check for permissions to create one */
+	if (d_really_is_negative(dchild)) {
+		err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+		if (err)
+			goto out;
+	}
+
+	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+	if (err)
+		goto out;
+
+	if (nfsd_create_is_exclusive(createmode)) {
+		/* solaris7 gets confused (bugid 4218508) if these have
+		 * the high bit set, so just clear the high bits. If this is
+		 * ever changed to use different attrs for storing the
+		 * verifier, then do_open_lookup() will also need to be fixed
+		 * accordingly.
+		 */
+		v_mtime = verifier[0]&0x7fffffff;
+		v_atime = verifier[1]&0x7fffffff;
+	}
+	
+	if (d_really_is_positive(dchild)) {
+		err = 0;
+
+		switch (createmode) {
+		case NFS3_CREATE_UNCHECKED:
+			if (! d_is_reg(dchild))
+				goto out;
+			else if (truncp) {
+				/* in nfsv4, we need to treat this case a little
+				 * differently.  we don't want to truncate the
+				 * file now; this would be wrong if the OPEN
+				 * fails for some other reason.  furthermore,
+				 * if the size is nonzero, we should ignore it
+				 * according to spec!
+				 */
+				*truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
+			}
+			else {
+				iap->ia_valid &= ATTR_SIZE;
+				goto set_attr;
+			}
+			break;
+		case NFS3_CREATE_EXCLUSIVE:
+			if (   d_inode(dchild)->i_mtime.tv_sec == v_mtime
+			    && d_inode(dchild)->i_atime.tv_sec == v_atime
+			    && d_inode(dchild)->i_size  == 0 ) {
+				if (created)
+					*created = 1;
+				break;
+			}
+		case NFS4_CREATE_EXCLUSIVE4_1:
+			if (   d_inode(dchild)->i_mtime.tv_sec == v_mtime
+			    && d_inode(dchild)->i_atime.tv_sec == v_atime
+			    && d_inode(dchild)->i_size  == 0 ) {
+				if (created)
+					*created = 1;
+				goto set_attr;
+			}
+			 /* fallthru */
+		case NFS3_CREATE_GUARDED:
+			err = nfserr_exist;
+		}
+		fh_drop_write(fhp);
+		goto out;
+	}
+
+	host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+	if (host_err < 0) {
+		fh_drop_write(fhp);
+		goto out_nfserr;
+	}
+	if (created)
+		*created = 1;
+
+	nfsd_check_ignore_resizing(iap);
+
+	if (nfsd_create_is_exclusive(createmode)) {
+		/* Cram the verifier into atime/mtime */
+		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
+			| ATTR_MTIME_SET|ATTR_ATIME_SET;
+		/* XXX someone who knows this better please fix it for nsec */ 
+		iap->ia_mtime.tv_sec = v_mtime;
+		iap->ia_atime.tv_sec = v_atime;
+		iap->ia_mtime.tv_nsec = 0;
+		iap->ia_atime.tv_nsec = 0;
+	}
+
+ set_attr:
+	err = nfsd_create_setattr(rqstp, resfhp, iap);
+
+	/*
+	 * nfsd_create_setattr already committed the child
+	 * (and possibly also the parent).
+	 */
+	if (!err)
+		err = nfserrno(commit_metadata(fhp));
+
+	/*
+	 * Update the filehandle to get the new inode info.
+	 */
+	if (!err)
+		err = fh_update(resfhp);
+
+ out:
+	fh_unlock(fhp);
+	if (dchild && !IS_ERR(dchild))
+		dput(dchild);
+	fh_drop_write(fhp);
+ 	return err;
+ 
+ out_nfserr:
+	err = nfserrno(host_err);
+	goto out;
+}
+#endif /* CONFIG_NFSD_V3 */
+
+/*
+ * Read a symlink. On entry, *lenp must contain the maximum path length that
+ * fits into the buffer. On return, it contains the true length.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32
+nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
+{
+	struct inode	*inode;
+	mm_segment_t	oldfs;
+	__be32		err;
+	int		host_err;
+	struct path path;
+
+	err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
+	if (err)
+		goto out;
+
+	path.mnt = fhp->fh_export->ex_path.mnt;
+	path.dentry = fhp->fh_dentry;
+	inode = d_inode(path.dentry);
+
+	err = nfserr_inval;
+	if (!inode->i_op->readlink)
+		goto out;
+
+	touch_atime(&path);
+	/* N.B. Why does this call need a get_fs()??
+	 * Remove the set_fs and watch the fireworks:-) --okir
+	 */
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp);
+	set_fs(oldfs);
+
+	if (host_err < 0)
+		goto out_nfserr;
+	*lenp = host_err;
+	err = 0;
+out:
+	return err;
+
+out_nfserr:
+	err = nfserrno(host_err);
+	goto out;
+}
+
+/*
+ * Create a symlink and look up its inode
+ * N.B. After this call _both_ fhp and resfhp need an fh_put
+ */
+__be32
+nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
+				char *fname, int flen,
+				char *path,
+				struct svc_fh *resfhp)
+{
+	struct dentry	*dentry, *dnew;
+	__be32		err, cerr;
+	int		host_err;
+
+	err = nfserr_noent;
+	if (!flen || path[0] == '\0')
+		goto out;
+	err = nfserr_exist;
+	if (isdotent(fname, flen))
+		goto out;
+
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+	if (err)
+		goto out;
+
+	host_err = fh_want_write(fhp);
+	if (host_err)
+		goto out_nfserr;
+
+	fh_lock(fhp);
+	dentry = fhp->fh_dentry;
+	dnew = lookup_one_len(fname, dentry, flen);
+	host_err = PTR_ERR(dnew);
+	if (IS_ERR(dnew))
+		goto out_nfserr;
+
+	host_err = vfs_symlink(d_inode(dentry), dnew, path);
+	err = nfserrno(host_err);
+	if (!err)
+		err = nfserrno(commit_metadata(fhp));
+	fh_unlock(fhp);
+
+	fh_drop_write(fhp);
+
+	cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
+	dput(dnew);
+	if (err==0) err = cerr;
+out:
+	return err;
+
+out_nfserr:
+	err = nfserrno(host_err);
+	goto out;
+}
+
+/*
+ * Create a hardlink
+ * N.B. After this call _both_ ffhp and tfhp need an fh_put
+ */
+__be32
+nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
+				char *name, int len, struct svc_fh *tfhp)
+{
+	struct dentry	*ddir, *dnew, *dold;
+	struct inode	*dirp;
+	__be32		err;
+	int		host_err;
+
+	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
+	if (err)
+		goto out;
+	err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP);
+	if (err)
+		goto out;
+	err = nfserr_isdir;
+	if (d_is_dir(tfhp->fh_dentry))
+		goto out;
+	err = nfserr_perm;
+	if (!len)
+		goto out;
+	err = nfserr_exist;
+	if (isdotent(name, len))
+		goto out;
+
+	host_err = fh_want_write(tfhp);
+	if (host_err) {
+		err = nfserrno(host_err);
+		goto out;
+	}
+
+	fh_lock_nested(ffhp, I_MUTEX_PARENT);
+	ddir = ffhp->fh_dentry;
+	dirp = d_inode(ddir);
+
+	dnew = lookup_one_len(name, ddir, len);
+	host_err = PTR_ERR(dnew);
+	if (IS_ERR(dnew))
+		goto out_nfserr;
+
+	dold = tfhp->fh_dentry;
+
+	err = nfserr_noent;
+	if (d_really_is_negative(dold))
+		goto out_dput;
+	host_err = vfs_link(dold, dirp, dnew, NULL);
+	if (!host_err) {
+		err = nfserrno(commit_metadata(ffhp));
+		if (!err)
+			err = nfserrno(commit_metadata(tfhp));
+	} else {
+		if (host_err == -EXDEV && rqstp->rq_vers == 2)
+			err = nfserr_acces;
+		else
+			err = nfserrno(host_err);
+	}
+out_dput:
+	dput(dnew);
+out_unlock:
+	fh_unlock(ffhp);
+	fh_drop_write(tfhp);
+out:
+	return err;
+
+out_nfserr:
+	err = nfserrno(host_err);
+	goto out_unlock;
+}
+
+/*
+ * Rename a file
+ * N.B. After this call _both_ ffhp and tfhp need an fh_put
+ */
+__be32
+nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+			    struct svc_fh *tfhp, char *tname, int tlen)
+{
+	struct dentry	*fdentry, *tdentry, *odentry, *ndentry, *trap;
+	struct inode	*fdir, *tdir;
+	__be32		err;
+	int		host_err;
+
+	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
+	if (err)
+		goto out;
+	err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE);
+	if (err)
+		goto out;
+
+	fdentry = ffhp->fh_dentry;
+	fdir = d_inode(fdentry);
+
+	tdentry = tfhp->fh_dentry;
+	tdir = d_inode(tdentry);
+
+	err = nfserr_perm;
+	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
+		goto out;
+
+	host_err = fh_want_write(ffhp);
+	if (host_err) {
+		err = nfserrno(host_err);
+		goto out;
+	}
+
+	/* cannot use fh_lock as we need deadlock protective ordering
+	 * so do it by hand */
+	trap = lock_rename(tdentry, fdentry);
+	ffhp->fh_locked = tfhp->fh_locked = 1;
+	fill_pre_wcc(ffhp);
+	fill_pre_wcc(tfhp);
+
+	odentry = lookup_one_len(fname, fdentry, flen);
+	host_err = PTR_ERR(odentry);
+	if (IS_ERR(odentry))
+		goto out_nfserr;
+
+	host_err = -ENOENT;
+	if (d_really_is_negative(odentry))
+		goto out_dput_old;
+	host_err = -EINVAL;
+	if (odentry == trap)
+		goto out_dput_old;
+
+	ndentry = lookup_one_len(tname, tdentry, tlen);
+	host_err = PTR_ERR(ndentry);
+	if (IS_ERR(ndentry))
+		goto out_dput_old;
+	host_err = -ENOTEMPTY;
+	if (ndentry == trap)
+		goto out_dput_new;
+
+	host_err = -EXDEV;
+	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+		goto out_dput_new;
+	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
+		goto out_dput_new;
+
+	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+	if (!host_err) {
+		host_err = commit_metadata(tfhp);
+		if (!host_err)
+			host_err = commit_metadata(ffhp);
+	}
+ out_dput_new:
+	dput(ndentry);
+ out_dput_old:
+	dput(odentry);
+ out_nfserr:
+	err = nfserrno(host_err);
+	/*
+	 * We cannot rely on fh_unlock on the two filehandles,
+	 * as that would do the wrong thing if the two directories
+	 * were the same, so again we do it by hand.
+	 */
+	fill_post_wcc(ffhp);
+	fill_post_wcc(tfhp);
+	unlock_rename(tdentry, fdentry);
+	ffhp->fh_locked = tfhp->fh_locked = 0;
+	fh_drop_write(ffhp);
+
+out:
+	return err;
+}
+
+/*
+ * Unlink a file or directory
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32
+nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+				char *fname, int flen)
+{
+	struct dentry	*dentry, *rdentry;
+	struct inode	*dirp;
+	__be32		err;
+	int		host_err;
+
+	err = nfserr_acces;
+	if (!flen || isdotent(fname, flen))
+		goto out;
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE);
+	if (err)
+		goto out;
+
+	host_err = fh_want_write(fhp);
+	if (host_err)
+		goto out_nfserr;
+
+	fh_lock_nested(fhp, I_MUTEX_PARENT);
+	dentry = fhp->fh_dentry;
+	dirp = d_inode(dentry);
+
+	rdentry = lookup_one_len(fname, dentry, flen);
+	host_err = PTR_ERR(rdentry);
+	if (IS_ERR(rdentry))
+		goto out_nfserr;
+
+	if (d_really_is_negative(rdentry)) {
+		dput(rdentry);
+		err = nfserr_noent;
+		goto out;
+	}
+
+	if (!type)
+		type = d_inode(rdentry)->i_mode & S_IFMT;
+
+	if (type != S_IFDIR)
+		host_err = vfs_unlink(dirp, rdentry, NULL);
+	else
+		host_err = vfs_rmdir(dirp, rdentry);
+	if (!host_err)
+		host_err = commit_metadata(fhp);
+	dput(rdentry);
+
+out_nfserr:
+	err = nfserrno(host_err);
+out:
+	return err;
+}
+
+/*
+ * We do this buffering because we must not call back into the file
+ * system's ->lookup() method from the filldir callback. That may well
+ * deadlock a number of file systems.
+ *
+ * This is based heavily on the implementation of same in XFS.
+ */
+struct buffered_dirent {
+	u64		ino;
+	loff_t		offset;
+	int		namlen;
+	unsigned int	d_type;
+	char		name[];
+};
+
+struct readdir_data {
+	struct dir_context ctx;
+	char		*dirent;
+	size_t		used;
+	int		full;
+};
+
+static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
+				 int namlen, loff_t offset, u64 ino,
+				 unsigned int d_type)
+{
+	struct readdir_data *buf =
+		container_of(ctx, struct readdir_data, ctx);
+	struct buffered_dirent *de = (void *)(buf->dirent + buf->used);
+	unsigned int reclen;
+
+	reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64));
+	if (buf->used + reclen > PAGE_SIZE) {
+		buf->full = 1;
+		return -EINVAL;
+	}
+
+	de->namlen = namlen;
+	de->offset = offset;
+	de->ino = ino;
+	de->d_type = d_type;
+	memcpy(de->name, name, namlen);
+	buf->used += reclen;
+
+	return 0;
+}
+
+static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
+				    struct readdir_cd *cdp, loff_t *offsetp)
+{
+	struct buffered_dirent *de;
+	int host_err;
+	int size;
+	loff_t offset;
+	struct readdir_data buf = {
+		.ctx.actor = nfsd_buffered_filldir,
+		.dirent = (void *)__get_free_page(GFP_KERNEL)
+	};
+
+	if (!buf.dirent)
+		return nfserrno(-ENOMEM);
+
+	offset = *offsetp;
+
+	while (1) {
+		struct inode *dir_inode = file_inode(file);
+		unsigned int reclen;
+
+		cdp->err = nfserr_eof; /* will be cleared on successful read */
+		buf.used = 0;
+		buf.full = 0;
+
+		host_err = iterate_dir(file, &buf.ctx);
+		if (buf.full)
+			host_err = 0;
+
+		if (host_err < 0)
+			break;
+
+		size = buf.used;
+
+		if (!size)
+			break;
+
+		/*
+		 * Various filldir functions may end up calling back into
+		 * lookup_one_len() and the file system's ->lookup() method.
+		 * These expect i_mutex to be held, as it would within readdir.
+		 */
+		host_err = mutex_lock_killable(&dir_inode->i_mutex);
+		if (host_err)
+			break;
+
+		de = (struct buffered_dirent *)buf.dirent;
+		while (size > 0) {
+			offset = de->offset;
+
+			if (func(cdp, de->name, de->namlen, de->offset,
+				 de->ino, de->d_type))
+				break;
+
+			if (cdp->err != nfs_ok)
+				break;
+
+			reclen = ALIGN(sizeof(*de) + de->namlen,
+				       sizeof(u64));
+			size -= reclen;
+			de = (struct buffered_dirent *)((char *)de + reclen);
+		}
+		mutex_unlock(&dir_inode->i_mutex);
+		if (size > 0) /* We bailed out early */
+			break;
+
+		offset = vfs_llseek(file, 0, SEEK_CUR);
+	}
+
+	free_page((unsigned long)(buf.dirent));
+
+	if (host_err)
+		return nfserrno(host_err);
+
+	*offsetp = offset;
+	return cdp->err;
+}
+
+/*
+ * Read entries from a directory.
+ * The  NFSv3/4 verifier we ignore for now.
+ */
+__be32
+nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 
+	     struct readdir_cd *cdp, nfsd_filldir_t func)
+{
+	__be32		err;
+	struct file	*file;
+	loff_t		offset = *offsetp;
+	int             may_flags = NFSD_MAY_READ;
+
+	/* NFSv2 only supports 32 bit cookies */
+	if (rqstp->rq_vers > 2)
+		may_flags |= NFSD_MAY_64BIT_COOKIE;
+
+	err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file);
+	if (err)
+		goto out;
+
+	offset = vfs_llseek(file, offset, SEEK_SET);
+	if (offset < 0) {
+		err = nfserrno((int)offset);
+		goto out_close;
+	}
+
+	err = nfsd_buffered_readdir(file, func, cdp, offsetp);
+
+	if (err == nfserr_eof || err == nfserr_toosmall)
+		err = nfs_ok; /* can still be found in ->err */
+out_close:
+	nfsd_close(file);
+out:
+	return err;
+}
+
+/*
+ * Get file system stats
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32
+nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
+{
+	__be32 err;
+
+	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
+	if (!err) {
+		struct path path = {
+			.mnt	= fhp->fh_export->ex_path.mnt,
+			.dentry	= fhp->fh_dentry,
+		};
+		if (vfs_statfs(&path, stat))
+			err = nfserr_io;
+	}
+	return err;
+}
+
+static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY;
+}
+
+/*
+ * Check for a user's access permissions to this inode.
+ */
+__be32
+nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
+					struct dentry *dentry, int acc)
+{
+	struct inode	*inode = d_inode(dentry);
+	int		err;
+
+	if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
+		return 0;
+#if 0
+	dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
+		acc,
+		(acc & NFSD_MAY_READ)?	" read"  : "",
+		(acc & NFSD_MAY_WRITE)?	" write" : "",
+		(acc & NFSD_MAY_EXEC)?	" exec"  : "",
+		(acc & NFSD_MAY_SATTR)?	" sattr" : "",
+		(acc & NFSD_MAY_TRUNC)?	" trunc" : "",
+		(acc & NFSD_MAY_LOCK)?	" lock"  : "",
+		(acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "",
+		inode->i_mode,
+		IS_IMMUTABLE(inode)?	" immut" : "",
+		IS_APPEND(inode)?	" append" : "",
+		__mnt_is_readonly(exp->ex_path.mnt)?	" ro" : "");
+	dprintk("      owner %d/%d user %d/%d\n",
+		inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid());
+#endif
+
+	/* Normally we reject any write/sattr etc access on a read-only file
+	 * system.  But if it is IRIX doing check on write-access for a 
+	 * device special file, we ignore rofs.
+	 */
+	if (!(acc & NFSD_MAY_LOCAL_ACCESS))
+		if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) {
+			if (exp_rdonly(rqstp, exp) ||
+			    __mnt_is_readonly(exp->ex_path.mnt))
+				return nfserr_rofs;
+			if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode))
+				return nfserr_perm;
+		}
+	if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode))
+		return nfserr_perm;
+
+	if (acc & NFSD_MAY_LOCK) {
+		/* If we cannot rely on authentication in NLM requests,
+		 * just allow locks, otherwise require read permission, or
+		 * ownership
+		 */
+		if (exp->ex_flags & NFSEXP_NOAUTHNLM)
+			return 0;
+		else
+			acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE;
+	}
+	/*
+	 * The file owner always gets access permission for accesses that
+	 * would normally be checked at open time. This is to make
+	 * file access work even when the client has done a fchmod(fd, 0).
+	 *
+	 * However, `cp foo bar' should fail nevertheless when bar is
+	 * readonly. A sensible way to do this might be to reject all
+	 * attempts to truncate a read-only file, because a creat() call
+	 * always implies file truncation.
+	 * ... but this isn't really fair.  A process may reasonably call
+	 * ftruncate on an open file descriptor on a file with perm 000.
+	 * We must trust the client to do permission checking - using "ACCESS"
+	 * with NFSv3.
+	 */
+	if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
+	    uid_eq(inode->i_uid, current_fsuid()))
+		return 0;
+
+	/* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
+	err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC));
+
+	/* Allow read access to binaries even when mode 111 */
+	if (err == -EACCES && S_ISREG(inode->i_mode) &&
+	     (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
+	      acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
+		err = inode_permission(inode, MAY_EXEC);
+
+	return err? nfserrno(err) : 0;
+}
+
+void
+nfsd_racache_shutdown(void)
+{
+	struct raparms *raparm, *last_raparm;
+	unsigned int i;
+
+	dprintk("nfsd: freeing readahead buffers.\n");
+
+	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
+		raparm = raparm_hash[i].pb_head;
+		while(raparm) {
+			last_raparm = raparm;
+			raparm = raparm->p_next;
+			kfree(last_raparm);
+		}
+		raparm_hash[i].pb_head = NULL;
+	}
+}
+/*
+ * Initialize readahead param cache
+ */
+int
+nfsd_racache_init(int cache_size)
+{
+	int	i;
+	int	j = 0;
+	int	nperbucket;
+	struct raparms **raparm = NULL;
+
+
+	if (raparm_hash[0].pb_head)
+		return 0;
+	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
+	nperbucket = max(2, nperbucket);
+	cache_size = nperbucket * RAPARM_HASH_SIZE;
+
+	dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
+
+	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
+		spin_lock_init(&raparm_hash[i].pb_lock);
+
+		raparm = &raparm_hash[i].pb_head;
+		for (j = 0; j < nperbucket; j++) {
+			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+			if (!*raparm)
+				goto out_nomem;
+			raparm = &(*raparm)->p_next;
+		}
+		*raparm = NULL;
+	}
+
+	nfsdstats.ra_size = cache_size;
+	return 0;
+
+out_nomem:
+	dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
+	nfsd_racache_shutdown();
+	return -ENOMEM;
+}
diff --git a/kernel/fs/nfsd/vfs.h b/kernel/fs/nfsd/vfs.h
new file mode 100644
index 000000000..2050cb016
--- /dev/null
+++ b/kernel/fs/nfsd/vfs.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_VFS_H
+#define LINUX_NFSD_VFS_H
+
+#include "nfsfh.h"
+#include "nfsd.h"
+
+/*
+ * Flags for nfsd_permission
+ */
+#define NFSD_MAY_NOP			0
+#define NFSD_MAY_EXEC			0x001 /* == MAY_EXEC */
+#define NFSD_MAY_WRITE			0x002 /* == MAY_WRITE */
+#define NFSD_MAY_READ			0x004 /* == MAY_READ */
+#define NFSD_MAY_SATTR			0x008
+#define NFSD_MAY_TRUNC			0x010
+#define NFSD_MAY_LOCK			0x020
+#define NFSD_MAY_MASK			0x03f
+
+/* extra hints to permission and open routines: */
+#define NFSD_MAY_OWNER_OVERRIDE		0x040
+#define NFSD_MAY_LOCAL_ACCESS		0x080 /* for device special files */
+#define NFSD_MAY_BYPASS_GSS_ON_ROOT	0x100
+#define NFSD_MAY_NOT_BREAK_LEASE	0x200
+#define NFSD_MAY_BYPASS_GSS		0x400
+#define NFSD_MAY_READ_IF_EXEC		0x800
+
+#define NFSD_MAY_64BIT_COOKIE		0x1000 /* 64 bit readdir cookies for >= NFSv3 */
+
+#define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
+#define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
+
+/*
+ * Callback function for readdir
+ */
+typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
+
+/* nfsd/vfs.c */
+int		nfsd_racache_init(int);
+void		nfsd_racache_shutdown(void);
+int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
+		                struct svc_export **expp);
+__be32		nfsd_lookup(struct svc_rqst *, struct svc_fh *,
+				const char *, unsigned int, struct svc_fh *);
+__be32		 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
+				const char *, unsigned int,
+				struct svc_export **, struct dentry **);
+__be32		nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+				struct iattr *, int, time_t);
+int nfsd_mountpoint(struct dentry *, struct svc_export *);
+#ifdef CONFIG_NFSD_V4
+__be32          nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
+		    struct xdr_netobj *);
+__be32		nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
+				    struct file *, loff_t, loff_t, int);
+#endif /* CONFIG_NFSD_V4 */
+__be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
+				char *name, int len, struct iattr *attrs,
+				int type, dev_t rdev, struct svc_fh *res);
+#ifdef CONFIG_NFSD_V3
+__be32		nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
+__be32		do_nfsd_create(struct svc_rqst *, struct svc_fh *,
+				char *name, int len, struct iattr *attrs,
+				struct svc_fh *res, int createmode,
+				u32 *verifier, bool *truncp, bool *created);
+__be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
+				loff_t, unsigned long);
+#endif /* CONFIG_NFSD_V3 */
+__be32		nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
+				int, struct file **);
+void		nfsd_close(struct file *);
+struct raparms;
+__be32		nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *,
+				struct file **, struct raparms **);
+void		nfsd_put_tmp_read_open(struct file *, struct raparms *);
+__be32		nfsd_splice_read(struct svc_rqst *,
+				struct file *, loff_t, unsigned long *);
+__be32		nfsd_readv(struct file *, loff_t, struct kvec *, int,
+				unsigned long *);
+__be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
+				loff_t, struct kvec *, int, unsigned long *);
+__be32 		nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
+				loff_t, struct kvec *,int, unsigned long *, int *);
+__be32		nfsd_readlink(struct svc_rqst *, struct svc_fh *,
+				char *, int *);
+__be32		nfsd_symlink(struct svc_rqst *, struct svc_fh *,
+				char *name, int len, char *path,
+				struct svc_fh *res);
+__be32		nfsd_link(struct svc_rqst *, struct svc_fh *,
+				char *, int, struct svc_fh *);
+__be32		nfsd_rename(struct svc_rqst *,
+				struct svc_fh *, char *, int,
+				struct svc_fh *, char *, int);
+__be32		nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
+				char *name, int len);
+__be32		nfsd_readdir(struct svc_rqst *, struct svc_fh *,
+			     loff_t *, struct readdir_cd *, nfsd_filldir_t);
+__be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
+				struct kstatfs *, int access);
+
+__be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
+				struct dentry *, int);
+
+static inline int fh_want_write(struct svc_fh *fh)
+{
+	int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
+
+	if (!ret)
+		fh->fh_want_write = 1;
+	return ret;
+}
+
+static inline void fh_drop_write(struct svc_fh *fh)
+{
+	if (fh->fh_want_write) {
+		fh->fh_want_write = 0;
+		mnt_drop_write(fh->fh_export->ex_path.mnt);
+	}
+}
+
+static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
+{
+	struct path p = {.mnt = fh->fh_export->ex_path.mnt,
+			 .dentry = fh->fh_dentry};
+	return nfserrno(vfs_getattr(&p, stat));
+}
+
+#endif /* LINUX_NFSD_VFS_H */
diff --git a/kernel/fs/nfsd/xdr.h b/kernel/fs/nfsd/xdr.h
new file mode 100644
index 000000000..4f0481d63
--- /dev/null
+++ b/kernel/fs/nfsd/xdr.h
@@ -0,0 +1,173 @@
+/* XDR types for nfsd. This is mainly a typing exercise. */
+
+#ifndef LINUX_NFSD_H
+#define LINUX_NFSD_H
+
+#include <linux/vfs.h>
+#include "nfsd.h"
+#include "nfsfh.h"
+
+struct nfsd_fhandle {
+	struct svc_fh		fh;
+};
+
+struct nfsd_sattrargs {
+	struct svc_fh		fh;
+	struct iattr		attrs;
+};
+
+struct nfsd_diropargs {
+	struct svc_fh		fh;
+	char *			name;
+	unsigned int		len;
+};
+
+struct nfsd_readargs {
+	struct svc_fh		fh;
+	__u32			offset;
+	__u32			count;
+	int			vlen;
+};
+
+struct nfsd_writeargs {
+	svc_fh			fh;
+	__u32			offset;
+	int			len;
+	int			vlen;
+};
+
+struct nfsd_createargs {
+	struct svc_fh		fh;
+	char *			name;
+	unsigned int		len;
+	struct iattr		attrs;
+};
+
+struct nfsd_renameargs {
+	struct svc_fh		ffh;
+	char *			fname;
+	unsigned int		flen;
+	struct svc_fh		tfh;
+	char *			tname;
+	unsigned int		tlen;
+};
+
+struct nfsd_readlinkargs {
+	struct svc_fh		fh;
+	char *			buffer;
+};
+	
+struct nfsd_linkargs {
+	struct svc_fh		ffh;
+	struct svc_fh		tfh;
+	char *			tname;
+	unsigned int		tlen;
+};
+
+struct nfsd_symlinkargs {
+	struct svc_fh		ffh;
+	char *			fname;
+	unsigned int		flen;
+	char *			tname;
+	unsigned int		tlen;
+	struct iattr		attrs;
+};
+
+struct nfsd_readdirargs {
+	struct svc_fh		fh;
+	__u32			cookie;
+	__u32			count;
+	__be32 *		buffer;
+};
+
+struct nfsd_attrstat {
+	struct svc_fh		fh;
+	struct kstat		stat;
+};
+
+struct nfsd_diropres  {
+	struct svc_fh		fh;
+	struct kstat		stat;
+};
+
+struct nfsd_readlinkres {
+	int			len;
+};
+
+struct nfsd_readres {
+	struct svc_fh		fh;
+	unsigned long		count;
+	struct kstat		stat;
+};
+
+struct nfsd_readdirres {
+	int			count;
+
+	struct readdir_cd	common;
+	__be32 *		buffer;
+	int			buflen;
+	__be32 *		offset;
+};
+
+struct nfsd_statfsres {
+	struct kstatfs		stats;
+};
+
+/*
+ * Storage requirements for XDR arguments and results.
+ */
+union nfsd_xdrstore {
+	struct nfsd_sattrargs	sattr;
+	struct nfsd_diropargs	dirop;
+	struct nfsd_readargs	read;
+	struct nfsd_writeargs	write;
+	struct nfsd_createargs	create;
+	struct nfsd_renameargs	rename;
+	struct nfsd_linkargs	link;
+	struct nfsd_symlinkargs	symlink;
+	struct nfsd_readdirargs	readdir;
+};
+
+#define NFS2_SVC_XDRSIZE	sizeof(union nfsd_xdrstore)
+
+
+int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *);
+int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
+int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *,
+				struct nfsd_sattrargs *);
+int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *,
+				struct nfsd_diropargs *);
+int nfssvc_decode_readargs(struct svc_rqst *, __be32 *,
+				struct nfsd_readargs *);
+int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *,
+				struct nfsd_writeargs *);
+int nfssvc_decode_createargs(struct svc_rqst *, __be32 *,
+				struct nfsd_createargs *);
+int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *,
+				struct nfsd_renameargs *);
+int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *,
+				struct nfsd_readlinkargs *);
+int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *,
+				struct nfsd_linkargs *);
+int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *,
+				struct nfsd_symlinkargs *);
+int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *,
+				struct nfsd_readdirargs *);
+int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *);
+int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *);
+int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *);
+int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *);
+int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *);
+int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *);
+int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *);
+
+int nfssvc_encode_entry(void *, const char *name,
+			int namlen, loff_t offset, u64 ino, unsigned int);
+
+int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
+
+/* Helper functions for NFSv2 ACL code */
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);
+__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
+
+#endif /* LINUX_NFSD_H */
diff --git a/kernel/fs/nfsd/xdr3.h b/kernel/fs/nfsd/xdr3.h
new file mode 100644
index 000000000..335e04aaf
--- /dev/null
+++ b/kernel/fs/nfsd/xdr3.h
@@ -0,0 +1,349 @@
+/*
+ * XDR types for NFSv3 in nfsd.
+ *
+ * Copyright (C) 1996-1998, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef _LINUX_NFSD_XDR3_H
+#define _LINUX_NFSD_XDR3_H
+
+#include "xdr.h"
+
+struct nfsd3_sattrargs {
+	struct svc_fh		fh;
+	struct iattr		attrs;
+	int			check_guard;
+	time_t			guardtime;
+};
+
+struct nfsd3_diropargs {
+	struct svc_fh		fh;
+	char *			name;
+	unsigned int		len;
+};
+
+struct nfsd3_accessargs {
+	struct svc_fh		fh;
+	unsigned int		access;
+};
+
+struct nfsd3_readargs {
+	struct svc_fh		fh;
+	__u64			offset;
+	__u32			count;
+	int			vlen;
+};
+
+struct nfsd3_writeargs {
+	svc_fh			fh;
+	__u64			offset;
+	__u32			count;
+	int			stable;
+	__u32			len;
+	int			vlen;
+};
+
+struct nfsd3_createargs {
+	struct svc_fh		fh;
+	char *			name;
+	unsigned int		len;
+	int			createmode;
+	struct iattr		attrs;
+	__be32 *		verf;
+};
+
+struct nfsd3_mknodargs {
+	struct svc_fh		fh;
+	char *			name;
+	unsigned int		len;
+	__u32			ftype;
+	__u32			major, minor;
+	struct iattr		attrs;
+};
+
+struct nfsd3_renameargs {
+	struct svc_fh		ffh;
+	char *			fname;
+	unsigned int		flen;
+	struct svc_fh		tfh;
+	char *			tname;
+	unsigned int		tlen;
+};
+
+struct nfsd3_readlinkargs {
+	struct svc_fh		fh;
+	char *			buffer;
+};
+
+struct nfsd3_linkargs {
+	struct svc_fh		ffh;
+	struct svc_fh		tfh;
+	char *			tname;
+	unsigned int		tlen;
+};
+
+struct nfsd3_symlinkargs {
+	struct svc_fh		ffh;
+	char *			fname;
+	unsigned int		flen;
+	char *			tname;
+	unsigned int		tlen;
+	struct iattr		attrs;
+};
+
+struct nfsd3_readdirargs {
+	struct svc_fh		fh;
+	__u64			cookie;
+	__u32			dircount;
+	__u32			count;
+	__be32 *		verf;
+	__be32 *		buffer;
+};
+
+struct nfsd3_commitargs {
+	struct svc_fh		fh;
+	__u64			offset;
+	__u32			count;
+};
+
+struct nfsd3_getaclargs {
+	struct svc_fh		fh;
+	int			mask;
+};
+
+struct posix_acl;
+struct nfsd3_setaclargs {
+	struct svc_fh		fh;
+	int			mask;
+	struct posix_acl	*acl_access;
+	struct posix_acl	*acl_default;
+};
+
+struct nfsd3_attrstat {
+	__be32			status;
+	struct svc_fh		fh;
+	struct kstat            stat;
+};
+
+/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */
+struct nfsd3_diropres  {
+	__be32			status;
+	struct svc_fh		dirfh;
+	struct svc_fh		fh;
+};
+
+struct nfsd3_accessres {
+	__be32			status;
+	struct svc_fh		fh;
+	__u32			access;
+	struct kstat		stat;
+};
+
+struct nfsd3_readlinkres {
+	__be32			status;
+	struct svc_fh		fh;
+	__u32			len;
+};
+
+struct nfsd3_readres {
+	__be32			status;
+	struct svc_fh		fh;
+	unsigned long		count;
+	int			eof;
+};
+
+struct nfsd3_writeres {
+	__be32			status;
+	struct svc_fh		fh;
+	unsigned long		count;
+	int			committed;
+};
+
+struct nfsd3_renameres {
+	__be32			status;
+	struct svc_fh		ffh;
+	struct svc_fh		tfh;
+};
+
+struct nfsd3_linkres {
+	__be32			status;
+	struct svc_fh		tfh;
+	struct svc_fh		fh;
+};
+
+struct nfsd3_readdirres {
+	__be32			status;
+	struct svc_fh		fh;
+	/* Just to save kmalloc on every readdirplus entry (svc_fh is a
+	 * little large for the stack): */
+	struct svc_fh		scratch;
+	int			count;
+	__be32			verf[2];
+
+	struct readdir_cd	common;
+	__be32 *		buffer;
+	int			buflen;
+	__be32 *		offset;
+	__be32 *		offset1;
+	struct svc_rqst *	rqstp;
+
+};
+
+struct nfsd3_fsstatres {
+	__be32			status;
+	struct kstatfs		stats;
+	__u32			invarsec;
+};
+
+struct nfsd3_fsinfores {
+	__be32			status;
+	__u32			f_rtmax;
+	__u32			f_rtpref;
+	__u32			f_rtmult;
+	__u32			f_wtmax;
+	__u32			f_wtpref;
+	__u32			f_wtmult;
+	__u32			f_dtpref;
+	__u64			f_maxfilesize;
+	__u32			f_properties;
+};
+
+struct nfsd3_pathconfres {
+	__be32			status;
+	__u32			p_link_max;
+	__u32			p_name_max;
+	__u32			p_no_trunc;
+	__u32			p_chown_restricted;
+	__u32			p_case_insensitive;
+	__u32			p_case_preserving;
+};
+
+struct nfsd3_commitres {
+	__be32			status;
+	struct svc_fh		fh;
+};
+
+struct nfsd3_getaclres {
+	__be32			status;
+	struct svc_fh		fh;
+	int			mask;
+	struct posix_acl	*acl_access;
+	struct posix_acl	*acl_default;
+	struct kstat		stat;
+};
+
+/* dummy type for release */
+struct nfsd3_fhandle_pair {
+	__u32			dummy;
+	struct svc_fh		fh1;
+	struct svc_fh		fh2;
+};
+
+/*
+ * Storage requirements for XDR arguments and results.
+ */
+union nfsd3_xdrstore {
+	struct nfsd3_sattrargs		sattrargs;
+	struct nfsd3_diropargs		diropargs;
+	struct nfsd3_readargs		readargs;
+	struct nfsd3_writeargs		writeargs;
+	struct nfsd3_createargs		createargs;
+	struct nfsd3_renameargs		renameargs;
+	struct nfsd3_linkargs		linkargs;
+	struct nfsd3_symlinkargs	symlinkargs;
+	struct nfsd3_readdirargs	readdirargs;
+	struct nfsd3_diropres 		diropres;
+	struct nfsd3_accessres		accessres;
+	struct nfsd3_readlinkres	readlinkres;
+	struct nfsd3_readres		readres;
+	struct nfsd3_writeres		writeres;
+	struct nfsd3_renameres		renameres;
+	struct nfsd3_linkres		linkres;
+	struct nfsd3_readdirres		readdirres;
+	struct nfsd3_fsstatres		fsstatres;
+	struct nfsd3_fsinfores		fsinfores;
+	struct nfsd3_pathconfres	pathconfres;
+	struct nfsd3_commitres		commitres;
+	struct nfsd3_getaclres		getaclres;
+};
+
+#define NFS3_SVC_XDRSIZE		sizeof(union nfsd3_xdrstore)
+
+int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
+int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_sattrargs *);
+int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_diropargs *);
+int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_accessargs *);
+int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_readargs *);
+int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_writeargs *);
+int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_createargs *);
+int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_createargs *);
+int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_mknodargs *);
+int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_renameargs *);
+int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_readlinkargs *);
+int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_linkargs *);
+int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_symlinkargs *);
+int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_readdirargs *);
+int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_readdirargs *);
+int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *,
+				struct nfsd3_commitargs *);
+int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
+int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *,
+				struct nfsd3_attrstat *);
+int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *,
+				struct nfsd3_attrstat *);
+int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *,
+				struct nfsd3_diropres *);
+int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *,
+				struct nfsd3_accessres *);
+int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *,
+				struct nfsd3_readlinkres *);
+int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *);
+int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *);
+int nfs3svc_encode_createres(struct svc_rqst *, __be32 *,
+				struct nfsd3_diropres *);
+int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *,
+				struct nfsd3_renameres *);
+int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *,
+				struct nfsd3_linkres *);
+int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *,
+				struct nfsd3_readdirres *);
+int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *,
+				struct nfsd3_fsstatres *);
+int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *,
+				struct nfsd3_fsinfores *);
+int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *,
+				struct nfsd3_pathconfres *);
+int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *,
+				struct nfsd3_commitres *);
+
+int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *,
+				struct nfsd3_attrstat *);
+int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *,
+				struct nfsd3_fhandle_pair *);
+int nfs3svc_encode_entry(void *, const char *name,
+				int namlen, loff_t offset, u64 ino,
+				unsigned int);
+int nfs3svc_encode_entry_plus(void *, const char *name,
+				int namlen, loff_t offset, u64 ino,
+				unsigned int);
+/* Helper functions for NFSv3 ACL code */
+__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p,
+				struct svc_fh *fhp);
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp);
+
+
+#endif /* _LINUX_NFSD_XDR3_H */
diff --git a/kernel/fs/nfsd/xdr4.h b/kernel/fs/nfsd/xdr4.h
new file mode 100644
index 000000000..2f8c092be
--- /dev/null
+++ b/kernel/fs/nfsd/xdr4.h
@@ -0,0 +1,724 @@
+/*
+ *  Server-side types for NFSv4.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _LINUX_NFSD_XDR4_H
+#define _LINUX_NFSD_XDR4_H
+
+#include "state.h"
+#include "nfsd.h"
+
+#define NFSD4_MAX_TAGLEN	128
+#define XDR_LEN(n)                     (((n) + 3) & ~3)
+
+#define CURRENT_STATE_ID_FLAG (1<<0)
+#define SAVED_STATE_ID_FLAG (1<<1)
+
+#define SET_STATE_ID(c, f) ((c)->sid_flags |= (f))
+#define HAS_STATE_ID(c, f) ((c)->sid_flags & (f))
+#define CLEAR_STATE_ID(c, f) ((c)->sid_flags &= ~(f))
+
+struct nfsd4_compound_state {
+	struct svc_fh		current_fh;
+	struct svc_fh		save_fh;
+	struct nfs4_stateowner	*replay_owner;
+	struct nfs4_client	*clp;
+	/* For sessions DRC */
+	struct nfsd4_session	*session;
+	struct nfsd4_slot	*slot;
+	int			data_offset;
+	size_t			iovlen;
+	u32			minorversion;
+	__be32			status;
+	stateid_t	current_stateid;
+	stateid_t	save_stateid;
+	/* to indicate current and saved state id presents */
+	u32		sid_flags;
+};
+
+static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
+{
+	return cs->slot != NULL;
+}
+
+struct nfsd4_change_info {
+	u32		atomic;
+	bool		change_supported;
+	u32		before_ctime_sec;
+	u32		before_ctime_nsec;
+	u64		before_change;
+	u32		after_ctime_sec;
+	u32		after_ctime_nsec;
+	u64		after_change;
+};
+
+struct nfsd4_access {
+	u32		ac_req_access;      /* request */
+	u32		ac_supported;       /* response */
+	u32		ac_resp_access;     /* response */
+};
+
+struct nfsd4_close {
+	u32		cl_seqid;           /* request */
+	stateid_t	cl_stateid;         /* request+response */
+};
+
+struct nfsd4_commit {
+	u64		co_offset;          /* request */
+	u32		co_count;           /* request */
+	nfs4_verifier	co_verf;            /* response */
+};
+
+struct nfsd4_create {
+	u32		cr_namelen;         /* request */
+	char *		cr_name;            /* request */
+	u32		cr_type;            /* request */
+	union {                             /* request */
+		struct {
+			u32 datalen;
+			char *data;
+		} link;   /* NF4LNK */
+		struct {
+			u32 specdata1;
+			u32 specdata2;
+		} dev;    /* NF4BLK, NF4CHR */
+	} u;
+	u32		cr_bmval[3];        /* request */
+	struct iattr	cr_iattr;           /* request */
+	struct nfsd4_change_info  cr_cinfo; /* response */
+	struct nfs4_acl *cr_acl;
+	struct xdr_netobj cr_label;
+};
+#define cr_datalen	u.link.datalen
+#define cr_data		u.link.data
+#define cr_specdata1	u.dev.specdata1
+#define cr_specdata2	u.dev.specdata2
+
+struct nfsd4_delegreturn {
+	stateid_t	dr_stateid;
+};
+
+struct nfsd4_getattr {
+	u32		ga_bmval[3];        /* request */
+	struct svc_fh	*ga_fhp;            /* response */
+};
+
+struct nfsd4_link {
+	u32		li_namelen;         /* request */
+	char *		li_name;            /* request */
+	struct nfsd4_change_info  li_cinfo; /* response */
+};
+
+struct nfsd4_lock_denied {
+	clientid_t	ld_clientid;
+	struct xdr_netobj	ld_owner;
+	u64             ld_start;
+	u64             ld_length;
+	u32             ld_type;
+};
+
+struct nfsd4_lock {
+	/* request */
+	u32             lk_type;
+	u32             lk_reclaim;         /* boolean */
+	u64             lk_offset;
+	u64             lk_length;
+	u32             lk_is_new;
+	union {
+		struct {
+			u32             open_seqid;
+			stateid_t       open_stateid;
+			u32             lock_seqid;
+			clientid_t      clientid;
+			struct xdr_netobj owner;
+		} new;
+		struct {
+			stateid_t       lock_stateid;
+			u32             lock_seqid;
+		} old;
+	} v;
+
+	/* response */
+	union {
+		struct {
+			stateid_t               stateid;
+		} ok;
+		struct nfsd4_lock_denied        denied;
+	} u;
+};
+#define lk_new_open_seqid       v.new.open_seqid
+#define lk_new_open_stateid     v.new.open_stateid
+#define lk_new_lock_seqid       v.new.lock_seqid
+#define lk_new_clientid         v.new.clientid
+#define lk_new_owner            v.new.owner
+#define lk_old_lock_stateid     v.old.lock_stateid
+#define lk_old_lock_seqid       v.old.lock_seqid
+
+#define lk_resp_stateid u.ok.stateid
+#define lk_denied       u.denied
+
+
+struct nfsd4_lockt {
+	u32				lt_type;
+	clientid_t			lt_clientid;
+	struct xdr_netobj		lt_owner;
+	u64				lt_offset;
+	u64				lt_length;
+	struct nfsd4_lock_denied  	lt_denied;
+};
+
+ 
+struct nfsd4_locku {
+	u32             lu_type;
+	u32             lu_seqid;
+	stateid_t       lu_stateid;
+	u64             lu_offset;
+	u64             lu_length;
+};
+
+
+struct nfsd4_lookup {
+	u32		lo_len;             /* request */
+	char *		lo_name;            /* request */
+};
+
+struct nfsd4_putfh {
+	u32		pf_fhlen;           /* request */
+	char		*pf_fhval;          /* request */
+};
+
+struct nfsd4_open {
+	u32		op_claim_type;      /* request */
+	struct xdr_netobj op_fname;	    /* request - everything but CLAIM_PREV */
+	u32		op_delegate_type;   /* request - CLAIM_PREV only */
+	stateid_t       op_delegate_stateid; /* request - response */
+	u32		op_why_no_deleg;    /* response - DELEG_NONE_EXT only */
+	u32		op_create;     	    /* request */
+	u32		op_createmode;      /* request */
+	u32		op_bmval[3];        /* request */
+	struct iattr	op_iattr;           /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
+	nfs4_verifier	op_verf __attribute__((aligned(32)));
+					    /* EXCLUSIVE4 */
+	clientid_t	op_clientid;        /* request */
+	struct xdr_netobj op_owner;           /* request */
+	u32		op_seqid;           /* request */
+	u32		op_share_access;    /* request */
+	u32		op_share_deny;      /* request */
+	u32		op_deleg_want;      /* request */
+	stateid_t	op_stateid;         /* response */
+	__be32		op_xdr_error;       /* see nfsd4_open_omfg() */
+	u32		op_recall;          /* recall */
+	struct nfsd4_change_info  op_cinfo; /* response */
+	u32		op_rflags;          /* response */
+	bool		op_truncate;        /* used during processing */
+	bool		op_created;         /* used during processing */
+	struct nfs4_openowner *op_openowner; /* used during processing */
+	struct nfs4_file *op_file;          /* used during processing */
+	struct nfs4_ol_stateid *op_stp;	    /* used during processing */
+	struct nfs4_clnt_odstate *op_odstate; /* used during processing */
+	struct nfs4_acl *op_acl;
+	struct xdr_netobj op_label;
+};
+
+struct nfsd4_open_confirm {
+	stateid_t	oc_req_stateid		/* request */;
+	u32		oc_seqid    		/* request */;
+	stateid_t	oc_resp_stateid		/* response */;
+};
+
+struct nfsd4_open_downgrade {
+	stateid_t       od_stateid;
+	u32             od_seqid;
+	u32             od_share_access;	/* request */
+	u32		od_deleg_want;		/* request */
+	u32             od_share_deny;		/* request */
+};
+
+
+struct nfsd4_read {
+	stateid_t	rd_stateid;         /* request */
+	u64		rd_offset;          /* request */
+	u32		rd_length;          /* request */
+	int		rd_vlen;
+	struct file     *rd_filp;
+	
+	struct svc_rqst *rd_rqstp;          /* response */
+	struct svc_fh * rd_fhp;             /* response */
+};
+
+struct nfsd4_readdir {
+	u64		rd_cookie;          /* request */
+	nfs4_verifier	rd_verf;            /* request */
+	u32		rd_dircount;        /* request */
+	u32		rd_maxcount;        /* request */
+	u32		rd_bmval[3];        /* request */
+	struct svc_rqst *rd_rqstp;          /* response */
+	struct svc_fh * rd_fhp;             /* response */
+
+	struct readdir_cd	common;
+	struct xdr_stream	*xdr;
+	int			cookie_offset;
+};
+
+struct nfsd4_release_lockowner {
+	clientid_t        rl_clientid;
+	struct xdr_netobj rl_owner;
+};
+struct nfsd4_readlink {
+	struct svc_rqst *rl_rqstp;          /* request */
+	struct svc_fh *	rl_fhp;             /* request */
+};
+
+struct nfsd4_remove {
+	u32		rm_namelen;         /* request */
+	char *		rm_name;            /* request */
+	struct nfsd4_change_info  rm_cinfo; /* response */
+};
+
+struct nfsd4_rename {
+	u32		rn_snamelen;        /* request */
+	char *		rn_sname;           /* request */
+	u32		rn_tnamelen;        /* request */
+	char *		rn_tname;           /* request */
+	struct nfsd4_change_info  rn_sinfo; /* response */
+	struct nfsd4_change_info  rn_tinfo; /* response */
+};
+
+struct nfsd4_secinfo {
+	u32 si_namelen;					/* request */
+	char *si_name;					/* request */
+	struct svc_export *si_exp;			/* response */
+};
+
+struct nfsd4_secinfo_no_name {
+	u32 sin_style;					/* request */
+	struct svc_export *sin_exp;			/* response */
+};
+
+struct nfsd4_setattr {
+	stateid_t	sa_stateid;         /* request */
+	u32		sa_bmval[3];        /* request */
+	struct iattr	sa_iattr;           /* request */
+	struct nfs4_acl *sa_acl;
+	struct xdr_netobj sa_label;
+};
+
+struct nfsd4_setclientid {
+	nfs4_verifier	se_verf;            /* request */
+	struct xdr_netobj se_name;
+	u32		se_callback_prog;   /* request */
+	u32		se_callback_netid_len;  /* request */
+	char *		se_callback_netid_val;  /* request */
+	u32		se_callback_addr_len;   /* request */
+	char *		se_callback_addr_val;   /* request */
+	u32		se_callback_ident;  /* request */
+	clientid_t	se_clientid;        /* response */
+	nfs4_verifier	se_confirm;         /* response */
+};
+
+struct nfsd4_setclientid_confirm {
+	clientid_t	sc_clientid;
+	nfs4_verifier	sc_confirm;
+};
+
+struct nfsd4_saved_compoundargs {
+	__be32 *p;
+	__be32 *end;
+	int pagelen;
+	struct page **pagelist;
+};
+
+struct nfsd4_test_stateid_id {
+	__be32			ts_id_status;
+	stateid_t		ts_id_stateid;
+	struct list_head	ts_id_list;
+};
+
+struct nfsd4_test_stateid {
+	u32		ts_num_ids;
+	struct list_head ts_stateid_list;
+};
+
+struct nfsd4_free_stateid {
+	stateid_t	fr_stateid;         /* request */
+};
+
+/* also used for NVERIFY */
+struct nfsd4_verify {
+	u32		ve_bmval[3];        /* request */
+	u32		ve_attrlen;         /* request */
+	char *		ve_attrval;         /* request */
+};
+
+struct nfsd4_write {
+	stateid_t	wr_stateid;         /* request */
+	u64		wr_offset;          /* request */
+	u32		wr_stable_how;      /* request */
+	u32		wr_buflen;          /* request */
+	struct kvec	wr_head;
+	struct page **	wr_pagelist;        /* request */
+
+	u32		wr_bytes_written;   /* response */
+	u32		wr_how_written;     /* response */
+	nfs4_verifier	wr_verifier;        /* response */
+};
+
+struct nfsd4_exchange_id {
+	nfs4_verifier	verifier;
+	struct xdr_netobj clname;
+	u32		flags;
+	clientid_t	clientid;
+	u32		seqid;
+	int		spa_how;
+};
+
+struct nfsd4_sequence {
+	struct nfs4_sessionid	sessionid;		/* request/response */
+	u32			seqid;			/* request/response */
+	u32			slotid;			/* request/response */
+	u32			maxslots;		/* request/response */
+	u32			cachethis;		/* request */
+#if 0
+	u32			target_maxslots;	/* response */
+#endif /* not yet */
+	u32			status_flags;		/* response */
+};
+
+struct nfsd4_destroy_session {
+	struct nfs4_sessionid	sessionid;
+};
+
+struct nfsd4_destroy_clientid {
+	clientid_t clientid;
+};
+
+struct nfsd4_reclaim_complete {
+	u32 rca_one_fs;
+};
+
+struct nfsd4_deviceid {
+	u64			fsid_idx;
+	u32			generation;
+	u32			pad;
+};
+
+struct nfsd4_layout_seg {
+	u32			iomode;
+	u64			offset;
+	u64			length;
+};
+
+struct nfsd4_getdeviceinfo {
+	struct nfsd4_deviceid	gd_devid;	/* request */
+	u32			gd_layout_type;	/* request */
+	u32			gd_maxcount;	/* request */
+	u32			gd_notify_types;/* request - response */
+	void			*gd_device;	/* response */
+};
+
+struct nfsd4_layoutget {
+	u64			lg_minlength;	/* request */
+	u32			lg_signal;	/* request */
+	u32			lg_layout_type;	/* request */
+	u32			lg_maxcount;	/* request */
+	stateid_t		lg_sid;		/* request/response */
+	struct nfsd4_layout_seg	lg_seg;		/* request/response */
+	void			*lg_content;	/* response */
+};
+
+struct nfsd4_layoutcommit {
+	stateid_t		lc_sid;		/* request */
+	struct nfsd4_layout_seg	lc_seg;		/* request */
+	u32			lc_reclaim;	/* request */
+	u32			lc_newoffset;	/* request */
+	u64			lc_last_wr;	/* request */
+	struct timespec		lc_mtime;	/* request */
+	u32			lc_layout_type;	/* request */
+	u32			lc_up_len;	/* layout length */
+	void			*lc_up_layout;	/* decoded by callback */
+	u32			lc_size_chg;	/* boolean for response */
+	u64			lc_newsize;	/* response */
+};
+
+struct nfsd4_layoutreturn {
+	u32			lr_return_type;	/* request */
+	u32			lr_layout_type;	/* request */
+	struct nfsd4_layout_seg	lr_seg;		/* request */
+	u32			lr_reclaim;	/* request */
+	u32			lrf_body_len;	/* request */
+	void			*lrf_body;	/* request */
+	stateid_t		lr_sid;		/* request/response */
+	u32			lrs_present;	/* response */
+};
+
+struct nfsd4_fallocate {
+	/* request */
+	stateid_t	falloc_stateid;
+	loff_t		falloc_offset;
+	u64		falloc_length;
+};
+
+struct nfsd4_seek {
+	/* request */
+	stateid_t	seek_stateid;
+	loff_t		seek_offset;
+	u32		seek_whence;
+
+	/* response */
+	u32		seek_eof;
+	loff_t		seek_pos;
+};
+
+struct nfsd4_op {
+	int					opnum;
+	__be32					status;
+	union {
+		struct nfsd4_access		access;
+		struct nfsd4_close		close;
+		struct nfsd4_commit		commit;
+		struct nfsd4_create		create;
+		struct nfsd4_delegreturn	delegreturn;
+		struct nfsd4_getattr		getattr;
+		struct svc_fh *			getfh;
+		struct nfsd4_link		link;
+		struct nfsd4_lock		lock;
+		struct nfsd4_lockt		lockt;
+		struct nfsd4_locku		locku;
+		struct nfsd4_lookup		lookup;
+		struct nfsd4_verify		nverify;
+		struct nfsd4_open		open;
+		struct nfsd4_open_confirm	open_confirm;
+		struct nfsd4_open_downgrade	open_downgrade;
+		struct nfsd4_putfh		putfh;
+		struct nfsd4_read		read;
+		struct nfsd4_readdir		readdir;
+		struct nfsd4_readlink		readlink;
+		struct nfsd4_remove		remove;
+		struct nfsd4_rename		rename;
+		clientid_t			renew;
+		struct nfsd4_secinfo		secinfo;
+		struct nfsd4_setattr		setattr;
+		struct nfsd4_setclientid	setclientid;
+		struct nfsd4_setclientid_confirm setclientid_confirm;
+		struct nfsd4_verify		verify;
+		struct nfsd4_write		write;
+		struct nfsd4_release_lockowner	release_lockowner;
+
+		/* NFSv4.1 */
+		struct nfsd4_exchange_id	exchange_id;
+		struct nfsd4_backchannel_ctl	backchannel_ctl;
+		struct nfsd4_bind_conn_to_session bind_conn_to_session;
+		struct nfsd4_create_session	create_session;
+		struct nfsd4_destroy_session	destroy_session;
+		struct nfsd4_sequence		sequence;
+		struct nfsd4_reclaim_complete	reclaim_complete;
+		struct nfsd4_test_stateid	test_stateid;
+		struct nfsd4_free_stateid	free_stateid;
+		struct nfsd4_getdeviceinfo	getdeviceinfo;
+		struct nfsd4_layoutget		layoutget;
+		struct nfsd4_layoutcommit	layoutcommit;
+		struct nfsd4_layoutreturn	layoutreturn;
+
+		/* NFSv4.2 */
+		struct nfsd4_fallocate		allocate;
+		struct nfsd4_fallocate		deallocate;
+		struct nfsd4_seek		seek;
+	} u;
+	struct nfs4_replay *			replay;
+};
+
+bool nfsd4_cache_this_op(struct nfsd4_op *);
+
+/*
+ * Memory needed just for the duration of processing one compound:
+ */
+struct svcxdr_tmpbuf {
+	struct svcxdr_tmpbuf *next;
+	char buf[];
+};
+
+struct nfsd4_compoundargs {
+	/* scratch variables for XDR decode */
+	__be32 *			p;
+	__be32 *			end;
+	struct page **			pagelist;
+	int				pagelen;
+	__be32				tmp[8];
+	__be32 *			tmpp;
+	struct svcxdr_tmpbuf		*to_free;
+
+	struct svc_rqst			*rqstp;
+
+	u32				taglen;
+	char *				tag;
+	u32				minorversion;
+	u32				opcnt;
+	struct nfsd4_op			*ops;
+	struct nfsd4_op			iops[8];
+	int				cachetype;
+};
+
+struct nfsd4_compoundres {
+	/* scratch variables for XDR encode */
+	struct xdr_stream		xdr;
+	struct svc_rqst *		rqstp;
+
+	u32				taglen;
+	char *				tag;
+	u32				opcnt;
+	__be32 *			tagp; /* tag, opcount encode location */
+	struct nfsd4_compound_state	cstate;
+};
+
+static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
+	return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE;
+}
+
+static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
+{
+	return !(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS)
+		|| nfsd4_is_solo_sequence(resp);
+}
+
+static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+	return argp->opcnt == resp->opcnt;
+}
+
+int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op);
+void warn_on_nonidempotent_op(struct nfsd4_op *op);
+
+#define NFS4_SVC_XDRSIZE		sizeof(struct nfsd4_compoundargs)
+
+static inline void
+set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
+{
+	BUG_ON(!fhp->fh_pre_saved);
+	cinfo->atomic = fhp->fh_post_saved;
+	cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
+
+	cinfo->before_change = fhp->fh_pre_change;
+	cinfo->after_change = fhp->fh_post_change;
+	cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+	cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+	cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+	cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
+
+}
+
+int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
+int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
+		struct nfsd4_compoundargs *);
+int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
+		struct nfsd4_compoundres *);
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
+void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
+void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op);
+__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
+		struct svc_fh *fhp, struct svc_export *exp,
+		struct dentry *dentry,
+		u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
+extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *,
+		struct nfsd4_setclientid *setclid);
+extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *,
+		struct nfsd4_setclientid_confirm *setclientid_confirm);
+extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
+extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
+extern __be32 nfsd4_create_session(struct svc_rqst *,
+		struct nfsd4_compound_state *,
+		struct nfsd4_create_session *);
+extern __be32 nfsd4_sequence(struct svc_rqst *,
+		struct nfsd4_compound_state *,
+		struct nfsd4_sequence *);
+extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp);
+extern __be32 nfsd4_destroy_session(struct svc_rqst *,
+		struct nfsd4_compound_state *,
+		struct nfsd4_destroy_session *);
+extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
+__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
+extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
+		struct nfsd4_open *open, struct nfsd_net *nn);
+extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
+		struct svc_fh *current_fh, struct nfsd4_open *open);
+extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate);
+extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
+		struct nfsd4_open *open);
+extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
+extern __be32 nfsd4_close(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *,
+		struct nfsd4_close *close);
+extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *,
+		struct nfsd4_open_downgrade *od);
+extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
+		struct nfsd4_lock *lock);
+extern __be32 nfsd4_lockt(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *,
+		struct nfsd4_lockt *lockt);
+extern __be32 nfsd4_locku(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *,
+		struct nfsd4_locku *locku);
+extern __be32
+nfsd4_release_lockowner(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *,
+		struct nfsd4_release_lockowner *rlockowner);
+extern int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp);
+extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr);
+extern __be32 nfsd4_renew(struct svc_rqst *rqstp,
+			  struct nfsd4_compound_state *, clientid_t *clid);
+extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid);
+extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid);
+extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr);
+
+#endif
+
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/kernel/fs/nfsd/xdr4cb.h b/kernel/fs/nfsd/xdr4cb.h
new file mode 100644
index 000000000..c47f6fdb1
--- /dev/null
+++ b/kernel/fs/nfsd/xdr4cb.h
@@ -0,0 +1,30 @@
+#define NFS4_MAXTAGLEN		20
+
+#define NFS4_enc_cb_null_sz		0
+#define NFS4_dec_cb_null_sz		0
+#define cb_compound_enc_hdr_sz		4
+#define cb_compound_dec_hdr_sz		(3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz			(NFS4_MAX_SESSIONID_LEN >> 2)
+#define cb_sequence_enc_sz		(sessionid_sz + 4 +             \
+					1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz		(op_dec_sz + sessionid_sz + 4)
+
+#define op_enc_sz			1
+#define op_dec_sz			2
+#define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
+#define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
+#define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					1 + enc_stateid_sz +            \
+					enc_nfs4_fh_sz)
+
+#define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
+#define NFS4_enc_cb_layout_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					1 + 3 +                         \
+					enc_nfs4_fh_sz + 4)
+#define NFS4_dec_cb_layout_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
diff --git a/kernel/fs/nilfs2/Kconfig b/kernel/fs/nilfs2/Kconfig
new file mode 100644
index 000000000..80da8eb27
--- /dev/null
+++ b/kernel/fs/nilfs2/Kconfig
@@ -0,0 +1,24 @@
+config NILFS2_FS
+	tristate "NILFS2 file system support"
+	select CRC32
+	help
+	  NILFS2 is a log-structured file system (LFS) supporting continuous
+	  snapshotting.  In addition to versioning capability of the entire
+	  file system, users can even restore files mistakenly overwritten or
+	  destroyed just a few seconds ago.  Since this file system can keep
+	  consistency like conventional LFS, it achieves quick recovery after
+	  system crashes.
+
+	  NILFS2 creates a number of checkpoints every few seconds or per
+	  synchronous write basis (unless there is no change).  Users can
+	  select significant versions among continuously created checkpoints,
+	  and can change them into snapshots which will be preserved for long
+	  periods until they are changed back to checkpoints.  Each
+	  snapshot is mountable as a read-only file system concurrently with
+	  its writable mount, and this feature is convenient for online backup.
+
+	  Some features including atime, extended attributes, and POSIX ACLs,
+	  are not supported yet.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called nilfs2.  If unsure, say N.
diff --git a/kernel/fs/nilfs2/Makefile b/kernel/fs/nilfs2/Makefile
new file mode 100644
index 000000000..fc603e043
--- /dev/null
+++ b/kernel/fs/nilfs2/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_NILFS2_FS) += nilfs2.o
+nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
+	btnode.o bmap.o btree.o direct.o dat.o recovery.o \
+	the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
+	ifile.o alloc.o gcinode.o ioctl.o sysfs.o
diff --git a/kernel/fs/nilfs2/alloc.c b/kernel/fs/nilfs2/alloc.c
new file mode 100644
index 000000000..8df0f3b78
--- /dev/null
+++ b/kernel/fs/nilfs2/alloc.c
@@ -0,0 +1,785 @@
+/*
+ * alloc.c - NILFS dat/inode allocator
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Original code was written by Koji Sato <koji@osrg.net>.
+ * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
+ *                                Amagai Yoshiji <amagai@osrg.net>.
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include "mdt.h"
+#include "alloc.h"
+
+
+/**
+ * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
+ *					descriptor block can maintain
+ * @inode: inode of metadata file using this allocator
+ */
+static inline unsigned long
+nilfs_palloc_groups_per_desc_block(const struct inode *inode)
+{
+	return (1UL << inode->i_blkbits) /
+		sizeof(struct nilfs_palloc_group_desc);
+}
+
+/**
+ * nilfs_palloc_groups_count - get maximum number of groups
+ * @inode: inode of metadata file using this allocator
+ */
+static inline unsigned long
+nilfs_palloc_groups_count(const struct inode *inode)
+{
+	return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
+}
+
+/**
+ * nilfs_palloc_init_blockgroup - initialize private variables for allocator
+ * @inode: inode of metadata file using this allocator
+ * @entry_size: size of the persistent object
+ */
+int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+
+	mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS);
+	if (!mi->mi_bgl)
+		return -ENOMEM;
+
+	bgl_lock_init(mi->mi_bgl);
+
+	nilfs_mdt_set_entry_size(inode, entry_size, 0);
+
+	mi->mi_blocks_per_group =
+		DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
+			     mi->mi_entries_per_block) + 1;
+		/* Number of blocks in a group including entry blocks and
+		   a bitmap block */
+	mi->mi_blocks_per_desc_block =
+		nilfs_palloc_groups_per_desc_block(inode) *
+		mi->mi_blocks_per_group + 1;
+		/* Number of blocks per descriptor including the
+		   descriptor block */
+	return 0;
+}
+
+/**
+ * nilfs_palloc_group - get group number and offset from an entry number
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @offset: pointer to store offset number in the group
+ */
+static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
+					unsigned long *offset)
+{
+	__u64 group = nr;
+
+	*offset = do_div(group, nilfs_palloc_entries_per_group(inode));
+	return group;
+}
+
+/**
+ * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
+ * block which contains a descriptor of the specified group.
+ */
+static unsigned long
+nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
+{
+	unsigned long desc_block =
+		group / nilfs_palloc_groups_per_desc_block(inode);
+	return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
+}
+
+/**
+ * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
+ * block used to allocate/deallocate entries in the specified group.
+ */
+static unsigned long
+nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
+{
+	unsigned long desc_offset =
+		group % nilfs_palloc_groups_per_desc_block(inode);
+	return nilfs_palloc_desc_blkoff(inode, group) + 1 +
+		desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
+}
+
+/**
+ * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ */
+static unsigned long
+nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
+			       const struct nilfs_palloc_group_desc *desc)
+{
+	unsigned long nfree;
+
+	spin_lock(nilfs_mdt_bgl_lock(inode, group));
+	nfree = le32_to_cpu(desc->pg_nfrees);
+	spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+	return nfree;
+}
+
+/**
+ * nilfs_palloc_group_desc_add_entries - adjust count of free entries
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ * @n: delta to be added
+ */
+static void
+nilfs_palloc_group_desc_add_entries(struct inode *inode,
+				    unsigned long group,
+				    struct nilfs_palloc_group_desc *desc,
+				    u32 n)
+{
+	spin_lock(nilfs_mdt_bgl_lock(inode, group));
+	le32_add_cpu(&desc->pg_nfrees, n);
+	spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+}
+
+/**
+ * nilfs_palloc_entry_blkoff - get block offset of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ */
+static unsigned long
+nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
+{
+	unsigned long group, group_offset;
+
+	group = nilfs_palloc_group(inode, nr, &group_offset);
+
+	return nilfs_palloc_bitmap_blkoff(inode, group) + 1 +
+		group_offset / NILFS_MDT(inode)->mi_entries_per_block;
+}
+
+/**
+ * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
+ * @inode: inode of metadata file
+ * @bh: buffer head of the buffer to be initialized
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
+static void nilfs_palloc_desc_block_init(struct inode *inode,
+					 struct buffer_head *bh, void *kaddr)
+{
+	struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh);
+	unsigned long n = nilfs_palloc_groups_per_desc_block(inode);
+	__le32 nfrees;
+
+	nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode));
+	while (n-- > 0) {
+		desc->pg_nfrees = nfrees;
+		desc++;
+	}
+}
+
+static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
+				  int create,
+				  void (*init_block)(struct inode *,
+						     struct buffer_head *,
+						     void *),
+				  struct buffer_head **bhp,
+				  struct nilfs_bh_assoc *prev,
+				  spinlock_t *lock)
+{
+	int ret;
+
+	spin_lock(lock);
+	if (prev->bh && blkoff == prev->blkoff) {
+		get_bh(prev->bh);
+		*bhp = prev->bh;
+		spin_unlock(lock);
+		return 0;
+	}
+	spin_unlock(lock);
+
+	ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp);
+	if (!ret) {
+		spin_lock(lock);
+		/*
+		 * The following code must be safe for change of the
+		 * cache contents during the get block call.
+		 */
+		brelse(prev->bh);
+		get_bh(*bhp);
+		prev->bh = *bhp;
+		prev->blkoff = blkoff;
+		spin_unlock(lock);
+	}
+	return ret;
+}
+
+/**
+ * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
+static int nilfs_palloc_get_desc_block(struct inode *inode,
+				       unsigned long group,
+				       int create, struct buffer_head **bhp)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_desc_blkoff(inode, group),
+				      create, nilfs_palloc_desc_block_init,
+				      bhp, &cache->prev_desc, &cache->lock);
+}
+
+/**
+ * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
+static int nilfs_palloc_get_bitmap_block(struct inode *inode,
+					 unsigned long group,
+					 int create, struct buffer_head **bhp)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_bitmap_blkoff(inode, group),
+				      create, NULL, bhp,
+				      &cache->prev_bitmap, &cache->lock);
+}
+
+/**
+ * nilfs_palloc_get_entry_block - get buffer head of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
+int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
+				 int create, struct buffer_head **bhp)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_entry_blkoff(inode, nr),
+				      create, NULL, bhp,
+				      &cache->prev_entry, &cache->lock);
+}
+
+/**
+ * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @bh: buffer head of the buffer storing the group descriptor block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
+static struct nilfs_palloc_group_desc *
+nilfs_palloc_block_get_group_desc(const struct inode *inode,
+				  unsigned long group,
+				  const struct buffer_head *bh, void *kaddr)
+{
+	return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) +
+		group % nilfs_palloc_groups_per_desc_block(inode);
+}
+
+/**
+ * nilfs_palloc_block_get_entry - get kernel address of an entry
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @bh: buffer head of the buffer storing the entry block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
+void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
+				   const struct buffer_head *bh, void *kaddr)
+{
+	unsigned long entry_offset, group_offset;
+
+	nilfs_palloc_group(inode, nr, &group_offset);
+	entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block;
+
+	return kaddr + bh_offset(bh) +
+		entry_offset * NILFS_MDT(inode)->mi_entry_size;
+}
+
+/**
+ * nilfs_palloc_find_available_slot - find available slot in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @target: offset number of an entry in the group (start point)
+ * @bitmap: bitmap of the group
+ * @bsize: size in bits
+ */
+static int nilfs_palloc_find_available_slot(struct inode *inode,
+					    unsigned long group,
+					    unsigned long target,
+					    unsigned char *bitmap,
+					    int bsize)
+{
+	int curr, pos, end, i;
+
+	if (target > 0) {
+		end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
+		if (end > bsize)
+			end = bsize;
+		pos = nilfs_find_next_zero_bit(bitmap, end, target);
+		if (pos < end &&
+		    !nilfs_set_bit_atomic(
+			    nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
+			return pos;
+	} else
+		end = 0;
+
+	for (i = 0, curr = end;
+	     i < bsize;
+	     i += BITS_PER_LONG, curr += BITS_PER_LONG) {
+		/* wrap around */
+		if (curr >= bsize)
+			curr = 0;
+		while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
+		       != ~0UL) {
+			end = curr + BITS_PER_LONG;
+			if (end > bsize)
+				end = bsize;
+			pos = nilfs_find_next_zero_bit(bitmap, end, curr);
+			if ((pos < end) &&
+			    !nilfs_set_bit_atomic(
+				    nilfs_mdt_bgl_lock(inode, group), pos,
+				    bitmap))
+				return pos;
+		}
+	}
+	return -ENOSPC;
+}
+
+/**
+ * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
+ *					    in a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @curr: current group number
+ * @max: maximum number of groups
+ */
+static unsigned long
+nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
+				       unsigned long curr, unsigned long max)
+{
+	return min_t(unsigned long,
+		     nilfs_palloc_groups_per_desc_block(inode) -
+		     curr % nilfs_palloc_groups_per_desc_block(inode),
+		     max - curr + 1);
+}
+
+/**
+ * nilfs_palloc_count_desc_blocks - count descriptor blocks number
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: descriptor blocks number [out]
+ */
+static int nilfs_palloc_count_desc_blocks(struct inode *inode,
+					    unsigned long *desc_blocks)
+{
+	__u64 blknum;
+	int ret;
+
+	ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
+	if (likely(!ret))
+		*desc_blocks = DIV_ROUND_UP(
+			(unsigned long)blknum,
+			NILFS_MDT(inode)->mi_blocks_per_desc_block);
+	return ret;
+}
+
+/**
+ * nilfs_palloc_mdt_file_can_grow - check potential opportunity for
+ *					MDT file growing
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: known current descriptor blocks count
+ */
+static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
+						    unsigned long desc_blocks)
+{
+	return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
+			nilfs_palloc_groups_count(inode);
+}
+
+/**
+ * nilfs_palloc_count_max_entries - count max number of entries that can be
+ *					described by descriptor blocks count
+ * @inode: inode of metadata file using this allocator
+ * @nused: current number of used entries
+ * @nmaxp: max number of entries [out]
+ */
+int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
+{
+	unsigned long desc_blocks = 0;
+	u64 entries_per_desc_block, nmax;
+	int err;
+
+	err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
+	if (unlikely(err))
+		return err;
+
+	entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
+				nilfs_palloc_groups_per_desc_block(inode);
+	nmax = entries_per_desc_block * desc_blocks;
+
+	if (nused == nmax &&
+			nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
+		nmax += entries_per_desc_block;
+
+	if (nused > nmax)
+		return -ERANGE;
+
+	*nmaxp = nmax;
+	return 0;
+}
+
+/**
+ * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
+int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
+				     struct nilfs_palloc_req *req)
+{
+	struct buffer_head *desc_bh, *bitmap_bh;
+	struct nilfs_palloc_group_desc *desc;
+	unsigned char *bitmap;
+	void *desc_kaddr, *bitmap_kaddr;
+	unsigned long group, maxgroup, ngroups;
+	unsigned long group_offset, maxgroup_offset;
+	unsigned long n, entries_per_group, groups_per_desc_block;
+	unsigned long i, j;
+	int pos, ret;
+
+	ngroups = nilfs_palloc_groups_count(inode);
+	maxgroup = ngroups - 1;
+	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
+	entries_per_group = nilfs_palloc_entries_per_group(inode);
+	groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
+
+	for (i = 0; i < ngroups; i += n) {
+		if (group >= ngroups) {
+			/* wrap around */
+			group = 0;
+			maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
+						      &maxgroup_offset) - 1;
+		}
+		ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
+		if (ret < 0)
+			return ret;
+		desc_kaddr = kmap(desc_bh->b_page);
+		desc = nilfs_palloc_block_get_group_desc(
+			inode, group, desc_bh, desc_kaddr);
+		n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
+							   maxgroup);
+		for (j = 0; j < n; j++, desc++, group++) {
+			if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
+			    > 0) {
+				ret = nilfs_palloc_get_bitmap_block(
+					inode, group, 1, &bitmap_bh);
+				if (ret < 0)
+					goto out_desc;
+				bitmap_kaddr = kmap(bitmap_bh->b_page);
+				bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
+				pos = nilfs_palloc_find_available_slot(
+					inode, group, group_offset, bitmap,
+					entries_per_group);
+				if (pos >= 0) {
+					/* found a free entry */
+					nilfs_palloc_group_desc_add_entries(
+						inode, group, desc, -1);
+					req->pr_entry_nr =
+						entries_per_group * group + pos;
+					kunmap(desc_bh->b_page);
+					kunmap(bitmap_bh->b_page);
+
+					req->pr_desc_bh = desc_bh;
+					req->pr_bitmap_bh = bitmap_bh;
+					return 0;
+				}
+				kunmap(bitmap_bh->b_page);
+				brelse(bitmap_bh);
+			}
+
+			group_offset = 0;
+		}
+
+		kunmap(desc_bh->b_page);
+		brelse(desc_bh);
+	}
+
+	/* no entries left */
+	return -ENOSPC;
+
+ out_desc:
+	kunmap(desc_bh->b_page);
+	brelse(desc_bh);
+	return ret;
+}
+
+/**
+ * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
+void nilfs_palloc_commit_alloc_entry(struct inode *inode,
+				     struct nilfs_palloc_req *req)
+{
+	mark_buffer_dirty(req->pr_bitmap_bh);
+	mark_buffer_dirty(req->pr_desc_bh);
+	nilfs_mdt_mark_dirty(inode);
+
+	brelse(req->pr_bitmap_bh);
+	brelse(req->pr_desc_bh);
+}
+
+/**
+ * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
+void nilfs_palloc_commit_free_entry(struct inode *inode,
+				    struct nilfs_palloc_req *req)
+{
+	struct nilfs_palloc_group_desc *desc;
+	unsigned long group, group_offset;
+	unsigned char *bitmap;
+	void *desc_kaddr, *bitmap_kaddr;
+
+	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
+	desc_kaddr = kmap(req->pr_desc_bh->b_page);
+	desc = nilfs_palloc_block_get_group_desc(inode, group,
+						 req->pr_desc_bh, desc_kaddr);
+	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+
+	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
+				    group_offset, bitmap))
+		printk(KERN_WARNING "%s: entry number %llu already freed\n",
+		       __func__, (unsigned long long)req->pr_entry_nr);
+	else
+		nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+
+	kunmap(req->pr_bitmap_bh->b_page);
+	kunmap(req->pr_desc_bh->b_page);
+
+	mark_buffer_dirty(req->pr_desc_bh);
+	mark_buffer_dirty(req->pr_bitmap_bh);
+	nilfs_mdt_mark_dirty(inode);
+
+	brelse(req->pr_bitmap_bh);
+	brelse(req->pr_desc_bh);
+}
+
+/**
+ * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
+void nilfs_palloc_abort_alloc_entry(struct inode *inode,
+				    struct nilfs_palloc_req *req)
+{
+	struct nilfs_palloc_group_desc *desc;
+	void *desc_kaddr, *bitmap_kaddr;
+	unsigned char *bitmap;
+	unsigned long group, group_offset;
+
+	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
+	desc_kaddr = kmap(req->pr_desc_bh->b_page);
+	desc = nilfs_palloc_block_get_group_desc(inode, group,
+						 req->pr_desc_bh, desc_kaddr);
+	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
+				    group_offset, bitmap))
+		printk(KERN_WARNING "%s: entry number %llu already freed\n",
+		       __func__, (unsigned long long)req->pr_entry_nr);
+	else
+		nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+
+	kunmap(req->pr_bitmap_bh->b_page);
+	kunmap(req->pr_desc_bh->b_page);
+
+	brelse(req->pr_bitmap_bh);
+	brelse(req->pr_desc_bh);
+
+	req->pr_entry_nr = 0;
+	req->pr_bitmap_bh = NULL;
+	req->pr_desc_bh = NULL;
+}
+
+/**
+ * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
+int nilfs_palloc_prepare_free_entry(struct inode *inode,
+				    struct nilfs_palloc_req *req)
+{
+	struct buffer_head *desc_bh, *bitmap_bh;
+	unsigned long group, group_offset;
+	int ret;
+
+	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
+	ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
+	if (ret < 0)
+		return ret;
+	ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
+	if (ret < 0) {
+		brelse(desc_bh);
+		return ret;
+	}
+
+	req->pr_desc_bh = desc_bh;
+	req->pr_bitmap_bh = bitmap_bh;
+	return 0;
+}
+
+/**
+ * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
+void nilfs_palloc_abort_free_entry(struct inode *inode,
+				   struct nilfs_palloc_req *req)
+{
+	brelse(req->pr_bitmap_bh);
+	brelse(req->pr_desc_bh);
+
+	req->pr_entry_nr = 0;
+	req->pr_bitmap_bh = NULL;
+	req->pr_desc_bh = NULL;
+}
+
+/**
+ * nilfs_palloc_group_is_in - judge if an entry is in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @nr: serial number of the entry (e.g. inode number)
+ */
+static int
+nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
+{
+	__u64 first, last;
+
+	first = group * nilfs_palloc_entries_per_group(inode);
+	last = first + nilfs_palloc_entries_per_group(inode) - 1;
+	return (nr >= first) && (nr <= last);
+}
+
+/**
+ * nilfs_palloc_freev - deallocate a set of persistent objects
+ * @inode: inode of metadata file using this allocator
+ * @entry_nrs: array of entry numbers to be deallocated
+ * @nitems: number of entries stored in @entry_nrs
+ */
+int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
+{
+	struct buffer_head *desc_bh, *bitmap_bh;
+	struct nilfs_palloc_group_desc *desc;
+	unsigned char *bitmap;
+	void *desc_kaddr, *bitmap_kaddr;
+	unsigned long group, group_offset;
+	int i, j, n, ret;
+
+	for (i = 0; i < nitems; i = j) {
+		group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
+		ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
+		if (ret < 0)
+			return ret;
+		ret = nilfs_palloc_get_bitmap_block(inode, group, 0,
+						    &bitmap_bh);
+		if (ret < 0) {
+			brelse(desc_bh);
+			return ret;
+		}
+		desc_kaddr = kmap(desc_bh->b_page);
+		desc = nilfs_palloc_block_get_group_desc(
+			inode, group, desc_bh, desc_kaddr);
+		bitmap_kaddr = kmap(bitmap_bh->b_page);
+		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
+		for (j = i, n = 0;
+		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
+							      entry_nrs[j]);
+		     j++) {
+			nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
+			if (!nilfs_clear_bit_atomic(
+				    nilfs_mdt_bgl_lock(inode, group),
+				    group_offset, bitmap)) {
+				printk(KERN_WARNING
+				       "%s: entry number %llu already freed\n",
+				       __func__,
+				       (unsigned long long)entry_nrs[j]);
+			} else {
+				n++;
+			}
+		}
+		nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+
+		kunmap(bitmap_bh->b_page);
+		kunmap(desc_bh->b_page);
+
+		mark_buffer_dirty(desc_bh);
+		mark_buffer_dirty(bitmap_bh);
+		nilfs_mdt_mark_dirty(inode);
+
+		brelse(bitmap_bh);
+		brelse(desc_bh);
+	}
+	return 0;
+}
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache)
+{
+	NILFS_MDT(inode)->mi_palloc_cache = cache;
+	spin_lock_init(&cache->lock);
+}
+
+void nilfs_palloc_clear_cache(struct inode *inode)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	spin_lock(&cache->lock);
+	brelse(cache->prev_desc.bh);
+	brelse(cache->prev_bitmap.bh);
+	brelse(cache->prev_entry.bh);
+	cache->prev_desc.bh = NULL;
+	cache->prev_bitmap.bh = NULL;
+	cache->prev_entry.bh = NULL;
+	spin_unlock(&cache->lock);
+}
+
+void nilfs_palloc_destroy_cache(struct inode *inode)
+{
+	nilfs_palloc_clear_cache(inode);
+	NILFS_MDT(inode)->mi_palloc_cache = NULL;
+}
diff --git a/kernel/fs/nilfs2/alloc.h b/kernel/fs/nilfs2/alloc.h
new file mode 100644
index 000000000..4bd6451b5
--- /dev/null
+++ b/kernel/fs/nilfs2/alloc.h
@@ -0,0 +1,110 @@
+/*
+ * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Original code was written by Koji Sato <koji@osrg.net>.
+ * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
+ *                                Amagai Yoshiji <amagai@osrg.net>.
+ */
+
+#ifndef _NILFS_ALLOC_H
+#define _NILFS_ALLOC_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+
+/**
+ * nilfs_palloc_entries_per_group - get the number of entries per group
+ * @inode: inode of metadata file using this allocator
+ *
+ * The number of entries per group is defined by the number of bits
+ * that a bitmap block can maintain.
+ */
+static inline unsigned long
+nilfs_palloc_entries_per_group(const struct inode *inode)
+{
+	return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */);
+}
+
+int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
+int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
+				 struct buffer_head **);
+void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
+				   const struct buffer_head *, void *);
+
+int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
+
+/**
+ * nilfs_palloc_req - persistent allocator request and reply
+ * @pr_entry_nr: entry number (vblocknr or inode number)
+ * @pr_desc_bh: buffer head of the buffer containing block group descriptors
+ * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
+ * @pr_entry_bh: buffer head of the buffer containing translation entries
+ */
+struct nilfs_palloc_req {
+	__u64 pr_entry_nr;
+	struct buffer_head *pr_desc_bh;
+	struct buffer_head *pr_bitmap_bh;
+	struct buffer_head *pr_entry_bh;
+};
+
+int nilfs_palloc_prepare_alloc_entry(struct inode *,
+				     struct nilfs_palloc_req *);
+void nilfs_palloc_commit_alloc_entry(struct inode *,
+				     struct nilfs_palloc_req *);
+void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
+void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *);
+int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *);
+void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *);
+int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
+
+#define nilfs_set_bit_atomic		ext2_set_bit_atomic
+#define nilfs_clear_bit_atomic		ext2_clear_bit_atomic
+#define nilfs_find_next_zero_bit	find_next_zero_bit_le
+
+/**
+ * struct nilfs_bh_assoc - block offset and buffer head association
+ * @blkoff: block offset
+ * @bh: buffer head
+ */
+struct nilfs_bh_assoc {
+	unsigned long blkoff;
+	struct buffer_head *bh;
+};
+
+/**
+ * struct nilfs_palloc_cache - persistent object allocator cache
+ * @lock: cache protecting lock
+ * @prev_desc: blockgroup descriptors cache
+ * @prev_bitmap: blockgroup bitmap cache
+ * @prev_entry: translation entries cache
+ */
+struct nilfs_palloc_cache {
+	spinlock_t lock;
+	struct nilfs_bh_assoc prev_desc;
+	struct nilfs_bh_assoc prev_bitmap;
+	struct nilfs_bh_assoc prev_entry;
+};
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache);
+void nilfs_palloc_clear_cache(struct inode *inode);
+void nilfs_palloc_destroy_cache(struct inode *inode);
+
+#endif	/* _NILFS_ALLOC_H */
diff --git a/kernel/fs/nilfs2/bmap.c b/kernel/fs/nilfs2/bmap.c
new file mode 100644
index 000000000..27f75bcbe
--- /dev/null
+++ b/kernel/fs/nilfs2/bmap.c
@@ -0,0 +1,593 @@
+/*
+ * bmap.c - NILFS block mapping.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include "nilfs.h"
+#include "bmap.h"
+#include "btree.h"
+#include "direct.h"
+#include "btnode.h"
+#include "mdt.h"
+#include "dat.h"
+#include "alloc.h"
+
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
+{
+	struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info;
+
+	return nilfs->ns_dat;
+}
+
+static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
+				     const char *fname, int err)
+{
+	struct inode *inode = bmap->b_inode;
+
+	if (err == -EINVAL) {
+		nilfs_error(inode->i_sb, fname,
+			    "broken bmap (inode number=%lu)\n", inode->i_ino);
+		err = -EIO;
+	}
+	return err;
+}
+
+/**
+ * nilfs_bmap_lookup_at_level - find a data block or node block
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ * @ptrp: place to store the value associated to @key
+ *
+ * Description: nilfs_bmap_lookup_at_level() finds a record whose key
+ * matches @key in the block at @level of the bmap.
+ *
+ * Return Value: On success, 0 is returned and the record associated with @key
+ * is stored in the place pointed by @ptrp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
+int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
+			       __u64 *ptrp)
+{
+	sector_t blocknr;
+	int ret;
+
+	down_read(&bmap->b_sem);
+	ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
+	if (ret < 0) {
+		ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+		goto out;
+	}
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
+					  &blocknr);
+		if (!ret)
+			*ptrp = blocknr;
+	}
+
+ out:
+	up_read(&bmap->b_sem);
+	return ret;
+}
+
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
+			     unsigned maxblocks)
+{
+	int ret;
+
+	down_read(&bmap->b_sem);
+	ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
+	up_read(&bmap->b_sem);
+
+	return nilfs_bmap_convert_error(bmap, __func__, ret);
+}
+
+static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
+{
+	__u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
+	__u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1];
+	int ret, n;
+
+	if (bmap->b_ops->bop_check_insert != NULL) {
+		ret = bmap->b_ops->bop_check_insert(bmap, key);
+		if (ret > 0) {
+			n = bmap->b_ops->bop_gather_data(
+				bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1);
+			if (n < 0)
+				return n;
+			ret = nilfs_btree_convert_and_insert(
+				bmap, key, ptr, keys, ptrs, n);
+			if (ret == 0)
+				bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
+
+			return ret;
+		} else if (ret < 0)
+			return ret;
+	}
+
+	return bmap->b_ops->bop_insert(bmap, key, ptr);
+}
+
+/**
+ * nilfs_bmap_insert - insert a new key-record pair into a bmap
+ * @bmap: bmap
+ * @key: key
+ * @rec: record
+ *
+ * Description: nilfs_bmap_insert() inserts the new key-record pair specified
+ * by @key and @rec into @bmap.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EEXIST - A record associated with @key already exist.
+ */
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = nilfs_bmap_do_insert(bmap, key, rec);
+	up_write(&bmap->b_sem);
+
+	return nilfs_bmap_convert_error(bmap, __func__, ret);
+}
+
+static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
+{
+	__u64 keys[NILFS_BMAP_LARGE_LOW + 1];
+	__u64 ptrs[NILFS_BMAP_LARGE_LOW + 1];
+	int ret, n;
+
+	if (bmap->b_ops->bop_check_delete != NULL) {
+		ret = bmap->b_ops->bop_check_delete(bmap, key);
+		if (ret > 0) {
+			n = bmap->b_ops->bop_gather_data(
+				bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1);
+			if (n < 0)
+				return n;
+			ret = nilfs_direct_delete_and_convert(
+				bmap, key, keys, ptrs, n);
+			if (ret == 0)
+				bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
+
+			return ret;
+		} else if (ret < 0)
+			return ret;
+	}
+
+	return bmap->b_ops->bop_delete(bmap, key);
+}
+
+/**
+ * nilfs_bmap_seek_key - seek a valid entry and return its key
+ * @bmap: bmap struct
+ * @start: start key number
+ * @keyp: place to store valid key
+ *
+ * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap
+ * starting from @start, and stores it to @keyp if found.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No valid entry was found
+ */
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp)
+{
+	int ret;
+
+	down_read(&bmap->b_sem);
+	ret = bmap->b_ops->bop_seek_key(bmap, start, keyp);
+	up_read(&bmap->b_sem);
+
+	if (ret < 0)
+		ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+	return ret;
+}
+
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
+{
+	int ret;
+
+	down_read(&bmap->b_sem);
+	ret = bmap->b_ops->bop_last_key(bmap, keyp);
+	up_read(&bmap->b_sem);
+
+	if (ret < 0)
+		ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+	return ret;
+}
+
+/**
+ * nilfs_bmap_delete - delete a key-record pair from a bmap
+ * @bmap: bmap
+ * @key: key
+ *
+ * Description: nilfs_bmap_delete() deletes the key-record pair specified by
+ * @key from @bmap.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = nilfs_bmap_do_delete(bmap, key);
+	up_write(&bmap->b_sem);
+
+	return nilfs_bmap_convert_error(bmap, __func__, ret);
+}
+
+static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key)
+{
+	__u64 lastkey;
+	int ret;
+
+	ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = 0;
+		return ret;
+	}
+
+	while (key <= lastkey) {
+		ret = nilfs_bmap_do_delete(bmap, lastkey);
+		if (ret < 0)
+			return ret;
+		ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			return ret;
+		}
+	}
+	return 0;
+}
+
+/**
+ * nilfs_bmap_truncate - truncate a bmap to a specified key
+ * @bmap: bmap
+ * @key: key
+ *
+ * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are
+ * greater than or equal to @key from @bmap.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = nilfs_bmap_do_truncate(bmap, key);
+	up_write(&bmap->b_sem);
+
+	return nilfs_bmap_convert_error(bmap, __func__, ret);
+}
+
+/**
+ * nilfs_bmap_clear - free resources a bmap holds
+ * @bmap: bmap
+ *
+ * Description: nilfs_bmap_clear() frees resources associated with @bmap.
+ */
+void nilfs_bmap_clear(struct nilfs_bmap *bmap)
+{
+	down_write(&bmap->b_sem);
+	if (bmap->b_ops->bop_clear != NULL)
+		bmap->b_ops->bop_clear(bmap);
+	up_write(&bmap->b_sem);
+}
+
+/**
+ * nilfs_bmap_propagate - propagate dirty state
+ * @bmap: bmap
+ * @bh: buffer head
+ *
+ * Description: nilfs_bmap_propagate() marks the buffers that directly or
+ * indirectly refer to the block specified by @bh dirty.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = bmap->b_ops->bop_propagate(bmap, bh);
+	up_write(&bmap->b_sem);
+
+	return nilfs_bmap_convert_error(bmap, __func__, ret);
+}
+
+/**
+ * nilfs_bmap_lookup_dirty_buffers -
+ * @bmap: bmap
+ * @listp: pointer to buffer head list
+ */
+void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
+				     struct list_head *listp)
+{
+	if (bmap->b_ops->bop_lookup_dirty_buffers != NULL)
+		bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp);
+}
+
+/**
+ * nilfs_bmap_assign - assign a new block number to a block
+ * @bmap: bmap
+ * @bhp: pointer to buffer head
+ * @blocknr: block number
+ * @binfo: block information
+ *
+ * Description: nilfs_bmap_assign() assigns the block number @blocknr to the
+ * buffer specified by @bh.
+ *
+ * Return Value: On success, 0 is returned and the buffer head of a newly
+ * create buffer and the block information associated with the buffer are
+ * stored in the place pointed by @bh and @binfo, respectively. On error, one
+ * of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_assign(struct nilfs_bmap *bmap,
+		      struct buffer_head **bh,
+		      unsigned long blocknr,
+		      union nilfs_binfo *binfo)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
+	up_write(&bmap->b_sem);
+
+	return nilfs_bmap_convert_error(bmap, __func__, ret);
+}
+
+/**
+ * nilfs_bmap_mark - mark block dirty
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ *
+ * Description: nilfs_bmap_mark() marks the block specified by @key and @level
+ * as dirty.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
+{
+	int ret;
+
+	if (bmap->b_ops->bop_mark == NULL)
+		return 0;
+
+	down_write(&bmap->b_sem);
+	ret = bmap->b_ops->bop_mark(bmap, key, level);
+	up_write(&bmap->b_sem);
+
+	return nilfs_bmap_convert_error(bmap, __func__, ret);
+}
+
+/**
+ * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state
+ * @bmap: bmap
+ *
+ * Description: nilfs_test_and_clear() is the atomic operation to test and
+ * clear the dirty state of @bmap.
+ *
+ * Return Value: 1 is returned if @bmap is dirty, or 0 if clear.
+ */
+int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = nilfs_bmap_dirty(bmap);
+	nilfs_bmap_clear_dirty(bmap);
+	up_write(&bmap->b_sem);
+	return ret;
+}
+
+
+/*
+ * Internal use only
+ */
+__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
+			      const struct buffer_head *bh)
+{
+	struct buffer_head *pbh;
+	__u64 key;
+
+	key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
+					 bmap->b_inode->i_blkbits);
+	for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
+		key++;
+
+	return key;
+}
+
+__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
+{
+	__s64 diff;
+
+	diff = key - bmap->b_last_allocated_key;
+	if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) &&
+	    (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) &&
+	    (bmap->b_last_allocated_ptr + diff > 0))
+		return bmap->b_last_allocated_ptr + diff;
+	else
+		return NILFS_BMAP_INVALID_PTR;
+}
+
+#define NILFS_BMAP_GROUP_DIV	8
+__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
+{
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
+	unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat);
+	unsigned long group = bmap->b_inode->i_ino / entries_per_group;
+
+	return group * entries_per_group +
+		(bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) *
+		(entries_per_group / NILFS_BMAP_GROUP_DIV);
+}
+
+static struct lock_class_key nilfs_bmap_dat_lock_key;
+static struct lock_class_key nilfs_bmap_mdt_lock_key;
+
+/**
+ * nilfs_bmap_read - read a bmap from an inode
+ * @bmap: bmap
+ * @raw_inode: on-disk inode
+ *
+ * Description: nilfs_bmap_read() initializes the bmap @bmap.
+ *
+ * Return Value: On success, 0 is returned. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
+{
+	if (raw_inode == NULL)
+		memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE);
+	else
+		memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE);
+
+	init_rwsem(&bmap->b_sem);
+	bmap->b_state = 0;
+	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+	switch (bmap->b_inode->i_ino) {
+	case NILFS_DAT_INO:
+		bmap->b_ptr_type = NILFS_BMAP_PTR_P;
+		bmap->b_last_allocated_key = 0;
+		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
+		break;
+	case NILFS_CPFILE_INO:
+	case NILFS_SUFILE_INO:
+		bmap->b_ptr_type = NILFS_BMAP_PTR_VS;
+		bmap->b_last_allocated_key = 0;
+		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
+		break;
+	case NILFS_IFILE_INO:
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
+		/* Fall through */
+	default:
+		bmap->b_ptr_type = NILFS_BMAP_PTR_VM;
+		bmap->b_last_allocated_key = 0;
+		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+		break;
+	}
+
+	return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
+		nilfs_btree_init(bmap) : nilfs_direct_init(bmap);
+}
+
+/**
+ * nilfs_bmap_write - write back a bmap to an inode
+ * @bmap: bmap
+ * @raw_inode: on-disk inode
+ *
+ * Description: nilfs_bmap_write() stores @bmap in @raw_inode.
+ */
+void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
+{
+	down_write(&bmap->b_sem);
+	memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
+	       NILFS_INODE_BMAP_SIZE * sizeof(__le64));
+	if (bmap->b_inode->i_ino == NILFS_DAT_INO)
+		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
+
+	up_write(&bmap->b_sem);
+}
+
+void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
+{
+	memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
+	init_rwsem(&bmap->b_sem);
+	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+	bmap->b_ptr_type = NILFS_BMAP_PTR_U;
+	bmap->b_last_allocated_key = 0;
+	bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+	bmap->b_state = 0;
+	nilfs_btree_init_gc(bmap);
+}
+
+void nilfs_bmap_save(const struct nilfs_bmap *bmap,
+		     struct nilfs_bmap_store *store)
+{
+	memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
+	store->last_allocated_key = bmap->b_last_allocated_key;
+	store->last_allocated_ptr = bmap->b_last_allocated_ptr;
+	store->state = bmap->b_state;
+}
+
+void nilfs_bmap_restore(struct nilfs_bmap *bmap,
+			const struct nilfs_bmap_store *store)
+{
+	memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
+	bmap->b_last_allocated_key = store->last_allocated_key;
+	bmap->b_last_allocated_ptr = store->last_allocated_ptr;
+	bmap->b_state = store->state;
+}
diff --git a/kernel/fs/nilfs2/bmap.h b/kernel/fs/nilfs2/bmap.h
new file mode 100644
index 000000000..bfa817ce4
--- /dev/null
+++ b/kernel/fs/nilfs2/bmap.h
@@ -0,0 +1,280 @@
+/*
+ * bmap.h - NILFS block mapping.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_BMAP_H
+#define _NILFS_BMAP_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "alloc.h"
+#include "dat.h"
+
+#define NILFS_BMAP_INVALID_PTR	0
+
+#define nilfs_bmap_keydiff_abs(diff)	((diff) < 0 ? -(diff) : (diff))
+
+
+struct nilfs_bmap;
+
+/**
+ * union nilfs_bmap_ptr_req - request for bmap ptr
+ * @bpr_ptr: bmap pointer
+ * @bpr_req: request for persistent allocator
+ */
+union nilfs_bmap_ptr_req {
+	__u64 bpr_ptr;
+	struct nilfs_palloc_req bpr_req;
+};
+
+/**
+ * struct nilfs_bmap_stats - bmap statistics
+ * @bs_nblocks: number of blocks created or deleted
+ */
+struct nilfs_bmap_stats {
+	unsigned int bs_nblocks;
+};
+
+/**
+ * struct nilfs_bmap_operations - bmap operation table
+ */
+struct nilfs_bmap_operations {
+	int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
+	int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
+				 unsigned);
+	int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
+	int (*bop_delete)(struct nilfs_bmap *, __u64);
+	void (*bop_clear)(struct nilfs_bmap *);
+
+	int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *);
+	void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
+					 struct list_head *);
+
+	int (*bop_assign)(struct nilfs_bmap *,
+			  struct buffer_head **,
+			  sector_t,
+			  union nilfs_binfo *);
+	int (*bop_mark)(struct nilfs_bmap *, __u64, int);
+
+	int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *);
+	int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
+
+	/* The following functions are internal use only. */
+	int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
+	int (*bop_check_delete)(struct nilfs_bmap *, __u64);
+	int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
+};
+
+
+#define NILFS_BMAP_SIZE		(NILFS_INODE_BMAP_SIZE * sizeof(__le64))
+#define NILFS_BMAP_KEY_BIT	(sizeof(unsigned long) * 8 /* CHAR_BIT */)
+#define NILFS_BMAP_NEW_PTR_INIT	\
+	(1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
+
+static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
+{
+	return !!(ptr & NILFS_BMAP_NEW_PTR_INIT);
+}
+
+
+/**
+ * struct nilfs_bmap - bmap structure
+ * @b_u: raw data
+ * @b_sem: semaphore
+ * @b_inode: owner of bmap
+ * @b_ops: bmap operation table
+ * @b_last_allocated_key: last allocated key for data block
+ * @b_last_allocated_ptr: last allocated ptr for data block
+ * @b_ptr_type: pointer type
+ * @b_state: state
+ * @b_nchildren_per_block: maximum number of child nodes for non-root nodes
+ */
+struct nilfs_bmap {
+	union {
+		__u8 u_flags;
+		__le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)];
+	} b_u;
+	struct rw_semaphore b_sem;
+	struct inode *b_inode;
+	const struct nilfs_bmap_operations *b_ops;
+	__u64 b_last_allocated_key;
+	__u64 b_last_allocated_ptr;
+	int b_ptr_type;
+	int b_state;
+	__u16 b_nchildren_per_block;
+};
+
+/* pointer type */
+#define NILFS_BMAP_PTR_P	0	/* physical block number (i.e. LBN) */
+#define NILFS_BMAP_PTR_VS	1	/* virtual block number (single
+					   version) */
+#define NILFS_BMAP_PTR_VM	2	/* virtual block number (has multiple
+					   versions) */
+#define NILFS_BMAP_PTR_U	(-1)	/* never perform pointer operations */
+
+#define NILFS_BMAP_USE_VBN(bmap)	((bmap)->b_ptr_type > 0)
+
+/* state */
+#define NILFS_BMAP_DIRTY	0x00000001
+
+/**
+ * struct nilfs_bmap_store - shadow copy of bmap state
+ * @data: cached raw block mapping of on-disk inode
+ * @last_allocated_key: cached value of last allocated key for data block
+ * @last_allocated_ptr: cached value of last allocated ptr for data block
+ * @state: cached value of state field of bmap structure
+ */
+struct nilfs_bmap_store {
+	__le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];
+	__u64 last_allocated_key;
+	__u64 last_allocated_ptr;
+	int state;
+};
+
+int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
+int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
+void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
+void nilfs_bmap_clear(struct nilfs_bmap *);
+int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
+void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
+int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **,
+		      unsigned long, union nilfs_binfo *);
+int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
+int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
+
+void nilfs_bmap_init_gc(struct nilfs_bmap *);
+
+void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
+void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
+
+static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
+				    __u64 *ptr)
+{
+	return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
+}
+
+/*
+ * Internal use only
+ */
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
+
+static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
+{
+	if (dat)
+		return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
+	/* ignore target ptr */
+	req->bpr_ptr = bmap->b_last_allocated_ptr++;
+	return 0;
+}
+
+static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
+{
+	if (dat)
+		nilfs_dat_commit_alloc(dat, &req->bpr_req);
+}
+
+static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
+					      union nilfs_bmap_ptr_req *req,
+					      struct inode *dat)
+{
+	if (dat)
+		nilfs_dat_abort_alloc(dat, &req->bpr_req);
+	else
+		bmap->b_last_allocated_ptr--;
+}
+
+static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
+{
+	return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
+}
+
+static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
+{
+	if (dat)
+		nilfs_dat_commit_end(dat, &req->bpr_req,
+				     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
+}
+
+static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
+					    union nilfs_bmap_ptr_req *req,
+					    struct inode *dat)
+{
+	if (dat)
+		nilfs_dat_abort_end(dat, &req->bpr_req);
+}
+
+static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key,
+					   __u64 ptr)
+{
+	bmap->b_last_allocated_key = key;
+	bmap->b_last_allocated_ptr = ptr;
+}
+
+__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
+			      const struct buffer_head *);
+
+__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
+__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
+
+
+/* Assume that bmap semaphore is locked. */
+static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
+{
+	return !!(bmap->b_state & NILFS_BMAP_DIRTY);
+}
+
+/* Assume that bmap semaphore is locked. */
+static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap)
+{
+	bmap->b_state |= NILFS_BMAP_DIRTY;
+}
+
+/* Assume that bmap semaphore is locked. */
+static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap)
+{
+	bmap->b_state &= ~NILFS_BMAP_DIRTY;
+}
+
+
+#define NILFS_BMAP_LARGE	0x1
+
+#define NILFS_BMAP_SMALL_LOW	NILFS_DIRECT_KEY_MIN
+#define NILFS_BMAP_SMALL_HIGH	NILFS_DIRECT_KEY_MAX
+#define NILFS_BMAP_LARGE_LOW	NILFS_BTREE_ROOT_NCHILDREN_MAX
+#define NILFS_BMAP_LARGE_HIGH	NILFS_BTREE_KEY_MAX
+
+#endif	/* _NILFS_BMAP_H */
diff --git a/kernel/fs/nilfs2/btnode.c b/kernel/fs/nilfs2/btnode.c
new file mode 100644
index 000000000..a35ae35e6
--- /dev/null
+++ b/kernel/fs/nilfs2/btnode.c
@@ -0,0 +1,297 @@
+/*
+ * btnode.c - NILFS B-tree node cache
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * This file was originally written by Seiji Kihara <kihara@osrg.net>
+ * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
+ * stabilization and simplification.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <linux/gfp.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "dat.h"
+#include "page.h"
+#include "btnode.h"
+
+void nilfs_btnode_cache_clear(struct address_space *btnc)
+{
+	invalidate_mapping_pages(btnc, 0, -1);
+	truncate_inode_pages(btnc, 0);
+}
+
+struct buffer_head *
+nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
+{
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	struct buffer_head *bh;
+
+	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+	if (unlikely(!bh))
+		return NULL;
+
+	if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+		     buffer_dirty(bh))) {
+		brelse(bh);
+		BUG();
+	}
+	memset(bh->b_data, 0, 1 << inode->i_blkbits);
+	bh->b_bdev = inode->i_sb->s_bdev;
+	bh->b_blocknr = blocknr;
+	set_buffer_mapped(bh);
+	set_buffer_uptodate(bh);
+
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	return bh;
+}
+
+int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
+			      sector_t pblocknr, int mode,
+			      struct buffer_head **pbh, sector_t *submit_ptr)
+{
+	struct buffer_head *bh;
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	struct page *page;
+	int err;
+
+	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+	if (unlikely(!bh))
+		return -ENOMEM;
+
+	err = -EEXIST; /* internal code */
+	page = bh->b_page;
+
+	if (buffer_uptodate(bh) || buffer_dirty(bh))
+		goto found;
+
+	if (pblocknr == 0) {
+		pblocknr = blocknr;
+		if (inode->i_ino != NILFS_DAT_INO) {
+			struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+			/* blocknr is a virtual block number */
+			err = nilfs_dat_translate(nilfs->ns_dat, blocknr,
+						  &pblocknr);
+			if (unlikely(err)) {
+				brelse(bh);
+				goto out_locked;
+			}
+		}
+	}
+
+	if (mode == READA) {
+		if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
+			err = -EBUSY; /* internal code */
+			brelse(bh);
+			goto out_locked;
+		}
+	} else { /* mode == READ */
+		lock_buffer(bh);
+	}
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		err = -EEXIST; /* internal code */
+		goto found;
+	}
+	set_buffer_mapped(bh);
+	bh->b_bdev = inode->i_sb->s_bdev;
+	bh->b_blocknr = pblocknr; /* set block address for read */
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);
+	submit_bh(mode, bh);
+	bh->b_blocknr = blocknr; /* set back to the given block address */
+	*submit_ptr = pblocknr;
+	err = 0;
+found:
+	*pbh = bh;
+
+out_locked:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+/**
+ * nilfs_btnode_delete - delete B-tree node buffer
+ * @bh: buffer to be deleted
+ *
+ * nilfs_btnode_delete() invalidates the specified buffer and delete the page
+ * including the buffer if the page gets unbusy.
+ */
+void nilfs_btnode_delete(struct buffer_head *bh)
+{
+	struct address_space *mapping;
+	struct page *page = bh->b_page;
+	pgoff_t index = page_index(page);
+	int still_dirty;
+
+	page_cache_get(page);
+	lock_page(page);
+	wait_on_page_writeback(page);
+
+	nilfs_forget_buffer(bh);
+	still_dirty = PageDirty(page);
+	mapping = page->mapping;
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (!still_dirty && mapping)
+		invalidate_inode_pages2_range(mapping, index, index);
+}
+
+/**
+ * nilfs_btnode_prepare_change_key
+ *  prepare to move contents of the block for old key to one of new key.
+ *  the old buffer will not be removed, but might be reused for new buffer.
+ *  it might return -ENOMEM because of memory allocation errors,
+ *  and might return -EIO because of disk read errors.
+ */
+int nilfs_btnode_prepare_change_key(struct address_space *btnc,
+				    struct nilfs_btnode_chkey_ctxt *ctxt)
+{
+	struct buffer_head *obh, *nbh;
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
+	int err;
+
+	if (oldkey == newkey)
+		return 0;
+
+	obh = ctxt->bh;
+	ctxt->newbh = NULL;
+
+	if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
+		lock_page(obh->b_page);
+		/*
+		 * We cannot call radix_tree_preload for the kernels older
+		 * than 2.6.23, because it is not exported for modules.
+		 */
+retry:
+		err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+		if (err)
+			goto failed_unlock;
+		/* BUG_ON(oldkey != obh->b_page->index); */
+		if (unlikely(oldkey != obh->b_page->index))
+			NILFS_PAGE_BUG(obh->b_page,
+				       "invalid oldkey %lld (newkey=%lld)",
+				       (unsigned long long)oldkey,
+				       (unsigned long long)newkey);
+
+		spin_lock_irq(&btnc->tree_lock);
+		err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
+		spin_unlock_irq(&btnc->tree_lock);
+		/*
+		 * Note: page->index will not change to newkey until
+		 * nilfs_btnode_commit_change_key() will be called.
+		 * To protect the page in intermediate state, the page lock
+		 * is held.
+		 */
+		radix_tree_preload_end();
+		if (!err)
+			return 0;
+		else if (err != -EEXIST)
+			goto failed_unlock;
+
+		err = invalidate_inode_pages2_range(btnc, newkey, newkey);
+		if (!err)
+			goto retry;
+		/* fallback to copy mode */
+		unlock_page(obh->b_page);
+	}
+
+	nbh = nilfs_btnode_create_block(btnc, newkey);
+	if (!nbh)
+		return -ENOMEM;
+
+	BUG_ON(nbh == obh);
+	ctxt->newbh = nbh;
+	return 0;
+
+ failed_unlock:
+	unlock_page(obh->b_page);
+	return err;
+}
+
+/**
+ * nilfs_btnode_commit_change_key
+ *  commit the change_key operation prepared by prepare_change_key().
+ */
+void nilfs_btnode_commit_change_key(struct address_space *btnc,
+				    struct nilfs_btnode_chkey_ctxt *ctxt)
+{
+	struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
+	__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
+	struct page *opage;
+
+	if (oldkey == newkey)
+		return;
+
+	if (nbh == NULL) {	/* blocksize == pagesize */
+		opage = obh->b_page;
+		if (unlikely(oldkey != opage->index))
+			NILFS_PAGE_BUG(opage,
+				       "invalid oldkey %lld (newkey=%lld)",
+				       (unsigned long long)oldkey,
+				       (unsigned long long)newkey);
+		mark_buffer_dirty(obh);
+
+		spin_lock_irq(&btnc->tree_lock);
+		radix_tree_delete(&btnc->page_tree, oldkey);
+		radix_tree_tag_set(&btnc->page_tree, newkey,
+				   PAGECACHE_TAG_DIRTY);
+		spin_unlock_irq(&btnc->tree_lock);
+
+		opage->index = obh->b_blocknr = newkey;
+		unlock_page(opage);
+	} else {
+		nilfs_copy_buffer(nbh, obh);
+		mark_buffer_dirty(nbh);
+
+		nbh->b_blocknr = newkey;
+		ctxt->bh = nbh;
+		nilfs_btnode_delete(obh); /* will decrement bh->b_count */
+	}
+}
+
+/**
+ * nilfs_btnode_abort_change_key
+ *  abort the change_key operation prepared by prepare_change_key().
+ */
+void nilfs_btnode_abort_change_key(struct address_space *btnc,
+				   struct nilfs_btnode_chkey_ctxt *ctxt)
+{
+	struct buffer_head *nbh = ctxt->newbh;
+	__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
+
+	if (oldkey == newkey)
+		return;
+
+	if (nbh == NULL) {	/* blocksize == pagesize */
+		spin_lock_irq(&btnc->tree_lock);
+		radix_tree_delete(&btnc->page_tree, newkey);
+		spin_unlock_irq(&btnc->tree_lock);
+		unlock_page(ctxt->bh->b_page);
+	} else
+		brelse(nbh);
+}
diff --git a/kernel/fs/nilfs2/btnode.h b/kernel/fs/nilfs2/btnode.h
new file mode 100644
index 000000000..d876b565c
--- /dev/null
+++ b/kernel/fs/nilfs2/btnode.h
@@ -0,0 +1,59 @@
+/*
+ * btnode.h - NILFS B-tree node cache
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Seiji Kihara <kihara@osrg.net>
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#ifndef _NILFS_BTNODE_H
+#define _NILFS_BTNODE_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+
+/**
+ * struct nilfs_btnode_chkey_ctxt - change key context
+ * @oldkey: old key of block's moving content
+ * @newkey: new key for block's content
+ * @bh: buffer head of old buffer
+ * @newbh: buffer head of new buffer
+ */
+struct nilfs_btnode_chkey_ctxt {
+	__u64 oldkey;
+	__u64 newkey;
+	struct buffer_head *bh;
+	struct buffer_head *newbh;
+};
+
+void nilfs_btnode_cache_clear(struct address_space *);
+struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
+					      __u64 blocknr);
+int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
+			      struct buffer_head **, sector_t *);
+void nilfs_btnode_delete(struct buffer_head *);
+int nilfs_btnode_prepare_change_key(struct address_space *,
+				    struct nilfs_btnode_chkey_ctxt *);
+void nilfs_btnode_commit_change_key(struct address_space *,
+				    struct nilfs_btnode_chkey_ctxt *);
+void nilfs_btnode_abort_change_key(struct address_space *,
+				   struct nilfs_btnode_chkey_ctxt *);
+
+#endif	/* _NILFS_BTNODE_H */
diff --git a/kernel/fs/nilfs2/btree.c b/kernel/fs/nilfs2/btree.c
new file mode 100644
index 000000000..919fd5bb1
--- /dev/null
+++ b/kernel/fs/nilfs2/btree.c
@@ -0,0 +1,2414 @@
+/*
+ * btree.c - NILFS B-tree.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/pagevec.h>
+#include "nilfs.h"
+#include "page.h"
+#include "btnode.h"
+#include "btree.h"
+#include "alloc.h"
+#include "dat.h"
+
+static void __nilfs_btree_init(struct nilfs_bmap *bmap);
+
+static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
+{
+	struct nilfs_btree_path *path;
+	int level = NILFS_BTREE_LEVEL_DATA;
+
+	path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+	if (path == NULL)
+		goto out;
+
+	for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
+		path[level].bp_bh = NULL;
+		path[level].bp_sib_bh = NULL;
+		path[level].bp_index = 0;
+		path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
+		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
+		path[level].bp_op = NULL;
+	}
+
+out:
+	return path;
+}
+
+static void nilfs_btree_free_path(struct nilfs_btree_path *path)
+{
+	int level = NILFS_BTREE_LEVEL_DATA;
+
+	for (; level < NILFS_BTREE_LEVEL_MAX; level++)
+		brelse(path[level].bp_bh);
+
+	kmem_cache_free(nilfs_btree_path_cache, path);
+}
+
+/*
+ * B-tree node operations
+ */
+static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
+				     __u64 ptr, struct buffer_head **bhp)
+{
+	struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+	struct buffer_head *bh;
+
+	bh = nilfs_btnode_create_block(btnc, ptr);
+	if (!bh)
+		return -ENOMEM;
+
+	set_buffer_nilfs_volatile(bh);
+	*bhp = bh;
+	return 0;
+}
+
+static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
+{
+	return node->bn_flags;
+}
+
+static void
+nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
+{
+	node->bn_flags = flags;
+}
+
+static int nilfs_btree_node_root(const struct nilfs_btree_node *node)
+{
+	return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
+}
+
+static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
+{
+	return node->bn_level;
+}
+
+static void
+nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
+{
+	node->bn_level = level;
+}
+
+static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
+{
+	return le16_to_cpu(node->bn_nchildren);
+}
+
+static void
+nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
+{
+	node->bn_nchildren = cpu_to_le16(nchildren);
+}
+
+static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
+{
+	return 1 << btree->b_inode->i_blkbits;
+}
+
+static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
+{
+	return btree->b_nchildren_per_block;
+}
+
+static __le64 *
+nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
+{
+	return (__le64 *)((char *)(node + 1) +
+			  (nilfs_btree_node_root(node) ?
+			   0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
+}
+
+static __le64 *
+nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax)
+{
+	return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax);
+}
+
+static __u64
+nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
+{
+	return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index));
+}
+
+static void
+nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
+{
+	*(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key);
+}
+
+static __u64
+nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index,
+			 int ncmax)
+{
+	return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index));
+}
+
+static void
+nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr,
+			 int ncmax)
+{
+	*(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr);
+}
+
+static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags,
+				  int level, int nchildren, int ncmax,
+				  const __u64 *keys, const __u64 *ptrs)
+{
+	__le64 *dkeys;
+	__le64 *dptrs;
+	int i;
+
+	nilfs_btree_node_set_flags(node, flags);
+	nilfs_btree_node_set_level(node, level);
+	nilfs_btree_node_set_nchildren(node, nchildren);
+
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
+	for (i = 0; i < nchildren; i++) {
+		dkeys[i] = cpu_to_le64(keys[i]);
+		dptrs[i] = cpu_to_le64(ptrs[i]);
+	}
+}
+
+/* Assume the buffer heads corresponding to left and right are locked. */
+static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
+				       struct nilfs_btree_node *right,
+				       int n, int lncmax, int rncmax)
+{
+	__le64 *ldkeys, *rdkeys;
+	__le64 *ldptrs, *rdptrs;
+	int lnchildren, rnchildren;
+
+	ldkeys = nilfs_btree_node_dkeys(left);
+	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
+
+	rdkeys = nilfs_btree_node_dkeys(right);
+	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
+
+	memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
+	memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
+	memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
+	memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));
+
+	lnchildren += n;
+	rnchildren -= n;
+	nilfs_btree_node_set_nchildren(left, lnchildren);
+	nilfs_btree_node_set_nchildren(right, rnchildren);
+}
+
+/* Assume that the buffer heads corresponding to left and right are locked. */
+static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
+					struct nilfs_btree_node *right,
+					int n, int lncmax, int rncmax)
+{
+	__le64 *ldkeys, *rdkeys;
+	__le64 *ldptrs, *rdptrs;
+	int lnchildren, rnchildren;
+
+	ldkeys = nilfs_btree_node_dkeys(left);
+	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
+
+	rdkeys = nilfs_btree_node_dkeys(right);
+	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
+
+	memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
+	memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
+	memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
+	memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));
+
+	lnchildren -= n;
+	rnchildren += n;
+	nilfs_btree_node_set_nchildren(left, lnchildren);
+	nilfs_btree_node_set_nchildren(right, rnchildren);
+}
+
+/* Assume that the buffer head corresponding to node is locked. */
+static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
+				    __u64 key, __u64 ptr, int ncmax)
+{
+	__le64 *dkeys;
+	__le64 *dptrs;
+	int nchildren;
+
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	if (index < nchildren) {
+		memmove(dkeys + index + 1, dkeys + index,
+			(nchildren - index) * sizeof(*dkeys));
+		memmove(dptrs + index + 1, dptrs + index,
+			(nchildren - index) * sizeof(*dptrs));
+	}
+	dkeys[index] = cpu_to_le64(key);
+	dptrs[index] = cpu_to_le64(ptr);
+	nchildren++;
+	nilfs_btree_node_set_nchildren(node, nchildren);
+}
+
+/* Assume that the buffer head corresponding to node is locked. */
+static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
+				    __u64 *keyp, __u64 *ptrp, int ncmax)
+{
+	__u64 key;
+	__u64 ptr;
+	__le64 *dkeys;
+	__le64 *dptrs;
+	int nchildren;
+
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
+	key = le64_to_cpu(dkeys[index]);
+	ptr = le64_to_cpu(dptrs[index]);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	if (keyp != NULL)
+		*keyp = key;
+	if (ptrp != NULL)
+		*ptrp = ptr;
+
+	if (index < nchildren - 1) {
+		memmove(dkeys + index, dkeys + index + 1,
+			(nchildren - index - 1) * sizeof(*dkeys));
+		memmove(dptrs + index, dptrs + index + 1,
+			(nchildren - index - 1) * sizeof(*dptrs));
+	}
+	nchildren--;
+	nilfs_btree_node_set_nchildren(node, nchildren);
+}
+
+static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
+				   __u64 key, int *indexp)
+{
+	__u64 nkey;
+	int index, low, high, s;
+
+	/* binary search */
+	low = 0;
+	high = nilfs_btree_node_get_nchildren(node) - 1;
+	index = 0;
+	s = 0;
+	while (low <= high) {
+		index = (low + high) / 2;
+		nkey = nilfs_btree_node_get_key(node, index);
+		if (nkey == key) {
+			s = 0;
+			goto out;
+		} else if (nkey < key) {
+			low = index + 1;
+			s = -1;
+		} else {
+			high = index - 1;
+			s = 1;
+		}
+	}
+
+	/* adjust index */
+	if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
+		if (s > 0 && index > 0)
+			index--;
+	} else if (s < 0)
+		index++;
+
+ out:
+	*indexp = index;
+
+	return s == 0;
+}
+
+/**
+ * nilfs_btree_node_broken - verify consistency of btree node
+ * @node: btree node block to be examined
+ * @size: node size (in bytes)
+ * @blocknr: block number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
+				   size_t size, sector_t blocknr)
+{
+	int level, flags, nchildren;
+	int ret = 0;
+
+	level = nilfs_btree_node_get_level(node);
+	flags = nilfs_btree_node_get_flags(node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+
+	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+		     level >= NILFS_BTREE_LEVEL_MAX ||
+		     (flags & NILFS_BTREE_NODE_ROOT) ||
+		     nchildren < 0 ||
+		     nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
+		printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
+		       "level = %d, flags = 0x%x, nchildren = %d\n",
+		       (unsigned long long)blocknr, level, flags, nchildren);
+		ret = 1;
+	}
+	return ret;
+}
+
+/**
+ * nilfs_btree_root_broken - verify consistency of btree root node
+ * @node: btree root node to be examined
+ * @ino: inode number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
+				   unsigned long ino)
+{
+	int level, flags, nchildren;
+	int ret = 0;
+
+	level = nilfs_btree_node_get_level(node);
+	flags = nilfs_btree_node_get_flags(node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+
+	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+		     level >= NILFS_BTREE_LEVEL_MAX ||
+		     nchildren < 0 ||
+		     nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
+		pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
+			ino, level, flags, nchildren);
+		ret = 1;
+	}
+	return ret;
+}
+
+int nilfs_btree_broken_node_block(struct buffer_head *bh)
+{
+	int ret;
+
+	if (buffer_nilfs_checked(bh))
+		return 0;
+
+	ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
+				       bh->b_size, bh->b_blocknr);
+	if (likely(!ret))
+		set_buffer_nilfs_checked(bh);
+	return ret;
+}
+
+static struct nilfs_btree_node *
+nilfs_btree_get_root(const struct nilfs_bmap *btree)
+{
+	return (struct nilfs_btree_node *)btree->b_u.u_data;
+}
+
+static struct nilfs_btree_node *
+nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
+{
+	return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
+}
+
+static struct nilfs_btree_node *
+nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
+{
+	return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
+}
+
+static int nilfs_btree_height(const struct nilfs_bmap *btree)
+{
+	return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
+}
+
+static struct nilfs_btree_node *
+nilfs_btree_get_node(const struct nilfs_bmap *btree,
+		     const struct nilfs_btree_path *path,
+		     int level, int *ncmaxp)
+{
+	struct nilfs_btree_node *node;
+
+	if (level == nilfs_btree_height(btree) - 1) {
+		node = nilfs_btree_get_root(btree);
+		*ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
+	} else {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		*ncmaxp = nilfs_btree_nchildren_per_block(btree);
+	}
+	return node;
+}
+
+static int
+nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+{
+	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
+		dump_stack();
+		printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
+		       nilfs_btree_node_get_level(node), level);
+		return 1;
+	}
+	return 0;
+}
+
+struct nilfs_btree_readahead_info {
+	struct nilfs_btree_node *node;	/* parent node */
+	int max_ra_blocks;		/* max nof blocks to read ahead */
+	int index;			/* current index on the parent node */
+	int ncmax;			/* nof children in the parent node */
+};
+
+static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
+				   struct buffer_head **bhp,
+				   const struct nilfs_btree_readahead_info *ra)
+{
+	struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+	struct buffer_head *bh, *ra_bh;
+	sector_t submit_ptr = 0;
+	int ret;
+
+	ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
+	if (ret) {
+		if (ret != -EEXIST)
+			return ret;
+		goto out_check;
+	}
+
+	if (ra) {
+		int i, n;
+		__u64 ptr2;
+
+		/* read ahead sibling nodes */
+		for (n = ra->max_ra_blocks, i = ra->index + 1;
+		     n > 0 && i < ra->ncmax; n--, i++) {
+			ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
+
+			ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
+							&ra_bh, &submit_ptr);
+			if (likely(!ret || ret == -EEXIST))
+				brelse(ra_bh);
+			else if (ret != -EBUSY)
+				break;
+			if (!buffer_locked(bh))
+				goto out_no_wait;
+		}
+	}
+
+	wait_on_buffer(bh);
+
+ out_no_wait:
+	if (!buffer_uptodate(bh)) {
+		brelse(bh);
+		return -EIO;
+	}
+
+ out_check:
+	if (nilfs_btree_broken_node_block(bh)) {
+		clear_buffer_uptodate(bh);
+		brelse(bh);
+		return -EINVAL;
+	}
+
+	*bhp = bh;
+	return 0;
+}
+
+static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
+				   struct buffer_head **bhp)
+{
+	return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
+}
+
+static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
+				 struct nilfs_btree_path *path,
+				 __u64 key, __u64 *ptrp, int minlevel,
+				 int readahead)
+{
+	struct nilfs_btree_node *node;
+	struct nilfs_btree_readahead_info p, *ra;
+	__u64 ptr;
+	int level, index, found, ncmax, ret;
+
+	node = nilfs_btree_get_root(btree);
+	level = nilfs_btree_node_get_level(node);
+	if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
+		return -ENOENT;
+
+	found = nilfs_btree_node_lookup(node, key, &index);
+	ptr = nilfs_btree_node_get_ptr(node, index,
+				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
+	path[level].bp_bh = NULL;
+	path[level].bp_index = index;
+
+	ncmax = nilfs_btree_nchildren_per_block(btree);
+
+	while (--level >= minlevel) {
+		ra = NULL;
+		if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
+			p.node = nilfs_btree_get_node(btree, path, level + 1,
+						      &p.ncmax);
+			p.index = index;
+			p.max_ra_blocks = 7;
+			ra = &p;
+		}
+		ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
+					      ra);
+		if (ret < 0)
+			return ret;
+
+		node = nilfs_btree_get_nonroot_node(path, level);
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
+		if (!found)
+			found = nilfs_btree_node_lookup(node, key, &index);
+		else
+			index = 0;
+		if (index < ncmax) {
+			ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
+		} else {
+			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
+			/* insert */
+			ptr = NILFS_BMAP_INVALID_PTR;
+		}
+		path[level].bp_index = index;
+	}
+	if (!found)
+		return -ENOENT;
+
+	if (ptrp != NULL)
+		*ptrp = ptr;
+
+	return 0;
+}
+
+static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
+				      struct nilfs_btree_path *path,
+				      __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node;
+	__u64 ptr;
+	int index, level, ncmax, ret;
+
+	node = nilfs_btree_get_root(btree);
+	index = nilfs_btree_node_get_nchildren(node) - 1;
+	if (index < 0)
+		return -ENOENT;
+	level = nilfs_btree_node_get_level(node);
+	ptr = nilfs_btree_node_get_ptr(node, index,
+				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
+	path[level].bp_bh = NULL;
+	path[level].bp_index = index;
+	ncmax = nilfs_btree_nchildren_per_block(btree);
+
+	for (level--; level > 0; level--) {
+		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
+		if (ret < 0)
+			return ret;
+		node = nilfs_btree_get_nonroot_node(path, level);
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
+		index = nilfs_btree_node_get_nchildren(node) - 1;
+		ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
+		path[level].bp_index = index;
+	}
+
+	if (keyp != NULL)
+		*keyp = nilfs_btree_node_get_key(node, index);
+	if (ptrp != NULL)
+		*ptrp = ptr;
+
+	return 0;
+}
+
+/**
+ * nilfs_btree_get_next_key - get next valid key from btree path array
+ * @btree: bmap struct of btree
+ * @path: array of nilfs_btree_path struct
+ * @minlevel: start level
+ * @nextkey: place to store the next valid key
+ *
+ * Return Value: If a next key was found, 0 is returned. Otherwise,
+ * -ENOENT is returned.
+ */
+static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree,
+				    const struct nilfs_btree_path *path,
+				    int minlevel, __u64 *nextkey)
+{
+	struct nilfs_btree_node *node;
+	int maxlevel = nilfs_btree_height(btree) - 1;
+	int index, next_adj, level;
+
+	/* Next index is already set to bp_index for leaf nodes. */
+	next_adj = 0;
+	for (level = minlevel; level <= maxlevel; level++) {
+		if (level == maxlevel)
+			node = nilfs_btree_get_root(btree);
+		else
+			node = nilfs_btree_get_nonroot_node(path, level);
+
+		index = path[level].bp_index + next_adj;
+		if (index < nilfs_btree_node_get_nchildren(node)) {
+			/* Next key is in this node */
+			*nextkey = nilfs_btree_node_get_key(node, index);
+			return 0;
+		}
+		/* For non-leaf nodes, next index is stored at bp_index + 1. */
+		next_adj = 1;
+	}
+	return -ENOENT;
+}
+
+static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
+			      __u64 key, int level, __u64 *ptrp)
+{
+	struct nilfs_btree_path *path;
+	int ret;
+
+	path = nilfs_btree_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0);
+
+	nilfs_btree_free_path(path);
+
+	return ret;
+}
+
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
+				     __u64 key, __u64 *ptrp, unsigned maxblocks)
+{
+	struct nilfs_btree_path *path;
+	struct nilfs_btree_node *node;
+	struct inode *dat = NULL;
+	__u64 ptr, ptr2;
+	sector_t blocknr;
+	int level = NILFS_BTREE_LEVEL_NODE_MIN;
+	int ret, cnt, index, maxlevel, ncmax;
+	struct nilfs_btree_readahead_info p;
+
+	path = nilfs_btree_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1);
+	if (ret < 0)
+		goto out;
+
+	if (NILFS_BMAP_USE_VBN(btree)) {
+		dat = nilfs_bmap_get_dat(btree);
+		ret = nilfs_dat_translate(dat, ptr, &blocknr);
+		if (ret < 0)
+			goto out;
+		ptr = blocknr;
+	}
+	cnt = 1;
+	if (cnt == maxblocks)
+		goto end;
+
+	maxlevel = nilfs_btree_height(btree) - 1;
+	node = nilfs_btree_get_node(btree, path, level, &ncmax);
+	index = path[level].bp_index + 1;
+	for (;;) {
+		while (index < nilfs_btree_node_get_nchildren(node)) {
+			if (nilfs_btree_node_get_key(node, index) !=
+			    key + cnt)
+				goto end;
+			ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax);
+			if (dat) {
+				ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+				if (ret < 0)
+					goto out;
+				ptr2 = blocknr;
+			}
+			if (ptr2 != ptr + cnt || ++cnt == maxblocks)
+				goto end;
+			index++;
+			continue;
+		}
+		if (level == maxlevel)
+			break;
+
+		/* look-up right sibling node */
+		p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax);
+		p.index = path[level + 1].bp_index + 1;
+		p.max_ra_blocks = 7;
+		if (p.index >= nilfs_btree_node_get_nchildren(p.node) ||
+		    nilfs_btree_node_get_key(p.node, p.index) != key + cnt)
+			break;
+		ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax);
+		path[level + 1].bp_index = p.index;
+
+		brelse(path[level].bp_bh);
+		path[level].bp_bh = NULL;
+
+		ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh,
+					      &p);
+		if (ret < 0)
+			goto out;
+		node = nilfs_btree_get_nonroot_node(path, level);
+		ncmax = nilfs_btree_nchildren_per_block(btree);
+		index = 0;
+		path[level].bp_index = index;
+	}
+ end:
+	*ptrp = ptr;
+	ret = cnt;
+ out:
+	nilfs_btree_free_path(path);
+	return ret;
+}
+
+static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
+				    struct nilfs_btree_path *path,
+				    int level, __u64 key)
+{
+	if (level < nilfs_btree_height(btree) - 1) {
+		do {
+			nilfs_btree_node_set_key(
+				nilfs_btree_get_nonroot_node(path, level),
+				path[level].bp_index, key);
+			if (!buffer_dirty(path[level].bp_bh))
+				mark_buffer_dirty(path[level].bp_bh);
+		} while ((path[level].bp_index == 0) &&
+			 (++level < nilfs_btree_height(btree) - 1));
+	}
+
+	/* root */
+	if (level == nilfs_btree_height(btree) - 1) {
+		nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
+					 path[level].bp_index, key);
+	}
+}
+
+static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
+				  struct nilfs_btree_path *path,
+				  int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node;
+	int ncblk;
+
+	if (level < nilfs_btree_height(btree) - 1) {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		ncblk = nilfs_btree_nchildren_per_block(btree);
+		nilfs_btree_node_insert(node, path[level].bp_index,
+					*keyp, *ptrp, ncblk);
+		if (!buffer_dirty(path[level].bp_bh))
+			mark_buffer_dirty(path[level].bp_bh);
+
+		if (path[level].bp_index == 0)
+			nilfs_btree_promote_key(btree, path, level + 1,
+						nilfs_btree_node_get_key(node,
+									 0));
+	} else {
+		node = nilfs_btree_get_root(btree);
+		nilfs_btree_node_insert(node, path[level].bp_index,
+					*keyp, *ptrp,
+					NILFS_BTREE_ROOT_NCHILDREN_MAX);
+	}
+}
+
+static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
+				   struct nilfs_btree_path *path,
+				   int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *left;
+	int nchildren, lnchildren, n, move, ncblk;
+
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+	move = 0;
+
+	n = (nchildren + lnchildren + 1) / 2 - lnchildren;
+	if (n > path[level].bp_index) {
+		/* move insert point */
+		n--;
+		move = 1;
+	}
+
+	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		mark_buffer_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		mark_buffer_dirty(path[level].bp_sib_bh);
+
+	nilfs_btree_promote_key(btree, path, level + 1,
+				nilfs_btree_node_get_key(node, 0));
+
+	if (move) {
+		brelse(path[level].bp_bh);
+		path[level].bp_bh = path[level].bp_sib_bh;
+		path[level].bp_sib_bh = NULL;
+		path[level].bp_index += lnchildren;
+		path[level + 1].bp_index--;
+	} else {
+		brelse(path[level].bp_sib_bh);
+		path[level].bp_sib_bh = NULL;
+		path[level].bp_index -= n;
+	}
+
+	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
+}
+
+static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
+				    struct nilfs_btree_path *path,
+				    int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *right;
+	int nchildren, rnchildren, n, move, ncblk;
+
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+	move = 0;
+
+	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
+	if (n > nchildren - path[level].bp_index) {
+		/* move insert point */
+		n--;
+		move = 1;
+	}
+
+	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		mark_buffer_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		mark_buffer_dirty(path[level].bp_sib_bh);
+
+	path[level + 1].bp_index++;
+	nilfs_btree_promote_key(btree, path, level + 1,
+				nilfs_btree_node_get_key(right, 0));
+	path[level + 1].bp_index--;
+
+	if (move) {
+		brelse(path[level].bp_bh);
+		path[level].bp_bh = path[level].bp_sib_bh;
+		path[level].bp_sib_bh = NULL;
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
+		path[level + 1].bp_index++;
+	} else {
+		brelse(path[level].bp_sib_bh);
+		path[level].bp_sib_bh = NULL;
+	}
+
+	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
+}
+
+static void nilfs_btree_split(struct nilfs_bmap *btree,
+			      struct nilfs_btree_path *path,
+			      int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *right;
+	__u64 newkey;
+	__u64 newptr;
+	int nchildren, n, move, ncblk;
+
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+	move = 0;
+
+	n = (nchildren + 1) / 2;
+	if (n > nchildren - path[level].bp_index) {
+		n--;
+		move = 1;
+	}
+
+	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		mark_buffer_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		mark_buffer_dirty(path[level].bp_sib_bh);
+
+	newkey = nilfs_btree_node_get_key(right, 0);
+	newptr = path[level].bp_newreq.bpr_ptr;
+
+	if (move) {
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
+		nilfs_btree_node_insert(right, path[level].bp_index,
+					*keyp, *ptrp, ncblk);
+
+		*keyp = nilfs_btree_node_get_key(right, 0);
+		*ptrp = path[level].bp_newreq.bpr_ptr;
+
+		brelse(path[level].bp_bh);
+		path[level].bp_bh = path[level].bp_sib_bh;
+		path[level].bp_sib_bh = NULL;
+	} else {
+		nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
+
+		*keyp = nilfs_btree_node_get_key(right, 0);
+		*ptrp = path[level].bp_newreq.bpr_ptr;
+
+		brelse(path[level].bp_sib_bh);
+		path[level].bp_sib_bh = NULL;
+	}
+
+	path[level + 1].bp_index++;
+}
+
+static void nilfs_btree_grow(struct nilfs_bmap *btree,
+			     struct nilfs_btree_path *path,
+			     int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *root, *child;
+	int n, ncblk;
+
+	root = nilfs_btree_get_root(btree);
+	child = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
+	n = nilfs_btree_node_get_nchildren(root);
+
+	nilfs_btree_node_move_right(root, child, n,
+				    NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
+	nilfs_btree_node_set_level(root, level + 1);
+
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		mark_buffer_dirty(path[level].bp_sib_bh);
+
+	path[level].bp_bh = path[level].bp_sib_bh;
+	path[level].bp_sib_bh = NULL;
+
+	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
+
+	*keyp = nilfs_btree_node_get_key(child, 0);
+	*ptrp = path[level].bp_newreq.bpr_ptr;
+}
+
+static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
+				   const struct nilfs_btree_path *path)
+{
+	struct nilfs_btree_node *node;
+	int level, ncmax;
+
+	if (path == NULL)
+		return NILFS_BMAP_INVALID_PTR;
+
+	/* left sibling */
+	level = NILFS_BTREE_LEVEL_NODE_MIN;
+	if (path[level].bp_index > 0) {
+		node = nilfs_btree_get_node(btree, path, level, &ncmax);
+		return nilfs_btree_node_get_ptr(node,
+						path[level].bp_index - 1,
+						ncmax);
+	}
+
+	/* parent */
+	level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
+	if (level <= nilfs_btree_height(btree) - 1) {
+		node = nilfs_btree_get_node(btree, path, level, &ncmax);
+		return nilfs_btree_node_get_ptr(node, path[level].bp_index,
+						ncmax);
+	}
+
+	return NILFS_BMAP_INVALID_PTR;
+}
+
+static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
+				       const struct nilfs_btree_path *path,
+				       __u64 key)
+{
+	__u64 ptr;
+
+	ptr = nilfs_bmap_find_target_seq(btree, key);
+	if (ptr != NILFS_BMAP_INVALID_PTR)
+		/* sequential access */
+		return ptr;
+	else {
+		ptr = nilfs_btree_find_near(btree, path);
+		if (ptr != NILFS_BMAP_INVALID_PTR)
+			/* near */
+			return ptr;
+	}
+	/* block group */
+	return nilfs_bmap_find_target_in_group(btree);
+}
+
+static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
+				      struct nilfs_btree_path *path,
+				      int *levelp, __u64 key, __u64 ptr,
+				      struct nilfs_bmap_stats *stats)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree_node *node, *parent, *sib;
+	__u64 sibptr;
+	int pindex, level, ncmax, ncblk, ret;
+	struct inode *dat = NULL;
+
+	stats->bs_nblocks = 0;
+	level = NILFS_BTREE_LEVEL_DATA;
+
+	/* allocate a new ptr for data block */
+	if (NILFS_BMAP_USE_VBN(btree)) {
+		path[level].bp_newreq.bpr_ptr =
+			nilfs_btree_find_target_v(btree, path, key);
+		dat = nilfs_bmap_get_dat(btree);
+	}
+
+	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
+	if (ret < 0)
+		goto err_out_data;
+
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
+	     level < nilfs_btree_height(btree) - 1;
+	     level++) {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		if (nilfs_btree_node_get_nchildren(node) < ncblk) {
+			path[level].bp_op = nilfs_btree_do_insert;
+			stats->bs_nblocks++;
+			goto out;
+		}
+
+		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+		pindex = path[level + 1].bp_index;
+
+		/* left sibling */
+		if (pindex > 0) {
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
+							  ncmax);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
+			if (ret < 0)
+				goto err_out_child_node;
+			sib = (struct nilfs_btree_node *)bh->b_data;
+			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_carry_left;
+				stats->bs_nblocks++;
+				goto out;
+			} else {
+				brelse(bh);
+			}
+		}
+
+		/* right sibling */
+		if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
+							  ncmax);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
+			if (ret < 0)
+				goto err_out_child_node;
+			sib = (struct nilfs_btree_node *)bh->b_data;
+			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_carry_right;
+				stats->bs_nblocks++;
+				goto out;
+			} else {
+				brelse(bh);
+			}
+		}
+
+		/* split */
+		path[level].bp_newreq.bpr_ptr =
+			path[level - 1].bp_newreq.bpr_ptr + 1;
+		ret = nilfs_bmap_prepare_alloc_ptr(btree,
+						   &path[level].bp_newreq, dat);
+		if (ret < 0)
+			goto err_out_child_node;
+		ret = nilfs_btree_get_new_block(btree,
+						path[level].bp_newreq.bpr_ptr,
+						&bh);
+		if (ret < 0)
+			goto err_out_curr_node;
+
+		stats->bs_nblocks++;
+
+		sib = (struct nilfs_btree_node *)bh->b_data;
+		nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
+		path[level].bp_sib_bh = bh;
+		path[level].bp_op = nilfs_btree_split;
+	}
+
+	/* root */
+	node = nilfs_btree_get_root(btree);
+	if (nilfs_btree_node_get_nchildren(node) <
+	    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
+		path[level].bp_op = nilfs_btree_do_insert;
+		stats->bs_nblocks++;
+		goto out;
+	}
+
+	/* grow */
+	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
+	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
+	if (ret < 0)
+		goto err_out_child_node;
+	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
+					&bh);
+	if (ret < 0)
+		goto err_out_curr_node;
+
+	nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
+			      0, level, 0, ncblk, NULL, NULL);
+	path[level].bp_sib_bh = bh;
+	path[level].bp_op = nilfs_btree_grow;
+
+	level++;
+	path[level].bp_op = nilfs_btree_do_insert;
+
+	/* a newly-created node block and a data block are added */
+	stats->bs_nblocks += 2;
+
+	/* success */
+ out:
+	*levelp = level;
+	return ret;
+
+	/* error */
+ err_out_curr_node:
+	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
+ err_out_child_node:
+	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
+		nilfs_btnode_delete(path[level].bp_sib_bh);
+		nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
+
+	}
+
+	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
+ err_out_data:
+	*levelp = level;
+	stats->bs_nblocks = 0;
+	return ret;
+}
+
+static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
+				      struct nilfs_btree_path *path,
+				      int maxlevel, __u64 key, __u64 ptr)
+{
+	struct inode *dat = NULL;
+	int level;
+
+	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
+	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
+	if (NILFS_BMAP_USE_VBN(btree)) {
+		nilfs_bmap_set_target_v(btree, key, ptr);
+		dat = nilfs_bmap_get_dat(btree);
+	}
+
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
+		nilfs_bmap_commit_alloc_ptr(btree,
+					    &path[level - 1].bp_newreq, dat);
+		path[level].bp_op(btree, path, level, &key, &ptr);
+	}
+
+	if (!nilfs_bmap_dirty(btree))
+		nilfs_bmap_set_dirty(btree);
+}
+
+static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
+{
+	struct nilfs_btree_path *path;
+	struct nilfs_bmap_stats stats;
+	int level, ret;
+
+	path = nilfs_btree_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
+				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
+	if (ret != -ENOENT) {
+		if (ret == 0)
+			ret = -EEXIST;
+		goto out;
+	}
+
+	ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
+	if (ret < 0)
+		goto out;
+	nilfs_btree_commit_insert(btree, path, level, key, ptr);
+	nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
+
+ out:
+	nilfs_btree_free_path(path);
+	return ret;
+}
+
+static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
+				  struct nilfs_btree_path *path,
+				  int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node;
+	int ncblk;
+
+	if (level < nilfs_btree_height(btree) - 1) {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		ncblk = nilfs_btree_nchildren_per_block(btree);
+		nilfs_btree_node_delete(node, path[level].bp_index,
+					keyp, ptrp, ncblk);
+		if (!buffer_dirty(path[level].bp_bh))
+			mark_buffer_dirty(path[level].bp_bh);
+		if (path[level].bp_index == 0)
+			nilfs_btree_promote_key(btree, path, level + 1,
+				nilfs_btree_node_get_key(node, 0));
+	} else {
+		node = nilfs_btree_get_root(btree);
+		nilfs_btree_node_delete(node, path[level].bp_index,
+					keyp, ptrp,
+					NILFS_BTREE_ROOT_NCHILDREN_MAX);
+	}
+}
+
+static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
+				    struct nilfs_btree_path *path,
+				    int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *left;
+	int nchildren, lnchildren, n, ncblk;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
+	n = (nchildren + lnchildren) / 2 - nchildren;
+
+	nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		mark_buffer_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		mark_buffer_dirty(path[level].bp_sib_bh);
+
+	nilfs_btree_promote_key(btree, path, level + 1,
+				nilfs_btree_node_get_key(node, 0));
+
+	brelse(path[level].bp_sib_bh);
+	path[level].bp_sib_bh = NULL;
+	path[level].bp_index += n;
+}
+
+static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
+				     struct nilfs_btree_path *path,
+				     int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *right;
+	int nchildren, rnchildren, n, ncblk;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
+	n = (nchildren + rnchildren) / 2 - nchildren;
+
+	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		mark_buffer_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		mark_buffer_dirty(path[level].bp_sib_bh);
+
+	path[level + 1].bp_index++;
+	nilfs_btree_promote_key(btree, path, level + 1,
+				nilfs_btree_node_get_key(right, 0));
+	path[level + 1].bp_index--;
+
+	brelse(path[level].bp_sib_bh);
+	path[level].bp_sib_bh = NULL;
+}
+
+static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
+				    struct nilfs_btree_path *path,
+				    int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *left;
+	int n, ncblk;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
+	n = nilfs_btree_node_get_nchildren(node);
+
+	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
+
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		mark_buffer_dirty(path[level].bp_sib_bh);
+
+	nilfs_btnode_delete(path[level].bp_bh);
+	path[level].bp_bh = path[level].bp_sib_bh;
+	path[level].bp_sib_bh = NULL;
+	path[level].bp_index += nilfs_btree_node_get_nchildren(left);
+}
+
+static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
+				     struct nilfs_btree_path *path,
+				     int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *right;
+	int n, ncblk;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
+	n = nilfs_btree_node_get_nchildren(right);
+
+	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		mark_buffer_dirty(path[level].bp_bh);
+
+	nilfs_btnode_delete(path[level].bp_sib_bh);
+	path[level].bp_sib_bh = NULL;
+	path[level + 1].bp_index++;
+}
+
+static void nilfs_btree_shrink(struct nilfs_bmap *btree,
+			       struct nilfs_btree_path *path,
+			       int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *root, *child;
+	int n, ncblk;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	root = nilfs_btree_get_root(btree);
+	child = nilfs_btree_get_nonroot_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
+	nilfs_btree_node_delete(root, 0, NULL, NULL,
+				NILFS_BTREE_ROOT_NCHILDREN_MAX);
+	nilfs_btree_node_set_level(root, level);
+	n = nilfs_btree_node_get_nchildren(child);
+	nilfs_btree_node_move_left(root, child, n,
+				   NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
+
+	nilfs_btnode_delete(path[level].bp_bh);
+	path[level].bp_bh = NULL;
+}
+
+static void nilfs_btree_nop(struct nilfs_bmap *btree,
+			    struct nilfs_btree_path *path,
+			    int level, __u64 *keyp, __u64 *ptrp)
+{
+}
+
+static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
+				      struct nilfs_btree_path *path,
+				      int *levelp,
+				      struct nilfs_bmap_stats *stats,
+				      struct inode *dat)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree_node *node, *parent, *sib;
+	__u64 sibptr;
+	int pindex, dindex, level, ncmin, ncmax, ncblk, ret;
+
+	ret = 0;
+	stats->bs_nblocks = 0;
+	ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index;
+	     level < nilfs_btree_height(btree) - 1;
+	     level++) {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		path[level].bp_oldreq.bpr_ptr =
+			nilfs_btree_node_get_ptr(node, dindex, ncblk);
+		ret = nilfs_bmap_prepare_end_ptr(btree,
+						 &path[level].bp_oldreq, dat);
+		if (ret < 0)
+			goto err_out_child_node;
+
+		if (nilfs_btree_node_get_nchildren(node) > ncmin) {
+			path[level].bp_op = nilfs_btree_do_delete;
+			stats->bs_nblocks++;
+			goto out;
+		}
+
+		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+		pindex = path[level + 1].bp_index;
+		dindex = pindex;
+
+		if (pindex > 0) {
+			/* left sibling */
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
+							  ncmax);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
+			if (ret < 0)
+				goto err_out_curr_node;
+			sib = (struct nilfs_btree_node *)bh->b_data;
+			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_borrow_left;
+				stats->bs_nblocks++;
+				goto out;
+			} else {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_concat_left;
+				stats->bs_nblocks++;
+				/* continue; */
+			}
+		} else if (pindex <
+			   nilfs_btree_node_get_nchildren(parent) - 1) {
+			/* right sibling */
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
+							  ncmax);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
+			if (ret < 0)
+				goto err_out_curr_node;
+			sib = (struct nilfs_btree_node *)bh->b_data;
+			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_borrow_right;
+				stats->bs_nblocks++;
+				goto out;
+			} else {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_concat_right;
+				stats->bs_nblocks++;
+				/*
+				 * When merging right sibling node
+				 * into the current node, pointer to
+				 * the right sibling node must be
+				 * terminated instead.  The adjustment
+				 * below is required for that.
+				 */
+				dindex = pindex + 1;
+				/* continue; */
+			}
+		} else {
+			/* no siblings */
+			/* the only child of the root node */
+			WARN_ON(level != nilfs_btree_height(btree) - 2);
+			if (nilfs_btree_node_get_nchildren(node) - 1 <=
+			    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
+				path[level].bp_op = nilfs_btree_shrink;
+				stats->bs_nblocks += 2;
+				level++;
+				path[level].bp_op = nilfs_btree_nop;
+				goto shrink_root_child;
+			} else {
+				path[level].bp_op = nilfs_btree_do_delete;
+				stats->bs_nblocks++;
+				goto out;
+			}
+		}
+	}
+
+	/* child of the root node is deleted */
+	path[level].bp_op = nilfs_btree_do_delete;
+	stats->bs_nblocks++;
+
+shrink_root_child:
+	node = nilfs_btree_get_root(btree);
+	path[level].bp_oldreq.bpr_ptr =
+		nilfs_btree_node_get_ptr(node, dindex,
+					 NILFS_BTREE_ROOT_NCHILDREN_MAX);
+
+	ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
+	if (ret < 0)
+		goto err_out_child_node;
+
+	/* success */
+ out:
+	*levelp = level;
+	return ret;
+
+	/* error */
+ err_out_curr_node:
+	nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
+ err_out_child_node:
+	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
+		brelse(path[level].bp_sib_bh);
+		nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
+	}
+	*levelp = level;
+	stats->bs_nblocks = 0;
+	return ret;
+}
+
+static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
+				      struct nilfs_btree_path *path,
+				      int maxlevel, struct inode *dat)
+{
+	int level;
+
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
+		nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
+		path[level].bp_op(btree, path, level, NULL, NULL);
+	}
+
+	if (!nilfs_bmap_dirty(btree))
+		nilfs_bmap_set_dirty(btree);
+}
+
+static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
+
+{
+	struct nilfs_btree_path *path;
+	struct nilfs_bmap_stats stats;
+	struct inode *dat;
+	int level, ret;
+
+	path = nilfs_btree_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
+				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
+	if (ret < 0)
+		goto out;
+
+
+	dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
+
+	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
+	if (ret < 0)
+		goto out;
+	nilfs_btree_commit_delete(btree, path, level, dat);
+	nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks);
+
+out:
+	nilfs_btree_free_path(path);
+	return ret;
+}
+
+static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start,
+				__u64 *keyp)
+{
+	struct nilfs_btree_path *path;
+	const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN;
+	int ret;
+
+	path = nilfs_btree_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = nilfs_btree_do_lookup(btree, path, start, NULL, minlevel, 0);
+	if (!ret)
+		*keyp = start;
+	else if (ret == -ENOENT)
+		ret = nilfs_btree_get_next_key(btree, path, minlevel, keyp);
+
+	nilfs_btree_free_path(path);
+	return ret;
+}
+
+static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
+{
+	struct nilfs_btree_path *path;
+	int ret;
+
+	path = nilfs_btree_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
+
+	nilfs_btree_free_path(path);
+
+	return ret;
+}
+
+static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree_node *root, *node;
+	__u64 maxkey, nextmaxkey;
+	__u64 ptr;
+	int nchildren, ret;
+
+	root = nilfs_btree_get_root(btree);
+	switch (nilfs_btree_height(btree)) {
+	case 2:
+		bh = NULL;
+		node = root;
+		break;
+	case 3:
+		nchildren = nilfs_btree_node_get_nchildren(root);
+		if (nchildren > 1)
+			return 0;
+		ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
+					       NILFS_BTREE_ROOT_NCHILDREN_MAX);
+		ret = nilfs_btree_get_block(btree, ptr, &bh);
+		if (ret < 0)
+			return ret;
+		node = (struct nilfs_btree_node *)bh->b_data;
+		break;
+	default:
+		return 0;
+	}
+
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
+	nextmaxkey = (nchildren > 1) ?
+		nilfs_btree_node_get_key(node, nchildren - 2) : 0;
+	if (bh != NULL)
+		brelse(bh);
+
+	return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
+}
+
+static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
+				   __u64 *keys, __u64 *ptrs, int nitems)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree_node *node, *root;
+	__le64 *dkeys;
+	__le64 *dptrs;
+	__u64 ptr;
+	int nchildren, ncmax, i, ret;
+
+	root = nilfs_btree_get_root(btree);
+	switch (nilfs_btree_height(btree)) {
+	case 2:
+		bh = NULL;
+		node = root;
+		ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
+		break;
+	case 3:
+		nchildren = nilfs_btree_node_get_nchildren(root);
+		WARN_ON(nchildren > 1);
+		ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
+					       NILFS_BTREE_ROOT_NCHILDREN_MAX);
+		ret = nilfs_btree_get_block(btree, ptr, &bh);
+		if (ret < 0)
+			return ret;
+		node = (struct nilfs_btree_node *)bh->b_data;
+		ncmax = nilfs_btree_nchildren_per_block(btree);
+		break;
+	default:
+		node = NULL;
+		return -EINVAL;
+	}
+
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	if (nchildren < nitems)
+		nitems = nchildren;
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
+	for (i = 0; i < nitems; i++) {
+		keys[i] = le64_to_cpu(dkeys[i]);
+		ptrs[i] = le64_to_cpu(dptrs[i]);
+	}
+
+	if (bh != NULL)
+		brelse(bh);
+
+	return nitems;
+}
+
+static int
+nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
+				       union nilfs_bmap_ptr_req *dreq,
+				       union nilfs_bmap_ptr_req *nreq,
+				       struct buffer_head **bhp,
+				       struct nilfs_bmap_stats *stats)
+{
+	struct buffer_head *bh;
+	struct inode *dat = NULL;
+	int ret;
+
+	stats->bs_nblocks = 0;
+
+	/* for data */
+	/* cannot find near ptr */
+	if (NILFS_BMAP_USE_VBN(btree)) {
+		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+		dat = nilfs_bmap_get_dat(btree);
+	}
+
+	ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
+	if (ret < 0)
+		return ret;
+
+	*bhp = NULL;
+	stats->bs_nblocks++;
+	if (nreq != NULL) {
+		nreq->bpr_ptr = dreq->bpr_ptr + 1;
+		ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
+		if (ret < 0)
+			goto err_out_dreq;
+
+		ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh);
+		if (ret < 0)
+			goto err_out_nreq;
+
+		*bhp = bh;
+		stats->bs_nblocks++;
+	}
+
+	/* success */
+	return 0;
+
+	/* error */
+ err_out_nreq:
+	nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
+ err_out_dreq:
+	nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
+	stats->bs_nblocks = 0;
+	return ret;
+
+}
+
+static void
+nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
+				      __u64 key, __u64 ptr,
+				      const __u64 *keys, const __u64 *ptrs,
+				      int n,
+				      union nilfs_bmap_ptr_req *dreq,
+				      union nilfs_bmap_ptr_req *nreq,
+				      struct buffer_head *bh)
+{
+	struct nilfs_btree_node *node;
+	struct inode *dat;
+	__u64 tmpptr;
+	int ncblk;
+
+	/* free resources */
+	if (btree->b_ops->bop_clear != NULL)
+		btree->b_ops->bop_clear(btree);
+
+	/* ptr must be a pointer to a buffer head. */
+	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
+
+	/* convert and insert */
+	dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
+	__nilfs_btree_init(btree);
+	if (nreq != NULL) {
+		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
+		nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
+
+		/* create child node at level 1 */
+		node = (struct nilfs_btree_node *)bh->b_data;
+		ncblk = nilfs_btree_nchildren_per_block(btree);
+		nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
+		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
+		if (!buffer_dirty(bh))
+			mark_buffer_dirty(bh);
+		if (!nilfs_bmap_dirty(btree))
+			nilfs_bmap_set_dirty(btree);
+
+		brelse(bh);
+
+		/* create root node at level 2 */
+		node = nilfs_btree_get_root(btree);
+		tmpptr = nreq->bpr_ptr;
+		nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
+				      NILFS_BTREE_ROOT_NCHILDREN_MAX,
+				      &keys[0], &tmpptr);
+	} else {
+		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
+
+		/* create root node at level 1 */
+		node = nilfs_btree_get_root(btree);
+		nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
+				      NILFS_BTREE_ROOT_NCHILDREN_MAX,
+				      keys, ptrs);
+		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
+					NILFS_BTREE_ROOT_NCHILDREN_MAX);
+		if (!nilfs_bmap_dirty(btree))
+			nilfs_bmap_set_dirty(btree);
+	}
+
+	if (NILFS_BMAP_USE_VBN(btree))
+		nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
+}
+
+/**
+ * nilfs_btree_convert_and_insert -
+ * @bmap:
+ * @key:
+ * @ptr:
+ * @keys:
+ * @ptrs:
+ * @n:
+ */
+int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
+				   __u64 key, __u64 ptr,
+				   const __u64 *keys, const __u64 *ptrs, int n)
+{
+	struct buffer_head *bh;
+	union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
+	struct nilfs_bmap_stats stats;
+	int ret;
+
+	if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
+		di = &dreq;
+		ni = NULL;
+	} else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
+			   1 << btree->b_inode->i_blkbits)) {
+		di = &dreq;
+		ni = &nreq;
+	} else {
+		di = NULL;
+		ni = NULL;
+		BUG();
+	}
+
+	ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
+						     &stats);
+	if (ret < 0)
+		return ret;
+	nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
+					      di, ni, bh);
+	nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
+	return 0;
+}
+
+static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
+				   struct nilfs_btree_path *path,
+				   int level,
+				   struct buffer_head *bh)
+{
+	while ((++level < nilfs_btree_height(btree) - 1) &&
+	       !buffer_dirty(path[level].bp_bh))
+		mark_buffer_dirty(path[level].bp_bh);
+
+	return 0;
+}
+
+static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
+					struct nilfs_btree_path *path,
+					int level, struct inode *dat)
+{
+	struct nilfs_btree_node *parent;
+	int ncmax, ret;
+
+	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+	path[level].bp_oldreq.bpr_ptr =
+		nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
+					 ncmax);
+	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
+	ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
+				       &path[level].bp_newreq.bpr_req);
+	if (ret < 0)
+		return ret;
+
+	if (buffer_nilfs_node(path[level].bp_bh)) {
+		path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
+		path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
+		path[level].bp_ctxt.bh = path[level].bp_bh;
+		ret = nilfs_btnode_prepare_change_key(
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
+			&path[level].bp_ctxt);
+		if (ret < 0) {
+			nilfs_dat_abort_update(dat,
+					       &path[level].bp_oldreq.bpr_req,
+					       &path[level].bp_newreq.bpr_req);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
+					struct nilfs_btree_path *path,
+					int level, struct inode *dat)
+{
+	struct nilfs_btree_node *parent;
+	int ncmax;
+
+	nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
+				&path[level].bp_newreq.bpr_req,
+				btree->b_ptr_type == NILFS_BMAP_PTR_VS);
+
+	if (buffer_nilfs_node(path[level].bp_bh)) {
+		nilfs_btnode_commit_change_key(
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
+			&path[level].bp_ctxt);
+		path[level].bp_bh = path[level].bp_ctxt.bh;
+	}
+	set_buffer_nilfs_volatile(path[level].bp_bh);
+
+	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+	nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
+				 path[level].bp_newreq.bpr_ptr, ncmax);
+}
+
+static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
+				       struct nilfs_btree_path *path,
+				       int level, struct inode *dat)
+{
+	nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
+			       &path[level].bp_newreq.bpr_req);
+	if (buffer_nilfs_node(path[level].bp_bh))
+		nilfs_btnode_abort_change_key(
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
+			&path[level].bp_ctxt);
+}
+
+static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
+					   struct nilfs_btree_path *path,
+					   int minlevel, int *maxlevelp,
+					   struct inode *dat)
+{
+	int level, ret;
+
+	level = minlevel;
+	if (!buffer_nilfs_volatile(path[level].bp_bh)) {
+		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
+		if (ret < 0)
+			return ret;
+	}
+	while ((++level < nilfs_btree_height(btree) - 1) &&
+	       !buffer_dirty(path[level].bp_bh)) {
+
+		WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
+		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* success */
+	*maxlevelp = level - 1;
+	return 0;
+
+	/* error */
+ out:
+	while (--level > minlevel)
+		nilfs_btree_abort_update_v(btree, path, level, dat);
+	if (!buffer_nilfs_volatile(path[level].bp_bh))
+		nilfs_btree_abort_update_v(btree, path, level, dat);
+	return ret;
+}
+
+static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree,
+					   struct nilfs_btree_path *path,
+					   int minlevel, int maxlevel,
+					   struct buffer_head *bh,
+					   struct inode *dat)
+{
+	int level;
+
+	if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
+		nilfs_btree_commit_update_v(btree, path, minlevel, dat);
+
+	for (level = minlevel + 1; level <= maxlevel; level++)
+		nilfs_btree_commit_update_v(btree, path, level, dat);
+}
+
+static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
+				   struct nilfs_btree_path *path,
+				   int level, struct buffer_head *bh)
+{
+	int maxlevel = 0, ret;
+	struct nilfs_btree_node *parent;
+	struct inode *dat = nilfs_bmap_get_dat(btree);
+	__u64 ptr;
+	int ncmax;
+
+	get_bh(bh);
+	path[level].bp_bh = bh;
+	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
+					      dat);
+	if (ret < 0)
+		goto out;
+
+	if (buffer_nilfs_volatile(path[level].bp_bh)) {
+		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+		ptr = nilfs_btree_node_get_ptr(parent,
+					       path[level + 1].bp_index,
+					       ncmax);
+		ret = nilfs_dat_mark_dirty(dat, ptr);
+		if (ret < 0)
+			goto out;
+	}
+
+	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
+
+ out:
+	brelse(path[level].bp_bh);
+	path[level].bp_bh = NULL;
+	return ret;
+}
+
+static int nilfs_btree_propagate(struct nilfs_bmap *btree,
+				 struct buffer_head *bh)
+{
+	struct nilfs_btree_path *path;
+	struct nilfs_btree_node *node;
+	__u64 key;
+	int level, ret;
+
+	WARN_ON(!buffer_dirty(bh));
+
+	path = nilfs_btree_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	if (buffer_nilfs_node(bh)) {
+		node = (struct nilfs_btree_node *)bh->b_data;
+		key = nilfs_btree_node_get_key(node, 0);
+		level = nilfs_btree_node_get_level(node);
+	} else {
+		key = nilfs_bmap_data_get_key(btree, bh);
+		level = NILFS_BTREE_LEVEL_DATA;
+	}
+
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
+	if (ret < 0) {
+		if (unlikely(ret == -ENOENT))
+			printk(KERN_CRIT "%s: key = %llu, level == %d\n",
+			       __func__, (unsigned long long)key, level);
+		goto out;
+	}
+
+	ret = NILFS_BMAP_USE_VBN(btree) ?
+		nilfs_btree_propagate_v(btree, path, level, bh) :
+		nilfs_btree_propagate_p(btree, path, level, bh);
+
+ out:
+	nilfs_btree_free_path(path);
+
+	return ret;
+}
+
+static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree,
+				    struct buffer_head *bh)
+{
+	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr);
+}
+
+static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
+					 struct list_head *lists,
+					 struct buffer_head *bh)
+{
+	struct list_head *head;
+	struct buffer_head *cbh;
+	struct nilfs_btree_node *node, *cnode;
+	__u64 key, ckey;
+	int level;
+
+	get_bh(bh);
+	node = (struct nilfs_btree_node *)bh->b_data;
+	key = nilfs_btree_node_get_key(node, 0);
+	level = nilfs_btree_node_get_level(node);
+	if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
+	    level >= NILFS_BTREE_LEVEL_MAX) {
+		dump_stack();
+		printk(KERN_WARNING
+		       "%s: invalid btree level: %d (key=%llu, ino=%lu, "
+		       "blocknr=%llu)\n",
+		       __func__, level, (unsigned long long)key,
+		       NILFS_BMAP_I(btree)->vfs_inode.i_ino,
+		       (unsigned long long)bh->b_blocknr);
+		return;
+	}
+
+	list_for_each(head, &lists[level]) {
+		cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
+		cnode = (struct nilfs_btree_node *)cbh->b_data;
+		ckey = nilfs_btree_node_get_key(cnode, 0);
+		if (key < ckey)
+			break;
+	}
+	list_add_tail(&bh->b_assoc_buffers, head);
+}
+
+static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
+					     struct list_head *listp)
+{
+	struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
+	struct list_head lists[NILFS_BTREE_LEVEL_MAX];
+	struct pagevec pvec;
+	struct buffer_head *bh, *head;
+	pgoff_t index = 0;
+	int level, i;
+
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
+	     level < NILFS_BTREE_LEVEL_MAX;
+	     level++)
+		INIT_LIST_HEAD(&lists[level]);
+
+	pagevec_init(&pvec, 0);
+
+	while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
+				  PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			bh = head = page_buffers(pvec.pages[i]);
+			do {
+				if (buffer_dirty(bh))
+					nilfs_btree_add_dirty_buffer(btree,
+								     lists, bh);
+			} while ((bh = bh->b_this_page) != head);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
+	     level < NILFS_BTREE_LEVEL_MAX;
+	     level++)
+		list_splice_tail(&lists[level], listp);
+}
+
+static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
+				struct nilfs_btree_path *path,
+				int level,
+				struct buffer_head **bh,
+				sector_t blocknr,
+				union nilfs_binfo *binfo)
+{
+	struct nilfs_btree_node *parent;
+	__u64 key;
+	__u64 ptr;
+	int ncmax, ret;
+
+	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+	ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
+				       ncmax);
+	if (buffer_nilfs_node(*bh)) {
+		path[level].bp_ctxt.oldkey = ptr;
+		path[level].bp_ctxt.newkey = blocknr;
+		path[level].bp_ctxt.bh = *bh;
+		ret = nilfs_btnode_prepare_change_key(
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
+			&path[level].bp_ctxt);
+		if (ret < 0)
+			return ret;
+		nilfs_btnode_commit_change_key(
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
+			&path[level].bp_ctxt);
+		*bh = path[level].bp_ctxt.bh;
+	}
+
+	nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr,
+				 ncmax);
+
+	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
+	/* on-disk format */
+	binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
+	binfo->bi_dat.bi_level = level;
+
+	return 0;
+}
+
+static int nilfs_btree_assign_v(struct nilfs_bmap *btree,
+				struct nilfs_btree_path *path,
+				int level,
+				struct buffer_head **bh,
+				sector_t blocknr,
+				union nilfs_binfo *binfo)
+{
+	struct nilfs_btree_node *parent;
+	struct inode *dat = nilfs_bmap_get_dat(btree);
+	__u64 key;
+	__u64 ptr;
+	union nilfs_bmap_ptr_req req;
+	int ncmax, ret;
+
+	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+	ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
+				       ncmax);
+	req.bpr_ptr = ptr;
+	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+	if (ret < 0)
+		return ret;
+	nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
+
+	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
+	/* on-disk format */
+	binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
+	binfo->bi_v.bi_blkoff = cpu_to_le64(key);
+
+	return 0;
+}
+
+static int nilfs_btree_assign(struct nilfs_bmap *btree,
+			      struct buffer_head **bh,
+			      sector_t blocknr,
+			      union nilfs_binfo *binfo)
+{
+	struct nilfs_btree_path *path;
+	struct nilfs_btree_node *node;
+	__u64 key;
+	int level, ret;
+
+	path = nilfs_btree_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	if (buffer_nilfs_node(*bh)) {
+		node = (struct nilfs_btree_node *)(*bh)->b_data;
+		key = nilfs_btree_node_get_key(node, 0);
+		level = nilfs_btree_node_get_level(node);
+	} else {
+		key = nilfs_bmap_data_get_key(btree, *bh);
+		level = NILFS_BTREE_LEVEL_DATA;
+	}
+
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
+	if (ret < 0) {
+		WARN_ON(ret == -ENOENT);
+		goto out;
+	}
+
+	ret = NILFS_BMAP_USE_VBN(btree) ?
+		nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
+		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
+
+ out:
+	nilfs_btree_free_path(path);
+
+	return ret;
+}
+
+static int nilfs_btree_assign_gc(struct nilfs_bmap *btree,
+				 struct buffer_head **bh,
+				 sector_t blocknr,
+				 union nilfs_binfo *binfo)
+{
+	struct nilfs_btree_node *node;
+	__u64 key;
+	int ret;
+
+	ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr,
+			     blocknr);
+	if (ret < 0)
+		return ret;
+
+	if (buffer_nilfs_node(*bh)) {
+		node = (struct nilfs_btree_node *)(*bh)->b_data;
+		key = nilfs_btree_node_get_key(node, 0);
+	} else
+		key = nilfs_bmap_data_get_key(btree, *bh);
+
+	/* on-disk format */
+	binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
+	binfo->bi_v.bi_blkoff = cpu_to_le64(key);
+
+	return 0;
+}
+
+static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree_path *path;
+	__u64 ptr;
+	int ret;
+
+	path = nilfs_btree_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0);
+	if (ret < 0) {
+		WARN_ON(ret == -ENOENT);
+		goto out;
+	}
+	ret = nilfs_btree_get_block(btree, ptr, &bh);
+	if (ret < 0) {
+		WARN_ON(ret == -ENOENT);
+		goto out;
+	}
+
+	if (!buffer_dirty(bh))
+		mark_buffer_dirty(bh);
+	brelse(bh);
+	if (!nilfs_bmap_dirty(btree))
+		nilfs_bmap_set_dirty(btree);
+
+ out:
+	nilfs_btree_free_path(path);
+	return ret;
+}
+
+static const struct nilfs_bmap_operations nilfs_btree_ops = {
+	.bop_lookup		=	nilfs_btree_lookup,
+	.bop_lookup_contig	=	nilfs_btree_lookup_contig,
+	.bop_insert		=	nilfs_btree_insert,
+	.bop_delete		=	nilfs_btree_delete,
+	.bop_clear		=	NULL,
+
+	.bop_propagate		=	nilfs_btree_propagate,
+
+	.bop_lookup_dirty_buffers =	nilfs_btree_lookup_dirty_buffers,
+
+	.bop_assign		=	nilfs_btree_assign,
+	.bop_mark		=	nilfs_btree_mark,
+
+	.bop_seek_key		=	nilfs_btree_seek_key,
+	.bop_last_key		=	nilfs_btree_last_key,
+
+	.bop_check_insert	=	NULL,
+	.bop_check_delete	=	nilfs_btree_check_delete,
+	.bop_gather_data	=	nilfs_btree_gather_data,
+};
+
+static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
+	.bop_lookup		=	NULL,
+	.bop_lookup_contig	=	NULL,
+	.bop_insert		=	NULL,
+	.bop_delete		=	NULL,
+	.bop_clear		=	NULL,
+
+	.bop_propagate		=	nilfs_btree_propagate_gc,
+
+	.bop_lookup_dirty_buffers =	nilfs_btree_lookup_dirty_buffers,
+
+	.bop_assign		=	nilfs_btree_assign_gc,
+	.bop_mark		=	NULL,
+
+	.bop_seek_key		=	NULL,
+	.bop_last_key		=	NULL,
+
+	.bop_check_insert	=	NULL,
+	.bop_check_delete	=	NULL,
+	.bop_gather_data	=	NULL,
+};
+
+static void __nilfs_btree_init(struct nilfs_bmap *bmap)
+{
+	bmap->b_ops = &nilfs_btree_ops;
+	bmap->b_nchildren_per_block =
+		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
+}
+
+int nilfs_btree_init(struct nilfs_bmap *bmap)
+{
+	int ret = 0;
+
+	__nilfs_btree_init(bmap);
+
+	if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
+				    bmap->b_inode->i_ino))
+		ret = -EIO;
+	return ret;
+}
+
+void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
+{
+	bmap->b_ops = &nilfs_btree_ops_gc;
+	bmap->b_nchildren_per_block =
+		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
+}
diff --git a/kernel/fs/nilfs2/btree.h b/kernel/fs/nilfs2/btree.h
new file mode 100644
index 000000000..22c02e35b
--- /dev/null
+++ b/kernel/fs/nilfs2/btree.h
@@ -0,0 +1,77 @@
+/*
+ * btree.h - NILFS B-tree.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_BTREE_H
+#define _NILFS_BTREE_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/list.h>
+#include <linux/nilfs2_fs.h>
+#include "btnode.h"
+#include "bmap.h"
+
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;
+	void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
+};
+
+#define NILFS_BTREE_ROOT_SIZE		NILFS_BMAP_SIZE
+#define NILFS_BTREE_ROOT_NCHILDREN_MAX					\
+	((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) /	\
+	 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
+#define NILFS_BTREE_ROOT_NCHILDREN_MIN	0
+#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE	(sizeof(__le64))
+#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize)			\
+	(((nodesize) - sizeof(struct nilfs_btree_node) -		\
+		NILFS_BTREE_NODE_EXTRA_PAD_SIZE) /			\
+	 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
+#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize)			\
+	((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1)
+#define NILFS_BTREE_KEY_MIN	((__u64)0)
+#define NILFS_BTREE_KEY_MAX	(~(__u64)0)
+
+extern struct kmem_cache *nilfs_btree_path_cache;
+
+int nilfs_btree_init(struct nilfs_bmap *);
+int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
+				   const __u64 *, const __u64 *, int);
+void nilfs_btree_init_gc(struct nilfs_bmap *);
+
+int nilfs_btree_broken_node_block(struct buffer_head *bh);
+
+#endif	/* _NILFS_BTREE_H */
diff --git a/kernel/fs/nilfs2/cpfile.c b/kernel/fs/nilfs2/cpfile.c
new file mode 100644
index 000000000..b6596cab9
--- /dev/null
+++ b/kernel/fs/nilfs2/cpfile.c
@@ -0,0 +1,1027 @@
+/*
+ * cpfile.c - NILFS checkpoint file.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/errno.h>
+#include <linux/nilfs2_fs.h>
+#include "mdt.h"
+#include "cpfile.h"
+
+
+static inline unsigned long
+nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile)
+{
+	return NILFS_MDT(cpfile)->mi_entries_per_block;
+}
+
+/* block number from the beginning of the file */
+static unsigned long
+nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
+{
+	__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+	do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
+	return (unsigned long)tcno;
+}
+
+/* offset in block */
+static unsigned long
+nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
+{
+	__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+	return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
+}
+
+static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile,
+						    unsigned long blkoff)
+{
+	return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff
+		+ 1 - NILFS_MDT(cpfile)->mi_first_entry_offset;
+}
+
+static unsigned long
+nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
+				  __u64 curr,
+				  __u64 max)
+{
+	return min_t(__u64,
+		     nilfs_cpfile_checkpoints_per_block(cpfile) -
+		     nilfs_cpfile_get_offset(cpfile, curr),
+		     max - curr);
+}
+
+static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile,
+					   __u64 cno)
+{
+	return nilfs_cpfile_get_blkoff(cpfile, cno) == 0;
+}
+
+static unsigned int
+nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile,
+					 struct buffer_head *bh,
+					 void *kaddr,
+					 unsigned int n)
+{
+	struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
+	unsigned int count;
+
+	count = le32_to_cpu(cp->cp_checkpoints_count) + n;
+	cp->cp_checkpoints_count = cpu_to_le32(count);
+	return count;
+}
+
+static unsigned int
+nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile,
+					 struct buffer_head *bh,
+					 void *kaddr,
+					 unsigned int n)
+{
+	struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
+	unsigned int count;
+
+	WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n);
+	count = le32_to_cpu(cp->cp_checkpoints_count) - n;
+	cp->cp_checkpoints_count = cpu_to_le32(count);
+	return count;
+}
+
+static inline struct nilfs_cpfile_header *
+nilfs_cpfile_block_get_header(const struct inode *cpfile,
+			      struct buffer_head *bh,
+			      void *kaddr)
+{
+	return kaddr + bh_offset(bh);
+}
+
+static struct nilfs_checkpoint *
+nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
+				  struct buffer_head *bh,
+				  void *kaddr)
+{
+	return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) *
+		NILFS_MDT(cpfile)->mi_entry_size;
+}
+
+static void nilfs_cpfile_block_init(struct inode *cpfile,
+				    struct buffer_head *bh,
+				    void *kaddr)
+{
+	struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
+	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
+	int n = nilfs_cpfile_checkpoints_per_block(cpfile);
+
+	while (n-- > 0) {
+		nilfs_checkpoint_set_invalid(cp);
+		cp = (void *)cp + cpsz;
+	}
+}
+
+static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
+						struct buffer_head **bhp)
+{
+	return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
+}
+
+static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
+						    __u64 cno,
+						    int create,
+						    struct buffer_head **bhp)
+{
+	return nilfs_mdt_get_block(cpfile,
+				   nilfs_cpfile_get_blkoff(cpfile, cno),
+				   create, nilfs_cpfile_block_init, bhp);
+}
+
+/**
+ * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile
+ * @cpfile: inode of cpfile
+ * @start_cno: start checkpoint number (inclusive)
+ * @end_cno: end checkpoint number (inclusive)
+ * @cnop: place to store the next checkpoint number
+ * @bhp: place to store a pointer to buffer_head struct
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block exists in the range.
+ */
+static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile,
+					      __u64 start_cno, __u64 end_cno,
+					      __u64 *cnop,
+					      struct buffer_head **bhp)
+{
+	unsigned long start, end, blkoff;
+	int ret;
+
+	if (unlikely(start_cno > end_cno))
+		return -ENOENT;
+
+	start = nilfs_cpfile_get_blkoff(cpfile, start_cno);
+	end = nilfs_cpfile_get_blkoff(cpfile, end_cno);
+
+	ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp);
+	if (!ret)
+		*cnop = (blkoff == start) ? start_cno :
+			nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff);
+	return ret;
+}
+
+static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
+						       __u64 cno)
+{
+	return nilfs_mdt_delete_block(cpfile,
+				      nilfs_cpfile_get_blkoff(cpfile, cno));
+}
+
+/**
+ * nilfs_cpfile_get_checkpoint - get a checkpoint
+ * @cpfile: inode of checkpoint file
+ * @cno: checkpoint number
+ * @create: create flag
+ * @cpp: pointer to a checkpoint
+ * @bhp: pointer to a buffer head
+ *
+ * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
+ * specified by @cno. A new checkpoint will be created if @cno is the current
+ * checkpoint number and @create is nonzero.
+ *
+ * Return Value: On success, 0 is returned, and the checkpoint and the
+ * buffer head of the buffer on which the checkpoint is located are stored in
+ * the place pointed by @cpp and @bhp, respectively. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No such checkpoint.
+ *
+ * %-EINVAL - invalid checkpoint.
+ */
+int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
+				__u64 cno,
+				int create,
+				struct nilfs_checkpoint **cpp,
+				struct buffer_head **bhp)
+{
+	struct buffer_head *header_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
+		     (cno < nilfs_mdt_cno(cpfile) && create)))
+		return -EINVAL;
+
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
+	if (ret < 0)
+		goto out_header;
+	kaddr = kmap(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		if (!create) {
+			kunmap(cp_bh->b_page);
+			brelse(cp_bh);
+			ret = -ENOENT;
+			goto out_header;
+		}
+		/* a newly-created checkpoint */
+		nilfs_checkpoint_clear_invalid(cp);
+		if (!nilfs_cpfile_is_in_first(cpfile, cno))
+			nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
+								 kaddr, 1);
+		mark_buffer_dirty(cp_bh);
+
+		kaddr = kmap_atomic(header_bh->b_page);
+		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
+						       kaddr);
+		le64_add_cpu(&header->ch_ncheckpoints, 1);
+		kunmap_atomic(kaddr);
+		mark_buffer_dirty(header_bh);
+		nilfs_mdt_mark_dirty(cpfile);
+	}
+
+	if (cpp != NULL)
+		*cpp = cp;
+	*bhp = cp_bh;
+
+ out_header:
+	brelse(header_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_put_checkpoint - put a checkpoint
+ * @cpfile: inode of checkpoint file
+ * @cno: checkpoint number
+ * @bh: buffer head
+ *
+ * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
+ * specified by @cno. @bh must be the buffer head which has been returned by
+ * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
+ */
+void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
+				 struct buffer_head *bh)
+{
+	kunmap(bh->b_page);
+	brelse(bh);
+}
+
+/**
+ * nilfs_cpfile_delete_checkpoints - delete checkpoints
+ * @cpfile: inode of checkpoint file
+ * @start: start checkpoint number
+ * @end: end checkpoint numer
+ *
+ * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
+ * the period from @start to @end, excluding @end itself. The checkpoints
+ * which have been already deleted are ignored.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - invalid checkpoints.
+ */
+int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
+				    __u64 start,
+				    __u64 end)
+{
+	struct buffer_head *header_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
+	__u64 cno;
+	void *kaddr;
+	unsigned long tnicps;
+	int ret, ncps, nicps, nss, count, i;
+
+	if (unlikely(start == 0 || start > end)) {
+		printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
+		       "[%llu, %llu)\n", __func__,
+		       (unsigned long long)start, (unsigned long long)end);
+		return -EINVAL;
+	}
+
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+	tnicps = 0;
+	nss = 0;
+
+	for (cno = start; cno < end; cno += ncps) {
+		ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end);
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				break;
+			/* skip hole */
+			ret = 0;
+			continue;
+		}
+
+		kaddr = kmap_atomic(cp_bh->b_page);
+		cp = nilfs_cpfile_block_get_checkpoint(
+			cpfile, cno, cp_bh, kaddr);
+		nicps = 0;
+		for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) {
+			if (nilfs_checkpoint_snapshot(cp)) {
+				nss++;
+			} else if (!nilfs_checkpoint_invalid(cp)) {
+				nilfs_checkpoint_set_invalid(cp);
+				nicps++;
+			}
+		}
+		if (nicps > 0) {
+			tnicps += nicps;
+			mark_buffer_dirty(cp_bh);
+			nilfs_mdt_mark_dirty(cpfile);
+			if (!nilfs_cpfile_is_in_first(cpfile, cno)) {
+				count =
+				  nilfs_cpfile_block_sub_valid_checkpoints(
+						cpfile, cp_bh, kaddr, nicps);
+				if (count == 0) {
+					/* make hole */
+					kunmap_atomic(kaddr);
+					brelse(cp_bh);
+					ret =
+					  nilfs_cpfile_delete_checkpoint_block(
+								   cpfile, cno);
+					if (ret == 0)
+						continue;
+					printk(KERN_ERR
+					       "%s: cannot delete block\n",
+					       __func__);
+					break;
+				}
+			}
+		}
+
+		kunmap_atomic(kaddr);
+		brelse(cp_bh);
+	}
+
+	if (tnicps > 0) {
+		kaddr = kmap_atomic(header_bh->b_page);
+		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
+						       kaddr);
+		le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
+		mark_buffer_dirty(header_bh);
+		nilfs_mdt_mark_dirty(cpfile);
+		kunmap_atomic(kaddr);
+	}
+
+	brelse(header_bh);
+	if (nss > 0)
+		ret = -EBUSY;
+
+ out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
+					      struct nilfs_checkpoint *cp,
+					      struct nilfs_cpinfo *ci)
+{
+	ci->ci_flags = le32_to_cpu(cp->cp_flags);
+	ci->ci_cno = le64_to_cpu(cp->cp_cno);
+	ci->ci_create = le64_to_cpu(cp->cp_create);
+	ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc);
+	ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count);
+	ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count);
+	ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
+}
+
+static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
+					  void *buf, unsigned cisz, size_t nci)
+{
+	struct nilfs_checkpoint *cp;
+	struct nilfs_cpinfo *ci = buf;
+	struct buffer_head *bh;
+	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
+	__u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
+	void *kaddr;
+	int n, ret;
+	int ncps, i;
+
+	if (cno == 0)
+		return -ENOENT; /* checkpoint number 0 is invalid */
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+
+	for (n = 0; n < nci; cno += ncps) {
+		ret = nilfs_cpfile_find_checkpoint_block(
+			cpfile, cno, cur_cno - 1, &cno, &bh);
+		if (ret < 0) {
+			if (likely(ret == -ENOENT))
+				break;
+			goto out;
+		}
+		ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
+
+		kaddr = kmap_atomic(bh->b_page);
+		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
+		for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
+			if (!nilfs_checkpoint_invalid(cp)) {
+				nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp,
+								  ci);
+				ci = (void *)ci + cisz;
+				n++;
+			}
+		}
+		kunmap_atomic(kaddr);
+		brelse(bh);
+	}
+
+	ret = n;
+	if (n > 0) {
+		ci = (void *)ci - cisz;
+		*cnop = ci->ci_cno + 1;
+	}
+
+ out:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
+					  void *buf, unsigned cisz, size_t nci)
+{
+	struct buffer_head *bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	struct nilfs_cpinfo *ci = buf;
+	__u64 curr = *cnop, next;
+	unsigned long curr_blkoff, next_blkoff;
+	void *kaddr;
+	int n = 0, ret;
+
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+
+	if (curr == 0) {
+		ret = nilfs_cpfile_get_header_block(cpfile, &bh);
+		if (ret < 0)
+			goto out;
+		kaddr = kmap_atomic(bh->b_page);
+		header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
+		curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
+		kunmap_atomic(kaddr);
+		brelse(bh);
+		if (curr == 0) {
+			ret = 0;
+			goto out;
+		}
+	} else if (unlikely(curr == ~(__u64)0)) {
+		ret = 0;
+		goto out;
+	}
+
+	curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT)
+			ret = 0; /* No snapshots (started from a hole block) */
+		goto out;
+	}
+	kaddr = kmap_atomic(bh->b_page);
+	while (n < nci) {
+		cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
+		curr = ~(__u64)0; /* Terminator */
+		if (unlikely(nilfs_checkpoint_invalid(cp) ||
+			     !nilfs_checkpoint_snapshot(cp)))
+			break;
+		nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci);
+		ci = (void *)ci + cisz;
+		n++;
+		next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
+		if (next == 0)
+			break; /* reach end of the snapshot list */
+
+		next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
+		if (curr_blkoff != next_blkoff) {
+			kunmap_atomic(kaddr);
+			brelse(bh);
+			ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
+								0, &bh);
+			if (unlikely(ret < 0)) {
+				WARN_ON(ret == -ENOENT);
+				goto out;
+			}
+			kaddr = kmap_atomic(bh->b_page);
+		}
+		curr = next;
+		curr_blkoff = next_blkoff;
+	}
+	kunmap_atomic(kaddr);
+	brelse(bh);
+	*cnop = curr;
+	ret = n;
+
+ out:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_get_cpinfo -
+ * @cpfile:
+ * @cno:
+ * @ci:
+ * @nci:
+ */
+
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
+				void *buf, unsigned cisz, size_t nci)
+{
+	switch (mode) {
+	case NILFS_CHECKPOINT:
+		return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci);
+	case NILFS_SNAPSHOT:
+		return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci);
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * nilfs_cpfile_delete_checkpoint -
+ * @cpfile:
+ * @cno:
+ */
+int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
+{
+	struct nilfs_cpinfo ci;
+	__u64 tcno = cno;
+	ssize_t nci;
+
+	nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1);
+	if (nci < 0)
+		return nci;
+	else if (nci == 0 || ci.ci_cno != cno)
+		return -ENOENT;
+	else if (nilfs_cpinfo_snapshot(&ci))
+		return -EBUSY;
+
+	return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
+}
+
+static struct nilfs_snapshot_list *
+nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile,
+				     __u64 cno,
+				     struct buffer_head *bh,
+				     void *kaddr)
+{
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	struct nilfs_snapshot_list *list;
+
+	if (cno != 0) {
+		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
+		list = &cp->cp_snapshot_list;
+	} else {
+		header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
+		list = &header->ch_snapshot_list;
+	}
+	return list;
+}
+
+static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
+{
+	struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	struct nilfs_snapshot_list *list;
+	__u64 curr, prev;
+	unsigned long curr_blkoff, prev_blkoff;
+	void *kaddr;
+	int ret;
+
+	if (cno == 0)
+		return -ENOENT; /* checkpoint number 0 is invalid */
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+	if (ret < 0)
+		goto out_sem;
+	kaddr = kmap_atomic(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		ret = -ENOENT;
+		kunmap_atomic(kaddr);
+		goto out_cp;
+	}
+	if (nilfs_checkpoint_snapshot(cp)) {
+		ret = 0;
+		kunmap_atomic(kaddr);
+		goto out_cp;
+	}
+	kunmap_atomic(kaddr);
+
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (ret < 0)
+		goto out_cp;
+	kaddr = kmap_atomic(header_bh->b_page);
+	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
+	list = &header->ch_snapshot_list;
+	curr_bh = header_bh;
+	get_bh(curr_bh);
+	curr = 0;
+	curr_blkoff = 0;
+	prev = le64_to_cpu(list->ssl_prev);
+	while (prev > cno) {
+		prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
+		curr = prev;
+		if (curr_blkoff != prev_blkoff) {
+			kunmap_atomic(kaddr);
+			brelse(curr_bh);
+			ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
+								0, &curr_bh);
+			if (ret < 0)
+				goto out_header;
+			kaddr = kmap_atomic(curr_bh->b_page);
+		}
+		curr_blkoff = prev_blkoff;
+		cp = nilfs_cpfile_block_get_checkpoint(
+			cpfile, curr, curr_bh, kaddr);
+		list = &cp->cp_snapshot_list;
+		prev = le64_to_cpu(list->ssl_prev);
+	}
+	kunmap_atomic(kaddr);
+
+	if (prev != 0) {
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
+							&prev_bh);
+		if (ret < 0)
+			goto out_curr;
+	} else {
+		prev_bh = header_bh;
+		get_bh(prev_bh);
+	}
+
+	kaddr = kmap_atomic(curr_bh->b_page);
+	list = nilfs_cpfile_block_get_snapshot_list(
+		cpfile, curr, curr_bh, kaddr);
+	list->ssl_prev = cpu_to_le64(cno);
+	kunmap_atomic(kaddr);
+
+	kaddr = kmap_atomic(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
+	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
+	nilfs_checkpoint_set_snapshot(cp);
+	kunmap_atomic(kaddr);
+
+	kaddr = kmap_atomic(prev_bh->b_page);
+	list = nilfs_cpfile_block_get_snapshot_list(
+		cpfile, prev, prev_bh, kaddr);
+	list->ssl_next = cpu_to_le64(cno);
+	kunmap_atomic(kaddr);
+
+	kaddr = kmap_atomic(header_bh->b_page);
+	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
+	le64_add_cpu(&header->ch_nsnapshots, 1);
+	kunmap_atomic(kaddr);
+
+	mark_buffer_dirty(prev_bh);
+	mark_buffer_dirty(curr_bh);
+	mark_buffer_dirty(cp_bh);
+	mark_buffer_dirty(header_bh);
+	nilfs_mdt_mark_dirty(cpfile);
+
+	brelse(prev_bh);
+
+ out_curr:
+	brelse(curr_bh);
+
+ out_header:
+	brelse(header_bh);
+
+ out_cp:
+	brelse(cp_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
+{
+	struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	struct nilfs_snapshot_list *list;
+	__u64 next, prev;
+	void *kaddr;
+	int ret;
+
+	if (cno == 0)
+		return -ENOENT; /* checkpoint number 0 is invalid */
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+	if (ret < 0)
+		goto out_sem;
+	kaddr = kmap_atomic(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		ret = -ENOENT;
+		kunmap_atomic(kaddr);
+		goto out_cp;
+	}
+	if (!nilfs_checkpoint_snapshot(cp)) {
+		ret = 0;
+		kunmap_atomic(kaddr);
+		goto out_cp;
+	}
+
+	list = &cp->cp_snapshot_list;
+	next = le64_to_cpu(list->ssl_next);
+	prev = le64_to_cpu(list->ssl_prev);
+	kunmap_atomic(kaddr);
+
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (ret < 0)
+		goto out_cp;
+	if (next != 0) {
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0,
+							&next_bh);
+		if (ret < 0)
+			goto out_header;
+	} else {
+		next_bh = header_bh;
+		get_bh(next_bh);
+	}
+	if (prev != 0) {
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
+							&prev_bh);
+		if (ret < 0)
+			goto out_next;
+	} else {
+		prev_bh = header_bh;
+		get_bh(prev_bh);
+	}
+
+	kaddr = kmap_atomic(next_bh->b_page);
+	list = nilfs_cpfile_block_get_snapshot_list(
+		cpfile, next, next_bh, kaddr);
+	list->ssl_prev = cpu_to_le64(prev);
+	kunmap_atomic(kaddr);
+
+	kaddr = kmap_atomic(prev_bh->b_page);
+	list = nilfs_cpfile_block_get_snapshot_list(
+		cpfile, prev, prev_bh, kaddr);
+	list->ssl_next = cpu_to_le64(next);
+	kunmap_atomic(kaddr);
+
+	kaddr = kmap_atomic(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
+	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
+	nilfs_checkpoint_clear_snapshot(cp);
+	kunmap_atomic(kaddr);
+
+	kaddr = kmap_atomic(header_bh->b_page);
+	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
+	le64_add_cpu(&header->ch_nsnapshots, -1);
+	kunmap_atomic(kaddr);
+
+	mark_buffer_dirty(next_bh);
+	mark_buffer_dirty(prev_bh);
+	mark_buffer_dirty(cp_bh);
+	mark_buffer_dirty(header_bh);
+	nilfs_mdt_mark_dirty(cpfile);
+
+	brelse(prev_bh);
+
+ out_next:
+	brelse(next_bh);
+
+ out_header:
+	brelse(header_bh);
+
+ out_cp:
+	brelse(cp_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_is_snapshot -
+ * @cpfile: inode of checkpoint file
+ * @cno: checkpoint number
+ *
+ * Description:
+ *
+ * Return Value: On success, 1 is returned if the checkpoint specified by
+ * @cno is a snapshot, or 0 if not. On error, one of the following negative
+ * error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No such checkpoint.
+ */
+int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
+{
+	struct buffer_head *bh;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	/* CP number is invalid if it's zero or larger than the
+	largest	exist one.*/
+	if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
+		return -ENOENT;
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
+	if (ret < 0)
+		goto out;
+	kaddr = kmap_atomic(bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp))
+		ret = -ENOENT;
+	else
+		ret = nilfs_checkpoint_snapshot(cp);
+	kunmap_atomic(kaddr);
+	brelse(bh);
+
+ out:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_change_cpmode - change checkpoint mode
+ * @cpfile: inode of checkpoint file
+ * @cno: checkpoint number
+ * @status: mode of checkpoint
+ *
+ * Description: nilfs_change_cpmode() changes the mode of the checkpoint
+ * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No such checkpoint.
+ */
+int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
+{
+	int ret;
+
+	switch (mode) {
+	case NILFS_CHECKPOINT:
+		if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
+			/*
+			 * Current implementation does not have to protect
+			 * plain read-only mounts since they are exclusive
+			 * with a read/write mount and are protected from the
+			 * cleaner.
+			 */
+			ret = -EBUSY;
+		else
+			ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
+		return ret;
+	case NILFS_SNAPSHOT:
+		return nilfs_cpfile_set_snapshot(cpfile, cno);
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * nilfs_cpfile_get_stat - get checkpoint statistics
+ * @cpfile: inode of checkpoint file
+ * @stat: pointer to a structure of checkpoint statistics
+ *
+ * Description: nilfs_cpfile_get_stat() returns information about checkpoints.
+ *
+ * Return Value: On success, 0 is returned, and checkpoints information is
+ * stored in the place pointed by @stat. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
+{
+	struct buffer_head *bh;
+	struct nilfs_cpfile_header *header;
+	void *kaddr;
+	int ret;
+
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_header_block(cpfile, &bh);
+	if (ret < 0)
+		goto out_sem;
+	kaddr = kmap_atomic(bh->b_page);
+	header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
+	cpstat->cs_cno = nilfs_mdt_cno(cpfile);
+	cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
+	cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
+	kunmap_atomic(kaddr);
+	brelse(bh);
+
+ out_sem:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_read - read or get cpfile inode
+ * @sb: super block instance
+ * @cpsize: size of a checkpoint entry
+ * @raw_inode: on-disk cpfile inode
+ * @inodep: buffer to store the inode
+ */
+int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep)
+{
+	struct inode *cpfile;
+	int err;
+
+	if (cpsize > sb->s_blocksize) {
+		printk(KERN_ERR
+		       "NILFS: too large checkpoint size: %zu bytes.\n",
+		       cpsize);
+		return -EINVAL;
+	} else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
+		printk(KERN_ERR
+		       "NILFS: too small checkpoint size: %zu bytes.\n",
+		       cpsize);
+		return -EINVAL;
+	}
+
+	cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
+	if (unlikely(!cpfile))
+		return -ENOMEM;
+	if (!(cpfile->i_state & I_NEW))
+		goto out;
+
+	err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
+	if (err)
+		goto failed;
+
+	nilfs_mdt_set_entry_size(cpfile, cpsize,
+				 sizeof(struct nilfs_cpfile_header));
+
+	err = nilfs_read_inode_common(cpfile, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(cpfile);
+ out:
+	*inodep = cpfile;
+	return 0;
+ failed:
+	iget_failed(cpfile);
+	return err;
+}
diff --git a/kernel/fs/nilfs2/cpfile.h b/kernel/fs/nilfs2/cpfile.h
new file mode 100644
index 000000000..a242b9a31
--- /dev/null
+++ b/kernel/fs/nilfs2/cpfile.h
@@ -0,0 +1,46 @@
+/*
+ * cpfile.h - NILFS checkpoint file.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_CPFILE_H
+#define _NILFS_CPFILE_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+
+
+int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
+				struct nilfs_checkpoint **,
+				struct buffer_head **);
+void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
+int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
+int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
+int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
+int nilfs_cpfile_is_snapshot(struct inode *, __u64);
+int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
+				size_t);
+
+int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep);
+
+#endif	/* _NILFS_CPFILE_H */
diff --git a/kernel/fs/nilfs2/dat.c b/kernel/fs/nilfs2/dat.c
new file mode 100644
index 000000000..0d5fada91
--- /dev/null
+++ b/kernel/fs/nilfs2/dat.c
@@ -0,0 +1,529 @@
+/*
+ * dat.c - NILFS disk address translation.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "alloc.h"
+#include "dat.h"
+
+
+#define NILFS_CNO_MIN	((__u64)1)
+#define NILFS_CNO_MAX	(~(__u64)0)
+
+/**
+ * struct nilfs_dat_info - on-memory private data of DAT file
+ * @mi: on-memory private data of metadata file
+ * @palloc_cache: persistent object allocator cache of DAT file
+ * @shadow: shadow map of DAT file
+ */
+struct nilfs_dat_info {
+	struct nilfs_mdt_info mi;
+	struct nilfs_palloc_cache palloc_cache;
+	struct nilfs_shadow_map shadow;
+};
+
+static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
+{
+	return (struct nilfs_dat_info *)NILFS_MDT(dat);
+}
+
+static int nilfs_dat_prepare_entry(struct inode *dat,
+				   struct nilfs_palloc_req *req, int create)
+{
+	return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
+					    create, &req->pr_entry_bh);
+}
+
+static void nilfs_dat_commit_entry(struct inode *dat,
+				   struct nilfs_palloc_req *req)
+{
+	mark_buffer_dirty(req->pr_entry_bh);
+	nilfs_mdt_mark_dirty(dat);
+	brelse(req->pr_entry_bh);
+}
+
+static void nilfs_dat_abort_entry(struct inode *dat,
+				  struct nilfs_palloc_req *req)
+{
+	brelse(req->pr_entry_bh);
+}
+
+int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	int ret;
+
+	ret = nilfs_palloc_prepare_alloc_entry(dat, req);
+	if (ret < 0)
+		return ret;
+
+	ret = nilfs_dat_prepare_entry(dat, req, 1);
+	if (ret < 0)
+		nilfs_palloc_abort_alloc_entry(dat, req);
+
+	return ret;
+}
+
+void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	struct nilfs_dat_entry *entry;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
+	entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
+	entry->de_blocknr = cpu_to_le64(0);
+	kunmap_atomic(kaddr);
+
+	nilfs_palloc_commit_alloc_entry(dat, req);
+	nilfs_dat_commit_entry(dat, req);
+}
+
+void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	nilfs_dat_abort_entry(dat, req);
+	nilfs_palloc_abort_alloc_entry(dat, req);
+}
+
+static void nilfs_dat_commit_free(struct inode *dat,
+				  struct nilfs_palloc_req *req)
+{
+	struct nilfs_dat_entry *entry;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
+	entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
+	entry->de_blocknr = cpu_to_le64(0);
+	kunmap_atomic(kaddr);
+
+	nilfs_dat_commit_entry(dat, req);
+	nilfs_palloc_commit_free_entry(dat, req);
+}
+
+int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	int ret;
+
+	ret = nilfs_dat_prepare_entry(dat, req, 0);
+	WARN_ON(ret == -ENOENT);
+	return ret;
+}
+
+void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
+			    sector_t blocknr)
+{
+	struct nilfs_dat_entry *entry;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
+	entry->de_blocknr = cpu_to_le64(blocknr);
+	kunmap_atomic(kaddr);
+
+	nilfs_dat_commit_entry(dat, req);
+}
+
+int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	struct nilfs_dat_entry *entry;
+	__u64 start;
+	sector_t blocknr;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_dat_prepare_entry(dat, req, 0);
+	if (ret < 0) {
+		WARN_ON(ret == -ENOENT);
+		return ret;
+	}
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	start = le64_to_cpu(entry->de_start);
+	blocknr = le64_to_cpu(entry->de_blocknr);
+	kunmap_atomic(kaddr);
+
+	if (blocknr == 0) {
+		ret = nilfs_palloc_prepare_free_entry(dat, req);
+		if (ret < 0) {
+			nilfs_dat_abort_entry(dat, req);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
+			  int dead)
+{
+	struct nilfs_dat_entry *entry;
+	__u64 start, end;
+	sector_t blocknr;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	end = start = le64_to_cpu(entry->de_start);
+	if (!dead) {
+		end = nilfs_mdt_cno(dat);
+		WARN_ON(start > end);
+	}
+	entry->de_end = cpu_to_le64(end);
+	blocknr = le64_to_cpu(entry->de_blocknr);
+	kunmap_atomic(kaddr);
+
+	if (blocknr == 0)
+		nilfs_dat_commit_free(dat, req);
+	else
+		nilfs_dat_commit_entry(dat, req);
+}
+
+void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	struct nilfs_dat_entry *entry;
+	__u64 start;
+	sector_t blocknr;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	start = le64_to_cpu(entry->de_start);
+	blocknr = le64_to_cpu(entry->de_blocknr);
+	kunmap_atomic(kaddr);
+
+	if (start == nilfs_mdt_cno(dat) && blocknr == 0)
+		nilfs_palloc_abort_free_entry(dat, req);
+	nilfs_dat_abort_entry(dat, req);
+}
+
+int nilfs_dat_prepare_update(struct inode *dat,
+			     struct nilfs_palloc_req *oldreq,
+			     struct nilfs_palloc_req *newreq)
+{
+	int ret;
+
+	ret = nilfs_dat_prepare_end(dat, oldreq);
+	if (!ret) {
+		ret = nilfs_dat_prepare_alloc(dat, newreq);
+		if (ret < 0)
+			nilfs_dat_abort_end(dat, oldreq);
+	}
+	return ret;
+}
+
+void nilfs_dat_commit_update(struct inode *dat,
+			     struct nilfs_palloc_req *oldreq,
+			     struct nilfs_palloc_req *newreq, int dead)
+{
+	nilfs_dat_commit_end(dat, oldreq, dead);
+	nilfs_dat_commit_alloc(dat, newreq);
+}
+
+void nilfs_dat_abort_update(struct inode *dat,
+			    struct nilfs_palloc_req *oldreq,
+			    struct nilfs_palloc_req *newreq)
+{
+	nilfs_dat_abort_end(dat, oldreq);
+	nilfs_dat_abort_alloc(dat, newreq);
+}
+
+/**
+ * nilfs_dat_mark_dirty -
+ * @dat: DAT file inode
+ * @vblocknr: virtual block number
+ *
+ * Description:
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
+{
+	struct nilfs_palloc_req req;
+	int ret;
+
+	req.pr_entry_nr = vblocknr;
+	ret = nilfs_dat_prepare_entry(dat, &req, 0);
+	if (ret == 0)
+		nilfs_dat_commit_entry(dat, &req);
+	return ret;
+}
+
+/**
+ * nilfs_dat_freev - free virtual block numbers
+ * @dat: DAT file inode
+ * @vblocknrs: array of virtual block numbers
+ * @nitems: number of virtual block numbers
+ *
+ * Description: nilfs_dat_freev() frees the virtual block numbers specified by
+ * @vblocknrs and @nitems.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The virtual block number have not been allocated.
+ */
+int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems)
+{
+	return nilfs_palloc_freev(dat, vblocknrs, nitems);
+}
+
+/**
+ * nilfs_dat_move - change a block number
+ * @dat: DAT file inode
+ * @vblocknr: virtual block number
+ * @blocknr: block number
+ *
+ * Description: nilfs_dat_move() changes the block number associated with
+ * @vblocknr to @blocknr.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
+{
+	struct buffer_head *entry_bh;
+	struct nilfs_dat_entry *entry;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * The given disk block number (blocknr) is not yet written to
+	 * the device at this point.
+	 *
+	 * To prevent nilfs_dat_translate() from returning the
+	 * uncommitted block number, this makes a copy of the entry
+	 * buffer and redirects nilfs_dat_translate() to the copy.
+	 */
+	if (!buffer_nilfs_redirected(entry_bh)) {
+		ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
+		if (ret) {
+			brelse(entry_bh);
+			return ret;
+		}
+	}
+
+	kaddr = kmap_atomic(entry_bh->b_page);
+	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
+	if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
+		printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
+		       (unsigned long long)vblocknr,
+		       (unsigned long long)le64_to_cpu(entry->de_start),
+		       (unsigned long long)le64_to_cpu(entry->de_end));
+		kunmap_atomic(kaddr);
+		brelse(entry_bh);
+		return -EINVAL;
+	}
+	WARN_ON(blocknr == 0);
+	entry->de_blocknr = cpu_to_le64(blocknr);
+	kunmap_atomic(kaddr);
+
+	mark_buffer_dirty(entry_bh);
+	nilfs_mdt_mark_dirty(dat);
+
+	brelse(entry_bh);
+
+	return 0;
+}
+
+/**
+ * nilfs_dat_translate - translate a virtual block number to a block number
+ * @dat: DAT file inode
+ * @vblocknr: virtual block number
+ * @blocknrp: pointer to a block number
+ *
+ * Description: nilfs_dat_translate() maps the virtual block number @vblocknr
+ * to the corresponding block number.
+ *
+ * Return Value: On success, 0 is returned and the block number associated
+ * with @vblocknr is stored in the place pointed by @blocknrp. On error, one
+ * of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A block number associated with @vblocknr does not exist.
+ */
+int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
+{
+	struct buffer_head *entry_bh, *bh;
+	struct nilfs_dat_entry *entry;
+	sector_t blocknr;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
+	if (ret < 0)
+		return ret;
+
+	if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
+		bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
+		if (bh) {
+			WARN_ON(!buffer_uptodate(bh));
+			brelse(entry_bh);
+			entry_bh = bh;
+		}
+	}
+
+	kaddr = kmap_atomic(entry_bh->b_page);
+	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
+	blocknr = le64_to_cpu(entry->de_blocknr);
+	if (blocknr == 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+	*blocknrp = blocknr;
+
+ out:
+	kunmap_atomic(kaddr);
+	brelse(entry_bh);
+	return ret;
+}
+
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
+			    size_t nvi)
+{
+	struct buffer_head *entry_bh;
+	struct nilfs_dat_entry *entry;
+	struct nilfs_vinfo *vinfo = buf;
+	__u64 first, last;
+	void *kaddr;
+	unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
+	int i, j, n, ret;
+
+	for (i = 0; i < nvi; i += n) {
+		ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr,
+						   0, &entry_bh);
+		if (ret < 0)
+			return ret;
+		kaddr = kmap_atomic(entry_bh->b_page);
+		/* last virtual block number in this block */
+		first = vinfo->vi_vblocknr;
+		do_div(first, entries_per_block);
+		first *= entries_per_block;
+		last = first + entries_per_block - 1;
+		for (j = i, n = 0;
+		     j < nvi && vinfo->vi_vblocknr >= first &&
+			     vinfo->vi_vblocknr <= last;
+		     j++, n++, vinfo = (void *)vinfo + visz) {
+			entry = nilfs_palloc_block_get_entry(
+				dat, vinfo->vi_vblocknr, entry_bh, kaddr);
+			vinfo->vi_start = le64_to_cpu(entry->de_start);
+			vinfo->vi_end = le64_to_cpu(entry->de_end);
+			vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
+		}
+		kunmap_atomic(kaddr);
+		brelse(entry_bh);
+	}
+
+	return nvi;
+}
+
+/**
+ * nilfs_dat_read - read or get dat inode
+ * @sb: super block instance
+ * @entry_size: size of a dat entry
+ * @raw_inode: on-disk dat inode
+ * @inodep: buffer to store the inode
+ */
+int nilfs_dat_read(struct super_block *sb, size_t entry_size,
+		   struct nilfs_inode *raw_inode, struct inode **inodep)
+{
+	static struct lock_class_key dat_lock_key;
+	struct inode *dat;
+	struct nilfs_dat_info *di;
+	int err;
+
+	if (entry_size > sb->s_blocksize) {
+		printk(KERN_ERR
+		       "NILFS: too large DAT entry size: %zu bytes.\n",
+		       entry_size);
+		return -EINVAL;
+	} else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
+		printk(KERN_ERR
+		       "NILFS: too small DAT entry size: %zu bytes.\n",
+		       entry_size);
+		return -EINVAL;
+	}
+
+	dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
+	if (unlikely(!dat))
+		return -ENOMEM;
+	if (!(dat->i_state & I_NEW))
+		goto out;
+
+	err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
+	if (err)
+		goto failed;
+
+	err = nilfs_palloc_init_blockgroup(dat, entry_size);
+	if (err)
+		goto failed;
+
+	di = NILFS_DAT_I(dat);
+	lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+	nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+	nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+
+	err = nilfs_read_inode_common(dat, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(dat);
+ out:
+	*inodep = dat;
+	return 0;
+ failed:
+	iget_failed(dat);
+	return err;
+}
diff --git a/kernel/fs/nilfs2/dat.h b/kernel/fs/nilfs2/dat.h
new file mode 100644
index 000000000..cbd8e9732
--- /dev/null
+++ b/kernel/fs/nilfs2/dat.h
@@ -0,0 +1,59 @@
+/*
+ * dat.h - NILFS disk address translation.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_DAT_H
+#define _NILFS_DAT_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+
+
+struct nilfs_palloc_req;
+
+int nilfs_dat_translate(struct inode *, __u64, sector_t *);
+
+int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
+void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
+void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
+int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
+void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
+			    sector_t);
+int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
+void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
+void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
+int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
+			     struct nilfs_palloc_req *);
+void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
+			     struct nilfs_palloc_req *, int);
+void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
+			    struct nilfs_palloc_req *);
+
+int nilfs_dat_mark_dirty(struct inode *, __u64);
+int nilfs_dat_freev(struct inode *, __u64 *, size_t);
+int nilfs_dat_move(struct inode *, __u64, sector_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
+
+int nilfs_dat_read(struct super_block *sb, size_t entry_size,
+		   struct nilfs_inode *raw_inode, struct inode **inodep);
+
+#endif	/* _NILFS_DAT_H */
diff --git a/kernel/fs/nilfs2/dir.c b/kernel/fs/nilfs2/dir.c
new file mode 100644
index 000000000..0ee0bed36
--- /dev/null
+++ b/kernel/fs/nilfs2/dir.c
@@ -0,0 +1,676 @@
+/*
+ * dir.c - NILFS directory entry operations
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
+ */
+/*
+ *  linux/fs/ext2/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/dir.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext2 directory handling functions
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * All code that works with directory layout had been switched to pagecache
+ * and moved here. AV
+ */
+
+#include <linux/pagemap.h>
+#include "nilfs.h"
+#include "page.h"
+
+/*
+ * nilfs uses block-sized chunks. Arguably, sector-sized ones would be
+ * more robust, but we have what we have
+ */
+static inline unsigned nilfs_chunk_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;
+}
+
+static inline void nilfs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
+
+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	unsigned last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
+}
+
+static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
+{
+	loff_t pos = page_offset(page) + from;
+	return __block_write_begin(page, pos, to - from, nilfs_get_block);
+}
+
+static void nilfs_commit_chunk(struct page *page,
+			       struct address_space *mapping,
+			       unsigned from, unsigned to)
+{
+	struct inode *dir = mapping->host;
+	loff_t pos = page_offset(page) + from;
+	unsigned len = to - from;
+	unsigned nr_dirty, copied;
+	int err;
+
+	nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
+	copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
+	if (pos + copied > dir->i_size)
+		i_size_write(dir, pos + copied);
+	if (IS_DIRSYNC(dir))
+		nilfs_set_transaction_flag(NILFS_TI_SYNC);
+	err = nilfs_set_file_dirty(dir, nr_dirty);
+	WARN_ON(err); /* do not happen */
+	unlock_page(page);
+}
+
+static void nilfs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	struct super_block *sb = dir->i_sb;
+	unsigned chunk_size = nilfs_chunk_size(dir);
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct nilfs_dir_entry *p;
+	char *error;
+
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (chunk_size - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
+	}
+	for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct nilfs_dir_entry *)(kaddr + offs);
+		rec_len = nilfs_rec_len_from_disk(p->rec_len);
+
+		if (rec_len < NILFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < NILFS_DIR_REC_LEN(p->name_len))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+			goto Espan;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+	/* Too bad, we had an error */
+
+Ebadsize:
+	nilfs_error(sb, "nilfs_check_page",
+		    "size of directory #%lu is not a multiple of chunk size",
+		    dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+bad_entry:
+	nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
+		    "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+		    dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		    (unsigned long) le64_to_cpu(p->inode),
+		    rec_len, p->name_len);
+	goto fail;
+Eend:
+	p = (struct nilfs_dir_entry *)(kaddr + offs);
+	nilfs_error(sb, "nilfs_check_page",
+		    "entry in directory #%lu spans the page boundary"
+		    "offset=%lu, inode=%lu",
+		    dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		    (unsigned long) le64_to_cpu(p->inode));
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
+
+static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageChecked(page))
+			nilfs_check_page(page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	nilfs_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure.
+ *
+ * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
+ */
+static int
+nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+/*
+ * p is at least 6 bytes before the end of page
+ */
+static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
+{
+	return (struct nilfs_dir_entry *)((char *)p +
+					  nilfs_rec_len_from_disk(p->rec_len));
+}
+
+static unsigned char
+nilfs_filetype_table[NILFS_FT_MAX] = {
+	[NILFS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[NILFS_FT_REG_FILE]	= DT_REG,
+	[NILFS_FT_DIR]		= DT_DIR,
+	[NILFS_FT_CHRDEV]	= DT_CHR,
+	[NILFS_FT_BLKDEV]	= DT_BLK,
+	[NILFS_FT_FIFO]		= DT_FIFO,
+	[NILFS_FT_SOCK]		= DT_SOCK,
+	[NILFS_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char
+nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= NILFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= NILFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= NILFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= NILFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= NILFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= NILFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= NILFS_FT_SYMLINK,
+};
+
+static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
+{
+	umode_t mode = inode->i_mode;
+
+	de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+static int nilfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = dir_pages(inode);
+/*	unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
+
+	if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
+		return 0;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct nilfs_dir_entry *de;
+		struct page *page = nilfs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			nilfs_error(sb, __func__, "bad page in #%lu",
+				    inode->i_ino);
+			ctx->pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
+		}
+		kaddr = page_address(page);
+		de = (struct nilfs_dir_entry *)(kaddr + offset);
+		limit = kaddr + nilfs_last_byte(inode, n) -
+			NILFS_DIR_REC_LEN(1);
+		for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
+			if (de->rec_len == 0) {
+				nilfs_error(sb, __func__,
+					    "zero-length directory entry");
+				nilfs_put_page(page);
+				return -EIO;
+			}
+			if (de->inode) {
+				unsigned char t;
+
+				if (de->file_type < NILFS_FT_MAX)
+					t = nilfs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
+
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le64_to_cpu(de->inode), t)) {
+					nilfs_put_page(page);
+					return 0;
+				}
+			}
+			ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
+		}
+		nilfs_put_page(page);
+	}
+	return 0;
+}
+
+/*
+ *	nilfs_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the page in which the entry was found, and the entry itself
+ * (as a parameter - res_dir). Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct nilfs_dir_entry *
+nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
+		 struct page **res_page)
+{
+	const unsigned char *name = qstr->name;
+	int namelen = qstr->len;
+	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	struct nilfs_inode_info *ei = NILFS_I(dir);
+	struct nilfs_dir_entry *de;
+
+	if (npages == 0)
+		goto out;
+
+	/* OFFSET_CACHE */
+	*res_page = NULL;
+
+	start = ei->i_dir_start_lookup;
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = nilfs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct nilfs_dir_entry *)kaddr;
+			kaddr += nilfs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->rec_len == 0) {
+					nilfs_error(dir->i_sb, __func__,
+						"zero-length directory entry");
+					nilfs_put_page(page);
+					goto out;
+				}
+				if (nilfs_match(namelen, name, de))
+					goto found;
+				de = nilfs_next_entry(de);
+			}
+			nilfs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+		/* next page is past the blocks we've got */
+		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
+			nilfs_error(dir->i_sb, __func__,
+			       "dir %lu size %lld exceeds block count %llu",
+			       dir->i_ino, dir->i_size,
+			       (unsigned long long)dir->i_blocks);
+			goto out;
+		}
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	ei->i_dir_start_lookup = n;
+	return de;
+}
+
+struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
+{
+	struct page *page = nilfs_get_page(dir, 0);
+	struct nilfs_dir_entry *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = nilfs_next_entry(
+			(struct nilfs_dir_entry *)page_address(page));
+		*p = page;
+	}
+	return de;
+}
+
+ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
+{
+	ino_t res = 0;
+	struct nilfs_dir_entry *de;
+	struct page *page;
+
+	de = nilfs_find_entry(dir, qstr, &page);
+	if (de) {
+		res = le64_to_cpu(de->inode);
+		kunmap(page);
+		page_cache_release(page);
+	}
+	return res;
+}
+
+/* Releases the page */
+void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
+		    struct page *page, struct inode *inode)
+{
+	unsigned from = (char *) de - (char *) page_address(page);
+	unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
+	struct address_space *mapping = page->mapping;
+	int err;
+
+	lock_page(page);
+	err = nilfs_prepare_chunk(page, from, to);
+	BUG_ON(err);
+	de->inode = cpu_to_le64(inode->i_ino);
+	nilfs_set_de_type(de, inode);
+	nilfs_commit_chunk(page, mapping, from, to);
+	nilfs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+}
+
+/*
+ *	Parent is locked.
+ */
+int nilfs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned chunk_size = nilfs_chunk_size(dir);
+	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct nilfs_dir_entry *de;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	unsigned from, to;
+	int err;
+
+	/*
+	 * We take care of directory expansion in the same loop.
+	 * This code plays outside i_size, so it locks the page
+	 * to protect that region.
+	 */
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = nilfs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + nilfs_last_byte(dir, n);
+		de = (struct nilfs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				/* We hit i_size */
+				name_len = 0;
+				rec_len = chunk_size;
+				de->rec_len = nilfs_rec_len_to_disk(chunk_size);
+				de->inode = 0;
+				goto got_it;
+			}
+			if (de->rec_len == 0) {
+				nilfs_error(dir->i_sb, __func__,
+					    "zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (nilfs_match(namelen, name, de))
+				goto out_unlock;
+			name_len = NILFS_DIR_REC_LEN(de->name_len);
+			rec_len = nilfs_rec_len_from_disk(de->rec_len);
+			if (!de->inode && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct nilfs_dir_entry *)((char *)de + rec_len);
+		}
+		unlock_page(page);
+		nilfs_put_page(page);
+	}
+	BUG();
+	return -EINVAL;
+
+got_it:
+	from = (char *)de - (char *)page_address(page);
+	to = from + rec_len;
+	err = nilfs_prepare_chunk(page, from, to);
+	if (err)
+		goto out_unlock;
+	if (de->inode) {
+		struct nilfs_dir_entry *de1;
+
+		de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
+		de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len);
+		de->rec_len = nilfs_rec_len_to_disk(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode = cpu_to_le64(inode->i_ino);
+	nilfs_set_de_type(de, inode);
+	nilfs_commit_chunk(page, page->mapping, from, to);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	nilfs_mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	nilfs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+/*
+ * nilfs_delete_entry deletes a directory entry by merging it with the
+ * previous entry. Page is up-to-date. Releases the page.
+ */
+int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	char *kaddr = page_address(page);
+	unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
+	unsigned to = ((char *)dir - kaddr) +
+		nilfs_rec_len_from_disk(dir->rec_len);
+	struct nilfs_dir_entry *pde = NULL;
+	struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
+	int err;
+
+	while ((char *)de < (char *)dir) {
+		if (de->rec_len == 0) {
+			nilfs_error(inode->i_sb, __func__,
+				    "zero-length directory entry");
+			err = -EIO;
+			goto out;
+		}
+		pde = de;
+		de = nilfs_next_entry(de);
+	}
+	if (pde)
+		from = (char *)pde - (char *)page_address(page);
+	lock_page(page);
+	err = nilfs_prepare_chunk(page, from, to);
+	BUG_ON(err);
+	if (pde)
+		pde->rec_len = nilfs_rec_len_to_disk(to - from);
+	dir->inode = 0;
+	nilfs_commit_chunk(page, mapping, from, to);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+out:
+	nilfs_put_page(page);
+	return err;
+}
+
+/*
+ * Set the first fragment of directory.
+ */
+int nilfs_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
+	unsigned chunk_size = nilfs_chunk_size(inode);
+	struct nilfs_dir_entry *de;
+	int err;
+	void *kaddr;
+
+	if (!page)
+		return -ENOMEM;
+
+	err = nilfs_prepare_chunk(page, 0, chunk_size);
+	if (unlikely(err)) {
+		unlock_page(page);
+		goto fail;
+	}
+	kaddr = kmap_atomic(page);
+	memset(kaddr, 0, chunk_size);
+	de = (struct nilfs_dir_entry *)kaddr;
+	de->name_len = 1;
+	de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
+	memcpy(de->name, ".\0\0", 4);
+	de->inode = cpu_to_le64(inode->i_ino);
+	nilfs_set_de_type(de, inode);
+
+	de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
+	de->inode = cpu_to_le64(parent->i_ino);
+	memcpy(de->name, "..\0", 4);
+	nilfs_set_de_type(de, inode);
+	kunmap_atomic(kaddr);
+	nilfs_commit_chunk(page, mapping, 0, chunk_size);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int nilfs_empty_dir(struct inode *inode)
+{
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct nilfs_dir_entry *de;
+
+		page = nilfs_get_page(inode, i);
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct nilfs_dir_entry *)kaddr;
+		kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->rec_len == 0) {
+				nilfs_error(inode->i_sb, __func__,
+					    "zero-length directory entry "
+					    "(kaddr=%p, de=%p)\n", kaddr, de);
+				goto not_empty;
+			}
+			if (de->inode != 0) {
+				/* check for . and .. */
+				if (de->name[0] != '.')
+					goto not_empty;
+				if (de->name_len > 2)
+					goto not_empty;
+				if (de->name_len < 2) {
+					if (de->inode !=
+					    cpu_to_le64(inode->i_ino))
+						goto not_empty;
+				} else if (de->name[1] != '.')
+					goto not_empty;
+			}
+			de = nilfs_next_entry(de);
+		}
+		nilfs_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	nilfs_put_page(page);
+	return 0;
+}
+
+const struct file_operations nilfs_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= nilfs_readdir,
+	.unlocked_ioctl	= nilfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= nilfs_compat_ioctl,
+#endif	/* CONFIG_COMPAT */
+	.fsync		= nilfs_sync_file,
+
+};
diff --git a/kernel/fs/nilfs2/direct.c b/kernel/fs/nilfs2/direct.c
new file mode 100644
index 000000000..ebf89fd8a
--- /dev/null
+++ b/kernel/fs/nilfs2/direct.c
@@ -0,0 +1,386 @@
+/*
+ * direct.c - NILFS direct block pointer.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/errno.h>
+#include "nilfs.h"
+#include "page.h"
+#include "direct.h"
+#include "alloc.h"
+#include "dat.h"
+
+static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct)
+{
+	return (__le64 *)
+		((struct nilfs_direct_node *)direct->b_u.u_data + 1);
+}
+
+static inline __u64
+nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key)
+{
+	return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key));
+}
+
+static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct,
+					__u64 key, __u64 ptr)
+{
+	*(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr);
+}
+
+static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
+			       __u64 key, int level, __u64 *ptrp)
+{
+	__u64 ptr;
+
+	if (key > NILFS_DIRECT_KEY_MAX || level != 1)
+		return -ENOENT;
+	ptr = nilfs_direct_get_ptr(direct, key);
+	if (ptr == NILFS_BMAP_INVALID_PTR)
+		return -ENOENT;
+
+	*ptrp = ptr;
+	return 0;
+}
+
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
+				      __u64 key, __u64 *ptrp,
+				      unsigned maxblocks)
+{
+	struct inode *dat = NULL;
+	__u64 ptr, ptr2;
+	sector_t blocknr;
+	int ret, cnt;
+
+	if (key > NILFS_DIRECT_KEY_MAX)
+		return -ENOENT;
+	ptr = nilfs_direct_get_ptr(direct, key);
+	if (ptr == NILFS_BMAP_INVALID_PTR)
+		return -ENOENT;
+
+	if (NILFS_BMAP_USE_VBN(direct)) {
+		dat = nilfs_bmap_get_dat(direct);
+		ret = nilfs_dat_translate(dat, ptr, &blocknr);
+		if (ret < 0)
+			return ret;
+		ptr = blocknr;
+	}
+
+	maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+	for (cnt = 1; cnt < maxblocks &&
+		     (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
+		     NILFS_BMAP_INVALID_PTR;
+	     cnt++) {
+		if (dat) {
+			ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+			if (ret < 0)
+				return ret;
+			ptr2 = blocknr;
+		}
+		if (ptr2 != ptr + cnt)
+			break;
+	}
+	*ptrp = ptr;
+	return cnt;
+}
+
+static __u64
+nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
+{
+	__u64 ptr;
+
+	ptr = nilfs_bmap_find_target_seq(direct, key);
+	if (ptr != NILFS_BMAP_INVALID_PTR)
+		/* sequential access */
+		return ptr;
+	else
+		/* block group */
+		return nilfs_bmap_find_target_in_group(direct);
+}
+
+static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
+{
+	union nilfs_bmap_ptr_req req;
+	struct inode *dat = NULL;
+	struct buffer_head *bh;
+	int ret;
+
+	if (key > NILFS_DIRECT_KEY_MAX)
+		return -ENOENT;
+	if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR)
+		return -EEXIST;
+
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		req.bpr_ptr = nilfs_direct_find_target_v(bmap, key);
+		dat = nilfs_bmap_get_dat(bmap);
+	}
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
+	if (!ret) {
+		/* ptr must be a pointer to a buffer head. */
+		bh = (struct buffer_head *)((unsigned long)ptr);
+		set_buffer_nilfs_volatile(bh);
+
+		nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
+		nilfs_direct_set_ptr(bmap, key, req.bpr_ptr);
+
+		if (!nilfs_bmap_dirty(bmap))
+			nilfs_bmap_set_dirty(bmap);
+
+		if (NILFS_BMAP_USE_VBN(bmap))
+			nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
+
+		nilfs_inode_add_blocks(bmap->b_inode, 1);
+	}
+	return ret;
+}
+
+static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
+{
+	union nilfs_bmap_ptr_req req;
+	struct inode *dat;
+	int ret;
+
+	if (key > NILFS_DIRECT_KEY_MAX ||
+	    nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR)
+		return -ENOENT;
+
+	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
+	req.bpr_ptr = nilfs_direct_get_ptr(bmap, key);
+
+	ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
+	if (!ret) {
+		nilfs_bmap_commit_end_ptr(bmap, &req, dat);
+		nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_inode_sub_blocks(bmap->b_inode, 1);
+	}
+	return ret;
+}
+
+static int nilfs_direct_seek_key(const struct nilfs_bmap *direct, __u64 start,
+				 __u64 *keyp)
+{
+	__u64 key;
+
+	for (key = start; key <= NILFS_DIRECT_KEY_MAX; key++) {
+		if (nilfs_direct_get_ptr(direct, key) !=
+		    NILFS_BMAP_INVALID_PTR) {
+			*keyp = key;
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
+{
+	__u64 key, lastkey;
+
+	lastkey = NILFS_DIRECT_KEY_MAX + 1;
+	for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
+		if (nilfs_direct_get_ptr(direct, key) !=
+		    NILFS_BMAP_INVALID_PTR)
+			lastkey = key;
+
+	if (lastkey == NILFS_DIRECT_KEY_MAX + 1)
+		return -ENOENT;
+
+	*keyp = lastkey;
+
+	return 0;
+}
+
+static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
+{
+	return key > NILFS_DIRECT_KEY_MAX;
+}
+
+static int nilfs_direct_gather_data(struct nilfs_bmap *direct,
+				    __u64 *keys, __u64 *ptrs, int nitems)
+{
+	__u64 key;
+	__u64 ptr;
+	int n;
+
+	if (nitems > NILFS_DIRECT_NBLOCKS)
+		nitems = NILFS_DIRECT_NBLOCKS;
+	n = 0;
+	for (key = 0; key < nitems; key++) {
+		ptr = nilfs_direct_get_ptr(direct, key);
+		if (ptr != NILFS_BMAP_INVALID_PTR) {
+			keys[n] = key;
+			ptrs[n] = ptr;
+			n++;
+		}
+	}
+	return n;
+}
+
+int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
+				    __u64 key, __u64 *keys, __u64 *ptrs, int n)
+{
+	__le64 *dptrs;
+	int ret, i, j;
+
+	/* no need to allocate any resource for conversion */
+
+	/* delete */
+	ret = bmap->b_ops->bop_delete(bmap, key);
+	if (ret < 0)
+		return ret;
+
+	/* free resources */
+	if (bmap->b_ops->bop_clear != NULL)
+		bmap->b_ops->bop_clear(bmap);
+
+	/* convert */
+	dptrs = nilfs_direct_dptrs(bmap);
+	for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
+		if ((j < n) && (i == keys[j])) {
+			dptrs[i] = (i != key) ?
+				cpu_to_le64(ptrs[j]) :
+				NILFS_BMAP_INVALID_PTR;
+			j++;
+		} else
+			dptrs[i] = NILFS_BMAP_INVALID_PTR;
+	}
+
+	nilfs_direct_init(bmap);
+	return 0;
+}
+
+static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
+				  struct buffer_head *bh)
+{
+	struct nilfs_palloc_req oldreq, newreq;
+	struct inode *dat;
+	__u64 key;
+	__u64 ptr;
+	int ret;
+
+	if (!NILFS_BMAP_USE_VBN(bmap))
+		return 0;
+
+	dat = nilfs_bmap_get_dat(bmap);
+	key = nilfs_bmap_data_get_key(bmap, bh);
+	ptr = nilfs_direct_get_ptr(bmap, key);
+	if (!buffer_nilfs_volatile(bh)) {
+		oldreq.pr_entry_nr = ptr;
+		newreq.pr_entry_nr = ptr;
+		ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
+		if (ret < 0)
+			return ret;
+		nilfs_dat_commit_update(dat, &oldreq, &newreq,
+					bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
+		set_buffer_nilfs_volatile(bh);
+		nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr);
+	} else
+		ret = nilfs_dat_mark_dirty(dat, ptr);
+
+	return ret;
+}
+
+static int nilfs_direct_assign_v(struct nilfs_bmap *direct,
+				 __u64 key, __u64 ptr,
+				 struct buffer_head **bh,
+				 sector_t blocknr,
+				 union nilfs_binfo *binfo)
+{
+	struct inode *dat = nilfs_bmap_get_dat(direct);
+	union nilfs_bmap_ptr_req req;
+	int ret;
+
+	req.bpr_ptr = ptr;
+	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+	if (!ret) {
+		nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
+		binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
+		binfo->bi_v.bi_blkoff = cpu_to_le64(key);
+	}
+	return ret;
+}
+
+static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
+				 __u64 key, __u64 ptr,
+				 struct buffer_head **bh,
+				 sector_t blocknr,
+				 union nilfs_binfo *binfo)
+{
+	nilfs_direct_set_ptr(direct, key, blocknr);
+
+	binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
+	binfo->bi_dat.bi_level = 0;
+
+	return 0;
+}
+
+static int nilfs_direct_assign(struct nilfs_bmap *bmap,
+			       struct buffer_head **bh,
+			       sector_t blocknr,
+			       union nilfs_binfo *binfo)
+{
+	__u64 key;
+	__u64 ptr;
+
+	key = nilfs_bmap_data_get_key(bmap, *bh);
+	if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
+		printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
+		       (unsigned long long)key);
+		return -EINVAL;
+	}
+	ptr = nilfs_direct_get_ptr(bmap, key);
+	if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
+		printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
+		       (unsigned long long)ptr);
+		return -EINVAL;
+	}
+
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) :
+		nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo);
+}
+
+static const struct nilfs_bmap_operations nilfs_direct_ops = {
+	.bop_lookup		=	nilfs_direct_lookup,
+	.bop_lookup_contig	=	nilfs_direct_lookup_contig,
+	.bop_insert		=	nilfs_direct_insert,
+	.bop_delete		=	nilfs_direct_delete,
+	.bop_clear		=	NULL,
+
+	.bop_propagate		=	nilfs_direct_propagate,
+
+	.bop_lookup_dirty_buffers	=	NULL,
+
+	.bop_assign		=	nilfs_direct_assign,
+	.bop_mark		=	NULL,
+
+	.bop_seek_key		=	nilfs_direct_seek_key,
+	.bop_last_key		=	nilfs_direct_last_key,
+
+	.bop_check_insert	=	nilfs_direct_check_insert,
+	.bop_check_delete	=	NULL,
+	.bop_gather_data	=	nilfs_direct_gather_data,
+};
+
+
+int nilfs_direct_init(struct nilfs_bmap *bmap)
+{
+	bmap->b_ops = &nilfs_direct_ops;
+	return 0;
+}
diff --git a/kernel/fs/nilfs2/direct.h b/kernel/fs/nilfs2/direct.h
new file mode 100644
index 000000000..dc643de20
--- /dev/null
+++ b/kernel/fs/nilfs2/direct.h
@@ -0,0 +1,51 @@
+/*
+ * direct.h - NILFS direct block pointer.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_DIRECT_H
+#define _NILFS_DIRECT_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include "bmap.h"
+
+
+/**
+ * struct nilfs_direct_node - direct node
+ * @dn_flags: flags
+ * @dn_pad: padding
+ */
+struct nilfs_direct_node {
+	__u8 dn_flags;
+	__u8 pad[7];
+};
+
+#define NILFS_DIRECT_NBLOCKS	(NILFS_BMAP_SIZE / sizeof(__le64) - 1)
+#define NILFS_DIRECT_KEY_MIN	0
+#define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
+
+
+int nilfs_direct_init(struct nilfs_bmap *);
+int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
+				    __u64 *, int);
+
+
+#endif	/* _NILFS_DIRECT_H */
diff --git a/kernel/fs/nilfs2/export.h b/kernel/fs/nilfs2/export.h
new file mode 100644
index 000000000..19ccbf952
--- /dev/null
+++ b/kernel/fs/nilfs2/export.h
@@ -0,0 +1,25 @@
+#ifndef NILFS_EXPORT_H
+#define NILFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations nilfs_export_ops;
+
+/**
+ * struct nilfs_fid - NILFS file id type
+ * @cno: checkpoint number
+ * @ino: inode number
+ * @gen: file generation (version) for NFS
+ * @parent_gen: parent generation (version) for NFS
+ * @parent_ino: parent inode number
+ */
+struct nilfs_fid {
+	u64 cno;
+	u64 ino;
+	u32 gen;
+
+	u32 parent_gen;
+	u64 parent_ino;
+} __attribute__ ((packed));
+
+#endif
diff --git a/kernel/fs/nilfs2/file.c b/kernel/fs/nilfs2/file.c
new file mode 100644
index 000000000..54575e3cc
--- /dev/null
+++ b/kernel/fs/nilfs2/file.c
@@ -0,0 +1,165 @@
+/*
+ * file.c - NILFS regular file handling primitives including fsync().
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Amagai Yoshiji <amagai@osrg.net>,
+ *            Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/writeback.h>
+#include "nilfs.h"
+#include "segment.h"
+
+int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	/*
+	 * Called from fsync() system call
+	 * This is the only entry point that can catch write and synch
+	 * timing for both data blocks and intermediate blocks.
+	 *
+	 * This function should be implemented when the writeback function
+	 * will be implemented.
+	 */
+	struct the_nilfs *nilfs;
+	struct inode *inode = file->f_mapping->host;
+	int err = 0;
+
+	if (nilfs_inode_dirty(inode)) {
+		if (datasync)
+			err = nilfs_construct_dsync_segment(inode->i_sb, inode,
+							    start, end);
+		else
+			err = nilfs_construct_segment(inode->i_sb);
+	}
+
+	nilfs = inode->i_sb->s_fs_info;
+	if (!err)
+		err = nilfs_flush_device(nilfs);
+
+	return err;
+}
+
+static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct nilfs_transaction_info ti;
+	int ret = 0;
+
+	if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
+		return VM_FAULT_SIGBUS; /* -ENOSPC */
+
+	sb_start_pagefault(inode->i_sb);
+	lock_page(page);
+	if (page->mapping != inode->i_mapping ||
+	    page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
+		unlock_page(page);
+		ret = -EFAULT;	/* make the VM retry the fault */
+		goto out;
+	}
+
+	/*
+	 * check to see if the page is mapped already (no holes)
+	 */
+	if (PageMappedToDisk(page))
+		goto mapped;
+
+	if (page_has_buffers(page)) {
+		struct buffer_head *bh, *head;
+		int fully_mapped = 1;
+
+		bh = head = page_buffers(page);
+		do {
+			if (!buffer_mapped(bh)) {
+				fully_mapped = 0;
+				break;
+			}
+		} while (bh = bh->b_this_page, bh != head);
+
+		if (fully_mapped) {
+			SetPageMappedToDisk(page);
+			goto mapped;
+		}
+	}
+	unlock_page(page);
+
+	/*
+	 * fill hole blocks
+	 */
+	ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
+	/* never returns -ENOMEM, but may return -ENOSPC */
+	if (unlikely(ret))
+		goto out;
+
+	file_update_time(vma->vm_file);
+	ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
+	if (ret) {
+		nilfs_transaction_abort(inode->i_sb);
+		goto out;
+	}
+	nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
+	nilfs_transaction_commit(inode->i_sb);
+
+ mapped:
+	wait_for_stable_page(page);
+ out:
+	sb_end_pagefault(inode->i_sb);
+	return block_page_mkwrite_return(ret);
+}
+
+static const struct vm_operations_struct nilfs_file_vm_ops = {
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= nilfs_page_mkwrite,
+};
+
+static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+	vma->vm_ops = &nilfs_file_vm_ops;
+	return 0;
+}
+
+/*
+ * We have mostly NULL's here: the current defaults are ok for
+ * the nilfs filesystem.
+ */
+const struct file_operations nilfs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.unlocked_ioctl	= nilfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= nilfs_compat_ioctl,
+#endif	/* CONFIG_COMPAT */
+	.mmap		= nilfs_file_mmap,
+	.open		= generic_file_open,
+	/* .release	= nilfs_release_file, */
+	.fsync		= nilfs_sync_file,
+	.splice_read	= generic_file_splice_read,
+};
+
+const struct inode_operations nilfs_file_inode_operations = {
+	.setattr	= nilfs_setattr,
+	.permission     = nilfs_permission,
+	.fiemap		= nilfs_fiemap,
+};
+
+/* end of file */
diff --git a/kernel/fs/nilfs2/gcinode.c b/kernel/fs/nilfs2/gcinode.c
new file mode 100644
index 000000000..748ca2389
--- /dev/null
+++ b/kernel/fs/nilfs2/gcinode.c
@@ -0,0 +1,197 @@
+/*
+ * gcinode.c - dummy inodes to buffer blocks for garbage collection
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
+ *            and Ryusuke Konishi <ryusuke@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ *
+ */
+/*
+ * This file adds the cache of on-disk blocks to be moved in garbage
+ * collection.  The disk blocks are held with dummy inodes (called
+ * gcinodes), and this file provides lookup function of the dummy
+ * inodes and their buffer read function.
+ *
+ * Buffers and pages held by the dummy inodes will be released each
+ * time after they are copied to a new log.  Dirty blocks made on the
+ * current generation and the blocks to be moved by GC never overlap
+ * because the dirty blocks make a new generation; they rather must be
+ * written individually.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include "nilfs.h"
+#include "btree.h"
+#include "btnode.h"
+#include "page.h"
+#include "mdt.h"
+#include "dat.h"
+#include "ifile.h"
+
+/*
+ * nilfs_gccache_submit_read_data() - add data buffer and submit read request
+ * @inode - gc inode
+ * @blkoff - dummy offset treated as the key for the page cache
+ * @pbn - physical block number of the block
+ * @vbn - virtual block number of the block, 0 for non-virtual block
+ * @out_bh - indirect pointer to a buffer_head struct to receive the results
+ *
+ * Description: nilfs_gccache_submit_read_data() registers the data buffer
+ * specified by @pbn to the GC pagecache with the key @blkoff.
+ * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
+ *
+ * Return Value: On success, 0 is returned. On Error, one of the following
+ * negative error code is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The block specified with @pbn does not exist.
+ */
+int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
+				   sector_t pbn, __u64 vbn,
+				   struct buffer_head **out_bh)
+{
+	struct buffer_head *bh;
+	int err;
+
+	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
+	if (unlikely(!bh))
+		return -ENOMEM;
+
+	if (buffer_uptodate(bh))
+		goto out;
+
+	if (pbn == 0) {
+		struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+		err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
+		if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
+			brelse(bh);
+			goto failed;
+		}
+	}
+
+	lock_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		goto out;
+	}
+
+	if (!buffer_mapped(bh)) {
+		bh->b_bdev = inode->i_sb->s_bdev;
+		set_buffer_mapped(bh);
+	}
+	bh->b_blocknr = pbn;
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	if (vbn)
+		bh->b_blocknr = vbn;
+ out:
+	err = 0;
+	*out_bh = bh;
+
+ failed:
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	return err;
+}
+
+/*
+ * nilfs_gccache_submit_read_node() - add node buffer and submit read request
+ * @inode - gc inode
+ * @pbn - physical block number for the block
+ * @vbn - virtual block number for the block
+ * @out_bh - indirect pointer to a buffer_head struct to receive the results
+ *
+ * Description: nilfs_gccache_submit_read_node() registers the node buffer
+ * specified by @vbn to the GC pagecache.  @pbn can be supplied by the
+ * caller to avoid translation of the disk block address.
+ *
+ * Return Value: On success, 0 is returned. On Error, one of the following
+ * negative error code is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
+				   __u64 vbn, struct buffer_head **out_bh)
+{
+	int ret;
+
+	ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+					vbn ? : pbn, pbn, READ, out_bh, &pbn);
+	if (ret == -EEXIST) /* internal code (cache hit) */
+		ret = 0;
+	return ret;
+}
+
+int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
+{
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh))
+		return -EIO;
+	if (buffer_dirty(bh))
+		return -EEXIST;
+
+	if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) {
+		clear_buffer_uptodate(bh);
+		return -EIO;
+	}
+	mark_buffer_dirty(bh);
+	return 0;
+}
+
+int nilfs_init_gcinode(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	inode->i_mode = S_IFREG;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+	inode->i_mapping->a_ops = &empty_aops;
+
+	ii->i_flags = 0;
+	nilfs_bmap_init_gc(ii->i_bmap);
+
+	return 0;
+}
+
+/**
+ * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
+ */
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
+{
+	struct list_head *head = &nilfs->ns_gc_inodes;
+	struct nilfs_inode_info *ii;
+
+	while (!list_empty(head)) {
+		ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
+		list_del_init(&ii->i_dirty);
+		truncate_inode_pages(&ii->vfs_inode.i_data, 0);
+		nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+		iput(&ii->vfs_inode);
+	}
+}
diff --git a/kernel/fs/nilfs2/ifile.c b/kernel/fs/nilfs2/ifile.c
new file mode 100644
index 000000000..6548c7851
--- /dev/null
+++ b/kernel/fs/nilfs2/ifile.c
@@ -0,0 +1,227 @@
+/*
+ * ifile.c - NILFS inode file
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Amagai Yoshiji <amagai@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "alloc.h"
+#include "ifile.h"
+
+/**
+ * struct nilfs_ifile_info - on-memory private data of ifile
+ * @mi: on-memory private data of metadata file
+ * @palloc_cache: persistent object allocator cache of ifile
+ */
+struct nilfs_ifile_info {
+	struct nilfs_mdt_info mi;
+	struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile)
+{
+	return (struct nilfs_ifile_info *)NILFS_MDT(ifile);
+}
+
+/**
+ * nilfs_ifile_create_inode - create a new disk inode
+ * @ifile: ifile inode
+ * @out_ino: pointer to a variable to store inode number
+ * @out_bh: buffer_head contains newly allocated disk inode
+ *
+ * Return Value: On success, 0 is returned and the newly allocated inode
+ * number is stored in the place pointed by @ino, and buffer_head pointer
+ * that contains newly allocated disk inode structure is stored in the
+ * place pointed by @out_bh
+ * On error, one of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOSPC - No inode left.
+ */
+int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
+			     struct buffer_head **out_bh)
+{
+	struct nilfs_palloc_req req;
+	int ret;
+
+	req.pr_entry_nr = 0;  /* 0 says find free inode from beginning of
+				 a group. dull code!! */
+	req.pr_entry_bh = NULL;
+
+	ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+	if (!ret) {
+		ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
+						   &req.pr_entry_bh);
+		if (ret < 0)
+			nilfs_palloc_abort_alloc_entry(ifile, &req);
+	}
+	if (ret < 0) {
+		brelse(req.pr_entry_bh);
+		return ret;
+	}
+	nilfs_palloc_commit_alloc_entry(ifile, &req);
+	mark_buffer_dirty(req.pr_entry_bh);
+	nilfs_mdt_mark_dirty(ifile);
+	*out_ino = (ino_t)req.pr_entry_nr;
+	*out_bh = req.pr_entry_bh;
+	return 0;
+}
+
+/**
+ * nilfs_ifile_delete_inode - delete a disk inode
+ * @ifile: ifile inode
+ * @ino: inode number
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The inode number @ino have not been allocated.
+ */
+int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
+{
+	struct nilfs_palloc_req req = {
+		.pr_entry_nr = ino, .pr_entry_bh = NULL
+	};
+	struct nilfs_inode *raw_inode;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_palloc_prepare_free_entry(ifile, &req);
+	if (!ret) {
+		ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0,
+						   &req.pr_entry_bh);
+		if (ret < 0)
+			nilfs_palloc_abort_free_entry(ifile, &req);
+	}
+	if (ret < 0) {
+		brelse(req.pr_entry_bh);
+		return ret;
+	}
+
+	kaddr = kmap_atomic(req.pr_entry_bh->b_page);
+	raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
+						 req.pr_entry_bh, kaddr);
+	raw_inode->i_flags = 0;
+	kunmap_atomic(kaddr);
+
+	mark_buffer_dirty(req.pr_entry_bh);
+	brelse(req.pr_entry_bh);
+
+	nilfs_palloc_commit_free_entry(ifile, &req);
+
+	return 0;
+}
+
+int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
+				struct buffer_head **out_bh)
+{
+	struct super_block *sb = ifile->i_sb;
+	int err;
+
+	if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
+		nilfs_error(sb, __func__, "bad inode number: %lu",
+			    (unsigned long) ino);
+		return -EINVAL;
+	}
+
+	err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
+	if (unlikely(err))
+		nilfs_warning(sb, __func__, "unable to read inode: %lu",
+			      (unsigned long) ino);
+	return err;
+}
+
+/**
+ * nilfs_ifile_count_free_inodes - calculate free inodes count
+ * @ifile: ifile inode
+ * @nmaxinodes: current maximum of available inodes count [out]
+ * @nfreeinodes: free inodes count [out]
+ */
+int nilfs_ifile_count_free_inodes(struct inode *ifile,
+				    u64 *nmaxinodes, u64 *nfreeinodes)
+{
+	u64 nused;
+	int err;
+
+	*nmaxinodes = 0;
+	*nfreeinodes = 0;
+
+	nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
+	err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
+	if (likely(!err))
+		*nfreeinodes = *nmaxinodes - nused;
+	return err;
+}
+
+/**
+ * nilfs_ifile_read - read or get ifile inode
+ * @sb: super block instance
+ * @root: root object
+ * @inode_size: size of an inode
+ * @raw_inode: on-disk ifile inode
+ * @inodep: buffer to store the inode
+ */
+int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
+		     size_t inode_size, struct nilfs_inode *raw_inode,
+		     struct inode **inodep)
+{
+	struct inode *ifile;
+	int err;
+
+	ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
+	if (unlikely(!ifile))
+		return -ENOMEM;
+	if (!(ifile->i_state & I_NEW))
+		goto out;
+
+	err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
+			     sizeof(struct nilfs_ifile_info));
+	if (err)
+		goto failed;
+
+	err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+	if (err)
+		goto failed;
+
+	nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
+
+	err = nilfs_read_inode_common(ifile, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(ifile);
+ out:
+	*inodep = ifile;
+	return 0;
+ failed:
+	iget_failed(ifile);
+	return err;
+}
diff --git a/kernel/fs/nilfs2/ifile.h b/kernel/fs/nilfs2/ifile.h
new file mode 100644
index 000000000..679674d13
--- /dev/null
+++ b/kernel/fs/nilfs2/ifile.h
@@ -0,0 +1,58 @@
+/*
+ * ifile.h - NILFS inode file
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Amagai Yoshiji <amagai@osrg.net>
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#ifndef _NILFS_IFILE_H
+#define _NILFS_IFILE_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "mdt.h"
+#include "alloc.h"
+
+
+static inline struct nilfs_inode *
+nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
+{
+	void *kaddr = kmap(ibh->b_page);
+	return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
+}
+
+static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
+					   struct buffer_head *ibh)
+{
+	kunmap(ibh->b_page);
+}
+
+int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
+int nilfs_ifile_delete_inode(struct inode *, ino_t);
+int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
+
+int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
+
+int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
+		     size_t inode_size, struct nilfs_inode *raw_inode,
+		     struct inode **inodep);
+
+#endif	/* _NILFS_IFILE_H */
diff --git a/kernel/fs/nilfs2/inode.c b/kernel/fs/nilfs2/inode.c
new file mode 100644
index 000000000..258d9fe25
--- /dev/null
+++ b/kernel/fs/nilfs2/inode.c
@@ -0,0 +1,1135 @@
+/*
+ * inode.c - NILFS inode operations.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/gfp.h>
+#include <linux/mpage.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/uio.h>
+#include "nilfs.h"
+#include "btnode.h"
+#include "segment.h"
+#include "page.h"
+#include "mdt.h"
+#include "cpfile.h"
+#include "ifile.h"
+
+/**
+ * struct nilfs_iget_args - arguments used during comparison between inodes
+ * @ino: inode number
+ * @cno: checkpoint number
+ * @root: pointer on NILFS root object (mounted checkpoint)
+ * @for_gc: inode for GC flag
+ */
+struct nilfs_iget_args {
+	u64 ino;
+	__u64 cno;
+	struct nilfs_root *root;
+	int for_gc;
+};
+
+static int nilfs_iget_test(struct inode *inode, void *opaque);
+
+void nilfs_inode_add_blocks(struct inode *inode, int n)
+{
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+
+	inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
+	if (root)
+		atomic64_add(n, &root->blocks_count);
+}
+
+void nilfs_inode_sub_blocks(struct inode *inode, int n)
+{
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+
+	inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
+	if (root)
+		atomic64_sub(n, &root->blocks_count);
+}
+
+/**
+ * nilfs_get_block() - get a file block on the filesystem (callback function)
+ * @inode - inode struct of the target file
+ * @blkoff - file block number
+ * @bh_result - buffer head to be mapped on
+ * @create - indicate whether allocating the block or not when it has not
+ *      been allocated yet.
+ *
+ * This function does not issue actual read request of the specified data
+ * block. It is done by VFS.
+ */
+int nilfs_get_block(struct inode *inode, sector_t blkoff,
+		    struct buffer_head *bh_result, int create)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	__u64 blknum = 0;
+	int err = 0, ret;
+	unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
+
+	down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+	ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
+	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+	if (ret >= 0) {	/* found */
+		map_bh(bh_result, inode->i_sb, blknum);
+		if (ret > 0)
+			bh_result->b_size = (ret << inode->i_blkbits);
+		goto out;
+	}
+	/* data block was not found */
+	if (ret == -ENOENT && create) {
+		struct nilfs_transaction_info ti;
+
+		bh_result->b_blocknr = 0;
+		err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
+		if (unlikely(err))
+			goto out;
+		err = nilfs_bmap_insert(ii->i_bmap, blkoff,
+					(unsigned long)bh_result);
+		if (unlikely(err != 0)) {
+			if (err == -EEXIST) {
+				/*
+				 * The get_block() function could be called
+				 * from multiple callers for an inode.
+				 * However, the page having this block must
+				 * be locked in this case.
+				 */
+				printk(KERN_WARNING
+				       "nilfs_get_block: a race condition "
+				       "while inserting a data block. "
+				       "(inode number=%lu, file block "
+				       "offset=%llu)\n",
+				       inode->i_ino,
+				       (unsigned long long)blkoff);
+				err = 0;
+			}
+			nilfs_transaction_abort(inode->i_sb);
+			goto out;
+		}
+		nilfs_mark_inode_dirty_sync(inode);
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+		/* Error handling should be detailed */
+		set_buffer_new(bh_result);
+		set_buffer_delay(bh_result);
+		map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
+						      to proper value */
+	} else if (ret == -ENOENT) {
+		/* not found is not error (e.g. hole); must return without
+		   the mapped state flag. */
+		;
+	} else {
+		err = ret;
+	}
+
+ out:
+	return err;
+}
+
+/**
+ * nilfs_readpage() - implement readpage() method of nilfs_aops {}
+ * address_space_operations.
+ * @file - file struct of the file to be read
+ * @page - the page to be read
+ */
+static int nilfs_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, nilfs_get_block);
+}
+
+/**
+ * nilfs_readpages() - implement readpages() method of nilfs_aops {}
+ * address_space_operations.
+ * @file - file struct of the file to be read
+ * @mapping - address_space struct used for reading multiple pages
+ * @pages - the pages to be read
+ * @nr_pages - number of pages to be read
+ */
+static int nilfs_readpages(struct file *file, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
+}
+
+static int nilfs_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	int err = 0;
+
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		nilfs_clear_dirty_pages(mapping, false);
+		return -EROFS;
+	}
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		err = nilfs_construct_dsync_segment(inode->i_sb, inode,
+						    wbc->range_start,
+						    wbc->range_end);
+	return err;
+}
+
+static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	int err;
+
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/*
+		 * It means that filesystem was remounted in read-only
+		 * mode because of error or metadata corruption. But we
+		 * have dirty pages that try to be flushed in background.
+		 * So, here we simply discard this dirty page.
+		 */
+		nilfs_clear_dirty_page(page, false);
+		unlock_page(page);
+		return -EROFS;
+	}
+
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		err = nilfs_construct_segment(inode->i_sb);
+		if (unlikely(err))
+			return err;
+	} else if (wbc->for_reclaim)
+		nilfs_flush_segment(inode->i_sb, inode->i_ino);
+
+	return 0;
+}
+
+static int nilfs_set_page_dirty(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int ret = __set_page_dirty_nobuffers(page);
+
+	if (page_has_buffers(page)) {
+		unsigned nr_dirty = 0;
+		struct buffer_head *bh, *head;
+
+		/*
+		 * This page is locked by callers, and no other thread
+		 * concurrently marks its buffers dirty since they are
+		 * only dirtied through routines in fs/buffer.c in
+		 * which call sites of mark_buffer_dirty are protected
+		 * by page lock.
+		 */
+		bh = head = page_buffers(page);
+		do {
+			/* Do not mark hole blocks dirty */
+			if (buffer_dirty(bh) || !buffer_mapped(bh))
+				continue;
+
+			set_buffer_dirty(bh);
+			nr_dirty++;
+		} while (bh = bh->b_this_page, bh != head);
+
+		if (nr_dirty)
+			nilfs_set_file_dirty(inode, nr_dirty);
+	} else if (ret) {
+		unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		nilfs_set_file_dirty(inode, nr_dirty);
+	}
+	return ret;
+}
+
+void nilfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		nilfs_truncate(inode);
+	}
+}
+
+static int nilfs_write_begin(struct file *file, struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata)
+
+{
+	struct inode *inode = mapping->host;
+	int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
+
+	if (unlikely(err))
+		return err;
+
+	err = block_write_begin(mapping, pos, len, flags, pagep,
+				nilfs_get_block);
+	if (unlikely(err)) {
+		nilfs_write_failed(mapping, pos + len);
+		nilfs_transaction_abort(inode->i_sb);
+	}
+	return err;
+}
+
+static int nilfs_write_end(struct file *file, struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned nr_dirty;
+	int err;
+
+	nr_dirty = nilfs_page_count_clean_buffers(page, start,
+						  start + copied);
+	copied = generic_write_end(file, mapping, pos, len, copied, page,
+				   fsdata);
+	nilfs_set_file_dirty(inode, nr_dirty);
+	err = nilfs_transaction_commit(inode->i_sb);
+	return err ? : copied;
+}
+
+static ssize_t
+nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = file->f_mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t size;
+
+	if (iov_iter_rw(iter) == WRITE)
+		return 0;
+
+	/* Needs synchronization with the cleaner */
+	size = blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely(iov_iter_rw(iter) == WRITE && size < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + count;
+
+		if (end > isize)
+			nilfs_write_failed(mapping, end);
+	}
+
+	return size;
+}
+
+const struct address_space_operations nilfs_aops = {
+	.writepage		= nilfs_writepage,
+	.readpage		= nilfs_readpage,
+	.writepages		= nilfs_writepages,
+	.set_page_dirty		= nilfs_set_page_dirty,
+	.readpages		= nilfs_readpages,
+	.write_begin		= nilfs_write_begin,
+	.write_end		= nilfs_write_end,
+	/* .releasepage		= nilfs_releasepage, */
+	.invalidatepage		= block_invalidatepage,
+	.direct_IO		= nilfs_direct_IO,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+};
+
+static int nilfs_insert_inode_locked(struct inode *inode,
+				     struct nilfs_root *root,
+				     unsigned long ino)
+{
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+	};
+
+	return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
+}
+
+struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct inode *inode;
+	struct nilfs_inode_info *ii;
+	struct nilfs_root *root;
+	int err = -ENOMEM;
+	ino_t ino;
+
+	inode = new_inode(sb);
+	if (unlikely(!inode))
+		goto failed;
+
+	mapping_set_gfp_mask(inode->i_mapping,
+			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+
+	root = NILFS_I(dir)->i_root;
+	ii = NILFS_I(inode);
+	ii->i_state = 1 << NILFS_I_NEW;
+	ii->i_root = root;
+
+	err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
+	if (unlikely(err))
+		goto failed_ifile_create_inode;
+	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
+
+	atomic64_inc(&root->inodes_count);
+	inode_init_owner(inode, dir, mode);
+	inode->i_ino = ino;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
+		err = nilfs_bmap_read(ii->i_bmap, NULL);
+		if (err < 0)
+			goto failed_after_creation;
+
+		set_bit(NILFS_I_BMAP, &ii->i_state);
+		/* No lock is needed; iget() ensures it. */
+	}
+
+	ii->i_flags = nilfs_mask_flags(
+		mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
+
+	/* ii->i_file_acl = 0; */
+	/* ii->i_dir_acl = 0; */
+	ii->i_dir_start_lookup = 0;
+	nilfs_set_inode_flags(inode);
+	spin_lock(&nilfs->ns_next_gen_lock);
+	inode->i_generation = nilfs->ns_next_generation++;
+	spin_unlock(&nilfs->ns_next_gen_lock);
+	if (nilfs_insert_inode_locked(inode, root, ino) < 0) {
+		err = -EIO;
+		goto failed_after_creation;
+	}
+
+	err = nilfs_init_acl(inode, dir);
+	if (unlikely(err))
+		goto failed_after_creation; /* never occur. When supporting
+				    nilfs_init_acl(), proper cancellation of
+				    above jobs should be considered */
+
+	return inode;
+
+ failed_after_creation:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);  /* raw_inode will be deleted through
+			 nilfs_evict_inode() */
+	goto failed;
+
+ failed_ifile_create_inode:
+	make_bad_inode(inode);
+	iput(inode);  /* if i_nlink == 1, generic_forget_inode() will be
+			 called */
+ failed:
+	return ERR_PTR(err);
+}
+
+void nilfs_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = NILFS_I(inode)->i_flags;
+	unsigned int new_fl = 0;
+
+	if (flags & FS_SYNC_FL)
+		new_fl |= S_SYNC;
+	if (flags & FS_APPEND_FL)
+		new_fl |= S_APPEND;
+	if (flags & FS_IMMUTABLE_FL)
+		new_fl |= S_IMMUTABLE;
+	if (flags & FS_NOATIME_FL)
+		new_fl |= S_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		new_fl |= S_DIRSYNC;
+	inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE |
+			S_NOATIME | S_DIRSYNC);
+}
+
+int nilfs_read_inode_common(struct inode *inode,
+			    struct nilfs_inode *raw_inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int err;
+
+	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+	i_uid_write(inode, le32_to_cpu(raw_inode->i_uid));
+	i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
+	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
+	inode->i_size = le64_to_cpu(raw_inode->i_size);
+	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+	inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
+	inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+	inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+	inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
+	inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+	if (inode->i_nlink == 0)
+		return -ESTALE; /* this inode is deleted */
+
+	inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
+	ii->i_flags = le32_to_cpu(raw_inode->i_flags);
+#if 0
+	ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+	ii->i_dir_acl = S_ISREG(inode->i_mode) ?
+		0 : le32_to_cpu(raw_inode->i_dir_acl);
+#endif
+	ii->i_dir_start_lookup = 0;
+	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+
+	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)) {
+		err = nilfs_bmap_read(ii->i_bmap, raw_inode);
+		if (err < 0)
+			return err;
+		set_bit(NILFS_I_BMAP, &ii->i_state);
+		/* No lock is needed; iget() ensures it. */
+	}
+	return 0;
+}
+
+static int __nilfs_read_inode(struct super_block *sb,
+			      struct nilfs_root *root, unsigned long ino,
+			      struct inode *inode)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct buffer_head *bh;
+	struct nilfs_inode *raw_inode;
+	int err;
+
+	down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+	err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
+	if (unlikely(err))
+		goto bad_inode;
+
+	raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
+
+	err = nilfs_read_inode_common(inode, raw_inode);
+	if (err)
+		goto failed_unmap;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &nilfs_file_inode_operations;
+		inode->i_fop = &nilfs_file_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &nilfs_dir_inode_operations;
+		inode->i_fop = &nilfs_dir_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &nilfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else {
+		inode->i_op = &nilfs_special_inode_operations;
+		init_special_inode(
+			inode, inode->i_mode,
+			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+	}
+	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+	brelse(bh);
+	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+	nilfs_set_inode_flags(inode);
+	mapping_set_gfp_mask(inode->i_mapping,
+			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+	return 0;
+
+ failed_unmap:
+	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+	brelse(bh);
+
+ bad_inode:
+	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+	return err;
+}
+
+static int nilfs_iget_test(struct inode *inode, void *opaque)
+{
+	struct nilfs_iget_args *args = opaque;
+	struct nilfs_inode_info *ii;
+
+	if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
+		return 0;
+
+	ii = NILFS_I(inode);
+	if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
+		return !args->for_gc;
+
+	return args->for_gc && args->cno == ii->i_cno;
+}
+
+static int nilfs_iget_set(struct inode *inode, void *opaque)
+{
+	struct nilfs_iget_args *args = opaque;
+
+	inode->i_ino = args->ino;
+	if (args->for_gc) {
+		NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
+		NILFS_I(inode)->i_cno = args->cno;
+		NILFS_I(inode)->i_root = NULL;
+	} else {
+		if (args->root && args->ino == NILFS_ROOT_INO)
+			nilfs_get_root(args->root);
+		NILFS_I(inode)->i_root = args->root;
+	}
+	return 0;
+}
+
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+			    unsigned long ino)
+{
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+	};
+
+	return ilookup5(sb, ino, nilfs_iget_test, &args);
+}
+
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+				unsigned long ino)
+{
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+	};
+
+	return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
+}
+
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino)
+{
+	struct inode *inode;
+	int err;
+
+	inode = nilfs_iget_locked(sb, root, ino);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = __nilfs_read_inode(sb, root, ino, inode);
+	if (unlikely(err)) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+	unlock_new_inode(inode);
+	return inode;
+}
+
+struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
+				__u64 cno)
+{
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = NULL, .cno = cno, .for_gc = 1
+	};
+	struct inode *inode;
+	int err;
+
+	inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = nilfs_init_gcinode(inode);
+	if (unlikely(err)) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+	unlock_new_inode(inode);
+	return inode;
+}
+
+void nilfs_write_inode_common(struct inode *inode,
+			      struct nilfs_inode *raw_inode, int has_bmap)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+	raw_inode->i_uid = cpu_to_le32(i_uid_read(inode));
+	raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
+	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+	raw_inode->i_size = cpu_to_le64(inode->i_size);
+	raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
+
+	raw_inode->i_flags = cpu_to_le32(ii->i_flags);
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+
+	if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
+		struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+		/* zero-fill unused portion in the case of super root block */
+		raw_inode->i_xattr = 0;
+		raw_inode->i_pad = 0;
+		memset((void *)raw_inode + sizeof(*raw_inode), 0,
+		       nilfs->ns_inode_size - sizeof(*raw_inode));
+	}
+
+	if (has_bmap)
+		nilfs_bmap_write(ii->i_bmap, raw_inode);
+	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		raw_inode->i_device_code =
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
+	/* When extending inode, nilfs->ns_inode_size should be checked
+	   for substitutions of appended fields */
+}
+
+void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
+{
+	ino_t ino = inode->i_ino;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct inode *ifile = ii->i_root->ifile;
+	struct nilfs_inode *raw_inode;
+
+	raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
+
+	if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
+		memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
+	if (flags & I_DIRTY_DATASYNC)
+		set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
+
+	nilfs_write_inode_common(inode, raw_inode, 0);
+		/* XXX: call with has_bmap = 0 is a workaround to avoid
+		   deadlock of bmap. This delays update of i_bmap to just
+		   before writing */
+	nilfs_ifile_unmap_inode(ifile, ino, ibh);
+}
+
+#define NILFS_MAX_TRUNCATE_BLOCKS	16384  /* 64MB for 4KB block */
+
+static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
+				unsigned long from)
+{
+	__u64 b;
+	int ret;
+
+	if (!test_bit(NILFS_I_BMAP, &ii->i_state))
+		return;
+repeat:
+	ret = nilfs_bmap_last_key(ii->i_bmap, &b);
+	if (ret == -ENOENT)
+		return;
+	else if (ret < 0)
+		goto failed;
+
+	if (b < from)
+		return;
+
+	b -= min_t(__u64, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
+	ret = nilfs_bmap_truncate(ii->i_bmap, b);
+	nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
+	if (!ret || (ret == -ENOMEM &&
+		     nilfs_bmap_truncate(ii->i_bmap, b) == 0))
+		goto repeat;
+
+failed:
+	nilfs_warning(ii->vfs_inode.i_sb, __func__,
+		      "failed to truncate bmap (ino=%lu, err=%d)",
+		      ii->vfs_inode.i_ino, ret);
+}
+
+void nilfs_truncate(struct inode *inode)
+{
+	unsigned long blkoff;
+	unsigned int blocksize;
+	struct nilfs_transaction_info ti;
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	if (!test_bit(NILFS_I_BMAP, &ii->i_state))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+
+	blocksize = sb->s_blocksize;
+	blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
+	nilfs_transaction_begin(sb, &ti, 0); /* never fails */
+
+	block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
+
+	nilfs_truncate_bmap(ii, blkoff);
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	if (IS_SYNC(inode))
+		nilfs_set_transaction_flag(NILFS_TI_SYNC);
+
+	nilfs_mark_inode_dirty(inode);
+	nilfs_set_file_dirty(inode, 0);
+	nilfs_transaction_commit(sb);
+	/* May construct a logical segment and may fail in sync mode.
+	   But truncate has no return value. */
+}
+
+static void nilfs_clear_inode(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+	/*
+	 * Free resources allocated in nilfs_read_inode(), here.
+	 */
+	BUG_ON(!list_empty(&ii->i_dirty));
+	brelse(ii->i_bh);
+	ii->i_bh = NULL;
+
+	if (mdi && mdi->mi_palloc_cache)
+		nilfs_palloc_destroy_cache(inode);
+
+	if (test_bit(NILFS_I_BMAP, &ii->i_state))
+		nilfs_bmap_clear(ii->i_bmap);
+
+	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+
+	if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
+		nilfs_put_root(ii->i_root);
+}
+
+void nilfs_evict_inode(struct inode *inode)
+{
+	struct nilfs_transaction_info ti;
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int ret;
+
+	if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
+		truncate_inode_pages_final(&inode->i_data);
+		clear_inode(inode);
+		nilfs_clear_inode(inode);
+		return;
+	}
+	nilfs_transaction_begin(sb, &ti, 0); /* never fails */
+
+	truncate_inode_pages_final(&inode->i_data);
+
+	/* TODO: some of the following operations may fail.  */
+	nilfs_truncate_bmap(ii, 0);
+	nilfs_mark_inode_dirty(inode);
+	clear_inode(inode);
+
+	ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
+	if (!ret)
+		atomic64_dec(&ii->i_root->inodes_count);
+
+	nilfs_clear_inode(inode);
+
+	if (IS_SYNC(inode))
+		nilfs_set_transaction_flag(NILFS_TI_SYNC);
+	nilfs_transaction_commit(sb);
+	/* May construct a logical segment and may fail in sync mode.
+	   But delete_inode has no return value. */
+}
+
+int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct nilfs_transaction_info ti;
+	struct inode *inode = d_inode(dentry);
+	struct super_block *sb = inode->i_sb;
+	int err;
+
+	err = inode_change_ok(inode, iattr);
+	if (err)
+		return err;
+
+	err = nilfs_transaction_begin(sb, &ti, 0);
+	if (unlikely(err))
+		return err;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		inode_dio_wait(inode);
+		truncate_setsize(inode, iattr->ia_size);
+		nilfs_truncate(inode);
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	if (iattr->ia_valid & ATTR_MODE) {
+		err = nilfs_acl_chmod(inode);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	return nilfs_transaction_commit(sb);
+
+out_err:
+	nilfs_transaction_abort(sb);
+	return err;
+}
+
+int nilfs_permission(struct inode *inode, int mask)
+{
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+	if ((mask & MAY_WRITE) && root &&
+	    root->cno != NILFS_CPTREE_CURRENT_CNO)
+		return -EROFS; /* snapshot is not writable */
+
+	return generic_permission(inode, mask);
+}
+
+int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int err;
+
+	spin_lock(&nilfs->ns_inode_lock);
+	if (ii->i_bh == NULL) {
+		spin_unlock(&nilfs->ns_inode_lock);
+		err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
+						  inode->i_ino, pbh);
+		if (unlikely(err))
+			return err;
+		spin_lock(&nilfs->ns_inode_lock);
+		if (ii->i_bh == NULL)
+			ii->i_bh = *pbh;
+		else {
+			brelse(*pbh);
+			*pbh = ii->i_bh;
+		}
+	} else
+		*pbh = ii->i_bh;
+
+	get_bh(*pbh);
+	spin_unlock(&nilfs->ns_inode_lock);
+	return 0;
+}
+
+int nilfs_inode_dirty(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	int ret = 0;
+
+	if (!list_empty(&ii->i_dirty)) {
+		spin_lock(&nilfs->ns_inode_lock);
+		ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
+			test_bit(NILFS_I_BUSY, &ii->i_state);
+		spin_unlock(&nilfs->ns_inode_lock);
+	}
+	return ret;
+}
+
+int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+	atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
+
+	if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
+		return 0;
+
+	spin_lock(&nilfs->ns_inode_lock);
+	if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
+	    !test_bit(NILFS_I_BUSY, &ii->i_state)) {
+		/* Because this routine may race with nilfs_dispose_list(),
+		   we have to check NILFS_I_QUEUED here, too. */
+		if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
+			/* This will happen when somebody is freeing
+			   this inode. */
+			nilfs_warning(inode->i_sb, __func__,
+				      "cannot get inode (ino=%lu)\n",
+				      inode->i_ino);
+			spin_unlock(&nilfs->ns_inode_lock);
+			return -EINVAL; /* NILFS_I_DIRTY may remain for
+					   freeing inode */
+		}
+		list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
+		set_bit(NILFS_I_QUEUED, &ii->i_state);
+	}
+	spin_unlock(&nilfs->ns_inode_lock);
+	return 0;
+}
+
+int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
+{
+	struct buffer_head *ibh;
+	int err;
+
+	err = nilfs_load_inode_block(inode, &ibh);
+	if (unlikely(err)) {
+		nilfs_warning(inode->i_sb, __func__,
+			      "failed to reget inode block.\n");
+		return err;
+	}
+	nilfs_update_inode(inode, ibh, flags);
+	mark_buffer_dirty(ibh);
+	nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
+	brelse(ibh);
+	return 0;
+}
+
+/**
+ * nilfs_dirty_inode - reflect changes on given inode to an inode block.
+ * @inode: inode of the file to be registered.
+ *
+ * nilfs_dirty_inode() loads a inode block containing the specified
+ * @inode and copies data from a nilfs_inode to a corresponding inode
+ * entry in the inode block. This operation is excluded from the segment
+ * construction. This function can be called both as a single operation
+ * and as a part of indivisible file operations.
+ */
+void nilfs_dirty_inode(struct inode *inode, int flags)
+{
+	struct nilfs_transaction_info ti;
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+	if (is_bad_inode(inode)) {
+		nilfs_warning(inode->i_sb, __func__,
+			      "tried to mark bad_inode dirty. ignored.\n");
+		dump_stack();
+		return;
+	}
+	if (mdi) {
+		nilfs_mdt_mark_dirty(inode);
+		return;
+	}
+	nilfs_transaction_begin(inode->i_sb, &ti, 0);
+	__nilfs_mark_inode_dirty(inode, flags);
+	nilfs_transaction_commit(inode->i_sb); /* never fails */
+}
+
+int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 __u64 start, __u64 len)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	__u64 logical = 0, phys = 0, size = 0;
+	__u32 flags = 0;
+	loff_t isize;
+	sector_t blkoff, end_blkoff;
+	sector_t delalloc_blkoff;
+	unsigned long delalloc_blklen;
+	unsigned int blkbits = inode->i_blkbits;
+	int ret, n;
+
+	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	isize = i_size_read(inode);
+
+	blkoff = start >> blkbits;
+	end_blkoff = (start + len - 1) >> blkbits;
+
+	delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
+							&delalloc_blkoff);
+
+	do {
+		__u64 blkphy;
+		unsigned int maxblocks;
+
+		if (delalloc_blklen && blkoff == delalloc_blkoff) {
+			if (size) {
+				/* End of the current extent */
+				ret = fiemap_fill_next_extent(
+					fieinfo, logical, phys, size, flags);
+				if (ret)
+					break;
+			}
+			if (blkoff > end_blkoff)
+				break;
+
+			flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
+			logical = blkoff << blkbits;
+			phys = 0;
+			size = delalloc_blklen << blkbits;
+
+			blkoff = delalloc_blkoff + delalloc_blklen;
+			delalloc_blklen = nilfs_find_uncommitted_extent(
+				inode, blkoff, &delalloc_blkoff);
+			continue;
+		}
+
+		/*
+		 * Limit the number of blocks that we look up so as
+		 * not to get into the next delayed allocation extent.
+		 */
+		maxblocks = INT_MAX;
+		if (delalloc_blklen)
+			maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
+					  maxblocks);
+		blkphy = 0;
+
+		down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+		n = nilfs_bmap_lookup_contig(
+			NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
+		up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+
+		if (n < 0) {
+			int past_eof;
+
+			if (unlikely(n != -ENOENT))
+				break; /* error */
+
+			/* HOLE */
+			blkoff++;
+			past_eof = ((blkoff << blkbits) >= isize);
+
+			if (size) {
+				/* End of the current extent */
+
+				if (past_eof)
+					flags |= FIEMAP_EXTENT_LAST;
+
+				ret = fiemap_fill_next_extent(
+					fieinfo, logical, phys, size, flags);
+				if (ret)
+					break;
+				size = 0;
+			}
+			if (blkoff > end_blkoff || past_eof)
+				break;
+		} else {
+			if (size) {
+				if (phys && blkphy << blkbits == phys + size) {
+					/* The current extent goes on */
+					size += n << blkbits;
+				} else {
+					/* Terminate the current extent */
+					ret = fiemap_fill_next_extent(
+						fieinfo, logical, phys, size,
+						flags);
+					if (ret || blkoff > end_blkoff)
+						break;
+
+					/* Start another extent */
+					flags = FIEMAP_EXTENT_MERGED;
+					logical = blkoff << blkbits;
+					phys = blkphy << blkbits;
+					size = n << blkbits;
+				}
+			} else {
+				/* Start a new extent */
+				flags = FIEMAP_EXTENT_MERGED;
+				logical = blkoff << blkbits;
+				phys = blkphy << blkbits;
+				size = n << blkbits;
+			}
+			blkoff += n;
+		}
+		cond_resched();
+	} while (true);
+
+	/* If ret is 1 then we just hit the end of the extent array */
+	if (ret == 1)
+		ret = 0;
+
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
diff --git a/kernel/fs/nilfs2/ioctl.c b/kernel/fs/nilfs2/ioctl.c
new file mode 100644
index 000000000..9a20e513d
--- /dev/null
+++ b/kernel/fs/nilfs2/ioctl.c
@@ -0,0 +1,1379 @@
+/*
+ * ioctl.c - NILFS ioctl operations.
+ *
+ * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/capability.h>	/* capable() */
+#include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
+#include <linux/vmalloc.h>
+#include <linux/compat.h>	/* compat_ptr() */
+#include <linux/mount.h>	/* mnt_want_write_file(), mnt_drop_write_file() */
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "nilfs.h"
+#include "segment.h"
+#include "bmap.h"
+#include "cpfile.h"
+#include "sufile.h"
+#include "dat.h"
+
+/**
+ * nilfs_ioctl_wrap_copy - wrapping function of get/set metadata info
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @dir: set of direction flags
+ * @dofunc: concrete function of get/set metadata info
+ *
+ * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of
+ * calling dofunc() function on the basis of @argv argument.
+ *
+ * Return Value: On success, 0 is returned and requested metadata info
+ * is copied into userspace. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
+static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
+				 struct nilfs_argv *argv, int dir,
+				 ssize_t (*dofunc)(struct the_nilfs *,
+						   __u64 *, int,
+						   void *, size_t, size_t))
+{
+	void *buf;
+	void __user *base = (void __user *)(unsigned long)argv->v_base;
+	size_t maxmembs, total, n;
+	ssize_t nr;
+	int ret, i;
+	__u64 pos, ppos;
+
+	if (argv->v_nmembs == 0)
+		return 0;
+
+	if (argv->v_size > PAGE_SIZE)
+		return -EINVAL;
+
+	/*
+	 * Reject pairs of a start item position (argv->v_index) and a
+	 * total count (argv->v_nmembs) which leads position 'pos' to
+	 * overflow by the increment at the end of the loop.
+	 */
+	if (argv->v_index > ~(__u64)0 - argv->v_nmembs)
+		return -EINVAL;
+
+	buf = (void *)__get_free_pages(GFP_NOFS, 0);
+	if (unlikely(!buf))
+		return -ENOMEM;
+	maxmembs = PAGE_SIZE / argv->v_size;
+
+	ret = 0;
+	total = 0;
+	pos = argv->v_index;
+	for (i = 0; i < argv->v_nmembs; i += n) {
+		n = (argv->v_nmembs - i < maxmembs) ?
+			argv->v_nmembs - i : maxmembs;
+		if ((dir & _IOC_WRITE) &&
+		    copy_from_user(buf, base + argv->v_size * i,
+				   argv->v_size * n)) {
+			ret = -EFAULT;
+			break;
+		}
+		ppos = pos;
+		nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size,
+			       n);
+		if (nr < 0) {
+			ret = nr;
+			break;
+		}
+		if ((dir & _IOC_READ) &&
+		    copy_to_user(base + argv->v_size * i, buf,
+				 argv->v_size * nr)) {
+			ret = -EFAULT;
+			break;
+		}
+		total += nr;
+		if ((size_t)nr < n)
+			break;
+		if (pos == ppos)
+			pos += n;
+	}
+	argv->v_nmembs = total;
+
+	free_pages((unsigned long)buf, 0);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_getflags - ioctl to support lsattr
+ */
+static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
+{
+	unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
+
+	return put_user(flags, (int __user *)argp);
+}
+
+/**
+ * nilfs_ioctl_setflags - ioctl to support chattr
+ */
+static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
+				void __user *argp)
+{
+	struct nilfs_transaction_info ti;
+	unsigned int flags, oldflags;
+	int ret;
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (get_user(flags, (int __user *)argp))
+		return -EFAULT;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	flags = nilfs_mask_flags(inode->i_mode, flags);
+
+	mutex_lock(&inode->i_mutex);
+
+	oldflags = NILFS_I(inode)->i_flags;
+
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by the
+	 * relevant capability.
+	 */
+	ret = -EPERM;
+	if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		goto out;
+
+	ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
+	if (ret)
+		goto out;
+
+	NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) |
+		(flags & FS_FL_USER_MODIFIABLE);
+
+	nilfs_set_inode_flags(inode);
+	inode->i_ctime = CURRENT_TIME;
+	if (IS_SYNC(inode))
+		nilfs_set_transaction_flag(NILFS_TI_SYNC);
+
+	nilfs_mark_inode_dirty(inode);
+	ret = nilfs_transaction_commit(inode->i_sb);
+out:
+	mutex_unlock(&inode->i_mutex);
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_getversion - get info about a file's version (generation number)
+ */
+static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
+{
+	return put_user(inode->i_generation, (int __user *)argp);
+}
+
+/**
+ * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot)
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_change_cpmode() function changes mode of
+ * given checkpoint between checkpoint and snapshot state. This ioctl
+ * is used in chcp and mkcp utilities.
+ *
+ * Return Value: On success, 0 is returned and mode of a checkpoint is
+ * changed. On error, one of the following negative error codes
+ * is returned.
+ *
+ * %-EPERM - Operation not permitted.
+ *
+ * %-EFAULT - Failure during checkpoint mode changing.
+ */
+static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
+				     unsigned int cmd, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct nilfs_transaction_info ti;
+	struct nilfs_cpmode cpmode;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
+		goto out;
+
+	mutex_lock(&nilfs->ns_snapshot_mount_mutex);
+
+	nilfs_transaction_begin(inode->i_sb, &ti, 0);
+	ret = nilfs_cpfile_change_cpmode(
+		nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode);
+	if (unlikely(ret < 0))
+		nilfs_transaction_abort(inode->i_sb);
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+
+	mutex_unlock(&nilfs->ns_snapshot_mount_mutex);
+out:
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_delete_checkpoint - remove checkpoint
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_delete_checkpoint() function removes
+ * checkpoint from NILFS2 file system. This ioctl is used in rmcp
+ * utility.
+ *
+ * Return Value: On success, 0 is returned and a checkpoint is
+ * removed. On error, one of the following negative error codes
+ * is returned.
+ *
+ * %-EPERM - Operation not permitted.
+ *
+ * %-EFAULT - Failure during checkpoint removing.
+ */
+static int
+nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
+			      unsigned int cmd, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct nilfs_transaction_info ti;
+	__u64 cno;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	if (copy_from_user(&cno, argp, sizeof(cno)))
+		goto out;
+
+	nilfs_transaction_begin(inode->i_sb, &ti, 0);
+	ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno);
+	if (unlikely(ret < 0))
+		nilfs_transaction_abort(inode->i_sb);
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+out:
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints
+ * @nilfs: nilfs object
+ * @posp: pointer on array of checkpoint's numbers
+ * @flags: checkpoint mode (checkpoint or snapshot)
+ * @buf: buffer for storing checkponts' info
+ * @size: size in bytes of one checkpoint info item in array
+ * @nmembs: number of checkpoints in array (numbers and infos)
+ *
+ * Description: nilfs_ioctl_do_get_cpinfo() function returns info about
+ * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in
+ * lscp utility and by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_cpinfo structures in output buffer.
+ */
+static ssize_t
+nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			  void *buf, size_t size, size_t nmembs)
+{
+	int ret;
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
+				      size, nmembs);
+	up_read(&nilfs->ns_segctor_sem);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_get_cpstat - get checkpoints statistics
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints.
+ * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities
+ * and by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and checkpoints information is
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting checkpoints statistics.
+ */
+static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
+				  unsigned int cmd, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct nilfs_cpstat cpstat;
+	int ret;
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
+	up_read(&nilfs->ns_segctor_sem);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(argp, &cpstat, sizeof(cpstat)))
+		ret = -EFAULT;
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info
+ * @nilfs: nilfs object
+ * @posp: pointer on array of segment numbers
+ * @flags: *not used*
+ * @buf: buffer for storing suinfo array
+ * @size: size in bytes of one suinfo item in array
+ * @nmembs: count of segment numbers and suinfos in array
+ *
+ * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage
+ * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used
+ * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_suinfo structures in output buffer.
+ */
+static ssize_t
+nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			  void *buf, size_t size, size_t nmembs)
+{
+	int ret;
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size,
+				      nmembs);
+	up_read(&nilfs->ns_segctor_sem);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_get_sustat - get segment usage statistics
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_get_sustat() returns segment usage statistics.
+ * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities
+ * and by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and segment usage information is
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting segment usage statistics.
+ */
+static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
+				  unsigned int cmd, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct nilfs_sustat sustat;
+	int ret;
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
+	up_read(&nilfs->ns_segctor_sem);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(argp, &sustat, sizeof(sustat)))
+		ret = -EFAULT;
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info
+ * @nilfs: nilfs object
+ * @posp: *not used*
+ * @flags: *not used*
+ * @buf: buffer for storing array of nilfs_vinfo structures
+ * @size: size in bytes of one vinfo item in array
+ * @nmembs: count of vinfos in array
+ *
+ * Description: nilfs_ioctl_do_get_vinfo() function returns information
+ * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used
+ * by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_vinfo structures in output buffer.
+ */
+static ssize_t
+nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			 void *buf, size_t size, size_t nmembs)
+{
+	int ret;
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
+	up_read(&nilfs->ns_segctor_sem);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors
+ * @nilfs: nilfs object
+ * @posp: *not used*
+ * @flags: *not used*
+ * @buf: buffer for storing array of nilfs_bdesc structures
+ * @size: size in bytes of one bdesc item in array
+ * @nmembs: count of bdescs in array
+ *
+ * Description: nilfs_ioctl_do_get_bdescs() function returns information
+ * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
+ * is used by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_bdescs structures in output buffer.
+ */
+static ssize_t
+nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			  void *buf, size_t size, size_t nmembs)
+{
+	struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
+	struct nilfs_bdesc *bdescs = buf;
+	int ret, i;
+
+	down_read(&nilfs->ns_segctor_sem);
+	for (i = 0; i < nmembs; i++) {
+		ret = nilfs_bmap_lookup_at_level(bmap,
+						 bdescs[i].bd_offset,
+						 bdescs[i].bd_level + 1,
+						 &bdescs[i].bd_blocknr);
+		if (ret < 0) {
+			if (ret != -ENOENT) {
+				up_read(&nilfs->ns_segctor_sem);
+				return ret;
+			}
+			bdescs[i].bd_blocknr = 0;
+		}
+	}
+	up_read(&nilfs->ns_segctor_sem);
+	return nmembs;
+}
+
+/**
+ * nilfs_ioctl_get_bdescs - get disk block descriptors
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_do_get_bdescs() function returns information
+ * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
+ * is used by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and disk block descriptors are
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting disk block descriptors.
+ */
+static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
+				  unsigned int cmd, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct nilfs_argv argv;
+	int ret;
+
+	if (copy_from_user(&argv, argp, sizeof(argv)))
+		return -EFAULT;
+
+	if (argv.v_size != sizeof(struct nilfs_bdesc))
+		return -EINVAL;
+
+	ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
+				    nilfs_ioctl_do_get_bdescs);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(argp, &argv, sizeof(argv)))
+		ret = -EFAULT;
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC
+ * @inode: inode object
+ * @vdesc: descriptor of virtual block number
+ * @buffers: list of moving buffers
+ *
+ * Description: nilfs_ioctl_move_inode_block() function registers data/node
+ * buffer in the GC pagecache and submit read request.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - Requested block doesn't exist.
+ *
+ * %-EEXIST - Blocks conflict is detected.
+ */
+static int nilfs_ioctl_move_inode_block(struct inode *inode,
+					struct nilfs_vdesc *vdesc,
+					struct list_head *buffers)
+{
+	struct buffer_head *bh;
+	int ret;
+
+	if (vdesc->vd_flags == 0)
+		ret = nilfs_gccache_submit_read_data(
+			inode, vdesc->vd_offset, vdesc->vd_blocknr,
+			vdesc->vd_vblocknr, &bh);
+	else
+		ret = nilfs_gccache_submit_read_node(
+			inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh);
+
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT)
+			printk(KERN_CRIT
+			       "%s: invalid virtual block address (%s): "
+			       "ino=%llu, cno=%llu, offset=%llu, "
+			       "blocknr=%llu, vblocknr=%llu\n",
+			       __func__, vdesc->vd_flags ? "node" : "data",
+			       (unsigned long long)vdesc->vd_ino,
+			       (unsigned long long)vdesc->vd_cno,
+			       (unsigned long long)vdesc->vd_offset,
+			       (unsigned long long)vdesc->vd_blocknr,
+			       (unsigned long long)vdesc->vd_vblocknr);
+		return ret;
+	}
+	if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
+		printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, "
+		       "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n",
+		       __func__, vdesc->vd_flags ? "node" : "data",
+		       (unsigned long long)vdesc->vd_ino,
+		       (unsigned long long)vdesc->vd_cno,
+		       (unsigned long long)vdesc->vd_offset,
+		       (unsigned long long)vdesc->vd_blocknr,
+		       (unsigned long long)vdesc->vd_vblocknr);
+		brelse(bh);
+		return -EEXIST;
+	}
+	list_add_tail(&bh->b_assoc_buffers, buffers);
+	return 0;
+}
+
+/**
+ * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection
+ * @sb: superblock object
+ * @argv: vector of arguments from userspace
+ * @buf: array of nilfs_vdesc structures
+ *
+ * Description: nilfs_ioctl_move_blocks() function reads valid data/node
+ * blocks that garbage collector specified with the array of nilfs_vdesc
+ * structures and stores them into page caches of GC inodes.
+ *
+ * Return Value: Number of processed nilfs_vdesc structures or
+ * error code, otherwise.
+ */
+static int nilfs_ioctl_move_blocks(struct super_block *sb,
+				   struct nilfs_argv *argv, void *buf)
+{
+	size_t nmembs = argv->v_nmembs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct inode *inode;
+	struct nilfs_vdesc *vdesc;
+	struct buffer_head *bh, *n;
+	LIST_HEAD(buffers);
+	ino_t ino;
+	__u64 cno;
+	int i, ret;
+
+	for (i = 0, vdesc = buf; i < nmembs; ) {
+		ino = vdesc->vd_ino;
+		cno = vdesc->vd_cno;
+		inode = nilfs_iget_for_gc(sb, ino, cno);
+		if (IS_ERR(inode)) {
+			ret = PTR_ERR(inode);
+			goto failed;
+		}
+		if (list_empty(&NILFS_I(inode)->i_dirty)) {
+			/*
+			 * Add the inode to GC inode list. Garbage Collection
+			 * is serialized and no two processes manipulate the
+			 * list simultaneously.
+			 */
+			igrab(inode);
+			list_add(&NILFS_I(inode)->i_dirty,
+				 &nilfs->ns_gc_inodes);
+		}
+
+		do {
+			ret = nilfs_ioctl_move_inode_block(inode, vdesc,
+							   &buffers);
+			if (unlikely(ret < 0)) {
+				iput(inode);
+				goto failed;
+			}
+			vdesc++;
+		} while (++i < nmembs &&
+			 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
+
+		iput(inode); /* The inode still remains in GC inode list */
+	}
+
+	list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
+		ret = nilfs_gccache_wait_and_mark_dirty(bh);
+		if (unlikely(ret < 0)) {
+			WARN_ON(ret == -EEXIST);
+			goto failed;
+		}
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh);
+	}
+	return nmembs;
+
+ failed:
+	list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh);
+	}
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_delete_checkpoints - delete checkpoints
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of periods of checkpoints numbers
+ *
+ * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints
+ * in the period from p_start to p_end, excluding p_end itself. The checkpoints
+ * which have been already deleted are ignored.
+ *
+ * Return Value: Number of processed nilfs_period structures or
+ * error code, otherwise.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - invalid checkpoints.
+ */
+static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
+					  struct nilfs_argv *argv, void *buf)
+{
+	size_t nmembs = argv->v_nmembs;
+	struct inode *cpfile = nilfs->ns_cpfile;
+	struct nilfs_period *periods = buf;
+	int ret, i;
+
+	for (i = 0; i < nmembs; i++) {
+		ret = nilfs_cpfile_delete_checkpoints(
+			cpfile, periods[i].p_start, periods[i].p_end);
+		if (ret < 0)
+			return ret;
+	}
+	return nmembs;
+}
+
+/**
+ * nilfs_ioctl_free_vblocknrs - free virtual block numbers
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of virtual block numbers
+ *
+ * Description: nilfs_ioctl_free_vblocknrs() function frees
+ * the virtual block numbers specified by @buf and @argv->v_nmembs.
+ *
+ * Return Value: Number of processed virtual block numbers or
+ * error code, otherwise.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The virtual block number have not been allocated.
+ */
+static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
+				      struct nilfs_argv *argv, void *buf)
+{
+	size_t nmembs = argv->v_nmembs;
+	int ret;
+
+	ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
+
+	return (ret < 0) ? ret : nmembs;
+}
+
+/**
+ * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of block descriptors
+ *
+ * Description: nilfs_ioctl_mark_blocks_dirty() function marks
+ * metadata file or data blocks as dirty.
+ *
+ * Return Value: Number of processed block descriptors or
+ * error code, otherwise.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ */
+static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
+					 struct nilfs_argv *argv, void *buf)
+{
+	size_t nmembs = argv->v_nmembs;
+	struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
+	struct nilfs_bdesc *bdescs = buf;
+	int ret, i;
+
+	for (i = 0; i < nmembs; i++) {
+		/* XXX: use macro or inline func to check liveness */
+		ret = nilfs_bmap_lookup_at_level(bmap,
+						 bdescs[i].bd_offset,
+						 bdescs[i].bd_level + 1,
+						 &bdescs[i].bd_blocknr);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				return ret;
+			bdescs[i].bd_blocknr = 0;
+		}
+		if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr)
+			/* skip dead block */
+			continue;
+		if (bdescs[i].bd_level == 0) {
+			ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
+							 bdescs[i].bd_offset);
+			if (ret < 0) {
+				WARN_ON(ret == -ENOENT);
+				return ret;
+			}
+		} else {
+			ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
+					      bdescs[i].bd_level);
+			if (ret < 0) {
+				WARN_ON(ret == -ENOENT);
+				return ret;
+			}
+		}
+	}
+	return nmembs;
+}
+
+int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
+				       struct nilfs_argv *argv, void **kbufs)
+{
+	const char *msg;
+	int ret;
+
+	ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
+	if (ret < 0) {
+		/*
+		 * can safely abort because checkpoints can be removed
+		 * independently.
+		 */
+		msg = "cannot delete checkpoints";
+		goto failed;
+	}
+	ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], kbufs[2]);
+	if (ret < 0) {
+		/*
+		 * can safely abort because DAT file is updated atomically
+		 * using a copy-on-write technique.
+		 */
+		msg = "cannot delete virtual blocks from DAT file";
+		goto failed;
+	}
+	ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], kbufs[3]);
+	if (ret < 0) {
+		/*
+		 * can safely abort because the operation is nondestructive.
+		 */
+		msg = "cannot mark copying blocks dirty";
+		goto failed;
+	}
+	return 0;
+
+ failed:
+	printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
+	       msg, ret);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_clean_segments - clean segments
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_clean_segments() function makes garbage
+ * collection operation in the environment of requested parameters
+ * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by
+ * nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
+static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
+				      unsigned int cmd, void __user *argp)
+{
+	struct nilfs_argv argv[5];
+	static const size_t argsz[5] = {
+		sizeof(struct nilfs_vdesc),
+		sizeof(struct nilfs_period),
+		sizeof(__u64),
+		sizeof(struct nilfs_bdesc),
+		sizeof(__u64),
+	};
+	void __user *base;
+	void *kbufs[5];
+	struct the_nilfs *nilfs;
+	size_t len, nsegs;
+	int n, ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	if (copy_from_user(argv, argp, sizeof(argv)))
+		goto out;
+
+	ret = -EINVAL;
+	nsegs = argv[4].v_nmembs;
+	if (argv[4].v_size != argsz[4])
+		goto out;
+	if (nsegs > UINT_MAX / sizeof(__u64))
+		goto out;
+
+	/*
+	 * argv[4] points to segment numbers this ioctl cleans.  We
+	 * use kmalloc() for its buffer because memory used for the
+	 * segment numbers is enough small.
+	 */
+	kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
+			       nsegs * sizeof(__u64));
+	if (IS_ERR(kbufs[4])) {
+		ret = PTR_ERR(kbufs[4]);
+		goto out;
+	}
+	nilfs = inode->i_sb->s_fs_info;
+
+	for (n = 0; n < 4; n++) {
+		ret = -EINVAL;
+		if (argv[n].v_size != argsz[n])
+			goto out_free;
+
+		if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment)
+			goto out_free;
+
+		if (argv[n].v_nmembs >= UINT_MAX / argv[n].v_size)
+			goto out_free;
+
+		len = argv[n].v_size * argv[n].v_nmembs;
+		base = (void __user *)(unsigned long)argv[n].v_base;
+		if (len == 0) {
+			kbufs[n] = NULL;
+			continue;
+		}
+
+		kbufs[n] = vmalloc(len);
+		if (!kbufs[n]) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+		if (copy_from_user(kbufs[n], base, len)) {
+			ret = -EFAULT;
+			vfree(kbufs[n]);
+			goto out_free;
+		}
+	}
+
+	/*
+	 * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
+	 * which will operates an inode list without blocking.
+	 * To protect the list from concurrent operations,
+	 * nilfs_ioctl_move_blocks should be atomic operation.
+	 */
+	if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
+		ret = -EBUSY;
+		goto out_free;
+	}
+
+	ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
+	if (ret < 0)
+		printk(KERN_ERR "NILFS: GC failed during preparation: "
+			"cannot read source blocks: err=%d\n", ret);
+	else {
+		if (nilfs_sb_need_update(nilfs))
+			set_nilfs_discontinued(nilfs);
+		ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+	}
+
+	nilfs_remove_all_gcinodes(nilfs);
+	clear_nilfs_gc_running(nilfs);
+
+out_free:
+	while (--n >= 0)
+		vfree(kbufs[n]);
+	kfree(kbufs[4]);
+out:
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_sync - make a checkpoint
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_sync() function constructs a logical segment
+ * for checkpointing.  This function guarantees that all modified data
+ * and metadata are written out to the device when it successfully
+ * returned.
+ *
+ * Return Value: On success, 0 is retured. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-EROFS - Read only filesystem.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
+static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
+			    unsigned int cmd, void __user *argp)
+{
+	__u64 cno;
+	int ret;
+	struct the_nilfs *nilfs;
+
+	ret = nilfs_construct_segment(inode->i_sb);
+	if (ret < 0)
+		return ret;
+
+	nilfs = inode->i_sb->s_fs_info;
+	ret = nilfs_flush_device(nilfs);
+	if (ret < 0)
+		return ret;
+
+	if (argp != NULL) {
+		down_read(&nilfs->ns_segctor_sem);
+		cno = nilfs->ns_cno - 1;
+		up_read(&nilfs->ns_segctor_sem);
+		if (copy_to_user(argp, &cno, sizeof(cno)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+/**
+ * nilfs_ioctl_resize - resize NILFS2 volume
+ * @inode: inode object
+ * @filp: file object
+ * @argp: pointer on argument from userspace
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
+static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
+			      void __user *argp)
+{
+	__u64 newsize;
+	int ret = -EPERM;
+
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(&newsize, argp, sizeof(newsize)))
+		goto out_drop_write;
+
+	ret = nilfs_resize_fs(inode->i_sb, newsize);
+
+out_drop_write:
+	mnt_drop_write_file(filp);
+out:
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_trim_fs() - trim ioctl handle function
+ * @inode: inode object
+ * @argp: pointer on argument from userspace
+ *
+ * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
+ * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
+ * performs the actual trim operation.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
+	struct fstrim_range range;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&range, argp, sizeof(range)))
+		return -EFAULT;
+
+	range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
+	up_read(&nilfs->ns_segctor_sem);
+
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(argp, &range, sizeof(range)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
+ * @inode: inode object
+ * @argp: pointer on argument from userspace
+ *
+ * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit
+ * of segments in bytes and upper limit of segments in bytes.
+ * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility.
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
+static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	__u64 range[2];
+	__u64 minseg, maxseg;
+	unsigned long segbytes;
+	int ret = -EPERM;
+
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(range, argp, sizeof(__u64[2])))
+		goto out;
+
+	ret = -ERANGE;
+	if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+		goto out;
+
+	segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
+
+	minseg = range[0] + segbytes - 1;
+	do_div(minseg, segbytes);
+	maxseg = NILFS_SB2_OFFSET_BYTES(range[1]);
+	do_div(maxseg, segbytes);
+	maxseg--;
+
+	ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg);
+out:
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_get_info - wrapping function of get metadata info
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ * @membsz: size of an item in bytes
+ * @dofunc: concrete function of getting metadata info
+ *
+ * Description: nilfs_ioctl_get_info() gets metadata info by means of
+ * calling dofunc() function.
+ *
+ * Return Value: On success, 0 is returned and requested metadata info
+ * is copied into userspace. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
+static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
+				unsigned int cmd, void __user *argp,
+				size_t membsz,
+				ssize_t (*dofunc)(struct the_nilfs *,
+						  __u64 *, int,
+						  void *, size_t, size_t))
+
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct nilfs_argv argv;
+	int ret;
+
+	if (copy_from_user(&argv, argp, sizeof(argv)))
+		return -EFAULT;
+
+	if (argv.v_size < membsz)
+		return -EINVAL;
+
+	ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(argp, &argv, sizeof(argv)))
+		ret = -EFAULT;
+	return ret;
+}
+
+/**
+ * nilfs_ioctl_set_suinfo - set segment usage info
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: Expects an array of nilfs_suinfo_update structures
+ * encapsulated in nilfs_argv and updates the segment usage info
+ * according to the flags in nilfs_suinfo_update.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EPERM - Not enough permissions
+ *
+ * %-EFAULT - Error copying input data
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
+ */
+static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp,
+				unsigned int cmd, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	struct nilfs_transaction_info ti;
+	struct nilfs_argv argv;
+	size_t len;
+	void __user *base;
+	void *kbuf;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	if (copy_from_user(&argv, argp, sizeof(argv)))
+		goto out;
+
+	ret = -EINVAL;
+	if (argv.v_size < sizeof(struct nilfs_suinfo_update))
+		goto out;
+
+	if (argv.v_nmembs > nilfs->ns_nsegments)
+		goto out;
+
+	if (argv.v_nmembs >= UINT_MAX / argv.v_size)
+		goto out;
+
+	len = argv.v_size * argv.v_nmembs;
+	if (!len) {
+		ret = 0;
+		goto out;
+	}
+
+	base = (void __user *)(unsigned long)argv.v_base;
+	kbuf = vmalloc(len);
+	if (!kbuf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (copy_from_user(kbuf, base, len)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	nilfs_transaction_begin(inode->i_sb, &ti, 0);
+	ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size,
+			argv.v_nmembs);
+	if (unlikely(ret < 0))
+		nilfs_transaction_abort(inode->i_sb);
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+
+out_free:
+	vfree(kbuf);
+out:
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
+long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		return nilfs_ioctl_getflags(inode, argp);
+	case FS_IOC_SETFLAGS:
+		return nilfs_ioctl_setflags(inode, filp, argp);
+	case FS_IOC_GETVERSION:
+		return nilfs_ioctl_getversion(inode, argp);
+	case NILFS_IOCTL_CHANGE_CPMODE:
+		return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
+	case NILFS_IOCTL_DELETE_CHECKPOINT:
+		return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_CPINFO:
+		return nilfs_ioctl_get_info(inode, filp, cmd, argp,
+					    sizeof(struct nilfs_cpinfo),
+					    nilfs_ioctl_do_get_cpinfo);
+	case NILFS_IOCTL_GET_CPSTAT:
+		return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_SUINFO:
+		return nilfs_ioctl_get_info(inode, filp, cmd, argp,
+					    sizeof(struct nilfs_suinfo),
+					    nilfs_ioctl_do_get_suinfo);
+	case NILFS_IOCTL_SET_SUINFO:
+		return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_SUSTAT:
+		return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_VINFO:
+		return nilfs_ioctl_get_info(inode, filp, cmd, argp,
+					    sizeof(struct nilfs_vinfo),
+					    nilfs_ioctl_do_get_vinfo);
+	case NILFS_IOCTL_GET_BDESCS:
+		return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp);
+	case NILFS_IOCTL_CLEAN_SEGMENTS:
+		return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
+	case NILFS_IOCTL_SYNC:
+		return nilfs_ioctl_sync(inode, filp, cmd, argp);
+	case NILFS_IOCTL_RESIZE:
+		return nilfs_ioctl_resize(inode, filp, argp);
+	case NILFS_IOCTL_SET_ALLOC_RANGE:
+		return nilfs_ioctl_set_alloc_range(inode, argp);
+	case FITRIM:
+		return nilfs_ioctl_trim_fs(inode, argp);
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	case FS_IOC32_GETVERSION:
+		cmd = FS_IOC_GETVERSION;
+		break;
+	case NILFS_IOCTL_CHANGE_CPMODE:
+	case NILFS_IOCTL_DELETE_CHECKPOINT:
+	case NILFS_IOCTL_GET_CPINFO:
+	case NILFS_IOCTL_GET_CPSTAT:
+	case NILFS_IOCTL_GET_SUINFO:
+	case NILFS_IOCTL_SET_SUINFO:
+	case NILFS_IOCTL_GET_SUSTAT:
+	case NILFS_IOCTL_GET_VINFO:
+	case NILFS_IOCTL_GET_BDESCS:
+	case NILFS_IOCTL_CLEAN_SEGMENTS:
+	case NILFS_IOCTL_SYNC:
+	case NILFS_IOCTL_RESIZE:
+	case NILFS_IOCTL_SET_ALLOC_RANGE:
+	case FITRIM:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
diff --git a/kernel/fs/nilfs2/mdt.c b/kernel/fs/nilfs2/mdt.c
new file mode 100644
index 000000000..dee34d990
--- /dev/null
+++ b/kernel/fs/nilfs2/mdt.c
@@ -0,0 +1,652 @@
+/*
+ * mdt.c - meta data file for NILFS
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/mm.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include "nilfs.h"
+#include "btnode.h"
+#include "segment.h"
+#include "page.h"
+#include "mdt.h"
+
+
+#define NILFS_MDT_MAX_RA_BLOCKS		(16 - 1)
+
+
+static int
+nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
+			   struct buffer_head *bh,
+			   void (*init_block)(struct inode *,
+					      struct buffer_head *, void *))
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	void *kaddr;
+	int ret;
+
+	/* Caller exclude read accesses using page lock */
+
+	/* set_buffer_new(bh); */
+	bh->b_blocknr = 0;
+
+	ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh);
+	if (unlikely(ret))
+		return ret;
+
+	set_buffer_mapped(bh);
+
+	kaddr = kmap_atomic(bh->b_page);
+	memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
+	if (init_block)
+		init_block(inode, bh, kaddr);
+	flush_dcache_page(bh->b_page);
+	kunmap_atomic(kaddr);
+
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	nilfs_mdt_mark_dirty(inode);
+	return 0;
+}
+
+static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
+				  struct buffer_head **out_bh,
+				  void (*init_block)(struct inode *,
+						     struct buffer_head *,
+						     void *))
+{
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_transaction_info ti;
+	struct buffer_head *bh;
+	int err;
+
+	nilfs_transaction_begin(sb, &ti, 0);
+
+	err = -ENOMEM;
+	bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
+	if (unlikely(!bh))
+		goto failed_unlock;
+
+	err = -EEXIST;
+	if (buffer_uptodate(bh))
+		goto failed_bh;
+
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		goto failed_bh;
+
+	bh->b_bdev = sb->s_bdev;
+	err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
+	if (likely(!err)) {
+		get_bh(bh);
+		*out_bh = bh;
+	}
+
+ failed_bh:
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	brelse(bh);
+
+ failed_unlock:
+	if (likely(!err))
+		err = nilfs_transaction_commit(sb);
+	else
+		nilfs_transaction_abort(sb);
+
+	return err;
+}
+
+static int
+nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
+		       int mode, struct buffer_head **out_bh)
+{
+	struct buffer_head *bh;
+	__u64 blknum = 0;
+	int ret = -ENOMEM;
+
+	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
+	if (unlikely(!bh))
+		goto failed;
+
+	ret = -EEXIST; /* internal code */
+	if (buffer_uptodate(bh))
+		goto out;
+
+	if (mode == READA) {
+		if (!trylock_buffer(bh)) {
+			ret = -EBUSY;
+			goto failed_bh;
+		}
+	} else /* mode == READ */
+		lock_buffer(bh);
+
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		goto out;
+	}
+
+	ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
+	if (unlikely(ret)) {
+		unlock_buffer(bh);
+		goto failed_bh;
+	}
+	map_bh(bh, inode->i_sb, (sector_t)blknum);
+
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);
+	submit_bh(mode, bh);
+	ret = 0;
+ out:
+	get_bh(bh);
+	*out_bh = bh;
+
+ failed_bh:
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	brelse(bh);
+ failed:
+	return ret;
+}
+
+static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
+				int readahead, struct buffer_head **out_bh)
+{
+	struct buffer_head *first_bh, *bh;
+	unsigned long blkoff;
+	int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
+	int err;
+
+	err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
+	if (err == -EEXIST) /* internal code */
+		goto out;
+
+	if (unlikely(err))
+		goto failed;
+
+	if (readahead) {
+		blkoff = block + 1;
+		for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+			err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+			if (likely(!err || err == -EEXIST))
+				brelse(bh);
+			else if (err != -EBUSY)
+				break;
+				/* abort readahead if bmap lookup failed */
+			if (!buffer_locked(first_bh))
+				goto out_no_wait;
+		}
+	}
+
+	wait_on_buffer(first_bh);
+
+ out_no_wait:
+	err = -EIO;
+	if (!buffer_uptodate(first_bh))
+		goto failed_bh;
+ out:
+	*out_bh = first_bh;
+	return 0;
+
+ failed_bh:
+	brelse(first_bh);
+ failed:
+	return err;
+}
+
+/**
+ * nilfs_mdt_get_block - read or create a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @blkoff: block offset
+ * @create: create flag
+ * @init_block: initializer used for newly allocated block
+ * @out_bh: output of a pointer to the buffer_head
+ *
+ * nilfs_mdt_get_block() looks up the specified buffer and tries to create
+ * a new buffer if @create is not zero.  On success, the returned buffer is
+ * assured to be either existing or formatted using a buffer lock on success.
+ * @out_bh is substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ *
+ * %-EROFS - Read only filesystem (for create mode)
+ */
+int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
+			void (*init_block)(struct inode *,
+					   struct buffer_head *, void *),
+			struct buffer_head **out_bh)
+{
+	int ret;
+
+	/* Should be rewritten with merging nilfs_mdt_read_block() */
+ retry:
+	ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
+	if (!create || ret != -ENOENT)
+		return ret;
+
+	ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
+	if (unlikely(ret == -EEXIST)) {
+		/* create = 0; */  /* limit read-create loop retries */
+		goto retry;
+	}
+	return ret;
+}
+
+/**
+ * nilfs_mdt_find_block - find and get a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @start: start block offset (inclusive)
+ * @end: end block offset (inclusive)
+ * @blkoff: block offset
+ * @out_bh: place to store a pointer to buffer_head struct
+ *
+ * nilfs_mdt_find_block() looks up an existing block in range of
+ * [@start, @end] and stores pointer to a buffer head of the block to
+ * @out_bh, and block offset to @blkoff, respectively.  @out_bh and
+ * @blkoff are substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block was found in the range
+ */
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+			 unsigned long end, unsigned long *blkoff,
+			 struct buffer_head **out_bh)
+{
+	__u64 next;
+	int ret;
+
+	if (unlikely(start > end))
+		return -ENOENT;
+
+	ret = nilfs_mdt_read_block(inode, start, true, out_bh);
+	if (!ret) {
+		*blkoff = start;
+		goto out;
+	}
+	if (unlikely(ret != -ENOENT || start == ULONG_MAX))
+		goto out;
+
+	ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next);
+	if (!ret) {
+		if (next <= end) {
+			ret = nilfs_mdt_read_block(inode, next, true, out_bh);
+			if (!ret)
+				*blkoff = next;
+		} else {
+			ret = -ENOENT;
+		}
+	}
+out:
+	return ret;
+}
+
+/**
+ * nilfs_mdt_delete_block - make a hole on the meta data file.
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * Return Value: On success, zero is returned.
+ * On error, one of the following negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ */
+int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int err;
+
+	err = nilfs_bmap_delete(ii->i_bmap, block);
+	if (!err || err == -ENOENT) {
+		nilfs_mdt_mark_dirty(inode);
+		nilfs_mdt_forget_block(inode, block);
+	}
+	return err;
+}
+
+/**
+ * nilfs_mdt_forget_block - discard dirty state and try to remove the page
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and
+ * tries to release the page including the buffer from a page cache.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error code is returned.
+ *
+ * %-EBUSY - page has an active buffer.
+ *
+ * %-ENOENT - page cache has no page addressed by the offset.
+ */
+int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
+{
+	pgoff_t index = (pgoff_t)block >>
+		(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	struct page *page;
+	unsigned long first_block;
+	int ret = 0;
+	int still_dirty;
+
+	page = find_lock_page(inode->i_mapping, index);
+	if (!page)
+		return -ENOENT;
+
+	wait_on_page_writeback(page);
+
+	first_block = (unsigned long)index <<
+		(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	if (page_has_buffers(page)) {
+		struct buffer_head *bh;
+
+		bh = nilfs_page_get_nth_block(page, block - first_block);
+		nilfs_forget_buffer(bh);
+	}
+	still_dirty = PageDirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (still_dirty ||
+	    invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
+		ret = -EBUSY;
+	return ret;
+}
+
+/**
+ * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ */
+int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
+{
+	struct buffer_head *bh;
+	int err;
+
+	err = nilfs_mdt_read_block(inode, block, 0, &bh);
+	if (unlikely(err))
+		return err;
+	mark_buffer_dirty(bh);
+	nilfs_mdt_mark_dirty(inode);
+	brelse(bh);
+	return 0;
+}
+
+int nilfs_mdt_fetch_dirty(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) {
+		set_bit(NILFS_I_DIRTY, &ii->i_state);
+		return 1;
+	}
+	return test_bit(NILFS_I_DIRTY, &ii->i_state);
+}
+
+static int
+nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb;
+	int err = 0;
+
+	if (inode && (inode->i_sb->s_flags & MS_RDONLY)) {
+		/*
+		 * It means that filesystem was remounted in read-only
+		 * mode because of error or metadata corruption. But we
+		 * have dirty pages that try to be flushed in background.
+		 * So, here we simply discard this dirty page.
+		 */
+		nilfs_clear_dirty_page(page, false);
+		unlock_page(page);
+		return -EROFS;
+	}
+
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+
+	if (!inode)
+		return 0;
+
+	sb = inode->i_sb;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		err = nilfs_construct_segment(sb);
+	else if (wbc->for_reclaim)
+		nilfs_flush_segment(sb, inode->i_ino);
+
+	return err;
+}
+
+
+static const struct address_space_operations def_mdt_aops = {
+	.writepage		= nilfs_mdt_write_page,
+};
+
+static const struct inode_operations def_mdt_iops;
+static const struct file_operations def_mdt_fops;
+
+
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
+{
+	struct nilfs_mdt_info *mi;
+
+	mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
+	if (!mi)
+		return -ENOMEM;
+
+	init_rwsem(&mi->mi_sem);
+	inode->i_private = mi;
+
+	inode->i_mode = S_IFREG;
+	mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
+
+	inode->i_op = &def_mdt_iops;
+	inode->i_fop = &def_mdt_fops;
+	inode->i_mapping->a_ops = &def_mdt_aops;
+
+	return 0;
+}
+
+void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
+			      unsigned header_size)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+
+	mi->mi_entry_size = entry_size;
+	mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
+	mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
+}
+
+/**
+ * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
+ * @inode: inode of the metadata file
+ * @shadow: shadow mapping
+ */
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+			       struct nilfs_shadow_map *shadow)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+
+	INIT_LIST_HEAD(&shadow->frozen_buffers);
+	address_space_init_once(&shadow->frozen_data);
+	nilfs_mapping_init(&shadow->frozen_data, inode);
+	address_space_init_once(&shadow->frozen_btnodes);
+	nilfs_mapping_init(&shadow->frozen_btnodes, inode);
+	mi->mi_shadow = shadow;
+	return 0;
+}
+
+/**
+ * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
+ * @inode: inode of the metadata file
+ */
+int nilfs_mdt_save_to_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+	int ret;
+
+	ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
+	if (ret)
+		goto out;
+
+	ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
+				     &ii->i_btnode_cache);
+	if (ret)
+		goto out;
+
+	nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
+ out:
+	return ret;
+}
+
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
+{
+	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+	struct buffer_head *bh_frozen;
+	struct page *page;
+	int blkbits = inode->i_blkbits;
+
+	page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+	if (!page)
+		return -ENOMEM;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << blkbits, 0);
+
+	bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
+
+	if (!buffer_uptodate(bh_frozen))
+		nilfs_copy_buffer(bh_frozen, bh);
+	if (list_empty(&bh_frozen->b_assoc_buffers)) {
+		list_add_tail(&bh_frozen->b_assoc_buffers,
+			      &shadow->frozen_buffers);
+		set_buffer_nilfs_redirected(bh);
+	} else {
+		brelse(bh_frozen); /* already frozen */
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+	return 0;
+}
+
+struct buffer_head *
+nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
+{
+	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+	struct buffer_head *bh_frozen = NULL;
+	struct page *page;
+	int n;
+
+	page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+	if (page) {
+		if (page_has_buffers(page)) {
+			n = bh_offset(bh) >> inode->i_blkbits;
+			bh_frozen = nilfs_page_get_nth_block(page, n);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return bh_frozen;
+}
+
+static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
+{
+	struct list_head *head = &shadow->frozen_buffers;
+	struct buffer_head *bh;
+
+	while (!list_empty(head)) {
+		bh = list_first_entry(head, struct buffer_head,
+				      b_assoc_buffers);
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh); /* drop ref-count to make it releasable */
+	}
+}
+
+/**
+ * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+
+	down_write(&mi->mi_sem);
+
+	if (mi->mi_palloc_cache)
+		nilfs_palloc_clear_cache(inode);
+
+	nilfs_clear_dirty_pages(inode->i_mapping, true);
+	nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+
+	nilfs_clear_dirty_pages(&ii->i_btnode_cache, true);
+	nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+
+	nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
+
+	up_write(&mi->mi_sem);
+}
+
+/**
+ * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_clear_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+
+	down_write(&mi->mi_sem);
+	nilfs_release_frozen_buffers(shadow);
+	truncate_inode_pages(&shadow->frozen_data, 0);
+	truncate_inode_pages(&shadow->frozen_btnodes, 0);
+	up_write(&mi->mi_sem);
+}
diff --git a/kernel/fs/nilfs2/mdt.h b/kernel/fs/nilfs2/mdt.h
new file mode 100644
index 000000000..fe529a87a
--- /dev/null
+++ b/kernel/fs/nilfs2/mdt.h
@@ -0,0 +1,123 @@
+/*
+ * mdt.h - NILFS meta data file prototype and definitions
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#ifndef _NILFS_MDT_H
+#define _NILFS_MDT_H
+
+#include <linux/buffer_head.h>
+#include <linux/blockgroup_lock.h>
+#include "nilfs.h"
+#include "page.h"
+
+/**
+ * struct nilfs_shadow_map - shadow mapping of meta data file
+ * @bmap_store: shadow copy of bmap state
+ * @frozen_data: shadowed dirty data pages
+ * @frozen_btnodes: shadowed dirty b-tree nodes' pages
+ * @frozen_buffers: list of frozen buffers
+ */
+struct nilfs_shadow_map {
+	struct nilfs_bmap_store bmap_store;
+	struct address_space frozen_data;
+	struct address_space frozen_btnodes;
+	struct list_head frozen_buffers;
+};
+
+/**
+ * struct nilfs_mdt_info - on-memory private data of meta data files
+ * @mi_sem: reader/writer semaphore for meta data operations
+ * @mi_bgl: per-blockgroup locking
+ * @mi_entry_size: size of an entry
+ * @mi_first_entry_offset: offset to the first entry
+ * @mi_entries_per_block: number of entries in a block
+ * @mi_palloc_cache: persistent object allocator cache
+ * @mi_shadow: shadow of bmap and page caches
+ * @mi_blocks_per_group: number of blocks in a group
+ * @mi_blocks_per_desc_block: number of blocks per descriptor block
+ */
+struct nilfs_mdt_info {
+	struct rw_semaphore	mi_sem;
+	struct blockgroup_lock *mi_bgl;
+	unsigned		mi_entry_size;
+	unsigned		mi_first_entry_offset;
+	unsigned long		mi_entries_per_block;
+	struct nilfs_palloc_cache *mi_palloc_cache;
+	struct nilfs_shadow_map *mi_shadow;
+	unsigned long		mi_blocks_per_group;
+	unsigned long		mi_blocks_per_desc_block;
+};
+
+static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
+{
+	return inode->i_private;
+}
+
+/* Default GFP flags using highmem */
+#define NILFS_MDT_GFP      (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+
+int nilfs_mdt_get_block(struct inode *, unsigned long, int,
+			void (*init_block)(struct inode *,
+					   struct buffer_head *, void *),
+			struct buffer_head **);
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+			 unsigned long end, unsigned long *blkoff,
+			 struct buffer_head **out_bh);
+int nilfs_mdt_delete_block(struct inode *, unsigned long);
+int nilfs_mdt_forget_block(struct inode *, unsigned long);
+int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
+int nilfs_mdt_fetch_dirty(struct inode *);
+
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
+void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
+
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+			       struct nilfs_shadow_map *shadow);
+int nilfs_mdt_save_to_shadow_map(struct inode *inode);
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
+void nilfs_mdt_clear_shadow_map(struct inode *inode);
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
+struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
+						struct buffer_head *bh);
+
+static inline void nilfs_mdt_mark_dirty(struct inode *inode)
+{
+	if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
+		set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
+}
+
+static inline void nilfs_mdt_clear_dirty(struct inode *inode)
+{
+	clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
+}
+
+static inline __u64 nilfs_mdt_cno(struct inode *inode)
+{
+	return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
+}
+
+static inline spinlock_t *
+nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group)
+{
+	return bgl_lock_ptr(NILFS_MDT(inode)->mi_bgl, block_group);
+}
+
+#endif /* _NILFS_MDT_H */
diff --git a/kernel/fs/nilfs2/namei.c b/kernel/fs/nilfs2/namei.c
new file mode 100644
index 000000000..22180836e
--- /dev/null
+++ b/kernel/fs/nilfs2/namei.c
@@ -0,0 +1,585 @@
+/*
+ * namei.c - NILFS pathname lookup operations.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
+ *                       Ryusuke Konishi <ryusuke@osrg.net>
+ */
+/*
+ *  linux/fs/ext2/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/pagemap.h>
+#include "nilfs.h"
+#include "export.h"
+
+#define NILFS_FID_SIZE_NON_CONNECTABLE \
+	(offsetof(struct nilfs_fid, parent_gen) / 4)
+#define NILFS_FID_SIZE_CONNECTABLE	(sizeof(struct nilfs_fid) / 4)
+
+static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = nilfs_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+/*
+ * Methods themselves.
+ */
+
+static struct dentry *
+nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode;
+	ino_t ino;
+
+	if (dentry->d_name.len > NILFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = nilfs_inode_by_name(dir, &dentry->d_name);
+	inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL;
+	return d_splice_alias(inode, dentry);
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			bool excl)
+{
+	struct inode *inode;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+	if (err)
+		return err;
+	inode = nilfs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &nilfs_file_inode_operations;
+		inode->i_fop = &nilfs_file_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+		nilfs_mark_inode_dirty(inode);
+		err = nilfs_add_nondir(dentry, inode);
+	}
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+
+	return err;
+}
+
+static int
+nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	struct inode *inode;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+	if (err)
+		return err;
+	inode = nilfs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+		nilfs_mark_inode_dirty(inode);
+		err = nilfs_add_nondir(dentry, inode);
+	}
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+
+	return err;
+}
+
+static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct nilfs_transaction_info ti;
+	struct super_block *sb = dir->i_sb;
+	unsigned l = strlen(symname)+1;
+	struct inode *inode;
+	int err;
+
+	if (l > sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+	if (err)
+		return err;
+
+	inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	/* slow symlink */
+	inode->i_op = &nilfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &nilfs_aops;
+	err = page_symlink(inode, symname, l);
+	if (err)
+		goto out_fail;
+
+	/* mark_inode_dirty(inode); */
+	/* page_symlink() do this */
+
+	err = nilfs_add_nondir(dentry, inode);
+out:
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+
+	return err;
+
+out_fail:
+	drop_nlink(inode);
+	nilfs_mark_inode_dirty(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	goto out;
+}
+
+static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+	if (err)
+		return err;
+
+	inode->i_ctime = CURRENT_TIME;
+	inode_inc_link_count(inode);
+	ihold(inode);
+
+	err = nilfs_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		err = nilfs_transaction_commit(dir->i_sb);
+	} else {
+		inode_dec_link_count(inode);
+		iput(inode);
+		nilfs_transaction_abort(dir->i_sb);
+	}
+
+	return err;
+}
+
+static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+	if (err)
+		return err;
+
+	inc_nlink(dir);
+
+	inode = nilfs_new_inode(dir, S_IFDIR | mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	inode->i_op = &nilfs_dir_inode_operations;
+	inode->i_fop = &nilfs_dir_operations;
+	inode->i_mapping->a_ops = &nilfs_aops;
+
+	inc_nlink(inode);
+
+	err = nilfs_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = nilfs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	nilfs_mark_inode_dirty(inode);
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+out:
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+
+	return err;
+
+out_fail:
+	drop_nlink(inode);
+	drop_nlink(inode);
+	nilfs_mark_inode_dirty(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+out_dir:
+	drop_nlink(dir);
+	nilfs_mark_inode_dirty(dir);
+	goto out;
+}
+
+static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode;
+	struct nilfs_dir_entry *de;
+	struct page *page;
+	int err;
+
+	err = -ENOENT;
+	de = nilfs_find_entry(dir, &dentry->d_name, &page);
+	if (!de)
+		goto out;
+
+	inode = d_inode(dentry);
+	err = -EIO;
+	if (le64_to_cpu(de->inode) != inode->i_ino)
+		goto out;
+
+	if (!inode->i_nlink) {
+		nilfs_warning(inode->i_sb, __func__,
+			      "deleting nonexistent file (%lu), %d\n",
+			      inode->i_ino, inode->i_nlink);
+		set_nlink(inode, 1);
+	}
+	err = nilfs_delete_entry(de, page);
+	if (err)
+		goto out;
+
+	inode->i_ctime = dir->i_ctime;
+	drop_nlink(inode);
+	err = 0;
+out:
+	return err;
+}
+
+static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
+	if (err)
+		return err;
+
+	err = nilfs_do_unlink(dir, dentry);
+
+	if (!err) {
+		nilfs_mark_inode_dirty(dir);
+		nilfs_mark_inode_dirty(d_inode(dentry));
+		err = nilfs_transaction_commit(dir->i_sb);
+	} else
+		nilfs_transaction_abort(dir->i_sb);
+
+	return err;
+}
+
+static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
+	if (err)
+		return err;
+
+	err = -ENOTEMPTY;
+	if (nilfs_empty_dir(inode)) {
+		err = nilfs_do_unlink(dir, dentry);
+		if (!err) {
+			inode->i_size = 0;
+			drop_nlink(inode);
+			nilfs_mark_inode_dirty(inode);
+			drop_nlink(dir);
+			nilfs_mark_inode_dirty(dir);
+		}
+	}
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+
+	return err;
+}
+
+static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir,	struct dentry *new_dentry)
+{
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct page *dir_page = NULL;
+	struct nilfs_dir_entry *dir_de = NULL;
+	struct page *old_page;
+	struct nilfs_dir_entry *old_de;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
+	if (unlikely(err))
+		return err;
+
+	err = -ENOENT;
+	old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = nilfs_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page *new_page;
+		struct nilfs_dir_entry *new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !nilfs_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
+		if (!new_de)
+			goto out_dir;
+		nilfs_set_link(new_dir, new_de, new_page, old_inode);
+		nilfs_mark_inode_dirty(new_dir);
+		new_inode->i_ctime = CURRENT_TIME;
+		if (dir_de)
+			drop_nlink(new_inode);
+		drop_nlink(new_inode);
+		nilfs_mark_inode_dirty(new_inode);
+	} else {
+		err = nilfs_add_link(new_dentry, old_inode);
+		if (err)
+			goto out_dir;
+		if (dir_de) {
+			inc_nlink(new_dir);
+			nilfs_mark_inode_dirty(new_dir);
+		}
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old_inode->i_ctime = CURRENT_TIME;
+
+	nilfs_delete_entry(old_de, old_page);
+
+	if (dir_de) {
+		nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
+		drop_nlink(old_dir);
+	}
+	nilfs_mark_inode_dirty(old_dir);
+	nilfs_mark_inode_dirty(old_inode);
+
+	err = nilfs_transaction_commit(old_dir->i_sb);
+	return err;
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	nilfs_transaction_abort(old_dir->i_sb);
+	return err;
+}
+
+/*
+ * Export operations
+ */
+static struct dentry *nilfs_get_parent(struct dentry *child)
+{
+	unsigned long ino;
+	struct inode *inode;
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	struct nilfs_root *root;
+
+	ino = nilfs_inode_by_name(d_inode(child), &dotdot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+
+	root = NILFS_I(d_inode(child))->i_root;
+
+	inode = nilfs_iget(d_inode(child)->i_sb, root, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
+				       u64 ino, u32 gen)
+{
+	struct nilfs_root *root;
+	struct inode *inode;
+
+	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+
+	root = nilfs_lookup_root(sb->s_fs_info, cno);
+	if (!root)
+		return ERR_PTR(-ESTALE);
+
+	inode = nilfs_iget(sb, root, ino);
+	nilfs_put_root(root);
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (gen && inode->i_generation != gen) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
+	     fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+	    (fh_type != FILEID_NILFS_WITH_PARENT &&
+	     fh_type != FILEID_NILFS_WITHOUT_PARENT))
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
+}
+
+static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+	    fh_type != FILEID_NILFS_WITH_PARENT)
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
+}
+
+static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+			   struct inode *parent)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+	int type;
+
+	if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) {
+		*lenp = NILFS_FID_SIZE_CONNECTABLE;
+		return FILEID_INVALID;
+	}
+	if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) {
+		*lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
+		return FILEID_INVALID;
+	}
+
+	fid->cno = root->cno;
+	fid->ino = inode->i_ino;
+	fid->gen = inode->i_generation;
+
+	if (parent) {
+		fid->parent_ino = parent->i_ino;
+		fid->parent_gen = parent->i_generation;
+		type = FILEID_NILFS_WITH_PARENT;
+		*lenp = NILFS_FID_SIZE_CONNECTABLE;
+	} else {
+		type = FILEID_NILFS_WITHOUT_PARENT;
+		*lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
+	}
+
+	return type;
+}
+
+const struct inode_operations nilfs_dir_inode_operations = {
+	.create		= nilfs_create,
+	.lookup		= nilfs_lookup,
+	.link		= nilfs_link,
+	.unlink		= nilfs_unlink,
+	.symlink	= nilfs_symlink,
+	.mkdir		= nilfs_mkdir,
+	.rmdir		= nilfs_rmdir,
+	.mknod		= nilfs_mknod,
+	.rename		= nilfs_rename,
+	.setattr	= nilfs_setattr,
+	.permission	= nilfs_permission,
+	.fiemap		= nilfs_fiemap,
+};
+
+const struct inode_operations nilfs_special_inode_operations = {
+	.setattr	= nilfs_setattr,
+	.permission	= nilfs_permission,
+};
+
+const struct inode_operations nilfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.permission     = nilfs_permission,
+};
+
+const struct export_operations nilfs_export_ops = {
+	.encode_fh = nilfs_encode_fh,
+	.fh_to_dentry = nilfs_fh_to_dentry,
+	.fh_to_parent = nilfs_fh_to_parent,
+	.get_parent = nilfs_get_parent,
+};
diff --git a/kernel/fs/nilfs2/nilfs.h b/kernel/fs/nilfs2/nilfs.h
new file mode 100644
index 000000000..385704027
--- /dev/null
+++ b/kernel/fs/nilfs2/nilfs.h
@@ -0,0 +1,354 @@
+/*
+ * nilfs.h - NILFS local header file.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>
+ *            Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#ifndef _NILFS_H
+#define _NILFS_H
+
+#include <linux/kernel.h>
+#include <linux/buffer_head.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/nilfs2_fs.h>
+#include "the_nilfs.h"
+#include "bmap.h"
+
+/**
+ * struct nilfs_inode_info - nilfs inode data in memory
+ * @i_flags: inode flags
+ * @i_state: dynamic state flags
+ * @i_bmap: pointer on i_bmap_data
+ * @i_bmap_data: raw block mapping
+ * @i_xattr: <TODO>
+ * @i_dir_start_lookup: page index of last successful search
+ * @i_cno: checkpoint number for GC inode
+ * @i_btnode_cache: cached pages of b-tree nodes
+ * @i_dirty: list for connecting dirty files
+ * @xattr_sem: semaphore for extended attributes processing
+ * @i_bh: buffer contains disk inode
+ * @i_root: root object of the current filesystem tree
+ * @vfs_inode: VFS inode object
+ */
+struct nilfs_inode_info {
+	__u32 i_flags;
+	unsigned long  i_state;		/* Dynamic state flags */
+	struct nilfs_bmap *i_bmap;
+	struct nilfs_bmap i_bmap_data;
+	__u64 i_xattr;	/* sector_t ??? */
+	__u32 i_dir_start_lookup;
+	__u64 i_cno;		/* check point number for GC inode */
+	struct address_space i_btnode_cache;
+	struct list_head i_dirty;	/* List for connecting dirty files */
+
+#ifdef CONFIG_NILFS_XATTR
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_sem even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
+	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
+					   disk inode */
+	struct nilfs_root *i_root;
+	struct inode vfs_inode;
+};
+
+static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
+{
+	return container_of(inode, struct nilfs_inode_info, vfs_inode);
+}
+
+static inline struct nilfs_inode_info *
+NILFS_BMAP_I(const struct nilfs_bmap *bmap)
+{
+	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
+}
+
+static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
+{
+	struct nilfs_inode_info *ii =
+		container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
+	return &ii->vfs_inode;
+}
+
+/*
+ * Dynamic state flags of NILFS on-memory inode (i_state)
+ */
+enum {
+	NILFS_I_NEW = 0,		/* Inode is newly created */
+	NILFS_I_DIRTY,			/* The file is dirty */
+	NILFS_I_QUEUED,			/* inode is in dirty_files list */
+	NILFS_I_BUSY,			/* inode is grabbed by a segment
+					   constructor */
+	NILFS_I_COLLECTED,		/* All dirty blocks are collected */
+	NILFS_I_UPDATED,		/* The file has been written back */
+	NILFS_I_INODE_SYNC,		/* dsync is not allowed for inode */
+	NILFS_I_BMAP,			/* has bmap and btnode_cache */
+	NILFS_I_GCINODE,		/* inode for GC, on memory only */
+};
+
+/*
+ * commit flags for nilfs_commit_super and nilfs_sync_super
+ */
+enum {
+	NILFS_SB_COMMIT = 0,	/* Commit a super block alternately */
+	NILFS_SB_COMMIT_ALL	/* Commit both super blocks */
+};
+
+/*
+ * Macros to check inode numbers
+ */
+#define NILFS_MDT_INO_BITS   \
+	((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO |	\
+			1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO |	\
+			1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
+
+#define NILFS_SYS_INO_BITS   \
+	((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
+
+#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
+
+#define NILFS_MDT_INODE(sb, ino) \
+	((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
+#define NILFS_VALID_INODE(sb, ino) \
+	((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
+
+/**
+ * struct nilfs_transaction_info: context information for synchronization
+ * @ti_magic: Magic number
+ * @ti_save: Backup of journal_info field of task_struct
+ * @ti_flags: Flags
+ * @ti_count: Nest level
+ */
+struct nilfs_transaction_info {
+	u32			ti_magic;
+	void		       *ti_save;
+				/* This should never used. If this happens,
+				   one of other filesystems has a bug. */
+	unsigned short		ti_flags;
+	unsigned short		ti_count;
+};
+
+/* ti_magic */
+#define NILFS_TI_MAGIC		0xd9e392fb
+
+/* ti_flags */
+#define NILFS_TI_DYNAMIC_ALLOC	0x0001  /* Allocated from slab */
+#define NILFS_TI_SYNC		0x0002	/* Force to construct segment at the
+					   end of transaction. */
+#define NILFS_TI_GC		0x0004	/* GC context */
+#define NILFS_TI_COMMIT		0x0008	/* Change happened or not */
+#define NILFS_TI_WRITER		0x0010	/* Constructor context */
+
+
+int nilfs_transaction_begin(struct super_block *,
+			    struct nilfs_transaction_info *, int);
+int nilfs_transaction_commit(struct super_block *);
+void nilfs_transaction_abort(struct super_block *);
+
+static inline void nilfs_set_transaction_flag(unsigned int flag)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+
+	ti->ti_flags |= flag;
+}
+
+static inline int nilfs_test_transaction_flag(unsigned int flag)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+
+	if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
+		return 0;
+	return !!(ti->ti_flags & flag);
+}
+
+static inline int nilfs_doing_gc(void)
+{
+	return nilfs_test_transaction_flag(NILFS_TI_GC);
+}
+
+static inline int nilfs_doing_construction(void)
+{
+	return nilfs_test_transaction_flag(NILFS_TI_WRITER);
+}
+
+/*
+ * function prototype
+ */
+#ifdef CONFIG_NILFS_POSIX_ACL
+#error "NILFS: not yet supported POSIX ACL"
+extern int nilfs_acl_chmod(struct inode *);
+extern int nilfs_init_acl(struct inode *, struct inode *);
+#else
+static inline int nilfs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
+{
+	inode->i_mode &= ~current_umask();
+	return 0;
+}
+#endif
+
+#define NILFS_ATIME_DISABLE
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define NILFS_FL_INHERITED						\
+	(FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL |		\
+	 FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\
+	 FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL);
+	else
+		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
+}
+
+/* dir.c */
+extern int nilfs_add_link(struct dentry *, struct inode *);
+extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
+extern int nilfs_make_empty(struct inode *, struct inode *);
+extern struct nilfs_dir_entry *
+nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
+extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
+extern int nilfs_empty_dir(struct inode *);
+extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
+extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
+			   struct page *, struct inode *);
+
+/* file.c */
+extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
+
+/* ioctl.c */
+long nilfs_ioctl(struct file *, unsigned int, unsigned long);
+long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
+				       void **);
+
+/* inode.c */
+void nilfs_inode_add_blocks(struct inode *inode, int n);
+void nilfs_inode_sub_blocks(struct inode *inode, int n);
+extern struct inode *nilfs_new_inode(struct inode *, umode_t);
+extern void nilfs_free_inode(struct inode *);
+extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern void nilfs_set_inode_flags(struct inode *);
+extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
+extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+			    unsigned long ino);
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+				unsigned long ino);
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino);
+extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
+				       unsigned long ino, __u64 cno);
+extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
+extern void nilfs_truncate(struct inode *);
+extern void nilfs_evict_inode(struct inode *);
+extern int nilfs_setattr(struct dentry *, struct iattr *);
+extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
+int nilfs_permission(struct inode *inode, int mask);
+int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
+extern int nilfs_inode_dirty(struct inode *);
+int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
+extern int __nilfs_mark_inode_dirty(struct inode *, int);
+extern void nilfs_dirty_inode(struct inode *, int flags);
+int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 __u64 start, __u64 len);
+static inline int nilfs_mark_inode_dirty(struct inode *inode)
+{
+	return __nilfs_mark_inode_dirty(inode, I_DIRTY);
+}
+static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
+{
+	return __nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC);
+}
+
+/* super.c */
+extern struct inode *nilfs_alloc_inode(struct super_block *);
+extern void nilfs_destroy_inode(struct inode *);
+extern __printf(3, 4)
+void nilfs_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void nilfs_warning(struct super_block *, const char *, const char *, ...);
+extern struct nilfs_super_block *
+nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
+extern int nilfs_store_magic_and_option(struct super_block *,
+					struct nilfs_super_block *, char *);
+extern int nilfs_check_feature_compatibility(struct super_block *,
+					     struct nilfs_super_block *);
+extern void nilfs_set_log_cursor(struct nilfs_super_block *,
+				 struct the_nilfs *);
+struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
+					       int flip);
+int nilfs_commit_super(struct super_block *sb, int flag);
+int nilfs_cleanup_super(struct super_block *sb);
+int nilfs_resize_fs(struct super_block *sb, __u64 newsize);
+int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
+			    struct nilfs_root **root);
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
+
+/* gcinode.c */
+int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
+				   struct buffer_head **);
+int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
+				   struct buffer_head **);
+int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
+int nilfs_init_gcinode(struct inode *inode);
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
+
+/* sysfs.c */
+int __init nilfs_sysfs_init(void);
+void nilfs_sysfs_exit(void);
+int nilfs_sysfs_create_device_group(struct super_block *);
+void nilfs_sysfs_delete_device_group(struct the_nilfs *);
+int nilfs_sysfs_create_snapshot_group(struct nilfs_root *);
+void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *);
+
+/*
+ * Inodes and files operations
+ */
+extern const struct file_operations nilfs_dir_operations;
+extern const struct inode_operations nilfs_file_inode_operations;
+extern const struct file_operations nilfs_file_operations;
+extern const struct address_space_operations nilfs_aops;
+extern const struct inode_operations nilfs_dir_inode_operations;
+extern const struct inode_operations nilfs_special_inode_operations;
+extern const struct inode_operations nilfs_symlink_inode_operations;
+
+/*
+ * filesystem type
+ */
+extern struct file_system_type nilfs_fs_type;
+
+
+#endif	/* _NILFS_H */
diff --git a/kernel/fs/nilfs2/page.c b/kernel/fs/nilfs2/page.c
new file mode 100644
index 000000000..45d650add
--- /dev/null
+++ b/kernel/fs/nilfs2/page.c
@@ -0,0 +1,581 @@
+/*
+ * page.c - buffer/page management specific to NILFS
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>,
+ *            Seiji Kihara <kihara@osrg.net>.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/pagevec.h>
+#include <linux/gfp.h>
+#include "nilfs.h"
+#include "page.h"
+#include "mdt.h"
+
+
+#define NILFS_BUFFER_INHERENT_BITS  \
+	((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
+	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked))
+
+static struct buffer_head *
+__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
+		       int blkbits, unsigned long b_state)
+
+{
+	unsigned long first_block;
+	struct buffer_head *bh;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << blkbits, b_state);
+
+	first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
+	bh = nilfs_page_get_nth_block(page, block - first_block);
+
+	touch_buffer(bh);
+	wait_on_buffer(bh);
+	return bh;
+}
+
+struct buffer_head *nilfs_grab_buffer(struct inode *inode,
+				      struct address_space *mapping,
+				      unsigned long blkoff,
+				      unsigned long b_state)
+{
+	int blkbits = inode->i_blkbits;
+	pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
+	struct page *page;
+	struct buffer_head *bh;
+
+	page = grab_cache_page(mapping, index);
+	if (unlikely(!page))
+		return NULL;
+
+	bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
+	if (unlikely(!bh)) {
+		unlock_page(page);
+		page_cache_release(page);
+		return NULL;
+	}
+	return bh;
+}
+
+/**
+ * nilfs_forget_buffer - discard dirty state
+ * @inode: owner inode of the buffer
+ * @bh: buffer head of the buffer to be discarded
+ */
+void nilfs_forget_buffer(struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+	const unsigned long clear_bits =
+		(1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
+		 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+		 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
+
+	lock_buffer(bh);
+	set_mask_bits(&bh->b_state, clear_bits, 0);
+	if (nilfs_page_buffers_clean(page))
+		__nilfs_clear_page_dirty(page);
+
+	bh->b_blocknr = -1;
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+	unlock_buffer(bh);
+	brelse(bh);
+}
+
+/**
+ * nilfs_copy_buffer -- copy buffer data and flags
+ * @dbh: destination buffer
+ * @sbh: source buffer
+ */
+void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
+{
+	void *kaddr0, *kaddr1;
+	unsigned long bits;
+	struct page *spage = sbh->b_page, *dpage = dbh->b_page;
+	struct buffer_head *bh;
+
+	kaddr0 = kmap_atomic(spage);
+	kaddr1 = kmap_atomic(dpage);
+	memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
+	kunmap_atomic(kaddr1);
+	kunmap_atomic(kaddr0);
+
+	dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
+	dbh->b_blocknr = sbh->b_blocknr;
+	dbh->b_bdev = sbh->b_bdev;
+
+	bh = dbh;
+	bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
+	while ((bh = bh->b_this_page) != dbh) {
+		lock_buffer(bh);
+		bits &= bh->b_state;
+		unlock_buffer(bh);
+	}
+	if (bits & (1UL << BH_Uptodate))
+		SetPageUptodate(dpage);
+	else
+		ClearPageUptodate(dpage);
+	if (bits & (1UL << BH_Mapped))
+		SetPageMappedToDisk(dpage);
+	else
+		ClearPageMappedToDisk(dpage);
+}
+
+/**
+ * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
+ * @page: page to be checked
+ *
+ * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
+ * Otherwise, it returns non-zero value.
+ */
+int nilfs_page_buffers_clean(struct page *page)
+{
+	struct buffer_head *bh, *head;
+
+	bh = head = page_buffers(page);
+	do {
+		if (buffer_dirty(bh))
+			return 0;
+		bh = bh->b_this_page;
+	} while (bh != head);
+	return 1;
+}
+
+void nilfs_page_bug(struct page *page)
+{
+	struct address_space *m;
+	unsigned long ino;
+
+	if (unlikely(!page)) {
+		printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
+		return;
+	}
+
+	m = page->mapping;
+	ino = m ? m->host->i_ino : 0;
+
+	printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
+	       "mapping=%p ino=%lu\n",
+	       page, atomic_read(&page->_count),
+	       (unsigned long long)page->index, page->flags, m, ino);
+
+	if (page_has_buffers(page)) {
+		struct buffer_head *bh, *head;
+		int i = 0;
+
+		bh = head = page_buffers(page);
+		do {
+			printk(KERN_CRIT
+			       " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
+			       i++, bh, atomic_read(&bh->b_count),
+			       (unsigned long long)bh->b_blocknr, bh->b_state);
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+}
+
+/**
+ * nilfs_copy_page -- copy the page with buffers
+ * @dst: destination page
+ * @src: source page
+ * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
+ *
+ * This function is for both data pages and btnode pages.  The dirty flag
+ * should be treated by caller.  The page must not be under i/o.
+ * Both src and dst page must be locked
+ */
+static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
+{
+	struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
+	unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
+
+	BUG_ON(PageWriteback(dst));
+
+	sbh = sbufs = page_buffers(src);
+	if (!page_has_buffers(dst))
+		create_empty_buffers(dst, sbh->b_size, 0);
+
+	if (copy_dirty)
+		mask |= (1UL << BH_Dirty);
+
+	dbh = dbufs = page_buffers(dst);
+	do {
+		lock_buffer(sbh);
+		lock_buffer(dbh);
+		dbh->b_state = sbh->b_state & mask;
+		dbh->b_blocknr = sbh->b_blocknr;
+		dbh->b_bdev = sbh->b_bdev;
+		sbh = sbh->b_this_page;
+		dbh = dbh->b_this_page;
+	} while (dbh != dbufs);
+
+	copy_highpage(dst, src);
+
+	if (PageUptodate(src) && !PageUptodate(dst))
+		SetPageUptodate(dst);
+	else if (!PageUptodate(src) && PageUptodate(dst))
+		ClearPageUptodate(dst);
+	if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
+		SetPageMappedToDisk(dst);
+	else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
+		ClearPageMappedToDisk(dst);
+
+	do {
+		unlock_buffer(sbh);
+		unlock_buffer(dbh);
+		sbh = sbh->b_this_page;
+		dbh = dbh->b_this_page;
+	} while (dbh != dbufs);
+}
+
+int nilfs_copy_dirty_pages(struct address_space *dmap,
+			   struct address_space *smap)
+{
+	struct pagevec pvec;
+	unsigned int i;
+	pgoff_t index = 0;
+	int err = 0;
+
+	pagevec_init(&pvec, 0);
+repeat:
+	if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
+				PAGEVEC_SIZE))
+		return 0;
+
+	for (i = 0; i < pagevec_count(&pvec); i++) {
+		struct page *page = pvec.pages[i], *dpage;
+
+		lock_page(page);
+		if (unlikely(!PageDirty(page)))
+			NILFS_PAGE_BUG(page, "inconsistent dirty state");
+
+		dpage = grab_cache_page(dmap, page->index);
+		if (unlikely(!dpage)) {
+			/* No empty page is added to the page cache */
+			err = -ENOMEM;
+			unlock_page(page);
+			break;
+		}
+		if (unlikely(!page_has_buffers(page)))
+			NILFS_PAGE_BUG(page,
+				       "found empty page in dat page cache");
+
+		nilfs_copy_page(dpage, page, 1);
+		__set_page_dirty_nobuffers(dpage);
+
+		unlock_page(dpage);
+		page_cache_release(dpage);
+		unlock_page(page);
+	}
+	pagevec_release(&pvec);
+	cond_resched();
+
+	if (likely(!err))
+		goto repeat;
+	return err;
+}
+
+/**
+ * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
+ * @dmap: destination page cache
+ * @smap: source page cache
+ *
+ * No pages must no be added to the cache during this process.
+ * This must be ensured by the caller.
+ */
+void nilfs_copy_back_pages(struct address_space *dmap,
+			   struct address_space *smap)
+{
+	struct pagevec pvec;
+	unsigned int i, n;
+	pgoff_t index = 0;
+	int err;
+
+	pagevec_init(&pvec, 0);
+repeat:
+	n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
+	if (!n)
+		return;
+	index = pvec.pages[n - 1]->index + 1;
+
+	for (i = 0; i < pagevec_count(&pvec); i++) {
+		struct page *page = pvec.pages[i], *dpage;
+		pgoff_t offset = page->index;
+
+		lock_page(page);
+		dpage = find_lock_page(dmap, offset);
+		if (dpage) {
+			/* override existing page on the destination cache */
+			WARN_ON(PageDirty(dpage));
+			nilfs_copy_page(dpage, page, 0);
+			unlock_page(dpage);
+			page_cache_release(dpage);
+		} else {
+			struct page *page2;
+
+			/* move the page to the destination cache */
+			spin_lock_irq(&smap->tree_lock);
+			page2 = radix_tree_delete(&smap->page_tree, offset);
+			WARN_ON(page2 != page);
+
+			smap->nrpages--;
+			spin_unlock_irq(&smap->tree_lock);
+
+			spin_lock_irq(&dmap->tree_lock);
+			err = radix_tree_insert(&dmap->page_tree, offset, page);
+			if (unlikely(err < 0)) {
+				WARN_ON(err == -EEXIST);
+				page->mapping = NULL;
+				page_cache_release(page); /* for cache */
+			} else {
+				page->mapping = dmap;
+				dmap->nrpages++;
+				if (PageDirty(page))
+					radix_tree_tag_set(&dmap->page_tree,
+							   offset,
+							   PAGECACHE_TAG_DIRTY);
+			}
+			spin_unlock_irq(&dmap->tree_lock);
+		}
+		unlock_page(page);
+	}
+	pagevec_release(&pvec);
+	cond_resched();
+
+	goto repeat;
+}
+
+/**
+ * nilfs_clear_dirty_pages - discard dirty pages in address space
+ * @mapping: address space with dirty pages for discarding
+ * @silent: suppress [true] or print [false] warning messages
+ */
+void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
+{
+	struct pagevec pvec;
+	unsigned int i;
+	pgoff_t index = 0;
+
+	pagevec_init(&pvec, 0);
+
+	while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+				  PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			lock_page(page);
+			nilfs_clear_dirty_page(page, silent);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+/**
+ * nilfs_clear_dirty_page - discard dirty page
+ * @page: dirty page that will be discarded
+ * @silent: suppress [true] or print [false] warning messages
+ */
+void nilfs_clear_dirty_page(struct page *page, bool silent)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!silent) {
+		nilfs_warning(sb, __func__,
+				"discard page: offset %lld, ino %lu",
+				page_offset(page), inode->i_ino);
+	}
+
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+
+	if (page_has_buffers(page)) {
+		struct buffer_head *bh, *head;
+		const unsigned long clear_bits =
+			(1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
+			 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+			 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
+
+		bh = head = page_buffers(page);
+		do {
+			lock_buffer(bh);
+			if (!silent) {
+				nilfs_warning(sb, __func__,
+					"discard block %llu, size %zu",
+					(u64)bh->b_blocknr, bh->b_size);
+			}
+			set_mask_bits(&bh->b_state, clear_bits, 0);
+			unlock_buffer(bh);
+		} while (bh = bh->b_this_page, bh != head);
+	}
+
+	__nilfs_clear_page_dirty(page);
+}
+
+unsigned nilfs_page_count_clean_buffers(struct page *page,
+					unsigned from, unsigned to)
+{
+	unsigned block_start, block_end;
+	struct buffer_head *bh, *head;
+	unsigned nc = 0;
+
+	for (bh = head = page_buffers(page), block_start = 0;
+	     bh != head || !block_start;
+	     block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + bh->b_size;
+		if (block_end > from && block_start < to && !buffer_dirty(bh))
+			nc++;
+	}
+	return nc;
+}
+
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
+{
+	mapping->host = inode;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	mapping->private_data = NULL;
+	mapping->a_ops = &empty_aops;
+}
+
+/*
+ * NILFS2 needs clear_page_dirty() in the following two cases:
+ *
+ * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
+ *    page dirty flags when it copies back pages from the shadow cache
+ *    (gcdat->{i_mapping,i_btnode_cache}) to its original cache
+ *    (dat->{i_mapping,i_btnode_cache}).
+ *
+ * 2) Some B-tree operations like insertion or deletion may dispose buffers
+ *    in dirty state, and this needs to cancel the dirty state of their pages.
+ */
+int __nilfs_clear_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping) {
+		spin_lock_irq(&mapping->tree_lock);
+		if (test_bit(PG_dirty, &page->flags)) {
+			radix_tree_tag_clear(&mapping->page_tree,
+					     page_index(page),
+					     PAGECACHE_TAG_DIRTY);
+			spin_unlock_irq(&mapping->tree_lock);
+			return clear_page_dirty_for_io(page);
+		}
+		spin_unlock_irq(&mapping->tree_lock);
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
+
+/**
+ * nilfs_find_uncommitted_extent - find extent of uncommitted data
+ * @inode: inode
+ * @start_blk: start block offset (in)
+ * @blkoff: start offset of the found extent (out)
+ *
+ * This function searches an extent of buffers marked "delayed" which
+ * starts from a block offset equal to or larger than @start_blk.  If
+ * such an extent was found, this will store the start offset in
+ * @blkoff and return its length in blocks.  Otherwise, zero is
+ * returned.
+ */
+unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
+					    sector_t start_blk,
+					    sector_t *blkoff)
+{
+	unsigned int i;
+	pgoff_t index;
+	unsigned int nblocks_in_page;
+	unsigned long length = 0;
+	sector_t b;
+	struct pagevec pvec;
+	struct page *page;
+
+	if (inode->i_mapping->nrpages == 0)
+		return 0;
+
+	index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	pagevec_init(&pvec, 0);
+
+repeat:
+	pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
+					pvec.pages);
+	if (pvec.nr == 0)
+		return length;
+
+	if (length > 0 && pvec.pages[0]->index > index)
+		goto out;
+
+	b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	i = 0;
+	do {
+		page = pvec.pages[i];
+
+		lock_page(page);
+		if (page_has_buffers(page)) {
+			struct buffer_head *bh, *head;
+
+			bh = head = page_buffers(page);
+			do {
+				if (b < start_blk)
+					continue;
+				if (buffer_delay(bh)) {
+					if (length == 0)
+						*blkoff = b;
+					length++;
+				} else if (length > 0) {
+					goto out_locked;
+				}
+			} while (++b, bh = bh->b_this_page, bh != head);
+		} else {
+			if (length > 0)
+				goto out_locked;
+
+			b += nblocks_in_page;
+		}
+		unlock_page(page);
+
+	} while (++i < pagevec_count(&pvec));
+
+	index = page->index + 1;
+	pagevec_release(&pvec);
+	cond_resched();
+	goto repeat;
+
+out_locked:
+	unlock_page(page);
+out:
+	pagevec_release(&pvec);
+	return length;
+}
diff --git a/kernel/fs/nilfs2/page.h b/kernel/fs/nilfs2/page.h
new file mode 100644
index 000000000..a43b8287d
--- /dev/null
+++ b/kernel/fs/nilfs2/page.h
@@ -0,0 +1,80 @@
+/*
+ * page.h - buffer/page management specific to NILFS
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>,
+ *            Seiji Kihara <kihara@osrg.net>.
+ */
+
+#ifndef _NILFS_PAGE_H
+#define _NILFS_PAGE_H
+
+#include <linux/buffer_head.h>
+#include "nilfs.h"
+
+/*
+ * Extended buffer state bits
+ */
+enum {
+	BH_NILFS_Allocated = BH_PrivateStart,
+	BH_NILFS_Node,
+	BH_NILFS_Volatile,
+	BH_NILFS_Checked,
+	BH_NILFS_Redirected,
+};
+
+BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
+BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
+BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
+BUFFER_FNS(NILFS_Redirected, nilfs_redirected)	/* redirected to a copy */
+
+
+int __nilfs_clear_page_dirty(struct page *);
+
+struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
+				      unsigned long, unsigned long);
+void nilfs_forget_buffer(struct buffer_head *);
+void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
+int nilfs_page_buffers_clean(struct page *);
+void nilfs_page_bug(struct page *);
+
+int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
+void nilfs_copy_back_pages(struct address_space *, struct address_space *);
+void nilfs_clear_dirty_page(struct page *, bool);
+void nilfs_clear_dirty_pages(struct address_space *, bool);
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
+unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
+unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
+					    sector_t start_blk,
+					    sector_t *blkoff);
+
+#define NILFS_PAGE_BUG(page, m, a...) \
+	do { nilfs_page_bug(page); BUG(); } while (0)
+
+static inline struct buffer_head *
+nilfs_page_get_nth_block(struct page *page, unsigned int count)
+{
+	struct buffer_head *bh = page_buffers(page);
+
+	while (count-- > 0)
+		bh = bh->b_this_page;
+	get_bh(bh);
+	return bh;
+}
+
+#endif /* _NILFS_PAGE_H */
diff --git a/kernel/fs/nilfs2/recovery.c b/kernel/fs/nilfs2/recovery.c
new file mode 100644
index 000000000..ff00a0b7a
--- /dev/null
+++ b/kernel/fs/nilfs2/recovery.c
@@ -0,0 +1,964 @@
+/*
+ * recovery.c - NILFS recovery logic
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include "nilfs.h"
+#include "segment.h"
+#include "sufile.h"
+#include "page.h"
+#include "segbuf.h"
+
+/*
+ * Segment check result
+ */
+enum {
+	NILFS_SEG_VALID,
+	NILFS_SEG_NO_SUPER_ROOT,
+	NILFS_SEG_FAIL_IO,
+	NILFS_SEG_FAIL_MAGIC,
+	NILFS_SEG_FAIL_SEQ,
+	NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
+	NILFS_SEG_FAIL_CHECKSUM_FULL,
+	NILFS_SEG_FAIL_CONSISTENCY,
+};
+
+/* work structure for recovery */
+struct nilfs_recovery_block {
+	ino_t ino;		/* Inode number of the file that this block
+				   belongs to */
+	sector_t blocknr;	/* block number */
+	__u64 vblocknr;		/* virtual block number */
+	unsigned long blkoff;	/* File offset of the data block (per block) */
+	struct list_head list;
+};
+
+
+static int nilfs_warn_segment_error(int err)
+{
+	switch (err) {
+	case NILFS_SEG_FAIL_IO:
+		printk(KERN_WARNING
+		       "NILFS warning: I/O error on loading last segment\n");
+		return -EIO;
+	case NILFS_SEG_FAIL_MAGIC:
+		printk(KERN_WARNING
+		       "NILFS warning: Segment magic number invalid\n");
+		break;
+	case NILFS_SEG_FAIL_SEQ:
+		printk(KERN_WARNING
+		       "NILFS warning: Sequence number mismatch\n");
+		break;
+	case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
+		printk(KERN_WARNING
+		       "NILFS warning: Checksum error in super root\n");
+		break;
+	case NILFS_SEG_FAIL_CHECKSUM_FULL:
+		printk(KERN_WARNING
+		       "NILFS warning: Checksum error in segment payload\n");
+		break;
+	case NILFS_SEG_FAIL_CONSISTENCY:
+		printk(KERN_WARNING
+		       "NILFS warning: Inconsistent segment\n");
+		break;
+	case NILFS_SEG_NO_SUPER_ROOT:
+		printk(KERN_WARNING
+		       "NILFS warning: No super root in the last segment\n");
+		break;
+	}
+	return -EINVAL;
+}
+
+/**
+ * nilfs_compute_checksum - compute checksum of blocks continuously
+ * @nilfs: nilfs object
+ * @bhs: buffer head of start block
+ * @sum: place to store result
+ * @offset: offset bytes in the first block
+ * @check_bytes: number of bytes to be checked
+ * @start: DBN of start block
+ * @nblock: number of blocks to be checked
+ */
+static int nilfs_compute_checksum(struct the_nilfs *nilfs,
+				  struct buffer_head *bhs, u32 *sum,
+				  unsigned long offset, u64 check_bytes,
+				  sector_t start, unsigned long nblock)
+{
+	unsigned int blocksize = nilfs->ns_blocksize;
+	unsigned long size;
+	u32 crc;
+
+	BUG_ON(offset >= blocksize);
+	check_bytes -= offset;
+	size = min_t(u64, check_bytes, blocksize - offset);
+	crc = crc32_le(nilfs->ns_crc_seed,
+		       (unsigned char *)bhs->b_data + offset, size);
+	if (--nblock > 0) {
+		do {
+			struct buffer_head *bh;
+
+			bh = __bread(nilfs->ns_bdev, ++start, blocksize);
+			if (!bh)
+				return -EIO;
+			check_bytes -= size;
+			size = min_t(u64, check_bytes, blocksize);
+			crc = crc32_le(crc, bh->b_data, size);
+			brelse(bh);
+		} while (--nblock > 0);
+	}
+	*sum = crc;
+	return 0;
+}
+
+/**
+ * nilfs_read_super_root_block - read super root block
+ * @nilfs: nilfs object
+ * @sr_block: disk block number of the super root block
+ * @pbh: address of a buffer_head pointer to return super root buffer
+ * @check: CRC check flag
+ */
+int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
+				struct buffer_head **pbh, int check)
+{
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *sr;
+	u32 crc;
+	int ret;
+
+	*pbh = NULL;
+	bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize);
+	if (unlikely(!bh_sr)) {
+		ret = NILFS_SEG_FAIL_IO;
+		goto failed;
+	}
+
+	sr = (struct nilfs_super_root *)bh_sr->b_data;
+	if (check) {
+		unsigned bytes = le16_to_cpu(sr->sr_bytes);
+
+		if (bytes == 0 || bytes > nilfs->ns_blocksize) {
+			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
+			goto failed_bh;
+		}
+		if (nilfs_compute_checksum(
+			    nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes,
+			    sr_block, 1)) {
+			ret = NILFS_SEG_FAIL_IO;
+			goto failed_bh;
+		}
+		if (crc != le32_to_cpu(sr->sr_sum)) {
+			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
+			goto failed_bh;
+		}
+	}
+	*pbh = bh_sr;
+	return 0;
+
+ failed_bh:
+	brelse(bh_sr);
+
+ failed:
+	return nilfs_warn_segment_error(ret);
+}
+
+/**
+ * nilfs_read_log_header - read summary header of the specified log
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: pointer to return segment summary structure
+ */
+static struct buffer_head *
+nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr,
+		      struct nilfs_segment_summary **sum)
+{
+	struct buffer_head *bh_sum;
+
+	bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
+	if (bh_sum)
+		*sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+	return bh_sum;
+}
+
+/**
+ * nilfs_validate_log - verify consistency of log
+ * @nilfs: nilfs object
+ * @seg_seq: sequence number of segment
+ * @bh_sum: buffer head of summary block
+ * @sum: segment summary struct
+ */
+static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq,
+			      struct buffer_head *bh_sum,
+			      struct nilfs_segment_summary *sum)
+{
+	unsigned long nblock;
+	u32 crc;
+	int ret;
+
+	ret = NILFS_SEG_FAIL_MAGIC;
+	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC)
+		goto out;
+
+	ret = NILFS_SEG_FAIL_SEQ;
+	if (le64_to_cpu(sum->ss_seq) != seg_seq)
+		goto out;
+
+	nblock = le32_to_cpu(sum->ss_nblocks);
+	ret = NILFS_SEG_FAIL_CONSISTENCY;
+	if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment))
+		/* This limits the number of blocks read in the CRC check */
+		goto out;
+
+	ret = NILFS_SEG_FAIL_IO;
+	if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum),
+				   ((u64)nblock << nilfs->ns_blocksize_bits),
+				   bh_sum->b_blocknr, nblock))
+		goto out;
+
+	ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
+	if (crc != le32_to_cpu(sum->ss_datasum))
+		goto out;
+	ret = 0;
+out:
+	return ret;
+}
+
+/**
+ * nilfs_read_summary_info - read an item on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be read
+ */
+static void *nilfs_read_summary_info(struct the_nilfs *nilfs,
+				     struct buffer_head **pbh,
+				     unsigned int *offset, unsigned int bytes)
+{
+	void *ptr;
+	sector_t blocknr;
+
+	BUG_ON((*pbh)->b_size < *offset);
+	if (bytes > (*pbh)->b_size - *offset) {
+		blocknr = (*pbh)->b_blocknr;
+		brelse(*pbh);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + 1,
+			       nilfs->ns_blocksize);
+		if (unlikely(!*pbh))
+			return NULL;
+		*offset = 0;
+	}
+	ptr = (*pbh)->b_data + *offset;
+	*offset += bytes;
+	return ptr;
+}
+
+/**
+ * nilfs_skip_summary_info - skip items on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be skipped
+ * @count: number of items to be skipped
+ */
+static void nilfs_skip_summary_info(struct the_nilfs *nilfs,
+				    struct buffer_head **pbh,
+				    unsigned int *offset, unsigned int bytes,
+				    unsigned long count)
+{
+	unsigned int rest_item_in_current_block
+		= ((*pbh)->b_size - *offset) / bytes;
+
+	if (count <= rest_item_in_current_block) {
+		*offset += bytes * count;
+	} else {
+		sector_t blocknr = (*pbh)->b_blocknr;
+		unsigned int nitem_per_block = (*pbh)->b_size / bytes;
+		unsigned int bcnt;
+
+		count -= rest_item_in_current_block;
+		bcnt = DIV_ROUND_UP(count, nitem_per_block);
+		*offset = bytes * (count - (bcnt - 1) * nitem_per_block);
+
+		brelse(*pbh);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + bcnt,
+			       nilfs->ns_blocksize);
+	}
+}
+
+/**
+ * nilfs_scan_dsync_log - get block information of a log written for data sync
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: log summary information
+ * @head: list head to add nilfs_recovery_block struct
+ */
+static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
+				struct nilfs_segment_summary *sum,
+				struct list_head *head)
+{
+	struct buffer_head *bh;
+	unsigned int offset;
+	u32 nfinfo, sumbytes;
+	sector_t blocknr;
+	ino_t ino;
+	int err = -EIO;
+
+	nfinfo = le32_to_cpu(sum->ss_nfinfo);
+	if (!nfinfo)
+		return 0;
+
+	sumbytes = le32_to_cpu(sum->ss_sumbytes);
+	blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize);
+	bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
+	if (unlikely(!bh))
+		goto out;
+
+	offset = le16_to_cpu(sum->ss_bytes);
+	for (;;) {
+		unsigned long nblocks, ndatablk, nnodeblk;
+		struct nilfs_finfo *finfo;
+
+		finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+						sizeof(*finfo));
+		if (unlikely(!finfo))
+			goto out;
+
+		ino = le64_to_cpu(finfo->fi_ino);
+		nblocks = le32_to_cpu(finfo->fi_nblocks);
+		ndatablk = le32_to_cpu(finfo->fi_ndatablk);
+		nnodeblk = nblocks - ndatablk;
+
+		while (ndatablk-- > 0) {
+			struct nilfs_recovery_block *rb;
+			struct nilfs_binfo_v *binfo;
+
+			binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+							sizeof(*binfo));
+			if (unlikely(!binfo))
+				goto out;
+
+			rb = kmalloc(sizeof(*rb), GFP_NOFS);
+			if (unlikely(!rb)) {
+				err = -ENOMEM;
+				goto out;
+			}
+			rb->ino = ino;
+			rb->blocknr = blocknr++;
+			rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr);
+			rb->blkoff = le64_to_cpu(binfo->bi_blkoff);
+			/* INIT_LIST_HEAD(&rb->list); */
+			list_add_tail(&rb->list, head);
+		}
+		if (--nfinfo == 0)
+			break;
+		blocknr += nnodeblk; /* always 0 for data sync logs */
+		nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64),
+					nnodeblk);
+		if (unlikely(!bh))
+			goto out;
+	}
+	err = 0;
+ out:
+	brelse(bh);   /* brelse(NULL) is just ignored */
+	return err;
+}
+
+static void dispose_recovery_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct nilfs_recovery_block *rb;
+
+		rb = list_first_entry(head, struct nilfs_recovery_block, list);
+		list_del(&rb->list);
+		kfree(rb);
+	}
+}
+
+struct nilfs_segment_entry {
+	struct list_head	list;
+	__u64			segnum;
+};
+
+static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
+{
+	struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
+
+	if (unlikely(!ent))
+		return -ENOMEM;
+
+	ent->segnum = segnum;
+	INIT_LIST_HEAD(&ent->list);
+	list_add_tail(&ent->list, head);
+	return 0;
+}
+
+void nilfs_dispose_segment_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct nilfs_segment_entry *ent;
+
+		ent = list_first_entry(head, struct nilfs_segment_entry, list);
+		list_del(&ent->list);
+		kfree(ent);
+	}
+}
+
+static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
+					      struct super_block *sb,
+					      struct nilfs_recovery_info *ri)
+{
+	struct list_head *head = &ri->ri_used_segments;
+	struct nilfs_segment_entry *ent, *n;
+	struct inode *sufile = nilfs->ns_sufile;
+	__u64 segnum[4];
+	int err;
+	int i;
+
+	segnum[0] = nilfs->ns_segnum;
+	segnum[1] = nilfs->ns_nextnum;
+	segnum[2] = ri->ri_segnum;
+	segnum[3] = ri->ri_nextnum;
+
+	/*
+	 * Releasing the next segment of the latest super root.
+	 * The next segment is invalidated by this recovery.
+	 */
+	err = nilfs_sufile_free(sufile, segnum[1]);
+	if (unlikely(err))
+		goto failed;
+
+	for (i = 1; i < 4; i++) {
+		err = nilfs_segment_list_add(head, segnum[i]);
+		if (unlikely(err))
+			goto failed;
+	}
+
+	/*
+	 * Collecting segments written after the latest super root.
+	 * These are marked dirty to avoid being reallocated in the next write.
+	 */
+	list_for_each_entry_safe(ent, n, head, list) {
+		if (ent->segnum != segnum[0]) {
+			err = nilfs_sufile_scrap(sufile, ent->segnum);
+			if (unlikely(err))
+				goto failed;
+		}
+		list_del(&ent->list);
+		kfree(ent);
+	}
+
+	/* Allocate new segments for recovery */
+	err = nilfs_sufile_alloc(sufile, &segnum[0]);
+	if (unlikely(err))
+		goto failed;
+
+	nilfs->ns_pseg_offset = 0;
+	nilfs->ns_seg_seq = ri->ri_seq + 2;
+	nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0];
+
+ failed:
+	/* No need to recover sufile because it will be destroyed on error */
+	return err;
+}
+
+static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
+				     struct nilfs_recovery_block *rb,
+				     struct page *page)
+{
+	struct buffer_head *bh_org;
+	void *kaddr;
+
+	bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
+	if (unlikely(!bh_org))
+		return -EIO;
+
+	kaddr = kmap_atomic(page);
+	memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
+	kunmap_atomic(kaddr);
+	brelse(bh_org);
+	return 0;
+}
+
+static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
+				      struct super_block *sb,
+				      struct nilfs_root *root,
+				      struct list_head *head,
+				      unsigned long *nr_salvaged_blocks)
+{
+	struct inode *inode;
+	struct nilfs_recovery_block *rb, *n;
+	unsigned blocksize = nilfs->ns_blocksize;
+	struct page *page;
+	loff_t pos;
+	int err = 0, err2 = 0;
+
+	list_for_each_entry_safe(rb, n, head, list) {
+		inode = nilfs_iget(sb, root, rb->ino);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			inode = NULL;
+			goto failed_inode;
+		}
+
+		pos = rb->blkoff << inode->i_blkbits;
+		err = block_write_begin(inode->i_mapping, pos, blocksize,
+					0, &page, nilfs_get_block);
+		if (unlikely(err)) {
+			loff_t isize = inode->i_size;
+			if (pos + blocksize > isize)
+				nilfs_write_failed(inode->i_mapping,
+							pos + blocksize);
+			goto failed_inode;
+		}
+
+		err = nilfs_recovery_copy_block(nilfs, rb, page);
+		if (unlikely(err))
+			goto failed_page;
+
+		err = nilfs_set_file_dirty(inode, 1);
+		if (unlikely(err))
+			goto failed_page;
+
+		block_write_end(NULL, inode->i_mapping, pos, blocksize,
+				blocksize, page, NULL);
+
+		unlock_page(page);
+		page_cache_release(page);
+
+		(*nr_salvaged_blocks)++;
+		goto next;
+
+ failed_page:
+		unlock_page(page);
+		page_cache_release(page);
+
+ failed_inode:
+		printk(KERN_WARNING
+		       "NILFS warning: error recovering data block "
+		       "(err=%d, ino=%lu, block-offset=%llu)\n",
+		       err, (unsigned long)rb->ino,
+		       (unsigned long long)rb->blkoff);
+		if (!err2)
+			err2 = err;
+ next:
+		iput(inode); /* iput(NULL) is just ignored */
+		list_del_init(&rb->list);
+		kfree(rb);
+	}
+	return err2;
+}
+
+/**
+ * nilfs_do_roll_forward - salvage logical segments newer than the latest
+ * checkpoint
+ * @nilfs: nilfs object
+ * @sb: super block instance
+ * @ri: pointer to a nilfs_recovery_info
+ */
+static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
+				 struct super_block *sb,
+				 struct nilfs_root *root,
+				 struct nilfs_recovery_info *ri)
+{
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
+	sector_t pseg_start;
+	sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
+	unsigned long nsalvaged_blocks = 0;
+	unsigned int flags;
+	u64 seg_seq;
+	__u64 segnum, nextnum = 0;
+	int empty_seg = 0;
+	int err = 0, ret;
+	LIST_HEAD(dsync_blocks);  /* list of data blocks to be recovered */
+	enum {
+		RF_INIT_ST,
+		RF_DSYNC_ST,   /* scanning data-sync segments */
+	};
+	int state = RF_INIT_ST;
+
+	pseg_start = ri->ri_lsegs_start;
+	seg_seq = ri->ri_lsegs_start_seq;
+	segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
+	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+
+	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
+		brelse(bh_sum);
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum) {
+			err = -EIO;
+			goto failed;
+		}
+
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
+		if (ret) {
+			if (ret == NILFS_SEG_FAIL_IO) {
+				err = -EIO;
+				goto failed;
+			}
+			goto strayed;
+		}
+
+		flags = le16_to_cpu(sum->ss_flags);
+		if (flags & NILFS_SS_SR)
+			goto confused;
+
+		/* Found a valid partial segment; do recovery actions */
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
+		empty_seg = 0;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
+		if (!(flags & NILFS_SS_GC))
+			nilfs->ns_nongc_ctime = nilfs->ns_ctime;
+
+		switch (state) {
+		case RF_INIT_ST:
+			if (!(flags & NILFS_SS_LOGBGN) ||
+			    !(flags & NILFS_SS_SYNDT))
+				goto try_next_pseg;
+			state = RF_DSYNC_ST;
+			/* Fall through */
+		case RF_DSYNC_ST:
+			if (!(flags & NILFS_SS_SYNDT))
+				goto confused;
+
+			err = nilfs_scan_dsync_log(nilfs, pseg_start, sum,
+						   &dsync_blocks);
+			if (unlikely(err))
+				goto failed;
+			if (flags & NILFS_SS_LOGEND) {
+				err = nilfs_recover_dsync_blocks(
+					nilfs, sb, root, &dsync_blocks,
+					&nsalvaged_blocks);
+				if (unlikely(err))
+					goto failed;
+				state = RF_INIT_ST;
+			}
+			break; /* Fall through to try_next_pseg */
+		}
+
+ try_next_pseg:
+		if (pseg_start == ri->ri_lsegs_end)
+			break;
+		pseg_start += le32_to_cpu(sum->ss_nblocks);
+		if (pseg_start < seg_end)
+			continue;
+		goto feed_segment;
+
+ strayed:
+		if (pseg_start == ri->ri_lsegs_end)
+			break;
+
+ feed_segment:
+		/* Looking to the next full segment */
+		if (empty_seg++)
+			break;
+		seg_seq++;
+		segnum = nextnum;
+		nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+		pseg_start = seg_start;
+	}
+
+	if (nsalvaged_blocks) {
+		printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
+		       sb->s_id, nsalvaged_blocks);
+		ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
+	}
+ out:
+	brelse(bh_sum);
+	dispose_recovery_list(&dsync_blocks);
+	return err;
+
+ confused:
+	err = -EINVAL;
+ failed:
+	printk(KERN_ERR
+	       "NILFS (device %s): Error roll-forwarding "
+	       "(err=%d, pseg block=%llu). ",
+	       sb->s_id, err, (unsigned long long)pseg_start);
+	goto out;
+}
+
+static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
+				      struct nilfs_recovery_info *ri)
+{
+	struct buffer_head *bh;
+	int err;
+
+	if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) !=
+	    nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
+		return;
+
+	bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize);
+	BUG_ON(!bh);
+	memset(bh->b_data, 0, bh->b_size);
+	set_buffer_dirty(bh);
+	err = sync_dirty_buffer(bh);
+	if (unlikely(err))
+		printk(KERN_WARNING
+		       "NILFS warning: buffer sync write failed during "
+		       "post-cleaning of recovery.\n");
+	brelse(bh);
+}
+
+/**
+ * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
+ * @nilfs: nilfs object
+ * @sb: super block instance
+ * @ri: pointer to a nilfs_recovery_info struct to store search results.
+ *
+ * Return Value: On success, 0 is returned.  On error, one of the following
+ * negative error code is returned.
+ *
+ * %-EINVAL - Inconsistent filesystem state.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
+			      struct super_block *sb,
+			      struct nilfs_recovery_info *ri)
+{
+	struct nilfs_root *root;
+	int err;
+
+	if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
+		return 0;
+
+	err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
+	if (unlikely(err)) {
+		printk(KERN_ERR
+		       "NILFS: error loading the latest checkpoint.\n");
+		return err;
+	}
+
+	err = nilfs_do_roll_forward(nilfs, sb, root, ri);
+	if (unlikely(err))
+		goto failed;
+
+	if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
+		err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
+		if (unlikely(err)) {
+			printk(KERN_ERR "NILFS: Error preparing segments for "
+			       "recovery.\n");
+			goto failed;
+		}
+
+		err = nilfs_attach_log_writer(sb, root);
+		if (unlikely(err))
+			goto failed;
+
+		set_nilfs_discontinued(nilfs);
+		err = nilfs_construct_segment(sb);
+		nilfs_detach_log_writer(sb);
+
+		if (unlikely(err)) {
+			printk(KERN_ERR "NILFS: Oops! recovery failed. "
+			       "(err=%d)\n", err);
+			goto failed;
+		}
+
+		nilfs_finish_roll_forward(nilfs, ri);
+	}
+
+ failed:
+	nilfs_put_root(root);
+	return err;
+}
+
+/**
+ * nilfs_search_super_root - search the latest valid super root
+ * @nilfs: the_nilfs
+ * @ri: pointer to a nilfs_recovery_info struct to store search results.
+ *
+ * nilfs_search_super_root() looks for the latest super-root from a partial
+ * segment pointed by the superblock.  It sets up struct the_nilfs through
+ * this search. It fills nilfs_recovery_info (ri) required for recovery.
+ *
+ * Return Value: On success, 0 is returned.  On error, one of the following
+ * negative error code is returned.
+ *
+ * %-EINVAL - No valid segment found
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_search_super_root(struct the_nilfs *nilfs,
+			    struct nilfs_recovery_info *ri)
+{
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
+	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
+	sector_t seg_start, seg_end; /* range of full segment (block number) */
+	sector_t b, end;
+	unsigned long nblocks;
+	unsigned int flags;
+	u64 seg_seq;
+	__u64 segnum, nextnum = 0;
+	__u64 cno;
+	LIST_HEAD(segments);
+	int empty_seg = 0, scan_newer = 0;
+	int ret;
+
+	pseg_start = nilfs->ns_last_pseg;
+	seg_seq = nilfs->ns_last_seq;
+	cno = nilfs->ns_last_cno;
+	segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
+
+	/* Calculate range of segment */
+	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+
+	/* Read ahead segment */
+	b = seg_start;
+	while (b <= seg_end)
+		__breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize);
+
+	for (;;) {
+		brelse(bh_sum);
+		ret = NILFS_SEG_FAIL_IO;
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum)
+			goto failed;
+
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
+		if (ret) {
+			if (ret == NILFS_SEG_FAIL_IO)
+				goto failed;
+			goto strayed;
+		}
+
+		nblocks = le32_to_cpu(sum->ss_nblocks);
+		pseg_end = pseg_start + nblocks - 1;
+		if (unlikely(pseg_end > seg_end)) {
+			ret = NILFS_SEG_FAIL_CONSISTENCY;
+			goto strayed;
+		}
+
+		/* A valid partial segment */
+		ri->ri_pseg_start = pseg_start;
+		ri->ri_seq = seg_seq;
+		ri->ri_segnum = segnum;
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
+		ri->ri_nextnum = nextnum;
+		empty_seg = 0;
+
+		flags = le16_to_cpu(sum->ss_flags);
+		if (!(flags & NILFS_SS_SR) && !scan_newer) {
+			/* This will never happen because a superblock
+			   (last_segment) always points to a pseg
+			   having a super root. */
+			ret = NILFS_SEG_FAIL_CONSISTENCY;
+			goto failed;
+		}
+
+		if (pseg_start == seg_start) {
+			nilfs_get_segment_range(nilfs, nextnum, &b, &end);
+			while (b <= end)
+				__breadahead(nilfs->ns_bdev, b++,
+					     nilfs->ns_blocksize);
+		}
+		if (!(flags & NILFS_SS_SR)) {
+			if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) {
+				ri->ri_lsegs_start = pseg_start;
+				ri->ri_lsegs_start_seq = seg_seq;
+			}
+			if (flags & NILFS_SS_LOGEND)
+				ri->ri_lsegs_end = pseg_start;
+			goto try_next_pseg;
+		}
+
+		/* A valid super root was found. */
+		ri->ri_cno = cno++;
+		ri->ri_super_root = pseg_end;
+		ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
+
+		nilfs_dispose_segment_list(&segments);
+		sr_pseg_start = pseg_start;
+		nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start;
+		nilfs->ns_seg_seq = seg_seq;
+		nilfs->ns_segnum = segnum;
+		nilfs->ns_cno = cno;  /* nilfs->ns_cno = ri->ri_cno + 1 */
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
+		nilfs->ns_nextnum = nextnum;
+
+		if (scan_newer)
+			ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED;
+		else {
+			if (nilfs->ns_mount_state & NILFS_VALID_FS)
+				goto super_root_found;
+			scan_newer = 1;
+		}
+
+ try_next_pseg:
+		/* Standing on a course, or met an inconsistent state */
+		pseg_start += nblocks;
+		if (pseg_start < seg_end)
+			continue;
+		goto feed_segment;
+
+ strayed:
+		/* Off the trail */
+		if (!scan_newer)
+			/*
+			 * This can happen if a checkpoint was written without
+			 * barriers, or as a result of an I/O failure.
+			 */
+			goto failed;
+
+ feed_segment:
+		/* Looking to the next full segment */
+		if (empty_seg++)
+			goto super_root_found; /* found a valid super root */
+
+		ret = nilfs_segment_list_add(&segments, segnum);
+		if (unlikely(ret))
+			goto failed;
+
+		seg_seq++;
+		segnum = nextnum;
+		nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+		pseg_start = seg_start;
+	}
+
+ super_root_found:
+	/* Updating pointers relating to the latest checkpoint */
+	brelse(bh_sum);
+	list_splice_tail(&segments, &ri->ri_used_segments);
+	nilfs->ns_last_pseg = sr_pseg_start;
+	nilfs->ns_last_seq = nilfs->ns_seg_seq;
+	nilfs->ns_last_cno = ri->ri_cno;
+	return 0;
+
+ failed:
+	brelse(bh_sum);
+	nilfs_dispose_segment_list(&segments);
+	return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
+}
diff --git a/kernel/fs/nilfs2/segbuf.c b/kernel/fs/nilfs2/segbuf.c
new file mode 100644
index 000000000..dc3a9efda
--- /dev/null
+++ b/kernel/fs/nilfs2/segbuf.c
@@ -0,0 +1,536 @@
+/*
+ * segbuf.c - NILFS segment buffer
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/crc32.h>
+#include <linux/backing-dev.h>
+#include <linux/slab.h>
+#include "page.h"
+#include "segbuf.h"
+
+
+struct nilfs_write_info {
+	struct the_nilfs       *nilfs;
+	struct bio	       *bio;
+	int			start, end; /* The region to be submitted */
+	int			rest_blocks;
+	int			max_pages;
+	int			nr_vecs;
+	sector_t		blocknr;
+};
+
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs);
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
+
+struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS);
+	if (unlikely(!segbuf))
+		return NULL;
+
+	segbuf->sb_super = sb;
+	INIT_LIST_HEAD(&segbuf->sb_list);
+	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
+	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
+
+	init_completion(&segbuf->sb_bio_event);
+	atomic_set(&segbuf->sb_err, 0);
+	segbuf->sb_nbio = 0;
+
+	return segbuf;
+}
+
+void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf)
+{
+	kmem_cache_free(nilfs_segbuf_cachep, segbuf);
+}
+
+void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
+		     unsigned long offset, struct the_nilfs *nilfs)
+{
+	segbuf->sb_segnum = segnum;
+	nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start,
+				&segbuf->sb_fseg_end);
+
+	segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset;
+	segbuf->sb_rest_blocks =
+		segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
+}
+
+/**
+ * nilfs_segbuf_map_cont - map a new log behind a given log
+ * @segbuf: new segment buffer
+ * @prev: segment buffer containing a log to be continued
+ */
+void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
+			   struct nilfs_segment_buffer *prev)
+{
+	segbuf->sb_segnum = prev->sb_segnum;
+	segbuf->sb_fseg_start = prev->sb_fseg_start;
+	segbuf->sb_fseg_end = prev->sb_fseg_end;
+	segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks;
+	segbuf->sb_rest_blocks =
+		segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
+}
+
+void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
+				  __u64 nextnum, struct the_nilfs *nilfs)
+{
+	segbuf->sb_nextnum = nextnum;
+	segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum);
+}
+
+int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
+{
+	struct buffer_head *bh;
+
+	bh = sb_getblk(segbuf->sb_super,
+		       segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk);
+	if (unlikely(!bh))
+		return -ENOMEM;
+
+	nilfs_segbuf_add_segsum_buffer(segbuf, bh);
+	return 0;
+}
+
+int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
+				struct buffer_head **bhp)
+{
+	struct buffer_head *bh;
+
+	bh = sb_getblk(segbuf->sb_super,
+		       segbuf->sb_pseg_start + segbuf->sb_sum.nblocks);
+	if (unlikely(!bh))
+		return -ENOMEM;
+
+	nilfs_segbuf_add_payload_buffer(segbuf, bh);
+	*bhp = bh;
+	return 0;
+}
+
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
+		       time_t ctime, __u64 cno)
+{
+	int err;
+
+	segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0;
+	err = nilfs_segbuf_extend_segsum(segbuf);
+	if (unlikely(err))
+		return err;
+
+	segbuf->sb_sum.flags = flags;
+	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
+	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
+	segbuf->sb_sum.ctime = ctime;
+	segbuf->sb_sum.cno = cno;
+	return 0;
+}
+
+/*
+ * Setup segment summary
+ */
+void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
+{
+	struct nilfs_segment_summary *raw_sum;
+	struct buffer_head *bh_sum;
+
+	bh_sum = list_entry(segbuf->sb_segsum_buffers.next,
+			    struct buffer_head, b_assoc_buffers);
+	raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+
+	raw_sum->ss_magic    = cpu_to_le32(NILFS_SEGSUM_MAGIC);
+	raw_sum->ss_bytes    = cpu_to_le16(sizeof(*raw_sum));
+	raw_sum->ss_flags    = cpu_to_le16(segbuf->sb_sum.flags);
+	raw_sum->ss_seq      = cpu_to_le64(segbuf->sb_sum.seg_seq);
+	raw_sum->ss_create   = cpu_to_le64(segbuf->sb_sum.ctime);
+	raw_sum->ss_next     = cpu_to_le64(segbuf->sb_sum.next);
+	raw_sum->ss_nblocks  = cpu_to_le32(segbuf->sb_sum.nblocks);
+	raw_sum->ss_nfinfo   = cpu_to_le32(segbuf->sb_sum.nfinfo);
+	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
+	raw_sum->ss_pad      = 0;
+	raw_sum->ss_cno      = cpu_to_le64(segbuf->sb_sum.cno);
+}
+
+/*
+ * CRC calculation routines
+ */
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
+{
+	struct buffer_head *bh;
+	struct nilfs_segment_summary *raw_sum;
+	unsigned long size, bytes = segbuf->sb_sum.sumbytes;
+	u32 crc;
+
+	bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
+			b_assoc_buffers);
+
+	raw_sum = (struct nilfs_segment_summary *)bh->b_data;
+	size = min_t(unsigned long, bytes, bh->b_size);
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sum +
+		       sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum),
+		       size - (sizeof(raw_sum->ss_datasum) +
+			       sizeof(raw_sum->ss_sumsum)));
+
+	list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
+				     b_assoc_buffers) {
+		bytes -= size;
+		size = min_t(unsigned long, bytes, bh->b_size);
+		crc = crc32_le(crc, bh->b_data, size);
+	}
+	raw_sum->ss_sumsum = cpu_to_le32(crc);
+}
+
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+					  u32 seed)
+{
+	struct buffer_head *bh;
+	struct nilfs_segment_summary *raw_sum;
+	void *kaddr;
+	u32 crc;
+
+	bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
+			b_assoc_buffers);
+	raw_sum = (struct nilfs_segment_summary *)bh->b_data;
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
+		       bh->b_size - sizeof(raw_sum->ss_datasum));
+
+	list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
+				     b_assoc_buffers) {
+		crc = crc32_le(crc, bh->b_data, bh->b_size);
+	}
+	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
+		kaddr = kmap_atomic(bh->b_page);
+		crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
+		kunmap_atomic(kaddr);
+	}
+	raw_sum->ss_datasum = cpu_to_le32(crc);
+}
+
+static void
+nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
+				    u32 seed)
+{
+	struct nilfs_super_root *raw_sr;
+	struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info;
+	unsigned srsize;
+	u32 crc;
+
+	raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+	srsize = NILFS_SR_BYTES(nilfs->ns_inode_size);
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
+		       srsize - sizeof(raw_sr->sr_sum));
+	raw_sr->sr_sum = cpu_to_le32(crc);
+}
+
+static void nilfs_release_buffers(struct list_head *list)
+{
+	struct buffer_head *bh, *n;
+
+	list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh);
+	}
+}
+
+static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
+{
+	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
+	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
+}
+
+/*
+ * Iterators for segment buffers
+ */
+void nilfs_clear_logs(struct list_head *logs)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list)
+		nilfs_segbuf_clear(segbuf);
+}
+
+void nilfs_truncate_logs(struct list_head *logs,
+			 struct nilfs_segment_buffer *last)
+{
+	struct nilfs_segment_buffer *n, *segbuf;
+
+	segbuf = list_prepare_entry(last, logs, sb_list);
+	list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) {
+		list_del_init(&segbuf->sb_list);
+		nilfs_segbuf_clear(segbuf);
+		nilfs_segbuf_free(segbuf);
+	}
+}
+
+int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int ret = 0;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		ret = nilfs_segbuf_write(segbuf, nilfs);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+int nilfs_wait_on_logs(struct list_head *logs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int err, ret = 0;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		err = nilfs_segbuf_wait(segbuf);
+		if (err && !ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/**
+ * nilfs_add_checksums_on_logs - add checksums on the logs
+ * @logs: list of segment buffers storing target logs
+ * @seed: checksum seed value
+ */
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		if (segbuf->sb_super_root)
+			nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
+	}
+}
+
+/*
+ * BIO operations
+ */
+static void nilfs_end_bio_write(struct bio *bio, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct nilfs_segment_buffer *segbuf = bio->bi_private;
+
+	if (err == -EOPNOTSUPP) {
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		/* to be detected by nilfs_segbuf_submit_bio() */
+	}
+
+	if (!uptodate)
+		atomic_inc(&segbuf->sb_err);
+
+	bio_put(bio);
+	complete(&segbuf->sb_bio_event);
+}
+
+static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
+				   struct nilfs_write_info *wi, int mode)
+{
+	struct bio *bio = wi->bio;
+	int err;
+
+	if (segbuf->sb_nbio > 0 &&
+	    bdi_write_congested(segbuf->sb_super->s_bdi)) {
+		wait_for_completion(&segbuf->sb_bio_event);
+		segbuf->sb_nbio--;
+		if (unlikely(atomic_read(&segbuf->sb_err))) {
+			bio_put(bio);
+			err = -EIO;
+			goto failed;
+		}
+	}
+
+	bio->bi_end_io = nilfs_end_bio_write;
+	bio->bi_private = segbuf;
+	bio_get(bio);
+	submit_bio(mode, bio);
+	segbuf->sb_nbio++;
+	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+		bio_put(bio);
+		err = -EOPNOTSUPP;
+		goto failed;
+	}
+	bio_put(bio);
+
+	wi->bio = NULL;
+	wi->rest_blocks -= wi->end - wi->start;
+	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
+	wi->start = wi->end;
+	return 0;
+
+ failed:
+	wi->bio = NULL;
+	return err;
+}
+
+/**
+ * nilfs_alloc_seg_bio - allocate a new bio for writing log
+ * @nilfs: nilfs object
+ * @start: start block number of the bio
+ * @nr_vecs: request size of page vector.
+ *
+ * Return Value: On success, pointer to the struct bio is returned.
+ * On error, NULL is returned.
+ */
+static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
+				       int nr_vecs)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOIO, nr_vecs);
+	if (bio == NULL) {
+		while (!bio && (nr_vecs >>= 1))
+			bio = bio_alloc(GFP_NOIO, nr_vecs);
+	}
+	if (likely(bio)) {
+		bio->bi_bdev = nilfs->ns_bdev;
+		bio->bi_iter.bi_sector =
+			start << (nilfs->ns_blocksize_bits - 9);
+	}
+	return bio;
+}
+
+static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
+				       struct nilfs_write_info *wi)
+{
+	wi->bio = NULL;
+	wi->rest_blocks = segbuf->sb_sum.nblocks;
+	wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev);
+	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
+	wi->start = wi->end = 0;
+	wi->blocknr = segbuf->sb_pseg_start;
+}
+
+static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
+				  struct nilfs_write_info *wi,
+				  struct buffer_head *bh, int mode)
+{
+	int len, err;
+
+	BUG_ON(wi->nr_vecs <= 0);
+ repeat:
+	if (!wi->bio) {
+		wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
+					      wi->nr_vecs);
+		if (unlikely(!wi->bio))
+			return -ENOMEM;
+	}
+
+	len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
+	if (len == bh->b_size) {
+		wi->end++;
+		return 0;
+	}
+	/* bio is FULL */
+	err = nilfs_segbuf_submit_bio(segbuf, wi, mode);
+	/* never submit current bh */
+	if (likely(!err))
+		goto repeat;
+	return err;
+}
+
+/**
+ * nilfs_segbuf_write - submit write requests of a log
+ * @segbuf: buffer storing a log to be written
+ * @nilfs: nilfs object
+ *
+ * Return Value: On Success, 0 is returned. On Error, one of the following
+ * negative error code is returned.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs)
+{
+	struct nilfs_write_info wi;
+	struct buffer_head *bh;
+	int res = 0, rw = WRITE;
+
+	wi.nilfs = nilfs;
+	nilfs_segbuf_prepare_write(segbuf, &wi);
+
+	list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
+		res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
+		if (unlikely(res))
+			goto failed_bio;
+	}
+
+	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
+		res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
+		if (unlikely(res))
+			goto failed_bio;
+	}
+
+	if (wi.bio) {
+		/*
+		 * Last BIO is always sent through the following
+		 * submission.
+		 */
+		rw |= REQ_SYNC;
+		res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
+	}
+
+ failed_bio:
+	return res;
+}
+
+/**
+ * nilfs_segbuf_wait - wait for completion of requested BIOs
+ * @segbuf: segment buffer
+ *
+ * Return Value: On Success, 0 is returned. On Error, one of the following
+ * negative error code is returned.
+ *
+ * %-EIO - I/O error
+ */
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
+{
+	int err = 0;
+
+	if (!segbuf->sb_nbio)
+		return 0;
+
+	do {
+		wait_for_completion(&segbuf->sb_bio_event);
+	} while (--segbuf->sb_nbio > 0);
+
+	if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
+		printk(KERN_ERR "NILFS: IO error writing segment\n");
+		err = -EIO;
+	}
+	return err;
+}
diff --git a/kernel/fs/nilfs2/segbuf.h b/kernel/fs/nilfs2/segbuf.h
new file mode 100644
index 000000000..b04f08cc2
--- /dev/null
+++ b/kernel/fs/nilfs2/segbuf.h
@@ -0,0 +1,184 @@
+/*
+ * segbuf.h - NILFS Segment buffer prototypes and definitions
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+#ifndef _NILFS_SEGBUF_H
+#define _NILFS_SEGBUF_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/completion.h>
+
+/**
+ * struct nilfs_segsum_info - On-memory segment summary
+ * @flags: Flags
+ * @nfinfo: Number of file information structures
+ * @nblocks: Number of blocks included in the partial segment
+ * @nsumblk: Number of summary blocks
+ * @sumbytes: Byte count of segment summary
+ * @nfileblk: Total number of file blocks
+ * @seg_seq: Segment sequence number
+ * @cno: Checkpoint number
+ * @ctime: Creation time
+ * @next: Block number of the next full segment
+ */
+struct nilfs_segsum_info {
+	unsigned int		flags;
+	unsigned long		nfinfo;
+	unsigned long		nblocks;
+	unsigned long		nsumblk;
+	unsigned long		sumbytes;
+	unsigned long		nfileblk;
+	u64			seg_seq;
+	__u64			cno;
+	time_t			ctime;
+	sector_t		next;
+};
+
+/**
+ * struct nilfs_segment_buffer - Segment buffer
+ * @sb_super: back pointer to a superblock struct
+ * @sb_list: List head to chain this structure
+ * @sb_sum: On-memory segment summary
+ * @sb_segnum: Index number of the full segment
+ * @sb_nextnum: Index number of the next full segment
+ * @sb_fseg_start: Start block number of the full segment
+ * @sb_fseg_end: End block number of the full segment
+ * @sb_pseg_start: Disk block number of partial segment
+ * @sb_rest_blocks: Number of residual blocks in the current segment
+ * @sb_segsum_buffers: List of buffers for segment summaries
+ * @sb_payload_buffers: List of buffers for segment payload
+ * @sb_super_root: Pointer to buffer storing a super root block (if exists)
+ * @sb_nbio: Number of flying bio requests
+ * @sb_err: I/O error status
+ * @sb_bio_event: Completion event of log writing
+ */
+struct nilfs_segment_buffer {
+	struct super_block     *sb_super;
+	struct list_head	sb_list;
+
+	/* Segment information */
+	struct nilfs_segsum_info sb_sum;
+	__u64			sb_segnum;
+	__u64			sb_nextnum;
+	sector_t		sb_fseg_start, sb_fseg_end;
+	sector_t		sb_pseg_start;
+	unsigned		sb_rest_blocks;
+
+	/* Buffers */
+	struct list_head	sb_segsum_buffers;
+	struct list_head	sb_payload_buffers; /* including super root */
+	struct buffer_head     *sb_super_root;
+
+	/* io status */
+	int			sb_nbio;
+	atomic_t		sb_err;
+	struct completion	sb_bio_event;
+};
+
+#define NILFS_LIST_SEGBUF(head)  \
+	list_entry((head), struct nilfs_segment_buffer, sb_list)
+#define NILFS_NEXT_SEGBUF(segbuf)  NILFS_LIST_SEGBUF((segbuf)->sb_list.next)
+#define NILFS_PREV_SEGBUF(segbuf)  NILFS_LIST_SEGBUF((segbuf)->sb_list.prev)
+#define NILFS_LAST_SEGBUF(head)    NILFS_LIST_SEGBUF((head)->prev)
+#define NILFS_FIRST_SEGBUF(head)   NILFS_LIST_SEGBUF((head)->next)
+#define NILFS_SEGBUF_IS_LAST(segbuf, head)  ((segbuf)->sb_list.next == (head))
+
+#define nilfs_for_each_segbuf_before(s, t, h) \
+	for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \
+	     (s) = NILFS_NEXT_SEGBUF(s))
+
+#define NILFS_SEGBUF_FIRST_BH(head)  \
+	(list_entry((head)->next, struct buffer_head, b_assoc_buffers))
+#define NILFS_SEGBUF_NEXT_BH(bh)  \
+	(list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \
+		    b_assoc_buffers))
+#define NILFS_SEGBUF_BH_IS_LAST(bh, head)  ((bh)->b_assoc_buffers.next == head)
+
+extern struct kmem_cache *nilfs_segbuf_cachep;
+
+struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
+void nilfs_segbuf_free(struct nilfs_segment_buffer *);
+void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
+		      struct the_nilfs *);
+void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
+			   struct nilfs_segment_buffer *prev);
+void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
+				  struct the_nilfs *);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
+int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
+int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
+				struct buffer_head **);
+void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
+
+static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf)
+{
+	unsigned int flags = segbuf->sb_sum.flags;
+
+	return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==
+		(NILFS_SS_LOGBGN | NILFS_SS_LOGEND);
+}
+
+static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf)
+{
+	return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk;
+}
+
+static inline void
+nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
+			       struct buffer_head *bh)
+{
+	list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers);
+	segbuf->sb_sum.nblocks++;
+	segbuf->sb_sum.nsumblk++;
+}
+
+static inline void
+nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf,
+				struct buffer_head *bh)
+{
+	list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers);
+	segbuf->sb_sum.nblocks++;
+}
+
+static inline void
+nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
+			     struct buffer_head *bh)
+{
+	get_bh(bh);
+	nilfs_segbuf_add_payload_buffer(segbuf, bh);
+	segbuf->sb_sum.nfileblk++;
+}
+
+void nilfs_clear_logs(struct list_head *logs);
+void nilfs_truncate_logs(struct list_head *logs,
+			 struct nilfs_segment_buffer *last);
+int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
+int nilfs_wait_on_logs(struct list_head *logs);
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
+
+static inline void nilfs_destroy_logs(struct list_head *logs)
+{
+	nilfs_truncate_logs(logs, NULL);
+}
+
+#endif /* _NILFS_SEGBUF_H */
diff --git a/kernel/fs/nilfs2/segment.c b/kernel/fs/nilfs2/segment.c
new file mode 100644
index 000000000..c6abbad9b
--- /dev/null
+++ b/kernel/fs/nilfs2/segment.c
@@ -0,0 +1,2758 @@
+/*
+ * segment.c - NILFS segment constructor.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/bitops.h>
+#include <linux/bio.h>
+#include <linux/completion.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/crc32.h>
+#include <linux/pagevec.h>
+#include <linux/slab.h>
+#include "nilfs.h"
+#include "btnode.h"
+#include "page.h"
+#include "segment.h"
+#include "sufile.h"
+#include "cpfile.h"
+#include "ifile.h"
+#include "segbuf.h"
+
+
+/*
+ * Segment constructor
+ */
+#define SC_N_INODEVEC	16   /* Size of locally allocated inode vector */
+
+#define SC_MAX_SEGDELTA 64   /* Upper limit of the number of segments
+				appended in collection retry loop */
+
+/* Construction mode */
+enum {
+	SC_LSEG_SR = 1,	/* Make a logical segment having a super root */
+	SC_LSEG_DSYNC,	/* Flush data blocks of a given file and make
+			   a logical segment without a super root */
+	SC_FLUSH_FILE,	/* Flush data files, leads to segment writes without
+			   creating a checkpoint */
+	SC_FLUSH_DAT,	/* Flush DAT file. This also creates segments without
+			   a checkpoint */
+};
+
+/* Stage numbers of dirty block collection */
+enum {
+	NILFS_ST_INIT = 0,
+	NILFS_ST_GC,		/* Collecting dirty blocks for GC */
+	NILFS_ST_FILE,
+	NILFS_ST_IFILE,
+	NILFS_ST_CPFILE,
+	NILFS_ST_SUFILE,
+	NILFS_ST_DAT,
+	NILFS_ST_SR,		/* Super root */
+	NILFS_ST_DSYNC,		/* Data sync blocks */
+	NILFS_ST_DONE,
+};
+
+/* State flags of collection */
+#define NILFS_CF_NODE		0x0001	/* Collecting node blocks */
+#define NILFS_CF_IFILE_STARTED	0x0002	/* IFILE stage has started */
+#define NILFS_CF_SUFREED	0x0004	/* segment usages has been freed */
+#define NILFS_CF_HISTORY_MASK	(NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED)
+
+/* Operations depending on the construction mode and file type */
+struct nilfs_sc_operations {
+	int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *,
+			    struct inode *);
+	int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *,
+			    struct inode *);
+	int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *,
+			    struct inode *);
+	void (*write_data_binfo)(struct nilfs_sc_info *,
+				 struct nilfs_segsum_pointer *,
+				 union nilfs_binfo *);
+	void (*write_node_binfo)(struct nilfs_sc_info *,
+				 struct nilfs_segsum_pointer *,
+				 union nilfs_binfo *);
+};
+
+/*
+ * Other definitions
+ */
+static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
+static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
+static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
+static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
+
+#define nilfs_cnt32_gt(a, b)   \
+	(typecheck(__u32, a) && typecheck(__u32, b) && \
+	 ((__s32)(b) - (__s32)(a) < 0))
+#define nilfs_cnt32_ge(a, b)   \
+	(typecheck(__u32, a) && typecheck(__u32, b) && \
+	 ((__s32)(a) - (__s32)(b) >= 0))
+#define nilfs_cnt32_lt(a, b)  nilfs_cnt32_gt(b, a)
+#define nilfs_cnt32_le(a, b)  nilfs_cnt32_ge(b, a)
+
+static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
+{
+	struct nilfs_transaction_info *cur_ti = current->journal_info;
+	void *save = NULL;
+
+	if (cur_ti) {
+		if (cur_ti->ti_magic == NILFS_TI_MAGIC)
+			return ++cur_ti->ti_count;
+		else {
+			/*
+			 * If journal_info field is occupied by other FS,
+			 * it is saved and will be restored on
+			 * nilfs_transaction_commit().
+			 */
+			printk(KERN_WARNING
+			       "NILFS warning: journal info from a different "
+			       "FS\n");
+			save = current->journal_info;
+		}
+	}
+	if (!ti) {
+		ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
+		if (!ti)
+			return -ENOMEM;
+		ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
+	} else {
+		ti->ti_flags = 0;
+	}
+	ti->ti_count = 0;
+	ti->ti_save = save;
+	ti->ti_magic = NILFS_TI_MAGIC;
+	current->journal_info = ti;
+	return 0;
+}
+
+/**
+ * nilfs_transaction_begin - start indivisible file operations.
+ * @sb: super block
+ * @ti: nilfs_transaction_info
+ * @vacancy_check: flags for vacancy rate checks
+ *
+ * nilfs_transaction_begin() acquires a reader/writer semaphore, called
+ * the segment semaphore, to make a segment construction and write tasks
+ * exclusive.  The function is used with nilfs_transaction_commit() in pairs.
+ * The region enclosed by these two functions can be nested.  To avoid a
+ * deadlock, the semaphore is only acquired or released in the outermost call.
+ *
+ * This function allocates a nilfs_transaction_info struct to keep context
+ * information on it.  It is initialized and hooked onto the current task in
+ * the outermost call.  If a pre-allocated struct is given to @ti, it is used
+ * instead; otherwise a new struct is assigned from a slab.
+ *
+ * When @vacancy_check flag is set, this function will check the amount of
+ * free space, and will wait for the GC to reclaim disk space if low capacity.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-ENOSPC - No space left on device
+ */
+int nilfs_transaction_begin(struct super_block *sb,
+			    struct nilfs_transaction_info *ti,
+			    int vacancy_check)
+{
+	struct the_nilfs *nilfs;
+	int ret = nilfs_prepare_segment_lock(ti);
+
+	if (unlikely(ret < 0))
+		return ret;
+	if (ret > 0)
+		return 0;
+
+	sb_start_intwrite(sb);
+
+	nilfs = sb->s_fs_info;
+	down_read(&nilfs->ns_segctor_sem);
+	if (vacancy_check && nilfs_near_disk_full(nilfs)) {
+		up_read(&nilfs->ns_segctor_sem);
+		ret = -ENOSPC;
+		goto failed;
+	}
+	return 0;
+
+ failed:
+	ti = current->journal_info;
+	current->journal_info = ti->ti_save;
+	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
+		kmem_cache_free(nilfs_transaction_cachep, ti);
+	sb_end_intwrite(sb);
+	return ret;
+}
+
+/**
+ * nilfs_transaction_commit - commit indivisible file operations.
+ * @sb: super block
+ *
+ * nilfs_transaction_commit() releases the read semaphore which is
+ * acquired by nilfs_transaction_begin(). This is only performed
+ * in outermost call of this function.  If a commit flag is set,
+ * nilfs_transaction_commit() sets a timer to start the segment
+ * constructor.  If a sync flag is set, it starts construction
+ * directly.
+ */
+int nilfs_transaction_commit(struct super_block *sb)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	int err = 0;
+
+	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
+	ti->ti_flags |= NILFS_TI_COMMIT;
+	if (ti->ti_count > 0) {
+		ti->ti_count--;
+		return 0;
+	}
+	if (nilfs->ns_writer) {
+		struct nilfs_sc_info *sci = nilfs->ns_writer;
+
+		if (ti->ti_flags & NILFS_TI_COMMIT)
+			nilfs_segctor_start_timer(sci);
+		if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark)
+			nilfs_segctor_do_flush(sci, 0);
+	}
+	up_read(&nilfs->ns_segctor_sem);
+	current->journal_info = ti->ti_save;
+
+	if (ti->ti_flags & NILFS_TI_SYNC)
+		err = nilfs_construct_segment(sb);
+	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
+		kmem_cache_free(nilfs_transaction_cachep, ti);
+	sb_end_intwrite(sb);
+	return err;
+}
+
+void nilfs_transaction_abort(struct super_block *sb)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+	struct the_nilfs *nilfs = sb->s_fs_info;
+
+	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
+	if (ti->ti_count > 0) {
+		ti->ti_count--;
+		return;
+	}
+	up_read(&nilfs->ns_segctor_sem);
+
+	current->journal_info = ti->ti_save;
+	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
+		kmem_cache_free(nilfs_transaction_cachep, ti);
+	sb_end_intwrite(sb);
+}
+
+void nilfs_relax_pressure_in_lock(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
+
+	if (!sci || !sci->sc_flush_request)
+		return;
+
+	set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
+	up_read(&nilfs->ns_segctor_sem);
+
+	down_write(&nilfs->ns_segctor_sem);
+	if (sci->sc_flush_request &&
+	    test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) {
+		struct nilfs_transaction_info *ti = current->journal_info;
+
+		ti->ti_flags |= NILFS_TI_WRITER;
+		nilfs_segctor_do_immediate_flush(sci);
+		ti->ti_flags &= ~NILFS_TI_WRITER;
+	}
+	downgrade_write(&nilfs->ns_segctor_sem);
+}
+
+static void nilfs_transaction_lock(struct super_block *sb,
+				   struct nilfs_transaction_info *ti,
+				   int gcflag)
+{
+	struct nilfs_transaction_info *cur_ti = current->journal_info;
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
+
+	WARN_ON(cur_ti);
+	ti->ti_flags = NILFS_TI_WRITER;
+	ti->ti_count = 0;
+	ti->ti_save = cur_ti;
+	ti->ti_magic = NILFS_TI_MAGIC;
+	current->journal_info = ti;
+
+	for (;;) {
+		down_write(&nilfs->ns_segctor_sem);
+		if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
+			break;
+
+		nilfs_segctor_do_immediate_flush(sci);
+
+		up_write(&nilfs->ns_segctor_sem);
+		yield();
+	}
+	if (gcflag)
+		ti->ti_flags |= NILFS_TI_GC;
+}
+
+static void nilfs_transaction_unlock(struct super_block *sb)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+	struct the_nilfs *nilfs = sb->s_fs_info;
+
+	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
+	BUG_ON(ti->ti_count > 0);
+
+	up_write(&nilfs->ns_segctor_sem);
+	current->journal_info = ti->ti_save;
+}
+
+static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
+					    struct nilfs_segsum_pointer *ssp,
+					    unsigned bytes)
+{
+	struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
+	unsigned blocksize = sci->sc_super->s_blocksize;
+	void *p;
+
+	if (unlikely(ssp->offset + bytes > blocksize)) {
+		ssp->offset = 0;
+		BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh,
+					       &segbuf->sb_segsum_buffers));
+		ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh);
+	}
+	p = ssp->bh->b_data + ssp->offset;
+	ssp->offset += bytes;
+	return p;
+}
+
+/**
+ * nilfs_segctor_reset_segment_buffer - reset the current segment buffer
+ * @sci: nilfs_sc_info
+ */
+static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
+	struct buffer_head *sumbh;
+	unsigned sumbytes;
+	unsigned flags = 0;
+	int err;
+
+	if (nilfs_doing_gc())
+		flags = NILFS_SS_GC;
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
+	if (unlikely(err))
+		return err;
+
+	sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
+	sumbytes = segbuf->sb_sum.sumbytes;
+	sci->sc_finfo_ptr.bh = sumbh;  sci->sc_finfo_ptr.offset = sumbytes;
+	sci->sc_binfo_ptr.bh = sumbh;  sci->sc_binfo_ptr.offset = sumbytes;
+	sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
+	return 0;
+}
+
+static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
+{
+	sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
+	if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
+		return -E2BIG; /* The current segment is filled up
+				  (internal code) */
+	sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
+	return nilfs_segctor_reset_segment_buffer(sci);
+}
+
+static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
+	int err;
+
+	if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) {
+		err = nilfs_segctor_feed_segment(sci);
+		if (err)
+			return err;
+		segbuf = sci->sc_curseg;
+	}
+	err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
+	if (likely(!err))
+		segbuf->sb_sum.flags |= NILFS_SS_SR;
+	return err;
+}
+
+/*
+ * Functions for making segment summary and payloads
+ */
+static int nilfs_segctor_segsum_block_required(
+	struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
+	unsigned binfo_size)
+{
+	unsigned blocksize = sci->sc_super->s_blocksize;
+	/* Size of finfo and binfo is enough small against blocksize */
+
+	return ssp->offset + binfo_size +
+		(!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) >
+		blocksize;
+}
+
+static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
+				      struct inode *inode)
+{
+	sci->sc_curseg->sb_sum.nfinfo++;
+	sci->sc_binfo_ptr = sci->sc_finfo_ptr;
+	nilfs_segctor_map_segsum_entry(
+		sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
+
+	if (NILFS_I(inode)->i_root &&
+	    !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
+		set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
+	/* skip finfo */
+}
+
+static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
+				    struct inode *inode)
+{
+	struct nilfs_finfo *finfo;
+	struct nilfs_inode_info *ii;
+	struct nilfs_segment_buffer *segbuf;
+	__u64 cno;
+
+	if (sci->sc_blk_cnt == 0)
+		return;
+
+	ii = NILFS_I(inode);
+
+	if (test_bit(NILFS_I_GCINODE, &ii->i_state))
+		cno = ii->i_cno;
+	else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
+		cno = 0;
+	else
+		cno = sci->sc_cno;
+
+	finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
+						 sizeof(*finfo));
+	finfo->fi_ino = cpu_to_le64(inode->i_ino);
+	finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
+	finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
+	finfo->fi_cno = cpu_to_le64(cno);
+
+	segbuf = sci->sc_curseg;
+	segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
+		sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1);
+	sci->sc_finfo_ptr = sci->sc_binfo_ptr;
+	sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
+}
+
+static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
+					struct buffer_head *bh,
+					struct inode *inode,
+					unsigned binfo_size)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int required, err = 0;
+
+ retry:
+	segbuf = sci->sc_curseg;
+	required = nilfs_segctor_segsum_block_required(
+		sci, &sci->sc_binfo_ptr, binfo_size);
+	if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) {
+		nilfs_segctor_end_finfo(sci, inode);
+		err = nilfs_segctor_feed_segment(sci);
+		if (err)
+			return err;
+		goto retry;
+	}
+	if (unlikely(required)) {
+		err = nilfs_segbuf_extend_segsum(segbuf);
+		if (unlikely(err))
+			goto failed;
+	}
+	if (sci->sc_blk_cnt == 0)
+		nilfs_segctor_begin_finfo(sci, inode);
+
+	nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size);
+	/* Substitution to vblocknr is delayed until update_blocknr() */
+	nilfs_segbuf_add_file_buffer(segbuf, bh);
+	sci->sc_blk_cnt++;
+ failed:
+	return err;
+}
+
+/*
+ * Callback functions that enumerate, mark, and collect dirty blocks
+ */
+static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
+				   struct buffer_head *bh, struct inode *inode)
+{
+	int err;
+
+	err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
+	if (err < 0)
+		return err;
+
+	err = nilfs_segctor_add_file_block(sci, bh, inode,
+					   sizeof(struct nilfs_binfo_v));
+	if (!err)
+		sci->sc_datablk_cnt++;
+	return err;
+}
+
+static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
+				   struct buffer_head *bh,
+				   struct inode *inode)
+{
+	return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
+}
+
+static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
+				   struct buffer_head *bh,
+				   struct inode *inode)
+{
+	WARN_ON(!buffer_dirty(bh));
+	return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
+}
+
+static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci,
+					struct nilfs_segsum_pointer *ssp,
+					union nilfs_binfo *binfo)
+{
+	struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry(
+		sci, ssp, sizeof(*binfo_v));
+	*binfo_v = binfo->bi_v;
+}
+
+static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
+					struct nilfs_segsum_pointer *ssp,
+					union nilfs_binfo *binfo)
+{
+	__le64 *vblocknr = nilfs_segctor_map_segsum_entry(
+		sci, ssp, sizeof(*vblocknr));
+	*vblocknr = binfo->bi_v.bi_vblocknr;
+}
+
+static struct nilfs_sc_operations nilfs_sc_file_ops = {
+	.collect_data = nilfs_collect_file_data,
+	.collect_node = nilfs_collect_file_node,
+	.collect_bmap = nilfs_collect_file_bmap,
+	.write_data_binfo = nilfs_write_file_data_binfo,
+	.write_node_binfo = nilfs_write_file_node_binfo,
+};
+
+static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
+				  struct buffer_head *bh, struct inode *inode)
+{
+	int err;
+
+	err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
+	if (err < 0)
+		return err;
+
+	err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
+	if (!err)
+		sci->sc_datablk_cnt++;
+	return err;
+}
+
+static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci,
+				  struct buffer_head *bh, struct inode *inode)
+{
+	WARN_ON(!buffer_dirty(bh));
+	return nilfs_segctor_add_file_block(sci, bh, inode,
+					    sizeof(struct nilfs_binfo_dat));
+}
+
+static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci,
+				       struct nilfs_segsum_pointer *ssp,
+				       union nilfs_binfo *binfo)
+{
+	__le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp,
+							  sizeof(*blkoff));
+	*blkoff = binfo->bi_dat.bi_blkoff;
+}
+
+static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
+				       struct nilfs_segsum_pointer *ssp,
+				       union nilfs_binfo *binfo)
+{
+	struct nilfs_binfo_dat *binfo_dat =
+		nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat));
+	*binfo_dat = binfo->bi_dat;
+}
+
+static struct nilfs_sc_operations nilfs_sc_dat_ops = {
+	.collect_data = nilfs_collect_dat_data,
+	.collect_node = nilfs_collect_file_node,
+	.collect_bmap = nilfs_collect_dat_bmap,
+	.write_data_binfo = nilfs_write_dat_data_binfo,
+	.write_node_binfo = nilfs_write_dat_node_binfo,
+};
+
+static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+	.collect_data = nilfs_collect_file_data,
+	.collect_node = NULL,
+	.collect_bmap = NULL,
+	.write_data_binfo = nilfs_write_file_data_binfo,
+	.write_node_binfo = NULL,
+};
+
+static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
+					      struct list_head *listp,
+					      size_t nlimit,
+					      loff_t start, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	pgoff_t index = 0, last = ULONG_MAX;
+	size_t ndirties = 0;
+	int i;
+
+	if (unlikely(start != 0 || end != LLONG_MAX)) {
+		/*
+		 * A valid range is given for sync-ing data pages. The
+		 * range is rounded to per-page; extra dirty buffers
+		 * may be included if blocksize < pagesize.
+		 */
+		index = start >> PAGE_SHIFT;
+		last = end >> PAGE_SHIFT;
+	}
+	pagevec_init(&pvec, 0);
+ repeat:
+	if (unlikely(index > last) ||
+	    !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+				min_t(pgoff_t, last - index,
+				      PAGEVEC_SIZE - 1) + 1))
+		return ndirties;
+
+	for (i = 0; i < pagevec_count(&pvec); i++) {
+		struct buffer_head *bh, *head;
+		struct page *page = pvec.pages[i];
+
+		if (unlikely(page->index > last))
+			break;
+
+		lock_page(page);
+		if (!page_has_buffers(page))
+			create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+		unlock_page(page);
+
+		bh = head = page_buffers(page);
+		do {
+			if (!buffer_dirty(bh) || buffer_async_write(bh))
+				continue;
+			get_bh(bh);
+			list_add_tail(&bh->b_assoc_buffers, listp);
+			ndirties++;
+			if (unlikely(ndirties >= nlimit)) {
+				pagevec_release(&pvec);
+				cond_resched();
+				return ndirties;
+			}
+		} while (bh = bh->b_this_page, bh != head);
+	}
+	pagevec_release(&pvec);
+	cond_resched();
+	goto repeat;
+}
+
+static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
+					    struct list_head *listp)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct address_space *mapping = &ii->i_btnode_cache;
+	struct pagevec pvec;
+	struct buffer_head *bh, *head;
+	unsigned int i;
+	pgoff_t index = 0;
+
+	pagevec_init(&pvec, 0);
+
+	while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+				  PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			bh = head = page_buffers(pvec.pages[i]);
+			do {
+				if (buffer_dirty(bh) &&
+						!buffer_async_write(bh)) {
+					get_bh(bh);
+					list_add_tail(&bh->b_assoc_buffers,
+						      listp);
+				}
+				bh = bh->b_this_page;
+			} while (bh != head);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+static void nilfs_dispose_list(struct the_nilfs *nilfs,
+			       struct list_head *head, int force)
+{
+	struct nilfs_inode_info *ii, *n;
+	struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
+	unsigned nv = 0;
+
+	while (!list_empty(head)) {
+		spin_lock(&nilfs->ns_inode_lock);
+		list_for_each_entry_safe(ii, n, head, i_dirty) {
+			list_del_init(&ii->i_dirty);
+			if (force) {
+				if (unlikely(ii->i_bh)) {
+					brelse(ii->i_bh);
+					ii->i_bh = NULL;
+				}
+			} else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
+				set_bit(NILFS_I_QUEUED, &ii->i_state);
+				list_add_tail(&ii->i_dirty,
+					      &nilfs->ns_dirty_files);
+				continue;
+			}
+			ivec[nv++] = ii;
+			if (nv == SC_N_INODEVEC)
+				break;
+		}
+		spin_unlock(&nilfs->ns_inode_lock);
+
+		for (pii = ivec; nv > 0; pii++, nv--)
+			iput(&(*pii)->vfs_inode);
+	}
+}
+
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+	struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+						 sc_iput_work);
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+	nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
+
+static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
+				     struct nilfs_root *root)
+{
+	int ret = 0;
+
+	if (nilfs_mdt_fetch_dirty(root->ifile))
+		ret++;
+	if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
+		ret++;
+	if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
+		ret++;
+	if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
+		ret++;
+	return ret;
+}
+
+static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
+{
+	return list_empty(&sci->sc_dirty_files) &&
+		!test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
+		sci->sc_nfreesegs == 0 &&
+		(!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
+}
+
+static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
+{
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	int ret = 0;
+
+	if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
+		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
+
+	spin_lock(&nilfs->ns_inode_lock);
+	if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci))
+		ret++;
+
+	spin_unlock(&nilfs->ns_inode_lock);
+	return ret;
+}
+
+static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
+{
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+	nilfs_mdt_clear_dirty(sci->sc_root->ifile);
+	nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
+	nilfs_mdt_clear_dirty(nilfs->ns_sufile);
+	nilfs_mdt_clear_dirty(nilfs->ns_dat);
+}
+
+static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
+{
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	struct buffer_head *bh_cp;
+	struct nilfs_checkpoint *raw_cp;
+	int err;
+
+	/* XXX: this interface will be changed */
+	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
+					  &raw_cp, &bh_cp);
+	if (likely(!err)) {
+		/* The following code is duplicated with cpfile.  But, it is
+		   needed to collect the checkpoint even if it was not newly
+		   created */
+		mark_buffer_dirty(bh_cp);
+		nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
+		nilfs_cpfile_put_checkpoint(
+			nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
+	} else
+		WARN_ON(err == -EINVAL || err == -ENOENT);
+
+	return err;
+}
+
+static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
+{
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	struct buffer_head *bh_cp;
+	struct nilfs_checkpoint *raw_cp;
+	int err;
+
+	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
+					  &raw_cp, &bh_cp);
+	if (unlikely(err)) {
+		WARN_ON(err == -EINVAL || err == -ENOENT);
+		goto failed_ibh;
+	}
+	raw_cp->cp_snapshot_list.ssl_next = 0;
+	raw_cp->cp_snapshot_list.ssl_prev = 0;
+	raw_cp->cp_inodes_count =
+		cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
+	raw_cp->cp_blocks_count =
+		cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
+	raw_cp->cp_nblk_inc =
+		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
+	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
+	raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
+
+	if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
+		nilfs_checkpoint_clear_minor(raw_cp);
+	else
+		nilfs_checkpoint_set_minor(raw_cp);
+
+	nilfs_write_inode_common(sci->sc_root->ifile,
+				 &raw_cp->cp_ifile_inode, 1);
+	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
+	return 0;
+
+ failed_ibh:
+	return err;
+}
+
+static void nilfs_fill_in_file_bmap(struct inode *ifile,
+				    struct nilfs_inode_info *ii)
+
+{
+	struct buffer_head *ibh;
+	struct nilfs_inode *raw_inode;
+
+	if (test_bit(NILFS_I_BMAP, &ii->i_state)) {
+		ibh = ii->i_bh;
+		BUG_ON(!ibh);
+		raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
+						  ibh);
+		nilfs_bmap_write(ii->i_bmap, raw_inode);
+		nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
+	}
+}
+
+static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
+{
+	struct nilfs_inode_info *ii;
+
+	list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
+		nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
+		set_bit(NILFS_I_COLLECTED, &ii->i_state);
+	}
+}
+
+static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs)
+{
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *raw_sr;
+	unsigned isz, srsz;
+
+	bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+	isz = nilfs->ns_inode_size;
+	srsz = NILFS_SR_BYTES(isz);
+
+	raw_sr->sr_bytes = cpu_to_le16(srsz);
+	raw_sr->sr_nongc_ctime
+		= cpu_to_le64(nilfs_doing_gc() ?
+			      nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
+	raw_sr->sr_flags = 0;
+
+	nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
+				 NILFS_SR_DAT_OFFSET(isz), 1);
+	nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
+				 NILFS_SR_CPFILE_OFFSET(isz), 1);
+	nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
+				 NILFS_SR_SUFILE_OFFSET(isz), 1);
+	memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
+}
+
+static void nilfs_redirty_inodes(struct list_head *head)
+{
+	struct nilfs_inode_info *ii;
+
+	list_for_each_entry(ii, head, i_dirty) {
+		if (test_bit(NILFS_I_COLLECTED, &ii->i_state))
+			clear_bit(NILFS_I_COLLECTED, &ii->i_state);
+	}
+}
+
+static void nilfs_drop_collected_inodes(struct list_head *head)
+{
+	struct nilfs_inode_info *ii;
+
+	list_for_each_entry(ii, head, i_dirty) {
+		if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
+			continue;
+
+		clear_bit(NILFS_I_INODE_SYNC, &ii->i_state);
+		set_bit(NILFS_I_UPDATED, &ii->i_state);
+	}
+}
+
+static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
+				       struct inode *inode,
+				       struct list_head *listp,
+				       int (*collect)(struct nilfs_sc_info *,
+						      struct buffer_head *,
+						      struct inode *))
+{
+	struct buffer_head *bh, *n;
+	int err = 0;
+
+	if (collect) {
+		list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
+			list_del_init(&bh->b_assoc_buffers);
+			err = collect(sci, bh, inode);
+			brelse(bh);
+			if (unlikely(err))
+				goto dispose_buffers;
+		}
+		return 0;
+	}
+
+ dispose_buffers:
+	while (!list_empty(listp)) {
+		bh = list_first_entry(listp, struct buffer_head,
+				      b_assoc_buffers);
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh);
+	}
+	return err;
+}
+
+static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
+{
+	/* Remaining number of blocks within segment buffer */
+	return sci->sc_segbuf_nblocks -
+		(sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
+}
+
+static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
+				   struct inode *inode,
+				   struct nilfs_sc_operations *sc_ops)
+{
+	LIST_HEAD(data_buffers);
+	LIST_HEAD(node_buffers);
+	int err;
+
+	if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
+		size_t n, rest = nilfs_segctor_buffer_rest(sci);
+
+		n = nilfs_lookup_dirty_data_buffers(
+			inode, &data_buffers, rest + 1, 0, LLONG_MAX);
+		if (n > rest) {
+			err = nilfs_segctor_apply_buffers(
+				sci, inode, &data_buffers,
+				sc_ops->collect_data);
+			BUG_ON(!err); /* always receive -E2BIG or true error */
+			goto break_or_fail;
+		}
+	}
+	nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
+
+	if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
+		err = nilfs_segctor_apply_buffers(
+			sci, inode, &data_buffers, sc_ops->collect_data);
+		if (unlikely(err)) {
+			/* dispose node list */
+			nilfs_segctor_apply_buffers(
+				sci, inode, &node_buffers, NULL);
+			goto break_or_fail;
+		}
+		sci->sc_stage.flags |= NILFS_CF_NODE;
+	}
+	/* Collect node */
+	err = nilfs_segctor_apply_buffers(
+		sci, inode, &node_buffers, sc_ops->collect_node);
+	if (unlikely(err))
+		goto break_or_fail;
+
+	nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
+	err = nilfs_segctor_apply_buffers(
+		sci, inode, &node_buffers, sc_ops->collect_bmap);
+	if (unlikely(err))
+		goto break_or_fail;
+
+	nilfs_segctor_end_finfo(sci, inode);
+	sci->sc_stage.flags &= ~NILFS_CF_NODE;
+
+ break_or_fail:
+	return err;
+}
+
+static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
+					 struct inode *inode)
+{
+	LIST_HEAD(data_buffers);
+	size_t n, rest = nilfs_segctor_buffer_rest(sci);
+	int err;
+
+	n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
+					    sci->sc_dsync_start,
+					    sci->sc_dsync_end);
+
+	err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
+					  nilfs_collect_file_data);
+	if (!err) {
+		nilfs_segctor_end_finfo(sci, inode);
+		BUG_ON(n > rest);
+		/* always receive -E2BIG or true error if n > rest */
+	}
+	return err;
+}
+
+static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
+{
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	struct list_head *head;
+	struct nilfs_inode_info *ii;
+	size_t ndone;
+	int err = 0;
+
+	switch (sci->sc_stage.scnt) {
+	case NILFS_ST_INIT:
+		/* Pre-processes */
+		sci->sc_stage.flags = 0;
+
+		if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
+			sci->sc_nblk_inc = 0;
+			sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
+			if (mode == SC_LSEG_DSYNC) {
+				sci->sc_stage.scnt = NILFS_ST_DSYNC;
+				goto dsync_mode;
+			}
+		}
+
+		sci->sc_stage.dirty_file_ptr = NULL;
+		sci->sc_stage.gc_inode_ptr = NULL;
+		if (mode == SC_FLUSH_DAT) {
+			sci->sc_stage.scnt = NILFS_ST_DAT;
+			goto dat_stage;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_GC:
+		if (nilfs_doing_gc()) {
+			head = &sci->sc_gc_inodes;
+			ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
+						head, i_dirty);
+			list_for_each_entry_continue(ii, head, i_dirty) {
+				err = nilfs_segctor_scan_file(
+					sci, &ii->vfs_inode,
+					&nilfs_sc_file_ops);
+				if (unlikely(err)) {
+					sci->sc_stage.gc_inode_ptr = list_entry(
+						ii->i_dirty.prev,
+						struct nilfs_inode_info,
+						i_dirty);
+					goto break_or_fail;
+				}
+				set_bit(NILFS_I_COLLECTED, &ii->i_state);
+			}
+			sci->sc_stage.gc_inode_ptr = NULL;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_FILE:
+		head = &sci->sc_dirty_files;
+		ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
+					i_dirty);
+		list_for_each_entry_continue(ii, head, i_dirty) {
+			clear_bit(NILFS_I_DIRTY, &ii->i_state);
+
+			err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
+						      &nilfs_sc_file_ops);
+			if (unlikely(err)) {
+				sci->sc_stage.dirty_file_ptr =
+					list_entry(ii->i_dirty.prev,
+						   struct nilfs_inode_info,
+						   i_dirty);
+				goto break_or_fail;
+			}
+			/* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
+			/* XXX: required ? */
+		}
+		sci->sc_stage.dirty_file_ptr = NULL;
+		if (mode == SC_FLUSH_FILE) {
+			sci->sc_stage.scnt = NILFS_ST_DONE;
+			return 0;
+		}
+		sci->sc_stage.scnt++;
+		sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
+		/* Fall through */
+	case NILFS_ST_IFILE:
+		err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;
+		/* Creating a checkpoint */
+		err = nilfs_segctor_create_checkpoint(sci);
+		if (unlikely(err))
+			break;
+		/* Fall through */
+	case NILFS_ST_CPFILE:
+		err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_SUFILE:
+		err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
+					 sci->sc_nfreesegs, &ndone);
+		if (unlikely(err)) {
+			nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+						  sci->sc_freesegs, ndone,
+						  NULL);
+			break;
+		}
+		sci->sc_stage.flags |= NILFS_CF_SUFREED;
+
+		err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_DAT:
+ dat_stage:
+		err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
+					      &nilfs_sc_dat_ops);
+		if (unlikely(err))
+			break;
+		if (mode == SC_FLUSH_DAT) {
+			sci->sc_stage.scnt = NILFS_ST_DONE;
+			return 0;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_SR:
+		if (mode == SC_LSEG_SR) {
+			/* Appending a super root */
+			err = nilfs_segctor_add_super_root(sci);
+			if (unlikely(err))
+				break;
+		}
+		/* End of a logical segment */
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
+		sci->sc_stage.scnt = NILFS_ST_DONE;
+		return 0;
+	case NILFS_ST_DSYNC:
+ dsync_mode:
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
+		ii = sci->sc_dsync_inode;
+		if (!test_bit(NILFS_I_BUSY, &ii->i_state))
+			break;
+
+		err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
+		if (unlikely(err))
+			break;
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
+		sci->sc_stage.scnt = NILFS_ST_DONE;
+		return 0;
+	case NILFS_ST_DONE:
+		return 0;
+	default:
+		BUG();
+	}
+
+ break_or_fail:
+	return err;
+}
+
+/**
+ * nilfs_segctor_begin_construction - setup segment buffer to make a new log
+ * @sci: nilfs_sc_info
+ * @nilfs: nilfs object
+ */
+static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
+					    struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf, *prev;
+	__u64 nextnum;
+	int err, alloc = 0;
+
+	segbuf = nilfs_segbuf_new(sci->sc_super);
+	if (unlikely(!segbuf))
+		return -ENOMEM;
+
+	if (list_empty(&sci->sc_write_logs)) {
+		nilfs_segbuf_map(segbuf, nilfs->ns_segnum,
+				 nilfs->ns_pseg_offset, nilfs);
+		if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+			nilfs_shift_to_next_segment(nilfs);
+			nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
+		}
+
+		segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
+		nextnum = nilfs->ns_nextnum;
+
+		if (nilfs->ns_segnum == nilfs->ns_nextnum)
+			/* Start from the head of a new full segment */
+			alloc++;
+	} else {
+		/* Continue logs */
+		prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
+		nilfs_segbuf_map_cont(segbuf, prev);
+		segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq;
+		nextnum = prev->sb_nextnum;
+
+		if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+			nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
+			segbuf->sb_sum.seg_seq++;
+			alloc++;
+		}
+	}
+
+	err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum);
+	if (err)
+		goto failed;
+
+	if (alloc) {
+		err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
+		if (err)
+			goto failed;
+	}
+	nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
+
+	BUG_ON(!list_empty(&sci->sc_segbufs));
+	list_add_tail(&segbuf->sb_list, &sci->sc_segbufs);
+	sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
+	return 0;
+
+ failed:
+	nilfs_segbuf_free(segbuf);
+	return err;
+}
+
+static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
+					 struct the_nilfs *nilfs, int nadd)
+{
+	struct nilfs_segment_buffer *segbuf, *prev;
+	struct inode *sufile = nilfs->ns_sufile;
+	__u64 nextnextnum;
+	LIST_HEAD(list);
+	int err, ret, i;
+
+	prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
+	/*
+	 * Since the segment specified with nextnum might be allocated during
+	 * the previous construction, the buffer including its segusage may
+	 * not be dirty.  The following call ensures that the buffer is dirty
+	 * and will pin the buffer on memory until the sufile is written.
+	 */
+	err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum);
+	if (unlikely(err))
+		return err;
+
+	for (i = 0; i < nadd; i++) {
+		/* extend segment info */
+		err = -ENOMEM;
+		segbuf = nilfs_segbuf_new(sci->sc_super);
+		if (unlikely(!segbuf))
+			goto failed;
+
+		/* map this buffer to region of segment on-disk */
+		nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
+		sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
+
+		/* allocate the next next full segment */
+		err = nilfs_sufile_alloc(sufile, &nextnextnum);
+		if (unlikely(err))
+			goto failed_segbuf;
+
+		segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
+		nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
+
+		list_add_tail(&segbuf->sb_list, &list);
+		prev = segbuf;
+	}
+	list_splice_tail(&list, &sci->sc_segbufs);
+	return 0;
+
+ failed_segbuf:
+	nilfs_segbuf_free(segbuf);
+ failed:
+	list_for_each_entry(segbuf, &list, sb_list) {
+		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+		WARN_ON(ret); /* never fails */
+	}
+	nilfs_destroy_logs(&list);
+	return err;
+}
+
+static void nilfs_free_incomplete_logs(struct list_head *logs,
+				       struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf, *prev;
+	struct inode *sufile = nilfs->ns_sufile;
+	int ret;
+
+	segbuf = NILFS_FIRST_SEGBUF(logs);
+	if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
+		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+		WARN_ON(ret); /* never fails */
+	}
+	if (atomic_read(&segbuf->sb_err)) {
+		/* Case 1: The first segment failed */
+		if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
+			/* Case 1a:  Partial segment appended into an existing
+			   segment */
+			nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
+						segbuf->sb_fseg_end);
+		else /* Case 1b:  New full segment */
+			set_nilfs_discontinued(nilfs);
+	}
+
+	prev = segbuf;
+	list_for_each_entry_continue(segbuf, logs, sb_list) {
+		if (prev->sb_nextnum != segbuf->sb_nextnum) {
+			ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+			WARN_ON(ret); /* never fails */
+		}
+		if (atomic_read(&segbuf->sb_err) &&
+		    segbuf->sb_segnum != nilfs->ns_nextnum)
+			/* Case 2: extended segment (!= next) failed */
+			nilfs_sufile_set_error(sufile, segbuf->sb_segnum);
+		prev = segbuf;
+	}
+}
+
+static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
+					  struct inode *sufile)
+{
+	struct nilfs_segment_buffer *segbuf;
+	unsigned long live_blocks;
+	int ret;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		live_blocks = segbuf->sb_sum.nblocks +
+			(segbuf->sb_pseg_start - segbuf->sb_fseg_start);
+		ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+						     live_blocks,
+						     sci->sc_seg_ctime);
+		WARN_ON(ret); /* always succeed because the segusage is dirty */
+	}
+}
+
+static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int ret;
+
+	segbuf = NILFS_FIRST_SEGBUF(logs);
+	ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+					     segbuf->sb_pseg_start -
+					     segbuf->sb_fseg_start, 0);
+	WARN_ON(ret); /* always succeed because the segusage is dirty */
+
+	list_for_each_entry_continue(segbuf, logs, sb_list) {
+		ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+						     0, 0);
+		WARN_ON(ret); /* always succeed */
+	}
+}
+
+static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
+					    struct nilfs_segment_buffer *last,
+					    struct inode *sufile)
+{
+	struct nilfs_segment_buffer *segbuf = last;
+	int ret;
+
+	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
+		sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
+		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+		WARN_ON(ret);
+	}
+	nilfs_truncate_logs(&sci->sc_segbufs, last);
+}
+
+
+static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
+				 struct the_nilfs *nilfs, int mode)
+{
+	struct nilfs_cstage prev_stage = sci->sc_stage;
+	int err, nadd = 1;
+
+	/* Collection retry loop */
+	for (;;) {
+		sci->sc_nblk_this_inc = 0;
+		sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+
+		err = nilfs_segctor_reset_segment_buffer(sci);
+		if (unlikely(err))
+			goto failed;
+
+		err = nilfs_segctor_collect_blocks(sci, mode);
+		sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
+		if (!err)
+			break;
+
+		if (unlikely(err != -E2BIG))
+			goto failed;
+
+		/* The current segment is filled up */
+		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+			break;
+
+		nilfs_clear_logs(&sci->sc_segbufs);
+
+		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+							sci->sc_freesegs,
+							sci->sc_nfreesegs,
+							NULL);
+			WARN_ON(err); /* do not happen */
+			sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
+		}
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
+		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
+		sci->sc_stage = prev_stage;
+	}
+	nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
+	return 0;
+
+ failed:
+	return err;
+}
+
+static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
+				      struct buffer_head *new_bh)
+{
+	BUG_ON(!list_empty(&new_bh->b_assoc_buffers));
+
+	list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers);
+	/* The caller must release old_bh */
+}
+
+static int
+nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
+				     struct nilfs_segment_buffer *segbuf,
+				     int mode)
+{
+	struct inode *inode = NULL;
+	sector_t blocknr;
+	unsigned long nfinfo = segbuf->sb_sum.nfinfo;
+	unsigned long nblocks = 0, ndatablk = 0;
+	struct nilfs_sc_operations *sc_op = NULL;
+	struct nilfs_segsum_pointer ssp;
+	struct nilfs_finfo *finfo = NULL;
+	union nilfs_binfo binfo;
+	struct buffer_head *bh, *bh_org;
+	ino_t ino = 0;
+	int err = 0;
+
+	if (!nfinfo)
+		goto out;
+
+	blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
+	ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
+	ssp.offset = sizeof(struct nilfs_segment_summary);
+
+	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
+		if (bh == segbuf->sb_super_root)
+			break;
+		if (!finfo) {
+			finfo =	nilfs_segctor_map_segsum_entry(
+				sci, &ssp, sizeof(*finfo));
+			ino = le64_to_cpu(finfo->fi_ino);
+			nblocks = le32_to_cpu(finfo->fi_nblocks);
+			ndatablk = le32_to_cpu(finfo->fi_ndatablk);
+
+			inode = bh->b_page->mapping->host;
+
+			if (mode == SC_LSEG_DSYNC)
+				sc_op = &nilfs_sc_dsync_ops;
+			else if (ino == NILFS_DAT_INO)
+				sc_op = &nilfs_sc_dat_ops;
+			else /* file blocks */
+				sc_op = &nilfs_sc_file_ops;
+		}
+		bh_org = bh;
+		get_bh(bh_org);
+		err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
+					&binfo);
+		if (bh != bh_org)
+			nilfs_list_replace_buffer(bh_org, bh);
+		brelse(bh_org);
+		if (unlikely(err))
+			goto failed_bmap;
+
+		if (ndatablk > 0)
+			sc_op->write_data_binfo(sci, &ssp, &binfo);
+		else
+			sc_op->write_node_binfo(sci, &ssp, &binfo);
+
+		blocknr++;
+		if (--nblocks == 0) {
+			finfo = NULL;
+			if (--nfinfo == 0)
+				break;
+		} else if (ndatablk > 0)
+			ndatablk--;
+	}
+ out:
+	return 0;
+
+ failed_bmap:
+	return err;
+}
+
+static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int err;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode);
+		if (unlikely(err))
+			return err;
+		nilfs_segbuf_fill_in_segsum(segbuf);
+	}
+	return 0;
+}
+
+static void nilfs_begin_page_io(struct page *page)
+{
+	if (!page || PageWriteback(page))
+		/* For split b-tree node pages, this function may be called
+		   twice.  We ignore the 2nd or later calls by this check. */
+		return;
+
+	lock_page(page);
+	clear_page_dirty_for_io(page);
+	set_page_writeback(page);
+	unlock_page(page);
+}
+
+static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		struct buffer_head *bh;
+
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			if (bh->b_page != bd_page) {
+				if (bd_page) {
+					lock_page(bd_page);
+					clear_page_dirty_for_io(bd_page);
+					set_page_writeback(bd_page);
+					unlock_page(bd_page);
+				}
+				bd_page = bh->b_page;
+			}
+		}
+
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			set_buffer_async_write(bh);
+			if (bh == segbuf->sb_super_root) {
+				if (bh->b_page != bd_page) {
+					lock_page(bd_page);
+					clear_page_dirty_for_io(bd_page);
+					set_page_writeback(bd_page);
+					unlock_page(bd_page);
+					bd_page = bh->b_page;
+				}
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				nilfs_begin_page_io(fs_page);
+				fs_page = bh->b_page;
+			}
+		}
+	}
+	if (bd_page) {
+		lock_page(bd_page);
+		clear_page_dirty_for_io(bd_page);
+		set_page_writeback(bd_page);
+		unlock_page(bd_page);
+	}
+	nilfs_begin_page_io(fs_page);
+}
+
+static int nilfs_segctor_write(struct nilfs_sc_info *sci,
+			       struct the_nilfs *nilfs)
+{
+	int ret;
+
+	ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
+	list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
+	return ret;
+}
+
+static void nilfs_end_page_io(struct page *page, int err)
+{
+	if (!page)
+		return;
+
+	if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
+		/*
+		 * For b-tree node pages, this function may be called twice
+		 * or more because they might be split in a segment.
+		 */
+		if (PageDirty(page)) {
+			/*
+			 * For pages holding split b-tree node buffers, dirty
+			 * flag on the buffers may be cleared discretely.
+			 * In that case, the page is once redirtied for
+			 * remaining buffers, and it must be cancelled if
+			 * all the buffers get cleaned later.
+			 */
+			lock_page(page);
+			if (nilfs_page_buffers_clean(page))
+				__nilfs_clear_page_dirty(page);
+			unlock_page(page);
+		}
+		return;
+	}
+
+	if (!err) {
+		if (!nilfs_page_buffers_clean(page))
+			__set_page_dirty_nobuffers(page);
+		ClearPageError(page);
+	} else {
+		__set_page_dirty_nobuffers(page);
+		SetPageError(page);
+	}
+
+	end_page_writeback(page);
+}
+
+static void nilfs_abort_logs(struct list_head *logs, int err)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+	struct buffer_head *bh;
+
+	if (list_empty(logs))
+		return;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			if (bh->b_page != bd_page) {
+				if (bd_page)
+					end_page_writeback(bd_page);
+				bd_page = bh->b_page;
+			}
+		}
+
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			clear_buffer_async_write(bh);
+			if (bh == segbuf->sb_super_root) {
+				if (bh->b_page != bd_page) {
+					end_page_writeback(bd_page);
+					bd_page = bh->b_page;
+				}
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				nilfs_end_page_io(fs_page, err);
+				fs_page = bh->b_page;
+			}
+		}
+	}
+	if (bd_page)
+		end_page_writeback(bd_page);
+
+	nilfs_end_page_io(fs_page, err);
+}
+
+static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs, int err)
+{
+	LIST_HEAD(logs);
+	int ret;
+
+	list_splice_tail_init(&sci->sc_write_logs, &logs);
+	ret = nilfs_wait_on_logs(&logs);
+	nilfs_abort_logs(&logs, ret ? : err);
+
+	list_splice_tail_init(&sci->sc_segbufs, &logs);
+	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
+	nilfs_free_incomplete_logs(&logs, nilfs);
+
+	if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+		ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+						sci->sc_freesegs,
+						sci->sc_nfreesegs,
+						NULL);
+		WARN_ON(ret); /* do not happen */
+	}
+
+	nilfs_destroy_logs(&logs);
+}
+
+static void nilfs_set_next_segment(struct the_nilfs *nilfs,
+				   struct nilfs_segment_buffer *segbuf)
+{
+	nilfs->ns_segnum = segbuf->sb_segnum;
+	nilfs->ns_nextnum = segbuf->sb_nextnum;
+	nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start
+		+ segbuf->sb_sum.nblocks;
+	nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
+	nilfs->ns_ctime = segbuf->sb_sum.ctime;
+}
+
+static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	int update_sr = false;
+
+	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
+		struct buffer_head *bh;
+
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			set_buffer_uptodate(bh);
+			clear_buffer_dirty(bh);
+			if (bh->b_page != bd_page) {
+				if (bd_page)
+					end_page_writeback(bd_page);
+				bd_page = bh->b_page;
+			}
+		}
+		/*
+		 * We assume that the buffers which belong to the same page
+		 * continue over the buffer list.
+		 * Under this assumption, the last BHs of pages is
+		 * identifiable by the discontinuity of bh->b_page
+		 * (page != fs_page).
+		 *
+		 * For B-tree node blocks, however, this assumption is not
+		 * guaranteed.  The cleanup code of B-tree node pages needs
+		 * special care.
+		 */
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			const unsigned long set_bits = (1 << BH_Uptodate);
+			const unsigned long clear_bits =
+				(1 << BH_Dirty | 1 << BH_Async_Write |
+				 1 << BH_Delay | 1 << BH_NILFS_Volatile |
+				 1 << BH_NILFS_Redirected);
+
+			set_mask_bits(&bh->b_state, clear_bits, set_bits);
+			if (bh == segbuf->sb_super_root) {
+				if (bh->b_page != bd_page) {
+					end_page_writeback(bd_page);
+					bd_page = bh->b_page;
+				}
+				update_sr = true;
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				nilfs_end_page_io(fs_page, 0);
+				fs_page = bh->b_page;
+			}
+		}
+
+		if (!nilfs_segbuf_simplex(segbuf)) {
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) {
+				set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
+				sci->sc_lseg_stime = jiffies;
+			}
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGEND)
+				clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
+		}
+	}
+	/*
+	 * Since pages may continue over multiple segment buffers,
+	 * end of the last page must be checked outside of the loop.
+	 */
+	if (bd_page)
+		end_page_writeback(bd_page);
+
+	nilfs_end_page_io(fs_page, 0);
+
+	nilfs_drop_collected_inodes(&sci->sc_dirty_files);
+
+	if (nilfs_doing_gc())
+		nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
+	else
+		nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
+
+	sci->sc_nblk_inc += sci->sc_nblk_this_inc;
+
+	segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
+	nilfs_set_next_segment(nilfs, segbuf);
+
+	if (update_sr) {
+		nilfs->ns_flushed_device = 0;
+		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
+				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
+
+		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
+		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
+		set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+		nilfs_segctor_clear_metadata_dirty(sci);
+	} else
+		clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+}
+
+static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
+{
+	int ret;
+
+	ret = nilfs_wait_on_logs(&sci->sc_write_logs);
+	if (!ret) {
+		nilfs_segctor_complete_write(sci);
+		nilfs_destroy_logs(&sci->sc_write_logs);
+	}
+	return ret;
+}
+
+static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs)
+{
+	struct nilfs_inode_info *ii, *n;
+	struct inode *ifile = sci->sc_root->ifile;
+
+	spin_lock(&nilfs->ns_inode_lock);
+ retry:
+	list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) {
+		if (!ii->i_bh) {
+			struct buffer_head *ibh;
+			int err;
+
+			spin_unlock(&nilfs->ns_inode_lock);
+			err = nilfs_ifile_get_inode_block(
+				ifile, ii->vfs_inode.i_ino, &ibh);
+			if (unlikely(err)) {
+				nilfs_warning(sci->sc_super, __func__,
+					      "failed to get inode block.\n");
+				return err;
+			}
+			mark_buffer_dirty(ibh);
+			nilfs_mdt_mark_dirty(ifile);
+			spin_lock(&nilfs->ns_inode_lock);
+			if (likely(!ii->i_bh))
+				ii->i_bh = ibh;
+			else
+				brelse(ibh);
+			goto retry;
+		}
+
+		clear_bit(NILFS_I_QUEUED, &ii->i_state);
+		set_bit(NILFS_I_BUSY, &ii->i_state);
+		list_move_tail(&ii->i_dirty, &sci->sc_dirty_files);
+	}
+	spin_unlock(&nilfs->ns_inode_lock);
+
+	return 0;
+}
+
+static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs)
+{
+	struct nilfs_inode_info *ii, *n;
+	int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
+	int defer_iput = false;
+
+	spin_lock(&nilfs->ns_inode_lock);
+	list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
+		if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
+		    test_bit(NILFS_I_DIRTY, &ii->i_state))
+			continue;
+
+		clear_bit(NILFS_I_BUSY, &ii->i_state);
+		brelse(ii->i_bh);
+		ii->i_bh = NULL;
+		list_del_init(&ii->i_dirty);
+		if (!ii->vfs_inode.i_nlink || during_mount) {
+			/*
+			 * Defer calling iput() to avoid deadlocks if
+			 * i_nlink == 0 or mount is not yet finished.
+			 */
+			list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
+			defer_iput = true;
+		} else {
+			spin_unlock(&nilfs->ns_inode_lock);
+			iput(&ii->vfs_inode);
+			spin_lock(&nilfs->ns_inode_lock);
+		}
+	}
+	spin_unlock(&nilfs->ns_inode_lock);
+
+	if (defer_iput)
+		schedule_work(&sci->sc_iput_work);
+}
+
+/*
+ * Main procedure of segment constructor
+ */
+static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
+{
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	int err;
+
+	sci->sc_stage.scnt = NILFS_ST_INIT;
+	sci->sc_cno = nilfs->ns_cno;
+
+	err = nilfs_segctor_collect_dirty_files(sci, nilfs);
+	if (unlikely(err))
+		goto out;
+
+	if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
+		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
+
+	if (nilfs_segctor_clean(sci))
+		goto out;
+
+	do {
+		sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK;
+
+		err = nilfs_segctor_begin_construction(sci, nilfs);
+		if (unlikely(err))
+			goto out;
+
+		/* Update time stamp */
+		sci->sc_seg_ctime = get_seconds();
+
+		err = nilfs_segctor_collect(sci, nilfs, mode);
+		if (unlikely(err))
+			goto failed;
+
+		/* Avoid empty segment */
+		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+		    nilfs_segbuf_empty(sci->sc_curseg)) {
+			nilfs_segctor_abort_construction(sci, nilfs, 1);
+			goto out;
+		}
+
+		err = nilfs_segctor_assign(sci, mode);
+		if (unlikely(err))
+			goto failed;
+
+		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
+			nilfs_segctor_fill_in_file_bmap(sci);
+
+		if (mode == SC_LSEG_SR &&
+		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
+			err = nilfs_segctor_fill_in_checkpoint(sci);
+			if (unlikely(err))
+				goto failed_to_write;
+
+			nilfs_segctor_fill_in_super_root(sci, nilfs);
+		}
+		nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
+
+		/* Write partial segments */
+		nilfs_segctor_prepare_write(sci);
+
+		nilfs_add_checksums_on_logs(&sci->sc_segbufs,
+					    nilfs->ns_crc_seed);
+
+		err = nilfs_segctor_write(sci, nilfs);
+		if (unlikely(err))
+			goto failed_to_write;
+
+		if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+		    nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
+			/*
+			 * At this point, we avoid double buffering
+			 * for blocksize < pagesize because page dirty
+			 * flag is turned off during write and dirty
+			 * buffers are not properly collected for
+			 * pages crossing over segments.
+			 */
+			err = nilfs_segctor_wait(sci);
+			if (err)
+				goto failed_to_write;
+		}
+	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
+
+ out:
+	nilfs_segctor_drop_written_files(sci, nilfs);
+	return err;
+
+ failed_to_write:
+	if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
+		nilfs_redirty_inodes(&sci->sc_dirty_files);
+
+ failed:
+	if (nilfs_doing_gc())
+		nilfs_redirty_inodes(&sci->sc_gc_inodes);
+	nilfs_segctor_abort_construction(sci, nilfs, err);
+	goto out;
+}
+
+/**
+ * nilfs_segctor_start_timer - set timer of background write
+ * @sci: nilfs_sc_info
+ *
+ * If the timer has already been set, it ignores the new request.
+ * This function MUST be called within a section locking the segment
+ * semaphore.
+ */
+static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
+{
+	spin_lock(&sci->sc_state_lock);
+	if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
+		sci->sc_timer.expires = jiffies + sci->sc_interval;
+		add_timer(&sci->sc_timer);
+		sci->sc_state |= NILFS_SEGCTOR_COMMIT;
+	}
+	spin_unlock(&sci->sc_state_lock);
+}
+
+static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
+{
+	spin_lock(&sci->sc_state_lock);
+	if (!(sci->sc_flush_request & (1 << bn))) {
+		unsigned long prev_req = sci->sc_flush_request;
+
+		sci->sc_flush_request |= (1 << bn);
+		if (!prev_req)
+			wake_up(&sci->sc_wait_daemon);
+	}
+	spin_unlock(&sci->sc_state_lock);
+}
+
+/**
+ * nilfs_flush_segment - trigger a segment construction for resource control
+ * @sb: super block
+ * @ino: inode number of the file to be flushed out.
+ */
+void nilfs_flush_segment(struct super_block *sb, ino_t ino)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
+
+	if (!sci || nilfs_doing_construction())
+		return;
+	nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
+					/* assign bit 0 to data files */
+}
+
+struct nilfs_segctor_wait_request {
+	wait_queue_t	wq;
+	__u32		seq;
+	int		err;
+	atomic_t	done;
+};
+
+static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segctor_wait_request wait_req;
+	int err = 0;
+
+	spin_lock(&sci->sc_state_lock);
+	init_wait(&wait_req.wq);
+	wait_req.err = 0;
+	atomic_set(&wait_req.done, 0);
+	wait_req.seq = ++sci->sc_seq_request;
+	spin_unlock(&sci->sc_state_lock);
+
+	init_waitqueue_entry(&wait_req.wq, current);
+	add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
+	set_current_state(TASK_INTERRUPTIBLE);
+	wake_up(&sci->sc_wait_daemon);
+
+	for (;;) {
+		if (atomic_read(&wait_req.done)) {
+			err = wait_req.err;
+			break;
+		}
+		if (!signal_pending(current)) {
+			schedule();
+			continue;
+		}
+		err = -ERESTARTSYS;
+		break;
+	}
+	finish_wait(&sci->sc_wait_request, &wait_req.wq);
+	return err;
+}
+
+static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
+{
+	struct nilfs_segctor_wait_request *wrq, *n;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
+	list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
+				 wq.task_list) {
+		if (!atomic_read(&wrq->done) &&
+		    nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
+			wrq->err = err;
+			atomic_set(&wrq->done, 1);
+		}
+		if (atomic_read(&wrq->done)) {
+			wrq->wq.func(&wrq->wq,
+				     TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+				     0, NULL);
+		}
+	}
+	spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags);
+}
+
+/**
+ * nilfs_construct_segment - construct a logical segment
+ * @sb: super block
+ *
+ * Return Value: On success, 0 is retured. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-EROFS - Read only filesystem.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_construct_segment(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
+	struct nilfs_transaction_info *ti;
+	int err;
+
+	if (!sci)
+		return -EROFS;
+
+	/* A call inside transactions causes a deadlock. */
+	BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
+
+	err = nilfs_segctor_sync(sci);
+	return err;
+}
+
+/**
+ * nilfs_construct_dsync_segment - construct a data-only logical segment
+ * @sb: super block
+ * @inode: inode whose data blocks should be written out
+ * @start: start byte offset
+ * @end: end byte offset (inclusive)
+ *
+ * Return Value: On success, 0 is retured. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-EROFS - Read only filesystem.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
+				  loff_t start, loff_t end)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
+	struct nilfs_inode_info *ii;
+	struct nilfs_transaction_info ti;
+	int err = 0;
+
+	if (!sci)
+		return -EROFS;
+
+	nilfs_transaction_lock(sb, &ti, 0);
+
+	ii = NILFS_I(inode);
+	if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) ||
+	    nilfs_test_opt(nilfs, STRICT_ORDER) ||
+	    test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
+	    nilfs_discontinued(nilfs)) {
+		nilfs_transaction_unlock(sb);
+		err = nilfs_segctor_sync(sci);
+		return err;
+	}
+
+	spin_lock(&nilfs->ns_inode_lock);
+	if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
+	    !test_bit(NILFS_I_BUSY, &ii->i_state)) {
+		spin_unlock(&nilfs->ns_inode_lock);
+		nilfs_transaction_unlock(sb);
+		return 0;
+	}
+	spin_unlock(&nilfs->ns_inode_lock);
+	sci->sc_dsync_inode = ii;
+	sci->sc_dsync_start = start;
+	sci->sc_dsync_end = end;
+
+	err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
+	if (!err)
+		nilfs->ns_flushed_device = 0;
+
+	nilfs_transaction_unlock(sb);
+	return err;
+}
+
+#define FLUSH_FILE_BIT	(0x1) /* data file only */
+#define FLUSH_DAT_BIT	(1 << NILFS_DAT_INO) /* DAT only */
+
+/**
+ * nilfs_segctor_accept - record accepted sequence count of log-write requests
+ * @sci: segment constructor object
+ */
+static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
+{
+	spin_lock(&sci->sc_state_lock);
+	sci->sc_seq_accepted = sci->sc_seq_request;
+	spin_unlock(&sci->sc_state_lock);
+	del_timer_sync(&sci->sc_timer);
+}
+
+/**
+ * nilfs_segctor_notify - notify the result of request to caller threads
+ * @sci: segment constructor object
+ * @mode: mode of log forming
+ * @err: error code to be notified
+ */
+static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
+{
+	/* Clear requests (even when the construction failed) */
+	spin_lock(&sci->sc_state_lock);
+
+	if (mode == SC_LSEG_SR) {
+		sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
+		sci->sc_seq_done = sci->sc_seq_accepted;
+		nilfs_segctor_wakeup(sci, err);
+		sci->sc_flush_request = 0;
+	} else {
+		if (mode == SC_FLUSH_FILE)
+			sci->sc_flush_request &= ~FLUSH_FILE_BIT;
+		else if (mode == SC_FLUSH_DAT)
+			sci->sc_flush_request &= ~FLUSH_DAT_BIT;
+
+		/* re-enable timer if checkpoint creation was not done */
+		if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+		    time_before(jiffies, sci->sc_timer.expires))
+			add_timer(&sci->sc_timer);
+	}
+	spin_unlock(&sci->sc_state_lock);
+}
+
+/**
+ * nilfs_segctor_construct - form logs and write them to disk
+ * @sci: segment constructor object
+ * @mode: mode of log forming
+ */
+static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
+{
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	struct nilfs_super_block **sbp;
+	int err = 0;
+
+	nilfs_segctor_accept(sci);
+
+	if (nilfs_discontinued(nilfs))
+		mode = SC_LSEG_SR;
+	if (!nilfs_segctor_confirm(sci))
+		err = nilfs_segctor_do_construct(sci, mode);
+
+	if (likely(!err)) {
+		if (mode != SC_FLUSH_DAT)
+			atomic_set(&nilfs->ns_ndirtyblks, 0);
+		if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
+		    nilfs_discontinued(nilfs)) {
+			down_write(&nilfs->ns_sem);
+			err = -EIO;
+			sbp = nilfs_prepare_super(sci->sc_super,
+						  nilfs_sb_will_flip(nilfs));
+			if (likely(sbp)) {
+				nilfs_set_log_cursor(sbp[0], nilfs);
+				err = nilfs_commit_super(sci->sc_super,
+							 NILFS_SB_COMMIT);
+			}
+			up_write(&nilfs->ns_sem);
+		}
+	}
+
+	nilfs_segctor_notify(sci, mode, err);
+	return err;
+}
+
+static void nilfs_construction_timeout(unsigned long data)
+{
+	struct task_struct *p = (struct task_struct *)data;
+	wake_up_process(p);
+}
+
+static void
+nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
+{
+	struct nilfs_inode_info *ii, *n;
+
+	list_for_each_entry_safe(ii, n, head, i_dirty) {
+		if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
+			continue;
+		list_del_init(&ii->i_dirty);
+		truncate_inode_pages(&ii->vfs_inode.i_data, 0);
+		nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+		iput(&ii->vfs_inode);
+	}
+}
+
+int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
+			 void **kbufs)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	if (unlikely(!sci))
+		return -EROFS;
+
+	nilfs_transaction_lock(sb, &ti, 1);
+
+	err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
+	if (unlikely(err))
+		goto out_unlock;
+
+	err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
+	if (unlikely(err)) {
+		nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
+		goto out_unlock;
+	}
+
+	sci->sc_freesegs = kbufs[4];
+	sci->sc_nfreesegs = argv[4].v_nmembs;
+	list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
+
+	for (;;) {
+		err = nilfs_segctor_construct(sci, SC_LSEG_SR);
+		nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
+
+		if (likely(!err))
+			break;
+
+		nilfs_warning(sb, __func__,
+			      "segment construction failed. (err=%d)", err);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(sci->sc_interval);
+	}
+	if (nilfs_test_opt(nilfs, DISCARD)) {
+		int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
+						 sci->sc_nfreesegs);
+		if (ret) {
+			printk(KERN_WARNING
+			       "NILFS warning: error %d on discard request, "
+			       "turning discards off for the device\n", ret);
+			nilfs_clear_opt(nilfs, DISCARD);
+		}
+	}
+
+ out_unlock:
+	sci->sc_freesegs = NULL;
+	sci->sc_nfreesegs = 0;
+	nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
+	nilfs_transaction_unlock(sb);
+	return err;
+}
+
+static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
+{
+	struct nilfs_transaction_info ti;
+
+	nilfs_transaction_lock(sci->sc_super, &ti, 0);
+	nilfs_segctor_construct(sci, mode);
+
+	/*
+	 * Unclosed segment should be retried.  We do this using sc_timer.
+	 * Timeout of sc_timer will invoke complete construction which leads
+	 * to close the current logical segment.
+	 */
+	if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
+		nilfs_segctor_start_timer(sci);
+
+	nilfs_transaction_unlock(sci->sc_super);
+}
+
+static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
+{
+	int mode = 0;
+	int err;
+
+	spin_lock(&sci->sc_state_lock);
+	mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
+		SC_FLUSH_DAT : SC_FLUSH_FILE;
+	spin_unlock(&sci->sc_state_lock);
+
+	if (mode) {
+		err = nilfs_segctor_do_construct(sci, mode);
+
+		spin_lock(&sci->sc_state_lock);
+		sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
+			~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT;
+		spin_unlock(&sci->sc_state_lock);
+	}
+	clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
+}
+
+static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
+{
+	if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
+	    time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) {
+		if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT))
+			return SC_FLUSH_FILE;
+		else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT))
+			return SC_FLUSH_DAT;
+	}
+	return SC_LSEG_SR;
+}
+
+/**
+ * nilfs_segctor_thread - main loop of the segment constructor thread.
+ * @arg: pointer to a struct nilfs_sc_info.
+ *
+ * nilfs_segctor_thread() initializes a timer and serves as a daemon
+ * to execute segment constructions.
+ */
+static int nilfs_segctor_thread(void *arg)
+{
+	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	int timeout = 0;
+
+	sci->sc_timer.data = (unsigned long)current;
+	sci->sc_timer.function = nilfs_construction_timeout;
+
+	/* start sync. */
+	sci->sc_task = current;
+	wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
+	printk(KERN_INFO
+	       "segctord starting. Construction interval = %lu seconds, "
+	       "CP frequency < %lu seconds\n",
+	       sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+
+	spin_lock(&sci->sc_state_lock);
+ loop:
+	for (;;) {
+		int mode;
+
+		if (sci->sc_state & NILFS_SEGCTOR_QUIT)
+			goto end_thread;
+
+		if (timeout || sci->sc_seq_request != sci->sc_seq_done)
+			mode = SC_LSEG_SR;
+		else if (!sci->sc_flush_request)
+			break;
+		else
+			mode = nilfs_segctor_flush_mode(sci);
+
+		spin_unlock(&sci->sc_state_lock);
+		nilfs_segctor_thread_construct(sci, mode);
+		spin_lock(&sci->sc_state_lock);
+		timeout = 0;
+	}
+
+
+	if (freezing(current)) {
+		spin_unlock(&sci->sc_state_lock);
+		try_to_freeze();
+		spin_lock(&sci->sc_state_lock);
+	} else {
+		DEFINE_WAIT(wait);
+		int should_sleep = 1;
+
+		prepare_to_wait(&sci->sc_wait_daemon, &wait,
+				TASK_INTERRUPTIBLE);
+
+		if (sci->sc_seq_request != sci->sc_seq_done)
+			should_sleep = 0;
+		else if (sci->sc_flush_request)
+			should_sleep = 0;
+		else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
+			should_sleep = time_before(jiffies,
+					sci->sc_timer.expires);
+
+		if (should_sleep) {
+			spin_unlock(&sci->sc_state_lock);
+			schedule();
+			spin_lock(&sci->sc_state_lock);
+		}
+		finish_wait(&sci->sc_wait_daemon, &wait);
+		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+			   time_after_eq(jiffies, sci->sc_timer.expires));
+
+		if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
+			set_nilfs_discontinued(nilfs);
+	}
+	goto loop;
+
+ end_thread:
+	spin_unlock(&sci->sc_state_lock);
+
+	/* end sync. */
+	sci->sc_task = NULL;
+	wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
+	return 0;
+}
+
+static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
+{
+	struct task_struct *t;
+
+	t = kthread_run(nilfs_segctor_thread, sci, "segctord");
+	if (IS_ERR(t)) {
+		int err = PTR_ERR(t);
+
+		printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
+		       err);
+		return err;
+	}
+	wait_event(sci->sc_wait_task, sci->sc_task != NULL);
+	return 0;
+}
+
+static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
+	__acquires(&sci->sc_state_lock)
+	__releases(&sci->sc_state_lock)
+{
+	sci->sc_state |= NILFS_SEGCTOR_QUIT;
+
+	while (sci->sc_task) {
+		wake_up(&sci->sc_wait_daemon);
+		spin_unlock(&sci->sc_state_lock);
+		wait_event(sci->sc_wait_task, sci->sc_task == NULL);
+		spin_lock(&sci->sc_state_lock);
+	}
+}
+
+/*
+ * Setup & clean-up functions
+ */
+static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
+					       struct nilfs_root *root)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_sc_info *sci;
+
+	sci = kzalloc(sizeof(*sci), GFP_KERNEL);
+	if (!sci)
+		return NULL;
+
+	sci->sc_super = sb;
+
+	nilfs_get_root(root);
+	sci->sc_root = root;
+
+	init_waitqueue_head(&sci->sc_wait_request);
+	init_waitqueue_head(&sci->sc_wait_daemon);
+	init_waitqueue_head(&sci->sc_wait_task);
+	spin_lock_init(&sci->sc_state_lock);
+	INIT_LIST_HEAD(&sci->sc_dirty_files);
+	INIT_LIST_HEAD(&sci->sc_segbufs);
+	INIT_LIST_HEAD(&sci->sc_write_logs);
+	INIT_LIST_HEAD(&sci->sc_gc_inodes);
+	INIT_LIST_HEAD(&sci->sc_iput_queue);
+	INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
+	init_timer(&sci->sc_timer);
+
+	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
+	sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
+	sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
+
+	if (nilfs->ns_interval)
+		sci->sc_interval = HZ * nilfs->ns_interval;
+	if (nilfs->ns_watermark)
+		sci->sc_watermark = nilfs->ns_watermark;
+	return sci;
+}
+
+static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
+{
+	int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
+
+	/* The segctord thread was stopped and its timer was removed.
+	   But some tasks remain. */
+	do {
+		struct nilfs_transaction_info ti;
+
+		nilfs_transaction_lock(sci->sc_super, &ti, 0);
+		ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
+		nilfs_transaction_unlock(sci->sc_super);
+
+		flush_work(&sci->sc_iput_work);
+
+	} while (ret && retrycount-- > 0);
+}
+
+/**
+ * nilfs_segctor_destroy - destroy the segment constructor.
+ * @sci: nilfs_sc_info
+ *
+ * nilfs_segctor_destroy() kills the segctord thread and frees
+ * the nilfs_sc_info struct.
+ * Caller must hold the segment semaphore.
+ */
+static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
+{
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+	int flag;
+
+	up_write(&nilfs->ns_segctor_sem);
+
+	spin_lock(&sci->sc_state_lock);
+	nilfs_segctor_kill_thread(sci);
+	flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
+		|| sci->sc_seq_request != sci->sc_seq_done);
+	spin_unlock(&sci->sc_state_lock);
+
+	if (flush_work(&sci->sc_iput_work))
+		flag = true;
+
+	if (flag || !nilfs_segctor_confirm(sci))
+		nilfs_segctor_write_out(sci);
+
+	if (!list_empty(&sci->sc_dirty_files)) {
+		nilfs_warning(sci->sc_super, __func__,
+			      "dirty file(s) after the final construction\n");
+		nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
+	}
+
+	if (!list_empty(&sci->sc_iput_queue)) {
+		nilfs_warning(sci->sc_super, __func__,
+			      "iput queue is not empty\n");
+		nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
+	}
+
+	WARN_ON(!list_empty(&sci->sc_segbufs));
+	WARN_ON(!list_empty(&sci->sc_write_logs));
+
+	nilfs_put_root(sci->sc_root);
+
+	down_write(&nilfs->ns_segctor_sem);
+
+	del_timer_sync(&sci->sc_timer);
+	kfree(sci);
+}
+
+/**
+ * nilfs_attach_log_writer - attach log writer
+ * @sb: super block instance
+ * @root: root object of the current filesystem tree
+ *
+ * This allocates a log writer object, initializes it, and starts the
+ * log writer.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	int err;
+
+	if (nilfs->ns_writer) {
+		/*
+		 * This happens if the filesystem was remounted
+		 * read/write after nilfs_error degenerated it into a
+		 * read-only mount.
+		 */
+		nilfs_detach_log_writer(sb);
+	}
+
+	nilfs->ns_writer = nilfs_segctor_new(sb, root);
+	if (!nilfs->ns_writer)
+		return -ENOMEM;
+
+	err = nilfs_segctor_start_thread(nilfs->ns_writer);
+	if (err) {
+		kfree(nilfs->ns_writer);
+		nilfs->ns_writer = NULL;
+	}
+	return err;
+}
+
+/**
+ * nilfs_detach_log_writer - destroy log writer
+ * @sb: super block instance
+ *
+ * This kills log writer daemon, frees the log writer object, and
+ * destroys list of dirty files.
+ */
+void nilfs_detach_log_writer(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	LIST_HEAD(garbage_list);
+
+	down_write(&nilfs->ns_segctor_sem);
+	if (nilfs->ns_writer) {
+		nilfs_segctor_destroy(nilfs->ns_writer);
+		nilfs->ns_writer = NULL;
+	}
+
+	/* Force to free the list of dirty files */
+	spin_lock(&nilfs->ns_inode_lock);
+	if (!list_empty(&nilfs->ns_dirty_files)) {
+		list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
+		nilfs_warning(sb, __func__,
+			      "Hit dirty file after stopped log writer\n");
+	}
+	spin_unlock(&nilfs->ns_inode_lock);
+	up_write(&nilfs->ns_segctor_sem);
+
+	nilfs_dispose_list(nilfs, &garbage_list, 1);
+}
diff --git a/kernel/fs/nilfs2/segment.h b/kernel/fs/nilfs2/segment.h
new file mode 100644
index 000000000..a48d6de1e
--- /dev/null
+++ b/kernel/fs/nilfs2/segment.h
@@ -0,0 +1,251 @@
+/*
+ * segment.h - NILFS Segment constructor prototypes and definitions
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+#ifndef _NILFS_SEGMENT_H
+#define _NILFS_SEGMENT_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/nilfs2_fs.h>
+#include "nilfs.h"
+
+struct nilfs_root;
+
+/**
+ * struct nilfs_recovery_info - Recovery information
+ * @ri_need_recovery: Recovery status
+ * @ri_super_root: Block number of the last super root
+ * @ri_ri_cno: Number of the last checkpoint
+ * @ri_lsegs_start: Region for roll-forwarding (start block number)
+ * @ri_lsegs_end: Region for roll-forwarding (end block number)
+ * @ri_lseg_start_seq: Sequence value of the segment at ri_lsegs_start
+ * @ri_used_segments: List of segments to be mark active
+ * @ri_pseg_start: Block number of the last partial segment
+ * @ri_seq: Sequence number on the last partial segment
+ * @ri_segnum: Segment number on the last partial segment
+ * @ri_nextnum: Next segment number on the last partial segment
+ */
+struct nilfs_recovery_info {
+	int			ri_need_recovery;
+	sector_t		ri_super_root;
+	__u64			ri_cno;
+
+	sector_t		ri_lsegs_start;
+	sector_t		ri_lsegs_end;
+	u64			ri_lsegs_start_seq;
+	struct list_head	ri_used_segments;
+	sector_t		ri_pseg_start;
+	u64			ri_seq;
+	__u64			ri_segnum;
+	__u64			ri_nextnum;
+};
+
+/* ri_need_recovery */
+#define NILFS_RECOVERY_SR_UPDATED	 1  /* The super root was updated */
+#define NILFS_RECOVERY_ROLLFORWARD_DONE	 2  /* Rollforward was carried out */
+
+/**
+ * struct nilfs_cstage - Context of collection stage
+ * @scnt: Stage count
+ * @flags: State flags
+ * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
+ * @gc_inode_ptr: Pointer on the list of gc-inodes
+ */
+struct nilfs_cstage {
+	int			scnt;
+	unsigned		flags;
+	struct nilfs_inode_info *dirty_file_ptr;
+	struct nilfs_inode_info *gc_inode_ptr;
+};
+
+struct nilfs_segment_buffer;
+
+struct nilfs_segsum_pointer {
+	struct buffer_head     *bh;
+	unsigned		offset; /* offset in bytes */
+};
+
+/**
+ * struct nilfs_sc_info - Segment constructor information
+ * @sc_super: Back pointer to super_block struct
+ * @sc_root: root object of the current filesystem tree
+ * @sc_nblk_inc: Block count of current generation
+ * @sc_dirty_files: List of files to be written
+ * @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_iput_queue: list of inodes for which iput should be done
+ * @sc_iput_work: work struct to defer iput call
+ * @sc_freesegs: array of segment numbers to be freed
+ * @sc_nfreesegs: number of segments on @sc_freesegs
+ * @sc_dsync_inode: inode whose data pages are written for a sync operation
+ * @sc_dsync_start: start byte offset of data pages
+ * @sc_dsync_end: end byte offset of data pages (inclusive)
+ * @sc_segbufs: List of segment buffers
+ * @sc_write_logs: List of segment buffers to hold logs under writing
+ * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
+ * @sc_curseg: Current segment buffer
+ * @sc_stage: Collection stage
+ * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
+ * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
+ * @sc_blk_cnt:	Block count of a file
+ * @sc_datablk_cnt: Data block count of a file
+ * @sc_nblk_this_inc: Number of blocks included in the current logical segment
+ * @sc_seg_ctime: Creation time
+ * @sc_cno: checkpoint number of current log
+ * @sc_flags: Internal flags
+ * @sc_state_lock: spinlock for sc_state and so on
+ * @sc_state: Segctord state flags
+ * @sc_flush_request: inode bitmap of metadata files to be flushed
+ * @sc_wait_request: Client request queue
+ * @sc_wait_daemon: Daemon wait queue
+ * @sc_wait_task: Start/end wait queue to control segctord task
+ * @sc_seq_request: Request counter
+ * @sc_seq_accept: Accepted request count
+ * @sc_seq_done: Completion counter
+ * @sc_sync: Request of explicit sync operation
+ * @sc_interval: Timeout value of background construction
+ * @sc_mjcp_freq: Frequency of creating checkpoints
+ * @sc_lseg_stime: Start time of the latest logical segment
+ * @sc_watermark: Watermark for the number of dirty buffers
+ * @sc_timer: Timer for segctord
+ * @sc_task: current thread of segctord
+ */
+struct nilfs_sc_info {
+	struct super_block     *sc_super;
+	struct nilfs_root      *sc_root;
+
+	unsigned long		sc_nblk_inc;
+
+	struct list_head	sc_dirty_files;
+	struct list_head	sc_gc_inodes;
+	struct list_head	sc_iput_queue;
+	struct work_struct	sc_iput_work;
+
+	__u64		       *sc_freesegs;
+	size_t			sc_nfreesegs;
+
+	struct nilfs_inode_info *sc_dsync_inode;
+	loff_t			sc_dsync_start;
+	loff_t			sc_dsync_end;
+
+	/* Segment buffers */
+	struct list_head	sc_segbufs;
+	struct list_head	sc_write_logs;
+	unsigned long		sc_segbuf_nblocks;
+	struct nilfs_segment_buffer *sc_curseg;
+
+	struct nilfs_cstage	sc_stage;
+
+	struct nilfs_segsum_pointer sc_finfo_ptr;
+	struct nilfs_segsum_pointer sc_binfo_ptr;
+	unsigned long		sc_blk_cnt;
+	unsigned long		sc_datablk_cnt;
+	unsigned long		sc_nblk_this_inc;
+	time_t			sc_seg_ctime;
+	__u64			sc_cno;
+	unsigned long		sc_flags;
+
+	spinlock_t		sc_state_lock;
+	unsigned long		sc_state;
+	unsigned long		sc_flush_request;
+
+	wait_queue_head_t	sc_wait_request;
+	wait_queue_head_t	sc_wait_daemon;
+	wait_queue_head_t	sc_wait_task;
+
+	__u32			sc_seq_request;
+	__u32			sc_seq_accepted;
+	__u32			sc_seq_done;
+
+	int			sc_sync;
+	unsigned long		sc_interval;
+	unsigned long		sc_mjcp_freq;
+	unsigned long		sc_lseg_stime;	/* in 1/HZ seconds */
+	unsigned long		sc_watermark;
+
+	struct timer_list	sc_timer;
+	struct task_struct     *sc_task;
+};
+
+/* sc_flags */
+enum {
+	NILFS_SC_DIRTY,		/* One or more dirty meta-data blocks exist */
+	NILFS_SC_UNCLOSED,	/* Logical segment is not closed */
+	NILFS_SC_SUPER_ROOT,	/* The latest segment has a super root */
+	NILFS_SC_PRIOR_FLUSH,	/* Requesting immediate flush without making a
+				   checkpoint */
+	NILFS_SC_HAVE_DELTA,	/* Next checkpoint will have update of files
+				   other than DAT, cpfile, sufile, or files
+				   moved by GC */
+};
+
+/* sc_state */
+#define NILFS_SEGCTOR_QUIT	    0x0001  /* segctord is being destroyed */
+#define NILFS_SEGCTOR_COMMIT	    0x0004  /* committed transaction exists */
+
+/*
+ * Constant parameters
+ */
+#define NILFS_SC_CLEANUP_RETRY	    3  /* Retry count of construction when
+					  destroying segctord */
+
+/*
+ * Default values of timeout, in seconds.
+ */
+#define NILFS_SC_DEFAULT_TIMEOUT    5   /* Timeout value of dirty blocks.
+					   It triggers construction of a
+					   logical segment with a super root */
+#define NILFS_SC_DEFAULT_SR_FREQ    30  /* Maximum frequency of super root
+					   creation */
+
+/*
+ * The default threshold amount of data, in block counts.
+ */
+#define NILFS_SC_DEFAULT_WATERMARK  3600
+
+/* super.c */
+extern struct kmem_cache *nilfs_transaction_cachep;
+
+/* segment.c */
+extern void nilfs_relax_pressure_in_lock(struct super_block *);
+
+extern int nilfs_construct_segment(struct super_block *);
+extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
+					 loff_t, loff_t);
+extern void nilfs_flush_segment(struct super_block *, ino_t);
+extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
+				void **);
+
+int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root);
+void nilfs_detach_log_writer(struct super_block *sb);
+
+/* recovery.c */
+extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
+				       struct buffer_head **, int);
+extern int nilfs_search_super_root(struct the_nilfs *,
+				   struct nilfs_recovery_info *);
+int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb,
+			      struct nilfs_recovery_info *ri);
+extern void nilfs_dispose_segment_list(struct list_head *);
+
+#endif /* _NILFS_SEGMENT_H */
diff --git a/kernel/fs/nilfs2/sufile.c b/kernel/fs/nilfs2/sufile.c
new file mode 100644
index 000000000..2a869c35c
--- /dev/null
+++ b/kernel/fs/nilfs2/sufile.c
@@ -0,0 +1,1222 @@
+/*
+ * sufile.c - NILFS segment usage file.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/errno.h>
+#include <linux/nilfs2_fs.h>
+#include "mdt.h"
+#include "sufile.h"
+
+/**
+ * struct nilfs_sufile_info - on-memory private data of sufile
+ * @mi: on-memory private data of metadata file
+ * @ncleansegs: number of clean segments
+ * @allocmin: lower limit of allocatable segment range
+ * @allocmax: upper limit of allocatable segment range
+ */
+struct nilfs_sufile_info {
+	struct nilfs_mdt_info mi;
+	unsigned long ncleansegs;/* number of clean segments */
+	__u64 allocmin;		/* lower limit of allocatable segment range */
+	__u64 allocmax;		/* upper limit of allocatable segment range */
+};
+
+static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
+{
+	return (struct nilfs_sufile_info *)NILFS_MDT(sufile);
+}
+
+static inline unsigned long
+nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
+{
+	return NILFS_MDT(sufile)->mi_entries_per_block;
+}
+
+static unsigned long
+nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
+{
+	__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+	do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
+	return (unsigned long)t;
+}
+
+static unsigned long
+nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
+{
+	__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+	return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
+}
+
+static unsigned long
+nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
+				     __u64 max)
+{
+	return min_t(unsigned long,
+		     nilfs_sufile_segment_usages_per_block(sufile) -
+		     nilfs_sufile_get_offset(sufile, curr),
+		     max - curr + 1);
+}
+
+static struct nilfs_segment_usage *
+nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
+				     struct buffer_head *bh, void *kaddr)
+{
+	return kaddr + bh_offset(bh) +
+		nilfs_sufile_get_offset(sufile, segnum) *
+		NILFS_MDT(sufile)->mi_entry_size;
+}
+
+static inline int nilfs_sufile_get_header_block(struct inode *sufile,
+						struct buffer_head **bhp)
+{
+	return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
+}
+
+static inline int
+nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
+				     int create, struct buffer_head **bhp)
+{
+	return nilfs_mdt_get_block(sufile,
+				   nilfs_sufile_get_blkoff(sufile, segnum),
+				   create, NULL, bhp);
+}
+
+static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile,
+						   __u64 segnum)
+{
+	return nilfs_mdt_delete_block(sufile,
+				      nilfs_sufile_get_blkoff(sufile, segnum));
+}
+
+static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
+				     u64 ncleanadd, u64 ndirtyadd)
+{
+	struct nilfs_sufile_header *header;
+	void *kaddr;
+
+	kaddr = kmap_atomic(header_bh->b_page);
+	header = kaddr + bh_offset(header_bh);
+	le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
+	le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
+	kunmap_atomic(kaddr);
+
+	mark_buffer_dirty(header_bh);
+}
+
+/**
+ * nilfs_sufile_get_ncleansegs - return the number of clean segments
+ * @sufile: inode of segment usage file
+ */
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile)
+{
+	return NILFS_SUI(sufile)->ncleansegs;
+}
+
+/**
+ * nilfs_sufile_updatev - modify multiple segment usages at a time
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @create: creation flag
+ * @ndone: place to store number of modified segments on @segnumv
+ * @dofunc: primitive operation for the update
+ *
+ * Description: nilfs_sufile_updatev() repeatedly calls @dofunc
+ * against the given array of segments.  The @dofunc is called with
+ * buffers of a header block and the sufile block in which the target
+ * segment usage entry is contained.  If @ndone is given, the number
+ * of successfully modified segments from the head is stored in the
+ * place @ndone points to.
+ *
+ * Return Value: On success, zero is returned.  On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - Given segment usage is in hole block (may be returned if
+ *            @create is zero)
+ *
+ * %-EINVAL - Invalid segment usage number
+ */
+int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
+			 int create, size_t *ndone,
+			 void (*dofunc)(struct inode *, __u64,
+					struct buffer_head *,
+					struct buffer_head *))
+{
+	struct buffer_head *header_bh, *bh;
+	unsigned long blkoff, prev_blkoff;
+	__u64 *seg;
+	size_t nerr = 0, n = 0;
+	int ret = 0;
+
+	if (unlikely(nsegs == 0))
+		goto out;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+	for (seg = segnumv; seg < segnumv + nsegs; seg++) {
+		if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
+			printk(KERN_WARNING
+			       "%s: invalid segment number: %llu\n", __func__,
+			       (unsigned long long)*seg);
+			nerr++;
+		}
+	}
+	if (nerr > 0) {
+		ret = -EINVAL;
+		goto out_sem;
+	}
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+
+	seg = segnumv;
+	blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
+	ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
+	if (ret < 0)
+		goto out_header;
+
+	for (;;) {
+		dofunc(sufile, *seg, header_bh, bh);
+
+		if (++seg >= segnumv + nsegs)
+			break;
+		prev_blkoff = blkoff;
+		blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
+		if (blkoff == prev_blkoff)
+			continue;
+
+		/* get different block */
+		brelse(bh);
+		ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
+		if (unlikely(ret < 0))
+			goto out_header;
+	}
+	brelse(bh);
+
+ out_header:
+	n = seg - segnumv;
+	brelse(header_bh);
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+ out:
+	if (ndone)
+		*ndone = n;
+	return ret;
+}
+
+int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
+			void (*dofunc)(struct inode *, __u64,
+				       struct buffer_head *,
+				       struct buffer_head *))
+{
+	struct buffer_head *header_bh, *bh;
+	int ret;
+
+	if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
+		printk(KERN_WARNING "%s: invalid segment number: %llu\n",
+		       __func__, (unsigned long long)segnum);
+		return -EINVAL;
+	}
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh);
+	if (!ret) {
+		dofunc(sufile, segnum, header_bh, bh);
+		brelse(bh);
+	}
+	brelse(header_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_set_alloc_range - limit range of segment to be allocated
+ * @sufile: inode of segment usage file
+ * @start: minimum segment number of allocatable region (inclusive)
+ * @end: maximum segment number of allocatable region (inclusive)
+ *
+ * Return Value: On success, 0 is returned.  On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-ERANGE - invalid segment region
+ */
+int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end)
+{
+	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	__u64 nsegs;
+	int ret = -ERANGE;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+	nsegs = nilfs_sufile_get_nsegments(sufile);
+
+	if (start <= end && end < nsegs) {
+		sui->allocmin = start;
+		sui->allocmax = end;
+		ret = 0;
+	}
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_alloc - allocate a segment
+ * @sufile: inode of segment usage file
+ * @segnump: pointer to segment number
+ *
+ * Description: nilfs_sufile_alloc() allocates a clean segment.
+ *
+ * Return Value: On success, 0 is returned and the segment number of the
+ * allocated segment is stored in the place pointed by @segnump. On error, one
+ * of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOSPC - No clean segment left.
+ */
+int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
+{
+	struct buffer_head *header_bh, *su_bh;
+	struct nilfs_sufile_header *header;
+	struct nilfs_segment_usage *su;
+	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+	__u64 segnum, maxsegnum, last_alloc;
+	void *kaddr;
+	unsigned long nsegments, ncleansegs, nsus, cnt;
+	int ret, j;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+	kaddr = kmap_atomic(header_bh->b_page);
+	header = kaddr + bh_offset(header_bh);
+	ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+	last_alloc = le64_to_cpu(header->sh_last_alloc);
+	kunmap_atomic(kaddr);
+
+	nsegments = nilfs_sufile_get_nsegments(sufile);
+	maxsegnum = sui->allocmax;
+	segnum = last_alloc + 1;
+	if (segnum < sui->allocmin || segnum > sui->allocmax)
+		segnum = sui->allocmin;
+
+	for (cnt = 0; cnt < nsegments; cnt += nsus) {
+		if (segnum > maxsegnum) {
+			if (cnt < sui->allocmax - sui->allocmin + 1) {
+				/*
+				 * wrap around in the limited region.
+				 * if allocation started from
+				 * sui->allocmin, this never happens.
+				 */
+				segnum = sui->allocmin;
+				maxsegnum = last_alloc;
+			} else if (segnum > sui->allocmin &&
+				   sui->allocmax + 1 < nsegments) {
+				segnum = sui->allocmax + 1;
+				maxsegnum = nsegments - 1;
+			} else if (sui->allocmin > 0)  {
+				segnum = 0;
+				maxsegnum = sui->allocmin - 1;
+			} else {
+				break; /* never happens */
+			}
+		}
+		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
+							   &su_bh);
+		if (ret < 0)
+			goto out_header;
+		kaddr = kmap_atomic(su_bh->b_page);
+		su = nilfs_sufile_block_get_segment_usage(
+			sufile, segnum, su_bh, kaddr);
+
+		nsus = nilfs_sufile_segment_usages_in_block(
+			sufile, segnum, maxsegnum);
+		for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) {
+			if (!nilfs_segment_usage_clean(su))
+				continue;
+			/* found a clean segment */
+			nilfs_segment_usage_set_dirty(su);
+			kunmap_atomic(kaddr);
+
+			kaddr = kmap_atomic(header_bh->b_page);
+			header = kaddr + bh_offset(header_bh);
+			le64_add_cpu(&header->sh_ncleansegs, -1);
+			le64_add_cpu(&header->sh_ndirtysegs, 1);
+			header->sh_last_alloc = cpu_to_le64(segnum);
+			kunmap_atomic(kaddr);
+
+			sui->ncleansegs--;
+			mark_buffer_dirty(header_bh);
+			mark_buffer_dirty(su_bh);
+			nilfs_mdt_mark_dirty(sufile);
+			brelse(su_bh);
+			*segnump = segnum;
+			goto out_header;
+		}
+
+		kunmap_atomic(kaddr);
+		brelse(su_bh);
+	}
+
+	/* no segments left */
+	ret = -ENOSPC;
+
+ out_header:
+	brelse(header_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
+				 struct buffer_head *header_bh,
+				 struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+
+	kaddr = kmap_atomic(su_bh->b_page);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (unlikely(!nilfs_segment_usage_clean(su))) {
+		printk(KERN_WARNING "%s: segment %llu must be clean\n",
+		       __func__, (unsigned long long)segnum);
+		kunmap_atomic(kaddr);
+		return;
+	}
+	nilfs_segment_usage_set_dirty(su);
+	kunmap_atomic(kaddr);
+
+	nilfs_sufile_mod_counter(header_bh, -1, 1);
+	NILFS_SUI(sufile)->ncleansegs--;
+
+	mark_buffer_dirty(su_bh);
+	nilfs_mdt_mark_dirty(sufile);
+}
+
+void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
+			   struct buffer_head *header_bh,
+			   struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int clean, dirty;
+
+	kaddr = kmap_atomic(su_bh->b_page);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
+	    su->su_nblocks == cpu_to_le32(0)) {
+		kunmap_atomic(kaddr);
+		return;
+	}
+	clean = nilfs_segment_usage_clean(su);
+	dirty = nilfs_segment_usage_dirty(su);
+
+	/* make the segment garbage */
+	su->su_lastmod = cpu_to_le64(0);
+	su->su_nblocks = cpu_to_le32(0);
+	su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
+	kunmap_atomic(kaddr);
+
+	nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
+	NILFS_SUI(sufile)->ncleansegs -= clean;
+
+	mark_buffer_dirty(su_bh);
+	nilfs_mdt_mark_dirty(sufile);
+}
+
+void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
+			  struct buffer_head *header_bh,
+			  struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int sudirty;
+
+	kaddr = kmap_atomic(su_bh->b_page);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (nilfs_segment_usage_clean(su)) {
+		printk(KERN_WARNING "%s: segment %llu is already clean\n",
+		       __func__, (unsigned long long)segnum);
+		kunmap_atomic(kaddr);
+		return;
+	}
+	WARN_ON(nilfs_segment_usage_error(su));
+	WARN_ON(!nilfs_segment_usage_dirty(su));
+
+	sudirty = nilfs_segment_usage_dirty(su);
+	nilfs_segment_usage_set_clean(su);
+	kunmap_atomic(kaddr);
+	mark_buffer_dirty(su_bh);
+
+	nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
+	NILFS_SUI(sufile)->ncleansegs++;
+
+	nilfs_mdt_mark_dirty(sufile);
+}
+
+/**
+ * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ */
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
+{
+	struct buffer_head *bh;
+	int ret;
+
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+	if (!ret) {
+		mark_buffer_dirty(bh);
+		nilfs_mdt_mark_dirty(sufile);
+		brelse(bh);
+	}
+	return ret;
+}
+
+/**
+ * nilfs_sufile_set_segment_usage - set usage of a segment
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ * @nblocks: number of live blocks in the segment
+ * @modtime: modification time (option)
+ */
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
+				   unsigned long nblocks, time_t modtime)
+{
+	struct buffer_head *bh;
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int ret;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+	if (ret < 0)
+		goto out_sem;
+
+	kaddr = kmap_atomic(bh->b_page);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+	WARN_ON(nilfs_segment_usage_error(su));
+	if (modtime)
+		su->su_lastmod = cpu_to_le64(modtime);
+	su->su_nblocks = cpu_to_le32(nblocks);
+	kunmap_atomic(kaddr);
+
+	mark_buffer_dirty(bh);
+	nilfs_mdt_mark_dirty(sufile);
+	brelse(bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_get_stat - get segment usage statistics
+ * @sufile: inode of segment usage file
+ * @stat: pointer to a structure of segment usage statistics
+ *
+ * Description: nilfs_sufile_get_stat() returns information about segment
+ * usage.
+ *
+ * Return Value: On success, 0 is returned, and segment usage information is
+ * stored in the place pointed by @stat. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
+{
+	struct buffer_head *header_bh;
+	struct nilfs_sufile_header *header;
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+	void *kaddr;
+	int ret;
+
+	down_read(&NILFS_MDT(sufile)->mi_sem);
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+
+	kaddr = kmap_atomic(header_bh->b_page);
+	header = kaddr + bh_offset(header_bh);
+	sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
+	sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+	sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
+	sustat->ss_ctime = nilfs->ns_ctime;
+	sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime;
+	spin_lock(&nilfs->ns_last_segment_lock);
+	sustat->ss_prot_seq = nilfs->ns_prot_seq;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+	kunmap_atomic(kaddr);
+	brelse(header_bh);
+
+ out_sem:
+	up_read(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
+			       struct buffer_head *header_bh,
+			       struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int suclean;
+
+	kaddr = kmap_atomic(su_bh->b_page);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (nilfs_segment_usage_error(su)) {
+		kunmap_atomic(kaddr);
+		return;
+	}
+	suclean = nilfs_segment_usage_clean(su);
+	nilfs_segment_usage_set_error(su);
+	kunmap_atomic(kaddr);
+
+	if (suclean) {
+		nilfs_sufile_mod_counter(header_bh, -1, 0);
+		NILFS_SUI(sufile)->ncleansegs--;
+	}
+	mark_buffer_dirty(su_bh);
+	nilfs_mdt_mark_dirty(sufile);
+}
+
+/**
+  * nilfs_sufile_truncate_range - truncate range of segment array
+  * @sufile: inode of segment usage file
+  * @start: start segment number (inclusive)
+  * @end: end segment number (inclusive)
+  *
+  * Return Value: On success, 0 is returned.  On error, one of the
+  * following negative error codes is returned.
+  *
+  * %-EIO - I/O error.
+  *
+  * %-ENOMEM - Insufficient amount of memory available.
+  *
+  * %-EINVAL - Invalid number of segments specified
+  *
+  * %-EBUSY - Dirty or active segments are present in the range
+  */
+static int nilfs_sufile_truncate_range(struct inode *sufile,
+				       __u64 start, __u64 end)
+{
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+	struct buffer_head *header_bh;
+	struct buffer_head *su_bh;
+	struct nilfs_segment_usage *su, *su2;
+	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+	unsigned long segusages_per_block;
+	unsigned long nsegs, ncleaned;
+	__u64 segnum;
+	void *kaddr;
+	ssize_t n, nc;
+	int ret;
+	int j;
+
+	nsegs = nilfs_sufile_get_nsegments(sufile);
+
+	ret = -EINVAL;
+	if (start > end || start >= nsegs)
+		goto out;
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out;
+
+	segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
+	ncleaned = 0;
+
+	for (segnum = start; segnum <= end; segnum += n) {
+		n = min_t(unsigned long,
+			  segusages_per_block -
+				  nilfs_sufile_get_offset(sufile, segnum),
+			  end - segnum + 1);
+		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+							   &su_bh);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				goto out_header;
+			/* hole */
+			continue;
+		}
+		kaddr = kmap_atomic(su_bh->b_page);
+		su = nilfs_sufile_block_get_segment_usage(
+			sufile, segnum, su_bh, kaddr);
+		su2 = su;
+		for (j = 0; j < n; j++, su = (void *)su + susz) {
+			if ((le32_to_cpu(su->su_flags) &
+			     ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||
+			    nilfs_segment_is_active(nilfs, segnum + j)) {
+				ret = -EBUSY;
+				kunmap_atomic(kaddr);
+				brelse(su_bh);
+				goto out_header;
+			}
+		}
+		nc = 0;
+		for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) {
+			if (nilfs_segment_usage_error(su)) {
+				nilfs_segment_usage_set_clean(su);
+				nc++;
+			}
+		}
+		kunmap_atomic(kaddr);
+		if (nc > 0) {
+			mark_buffer_dirty(su_bh);
+			ncleaned += nc;
+		}
+		brelse(su_bh);
+
+		if (n == segusages_per_block) {
+			/* make hole */
+			nilfs_sufile_delete_segment_usage_block(sufile, segnum);
+		}
+	}
+	ret = 0;
+
+out_header:
+	if (ncleaned > 0) {
+		NILFS_SUI(sufile)->ncleansegs += ncleaned;
+		nilfs_sufile_mod_counter(header_bh, ncleaned, 0);
+		nilfs_mdt_mark_dirty(sufile);
+	}
+	brelse(header_bh);
+out:
+	return ret;
+}
+
+/**
+ * nilfs_sufile_resize - resize segment array
+ * @sufile: inode of segment usage file
+ * @newnsegs: new number of segments
+ *
+ * Return Value: On success, 0 is returned.  On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOSPC - Enough free space is not left for shrinking
+ *
+ * %-EBUSY - Dirty or active segments exist in the region to be truncated
+ */
+int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
+{
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+	struct buffer_head *header_bh;
+	struct nilfs_sufile_header *header;
+	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	void *kaddr;
+	unsigned long nsegs, nrsvsegs;
+	int ret = 0;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+
+	nsegs = nilfs_sufile_get_nsegments(sufile);
+	if (nsegs == newnsegs)
+		goto out;
+
+	ret = -ENOSPC;
+	nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs);
+	if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs)
+		goto out;
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out;
+
+	if (newnsegs > nsegs) {
+		sui->ncleansegs += newnsegs - nsegs;
+	} else /* newnsegs < nsegs */ {
+		ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1);
+		if (ret < 0)
+			goto out_header;
+
+		sui->ncleansegs -= nsegs - newnsegs;
+	}
+
+	kaddr = kmap_atomic(header_bh->b_page);
+	header = kaddr + bh_offset(header_bh);
+	header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
+	kunmap_atomic(kaddr);
+
+	mark_buffer_dirty(header_bh);
+	nilfs_mdt_mark_dirty(sufile);
+	nilfs_set_nsegments(nilfs, newnsegs);
+
+out_header:
+	brelse(header_bh);
+out:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_get_suinfo -
+ * @sufile: inode of segment usage file
+ * @segnum: segment number to start looking
+ * @buf: array of suinfo
+ * @sisz: byte size of suinfo
+ * @nsi: size of suinfo array
+ *
+ * Description:
+ *
+ * Return Value: On success, 0 is returned and .... On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
+				unsigned sisz, size_t nsi)
+{
+	struct buffer_head *su_bh;
+	struct nilfs_segment_usage *su;
+	struct nilfs_suinfo *si = buf;
+	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+	void *kaddr;
+	unsigned long nsegs, segusages_per_block;
+	ssize_t n;
+	int ret, i, j;
+
+	down_read(&NILFS_MDT(sufile)->mi_sem);
+
+	segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
+	nsegs = min_t(unsigned long,
+		      nilfs_sufile_get_nsegments(sufile) - segnum,
+		      nsi);
+	for (i = 0; i < nsegs; i += n, segnum += n) {
+		n = min_t(unsigned long,
+			  segusages_per_block -
+				  nilfs_sufile_get_offset(sufile, segnum),
+			  nsegs - i);
+		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+							   &su_bh);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				goto out;
+			/* hole */
+			memset(si, 0, sisz * n);
+			si = (void *)si + sisz * n;
+			continue;
+		}
+
+		kaddr = kmap_atomic(su_bh->b_page);
+		su = nilfs_sufile_block_get_segment_usage(
+			sufile, segnum, su_bh, kaddr);
+		for (j = 0; j < n;
+		     j++, su = (void *)su + susz, si = (void *)si + sisz) {
+			si->sui_lastmod = le64_to_cpu(su->su_lastmod);
+			si->sui_nblocks = le32_to_cpu(su->su_nblocks);
+			si->sui_flags = le32_to_cpu(su->su_flags) &
+				~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+			if (nilfs_segment_is_active(nilfs, segnum + j))
+				si->sui_flags |=
+					(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+		}
+		kunmap_atomic(kaddr);
+		brelse(su_bh);
+	}
+	ret = nsegs;
+
+ out:
+	up_read(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_set_suinfo - sets segment usage info
+ * @sufile: inode of segment usage file
+ * @buf: array of suinfo_update
+ * @supsz: byte size of suinfo_update
+ * @nsup: size of suinfo_update array
+ *
+ * Description: Takes an array of nilfs_suinfo_update structs and updates
+ * segment usage accordingly. Only the fields indicated by the sup_flags
+ * are updated.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
+ */
+ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
+				unsigned supsz, size_t nsup)
+{
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+	struct buffer_head *header_bh, *bh;
+	struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup;
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	unsigned long blkoff, prev_blkoff;
+	int cleansi, cleansu, dirtysi, dirtysu;
+	long ncleaned = 0, ndirtied = 0;
+	int ret = 0;
+
+	if (unlikely(nsup == 0))
+		return ret;
+
+	for (sup = buf; sup < supend; sup = (void *)sup + supsz) {
+		if (sup->sup_segnum >= nilfs->ns_nsegments
+			|| (sup->sup_flags &
+				(~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS))
+			|| (nilfs_suinfo_update_nblocks(sup) &&
+				sup->sup_sui.sui_nblocks >
+				nilfs->ns_blocks_per_segment))
+			return -EINVAL;
+	}
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+
+	sup = buf;
+	blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
+	ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
+	if (ret < 0)
+		goto out_header;
+
+	for (;;) {
+		kaddr = kmap_atomic(bh->b_page);
+		su = nilfs_sufile_block_get_segment_usage(
+			sufile, sup->sup_segnum, bh, kaddr);
+
+		if (nilfs_suinfo_update_lastmod(sup))
+			su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod);
+
+		if (nilfs_suinfo_update_nblocks(sup))
+			su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks);
+
+		if (nilfs_suinfo_update_flags(sup)) {
+			/*
+			 * Active flag is a virtual flag projected by running
+			 * nilfs kernel code - drop it not to write it to
+			 * disk.
+			 */
+			sup->sup_sui.sui_flags &=
+					~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+
+			cleansi = nilfs_suinfo_clean(&sup->sup_sui);
+			cleansu = nilfs_segment_usage_clean(su);
+			dirtysi = nilfs_suinfo_dirty(&sup->sup_sui);
+			dirtysu = nilfs_segment_usage_dirty(su);
+
+			if (cleansi && !cleansu)
+				++ncleaned;
+			else if (!cleansi && cleansu)
+				--ncleaned;
+
+			if (dirtysi && !dirtysu)
+				++ndirtied;
+			else if (!dirtysi && dirtysu)
+				--ndirtied;
+
+			su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
+		}
+
+		kunmap_atomic(kaddr);
+
+		sup = (void *)sup + supsz;
+		if (sup >= supend)
+			break;
+
+		prev_blkoff = blkoff;
+		blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
+		if (blkoff == prev_blkoff)
+			continue;
+
+		/* get different block */
+		mark_buffer_dirty(bh);
+		put_bh(bh);
+		ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
+		if (unlikely(ret < 0))
+			goto out_mark;
+	}
+	mark_buffer_dirty(bh);
+	put_bh(bh);
+
+ out_mark:
+	if (ncleaned || ndirtied) {
+		nilfs_sufile_mod_counter(header_bh, (u64)ncleaned,
+				(u64)ndirtied);
+		NILFS_SUI(sufile)->ncleansegs += ncleaned;
+	}
+	nilfs_mdt_mark_dirty(sufile);
+ out_header:
+	put_bh(header_bh);
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_trim_fs() - trim ioctl handle function
+ * @sufile: inode of segment usage file
+ * @range: fstrim_range structure
+ *
+ * start:	First Byte to trim
+ * len:		number of Bytes to trim from start
+ * minlen:	minimum extent length in Bytes
+ *
+ * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes
+ * from start to start+len. start is rounded up to the next block boundary
+ * and start+len is rounded down. For each clean segment blkdev_issue_discard
+ * function is invoked.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
+{
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+	struct buffer_head *su_bh;
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
+	sector_t seg_start, seg_end, start_block, end_block;
+	sector_t start = 0, nblocks = 0;
+	u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0;
+	int ret = 0;
+	unsigned int sects_per_block;
+
+	sects_per_block = (1 << nilfs->ns_blocksize_bits) /
+			bdev_logical_block_size(nilfs->ns_bdev);
+	len = range->len >> nilfs->ns_blocksize_bits;
+	minlen = range->minlen >> nilfs->ns_blocksize_bits;
+	max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment);
+
+	if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits)
+		return -EINVAL;
+
+	start_block = (range->start + nilfs->ns_blocksize - 1) >>
+			nilfs->ns_blocksize_bits;
+
+	/*
+	 * range->len can be very large (actually, it is set to
+	 * ULLONG_MAX by default) - truncate upper end of the range
+	 * carefully so as not to overflow.
+	 */
+	if (max_blocks - start_block < len)
+		end_block = max_blocks - 1;
+	else
+		end_block = start_block + len - 1;
+
+	segnum = nilfs_get_segnum_of_block(nilfs, start_block);
+	segnum_end = nilfs_get_segnum_of_block(nilfs, end_block);
+
+	down_read(&NILFS_MDT(sufile)->mi_sem);
+
+	while (segnum <= segnum_end) {
+		n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
+				segnum_end);
+
+		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+							   &su_bh);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				goto out_sem;
+			/* hole */
+			segnum += n;
+			continue;
+		}
+
+		kaddr = kmap_atomic(su_bh->b_page);
+		su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
+				su_bh, kaddr);
+		for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
+			if (!nilfs_segment_usage_clean(su))
+				continue;
+
+			nilfs_get_segment_range(nilfs, segnum, &seg_start,
+						&seg_end);
+
+			if (!nblocks) {
+				/* start new extent */
+				start = seg_start;
+				nblocks = seg_end - seg_start + 1;
+				continue;
+			}
+
+			if (start + nblocks == seg_start) {
+				/* add to previous extent */
+				nblocks += seg_end - seg_start + 1;
+				continue;
+			}
+
+			/* discard previous extent */
+			if (start < start_block) {
+				nblocks -= start_block - start;
+				start = start_block;
+			}
+
+			if (nblocks >= minlen) {
+				kunmap_atomic(kaddr);
+
+				ret = blkdev_issue_discard(nilfs->ns_bdev,
+						start * sects_per_block,
+						nblocks * sects_per_block,
+						GFP_NOFS, 0);
+				if (ret < 0) {
+					put_bh(su_bh);
+					goto out_sem;
+				}
+
+				ndiscarded += nblocks;
+				kaddr = kmap_atomic(su_bh->b_page);
+				su = nilfs_sufile_block_get_segment_usage(
+					sufile, segnum, su_bh, kaddr);
+			}
+
+			/* start new extent */
+			start = seg_start;
+			nblocks = seg_end - seg_start + 1;
+		}
+		kunmap_atomic(kaddr);
+		put_bh(su_bh);
+	}
+
+
+	if (nblocks) {
+		/* discard last extent */
+		if (start < start_block) {
+			nblocks -= start_block - start;
+			start = start_block;
+		}
+		if (start + nblocks > end_block + 1)
+			nblocks = end_block - start + 1;
+
+		if (nblocks >= minlen) {
+			ret = blkdev_issue_discard(nilfs->ns_bdev,
+					start * sects_per_block,
+					nblocks * sects_per_block,
+					GFP_NOFS, 0);
+			if (!ret)
+				ndiscarded += nblocks;
+		}
+	}
+
+out_sem:
+	up_read(&NILFS_MDT(sufile)->mi_sem);
+
+	range->len = ndiscarded << nilfs->ns_blocksize_bits;
+	return ret;
+}
+
+/**
+ * nilfs_sufile_read - read or get sufile inode
+ * @sb: super block instance
+ * @susize: size of a segment usage entry
+ * @raw_inode: on-disk sufile inode
+ * @inodep: buffer to store the inode
+ */
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep)
+{
+	struct inode *sufile;
+	struct nilfs_sufile_info *sui;
+	struct buffer_head *header_bh;
+	struct nilfs_sufile_header *header;
+	void *kaddr;
+	int err;
+
+	if (susize > sb->s_blocksize) {
+		printk(KERN_ERR
+		       "NILFS: too large segment usage size: %zu bytes.\n",
+		       susize);
+		return -EINVAL;
+	} else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
+		printk(KERN_ERR
+		       "NILFS: too small segment usage size: %zu bytes.\n",
+		       susize);
+		return -EINVAL;
+	}
+
+	sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
+	if (unlikely(!sufile))
+		return -ENOMEM;
+	if (!(sufile->i_state & I_NEW))
+		goto out;
+
+	err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
+	if (err)
+		goto failed;
+
+	nilfs_mdt_set_entry_size(sufile, susize,
+				 sizeof(struct nilfs_sufile_header));
+
+	err = nilfs_read_inode_common(sufile, raw_inode);
+	if (err)
+		goto failed;
+
+	err = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (err)
+		goto failed;
+
+	sui = NILFS_SUI(sufile);
+	kaddr = kmap_atomic(header_bh->b_page);
+	header = kaddr + bh_offset(header_bh);
+	sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+	kunmap_atomic(kaddr);
+	brelse(header_bh);
+
+	sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
+	sui->allocmin = 0;
+
+	unlock_new_inode(sufile);
+ out:
+	*inodep = sufile;
+	return 0;
+ failed:
+	iget_failed(sufile);
+	return err;
+}
diff --git a/kernel/fs/nilfs2/sufile.h b/kernel/fs/nilfs2/sufile.h
new file mode 100644
index 000000000..b8afd72f2
--- /dev/null
+++ b/kernel/fs/nilfs2/sufile.h
@@ -0,0 +1,146 @@
+/*
+ * sufile.h - NILFS segment usage file.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_SUFILE_H
+#define _NILFS_SUFILE_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "mdt.h"
+
+
+static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
+{
+	return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments;
+}
+
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
+
+int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end);
+int nilfs_sufile_alloc(struct inode *, __u64 *);
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
+				   unsigned long nblocks, time_t modtime);
+int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
+ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
+				size_t);
+ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t);
+
+int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
+			 void (*dofunc)(struct inode *, __u64,
+					struct buffer_head *,
+					struct buffer_head *));
+int nilfs_sufile_update(struct inode *, __u64, int,
+			void (*dofunc)(struct inode *, __u64,
+				       struct buffer_head *,
+				       struct buffer_head *));
+void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
+			   struct buffer_head *);
+void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
+			  struct buffer_head *);
+void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
+				 struct buffer_head *);
+void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
+			       struct buffer_head *);
+
+int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep);
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
+
+/**
+ * nilfs_sufile_scrap - make a segment garbage
+ * @sufile: inode of segment usage file
+ * @segnum: segment number to be freed
+ */
+static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum)
+{
+	return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap);
+}
+
+/**
+ * nilfs_sufile_free - free segment
+ * @sufile: inode of segment usage file
+ * @segnum: segment number to be freed
+ */
+static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
+{
+	return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
+}
+
+/**
+ * nilfs_sufile_freev - free segments
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @ndone: place to store the number of freed segments
+ */
+static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv,
+				     size_t nsegs, size_t *ndone)
+{
+	return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
+				    nilfs_sufile_do_free);
+}
+
+/**
+ * nilfs_sufile_cancel_freev - reallocate freeing segments
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @ndone: place to store the number of cancelled segments
+ *
+ * Return Value: On success, 0 is returned. On error, a negative error codes
+ * is returned.
+ */
+static inline int nilfs_sufile_cancel_freev(struct inode *sufile,
+					    __u64 *segnumv, size_t nsegs,
+					    size_t *ndone)
+{
+	return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
+				    nilfs_sufile_do_cancel_free);
+}
+
+/**
+ * nilfs_sufile_set_error - mark a segment as erroneous
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ *
+ * Description: nilfs_sufile_set_error() marks the segment specified by
+ * @segnum as erroneous. The error segment will never be used again.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid segment usage number.
+ */
+static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
+{
+	return nilfs_sufile_update(sufile, segnum, 0,
+				   nilfs_sufile_do_set_error);
+}
+
+#endif	/* _NILFS_SUFILE_H */
diff --git a/kernel/fs/nilfs2/super.c b/kernel/fs/nilfs2/super.c
new file mode 100644
index 000000000..f47585bfe
--- /dev/null
+++ b/kernel/fs/nilfs2/super.c
@@ -0,0 +1,1486 @@
+/*
+ * super.c - NILFS module and super block management.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+/*
+ *  linux/fs/ext2/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/crc32.h>
+#include <linux/vfs.h>
+#include <linux/writeback.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include "nilfs.h"
+#include "export.h"
+#include "mdt.h"
+#include "alloc.h"
+#include "btree.h"
+#include "btnode.h"
+#include "page.h"
+#include "cpfile.h"
+#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */
+#include "ifile.h"
+#include "dat.h"
+#include "segment.h"
+#include "segbuf.h"
+
+MODULE_AUTHOR("NTT Corp.");
+MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
+		   "(NILFS)");
+MODULE_LICENSE("GPL");
+
+static struct kmem_cache *nilfs_inode_cachep;
+struct kmem_cache *nilfs_transaction_cachep;
+struct kmem_cache *nilfs_segbuf_cachep;
+struct kmem_cache *nilfs_btree_path_cache;
+
+static int nilfs_setup_super(struct super_block *sb, int is_mount);
+static int nilfs_remount(struct super_block *sb, int *flags, char *data);
+
+static void nilfs_set_error(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_super_block **sbp;
+
+	down_write(&nilfs->ns_sem);
+	if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
+		nilfs->ns_mount_state |= NILFS_ERROR_FS;
+		sbp = nilfs_prepare_super(sb, 0);
+		if (likely(sbp)) {
+			sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+			if (sbp[1])
+				sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+			nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
+		}
+	}
+	up_write(&nilfs->ns_sem);
+}
+
+/**
+ * nilfs_error() - report failure condition on a filesystem
+ *
+ * nilfs_error() sets an ERROR_FS flag on the superblock as well as
+ * reporting an error message.  It should be called when NILFS detects
+ * incoherences or defects of meta data on disk.  As for sustainable
+ * errors such as a single-shot I/O error, nilfs_warning() or the printk()
+ * function should be used instead.
+ *
+ * The segment constructor must not call this function because it can
+ * kill itself.
+ */
+void nilfs_error(struct super_block *sb, const char *function,
+		 const char *fmt, ...)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
+	va_end(args);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		nilfs_set_error(sb);
+
+		if (nilfs_test_opt(nilfs, ERRORS_RO)) {
+			printk(KERN_CRIT "Remounting filesystem read-only\n");
+			sb->s_flags |= MS_RDONLY;
+		}
+	}
+
+	if (nilfs_test_opt(nilfs, ERRORS_PANIC))
+		panic("NILFS (device %s): panic forced after error\n",
+		      sb->s_id);
+}
+
+void nilfs_warning(struct super_block *sb, const char *function,
+		   const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
+	va_end(args);
+}
+
+
+struct inode *nilfs_alloc_inode(struct super_block *sb)
+{
+	struct nilfs_inode_info *ii;
+
+	ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
+	if (!ii)
+		return NULL;
+	ii->i_bh = NULL;
+	ii->i_state = 0;
+	ii->i_cno = 0;
+	ii->vfs_inode.i_version = 1;
+	nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
+	return &ii->vfs_inode;
+}
+
+static void nilfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+	if (mdi) {
+		kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+		kfree(mdi);
+	}
+	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
+}
+
+void nilfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, nilfs_i_callback);
+}
+
+static int nilfs_sync_super(struct super_block *sb, int flag)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	int err;
+
+ retry:
+	set_buffer_dirty(nilfs->ns_sbh[0]);
+	if (nilfs_test_opt(nilfs, BARRIER)) {
+		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
+					  WRITE_SYNC | WRITE_FLUSH_FUA);
+	} else {
+		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
+	}
+
+	if (unlikely(err)) {
+		printk(KERN_ERR
+		       "NILFS: unable to write superblock (err=%d)\n", err);
+		if (err == -EIO && nilfs->ns_sbh[1]) {
+			/*
+			 * sbp[0] points to newer log than sbp[1],
+			 * so copy sbp[0] to sbp[1] to take over sbp[0].
+			 */
+			memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0],
+			       nilfs->ns_sbsize);
+			nilfs_fall_back_super_block(nilfs);
+			goto retry;
+		}
+	} else {
+		struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
+
+		nilfs->ns_sbwcount++;
+
+		/*
+		 * The latest segment becomes trailable from the position
+		 * written in superblock.
+		 */
+		clear_nilfs_discontinued(nilfs);
+
+		/* update GC protection for recent segments */
+		if (nilfs->ns_sbh[1]) {
+			if (flag == NILFS_SB_COMMIT_ALL) {
+				set_buffer_dirty(nilfs->ns_sbh[1]);
+				if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0)
+					goto out;
+			}
+			if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) <
+			    le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno))
+				sbp = nilfs->ns_sbp[1];
+		}
+
+		spin_lock(&nilfs->ns_last_segment_lock);
+		nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
+		spin_unlock(&nilfs->ns_last_segment_lock);
+	}
+ out:
+	return err;
+}
+
+void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
+			  struct the_nilfs *nilfs)
+{
+	sector_t nfreeblocks;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	nilfs_count_free_blocks(nilfs, &nfreeblocks);
+	sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
+	sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
+	sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
+	spin_unlock(&nilfs->ns_last_segment_lock);
+}
+
+struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
+					       int flip)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		if (sbp[1] &&
+		    sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
+			memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+		} else {
+			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
+			       sb->s_id);
+			return NULL;
+		}
+	} else if (sbp[1] &&
+		   sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+			memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+	}
+
+	if (flip && sbp[1])
+		nilfs_swap_super_block(nilfs);
+
+	return sbp;
+}
+
+int nilfs_commit_super(struct super_block *sb, int flag)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	time_t t;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	t = get_seconds();
+	nilfs->ns_sbwtime = t;
+	sbp[0]->s_wtime = cpu_to_le64(t);
+	sbp[0]->s_sum = 0;
+	sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
+					     (unsigned char *)sbp[0],
+					     nilfs->ns_sbsize));
+	if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) {
+		sbp[1]->s_wtime = sbp[0]->s_wtime;
+		sbp[1]->s_sum = 0;
+		sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
+					    (unsigned char *)sbp[1],
+					    nilfs->ns_sbsize));
+	}
+	clear_nilfs_sb_dirty(nilfs);
+	nilfs->ns_flushed_device = 1;
+	/* make sure store to ns_flushed_device cannot be reordered */
+	smp_wmb();
+	return nilfs_sync_super(sb, flag);
+}
+
+/**
+ * nilfs_cleanup_super() - write filesystem state for cleanup
+ * @sb: super block instance to be unmounted or degraded to read-only
+ *
+ * This function restores state flags in the on-disk super block.
+ * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
+ * filesystem was not clean previously.
+ */
+int nilfs_cleanup_super(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_super_block **sbp;
+	int flag = NILFS_SB_COMMIT;
+	int ret = -EIO;
+
+	sbp = nilfs_prepare_super(sb, 0);
+	if (sbp) {
+		sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+		nilfs_set_log_cursor(sbp[0], nilfs);
+		if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
+			/*
+			 * make the "clean" flag also to the opposite
+			 * super block if both super blocks point to
+			 * the same checkpoint.
+			 */
+			sbp[1]->s_state = sbp[0]->s_state;
+			flag = NILFS_SB_COMMIT_ALL;
+		}
+		ret = nilfs_commit_super(sb, flag);
+	}
+	return ret;
+}
+
+/**
+ * nilfs_move_2nd_super - relocate secondary super block
+ * @sb: super block instance
+ * @sb2off: new offset of the secondary super block (in bytes)
+ */
+static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct buffer_head *nsbh;
+	struct nilfs_super_block *nsbp;
+	sector_t blocknr, newblocknr;
+	unsigned long offset;
+	int sb2i = -1;  /* array index of the secondary superblock */
+	int ret = 0;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	if (nilfs->ns_sbh[1] &&
+	    nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) {
+		sb2i = 1;
+		blocknr = nilfs->ns_sbh[1]->b_blocknr;
+	} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
+		sb2i = 0;
+		blocknr = nilfs->ns_sbh[0]->b_blocknr;
+	}
+	if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
+		goto out;  /* super block location is unchanged */
+
+	/* Get new super block buffer */
+	newblocknr = sb2off >> nilfs->ns_blocksize_bits;
+	offset = sb2off & (nilfs->ns_blocksize - 1);
+	nsbh = sb_getblk(sb, newblocknr);
+	if (!nsbh) {
+		printk(KERN_WARNING
+		       "NILFS warning: unable to move secondary superblock "
+		       "to block %llu\n", (unsigned long long)newblocknr);
+		ret = -EIO;
+		goto out;
+	}
+	nsbp = (void *)nsbh->b_data + offset;
+	memset(nsbp, 0, nilfs->ns_blocksize);
+
+	if (sb2i >= 0) {
+		memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize);
+		brelse(nilfs->ns_sbh[sb2i]);
+		nilfs->ns_sbh[sb2i] = nsbh;
+		nilfs->ns_sbp[sb2i] = nsbp;
+	} else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) {
+		/* secondary super block will be restored to index 1 */
+		nilfs->ns_sbh[1] = nsbh;
+		nilfs->ns_sbp[1] = nsbp;
+	} else {
+		brelse(nsbh);
+	}
+out:
+	return ret;
+}
+
+/**
+ * nilfs_resize_fs - resize the filesystem
+ * @sb: super block instance
+ * @newsize: new size of the filesystem (in bytes)
+ */
+int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_super_block **sbp;
+	__u64 devsize, newnsegs;
+	loff_t sb2off;
+	int ret;
+
+	ret = -ERANGE;
+	devsize = i_size_read(sb->s_bdev->bd_inode);
+	if (newsize > devsize)
+		goto out;
+
+	/*
+	 * Write lock is required to protect some functions depending
+	 * on the number of segments, the number of reserved segments,
+	 * and so forth.
+	 */
+	down_write(&nilfs->ns_segctor_sem);
+
+	sb2off = NILFS_SB2_OFFSET_BYTES(newsize);
+	newnsegs = sb2off >> nilfs->ns_blocksize_bits;
+	do_div(newnsegs, nilfs->ns_blocks_per_segment);
+
+	ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs);
+	up_write(&nilfs->ns_segctor_sem);
+	if (ret < 0)
+		goto out;
+
+	ret = nilfs_construct_segment(sb);
+	if (ret < 0)
+		goto out;
+
+	down_write(&nilfs->ns_sem);
+	nilfs_move_2nd_super(sb, sb2off);
+	ret = -EIO;
+	sbp = nilfs_prepare_super(sb, 0);
+	if (likely(sbp)) {
+		nilfs_set_log_cursor(sbp[0], nilfs);
+		/*
+		 * Drop NILFS_RESIZE_FS flag for compatibility with
+		 * mount-time resize which may be implemented in a
+		 * future release.
+		 */
+		sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) &
+					      ~NILFS_RESIZE_FS);
+		sbp[0]->s_dev_size = cpu_to_le64(newsize);
+		sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments);
+		if (sbp[1])
+			memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+		ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
+	}
+	up_write(&nilfs->ns_sem);
+
+	/*
+	 * Reset the range of allocatable segments last.  This order
+	 * is important in the case of expansion because the secondary
+	 * superblock must be protected from log write until migration
+	 * completes.
+	 */
+	if (!ret)
+		nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1);
+out:
+	return ret;
+}
+
+static void nilfs_put_super(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+
+	nilfs_detach_log_writer(sb);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		down_write(&nilfs->ns_sem);
+		nilfs_cleanup_super(sb);
+		up_write(&nilfs->ns_sem);
+	}
+
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_dat);
+
+	destroy_nilfs(nilfs);
+	sb->s_fs_info = NULL;
+}
+
+static int nilfs_sync_fs(struct super_block *sb, int wait)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_super_block **sbp;
+	int err = 0;
+
+	/* This function is called when super block should be written back */
+	if (wait)
+		err = nilfs_construct_segment(sb);
+
+	down_write(&nilfs->ns_sem);
+	if (nilfs_sb_dirty(nilfs)) {
+		sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs));
+		if (likely(sbp)) {
+			nilfs_set_log_cursor(sbp[0], nilfs);
+			nilfs_commit_super(sb, NILFS_SB_COMMIT);
+		}
+	}
+	up_write(&nilfs->ns_sem);
+
+	if (!err)
+		err = nilfs_flush_device(nilfs);
+
+	return err;
+}
+
+int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
+			    struct nilfs_root **rootp)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_root *root;
+	struct nilfs_checkpoint *raw_cp;
+	struct buffer_head *bh_cp;
+	int err = -ENOMEM;
+
+	root = nilfs_find_or_create_root(
+		nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
+	if (!root)
+		return err;
+
+	if (root->ifile)
+		goto reuse; /* already attached checkpoint */
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
+					  &bh_cp);
+	up_read(&nilfs->ns_segctor_sem);
+	if (unlikely(err)) {
+		if (err == -ENOENT || err == -EINVAL) {
+			printk(KERN_ERR
+			       "NILFS: Invalid checkpoint "
+			       "(checkpoint number=%llu)\n",
+			       (unsigned long long)cno);
+			err = -EINVAL;
+		}
+		goto failed;
+	}
+
+	err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
+			       &raw_cp->cp_ifile_inode, &root->ifile);
+	if (err)
+		goto failed_bh;
+
+	atomic64_set(&root->inodes_count,
+			le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic64_set(&root->blocks_count,
+			le64_to_cpu(raw_cp->cp_blocks_count));
+
+	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
+
+ reuse:
+	*rootp = root;
+	return 0;
+
+ failed_bh:
+	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
+ failed:
+	nilfs_put_root(root);
+
+	return err;
+}
+
+static int nilfs_freeze(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	int err;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	/* Mark super block clean */
+	down_write(&nilfs->ns_sem);
+	err = nilfs_cleanup_super(sb);
+	up_write(&nilfs->ns_sem);
+	return err;
+}
+
+static int nilfs_unfreeze(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	down_write(&nilfs->ns_sem);
+	nilfs_setup_super(sb, false);
+	up_write(&nilfs->ns_sem);
+	return 0;
+}
+
+static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root;
+	struct the_nilfs *nilfs = root->nilfs;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	unsigned long long blocks;
+	unsigned long overhead;
+	unsigned long nrsvblocks;
+	sector_t nfreeblocks;
+	u64 nmaxinodes, nfreeinodes;
+	int err;
+
+	/*
+	 * Compute all of the segment blocks
+	 *
+	 * The blocks before first segment and after last segment
+	 * are excluded.
+	 */
+	blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments
+		- nilfs->ns_first_data_block;
+	nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment;
+
+	/*
+	 * Compute the overhead
+	 *
+	 * When distributing meta data blocks outside segment structure,
+	 * We must count them as the overhead.
+	 */
+	overhead = 0;
+
+	err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
+	if (unlikely(err))
+		return err;
+
+	err = nilfs_ifile_count_free_inodes(root->ifile,
+					    &nmaxinodes, &nfreeinodes);
+	if (unlikely(err)) {
+		printk(KERN_WARNING
+			"NILFS warning: fail to count free inodes: err %d.\n",
+			err);
+		if (err == -ERANGE) {
+			/*
+			 * If nilfs_palloc_count_max_entries() returns
+			 * -ERANGE error code then we simply treat
+			 * curent inodes count as maximum possible and
+			 * zero as free inodes value.
+			 */
+			nmaxinodes = atomic64_read(&root->inodes_count);
+			nfreeinodes = 0;
+			err = 0;
+		} else
+			return err;
+	}
+
+	buf->f_type = NILFS_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = blocks - overhead;
+	buf->f_bfree = nfreeblocks;
+	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
+		(buf->f_bfree - nrsvblocks) : 0;
+	buf->f_files = nmaxinodes;
+	buf->f_ffree = nfreeinodes;
+	buf->f_namelen = NILFS_NAME_LEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root;
+
+	if (!nilfs_test_opt(nilfs, BARRIER))
+		seq_puts(seq, ",nobarrier");
+	if (root->cno != NILFS_CPTREE_CURRENT_CNO)
+		seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
+	if (nilfs_test_opt(nilfs, ERRORS_PANIC))
+		seq_puts(seq, ",errors=panic");
+	if (nilfs_test_opt(nilfs, ERRORS_CONT))
+		seq_puts(seq, ",errors=continue");
+	if (nilfs_test_opt(nilfs, STRICT_ORDER))
+		seq_puts(seq, ",order=strict");
+	if (nilfs_test_opt(nilfs, NORECOVERY))
+		seq_puts(seq, ",norecovery");
+	if (nilfs_test_opt(nilfs, DISCARD))
+		seq_puts(seq, ",discard");
+
+	return 0;
+}
+
+static const struct super_operations nilfs_sops = {
+	.alloc_inode    = nilfs_alloc_inode,
+	.destroy_inode  = nilfs_destroy_inode,
+	.dirty_inode    = nilfs_dirty_inode,
+	.evict_inode    = nilfs_evict_inode,
+	.put_super      = nilfs_put_super,
+	.sync_fs        = nilfs_sync_fs,
+	.freeze_fs	= nilfs_freeze,
+	.unfreeze_fs	= nilfs_unfreeze,
+	.statfs         = nilfs_statfs,
+	.remount_fs     = nilfs_remount,
+	.show_options = nilfs_show_options
+};
+
+enum {
+	Opt_err_cont, Opt_err_panic, Opt_err_ro,
+	Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
+	Opt_discard, Opt_nodiscard, Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
+	{Opt_snapshot, "cp=%u"},
+	{Opt_order, "order=%s"},
+	{Opt_norecovery, "norecovery"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, struct super_block *sb, int is_remount)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_barrier:
+			nilfs_set_opt(nilfs, BARRIER);
+			break;
+		case Opt_nobarrier:
+			nilfs_clear_opt(nilfs, BARRIER);
+			break;
+		case Opt_order:
+			if (strcmp(args[0].from, "relaxed") == 0)
+				/* Ordered data semantics */
+				nilfs_clear_opt(nilfs, STRICT_ORDER);
+			else if (strcmp(args[0].from, "strict") == 0)
+				/* Strict in-order semantics */
+				nilfs_set_opt(nilfs, STRICT_ORDER);
+			else
+				return 0;
+			break;
+		case Opt_err_panic:
+			nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC);
+			break;
+		case Opt_err_ro:
+			nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO);
+			break;
+		case Opt_err_cont:
+			nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT);
+			break;
+		case Opt_snapshot:
+			if (is_remount) {
+				printk(KERN_ERR
+				       "NILFS: \"%s\" option is invalid "
+				       "for remount.\n", p);
+				return 0;
+			}
+			break;
+		case Opt_norecovery:
+			nilfs_set_opt(nilfs, NORECOVERY);
+			break;
+		case Opt_discard:
+			nilfs_set_opt(nilfs, DISCARD);
+			break;
+		case Opt_nodiscard:
+			nilfs_clear_opt(nilfs, DISCARD);
+			break;
+		default:
+			printk(KERN_ERR
+			       "NILFS: Unrecognized mount option \"%s\"\n", p);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static inline void
+nilfs_set_default_options(struct super_block *sb,
+			  struct nilfs_super_block *sbp)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+
+	nilfs->ns_mount_opt =
+		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
+}
+
+static int nilfs_setup_super(struct super_block *sb, int is_mount)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_super_block **sbp;
+	int max_mnt_count;
+	int mnt_count;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	sbp = nilfs_prepare_super(sb, 0);
+	if (!sbp)
+		return -EIO;
+
+	if (!is_mount)
+		goto skip_mount_setup;
+
+	max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
+	mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
+
+	if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
+		printk(KERN_WARNING
+		       "NILFS warning: mounting fs with errors\n");
+#if 0
+	} else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
+		printk(KERN_WARNING
+		       "NILFS warning: maximal mount count reached\n");
+#endif
+	}
+	if (!max_mnt_count)
+		sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
+
+	sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
+	sbp[0]->s_mtime = cpu_to_le64(get_seconds());
+
+skip_mount_setup:
+	sbp[0]->s_state =
+		cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
+	/* synchronize sbp[1] with sbp[0] */
+	if (sbp[1])
+		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+	return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
+}
+
+struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
+						 u64 pos, int blocksize,
+						 struct buffer_head **pbh)
+{
+	unsigned long long sb_index = pos;
+	unsigned long offset;
+
+	offset = do_div(sb_index, blocksize);
+	*pbh = sb_bread(sb, sb_index);
+	if (!*pbh)
+		return NULL;
+	return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
+}
+
+int nilfs_store_magic_and_option(struct super_block *sb,
+				 struct nilfs_super_block *sbp,
+				 char *data)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+
+	sb->s_magic = le16_to_cpu(sbp->s_magic);
+
+	/* FS independent flags */
+#ifdef NILFS_ATIME_DISABLE
+	sb->s_flags |= MS_NOATIME;
+#endif
+
+	nilfs_set_default_options(sb, sbp);
+
+	nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid);
+	nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid);
+	nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
+	nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
+
+	return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
+}
+
+int nilfs_check_feature_compatibility(struct super_block *sb,
+				      struct nilfs_super_block *sbp)
+{
+	__u64 features;
+
+	features = le64_to_cpu(sbp->s_feature_incompat) &
+		~NILFS_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "optional features (%llx)\n",
+		       (unsigned long long)features);
+		return -EINVAL;
+	}
+	features = le64_to_cpu(sbp->s_feature_compat_ro) &
+		~NILFS_FEATURE_COMPAT_RO_SUPP;
+	if (!(sb->s_flags & MS_RDONLY) && features) {
+		printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
+		       "unsupported optional features (%llx)\n",
+		       (unsigned long long)features);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int nilfs_get_root_dentry(struct super_block *sb,
+				 struct nilfs_root *root,
+				 struct dentry **root_dentry)
+{
+	struct inode *inode;
+	struct dentry *dentry;
+	int ret = 0;
+
+	inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
+	if (IS_ERR(inode)) {
+		printk(KERN_ERR "NILFS: get root inode failed\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+	if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
+		iput(inode);
+		printk(KERN_ERR "NILFS: corrupt root inode.\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
+		dentry = d_find_alias(inode);
+		if (!dentry) {
+			dentry = d_make_root(inode);
+			if (!dentry) {
+				ret = -ENOMEM;
+				goto failed_dentry;
+			}
+		} else {
+			iput(inode);
+		}
+	} else {
+		dentry = d_obtain_root(inode);
+		if (IS_ERR(dentry)) {
+			ret = PTR_ERR(dentry);
+			goto failed_dentry;
+		}
+	}
+	*root_dentry = dentry;
+ out:
+	return ret;
+
+ failed_dentry:
+	printk(KERN_ERR "NILFS: get root dentry failed\n");
+	goto out;
+}
+
+static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
+				 struct dentry **root_dentry)
+{
+	struct the_nilfs *nilfs = s->s_fs_info;
+	struct nilfs_root *root;
+	int ret;
+
+	mutex_lock(&nilfs->ns_snapshot_mount_mutex);
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
+	up_read(&nilfs->ns_segctor_sem);
+	if (ret < 0) {
+		ret = (ret == -ENOENT) ? -EINVAL : ret;
+		goto out;
+	} else if (!ret) {
+		printk(KERN_ERR "NILFS: The specified checkpoint is "
+		       "not a snapshot (checkpoint number=%llu).\n",
+		       (unsigned long long)cno);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = nilfs_attach_checkpoint(s, cno, false, &root);
+	if (ret) {
+		printk(KERN_ERR "NILFS: error loading snapshot "
+		       "(checkpoint number=%llu).\n",
+	       (unsigned long long)cno);
+		goto out;
+	}
+	ret = nilfs_get_root_dentry(s, root, root_dentry);
+	nilfs_put_root(root);
+ out:
+	mutex_unlock(&nilfs->ns_snapshot_mount_mutex);
+	return ret;
+}
+
+/**
+ * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint
+ * @root_dentry: root dentry of the tree to be shrunk
+ *
+ * This function returns true if the tree was in-use.
+ */
+static bool nilfs_tree_is_busy(struct dentry *root_dentry)
+{
+	shrink_dcache_parent(root_dentry);
+	return d_count(root_dentry) > 1;
+}
+
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_root *root;
+	struct inode *inode;
+	struct dentry *dentry;
+	int ret;
+
+	if (cno > nilfs->ns_cno)
+		return false;
+
+	if (cno >= nilfs_last_cno(nilfs))
+		return true;	/* protect recent checkpoints */
+
+	ret = false;
+	root = nilfs_lookup_root(nilfs, cno);
+	if (root) {
+		inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
+		if (inode) {
+			dentry = d_find_alias(inode);
+			if (dentry) {
+				ret = nilfs_tree_is_busy(dentry);
+				dput(dentry);
+			}
+			iput(inode);
+		}
+		nilfs_put_root(root);
+	}
+	return ret;
+}
+
+/**
+ * nilfs_fill_super() - initialize a super block instance
+ * @sb: super_block
+ * @data: mount options
+ * @silent: silent mode flag
+ *
+ * This function is called exclusively by nilfs->ns_mount_mutex.
+ * So, the recovery process is protected from other simultaneous mounts.
+ */
+static int
+nilfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct the_nilfs *nilfs;
+	struct nilfs_root *fsroot;
+	__u64 cno;
+	int err;
+
+	nilfs = alloc_nilfs(sb->s_bdev);
+	if (!nilfs)
+		return -ENOMEM;
+
+	sb->s_fs_info = nilfs;
+
+	err = init_nilfs(nilfs, sb, (char *)data);
+	if (err)
+		goto failed_nilfs;
+
+	sb->s_op = &nilfs_sops;
+	sb->s_export_op = &nilfs_export_ops;
+	sb->s_root = NULL;
+	sb->s_time_gran = 1;
+	sb->s_max_links = NILFS_LINK_MAX;
+
+	sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+
+	err = load_nilfs(nilfs, sb);
+	if (err)
+		goto failed_nilfs;
+
+	cno = nilfs_last_cno(nilfs);
+	err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
+	if (err) {
+		printk(KERN_ERR "NILFS: error loading last checkpoint "
+		       "(checkpoint number=%llu).\n", (unsigned long long)cno);
+		goto failed_unload;
+	}
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		err = nilfs_attach_log_writer(sb, fsroot);
+		if (err)
+			goto failed_checkpoint;
+	}
+
+	err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
+	if (err)
+		goto failed_segctor;
+
+	nilfs_put_root(fsroot);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		down_write(&nilfs->ns_sem);
+		nilfs_setup_super(sb, true);
+		up_write(&nilfs->ns_sem);
+	}
+
+	return 0;
+
+ failed_segctor:
+	nilfs_detach_log_writer(sb);
+
+ failed_checkpoint:
+	nilfs_put_root(fsroot);
+
+ failed_unload:
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_dat);
+
+ failed_nilfs:
+	destroy_nilfs(nilfs);
+	return err;
+}
+
+static int nilfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	unsigned long old_sb_flags;
+	unsigned long old_mount_opt;
+	int err;
+
+	sync_filesystem(sb);
+	old_sb_flags = sb->s_flags;
+	old_mount_opt = nilfs->ns_mount_opt;
+
+	if (!parse_options(data, sb, 1)) {
+		err = -EINVAL;
+		goto restore_opts;
+	}
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
+
+	err = -EINVAL;
+
+	if (!nilfs_valid_fs(nilfs)) {
+		printk(KERN_WARNING "NILFS (device %s): couldn't "
+		       "remount because the filesystem is in an "
+		       "incomplete recovery state.\n", sb->s_id);
+		goto restore_opts;
+	}
+
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out;
+	if (*flags & MS_RDONLY) {
+		/* Shutting down log writer */
+		nilfs_detach_log_writer(sb);
+		sb->s_flags |= MS_RDONLY;
+
+		/*
+		 * Remounting a valid RW partition RDONLY, so set
+		 * the RDONLY flag and then mark the partition as valid again.
+		 */
+		down_write(&nilfs->ns_sem);
+		nilfs_cleanup_super(sb);
+		up_write(&nilfs->ns_sem);
+	} else {
+		__u64 features;
+		struct nilfs_root *root;
+
+		/*
+		 * Mounting a RDONLY partition read-write, so reread and
+		 * store the current valid flag.  (It may have been changed
+		 * by fsck since we originally mounted the partition.)
+		 */
+		down_read(&nilfs->ns_sem);
+		features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+			~NILFS_FEATURE_COMPAT_RO_SUPP;
+		up_read(&nilfs->ns_sem);
+		if (features) {
+			printk(KERN_WARNING "NILFS (device %s): couldn't "
+			       "remount RDWR because of unsupported optional "
+			       "features (%llx)\n",
+			       sb->s_id, (unsigned long long)features);
+			err = -EROFS;
+			goto restore_opts;
+		}
+
+		sb->s_flags &= ~MS_RDONLY;
+
+		root = NILFS_I(d_inode(sb->s_root))->i_root;
+		err = nilfs_attach_log_writer(sb, root);
+		if (err)
+			goto restore_opts;
+
+		down_write(&nilfs->ns_sem);
+		nilfs_setup_super(sb, true);
+		up_write(&nilfs->ns_sem);
+	}
+ out:
+	return 0;
+
+ restore_opts:
+	sb->s_flags = old_sb_flags;
+	nilfs->ns_mount_opt = old_mount_opt;
+	return err;
+}
+
+struct nilfs_super_data {
+	struct block_device *bdev;
+	__u64 cno;
+	int flags;
+};
+
+/**
+ * nilfs_identify - pre-read mount options needed to identify mount instance
+ * @data: mount options
+ * @sd: nilfs_super_data
+ */
+static int nilfs_identify(char *data, struct nilfs_super_data *sd)
+{
+	char *p, *options = data;
+	substring_t args[MAX_OPT_ARGS];
+	int token;
+	int ret = 0;
+
+	do {
+		p = strsep(&options, ",");
+		if (p != NULL && *p) {
+			token = match_token(p, tokens, args);
+			if (token == Opt_snapshot) {
+				if (!(sd->flags & MS_RDONLY)) {
+					ret++;
+				} else {
+					sd->cno = simple_strtoull(args[0].from,
+								  NULL, 0);
+					/*
+					 * No need to see the end pointer;
+					 * match_token() has done syntax
+					 * checking.
+					 */
+					if (sd->cno == 0)
+						ret++;
+				}
+			}
+			if (ret)
+				printk(KERN_ERR
+				       "NILFS: invalid mount option: %s\n", p);
+		}
+		if (!options)
+			break;
+		BUG_ON(options == data);
+		*(options - 1) = ',';
+	} while (!ret);
+	return ret;
+}
+
+static int nilfs_set_bdev_super(struct super_block *s, void *data)
+{
+	s->s_bdev = data;
+	s->s_dev = s->s_bdev->bd_dev;
+	return 0;
+}
+
+static int nilfs_test_bdev_super(struct super_block *s, void *data)
+{
+	return (void *)s->s_bdev == data;
+}
+
+static struct dentry *
+nilfs_mount(struct file_system_type *fs_type, int flags,
+	     const char *dev_name, void *data)
+{
+	struct nilfs_super_data sd;
+	struct super_block *s;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
+	struct dentry *root_dentry;
+	int err, s_new = false;
+
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
+	if (IS_ERR(sd.bdev))
+		return ERR_CAST(sd.bdev);
+
+	sd.cno = 0;
+	sd.flags = flags;
+	if (nilfs_identify((char *)data, &sd)) {
+		err = -EINVAL;
+		goto failed;
+	}
+
+	/*
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
+	 */
+	mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
+	if (sd.bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
+		err = -EBUSY;
+		goto failed;
+	}
+	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags,
+		 sd.bdev);
+	mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
+	if (IS_ERR(s)) {
+		err = PTR_ERR(s);
+		goto failed;
+	}
+
+	if (!s->s_root) {
+		char b[BDEVNAME_SIZE];
+
+		s_new = true;
+
+		/* New superblock instance created */
+		s->s_mode = mode;
+		strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
+		sb_set_blocksize(s, block_size(sd.bdev));
+
+		err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (err)
+			goto failed_super;
+
+		s->s_flags |= MS_ACTIVE;
+	} else if (!sd.cno) {
+		if (nilfs_tree_is_busy(s->s_root)) {
+			if ((flags ^ s->s_flags) & MS_RDONLY) {
+				printk(KERN_ERR "NILFS: the device already "
+				       "has a %s mount.\n",
+				       (s->s_flags & MS_RDONLY) ?
+				       "read-only" : "read/write");
+				err = -EBUSY;
+				goto failed_super;
+			}
+		} else {
+			/*
+			 * Try remount to setup mount states if the current
+			 * tree is not mounted and only snapshots use this sb.
+			 */
+			err = nilfs_remount(s, &flags, data);
+			if (err)
+				goto failed_super;
+		}
+	}
+
+	if (sd.cno) {
+		err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
+		if (err)
+			goto failed_super;
+	} else {
+		root_dentry = dget(s->s_root);
+	}
+
+	if (!s_new)
+		blkdev_put(sd.bdev, mode);
+
+	return root_dentry;
+
+ failed_super:
+	deactivate_locked_super(s);
+
+ failed:
+	if (!s_new)
+		blkdev_put(sd.bdev, mode);
+	return ERR_PTR(err);
+}
+
+struct file_system_type nilfs_fs_type = {
+	.owner    = THIS_MODULE,
+	.name     = "nilfs2",
+	.mount    = nilfs_mount,
+	.kill_sb  = kill_block_super,
+	.fs_flags = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("nilfs2");
+
+static void nilfs_inode_init_once(void *obj)
+{
+	struct nilfs_inode_info *ii = obj;
+
+	INIT_LIST_HEAD(&ii->i_dirty);
+#ifdef CONFIG_NILFS_XATTR
+	init_rwsem(&ii->xattr_sem);
+#endif
+	address_space_init_once(&ii->i_btnode_cache);
+	ii->i_bmap = &ii->i_bmap_data;
+	inode_init_once(&ii->vfs_inode);
+}
+
+static void nilfs_segbuf_init_once(void *obj)
+{
+	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
+}
+
+static void nilfs_destroy_cachep(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+
+	if (nilfs_inode_cachep)
+		kmem_cache_destroy(nilfs_inode_cachep);
+	if (nilfs_transaction_cachep)
+		kmem_cache_destroy(nilfs_transaction_cachep);
+	if (nilfs_segbuf_cachep)
+		kmem_cache_destroy(nilfs_segbuf_cachep);
+	if (nilfs_btree_path_cache)
+		kmem_cache_destroy(nilfs_btree_path_cache);
+}
+
+static int __init nilfs_init_cachep(void)
+{
+	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
+			sizeof(struct nilfs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+	if (!nilfs_inode_cachep)
+		goto fail;
+
+	nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
+			sizeof(struct nilfs_transaction_info), 0,
+			SLAB_RECLAIM_ACCOUNT, NULL);
+	if (!nilfs_transaction_cachep)
+		goto fail;
+
+	nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
+			sizeof(struct nilfs_segment_buffer), 0,
+			SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
+	if (!nilfs_segbuf_cachep)
+		goto fail;
+
+	nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
+			sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
+			0, 0, NULL);
+	if (!nilfs_btree_path_cache)
+		goto fail;
+
+	return 0;
+
+fail:
+	nilfs_destroy_cachep();
+	return -ENOMEM;
+}
+
+static int __init init_nilfs_fs(void)
+{
+	int err;
+
+	err = nilfs_init_cachep();
+	if (err)
+		goto fail;
+
+	err = nilfs_sysfs_init();
+	if (err)
+		goto free_cachep;
+
+	err = register_filesystem(&nilfs_fs_type);
+	if (err)
+		goto deinit_sysfs_entry;
+
+	printk(KERN_INFO "NILFS version 2 loaded\n");
+	return 0;
+
+deinit_sysfs_entry:
+	nilfs_sysfs_exit();
+free_cachep:
+	nilfs_destroy_cachep();
+fail:
+	return err;
+}
+
+static void __exit exit_nilfs_fs(void)
+{
+	nilfs_destroy_cachep();
+	nilfs_sysfs_exit();
+	unregister_filesystem(&nilfs_fs_type);
+}
+
+module_init(init_nilfs_fs)
+module_exit(exit_nilfs_fs)
diff --git a/kernel/fs/nilfs2/sysfs.c b/kernel/fs/nilfs2/sysfs.c
new file mode 100644
index 000000000..bbb0dcc35
--- /dev/null
+++ b/kernel/fs/nilfs2/sysfs.c
@@ -0,0 +1,1137 @@
+/*
+ * sysfs.c - sysfs support implementation.
+ *
+ * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
+ * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
+ */
+
+#include <linux/kobject.h>
+
+#include "nilfs.h"
+#include "mdt.h"
+#include "sufile.h"
+#include "cpfile.h"
+#include "sysfs.h"
+
+/* /sys/fs/<nilfs>/ */
+static struct kset *nilfs_kset;
+
+#define NILFS_SHOW_TIME(time_t_val, buf) ({ \
+		struct tm res; \
+		int count = 0; \
+		time_to_tm(time_t_val, 0, &res); \
+		res.tm_year += 1900; \
+		res.tm_mon += 1; \
+		count = scnprintf(buf, PAGE_SIZE, \
+				    "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \
+				    res.tm_year, res.tm_mon, res.tm_mday, \
+				    res.tm_hour, res.tm_min, res.tm_sec);\
+		count; \
+})
+
+#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \
+static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \
+					struct attribute *attr, char *buf) \
+{ \
+	struct the_nilfs *nilfs = container_of(kobj->parent, \
+						struct the_nilfs, \
+						ns_##parent_name##_kobj); \
+	struct nilfs_##name##_attr *a = container_of(attr, \
+						struct nilfs_##name##_attr, \
+						attr); \
+	return a->show ? a->show(a, nilfs, buf) : 0; \
+} \
+static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \
+					 struct attribute *attr, \
+					 const char *buf, size_t len) \
+{ \
+	struct the_nilfs *nilfs = container_of(kobj->parent, \
+						struct the_nilfs, \
+						ns_##parent_name##_kobj); \
+	struct nilfs_##name##_attr *a = container_of(attr, \
+						struct nilfs_##name##_attr, \
+						attr); \
+	return a->store ? a->store(a, nilfs, buf, len) : 0; \
+} \
+static const struct sysfs_ops nilfs_##name##_attr_ops = { \
+	.show	= nilfs_##name##_attr_show, \
+	.store	= nilfs_##name##_attr_store, \
+};
+
+#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
+static void nilfs_##name##_attr_release(struct kobject *kobj) \
+{ \
+	struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
+	struct the_nilfs *nilfs = container_of(kobj->parent, \
+						struct the_nilfs, \
+						ns_##parent_name##_kobj); \
+	subgroups = nilfs->ns_##parent_name##_subgroups; \
+	complete(&subgroups->sg_##name##_kobj_unregister); \
+} \
+static struct kobj_type nilfs_##name##_ktype = { \
+	.default_attrs	= nilfs_##name##_attrs, \
+	.sysfs_ops	= &nilfs_##name##_attr_ops, \
+	.release	= nilfs_##name##_attr_release, \
+};
+
+#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \
+static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
+{ \
+	struct kobject *parent; \
+	struct kobject *kobj; \
+	struct completion *kobj_unregister; \
+	struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
+	int err; \
+	subgroups = nilfs->ns_##parent_name##_subgroups; \
+	kobj = &subgroups->sg_##name##_kobj; \
+	kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \
+	parent = &nilfs->ns_##parent_name##_kobj; \
+	kobj->kset = nilfs_kset; \
+	init_completion(kobj_unregister); \
+	err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \
+				    #name); \
+	if (err) \
+		return err; \
+	return 0; \
+} \
+static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \
+{ \
+	kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
+}
+
+/************************************************************************
+ *                        NILFS snapshot attrs                          *
+ ************************************************************************/
+
+static ssize_t
+nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr,
+				 struct nilfs_root *root, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(unsigned long long)atomic64_read(&root->inodes_count));
+}
+
+static ssize_t
+nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr,
+				 struct nilfs_root *root, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(unsigned long long)atomic64_read(&root->blocks_count));
+}
+
+static const char snapshot_readme_str[] =
+	"The group contains details about mounted snapshot.\n\n"
+	"(1) inodes_count\n\tshow number of inodes for snapshot.\n\n"
+	"(2) blocks_count\n\tshow number of blocks for snapshot.\n\n";
+
+static ssize_t
+nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr,
+			    struct nilfs_root *root, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, snapshot_readme_str);
+}
+
+NILFS_SNAPSHOT_RO_ATTR(inodes_count);
+NILFS_SNAPSHOT_RO_ATTR(blocks_count);
+NILFS_SNAPSHOT_RO_ATTR(README);
+
+static struct attribute *nilfs_snapshot_attrs[] = {
+	NILFS_SNAPSHOT_ATTR_LIST(inodes_count),
+	NILFS_SNAPSHOT_ATTR_LIST(blocks_count),
+	NILFS_SNAPSHOT_ATTR_LIST(README),
+	NULL,
+};
+
+static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj,
+					struct attribute *attr, char *buf)
+{
+	struct nilfs_root *root =
+			container_of(kobj, struct nilfs_root, snapshot_kobj);
+	struct nilfs_snapshot_attr *a =
+			container_of(attr, struct nilfs_snapshot_attr, attr);
+
+	return a->show ? a->show(a, root, buf) : 0;
+}
+
+static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj,
+					 struct attribute *attr,
+					 const char *buf, size_t len)
+{
+	struct nilfs_root *root =
+			container_of(kobj, struct nilfs_root, snapshot_kobj);
+	struct nilfs_snapshot_attr *a =
+			container_of(attr, struct nilfs_snapshot_attr, attr);
+
+	return a->store ? a->store(a, root, buf, len) : 0;
+}
+
+static void nilfs_snapshot_attr_release(struct kobject *kobj)
+{
+	struct nilfs_root *root = container_of(kobj, struct nilfs_root,
+						snapshot_kobj);
+	complete(&root->snapshot_kobj_unregister);
+}
+
+static const struct sysfs_ops nilfs_snapshot_attr_ops = {
+	.show	= nilfs_snapshot_attr_show,
+	.store	= nilfs_snapshot_attr_store,
+};
+
+static struct kobj_type nilfs_snapshot_ktype = {
+	.default_attrs	= nilfs_snapshot_attrs,
+	.sysfs_ops	= &nilfs_snapshot_attr_ops,
+	.release	= nilfs_snapshot_attr_release,
+};
+
+int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root)
+{
+	struct the_nilfs *nilfs;
+	struct kobject *parent;
+	int err;
+
+	nilfs = root->nilfs;
+	parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj;
+	root->snapshot_kobj.kset = nilfs_kset;
+	init_completion(&root->snapshot_kobj_unregister);
+
+	if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
+		err = kobject_init_and_add(&root->snapshot_kobj,
+					    &nilfs_snapshot_ktype,
+					    &nilfs->ns_dev_kobj,
+					    "current_checkpoint");
+	} else {
+		err = kobject_init_and_add(&root->snapshot_kobj,
+					    &nilfs_snapshot_ktype,
+					    parent,
+					    "%llu", root->cno);
+	}
+
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root)
+{
+	kobject_del(&root->snapshot_kobj);
+}
+
+/************************************************************************
+ *                    NILFS mounted snapshots attrs                     *
+ ************************************************************************/
+
+static const char mounted_snapshots_readme_str[] =
+	"The mounted_snapshots group contains group for\n"
+	"every mounted snapshot.\n";
+
+static ssize_t
+nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr,
+				    struct the_nilfs *nilfs, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str);
+}
+
+NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README);
+
+static struct attribute *nilfs_mounted_snapshots_attrs[] = {
+	NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README),
+	NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev);
+NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev);
+NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev);
+
+/************************************************************************
+ *                      NILFS checkpoints attrs                         *
+ ************************************************************************/
+
+static ssize_t
+nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
+					    struct the_nilfs *nilfs,
+					    char *buf)
+{
+	__u64 ncheckpoints;
+	struct nilfs_cpstat cpstat;
+	int err;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
+	up_read(&nilfs->ns_segctor_sem);
+	if (err < 0) {
+		printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
+			err);
+		return err;
+	}
+
+	ncheckpoints = cpstat.cs_ncps;
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints);
+}
+
+static ssize_t
+nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	__u64 nsnapshots;
+	struct nilfs_cpstat cpstat;
+	int err;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
+	up_read(&nilfs->ns_segctor_sem);
+	if (err < 0) {
+		printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
+			err);
+		return err;
+	}
+
+	nsnapshots = cpstat.cs_nsss;
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots);
+}
+
+static ssize_t
+nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr,
+					    struct the_nilfs *nilfs,
+					    char *buf)
+{
+	__u64 last_cno;
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	last_cno = nilfs->ns_last_cno;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+}
+
+static ssize_t
+nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	__u64 cno;
+
+	down_read(&nilfs->ns_sem);
+	cno = nilfs->ns_cno;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+}
+
+static const char checkpoints_readme_str[] =
+	"The checkpoints group contains attributes that describe\n"
+	"details about volume's checkpoints.\n\n"
+	"(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n"
+	"(2) snapshots_number\n\tshow number of snapshots on volume.\n\n"
+	"(3) last_seg_checkpoint\n"
+	"\tshow checkpoint number of the latest segment.\n\n"
+	"(4) next_checkpoint\n\tshow next checkpoint number.\n\n";
+
+static ssize_t
+nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr,
+				struct the_nilfs *nilfs, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, checkpoints_readme_str);
+}
+
+NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number);
+NILFS_CHECKPOINTS_RO_ATTR(snapshots_number);
+NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint);
+NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint);
+NILFS_CHECKPOINTS_RO_ATTR(README);
+
+static struct attribute *nilfs_checkpoints_attrs[] = {
+	NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number),
+	NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number),
+	NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint),
+	NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint),
+	NILFS_CHECKPOINTS_ATTR_LIST(README),
+	NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(checkpoints, dev);
+NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev);
+NILFS_DEV_INT_GROUP_FNS(checkpoints, dev);
+
+/************************************************************************
+ *                        NILFS segments attrs                          *
+ ************************************************************************/
+
+static ssize_t
+nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr,
+				     struct the_nilfs *nilfs,
+				     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments);
+}
+
+static ssize_t
+nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment);
+}
+
+static ssize_t
+nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr,
+				    struct the_nilfs *nilfs,
+				    char *buf)
+{
+	unsigned long ncleansegs;
+
+	down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+	ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
+	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs);
+}
+
+static ssize_t
+nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
+				    struct the_nilfs *nilfs,
+				    char *buf)
+{
+	struct nilfs_sustat sustat;
+	int err;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
+	up_read(&nilfs->ns_segctor_sem);
+	if (err < 0) {
+		printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n",
+			err);
+		return err;
+	}
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs);
+}
+
+static const char segments_readme_str[] =
+	"The segments group contains attributes that describe\n"
+	"details about volume's segments.\n\n"
+	"(1) segments_number\n\tshow number of segments on volume.\n\n"
+	"(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n"
+	"(3) clean_segments\n\tshow count of clean segments.\n\n"
+	"(4) dirty_segments\n\tshow count of dirty segments.\n\n";
+
+static ssize_t
+nilfs_segments_README_show(struct nilfs_segments_attr *attr,
+			    struct the_nilfs *nilfs,
+			    char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, segments_readme_str);
+}
+
+NILFS_SEGMENTS_RO_ATTR(segments_number);
+NILFS_SEGMENTS_RO_ATTR(blocks_per_segment);
+NILFS_SEGMENTS_RO_ATTR(clean_segments);
+NILFS_SEGMENTS_RO_ATTR(dirty_segments);
+NILFS_SEGMENTS_RO_ATTR(README);
+
+static struct attribute *nilfs_segments_attrs[] = {
+	NILFS_SEGMENTS_ATTR_LIST(segments_number),
+	NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment),
+	NILFS_SEGMENTS_ATTR_LIST(clean_segments),
+	NILFS_SEGMENTS_ATTR_LIST(dirty_segments),
+	NILFS_SEGMENTS_ATTR_LIST(README),
+	NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(segments, dev);
+NILFS_DEV_INT_GROUP_TYPE(segments, dev);
+NILFS_DEV_INT_GROUP_FNS(segments, dev);
+
+/************************************************************************
+ *                        NILFS segctor attrs                           *
+ ************************************************************************/
+
+static ssize_t
+nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr,
+				    struct the_nilfs *nilfs,
+				    char *buf)
+{
+	sector_t last_pseg;
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	last_pseg = nilfs->ns_last_pseg;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(unsigned long long)last_pseg);
+}
+
+static ssize_t
+nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	u64 last_seq;
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	last_seq = nilfs->ns_last_seq;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq);
+}
+
+static ssize_t
+nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	__u64 last_cno;
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	last_cno = nilfs->ns_last_cno;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+}
+
+static ssize_t
+nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	u64 seg_seq;
+
+	down_read(&nilfs->ns_sem);
+	seg_seq = nilfs->ns_seg_seq;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
+}
+
+static ssize_t
+nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
+					 struct the_nilfs *nilfs,
+					 char *buf)
+{
+	__u64 segnum;
+
+	down_read(&nilfs->ns_sem);
+	segnum = nilfs->ns_segnum;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
+}
+
+static ssize_t
+nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
+				 struct the_nilfs *nilfs,
+				 char *buf)
+{
+	__u64 nextnum;
+
+	down_read(&nilfs->ns_sem);
+	nextnum = nilfs->ns_nextnum;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
+}
+
+static ssize_t
+nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	unsigned long pseg_offset;
+
+	down_read(&nilfs->ns_sem);
+	pseg_offset = nilfs->ns_pseg_offset;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
+}
+
+static ssize_t
+nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	__u64 cno;
+
+	down_read(&nilfs->ns_sem);
+	cno = nilfs->ns_cno;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+}
+
+static ssize_t
+nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
+					struct the_nilfs *nilfs,
+					char *buf)
+{
+	time_t ctime;
+
+	down_read(&nilfs->ns_sem);
+	ctime = nilfs->ns_ctime;
+	up_read(&nilfs->ns_sem);
+
+	return NILFS_SHOW_TIME(ctime, buf);
+}
+
+static ssize_t
+nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
+					    struct the_nilfs *nilfs,
+					    char *buf)
+{
+	time_t ctime;
+
+	down_read(&nilfs->ns_sem);
+	ctime = nilfs->ns_ctime;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime);
+}
+
+static ssize_t
+nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
+					 struct the_nilfs *nilfs,
+					 char *buf)
+{
+	time_t nongc_ctime;
+
+	down_read(&nilfs->ns_sem);
+	nongc_ctime = nilfs->ns_nongc_ctime;
+	up_read(&nilfs->ns_sem);
+
+	return NILFS_SHOW_TIME(nongc_ctime, buf);
+}
+
+static ssize_t
+nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
+						struct the_nilfs *nilfs,
+						char *buf)
+{
+	time_t nongc_ctime;
+
+	down_read(&nilfs->ns_sem);
+	nongc_ctime = nilfs->ns_nongc_ctime;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(unsigned long long)nongc_ctime);
+}
+
+static ssize_t
+nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
+					    struct the_nilfs *nilfs,
+					    char *buf)
+{
+	u32 ndirtyblks;
+
+	down_read(&nilfs->ns_sem);
+	ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
+}
+
+static const char segctor_readme_str[] =
+	"The segctor group contains attributes that describe\n"
+	"segctor thread activity details.\n\n"
+	"(1) last_pseg_block\n"
+	"\tshow start block number of the latest segment.\n\n"
+	"(2) last_seg_sequence\n"
+	"\tshow sequence value of the latest segment.\n\n"
+	"(3) last_seg_checkpoint\n"
+	"\tshow checkpoint number of the latest segment.\n\n"
+	"(4) current_seg_sequence\n\tshow segment sequence counter.\n\n"
+	"(5) current_last_full_seg\n"
+	"\tshow index number of the latest full segment.\n\n"
+	"(6) next_full_seg\n"
+	"\tshow index number of the full segment index to be used next.\n\n"
+	"(7) next_pseg_offset\n"
+	"\tshow offset of next partial segment in the current full segment.\n\n"
+	"(8) next_checkpoint\n\tshow next checkpoint number.\n\n"
+	"(9) last_seg_write_time\n"
+	"\tshow write time of the last segment in human-readable format.\n\n"
+	"(10) last_seg_write_time_secs\n"
+	"\tshow write time of the last segment in seconds.\n\n"
+	"(11) last_nongc_write_time\n"
+	"\tshow write time of the last segment not for cleaner operation "
+	"in human-readable format.\n\n"
+	"(12) last_nongc_write_time_secs\n"
+	"\tshow write time of the last segment not for cleaner operation "
+	"in seconds.\n\n"
+	"(13) dirty_data_blocks_count\n"
+	"\tshow number of dirty data blocks.\n\n";
+
+static ssize_t
+nilfs_segctor_README_show(struct nilfs_segctor_attr *attr,
+			  struct the_nilfs *nilfs, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, segctor_readme_str);
+}
+
+NILFS_SEGCTOR_RO_ATTR(last_pseg_block);
+NILFS_SEGCTOR_RO_ATTR(last_seg_sequence);
+NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint);
+NILFS_SEGCTOR_RO_ATTR(current_seg_sequence);
+NILFS_SEGCTOR_RO_ATTR(current_last_full_seg);
+NILFS_SEGCTOR_RO_ATTR(next_full_seg);
+NILFS_SEGCTOR_RO_ATTR(next_pseg_offset);
+NILFS_SEGCTOR_RO_ATTR(next_checkpoint);
+NILFS_SEGCTOR_RO_ATTR(last_seg_write_time);
+NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs);
+NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time);
+NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs);
+NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count);
+NILFS_SEGCTOR_RO_ATTR(README);
+
+static struct attribute *nilfs_segctor_attrs[] = {
+	NILFS_SEGCTOR_ATTR_LIST(last_pseg_block),
+	NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence),
+	NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint),
+	NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence),
+	NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg),
+	NILFS_SEGCTOR_ATTR_LIST(next_full_seg),
+	NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset),
+	NILFS_SEGCTOR_ATTR_LIST(next_checkpoint),
+	NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time),
+	NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs),
+	NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time),
+	NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs),
+	NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count),
+	NILFS_SEGCTOR_ATTR_LIST(README),
+	NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(segctor, dev);
+NILFS_DEV_INT_GROUP_TYPE(segctor, dev);
+NILFS_DEV_INT_GROUP_FNS(segctor, dev);
+
+/************************************************************************
+ *                        NILFS superblock attrs                        *
+ ************************************************************************/
+
+static ssize_t
+nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr,
+				     struct the_nilfs *nilfs,
+				     char *buf)
+{
+	time_t sbwtime;
+
+	down_read(&nilfs->ns_sem);
+	sbwtime = nilfs->ns_sbwtime;
+	up_read(&nilfs->ns_sem);
+
+	return NILFS_SHOW_TIME(sbwtime, buf);
+}
+
+static ssize_t
+nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr,
+					 struct the_nilfs *nilfs,
+					 char *buf)
+{
+	time_t sbwtime;
+
+	down_read(&nilfs->ns_sem);
+	sbwtime = nilfs->ns_sbwtime;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime);
+}
+
+static ssize_t
+nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
+				      struct the_nilfs *nilfs,
+				      char *buf)
+{
+	unsigned sbwcount;
+
+	down_read(&nilfs->ns_sem);
+	sbwcount = nilfs->ns_sbwcount;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount);
+}
+
+static ssize_t
+nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
+					    struct the_nilfs *nilfs,
+					    char *buf)
+{
+	unsigned sb_update_freq;
+
+	down_read(&nilfs->ns_sem);
+	sb_update_freq = nilfs->ns_sb_update_freq;
+	up_read(&nilfs->ns_sem);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq);
+}
+
+static ssize_t
+nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
+					    struct the_nilfs *nilfs,
+					    const char *buf, size_t count)
+{
+	unsigned val;
+	int err;
+
+	err = kstrtouint(skip_spaces(buf), 0, &val);
+	if (err) {
+		printk(KERN_ERR "NILFS: unable to convert string: err=%d\n",
+			err);
+		return err;
+	}
+
+	if (val < NILFS_SB_FREQ) {
+		val = NILFS_SB_FREQ;
+		printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n");
+	}
+
+	down_write(&nilfs->ns_sem);
+	nilfs->ns_sb_update_freq = val;
+	up_write(&nilfs->ns_sem);
+
+	return count;
+}
+
+static const char sb_readme_str[] =
+	"The superblock group contains attributes that describe\n"
+	"superblock's details.\n\n"
+	"(1) sb_write_time\n\tshow previous write time of super block "
+	"in human-readable format.\n\n"
+	"(2) sb_write_time_secs\n\tshow previous write time of super block "
+	"in seconds.\n\n"
+	"(3) sb_write_count\n\tshow write count of super block.\n\n"
+	"(4) sb_update_frequency\n"
+	"\tshow/set interval of periodical update of superblock (in seconds).\n\n"
+	"\tYou can set preferable frequency of superblock update by command:\n\n"
+	"\t'echo <val> > /sys/fs/<nilfs>/<dev>/superblock/sb_update_frequency'\n";
+
+static ssize_t
+nilfs_superblock_README_show(struct nilfs_superblock_attr *attr,
+				struct the_nilfs *nilfs, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, sb_readme_str);
+}
+
+NILFS_SUPERBLOCK_RO_ATTR(sb_write_time);
+NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs);
+NILFS_SUPERBLOCK_RO_ATTR(sb_write_count);
+NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency);
+NILFS_SUPERBLOCK_RO_ATTR(README);
+
+static struct attribute *nilfs_superblock_attrs[] = {
+	NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time),
+	NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs),
+	NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count),
+	NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency),
+	NILFS_SUPERBLOCK_ATTR_LIST(README),
+	NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(superblock, dev);
+NILFS_DEV_INT_GROUP_TYPE(superblock, dev);
+NILFS_DEV_INT_GROUP_FNS(superblock, dev);
+
+/************************************************************************
+ *                        NILFS device attrs                            *
+ ************************************************************************/
+
+static
+ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr,
+				struct the_nilfs *nilfs,
+				char *buf)
+{
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	u32 major = le32_to_cpu(sbp[0]->s_rev_level);
+	u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level);
+
+	return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor);
+}
+
+static
+ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr,
+				 struct the_nilfs *nilfs,
+				 char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize);
+}
+
+static
+ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr,
+				    struct the_nilfs *nilfs,
+				    char *buf)
+{
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size);
+}
+
+static
+ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr,
+				   struct the_nilfs *nilfs,
+				   char *buf)
+{
+	sector_t free_blocks = 0;
+
+	nilfs_count_free_blocks(nilfs, &free_blocks);
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(unsigned long long)free_blocks);
+}
+
+static
+ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr,
+			    struct the_nilfs *nilfs,
+			    char *buf)
+{
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+
+	return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid);
+}
+
+static
+ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr,
+				    struct the_nilfs *nilfs,
+				    char *buf)
+{
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+
+	return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n",
+			 sbp[0]->s_volume_name);
+}
+
+static const char dev_readme_str[] =
+	"The <device> group contains attributes that describe file system\n"
+	"partition's details.\n\n"
+	"(1) revision\n\tshow NILFS file system revision.\n\n"
+	"(2) blocksize\n\tshow volume block size in bytes.\n\n"
+	"(3) device_size\n\tshow volume size in bytes.\n\n"
+	"(4) free_blocks\n\tshow count of free blocks on volume.\n\n"
+	"(5) uuid\n\tshow volume's UUID.\n\n"
+	"(6) volume_name\n\tshow volume's name.\n\n";
+
+static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr,
+				     struct the_nilfs *nilfs,
+				     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, dev_readme_str);
+}
+
+NILFS_DEV_RO_ATTR(revision);
+NILFS_DEV_RO_ATTR(blocksize);
+NILFS_DEV_RO_ATTR(device_size);
+NILFS_DEV_RO_ATTR(free_blocks);
+NILFS_DEV_RO_ATTR(uuid);
+NILFS_DEV_RO_ATTR(volume_name);
+NILFS_DEV_RO_ATTR(README);
+
+static struct attribute *nilfs_dev_attrs[] = {
+	NILFS_DEV_ATTR_LIST(revision),
+	NILFS_DEV_ATTR_LIST(blocksize),
+	NILFS_DEV_ATTR_LIST(device_size),
+	NILFS_DEV_ATTR_LIST(free_blocks),
+	NILFS_DEV_ATTR_LIST(uuid),
+	NILFS_DEV_ATTR_LIST(volume_name),
+	NILFS_DEV_ATTR_LIST(README),
+	NULL,
+};
+
+static ssize_t nilfs_dev_attr_show(struct kobject *kobj,
+				    struct attribute *attr, char *buf)
+{
+	struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
+						ns_dev_kobj);
+	struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr,
+						attr);
+
+	return a->show ? a->show(a, nilfs, buf) : 0;
+}
+
+static ssize_t nilfs_dev_attr_store(struct kobject *kobj,
+				    struct attribute *attr,
+				    const char *buf, size_t len)
+{
+	struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
+						ns_dev_kobj);
+	struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr,
+						attr);
+
+	return a->store ? a->store(a, nilfs, buf, len) : 0;
+}
+
+static void nilfs_dev_attr_release(struct kobject *kobj)
+{
+	struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
+						ns_dev_kobj);
+	complete(&nilfs->ns_dev_kobj_unregister);
+}
+
+static const struct sysfs_ops nilfs_dev_attr_ops = {
+	.show	= nilfs_dev_attr_show,
+	.store	= nilfs_dev_attr_store,
+};
+
+static struct kobj_type nilfs_dev_ktype = {
+	.default_attrs	= nilfs_dev_attrs,
+	.sysfs_ops	= &nilfs_dev_attr_ops,
+	.release	= nilfs_dev_attr_release,
+};
+
+int nilfs_sysfs_create_device_group(struct super_block *sb)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups);
+	int err;
+
+	nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
+	if (unlikely(!nilfs->ns_dev_subgroups)) {
+		err = -ENOMEM;
+		printk(KERN_ERR "NILFS: unable to allocate memory for device group\n");
+		goto failed_create_device_group;
+	}
+
+	nilfs->ns_dev_kobj.kset = nilfs_kset;
+	init_completion(&nilfs->ns_dev_kobj_unregister);
+	err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL,
+				    "%s", sb->s_id);
+	if (err)
+		goto free_dev_subgroups;
+
+	err = nilfs_sysfs_create_mounted_snapshots_group(nilfs);
+	if (err)
+		goto cleanup_dev_kobject;
+
+	err = nilfs_sysfs_create_checkpoints_group(nilfs);
+	if (err)
+		goto delete_mounted_snapshots_group;
+
+	err = nilfs_sysfs_create_segments_group(nilfs);
+	if (err)
+		goto delete_checkpoints_group;
+
+	err = nilfs_sysfs_create_superblock_group(nilfs);
+	if (err)
+		goto delete_segments_group;
+
+	err = nilfs_sysfs_create_segctor_group(nilfs);
+	if (err)
+		goto delete_superblock_group;
+
+	return 0;
+
+delete_superblock_group:
+	nilfs_sysfs_delete_superblock_group(nilfs);
+
+delete_segments_group:
+	nilfs_sysfs_delete_segments_group(nilfs);
+
+delete_checkpoints_group:
+	nilfs_sysfs_delete_checkpoints_group(nilfs);
+
+delete_mounted_snapshots_group:
+	nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
+
+cleanup_dev_kobject:
+	kobject_del(&nilfs->ns_dev_kobj);
+
+free_dev_subgroups:
+	kfree(nilfs->ns_dev_subgroups);
+
+failed_create_device_group:
+	return err;
+}
+
+void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
+{
+	nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
+	nilfs_sysfs_delete_checkpoints_group(nilfs);
+	nilfs_sysfs_delete_segments_group(nilfs);
+	nilfs_sysfs_delete_superblock_group(nilfs);
+	nilfs_sysfs_delete_segctor_group(nilfs);
+	kobject_del(&nilfs->ns_dev_kobj);
+	kfree(nilfs->ns_dev_subgroups);
+}
+
+/************************************************************************
+ *                        NILFS feature attrs                           *
+ ************************************************************************/
+
+static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
+					    struct attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d.%d\n",
+			NILFS_CURRENT_REV, NILFS_MINOR_REV);
+}
+
+static const char features_readme_str[] =
+	"The features group contains attributes that describe NILFS file\n"
+	"system driver features.\n\n"
+	"(1) revision\n\tshow current revision of NILFS file system driver.\n";
+
+static ssize_t nilfs_feature_README_show(struct kobject *kobj,
+					 struct attribute *attr,
+					 char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, features_readme_str);
+}
+
+NILFS_FEATURE_RO_ATTR(revision);
+NILFS_FEATURE_RO_ATTR(README);
+
+static struct attribute *nilfs_feature_attrs[] = {
+	NILFS_FEATURE_ATTR_LIST(revision),
+	NILFS_FEATURE_ATTR_LIST(README),
+	NULL,
+};
+
+static const struct attribute_group nilfs_feature_attr_group = {
+	.name = "features",
+	.attrs = nilfs_feature_attrs,
+};
+
+int __init nilfs_sysfs_init(void)
+{
+	int err;
+
+	nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
+	if (!nilfs_kset) {
+		err = -ENOMEM;
+		printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n",
+			err);
+		goto failed_sysfs_init;
+	}
+
+	err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
+	if (unlikely(err)) {
+		printk(KERN_ERR "NILFS: unable to create feature group: err %d\n",
+			err);
+		goto cleanup_sysfs_init;
+	}
+
+	return 0;
+
+cleanup_sysfs_init:
+	kset_unregister(nilfs_kset);
+
+failed_sysfs_init:
+	return err;
+}
+
+void nilfs_sysfs_exit(void)
+{
+	sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
+	kset_unregister(nilfs_kset);
+}
diff --git a/kernel/fs/nilfs2/sysfs.h b/kernel/fs/nilfs2/sysfs.h
new file mode 100644
index 000000000..677e3a1a8
--- /dev/null
+++ b/kernel/fs/nilfs2/sysfs.h
@@ -0,0 +1,176 @@
+/*
+ * sysfs.h - sysfs support declarations.
+ *
+ * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
+ * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
+ */
+
+#ifndef _NILFS_SYSFS_H
+#define _NILFS_SYSFS_H
+
+#include <linux/sysfs.h>
+
+#define NILFS_ROOT_GROUP_NAME	"nilfs2"
+
+/*
+ * struct nilfs_sysfs_dev_subgroups - device subgroup kernel objects
+ * @sg_superblock_kobj: /sys/fs/<nilfs>/<device>/superblock
+ * @sg_superblock_kobj_unregister: completion state
+ * @sg_segctor_kobj: /sys/fs/<nilfs>/<device>/segctor
+ * @sg_segctor_kobj_unregister: completion state
+ * @sg_mounted_snapshots_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots
+ * @sg_mounted_snapshots_kobj_unregister: completion state
+ * @sg_checkpoints_kobj: /sys/fs/<nilfs>/<device>/checkpoints
+ * @sg_checkpoints_kobj_unregister: completion state
+ * @sg_segments_kobj: /sys/fs/<nilfs>/<device>/segments
+ * @sg_segments_kobj_unregister: completion state
+ */
+struct nilfs_sysfs_dev_subgroups {
+	/* /sys/fs/<nilfs>/<device>/superblock */
+	struct kobject sg_superblock_kobj;
+	struct completion sg_superblock_kobj_unregister;
+
+	/* /sys/fs/<nilfs>/<device>/segctor */
+	struct kobject sg_segctor_kobj;
+	struct completion sg_segctor_kobj_unregister;
+
+	/* /sys/fs/<nilfs>/<device>/mounted_snapshots */
+	struct kobject sg_mounted_snapshots_kobj;
+	struct completion sg_mounted_snapshots_kobj_unregister;
+
+	/* /sys/fs/<nilfs>/<device>/checkpoints */
+	struct kobject sg_checkpoints_kobj;
+	struct completion sg_checkpoints_kobj_unregister;
+
+	/* /sys/fs/<nilfs>/<device>/segments */
+	struct kobject sg_segments_kobj;
+	struct completion sg_segments_kobj_unregister;
+};
+
+#define NILFS_COMMON_ATTR_STRUCT(name) \
+struct nilfs_##name##_attr { \
+	struct attribute attr; \
+	ssize_t (*show)(struct kobject *, struct attribute *, \
+			char *); \
+	ssize_t (*store)(struct kobject *, struct attribute *, \
+			 const char *, size_t); \
+};
+
+NILFS_COMMON_ATTR_STRUCT(feature);
+
+#define NILFS_DEV_ATTR_STRUCT(name) \
+struct nilfs_##name##_attr { \
+	struct attribute attr; \
+	ssize_t (*show)(struct nilfs_##name##_attr *, struct the_nilfs *, \
+			char *); \
+	ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \
+			 const char *, size_t); \
+};
+
+NILFS_DEV_ATTR_STRUCT(dev);
+NILFS_DEV_ATTR_STRUCT(segments);
+NILFS_DEV_ATTR_STRUCT(mounted_snapshots);
+NILFS_DEV_ATTR_STRUCT(checkpoints);
+NILFS_DEV_ATTR_STRUCT(superblock);
+NILFS_DEV_ATTR_STRUCT(segctor);
+
+#define NILFS_CP_ATTR_STRUCT(name) \
+struct nilfs_##name##_attr { \
+	struct attribute attr; \
+	ssize_t (*show)(struct nilfs_##name##_attr *, struct nilfs_root *, \
+			char *); \
+	ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \
+			 const char *, size_t); \
+};
+
+NILFS_CP_ATTR_STRUCT(snapshot);
+
+#define NILFS_ATTR(type, name, mode, show, store) \
+	static struct nilfs_##type##_attr nilfs_##type##_attr_##name = \
+		__ATTR(name, mode, show, store)
+
+#define NILFS_INFO_ATTR(type, name) \
+	NILFS_ATTR(type, name, 0444, NULL, NULL)
+#define NILFS_RO_ATTR(type, name) \
+	NILFS_ATTR(type, name, 0444, nilfs_##type##_##name##_show, NULL)
+#define NILFS_RW_ATTR(type, name) \
+	NILFS_ATTR(type, name, 0644, \
+		    nilfs_##type##_##name##_show, \
+		    nilfs_##type##_##name##_store)
+
+#define NILFS_FEATURE_INFO_ATTR(name) \
+	NILFS_INFO_ATTR(feature, name)
+#define NILFS_FEATURE_RO_ATTR(name) \
+	NILFS_RO_ATTR(feature, name)
+#define NILFS_FEATURE_RW_ATTR(name) \
+	NILFS_RW_ATTR(feature, name)
+
+#define NILFS_DEV_INFO_ATTR(name) \
+	NILFS_INFO_ATTR(dev, name)
+#define NILFS_DEV_RO_ATTR(name) \
+	NILFS_RO_ATTR(dev, name)
+#define NILFS_DEV_RW_ATTR(name) \
+	NILFS_RW_ATTR(dev, name)
+
+#define NILFS_SEGMENTS_RO_ATTR(name) \
+	NILFS_RO_ATTR(segments, name)
+#define NILFS_SEGMENTS_RW_ATTR(name) \
+	NILFS_RW_ATTR(segs_info, name)
+
+#define NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(name) \
+	NILFS_RO_ATTR(mounted_snapshots, name)
+
+#define NILFS_CHECKPOINTS_RO_ATTR(name) \
+	NILFS_RO_ATTR(checkpoints, name)
+#define NILFS_CHECKPOINTS_RW_ATTR(name) \
+	NILFS_RW_ATTR(checkpoints, name)
+
+#define NILFS_SNAPSHOT_INFO_ATTR(name) \
+	NILFS_INFO_ATTR(snapshot, name)
+#define NILFS_SNAPSHOT_RO_ATTR(name) \
+	NILFS_RO_ATTR(snapshot, name)
+#define NILFS_SNAPSHOT_RW_ATTR(name) \
+	NILFS_RW_ATTR(snapshot, name)
+
+#define NILFS_SUPERBLOCK_RO_ATTR(name) \
+	NILFS_RO_ATTR(superblock, name)
+#define NILFS_SUPERBLOCK_RW_ATTR(name) \
+	NILFS_RW_ATTR(superblock, name)
+
+#define NILFS_SEGCTOR_INFO_ATTR(name) \
+	NILFS_INFO_ATTR(segctor, name)
+#define NILFS_SEGCTOR_RO_ATTR(name) \
+	NILFS_RO_ATTR(segctor, name)
+#define NILFS_SEGCTOR_RW_ATTR(name) \
+	NILFS_RW_ATTR(segctor, name)
+
+#define NILFS_FEATURE_ATTR_LIST(name) \
+	(&nilfs_feature_attr_##name.attr)
+#define NILFS_DEV_ATTR_LIST(name) \
+	(&nilfs_dev_attr_##name.attr)
+#define NILFS_SEGMENTS_ATTR_LIST(name) \
+	(&nilfs_segments_attr_##name.attr)
+#define NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(name) \
+	(&nilfs_mounted_snapshots_attr_##name.attr)
+#define NILFS_CHECKPOINTS_ATTR_LIST(name) \
+	(&nilfs_checkpoints_attr_##name.attr)
+#define NILFS_SNAPSHOT_ATTR_LIST(name) \
+	(&nilfs_snapshot_attr_##name.attr)
+#define NILFS_SUPERBLOCK_ATTR_LIST(name) \
+	(&nilfs_superblock_attr_##name.attr)
+#define NILFS_SEGCTOR_ATTR_LIST(name) \
+	(&nilfs_segctor_attr_##name.attr)
+
+#endif /* _NILFS_SYSFS_H */
diff --git a/kernel/fs/nilfs2/the_nilfs.c b/kernel/fs/nilfs2/the_nilfs.c
new file mode 100644
index 000000000..69bd801af
--- /dev/null
+++ b/kernel/fs/nilfs2/the_nilfs.c
@@ -0,0 +1,815 @@
+/*
+ * the_nilfs.c - the_nilfs shared structure.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
+#include "nilfs.h"
+#include "segment.h"
+#include "alloc.h"
+#include "cpfile.h"
+#include "sufile.h"
+#include "dat.h"
+#include "segbuf.h"
+
+
+static int nilfs_valid_sb(struct nilfs_super_block *sbp);
+
+void nilfs_set_last_segment(struct the_nilfs *nilfs,
+			    sector_t start_blocknr, u64 seq, __u64 cno)
+{
+	spin_lock(&nilfs->ns_last_segment_lock);
+	nilfs->ns_last_pseg = start_blocknr;
+	nilfs->ns_last_seq = seq;
+	nilfs->ns_last_cno = cno;
+
+	if (!nilfs_sb_dirty(nilfs)) {
+		if (nilfs->ns_prev_seq == nilfs->ns_last_seq)
+			goto stay_cursor;
+
+		set_nilfs_sb_dirty(nilfs);
+	}
+	nilfs->ns_prev_seq = nilfs->ns_last_seq;
+
+ stay_cursor:
+	spin_unlock(&nilfs->ns_last_segment_lock);
+}
+
+/**
+ * alloc_nilfs - allocate a nilfs object
+ * @bdev: block device to which the_nilfs is related
+ *
+ * Return Value: On success, pointer to the_nilfs is returned.
+ * On error, NULL is returned.
+ */
+struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+{
+	struct the_nilfs *nilfs;
+
+	nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL);
+	if (!nilfs)
+		return NULL;
+
+	nilfs->ns_bdev = bdev;
+	atomic_set(&nilfs->ns_ndirtyblks, 0);
+	init_rwsem(&nilfs->ns_sem);
+	mutex_init(&nilfs->ns_snapshot_mount_mutex);
+	INIT_LIST_HEAD(&nilfs->ns_dirty_files);
+	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
+	spin_lock_init(&nilfs->ns_inode_lock);
+	spin_lock_init(&nilfs->ns_next_gen_lock);
+	spin_lock_init(&nilfs->ns_last_segment_lock);
+	nilfs->ns_cptree = RB_ROOT;
+	spin_lock_init(&nilfs->ns_cptree_lock);
+	init_rwsem(&nilfs->ns_segctor_sem);
+	nilfs->ns_sb_update_freq = NILFS_SB_FREQ;
+
+	return nilfs;
+}
+
+/**
+ * destroy_nilfs - destroy nilfs object
+ * @nilfs: nilfs object to be released
+ */
+void destroy_nilfs(struct the_nilfs *nilfs)
+{
+	might_sleep();
+	if (nilfs_init(nilfs)) {
+		nilfs_sysfs_delete_device_group(nilfs);
+		brelse(nilfs->ns_sbh[0]);
+		brelse(nilfs->ns_sbh[1]);
+	}
+	kfree(nilfs);
+}
+
+static int nilfs_load_super_root(struct the_nilfs *nilfs,
+				 struct super_block *sb, sector_t sr_block)
+{
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *raw_sr;
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	struct nilfs_inode *rawi;
+	unsigned dat_entry_size, segment_usage_size, checkpoint_size;
+	unsigned inode_size;
+	int err;
+
+	err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
+	if (unlikely(err))
+		return err;
+
+	down_read(&nilfs->ns_sem);
+	dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size);
+	checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size);
+	segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size);
+	up_read(&nilfs->ns_sem);
+
+	inode_size = nilfs->ns_inode_size;
+
+	rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
+	err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
+	if (err)
+		goto failed;
+
+	rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
+	err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
+	if (err)
+		goto failed_dat;
+
+	rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
+	err = nilfs_sufile_read(sb, segment_usage_size, rawi,
+				&nilfs->ns_sufile);
+	if (err)
+		goto failed_cpfile;
+
+	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+	nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
+
+ failed:
+	brelse(bh_sr);
+	return err;
+
+ failed_cpfile:
+	iput(nilfs->ns_cpfile);
+
+ failed_dat:
+	iput(nilfs->ns_dat);
+	goto failed;
+}
+
+static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri)
+{
+	memset(ri, 0, sizeof(*ri));
+	INIT_LIST_HEAD(&ri->ri_used_segments);
+}
+
+static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
+{
+	nilfs_dispose_segment_list(&ri->ri_used_segments);
+}
+
+/**
+ * nilfs_store_log_cursor - load log cursor from a super block
+ * @nilfs: nilfs object
+ * @sbp: buffer storing super block to be read
+ *
+ * nilfs_store_log_cursor() reads the last position of the log
+ * containing a super root from a given super block, and initializes
+ * relevant information on the nilfs object preparatory for log
+ * scanning and recovery.
+ */
+static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
+				  struct nilfs_super_block *sbp)
+{
+	int ret = 0;
+
+	nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
+	nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
+	nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
+
+	nilfs->ns_prev_seq = nilfs->ns_last_seq;
+	nilfs->ns_seg_seq = nilfs->ns_last_seq;
+	nilfs->ns_segnum =
+		nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
+	nilfs->ns_cno = nilfs->ns_last_cno + 1;
+	if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
+		printk(KERN_ERR "NILFS invalid last segment number.\n");
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/**
+ * load_nilfs - load and recover the nilfs
+ * @nilfs: the_nilfs structure to be released
+ * @sb: super block isntance used to recover past segment
+ *
+ * load_nilfs() searches and load the latest super root,
+ * attaches the last segment, and does recovery if needed.
+ * The caller must call this exclusively for simultaneous mounts.
+ */
+int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
+{
+	struct nilfs_recovery_info ri;
+	unsigned int s_flags = sb->s_flags;
+	int really_read_only = bdev_read_only(nilfs->ns_bdev);
+	int valid_fs = nilfs_valid_fs(nilfs);
+	int err;
+
+	if (!valid_fs) {
+		printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
+		if (s_flags & MS_RDONLY) {
+			printk(KERN_INFO "NILFS: INFO: recovery "
+			       "required for readonly filesystem.\n");
+			printk(KERN_INFO "NILFS: write access will "
+			       "be enabled during recovery.\n");
+		}
+	}
+
+	nilfs_init_recovery_info(&ri);
+
+	err = nilfs_search_super_root(nilfs, &ri);
+	if (unlikely(err)) {
+		struct nilfs_super_block **sbp = nilfs->ns_sbp;
+		int blocksize;
+
+		if (err != -EINVAL)
+			goto scan_error;
+
+		if (!nilfs_valid_sb(sbp[1])) {
+			printk(KERN_WARNING
+			       "NILFS warning: unable to fall back to spare"
+			       "super block\n");
+			goto scan_error;
+		}
+		printk(KERN_INFO
+		       "NILFS: try rollback from an earlier position\n");
+
+		/*
+		 * restore super block with its spare and reconfigure
+		 * relevant states of the nilfs object.
+		 */
+		memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+		nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed);
+		nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
+
+		/* verify consistency between two super blocks */
+		blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
+		if (blocksize != nilfs->ns_blocksize) {
+			printk(KERN_WARNING
+			       "NILFS warning: blocksize differs between "
+			       "two super blocks (%d != %d)\n",
+			       blocksize, nilfs->ns_blocksize);
+			goto scan_error;
+		}
+
+		err = nilfs_store_log_cursor(nilfs, sbp[0]);
+		if (err)
+			goto scan_error;
+
+		/* drop clean flag to allow roll-forward and recovery */
+		nilfs->ns_mount_state &= ~NILFS_VALID_FS;
+		valid_fs = 0;
+
+		err = nilfs_search_super_root(nilfs, &ri);
+		if (err)
+			goto scan_error;
+	}
+
+	err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
+	if (unlikely(err)) {
+		printk(KERN_ERR "NILFS: error loading super root.\n");
+		goto failed;
+	}
+
+	if (valid_fs)
+		goto skip_recovery;
+
+	if (s_flags & MS_RDONLY) {
+		__u64 features;
+
+		if (nilfs_test_opt(nilfs, NORECOVERY)) {
+			printk(KERN_INFO "NILFS: norecovery option specified. "
+			       "skipping roll-forward recovery\n");
+			goto skip_recovery;
+		}
+		features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+			~NILFS_FEATURE_COMPAT_RO_SUPP;
+		if (features) {
+			printk(KERN_ERR "NILFS: couldn't proceed with "
+			       "recovery because of unsupported optional "
+			       "features (%llx)\n",
+			       (unsigned long long)features);
+			err = -EROFS;
+			goto failed_unload;
+		}
+		if (really_read_only) {
+			printk(KERN_ERR "NILFS: write access "
+			       "unavailable, cannot proceed.\n");
+			err = -EROFS;
+			goto failed_unload;
+		}
+		sb->s_flags &= ~MS_RDONLY;
+	} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
+		printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
+		       "option was specified for a read/write mount\n");
+		err = -EINVAL;
+		goto failed_unload;
+	}
+
+	err = nilfs_salvage_orphan_logs(nilfs, sb, &ri);
+	if (err)
+		goto failed_unload;
+
+	down_write(&nilfs->ns_sem);
+	nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
+	err = nilfs_cleanup_super(sb);
+	up_write(&nilfs->ns_sem);
+
+	if (err) {
+		printk(KERN_ERR "NILFS: failed to update super block. "
+		       "recovery unfinished.\n");
+		goto failed_unload;
+	}
+	printk(KERN_INFO "NILFS: recovery complete.\n");
+
+ skip_recovery:
+	nilfs_clear_recovery_info(&ri);
+	sb->s_flags = s_flags;
+	return 0;
+
+ scan_error:
+	printk(KERN_ERR "NILFS: error searching super root.\n");
+	goto failed;
+
+ failed_unload:
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_dat);
+
+ failed:
+	nilfs_clear_recovery_info(&ri);
+	sb->s_flags = s_flags;
+	return err;
+}
+
+static unsigned long long nilfs_max_size(unsigned int blkbits)
+{
+	unsigned int max_bits;
+	unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */
+
+	max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */
+	if (max_bits < 64)
+		res = min_t(unsigned long long, res, (1ULL << max_bits) - 1);
+	return res;
+}
+
+/**
+ * nilfs_nrsvsegs - calculate the number of reserved segments
+ * @nilfs: nilfs object
+ * @nsegs: total number of segments
+ */
+unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs)
+{
+	return max_t(unsigned long, NILFS_MIN_NRSVSEGS,
+		     DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage,
+				  100));
+}
+
+void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
+{
+	nilfs->ns_nsegments = nsegs;
+	nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs);
+}
+
+static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
+				   struct nilfs_super_block *sbp)
+{
+	if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
+		printk(KERN_ERR "NILFS: unsupported revision "
+		       "(superblock rev.=%d.%d, current rev.=%d.%d). "
+		       "Please check the version of mkfs.nilfs.\n",
+		       le32_to_cpu(sbp->s_rev_level),
+		       le16_to_cpu(sbp->s_minor_rev_level),
+		       NILFS_CURRENT_REV, NILFS_MINOR_REV);
+		return -EINVAL;
+	}
+	nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
+	if (nilfs->ns_sbsize > BLOCK_SIZE)
+		return -EINVAL;
+
+	nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
+	if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
+		printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
+		       nilfs->ns_inode_size);
+		return -EINVAL;
+	} else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
+		printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
+		       nilfs->ns_inode_size);
+		return -EINVAL;
+	}
+
+	nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
+
+	nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
+	if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
+		printk(KERN_ERR "NILFS: too short segment.\n");
+		return -EINVAL;
+	}
+
+	nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
+	nilfs->ns_r_segments_percentage =
+		le32_to_cpu(sbp->s_r_segments_percentage);
+	if (nilfs->ns_r_segments_percentage < 1 ||
+	    nilfs->ns_r_segments_percentage > 99) {
+		printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n");
+		return -EINVAL;
+	}
+
+	nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
+	nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
+	return 0;
+}
+
+static int nilfs_valid_sb(struct nilfs_super_block *sbp)
+{
+	static unsigned char sum[4];
+	const int sumoff = offsetof(struct nilfs_super_block, s_sum);
+	size_t bytes;
+	u32 crc;
+
+	if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
+		return 0;
+	bytes = le16_to_cpu(sbp->s_bytes);
+	if (bytes > BLOCK_SIZE)
+		return 0;
+	crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
+		       sumoff);
+	crc = crc32_le(crc, sum, 4);
+	crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4,
+		       bytes - sumoff - 4);
+	return crc == le32_to_cpu(sbp->s_sum);
+}
+
+static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
+{
+	return offset < ((le64_to_cpu(sbp->s_nsegments) *
+			  le32_to_cpu(sbp->s_blocks_per_segment)) <<
+			 (le32_to_cpu(sbp->s_log_block_size) + 10));
+}
+
+static void nilfs_release_super_block(struct the_nilfs *nilfs)
+{
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		if (nilfs->ns_sbp[i]) {
+			brelse(nilfs->ns_sbh[i]);
+			nilfs->ns_sbh[i] = NULL;
+			nilfs->ns_sbp[i] = NULL;
+		}
+	}
+}
+
+void nilfs_fall_back_super_block(struct the_nilfs *nilfs)
+{
+	brelse(nilfs->ns_sbh[0]);
+	nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
+	nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
+	nilfs->ns_sbh[1] = NULL;
+	nilfs->ns_sbp[1] = NULL;
+}
+
+void nilfs_swap_super_block(struct the_nilfs *nilfs)
+{
+	struct buffer_head *tsbh = nilfs->ns_sbh[0];
+	struct nilfs_super_block *tsbp = nilfs->ns_sbp[0];
+
+	nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
+	nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
+	nilfs->ns_sbh[1] = tsbh;
+	nilfs->ns_sbp[1] = tsbp;
+}
+
+static int nilfs_load_super_block(struct the_nilfs *nilfs,
+				  struct super_block *sb, int blocksize,
+				  struct nilfs_super_block **sbpp)
+{
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	struct buffer_head **sbh = nilfs->ns_sbh;
+	u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+	int valid[2], swp = 0;
+
+	sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
+					&sbh[0]);
+	sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
+
+	if (!sbp[0]) {
+		if (!sbp[1]) {
+			printk(KERN_ERR "NILFS: unable to read superblock\n");
+			return -EIO;
+		}
+		printk(KERN_WARNING
+		       "NILFS warning: unable to read primary superblock "
+		       "(blocksize = %d)\n", blocksize);
+	} else if (!sbp[1]) {
+		printk(KERN_WARNING
+		       "NILFS warning: unable to read secondary superblock "
+		       "(blocksize = %d)\n", blocksize);
+	}
+
+	/*
+	 * Compare two super blocks and set 1 in swp if the secondary
+	 * super block is valid and newer.  Otherwise, set 0 in swp.
+	 */
+	valid[0] = nilfs_valid_sb(sbp[0]);
+	valid[1] = nilfs_valid_sb(sbp[1]);
+	swp = valid[1] && (!valid[0] ||
+			   le64_to_cpu(sbp[1]->s_last_cno) >
+			   le64_to_cpu(sbp[0]->s_last_cno));
+
+	if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
+		brelse(sbh[1]);
+		sbh[1] = NULL;
+		sbp[1] = NULL;
+		valid[1] = 0;
+		swp = 0;
+	}
+	if (!valid[swp]) {
+		nilfs_release_super_block(nilfs);
+		printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
+		       sb->s_id);
+		return -EINVAL;
+	}
+
+	if (!valid[!swp])
+		printk(KERN_WARNING "NILFS warning: broken superblock. "
+		       "using spare superblock (blocksize = %d).\n", blocksize);
+	if (swp)
+		nilfs_swap_super_block(nilfs);
+
+	nilfs->ns_sbwcount = 0;
+	nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
+	nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
+	*sbpp = sbp[0];
+	return 0;
+}
+
+/**
+ * init_nilfs - initialize a NILFS instance.
+ * @nilfs: the_nilfs structure
+ * @sb: super block
+ * @data: mount options
+ *
+ * init_nilfs() performs common initialization per block device (e.g.
+ * reading the super block, getting disk layout information, initializing
+ * shared fields in the_nilfs).
+ *
+ * Return Value: On success, 0 is returned. On error, a negative error
+ * code is returned.
+ */
+int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
+{
+	struct nilfs_super_block *sbp;
+	int blocksize;
+	int err;
+
+	down_write(&nilfs->ns_sem);
+
+	blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
+	if (!blocksize) {
+		printk(KERN_ERR "NILFS: unable to set blocksize\n");
+		err = -EINVAL;
+		goto out;
+	}
+	err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
+	if (err)
+		goto out;
+
+	err = nilfs_store_magic_and_option(sb, sbp, data);
+	if (err)
+		goto failed_sbh;
+
+	err = nilfs_check_feature_compatibility(sb, sbp);
+	if (err)
+		goto failed_sbh;
+
+	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
+	if (blocksize < NILFS_MIN_BLOCK_SIZE ||
+	    blocksize > NILFS_MAX_BLOCK_SIZE) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "filesystem blocksize %d\n", blocksize);
+		err = -EINVAL;
+		goto failed_sbh;
+	}
+	if (sb->s_blocksize != blocksize) {
+		int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
+
+		if (blocksize < hw_blocksize) {
+			printk(KERN_ERR
+			       "NILFS: blocksize %d too small for device "
+			       "(sector-size = %d).\n",
+			       blocksize, hw_blocksize);
+			err = -EINVAL;
+			goto failed_sbh;
+		}
+		nilfs_release_super_block(nilfs);
+		sb_set_blocksize(sb, blocksize);
+
+		err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
+		if (err)
+			goto out;
+			/* not failed_sbh; sbh is released automatically
+			   when reloading fails. */
+	}
+	nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
+	nilfs->ns_blocksize = blocksize;
+
+	get_random_bytes(&nilfs->ns_next_generation,
+			 sizeof(nilfs->ns_next_generation));
+
+	err = nilfs_store_disk_layout(nilfs, sbp);
+	if (err)
+		goto failed_sbh;
+
+	sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
+
+	nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
+
+	err = nilfs_store_log_cursor(nilfs, sbp);
+	if (err)
+		goto failed_sbh;
+
+	err = nilfs_sysfs_create_device_group(sb);
+	if (err)
+		goto failed_sbh;
+
+	set_nilfs_init(nilfs);
+	err = 0;
+ out:
+	up_write(&nilfs->ns_sem);
+	return err;
+
+ failed_sbh:
+	nilfs_release_super_block(nilfs);
+	goto out;
+}
+
+int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
+			    size_t nsegs)
+{
+	sector_t seg_start, seg_end;
+	sector_t start = 0, nblocks = 0;
+	unsigned int sects_per_block;
+	__u64 *sn;
+	int ret = 0;
+
+	sects_per_block = (1 << nilfs->ns_blocksize_bits) /
+		bdev_logical_block_size(nilfs->ns_bdev);
+	for (sn = segnump; sn < segnump + nsegs; sn++) {
+		nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
+
+		if (!nblocks) {
+			start = seg_start;
+			nblocks = seg_end - seg_start + 1;
+		} else if (start + nblocks == seg_start) {
+			nblocks += seg_end - seg_start + 1;
+		} else {
+			ret = blkdev_issue_discard(nilfs->ns_bdev,
+						   start * sects_per_block,
+						   nblocks * sects_per_block,
+						   GFP_NOFS, 0);
+			if (ret < 0)
+				return ret;
+			nblocks = 0;
+		}
+	}
+	if (nblocks)
+		ret = blkdev_issue_discard(nilfs->ns_bdev,
+					   start * sects_per_block,
+					   nblocks * sects_per_block,
+					   GFP_NOFS, 0);
+	return ret;
+}
+
+int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
+{
+	unsigned long ncleansegs;
+
+	down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+	ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
+	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+	*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
+	return 0;
+}
+
+int nilfs_near_disk_full(struct the_nilfs *nilfs)
+{
+	unsigned long ncleansegs, nincsegs;
+
+	ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
+	nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
+		nilfs->ns_blocks_per_segment + 1;
+
+	return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
+}
+
+struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
+{
+	struct rb_node *n;
+	struct nilfs_root *root;
+
+	spin_lock(&nilfs->ns_cptree_lock);
+	n = nilfs->ns_cptree.rb_node;
+	while (n) {
+		root = rb_entry(n, struct nilfs_root, rb_node);
+
+		if (cno < root->cno) {
+			n = n->rb_left;
+		} else if (cno > root->cno) {
+			n = n->rb_right;
+		} else {
+			atomic_inc(&root->count);
+			spin_unlock(&nilfs->ns_cptree_lock);
+			return root;
+		}
+	}
+	spin_unlock(&nilfs->ns_cptree_lock);
+
+	return NULL;
+}
+
+struct nilfs_root *
+nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
+{
+	struct rb_node **p, *parent;
+	struct nilfs_root *root, *new;
+	int err;
+
+	root = nilfs_lookup_root(nilfs, cno);
+	if (root)
+		return root;
+
+	new = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	spin_lock(&nilfs->ns_cptree_lock);
+
+	p = &nilfs->ns_cptree.rb_node;
+	parent = NULL;
+
+	while (*p) {
+		parent = *p;
+		root = rb_entry(parent, struct nilfs_root, rb_node);
+
+		if (cno < root->cno) {
+			p = &(*p)->rb_left;
+		} else if (cno > root->cno) {
+			p = &(*p)->rb_right;
+		} else {
+			atomic_inc(&root->count);
+			spin_unlock(&nilfs->ns_cptree_lock);
+			kfree(new);
+			return root;
+		}
+	}
+
+	new->cno = cno;
+	new->ifile = NULL;
+	new->nilfs = nilfs;
+	atomic_set(&new->count, 1);
+	atomic64_set(&new->inodes_count, 0);
+	atomic64_set(&new->blocks_count, 0);
+
+	rb_link_node(&new->rb_node, parent, p);
+	rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
+
+	spin_unlock(&nilfs->ns_cptree_lock);
+
+	err = nilfs_sysfs_create_snapshot_group(new);
+	if (err) {
+		kfree(new);
+		new = NULL;
+	}
+
+	return new;
+}
+
+void nilfs_put_root(struct nilfs_root *root)
+{
+	if (atomic_dec_and_test(&root->count)) {
+		struct the_nilfs *nilfs = root->nilfs;
+
+		nilfs_sysfs_delete_snapshot_group(root);
+
+		spin_lock(&nilfs->ns_cptree_lock);
+		rb_erase(&root->rb_node, &nilfs->ns_cptree);
+		spin_unlock(&nilfs->ns_cptree_lock);
+		iput(root->ifile);
+
+		kfree(root);
+	}
+}
diff --git a/kernel/fs/nilfs2/the_nilfs.h b/kernel/fs/nilfs2/the_nilfs.h
new file mode 100644
index 000000000..23778d385
--- /dev/null
+++ b/kernel/fs/nilfs2/the_nilfs.h
@@ -0,0 +1,396 @@
+/*
+ * the_nilfs.h - the_nilfs shared structure.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#ifndef _THE_NILFS_H
+#define _THE_NILFS_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/rbtree.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/slab.h>
+
+struct nilfs_sc_info;
+struct nilfs_sysfs_dev_subgroups;
+
+/* the_nilfs struct */
+enum {
+	THE_NILFS_INIT = 0,     /* Information from super_block is set */
+	THE_NILFS_DISCONTINUED,	/* 'next' pointer chain has broken */
+	THE_NILFS_GC_RUNNING,	/* gc process is running */
+	THE_NILFS_SB_DIRTY,	/* super block is dirty */
+};
+
+/**
+ * struct the_nilfs - struct to supervise multiple nilfs mount points
+ * @ns_flags: flags
+ * @ns_flushed_device: flag indicating if all volatile data was flushed
+ * @ns_bdev: block device
+ * @ns_sem: semaphore for shared states
+ * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts
+ * @ns_sbh: buffer heads of on-disk super blocks
+ * @ns_sbp: pointers to super block data
+ * @ns_sbwtime: previous write time of super block
+ * @ns_sbwcount: write count of super block
+ * @ns_sbsize: size of valid data in super block
+ * @ns_mount_state: file system state
+ * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds)
+ * @ns_seg_seq: segment sequence counter
+ * @ns_segnum: index number of the latest full segment.
+ * @ns_nextnum: index number of the full segment index to be used next
+ * @ns_pseg_offset: offset of next partial segment in the current full segment
+ * @ns_cno: next checkpoint number
+ * @ns_ctime: write time of the last segment
+ * @ns_nongc_ctime: write time of the last segment not for cleaner operation
+ * @ns_ndirtyblks: Number of dirty data blocks
+ * @ns_last_segment_lock: lock protecting fields for the latest segment
+ * @ns_last_pseg: start block number of the latest segment
+ * @ns_last_seq: sequence value of the latest segment
+ * @ns_last_cno: checkpoint number of the latest segment
+ * @ns_prot_seq: least sequence number of segments which must not be reclaimed
+ * @ns_prev_seq: base sequence number used to decide if advance log cursor
+ * @ns_writer: log writer
+ * @ns_segctor_sem: semaphore protecting log write
+ * @ns_dat: DAT file inode
+ * @ns_cpfile: checkpoint file inode
+ * @ns_sufile: segusage file inode
+ * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
+ * @ns_cptree_lock: lock protecting @ns_cptree
+ * @ns_dirty_files: list of dirty files
+ * @ns_inode_lock: lock protecting @ns_dirty_files
+ * @ns_gc_inodes: dummy inodes to keep live blocks
+ * @ns_next_generation: next generation number for inodes
+ * @ns_next_gen_lock: lock protecting @ns_next_generation
+ * @ns_mount_opt: mount options
+ * @ns_resuid: uid for reserved blocks
+ * @ns_resgid: gid for reserved blocks
+ * @ns_interval: checkpoint creation interval
+ * @ns_watermark: watermark for the number of dirty buffers
+ * @ns_blocksize_bits: bit length of block size
+ * @ns_blocksize: block size
+ * @ns_nsegments: number of segments in filesystem
+ * @ns_blocks_per_segment: number of blocks per segment
+ * @ns_r_segments_percentage: reserved segments percentage
+ * @ns_nrsvsegs: number of reserved segments
+ * @ns_first_data_block: block number of first data block
+ * @ns_inode_size: size of on-disk inode
+ * @ns_first_ino: first not-special inode number
+ * @ns_crc_seed: seed value of CRC32 calculation
+ * @ns_dev_kobj: /sys/fs/<nilfs>/<device>
+ * @ns_dev_kobj_unregister: completion state
+ * @ns_dev_subgroups: <device> subgroups pointer
+ */
+struct the_nilfs {
+	unsigned long		ns_flags;
+	int			ns_flushed_device;
+
+	struct block_device    *ns_bdev;
+	struct rw_semaphore	ns_sem;
+	struct mutex		ns_snapshot_mount_mutex;
+
+	/*
+	 * used for
+	 * - loading the latest checkpoint exclusively.
+	 * - allocating a new full segment.
+	 */
+	struct buffer_head     *ns_sbh[2];
+	struct nilfs_super_block *ns_sbp[2];
+	time_t			ns_sbwtime;
+	unsigned		ns_sbwcount;
+	unsigned		ns_sbsize;
+	unsigned		ns_mount_state;
+	unsigned		ns_sb_update_freq;
+
+	/*
+	 * Following fields are dedicated to a writable FS-instance.
+	 * Except for the period seeking checkpoint, code outside the segment
+	 * constructor must lock a segment semaphore while accessing these
+	 * fields.
+	 * The writable FS-instance is sole during a lifetime of the_nilfs.
+	 */
+	u64			ns_seg_seq;
+	__u64			ns_segnum;
+	__u64			ns_nextnum;
+	unsigned long		ns_pseg_offset;
+	__u64			ns_cno;
+	time_t			ns_ctime;
+	time_t			ns_nongc_ctime;
+	atomic_t		ns_ndirtyblks;
+
+	/*
+	 * The following fields hold information on the latest partial segment
+	 * written to disk with a super root.  These fields are protected by
+	 * ns_last_segment_lock.
+	 */
+	spinlock_t		ns_last_segment_lock;
+	sector_t		ns_last_pseg;
+	u64			ns_last_seq;
+	__u64			ns_last_cno;
+	u64			ns_prot_seq;
+	u64			ns_prev_seq;
+
+	struct nilfs_sc_info   *ns_writer;
+	struct rw_semaphore	ns_segctor_sem;
+
+	/*
+	 * Following fields are lock free except for the period before
+	 * the_nilfs is initialized.
+	 */
+	struct inode	       *ns_dat;
+	struct inode	       *ns_cpfile;
+	struct inode	       *ns_sufile;
+
+	/* Checkpoint tree */
+	struct rb_root		ns_cptree;
+	spinlock_t		ns_cptree_lock;
+
+	/* Dirty inode list */
+	struct list_head	ns_dirty_files;
+	spinlock_t		ns_inode_lock;
+
+	/* GC inode list */
+	struct list_head	ns_gc_inodes;
+
+	/* Inode allocator */
+	u32			ns_next_generation;
+	spinlock_t		ns_next_gen_lock;
+
+	/* Mount options */
+	unsigned long		ns_mount_opt;
+
+	uid_t			ns_resuid;
+	gid_t			ns_resgid;
+	unsigned long		ns_interval;
+	unsigned long		ns_watermark;
+
+	/* Disk layout information (static) */
+	unsigned int		ns_blocksize_bits;
+	unsigned int		ns_blocksize;
+	unsigned long		ns_nsegments;
+	unsigned long		ns_blocks_per_segment;
+	unsigned long		ns_r_segments_percentage;
+	unsigned long		ns_nrsvsegs;
+	unsigned long		ns_first_data_block;
+	int			ns_inode_size;
+	int			ns_first_ino;
+	u32			ns_crc_seed;
+
+	/* /sys/fs/<nilfs>/<device> */
+	struct kobject ns_dev_kobj;
+	struct completion ns_dev_kobj_unregister;
+	struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups;
+};
+
+#define THE_NILFS_FNS(bit, name)					\
+static inline void set_nilfs_##name(struct the_nilfs *nilfs)		\
+{									\
+	set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);			\
+}									\
+static inline void clear_nilfs_##name(struct the_nilfs *nilfs)		\
+{									\
+	clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);			\
+}									\
+static inline int nilfs_##name(struct the_nilfs *nilfs)			\
+{									\
+	return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);		\
+}
+
+THE_NILFS_FNS(INIT, init)
+THE_NILFS_FNS(DISCONTINUED, discontinued)
+THE_NILFS_FNS(GC_RUNNING, gc_running)
+THE_NILFS_FNS(SB_DIRTY, sb_dirty)
+
+/*
+ * Mount option operations
+ */
+#define nilfs_clear_opt(nilfs, opt)  \
+	do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
+#define nilfs_set_opt(nilfs, opt)  \
+	do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
+#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
+#define nilfs_write_opt(nilfs, mask, opt)				\
+	do { (nilfs)->ns_mount_opt =					\
+		(((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) |	\
+		 NILFS_MOUNT_##opt);					\
+	} while (0)
+
+/**
+ * struct nilfs_root - nilfs root object
+ * @cno: checkpoint number
+ * @rb_node: red-black tree node
+ * @count: refcount of this structure
+ * @nilfs: nilfs object
+ * @ifile: inode file
+ * @inodes_count: number of inodes
+ * @blocks_count: number of blocks
+ * @snapshot_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot>
+ * @snapshot_kobj_unregister: completion state for kernel object
+ */
+struct nilfs_root {
+	__u64 cno;
+	struct rb_node rb_node;
+
+	atomic_t count;
+	struct the_nilfs *nilfs;
+	struct inode *ifile;
+
+	atomic64_t inodes_count;
+	atomic64_t blocks_count;
+
+	/* /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> */
+	struct kobject snapshot_kobj;
+	struct completion snapshot_kobj_unregister;
+};
+
+/* Special checkpoint number */
+#define NILFS_CPTREE_CURRENT_CNO	0
+
+/* Minimum interval of periodical update of superblocks (in seconds) */
+#define NILFS_SB_FREQ		10
+
+static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
+{
+	u64 t = get_seconds();
+	return t < nilfs->ns_sbwtime ||
+		t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq;
+}
+
+static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
+{
+	int flip_bits = nilfs->ns_sbwcount & 0x0FL;
+	return (flip_bits != 0x08 && flip_bits != 0x0F);
+}
+
+void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
+struct the_nilfs *alloc_nilfs(struct block_device *bdev);
+void destroy_nilfs(struct the_nilfs *nilfs);
+int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
+int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
+unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs);
+void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs);
+int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
+int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
+struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
+struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
+					     __u64 cno);
+void nilfs_put_root(struct nilfs_root *root);
+int nilfs_near_disk_full(struct the_nilfs *);
+void nilfs_fall_back_super_block(struct the_nilfs *);
+void nilfs_swap_super_block(struct the_nilfs *);
+
+
+static inline void nilfs_get_root(struct nilfs_root *root)
+{
+	atomic_inc(&root->count);
+}
+
+static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
+{
+	unsigned valid_fs;
+
+	down_read(&nilfs->ns_sem);
+	valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
+	up_read(&nilfs->ns_sem);
+	return valid_fs;
+}
+
+static inline void
+nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
+			sector_t *seg_start, sector_t *seg_end)
+{
+	*seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum;
+	*seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1;
+	if (segnum == 0)
+		*seg_start = nilfs->ns_first_data_block;
+}
+
+static inline sector_t
+nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum)
+{
+	return (segnum == 0) ? nilfs->ns_first_data_block :
+		(sector_t)nilfs->ns_blocks_per_segment * segnum;
+}
+
+static inline __u64
+nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr)
+{
+	sector_t segnum = blocknr;
+
+	sector_div(segnum, nilfs->ns_blocks_per_segment);
+	return segnum;
+}
+
+static inline void
+nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start,
+			sector_t seg_end)
+{
+	/* terminate the current full segment (used in case of I/O-error) */
+	nilfs->ns_pseg_offset = seg_end - seg_start + 1;
+}
+
+static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs)
+{
+	/* move forward with a full segment */
+	nilfs->ns_segnum = nilfs->ns_nextnum;
+	nilfs->ns_pseg_offset = 0;
+	nilfs->ns_seg_seq++;
+}
+
+static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs)
+{
+	__u64 cno;
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	cno = nilfs->ns_last_cno;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+	return cno;
+}
+
+static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
+{
+	return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
+}
+
+static inline int nilfs_flush_device(struct the_nilfs *nilfs)
+{
+	int err;
+
+	if (!nilfs_test_opt(nilfs, BARRIER) || nilfs->ns_flushed_device)
+		return 0;
+
+	nilfs->ns_flushed_device = 1;
+	/*
+	 * the store to ns_flushed_device must not be reordered after
+	 * blkdev_issue_flush().
+	 */
+	smp_wmb();
+
+	err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL);
+	if (err != -EIO)
+		err = 0;
+	return err;
+}
+
+#endif /* _THE_NILFS_H */
diff --git a/kernel/fs/nls/Kconfig b/kernel/fs/nls/Kconfig
new file mode 100644
index 000000000..e2ce79ef4
--- /dev/null
+++ b/kernel/fs/nls/Kconfig
@@ -0,0 +1,619 @@
+#
+# Native language support configuration
+#
+
+menuconfig NLS
+	tristate "Native language support"
+	---help---
+	  The base Native Language Support. A number of filesystems
+	  depend on it (e.g. FAT, JOLIET, NT, BEOS filesystems), as well
+	  as the ability of some filesystems to use native languages
+	  (NCP, SMB).
+
+	  If unsure, say Y.
+
+	  To compile this code as a module, choose M here: the module
+	  will be called nls_base.
+
+if NLS
+
+config NLS_DEFAULT
+	string "Default NLS Option"
+	default "iso8859-1"
+	---help---
+	  The default NLS used when mounting file system. Note, that this is
+	  the NLS used by your console, not the NLS used by a specific file
+	  system (if different) to store data (filenames) on a disk.
+	  Currently, the valid values are:
+	  big5, cp437, cp737, cp775, cp850, cp852, cp855, cp857, cp860, cp861,
+	  cp862, cp863, cp864, cp865, cp866, cp869, cp874, cp932, cp936,
+	  cp949, cp950, cp1251, cp1255, euc-jp, euc-kr, gb2312, iso8859-1,
+	  iso8859-2, iso8859-3, iso8859-4, iso8859-5, iso8859-6, iso8859-7,
+	  iso8859-8, iso8859-9, iso8859-13, iso8859-14, iso8859-15,
+	  koi8-r, koi8-ru, koi8-u, sjis, tis-620, macroman, utf8.
+	  If you specify a wrong value, it will use the built-in NLS;
+	  compatible with iso8859-1.
+
+	  If unsure, specify it as "iso8859-1".
+
+config NLS_CODEPAGE_437
+	tristate "Codepage 437 (United States, Canada)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored
+	  in so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage that is used in
+	  the United States and parts of Canada. This is recommended.
+
+config NLS_CODEPAGE_737
+	tristate "Codepage 737 (Greek)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored
+	  in so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage that is used for
+	  Greek. If unsure, say N.
+
+config NLS_CODEPAGE_775
+	tristate "Codepage 775 (Baltic Rim)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored
+	  in so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage that is used
+	  for the Baltic Rim Languages (Latvian and Lithuanian). If unsure,
+	  say N.
+
+config NLS_CODEPAGE_850
+	tristate "Codepage 850 (Europe)"
+	---help---
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage that is used for
+	  much of Europe -- United Kingdom, Germany, Spain, Italy, and [add
+	  more countries here]. It has some characters useful to many European
+	  languages that are not part of the US codepage 437.
+
+	  If unsure, say Y.
+
+config NLS_CODEPAGE_852
+	tristate "Codepage 852 (Central/Eastern Europe)"
+	---help---
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Latin 2 codepage used by DOS
+	  for much of Central and Eastern Europe. It has all the required
+	  characters for these languages: Albanian, Croatian, Czech, English,
+	  Finnish, Hungarian, Irish, German, Polish, Romanian, Serbian (Latin
+	  transcription), Slovak, Slovenian, and Sorbian.
+
+config NLS_CODEPAGE_855
+	tristate "Codepage 855 (Cyrillic)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Cyrillic.
+
+config NLS_CODEPAGE_857
+	tristate "Codepage 857 (Turkish)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Turkish.
+
+config NLS_CODEPAGE_860
+	tristate "Codepage 860 (Portuguese)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Portuguese.
+
+config NLS_CODEPAGE_861
+	tristate "Codepage 861 (Icelandic)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Icelandic.
+
+config NLS_CODEPAGE_862
+	tristate "Codepage 862 (Hebrew)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Hebrew.
+
+config NLS_CODEPAGE_863
+	tristate "Codepage 863 (Canadian French)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Canadian
+	  French.
+
+config NLS_CODEPAGE_864
+	tristate "Codepage 864 (Arabic)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Arabic.
+
+config NLS_CODEPAGE_865
+	tristate "Codepage 865 (Norwegian, Danish)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for the Nordic
+	  European countries.
+
+config NLS_CODEPAGE_866
+	tristate "Codepage 866 (Cyrillic/Russian)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for
+	  Cyrillic/Russian.
+
+config NLS_CODEPAGE_869
+	tristate "Codepage 869 (Greek)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Greek.
+
+config NLS_CODEPAGE_936
+	tristate "Simplified Chinese charset (CP936, GB2312)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Simplified
+	  Chinese(GBK).
+
+config NLS_CODEPAGE_950
+	tristate "Traditional Chinese charset (Big5)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Traditional
+	  Chinese(Big5).
+
+config NLS_CODEPAGE_932
+	tristate "Japanese charsets (Shift-JIS, EUC-JP)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Shift-JIS
+	  or EUC-JP. To use EUC-JP, you can use 'euc-jp' as mount option or
+	  NLS Default value during kernel configuration, instead of 'cp932'.
+
+config NLS_CODEPAGE_949
+	tristate "Korean charset (CP949, EUC-KR)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for UHC.
+
+config NLS_CODEPAGE_874
+	tristate "Thai charset (CP874, TIS-620)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Thai.
+
+config NLS_ISO8859_8
+	tristate "Hebrew charsets (ISO-8859-8, CP1255)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for ISO8859-8, the Hebrew
+	  character set.
+
+config NLS_CODEPAGE_1250
+	tristate "Windows CP1250 (Slavic/Central European Languages)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CDROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Windows CP-1250
+	  character set, which works for most Latin-written Slavic and Central
+	  European languages: Czech, German, Hungarian, Polish, Rumanian, Croatian,
+	  Slovak, Slovene.
+
+config NLS_CODEPAGE_1251
+	tristate "Windows CP1251 (Bulgarian, Belarusian)"
+	help
+	  The Microsoft FAT file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called DOS codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  DOS/Windows partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the DOS codepage for Russian and
+	  Bulgarian and Belarusian.
+
+config NLS_ASCII
+	tristate "ASCII (United States)"
+	help
+	  An ASCII NLS module is needed if you want to override the
+	  DEFAULT NLS with this very basic charset and don't want any
+	  non-ASCII characters to be translated.
+
+config NLS_ISO8859_1
+	tristate "NLS ISO 8859-1  (Latin 1; Western European Languages)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Latin 1 character
+	  set, which covers most West European languages such as Albanian,
+	  Catalan, Danish, Dutch, English, Faeroese, Finnish, French, German,
+	  Galician, Irish, Icelandic, Italian, Norwegian, Portuguese, Spanish,
+	  and Swedish. It is also the default for the US. If unsure, say Y.
+
+config NLS_ISO8859_2
+	tristate "NLS ISO 8859-2  (Latin 2; Slavic/Central European Languages)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Latin 2 character
+	  set, which works for most Latin-written Slavic and Central European
+	  languages: Czech, German, Hungarian, Polish, Rumanian, Croatian,
+	  Slovak, Slovene.
+
+config NLS_ISO8859_3
+	tristate "NLS ISO 8859-3  (Latin 3; Esperanto, Galician, Maltese, Turkish)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Latin 3 character
+	  set, which is popular with authors of Esperanto, Galician, Maltese,
+	  and Turkish.
+
+config NLS_ISO8859_4
+	tristate "NLS ISO 8859-4  (Latin 4; old Baltic charset)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Latin 4 character
+	  set which introduces letters for Estonian, Latvian, and
+	  Lithuanian. It is an incomplete predecessor of Latin 7.
+
+config NLS_ISO8859_5
+	tristate "NLS ISO 8859-5  (Cyrillic)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for ISO8859-5, a Cyrillic
+	  character set with which you can type Bulgarian, Belarusian,
+	  Macedonian, Russian, Serbian, and Ukrainian. Note that the charset
+	  KOI8-R is preferred in Russia.
+
+config NLS_ISO8859_6
+	tristate "NLS ISO 8859-6  (Arabic)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for ISO8859-6, the Arabic
+	  character set.
+
+config NLS_ISO8859_7
+	tristate "NLS ISO 8859-7  (Modern Greek)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for ISO8859-7, the Modern
+	  Greek character set.
+
+config NLS_ISO8859_9
+	tristate "NLS ISO 8859-9  (Latin 5; Turkish)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Latin 5 character
+	  set, and it replaces the rarely needed Icelandic letters in Latin 1
+	  with the Turkish ones. Useful in Turkey.
+
+config NLS_ISO8859_13
+	tristate "NLS ISO 8859-13 (Latin 7; Baltic)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Latin 7 character
+	  set, which supports modern Baltic languages including Latvian
+	  and Lithuanian.
+
+config NLS_ISO8859_14
+	tristate "NLS ISO 8859-14 (Latin 8; Celtic)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Latin 8 character
+	  set, which adds the last accented vowels for Welsh (aka Cymraeg)
+	  (and Manx Gaelic) that were missing in Latin 1.
+	  <http://linux.speech.cymru.org/> has further information.
+
+config NLS_ISO8859_15
+	tristate "NLS ISO 8859-15 (Latin 9; Western European Languages with Euro)"
+	---help---
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the Latin 9 character
+	  set, which covers most West European languages such as Albanian,
+	  Catalan, Danish, Dutch, English, Estonian, Faeroese, Finnish,
+	  French, German, Galician, Irish, Icelandic, Italian, Norwegian,
+	  Portuguese, Spanish, and Swedish. Latin 9 is an update to
+	  Latin 1 (ISO 8859-1) that removes a handful of rarely used
+	  characters and instead adds support for Estonian, corrects the
+	  support for French and Finnish, and adds the new Euro character.
+	  If unsure, say Y.
+
+config NLS_KOI8_R
+	tristate "NLS KOI8-R (Russian)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the preferred Russian
+	  character set.
+
+config NLS_KOI8_U
+	tristate "NLS KOI8-U/RU (Ukrainian, Belarusian)"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the preferred Ukrainian
+	  (koi8-u) and Belarusian (koi8-ru) character sets.
+
+config NLS_MAC_ROMAN
+	tristate "Codepage macroman"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  much of Europe -- United Kingdom, Germany, Spain, Italy, and [add
+	  more countries here].
+
+	  If unsure, say Y.
+
+config NLS_MAC_CELTIC
+	tristate "Codepage macceltic"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Celtic.
+
+	  If unsure, say Y.
+
+config NLS_MAC_CENTEURO
+	tristate "Codepage maccenteuro"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Central Europe.
+
+	  If unsure, say Y.
+
+config NLS_MAC_CROATIAN
+	tristate "Codepage maccroatian"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Croatian.
+
+	  If unsure, say Y.
+
+config NLS_MAC_CYRILLIC
+	tristate "Codepage maccyrillic"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Cyrillic.
+
+	  If unsure, say Y.
+
+config NLS_MAC_GAELIC
+	tristate "Codepage macgaelic"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Gaelic.
+
+	  If unsure, say Y.
+
+config NLS_MAC_GREEK
+	tristate "Codepage macgreek"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Greek.
+
+	  If unsure, say Y.
+
+config NLS_MAC_ICELAND
+	tristate "Codepage maciceland"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Iceland.
+
+	  If unsure, say Y.
+
+config NLS_MAC_INUIT
+	tristate "Codepage macinuit"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Inuit.
+
+	  If unsure, say Y.
+
+config NLS_MAC_ROMANIAN
+	tristate "Codepage macromanian"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Romanian.
+
+	  If unsure, say Y.
+
+config NLS_MAC_TURKISH
+	tristate "Codepage macturkish"
+	---help---
+	  The Apple HFS file system family can deal with filenames in
+	  native language character sets. These character sets are stored in
+	  so-called MAC codepages. You need to include the appropriate
+	  codepage if you want to be able to read/write these filenames on
+	  Mac partitions correctly. This does apply to the filenames
+	  only, not to the file contents. You can include several codepages;
+	  say Y here if you want to include the Mac codepage that is used for
+	  Turkish.
+
+	  If unsure, say Y.
+
+config NLS_UTF8
+	tristate "NLS UTF-8"
+	help
+	  If you want to display filenames with native language characters
+	  from the Microsoft FAT file system family or from JOLIET CD-ROMs
+	  correctly on the screen, you need to include the appropriate
+	  input/output character sets. Say Y here for the UTF-8 encoding of
+	  the Unicode/ISO9646 universal character set.
+
+endif # NLS
diff --git a/kernel/fs/nls/Makefile b/kernel/fs/nls/Makefile
new file mode 100644
index 000000000..8ae37c1b5
--- /dev/null
+++ b/kernel/fs/nls/Makefile
@@ -0,0 +1,55 @@
+#
+# Makefile for native language support
+#
+
+obj-$(CONFIG_NLS)		+= nls_base.o
+
+obj-$(CONFIG_NLS_CODEPAGE_437)	+= nls_cp437.o
+obj-$(CONFIG_NLS_CODEPAGE_737)	+= nls_cp737.o
+obj-$(CONFIG_NLS_CODEPAGE_775)	+= nls_cp775.o
+obj-$(CONFIG_NLS_CODEPAGE_850)	+= nls_cp850.o
+obj-$(CONFIG_NLS_CODEPAGE_852)	+= nls_cp852.o
+obj-$(CONFIG_NLS_CODEPAGE_855)	+= nls_cp855.o
+obj-$(CONFIG_NLS_CODEPAGE_857)	+= nls_cp857.o
+obj-$(CONFIG_NLS_CODEPAGE_860)	+= nls_cp860.o
+obj-$(CONFIG_NLS_CODEPAGE_861)	+= nls_cp861.o
+obj-$(CONFIG_NLS_CODEPAGE_862)	+= nls_cp862.o
+obj-$(CONFIG_NLS_CODEPAGE_863)	+= nls_cp863.o
+obj-$(CONFIG_NLS_CODEPAGE_864)	+= nls_cp864.o
+obj-$(CONFIG_NLS_CODEPAGE_865)	+= nls_cp865.o
+obj-$(CONFIG_NLS_CODEPAGE_866)	+= nls_cp866.o
+obj-$(CONFIG_NLS_CODEPAGE_869)	+= nls_cp869.o
+obj-$(CONFIG_NLS_CODEPAGE_874)	+= nls_cp874.o
+obj-$(CONFIG_NLS_CODEPAGE_932)	+= nls_cp932.o nls_euc-jp.o
+obj-$(CONFIG_NLS_CODEPAGE_936)	+= nls_cp936.o
+obj-$(CONFIG_NLS_CODEPAGE_949)	+= nls_cp949.o
+obj-$(CONFIG_NLS_CODEPAGE_950)	+= nls_cp950.o
+obj-$(CONFIG_NLS_CODEPAGE_1250) += nls_cp1250.o
+obj-$(CONFIG_NLS_CODEPAGE_1251)	+= nls_cp1251.o
+obj-$(CONFIG_NLS_ASCII)		+= nls_ascii.o
+obj-$(CONFIG_NLS_ISO8859_1)	+= nls_iso8859-1.o
+obj-$(CONFIG_NLS_ISO8859_2)	+= nls_iso8859-2.o
+obj-$(CONFIG_NLS_ISO8859_3)	+= nls_iso8859-3.o
+obj-$(CONFIG_NLS_ISO8859_4)	+= nls_iso8859-4.o
+obj-$(CONFIG_NLS_ISO8859_5)	+= nls_iso8859-5.o
+obj-$(CONFIG_NLS_ISO8859_6)	+= nls_iso8859-6.o
+obj-$(CONFIG_NLS_ISO8859_7)	+= nls_iso8859-7.o
+obj-$(CONFIG_NLS_ISO8859_8)	+= nls_cp1255.o
+obj-$(CONFIG_NLS_ISO8859_9)	+= nls_iso8859-9.o
+obj-$(CONFIG_NLS_ISO8859_13)	+= nls_iso8859-13.o
+obj-$(CONFIG_NLS_ISO8859_14)	+= nls_iso8859-14.o
+obj-$(CONFIG_NLS_ISO8859_15)	+= nls_iso8859-15.o
+obj-$(CONFIG_NLS_KOI8_R)	+= nls_koi8-r.o
+obj-$(CONFIG_NLS_KOI8_U)	+= nls_koi8-u.o nls_koi8-ru.o
+obj-$(CONFIG_NLS_UTF8)		+= nls_utf8.o
+obj-$(CONFIG_NLS_MAC_CELTIC)    += mac-celtic.o
+obj-$(CONFIG_NLS_MAC_CENTEURO)  += mac-centeuro.o
+obj-$(CONFIG_NLS_MAC_CROATIAN)  += mac-croatian.o
+obj-$(CONFIG_NLS_MAC_CYRILLIC)  += mac-cyrillic.o
+obj-$(CONFIG_NLS_MAC_GAELIC)    += mac-gaelic.o
+obj-$(CONFIG_NLS_MAC_GREEK)     += mac-greek.o
+obj-$(CONFIG_NLS_MAC_ICELAND)   += mac-iceland.o
+obj-$(CONFIG_NLS_MAC_INUIT)     += mac-inuit.o
+obj-$(CONFIG_NLS_MAC_ROMANIAN)  += mac-romanian.o
+obj-$(CONFIG_NLS_MAC_ROMAN)     += mac-roman.o
+obj-$(CONFIG_NLS_MAC_TURKISH)   += mac-turkish.o
diff --git a/kernel/fs/nls/mac-celtic.c b/kernel/fs/nls/mac-celtic.c
new file mode 100644
index 000000000..266c2d7d5
--- /dev/null
+++ b/kernel/fs/nls/mac-celtic.c
@@ -0,0 +1,601 @@
+/*
+ * linux/fs/nls/mac-celtic.c
+ *
+ * Charset macceltic translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x00c5, 0x00c7, 0x00c9,
+	0x00d1, 0x00d6, 0x00dc, 0x00e1,
+	0x00e0, 0x00e2, 0x00e4, 0x00e3,
+	0x00e5, 0x00e7, 0x00e9, 0x00e8,
+	/* 0x90 */
+	0x00ea, 0x00eb, 0x00ed, 0x00ec,
+	0x00ee, 0x00ef, 0x00f1, 0x00f3,
+	0x00f2, 0x00f4, 0x00f6, 0x00f5,
+	0x00fa, 0x00f9, 0x00fb, 0x00fc,
+	/* 0xa0 */
+	0x2020, 0x00b0, 0x00a2, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x00df,
+	0x00ae, 0x00a9, 0x2122, 0x00b4,
+	0x00a8, 0x2260, 0x00c6, 0x00d8,
+	/* 0xb0 */
+	0x221e, 0x00b1, 0x2264, 0x2265,
+	0x00a5, 0x00b5, 0x2202, 0x2211,
+	0x220f, 0x03c0, 0x222b, 0x00aa,
+	0x00ba, 0x03a9, 0x00e6, 0x00f8,
+	/* 0xc0 */
+	0x00bf, 0x00a1, 0x00ac, 0x221a,
+	0x0192, 0x2248, 0x2206, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x00c0,
+	0x00c3, 0x00d5, 0x0152, 0x0153,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x25ca,
+	0x00ff, 0x0178, 0x2044, 0x20ac,
+	0x2039, 0x203a, 0x0176, 0x0177,
+	/* 0xe0 */
+	0x2021, 0x00b7, 0x1ef2, 0x1ef3,
+	0x2030, 0x00c2, 0x00ca, 0x00c1,
+	0x00cb, 0x00c8, 0x00cd, 0x00ce,
+	0x00cf, 0x00cc, 0x00d3, 0x00d4,
+	/* 0xf0 */
+	0x2663, 0x00d2, 0x00da, 0x00db,
+	0x00d9, 0x0131, 0x00dd, 0x00fd,
+	0x0174, 0x0175, 0x1e84, 0x1e85,
+	0x1e80, 0x1e81, 0x1e82, 0x1e83,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */
+	0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */
+	0x00, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */
+	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */
+	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */
+	0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */
+	0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0xf6, 0x00, 0xa7, /* 0xd8-0xdf */
+	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */
+	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */
+	0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */
+	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0xf7, 0x00, 0xd8, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xde, 0xdf, /* 0x70-0x77 */
+	0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page1e[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0xfc, 0xfd, 0xfe, 0xff, 0xfa, 0xfb, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0xe2, 0xe3, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */
+	0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */
+	0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page25[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page26[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   page03, NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   page1e, NULL,
+	page20, page21, page22, NULL,   NULL,   page25, page26, NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x00-0x07 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x08-0x0f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x10-0x17 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x18-0x1f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x20-0x27 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x28-0x2f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x30-0x37 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x38-0x3f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x40-0x47 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x48-0x4f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x50-0x57 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x58-0x5f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x60-0x67 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x68-0x6f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x70-0x77 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x78-0x7f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x80-0x87 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x88-0x8f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x90-0x97 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x98-0x9f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa0-0xa7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa8-0xaf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb0-0xb7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb8-0xbf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc0-0xc7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc8-0xcf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd0-0xd7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd8-0xdf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe0-0xe7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe8-0xef */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0-0xf7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "macceltic",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_macceltic(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_macceltic(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_macceltic)
+module_exit(exit_nls_macceltic)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-centeuro.c b/kernel/fs/nls/mac-centeuro.c
new file mode 100644
index 000000000..9789c6057
--- /dev/null
+++ b/kernel/fs/nls/mac-centeuro.c
@@ -0,0 +1,531 @@
+/*
+ * linux/fs/nls/mac-centeuro.c
+ *
+ * Charset maccenteuro translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x0100, 0x0101, 0x00c9,
+	0x0104, 0x00d6, 0x00dc, 0x00e1,
+	0x0105, 0x010c, 0x00e4, 0x010d,
+	0x0106, 0x0107, 0x00e9, 0x0179,
+	/* 0x90 */
+	0x017a, 0x010e, 0x00ed, 0x010f,
+	0x0112, 0x0113, 0x0116, 0x00f3,
+	0x0117, 0x00f4, 0x00f6, 0x00f5,
+	0x00fa, 0x011a, 0x011b, 0x00fc,
+	/* 0xa0 */
+	0x2020, 0x00b0, 0x0118, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x00df,
+	0x00ae, 0x00a9, 0x2122, 0x0119,
+	0x00a8, 0x2260, 0x0123, 0x012e,
+	/* 0xb0 */
+	0x012f, 0x012a, 0x2264, 0x2265,
+	0x012b, 0x0136, 0x2202, 0x2211,
+	0x0142, 0x013b, 0x013c, 0x013d,
+	0x013e, 0x0139, 0x013a, 0x0145,
+	/* 0xc0 */
+	0x0146, 0x0143, 0x00ac, 0x221a,
+	0x0144, 0x0147, 0x2206, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x0148,
+	0x0150, 0x00d5, 0x0151, 0x014c,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x25ca,
+	0x014d, 0x0154, 0x0155, 0x0158,
+	0x2039, 0x203a, 0x0159, 0x0156,
+	/* 0xe0 */
+	0x0157, 0x0160, 0x201a, 0x201e,
+	0x0161, 0x015a, 0x015b, 0x00c1,
+	0x0164, 0x0165, 0x00cd, 0x017d,
+	0x017e, 0x016a, 0x00d3, 0x00d4,
+	/* 0xf0 */
+	0x016b, 0x016e, 0x00da, 0x016f,
+	0x0170, 0x0171, 0x0172, 0x0173,
+	0x00dd, 0x00fd, 0x0137, 0x017b,
+	0x0141, 0x017c, 0x0122, 0x02c7,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0xac, 0xa9, 0x00, 0xc7, 0xc2, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */
+	0xa1, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0xe7, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x83, 0x00, 0x00, 0x00, 0xea, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0xf2, 0x00, 0x86, 0xf8, 0x00, 0xa7, /* 0xd8-0xdf */
+	0x00, 0x87, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x8e, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x9c, 0x00, 0x9f, 0xf9, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x81, 0x82, 0x00, 0x00, 0x84, 0x88, 0x8c, 0x8d, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x8b, 0x91, 0x93, /* 0x08-0x0f */
+	0x00, 0x00, 0x94, 0x95, 0x00, 0x00, 0x96, 0x98, /* 0x10-0x17 */
+	0xa2, 0xab, 0x9d, 0x9e, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xfe, 0xae, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0xb1, 0xb4, 0x00, 0x00, 0xaf, 0xb0, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb5, 0xfa, /* 0x30-0x37 */
+	0x00, 0xbd, 0xbe, 0xb9, 0xba, 0xbb, 0xbc, 0x00, /* 0x38-0x3f */
+	0x00, 0xfc, 0xb8, 0xc1, 0xc4, 0xbf, 0xc0, 0xc5, /* 0x40-0x47 */
+	0xcb, 0x00, 0x00, 0x00, 0xcf, 0xd8, 0x00, 0x00, /* 0x48-0x4f */
+	0xcc, 0xce, 0x00, 0x00, 0xd9, 0xda, 0xdf, 0xe0, /* 0x50-0x57 */
+	0xdb, 0xde, 0xe5, 0xe6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xe1, 0xe4, 0x00, 0x00, 0xe8, 0xe9, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0xed, 0xf0, 0x00, 0x00, 0xf1, 0xf3, /* 0x68-0x6f */
+	0xf4, 0xf5, 0xf6, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x8f, 0x90, 0xfb, 0xfd, 0xeb, 0xec, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */
+	0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page25[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, page22, NULL,   NULL,   page25, NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "maccenteuro",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_maccenteuro(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_maccenteuro(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_maccenteuro)
+module_exit(exit_nls_maccenteuro)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-croatian.c b/kernel/fs/nls/mac-croatian.c
new file mode 100644
index 000000000..bb19e7a07
--- /dev/null
+++ b/kernel/fs/nls/mac-croatian.c
@@ -0,0 +1,601 @@
+/*
+ * linux/fs/nls/mac-croatian.c
+ *
+ * Charset maccroatian translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x00c5, 0x00c7, 0x00c9,
+	0x00d1, 0x00d6, 0x00dc, 0x00e1,
+	0x00e0, 0x00e2, 0x00e4, 0x00e3,
+	0x00e5, 0x00e7, 0x00e9, 0x00e8,
+	/* 0x90 */
+	0x00ea, 0x00eb, 0x00ed, 0x00ec,
+	0x00ee, 0x00ef, 0x00f1, 0x00f3,
+	0x00f2, 0x00f4, 0x00f6, 0x00f5,
+	0x00fa, 0x00f9, 0x00fb, 0x00fc,
+	/* 0xa0 */
+	0x2020, 0x00b0, 0x00a2, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x00df,
+	0x00ae, 0x0160, 0x2122, 0x00b4,
+	0x00a8, 0x2260, 0x017d, 0x00d8,
+	/* 0xb0 */
+	0x221e, 0x00b1, 0x2264, 0x2265,
+	0x2206, 0x00b5, 0x2202, 0x2211,
+	0x220f, 0x0161, 0x222b, 0x00aa,
+	0x00ba, 0x03a9, 0x017e, 0x00f8,
+	/* 0xc0 */
+	0x00bf, 0x00a1, 0x00ac, 0x221a,
+	0x0192, 0x2248, 0x0106, 0x00ab,
+	0x010c, 0x2026, 0x00a0, 0x00c0,
+	0x00c3, 0x00d5, 0x0152, 0x0153,
+	/* 0xd0 */
+	0x0110, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x25ca,
+	0xf8ff, 0x00a9, 0x2044, 0x20ac,
+	0x2039, 0x203a, 0x00c6, 0x00bb,
+	/* 0xe0 */
+	0x2013, 0x00b7, 0x201a, 0x201e,
+	0x2030, 0x00c2, 0x0107, 0x00c1,
+	0x010d, 0x00c8, 0x00cd, 0x00ce,
+	0x00cf, 0x00cc, 0x00d3, 0x00d4,
+	/* 0xf0 */
+	0x0111, 0x00d2, 0x00da, 0x00db,
+	0x00d9, 0x0131, 0x02c6, 0x02dc,
+	0x00af, 0x03c0, 0x00cb, 0x02da,
+	0x00b8, 0x00ca, 0x00e6, 0x02c7,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0xc1, 0xa2, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0xac, 0xd9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */
+	0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */
+	0xfc, 0x00, 0xbc, 0xdf, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */
+	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xde, 0x82, /* 0xc0-0xc7 */
+	0xe9, 0x83, 0xfd, 0xfa, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */
+	0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */
+	0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */
+	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xfe, 0x8d, /* 0xe0-0xe7 */
+	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */
+	0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */
+	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xe6, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */
+	0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xa9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xae, 0xbe, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0xfb, 0x00, 0xf7, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xe0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */
+	0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xb4, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */
+	0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page25[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char pagef8[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd8, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, page03, NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, page22, NULL,   NULL,   page25, NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	pagef8, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "maccroatian",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_maccroatian(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_maccroatian(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_maccroatian)
+module_exit(exit_nls_maccroatian)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-cyrillic.c b/kernel/fs/nls/mac-cyrillic.c
new file mode 100644
index 000000000..2a7dea36a
--- /dev/null
+++ b/kernel/fs/nls/mac-cyrillic.c
@@ -0,0 +1,496 @@
+/*
+ * linux/fs/nls/mac-cyrillic.c
+ *
+ * Charset maccyrillic translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x0410, 0x0411, 0x0412, 0x0413,
+	0x0414, 0x0415, 0x0416, 0x0417,
+	0x0418, 0x0419, 0x041a, 0x041b,
+	0x041c, 0x041d, 0x041e, 0x041f,
+	/* 0x90 */
+	0x0420, 0x0421, 0x0422, 0x0423,
+	0x0424, 0x0425, 0x0426, 0x0427,
+	0x0428, 0x0429, 0x042a, 0x042b,
+	0x042c, 0x042d, 0x042e, 0x042f,
+	/* 0xa0 */
+	0x2020, 0x00b0, 0x0490, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x0406,
+	0x00ae, 0x00a9, 0x2122, 0x0402,
+	0x0452, 0x2260, 0x0403, 0x0453,
+	/* 0xb0 */
+	0x221e, 0x00b1, 0x2264, 0x2265,
+	0x0456, 0x00b5, 0x0491, 0x0408,
+	0x0404, 0x0454, 0x0407, 0x0457,
+	0x0409, 0x0459, 0x040a, 0x045a,
+	/* 0xc0 */
+	0x0458, 0x0405, 0x00ac, 0x221a,
+	0x0192, 0x2248, 0x2206, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x040b,
+	0x045b, 0x040c, 0x045c, 0x0455,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x201e,
+	0x040e, 0x045e, 0x040f, 0x045f,
+	0x2116, 0x0401, 0x0451, 0x044f,
+	/* 0xe0 */
+	0x0430, 0x0431, 0x0432, 0x0433,
+	0x0434, 0x0435, 0x0436, 0x0437,
+	0x0438, 0x0439, 0x043a, 0x043b,
+	0x043c, 0x043d, 0x043e, 0x043f,
+	/* 0xf0 */
+	0x0440, 0x0441, 0x0442, 0x0443,
+	0x0444, 0x0445, 0x0446, 0x0447,
+	0x0448, 0x0449, 0x044a, 0x044b,
+	0x044c, 0x044d, 0x044e, 0x20ac,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0x00, 0xa9, 0x00, 0xc7, 0xc2, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */
+	0xa1, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xa6, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd6, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page04[256] = {
+	0x00, 0xdd, 0xab, 0xae, 0xb8, 0xc1, 0xa7, 0xba, /* 0x00-0x07 */
+	0xb7, 0xbc, 0xbe, 0xcb, 0xcd, 0x00, 0xd8, 0xda, /* 0x08-0x0f */
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x10-0x17 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x18-0x1f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x20-0x27 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x28-0x2f */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0x48-0x4f */
+	0x00, 0xde, 0xac, 0xaf, 0xb9, 0xcf, 0xb4, 0xbb, /* 0x50-0x57 */
+	0xc0, 0xbd, 0xbf, 0xcc, 0xce, 0x00, 0xd9, 0xdb, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xa2, 0xb6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0xd7, 0x00, /* 0x18-0x1f */
+	0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   NULL,   page04, NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, page22, NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "maccyrillic",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_maccyrillic(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_maccyrillic(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_maccyrillic)
+module_exit(exit_nls_maccyrillic)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-gaelic.c b/kernel/fs/nls/mac-gaelic.c
new file mode 100644
index 000000000..77b001653
--- /dev/null
+++ b/kernel/fs/nls/mac-gaelic.c
@@ -0,0 +1,566 @@
+/*
+ * linux/fs/nls/mac-gaelic.c
+ *
+ * Charset macgaelic translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x00c5, 0x00c7, 0x00c9,
+	0x00d1, 0x00d6, 0x00dc, 0x00e1,
+	0x00e0, 0x00e2, 0x00e4, 0x00e3,
+	0x00e5, 0x00e7, 0x00e9, 0x00e8,
+	/* 0x90 */
+	0x00ea, 0x00eb, 0x00ed, 0x00ec,
+	0x00ee, 0x00ef, 0x00f1, 0x00f3,
+	0x00f2, 0x00f4, 0x00f6, 0x00f5,
+	0x00fa, 0x00f9, 0x00fb, 0x00fc,
+	/* 0xa0 */
+	0x2020, 0x00b0, 0x00a2, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x00df,
+	0x00ae, 0x00a9, 0x2122, 0x00b4,
+	0x00a8, 0x2260, 0x00c6, 0x00d8,
+	/* 0xb0 */
+	0x1e02, 0x00b1, 0x2264, 0x2265,
+	0x1e03, 0x010a, 0x010b, 0x1e0a,
+	0x1e0b, 0x1e1e, 0x1e1f, 0x0120,
+	0x0121, 0x1e40, 0x00e6, 0x00f8,
+	/* 0xc0 */
+	0x1e41, 0x1e56, 0x1e57, 0x027c,
+	0x0192, 0x017f, 0x1e60, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x00c0,
+	0x00c3, 0x00d5, 0x0152, 0x0153,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x1e61, 0x1e9b,
+	0x00ff, 0x0178, 0x1e6a, 0x20ac,
+	0x2039, 0x203a, 0x0176, 0x0177,
+	/* 0xe0 */
+	0x1e6b, 0x00b7, 0x1ef2, 0x1ef3,
+	0x204a, 0x00c2, 0x00ca, 0x00c1,
+	0x00cb, 0x00c8, 0x00cd, 0x00ce,
+	0x00cf, 0x00cc, 0x00d3, 0x00d4,
+	/* 0xf0 */
+	0x2663, 0x00d2, 0x00da, 0x00db,
+	0x00d9, 0x0131, 0x00dd, 0x00fd,
+	0x0174, 0x0175, 0x1e84, 0x1e85,
+	0x1e80, 0x1e81, 0x1e82, 0x1e83,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0x00, 0xa2, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0xac, 0xa9, 0x00, 0xc7, 0x00, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */
+	0xa1, 0xb1, 0x00, 0x00, 0xab, 0x00, 0xa6, 0xe1, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */
+	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */
+	0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */
+	0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0xf6, 0x00, 0xa7, /* 0xd8-0xdf */
+	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */
+	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */
+	0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0x00, /* 0xf0-0xf7 */
+	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0xf7, 0x00, 0xd8, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0xb5, 0xb6, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0xbb, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xde, 0xdf, /* 0x70-0x77 */
+	0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc5, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page1e[256] = {
+	0x00, 0x00, 0xb0, 0xb4, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0xb7, 0xb8, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0xba, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0xbd, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc1, 0xc2, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0xda, 0xe0, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0xfc, 0xfd, 0xfe, 0xff, 0xfa, 0xfb, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0xe2, 0xe3, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */
+	0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page26[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   page1e, NULL,
+	page20, page21, page22, NULL,   NULL,   NULL,   page26, NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x00-0x07 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x08-0x0f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x10-0x17 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x18-0x1f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x20-0x27 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x28-0x2f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x30-0x37 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x38-0x3f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x40-0x47 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x48-0x4f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x50-0x57 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x58-0x5f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x60-0x67 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x68-0x6f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x70-0x77 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x78-0x7f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x80-0x87 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x88-0x8f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x90-0x97 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x98-0x9f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa0-0xa7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa8-0xaf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb0-0xb7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb8-0xbf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc0-0xc7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc8-0xcf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd0-0xd7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd8-0xdf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe0-0xe7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe8-0xef */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0-0xf7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "macgaelic",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_macgaelic(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_macgaelic(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_macgaelic)
+module_exit(exit_nls_macgaelic)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-greek.c b/kernel/fs/nls/mac-greek.c
new file mode 100644
index 000000000..1eccf499e
--- /dev/null
+++ b/kernel/fs/nls/mac-greek.c
@@ -0,0 +1,496 @@
+/*
+ * linux/fs/nls/mac-greek.c
+ *
+ * Charset macgreek translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x00b9, 0x00b2, 0x00c9,
+	0x00b3, 0x00d6, 0x00dc, 0x0385,
+	0x00e0, 0x00e2, 0x00e4, 0x0384,
+	0x00a8, 0x00e7, 0x00e9, 0x00e8,
+	/* 0x90 */
+	0x00ea, 0x00eb, 0x00a3, 0x2122,
+	0x00ee, 0x00ef, 0x2022, 0x00bd,
+	0x2030, 0x00f4, 0x00f6, 0x00a6,
+	0x20ac, 0x00f9, 0x00fb, 0x00fc,
+	/* 0xa0 */
+	0x2020, 0x0393, 0x0394, 0x0398,
+	0x039b, 0x039e, 0x03a0, 0x00df,
+	0x00ae, 0x00a9, 0x03a3, 0x03aa,
+	0x00a7, 0x2260, 0x00b0, 0x00b7,
+	/* 0xb0 */
+	0x0391, 0x00b1, 0x2264, 0x2265,
+	0x00a5, 0x0392, 0x0395, 0x0396,
+	0x0397, 0x0399, 0x039a, 0x039c,
+	0x03a6, 0x03ab, 0x03a8, 0x03a9,
+	/* 0xc0 */
+	0x03ac, 0x039d, 0x00ac, 0x039f,
+	0x03a1, 0x2248, 0x03a4, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x03a5,
+	0x03a7, 0x0386, 0x0388, 0x0153,
+	/* 0xd0 */
+	0x2013, 0x2015, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x0389,
+	0x038a, 0x038c, 0x038e, 0x03ad,
+	0x03ae, 0x03af, 0x03cc, 0x038f,
+	/* 0xe0 */
+	0x03cd, 0x03b1, 0x03b2, 0x03c8,
+	0x03b4, 0x03b5, 0x03c6, 0x03b3,
+	0x03b7, 0x03b9, 0x03be, 0x03ba,
+	0x03bb, 0x03bc, 0x03bd, 0x03bf,
+	/* 0xf0 */
+	0x03c0, 0x03ce, 0x03c1, 0x03c3,
+	0x03c4, 0x03b8, 0x03c9, 0x03c2,
+	0x03c7, 0x03c5, 0x03b6, 0x03ca,
+	0x03cb, 0x0390, 0x03b0, 0x00ad,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0x00, 0x00, 0x92, 0x00, 0xb4, 0x9b, 0xac, /* 0xa0-0xa7 */
+	0x8c, 0xa9, 0x00, 0xc7, 0xc2, 0xff, 0xa8, 0x00, /* 0xa8-0xaf */
+	0xae, 0xb1, 0x82, 0x84, 0x00, 0x00, 0x00, 0xaf, /* 0xb0-0xb7 */
+	0x00, 0x81, 0x00, 0xc8, 0x00, 0x97, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */
+	0x88, 0x00, 0x89, 0x00, 0x8a, 0x00, 0x00, 0x8d, /* 0xe0-0xe7 */
+	0x8f, 0x8e, 0x90, 0x91, 0x00, 0x00, 0x94, 0x95, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x9a, 0xd6, /* 0xf0-0xf7 */
+	0x00, 0x9d, 0x00, 0x9e, 0x9f, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x8b, 0x87, 0xcd, 0x00, /* 0x80-0x87 */
+	0xce, 0xd7, 0xd8, 0x00, 0xd9, 0x00, 0xda, 0xdf, /* 0x88-0x8f */
+	0xfd, 0xb0, 0xb5, 0xa1, 0xa2, 0xb6, 0xb7, 0xb8, /* 0x90-0x97 */
+	0xa3, 0xb9, 0xba, 0xa4, 0xbb, 0xc1, 0xa5, 0xc3, /* 0x98-0x9f */
+	0xa6, 0xc4, 0x00, 0xaa, 0xc6, 0xcb, 0xbc, 0xcc, /* 0xa0-0xa7 */
+	0xbe, 0xbf, 0xab, 0xbd, 0xc0, 0xdb, 0xdc, 0xdd, /* 0xa8-0xaf */
+	0xfe, 0xe1, 0xe2, 0xe7, 0xe4, 0xe5, 0xfa, 0xe8, /* 0xb0-0xb7 */
+	0xf5, 0xe9, 0xeb, 0xec, 0xed, 0xee, 0xea, 0xef, /* 0xb8-0xbf */
+	0xf0, 0xf2, 0xf7, 0xf3, 0xf4, 0xf9, 0xe6, 0xf8, /* 0xc0-0xc7 */
+	0xe3, 0xf6, 0xfb, 0xfc, 0xde, 0xe0, 0xf1, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */
+	0xa0, 0x00, 0x96, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   page03, NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, page22, NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "macgreek",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_macgreek(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_macgreek(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_macgreek)
+module_exit(exit_nls_macgreek)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-iceland.c b/kernel/fs/nls/mac-iceland.c
new file mode 100644
index 000000000..cbd0875c6
--- /dev/null
+++ b/kernel/fs/nls/mac-iceland.c
@@ -0,0 +1,601 @@
+/*
+ * linux/fs/nls/mac-iceland.c
+ *
+ * Charset maciceland translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x00c5, 0x00c7, 0x00c9,
+	0x00d1, 0x00d6, 0x00dc, 0x00e1,
+	0x00e0, 0x00e2, 0x00e4, 0x00e3,
+	0x00e5, 0x00e7, 0x00e9, 0x00e8,
+	/* 0x90 */
+	0x00ea, 0x00eb, 0x00ed, 0x00ec,
+	0x00ee, 0x00ef, 0x00f1, 0x00f3,
+	0x00f2, 0x00f4, 0x00f6, 0x00f5,
+	0x00fa, 0x00f9, 0x00fb, 0x00fc,
+	/* 0xa0 */
+	0x00dd, 0x00b0, 0x00a2, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x00df,
+	0x00ae, 0x00a9, 0x2122, 0x00b4,
+	0x00a8, 0x2260, 0x00c6, 0x00d8,
+	/* 0xb0 */
+	0x221e, 0x00b1, 0x2264, 0x2265,
+	0x00a5, 0x00b5, 0x2202, 0x2211,
+	0x220f, 0x03c0, 0x222b, 0x00aa,
+	0x00ba, 0x03a9, 0x00e6, 0x00f8,
+	/* 0xc0 */
+	0x00bf, 0x00a1, 0x00ac, 0x221a,
+	0x0192, 0x2248, 0x2206, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x00c0,
+	0x00c3, 0x00d5, 0x0152, 0x0153,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x25ca,
+	0x00ff, 0x0178, 0x2044, 0x20ac,
+	0x00d0, 0x00f0, 0x00de, 0x00fe,
+	/* 0xe0 */
+	0x00fd, 0x00b7, 0x201a, 0x201e,
+	0x2030, 0x00c2, 0x00ca, 0x00c1,
+	0x00cb, 0x00c8, 0x00cd, 0x00ce,
+	0x00cf, 0x00cc, 0x00d3, 0x00d4,
+	/* 0xf0 */
+	0xf8ff, 0x00d2, 0x00da, 0x00db,
+	0x00d9, 0x0131, 0x02c6, 0x02dc,
+	0x00af, 0x02d8, 0x02d9, 0x02da,
+	0x00b8, 0x02dd, 0x02db, 0x02c7,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */
+	0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */
+	0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */
+	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */
+	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */
+	0xdc, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */
+	0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0xa0, 0xde, 0xa7, /* 0xd8-0xdf */
+	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */
+	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */
+	0xdd, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */
+	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0xe0, 0xdf, 0xd8, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */
+	0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page25[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char pagef8[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, page03, NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, page22, NULL,   NULL,   page25, NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	pagef8, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "maciceland",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_maciceland(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_maciceland(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_maciceland)
+module_exit(exit_nls_maciceland)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-inuit.c b/kernel/fs/nls/mac-inuit.c
new file mode 100644
index 000000000..fba8357aa
--- /dev/null
+++ b/kernel/fs/nls/mac-inuit.c
@@ -0,0 +1,531 @@
+/*
+ * linux/fs/nls/mac-inuit.c
+ *
+ * Charset macinuit translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x1403, 0x1404, 0x1405, 0x1406,
+	0x140a, 0x140b, 0x1431, 0x1432,
+	0x1433, 0x1434, 0x1438, 0x1439,
+	0x1449, 0x144e, 0x144f, 0x1450,
+	/* 0x90 */
+	0x1451, 0x1455, 0x1456, 0x1466,
+	0x146d, 0x146e, 0x146f, 0x1470,
+	0x1472, 0x1473, 0x1483, 0x148b,
+	0x148c, 0x148d, 0x148e, 0x1490,
+	/* 0xa0 */
+	0x1491, 0x00b0, 0x14a1, 0x14a5,
+	0x14a6, 0x2022, 0x00b6, 0x14a7,
+	0x00ae, 0x00a9, 0x2122, 0x14a8,
+	0x14aa, 0x14ab, 0x14bb, 0x14c2,
+	/* 0xb0 */
+	0x14c3, 0x14c4, 0x14c5, 0x14c7,
+	0x14c8, 0x14d0, 0x14ef, 0x14f0,
+	0x14f1, 0x14f2, 0x14f4, 0x14f5,
+	0x1505, 0x14d5, 0x14d6, 0x14d7,
+	/* 0xc0 */
+	0x14d8, 0x14da, 0x14db, 0x14ea,
+	0x1528, 0x1529, 0x152a, 0x152b,
+	0x152d, 0x2026, 0x00a0, 0x152e,
+	0x153e, 0x1555, 0x1556, 0x1557,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x1558, 0x1559,
+	0x155a, 0x155d, 0x1546, 0x1547,
+	0x1548, 0x1549, 0x154b, 0x154c,
+	/* 0xe0 */
+	0x1550, 0x157f, 0x1580, 0x1581,
+	0x1582, 0x1583, 0x1584, 0x1585,
+	0x158f, 0x1590, 0x1591, 0x1592,
+	0x1593, 0x1594, 0x1595, 0x1671,
+	/* 0xf0 */
+	0x1672, 0x1673, 0x1674, 0x1675,
+	0x1676, 0x1596, 0x15a0, 0x15a1,
+	0x15a2, 0x15a3, 0x15a4, 0x15a5,
+	0x15a6, 0x157c, 0x0141, 0x0142,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xa9, 0x00, 0x00, 0x00, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */
+	0xa1, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0xfe, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page14[256] = {
+	0x00, 0x00, 0x00, 0x80, 0x81, 0x82, 0x83, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x84, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x86, 0x87, 0x88, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x8a, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x8d, 0x8e, /* 0x48-0x4f */
+	0x8f, 0x90, 0x00, 0x00, 0x00, 0x91, 0x92, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x95, 0x96, /* 0x68-0x6f */
+	0x97, 0x00, 0x98, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x88-0x8f */
+	0x9f, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0xa2, 0x00, 0x00, 0x00, 0xa3, 0xa4, 0xa7, /* 0xa0-0xa7 */
+	0xab, 0x00, 0xac, 0xad, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0xaf, 0xb0, 0xb1, 0xb2, 0x00, 0xb3, /* 0xc0-0xc7 */
+	0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0xb5, 0x00, 0x00, 0x00, 0x00, 0xbd, 0xbe, 0xbf, /* 0xd0-0xd7 */
+	0xc0, 0x00, 0xc1, 0xc2, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0x00, 0xb6, /* 0xe8-0xef */
+	0xb7, 0xb8, 0xb9, 0x00, 0xba, 0xbb, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page15[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0xc4, 0xc5, 0xc6, 0xc7, 0x00, 0xc8, 0xcb, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xcc, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xda, 0xdb, /* 0x40-0x47 */
+	0xdc, 0xdd, 0x00, 0xde, 0xdf, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xe0, 0x00, 0x00, 0x00, 0x00, 0xcd, 0xce, 0xcf, /* 0x50-0x57 */
+	0xd6, 0xd7, 0xd8, 0x00, 0x00, 0xd9, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x00, 0xe1, /* 0x78-0x7f */
+	0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, /* 0x88-0x8f */
+	0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xf5, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page16[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   page14, page15, page16, NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x00-0x07 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x08-0x0f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x10-0x17 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x18-0x1f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x20-0x27 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x28-0x2f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x30-0x37 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x38-0x3f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x40-0x47 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x48-0x4f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x50-0x57 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x58-0x5f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x60-0x67 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x68-0x6f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x70-0x77 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x78-0x7f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x80-0x87 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x88-0x8f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x90-0x97 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x98-0x9f */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa0-0xa7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa8-0xaf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb0-0xb7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb8-0xbf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc0-0xc7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc8-0xcf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd0-0xd7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd8-0xdf */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe0-0xe7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe8-0xef */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0-0xf7 */
+	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "macinuit",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_macinuit(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_macinuit(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_macinuit)
+module_exit(exit_nls_macinuit)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-roman.c b/kernel/fs/nls/mac-roman.c
new file mode 100644
index 000000000..b6a98a520
--- /dev/null
+++ b/kernel/fs/nls/mac-roman.c
@@ -0,0 +1,636 @@
+/*
+ * linux/fs/nls/mac-roman.c
+ *
+ * Charset macroman translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x00c5, 0x00c7, 0x00c9,
+	0x00d1, 0x00d6, 0x00dc, 0x00e1,
+	0x00e0, 0x00e2, 0x00e4, 0x00e3,
+	0x00e5, 0x00e7, 0x00e9, 0x00e8,
+	/* 0x90 */
+	0x00ea, 0x00eb, 0x00ed, 0x00ec,
+	0x00ee, 0x00ef, 0x00f1, 0x00f3,
+	0x00f2, 0x00f4, 0x00f6, 0x00f5,
+	0x00fa, 0x00f9, 0x00fb, 0x00fc,
+	/* 0xa0 */
+	0x2020, 0x00b0, 0x00a2, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x00df,
+	0x00ae, 0x00a9, 0x2122, 0x00b4,
+	0x00a8, 0x2260, 0x00c6, 0x00d8,
+	/* 0xb0 */
+	0x221e, 0x00b1, 0x2264, 0x2265,
+	0x00a5, 0x00b5, 0x2202, 0x2211,
+	0x220f, 0x03c0, 0x222b, 0x00aa,
+	0x00ba, 0x03a9, 0x00e6, 0x00f8,
+	/* 0xc0 */
+	0x00bf, 0x00a1, 0x00ac, 0x221a,
+	0x0192, 0x2248, 0x2206, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x00c0,
+	0x00c3, 0x00d5, 0x0152, 0x0153,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x25ca,
+	0x00ff, 0x0178, 0x2044, 0x20ac,
+	0x2039, 0x203a, 0xfb01, 0xfb02,
+	/* 0xe0 */
+	0x2021, 0x00b7, 0x201a, 0x201e,
+	0x2030, 0x00c2, 0x00ca, 0x00c1,
+	0x00cb, 0x00c8, 0x00cd, 0x00ce,
+	0x00cf, 0x00cc, 0x00d3, 0x00d4,
+	/* 0xf0 */
+	0xf8ff, 0x00d2, 0x00da, 0x00db,
+	0x00d9, 0x0131, 0x02c6, 0x02dc,
+	0x00af, 0x02d8, 0x02d9, 0x02da,
+	0x00b8, 0x02dd, 0x02db, 0x02c7,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */
+	0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */
+	0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */
+	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */
+	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */
+	0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */
+	0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */
+	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */
+	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */
+	0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */
+	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */
+	0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */
+	0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page25[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char pagef8[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */
+};
+
+static const unsigned char pagefb[256] = {
+	0x00, 0xde, 0xdf, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, page03, NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, page22, NULL,   NULL,   page25, NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	pagef8, NULL,   NULL,   pagefb, NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "macroman",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_macroman(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_macroman(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_macroman)
+module_exit(exit_nls_macroman)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-romanian.c b/kernel/fs/nls/mac-romanian.c
new file mode 100644
index 000000000..25547f023
--- /dev/null
+++ b/kernel/fs/nls/mac-romanian.c
@@ -0,0 +1,601 @@
+/*
+ * linux/fs/nls/mac-romanian.c
+ *
+ * Charset macromanian translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x00c5, 0x00c7, 0x00c9,
+	0x00d1, 0x00d6, 0x00dc, 0x00e1,
+	0x00e0, 0x00e2, 0x00e4, 0x00e3,
+	0x00e5, 0x00e7, 0x00e9, 0x00e8,
+	/* 0x90 */
+	0x00ea, 0x00eb, 0x00ed, 0x00ec,
+	0x00ee, 0x00ef, 0x00f1, 0x00f3,
+	0x00f2, 0x00f4, 0x00f6, 0x00f5,
+	0x00fa, 0x00f9, 0x00fb, 0x00fc,
+	/* 0xa0 */
+	0x2020, 0x00b0, 0x00a2, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x00df,
+	0x00ae, 0x00a9, 0x2122, 0x00b4,
+	0x00a8, 0x2260, 0x0102, 0x0218,
+	/* 0xb0 */
+	0x221e, 0x00b1, 0x2264, 0x2265,
+	0x00a5, 0x00b5, 0x2202, 0x2211,
+	0x220f, 0x03c0, 0x222b, 0x00aa,
+	0x00ba, 0x03a9, 0x0103, 0x0219,
+	/* 0xc0 */
+	0x00bf, 0x00a1, 0x00ac, 0x221a,
+	0x0192, 0x2248, 0x2206, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x00c0,
+	0x00c3, 0x00d5, 0x0152, 0x0153,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x25ca,
+	0x00ff, 0x0178, 0x2044, 0x20ac,
+	0x2039, 0x203a, 0x021a, 0x021b,
+	/* 0xe0 */
+	0x2021, 0x00b7, 0x201a, 0x201e,
+	0x2030, 0x00c2, 0x00ca, 0x00c1,
+	0x00cb, 0x00c8, 0x00cd, 0x00ce,
+	0x00cf, 0x00cc, 0x00d3, 0x00d4,
+	/* 0xf0 */
+	0xf8ff, 0x00d2, 0x00da, 0x00db,
+	0x00d9, 0x0131, 0x02c6, 0x02dc,
+	0x00af, 0x02d8, 0x02d9, 0x02da,
+	0x00b8, 0x02dd, 0x02db, 0x02c7,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */
+	0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */
+	0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */
+	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0x00, 0x82, /* 0xc0-0xc7 */
+	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */
+	0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */
+	0x00, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */
+	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0x00, 0x8d, /* 0xe0-0xe7 */
+	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */
+	0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */
+	0x00, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0xae, 0xbe, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xaf, 0xbf, 0xde, 0xdf, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */
+	0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */
+	0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page25[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char pagef8[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, page03, NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, page22, NULL,   NULL,   page25, NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	pagef8, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "macromanian",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_macromanian(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_macromanian(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_macromanian)
+module_exit(exit_nls_macromanian)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/mac-turkish.c b/kernel/fs/nls/mac-turkish.c
new file mode 100644
index 000000000..b5454bc7b
--- /dev/null
+++ b/kernel/fs/nls/mac-turkish.c
@@ -0,0 +1,601 @@
+/*
+ * linux/fs/nls/mac-turkish.c
+ *
+ * Charset macturkish translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of the Unicode data files and any associated documentation (the "Data
+ * Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do
+ * so, provided that (a) the above copyright notice(s) and this permission
+ * notice appear with all copies of the Data Files or Software, (b) both the
+ * above copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written
+ * authorization of the copyright holder.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00 */
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10 */
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20 */
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30 */
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40 */
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50 */
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60 */
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70 */
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80 */
+	0x00c4, 0x00c5, 0x00c7, 0x00c9,
+	0x00d1, 0x00d6, 0x00dc, 0x00e1,
+	0x00e0, 0x00e2, 0x00e4, 0x00e3,
+	0x00e5, 0x00e7, 0x00e9, 0x00e8,
+	/* 0x90 */
+	0x00ea, 0x00eb, 0x00ed, 0x00ec,
+	0x00ee, 0x00ef, 0x00f1, 0x00f3,
+	0x00f2, 0x00f4, 0x00f6, 0x00f5,
+	0x00fa, 0x00f9, 0x00fb, 0x00fc,
+	/* 0xa0 */
+	0x2020, 0x00b0, 0x00a2, 0x00a3,
+	0x00a7, 0x2022, 0x00b6, 0x00df,
+	0x00ae, 0x00a9, 0x2122, 0x00b4,
+	0x00a8, 0x2260, 0x00c6, 0x00d8,
+	/* 0xb0 */
+	0x221e, 0x00b1, 0x2264, 0x2265,
+	0x00a5, 0x00b5, 0x2202, 0x2211,
+	0x220f, 0x03c0, 0x222b, 0x00aa,
+	0x00ba, 0x03a9, 0x00e6, 0x00f8,
+	/* 0xc0 */
+	0x00bf, 0x00a1, 0x00ac, 0x221a,
+	0x0192, 0x2248, 0x2206, 0x00ab,
+	0x00bb, 0x2026, 0x00a0, 0x00c0,
+	0x00c3, 0x00d5, 0x0152, 0x0153,
+	/* 0xd0 */
+	0x2013, 0x2014, 0x201c, 0x201d,
+	0x2018, 0x2019, 0x00f7, 0x25ca,
+	0x00ff, 0x0178, 0x011e, 0x011f,
+	0x0130, 0x0131, 0x015e, 0x015f,
+	/* 0xe0 */
+	0x2021, 0x00b7, 0x201a, 0x201e,
+	0x2030, 0x00c2, 0x00ca, 0x00c1,
+	0x00cb, 0x00c8, 0x00cd, 0x00ce,
+	0x00cf, 0x00cc, 0x00d3, 0x00d4,
+	/* 0xf0 */
+	0xf8ff, 0x00d2, 0x00da, 0x00db,
+	0x00d9, 0xf8a0, 0x02c6, 0x02dc,
+	0x00af, 0x02d8, 0x02d9, 0x02da,
+	0x00b8, 0x02dd, 0x02db, 0x02c7,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */
+	0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */
+	0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */
+	0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */
+	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */
+	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */
+	0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */
+	0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */
+	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */
+	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */
+	0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */
+	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xda, 0xdb, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xdf, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */
+	0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */
+	0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page25[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char pagef8[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, page03, NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	page20, page21, page22, NULL,   NULL,   page25, NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+	pagef8, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "macturkish",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_macturkish(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_macturkish(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_macturkish)
+module_exit(exit_nls_macturkish)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_ascii.c b/kernel/fs/nls/nls_ascii.c
new file mode 100644
index 000000000..a2620650d
--- /dev/null
+++ b/kernel/fs/nls/nls_ascii.c
@@ -0,0 +1,166 @@
+/*
+ * linux/fs/nls/nls_ascii.c
+ *
+ * Charset ascii translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00,
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "ascii",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_ascii(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_ascii(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_ascii)
+module_exit(exit_nls_ascii)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_base.c b/kernel/fs/nls/nls_base.c
new file mode 100644
index 000000000..52ccd34b1
--- /dev/null
+++ b/kernel/fs/nls/nls_base.c
@@ -0,0 +1,548 @@
+/*
+ * linux/fs/nls/nls_base.c
+ *
+ * Native language support--charsets and unicode translations.
+ * By Gordon Chaffee 1996, 1997
+ *
+ * Unicode based case conversion 1999 by Wolfram Pienkoss
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/kmod.h>
+#include <linux/spinlock.h>
+#include <asm/byteorder.h>
+
+static struct nls_table default_table;
+static struct nls_table *tables = &default_table;
+static DEFINE_SPINLOCK(nls_lock);
+
+/*
+ * Sample implementation from Unicode home page.
+ * http://www.stonehand.com/unicode/standard/fss-utf.html
+ */
+struct utf8_table {
+	int     cmask;
+	int     cval;
+	int     shift;
+	long    lmask;
+	long    lval;
+};
+
+static const struct utf8_table utf8_table[] =
+{
+    {0x80,  0x00,   0*6,    0x7F,           0,         /* 1 byte sequence */},
+    {0xE0,  0xC0,   1*6,    0x7FF,          0x80,      /* 2 byte sequence */},
+    {0xF0,  0xE0,   2*6,    0xFFFF,         0x800,     /* 3 byte sequence */},
+    {0xF8,  0xF0,   3*6,    0x1FFFFF,       0x10000,   /* 4 byte sequence */},
+    {0xFC,  0xF8,   4*6,    0x3FFFFFF,      0x200000,  /* 5 byte sequence */},
+    {0xFE,  0xFC,   5*6,    0x7FFFFFFF,     0x4000000, /* 6 byte sequence */},
+    {0,						       /* end of table    */}
+};
+
+#define UNICODE_MAX	0x0010ffff
+#define PLANE_SIZE	0x00010000
+
+#define SURROGATE_MASK	0xfffff800
+#define SURROGATE_PAIR	0x0000d800
+#define SURROGATE_LOW	0x00000400
+#define SURROGATE_BITS	0x000003ff
+
+int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu)
+{
+	unsigned long l;
+	int c0, c, nc;
+	const struct utf8_table *t;
+  
+	nc = 0;
+	c0 = *s;
+	l = c0;
+	for (t = utf8_table; t->cmask; t++) {
+		nc++;
+		if ((c0 & t->cmask) == t->cval) {
+			l &= t->lmask;
+			if (l < t->lval || l > UNICODE_MAX ||
+					(l & SURROGATE_MASK) == SURROGATE_PAIR)
+				return -1;
+			*pu = (unicode_t) l;
+			return nc;
+		}
+		if (inlen <= nc)
+			return -1;
+		s++;
+		c = (*s ^ 0x80) & 0xFF;
+		if (c & 0xC0)
+			return -1;
+		l = (l << 6) | c;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(utf8_to_utf32);
+
+int utf32_to_utf8(unicode_t u, u8 *s, int maxout)
+{
+	unsigned long l;
+	int c, nc;
+	const struct utf8_table *t;
+
+	if (!s)
+		return 0;
+
+	l = u;
+	if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
+		return -1;
+
+	nc = 0;
+	for (t = utf8_table; t->cmask && maxout; t++, maxout--) {
+		nc++;
+		if (l <= t->lmask) {
+			c = t->shift;
+			*s = (u8) (t->cval | (l >> c));
+			while (c > 0) {
+				c -= 6;
+				s++;
+				*s = (u8) (0x80 | ((l >> c) & 0x3F));
+			}
+			return nc;
+		}
+	}
+	return -1;
+}
+EXPORT_SYMBOL(utf32_to_utf8);
+
+static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian)
+{
+	switch (endian) {
+	default:
+		*s = (wchar_t) c;
+		break;
+	case UTF16_LITTLE_ENDIAN:
+		*s = __cpu_to_le16(c);
+		break;
+	case UTF16_BIG_ENDIAN:
+		*s = __cpu_to_be16(c);
+		break;
+	}
+}
+
+int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian,
+		wchar_t *pwcs, int maxout)
+{
+	u16 *op;
+	int size;
+	unicode_t u;
+
+	op = pwcs;
+	while (inlen > 0 && maxout > 0 && *s) {
+		if (*s & 0x80) {
+			size = utf8_to_utf32(s, inlen, &u);
+			if (size < 0)
+				return -EINVAL;
+			s += size;
+			inlen -= size;
+
+			if (u >= PLANE_SIZE) {
+				if (maxout < 2)
+					break;
+				u -= PLANE_SIZE;
+				put_utf16(op++, SURROGATE_PAIR |
+						((u >> 10) & SURROGATE_BITS),
+						endian);
+				put_utf16(op++, SURROGATE_PAIR |
+						SURROGATE_LOW |
+						(u & SURROGATE_BITS),
+						endian);
+				maxout -= 2;
+			} else {
+				put_utf16(op++, u, endian);
+				maxout--;
+			}
+		} else {
+			put_utf16(op++, *s++, endian);
+			inlen--;
+			maxout--;
+		}
+	}
+	return op - pwcs;
+}
+EXPORT_SYMBOL(utf8s_to_utf16s);
+
+static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
+{
+	switch (endian) {
+	default:
+		return c;
+	case UTF16_LITTLE_ENDIAN:
+		return __le16_to_cpu(c);
+	case UTF16_BIG_ENDIAN:
+		return __be16_to_cpu(c);
+	}
+}
+
+int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian,
+		u8 *s, int maxout)
+{
+	u8 *op;
+	int size;
+	unsigned long u, v;
+
+	op = s;
+	while (inlen > 0 && maxout > 0) {
+		u = get_utf16(*pwcs, endian);
+		if (!u)
+			break;
+		pwcs++;
+		inlen--;
+		if (u > 0x7f) {
+			if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
+				if (u & SURROGATE_LOW) {
+					/* Ignore character and move on */
+					continue;
+				}
+				if (inlen <= 0)
+					break;
+				v = get_utf16(*pwcs, endian);
+				if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
+						!(v & SURROGATE_LOW)) {
+					/* Ignore character and move on */
+					continue;
+				}
+				u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+						+ (v & SURROGATE_BITS);
+				pwcs++;
+				inlen--;
+			}
+			size = utf32_to_utf8(u, op, maxout);
+			if (size == -1) {
+				/* Ignore character and move on */
+			} else {
+				op += size;
+				maxout -= size;
+			}
+		} else {
+			*op++ = (u8) u;
+			maxout--;
+		}
+	}
+	return op - s;
+}
+EXPORT_SYMBOL(utf16s_to_utf8s);
+
+int __register_nls(struct nls_table *nls, struct module *owner)
+{
+	struct nls_table ** tmp = &tables;
+
+	if (nls->next)
+		return -EBUSY;
+
+	nls->owner = owner;
+	spin_lock(&nls_lock);
+	while (*tmp) {
+		if (nls == *tmp) {
+			spin_unlock(&nls_lock);
+			return -EBUSY;
+		}
+		tmp = &(*tmp)->next;
+	}
+	nls->next = tables;
+	tables = nls;
+	spin_unlock(&nls_lock);
+	return 0;	
+}
+EXPORT_SYMBOL(__register_nls);
+
+int unregister_nls(struct nls_table * nls)
+{
+	struct nls_table ** tmp = &tables;
+
+	spin_lock(&nls_lock);
+	while (*tmp) {
+		if (nls == *tmp) {
+			*tmp = nls->next;
+			spin_unlock(&nls_lock);
+			return 0;
+		}
+		tmp = &(*tmp)->next;
+	}
+	spin_unlock(&nls_lock);
+	return -EINVAL;
+}
+
+static struct nls_table *find_nls(char *charset)
+{
+	struct nls_table *nls;
+	spin_lock(&nls_lock);
+	for (nls = tables; nls; nls = nls->next) {
+		if (!strcmp(nls->charset, charset))
+			break;
+		if (nls->alias && !strcmp(nls->alias, charset))
+			break;
+	}
+	if (nls && !try_module_get(nls->owner))
+		nls = NULL;
+	spin_unlock(&nls_lock);
+	return nls;
+}
+
+struct nls_table *load_nls(char *charset)
+{
+	return try_then_request_module(find_nls(charset), "nls_%s", charset);
+}
+
+void unload_nls(struct nls_table *nls)
+{
+	if (nls)
+		module_put(nls->owner);
+}
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x00a1, 0x00a2, 0x00a3,
+	0x00a4, 0x00a5, 0x00a6, 0x00a7,
+	0x00a8, 0x00a9, 0x00aa, 0x00ab,
+	0x00ac, 0x00ad, 0x00ae, 0x00af,
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x00b2, 0x00b3,
+	0x00b4, 0x00b5, 0x00b6, 0x00b7,
+	0x00b8, 0x00b9, 0x00ba, 0x00bb,
+	0x00bc, 0x00bd, 0x00be, 0x00bf,
+	/* 0xc0*/
+	0x00c0, 0x00c1, 0x00c2, 0x00c3,
+	0x00c4, 0x00c5, 0x00c6, 0x00c7,
+	0x00c8, 0x00c9, 0x00ca, 0x00cb,
+	0x00cc, 0x00cd, 0x00ce, 0x00cf,
+	/* 0xd0*/
+	0x00d0, 0x00d1, 0x00d2, 0x00d3,
+	0x00d4, 0x00d5, 0x00d6, 0x00d7,
+	0x00d8, 0x00d9, 0x00da, 0x00db,
+	0x00dc, 0x00dd, 0x00de, 0x00df,
+	/* 0xe0*/
+	0x00e0, 0x00e1, 0x00e2, 0x00e3,
+	0x00e4, 0x00e5, 0x00e6, 0x00e7,
+	0x00e8, 0x00e9, 0x00ea, 0x00eb,
+	0x00ec, 0x00ed, 0x00ee, 0x00ef,
+	/* 0xf0*/
+	0x00f0, 0x00f1, 0x00f2, 0x00f3,
+	0x00f4, 0x00f5, 0x00f6, 0x00f7,
+	0x00f8, 0x00f9, 0x00fa, 0x00fb,
+	0x00fc, 0x00fd, 0x00fe, 0x00ff,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table default_table = {
+	.charset	= "default",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+/* Returns a simple default translation table */
+struct nls_table *load_nls_default(void)
+{
+	struct nls_table *default_nls;
+	
+	default_nls = load_nls(CONFIG_NLS_DEFAULT);
+	if (default_nls != NULL)
+		return default_nls;
+	else
+		return &default_table;
+}
+
+EXPORT_SYMBOL(unregister_nls);
+EXPORT_SYMBOL(unload_nls);
+EXPORT_SYMBOL(load_nls);
+EXPORT_SYMBOL(load_nls_default);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp1250.c b/kernel/fs/nls/nls_cp1250.c
new file mode 100644
index 000000000..ace3e19d3
--- /dev/null
+++ b/kernel/fs/nls/nls_cp1250.c
@@ -0,0 +1,346 @@
+/*
+ * linux/fs/nls/nls_cp1250.c
+ *
+ * Charset cp1250 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003, 
+	0x0004, 0x0005, 0x0006, 0x0007, 
+	0x0008, 0x0009, 0x000a, 0x000b, 
+	0x000c, 0x000d, 0x000e, 0x000f, 
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013, 
+	0x0014, 0x0015, 0x0016, 0x0017, 
+	0x0018, 0x0019, 0x001a, 0x001b, 
+	0x001c, 0x001d, 0x001e, 0x001f, 
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023, 
+	0x0024, 0x0025, 0x0026, 0x0027, 
+	0x0028, 0x0029, 0x002a, 0x002b, 
+	0x002c, 0x002d, 0x002e, 0x002f, 
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033, 
+	0x0034, 0x0035, 0x0036, 0x0037, 
+	0x0038, 0x0039, 0x003a, 0x003b, 
+	0x003c, 0x003d, 0x003e, 0x003f, 
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043, 
+	0x0044, 0x0045, 0x0046, 0x0047, 
+	0x0048, 0x0049, 0x004a, 0x004b, 
+	0x004c, 0x004d, 0x004e, 0x004f, 
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053, 
+	0x0054, 0x0055, 0x0056, 0x0057, 
+	0x0058, 0x0059, 0x005a, 0x005b, 
+	0x005c, 0x005d, 0x005e, 0x005f, 
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063, 
+	0x0064, 0x0065, 0x0066, 0x0067, 
+	0x0068, 0x0069, 0x006a, 0x006b, 
+	0x006c, 0x006d, 0x006e, 0x006f, 
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073, 
+	0x0074, 0x0075, 0x0076, 0x0077, 
+	0x0078, 0x0079, 0x007a, 0x007b, 
+	0x007c, 0x007d, 0x007e, 0x007f, 
+	/* 0x80*/
+	0x20ac, 0x0000, 0x201a, 0x0000, 
+	0x201e, 0x2026, 0x2020, 0x2021, 
+	0x0000, 0x2030, 0x0160, 0x2039, 
+	0x015a, 0x0164, 0x017d, 0x0179, 
+	/* 0x90*/
+	0x0000, 0x2018, 0x2019, 0x201c, 
+	0x201d, 0x2022, 0x2013, 0x2014, 
+	0x0000, 0x2122, 0x0161, 0x203a, 
+	0x015b, 0x0165, 0x017e, 0x017a, 
+	/* 0xa0*/
+	0x00a0, 0x02c7, 0x02d8, 0x0141, 
+	0x00a4, 0x0104, 0x00a6, 0x00a7, 
+	0x00a8, 0x00a9, 0x015e, 0x00ab, 
+	0x00ac, 0x00ad, 0x00ae, 0x017b, 
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x02db, 0x0142, 
+	0x00b4, 0x00b5, 0x00b6, 0x00b7, 
+	0x00b8, 0x0105, 0x015f, 0x00bb, 
+	0x013d, 0x02dd, 0x013e, 0x017c, 
+	/* 0xc0*/
+	0x0154, 0x00c1, 0x00c2, 0x0102, 
+	0x00c4, 0x0139, 0x0106, 0x00c7, 
+	0x010c, 0x00c9, 0x0118, 0x00cb, 
+	0x011a, 0x00cd, 0x00ce, 0x010e, 
+	/* 0xd0*/
+	0x0110, 0x0143, 0x0147, 0x00d3, 
+	0x00d4, 0x0150, 0x00d6, 0x00d7, 
+	0x0158, 0x016e, 0x00da, 0x0170, 
+	0x00dc, 0x00dd, 0x0162, 0x00df, 
+	/* 0xe0*/
+	0x0155, 0x00e1, 0x00e2, 0x0103, 
+	0x00e4, 0x013a, 0x0107, 0x00e7, 
+	0x010d, 0x00e9, 0x0119, 0x00eb, 
+	0x011b, 0x00ed, 0x00ee, 0x010f, 
+	/* 0xf0*/
+	0x0111, 0x0144, 0x0148, 0x00f3, 
+	0x00f4, 0x0151, 0x00f6, 0x00f7, 
+	0x0159, 0x016f, 0x00fa, 0x0171, 
+	0x00fc, 0x00fd, 0x0163, 0x02d9, 
+	};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */
+	0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */
+	0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */
+	};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0xc3, 0xe3, 0xa5, 0xb9, 0xc6, 0xe6, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */
+	0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0xc5, 0xe5, 0x00, 0x00, 0xbc, 0xbe, 0x00, /* 0x38-0x3f */
+	0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */
+	0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */
+	0xd8, 0xf8, 0x8c, 0x9c, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */
+	0x8a, 0x9a, 0xde, 0xfe, 0x8d, 0x9d, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */
+	0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x8f, 0x9f, 0xaf, 0xbf, 0x8e, 0x9e, 0x00, /* 0x78-0x7f */
+
+	};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */
+	};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */
+	0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00,	page01,	page02,	NULL,	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+	page20,	page21,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+	};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x00, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */
+	0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xb9, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbe, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+	};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x00, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x00, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xa5, 0xaa, 0xbb, 0xbc, 0xbd, 0xbc, 0xaf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */
+	};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+        const unsigned char *uni2charset;
+        unsigned char cl = uni & 0x00ff;
+        unsigned char ch = (uni & 0xff00) >> 8;
+
+        if (boundlen <= 0)
+                return -ENAMETOOLONG;
+
+        uni2charset = page_uni2charset[ch];
+        if (uni2charset && uni2charset[cl])
+                out[0] = uni2charset[cl];
+        else
+                return -EINVAL;
+        return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+        *uni = charset2uni[*rawstring];
+        if (*uni == 0x0000)
+                return -EINVAL;
+        return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp1250",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp1250(void)
+{
+        return register_nls(&table);
+}
+static void __exit exit_nls_cp1250(void)
+{
+        unregister_nls(&table);
+}
+
+module_init(init_nls_cp1250)
+module_exit(exit_nls_cp1250)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp1251.c b/kernel/fs/nls/nls_cp1251.c
new file mode 100644
index 000000000..9273ddfd0
--- /dev/null
+++ b/kernel/fs/nls/nls_cp1251.c
@@ -0,0 +1,301 @@
+/*
+ * linux/fs/nls/nls_cp1251.c
+ *
+ * Charset cp1251 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003, 
+	0x0004, 0x0005, 0x0006, 0x0007, 
+	0x0008, 0x0009, 0x000a, 0x000b, 
+	0x000c, 0x000d, 0x000e, 0x000f, 
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013, 
+	0x0014, 0x0015, 0x0016, 0x0017, 
+	0x0018, 0x0019, 0x001a, 0x001b, 
+	0x001c, 0x001d, 0x001e, 0x001f, 
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023, 
+	0x0024, 0x0025, 0x0026, 0x0027, 
+	0x0028, 0x0029, 0x002a, 0x002b, 
+	0x002c, 0x002d, 0x002e, 0x002f, 
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033, 
+	0x0034, 0x0035, 0x0036, 0x0037, 
+	0x0038, 0x0039, 0x003a, 0x003b, 
+	0x003c, 0x003d, 0x003e, 0x003f, 
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043, 
+	0x0044, 0x0045, 0x0046, 0x0047, 
+	0x0048, 0x0049, 0x004a, 0x004b, 
+	0x004c, 0x004d, 0x004e, 0x004f, 
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053, 
+	0x0054, 0x0055, 0x0056, 0x0057, 
+	0x0058, 0x0059, 0x005a, 0x005b, 
+	0x005c, 0x005d, 0x005e, 0x005f, 
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063, 
+	0x0064, 0x0065, 0x0066, 0x0067, 
+	0x0068, 0x0069, 0x006a, 0x006b, 
+	0x006c, 0x006d, 0x006e, 0x006f, 
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073, 
+	0x0074, 0x0075, 0x0076, 0x0077, 
+	0x0078, 0x0079, 0x007a, 0x007b, 
+	0x007c, 0x007d, 0x007e, 0x007f, 
+	/* 0x80*/
+	0x0402, 0x0403, 0x201a, 0x0453, 
+	0x201e, 0x2026, 0x2020, 0x2021, 
+	0x20ac, 0x2030, 0x0409, 0x2039, 
+	0x040a, 0x040c, 0x040b, 0x040f, 
+	/* 0x90*/
+	0x0452, 0x2018, 0x2019, 0x201c, 
+	0x201d, 0x2022, 0x2013, 0x2014, 
+	0x0000, 0x2122, 0x0459, 0x203a, 
+	0x045a, 0x045c, 0x045b, 0x045f, 
+	/* 0xa0*/
+	0x00a0, 0x040e, 0x045e, 0x0408, 
+	0x00a4, 0x0490, 0x00a6, 0x00a7, 
+	0x0401, 0x00a9, 0x0404, 0x00ab, 
+	0x00ac, 0x00ad, 0x00ae, 0x0407, 
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x0406, 0x0456, 
+	0x0491, 0x00b5, 0x00b6, 0x00b7, 
+	0x0451, 0x2116, 0x0454, 0x00bb, 
+	0x0458, 0x0405, 0x0455, 0x0457, 
+	/* 0xc0*/
+	0x0410, 0x0411, 0x0412, 0x0413, 
+	0x0414, 0x0415, 0x0416, 0x0417, 
+	0x0418, 0x0419, 0x041a, 0x041b, 
+	0x041c, 0x041d, 0x041e, 0x041f, 
+	/* 0xd0*/
+	0x0420, 0x0421, 0x0422, 0x0423, 
+	0x0424, 0x0425, 0x0426, 0x0427, 
+	0x0428, 0x0429, 0x042a, 0x042b, 
+	0x042c, 0x042d, 0x042e, 0x042f, 
+	/* 0xe0*/
+	0x0430, 0x0431, 0x0432, 0x0433, 
+	0x0434, 0x0435, 0x0436, 0x0437, 
+	0x0438, 0x0439, 0x043a, 0x043b, 
+	0x043c, 0x043d, 0x043e, 0x043f, 
+	/* 0xf0*/
+	0x0440, 0x0441, 0x0442, 0x0443, 
+	0x0444, 0x0445, 0x0446, 0x0447, 
+	0x0448, 0x0449, 0x044a, 0x044b, 
+	0x044c, 0x044d, 0x044e, 0x044f, 
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page04[256] = {
+	0x00, 0xa8, 0x80, 0x81, 0xaa, 0xbd, 0xb2, 0xaf, /* 0x00-0x07 */
+	0xa3, 0x8a, 0x8c, 0x8e, 0x8d, 0x00, 0xa1, 0x8f, /* 0x08-0x0f */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x10-0x17 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x18-0x1f */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x20-0x27 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x28-0x2f */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0x48-0x4f */
+	0x00, 0xb8, 0x90, 0x83, 0xba, 0xbe, 0xb3, 0xbf, /* 0x50-0x57 */
+	0xbc, 0x9a, 0x9c, 0x9e, 0x9d, 0x00, 0xa2, 0x9f, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0xa5, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */
+	0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   page04, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, page21, NULL,	NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x90, 0x83, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa2, 0xa2, 0xbc, 0xa4, 0xb4, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xb8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb3, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+	0x80, 0x81, 0x82, 0x81, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x80, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa1, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb2, 0xa5, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xa8, 0xb9, 0xaa, 0xbb, 0xa3, 0xbd, 0xbd, 0xaf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp1251",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp1251(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp1251(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp1251)
+module_exit(exit_nls_cp1251)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp1255.c b/kernel/fs/nls/nls_cp1255.c
new file mode 100644
index 000000000..1caf5dfed
--- /dev/null
+++ b/kernel/fs/nls/nls_cp1255.c
@@ -0,0 +1,384 @@
+/*
+ * linux/fs/nls/nls_cp1255.c
+ *
+ * Charset cp1255 translation tables.
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x20ac, 0x0000, 0x201a, 0x0192,
+	0x201e, 0x2026, 0x2020, 0x2021,
+	0x02c6, 0x2030, 0x0000, 0x2039,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	/* 0x90*/
+	0x0000, 0x2018, 0x2019, 0x201c,
+	0x201d, 0x2022, 0x2013, 0x2014,
+	0x02dc, 0x2122, 0x0000, 0x203a,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	/* 0xa0*/
+	0x00a0, 0x00a1, 0x00a2, 0x00a3,
+	0x20aa, 0x00a5, 0x00a6, 0x00a7,
+	0x00a8, 0x00a9, 0x00d7, 0x00ab,
+	0x00ac, 0x00ad, 0x00ae, 0x203e,
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x00b2, 0x00b3,
+	0x00b4, 0x00b5, 0x00b6, 0x00b7,
+	0x00b8, 0x00b9, 0x00f7, 0x00bb,
+	0x00bc, 0x00bd, 0x00be, 0x00bf,
+	/* 0xc0*/
+	0x05b0, 0x05b1, 0x05b2, 0x05b3,
+	0x05b4, 0x05b5, 0x05b6, 0x05b7,
+	0x05b8, 0x05b9, 0x0000, 0x05bb,
+	0x05bc, 0x05bd, 0x05be, 0x05bf,
+	/* 0xd0*/
+	0x05c0, 0x05c1, 0x05c2, 0x05c3,
+	0x05f0, 0x05f1, 0x05f2, 0x05f3,
+	0x05f4, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x2017,
+	/* 0xe0*/
+	0x05d0, 0x05d1, 0x05d2, 0x05d3,
+	0x05d4, 0x05d5, 0x05d6, 0x05d7,
+	0x05d8, 0x05d9, 0x05da, 0x05db,
+	0x05dc, 0x05dd, 0x05de, 0x05df,
+	/* 0xf0*/
+	0x05e0, 0x05e1, 0x05e2, 0x05e3,
+	0x05e4, 0x05e5, 0x05e6, 0x05e7,
+	0x05e8, 0x05e9, 0x05ea, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0x00, 0xa2, 0xa3, 0x00, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xba, /* 0xf0-0xf7 */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+};
+
+static const unsigned char page05[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xb0-0xb7 */
+	0xc8, 0xc9, 0x00, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xb8-0xbf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xd0-0xd7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xd8-0xdf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xe0-0xe7 */
+	0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0xfe, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */
+	0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0xa4, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, NULL,   NULL,   page05, NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, page21, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp1255",
+	.alias		= "iso8859-8",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp1255(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp1255(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp1255)
+module_exit(exit_nls_cp1255)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NLS(iso8859-8);
diff --git a/kernel/fs/nls/nls_cp437.c b/kernel/fs/nls/nls_cp437.c
new file mode 100644
index 000000000..7ddb830da
--- /dev/null
+++ b/kernel/fs/nls/nls_cp437.c
@@ -0,0 +1,387 @@
+/*
+ * linux/fs/nls/nls_cp437.c
+ *
+ * Charset cp437 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00c7, 0x00fc, 0x00e9, 0x00e2,
+	0x00e4, 0x00e0, 0x00e5, 0x00e7,
+	0x00ea, 0x00eb, 0x00e8, 0x00ef,
+	0x00ee, 0x00ec, 0x00c4, 0x00c5,
+	/* 0x90*/
+	0x00c9, 0x00e6, 0x00c6, 0x00f4,
+	0x00f6, 0x00f2, 0x00fb, 0x00f9,
+	0x00ff, 0x00d6, 0x00dc, 0x00a2,
+	0x00a3, 0x00a5, 0x20a7, 0x0192,
+	/* 0xa0*/
+	0x00e1, 0x00ed, 0x00f3, 0x00fa,
+	0x00f1, 0x00d1, 0x00aa, 0x00ba,
+	0x00bf, 0x2310, 0x00ac, 0x00bd,
+	0x00bc, 0x00a1, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x2561, 0x2562, 0x2556,
+	0x2555, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x255c, 0x255b, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x255e, 0x255f,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x2567,
+	/* 0xd0*/
+	0x2568, 0x2564, 0x2565, 0x2559,
+	0x2558, 0x2552, 0x2553, 0x256b,
+	0x256a, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x03b1, 0x00df, 0x0393, 0x03c0,
+	0x03a3, 0x03c3, 0x00b5, 0x03c4,
+	0x03a6, 0x0398, 0x03a9, 0x03b4,
+	0x221e, 0x03c6, 0x03b5, 0x2229,
+	/* 0xf0*/
+	0x2261, 0x00b1, 0x2265, 0x2264,
+	0x2320, 0x2321, 0x00f7, 0x2248,
+	0x00b0, 0x2219, 0x00b7, 0x221a,
+	0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
+	0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
+	0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
+	0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
+	0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
+	0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */
+	0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page23[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+	0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+	0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+	0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   page03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   page22, page23, NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */
+	0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp437",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp437(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp437(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp437)
+module_exit(exit_nls_cp437)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp737.c b/kernel/fs/nls/nls_cp737.c
new file mode 100644
index 000000000..c593f683a
--- /dev/null
+++ b/kernel/fs/nls/nls_cp737.c
@@ -0,0 +1,350 @@
+/*
+ * linux/fs/nls/nls_cp737.c
+ *
+ * Charset cp737 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0391, 0x0392, 0x0393, 0x0394,
+	0x0395, 0x0396, 0x0397, 0x0398,
+	0x0399, 0x039a, 0x039b, 0x039c,
+	0x039d, 0x039e, 0x039f, 0x03a0,
+	/* 0x90*/
+	0x03a1, 0x03a3, 0x03a4, 0x03a5,
+	0x03a6, 0x03a7, 0x03a8, 0x03a9,
+	0x03b1, 0x03b2, 0x03b3, 0x03b4,
+	0x03b5, 0x03b6, 0x03b7, 0x03b8,
+	/* 0xa0*/
+	0x03b9, 0x03ba, 0x03bb, 0x03bc,
+	0x03bd, 0x03be, 0x03bf, 0x03c0,
+	0x03c1, 0x03c3, 0x03c2, 0x03c4,
+	0x03c5, 0x03c6, 0x03c7, 0x03c8,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x2561, 0x2562, 0x2556,
+	0x2555, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x255c, 0x255b, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x255e, 0x255f,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x2567,
+	/* 0xd0*/
+	0x2568, 0x2564, 0x2565, 0x2559,
+	0x2558, 0x2552, 0x2553, 0x256b,
+	0x256a, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x03c9, 0x03ac, 0x03ad, 0x03ae,
+	0x03ca, 0x03af, 0x03cc, 0x03cd,
+	0x03cb, 0x03ce, 0x0386, 0x0388,
+	0x0389, 0x038a, 0x038c, 0x038e,
+	/* 0xf0*/
+	0x038f, 0x00b1, 0x2265, 0x2264,
+	0x03aa, 0x03ab, 0x00f7, 0x2248,
+	0x00b0, 0x2219, 0x00b7, 0x221a,
+	0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xea, 0x00, /* 0x80-0x87 */
+	0xeb, 0xec, 0xed, 0x00, 0xee, 0x00, 0xef, 0xf0, /* 0x88-0x8f */
+	0x00, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, /* 0x90-0x97 */
+	0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, /* 0x98-0x9f */
+	0x8f, 0x90, 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, /* 0xa0-0xa7 */
+	0x96, 0x97, 0xf4, 0xf5, 0xe1, 0xe2, 0xe3, 0xe5, /* 0xa8-0xaf */
+	0x00, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, /* 0xb0-0xb7 */
+	0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, /* 0xb8-0xbf */
+	0xa7, 0xa8, 0xaa, 0xa9, 0xab, 0xac, 0xad, 0xae, /* 0xc0-0xc7 */
+	0xaf, 0xe0, 0xe4, 0xe8, 0xe6, 0xe7, 0xe9, 0x00, /* 0xc8-0xcf */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+	0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+	0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+	0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   page03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   page22, NULL,   NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x80-0x87 */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x88-0x8f */
+	0xa8, 0xa9, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xe0, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xe1, 0xe2, 0xe3, 0xe5, 0xe6, 0xe7, /* 0xe8-0xef */
+	0xe9, 0xf1, 0xf2, 0xf3, 0xe4, 0xe8, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x98-0x9f */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa0-0xa7 */
+	0x90, 0x91, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0x97, 0xea, 0xeb, 0xec, 0xf4, 0xed, 0xee, 0xef, /* 0xe0-0xe7 */
+	0xf5, 0xf0, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp737",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp737(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp737(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp737)
+module_exit(exit_nls_cp737)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp775.c b/kernel/fs/nls/nls_cp775.c
new file mode 100644
index 000000000..554c86374
--- /dev/null
+++ b/kernel/fs/nls/nls_cp775.c
@@ -0,0 +1,319 @@
+/*
+ * linux/fs/nls/nls_cp775.c
+ *
+ * Charset cp775 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0106, 0x00fc, 0x00e9, 0x0101,
+	0x00e4, 0x0123, 0x00e5, 0x0107,
+	0x0142, 0x0113, 0x0156, 0x0157,
+	0x012b, 0x0179, 0x00c4, 0x00c5,
+	/* 0x90*/
+	0x00c9, 0x00e6, 0x00c6, 0x014d,
+	0x00f6, 0x0122, 0x00a2, 0x015a,
+	0x015b, 0x00d6, 0x00dc, 0x00f8,
+	0x00a3, 0x00d8, 0x00d7, 0x00a4,
+	/* 0xa0*/
+	0x0100, 0x012a, 0x00f3, 0x017b,
+	0x017c, 0x017a, 0x201d, 0x00a6,
+	0x00a9, 0x00ae, 0x00ac, 0x00bd,
+	0x00bc, 0x0141, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x0104, 0x010c, 0x0118,
+	0x0116, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x012e, 0x0160, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x0172, 0x016a,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x017d,
+	/* 0xd0*/
+	0x0105, 0x010d, 0x0119, 0x0117,
+	0x012f, 0x0161, 0x0173, 0x016b,
+	0x017e, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x00d3, 0x00df, 0x014c, 0x0143,
+	0x00f5, 0x00d5, 0x00b5, 0x0144,
+	0x0136, 0x0137, 0x013b, 0x013c,
+	0x0146, 0x0112, 0x0145, 0x2019,
+	/* 0xf0*/
+	0x00ad, 0x00b1, 0x201c, 0x00be,
+	0x00b6, 0x00a7, 0x00f7, 0x201e,
+	0x00b0, 0x2219, 0x00b7, 0x00b9,
+	0x00b3, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0x00, 0x96, 0x9c, 0x9f, 0x00, 0xa7, 0xf5, /* 0xa0-0xa7 */
+	0x00, 0xa8, 0x00, 0xae, 0xaa, 0xf0, 0xa9, 0x00, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0xfc, 0x00, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */
+	0x00, 0xfb, 0x00, 0xaf, 0xac, 0xab, 0xf3, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0xe0, 0x00, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */
+	0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x84, 0x86, 0x91, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0xa2, 0x00, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */
+	0x9b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0xa0, 0x83, 0x00, 0x00, 0xb5, 0xd0, 0x80, 0x87, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xb6, 0xd1, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0xed, 0x89, 0x00, 0x00, 0xb8, 0xd3, /* 0x10-0x17 */
+	0xb7, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x95, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0xa1, 0x8c, 0x00, 0x00, 0xbd, 0xd4, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xe9, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0xea, 0xeb, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0xad, 0x88, 0xe3, 0xe7, 0xee, 0xec, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xe2, 0x93, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x8b, /* 0x50-0x57 */
+	0x00, 0x00, 0x97, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xbe, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0xc7, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x8d, 0xa5, 0xa3, 0xa4, 0xcf, 0xd8, 0x00, /* 0x78-0x7f */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xef, 0x00, 0x00, 0xf2, 0xa6, 0xf7, 0x00, /* 0x18-0x1f */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */
+	0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */
+	0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */
+	0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   page22, NULL,   NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xa5, 0x84, 0x86, /* 0x88-0x8f */
+	0x82, 0x91, 0x91, 0x93, 0x94, 0x85, 0x96, 0x98, /* 0x90-0x97 */
+	0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */
+	0x83, 0x8c, 0xa2, 0xa4, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x88, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xd0, 0xd1, 0xd2, /* 0xb0-0xb7 */
+	0xd3, 0xb9, 0xba, 0xbb, 0xbc, 0xd4, 0xd5, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xd6, 0xd7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xd8, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xa2, 0xe1, 0x93, 0xe7, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe9, 0xe9, 0xeb, 0xeb, 0xec, 0x89, 0xec, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0xa0, 0x8e, 0x95, 0x8f, 0x80, /* 0x80-0x87 */
+	0xad, 0xed, 0x8a, 0x8a, 0xa1, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x92, 0x92, 0xe2, 0x99, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xe0, 0xa3, 0xa3, 0x8d, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, 0xc6, 0xc7, /* 0xd0-0xd7 */
+	0xcf, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe3, /* 0xe0-0xe7 */
+	0xe8, 0xe8, 0xea, 0xea, 0xee, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp775",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp775(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp775(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp775)
+module_exit(exit_nls_cp775)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp850.c b/kernel/fs/nls/nls_cp850.c
new file mode 100644
index 000000000..56cccd14b
--- /dev/null
+++ b/kernel/fs/nls/nls_cp850.c
@@ -0,0 +1,315 @@
+/*
+ * linux/fs/nls/nls_cp850.c
+ *
+ * Charset cp850 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00c7, 0x00fc, 0x00e9, 0x00e2,
+	0x00e4, 0x00e0, 0x00e5, 0x00e7,
+	0x00ea, 0x00eb, 0x00e8, 0x00ef,
+	0x00ee, 0x00ec, 0x00c4, 0x00c5,
+	/* 0x90*/
+	0x00c9, 0x00e6, 0x00c6, 0x00f4,
+	0x00f6, 0x00f2, 0x00fb, 0x00f9,
+	0x00ff, 0x00d6, 0x00dc, 0x00f8,
+	0x00a3, 0x00d8, 0x00d7, 0x0192,
+	/* 0xa0*/
+	0x00e1, 0x00ed, 0x00f3, 0x00fa,
+	0x00f1, 0x00d1, 0x00aa, 0x00ba,
+	0x00bf, 0x00ae, 0x00ac, 0x00bd,
+	0x00bc, 0x00a1, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x00c1, 0x00c2, 0x00c0,
+	0x00a9, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x00a2, 0x00a5, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x00e3, 0x00c3,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x00a4,
+	/* 0xd0*/
+	0x00f0, 0x00d0, 0x00ca, 0x00cb,
+	0x00c8, 0x0131, 0x00cd, 0x00ce,
+	0x00cf, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x00a6, 0x00cc, 0x2580,
+	/* 0xe0*/
+	0x00d3, 0x00df, 0x00d4, 0x00d2,
+	0x00f5, 0x00d5, 0x00b5, 0x00fe,
+	0x00de, 0x00da, 0x00db, 0x00d9,
+	0x00fd, 0x00dd, 0x00af, 0x00b4,
+	/* 0xf0*/
+	0x00ad, 0x00b1, 0x2017, 0x00be,
+	0x00b6, 0x00a7, 0x00f7, 0x00b8,
+	0x00b0, 0x00a8, 0x00b7, 0x00b9,
+	0x00b3, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */
+	0xf9, 0xb8, 0xa6, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */
+	0xf7, 0xfb, 0xa7, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */
+	0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
+	0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */
+	0xd1, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */
+	0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0xed, 0xe8, 0xe1, /* 0xd8-0xdf */
+	0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
+	0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
+	0xd0, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */
+	0x9b, 0x97, 0xa3, 0x96, 0x81, 0xec, 0xe7, 0x98, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, /* 0x10-0x17 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */
+	0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */
+	0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */
+	0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   NULL,   NULL,   NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */
+	0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd0, 0x88, 0x89, 0x8a, 0xd5, 0xa1, 0x8c, /* 0xd0-0xd7 */
+	0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0x8d, 0xdf, /* 0xd8-0xdf */
+	0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe7, 0xa3, 0x96, 0x97, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */
+	0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0xde, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */
+	0x00, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
+	0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd1, 0xd1, 0xd2, 0xd3, 0xd4, 0x49, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe8, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xed, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp850",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp850(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp850(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp850)
+module_exit(exit_nls_cp850)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp852.c b/kernel/fs/nls/nls_cp852.c
new file mode 100644
index 000000000..7cdc05ac1
--- /dev/null
+++ b/kernel/fs/nls/nls_cp852.c
@@ -0,0 +1,337 @@
+/*
+ * linux/fs/nls/nls_cp852.c
+ *
+ * Charset cp852 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00c7, 0x00fc, 0x00e9, 0x00e2,
+	0x00e4, 0x016f, 0x0107, 0x00e7,
+	0x0142, 0x00eb, 0x0150, 0x0151,
+	0x00ee, 0x0179, 0x00c4, 0x0106,
+	/* 0x90*/
+	0x00c9, 0x0139, 0x013a, 0x00f4,
+	0x00f6, 0x013d, 0x013e, 0x015a,
+	0x015b, 0x00d6, 0x00dc, 0x0164,
+	0x0165, 0x0141, 0x00d7, 0x010d,
+	/* 0xa0*/
+	0x00e1, 0x00ed, 0x00f3, 0x00fa,
+	0x0104, 0x0105, 0x017d, 0x017e,
+	0x0118, 0x0119, 0x00ac, 0x017a,
+	0x010c, 0x015f, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x00c1, 0x00c2, 0x011a,
+	0x015e, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x017b, 0x017c, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x0102, 0x0103,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x00a4,
+	/* 0xd0*/
+	0x0111, 0x0110, 0x010e, 0x00cb,
+	0x010f, 0x0147, 0x00cd, 0x00ce,
+	0x011b, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x0162, 0x016e, 0x2580,
+	/* 0xe0*/
+	0x00d3, 0x00df, 0x00d4, 0x0143,
+	0x0144, 0x0148, 0x0160, 0x0161,
+	0x0154, 0x00da, 0x0155, 0x0170,
+	0x00fd, 0x00dd, 0x0163, 0x00b4,
+	/* 0xf0*/
+	0x00ad, 0x02dd, 0x02db, 0x02c7,
+	0x02d8, 0x00a7, 0x00f7, 0x00b8,
+	0x00b0, 0x00a8, 0x02d9, 0x0171,
+	0x0158, 0x0159, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xf5, /* 0xa0-0xa7 */
+	0xf9, 0x00, 0x00, 0xae, 0xaa, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0xf7, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0xb5, 0xb6, 0x00, 0x8e, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */
+	0x00, 0x90, 0x00, 0xd3, 0x00, 0xd6, 0xd7, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0xe0, 0xe2, 0x00, 0x99, 0x9e, /* 0xd0-0xd7 */
+	0x00, 0x00, 0xe9, 0x00, 0x9a, 0xed, 0x00, 0xe1, /* 0xd8-0xdf */
+	0x00, 0xa0, 0x83, 0x00, 0x84, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */
+	0x00, 0x82, 0x00, 0x89, 0x00, 0xa1, 0x8c, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */
+	0x00, 0x00, 0xa3, 0x00, 0x81, 0xec, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0xc6, 0xc7, 0xa4, 0xa5, 0x8f, 0x86, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xac, 0x9f, 0xd2, 0xd4, /* 0x08-0x0f */
+	0xd1, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xa8, 0xa9, 0xb7, 0xd8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x00, /* 0x38-0x3f */
+	0x00, 0x9d, 0x88, 0xe3, 0xe4, 0x00, 0x00, 0xd5, /* 0x40-0x47 */
+	0xe5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x8a, 0x8b, 0x00, 0x00, 0xe8, 0xea, 0x00, 0x00, /* 0x50-0x57 */
+	0xfc, 0xfd, 0x97, 0x98, 0x00, 0x00, 0xb8, 0xad, /* 0x58-0x5f */
+	0xe6, 0xe7, 0xdd, 0xee, 0x9b, 0x9c, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x85, /* 0x68-0x6f */
+	0xeb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x8d, 0xab, 0xbd, 0xbe, 0xa6, 0xa7, 0x00, /* 0x78-0x7f */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf3, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0xf4, 0xfa, 0x00, 0xf2, 0x00, 0xf1, 0x00, 0x00, /* 0xd8-0xdf */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */
+	0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */
+	0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */
+	0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xab, 0x84, 0x86, /* 0x88-0x8f */
+	0x82, 0x92, 0x92, 0x93, 0x94, 0x96, 0x96, 0x98, /* 0x90-0x97 */
+	0x98, 0x94, 0x81, 0x9c, 0x9c, 0x88, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */
+	0xa9, 0xa9, 0xaa, 0xab, 0x9f, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0xd8, /* 0xb0-0xb7 */
+	0xad, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd0, 0xd4, 0x89, 0xd4, 0xe5, 0xa1, 0x8c, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xee, 0x85, 0xdf, /* 0xd8-0xdf */
+	0xa2, 0xe1, 0x93, 0xe4, 0xe4, 0xe5, 0xe7, 0xe7, /* 0xe0-0xe7 */
+	0xea, 0xa3, 0xea, 0xfb, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfd, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xde, 0x8f, 0x80, /* 0x80-0x87 */
+	0x9d, 0xd3, 0x8a, 0x8a, 0xd7, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x91, 0xe2, 0x99, 0x95, 0x95, 0x97, /* 0x90-0x97 */
+	0x97, 0x99, 0x9a, 0x9b, 0x9b, 0x9d, 0x9e, 0xac, /* 0x98-0x9f */
+	0xb5, 0xd6, 0xe0, 0xe9, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */
+	0xa8, 0xa8, 0xaa, 0x8d, 0xac, 0xb8, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd1, 0xd1, 0xd2, 0xd3, 0xd2, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xb7, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe3, 0xd5, 0xe6, 0xe6, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xe8, 0xeb, 0xed, 0xed, 0xdd, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xeb, 0xfc, 0xfc, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp852",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp852(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp852(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp852)
+module_exit(exit_nls_cp852)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp855.c b/kernel/fs/nls/nls_cp855.c
new file mode 100644
index 000000000..7426eea05
--- /dev/null
+++ b/kernel/fs/nls/nls_cp855.c
@@ -0,0 +1,299 @@
+/*
+ * linux/fs/nls/nls_cp855.c
+ *
+ * Charset cp855 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0452, 0x0402, 0x0453, 0x0403,
+	0x0451, 0x0401, 0x0454, 0x0404,
+	0x0455, 0x0405, 0x0456, 0x0406,
+	0x0457, 0x0407, 0x0458, 0x0408,
+	/* 0x90*/
+	0x0459, 0x0409, 0x045a, 0x040a,
+	0x045b, 0x040b, 0x045c, 0x040c,
+	0x045e, 0x040e, 0x045f, 0x040f,
+	0x044e, 0x042e, 0x044a, 0x042a,
+	/* 0xa0*/
+	0x0430, 0x0410, 0x0431, 0x0411,
+	0x0446, 0x0426, 0x0434, 0x0414,
+	0x0435, 0x0415, 0x0444, 0x0424,
+	0x0433, 0x0413, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x0445, 0x0425, 0x0438,
+	0x0418, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x0439, 0x0419, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x043a, 0x041a,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x00a4,
+	/* 0xd0*/
+	0x043b, 0x041b, 0x043c, 0x041c,
+	0x043d, 0x041d, 0x043e, 0x041e,
+	0x043f, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x041f, 0x044f, 0x2580,
+	/* 0xe0*/
+	0x042f, 0x0440, 0x0420, 0x0441,
+	0x0421, 0x0442, 0x0422, 0x0443,
+	0x0423, 0x0436, 0x0416, 0x0432,
+	0x0412, 0x044c, 0x042c, 0x2116,
+	/* 0xf0*/
+	0x00ad, 0x044b, 0x042b, 0x0437,
+	0x0417, 0x0448, 0x0428, 0x044d,
+	0x042d, 0x0449, 0x0429, 0x0447,
+	0x0427, 0x00a7, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0xae, 0x00, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+};
+
+static const unsigned char page04[256] = {
+	0x00, 0x85, 0x81, 0x83, 0x87, 0x89, 0x8b, 0x8d, /* 0x00-0x07 */
+	0x8f, 0x91, 0x93, 0x95, 0x97, 0x00, 0x99, 0x9b, /* 0x08-0x0f */
+	0xa1, 0xa3, 0xec, 0xad, 0xa7, 0xa9, 0xea, 0xf4, /* 0x10-0x17 */
+	0xb8, 0xbe, 0xc7, 0xd1, 0xd3, 0xd5, 0xd7, 0xdd, /* 0x18-0x1f */
+	0xe2, 0xe4, 0xe6, 0xe8, 0xab, 0xb6, 0xa5, 0xfc, /* 0x20-0x27 */
+	0xf6, 0xfa, 0x9f, 0xf2, 0xee, 0xf8, 0x9d, 0xe0, /* 0x28-0x2f */
+	0xa0, 0xa2, 0xeb, 0xac, 0xa6, 0xa8, 0xe9, 0xf3, /* 0x30-0x37 */
+	0xb7, 0xbd, 0xc6, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, /* 0x38-0x3f */
+	0xe1, 0xe3, 0xe5, 0xe7, 0xaa, 0xb5, 0xa4, 0xfb, /* 0x40-0x47 */
+	0xf5, 0xf9, 0x9e, 0xf1, 0xed, 0xf7, 0x9c, 0xde, /* 0x48-0x4f */
+	0x00, 0x84, 0x80, 0x82, 0x86, 0x88, 0x8a, 0x8c, /* 0x50-0x57 */
+	0x8e, 0x90, 0x92, 0x94, 0x96, 0x00, 0x98, 0x9a, /* 0x58-0x5f */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0x00, /* 0x10-0x17 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */
+	0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */
+	0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */
+	0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   page04, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   page21, NULL,   NULL,   NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x80, 0x82, 0x82, 0x84, 0x84, 0x86, 0x86, /* 0x80-0x87 */
+	0x88, 0x88, 0x8a, 0x8a, 0x8c, 0x8c, 0x8e, 0x8e, /* 0x88-0x8f */
+	0x90, 0x90, 0x92, 0x92, 0x94, 0x94, 0x96, 0x96, /* 0x90-0x97 */
+	0x98, 0x98, 0x9a, 0x9a, 0x9c, 0x9c, 0x9e, 0x9e, /* 0x98-0x9f */
+	0xa0, 0xa0, 0xa2, 0xa2, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */
+	0xa8, 0xa8, 0xaa, 0xaa, 0xac, 0xac, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb5, 0xb7, /* 0xb0-0xb7 */
+	0xb7, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd0, 0xd2, 0xd2, 0xd4, 0xd4, 0xd6, 0xd6, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xd8, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xde, 0xe1, 0xe1, 0xe3, 0xe3, 0xe5, 0xe5, 0xe7, /* 0xe0-0xe7 */
+	0xe7, 0xe9, 0xe9, 0xeb, 0xeb, 0xed, 0xed, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, /* 0xf0-0xf7 */
+	0xf7, 0xf9, 0xf9, 0xfb, 0xfb, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x81, 0x81, 0x83, 0x83, 0x85, 0x85, 0x87, 0x87, /* 0x80-0x87 */
+	0x89, 0x89, 0x8b, 0x8b, 0x8d, 0x8d, 0x8f, 0x8f, /* 0x88-0x8f */
+	0x91, 0x91, 0x93, 0x93, 0x95, 0x95, 0x97, 0x97, /* 0x90-0x97 */
+	0x99, 0x99, 0x9b, 0x9b, 0x9d, 0x9d, 0x9f, 0x9f, /* 0x98-0x9f */
+	0xa1, 0xa1, 0xa3, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */
+	0xa9, 0xa9, 0xab, 0xab, 0xad, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb6, 0xb8, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd1, 0xd1, 0xd3, 0xd3, 0xd5, 0xd5, 0xd7, 0xd7, /* 0xd0-0xd7 */
+	0xdd, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xe0, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe2, 0xe2, 0xe4, 0xe4, 0xe6, 0xe6, 0xe8, /* 0xe0-0xe7 */
+	0xe8, 0xea, 0xea, 0xec, 0xec, 0xee, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, 0xf8, /* 0xf0-0xf7 */
+	0xf8, 0xfa, 0xfa, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp855",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp855(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp855(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp855)
+module_exit(exit_nls_cp855)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp857.c b/kernel/fs/nls/nls_cp857.c
new file mode 100644
index 000000000..098309733
--- /dev/null
+++ b/kernel/fs/nls/nls_cp857.c
@@ -0,0 +1,301 @@
+/*
+ * linux/fs/nls/nls_cp857.c
+ *
+ * Charset cp857 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00c7, 0x00fc, 0x00e9, 0x00e2,
+	0x00e4, 0x00e0, 0x00e5, 0x00e7,
+	0x00ea, 0x00eb, 0x00e8, 0x00ef,
+	0x00ee, 0x0131, 0x00c4, 0x00c5,
+	/* 0x90*/
+	0x00c9, 0x00e6, 0x00c6, 0x00f4,
+	0x00f6, 0x00f2, 0x00fb, 0x00f9,
+	0x0130, 0x00d6, 0x00dc, 0x00f8,
+	0x00a3, 0x00d8, 0x015e, 0x015f,
+	/* 0xa0*/
+	0x00e1, 0x00ed, 0x00f3, 0x00fa,
+	0x00f1, 0x00d1, 0x011e, 0x011f,
+	0x00bf, 0x00ae, 0x00ac, 0x00bd,
+	0x00bc, 0x00a1, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x00c1, 0x00c2, 0x00c0,
+	0x00a9, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x00a2, 0x00a5, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x00e3, 0x00c3,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x00a4,
+	/* 0xd0*/
+	0x00ba, 0x00aa, 0x00ca, 0x00cb,
+	0x00c8, 0x0000, 0x00cd, 0x00ce,
+	0x00cf, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x00a6, 0x00cc, 0x2580,
+	/* 0xe0*/
+	0x00d3, 0x00df, 0x00d4, 0x00d2,
+	0x00f5, 0x00d5, 0x00b5, 0x0000,
+	0x00d7, 0x00da, 0x00db, 0x00d9,
+	0x00ec, 0x00ff, 0x00af, 0x00b4,
+	/* 0xf0*/
+	0x00ad, 0x00b1, 0x0000, 0x00be,
+	0x00b6, 0x00a7, 0x00f7, 0x00b8,
+	0x00b0, 0x00a8, 0x00b7, 0x00b9,
+	0x00b3, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */
+	0xf9, 0xb8, 0xd1, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */
+	0xf7, 0xfb, 0xd0, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */
+	0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
+	0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */
+	0x00, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0xe8, /* 0xd0-0xd7 */
+	0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
+	0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
+	0x8a, 0x82, 0x88, 0x89, 0xec, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
+	0x00, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */
+	0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0xed, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0xa7, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x98, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, 0x9f, /* 0x58-0x5f */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */
+	0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */
+	0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */
+	0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */
+	0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x69, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9f, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa7, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0x88, 0x89, 0x8a, 0x00, 0xa1, 0x8c, /* 0xd0-0xd7 */
+	0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xec, 0xdf, /* 0xd8-0xdf */
+	0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0x00, /* 0xe0-0xe7 */
+	0xe8, 0xa3, 0x96, 0x97, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */
+	0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0x49, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9e, /* 0x98-0x9f */
+	0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa6, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0x00, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xde, 0x00, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp857",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp857(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp857(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp857)
+module_exit(exit_nls_cp857)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp860.c b/kernel/fs/nls/nls_cp860.c
new file mode 100644
index 000000000..84224478e
--- /dev/null
+++ b/kernel/fs/nls/nls_cp860.c
@@ -0,0 +1,364 @@
+/*
+ * linux/fs/nls/nls_cp860.c
+ *
+ * Charset cp860 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00c7, 0x00fc, 0x00e9, 0x00e2,
+	0x00e3, 0x00e0, 0x00c1, 0x00e7,
+	0x00ea, 0x00ca, 0x00e8, 0x00cd,
+	0x00d4, 0x00ec, 0x00c3, 0x00c2,
+	/* 0x90*/
+	0x00c9, 0x00c0, 0x00c8, 0x00f4,
+	0x00f5, 0x00f2, 0x00da, 0x00f9,
+	0x00cc, 0x00d5, 0x00dc, 0x00a2,
+	0x00a3, 0x00d9, 0x20a7, 0x00d3,
+	/* 0xa0*/
+	0x00e1, 0x00ed, 0x00f3, 0x00fa,
+	0x00f1, 0x00d1, 0x00aa, 0x00ba,
+	0x00bf, 0x00d2, 0x00ac, 0x00bd,
+	0x00bc, 0x00a1, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x2561, 0x2562, 0x2556,
+	0x2555, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x255c, 0x255b, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x255e, 0x255f,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x2567,
+	/* 0xd0*/
+	0x2568, 0x2564, 0x2565, 0x2559,
+	0x2558, 0x2552, 0x2553, 0x256b,
+	0x256a, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x03b1, 0x00df, 0x0393, 0x03c0,
+	0x03a3, 0x03c3, 0x00b5, 0x03c4,
+	0x03a6, 0x0398, 0x03a9, 0x03b4,
+	0x221e, 0x03c6, 0x03b5, 0x2229,
+	/* 0xf0*/
+	0x2261, 0x00b1, 0x2265, 0x2264,
+	0x2320, 0x2321, 0x00f7, 0x2248,
+	0x00b0, 0x2219, 0x00b7, 0x221a,
+	0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0xad, 0x9b, 0x9c, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
+	0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
+	0x91, 0x86, 0x8f, 0x8e, 0x00, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */
+	0x92, 0x90, 0x89, 0x00, 0x98, 0x8b, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0xa5, 0xa9, 0x9f, 0x8c, 0x99, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x9d, 0x96, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
+	0x85, 0xa0, 0x83, 0x84, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */
+	0x8a, 0x82, 0x88, 0x00, 0x8d, 0xa1, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0xa4, 0x95, 0xa2, 0x93, 0x94, 0x00, 0xf6, /* 0xf0-0xf7 */
+	0x00, 0x97, 0xa3, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page23[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+	0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+	0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+	0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   page03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   page22, page23, NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0xa0, 0x87, /* 0x80-0x87 */
+	0x88, 0x88, 0x8a, 0xa1, 0x93, 0x8d, 0x84, 0x83, /* 0x88-0x8f */
+	0x82, 0x85, 0x8a, 0x93, 0x94, 0x95, 0xa3, 0x97, /* 0x90-0x97 */
+	0x8d, 0x94, 0x81, 0x9b, 0x9c, 0x97, 0x9e, 0xa2, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0x95, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0x8f, 0x8e, 0x91, 0x86, 0x80, /* 0x80-0x87 */
+	0x89, 0x89, 0x92, 0x8b, 0x8c, 0x98, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x8c, 0x99, 0xa9, 0x96, 0x9d, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0x86, 0x8b, 0x9f, 0x96, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp860",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp860(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp860(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp860)
+module_exit(exit_nls_cp860)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp861.c b/kernel/fs/nls/nls_cp861.c
new file mode 100644
index 000000000..dc873e4be
--- /dev/null
+++ b/kernel/fs/nls/nls_cp861.c
@@ -0,0 +1,387 @@
+/*
+ * linux/fs/nls/nls_cp861.c
+ *
+ * Charset cp861 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00c7, 0x00fc, 0x00e9, 0x00e2,
+	0x00e4, 0x00e0, 0x00e5, 0x00e7,
+	0x00ea, 0x00eb, 0x00e8, 0x00d0,
+	0x00f0, 0x00de, 0x00c4, 0x00c5,
+	/* 0x90*/
+	0x00c9, 0x00e6, 0x00c6, 0x00f4,
+	0x00f6, 0x00fe, 0x00fb, 0x00dd,
+	0x00fd, 0x00d6, 0x00dc, 0x00f8,
+	0x00a3, 0x00d8, 0x20a7, 0x0192,
+	/* 0xa0*/
+	0x00e1, 0x00ed, 0x00f3, 0x00fa,
+	0x00c1, 0x00cd, 0x00d3, 0x00da,
+	0x00bf, 0x2310, 0x00ac, 0x00bd,
+	0x00bc, 0x00a1, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x2561, 0x2562, 0x2556,
+	0x2555, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x255c, 0x255b, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x255e, 0x255f,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x2567,
+	/* 0xd0*/
+	0x2568, 0x2564, 0x2565, 0x2559,
+	0x2558, 0x2552, 0x2553, 0x256b,
+	0x256a, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x03b1, 0x00df, 0x0393, 0x03c0,
+	0x03a3, 0x03c3, 0x00b5, 0x03c4,
+	0x03a6, 0x0398, 0x03a9, 0x03b4,
+	0x221e, 0x03c6, 0x03b5, 0x2229,
+	/* 0xf0*/
+	0x2261, 0x00b1, 0x2265, 0x2264,
+	0x2320, 0x2321, 0x00f7, 0x2248,
+	0x00b0, 0x2219, 0x00b7, 0x221a,
+	0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0xad, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
+	0x00, 0xa4, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
+	0x00, 0x90, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, /* 0xc8-0xcf */
+	0x8b, 0x00, 0x00, 0xa6, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */
+	0x9d, 0x00, 0xa7, 0x00, 0x9a, 0x97, 0x8d, 0xe1, /* 0xd8-0xdf */
+	0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
+	0x8a, 0x82, 0x88, 0x89, 0x00, 0xa1, 0x00, 0x00, /* 0xe8-0xef */
+	0x8c, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */
+	0x9b, 0x00, 0xa3, 0x96, 0x81, 0x98, 0x95, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page23[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+	0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+	0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+	0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   page03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   page22, page23, NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8c, 0x8c, 0x95, 0x84, 0x86, /* 0x88-0x8f */
+	0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x98, /* 0x90-0x97 */
+	0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa0, 0xa1, 0xa2, 0xa3, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x8b, 0x8b, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x92, 0x92, 0x00, 0x99, 0x8d, 0x00, 0x97, /* 0x90-0x97 */
+	0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
+	0xa4, 0xa5, 0xa6, 0xa7, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp861",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp861(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp861(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp861)
+module_exit(exit_nls_cp861)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp862.c b/kernel/fs/nls/nls_cp862.c
new file mode 100644
index 000000000..d5263e3c5
--- /dev/null
+++ b/kernel/fs/nls/nls_cp862.c
@@ -0,0 +1,421 @@
+/*
+ * linux/fs/nls/nls_cp862.c
+ *
+ * Charset cp862 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x05d0, 0x05d1, 0x05d2, 0x05d3,
+	0x05d4, 0x05d5, 0x05d6, 0x05d7,
+	0x05d8, 0x05d9, 0x05da, 0x05db,
+	0x05dc, 0x05dd, 0x05de, 0x05df,
+	/* 0x90*/
+	0x05e0, 0x05e1, 0x05e2, 0x05e3,
+	0x05e4, 0x05e5, 0x05e6, 0x05e7,
+	0x05e8, 0x05e9, 0x05ea, 0x00a2,
+	0x00a3, 0x00a5, 0x20a7, 0x0192,
+	/* 0xa0*/
+	0x00e1, 0x00ed, 0x00f3, 0x00fa,
+	0x00f1, 0x00d1, 0x00aa, 0x00ba,
+	0x00bf, 0x2310, 0x00ac, 0x00bd,
+	0x00bc, 0x00a1, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x2561, 0x2562, 0x2556,
+	0x2555, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x255c, 0x255b, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x255e, 0x255f,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x2567,
+	/* 0xd0*/
+	0x2568, 0x2564, 0x2565, 0x2559,
+	0x2558, 0x2552, 0x2553, 0x256b,
+	0x256a, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x03b1, 0x00df, 0x0393, 0x03c0,
+	0x03a3, 0x03c3, 0x00b5, 0x03c4,
+	0x03a6, 0x0398, 0x03a9, 0x03b4,
+	0x221e, 0x03c6, 0x03b5, 0x2229,
+	/* 0xf0*/
+	0x2261, 0x00b1, 0x2265, 0x2264,
+	0x2320, 0x2321, 0x00f7, 0x2248,
+	0x00b0, 0x2219, 0x00b7, 0x221a,
+	0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
+	0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
+	0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0xa4, 0x00, 0xa2, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */
+	0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
+};
+
+static const unsigned char page05[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0xd0-0xd7 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xd8-0xdf */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0xe0-0xe7 */
+	0x98, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page23[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+	0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+	0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+	0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   page03, NULL,   page05, NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   page22, page23, NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp862",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp862(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp862(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp862)
+module_exit(exit_nls_cp862)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp863.c b/kernel/fs/nls/nls_cp863.c
new file mode 100644
index 000000000..051c9832e
--- /dev/null
+++ b/kernel/fs/nls/nls_cp863.c
@@ -0,0 +1,381 @@
+/*
+ * linux/fs/nls/nls_cp863.c
+ *
+ * Charset cp863 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00c7, 0x00fc, 0x00e9, 0x00e2,
+	0x00c2, 0x00e0, 0x00b6, 0x00e7,
+	0x00ea, 0x00eb, 0x00e8, 0x00ef,
+	0x00ee, 0x2017, 0x00c0, 0x00a7,
+	/* 0x90*/
+	0x00c9, 0x00c8, 0x00ca, 0x00f4,
+	0x00cb, 0x00cf, 0x00fb, 0x00f9,
+	0x00a4, 0x00d4, 0x00dc, 0x00a2,
+	0x00a3, 0x00d9, 0x00db, 0x0192,
+	/* 0xa0*/
+	0x00a6, 0x00b4, 0x00f3, 0x00fa,
+	0x00a8, 0x00b8, 0x00b3, 0x00af,
+	0x00ce, 0x2310, 0x00ac, 0x00bd,
+	0x00bc, 0x00be, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x2561, 0x2562, 0x2556,
+	0x2555, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x255c, 0x255b, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x255e, 0x255f,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x2567,
+	/* 0xd0*/
+	0x2568, 0x2564, 0x2565, 0x2559,
+	0x2558, 0x2552, 0x2553, 0x256b,
+	0x256a, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x03b1, 0x00df, 0x0393, 0x03c0,
+	0x03a3, 0x03c3, 0x00b5, 0x03c4,
+	0x03a6, 0x0398, 0x03a9, 0x03b4,
+	0x221e, 0x03c6, 0x03b5, 0x2229,
+	/* 0xf0*/
+	0x2261, 0x00b1, 0x2265, 0x2264,
+	0x2320, 0x2321, 0x00f7, 0x2248,
+	0x00b0, 0x2219, 0x00b7, 0x221a,
+	0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0x00, 0x9b, 0x9c, 0x98, 0x00, 0xa0, 0x8f, /* 0xa0-0xa7 */
+	0xa4, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0xa7, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0xa6, 0xa1, 0xe6, 0x86, 0xfa, /* 0xb0-0xb7 */
+	0xa5, 0x00, 0x00, 0xaf, 0xac, 0xab, 0xad, 0x00, /* 0xb8-0xbf */
+	0x8e, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */
+	0x91, 0x90, 0x92, 0x94, 0x00, 0x00, 0xa8, 0x95, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x9d, 0x00, 0x9e, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
+	0x85, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */
+	0x8a, 0x82, 0x88, 0x89, 0x00, 0x00, 0x8c, 0x8b, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */
+	0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8d, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page23[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+	0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+	0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+	0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   page03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   page22, page23, NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x83, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x85, 0x8f, /* 0x88-0x8f */
+	0x82, 0x8a, 0x88, 0x93, 0x89, 0x8b, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x93, 0x81, 0x9b, 0x9c, 0x97, 0x96, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0x8c, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0x84, 0x84, 0x8e, 0x86, 0x80, /* 0x80-0x87 */
+	0x92, 0x94, 0x91, 0x95, 0xa8, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x99, 0x94, 0x95, 0x9e, 0x9d, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
+	0xa0, 0xa1, 0x00, 0x00, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp863",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp863(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp863(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp863)
+module_exit(exit_nls_cp863)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp864.c b/kernel/fs/nls/nls_cp864.c
new file mode 100644
index 000000000..97eb1273b
--- /dev/null
+++ b/kernel/fs/nls/nls_cp864.c
@@ -0,0 +1,407 @@
+/*
+ * linux/fs/nls/nls_cp864.c
+ *
+ * Charset cp864 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x066a, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00b0, 0x00b7, 0x2219, 0x221a,
+	0x2592, 0x2500, 0x2502, 0x253c,
+	0x2524, 0x252c, 0x251c, 0x2534,
+	0x2510, 0x250c, 0x2514, 0x2518,
+	/* 0x90*/
+	0x03b2, 0x221e, 0x03c6, 0x00b1,
+	0x00bd, 0x00bc, 0x2248, 0x00ab,
+	0x00bb, 0xfef7, 0xfef8, 0x0000,
+	0x0000, 0xfefb, 0xfefc, 0x0000,
+	/* 0xa0*/
+	0x00a0, 0x00ad, 0xfe82, 0x00a3,
+	0x00a4, 0xfe84, 0x0000, 0x0000,
+	0xfe8e, 0xfe8f, 0xfe95, 0xfe99,
+	0x060c, 0xfe9d, 0xfea1, 0xfea5,
+	/* 0xb0*/
+	0x0660, 0x0661, 0x0662, 0x0663,
+	0x0664, 0x0665, 0x0666, 0x0667,
+	0x0668, 0x0669, 0xfed1, 0x061b,
+	0xfeb1, 0xfeb5, 0xfeb9, 0x061f,
+	/* 0xc0*/
+	0x00a2, 0xfe80, 0xfe81, 0xfe83,
+	0xfe85, 0xfeca, 0xfe8b, 0xfe8d,
+	0xfe91, 0xfe93, 0xfe97, 0xfe9b,
+	0xfe9f, 0xfea3, 0xfea7, 0xfea9,
+	/* 0xd0*/
+	0xfeab, 0xfead, 0xfeaf, 0xfeb3,
+	0xfeb7, 0xfebb, 0xfebf, 0xfec1,
+	0xfec5, 0xfecb, 0xfecf, 0x00a6,
+	0x00ac, 0x00f7, 0x00d7, 0xfec9,
+	/* 0xe0*/
+	0x0640, 0xfed3, 0xfed7, 0xfedb,
+	0xfedf, 0xfee3, 0xfee7, 0xfeeb,
+	0xfeed, 0xfeef, 0xfef3, 0xfebd,
+	0xfecc, 0xfece, 0xfecd, 0xfee1,
+	/* 0xf0*/
+	0xfe7d, 0x0651, 0xfee5, 0xfee9,
+	0xfeec, 0xfef0, 0xfef2, 0xfed0,
+	0xfed5, 0xfef5, 0xfef6, 0xfedd,
+	0xfed9, 0xfef1, 0x25a0, 0x0000,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x00, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0x00, 0xc0, 0xa3, 0xa4, 0x00, 0xdb, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x97, 0xdc, 0xa1, 0x00, 0x00, /* 0xa8-0xaf */
+	0x80, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x98, 0x95, 0x94, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdd, /* 0xf0-0xf7 */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, /* 0xc0-0xc7 */
+};
+
+static const unsigned char page06[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x60-0x67 */
+	0xb8, 0xb9, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x82, 0x83, 0x00, 0x00, 0x00, 0x91, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+};
+
+static const unsigned char page25[256] = {
+	0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x8c, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x8f, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char pagefe[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xc1, 0xc2, 0xa2, 0xc3, 0xa5, 0xc4, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0xc6, 0x00, 0xc7, 0xa8, 0xa9, /* 0x88-0x8f */
+	0x00, 0xc8, 0x00, 0xc9, 0x00, 0xaa, 0x00, 0xca, /* 0x90-0x97 */
+	0x00, 0xab, 0x00, 0xcb, 0x00, 0xad, 0x00, 0xcc, /* 0x98-0x9f */
+	0x00, 0xae, 0x00, 0xcd, 0x00, 0xaf, 0x00, 0xce, /* 0xa0-0xa7 */
+	0x00, 0xcf, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0xd2, /* 0xa8-0xaf */
+	0x00, 0xbc, 0x00, 0xd3, 0x00, 0xbd, 0x00, 0xd4, /* 0xb0-0xb7 */
+	0x00, 0xbe, 0x00, 0xd5, 0x00, 0xeb, 0x00, 0xd6, /* 0xb8-0xbf */
+	0x00, 0xd7, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0xdf, 0xc5, 0xd9, 0xec, 0xee, 0xed, 0xda, /* 0xc8-0xcf */
+	0xf7, 0xba, 0x00, 0xe1, 0x00, 0xf8, 0x00, 0xe2, /* 0xd0-0xd7 */
+	0x00, 0xfc, 0x00, 0xe3, 0x00, 0xfb, 0x00, 0xe4, /* 0xd8-0xdf */
+	0x00, 0xef, 0x00, 0xe5, 0x00, 0xf2, 0x00, 0xe6, /* 0xe0-0xe7 */
+	0x00, 0xf3, 0x00, 0xe7, 0xf4, 0xe8, 0x00, 0xe9, /* 0xe8-0xef */
+	0xf5, 0xfd, 0xf6, 0xea, 0x00, 0xf9, 0xfa, 0x99, /* 0xf0-0xf7 */
+	0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   page03, NULL,   NULL,   page06, NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   page22, NULL,   NULL,   page25, NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   pagefe, NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x00, 0x91, 0x00, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp864",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp864(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp864(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp864)
+module_exit(exit_nls_cp864)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp865.c b/kernel/fs/nls/nls_cp865.c
new file mode 100644
index 000000000..111214228
--- /dev/null
+++ b/kernel/fs/nls/nls_cp865.c
@@ -0,0 +1,387 @@
+/*
+ * linux/fs/nls/nls_cp865.c
+ *
+ * Charset cp865 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x00c7, 0x00fc, 0x00e9, 0x00e2,
+	0x00e4, 0x00e0, 0x00e5, 0x00e7,
+	0x00ea, 0x00eb, 0x00e8, 0x00ef,
+	0x00ee, 0x00ec, 0x00c4, 0x00c5,
+	/* 0x90*/
+	0x00c9, 0x00e6, 0x00c6, 0x00f4,
+	0x00f6, 0x00f2, 0x00fb, 0x00f9,
+	0x00ff, 0x00d6, 0x00dc, 0x00f8,
+	0x00a3, 0x00d8, 0x20a7, 0x0192,
+	/* 0xa0*/
+	0x00e1, 0x00ed, 0x00f3, 0x00fa,
+	0x00f1, 0x00d1, 0x00aa, 0x00ba,
+	0x00bf, 0x2310, 0x00ac, 0x00bd,
+	0x00bc, 0x00a1, 0x00ab, 0x00a4,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x2561, 0x2562, 0x2556,
+	0x2555, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x255c, 0x255b, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x255e, 0x255f,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x2567,
+	/* 0xd0*/
+	0x2568, 0x2564, 0x2565, 0x2559,
+	0x2558, 0x2552, 0x2553, 0x256b,
+	0x256a, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x03b1, 0x00df, 0x0393, 0x03c0,
+	0x03a3, 0x03c3, 0x00b5, 0x03c4,
+	0x03a6, 0x0398, 0x03a9, 0x03b4,
+	0x221e, 0x03c6, 0x03b5, 0x2229,
+	/* 0xf0*/
+	0x2261, 0x00b1, 0x2265, 0x2264,
+	0x2320, 0x2321, 0x00f7, 0x2248,
+	0x00b0, 0x2219, 0x00b7, 0x221a,
+	0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0xad, 0x00, 0x9c, 0xaf, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
+	0x00, 0x00, 0xa7, 0x00, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
+	0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */
+	0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
+	0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
+	0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
+	0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */
+	0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page23[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+	0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+	0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+	0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   page03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   page22, page23, NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */
+	0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp865",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp865(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp865(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp865)
+module_exit(exit_nls_cp865)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp866.c b/kernel/fs/nls/nls_cp866.c
new file mode 100644
index 000000000..ffdcbc3fc
--- /dev/null
+++ b/kernel/fs/nls/nls_cp866.c
@@ -0,0 +1,305 @@
+/*
+ * linux/fs/nls/nls_cp866.c
+ *
+ * Charset cp866 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0410, 0x0411, 0x0412, 0x0413,
+	0x0414, 0x0415, 0x0416, 0x0417,
+	0x0418, 0x0419, 0x041a, 0x041b,
+	0x041c, 0x041d, 0x041e, 0x041f,
+	/* 0x90*/
+	0x0420, 0x0421, 0x0422, 0x0423,
+	0x0424, 0x0425, 0x0426, 0x0427,
+	0x0428, 0x0429, 0x042a, 0x042b,
+	0x042c, 0x042d, 0x042e, 0x042f,
+	/* 0xa0*/
+	0x0430, 0x0431, 0x0432, 0x0433,
+	0x0434, 0x0435, 0x0436, 0x0437,
+	0x0438, 0x0439, 0x043a, 0x043b,
+	0x043c, 0x043d, 0x043e, 0x043f,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x2561, 0x2562, 0x2556,
+	0x2555, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x255c, 0x255b, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x255e, 0x255f,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x2567,
+	/* 0xd0*/
+	0x2568, 0x2564, 0x2565, 0x2559,
+	0x2558, 0x2552, 0x2553, 0x256b,
+	0x256a, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x258c, 0x2590, 0x2580,
+	/* 0xe0*/
+	0x0440, 0x0441, 0x0442, 0x0443,
+	0x0444, 0x0445, 0x0446, 0x0447,
+	0x0448, 0x0449, 0x044a, 0x044b,
+	0x044c, 0x044d, 0x044e, 0x044f,
+	/* 0xf0*/
+	0x0401, 0x0451, 0x0404, 0x0454,
+	0x0407, 0x0457, 0x040e, 0x045e,
+	0x00b0, 0x2219, 0x00b7, 0x221a,
+	0x2116, 0x00a4, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */
+};
+
+static const unsigned char page04[256] = {
+	0x00, 0xf0, 0x00, 0x00, 0xf2, 0x00, 0x00, 0xf4, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0x00, /* 0x08-0x0f */
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x10-0x17 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x18-0x1f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x20-0x27 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x28-0x2f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x30-0x37 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x38-0x3f */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */
+	0x00, 0xf1, 0x00, 0x00, 0xf3, 0x00, 0x00, 0xf5, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf7, 0x00, /* 0x58-0x5f */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, /* 0x10-0x17 */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+	0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+	0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+	0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   page04, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   page21, page22, NULL,   NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x80-0x87 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x88-0x8f */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x90-0x97 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0xa0-0xa7 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0xe0-0xe7 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0xe8-0xef */
+	0xf0, 0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp866",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp866(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp866(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp866)
+module_exit(exit_nls_cp866)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp869.c b/kernel/fs/nls/nls_cp869.c
new file mode 100644
index 000000000..3b5a34589
--- /dev/null
+++ b/kernel/fs/nls/nls_cp869.c
@@ -0,0 +1,315 @@
+/*
+ * linux/fs/nls/nls_cp869.c
+ *
+ * Charset cp869 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0386, 0x0000,
+	0x00b7, 0x00ac, 0x00a6, 0x2018,
+	0x2019, 0x0388, 0x2015, 0x0389,
+	/* 0x90*/
+	0x038a, 0x03aa, 0x038c, 0x0000,
+	0x0000, 0x038e, 0x03ab, 0x00a9,
+	0x038f, 0x00b2, 0x00b3, 0x03ac,
+	0x00a3, 0x03ad, 0x03ae, 0x03af,
+	/* 0xa0*/
+	0x03ca, 0x0390, 0x03cc, 0x03cd,
+	0x0391, 0x0392, 0x0393, 0x0394,
+	0x0395, 0x0396, 0x0397, 0x00bd,
+	0x0398, 0x0399, 0x00ab, 0x00bb,
+	/* 0xb0*/
+	0x2591, 0x2592, 0x2593, 0x2502,
+	0x2524, 0x039a, 0x039b, 0x039c,
+	0x039d, 0x2563, 0x2551, 0x2557,
+	0x255d, 0x039e, 0x039f, 0x2510,
+	/* 0xc0*/
+	0x2514, 0x2534, 0x252c, 0x251c,
+	0x2500, 0x253c, 0x03a0, 0x03a1,
+	0x255a, 0x2554, 0x2569, 0x2566,
+	0x2560, 0x2550, 0x256c, 0x03a3,
+	/* 0xd0*/
+	0x03a4, 0x03a5, 0x03a6, 0x03a7,
+	0x03a8, 0x03a9, 0x03b1, 0x03b2,
+	0x03b3, 0x2518, 0x250c, 0x2588,
+	0x2584, 0x03b4, 0x03b5, 0x2580,
+	/* 0xe0*/
+	0x03b6, 0x03b7, 0x03b8, 0x03b9,
+	0x03ba, 0x03bb, 0x03bc, 0x03bd,
+	0x03be, 0x03bf, 0x03c0, 0x03c1,
+	0x03c3, 0x03c2, 0x03c4, 0x0384,
+	/* 0xf0*/
+	0x00ad, 0x00b1, 0x03c5, 0x03c6,
+	0x03c7, 0x00a7, 0x03c8, 0x0385,
+	0x00b0, 0x00a8, 0x03c9, 0x03cb,
+	0x03b0, 0x03ce, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xff, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x8a, 0xf5, /* 0xa0-0xa7 */
+	0xf9, 0x97, 0x00, 0xae, 0x89, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */
+	0xf8, 0xf1, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x88, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xaf, 0x00, 0xab, 0x00, 0x00, /* 0xb8-0xbf */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0xef, 0xf7, 0x86, 0x00, /* 0x80-0x87 */
+	0x8d, 0x8f, 0x90, 0x00, 0x92, 0x00, 0x95, 0x98, /* 0x88-0x8f */
+	0xa1, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, /* 0x90-0x97 */
+	0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, /* 0x98-0x9f */
+	0xc6, 0xc7, 0x00, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, /* 0xa0-0xa7 */
+	0xd4, 0xd5, 0x91, 0x96, 0x9b, 0x9d, 0x9e, 0x9f, /* 0xa8-0xaf */
+	0xfc, 0xd6, 0xd7, 0xd8, 0xdd, 0xde, 0xe0, 0xe1, /* 0xb0-0xb7 */
+	0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, /* 0xb8-0xbf */
+	0xea, 0xeb, 0xed, 0xec, 0xee, 0xf2, 0xf3, 0xf4, /* 0xc0-0xc7 */
+	0xf6, 0xfa, 0xa0, 0xfb, 0xa2, 0xa3, 0xfd, 0x00, /* 0xc8-0xcf */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, /* 0x10-0x17 */
+	0x8b, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+};
+
+static const unsigned char page25[256] = {
+	0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */
+	0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */
+	0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */
+	0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   page03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   NULL,   NULL,   NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9b, 0x00, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x9d, 0x8e, 0x9e, /* 0x88-0x8f */
+	0x9f, 0xa0, 0xa2, 0x00, 0x00, 0xa3, 0xfb, 0x97, /* 0x90-0x97 */
+	0xfd, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xd6, 0xd7, 0xd8, 0xdd, /* 0xa0-0xa7 */
+	0xde, 0xe0, 0xe1, 0xab, 0xe2, 0xe3, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xe4, 0xe5, 0xe6, /* 0xb0-0xb7 */
+	0xe7, 0xb9, 0xba, 0xbb, 0xbc, 0xe8, 0xe9, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xea, 0xeb, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xec, /* 0xc8-0xcf */
+	0xee, 0xf2, 0xf3, 0xf4, 0xf6, 0xfa, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x86, 0x9c, 0x8d, 0x8f, 0x90, /* 0x98-0x9f */
+	0x91, 0xa1, 0x92, 0x95, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xa4, 0xa5, /* 0xd0-0xd7 */
+	0xa6, 0xd9, 0xda, 0xdb, 0xdc, 0xa7, 0xa8, 0xdf, /* 0xd8-0xdf */
+	0xa9, 0xaa, 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, /* 0xe0-0xe7 */
+	0xbd, 0xbe, 0xc6, 0xc7, 0xcf, 0xcf, 0xd0, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xd1, 0xd2, 0xd3, 0xf5, 0xd4, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xd5, 0x96, 0xfc, 0x98, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp869",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp869(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp869(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp869)
+module_exit(exit_nls_cp869)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_cp874.c b/kernel/fs/nls/nls_cp874.c
new file mode 100644
index 000000000..8dfaa1071
--- /dev/null
+++ b/kernel/fs/nls/nls_cp874.c
@@ -0,0 +1,275 @@
+/*
+ * linux/fs/nls/nls_cp874.c
+ *
+ * Charset cp874 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x2026, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	/* 0x90*/
+	0x0000, 0x2018, 0x2019, 0x201c,
+	0x201d, 0x2022, 0x2013, 0x2014,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	/* 0xa0*/
+	0x00a0, 0x0e01, 0x0e02, 0x0e03,
+	0x0e04, 0x0e05, 0x0e06, 0x0e07,
+	0x0e08, 0x0e09, 0x0e0a, 0x0e0b,
+	0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f,
+	/* 0xb0*/
+	0x0e10, 0x0e11, 0x0e12, 0x0e13,
+	0x0e14, 0x0e15, 0x0e16, 0x0e17,
+	0x0e18, 0x0e19, 0x0e1a, 0x0e1b,
+	0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f,
+	/* 0xc0*/
+	0x0e20, 0x0e21, 0x0e22, 0x0e23,
+	0x0e24, 0x0e25, 0x0e26, 0x0e27,
+	0x0e28, 0x0e29, 0x0e2a, 0x0e2b,
+	0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f,
+	/* 0xd0*/
+	0x0e30, 0x0e31, 0x0e32, 0x0e33,
+	0x0e34, 0x0e35, 0x0e36, 0x0e37,
+	0x0e38, 0x0e39, 0x0e3a, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0e3f,
+	/* 0xe0*/
+	0x0e40, 0x0e41, 0x0e42, 0x0e43,
+	0x0e44, 0x0e45, 0x0e46, 0x0e47,
+	0x0e48, 0x0e49, 0x0e4a, 0x0e4b,
+	0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f,
+	/* 0xf0*/
+	0x0e50, 0x0e51, 0x0e52, 0x0e53,
+	0x0e54, 0x0e55, 0x0e56, 0x0e57,
+	0x0e58, 0x0e59, 0x0e5a, 0x0e5b,
+	0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char page0e[256] = {
+	0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x08-0x0f */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */
+	0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0x38-0x3f */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x91, 0x92, 0x00, 0x00, 0x93, 0x94, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   page0e, NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "cp874",
+	.alias		= "tis-620",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp874(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp874(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp874)
+module_exit(exit_nls_cp874)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NLS(tis-620);
diff --git a/kernel/fs/nls/nls_cp932.c b/kernel/fs/nls/nls_cp932.c
new file mode 100644
index 000000000..67b7398e8
--- /dev/null
+++ b/kernel/fs/nls/nls_cp932.c
@@ -0,0 +1,7933 @@
+/*
+ * linux/fs/nls/nls_cp932.c
+ *
+ * Charset cp932 translation tables.
+ * This translation table was generated automatically, the
+ * original table can be download from the Microsoft website.
+ * (http://www.microsoft.com/typography/unicode/unicodecp.htm)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t c2u_81[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x3000,0x3001,0x3002,0xFF0C,0xFF0E,0x30FB,0xFF1A,0xFF1B,/* 0x40-0x47 */
+	0xFF1F,0xFF01,0x309B,0x309C,0x00B4,0xFF40,0x00A8,0xFF3E,/* 0x48-0x4F */
+	0xFFE3,0xFF3F,0x30FD,0x30FE,0x309D,0x309E,0x3003,0x4EDD,/* 0x50-0x57 */
+	0x3005,0x3006,0x3007,0x30FC,0x2015,0x2010,0xFF0F,0xFF3C,/* 0x58-0x5F */
+	0xFF5E,0x2225,0xFF5C,0x2026,0x2025,0x2018,0x2019,0x201C,/* 0x60-0x67 */
+	0x201D,0xFF08,0xFF09,0x3014,0x3015,0xFF3B,0xFF3D,0xFF5B,/* 0x68-0x6F */
+	0xFF5D,0x3008,0x3009,0x300A,0x300B,0x300C,0x300D,0x300E,/* 0x70-0x77 */
+	0x300F,0x3010,0x3011,0xFF0B,0xFF0D,0x00B1,0x00D7,0x0000,/* 0x78-0x7F */
+
+	0x00F7,0xFF1D,0x2260,0xFF1C,0xFF1E,0x2266,0x2267,0x221E,/* 0x80-0x87 */
+	0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFFE5,/* 0x88-0x8F */
+	0xFF04,0xFFE0,0xFFE1,0xFF05,0xFF03,0xFF06,0xFF0A,0xFF20,/* 0x90-0x97 */
+	0x00A7,0x2606,0x2605,0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,/* 0x98-0x9F */
+	0x25A1,0x25A0,0x25B3,0x25B2,0x25BD,0x25BC,0x203B,0x3012,/* 0xA0-0xA7 */
+	0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,0x0000,0x0000,/* 0xA8-0xAF */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB0-0xB7 */
+	0x2208,0x220B,0x2286,0x2287,0x2282,0x2283,0x222A,0x2229,/* 0xB8-0xBF */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */
+	0x2227,0x2228,0xFFE2,0x21D2,0x21D4,0x2200,0x2203,0x0000,/* 0xC8-0xCF */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD0-0xD7 */
+	0x0000,0x0000,0x2220,0x22A5,0x2312,0x2202,0x2207,0x2261,/* 0xD8-0xDF */
+	0x2252,0x226A,0x226B,0x221A,0x223D,0x221D,0x2235,0x222B,/* 0xE0-0xE7 */
+	0x222C,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */
+	0x212B,0x2030,0x266F,0x266D,0x266A,0x2020,0x2021,0x00B6,/* 0xF0-0xF7 */
+	0x0000,0x0000,0x0000,0x0000,0x25EF,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_82[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xFF10,/* 0x48-0x4F */
+	0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,0xFF18,/* 0x50-0x57 */
+	0xFF19,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,0xFF28,/* 0x60-0x67 */
+	0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,0xFF30,/* 0x68-0x6F */
+	0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,0xFF38,/* 0x70-0x77 */
+	0xFF39,0xFF3A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0x80-0x87 */
+	0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0x88-0x8F */
+	0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0x90-0x97 */
+	0xFF58,0xFF59,0xFF5A,0x0000,0x0000,0x0000,0x0000,0x3041,/* 0x98-0x9F */
+	0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,0x3048,0x3049,/* 0xA0-0xA7 */
+	0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,0x3050,0x3051,/* 0xA8-0xAF */
+	0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,0x3058,0x3059,/* 0xB0-0xB7 */
+	0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,0x3060,0x3061,/* 0xB8-0xBF */
+	0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,0x3068,0x3069,/* 0xC0-0xC7 */
+	0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,0x3070,0x3071,/* 0xC8-0xCF */
+	0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,0x3078,0x3079,/* 0xD0-0xD7 */
+	0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,0x3080,0x3081,/* 0xD8-0xDF */
+	0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,0x3088,0x3089,/* 0xE0-0xE7 */
+	0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,0x3090,0x3091,/* 0xE8-0xEF */
+	0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */
+};
+
+static const wchar_t c2u_83[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,0x30A8,/* 0x40-0x47 */
+	0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,0x30B0,/* 0x48-0x4F */
+	0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,0x30B8,/* 0x50-0x57 */
+	0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,0x30C0,/* 0x58-0x5F */
+	0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,0x30C8,/* 0x60-0x67 */
+	0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,0x30D0,/* 0x68-0x6F */
+	0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,0x30D8,/* 0x70-0x77 */
+	0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,0x0000,/* 0x78-0x7F */
+
+	0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0x80-0x87 */
+	0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0x88-0x8F */
+	0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0391,/* 0x98-0x9F */
+	0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,0x0398,0x0399,/* 0xA0-0xA7 */
+	0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,0x03A0,0x03A1,/* 0xA8-0xAF */
+	0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,0x03A9,0x0000,/* 0xB0-0xB7 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x03B1,/* 0xB8-0xBF */
+	0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,0x03B8,0x03B9,/* 0xC0-0xC7 */
+	0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,0x03C0,0x03C1,/* 0xC8-0xCF */
+	0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,0x03C9,0x0000,/* 0xD0-0xD7 */
+};
+
+static const wchar_t c2u_84[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,0x0416,/* 0x40-0x47 */
+	0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,/* 0x48-0x4F */
+	0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,/* 0x50-0x57 */
+	0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,/* 0x58-0x5F */
+	0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,0x0436,/* 0x70-0x77 */
+	0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x0000,/* 0x78-0x7F */
+
+	0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0x80-0x87 */
+	0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0x88-0x8F */
+	0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x2500,/* 0x98-0x9F */
+	0x2502,0x250C,0x2510,0x2518,0x2514,0x251C,0x252C,0x2524,/* 0xA0-0xA7 */
+	0x2534,0x253C,0x2501,0x2503,0x250F,0x2513,0x251B,0x2517,/* 0xA8-0xAF */
+	0x2523,0x2533,0x252B,0x253B,0x254B,0x2520,0x252F,0x2528,/* 0xB0-0xB7 */
+	0x2537,0x253F,0x251D,0x2530,0x2525,0x2538,0x2542,0x0000,/* 0xB8-0xBF */
+};
+
+static const wchar_t c2u_87[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,0x2467,/* 0x40-0x47 */
+	0x2468,0x2469,0x246A,0x246B,0x246C,0x246D,0x246E,0x246F,/* 0x48-0x4F */
+	0x2470,0x2471,0x2472,0x2473,0x2160,0x2161,0x2162,0x2163,/* 0x50-0x57 */
+	0x2164,0x2165,0x2166,0x2167,0x2168,0x2169,0x0000,0x3349,/* 0x58-0x5F */
+	0x3314,0x3322,0x334D,0x3318,0x3327,0x3303,0x3336,0x3351,/* 0x60-0x67 */
+	0x3357,0x330D,0x3326,0x3323,0x332B,0x334A,0x333B,0x339C,/* 0x68-0x6F */
+	0x339D,0x339E,0x338E,0x338F,0x33C4,0x33A1,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x337B,0x0000,/* 0x78-0x7F */
+
+	0x301D,0x301F,0x2116,0x33CD,0x2121,0x32A4,0x32A5,0x32A6,/* 0x80-0x87 */
+	0x32A7,0x32A8,0x3231,0x3232,0x3239,0x337E,0x337D,0x337C,/* 0x88-0x8F */
+	0x2252,0x2261,0x222B,0x222E,0x2211,0x221A,0x22A5,0x2220,/* 0x90-0x97 */
+	0x221F,0x22BF,0x2235,0x2229,0x222A,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+};
+
+static const wchar_t c2u_88[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x4E9C,/* 0x98-0x9F */
+	0x5516,0x5A03,0x963F,0x54C0,0x611B,0x6328,0x59F6,0x9022,/* 0xA0-0xA7 */
+	0x8475,0x831C,0x7A50,0x60AA,0x63E1,0x6E25,0x65ED,0x8466,/* 0xA8-0xAF */
+	0x82A6,0x9BF5,0x6893,0x5727,0x65A1,0x6271,0x5B9B,0x59D0,/* 0xB0-0xB7 */
+	0x867B,0x98F4,0x7D62,0x7DBE,0x9B8E,0x6216,0x7C9F,0x88B7,/* 0xB8-0xBF */
+	0x5B89,0x5EB5,0x6309,0x6697,0x6848,0x95C7,0x978D,0x674F,/* 0xC0-0xC7 */
+	0x4EE5,0x4F0A,0x4F4D,0x4F9D,0x5049,0x56F2,0x5937,0x59D4,/* 0xC8-0xCF */
+	0x5A01,0x5C09,0x60DF,0x610F,0x6170,0x6613,0x6905,0x70BA,/* 0xD0-0xD7 */
+	0x754F,0x7570,0x79FB,0x7DAD,0x7DEF,0x80C3,0x840E,0x8863,/* 0xD8-0xDF */
+	0x8B02,0x9055,0x907A,0x533B,0x4E95,0x4EA5,0x57DF,0x80B2,/* 0xE0-0xE7 */
+	0x90C1,0x78EF,0x4E00,0x58F1,0x6EA2,0x9038,0x7A32,0x8328,/* 0xE8-0xEF */
+	0x828B,0x9C2F,0x5141,0x5370,0x54BD,0x54E1,0x56E0,0x59FB,/* 0xF0-0xF7 */
+	0x5F15,0x98F2,0x6DEB,0x80E4,0x852D,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_89[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9662,0x9670,0x96A0,0x97FB,0x540B,0x53F3,0x5B87,0x70CF,/* 0x40-0x47 */
+	0x7FBD,0x8FC2,0x96E8,0x536F,0x9D5C,0x7ABA,0x4E11,0x7893,/* 0x48-0x4F */
+	0x81FC,0x6E26,0x5618,0x5504,0x6B1D,0x851A,0x9C3B,0x59E5,/* 0x50-0x57 */
+	0x53A9,0x6D66,0x74DC,0x958F,0x5642,0x4E91,0x904B,0x96F2,/* 0x58-0x5F */
+	0x834F,0x990C,0x53E1,0x55B6,0x5B30,0x5F71,0x6620,0x66F3,/* 0x60-0x67 */
+	0x6804,0x6C38,0x6CF3,0x6D29,0x745B,0x76C8,0x7A4E,0x9834,/* 0x68-0x6F */
+	0x82F1,0x885B,0x8A60,0x92ED,0x6DB2,0x75AB,0x76CA,0x99C5,/* 0x70-0x77 */
+	0x60A6,0x8B01,0x8D8A,0x95B2,0x698E,0x53AD,0x5186,0x0000,/* 0x78-0x7F */
+
+	0x5712,0x5830,0x5944,0x5BB4,0x5EF6,0x6028,0x63A9,0x63F4,/* 0x80-0x87 */
+	0x6CBF,0x6F14,0x708E,0x7114,0x7159,0x71D5,0x733F,0x7E01,/* 0x88-0x8F */
+	0x8276,0x82D1,0x8597,0x9060,0x925B,0x9D1B,0x5869,0x65BC,/* 0x90-0x97 */
+	0x6C5A,0x7525,0x51F9,0x592E,0x5965,0x5F80,0x5FDC,0x62BC,/* 0x98-0x9F */
+	0x65FA,0x6A2A,0x6B27,0x6BB4,0x738B,0x7FC1,0x8956,0x9D2C,/* 0xA0-0xA7 */
+	0x9D0E,0x9EC4,0x5CA1,0x6C96,0x837B,0x5104,0x5C4B,0x61B6,/* 0xA8-0xAF */
+	0x81C6,0x6876,0x7261,0x4E59,0x4FFA,0x5378,0x6069,0x6E29,/* 0xB0-0xB7 */
+	0x7A4F,0x97F3,0x4E0B,0x5316,0x4EEE,0x4F55,0x4F3D,0x4FA1,/* 0xB8-0xBF */
+	0x4F73,0x52A0,0x53EF,0x5609,0x590F,0x5AC1,0x5BB6,0x5BE1,/* 0xC0-0xC7 */
+	0x79D1,0x6687,0x679C,0x67B6,0x6B4C,0x6CB3,0x706B,0x73C2,/* 0xC8-0xCF */
+	0x798D,0x79BE,0x7A3C,0x7B87,0x82B1,0x82DB,0x8304,0x8377,/* 0xD0-0xD7 */
+	0x83EF,0x83D3,0x8766,0x8AB2,0x5629,0x8CA8,0x8FE6,0x904E,/* 0xD8-0xDF */
+	0x971E,0x868A,0x4FC4,0x5CE8,0x6211,0x7259,0x753B,0x81E5,/* 0xE0-0xE7 */
+	0x82BD,0x86FE,0x8CC0,0x96C5,0x9913,0x99D5,0x4ECB,0x4F1A,/* 0xE8-0xEF */
+	0x89E3,0x56DE,0x584A,0x58CA,0x5EFB,0x5FEB,0x602A,0x6094,/* 0xF0-0xF7 */
+	0x6062,0x61D0,0x6212,0x62D0,0x6539,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8A[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9B41,0x6666,0x68B0,0x6D77,0x7070,0x754C,0x7686,0x7D75,/* 0x40-0x47 */
+	0x82A5,0x87F9,0x958B,0x968E,0x8C9D,0x51F1,0x52BE,0x5916,/* 0x48-0x4F */
+	0x54B3,0x5BB3,0x5D16,0x6168,0x6982,0x6DAF,0x788D,0x84CB,/* 0x50-0x57 */
+	0x8857,0x8A72,0x93A7,0x9AB8,0x6D6C,0x99A8,0x86D9,0x57A3,/* 0x58-0x5F */
+	0x67FF,0x86CE,0x920E,0x5283,0x5687,0x5404,0x5ED3,0x62E1,/* 0x60-0x67 */
+	0x64B9,0x683C,0x6838,0x6BBB,0x7372,0x78BA,0x7A6B,0x899A,/* 0x68-0x6F */
+	0x89D2,0x8D6B,0x8F03,0x90ED,0x95A3,0x9694,0x9769,0x5B66,/* 0x70-0x77 */
+	0x5CB3,0x697D,0x984D,0x984E,0x639B,0x7B20,0x6A2B,0x0000,/* 0x78-0x7F */
+
+	0x6A7F,0x68B6,0x9C0D,0x6F5F,0x5272,0x559D,0x6070,0x62EC,/* 0x80-0x87 */
+	0x6D3B,0x6E07,0x6ED1,0x845B,0x8910,0x8F44,0x4E14,0x9C39,/* 0x88-0x8F */
+	0x53F6,0x691B,0x6A3A,0x9784,0x682A,0x515C,0x7AC3,0x84B2,/* 0x90-0x97 */
+	0x91DC,0x938C,0x565B,0x9D28,0x6822,0x8305,0x8431,0x7CA5,/* 0x98-0x9F */
+	0x5208,0x82C5,0x74E6,0x4E7E,0x4F83,0x51A0,0x5BD2,0x520A,/* 0xA0-0xA7 */
+	0x52D8,0x52E7,0x5DFB,0x559A,0x582A,0x59E6,0x5B8C,0x5B98,/* 0xA8-0xAF */
+	0x5BDB,0x5E72,0x5E79,0x60A3,0x611F,0x6163,0x61BE,0x63DB,/* 0xB0-0xB7 */
+	0x6562,0x67D1,0x6853,0x68FA,0x6B3E,0x6B53,0x6C57,0x6F22,/* 0xB8-0xBF */
+	0x6F97,0x6F45,0x74B0,0x7518,0x76E3,0x770B,0x7AFF,0x7BA1,/* 0xC0-0xC7 */
+	0x7C21,0x7DE9,0x7F36,0x7FF0,0x809D,0x8266,0x839E,0x89B3,/* 0xC8-0xCF */
+	0x8ACC,0x8CAB,0x9084,0x9451,0x9593,0x9591,0x95A2,0x9665,/* 0xD0-0xD7 */
+	0x97D3,0x9928,0x8218,0x4E38,0x542B,0x5CB8,0x5DCC,0x73A9,/* 0xD8-0xDF */
+	0x764C,0x773C,0x5CA9,0x7FEB,0x8D0B,0x96C1,0x9811,0x9854,/* 0xE0-0xE7 */
+	0x9858,0x4F01,0x4F0E,0x5371,0x559C,0x5668,0x57FA,0x5947,/* 0xE8-0xEF */
+	0x5B09,0x5BC4,0x5C90,0x5E0C,0x5E7E,0x5FCC,0x63EE,0x673A,/* 0xF0-0xF7 */
+	0x65D7,0x65E2,0x671F,0x68CB,0x68C4,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8B[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6A5F,0x5E30,0x6BC5,0x6C17,0x6C7D,0x757F,0x7948,0x5B63,/* 0x40-0x47 */
+	0x7A00,0x7D00,0x5FBD,0x898F,0x8A18,0x8CB4,0x8D77,0x8ECC,/* 0x48-0x4F */
+	0x8F1D,0x98E2,0x9A0E,0x9B3C,0x4E80,0x507D,0x5100,0x5993,/* 0x50-0x57 */
+	0x5B9C,0x622F,0x6280,0x64EC,0x6B3A,0x72A0,0x7591,0x7947,/* 0x58-0x5F */
+	0x7FA9,0x87FB,0x8ABC,0x8B70,0x63AC,0x83CA,0x97A0,0x5409,/* 0x60-0x67 */
+	0x5403,0x55AB,0x6854,0x6A58,0x8A70,0x7827,0x6775,0x9ECD,/* 0x68-0x6F */
+	0x5374,0x5BA2,0x811A,0x8650,0x9006,0x4E18,0x4E45,0x4EC7,/* 0x70-0x77 */
+	0x4F11,0x53CA,0x5438,0x5BAE,0x5F13,0x6025,0x6551,0x0000,/* 0x78-0x7F */
+
+	0x673D,0x6C42,0x6C72,0x6CE3,0x7078,0x7403,0x7A76,0x7AAE,/* 0x80-0x87 */
+	0x7B08,0x7D1A,0x7CFE,0x7D66,0x65E7,0x725B,0x53BB,0x5C45,/* 0x88-0x8F */
+	0x5DE8,0x62D2,0x62E0,0x6319,0x6E20,0x865A,0x8A31,0x8DDD,/* 0x90-0x97 */
+	0x92F8,0x6F01,0x79A6,0x9B5A,0x4EA8,0x4EAB,0x4EAC,0x4F9B,/* 0x98-0x9F */
+	0x4FA0,0x50D1,0x5147,0x7AF6,0x5171,0x51F6,0x5354,0x5321,/* 0xA0-0xA7 */
+	0x537F,0x53EB,0x55AC,0x5883,0x5CE1,0x5F37,0x5F4A,0x602F,/* 0xA8-0xAF */
+	0x6050,0x606D,0x631F,0x6559,0x6A4B,0x6CC1,0x72C2,0x72ED,/* 0xB0-0xB7 */
+	0x77EF,0x80F8,0x8105,0x8208,0x854E,0x90F7,0x93E1,0x97FF,/* 0xB8-0xBF */
+	0x9957,0x9A5A,0x4EF0,0x51DD,0x5C2D,0x6681,0x696D,0x5C40,/* 0xC0-0xC7 */
+	0x66F2,0x6975,0x7389,0x6850,0x7C81,0x50C5,0x52E4,0x5747,/* 0xC8-0xCF */
+	0x5DFE,0x9326,0x65A4,0x6B23,0x6B3D,0x7434,0x7981,0x79BD,/* 0xD0-0xD7 */
+	0x7B4B,0x7DCA,0x82B9,0x83CC,0x887F,0x895F,0x8B39,0x8FD1,/* 0xD8-0xDF */
+	0x91D1,0x541F,0x9280,0x4E5D,0x5036,0x53E5,0x533A,0x72D7,/* 0xE0-0xE7 */
+	0x7396,0x77E9,0x82E6,0x8EAF,0x99C6,0x99C8,0x99D2,0x5177,/* 0xE8-0xEF */
+	0x611A,0x865E,0x55B0,0x7A7A,0x5076,0x5BD3,0x9047,0x9685,/* 0xF0-0xF7 */
+	0x4E32,0x6ADB,0x91E7,0x5C51,0x5C48,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8C[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6398,0x7A9F,0x6C93,0x9774,0x8F61,0x7AAA,0x718A,0x9688,/* 0x40-0x47 */
+	0x7C82,0x6817,0x7E70,0x6851,0x936C,0x52F2,0x541B,0x85AB,/* 0x48-0x4F */
+	0x8A13,0x7FA4,0x8ECD,0x90E1,0x5366,0x8888,0x7941,0x4FC2,/* 0x50-0x57 */
+	0x50BE,0x5211,0x5144,0x5553,0x572D,0x73EA,0x578B,0x5951,/* 0x58-0x5F */
+	0x5F62,0x5F84,0x6075,0x6176,0x6167,0x61A9,0x63B2,0x643A,/* 0x60-0x67 */
+	0x656C,0x666F,0x6842,0x6E13,0x7566,0x7A3D,0x7CFB,0x7D4C,/* 0x68-0x6F */
+	0x7D99,0x7E4B,0x7F6B,0x830E,0x834A,0x86CD,0x8A08,0x8A63,/* 0x70-0x77 */
+	0x8B66,0x8EFD,0x981A,0x9D8F,0x82B8,0x8FCE,0x9BE8,0x0000,/* 0x78-0x7F */
+
+	0x5287,0x621F,0x6483,0x6FC0,0x9699,0x6841,0x5091,0x6B20,/* 0x80-0x87 */
+	0x6C7A,0x6F54,0x7A74,0x7D50,0x8840,0x8A23,0x6708,0x4EF6,/* 0x88-0x8F */
+	0x5039,0x5026,0x5065,0x517C,0x5238,0x5263,0x55A7,0x570F,/* 0x90-0x97 */
+	0x5805,0x5ACC,0x5EFA,0x61B2,0x61F8,0x62F3,0x6372,0x691C,/* 0x98-0x9F */
+	0x6A29,0x727D,0x72AC,0x732E,0x7814,0x786F,0x7D79,0x770C,/* 0xA0-0xA7 */
+	0x80A9,0x898B,0x8B19,0x8CE2,0x8ED2,0x9063,0x9375,0x967A,/* 0xA8-0xAF */
+	0x9855,0x9A13,0x9E78,0x5143,0x539F,0x53B3,0x5E7B,0x5F26,/* 0xB0-0xB7 */
+	0x6E1B,0x6E90,0x7384,0x73FE,0x7D43,0x8237,0x8A00,0x8AFA,/* 0xB8-0xBF */
+	0x9650,0x4E4E,0x500B,0x53E4,0x547C,0x56FA,0x59D1,0x5B64,/* 0xC0-0xC7 */
+	0x5DF1,0x5EAB,0x5F27,0x6238,0x6545,0x67AF,0x6E56,0x72D0,/* 0xC8-0xCF */
+	0x7CCA,0x88B4,0x80A1,0x80E1,0x83F0,0x864E,0x8A87,0x8DE8,/* 0xD0-0xD7 */
+	0x9237,0x96C7,0x9867,0x9F13,0x4E94,0x4E92,0x4F0D,0x5348,/* 0xD8-0xDF */
+	0x5449,0x543E,0x5A2F,0x5F8C,0x5FA1,0x609F,0x68A7,0x6A8E,/* 0xE0-0xE7 */
+	0x745A,0x7881,0x8A9E,0x8AA4,0x8B77,0x9190,0x4E5E,0x9BC9,/* 0xE8-0xEF */
+	0x4EA4,0x4F7C,0x4FAF,0x5019,0x5016,0x5149,0x516C,0x529F,/* 0xF0-0xF7 */
+	0x52B9,0x52FE,0x539A,0x53E3,0x5411,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8D[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x540E,0x5589,0x5751,0x57A2,0x597D,0x5B54,0x5B5D,0x5B8F,/* 0x40-0x47 */
+	0x5DE5,0x5DE7,0x5DF7,0x5E78,0x5E83,0x5E9A,0x5EB7,0x5F18,/* 0x48-0x4F */
+	0x6052,0x614C,0x6297,0x62D8,0x63A7,0x653B,0x6602,0x6643,/* 0x50-0x57 */
+	0x66F4,0x676D,0x6821,0x6897,0x69CB,0x6C5F,0x6D2A,0x6D69,/* 0x58-0x5F */
+	0x6E2F,0x6E9D,0x7532,0x7687,0x786C,0x7A3F,0x7CE0,0x7D05,/* 0x60-0x67 */
+	0x7D18,0x7D5E,0x7DB1,0x8015,0x8003,0x80AF,0x80B1,0x8154,/* 0x68-0x6F */
+	0x818F,0x822A,0x8352,0x884C,0x8861,0x8B1B,0x8CA2,0x8CFC,/* 0x70-0x77 */
+	0x90CA,0x9175,0x9271,0x783F,0x92FC,0x95A4,0x964D,0x0000,/* 0x78-0x7F */
+
+	0x9805,0x9999,0x9AD8,0x9D3B,0x525B,0x52AB,0x53F7,0x5408,/* 0x80-0x87 */
+	0x58D5,0x62F7,0x6FE0,0x8C6A,0x8F5F,0x9EB9,0x514B,0x523B,/* 0x88-0x8F */
+	0x544A,0x56FD,0x7A40,0x9177,0x9D60,0x9ED2,0x7344,0x6F09,/* 0x90-0x97 */
+	0x8170,0x7511,0x5FFD,0x60DA,0x9AA8,0x72DB,0x8FBC,0x6B64,/* 0x98-0x9F */
+	0x9803,0x4ECA,0x56F0,0x5764,0x58BE,0x5A5A,0x6068,0x61C7,/* 0xA0-0xA7 */
+	0x660F,0x6606,0x6839,0x68B1,0x6DF7,0x75D5,0x7D3A,0x826E,/* 0xA8-0xAF */
+	0x9B42,0x4E9B,0x4F50,0x53C9,0x5506,0x5D6F,0x5DE6,0x5DEE,/* 0xB0-0xB7 */
+	0x67FB,0x6C99,0x7473,0x7802,0x8A50,0x9396,0x88DF,0x5750,/* 0xB8-0xBF */
+	0x5EA7,0x632B,0x50B5,0x50AC,0x518D,0x6700,0x54C9,0x585E,/* 0xC0-0xC7 */
+	0x59BB,0x5BB0,0x5F69,0x624D,0x63A1,0x683D,0x6B73,0x6E08,/* 0xC8-0xCF */
+	0x707D,0x91C7,0x7280,0x7815,0x7826,0x796D,0x658E,0x7D30,/* 0xD0-0xD7 */
+	0x83DC,0x88C1,0x8F09,0x969B,0x5264,0x5728,0x6750,0x7F6A,/* 0xD8-0xDF */
+	0x8CA1,0x51B4,0x5742,0x962A,0x583A,0x698A,0x80B4,0x54B2,/* 0xE0-0xE7 */
+	0x5D0E,0x57FC,0x7895,0x9DFA,0x4F5C,0x524A,0x548B,0x643E,/* 0xE8-0xEF */
+	0x6628,0x6714,0x67F5,0x7A84,0x7B56,0x7D22,0x932F,0x685C,/* 0xF0-0xF7 */
+	0x9BAD,0x7B39,0x5319,0x518A,0x5237,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8E[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5BDF,0x62F6,0x64AE,0x64E6,0x672D,0x6BBA,0x85A9,0x96D1,/* 0x40-0x47 */
+	0x7690,0x9BD6,0x634C,0x9306,0x9BAB,0x76BF,0x6652,0x4E09,/* 0x48-0x4F */
+	0x5098,0x53C2,0x5C71,0x60E8,0x6492,0x6563,0x685F,0x71E6,/* 0x50-0x57 */
+	0x73CA,0x7523,0x7B97,0x7E82,0x8695,0x8B83,0x8CDB,0x9178,/* 0x58-0x5F */
+	0x9910,0x65AC,0x66AB,0x6B8B,0x4ED5,0x4ED4,0x4F3A,0x4F7F,/* 0x60-0x67 */
+	0x523A,0x53F8,0x53F2,0x55E3,0x56DB,0x58EB,0x59CB,0x59C9,/* 0x68-0x6F */
+	0x59FF,0x5B50,0x5C4D,0x5E02,0x5E2B,0x5FD7,0x601D,0x6307,/* 0x70-0x77 */
+	0x652F,0x5B5C,0x65AF,0x65BD,0x65E8,0x679D,0x6B62,0x0000,/* 0x78-0x7F */
+
+	0x6B7B,0x6C0F,0x7345,0x7949,0x79C1,0x7CF8,0x7D19,0x7D2B,/* 0x80-0x87 */
+	0x80A2,0x8102,0x81F3,0x8996,0x8A5E,0x8A69,0x8A66,0x8A8C,/* 0x88-0x8F */
+	0x8AEE,0x8CC7,0x8CDC,0x96CC,0x98FC,0x6B6F,0x4E8B,0x4F3C,/* 0x90-0x97 */
+	0x4F8D,0x5150,0x5B57,0x5BFA,0x6148,0x6301,0x6642,0x6B21,/* 0x98-0x9F */
+	0x6ECB,0x6CBB,0x723E,0x74BD,0x75D4,0x78C1,0x793A,0x800C,/* 0xA0-0xA7 */
+	0x8033,0x81EA,0x8494,0x8F9E,0x6C50,0x9E7F,0x5F0F,0x8B58,/* 0xA8-0xAF */
+	0x9D2B,0x7AFA,0x8EF8,0x5B8D,0x96EB,0x4E03,0x53F1,0x57F7,/* 0xB0-0xB7 */
+	0x5931,0x5AC9,0x5BA4,0x6089,0x6E7F,0x6F06,0x75BE,0x8CEA,/* 0xB8-0xBF */
+	0x5B9F,0x8500,0x7BE0,0x5072,0x67F4,0x829D,0x5C61,0x854A,/* 0xC0-0xC7 */
+	0x7E1E,0x820E,0x5199,0x5C04,0x6368,0x8D66,0x659C,0x716E,/* 0xC8-0xCF */
+	0x793E,0x7D17,0x8005,0x8B1D,0x8ECA,0x906E,0x86C7,0x90AA,/* 0xD0-0xD7 */
+	0x501F,0x52FA,0x5C3A,0x6753,0x707C,0x7235,0x914C,0x91C8,/* 0xD8-0xDF */
+	0x932B,0x82E5,0x5BC2,0x5F31,0x60F9,0x4E3B,0x53D6,0x5B88,/* 0xE0-0xE7 */
+	0x624B,0x6731,0x6B8A,0x72E9,0x73E0,0x7A2E,0x816B,0x8DA3,/* 0xE8-0xEF */
+	0x9152,0x9996,0x5112,0x53D7,0x546A,0x5BFF,0x6388,0x6A39,/* 0xF0-0xF7 */
+	0x7DAC,0x9700,0x56DA,0x53CE,0x5468,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8F[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5B97,0x5C31,0x5DDE,0x4FEE,0x6101,0x62FE,0x6D32,0x79C0,/* 0x40-0x47 */
+	0x79CB,0x7D42,0x7E4D,0x7FD2,0x81ED,0x821F,0x8490,0x8846,/* 0x48-0x4F */
+	0x8972,0x8B90,0x8E74,0x8F2F,0x9031,0x914B,0x916C,0x96C6,/* 0x50-0x57 */
+	0x919C,0x4EC0,0x4F4F,0x5145,0x5341,0x5F93,0x620E,0x67D4,/* 0x58-0x5F */
+	0x6C41,0x6E0B,0x7363,0x7E26,0x91CD,0x9283,0x53D4,0x5919,/* 0x60-0x67 */
+	0x5BBF,0x6DD1,0x795D,0x7E2E,0x7C9B,0x587E,0x719F,0x51FA,/* 0x68-0x6F */
+	0x8853,0x8FF0,0x4FCA,0x5CFB,0x6625,0x77AC,0x7AE3,0x821C,/* 0x70-0x77 */
+	0x99FF,0x51C6,0x5FAA,0x65EC,0x696F,0x6B89,0x6DF3,0x0000,/* 0x78-0x7F */
+
+	0x6E96,0x6F64,0x76FE,0x7D14,0x5DE1,0x9075,0x9187,0x9806,/* 0x80-0x87 */
+	0x51E6,0x521D,0x6240,0x6691,0x66D9,0x6E1A,0x5EB6,0x7DD2,/* 0x88-0x8F */
+	0x7F72,0x66F8,0x85AF,0x85F7,0x8AF8,0x52A9,0x53D9,0x5973,/* 0x90-0x97 */
+	0x5E8F,0x5F90,0x6055,0x92E4,0x9664,0x50B7,0x511F,0x52DD,/* 0x98-0x9F */
+	0x5320,0x5347,0x53EC,0x54E8,0x5546,0x5531,0x5617,0x5968,/* 0xA0-0xA7 */
+	0x59BE,0x5A3C,0x5BB5,0x5C06,0x5C0F,0x5C11,0x5C1A,0x5E84,/* 0xA8-0xAF */
+	0x5E8A,0x5EE0,0x5F70,0x627F,0x6284,0x62DB,0x638C,0x6377,/* 0xB0-0xB7 */
+	0x6607,0x660C,0x662D,0x6676,0x677E,0x68A2,0x6A1F,0x6A35,/* 0xB8-0xBF */
+	0x6CBC,0x6D88,0x6E09,0x6E58,0x713C,0x7126,0x7167,0x75C7,/* 0xC0-0xC7 */
+	0x7701,0x785D,0x7901,0x7965,0x79F0,0x7AE0,0x7B11,0x7CA7,/* 0xC8-0xCF */
+	0x7D39,0x8096,0x83D6,0x848B,0x8549,0x885D,0x88F3,0x8A1F,/* 0xD0-0xD7 */
+	0x8A3C,0x8A54,0x8A73,0x8C61,0x8CDE,0x91A4,0x9266,0x937E,/* 0xD8-0xDF */
+	0x9418,0x969C,0x9798,0x4E0A,0x4E08,0x4E1E,0x4E57,0x5197,/* 0xE0-0xE7 */
+	0x5270,0x57CE,0x5834,0x58CC,0x5B22,0x5E38,0x60C5,0x64FE,/* 0xE8-0xEF */
+	0x6761,0x6756,0x6D44,0x72B6,0x7573,0x7A63,0x84B8,0x8B72,/* 0xF0-0xF7 */
+	0x91B8,0x9320,0x5631,0x57F4,0x98FE,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_90[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x62ED,0x690D,0x6B96,0x71ED,0x7E54,0x8077,0x8272,0x89E6,/* 0x40-0x47 */
+	0x98DF,0x8755,0x8FB1,0x5C3B,0x4F38,0x4FE1,0x4FB5,0x5507,/* 0x48-0x4F */
+	0x5A20,0x5BDD,0x5BE9,0x5FC3,0x614E,0x632F,0x65B0,0x664B,/* 0x50-0x57 */
+	0x68EE,0x699B,0x6D78,0x6DF1,0x7533,0x75B9,0x771F,0x795E,/* 0x58-0x5F */
+	0x79E6,0x7D33,0x81E3,0x82AF,0x85AA,0x89AA,0x8A3A,0x8EAB,/* 0x60-0x67 */
+	0x8F9B,0x9032,0x91DD,0x9707,0x4EBA,0x4EC1,0x5203,0x5875,/* 0x68-0x6F */
+	0x58EC,0x5C0B,0x751A,0x5C3D,0x814E,0x8A0A,0x8FC5,0x9663,/* 0x70-0x77 */
+	0x976D,0x7B25,0x8ACF,0x9808,0x9162,0x56F3,0x53A8,0x0000,/* 0x78-0x7F */
+
+	0x9017,0x5439,0x5782,0x5E25,0x63A8,0x6C34,0x708A,0x7761,/* 0x80-0x87 */
+	0x7C8B,0x7FE0,0x8870,0x9042,0x9154,0x9310,0x9318,0x968F,/* 0x88-0x8F */
+	0x745E,0x9AC4,0x5D07,0x5D69,0x6570,0x67A2,0x8DA8,0x96DB,/* 0x90-0x97 */
+	0x636E,0x6749,0x6919,0x83C5,0x9817,0x96C0,0x88FE,0x6F84,/* 0x98-0x9F */
+	0x647A,0x5BF8,0x4E16,0x702C,0x755D,0x662F,0x51C4,0x5236,/* 0xA0-0xA7 */
+	0x52E2,0x59D3,0x5F81,0x6027,0x6210,0x653F,0x6574,0x661F,/* 0xA8-0xAF */
+	0x6674,0x68F2,0x6816,0x6B63,0x6E05,0x7272,0x751F,0x76DB,/* 0xB0-0xB7 */
+	0x7CBE,0x8056,0x58F0,0x88FD,0x897F,0x8AA0,0x8A93,0x8ACB,/* 0xB8-0xBF */
+	0x901D,0x9192,0x9752,0x9759,0x6589,0x7A0E,0x8106,0x96BB,/* 0xC0-0xC7 */
+	0x5E2D,0x60DC,0x621A,0x65A5,0x6614,0x6790,0x77F3,0x7A4D,/* 0xC8-0xCF */
+	0x7C4D,0x7E3E,0x810A,0x8CAC,0x8D64,0x8DE1,0x8E5F,0x78A9,/* 0xD0-0xD7 */
+	0x5207,0x62D9,0x63A5,0x6442,0x6298,0x8A2D,0x7A83,0x7BC0,/* 0xD8-0xDF */
+	0x8AAC,0x96EA,0x7D76,0x820C,0x8749,0x4ED9,0x5148,0x5343,/* 0xE0-0xE7 */
+	0x5360,0x5BA3,0x5C02,0x5C16,0x5DDD,0x6226,0x6247,0x64B0,/* 0xE8-0xEF */
+	0x6813,0x6834,0x6CC9,0x6D45,0x6D17,0x67D3,0x6F5C,0x714E,/* 0xF0-0xF7 */
+	0x717D,0x65CB,0x7A7F,0x7BAD,0x7DDA,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_91[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7E4A,0x7FA8,0x817A,0x821B,0x8239,0x85A6,0x8A6E,0x8CCE,/* 0x40-0x47 */
+	0x8DF5,0x9078,0x9077,0x92AD,0x9291,0x9583,0x9BAE,0x524D,/* 0x48-0x4F */
+	0x5584,0x6F38,0x7136,0x5168,0x7985,0x7E55,0x81B3,0x7CCE,/* 0x50-0x57 */
+	0x564C,0x5851,0x5CA8,0x63AA,0x66FE,0x66FD,0x695A,0x72D9,/* 0x58-0x5F */
+	0x758F,0x758E,0x790E,0x7956,0x79DF,0x7C97,0x7D20,0x7D44,/* 0x60-0x67 */
+	0x8607,0x8A34,0x963B,0x9061,0x9F20,0x50E7,0x5275,0x53CC,/* 0x68-0x6F */
+	0x53E2,0x5009,0x55AA,0x58EE,0x594F,0x723D,0x5B8B,0x5C64,/* 0x70-0x77 */
+	0x531D,0x60E3,0x60F3,0x635C,0x6383,0x633F,0x63BB,0x0000,/* 0x78-0x7F */
+
+	0x64CD,0x65E9,0x66F9,0x5DE3,0x69CD,0x69FD,0x6F15,0x71E5,/* 0x80-0x87 */
+	0x4E89,0x75E9,0x76F8,0x7A93,0x7CDF,0x7DCF,0x7D9C,0x8061,/* 0x88-0x8F */
+	0x8349,0x8358,0x846C,0x84BC,0x85FB,0x88C5,0x8D70,0x9001,/* 0x90-0x97 */
+	0x906D,0x9397,0x971C,0x9A12,0x50CF,0x5897,0x618E,0x81D3,/* 0x98-0x9F */
+	0x8535,0x8D08,0x9020,0x4FC3,0x5074,0x5247,0x5373,0x606F,/* 0xA0-0xA7 */
+	0x6349,0x675F,0x6E2C,0x8DB3,0x901F,0x4FD7,0x5C5E,0x8CCA,/* 0xA8-0xAF */
+	0x65CF,0x7D9A,0x5352,0x8896,0x5176,0x63C3,0x5B58,0x5B6B,/* 0xB0-0xB7 */
+	0x5C0A,0x640D,0x6751,0x905C,0x4ED6,0x591A,0x592A,0x6C70,/* 0xB8-0xBF */
+	0x8A51,0x553E,0x5815,0x59A5,0x60F0,0x6253,0x67C1,0x8235,/* 0xC0-0xC7 */
+	0x6955,0x9640,0x99C4,0x9A28,0x4F53,0x5806,0x5BFE,0x8010,/* 0xC8-0xCF */
+	0x5CB1,0x5E2F,0x5F85,0x6020,0x614B,0x6234,0x66FF,0x6CF0,/* 0xD0-0xD7 */
+	0x6EDE,0x80CE,0x817F,0x82D4,0x888B,0x8CB8,0x9000,0x902E,/* 0xD8-0xDF */
+	0x968A,0x9EDB,0x9BDB,0x4EE3,0x53F0,0x5927,0x7B2C,0x918D,/* 0xE0-0xE7 */
+	0x984C,0x9DF9,0x6EDD,0x7027,0x5353,0x5544,0x5B85,0x6258,/* 0xE8-0xEF */
+	0x629E,0x62D3,0x6CA2,0x6FEF,0x7422,0x8A17,0x9438,0x6FC1,/* 0xF0-0xF7 */
+	0x8AFE,0x8338,0x51E7,0x86F8,0x53EA,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_92[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x53E9,0x4F46,0x9054,0x8FB0,0x596A,0x8131,0x5DFD,0x7AEA,/* 0x40-0x47 */
+	0x8FBF,0x68DA,0x8C37,0x72F8,0x9C48,0x6A3D,0x8AB0,0x4E39,/* 0x48-0x4F */
+	0x5358,0x5606,0x5766,0x62C5,0x63A2,0x65E6,0x6B4E,0x6DE1,/* 0x50-0x57 */
+	0x6E5B,0x70AD,0x77ED,0x7AEF,0x7BAA,0x7DBB,0x803D,0x80C6,/* 0x58-0x5F */
+	0x86CB,0x8A95,0x935B,0x56E3,0x58C7,0x5F3E,0x65AD,0x6696,/* 0x60-0x67 */
+	0x6A80,0x6BB5,0x7537,0x8AC7,0x5024,0x77E5,0x5730,0x5F1B,/* 0x68-0x6F */
+	0x6065,0x667A,0x6C60,0x75F4,0x7A1A,0x7F6E,0x81F4,0x8718,/* 0x70-0x77 */
+	0x9045,0x99B3,0x7BC9,0x755C,0x7AF9,0x7B51,0x84C4,0x0000,/* 0x78-0x7F */
+
+	0x9010,0x79E9,0x7A92,0x8336,0x5AE1,0x7740,0x4E2D,0x4EF2,/* 0x80-0x87 */
+	0x5B99,0x5FE0,0x62BD,0x663C,0x67F1,0x6CE8,0x866B,0x8877,/* 0x88-0x8F */
+	0x8A3B,0x914E,0x92F3,0x99D0,0x6A17,0x7026,0x732A,0x82E7,/* 0x90-0x97 */
+	0x8457,0x8CAF,0x4E01,0x5146,0x51CB,0x558B,0x5BF5,0x5E16,/* 0x98-0x9F */
+	0x5E33,0x5E81,0x5F14,0x5F35,0x5F6B,0x5FB4,0x61F2,0x6311,/* 0xA0-0xA7 */
+	0x66A2,0x671D,0x6F6E,0x7252,0x753A,0x773A,0x8074,0x8139,/* 0xA8-0xAF */
+	0x8178,0x8776,0x8ABF,0x8ADC,0x8D85,0x8DF3,0x929A,0x9577,/* 0xB0-0xB7 */
+	0x9802,0x9CE5,0x52C5,0x6357,0x76F4,0x6715,0x6C88,0x73CD,/* 0xB8-0xBF */
+	0x8CC3,0x93AE,0x9673,0x6D25,0x589C,0x690E,0x69CC,0x8FFD,/* 0xC0-0xC7 */
+	0x939A,0x75DB,0x901A,0x585A,0x6802,0x63B4,0x69FB,0x4F43,/* 0xC8-0xCF */
+	0x6F2C,0x67D8,0x8FBB,0x8526,0x7DB4,0x9354,0x693F,0x6F70,/* 0xD0-0xD7 */
+	0x576A,0x58F7,0x5B2C,0x7D2C,0x722A,0x540A,0x91E3,0x9DB4,/* 0xD8-0xDF */
+	0x4EAD,0x4F4E,0x505C,0x5075,0x5243,0x8C9E,0x5448,0x5824,/* 0xE0-0xE7 */
+	0x5B9A,0x5E1D,0x5E95,0x5EAD,0x5EF7,0x5F1F,0x608C,0x62B5,/* 0xE8-0xEF */
+	0x633A,0x63D0,0x68AF,0x6C40,0x7887,0x798E,0x7A0B,0x7DE0,/* 0xF0-0xF7 */
+	0x8247,0x8A02,0x8AE6,0x8E44,0x9013,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_93[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x90B8,0x912D,0x91D8,0x9F0E,0x6CE5,0x6458,0x64E2,0x6575,/* 0x40-0x47 */
+	0x6EF4,0x7684,0x7B1B,0x9069,0x93D1,0x6EBA,0x54F2,0x5FB9,/* 0x48-0x4F */
+	0x64A4,0x8F4D,0x8FED,0x9244,0x5178,0x586B,0x5929,0x5C55,/* 0x50-0x57 */
+	0x5E97,0x6DFB,0x7E8F,0x751C,0x8CBC,0x8EE2,0x985B,0x70B9,/* 0x58-0x5F */
+	0x4F1D,0x6BBF,0x6FB1,0x7530,0x96FB,0x514E,0x5410,0x5835,/* 0x60-0x67 */
+	0x5857,0x59AC,0x5C60,0x5F92,0x6597,0x675C,0x6E21,0x767B,/* 0x68-0x6F */
+	0x83DF,0x8CED,0x9014,0x90FD,0x934D,0x7825,0x783A,0x52AA,/* 0x70-0x77 */
+	0x5EA6,0x571F,0x5974,0x6012,0x5012,0x515A,0x51AC,0x0000,/* 0x78-0x7F */
+
+	0x51CD,0x5200,0x5510,0x5854,0x5858,0x5957,0x5B95,0x5CF6,/* 0x80-0x87 */
+	0x5D8B,0x60BC,0x6295,0x642D,0x6771,0x6843,0x68BC,0x68DF,/* 0x88-0x8F */
+	0x76D7,0x6DD8,0x6E6F,0x6D9B,0x706F,0x71C8,0x5F53,0x75D8,/* 0x90-0x97 */
+	0x7977,0x7B49,0x7B54,0x7B52,0x7CD6,0x7D71,0x5230,0x8463,/* 0x98-0x9F */
+	0x8569,0x85E4,0x8A0E,0x8B04,0x8C46,0x8E0F,0x9003,0x900F,/* 0xA0-0xA7 */
+	0x9419,0x9676,0x982D,0x9A30,0x95D8,0x50CD,0x52D5,0x540C,/* 0xA8-0xAF */
+	0x5802,0x5C0E,0x61A7,0x649E,0x6D1E,0x77B3,0x7AE5,0x80F4,/* 0xB0-0xB7 */
+	0x8404,0x9053,0x9285,0x5CE0,0x9D07,0x533F,0x5F97,0x5FB3,/* 0xB8-0xBF */
+	0x6D9C,0x7279,0x7763,0x79BF,0x7BE4,0x6BD2,0x72EC,0x8AAD,/* 0xC0-0xC7 */
+	0x6803,0x6A61,0x51F8,0x7A81,0x6934,0x5C4A,0x9CF6,0x82EB,/* 0xC8-0xCF */
+	0x5BC5,0x9149,0x701E,0x5678,0x5C6F,0x60C7,0x6566,0x6C8C,/* 0xD0-0xD7 */
+	0x8C5A,0x9041,0x9813,0x5451,0x66C7,0x920D,0x5948,0x90A3,/* 0xD8-0xDF */
+	0x5185,0x4E4D,0x51EA,0x8599,0x8B0E,0x7058,0x637A,0x934B,/* 0xE0-0xE7 */
+	0x6962,0x99B4,0x7E04,0x7577,0x5357,0x6960,0x8EDF,0x96E3,/* 0xE8-0xEF */
+	0x6C5D,0x4E8C,0x5C3C,0x5F10,0x8FE9,0x5302,0x8CD1,0x8089,/* 0xF0-0xF7 */
+	0x8679,0x5EFF,0x65E5,0x4E73,0x5165,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_94[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5982,0x5C3F,0x97EE,0x4EFB,0x598A,0x5FCD,0x8A8D,0x6FE1,/* 0x40-0x47 */
+	0x79B0,0x7962,0x5BE7,0x8471,0x732B,0x71B1,0x5E74,0x5FF5,/* 0x48-0x4F */
+	0x637B,0x649A,0x71C3,0x7C98,0x4E43,0x5EFC,0x4E4B,0x57DC,/* 0x50-0x57 */
+	0x56A2,0x60A9,0x6FC3,0x7D0D,0x80FD,0x8133,0x81BF,0x8FB2,/* 0x58-0x5F */
+	0x8997,0x86A4,0x5DF4,0x628A,0x64AD,0x8987,0x6777,0x6CE2,/* 0x60-0x67 */
+	0x6D3E,0x7436,0x7834,0x5A46,0x7F75,0x82AD,0x99AC,0x4FF3,/* 0x68-0x6F */
+	0x5EC3,0x62DD,0x6392,0x6557,0x676F,0x76C3,0x724C,0x80CC,/* 0x70-0x77 */
+	0x80BA,0x8F29,0x914D,0x500D,0x57F9,0x5A92,0x6885,0x0000,/* 0x78-0x7F */
+
+	0x6973,0x7164,0x72FD,0x8CB7,0x58F2,0x8CE0,0x966A,0x9019,/* 0x80-0x87 */
+	0x877F,0x79E4,0x77E7,0x8429,0x4F2F,0x5265,0x535A,0x62CD,/* 0x88-0x8F */
+	0x67CF,0x6CCA,0x767D,0x7B94,0x7C95,0x8236,0x8584,0x8FEB,/* 0x90-0x97 */
+	0x66DD,0x6F20,0x7206,0x7E1B,0x83AB,0x99C1,0x9EA6,0x51FD,/* 0x98-0x9F */
+	0x7BB1,0x7872,0x7BB8,0x8087,0x7B48,0x6AE8,0x5E61,0x808C,/* 0xA0-0xA7 */
+	0x7551,0x7560,0x516B,0x9262,0x6E8C,0x767A,0x9197,0x9AEA,/* 0xA8-0xAF */
+	0x4F10,0x7F70,0x629C,0x7B4F,0x95A5,0x9CE9,0x567A,0x5859,/* 0xB0-0xB7 */
+	0x86E4,0x96BC,0x4F34,0x5224,0x534A,0x53CD,0x53DB,0x5E06,/* 0xB8-0xBF */
+	0x642C,0x6591,0x677F,0x6C3E,0x6C4E,0x7248,0x72AF,0x73ED,/* 0xC0-0xC7 */
+	0x7554,0x7E41,0x822C,0x85E9,0x8CA9,0x7BC4,0x91C6,0x7169,/* 0xC8-0xCF */
+	0x9812,0x98EF,0x633D,0x6669,0x756A,0x76E4,0x78D0,0x8543,/* 0xD0-0xD7 */
+	0x86EE,0x532A,0x5351,0x5426,0x5983,0x5E87,0x5F7C,0x60B2,/* 0xD8-0xDF */
+	0x6249,0x6279,0x62AB,0x6590,0x6BD4,0x6CCC,0x75B2,0x76AE,/* 0xE0-0xE7 */
+	0x7891,0x79D8,0x7DCB,0x7F77,0x80A5,0x88AB,0x8AB9,0x8CBB,/* 0xE8-0xEF */
+	0x907F,0x975E,0x98DB,0x6A0B,0x7C38,0x5099,0x5C3E,0x5FAE,/* 0xF0-0xF7 */
+	0x6787,0x6BD8,0x7435,0x7709,0x7F8E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_95[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9F3B,0x67CA,0x7A17,0x5339,0x758B,0x9AED,0x5F66,0x819D,/* 0x40-0x47 */
+	0x83F1,0x8098,0x5F3C,0x5FC5,0x7562,0x7B46,0x903C,0x6867,/* 0x48-0x4F */
+	0x59EB,0x5A9B,0x7D10,0x767E,0x8B2C,0x4FF5,0x5F6A,0x6A19,/* 0x50-0x57 */
+	0x6C37,0x6F02,0x74E2,0x7968,0x8868,0x8A55,0x8C79,0x5EDF,/* 0x58-0x5F */
+	0x63CF,0x75C5,0x79D2,0x82D7,0x9328,0x92F2,0x849C,0x86ED,/* 0x60-0x67 */
+	0x9C2D,0x54C1,0x5F6C,0x658C,0x6D5C,0x7015,0x8CA7,0x8CD3,/* 0x68-0x6F */
+	0x983B,0x654F,0x74F6,0x4E0D,0x4ED8,0x57E0,0x592B,0x5A66,/* 0x70-0x77 */
+	0x5BCC,0x51A8,0x5E03,0x5E9C,0x6016,0x6276,0x6577,0x0000,/* 0x78-0x7F */
+
+	0x65A7,0x666E,0x6D6E,0x7236,0x7B26,0x8150,0x819A,0x8299,/* 0x80-0x87 */
+	0x8B5C,0x8CA0,0x8CE6,0x8D74,0x961C,0x9644,0x4FAE,0x64AB,/* 0x88-0x8F */
+	0x6B66,0x821E,0x8461,0x856A,0x90E8,0x5C01,0x6953,0x98A8,/* 0x90-0x97 */
+	0x847A,0x8557,0x4F0F,0x526F,0x5FA9,0x5E45,0x670D,0x798F,/* 0x98-0x9F */
+	0x8179,0x8907,0x8986,0x6DF5,0x5F17,0x6255,0x6CB8,0x4ECF,/* 0xA0-0xA7 */
+	0x7269,0x9B92,0x5206,0x543B,0x5674,0x58B3,0x61A4,0x626E,/* 0xA8-0xAF */
+	0x711A,0x596E,0x7C89,0x7CDE,0x7D1B,0x96F0,0x6587,0x805E,/* 0xB0-0xB7 */
+	0x4E19,0x4F75,0x5175,0x5840,0x5E63,0x5E73,0x5F0A,0x67C4,/* 0xB8-0xBF */
+	0x4E26,0x853D,0x9589,0x965B,0x7C73,0x9801,0x50FB,0x58C1,/* 0xC0-0xC7 */
+	0x7656,0x78A7,0x5225,0x77A5,0x8511,0x7B86,0x504F,0x5909,/* 0xC8-0xCF */
+	0x7247,0x7BC7,0x7DE8,0x8FBA,0x8FD4,0x904D,0x4FBF,0x52C9,/* 0xD0-0xD7 */
+	0x5A29,0x5F01,0x97AD,0x4FDD,0x8217,0x92EA,0x5703,0x6355,/* 0xD8-0xDF */
+	0x6B69,0x752B,0x88DC,0x8F14,0x7A42,0x52DF,0x5893,0x6155,/* 0xE0-0xE7 */
+	0x620A,0x66AE,0x6BCD,0x7C3F,0x83E9,0x5023,0x4FF8,0x5305,/* 0xE8-0xEF */
+	0x5446,0x5831,0x5949,0x5B9D,0x5CF0,0x5CEF,0x5D29,0x5E96,/* 0xF0-0xF7 */
+	0x62B1,0x6367,0x653E,0x65B9,0x670B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_96[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6CD5,0x6CE1,0x70F9,0x7832,0x7E2B,0x80DE,0x82B3,0x840C,/* 0x40-0x47 */
+	0x84EC,0x8702,0x8912,0x8A2A,0x8C4A,0x90A6,0x92D2,0x98FD,/* 0x48-0x4F */
+	0x9CF3,0x9D6C,0x4E4F,0x4EA1,0x508D,0x5256,0x574A,0x59A8,/* 0x50-0x57 */
+	0x5E3D,0x5FD8,0x5FD9,0x623F,0x66B4,0x671B,0x67D0,0x68D2,/* 0x58-0x5F */
+	0x5192,0x7D21,0x80AA,0x81A8,0x8B00,0x8C8C,0x8CBF,0x927E,/* 0x60-0x67 */
+	0x9632,0x5420,0x982C,0x5317,0x50D5,0x535C,0x58A8,0x64B2,/* 0x68-0x6F */
+	0x6734,0x7267,0x7766,0x7A46,0x91E6,0x52C3,0x6CA1,0x6B86,/* 0x70-0x77 */
+	0x5800,0x5E4C,0x5954,0x672C,0x7FFB,0x51E1,0x76C6,0x0000,/* 0x78-0x7F */
+
+	0x6469,0x78E8,0x9B54,0x9EBB,0x57CB,0x59B9,0x6627,0x679A,/* 0x80-0x87 */
+	0x6BCE,0x54E9,0x69D9,0x5E55,0x819C,0x6795,0x9BAA,0x67FE,/* 0x88-0x8F */
+	0x9C52,0x685D,0x4EA6,0x4FE3,0x53C8,0x62B9,0x672B,0x6CAB,/* 0x90-0x97 */
+	0x8FC4,0x4FAD,0x7E6D,0x9EBF,0x4E07,0x6162,0x6E80,0x6F2B,/* 0x98-0x9F */
+	0x8513,0x5473,0x672A,0x9B45,0x5DF3,0x7B95,0x5CAC,0x5BC6,/* 0xA0-0xA7 */
+	0x871C,0x6E4A,0x84D1,0x7A14,0x8108,0x5999,0x7C8D,0x6C11,/* 0xA8-0xAF */
+	0x7720,0x52D9,0x5922,0x7121,0x725F,0x77DB,0x9727,0x9D61,/* 0xB0-0xB7 */
+	0x690B,0x5A7F,0x5A18,0x51A5,0x540D,0x547D,0x660E,0x76DF,/* 0xB8-0xBF */
+	0x8FF7,0x9298,0x9CF4,0x59EA,0x725D,0x6EC5,0x514D,0x68C9,/* 0xC0-0xC7 */
+	0x7DBF,0x7DEC,0x9762,0x9EBA,0x6478,0x6A21,0x8302,0x5984,/* 0xC8-0xCF */
+	0x5B5F,0x6BDB,0x731B,0x76F2,0x7DB2,0x8017,0x8499,0x5132,/* 0xD0-0xD7 */
+	0x6728,0x9ED9,0x76EE,0x6762,0x52FF,0x9905,0x5C24,0x623B,/* 0xD8-0xDF */
+	0x7C7E,0x8CB0,0x554F,0x60B6,0x7D0B,0x9580,0x5301,0x4E5F,/* 0xE0-0xE7 */
+	0x51B6,0x591C,0x723A,0x8036,0x91CE,0x5F25,0x77E2,0x5384,/* 0xE8-0xEF */
+	0x5F79,0x7D04,0x85AC,0x8A33,0x8E8D,0x9756,0x67F3,0x85AE,/* 0xF0-0xF7 */
+	0x9453,0x6109,0x6108,0x6CB9,0x7652,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_97[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8AED,0x8F38,0x552F,0x4F51,0x512A,0x52C7,0x53CB,0x5BA5,/* 0x40-0x47 */
+	0x5E7D,0x60A0,0x6182,0x63D6,0x6709,0x67DA,0x6E67,0x6D8C,/* 0x48-0x4F */
+	0x7336,0x7337,0x7531,0x7950,0x88D5,0x8A98,0x904A,0x9091,/* 0x50-0x57 */
+	0x90F5,0x96C4,0x878D,0x5915,0x4E88,0x4F59,0x4E0E,0x8A89,/* 0x58-0x5F */
+	0x8F3F,0x9810,0x50AD,0x5E7C,0x5996,0x5BB9,0x5EB8,0x63DA,/* 0x60-0x67 */
+	0x63FA,0x64C1,0x66DC,0x694A,0x69D8,0x6D0B,0x6EB6,0x7194,/* 0x68-0x6F */
+	0x7528,0x7AAF,0x7F8A,0x8000,0x8449,0x84C9,0x8981,0x8B21,/* 0x70-0x77 */
+	0x8E0A,0x9065,0x967D,0x990A,0x617E,0x6291,0x6B32,0x0000,/* 0x78-0x7F */
+
+	0x6C83,0x6D74,0x7FCC,0x7FFC,0x6DC0,0x7F85,0x87BA,0x88F8,/* 0x80-0x87 */
+	0x6765,0x83B1,0x983C,0x96F7,0x6D1B,0x7D61,0x843D,0x916A,/* 0x88-0x8F */
+	0x4E71,0x5375,0x5D50,0x6B04,0x6FEB,0x85CD,0x862D,0x89A7,/* 0x90-0x97 */
+	0x5229,0x540F,0x5C65,0x674E,0x68A8,0x7406,0x7483,0x75E2,/* 0x98-0x9F */
+	0x88CF,0x88E1,0x91CC,0x96E2,0x9678,0x5F8B,0x7387,0x7ACB,/* 0xA0-0xA7 */
+	0x844E,0x63A0,0x7565,0x5289,0x6D41,0x6E9C,0x7409,0x7559,/* 0xA8-0xAF */
+	0x786B,0x7C92,0x9686,0x7ADC,0x9F8D,0x4FB6,0x616E,0x65C5,/* 0xB0-0xB7 */
+	0x865C,0x4E86,0x4EAE,0x50DA,0x4E21,0x51CC,0x5BEE,0x6599,/* 0xB8-0xBF */
+	0x6881,0x6DBC,0x731F,0x7642,0x77AD,0x7A1C,0x7CE7,0x826F,/* 0xC0-0xC7 */
+	0x8AD2,0x907C,0x91CF,0x9675,0x9818,0x529B,0x7DD1,0x502B,/* 0xC8-0xCF */
+	0x5398,0x6797,0x6DCB,0x71D0,0x7433,0x81E8,0x8F2A,0x96A3,/* 0xD0-0xD7 */
+	0x9C57,0x9E9F,0x7460,0x5841,0x6D99,0x7D2F,0x985E,0x4EE4,/* 0xD8-0xDF */
+	0x4F36,0x4F8B,0x51B7,0x52B1,0x5DBA,0x601C,0x73B2,0x793C,/* 0xE0-0xE7 */
+	0x82D3,0x9234,0x96B7,0x96F6,0x970A,0x9E97,0x9F62,0x66A6,/* 0xE8-0xEF */
+	0x6B74,0x5217,0x52A3,0x70C8,0x88C2,0x5EC9,0x604B,0x6190,/* 0xF0-0xF7 */
+	0x6F23,0x7149,0x7C3E,0x7DF4,0x806F,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_98[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x84EE,0x9023,0x932C,0x5442,0x9B6F,0x6AD3,0x7089,0x8CC2,/* 0x40-0x47 */
+	0x8DEF,0x9732,0x52B4,0x5A41,0x5ECA,0x5F04,0x6717,0x697C,/* 0x48-0x4F */
+	0x6994,0x6D6A,0x6F0F,0x7262,0x72FC,0x7BED,0x8001,0x807E,/* 0x50-0x57 */
+	0x874B,0x90CE,0x516D,0x9E93,0x7984,0x808B,0x9332,0x8AD6,/* 0x58-0x5F */
+	0x502D,0x548C,0x8A71,0x6B6A,0x8CC4,0x8107,0x60D1,0x67A0,/* 0x60-0x67 */
+	0x9DF2,0x4E99,0x4E98,0x9C10,0x8A6B,0x85C1,0x8568,0x6900,/* 0x68-0x6F */
+	0x6E7E,0x7897,0x8155,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x5F0C,/* 0x98-0x9F */
+	0x4E10,0x4E15,0x4E2A,0x4E31,0x4E36,0x4E3C,0x4E3F,0x4E42,/* 0xA0-0xA7 */
+	0x4E56,0x4E58,0x4E82,0x4E85,0x8C6B,0x4E8A,0x8212,0x5F0D,/* 0xA8-0xAF */
+	0x4E8E,0x4E9E,0x4E9F,0x4EA0,0x4EA2,0x4EB0,0x4EB3,0x4EB6,/* 0xB0-0xB7 */
+	0x4ECE,0x4ECD,0x4EC4,0x4EC6,0x4EC2,0x4ED7,0x4EDE,0x4EED,/* 0xB8-0xBF */
+	0x4EDF,0x4EF7,0x4F09,0x4F5A,0x4F30,0x4F5B,0x4F5D,0x4F57,/* 0xC0-0xC7 */
+	0x4F47,0x4F76,0x4F88,0x4F8F,0x4F98,0x4F7B,0x4F69,0x4F70,/* 0xC8-0xCF */
+	0x4F91,0x4F6F,0x4F86,0x4F96,0x5118,0x4FD4,0x4FDF,0x4FCE,/* 0xD0-0xD7 */
+	0x4FD8,0x4FDB,0x4FD1,0x4FDA,0x4FD0,0x4FE4,0x4FE5,0x501A,/* 0xD8-0xDF */
+	0x5028,0x5014,0x502A,0x5025,0x5005,0x4F1C,0x4FF6,0x5021,/* 0xE0-0xE7 */
+	0x5029,0x502C,0x4FFE,0x4FEF,0x5011,0x5006,0x5043,0x5047,/* 0xE8-0xEF */
+	0x6703,0x5055,0x5050,0x5048,0x505A,0x5056,0x506C,0x5078,/* 0xF0-0xF7 */
+	0x5080,0x509A,0x5085,0x50B4,0x50B2,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_99[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x50C9,0x50CA,0x50B3,0x50C2,0x50D6,0x50DE,0x50E5,0x50ED,/* 0x40-0x47 */
+	0x50E3,0x50EE,0x50F9,0x50F5,0x5109,0x5101,0x5102,0x5116,/* 0x48-0x4F */
+	0x5115,0x5114,0x511A,0x5121,0x513A,0x5137,0x513C,0x513B,/* 0x50-0x57 */
+	0x513F,0x5140,0x5152,0x514C,0x5154,0x5162,0x7AF8,0x5169,/* 0x58-0x5F */
+	0x516A,0x516E,0x5180,0x5182,0x56D8,0x518C,0x5189,0x518F,/* 0x60-0x67 */
+	0x5191,0x5193,0x5195,0x5196,0x51A4,0x51A6,0x51A2,0x51A9,/* 0x68-0x6F */
+	0x51AA,0x51AB,0x51B3,0x51B1,0x51B2,0x51B0,0x51B5,0x51BD,/* 0x70-0x77 */
+	0x51C5,0x51C9,0x51DB,0x51E0,0x8655,0x51E9,0x51ED,0x0000,/* 0x78-0x7F */
+
+	0x51F0,0x51F5,0x51FE,0x5204,0x520B,0x5214,0x520E,0x5227,/* 0x80-0x87 */
+	0x522A,0x522E,0x5233,0x5239,0x524F,0x5244,0x524B,0x524C,/* 0x88-0x8F */
+	0x525E,0x5254,0x526A,0x5274,0x5269,0x5273,0x527F,0x527D,/* 0x90-0x97 */
+	0x528D,0x5294,0x5292,0x5271,0x5288,0x5291,0x8FA8,0x8FA7,/* 0x98-0x9F */
+	0x52AC,0x52AD,0x52BC,0x52B5,0x52C1,0x52CD,0x52D7,0x52DE,/* 0xA0-0xA7 */
+	0x52E3,0x52E6,0x98ED,0x52E0,0x52F3,0x52F5,0x52F8,0x52F9,/* 0xA8-0xAF */
+	0x5306,0x5308,0x7538,0x530D,0x5310,0x530F,0x5315,0x531A,/* 0xB0-0xB7 */
+	0x5323,0x532F,0x5331,0x5333,0x5338,0x5340,0x5346,0x5345,/* 0xB8-0xBF */
+	0x4E17,0x5349,0x534D,0x51D6,0x535E,0x5369,0x536E,0x5918,/* 0xC0-0xC7 */
+	0x537B,0x5377,0x5382,0x5396,0x53A0,0x53A6,0x53A5,0x53AE,/* 0xC8-0xCF */
+	0x53B0,0x53B6,0x53C3,0x7C12,0x96D9,0x53DF,0x66FC,0x71EE,/* 0xD0-0xD7 */
+	0x53EE,0x53E8,0x53ED,0x53FA,0x5401,0x543D,0x5440,0x542C,/* 0xD8-0xDF */
+	0x542D,0x543C,0x542E,0x5436,0x5429,0x541D,0x544E,0x548F,/* 0xE0-0xE7 */
+	0x5475,0x548E,0x545F,0x5471,0x5477,0x5470,0x5492,0x547B,/* 0xE8-0xEF */
+	0x5480,0x5476,0x5484,0x5490,0x5486,0x54C7,0x54A2,0x54B8,/* 0xF0-0xF7 */
+	0x54A5,0x54AC,0x54C4,0x54C8,0x54A8,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9A[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x54AB,0x54C2,0x54A4,0x54BE,0x54BC,0x54D8,0x54E5,0x54E6,/* 0x40-0x47 */
+	0x550F,0x5514,0x54FD,0x54EE,0x54ED,0x54FA,0x54E2,0x5539,/* 0x48-0x4F */
+	0x5540,0x5563,0x554C,0x552E,0x555C,0x5545,0x5556,0x5557,/* 0x50-0x57 */
+	0x5538,0x5533,0x555D,0x5599,0x5580,0x54AF,0x558A,0x559F,/* 0x58-0x5F */
+	0x557B,0x557E,0x5598,0x559E,0x55AE,0x557C,0x5583,0x55A9,/* 0x60-0x67 */
+	0x5587,0x55A8,0x55DA,0x55C5,0x55DF,0x55C4,0x55DC,0x55E4,/* 0x68-0x6F */
+	0x55D4,0x5614,0x55F7,0x5616,0x55FE,0x55FD,0x561B,0x55F9,/* 0x70-0x77 */
+	0x564E,0x5650,0x71DF,0x5634,0x5636,0x5632,0x5638,0x0000,/* 0x78-0x7F */
+
+	0x566B,0x5664,0x562F,0x566C,0x566A,0x5686,0x5680,0x568A,/* 0x80-0x87 */
+	0x56A0,0x5694,0x568F,0x56A5,0x56AE,0x56B6,0x56B4,0x56C2,/* 0x88-0x8F */
+	0x56BC,0x56C1,0x56C3,0x56C0,0x56C8,0x56CE,0x56D1,0x56D3,/* 0x90-0x97 */
+	0x56D7,0x56EE,0x56F9,0x5700,0x56FF,0x5704,0x5709,0x5708,/* 0x98-0x9F */
+	0x570B,0x570D,0x5713,0x5718,0x5716,0x55C7,0x571C,0x5726,/* 0xA0-0xA7 */
+	0x5737,0x5738,0x574E,0x573B,0x5740,0x574F,0x5769,0x57C0,/* 0xA8-0xAF */
+	0x5788,0x5761,0x577F,0x5789,0x5793,0x57A0,0x57B3,0x57A4,/* 0xB0-0xB7 */
+	0x57AA,0x57B0,0x57C3,0x57C6,0x57D4,0x57D2,0x57D3,0x580A,/* 0xB8-0xBF */
+	0x57D6,0x57E3,0x580B,0x5819,0x581D,0x5872,0x5821,0x5862,/* 0xC0-0xC7 */
+	0x584B,0x5870,0x6BC0,0x5852,0x583D,0x5879,0x5885,0x58B9,/* 0xC8-0xCF */
+	0x589F,0x58AB,0x58BA,0x58DE,0x58BB,0x58B8,0x58AE,0x58C5,/* 0xD0-0xD7 */
+	0x58D3,0x58D1,0x58D7,0x58D9,0x58D8,0x58E5,0x58DC,0x58E4,/* 0xD8-0xDF */
+	0x58DF,0x58EF,0x58FA,0x58F9,0x58FB,0x58FC,0x58FD,0x5902,/* 0xE0-0xE7 */
+	0x590A,0x5910,0x591B,0x68A6,0x5925,0x592C,0x592D,0x5932,/* 0xE8-0xEF */
+	0x5938,0x593E,0x7AD2,0x5955,0x5950,0x594E,0x595A,0x5958,/* 0xF0-0xF7 */
+	0x5962,0x5960,0x5967,0x596C,0x5969,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9B[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5978,0x5981,0x599D,0x4F5E,0x4FAB,0x59A3,0x59B2,0x59C6,/* 0x40-0x47 */
+	0x59E8,0x59DC,0x598D,0x59D9,0x59DA,0x5A25,0x5A1F,0x5A11,/* 0x48-0x4F */
+	0x5A1C,0x5A09,0x5A1A,0x5A40,0x5A6C,0x5A49,0x5A35,0x5A36,/* 0x50-0x57 */
+	0x5A62,0x5A6A,0x5A9A,0x5ABC,0x5ABE,0x5ACB,0x5AC2,0x5ABD,/* 0x58-0x5F */
+	0x5AE3,0x5AD7,0x5AE6,0x5AE9,0x5AD6,0x5AFA,0x5AFB,0x5B0C,/* 0x60-0x67 */
+	0x5B0B,0x5B16,0x5B32,0x5AD0,0x5B2A,0x5B36,0x5B3E,0x5B43,/* 0x68-0x6F */
+	0x5B45,0x5B40,0x5B51,0x5B55,0x5B5A,0x5B5B,0x5B65,0x5B69,/* 0x70-0x77 */
+	0x5B70,0x5B73,0x5B75,0x5B78,0x6588,0x5B7A,0x5B80,0x0000,/* 0x78-0x7F */
+
+	0x5B83,0x5BA6,0x5BB8,0x5BC3,0x5BC7,0x5BC9,0x5BD4,0x5BD0,/* 0x80-0x87 */
+	0x5BE4,0x5BE6,0x5BE2,0x5BDE,0x5BE5,0x5BEB,0x5BF0,0x5BF6,/* 0x88-0x8F */
+	0x5BF3,0x5C05,0x5C07,0x5C08,0x5C0D,0x5C13,0x5C20,0x5C22,/* 0x90-0x97 */
+	0x5C28,0x5C38,0x5C39,0x5C41,0x5C46,0x5C4E,0x5C53,0x5C50,/* 0x98-0x9F */
+	0x5C4F,0x5B71,0x5C6C,0x5C6E,0x4E62,0x5C76,0x5C79,0x5C8C,/* 0xA0-0xA7 */
+	0x5C91,0x5C94,0x599B,0x5CAB,0x5CBB,0x5CB6,0x5CBC,0x5CB7,/* 0xA8-0xAF */
+	0x5CC5,0x5CBE,0x5CC7,0x5CD9,0x5CE9,0x5CFD,0x5CFA,0x5CED,/* 0xB0-0xB7 */
+	0x5D8C,0x5CEA,0x5D0B,0x5D15,0x5D17,0x5D5C,0x5D1F,0x5D1B,/* 0xB8-0xBF */
+	0x5D11,0x5D14,0x5D22,0x5D1A,0x5D19,0x5D18,0x5D4C,0x5D52,/* 0xC0-0xC7 */
+	0x5D4E,0x5D4B,0x5D6C,0x5D73,0x5D76,0x5D87,0x5D84,0x5D82,/* 0xC8-0xCF */
+	0x5DA2,0x5D9D,0x5DAC,0x5DAE,0x5DBD,0x5D90,0x5DB7,0x5DBC,/* 0xD0-0xD7 */
+	0x5DC9,0x5DCD,0x5DD3,0x5DD2,0x5DD6,0x5DDB,0x5DEB,0x5DF2,/* 0xD8-0xDF */
+	0x5DF5,0x5E0B,0x5E1A,0x5E19,0x5E11,0x5E1B,0x5E36,0x5E37,/* 0xE0-0xE7 */
+	0x5E44,0x5E43,0x5E40,0x5E4E,0x5E57,0x5E54,0x5E5F,0x5E62,/* 0xE8-0xEF */
+	0x5E64,0x5E47,0x5E75,0x5E76,0x5E7A,0x9EBC,0x5E7F,0x5EA0,/* 0xF0-0xF7 */
+	0x5EC1,0x5EC2,0x5EC8,0x5ED0,0x5ECF,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9C[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5ED6,0x5EE3,0x5EDD,0x5EDA,0x5EDB,0x5EE2,0x5EE1,0x5EE8,/* 0x40-0x47 */
+	0x5EE9,0x5EEC,0x5EF1,0x5EF3,0x5EF0,0x5EF4,0x5EF8,0x5EFE,/* 0x48-0x4F */
+	0x5F03,0x5F09,0x5F5D,0x5F5C,0x5F0B,0x5F11,0x5F16,0x5F29,/* 0x50-0x57 */
+	0x5F2D,0x5F38,0x5F41,0x5F48,0x5F4C,0x5F4E,0x5F2F,0x5F51,/* 0x58-0x5F */
+	0x5F56,0x5F57,0x5F59,0x5F61,0x5F6D,0x5F73,0x5F77,0x5F83,/* 0x60-0x67 */
+	0x5F82,0x5F7F,0x5F8A,0x5F88,0x5F91,0x5F87,0x5F9E,0x5F99,/* 0x68-0x6F */
+	0x5F98,0x5FA0,0x5FA8,0x5FAD,0x5FBC,0x5FD6,0x5FFB,0x5FE4,/* 0x70-0x77 */
+	0x5FF8,0x5FF1,0x5FDD,0x60B3,0x5FFF,0x6021,0x6060,0x0000,/* 0x78-0x7F */
+
+	0x6019,0x6010,0x6029,0x600E,0x6031,0x601B,0x6015,0x602B,/* 0x80-0x87 */
+	0x6026,0x600F,0x603A,0x605A,0x6041,0x606A,0x6077,0x605F,/* 0x88-0x8F */
+	0x604A,0x6046,0x604D,0x6063,0x6043,0x6064,0x6042,0x606C,/* 0x90-0x97 */
+	0x606B,0x6059,0x6081,0x608D,0x60E7,0x6083,0x609A,0x6084,/* 0x98-0x9F */
+	0x609B,0x6096,0x6097,0x6092,0x60A7,0x608B,0x60E1,0x60B8,/* 0xA0-0xA7 */
+	0x60E0,0x60D3,0x60B4,0x5FF0,0x60BD,0x60C6,0x60B5,0x60D8,/* 0xA8-0xAF */
+	0x614D,0x6115,0x6106,0x60F6,0x60F7,0x6100,0x60F4,0x60FA,/* 0xB0-0xB7 */
+	0x6103,0x6121,0x60FB,0x60F1,0x610D,0x610E,0x6147,0x613E,/* 0xB8-0xBF */
+	0x6128,0x6127,0x614A,0x613F,0x613C,0x612C,0x6134,0x613D,/* 0xC0-0xC7 */
+	0x6142,0x6144,0x6173,0x6177,0x6158,0x6159,0x615A,0x616B,/* 0xC8-0xCF */
+	0x6174,0x616F,0x6165,0x6171,0x615F,0x615D,0x6153,0x6175,/* 0xD0-0xD7 */
+	0x6199,0x6196,0x6187,0x61AC,0x6194,0x619A,0x618A,0x6191,/* 0xD8-0xDF */
+	0x61AB,0x61AE,0x61CC,0x61CA,0x61C9,0x61F7,0x61C8,0x61C3,/* 0xE0-0xE7 */
+	0x61C6,0x61BA,0x61CB,0x7F79,0x61CD,0x61E6,0x61E3,0x61F6,/* 0xE8-0xEF */
+	0x61FA,0x61F4,0x61FF,0x61FD,0x61FC,0x61FE,0x6200,0x6208,/* 0xF0-0xF7 */
+	0x6209,0x620D,0x620C,0x6214,0x621B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9D[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x621E,0x6221,0x622A,0x622E,0x6230,0x6232,0x6233,0x6241,/* 0x40-0x47 */
+	0x624E,0x625E,0x6263,0x625B,0x6260,0x6268,0x627C,0x6282,/* 0x48-0x4F */
+	0x6289,0x627E,0x6292,0x6293,0x6296,0x62D4,0x6283,0x6294,/* 0x50-0x57 */
+	0x62D7,0x62D1,0x62BB,0x62CF,0x62FF,0x62C6,0x64D4,0x62C8,/* 0x58-0x5F */
+	0x62DC,0x62CC,0x62CA,0x62C2,0x62C7,0x629B,0x62C9,0x630C,/* 0x60-0x67 */
+	0x62EE,0x62F1,0x6327,0x6302,0x6308,0x62EF,0x62F5,0x6350,/* 0x68-0x6F */
+	0x633E,0x634D,0x641C,0x634F,0x6396,0x638E,0x6380,0x63AB,/* 0x70-0x77 */
+	0x6376,0x63A3,0x638F,0x6389,0x639F,0x63B5,0x636B,0x0000,/* 0x78-0x7F */
+
+	0x6369,0x63BE,0x63E9,0x63C0,0x63C6,0x63E3,0x63C9,0x63D2,/* 0x80-0x87 */
+	0x63F6,0x63C4,0x6416,0x6434,0x6406,0x6413,0x6426,0x6436,/* 0x88-0x8F */
+	0x651D,0x6417,0x6428,0x640F,0x6467,0x646F,0x6476,0x644E,/* 0x90-0x97 */
+	0x652A,0x6495,0x6493,0x64A5,0x64A9,0x6488,0x64BC,0x64DA,/* 0x98-0x9F */
+	0x64D2,0x64C5,0x64C7,0x64BB,0x64D8,0x64C2,0x64F1,0x64E7,/* 0xA0-0xA7 */
+	0x8209,0x64E0,0x64E1,0x62AC,0x64E3,0x64EF,0x652C,0x64F6,/* 0xA8-0xAF */
+	0x64F4,0x64F2,0x64FA,0x6500,0x64FD,0x6518,0x651C,0x6505,/* 0xB0-0xB7 */
+	0x6524,0x6523,0x652B,0x6534,0x6535,0x6537,0x6536,0x6538,/* 0xB8-0xBF */
+	0x754B,0x6548,0x6556,0x6555,0x654D,0x6558,0x655E,0x655D,/* 0xC0-0xC7 */
+	0x6572,0x6578,0x6582,0x6583,0x8B8A,0x659B,0x659F,0x65AB,/* 0xC8-0xCF */
+	0x65B7,0x65C3,0x65C6,0x65C1,0x65C4,0x65CC,0x65D2,0x65DB,/* 0xD0-0xD7 */
+	0x65D9,0x65E0,0x65E1,0x65F1,0x6772,0x660A,0x6603,0x65FB,/* 0xD8-0xDF */
+	0x6773,0x6635,0x6636,0x6634,0x661C,0x664F,0x6644,0x6649,/* 0xE0-0xE7 */
+	0x6641,0x665E,0x665D,0x6664,0x6667,0x6668,0x665F,0x6662,/* 0xE8-0xEF */
+	0x6670,0x6683,0x6688,0x668E,0x6689,0x6684,0x6698,0x669D,/* 0xF0-0xF7 */
+	0x66C1,0x66B9,0x66C9,0x66BE,0x66BC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9E[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x66C4,0x66B8,0x66D6,0x66DA,0x66E0,0x663F,0x66E6,0x66E9,/* 0x40-0x47 */
+	0x66F0,0x66F5,0x66F7,0x670F,0x6716,0x671E,0x6726,0x6727,/* 0x48-0x4F */
+	0x9738,0x672E,0x673F,0x6736,0x6741,0x6738,0x6737,0x6746,/* 0x50-0x57 */
+	0x675E,0x6760,0x6759,0x6763,0x6764,0x6789,0x6770,0x67A9,/* 0x58-0x5F */
+	0x677C,0x676A,0x678C,0x678B,0x67A6,0x67A1,0x6785,0x67B7,/* 0x60-0x67 */
+	0x67EF,0x67B4,0x67EC,0x67B3,0x67E9,0x67B8,0x67E4,0x67DE,/* 0x68-0x6F */
+	0x67DD,0x67E2,0x67EE,0x67B9,0x67CE,0x67C6,0x67E7,0x6A9C,/* 0x70-0x77 */
+	0x681E,0x6846,0x6829,0x6840,0x684D,0x6832,0x684E,0x0000,/* 0x78-0x7F */
+
+	0x68B3,0x682B,0x6859,0x6863,0x6877,0x687F,0x689F,0x688F,/* 0x80-0x87 */
+	0x68AD,0x6894,0x689D,0x689B,0x6883,0x6AAE,0x68B9,0x6874,/* 0x88-0x8F */
+	0x68B5,0x68A0,0x68BA,0x690F,0x688D,0x687E,0x6901,0x68CA,/* 0x90-0x97 */
+	0x6908,0x68D8,0x6922,0x6926,0x68E1,0x690C,0x68CD,0x68D4,/* 0x98-0x9F */
+	0x68E7,0x68D5,0x6936,0x6912,0x6904,0x68D7,0x68E3,0x6925,/* 0xA0-0xA7 */
+	0x68F9,0x68E0,0x68EF,0x6928,0x692A,0x691A,0x6923,0x6921,/* 0xA8-0xAF */
+	0x68C6,0x6979,0x6977,0x695C,0x6978,0x696B,0x6954,0x697E,/* 0xB0-0xB7 */
+	0x696E,0x6939,0x6974,0x693D,0x6959,0x6930,0x6961,0x695E,/* 0xB8-0xBF */
+	0x695D,0x6981,0x696A,0x69B2,0x69AE,0x69D0,0x69BF,0x69C1,/* 0xC0-0xC7 */
+	0x69D3,0x69BE,0x69CE,0x5BE8,0x69CA,0x69DD,0x69BB,0x69C3,/* 0xC8-0xCF */
+	0x69A7,0x6A2E,0x6991,0x69A0,0x699C,0x6995,0x69B4,0x69DE,/* 0xD0-0xD7 */
+	0x69E8,0x6A02,0x6A1B,0x69FF,0x6B0A,0x69F9,0x69F2,0x69E7,/* 0xD8-0xDF */
+	0x6A05,0x69B1,0x6A1E,0x69ED,0x6A14,0x69EB,0x6A0A,0x6A12,/* 0xE0-0xE7 */
+	0x6AC1,0x6A23,0x6A13,0x6A44,0x6A0C,0x6A72,0x6A36,0x6A78,/* 0xE8-0xEF */
+	0x6A47,0x6A62,0x6A59,0x6A66,0x6A48,0x6A38,0x6A22,0x6A90,/* 0xF0-0xF7 */
+	0x6A8D,0x6AA0,0x6A84,0x6AA2,0x6AA3,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9F[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6A97,0x8617,0x6ABB,0x6AC3,0x6AC2,0x6AB8,0x6AB3,0x6AAC,/* 0x40-0x47 */
+	0x6ADE,0x6AD1,0x6ADF,0x6AAA,0x6ADA,0x6AEA,0x6AFB,0x6B05,/* 0x48-0x4F */
+	0x8616,0x6AFA,0x6B12,0x6B16,0x9B31,0x6B1F,0x6B38,0x6B37,/* 0x50-0x57 */
+	0x76DC,0x6B39,0x98EE,0x6B47,0x6B43,0x6B49,0x6B50,0x6B59,/* 0x58-0x5F */
+	0x6B54,0x6B5B,0x6B5F,0x6B61,0x6B78,0x6B79,0x6B7F,0x6B80,/* 0x60-0x67 */
+	0x6B84,0x6B83,0x6B8D,0x6B98,0x6B95,0x6B9E,0x6BA4,0x6BAA,/* 0x68-0x6F */
+	0x6BAB,0x6BAF,0x6BB2,0x6BB1,0x6BB3,0x6BB7,0x6BBC,0x6BC6,/* 0x70-0x77 */
+	0x6BCB,0x6BD3,0x6BDF,0x6BEC,0x6BEB,0x6BF3,0x6BEF,0x0000,/* 0x78-0x7F */
+
+	0x9EBE,0x6C08,0x6C13,0x6C14,0x6C1B,0x6C24,0x6C23,0x6C5E,/* 0x80-0x87 */
+	0x6C55,0x6C62,0x6C6A,0x6C82,0x6C8D,0x6C9A,0x6C81,0x6C9B,/* 0x88-0x8F */
+	0x6C7E,0x6C68,0x6C73,0x6C92,0x6C90,0x6CC4,0x6CF1,0x6CD3,/* 0x90-0x97 */
+	0x6CBD,0x6CD7,0x6CC5,0x6CDD,0x6CAE,0x6CB1,0x6CBE,0x6CBA,/* 0x98-0x9F */
+	0x6CDB,0x6CEF,0x6CD9,0x6CEA,0x6D1F,0x884D,0x6D36,0x6D2B,/* 0xA0-0xA7 */
+	0x6D3D,0x6D38,0x6D19,0x6D35,0x6D33,0x6D12,0x6D0C,0x6D63,/* 0xA8-0xAF */
+	0x6D93,0x6D64,0x6D5A,0x6D79,0x6D59,0x6D8E,0x6D95,0x6FE4,/* 0xB0-0xB7 */
+	0x6D85,0x6DF9,0x6E15,0x6E0A,0x6DB5,0x6DC7,0x6DE6,0x6DB8,/* 0xB8-0xBF */
+	0x6DC6,0x6DEC,0x6DDE,0x6DCC,0x6DE8,0x6DD2,0x6DC5,0x6DFA,/* 0xC0-0xC7 */
+	0x6DD9,0x6DE4,0x6DD5,0x6DEA,0x6DEE,0x6E2D,0x6E6E,0x6E2E,/* 0xC8-0xCF */
+	0x6E19,0x6E72,0x6E5F,0x6E3E,0x6E23,0x6E6B,0x6E2B,0x6E76,/* 0xD0-0xD7 */
+	0x6E4D,0x6E1F,0x6E43,0x6E3A,0x6E4E,0x6E24,0x6EFF,0x6E1D,/* 0xD8-0xDF */
+	0x6E38,0x6E82,0x6EAA,0x6E98,0x6EC9,0x6EB7,0x6ED3,0x6EBD,/* 0xE0-0xE7 */
+	0x6EAF,0x6EC4,0x6EB2,0x6ED4,0x6ED5,0x6E8F,0x6EA5,0x6EC2,/* 0xE8-0xEF */
+	0x6E9F,0x6F41,0x6F11,0x704C,0x6EEC,0x6EF8,0x6EFE,0x6F3F,/* 0xF0-0xF7 */
+	0x6EF2,0x6F31,0x6EEF,0x6F32,0x6ECC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6F3E,0x6F13,0x6EF7,0x6F86,0x6F7A,0x6F78,0x6F81,0x6F80,/* 0x40-0x47 */
+	0x6F6F,0x6F5B,0x6FF3,0x6F6D,0x6F82,0x6F7C,0x6F58,0x6F8E,/* 0x48-0x4F */
+	0x6F91,0x6FC2,0x6F66,0x6FB3,0x6FA3,0x6FA1,0x6FA4,0x6FB9,/* 0x50-0x57 */
+	0x6FC6,0x6FAA,0x6FDF,0x6FD5,0x6FEC,0x6FD4,0x6FD8,0x6FF1,/* 0x58-0x5F */
+	0x6FEE,0x6FDB,0x7009,0x700B,0x6FFA,0x7011,0x7001,0x700F,/* 0x60-0x67 */
+	0x6FFE,0x701B,0x701A,0x6F74,0x701D,0x7018,0x701F,0x7030,/* 0x68-0x6F */
+	0x703E,0x7032,0x7051,0x7063,0x7099,0x7092,0x70AF,0x70F1,/* 0x70-0x77 */
+	0x70AC,0x70B8,0x70B3,0x70AE,0x70DF,0x70CB,0x70DD,0x0000,/* 0x78-0x7F */
+
+	0x70D9,0x7109,0x70FD,0x711C,0x7119,0x7165,0x7155,0x7188,/* 0x80-0x87 */
+	0x7166,0x7162,0x714C,0x7156,0x716C,0x718F,0x71FB,0x7184,/* 0x88-0x8F */
+	0x7195,0x71A8,0x71AC,0x71D7,0x71B9,0x71BE,0x71D2,0x71C9,/* 0x90-0x97 */
+	0x71D4,0x71CE,0x71E0,0x71EC,0x71E7,0x71F5,0x71FC,0x71F9,/* 0x98-0x9F */
+	0x71FF,0x720D,0x7210,0x721B,0x7228,0x722D,0x722C,0x7230,/* 0xA0-0xA7 */
+	0x7232,0x723B,0x723C,0x723F,0x7240,0x7246,0x724B,0x7258,/* 0xA8-0xAF */
+	0x7274,0x727E,0x7282,0x7281,0x7287,0x7292,0x7296,0x72A2,/* 0xB0-0xB7 */
+	0x72A7,0x72B9,0x72B2,0x72C3,0x72C6,0x72C4,0x72CE,0x72D2,/* 0xB8-0xBF */
+	0x72E2,0x72E0,0x72E1,0x72F9,0x72F7,0x500F,0x7317,0x730A,/* 0xC0-0xC7 */
+	0x731C,0x7316,0x731D,0x7334,0x732F,0x7329,0x7325,0x733E,/* 0xC8-0xCF */
+	0x734E,0x734F,0x9ED8,0x7357,0x736A,0x7368,0x7370,0x7378,/* 0xD0-0xD7 */
+	0x7375,0x737B,0x737A,0x73C8,0x73B3,0x73CE,0x73BB,0x73C0,/* 0xD8-0xDF */
+	0x73E5,0x73EE,0x73DE,0x74A2,0x7405,0x746F,0x7425,0x73F8,/* 0xE0-0xE7 */
+	0x7432,0x743A,0x7455,0x743F,0x745F,0x7459,0x7441,0x745C,/* 0xE8-0xEF */
+	0x7469,0x7470,0x7463,0x746A,0x7476,0x747E,0x748B,0x749E,/* 0xF0-0xF7 */
+	0x74A7,0x74CA,0x74CF,0x74D4,0x73F1,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x74E0,0x74E3,0x74E7,0x74E9,0x74EE,0x74F2,0x74F0,0x74F1,/* 0x40-0x47 */
+	0x74F8,0x74F7,0x7504,0x7503,0x7505,0x750C,0x750E,0x750D,/* 0x48-0x4F */
+	0x7515,0x7513,0x751E,0x7526,0x752C,0x753C,0x7544,0x754D,/* 0x50-0x57 */
+	0x754A,0x7549,0x755B,0x7546,0x755A,0x7569,0x7564,0x7567,/* 0x58-0x5F */
+	0x756B,0x756D,0x7578,0x7576,0x7586,0x7587,0x7574,0x758A,/* 0x60-0x67 */
+	0x7589,0x7582,0x7594,0x759A,0x759D,0x75A5,0x75A3,0x75C2,/* 0x68-0x6F */
+	0x75B3,0x75C3,0x75B5,0x75BD,0x75B8,0x75BC,0x75B1,0x75CD,/* 0x70-0x77 */
+	0x75CA,0x75D2,0x75D9,0x75E3,0x75DE,0x75FE,0x75FF,0x0000,/* 0x78-0x7F */
+
+	0x75FC,0x7601,0x75F0,0x75FA,0x75F2,0x75F3,0x760B,0x760D,/* 0x80-0x87 */
+	0x7609,0x761F,0x7627,0x7620,0x7621,0x7622,0x7624,0x7634,/* 0x88-0x8F */
+	0x7630,0x763B,0x7647,0x7648,0x7646,0x765C,0x7658,0x7661,/* 0x90-0x97 */
+	0x7662,0x7668,0x7669,0x766A,0x7667,0x766C,0x7670,0x7672,/* 0x98-0x9F */
+	0x7676,0x7678,0x767C,0x7680,0x7683,0x7688,0x768B,0x768E,/* 0xA0-0xA7 */
+	0x7696,0x7693,0x7699,0x769A,0x76B0,0x76B4,0x76B8,0x76B9,/* 0xA8-0xAF */
+	0x76BA,0x76C2,0x76CD,0x76D6,0x76D2,0x76DE,0x76E1,0x76E5,/* 0xB0-0xB7 */
+	0x76E7,0x76EA,0x862F,0x76FB,0x7708,0x7707,0x7704,0x7729,/* 0xB8-0xBF */
+	0x7724,0x771E,0x7725,0x7726,0x771B,0x7737,0x7738,0x7747,/* 0xC0-0xC7 */
+	0x775A,0x7768,0x776B,0x775B,0x7765,0x777F,0x777E,0x7779,/* 0xC8-0xCF */
+	0x778E,0x778B,0x7791,0x77A0,0x779E,0x77B0,0x77B6,0x77B9,/* 0xD0-0xD7 */
+	0x77BF,0x77BC,0x77BD,0x77BB,0x77C7,0x77CD,0x77D7,0x77DA,/* 0xD8-0xDF */
+	0x77DC,0x77E3,0x77EE,0x77FC,0x780C,0x7812,0x7926,0x7820,/* 0xE0-0xE7 */
+	0x792A,0x7845,0x788E,0x7874,0x7886,0x787C,0x789A,0x788C,/* 0xE8-0xEF */
+	0x78A3,0x78B5,0x78AA,0x78AF,0x78D1,0x78C6,0x78CB,0x78D4,/* 0xF0-0xF7 */
+	0x78BE,0x78BC,0x78C5,0x78CA,0x78EC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x78E7,0x78DA,0x78FD,0x78F4,0x7907,0x7912,0x7911,0x7919,/* 0x40-0x47 */
+	0x792C,0x792B,0x7940,0x7960,0x7957,0x795F,0x795A,0x7955,/* 0x48-0x4F */
+	0x7953,0x797A,0x797F,0x798A,0x799D,0x79A7,0x9F4B,0x79AA,/* 0x50-0x57 */
+	0x79AE,0x79B3,0x79B9,0x79BA,0x79C9,0x79D5,0x79E7,0x79EC,/* 0x58-0x5F */
+	0x79E1,0x79E3,0x7A08,0x7A0D,0x7A18,0x7A19,0x7A20,0x7A1F,/* 0x60-0x67 */
+	0x7980,0x7A31,0x7A3B,0x7A3E,0x7A37,0x7A43,0x7A57,0x7A49,/* 0x68-0x6F */
+	0x7A61,0x7A62,0x7A69,0x9F9D,0x7A70,0x7A79,0x7A7D,0x7A88,/* 0x70-0x77 */
+	0x7A97,0x7A95,0x7A98,0x7A96,0x7AA9,0x7AC8,0x7AB0,0x0000,/* 0x78-0x7F */
+
+	0x7AB6,0x7AC5,0x7AC4,0x7ABF,0x9083,0x7AC7,0x7ACA,0x7ACD,/* 0x80-0x87 */
+	0x7ACF,0x7AD5,0x7AD3,0x7AD9,0x7ADA,0x7ADD,0x7AE1,0x7AE2,/* 0x88-0x8F */
+	0x7AE6,0x7AED,0x7AF0,0x7B02,0x7B0F,0x7B0A,0x7B06,0x7B33,/* 0x90-0x97 */
+	0x7B18,0x7B19,0x7B1E,0x7B35,0x7B28,0x7B36,0x7B50,0x7B7A,/* 0x98-0x9F */
+	0x7B04,0x7B4D,0x7B0B,0x7B4C,0x7B45,0x7B75,0x7B65,0x7B74,/* 0xA0-0xA7 */
+	0x7B67,0x7B70,0x7B71,0x7B6C,0x7B6E,0x7B9D,0x7B98,0x7B9F,/* 0xA8-0xAF */
+	0x7B8D,0x7B9C,0x7B9A,0x7B8B,0x7B92,0x7B8F,0x7B5D,0x7B99,/* 0xB0-0xB7 */
+	0x7BCB,0x7BC1,0x7BCC,0x7BCF,0x7BB4,0x7BC6,0x7BDD,0x7BE9,/* 0xB8-0xBF */
+	0x7C11,0x7C14,0x7BE6,0x7BE5,0x7C60,0x7C00,0x7C07,0x7C13,/* 0xC0-0xC7 */
+	0x7BF3,0x7BF7,0x7C17,0x7C0D,0x7BF6,0x7C23,0x7C27,0x7C2A,/* 0xC8-0xCF */
+	0x7C1F,0x7C37,0x7C2B,0x7C3D,0x7C4C,0x7C43,0x7C54,0x7C4F,/* 0xD0-0xD7 */
+	0x7C40,0x7C50,0x7C58,0x7C5F,0x7C64,0x7C56,0x7C65,0x7C6C,/* 0xD8-0xDF */
+	0x7C75,0x7C83,0x7C90,0x7CA4,0x7CAD,0x7CA2,0x7CAB,0x7CA1,/* 0xE0-0xE7 */
+	0x7CA8,0x7CB3,0x7CB2,0x7CB1,0x7CAE,0x7CB9,0x7CBD,0x7CC0,/* 0xE8-0xEF */
+	0x7CC5,0x7CC2,0x7CD8,0x7CD2,0x7CDC,0x7CE2,0x9B3B,0x7CEF,/* 0xF0-0xF7 */
+	0x7CF2,0x7CF4,0x7CF6,0x7CFA,0x7D06,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7D02,0x7D1C,0x7D15,0x7D0A,0x7D45,0x7D4B,0x7D2E,0x7D32,/* 0x40-0x47 */
+	0x7D3F,0x7D35,0x7D46,0x7D73,0x7D56,0x7D4E,0x7D72,0x7D68,/* 0x48-0x4F */
+	0x7D6E,0x7D4F,0x7D63,0x7D93,0x7D89,0x7D5B,0x7D8F,0x7D7D,/* 0x50-0x57 */
+	0x7D9B,0x7DBA,0x7DAE,0x7DA3,0x7DB5,0x7DC7,0x7DBD,0x7DAB,/* 0x58-0x5F */
+	0x7E3D,0x7DA2,0x7DAF,0x7DDC,0x7DB8,0x7D9F,0x7DB0,0x7DD8,/* 0x60-0x67 */
+	0x7DDD,0x7DE4,0x7DDE,0x7DFB,0x7DF2,0x7DE1,0x7E05,0x7E0A,/* 0x68-0x6F */
+	0x7E23,0x7E21,0x7E12,0x7E31,0x7E1F,0x7E09,0x7E0B,0x7E22,/* 0x70-0x77 */
+	0x7E46,0x7E66,0x7E3B,0x7E35,0x7E39,0x7E43,0x7E37,0x0000,/* 0x78-0x7F */
+
+	0x7E32,0x7E3A,0x7E67,0x7E5D,0x7E56,0x7E5E,0x7E59,0x7E5A,/* 0x80-0x87 */
+	0x7E79,0x7E6A,0x7E69,0x7E7C,0x7E7B,0x7E83,0x7DD5,0x7E7D,/* 0x88-0x8F */
+	0x8FAE,0x7E7F,0x7E88,0x7E89,0x7E8C,0x7E92,0x7E90,0x7E93,/* 0x90-0x97 */
+	0x7E94,0x7E96,0x7E8E,0x7E9B,0x7E9C,0x7F38,0x7F3A,0x7F45,/* 0x98-0x9F */
+	0x7F4C,0x7F4D,0x7F4E,0x7F50,0x7F51,0x7F55,0x7F54,0x7F58,/* 0xA0-0xA7 */
+	0x7F5F,0x7F60,0x7F68,0x7F69,0x7F67,0x7F78,0x7F82,0x7F86,/* 0xA8-0xAF */
+	0x7F83,0x7F88,0x7F87,0x7F8C,0x7F94,0x7F9E,0x7F9D,0x7F9A,/* 0xB0-0xB7 */
+	0x7FA3,0x7FAF,0x7FB2,0x7FB9,0x7FAE,0x7FB6,0x7FB8,0x8B71,/* 0xB8-0xBF */
+	0x7FC5,0x7FC6,0x7FCA,0x7FD5,0x7FD4,0x7FE1,0x7FE6,0x7FE9,/* 0xC0-0xC7 */
+	0x7FF3,0x7FF9,0x98DC,0x8006,0x8004,0x800B,0x8012,0x8018,/* 0xC8-0xCF */
+	0x8019,0x801C,0x8021,0x8028,0x803F,0x803B,0x804A,0x8046,/* 0xD0-0xD7 */
+	0x8052,0x8058,0x805A,0x805F,0x8062,0x8068,0x8073,0x8072,/* 0xD8-0xDF */
+	0x8070,0x8076,0x8079,0x807D,0x807F,0x8084,0x8086,0x8085,/* 0xE0-0xE7 */
+	0x809B,0x8093,0x809A,0x80AD,0x5190,0x80AC,0x80DB,0x80E5,/* 0xE8-0xEF */
+	0x80D9,0x80DD,0x80C4,0x80DA,0x80D6,0x8109,0x80EF,0x80F1,/* 0xF0-0xF7 */
+	0x811B,0x8129,0x8123,0x812F,0x814B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x968B,0x8146,0x813E,0x8153,0x8151,0x80FC,0x8171,0x816E,/* 0x40-0x47 */
+	0x8165,0x8166,0x8174,0x8183,0x8188,0x818A,0x8180,0x8182,/* 0x48-0x4F */
+	0x81A0,0x8195,0x81A4,0x81A3,0x815F,0x8193,0x81A9,0x81B0,/* 0x50-0x57 */
+	0x81B5,0x81BE,0x81B8,0x81BD,0x81C0,0x81C2,0x81BA,0x81C9,/* 0x58-0x5F */
+	0x81CD,0x81D1,0x81D9,0x81D8,0x81C8,0x81DA,0x81DF,0x81E0,/* 0x60-0x67 */
+	0x81E7,0x81FA,0x81FB,0x81FE,0x8201,0x8202,0x8205,0x8207,/* 0x68-0x6F */
+	0x820A,0x820D,0x8210,0x8216,0x8229,0x822B,0x8238,0x8233,/* 0x70-0x77 */
+	0x8240,0x8259,0x8258,0x825D,0x825A,0x825F,0x8264,0x0000,/* 0x78-0x7F */
+
+	0x8262,0x8268,0x826A,0x826B,0x822E,0x8271,0x8277,0x8278,/* 0x80-0x87 */
+	0x827E,0x828D,0x8292,0x82AB,0x829F,0x82BB,0x82AC,0x82E1,/* 0x88-0x8F */
+	0x82E3,0x82DF,0x82D2,0x82F4,0x82F3,0x82FA,0x8393,0x8303,/* 0x90-0x97 */
+	0x82FB,0x82F9,0x82DE,0x8306,0x82DC,0x8309,0x82D9,0x8335,/* 0x98-0x9F */
+	0x8334,0x8316,0x8332,0x8331,0x8340,0x8339,0x8350,0x8345,/* 0xA0-0xA7 */
+	0x832F,0x832B,0x8317,0x8318,0x8385,0x839A,0x83AA,0x839F,/* 0xA8-0xAF */
+	0x83A2,0x8396,0x8323,0x838E,0x8387,0x838A,0x837C,0x83B5,/* 0xB0-0xB7 */
+	0x8373,0x8375,0x83A0,0x8389,0x83A8,0x83F4,0x8413,0x83EB,/* 0xB8-0xBF */
+	0x83CE,0x83FD,0x8403,0x83D8,0x840B,0x83C1,0x83F7,0x8407,/* 0xC0-0xC7 */
+	0x83E0,0x83F2,0x840D,0x8422,0x8420,0x83BD,0x8438,0x8506,/* 0xC8-0xCF */
+	0x83FB,0x846D,0x842A,0x843C,0x855A,0x8484,0x8477,0x846B,/* 0xD0-0xD7 */
+	0x84AD,0x846E,0x8482,0x8469,0x8446,0x842C,0x846F,0x8479,/* 0xD8-0xDF */
+	0x8435,0x84CA,0x8462,0x84B9,0x84BF,0x849F,0x84D9,0x84CD,/* 0xE0-0xE7 */
+	0x84BB,0x84DA,0x84D0,0x84C1,0x84C6,0x84D6,0x84A1,0x8521,/* 0xE8-0xEF */
+	0x84FF,0x84F4,0x8517,0x8518,0x852C,0x851F,0x8515,0x8514,/* 0xF0-0xF7 */
+	0x84FC,0x8540,0x8563,0x8558,0x8548,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8541,0x8602,0x854B,0x8555,0x8580,0x85A4,0x8588,0x8591,/* 0x40-0x47 */
+	0x858A,0x85A8,0x856D,0x8594,0x859B,0x85EA,0x8587,0x859C,/* 0x48-0x4F */
+	0x8577,0x857E,0x8590,0x85C9,0x85BA,0x85CF,0x85B9,0x85D0,/* 0x50-0x57 */
+	0x85D5,0x85DD,0x85E5,0x85DC,0x85F9,0x860A,0x8613,0x860B,/* 0x58-0x5F */
+	0x85FE,0x85FA,0x8606,0x8622,0x861A,0x8630,0x863F,0x864D,/* 0x60-0x67 */
+	0x4E55,0x8654,0x865F,0x8667,0x8671,0x8693,0x86A3,0x86A9,/* 0x68-0x6F */
+	0x86AA,0x868B,0x868C,0x86B6,0x86AF,0x86C4,0x86C6,0x86B0,/* 0x70-0x77 */
+	0x86C9,0x8823,0x86AB,0x86D4,0x86DE,0x86E9,0x86EC,0x0000,/* 0x78-0x7F */
+
+	0x86DF,0x86DB,0x86EF,0x8712,0x8706,0x8708,0x8700,0x8703,/* 0x80-0x87 */
+	0x86FB,0x8711,0x8709,0x870D,0x86F9,0x870A,0x8734,0x873F,/* 0x88-0x8F */
+	0x8737,0x873B,0x8725,0x8729,0x871A,0x8760,0x875F,0x8778,/* 0x90-0x97 */
+	0x874C,0x874E,0x8774,0x8757,0x8768,0x876E,0x8759,0x8753,/* 0x98-0x9F */
+	0x8763,0x876A,0x8805,0x87A2,0x879F,0x8782,0x87AF,0x87CB,/* 0xA0-0xA7 */
+	0x87BD,0x87C0,0x87D0,0x96D6,0x87AB,0x87C4,0x87B3,0x87C7,/* 0xA8-0xAF */
+	0x87C6,0x87BB,0x87EF,0x87F2,0x87E0,0x880F,0x880D,0x87FE,/* 0xB0-0xB7 */
+	0x87F6,0x87F7,0x880E,0x87D2,0x8811,0x8816,0x8815,0x8822,/* 0xB8-0xBF */
+	0x8821,0x8831,0x8836,0x8839,0x8827,0x883B,0x8844,0x8842,/* 0xC0-0xC7 */
+	0x8852,0x8859,0x885E,0x8862,0x886B,0x8881,0x887E,0x889E,/* 0xC8-0xCF */
+	0x8875,0x887D,0x88B5,0x8872,0x8882,0x8897,0x8892,0x88AE,/* 0xD0-0xD7 */
+	0x8899,0x88A2,0x888D,0x88A4,0x88B0,0x88BF,0x88B1,0x88C3,/* 0xD8-0xDF */
+	0x88C4,0x88D4,0x88D8,0x88D9,0x88DD,0x88F9,0x8902,0x88FC,/* 0xE0-0xE7 */
+	0x88F4,0x88E8,0x88F2,0x8904,0x890C,0x890A,0x8913,0x8943,/* 0xE8-0xEF */
+	0x891E,0x8925,0x892A,0x892B,0x8941,0x8944,0x893B,0x8936,/* 0xF0-0xF7 */
+	0x8938,0x894C,0x891D,0x8960,0x895E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8966,0x8964,0x896D,0x896A,0x896F,0x8974,0x8977,0x897E,/* 0x40-0x47 */
+	0x8983,0x8988,0x898A,0x8993,0x8998,0x89A1,0x89A9,0x89A6,/* 0x48-0x4F */
+	0x89AC,0x89AF,0x89B2,0x89BA,0x89BD,0x89BF,0x89C0,0x89DA,/* 0x50-0x57 */
+	0x89DC,0x89DD,0x89E7,0x89F4,0x89F8,0x8A03,0x8A16,0x8A10,/* 0x58-0x5F */
+	0x8A0C,0x8A1B,0x8A1D,0x8A25,0x8A36,0x8A41,0x8A5B,0x8A52,/* 0x60-0x67 */
+	0x8A46,0x8A48,0x8A7C,0x8A6D,0x8A6C,0x8A62,0x8A85,0x8A82,/* 0x68-0x6F */
+	0x8A84,0x8AA8,0x8AA1,0x8A91,0x8AA5,0x8AA6,0x8A9A,0x8AA3,/* 0x70-0x77 */
+	0x8AC4,0x8ACD,0x8AC2,0x8ADA,0x8AEB,0x8AF3,0x8AE7,0x0000,/* 0x78-0x7F */
+
+	0x8AE4,0x8AF1,0x8B14,0x8AE0,0x8AE2,0x8AF7,0x8ADE,0x8ADB,/* 0x80-0x87 */
+	0x8B0C,0x8B07,0x8B1A,0x8AE1,0x8B16,0x8B10,0x8B17,0x8B20,/* 0x88-0x8F */
+	0x8B33,0x97AB,0x8B26,0x8B2B,0x8B3E,0x8B28,0x8B41,0x8B4C,/* 0x90-0x97 */
+	0x8B4F,0x8B4E,0x8B49,0x8B56,0x8B5B,0x8B5A,0x8B6B,0x8B5F,/* 0x98-0x9F */
+	0x8B6C,0x8B6F,0x8B74,0x8B7D,0x8B80,0x8B8C,0x8B8E,0x8B92,/* 0xA0-0xA7 */
+	0x8B93,0x8B96,0x8B99,0x8B9A,0x8C3A,0x8C41,0x8C3F,0x8C48,/* 0xA8-0xAF */
+	0x8C4C,0x8C4E,0x8C50,0x8C55,0x8C62,0x8C6C,0x8C78,0x8C7A,/* 0xB0-0xB7 */
+	0x8C82,0x8C89,0x8C85,0x8C8A,0x8C8D,0x8C8E,0x8C94,0x8C7C,/* 0xB8-0xBF */
+	0x8C98,0x621D,0x8CAD,0x8CAA,0x8CBD,0x8CB2,0x8CB3,0x8CAE,/* 0xC0-0xC7 */
+	0x8CB6,0x8CC8,0x8CC1,0x8CE4,0x8CE3,0x8CDA,0x8CFD,0x8CFA,/* 0xC8-0xCF */
+	0x8CFB,0x8D04,0x8D05,0x8D0A,0x8D07,0x8D0F,0x8D0D,0x8D10,/* 0xD0-0xD7 */
+	0x9F4E,0x8D13,0x8CCD,0x8D14,0x8D16,0x8D67,0x8D6D,0x8D71,/* 0xD8-0xDF */
+	0x8D73,0x8D81,0x8D99,0x8DC2,0x8DBE,0x8DBA,0x8DCF,0x8DDA,/* 0xE0-0xE7 */
+	0x8DD6,0x8DCC,0x8DDB,0x8DCB,0x8DEA,0x8DEB,0x8DDF,0x8DE3,/* 0xE8-0xEF */
+	0x8DFC,0x8E08,0x8E09,0x8DFF,0x8E1D,0x8E1E,0x8E10,0x8E1F,/* 0xF0-0xF7 */
+	0x8E42,0x8E35,0x8E30,0x8E34,0x8E4A,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8E47,0x8E49,0x8E4C,0x8E50,0x8E48,0x8E59,0x8E64,0x8E60,/* 0x40-0x47 */
+	0x8E2A,0x8E63,0x8E55,0x8E76,0x8E72,0x8E7C,0x8E81,0x8E87,/* 0x48-0x4F */
+	0x8E85,0x8E84,0x8E8B,0x8E8A,0x8E93,0x8E91,0x8E94,0x8E99,/* 0x50-0x57 */
+	0x8EAA,0x8EA1,0x8EAC,0x8EB0,0x8EC6,0x8EB1,0x8EBE,0x8EC5,/* 0x58-0x5F */
+	0x8EC8,0x8ECB,0x8EDB,0x8EE3,0x8EFC,0x8EFB,0x8EEB,0x8EFE,/* 0x60-0x67 */
+	0x8F0A,0x8F05,0x8F15,0x8F12,0x8F19,0x8F13,0x8F1C,0x8F1F,/* 0x68-0x6F */
+	0x8F1B,0x8F0C,0x8F26,0x8F33,0x8F3B,0x8F39,0x8F45,0x8F42,/* 0x70-0x77 */
+	0x8F3E,0x8F4C,0x8F49,0x8F46,0x8F4E,0x8F57,0x8F5C,0x0000,/* 0x78-0x7F */
+
+	0x8F62,0x8F63,0x8F64,0x8F9C,0x8F9F,0x8FA3,0x8FAD,0x8FAF,/* 0x80-0x87 */
+	0x8FB7,0x8FDA,0x8FE5,0x8FE2,0x8FEA,0x8FEF,0x9087,0x8FF4,/* 0x88-0x8F */
+	0x9005,0x8FF9,0x8FFA,0x9011,0x9015,0x9021,0x900D,0x901E,/* 0x90-0x97 */
+	0x9016,0x900B,0x9027,0x9036,0x9035,0x9039,0x8FF8,0x904F,/* 0x98-0x9F */
+	0x9050,0x9051,0x9052,0x900E,0x9049,0x903E,0x9056,0x9058,/* 0xA0-0xA7 */
+	0x905E,0x9068,0x906F,0x9076,0x96A8,0x9072,0x9082,0x907D,/* 0xA8-0xAF */
+	0x9081,0x9080,0x908A,0x9089,0x908F,0x90A8,0x90AF,0x90B1,/* 0xB0-0xB7 */
+	0x90B5,0x90E2,0x90E4,0x6248,0x90DB,0x9102,0x9112,0x9119,/* 0xB8-0xBF */
+	0x9132,0x9130,0x914A,0x9156,0x9158,0x9163,0x9165,0x9169,/* 0xC0-0xC7 */
+	0x9173,0x9172,0x918B,0x9189,0x9182,0x91A2,0x91AB,0x91AF,/* 0xC8-0xCF */
+	0x91AA,0x91B5,0x91B4,0x91BA,0x91C0,0x91C1,0x91C9,0x91CB,/* 0xD0-0xD7 */
+	0x91D0,0x91D6,0x91DF,0x91E1,0x91DB,0x91FC,0x91F5,0x91F6,/* 0xD8-0xDF */
+	0x921E,0x91FF,0x9214,0x922C,0x9215,0x9211,0x925E,0x9257,/* 0xE0-0xE7 */
+	0x9245,0x9249,0x9264,0x9248,0x9295,0x923F,0x924B,0x9250,/* 0xE8-0xEF */
+	0x929C,0x9296,0x9293,0x929B,0x925A,0x92CF,0x92B9,0x92B7,/* 0xF0-0xF7 */
+	0x92E9,0x930F,0x92FA,0x9344,0x932E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9319,0x9322,0x931A,0x9323,0x933A,0x9335,0x933B,0x935C,/* 0x40-0x47 */
+	0x9360,0x937C,0x936E,0x9356,0x93B0,0x93AC,0x93AD,0x9394,/* 0x48-0x4F */
+	0x93B9,0x93D6,0x93D7,0x93E8,0x93E5,0x93D8,0x93C3,0x93DD,/* 0x50-0x57 */
+	0x93D0,0x93C8,0x93E4,0x941A,0x9414,0x9413,0x9403,0x9407,/* 0x58-0x5F */
+	0x9410,0x9436,0x942B,0x9435,0x9421,0x943A,0x9441,0x9452,/* 0x60-0x67 */
+	0x9444,0x945B,0x9460,0x9462,0x945E,0x946A,0x9229,0x9470,/* 0x68-0x6F */
+	0x9475,0x9477,0x947D,0x945A,0x947C,0x947E,0x9481,0x947F,/* 0x70-0x77 */
+	0x9582,0x9587,0x958A,0x9594,0x9596,0x9598,0x9599,0x0000,/* 0x78-0x7F */
+
+	0x95A0,0x95A8,0x95A7,0x95AD,0x95BC,0x95BB,0x95B9,0x95BE,/* 0x80-0x87 */
+	0x95CA,0x6FF6,0x95C3,0x95CD,0x95CC,0x95D5,0x95D4,0x95D6,/* 0x88-0x8F */
+	0x95DC,0x95E1,0x95E5,0x95E2,0x9621,0x9628,0x962E,0x962F,/* 0x90-0x97 */
+	0x9642,0x964C,0x964F,0x964B,0x9677,0x965C,0x965E,0x965D,/* 0x98-0x9F */
+	0x965F,0x9666,0x9672,0x966C,0x968D,0x9698,0x9695,0x9697,/* 0xA0-0xA7 */
+	0x96AA,0x96A7,0x96B1,0x96B2,0x96B0,0x96B4,0x96B6,0x96B8,/* 0xA8-0xAF */
+	0x96B9,0x96CE,0x96CB,0x96C9,0x96CD,0x894D,0x96DC,0x970D,/* 0xB0-0xB7 */
+	0x96D5,0x96F9,0x9704,0x9706,0x9708,0x9713,0x970E,0x9711,/* 0xB8-0xBF */
+	0x970F,0x9716,0x9719,0x9724,0x972A,0x9730,0x9739,0x973D,/* 0xC0-0xC7 */
+	0x973E,0x9744,0x9746,0x9748,0x9742,0x9749,0x975C,0x9760,/* 0xC8-0xCF */
+	0x9764,0x9766,0x9768,0x52D2,0x976B,0x9771,0x9779,0x9785,/* 0xD0-0xD7 */
+	0x977C,0x9781,0x977A,0x9786,0x978B,0x978F,0x9790,0x979C,/* 0xD8-0xDF */
+	0x97A8,0x97A6,0x97A3,0x97B3,0x97B4,0x97C3,0x97C6,0x97C8,/* 0xE0-0xE7 */
+	0x97CB,0x97DC,0x97ED,0x9F4F,0x97F2,0x7ADF,0x97F6,0x97F5,/* 0xE8-0xEF */
+	0x980F,0x980C,0x9838,0x9824,0x9821,0x9837,0x983D,0x9846,/* 0xF0-0xF7 */
+	0x984F,0x984B,0x986B,0x986F,0x9870,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9871,0x9874,0x9873,0x98AA,0x98AF,0x98B1,0x98B6,0x98C4,/* 0x40-0x47 */
+	0x98C3,0x98C6,0x98E9,0x98EB,0x9903,0x9909,0x9912,0x9914,/* 0x48-0x4F */
+	0x9918,0x9921,0x991D,0x991E,0x9924,0x9920,0x992C,0x992E,/* 0x50-0x57 */
+	0x993D,0x993E,0x9942,0x9949,0x9945,0x9950,0x994B,0x9951,/* 0x58-0x5F */
+	0x9952,0x994C,0x9955,0x9997,0x9998,0x99A5,0x99AD,0x99AE,/* 0x60-0x67 */
+	0x99BC,0x99DF,0x99DB,0x99DD,0x99D8,0x99D1,0x99ED,0x99EE,/* 0x68-0x6F */
+	0x99F1,0x99F2,0x99FB,0x99F8,0x9A01,0x9A0F,0x9A05,0x99E2,/* 0x70-0x77 */
+	0x9A19,0x9A2B,0x9A37,0x9A45,0x9A42,0x9A40,0x9A43,0x0000,/* 0x78-0x7F */
+
+	0x9A3E,0x9A55,0x9A4D,0x9A5B,0x9A57,0x9A5F,0x9A62,0x9A65,/* 0x80-0x87 */
+	0x9A64,0x9A69,0x9A6B,0x9A6A,0x9AAD,0x9AB0,0x9ABC,0x9AC0,/* 0x88-0x8F */
+	0x9ACF,0x9AD1,0x9AD3,0x9AD4,0x9ADE,0x9ADF,0x9AE2,0x9AE3,/* 0x90-0x97 */
+	0x9AE6,0x9AEF,0x9AEB,0x9AEE,0x9AF4,0x9AF1,0x9AF7,0x9AFB,/* 0x98-0x9F */
+	0x9B06,0x9B18,0x9B1A,0x9B1F,0x9B22,0x9B23,0x9B25,0x9B27,/* 0xA0-0xA7 */
+	0x9B28,0x9B29,0x9B2A,0x9B2E,0x9B2F,0x9B32,0x9B44,0x9B43,/* 0xA8-0xAF */
+	0x9B4F,0x9B4D,0x9B4E,0x9B51,0x9B58,0x9B74,0x9B93,0x9B83,/* 0xB0-0xB7 */
+	0x9B91,0x9B96,0x9B97,0x9B9F,0x9BA0,0x9BA8,0x9BB4,0x9BC0,/* 0xB8-0xBF */
+	0x9BCA,0x9BB9,0x9BC6,0x9BCF,0x9BD1,0x9BD2,0x9BE3,0x9BE2,/* 0xC0-0xC7 */
+	0x9BE4,0x9BD4,0x9BE1,0x9C3A,0x9BF2,0x9BF1,0x9BF0,0x9C15,/* 0xC8-0xCF */
+	0x9C14,0x9C09,0x9C13,0x9C0C,0x9C06,0x9C08,0x9C12,0x9C0A,/* 0xD0-0xD7 */
+	0x9C04,0x9C2E,0x9C1B,0x9C25,0x9C24,0x9C21,0x9C30,0x9C47,/* 0xD8-0xDF */
+	0x9C32,0x9C46,0x9C3E,0x9C5A,0x9C60,0x9C67,0x9C76,0x9C78,/* 0xE0-0xE7 */
+	0x9CE7,0x9CEC,0x9CF0,0x9D09,0x9D08,0x9CEB,0x9D03,0x9D06,/* 0xE8-0xEF */
+	0x9D2A,0x9D26,0x9DAF,0x9D23,0x9D1F,0x9D44,0x9D15,0x9D12,/* 0xF0-0xF7 */
+	0x9D41,0x9D3F,0x9D3E,0x9D46,0x9D48,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9D5D,0x9D5E,0x9D64,0x9D51,0x9D50,0x9D59,0x9D72,0x9D89,/* 0x40-0x47 */
+	0x9D87,0x9DAB,0x9D6F,0x9D7A,0x9D9A,0x9DA4,0x9DA9,0x9DB2,/* 0x48-0x4F */
+	0x9DC4,0x9DC1,0x9DBB,0x9DB8,0x9DBA,0x9DC6,0x9DCF,0x9DC2,/* 0x50-0x57 */
+	0x9DD9,0x9DD3,0x9DF8,0x9DE6,0x9DED,0x9DEF,0x9DFD,0x9E1A,/* 0x58-0x5F */
+	0x9E1B,0x9E1E,0x9E75,0x9E79,0x9E7D,0x9E81,0x9E88,0x9E8B,/* 0x60-0x67 */
+	0x9E8C,0x9E92,0x9E95,0x9E91,0x9E9D,0x9EA5,0x9EA9,0x9EB8,/* 0x68-0x6F */
+	0x9EAA,0x9EAD,0x9761,0x9ECC,0x9ECE,0x9ECF,0x9ED0,0x9ED4,/* 0x70-0x77 */
+	0x9EDC,0x9EDE,0x9EDD,0x9EE0,0x9EE5,0x9EE8,0x9EEF,0x0000,/* 0x78-0x7F */
+
+	0x9EF4,0x9EF6,0x9EF7,0x9EF9,0x9EFB,0x9EFC,0x9EFD,0x9F07,/* 0x80-0x87 */
+	0x9F08,0x76B7,0x9F15,0x9F21,0x9F2C,0x9F3E,0x9F4A,0x9F52,/* 0x88-0x8F */
+	0x9F54,0x9F63,0x9F5F,0x9F60,0x9F61,0x9F66,0x9F67,0x9F6C,/* 0x90-0x97 */
+	0x9F6A,0x9F77,0x9F72,0x9F76,0x9F95,0x9F9C,0x9FA0,0x582F,/* 0x98-0x9F */
+	0x69C7,0x9059,0x7464,0x51DC,0x7199,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_ED[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7E8A,0x891C,0x9348,0x9288,0x84DC,0x4FC9,0x70BB,0x6631,/* 0x40-0x47 */
+	0x68C8,0x92F9,0x66FB,0x5F45,0x4E28,0x4EE1,0x4EFC,0x4F00,/* 0x48-0x4F */
+	0x4F03,0x4F39,0x4F56,0x4F92,0x4F8A,0x4F9A,0x4F94,0x4FCD,/* 0x50-0x57 */
+	0x5040,0x5022,0x4FFF,0x501E,0x5046,0x5070,0x5042,0x5094,/* 0x58-0x5F */
+	0x50F4,0x50D8,0x514A,0x5164,0x519D,0x51BE,0x51EC,0x5215,/* 0x60-0x67 */
+	0x529C,0x52A6,0x52C0,0x52DB,0x5300,0x5307,0x5324,0x5372,/* 0x68-0x6F */
+	0x5393,0x53B2,0x53DD,0xFA0E,0x549C,0x548A,0x54A9,0x54FF,/* 0x70-0x77 */
+	0x5586,0x5759,0x5765,0x57AC,0x57C8,0x57C7,0xFA0F,0x0000,/* 0x78-0x7F */
+
+	0xFA10,0x589E,0x58B2,0x590B,0x5953,0x595B,0x595D,0x5963,/* 0x80-0x87 */
+	0x59A4,0x59BA,0x5B56,0x5BC0,0x752F,0x5BD8,0x5BEC,0x5C1E,/* 0x88-0x8F */
+	0x5CA6,0x5CBA,0x5CF5,0x5D27,0x5D53,0xFA11,0x5D42,0x5D6D,/* 0x90-0x97 */
+	0x5DB8,0x5DB9,0x5DD0,0x5F21,0x5F34,0x5F67,0x5FB7,0x5FDE,/* 0x98-0x9F */
+	0x605D,0x6085,0x608A,0x60DE,0x60D5,0x6120,0x60F2,0x6111,/* 0xA0-0xA7 */
+	0x6137,0x6130,0x6198,0x6213,0x62A6,0x63F5,0x6460,0x649D,/* 0xA8-0xAF */
+	0x64CE,0x654E,0x6600,0x6615,0x663B,0x6609,0x662E,0x661E,/* 0xB0-0xB7 */
+	0x6624,0x6665,0x6657,0x6659,0xFA12,0x6673,0x6699,0x66A0,/* 0xB8-0xBF */
+	0x66B2,0x66BF,0x66FA,0x670E,0xF929,0x6766,0x67BB,0x6852,/* 0xC0-0xC7 */
+	0x67C0,0x6801,0x6844,0x68CF,0xFA13,0x6968,0xFA14,0x6998,/* 0xC8-0xCF */
+	0x69E2,0x6A30,0x6A6B,0x6A46,0x6A73,0x6A7E,0x6AE2,0x6AE4,/* 0xD0-0xD7 */
+	0x6BD6,0x6C3F,0x6C5C,0x6C86,0x6C6F,0x6CDA,0x6D04,0x6D87,/* 0xD8-0xDF */
+	0x6D6F,0x6D96,0x6DAC,0x6DCF,0x6DF8,0x6DF2,0x6DFC,0x6E39,/* 0xE0-0xE7 */
+	0x6E5C,0x6E27,0x6E3C,0x6EBF,0x6F88,0x6FB5,0x6FF5,0x7005,/* 0xE8-0xEF */
+	0x7007,0x7028,0x7085,0x70AB,0x710F,0x7104,0x715C,0x7146,/* 0xF0-0xF7 */
+	0x7147,0xFA15,0x71C1,0x71FE,0x72B1,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x72BE,0x7324,0xFA16,0x7377,0x73BD,0x73C9,0x73D6,0x73E3,/* 0x40-0x47 */
+	0x73D2,0x7407,0x73F5,0x7426,0x742A,0x7429,0x742E,0x7462,/* 0x48-0x4F */
+	0x7489,0x749F,0x7501,0x756F,0x7682,0x769C,0x769E,0x769B,/* 0x50-0x57 */
+	0x76A6,0xFA17,0x7746,0x52AF,0x7821,0x784E,0x7864,0x787A,/* 0x58-0x5F */
+	0x7930,0xFA18,0xFA19,0xFA1A,0x7994,0xFA1B,0x799B,0x7AD1,/* 0x60-0x67 */
+	0x7AE7,0xFA1C,0x7AEB,0x7B9E,0xFA1D,0x7D48,0x7D5C,0x7DB7,/* 0x68-0x6F */
+	0x7DA0,0x7DD6,0x7E52,0x7F47,0x7FA1,0xFA1E,0x8301,0x8362,/* 0x70-0x77 */
+	0x837F,0x83C7,0x83F6,0x8448,0x84B4,0x8553,0x8559,0x0000,/* 0x78-0x7F */
+
+	0x856B,0xFA1F,0x85B0,0xFA20,0xFA21,0x8807,0x88F5,0x8A12,/* 0x80-0x87 */
+	0x8A37,0x8A79,0x8AA7,0x8ABE,0x8ADF,0xFA22,0x8AF6,0x8B53,/* 0x88-0x8F */
+	0x8B7F,0x8CF0,0x8CF4,0x8D12,0x8D76,0xFA23,0x8ECF,0xFA24,/* 0x90-0x97 */
+	0xFA25,0x9067,0x90DE,0xFA26,0x9115,0x9127,0x91DA,0x91D7,/* 0x98-0x9F */
+	0x91DE,0x91ED,0x91EE,0x91E4,0x91E5,0x9206,0x9210,0x920A,/* 0xA0-0xA7 */
+	0x923A,0x9240,0x923C,0x924E,0x9259,0x9251,0x9239,0x9267,/* 0xA8-0xAF */
+	0x92A7,0x9277,0x9278,0x92E7,0x92D7,0x92D9,0x92D0,0xFA27,/* 0xB0-0xB7 */
+	0x92D5,0x92E0,0x92D3,0x9325,0x9321,0x92FB,0xFA28,0x931E,/* 0xB8-0xBF */
+	0x92FF,0x931D,0x9302,0x9370,0x9357,0x93A4,0x93C6,0x93DE,/* 0xC0-0xC7 */
+	0x93F8,0x9431,0x9445,0x9448,0x9592,0xF9DC,0xFA29,0x969D,/* 0xC8-0xCF */
+	0x96AF,0x9733,0x973B,0x9743,0x974D,0x974F,0x9751,0x9755,/* 0xD0-0xD7 */
+	0x9857,0x9865,0xFA2A,0xFA2B,0x9927,0xFA2C,0x999E,0x9A4E,/* 0xD8-0xDF */
+	0x9AD9,0x9ADC,0x9B75,0x9B72,0x9B8F,0x9BB1,0x9BBB,0x9C00,/* 0xE0-0xE7 */
+	0x9D70,0x9D6B,0xFA2D,0x9E19,0x9ED1,0x0000,0x0000,0x2170,/* 0xE8-0xEF */
+	0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,0x2177,0x2178,/* 0xF0-0xF7 */
+	0x2179,0xFFE2,0xFFE4,0xFF07,0xFF02,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_FA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,0x2177,/* 0x40-0x47 */
+	0x2178,0x2179,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,/* 0x48-0x4F */
+	0x2166,0x2167,0x2168,0x2169,0xFFE2,0xFFE4,0xFF07,0xFF02,/* 0x50-0x57 */
+	0x3231,0x2116,0x2121,0x2235,0x7E8A,0x891C,0x9348,0x9288,/* 0x58-0x5F */
+	0x84DC,0x4FC9,0x70BB,0x6631,0x68C8,0x92F9,0x66FB,0x5F45,/* 0x60-0x67 */
+	0x4E28,0x4EE1,0x4EFC,0x4F00,0x4F03,0x4F39,0x4F56,0x4F92,/* 0x68-0x6F */
+	0x4F8A,0x4F9A,0x4F94,0x4FCD,0x5040,0x5022,0x4FFF,0x501E,/* 0x70-0x77 */
+	0x5046,0x5070,0x5042,0x5094,0x50F4,0x50D8,0x514A,0x0000,/* 0x78-0x7F */
+
+	0x5164,0x519D,0x51BE,0x51EC,0x5215,0x529C,0x52A6,0x52C0,/* 0x80-0x87 */
+	0x52DB,0x5300,0x5307,0x5324,0x5372,0x5393,0x53B2,0x53DD,/* 0x88-0x8F */
+	0xFA0E,0x549C,0x548A,0x54A9,0x54FF,0x5586,0x5759,0x5765,/* 0x90-0x97 */
+	0x57AC,0x57C8,0x57C7,0xFA0F,0xFA10,0x589E,0x58B2,0x590B,/* 0x98-0x9F */
+	0x5953,0x595B,0x595D,0x5963,0x59A4,0x59BA,0x5B56,0x5BC0,/* 0xA0-0xA7 */
+	0x752F,0x5BD8,0x5BEC,0x5C1E,0x5CA6,0x5CBA,0x5CF5,0x5D27,/* 0xA8-0xAF */
+	0x5D53,0xFA11,0x5D42,0x5D6D,0x5DB8,0x5DB9,0x5DD0,0x5F21,/* 0xB0-0xB7 */
+	0x5F34,0x5F67,0x5FB7,0x5FDE,0x605D,0x6085,0x608A,0x60DE,/* 0xB8-0xBF */
+	0x60D5,0x6120,0x60F2,0x6111,0x6137,0x6130,0x6198,0x6213,/* 0xC0-0xC7 */
+	0x62A6,0x63F5,0x6460,0x649D,0x64CE,0x654E,0x6600,0x6615,/* 0xC8-0xCF */
+	0x663B,0x6609,0x662E,0x661E,0x6624,0x6665,0x6657,0x6659,/* 0xD0-0xD7 */
+	0xFA12,0x6673,0x6699,0x66A0,0x66B2,0x66BF,0x66FA,0x670E,/* 0xD8-0xDF */
+	0xF929,0x6766,0x67BB,0x6852,0x67C0,0x6801,0x6844,0x68CF,/* 0xE0-0xE7 */
+	0xFA13,0x6968,0xFA14,0x6998,0x69E2,0x6A30,0x6A6B,0x6A46,/* 0xE8-0xEF */
+	0x6A73,0x6A7E,0x6AE2,0x6AE4,0x6BD6,0x6C3F,0x6C5C,0x6C86,/* 0xF0-0xF7 */
+	0x6C6F,0x6CDA,0x6D04,0x6D87,0x6D6F,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_FB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6D96,0x6DAC,0x6DCF,0x6DF8,0x6DF2,0x6DFC,0x6E39,0x6E5C,/* 0x40-0x47 */
+	0x6E27,0x6E3C,0x6EBF,0x6F88,0x6FB5,0x6FF5,0x7005,0x7007,/* 0x48-0x4F */
+	0x7028,0x7085,0x70AB,0x710F,0x7104,0x715C,0x7146,0x7147,/* 0x50-0x57 */
+	0xFA15,0x71C1,0x71FE,0x72B1,0x72BE,0x7324,0xFA16,0x7377,/* 0x58-0x5F */
+	0x73BD,0x73C9,0x73D6,0x73E3,0x73D2,0x7407,0x73F5,0x7426,/* 0x60-0x67 */
+	0x742A,0x7429,0x742E,0x7462,0x7489,0x749F,0x7501,0x756F,/* 0x68-0x6F */
+	0x7682,0x769C,0x769E,0x769B,0x76A6,0xFA17,0x7746,0x52AF,/* 0x70-0x77 */
+	0x7821,0x784E,0x7864,0x787A,0x7930,0xFA18,0xFA19,0x0000,/* 0x78-0x7F */
+
+	0xFA1A,0x7994,0xFA1B,0x799B,0x7AD1,0x7AE7,0xFA1C,0x7AEB,/* 0x80-0x87 */
+	0x7B9E,0xFA1D,0x7D48,0x7D5C,0x7DB7,0x7DA0,0x7DD6,0x7E52,/* 0x88-0x8F */
+	0x7F47,0x7FA1,0xFA1E,0x8301,0x8362,0x837F,0x83C7,0x83F6,/* 0x90-0x97 */
+	0x8448,0x84B4,0x8553,0x8559,0x856B,0xFA1F,0x85B0,0xFA20,/* 0x98-0x9F */
+	0xFA21,0x8807,0x88F5,0x8A12,0x8A37,0x8A79,0x8AA7,0x8ABE,/* 0xA0-0xA7 */
+	0x8ADF,0xFA22,0x8AF6,0x8B53,0x8B7F,0x8CF0,0x8CF4,0x8D12,/* 0xA8-0xAF */
+	0x8D76,0xFA23,0x8ECF,0xFA24,0xFA25,0x9067,0x90DE,0xFA26,/* 0xB0-0xB7 */
+	0x9115,0x9127,0x91DA,0x91D7,0x91DE,0x91ED,0x91EE,0x91E4,/* 0xB8-0xBF */
+	0x91E5,0x9206,0x9210,0x920A,0x923A,0x9240,0x923C,0x924E,/* 0xC0-0xC7 */
+	0x9259,0x9251,0x9239,0x9267,0x92A7,0x9277,0x9278,0x92E7,/* 0xC8-0xCF */
+	0x92D7,0x92D9,0x92D0,0xFA27,0x92D5,0x92E0,0x92D3,0x9325,/* 0xD0-0xD7 */
+	0x9321,0x92FB,0xFA28,0x931E,0x92FF,0x931D,0x9302,0x9370,/* 0xD8-0xDF */
+	0x9357,0x93A4,0x93C6,0x93DE,0x93F8,0x9431,0x9445,0x9448,/* 0xE0-0xE7 */
+	0x9592,0xF9DC,0xFA29,0x969D,0x96AF,0x9733,0x973B,0x9743,/* 0xE8-0xEF */
+	0x974D,0x974F,0x9751,0x9755,0x9857,0x9865,0xFA2A,0xFA2B,/* 0xF0-0xF7 */
+	0x9927,0xFA2C,0x999E,0x9A4E,0x9AD9,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_FC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9ADC,0x9B75,0x9B72,0x9B8F,0x9BB1,0x9BBB,0x9C00,0x9D70,/* 0x40-0x47 */
+	0x9D6B,0xFA2D,0x9E19,0x9ED1,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+};
+
+static const wchar_t *page_charset2uni[256] = {
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   c2u_81, c2u_82, c2u_83, c2u_84, NULL,   NULL,   c2u_87, 
+	c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, 
+	c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, 
+	c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, 
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, 
+	c2u_E8, c2u_E9, c2u_EA, NULL,   NULL,   c2u_ED, c2u_EE, NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   c2u_FA, c2u_FB, c2u_FC, NULL,   NULL,   NULL,   
+};
+
+static const unsigned char u2c_00hi[256 - 0xA0][2] = {
+	{0x00, 0x00}, {0x00, 0x00}, {0x81, 0x91}, {0x81, 0x92},/* 0xA0-0xA3 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x98},/* 0xA4-0xA7 */
+	{0x81, 0x4E}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xA8-0xAB */
+	{0x81, 0xCA}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xAC-0xAF */
+	{0x81, 0x8B}, {0x81, 0x7D}, {0x00, 0x00}, {0x00, 0x00},/* 0xB0-0xB3 */
+	{0x81, 0x4C}, {0x00, 0x00}, {0x81, 0xF7}, {0x00, 0x00},/* 0xB4-0xB7 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xB8-0xBB */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xBC-0xBF */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC0-0xC3 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC4-0xC7 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC8-0xCB */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xCC-0xCF */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xD0-0xD3 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x7E},/* 0xD4-0xD7 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xD8-0xDB */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xDC-0xDF */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE0-0xE3 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE4-0xE7 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE8-0xEB */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xEC-0xEF */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xF0-0xF3 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x80},/* 0xF4-0xF7 */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xF8-0xFB */
+	{0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_03[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x83, 0x9F, 0x83, 0xA0, 0x83, 0xA1, /* 0x90-0x93 */
+	0x83, 0xA2, 0x83, 0xA3, 0x83, 0xA4, 0x83, 0xA5, /* 0x94-0x97 */
+	0x83, 0xA6, 0x83, 0xA7, 0x83, 0xA8, 0x83, 0xA9, /* 0x98-0x9B */
+	0x83, 0xAA, 0x83, 0xAB, 0x83, 0xAC, 0x83, 0xAD, /* 0x9C-0x9F */
+	0x83, 0xAE, 0x83, 0xAF, 0x00, 0x00, 0x83, 0xB0, /* 0xA0-0xA3 */
+	0x83, 0xB1, 0x83, 0xB2, 0x83, 0xB3, 0x83, 0xB4, /* 0xA4-0xA7 */
+	0x83, 0xB5, 0x83, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x83, 0xBF, 0x83, 0xC0, 0x83, 0xC1, /* 0xB0-0xB3 */
+	0x83, 0xC2, 0x83, 0xC3, 0x83, 0xC4, 0x83, 0xC5, /* 0xB4-0xB7 */
+	0x83, 0xC6, 0x83, 0xC7, 0x83, 0xC8, 0x83, 0xC9, /* 0xB8-0xBB */
+	0x83, 0xCA, 0x83, 0xCB, 0x83, 0xCC, 0x83, 0xCD, /* 0xBC-0xBF */
+	0x83, 0xCE, 0x83, 0xCF, 0x00, 0x00, 0x83, 0xD0, /* 0xC0-0xC3 */
+	0x83, 0xD1, 0x83, 0xD2, 0x83, 0xD3, 0x83, 0xD4, /* 0xC4-0xC7 */
+	0x83, 0xD5, 0x83, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+};
+
+static const unsigned char u2c_04[512] = {
+	0x00, 0x00, 0x84, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0x84, 0x43, /* 0x10-0x13 */
+	0x84, 0x44, 0x84, 0x45, 0x84, 0x47, 0x84, 0x48, /* 0x14-0x17 */
+	0x84, 0x49, 0x84, 0x4A, 0x84, 0x4B, 0x84, 0x4C, /* 0x18-0x1B */
+	0x84, 0x4D, 0x84, 0x4E, 0x84, 0x4F, 0x84, 0x50, /* 0x1C-0x1F */
+	0x84, 0x51, 0x84, 0x52, 0x84, 0x53, 0x84, 0x54, /* 0x20-0x23 */
+	0x84, 0x55, 0x84, 0x56, 0x84, 0x57, 0x84, 0x58, /* 0x24-0x27 */
+	0x84, 0x59, 0x84, 0x5A, 0x84, 0x5B, 0x84, 0x5C, /* 0x28-0x2B */
+	0x84, 0x5D, 0x84, 0x5E, 0x84, 0x5F, 0x84, 0x60, /* 0x2C-0x2F */
+	0x84, 0x70, 0x84, 0x71, 0x84, 0x72, 0x84, 0x73, /* 0x30-0x33 */
+	0x84, 0x74, 0x84, 0x75, 0x84, 0x77, 0x84, 0x78, /* 0x34-0x37 */
+	0x84, 0x79, 0x84, 0x7A, 0x84, 0x7B, 0x84, 0x7C, /* 0x38-0x3B */
+	0x84, 0x7D, 0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, /* 0x3C-0x3F */
+	0x84, 0x82, 0x84, 0x83, 0x84, 0x84, 0x84, 0x85, /* 0x40-0x43 */
+	0x84, 0x86, 0x84, 0x87, 0x84, 0x88, 0x84, 0x89, /* 0x44-0x47 */
+	0x84, 0x8A, 0x84, 0x8B, 0x84, 0x8C, 0x84, 0x8D, /* 0x48-0x4B */
+	0x84, 0x8E, 0x84, 0x8F, 0x84, 0x90, 0x84, 0x91, /* 0x4C-0x4F */
+	0x00, 0x00, 0x84, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+};
+
+static const unsigned char u2c_20[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x81, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x81, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x81, 0x65, 0x81, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x81, 0x67, 0x81, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x81, 0xF5, 0x81, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x81, 0x64, 0x81, 0x63, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x81, 0xF1, 0x00, 0x00, 0x81, 0x8C, 0x81, 0x8D, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xA6, /* 0x38-0x3B */
+};
+
+static const unsigned char u2c_21[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x8E, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0x59, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xFA, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xF0, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, 0xFA, 0x4D, /* 0x60-0x63 */
+	0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, 0xFA, 0x51, /* 0x64-0x67 */
+	0xFA, 0x52, 0xFA, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xEE, 0xEF, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0x70-0x73 */
+	0xEE, 0xF3, 0xEE, 0xF4, 0xEE, 0xF5, 0xEE, 0xF6, /* 0x74-0x77 */
+	0xEE, 0xF7, 0xEE, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x81, 0xA9, 0x81, 0xAA, 0x81, 0xA8, 0x81, 0xAB, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0xCB, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x81, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+};
+
+static const unsigned char u2c_22[512] = {
+	0x81, 0xCD, 0x00, 0x00, 0x81, 0xDD, 0x81, 0xCE, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xDE, /* 0x04-0x07 */
+	0x81, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x81, 0xB9, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x87, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x95, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x81, 0xE5, 0x81, 0x87, 0x87, 0x98, /* 0x1C-0x1F */
+	0x87, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x81, 0x61, 0x00, 0x00, 0x81, 0xC8, /* 0x24-0x27 */
+	0x81, 0xC9, 0x87, 0x9B, 0x87, 0x9C, 0x87, 0x92, /* 0x28-0x2B */
+	0x81, 0xE8, 0x00, 0x00, 0x87, 0x93, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x81, 0x88, 0xFA, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x81, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x90, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x81, 0x82, 0x87, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0x85, 0x81, 0x86, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0xE1, 0x81, 0xE2, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x81, 0xBC, 0x81, 0xBD, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0xBA, 0x81, 0xBB, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x87, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x99, /* 0xBC-0xBF */
+};
+
+static const unsigned char u2c_23[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0xDC, 0x00, 0x00, /* 0x10-0x13 */
+};
+
+static const unsigned char u2c_24[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x87, 0x40, 0x87, 0x41, 0x87, 0x42, 0x87, 0x43, /* 0x60-0x63 */
+	0x87, 0x44, 0x87, 0x45, 0x87, 0x46, 0x87, 0x47, /* 0x64-0x67 */
+	0x87, 0x48, 0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, /* 0x68-0x6B */
+	0x87, 0x4C, 0x87, 0x4D, 0x87, 0x4E, 0x87, 0x4F, /* 0x6C-0x6F */
+	0x87, 0x50, 0x87, 0x51, 0x87, 0x52, 0x87, 0x53, /* 0x70-0x73 */
+};
+
+static const unsigned char u2c_25[512] = {
+	0x84, 0x9F, 0x84, 0xAA, 0x84, 0xA0, 0x84, 0xAB, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x84, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAC, /* 0x0C-0x0F */
+	0x84, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAD, /* 0x10-0x13 */
+	0x84, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAF, /* 0x14-0x17 */
+	0x84, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAE, /* 0x18-0x1B */
+	0x84, 0xA5, 0x84, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x84, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB0, /* 0x20-0x23 */
+	0x84, 0xA7, 0x84, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x84, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB2, /* 0x28-0x2B */
+	0x84, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB6, /* 0x2C-0x2F */
+	0x84, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB1, /* 0x30-0x33 */
+	0x84, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB8, /* 0x34-0x37 */
+	0x84, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB3, /* 0x38-0x3B */
+	0x84, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB9, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x84, 0xBE, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB4, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x81, 0xA1, 0x81, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0xA3, 0x81, 0xA2, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x81, 0xA5, 0x81, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0x9F, 0x81, 0x9E, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9B, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0x9D, 0x81, 0x9C, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xFC, /* 0xEC-0xEF */
+};
+
+static const unsigned char u2c_26[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x81, 0x9A, 0x81, 0x99, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x81, 0x8A, 0x00, 0x00, 0x81, 0x89, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x81, 0xF4, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x81, 0xF3, 0x00, 0x00, 0x81, 0xF2, /* 0x6C-0x6F */
+};
+
+static const unsigned char u2c_30[512] = {
+	0x81, 0x40, 0x81, 0x41, 0x81, 0x42, 0x81, 0x56, /* 0x00-0x03 */
+	0x00, 0x00, 0x81, 0x58, 0x81, 0x59, 0x81, 0x5A, /* 0x04-0x07 */
+	0x81, 0x71, 0x81, 0x72, 0x81, 0x73, 0x81, 0x74, /* 0x08-0x0B */
+	0x81, 0x75, 0x81, 0x76, 0x81, 0x77, 0x81, 0x78, /* 0x0C-0x0F */
+	0x81, 0x79, 0x81, 0x7A, 0x81, 0xA7, 0x81, 0xAC, /* 0x10-0x13 */
+	0x81, 0x6B, 0x81, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x87, 0x80, 0x00, 0x00, 0x87, 0x81, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, /* 0x40-0x43 */
+	0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0x82, 0xA5, /* 0x44-0x47 */
+	0x82, 0xA6, 0x82, 0xA7, 0x82, 0xA8, 0x82, 0xA9, /* 0x48-0x4B */
+	0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, 0x82, 0xAD, /* 0x4C-0x4F */
+	0x82, 0xAE, 0x82, 0xAF, 0x82, 0xB0, 0x82, 0xB1, /* 0x50-0x53 */
+	0x82, 0xB2, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x54-0x57 */
+	0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x58-0x5B */
+	0x82, 0xBA, 0x82, 0xBB, 0x82, 0xBC, 0x82, 0xBD, /* 0x5C-0x5F */
+	0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, 0x82, 0xC1, /* 0x60-0x63 */
+	0x82, 0xC2, 0x82, 0xC3, 0x82, 0xC4, 0x82, 0xC5, /* 0x64-0x67 */
+	0x82, 0xC6, 0x82, 0xC7, 0x82, 0xC8, 0x82, 0xC9, /* 0x68-0x6B */
+	0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0x82, 0xCD, /* 0x6C-0x6F */
+	0x82, 0xCE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x70-0x73 */
+	0x82, 0xD2, 0x82, 0xD3, 0x82, 0xD4, 0x82, 0xD5, /* 0x74-0x77 */
+	0x82, 0xD6, 0x82, 0xD7, 0x82, 0xD8, 0x82, 0xD9, /* 0x78-0x7B */
+	0x82, 0xDA, 0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, /* 0x7C-0x7F */
+	
+	0x82, 0xDE, 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, /* 0x80-0x83 */
+	0x82, 0xE2, 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, /* 0x84-0x87 */
+	0x82, 0xE6, 0x82, 0xE7, 0x82, 0xE8, 0x82, 0xE9, /* 0x88-0x8B */
+	0x82, 0xEA, 0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, /* 0x8C-0x8F */
+	0x82, 0xEE, 0x82, 0xEF, 0x82, 0xF0, 0x82, 0xF1, /* 0x90-0x93 */
+	0x83, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x4A, /* 0x98-0x9B */
+	0x81, 0x4B, 0x81, 0x54, 0x81, 0x55, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xA0-0xA3 */
+	0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xA4-0xA7 */
+	0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xA8-0xAB */
+	0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xAC-0xAF */
+	0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0x83, 0x52, /* 0xB0-0xB3 */
+	0x83, 0x53, 0x83, 0x54, 0x83, 0x55, 0x83, 0x56, /* 0xB4-0xB7 */
+	0x83, 0x57, 0x83, 0x58, 0x83, 0x59, 0x83, 0x5A, /* 0xB8-0xBB */
+	0x83, 0x5B, 0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, /* 0xBC-0xBF */
+	0x83, 0x5F, 0x83, 0x60, 0x83, 0x61, 0x83, 0x62, /* 0xC0-0xC3 */
+	0x83, 0x63, 0x83, 0x64, 0x83, 0x65, 0x83, 0x66, /* 0xC4-0xC7 */
+	0x83, 0x67, 0x83, 0x68, 0x83, 0x69, 0x83, 0x6A, /* 0xC8-0xCB */
+	0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0x83, 0x6E, /* 0xCC-0xCF */
+	0x83, 0x6F, 0x83, 0x70, 0x83, 0x71, 0x83, 0x72, /* 0xD0-0xD3 */
+	0x83, 0x73, 0x83, 0x74, 0x83, 0x75, 0x83, 0x76, /* 0xD4-0xD7 */
+	0x83, 0x77, 0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, /* 0xD8-0xDB */
+	0x83, 0x7B, 0x83, 0x7C, 0x83, 0x7D, 0x83, 0x7E, /* 0xDC-0xDF */
+	0x83, 0x80, 0x83, 0x81, 0x83, 0x82, 0x83, 0x83, /* 0xE0-0xE3 */
+	0x83, 0x84, 0x83, 0x85, 0x83, 0x86, 0x83, 0x87, /* 0xE4-0xE7 */
+	0x83, 0x88, 0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, /* 0xE8-0xEB */
+	0x83, 0x8C, 0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, /* 0xEC-0xEF */
+	0x83, 0x90, 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, /* 0xF0-0xF3 */
+	0x83, 0x94, 0x83, 0x95, 0x83, 0x96, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x45, /* 0xF8-0xFB */
+	0x81, 0x5B, 0x81, 0x52, 0x81, 0x53, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_32[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xFA, 0x58, 0x87, 0x8B, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x87, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x87, 0x85, 0x87, 0x86, 0x87, 0x87, 0x87, 0x88, /* 0xA4-0xA7 */
+	0x87, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+};
+
+static const unsigned char u2c_33[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x65, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x87, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x87, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x87, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x61, 0x87, 0x6B, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x6A, 0x87, 0x64, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x6C, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x66, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x6E, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x87, 0x5F, 0x87, 0x6D, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x87, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x87, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x68, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x7E, /* 0x78-0x7B */
+	0x87, 0x8F, 0x87, 0x8E, 0x87, 0x8D, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x72, 0x87, 0x73, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x87, 0x6F, 0x87, 0x70, 0x87, 0x71, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x87, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x87, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x87, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+};
+
+static const unsigned char u2c_4E[512] = {
+	0x88, 0xEA, 0x92, 0x9A, 0x00, 0x00, 0x8E, 0xB5, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x9C, /* 0x04-0x07 */
+	0x8F, 0xE4, 0x8E, 0x4F, 0x8F, 0xE3, 0x89, 0xBA, /* 0x08-0x0B */
+	0x00, 0x00, 0x95, 0x73, 0x97, 0x5E, 0x00, 0x00, /* 0x0C-0x0F */
+	0x98, 0xA0, 0x89, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x8A, 0x8E, 0x98, 0xA1, 0x90, 0xA2, 0x99, 0xC0, /* 0x14-0x17 */
+	0x8B, 0x75, 0x95, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xE5, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x97, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xC0, 0x00, 0x00, /* 0x24-0x27 */
+	0xED, 0x4C, 0x00, 0x00, 0x98, 0xA2, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x92, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x98, 0xA3, 0x8B, 0xF8, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0xA4, 0x00, 0x00, /* 0x34-0x37 */
+	0x8A, 0xDB, 0x92, 0x4F, 0x00, 0x00, 0x8E, 0xE5, /* 0x38-0x3B */
+	0x98, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xA6, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0xA7, 0x94, 0x54, /* 0x40-0x43 */
+	0x00, 0x00, 0x8B, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x56, /* 0x48-0x4B */
+	0x00, 0x00, 0x93, 0xE1, 0x8C, 0xC1, 0x96, 0x52, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE5, 0x68, 0x98, 0xA8, 0x8F, 0xE6, /* 0x54-0x57 */
+	0x98, 0xA9, 0x89, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x8B, 0xE3, 0x8C, 0xEE, 0x96, 0xE7, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xA4, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x97, 0x90, 0x00, 0x00, 0x93, 0xFB, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xA3, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x8B, 0x54, 0x00, 0x00, 0x98, 0xAA, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x98, 0xAB, 0x97, 0xB9, 0x00, 0x00, /* 0x84-0x87 */
+	0x97, 0x5C, 0x91, 0x88, 0x98, 0xAD, 0x8E, 0x96, /* 0x88-0x8B */
+	0x93, 0xF1, 0x00, 0x00, 0x98, 0xB0, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x89, 0x5D, 0x8C, 0xDD, 0x00, 0x00, /* 0x90-0x93 */
+	0x8C, 0xDC, 0x88, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x98, 0x6A, 0x98, 0x69, 0x00, 0x00, 0x8D, 0xB1, /* 0x98-0x9B */
+	0x88, 0x9F, 0x00, 0x00, 0x98, 0xB1, 0x98, 0xB2, /* 0x9C-0x9F */
+	0x98, 0xB3, 0x96, 0x53, 0x98, 0xB4, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x8C, 0xF0, 0x88, 0xE5, 0x96, 0x92, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x8B, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x9D, /* 0xA8-0xAB */
+	0x8B, 0x9E, 0x92, 0xE0, 0x97, 0xBA, 0x00, 0x00, /* 0xAC-0xAF */
+	0x98, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xB6, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0xB7, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0x6C, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x8F, 0x59, 0x90, 0x6D, 0x98, 0xBC, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x98, 0xBA, 0x00, 0x00, 0x98, 0xBB, 0x8B, 0x77, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xA1, 0x89, 0xEE, /* 0xC8-0xCB */
+	0x00, 0x00, 0x98, 0xB9, 0x98, 0xB8, 0x95, 0xA7, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x8E, 0x65, 0x8E, 0x64, 0x91, 0xBC, 0x98, 0xBD, /* 0xD4-0xD7 */
+	0x95, 0x74, 0x90, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x81, 0x57, 0x98, 0xBE, 0x98, 0xC0, /* 0xDC-0xDF */
+	0x00, 0x00, 0xED, 0x4D, 0x00, 0x00, 0x91, 0xE3, /* 0xE0-0xE3 */
+	0x97, 0xDF, 0x88, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x98, 0xBF, 0x89, 0xBC, 0x00, 0x00, /* 0xEC-0xEF */
+	0x8B, 0xC2, 0x00, 0x00, 0x92, 0x87, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x8F, 0x98, 0xC1, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x43, /* 0xF8-0xFB */
+	0xED, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_4F[512] = {
+	0xED, 0x4F, 0x8A, 0xE9, 0x00, 0x00, 0xED, 0x50, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x98, 0xC2, 0x88, 0xC9, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x8C, 0xDE, 0x8A, 0xEA, 0x95, 0x9A, /* 0x0C-0x0F */
+	0x94, 0xB0, 0x8B, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xEF, 0x00, 0x00, /* 0x18-0x1B */
+	0x98, 0xE5, 0x93, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x8C, /* 0x2C-0x2F */
+	0x98, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x94, 0xBA, 0x00, 0x00, 0x97, 0xE0, 0x00, 0x00, /* 0x34-0x37 */
+	0x90, 0x4C, 0xED, 0x51, 0x8E, 0x66, 0x00, 0x00, /* 0x38-0x3B */
+	0x8E, 0x97, 0x89, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xCF, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x41, 0x98, 0xC8, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x88, 0xCA, 0x92, 0xE1, 0x8F, 0x5A, /* 0x4C-0x4F */
+	0x8D, 0xB2, 0x97, 0x43, 0x00, 0x00, 0x91, 0xCC, /* 0x50-0x53 */
+	0x00, 0x00, 0x89, 0xBD, 0xED, 0x52, 0x98, 0xC7, /* 0x54-0x57 */
+	0x00, 0x00, 0x97, 0x5D, 0x98, 0xC3, 0x98, 0xC5, /* 0x58-0x5B */
+	0x8D, 0xEC, 0x98, 0xC6, 0x9B, 0x43, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x98, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xD1, /* 0x6C-0x6F */
+	0x98, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC0, /* 0x70-0x73 */
+	0x00, 0x00, 0x95, 0xB9, 0x98, 0xC9, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xCD, /* 0x78-0x7B */
+	0x8C, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x67, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA4, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0xD2, 0x00, 0x00, /* 0x84-0x87 */
+	0x98, 0xCA, 0x00, 0x00, 0xED, 0x54, 0x97, 0xE1, /* 0x88-0x8B */
+	0x00, 0x00, 0x8E, 0x98, 0x00, 0x00, 0x98, 0xCB, /* 0x8C-0x8F */
+	0x00, 0x00, 0x98, 0xD0, 0xED, 0x53, 0x00, 0x00, /* 0x90-0x93 */
+	0xED, 0x56, 0x00, 0x00, 0x98, 0xD3, 0x00, 0x00, /* 0x94-0x97 */
+	0x98, 0xCC, 0x00, 0x00, 0xED, 0x55, 0x8B, 0x9F, /* 0x98-0x9B */
+	0x00, 0x00, 0x88, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x8B, 0xA0, 0x89, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x44, /* 0xA8-0xAB */
+	0x00, 0x00, 0x96, 0x99, 0x95, 0x8E, 0x8C, 0xF2, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x90, 0x4E, 0x97, 0xB5, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xD6, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x57, 0x91, 0xA3, /* 0xC0-0xC3 */
+	0x89, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xED, 0x45, 0x8F, 0x72, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xED, 0x57, 0x98, 0xD7, 0x00, 0x00, /* 0xCC-0xCF */
+	0x98, 0xDC, 0x98, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x98, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x91, 0xAD, /* 0xD4-0xD7 */
+	0x98, 0xD8, 0x00, 0x00, 0x98, 0xDB, 0x98, 0xD9, /* 0xD8-0xDB */
+	0x00, 0x00, 0x95, 0xDB, 0x00, 0x00, 0x98, 0xD6, /* 0xDC-0xDF */
+	0x00, 0x00, 0x90, 0x4D, 0x00, 0x00, 0x96, 0x93, /* 0xE0-0xE3 */
+	0x98, 0xDD, 0x98, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x43, 0x98, 0xEB, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x6F, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x95, 0x55, 0x98, 0xE6, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x95, 0xEE, 0x00, 0x00, 0x89, 0xB4, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0xEA, 0xED, 0x5A, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_50[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x98, 0xE4, 0x98, 0xED, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x91, 0x71, 0x00, 0x00, 0x8C, 0xC2, /* 0x08-0x0B */
+	0x00, 0x00, 0x94, 0x7B, 0x00, 0x00, 0xE0, 0xC5, /* 0x0C-0x0F */
+	0x00, 0x00, 0x98, 0xEC, 0x93, 0x7C, 0x00, 0x00, /* 0x10-0x13 */
+	0x98, 0xE1, 0x00, 0x00, 0x8C, 0xF4, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x8C, 0xF3, 0x98, 0xDF, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x5B, 0x8E, 0xD8, /* 0x1C-0x1F */
+	0x00, 0x00, 0x98, 0xE7, 0xED, 0x59, 0x95, 0xED, /* 0x20-0x23 */
+	0x92, 0x6C, 0x98, 0xE3, 0x8C, 0x91, 0x00, 0x00, /* 0x24-0x27 */
+	0x98, 0xE0, 0x98, 0xE8, 0x98, 0xE2, 0x97, 0xCF, /* 0x28-0x2B */
+	0x98, 0xE9, 0x98, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0xE4, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x8C, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xED, 0x58, 0x00, 0x00, 0xED, 0x5E, 0x98, 0xEE, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x5C, 0x98, 0xEF, /* 0x44-0x47 */
+	0x98, 0xF3, 0x88, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xCE, /* 0x4C-0x4F */
+	0x98, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x98, 0xF1, 0x98, 0xF5, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0xF4, 0x00, 0x00, /* 0x58-0x5B */
+	0x92, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x8C, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x98, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xED, 0x5D, 0x00, 0x00, 0x8E, 0xC3, 0x00, 0x00, /* 0x70-0x73 */
+	0x91, 0xA4, 0x92, 0xE3, 0x8B, 0xF4, 0x00, 0x00, /* 0x74-0x77 */
+	0x98, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x8B, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x98, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x98, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x96, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x8C, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xED, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x8E, 0x50, 0x94, 0xF5, 0x98, 0xF9, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x8D, 0xC3, 0x97, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0xFC, 0x99, 0x42, /* 0xB0-0xB3 */
+	0x98, 0xFB, 0x8D, 0xC2, 0x00, 0x00, 0x8F, 0x9D, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x58, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x43, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x8B, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x99, 0x40, 0x99, 0x41, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x93, 0xAD, 0x00, 0x00, 0x91, 0x9C, /* 0xCC-0xCF */
+	0x00, 0x00, 0x8B, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x96, 0x6C, 0x99, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xED, 0x61, 0x00, 0x00, 0x97, 0xBB, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x45, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x48, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x99, 0x46, 0x00, 0x00, 0x91, 0x6D, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x99, 0x47, 0x99, 0x49, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xED, 0x60, 0x99, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x99, 0x4A, 0x00, 0x00, 0x95, 0xC6, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_51[512] = {
+	0x8B, 0x56, 0x99, 0x4D, 0x99, 0x4E, 0x00, 0x00, /* 0x00-0x03 */
+	0x89, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x99, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0xF2, 0x00, 0x00, /* 0x10-0x13 */
+	0x99, 0x51, 0x99, 0x50, 0x99, 0x4F, 0x00, 0x00, /* 0x14-0x17 */
+	0x98, 0xD4, 0x00, 0x00, 0x99, 0x52, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x9E, /* 0x1C-0x1F */
+	0x00, 0x00, 0x99, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x44, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xD7, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x55, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x54, 0x99, 0x57, /* 0x38-0x3B */
+	0x99, 0x56, 0x00, 0x00, 0x00, 0x00, 0x99, 0x58, /* 0x3C-0x3F */
+	0x99, 0x59, 0x88, 0xF2, 0x00, 0x00, 0x8C, 0xB3, /* 0x40-0x43 */
+	0x8C, 0x5A, 0x8F, 0x5B, 0x92, 0x9B, 0x8B, 0xA2, /* 0x44-0x47 */
+	0x90, 0xE6, 0x8C, 0xF5, 0xED, 0x62, 0x8D, 0x8E, /* 0x48-0x4B */
+	0x99, 0x5B, 0x96, 0xC6, 0x93, 0x65, 0x00, 0x00, /* 0x4C-0x4F */
+	0x8E, 0x99, 0x00, 0x00, 0x99, 0x5A, 0x00, 0x00, /* 0x50-0x53 */
+	0x99, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0x7D, 0x00, 0x00, /* 0x58-0x5B */
+	0x8A, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x5D, 0x00, 0x00, /* 0x60-0x63 */
+	0xED, 0x63, 0x93, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x91, 0x53, 0x99, 0x5F, 0x99, 0x60, 0x94, 0xAA, /* 0x68-0x6B */
+	0x8C, 0xF6, 0x98, 0x5A, 0x99, 0x61, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x8B, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x95, 0xBA, 0x91, 0xB4, 0x8B, 0xEF, /* 0x74-0x77 */
+	0x93, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x8C, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x99, 0x62, 0x00, 0x00, 0x99, 0x63, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x93, 0xE0, 0x89, 0x7E, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x99, 0x66, 0x8D, 0xFB, 0x00, 0x00, /* 0x88-0x8B */
+	0x99, 0x65, 0x8D, 0xC4, 0x00, 0x00, 0x99, 0x67, /* 0x8C-0x8F */
+	0xE3, 0xEC, 0x99, 0x68, 0x96, 0x60, 0x99, 0x69, /* 0x90-0x93 */
+	0x00, 0x00, 0x99, 0x6A, 0x99, 0x6B, 0x8F, 0xE7, /* 0x94-0x97 */
+	0x00, 0x00, 0x8E, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xED, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x8A, 0xA5, 0x00, 0x00, 0x99, 0x6E, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x99, 0x6C, 0x96, 0xBB, 0x99, 0x6D, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x95, 0x79, 0x99, 0x6F, 0x99, 0x70, 0x99, 0x71, /* 0xA8-0xAB */
+	0x93, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x99, 0x75, 0x99, 0x73, 0x99, 0x74, 0x99, 0x72, /* 0xB0-0xB3 */
+	0x8D, 0xE1, 0x99, 0x76, 0x96, 0xE8, 0x97, 0xE2, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x99, 0x77, 0xED, 0x65, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x90, 0xA6, 0x99, 0x78, 0x8F, 0x79, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x99, 0x79, 0x00, 0x00, 0x92, 0x9C, /* 0xC8-0xCB */
+	0x97, 0xBD, 0x93, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0xC3, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x7A, /* 0xD8-0xDB */
+	0xEA, 0xA3, 0x8B, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x99, 0x7B, 0x96, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x88, 0x91, 0xFA, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x99, 0x7D, 0x93, 0xE2, 0x00, 0x00, /* 0xE8-0xEB */
+	0xED, 0x66, 0x99, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x99, 0x80, 0x8A, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x99, 0x81, 0x8B, 0xA5, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x93, 0xCA, 0x89, 0x9A, 0x8F, 0x6F, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x94, 0x9F, 0x99, 0x82, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_52[512] = {
+	0x93, 0x81, 0x00, 0x00, 0x00, 0x00, 0x90, 0x6E, /* 0x00-0x03 */
+	0x99, 0x83, 0x00, 0x00, 0x95, 0xAA, 0x90, 0xD8, /* 0x04-0x07 */
+	0x8A, 0xA0, 0x00, 0x00, 0x8A, 0xA7, 0x99, 0x84, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x86, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x8C, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x99, 0x85, 0xED, 0x67, 0x00, 0x00, 0x97, 0xF1, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x8F, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x94, 0xBB, 0x95, 0xCA, 0x00, 0x00, 0x99, 0x87, /* 0x24-0x27 */
+	0x00, 0x00, 0x97, 0x98, 0x99, 0x88, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x89, 0x00, 0x00, /* 0x2C-0x2F */
+	0x93, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x99, 0x8A, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xA7, 0x8D, 0xFC, /* 0x34-0x37 */
+	0x8C, 0x94, 0x99, 0x8B, 0x8E, 0x68, 0x8D, 0x8F, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xE4, /* 0x40-0x43 */
+	0x99, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x91, 0xA5, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xED, 0x99, 0x8E, /* 0x48-0x4B */
+	0x99, 0x8F, 0x91, 0x4F, 0x00, 0x00, 0x99, 0x8C, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x99, 0x91, 0x00, 0x00, 0x96, 0x55, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x84, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0x90, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x95, /* 0x60-0x63 */
+	0x8D, 0xDC, 0x94, 0x8D, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x99, 0x94, 0x99, 0x92, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x9B, /* 0x6C-0x6F */
+	0x8F, 0xE8, 0x99, 0x9B, 0x8A, 0x84, 0x99, 0x95, /* 0x70-0x73 */
+	0x99, 0x93, 0x91, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x99, 0x97, 0x00, 0x00, 0x99, 0x96, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x63, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x80, /* 0x84-0x87 */
+	0x99, 0x9C, 0x97, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x99, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x99, 0x9D, 0x99, 0x9A, 0x00, 0x00, /* 0x90-0x93 */
+	0x99, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xCD, /* 0x98-0x9B */
+	0xED, 0x68, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xF7, /* 0x9C-0x9F */
+	0x89, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x97, 0xF2, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x69, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x8F, 0x95, 0x93, 0x77, 0x8D, 0x85, /* 0xA8-0xAB */
+	0x99, 0xA0, 0x99, 0xA1, 0x00, 0x00, 0xEE, 0x5B, /* 0xAC-0xAF */
+	0x00, 0x00, 0x97, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x98, 0x4A, 0x99, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x8C, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x99, 0xA2, 0x00, 0x00, 0x8A, 0x4E, 0x00, 0x00, /* 0xBC-0xBF */
+	0xED, 0x6A, 0x99, 0xA4, 0x00, 0x00, 0x96, 0x75, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x92, 0xBA, 0x00, 0x00, 0x97, 0x45, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x95, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x99, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xD3, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x93, 0xAE, 0x00, 0x00, 0x99, 0xA6, /* 0xD4-0xD7 */
+	0x8A, 0xA8, 0x96, 0xB1, 0x00, 0x00, 0xED, 0x6B, /* 0xD8-0xDB */
+	0x00, 0x00, 0x8F, 0x9F, 0x99, 0xA7, 0x95, 0xE5, /* 0xDC-0xDF */
+	0x99, 0xAB, 0x00, 0x00, 0x90, 0xA8, 0x99, 0xA8, /* 0xE0-0xE3 */
+	0x8B, 0xCE, 0x00, 0x00, 0x99, 0xA9, 0x8A, 0xA9, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x4D, 0x99, 0xAC, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x99, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x99, 0xAE, 0x99, 0xAF, 0x8E, 0xD9, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0xF9, 0x96, 0xDC, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_53[512] = {
+	0xED, 0x6C, 0x96, 0xE6, 0x93, 0xF5, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x95, 0xEF, 0x99, 0xB0, 0xED, 0x6D, /* 0x04-0x07 */
+	0x99, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x99, 0xB3, 0x00, 0x00, 0x99, 0xB5, /* 0x0C-0x0F */
+	0x99, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x99, 0xB6, 0x89, 0xBB, 0x96, 0x6B, /* 0x14-0x17 */
+	0x00, 0x00, 0x8D, 0xFA, 0x99, 0xB7, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x91, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x8F, 0xA0, 0x8B, 0xA7, 0x00, 0x00, 0x99, 0xB8, /* 0x20-0x23 */
+	0xED, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xD9, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xB9, /* 0x2C-0x2F */
+	0x00, 0x00, 0x99, 0xBA, 0x00, 0x00, 0x99, 0xBB, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x99, 0xBC, 0x95, 0x43, 0x8B, 0xE6, 0x88, 0xE3, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBD, /* 0x3C-0x3F */
+	0x99, 0xBD, 0x8F, 0x5C, 0x00, 0x00, 0x90, 0xE7, /* 0x40-0x43 */
+	0x00, 0x00, 0x99, 0xBF, 0x99, 0xBE, 0x8F, 0xA1, /* 0x44-0x47 */
+	0x8C, 0xDF, 0x99, 0xC1, 0x94, 0xBC, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x99, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x94, 0xDA, 0x91, 0xB2, 0x91, 0xEC, /* 0x50-0x53 */
+	0x8B, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x93, 0xEC, /* 0x54-0x57 */
+	0x92, 0x50, 0x00, 0x00, 0x94, 0x8E, 0x00, 0x00, /* 0x58-0x5B */
+	0x96, 0x6D, 0x00, 0x00, 0x99, 0xC4, 0x00, 0x00, /* 0x5C-0x5F */
+	0x90, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x54, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x99, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0xC6, 0x89, 0x4B, /* 0x6C-0x6F */
+	0x88, 0xF3, 0x8A, 0xEB, 0xED, 0x6F, 0x91, 0xA6, /* 0x70-0x73 */
+	0x8B, 0x70, 0x97, 0x91, 0x00, 0x00, 0x99, 0xC9, /* 0x74-0x77 */
+	0x89, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x99, 0xC8, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xA8, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x99, 0xCA, 0x00, 0x00, /* 0x80-0x83 */
+	0x96, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0x70, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0xCB, 0x00, 0x00, /* 0x94-0x97 */
+	0x97, 0xD0, 0x00, 0x00, 0x8C, 0xFA, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xB4, /* 0x9C-0x9F */
+	0x99, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x99, 0xCE, 0x99, 0xCD, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x90, 0x7E, 0x89, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x89, 0x7D, 0x99, 0xCF, 0x00, 0x00, /* 0xAC-0xAF */
+	0x99, 0xD0, 0x00, 0x00, 0xED, 0x71, 0x8C, 0xB5, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0xD1, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8E, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x51, 0x99, 0xD2, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x96, 0x94, 0x8D, 0xB3, 0x8B, 0x79, 0x97, 0x46, /* 0xC8-0xCB */
+	0x91, 0x6F, 0x94, 0xBD, 0x8E, 0xFB, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x8F, 0x66, 0x00, 0x00, 0x8E, 0xE6, 0x8E, 0xF3, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x8F, 0x96, 0x00, 0x00, 0x94, 0xBE, /* 0xD8-0xDB */
+	0x00, 0x00, 0xED, 0x72, 0x00, 0x00, 0x99, 0xD5, /* 0xDC-0xDF */
+	0x00, 0x00, 0x89, 0x62, 0x91, 0x70, 0x8C, 0xFB, /* 0xE0-0xE3 */
+	0x8C, 0xC3, 0x8B, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x99, 0xD9, 0x92, 0x40, 0x91, 0xFC, 0x8B, 0xA9, /* 0xE8-0xEB */
+	0x8F, 0xA2, 0x99, 0xDA, 0x99, 0xD8, 0x89, 0xC2, /* 0xEC-0xEF */
+	0x91, 0xE4, 0x8E, 0xB6, 0x8E, 0x6A, 0x89, 0x45, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0x90, 0x8D, 0x86, /* 0xF4-0xF7 */
+	0x8E, 0x69, 0x00, 0x00, 0x99, 0xDB, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_54[512] = {
+	0x00, 0x00, 0x99, 0xDC, 0x00, 0x00, 0x8B, 0x68, /* 0x00-0x03 */
+	0x8A, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x8D, 0x87, 0x8B, 0x67, 0x92, 0xDD, 0x89, 0x44, /* 0x08-0x0B */
+	0x93, 0xAF, 0x96, 0xBC, 0x8D, 0x40, 0x97, 0x99, /* 0x0C-0x0F */
+	0x93, 0x66, 0x8C, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x4E, /* 0x18-0x1B */
+	0x00, 0x00, 0x99, 0xE5, 0x00, 0x00, 0x8B, 0xE1, /* 0x1C-0x1F */
+	0x96, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xDB, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x99, 0xE4, 0x00, 0x00, 0x8A, 0xDC, /* 0x28-0x2B */
+	0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE2, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0xE3, 0x00, 0x00, /* 0x34-0x37 */
+	0x8B, 0x7A, 0x90, 0x81, 0x00, 0x00, 0x95, 0xAB, /* 0x38-0x3B */
+	0x99, 0xE1, 0x99, 0xDD, 0x8C, 0xE1, 0x00, 0x00, /* 0x3C-0x3F */
+	0x99, 0xDE, 0x00, 0x00, 0x98, 0x43, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xF0, 0x00, 0x00, /* 0x44-0x47 */
+	0x92, 0xE6, 0x8C, 0xE0, 0x8D, 0x90, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0xE6, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x93, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xEA, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x8E, 0xFC, 0x00, 0x00, 0x8E, 0xF4, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x99, 0xED, 0x99, 0xEB, 0x00, 0x00, 0x96, 0xA1, /* 0x70-0x73 */
+	0x00, 0x00, 0x99, 0xE8, 0x99, 0xF1, 0x99, 0xEC, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xEF, /* 0x78-0x7B */
+	0x8C, 0xC4, 0x96, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x99, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x99, 0xF2, 0x00, 0x00, 0x99, 0xF4, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x75, 0x8D, 0xEE, /* 0x88-0x8B */
+	0x98, 0x61, 0x00, 0x00, 0x99, 0xE9, 0x99, 0xE7, /* 0x8C-0x8F */
+	0x99, 0xF3, 0x00, 0x00, 0x99, 0xEE, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xED, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x99, 0xF6, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x9A, 0x42, 0x99, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x99, 0xFC, 0xED, 0x76, 0x00, 0x00, 0x9A, 0x40, /* 0xA8-0xAB */
+	0x99, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x5D, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xE7, 0x8A, 0x50, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x99, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x9A, 0x44, 0x88, 0xF4, 0x9A, 0x43, 0x00, 0x00, /* 0xBC-0xBF */
+	0x88, 0xA3, 0x95, 0x69, 0x9A, 0x41, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x99, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x99, 0xF5, /* 0xC4-0xC7 */
+	0x99, 0xFB, 0x8D, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x9A, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x88, 0xF5, 0x9A, 0x4E, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x9A, 0x46, 0x9A, 0x47, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x8F, 0xA3, 0x96, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x9A, 0x4C, 0x9A, 0x4B, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0x4E, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x4D, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x9A, 0x4A, 0x00, 0x00, 0xED, 0x77, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_55[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x89, 0x53, 0x00, 0x00, 0x8D, 0xB4, 0x90, 0x4F, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x48, /* 0x0C-0x0F */
+	0x93, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x9A, 0x49, 0x00, 0x00, 0x88, 0xA0, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x53, 0x97, 0x42, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8F, 0xA5, 0x00, 0x00, 0x9A, 0x59, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x9A, 0x58, 0x9A, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0xC1, 0x00, 0x00, /* 0x3C-0x3F */
+	0x9A, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x91, 0xED, 0x9A, 0x55, 0x8F, 0xA4, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x9A, 0x52, 0x00, 0x00, 0x00, 0x00, 0x96, 0xE2, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x5B, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x56, 0x9A, 0x57, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x9A, 0x54, 0x9A, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x51, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x60, /* 0x78-0x7B */
+	0x9A, 0x65, 0x00, 0x00, 0x9A, 0x61, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x9A, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x66, /* 0x80-0x83 */
+	0x91, 0x50, 0x00, 0x00, 0xED, 0x78, 0x9A, 0x68, /* 0x84-0x87 */
+	0x00, 0x00, 0x8D, 0x41, 0x9A, 0x5E, 0x92, 0x9D, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x9A, 0x62, 0x9A, 0x5B, 0x8A, 0xAB, 0x00, 0x00, /* 0x98-0x9B */
+	0x8A, 0xEC, 0x8A, 0x85, 0x9A, 0x63, 0x9A, 0x5F, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x96, /* 0xA4-0xA7 */
+	0x9A, 0x69, 0x9A, 0x67, 0x91, 0x72, 0x8B, 0x69, /* 0xA8-0xAB */
+	0x8B, 0xAA, 0x00, 0x00, 0x9A, 0x64, 0x00, 0x00, /* 0xAC-0xAF */
+	0x8B, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x63, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x9A, 0x6D, 0x9A, 0x6B, 0x00, 0x00, 0x9A, 0xA5, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x9A, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x6A, 0x00, 0x00, /* 0xD8-0xDB */
+	0x9A, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x6C, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x6B, /* 0xE0-0xE3 */
+	0x9A, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x72, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x9A, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x9A, 0x75, 0x9A, 0x74, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_56[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x51, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x89, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x9A, 0x71, 0x00, 0x00, 0x9A, 0x73, 0x8F, 0xA6, /* 0x14-0x17 */
+	0x89, 0x52, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x76, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x89, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x82, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8F, 0xFA, 0x9A, 0x7D, 0x00, 0x00, /* 0x30-0x33 */
+	0x9A, 0x7B, 0x00, 0x00, 0x9A, 0x7C, 0x00, 0x00, /* 0x34-0x37 */
+	0x9A, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x5C, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x91, 0x58, 0x00, 0x00, 0x9A, 0x78, 0x00, 0x00, /* 0x4C-0x4F */
+	0x9A, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x9A, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x9A, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x8A, 0xED, 0x00, 0x00, 0x9A, 0x84, 0x9A, 0x80, /* 0x68-0x6B */
+	0x9A, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x95, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x93, 0xD3, 0x00, 0x00, 0x94, 0xB6, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x9A, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x85, 0x8A, 0x64, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x87, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x8A, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x9A, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x9A, 0x88, 0x00, 0x00, 0x94, 0x58, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x9A, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x8C, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x9A, 0x8E, 0x00, 0x00, 0x9A, 0x8D, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x9A, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x9A, 0x93, 0x9A, 0x91, 0x9A, 0x8F, 0x9A, 0x92, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x9A, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x95, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x9A, 0x96, 0x00, 0x00, 0x9A, 0x97, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x98, /* 0xD4-0xD7 */
+	0x99, 0x64, 0x00, 0x00, 0x8E, 0xFA, 0x8E, 0x6C, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xF1, 0x00, 0x00, /* 0xDC-0xDF */
+	0x88, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x92, 0x63, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0x99, 0x00, 0x00, /* 0xEC-0xEF */
+	0x8D, 0xA2, 0x00, 0x00, 0x88, 0xCD, 0x90, 0x7D, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x9A, 0x9A, 0x8C, 0xC5, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x8D, 0x91, 0x00, 0x00, 0x9A, 0x9C, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_57[512] = {
+	0x9A, 0x9B, 0x00, 0x00, 0x00, 0x00, 0x95, 0xDE, /* 0x00-0x03 */
+	0x9A, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x9A, 0x9F, 0x9A, 0x9E, 0x00, 0x00, 0x9A, 0xA0, /* 0x08-0x0B */
+	0x00, 0x00, 0x9A, 0xA1, 0x00, 0x00, 0x8C, 0x97, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x80, 0x9A, 0xA2, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xA4, 0x00, 0x00, /* 0x14-0x17 */
+	0x9A, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x9A, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x93, 0x79, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xA7, 0x88, 0xB3, /* 0x24-0x27 */
+	0x8D, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x8C, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x92, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA8, /* 0x34-0x37 */
+	0x9A, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xAB, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x9A, 0xAC, 0x00, 0x00, 0x8D, 0xE2, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xCF, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x56, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xAA, 0x9A, 0xAD, /* 0x4C-0x4F */
+	0x8D, 0xBF, 0x8D, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xED, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x9A, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x8D, 0xA3, 0xED, 0x7A, 0x92, 0x52, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x9A, 0xAE, 0x92, 0xD8, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB2, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x90, 0x82, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x9A, 0xB0, 0x9A, 0xB3, 0x00, 0x00, 0x8C, 0x5E, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB4, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x9A, 0xB5, 0x00, 0x00, 0x8D, 0x43, 0x8A, 0x5F, /* 0xA0-0xA3 */
+	0x9A, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */
+	0xED, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x9A, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB6, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x9A, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBA, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xBB, 0xED, 0x7D, /* 0xC4-0xC7 */
+	0xED, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x96, 0x84, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xE9, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xBD, 0x9A, 0xBE, /* 0xD0-0xD3 */
+	0x9A, 0xBC, 0x00, 0x00, 0x9A, 0xC0, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x94, 0x57, 0x00, 0x00, 0x00, 0x00, 0x88, 0xE6, /* 0xDC-0xDF */
+	0x95, 0x75, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xC1, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x8F, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xB7, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x94, 0x7C, 0x8A, 0xEE, 0x00, 0x00, /* 0xF8-0xFB */
+	0x8D, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_58[512] = {
+	0x96, 0x78, 0x00, 0x00, 0x93, 0xB0, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x8C, 0x98, 0x91, 0xCD, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xBF, 0x9A, 0xC2, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x91, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x9A, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x9A, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x9A, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x92, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xAC, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9F, /* 0x2C-0x2F */
+	0x89, 0x81, 0x95, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x8F, 0xEA, 0x93, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xE4, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x9A, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x95, 0xBB, 0x97, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xF2, 0x9A, 0xC8, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x91, 0x59, 0x9A, 0xCB, 0x00, 0x00, /* 0x50-0x53 */
+	0x93, 0x83, 0x00, 0x00, 0x00, 0x00, 0x93, 0x68, /* 0x54-0x57 */
+	0x93, 0x84, 0x94, 0xB7, 0x92, 0xCB, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xC7, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xC7, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x89, 0x96, 0x00, 0x00, 0x93, 0x55, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x9A, 0xC9, 0x00, 0x00, 0x9A, 0xC5, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x90, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x9A, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x6D, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xAB, /* 0x80-0x83 */
+	0x00, 0x00, 0x9A, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xE6, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x9D, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x92, 0xC4, 0x00, 0x00, 0xED, 0x81, 0x9A, 0xD0, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x96, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xD1, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xD6, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x82, 0x95, 0xAD, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x9A, 0xD5, 0x9A, 0xCF, 0x9A, 0xD2, 0x9A, 0xD4, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xA4, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x95, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x9A, 0xD7, 0x00, 0x00, 0x92, 0x64, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xF3, 0x00, 0x00, /* 0xC8-0xCB */
+	0x8F, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x9A, 0xD9, 0x00, 0x00, 0x9A, 0xD8, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x8D, 0x88, 0x00, 0x00, 0x9A, 0xDA, /* 0xD4-0xD7 */
+	0x9A, 0xDC, 0x9A, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x9A, 0xDE, 0x00, 0x00, 0x9A, 0xD3, 0x9A, 0xE0, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x9A, 0xDF, 0x9A, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x6D, /* 0xE8-0xEB */
+	0x90, 0x70, 0x00, 0x00, 0x91, 0x73, 0x9A, 0xE1, /* 0xEC-0xEF */
+	0x90, 0xBA, 0x88, 0xEB, 0x94, 0x84, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xD9, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x9A, 0xE3, 0x9A, 0xE2, 0x9A, 0xE4, /* 0xF8-0xFB */
+	0x9A, 0xE5, 0x9A, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_59[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xE7, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x95, 0xCF, 0x9A, 0xE8, 0xED, 0x83, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC4, /* 0x0C-0x0F */
+	0x9A, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x97, 0x5B, 0x8A, 0x4F, 0x00, 0x00, /* 0x14-0x17 */
+	0x99, 0xC7, 0x8F, 0x67, 0x91, 0xBD, 0x9A, 0xEA, /* 0x18-0x1B */
+	0x96, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xB2, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x9A, 0xEC, 0x00, 0x00, 0x91, 0xE5, /* 0x24-0x27 */
+	0x00, 0x00, 0x93, 0x56, 0x91, 0xBE, 0x95, 0x76, /* 0x28-0x2B */
+	0x9A, 0xED, 0x9A, 0xEE, 0x89, 0x9B, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8E, 0xB8, 0x9A, 0xEF, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xCE, /* 0x34-0x37 */
+	0x9A, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xF1, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x89, 0x82, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xEF, /* 0x44-0x47 */
+	0x93, 0xDE, 0x95, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xF5, 0x91, 0x74, /* 0x4C-0x4F */
+	0x9A, 0xF4, 0x8C, 0x5F, 0x00, 0x00, 0xED, 0x84, /* 0x50-0x53 */
+	0x96, 0x7A, 0x9A, 0xF3, 0x00, 0x00, 0x93, 0x85, /* 0x54-0x57 */
+	0x9A, 0xF7, 0x00, 0x00, 0x9A, 0xF6, 0xED, 0x85, /* 0x58-0x5B */
+	0x00, 0x00, 0xED, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x9A, 0xF9, 0x00, 0x00, 0x9A, 0xF8, 0xED, 0x87, /* 0x60-0x63 */
+	0x00, 0x00, 0x89, 0x9C, 0x00, 0x00, 0x9A, 0xFA, /* 0x64-0x67 */
+	0x8F, 0xA7, 0x9A, 0xFC, 0x92, 0x44, 0x00, 0x00, /* 0x68-0x6B */
+	0x9A, 0xFB, 0x00, 0x00, 0x95, 0xB1, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x97, /* 0x70-0x73 */
+	0x93, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x9B, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x8D, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x9B, 0x41, 0x94, 0x40, 0x94, 0xDC, /* 0x80-0x83 */
+	0x96, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0x44, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x9B, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x57, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x64, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x96, 0xAD, 0x00, 0x00, 0x9B, 0xAA, /* 0x98-0x9B */
+	0x00, 0x00, 0x9B, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x45, /* 0xA0-0xA3 */
+	0xED, 0x88, 0x91, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x96, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x93, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x46, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x96, 0x85, 0xED, 0x89, 0x8D, 0xC8, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xA8, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x47, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x8E, 0x6F, 0x00, 0x00, 0x8E, 0x6E, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x88, 0xB7, 0x8C, 0xC6, 0x00, 0x00, 0x90, 0xA9, /* 0xD0-0xD3 */
+	0x88, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x9B, 0x4B, 0x9B, 0x4C, 0x00, 0x00, /* 0xD8-0xDB */
+	0x9B, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x89, 0x57, 0x8A, 0xAD, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x9B, 0x48, 0x00, 0x00, 0x96, 0xC3, 0x95, 0x50, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x88, 0xA6, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xF7, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x70, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5A[512] = {
+	0x00, 0x00, 0x88, 0xD0, 0x00, 0x00, 0x88, 0xA1, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x9B, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x9B, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x96, 0xBA, 0x00, 0x00, 0x9B, 0x52, 0x00, 0x00, /* 0x18-0x1B */
+	0x9B, 0x50, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x4E, /* 0x1C-0x1F */
+	0x90, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x9B, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x95, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xE2, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x9B, 0x56, 0x9B, 0x57, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x8F, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x9B, 0x53, 0x98, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0x6B, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x9B, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xA5, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x58, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0x77, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x59, 0x00, 0x00, /* 0x68-0x6B */
+	0x9B, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB9, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0x7D, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x5A, 0x95, 0x51, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x9B, 0x5B, 0x9B, 0x5F, 0x9B, 0x5C, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x89, 0xC5, 0x9B, 0x5E, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x8E, 0xB9, 0x00, 0x00, 0x9B, 0x5D, /* 0xC8-0xCB */
+	0x8C, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x9B, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x64, 0x9B, 0x61, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x92, 0x84, 0x00, 0x00, 0x9B, 0x60, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x62, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x9B, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x65, 0x9B, 0x66, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_5B[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x8A, 0xF0, 0x00, 0x00, 0x9B, 0x68, /* 0x08-0x0B */
+	0x9B, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x69, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xEC, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x6C, 0x00, 0x00, /* 0x28-0x2B */
+	0x92, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x89, 0x64, 0x00, 0x00, 0x9B, 0x6A, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x6D, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0x6E, 0x00, 0x00, /* 0x3C-0x3F */
+	0x9B, 0x71, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6F, /* 0x40-0x43 */
+	0x00, 0x00, 0x9B, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x8E, 0x71, 0x9B, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x8D, 0x45, 0x9B, 0x73, 0xED, 0x8A, 0x8E, 0x9A, /* 0x54-0x57 */
+	0x91, 0xB6, 0x00, 0x00, 0x9B, 0x74, 0x9B, 0x75, /* 0x58-0x5B */
+	0x8E, 0x79, 0x8D, 0x46, 0x00, 0x00, 0x96, 0xD0, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x47, /* 0x60-0x63 */
+	0x8C, 0xC7, 0x9B, 0x76, 0x8A, 0x77, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x9B, 0x77, 0x00, 0x00, 0x91, 0xB7, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x9B, 0x78, 0x9B, 0xA1, 0x00, 0x00, 0x9B, 0x79, /* 0x70-0x73 */
+	0x00, 0x00, 0x9B, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x9B, 0x7B, 0x00, 0x00, 0x9B, 0x7D, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x9B, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x80, /* 0x80-0x83 */
+	0x00, 0x00, 0x91, 0xEE, 0x00, 0x00, 0x89, 0x46, /* 0x84-0x87 */
+	0x8E, 0xE7, 0x88, 0xC0, 0x00, 0x00, 0x91, 0x76, /* 0x88-0x8B */
+	0x8A, 0xAE, 0x8E, 0xB3, 0x00, 0x00, 0x8D, 0x47, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x93, 0x86, 0x00, 0x00, 0x8F, 0x40, /* 0x94-0x97 */
+	0x8A, 0xAF, 0x92, 0x88, 0x92, 0xE8, 0x88, 0xB6, /* 0x98-0x9B */
+	0x8B, 0x58, 0x95, 0xF3, 0x00, 0x00, 0x8E, 0xC0, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x71, 0x90, 0xE9, /* 0xA0-0xA3 */
+	0x8E, 0xBA, 0x97, 0x47, 0x9B, 0x81, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x7B, 0x00, 0x00, /* 0xAC-0xAF */
+	0x8D, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x51, /* 0xB0-0xB3 */
+	0x89, 0x83, 0x8F, 0xAA, 0x89, 0xC6, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x9B, 0x82, 0x97, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x68, /* 0xBC-0xBF */
+	0xED, 0x8B, 0x00, 0x00, 0x8E, 0xE2, 0x9B, 0x83, /* 0xC0-0xC3 */
+	0x8A, 0xF1, 0x93, 0xD0, 0x96, 0xA7, 0x9B, 0x84, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x9B, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x95, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x9B, 0x87, 0x00, 0x00, 0x8A, 0xA6, 0x8B, 0xF5, /* 0xD0-0xD3 */
+	0x9B, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xED, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB0, /* 0xD8-0xDB */
+	0x00, 0x00, 0x90, 0x51, 0x9B, 0x8B, 0x8E, 0x40, /* 0xDC-0xDF */
+	0x00, 0x00, 0x89, 0xC7, 0x9B, 0x8A, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x9B, 0x88, 0x9B, 0x8C, 0x9B, 0x89, 0x94, 0x4A, /* 0xE4-0xE7 */
+	0x9E, 0xCB, 0x90, 0x52, 0x00, 0x00, 0x9B, 0x8D, /* 0xE8-0xEB */
+	0xED, 0x8E, 0x00, 0x00, 0x97, 0xBE, 0x00, 0x00, /* 0xEC-0xEF */
+	0x9B, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x90, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x92, 0x9E, 0x9B, 0x8F, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x90, 0xA1, 0x00, 0x00, 0x8E, 0x9B, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0xCE, 0x8E, 0xF5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5C[512] = {
+	0x00, 0x00, 0x95, 0x95, 0x90, 0xEA, 0x00, 0x00, /* 0x00-0x03 */
+	0x8E, 0xCB, 0x9B, 0x91, 0x8F, 0xAB, 0x9B, 0x92, /* 0x04-0x07 */
+	0x9B, 0x93, 0x88, 0xD1, 0x91, 0xB8, 0x90, 0x71, /* 0x08-0x0B */
+	0x00, 0x00, 0x9B, 0x94, 0x93, 0xB1, 0x8F, 0xAC, /* 0x0C-0x0F */
+	0x00, 0x00, 0x8F, 0xAD, 0x00, 0x00, 0x9B, 0x95, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xEB, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xAE, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x8F, 0x00, 0x00, /* 0x1C-0x1F */
+	0x9B, 0x96, 0x00, 0x00, 0x9B, 0x97, 0x00, 0x00, /* 0x20-0x23 */
+	0x96, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x9B, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x8B, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8F, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x9B, 0x99, 0x9B, 0x9A, 0x8E, 0xDA, 0x90, 0x4B, /* 0x38-0x3B */
+	0x93, 0xF2, 0x90, 0x73, 0x94, 0xF6, 0x94, 0x41, /* 0x3C-0x3F */
+	0x8B, 0xC7, 0x9B, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x8B, 0x8F, 0x9B, 0x9C, 0x00, 0x00, /* 0x44-0x47 */
+	0x8B, 0xFC, 0x00, 0x00, 0x93, 0xCD, 0x89, 0xAE, /* 0x48-0x4B */
+	0x00, 0x00, 0x8E, 0x72, 0x9B, 0x9D, 0x9B, 0xA0, /* 0x4C-0x4F */
+	0x9B, 0x9F, 0x8B, 0xFB, 0x00, 0x00, 0x9B, 0x9E, /* 0x50-0x53 */
+	0x00, 0x00, 0x93, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0xAE, 0x00, 0x00, /* 0x5C-0x5F */
+	0x93, 0x6A, 0x8E, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x91, 0x77, 0x97, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x9B, 0xA2, 0x00, 0x00, 0x9B, 0xA3, 0x93, 0xD4, /* 0x6C-0x6F */
+	0x00, 0x00, 0x8E, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xA5, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x9B, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x9B, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x8A, 0xF2, 0x9B, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x9B, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x89, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x90, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x91, 0x5A, 0x8A, 0xE2, 0x00, 0x00, 0x9B, 0xAB, /* 0xA8-0xAB */
+	0x96, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x91, 0xD0, 0x00, 0x00, 0x8A, 0x78, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xAD, 0x9B, 0xAF, /* 0xB4-0xB7 */
+	0x8A, 0xDD, 0x00, 0x00, 0xED, 0x91, 0x9B, 0xAC, /* 0xB8-0xBB */
+	0x9B, 0xAE, 0x00, 0x00, 0x9B, 0xB1, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x9B, 0xB0, 0x00, 0x00, 0x9B, 0xB2, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x9B, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x93, 0xBB, 0x8B, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x89, 0xE3, 0x9B, 0xB4, 0x9B, 0xB9, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x9B, 0xB7, 0x00, 0x00, 0x95, 0xF5, /* 0xEC-0xEF */
+	0x95, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xED, 0x92, 0x93, 0x87, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xB6, 0x8F, 0x73, /* 0xF8-0xFB */
+	0x00, 0x00, 0x9B, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5D[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x92, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xBA, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xE8, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x9B, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x9B, 0xC1, 0x9B, 0xBB, 0x8A, 0x52, 0x9B, 0xBC, /* 0x14-0x17 */
+	0x9B, 0xC5, 0x9B, 0xC4, 0x9B, 0xC3, 0x9B, 0xBF, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xBE, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xC2, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0x93, /* 0x24-0x27 */
+	0x00, 0x00, 0x95, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x96, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xC9, /* 0x48-0x4B */
+	0x9B, 0xC6, 0x00, 0x00, 0x9B, 0xC8, 0x00, 0x00, /* 0x4C-0x4F */
+	0x97, 0x92, 0x00, 0x00, 0x9B, 0xC7, 0xED, 0x94, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x9B, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x90, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x9B, 0xCA, 0xED, 0x97, 0x00, 0x00, 0x8D, 0xB5, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCB, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xCC, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xCF, 0x00, 0x00, /* 0x80-0x83 */
+	0x9B, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCD, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x88, /* 0x88-0x8B */
+	0x9B, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x9B, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x9B, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xD0, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x9B, 0xD2, 0x00, 0x00, 0x9B, 0xD3, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xD6, /* 0xB4-0xB7 */
+	0xED, 0x98, 0xED, 0x99, 0x97, 0xE4, 0x00, 0x00, /* 0xB8-0xBB */
+	0x9B, 0xD7, 0x9B, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x9B, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x8A, 0xDE, 0x9B, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xED, 0x9A, 0x00, 0x00, 0x9B, 0xDB, 0x9B, 0xDA, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xDC, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDD, /* 0xD8-0xDB */
+	0x00, 0x00, 0x90, 0xEC, 0x8F, 0x42, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x8F, 0x84, 0x00, 0x00, 0x91, 0x83, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x8D, 0x48, 0x8D, 0xB6, 0x8D, 0x49, /* 0xE4-0xE7 */
+	0x8B, 0x90, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDE, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xB7, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x8C, 0xC8, 0x9B, 0xDF, 0x96, 0xA4, /* 0xF0-0xF3 */
+	0x94, 0x62, 0x9B, 0xE0, 0x00, 0x00, 0x8D, 0x4A, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xAA, /* 0xF8-0xFB */
+	0x00, 0x00, 0x92, 0x46, 0x8B, 0xD0, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x73, 0x95, 0x7A, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xBF, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE1, /* 0x08-0x0B */
+	0x8A, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x9B, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x9F, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x9B, 0xE3, 0x9B, 0xE2, 0x9B, 0xE5, /* 0x18-0x1B */
+	0x00, 0x00, 0x92, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x90, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x74, /* 0x28-0x2B */
+	0x00, 0x00, 0x90, 0xC8, 0x00, 0x00, 0x91, 0xD1, /* 0x2C-0x2F */
+	0x8B, 0x41, 0x00, 0x00, 0x00, 0x00, 0x92, 0xA0, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x9B, 0xE6, 0x9B, 0xE7, /* 0x34-0x37 */
+	0x8F, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x96, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x9B, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE9, /* 0x40-0x43 */
+	0x9B, 0xE8, 0x95, 0x9D, 0x00, 0x00, 0x9B, 0xF1, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x96, 0x79, 0x00, 0x00, 0x9B, 0xEB, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x9B, 0xED, 0x96, 0x8B, 0x00, 0x00, 0x9B, 0xEC, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xEE, /* 0x5C-0x5F */
+	0x00, 0x00, 0x94, 0xA6, 0x9B, 0xEF, 0x95, 0xBC, /* 0x60-0x63 */
+	0x9B, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xB1, 0x95, 0xBD, /* 0x70-0x73 */
+	0x94, 0x4E, 0x9B, 0xF2, 0x9B, 0xF3, 0x00, 0x00, /* 0x74-0x77 */
+	0x8D, 0x4B, 0x8A, 0xB2, 0x9B, 0xF4, 0x8C, 0xB6, /* 0x78-0x7B */
+	0x97, 0x63, 0x97, 0x48, 0x8A, 0xF4, 0x9B, 0xF6, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x92, 0xA1, 0x00, 0x00, 0x8D, 0x4C, /* 0x80-0x83 */
+	0x8F, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x94, 0xDD, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xB0, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x98, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x92, 0xEA, 0x95, 0xF7, 0x93, 0x58, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0x4D, 0x00, 0x00, /* 0x98-0x9B */
+	0x95, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x9B, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0x78, 0x8D, 0xC0, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xC9, /* 0xA8-0xAB */
+	0x00, 0x00, 0x92, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x88, 0xC1, 0x8F, 0x8E, 0x8D, 0x4E, /* 0xB4-0xB7 */
+	0x97, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x9B, 0xF8, 0x9B, 0xF9, 0x94, 0x70, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x9B, 0xFA, 0x97, 0xF5, 0x98, 0x4C, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xFC, /* 0xCC-0xCF */
+	0x9B, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x66, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x40, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x43, 0x9C, 0x44, /* 0xD8-0xDB */
+	0x00, 0x00, 0x9C, 0x42, 0x00, 0x00, 0x95, 0x5F, /* 0xDC-0xDF */
+	0x8F, 0xB1, 0x9C, 0x46, 0x9C, 0x45, 0x9C, 0x41, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x9C, 0x47, 0x9C, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x9C, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x9C, 0x4C, 0x9C, 0x4A, 0x00, 0x00, 0x9C, 0x4B, /* 0xF0-0xF3 */
+	0x9C, 0x4D, 0x00, 0x00, 0x89, 0x84, 0x92, 0xEC, /* 0xF4-0xF7 */
+	0x9C, 0x4E, 0x00, 0x00, 0x8C, 0x9A, 0x89, 0xF4, /* 0xF8-0xFB */
+	0x94, 0x55, 0x00, 0x00, 0x9C, 0x4F, 0x93, 0xF9, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5F[512] = {
+	0x00, 0x00, 0x95, 0xD9, 0x00, 0x00, 0x9C, 0x50, /* 0x00-0x03 */
+	0x98, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x9C, 0x51, 0x95, 0xBE, 0x9C, 0x54, /* 0x08-0x0B */
+	0x98, 0x9F, 0x98, 0xAF, 0x00, 0x00, 0x8E, 0xAE, /* 0x0C-0x0F */
+	0x93, 0xF3, 0x9C, 0x55, 0x00, 0x00, 0x8B, 0x7C, /* 0x10-0x13 */
+	0x92, 0xA2, 0x88, 0xF8, 0x9C, 0x56, 0x95, 0xA4, /* 0x14-0x17 */
+	0x8D, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6F, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xED, /* 0x1C-0x1F */
+	0x00, 0x00, 0xED, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x96, 0xED, 0x8C, 0xB7, 0x8C, 0xCA, /* 0x24-0x27 */
+	0x00, 0x00, 0x9C, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x9C, 0x58, 0x00, 0x00, 0x9C, 0x5E, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8E, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xED, 0x9C, 0x92, 0xA3, 0x00, 0x00, 0x8B, 0xAD, /* 0x34-0x37 */
+	0x9C, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x95, 0x4A, 0x00, 0x00, 0x92, 0x65, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x9C, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xED, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x9C, 0x5B, 0x00, 0x00, 0x8B, 0xAE, 0x00, 0x00, /* 0x48-0x4B */
+	0x9C, 0x5C, 0x00, 0x00, 0x9C, 0x5D, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x9C, 0x5F, 0x00, 0x00, 0x93, 0x96, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x60, 0x9C, 0x61, /* 0x54-0x57 */
+	0x00, 0x00, 0x9C, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x9C, 0x53, 0x9C, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x9C, 0x63, 0x8C, 0x60, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0x46, 0xED, 0x9D, /* 0x64-0x67 */
+	0x00, 0x00, 0x8D, 0xCA, 0x95, 0x56, 0x92, 0xA4, /* 0x68-0x6B */
+	0x95, 0x6A, 0x9C, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x8F, 0xB2, 0x89, 0x65, 0x00, 0x00, 0x9C, 0x65, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x66, /* 0x74-0x77 */
+	0x00, 0x00, 0x96, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x94, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x69, /* 0x7C-0x7F */
+	
+	0x89, 0x9D, 0x90, 0xAA, 0x9C, 0x68, 0x9C, 0x67, /* 0x80-0x83 */
+	0x8C, 0x61, 0x91, 0xD2, 0x00, 0x00, 0x9C, 0x6D, /* 0x84-0x87 */
+	0x9C, 0x6B, 0x00, 0x00, 0x9C, 0x6A, 0x97, 0xA5, /* 0x88-0x8B */
+	0x8C, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x8F, 0x99, 0x9C, 0x6C, 0x93, 0x6B, 0x8F, 0x5D, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBE, /* 0x94-0x97 */
+	0x9C, 0x70, 0x9C, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x6E, 0x00, 0x00, /* 0x9C-0x9F */
+	0x9C, 0x71, 0x8C, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x9C, 0x72, 0x95, 0x9C, 0x8F, 0x7A, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x9C, 0x73, 0x94, 0xF7, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBF, /* 0xB0-0xB3 */
+	0x92, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xED, 0x9E, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x93, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x9C, 0x74, 0x8B, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x53, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x95, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x8A, 0xF5, 0x94, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x75, 0x8E, 0x75, /* 0xD4-0xD7 */
+	0x96, 0x59, 0x96, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x89, 0x9E, 0x9C, 0x7A, 0xED, 0x9F, 0x00, 0x00, /* 0xDC-0xDF */
+	0x92, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x9C, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xF5, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x9C, 0xAB, 0x9C, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x94, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x9C, 0x78, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x76, /* 0xF8-0xFB */
+	0x00, 0x00, 0x8D, 0x9A, 0x00, 0x00, 0x9C, 0x7C, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_60[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x83, 0x9C, 0x89, /* 0x0C-0x0F */
+	0x9C, 0x81, 0x00, 0x00, 0x93, 0x7B, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x9C, 0x86, 0x95, 0x7C, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x9C, 0x80, 0x00, 0x00, 0x9C, 0x85, /* 0x18-0x1B */
+	0x97, 0xE5, 0x8E, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x91, 0xD3, 0x9C, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x8B, 0x7D, 0x9C, 0x88, 0x90, 0xAB, /* 0x24-0x27 */
+	0x89, 0x85, 0x9C, 0x82, 0x89, 0xF6, 0x9C, 0x87, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xAF, /* 0x2C-0x2F */
+	0x00, 0x00, 0x9C, 0x84, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x8A, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x9C, 0x8C, 0x9C, 0x96, 0x9C, 0x94, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x91, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x90, 0x97, 0xF6, /* 0x48-0x4B */
+	0x00, 0x00, 0x9C, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x8B, 0xB0, 0x00, 0x00, 0x8D, 0x50, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x8F, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x9C, 0x99, 0x9C, 0x8B, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xED, 0xA0, 0x00, 0x00, 0x9C, 0x8F, /* 0x5C-0x5F */
+	0x9C, 0x7E, 0x00, 0x00, 0x89, 0xF8, 0x9C, 0x93, /* 0x60-0x63 */
+	0x9C, 0x95, 0x92, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x8D, 0xA6, 0x89, 0xB6, 0x9C, 0x8D, 0x9C, 0x98, /* 0x68-0x6B */
+	0x9C, 0x97, 0x8B, 0xB1, 0x00, 0x00, 0x91, 0xA7, /* 0x6C-0x6F */
+	0x8A, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x8C, 0x62, 0x00, 0x00, 0x9C, 0x8E, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x9C, 0x9A, 0x00, 0x00, 0x9C, 0x9D, /* 0x80-0x83 */
+	0x9C, 0x9F, 0xED, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x8E, 0xBB, 0xED, 0xA2, 0x9C, 0xA5, /* 0x88-0x8B */
+	0x92, 0xEE, 0x9C, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0xA3, 0x00, 0x00, /* 0x90-0x93 */
+	0x89, 0xF7, 0x00, 0x00, 0x9C, 0xA1, 0x9C, 0xA2, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0x9E, 0x9C, 0xA0, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xE5, /* 0x9C-0x9F */
+	0x97, 0x49, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB3, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x78, 0x9C, 0xA4, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x94, 0x59, 0x88, 0xAB, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xDF, 0x9C, 0x7B, /* 0xB0-0xB3 */
+	0x9C, 0xAA, 0x9C, 0xAE, 0x96, 0xE3, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x9C, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x93, 0x89, 0x9C, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x8F, 0xEE, 0x9C, 0xAD, 0x93, 0xD5, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x98, 0x66, 0x00, 0x00, 0x9C, 0xA9, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xED, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x9C, 0xAF, 0x00, 0x00, 0x8D, 0x9B, 0x00, 0x00, /* 0xD8-0xDB */
+	0x90, 0xC9, 0x00, 0x00, 0xED, 0xA3, 0x88, 0xD2, /* 0xDC-0xDF */
+	0x9C, 0xA8, 0x9C, 0xA6, 0x00, 0x00, 0x91, 0x79, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x9C, /* 0xE4-0xE7 */
+	0x8E, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x91, 0xC4, 0x9C, 0xBB, 0xED, 0xA6, 0x91, 0x7A, /* 0xF0-0xF3 */
+	0x9C, 0xB6, 0x00, 0x00, 0x9C, 0xB3, 0x9C, 0xB4, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x8E, 0xE4, 0x9C, 0xB7, 0x9C, 0xBA, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_61[512] = {
+	0x9C, 0xB5, 0x8F, 0x44, 0x00, 0x00, 0x9C, 0xB8, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0xB2, 0x00, 0x00, /* 0x04-0x07 */
+	0x96, 0xFA, 0x96, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x9C, 0xBC, 0x9C, 0xBD, 0x88, 0xD3, /* 0x0C-0x0F */
+	0x00, 0x00, 0xED, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x9C, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0xF0, 0x88, 0xA4, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB4, /* 0x1C-0x1F */
+	0xED, 0xA5, 0x9C, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xC1, /* 0x24-0x27 */
+	0x9C, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x9C, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xED, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x9C, 0xC6, 0x00, 0x00, 0x00, 0x00, 0xED, 0xA8, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x9C, 0xC4, 0x9C, 0xC7, 0x9C, 0xBF, 0x9C, 0xC3, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0xC8, 0x00, 0x00, /* 0x40-0x43 */
+	0x9C, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xBE, /* 0x44-0x47 */
+	0x8E, 0x9C, 0x00, 0x00, 0x9C, 0xC2, 0x91, 0xD4, /* 0x48-0x4B */
+	0x8D, 0x51, 0x9C, 0xB0, 0x90, 0x54, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xD6, /* 0x50-0x53 */
+	0x00, 0x00, 0x95, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x9C, 0xD5, 0x00, 0x00, 0x9C, 0xD4, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x9D, 0x8A, 0xB5, /* 0x60-0x63 */
+	0x00, 0x00, 0x9C, 0xD2, 0x00, 0x00, 0x8C, 0x64, /* 0x64-0x67 */
+	0x8A, 0x53, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xCF, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xB6, 0x9C, 0xD1, /* 0x6C-0x6F */
+	0x88, 0xD4, 0x9C, 0xD3, 0x00, 0x00, 0x9C, 0xCA, /* 0x70-0x73 */
+	0x9C, 0xD0, 0x9C, 0xD7, 0x8C, 0x63, 0x9C, 0xCB, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x7C, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x4A, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xDA, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0xDE, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0x9E, 0x00, 0x00, /* 0x8C-0x8F */
+	0x97, 0xF7, 0x9C, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x9C, 0xDC, 0x00, 0x00, 0x9C, 0xD9, 0x00, 0x00, /* 0x94-0x97 */
+	0xED, 0xAA, 0x9C, 0xD8, 0x9C, 0xDD, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x95, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x93, 0xB2, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x8C, 0x65, 0x00, 0x00, 0x9C, 0xE0, /* 0xA8-0xAB */
+	0x9C, 0xDB, 0x00, 0x00, 0x9C, 0xE1, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x9B, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xAF, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0xE9, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE7, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0xE8, 0x8D, 0xA7, /* 0xC4-0xC7 */
+	0x9C, 0xE6, 0x9C, 0xE4, 0x9C, 0xE3, 0x9C, 0xEA, /* 0xC8-0xCB */
+	0x9C, 0xE2, 0x9C, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x89, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xEE, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x9C, 0xED, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x9C, 0xF1, 0x00, 0x00, 0x9C, 0xEF, 0x9C, 0xE5, /* 0xF4-0xF7 */
+	0x8C, 0x9C, 0x00, 0x00, 0x9C, 0xF0, 0x00, 0x00, /* 0xF8-0xFB */
+	0x9C, 0xF4, 0x9C, 0xF3, 0x9C, 0xF5, 0x9C, 0xF2, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_62[512] = {
+	0x9C, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x9C, 0xF7, 0x9C, 0xF8, 0x95, 0xE8, 0x00, 0x00, /* 0x08-0x0B */
+	0x9C, 0xFA, 0x9C, 0xF9, 0x8F, 0x5E, 0x00, 0x00, /* 0x0C-0x0F */
+	0x90, 0xAC, 0x89, 0xE4, 0x89, 0xFA, 0xED, 0xAB, /* 0x10-0x13 */
+	0x9C, 0xFB, 0x00, 0x00, 0x88, 0xBD, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xCA, 0x9C, 0xFC, /* 0x18-0x1B */
+	0x00, 0x00, 0xE6, 0xC1, 0x9D, 0x40, 0x8C, 0x81, /* 0x1C-0x1F */
+	0x00, 0x00, 0x9D, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xED, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x42, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x43, 0x8B, 0x59, /* 0x2C-0x2F */
+	0x9D, 0x44, 0x00, 0x00, 0x9D, 0x45, 0x9D, 0x46, /* 0x30-0x33 */
+	0x91, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x8C, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x96, 0xDF, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x5B, /* 0x3C-0x3F */
+	0x8F, 0x8A, 0x9D, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xEE, /* 0x44-0x47 */
+	0xE7, 0xBB, 0x94, 0xE0, 0x00, 0x00, 0x8E, 0xE8, /* 0x48-0x4B */
+	0x00, 0x00, 0x8D, 0xCB, 0x9D, 0x48, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xC5, /* 0x50-0x53 */
+	0x00, 0x00, 0x95, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x91, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x4B, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x49, 0x00, 0x00, /* 0x5C-0x5F */
+	0x9D, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x4A, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x9D, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xAF, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x88, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0x7D, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x94, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x9D, 0x4E, 0x00, 0x00, 0x9D, 0x51, 0x8F, 0xB3, /* 0x7C-0x7F */
+	
+	0x8B, 0x5A, 0x00, 0x00, 0x9D, 0x4F, 0x9D, 0x56, /* 0x80-0x83 */
+	0x8F, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x9D, 0x50, 0x94, 0x63, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x97, 0x7D, 0x9D, 0x52, 0x9D, 0x53, /* 0x90-0x93 */
+	0x9D, 0x57, 0x93, 0x8A, 0x9D, 0x54, 0x8D, 0x52, /* 0x94-0x97 */
+	0x90, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x65, /* 0x98-0x9B */
+	0x94, 0xB2, 0x00, 0x00, 0x91, 0xF0, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xAC, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xE2, /* 0xA8-0xAB */
+	0x9D, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x95, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x92, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x96, 0x95, 0x00, 0x00, 0x9D, 0x5A, /* 0xB8-0xBB */
+	0x89, 0x9F, 0x92, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x63, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x92, 0x53, 0x9D, 0x5D, 0x9D, 0x64, /* 0xC4-0xC7 */
+	0x9D, 0x5F, 0x9D, 0x66, 0x9D, 0x62, 0x00, 0x00, /* 0xC8-0xCB */
+	0x9D, 0x61, 0x94, 0x8F, 0x00, 0x00, 0x9D, 0x5B, /* 0xCC-0xCF */
+	0x89, 0xFB, 0x9D, 0x59, 0x8B, 0x91, 0x91, 0xF1, /* 0xD0-0xD3 */
+	0x9D, 0x55, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x58, /* 0xD4-0xD7 */
+	0x8D, 0x53, 0x90, 0xD9, 0x00, 0x00, 0x8F, 0xB5, /* 0xD8-0xDB */
+	0x9D, 0x60, 0x94, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x8B, 0x92, 0x8A, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x8A, 0x87, 0x90, 0x40, 0x9D, 0x68, 0x9D, 0x6D, /* 0xEC-0xEF */
+	0x00, 0x00, 0x9D, 0x69, 0x00, 0x00, 0x8C, 0x9D, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x9D, 0x6E, 0x8E, 0x41, 0x8D, 0x89, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x45, 0x9D, 0x5C, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_63[512] = {
+	0x00, 0x00, 0x8E, 0x9D, 0x9D, 0x6B, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x77, /* 0x04-0x07 */
+	0x9D, 0x6C, 0x88, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x9D, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x92, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x8B, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB2, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x6A, /* 0x24-0x27 */
+	0x88, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xC1, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x55, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0xF0, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x94, 0xD2, 0x9D, 0x70, 0x91, 0x7D, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x91, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x8E, 0x4A, 0x9D, 0x71, 0x00, 0x00, 0x9D, 0x73, /* 0x4C-0x4F */
+	0x9D, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x95, 0xDF, 0x00, 0x00, 0x92, 0xBB, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x91, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xF9, /* 0x64-0x67 */
+	0x8E, 0xCC, 0x9D, 0x80, 0x00, 0x00, 0x9D, 0x7E, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0x98, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x9E, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x78, 0x8F, 0xB7, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0xE6, 0x94, 0x50, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x9D, 0x76, 0x00, 0x00, 0x00, 0x00, 0x91, 0x7C, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x8E, 0xF6, 0x9D, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x8F, 0xB6, 0x00, 0x00, 0x9D, 0x75, 0x9D, 0x7A, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0x72, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x74, 0x00, 0x00, /* 0x94-0x97 */
+	0x8C, 0x40, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x7C, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x7C, /* 0x9C-0x9F */
+	0x97, 0xA9, 0x8D, 0xCC, 0x92, 0x54, 0x9D, 0x79, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x90, 0xDA, 0x00, 0x00, 0x8D, 0x54, /* 0xA4-0xA7 */
+	0x90, 0x84, 0x89, 0x86, 0x91, 0x5B, 0x9D, 0x77, /* 0xA8-0xAB */
+	0x8B, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x66, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x92, 0xCD, 0x9D, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x7E, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x81, 0x00, 0x00, /* 0xBC-0xBF */
+	0x9D, 0x83, 0x00, 0x00, 0x00, 0x00, 0x91, 0xB5, /* 0xC0-0xC3 */
+	0x9D, 0x89, 0x00, 0x00, 0x9D, 0x84, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x9D, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x60, /* 0xCC-0xCF */
+	0x92, 0xF1, 0x00, 0x00, 0x9D, 0x87, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x4B, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x67, 0x8A, 0xB7, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x88, 0xAC, 0x00, 0x00, 0x9D, 0x85, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x9D, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xF6, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x89, 0x87, 0xED, 0xAD, 0x9D, 0x88, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x68, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_64[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x8C, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x91, 0xB9, 0x00, 0x00, 0x9D, 0x93, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8D, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x8A, 0x9D, 0x91, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x9D, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x8E, 0x00, 0x00, /* 0x24-0x27 */
+	0x9D, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x94, 0xC0, 0x93, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x9D, 0x8B, 0x00, 0x00, 0x9D, 0x8F, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x67, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xEF, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xDB, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x97, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x93, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xED, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x94, /* 0x64-0x67 */
+	0x00, 0x00, 0x96, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x95, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x96, 0x00, 0x00, /* 0x74-0x77 */
+	0x96, 0xCC, 0x00, 0x00, 0x90, 0xA0, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x82, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x9D, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x54, 0x9D, 0x9A, /* 0x90-0x93 */
+	0x00, 0x00, 0x9D, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0x51, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xED, 0xAF, 0x93, 0xB3, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x93, 0x50, 0x9D, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x9D, 0x9C, 0x00, 0x00, 0x95, 0x8F, /* 0xA8-0xAB */
+	0x00, 0x00, 0x94, 0x64, 0x8E, 0x42, 0x00, 0x00, /* 0xAC-0xAF */
+	0x90, 0xEF, 0x00, 0x00, 0x96, 0x6F, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x8A, 0x68, 0x00, 0x00, 0x9D, 0xA3, /* 0xB8-0xBB */
+	0x9D, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x97, 0x69, 0x9D, 0xA5, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x9D, 0xA1, 0x00, 0x00, 0x9D, 0xA2, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x91, 0x80, 0xED, 0xB0, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0xA0, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x9D, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x9D, 0xA4, 0x00, 0x00, 0x9D, 0x9F, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x9D, 0xA9, 0x9D, 0xAA, 0x93, 0x46, 0x9D, 0xAC, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x43, 0x9D, 0xA7, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x8B, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xAD, /* 0xEC-0xEF */
+	0x00, 0x00, 0x9D, 0xA6, 0x9D, 0xB1, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x9D, 0xB0, 0x00, 0x00, 0x9D, 0xAF, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x9D, 0xB4, 0x8F, 0xEF, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_65[512] = {
+	0x9D, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x9D, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x9D, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x9D, 0xB6, 0x9D, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xB9, /* 0x20-0x23 */
+	0x9D, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0x98, 0x9D, 0xBA, /* 0x28-0x2B */
+	0x9D, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x78, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBE, 0x9D, 0xBD, /* 0x34-0x37 */
+	0x9D, 0xBF, 0x89, 0xFC, 0x00, 0x00, 0x8D, 0x55, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xFA, 0x90, 0xAD, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x8C, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x9D, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x9D, 0xC4, 0xED, 0xB1, 0x95, 0x71, /* 0x4C-0x4F */
+	0x00, 0x00, 0x8B, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x9D, 0xC3, 0x9D, 0xC2, 0x94, 0x73, /* 0x54-0x57 */
+	0x9D, 0xC5, 0x8B, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x9D, 0xC7, 0x9D, 0xC6, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xB8, 0x8E, 0x55, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0xD6, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x8C, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x90, 0x94, 0x00, 0x00, 0x9D, 0xC8, 0x00, 0x00, /* 0x70-0x73 */
+	0x90, 0xAE, 0x93, 0x47, 0x00, 0x00, 0x95, 0x7E, /* 0x74-0x77 */
+	0x9D, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0xCA, 0x9D, 0xCB, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xB6, /* 0x84-0x87 */
+	0x9B, 0x7C, 0x90, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x95, 0x6B, 0x00, 0x00, 0x8D, 0xD6, 0x00, 0x00, /* 0x8C-0x8F */
+	0x94, 0xE3, 0x94, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x6C, /* 0x94-0x97 */
+	0x00, 0x00, 0x97, 0xBF, 0x00, 0x00, 0x9D, 0xCD, /* 0x98-0x9B */
+	0x8E, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCE, /* 0x9C-0x9F */
+	0x00, 0x00, 0x88, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x8B, 0xD2, 0x90, 0xCB, 0x00, 0x00, 0x95, 0x80, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCF, /* 0xA8-0xAB */
+	0x8E, 0x61, 0x92, 0x66, 0x00, 0x00, 0x8E, 0x7A, /* 0xAC-0xAF */
+	0x90, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xD0, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x95, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x89, 0x97, 0x8E, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x9D, 0xD3, 0x00, 0x00, 0x9D, 0xD1, /* 0xC0-0xC3 */
+	0x9D, 0xD4, 0x97, 0xB7, 0x9D, 0xD2, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF9, /* 0xC8-0xCB */
+	0x9D, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x91, 0xB0, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0xD6, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xF8, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x9D, 0xD8, 0x00, 0x00, 0x9D, 0xD7, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x9D, 0xD9, 0x9D, 0xDA, 0x8A, 0xF9, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x93, 0xFA, 0x92, 0x55, 0x8B, 0x8C, /* 0xE4-0xE7 */
+	0x8E, 0x7C, 0x91, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x8F, 0x7B, 0x88, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x9D, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xA0, 0x9D, 0xDF, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_66[512] = {
+	0xED, 0xB2, 0x00, 0x00, 0x8D, 0x56, 0x9D, 0xDE, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xA9, 0x8F, 0xB8, /* 0x04-0x07 */
+	0x00, 0x00, 0xED, 0xB5, 0x9D, 0xDD, 0x00, 0x00, /* 0x08-0x0B */
+	0x8F, 0xB9, 0x00, 0x00, 0x96, 0xBE, 0x8D, 0xA8, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xD5, /* 0x10-0x13 */
+	0x90, 0xCC, 0xED, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x9D, 0xE4, 0x00, 0x00, 0xED, 0xB7, 0x90, 0xAF, /* 0x1C-0x1F */
+	0x89, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xED, 0xB8, 0x8F, 0x74, 0x00, 0x00, 0x96, 0x86, /* 0x24-0x27 */
+	0x8D, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x8F, 0xBA, 0xED, 0xB6, 0x90, 0xA5, /* 0x2C-0x2F */
+	0x00, 0x00, 0xED, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x9D, 0xE3, 0x9D, 0xE1, 0x9D, 0xE2, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB4, /* 0x38-0x3B */
+	0x92, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x45, /* 0x3C-0x3F */
+	0x00, 0x00, 0x9D, 0xE8, 0x8E, 0x9E, 0x8D, 0x57, /* 0x40-0x43 */
+	0x9D, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x9D, 0xE7, 0x00, 0x00, 0x90, 0x57, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xE5, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x4E, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBA, /* 0x54-0x57 */
+	0x00, 0x00, 0xED, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x9D, 0xEA, 0x9D, 0xE9, 0x9D, 0xEE, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0xEF, 0x00, 0x00, /* 0x60-0x63 */
+	0x9D, 0xEB, 0xED, 0xB9, 0x8A, 0x41, 0x9D, 0xEC, /* 0x64-0x67 */
+	0x9D, 0xED, 0x94, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0x81, 0x8C, 0x69, /* 0x6C-0x6F */
+	0x9D, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBD, /* 0x70-0x73 */
+	0x90, 0xB0, 0x00, 0x00, 0x8F, 0xBB, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x71, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x8B, 0xC5, 0x00, 0x00, 0x9D, 0xF1, /* 0x80-0x83 */
+	0x9D, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC9, /* 0x84-0x87 */
+	0x9D, 0xF2, 0x9D, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0xF3, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x8F, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x67, 0x88, 0xC3, /* 0x94-0x97 */
+	0x9D, 0xF6, 0xED, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x9D, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xED, 0xBF, 0x00, 0x00, 0x92, 0xA8, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xEF, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x62, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xE9, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xC0, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x96, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x9E, 0x41, 0x9D, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x9D, 0xFC, 0x00, 0x00, 0x9D, 0xFB, 0xED, 0xC1, /* 0xBC-0xBF */
+	0x00, 0x00, 0x9D, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x9E, 0x40, 0x00, 0x00, 0x00, 0x00, 0x93, 0xDC, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x9D, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x42, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x8F, 0x8C, 0x9E, 0x43, 0x00, 0x00, /* 0xD8-0xDB */
+	0x97, 0x6A, 0x94, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x9E, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x46, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x9E, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x9E, 0x48, 0x00, 0x00, 0x8B, 0xC8, 0x89, 0x67, /* 0xF0-0xF3 */
+	0x8D, 0x58, 0x9E, 0x49, 0x00, 0x00, 0x9E, 0x4A, /* 0xF4-0xF7 */
+	0x8F, 0x91, 0x91, 0x82, 0xED, 0xC2, 0xED, 0x4A, /* 0xF8-0xFB */
+	0x99, 0xD6, 0x91, 0x5D, 0x91, 0x5C, 0x91, 0xD6, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_67[512] = {
+	0x8D, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xF0, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x8C, 0x8E, 0x97, 0x4C, 0x00, 0x00, 0x95, 0xFC, /* 0x08-0x0B */
+	0x00, 0x00, 0x95, 0x9E, 0xED, 0xC3, 0x9E, 0x4B, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x8D, 0xF1, 0x92, 0xBD, 0x9E, 0x4C, 0x98, 0x4E, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x5D, /* 0x18-0x1B */
+	0x00, 0x00, 0x92, 0xA9, 0x9E, 0x4D, 0x8A, 0xFA, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x4E, 0x9E, 0x4F, /* 0x24-0x27 */
+	0x96, 0xD8, 0x00, 0x00, 0x96, 0xA2, 0x96, 0x96, /* 0x28-0x2B */
+	0x96, 0x7B, 0x8E, 0x44, 0x9E, 0x51, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8E, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x96, 0x70, 0x00, 0x00, 0x9E, 0x53, 0x9E, 0x56, /* 0x34-0x37 */
+	0x9E, 0x55, 0x00, 0x00, 0x8A, 0xF7, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x8B, 0x80, 0x00, 0x00, 0x9E, 0x52, /* 0x3C-0x3F */
+	0x00, 0x00, 0x9E, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x57, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x90, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x9B, 0x88, 0xC7, /* 0x4C-0x4F */
+	0x8D, 0xDE, 0x91, 0xBA, 0x00, 0x00, 0x8E, 0xDB, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xF1, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x9E, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x93, 0x6D, 0x00, 0x00, 0x9E, 0x58, 0x91, 0xA9, /* 0x5C-0x5F */
+	0x9E, 0x59, 0x8F, 0xF0, 0x96, 0xDB, 0x9E, 0x5B, /* 0x60-0x63 */
+	0x9E, 0x5C, 0x97, 0x88, 0xED, 0xC5, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x61, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x8D, 0x59, 0x00, 0x00, 0x94, 0x74, /* 0x6C-0x6F */
+	0x9E, 0x5E, 0x93, 0x8C, 0x9D, 0xDC, 0x9D, 0xE0, /* 0x70-0x73 */
+	0x00, 0x00, 0x8B, 0x6E, 0x00, 0x00, 0x94, 0x66, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x9E, 0x60, 0x00, 0x00, 0x8F, 0xBC, 0x94, 0xC2, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x9E, 0x66, 0x00, 0x00, 0x94, 0xF8, /* 0x84-0x87 */
+	0x00, 0x00, 0x9E, 0x5D, 0x00, 0x00, 0x9E, 0x63, /* 0x88-0x8B */
+	0x9E, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x90, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x96, 0x8D, 0x00, 0x00, 0x97, 0xD1, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x87, 0x00, 0x00, /* 0x98-0x9B */
+	0x89, 0xCA, 0x8E, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x98, 0x67, 0x9E, 0x65, 0x90, 0x95, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x64, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x9E, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xCD, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x6B, /* 0xB0-0xB3 */
+	0x9E, 0x69, 0x00, 0x00, 0x89, 0xCB, 0x9E, 0x67, /* 0xB4-0xB7 */
+	0x9E, 0x6D, 0x9E, 0x73, 0x00, 0x00, 0xED, 0xC6, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xED, 0xC8, 0x91, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x95, 0xBF, 0x00, 0x00, 0x9E, 0x75, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0x41, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x74, 0x94, 0x90, /* 0xCC-0xCF */
+	0x96, 0x5E, 0x8A, 0xB9, 0x00, 0x00, 0x90, 0xF5, /* 0xD0-0xD3 */
+	0x8F, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x92, 0xD1, 0x00, 0x00, 0x97, 0x4D, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x9E, 0x70, 0x9E, 0x6F, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x71, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x9E, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x76, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x9E, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x9E, 0x6A, 0x00, 0x00, 0x9E, 0x72, 0x9E, 0x68, /* 0xEC-0xEF */
+	0x00, 0x00, 0x92, 0x8C, 0x00, 0x00, 0x96, 0xF6, /* 0xF0-0xF3 */
+	0x8E, 0xC4, 0x8D, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xB8, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x8F, 0x8A, 0x60, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_68[512] = {
+	0x00, 0x00, 0xED, 0xC9, 0x92, 0xCC, 0x93, 0xC8, /* 0x00-0x03 */
+	0x89, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF0, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xB2, 0x8C, 0x49, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x78, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x8D, 0x5A, 0x8A, 0x9C, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x9E, 0x7A, 0x8A, 0x94, 0x9E, 0x81, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x7D, 0x00, 0x00, /* 0x30-0x33 */
+	0x90, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x8A, 0x6A, 0x8D, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x8A, 0x69, 0x8D, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x9E, 0x7B, 0x8C, 0x85, 0x8C, 0x6A, 0x93, 0x8D, /* 0x40-0x43 */
+	0xED, 0xCA, 0x00, 0x00, 0x9E, 0x79, 0x00, 0x00, /* 0x44-0x47 */
+	0x88, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x9E, 0x7C, 0x9E, 0x7E, 0x00, 0x00, /* 0x4C-0x4F */
+	0x8B, 0xCB, 0x8C, 0x4B, 0xED, 0xC7, 0x8A, 0xBA, /* 0x50-0x53 */
+	0x8B, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x9E, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x8D, 0xF7, 0x96, 0x91, 0x00, 0x00, 0x8E, 0x56, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x83, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x4F, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x9E, 0x8F, 0x00, 0x00, 0x89, 0xB1, 0x9E, 0x84, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0x95, 0x9E, 0x85, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x97, 0xC0, 0x00, 0x00, 0x9E, 0x8C, /* 0x80-0x83 */
+	0x00, 0x00, 0x94, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x9E, 0x94, 0x00, 0x00, 0x9E, 0x87, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xB2, /* 0x90-0x93 */
+	0x9E, 0x89, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x5B, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x8B, /* 0x98-0x9B */
+	0x00, 0x00, 0x9E, 0x8A, 0x00, 0x00, 0x9E, 0x86, /* 0x9C-0x9F */
+	0x9E, 0x91, 0x00, 0x00, 0x8F, 0xBD, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x9A, 0xEB, 0x8C, 0xE6, /* 0xA4-0xA7 */
+	0x97, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x9E, 0x88, 0x00, 0x00, 0x92, 0xF2, /* 0xAC-0xAF */
+	0x8A, 0x42, 0x8D, 0xAB, 0x00, 0x00, 0x9E, 0x80, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x9E, 0x90, 0x8A, 0x81, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x9E, 0x8E, 0x9E, 0x92, 0x00, 0x00, /* 0xB8-0xBB */
+	0x93, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x8A, 0xFC, 0x00, 0x00, 0x9E, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xED, 0x48, 0x96, 0xC7, 0x9E, 0x97, 0x8A, 0xFB, /* 0xC8-0xCB */
+	0x00, 0x00, 0x9E, 0x9E, 0x00, 0x00, 0xED, 0xCB, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x5F, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x9E, 0x9F, 0x9E, 0xA1, 0x00, 0x00, 0x9E, 0xA5, /* 0xD4-0xD7 */
+	0x9E, 0x99, 0x00, 0x00, 0x92, 0x49, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x8F, /* 0xDC-0xDF */
+	0x9E, 0xA9, 0x9E, 0x9C, 0x00, 0x00, 0x9E, 0xA6, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xA0, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0x58, 0x9E, 0xAA, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xB1, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x9E, 0xA8, 0x8A, 0xBB, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_69[512] = {
+	0x98, 0x6F, 0x9E, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x9E, 0xA4, 0x88, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x9E, 0x98, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB8, /* 0x08-0x0B */
+	0x9E, 0x9D, 0x90, 0x41, 0x92, 0xC5, 0x9E, 0x93, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xA3, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x90, 0x9A, 0x9E, 0xAD, 0x8A, 0x91, /* 0x18-0x1B */
+	0x8C, 0x9F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x9E, 0xAF, 0x9E, 0x9A, 0x9E, 0xAE, /* 0x20-0x23 */
+	0x00, 0x00, 0x9E, 0xA7, 0x9E, 0x9B, 0x00, 0x00, /* 0x24-0x27 */
+	0x9E, 0xAB, 0x00, 0x00, 0x9E, 0xAC, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x9E, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x93, 0xCC, 0x00, 0x00, 0x9E, 0xA2, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x9E, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x9E, 0xBB, 0x00, 0x00, 0x92, 0xD6, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x6B, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x96, /* 0x50-0x53 */
+	0x9E, 0xB6, 0x91, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x9E, 0xBC, 0x91, 0x5E, 0x00, 0x00, /* 0x58-0x5B */
+	0x9E, 0xB3, 0x9E, 0xC0, 0x9E, 0xBF, 0x00, 0x00, /* 0x5C-0x5F */
+	0x93, 0xED, 0x9E, 0xBE, 0x93, 0xE8, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xED, 0xCD, 0x00, 0x00, 0x9E, 0xC2, 0x9E, 0xB5, /* 0x68-0x6B */
+	0x00, 0x00, 0x8B, 0xC6, 0x9E, 0xB8, 0x8F, 0x7C, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x80, /* 0x70-0x73 */
+	0x9E, 0xBA, 0x8B, 0xC9, 0x00, 0x00, 0x9E, 0xB2, /* 0x74-0x77 */
+	0x9E, 0xB4, 0x9E, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x98, 0x4F, 0x8A, 0x79, 0x9E, 0xB7, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x9E, 0xC1, 0x8A, 0x54, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xE5, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x7C, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x9E, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x98, 0x50, 0x9E, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xED, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x90, 0x59, /* 0x98-0x9B */
+	0x9E, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x9E, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD0, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xC4, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x9E, 0xE1, 0x9E, 0xC3, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x9E, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xCE, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xC9, 0x9E, 0xC6, /* 0xBC-0xBF */
+	0x00, 0x00, 0x9E, 0xC7, 0x00, 0x00, 0x9E, 0xCF, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xA0, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xCC, 0x8D, 0x5C, /* 0xC8-0xCB */
+	0x92, 0xC6, 0x91, 0x84, 0x9E, 0xCA, 0x00, 0x00, /* 0xCC-0xCF */
+	0x9E, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC8, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x97, 0x6C, 0x96, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x9E, 0xCD, 0x9E, 0xD7, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xD0, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDF, /* 0xE4-0xE7 */
+	0x9E, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE5, /* 0xE8-0xEB */
+	0x00, 0x00, 0x9E, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xDE, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x9E, 0xDD, 0x00, 0x00, 0x92, 0xCE, /* 0xF8-0xFB */
+	0x00, 0x00, 0x91, 0x85, 0x00, 0x00, 0x9E, 0xDB, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6A[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xD9, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x9E, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xE6, 0x94, 0xF3, /* 0x08-0x0B */
+	0x9E, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xE7, 0x9E, 0xEA, /* 0x10-0x13 */
+	0x9E, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x92, 0x94, /* 0x14-0x17 */
+	0x00, 0x00, 0x95, 0x57, 0x00, 0x00, 0x9E, 0xDA, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xE2, 0x8F, 0xBE, /* 0x1C-0x1F */
+	0x00, 0x00, 0x96, 0xCD, 0x9E, 0xF6, 0x9E, 0xE9, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x8C, 0xA0, 0x89, 0xA1, 0x8A, 0x7E, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xD1, 0x00, 0x00, /* 0x2C-0x2F */
+	0xED, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x8F, 0xBF, 0x9E, 0xEE, 0x00, 0x00, /* 0x34-0x37 */
+	0x9E, 0xF5, 0x8E, 0xF7, 0x8A, 0x92, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x92, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x9E, 0xEB, 0x00, 0x00, 0xED, 0xD3, 0x9E, 0xF0, /* 0x44-0x47 */
+	0x9E, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB4, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x8B, 0x6B, 0x9E, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x40, /* 0x5C-0x5F */
+	0x00, 0x00, 0x93, 0xC9, 0x9E, 0xF1, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xF3, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD2, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xED, 0xED, 0xD4, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x9E, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xD5, 0x8A, 0x80, /* 0x7C-0x7F */
+	
+	0x92, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x9E, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x9E, 0xF8, 0x8C, 0xE7, 0x00, 0x00, /* 0x8C-0x8F */
+	0x9E, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x40, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x9E, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x9E, 0xF9, 0x00, 0x00, 0x9E, 0xFB, 0x9E, 0xFC, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x4B, 0x00, 0x00, /* 0xA8-0xAB */
+	0x9F, 0x47, 0x00, 0x00, 0x9E, 0x8D, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x46, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x9F, 0x45, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x42, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x9E, 0xE8, 0x9F, 0x44, 0x9F, 0x43, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x9F, 0x49, 0x00, 0x00, 0x98, 0x45, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x4C, 0x8B, 0xF9, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x48, 0x9F, 0x4A, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xD6, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x94, 0xA5, 0x00, 0x00, 0x9F, 0x4D, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x51, 0x9F, 0x4E, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_6B[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x97, 0x93, 0x9F, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x9E, 0xDC, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x52, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x53, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x89, 0x54, 0x00, 0x00, 0x9F, 0x55, /* 0x1C-0x1F */
+	0x8C, 0x87, 0x8E, 0x9F, 0x00, 0x00, 0x8B, 0xD3, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xA2, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x7E, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x57, /* 0x34-0x37 */
+	0x9F, 0x56, 0x9F, 0x59, 0x8B, 0x5C, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x8B, 0xD4, 0x8A, 0xBC, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x5C, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x5B, /* 0x44-0x47 */
+	0x00, 0x00, 0x9F, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x89, 0xCC, 0x00, 0x00, 0x92, 0x56, 0x00, 0x00, /* 0x4C-0x4F */
+	0x9F, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xBD, /* 0x50-0x53 */
+	0x9F, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x9F, 0x5F, 0x00, 0x00, 0x9F, 0x61, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x62, /* 0x5C-0x5F */
+	0x00, 0x00, 0x9F, 0x63, 0x8E, 0x7E, 0x90, 0xB3, /* 0x60-0x63 */
+	0x8D, 0x9F, 0x00, 0x00, 0x95, 0x90, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x95, 0xE0, 0x98, 0x63, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x95, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xCE, /* 0x70-0x73 */
+	0x97, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x9F, 0x64, 0x9F, 0x65, 0x00, 0x00, 0x8E, 0x80, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x66, /* 0x7C-0x7F */
+	
+	0x9F, 0x67, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x69, /* 0x80-0x83 */
+	0x9F, 0x68, 0x00, 0x00, 0x96, 0x77, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x8F, 0x7D, 0x8E, 0xEA, 0x8E, 0x63, /* 0x88-0x8B */
+	0x00, 0x00, 0x9F, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x9F, 0x6C, 0x90, 0x42, 0x00, 0x00, /* 0x94-0x97 */
+	0x9F, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x6D, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x9F, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x6F, 0x9F, 0x70, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x71, /* 0xAC-0xAF */
+	0x00, 0x00, 0x9F, 0x73, 0x9F, 0x72, 0x9F, 0x74, /* 0xB0-0xB3 */
+	0x89, 0xA3, 0x92, 0x69, 0x00, 0x00, 0x9F, 0x75, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x45, 0x8A, 0x6B, /* 0xB8-0xBB */
+	0x9F, 0x76, 0x00, 0x00, 0x00, 0x00, 0x93, 0x61, /* 0xBC-0xBF */
+	0x9A, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x8B, 0x42, 0x9F, 0x77, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x78, /* 0xC8-0xCB */
+	0x00, 0x00, 0x95, 0xEA, 0x96, 0x88, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0xC5, 0x9F, 0x79, /* 0xD0-0xD3 */
+	0x94, 0xE4, 0x00, 0x00, 0xED, 0xD8, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x94, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x96, 0xD1, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7A, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7C, /* 0xE8-0xEB */
+	0x9F, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7E, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7D, /* 0xF0-0xF3 */
+};
+
+static const unsigned char u2c_6C[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x9F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x81, /* 0x0C-0x0F */
+	0x00, 0x00, 0x96, 0xAF, 0x00, 0x00, 0x9F, 0x82, /* 0x10-0x13 */
+	0x9F, 0x83, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x43, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x84, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x86, /* 0x20-0x23 */
+	0x9F, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x90, 0x85, 0x00, 0x00, 0x00, 0x00, 0x95, 0x58, /* 0x34-0x37 */
+	0x89, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xC3, 0xED, 0xD9, /* 0x3C-0x3F */
+	0x92, 0xF3, 0x8F, 0x60, 0x8B, 0x81, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xC4, 0x00, 0x00, /* 0x4C-0x4F */
+	0x8E, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x9F, 0x88, 0x00, 0x00, 0x8A, 0xBE, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x98, 0x00, 0x00, /* 0x58-0x5B */
+	0xED, 0xDA, 0x93, 0xF0, 0x9F, 0x87, 0x8D, 0x5D, /* 0x5C-0x5F */
+	0x92, 0x72, 0x00, 0x00, 0x9F, 0x89, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x9F, 0x91, 0x00, 0x00, 0x9F, 0x8A, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xDC, /* 0x6C-0x6F */
+	0x91, 0xBF, 0x00, 0x00, 0x8B, 0x82, 0x9F, 0x92, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x88, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x8B, 0x44, 0x9F, 0x90, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x9F, 0x8E, 0x9F, 0x8B, 0x97, 0x80, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xDB, 0x00, 0x00, /* 0x84-0x87 */
+	0x92, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x93, 0xD7, 0x9F, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x9F, 0x94, 0x00, 0x00, 0x9F, 0x93, 0x8C, 0x42, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xAB, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x8D, 0xB9, 0x9F, 0x8D, 0x9F, 0x8F, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x96, 0x76, 0x91, 0xF2, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x97, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x9C, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x9F, 0x9D, 0x00, 0x00, 0x89, 0xCD, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x95, 0xA6, 0x96, 0xFB, 0x9F, 0x9F, 0x8E, 0xA1, /* 0xB8-0xBB */
+	0x8F, 0xC0, 0x9F, 0x98, 0x9F, 0x9E, 0x89, 0x88, /* 0xBC-0xBF */
+	0x00, 0x00, 0x8B, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x9F, 0x95, 0x9F, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x90, 0xF2, 0x94, 0x91, 0x00, 0x00, /* 0xC8-0xCB */
+	0x94, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x97, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x96, 0x40, 0x00, 0x00, 0x9F, 0x99, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x9F, 0xA2, 0xED, 0xDD, 0x9F, 0xA0, /* 0xD8-0xDB */
+	0x00, 0x00, 0x9F, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x96, 0x41, 0x94, 0x67, 0x8B, 0x83, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x93, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x92, 0x8D, 0x00, 0x00, 0x9F, 0xA3, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xA1, /* 0xEC-0xEF */
+	0x91, 0xD7, 0x9F, 0x96, 0x00, 0x00, 0x89, 0x6A, /* 0xF0-0xF3 */
+};
+
+static const unsigned char u2c_6D[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xED, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x6D, /* 0x08-0x0B */
+	0x9F, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xAD, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF4, /* 0x14-0x17 */
+	0x00, 0x00, 0x9F, 0xAA, 0x00, 0x00, 0x97, 0x8C, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0xB4, 0x9F, 0xA4, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x92, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x89, 0x6B, 0x8D, 0x5E, 0x9F, 0xA7, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x46, 0x9F, 0xAC, /* 0x30-0x33 */
+	0x00, 0x00, 0x9F, 0xAB, 0x9F, 0xA6, 0x00, 0x00, /* 0x34-0x37 */
+	0x9F, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x88, /* 0x38-0x3B */
+	0x00, 0x00, 0x9F, 0xA8, 0x94, 0x68, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x97, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x8F, 0xF2, 0x90, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x9F, 0xB4, 0x9F, 0xB2, 0x00, 0x00, /* 0x58-0x5B */
+	0x95, 0x6C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xAF, /* 0x60-0x63 */
+	0x9F, 0xB1, 0x00, 0x00, 0x89, 0x59, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x8D, 0x5F, 0x98, 0x51, 0x00, 0x00, /* 0x68-0x6B */
+	0x8A, 0x5C, 0x00, 0x00, 0x95, 0x82, 0xED, 0xE0, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x97, 0x81, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x43, /* 0x74-0x77 */
+	0x90, 0x5A, 0x9F, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x9F, 0xB8, 0x00, 0x00, 0xED, 0xDF, /* 0x84-0x87 */
+	0x8F, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x97, 0x4F, 0x00, 0x00, 0x9F, 0xB5, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xB0, /* 0x90-0x93 */
+	0x00, 0x00, 0x9F, 0xB6, 0xED, 0xE1, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x97, 0xDC, 0x00, 0x00, 0x93, 0x93, /* 0x98-0x9B */
+	0x93, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xED, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x55, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x74, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x9F, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x9F, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x97, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x97, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x9F, 0xC6, 0x9F, 0xC0, 0x9F, 0xBD, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD2, /* 0xC8-0xCB */
+	0x9F, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE3, /* 0xCC-0xCF */
+	0x00, 0x00, 0x8F, 0x69, 0x9F, 0xC5, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x9F, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x93, 0x91, 0x9F, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xC2, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x92, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x9F, 0xC9, 0x00, 0x00, 0x9F, 0xBE, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x9F, 0xC4, 0x00, 0x00, 0x9F, 0xCB, 0x88, 0xFA, /* 0xE8-0xEB */
+	0x9F, 0xC1, 0x00, 0x00, 0x9F, 0xCC, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x90, 0x5B, 0xED, 0xE5, 0x8F, 0x7E, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x95, 0xA3, 0x00, 0x00, 0x8D, 0xAC, /* 0xF4-0xF7 */
+	0xED, 0xE4, 0x9F, 0xB9, 0x9F, 0xC7, 0x93, 0x59, /* 0xF8-0xFB */
+	0xED, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x90, 0xB4, 0x00, 0x00, 0x8A, 0x89, /* 0x04-0x07 */
+	0x8D, 0xCF, 0x8F, 0xC2, 0x9F, 0xBB, 0x8F, 0x61, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x6B, /* 0x10-0x13 */
+	0x00, 0x00, 0x9F, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x9F, 0xD0, 0x8F, 0x8D, 0x8C, 0xB8, /* 0x18-0x1B */
+	0x00, 0x00, 0x9F, 0xDF, 0x00, 0x00, 0x9F, 0xD9, /* 0x1C-0x1F */
+	0x8B, 0x94, 0x93, 0x6E, 0x00, 0x00, 0x9F, 0xD4, /* 0x20-0x23 */
+	0x9F, 0xDD, 0x88, 0xAD, 0x89, 0x51, 0xED, 0xE9, /* 0x24-0x27 */
+	0x00, 0x00, 0x89, 0xB7, 0x00, 0x00, 0x9F, 0xD6, /* 0x28-0x2B */
+	0x91, 0xAA, 0x9F, 0xCD, 0x9F, 0xCF, 0x8D, 0x60, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x9F, 0xE0, 0xED, 0xE7, 0x9F, 0xDB, 0x00, 0x00, /* 0x38-0x3B */
+	0xED, 0xEA, 0x00, 0x00, 0x9F, 0xD3, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xDA, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xA9, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x9F, 0xD8, 0x9F, 0xDC, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0xCE, 0x00, 0x00, /* 0x54-0x57 */
+	0x8F, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x92, 0x58, /* 0x58-0x5B */
+	0xED, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD2, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x4E, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD5, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xCE, 0x93, 0x92, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xD1, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xD7, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0x70, 0x8E, 0xBC, /* 0x7C-0x7F */
+	
+	0x96, 0x9E, 0x00, 0x00, 0x9F, 0xE1, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x94, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xED, /* 0x8C-0x8F */
+	0x8C, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x80, 0x00, 0x00, /* 0x94-0x97 */
+	0x9F, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x97, 0xAD, 0x8D, 0x61, 0x00, 0x00, 0x9F, 0xF0, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x88, 0xEC, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x9F, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xE2, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xE8, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xEA, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x6E, 0x9F, 0xE5, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0x4D, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x9F, 0xE7, 0x00, 0x00, 0xED, 0xEB, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xEF, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x9F, 0xE9, 0x96, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x9F, 0xE4, 0x00, 0x00, 0x8E, 0xA0, /* 0xC8-0xCB */
+	0x9F, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x8A, 0x8A, 0x00, 0x00, 0x9F, 0xE6, /* 0xD0-0xD3 */
+	0x9F, 0xEB, 0x9F, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x91, 0xEA, 0x91, 0xD8, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x9F, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xFA, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x93, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x42, /* 0xF4-0xF7 */
+	0x9F, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0xF6, 0x9F, 0xDE, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6F[512] = {
+	0x00, 0x00, 0x8B, 0x99, 0x95, 0x59, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0xBD, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x8D, 0x97, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x52, /* 0x0C-0x0F */
+	0x00, 0x00, 0x9F, 0xF2, 0x00, 0x00, 0xE0, 0x41, /* 0x10-0x13 */
+	0x89, 0x89, 0x91, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x94, 0x99, 0x00, 0x00, 0x8A, 0xBF, 0x97, 0xF8, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x9F, /* 0x28-0x2B */
+	0x92, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x9F, 0xF9, 0x9F, 0xFB, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x91, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x40, 0x9F, 0xF7, /* 0x3C-0x3F */
+	0x00, 0x00, 0x9F, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x8A, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x8C, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xE0, 0x4E, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x49, /* 0x58-0x5B */
+	0x90, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x83, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x8F, 0x81, 0x00, 0x00, 0xE0, 0x52, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xE0, 0x4B, 0x92, 0xAA, 0xE0, 0x48, /* 0x6C-0x6F */
+	0x92, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xE0, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xE0, 0x45, 0x00, 0x00, 0xE0, 0x44, 0x00, 0x00, /* 0x78-0x7B */
+	0xE0, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xE0, 0x47, 0xE0, 0x46, 0xE0, 0x4C, 0x00, 0x00, /* 0x80-0x83 */
+	0x90, 0x9F, 0x00, 0x00, 0xE0, 0x43, 0x00, 0x00, /* 0x84-0x87 */
+	0xED, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x4F, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE0, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xC0, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE0, 0x55, 0x00, 0x00, 0xE0, 0x54, /* 0xA0-0xA3 */
+	0xE0, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x59, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x93, 0x62, 0x00, 0x00, 0xE0, 0x53, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xED, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE0, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x8C, 0x83, 0x91, 0xF7, 0xE0, 0x51, 0x94, 0x5A, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x58, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE0, 0x5D, 0xE0, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xE0, 0x5E, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x61, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x5A, /* 0xDC-0xDF */
+	0x8D, 0x8A, 0x94, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x9F, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x94, /* 0xE8-0xEB */
+	0xE0, 0x5C, 0x00, 0x00, 0xE0, 0x60, 0x91, 0xF3, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE0, 0x5F, 0x00, 0x00, 0xE0, 0x4A, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xED, 0xEE, 0xE8, 0x89, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x64, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x68, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_70[512] = {
+	0x00, 0x00, 0xE0, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xED, 0xEF, 0x00, 0x00, 0xED, 0xF0, /* 0x04-0x07 */
+	0x00, 0x00, 0xE0, 0x62, 0x00, 0x00, 0xE0, 0x63, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x67, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE0, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x95, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xE0, 0x6D, 0x00, 0x00, 0xE0, 0x6A, 0xE0, 0x69, /* 0x18-0x1B */
+	0x00, 0x00, 0xE0, 0x6C, 0x93, 0xD2, 0xE0, 0x6E, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x95, 0x91, 0xEB, /* 0x24-0x27 */
+	0xED, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x90, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE0, 0x6F, 0x00, 0x00, 0xE0, 0x71, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x70, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x9F, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xE0, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x93, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x73, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xCE, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x94, /* 0x6C-0x6F */
+	0x8A, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x8B, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x8E, 0xDC, 0x8D, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xED, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x98, 0x46, 0x90, 0x86, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x8A, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x75, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xE0, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF3, /* 0xA8-0xAB */
+	0xE0, 0x78, 0x92, 0x59, 0xE0, 0x7B, 0xE0, 0x76, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x7A, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE0, 0x79, 0x93, 0x5F, 0x88, 0xD7, 0xED, 0x46, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x97, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x7D, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x47, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE0, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xE0, 0x7E, 0x00, 0x00, 0xE0, 0x7C, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE0, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x96, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE0, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_71[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xED, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xE0, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF4, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x89, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE0, 0x84, 0x95, 0xB0, 0x00, 0x00, /* 0x18-0x1B */
+	0xE0, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x96, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xC5, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0x52, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x8F, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xF7, 0xED, 0xF8, /* 0x44-0x47 */
+	0x00, 0x00, 0x97, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xE0, 0x8A, 0x00, 0x00, 0x90, 0xF7, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE0, 0x86, 0xE0, 0x8B, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x89, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xED, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x89, 0x00, 0x00, /* 0x60-0x63 */
+	0x94, 0x81, 0xE0, 0x85, 0xE0, 0x88, 0x8F, 0xC6, /* 0x64-0x67 */
+	0x00, 0x00, 0x94, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE0, 0x8C, 0x00, 0x00, 0x8E, 0xCF, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x90, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xE0, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xE0, 0x87, 0x00, 0x00, 0x8C, 0x46, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x8D, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x97, 0x6F, 0xE0, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xEA, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6E, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xE0, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE0, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x94, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE0, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x95, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xED, 0xFA, 0x00, 0x00, 0x94, 0x52, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x93, 0x95, 0xE0, 0x97, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0x99, 0x00, 0x00, /* 0xCC-0xCF */
+	0x97, 0xD3, 0x00, 0x00, 0xE0, 0x96, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE0, 0x98, 0x89, 0x8D, 0x00, 0x00, 0xE0, 0x93, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x7A, /* 0xDC-0xDF */
+	0xE0, 0x9A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x91, 0x87, 0x8E, 0x57, 0xE0, 0x9C, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xE0, 0x9B, 0x90, 0x43, 0x99, 0xD7, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xE0, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE0, 0x9F, 0x00, 0x00, 0xE0, 0x8E, /* 0xF8-0xFB */
+	0xE0, 0x9E, 0x00, 0x00, 0xED, 0xFB, 0xE0, 0xA0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_72[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0x9A, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE0, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA3, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xE0, 0xA4, 0x00, 0x00, 0x92, 0xDC, 0x00, 0x00, /* 0x28-0x2B */
+	0xE0, 0xA6, 0xE0, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE0, 0xA7, 0x00, 0x00, 0xE0, 0xA8, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x8E, 0xDD, 0x95, 0x83, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xEA, 0xE0, 0xA9, /* 0x38-0x3B */
+	0xE0, 0xAA, 0x91, 0x75, 0x8E, 0xA2, 0xE0, 0xAB, /* 0x3C-0x3F */
+	0xE0, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xAD, 0x95, 0xD0, /* 0x44-0x47 */
+	0x94, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAE, /* 0x48-0x4B */
+	0x94, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0xAB, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xE0, 0xAF, 0x89, 0xE5, 0x00, 0x00, 0x8B, 0x8D, /* 0x58-0x5B */
+	0x00, 0x00, 0x96, 0xC4, 0x00, 0x00, 0x96, 0xB4, /* 0x5C-0x5F */
+	0x00, 0x00, 0x89, 0xB2, 0x98, 0x53, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x71, /* 0x64-0x67 */
+	0x00, 0x00, 0x95, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xB5, 0x00, 0x00, /* 0x70-0x73 */
+	0xE0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x93, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x8C, 0xA1, 0xE0, 0xB1, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x8D, 0xD2, 0xE0, 0xB3, 0xE0, 0xB2, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB4, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xB6, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x8B, 0x5D, 0x00, 0x00, 0xE0, 0xB7, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB8, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x8C, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x94, 0xC6, /* 0xAC-0xAF */
+	0x00, 0x00, 0xED, 0xFC, 0xE0, 0xBA, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xF3, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE0, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x40, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0xB6, 0xE0, 0xBB, /* 0xC0-0xC3 */
+	0xE0, 0xBD, 0x00, 0x00, 0xE0, 0xBC, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xBE, 0x00, 0x00, /* 0xCC-0xCF */
+	0x8C, 0xCF, 0x00, 0x00, 0xE0, 0xBF, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE7, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x91, 0x5F, 0x00, 0x00, 0x8D, 0x9D, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xE0, 0xC1, 0xE0, 0xC2, 0xE0, 0xC0, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x8E, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x93, 0xC6, 0x8B, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC4, /* 0xF4-0xF7 */
+	0x92, 0x4B, 0xE0, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x98, 0x54, 0x94, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_73[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xC7, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xC9, 0xE0, 0xC6, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0xD2, /* 0x18-0x1B */
+	0xE0, 0xC8, 0xE0, 0xCA, 0x00, 0x00, 0x97, 0xC2, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xEE, 0x41, 0xE0, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xE0, 0xCD, 0x92, 0x96, 0x94, 0x4C, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0xA3, 0xE0, 0xCC, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xE0, 0xCB, 0x00, 0x00, 0x97, 0x50, 0x97, 0x51, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xCF, 0x89, 0x8E, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x8D, 0x96, 0x8E, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, 0xE0, 0xD1, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD3, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x62, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xE0, 0xD5, 0x00, 0x00, 0xE0, 0xD4, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE0, 0xD6, 0x00, 0x00, 0x8A, 0x6C, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xE0, 0xD8, 0x00, 0x00, 0xEE, 0x43, /* 0x74-0x77 */
+	0xE0, 0xD7, 0x00, 0x00, 0xE0, 0xDA, 0xE0, 0xD9, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x8C, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x97, 0xA6, /* 0x84-0x87 */
+	0x00, 0x00, 0x8B, 0xCA, 0x00, 0x00, 0x89, 0xA4, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0xE8, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x8A, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xE6, 0xE0, 0xDC, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xDE, /* 0xB8-0xBB */
+	0x00, 0x00, 0xEE, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE0, 0xDF, 0x00, 0x00, 0x89, 0xCF, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE0, 0xDB, 0xEE, 0x45, 0x8E, 0x58, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x92, 0xBF, 0xE0, 0xDD, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x48, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x46, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, /* 0xDC-0xDF */
+	0x8E, 0xEC, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x47, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE0, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x5D, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x94, 0xC7, 0xE0, 0xE1, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE0, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xEE, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xE0, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0xBB, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_74[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x85, /* 0x00-0x03 */
+	0x00, 0x00, 0xE0, 0xE4, 0x97, 0x9D, 0xEE, 0x49, /* 0x04-0x07 */
+	0x00, 0x00, 0x97, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0xF4, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xE0, 0xE6, 0xEE, 0x4B, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xEE, 0x4D, 0xEE, 0x4C, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x4E, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xE8, 0x97, 0xD4, /* 0x30-0x33 */
+	0x8B, 0xD5, 0x94, 0xFA, 0x94, 0x69, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEB, /* 0x3C-0x3F */
+	0x00, 0x00, 0xE0, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE0, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xE0, 0xED, 0x8C, 0xE8, 0x89, 0x6C, /* 0x58-0x5B */
+	0xE0, 0xEF, 0x00, 0x00, 0x90, 0x90, 0xE0, 0xEC, /* 0x5C-0x5F */
+	0x97, 0xDA, 0x00, 0x00, 0xEE, 0x4F, 0xE0, 0xF2, /* 0x60-0x63 */
+	0xEA, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE0, 0xF0, 0xE0, 0xF3, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE5, /* 0x6C-0x6F */
+	0xE0, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xBA, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xF4, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xF5, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x9E, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xEE, 0x50, 0x00, 0x00, 0xE0, 0xF6, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xF7, 0xEE, 0x51, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xE3, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF8, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x8A, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x8E, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xF9, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFA, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE0, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x89, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xE1, 0x40, 0x00, 0x00, 0x95, 0x5A, 0xE1, 0x41, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xA2, 0xE1, 0x42, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xE1, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x44, 0x00, 0x00, /* 0xEC-0xEF */
+	0xE1, 0x46, 0xE1, 0x47, 0xE1, 0x45, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0x72, 0xE1, 0x49, /* 0xF4-0xF7 */
+	0xE1, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_75[512] = {
+	0x00, 0x00, 0xEE, 0x52, 0x00, 0x00, 0xE1, 0x4B, /* 0x00-0x03 */
+	0xE1, 0x4A, 0xE1, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xE1, 0x4D, 0xE1, 0x4F, 0xE1, 0x4E, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x8D, 0x99, 0x00, 0x00, 0xE1, 0x51, /* 0x10-0x13 */
+	0x00, 0x00, 0xE1, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x8A, 0xC3, 0x00, 0x00, 0x90, 0x72, 0x00, 0x00, /* 0x18-0x1B */
+	0x93, 0x5B, 0x00, 0x00, 0xE1, 0x52, 0x90, 0xB6, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x59, /* 0x20-0x23 */
+	0x00, 0x00, 0x89, 0x99, 0xE1, 0x53, 0x00, 0x00, /* 0x24-0x27 */
+	0x97, 0x70, 0x00, 0x00, 0x00, 0x00, 0x95, 0xE1, /* 0x28-0x2B */
+	0xE1, 0x54, 0x00, 0x00, 0x00, 0x00, 0xED, 0x8C, /* 0x2C-0x2F */
+	0x93, 0x63, 0x97, 0x52, 0x8D, 0x62, 0x90, 0x5C, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6A, /* 0x34-0x37 */
+	0x99, 0xB2, 0x00, 0x00, 0x92, 0xAC, 0x89, 0xE6, /* 0x38-0x3B */
+	0xE1, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xE1, 0x56, 0x00, 0x00, 0xE1, 0x5B, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xE1, 0x59, 0xE1, 0x58, 0x9D, 0xC0, /* 0x48-0x4B */
+	0x8A, 0x45, 0xE1, 0x57, 0x00, 0x00, 0x88, 0xD8, /* 0x4C-0x4F */
+	0x00, 0x00, 0x94, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x94, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x97, 0xAF, 0xE1, 0x5C, 0xE1, 0x5A, /* 0x58-0x5B */
+	0x92, 0x7B, 0x90, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x94, 0xA9, 0x00, 0x00, 0x95, 0x4C, 0x00, 0x00, /* 0x60-0x63 */
+	0xE1, 0x5E, 0x97, 0xAA, 0x8C, 0x6C, 0xE1, 0x5F, /* 0x64-0x67 */
+	0x00, 0x00, 0xE1, 0x5D, 0x94, 0xD4, 0xE1, 0x60, /* 0x68-0x6B */
+	0x00, 0x00, 0xE1, 0x61, 0x00, 0x00, 0xEE, 0x53, /* 0x6C-0x6F */
+	0x88, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF4, /* 0x70-0x73 */
+	0xE1, 0x66, 0x00, 0x00, 0xE1, 0x63, 0x93, 0xEB, /* 0x74-0x77 */
+	0xE1, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x45, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x69, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x64, 0xE1, 0x65, /* 0x84-0x87 */
+	0x00, 0x00, 0xE1, 0x68, 0xE1, 0x67, 0x95, 0x44, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0x61, 0x91, 0x60, /* 0x8C-0x8F */
+	0x00, 0x00, 0x8B, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xE1, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x6B, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xE1, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6E, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xE1, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x75, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xE1, 0x76, 0x94, 0xE6, 0xE1, 0x70, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE1, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE1, 0x74, 0x90, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xE1, 0x75, 0xE1, 0x73, 0x8E, 0xBE, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x6F, 0xE1, 0x71, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x95, 0x61, 0x00, 0x00, 0x8F, 0xC7, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x78, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE1, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x79, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x8E, 0xA4, 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x93, 0x97, 0xE1, 0x7A, 0x00, 0x00, 0x92, 0xC9, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x7C, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x9F, 0xE1, 0x7B, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x91, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xE1, 0x82, 0x00, 0x00, 0xE1, 0x84, 0xE1, 0x85, /* 0xF0-0xF3 */
+	0x92, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x83, 0x00, 0x00, /* 0xF8-0xFB */
+	0xE1, 0x80, 0x00, 0x00, 0xE1, 0x7D, 0xE1, 0x7E, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_76[512] = {
+	0x00, 0x00, 0xE1, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xE1, 0x88, 0x00, 0x00, 0xE1, 0x86, /* 0x08-0x0B */
+	0x00, 0x00, 0xE1, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x89, /* 0x1C-0x1F */
+	0xE1, 0x8B, 0xE1, 0x8C, 0xE1, 0x8D, 0x00, 0x00, /* 0x20-0x23 */
+	0xE1, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x8A, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE1, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xE1, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x91, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xC3, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x94, 0xE1, 0x92, /* 0x44-0x47 */
+	0xE1, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x8A, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xFC, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xC8, 0x00, 0x00, /* 0x54-0x57 */
+	0xE1, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xE1, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xE1, 0x97, 0xE1, 0x98, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x9C, /* 0x64-0x67 */
+	0xE1, 0x99, 0xE1, 0x9A, 0xE1, 0x9B, 0x00, 0x00, /* 0x68-0x6B */
+	0xE1, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE1, 0x9E, 0x00, 0x00, 0xE1, 0x9F, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xA0, 0x00, 0x00, /* 0x74-0x77 */
+	0xE1, 0xA1, 0x00, 0x00, 0x94, 0xAD, 0x93, 0x6F, /* 0x78-0x7B */
+	0xE1, 0xA2, 0x94, 0x92, 0x95, 0x53, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xE1, 0xA3, 0x00, 0x00, 0xEE, 0x54, 0xE1, 0xA4, /* 0x80-0x83 */
+	0x93, 0x49, 0x00, 0x00, 0x8A, 0x46, 0x8D, 0x63, /* 0x84-0x87 */
+	0xE1, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA6, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xA7, 0x00, 0x00, /* 0x8C-0x8F */
+	0x8E, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xA8, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xE1, 0xAA, 0xE1, 0xAB, 0xEE, 0x57, /* 0x98-0x9B */
+	0xEE, 0x55, 0x00, 0x00, 0xEE, 0x56, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x58, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xE7, 0x00, 0x00, /* 0xAC-0xAF */
+	0xE1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xE1, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x89, /* 0xB4-0xB7 */
+	0xE1, 0xAE, 0xE1, 0xAF, 0xE1, 0xB0, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4D, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xB1, 0x94, 0x75, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x7E, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x89, 0x6D, 0x00, 0x00, 0x89, 0x76, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE1, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xB4, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xB3, 0x93, 0x90, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xB7, /* 0xD8-0xDB */
+	0x9F, 0x58, 0x00, 0x00, 0xE1, 0xB5, 0x96, 0xBF, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE1, 0xB6, 0x00, 0x00, 0x8A, 0xC4, /* 0xE0-0xE3 */
+	0x94, 0xD5, 0xE1, 0xB7, 0x00, 0x00, 0xE1, 0xB8, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xB9, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xDA, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xD3, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x92, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x91, 0x8A, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBB, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x82, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_77[512] = {
+	0x00, 0x00, 0x8F, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xE1, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBD, /* 0x04-0x07 */
+	0xE1, 0xBC, 0x94, 0xFB, 0x00, 0x00, 0x8A, 0xC5, /* 0x08-0x0B */
+	0x8C, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC4, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xC1, 0x90, 0x5E, /* 0x1C-0x1F */
+	0x96, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xE1, 0xC0, 0xE1, 0xC2, 0xE1, 0xC3, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xE1, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC5, /* 0x34-0x37 */
+	0xE1, 0xC6, 0x00, 0x00, 0x92, 0xAD, 0x00, 0x00, /* 0x38-0x3B */
+	0x8A, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x92, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x5A, 0xE1, 0xC7, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xC8, 0xE1, 0xCB, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x90, 0x87, 0x00, 0x00, 0x93, 0xC2, /* 0x60-0x63 */
+	0x00, 0x00, 0xE1, 0xCC, 0x96, 0x72, 0x00, 0x00, /* 0x64-0x67 */
+	0xE1, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xCA, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xE1, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xCE, 0xE1, 0xCD, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD1, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xD0, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE1, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xD4, 0x00, 0x00, /* 0x9C-0x9F */
+	0xE1, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x95, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x8F, 0x75, 0x97, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xE1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x93, 0xB5, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xD6, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE1, 0xD7, 0x00, 0x00, 0xE1, 0xDB, /* 0xB8-0xBB */
+	0xE1, 0xD9, 0xE1, 0xDA, 0x00, 0x00, 0xE1, 0xD8, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDC, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE1, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDE, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xDF, 0x96, 0xB5, /* 0xD8-0xDB */
+	0xE1, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xEE, 0xE1, 0xE1, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x92, 0x6D, 0x00, 0x00, 0x94, 0x8A, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x8B, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x92, 0x5A, 0xE1, 0xE2, 0x8B, 0xB8, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xCE, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xE1, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_78[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xBB, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xE1, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xE5, 0x00, 0x00, /* 0x10-0x13 */
+	0x8C, 0xA4, 0x8D, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE1, 0xE7, 0xEE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x93, 0x75, 0x8D, 0xD4, 0x8B, 0x6D, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x43, 0x00, 0x00, /* 0x30-0x33 */
+	0x94, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0x76, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x7B, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xE1, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x5D, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x8F, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xEE, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xB0, /* 0x68-0x6B */
+	0x8D, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xA5, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xA1, 0x00, 0x00, /* 0x70-0x73 */
+	0xE1, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x5F, 0x00, 0x00, /* 0x78-0x7B */
+	0xE1, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x8C, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xEC, 0x92, 0xF4, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xE1, 0xEF, 0x8A, 0x56, 0xE1, 0xEA, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x94, 0xE8, 0x00, 0x00, 0x89, 0x4F, /* 0x90-0x93 */
+	0x00, 0x00, 0x8D, 0xEA, 0x00, 0x00, 0x98, 0x71, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xEE, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF0, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC9, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x90, 0xD7, 0xE1, 0xF2, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF3, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE1, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0x6D, 0x00, 0x00, /* 0xB8-0xBB */
+	0xE1, 0xF9, 0x00, 0x00, 0xE1, 0xF8, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x8E, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE1, 0xFA, 0xE1, 0xF5, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xFB, 0xE1, 0xF6, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x94, 0xD6, 0xE1, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE1, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x41, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x40, /* 0xE4-0xE7 */
+	0x96, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xE1, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x88, 0xE9, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE2, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE2, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_79[512] = {
+	0x00, 0x00, 0x8F, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x44, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0x62, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE2, 0x46, 0xE2, 0x45, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE2, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xE8, 0xE2, 0x49, /* 0x28-0x2B */
+	0xE2, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xEE, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0xA6, 0x00, 0x00, /* 0x38-0x3B */
+	0x97, 0xE7, 0x00, 0x00, 0x8E, 0xD0, 0x00, 0x00, /* 0x3C-0x3F */
+	0xE2, 0x4A, 0x8C, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x5F, /* 0x44-0x47 */
+	0x8B, 0x46, 0x8E, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x97, 0x53, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x50, /* 0x50-0x53 */
+	0x00, 0x00, 0xE2, 0x4F, 0x91, 0x63, 0xE2, 0x4C, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x4E, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x8F, 0x6A, 0x90, 0x5F, 0xE2, 0x4D, /* 0x5C-0x5F */
+	0xE2, 0x4B, 0x00, 0x00, 0x94, 0x49, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x8F, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x95, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x8D, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x98, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x51, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x52, /* 0x7C-0x7F */
+	
+	0xE2, 0x68, 0x8B, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x98, 0x5C, 0x91, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x53, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x89, 0xD0, 0x92, 0xF5, 0x95, 0x9F, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xEE, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x66, /* 0x98-0x9B */
+	0x00, 0x00, 0xE2, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x9A, 0xE2, 0x55, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x57, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x58, 0x00, 0x00, /* 0xAC-0xAF */
+	0x94, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x59, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE2, 0x5A, 0xE2, 0x5B, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x8B, 0xD7, 0x89, 0xD1, 0x93, 0xC3, /* 0xBC-0xBF */
+	0x8F, 0x47, 0x8E, 0x84, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE2, 0x5C, 0x00, 0x00, 0x8F, 0x48, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x89, 0xC8, 0x95, 0x62, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE2, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x94, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x64, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE2, 0x60, 0x00, 0x00, 0xE2, 0x61, /* 0xE0-0xE3 */
+	0x94, 0x89, 0x00, 0x00, 0x90, 0x60, 0xE2, 0x5E, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x92, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xE2, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x8F, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDA, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_7A[512] = {
+	0x8B, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xE2, 0x62, 0x00, 0x00, 0x00, 0x00, 0x92, 0xF6, /* 0x08-0x0B */
+	0x00, 0x00, 0xE2, 0x63, 0x90, 0xC5, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x96, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x95, 0x42, /* 0x14-0x17 */
+	0xE2, 0x64, 0xE2, 0x65, 0x92, 0x74, 0x00, 0x00, /* 0x18-0x1B */
+	0x97, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x67, /* 0x1C-0x1F */
+	0xE2, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0xED, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xE2, 0x69, 0x88, 0xEE, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6C, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6A, /* 0x38-0x3B */
+	0x89, 0xD2, 0x8C, 0x6D, 0xE2, 0x6B, 0x8D, 0x65, /* 0x3C-0x3F */
+	0x8D, 0x92, 0x00, 0x00, 0x95, 0xE4, 0xE2, 0x6D, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x73, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xE2, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x90, 0xCF, 0x89, 0x6E, 0x89, 0xB8, /* 0x4C-0x4F */
+	0x88, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6E, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xE2, 0x70, 0xE2, 0x71, 0x8F, 0xF5, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE2, 0x72, 0x00, 0x00, 0x8A, 0x6E, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE2, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x8C, 0x8A, 0x00, 0x00, 0x8B, 0x86, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xE2, 0x75, 0x8B, 0xF3, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE2, 0x76, 0x00, 0x00, 0x90, 0xFA, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x93, 0xCB, 0x00, 0x00, 0x90, 0xDE, /* 0x80-0x83 */
+	0x8D, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xE2, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x82, 0x91, 0x8B, /* 0x90-0x93 */
+	0x00, 0x00, 0xE2, 0x79, 0xE2, 0x7B, 0xE2, 0x78, /* 0x94-0x97 */
+	0xE2, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x41, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xE2, 0x7C, 0x8C, 0x45, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x87, 0x97, 0x71, /* 0xAC-0xAF */
+	0xE2, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x80, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x4D, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x83, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x96, /* 0xC0-0xC3 */
+	0xE2, 0x82, 0xE2, 0x81, 0x00, 0x00, 0xE2, 0x85, /* 0xC4-0xC7 */
+	0xE2, 0x7D, 0x00, 0x00, 0xE2, 0x86, 0x97, 0xA7, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE2, 0x87, 0x00, 0x00, 0xE2, 0x88, /* 0xCC-0xCF */
+	0x00, 0x00, 0xEE, 0x67, 0x9A, 0xF2, 0xE2, 0x8A, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE2, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE2, 0x8B, 0xE2, 0x8C, 0x00, 0x00, /* 0xD8-0xDB */
+	0x97, 0xB3, 0xE2, 0x8D, 0x00, 0x00, 0xE8, 0xED, /* 0xDC-0xDF */
+	0x8F, 0xCD, 0xE2, 0x8E, 0xE2, 0x8F, 0x8F, 0x76, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x93, 0xB6, 0xE2, 0x90, 0xEE, 0x68, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x47, 0xEE, 0x6A, /* 0xE8-0xEB */
+	0x00, 0x00, 0xE2, 0x91, 0x00, 0x00, 0x92, 0x5B, /* 0xEC-0xEF */
+	0xE2, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0xA3, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x99, 0x5E, 0x92, 0x7C, 0x8E, 0xB1, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xC6, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7B[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x93, 0x00, 0x00, /* 0x00-0x03 */
+	0xE2, 0xA0, 0x00, 0x00, 0xE2, 0x96, 0x00, 0x00, /* 0x04-0x07 */
+	0x8B, 0x88, 0x00, 0x00, 0xE2, 0x95, 0xE2, 0xA2, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x94, /* 0x0C-0x0F */
+	0x00, 0x00, 0x8F, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xE2, 0x98, 0xE2, 0x99, 0x00, 0x00, 0x93, 0x4A, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x9A, 0x00, 0x00, /* 0x1C-0x1F */
+	0x8A, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x90, 0x79, 0x95, 0x84, 0x00, 0x00, /* 0x24-0x27 */
+	0xE2, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x91, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x97, /* 0x30-0x33 */
+	0x00, 0x00, 0xE2, 0x9B, 0xE2, 0x9D, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x8D, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xE2, 0xA4, 0x95, 0x4D, 0x00, 0x00, /* 0x44-0x47 */
+	0x94, 0xA4, 0x93, 0x99, 0x00, 0x00, 0x8B, 0xD8, /* 0x48-0x4B */
+	0xE2, 0xA3, 0xE2, 0xA1, 0x00, 0x00, 0x94, 0xB3, /* 0x4C-0x4F */
+	0xE2, 0x9E, 0x92, 0x7D, 0x93, 0x9B, 0x00, 0x00, /* 0x50-0x53 */
+	0x93, 0x9A, 0x00, 0x00, 0x8D, 0xF4, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xE2, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xE2, 0xA6, 0x00, 0x00, 0xE2, 0xA8, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE2, 0xAB, 0x00, 0x00, 0xE2, 0xAC, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE2, 0xA9, 0xE2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xE2, 0xA7, 0xE2, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x9F, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xCD, 0x89, 0xD3, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB3, /* 0x88-0x8B */
+	0x00, 0x00, 0xE2, 0xB0, 0x00, 0x00, 0xE2, 0xB5, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xB4, 0x00, 0x00, /* 0x90-0x93 */
+	0x94, 0x93, 0x96, 0xA5, 0x00, 0x00, 0x8E, 0x5A, /* 0x94-0x97 */
+	0xE2, 0xAE, 0xE2, 0xB7, 0xE2, 0xB2, 0x00, 0x00, /* 0x98-0x9B */
+	0xE2, 0xB1, 0xE2, 0xAD, 0xEE, 0x6B, 0xE2, 0xAF, /* 0x9C-0x9F */
+	0x00, 0x00, 0x8A, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x5C, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x90, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x94, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xE2, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x94, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x90, 0xDF, 0xE2, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x94, 0xCD, 0x00, 0x00, 0xE2, 0xBD, 0x95, 0xD1, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x92, 0x7A, 0x00, 0x00, 0xE2, 0xB8, /* 0xC8-0xCB */
+	0xE2, 0xBA, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBB, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xE2, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x8E, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x93, 0xC4, 0xE2, 0xC3, 0xE2, 0xC2, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xE2, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x98, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC8, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xCC, 0xE2, 0xC9, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_7C[512] = {
+	0xE2, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC6, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE2, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE2, 0xC0, 0x99, 0xD3, 0xE2, 0xC7, /* 0x10-0x13 */
+	0xE2, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCA, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD0, /* 0x1C-0x1F */
+	0x00, 0x00, 0x8A, 0xC8, 0x00, 0x00, 0xE2, 0xCD, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCE, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xCF, 0xE2, 0xD2, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD1, /* 0x34-0x37 */
+	0x94, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xE2, 0xD3, 0x97, 0xFA, 0x95, 0xEB, /* 0x3C-0x3F */
+	0xE2, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD5, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xE2, 0xD4, 0x90, 0xD0, 0x00, 0x00, 0xE2, 0xD7, /* 0x4C-0x4F */
+	0xE2, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xE2, 0xD6, 0x00, 0x00, 0xE2, 0xDD, 0x00, 0x00, /* 0x54-0x57 */
+	0xE2, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xDB, /* 0x5C-0x5F */
+	0xE2, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xE2, 0xDC, 0xE2, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE2, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC4, /* 0x70-0x73 */
+	0x00, 0x00, 0xE2, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xE0, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x8B, 0xCC, 0x8C, 0x48, 0xE2, 0xE1, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x95, 0xB2, 0x00, 0x00, 0x90, 0x88, /* 0x88-0x8B */
+	0x00, 0x00, 0x96, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xE2, 0xE2, 0x00, 0x00, 0x97, 0xB1, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x94, 0x94, 0x00, 0x00, 0x91, 0x65, /* 0x94-0x97 */
+	0x94, 0x53, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6C, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xBE, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE2, 0xE7, 0xE2, 0xE5, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE2, 0xE3, 0x8A, 0x9F, 0x00, 0x00, 0x8F, 0xCF, /* 0xA4-0xA7 */
+	0xE2, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE6, /* 0xA8-0xAB */
+	0x00, 0x00, 0xE2, 0xE4, 0xE2, 0xEC, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xE2, 0xEB, 0xE2, 0xEA, 0xE2, 0xE9, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE2, 0xEE, 0x90, 0xB8, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE2, 0xEF, 0x00, 0x00, 0xE2, 0xF1, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE2, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0xD0, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0x57, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xF3, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0x9C, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xE2, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xE2, 0xF4, 0x00, 0x00, 0x95, 0xB3, 0x91, 0x8C, /* 0xDC-0xDF */
+	0x8D, 0x66, 0x00, 0x00, 0xE2, 0xF5, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xC6, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF7, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE2, 0xF9, 0x00, 0x00, 0xE2, 0xFA, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x8E, 0x85, 0x00, 0x00, 0xE2, 0xFB, 0x8C, 0x6E, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x8A, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7D[512] = {
+	0x8B, 0x49, 0x00, 0x00, 0xE3, 0x40, 0x00, 0x00, /* 0x00-0x03 */
+	0x96, 0xF1, 0x8D, 0x67, 0xE2, 0xFC, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x43, 0x96, 0xE4, /* 0x08-0x0B */
+	0x00, 0x00, 0x94, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x95, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x8F, 0x83, 0xE3, 0x42, 0x00, 0x00, 0x8E, 0xD1, /* 0x14-0x17 */
+	0x8D, 0x68, 0x8E, 0x86, 0x8B, 0x89, 0x95, 0xB4, /* 0x18-0x1B */
+	0xE3, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x91, 0x66, 0x96, 0x61, 0x8D, 0xF5, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x87, /* 0x28-0x2B */
+	0x92, 0xDB, 0x00, 0x00, 0xE3, 0x46, 0x97, 0xDD, /* 0x2C-0x2F */
+	0x8D, 0xD7, 0x00, 0x00, 0xE3, 0x47, 0x90, 0x61, /* 0x30-0x33 */
+	0x00, 0x00, 0xE3, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x8F, 0xD0, 0x8D, 0xAE, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x48, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x49, 0x8C, 0xBC, /* 0x40-0x43 */
+	0x91, 0x67, 0xE3, 0x44, 0xE3, 0x4A, 0x00, 0x00, /* 0x44-0x47 */
+	0xEE, 0x6D, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x45, /* 0x48-0x4B */
+	0x8C, 0x6F, 0x00, 0x00, 0xE3, 0x4D, 0xE3, 0x51, /* 0x4C-0x4F */
+	0x8C, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x4C, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x55, /* 0x58-0x5B */
+	0xEE, 0x6E, 0x00, 0x00, 0x8D, 0x69, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x97, 0x8D, 0x88, 0xBA, 0xE3, 0x52, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x8B, 0x00, 0x00, /* 0x64-0x67 */
+	0xE3, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x50, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x93, 0x9D, 0xE3, 0x4E, 0xE3, 0x4B, /* 0x70-0x73 */
+	0x00, 0x00, 0x8A, 0x47, 0x90, 0xE2, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x8C, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE3, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xE3, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x56, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x53, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x8C, 0x70, 0x91, 0xB1, 0xE3, 0x58, /* 0x98-0x9B */
+	0x91, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x65, /* 0x9C-0x9F */
+	0xEE, 0x70, 0x00, 0x00, 0xE3, 0x61, 0xE3, 0x5B, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5F, /* 0xA8-0xAB */
+	0x8E, 0xF8, 0x88, 0xDB, 0xE3, 0x5A, 0xE3, 0x62, /* 0xAC-0xAF */
+	0xE3, 0x66, 0x8D, 0x6A, 0x96, 0xD4, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x92, 0xD4, 0xE3, 0x5C, 0x00, 0x00, 0xEE, 0x6F, /* 0xB4-0xB7 */
+	0xE3, 0x64, 0x00, 0x00, 0xE3, 0x59, 0x92, 0x5D, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE3, 0x5E, 0x88, 0xBB, 0x96, 0xC8, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5D, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0xD9, 0x94, 0xEA, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x8D, /* 0xCC-0xCF */
+	0x00, 0x00, 0x97, 0xCE, 0x8F, 0x8F, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE3, 0x8E, 0xEE, 0x71, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xE3, 0x67, 0x00, 0x00, 0x90, 0xFC, 0x00, 0x00, /* 0xD8-0xDB */
+	0xE3, 0x63, 0xE3, 0x68, 0xE3, 0x6A, 0x00, 0x00, /* 0xDC-0xDF */
+	0x92, 0xF7, 0xE3, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xE3, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x95, 0xD2, 0x8A, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x96, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDC, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x6C, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x97, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x6B, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_7E[512] = {
+	0x00, 0x00, 0x89, 0x8F, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x93, 0xEA, 0xE3, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xE3, 0x75, 0xE3, 0x6F, 0xE3, 0x76, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x72, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x9B, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0xC8, 0xE3, 0x74, /* 0x1C-0x1F */
+	0x00, 0x00, 0xE3, 0x71, 0xE3, 0x77, 0xE3, 0x70, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x63, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x44, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x6B, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xE3, 0x73, 0xE3, 0x80, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xE3, 0x7B, 0x00, 0x00, 0xE3, 0x7E, /* 0x34-0x37 */
+	0x00, 0x00, 0xE3, 0x7C, 0xE3, 0x81, 0xE3, 0x7A, /* 0x38-0x3B */
+	0x00, 0x00, 0xE3, 0x60, 0x90, 0xD1, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x94, 0xC9, 0x00, 0x00, 0xE3, 0x7D, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x78, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0x40, 0x8C, 0x71, /* 0x48-0x4B */
+	0x00, 0x00, 0x8F, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x72, 0x00, 0x00, /* 0x50-0x53 */
+	0x90, 0x44, 0x91, 0x55, 0xE3, 0x84, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xE3, 0x86, 0xE3, 0x87, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xE3, 0x83, 0xE3, 0x85, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x79, 0xE3, 0x82, /* 0x64-0x67 */
+	0x00, 0x00, 0xE3, 0x8A, 0xE3, 0x89, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x96, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x8C, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xE3, 0x88, 0x00, 0x00, 0xE3, 0x8C, /* 0x78-0x7B */
+	0xE3, 0x8B, 0xE3, 0x8F, 0x00, 0x00, 0xE3, 0x91, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x5B, 0xE3, 0x8D, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xE3, 0x92, 0xE3, 0x93, 0xED, 0x40, 0x00, 0x00, /* 0x88-0x8B */
+	0xE3, 0x94, 0x00, 0x00, 0xE3, 0x9A, 0x93, 0x5A, /* 0x8C-0x8F */
+	0xE3, 0x96, 0x00, 0x00, 0xE3, 0x95, 0xE3, 0x97, /* 0x90-0x93 */
+	0xE3, 0x98, 0x00, 0x00, 0xE3, 0x99, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x9B, /* 0x98-0x9B */
+	0xE3, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+};
+
+static const unsigned char u2c_7F[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xCA, 0x00, 0x00, /* 0x34-0x37 */
+	0xE3, 0x9D, 0x00, 0x00, 0xE3, 0x9E, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xE3, 0x9F, 0x00, 0x00, 0xEE, 0x73, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xE3, 0xA0, 0xE3, 0xA1, 0xE3, 0xA2, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE3, 0xA3, 0xE3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xE3, 0xA6, 0xE3, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xE3, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA8, /* 0x5C-0x5F */
+	0xE3, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAC, /* 0x64-0x67 */
+	0xE3, 0xAA, 0xE3, 0xAB, 0x8D, 0xDF, 0x8C, 0x72, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0x75, 0x00, 0x00, /* 0x6C-0x6F */
+	0x94, 0xB1, 0x00, 0x00, 0x8F, 0x90, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x94, 0x6C, 0x00, 0x00, 0x94, 0xEB, /* 0x74-0x77 */
+	0xE3, 0xAD, 0x9C, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xAE, 0xE3, 0xB0, /* 0x80-0x83 */
+	0x00, 0x00, 0x97, 0x85, 0xE3, 0xAF, 0xE3, 0xB2, /* 0x84-0x87 */
+	0xE3, 0xB1, 0x00, 0x00, 0x97, 0x72, 0x00, 0x00, /* 0x88-0x8B */
+	0xE3, 0xB3, 0x00, 0x00, 0x94, 0xFC, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xE3, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xB7, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xE3, 0xB6, 0xE3, 0xB5, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xEE, 0x74, 0x00, 0x00, 0xE3, 0xB8, /* 0xA0-0xA3 */
+	0x8C, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x91, 0x41, 0x8B, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xBC, 0xE3, 0xB9, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xBA, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xBD, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE3, 0xBE, 0xE3, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x89, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x89, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE3, 0xC0, 0xE3, 0xC1, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xC2, 0x00, 0x00, /* 0xC8-0xCB */
+	0x97, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x4B, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE3, 0xC4, 0xE3, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x90, 0x89, 0xE3, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xC6, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xE3, 0xC7, 0x00, 0x00, 0x8A, 0xE3, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x8A, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC8, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE3, 0xC9, 0x00, 0x00, 0x96, 0x7C, /* 0xF8-0xFB */
+	0x97, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_80[512] = {
+	0x97, 0x73, 0x98, 0x56, 0x00, 0x00, 0x8D, 0x6C, /* 0x00-0x03 */
+	0xE3, 0xCC, 0x8E, 0xD2, 0xE3, 0xCB, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCD, /* 0x08-0x0B */
+	0x8E, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x91, 0xCF, 0x00, 0x00, 0xE3, 0xCE, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x8D, 0x6B, 0x00, 0x00, 0x96, 0xD5, /* 0x14-0x17 */
+	0xE3, 0xCF, 0xE3, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xE3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xE3, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xE3, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xA8, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0xEB, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD5, /* 0x38-0x3B */
+	0x00, 0x00, 0x92, 0x5E, 0x00, 0x00, 0xE3, 0xD4, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xD7, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xD6, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xD8, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0xB9, 0x00, 0x00, /* 0x54-0x57 */
+	0xE3, 0xD9, 0x00, 0x00, 0xE3, 0xDA, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xB7, 0xE3, 0xDB, /* 0x5C-0x5F */
+	0x00, 0x00, 0x91, 0x8F, 0xE3, 0xDC, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xE3, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xFC, /* 0x6C-0x6F */
+	0xE3, 0xE0, 0x00, 0x00, 0xE3, 0xDF, 0xE3, 0xDE, /* 0x70-0x73 */
+	0x92, 0xAE, 0x00, 0x00, 0xE3, 0xE1, 0x90, 0x45, /* 0x74-0x77 */
+	0x00, 0x00, 0xE3, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE3, 0xE3, 0x98, 0x57, 0xE3, 0xE4, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xE3, 0xE5, 0xE3, 0xE7, 0xE3, 0xE6, 0x94, 0xA3, /* 0x84-0x87 */
+	0x00, 0x00, 0x93, 0xF7, 0x00, 0x00, 0x98, 0x5D, /* 0x88-0x8B */
+	0x94, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE9, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xD1, 0x00, 0x00, /* 0x94-0x97 */
+	0x95, 0x49, 0x00, 0x00, 0xE3, 0xEA, 0xE3, 0xE8, /* 0x98-0x9B */
+	0x00, 0x00, 0x8A, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x8C, 0xD2, 0x8E, 0x88, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x94, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x8C, 0xA8, 0x96, 0x62, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE3, 0xED, 0xE3, 0xEB, 0x00, 0x00, 0x8D, 0x6D, /* 0xAC-0xAF */
+	0x00, 0x00, 0x8D, 0x6E, 0x88, 0xE7, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x8D, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0x78, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDD, /* 0xC0-0xC3 */
+	0xE3, 0xF2, 0x00, 0x00, 0x92, 0x5F, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x94, 0x77, 0x00, 0x00, 0x91, 0xD9, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xF4, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE3, 0xF0, 0xE3, 0xF3, 0xE3, 0xEE, /* 0xD8-0xDB */
+	0x00, 0x00, 0xE3, 0xF1, 0x96, 0x45, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x8C, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x88, 0xFB, 0xE3, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF6, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x93, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x8B, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xE4, 0x45, 0x94, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_81[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x89, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x8B, 0xBA, 0x90, 0xC6, 0x98, 0x65, /* 0x04-0x07 */
+	0x96, 0xAC, 0xE3, 0xF5, 0x90, 0xD2, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x72, 0xE3, 0xF8, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFA, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xE3, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFB, /* 0x2C-0x2F */
+	0x00, 0x00, 0x92, 0x45, 0x00, 0x00, 0x94, 0x5D, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x92, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x42, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x41, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFC, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0x74, 0x00, 0x00, /* 0x4C-0x4F */
+	0x95, 0x85, 0xE4, 0x44, 0x00, 0x00, 0xE4, 0x43, /* 0x50-0x53 */
+	0x8D, 0x6F, 0x98, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x54, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xE4, 0x48, 0xE4, 0x49, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xEE, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x47, 0x00, 0x00, /* 0x6C-0x6F */
+	0x8D, 0x98, 0xE4, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xE4, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x92, 0xB0, 0x95, 0xA0, 0x91, 0x42, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xDA, /* 0x7C-0x7F */
+	
+	0xE4, 0x4E, 0x00, 0x00, 0xE4, 0x4F, 0xE4, 0x4B, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xE4, 0x4C, 0x00, 0x00, 0xE4, 0x4D, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x70, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x55, /* 0x90-0x93 */
+	0x00, 0x00, 0xE4, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0x86, 0x00, 0x00, /* 0x98-0x9B */
+	0x96, 0x8C, 0x95, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xE4, 0x50, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x53, /* 0xA0-0xA3 */
+	0xE4, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x96, 0x63, 0xE4, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xE4, 0x57, 0x00, 0x00, 0x00, 0x00, 0x91, 0x56, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE4, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE4, 0x5A, 0x00, 0x00, 0xE4, 0x5E, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE4, 0x5B, 0xE4, 0x59, 0x94, 0x5E, /* 0xBC-0xBF */
+	0xE4, 0x5C, 0x00, 0x00, 0xE4, 0x5D, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE4, 0x64, 0xE4, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE4, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xE4, 0x61, 0x00, 0x00, 0x91, 0x9F, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xE4, 0x63, 0xE4, 0x62, 0xE4, 0x65, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x66, /* 0xDC-0xDF */
+	0xE4, 0x67, 0x00, 0x00, 0x00, 0x00, 0x90, 0x62, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x89, 0xE7, 0x00, 0x00, 0xE4, 0x68, /* 0xE4-0xE7 */
+	0x97, 0xD5, 0x00, 0x00, 0x8E, 0xA9, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x8F, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8A, /* 0xF0-0xF3 */
+	0x92, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x69, 0xE4, 0x6A, /* 0xF8-0xFB */
+	0x89, 0x50, 0x00, 0x00, 0xE4, 0x6B, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_82[512] = {
+	0x00, 0x00, 0xE4, 0x6C, 0xE4, 0x6D, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xE4, 0x6E, 0x00, 0x00, 0xE4, 0x6F, /* 0x04-0x07 */
+	0x8B, 0xBB, 0x9D, 0xA8, 0xE4, 0x70, 0x00, 0x00, /* 0x08-0x0B */
+	0x90, 0xE3, 0xE4, 0x71, 0x8E, 0xC9, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE4, 0x72, 0x00, 0x00, 0x98, 0xAE, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x73, 0x95, 0xDC, /* 0x14-0x17 */
+	0x8A, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x91, 0x43, /* 0x18-0x1B */
+	0x8F, 0x77, 0x00, 0x00, 0x95, 0x91, 0x8F, 0x4D, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xE4, 0x74, 0x8D, 0x71, 0xE4, 0x75, /* 0x28-0x2B */
+	0x94, 0xCA, 0x00, 0x00, 0xE4, 0x84, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x77, /* 0x30-0x33 */
+	0x00, 0x00, 0x91, 0xC7, 0x94, 0x95, 0x8C, 0xBD, /* 0x34-0x37 */
+	0xE4, 0x76, 0x91, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xE4, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xF8, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xE4, 0x7A, 0xE4, 0x79, 0xE4, 0x7C, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xE4, 0x7B, 0x00, 0x00, 0xE4, 0x7D, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x80, 0x00, 0x00, /* 0x60-0x63 */
+	0xE4, 0x7E, 0x00, 0x00, 0x8A, 0xCD, 0x00, 0x00, /* 0x64-0x67 */
+	0xE4, 0x81, 0x00, 0x00, 0xE4, 0x82, 0xE4, 0x83, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0xAF, 0x97, 0xC7, /* 0x6C-0x6F */
+	0x00, 0x00, 0xE4, 0x85, 0x90, 0x46, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x90, 0xE4, 0x86, /* 0x74-0x77 */
+	0xE4, 0x87, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x88, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xF0, /* 0x88-0x8B */
+	0x00, 0x00, 0xE4, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x8A, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x95, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x8E, 0xC5, 0x00, 0x00, 0xE4, 0x8C, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x8A, 0x48, 0x88, 0xB0, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x8B, /* 0xA8-0xAB */
+	0xE4, 0x8E, 0x94, 0x6D, 0x00, 0x00, 0x90, 0x63, /* 0xAC-0xAF */
+	0x00, 0x00, 0x89, 0xD4, 0x00, 0x00, 0x96, 0x46, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x8C, 0x7C, 0x8B, 0xDA, 0x00, 0x00, 0xE4, 0x8D, /* 0xB8-0xBB */
+	0x00, 0x00, 0x89, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x8A, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x89, 0x91, 0xE4, 0x92, 0x97, 0xE8, /* 0xD0-0xD3 */
+	0x91, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x95, 0x63, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE4, 0x9E, 0x00, 0x00, 0x89, 0xD5, /* 0xD8-0xDB */
+	0xE4, 0x9C, 0x00, 0x00, 0xE4, 0x9A, 0xE4, 0x91, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE4, 0x8F, 0x00, 0x00, 0xE4, 0x90, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x8E, 0xE1, 0x8B, 0xEA, 0x92, 0x97, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xCF, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x89, 0x70, 0x00, 0x00, 0xE4, 0x94, /* 0xF0-0xF3 */
+	0xE4, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE4, 0x99, 0xE4, 0x95, 0xE4, 0x98, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_83[512] = {
+	0x00, 0x00, 0xEE, 0x76, 0x96, 0xCE, 0xE4, 0x97, /* 0x00-0x03 */
+	0x89, 0xD6, 0x8A, 0x9D, 0xE4, 0x9B, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xE4, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x73, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xA1, 0xE4, 0xAA, /* 0x14-0x17 */
+	0xE4, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x88, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB2, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x88, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA9, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA8, /* 0x2C-0x2F */
+	0x00, 0x00, 0xE4, 0xA3, 0xE4, 0xA2, 0x00, 0x00, /* 0x30-0x33 */
+	0xE4, 0xA0, 0xE4, 0x9F, 0x92, 0x83, 0x00, 0x00, /* 0x34-0x37 */
+	0x91, 0xF9, 0xE4, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xE4, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xE4, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x91, 0x90, 0x8C, 0x74, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x60, /* 0x4C-0x4F */
+	0xE4, 0xA6, 0x00, 0x00, 0x8D, 0x72, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x91, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x77, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB8, /* 0x70-0x73 */
+	0x00, 0x00, 0xE4, 0xB9, 0x00, 0x00, 0x89, 0xD7, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xAC, /* 0x78-0x7B */
+	0xE4, 0xB6, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x78, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xE4, 0xAC, 0x00, 0x00, 0xE4, 0xB4, /* 0x84-0x87 */
+	0x00, 0x00, 0xE4, 0xBB, 0xE4, 0xB5, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xB3, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x96, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xB1, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xAD, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0xCE, 0xE4, 0xAF, /* 0x9C-0x9F */
+	0xE4, 0xBA, 0x00, 0x00, 0xE4, 0xB0, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xE4, 0xBC, 0x00, 0x00, 0xE4, 0xAE, 0x94, 0x9C, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x97, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE4, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE4, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xE4, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x90, 0x9B, 0x00, 0x00, 0xEE, 0x79, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x65, 0x00, 0x00, /* 0xC8-0xCB */
+	0x8B, 0xDB, 0x00, 0x00, 0xE4, 0xC0, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xD9, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0xD2, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xE4, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x8D, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x93, 0x70, /* 0xDC-0xDF */
+	0xE4, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x95, 0xEC, 0x00, 0x00, 0xE4, 0xBF, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xD8, /* 0xEC-0xEF */
+	0x8C, 0xD4, 0x95, 0x48, 0xE4, 0xC9, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE4, 0xBD, 0x00, 0x00, 0xEE, 0x7A, 0xE4, 0xC6, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD0, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE4, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_84[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC2, /* 0x00-0x03 */
+	0x93, 0xB8, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC7, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC4, /* 0x08-0x0B */
+	0x96, 0x47, 0xE4, 0xCA, 0x88, 0xDE, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xBE, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE4, 0xCC, 0x00, 0x00, 0xE4, 0xCB, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x94, 0x8B, 0xE4, 0xD2, 0x00, 0x00, /* 0x28-0x2B */
+	0xE4, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8A, 0x9E, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xE4, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xE4, 0xD3, 0x97, 0x8E, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xDC, 0x00, 0x00, /* 0x44-0x47 */
+	0xEE, 0x7B, 0x97, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xA8, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x98, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x8B, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x95, 0x92, 0xE4, 0xE2, 0x93, 0x9F, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x88, 0xAF, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE4, 0xDB, 0x00, 0x00, 0xE4, 0xD7, /* 0x68-0x6B */
+	0x91, 0x92, 0xE4, 0xD1, 0xE4, 0xD9, 0xE4, 0xDE, /* 0x6C-0x6F */
+	0x00, 0x00, 0x94, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x88, 0xA8, 0x00, 0x00, 0xE4, 0xD6, /* 0x74-0x77 */
+	0x00, 0x00, 0xE4, 0xDF, 0x95, 0x98, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xDA, 0x00, 0x00, /* 0x80-0x83 */
+	0xE4, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD3, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x8F, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x8E, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x96, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x95, 0x66, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE5, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE4, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xE4, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0x97, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xEE, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x8F, 0xF6, 0xE4, 0xE3, 0x00, 0x00, 0xE4, 0xE8, /* 0xB8-0xBB */
+	0x91, 0x93, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE4, /* 0xBC-0xBF */
+	0x00, 0x00, 0xE4, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x92, 0x7E, 0x00, 0x00, 0xE4, 0xEC, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x97, 0x75, 0xE4, 0xE1, 0x8A, 0x57, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE4, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xE4, 0xEA, 0x96, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE4, 0xE6, 0xE4, 0xE9, 0x00, 0x00, /* 0xD8-0xDB */
+	0xED, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x96, 0x48, 0x00, 0x00, 0x98, 0x40, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE4, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xE4, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_85[512] = {
+	0x8E, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xCF, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x95, 0xCC, 0x00, 0x00, 0x96, 0xA0, /* 0x10-0x13 */
+	0xE4, 0xF7, 0xE4, 0xF6, 0x00, 0x00, 0xE4, 0xF2, /* 0x14-0x17 */
+	0xE4, 0xF3, 0x00, 0x00, 0x89, 0x55, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, /* 0x1C-0x1F */
+	0x00, 0x00, 0xE4, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0xD3, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xE4, 0xF4, 0x88, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x91, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x95, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xE4, 0xF9, 0xE5, 0x40, 0x00, 0x00, 0x94, 0xD7, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xE4, 0xFC, 0x8F, 0xD4, 0x8E, 0xC7, 0xE5, 0x42, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0xBC, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x7D, /* 0x50-0x53 */
+	0x00, 0x00, 0xE5, 0x43, 0x00, 0x00, 0x95, 0x99, /* 0x54-0x57 */
+	0xE4, 0xFB, 0xEE, 0x7E, 0xE4, 0xD4, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFA, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x98, 0x6E, 0x93, 0xA0, 0x95, 0x93, 0xEE, 0x80, /* 0x68-0x6B */
+	0x00, 0x00, 0xE5, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x50, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x51, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xE5, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x94, 0x96, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x4E, /* 0x84-0x87 */
+	0xE5, 0x46, 0x00, 0x00, 0xE5, 0x48, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xE5, 0x52, 0xE5, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xE5, 0x4B, 0x00, 0x00, 0x00, 0x00, 0x89, 0x92, /* 0x94-0x97 */
+	0x00, 0x00, 0x93, 0xE3, 0x00, 0x00, 0xE5, 0x4C, /* 0x98-0x9B */
+	0xE5, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE5, 0x45, 0x00, 0x00, 0x91, 0x45, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xE5, 0x49, 0x8E, 0x46, 0x90, 0x64, 0x8C, 0x4F, /* 0xA8-0xAB */
+	0x96, 0xF2, 0x00, 0x00, 0x96, 0xF7, 0x8F, 0x92, /* 0xAC-0xAF */
+	0xEE, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE5, 0x56, 0xE5, 0x54, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x98, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE5, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x97, 0x95, 0x00, 0x00, 0xE5, 0x55, /* 0xCC-0xCF */
+	0xE5, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE5, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xE5, 0x5B, 0xE5, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x93, 0xA1, 0xE5, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x94, 0xCB, 0xE5, 0x4D, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x93, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE5, 0x5C, 0xE5, 0x61, 0x91, 0x94, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x60, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_86[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x41, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x62, 0x91, 0x68, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x5D, 0xE5, 0x5F, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x5E, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x9F, 0x50, 0x9F, 0x41, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x64, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x63, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x97, 0x96, 0x00, 0x00, 0xE1, 0xBA, /* 0x2C-0x2F */
+	0xE5, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x66, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xE5, 0x67, 0x8C, 0xD5, 0x00, 0x00, /* 0x4C-0x4F */
+	0x8B, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xE5, 0x69, 0x99, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x95, 0x00, 0x00, /* 0x58-0x5B */
+	0x97, 0xB8, 0x00, 0x00, 0x8B, 0xF1, 0xE5, 0x6A, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6B, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x8E, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xE5, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x93, 0xF8, 0x00, 0x00, 0x88, 0xB8, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xE1, 0xE5, 0x71, /* 0x88-0x8B */
+	0xE5, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6D, /* 0x90-0x93 */
+	0x00, 0x00, 0x8E, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6E, /* 0xA0-0xA3 */
+	0x94, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x7A, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x74, /* 0xAC-0xAF */
+	0xE5, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x73, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xE5, 0x75, 0x00, 0x00, 0xE5, 0x76, 0x8E, 0xD6, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE5, 0x78, 0x00, 0x00, 0x92, 0x60, /* 0xC8-0xCB */
+	0x00, 0x00, 0x8C, 0x75, 0x8A, 0x61, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE5, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x8A, 0x5E, 0x00, 0x00, 0xE5, 0x81, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x7C, 0xE5, 0x80, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x94, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xE5, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xE5, 0x7E, 0x95, 0x67, 0x94, 0xD8, 0xE5, 0x82, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x91, 0xFB, 0xE5, 0x8C, 0x00, 0x00, 0xE5, 0x88, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xE9, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_87[512] = {
+	0xE5, 0x86, 0x00, 0x00, 0x96, 0x49, 0xE5, 0x87, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x84, 0x00, 0x00, /* 0x04-0x07 */
+	0xE5, 0x85, 0xE5, 0x8A, 0xE5, 0x8D, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE5, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE5, 0x89, 0xE5, 0x83, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x92, 0x77, 0x00, 0x00, 0xE5, 0x94, 0x00, 0x00, /* 0x18-0x1B */
+	0x96, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xE5, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xE5, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xE5, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x90, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x91, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x8F, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x90, 0xE4, 0x00, 0x00, 0x98, 0x58, /* 0x48-0x4B */
+	0xE5, 0x98, 0x00, 0x00, 0xE5, 0x99, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x9F, /* 0x50-0x53 */
+	0x00, 0x00, 0x90, 0x49, 0x00, 0x00, 0xE5, 0x9B, /* 0x54-0x57 */
+	0x00, 0x00, 0xE5, 0x9E, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x96, /* 0x5C-0x5F */
+	0xE5, 0x95, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA0, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xDA, 0x00, 0x00, /* 0x64-0x67 */
+	0xE5, 0x9C, 0x00, 0x00, 0xE5, 0xA1, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x9D, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xE5, 0x9A, 0x00, 0x00, 0x92, 0xB1, 0x00, 0x00, /* 0x74-0x77 */
+	0xE5, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x88, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xA5, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x97, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA4, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xA3, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAC, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA6, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAE, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0x86, 0xE5, 0xB1, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE5, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xE5, 0xAD, 0x00, 0x00, 0xE5, 0xB0, 0xE5, 0xAF, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA7, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xE5, 0xAA, 0x00, 0x00, 0xE5, 0xBB, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xE5, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB2, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xB3, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xB8, 0xE5, 0xB9, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x8A, 0x49, 0x00, 0x00, 0x8B, 0x61, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xB7, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_88[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xE5, 0xA2, 0x00, 0x00, 0xEE, 0x85, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE5, 0xB6, 0xE5, 0xBA, 0xE5, 0xB5, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE5, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xE5, 0xBE, 0xE5, 0xBD, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xE5, 0xC0, 0xE5, 0xBF, 0xE5, 0x79, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC4, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xE5, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xC2, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xE5, 0xC3, 0x00, 0x00, 0xE5, 0xC5, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x8C, 0x8C, 0x00, 0x00, 0xE5, 0xC7, 0x00, 0x00, /* 0x40-0x43 */
+	0xE5, 0xC6, 0x00, 0x00, 0x8F, 0x4F, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x8D, 0x73, 0x9F, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xC8, 0x8F, 0x70, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x58, /* 0x54-0x57 */
+	0x00, 0x00, 0xE5, 0xC9, 0x00, 0x00, 0x89, 0x71, /* 0x58-0x5B */
+	0x00, 0x00, 0x8F, 0xD5, 0xE5, 0xCA, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x8D, 0x74, 0xE5, 0xCB, 0x88, 0xDF, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x95, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCC, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x90, 0x8A, 0x00, 0x00, 0xE5, 0xD3, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xE5, 0xD0, 0x00, 0x00, 0x92, 0x8F, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE5, 0xD1, 0xE5, 0xCE, 0x8B, 0xDC, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE5, 0xCD, 0xE5, 0xD4, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x8C, 0x55, 0x00, 0x00, 0x00, 0x00, 0x91, 0xDC, /* 0x88-0x8B */
+	0x00, 0x00, 0xE5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xD6, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0xB3, 0xE5, 0xD5, /* 0x94-0x97 */
+	0x00, 0x00, 0xE5, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xCF, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xD9, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE5, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xED, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xD7, 0x00, 0x00, /* 0xAC-0xAF */
+	0xE5, 0xDC, 0xE5, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x8C, 0xD1, 0xE5, 0xD2, 0x00, 0x00, 0x88, 0xBF, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDD, /* 0xBC-0xBF */
+	0x00, 0x00, 0x8D, 0xD9, 0x97, 0xF4, 0xE5, 0xDF, /* 0xC0-0xC3 */
+	0xE5, 0xE0, 0x91, 0x95, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xA0, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE5, 0xE1, 0x97, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xE5, 0xE2, 0xE5, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x95, 0xE2, 0xE5, 0xE4, 0x00, 0x00, 0x8D, 0xBE, /* 0xDC-0xDF */
+	0x00, 0x00, 0x97, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xE5, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xEA, 0x8F, 0xD6, /* 0xF0-0xF3 */
+	0xE5, 0xE8, 0xEE, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x97, 0x87, 0xE5, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xE5, 0xE7, 0x90, 0xBB, 0x90, 0x9E, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_89[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xE6, 0x00, 0x00, /* 0x00-0x03 */
+	0xE5, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x95, 0xA1, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xED, 0x00, 0x00, /* 0x08-0x0B */
+	0xE5, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x8A, 0x8C, 0x00, 0x00, 0x96, 0x4A, 0xE5, 0xEE, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xED, 0x41, 0xE5, 0xFA, 0xE5, 0xF0, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xE5, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xF2, 0xE5, 0xF3, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xF7, 0x00, 0x00, /* 0x34-0x37 */
+	0xE5, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF6, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xE5, 0xF4, 0x00, 0x00, 0xE5, 0xEF, /* 0x40-0x43 */
+	0xE5, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xE5, 0xF9, 0xE8, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xA6, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xFC, 0x8B, 0xDD, /* 0x5C-0x5F */
+	0xE5, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xE6, 0x41, 0x00, 0x00, 0xE6, 0x40, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x43, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xE6, 0x42, 0x00, 0x00, 0xE6, 0x44, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x8F, 0x50, 0x00, 0x00, /* 0x70-0x73 */
+	0xE6, 0x45, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x46, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x47, 0x90, 0xBC, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x97, 0x76, 0x00, 0x00, 0xE6, 0x48, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xA2, 0x94, 0x65, /* 0x84-0x87 */
+	0xE6, 0x49, 0x00, 0x00, 0xE6, 0x4A, 0x8C, 0xA9, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x4B, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x4B, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x8B, 0x94, 0x60, /* 0x94-0x97 */
+	0xE6, 0x4C, 0x00, 0x00, 0x8A, 0x6F, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE6, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x4F, 0x97, 0x97, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xE6, 0x4E, 0x90, 0x65, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE6, 0x50, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x51, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x52, 0x8A, 0xCF, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x53, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE6, 0x54, 0x00, 0x00, 0xE6, 0x55, /* 0xBC-0xBF */
+	0xE6, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x8A, 0x70, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x57, 0x00, 0x00, /* 0xD8-0xDB */
+	0xE6, 0x58, 0xE6, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xF0, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0x47, 0xE6, 0x5A, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE6, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xE6, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_8A[512] = {
+	0x8C, 0xBE, 0x00, 0x00, 0x92, 0xF9, 0xE6, 0x5D, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x8C, 0x76, 0x00, 0x00, 0x90, 0x75, 0x00, 0x00, /* 0x08-0x0B */
+	0xE6, 0x60, 0x00, 0x00, 0x93, 0xA2, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE6, 0x5F, 0x00, 0x00, 0xEE, 0x87, 0x8C, 0x50, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x5E, 0x91, 0xF5, /* 0x14-0x17 */
+	0x8B, 0x4C, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x61, /* 0x18-0x1B */
+	0x00, 0x00, 0xE6, 0x62, 0x00, 0x00, 0x8F, 0xD7, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x8D, /* 0x20-0x23 */
+	0x00, 0x00, 0xE6, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x4B, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x90, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8B, 0x96, 0x00, 0x00, 0x96, 0xF3, /* 0x30-0x33 */
+	0x91, 0x69, 0x00, 0x00, 0xE6, 0x64, 0xEE, 0x88, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0x66, 0x92, 0x90, /* 0x38-0x3B */
+	0x8F, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xE6, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x68, 0x00, 0x00, /* 0x44-0x47 */
+	0xE6, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x8D, 0xBC, 0x91, 0xC0, 0xE6, 0x67, 0x00, 0x00, /* 0x50-0x53 */
+	0x8F, 0xD9, 0x95, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x66, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x8C, 0x00, 0x00, /* 0x5C-0x5F */
+	0x89, 0x72, 0x00, 0x00, 0xE6, 0x6D, 0x8C, 0x77, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x8E, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x8E, 0x8D, 0x00, 0x00, 0x98, 0x6C, /* 0x68-0x6B */
+	0xE6, 0x6C, 0xE6, 0x6B, 0x91, 0x46, 0x00, 0x00, /* 0x6C-0x6F */
+	0x8B, 0x6C, 0x98, 0x62, 0x8A, 0x59, 0x8F, 0xDA, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xEE, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xE6, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x6F, 0x00, 0x00, /* 0x80-0x83 */
+	0xE6, 0x70, 0xE6, 0x6E, 0x00, 0x00, 0x8C, 0xD6, /* 0x84-0x87 */
+	0x00, 0x00, 0x97, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x8E, 0x8F, 0x94, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE6, 0x73, 0x00, 0x00, 0x90, 0xBE, /* 0x90-0x93 */
+	0x00, 0x00, 0x92, 0x61, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x97, 0x55, 0x00, 0x00, 0xE6, 0x76, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0xEA, 0x00, 0x00, /* 0x9C-0x9F */
+	0x90, 0xBD, 0xE6, 0x72, 0x00, 0x00, 0xE6, 0x77, /* 0xA0-0xA3 */
+	0x8C, 0xEB, 0xE6, 0x74, 0xE6, 0x75, 0xEE, 0x8A, /* 0xA4-0xA7 */
+	0xE6, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x90, 0xE0, 0x93, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x92, 0x4E, 0x00, 0x00, 0x89, 0xDB, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x94, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x8B, 0x62, 0x00, 0x00, 0xEE, 0x8B, 0x92, 0xB2, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x7A, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xE6, 0x78, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6B, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xBF, /* 0xC8-0xCB */
+	0x8A, 0xD0, 0xE6, 0x79, 0x00, 0x00, 0x90, 0x7A, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xC8, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0x5F, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x7B, 0xE6, 0x87, /* 0xD8-0xDB */
+	0x92, 0xB3, 0x00, 0x00, 0xE6, 0x86, 0xEE, 0x8C, /* 0xDC-0xDF */
+	0xE6, 0x83, 0xE6, 0x8B, 0xE6, 0x84, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xE6, 0x80, 0x00, 0x00, 0x92, 0xFA, 0xE6, 0x7E, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7C, /* 0xE8-0xEB */
+	0x00, 0x00, 0x97, 0x40, 0x8E, 0x90, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE6, 0x81, 0x00, 0x00, 0xE6, 0x7D, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x8E, 0xE6, 0x85, /* 0xF4-0xF7 */
+	0x8F, 0x94, 0x00, 0x00, 0x8C, 0xBF, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0xF8, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8B[512] = {
+	0x96, 0x64, 0x89, 0x79, 0x88, 0xE0, 0x00, 0x00, /* 0x00-0x03 */
+	0x93, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x89, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xE6, 0x88, 0x00, 0x00, 0x93, 0xE4, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE6, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xE6, 0x82, 0x00, 0x00, 0xE6, 0x8C, 0xE6, 0x8E, /* 0x14-0x17 */
+	0x00, 0x00, 0x8C, 0xAA, 0xE6, 0x8A, 0x8D, 0x75, /* 0x18-0x1B */
+	0x00, 0x00, 0x8E, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE6, 0x8F, 0x97, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x92, 0x00, 0x00, /* 0x24-0x27 */
+	0xE6, 0x95, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x93, /* 0x28-0x2B */
+	0x95, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x90, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x8B, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x94, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xE6, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xE6, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xE6, 0x97, 0x00, 0x00, 0xE6, 0x99, 0xE6, 0x98, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x8F, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x9B, 0x00, 0x00, /* 0x54-0x57 */
+	0x8E, 0xAF, 0x00, 0x00, 0xE6, 0x9D, 0xE6, 0x9C, /* 0x58-0x5B */
+	0x95, 0x88, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9F, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x78, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9E, /* 0x68-0x6B */
+	0xE6, 0xA0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA1, /* 0x6C-0x6F */
+	0x8B, 0x63, 0xE3, 0xBF, 0x8F, 0xF7, 0x00, 0x00, /* 0x70-0x73 */
+	0xE6, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xEC, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE6, 0xA3, 0x00, 0x00, 0xEE, 0x90, /* 0x7C-0x7F */
+	
+	0xE6, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x5D, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x9D, 0xCC, 0x00, 0x00, /* 0x88-0x8B */
+	0xE6, 0xA5, 0x00, 0x00, 0xE6, 0xA6, 0x00, 0x00, /* 0x8C-0x8F */
+	0x8F, 0x51, 0x00, 0x00, 0xE6, 0xA7, 0xE6, 0xA8, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xA9, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xE6, 0xAA, 0xE6, 0xAB, 0x00, 0x00, /* 0x98-0x9B */
+};
+
+static const unsigned char u2c_8C[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x4A, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xAC, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAE, /* 0x3C-0x3F */
+	0x00, 0x00, 0xE6, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0xA4, 0x00, 0x00, /* 0x44-0x47 */
+	0xE6, 0xAF, 0x00, 0x00, 0x96, 0x4C, 0x00, 0x00, /* 0x48-0x4B */
+	0xE6, 0xB0, 0x00, 0x00, 0xE6, 0xB1, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE6, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE6, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0xD8, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x8F, 0xDB, 0xE6, 0xB4, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0x8B, 0x98, 0xAC, /* 0x68-0x6B */
+	0xE6, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xE6, 0xB6, 0x95, 0x5E, 0xE6, 0xB7, 0x00, 0x00, /* 0x78-0x7B */
+	0xE6, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xB8, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xE6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xE6, 0xB9, 0xE6, 0xBB, 0x00, 0x00, /* 0x88-0x8B */
+	0x96, 0x65, 0xE6, 0xBC, 0xE6, 0xBD, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xE6, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xE6, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x8A, 0x4C, 0x92, 0xE5, 0x00, 0x00, /* 0x9C-0x9F */
+	0x95, 0x89, 0x8D, 0xE0, 0x8D, 0x76, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x6E, /* 0xA4-0xA7 */
+	0x89, 0xDD, 0x94, 0xCC, 0xE6, 0xC3, 0x8A, 0xD1, /* 0xA8-0xAB */
+	0x90, 0xD3, 0xE6, 0xC2, 0xE6, 0xC7, 0x92, 0x99, /* 0xAC-0xAF */
+	0x96, 0xE1, 0x00, 0x00, 0xE6, 0xC5, 0xE6, 0xC6, /* 0xB0-0xB3 */
+	0x8B, 0x4D, 0x00, 0x00, 0xE6, 0xC8, 0x94, 0x83, /* 0xB4-0xB7 */
+	0x91, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x94, 0xEF, /* 0xB8-0xBB */
+	0x93, 0x5C, 0xE6, 0xC4, 0x00, 0x00, 0x96, 0x66, /* 0xBC-0xBF */
+	0x89, 0xEA, 0xE6, 0xCA, 0x98, 0x47, 0x92, 0xC0, /* 0xC0-0xC3 */
+	0x98, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x91, /* 0xC4-0xC7 */
+	0xE6, 0xC9, 0x00, 0x00, 0x91, 0xAF, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE6, 0xDA, 0x91, 0x47, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x93, 0xF6, 0x00, 0x00, 0x95, 0x6F, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xCD, 0x8E, 0x5E, /* 0xD8-0xDB */
+	0x8E, 0x92, 0x00, 0x00, 0x8F, 0xDC, 0x00, 0x00, /* 0xDC-0xDF */
+	0x94, 0x85, 0x00, 0x00, 0x8C, 0xAB, 0xE6, 0xCC, /* 0xE0-0xE3 */
+	0xE6, 0xCB, 0x00, 0x00, 0x95, 0x8A, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x93, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xEE, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xEE, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xCF, 0xE6, 0xD0, /* 0xF8-0xFB */
+	0x8D, 0x77, 0xE6, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8D[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xE6, 0xD1, 0xE6, 0xD2, 0x00, 0x00, 0xE6, 0xD4, /* 0x04-0x07 */
+	0x91, 0xA1, 0x00, 0x00, 0xE6, 0xD3, 0x8A, 0xE4, /* 0x08-0x0B */
+	0x00, 0x00, 0xE6, 0xD6, 0x00, 0x00, 0xE6, 0xD5, /* 0x0C-0x0F */
+	0xE6, 0xD7, 0x00, 0x00, 0xEE, 0x93, 0xE6, 0xD9, /* 0x10-0x13 */
+	0xE6, 0xDB, 0x00, 0x00, 0xE6, 0xDC, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x90, 0xD4, 0x00, 0x00, 0x8E, 0xCD, 0xE6, 0xDD, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x71, /* 0x68-0x6B */
+	0x00, 0x00, 0xE6, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x91, 0x96, 0xE6, 0xDF, 0x00, 0x00, 0xE6, 0xE0, /* 0x70-0x73 */
+	0x95, 0x8B, 0x00, 0x00, 0xEE, 0x94, 0x8B, 0x4E, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE6, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x92, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x7A, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xE6, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xEF, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x90, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xAB, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xE5, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xE4, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xE3, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xEB, /* 0xC8-0xCB */
+	0xE6, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE6, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xE8, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, 0xE6, 0xEA, /* 0xD8-0xDB */
+	0x00, 0x00, 0x8B, 0x97, 0x00, 0x00, 0xE6, 0xEE, /* 0xDC-0xDF */
+	0x00, 0x00, 0x90, 0xD5, 0x00, 0x00, 0xE6, 0xEF, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x8C, 0xD7, 0x00, 0x00, 0xE6, 0xEC, 0xE6, 0xED, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x48, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xB5, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x91, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xE6, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xF3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xE6, 0xF1, 0xE6, 0xF2, 0x97, 0x78, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xA5, /* 0x0C-0x0F */
+	0xE6, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xE6, 0xF4, 0xE6, 0xF5, 0xE6, 0xF7, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x48, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE6, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xE6, 0xFB, 0xE6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xF8, 0x00, 0x00, /* 0x40-0x43 */
+	0x92, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x40, /* 0x44-0x47 */
+	0xE7, 0x44, 0xE7, 0x41, 0xE6, 0xFC, 0x00, 0x00, /* 0x48-0x4B */
+	0xE7, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE7, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE7, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xE7, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xD6, /* 0x5C-0x5F */
+	0xE7, 0x47, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x49, /* 0x60-0x63 */
+	0xE7, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x4C, 0x00, 0x00, /* 0x70-0x73 */
+	0x8F, 0x52, 0x00, 0x00, 0xE7, 0x4B, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xE7, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE7, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xE7, 0x51, 0xE7, 0x50, 0x00, 0x00, 0xE7, 0x4F, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x53, 0xE7, 0x52, /* 0x88-0x8B */
+	0x00, 0x00, 0x96, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE7, 0x55, 0x00, 0x00, 0xE7, 0x54, /* 0x90-0x93 */
+	0xE7, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xE7, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE7, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x58, 0x90, 0x67, /* 0xA8-0xAB */
+	0xE7, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xEB, /* 0xAC-0xAF */
+	0xE7, 0x5B, 0xE7, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x5E, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE7, 0x5F, 0xE7, 0x5C, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE7, 0x60, 0x00, 0x00, 0x8E, 0xD4, 0xE7, 0x61, /* 0xC8-0xCB */
+	0x8B, 0x4F, 0x8C, 0x52, 0x00, 0x00, 0xEE, 0x96, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0xAC, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x62, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xEE, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0x5D, 0xE7, 0x63, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x66, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x8E, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x65, /* 0xF8-0xFB */
+	0xE7, 0x64, 0x8C, 0x79, 0xE7, 0x67, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8F[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x72, /* 0x00-0x03 */
+	0x00, 0x00, 0xE7, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x8D, 0xDA, 0xE7, 0x68, 0x00, 0x00, /* 0x08-0x0B */
+	0xE7, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x6B, 0xE7, 0x6D, /* 0x10-0x13 */
+	0x95, 0xE3, 0xE7, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE7, 0x6C, 0x00, 0x00, 0xE7, 0x70, /* 0x18-0x1B */
+	0xE7, 0x6E, 0x8B, 0x50, 0x00, 0x00, 0xE7, 0x6F, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x72, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x94, 0x79, 0x97, 0xD6, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x53, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x73, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x97, 0x41, 0xE7, 0x75, 0x00, 0x00, 0xE7, 0x74, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x78, 0x97, 0x60, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x77, 0x00, 0x00, /* 0x40-0x43 */
+	0x8A, 0x8D, 0xE7, 0x76, 0xE7, 0x7B, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xE7, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xE7, 0x79, 0x93, 0x51, 0xE7, 0x7C, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x7D, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xE7, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x8C, /* 0x5C-0x5F */
+	0x00, 0x00, 0x8C, 0x44, 0xE7, 0x80, 0xE7, 0x81, /* 0x60-0x63 */
+	0xE7, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x68, /* 0x98-0x9B */
+	0xE7, 0x83, 0x00, 0x00, 0x8E, 0xAB, 0xE7, 0x84, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x85, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x9F, /* 0xA4-0xA7 */
+	0x99, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xE7, 0x86, 0xE3, 0x90, 0xE7, 0x87, /* 0xAC-0xAF */
+	0x92, 0x43, 0x90, 0x4A, 0x94, 0x5F, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x88, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0xD3, 0x92, 0xD2, /* 0xB8-0xBB */
+	0x8D, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x92, 0x48, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x49, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x96, 0x98, 0x90, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x8C, 0x7D, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x8B, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x95, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x89, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x8B, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE7, 0x8A, 0x89, 0xDE, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x93, 0xF4, 0xE7, 0x8C, 0x94, 0x97, /* 0xE8-0xEB */
+	0x00, 0x00, 0x93, 0x52, 0x00, 0x00, 0xE7, 0x8D, /* 0xEC-0xEF */
+	0x8F, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE7, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x96, 0xC0, /* 0xF4-0xF7 */
+	0xE7, 0x9E, 0xE7, 0x91, 0xE7, 0x92, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x92, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_90[512] = {
+	0x91, 0xDE, 0x91, 0x97, 0x00, 0x00, 0x93, 0xA6, /* 0x00-0x03 */
+	0x00, 0x00, 0xE7, 0x90, 0x8B, 0x74, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x99, /* 0x08-0x0B */
+	0x00, 0x00, 0xE7, 0x96, 0xE7, 0xA3, 0x93, 0xA7, /* 0x0C-0x0F */
+	0x92, 0x80, 0xE7, 0x93, 0x00, 0x00, 0x92, 0xFC, /* 0x10-0x13 */
+	0x93, 0x72, 0xE7, 0x94, 0xE7, 0x98, 0x90, 0x80, /* 0x14-0x17 */
+	0x00, 0x00, 0x94, 0x87, 0x92, 0xCA, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x90, 0xC0, 0xE7, 0x97, 0x91, 0xAC, /* 0x1C-0x1F */
+	0x91, 0xA2, 0xE7, 0x95, 0x88, 0xA7, 0x98, 0x41, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x9A, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0xDF, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x8F, 0x54, 0x90, 0x69, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xE7, 0x9C, 0xE7, 0x9B, 0x00, 0x00, /* 0x34-0x37 */
+	0x88, 0xED, 0xE7, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x95, 0x4E, 0x00, 0x00, 0xE7, 0xA5, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x93, 0xD9, 0x90, 0x8B, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x92, 0x78, 0x00, 0x00, 0x8B, 0xF6, /* 0x44-0x47 */
+	0x00, 0x00, 0xE7, 0xA4, 0x97, 0x56, 0x89, 0x5E, /* 0x48-0x4B */
+	0x00, 0x00, 0x95, 0xD5, 0x89, 0xDF, 0xE7, 0x9F, /* 0x4C-0x4F */
+	0xE7, 0xA0, 0xE7, 0xA1, 0xE7, 0xA2, 0x93, 0xB9, /* 0x50-0x53 */
+	0x92, 0x42, 0x88, 0xE1, 0xE7, 0xA6, 0x00, 0x00, /* 0x54-0x57 */
+	0xE7, 0xA7, 0xEA, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x91, 0xBB, 0x00, 0x00, 0xE7, 0xA8, 0x00, 0x00, /* 0x5C-0x5F */
+	0x89, 0x93, 0x91, 0x6B, 0x00, 0x00, 0x8C, 0xAD, /* 0x60-0x63 */
+	0x00, 0x00, 0x97, 0x79, 0x00, 0x00, 0xEE, 0x99, /* 0x64-0x67 */
+	0xE7, 0xA9, 0x93, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x91, 0x98, 0x8E, 0xD5, 0xE7, 0xAA, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xAD, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x8F, 0x85, 0xE7, 0xAB, 0x91, 0x4A, /* 0x74-0x77 */
+	0x91, 0x49, 0x00, 0x00, 0x88, 0xE2, 0x00, 0x00, /* 0x78-0x7B */
+	0x97, 0xC9, 0xE7, 0xAF, 0x00, 0x00, 0x94, 0xF0, /* 0x7C-0x7F */
+	
+	0xE7, 0xB1, 0xE7, 0xB0, 0xE7, 0xAE, 0xE2, 0x84, /* 0x80-0x83 */
+	0x8A, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x8E, /* 0x84-0x87 */
+	0x00, 0x00, 0xE7, 0xB3, 0xE7, 0xB2, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB4, /* 0x8C-0x8F */
+	0x00, 0x00, 0x97, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xDF, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x4D, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xE7, 0xB5, 0x00, 0x00, 0x8E, 0xD7, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB6, /* 0xAC-0xAF */
+	0x00, 0x00, 0xE7, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x93, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x88, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x8D, 0x78, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0x59, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBC, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x9A, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x8C, 0x53, 0xE7, 0xB9, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xE7, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x95, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x8A, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x97, 0x58, 0x00, 0x00, 0x8B, 0xBD, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x93, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_91[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xBD, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xBE, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xEE, 0x9C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE7, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x9D, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x93, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE7, 0xC1, 0x00, 0x00, 0xE7, 0xC0, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x93, 0xD1, 0xE7, 0xC2, 0x8F, 0x55, /* 0x48-0x4B */
+	0x8E, 0xDE, 0x94, 0x7A, 0x92, 0x91, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0xF0, 0x00, 0x00, /* 0x50-0x53 */
+	0x90, 0x8C, 0x00, 0x00, 0xE7, 0xC3, 0x00, 0x00, /* 0x54-0x57 */
+	0xE7, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x90, 0x7C, 0xE7, 0xC5, /* 0x60-0x63 */
+	0x00, 0x00, 0xE7, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE7, 0xC7, 0x97, 0x8F, 0x00, 0x00, /* 0x68-0x6B */
+	0x8F, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xC9, 0xE7, 0xC8, /* 0x70-0x73 */
+	0x00, 0x00, 0x8D, 0x79, 0x00, 0x00, 0x8D, 0x93, /* 0x74-0x77 */
+	0x8E, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xCC, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x86, /* 0x84-0x87 */
+	0x00, 0x00, 0xE7, 0xCB, 0x00, 0x00, 0xE7, 0xCA, /* 0x88-0x8B */
+	0x00, 0x00, 0x91, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x8C, 0xED, 0x00, 0x00, 0x90, 0xC1, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xAE, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x8F, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xCD, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x8F, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xD0, 0xE7, 0xCE, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCF, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xE7, 0xD2, 0xE7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x8F, 0xF8, 0x00, 0x00, 0xE7, 0xD3, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE7, 0xD4, 0xE7, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xCE, 0x8D, 0xD1, /* 0xC4-0xC7 */
+	0x8E, 0xDF, 0xE7, 0xD6, 0x00, 0x00, 0xE7, 0xD7, /* 0xC8-0xCB */
+	0x97, 0xA2, 0x8F, 0x64, 0x96, 0xEC, 0x97, 0xCA, /* 0xCC-0xCF */
+	0xE7, 0xD8, 0x8B, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xD9, 0xEE, 0x9F, /* 0xD4-0xD7 */
+	0x93, 0x42, 0x00, 0x00, 0xEE, 0x9E, 0xE7, 0xDC, /* 0xD8-0xDB */
+	0x8A, 0x98, 0x90, 0x6A, 0xEE, 0xA0, 0xE7, 0xDA, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE7, 0xDB, 0x00, 0x00, 0x92, 0xDE, /* 0xE0-0xE3 */
+	0xEE, 0xA3, 0xEE, 0xA4, 0x96, 0x74, 0x8B, 0xFA, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xEE, 0xA1, 0xEE, 0xA2, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xE7, 0xDE, 0xE7, 0xDF, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xE7, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_92[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xA5, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xA7, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x93, 0xDD, 0x8A, 0x62, 0x00, 0x00, /* 0x0C-0x0F */
+	0xEE, 0xA6, 0xE7, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xE7, 0xE2, 0xE7, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xE0, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xE8, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xE7, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x97, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xD8, /* 0x34-0x37 */
+	0x00, 0x00, 0xEE, 0xAE, 0xEE, 0xA8, 0x00, 0x00, /* 0x38-0x3B */
+	0xEE, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xED, /* 0x3C-0x3F */
+	0xEE, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x93, 0x53, 0xE7, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xE7, 0xEB, 0xE7, 0xE9, 0x00, 0x00, 0xE7, 0xEE, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xAB, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE7, 0xEF, 0xEE, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE7, /* 0x54-0x57 */
+	0x00, 0x00, 0xEE, 0xAC, 0xE7, 0xF4, 0x89, 0x94, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xE6, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xAB, 0x00, 0x00, /* 0x60-0x63 */
+	0xE7, 0xEA, 0x00, 0x00, 0x8F, 0xDE, 0xEE, 0xAF, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x8D, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB1, /* 0x74-0x77 */
+	0xEE, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x67, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x8B, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x65, /* 0x80-0x83 */
+	0x00, 0x00, 0x93, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xED, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x91, 0x4C, 0x00, 0x00, 0xE7, 0xF2, /* 0x90-0x93 */
+	0x00, 0x00, 0xE7, 0xEC, 0xE7, 0xF1, 0x00, 0x00, /* 0x94-0x97 */
+	0x96, 0xC1, 0x00, 0x00, 0x92, 0xB6, 0xE7, 0xF3, /* 0x98-0x9B */
+	0xE7, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB0, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x91, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF7, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE7, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF5, /* 0xCC-0xCF */
+	0xEE, 0xB6, 0x00, 0x00, 0x96, 0x4E, 0xEE, 0xBA, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xEE, 0xB8, 0x00, 0x00, 0xEE, 0xB4, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xEE, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xEE, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x8F, 0x9B, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB3, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xE7, 0xF8, 0x95, 0xDD, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x89, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x95, 0x65, 0x92, 0x92, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x8B, 0x98, 0xED, 0x49, 0xE7, 0xFA, 0xEE, 0xBD, /* 0xF8-0xFB */
+	0x8D, 0x7C, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_93[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xC2, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0x4B, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF9, /* 0x0C-0x0F */
+	0x90, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x90, 0x8E, 0xE8, 0x40, 0xE8, 0x42, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xEE, 0xC1, 0xEE, 0xBF, 0x00, 0x00, /* 0x1C-0x1F */
+	0x8F, 0xF9, 0xEE, 0xBC, 0xE8, 0x41, 0xE8, 0x43, /* 0x20-0x23 */
+	0x00, 0x00, 0xEE, 0xBB, 0x8B, 0xD1, 0x00, 0x00, /* 0x24-0x27 */
+	0x95, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xE0, /* 0x28-0x2B */
+	0x98, 0x42, 0x00, 0x00, 0xE7, 0xFC, 0x8D, 0xF6, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0x5E, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xE8, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0x44, 0xE8, 0x46, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xE7, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xED, 0x42, 0x00, 0x00, 0x00, 0x00, 0x93, 0xE7, /* 0x48-0x4B */
+	0x00, 0x00, 0x93, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x92, 0xD5, 0x00, 0x00, 0xE8, 0x4B, 0xEE, 0xC4, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x62, /* 0x58-0x5B */
+	0xE8, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xE8, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x8C, 0x4C, 0x00, 0x00, 0xE8, 0x4A, 0x00, 0x00, /* 0x6C-0x6F */
+	0xEE, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x8C, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xE8, 0x49, 0x00, 0x00, 0x8F, 0xDF, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x8A, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xE8, 0x4F, 0x00, 0x00, 0x8D, 0xBD, 0x91, 0x99, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x92, 0xC8, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xEE, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x5A, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE8, 0x4D, 0xE8, 0x4E, 0x92, 0xC1, 0x00, 0x00, /* 0xAC-0xAF */
+	0xE8, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE8, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x56, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xC6, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE8, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xE8, 0x58, 0x93, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0x51, 0xE8, 0x52, /* 0xD4-0xD7 */
+	0xE8, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xE8, 0x57, 0xEE, 0xC7, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x8B, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xE8, 0x5A, 0xE8, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xE8, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xEE, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_94[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5E, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5F, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE8, 0x60, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5D, /* 0x10-0x13 */
+	0xE8, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x8F, 0xE0, 0x93, 0xA8, 0xE8, 0x5B, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xE8, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x62, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xEE, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xE8, 0x63, 0xE8, 0x61, 0x00, 0x00, /* 0x34-0x37 */
+	0x91, 0xF6, 0x00, 0x00, 0xE8, 0x65, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xE8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xE8, 0x68, 0xEE, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xEE, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x8A, 0xD3, 0xE8, 0x67, 0x96, 0xF8, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0x73, 0xE8, 0x69, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0x6C, 0x00, 0x00, /* 0x5C-0x5F */
+	0xE8, 0x6A, 0x00, 0x00, 0xE8, 0x6B, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0x6D, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE8, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xE8, 0x70, 0x00, 0x00, 0xE8, 0x71, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xE8, 0x74, 0xE8, 0x72, 0xE8, 0x75, 0xE8, 0x77, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE8, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+};
+
+static const unsigned char u2c_95[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xB7, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x96, 0xE5, 0x00, 0x00, 0xE8, 0x78, 0x91, 0x4D, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x79, /* 0x84-0x87 */
+	0x00, 0x00, 0x95, 0xC2, 0xE8, 0x7A, 0x8A, 0x4A, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x5B, /* 0x8C-0x8F */
+	0x00, 0x00, 0x8A, 0xD5, 0xEE, 0xCC, 0x8A, 0xD4, /* 0x90-0x93 */
+	0xE8, 0x7B, 0x00, 0x00, 0xE8, 0x7C, 0x00, 0x00, /* 0x94-0x97 */
+	0xE8, 0x7D, 0xE8, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xE8, 0x80, 0x00, 0x00, 0x8A, 0xD6, 0x8A, 0x74, /* 0xA0-0xA3 */
+	0x8D, 0x7D, 0x94, 0xB4, 0x00, 0x00, 0xE8, 0x82, /* 0xA4-0xA7 */
+	0xE8, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xE8, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x7B, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE8, 0x86, 0x00, 0x00, 0xE8, 0x85, /* 0xB8-0xBB */
+	0xE8, 0x84, 0x00, 0x00, 0xE8, 0x87, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x8A, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xC5, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0x88, 0x00, 0x00, /* 0xC8-0xCB */
+	0xE8, 0x8C, 0xE8, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE8, 0x8E, 0xE8, 0x8D, 0xE8, 0x8F, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x93, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xE8, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE8, 0x91, 0xE8, 0x93, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+};
+
+static const unsigned char u2c_96[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x95, 0x8C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xE8, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xE8, 0x95, 0x00, 0x00, 0x8D, 0xE3, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0x96, 0xE8, 0x97, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x68, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x6A, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xA2, /* 0x3C-0x3F */
+	0x91, 0xC9, 0x00, 0x00, 0xE8, 0x98, 0x00, 0x00, /* 0x40-0x43 */
+	0x95, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x9B, /* 0x48-0x4B */
+	0xE8, 0x99, 0x8D, 0x7E, 0x00, 0x00, 0xE8, 0x9A, /* 0x4C-0x4F */
+	0x8C, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC3, /* 0x58-0x5B */
+	0xE8, 0x9D, 0xE8, 0x9F, 0xE8, 0x9E, 0xE8, 0xA0, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x40, 0x90, 0x77, /* 0x60-0x63 */
+	0x8F, 0x9C, 0x8A, 0xD7, 0xE8, 0xA1, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0x86, 0x00, 0x00, /* 0x68-0x6B */
+	0xE8, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x89, 0x41, 0x00, 0x00, 0xE8, 0xA2, 0x92, 0xC2, /* 0x70-0x73 */
+	0x00, 0x00, 0x97, 0xCB, 0x93, 0xA9, 0xE8, 0x9C, /* 0x74-0x77 */
+	0x97, 0xA4, 0x00, 0x00, 0x8C, 0xAF, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x97, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x8B, 0xF7, 0x97, 0xB2, 0x00, 0x00, /* 0x84-0x87 */
+	0x8C, 0x47, 0x00, 0x00, 0x91, 0xE0, 0xE4, 0x40, /* 0x88-0x8B */
+	0x00, 0x00, 0xE8, 0xA4, 0x8A, 0x4B, 0x90, 0x8F, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x8A, 0x75, 0xE8, 0xA6, 0x00, 0x00, 0xE8, 0xA7, /* 0x94-0x97 */
+	0xE8, 0xA5, 0x8C, 0x84, 0x00, 0x00, 0x8D, 0xDB, /* 0x98-0x9B */
+	0x8F, 0xE1, 0xEE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x89, 0x42, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD7, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA9, /* 0xA4-0xA7 */
+	0xE7, 0xAC, 0x00, 0x00, 0xE8, 0xA8, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xD0, /* 0xAC-0xAF */
+	0xE8, 0xAC, 0xE8, 0xAA, 0xE8, 0xAB, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xE8, 0xAD, 0x00, 0x00, 0xE8, 0xAE, 0x97, 0xEA, /* 0xB4-0xB7 */
+	0xE8, 0xAF, 0xE8, 0xB0, 0x00, 0x00, 0x90, 0xC7, /* 0xB8-0xBB */
+	0x94, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x90, 0x9D, 0x8A, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x97, 0x59, 0x89, 0xEB, 0x8F, 0x57, 0x8C, 0xD9, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE8, 0xB3, 0x00, 0x00, 0xE8, 0xB2, /* 0xC8-0xCB */
+	0x8E, 0x93, 0xE8, 0xB4, 0xE8, 0xB1, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x8E, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE8, 0xB8, 0xE5, 0xAB, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x99, 0xD4, 0x00, 0x00, 0x90, 0x97, /* 0xD8-0xDB */
+	0xE8, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xA3, 0x93, 0xEF, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x89, 0x4A, 0x00, 0x00, 0x90, 0xE1, 0x8E, 0xB4, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x95, 0xB5, 0x00, 0x00, 0x89, 0x5F, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xEB, 0x97, 0x8B, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE8, 0xB9, 0x00, 0x00, 0x93, 0x64, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_97[512] = {
+	0x8E, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xE8, 0xBA, 0x00, 0x00, 0xE8, 0xBB, 0x90, 0x6B, /* 0x04-0x07 */
+	0xE8, 0xBC, 0x00, 0x00, 0x97, 0xEC, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE8, 0xB7, 0xE8, 0xBE, 0xE8, 0xC0, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE8, 0xBF, 0x00, 0x00, 0xE8, 0xBD, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xC1, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE8, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x91, 0x9A, 0x00, 0x00, 0x89, 0xE0, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xE8, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB6, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xC4, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE8, 0xC5, 0x00, 0x00, 0x98, 0x49, 0xEE, 0xD1, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x9E, 0x50, 0xE8, 0xC6, 0x00, 0x00, 0xEE, 0xD2, /* 0x38-0x3B */
+	0x00, 0x00, 0xE8, 0xC7, 0xE8, 0xC8, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xCC, 0xEE, 0xD3, /* 0x40-0x43 */
+	0xE8, 0xC9, 0x00, 0x00, 0xE8, 0xCA, 0x00, 0x00, /* 0x44-0x47 */
+	0xE8, 0xCB, 0xE8, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xEE, 0xD4, 0x00, 0x00, 0xEE, 0xD5, /* 0x4C-0x4F */
+	0x00, 0x00, 0xEE, 0xD6, 0x90, 0xC2, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xEE, 0xD7, 0x96, 0xF5, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x90, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xE8, 0xCE, 0x00, 0x00, 0x94, 0xF1, 0x00, 0x00, /* 0x5C-0x5F */
+	0xE8, 0xCF, 0xEA, 0x72, 0x96, 0xCA, 0x00, 0x00, /* 0x60-0x63 */
+	0xE8, 0xD0, 0x00, 0x00, 0xE8, 0xD1, 0x00, 0x00, /* 0x64-0x67 */
+	0xE8, 0xD2, 0x8A, 0x76, 0x00, 0x00, 0xE8, 0xD4, /* 0x68-0x6B */
+	0x00, 0x00, 0x90, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xE8, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x8C, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xE8, 0xD6, 0xE8, 0xDA, 0x00, 0x00, /* 0x78-0x7B */
+	0xE8, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE8, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x8A, 0x93, 0xE8, 0xD7, 0xE8, 0xDB, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xDC, /* 0x88-0x8B */
+	0x00, 0x00, 0x88, 0xC6, 0x00, 0x00, 0xE8, 0xDD, /* 0x8C-0x8F */
+	0xE8, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x8F, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xE8, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x8B, 0x66, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE2, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xE1, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xE8, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x91, /* 0xA8-0xAB */
+	0x00, 0x00, 0x95, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE3, /* 0xB0-0xB3 */
+	0xE8, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE5, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xE6, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE8, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE8, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xD8, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xE8, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xE8, 0xEA, 0x94, 0x42, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xEC, 0x89, 0xB9, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xE8, 0xEF, 0xE8, 0xEE, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x43, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xBF, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_98[512] = {
+	0x00, 0x00, 0x95, 0xC5, 0x92, 0xB8, 0x8D, 0xA0, /* 0x00-0x03 */
+	0x00, 0x00, 0x8D, 0x80, 0x8F, 0x87, 0x00, 0x00, /* 0x04-0x07 */
+	0x90, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xE8, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF0, /* 0x0C-0x0F */
+	0x97, 0x61, 0x8A, 0xE6, 0x94, 0xD0, 0x93, 0xDA, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x9C, /* 0x14-0x17 */
+	0x97, 0xCC, 0x00, 0x00, 0x8C, 0x7A, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xE8, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xE8, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x96, 0x6A, 0x93, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x89, 0x6F, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF5, /* 0x34-0x37 */
+	0xE8, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x95, 0x70, /* 0x38-0x3B */
+	0x97, 0x8A, 0xE8, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xF7, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF9, /* 0x48-0x4B */
+	0x91, 0xE8, 0x8A, 0x7A, 0x8A, 0x7B, 0xE8, 0xF8, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x8A, 0xE7, 0x8C, 0xB0, 0x00, 0x00, 0xEE, 0xD8, /* 0x54-0x57 */
+	0x8A, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x93, 0x5E, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x97, 0xDE, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xEE, 0xD9, 0x00, 0x00, 0x8C, 0xDA, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xFA, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xFB, /* 0x6C-0x6F */
+	0xE8, 0xFC, 0xE9, 0x40, 0x00, 0x00, 0xE9, 0x42, /* 0x70-0x73 */
+	0xE9, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x95, 0x97, 0x00, 0x00, 0xE9, 0x43, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x44, /* 0xAC-0xAF */
+	0x00, 0x00, 0xE9, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x46, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x48, /* 0xC0-0xC3 */
+	0xE9, 0x47, 0x00, 0x00, 0xE9, 0x49, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xF2, /* 0xD8-0xDB */
+	0xE3, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x90, 0x48, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x51, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xE9, 0x4A, 0x00, 0x00, 0xE9, 0x4B, /* 0xE8-0xEB */
+	0x00, 0x00, 0x99, 0xAA, 0x9F, 0x5A, 0x94, 0xD1, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x88, 0xF9, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x88, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x8E, 0x94, 0x96, 0x4F, 0x8F, 0xFC, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_99[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x4C, /* 0x00-0x03 */
+	0x00, 0x00, 0x96, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xE9, 0x4D, 0x97, 0x7B, 0x00, 0x00, /* 0x08-0x0B */
+	0x89, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x8E, 0x60, 0x00, 0x00, 0xE9, 0x4E, 0x89, 0xEC, /* 0x10-0x13 */
+	0xE9, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xE9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xE9, 0x52, 0xE9, 0x53, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE9, 0x55, 0xE9, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xE9, 0x54, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDC, /* 0x24-0x27 */
+	0x8A, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xE9, 0x56, 0x00, 0x00, 0xE9, 0x57, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xE9, 0x58, 0xE9, 0x59, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x5A, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xE9, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xE9, 0x5B, 0x00, 0x00, 0xE9, 0x5E, /* 0x48-0x4B */
+	0xE9, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE9, 0x5D, 0xE9, 0x5F, 0xE9, 0x60, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE9, 0x62, 0x00, 0x00, 0x8B, 0xC0, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x8E, 0xF1, 0xE9, 0x63, /* 0x94-0x97 */
+	0xE9, 0x64, 0x8D, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xDE, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xE9, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x8A, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x94, 0x6E, 0xE9, 0x66, 0xE9, 0x67, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x79, /* 0xB0-0xB3 */
+	0x93, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xE9, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x94, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x91, 0xCA, 0x89, 0x77, 0x8B, 0xEC, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x8B, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x92, 0x93, 0xE9, 0x6D, 0x8B, 0xEE, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x89, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xE9, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x6A, /* 0xD8-0xDB */
+	0x00, 0x00, 0xE9, 0x6B, 0x00, 0x00, 0xE9, 0x69, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x77, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xE9, 0x6E, 0xE9, 0x6F, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE9, 0x70, 0xE9, 0x71, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xE9, 0x73, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x72, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x78, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9A[512] = {
+	0x00, 0x00, 0xE9, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xE9, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0x52, 0xE9, 0x75, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x91, 0x9B, 0x8C, 0xB1, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE9, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x91, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x79, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x93, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x7A, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x80, 0x00, 0x00, /* 0x3C-0x3F */
+	0xE9, 0x7D, 0x00, 0x00, 0xE9, 0x7C, 0xE9, 0x7E, /* 0x40-0x43 */
+	0x00, 0x00, 0xE9, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xE9, 0x82, 0xEE, 0xDF, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE9, 0x81, 0x00, 0x00, 0xE9, 0x84, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x8B, 0xC1, 0xE9, 0x83, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x85, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x86, 0x00, 0x00, /* 0x60-0x63 */
+	0xE9, 0x88, 0xE9, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE9, 0x89, 0xE9, 0x8B, 0xE9, 0x8A, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x8D, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xE9, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xE9, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x8A, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xE9, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE9, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x90, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x90, /* 0xCC-0xCF */
+	0x00, 0x00, 0xE9, 0x91, 0x00, 0x00, 0xE9, 0x92, /* 0xD0-0xD3 */
+	0xE9, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x8D, 0x82, 0xEE, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xEE, 0xE1, 0x00, 0x00, 0xE9, 0x94, 0xE9, 0x95, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x96, 0xE9, 0x97, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x98, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x94, 0xAF, 0xE9, 0x9A, /* 0xE8-0xEB */
+	0x00, 0x00, 0x95, 0x45, 0xE9, 0x9B, 0xE9, 0x99, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE9, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE9, 0x9C, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x9E, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x9F, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_9B[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xA0, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xE9, 0xA1, 0x00, 0x00, 0xE9, 0xA2, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA3, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xA4, 0xE9, 0xA5, /* 0x20-0x23 */
+	0x00, 0x00, 0xE9, 0xA6, 0x00, 0x00, 0xE9, 0xA7, /* 0x24-0x27 */
+	0xE9, 0xA8, 0xE9, 0xA9, 0xE9, 0xAA, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xAB, 0xE9, 0xAC, /* 0x2C-0x2F */
+	0x00, 0x00, 0x9F, 0x54, 0xE9, 0xAD, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF6, /* 0x38-0x3B */
+	0x8B, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x8A, 0x40, 0x8D, 0xB0, 0xE9, 0xAF, /* 0x40-0x43 */
+	0xE9, 0xAE, 0x96, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xE9, 0xB1, 0xE9, 0xB2, 0xE9, 0xB0, /* 0x4C-0x4F */
+	0x00, 0x00, 0xE9, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x96, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xE9, 0xB4, 0x00, 0x00, 0x8B, 0x9B, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x44, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xE3, 0x00, 0x00, /* 0x70-0x73 */
+	0xE9, 0xB5, 0xEE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB7, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x88, 0xBC, 0xEE, 0xE4, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE9, 0xB8, 0x95, 0xA9, 0xE9, 0xB6, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xB9, 0xE9, 0xBA, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xBB, /* 0x9C-0x9F */
+	0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xE9, 0xBD, 0x00, 0x00, 0x96, 0x8E, 0x8E, 0x4C, /* 0xA8-0xAB */
+	0x00, 0x00, 0x8D, 0xF8, 0x91, 0x4E, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xEE, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xE9, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE9, 0xC1, 0x00, 0x00, 0xEE, 0xE6, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE9, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xC2, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x8C, 0xEF, 0xE9, 0xC0, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC3, /* 0xCC-0xCF */
+	0x00, 0x00, 0xE9, 0xC4, 0xE9, 0xC5, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE9, 0xC9, 0x00, 0x00, 0x8E, 0x49, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xE2, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE9, 0xCA, 0xE9, 0xC7, 0xE9, 0xC6, /* 0xE0-0xE3 */
+	0xE9, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x8C, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xE9, 0xCE, 0xE9, 0xCD, 0xE9, 0xCC, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x88, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_9C[512] = {
+	0xEE, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xE9, 0xD8, 0x00, 0x00, 0xE9, 0xD4, 0x00, 0x00, /* 0x04-0x07 */
+	0xE9, 0xD5, 0xE9, 0xD1, 0xE9, 0xD7, 0x00, 0x00, /* 0x08-0x0B */
+	0xE9, 0xD3, 0x8A, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x98, 0x6B, 0x00, 0x00, 0xE9, 0xD6, 0xE9, 0xD2, /* 0x10-0x13 */
+	0xE9, 0xD0, 0xE9, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xDA, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xE9, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xE9, 0xDC, 0xE9, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x95, 0x68, 0xE9, 0xD9, 0x88, 0xF1, /* 0x2C-0x2F */
+	0xE9, 0xDE, 0x00, 0x00, 0xE9, 0xE0, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x8A, 0x8F, 0xE9, 0xCB, 0x89, 0x56, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xE2, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xE1, 0xE9, 0xDF, /* 0x44-0x47 */
+	0x92, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x96, 0x90, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD8, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xE3, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xE9, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE5, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xE6, 0x00, 0x00, /* 0x74-0x77 */
+	0xE9, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x92, 0xB9, 0x00, 0x00, 0xE9, 0xE8, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x94, 0xB5, 0x00, 0x00, 0xE9, 0xED, /* 0xE8-0xEB */
+	0xE9, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xE9, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x96, 0x50, /* 0xF0-0xF3 */
+	0x96, 0xC2, 0x00, 0x00, 0x93, 0xCE, 0x00, 0x00, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_9D[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xEE, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xEF, 0x93, 0xBC, /* 0x04-0x07 */
+	0xE9, 0xEC, 0xE9, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0xA8, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xF7, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xE9, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x95, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF4, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xF1, 0x00, 0x00, /* 0x24-0x27 */
+	0x8A, 0x9B, 0x00, 0x00, 0xE9, 0xF0, 0x8E, 0xB0, /* 0x28-0x2B */
+	0x89, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x83, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xFA, 0xE9, 0xF9, /* 0x3C-0x3F */
+	0x00, 0x00, 0xE9, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xE9, 0xF5, 0x00, 0x00, 0xE9, 0xFB, 0x00, 0x00, /* 0x44-0x47 */
+	0xE9, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xEA, 0x44, 0xEA, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xEA, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x89, 0x4C, 0xEA, 0x40, 0xEA, 0x41, 0x00, 0x00, /* 0x5C-0x5F */
+	0x8D, 0x94, 0x96, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xEA, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE9, /* 0x68-0x6B */
+	0x96, 0x51, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4A, /* 0x6C-0x6F */
+	0xEE, 0xE8, 0x00, 0x00, 0xEA, 0x46, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x4B, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x48, /* 0x84-0x87 */
+	0x00, 0x00, 0xEA, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x7B, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x4C, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xEA, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xEA, 0x4E, 0x00, 0x00, 0xEA, 0x49, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF2, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x4F, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x92, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xEA, 0x53, 0x00, 0x00, 0xEA, 0x54, 0xEA, 0x52, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xEA, 0x51, 0xEA, 0x57, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xEA, 0x50, 0x00, 0x00, 0xEA, 0x55, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x56, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x59, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xEA, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x5B, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xEA, 0x5C, 0x00, 0x00, 0xEA, 0x5D, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0x68, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xEA, 0x5A, 0x91, 0xE9, 0x8D, 0xEB, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xEA, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xEE, 0xEB, 0xEA, 0x5F, 0xEA, 0x60, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x61, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xEA, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x8C, 0xB2, 0xEA, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xEA, 0x64, 0x00, 0x00, 0x8E, 0xAD, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xEA, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xEA, 0x66, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x67, /* 0x88-0x8B */
+	0xEA, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xEA, 0x6B, 0xEA, 0x69, 0x98, 0x5B, /* 0x90-0x93 */
+	0x00, 0x00, 0xEA, 0x6A, 0x00, 0x00, 0x97, 0xED, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xEA, 0x6C, 0x00, 0x00, 0x97, 0xD9, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xEA, 0x6D, 0x94, 0x9E, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xEA, 0x6E, 0xEA, 0x70, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xEA, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xEA, 0x6F, 0x8D, 0x8D, 0x96, 0xCB, 0x96, 0x83, /* 0xB8-0xBB */
+	0x9B, 0xF5, 0x00, 0x00, 0x9F, 0x80, 0x96, 0x9B, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x89, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xEA, 0x73, 0x8B, 0x6F, 0xEA, 0x74, 0xEA, 0x75, /* 0xCC-0xCF */
+	0xEA, 0x76, 0xEE, 0xEC, 0x8D, 0x95, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xEA, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xE0, 0xD2, 0x96, 0xD9, 0x00, 0x00, 0x91, 0xE1, /* 0xD8-0xDB */
+	0xEA, 0x78, 0xEA, 0x7A, 0xEA, 0x79, 0x00, 0x00, /* 0xDC-0xDF */
+	0xEA, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xEA, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xEA, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x7E, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xEA, 0x80, 0x00, 0x00, 0xEA, 0x81, 0xEA, 0x82, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xEA, 0x83, 0x00, 0x00, 0xEA, 0x84, /* 0xF8-0xFB */
+	0xEA, 0x85, 0xEA, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9F[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x87, /* 0x04-0x07 */
+	0xEA, 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x93, 0x43, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xDB, /* 0x10-0x13 */
+	0x00, 0x00, 0xEA, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x91, 0x6C, 0xEA, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xEA, 0x8C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x40, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x8D, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x8E, 0xE2, 0x56, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xD8, 0xE8, 0xEB, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x8F, 0x00, 0x00, /* 0x50-0x53 */
+	0xEA, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x92, /* 0x5C-0x5F */
+	0xEA, 0x93, 0xEA, 0x94, 0x97, 0xEE, 0xEA, 0x91, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x95, 0xEA, 0x96, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x98, 0x00, 0x00, /* 0x68-0x6B */
+	0xEA, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x9A, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0x9B, 0xEA, 0x99, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x97, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xEA, 0x9C, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xEA, 0x9D, 0xE2, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xEA, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+};
+
+static const unsigned char u2c_DC[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+};
+
+static const unsigned char u2c_F9[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xED, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xEE, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+};
+
+static const unsigned char u2c_FA[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x73, 0xED, 0x7E, /* 0x0C-0x0F */
+	0xED, 0x80, 0xED, 0x95, 0xED, 0xBC, 0xED, 0xCC, /* 0x10-0x13 */
+	0xED, 0xCE, 0xED, 0xF9, 0xEE, 0x42, 0xEE, 0x59, /* 0x14-0x17 */
+	0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x65, /* 0x18-0x1B */
+	0xEE, 0x69, 0xEE, 0x6C, 0xEE, 0x75, 0xEE, 0x81, /* 0x1C-0x1F */
+	0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x8D, 0xEE, 0x95, /* 0x20-0x23 */
+	0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x9B, 0xEE, 0xB7, /* 0x24-0x27 */
+	0xEE, 0xBE, 0xEE, 0xCE, 0xEE, 0xDA, 0xEE, 0xDB, /* 0x28-0x2B */
+	0xEE, 0xDD, 0xEE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+};
+
+static const unsigned char u2c_FF[512] = {
+	0x00, 0x00, 0x81, 0x49, 0xEE, 0xFC, 0x81, 0x94, /* 0x00-0x03 */
+	0x81, 0x90, 0x81, 0x93, 0x81, 0x95, 0xEE, 0xFB, /* 0x04-0x07 */
+	0x81, 0x69, 0x81, 0x6A, 0x81, 0x96, 0x81, 0x7B, /* 0x08-0x0B */
+	0x81, 0x43, 0x81, 0x7C, 0x81, 0x44, 0x81, 0x5E, /* 0x0C-0x0F */
+	0x82, 0x4F, 0x82, 0x50, 0x82, 0x51, 0x82, 0x52, /* 0x10-0x13 */
+	0x82, 0x53, 0x82, 0x54, 0x82, 0x55, 0x82, 0x56, /* 0x14-0x17 */
+	0x82, 0x57, 0x82, 0x58, 0x81, 0x46, 0x81, 0x47, /* 0x18-0x1B */
+	0x81, 0x83, 0x81, 0x81, 0x81, 0x84, 0x81, 0x48, /* 0x1C-0x1F */
+	0x81, 0x97, 0x82, 0x60, 0x82, 0x61, 0x82, 0x62, /* 0x20-0x23 */
+	0x82, 0x63, 0x82, 0x64, 0x82, 0x65, 0x82, 0x66, /* 0x24-0x27 */
+	0x82, 0x67, 0x82, 0x68, 0x82, 0x69, 0x82, 0x6A, /* 0x28-0x2B */
+	0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0x2C-0x2F */
+	0x82, 0x6F, 0x82, 0x70, 0x82, 0x71, 0x82, 0x72, /* 0x30-0x33 */
+	0x82, 0x73, 0x82, 0x74, 0x82, 0x75, 0x82, 0x76, /* 0x34-0x37 */
+	0x82, 0x77, 0x82, 0x78, 0x82, 0x79, 0x81, 0x6D, /* 0x38-0x3B */
+	0x81, 0x5F, 0x81, 0x6E, 0x81, 0x4F, 0x81, 0x51, /* 0x3C-0x3F */
+	0x81, 0x4D, 0x82, 0x81, 0x82, 0x82, 0x82, 0x83, /* 0x40-0x43 */
+	0x82, 0x84, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x44-0x47 */
+	0x82, 0x88, 0x82, 0x89, 0x82, 0x8A, 0x82, 0x8B, /* 0x48-0x4B */
+	0x82, 0x8C, 0x82, 0x8D, 0x82, 0x8E, 0x82, 0x8F, /* 0x4C-0x4F */
+	0x82, 0x90, 0x82, 0x91, 0x82, 0x92, 0x82, 0x93, /* 0x50-0x53 */
+	0x82, 0x94, 0x82, 0x95, 0x82, 0x96, 0x82, 0x97, /* 0x54-0x57 */
+	0x82, 0x98, 0x82, 0x99, 0x82, 0x9A, 0x81, 0x6F, /* 0x58-0x5B */
+	0x81, 0x62, 0x81, 0x70, 0x81, 0x60, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0xA1, 0x00, 0xA2, 0x00, 0xA3, /* 0x60-0x63 */
+	0x00, 0xA4, 0x00, 0xA5, 0x00, 0xA6, 0x00, 0xA7, /* 0x64-0x67 */
+	0x00, 0xA8, 0x00, 0xA9, 0x00, 0xAA, 0x00, 0xAB, /* 0x68-0x6B */
+	0x00, 0xAC, 0x00, 0xAD, 0x00, 0xAE, 0x00, 0xAF, /* 0x6C-0x6F */
+	0x00, 0xB0, 0x00, 0xB1, 0x00, 0xB2, 0x00, 0xB3, /* 0x70-0x73 */
+	0x00, 0xB4, 0x00, 0xB5, 0x00, 0xB6, 0x00, 0xB7, /* 0x74-0x77 */
+	0x00, 0xB8, 0x00, 0xB9, 0x00, 0xBA, 0x00, 0xBB, /* 0x78-0x7B */
+	0x00, 0xBC, 0x00, 0xBD, 0x00, 0xBE, 0x00, 0xBF, /* 0x7C-0x7F */
+	
+	0x00, 0xC0, 0x00, 0xC1, 0x00, 0xC2, 0x00, 0xC3, /* 0x80-0x83 */
+	0x00, 0xC4, 0x00, 0xC5, 0x00, 0xC6, 0x00, 0xC7, /* 0x84-0x87 */
+	0x00, 0xC8, 0x00, 0xC9, 0x00, 0xCA, 0x00, 0xCB, /* 0x88-0x8B */
+	0x00, 0xCC, 0x00, 0xCD, 0x00, 0xCE, 0x00, 0xCF, /* 0x8C-0x8F */
+	0x00, 0xD0, 0x00, 0xD1, 0x00, 0xD2, 0x00, 0xD3, /* 0x90-0x93 */
+	0x00, 0xD4, 0x00, 0xD5, 0x00, 0xD6, 0x00, 0xD7, /* 0x94-0x97 */
+	0x00, 0xD8, 0x00, 0xD9, 0x00, 0xDA, 0x00, 0xDB, /* 0x98-0x9B */
+	0x00, 0xDC, 0x00, 0xDD, 0x00, 0xDE, 0x00, 0xDF, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x81, 0x91, 0x81, 0x92, 0x81, 0xCA, 0x81, 0x50, /* 0xE0-0xE3 */
+	0xEE, 0xFA, 0x81, 0x8F, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	NULL,   NULL,   NULL,   u2c_03, u2c_04, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	u2c_30, NULL,   u2c_32, u2c_33, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   u2c_4E, u2c_4F, 
+	u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, 
+	u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, 
+	u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, 
+	u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, 
+	u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, 
+	u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, 
+	u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, 
+	u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, 
+	u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, 
+	u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, 
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   u2c_DC, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   u2c_F9, u2c_FA, NULL,   NULL,   NULL,   NULL,   u2c_FF, };
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(const wchar_t uni,
+		    unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni&0xFF;
+	unsigned char ch = (uni>>8)&0xFF;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (ch == 0xFF && 0x61 <= cl && cl <= 0x9F) {
+		out[0] = cl + 0x40;
+		return 1;
+	}
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset) {
+		if (boundlen < 2)
+			return -ENAMETOOLONG;
+
+		out[0] = uni2charset[cl*2];
+		out[1] = uni2charset[cl*2+1];
+		if (out[0] == 0x00 && out[1] == 0x00)
+			return -EINVAL;
+		return 2;
+	} else if (ch == 0) {
+		if (cl <= 0x7F) {
+			out[0] = cl;
+			return 1;
+		} else if (0xA0 <= cl) {
+			out[0] = u2c_00hi[cl - 0xA0][0];
+			out[1] = u2c_00hi[cl - 0xA0][1];
+			if (out[0] && out[1])
+				return 2;
+		}
+		return -EINVAL;
+	}
+	else
+		return -EINVAL;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen,
+		    wchar_t *uni)
+{
+	unsigned char ch, cl;
+	const wchar_t *charset2uni;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (rawstring[0] <= 0x7F) {
+		*uni = rawstring[0];
+		return 1;
+	}
+	if (0xA1 <= rawstring[0] && rawstring[0] <= 0xDF) {
+		*uni = 0xFF00 | (rawstring[0] - 0x40);
+		return 1;
+	}
+
+	if (boundlen < 2)
+		return -ENAMETOOLONG;
+	ch = rawstring[0];
+	cl = rawstring[1];
+	charset2uni = page_charset2uni[ch];
+	if (charset2uni && cl) {
+		*uni = charset2uni[cl];
+		if (*uni == 0x0000)
+			return -EINVAL;
+		return 2;
+	}
+	else
+		return -EINVAL;
+}
+
+static struct nls_table table = {
+	.charset	= "cp932",
+	.alias		= "sjis",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp932(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp932(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp932)
+module_exit(exit_nls_cp932)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NLS(sjis);
diff --git a/kernel/fs/nls/nls_cp936.c b/kernel/fs/nls/nls_cp936.c
new file mode 100644
index 000000000..c96546cfe
--- /dev/null
+++ b/kernel/fs/nls/nls_cp936.c
@@ -0,0 +1,11111 @@
+/*
+ * linux/fs/nls/nls_cp936.c
+ *
+ * Charset cp936 translation tables.
+ * This translation table was generated automatically, the
+ * original table can be download from the Microsoft website.
+ * (http://www.microsoft.com/typography/unicode/unicodecp.htm)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t c2u_81[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x4E02,0x4E04,0x4E05,0x4E06,0x4E0F,0x4E12,0x4E17,0x4E1F,/* 0x40-0x47 */
+	0x4E20,0x4E21,0x4E23,0x4E26,0x4E29,0x4E2E,0x4E2F,0x4E31,/* 0x48-0x4F */
+	0x4E33,0x4E35,0x4E37,0x4E3C,0x4E40,0x4E41,0x4E42,0x4E44,/* 0x50-0x57 */
+	0x4E46,0x4E4A,0x4E51,0x4E55,0x4E57,0x4E5A,0x4E5B,0x4E62,/* 0x58-0x5F */
+	0x4E63,0x4E64,0x4E65,0x4E67,0x4E68,0x4E6A,0x4E6B,0x4E6C,/* 0x60-0x67 */
+	0x4E6D,0x4E6E,0x4E6F,0x4E72,0x4E74,0x4E75,0x4E76,0x4E77,/* 0x68-0x6F */
+	0x4E78,0x4E79,0x4E7A,0x4E7B,0x4E7C,0x4E7D,0x4E7F,0x4E80,/* 0x70-0x77 */
+	0x4E81,0x4E82,0x4E83,0x4E84,0x4E85,0x4E87,0x4E8A,0x0000,/* 0x78-0x7F */
+
+	0x4E90,0x4E96,0x4E97,0x4E99,0x4E9C,0x4E9D,0x4E9E,0x4EA3,/* 0x80-0x87 */
+	0x4EAA,0x4EAF,0x4EB0,0x4EB1,0x4EB4,0x4EB6,0x4EB7,0x4EB8,/* 0x88-0x8F */
+	0x4EB9,0x4EBC,0x4EBD,0x4EBE,0x4EC8,0x4ECC,0x4ECF,0x4ED0,/* 0x90-0x97 */
+	0x4ED2,0x4EDA,0x4EDB,0x4EDC,0x4EE0,0x4EE2,0x4EE6,0x4EE7,/* 0x98-0x9F */
+	0x4EE9,0x4EED,0x4EEE,0x4EEF,0x4EF1,0x4EF4,0x4EF8,0x4EF9,/* 0xA0-0xA7 */
+	0x4EFA,0x4EFC,0x4EFE,0x4F00,0x4F02,0x4F03,0x4F04,0x4F05,/* 0xA8-0xAF */
+	0x4F06,0x4F07,0x4F08,0x4F0B,0x4F0C,0x4F12,0x4F13,0x4F14,/* 0xB0-0xB7 */
+	0x4F15,0x4F16,0x4F1C,0x4F1D,0x4F21,0x4F23,0x4F28,0x4F29,/* 0xB8-0xBF */
+	0x4F2C,0x4F2D,0x4F2E,0x4F31,0x4F33,0x4F35,0x4F37,0x4F39,/* 0xC0-0xC7 */
+	0x4F3B,0x4F3E,0x4F3F,0x4F40,0x4F41,0x4F42,0x4F44,0x4F45,/* 0xC8-0xCF */
+	0x4F47,0x4F48,0x4F49,0x4F4A,0x4F4B,0x4F4C,0x4F52,0x4F54,/* 0xD0-0xD7 */
+	0x4F56,0x4F61,0x4F62,0x4F66,0x4F68,0x4F6A,0x4F6B,0x4F6D,/* 0xD8-0xDF */
+	0x4F6E,0x4F71,0x4F72,0x4F75,0x4F77,0x4F78,0x4F79,0x4F7A,/* 0xE0-0xE7 */
+	0x4F7D,0x4F80,0x4F81,0x4F82,0x4F85,0x4F86,0x4F87,0x4F8A,/* 0xE8-0xEF */
+	0x4F8C,0x4F8E,0x4F90,0x4F92,0x4F93,0x4F95,0x4F96,0x4F98,/* 0xF0-0xF7 */
+	0x4F99,0x4F9A,0x4F9C,0x4F9E,0x4F9F,0x4FA1,0x4FA2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_82[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x4FA4,0x4FAB,0x4FAD,0x4FB0,0x4FB1,0x4FB2,0x4FB3,0x4FB4,/* 0x40-0x47 */
+	0x4FB6,0x4FB7,0x4FB8,0x4FB9,0x4FBA,0x4FBB,0x4FBC,0x4FBD,/* 0x48-0x4F */
+	0x4FBE,0x4FC0,0x4FC1,0x4FC2,0x4FC6,0x4FC7,0x4FC8,0x4FC9,/* 0x50-0x57 */
+	0x4FCB,0x4FCC,0x4FCD,0x4FD2,0x4FD3,0x4FD4,0x4FD5,0x4FD6,/* 0x58-0x5F */
+	0x4FD9,0x4FDB,0x4FE0,0x4FE2,0x4FE4,0x4FE5,0x4FE7,0x4FEB,/* 0x60-0x67 */
+	0x4FEC,0x4FF0,0x4FF2,0x4FF4,0x4FF5,0x4FF6,0x4FF7,0x4FF9,/* 0x68-0x6F */
+	0x4FFB,0x4FFC,0x4FFD,0x4FFF,0x5000,0x5001,0x5002,0x5003,/* 0x70-0x77 */
+	0x5004,0x5005,0x5006,0x5007,0x5008,0x5009,0x500A,0x0000,/* 0x78-0x7F */
+
+	0x500B,0x500E,0x5010,0x5011,0x5013,0x5015,0x5016,0x5017,/* 0x80-0x87 */
+	0x501B,0x501D,0x501E,0x5020,0x5022,0x5023,0x5024,0x5027,/* 0x88-0x8F */
+	0x502B,0x502F,0x5030,0x5031,0x5032,0x5033,0x5034,0x5035,/* 0x90-0x97 */
+	0x5036,0x5037,0x5038,0x5039,0x503B,0x503D,0x503F,0x5040,/* 0x98-0x9F */
+	0x5041,0x5042,0x5044,0x5045,0x5046,0x5049,0x504A,0x504B,/* 0xA0-0xA7 */
+	0x504D,0x5050,0x5051,0x5052,0x5053,0x5054,0x5056,0x5057,/* 0xA8-0xAF */
+	0x5058,0x5059,0x505B,0x505D,0x505E,0x505F,0x5060,0x5061,/* 0xB0-0xB7 */
+	0x5062,0x5063,0x5064,0x5066,0x5067,0x5068,0x5069,0x506A,/* 0xB8-0xBF */
+	0x506B,0x506D,0x506E,0x506F,0x5070,0x5071,0x5072,0x5073,/* 0xC0-0xC7 */
+	0x5074,0x5075,0x5078,0x5079,0x507A,0x507C,0x507D,0x5081,/* 0xC8-0xCF */
+	0x5082,0x5083,0x5084,0x5086,0x5087,0x5089,0x508A,0x508B,/* 0xD0-0xD7 */
+	0x508C,0x508E,0x508F,0x5090,0x5091,0x5092,0x5093,0x5094,/* 0xD8-0xDF */
+	0x5095,0x5096,0x5097,0x5098,0x5099,0x509A,0x509B,0x509C,/* 0xE0-0xE7 */
+	0x509D,0x509E,0x509F,0x50A0,0x50A1,0x50A2,0x50A4,0x50A6,/* 0xE8-0xEF */
+	0x50AA,0x50AB,0x50AD,0x50AE,0x50AF,0x50B0,0x50B1,0x50B3,/* 0xF0-0xF7 */
+	0x50B4,0x50B5,0x50B6,0x50B7,0x50B8,0x50B9,0x50BC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_83[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x50BD,0x50BE,0x50BF,0x50C0,0x50C1,0x50C2,0x50C3,0x50C4,/* 0x40-0x47 */
+	0x50C5,0x50C6,0x50C7,0x50C8,0x50C9,0x50CA,0x50CB,0x50CC,/* 0x48-0x4F */
+	0x50CD,0x50CE,0x50D0,0x50D1,0x50D2,0x50D3,0x50D4,0x50D5,/* 0x50-0x57 */
+	0x50D7,0x50D8,0x50D9,0x50DB,0x50DC,0x50DD,0x50DE,0x50DF,/* 0x58-0x5F */
+	0x50E0,0x50E1,0x50E2,0x50E3,0x50E4,0x50E5,0x50E8,0x50E9,/* 0x60-0x67 */
+	0x50EA,0x50EB,0x50EF,0x50F0,0x50F1,0x50F2,0x50F4,0x50F6,/* 0x68-0x6F */
+	0x50F7,0x50F8,0x50F9,0x50FA,0x50FC,0x50FD,0x50FE,0x50FF,/* 0x70-0x77 */
+	0x5100,0x5101,0x5102,0x5103,0x5104,0x5105,0x5108,0x0000,/* 0x78-0x7F */
+
+	0x5109,0x510A,0x510C,0x510D,0x510E,0x510F,0x5110,0x5111,/* 0x80-0x87 */
+	0x5113,0x5114,0x5115,0x5116,0x5117,0x5118,0x5119,0x511A,/* 0x88-0x8F */
+	0x511B,0x511C,0x511D,0x511E,0x511F,0x5120,0x5122,0x5123,/* 0x90-0x97 */
+	0x5124,0x5125,0x5126,0x5127,0x5128,0x5129,0x512A,0x512B,/* 0x98-0x9F */
+	0x512C,0x512D,0x512E,0x512F,0x5130,0x5131,0x5132,0x5133,/* 0xA0-0xA7 */
+	0x5134,0x5135,0x5136,0x5137,0x5138,0x5139,0x513A,0x513B,/* 0xA8-0xAF */
+	0x513C,0x513D,0x513E,0x5142,0x5147,0x514A,0x514C,0x514E,/* 0xB0-0xB7 */
+	0x514F,0x5150,0x5152,0x5153,0x5157,0x5158,0x5159,0x515B,/* 0xB8-0xBF */
+	0x515D,0x515E,0x515F,0x5160,0x5161,0x5163,0x5164,0x5166,/* 0xC0-0xC7 */
+	0x5167,0x5169,0x516A,0x516F,0x5172,0x517A,0x517E,0x517F,/* 0xC8-0xCF */
+	0x5183,0x5184,0x5186,0x5187,0x518A,0x518B,0x518E,0x518F,/* 0xD0-0xD7 */
+	0x5190,0x5191,0x5193,0x5194,0x5198,0x519A,0x519D,0x519E,/* 0xD8-0xDF */
+	0x519F,0x51A1,0x51A3,0x51A6,0x51A7,0x51A8,0x51A9,0x51AA,/* 0xE0-0xE7 */
+	0x51AD,0x51AE,0x51B4,0x51B8,0x51B9,0x51BA,0x51BE,0x51BF,/* 0xE8-0xEF */
+	0x51C1,0x51C2,0x51C3,0x51C5,0x51C8,0x51CA,0x51CD,0x51CE,/* 0xF0-0xF7 */
+	0x51D0,0x51D2,0x51D3,0x51D4,0x51D5,0x51D6,0x51D7,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_84[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x51D8,0x51D9,0x51DA,0x51DC,0x51DE,0x51DF,0x51E2,0x51E3,/* 0x40-0x47 */
+	0x51E5,0x51E6,0x51E7,0x51E8,0x51E9,0x51EA,0x51EC,0x51EE,/* 0x48-0x4F */
+	0x51F1,0x51F2,0x51F4,0x51F7,0x51FE,0x5204,0x5205,0x5209,/* 0x50-0x57 */
+	0x520B,0x520C,0x520F,0x5210,0x5213,0x5214,0x5215,0x521C,/* 0x58-0x5F */
+	0x521E,0x521F,0x5221,0x5222,0x5223,0x5225,0x5226,0x5227,/* 0x60-0x67 */
+	0x522A,0x522C,0x522F,0x5231,0x5232,0x5234,0x5235,0x523C,/* 0x68-0x6F */
+	0x523E,0x5244,0x5245,0x5246,0x5247,0x5248,0x5249,0x524B,/* 0x70-0x77 */
+	0x524E,0x524F,0x5252,0x5253,0x5255,0x5257,0x5258,0x0000,/* 0x78-0x7F */
+
+	0x5259,0x525A,0x525B,0x525D,0x525F,0x5260,0x5262,0x5263,/* 0x80-0x87 */
+	0x5264,0x5266,0x5268,0x526B,0x526C,0x526D,0x526E,0x5270,/* 0x88-0x8F */
+	0x5271,0x5273,0x5274,0x5275,0x5276,0x5277,0x5278,0x5279,/* 0x90-0x97 */
+	0x527A,0x527B,0x527C,0x527E,0x5280,0x5283,0x5284,0x5285,/* 0x98-0x9F */
+	0x5286,0x5287,0x5289,0x528A,0x528B,0x528C,0x528D,0x528E,/* 0xA0-0xA7 */
+	0x528F,0x5291,0x5292,0x5294,0x5295,0x5296,0x5297,0x5298,/* 0xA8-0xAF */
+	0x5299,0x529A,0x529C,0x52A4,0x52A5,0x52A6,0x52A7,0x52AE,/* 0xB0-0xB7 */
+	0x52AF,0x52B0,0x52B4,0x52B5,0x52B6,0x52B7,0x52B8,0x52B9,/* 0xB8-0xBF */
+	0x52BA,0x52BB,0x52BC,0x52BD,0x52C0,0x52C1,0x52C2,0x52C4,/* 0xC0-0xC7 */
+	0x52C5,0x52C6,0x52C8,0x52CA,0x52CC,0x52CD,0x52CE,0x52CF,/* 0xC8-0xCF */
+	0x52D1,0x52D3,0x52D4,0x52D5,0x52D7,0x52D9,0x52DA,0x52DB,/* 0xD0-0xD7 */
+	0x52DC,0x52DD,0x52DE,0x52E0,0x52E1,0x52E2,0x52E3,0x52E5,/* 0xD8-0xDF */
+	0x52E6,0x52E7,0x52E8,0x52E9,0x52EA,0x52EB,0x52EC,0x52ED,/* 0xE0-0xE7 */
+	0x52EE,0x52EF,0x52F1,0x52F2,0x52F3,0x52F4,0x52F5,0x52F6,/* 0xE8-0xEF */
+	0x52F7,0x52F8,0x52FB,0x52FC,0x52FD,0x5301,0x5302,0x5303,/* 0xF0-0xF7 */
+	0x5304,0x5307,0x5309,0x530A,0x530B,0x530C,0x530E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_85[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5311,0x5312,0x5313,0x5314,0x5318,0x531B,0x531C,0x531E,/* 0x40-0x47 */
+	0x531F,0x5322,0x5324,0x5325,0x5327,0x5328,0x5329,0x532B,/* 0x48-0x4F */
+	0x532C,0x532D,0x532F,0x5330,0x5331,0x5332,0x5333,0x5334,/* 0x50-0x57 */
+	0x5335,0x5336,0x5337,0x5338,0x533C,0x533D,0x5340,0x5342,/* 0x58-0x5F */
+	0x5344,0x5346,0x534B,0x534C,0x534D,0x5350,0x5354,0x5358,/* 0x60-0x67 */
+	0x5359,0x535B,0x535D,0x5365,0x5368,0x536A,0x536C,0x536D,/* 0x68-0x6F */
+	0x5372,0x5376,0x5379,0x537B,0x537C,0x537D,0x537E,0x5380,/* 0x70-0x77 */
+	0x5381,0x5383,0x5387,0x5388,0x538A,0x538E,0x538F,0x0000,/* 0x78-0x7F */
+
+	0x5390,0x5391,0x5392,0x5393,0x5394,0x5396,0x5397,0x5399,/* 0x80-0x87 */
+	0x539B,0x539C,0x539E,0x53A0,0x53A1,0x53A4,0x53A7,0x53AA,/* 0x88-0x8F */
+	0x53AB,0x53AC,0x53AD,0x53AF,0x53B0,0x53B1,0x53B2,0x53B3,/* 0x90-0x97 */
+	0x53B4,0x53B5,0x53B7,0x53B8,0x53B9,0x53BA,0x53BC,0x53BD,/* 0x98-0x9F */
+	0x53BE,0x53C0,0x53C3,0x53C4,0x53C5,0x53C6,0x53C7,0x53CE,/* 0xA0-0xA7 */
+	0x53CF,0x53D0,0x53D2,0x53D3,0x53D5,0x53DA,0x53DC,0x53DD,/* 0xA8-0xAF */
+	0x53DE,0x53E1,0x53E2,0x53E7,0x53F4,0x53FA,0x53FE,0x53FF,/* 0xB0-0xB7 */
+	0x5400,0x5402,0x5405,0x5407,0x540B,0x5414,0x5418,0x5419,/* 0xB8-0xBF */
+	0x541A,0x541C,0x5422,0x5424,0x5425,0x542A,0x5430,0x5433,/* 0xC0-0xC7 */
+	0x5436,0x5437,0x543A,0x543D,0x543F,0x5441,0x5442,0x5444,/* 0xC8-0xCF */
+	0x5445,0x5447,0x5449,0x544C,0x544D,0x544E,0x544F,0x5451,/* 0xD0-0xD7 */
+	0x545A,0x545D,0x545E,0x545F,0x5460,0x5461,0x5463,0x5465,/* 0xD8-0xDF */
+	0x5467,0x5469,0x546A,0x546B,0x546C,0x546D,0x546E,0x546F,/* 0xE0-0xE7 */
+	0x5470,0x5474,0x5479,0x547A,0x547E,0x547F,0x5481,0x5483,/* 0xE8-0xEF */
+	0x5485,0x5487,0x5488,0x5489,0x548A,0x548D,0x5491,0x5493,/* 0xF0-0xF7 */
+	0x5497,0x5498,0x549C,0x549E,0x549F,0x54A0,0x54A1,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_86[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x54A2,0x54A5,0x54AE,0x54B0,0x54B2,0x54B5,0x54B6,0x54B7,/* 0x40-0x47 */
+	0x54B9,0x54BA,0x54BC,0x54BE,0x54C3,0x54C5,0x54CA,0x54CB,/* 0x48-0x4F */
+	0x54D6,0x54D8,0x54DB,0x54E0,0x54E1,0x54E2,0x54E3,0x54E4,/* 0x50-0x57 */
+	0x54EB,0x54EC,0x54EF,0x54F0,0x54F1,0x54F4,0x54F5,0x54F6,/* 0x58-0x5F */
+	0x54F7,0x54F8,0x54F9,0x54FB,0x54FE,0x5500,0x5502,0x5503,/* 0x60-0x67 */
+	0x5504,0x5505,0x5508,0x550A,0x550B,0x550C,0x550D,0x550E,/* 0x68-0x6F */
+	0x5512,0x5513,0x5515,0x5516,0x5517,0x5518,0x5519,0x551A,/* 0x70-0x77 */
+	0x551C,0x551D,0x551E,0x551F,0x5521,0x5525,0x5526,0x0000,/* 0x78-0x7F */
+
+	0x5528,0x5529,0x552B,0x552D,0x5532,0x5534,0x5535,0x5536,/* 0x80-0x87 */
+	0x5538,0x5539,0x553A,0x553B,0x553D,0x5540,0x5542,0x5545,/* 0x88-0x8F */
+	0x5547,0x5548,0x554B,0x554C,0x554D,0x554E,0x554F,0x5551,/* 0x90-0x97 */
+	0x5552,0x5553,0x5554,0x5557,0x5558,0x5559,0x555A,0x555B,/* 0x98-0x9F */
+	0x555D,0x555E,0x555F,0x5560,0x5562,0x5563,0x5568,0x5569,/* 0xA0-0xA7 */
+	0x556B,0x556F,0x5570,0x5571,0x5572,0x5573,0x5574,0x5579,/* 0xA8-0xAF */
+	0x557A,0x557D,0x557F,0x5585,0x5586,0x558C,0x558D,0x558E,/* 0xB0-0xB7 */
+	0x5590,0x5592,0x5593,0x5595,0x5596,0x5597,0x559A,0x559B,/* 0xB8-0xBF */
+	0x559E,0x55A0,0x55A1,0x55A2,0x55A3,0x55A4,0x55A5,0x55A6,/* 0xC0-0xC7 */
+	0x55A8,0x55A9,0x55AA,0x55AB,0x55AC,0x55AD,0x55AE,0x55AF,/* 0xC8-0xCF */
+	0x55B0,0x55B2,0x55B4,0x55B6,0x55B8,0x55BA,0x55BC,0x55BF,/* 0xD0-0xD7 */
+	0x55C0,0x55C1,0x55C2,0x55C3,0x55C6,0x55C7,0x55C8,0x55CA,/* 0xD8-0xDF */
+	0x55CB,0x55CE,0x55CF,0x55D0,0x55D5,0x55D7,0x55D8,0x55D9,/* 0xE0-0xE7 */
+	0x55DA,0x55DB,0x55DE,0x55E0,0x55E2,0x55E7,0x55E9,0x55ED,/* 0xE8-0xEF */
+	0x55EE,0x55F0,0x55F1,0x55F4,0x55F6,0x55F8,0x55F9,0x55FA,/* 0xF0-0xF7 */
+	0x55FB,0x55FC,0x55FF,0x5602,0x5603,0x5604,0x5605,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_87[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5606,0x5607,0x560A,0x560B,0x560D,0x5610,0x5611,0x5612,/* 0x40-0x47 */
+	0x5613,0x5614,0x5615,0x5616,0x5617,0x5619,0x561A,0x561C,/* 0x48-0x4F */
+	0x561D,0x5620,0x5621,0x5622,0x5625,0x5626,0x5628,0x5629,/* 0x50-0x57 */
+	0x562A,0x562B,0x562E,0x562F,0x5630,0x5633,0x5635,0x5637,/* 0x58-0x5F */
+	0x5638,0x563A,0x563C,0x563D,0x563E,0x5640,0x5641,0x5642,/* 0x60-0x67 */
+	0x5643,0x5644,0x5645,0x5646,0x5647,0x5648,0x5649,0x564A,/* 0x68-0x6F */
+	0x564B,0x564F,0x5650,0x5651,0x5652,0x5653,0x5655,0x5656,/* 0x70-0x77 */
+	0x565A,0x565B,0x565D,0x565E,0x565F,0x5660,0x5661,0x0000,/* 0x78-0x7F */
+
+	0x5663,0x5665,0x5666,0x5667,0x566D,0x566E,0x566F,0x5670,/* 0x80-0x87 */
+	0x5672,0x5673,0x5674,0x5675,0x5677,0x5678,0x5679,0x567A,/* 0x88-0x8F */
+	0x567D,0x567E,0x567F,0x5680,0x5681,0x5682,0x5683,0x5684,/* 0x90-0x97 */
+	0x5687,0x5688,0x5689,0x568A,0x568B,0x568C,0x568D,0x5690,/* 0x98-0x9F */
+	0x5691,0x5692,0x5694,0x5695,0x5696,0x5697,0x5698,0x5699,/* 0xA0-0xA7 */
+	0x569A,0x569B,0x569C,0x569D,0x569E,0x569F,0x56A0,0x56A1,/* 0xA8-0xAF */
+	0x56A2,0x56A4,0x56A5,0x56A6,0x56A7,0x56A8,0x56A9,0x56AA,/* 0xB0-0xB7 */
+	0x56AB,0x56AC,0x56AD,0x56AE,0x56B0,0x56B1,0x56B2,0x56B3,/* 0xB8-0xBF */
+	0x56B4,0x56B5,0x56B6,0x56B8,0x56B9,0x56BA,0x56BB,0x56BD,/* 0xC0-0xC7 */
+	0x56BE,0x56BF,0x56C0,0x56C1,0x56C2,0x56C3,0x56C4,0x56C5,/* 0xC8-0xCF */
+	0x56C6,0x56C7,0x56C8,0x56C9,0x56CB,0x56CC,0x56CD,0x56CE,/* 0xD0-0xD7 */
+	0x56CF,0x56D0,0x56D1,0x56D2,0x56D3,0x56D5,0x56D6,0x56D8,/* 0xD8-0xDF */
+	0x56D9,0x56DC,0x56E3,0x56E5,0x56E6,0x56E7,0x56E8,0x56E9,/* 0xE0-0xE7 */
+	0x56EA,0x56EC,0x56EE,0x56EF,0x56F2,0x56F3,0x56F6,0x56F7,/* 0xE8-0xEF */
+	0x56F8,0x56FB,0x56FC,0x5700,0x5701,0x5702,0x5705,0x5707,/* 0xF0-0xF7 */
+	0x570B,0x570C,0x570D,0x570E,0x570F,0x5710,0x5711,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_88[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5712,0x5713,0x5714,0x5715,0x5716,0x5717,0x5718,0x5719,/* 0x40-0x47 */
+	0x571A,0x571B,0x571D,0x571E,0x5720,0x5721,0x5722,0x5724,/* 0x48-0x4F */
+	0x5725,0x5726,0x5727,0x572B,0x5731,0x5732,0x5734,0x5735,/* 0x50-0x57 */
+	0x5736,0x5737,0x5738,0x573C,0x573D,0x573F,0x5741,0x5743,/* 0x58-0x5F */
+	0x5744,0x5745,0x5746,0x5748,0x5749,0x574B,0x5752,0x5753,/* 0x60-0x67 */
+	0x5754,0x5755,0x5756,0x5758,0x5759,0x5762,0x5763,0x5765,/* 0x68-0x6F */
+	0x5767,0x576C,0x576E,0x5770,0x5771,0x5772,0x5774,0x5775,/* 0x70-0x77 */
+	0x5778,0x5779,0x577A,0x577D,0x577E,0x577F,0x5780,0x0000,/* 0x78-0x7F */
+
+	0x5781,0x5787,0x5788,0x5789,0x578A,0x578D,0x578E,0x578F,/* 0x80-0x87 */
+	0x5790,0x5791,0x5794,0x5795,0x5796,0x5797,0x5798,0x5799,/* 0x88-0x8F */
+	0x579A,0x579C,0x579D,0x579E,0x579F,0x57A5,0x57A8,0x57AA,/* 0x90-0x97 */
+	0x57AC,0x57AF,0x57B0,0x57B1,0x57B3,0x57B5,0x57B6,0x57B7,/* 0x98-0x9F */
+	0x57B9,0x57BA,0x57BB,0x57BC,0x57BD,0x57BE,0x57BF,0x57C0,/* 0xA0-0xA7 */
+	0x57C1,0x57C4,0x57C5,0x57C6,0x57C7,0x57C8,0x57C9,0x57CA,/* 0xA8-0xAF */
+	0x57CC,0x57CD,0x57D0,0x57D1,0x57D3,0x57D6,0x57D7,0x57DB,/* 0xB0-0xB7 */
+	0x57DC,0x57DE,0x57E1,0x57E2,0x57E3,0x57E5,0x57E6,0x57E7,/* 0xB8-0xBF */
+	0x57E8,0x57E9,0x57EA,0x57EB,0x57EC,0x57EE,0x57F0,0x57F1,/* 0xC0-0xC7 */
+	0x57F2,0x57F3,0x57F5,0x57F6,0x57F7,0x57FB,0x57FC,0x57FE,/* 0xC8-0xCF */
+	0x57FF,0x5801,0x5803,0x5804,0x5805,0x5808,0x5809,0x580A,/* 0xD0-0xD7 */
+	0x580C,0x580E,0x580F,0x5810,0x5812,0x5813,0x5814,0x5816,/* 0xD8-0xDF */
+	0x5817,0x5818,0x581A,0x581B,0x581C,0x581D,0x581F,0x5822,/* 0xE0-0xE7 */
+	0x5823,0x5825,0x5826,0x5827,0x5828,0x5829,0x582B,0x582C,/* 0xE8-0xEF */
+	0x582D,0x582E,0x582F,0x5831,0x5832,0x5833,0x5834,0x5836,/* 0xF0-0xF7 */
+	0x5837,0x5838,0x5839,0x583A,0x583B,0x583C,0x583D,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_89[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x583E,0x583F,0x5840,0x5841,0x5842,0x5843,0x5845,0x5846,/* 0x40-0x47 */
+	0x5847,0x5848,0x5849,0x584A,0x584B,0x584E,0x584F,0x5850,/* 0x48-0x4F */
+	0x5852,0x5853,0x5855,0x5856,0x5857,0x5859,0x585A,0x585B,/* 0x50-0x57 */
+	0x585C,0x585D,0x585F,0x5860,0x5861,0x5862,0x5863,0x5864,/* 0x58-0x5F */
+	0x5866,0x5867,0x5868,0x5869,0x586A,0x586D,0x586E,0x586F,/* 0x60-0x67 */
+	0x5870,0x5871,0x5872,0x5873,0x5874,0x5875,0x5876,0x5877,/* 0x68-0x6F */
+	0x5878,0x5879,0x587A,0x587B,0x587C,0x587D,0x587F,0x5882,/* 0x70-0x77 */
+	0x5884,0x5886,0x5887,0x5888,0x588A,0x588B,0x588C,0x0000,/* 0x78-0x7F */
+
+	0x588D,0x588E,0x588F,0x5890,0x5891,0x5894,0x5895,0x5896,/* 0x80-0x87 */
+	0x5897,0x5898,0x589B,0x589C,0x589D,0x58A0,0x58A1,0x58A2,/* 0x88-0x8F */
+	0x58A3,0x58A4,0x58A5,0x58A6,0x58A7,0x58AA,0x58AB,0x58AC,/* 0x90-0x97 */
+	0x58AD,0x58AE,0x58AF,0x58B0,0x58B1,0x58B2,0x58B3,0x58B4,/* 0x98-0x9F */
+	0x58B5,0x58B6,0x58B7,0x58B8,0x58B9,0x58BA,0x58BB,0x58BD,/* 0xA0-0xA7 */
+	0x58BE,0x58BF,0x58C0,0x58C2,0x58C3,0x58C4,0x58C6,0x58C7,/* 0xA8-0xAF */
+	0x58C8,0x58C9,0x58CA,0x58CB,0x58CC,0x58CD,0x58CE,0x58CF,/* 0xB0-0xB7 */
+	0x58D0,0x58D2,0x58D3,0x58D4,0x58D6,0x58D7,0x58D8,0x58D9,/* 0xB8-0xBF */
+	0x58DA,0x58DB,0x58DC,0x58DD,0x58DE,0x58DF,0x58E0,0x58E1,/* 0xC0-0xC7 */
+	0x58E2,0x58E3,0x58E5,0x58E6,0x58E7,0x58E8,0x58E9,0x58EA,/* 0xC8-0xCF */
+	0x58ED,0x58EF,0x58F1,0x58F2,0x58F4,0x58F5,0x58F7,0x58F8,/* 0xD0-0xD7 */
+	0x58FA,0x58FB,0x58FC,0x58FD,0x58FE,0x58FF,0x5900,0x5901,/* 0xD8-0xDF */
+	0x5903,0x5905,0x5906,0x5908,0x5909,0x590A,0x590B,0x590C,/* 0xE0-0xE7 */
+	0x590E,0x5910,0x5911,0x5912,0x5913,0x5917,0x5918,0x591B,/* 0xE8-0xEF */
+	0x591D,0x591E,0x5920,0x5921,0x5922,0x5923,0x5926,0x5928,/* 0xF0-0xF7 */
+	0x592C,0x5930,0x5932,0x5933,0x5935,0x5936,0x593B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8A[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x593D,0x593E,0x593F,0x5940,0x5943,0x5945,0x5946,0x594A,/* 0x40-0x47 */
+	0x594C,0x594D,0x5950,0x5952,0x5953,0x5959,0x595B,0x595C,/* 0x48-0x4F */
+	0x595D,0x595E,0x595F,0x5961,0x5963,0x5964,0x5966,0x5967,/* 0x50-0x57 */
+	0x5968,0x5969,0x596A,0x596B,0x596C,0x596D,0x596E,0x596F,/* 0x58-0x5F */
+	0x5970,0x5971,0x5972,0x5975,0x5977,0x597A,0x597B,0x597C,/* 0x60-0x67 */
+	0x597E,0x597F,0x5980,0x5985,0x5989,0x598B,0x598C,0x598E,/* 0x68-0x6F */
+	0x598F,0x5990,0x5991,0x5994,0x5995,0x5998,0x599A,0x599B,/* 0x70-0x77 */
+	0x599C,0x599D,0x599F,0x59A0,0x59A1,0x59A2,0x59A6,0x0000,/* 0x78-0x7F */
+
+	0x59A7,0x59AC,0x59AD,0x59B0,0x59B1,0x59B3,0x59B4,0x59B5,/* 0x80-0x87 */
+	0x59B6,0x59B7,0x59B8,0x59BA,0x59BC,0x59BD,0x59BF,0x59C0,/* 0x88-0x8F */
+	0x59C1,0x59C2,0x59C3,0x59C4,0x59C5,0x59C7,0x59C8,0x59C9,/* 0x90-0x97 */
+	0x59CC,0x59CD,0x59CE,0x59CF,0x59D5,0x59D6,0x59D9,0x59DB,/* 0x98-0x9F */
+	0x59DE,0x59DF,0x59E0,0x59E1,0x59E2,0x59E4,0x59E6,0x59E7,/* 0xA0-0xA7 */
+	0x59E9,0x59EA,0x59EB,0x59ED,0x59EE,0x59EF,0x59F0,0x59F1,/* 0xA8-0xAF */
+	0x59F2,0x59F3,0x59F4,0x59F5,0x59F6,0x59F7,0x59F8,0x59FA,/* 0xB0-0xB7 */
+	0x59FC,0x59FD,0x59FE,0x5A00,0x5A02,0x5A0A,0x5A0B,0x5A0D,/* 0xB8-0xBF */
+	0x5A0E,0x5A0F,0x5A10,0x5A12,0x5A14,0x5A15,0x5A16,0x5A17,/* 0xC0-0xC7 */
+	0x5A19,0x5A1A,0x5A1B,0x5A1D,0x5A1E,0x5A21,0x5A22,0x5A24,/* 0xC8-0xCF */
+	0x5A26,0x5A27,0x5A28,0x5A2A,0x5A2B,0x5A2C,0x5A2D,0x5A2E,/* 0xD0-0xD7 */
+	0x5A2F,0x5A30,0x5A33,0x5A35,0x5A37,0x5A38,0x5A39,0x5A3A,/* 0xD8-0xDF */
+	0x5A3B,0x5A3D,0x5A3E,0x5A3F,0x5A41,0x5A42,0x5A43,0x5A44,/* 0xE0-0xE7 */
+	0x5A45,0x5A47,0x5A48,0x5A4B,0x5A4C,0x5A4D,0x5A4E,0x5A4F,/* 0xE8-0xEF */
+	0x5A50,0x5A51,0x5A52,0x5A53,0x5A54,0x5A56,0x5A57,0x5A58,/* 0xF0-0xF7 */
+	0x5A59,0x5A5B,0x5A5C,0x5A5D,0x5A5E,0x5A5F,0x5A60,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8B[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5A61,0x5A63,0x5A64,0x5A65,0x5A66,0x5A68,0x5A69,0x5A6B,/* 0x40-0x47 */
+	0x5A6C,0x5A6D,0x5A6E,0x5A6F,0x5A70,0x5A71,0x5A72,0x5A73,/* 0x48-0x4F */
+	0x5A78,0x5A79,0x5A7B,0x5A7C,0x5A7D,0x5A7E,0x5A80,0x5A81,/* 0x50-0x57 */
+	0x5A82,0x5A83,0x5A84,0x5A85,0x5A86,0x5A87,0x5A88,0x5A89,/* 0x58-0x5F */
+	0x5A8A,0x5A8B,0x5A8C,0x5A8D,0x5A8E,0x5A8F,0x5A90,0x5A91,/* 0x60-0x67 */
+	0x5A93,0x5A94,0x5A95,0x5A96,0x5A97,0x5A98,0x5A99,0x5A9C,/* 0x68-0x6F */
+	0x5A9D,0x5A9E,0x5A9F,0x5AA0,0x5AA1,0x5AA2,0x5AA3,0x5AA4,/* 0x70-0x77 */
+	0x5AA5,0x5AA6,0x5AA7,0x5AA8,0x5AA9,0x5AAB,0x5AAC,0x0000,/* 0x78-0x7F */
+
+	0x5AAD,0x5AAE,0x5AAF,0x5AB0,0x5AB1,0x5AB4,0x5AB6,0x5AB7,/* 0x80-0x87 */
+	0x5AB9,0x5ABA,0x5ABB,0x5ABC,0x5ABD,0x5ABF,0x5AC0,0x5AC3,/* 0x88-0x8F */
+	0x5AC4,0x5AC5,0x5AC6,0x5AC7,0x5AC8,0x5ACA,0x5ACB,0x5ACD,/* 0x90-0x97 */
+	0x5ACE,0x5ACF,0x5AD0,0x5AD1,0x5AD3,0x5AD5,0x5AD7,0x5AD9,/* 0x98-0x9F */
+	0x5ADA,0x5ADB,0x5ADD,0x5ADE,0x5ADF,0x5AE2,0x5AE4,0x5AE5,/* 0xA0-0xA7 */
+	0x5AE7,0x5AE8,0x5AEA,0x5AEC,0x5AED,0x5AEE,0x5AEF,0x5AF0,/* 0xA8-0xAF */
+	0x5AF2,0x5AF3,0x5AF4,0x5AF5,0x5AF6,0x5AF7,0x5AF8,0x5AF9,/* 0xB0-0xB7 */
+	0x5AFA,0x5AFB,0x5AFC,0x5AFD,0x5AFE,0x5AFF,0x5B00,0x5B01,/* 0xB8-0xBF */
+	0x5B02,0x5B03,0x5B04,0x5B05,0x5B06,0x5B07,0x5B08,0x5B0A,/* 0xC0-0xC7 */
+	0x5B0B,0x5B0C,0x5B0D,0x5B0E,0x5B0F,0x5B10,0x5B11,0x5B12,/* 0xC8-0xCF */
+	0x5B13,0x5B14,0x5B15,0x5B18,0x5B19,0x5B1A,0x5B1B,0x5B1C,/* 0xD0-0xD7 */
+	0x5B1D,0x5B1E,0x5B1F,0x5B20,0x5B21,0x5B22,0x5B23,0x5B24,/* 0xD8-0xDF */
+	0x5B25,0x5B26,0x5B27,0x5B28,0x5B29,0x5B2A,0x5B2B,0x5B2C,/* 0xE0-0xE7 */
+	0x5B2D,0x5B2E,0x5B2F,0x5B30,0x5B31,0x5B33,0x5B35,0x5B36,/* 0xE8-0xEF */
+	0x5B38,0x5B39,0x5B3A,0x5B3B,0x5B3C,0x5B3D,0x5B3E,0x5B3F,/* 0xF0-0xF7 */
+	0x5B41,0x5B42,0x5B43,0x5B44,0x5B45,0x5B46,0x5B47,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8C[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5B48,0x5B49,0x5B4A,0x5B4B,0x5B4C,0x5B4D,0x5B4E,0x5B4F,/* 0x40-0x47 */
+	0x5B52,0x5B56,0x5B5E,0x5B60,0x5B61,0x5B67,0x5B68,0x5B6B,/* 0x48-0x4F */
+	0x5B6D,0x5B6E,0x5B6F,0x5B72,0x5B74,0x5B76,0x5B77,0x5B78,/* 0x50-0x57 */
+	0x5B79,0x5B7B,0x5B7C,0x5B7E,0x5B7F,0x5B82,0x5B86,0x5B8A,/* 0x58-0x5F */
+	0x5B8D,0x5B8E,0x5B90,0x5B91,0x5B92,0x5B94,0x5B96,0x5B9F,/* 0x60-0x67 */
+	0x5BA7,0x5BA8,0x5BA9,0x5BAC,0x5BAD,0x5BAE,0x5BAF,0x5BB1,/* 0x68-0x6F */
+	0x5BB2,0x5BB7,0x5BBA,0x5BBB,0x5BBC,0x5BC0,0x5BC1,0x5BC3,/* 0x70-0x77 */
+	0x5BC8,0x5BC9,0x5BCA,0x5BCB,0x5BCD,0x5BCE,0x5BCF,0x0000,/* 0x78-0x7F */
+
+	0x5BD1,0x5BD4,0x5BD5,0x5BD6,0x5BD7,0x5BD8,0x5BD9,0x5BDA,/* 0x80-0x87 */
+	0x5BDB,0x5BDC,0x5BE0,0x5BE2,0x5BE3,0x5BE6,0x5BE7,0x5BE9,/* 0x88-0x8F */
+	0x5BEA,0x5BEB,0x5BEC,0x5BED,0x5BEF,0x5BF1,0x5BF2,0x5BF3,/* 0x90-0x97 */
+	0x5BF4,0x5BF5,0x5BF6,0x5BF7,0x5BFD,0x5BFE,0x5C00,0x5C02,/* 0x98-0x9F */
+	0x5C03,0x5C05,0x5C07,0x5C08,0x5C0B,0x5C0C,0x5C0D,0x5C0E,/* 0xA0-0xA7 */
+	0x5C10,0x5C12,0x5C13,0x5C17,0x5C19,0x5C1B,0x5C1E,0x5C1F,/* 0xA8-0xAF */
+	0x5C20,0x5C21,0x5C23,0x5C26,0x5C28,0x5C29,0x5C2A,0x5C2B,/* 0xB0-0xB7 */
+	0x5C2D,0x5C2E,0x5C2F,0x5C30,0x5C32,0x5C33,0x5C35,0x5C36,/* 0xB8-0xBF */
+	0x5C37,0x5C43,0x5C44,0x5C46,0x5C47,0x5C4C,0x5C4D,0x5C52,/* 0xC0-0xC7 */
+	0x5C53,0x5C54,0x5C56,0x5C57,0x5C58,0x5C5A,0x5C5B,0x5C5C,/* 0xC8-0xCF */
+	0x5C5D,0x5C5F,0x5C62,0x5C64,0x5C67,0x5C68,0x5C69,0x5C6A,/* 0xD0-0xD7 */
+	0x5C6B,0x5C6C,0x5C6D,0x5C70,0x5C72,0x5C73,0x5C74,0x5C75,/* 0xD8-0xDF */
+	0x5C76,0x5C77,0x5C78,0x5C7B,0x5C7C,0x5C7D,0x5C7E,0x5C80,/* 0xE0-0xE7 */
+	0x5C83,0x5C84,0x5C85,0x5C86,0x5C87,0x5C89,0x5C8A,0x5C8B,/* 0xE8-0xEF */
+	0x5C8E,0x5C8F,0x5C92,0x5C93,0x5C95,0x5C9D,0x5C9E,0x5C9F,/* 0xF0-0xF7 */
+	0x5CA0,0x5CA1,0x5CA4,0x5CA5,0x5CA6,0x5CA7,0x5CA8,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8D[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5CAA,0x5CAE,0x5CAF,0x5CB0,0x5CB2,0x5CB4,0x5CB6,0x5CB9,/* 0x40-0x47 */
+	0x5CBA,0x5CBB,0x5CBC,0x5CBE,0x5CC0,0x5CC2,0x5CC3,0x5CC5,/* 0x48-0x4F */
+	0x5CC6,0x5CC7,0x5CC8,0x5CC9,0x5CCA,0x5CCC,0x5CCD,0x5CCE,/* 0x50-0x57 */
+	0x5CCF,0x5CD0,0x5CD1,0x5CD3,0x5CD4,0x5CD5,0x5CD6,0x5CD7,/* 0x58-0x5F */
+	0x5CD8,0x5CDA,0x5CDB,0x5CDC,0x5CDD,0x5CDE,0x5CDF,0x5CE0,/* 0x60-0x67 */
+	0x5CE2,0x5CE3,0x5CE7,0x5CE9,0x5CEB,0x5CEC,0x5CEE,0x5CEF,/* 0x68-0x6F */
+	0x5CF1,0x5CF2,0x5CF3,0x5CF4,0x5CF5,0x5CF6,0x5CF7,0x5CF8,/* 0x70-0x77 */
+	0x5CF9,0x5CFA,0x5CFC,0x5CFD,0x5CFE,0x5CFF,0x5D00,0x0000,/* 0x78-0x7F */
+
+	0x5D01,0x5D04,0x5D05,0x5D08,0x5D09,0x5D0A,0x5D0B,0x5D0C,/* 0x80-0x87 */
+	0x5D0D,0x5D0F,0x5D10,0x5D11,0x5D12,0x5D13,0x5D15,0x5D17,/* 0x88-0x8F */
+	0x5D18,0x5D19,0x5D1A,0x5D1C,0x5D1D,0x5D1F,0x5D20,0x5D21,/* 0x90-0x97 */
+	0x5D22,0x5D23,0x5D25,0x5D28,0x5D2A,0x5D2B,0x5D2C,0x5D2F,/* 0x98-0x9F */
+	0x5D30,0x5D31,0x5D32,0x5D33,0x5D35,0x5D36,0x5D37,0x5D38,/* 0xA0-0xA7 */
+	0x5D39,0x5D3A,0x5D3B,0x5D3C,0x5D3F,0x5D40,0x5D41,0x5D42,/* 0xA8-0xAF */
+	0x5D43,0x5D44,0x5D45,0x5D46,0x5D48,0x5D49,0x5D4D,0x5D4E,/* 0xB0-0xB7 */
+	0x5D4F,0x5D50,0x5D51,0x5D52,0x5D53,0x5D54,0x5D55,0x5D56,/* 0xB8-0xBF */
+	0x5D57,0x5D59,0x5D5A,0x5D5C,0x5D5E,0x5D5F,0x5D60,0x5D61,/* 0xC0-0xC7 */
+	0x5D62,0x5D63,0x5D64,0x5D65,0x5D66,0x5D67,0x5D68,0x5D6A,/* 0xC8-0xCF */
+	0x5D6D,0x5D6E,0x5D70,0x5D71,0x5D72,0x5D73,0x5D75,0x5D76,/* 0xD0-0xD7 */
+	0x5D77,0x5D78,0x5D79,0x5D7A,0x5D7B,0x5D7C,0x5D7D,0x5D7E,/* 0xD8-0xDF */
+	0x5D7F,0x5D80,0x5D81,0x5D83,0x5D84,0x5D85,0x5D86,0x5D87,/* 0xE0-0xE7 */
+	0x5D88,0x5D89,0x5D8A,0x5D8B,0x5D8C,0x5D8D,0x5D8E,0x5D8F,/* 0xE8-0xEF */
+	0x5D90,0x5D91,0x5D92,0x5D93,0x5D94,0x5D95,0x5D96,0x5D97,/* 0xF0-0xF7 */
+	0x5D98,0x5D9A,0x5D9B,0x5D9C,0x5D9E,0x5D9F,0x5DA0,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8E[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5DA1,0x5DA2,0x5DA3,0x5DA4,0x5DA5,0x5DA6,0x5DA7,0x5DA8,/* 0x40-0x47 */
+	0x5DA9,0x5DAA,0x5DAB,0x5DAC,0x5DAD,0x5DAE,0x5DAF,0x5DB0,/* 0x48-0x4F */
+	0x5DB1,0x5DB2,0x5DB3,0x5DB4,0x5DB5,0x5DB6,0x5DB8,0x5DB9,/* 0x50-0x57 */
+	0x5DBA,0x5DBB,0x5DBC,0x5DBD,0x5DBE,0x5DBF,0x5DC0,0x5DC1,/* 0x58-0x5F */
+	0x5DC2,0x5DC3,0x5DC4,0x5DC6,0x5DC7,0x5DC8,0x5DC9,0x5DCA,/* 0x60-0x67 */
+	0x5DCB,0x5DCC,0x5DCE,0x5DCF,0x5DD0,0x5DD1,0x5DD2,0x5DD3,/* 0x68-0x6F */
+	0x5DD4,0x5DD5,0x5DD6,0x5DD7,0x5DD8,0x5DD9,0x5DDA,0x5DDC,/* 0x70-0x77 */
+	0x5DDF,0x5DE0,0x5DE3,0x5DE4,0x5DEA,0x5DEC,0x5DED,0x0000,/* 0x78-0x7F */
+
+	0x5DF0,0x5DF5,0x5DF6,0x5DF8,0x5DF9,0x5DFA,0x5DFB,0x5DFC,/* 0x80-0x87 */
+	0x5DFF,0x5E00,0x5E04,0x5E07,0x5E09,0x5E0A,0x5E0B,0x5E0D,/* 0x88-0x8F */
+	0x5E0E,0x5E12,0x5E13,0x5E17,0x5E1E,0x5E1F,0x5E20,0x5E21,/* 0x90-0x97 */
+	0x5E22,0x5E23,0x5E24,0x5E25,0x5E28,0x5E29,0x5E2A,0x5E2B,/* 0x98-0x9F */
+	0x5E2C,0x5E2F,0x5E30,0x5E32,0x5E33,0x5E34,0x5E35,0x5E36,/* 0xA0-0xA7 */
+	0x5E39,0x5E3A,0x5E3E,0x5E3F,0x5E40,0x5E41,0x5E43,0x5E46,/* 0xA8-0xAF */
+	0x5E47,0x5E48,0x5E49,0x5E4A,0x5E4B,0x5E4D,0x5E4E,0x5E4F,/* 0xB0-0xB7 */
+	0x5E50,0x5E51,0x5E52,0x5E53,0x5E56,0x5E57,0x5E58,0x5E59,/* 0xB8-0xBF */
+	0x5E5A,0x5E5C,0x5E5D,0x5E5F,0x5E60,0x5E63,0x5E64,0x5E65,/* 0xC0-0xC7 */
+	0x5E66,0x5E67,0x5E68,0x5E69,0x5E6A,0x5E6B,0x5E6C,0x5E6D,/* 0xC8-0xCF */
+	0x5E6E,0x5E6F,0x5E70,0x5E71,0x5E75,0x5E77,0x5E79,0x5E7E,/* 0xD0-0xD7 */
+	0x5E81,0x5E82,0x5E83,0x5E85,0x5E88,0x5E89,0x5E8C,0x5E8D,/* 0xD8-0xDF */
+	0x5E8E,0x5E92,0x5E98,0x5E9B,0x5E9D,0x5EA1,0x5EA2,0x5EA3,/* 0xE0-0xE7 */
+	0x5EA4,0x5EA8,0x5EA9,0x5EAA,0x5EAB,0x5EAC,0x5EAE,0x5EAF,/* 0xE8-0xEF */
+	0x5EB0,0x5EB1,0x5EB2,0x5EB4,0x5EBA,0x5EBB,0x5EBC,0x5EBD,/* 0xF0-0xF7 */
+	0x5EBF,0x5EC0,0x5EC1,0x5EC2,0x5EC3,0x5EC4,0x5EC5,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8F[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5EC6,0x5EC7,0x5EC8,0x5ECB,0x5ECC,0x5ECD,0x5ECE,0x5ECF,/* 0x40-0x47 */
+	0x5ED0,0x5ED4,0x5ED5,0x5ED7,0x5ED8,0x5ED9,0x5EDA,0x5EDC,/* 0x48-0x4F */
+	0x5EDD,0x5EDE,0x5EDF,0x5EE0,0x5EE1,0x5EE2,0x5EE3,0x5EE4,/* 0x50-0x57 */
+	0x5EE5,0x5EE6,0x5EE7,0x5EE9,0x5EEB,0x5EEC,0x5EED,0x5EEE,/* 0x58-0x5F */
+	0x5EEF,0x5EF0,0x5EF1,0x5EF2,0x5EF3,0x5EF5,0x5EF8,0x5EF9,/* 0x60-0x67 */
+	0x5EFB,0x5EFC,0x5EFD,0x5F05,0x5F06,0x5F07,0x5F09,0x5F0C,/* 0x68-0x6F */
+	0x5F0D,0x5F0E,0x5F10,0x5F12,0x5F14,0x5F16,0x5F19,0x5F1A,/* 0x70-0x77 */
+	0x5F1C,0x5F1D,0x5F1E,0x5F21,0x5F22,0x5F23,0x5F24,0x0000,/* 0x78-0x7F */
+
+	0x5F28,0x5F2B,0x5F2C,0x5F2E,0x5F30,0x5F32,0x5F33,0x5F34,/* 0x80-0x87 */
+	0x5F35,0x5F36,0x5F37,0x5F38,0x5F3B,0x5F3D,0x5F3E,0x5F3F,/* 0x88-0x8F */
+	0x5F41,0x5F42,0x5F43,0x5F44,0x5F45,0x5F46,0x5F47,0x5F48,/* 0x90-0x97 */
+	0x5F49,0x5F4A,0x5F4B,0x5F4C,0x5F4D,0x5F4E,0x5F4F,0x5F51,/* 0x98-0x9F */
+	0x5F54,0x5F59,0x5F5A,0x5F5B,0x5F5C,0x5F5E,0x5F5F,0x5F60,/* 0xA0-0xA7 */
+	0x5F63,0x5F65,0x5F67,0x5F68,0x5F6B,0x5F6E,0x5F6F,0x5F72,/* 0xA8-0xAF */
+	0x5F74,0x5F75,0x5F76,0x5F78,0x5F7A,0x5F7D,0x5F7E,0x5F7F,/* 0xB0-0xB7 */
+	0x5F83,0x5F86,0x5F8D,0x5F8E,0x5F8F,0x5F91,0x5F93,0x5F94,/* 0xB8-0xBF */
+	0x5F96,0x5F9A,0x5F9B,0x5F9D,0x5F9E,0x5F9F,0x5FA0,0x5FA2,/* 0xC0-0xC7 */
+	0x5FA3,0x5FA4,0x5FA5,0x5FA6,0x5FA7,0x5FA9,0x5FAB,0x5FAC,/* 0xC8-0xCF */
+	0x5FAF,0x5FB0,0x5FB1,0x5FB2,0x5FB3,0x5FB4,0x5FB6,0x5FB8,/* 0xD0-0xD7 */
+	0x5FB9,0x5FBA,0x5FBB,0x5FBE,0x5FBF,0x5FC0,0x5FC1,0x5FC2,/* 0xD8-0xDF */
+	0x5FC7,0x5FC8,0x5FCA,0x5FCB,0x5FCE,0x5FD3,0x5FD4,0x5FD5,/* 0xE0-0xE7 */
+	0x5FDA,0x5FDB,0x5FDC,0x5FDE,0x5FDF,0x5FE2,0x5FE3,0x5FE5,/* 0xE8-0xEF */
+	0x5FE6,0x5FE8,0x5FE9,0x5FEC,0x5FEF,0x5FF0,0x5FF2,0x5FF3,/* 0xF0-0xF7 */
+	0x5FF4,0x5FF6,0x5FF7,0x5FF9,0x5FFA,0x5FFC,0x6007,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_90[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6008,0x6009,0x600B,0x600C,0x6010,0x6011,0x6013,0x6017,/* 0x40-0x47 */
+	0x6018,0x601A,0x601E,0x601F,0x6022,0x6023,0x6024,0x602C,/* 0x48-0x4F */
+	0x602D,0x602E,0x6030,0x6031,0x6032,0x6033,0x6034,0x6036,/* 0x50-0x57 */
+	0x6037,0x6038,0x6039,0x603A,0x603D,0x603E,0x6040,0x6044,/* 0x58-0x5F */
+	0x6045,0x6046,0x6047,0x6048,0x6049,0x604A,0x604C,0x604E,/* 0x60-0x67 */
+	0x604F,0x6051,0x6053,0x6054,0x6056,0x6057,0x6058,0x605B,/* 0x68-0x6F */
+	0x605C,0x605E,0x605F,0x6060,0x6061,0x6065,0x6066,0x606E,/* 0x70-0x77 */
+	0x6071,0x6072,0x6074,0x6075,0x6077,0x607E,0x6080,0x0000,/* 0x78-0x7F */
+
+	0x6081,0x6082,0x6085,0x6086,0x6087,0x6088,0x608A,0x608B,/* 0x80-0x87 */
+	0x608E,0x608F,0x6090,0x6091,0x6093,0x6095,0x6097,0x6098,/* 0x88-0x8F */
+	0x6099,0x609C,0x609E,0x60A1,0x60A2,0x60A4,0x60A5,0x60A7,/* 0x90-0x97 */
+	0x60A9,0x60AA,0x60AE,0x60B0,0x60B3,0x60B5,0x60B6,0x60B7,/* 0x98-0x9F */
+	0x60B9,0x60BA,0x60BD,0x60BE,0x60BF,0x60C0,0x60C1,0x60C2,/* 0xA0-0xA7 */
+	0x60C3,0x60C4,0x60C7,0x60C8,0x60C9,0x60CC,0x60CD,0x60CE,/* 0xA8-0xAF */
+	0x60CF,0x60D0,0x60D2,0x60D3,0x60D4,0x60D6,0x60D7,0x60D9,/* 0xB0-0xB7 */
+	0x60DB,0x60DE,0x60E1,0x60E2,0x60E3,0x60E4,0x60E5,0x60EA,/* 0xB8-0xBF */
+	0x60F1,0x60F2,0x60F5,0x60F7,0x60F8,0x60FB,0x60FC,0x60FD,/* 0xC0-0xC7 */
+	0x60FE,0x60FF,0x6102,0x6103,0x6104,0x6105,0x6107,0x610A,/* 0xC8-0xCF */
+	0x610B,0x610C,0x6110,0x6111,0x6112,0x6113,0x6114,0x6116,/* 0xD0-0xD7 */
+	0x6117,0x6118,0x6119,0x611B,0x611C,0x611D,0x611E,0x6121,/* 0xD8-0xDF */
+	0x6122,0x6125,0x6128,0x6129,0x612A,0x612C,0x612D,0x612E,/* 0xE0-0xE7 */
+	0x612F,0x6130,0x6131,0x6132,0x6133,0x6134,0x6135,0x6136,/* 0xE8-0xEF */
+	0x6137,0x6138,0x6139,0x613A,0x613B,0x613C,0x613D,0x613E,/* 0xF0-0xF7 */
+	0x6140,0x6141,0x6142,0x6143,0x6144,0x6145,0x6146,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_91[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6147,0x6149,0x614B,0x614D,0x614F,0x6150,0x6152,0x6153,/* 0x40-0x47 */
+	0x6154,0x6156,0x6157,0x6158,0x6159,0x615A,0x615B,0x615C,/* 0x48-0x4F */
+	0x615E,0x615F,0x6160,0x6161,0x6163,0x6164,0x6165,0x6166,/* 0x50-0x57 */
+	0x6169,0x616A,0x616B,0x616C,0x616D,0x616E,0x616F,0x6171,/* 0x58-0x5F */
+	0x6172,0x6173,0x6174,0x6176,0x6178,0x6179,0x617A,0x617B,/* 0x60-0x67 */
+	0x617C,0x617D,0x617E,0x617F,0x6180,0x6181,0x6182,0x6183,/* 0x68-0x6F */
+	0x6184,0x6185,0x6186,0x6187,0x6188,0x6189,0x618A,0x618C,/* 0x70-0x77 */
+	0x618D,0x618F,0x6190,0x6191,0x6192,0x6193,0x6195,0x0000,/* 0x78-0x7F */
+
+	0x6196,0x6197,0x6198,0x6199,0x619A,0x619B,0x619C,0x619E,/* 0x80-0x87 */
+	0x619F,0x61A0,0x61A1,0x61A2,0x61A3,0x61A4,0x61A5,0x61A6,/* 0x88-0x8F */
+	0x61AA,0x61AB,0x61AD,0x61AE,0x61AF,0x61B0,0x61B1,0x61B2,/* 0x90-0x97 */
+	0x61B3,0x61B4,0x61B5,0x61B6,0x61B8,0x61B9,0x61BA,0x61BB,/* 0x98-0x9F */
+	0x61BC,0x61BD,0x61BF,0x61C0,0x61C1,0x61C3,0x61C4,0x61C5,/* 0xA0-0xA7 */
+	0x61C6,0x61C7,0x61C9,0x61CC,0x61CD,0x61CE,0x61CF,0x61D0,/* 0xA8-0xAF */
+	0x61D3,0x61D5,0x61D6,0x61D7,0x61D8,0x61D9,0x61DA,0x61DB,/* 0xB0-0xB7 */
+	0x61DC,0x61DD,0x61DE,0x61DF,0x61E0,0x61E1,0x61E2,0x61E3,/* 0xB8-0xBF */
+	0x61E4,0x61E5,0x61E7,0x61E8,0x61E9,0x61EA,0x61EB,0x61EC,/* 0xC0-0xC7 */
+	0x61ED,0x61EE,0x61EF,0x61F0,0x61F1,0x61F2,0x61F3,0x61F4,/* 0xC8-0xCF */
+	0x61F6,0x61F7,0x61F8,0x61F9,0x61FA,0x61FB,0x61FC,0x61FD,/* 0xD0-0xD7 */
+	0x61FE,0x6200,0x6201,0x6202,0x6203,0x6204,0x6205,0x6207,/* 0xD8-0xDF */
+	0x6209,0x6213,0x6214,0x6219,0x621C,0x621D,0x621E,0x6220,/* 0xE0-0xE7 */
+	0x6223,0x6226,0x6227,0x6228,0x6229,0x622B,0x622D,0x622F,/* 0xE8-0xEF */
+	0x6230,0x6231,0x6232,0x6235,0x6236,0x6238,0x6239,0x623A,/* 0xF0-0xF7 */
+	0x623B,0x623C,0x6242,0x6244,0x6245,0x6246,0x624A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_92[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x624F,0x6250,0x6255,0x6256,0x6257,0x6259,0x625A,0x625C,/* 0x40-0x47 */
+	0x625D,0x625E,0x625F,0x6260,0x6261,0x6262,0x6264,0x6265,/* 0x48-0x4F */
+	0x6268,0x6271,0x6272,0x6274,0x6275,0x6277,0x6278,0x627A,/* 0x50-0x57 */
+	0x627B,0x627D,0x6281,0x6282,0x6283,0x6285,0x6286,0x6287,/* 0x58-0x5F */
+	0x6288,0x628B,0x628C,0x628D,0x628E,0x628F,0x6290,0x6294,/* 0x60-0x67 */
+	0x6299,0x629C,0x629D,0x629E,0x62A3,0x62A6,0x62A7,0x62A9,/* 0x68-0x6F */
+	0x62AA,0x62AD,0x62AE,0x62AF,0x62B0,0x62B2,0x62B3,0x62B4,/* 0x70-0x77 */
+	0x62B6,0x62B7,0x62B8,0x62BA,0x62BE,0x62C0,0x62C1,0x0000,/* 0x78-0x7F */
+
+	0x62C3,0x62CB,0x62CF,0x62D1,0x62D5,0x62DD,0x62DE,0x62E0,/* 0x80-0x87 */
+	0x62E1,0x62E4,0x62EA,0x62EB,0x62F0,0x62F2,0x62F5,0x62F8,/* 0x88-0x8F */
+	0x62F9,0x62FA,0x62FB,0x6300,0x6303,0x6304,0x6305,0x6306,/* 0x90-0x97 */
+	0x630A,0x630B,0x630C,0x630D,0x630F,0x6310,0x6312,0x6313,/* 0x98-0x9F */
+	0x6314,0x6315,0x6317,0x6318,0x6319,0x631C,0x6326,0x6327,/* 0xA0-0xA7 */
+	0x6329,0x632C,0x632D,0x632E,0x6330,0x6331,0x6333,0x6334,/* 0xA8-0xAF */
+	0x6335,0x6336,0x6337,0x6338,0x633B,0x633C,0x633E,0x633F,/* 0xB0-0xB7 */
+	0x6340,0x6341,0x6344,0x6347,0x6348,0x634A,0x6351,0x6352,/* 0xB8-0xBF */
+	0x6353,0x6354,0x6356,0x6357,0x6358,0x6359,0x635A,0x635B,/* 0xC0-0xC7 */
+	0x635C,0x635D,0x6360,0x6364,0x6365,0x6366,0x6368,0x636A,/* 0xC8-0xCF */
+	0x636B,0x636C,0x636F,0x6370,0x6372,0x6373,0x6374,0x6375,/* 0xD0-0xD7 */
+	0x6378,0x6379,0x637C,0x637D,0x637E,0x637F,0x6381,0x6383,/* 0xD8-0xDF */
+	0x6384,0x6385,0x6386,0x638B,0x638D,0x6391,0x6393,0x6394,/* 0xE0-0xE7 */
+	0x6395,0x6397,0x6399,0x639A,0x639B,0x639C,0x639D,0x639E,/* 0xE8-0xEF */
+	0x639F,0x63A1,0x63A4,0x63A6,0x63AB,0x63AF,0x63B1,0x63B2,/* 0xF0-0xF7 */
+	0x63B5,0x63B6,0x63B9,0x63BB,0x63BD,0x63BF,0x63C0,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_93[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x63C1,0x63C2,0x63C3,0x63C5,0x63C7,0x63C8,0x63CA,0x63CB,/* 0x40-0x47 */
+	0x63CC,0x63D1,0x63D3,0x63D4,0x63D5,0x63D7,0x63D8,0x63D9,/* 0x48-0x4F */
+	0x63DA,0x63DB,0x63DC,0x63DD,0x63DF,0x63E2,0x63E4,0x63E5,/* 0x50-0x57 */
+	0x63E6,0x63E7,0x63E8,0x63EB,0x63EC,0x63EE,0x63EF,0x63F0,/* 0x58-0x5F */
+	0x63F1,0x63F3,0x63F5,0x63F7,0x63F9,0x63FA,0x63FB,0x63FC,/* 0x60-0x67 */
+	0x63FE,0x6403,0x6404,0x6406,0x6407,0x6408,0x6409,0x640A,/* 0x68-0x6F */
+	0x640D,0x640E,0x6411,0x6412,0x6415,0x6416,0x6417,0x6418,/* 0x70-0x77 */
+	0x6419,0x641A,0x641D,0x641F,0x6422,0x6423,0x6424,0x0000,/* 0x78-0x7F */
+
+	0x6425,0x6427,0x6428,0x6429,0x642B,0x642E,0x642F,0x6430,/* 0x80-0x87 */
+	0x6431,0x6432,0x6433,0x6435,0x6436,0x6437,0x6438,0x6439,/* 0x88-0x8F */
+	0x643B,0x643C,0x643E,0x6440,0x6442,0x6443,0x6449,0x644B,/* 0x90-0x97 */
+	0x644C,0x644D,0x644E,0x644F,0x6450,0x6451,0x6453,0x6455,/* 0x98-0x9F */
+	0x6456,0x6457,0x6459,0x645A,0x645B,0x645C,0x645D,0x645F,/* 0xA0-0xA7 */
+	0x6460,0x6461,0x6462,0x6463,0x6464,0x6465,0x6466,0x6468,/* 0xA8-0xAF */
+	0x646A,0x646B,0x646C,0x646E,0x646F,0x6470,0x6471,0x6472,/* 0xB0-0xB7 */
+	0x6473,0x6474,0x6475,0x6476,0x6477,0x647B,0x647C,0x647D,/* 0xB8-0xBF */
+	0x647E,0x647F,0x6480,0x6481,0x6483,0x6486,0x6488,0x6489,/* 0xC0-0xC7 */
+	0x648A,0x648B,0x648C,0x648D,0x648E,0x648F,0x6490,0x6493,/* 0xC8-0xCF */
+	0x6494,0x6497,0x6498,0x649A,0x649B,0x649C,0x649D,0x649F,/* 0xD0-0xD7 */
+	0x64A0,0x64A1,0x64A2,0x64A3,0x64A5,0x64A6,0x64A7,0x64A8,/* 0xD8-0xDF */
+	0x64AA,0x64AB,0x64AF,0x64B1,0x64B2,0x64B3,0x64B4,0x64B6,/* 0xE0-0xE7 */
+	0x64B9,0x64BB,0x64BD,0x64BE,0x64BF,0x64C1,0x64C3,0x64C4,/* 0xE8-0xEF */
+	0x64C6,0x64C7,0x64C8,0x64C9,0x64CA,0x64CB,0x64CC,0x64CF,/* 0xF0-0xF7 */
+	0x64D1,0x64D3,0x64D4,0x64D5,0x64D6,0x64D9,0x64DA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_94[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x64DB,0x64DC,0x64DD,0x64DF,0x64E0,0x64E1,0x64E3,0x64E5,/* 0x40-0x47 */
+	0x64E7,0x64E8,0x64E9,0x64EA,0x64EB,0x64EC,0x64ED,0x64EE,/* 0x48-0x4F */
+	0x64EF,0x64F0,0x64F1,0x64F2,0x64F3,0x64F4,0x64F5,0x64F6,/* 0x50-0x57 */
+	0x64F7,0x64F8,0x64F9,0x64FA,0x64FB,0x64FC,0x64FD,0x64FE,/* 0x58-0x5F */
+	0x64FF,0x6501,0x6502,0x6503,0x6504,0x6505,0x6506,0x6507,/* 0x60-0x67 */
+	0x6508,0x650A,0x650B,0x650C,0x650D,0x650E,0x650F,0x6510,/* 0x68-0x6F */
+	0x6511,0x6513,0x6514,0x6515,0x6516,0x6517,0x6519,0x651A,/* 0x70-0x77 */
+	0x651B,0x651C,0x651D,0x651E,0x651F,0x6520,0x6521,0x0000,/* 0x78-0x7F */
+
+	0x6522,0x6523,0x6524,0x6526,0x6527,0x6528,0x6529,0x652A,/* 0x80-0x87 */
+	0x652C,0x652D,0x6530,0x6531,0x6532,0x6533,0x6537,0x653A,/* 0x88-0x8F */
+	0x653C,0x653D,0x6540,0x6541,0x6542,0x6543,0x6544,0x6546,/* 0x90-0x97 */
+	0x6547,0x654A,0x654B,0x654D,0x654E,0x6550,0x6552,0x6553,/* 0x98-0x9F */
+	0x6554,0x6557,0x6558,0x655A,0x655C,0x655F,0x6560,0x6561,/* 0xA0-0xA7 */
+	0x6564,0x6565,0x6567,0x6568,0x6569,0x656A,0x656D,0x656E,/* 0xA8-0xAF */
+	0x656F,0x6571,0x6573,0x6575,0x6576,0x6578,0x6579,0x657A,/* 0xB0-0xB7 */
+	0x657B,0x657C,0x657D,0x657E,0x657F,0x6580,0x6581,0x6582,/* 0xB8-0xBF */
+	0x6583,0x6584,0x6585,0x6586,0x6588,0x6589,0x658A,0x658D,/* 0xC0-0xC7 */
+	0x658E,0x658F,0x6592,0x6594,0x6595,0x6596,0x6598,0x659A,/* 0xC8-0xCF */
+	0x659D,0x659E,0x65A0,0x65A2,0x65A3,0x65A6,0x65A8,0x65AA,/* 0xD0-0xD7 */
+	0x65AC,0x65AE,0x65B1,0x65B2,0x65B3,0x65B4,0x65B5,0x65B6,/* 0xD8-0xDF */
+	0x65B7,0x65B8,0x65BA,0x65BB,0x65BE,0x65BF,0x65C0,0x65C2,/* 0xE0-0xE7 */
+	0x65C7,0x65C8,0x65C9,0x65CA,0x65CD,0x65D0,0x65D1,0x65D3,/* 0xE8-0xEF */
+	0x65D4,0x65D5,0x65D8,0x65D9,0x65DA,0x65DB,0x65DC,0x65DD,/* 0xF0-0xF7 */
+	0x65DE,0x65DF,0x65E1,0x65E3,0x65E4,0x65EA,0x65EB,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_95[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x65F2,0x65F3,0x65F4,0x65F5,0x65F8,0x65F9,0x65FB,0x65FC,/* 0x40-0x47 */
+	0x65FD,0x65FE,0x65FF,0x6601,0x6604,0x6605,0x6607,0x6608,/* 0x48-0x4F */
+	0x6609,0x660B,0x660D,0x6610,0x6611,0x6612,0x6616,0x6617,/* 0x50-0x57 */
+	0x6618,0x661A,0x661B,0x661C,0x661E,0x6621,0x6622,0x6623,/* 0x58-0x5F */
+	0x6624,0x6626,0x6629,0x662A,0x662B,0x662C,0x662E,0x6630,/* 0x60-0x67 */
+	0x6632,0x6633,0x6637,0x6638,0x6639,0x663A,0x663B,0x663D,/* 0x68-0x6F */
+	0x663F,0x6640,0x6642,0x6644,0x6645,0x6646,0x6647,0x6648,/* 0x70-0x77 */
+	0x6649,0x664A,0x664D,0x664E,0x6650,0x6651,0x6658,0x0000,/* 0x78-0x7F */
+
+	0x6659,0x665B,0x665C,0x665D,0x665E,0x6660,0x6662,0x6663,/* 0x80-0x87 */
+	0x6665,0x6667,0x6669,0x666A,0x666B,0x666C,0x666D,0x6671,/* 0x88-0x8F */
+	0x6672,0x6673,0x6675,0x6678,0x6679,0x667B,0x667C,0x667D,/* 0x90-0x97 */
+	0x667F,0x6680,0x6681,0x6683,0x6685,0x6686,0x6688,0x6689,/* 0x98-0x9F */
+	0x668A,0x668B,0x668D,0x668E,0x668F,0x6690,0x6692,0x6693,/* 0xA0-0xA7 */
+	0x6694,0x6695,0x6698,0x6699,0x669A,0x669B,0x669C,0x669E,/* 0xA8-0xAF */
+	0x669F,0x66A0,0x66A1,0x66A2,0x66A3,0x66A4,0x66A5,0x66A6,/* 0xB0-0xB7 */
+	0x66A9,0x66AA,0x66AB,0x66AC,0x66AD,0x66AF,0x66B0,0x66B1,/* 0xB8-0xBF */
+	0x66B2,0x66B3,0x66B5,0x66B6,0x66B7,0x66B8,0x66BA,0x66BB,/* 0xC0-0xC7 */
+	0x66BC,0x66BD,0x66BF,0x66C0,0x66C1,0x66C2,0x66C3,0x66C4,/* 0xC8-0xCF */
+	0x66C5,0x66C6,0x66C7,0x66C8,0x66C9,0x66CA,0x66CB,0x66CC,/* 0xD0-0xD7 */
+	0x66CD,0x66CE,0x66CF,0x66D0,0x66D1,0x66D2,0x66D3,0x66D4,/* 0xD8-0xDF */
+	0x66D5,0x66D6,0x66D7,0x66D8,0x66DA,0x66DE,0x66DF,0x66E0,/* 0xE0-0xE7 */
+	0x66E1,0x66E2,0x66E3,0x66E4,0x66E5,0x66E7,0x66E8,0x66EA,/* 0xE8-0xEF */
+	0x66EB,0x66EC,0x66ED,0x66EE,0x66EF,0x66F1,0x66F5,0x66F6,/* 0xF0-0xF7 */
+	0x66F8,0x66FA,0x66FB,0x66FD,0x6701,0x6702,0x6703,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_96[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6704,0x6705,0x6706,0x6707,0x670C,0x670E,0x670F,0x6711,/* 0x40-0x47 */
+	0x6712,0x6713,0x6716,0x6718,0x6719,0x671A,0x671C,0x671E,/* 0x48-0x4F */
+	0x6720,0x6721,0x6722,0x6723,0x6724,0x6725,0x6727,0x6729,/* 0x50-0x57 */
+	0x672E,0x6730,0x6732,0x6733,0x6736,0x6737,0x6738,0x6739,/* 0x58-0x5F */
+	0x673B,0x673C,0x673E,0x673F,0x6741,0x6744,0x6745,0x6747,/* 0x60-0x67 */
+	0x674A,0x674B,0x674D,0x6752,0x6754,0x6755,0x6757,0x6758,/* 0x68-0x6F */
+	0x6759,0x675A,0x675B,0x675D,0x6762,0x6763,0x6764,0x6766,/* 0x70-0x77 */
+	0x6767,0x676B,0x676C,0x676E,0x6771,0x6774,0x6776,0x0000,/* 0x78-0x7F */
+
+	0x6778,0x6779,0x677A,0x677B,0x677D,0x6780,0x6782,0x6783,/* 0x80-0x87 */
+	0x6785,0x6786,0x6788,0x678A,0x678C,0x678D,0x678E,0x678F,/* 0x88-0x8F */
+	0x6791,0x6792,0x6793,0x6794,0x6796,0x6799,0x679B,0x679F,/* 0x90-0x97 */
+	0x67A0,0x67A1,0x67A4,0x67A6,0x67A9,0x67AC,0x67AE,0x67B1,/* 0x98-0x9F */
+	0x67B2,0x67B4,0x67B9,0x67BA,0x67BB,0x67BC,0x67BD,0x67BE,/* 0xA0-0xA7 */
+	0x67BF,0x67C0,0x67C2,0x67C5,0x67C6,0x67C7,0x67C8,0x67C9,/* 0xA8-0xAF */
+	0x67CA,0x67CB,0x67CC,0x67CD,0x67CE,0x67D5,0x67D6,0x67D7,/* 0xB0-0xB7 */
+	0x67DB,0x67DF,0x67E1,0x67E3,0x67E4,0x67E6,0x67E7,0x67E8,/* 0xB8-0xBF */
+	0x67EA,0x67EB,0x67ED,0x67EE,0x67F2,0x67F5,0x67F6,0x67F7,/* 0xC0-0xC7 */
+	0x67F8,0x67F9,0x67FA,0x67FB,0x67FC,0x67FE,0x6801,0x6802,/* 0xC8-0xCF */
+	0x6803,0x6804,0x6806,0x680D,0x6810,0x6812,0x6814,0x6815,/* 0xD0-0xD7 */
+	0x6818,0x6819,0x681A,0x681B,0x681C,0x681E,0x681F,0x6820,/* 0xD8-0xDF */
+	0x6822,0x6823,0x6824,0x6825,0x6826,0x6827,0x6828,0x682B,/* 0xE0-0xE7 */
+	0x682C,0x682D,0x682E,0x682F,0x6830,0x6831,0x6834,0x6835,/* 0xE8-0xEF */
+	0x6836,0x683A,0x683B,0x683F,0x6847,0x684B,0x684D,0x684F,/* 0xF0-0xF7 */
+	0x6852,0x6856,0x6857,0x6858,0x6859,0x685A,0x685B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_97[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x685C,0x685D,0x685E,0x685F,0x686A,0x686C,0x686D,0x686E,/* 0x40-0x47 */
+	0x686F,0x6870,0x6871,0x6872,0x6873,0x6875,0x6878,0x6879,/* 0x48-0x4F */
+	0x687A,0x687B,0x687C,0x687D,0x687E,0x687F,0x6880,0x6882,/* 0x50-0x57 */
+	0x6884,0x6887,0x6888,0x6889,0x688A,0x688B,0x688C,0x688D,/* 0x58-0x5F */
+	0x688E,0x6890,0x6891,0x6892,0x6894,0x6895,0x6896,0x6898,/* 0x60-0x67 */
+	0x6899,0x689A,0x689B,0x689C,0x689D,0x689E,0x689F,0x68A0,/* 0x68-0x6F */
+	0x68A1,0x68A3,0x68A4,0x68A5,0x68A9,0x68AA,0x68AB,0x68AC,/* 0x70-0x77 */
+	0x68AE,0x68B1,0x68B2,0x68B4,0x68B6,0x68B7,0x68B8,0x0000,/* 0x78-0x7F */
+
+	0x68B9,0x68BA,0x68BB,0x68BC,0x68BD,0x68BE,0x68BF,0x68C1,/* 0x80-0x87 */
+	0x68C3,0x68C4,0x68C5,0x68C6,0x68C7,0x68C8,0x68CA,0x68CC,/* 0x88-0x8F */
+	0x68CE,0x68CF,0x68D0,0x68D1,0x68D3,0x68D4,0x68D6,0x68D7,/* 0x90-0x97 */
+	0x68D9,0x68DB,0x68DC,0x68DD,0x68DE,0x68DF,0x68E1,0x68E2,/* 0x98-0x9F */
+	0x68E4,0x68E5,0x68E6,0x68E7,0x68E8,0x68E9,0x68EA,0x68EB,/* 0xA0-0xA7 */
+	0x68EC,0x68ED,0x68EF,0x68F2,0x68F3,0x68F4,0x68F6,0x68F7,/* 0xA8-0xAF */
+	0x68F8,0x68FB,0x68FD,0x68FE,0x68FF,0x6900,0x6902,0x6903,/* 0xB0-0xB7 */
+	0x6904,0x6906,0x6907,0x6908,0x6909,0x690A,0x690C,0x690F,/* 0xB8-0xBF */
+	0x6911,0x6913,0x6914,0x6915,0x6916,0x6917,0x6918,0x6919,/* 0xC0-0xC7 */
+	0x691A,0x691B,0x691C,0x691D,0x691E,0x6921,0x6922,0x6923,/* 0xC8-0xCF */
+	0x6925,0x6926,0x6927,0x6928,0x6929,0x692A,0x692B,0x692C,/* 0xD0-0xD7 */
+	0x692E,0x692F,0x6931,0x6932,0x6933,0x6935,0x6936,0x6937,/* 0xD8-0xDF */
+	0x6938,0x693A,0x693B,0x693C,0x693E,0x6940,0x6941,0x6943,/* 0xE0-0xE7 */
+	0x6944,0x6945,0x6946,0x6947,0x6948,0x6949,0x694A,0x694B,/* 0xE8-0xEF */
+	0x694C,0x694D,0x694E,0x694F,0x6950,0x6951,0x6952,0x6953,/* 0xF0-0xF7 */
+	0x6955,0x6956,0x6958,0x6959,0x695B,0x695C,0x695F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_98[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6961,0x6962,0x6964,0x6965,0x6967,0x6968,0x6969,0x696A,/* 0x40-0x47 */
+	0x696C,0x696D,0x696F,0x6970,0x6972,0x6973,0x6974,0x6975,/* 0x48-0x4F */
+	0x6976,0x697A,0x697B,0x697D,0x697E,0x697F,0x6981,0x6983,/* 0x50-0x57 */
+	0x6985,0x698A,0x698B,0x698C,0x698E,0x698F,0x6990,0x6991,/* 0x58-0x5F */
+	0x6992,0x6993,0x6996,0x6997,0x6999,0x699A,0x699D,0x699E,/* 0x60-0x67 */
+	0x699F,0x69A0,0x69A1,0x69A2,0x69A3,0x69A4,0x69A5,0x69A6,/* 0x68-0x6F */
+	0x69A9,0x69AA,0x69AC,0x69AE,0x69AF,0x69B0,0x69B2,0x69B3,/* 0x70-0x77 */
+	0x69B5,0x69B6,0x69B8,0x69B9,0x69BA,0x69BC,0x69BD,0x0000,/* 0x78-0x7F */
+
+	0x69BE,0x69BF,0x69C0,0x69C2,0x69C3,0x69C4,0x69C5,0x69C6,/* 0x80-0x87 */
+	0x69C7,0x69C8,0x69C9,0x69CB,0x69CD,0x69CF,0x69D1,0x69D2,/* 0x88-0x8F */
+	0x69D3,0x69D5,0x69D6,0x69D7,0x69D8,0x69D9,0x69DA,0x69DC,/* 0x90-0x97 */
+	0x69DD,0x69DE,0x69E1,0x69E2,0x69E3,0x69E4,0x69E5,0x69E6,/* 0x98-0x9F */
+	0x69E7,0x69E8,0x69E9,0x69EA,0x69EB,0x69EC,0x69EE,0x69EF,/* 0xA0-0xA7 */
+	0x69F0,0x69F1,0x69F3,0x69F4,0x69F5,0x69F6,0x69F7,0x69F8,/* 0xA8-0xAF */
+	0x69F9,0x69FA,0x69FB,0x69FC,0x69FE,0x6A00,0x6A01,0x6A02,/* 0xB0-0xB7 */
+	0x6A03,0x6A04,0x6A05,0x6A06,0x6A07,0x6A08,0x6A09,0x6A0B,/* 0xB8-0xBF */
+	0x6A0C,0x6A0D,0x6A0E,0x6A0F,0x6A10,0x6A11,0x6A12,0x6A13,/* 0xC0-0xC7 */
+	0x6A14,0x6A15,0x6A16,0x6A19,0x6A1A,0x6A1B,0x6A1C,0x6A1D,/* 0xC8-0xCF */
+	0x6A1E,0x6A20,0x6A22,0x6A23,0x6A24,0x6A25,0x6A26,0x6A27,/* 0xD0-0xD7 */
+	0x6A29,0x6A2B,0x6A2C,0x6A2D,0x6A2E,0x6A30,0x6A32,0x6A33,/* 0xD8-0xDF */
+	0x6A34,0x6A36,0x6A37,0x6A38,0x6A39,0x6A3A,0x6A3B,0x6A3C,/* 0xE0-0xE7 */
+	0x6A3F,0x6A40,0x6A41,0x6A42,0x6A43,0x6A45,0x6A46,0x6A48,/* 0xE8-0xEF */
+	0x6A49,0x6A4A,0x6A4B,0x6A4C,0x6A4D,0x6A4E,0x6A4F,0x6A51,/* 0xF0-0xF7 */
+	0x6A52,0x6A53,0x6A54,0x6A55,0x6A56,0x6A57,0x6A5A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_99[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6A5C,0x6A5D,0x6A5E,0x6A5F,0x6A60,0x6A62,0x6A63,0x6A64,/* 0x40-0x47 */
+	0x6A66,0x6A67,0x6A68,0x6A69,0x6A6A,0x6A6B,0x6A6C,0x6A6D,/* 0x48-0x4F */
+	0x6A6E,0x6A6F,0x6A70,0x6A72,0x6A73,0x6A74,0x6A75,0x6A76,/* 0x50-0x57 */
+	0x6A77,0x6A78,0x6A7A,0x6A7B,0x6A7D,0x6A7E,0x6A7F,0x6A81,/* 0x58-0x5F */
+	0x6A82,0x6A83,0x6A85,0x6A86,0x6A87,0x6A88,0x6A89,0x6A8A,/* 0x60-0x67 */
+	0x6A8B,0x6A8C,0x6A8D,0x6A8F,0x6A92,0x6A93,0x6A94,0x6A95,/* 0x68-0x6F */
+	0x6A96,0x6A98,0x6A99,0x6A9A,0x6A9B,0x6A9C,0x6A9D,0x6A9E,/* 0x70-0x77 */
+	0x6A9F,0x6AA1,0x6AA2,0x6AA3,0x6AA4,0x6AA5,0x6AA6,0x0000,/* 0x78-0x7F */
+
+	0x6AA7,0x6AA8,0x6AAA,0x6AAD,0x6AAE,0x6AAF,0x6AB0,0x6AB1,/* 0x80-0x87 */
+	0x6AB2,0x6AB3,0x6AB4,0x6AB5,0x6AB6,0x6AB7,0x6AB8,0x6AB9,/* 0x88-0x8F */
+	0x6ABA,0x6ABB,0x6ABC,0x6ABD,0x6ABE,0x6ABF,0x6AC0,0x6AC1,/* 0x90-0x97 */
+	0x6AC2,0x6AC3,0x6AC4,0x6AC5,0x6AC6,0x6AC7,0x6AC8,0x6AC9,/* 0x98-0x9F */
+	0x6ACA,0x6ACB,0x6ACC,0x6ACD,0x6ACE,0x6ACF,0x6AD0,0x6AD1,/* 0xA0-0xA7 */
+	0x6AD2,0x6AD3,0x6AD4,0x6AD5,0x6AD6,0x6AD7,0x6AD8,0x6AD9,/* 0xA8-0xAF */
+	0x6ADA,0x6ADB,0x6ADC,0x6ADD,0x6ADE,0x6ADF,0x6AE0,0x6AE1,/* 0xB0-0xB7 */
+	0x6AE2,0x6AE3,0x6AE4,0x6AE5,0x6AE6,0x6AE7,0x6AE8,0x6AE9,/* 0xB8-0xBF */
+	0x6AEA,0x6AEB,0x6AEC,0x6AED,0x6AEE,0x6AEF,0x6AF0,0x6AF1,/* 0xC0-0xC7 */
+	0x6AF2,0x6AF3,0x6AF4,0x6AF5,0x6AF6,0x6AF7,0x6AF8,0x6AF9,/* 0xC8-0xCF */
+	0x6AFA,0x6AFB,0x6AFC,0x6AFD,0x6AFE,0x6AFF,0x6B00,0x6B01,/* 0xD0-0xD7 */
+	0x6B02,0x6B03,0x6B04,0x6B05,0x6B06,0x6B07,0x6B08,0x6B09,/* 0xD8-0xDF */
+	0x6B0A,0x6B0B,0x6B0C,0x6B0D,0x6B0E,0x6B0F,0x6B10,0x6B11,/* 0xE0-0xE7 */
+	0x6B12,0x6B13,0x6B14,0x6B15,0x6B16,0x6B17,0x6B18,0x6B19,/* 0xE8-0xEF */
+	0x6B1A,0x6B1B,0x6B1C,0x6B1D,0x6B1E,0x6B1F,0x6B25,0x6B26,/* 0xF0-0xF7 */
+	0x6B28,0x6B29,0x6B2A,0x6B2B,0x6B2C,0x6B2D,0x6B2E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9A[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6B2F,0x6B30,0x6B31,0x6B33,0x6B34,0x6B35,0x6B36,0x6B38,/* 0x40-0x47 */
+	0x6B3B,0x6B3C,0x6B3D,0x6B3F,0x6B40,0x6B41,0x6B42,0x6B44,/* 0x48-0x4F */
+	0x6B45,0x6B48,0x6B4A,0x6B4B,0x6B4D,0x6B4E,0x6B4F,0x6B50,/* 0x50-0x57 */
+	0x6B51,0x6B52,0x6B53,0x6B54,0x6B55,0x6B56,0x6B57,0x6B58,/* 0x58-0x5F */
+	0x6B5A,0x6B5B,0x6B5C,0x6B5D,0x6B5E,0x6B5F,0x6B60,0x6B61,/* 0x60-0x67 */
+	0x6B68,0x6B69,0x6B6B,0x6B6C,0x6B6D,0x6B6E,0x6B6F,0x6B70,/* 0x68-0x6F */
+	0x6B71,0x6B72,0x6B73,0x6B74,0x6B75,0x6B76,0x6B77,0x6B78,/* 0x70-0x77 */
+	0x6B7A,0x6B7D,0x6B7E,0x6B7F,0x6B80,0x6B85,0x6B88,0x0000,/* 0x78-0x7F */
+
+	0x6B8C,0x6B8E,0x6B8F,0x6B90,0x6B91,0x6B94,0x6B95,0x6B97,/* 0x80-0x87 */
+	0x6B98,0x6B99,0x6B9C,0x6B9D,0x6B9E,0x6B9F,0x6BA0,0x6BA2,/* 0x88-0x8F */
+	0x6BA3,0x6BA4,0x6BA5,0x6BA6,0x6BA7,0x6BA8,0x6BA9,0x6BAB,/* 0x90-0x97 */
+	0x6BAC,0x6BAD,0x6BAE,0x6BAF,0x6BB0,0x6BB1,0x6BB2,0x6BB6,/* 0x98-0x9F */
+	0x6BB8,0x6BB9,0x6BBA,0x6BBB,0x6BBC,0x6BBD,0x6BBE,0x6BC0,/* 0xA0-0xA7 */
+	0x6BC3,0x6BC4,0x6BC6,0x6BC7,0x6BC8,0x6BC9,0x6BCA,0x6BCC,/* 0xA8-0xAF */
+	0x6BCE,0x6BD0,0x6BD1,0x6BD8,0x6BDA,0x6BDC,0x6BDD,0x6BDE,/* 0xB0-0xB7 */
+	0x6BDF,0x6BE0,0x6BE2,0x6BE3,0x6BE4,0x6BE5,0x6BE6,0x6BE7,/* 0xB8-0xBF */
+	0x6BE8,0x6BE9,0x6BEC,0x6BED,0x6BEE,0x6BF0,0x6BF1,0x6BF2,/* 0xC0-0xC7 */
+	0x6BF4,0x6BF6,0x6BF7,0x6BF8,0x6BFA,0x6BFB,0x6BFC,0x6BFE,/* 0xC8-0xCF */
+	0x6BFF,0x6C00,0x6C01,0x6C02,0x6C03,0x6C04,0x6C08,0x6C09,/* 0xD0-0xD7 */
+	0x6C0A,0x6C0B,0x6C0C,0x6C0E,0x6C12,0x6C17,0x6C1C,0x6C1D,/* 0xD8-0xDF */
+	0x6C1E,0x6C20,0x6C23,0x6C25,0x6C2B,0x6C2C,0x6C2D,0x6C31,/* 0xE0-0xE7 */
+	0x6C33,0x6C36,0x6C37,0x6C39,0x6C3A,0x6C3B,0x6C3C,0x6C3E,/* 0xE8-0xEF */
+	0x6C3F,0x6C43,0x6C44,0x6C45,0x6C48,0x6C4B,0x6C4C,0x6C4D,/* 0xF0-0xF7 */
+	0x6C4E,0x6C4F,0x6C51,0x6C52,0x6C53,0x6C56,0x6C58,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9B[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6C59,0x6C5A,0x6C62,0x6C63,0x6C65,0x6C66,0x6C67,0x6C6B,/* 0x40-0x47 */
+	0x6C6C,0x6C6D,0x6C6E,0x6C6F,0x6C71,0x6C73,0x6C75,0x6C77,/* 0x48-0x4F */
+	0x6C78,0x6C7A,0x6C7B,0x6C7C,0x6C7F,0x6C80,0x6C84,0x6C87,/* 0x50-0x57 */
+	0x6C8A,0x6C8B,0x6C8D,0x6C8E,0x6C91,0x6C92,0x6C95,0x6C96,/* 0x58-0x5F */
+	0x6C97,0x6C98,0x6C9A,0x6C9C,0x6C9D,0x6C9E,0x6CA0,0x6CA2,/* 0x60-0x67 */
+	0x6CA8,0x6CAC,0x6CAF,0x6CB0,0x6CB4,0x6CB5,0x6CB6,0x6CB7,/* 0x68-0x6F */
+	0x6CBA,0x6CC0,0x6CC1,0x6CC2,0x6CC3,0x6CC6,0x6CC7,0x6CC8,/* 0x70-0x77 */
+	0x6CCB,0x6CCD,0x6CCE,0x6CCF,0x6CD1,0x6CD2,0x6CD8,0x0000,/* 0x78-0x7F */
+
+	0x6CD9,0x6CDA,0x6CDC,0x6CDD,0x6CDF,0x6CE4,0x6CE6,0x6CE7,/* 0x80-0x87 */
+	0x6CE9,0x6CEC,0x6CED,0x6CF2,0x6CF4,0x6CF9,0x6CFF,0x6D00,/* 0x88-0x8F */
+	0x6D02,0x6D03,0x6D05,0x6D06,0x6D08,0x6D09,0x6D0A,0x6D0D,/* 0x90-0x97 */
+	0x6D0F,0x6D10,0x6D11,0x6D13,0x6D14,0x6D15,0x6D16,0x6D18,/* 0x98-0x9F */
+	0x6D1C,0x6D1D,0x6D1F,0x6D20,0x6D21,0x6D22,0x6D23,0x6D24,/* 0xA0-0xA7 */
+	0x6D26,0x6D28,0x6D29,0x6D2C,0x6D2D,0x6D2F,0x6D30,0x6D34,/* 0xA8-0xAF */
+	0x6D36,0x6D37,0x6D38,0x6D3A,0x6D3F,0x6D40,0x6D42,0x6D44,/* 0xB0-0xB7 */
+	0x6D49,0x6D4C,0x6D50,0x6D55,0x6D56,0x6D57,0x6D58,0x6D5B,/* 0xB8-0xBF */
+	0x6D5D,0x6D5F,0x6D61,0x6D62,0x6D64,0x6D65,0x6D67,0x6D68,/* 0xC0-0xC7 */
+	0x6D6B,0x6D6C,0x6D6D,0x6D70,0x6D71,0x6D72,0x6D73,0x6D75,/* 0xC8-0xCF */
+	0x6D76,0x6D79,0x6D7A,0x6D7B,0x6D7D,0x6D7E,0x6D7F,0x6D80,/* 0xD0-0xD7 */
+	0x6D81,0x6D83,0x6D84,0x6D86,0x6D87,0x6D8A,0x6D8B,0x6D8D,/* 0xD8-0xDF */
+	0x6D8F,0x6D90,0x6D92,0x6D96,0x6D97,0x6D98,0x6D99,0x6D9A,/* 0xE0-0xE7 */
+	0x6D9C,0x6DA2,0x6DA5,0x6DAC,0x6DAD,0x6DB0,0x6DB1,0x6DB3,/* 0xE8-0xEF */
+	0x6DB4,0x6DB6,0x6DB7,0x6DB9,0x6DBA,0x6DBB,0x6DBC,0x6DBD,/* 0xF0-0xF7 */
+	0x6DBE,0x6DC1,0x6DC2,0x6DC3,0x6DC8,0x6DC9,0x6DCA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9C[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6DCD,0x6DCE,0x6DCF,0x6DD0,0x6DD2,0x6DD3,0x6DD4,0x6DD5,/* 0x40-0x47 */
+	0x6DD7,0x6DDA,0x6DDB,0x6DDC,0x6DDF,0x6DE2,0x6DE3,0x6DE5,/* 0x48-0x4F */
+	0x6DE7,0x6DE8,0x6DE9,0x6DEA,0x6DED,0x6DEF,0x6DF0,0x6DF2,/* 0x50-0x57 */
+	0x6DF4,0x6DF5,0x6DF6,0x6DF8,0x6DFA,0x6DFD,0x6DFE,0x6DFF,/* 0x58-0x5F */
+	0x6E00,0x6E01,0x6E02,0x6E03,0x6E04,0x6E06,0x6E07,0x6E08,/* 0x60-0x67 */
+	0x6E09,0x6E0B,0x6E0F,0x6E12,0x6E13,0x6E15,0x6E18,0x6E19,/* 0x68-0x6F */
+	0x6E1B,0x6E1C,0x6E1E,0x6E1F,0x6E22,0x6E26,0x6E27,0x6E28,/* 0x70-0x77 */
+	0x6E2A,0x6E2C,0x6E2E,0x6E30,0x6E31,0x6E33,0x6E35,0x0000,/* 0x78-0x7F */
+
+	0x6E36,0x6E37,0x6E39,0x6E3B,0x6E3C,0x6E3D,0x6E3E,0x6E3F,/* 0x80-0x87 */
+	0x6E40,0x6E41,0x6E42,0x6E45,0x6E46,0x6E47,0x6E48,0x6E49,/* 0x88-0x8F */
+	0x6E4A,0x6E4B,0x6E4C,0x6E4F,0x6E50,0x6E51,0x6E52,0x6E55,/* 0x90-0x97 */
+	0x6E57,0x6E59,0x6E5A,0x6E5C,0x6E5D,0x6E5E,0x6E60,0x6E61,/* 0x98-0x9F */
+	0x6E62,0x6E63,0x6E64,0x6E65,0x6E66,0x6E67,0x6E68,0x6E69,/* 0xA0-0xA7 */
+	0x6E6A,0x6E6C,0x6E6D,0x6E6F,0x6E70,0x6E71,0x6E72,0x6E73,/* 0xA8-0xAF */
+	0x6E74,0x6E75,0x6E76,0x6E77,0x6E78,0x6E79,0x6E7A,0x6E7B,/* 0xB0-0xB7 */
+	0x6E7C,0x6E7D,0x6E80,0x6E81,0x6E82,0x6E84,0x6E87,0x6E88,/* 0xB8-0xBF */
+	0x6E8A,0x6E8B,0x6E8C,0x6E8D,0x6E8E,0x6E91,0x6E92,0x6E93,/* 0xC0-0xC7 */
+	0x6E94,0x6E95,0x6E96,0x6E97,0x6E99,0x6E9A,0x6E9B,0x6E9D,/* 0xC8-0xCF */
+	0x6E9E,0x6EA0,0x6EA1,0x6EA3,0x6EA4,0x6EA6,0x6EA8,0x6EA9,/* 0xD0-0xD7 */
+	0x6EAB,0x6EAC,0x6EAD,0x6EAE,0x6EB0,0x6EB3,0x6EB5,0x6EB8,/* 0xD8-0xDF */
+	0x6EB9,0x6EBC,0x6EBE,0x6EBF,0x6EC0,0x6EC3,0x6EC4,0x6EC5,/* 0xE0-0xE7 */
+	0x6EC6,0x6EC8,0x6EC9,0x6ECA,0x6ECC,0x6ECD,0x6ECE,0x6ED0,/* 0xE8-0xEF */
+	0x6ED2,0x6ED6,0x6ED8,0x6ED9,0x6EDB,0x6EDC,0x6EDD,0x6EE3,/* 0xF0-0xF7 */
+	0x6EE7,0x6EEA,0x6EEB,0x6EEC,0x6EED,0x6EEE,0x6EEF,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9D[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6EF0,0x6EF1,0x6EF2,0x6EF3,0x6EF5,0x6EF6,0x6EF7,0x6EF8,/* 0x40-0x47 */
+	0x6EFA,0x6EFB,0x6EFC,0x6EFD,0x6EFE,0x6EFF,0x6F00,0x6F01,/* 0x48-0x4F */
+	0x6F03,0x6F04,0x6F05,0x6F07,0x6F08,0x6F0A,0x6F0B,0x6F0C,/* 0x50-0x57 */
+	0x6F0D,0x6F0E,0x6F10,0x6F11,0x6F12,0x6F16,0x6F17,0x6F18,/* 0x58-0x5F */
+	0x6F19,0x6F1A,0x6F1B,0x6F1C,0x6F1D,0x6F1E,0x6F1F,0x6F21,/* 0x60-0x67 */
+	0x6F22,0x6F23,0x6F25,0x6F26,0x6F27,0x6F28,0x6F2C,0x6F2E,/* 0x68-0x6F */
+	0x6F30,0x6F32,0x6F34,0x6F35,0x6F37,0x6F38,0x6F39,0x6F3A,/* 0x70-0x77 */
+	0x6F3B,0x6F3C,0x6F3D,0x6F3F,0x6F40,0x6F41,0x6F42,0x0000,/* 0x78-0x7F */
+
+	0x6F43,0x6F44,0x6F45,0x6F48,0x6F49,0x6F4A,0x6F4C,0x6F4E,/* 0x80-0x87 */
+	0x6F4F,0x6F50,0x6F51,0x6F52,0x6F53,0x6F54,0x6F55,0x6F56,/* 0x88-0x8F */
+	0x6F57,0x6F59,0x6F5A,0x6F5B,0x6F5D,0x6F5F,0x6F60,0x6F61,/* 0x90-0x97 */
+	0x6F63,0x6F64,0x6F65,0x6F67,0x6F68,0x6F69,0x6F6A,0x6F6B,/* 0x98-0x9F */
+	0x6F6C,0x6F6F,0x6F70,0x6F71,0x6F73,0x6F75,0x6F76,0x6F77,/* 0xA0-0xA7 */
+	0x6F79,0x6F7B,0x6F7D,0x6F7E,0x6F7F,0x6F80,0x6F81,0x6F82,/* 0xA8-0xAF */
+	0x6F83,0x6F85,0x6F86,0x6F87,0x6F8A,0x6F8B,0x6F8F,0x6F90,/* 0xB0-0xB7 */
+	0x6F91,0x6F92,0x6F93,0x6F94,0x6F95,0x6F96,0x6F97,0x6F98,/* 0xB8-0xBF */
+	0x6F99,0x6F9A,0x6F9B,0x6F9D,0x6F9E,0x6F9F,0x6FA0,0x6FA2,/* 0xC0-0xC7 */
+	0x6FA3,0x6FA4,0x6FA5,0x6FA6,0x6FA8,0x6FA9,0x6FAA,0x6FAB,/* 0xC8-0xCF */
+	0x6FAC,0x6FAD,0x6FAE,0x6FAF,0x6FB0,0x6FB1,0x6FB2,0x6FB4,/* 0xD0-0xD7 */
+	0x6FB5,0x6FB7,0x6FB8,0x6FBA,0x6FBB,0x6FBC,0x6FBD,0x6FBE,/* 0xD8-0xDF */
+	0x6FBF,0x6FC1,0x6FC3,0x6FC4,0x6FC5,0x6FC6,0x6FC7,0x6FC8,/* 0xE0-0xE7 */
+	0x6FCA,0x6FCB,0x6FCC,0x6FCD,0x6FCE,0x6FCF,0x6FD0,0x6FD3,/* 0xE8-0xEF */
+	0x6FD4,0x6FD5,0x6FD6,0x6FD7,0x6FD8,0x6FD9,0x6FDA,0x6FDB,/* 0xF0-0xF7 */
+	0x6FDC,0x6FDD,0x6FDF,0x6FE2,0x6FE3,0x6FE4,0x6FE5,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9E[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6FE6,0x6FE7,0x6FE8,0x6FE9,0x6FEA,0x6FEB,0x6FEC,0x6FED,/* 0x40-0x47 */
+	0x6FF0,0x6FF1,0x6FF2,0x6FF3,0x6FF4,0x6FF5,0x6FF6,0x6FF7,/* 0x48-0x4F */
+	0x6FF8,0x6FF9,0x6FFA,0x6FFB,0x6FFC,0x6FFD,0x6FFE,0x6FFF,/* 0x50-0x57 */
+	0x7000,0x7001,0x7002,0x7003,0x7004,0x7005,0x7006,0x7007,/* 0x58-0x5F */
+	0x7008,0x7009,0x700A,0x700B,0x700C,0x700D,0x700E,0x700F,/* 0x60-0x67 */
+	0x7010,0x7012,0x7013,0x7014,0x7015,0x7016,0x7017,0x7018,/* 0x68-0x6F */
+	0x7019,0x701C,0x701D,0x701E,0x701F,0x7020,0x7021,0x7022,/* 0x70-0x77 */
+	0x7024,0x7025,0x7026,0x7027,0x7028,0x7029,0x702A,0x0000,/* 0x78-0x7F */
+
+	0x702B,0x702C,0x702D,0x702E,0x702F,0x7030,0x7031,0x7032,/* 0x80-0x87 */
+	0x7033,0x7034,0x7036,0x7037,0x7038,0x703A,0x703B,0x703C,/* 0x88-0x8F */
+	0x703D,0x703E,0x703F,0x7040,0x7041,0x7042,0x7043,0x7044,/* 0x90-0x97 */
+	0x7045,0x7046,0x7047,0x7048,0x7049,0x704A,0x704B,0x704D,/* 0x98-0x9F */
+	0x704E,0x7050,0x7051,0x7052,0x7053,0x7054,0x7055,0x7056,/* 0xA0-0xA7 */
+	0x7057,0x7058,0x7059,0x705A,0x705B,0x705C,0x705D,0x705F,/* 0xA8-0xAF */
+	0x7060,0x7061,0x7062,0x7063,0x7064,0x7065,0x7066,0x7067,/* 0xB0-0xB7 */
+	0x7068,0x7069,0x706A,0x706E,0x7071,0x7072,0x7073,0x7074,/* 0xB8-0xBF */
+	0x7077,0x7079,0x707A,0x707B,0x707D,0x7081,0x7082,0x7083,/* 0xC0-0xC7 */
+	0x7084,0x7086,0x7087,0x7088,0x708B,0x708C,0x708D,0x708F,/* 0xC8-0xCF */
+	0x7090,0x7091,0x7093,0x7097,0x7098,0x709A,0x709B,0x709E,/* 0xD0-0xD7 */
+	0x709F,0x70A0,0x70A1,0x70A2,0x70A3,0x70A4,0x70A5,0x70A6,/* 0xD8-0xDF */
+	0x70A7,0x70A8,0x70A9,0x70AA,0x70B0,0x70B2,0x70B4,0x70B5,/* 0xE0-0xE7 */
+	0x70B6,0x70BA,0x70BE,0x70BF,0x70C4,0x70C5,0x70C6,0x70C7,/* 0xE8-0xEF */
+	0x70C9,0x70CB,0x70CC,0x70CD,0x70CE,0x70CF,0x70D0,0x70D1,/* 0xF0-0xF7 */
+	0x70D2,0x70D3,0x70D4,0x70D5,0x70D6,0x70D7,0x70DA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9F[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x70DC,0x70DD,0x70DE,0x70E0,0x70E1,0x70E2,0x70E3,0x70E5,/* 0x40-0x47 */
+	0x70EA,0x70EE,0x70F0,0x70F1,0x70F2,0x70F3,0x70F4,0x70F5,/* 0x48-0x4F */
+	0x70F6,0x70F8,0x70FA,0x70FB,0x70FC,0x70FE,0x70FF,0x7100,/* 0x50-0x57 */
+	0x7101,0x7102,0x7103,0x7104,0x7105,0x7106,0x7107,0x7108,/* 0x58-0x5F */
+	0x710B,0x710C,0x710D,0x710E,0x710F,0x7111,0x7112,0x7114,/* 0x60-0x67 */
+	0x7117,0x711B,0x711C,0x711D,0x711E,0x711F,0x7120,0x7121,/* 0x68-0x6F */
+	0x7122,0x7123,0x7124,0x7125,0x7127,0x7128,0x7129,0x712A,/* 0x70-0x77 */
+	0x712B,0x712C,0x712D,0x712E,0x7132,0x7133,0x7134,0x0000,/* 0x78-0x7F */
+
+	0x7135,0x7137,0x7138,0x7139,0x713A,0x713B,0x713C,0x713D,/* 0x80-0x87 */
+	0x713E,0x713F,0x7140,0x7141,0x7142,0x7143,0x7144,0x7146,/* 0x88-0x8F */
+	0x7147,0x7148,0x7149,0x714B,0x714D,0x714F,0x7150,0x7151,/* 0x90-0x97 */
+	0x7152,0x7153,0x7154,0x7155,0x7156,0x7157,0x7158,0x7159,/* 0x98-0x9F */
+	0x715A,0x715B,0x715D,0x715F,0x7160,0x7161,0x7162,0x7163,/* 0xA0-0xA7 */
+	0x7165,0x7169,0x716A,0x716B,0x716C,0x716D,0x716F,0x7170,/* 0xA8-0xAF */
+	0x7171,0x7174,0x7175,0x7176,0x7177,0x7179,0x717B,0x717C,/* 0xB0-0xB7 */
+	0x717E,0x717F,0x7180,0x7181,0x7182,0x7183,0x7185,0x7186,/* 0xB8-0xBF */
+	0x7187,0x7188,0x7189,0x718B,0x718C,0x718D,0x718E,0x7190,/* 0xC0-0xC7 */
+	0x7191,0x7192,0x7193,0x7195,0x7196,0x7197,0x719A,0x719B,/* 0xC8-0xCF */
+	0x719C,0x719D,0x719E,0x71A1,0x71A2,0x71A3,0x71A4,0x71A5,/* 0xD0-0xD7 */
+	0x71A6,0x71A7,0x71A9,0x71AA,0x71AB,0x71AD,0x71AE,0x71AF,/* 0xD8-0xDF */
+	0x71B0,0x71B1,0x71B2,0x71B4,0x71B6,0x71B7,0x71B8,0x71BA,/* 0xE0-0xE7 */
+	0x71BB,0x71BC,0x71BD,0x71BE,0x71BF,0x71C0,0x71C1,0x71C2,/* 0xE8-0xEF */
+	0x71C4,0x71C5,0x71C6,0x71C7,0x71C8,0x71C9,0x71CA,0x71CB,/* 0xF0-0xF7 */
+	0x71CC,0x71CD,0x71CF,0x71D0,0x71D1,0x71D2,0x71D3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x71D6,0x71D7,0x71D8,0x71D9,0x71DA,0x71DB,0x71DC,0x71DD,/* 0x40-0x47 */
+	0x71DE,0x71DF,0x71E1,0x71E2,0x71E3,0x71E4,0x71E6,0x71E8,/* 0x48-0x4F */
+	0x71E9,0x71EA,0x71EB,0x71EC,0x71ED,0x71EF,0x71F0,0x71F1,/* 0x50-0x57 */
+	0x71F2,0x71F3,0x71F4,0x71F5,0x71F6,0x71F7,0x71F8,0x71FA,/* 0x58-0x5F */
+	0x71FB,0x71FC,0x71FD,0x71FE,0x71FF,0x7200,0x7201,0x7202,/* 0x60-0x67 */
+	0x7203,0x7204,0x7205,0x7207,0x7208,0x7209,0x720A,0x720B,/* 0x68-0x6F */
+	0x720C,0x720D,0x720E,0x720F,0x7210,0x7211,0x7212,0x7213,/* 0x70-0x77 */
+	0x7214,0x7215,0x7216,0x7217,0x7218,0x7219,0x721A,0x0000,/* 0x78-0x7F */
+
+	0x721B,0x721C,0x721E,0x721F,0x7220,0x7221,0x7222,0x7223,/* 0x80-0x87 */
+	0x7224,0x7225,0x7226,0x7227,0x7229,0x722B,0x722D,0x722E,/* 0x88-0x8F */
+	0x722F,0x7232,0x7233,0x7234,0x723A,0x723C,0x723E,0x7240,/* 0x90-0x97 */
+	0x7241,0x7242,0x7243,0x7244,0x7245,0x7246,0x7249,0x724A,/* 0x98-0x9F */
+	0x724B,0x724E,0x724F,0x7250,0x7251,0x7253,0x7254,0x7255,/* 0xA0-0xA7 */
+	0x7257,0x7258,0x725A,0x725C,0x725E,0x7260,0x7263,0x7264,/* 0xA8-0xAF */
+	0x7265,0x7268,0x726A,0x726B,0x726C,0x726D,0x7270,0x7271,/* 0xB0-0xB7 */
+	0x7273,0x7274,0x7276,0x7277,0x7278,0x727B,0x727C,0x727D,/* 0xB8-0xBF */
+	0x7282,0x7283,0x7285,0x7286,0x7287,0x7288,0x7289,0x728C,/* 0xC0-0xC7 */
+	0x728E,0x7290,0x7291,0x7293,0x7294,0x7295,0x7296,0x7297,/* 0xC8-0xCF */
+	0x7298,0x7299,0x729A,0x729B,0x729C,0x729D,0x729E,0x72A0,/* 0xD0-0xD7 */
+	0x72A1,0x72A2,0x72A3,0x72A4,0x72A5,0x72A6,0x72A7,0x72A8,/* 0xD8-0xDF */
+	0x72A9,0x72AA,0x72AB,0x72AE,0x72B1,0x72B2,0x72B3,0x72B5,/* 0xE0-0xE7 */
+	0x72BA,0x72BB,0x72BC,0x72BD,0x72BE,0x72BF,0x72C0,0x72C5,/* 0xE8-0xEF */
+	0x72C6,0x72C7,0x72C9,0x72CA,0x72CB,0x72CC,0x72CF,0x72D1,/* 0xF0-0xF7 */
+	0x72D3,0x72D4,0x72D5,0x72D6,0x72D8,0x72DA,0x72DB,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x3000,0x3001,0x3002,0x00B7,0x02C9,0x02C7,0x00A8,/* 0xA0-0xA7 */
+	0x3003,0x3005,0x2014,0xFF5E,0x2016,0x2026,0x2018,0x2019,/* 0xA8-0xAF */
+	0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */
+	0x300C,0x300D,0x300E,0x300F,0x3016,0x3017,0x3010,0x3011,/* 0xB8-0xBF */
+	0x00B1,0x00D7,0x00F7,0x2236,0x2227,0x2228,0x2211,0x220F,/* 0xC0-0xC7 */
+	0x222A,0x2229,0x2208,0x2237,0x221A,0x22A5,0x2225,0x2220,/* 0xC8-0xCF */
+	0x2312,0x2299,0x222B,0x222E,0x2261,0x224C,0x2248,0x223D,/* 0xD0-0xD7 */
+	0x221D,0x2260,0x226E,0x226F,0x2264,0x2265,0x221E,0x2235,/* 0xD8-0xDF */
+	0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFF04,/* 0xE0-0xE7 */
+	0x00A4,0xFFE0,0xFFE1,0x2030,0x00A7,0x2116,0x2606,0x2605,/* 0xE8-0xEF */
+	0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,0x25A1,0x25A0,0x25B3,/* 0xF0-0xF7 */
+	0x25B2,0x203B,0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */
+	0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */
+	0x0000,0x2488,0x2489,0x248A,0x248B,0x248C,0x248D,0x248E,/* 0xB0-0xB7 */
+	0x248F,0x2490,0x2491,0x2492,0x2493,0x2494,0x2495,0x2496,/* 0xB8-0xBF */
+	0x2497,0x2498,0x2499,0x249A,0x249B,0x2474,0x2475,0x2476,/* 0xC0-0xC7 */
+	0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,0x247D,0x247E,/* 0xC8-0xCF */
+	0x247F,0x2480,0x2481,0x2482,0x2483,0x2484,0x2485,0x2486,/* 0xD0-0xD7 */
+	0x2487,0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,/* 0xD8-0xDF */
+	0x2467,0x2468,0x2469,0x0000,0x0000,0x3220,0x3221,0x3222,/* 0xE0-0xE7 */
+	0x3223,0x3224,0x3225,0x3226,0x3227,0x3228,0x3229,0x0000,/* 0xE8-0xEF */
+	0x0000,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xF0-0xF7 */
+	0x2167,0x2168,0x2169,0x216A,0x216B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xFF01,0xFF02,0xFF03,0xFFE5,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */
+	0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */
+	0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */
+	0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */
+	0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */
+	0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */
+	0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */
+	0xFF38,0xFF39,0xFF3A,0xFF3B,0xFF3C,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */
+	0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */
+	0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */
+	0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */
+	0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */
+	0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */
+	0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */
+	0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */
+	0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */
+	0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */
+	0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */
+	0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */
+	0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */
+	0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */
+	0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */
+};
+
+static const wchar_t c2u_A5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */
+	0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */
+	0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */
+	0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */
+	0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */
+	0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */
+	0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */
+	0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */
+	0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */
+	0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */
+	0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */
+};
+
+static const wchar_t c2u_A6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xA0-0xA7 */
+	0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xA8-0xAF */
+	0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xB0-0xB7 */
+	0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */
+	0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xC0-0xC7 */
+	0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xC8-0xCF */
+	0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xD0-0xD7 */
+	0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */
+	0xFE35,0xFE36,0xFE39,0xFE3A,0xFE3F,0xFE40,0xFE3D,0xFE3E,/* 0xE0-0xE7 */
+	0xFE41,0xFE42,0xFE43,0xFE44,0x0000,0x0000,0xFE3B,0xFE3C,/* 0xE8-0xEF */
+	0xFE37,0xFE38,0xFE31,0x0000,0xFE33,0xFE34,0x0000,0x0000,/* 0xF0-0xF7 */
+};
+
+static const wchar_t c2u_A7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */
+	0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */
+	0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */
+	0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */
+	0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */
+	0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */
+	0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */
+	0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */
+	0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */
+	0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */
+};
+
+static const wchar_t c2u_A8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x02CA,0x02CB,0x02D9,0x2013,0x2015,0x2025,0x2035,0x2105,/* 0x40-0x47 */
+	0x2109,0x2196,0x2197,0x2198,0x2199,0x2215,0x221F,0x2223,/* 0x48-0x4F */
+	0x2252,0x2266,0x2267,0x22BF,0x2550,0x2551,0x2552,0x2553,/* 0x50-0x57 */
+	0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255A,0x255B,/* 0x58-0x5F */
+	0x255C,0x255D,0x255E,0x255F,0x2560,0x2561,0x2562,0x2563,/* 0x60-0x67 */
+	0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256A,0x256B,/* 0x68-0x6F */
+	0x256C,0x256D,0x256E,0x256F,0x2570,0x2571,0x2572,0x2573,/* 0x70-0x77 */
+	0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,0x2587,0x0000,/* 0x78-0x7F */
+
+	0x2588,0x2589,0x258A,0x258B,0x258C,0x258D,0x258E,0x258F,/* 0x80-0x87 */
+	0x2593,0x2594,0x2595,0x25BC,0x25BD,0x25E2,0x25E3,0x25E4,/* 0x88-0x8F */
+	0x25E5,0x2609,0x2295,0x3012,0x301D,0x301E,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x0101,0x00E1,0x01CE,0x00E0,0x0113,0x00E9,0x011B,/* 0xA0-0xA7 */
+	0x00E8,0x012B,0x00ED,0x01D0,0x00EC,0x014D,0x00F3,0x01D2,/* 0xA8-0xAF */
+	0x00F2,0x016B,0x00FA,0x01D4,0x00F9,0x01D6,0x01D8,0x01DA,/* 0xB0-0xB7 */
+	0x01DC,0x00FC,0x00EA,0x0251,0x0000,0x0144,0x0148,0x0000,/* 0xB8-0xBF */
+	0x0261,0x0000,0x0000,0x0000,0x0000,0x3105,0x3106,0x3107,/* 0xC0-0xC7 */
+	0x3108,0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,/* 0xC8-0xCF */
+	0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,0x3117,/* 0xD0-0xD7 */
+	0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,0x311F,/* 0xD8-0xDF */
+	0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,0x3127,/* 0xE0-0xE7 */
+	0x3128,0x3129,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */
+};
+
+static const wchar_t c2u_A9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x3021,0x3022,0x3023,0x3024,0x3025,0x3026,0x3027,0x3028,/* 0x40-0x47 */
+	0x3029,0x32A3,0x338E,0x338F,0x339C,0x339D,0x339E,0x33A1,/* 0x48-0x4F */
+	0x33C4,0x33CE,0x33D1,0x33D2,0x33D5,0xFE30,0xFFE2,0xFFE4,/* 0x50-0x57 */
+	0x0000,0x2121,0x3231,0x0000,0x2010,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x30FC,0x309B,0x309C,0x30FD,0x30FE,0x3006,0x309D,0x309E,/* 0x60-0x67 */
+	0xFE49,0xFE4A,0xFE4B,0xFE4C,0xFE4D,0xFE4E,0xFE4F,0xFE50,/* 0x68-0x6F */
+	0xFE51,0xFE52,0xFE54,0xFE55,0xFE56,0xFE57,0xFE59,0xFE5A,/* 0x70-0x77 */
+	0xFE5B,0xFE5C,0xFE5D,0xFE5E,0xFE5F,0xFE60,0xFE61,0x0000,/* 0x78-0x7F */
+
+	0xFE62,0xFE63,0xFE64,0xFE65,0xFE66,0xFE68,0xFE69,0xFE6A,/* 0x80-0x87 */
+	0xFE6B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x3007,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x0000,0x0000,0x0000,0x2500,0x2501,0x2502,0x2503,/* 0xA0-0xA7 */
+	0x2504,0x2505,0x2506,0x2507,0x2508,0x2509,0x250A,0x250B,/* 0xA8-0xAF */
+	0x250C,0x250D,0x250E,0x250F,0x2510,0x2511,0x2512,0x2513,/* 0xB0-0xB7 */
+	0x2514,0x2515,0x2516,0x2517,0x2518,0x2519,0x251A,0x251B,/* 0xB8-0xBF */
+	0x251C,0x251D,0x251E,0x251F,0x2520,0x2521,0x2522,0x2523,/* 0xC0-0xC7 */
+	0x2524,0x2525,0x2526,0x2527,0x2528,0x2529,0x252A,0x252B,/* 0xC8-0xCF */
+	0x252C,0x252D,0x252E,0x252F,0x2530,0x2531,0x2532,0x2533,/* 0xD0-0xD7 */
+	0x2534,0x2535,0x2536,0x2537,0x2538,0x2539,0x253A,0x253B,/* 0xD8-0xDF */
+	0x253C,0x253D,0x253E,0x253F,0x2540,0x2541,0x2542,0x2543,/* 0xE0-0xE7 */
+	0x2544,0x2545,0x2546,0x2547,0x2548,0x2549,0x254A,0x254B,/* 0xE8-0xEF */
+};
+
+static const wchar_t c2u_AA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x72DC,0x72DD,0x72DF,0x72E2,0x72E3,0x72E4,0x72E5,0x72E6,/* 0x40-0x47 */
+	0x72E7,0x72EA,0x72EB,0x72F5,0x72F6,0x72F9,0x72FD,0x72FE,/* 0x48-0x4F */
+	0x72FF,0x7300,0x7302,0x7304,0x7305,0x7306,0x7307,0x7308,/* 0x50-0x57 */
+	0x7309,0x730B,0x730C,0x730D,0x730F,0x7310,0x7311,0x7312,/* 0x58-0x5F */
+	0x7314,0x7318,0x7319,0x731A,0x731F,0x7320,0x7323,0x7324,/* 0x60-0x67 */
+	0x7326,0x7327,0x7328,0x732D,0x732F,0x7330,0x7332,0x7333,/* 0x68-0x6F */
+	0x7335,0x7336,0x733A,0x733B,0x733C,0x733D,0x7340,0x7341,/* 0x70-0x77 */
+	0x7342,0x7343,0x7344,0x7345,0x7346,0x7347,0x7348,0x0000,/* 0x78-0x7F */
+
+	0x7349,0x734A,0x734B,0x734C,0x734E,0x734F,0x7351,0x7353,/* 0x80-0x87 */
+	0x7354,0x7355,0x7356,0x7358,0x7359,0x735A,0x735B,0x735C,/* 0x88-0x8F */
+	0x735D,0x735E,0x735F,0x7361,0x7362,0x7363,0x7364,0x7365,/* 0x90-0x97 */
+	0x7366,0x7367,0x7368,0x7369,0x736A,0x736B,0x736E,0x7370,/* 0x98-0x9F */
+	0x7371,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_AB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7372,0x7373,0x7374,0x7375,0x7376,0x7377,0x7378,0x7379,/* 0x40-0x47 */
+	0x737A,0x737B,0x737C,0x737D,0x737F,0x7380,0x7381,0x7382,/* 0x48-0x4F */
+	0x7383,0x7385,0x7386,0x7388,0x738A,0x738C,0x738D,0x738F,/* 0x50-0x57 */
+	0x7390,0x7392,0x7393,0x7394,0x7395,0x7397,0x7398,0x7399,/* 0x58-0x5F */
+	0x739A,0x739C,0x739D,0x739E,0x73A0,0x73A1,0x73A3,0x73A4,/* 0x60-0x67 */
+	0x73A5,0x73A6,0x73A7,0x73A8,0x73AA,0x73AC,0x73AD,0x73B1,/* 0x68-0x6F */
+	0x73B4,0x73B5,0x73B6,0x73B8,0x73B9,0x73BC,0x73BD,0x73BE,/* 0x70-0x77 */
+	0x73BF,0x73C1,0x73C3,0x73C4,0x73C5,0x73C6,0x73C7,0x0000,/* 0x78-0x7F */
+
+	0x73CB,0x73CC,0x73CE,0x73D2,0x73D3,0x73D4,0x73D5,0x73D6,/* 0x80-0x87 */
+	0x73D7,0x73D8,0x73DA,0x73DB,0x73DC,0x73DD,0x73DF,0x73E1,/* 0x88-0x8F */
+	0x73E2,0x73E3,0x73E4,0x73E6,0x73E8,0x73EA,0x73EB,0x73EC,/* 0x90-0x97 */
+	0x73EE,0x73EF,0x73F0,0x73F1,0x73F3,0x73F4,0x73F5,0x73F6,/* 0x98-0x9F */
+	0x73F7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_AC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x73F8,0x73F9,0x73FA,0x73FB,0x73FC,0x73FD,0x73FE,0x73FF,/* 0x40-0x47 */
+	0x7400,0x7401,0x7402,0x7404,0x7407,0x7408,0x740B,0x740C,/* 0x48-0x4F */
+	0x740D,0x740E,0x7411,0x7412,0x7413,0x7414,0x7415,0x7416,/* 0x50-0x57 */
+	0x7417,0x7418,0x7419,0x741C,0x741D,0x741E,0x741F,0x7420,/* 0x58-0x5F */
+	0x7421,0x7423,0x7424,0x7427,0x7429,0x742B,0x742D,0x742F,/* 0x60-0x67 */
+	0x7431,0x7432,0x7437,0x7438,0x7439,0x743A,0x743B,0x743D,/* 0x68-0x6F */
+	0x743E,0x743F,0x7440,0x7442,0x7443,0x7444,0x7445,0x7446,/* 0x70-0x77 */
+	0x7447,0x7448,0x7449,0x744A,0x744B,0x744C,0x744D,0x0000,/* 0x78-0x7F */
+
+	0x744E,0x744F,0x7450,0x7451,0x7452,0x7453,0x7454,0x7456,/* 0x80-0x87 */
+	0x7458,0x745D,0x7460,0x7461,0x7462,0x7463,0x7464,0x7465,/* 0x88-0x8F */
+	0x7466,0x7467,0x7468,0x7469,0x746A,0x746B,0x746C,0x746E,/* 0x90-0x97 */
+	0x746F,0x7471,0x7472,0x7473,0x7474,0x7475,0x7478,0x7479,/* 0x98-0x9F */
+	0x747A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_AD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x747B,0x747C,0x747D,0x747F,0x7482,0x7484,0x7485,0x7486,/* 0x40-0x47 */
+	0x7488,0x7489,0x748A,0x748C,0x748D,0x748F,0x7491,0x7492,/* 0x48-0x4F */
+	0x7493,0x7494,0x7495,0x7496,0x7497,0x7498,0x7499,0x749A,/* 0x50-0x57 */
+	0x749B,0x749D,0x749F,0x74A0,0x74A1,0x74A2,0x74A3,0x74A4,/* 0x58-0x5F */
+	0x74A5,0x74A6,0x74AA,0x74AB,0x74AC,0x74AD,0x74AE,0x74AF,/* 0x60-0x67 */
+	0x74B0,0x74B1,0x74B2,0x74B3,0x74B4,0x74B5,0x74B6,0x74B7,/* 0x68-0x6F */
+	0x74B8,0x74B9,0x74BB,0x74BC,0x74BD,0x74BE,0x74BF,0x74C0,/* 0x70-0x77 */
+	0x74C1,0x74C2,0x74C3,0x74C4,0x74C5,0x74C6,0x74C7,0x0000,/* 0x78-0x7F */
+
+	0x74C8,0x74C9,0x74CA,0x74CB,0x74CC,0x74CD,0x74CE,0x74CF,/* 0x80-0x87 */
+	0x74D0,0x74D1,0x74D3,0x74D4,0x74D5,0x74D6,0x74D7,0x74D8,/* 0x88-0x8F */
+	0x74D9,0x74DA,0x74DB,0x74DD,0x74DF,0x74E1,0x74E5,0x74E7,/* 0x90-0x97 */
+	0x74E8,0x74E9,0x74EA,0x74EB,0x74EC,0x74ED,0x74F0,0x74F1,/* 0x98-0x9F */
+	0x74F2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_AE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x74F3,0x74F5,0x74F8,0x74F9,0x74FA,0x74FB,0x74FC,0x74FD,/* 0x40-0x47 */
+	0x74FE,0x7500,0x7501,0x7502,0x7503,0x7505,0x7506,0x7507,/* 0x48-0x4F */
+	0x7508,0x7509,0x750A,0x750B,0x750C,0x750E,0x7510,0x7512,/* 0x50-0x57 */
+	0x7514,0x7515,0x7516,0x7517,0x751B,0x751D,0x751E,0x7520,/* 0x58-0x5F */
+	0x7521,0x7522,0x7523,0x7524,0x7526,0x7527,0x752A,0x752E,/* 0x60-0x67 */
+	0x7534,0x7536,0x7539,0x753C,0x753D,0x753F,0x7541,0x7542,/* 0x68-0x6F */
+	0x7543,0x7544,0x7546,0x7547,0x7549,0x754A,0x754D,0x7550,/* 0x70-0x77 */
+	0x7551,0x7552,0x7553,0x7555,0x7556,0x7557,0x7558,0x0000,/* 0x78-0x7F */
+
+	0x755D,0x755E,0x755F,0x7560,0x7561,0x7562,0x7563,0x7564,/* 0x80-0x87 */
+	0x7567,0x7568,0x7569,0x756B,0x756C,0x756D,0x756E,0x756F,/* 0x88-0x8F */
+	0x7570,0x7571,0x7573,0x7575,0x7576,0x7577,0x757A,0x757B,/* 0x90-0x97 */
+	0x757C,0x757D,0x757E,0x7580,0x7581,0x7582,0x7584,0x7585,/* 0x98-0x9F */
+	0x7587,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_AF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7588,0x7589,0x758A,0x758C,0x758D,0x758E,0x7590,0x7593,/* 0x40-0x47 */
+	0x7595,0x7598,0x759B,0x759C,0x759E,0x75A2,0x75A6,0x75A7,/* 0x48-0x4F */
+	0x75A8,0x75A9,0x75AA,0x75AD,0x75B6,0x75B7,0x75BA,0x75BB,/* 0x50-0x57 */
+	0x75BF,0x75C0,0x75C1,0x75C6,0x75CB,0x75CC,0x75CE,0x75CF,/* 0x58-0x5F */
+	0x75D0,0x75D1,0x75D3,0x75D7,0x75D9,0x75DA,0x75DC,0x75DD,/* 0x60-0x67 */
+	0x75DF,0x75E0,0x75E1,0x75E5,0x75E9,0x75EC,0x75ED,0x75EE,/* 0x68-0x6F */
+	0x75EF,0x75F2,0x75F3,0x75F5,0x75F6,0x75F7,0x75F8,0x75FA,/* 0x70-0x77 */
+	0x75FB,0x75FD,0x75FE,0x7602,0x7604,0x7606,0x7607,0x0000,/* 0x78-0x7F */
+
+	0x7608,0x7609,0x760B,0x760D,0x760E,0x760F,0x7611,0x7612,/* 0x80-0x87 */
+	0x7613,0x7614,0x7616,0x761A,0x761C,0x761D,0x761E,0x7621,/* 0x88-0x8F */
+	0x7623,0x7627,0x7628,0x762C,0x762E,0x762F,0x7631,0x7632,/* 0x90-0x97 */
+	0x7636,0x7637,0x7639,0x763A,0x763B,0x763D,0x7641,0x7642,/* 0x98-0x9F */
+	0x7644,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_B0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7645,0x7646,0x7647,0x7648,0x7649,0x764A,0x764B,0x764E,/* 0x40-0x47 */
+	0x764F,0x7650,0x7651,0x7652,0x7653,0x7655,0x7657,0x7658,/* 0x48-0x4F */
+	0x7659,0x765A,0x765B,0x765D,0x765F,0x7660,0x7661,0x7662,/* 0x50-0x57 */
+	0x7664,0x7665,0x7666,0x7667,0x7668,0x7669,0x766A,0x766C,/* 0x58-0x5F */
+	0x766D,0x766E,0x7670,0x7671,0x7672,0x7673,0x7674,0x7675,/* 0x60-0x67 */
+	0x7676,0x7677,0x7679,0x767A,0x767C,0x767F,0x7680,0x7681,/* 0x68-0x6F */
+	0x7683,0x7685,0x7689,0x768A,0x768C,0x768D,0x768F,0x7690,/* 0x70-0x77 */
+	0x7692,0x7694,0x7695,0x7697,0x7698,0x769A,0x769B,0x0000,/* 0x78-0x7F */
+
+	0x769C,0x769D,0x769E,0x769F,0x76A0,0x76A1,0x76A2,0x76A3,/* 0x80-0x87 */
+	0x76A5,0x76A6,0x76A7,0x76A8,0x76A9,0x76AA,0x76AB,0x76AC,/* 0x88-0x8F */
+	0x76AD,0x76AF,0x76B0,0x76B3,0x76B5,0x76B6,0x76B7,0x76B8,/* 0x90-0x97 */
+	0x76B9,0x76BA,0x76BB,0x76BC,0x76BD,0x76BE,0x76C0,0x76C1,/* 0x98-0x9F */
+	0x76C3,0x554A,0x963F,0x57C3,0x6328,0x54CE,0x5509,0x54C0,/* 0xA0-0xA7 */
+	0x7691,0x764C,0x853C,0x77EE,0x827E,0x788D,0x7231,0x9698,/* 0xA8-0xAF */
+	0x978D,0x6C28,0x5B89,0x4FFA,0x6309,0x6697,0x5CB8,0x80FA,/* 0xB0-0xB7 */
+	0x6848,0x80AE,0x6602,0x76CE,0x51F9,0x6556,0x71AC,0x7FF1,/* 0xB8-0xBF */
+	0x8884,0x50B2,0x5965,0x61CA,0x6FB3,0x82AD,0x634C,0x6252,/* 0xC0-0xC7 */
+	0x53ED,0x5427,0x7B06,0x516B,0x75A4,0x5DF4,0x62D4,0x8DCB,/* 0xC8-0xCF */
+	0x9776,0x628A,0x8019,0x575D,0x9738,0x7F62,0x7238,0x767D,/* 0xD0-0xD7 */
+	0x67CF,0x767E,0x6446,0x4F70,0x8D25,0x62DC,0x7A17,0x6591,/* 0xD8-0xDF */
+	0x73ED,0x642C,0x6273,0x822C,0x9881,0x677F,0x7248,0x626E,/* 0xE0-0xE7 */
+	0x62CC,0x4F34,0x74E3,0x534A,0x529E,0x7ECA,0x90A6,0x5E2E,/* 0xE8-0xEF */
+	0x6886,0x699C,0x8180,0x7ED1,0x68D2,0x78C5,0x868C,0x9551,/* 0xF0-0xF7 */
+	0x508D,0x8C24,0x82DE,0x80DE,0x5305,0x8912,0x5265,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x76C4,0x76C7,0x76C9,0x76CB,0x76CC,0x76D3,0x76D5,0x76D9,/* 0x40-0x47 */
+	0x76DA,0x76DC,0x76DD,0x76DE,0x76E0,0x76E1,0x76E2,0x76E3,/* 0x48-0x4F */
+	0x76E4,0x76E6,0x76E7,0x76E8,0x76E9,0x76EA,0x76EB,0x76EC,/* 0x50-0x57 */
+	0x76ED,0x76F0,0x76F3,0x76F5,0x76F6,0x76F7,0x76FA,0x76FB,/* 0x58-0x5F */
+	0x76FD,0x76FF,0x7700,0x7702,0x7703,0x7705,0x7706,0x770A,/* 0x60-0x67 */
+	0x770C,0x770E,0x770F,0x7710,0x7711,0x7712,0x7713,0x7714,/* 0x68-0x6F */
+	0x7715,0x7716,0x7717,0x7718,0x771B,0x771C,0x771D,0x771E,/* 0x70-0x77 */
+	0x7721,0x7723,0x7724,0x7725,0x7727,0x772A,0x772B,0x0000,/* 0x78-0x7F */
+
+	0x772C,0x772E,0x7730,0x7731,0x7732,0x7733,0x7734,0x7739,/* 0x80-0x87 */
+	0x773B,0x773D,0x773E,0x773F,0x7742,0x7744,0x7745,0x7746,/* 0x88-0x8F */
+	0x7748,0x7749,0x774A,0x774B,0x774C,0x774D,0x774E,0x774F,/* 0x90-0x97 */
+	0x7752,0x7753,0x7754,0x7755,0x7756,0x7757,0x7758,0x7759,/* 0x98-0x9F */
+	0x775C,0x8584,0x96F9,0x4FDD,0x5821,0x9971,0x5B9D,0x62B1,/* 0xA0-0xA7 */
+	0x62A5,0x66B4,0x8C79,0x9C8D,0x7206,0x676F,0x7891,0x60B2,/* 0xA8-0xAF */
+	0x5351,0x5317,0x8F88,0x80CC,0x8D1D,0x94A1,0x500D,0x72C8,/* 0xB0-0xB7 */
+	0x5907,0x60EB,0x7119,0x88AB,0x5954,0x82EF,0x672C,0x7B28,/* 0xB8-0xBF */
+	0x5D29,0x7EF7,0x752D,0x6CF5,0x8E66,0x8FF8,0x903C,0x9F3B,/* 0xC0-0xC7 */
+	0x6BD4,0x9119,0x7B14,0x5F7C,0x78A7,0x84D6,0x853D,0x6BD5,/* 0xC8-0xCF */
+	0x6BD9,0x6BD6,0x5E01,0x5E87,0x75F9,0x95ED,0x655D,0x5F0A,/* 0xD0-0xD7 */
+	0x5FC5,0x8F9F,0x58C1,0x81C2,0x907F,0x965B,0x97AD,0x8FB9,/* 0xD8-0xDF */
+	0x7F16,0x8D2C,0x6241,0x4FBF,0x53D8,0x535E,0x8FA8,0x8FA9,/* 0xE0-0xE7 */
+	0x8FAB,0x904D,0x6807,0x5F6A,0x8198,0x8868,0x9CD6,0x618B,/* 0xE8-0xEF */
+	0x522B,0x762A,0x5F6C,0x658C,0x6FD2,0x6EE8,0x5BBE,0x6448,/* 0xF0-0xF7 */
+	0x5175,0x51B0,0x67C4,0x4E19,0x79C9,0x997C,0x70B3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x775D,0x775E,0x775F,0x7760,0x7764,0x7767,0x7769,0x776A,/* 0x40-0x47 */
+	0x776D,0x776E,0x776F,0x7770,0x7771,0x7772,0x7773,0x7774,/* 0x48-0x4F */
+	0x7775,0x7776,0x7777,0x7778,0x777A,0x777B,0x777C,0x7781,/* 0x50-0x57 */
+	0x7782,0x7783,0x7786,0x7787,0x7788,0x7789,0x778A,0x778B,/* 0x58-0x5F */
+	0x778F,0x7790,0x7793,0x7794,0x7795,0x7796,0x7797,0x7798,/* 0x60-0x67 */
+	0x7799,0x779A,0x779B,0x779C,0x779D,0x779E,0x77A1,0x77A3,/* 0x68-0x6F */
+	0x77A4,0x77A6,0x77A8,0x77AB,0x77AD,0x77AE,0x77AF,0x77B1,/* 0x70-0x77 */
+	0x77B2,0x77B4,0x77B6,0x77B7,0x77B8,0x77B9,0x77BA,0x0000,/* 0x78-0x7F */
+
+	0x77BC,0x77BE,0x77C0,0x77C1,0x77C2,0x77C3,0x77C4,0x77C5,/* 0x80-0x87 */
+	0x77C6,0x77C7,0x77C8,0x77C9,0x77CA,0x77CB,0x77CC,0x77CE,/* 0x88-0x8F */
+	0x77CF,0x77D0,0x77D1,0x77D2,0x77D3,0x77D4,0x77D5,0x77D6,/* 0x90-0x97 */
+	0x77D8,0x77D9,0x77DA,0x77DD,0x77DE,0x77DF,0x77E0,0x77E1,/* 0x98-0x9F */
+	0x77E4,0x75C5,0x5E76,0x73BB,0x83E0,0x64AD,0x62E8,0x94B5,/* 0xA0-0xA7 */
+	0x6CE2,0x535A,0x52C3,0x640F,0x94C2,0x7B94,0x4F2F,0x5E1B,/* 0xA8-0xAF */
+	0x8236,0x8116,0x818A,0x6E24,0x6CCA,0x9A73,0x6355,0x535C,/* 0xB0-0xB7 */
+	0x54FA,0x8865,0x57E0,0x4E0D,0x5E03,0x6B65,0x7C3F,0x90E8,/* 0xB8-0xBF */
+	0x6016,0x64E6,0x731C,0x88C1,0x6750,0x624D,0x8D22,0x776C,/* 0xC0-0xC7 */
+	0x8E29,0x91C7,0x5F69,0x83DC,0x8521,0x9910,0x53C2,0x8695,/* 0xC8-0xCF */
+	0x6B8B,0x60ED,0x60E8,0x707F,0x82CD,0x8231,0x4ED3,0x6CA7,/* 0xD0-0xD7 */
+	0x85CF,0x64CD,0x7CD9,0x69FD,0x66F9,0x8349,0x5395,0x7B56,/* 0xD8-0xDF */
+	0x4FA7,0x518C,0x6D4B,0x5C42,0x8E6D,0x63D2,0x53C9,0x832C,/* 0xE0-0xE7 */
+	0x8336,0x67E5,0x78B4,0x643D,0x5BDF,0x5C94,0x5DEE,0x8BE7,/* 0xE8-0xEF */
+	0x62C6,0x67F4,0x8C7A,0x6400,0x63BA,0x8749,0x998B,0x8C17,/* 0xF0-0xF7 */
+	0x7F20,0x94F2,0x4EA7,0x9610,0x98A4,0x660C,0x7316,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x77E6,0x77E8,0x77EA,0x77EF,0x77F0,0x77F1,0x77F2,0x77F4,/* 0x40-0x47 */
+	0x77F5,0x77F7,0x77F9,0x77FA,0x77FB,0x77FC,0x7803,0x7804,/* 0x48-0x4F */
+	0x7805,0x7806,0x7807,0x7808,0x780A,0x780B,0x780E,0x780F,/* 0x50-0x57 */
+	0x7810,0x7813,0x7815,0x7819,0x781B,0x781E,0x7820,0x7821,/* 0x58-0x5F */
+	0x7822,0x7824,0x7828,0x782A,0x782B,0x782E,0x782F,0x7831,/* 0x60-0x67 */
+	0x7832,0x7833,0x7835,0x7836,0x783D,0x783F,0x7841,0x7842,/* 0x68-0x6F */
+	0x7843,0x7844,0x7846,0x7848,0x7849,0x784A,0x784B,0x784D,/* 0x70-0x77 */
+	0x784F,0x7851,0x7853,0x7854,0x7858,0x7859,0x785A,0x0000,/* 0x78-0x7F */
+
+	0x785B,0x785C,0x785E,0x785F,0x7860,0x7861,0x7862,0x7863,/* 0x80-0x87 */
+	0x7864,0x7865,0x7866,0x7867,0x7868,0x7869,0x786F,0x7870,/* 0x88-0x8F */
+	0x7871,0x7872,0x7873,0x7874,0x7875,0x7876,0x7878,0x7879,/* 0x90-0x97 */
+	0x787A,0x787B,0x787D,0x787E,0x787F,0x7880,0x7881,0x7882,/* 0x98-0x9F */
+	0x7883,0x573A,0x5C1D,0x5E38,0x957F,0x507F,0x80A0,0x5382,/* 0xA0-0xA7 */
+	0x655E,0x7545,0x5531,0x5021,0x8D85,0x6284,0x949E,0x671D,/* 0xA8-0xAF */
+	0x5632,0x6F6E,0x5DE2,0x5435,0x7092,0x8F66,0x626F,0x64A4,/* 0xB0-0xB7 */
+	0x63A3,0x5F7B,0x6F88,0x90F4,0x81E3,0x8FB0,0x5C18,0x6668,/* 0xB8-0xBF */
+	0x5FF1,0x6C89,0x9648,0x8D81,0x886C,0x6491,0x79F0,0x57CE,/* 0xC0-0xC7 */
+	0x6A59,0x6210,0x5448,0x4E58,0x7A0B,0x60E9,0x6F84,0x8BDA,/* 0xC8-0xCF */
+	0x627F,0x901E,0x9A8B,0x79E4,0x5403,0x75F4,0x6301,0x5319,/* 0xD0-0xD7 */
+	0x6C60,0x8FDF,0x5F1B,0x9A70,0x803B,0x9F7F,0x4F88,0x5C3A,/* 0xD8-0xDF */
+	0x8D64,0x7FC5,0x65A5,0x70BD,0x5145,0x51B2,0x866B,0x5D07,/* 0xE0-0xE7 */
+	0x5BA0,0x62BD,0x916C,0x7574,0x8E0C,0x7A20,0x6101,0x7B79,/* 0xE8-0xEF */
+	0x4EC7,0x7EF8,0x7785,0x4E11,0x81ED,0x521D,0x51FA,0x6A71,/* 0xF0-0xF7 */
+	0x53A8,0x8E87,0x9504,0x96CF,0x6EC1,0x9664,0x695A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7884,0x7885,0x7886,0x7888,0x788A,0x788B,0x788F,0x7890,/* 0x40-0x47 */
+	0x7892,0x7894,0x7895,0x7896,0x7899,0x789D,0x789E,0x78A0,/* 0x48-0x4F */
+	0x78A2,0x78A4,0x78A6,0x78A8,0x78A9,0x78AA,0x78AB,0x78AC,/* 0x50-0x57 */
+	0x78AD,0x78AE,0x78AF,0x78B5,0x78B6,0x78B7,0x78B8,0x78BA,/* 0x58-0x5F */
+	0x78BB,0x78BC,0x78BD,0x78BF,0x78C0,0x78C2,0x78C3,0x78C4,/* 0x60-0x67 */
+	0x78C6,0x78C7,0x78C8,0x78CC,0x78CD,0x78CE,0x78CF,0x78D1,/* 0x68-0x6F */
+	0x78D2,0x78D3,0x78D6,0x78D7,0x78D8,0x78DA,0x78DB,0x78DC,/* 0x70-0x77 */
+	0x78DD,0x78DE,0x78DF,0x78E0,0x78E1,0x78E2,0x78E3,0x0000,/* 0x78-0x7F */
+
+	0x78E4,0x78E5,0x78E6,0x78E7,0x78E9,0x78EA,0x78EB,0x78ED,/* 0x80-0x87 */
+	0x78EE,0x78EF,0x78F0,0x78F1,0x78F3,0x78F5,0x78F6,0x78F8,/* 0x88-0x8F */
+	0x78F9,0x78FB,0x78FC,0x78FD,0x78FE,0x78FF,0x7900,0x7902,/* 0x90-0x97 */
+	0x7903,0x7904,0x7906,0x7907,0x7908,0x7909,0x790A,0x790B,/* 0x98-0x9F */
+	0x790C,0x7840,0x50A8,0x77D7,0x6410,0x89E6,0x5904,0x63E3,/* 0xA0-0xA7 */
+	0x5DDD,0x7A7F,0x693D,0x4F20,0x8239,0x5598,0x4E32,0x75AE,/* 0xA8-0xAF */
+	0x7A97,0x5E62,0x5E8A,0x95EF,0x521B,0x5439,0x708A,0x6376,/* 0xB0-0xB7 */
+	0x9524,0x5782,0x6625,0x693F,0x9187,0x5507,0x6DF3,0x7EAF,/* 0xB8-0xBF */
+	0x8822,0x6233,0x7EF0,0x75B5,0x8328,0x78C1,0x96CC,0x8F9E,/* 0xC0-0xC7 */
+	0x6148,0x74F7,0x8BCD,0x6B64,0x523A,0x8D50,0x6B21,0x806A,/* 0xC8-0xCF */
+	0x8471,0x56F1,0x5306,0x4ECE,0x4E1B,0x51D1,0x7C97,0x918B,/* 0xD0-0xD7 */
+	0x7C07,0x4FC3,0x8E7F,0x7BE1,0x7A9C,0x6467,0x5D14,0x50AC,/* 0xD8-0xDF */
+	0x8106,0x7601,0x7CB9,0x6DEC,0x7FE0,0x6751,0x5B58,0x5BF8,/* 0xE0-0xE7 */
+	0x78CB,0x64AE,0x6413,0x63AA,0x632B,0x9519,0x642D,0x8FBE,/* 0xE8-0xEF */
+	0x7B54,0x7629,0x6253,0x5927,0x5446,0x6B79,0x50A3,0x6234,/* 0xF0-0xF7 */
+	0x5E26,0x6B86,0x4EE3,0x8D37,0x888B,0x5F85,0x902E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x790D,0x790E,0x790F,0x7910,0x7911,0x7912,0x7914,0x7915,/* 0x40-0x47 */
+	0x7916,0x7917,0x7918,0x7919,0x791A,0x791B,0x791C,0x791D,/* 0x48-0x4F */
+	0x791F,0x7920,0x7921,0x7922,0x7923,0x7925,0x7926,0x7927,/* 0x50-0x57 */
+	0x7928,0x7929,0x792A,0x792B,0x792C,0x792D,0x792E,0x792F,/* 0x58-0x5F */
+	0x7930,0x7931,0x7932,0x7933,0x7935,0x7936,0x7937,0x7938,/* 0x60-0x67 */
+	0x7939,0x793D,0x793F,0x7942,0x7943,0x7944,0x7945,0x7947,/* 0x68-0x6F */
+	0x794A,0x794B,0x794C,0x794D,0x794E,0x794F,0x7950,0x7951,/* 0x70-0x77 */
+	0x7952,0x7954,0x7955,0x7958,0x7959,0x7961,0x7963,0x0000,/* 0x78-0x7F */
+
+	0x7964,0x7966,0x7969,0x796A,0x796B,0x796C,0x796E,0x7970,/* 0x80-0x87 */
+	0x7971,0x7972,0x7973,0x7974,0x7975,0x7976,0x7979,0x797B,/* 0x88-0x8F */
+	0x797C,0x797D,0x797E,0x797F,0x7982,0x7983,0x7986,0x7987,/* 0x90-0x97 */
+	0x7988,0x7989,0x798B,0x798C,0x798D,0x798E,0x7990,0x7991,/* 0x98-0x9F */
+	0x7992,0x6020,0x803D,0x62C5,0x4E39,0x5355,0x90F8,0x63B8,/* 0xA0-0xA7 */
+	0x80C6,0x65E6,0x6C2E,0x4F46,0x60EE,0x6DE1,0x8BDE,0x5F39,/* 0xA8-0xAF */
+	0x86CB,0x5F53,0x6321,0x515A,0x8361,0x6863,0x5200,0x6363,/* 0xB0-0xB7 */
+	0x8E48,0x5012,0x5C9B,0x7977,0x5BFC,0x5230,0x7A3B,0x60BC,/* 0xB8-0xBF */
+	0x9053,0x76D7,0x5FB7,0x5F97,0x7684,0x8E6C,0x706F,0x767B,/* 0xC0-0xC7 */
+	0x7B49,0x77AA,0x51F3,0x9093,0x5824,0x4F4E,0x6EF4,0x8FEA,/* 0xC8-0xCF */
+	0x654C,0x7B1B,0x72C4,0x6DA4,0x7FDF,0x5AE1,0x62B5,0x5E95,/* 0xD0-0xD7 */
+	0x5730,0x8482,0x7B2C,0x5E1D,0x5F1F,0x9012,0x7F14,0x98A0,/* 0xD8-0xDF */
+	0x6382,0x6EC7,0x7898,0x70B9,0x5178,0x975B,0x57AB,0x7535,/* 0xE0-0xE7 */
+	0x4F43,0x7538,0x5E97,0x60E6,0x5960,0x6DC0,0x6BBF,0x7889,/* 0xE8-0xEF */
+	0x53FC,0x96D5,0x51CB,0x5201,0x6389,0x540A,0x9493,0x8C03,/* 0xF0-0xF7 */
+	0x8DCC,0x7239,0x789F,0x8776,0x8FED,0x8C0D,0x53E0,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7993,0x7994,0x7995,0x7996,0x7997,0x7998,0x7999,0x799B,/* 0x40-0x47 */
+	0x799C,0x799D,0x799E,0x799F,0x79A0,0x79A1,0x79A2,0x79A3,/* 0x48-0x4F */
+	0x79A4,0x79A5,0x79A6,0x79A8,0x79A9,0x79AA,0x79AB,0x79AC,/* 0x50-0x57 */
+	0x79AD,0x79AE,0x79AF,0x79B0,0x79B1,0x79B2,0x79B4,0x79B5,/* 0x58-0x5F */
+	0x79B6,0x79B7,0x79B8,0x79BC,0x79BF,0x79C2,0x79C4,0x79C5,/* 0x60-0x67 */
+	0x79C7,0x79C8,0x79CA,0x79CC,0x79CE,0x79CF,0x79D0,0x79D3,/* 0x68-0x6F */
+	0x79D4,0x79D6,0x79D7,0x79D9,0x79DA,0x79DB,0x79DC,0x79DD,/* 0x70-0x77 */
+	0x79DE,0x79E0,0x79E1,0x79E2,0x79E5,0x79E8,0x79EA,0x0000,/* 0x78-0x7F */
+
+	0x79EC,0x79EE,0x79F1,0x79F2,0x79F3,0x79F4,0x79F5,0x79F6,/* 0x80-0x87 */
+	0x79F7,0x79F9,0x79FA,0x79FC,0x79FE,0x79FF,0x7A01,0x7A04,/* 0x88-0x8F */
+	0x7A05,0x7A07,0x7A08,0x7A09,0x7A0A,0x7A0C,0x7A0F,0x7A10,/* 0x90-0x97 */
+	0x7A11,0x7A12,0x7A13,0x7A15,0x7A16,0x7A18,0x7A19,0x7A1B,/* 0x98-0x9F */
+	0x7A1C,0x4E01,0x76EF,0x53EE,0x9489,0x9876,0x9F0E,0x952D,/* 0xA0-0xA7 */
+	0x5B9A,0x8BA2,0x4E22,0x4E1C,0x51AC,0x8463,0x61C2,0x52A8,/* 0xA8-0xAF */
+	0x680B,0x4F97,0x606B,0x51BB,0x6D1E,0x515C,0x6296,0x6597,/* 0xB0-0xB7 */
+	0x9661,0x8C46,0x9017,0x75D8,0x90FD,0x7763,0x6BD2,0x728A,/* 0xB8-0xBF */
+	0x72EC,0x8BFB,0x5835,0x7779,0x8D4C,0x675C,0x9540,0x809A,/* 0xC0-0xC7 */
+	0x5EA6,0x6E21,0x5992,0x7AEF,0x77ED,0x953B,0x6BB5,0x65AD,/* 0xC8-0xCF */
+	0x7F0E,0x5806,0x5151,0x961F,0x5BF9,0x58A9,0x5428,0x8E72,/* 0xD0-0xD7 */
+	0x6566,0x987F,0x56E4,0x949D,0x76FE,0x9041,0x6387,0x54C6,/* 0xD8-0xDF */
+	0x591A,0x593A,0x579B,0x8EB2,0x6735,0x8DFA,0x8235,0x5241,/* 0xE0-0xE7 */
+	0x60F0,0x5815,0x86FE,0x5CE8,0x9E45,0x4FC4,0x989D,0x8BB9,/* 0xE8-0xEF */
+	0x5A25,0x6076,0x5384,0x627C,0x904F,0x9102,0x997F,0x6069,/* 0xF0-0xF7 */
+	0x800C,0x513F,0x8033,0x5C14,0x9975,0x6D31,0x4E8C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7A1D,0x7A1F,0x7A21,0x7A22,0x7A24,0x7A25,0x7A26,0x7A27,/* 0x40-0x47 */
+	0x7A28,0x7A29,0x7A2A,0x7A2B,0x7A2C,0x7A2D,0x7A2E,0x7A2F,/* 0x48-0x4F */
+	0x7A30,0x7A31,0x7A32,0x7A34,0x7A35,0x7A36,0x7A38,0x7A3A,/* 0x50-0x57 */
+	0x7A3E,0x7A40,0x7A41,0x7A42,0x7A43,0x7A44,0x7A45,0x7A47,/* 0x58-0x5F */
+	0x7A48,0x7A49,0x7A4A,0x7A4B,0x7A4C,0x7A4D,0x7A4E,0x7A4F,/* 0x60-0x67 */
+	0x7A50,0x7A52,0x7A53,0x7A54,0x7A55,0x7A56,0x7A58,0x7A59,/* 0x68-0x6F */
+	0x7A5A,0x7A5B,0x7A5C,0x7A5D,0x7A5E,0x7A5F,0x7A60,0x7A61,/* 0x70-0x77 */
+	0x7A62,0x7A63,0x7A64,0x7A65,0x7A66,0x7A67,0x7A68,0x0000,/* 0x78-0x7F */
+
+	0x7A69,0x7A6A,0x7A6B,0x7A6C,0x7A6D,0x7A6E,0x7A6F,0x7A71,/* 0x80-0x87 */
+	0x7A72,0x7A73,0x7A75,0x7A7B,0x7A7C,0x7A7D,0x7A7E,0x7A82,/* 0x88-0x8F */
+	0x7A85,0x7A87,0x7A89,0x7A8A,0x7A8B,0x7A8C,0x7A8E,0x7A8F,/* 0x90-0x97 */
+	0x7A90,0x7A93,0x7A94,0x7A99,0x7A9A,0x7A9B,0x7A9E,0x7AA1,/* 0x98-0x9F */
+	0x7AA2,0x8D30,0x53D1,0x7F5A,0x7B4F,0x4F10,0x4E4F,0x9600,/* 0xA0-0xA7 */
+	0x6CD5,0x73D0,0x85E9,0x5E06,0x756A,0x7FFB,0x6A0A,0x77FE,/* 0xA8-0xAF */
+	0x9492,0x7E41,0x51E1,0x70E6,0x53CD,0x8FD4,0x8303,0x8D29,/* 0xB0-0xB7 */
+	0x72AF,0x996D,0x6CDB,0x574A,0x82B3,0x65B9,0x80AA,0x623F,/* 0xB8-0xBF */
+	0x9632,0x59A8,0x4EFF,0x8BBF,0x7EBA,0x653E,0x83F2,0x975E,/* 0xC0-0xC7 */
+	0x5561,0x98DE,0x80A5,0x532A,0x8BFD,0x5420,0x80BA,0x5E9F,/* 0xC8-0xCF */
+	0x6CB8,0x8D39,0x82AC,0x915A,0x5429,0x6C1B,0x5206,0x7EB7,/* 0xD0-0xD7 */
+	0x575F,0x711A,0x6C7E,0x7C89,0x594B,0x4EFD,0x5FFF,0x6124,/* 0xD8-0xDF */
+	0x7CAA,0x4E30,0x5C01,0x67AB,0x8702,0x5CF0,0x950B,0x98CE,/* 0xE0-0xE7 */
+	0x75AF,0x70FD,0x9022,0x51AF,0x7F1D,0x8BBD,0x5949,0x51E4,/* 0xE8-0xEF */
+	0x4F5B,0x5426,0x592B,0x6577,0x80A4,0x5B75,0x6276,0x62C2,/* 0xF0-0xF7 */
+	0x8F90,0x5E45,0x6C1F,0x7B26,0x4F0F,0x4FD8,0x670D,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7AA3,0x7AA4,0x7AA7,0x7AA9,0x7AAA,0x7AAB,0x7AAE,0x7AAF,/* 0x40-0x47 */
+	0x7AB0,0x7AB1,0x7AB2,0x7AB4,0x7AB5,0x7AB6,0x7AB7,0x7AB8,/* 0x48-0x4F */
+	0x7AB9,0x7ABA,0x7ABB,0x7ABC,0x7ABD,0x7ABE,0x7AC0,0x7AC1,/* 0x50-0x57 */
+	0x7AC2,0x7AC3,0x7AC4,0x7AC5,0x7AC6,0x7AC7,0x7AC8,0x7AC9,/* 0x58-0x5F */
+	0x7ACA,0x7ACC,0x7ACD,0x7ACE,0x7ACF,0x7AD0,0x7AD1,0x7AD2,/* 0x60-0x67 */
+	0x7AD3,0x7AD4,0x7AD5,0x7AD7,0x7AD8,0x7ADA,0x7ADB,0x7ADC,/* 0x68-0x6F */
+	0x7ADD,0x7AE1,0x7AE2,0x7AE4,0x7AE7,0x7AE8,0x7AE9,0x7AEA,/* 0x70-0x77 */
+	0x7AEB,0x7AEC,0x7AEE,0x7AF0,0x7AF1,0x7AF2,0x7AF3,0x0000,/* 0x78-0x7F */
+
+	0x7AF4,0x7AF5,0x7AF6,0x7AF7,0x7AF8,0x7AFB,0x7AFC,0x7AFE,/* 0x80-0x87 */
+	0x7B00,0x7B01,0x7B02,0x7B05,0x7B07,0x7B09,0x7B0C,0x7B0D,/* 0x88-0x8F */
+	0x7B0E,0x7B10,0x7B12,0x7B13,0x7B16,0x7B17,0x7B18,0x7B1A,/* 0x90-0x97 */
+	0x7B1C,0x7B1D,0x7B1F,0x7B21,0x7B22,0x7B23,0x7B27,0x7B29,/* 0x98-0x9F */
+	0x7B2D,0x6D6E,0x6DAA,0x798F,0x88B1,0x5F17,0x752B,0x629A,/* 0xA0-0xA7 */
+	0x8F85,0x4FEF,0x91DC,0x65A7,0x812F,0x8151,0x5E9C,0x8150,/* 0xA8-0xAF */
+	0x8D74,0x526F,0x8986,0x8D4B,0x590D,0x5085,0x4ED8,0x961C,/* 0xB0-0xB7 */
+	0x7236,0x8179,0x8D1F,0x5BCC,0x8BA3,0x9644,0x5987,0x7F1A,/* 0xB8-0xBF */
+	0x5490,0x5676,0x560E,0x8BE5,0x6539,0x6982,0x9499,0x76D6,/* 0xC0-0xC7 */
+	0x6E89,0x5E72,0x7518,0x6746,0x67D1,0x7AFF,0x809D,0x8D76,/* 0xC8-0xCF */
+	0x611F,0x79C6,0x6562,0x8D63,0x5188,0x521A,0x94A2,0x7F38,/* 0xD0-0xD7 */
+	0x809B,0x7EB2,0x5C97,0x6E2F,0x6760,0x7BD9,0x768B,0x9AD8,/* 0xD8-0xDF */
+	0x818F,0x7F94,0x7CD5,0x641E,0x9550,0x7A3F,0x544A,0x54E5,/* 0xE0-0xE7 */
+	0x6B4C,0x6401,0x6208,0x9E3D,0x80F3,0x7599,0x5272,0x9769,/* 0xE8-0xEF */
+	0x845B,0x683C,0x86E4,0x9601,0x9694,0x94EC,0x4E2A,0x5404,/* 0xF0-0xF7 */
+	0x7ED9,0x6839,0x8DDF,0x8015,0x66F4,0x5E9A,0x7FB9,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7B2F,0x7B30,0x7B32,0x7B34,0x7B35,0x7B36,0x7B37,0x7B39,/* 0x40-0x47 */
+	0x7B3B,0x7B3D,0x7B3F,0x7B40,0x7B41,0x7B42,0x7B43,0x7B44,/* 0x48-0x4F */
+	0x7B46,0x7B48,0x7B4A,0x7B4D,0x7B4E,0x7B53,0x7B55,0x7B57,/* 0x50-0x57 */
+	0x7B59,0x7B5C,0x7B5E,0x7B5F,0x7B61,0x7B63,0x7B64,0x7B65,/* 0x58-0x5F */
+	0x7B66,0x7B67,0x7B68,0x7B69,0x7B6A,0x7B6B,0x7B6C,0x7B6D,/* 0x60-0x67 */
+	0x7B6F,0x7B70,0x7B73,0x7B74,0x7B76,0x7B78,0x7B7A,0x7B7C,/* 0x68-0x6F */
+	0x7B7D,0x7B7F,0x7B81,0x7B82,0x7B83,0x7B84,0x7B86,0x7B87,/* 0x70-0x77 */
+	0x7B88,0x7B89,0x7B8A,0x7B8B,0x7B8C,0x7B8E,0x7B8F,0x0000,/* 0x78-0x7F */
+
+	0x7B91,0x7B92,0x7B93,0x7B96,0x7B98,0x7B99,0x7B9A,0x7B9B,/* 0x80-0x87 */
+	0x7B9E,0x7B9F,0x7BA0,0x7BA3,0x7BA4,0x7BA5,0x7BAE,0x7BAF,/* 0x88-0x8F */
+	0x7BB0,0x7BB2,0x7BB3,0x7BB5,0x7BB6,0x7BB7,0x7BB9,0x7BBA,/* 0x90-0x97 */
+	0x7BBB,0x7BBC,0x7BBD,0x7BBE,0x7BBF,0x7BC0,0x7BC2,0x7BC3,/* 0x98-0x9F */
+	0x7BC4,0x57C2,0x803F,0x6897,0x5DE5,0x653B,0x529F,0x606D,/* 0xA0-0xA7 */
+	0x9F9A,0x4F9B,0x8EAC,0x516C,0x5BAB,0x5F13,0x5DE9,0x6C5E,/* 0xA8-0xAF */
+	0x62F1,0x8D21,0x5171,0x94A9,0x52FE,0x6C9F,0x82DF,0x72D7,/* 0xB0-0xB7 */
+	0x57A2,0x6784,0x8D2D,0x591F,0x8F9C,0x83C7,0x5495,0x7B8D,/* 0xB8-0xBF */
+	0x4F30,0x6CBD,0x5B64,0x59D1,0x9F13,0x53E4,0x86CA,0x9AA8,/* 0xC0-0xC7 */
+	0x8C37,0x80A1,0x6545,0x987E,0x56FA,0x96C7,0x522E,0x74DC,/* 0xC8-0xCF */
+	0x5250,0x5BE1,0x6302,0x8902,0x4E56,0x62D0,0x602A,0x68FA,/* 0xD0-0xD7 */
+	0x5173,0x5B98,0x51A0,0x89C2,0x7BA1,0x9986,0x7F50,0x60EF,/* 0xD8-0xDF */
+	0x704C,0x8D2F,0x5149,0x5E7F,0x901B,0x7470,0x89C4,0x572D,/* 0xE0-0xE7 */
+	0x7845,0x5F52,0x9F9F,0x95FA,0x8F68,0x9B3C,0x8BE1,0x7678,/* 0xE8-0xEF */
+	0x6842,0x67DC,0x8DEA,0x8D35,0x523D,0x8F8A,0x6EDA,0x68CD,/* 0xF0-0xF7 */
+	0x9505,0x90ED,0x56FD,0x679C,0x88F9,0x8FC7,0x54C8,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7BC5,0x7BC8,0x7BC9,0x7BCA,0x7BCB,0x7BCD,0x7BCE,0x7BCF,/* 0x40-0x47 */
+	0x7BD0,0x7BD2,0x7BD4,0x7BD5,0x7BD6,0x7BD7,0x7BD8,0x7BDB,/* 0x48-0x4F */
+	0x7BDC,0x7BDE,0x7BDF,0x7BE0,0x7BE2,0x7BE3,0x7BE4,0x7BE7,/* 0x50-0x57 */
+	0x7BE8,0x7BE9,0x7BEB,0x7BEC,0x7BED,0x7BEF,0x7BF0,0x7BF2,/* 0x58-0x5F */
+	0x7BF3,0x7BF4,0x7BF5,0x7BF6,0x7BF8,0x7BF9,0x7BFA,0x7BFB,/* 0x60-0x67 */
+	0x7BFD,0x7BFF,0x7C00,0x7C01,0x7C02,0x7C03,0x7C04,0x7C05,/* 0x68-0x6F */
+	0x7C06,0x7C08,0x7C09,0x7C0A,0x7C0D,0x7C0E,0x7C10,0x7C11,/* 0x70-0x77 */
+	0x7C12,0x7C13,0x7C14,0x7C15,0x7C17,0x7C18,0x7C19,0x0000,/* 0x78-0x7F */
+
+	0x7C1A,0x7C1B,0x7C1C,0x7C1D,0x7C1E,0x7C20,0x7C21,0x7C22,/* 0x80-0x87 */
+	0x7C23,0x7C24,0x7C25,0x7C28,0x7C29,0x7C2B,0x7C2C,0x7C2D,/* 0x88-0x8F */
+	0x7C2E,0x7C2F,0x7C30,0x7C31,0x7C32,0x7C33,0x7C34,0x7C35,/* 0x90-0x97 */
+	0x7C36,0x7C37,0x7C39,0x7C3A,0x7C3B,0x7C3C,0x7C3D,0x7C3E,/* 0x98-0x9F */
+	0x7C42,0x9AB8,0x5B69,0x6D77,0x6C26,0x4EA5,0x5BB3,0x9A87,/* 0xA0-0xA7 */
+	0x9163,0x61A8,0x90AF,0x97E9,0x542B,0x6DB5,0x5BD2,0x51FD,/* 0xA8-0xAF */
+	0x558A,0x7F55,0x7FF0,0x64BC,0x634D,0x65F1,0x61BE,0x608D,/* 0xB0-0xB7 */
+	0x710A,0x6C57,0x6C49,0x592F,0x676D,0x822A,0x58D5,0x568E,/* 0xB8-0xBF */
+	0x8C6A,0x6BEB,0x90DD,0x597D,0x8017,0x53F7,0x6D69,0x5475,/* 0xC0-0xC7 */
+	0x559D,0x8377,0x83CF,0x6838,0x79BE,0x548C,0x4F55,0x5408,/* 0xC8-0xCF */
+	0x76D2,0x8C89,0x9602,0x6CB3,0x6DB8,0x8D6B,0x8910,0x9E64,/* 0xD0-0xD7 */
+	0x8D3A,0x563F,0x9ED1,0x75D5,0x5F88,0x72E0,0x6068,0x54FC,/* 0xD8-0xDF */
+	0x4EA8,0x6A2A,0x8861,0x6052,0x8F70,0x54C4,0x70D8,0x8679,/* 0xE0-0xE7 */
+	0x9E3F,0x6D2A,0x5B8F,0x5F18,0x7EA2,0x5589,0x4FAF,0x7334,/* 0xE8-0xEF */
+	0x543C,0x539A,0x5019,0x540E,0x547C,0x4E4E,0x5FFD,0x745A,/* 0xF0-0xF7 */
+	0x58F6,0x846B,0x80E1,0x8774,0x72D0,0x7CCA,0x6E56,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7C43,0x7C44,0x7C45,0x7C46,0x7C47,0x7C48,0x7C49,0x7C4A,/* 0x40-0x47 */
+	0x7C4B,0x7C4C,0x7C4E,0x7C4F,0x7C50,0x7C51,0x7C52,0x7C53,/* 0x48-0x4F */
+	0x7C54,0x7C55,0x7C56,0x7C57,0x7C58,0x7C59,0x7C5A,0x7C5B,/* 0x50-0x57 */
+	0x7C5C,0x7C5D,0x7C5E,0x7C5F,0x7C60,0x7C61,0x7C62,0x7C63,/* 0x58-0x5F */
+	0x7C64,0x7C65,0x7C66,0x7C67,0x7C68,0x7C69,0x7C6A,0x7C6B,/* 0x60-0x67 */
+	0x7C6C,0x7C6D,0x7C6E,0x7C6F,0x7C70,0x7C71,0x7C72,0x7C75,/* 0x68-0x6F */
+	0x7C76,0x7C77,0x7C78,0x7C79,0x7C7A,0x7C7E,0x7C7F,0x7C80,/* 0x70-0x77 */
+	0x7C81,0x7C82,0x7C83,0x7C84,0x7C85,0x7C86,0x7C87,0x0000,/* 0x78-0x7F */
+
+	0x7C88,0x7C8A,0x7C8B,0x7C8C,0x7C8D,0x7C8E,0x7C8F,0x7C90,/* 0x80-0x87 */
+	0x7C93,0x7C94,0x7C96,0x7C99,0x7C9A,0x7C9B,0x7CA0,0x7CA1,/* 0x88-0x8F */
+	0x7CA3,0x7CA6,0x7CA7,0x7CA8,0x7CA9,0x7CAB,0x7CAC,0x7CAD,/* 0x90-0x97 */
+	0x7CAF,0x7CB0,0x7CB4,0x7CB5,0x7CB6,0x7CB7,0x7CB8,0x7CBA,/* 0x98-0x9F */
+	0x7CBB,0x5F27,0x864E,0x552C,0x62A4,0x4E92,0x6CAA,0x6237,/* 0xA0-0xA7 */
+	0x82B1,0x54D7,0x534E,0x733E,0x6ED1,0x753B,0x5212,0x5316,/* 0xA8-0xAF */
+	0x8BDD,0x69D0,0x5F8A,0x6000,0x6DEE,0x574F,0x6B22,0x73AF,/* 0xB0-0xB7 */
+	0x6853,0x8FD8,0x7F13,0x6362,0x60A3,0x5524,0x75EA,0x8C62,/* 0xB8-0xBF */
+	0x7115,0x6DA3,0x5BA6,0x5E7B,0x8352,0x614C,0x9EC4,0x78FA,/* 0xC0-0xC7 */
+	0x8757,0x7C27,0x7687,0x51F0,0x60F6,0x714C,0x6643,0x5E4C,/* 0xC8-0xCF */
+	0x604D,0x8C0E,0x7070,0x6325,0x8F89,0x5FBD,0x6062,0x86D4,/* 0xD0-0xD7 */
+	0x56DE,0x6BC1,0x6094,0x6167,0x5349,0x60E0,0x6666,0x8D3F,/* 0xD8-0xDF */
+	0x79FD,0x4F1A,0x70E9,0x6C47,0x8BB3,0x8BF2,0x7ED8,0x8364,/* 0xE0-0xE7 */
+	0x660F,0x5A5A,0x9B42,0x6D51,0x6DF7,0x8C41,0x6D3B,0x4F19,/* 0xE8-0xEF */
+	0x706B,0x83B7,0x6216,0x60D1,0x970D,0x8D27,0x7978,0x51FB,/* 0xF0-0xF7 */
+	0x573E,0x57FA,0x673A,0x7578,0x7A3D,0x79EF,0x7B95,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7CBF,0x7CC0,0x7CC2,0x7CC3,0x7CC4,0x7CC6,0x7CC9,0x7CCB,/* 0x40-0x47 */
+	0x7CCE,0x7CCF,0x7CD0,0x7CD1,0x7CD2,0x7CD3,0x7CD4,0x7CD8,/* 0x48-0x4F */
+	0x7CDA,0x7CDB,0x7CDD,0x7CDE,0x7CE1,0x7CE2,0x7CE3,0x7CE4,/* 0x50-0x57 */
+	0x7CE5,0x7CE6,0x7CE7,0x7CE9,0x7CEA,0x7CEB,0x7CEC,0x7CED,/* 0x58-0x5F */
+	0x7CEE,0x7CF0,0x7CF1,0x7CF2,0x7CF3,0x7CF4,0x7CF5,0x7CF6,/* 0x60-0x67 */
+	0x7CF7,0x7CF9,0x7CFA,0x7CFC,0x7CFD,0x7CFE,0x7CFF,0x7D00,/* 0x68-0x6F */
+	0x7D01,0x7D02,0x7D03,0x7D04,0x7D05,0x7D06,0x7D07,0x7D08,/* 0x70-0x77 */
+	0x7D09,0x7D0B,0x7D0C,0x7D0D,0x7D0E,0x7D0F,0x7D10,0x0000,/* 0x78-0x7F */
+
+	0x7D11,0x7D12,0x7D13,0x7D14,0x7D15,0x7D16,0x7D17,0x7D18,/* 0x80-0x87 */
+	0x7D19,0x7D1A,0x7D1B,0x7D1C,0x7D1D,0x7D1E,0x7D1F,0x7D21,/* 0x88-0x8F */
+	0x7D23,0x7D24,0x7D25,0x7D26,0x7D28,0x7D29,0x7D2A,0x7D2C,/* 0x90-0x97 */
+	0x7D2D,0x7D2E,0x7D30,0x7D31,0x7D32,0x7D33,0x7D34,0x7D35,/* 0x98-0x9F */
+	0x7D36,0x808C,0x9965,0x8FF9,0x6FC0,0x8BA5,0x9E21,0x59EC,/* 0xA0-0xA7 */
+	0x7EE9,0x7F09,0x5409,0x6781,0x68D8,0x8F91,0x7C4D,0x96C6,/* 0xA8-0xAF */
+	0x53CA,0x6025,0x75BE,0x6C72,0x5373,0x5AC9,0x7EA7,0x6324,/* 0xB0-0xB7 */
+	0x51E0,0x810A,0x5DF1,0x84DF,0x6280,0x5180,0x5B63,0x4F0E,/* 0xB8-0xBF */
+	0x796D,0x5242,0x60B8,0x6D4E,0x5BC4,0x5BC2,0x8BA1,0x8BB0,/* 0xC0-0xC7 */
+	0x65E2,0x5FCC,0x9645,0x5993,0x7EE7,0x7EAA,0x5609,0x67B7,/* 0xC8-0xCF */
+	0x5939,0x4F73,0x5BB6,0x52A0,0x835A,0x988A,0x8D3E,0x7532,/* 0xD0-0xD7 */
+	0x94BE,0x5047,0x7A3C,0x4EF7,0x67B6,0x9A7E,0x5AC1,0x6B7C,/* 0xD8-0xDF */
+	0x76D1,0x575A,0x5C16,0x7B3A,0x95F4,0x714E,0x517C,0x80A9,/* 0xE0-0xE7 */
+	0x8270,0x5978,0x7F04,0x8327,0x68C0,0x67EC,0x78B1,0x7877,/* 0xE8-0xEF */
+	0x62E3,0x6361,0x7B80,0x4FED,0x526A,0x51CF,0x8350,0x69DB,/* 0xF0-0xF7 */
+	0x9274,0x8DF5,0x8D31,0x89C1,0x952E,0x7BAD,0x4EF6,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7D37,0x7D38,0x7D39,0x7D3A,0x7D3B,0x7D3C,0x7D3D,0x7D3E,/* 0x40-0x47 */
+	0x7D3F,0x7D40,0x7D41,0x7D42,0x7D43,0x7D44,0x7D45,0x7D46,/* 0x48-0x4F */
+	0x7D47,0x7D48,0x7D49,0x7D4A,0x7D4B,0x7D4C,0x7D4D,0x7D4E,/* 0x50-0x57 */
+	0x7D4F,0x7D50,0x7D51,0x7D52,0x7D53,0x7D54,0x7D55,0x7D56,/* 0x58-0x5F */
+	0x7D57,0x7D58,0x7D59,0x7D5A,0x7D5B,0x7D5C,0x7D5D,0x7D5E,/* 0x60-0x67 */
+	0x7D5F,0x7D60,0x7D61,0x7D62,0x7D63,0x7D64,0x7D65,0x7D66,/* 0x68-0x6F */
+	0x7D67,0x7D68,0x7D69,0x7D6A,0x7D6B,0x7D6C,0x7D6D,0x7D6F,/* 0x70-0x77 */
+	0x7D70,0x7D71,0x7D72,0x7D73,0x7D74,0x7D75,0x7D76,0x0000,/* 0x78-0x7F */
+
+	0x7D78,0x7D79,0x7D7A,0x7D7B,0x7D7C,0x7D7D,0x7D7E,0x7D7F,/* 0x80-0x87 */
+	0x7D80,0x7D81,0x7D82,0x7D83,0x7D84,0x7D85,0x7D86,0x7D87,/* 0x88-0x8F */
+	0x7D88,0x7D89,0x7D8A,0x7D8B,0x7D8C,0x7D8D,0x7D8E,0x7D8F,/* 0x90-0x97 */
+	0x7D90,0x7D91,0x7D92,0x7D93,0x7D94,0x7D95,0x7D96,0x7D97,/* 0x98-0x9F */
+	0x7D98,0x5065,0x8230,0x5251,0x996F,0x6E10,0x6E85,0x6DA7,/* 0xA0-0xA7 */
+	0x5EFA,0x50F5,0x59DC,0x5C06,0x6D46,0x6C5F,0x7586,0x848B,/* 0xA8-0xAF */
+	0x6868,0x5956,0x8BB2,0x5320,0x9171,0x964D,0x8549,0x6912,/* 0xB0-0xB7 */
+	0x7901,0x7126,0x80F6,0x4EA4,0x90CA,0x6D47,0x9A84,0x5A07,/* 0xB8-0xBF */
+	0x56BC,0x6405,0x94F0,0x77EB,0x4FA5,0x811A,0x72E1,0x89D2,/* 0xC0-0xC7 */
+	0x997A,0x7F34,0x7EDE,0x527F,0x6559,0x9175,0x8F7F,0x8F83,/* 0xC8-0xCF */
+	0x53EB,0x7A96,0x63ED,0x63A5,0x7686,0x79F8,0x8857,0x9636,/* 0xD0-0xD7 */
+	0x622A,0x52AB,0x8282,0x6854,0x6770,0x6377,0x776B,0x7AED,/* 0xD8-0xDF */
+	0x6D01,0x7ED3,0x89E3,0x59D0,0x6212,0x85C9,0x82A5,0x754C,/* 0xE0-0xE7 */
+	0x501F,0x4ECB,0x75A5,0x8BEB,0x5C4A,0x5DFE,0x7B4B,0x65A4,/* 0xE8-0xEF */
+	0x91D1,0x4ECA,0x6D25,0x895F,0x7D27,0x9526,0x4EC5,0x8C28,/* 0xF0-0xF7 */
+	0x8FDB,0x9773,0x664B,0x7981,0x8FD1,0x70EC,0x6D78,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7D99,0x7D9A,0x7D9B,0x7D9C,0x7D9D,0x7D9E,0x7D9F,0x7DA0,/* 0x40-0x47 */
+	0x7DA1,0x7DA2,0x7DA3,0x7DA4,0x7DA5,0x7DA7,0x7DA8,0x7DA9,/* 0x48-0x4F */
+	0x7DAA,0x7DAB,0x7DAC,0x7DAD,0x7DAF,0x7DB0,0x7DB1,0x7DB2,/* 0x50-0x57 */
+	0x7DB3,0x7DB4,0x7DB5,0x7DB6,0x7DB7,0x7DB8,0x7DB9,0x7DBA,/* 0x58-0x5F */
+	0x7DBB,0x7DBC,0x7DBD,0x7DBE,0x7DBF,0x7DC0,0x7DC1,0x7DC2,/* 0x60-0x67 */
+	0x7DC3,0x7DC4,0x7DC5,0x7DC6,0x7DC7,0x7DC8,0x7DC9,0x7DCA,/* 0x68-0x6F */
+	0x7DCB,0x7DCC,0x7DCD,0x7DCE,0x7DCF,0x7DD0,0x7DD1,0x7DD2,/* 0x70-0x77 */
+	0x7DD3,0x7DD4,0x7DD5,0x7DD6,0x7DD7,0x7DD8,0x7DD9,0x0000,/* 0x78-0x7F */
+
+	0x7DDA,0x7DDB,0x7DDC,0x7DDD,0x7DDE,0x7DDF,0x7DE0,0x7DE1,/* 0x80-0x87 */
+	0x7DE2,0x7DE3,0x7DE4,0x7DE5,0x7DE6,0x7DE7,0x7DE8,0x7DE9,/* 0x88-0x8F */
+	0x7DEA,0x7DEB,0x7DEC,0x7DED,0x7DEE,0x7DEF,0x7DF0,0x7DF1,/* 0x90-0x97 */
+	0x7DF2,0x7DF3,0x7DF4,0x7DF5,0x7DF6,0x7DF7,0x7DF8,0x7DF9,/* 0x98-0x9F */
+	0x7DFA,0x5C3D,0x52B2,0x8346,0x5162,0x830E,0x775B,0x6676,/* 0xA0-0xA7 */
+	0x9CB8,0x4EAC,0x60CA,0x7CBE,0x7CB3,0x7ECF,0x4E95,0x8B66,/* 0xA8-0xAF */
+	0x666F,0x9888,0x9759,0x5883,0x656C,0x955C,0x5F84,0x75C9,/* 0xB0-0xB7 */
+	0x9756,0x7ADF,0x7ADE,0x51C0,0x70AF,0x7A98,0x63EA,0x7A76,/* 0xB8-0xBF */
+	0x7EA0,0x7396,0x97ED,0x4E45,0x7078,0x4E5D,0x9152,0x53A9,/* 0xC0-0xC7 */
+	0x6551,0x65E7,0x81FC,0x8205,0x548E,0x5C31,0x759A,0x97A0,/* 0xC8-0xCF */
+	0x62D8,0x72D9,0x75BD,0x5C45,0x9A79,0x83CA,0x5C40,0x5480,/* 0xD0-0xD7 */
+	0x77E9,0x4E3E,0x6CAE,0x805A,0x62D2,0x636E,0x5DE8,0x5177,/* 0xD8-0xDF */
+	0x8DDD,0x8E1E,0x952F,0x4FF1,0x53E5,0x60E7,0x70AC,0x5267,/* 0xE0-0xE7 */
+	0x6350,0x9E43,0x5A1F,0x5026,0x7737,0x5377,0x7EE2,0x6485,/* 0xE8-0xEF */
+	0x652B,0x6289,0x6398,0x5014,0x7235,0x89C9,0x51B3,0x8BC0,/* 0xF0-0xF7 */
+	0x7EDD,0x5747,0x83CC,0x94A7,0x519B,0x541B,0x5CFB,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7DFB,0x7DFC,0x7DFD,0x7DFE,0x7DFF,0x7E00,0x7E01,0x7E02,/* 0x40-0x47 */
+	0x7E03,0x7E04,0x7E05,0x7E06,0x7E07,0x7E08,0x7E09,0x7E0A,/* 0x48-0x4F */
+	0x7E0B,0x7E0C,0x7E0D,0x7E0E,0x7E0F,0x7E10,0x7E11,0x7E12,/* 0x50-0x57 */
+	0x7E13,0x7E14,0x7E15,0x7E16,0x7E17,0x7E18,0x7E19,0x7E1A,/* 0x58-0x5F */
+	0x7E1B,0x7E1C,0x7E1D,0x7E1E,0x7E1F,0x7E20,0x7E21,0x7E22,/* 0x60-0x67 */
+	0x7E23,0x7E24,0x7E25,0x7E26,0x7E27,0x7E28,0x7E29,0x7E2A,/* 0x68-0x6F */
+	0x7E2B,0x7E2C,0x7E2D,0x7E2E,0x7E2F,0x7E30,0x7E31,0x7E32,/* 0x70-0x77 */
+	0x7E33,0x7E34,0x7E35,0x7E36,0x7E37,0x7E38,0x7E39,0x0000,/* 0x78-0x7F */
+
+	0x7E3A,0x7E3C,0x7E3D,0x7E3E,0x7E3F,0x7E40,0x7E42,0x7E43,/* 0x80-0x87 */
+	0x7E44,0x7E45,0x7E46,0x7E48,0x7E49,0x7E4A,0x7E4B,0x7E4C,/* 0x88-0x8F */
+	0x7E4D,0x7E4E,0x7E4F,0x7E50,0x7E51,0x7E52,0x7E53,0x7E54,/* 0x90-0x97 */
+	0x7E55,0x7E56,0x7E57,0x7E58,0x7E59,0x7E5A,0x7E5B,0x7E5C,/* 0x98-0x9F */
+	0x7E5D,0x4FCA,0x7AE3,0x6D5A,0x90E1,0x9A8F,0x5580,0x5496,/* 0xA0-0xA7 */
+	0x5361,0x54AF,0x5F00,0x63E9,0x6977,0x51EF,0x6168,0x520A,/* 0xA8-0xAF */
+	0x582A,0x52D8,0x574E,0x780D,0x770B,0x5EB7,0x6177,0x7CE0,/* 0xB0-0xB7 */
+	0x625B,0x6297,0x4EA2,0x7095,0x8003,0x62F7,0x70E4,0x9760,/* 0xB8-0xBF */
+	0x5777,0x82DB,0x67EF,0x68F5,0x78D5,0x9897,0x79D1,0x58F3,/* 0xC0-0xC7 */
+	0x54B3,0x53EF,0x6E34,0x514B,0x523B,0x5BA2,0x8BFE,0x80AF,/* 0xC8-0xCF */
+	0x5543,0x57A6,0x6073,0x5751,0x542D,0x7A7A,0x6050,0x5B54,/* 0xD0-0xD7 */
+	0x63A7,0x62A0,0x53E3,0x6263,0x5BC7,0x67AF,0x54ED,0x7A9F,/* 0xD8-0xDF */
+	0x82E6,0x9177,0x5E93,0x88E4,0x5938,0x57AE,0x630E,0x8DE8,/* 0xE0-0xE7 */
+	0x80EF,0x5757,0x7B77,0x4FA9,0x5FEB,0x5BBD,0x6B3E,0x5321,/* 0xE8-0xEF */
+	0x7B50,0x72C2,0x6846,0x77FF,0x7736,0x65F7,0x51B5,0x4E8F,/* 0xF0-0xF7 */
+	0x76D4,0x5CBF,0x7AA5,0x8475,0x594E,0x9B41,0x5080,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7E5E,0x7E5F,0x7E60,0x7E61,0x7E62,0x7E63,0x7E64,0x7E65,/* 0x40-0x47 */
+	0x7E66,0x7E67,0x7E68,0x7E69,0x7E6A,0x7E6B,0x7E6C,0x7E6D,/* 0x48-0x4F */
+	0x7E6E,0x7E6F,0x7E70,0x7E71,0x7E72,0x7E73,0x7E74,0x7E75,/* 0x50-0x57 */
+	0x7E76,0x7E77,0x7E78,0x7E79,0x7E7A,0x7E7B,0x7E7C,0x7E7D,/* 0x58-0x5F */
+	0x7E7E,0x7E7F,0x7E80,0x7E81,0x7E83,0x7E84,0x7E85,0x7E86,/* 0x60-0x67 */
+	0x7E87,0x7E88,0x7E89,0x7E8A,0x7E8B,0x7E8C,0x7E8D,0x7E8E,/* 0x68-0x6F */
+	0x7E8F,0x7E90,0x7E91,0x7E92,0x7E93,0x7E94,0x7E95,0x7E96,/* 0x70-0x77 */
+	0x7E97,0x7E98,0x7E99,0x7E9A,0x7E9C,0x7E9D,0x7E9E,0x0000,/* 0x78-0x7F */
+
+	0x7EAE,0x7EB4,0x7EBB,0x7EBC,0x7ED6,0x7EE4,0x7EEC,0x7EF9,/* 0x80-0x87 */
+	0x7F0A,0x7F10,0x7F1E,0x7F37,0x7F39,0x7F3B,0x7F3C,0x7F3D,/* 0x88-0x8F */
+	0x7F3E,0x7F3F,0x7F40,0x7F41,0x7F43,0x7F46,0x7F47,0x7F48,/* 0x90-0x97 */
+	0x7F49,0x7F4A,0x7F4B,0x7F4C,0x7F4D,0x7F4E,0x7F4F,0x7F52,/* 0x98-0x9F */
+	0x7F53,0x9988,0x6127,0x6E83,0x5764,0x6606,0x6346,0x56F0,/* 0xA0-0xA7 */
+	0x62EC,0x6269,0x5ED3,0x9614,0x5783,0x62C9,0x5587,0x8721,/* 0xA8-0xAF */
+	0x814A,0x8FA3,0x5566,0x83B1,0x6765,0x8D56,0x84DD,0x5A6A,/* 0xB0-0xB7 */
+	0x680F,0x62E6,0x7BEE,0x9611,0x5170,0x6F9C,0x8C30,0x63FD,/* 0xB8-0xBF */
+	0x89C8,0x61D2,0x7F06,0x70C2,0x6EE5,0x7405,0x6994,0x72FC,/* 0xC0-0xC7 */
+	0x5ECA,0x90CE,0x6717,0x6D6A,0x635E,0x52B3,0x7262,0x8001,/* 0xC8-0xCF */
+	0x4F6C,0x59E5,0x916A,0x70D9,0x6D9D,0x52D2,0x4E50,0x96F7,/* 0xD0-0xD7 */
+	0x956D,0x857E,0x78CA,0x7D2F,0x5121,0x5792,0x64C2,0x808B,/* 0xD8-0xDF */
+	0x7C7B,0x6CEA,0x68F1,0x695E,0x51B7,0x5398,0x68A8,0x7281,/* 0xE0-0xE7 */
+	0x9ECE,0x7BF1,0x72F8,0x79BB,0x6F13,0x7406,0x674E,0x91CC,/* 0xE8-0xEF */
+	0x9CA4,0x793C,0x8389,0x8354,0x540F,0x6817,0x4E3D,0x5389,/* 0xF0-0xF7 */
+	0x52B1,0x783E,0x5386,0x5229,0x5088,0x4F8B,0x4FD0,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7F56,0x7F59,0x7F5B,0x7F5C,0x7F5D,0x7F5E,0x7F60,0x7F63,/* 0x40-0x47 */
+	0x7F64,0x7F65,0x7F66,0x7F67,0x7F6B,0x7F6C,0x7F6D,0x7F6F,/* 0x48-0x4F */
+	0x7F70,0x7F73,0x7F75,0x7F76,0x7F77,0x7F78,0x7F7A,0x7F7B,/* 0x50-0x57 */
+	0x7F7C,0x7F7D,0x7F7F,0x7F80,0x7F82,0x7F83,0x7F84,0x7F85,/* 0x58-0x5F */
+	0x7F86,0x7F87,0x7F88,0x7F89,0x7F8B,0x7F8D,0x7F8F,0x7F90,/* 0x60-0x67 */
+	0x7F91,0x7F92,0x7F93,0x7F95,0x7F96,0x7F97,0x7F98,0x7F99,/* 0x68-0x6F */
+	0x7F9B,0x7F9C,0x7FA0,0x7FA2,0x7FA3,0x7FA5,0x7FA6,0x7FA8,/* 0x70-0x77 */
+	0x7FA9,0x7FAA,0x7FAB,0x7FAC,0x7FAD,0x7FAE,0x7FB1,0x0000,/* 0x78-0x7F */
+
+	0x7FB3,0x7FB4,0x7FB5,0x7FB6,0x7FB7,0x7FBA,0x7FBB,0x7FBE,/* 0x80-0x87 */
+	0x7FC0,0x7FC2,0x7FC3,0x7FC4,0x7FC6,0x7FC7,0x7FC8,0x7FC9,/* 0x88-0x8F */
+	0x7FCB,0x7FCD,0x7FCF,0x7FD0,0x7FD1,0x7FD2,0x7FD3,0x7FD6,/* 0x90-0x97 */
+	0x7FD7,0x7FD9,0x7FDA,0x7FDB,0x7FDC,0x7FDD,0x7FDE,0x7FE2,/* 0x98-0x9F */
+	0x7FE3,0x75E2,0x7ACB,0x7C92,0x6CA5,0x96B6,0x529B,0x7483,/* 0xA0-0xA7 */
+	0x54E9,0x4FE9,0x8054,0x83B2,0x8FDE,0x9570,0x5EC9,0x601C,/* 0xA8-0xAF */
+	0x6D9F,0x5E18,0x655B,0x8138,0x94FE,0x604B,0x70BC,0x7EC3,/* 0xB0-0xB7 */
+	0x7CAE,0x51C9,0x6881,0x7CB1,0x826F,0x4E24,0x8F86,0x91CF,/* 0xB8-0xBF */
+	0x667E,0x4EAE,0x8C05,0x64A9,0x804A,0x50DA,0x7597,0x71CE,/* 0xC0-0xC7 */
+	0x5BE5,0x8FBD,0x6F66,0x4E86,0x6482,0x9563,0x5ED6,0x6599,/* 0xC8-0xCF */
+	0x5217,0x88C2,0x70C8,0x52A3,0x730E,0x7433,0x6797,0x78F7,/* 0xD0-0xD7 */
+	0x9716,0x4E34,0x90BB,0x9CDE,0x6DCB,0x51DB,0x8D41,0x541D,/* 0xD8-0xDF */
+	0x62CE,0x73B2,0x83F1,0x96F6,0x9F84,0x94C3,0x4F36,0x7F9A,/* 0xE0-0xE7 */
+	0x51CC,0x7075,0x9675,0x5CAD,0x9886,0x53E6,0x4EE4,0x6E9C,/* 0xE8-0xEF */
+	0x7409,0x69B4,0x786B,0x998F,0x7559,0x5218,0x7624,0x6D41,/* 0xF0-0xF7 */
+	0x67F3,0x516D,0x9F99,0x804B,0x5499,0x7B3C,0x7ABF,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7FE4,0x7FE7,0x7FE8,0x7FEA,0x7FEB,0x7FEC,0x7FED,0x7FEF,/* 0x40-0x47 */
+	0x7FF2,0x7FF4,0x7FF5,0x7FF6,0x7FF7,0x7FF8,0x7FF9,0x7FFA,/* 0x48-0x4F */
+	0x7FFD,0x7FFE,0x7FFF,0x8002,0x8007,0x8008,0x8009,0x800A,/* 0x50-0x57 */
+	0x800E,0x800F,0x8011,0x8013,0x801A,0x801B,0x801D,0x801E,/* 0x58-0x5F */
+	0x801F,0x8021,0x8023,0x8024,0x802B,0x802C,0x802D,0x802E,/* 0x60-0x67 */
+	0x802F,0x8030,0x8032,0x8034,0x8039,0x803A,0x803C,0x803E,/* 0x68-0x6F */
+	0x8040,0x8041,0x8044,0x8045,0x8047,0x8048,0x8049,0x804E,/* 0x70-0x77 */
+	0x804F,0x8050,0x8051,0x8053,0x8055,0x8056,0x8057,0x0000,/* 0x78-0x7F */
+
+	0x8059,0x805B,0x805C,0x805D,0x805E,0x805F,0x8060,0x8061,/* 0x80-0x87 */
+	0x8062,0x8063,0x8064,0x8065,0x8066,0x8067,0x8068,0x806B,/* 0x88-0x8F */
+	0x806C,0x806D,0x806E,0x806F,0x8070,0x8072,0x8073,0x8074,/* 0x90-0x97 */
+	0x8075,0x8076,0x8077,0x8078,0x8079,0x807A,0x807B,0x807C,/* 0x98-0x9F */
+	0x807D,0x9686,0x5784,0x62E2,0x9647,0x697C,0x5A04,0x6402,/* 0xA0-0xA7 */
+	0x7BD3,0x6F0F,0x964B,0x82A6,0x5362,0x9885,0x5E90,0x7089,/* 0xA8-0xAF */
+	0x63B3,0x5364,0x864F,0x9C81,0x9E93,0x788C,0x9732,0x8DEF,/* 0xB0-0xB7 */
+	0x8D42,0x9E7F,0x6F5E,0x7984,0x5F55,0x9646,0x622E,0x9A74,/* 0xB8-0xBF */
+	0x5415,0x94DD,0x4FA3,0x65C5,0x5C65,0x5C61,0x7F15,0x8651,/* 0xC0-0xC7 */
+	0x6C2F,0x5F8B,0x7387,0x6EE4,0x7EFF,0x5CE6,0x631B,0x5B6A,/* 0xC8-0xCF */
+	0x6EE6,0x5375,0x4E71,0x63A0,0x7565,0x62A1,0x8F6E,0x4F26,/* 0xD0-0xD7 */
+	0x4ED1,0x6CA6,0x7EB6,0x8BBA,0x841D,0x87BA,0x7F57,0x903B,/* 0xD8-0xDF */
+	0x9523,0x7BA9,0x9AA1,0x88F8,0x843D,0x6D1B,0x9A86,0x7EDC,/* 0xE0-0xE7 */
+	0x5988,0x9EBB,0x739B,0x7801,0x8682,0x9A6C,0x9A82,0x561B,/* 0xE8-0xEF */
+	0x5417,0x57CB,0x4E70,0x9EA6,0x5356,0x8FC8,0x8109,0x7792,/* 0xF0-0xF7 */
+	0x9992,0x86EE,0x6EE1,0x8513,0x66FC,0x6162,0x6F2B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x807E,0x8081,0x8082,0x8085,0x8088,0x808A,0x808D,0x808E,/* 0x40-0x47 */
+	0x808F,0x8090,0x8091,0x8092,0x8094,0x8095,0x8097,0x8099,/* 0x48-0x4F */
+	0x809E,0x80A3,0x80A6,0x80A7,0x80A8,0x80AC,0x80B0,0x80B3,/* 0x50-0x57 */
+	0x80B5,0x80B6,0x80B8,0x80B9,0x80BB,0x80C5,0x80C7,0x80C8,/* 0x58-0x5F */
+	0x80C9,0x80CA,0x80CB,0x80CF,0x80D0,0x80D1,0x80D2,0x80D3,/* 0x60-0x67 */
+	0x80D4,0x80D5,0x80D8,0x80DF,0x80E0,0x80E2,0x80E3,0x80E6,/* 0x68-0x6F */
+	0x80EE,0x80F5,0x80F7,0x80F9,0x80FB,0x80FE,0x80FF,0x8100,/* 0x70-0x77 */
+	0x8101,0x8103,0x8104,0x8105,0x8107,0x8108,0x810B,0x0000,/* 0x78-0x7F */
+
+	0x810C,0x8115,0x8117,0x8119,0x811B,0x811C,0x811D,0x811F,/* 0x80-0x87 */
+	0x8120,0x8121,0x8122,0x8123,0x8124,0x8125,0x8126,0x8127,/* 0x88-0x8F */
+	0x8128,0x8129,0x812A,0x812B,0x812D,0x812E,0x8130,0x8133,/* 0x90-0x97 */
+	0x8134,0x8135,0x8137,0x8139,0x813A,0x813B,0x813C,0x813D,/* 0x98-0x9F */
+	0x813F,0x8C29,0x8292,0x832B,0x76F2,0x6C13,0x5FD9,0x83BD,/* 0xA0-0xA7 */
+	0x732B,0x8305,0x951A,0x6BDB,0x77DB,0x94C6,0x536F,0x8302,/* 0xA8-0xAF */
+	0x5192,0x5E3D,0x8C8C,0x8D38,0x4E48,0x73AB,0x679A,0x6885,/* 0xB0-0xB7 */
+	0x9176,0x9709,0x7164,0x6CA1,0x7709,0x5A92,0x9541,0x6BCF,/* 0xB8-0xBF */
+	0x7F8E,0x6627,0x5BD0,0x59B9,0x5A9A,0x95E8,0x95F7,0x4EEC,/* 0xC0-0xC7 */
+	0x840C,0x8499,0x6AAC,0x76DF,0x9530,0x731B,0x68A6,0x5B5F,/* 0xC8-0xCF */
+	0x772F,0x919A,0x9761,0x7CDC,0x8FF7,0x8C1C,0x5F25,0x7C73,/* 0xD0-0xD7 */
+	0x79D8,0x89C5,0x6CCC,0x871C,0x5BC6,0x5E42,0x68C9,0x7720,/* 0xD8-0xDF */
+	0x7EF5,0x5195,0x514D,0x52C9,0x5A29,0x7F05,0x9762,0x82D7,/* 0xE0-0xE7 */
+	0x63CF,0x7784,0x85D0,0x79D2,0x6E3A,0x5E99,0x5999,0x8511,/* 0xE8-0xEF */
+	0x706D,0x6C11,0x62BF,0x76BF,0x654F,0x60AF,0x95FD,0x660E,/* 0xF0-0xF7 */
+	0x879F,0x9E23,0x94ED,0x540D,0x547D,0x8C2C,0x6478,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8140,0x8141,0x8142,0x8143,0x8144,0x8145,0x8147,0x8149,/* 0x40-0x47 */
+	0x814D,0x814E,0x814F,0x8152,0x8156,0x8157,0x8158,0x815B,/* 0x48-0x4F */
+	0x815C,0x815D,0x815E,0x815F,0x8161,0x8162,0x8163,0x8164,/* 0x50-0x57 */
+	0x8166,0x8168,0x816A,0x816B,0x816C,0x816F,0x8172,0x8173,/* 0x58-0x5F */
+	0x8175,0x8176,0x8177,0x8178,0x8181,0x8183,0x8184,0x8185,/* 0x60-0x67 */
+	0x8186,0x8187,0x8189,0x818B,0x818C,0x818D,0x818E,0x8190,/* 0x68-0x6F */
+	0x8192,0x8193,0x8194,0x8195,0x8196,0x8197,0x8199,0x819A,/* 0x70-0x77 */
+	0x819E,0x819F,0x81A0,0x81A1,0x81A2,0x81A4,0x81A5,0x0000,/* 0x78-0x7F */
+
+	0x81A7,0x81A9,0x81AB,0x81AC,0x81AD,0x81AE,0x81AF,0x81B0,/* 0x80-0x87 */
+	0x81B1,0x81B2,0x81B4,0x81B5,0x81B6,0x81B7,0x81B8,0x81B9,/* 0x88-0x8F */
+	0x81BC,0x81BD,0x81BE,0x81BF,0x81C4,0x81C5,0x81C7,0x81C8,/* 0x90-0x97 */
+	0x81C9,0x81CB,0x81CD,0x81CE,0x81CF,0x81D0,0x81D1,0x81D2,/* 0x98-0x9F */
+	0x81D3,0x6479,0x8611,0x6A21,0x819C,0x78E8,0x6469,0x9B54,/* 0xA0-0xA7 */
+	0x62B9,0x672B,0x83AB,0x58A8,0x9ED8,0x6CAB,0x6F20,0x5BDE,/* 0xA8-0xAF */
+	0x964C,0x8C0B,0x725F,0x67D0,0x62C7,0x7261,0x4EA9,0x59C6,/* 0xB0-0xB7 */
+	0x6BCD,0x5893,0x66AE,0x5E55,0x52DF,0x6155,0x6728,0x76EE,/* 0xB8-0xBF */
+	0x7766,0x7267,0x7A46,0x62FF,0x54EA,0x5450,0x94A0,0x90A3,/* 0xC0-0xC7 */
+	0x5A1C,0x7EB3,0x6C16,0x4E43,0x5976,0x8010,0x5948,0x5357,/* 0xC8-0xCF */
+	0x7537,0x96BE,0x56CA,0x6320,0x8111,0x607C,0x95F9,0x6DD6,/* 0xD0-0xD7 */
+	0x5462,0x9981,0x5185,0x5AE9,0x80FD,0x59AE,0x9713,0x502A,/* 0xD8-0xDF */
+	0x6CE5,0x5C3C,0x62DF,0x4F60,0x533F,0x817B,0x9006,0x6EBA,/* 0xE0-0xE7 */
+	0x852B,0x62C8,0x5E74,0x78BE,0x64B5,0x637B,0x5FF5,0x5A18,/* 0xE8-0xEF */
+	0x917F,0x9E1F,0x5C3F,0x634F,0x8042,0x5B7D,0x556E,0x954A,/* 0xF0-0xF7 */
+	0x954D,0x6D85,0x60A8,0x67E0,0x72DE,0x51DD,0x5B81,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x81D4,0x81D5,0x81D6,0x81D7,0x81D8,0x81D9,0x81DA,0x81DB,/* 0x40-0x47 */
+	0x81DC,0x81DD,0x81DE,0x81DF,0x81E0,0x81E1,0x81E2,0x81E4,/* 0x48-0x4F */
+	0x81E5,0x81E6,0x81E8,0x81E9,0x81EB,0x81EE,0x81EF,0x81F0,/* 0x50-0x57 */
+	0x81F1,0x81F2,0x81F5,0x81F6,0x81F7,0x81F8,0x81F9,0x81FA,/* 0x58-0x5F */
+	0x81FD,0x81FF,0x8203,0x8207,0x8208,0x8209,0x820A,0x820B,/* 0x60-0x67 */
+	0x820E,0x820F,0x8211,0x8213,0x8215,0x8216,0x8217,0x8218,/* 0x68-0x6F */
+	0x8219,0x821A,0x821D,0x8220,0x8224,0x8225,0x8226,0x8227,/* 0x70-0x77 */
+	0x8229,0x822E,0x8232,0x823A,0x823C,0x823D,0x823F,0x0000,/* 0x78-0x7F */
+
+	0x8240,0x8241,0x8242,0x8243,0x8245,0x8246,0x8248,0x824A,/* 0x80-0x87 */
+	0x824C,0x824D,0x824E,0x8250,0x8251,0x8252,0x8253,0x8254,/* 0x88-0x8F */
+	0x8255,0x8256,0x8257,0x8259,0x825B,0x825C,0x825D,0x825E,/* 0x90-0x97 */
+	0x8260,0x8261,0x8262,0x8263,0x8264,0x8265,0x8266,0x8267,/* 0x98-0x9F */
+	0x8269,0x62E7,0x6CDE,0x725B,0x626D,0x94AE,0x7EBD,0x8113,/* 0xA0-0xA7 */
+	0x6D53,0x519C,0x5F04,0x5974,0x52AA,0x6012,0x5973,0x6696,/* 0xA8-0xAF */
+	0x8650,0x759F,0x632A,0x61E6,0x7CEF,0x8BFA,0x54E6,0x6B27,/* 0xB0-0xB7 */
+	0x9E25,0x6BB4,0x85D5,0x5455,0x5076,0x6CA4,0x556A,0x8DB4,/* 0xB8-0xBF */
+	0x722C,0x5E15,0x6015,0x7436,0x62CD,0x6392,0x724C,0x5F98,/* 0xC0-0xC7 */
+	0x6E43,0x6D3E,0x6500,0x6F58,0x76D8,0x78D0,0x76FC,0x7554,/* 0xC8-0xCF */
+	0x5224,0x53DB,0x4E53,0x5E9E,0x65C1,0x802A,0x80D6,0x629B,/* 0xD0-0xD7 */
+	0x5486,0x5228,0x70AE,0x888D,0x8DD1,0x6CE1,0x5478,0x80DA,/* 0xD8-0xDF */
+	0x57F9,0x88F4,0x8D54,0x966A,0x914D,0x4F69,0x6C9B,0x55B7,/* 0xE0-0xE7 */
+	0x76C6,0x7830,0x62A8,0x70F9,0x6F8E,0x5F6D,0x84EC,0x68DA,/* 0xE8-0xEF */
+	0x787C,0x7BF7,0x81A8,0x670B,0x9E4F,0x6367,0x78B0,0x576F,/* 0xF0-0xF7 */
+	0x7812,0x9739,0x6279,0x62AB,0x5288,0x7435,0x6BD7,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x826A,0x826B,0x826C,0x826D,0x8271,0x8275,0x8276,0x8277,/* 0x40-0x47 */
+	0x8278,0x827B,0x827C,0x8280,0x8281,0x8283,0x8285,0x8286,/* 0x48-0x4F */
+	0x8287,0x8289,0x828C,0x8290,0x8293,0x8294,0x8295,0x8296,/* 0x50-0x57 */
+	0x829A,0x829B,0x829E,0x82A0,0x82A2,0x82A3,0x82A7,0x82B2,/* 0x58-0x5F */
+	0x82B5,0x82B6,0x82BA,0x82BB,0x82BC,0x82BF,0x82C0,0x82C2,/* 0x60-0x67 */
+	0x82C3,0x82C5,0x82C6,0x82C9,0x82D0,0x82D6,0x82D9,0x82DA,/* 0x68-0x6F */
+	0x82DD,0x82E2,0x82E7,0x82E8,0x82E9,0x82EA,0x82EC,0x82ED,/* 0x70-0x77 */
+	0x82EE,0x82F0,0x82F2,0x82F3,0x82F5,0x82F6,0x82F8,0x0000,/* 0x78-0x7F */
+
+	0x82FA,0x82FC,0x82FD,0x82FE,0x82FF,0x8300,0x830A,0x830B,/* 0x80-0x87 */
+	0x830D,0x8310,0x8312,0x8313,0x8316,0x8318,0x8319,0x831D,/* 0x88-0x8F */
+	0x831E,0x831F,0x8320,0x8321,0x8322,0x8323,0x8324,0x8325,/* 0x90-0x97 */
+	0x8326,0x8329,0x832A,0x832E,0x8330,0x8332,0x8337,0x833B,/* 0x98-0x9F */
+	0x833D,0x5564,0x813E,0x75B2,0x76AE,0x5339,0x75DE,0x50FB,/* 0xA0-0xA7 */
+	0x5C41,0x8B6C,0x7BC7,0x504F,0x7247,0x9A97,0x98D8,0x6F02,/* 0xA8-0xAF */
+	0x74E2,0x7968,0x6487,0x77A5,0x62FC,0x9891,0x8D2B,0x54C1,/* 0xB0-0xB7 */
+	0x8058,0x4E52,0x576A,0x82F9,0x840D,0x5E73,0x51ED,0x74F6,/* 0xB8-0xBF */
+	0x8BC4,0x5C4F,0x5761,0x6CFC,0x9887,0x5A46,0x7834,0x9B44,/* 0xC0-0xC7 */
+	0x8FEB,0x7C95,0x5256,0x6251,0x94FA,0x4EC6,0x8386,0x8461,/* 0xC8-0xCF */
+	0x83E9,0x84B2,0x57D4,0x6734,0x5703,0x666E,0x6D66,0x8C31,/* 0xD0-0xD7 */
+	0x66DD,0x7011,0x671F,0x6B3A,0x6816,0x621A,0x59BB,0x4E03,/* 0xD8-0xDF */
+	0x51C4,0x6F06,0x67D2,0x6C8F,0x5176,0x68CB,0x5947,0x6B67,/* 0xE0-0xE7 */
+	0x7566,0x5D0E,0x8110,0x9F50,0x65D7,0x7948,0x7941,0x9A91,/* 0xE8-0xEF */
+	0x8D77,0x5C82,0x4E5E,0x4F01,0x542F,0x5951,0x780C,0x5668,/* 0xF0-0xF7 */
+	0x6C14,0x8FC4,0x5F03,0x6C7D,0x6CE3,0x8BAB,0x6390,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x833E,0x833F,0x8341,0x8342,0x8344,0x8345,0x8348,0x834A,/* 0x40-0x47 */
+	0x834B,0x834C,0x834D,0x834E,0x8353,0x8355,0x8356,0x8357,/* 0x48-0x4F */
+	0x8358,0x8359,0x835D,0x8362,0x8370,0x8371,0x8372,0x8373,/* 0x50-0x57 */
+	0x8374,0x8375,0x8376,0x8379,0x837A,0x837E,0x837F,0x8380,/* 0x58-0x5F */
+	0x8381,0x8382,0x8383,0x8384,0x8387,0x8388,0x838A,0x838B,/* 0x60-0x67 */
+	0x838C,0x838D,0x838F,0x8390,0x8391,0x8394,0x8395,0x8396,/* 0x68-0x6F */
+	0x8397,0x8399,0x839A,0x839D,0x839F,0x83A1,0x83A2,0x83A3,/* 0x70-0x77 */
+	0x83A4,0x83A5,0x83A6,0x83A7,0x83AC,0x83AD,0x83AE,0x0000,/* 0x78-0x7F */
+
+	0x83AF,0x83B5,0x83BB,0x83BE,0x83BF,0x83C2,0x83C3,0x83C4,/* 0x80-0x87 */
+	0x83C6,0x83C8,0x83C9,0x83CB,0x83CD,0x83CE,0x83D0,0x83D1,/* 0x88-0x8F */
+	0x83D2,0x83D3,0x83D5,0x83D7,0x83D9,0x83DA,0x83DB,0x83DE,/* 0x90-0x97 */
+	0x83E2,0x83E3,0x83E4,0x83E6,0x83E7,0x83E8,0x83EB,0x83EC,/* 0x98-0x9F */
+	0x83ED,0x6070,0x6D3D,0x7275,0x6266,0x948E,0x94C5,0x5343,/* 0xA0-0xA7 */
+	0x8FC1,0x7B7E,0x4EDF,0x8C26,0x4E7E,0x9ED4,0x94B1,0x94B3,/* 0xA8-0xAF */
+	0x524D,0x6F5C,0x9063,0x6D45,0x8C34,0x5811,0x5D4C,0x6B20,/* 0xB0-0xB7 */
+	0x6B49,0x67AA,0x545B,0x8154,0x7F8C,0x5899,0x8537,0x5F3A,/* 0xB8-0xBF */
+	0x62A2,0x6A47,0x9539,0x6572,0x6084,0x6865,0x77A7,0x4E54,/* 0xC0-0xC7 */
+	0x4FA8,0x5DE7,0x9798,0x64AC,0x7FD8,0x5CED,0x4FCF,0x7A8D,/* 0xC8-0xCF */
+	0x5207,0x8304,0x4E14,0x602F,0x7A83,0x94A6,0x4FB5,0x4EB2,/* 0xD0-0xD7 */
+	0x79E6,0x7434,0x52E4,0x82B9,0x64D2,0x79BD,0x5BDD,0x6C81,/* 0xD8-0xDF */
+	0x9752,0x8F7B,0x6C22,0x503E,0x537F,0x6E05,0x64CE,0x6674,/* 0xE0-0xE7 */
+	0x6C30,0x60C5,0x9877,0x8BF7,0x5E86,0x743C,0x7A77,0x79CB,/* 0xE8-0xEF */
+	0x4E18,0x90B1,0x7403,0x6C42,0x56DA,0x914B,0x6CC5,0x8D8B,/* 0xF0-0xF7 */
+	0x533A,0x86C6,0x66F2,0x8EAF,0x5C48,0x9A71,0x6E20,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x83EE,0x83EF,0x83F3,0x83F4,0x83F5,0x83F6,0x83F7,0x83FA,/* 0x40-0x47 */
+	0x83FB,0x83FC,0x83FE,0x83FF,0x8400,0x8402,0x8405,0x8407,/* 0x48-0x4F */
+	0x8408,0x8409,0x840A,0x8410,0x8412,0x8413,0x8414,0x8415,/* 0x50-0x57 */
+	0x8416,0x8417,0x8419,0x841A,0x841B,0x841E,0x841F,0x8420,/* 0x58-0x5F */
+	0x8421,0x8422,0x8423,0x8429,0x842A,0x842B,0x842C,0x842D,/* 0x60-0x67 */
+	0x842E,0x842F,0x8430,0x8432,0x8433,0x8434,0x8435,0x8436,/* 0x68-0x6F */
+	0x8437,0x8439,0x843A,0x843B,0x843E,0x843F,0x8440,0x8441,/* 0x70-0x77 */
+	0x8442,0x8443,0x8444,0x8445,0x8447,0x8448,0x8449,0x0000,/* 0x78-0x7F */
+
+	0x844A,0x844B,0x844C,0x844D,0x844E,0x844F,0x8450,0x8452,/* 0x80-0x87 */
+	0x8453,0x8454,0x8455,0x8456,0x8458,0x845D,0x845E,0x845F,/* 0x88-0x8F */
+	0x8460,0x8462,0x8464,0x8465,0x8466,0x8467,0x8468,0x846A,/* 0x90-0x97 */
+	0x846E,0x846F,0x8470,0x8472,0x8474,0x8477,0x8479,0x847B,/* 0x98-0x9F */
+	0x847C,0x53D6,0x5A36,0x9F8B,0x8DA3,0x53BB,0x5708,0x98A7,/* 0xA0-0xA7 */
+	0x6743,0x919B,0x6CC9,0x5168,0x75CA,0x62F3,0x72AC,0x5238,/* 0xA8-0xAF */
+	0x529D,0x7F3A,0x7094,0x7638,0x5374,0x9E4A,0x69B7,0x786E,/* 0xB0-0xB7 */
+	0x96C0,0x88D9,0x7FA4,0x7136,0x71C3,0x5189,0x67D3,0x74E4,/* 0xB8-0xBF */
+	0x58E4,0x6518,0x56B7,0x8BA9,0x9976,0x6270,0x7ED5,0x60F9,/* 0xC0-0xC7 */
+	0x70ED,0x58EC,0x4EC1,0x4EBA,0x5FCD,0x97E7,0x4EFB,0x8BA4,/* 0xC8-0xCF */
+	0x5203,0x598A,0x7EAB,0x6254,0x4ECD,0x65E5,0x620E,0x8338,/* 0xD0-0xD7 */
+	0x84C9,0x8363,0x878D,0x7194,0x6EB6,0x5BB9,0x7ED2,0x5197,/* 0xD8-0xDF */
+	0x63C9,0x67D4,0x8089,0x8339,0x8815,0x5112,0x5B7A,0x5982,/* 0xE0-0xE7 */
+	0x8FB1,0x4E73,0x6C5D,0x5165,0x8925,0x8F6F,0x962E,0x854A,/* 0xE8-0xEF */
+	0x745E,0x9510,0x95F0,0x6DA6,0x82E5,0x5F31,0x6492,0x6D12,/* 0xF0-0xF7 */
+	0x8428,0x816E,0x9CC3,0x585E,0x8D5B,0x4E09,0x53C1,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x847D,0x847E,0x847F,0x8480,0x8481,0x8483,0x8484,0x8485,/* 0x40-0x47 */
+	0x8486,0x848A,0x848D,0x848F,0x8490,0x8491,0x8492,0x8493,/* 0x48-0x4F */
+	0x8494,0x8495,0x8496,0x8498,0x849A,0x849B,0x849D,0x849E,/* 0x50-0x57 */
+	0x849F,0x84A0,0x84A2,0x84A3,0x84A4,0x84A5,0x84A6,0x84A7,/* 0x58-0x5F */
+	0x84A8,0x84A9,0x84AA,0x84AB,0x84AC,0x84AD,0x84AE,0x84B0,/* 0x60-0x67 */
+	0x84B1,0x84B3,0x84B5,0x84B6,0x84B7,0x84BB,0x84BC,0x84BE,/* 0x68-0x6F */
+	0x84C0,0x84C2,0x84C3,0x84C5,0x84C6,0x84C7,0x84C8,0x84CB,/* 0x70-0x77 */
+	0x84CC,0x84CE,0x84CF,0x84D2,0x84D4,0x84D5,0x84D7,0x0000,/* 0x78-0x7F */
+
+	0x84D8,0x84D9,0x84DA,0x84DB,0x84DC,0x84DE,0x84E1,0x84E2,/* 0x80-0x87 */
+	0x84E4,0x84E7,0x84E8,0x84E9,0x84EA,0x84EB,0x84ED,0x84EE,/* 0x88-0x8F */
+	0x84EF,0x84F1,0x84F2,0x84F3,0x84F4,0x84F5,0x84F6,0x84F7,/* 0x90-0x97 */
+	0x84F8,0x84F9,0x84FA,0x84FB,0x84FD,0x84FE,0x8500,0x8501,/* 0x98-0x9F */
+	0x8502,0x4F1E,0x6563,0x6851,0x55D3,0x4E27,0x6414,0x9A9A,/* 0xA0-0xA7 */
+	0x626B,0x5AC2,0x745F,0x8272,0x6DA9,0x68EE,0x50E7,0x838E,/* 0xA8-0xAF */
+	0x7802,0x6740,0x5239,0x6C99,0x7EB1,0x50BB,0x5565,0x715E,/* 0xB0-0xB7 */
+	0x7B5B,0x6652,0x73CA,0x82EB,0x6749,0x5C71,0x5220,0x717D,/* 0xB8-0xBF */
+	0x886B,0x95EA,0x9655,0x64C5,0x8D61,0x81B3,0x5584,0x6C55,/* 0xC0-0xC7 */
+	0x6247,0x7F2E,0x5892,0x4F24,0x5546,0x8D4F,0x664C,0x4E0A,/* 0xC8-0xCF */
+	0x5C1A,0x88F3,0x68A2,0x634E,0x7A0D,0x70E7,0x828D,0x52FA,/* 0xD0-0xD7 */
+	0x97F6,0x5C11,0x54E8,0x90B5,0x7ECD,0x5962,0x8D4A,0x86C7,/* 0xD8-0xDF */
+	0x820C,0x820D,0x8D66,0x6444,0x5C04,0x6151,0x6D89,0x793E,/* 0xE0-0xE7 */
+	0x8BBE,0x7837,0x7533,0x547B,0x4F38,0x8EAB,0x6DF1,0x5A20,/* 0xE8-0xEF */
+	0x7EC5,0x795E,0x6C88,0x5BA1,0x5A76,0x751A,0x80BE,0x614E,/* 0xF0-0xF7 */
+	0x6E17,0x58F0,0x751F,0x7525,0x7272,0x5347,0x7EF3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8503,0x8504,0x8505,0x8506,0x8507,0x8508,0x8509,0x850A,/* 0x40-0x47 */
+	0x850B,0x850D,0x850E,0x850F,0x8510,0x8512,0x8514,0x8515,/* 0x48-0x4F */
+	0x8516,0x8518,0x8519,0x851B,0x851C,0x851D,0x851E,0x8520,/* 0x50-0x57 */
+	0x8522,0x8523,0x8524,0x8525,0x8526,0x8527,0x8528,0x8529,/* 0x58-0x5F */
+	0x852A,0x852D,0x852E,0x852F,0x8530,0x8531,0x8532,0x8533,/* 0x60-0x67 */
+	0x8534,0x8535,0x8536,0x853E,0x853F,0x8540,0x8541,0x8542,/* 0x68-0x6F */
+	0x8544,0x8545,0x8546,0x8547,0x854B,0x854C,0x854D,0x854E,/* 0x70-0x77 */
+	0x854F,0x8550,0x8551,0x8552,0x8553,0x8554,0x8555,0x0000,/* 0x78-0x7F */
+
+	0x8557,0x8558,0x855A,0x855B,0x855C,0x855D,0x855F,0x8560,/* 0x80-0x87 */
+	0x8561,0x8562,0x8563,0x8565,0x8566,0x8567,0x8569,0x856A,/* 0x88-0x8F */
+	0x856B,0x856C,0x856D,0x856E,0x856F,0x8570,0x8571,0x8573,/* 0x90-0x97 */
+	0x8575,0x8576,0x8577,0x8578,0x857C,0x857D,0x857F,0x8580,/* 0x98-0x9F */
+	0x8581,0x7701,0x76DB,0x5269,0x80DC,0x5723,0x5E08,0x5931,/* 0xA0-0xA7 */
+	0x72EE,0x65BD,0x6E7F,0x8BD7,0x5C38,0x8671,0x5341,0x77F3,/* 0xA8-0xAF */
+	0x62FE,0x65F6,0x4EC0,0x98DF,0x8680,0x5B9E,0x8BC6,0x53F2,/* 0xB0-0xB7 */
+	0x77E2,0x4F7F,0x5C4E,0x9A76,0x59CB,0x5F0F,0x793A,0x58EB,/* 0xB8-0xBF */
+	0x4E16,0x67FF,0x4E8B,0x62ED,0x8A93,0x901D,0x52BF,0x662F,/* 0xC0-0xC7 */
+	0x55DC,0x566C,0x9002,0x4ED5,0x4F8D,0x91CA,0x9970,0x6C0F,/* 0xC8-0xCF */
+	0x5E02,0x6043,0x5BA4,0x89C6,0x8BD5,0x6536,0x624B,0x9996,/* 0xD0-0xD7 */
+	0x5B88,0x5BFF,0x6388,0x552E,0x53D7,0x7626,0x517D,0x852C,/* 0xD8-0xDF */
+	0x67A2,0x68B3,0x6B8A,0x6292,0x8F93,0x53D4,0x8212,0x6DD1,/* 0xE0-0xE7 */
+	0x758F,0x4E66,0x8D4E,0x5B70,0x719F,0x85AF,0x6691,0x66D9,/* 0xE8-0xEF */
+	0x7F72,0x8700,0x9ECD,0x9F20,0x5C5E,0x672F,0x8FF0,0x6811,/* 0xF0-0xF7 */
+	0x675F,0x620D,0x7AD6,0x5885,0x5EB6,0x6570,0x6F31,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8582,0x8583,0x8586,0x8588,0x8589,0x858A,0x858B,0x858C,/* 0x40-0x47 */
+	0x858D,0x858E,0x8590,0x8591,0x8592,0x8593,0x8594,0x8595,/* 0x48-0x4F */
+	0x8596,0x8597,0x8598,0x8599,0x859A,0x859D,0x859E,0x859F,/* 0x50-0x57 */
+	0x85A0,0x85A1,0x85A2,0x85A3,0x85A5,0x85A6,0x85A7,0x85A9,/* 0x58-0x5F */
+	0x85AB,0x85AC,0x85AD,0x85B1,0x85B2,0x85B3,0x85B4,0x85B5,/* 0x60-0x67 */
+	0x85B6,0x85B8,0x85BA,0x85BB,0x85BC,0x85BD,0x85BE,0x85BF,/* 0x68-0x6F */
+	0x85C0,0x85C2,0x85C3,0x85C4,0x85C5,0x85C6,0x85C7,0x85C8,/* 0x70-0x77 */
+	0x85CA,0x85CB,0x85CC,0x85CD,0x85CE,0x85D1,0x85D2,0x0000,/* 0x78-0x7F */
+
+	0x85D4,0x85D6,0x85D7,0x85D8,0x85D9,0x85DA,0x85DB,0x85DD,/* 0x80-0x87 */
+	0x85DE,0x85DF,0x85E0,0x85E1,0x85E2,0x85E3,0x85E5,0x85E6,/* 0x88-0x8F */
+	0x85E7,0x85E8,0x85EA,0x85EB,0x85EC,0x85ED,0x85EE,0x85EF,/* 0x90-0x97 */
+	0x85F0,0x85F1,0x85F2,0x85F3,0x85F4,0x85F5,0x85F6,0x85F7,/* 0x98-0x9F */
+	0x85F8,0x6055,0x5237,0x800D,0x6454,0x8870,0x7529,0x5E05,/* 0xA0-0xA7 */
+	0x6813,0x62F4,0x971C,0x53CC,0x723D,0x8C01,0x6C34,0x7761,/* 0xA8-0xAF */
+	0x7A0E,0x542E,0x77AC,0x987A,0x821C,0x8BF4,0x7855,0x6714,/* 0xB0-0xB7 */
+	0x70C1,0x65AF,0x6495,0x5636,0x601D,0x79C1,0x53F8,0x4E1D,/* 0xB8-0xBF */
+	0x6B7B,0x8086,0x5BFA,0x55E3,0x56DB,0x4F3A,0x4F3C,0x9972,/* 0xC0-0xC7 */
+	0x5DF3,0x677E,0x8038,0x6002,0x9882,0x9001,0x5B8B,0x8BBC,/* 0xC8-0xCF */
+	0x8BF5,0x641C,0x8258,0x64DE,0x55FD,0x82CF,0x9165,0x4FD7,/* 0xD0-0xD7 */
+	0x7D20,0x901F,0x7C9F,0x50F3,0x5851,0x6EAF,0x5BBF,0x8BC9,/* 0xD8-0xDF */
+	0x8083,0x9178,0x849C,0x7B97,0x867D,0x968B,0x968F,0x7EE5,/* 0xE0-0xE7 */
+	0x9AD3,0x788E,0x5C81,0x7A57,0x9042,0x96A7,0x795F,0x5B59,/* 0xE8-0xEF */
+	0x635F,0x7B0B,0x84D1,0x68AD,0x5506,0x7F29,0x7410,0x7D22,/* 0xF0-0xF7 */
+	0x9501,0x6240,0x584C,0x4ED6,0x5B83,0x5979,0x5854,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x85F9,0x85FA,0x85FC,0x85FD,0x85FE,0x8600,0x8601,0x8602,/* 0x40-0x47 */
+	0x8603,0x8604,0x8606,0x8607,0x8608,0x8609,0x860A,0x860B,/* 0x48-0x4F */
+	0x860C,0x860D,0x860E,0x860F,0x8610,0x8612,0x8613,0x8614,/* 0x50-0x57 */
+	0x8615,0x8617,0x8618,0x8619,0x861A,0x861B,0x861C,0x861D,/* 0x58-0x5F */
+	0x861E,0x861F,0x8620,0x8621,0x8622,0x8623,0x8624,0x8625,/* 0x60-0x67 */
+	0x8626,0x8628,0x862A,0x862B,0x862C,0x862D,0x862E,0x862F,/* 0x68-0x6F */
+	0x8630,0x8631,0x8632,0x8633,0x8634,0x8635,0x8636,0x8637,/* 0x70-0x77 */
+	0x8639,0x863A,0x863B,0x863D,0x863E,0x863F,0x8640,0x0000,/* 0x78-0x7F */
+
+	0x8641,0x8642,0x8643,0x8644,0x8645,0x8646,0x8647,0x8648,/* 0x80-0x87 */
+	0x8649,0x864A,0x864B,0x864C,0x8652,0x8653,0x8655,0x8656,/* 0x88-0x8F */
+	0x8657,0x8658,0x8659,0x865B,0x865C,0x865D,0x865F,0x8660,/* 0x90-0x97 */
+	0x8661,0x8663,0x8664,0x8665,0x8666,0x8667,0x8668,0x8669,/* 0x98-0x9F */
+	0x866A,0x736D,0x631E,0x8E4B,0x8E0F,0x80CE,0x82D4,0x62AC,/* 0xA0-0xA7 */
+	0x53F0,0x6CF0,0x915E,0x592A,0x6001,0x6C70,0x574D,0x644A,/* 0xA8-0xAF */
+	0x8D2A,0x762B,0x6EE9,0x575B,0x6A80,0x75F0,0x6F6D,0x8C2D,/* 0xB0-0xB7 */
+	0x8C08,0x5766,0x6BEF,0x8892,0x78B3,0x63A2,0x53F9,0x70AD,/* 0xB8-0xBF */
+	0x6C64,0x5858,0x642A,0x5802,0x68E0,0x819B,0x5510,0x7CD6,/* 0xC0-0xC7 */
+	0x5018,0x8EBA,0x6DCC,0x8D9F,0x70EB,0x638F,0x6D9B,0x6ED4,/* 0xC8-0xCF */
+	0x7EE6,0x8404,0x6843,0x9003,0x6DD8,0x9676,0x8BA8,0x5957,/* 0xD0-0xD7 */
+	0x7279,0x85E4,0x817E,0x75BC,0x8A8A,0x68AF,0x5254,0x8E22,/* 0xD8-0xDF */
+	0x9511,0x63D0,0x9898,0x8E44,0x557C,0x4F53,0x66FF,0x568F,/* 0xE0-0xE7 */
+	0x60D5,0x6D95,0x5243,0x5C49,0x5929,0x6DFB,0x586B,0x7530,/* 0xE8-0xEF */
+	0x751C,0x606C,0x8214,0x8146,0x6311,0x6761,0x8FE2,0x773A,/* 0xF0-0xF7 */
+	0x8DF3,0x8D34,0x94C1,0x5E16,0x5385,0x542C,0x70C3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x866D,0x866F,0x8670,0x8672,0x8673,0x8674,0x8675,0x8676,/* 0x40-0x47 */
+	0x8677,0x8678,0x8683,0x8684,0x8685,0x8686,0x8687,0x8688,/* 0x48-0x4F */
+	0x8689,0x868E,0x868F,0x8690,0x8691,0x8692,0x8694,0x8696,/* 0x50-0x57 */
+	0x8697,0x8698,0x8699,0x869A,0x869B,0x869E,0x869F,0x86A0,/* 0x58-0x5F */
+	0x86A1,0x86A2,0x86A5,0x86A6,0x86AB,0x86AD,0x86AE,0x86B2,/* 0x60-0x67 */
+	0x86B3,0x86B7,0x86B8,0x86B9,0x86BB,0x86BC,0x86BD,0x86BE,/* 0x68-0x6F */
+	0x86BF,0x86C1,0x86C2,0x86C3,0x86C5,0x86C8,0x86CC,0x86CD,/* 0x70-0x77 */
+	0x86D2,0x86D3,0x86D5,0x86D6,0x86D7,0x86DA,0x86DC,0x0000,/* 0x78-0x7F */
+
+	0x86DD,0x86E0,0x86E1,0x86E2,0x86E3,0x86E5,0x86E6,0x86E7,/* 0x80-0x87 */
+	0x86E8,0x86EA,0x86EB,0x86EC,0x86EF,0x86F5,0x86F6,0x86F7,/* 0x88-0x8F */
+	0x86FA,0x86FB,0x86FC,0x86FD,0x86FF,0x8701,0x8704,0x8705,/* 0x90-0x97 */
+	0x8706,0x870B,0x870C,0x870E,0x870F,0x8710,0x8711,0x8714,/* 0x98-0x9F */
+	0x8716,0x6C40,0x5EF7,0x505C,0x4EAD,0x5EAD,0x633A,0x8247,/* 0xA0-0xA7 */
+	0x901A,0x6850,0x916E,0x77B3,0x540C,0x94DC,0x5F64,0x7AE5,/* 0xA8-0xAF */
+	0x6876,0x6345,0x7B52,0x7EDF,0x75DB,0x5077,0x6295,0x5934,/* 0xB0-0xB7 */
+	0x900F,0x51F8,0x79C3,0x7A81,0x56FE,0x5F92,0x9014,0x6D82,/* 0xB8-0xBF */
+	0x5C60,0x571F,0x5410,0x5154,0x6E4D,0x56E2,0x63A8,0x9893,/* 0xC0-0xC7 */
+	0x817F,0x8715,0x892A,0x9000,0x541E,0x5C6F,0x81C0,0x62D6,/* 0xC8-0xCF */
+	0x6258,0x8131,0x9E35,0x9640,0x9A6E,0x9A7C,0x692D,0x59A5,/* 0xD0-0xD7 */
+	0x62D3,0x553E,0x6316,0x54C7,0x86D9,0x6D3C,0x5A03,0x74E6,/* 0xD8-0xDF */
+	0x889C,0x6B6A,0x5916,0x8C4C,0x5F2F,0x6E7E,0x73A9,0x987D,/* 0xE0-0xE7 */
+	0x4E38,0x70F7,0x5B8C,0x7897,0x633D,0x665A,0x7696,0x60CB,/* 0xE8-0xEF */
+	0x5B9B,0x5A49,0x4E07,0x8155,0x6C6A,0x738B,0x4EA1,0x6789,/* 0xF0-0xF7 */
+	0x7F51,0x5F80,0x65FA,0x671B,0x5FD8,0x5984,0x5A01,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8719,0x871B,0x871D,0x871F,0x8720,0x8724,0x8726,0x8727,/* 0x40-0x47 */
+	0x8728,0x872A,0x872B,0x872C,0x872D,0x872F,0x8730,0x8732,/* 0x48-0x4F */
+	0x8733,0x8735,0x8736,0x8738,0x8739,0x873A,0x873C,0x873D,/* 0x50-0x57 */
+	0x8740,0x8741,0x8742,0x8743,0x8744,0x8745,0x8746,0x874A,/* 0x58-0x5F */
+	0x874B,0x874D,0x874F,0x8750,0x8751,0x8752,0x8754,0x8755,/* 0x60-0x67 */
+	0x8756,0x8758,0x875A,0x875B,0x875C,0x875D,0x875E,0x875F,/* 0x68-0x6F */
+	0x8761,0x8762,0x8766,0x8767,0x8768,0x8769,0x876A,0x876B,/* 0x70-0x77 */
+	0x876C,0x876D,0x876F,0x8771,0x8772,0x8773,0x8775,0x0000,/* 0x78-0x7F */
+
+	0x8777,0x8778,0x8779,0x877A,0x877F,0x8780,0x8781,0x8784,/* 0x80-0x87 */
+	0x8786,0x8787,0x8789,0x878A,0x878C,0x878E,0x878F,0x8790,/* 0x88-0x8F */
+	0x8791,0x8792,0x8794,0x8795,0x8796,0x8798,0x8799,0x879A,/* 0x90-0x97 */
+	0x879B,0x879C,0x879D,0x879E,0x87A0,0x87A1,0x87A2,0x87A3,/* 0x98-0x9F */
+	0x87A4,0x5DCD,0x5FAE,0x5371,0x97E6,0x8FDD,0x6845,0x56F4,/* 0xA0-0xA7 */
+	0x552F,0x60DF,0x4E3A,0x6F4D,0x7EF4,0x82C7,0x840E,0x59D4,/* 0xA8-0xAF */
+	0x4F1F,0x4F2A,0x5C3E,0x7EAC,0x672A,0x851A,0x5473,0x754F,/* 0xB0-0xB7 */
+	0x80C3,0x5582,0x9B4F,0x4F4D,0x6E2D,0x8C13,0x5C09,0x6170,/* 0xB8-0xBF */
+	0x536B,0x761F,0x6E29,0x868A,0x6587,0x95FB,0x7EB9,0x543B,/* 0xC0-0xC7 */
+	0x7A33,0x7D0A,0x95EE,0x55E1,0x7FC1,0x74EE,0x631D,0x8717,/* 0xC8-0xCF */
+	0x6DA1,0x7A9D,0x6211,0x65A1,0x5367,0x63E1,0x6C83,0x5DEB,/* 0xD0-0xD7 */
+	0x545C,0x94A8,0x4E4C,0x6C61,0x8BEC,0x5C4B,0x65E0,0x829C,/* 0xD8-0xDF */
+	0x68A7,0x543E,0x5434,0x6BCB,0x6B66,0x4E94,0x6342,0x5348,/* 0xE0-0xE7 */
+	0x821E,0x4F0D,0x4FAE,0x575E,0x620A,0x96FE,0x6664,0x7269,/* 0xE8-0xEF */
+	0x52FF,0x52A1,0x609F,0x8BEF,0x6614,0x7199,0x6790,0x897F,/* 0xF0-0xF7 */
+	0x7852,0x77FD,0x6670,0x563B,0x5438,0x9521,0x727A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x87A5,0x87A6,0x87A7,0x87A9,0x87AA,0x87AE,0x87B0,0x87B1,/* 0x40-0x47 */
+	0x87B2,0x87B4,0x87B6,0x87B7,0x87B8,0x87B9,0x87BB,0x87BC,/* 0x48-0x4F */
+	0x87BE,0x87BF,0x87C1,0x87C2,0x87C3,0x87C4,0x87C5,0x87C7,/* 0x50-0x57 */
+	0x87C8,0x87C9,0x87CC,0x87CD,0x87CE,0x87CF,0x87D0,0x87D4,/* 0x58-0x5F */
+	0x87D5,0x87D6,0x87D7,0x87D8,0x87D9,0x87DA,0x87DC,0x87DD,/* 0x60-0x67 */
+	0x87DE,0x87DF,0x87E1,0x87E2,0x87E3,0x87E4,0x87E6,0x87E7,/* 0x68-0x6F */
+	0x87E8,0x87E9,0x87EB,0x87EC,0x87ED,0x87EF,0x87F0,0x87F1,/* 0x70-0x77 */
+	0x87F2,0x87F3,0x87F4,0x87F5,0x87F6,0x87F7,0x87F8,0x0000,/* 0x78-0x7F */
+
+	0x87FA,0x87FB,0x87FC,0x87FD,0x87FF,0x8800,0x8801,0x8802,/* 0x80-0x87 */
+	0x8804,0x8805,0x8806,0x8807,0x8808,0x8809,0x880B,0x880C,/* 0x88-0x8F */
+	0x880D,0x880E,0x880F,0x8810,0x8811,0x8812,0x8814,0x8817,/* 0x90-0x97 */
+	0x8818,0x8819,0x881A,0x881C,0x881D,0x881E,0x881F,0x8820,/* 0x98-0x9F */
+	0x8823,0x7A00,0x606F,0x5E0C,0x6089,0x819D,0x5915,0x60DC,/* 0xA0-0xA7 */
+	0x7184,0x70EF,0x6EAA,0x6C50,0x7280,0x6A84,0x88AD,0x5E2D,/* 0xA8-0xAF */
+	0x4E60,0x5AB3,0x559C,0x94E3,0x6D17,0x7CFB,0x9699,0x620F,/* 0xB0-0xB7 */
+	0x7EC6,0x778E,0x867E,0x5323,0x971E,0x8F96,0x6687,0x5CE1,/* 0xB8-0xBF */
+	0x4FA0,0x72ED,0x4E0B,0x53A6,0x590F,0x5413,0x6380,0x9528,/* 0xC0-0xC7 */
+	0x5148,0x4ED9,0x9C9C,0x7EA4,0x54B8,0x8D24,0x8854,0x8237,/* 0xC8-0xCF */
+	0x95F2,0x6D8E,0x5F26,0x5ACC,0x663E,0x9669,0x73B0,0x732E,/* 0xD0-0xD7 */
+	0x53BF,0x817A,0x9985,0x7FA1,0x5BAA,0x9677,0x9650,0x7EBF,/* 0xD8-0xDF */
+	0x76F8,0x53A2,0x9576,0x9999,0x7BB1,0x8944,0x6E58,0x4E61,/* 0xE0-0xE7 */
+	0x7FD4,0x7965,0x8BE6,0x60F3,0x54CD,0x4EAB,0x9879,0x5DF7,/* 0xE8-0xEF */
+	0x6A61,0x50CF,0x5411,0x8C61,0x8427,0x785D,0x9704,0x524A,/* 0xF0-0xF7 */
+	0x54EE,0x56A3,0x9500,0x6D88,0x5BB5,0x6DC6,0x6653,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8824,0x8825,0x8826,0x8827,0x8828,0x8829,0x882A,0x882B,/* 0x40-0x47 */
+	0x882C,0x882D,0x882E,0x882F,0x8830,0x8831,0x8833,0x8834,/* 0x48-0x4F */
+	0x8835,0x8836,0x8837,0x8838,0x883A,0x883B,0x883D,0x883E,/* 0x50-0x57 */
+	0x883F,0x8841,0x8842,0x8843,0x8846,0x8847,0x8848,0x8849,/* 0x58-0x5F */
+	0x884A,0x884B,0x884E,0x884F,0x8850,0x8851,0x8852,0x8853,/* 0x60-0x67 */
+	0x8855,0x8856,0x8858,0x885A,0x885B,0x885C,0x885D,0x885E,/* 0x68-0x6F */
+	0x885F,0x8860,0x8866,0x8867,0x886A,0x886D,0x886F,0x8871,/* 0x70-0x77 */
+	0x8873,0x8874,0x8875,0x8876,0x8878,0x8879,0x887A,0x0000,/* 0x78-0x7F */
+
+	0x887B,0x887C,0x8880,0x8883,0x8886,0x8887,0x8889,0x888A,/* 0x80-0x87 */
+	0x888C,0x888E,0x888F,0x8890,0x8891,0x8893,0x8894,0x8895,/* 0x88-0x8F */
+	0x8897,0x8898,0x8899,0x889A,0x889B,0x889D,0x889E,0x889F,/* 0x90-0x97 */
+	0x88A0,0x88A1,0x88A3,0x88A5,0x88A6,0x88A7,0x88A8,0x88A9,/* 0x98-0x9F */
+	0x88AA,0x5C0F,0x5B5D,0x6821,0x8096,0x5578,0x7B11,0x6548,/* 0xA0-0xA7 */
+	0x6954,0x4E9B,0x6B47,0x874E,0x978B,0x534F,0x631F,0x643A,/* 0xA8-0xAF */
+	0x90AA,0x659C,0x80C1,0x8C10,0x5199,0x68B0,0x5378,0x87F9,/* 0xB0-0xB7 */
+	0x61C8,0x6CC4,0x6CFB,0x8C22,0x5C51,0x85AA,0x82AF,0x950C,/* 0xB8-0xBF */
+	0x6B23,0x8F9B,0x65B0,0x5FFB,0x5FC3,0x4FE1,0x8845,0x661F,/* 0xC0-0xC7 */
+	0x8165,0x7329,0x60FA,0x5174,0x5211,0x578B,0x5F62,0x90A2,/* 0xC8-0xCF */
+	0x884C,0x9192,0x5E78,0x674F,0x6027,0x59D3,0x5144,0x51F6,/* 0xD0-0xD7 */
+	0x80F8,0x5308,0x6C79,0x96C4,0x718A,0x4F11,0x4FEE,0x7F9E,/* 0xD8-0xDF */
+	0x673D,0x55C5,0x9508,0x79C0,0x8896,0x7EE3,0x589F,0x620C,/* 0xE0-0xE7 */
+	0x9700,0x865A,0x5618,0x987B,0x5F90,0x8BB8,0x84C4,0x9157,/* 0xE8-0xEF */
+	0x53D9,0x65ED,0x5E8F,0x755C,0x6064,0x7D6E,0x5A7F,0x7EEA,/* 0xF0-0xF7 */
+	0x7EED,0x8F69,0x55A7,0x5BA3,0x60AC,0x65CB,0x7384,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x88AC,0x88AE,0x88AF,0x88B0,0x88B2,0x88B3,0x88B4,0x88B5,/* 0x40-0x47 */
+	0x88B6,0x88B8,0x88B9,0x88BA,0x88BB,0x88BD,0x88BE,0x88BF,/* 0x48-0x4F */
+	0x88C0,0x88C3,0x88C4,0x88C7,0x88C8,0x88CA,0x88CB,0x88CC,/* 0x50-0x57 */
+	0x88CD,0x88CF,0x88D0,0x88D1,0x88D3,0x88D6,0x88D7,0x88DA,/* 0x58-0x5F */
+	0x88DB,0x88DC,0x88DD,0x88DE,0x88E0,0x88E1,0x88E6,0x88E7,/* 0x60-0x67 */
+	0x88E9,0x88EA,0x88EB,0x88EC,0x88ED,0x88EE,0x88EF,0x88F2,/* 0x68-0x6F */
+	0x88F5,0x88F6,0x88F7,0x88FA,0x88FB,0x88FD,0x88FF,0x8900,/* 0x70-0x77 */
+	0x8901,0x8903,0x8904,0x8905,0x8906,0x8907,0x8908,0x0000,/* 0x78-0x7F */
+
+	0x8909,0x890B,0x890C,0x890D,0x890E,0x890F,0x8911,0x8914,/* 0x80-0x87 */
+	0x8915,0x8916,0x8917,0x8918,0x891C,0x891D,0x891E,0x891F,/* 0x88-0x8F */
+	0x8920,0x8922,0x8923,0x8924,0x8926,0x8927,0x8928,0x8929,/* 0x90-0x97 */
+	0x892C,0x892D,0x892E,0x892F,0x8931,0x8932,0x8933,0x8935,/* 0x98-0x9F */
+	0x8937,0x9009,0x7663,0x7729,0x7EDA,0x9774,0x859B,0x5B66,/* 0xA0-0xA7 */
+	0x7A74,0x96EA,0x8840,0x52CB,0x718F,0x5FAA,0x65EC,0x8BE2,/* 0xA8-0xAF */
+	0x5BFB,0x9A6F,0x5DE1,0x6B89,0x6C5B,0x8BAD,0x8BAF,0x900A,/* 0xB0-0xB7 */
+	0x8FC5,0x538B,0x62BC,0x9E26,0x9E2D,0x5440,0x4E2B,0x82BD,/* 0xB8-0xBF */
+	0x7259,0x869C,0x5D16,0x8859,0x6DAF,0x96C5,0x54D1,0x4E9A,/* 0xC0-0xC7 */
+	0x8BB6,0x7109,0x54BD,0x9609,0x70DF,0x6DF9,0x76D0,0x4E25,/* 0xC8-0xCF */
+	0x7814,0x8712,0x5CA9,0x5EF6,0x8A00,0x989C,0x960E,0x708E,/* 0xD0-0xD7 */
+	0x6CBF,0x5944,0x63A9,0x773C,0x884D,0x6F14,0x8273,0x5830,/* 0xD8-0xDF */
+	0x71D5,0x538C,0x781A,0x96C1,0x5501,0x5F66,0x7130,0x5BB4,/* 0xE0-0xE7 */
+	0x8C1A,0x9A8C,0x6B83,0x592E,0x9E2F,0x79E7,0x6768,0x626C,/* 0xE8-0xEF */
+	0x4F6F,0x75A1,0x7F8A,0x6D0B,0x9633,0x6C27,0x4EF0,0x75D2,/* 0xF0-0xF7 */
+	0x517B,0x6837,0x6F3E,0x9080,0x8170,0x5996,0x7476,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8938,0x8939,0x893A,0x893B,0x893C,0x893D,0x893E,0x893F,/* 0x40-0x47 */
+	0x8940,0x8942,0x8943,0x8945,0x8946,0x8947,0x8948,0x8949,/* 0x48-0x4F */
+	0x894A,0x894B,0x894C,0x894D,0x894E,0x894F,0x8950,0x8951,/* 0x50-0x57 */
+	0x8952,0x8953,0x8954,0x8955,0x8956,0x8957,0x8958,0x8959,/* 0x58-0x5F */
+	0x895A,0x895B,0x895C,0x895D,0x8960,0x8961,0x8962,0x8963,/* 0x60-0x67 */
+	0x8964,0x8965,0x8967,0x8968,0x8969,0x896A,0x896B,0x896C,/* 0x68-0x6F */
+	0x896D,0x896E,0x896F,0x8970,0x8971,0x8972,0x8973,0x8974,/* 0x70-0x77 */
+	0x8975,0x8976,0x8977,0x8978,0x8979,0x897A,0x897C,0x0000,/* 0x78-0x7F */
+
+	0x897D,0x897E,0x8980,0x8982,0x8984,0x8985,0x8987,0x8988,/* 0x80-0x87 */
+	0x8989,0x898A,0x898B,0x898C,0x898D,0x898E,0x898F,0x8990,/* 0x88-0x8F */
+	0x8991,0x8992,0x8993,0x8994,0x8995,0x8996,0x8997,0x8998,/* 0x90-0x97 */
+	0x8999,0x899A,0x899B,0x899C,0x899D,0x899E,0x899F,0x89A0,/* 0x98-0x9F */
+	0x89A1,0x6447,0x5C27,0x9065,0x7A91,0x8C23,0x59DA,0x54AC,/* 0xA0-0xA7 */
+	0x8200,0x836F,0x8981,0x8000,0x6930,0x564E,0x8036,0x7237,/* 0xA8-0xAF */
+	0x91CE,0x51B6,0x4E5F,0x9875,0x6396,0x4E1A,0x53F6,0x66F3,/* 0xB0-0xB7 */
+	0x814B,0x591C,0x6DB2,0x4E00,0x58F9,0x533B,0x63D6,0x94F1,/* 0xB8-0xBF */
+	0x4F9D,0x4F0A,0x8863,0x9890,0x5937,0x9057,0x79FB,0x4EEA,/* 0xC0-0xC7 */
+	0x80F0,0x7591,0x6C82,0x5B9C,0x59E8,0x5F5D,0x6905,0x8681,/* 0xC8-0xCF */
+	0x501A,0x5DF2,0x4E59,0x77E3,0x4EE5,0x827A,0x6291,0x6613,/* 0xD0-0xD7 */
+	0x9091,0x5C79,0x4EBF,0x5F79,0x81C6,0x9038,0x8084,0x75AB,/* 0xD8-0xDF */
+	0x4EA6,0x88D4,0x610F,0x6BC5,0x5FC6,0x4E49,0x76CA,0x6EA2,/* 0xE0-0xE7 */
+	0x8BE3,0x8BAE,0x8C0A,0x8BD1,0x5F02,0x7FFC,0x7FCC,0x7ECE,/* 0xE8-0xEF */
+	0x8335,0x836B,0x56E0,0x6BB7,0x97F3,0x9634,0x59FB,0x541F,/* 0xF0-0xF7 */
+	0x94F6,0x6DEB,0x5BC5,0x996E,0x5C39,0x5F15,0x9690,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x89A2,0x89A3,0x89A4,0x89A5,0x89A6,0x89A7,0x89A8,0x89A9,/* 0x40-0x47 */
+	0x89AA,0x89AB,0x89AC,0x89AD,0x89AE,0x89AF,0x89B0,0x89B1,/* 0x48-0x4F */
+	0x89B2,0x89B3,0x89B4,0x89B5,0x89B6,0x89B7,0x89B8,0x89B9,/* 0x50-0x57 */
+	0x89BA,0x89BB,0x89BC,0x89BD,0x89BE,0x89BF,0x89C0,0x89C3,/* 0x58-0x5F */
+	0x89CD,0x89D3,0x89D4,0x89D5,0x89D7,0x89D8,0x89D9,0x89DB,/* 0x60-0x67 */
+	0x89DD,0x89DF,0x89E0,0x89E1,0x89E2,0x89E4,0x89E7,0x89E8,/* 0x68-0x6F */
+	0x89E9,0x89EA,0x89EC,0x89ED,0x89EE,0x89F0,0x89F1,0x89F2,/* 0x70-0x77 */
+	0x89F4,0x89F5,0x89F6,0x89F7,0x89F8,0x89F9,0x89FA,0x0000,/* 0x78-0x7F */
+
+	0x89FB,0x89FC,0x89FD,0x89FE,0x89FF,0x8A01,0x8A02,0x8A03,/* 0x80-0x87 */
+	0x8A04,0x8A05,0x8A06,0x8A08,0x8A09,0x8A0A,0x8A0B,0x8A0C,/* 0x88-0x8F */
+	0x8A0D,0x8A0E,0x8A0F,0x8A10,0x8A11,0x8A12,0x8A13,0x8A14,/* 0x90-0x97 */
+	0x8A15,0x8A16,0x8A17,0x8A18,0x8A19,0x8A1A,0x8A1B,0x8A1C,/* 0x98-0x9F */
+	0x8A1D,0x5370,0x82F1,0x6A31,0x5A74,0x9E70,0x5E94,0x7F28,/* 0xA0-0xA7 */
+	0x83B9,0x8424,0x8425,0x8367,0x8747,0x8FCE,0x8D62,0x76C8,/* 0xA8-0xAF */
+	0x5F71,0x9896,0x786C,0x6620,0x54DF,0x62E5,0x4F63,0x81C3,/* 0xB0-0xB7 */
+	0x75C8,0x5EB8,0x96CD,0x8E0A,0x86F9,0x548F,0x6CF3,0x6D8C,/* 0xB8-0xBF */
+	0x6C38,0x607F,0x52C7,0x7528,0x5E7D,0x4F18,0x60A0,0x5FE7,/* 0xC0-0xC7 */
+	0x5C24,0x7531,0x90AE,0x94C0,0x72B9,0x6CB9,0x6E38,0x9149,/* 0xC8-0xCF */
+	0x6709,0x53CB,0x53F3,0x4F51,0x91C9,0x8BF1,0x53C8,0x5E7C,/* 0xD0-0xD7 */
+	0x8FC2,0x6DE4,0x4E8E,0x76C2,0x6986,0x865E,0x611A,0x8206,/* 0xD8-0xDF */
+	0x4F59,0x4FDE,0x903E,0x9C7C,0x6109,0x6E1D,0x6E14,0x9685,/* 0xE0-0xE7 */
+	0x4E88,0x5A31,0x96E8,0x4E0E,0x5C7F,0x79B9,0x5B87,0x8BED,/* 0xE8-0xEF */
+	0x7FBD,0x7389,0x57DF,0x828B,0x90C1,0x5401,0x9047,0x55BB,/* 0xF0-0xF7 */
+	0x5CEA,0x5FA1,0x6108,0x6B32,0x72F1,0x80B2,0x8A89,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8A1E,0x8A1F,0x8A20,0x8A21,0x8A22,0x8A23,0x8A24,0x8A25,/* 0x40-0x47 */
+	0x8A26,0x8A27,0x8A28,0x8A29,0x8A2A,0x8A2B,0x8A2C,0x8A2D,/* 0x48-0x4F */
+	0x8A2E,0x8A2F,0x8A30,0x8A31,0x8A32,0x8A33,0x8A34,0x8A35,/* 0x50-0x57 */
+	0x8A36,0x8A37,0x8A38,0x8A39,0x8A3A,0x8A3B,0x8A3C,0x8A3D,/* 0x58-0x5F */
+	0x8A3F,0x8A40,0x8A41,0x8A42,0x8A43,0x8A44,0x8A45,0x8A46,/* 0x60-0x67 */
+	0x8A47,0x8A49,0x8A4A,0x8A4B,0x8A4C,0x8A4D,0x8A4E,0x8A4F,/* 0x68-0x6F */
+	0x8A50,0x8A51,0x8A52,0x8A53,0x8A54,0x8A55,0x8A56,0x8A57,/* 0x70-0x77 */
+	0x8A58,0x8A59,0x8A5A,0x8A5B,0x8A5C,0x8A5D,0x8A5E,0x0000,/* 0x78-0x7F */
+
+	0x8A5F,0x8A60,0x8A61,0x8A62,0x8A63,0x8A64,0x8A65,0x8A66,/* 0x80-0x87 */
+	0x8A67,0x8A68,0x8A69,0x8A6A,0x8A6B,0x8A6C,0x8A6D,0x8A6E,/* 0x88-0x8F */
+	0x8A6F,0x8A70,0x8A71,0x8A72,0x8A73,0x8A74,0x8A75,0x8A76,/* 0x90-0x97 */
+	0x8A77,0x8A78,0x8A7A,0x8A7B,0x8A7C,0x8A7D,0x8A7E,0x8A7F,/* 0x98-0x9F */
+	0x8A80,0x6D74,0x5BD3,0x88D5,0x9884,0x8C6B,0x9A6D,0x9E33,/* 0xA0-0xA7 */
+	0x6E0A,0x51A4,0x5143,0x57A3,0x8881,0x539F,0x63F4,0x8F95,/* 0xA8-0xAF */
+	0x56ED,0x5458,0x5706,0x733F,0x6E90,0x7F18,0x8FDC,0x82D1,/* 0xB0-0xB7 */
+	0x613F,0x6028,0x9662,0x66F0,0x7EA6,0x8D8A,0x8DC3,0x94A5,/* 0xB8-0xBF */
+	0x5CB3,0x7CA4,0x6708,0x60A6,0x9605,0x8018,0x4E91,0x90E7,/* 0xC0-0xC7 */
+	0x5300,0x9668,0x5141,0x8FD0,0x8574,0x915D,0x6655,0x97F5,/* 0xC8-0xCF */
+	0x5B55,0x531D,0x7838,0x6742,0x683D,0x54C9,0x707E,0x5BB0,/* 0xD0-0xD7 */
+	0x8F7D,0x518D,0x5728,0x54B1,0x6512,0x6682,0x8D5E,0x8D43,/* 0xD8-0xDF */
+	0x810F,0x846C,0x906D,0x7CDF,0x51FF,0x85FB,0x67A3,0x65E9,/* 0xE0-0xE7 */
+	0x6FA1,0x86A4,0x8E81,0x566A,0x9020,0x7682,0x7076,0x71E5,/* 0xE8-0xEF */
+	0x8D23,0x62E9,0x5219,0x6CFD,0x8D3C,0x600E,0x589E,0x618E,/* 0xF0-0xF7 */
+	0x66FE,0x8D60,0x624E,0x55B3,0x6E23,0x672D,0x8F67,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8A81,0x8A82,0x8A83,0x8A84,0x8A85,0x8A86,0x8A87,0x8A88,/* 0x40-0x47 */
+	0x8A8B,0x8A8C,0x8A8D,0x8A8E,0x8A8F,0x8A90,0x8A91,0x8A92,/* 0x48-0x4F */
+	0x8A94,0x8A95,0x8A96,0x8A97,0x8A98,0x8A99,0x8A9A,0x8A9B,/* 0x50-0x57 */
+	0x8A9C,0x8A9D,0x8A9E,0x8A9F,0x8AA0,0x8AA1,0x8AA2,0x8AA3,/* 0x58-0x5F */
+	0x8AA4,0x8AA5,0x8AA6,0x8AA7,0x8AA8,0x8AA9,0x8AAA,0x8AAB,/* 0x60-0x67 */
+	0x8AAC,0x8AAD,0x8AAE,0x8AAF,0x8AB0,0x8AB1,0x8AB2,0x8AB3,/* 0x68-0x6F */
+	0x8AB4,0x8AB5,0x8AB6,0x8AB7,0x8AB8,0x8AB9,0x8ABA,0x8ABB,/* 0x70-0x77 */
+	0x8ABC,0x8ABD,0x8ABE,0x8ABF,0x8AC0,0x8AC1,0x8AC2,0x0000,/* 0x78-0x7F */
+
+	0x8AC3,0x8AC4,0x8AC5,0x8AC6,0x8AC7,0x8AC8,0x8AC9,0x8ACA,/* 0x80-0x87 */
+	0x8ACB,0x8ACC,0x8ACD,0x8ACE,0x8ACF,0x8AD0,0x8AD1,0x8AD2,/* 0x88-0x8F */
+	0x8AD3,0x8AD4,0x8AD5,0x8AD6,0x8AD7,0x8AD8,0x8AD9,0x8ADA,/* 0x90-0x97 */
+	0x8ADB,0x8ADC,0x8ADD,0x8ADE,0x8ADF,0x8AE0,0x8AE1,0x8AE2,/* 0x98-0x9F */
+	0x8AE3,0x94E1,0x95F8,0x7728,0x6805,0x69A8,0x548B,0x4E4D,/* 0xA0-0xA7 */
+	0x70B8,0x8BC8,0x6458,0x658B,0x5B85,0x7A84,0x503A,0x5BE8,/* 0xA8-0xAF */
+	0x77BB,0x6BE1,0x8A79,0x7C98,0x6CBE,0x76CF,0x65A9,0x8F97,/* 0xB0-0xB7 */
+	0x5D2D,0x5C55,0x8638,0x6808,0x5360,0x6218,0x7AD9,0x6E5B,/* 0xB8-0xBF */
+	0x7EFD,0x6A1F,0x7AE0,0x5F70,0x6F33,0x5F20,0x638C,0x6DA8,/* 0xC0-0xC7 */
+	0x6756,0x4E08,0x5E10,0x8D26,0x4ED7,0x80C0,0x7634,0x969C,/* 0xC8-0xCF */
+	0x62DB,0x662D,0x627E,0x6CBC,0x8D75,0x7167,0x7F69,0x5146,/* 0xD0-0xD7 */
+	0x8087,0x53EC,0x906E,0x6298,0x54F2,0x86F0,0x8F99,0x8005,/* 0xD8-0xDF */
+	0x9517,0x8517,0x8FD9,0x6D59,0x73CD,0x659F,0x771F,0x7504,/* 0xE0-0xE7 */
+	0x7827,0x81FB,0x8D1E,0x9488,0x4FA6,0x6795,0x75B9,0x8BCA,/* 0xE8-0xEF */
+	0x9707,0x632F,0x9547,0x9635,0x84B8,0x6323,0x7741,0x5F81,/* 0xF0-0xF7 */
+	0x72F0,0x4E89,0x6014,0x6574,0x62EF,0x6B63,0x653F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8AE4,0x8AE5,0x8AE6,0x8AE7,0x8AE8,0x8AE9,0x8AEA,0x8AEB,/* 0x40-0x47 */
+	0x8AEC,0x8AED,0x8AEE,0x8AEF,0x8AF0,0x8AF1,0x8AF2,0x8AF3,/* 0x48-0x4F */
+	0x8AF4,0x8AF5,0x8AF6,0x8AF7,0x8AF8,0x8AF9,0x8AFA,0x8AFB,/* 0x50-0x57 */
+	0x8AFC,0x8AFD,0x8AFE,0x8AFF,0x8B00,0x8B01,0x8B02,0x8B03,/* 0x58-0x5F */
+	0x8B04,0x8B05,0x8B06,0x8B08,0x8B09,0x8B0A,0x8B0B,0x8B0C,/* 0x60-0x67 */
+	0x8B0D,0x8B0E,0x8B0F,0x8B10,0x8B11,0x8B12,0x8B13,0x8B14,/* 0x68-0x6F */
+	0x8B15,0x8B16,0x8B17,0x8B18,0x8B19,0x8B1A,0x8B1B,0x8B1C,/* 0x70-0x77 */
+	0x8B1D,0x8B1E,0x8B1F,0x8B20,0x8B21,0x8B22,0x8B23,0x0000,/* 0x78-0x7F */
+
+	0x8B24,0x8B25,0x8B27,0x8B28,0x8B29,0x8B2A,0x8B2B,0x8B2C,/* 0x80-0x87 */
+	0x8B2D,0x8B2E,0x8B2F,0x8B30,0x8B31,0x8B32,0x8B33,0x8B34,/* 0x88-0x8F */
+	0x8B35,0x8B36,0x8B37,0x8B38,0x8B39,0x8B3A,0x8B3B,0x8B3C,/* 0x90-0x97 */
+	0x8B3D,0x8B3E,0x8B3F,0x8B40,0x8B41,0x8B42,0x8B43,0x8B44,/* 0x98-0x9F */
+	0x8B45,0x5E27,0x75C7,0x90D1,0x8BC1,0x829D,0x679D,0x652F,/* 0xA0-0xA7 */
+	0x5431,0x8718,0x77E5,0x80A2,0x8102,0x6C41,0x4E4B,0x7EC7,/* 0xA8-0xAF */
+	0x804C,0x76F4,0x690D,0x6B96,0x6267,0x503C,0x4F84,0x5740,/* 0xB0-0xB7 */
+	0x6307,0x6B62,0x8DBE,0x53EA,0x65E8,0x7EB8,0x5FD7,0x631A,/* 0xB8-0xBF */
+	0x63B7,0x81F3,0x81F4,0x7F6E,0x5E1C,0x5CD9,0x5236,0x667A,/* 0xC0-0xC7 */
+	0x79E9,0x7A1A,0x8D28,0x7099,0x75D4,0x6EDE,0x6CBB,0x7A92,/* 0xC8-0xCF */
+	0x4E2D,0x76C5,0x5FE0,0x949F,0x8877,0x7EC8,0x79CD,0x80BF,/* 0xD0-0xD7 */
+	0x91CD,0x4EF2,0x4F17,0x821F,0x5468,0x5DDE,0x6D32,0x8BCC,/* 0xD8-0xDF */
+	0x7CA5,0x8F74,0x8098,0x5E1A,0x5492,0x76B1,0x5B99,0x663C,/* 0xE0-0xE7 */
+	0x9AA4,0x73E0,0x682A,0x86DB,0x6731,0x732A,0x8BF8,0x8BDB,/* 0xE8-0xEF */
+	0x9010,0x7AF9,0x70DB,0x716E,0x62C4,0x77A9,0x5631,0x4E3B,/* 0xF0-0xF7 */
+	0x8457,0x67F1,0x52A9,0x86C0,0x8D2E,0x94F8,0x7B51,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8B46,0x8B47,0x8B48,0x8B49,0x8B4A,0x8B4B,0x8B4C,0x8B4D,/* 0x40-0x47 */
+	0x8B4E,0x8B4F,0x8B50,0x8B51,0x8B52,0x8B53,0x8B54,0x8B55,/* 0x48-0x4F */
+	0x8B56,0x8B57,0x8B58,0x8B59,0x8B5A,0x8B5B,0x8B5C,0x8B5D,/* 0x50-0x57 */
+	0x8B5E,0x8B5F,0x8B60,0x8B61,0x8B62,0x8B63,0x8B64,0x8B65,/* 0x58-0x5F */
+	0x8B67,0x8B68,0x8B69,0x8B6A,0x8B6B,0x8B6D,0x8B6E,0x8B6F,/* 0x60-0x67 */
+	0x8B70,0x8B71,0x8B72,0x8B73,0x8B74,0x8B75,0x8B76,0x8B77,/* 0x68-0x6F */
+	0x8B78,0x8B79,0x8B7A,0x8B7B,0x8B7C,0x8B7D,0x8B7E,0x8B7F,/* 0x70-0x77 */
+	0x8B80,0x8B81,0x8B82,0x8B83,0x8B84,0x8B85,0x8B86,0x0000,/* 0x78-0x7F */
+
+	0x8B87,0x8B88,0x8B89,0x8B8A,0x8B8B,0x8B8C,0x8B8D,0x8B8E,/* 0x80-0x87 */
+	0x8B8F,0x8B90,0x8B91,0x8B92,0x8B93,0x8B94,0x8B95,0x8B96,/* 0x88-0x8F */
+	0x8B97,0x8B98,0x8B99,0x8B9A,0x8B9B,0x8B9C,0x8B9D,0x8B9E,/* 0x90-0x97 */
+	0x8B9F,0x8BAC,0x8BB1,0x8BBB,0x8BC7,0x8BD0,0x8BEA,0x8C09,/* 0x98-0x9F */
+	0x8C1E,0x4F4F,0x6CE8,0x795D,0x9A7B,0x6293,0x722A,0x62FD,/* 0xA0-0xA7 */
+	0x4E13,0x7816,0x8F6C,0x64B0,0x8D5A,0x7BC6,0x6869,0x5E84,/* 0xA8-0xAF */
+	0x88C5,0x5986,0x649E,0x58EE,0x72B6,0x690E,0x9525,0x8FFD,/* 0xB0-0xB7 */
+	0x8D58,0x5760,0x7F00,0x8C06,0x51C6,0x6349,0x62D9,0x5353,/* 0xB8-0xBF */
+	0x684C,0x7422,0x8301,0x914C,0x5544,0x7740,0x707C,0x6D4A,/* 0xC0-0xC7 */
+	0x5179,0x54A8,0x8D44,0x59FF,0x6ECB,0x6DC4,0x5B5C,0x7D2B,/* 0xC8-0xCF */
+	0x4ED4,0x7C7D,0x6ED3,0x5B50,0x81EA,0x6E0D,0x5B57,0x9B03,/* 0xD0-0xD7 */
+	0x68D5,0x8E2A,0x5B97,0x7EFC,0x603B,0x7EB5,0x90B9,0x8D70,/* 0xD8-0xDF */
+	0x594F,0x63CD,0x79DF,0x8DB3,0x5352,0x65CF,0x7956,0x8BC5,/* 0xE0-0xE7 */
+	0x963B,0x7EC4,0x94BB,0x7E82,0x5634,0x9189,0x6700,0x7F6A,/* 0xE8-0xEF */
+	0x5C0A,0x9075,0x6628,0x5DE6,0x4F50,0x67DE,0x505A,0x4F5C,/* 0xF0-0xF7 */
+	0x5750,0x5EA7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8C38,0x8C39,0x8C3A,0x8C3B,0x8C3C,0x8C3D,0x8C3E,0x8C3F,/* 0x40-0x47 */
+	0x8C40,0x8C42,0x8C43,0x8C44,0x8C45,0x8C48,0x8C4A,0x8C4B,/* 0x48-0x4F */
+	0x8C4D,0x8C4E,0x8C4F,0x8C50,0x8C51,0x8C52,0x8C53,0x8C54,/* 0x50-0x57 */
+	0x8C56,0x8C57,0x8C58,0x8C59,0x8C5B,0x8C5C,0x8C5D,0x8C5E,/* 0x58-0x5F */
+	0x8C5F,0x8C60,0x8C63,0x8C64,0x8C65,0x8C66,0x8C67,0x8C68,/* 0x60-0x67 */
+	0x8C69,0x8C6C,0x8C6D,0x8C6E,0x8C6F,0x8C70,0x8C71,0x8C72,/* 0x68-0x6F */
+	0x8C74,0x8C75,0x8C76,0x8C77,0x8C7B,0x8C7C,0x8C7D,0x8C7E,/* 0x70-0x77 */
+	0x8C7F,0x8C80,0x8C81,0x8C83,0x8C84,0x8C86,0x8C87,0x0000,/* 0x78-0x7F */
+
+	0x8C88,0x8C8B,0x8C8D,0x8C8E,0x8C8F,0x8C90,0x8C91,0x8C92,/* 0x80-0x87 */
+	0x8C93,0x8C95,0x8C96,0x8C97,0x8C99,0x8C9A,0x8C9B,0x8C9C,/* 0x88-0x8F */
+	0x8C9D,0x8C9E,0x8C9F,0x8CA0,0x8CA1,0x8CA2,0x8CA3,0x8CA4,/* 0x90-0x97 */
+	0x8CA5,0x8CA6,0x8CA7,0x8CA8,0x8CA9,0x8CAA,0x8CAB,0x8CAC,/* 0x98-0x9F */
+	0x8CAD,0x4E8D,0x4E0C,0x5140,0x4E10,0x5EFF,0x5345,0x4E15,/* 0xA0-0xA7 */
+	0x4E98,0x4E1E,0x9B32,0x5B6C,0x5669,0x4E28,0x79BA,0x4E3F,/* 0xA8-0xAF */
+	0x5315,0x4E47,0x592D,0x723B,0x536E,0x6C10,0x56DF,0x80E4,/* 0xB0-0xB7 */
+	0x9997,0x6BD3,0x777E,0x9F17,0x4E36,0x4E9F,0x9F10,0x4E5C,/* 0xB8-0xBF */
+	0x4E69,0x4E93,0x8288,0x5B5B,0x556C,0x560F,0x4EC4,0x538D,/* 0xC0-0xC7 */
+	0x539D,0x53A3,0x53A5,0x53AE,0x9765,0x8D5D,0x531A,0x53F5,/* 0xC8-0xCF */
+	0x5326,0x532E,0x533E,0x8D5C,0x5366,0x5363,0x5202,0x5208,/* 0xD0-0xD7 */
+	0x520E,0x522D,0x5233,0x523F,0x5240,0x524C,0x525E,0x5261,/* 0xD8-0xDF */
+	0x525C,0x84AF,0x527D,0x5282,0x5281,0x5290,0x5293,0x5182,/* 0xE0-0xE7 */
+	0x7F54,0x4EBB,0x4EC3,0x4EC9,0x4EC2,0x4EE8,0x4EE1,0x4EEB,/* 0xE8-0xEF */
+	0x4EDE,0x4F1B,0x4EF3,0x4F22,0x4F64,0x4EF5,0x4F25,0x4F27,/* 0xF0-0xF7 */
+	0x4F09,0x4F2B,0x4F5E,0x4F67,0x6538,0x4F5A,0x4F5D,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8CAE,0x8CAF,0x8CB0,0x8CB1,0x8CB2,0x8CB3,0x8CB4,0x8CB5,/* 0x40-0x47 */
+	0x8CB6,0x8CB7,0x8CB8,0x8CB9,0x8CBA,0x8CBB,0x8CBC,0x8CBD,/* 0x48-0x4F */
+	0x8CBE,0x8CBF,0x8CC0,0x8CC1,0x8CC2,0x8CC3,0x8CC4,0x8CC5,/* 0x50-0x57 */
+	0x8CC6,0x8CC7,0x8CC8,0x8CC9,0x8CCA,0x8CCB,0x8CCC,0x8CCD,/* 0x58-0x5F */
+	0x8CCE,0x8CCF,0x8CD0,0x8CD1,0x8CD2,0x8CD3,0x8CD4,0x8CD5,/* 0x60-0x67 */
+	0x8CD6,0x8CD7,0x8CD8,0x8CD9,0x8CDA,0x8CDB,0x8CDC,0x8CDD,/* 0x68-0x6F */
+	0x8CDE,0x8CDF,0x8CE0,0x8CE1,0x8CE2,0x8CE3,0x8CE4,0x8CE5,/* 0x70-0x77 */
+	0x8CE6,0x8CE7,0x8CE8,0x8CE9,0x8CEA,0x8CEB,0x8CEC,0x0000,/* 0x78-0x7F */
+
+	0x8CED,0x8CEE,0x8CEF,0x8CF0,0x8CF1,0x8CF2,0x8CF3,0x8CF4,/* 0x80-0x87 */
+	0x8CF5,0x8CF6,0x8CF7,0x8CF8,0x8CF9,0x8CFA,0x8CFB,0x8CFC,/* 0x88-0x8F */
+	0x8CFD,0x8CFE,0x8CFF,0x8D00,0x8D01,0x8D02,0x8D03,0x8D04,/* 0x90-0x97 */
+	0x8D05,0x8D06,0x8D07,0x8D08,0x8D09,0x8D0A,0x8D0B,0x8D0C,/* 0x98-0x9F */
+	0x8D0D,0x4F5F,0x4F57,0x4F32,0x4F3D,0x4F76,0x4F74,0x4F91,/* 0xA0-0xA7 */
+	0x4F89,0x4F83,0x4F8F,0x4F7E,0x4F7B,0x4FAA,0x4F7C,0x4FAC,/* 0xA8-0xAF */
+	0x4F94,0x4FE6,0x4FE8,0x4FEA,0x4FC5,0x4FDA,0x4FE3,0x4FDC,/* 0xB0-0xB7 */
+	0x4FD1,0x4FDF,0x4FF8,0x5029,0x504C,0x4FF3,0x502C,0x500F,/* 0xB8-0xBF */
+	0x502E,0x502D,0x4FFE,0x501C,0x500C,0x5025,0x5028,0x507E,/* 0xC0-0xC7 */
+	0x5043,0x5055,0x5048,0x504E,0x506C,0x507B,0x50A5,0x50A7,/* 0xC8-0xCF */
+	0x50A9,0x50BA,0x50D6,0x5106,0x50ED,0x50EC,0x50E6,0x50EE,/* 0xD0-0xD7 */
+	0x5107,0x510B,0x4EDD,0x6C3D,0x4F58,0x4F65,0x4FCE,0x9FA0,/* 0xD8-0xDF */
+	0x6C46,0x7C74,0x516E,0x5DFD,0x9EC9,0x9998,0x5181,0x5914,/* 0xE0-0xE7 */
+	0x52F9,0x530D,0x8A07,0x5310,0x51EB,0x5919,0x5155,0x4EA0,/* 0xE8-0xEF */
+	0x5156,0x4EB3,0x886E,0x88A4,0x4EB5,0x8114,0x88D2,0x7980,/* 0xF0-0xF7 */
+	0x5B34,0x8803,0x7FB8,0x51AB,0x51B1,0x51BD,0x51BC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8D0E,0x8D0F,0x8D10,0x8D11,0x8D12,0x8D13,0x8D14,0x8D15,/* 0x40-0x47 */
+	0x8D16,0x8D17,0x8D18,0x8D19,0x8D1A,0x8D1B,0x8D1C,0x8D20,/* 0x48-0x4F */
+	0x8D51,0x8D52,0x8D57,0x8D5F,0x8D65,0x8D68,0x8D69,0x8D6A,/* 0x50-0x57 */
+	0x8D6C,0x8D6E,0x8D6F,0x8D71,0x8D72,0x8D78,0x8D79,0x8D7A,/* 0x58-0x5F */
+	0x8D7B,0x8D7C,0x8D7D,0x8D7E,0x8D7F,0x8D80,0x8D82,0x8D83,/* 0x60-0x67 */
+	0x8D86,0x8D87,0x8D88,0x8D89,0x8D8C,0x8D8D,0x8D8E,0x8D8F,/* 0x68-0x6F */
+	0x8D90,0x8D92,0x8D93,0x8D95,0x8D96,0x8D97,0x8D98,0x8D99,/* 0x70-0x77 */
+	0x8D9A,0x8D9B,0x8D9C,0x8D9D,0x8D9E,0x8DA0,0x8DA1,0x0000,/* 0x78-0x7F */
+
+	0x8DA2,0x8DA4,0x8DA5,0x8DA6,0x8DA7,0x8DA8,0x8DA9,0x8DAA,/* 0x80-0x87 */
+	0x8DAB,0x8DAC,0x8DAD,0x8DAE,0x8DAF,0x8DB0,0x8DB2,0x8DB6,/* 0x88-0x8F */
+	0x8DB7,0x8DB9,0x8DBB,0x8DBD,0x8DC0,0x8DC1,0x8DC2,0x8DC5,/* 0x90-0x97 */
+	0x8DC7,0x8DC8,0x8DC9,0x8DCA,0x8DCD,0x8DD0,0x8DD2,0x8DD3,/* 0x98-0x9F */
+	0x8DD4,0x51C7,0x5196,0x51A2,0x51A5,0x8BA0,0x8BA6,0x8BA7,/* 0xA0-0xA7 */
+	0x8BAA,0x8BB4,0x8BB5,0x8BB7,0x8BC2,0x8BC3,0x8BCB,0x8BCF,/* 0xA8-0xAF */
+	0x8BCE,0x8BD2,0x8BD3,0x8BD4,0x8BD6,0x8BD8,0x8BD9,0x8BDC,/* 0xB0-0xB7 */
+	0x8BDF,0x8BE0,0x8BE4,0x8BE8,0x8BE9,0x8BEE,0x8BF0,0x8BF3,/* 0xB8-0xBF */
+	0x8BF6,0x8BF9,0x8BFC,0x8BFF,0x8C00,0x8C02,0x8C04,0x8C07,/* 0xC0-0xC7 */
+	0x8C0C,0x8C0F,0x8C11,0x8C12,0x8C14,0x8C15,0x8C16,0x8C19,/* 0xC8-0xCF */
+	0x8C1B,0x8C18,0x8C1D,0x8C1F,0x8C20,0x8C21,0x8C25,0x8C27,/* 0xD0-0xD7 */
+	0x8C2A,0x8C2B,0x8C2E,0x8C2F,0x8C32,0x8C33,0x8C35,0x8C36,/* 0xD8-0xDF */
+	0x5369,0x537A,0x961D,0x9622,0x9621,0x9631,0x962A,0x963D,/* 0xE0-0xE7 */
+	0x963C,0x9642,0x9649,0x9654,0x965F,0x9667,0x966C,0x9672,/* 0xE8-0xEF */
+	0x9674,0x9688,0x968D,0x9697,0x96B0,0x9097,0x909B,0x909D,/* 0xF0-0xF7 */
+	0x9099,0x90AC,0x90A1,0x90B4,0x90B3,0x90B6,0x90BA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8DD5,0x8DD8,0x8DD9,0x8DDC,0x8DE0,0x8DE1,0x8DE2,0x8DE5,/* 0x40-0x47 */
+	0x8DE6,0x8DE7,0x8DE9,0x8DED,0x8DEE,0x8DF0,0x8DF1,0x8DF2,/* 0x48-0x4F */
+	0x8DF4,0x8DF6,0x8DFC,0x8DFE,0x8DFF,0x8E00,0x8E01,0x8E02,/* 0x50-0x57 */
+	0x8E03,0x8E04,0x8E06,0x8E07,0x8E08,0x8E0B,0x8E0D,0x8E0E,/* 0x58-0x5F */
+	0x8E10,0x8E11,0x8E12,0x8E13,0x8E15,0x8E16,0x8E17,0x8E18,/* 0x60-0x67 */
+	0x8E19,0x8E1A,0x8E1B,0x8E1C,0x8E20,0x8E21,0x8E24,0x8E25,/* 0x68-0x6F */
+	0x8E26,0x8E27,0x8E28,0x8E2B,0x8E2D,0x8E30,0x8E32,0x8E33,/* 0x70-0x77 */
+	0x8E34,0x8E36,0x8E37,0x8E38,0x8E3B,0x8E3C,0x8E3E,0x0000,/* 0x78-0x7F */
+
+	0x8E3F,0x8E43,0x8E45,0x8E46,0x8E4C,0x8E4D,0x8E4E,0x8E4F,/* 0x80-0x87 */
+	0x8E50,0x8E53,0x8E54,0x8E55,0x8E56,0x8E57,0x8E58,0x8E5A,/* 0x88-0x8F */
+	0x8E5B,0x8E5C,0x8E5D,0x8E5E,0x8E5F,0x8E60,0x8E61,0x8E62,/* 0x90-0x97 */
+	0x8E63,0x8E64,0x8E65,0x8E67,0x8E68,0x8E6A,0x8E6B,0x8E6E,/* 0x98-0x9F */
+	0x8E71,0x90B8,0x90B0,0x90CF,0x90C5,0x90BE,0x90D0,0x90C4,/* 0xA0-0xA7 */
+	0x90C7,0x90D3,0x90E6,0x90E2,0x90DC,0x90D7,0x90DB,0x90EB,/* 0xA8-0xAF */
+	0x90EF,0x90FE,0x9104,0x9122,0x911E,0x9123,0x9131,0x912F,/* 0xB0-0xB7 */
+	0x9139,0x9143,0x9146,0x520D,0x5942,0x52A2,0x52AC,0x52AD,/* 0xB8-0xBF */
+	0x52BE,0x54FF,0x52D0,0x52D6,0x52F0,0x53DF,0x71EE,0x77CD,/* 0xC0-0xC7 */
+	0x5EF4,0x51F5,0x51FC,0x9B2F,0x53B6,0x5F01,0x755A,0x5DEF,/* 0xC8-0xCF */
+	0x574C,0x57A9,0x57A1,0x587E,0x58BC,0x58C5,0x58D1,0x5729,/* 0xD0-0xD7 */
+	0x572C,0x572A,0x5733,0x5739,0x572E,0x572F,0x575C,0x573B,/* 0xD8-0xDF */
+	0x5742,0x5769,0x5785,0x576B,0x5786,0x577C,0x577B,0x5768,/* 0xE0-0xE7 */
+	0x576D,0x5776,0x5773,0x57AD,0x57A4,0x578C,0x57B2,0x57CF,/* 0xE8-0xEF */
+	0x57A7,0x57B4,0x5793,0x57A0,0x57D5,0x57D8,0x57DA,0x57D9,/* 0xF0-0xF7 */
+	0x57D2,0x57B8,0x57F4,0x57EF,0x57F8,0x57E4,0x57DD,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8E73,0x8E75,0x8E77,0x8E78,0x8E79,0x8E7A,0x8E7B,0x8E7D,/* 0x40-0x47 */
+	0x8E7E,0x8E80,0x8E82,0x8E83,0x8E84,0x8E86,0x8E88,0x8E89,/* 0x48-0x4F */
+	0x8E8A,0x8E8B,0x8E8C,0x8E8D,0x8E8E,0x8E91,0x8E92,0x8E93,/* 0x50-0x57 */
+	0x8E95,0x8E96,0x8E97,0x8E98,0x8E99,0x8E9A,0x8E9B,0x8E9D,/* 0x58-0x5F */
+	0x8E9F,0x8EA0,0x8EA1,0x8EA2,0x8EA3,0x8EA4,0x8EA5,0x8EA6,/* 0x60-0x67 */
+	0x8EA7,0x8EA8,0x8EA9,0x8EAA,0x8EAD,0x8EAE,0x8EB0,0x8EB1,/* 0x68-0x6F */
+	0x8EB3,0x8EB4,0x8EB5,0x8EB6,0x8EB7,0x8EB8,0x8EB9,0x8EBB,/* 0x70-0x77 */
+	0x8EBC,0x8EBD,0x8EBE,0x8EBF,0x8EC0,0x8EC1,0x8EC2,0x0000,/* 0x78-0x7F */
+
+	0x8EC3,0x8EC4,0x8EC5,0x8EC6,0x8EC7,0x8EC8,0x8EC9,0x8ECA,/* 0x80-0x87 */
+	0x8ECB,0x8ECC,0x8ECD,0x8ECF,0x8ED0,0x8ED1,0x8ED2,0x8ED3,/* 0x88-0x8F */
+	0x8ED4,0x8ED5,0x8ED6,0x8ED7,0x8ED8,0x8ED9,0x8EDA,0x8EDB,/* 0x90-0x97 */
+	0x8EDC,0x8EDD,0x8EDE,0x8EDF,0x8EE0,0x8EE1,0x8EE2,0x8EE3,/* 0x98-0x9F */
+	0x8EE4,0x580B,0x580D,0x57FD,0x57ED,0x5800,0x581E,0x5819,/* 0xA0-0xA7 */
+	0x5844,0x5820,0x5865,0x586C,0x5881,0x5889,0x589A,0x5880,/* 0xA8-0xAF */
+	0x99A8,0x9F19,0x61FF,0x8279,0x827D,0x827F,0x828F,0x828A,/* 0xB0-0xB7 */
+	0x82A8,0x8284,0x828E,0x8291,0x8297,0x8299,0x82AB,0x82B8,/* 0xB8-0xBF */
+	0x82BE,0x82B0,0x82C8,0x82CA,0x82E3,0x8298,0x82B7,0x82AE,/* 0xC0-0xC7 */
+	0x82CB,0x82CC,0x82C1,0x82A9,0x82B4,0x82A1,0x82AA,0x829F,/* 0xC8-0xCF */
+	0x82C4,0x82CE,0x82A4,0x82E1,0x8309,0x82F7,0x82E4,0x830F,/* 0xD0-0xD7 */
+	0x8307,0x82DC,0x82F4,0x82D2,0x82D8,0x830C,0x82FB,0x82D3,/* 0xD8-0xDF */
+	0x8311,0x831A,0x8306,0x8314,0x8315,0x82E0,0x82D5,0x831C,/* 0xE0-0xE7 */
+	0x8351,0x835B,0x835C,0x8308,0x8392,0x833C,0x8334,0x8331,/* 0xE8-0xEF */
+	0x839B,0x835E,0x832F,0x834F,0x8347,0x8343,0x835F,0x8340,/* 0xF0-0xF7 */
+	0x8317,0x8360,0x832D,0x833A,0x8333,0x8366,0x8365,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8EE5,0x8EE6,0x8EE7,0x8EE8,0x8EE9,0x8EEA,0x8EEB,0x8EEC,/* 0x40-0x47 */
+	0x8EED,0x8EEE,0x8EEF,0x8EF0,0x8EF1,0x8EF2,0x8EF3,0x8EF4,/* 0x48-0x4F */
+	0x8EF5,0x8EF6,0x8EF7,0x8EF8,0x8EF9,0x8EFA,0x8EFB,0x8EFC,/* 0x50-0x57 */
+	0x8EFD,0x8EFE,0x8EFF,0x8F00,0x8F01,0x8F02,0x8F03,0x8F04,/* 0x58-0x5F */
+	0x8F05,0x8F06,0x8F07,0x8F08,0x8F09,0x8F0A,0x8F0B,0x8F0C,/* 0x60-0x67 */
+	0x8F0D,0x8F0E,0x8F0F,0x8F10,0x8F11,0x8F12,0x8F13,0x8F14,/* 0x68-0x6F */
+	0x8F15,0x8F16,0x8F17,0x8F18,0x8F19,0x8F1A,0x8F1B,0x8F1C,/* 0x70-0x77 */
+	0x8F1D,0x8F1E,0x8F1F,0x8F20,0x8F21,0x8F22,0x8F23,0x0000,/* 0x78-0x7F */
+
+	0x8F24,0x8F25,0x8F26,0x8F27,0x8F28,0x8F29,0x8F2A,0x8F2B,/* 0x80-0x87 */
+	0x8F2C,0x8F2D,0x8F2E,0x8F2F,0x8F30,0x8F31,0x8F32,0x8F33,/* 0x88-0x8F */
+	0x8F34,0x8F35,0x8F36,0x8F37,0x8F38,0x8F39,0x8F3A,0x8F3B,/* 0x90-0x97 */
+	0x8F3C,0x8F3D,0x8F3E,0x8F3F,0x8F40,0x8F41,0x8F42,0x8F43,/* 0x98-0x9F */
+	0x8F44,0x8368,0x831B,0x8369,0x836C,0x836A,0x836D,0x836E,/* 0xA0-0xA7 */
+	0x83B0,0x8378,0x83B3,0x83B4,0x83A0,0x83AA,0x8393,0x839C,/* 0xA8-0xAF */
+	0x8385,0x837C,0x83B6,0x83A9,0x837D,0x83B8,0x837B,0x8398,/* 0xB0-0xB7 */
+	0x839E,0x83A8,0x83BA,0x83BC,0x83C1,0x8401,0x83E5,0x83D8,/* 0xB8-0xBF */
+	0x5807,0x8418,0x840B,0x83DD,0x83FD,0x83D6,0x841C,0x8438,/* 0xC0-0xC7 */
+	0x8411,0x8406,0x83D4,0x83DF,0x840F,0x8403,0x83F8,0x83F9,/* 0xC8-0xCF */
+	0x83EA,0x83C5,0x83C0,0x8426,0x83F0,0x83E1,0x845C,0x8451,/* 0xD0-0xD7 */
+	0x845A,0x8459,0x8473,0x8487,0x8488,0x847A,0x8489,0x8478,/* 0xD8-0xDF */
+	0x843C,0x8446,0x8469,0x8476,0x848C,0x848E,0x8431,0x846D,/* 0xE0-0xE7 */
+	0x84C1,0x84CD,0x84D0,0x84E6,0x84BD,0x84D3,0x84CA,0x84BF,/* 0xE8-0xEF */
+	0x84BA,0x84E0,0x84A1,0x84B9,0x84B4,0x8497,0x84E5,0x84E3,/* 0xF0-0xF7 */
+	0x850C,0x750D,0x8538,0x84F0,0x8539,0x851F,0x853A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8F45,0x8F46,0x8F47,0x8F48,0x8F49,0x8F4A,0x8F4B,0x8F4C,/* 0x40-0x47 */
+	0x8F4D,0x8F4E,0x8F4F,0x8F50,0x8F51,0x8F52,0x8F53,0x8F54,/* 0x48-0x4F */
+	0x8F55,0x8F56,0x8F57,0x8F58,0x8F59,0x8F5A,0x8F5B,0x8F5C,/* 0x50-0x57 */
+	0x8F5D,0x8F5E,0x8F5F,0x8F60,0x8F61,0x8F62,0x8F63,0x8F64,/* 0x58-0x5F */
+	0x8F65,0x8F6A,0x8F80,0x8F8C,0x8F92,0x8F9D,0x8FA0,0x8FA1,/* 0x60-0x67 */
+	0x8FA2,0x8FA4,0x8FA5,0x8FA6,0x8FA7,0x8FAA,0x8FAC,0x8FAD,/* 0x68-0x6F */
+	0x8FAE,0x8FAF,0x8FB2,0x8FB3,0x8FB4,0x8FB5,0x8FB7,0x8FB8,/* 0x70-0x77 */
+	0x8FBA,0x8FBB,0x8FBC,0x8FBF,0x8FC0,0x8FC3,0x8FC6,0x0000,/* 0x78-0x7F */
+
+	0x8FC9,0x8FCA,0x8FCB,0x8FCC,0x8FCD,0x8FCF,0x8FD2,0x8FD6,/* 0x80-0x87 */
+	0x8FD7,0x8FDA,0x8FE0,0x8FE1,0x8FE3,0x8FE7,0x8FEC,0x8FEF,/* 0x88-0x8F */
+	0x8FF1,0x8FF2,0x8FF4,0x8FF5,0x8FF6,0x8FFA,0x8FFB,0x8FFC,/* 0x90-0x97 */
+	0x8FFE,0x8FFF,0x9007,0x9008,0x900C,0x900E,0x9013,0x9015,/* 0x98-0x9F */
+	0x9018,0x8556,0x853B,0x84FF,0x84FC,0x8559,0x8548,0x8568,/* 0xA0-0xA7 */
+	0x8564,0x855E,0x857A,0x77A2,0x8543,0x8572,0x857B,0x85A4,/* 0xA8-0xAF */
+	0x85A8,0x8587,0x858F,0x8579,0x85AE,0x859C,0x8585,0x85B9,/* 0xB0-0xB7 */
+	0x85B7,0x85B0,0x85D3,0x85C1,0x85DC,0x85FF,0x8627,0x8605,/* 0xB8-0xBF */
+	0x8629,0x8616,0x863C,0x5EFE,0x5F08,0x593C,0x5941,0x8037,/* 0xC0-0xC7 */
+	0x5955,0x595A,0x5958,0x530F,0x5C22,0x5C25,0x5C2C,0x5C34,/* 0xC8-0xCF */
+	0x624C,0x626A,0x629F,0x62BB,0x62CA,0x62DA,0x62D7,0x62EE,/* 0xD0-0xD7 */
+	0x6322,0x62F6,0x6339,0x634B,0x6343,0x63AD,0x63F6,0x6371,/* 0xD8-0xDF */
+	0x637A,0x638E,0x63B4,0x636D,0x63AC,0x638A,0x6369,0x63AE,/* 0xE0-0xE7 */
+	0x63BC,0x63F2,0x63F8,0x63E0,0x63FF,0x63C4,0x63DE,0x63CE,/* 0xE8-0xEF */
+	0x6452,0x63C6,0x63BE,0x6445,0x6441,0x640B,0x641B,0x6420,/* 0xF0-0xF7 */
+	0x640C,0x6426,0x6421,0x645E,0x6484,0x646D,0x6496,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9019,0x901C,0x9023,0x9024,0x9025,0x9027,0x9028,0x9029,/* 0x40-0x47 */
+	0x902A,0x902B,0x902C,0x9030,0x9031,0x9032,0x9033,0x9034,/* 0x48-0x4F */
+	0x9037,0x9039,0x903A,0x903D,0x903F,0x9040,0x9043,0x9045,/* 0x50-0x57 */
+	0x9046,0x9048,0x9049,0x904A,0x904B,0x904C,0x904E,0x9054,/* 0x58-0x5F */
+	0x9055,0x9056,0x9059,0x905A,0x905C,0x905D,0x905E,0x905F,/* 0x60-0x67 */
+	0x9060,0x9061,0x9064,0x9066,0x9067,0x9069,0x906A,0x906B,/* 0x68-0x6F */
+	0x906C,0x906F,0x9070,0x9071,0x9072,0x9073,0x9076,0x9077,/* 0x70-0x77 */
+	0x9078,0x9079,0x907A,0x907B,0x907C,0x907E,0x9081,0x0000,/* 0x78-0x7F */
+
+	0x9084,0x9085,0x9086,0x9087,0x9089,0x908A,0x908C,0x908D,/* 0x80-0x87 */
+	0x908E,0x908F,0x9090,0x9092,0x9094,0x9096,0x9098,0x909A,/* 0x88-0x8F */
+	0x909C,0x909E,0x909F,0x90A0,0x90A4,0x90A5,0x90A7,0x90A8,/* 0x90-0x97 */
+	0x90A9,0x90AB,0x90AD,0x90B2,0x90B7,0x90BC,0x90BD,0x90BF,/* 0x98-0x9F */
+	0x90C0,0x647A,0x64B7,0x64B8,0x6499,0x64BA,0x64C0,0x64D0,/* 0xA0-0xA7 */
+	0x64D7,0x64E4,0x64E2,0x6509,0x6525,0x652E,0x5F0B,0x5FD2,/* 0xA8-0xAF */
+	0x7519,0x5F11,0x535F,0x53F1,0x53FD,0x53E9,0x53E8,0x53FB,/* 0xB0-0xB7 */
+	0x5412,0x5416,0x5406,0x544B,0x5452,0x5453,0x5454,0x5456,/* 0xB8-0xBF */
+	0x5443,0x5421,0x5457,0x5459,0x5423,0x5432,0x5482,0x5494,/* 0xC0-0xC7 */
+	0x5477,0x5471,0x5464,0x549A,0x549B,0x5484,0x5476,0x5466,/* 0xC8-0xCF */
+	0x549D,0x54D0,0x54AD,0x54C2,0x54B4,0x54D2,0x54A7,0x54A6,/* 0xD0-0xD7 */
+	0x54D3,0x54D4,0x5472,0x54A3,0x54D5,0x54BB,0x54BF,0x54CC,/* 0xD8-0xDF */
+	0x54D9,0x54DA,0x54DC,0x54A9,0x54AA,0x54A4,0x54DD,0x54CF,/* 0xE0-0xE7 */
+	0x54DE,0x551B,0x54E7,0x5520,0x54FD,0x5514,0x54F3,0x5522,/* 0xE8-0xEF */
+	0x5523,0x550F,0x5511,0x5527,0x552A,0x5567,0x558F,0x55B5,/* 0xF0-0xF7 */
+	0x5549,0x556D,0x5541,0x5555,0x553F,0x5550,0x553C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x90C2,0x90C3,0x90C6,0x90C8,0x90C9,0x90CB,0x90CC,0x90CD,/* 0x40-0x47 */
+	0x90D2,0x90D4,0x90D5,0x90D6,0x90D8,0x90D9,0x90DA,0x90DE,/* 0x48-0x4F */
+	0x90DF,0x90E0,0x90E3,0x90E4,0x90E5,0x90E9,0x90EA,0x90EC,/* 0x50-0x57 */
+	0x90EE,0x90F0,0x90F1,0x90F2,0x90F3,0x90F5,0x90F6,0x90F7,/* 0x58-0x5F */
+	0x90F9,0x90FA,0x90FB,0x90FC,0x90FF,0x9100,0x9101,0x9103,/* 0x60-0x67 */
+	0x9105,0x9106,0x9107,0x9108,0x9109,0x910A,0x910B,0x910C,/* 0x68-0x6F */
+	0x910D,0x910E,0x910F,0x9110,0x9111,0x9112,0x9113,0x9114,/* 0x70-0x77 */
+	0x9115,0x9116,0x9117,0x9118,0x911A,0x911B,0x911C,0x0000,/* 0x78-0x7F */
+
+	0x911D,0x911F,0x9120,0x9121,0x9124,0x9125,0x9126,0x9127,/* 0x80-0x87 */
+	0x9128,0x9129,0x912A,0x912B,0x912C,0x912D,0x912E,0x9130,/* 0x88-0x8F */
+	0x9132,0x9133,0x9134,0x9135,0x9136,0x9137,0x9138,0x913A,/* 0x90-0x97 */
+	0x913B,0x913C,0x913D,0x913E,0x913F,0x9140,0x9141,0x9142,/* 0x98-0x9F */
+	0x9144,0x5537,0x5556,0x5575,0x5576,0x5577,0x5533,0x5530,/* 0xA0-0xA7 */
+	0x555C,0x558B,0x55D2,0x5583,0x55B1,0x55B9,0x5588,0x5581,/* 0xA8-0xAF */
+	0x559F,0x557E,0x55D6,0x5591,0x557B,0x55DF,0x55BD,0x55BE,/* 0xB0-0xB7 */
+	0x5594,0x5599,0x55EA,0x55F7,0x55C9,0x561F,0x55D1,0x55EB,/* 0xB8-0xBF */
+	0x55EC,0x55D4,0x55E6,0x55DD,0x55C4,0x55EF,0x55E5,0x55F2,/* 0xC0-0xC7 */
+	0x55F3,0x55CC,0x55CD,0x55E8,0x55F5,0x55E4,0x8F94,0x561E,/* 0xC8-0xCF */
+	0x5608,0x560C,0x5601,0x5624,0x5623,0x55FE,0x5600,0x5627,/* 0xD0-0xD7 */
+	0x562D,0x5658,0x5639,0x5657,0x562C,0x564D,0x5662,0x5659,/* 0xD8-0xDF */
+	0x565C,0x564C,0x5654,0x5686,0x5664,0x5671,0x566B,0x567B,/* 0xE0-0xE7 */
+	0x567C,0x5685,0x5693,0x56AF,0x56D4,0x56D7,0x56DD,0x56E1,/* 0xE8-0xEF */
+	0x56F5,0x56EB,0x56F9,0x56FF,0x5704,0x570A,0x5709,0x571C,/* 0xF0-0xF7 */
+	0x5E0F,0x5E19,0x5E14,0x5E11,0x5E31,0x5E3B,0x5E3C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9145,0x9147,0x9148,0x9151,0x9153,0x9154,0x9155,0x9156,/* 0x40-0x47 */
+	0x9158,0x9159,0x915B,0x915C,0x915F,0x9160,0x9166,0x9167,/* 0x48-0x4F */
+	0x9168,0x916B,0x916D,0x9173,0x917A,0x917B,0x917C,0x9180,/* 0x50-0x57 */
+	0x9181,0x9182,0x9183,0x9184,0x9186,0x9188,0x918A,0x918E,/* 0x58-0x5F */
+	0x918F,0x9193,0x9194,0x9195,0x9196,0x9197,0x9198,0x9199,/* 0x60-0x67 */
+	0x919C,0x919D,0x919E,0x919F,0x91A0,0x91A1,0x91A4,0x91A5,/* 0x68-0x6F */
+	0x91A6,0x91A7,0x91A8,0x91A9,0x91AB,0x91AC,0x91B0,0x91B1,/* 0x70-0x77 */
+	0x91B2,0x91B3,0x91B6,0x91B7,0x91B8,0x91B9,0x91BB,0x0000,/* 0x78-0x7F */
+
+	0x91BC,0x91BD,0x91BE,0x91BF,0x91C0,0x91C1,0x91C2,0x91C3,/* 0x80-0x87 */
+	0x91C4,0x91C5,0x91C6,0x91C8,0x91CB,0x91D0,0x91D2,0x91D3,/* 0x88-0x8F */
+	0x91D4,0x91D5,0x91D6,0x91D7,0x91D8,0x91D9,0x91DA,0x91DB,/* 0x90-0x97 */
+	0x91DD,0x91DE,0x91DF,0x91E0,0x91E1,0x91E2,0x91E3,0x91E4,/* 0x98-0x9F */
+	0x91E5,0x5E37,0x5E44,0x5E54,0x5E5B,0x5E5E,0x5E61,0x5C8C,/* 0xA0-0xA7 */
+	0x5C7A,0x5C8D,0x5C90,0x5C96,0x5C88,0x5C98,0x5C99,0x5C91,/* 0xA8-0xAF */
+	0x5C9A,0x5C9C,0x5CB5,0x5CA2,0x5CBD,0x5CAC,0x5CAB,0x5CB1,/* 0xB0-0xB7 */
+	0x5CA3,0x5CC1,0x5CB7,0x5CC4,0x5CD2,0x5CE4,0x5CCB,0x5CE5,/* 0xB8-0xBF */
+	0x5D02,0x5D03,0x5D27,0x5D26,0x5D2E,0x5D24,0x5D1E,0x5D06,/* 0xC0-0xC7 */
+	0x5D1B,0x5D58,0x5D3E,0x5D34,0x5D3D,0x5D6C,0x5D5B,0x5D6F,/* 0xC8-0xCF */
+	0x5D5D,0x5D6B,0x5D4B,0x5D4A,0x5D69,0x5D74,0x5D82,0x5D99,/* 0xD0-0xD7 */
+	0x5D9D,0x8C73,0x5DB7,0x5DC5,0x5F73,0x5F77,0x5F82,0x5F87,/* 0xD8-0xDF */
+	0x5F89,0x5F8C,0x5F95,0x5F99,0x5F9C,0x5FA8,0x5FAD,0x5FB5,/* 0xE0-0xE7 */
+	0x5FBC,0x8862,0x5F61,0x72AD,0x72B0,0x72B4,0x72B7,0x72B8,/* 0xE8-0xEF */
+	0x72C3,0x72C1,0x72CE,0x72CD,0x72D2,0x72E8,0x72EF,0x72E9,/* 0xF0-0xF7 */
+	0x72F2,0x72F4,0x72F7,0x7301,0x72F3,0x7303,0x72FA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x91E6,0x91E7,0x91E8,0x91E9,0x91EA,0x91EB,0x91EC,0x91ED,/* 0x40-0x47 */
+	0x91EE,0x91EF,0x91F0,0x91F1,0x91F2,0x91F3,0x91F4,0x91F5,/* 0x48-0x4F */
+	0x91F6,0x91F7,0x91F8,0x91F9,0x91FA,0x91FB,0x91FC,0x91FD,/* 0x50-0x57 */
+	0x91FE,0x91FF,0x9200,0x9201,0x9202,0x9203,0x9204,0x9205,/* 0x58-0x5F */
+	0x9206,0x9207,0x9208,0x9209,0x920A,0x920B,0x920C,0x920D,/* 0x60-0x67 */
+	0x920E,0x920F,0x9210,0x9211,0x9212,0x9213,0x9214,0x9215,/* 0x68-0x6F */
+	0x9216,0x9217,0x9218,0x9219,0x921A,0x921B,0x921C,0x921D,/* 0x70-0x77 */
+	0x921E,0x921F,0x9220,0x9221,0x9222,0x9223,0x9224,0x0000,/* 0x78-0x7F */
+
+	0x9225,0x9226,0x9227,0x9228,0x9229,0x922A,0x922B,0x922C,/* 0x80-0x87 */
+	0x922D,0x922E,0x922F,0x9230,0x9231,0x9232,0x9233,0x9234,/* 0x88-0x8F */
+	0x9235,0x9236,0x9237,0x9238,0x9239,0x923A,0x923B,0x923C,/* 0x90-0x97 */
+	0x923D,0x923E,0x923F,0x9240,0x9241,0x9242,0x9243,0x9244,/* 0x98-0x9F */
+	0x9245,0x72FB,0x7317,0x7313,0x7321,0x730A,0x731E,0x731D,/* 0xA0-0xA7 */
+	0x7315,0x7322,0x7339,0x7325,0x732C,0x7338,0x7331,0x7350,/* 0xA8-0xAF */
+	0x734D,0x7357,0x7360,0x736C,0x736F,0x737E,0x821B,0x5925,/* 0xB0-0xB7 */
+	0x98E7,0x5924,0x5902,0x9963,0x9967,0x9968,0x9969,0x996A,/* 0xB8-0xBF */
+	0x996B,0x996C,0x9974,0x9977,0x997D,0x9980,0x9984,0x9987,/* 0xC0-0xC7 */
+	0x998A,0x998D,0x9990,0x9991,0x9993,0x9994,0x9995,0x5E80,/* 0xC8-0xCF */
+	0x5E91,0x5E8B,0x5E96,0x5EA5,0x5EA0,0x5EB9,0x5EB5,0x5EBE,/* 0xD0-0xD7 */
+	0x5EB3,0x8D53,0x5ED2,0x5ED1,0x5EDB,0x5EE8,0x5EEA,0x81BA,/* 0xD8-0xDF */
+	0x5FC4,0x5FC9,0x5FD6,0x5FCF,0x6003,0x5FEE,0x6004,0x5FE1,/* 0xE0-0xE7 */
+	0x5FE4,0x5FFE,0x6005,0x6006,0x5FEA,0x5FED,0x5FF8,0x6019,/* 0xE8-0xEF */
+	0x6035,0x6026,0x601B,0x600F,0x600D,0x6029,0x602B,0x600A,/* 0xF0-0xF7 */
+	0x603F,0x6021,0x6078,0x6079,0x607B,0x607A,0x6042,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9246,0x9247,0x9248,0x9249,0x924A,0x924B,0x924C,0x924D,/* 0x40-0x47 */
+	0x924E,0x924F,0x9250,0x9251,0x9252,0x9253,0x9254,0x9255,/* 0x48-0x4F */
+	0x9256,0x9257,0x9258,0x9259,0x925A,0x925B,0x925C,0x925D,/* 0x50-0x57 */
+	0x925E,0x925F,0x9260,0x9261,0x9262,0x9263,0x9264,0x9265,/* 0x58-0x5F */
+	0x9266,0x9267,0x9268,0x9269,0x926A,0x926B,0x926C,0x926D,/* 0x60-0x67 */
+	0x926E,0x926F,0x9270,0x9271,0x9272,0x9273,0x9275,0x9276,/* 0x68-0x6F */
+	0x9277,0x9278,0x9279,0x927A,0x927B,0x927C,0x927D,0x927E,/* 0x70-0x77 */
+	0x927F,0x9280,0x9281,0x9282,0x9283,0x9284,0x9285,0x0000,/* 0x78-0x7F */
+
+	0x9286,0x9287,0x9288,0x9289,0x928A,0x928B,0x928C,0x928D,/* 0x80-0x87 */
+	0x928F,0x9290,0x9291,0x9292,0x9293,0x9294,0x9295,0x9296,/* 0x88-0x8F */
+	0x9297,0x9298,0x9299,0x929A,0x929B,0x929C,0x929D,0x929E,/* 0x90-0x97 */
+	0x929F,0x92A0,0x92A1,0x92A2,0x92A3,0x92A4,0x92A5,0x92A6,/* 0x98-0x9F */
+	0x92A7,0x606A,0x607D,0x6096,0x609A,0x60AD,0x609D,0x6083,/* 0xA0-0xA7 */
+	0x6092,0x608C,0x609B,0x60EC,0x60BB,0x60B1,0x60DD,0x60D8,/* 0xA8-0xAF */
+	0x60C6,0x60DA,0x60B4,0x6120,0x6126,0x6115,0x6123,0x60F4,/* 0xB0-0xB7 */
+	0x6100,0x610E,0x612B,0x614A,0x6175,0x61AC,0x6194,0x61A7,/* 0xB8-0xBF */
+	0x61B7,0x61D4,0x61F5,0x5FDD,0x96B3,0x95E9,0x95EB,0x95F1,/* 0xC0-0xC7 */
+	0x95F3,0x95F5,0x95F6,0x95FC,0x95FE,0x9603,0x9604,0x9606,/* 0xC8-0xCF */
+	0x9608,0x960A,0x960B,0x960C,0x960D,0x960F,0x9612,0x9615,/* 0xD0-0xD7 */
+	0x9616,0x9617,0x9619,0x961A,0x4E2C,0x723F,0x6215,0x6C35,/* 0xD8-0xDF */
+	0x6C54,0x6C5C,0x6C4A,0x6CA3,0x6C85,0x6C90,0x6C94,0x6C8C,/* 0xE0-0xE7 */
+	0x6C68,0x6C69,0x6C74,0x6C76,0x6C86,0x6CA9,0x6CD0,0x6CD4,/* 0xE8-0xEF */
+	0x6CAD,0x6CF7,0x6CF8,0x6CF1,0x6CD7,0x6CB2,0x6CE0,0x6CD6,/* 0xF0-0xF7 */
+	0x6CFA,0x6CEB,0x6CEE,0x6CB1,0x6CD3,0x6CEF,0x6CFE,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x92A8,0x92A9,0x92AA,0x92AB,0x92AC,0x92AD,0x92AF,0x92B0,/* 0x40-0x47 */
+	0x92B1,0x92B2,0x92B3,0x92B4,0x92B5,0x92B6,0x92B7,0x92B8,/* 0x48-0x4F */
+	0x92B9,0x92BA,0x92BB,0x92BC,0x92BD,0x92BE,0x92BF,0x92C0,/* 0x50-0x57 */
+	0x92C1,0x92C2,0x92C3,0x92C4,0x92C5,0x92C6,0x92C7,0x92C9,/* 0x58-0x5F */
+	0x92CA,0x92CB,0x92CC,0x92CD,0x92CE,0x92CF,0x92D0,0x92D1,/* 0x60-0x67 */
+	0x92D2,0x92D3,0x92D4,0x92D5,0x92D6,0x92D7,0x92D8,0x92D9,/* 0x68-0x6F */
+	0x92DA,0x92DB,0x92DC,0x92DD,0x92DE,0x92DF,0x92E0,0x92E1,/* 0x70-0x77 */
+	0x92E2,0x92E3,0x92E4,0x92E5,0x92E6,0x92E7,0x92E8,0x0000,/* 0x78-0x7F */
+
+	0x92E9,0x92EA,0x92EB,0x92EC,0x92ED,0x92EE,0x92EF,0x92F0,/* 0x80-0x87 */
+	0x92F1,0x92F2,0x92F3,0x92F4,0x92F5,0x92F6,0x92F7,0x92F8,/* 0x88-0x8F */
+	0x92F9,0x92FA,0x92FB,0x92FC,0x92FD,0x92FE,0x92FF,0x9300,/* 0x90-0x97 */
+	0x9301,0x9302,0x9303,0x9304,0x9305,0x9306,0x9307,0x9308,/* 0x98-0x9F */
+	0x9309,0x6D39,0x6D27,0x6D0C,0x6D43,0x6D48,0x6D07,0x6D04,/* 0xA0-0xA7 */
+	0x6D19,0x6D0E,0x6D2B,0x6D4D,0x6D2E,0x6D35,0x6D1A,0x6D4F,/* 0xA8-0xAF */
+	0x6D52,0x6D54,0x6D33,0x6D91,0x6D6F,0x6D9E,0x6DA0,0x6D5E,/* 0xB0-0xB7 */
+	0x6D93,0x6D94,0x6D5C,0x6D60,0x6D7C,0x6D63,0x6E1A,0x6DC7,/* 0xB8-0xBF */
+	0x6DC5,0x6DDE,0x6E0E,0x6DBF,0x6DE0,0x6E11,0x6DE6,0x6DDD,/* 0xC0-0xC7 */
+	0x6DD9,0x6E16,0x6DAB,0x6E0C,0x6DAE,0x6E2B,0x6E6E,0x6E4E,/* 0xC8-0xCF */
+	0x6E6B,0x6EB2,0x6E5F,0x6E86,0x6E53,0x6E54,0x6E32,0x6E25,/* 0xD0-0xD7 */
+	0x6E44,0x6EDF,0x6EB1,0x6E98,0x6EE0,0x6F2D,0x6EE2,0x6EA5,/* 0xD8-0xDF */
+	0x6EA7,0x6EBD,0x6EBB,0x6EB7,0x6ED7,0x6EB4,0x6ECF,0x6E8F,/* 0xE0-0xE7 */
+	0x6EC2,0x6E9F,0x6F62,0x6F46,0x6F47,0x6F24,0x6F15,0x6EF9,/* 0xE8-0xEF */
+	0x6F2F,0x6F36,0x6F4B,0x6F74,0x6F2A,0x6F09,0x6F29,0x6F89,/* 0xF0-0xF7 */
+	0x6F8D,0x6F8C,0x6F78,0x6F72,0x6F7C,0x6F7A,0x6FD1,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x930A,0x930B,0x930C,0x930D,0x930E,0x930F,0x9310,0x9311,/* 0x40-0x47 */
+	0x9312,0x9313,0x9314,0x9315,0x9316,0x9317,0x9318,0x9319,/* 0x48-0x4F */
+	0x931A,0x931B,0x931C,0x931D,0x931E,0x931F,0x9320,0x9321,/* 0x50-0x57 */
+	0x9322,0x9323,0x9324,0x9325,0x9326,0x9327,0x9328,0x9329,/* 0x58-0x5F */
+	0x932A,0x932B,0x932C,0x932D,0x932E,0x932F,0x9330,0x9331,/* 0x60-0x67 */
+	0x9332,0x9333,0x9334,0x9335,0x9336,0x9337,0x9338,0x9339,/* 0x68-0x6F */
+	0x933A,0x933B,0x933C,0x933D,0x933F,0x9340,0x9341,0x9342,/* 0x70-0x77 */
+	0x9343,0x9344,0x9345,0x9346,0x9347,0x9348,0x9349,0x0000,/* 0x78-0x7F */
+
+	0x934A,0x934B,0x934C,0x934D,0x934E,0x934F,0x9350,0x9351,/* 0x80-0x87 */
+	0x9352,0x9353,0x9354,0x9355,0x9356,0x9357,0x9358,0x9359,/* 0x88-0x8F */
+	0x935A,0x935B,0x935C,0x935D,0x935E,0x935F,0x9360,0x9361,/* 0x90-0x97 */
+	0x9362,0x9363,0x9364,0x9365,0x9366,0x9367,0x9368,0x9369,/* 0x98-0x9F */
+	0x936B,0x6FC9,0x6FA7,0x6FB9,0x6FB6,0x6FC2,0x6FE1,0x6FEE,/* 0xA0-0xA7 */
+	0x6FDE,0x6FE0,0x6FEF,0x701A,0x7023,0x701B,0x7039,0x7035,/* 0xA8-0xAF */
+	0x704F,0x705E,0x5B80,0x5B84,0x5B95,0x5B93,0x5BA5,0x5BB8,/* 0xB0-0xB7 */
+	0x752F,0x9A9E,0x6434,0x5BE4,0x5BEE,0x8930,0x5BF0,0x8E47,/* 0xB8-0xBF */
+	0x8B07,0x8FB6,0x8FD3,0x8FD5,0x8FE5,0x8FEE,0x8FE4,0x8FE9,/* 0xC0-0xC7 */
+	0x8FE6,0x8FF3,0x8FE8,0x9005,0x9004,0x900B,0x9026,0x9011,/* 0xC8-0xCF */
+	0x900D,0x9016,0x9021,0x9035,0x9036,0x902D,0x902F,0x9044,/* 0xD0-0xD7 */
+	0x9051,0x9052,0x9050,0x9068,0x9058,0x9062,0x905B,0x66B9,/* 0xD8-0xDF */
+	0x9074,0x907D,0x9082,0x9088,0x9083,0x908B,0x5F50,0x5F57,/* 0xE0-0xE7 */
+	0x5F56,0x5F58,0x5C3B,0x54AB,0x5C50,0x5C59,0x5B71,0x5C63,/* 0xE8-0xEF */
+	0x5C66,0x7FBC,0x5F2A,0x5F29,0x5F2D,0x8274,0x5F3C,0x9B3B,/* 0xF0-0xF7 */
+	0x5C6E,0x5981,0x5983,0x598D,0x59A9,0x59AA,0x59A3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x936C,0x936D,0x936E,0x936F,0x9370,0x9371,0x9372,0x9373,/* 0x40-0x47 */
+	0x9374,0x9375,0x9376,0x9377,0x9378,0x9379,0x937A,0x937B,/* 0x48-0x4F */
+	0x937C,0x937D,0x937E,0x937F,0x9380,0x9381,0x9382,0x9383,/* 0x50-0x57 */
+	0x9384,0x9385,0x9386,0x9387,0x9388,0x9389,0x938A,0x938B,/* 0x58-0x5F */
+	0x938C,0x938D,0x938E,0x9390,0x9391,0x9392,0x9393,0x9394,/* 0x60-0x67 */
+	0x9395,0x9396,0x9397,0x9398,0x9399,0x939A,0x939B,0x939C,/* 0x68-0x6F */
+	0x939D,0x939E,0x939F,0x93A0,0x93A1,0x93A2,0x93A3,0x93A4,/* 0x70-0x77 */
+	0x93A5,0x93A6,0x93A7,0x93A8,0x93A9,0x93AA,0x93AB,0x0000,/* 0x78-0x7F */
+
+	0x93AC,0x93AD,0x93AE,0x93AF,0x93B0,0x93B1,0x93B2,0x93B3,/* 0x80-0x87 */
+	0x93B4,0x93B5,0x93B6,0x93B7,0x93B8,0x93B9,0x93BA,0x93BB,/* 0x88-0x8F */
+	0x93BC,0x93BD,0x93BE,0x93BF,0x93C0,0x93C1,0x93C2,0x93C3,/* 0x90-0x97 */
+	0x93C4,0x93C5,0x93C6,0x93C7,0x93C8,0x93C9,0x93CB,0x93CC,/* 0x98-0x9F */
+	0x93CD,0x5997,0x59CA,0x59AB,0x599E,0x59A4,0x59D2,0x59B2,/* 0xA0-0xA7 */
+	0x59AF,0x59D7,0x59BE,0x5A05,0x5A06,0x59DD,0x5A08,0x59E3,/* 0xA8-0xAF */
+	0x59D8,0x59F9,0x5A0C,0x5A09,0x5A32,0x5A34,0x5A11,0x5A23,/* 0xB0-0xB7 */
+	0x5A13,0x5A40,0x5A67,0x5A4A,0x5A55,0x5A3C,0x5A62,0x5A75,/* 0xB8-0xBF */
+	0x80EC,0x5AAA,0x5A9B,0x5A77,0x5A7A,0x5ABE,0x5AEB,0x5AB2,/* 0xC0-0xC7 */
+	0x5AD2,0x5AD4,0x5AB8,0x5AE0,0x5AE3,0x5AF1,0x5AD6,0x5AE6,/* 0xC8-0xCF */
+	0x5AD8,0x5ADC,0x5B09,0x5B17,0x5B16,0x5B32,0x5B37,0x5B40,/* 0xD0-0xD7 */
+	0x5C15,0x5C1C,0x5B5A,0x5B65,0x5B73,0x5B51,0x5B53,0x5B62,/* 0xD8-0xDF */
+	0x9A75,0x9A77,0x9A78,0x9A7A,0x9A7F,0x9A7D,0x9A80,0x9A81,/* 0xE0-0xE7 */
+	0x9A85,0x9A88,0x9A8A,0x9A90,0x9A92,0x9A93,0x9A96,0x9A98,/* 0xE8-0xEF */
+	0x9A9B,0x9A9C,0x9A9D,0x9A9F,0x9AA0,0x9AA2,0x9AA3,0x9AA5,/* 0xF0-0xF7 */
+	0x9AA7,0x7E9F,0x7EA1,0x7EA3,0x7EA5,0x7EA8,0x7EA9,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x93CE,0x93CF,0x93D0,0x93D1,0x93D2,0x93D3,0x93D4,0x93D5,/* 0x40-0x47 */
+	0x93D7,0x93D8,0x93D9,0x93DA,0x93DB,0x93DC,0x93DD,0x93DE,/* 0x48-0x4F */
+	0x93DF,0x93E0,0x93E1,0x93E2,0x93E3,0x93E4,0x93E5,0x93E6,/* 0x50-0x57 */
+	0x93E7,0x93E8,0x93E9,0x93EA,0x93EB,0x93EC,0x93ED,0x93EE,/* 0x58-0x5F */
+	0x93EF,0x93F0,0x93F1,0x93F2,0x93F3,0x93F4,0x93F5,0x93F6,/* 0x60-0x67 */
+	0x93F7,0x93F8,0x93F9,0x93FA,0x93FB,0x93FC,0x93FD,0x93FE,/* 0x68-0x6F */
+	0x93FF,0x9400,0x9401,0x9402,0x9403,0x9404,0x9405,0x9406,/* 0x70-0x77 */
+	0x9407,0x9408,0x9409,0x940A,0x940B,0x940C,0x940D,0x0000,/* 0x78-0x7F */
+
+	0x940E,0x940F,0x9410,0x9411,0x9412,0x9413,0x9414,0x9415,/* 0x80-0x87 */
+	0x9416,0x9417,0x9418,0x9419,0x941A,0x941B,0x941C,0x941D,/* 0x88-0x8F */
+	0x941E,0x941F,0x9420,0x9421,0x9422,0x9423,0x9424,0x9425,/* 0x90-0x97 */
+	0x9426,0x9427,0x9428,0x9429,0x942A,0x942B,0x942C,0x942D,/* 0x98-0x9F */
+	0x942E,0x7EAD,0x7EB0,0x7EBE,0x7EC0,0x7EC1,0x7EC2,0x7EC9,/* 0xA0-0xA7 */
+	0x7ECB,0x7ECC,0x7ED0,0x7ED4,0x7ED7,0x7EDB,0x7EE0,0x7EE1,/* 0xA8-0xAF */
+	0x7EE8,0x7EEB,0x7EEE,0x7EEF,0x7EF1,0x7EF2,0x7F0D,0x7EF6,/* 0xB0-0xB7 */
+	0x7EFA,0x7EFB,0x7EFE,0x7F01,0x7F02,0x7F03,0x7F07,0x7F08,/* 0xB8-0xBF */
+	0x7F0B,0x7F0C,0x7F0F,0x7F11,0x7F12,0x7F17,0x7F19,0x7F1C,/* 0xC0-0xC7 */
+	0x7F1B,0x7F1F,0x7F21,0x7F22,0x7F23,0x7F24,0x7F25,0x7F26,/* 0xC8-0xCF */
+	0x7F27,0x7F2A,0x7F2B,0x7F2C,0x7F2D,0x7F2F,0x7F30,0x7F31,/* 0xD0-0xD7 */
+	0x7F32,0x7F33,0x7F35,0x5E7A,0x757F,0x5DDB,0x753E,0x9095,/* 0xD8-0xDF */
+	0x738E,0x7391,0x73AE,0x73A2,0x739F,0x73CF,0x73C2,0x73D1,/* 0xE0-0xE7 */
+	0x73B7,0x73B3,0x73C0,0x73C9,0x73C8,0x73E5,0x73D9,0x987C,/* 0xE8-0xEF */
+	0x740A,0x73E9,0x73E7,0x73DE,0x73BA,0x73F2,0x740F,0x742A,/* 0xF0-0xF7 */
+	0x745B,0x7426,0x7425,0x7428,0x7430,0x742E,0x742C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x942F,0x9430,0x9431,0x9432,0x9433,0x9434,0x9435,0x9436,/* 0x40-0x47 */
+	0x9437,0x9438,0x9439,0x943A,0x943B,0x943C,0x943D,0x943F,/* 0x48-0x4F */
+	0x9440,0x9441,0x9442,0x9443,0x9444,0x9445,0x9446,0x9447,/* 0x50-0x57 */
+	0x9448,0x9449,0x944A,0x944B,0x944C,0x944D,0x944E,0x944F,/* 0x58-0x5F */
+	0x9450,0x9451,0x9452,0x9453,0x9454,0x9455,0x9456,0x9457,/* 0x60-0x67 */
+	0x9458,0x9459,0x945A,0x945B,0x945C,0x945D,0x945E,0x945F,/* 0x68-0x6F */
+	0x9460,0x9461,0x9462,0x9463,0x9464,0x9465,0x9466,0x9467,/* 0x70-0x77 */
+	0x9468,0x9469,0x946A,0x946C,0x946D,0x946E,0x946F,0x0000,/* 0x78-0x7F */
+
+	0x9470,0x9471,0x9472,0x9473,0x9474,0x9475,0x9476,0x9477,/* 0x80-0x87 */
+	0x9478,0x9479,0x947A,0x947B,0x947C,0x947D,0x947E,0x947F,/* 0x88-0x8F */
+	0x9480,0x9481,0x9482,0x9483,0x9484,0x9491,0x9496,0x9498,/* 0x90-0x97 */
+	0x94C7,0x94CF,0x94D3,0x94D4,0x94DA,0x94E6,0x94FB,0x951C,/* 0x98-0x9F */
+	0x9520,0x741B,0x741A,0x7441,0x745C,0x7457,0x7455,0x7459,/* 0xA0-0xA7 */
+	0x7477,0x746D,0x747E,0x749C,0x748E,0x7480,0x7481,0x7487,/* 0xA8-0xAF */
+	0x748B,0x749E,0x74A8,0x74A9,0x7490,0x74A7,0x74D2,0x74BA,/* 0xB0-0xB7 */
+	0x97EA,0x97EB,0x97EC,0x674C,0x6753,0x675E,0x6748,0x6769,/* 0xB8-0xBF */
+	0x67A5,0x6787,0x676A,0x6773,0x6798,0x67A7,0x6775,0x67A8,/* 0xC0-0xC7 */
+	0x679E,0x67AD,0x678B,0x6777,0x677C,0x67F0,0x6809,0x67D8,/* 0xC8-0xCF */
+	0x680A,0x67E9,0x67B0,0x680C,0x67D9,0x67B5,0x67DA,0x67B3,/* 0xD0-0xD7 */
+	0x67DD,0x6800,0x67C3,0x67B8,0x67E2,0x680E,0x67C1,0x67FD,/* 0xD8-0xDF */
+	0x6832,0x6833,0x6860,0x6861,0x684E,0x6862,0x6844,0x6864,/* 0xE0-0xE7 */
+	0x6883,0x681D,0x6855,0x6866,0x6841,0x6867,0x6840,0x683E,/* 0xE8-0xEF */
+	0x684A,0x6849,0x6829,0x68B5,0x688F,0x6874,0x6877,0x6893,/* 0xF0-0xF7 */
+	0x686B,0x68C2,0x696E,0x68FC,0x691F,0x6920,0x68F9,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9527,0x9533,0x953D,0x9543,0x9548,0x954B,0x9555,0x955A,/* 0x40-0x47 */
+	0x9560,0x956E,0x9574,0x9575,0x9577,0x9578,0x9579,0x957A,/* 0x48-0x4F */
+	0x957B,0x957C,0x957D,0x957E,0x9580,0x9581,0x9582,0x9583,/* 0x50-0x57 */
+	0x9584,0x9585,0x9586,0x9587,0x9588,0x9589,0x958A,0x958B,/* 0x58-0x5F */
+	0x958C,0x958D,0x958E,0x958F,0x9590,0x9591,0x9592,0x9593,/* 0x60-0x67 */
+	0x9594,0x9595,0x9596,0x9597,0x9598,0x9599,0x959A,0x959B,/* 0x68-0x6F */
+	0x959C,0x959D,0x959E,0x959F,0x95A0,0x95A1,0x95A2,0x95A3,/* 0x70-0x77 */
+	0x95A4,0x95A5,0x95A6,0x95A7,0x95A8,0x95A9,0x95AA,0x0000,/* 0x78-0x7F */
+
+	0x95AB,0x95AC,0x95AD,0x95AE,0x95AF,0x95B0,0x95B1,0x95B2,/* 0x80-0x87 */
+	0x95B3,0x95B4,0x95B5,0x95B6,0x95B7,0x95B8,0x95B9,0x95BA,/* 0x88-0x8F */
+	0x95BB,0x95BC,0x95BD,0x95BE,0x95BF,0x95C0,0x95C1,0x95C2,/* 0x90-0x97 */
+	0x95C3,0x95C4,0x95C5,0x95C6,0x95C7,0x95C8,0x95C9,0x95CA,/* 0x98-0x9F */
+	0x95CB,0x6924,0x68F0,0x690B,0x6901,0x6957,0x68E3,0x6910,/* 0xA0-0xA7 */
+	0x6971,0x6939,0x6960,0x6942,0x695D,0x6984,0x696B,0x6980,/* 0xA8-0xAF */
+	0x6998,0x6978,0x6934,0x69CC,0x6987,0x6988,0x69CE,0x6989,/* 0xB0-0xB7 */
+	0x6966,0x6963,0x6979,0x699B,0x69A7,0x69BB,0x69AB,0x69AD,/* 0xB8-0xBF */
+	0x69D4,0x69B1,0x69C1,0x69CA,0x69DF,0x6995,0x69E0,0x698D,/* 0xC0-0xC7 */
+	0x69FF,0x6A2F,0x69ED,0x6A17,0x6A18,0x6A65,0x69F2,0x6A44,/* 0xC8-0xCF */
+	0x6A3E,0x6AA0,0x6A50,0x6A5B,0x6A35,0x6A8E,0x6A79,0x6A3D,/* 0xD0-0xD7 */
+	0x6A28,0x6A58,0x6A7C,0x6A91,0x6A90,0x6AA9,0x6A97,0x6AAB,/* 0xD8-0xDF */
+	0x7337,0x7352,0x6B81,0x6B82,0x6B87,0x6B84,0x6B92,0x6B93,/* 0xE0-0xE7 */
+	0x6B8D,0x6B9A,0x6B9B,0x6BA1,0x6BAA,0x8F6B,0x8F6D,0x8F71,/* 0xE8-0xEF */
+	0x8F72,0x8F73,0x8F75,0x8F76,0x8F78,0x8F77,0x8F79,0x8F7A,/* 0xF0-0xF7 */
+	0x8F7C,0x8F7E,0x8F81,0x8F82,0x8F84,0x8F87,0x8F8B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x95CC,0x95CD,0x95CE,0x95CF,0x95D0,0x95D1,0x95D2,0x95D3,/* 0x40-0x47 */
+	0x95D4,0x95D5,0x95D6,0x95D7,0x95D8,0x95D9,0x95DA,0x95DB,/* 0x48-0x4F */
+	0x95DC,0x95DD,0x95DE,0x95DF,0x95E0,0x95E1,0x95E2,0x95E3,/* 0x50-0x57 */
+	0x95E4,0x95E5,0x95E6,0x95E7,0x95EC,0x95FF,0x9607,0x9613,/* 0x58-0x5F */
+	0x9618,0x961B,0x961E,0x9620,0x9623,0x9624,0x9625,0x9626,/* 0x60-0x67 */
+	0x9627,0x9628,0x9629,0x962B,0x962C,0x962D,0x962F,0x9630,/* 0x68-0x6F */
+	0x9637,0x9638,0x9639,0x963A,0x963E,0x9641,0x9643,0x964A,/* 0x70-0x77 */
+	0x964E,0x964F,0x9651,0x9652,0x9653,0x9656,0x9657,0x0000,/* 0x78-0x7F */
+
+	0x9658,0x9659,0x965A,0x965C,0x965D,0x965E,0x9660,0x9663,/* 0x80-0x87 */
+	0x9665,0x9666,0x966B,0x966D,0x966E,0x966F,0x9670,0x9671,/* 0x88-0x8F */
+	0x9673,0x9678,0x9679,0x967A,0x967B,0x967C,0x967D,0x967E,/* 0x90-0x97 */
+	0x967F,0x9680,0x9681,0x9682,0x9683,0x9684,0x9687,0x9689,/* 0x98-0x9F */
+	0x968A,0x8F8D,0x8F8E,0x8F8F,0x8F98,0x8F9A,0x8ECE,0x620B,/* 0xA0-0xA7 */
+	0x6217,0x621B,0x621F,0x6222,0x6221,0x6225,0x6224,0x622C,/* 0xA8-0xAF */
+	0x81E7,0x74EF,0x74F4,0x74FF,0x750F,0x7511,0x7513,0x6534,/* 0xB0-0xB7 */
+	0x65EE,0x65EF,0x65F0,0x660A,0x6619,0x6772,0x6603,0x6615,/* 0xB8-0xBF */
+	0x6600,0x7085,0x66F7,0x661D,0x6634,0x6631,0x6636,0x6635,/* 0xC0-0xC7 */
+	0x8006,0x665F,0x6654,0x6641,0x664F,0x6656,0x6661,0x6657,/* 0xC8-0xCF */
+	0x6677,0x6684,0x668C,0x66A7,0x669D,0x66BE,0x66DB,0x66DC,/* 0xD0-0xD7 */
+	0x66E6,0x66E9,0x8D32,0x8D33,0x8D36,0x8D3B,0x8D3D,0x8D40,/* 0xD8-0xDF */
+	0x8D45,0x8D46,0x8D48,0x8D49,0x8D47,0x8D4D,0x8D55,0x8D59,/* 0xE0-0xE7 */
+	0x89C7,0x89CA,0x89CB,0x89CC,0x89CE,0x89CF,0x89D0,0x89D1,/* 0xE8-0xEF */
+	0x726E,0x729F,0x725D,0x7266,0x726F,0x727E,0x727F,0x7284,/* 0xF0-0xF7 */
+	0x728B,0x728D,0x728F,0x7292,0x6308,0x6332,0x63B0,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x968C,0x968E,0x9691,0x9692,0x9693,0x9695,0x9696,0x969A,/* 0x40-0x47 */
+	0x969B,0x969D,0x969E,0x969F,0x96A0,0x96A1,0x96A2,0x96A3,/* 0x48-0x4F */
+	0x96A4,0x96A5,0x96A6,0x96A8,0x96A9,0x96AA,0x96AB,0x96AC,/* 0x50-0x57 */
+	0x96AD,0x96AE,0x96AF,0x96B1,0x96B2,0x96B4,0x96B5,0x96B7,/* 0x58-0x5F */
+	0x96B8,0x96BA,0x96BB,0x96BF,0x96C2,0x96C3,0x96C8,0x96CA,/* 0x60-0x67 */
+	0x96CB,0x96D0,0x96D1,0x96D3,0x96D4,0x96D6,0x96D7,0x96D8,/* 0x68-0x6F */
+	0x96D9,0x96DA,0x96DB,0x96DC,0x96DD,0x96DE,0x96DF,0x96E1,/* 0x70-0x77 */
+	0x96E2,0x96E3,0x96E4,0x96E5,0x96E6,0x96E7,0x96EB,0x0000,/* 0x78-0x7F */
+
+	0x96EC,0x96ED,0x96EE,0x96F0,0x96F1,0x96F2,0x96F4,0x96F5,/* 0x80-0x87 */
+	0x96F8,0x96FA,0x96FB,0x96FC,0x96FD,0x96FF,0x9702,0x9703,/* 0x88-0x8F */
+	0x9705,0x970A,0x970B,0x970C,0x9710,0x9711,0x9712,0x9714,/* 0x90-0x97 */
+	0x9715,0x9717,0x9718,0x9719,0x971A,0x971B,0x971D,0x971F,/* 0x98-0x9F */
+	0x9720,0x643F,0x64D8,0x8004,0x6BEA,0x6BF3,0x6BFD,0x6BF5,/* 0xA0-0xA7 */
+	0x6BF9,0x6C05,0x6C07,0x6C06,0x6C0D,0x6C15,0x6C18,0x6C19,/* 0xA8-0xAF */
+	0x6C1A,0x6C21,0x6C29,0x6C24,0x6C2A,0x6C32,0x6535,0x6555,/* 0xB0-0xB7 */
+	0x656B,0x724D,0x7252,0x7256,0x7230,0x8662,0x5216,0x809F,/* 0xB8-0xBF */
+	0x809C,0x8093,0x80BC,0x670A,0x80BD,0x80B1,0x80AB,0x80AD,/* 0xC0-0xC7 */
+	0x80B4,0x80B7,0x80E7,0x80E8,0x80E9,0x80EA,0x80DB,0x80C2,/* 0xC8-0xCF */
+	0x80C4,0x80D9,0x80CD,0x80D7,0x6710,0x80DD,0x80EB,0x80F1,/* 0xD0-0xD7 */
+	0x80F4,0x80ED,0x810D,0x810E,0x80F2,0x80FC,0x6715,0x8112,/* 0xD8-0xDF */
+	0x8C5A,0x8136,0x811E,0x812C,0x8118,0x8132,0x8148,0x814C,/* 0xE0-0xE7 */
+	0x8153,0x8174,0x8159,0x815A,0x8171,0x8160,0x8169,0x817C,/* 0xE8-0xEF */
+	0x817D,0x816D,0x8167,0x584D,0x5AB5,0x8188,0x8182,0x8191,/* 0xF0-0xF7 */
+	0x6ED5,0x81A3,0x81AA,0x81CC,0x6726,0x81CA,0x81BB,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9721,0x9722,0x9723,0x9724,0x9725,0x9726,0x9727,0x9728,/* 0x40-0x47 */
+	0x9729,0x972B,0x972C,0x972E,0x972F,0x9731,0x9733,0x9734,/* 0x48-0x4F */
+	0x9735,0x9736,0x9737,0x973A,0x973B,0x973C,0x973D,0x973F,/* 0x50-0x57 */
+	0x9740,0x9741,0x9742,0x9743,0x9744,0x9745,0x9746,0x9747,/* 0x58-0x5F */
+	0x9748,0x9749,0x974A,0x974B,0x974C,0x974D,0x974E,0x974F,/* 0x60-0x67 */
+	0x9750,0x9751,0x9754,0x9755,0x9757,0x9758,0x975A,0x975C,/* 0x68-0x6F */
+	0x975D,0x975F,0x9763,0x9764,0x9766,0x9767,0x9768,0x976A,/* 0x70-0x77 */
+	0x976B,0x976C,0x976D,0x976E,0x976F,0x9770,0x9771,0x0000,/* 0x78-0x7F */
+
+	0x9772,0x9775,0x9777,0x9778,0x9779,0x977A,0x977B,0x977D,/* 0x80-0x87 */
+	0x977E,0x977F,0x9780,0x9781,0x9782,0x9783,0x9784,0x9786,/* 0x88-0x8F */
+	0x9787,0x9788,0x9789,0x978A,0x978C,0x978E,0x978F,0x9790,/* 0x90-0x97 */
+	0x9793,0x9795,0x9796,0x9797,0x9799,0x979A,0x979B,0x979C,/* 0x98-0x9F */
+	0x979D,0x81C1,0x81A6,0x6B24,0x6B37,0x6B39,0x6B43,0x6B46,/* 0xA0-0xA7 */
+	0x6B59,0x98D1,0x98D2,0x98D3,0x98D5,0x98D9,0x98DA,0x6BB3,/* 0xA8-0xAF */
+	0x5F40,0x6BC2,0x89F3,0x6590,0x9F51,0x6593,0x65BC,0x65C6,/* 0xB0-0xB7 */
+	0x65C4,0x65C3,0x65CC,0x65CE,0x65D2,0x65D6,0x7080,0x709C,/* 0xB8-0xBF */
+	0x7096,0x709D,0x70BB,0x70C0,0x70B7,0x70AB,0x70B1,0x70E8,/* 0xC0-0xC7 */
+	0x70CA,0x7110,0x7113,0x7116,0x712F,0x7131,0x7173,0x715C,/* 0xC8-0xCF */
+	0x7168,0x7145,0x7172,0x714A,0x7178,0x717A,0x7198,0x71B3,/* 0xD0-0xD7 */
+	0x71B5,0x71A8,0x71A0,0x71E0,0x71D4,0x71E7,0x71F9,0x721D,/* 0xD8-0xDF */
+	0x7228,0x706C,0x7118,0x7166,0x71B9,0x623E,0x623D,0x6243,/* 0xE0-0xE7 */
+	0x6248,0x6249,0x793B,0x7940,0x7946,0x7949,0x795B,0x795C,/* 0xE8-0xEF */
+	0x7953,0x795A,0x7962,0x7957,0x7960,0x796F,0x7967,0x797A,/* 0xF0-0xF7 */
+	0x7985,0x798A,0x799A,0x79A7,0x79B3,0x5FD1,0x5FD0,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_ED[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x979E,0x979F,0x97A1,0x97A2,0x97A4,0x97A5,0x97A6,0x97A7,/* 0x40-0x47 */
+	0x97A8,0x97A9,0x97AA,0x97AC,0x97AE,0x97B0,0x97B1,0x97B3,/* 0x48-0x4F */
+	0x97B5,0x97B6,0x97B7,0x97B8,0x97B9,0x97BA,0x97BB,0x97BC,/* 0x50-0x57 */
+	0x97BD,0x97BE,0x97BF,0x97C0,0x97C1,0x97C2,0x97C3,0x97C4,/* 0x58-0x5F */
+	0x97C5,0x97C6,0x97C7,0x97C8,0x97C9,0x97CA,0x97CB,0x97CC,/* 0x60-0x67 */
+	0x97CD,0x97CE,0x97CF,0x97D0,0x97D1,0x97D2,0x97D3,0x97D4,/* 0x68-0x6F */
+	0x97D5,0x97D6,0x97D7,0x97D8,0x97D9,0x97DA,0x97DB,0x97DC,/* 0x70-0x77 */
+	0x97DD,0x97DE,0x97DF,0x97E0,0x97E1,0x97E2,0x97E3,0x0000,/* 0x78-0x7F */
+
+	0x97E4,0x97E5,0x97E8,0x97EE,0x97EF,0x97F0,0x97F1,0x97F2,/* 0x80-0x87 */
+	0x97F4,0x97F7,0x97F8,0x97F9,0x97FA,0x97FB,0x97FC,0x97FD,/* 0x88-0x8F */
+	0x97FE,0x97FF,0x9800,0x9801,0x9802,0x9803,0x9804,0x9805,/* 0x90-0x97 */
+	0x9806,0x9807,0x9808,0x9809,0x980A,0x980B,0x980C,0x980D,/* 0x98-0x9F */
+	0x980E,0x603C,0x605D,0x605A,0x6067,0x6041,0x6059,0x6063,/* 0xA0-0xA7 */
+	0x60AB,0x6106,0x610D,0x615D,0x61A9,0x619D,0x61CB,0x61D1,/* 0xA8-0xAF */
+	0x6206,0x8080,0x807F,0x6C93,0x6CF6,0x6DFC,0x77F6,0x77F8,/* 0xB0-0xB7 */
+	0x7800,0x7809,0x7817,0x7818,0x7811,0x65AB,0x782D,0x781C,/* 0xB8-0xBF */
+	0x781D,0x7839,0x783A,0x783B,0x781F,0x783C,0x7825,0x782C,/* 0xC0-0xC7 */
+	0x7823,0x7829,0x784E,0x786D,0x7856,0x7857,0x7826,0x7850,/* 0xC8-0xCF */
+	0x7847,0x784C,0x786A,0x789B,0x7893,0x789A,0x7887,0x789C,/* 0xD0-0xD7 */
+	0x78A1,0x78A3,0x78B2,0x78B9,0x78A5,0x78D4,0x78D9,0x78C9,/* 0xD8-0xDF */
+	0x78EC,0x78F2,0x7905,0x78F4,0x7913,0x7924,0x791E,0x7934,/* 0xE0-0xE7 */
+	0x9F9B,0x9EF9,0x9EFB,0x9EFC,0x76F1,0x7704,0x770D,0x76F9,/* 0xE8-0xEF */
+	0x7707,0x7708,0x771A,0x7722,0x7719,0x772D,0x7726,0x7735,/* 0xF0-0xF7 */
+	0x7738,0x7750,0x7751,0x7747,0x7743,0x775A,0x7768,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x980F,0x9810,0x9811,0x9812,0x9813,0x9814,0x9815,0x9816,/* 0x40-0x47 */
+	0x9817,0x9818,0x9819,0x981A,0x981B,0x981C,0x981D,0x981E,/* 0x48-0x4F */
+	0x981F,0x9820,0x9821,0x9822,0x9823,0x9824,0x9825,0x9826,/* 0x50-0x57 */
+	0x9827,0x9828,0x9829,0x982A,0x982B,0x982C,0x982D,0x982E,/* 0x58-0x5F */
+	0x982F,0x9830,0x9831,0x9832,0x9833,0x9834,0x9835,0x9836,/* 0x60-0x67 */
+	0x9837,0x9838,0x9839,0x983A,0x983B,0x983C,0x983D,0x983E,/* 0x68-0x6F */
+	0x983F,0x9840,0x9841,0x9842,0x9843,0x9844,0x9845,0x9846,/* 0x70-0x77 */
+	0x9847,0x9848,0x9849,0x984A,0x984B,0x984C,0x984D,0x0000,/* 0x78-0x7F */
+
+	0x984E,0x984F,0x9850,0x9851,0x9852,0x9853,0x9854,0x9855,/* 0x80-0x87 */
+	0x9856,0x9857,0x9858,0x9859,0x985A,0x985B,0x985C,0x985D,/* 0x88-0x8F */
+	0x985E,0x985F,0x9860,0x9861,0x9862,0x9863,0x9864,0x9865,/* 0x90-0x97 */
+	0x9866,0x9867,0x9868,0x9869,0x986A,0x986B,0x986C,0x986D,/* 0x98-0x9F */
+	0x986E,0x7762,0x7765,0x777F,0x778D,0x777D,0x7780,0x778C,/* 0xA0-0xA7 */
+	0x7791,0x779F,0x77A0,0x77B0,0x77B5,0x77BD,0x753A,0x7540,/* 0xA8-0xAF */
+	0x754E,0x754B,0x7548,0x755B,0x7572,0x7579,0x7583,0x7F58,/* 0xB0-0xB7 */
+	0x7F61,0x7F5F,0x8A48,0x7F68,0x7F74,0x7F71,0x7F79,0x7F81,/* 0xB8-0xBF */
+	0x7F7E,0x76CD,0x76E5,0x8832,0x9485,0x9486,0x9487,0x948B,/* 0xC0-0xC7 */
+	0x948A,0x948C,0x948D,0x948F,0x9490,0x9494,0x9497,0x9495,/* 0xC8-0xCF */
+	0x949A,0x949B,0x949C,0x94A3,0x94A4,0x94AB,0x94AA,0x94AD,/* 0xD0-0xD7 */
+	0x94AC,0x94AF,0x94B0,0x94B2,0x94B4,0x94B6,0x94B7,0x94B8,/* 0xD8-0xDF */
+	0x94B9,0x94BA,0x94BC,0x94BD,0x94BF,0x94C4,0x94C8,0x94C9,/* 0xE0-0xE7 */
+	0x94CA,0x94CB,0x94CC,0x94CD,0x94CE,0x94D0,0x94D1,0x94D2,/* 0xE8-0xEF */
+	0x94D5,0x94D6,0x94D7,0x94D9,0x94D8,0x94DB,0x94DE,0x94DF,/* 0xF0-0xF7 */
+	0x94E0,0x94E2,0x94E4,0x94E5,0x94E7,0x94E8,0x94EA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x986F,0x9870,0x9871,0x9872,0x9873,0x9874,0x988B,0x988E,/* 0x40-0x47 */
+	0x9892,0x9895,0x9899,0x98A3,0x98A8,0x98A9,0x98AA,0x98AB,/* 0x48-0x4F */
+	0x98AC,0x98AD,0x98AE,0x98AF,0x98B0,0x98B1,0x98B2,0x98B3,/* 0x50-0x57 */
+	0x98B4,0x98B5,0x98B6,0x98B7,0x98B8,0x98B9,0x98BA,0x98BB,/* 0x58-0x5F */
+	0x98BC,0x98BD,0x98BE,0x98BF,0x98C0,0x98C1,0x98C2,0x98C3,/* 0x60-0x67 */
+	0x98C4,0x98C5,0x98C6,0x98C7,0x98C8,0x98C9,0x98CA,0x98CB,/* 0x68-0x6F */
+	0x98CC,0x98CD,0x98CF,0x98D0,0x98D4,0x98D6,0x98D7,0x98DB,/* 0x70-0x77 */
+	0x98DC,0x98DD,0x98E0,0x98E1,0x98E2,0x98E3,0x98E4,0x0000,/* 0x78-0x7F */
+
+	0x98E5,0x98E6,0x98E9,0x98EA,0x98EB,0x98EC,0x98ED,0x98EE,/* 0x80-0x87 */
+	0x98EF,0x98F0,0x98F1,0x98F2,0x98F3,0x98F4,0x98F5,0x98F6,/* 0x88-0x8F */
+	0x98F7,0x98F8,0x98F9,0x98FA,0x98FB,0x98FC,0x98FD,0x98FE,/* 0x90-0x97 */
+	0x98FF,0x9900,0x9901,0x9902,0x9903,0x9904,0x9905,0x9906,/* 0x98-0x9F */
+	0x9907,0x94E9,0x94EB,0x94EE,0x94EF,0x94F3,0x94F4,0x94F5,/* 0xA0-0xA7 */
+	0x94F7,0x94F9,0x94FC,0x94FD,0x94FF,0x9503,0x9502,0x9506,/* 0xA8-0xAF */
+	0x9507,0x9509,0x950A,0x950D,0x950E,0x950F,0x9512,0x9513,/* 0xB0-0xB7 */
+	0x9514,0x9515,0x9516,0x9518,0x951B,0x951D,0x951E,0x951F,/* 0xB8-0xBF */
+	0x9522,0x952A,0x952B,0x9529,0x952C,0x9531,0x9532,0x9534,/* 0xC0-0xC7 */
+	0x9536,0x9537,0x9538,0x953C,0x953E,0x953F,0x9542,0x9535,/* 0xC8-0xCF */
+	0x9544,0x9545,0x9546,0x9549,0x954C,0x954E,0x954F,0x9552,/* 0xD0-0xD7 */
+	0x9553,0x9554,0x9556,0x9557,0x9558,0x9559,0x955B,0x955E,/* 0xD8-0xDF */
+	0x955F,0x955D,0x9561,0x9562,0x9564,0x9565,0x9566,0x9567,/* 0xE0-0xE7 */
+	0x9568,0x9569,0x956A,0x956B,0x956C,0x956F,0x9571,0x9572,/* 0xE8-0xEF */
+	0x9573,0x953A,0x77E7,0x77EC,0x96C9,0x79D5,0x79ED,0x79E3,/* 0xF0-0xF7 */
+	0x79EB,0x7A06,0x5D47,0x7A03,0x7A02,0x7A1E,0x7A14,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9908,0x9909,0x990A,0x990B,0x990C,0x990E,0x990F,0x9911,/* 0x40-0x47 */
+	0x9912,0x9913,0x9914,0x9915,0x9916,0x9917,0x9918,0x9919,/* 0x48-0x4F */
+	0x991A,0x991B,0x991C,0x991D,0x991E,0x991F,0x9920,0x9921,/* 0x50-0x57 */
+	0x9922,0x9923,0x9924,0x9925,0x9926,0x9927,0x9928,0x9929,/* 0x58-0x5F */
+	0x992A,0x992B,0x992C,0x992D,0x992F,0x9930,0x9931,0x9932,/* 0x60-0x67 */
+	0x9933,0x9934,0x9935,0x9936,0x9937,0x9938,0x9939,0x993A,/* 0x68-0x6F */
+	0x993B,0x993C,0x993D,0x993E,0x993F,0x9940,0x9941,0x9942,/* 0x70-0x77 */
+	0x9943,0x9944,0x9945,0x9946,0x9947,0x9948,0x9949,0x0000,/* 0x78-0x7F */
+
+	0x994A,0x994B,0x994C,0x994D,0x994E,0x994F,0x9950,0x9951,/* 0x80-0x87 */
+	0x9952,0x9953,0x9956,0x9957,0x9958,0x9959,0x995A,0x995B,/* 0x88-0x8F */
+	0x995C,0x995D,0x995E,0x995F,0x9960,0x9961,0x9962,0x9964,/* 0x90-0x97 */
+	0x9966,0x9973,0x9978,0x9979,0x997B,0x997E,0x9982,0x9983,/* 0x98-0x9F */
+	0x9989,0x7A39,0x7A37,0x7A51,0x9ECF,0x99A5,0x7A70,0x7688,/* 0xA0-0xA7 */
+	0x768E,0x7693,0x7699,0x76A4,0x74DE,0x74E0,0x752C,0x9E20,/* 0xA8-0xAF */
+	0x9E22,0x9E28,0x9E29,0x9E2A,0x9E2B,0x9E2C,0x9E32,0x9E31,/* 0xB0-0xB7 */
+	0x9E36,0x9E38,0x9E37,0x9E39,0x9E3A,0x9E3E,0x9E41,0x9E42,/* 0xB8-0xBF */
+	0x9E44,0x9E46,0x9E47,0x9E48,0x9E49,0x9E4B,0x9E4C,0x9E4E,/* 0xC0-0xC7 */
+	0x9E51,0x9E55,0x9E57,0x9E5A,0x9E5B,0x9E5C,0x9E5E,0x9E63,/* 0xC8-0xCF */
+	0x9E66,0x9E67,0x9E68,0x9E69,0x9E6A,0x9E6B,0x9E6C,0x9E71,/* 0xD0-0xD7 */
+	0x9E6D,0x9E73,0x7592,0x7594,0x7596,0x75A0,0x759D,0x75AC,/* 0xD8-0xDF */
+	0x75A3,0x75B3,0x75B4,0x75B8,0x75C4,0x75B1,0x75B0,0x75C3,/* 0xE0-0xE7 */
+	0x75C2,0x75D6,0x75CD,0x75E3,0x75E8,0x75E6,0x75E4,0x75EB,/* 0xE8-0xEF */
+	0x75E7,0x7603,0x75F1,0x75FC,0x75FF,0x7610,0x7600,0x7605,/* 0xF0-0xF7 */
+	0x760C,0x7617,0x760A,0x7625,0x7618,0x7615,0x7619,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x998C,0x998E,0x999A,0x999B,0x999C,0x999D,0x999E,0x999F,/* 0x40-0x47 */
+	0x99A0,0x99A1,0x99A2,0x99A3,0x99A4,0x99A6,0x99A7,0x99A9,/* 0x48-0x4F */
+	0x99AA,0x99AB,0x99AC,0x99AD,0x99AE,0x99AF,0x99B0,0x99B1,/* 0x50-0x57 */
+	0x99B2,0x99B3,0x99B4,0x99B5,0x99B6,0x99B7,0x99B8,0x99B9,/* 0x58-0x5F */
+	0x99BA,0x99BB,0x99BC,0x99BD,0x99BE,0x99BF,0x99C0,0x99C1,/* 0x60-0x67 */
+	0x99C2,0x99C3,0x99C4,0x99C5,0x99C6,0x99C7,0x99C8,0x99C9,/* 0x68-0x6F */
+	0x99CA,0x99CB,0x99CC,0x99CD,0x99CE,0x99CF,0x99D0,0x99D1,/* 0x70-0x77 */
+	0x99D2,0x99D3,0x99D4,0x99D5,0x99D6,0x99D7,0x99D8,0x0000,/* 0x78-0x7F */
+
+	0x99D9,0x99DA,0x99DB,0x99DC,0x99DD,0x99DE,0x99DF,0x99E0,/* 0x80-0x87 */
+	0x99E1,0x99E2,0x99E3,0x99E4,0x99E5,0x99E6,0x99E7,0x99E8,/* 0x88-0x8F */
+	0x99E9,0x99EA,0x99EB,0x99EC,0x99ED,0x99EE,0x99EF,0x99F0,/* 0x90-0x97 */
+	0x99F1,0x99F2,0x99F3,0x99F4,0x99F5,0x99F6,0x99F7,0x99F8,/* 0x98-0x9F */
+	0x99F9,0x761B,0x763C,0x7622,0x7620,0x7640,0x762D,0x7630,/* 0xA0-0xA7 */
+	0x763F,0x7635,0x7643,0x763E,0x7633,0x764D,0x765E,0x7654,/* 0xA8-0xAF */
+	0x765C,0x7656,0x766B,0x766F,0x7FCA,0x7AE6,0x7A78,0x7A79,/* 0xB0-0xB7 */
+	0x7A80,0x7A86,0x7A88,0x7A95,0x7AA6,0x7AA0,0x7AAC,0x7AA8,/* 0xB8-0xBF */
+	0x7AAD,0x7AB3,0x8864,0x8869,0x8872,0x887D,0x887F,0x8882,/* 0xC0-0xC7 */
+	0x88A2,0x88C6,0x88B7,0x88BC,0x88C9,0x88E2,0x88CE,0x88E3,/* 0xC8-0xCF */
+	0x88E5,0x88F1,0x891A,0x88FC,0x88E8,0x88FE,0x88F0,0x8921,/* 0xD0-0xD7 */
+	0x8919,0x8913,0x891B,0x890A,0x8934,0x892B,0x8936,0x8941,/* 0xD8-0xDF */
+	0x8966,0x897B,0x758B,0x80E5,0x76B2,0x76B4,0x77DC,0x8012,/* 0xE0-0xE7 */
+	0x8014,0x8016,0x801C,0x8020,0x8022,0x8025,0x8026,0x8027,/* 0xE8-0xEF */
+	0x8029,0x8028,0x8031,0x800B,0x8035,0x8043,0x8046,0x804D,/* 0xF0-0xF7 */
+	0x8052,0x8069,0x8071,0x8983,0x9878,0x9880,0x9883,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x99FA,0x99FB,0x99FC,0x99FD,0x99FE,0x99FF,0x9A00,0x9A01,/* 0x40-0x47 */
+	0x9A02,0x9A03,0x9A04,0x9A05,0x9A06,0x9A07,0x9A08,0x9A09,/* 0x48-0x4F */
+	0x9A0A,0x9A0B,0x9A0C,0x9A0D,0x9A0E,0x9A0F,0x9A10,0x9A11,/* 0x50-0x57 */
+	0x9A12,0x9A13,0x9A14,0x9A15,0x9A16,0x9A17,0x9A18,0x9A19,/* 0x58-0x5F */
+	0x9A1A,0x9A1B,0x9A1C,0x9A1D,0x9A1E,0x9A1F,0x9A20,0x9A21,/* 0x60-0x67 */
+	0x9A22,0x9A23,0x9A24,0x9A25,0x9A26,0x9A27,0x9A28,0x9A29,/* 0x68-0x6F */
+	0x9A2A,0x9A2B,0x9A2C,0x9A2D,0x9A2E,0x9A2F,0x9A30,0x9A31,/* 0x70-0x77 */
+	0x9A32,0x9A33,0x9A34,0x9A35,0x9A36,0x9A37,0x9A38,0x0000,/* 0x78-0x7F */
+
+	0x9A39,0x9A3A,0x9A3B,0x9A3C,0x9A3D,0x9A3E,0x9A3F,0x9A40,/* 0x80-0x87 */
+	0x9A41,0x9A42,0x9A43,0x9A44,0x9A45,0x9A46,0x9A47,0x9A48,/* 0x88-0x8F */
+	0x9A49,0x9A4A,0x9A4B,0x9A4C,0x9A4D,0x9A4E,0x9A4F,0x9A50,/* 0x90-0x97 */
+	0x9A51,0x9A52,0x9A53,0x9A54,0x9A55,0x9A56,0x9A57,0x9A58,/* 0x98-0x9F */
+	0x9A59,0x9889,0x988C,0x988D,0x988F,0x9894,0x989A,0x989B,/* 0xA0-0xA7 */
+	0x989E,0x989F,0x98A1,0x98A2,0x98A5,0x98A6,0x864D,0x8654,/* 0xA8-0xAF */
+	0x866C,0x866E,0x867F,0x867A,0x867C,0x867B,0x86A8,0x868D,/* 0xB0-0xB7 */
+	0x868B,0x86AC,0x869D,0x86A7,0x86A3,0x86AA,0x8693,0x86A9,/* 0xB8-0xBF */
+	0x86B6,0x86C4,0x86B5,0x86CE,0x86B0,0x86BA,0x86B1,0x86AF,/* 0xC0-0xC7 */
+	0x86C9,0x86CF,0x86B4,0x86E9,0x86F1,0x86F2,0x86ED,0x86F3,/* 0xC8-0xCF */
+	0x86D0,0x8713,0x86DE,0x86F4,0x86DF,0x86D8,0x86D1,0x8703,/* 0xD0-0xD7 */
+	0x8707,0x86F8,0x8708,0x870A,0x870D,0x8709,0x8723,0x873B,/* 0xD8-0xDF */
+	0x871E,0x8725,0x872E,0x871A,0x873E,0x8748,0x8734,0x8731,/* 0xE0-0xE7 */
+	0x8729,0x8737,0x873F,0x8782,0x8722,0x877D,0x877E,0x877B,/* 0xE8-0xEF */
+	0x8760,0x8770,0x874C,0x876E,0x878B,0x8753,0x8763,0x877C,/* 0xF0-0xF7 */
+	0x8764,0x8759,0x8765,0x8793,0x87AF,0x87A8,0x87D2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9A5A,0x9A5B,0x9A5C,0x9A5D,0x9A5E,0x9A5F,0x9A60,0x9A61,/* 0x40-0x47 */
+	0x9A62,0x9A63,0x9A64,0x9A65,0x9A66,0x9A67,0x9A68,0x9A69,/* 0x48-0x4F */
+	0x9A6A,0x9A6B,0x9A72,0x9A83,0x9A89,0x9A8D,0x9A8E,0x9A94,/* 0x50-0x57 */
+	0x9A95,0x9A99,0x9AA6,0x9AA9,0x9AAA,0x9AAB,0x9AAC,0x9AAD,/* 0x58-0x5F */
+	0x9AAE,0x9AAF,0x9AB2,0x9AB3,0x9AB4,0x9AB5,0x9AB9,0x9ABB,/* 0x60-0x67 */
+	0x9ABD,0x9ABE,0x9ABF,0x9AC3,0x9AC4,0x9AC6,0x9AC7,0x9AC8,/* 0x68-0x6F */
+	0x9AC9,0x9ACA,0x9ACD,0x9ACE,0x9ACF,0x9AD0,0x9AD2,0x9AD4,/* 0x70-0x77 */
+	0x9AD5,0x9AD6,0x9AD7,0x9AD9,0x9ADA,0x9ADB,0x9ADC,0x0000,/* 0x78-0x7F */
+
+	0x9ADD,0x9ADE,0x9AE0,0x9AE2,0x9AE3,0x9AE4,0x9AE5,0x9AE7,/* 0x80-0x87 */
+	0x9AE8,0x9AE9,0x9AEA,0x9AEC,0x9AEE,0x9AF0,0x9AF1,0x9AF2,/* 0x88-0x8F */
+	0x9AF3,0x9AF4,0x9AF5,0x9AF6,0x9AF7,0x9AF8,0x9AFA,0x9AFC,/* 0x90-0x97 */
+	0x9AFD,0x9AFE,0x9AFF,0x9B00,0x9B01,0x9B02,0x9B04,0x9B05,/* 0x98-0x9F */
+	0x9B06,0x87C6,0x8788,0x8785,0x87AD,0x8797,0x8783,0x87AB,/* 0xA0-0xA7 */
+	0x87E5,0x87AC,0x87B5,0x87B3,0x87CB,0x87D3,0x87BD,0x87D1,/* 0xA8-0xAF */
+	0x87C0,0x87CA,0x87DB,0x87EA,0x87E0,0x87EE,0x8816,0x8813,/* 0xB0-0xB7 */
+	0x87FE,0x880A,0x881B,0x8821,0x8839,0x883C,0x7F36,0x7F42,/* 0xB8-0xBF */
+	0x7F44,0x7F45,0x8210,0x7AFA,0x7AFD,0x7B08,0x7B03,0x7B04,/* 0xC0-0xC7 */
+	0x7B15,0x7B0A,0x7B2B,0x7B0F,0x7B47,0x7B38,0x7B2A,0x7B19,/* 0xC8-0xCF */
+	0x7B2E,0x7B31,0x7B20,0x7B25,0x7B24,0x7B33,0x7B3E,0x7B1E,/* 0xD0-0xD7 */
+	0x7B58,0x7B5A,0x7B45,0x7B75,0x7B4C,0x7B5D,0x7B60,0x7B6E,/* 0xD8-0xDF */
+	0x7B7B,0x7B62,0x7B72,0x7B71,0x7B90,0x7BA6,0x7BA7,0x7BB8,/* 0xE0-0xE7 */
+	0x7BAC,0x7B9D,0x7BA8,0x7B85,0x7BAA,0x7B9C,0x7BA2,0x7BAB,/* 0xE8-0xEF */
+	0x7BB4,0x7BD1,0x7BC1,0x7BCC,0x7BDD,0x7BDA,0x7BE5,0x7BE6,/* 0xF0-0xF7 */
+	0x7BEA,0x7C0C,0x7BFE,0x7BFC,0x7C0F,0x7C16,0x7C0B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9B07,0x9B09,0x9B0A,0x9B0B,0x9B0C,0x9B0D,0x9B0E,0x9B10,/* 0x40-0x47 */
+	0x9B11,0x9B12,0x9B14,0x9B15,0x9B16,0x9B17,0x9B18,0x9B19,/* 0x48-0x4F */
+	0x9B1A,0x9B1B,0x9B1C,0x9B1D,0x9B1E,0x9B20,0x9B21,0x9B22,/* 0x50-0x57 */
+	0x9B24,0x9B25,0x9B26,0x9B27,0x9B28,0x9B29,0x9B2A,0x9B2B,/* 0x58-0x5F */
+	0x9B2C,0x9B2D,0x9B2E,0x9B30,0x9B31,0x9B33,0x9B34,0x9B35,/* 0x60-0x67 */
+	0x9B36,0x9B37,0x9B38,0x9B39,0x9B3A,0x9B3D,0x9B3E,0x9B3F,/* 0x68-0x6F */
+	0x9B40,0x9B46,0x9B4A,0x9B4B,0x9B4C,0x9B4E,0x9B50,0x9B52,/* 0x70-0x77 */
+	0x9B53,0x9B55,0x9B56,0x9B57,0x9B58,0x9B59,0x9B5A,0x0000,/* 0x78-0x7F */
+
+	0x9B5B,0x9B5C,0x9B5D,0x9B5E,0x9B5F,0x9B60,0x9B61,0x9B62,/* 0x80-0x87 */
+	0x9B63,0x9B64,0x9B65,0x9B66,0x9B67,0x9B68,0x9B69,0x9B6A,/* 0x88-0x8F */
+	0x9B6B,0x9B6C,0x9B6D,0x9B6E,0x9B6F,0x9B70,0x9B71,0x9B72,/* 0x90-0x97 */
+	0x9B73,0x9B74,0x9B75,0x9B76,0x9B77,0x9B78,0x9B79,0x9B7A,/* 0x98-0x9F */
+	0x9B7B,0x7C1F,0x7C2A,0x7C26,0x7C38,0x7C41,0x7C40,0x81FE,/* 0xA0-0xA7 */
+	0x8201,0x8202,0x8204,0x81EC,0x8844,0x8221,0x8222,0x8223,/* 0xA8-0xAF */
+	0x822D,0x822F,0x8228,0x822B,0x8238,0x823B,0x8233,0x8234,/* 0xB0-0xB7 */
+	0x823E,0x8244,0x8249,0x824B,0x824F,0x825A,0x825F,0x8268,/* 0xB8-0xBF */
+	0x887E,0x8885,0x8888,0x88D8,0x88DF,0x895E,0x7F9D,0x7F9F,/* 0xC0-0xC7 */
+	0x7FA7,0x7FAF,0x7FB0,0x7FB2,0x7C7C,0x6549,0x7C91,0x7C9D,/* 0xC8-0xCF */
+	0x7C9C,0x7C9E,0x7CA2,0x7CB2,0x7CBC,0x7CBD,0x7CC1,0x7CC7,/* 0xD0-0xD7 */
+	0x7CCC,0x7CCD,0x7CC8,0x7CC5,0x7CD7,0x7CE8,0x826E,0x66A8,/* 0xD8-0xDF */
+	0x7FBF,0x7FCE,0x7FD5,0x7FE5,0x7FE1,0x7FE6,0x7FE9,0x7FEE,/* 0xE0-0xE7 */
+	0x7FF3,0x7CF8,0x7D77,0x7DA6,0x7DAE,0x7E47,0x7E9B,0x9EB8,/* 0xE8-0xEF */
+	0x9EB4,0x8D73,0x8D84,0x8D94,0x8D91,0x8DB1,0x8D67,0x8D6D,/* 0xF0-0xF7 */
+	0x8C47,0x8C49,0x914A,0x9150,0x914E,0x914F,0x9164,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9B7C,0x9B7D,0x9B7E,0x9B7F,0x9B80,0x9B81,0x9B82,0x9B83,/* 0x40-0x47 */
+	0x9B84,0x9B85,0x9B86,0x9B87,0x9B88,0x9B89,0x9B8A,0x9B8B,/* 0x48-0x4F */
+	0x9B8C,0x9B8D,0x9B8E,0x9B8F,0x9B90,0x9B91,0x9B92,0x9B93,/* 0x50-0x57 */
+	0x9B94,0x9B95,0x9B96,0x9B97,0x9B98,0x9B99,0x9B9A,0x9B9B,/* 0x58-0x5F */
+	0x9B9C,0x9B9D,0x9B9E,0x9B9F,0x9BA0,0x9BA1,0x9BA2,0x9BA3,/* 0x60-0x67 */
+	0x9BA4,0x9BA5,0x9BA6,0x9BA7,0x9BA8,0x9BA9,0x9BAA,0x9BAB,/* 0x68-0x6F */
+	0x9BAC,0x9BAD,0x9BAE,0x9BAF,0x9BB0,0x9BB1,0x9BB2,0x9BB3,/* 0x70-0x77 */
+	0x9BB4,0x9BB5,0x9BB6,0x9BB7,0x9BB8,0x9BB9,0x9BBA,0x0000,/* 0x78-0x7F */
+
+	0x9BBB,0x9BBC,0x9BBD,0x9BBE,0x9BBF,0x9BC0,0x9BC1,0x9BC2,/* 0x80-0x87 */
+	0x9BC3,0x9BC4,0x9BC5,0x9BC6,0x9BC7,0x9BC8,0x9BC9,0x9BCA,/* 0x88-0x8F */
+	0x9BCB,0x9BCC,0x9BCD,0x9BCE,0x9BCF,0x9BD0,0x9BD1,0x9BD2,/* 0x90-0x97 */
+	0x9BD3,0x9BD4,0x9BD5,0x9BD6,0x9BD7,0x9BD8,0x9BD9,0x9BDA,/* 0x98-0x9F */
+	0x9BDB,0x9162,0x9161,0x9170,0x9169,0x916F,0x917D,0x917E,/* 0xA0-0xA7 */
+	0x9172,0x9174,0x9179,0x918C,0x9185,0x9190,0x918D,0x9191,/* 0xA8-0xAF */
+	0x91A2,0x91A3,0x91AA,0x91AD,0x91AE,0x91AF,0x91B5,0x91B4,/* 0xB0-0xB7 */
+	0x91BA,0x8C55,0x9E7E,0x8DB8,0x8DEB,0x8E05,0x8E59,0x8E69,/* 0xB8-0xBF */
+	0x8DB5,0x8DBF,0x8DBC,0x8DBA,0x8DC4,0x8DD6,0x8DD7,0x8DDA,/* 0xC0-0xC7 */
+	0x8DDE,0x8DCE,0x8DCF,0x8DDB,0x8DC6,0x8DEC,0x8DF7,0x8DF8,/* 0xC8-0xCF */
+	0x8DE3,0x8DF9,0x8DFB,0x8DE4,0x8E09,0x8DFD,0x8E14,0x8E1D,/* 0xD0-0xD7 */
+	0x8E1F,0x8E2C,0x8E2E,0x8E23,0x8E2F,0x8E3A,0x8E40,0x8E39,/* 0xD8-0xDF */
+	0x8E35,0x8E3D,0x8E31,0x8E49,0x8E41,0x8E42,0x8E51,0x8E52,/* 0xE0-0xE7 */
+	0x8E4A,0x8E70,0x8E76,0x8E7C,0x8E6F,0x8E74,0x8E85,0x8E8F,/* 0xE8-0xEF */
+	0x8E94,0x8E90,0x8E9C,0x8E9E,0x8C78,0x8C82,0x8C8A,0x8C85,/* 0xF0-0xF7 */
+	0x8C98,0x8C94,0x659B,0x89D6,0x89DE,0x89DA,0x89DC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9BDC,0x9BDD,0x9BDE,0x9BDF,0x9BE0,0x9BE1,0x9BE2,0x9BE3,/* 0x40-0x47 */
+	0x9BE4,0x9BE5,0x9BE6,0x9BE7,0x9BE8,0x9BE9,0x9BEA,0x9BEB,/* 0x48-0x4F */
+	0x9BEC,0x9BED,0x9BEE,0x9BEF,0x9BF0,0x9BF1,0x9BF2,0x9BF3,/* 0x50-0x57 */
+	0x9BF4,0x9BF5,0x9BF6,0x9BF7,0x9BF8,0x9BF9,0x9BFA,0x9BFB,/* 0x58-0x5F */
+	0x9BFC,0x9BFD,0x9BFE,0x9BFF,0x9C00,0x9C01,0x9C02,0x9C03,/* 0x60-0x67 */
+	0x9C04,0x9C05,0x9C06,0x9C07,0x9C08,0x9C09,0x9C0A,0x9C0B,/* 0x68-0x6F */
+	0x9C0C,0x9C0D,0x9C0E,0x9C0F,0x9C10,0x9C11,0x9C12,0x9C13,/* 0x70-0x77 */
+	0x9C14,0x9C15,0x9C16,0x9C17,0x9C18,0x9C19,0x9C1A,0x0000,/* 0x78-0x7F */
+
+	0x9C1B,0x9C1C,0x9C1D,0x9C1E,0x9C1F,0x9C20,0x9C21,0x9C22,/* 0x80-0x87 */
+	0x9C23,0x9C24,0x9C25,0x9C26,0x9C27,0x9C28,0x9C29,0x9C2A,/* 0x88-0x8F */
+	0x9C2B,0x9C2C,0x9C2D,0x9C2E,0x9C2F,0x9C30,0x9C31,0x9C32,/* 0x90-0x97 */
+	0x9C33,0x9C34,0x9C35,0x9C36,0x9C37,0x9C38,0x9C39,0x9C3A,/* 0x98-0x9F */
+	0x9C3B,0x89E5,0x89EB,0x89EF,0x8A3E,0x8B26,0x9753,0x96E9,/* 0xA0-0xA7 */
+	0x96F3,0x96EF,0x9706,0x9701,0x9708,0x970F,0x970E,0x972A,/* 0xA8-0xAF */
+	0x972D,0x9730,0x973E,0x9F80,0x9F83,0x9F85,0x9F86,0x9F87,/* 0xB0-0xB7 */
+	0x9F88,0x9F89,0x9F8A,0x9F8C,0x9EFE,0x9F0B,0x9F0D,0x96B9,/* 0xB8-0xBF */
+	0x96BC,0x96BD,0x96CE,0x96D2,0x77BF,0x96E0,0x928E,0x92AE,/* 0xC0-0xC7 */
+	0x92C8,0x933E,0x936A,0x93CA,0x938F,0x943E,0x946B,0x9C7F,/* 0xC8-0xCF */
+	0x9C82,0x9C85,0x9C86,0x9C87,0x9C88,0x7A23,0x9C8B,0x9C8E,/* 0xD0-0xD7 */
+	0x9C90,0x9C91,0x9C92,0x9C94,0x9C95,0x9C9A,0x9C9B,0x9C9E,/* 0xD8-0xDF */
+	0x9C9F,0x9CA0,0x9CA1,0x9CA2,0x9CA3,0x9CA5,0x9CA6,0x9CA7,/* 0xE0-0xE7 */
+	0x9CA8,0x9CA9,0x9CAB,0x9CAD,0x9CAE,0x9CB0,0x9CB1,0x9CB2,/* 0xE8-0xEF */
+	0x9CB3,0x9CB4,0x9CB5,0x9CB6,0x9CB7,0x9CBA,0x9CBB,0x9CBC,/* 0xF0-0xF7 */
+	0x9CBD,0x9CC4,0x9CC5,0x9CC6,0x9CC7,0x9CCA,0x9CCB,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9C3C,0x9C3D,0x9C3E,0x9C3F,0x9C40,0x9C41,0x9C42,0x9C43,/* 0x40-0x47 */
+	0x9C44,0x9C45,0x9C46,0x9C47,0x9C48,0x9C49,0x9C4A,0x9C4B,/* 0x48-0x4F */
+	0x9C4C,0x9C4D,0x9C4E,0x9C4F,0x9C50,0x9C51,0x9C52,0x9C53,/* 0x50-0x57 */
+	0x9C54,0x9C55,0x9C56,0x9C57,0x9C58,0x9C59,0x9C5A,0x9C5B,/* 0x58-0x5F */
+	0x9C5C,0x9C5D,0x9C5E,0x9C5F,0x9C60,0x9C61,0x9C62,0x9C63,/* 0x60-0x67 */
+	0x9C64,0x9C65,0x9C66,0x9C67,0x9C68,0x9C69,0x9C6A,0x9C6B,/* 0x68-0x6F */
+	0x9C6C,0x9C6D,0x9C6E,0x9C6F,0x9C70,0x9C71,0x9C72,0x9C73,/* 0x70-0x77 */
+	0x9C74,0x9C75,0x9C76,0x9C77,0x9C78,0x9C79,0x9C7A,0x0000,/* 0x78-0x7F */
+
+	0x9C7B,0x9C7D,0x9C7E,0x9C80,0x9C83,0x9C84,0x9C89,0x9C8A,/* 0x80-0x87 */
+	0x9C8C,0x9C8F,0x9C93,0x9C96,0x9C97,0x9C98,0x9C99,0x9C9D,/* 0x88-0x8F */
+	0x9CAA,0x9CAC,0x9CAF,0x9CB9,0x9CBE,0x9CBF,0x9CC0,0x9CC1,/* 0x90-0x97 */
+	0x9CC2,0x9CC8,0x9CC9,0x9CD1,0x9CD2,0x9CDA,0x9CDB,0x9CE0,/* 0x98-0x9F */
+	0x9CE1,0x9CCC,0x9CCD,0x9CCE,0x9CCF,0x9CD0,0x9CD3,0x9CD4,/* 0xA0-0xA7 */
+	0x9CD5,0x9CD7,0x9CD8,0x9CD9,0x9CDC,0x9CDD,0x9CDF,0x9CE2,/* 0xA8-0xAF */
+	0x977C,0x9785,0x9791,0x9792,0x9794,0x97AF,0x97AB,0x97A3,/* 0xB0-0xB7 */
+	0x97B2,0x97B4,0x9AB1,0x9AB0,0x9AB7,0x9E58,0x9AB6,0x9ABA,/* 0xB8-0xBF */
+	0x9ABC,0x9AC1,0x9AC0,0x9AC5,0x9AC2,0x9ACB,0x9ACC,0x9AD1,/* 0xC0-0xC7 */
+	0x9B45,0x9B43,0x9B47,0x9B49,0x9B48,0x9B4D,0x9B51,0x98E8,/* 0xC8-0xCF */
+	0x990D,0x992E,0x9955,0x9954,0x9ADF,0x9AE1,0x9AE6,0x9AEF,/* 0xD0-0xD7 */
+	0x9AEB,0x9AFB,0x9AED,0x9AF9,0x9B08,0x9B0F,0x9B13,0x9B1F,/* 0xD8-0xDF */
+	0x9B23,0x9EBD,0x9EBE,0x7E3B,0x9E82,0x9E87,0x9E88,0x9E8B,/* 0xE0-0xE7 */
+	0x9E92,0x93D6,0x9E9D,0x9E9F,0x9EDB,0x9EDC,0x9EDD,0x9EE0,/* 0xE8-0xEF */
+	0x9EDF,0x9EE2,0x9EE9,0x9EE7,0x9EE5,0x9EEA,0x9EEF,0x9F22,/* 0xF0-0xF7 */
+	0x9F2C,0x9F2F,0x9F39,0x9F37,0x9F3D,0x9F3E,0x9F44,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9CE3,0x9CE4,0x9CE5,0x9CE6,0x9CE7,0x9CE8,0x9CE9,0x9CEA,/* 0x40-0x47 */
+	0x9CEB,0x9CEC,0x9CED,0x9CEE,0x9CEF,0x9CF0,0x9CF1,0x9CF2,/* 0x48-0x4F */
+	0x9CF3,0x9CF4,0x9CF5,0x9CF6,0x9CF7,0x9CF8,0x9CF9,0x9CFA,/* 0x50-0x57 */
+	0x9CFB,0x9CFC,0x9CFD,0x9CFE,0x9CFF,0x9D00,0x9D01,0x9D02,/* 0x58-0x5F */
+	0x9D03,0x9D04,0x9D05,0x9D06,0x9D07,0x9D08,0x9D09,0x9D0A,/* 0x60-0x67 */
+	0x9D0B,0x9D0C,0x9D0D,0x9D0E,0x9D0F,0x9D10,0x9D11,0x9D12,/* 0x68-0x6F */
+	0x9D13,0x9D14,0x9D15,0x9D16,0x9D17,0x9D18,0x9D19,0x9D1A,/* 0x70-0x77 */
+	0x9D1B,0x9D1C,0x9D1D,0x9D1E,0x9D1F,0x9D20,0x9D21,0x0000,/* 0x78-0x7F */
+
+	0x9D22,0x9D23,0x9D24,0x9D25,0x9D26,0x9D27,0x9D28,0x9D29,/* 0x80-0x87 */
+	0x9D2A,0x9D2B,0x9D2C,0x9D2D,0x9D2E,0x9D2F,0x9D30,0x9D31,/* 0x88-0x8F */
+	0x9D32,0x9D33,0x9D34,0x9D35,0x9D36,0x9D37,0x9D38,0x9D39,/* 0x90-0x97 */
+	0x9D3A,0x9D3B,0x9D3C,0x9D3D,0x9D3E,0x9D3F,0x9D40,0x9D41,/* 0x98-0x9F */
+	0x9D42,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_F9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9D43,0x9D44,0x9D45,0x9D46,0x9D47,0x9D48,0x9D49,0x9D4A,/* 0x40-0x47 */
+	0x9D4B,0x9D4C,0x9D4D,0x9D4E,0x9D4F,0x9D50,0x9D51,0x9D52,/* 0x48-0x4F */
+	0x9D53,0x9D54,0x9D55,0x9D56,0x9D57,0x9D58,0x9D59,0x9D5A,/* 0x50-0x57 */
+	0x9D5B,0x9D5C,0x9D5D,0x9D5E,0x9D5F,0x9D60,0x9D61,0x9D62,/* 0x58-0x5F */
+	0x9D63,0x9D64,0x9D65,0x9D66,0x9D67,0x9D68,0x9D69,0x9D6A,/* 0x60-0x67 */
+	0x9D6B,0x9D6C,0x9D6D,0x9D6E,0x9D6F,0x9D70,0x9D71,0x9D72,/* 0x68-0x6F */
+	0x9D73,0x9D74,0x9D75,0x9D76,0x9D77,0x9D78,0x9D79,0x9D7A,/* 0x70-0x77 */
+	0x9D7B,0x9D7C,0x9D7D,0x9D7E,0x9D7F,0x9D80,0x9D81,0x0000,/* 0x78-0x7F */
+
+	0x9D82,0x9D83,0x9D84,0x9D85,0x9D86,0x9D87,0x9D88,0x9D89,/* 0x80-0x87 */
+	0x9D8A,0x9D8B,0x9D8C,0x9D8D,0x9D8E,0x9D8F,0x9D90,0x9D91,/* 0x88-0x8F */
+	0x9D92,0x9D93,0x9D94,0x9D95,0x9D96,0x9D97,0x9D98,0x9D99,/* 0x90-0x97 */
+	0x9D9A,0x9D9B,0x9D9C,0x9D9D,0x9D9E,0x9D9F,0x9DA0,0x9DA1,/* 0x98-0x9F */
+	0x9DA2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_FA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9DA3,0x9DA4,0x9DA5,0x9DA6,0x9DA7,0x9DA8,0x9DA9,0x9DAA,/* 0x40-0x47 */
+	0x9DAB,0x9DAC,0x9DAD,0x9DAE,0x9DAF,0x9DB0,0x9DB1,0x9DB2,/* 0x48-0x4F */
+	0x9DB3,0x9DB4,0x9DB5,0x9DB6,0x9DB7,0x9DB8,0x9DB9,0x9DBA,/* 0x50-0x57 */
+	0x9DBB,0x9DBC,0x9DBD,0x9DBE,0x9DBF,0x9DC0,0x9DC1,0x9DC2,/* 0x58-0x5F */
+	0x9DC3,0x9DC4,0x9DC5,0x9DC6,0x9DC7,0x9DC8,0x9DC9,0x9DCA,/* 0x60-0x67 */
+	0x9DCB,0x9DCC,0x9DCD,0x9DCE,0x9DCF,0x9DD0,0x9DD1,0x9DD2,/* 0x68-0x6F */
+	0x9DD3,0x9DD4,0x9DD5,0x9DD6,0x9DD7,0x9DD8,0x9DD9,0x9DDA,/* 0x70-0x77 */
+	0x9DDB,0x9DDC,0x9DDD,0x9DDE,0x9DDF,0x9DE0,0x9DE1,0x0000,/* 0x78-0x7F */
+
+	0x9DE2,0x9DE3,0x9DE4,0x9DE5,0x9DE6,0x9DE7,0x9DE8,0x9DE9,/* 0x80-0x87 */
+	0x9DEA,0x9DEB,0x9DEC,0x9DED,0x9DEE,0x9DEF,0x9DF0,0x9DF1,/* 0x88-0x8F */
+	0x9DF2,0x9DF3,0x9DF4,0x9DF5,0x9DF6,0x9DF7,0x9DF8,0x9DF9,/* 0x90-0x97 */
+	0x9DFA,0x9DFB,0x9DFC,0x9DFD,0x9DFE,0x9DFF,0x9E00,0x9E01,/* 0x98-0x9F */
+	0x9E02,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_FB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9E03,0x9E04,0x9E05,0x9E06,0x9E07,0x9E08,0x9E09,0x9E0A,/* 0x40-0x47 */
+	0x9E0B,0x9E0C,0x9E0D,0x9E0E,0x9E0F,0x9E10,0x9E11,0x9E12,/* 0x48-0x4F */
+	0x9E13,0x9E14,0x9E15,0x9E16,0x9E17,0x9E18,0x9E19,0x9E1A,/* 0x50-0x57 */
+	0x9E1B,0x9E1C,0x9E1D,0x9E1E,0x9E24,0x9E27,0x9E2E,0x9E30,/* 0x58-0x5F */
+	0x9E34,0x9E3B,0x9E3C,0x9E40,0x9E4D,0x9E50,0x9E52,0x9E53,/* 0x60-0x67 */
+	0x9E54,0x9E56,0x9E59,0x9E5D,0x9E5F,0x9E60,0x9E61,0x9E62,/* 0x68-0x6F */
+	0x9E65,0x9E6E,0x9E6F,0x9E72,0x9E74,0x9E75,0x9E76,0x9E77,/* 0x70-0x77 */
+	0x9E78,0x9E79,0x9E7A,0x9E7B,0x9E7C,0x9E7D,0x9E80,0x0000,/* 0x78-0x7F */
+
+	0x9E81,0x9E83,0x9E84,0x9E85,0x9E86,0x9E89,0x9E8A,0x9E8C,/* 0x80-0x87 */
+	0x9E8D,0x9E8E,0x9E8F,0x9E90,0x9E91,0x9E94,0x9E95,0x9E96,/* 0x88-0x8F */
+	0x9E97,0x9E98,0x9E99,0x9E9A,0x9E9B,0x9E9C,0x9E9E,0x9EA0,/* 0x90-0x97 */
+	0x9EA1,0x9EA2,0x9EA3,0x9EA4,0x9EA5,0x9EA7,0x9EA8,0x9EA9,/* 0x98-0x9F */
+	0x9EAA,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_FC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9EAB,0x9EAC,0x9EAD,0x9EAE,0x9EAF,0x9EB0,0x9EB1,0x9EB2,/* 0x40-0x47 */
+	0x9EB3,0x9EB5,0x9EB6,0x9EB7,0x9EB9,0x9EBA,0x9EBC,0x9EBF,/* 0x48-0x4F */
+	0x9EC0,0x9EC1,0x9EC2,0x9EC3,0x9EC5,0x9EC6,0x9EC7,0x9EC8,/* 0x50-0x57 */
+	0x9ECA,0x9ECB,0x9ECC,0x9ED0,0x9ED2,0x9ED3,0x9ED5,0x9ED6,/* 0x58-0x5F */
+	0x9ED7,0x9ED9,0x9EDA,0x9EDE,0x9EE1,0x9EE3,0x9EE4,0x9EE6,/* 0x60-0x67 */
+	0x9EE8,0x9EEB,0x9EEC,0x9EED,0x9EEE,0x9EF0,0x9EF1,0x9EF2,/* 0x68-0x6F */
+	0x9EF3,0x9EF4,0x9EF5,0x9EF6,0x9EF7,0x9EF8,0x9EFA,0x9EFD,/* 0x70-0x77 */
+	0x9EFF,0x9F00,0x9F01,0x9F02,0x9F03,0x9F04,0x9F05,0x0000,/* 0x78-0x7F */
+
+	0x9F06,0x9F07,0x9F08,0x9F09,0x9F0A,0x9F0C,0x9F0F,0x9F11,/* 0x80-0x87 */
+	0x9F12,0x9F14,0x9F15,0x9F16,0x9F18,0x9F1A,0x9F1B,0x9F1C,/* 0x88-0x8F */
+	0x9F1D,0x9F1E,0x9F1F,0x9F21,0x9F23,0x9F24,0x9F25,0x9F26,/* 0x90-0x97 */
+	0x9F27,0x9F28,0x9F29,0x9F2A,0x9F2B,0x9F2D,0x9F2E,0x9F30,/* 0x98-0x9F */
+	0x9F31,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_FD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9F32,0x9F33,0x9F34,0x9F35,0x9F36,0x9F38,0x9F3A,0x9F3C,/* 0x40-0x47 */
+	0x9F3F,0x9F40,0x9F41,0x9F42,0x9F43,0x9F45,0x9F46,0x9F47,/* 0x48-0x4F */
+	0x9F48,0x9F49,0x9F4A,0x9F4B,0x9F4C,0x9F4D,0x9F4E,0x9F4F,/* 0x50-0x57 */
+	0x9F52,0x9F53,0x9F54,0x9F55,0x9F56,0x9F57,0x9F58,0x9F59,/* 0x58-0x5F */
+	0x9F5A,0x9F5B,0x9F5C,0x9F5D,0x9F5E,0x9F5F,0x9F60,0x9F61,/* 0x60-0x67 */
+	0x9F62,0x9F63,0x9F64,0x9F65,0x9F66,0x9F67,0x9F68,0x9F69,/* 0x68-0x6F */
+	0x9F6A,0x9F6B,0x9F6C,0x9F6D,0x9F6E,0x9F6F,0x9F70,0x9F71,/* 0x70-0x77 */
+	0x9F72,0x9F73,0x9F74,0x9F75,0x9F76,0x9F77,0x9F78,0x0000,/* 0x78-0x7F */
+
+	0x9F79,0x9F7A,0x9F7B,0x9F7C,0x9F7D,0x9F7E,0x9F81,0x9F82,/* 0x80-0x87 */
+	0x9F8D,0x9F8E,0x9F8F,0x9F90,0x9F91,0x9F92,0x9F93,0x9F94,/* 0x88-0x8F */
+	0x9F95,0x9F96,0x9F97,0x9F98,0x9F9C,0x9F9D,0x9F9E,0x9FA1,/* 0x90-0x97 */
+	0x9FA2,0x9FA3,0x9FA4,0x9FA5,0xF92C,0xF979,0xF995,0xF9E7,/* 0x98-0x9F */
+	0xF9F1,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_FE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0xFA0C,0xFA0D,0xFA0E,0xFA0F,0xFA11,0xFA13,0xFA14,0xFA18,/* 0x40-0x47 */
+	0xFA1F,0xFA20,0xFA21,0xFA23,0xFA24,0xFA27,0xFA28,0xFA29,/* 0x48-0x4F */
+};
+
+static const wchar_t *page_charset2uni[256] = {
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, 
+	c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, 
+	c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, 
+	c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, 
+	c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, 
+	c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, 
+	c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, 
+	c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, 
+	c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, 
+	c2u_C8, c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, 
+	c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, 
+	c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, 
+	c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, 
+	c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, 
+	c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, 
+	c2u_F8, c2u_F9, c2u_FA, c2u_FB, c2u_FC, c2u_FD, c2u_FE, NULL,   
+};
+
+static const unsigned char u2c_00[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xA1, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, /* 0xA4-0xA7 */
+	0xA1, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xA1, 0xE3, 0xA1, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA4, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC1, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xA8, 0xA4, 0xA8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xA8, 0xA8, 0xA8, 0xA6, 0xA8, 0xBA, 0x00, 0x00, /* 0xE8-0xEB */
+	0xA8, 0xAC, 0xA8, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xB0, 0xA8, 0xAE, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC2, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xA8, 0xB4, 0xA8, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */
+	0xA8, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_01[512] = {
+	0xA8, 0xA1, 0xA8, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xA5, 0xA8, 0xA5, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xA7, 0xA8, 0xA7, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xA9, 0xA8, 0xA9, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xA8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xA8, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xA8, 0xAD, 0xA8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0xA8, 0xB1, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xA8, 0xA3, 0xA8, 0xA3, 0xA8, 0xAB, /* 0xCC-0xCF */
+	0xA8, 0xAB, 0xA8, 0xAF, 0xA8, 0xAF, 0xA8, 0xB3, /* 0xD0-0xD3 */
+	0xA8, 0xB3, 0xA8, 0xB5, 0xA8, 0xB5, 0xA8, 0xB6, /* 0xD4-0xD7 */
+	0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB7, 0xA8, 0xB8, /* 0xD8-0xDB */
+	0xA8, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+};
+
+static const unsigned char u2c_02[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xA8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xA8, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA6, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xA1, 0xA5, 0xA8, 0x40, 0xA8, 0x41, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xA8, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+};
+
+static const unsigned char u2c_03[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xA6, 0xA1, 0xA6, 0xA2, 0xA6, 0xA3, /* 0x90-0x93 */
+	0xA6, 0xA4, 0xA6, 0xA5, 0xA6, 0xA6, 0xA6, 0xA7, /* 0x94-0x97 */
+	0xA6, 0xA8, 0xA6, 0xA9, 0xA6, 0xAA, 0xA6, 0xAB, /* 0x98-0x9B */
+	0xA6, 0xAC, 0xA6, 0xAD, 0xA6, 0xAE, 0xA6, 0xAF, /* 0x9C-0x9F */
+	0xA6, 0xB0, 0xA6, 0xB1, 0x00, 0x00, 0xA6, 0xB2, /* 0xA0-0xA3 */
+	0xA6, 0xB3, 0xA6, 0xB4, 0xA6, 0xB5, 0xA6, 0xB6, /* 0xA4-0xA7 */
+	0xA6, 0xB7, 0xA6, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xA6, 0xC1, 0xA6, 0xC2, 0xA6, 0xC3, /* 0xB0-0xB3 */
+	0xA6, 0xC4, 0xA6, 0xC5, 0xA6, 0xC6, 0xA6, 0xC7, /* 0xB4-0xB7 */
+	0xA6, 0xC8, 0xA6, 0xC9, 0xA6, 0xCA, 0xA6, 0xCB, /* 0xB8-0xBB */
+	0xA6, 0xCC, 0xA6, 0xCD, 0xA6, 0xCE, 0xA6, 0xCF, /* 0xBC-0xBF */
+	0xA6, 0xD0, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0xC0-0xC3 */
+	0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xD5, 0xA6, 0xD6, /* 0xC4-0xC7 */
+	0xA6, 0xD7, 0xA6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+};
+
+static const unsigned char u2c_04[512] = {
+	0x00, 0x00, 0xA7, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, 0xA7, 0xA4, /* 0x10-0x13 */
+	0xA7, 0xA5, 0xA7, 0xA6, 0xA7, 0xA8, 0xA7, 0xA9, /* 0x14-0x17 */
+	0xA7, 0xAA, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x18-0x1B */
+	0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x1C-0x1F */
+	0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xB5, /* 0x20-0x23 */
+	0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, 0xA7, 0xB9, /* 0x24-0x27 */
+	0xA7, 0xBA, 0xA7, 0xBB, 0xA7, 0xBC, 0xA7, 0xBD, /* 0x28-0x2B */
+	0xA7, 0xBE, 0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, /* 0x2C-0x2F */
+	0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, 0xA7, 0xD4, /* 0x30-0x33 */
+	0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD8, 0xA7, 0xD9, /* 0x34-0x37 */
+	0xA7, 0xDA, 0xA7, 0xDB, 0xA7, 0xDC, 0xA7, 0xDD, /* 0x38-0x3B */
+	0xA7, 0xDE, 0xA7, 0xDF, 0xA7, 0xE0, 0xA7, 0xE1, /* 0x3C-0x3F */
+	0xA7, 0xE2, 0xA7, 0xE3, 0xA7, 0xE4, 0xA7, 0xE5, /* 0x40-0x43 */
+	0xA7, 0xE6, 0xA7, 0xE7, 0xA7, 0xE8, 0xA7, 0xE9, /* 0x44-0x47 */
+	0xA7, 0xEA, 0xA7, 0xEB, 0xA7, 0xEC, 0xA7, 0xED, /* 0x48-0x4B */
+	0xA7, 0xEE, 0xA7, 0xEF, 0xA7, 0xF0, 0xA7, 0xF1, /* 0x4C-0x4F */
+	0x00, 0x00, 0xA7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+};
+
+static const unsigned char u2c_20[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xA9, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x43, /* 0x10-0x13 */
+	0xA1, 0xAA, 0xA8, 0x44, 0xA1, 0xAC, 0x00, 0x00, /* 0x14-0x17 */
+	0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xA8, 0x45, 0xA1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xA1, 0xEB, 0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, /* 0x30-0x33 */
+	0x00, 0x00, 0xA8, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF9, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xA3, 0xFE, 0x00, 0x00, /* 0x3C-0x3F */
+};
+
+static const unsigned char u2c_21[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE6, /* 0x00-0x03 */
+	0x00, 0x00, 0xA8, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xA8, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xED, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xA9, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, 0xA2, 0xF4, /* 0x60-0x63 */
+	0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, 0xA2, 0xF8, /* 0x64-0x67 */
+	0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, 0xA2, 0xFC, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xA2, 0xA1, 0xA2, 0xA2, 0xA2, 0xA3, 0xA2, 0xA4, /* 0x70-0x73 */
+	0xA2, 0xA5, 0xA2, 0xA6, 0xA2, 0xA7, 0xA2, 0xA8, /* 0x74-0x77 */
+	0xA2, 0xA9, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xA1, 0xFB, 0xA1, 0xFC, 0xA1, 0xFA, 0xA1, 0xFD, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0x49, 0xA8, 0x4A, /* 0x94-0x97 */
+	0xA8, 0x4B, 0xA8, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+};
+
+static const unsigned char u2c_22[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC7, /* 0x0C-0x0F */
+	0x00, 0x00, 0xA1, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xA8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xA1, 0xE3, 0x00, 0x00, 0xA1, 0xCC, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xDE, 0xA8, 0x4E, /* 0x1C-0x1F */
+	0xA1, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x4F, /* 0x20-0x23 */
+	0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0xA1, 0xC4, /* 0x24-0x27 */
+	0xA1, 0xC5, 0xA1, 0xC9, 0xA1, 0xC8, 0xA1, 0xD2, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xD3, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xA1, 0xE0, 0xA1, 0xDF, 0xA1, 0xC3, 0xA1, 0xCB, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xA1, 0xAB, 0xA1, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xA1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0x50, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA1, 0xD9, 0xA1, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xA1, 0xDC, 0xA1, 0xDD, 0xA8, 0x51, 0xA8, 0x52, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xDB, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xA8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xA1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x53, /* 0xBC-0xBF */
+};
+
+static const unsigned char u2c_23[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xD0, 0x00, 0x00, /* 0x10-0x13 */
+};
+
+static const unsigned char u2c_24[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA2, 0xD9, 0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, /* 0x60-0x63 */
+	0xA2, 0xDD, 0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, /* 0x64-0x67 */
+	0xA2, 0xE1, 0xA2, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xA2, 0xC5, 0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, /* 0x74-0x77 */
+	0xA2, 0xC9, 0xA2, 0xCA, 0xA2, 0xCB, 0xA2, 0xCC, /* 0x78-0x7B */
+	0xA2, 0xCD, 0xA2, 0xCE, 0xA2, 0xCF, 0xA2, 0xD0, /* 0x7C-0x7F */
+	
+	0xA2, 0xD1, 0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, /* 0x80-0x83 */
+	0xA2, 0xD5, 0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, /* 0x84-0x87 */
+	0xA2, 0xB1, 0xA2, 0xB2, 0xA2, 0xB3, 0xA2, 0xB4, /* 0x88-0x8B */
+	0xA2, 0xB5, 0xA2, 0xB6, 0xA2, 0xB7, 0xA2, 0xB8, /* 0x8C-0x8F */
+	0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x90-0x93 */
+	0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x94-0x97 */
+	0xA2, 0xC1, 0xA2, 0xC2, 0xA2, 0xC3, 0xA2, 0xC4, /* 0x98-0x9B */
+};
+
+static const unsigned char u2c_25[512] = {
+	0xA9, 0xA4, 0xA9, 0xA5, 0xA9, 0xA6, 0xA9, 0xA7, /* 0x00-0x03 */
+	0xA9, 0xA8, 0xA9, 0xA9, 0xA9, 0xAA, 0xA9, 0xAB, /* 0x04-0x07 */
+	0xA9, 0xAC, 0xA9, 0xAD, 0xA9, 0xAE, 0xA9, 0xAF, /* 0x08-0x0B */
+	0xA9, 0xB0, 0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, /* 0x0C-0x0F */
+	0xA9, 0xB4, 0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x10-0x13 */
+	0xA9, 0xB8, 0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, /* 0x14-0x17 */
+	0xA9, 0xBC, 0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, /* 0x18-0x1B */
+	0xA9, 0xC0, 0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, /* 0x1C-0x1F */
+	0xA9, 0xC4, 0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, /* 0x20-0x23 */
+	0xA9, 0xC8, 0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, /* 0x24-0x27 */
+	0xA9, 0xCC, 0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, /* 0x28-0x2B */
+	0xA9, 0xD0, 0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, /* 0x2C-0x2F */
+	0xA9, 0xD4, 0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, /* 0x30-0x33 */
+	0xA9, 0xD8, 0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, /* 0x34-0x37 */
+	0xA9, 0xDC, 0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, /* 0x38-0x3B */
+	0xA9, 0xE0, 0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, /* 0x3C-0x3F */
+	0xA9, 0xE4, 0xA9, 0xE5, 0xA9, 0xE6, 0xA9, 0xE7, /* 0x40-0x43 */
+	0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, 0xA9, 0xEB, /* 0x44-0x47 */
+	0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, 0xA9, 0xEF, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xA8, 0x54, 0xA8, 0x55, 0xA8, 0x56, 0xA8, 0x57, /* 0x50-0x53 */
+	0xA8, 0x58, 0xA8, 0x59, 0xA8, 0x5A, 0xA8, 0x5B, /* 0x54-0x57 */
+	0xA8, 0x5C, 0xA8, 0x5D, 0xA8, 0x5E, 0xA8, 0x5F, /* 0x58-0x5B */
+	0xA8, 0x60, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x5C-0x5F */
+	0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x60-0x63 */
+	0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x64-0x67 */
+	0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x68-0x6B */
+	0xA8, 0x70, 0xA8, 0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x6C-0x6F */
+	0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, 0xA8, 0x77, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0x80-0x83 */
+	0xA8, 0x7B, 0xA8, 0x7C, 0xA8, 0x7D, 0xA8, 0x7E, /* 0x84-0x87 */
+	0xA8, 0x80, 0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, /* 0x88-0x8B */
+	0xA8, 0x84, 0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x88, /* 0x90-0x93 */
+	0xA8, 0x89, 0xA8, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xA1, 0xF6, 0xA1, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF7, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xA8, 0x8B, 0xA8, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xF4, 0xA1, 0xF3, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF0, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xF2, 0xA1, 0xF1, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0x8D, 0xA8, 0x8E, /* 0xE0-0xE3 */
+	0xA8, 0x8F, 0xA8, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_26[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xA8, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xA1, 0xE2, 0x00, 0x00, 0xA1, 0xE1, 0x00, 0x00, /* 0x40-0x43 */
+};
+
+static const unsigned char u2c_30[512] = {
+	0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */
+	0x00, 0x00, 0xA1, 0xA9, 0xA9, 0x65, 0xA9, 0x96, /* 0x04-0x07 */
+	0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */
+	0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */
+	0xA1, 0xBE, 0xA1, 0xBF, 0xA8, 0x93, 0xA1, 0xFE, /* 0x10-0x13 */
+	0xA1, 0xB2, 0xA1, 0xB3, 0xA1, 0xBC, 0xA1, 0xBD, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xA8, 0x94, 0xA8, 0x95, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xA9, 0x40, 0xA9, 0x41, 0xA9, 0x42, /* 0x20-0x23 */
+	0xA9, 0x43, 0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, /* 0x24-0x27 */
+	0xA9, 0x47, 0xA9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x40-0x43 */
+	0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0x44-0x47 */
+	0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x48-0x4B */
+	0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x4C-0x4F */
+	0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x50-0x53 */
+	0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x54-0x57 */
+	0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x58-0x5B */
+	0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x5C-0x5F */
+	0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x60-0x63 */
+	0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x64-0x67 */
+	0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x68-0x6B */
+	0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x6C-0x6F */
+	0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x70-0x73 */
+	0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x74-0x77 */
+	0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x78-0x7B */
+	0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x7C-0x7F */
+	
+	0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x80-0x83 */
+	0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x84-0x87 */
+	0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x88-0x8B */
+	0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x8C-0x8F */
+	0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x61, /* 0x98-0x9B */
+	0xA9, 0x62, 0xA9, 0x66, 0xA9, 0x67, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, /* 0xA0-0xA3 */
+	0xA5, 0xA4, 0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, /* 0xA4-0xA7 */
+	0xA5, 0xA8, 0xA5, 0xA9, 0xA5, 0xAA, 0xA5, 0xAB, /* 0xA8-0xAB */
+	0xA5, 0xAC, 0xA5, 0xAD, 0xA5, 0xAE, 0xA5, 0xAF, /* 0xAC-0xAF */
+	0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0xB0-0xB3 */
+	0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0xB4-0xB7 */
+	0xA5, 0xB8, 0xA5, 0xB9, 0xA5, 0xBA, 0xA5, 0xBB, /* 0xB8-0xBB */
+	0xA5, 0xBC, 0xA5, 0xBD, 0xA5, 0xBE, 0xA5, 0xBF, /* 0xBC-0xBF */
+	0xA5, 0xC0, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0xC0-0xC3 */
+	0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0xC4-0xC7 */
+	0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0xC8-0xCB */
+	0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0xCC-0xCF */
+	0xA5, 0xD0, 0xA5, 0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0xD0-0xD3 */
+	0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, 0xA5, 0xD7, /* 0xD4-0xD7 */
+	0xA5, 0xD8, 0xA5, 0xD9, 0xA5, 0xDA, 0xA5, 0xDB, /* 0xD8-0xDB */
+	0xA5, 0xDC, 0xA5, 0xDD, 0xA5, 0xDE, 0xA5, 0xDF, /* 0xDC-0xDF */
+	0xA5, 0xE0, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xE0-0xE3 */
+	0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xE4-0xE7 */
+	0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xE8-0xEB */
+	0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xEC-0xEF */
+	0xA5, 0xF0, 0xA5, 0xF1, 0xA5, 0xF2, 0xA5, 0xF3, /* 0xF0-0xF3 */
+	0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xA9, 0x60, 0xA9, 0x63, 0xA9, 0x64, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_31[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, /* 0x04-0x07 */
+	0xA8, 0xC8, 0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, /* 0x08-0x0B */
+	0xA8, 0xCC, 0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, /* 0x0C-0x0F */
+	0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, /* 0x10-0x13 */
+	0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, /* 0x14-0x17 */
+	0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, /* 0x18-0x1B */
+	0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, /* 0x1C-0x1F */
+	0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, 0xA8, 0xE3, /* 0x20-0x23 */
+	0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, 0xA8, 0xE7, /* 0x24-0x27 */
+	0xA8, 0xE8, 0xA8, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xD2, 0xBB, 0xB6, 0xFE, /* 0x90-0x93 */
+	0xC8, 0xFD, 0xCB, 0xC4, 0xC9, 0xCF, 0xD6, 0xD0, /* 0x94-0x97 */
+	0xCF, 0xC2, 0xBC, 0xD7, 0xD2, 0xD2, 0xB1, 0xFB, /* 0x98-0x9B */
+	0xB6, 0xA1, 0xCC, 0xEC, 0xB5, 0xD8, 0xC8, 0xCB, /* 0x9C-0x9F */
+};
+
+static const unsigned char u2c_32[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xA2, 0xE5, 0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, /* 0x20-0x23 */
+	0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, 0xA2, 0xEC, /* 0x24-0x27 */
+	0xA2, 0xED, 0xA2, 0xEE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x28-0x2B */
+	0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x2C-0x2F */
+	0xC8, 0xD5, 0xA9, 0x5A, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x30-0x33 */
+	0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x34-0x37 */
+	0xC0, 0xCD, 0xB4, 0xFA, 0xBA, 0xF4, 0xD1, 0xA7, /* 0x38-0x3B */
+	0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0x3C-0x3F */
+	0xBC, 0xC0, 0xD0, 0xDD, 0xD7, 0xD4, 0xD6, 0xC1, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xD2, 0xBB, 0xB6, 0xFE, 0xC8, 0xFD, 0xCB, 0xC4, /* 0x80-0x83 */
+	0xCE, 0xE5, 0xC1, 0xF9, 0xC6, 0xDF, 0xB0, 0xCB, /* 0x84-0x87 */
+	0xBE, 0xC5, 0xCA, 0xAE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x88-0x8B */
+	0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x8C-0x8F */
+	0xC8, 0xD5, 0xD6, 0xEA, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x90-0x93 */
+	0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x94-0x97 */
+	0xC0, 0xCD, 0xC3, 0xD8, 0xC4, 0xD0, 0xC5, 0xAE, /* 0x98-0x9B */
+	0xCA, 0xCA, 0xD3, 0xC5, 0x00, 0x00, 0xD7, 0xA2, /* 0x9C-0x9F */
+	0xCF, 0xEE, 0xD0, 0xDD, 0xD0, 0xB4, 0xA9, 0x49, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xD2, 0xBD, 0xD7, 0xDA, 0xD1, 0xA7, /* 0xA8-0xAB */
+	0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0xAC-0xAF */
+	0xD2, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+};
+
+static const unsigned char u2c_33[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xA9, 0x4A, 0xA9, 0x4B, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xA9, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xA9, 0x52, 0xA9, 0x53, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xA9, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+};
+
+static const unsigned char u2c_4E[512] = {
+	0xD2, 0xBB, 0xB6, 0xA1, 0x81, 0x40, 0xC6, 0xDF, /* 0x00-0x03 */
+	0x81, 0x41, 0x81, 0x42, 0x81, 0x43, 0xCD, 0xF2, /* 0x04-0x07 */
+	0xD5, 0xC9, 0xC8, 0xFD, 0xC9, 0xCF, 0xCF, 0xC2, /* 0x08-0x0B */
+	0xD8, 0xA2, 0xB2, 0xBB, 0xD3, 0xEB, 0x81, 0x44, /* 0x0C-0x0F */
+	0xD8, 0xA4, 0xB3, 0xF3, 0x81, 0x45, 0xD7, 0xA8, /* 0x10-0x13 */
+	0xC7, 0xD2, 0xD8, 0xA7, 0xCA, 0xC0, 0x81, 0x46, /* 0x14-0x17 */
+	0xC7, 0xF0, 0xB1, 0xFB, 0xD2, 0xB5, 0xB4, 0xD4, /* 0x18-0x1B */
+	0xB6, 0xAB, 0xCB, 0xBF, 0xD8, 0xA9, 0x81, 0x47, /* 0x1C-0x1F */
+	0x81, 0x48, 0x81, 0x49, 0xB6, 0xAA, 0x81, 0x4A, /* 0x20-0x23 */
+	0xC1, 0xBD, 0xD1, 0xCF, 0x81, 0x4B, 0xC9, 0xA5, /* 0x24-0x27 */
+	0xD8, 0xAD, 0x81, 0x4C, 0xB8, 0xF6, 0xD1, 0xBE, /* 0x28-0x2B */
+	0xE3, 0xDC, 0xD6, 0xD0, 0x81, 0x4D, 0x81, 0x4E, /* 0x2C-0x2F */
+	0xB7, 0xE1, 0x81, 0x4F, 0xB4, 0xAE, 0x81, 0x50, /* 0x30-0x33 */
+	0xC1, 0xD9, 0x81, 0x51, 0xD8, 0xBC, 0x81, 0x52, /* 0x34-0x37 */
+	0xCD, 0xE8, 0xB5, 0xA4, 0xCE, 0xAA, 0xD6, 0xF7, /* 0x38-0x3B */
+	0x81, 0x53, 0xC0, 0xF6, 0xBE, 0xD9, 0xD8, 0xAF, /* 0x3C-0x3F */
+	0x81, 0x54, 0x81, 0x55, 0x81, 0x56, 0xC4, 0xCB, /* 0x40-0x43 */
+	0x81, 0x57, 0xBE, 0xC3, 0x81, 0x58, 0xD8, 0xB1, /* 0x44-0x47 */
+	0xC3, 0xB4, 0xD2, 0xE5, 0x81, 0x59, 0xD6, 0xAE, /* 0x48-0x4B */
+	0xCE, 0xDA, 0xD5, 0xA7, 0xBA, 0xF5, 0xB7, 0xA6, /* 0x4C-0x4F */
+	0xC0, 0xD6, 0x81, 0x5A, 0xC6, 0xB9, 0xC5, 0xD2, /* 0x50-0x53 */
+	0xC7, 0xC7, 0x81, 0x5B, 0xB9, 0xD4, 0x81, 0x5C, /* 0x54-0x57 */
+	0xB3, 0xCB, 0xD2, 0xD2, 0x81, 0x5D, 0x81, 0x5E, /* 0x58-0x5B */
+	0xD8, 0xBF, 0xBE, 0xC5, 0xC6, 0xF2, 0xD2, 0xB2, /* 0x5C-0x5F */
+	0xCF, 0xB0, 0xCF, 0xE7, 0x81, 0x5F, 0x81, 0x60, /* 0x60-0x63 */
+	0x81, 0x61, 0x81, 0x62, 0xCA, 0xE9, 0x81, 0x63, /* 0x64-0x67 */
+	0x81, 0x64, 0xD8, 0xC0, 0x81, 0x65, 0x81, 0x66, /* 0x68-0x6B */
+	0x81, 0x67, 0x81, 0x68, 0x81, 0x69, 0x81, 0x6A, /* 0x6C-0x6F */
+	0xC2, 0xF2, 0xC2, 0xD2, 0x81, 0x6B, 0xC8, 0xE9, /* 0x70-0x73 */
+	0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x74-0x77 */
+	0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0x81, 0x73, /* 0x78-0x7B */
+	0x81, 0x74, 0x81, 0x75, 0xC7, 0xAC, 0x81, 0x76, /* 0x7C-0x7F */
+	
+	0x81, 0x77, 0x81, 0x78, 0x81, 0x79, 0x81, 0x7A, /* 0x80-0x83 */
+	0x81, 0x7B, 0x81, 0x7C, 0xC1, 0xCB, 0x81, 0x7D, /* 0x84-0x87 */
+	0xD3, 0xE8, 0xD5, 0xF9, 0x81, 0x7E, 0xCA, 0xC2, /* 0x88-0x8B */
+	0xB6, 0xFE, 0xD8, 0xA1, 0xD3, 0xDA, 0xBF, 0xF7, /* 0x8C-0x8F */
+	0x81, 0x80, 0xD4, 0xC6, 0xBB, 0xA5, 0xD8, 0xC1, /* 0x90-0x93 */
+	0xCE, 0xE5, 0xBE, 0xAE, 0x81, 0x81, 0x81, 0x82, /* 0x94-0x97 */
+	0xD8, 0xA8, 0x81, 0x83, 0xD1, 0xC7, 0xD0, 0xA9, /* 0x98-0x9B */
+	0x81, 0x84, 0x81, 0x85, 0x81, 0x86, 0xD8, 0xBD, /* 0x9C-0x9F */
+	0xD9, 0xEF, 0xCD, 0xF6, 0xBF, 0xBA, 0x81, 0x87, /* 0xA0-0xA3 */
+	0xBD, 0xBB, 0xBA, 0xA5, 0xD2, 0xE0, 0xB2, 0xFA, /* 0xA4-0xA7 */
+	0xBA, 0xE0, 0xC4, 0xB6, 0x81, 0x88, 0xCF, 0xED, /* 0xA8-0xAB */
+	0xBE, 0xA9, 0xCD, 0xA4, 0xC1, 0xC1, 0x81, 0x89, /* 0xAC-0xAF */
+	0x81, 0x8A, 0x81, 0x8B, 0xC7, 0xD7, 0xD9, 0xF1, /* 0xB0-0xB3 */
+	0x81, 0x8C, 0xD9, 0xF4, 0x81, 0x8D, 0x81, 0x8E, /* 0xB4-0xB7 */
+	0x81, 0x8F, 0x81, 0x90, 0xC8, 0xCB, 0xD8, 0xE9, /* 0xB8-0xBB */
+	0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0xD2, 0xDA, /* 0xBC-0xBF */
+	0xCA, 0xB2, 0xC8, 0xCA, 0xD8, 0xEC, 0xD8, 0xEA, /* 0xC0-0xC3 */
+	0xD8, 0xC6, 0xBD, 0xF6, 0xC6, 0xCD, 0xB3, 0xF0, /* 0xC4-0xC7 */
+	0x81, 0x94, 0xD8, 0xEB, 0xBD, 0xF1, 0xBD, 0xE9, /* 0xC8-0xCB */
+	0x81, 0x95, 0xC8, 0xD4, 0xB4, 0xD3, 0x81, 0x96, /* 0xCC-0xCF */
+	0x81, 0x97, 0xC2, 0xD8, 0x81, 0x98, 0xB2, 0xD6, /* 0xD0-0xD3 */
+	0xD7, 0xD0, 0xCA, 0xCB, 0xCB, 0xFB, 0xD5, 0xCC, /* 0xD4-0xD7 */
+	0xB8, 0xB6, 0xCF, 0xC9, 0x81, 0x99, 0x81, 0x9A, /* 0xD8-0xDB */
+	0x81, 0x9B, 0xD9, 0xDA, 0xD8, 0xF0, 0xC7, 0xAA, /* 0xDC-0xDF */
+	0x81, 0x9C, 0xD8, 0xEE, 0x81, 0x9D, 0xB4, 0xFA, /* 0xE0-0xE3 */
+	0xC1, 0xEE, 0xD2, 0xD4, 0x81, 0x9E, 0x81, 0x9F, /* 0xE4-0xE7 */
+	0xD8, 0xED, 0x81, 0xA0, 0xD2, 0xC7, 0xD8, 0xEF, /* 0xE8-0xEB */
+	0xC3, 0xC7, 0x81, 0xA1, 0x81, 0xA2, 0x81, 0xA3, /* 0xEC-0xEF */
+	0xD1, 0xF6, 0x81, 0xA4, 0xD6, 0xD9, 0xD8, 0xF2, /* 0xF0-0xF3 */
+	0x81, 0xA5, 0xD8, 0xF5, 0xBC, 0xFE, 0xBC, 0xDB, /* 0xF4-0xF7 */
+	0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, 0xC8, 0xCE, /* 0xF8-0xFB */
+	0x81, 0xA9, 0xB7, 0xDD, 0x81, 0xAA, 0xB7, 0xC2, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_4F[512] = {
+	0x81, 0xAB, 0xC6, 0xF3, 0x81, 0xAC, 0x81, 0xAD, /* 0x00-0x03 */
+	0x81, 0xAE, 0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, /* 0x04-0x07 */
+	0x81, 0xB2, 0xD8, 0xF8, 0xD2, 0xC1, 0x81, 0xB3, /* 0x08-0x0B */
+	0x81, 0xB4, 0xCE, 0xE9, 0xBC, 0xBF, 0xB7, 0xFC, /* 0x0C-0x0F */
+	0xB7, 0xA5, 0xD0, 0xDD, 0x81, 0xB5, 0x81, 0xB6, /* 0x10-0x13 */
+	0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, 0xD6, 0xDA, /* 0x14-0x17 */
+	0xD3, 0xC5, 0xBB, 0xEF, 0xBB, 0xE1, 0xD8, 0xF1, /* 0x18-0x1B */
+	0x81, 0xBA, 0x81, 0xBB, 0xC9, 0xA1, 0xCE, 0xB0, /* 0x1C-0x1F */
+	0xB4, 0xAB, 0x81, 0xBC, 0xD8, 0xF3, 0x81, 0xBD, /* 0x20-0x23 */
+	0xC9, 0xCB, 0xD8, 0xF6, 0xC2, 0xD7, 0xD8, 0xF7, /* 0x24-0x27 */
+	0x81, 0xBE, 0x81, 0xBF, 0xCE, 0xB1, 0xD8, 0xF9, /* 0x28-0x2B */
+	0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0xB2, 0xAE, /* 0x2C-0x2F */
+	0xB9, 0xC0, 0x81, 0xC3, 0xD9, 0xA3, 0x81, 0xC4, /* 0x30-0x33 */
+	0xB0, 0xE9, 0x81, 0xC5, 0xC1, 0xE6, 0x81, 0xC6, /* 0x34-0x37 */
+	0xC9, 0xEC, 0x81, 0xC7, 0xCB, 0xC5, 0x81, 0xC8, /* 0x38-0x3B */
+	0xCB, 0xC6, 0xD9, 0xA4, 0x81, 0xC9, 0x81, 0xCA, /* 0x3C-0x3F */
+	0x81, 0xCB, 0x81, 0xCC, 0x81, 0xCD, 0xB5, 0xE8, /* 0x40-0x43 */
+	0x81, 0xCE, 0x81, 0xCF, 0xB5, 0xAB, 0x81, 0xD0, /* 0x44-0x47 */
+	0x81, 0xD1, 0x81, 0xD2, 0x81, 0xD3, 0x81, 0xD4, /* 0x48-0x4B */
+	0x81, 0xD5, 0xCE, 0xBB, 0xB5, 0xCD, 0xD7, 0xA1, /* 0x4C-0x4F */
+	0xD7, 0xF4, 0xD3, 0xD3, 0x81, 0xD6, 0xCC, 0xE5, /* 0x50-0x53 */
+	0x81, 0xD7, 0xBA, 0xCE, 0x81, 0xD8, 0xD9, 0xA2, /* 0x54-0x57 */
+	0xD9, 0xDC, 0xD3, 0xE0, 0xD8, 0xFD, 0xB7, 0xF0, /* 0x58-0x5B */
+	0xD7, 0xF7, 0xD8, 0xFE, 0xD8, 0xFA, 0xD9, 0xA1, /* 0x5C-0x5F */
+	0xC4, 0xE3, 0x81, 0xD9, 0x81, 0xDA, 0xD3, 0xB6, /* 0x60-0x63 */
+	0xD8, 0xF4, 0xD9, 0xDD, 0x81, 0xDB, 0xD8, 0xFB, /* 0x64-0x67 */
+	0x81, 0xDC, 0xC5, 0xE5, 0x81, 0xDD, 0x81, 0xDE, /* 0x68-0x6B */
+	0xC0, 0xD0, 0x81, 0xDF, 0x81, 0xE0, 0xD1, 0xF0, /* 0x6C-0x6F */
+	0xB0, 0xDB, 0x81, 0xE1, 0x81, 0xE2, 0xBC, 0xD1, /* 0x70-0x73 */
+	0xD9, 0xA6, 0x81, 0xE3, 0xD9, 0xA5, 0x81, 0xE4, /* 0x74-0x77 */
+	0x81, 0xE5, 0x81, 0xE6, 0x81, 0xE7, 0xD9, 0xAC, /* 0x78-0x7B */
+	0xD9, 0xAE, 0x81, 0xE8, 0xD9, 0xAB, 0xCA, 0xB9, /* 0x7C-0x7F */
+	
+	0x81, 0xE9, 0x81, 0xEA, 0x81, 0xEB, 0xD9, 0xA9, /* 0x80-0x83 */
+	0xD6, 0xB6, 0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, /* 0x84-0x87 */
+	0xB3, 0xDE, 0xD9, 0xA8, 0x81, 0xEF, 0xC0, 0xFD, /* 0x88-0x8B */
+	0x81, 0xF0, 0xCA, 0xCC, 0x81, 0xF1, 0xD9, 0xAA, /* 0x8C-0x8F */
+	0x81, 0xF2, 0xD9, 0xA7, 0x81, 0xF3, 0x81, 0xF4, /* 0x90-0x93 */
+	0xD9, 0xB0, 0x81, 0xF5, 0x81, 0xF6, 0xB6, 0xB1, /* 0x94-0x97 */
+	0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0xB9, 0xA9, /* 0x98-0x9B */
+	0x81, 0xFA, 0xD2, 0xC0, 0x81, 0xFB, 0x81, 0xFC, /* 0x9C-0x9F */
+	0xCF, 0xC0, 0x81, 0xFD, 0x81, 0xFE, 0xC2, 0xC2, /* 0xA0-0xA3 */
+	0x82, 0x40, 0xBD, 0xC4, 0xD5, 0xEC, 0xB2, 0xE0, /* 0xA4-0xA7 */
+	0xC7, 0xC8, 0xBF, 0xEB, 0xD9, 0xAD, 0x82, 0x41, /* 0xA8-0xAB */
+	0xD9, 0xAF, 0x82, 0x42, 0xCE, 0xEA, 0xBA, 0xEE, /* 0xAC-0xAF */
+	0x82, 0x43, 0x82, 0x44, 0x82, 0x45, 0x82, 0x46, /* 0xB0-0xB3 */
+	0x82, 0x47, 0xC7, 0xD6, 0x82, 0x48, 0x82, 0x49, /* 0xB4-0xB7 */
+	0x82, 0x4A, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0xB8-0xBB */
+	0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0xB1, 0xE3, /* 0xBC-0xBF */
+	0x82, 0x51, 0x82, 0x52, 0x82, 0x53, 0xB4, 0xD9, /* 0xC0-0xC3 */
+	0xB6, 0xED, 0xD9, 0xB4, 0x82, 0x54, 0x82, 0x55, /* 0xC4-0xC7 */
+	0x82, 0x56, 0x82, 0x57, 0xBF, 0xA1, 0x82, 0x58, /* 0xC8-0xCB */
+	0x82, 0x59, 0x82, 0x5A, 0xD9, 0xDE, 0xC7, 0xCE, /* 0xCC-0xCF */
+	0xC0, 0xFE, 0xD9, 0xB8, 0x82, 0x5B, 0x82, 0x5C, /* 0xD0-0xD3 */
+	0x82, 0x5D, 0x82, 0x5E, 0x82, 0x5F, 0xCB, 0xD7, /* 0xD4-0xD7 */
+	0xB7, 0xFD, 0x82, 0x60, 0xD9, 0xB5, 0x82, 0x61, /* 0xD8-0xDB */
+	0xD9, 0xB7, 0xB1, 0xA3, 0xD3, 0xE1, 0xD9, 0xB9, /* 0xDC-0xDF */
+	0x82, 0x62, 0xD0, 0xC5, 0x82, 0x63, 0xD9, 0xB6, /* 0xE0-0xE3 */
+	0x82, 0x64, 0x82, 0x65, 0xD9, 0xB1, 0x82, 0x66, /* 0xE4-0xE7 */
+	0xD9, 0xB2, 0xC1, 0xA9, 0xD9, 0xB3, 0x82, 0x67, /* 0xE8-0xEB */
+	0x82, 0x68, 0xBC, 0xF3, 0xD0, 0xDE, 0xB8, 0xA9, /* 0xEC-0xEF */
+	0x82, 0x69, 0xBE, 0xE3, 0x82, 0x6A, 0xD9, 0xBD, /* 0xF0-0xF3 */
+	0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0xF4-0xF7 */
+	0xD9, 0xBA, 0x82, 0x6F, 0xB0, 0xB3, 0x82, 0x70, /* 0xF8-0xFB */
+	0x82, 0x71, 0x82, 0x72, 0xD9, 0xC2, 0x82, 0x73, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_50[512] = {
+	0x82, 0x74, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x00-0x03 */
+	0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, 0x82, 0x7B, /* 0x04-0x07 */
+	0x82, 0x7C, 0x82, 0x7D, 0x82, 0x7E, 0x82, 0x80, /* 0x08-0x0B */
+	0xD9, 0xC4, 0xB1, 0xB6, 0x82, 0x81, 0xD9, 0xBF, /* 0x0C-0x0F */
+	0x82, 0x82, 0x82, 0x83, 0xB5, 0xB9, 0x82, 0x84, /* 0x10-0x13 */
+	0xBE, 0xF3, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x14-0x17 */
+	0xCC, 0xC8, 0xBA, 0xF2, 0xD2, 0xD0, 0x82, 0x88, /* 0x18-0x1B */
+	0xD9, 0xC3, 0x82, 0x89, 0x82, 0x8A, 0xBD, 0xE8, /* 0x1C-0x1F */
+	0x82, 0x8B, 0xB3, 0xAB, 0x82, 0x8C, 0x82, 0x8D, /* 0x20-0x23 */
+	0x82, 0x8E, 0xD9, 0xC5, 0xBE, 0xEB, 0x82, 0x8F, /* 0x24-0x27 */
+	0xD9, 0xC6, 0xD9, 0xBB, 0xC4, 0xDF, 0x82, 0x90, /* 0x28-0x2B */
+	0xD9, 0xBE, 0xD9, 0xC1, 0xD9, 0xC0, 0x82, 0x91, /* 0x2C-0x2F */
+	0x82, 0x92, 0x82, 0x93, 0x82, 0x94, 0x82, 0x95, /* 0x30-0x33 */
+	0x82, 0x96, 0x82, 0x97, 0x82, 0x98, 0x82, 0x99, /* 0x34-0x37 */
+	0x82, 0x9A, 0x82, 0x9B, 0xD5, 0xAE, 0x82, 0x9C, /* 0x38-0x3B */
+	0xD6, 0xB5, 0x82, 0x9D, 0xC7, 0xE3, 0x82, 0x9E, /* 0x3C-0x3F */
+	0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, 0xD9, 0xC8, /* 0x40-0x43 */
+	0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0xBC, 0xD9, /* 0x44-0x47 */
+	0xD9, 0xCA, 0x82, 0xA5, 0x82, 0xA6, 0x82, 0xA7, /* 0x48-0x4B */
+	0xD9, 0xBC, 0x82, 0xA8, 0xD9, 0xCB, 0xC6, 0xAB, /* 0x4C-0x4F */
+	0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x50-0x53 */
+	0x82, 0xAD, 0xD9, 0xC9, 0x82, 0xAE, 0x82, 0xAF, /* 0x54-0x57 */
+	0x82, 0xB0, 0x82, 0xB1, 0xD7, 0xF6, 0x82, 0xB2, /* 0x58-0x5B */
+	0xCD, 0xA3, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x5C-0x5F */
+	0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x60-0x63 */
+	0x82, 0xBA, 0xBD, 0xA1, 0x82, 0xBB, 0x82, 0xBC, /* 0x64-0x67 */
+	0x82, 0xBD, 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, /* 0x68-0x6B */
+	0xD9, 0xCC, 0x82, 0xC1, 0x82, 0xC2, 0x82, 0xC3, /* 0x6C-0x6F */
+	0x82, 0xC4, 0x82, 0xC5, 0x82, 0xC6, 0x82, 0xC7, /* 0x70-0x73 */
+	0x82, 0xC8, 0x82, 0xC9, 0xC5, 0xBC, 0xCD, 0xB5, /* 0x74-0x77 */
+	0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0xD9, 0xCD, /* 0x78-0x7B */
+	0x82, 0xCD, 0x82, 0xCE, 0xD9, 0xC7, 0xB3, 0xA5, /* 0x7C-0x7F */
+	
+	0xBF, 0xFE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x80-0x83 */
+	0x82, 0xD2, 0xB8, 0xB5, 0x82, 0xD3, 0x82, 0xD4, /* 0x84-0x87 */
+	0xC0, 0xFC, 0x82, 0xD5, 0x82, 0xD6, 0x82, 0xD7, /* 0x88-0x8B */
+	0x82, 0xD8, 0xB0, 0xF8, 0x82, 0xD9, 0x82, 0xDA, /* 0x8C-0x8F */
+	0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, 0x82, 0xDE, /* 0x90-0x93 */
+	0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, 0x82, 0xE2, /* 0x94-0x97 */
+	0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, 0x82, 0xE6, /* 0x98-0x9B */
+	0x82, 0xE7, 0x82, 0xE8, 0x82, 0xE9, 0x82, 0xEA, /* 0x9C-0x9F */
+	0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, 0xB4, 0xF6, /* 0xA0-0xA3 */
+	0x82, 0xEE, 0xD9, 0xCE, 0x82, 0xEF, 0xD9, 0xCF, /* 0xA4-0xA7 */
+	0xB4, 0xA2, 0xD9, 0xD0, 0x82, 0xF0, 0x82, 0xF1, /* 0xA8-0xAB */
+	0xB4, 0xDF, 0x82, 0xF2, 0x82, 0xF3, 0x82, 0xF4, /* 0xAC-0xAF */
+	0x82, 0xF5, 0x82, 0xF6, 0xB0, 0xC1, 0x82, 0xF7, /* 0xB0-0xB3 */
+	0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, 0x82, 0xFB, /* 0xB4-0xB7 */
+	0x82, 0xFC, 0x82, 0xFD, 0xD9, 0xD1, 0xC9, 0xB5, /* 0xB8-0xBB */
+	0x82, 0xFE, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xBC-0xBF */
+	0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xC0-0xC3 */
+	0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xC4-0xC7 */
+	0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xC8-0xCB */
+	0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0xCF, 0xF1, /* 0xCC-0xCF */
+	0x83, 0x52, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0xD0-0xD3 */
+	0x83, 0x56, 0x83, 0x57, 0xD9, 0xD2, 0x83, 0x58, /* 0xD4-0xD7 */
+	0x83, 0x59, 0x83, 0x5A, 0xC1, 0xC5, 0x83, 0x5B, /* 0xD8-0xDB */
+	0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, 0x83, 0x5F, /* 0xDC-0xDF */
+	0x83, 0x60, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0xE0-0xE3 */
+	0x83, 0x64, 0x83, 0x65, 0xD9, 0xD6, 0xC9, 0xAE, /* 0xE4-0xE7 */
+	0x83, 0x66, 0x83, 0x67, 0x83, 0x68, 0x83, 0x69, /* 0xE8-0xEB */
+	0xD9, 0xD5, 0xD9, 0xD4, 0xD9, 0xD7, 0x83, 0x6A, /* 0xEC-0xEF */
+	0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0xCB, 0xDB, /* 0xF0-0xF3 */
+	0x83, 0x6E, 0xBD, 0xA9, 0x83, 0x6F, 0x83, 0x70, /* 0xF4-0xF7 */
+	0x83, 0x71, 0x83, 0x72, 0x83, 0x73, 0xC6, 0xA7, /* 0xF8-0xFB */
+	0x83, 0x74, 0x83, 0x75, 0x83, 0x76, 0x83, 0x77, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_51[512] = {
+	0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, 0x83, 0x7B, /* 0x00-0x03 */
+	0x83, 0x7C, 0x83, 0x7D, 0xD9, 0xD3, 0xD9, 0xD8, /* 0x04-0x07 */
+	0x83, 0x7E, 0x83, 0x80, 0x83, 0x81, 0xD9, 0xD9, /* 0x08-0x0B */
+	0x83, 0x82, 0x83, 0x83, 0x83, 0x84, 0x83, 0x85, /* 0x0C-0x0F */
+	0x83, 0x86, 0x83, 0x87, 0xC8, 0xE5, 0x83, 0x88, /* 0x10-0x13 */
+	0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, 0x83, 0x8C, /* 0x14-0x17 */
+	0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, 0x83, 0x90, /* 0x18-0x1B */
+	0x83, 0x91, 0x83, 0x92, 0x83, 0x93, 0x83, 0x94, /* 0x1C-0x1F */
+	0x83, 0x95, 0xC0, 0xDC, 0x83, 0x96, 0x83, 0x97, /* 0x20-0x23 */
+	0x83, 0x98, 0x83, 0x99, 0x83, 0x9A, 0x83, 0x9B, /* 0x24-0x27 */
+	0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, 0x83, 0x9F, /* 0x28-0x2B */
+	0x83, 0xA0, 0x83, 0xA1, 0x83, 0xA2, 0x83, 0xA3, /* 0x2C-0x2F */
+	0x83, 0xA4, 0x83, 0xA5, 0x83, 0xA6, 0x83, 0xA7, /* 0x30-0x33 */
+	0x83, 0xA8, 0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, /* 0x34-0x37 */
+	0x83, 0xAC, 0x83, 0xAD, 0x83, 0xAE, 0x83, 0xAF, /* 0x38-0x3B */
+	0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, 0xB6, 0xF9, /* 0x3C-0x3F */
+	0xD8, 0xA3, 0xD4, 0xCA, 0x83, 0xB3, 0xD4, 0xAA, /* 0x40-0x43 */
+	0xD0, 0xD6, 0xB3, 0xE4, 0xD5, 0xD7, 0x83, 0xB4, /* 0x44-0x47 */
+	0xCF, 0xC8, 0xB9, 0xE2, 0x83, 0xB5, 0xBF, 0xCB, /* 0x48-0x4B */
+	0x83, 0xB6, 0xC3, 0xE2, 0x83, 0xB7, 0x83, 0xB8, /* 0x4C-0x4F */
+	0x83, 0xB9, 0xB6, 0xD2, 0x83, 0xBA, 0x83, 0xBB, /* 0x50-0x53 */
+	0xCD, 0xC3, 0xD9, 0xEE, 0xD9, 0xF0, 0x83, 0xBC, /* 0x54-0x57 */
+	0x83, 0xBD, 0x83, 0xBE, 0xB5, 0xB3, 0x83, 0xBF, /* 0x58-0x5B */
+	0xB6, 0xB5, 0x83, 0xC0, 0x83, 0xC1, 0x83, 0xC2, /* 0x5C-0x5F */
+	0x83, 0xC3, 0x83, 0xC4, 0xBE, 0xA4, 0x83, 0xC5, /* 0x60-0x63 */
+	0x83, 0xC6, 0xC8, 0xEB, 0x83, 0xC7, 0x83, 0xC8, /* 0x64-0x67 */
+	0xC8, 0xAB, 0x83, 0xC9, 0x83, 0xCA, 0xB0, 0xCB, /* 0x68-0x6B */
+	0xB9, 0xAB, 0xC1, 0xF9, 0xD9, 0xE2, 0x83, 0xCB, /* 0x6C-0x6F */
+	0xC0, 0xBC, 0xB9, 0xB2, 0x83, 0xCC, 0xB9, 0xD8, /* 0x70-0x73 */
+	0xD0, 0xCB, 0xB1, 0xF8, 0xC6, 0xE4, 0xBE, 0xDF, /* 0x74-0x77 */
+	0xB5, 0xE4, 0xD7, 0xC8, 0x83, 0xCD, 0xD1, 0xF8, /* 0x78-0x7B */
+	0xBC, 0xE6, 0xCA, 0xDE, 0x83, 0xCE, 0x83, 0xCF, /* 0x7C-0x7F */
+	
+	0xBC, 0xBD, 0xD9, 0xE6, 0xD8, 0xE7, 0x83, 0xD0, /* 0x80-0x83 */
+	0x83, 0xD1, 0xC4, 0xDA, 0x83, 0xD2, 0x83, 0xD3, /* 0x84-0x87 */
+	0xB8, 0xD4, 0xC8, 0xBD, 0x83, 0xD4, 0x83, 0xD5, /* 0x88-0x8B */
+	0xB2, 0xE1, 0xD4, 0xD9, 0x83, 0xD6, 0x83, 0xD7, /* 0x8C-0x8F */
+	0x83, 0xD8, 0x83, 0xD9, 0xC3, 0xB0, 0x83, 0xDA, /* 0x90-0x93 */
+	0x83, 0xDB, 0xC3, 0xE1, 0xDA, 0xA2, 0xC8, 0xDF, /* 0x94-0x97 */
+	0x83, 0xDC, 0xD0, 0xB4, 0x83, 0xDD, 0xBE, 0xFC, /* 0x98-0x9B */
+	0xC5, 0xA9, 0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, /* 0x9C-0x9F */
+	0xB9, 0xDA, 0x83, 0xE1, 0xDA, 0xA3, 0x83, 0xE2, /* 0xA0-0xA3 */
+	0xD4, 0xA9, 0xDA, 0xA4, 0x83, 0xE3, 0x83, 0xE4, /* 0xA4-0xA7 */
+	0x83, 0xE5, 0x83, 0xE6, 0x83, 0xE7, 0xD9, 0xFB, /* 0xA8-0xAB */
+	0xB6, 0xAC, 0x83, 0xE8, 0x83, 0xE9, 0xB7, 0xEB, /* 0xAC-0xAF */
+	0xB1, 0xF9, 0xD9, 0xFC, 0xB3, 0xE5, 0xBE, 0xF6, /* 0xB0-0xB3 */
+	0x83, 0xEA, 0xBF, 0xF6, 0xD2, 0xB1, 0xC0, 0xE4, /* 0xB4-0xB7 */
+	0x83, 0xEB, 0x83, 0xEC, 0x83, 0xED, 0xB6, 0xB3, /* 0xB8-0xBB */
+	0xD9, 0xFE, 0xD9, 0xFD, 0x83, 0xEE, 0x83, 0xEF, /* 0xBC-0xBF */
+	0xBE, 0xBB, 0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, /* 0xC0-0xC3 */
+	0xC6, 0xE0, 0x83, 0xF3, 0xD7, 0xBC, 0xDA, 0xA1, /* 0xC4-0xC7 */
+	0x83, 0xF4, 0xC1, 0xB9, 0x83, 0xF5, 0xB5, 0xF2, /* 0xC8-0xCB */
+	0xC1, 0xE8, 0x83, 0xF6, 0x83, 0xF7, 0xBC, 0xF5, /* 0xCC-0xCF */
+	0x83, 0xF8, 0xB4, 0xD5, 0x83, 0xF9, 0x83, 0xFA, /* 0xD0-0xD3 */
+	0x83, 0xFB, 0x83, 0xFC, 0x83, 0xFD, 0x83, 0xFE, /* 0xD4-0xD7 */
+	0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0xC1, 0xDD, /* 0xD8-0xDB */
+	0x84, 0x43, 0xC4, 0xFD, 0x84, 0x44, 0x84, 0x45, /* 0xDC-0xDF */
+	0xBC, 0xB8, 0xB7, 0xB2, 0x84, 0x46, 0x84, 0x47, /* 0xE0-0xE3 */
+	0xB7, 0xEF, 0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, /* 0xE4-0xE7 */
+	0x84, 0x4B, 0x84, 0x4C, 0x84, 0x4D, 0xD9, 0xEC, /* 0xE8-0xEB */
+	0x84, 0x4E, 0xC6, 0xBE, 0x84, 0x4F, 0xBF, 0xAD, /* 0xEC-0xEF */
+	0xBB, 0xCB, 0x84, 0x50, 0x84, 0x51, 0xB5, 0xCA, /* 0xF0-0xF3 */
+	0x84, 0x52, 0xDB, 0xC9, 0xD0, 0xD7, 0x84, 0x53, /* 0xF4-0xF7 */
+	0xCD, 0xB9, 0xB0, 0xBC, 0xB3, 0xF6, 0xBB, 0xF7, /* 0xF8-0xFB */
+	0xDB, 0xCA, 0xBA, 0xAF, 0x84, 0x54, 0xD4, 0xE4, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_52[512] = {
+	0xB5, 0xB6, 0xB5, 0xF3, 0xD8, 0xD6, 0xC8, 0xD0, /* 0x00-0x03 */
+	0x84, 0x55, 0x84, 0x56, 0xB7, 0xD6, 0xC7, 0xD0, /* 0x04-0x07 */
+	0xD8, 0xD7, 0x84, 0x57, 0xBF, 0xAF, 0x84, 0x58, /* 0x08-0x0B */
+	0x84, 0x59, 0xDB, 0xBB, 0xD8, 0xD8, 0x84, 0x5A, /* 0x0C-0x0F */
+	0x84, 0x5B, 0xD0, 0xCC, 0xBB, 0xAE, 0x84, 0x5C, /* 0x10-0x13 */
+	0x84, 0x5D, 0x84, 0x5E, 0xEB, 0xBE, 0xC1, 0xD0, /* 0x14-0x17 */
+	0xC1, 0xF5, 0xD4, 0xF2, 0xB8, 0xD5, 0xB4, 0xB4, /* 0x18-0x1B */
+	0x84, 0x5F, 0xB3, 0xF5, 0x84, 0x60, 0x84, 0x61, /* 0x1C-0x1F */
+	0xC9, 0xBE, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x20-0x23 */
+	0xC5, 0xD0, 0x84, 0x65, 0x84, 0x66, 0x84, 0x67, /* 0x24-0x27 */
+	0xC5, 0xD9, 0xC0, 0xFB, 0x84, 0x68, 0xB1, 0xF0, /* 0x28-0x2B */
+	0x84, 0x69, 0xD8, 0xD9, 0xB9, 0xCE, 0x84, 0x6A, /* 0x2C-0x2F */
+	0xB5, 0xBD, 0x84, 0x6B, 0x84, 0x6C, 0xD8, 0xDA, /* 0x30-0x33 */
+	0x84, 0x6D, 0x84, 0x6E, 0xD6, 0xC6, 0xCB, 0xA2, /* 0x34-0x37 */
+	0xC8, 0xAF, 0xC9, 0xB2, 0xB4, 0xCC, 0xBF, 0xCC, /* 0x38-0x3B */
+	0x84, 0x6F, 0xB9, 0xF4, 0x84, 0x70, 0xD8, 0xDB, /* 0x3C-0x3F */
+	0xD8, 0xDC, 0xB6, 0xE7, 0xBC, 0xC1, 0xCC, 0xEA, /* 0x40-0x43 */
+	0x84, 0x71, 0x84, 0x72, 0x84, 0x73, 0x84, 0x74, /* 0x44-0x47 */
+	0x84, 0x75, 0x84, 0x76, 0xCF, 0xF7, 0x84, 0x77, /* 0x48-0x4B */
+	0xD8, 0xDD, 0xC7, 0xB0, 0x84, 0x78, 0x84, 0x79, /* 0x4C-0x4F */
+	0xB9, 0xD0, 0xBD, 0xA3, 0x84, 0x7A, 0x84, 0x7B, /* 0x50-0x53 */
+	0xCC, 0xDE, 0x84, 0x7C, 0xC6, 0xCA, 0x84, 0x7D, /* 0x54-0x57 */
+	0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, 0x84, 0x82, /* 0x58-0x5B */
+	0xD8, 0xE0, 0x84, 0x83, 0xD8, 0xDE, 0x84, 0x84, /* 0x5C-0x5F */
+	0x84, 0x85, 0xD8, 0xDF, 0x84, 0x86, 0x84, 0x87, /* 0x60-0x63 */
+	0x84, 0x88, 0xB0, 0xFE, 0x84, 0x89, 0xBE, 0xE7, /* 0x64-0x67 */
+	0x84, 0x8A, 0xCA, 0xA3, 0xBC, 0xF4, 0x84, 0x8B, /* 0x68-0x6B */
+	0x84, 0x8C, 0x84, 0x8D, 0x84, 0x8E, 0xB8, 0xB1, /* 0x6C-0x6F */
+	0x84, 0x8F, 0x84, 0x90, 0xB8, 0xEE, 0x84, 0x91, /* 0x70-0x73 */
+	0x84, 0x92, 0x84, 0x93, 0x84, 0x94, 0x84, 0x95, /* 0x74-0x77 */
+	0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x78-0x7B */
+	0x84, 0x9A, 0xD8, 0xE2, 0x84, 0x9B, 0xBD, 0xCB, /* 0x7C-0x7F */
+	
+	0x84, 0x9C, 0xD8, 0xE4, 0xD8, 0xE3, 0x84, 0x9D, /* 0x80-0x83 */
+	0x84, 0x9E, 0x84, 0x9F, 0x84, 0xA0, 0x84, 0xA1, /* 0x84-0x87 */
+	0xC5, 0xFC, 0x84, 0xA2, 0x84, 0xA3, 0x84, 0xA4, /* 0x88-0x8B */
+	0x84, 0xA5, 0x84, 0xA6, 0x84, 0xA7, 0x84, 0xA8, /* 0x8C-0x8F */
+	0xD8, 0xE5, 0x84, 0xA9, 0x84, 0xAA, 0xD8, 0xE6, /* 0x90-0x93 */
+	0x84, 0xAB, 0x84, 0xAC, 0x84, 0xAD, 0x84, 0xAE, /* 0x94-0x97 */
+	0x84, 0xAF, 0x84, 0xB0, 0x84, 0xB1, 0xC1, 0xA6, /* 0x98-0x9B */
+	0x84, 0xB2, 0xC8, 0xB0, 0xB0, 0xEC, 0xB9, 0xA6, /* 0x9C-0x9F */
+	0xBC, 0xD3, 0xCE, 0xF1, 0xDB, 0xBD, 0xC1, 0xD3, /* 0xA0-0xA3 */
+	0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0xA4-0xA7 */
+	0xB6, 0xAF, 0xD6, 0xFA, 0xC5, 0xAC, 0xBD, 0xD9, /* 0xA8-0xAB */
+	0xDB, 0xBE, 0xDB, 0xBF, 0x84, 0xB7, 0x84, 0xB8, /* 0xAC-0xAF */
+	0x84, 0xB9, 0xC0, 0xF8, 0xBE, 0xA2, 0xC0, 0xCD, /* 0xB0-0xB3 */
+	0x84, 0xBA, 0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, /* 0xB4-0xB7 */
+	0x84, 0xBE, 0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, /* 0xB8-0xBB */
+	0x84, 0xC2, 0x84, 0xC3, 0xDB, 0xC0, 0xCA, 0xC6, /* 0xBC-0xBF */
+	0x84, 0xC4, 0x84, 0xC5, 0x84, 0xC6, 0xB2, 0xAA, /* 0xC0-0xC3 */
+	0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, 0xD3, 0xC2, /* 0xC4-0xC7 */
+	0x84, 0xCA, 0xC3, 0xE3, 0x84, 0xCB, 0xD1, 0xAB, /* 0xC8-0xCB */
+	0x84, 0xCC, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0xCC-0xCF */
+	0xDB, 0xC2, 0x84, 0xD0, 0xC0, 0xD5, 0x84, 0xD1, /* 0xD0-0xD3 */
+	0x84, 0xD2, 0x84, 0xD3, 0xDB, 0xC3, 0x84, 0xD4, /* 0xD4-0xD7 */
+	0xBF, 0xB1, 0x84, 0xD5, 0x84, 0xD6, 0x84, 0xD7, /* 0xD8-0xDB */
+	0x84, 0xD8, 0x84, 0xD9, 0x84, 0xDA, 0xC4, 0xBC, /* 0xDC-0xDF */
+	0x84, 0xDB, 0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, /* 0xE0-0xE3 */
+	0xC7, 0xDA, 0x84, 0xDF, 0x84, 0xE0, 0x84, 0xE1, /* 0xE4-0xE7 */
+	0x84, 0xE2, 0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, /* 0xE8-0xEB */
+	0x84, 0xE6, 0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, /* 0xEC-0xEF */
+	0xDB, 0xC4, 0x84, 0xEA, 0x84, 0xEB, 0x84, 0xEC, /* 0xF0-0xF3 */
+	0x84, 0xED, 0x84, 0xEE, 0x84, 0xEF, 0x84, 0xF0, /* 0xF4-0xF7 */
+	0x84, 0xF1, 0xD9, 0xE8, 0xC9, 0xD7, 0x84, 0xF2, /* 0xF8-0xFB */
+	0x84, 0xF3, 0x84, 0xF4, 0xB9, 0xB4, 0xCE, 0xF0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_53[512] = {
+	0xD4, 0xC8, 0x84, 0xF5, 0x84, 0xF6, 0x84, 0xF7, /* 0x00-0x03 */
+	0x84, 0xF8, 0xB0, 0xFC, 0xB4, 0xD2, 0x84, 0xF9, /* 0x04-0x07 */
+	0xD0, 0xD9, 0x84, 0xFA, 0x84, 0xFB, 0x84, 0xFC, /* 0x08-0x0B */
+	0x84, 0xFD, 0xD9, 0xE9, 0x84, 0xFE, 0xDE, 0xCB, /* 0x0C-0x0F */
+	0xD9, 0xEB, 0x85, 0x40, 0x85, 0x41, 0x85, 0x42, /* 0x10-0x13 */
+	0x85, 0x43, 0xD8, 0xB0, 0xBB, 0xAF, 0xB1, 0xB1, /* 0x14-0x17 */
+	0x85, 0x44, 0xB3, 0xD7, 0xD8, 0xCE, 0x85, 0x45, /* 0x18-0x1B */
+	0x85, 0x46, 0xD4, 0xD1, 0x85, 0x47, 0x85, 0x48, /* 0x1C-0x1F */
+	0xBD, 0xB3, 0xBF, 0xEF, 0x85, 0x49, 0xCF, 0xBB, /* 0x20-0x23 */
+	0x85, 0x4A, 0x85, 0x4B, 0xD8, 0xD0, 0x85, 0x4C, /* 0x24-0x27 */
+	0x85, 0x4D, 0x85, 0x4E, 0xB7, 0xCB, 0x85, 0x4F, /* 0x28-0x2B */
+	0x85, 0x50, 0x85, 0x51, 0xD8, 0xD1, 0x85, 0x52, /* 0x2C-0x2F */
+	0x85, 0x53, 0x85, 0x54, 0x85, 0x55, 0x85, 0x56, /* 0x30-0x33 */
+	0x85, 0x57, 0x85, 0x58, 0x85, 0x59, 0x85, 0x5A, /* 0x34-0x37 */
+	0x85, 0x5B, 0xC6, 0xA5, 0xC7, 0xF8, 0xD2, 0xBD, /* 0x38-0x3B */
+	0x85, 0x5C, 0x85, 0x5D, 0xD8, 0xD2, 0xC4, 0xE4, /* 0x3C-0x3F */
+	0x85, 0x5E, 0xCA, 0xAE, 0x85, 0x5F, 0xC7, 0xA7, /* 0x40-0x43 */
+	0x85, 0x60, 0xD8, 0xA6, 0x85, 0x61, 0xC9, 0xFD, /* 0x44-0x47 */
+	0xCE, 0xE7, 0xBB, 0xDC, 0xB0, 0xEB, 0x85, 0x62, /* 0x48-0x4B */
+	0x85, 0x63, 0x85, 0x64, 0xBB, 0xAA, 0xD0, 0xAD, /* 0x4C-0x4F */
+	0x85, 0x65, 0xB1, 0xB0, 0xD7, 0xE4, 0xD7, 0xBF, /* 0x50-0x53 */
+	0x85, 0x66, 0xB5, 0xA5, 0xC2, 0xF4, 0xC4, 0xCF, /* 0x54-0x57 */
+	0x85, 0x67, 0x85, 0x68, 0xB2, 0xA9, 0x85, 0x69, /* 0x58-0x5B */
+	0xB2, 0xB7, 0x85, 0x6A, 0xB1, 0xE5, 0xDF, 0xB2, /* 0x5C-0x5F */
+	0xD5, 0xBC, 0xBF, 0xA8, 0xC2, 0xAC, 0xD8, 0xD5, /* 0x60-0x63 */
+	0xC2, 0xB1, 0x85, 0x6B, 0xD8, 0xD4, 0xCE, 0xD4, /* 0x64-0x67 */
+	0x85, 0x6C, 0xDA, 0xE0, 0x85, 0x6D, 0xCE, 0xC0, /* 0x68-0x6B */
+	0x85, 0x6E, 0x85, 0x6F, 0xD8, 0xB4, 0xC3, 0xAE, /* 0x6C-0x6F */
+	0xD3, 0xA1, 0xCE, 0xA3, 0x85, 0x70, 0xBC, 0xB4, /* 0x70-0x73 */
+	0xC8, 0xB4, 0xC2, 0xD1, 0x85, 0x71, 0xBE, 0xED, /* 0x74-0x77 */
+	0xD0, 0xB6, 0x85, 0x72, 0xDA, 0xE1, 0x85, 0x73, /* 0x78-0x7B */
+	0x85, 0x74, 0x85, 0x75, 0x85, 0x76, 0xC7, 0xE4, /* 0x7C-0x7F */
+	
+	0x85, 0x77, 0x85, 0x78, 0xB3, 0xA7, 0x85, 0x79, /* 0x80-0x83 */
+	0xB6, 0xF2, 0xCC, 0xFC, 0xC0, 0xFA, 0x85, 0x7A, /* 0x84-0x87 */
+	0x85, 0x7B, 0xC0, 0xF7, 0x85, 0x7C, 0xD1, 0xB9, /* 0x88-0x8B */
+	0xD1, 0xE1, 0xD8, 0xC7, 0x85, 0x7D, 0x85, 0x7E, /* 0x8C-0x8F */
+	0x85, 0x80, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x90-0x93 */
+	0x85, 0x84, 0xB2, 0xDE, 0x85, 0x85, 0x85, 0x86, /* 0x94-0x97 */
+	0xC0, 0xE5, 0x85, 0x87, 0xBA, 0xF1, 0x85, 0x88, /* 0x98-0x9B */
+	0x85, 0x89, 0xD8, 0xC8, 0x85, 0x8A, 0xD4, 0xAD, /* 0x9C-0x9F */
+	0x85, 0x8B, 0x85, 0x8C, 0xCF, 0xE1, 0xD8, 0xC9, /* 0xA0-0xA3 */
+	0x85, 0x8D, 0xD8, 0xCA, 0xCF, 0xC3, 0x85, 0x8E, /* 0xA4-0xA7 */
+	0xB3, 0xF8, 0xBE, 0xC7, 0x85, 0x8F, 0x85, 0x90, /* 0xA8-0xAB */
+	0x85, 0x91, 0x85, 0x92, 0xD8, 0xCB, 0x85, 0x93, /* 0xAC-0xAF */
+	0x85, 0x94, 0x85, 0x95, 0x85, 0x96, 0x85, 0x97, /* 0xB0-0xB3 */
+	0x85, 0x98, 0x85, 0x99, 0xDB, 0xCC, 0x85, 0x9A, /* 0xB4-0xB7 */
+	0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0xC8, 0xA5, /* 0xB8-0xBB */
+	0x85, 0x9E, 0x85, 0x9F, 0x85, 0xA0, 0xCF, 0xD8, /* 0xBC-0xBF */
+	0x85, 0xA1, 0xC8, 0xFE, 0xB2, 0xCE, 0x85, 0xA2, /* 0xC0-0xC3 */
+	0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, 0x85, 0xA6, /* 0xC4-0xC7 */
+	0xD3, 0xD6, 0xB2, 0xE6, 0xBC, 0xB0, 0xD3, 0xD1, /* 0xC8-0xCB */
+	0xCB, 0xAB, 0xB7, 0xB4, 0x85, 0xA7, 0x85, 0xA8, /* 0xCC-0xCF */
+	0x85, 0xA9, 0xB7, 0xA2, 0x85, 0xAA, 0x85, 0xAB, /* 0xD0-0xD3 */
+	0xCA, 0xE5, 0x85, 0xAC, 0xC8, 0xA1, 0xCA, 0xDC, /* 0xD4-0xD7 */
+	0xB1, 0xE4, 0xD0, 0xF0, 0x85, 0xAD, 0xC5, 0xD1, /* 0xD8-0xDB */
+	0x85, 0xAE, 0x85, 0xAF, 0x85, 0xB0, 0xDB, 0xC5, /* 0xDC-0xDF */
+	0xB5, 0xFE, 0x85, 0xB1, 0x85, 0xB2, 0xBF, 0xDA, /* 0xE0-0xE3 */
+	0xB9, 0xC5, 0xBE, 0xE4, 0xC1, 0xED, 0x85, 0xB3, /* 0xE4-0xE7 */
+	0xDF, 0xB6, 0xDF, 0xB5, 0xD6, 0xBB, 0xBD, 0xD0, /* 0xE8-0xEB */
+	0xD5, 0xD9, 0xB0, 0xC8, 0xB6, 0xA3, 0xBF, 0xC9, /* 0xEC-0xEF */
+	0xCC, 0xA8, 0xDF, 0xB3, 0xCA, 0xB7, 0xD3, 0xD2, /* 0xF0-0xF3 */
+	0x85, 0xB4, 0xD8, 0xCF, 0xD2, 0xB6, 0xBA, 0xC5, /* 0xF4-0xF7 */
+	0xCB, 0xBE, 0xCC, 0xBE, 0x85, 0xB5, 0xDF, 0xB7, /* 0xF8-0xFB */
+	0xB5, 0xF0, 0xDF, 0xB4, 0x85, 0xB6, 0x85, 0xB7, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_54[512] = {
+	0x85, 0xB8, 0xD3, 0xF5, 0x85, 0xB9, 0xB3, 0xD4, /* 0x00-0x03 */
+	0xB8, 0xF7, 0x85, 0xBA, 0xDF, 0xBA, 0x85, 0xBB, /* 0x04-0x07 */
+	0xBA, 0xCF, 0xBC, 0xAA, 0xB5, 0xF5, 0x85, 0xBC, /* 0x08-0x0B */
+	0xCD, 0xAC, 0xC3, 0xFB, 0xBA, 0xF3, 0xC0, 0xF4, /* 0x0C-0x0F */
+	0xCD, 0xC2, 0xCF, 0xF2, 0xDF, 0xB8, 0xCF, 0xC5, /* 0x10-0x13 */
+	0x85, 0xBD, 0xC2, 0xC0, 0xDF, 0xB9, 0xC2, 0xF0, /* 0x14-0x17 */
+	0x85, 0xBE, 0x85, 0xBF, 0x85, 0xC0, 0xBE, 0xFD, /* 0x18-0x1B */
+	0x85, 0xC1, 0xC1, 0xDF, 0xCD, 0xCC, 0xD2, 0xF7, /* 0x1C-0x1F */
+	0xB7, 0xCD, 0xDF, 0xC1, 0x85, 0xC2, 0xDF, 0xC4, /* 0x20-0x23 */
+	0x85, 0xC3, 0x85, 0xC4, 0xB7, 0xF1, 0xB0, 0xC9, /* 0x24-0x27 */
+	0xB6, 0xD6, 0xB7, 0xD4, 0x85, 0xC5, 0xBA, 0xAC, /* 0x28-0x2B */
+	0xCC, 0xFD, 0xBF, 0xD4, 0xCB, 0xB1, 0xC6, 0xF4, /* 0x2C-0x2F */
+	0x85, 0xC6, 0xD6, 0xA8, 0xDF, 0xC5, 0x85, 0xC7, /* 0x30-0x33 */
+	0xCE, 0xE2, 0xB3, 0xB3, 0x85, 0xC8, 0x85, 0xC9, /* 0x34-0x37 */
+	0xCE, 0xFC, 0xB4, 0xB5, 0x85, 0xCA, 0xCE, 0xC7, /* 0x38-0x3B */
+	0xBA, 0xF0, 0x85, 0xCB, 0xCE, 0xE1, 0x85, 0xCC, /* 0x3C-0x3F */
+	0xD1, 0xBD, 0x85, 0xCD, 0x85, 0xCE, 0xDF, 0xC0, /* 0x40-0x43 */
+	0x85, 0xCF, 0x85, 0xD0, 0xB4, 0xF4, 0x85, 0xD1, /* 0x44-0x47 */
+	0xB3, 0xCA, 0x85, 0xD2, 0xB8, 0xE6, 0xDF, 0xBB, /* 0x48-0x4B */
+	0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, 0x85, 0xD6, /* 0x4C-0x4F */
+	0xC4, 0xC5, 0x85, 0xD7, 0xDF, 0xBC, 0xDF, 0xBD, /* 0x50-0x53 */
+	0xDF, 0xBE, 0xC5, 0xBB, 0xDF, 0xBF, 0xDF, 0xC2, /* 0x54-0x57 */
+	0xD4, 0xB1, 0xDF, 0xC3, 0x85, 0xD8, 0xC7, 0xBA, /* 0x58-0x5B */
+	0xCE, 0xD8, 0x85, 0xD9, 0x85, 0xDA, 0x85, 0xDB, /* 0x5C-0x5F */
+	0x85, 0xDC, 0x85, 0xDD, 0xC4, 0xD8, 0x85, 0xDE, /* 0x60-0x63 */
+	0xDF, 0xCA, 0x85, 0xDF, 0xDF, 0xCF, 0x85, 0xE0, /* 0x64-0x67 */
+	0xD6, 0xDC, 0x85, 0xE1, 0x85, 0xE2, 0x85, 0xE3, /* 0x68-0x6B */
+	0x85, 0xE4, 0x85, 0xE5, 0x85, 0xE6, 0x85, 0xE7, /* 0x6C-0x6F */
+	0x85, 0xE8, 0xDF, 0xC9, 0xDF, 0xDA, 0xCE, 0xB6, /* 0x70-0x73 */
+	0x85, 0xE9, 0xBA, 0xC7, 0xDF, 0xCE, 0xDF, 0xC8, /* 0x74-0x77 */
+	0xC5, 0xDE, 0x85, 0xEA, 0x85, 0xEB, 0xC9, 0xEB, /* 0x78-0x7B */
+	0xBA, 0xF4, 0xC3, 0xFC, 0x85, 0xEC, 0x85, 0xED, /* 0x7C-0x7F */
+	
+	0xBE, 0xD7, 0x85, 0xEE, 0xDF, 0xC6, 0x85, 0xEF, /* 0x80-0x83 */
+	0xDF, 0xCD, 0x85, 0xF0, 0xC5, 0xD8, 0x85, 0xF1, /* 0x84-0x87 */
+	0x85, 0xF2, 0x85, 0xF3, 0x85, 0xF4, 0xD5, 0xA6, /* 0x88-0x8B */
+	0xBA, 0xCD, 0x85, 0xF5, 0xBE, 0xCC, 0xD3, 0xBD, /* 0x8C-0x8F */
+	0xB8, 0xC0, 0x85, 0xF6, 0xD6, 0xE4, 0x85, 0xF7, /* 0x90-0x93 */
+	0xDF, 0xC7, 0xB9, 0xBE, 0xBF, 0xA7, 0x85, 0xF8, /* 0x94-0x97 */
+	0x85, 0xF9, 0xC1, 0xFC, 0xDF, 0xCB, 0xDF, 0xCC, /* 0x98-0x9B */
+	0x85, 0xFA, 0xDF, 0xD0, 0x85, 0xFB, 0x85, 0xFC, /* 0x9C-0x9F */
+	0x85, 0xFD, 0x85, 0xFE, 0x86, 0x40, 0xDF, 0xDB, /* 0xA0-0xA3 */
+	0xDF, 0xE5, 0x86, 0x41, 0xDF, 0xD7, 0xDF, 0xD6, /* 0xA4-0xA7 */
+	0xD7, 0xC9, 0xDF, 0xE3, 0xDF, 0xE4, 0xE5, 0xEB, /* 0xA8-0xAB */
+	0xD2, 0xA7, 0xDF, 0xD2, 0x86, 0x42, 0xBF, 0xA9, /* 0xAC-0xAF */
+	0x86, 0x43, 0xD4, 0xDB, 0x86, 0x44, 0xBF, 0xC8, /* 0xB0-0xB3 */
+	0xDF, 0xD4, 0x86, 0x45, 0x86, 0x46, 0x86, 0x47, /* 0xB4-0xB7 */
+	0xCF, 0xCC, 0x86, 0x48, 0x86, 0x49, 0xDF, 0xDD, /* 0xB8-0xBB */
+	0x86, 0x4A, 0xD1, 0xCA, 0x86, 0x4B, 0xDF, 0xDE, /* 0xBC-0xBF */
+	0xB0, 0xA7, 0xC6, 0xB7, 0xDF, 0xD3, 0x86, 0x4C, /* 0xC0-0xC3 */
+	0xBA, 0xE5, 0x86, 0x4D, 0xB6, 0xDF, 0xCD, 0xDB, /* 0xC4-0xC7 */
+	0xB9, 0xFE, 0xD4, 0xD5, 0x86, 0x4E, 0x86, 0x4F, /* 0xC8-0xCB */
+	0xDF, 0xDF, 0xCF, 0xEC, 0xB0, 0xA5, 0xDF, 0xE7, /* 0xCC-0xCF */
+	0xDF, 0xD1, 0xD1, 0xC6, 0xDF, 0xD5, 0xDF, 0xD8, /* 0xD0-0xD3 */
+	0xDF, 0xD9, 0xDF, 0xDC, 0x86, 0x50, 0xBB, 0xA9, /* 0xD4-0xD7 */
+	0x86, 0x51, 0xDF, 0xE0, 0xDF, 0xE1, 0x86, 0x52, /* 0xD8-0xDB */
+	0xDF, 0xE2, 0xDF, 0xE6, 0xDF, 0xE8, 0xD3, 0xB4, /* 0xDC-0xDF */
+	0x86, 0x53, 0x86, 0x54, 0x86, 0x55, 0x86, 0x56, /* 0xE0-0xE3 */
+	0x86, 0x57, 0xB8, 0xE7, 0xC5, 0xB6, 0xDF, 0xEA, /* 0xE4-0xE7 */
+	0xC9, 0xDA, 0xC1, 0xA8, 0xC4, 0xC4, 0x86, 0x58, /* 0xE8-0xEB */
+	0x86, 0x59, 0xBF, 0xDE, 0xCF, 0xF8, 0x86, 0x5A, /* 0xEC-0xEF */
+	0x86, 0x5B, 0x86, 0x5C, 0xD5, 0xDC, 0xDF, 0xEE, /* 0xF0-0xF3 */
+	0x86, 0x5D, 0x86, 0x5E, 0x86, 0x5F, 0x86, 0x60, /* 0xF4-0xF7 */
+	0x86, 0x61, 0x86, 0x62, 0xB2, 0xB8, 0x86, 0x63, /* 0xF8-0xFB */
+	0xBA, 0xDF, 0xDF, 0xEC, 0x86, 0x64, 0xDB, 0xC1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_55[512] = {
+	0x86, 0x65, 0xD1, 0xE4, 0x86, 0x66, 0x86, 0x67, /* 0x00-0x03 */
+	0x86, 0x68, 0x86, 0x69, 0xCB, 0xF4, 0xB4, 0xBD, /* 0x04-0x07 */
+	0x86, 0x6A, 0xB0, 0xA6, 0x86, 0x6B, 0x86, 0x6C, /* 0x08-0x0B */
+	0x86, 0x6D, 0x86, 0x6E, 0x86, 0x6F, 0xDF, 0xF1, /* 0x0C-0x0F */
+	0xCC, 0xC6, 0xDF, 0xF2, 0x86, 0x70, 0x86, 0x71, /* 0x10-0x13 */
+	0xDF, 0xED, 0x86, 0x72, 0x86, 0x73, 0x86, 0x74, /* 0x14-0x17 */
+	0x86, 0x75, 0x86, 0x76, 0x86, 0x77, 0xDF, 0xE9, /* 0x18-0x1B */
+	0x86, 0x78, 0x86, 0x79, 0x86, 0x7A, 0x86, 0x7B, /* 0x1C-0x1F */
+	0xDF, 0xEB, 0x86, 0x7C, 0xDF, 0xEF, 0xDF, 0xF0, /* 0x20-0x23 */
+	0xBB, 0xBD, 0x86, 0x7D, 0x86, 0x7E, 0xDF, 0xF3, /* 0x24-0x27 */
+	0x86, 0x80, 0x86, 0x81, 0xDF, 0xF4, 0x86, 0x82, /* 0x28-0x2B */
+	0xBB, 0xA3, 0x86, 0x83, 0xCA, 0xDB, 0xCE, 0xA8, /* 0x2C-0x2F */
+	0xE0, 0xA7, 0xB3, 0xAA, 0x86, 0x84, 0xE0, 0xA6, /* 0x30-0x33 */
+	0x86, 0x85, 0x86, 0x86, 0x86, 0x87, 0xE0, 0xA1, /* 0x34-0x37 */
+	0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0x38-0x3B */
+	0xDF, 0xFE, 0x86, 0x8C, 0xCD, 0xD9, 0xDF, 0xFC, /* 0x3C-0x3F */
+	0x86, 0x8D, 0xDF, 0xFA, 0x86, 0x8E, 0xBF, 0xD0, /* 0x40-0x43 */
+	0xD7, 0xC4, 0x86, 0x8F, 0xC9, 0xCC, 0x86, 0x90, /* 0x44-0x47 */
+	0x86, 0x91, 0xDF, 0xF8, 0xB0, 0xA1, 0x86, 0x92, /* 0x48-0x4B */
+	0x86, 0x93, 0x86, 0x94, 0x86, 0x95, 0x86, 0x96, /* 0x4C-0x4F */
+	0xDF, 0xFD, 0x86, 0x97, 0x86, 0x98, 0x86, 0x99, /* 0x50-0x53 */
+	0x86, 0x9A, 0xDF, 0xFB, 0xE0, 0xA2, 0x86, 0x9B, /* 0x54-0x57 */
+	0x86, 0x9C, 0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, /* 0x58-0x5B */
+	0xE0, 0xA8, 0x86, 0xA0, 0x86, 0xA1, 0x86, 0xA2, /* 0x5C-0x5F */
+	0x86, 0xA3, 0xB7, 0xC8, 0x86, 0xA4, 0x86, 0xA5, /* 0x60-0x63 */
+	0xC6, 0xA1, 0xC9, 0xB6, 0xC0, 0xB2, 0xDF, 0xF5, /* 0x64-0x67 */
+	0x86, 0xA6, 0x86, 0xA7, 0xC5, 0xBE, 0x86, 0xA8, /* 0x68-0x6B */
+	0xD8, 0xC4, 0xDF, 0xF9, 0xC4, 0xF6, 0x86, 0xA9, /* 0x6C-0x6F */
+	0x86, 0xAA, 0x86, 0xAB, 0x86, 0xAC, 0x86, 0xAD, /* 0x70-0x73 */
+	0x86, 0xAE, 0xE0, 0xA3, 0xE0, 0xA4, 0xE0, 0xA5, /* 0x74-0x77 */
+	0xD0, 0xA5, 0x86, 0xAF, 0x86, 0xB0, 0xE0, 0xB4, /* 0x78-0x7B */
+	0xCC, 0xE4, 0x86, 0xB1, 0xE0, 0xB1, 0x86, 0xB2, /* 0x7C-0x7F */
+	
+	0xBF, 0xA6, 0xE0, 0xAF, 0xCE, 0xB9, 0xE0, 0xAB, /* 0x80-0x83 */
+	0xC9, 0xC6, 0x86, 0xB3, 0x86, 0xB4, 0xC0, 0xAE, /* 0x84-0x87 */
+	0xE0, 0xAE, 0xBA, 0xED, 0xBA, 0xB0, 0xE0, 0xA9, /* 0x88-0x8B */
+	0x86, 0xB5, 0x86, 0xB6, 0x86, 0xB7, 0xDF, 0xF6, /* 0x8C-0x8F */
+	0x86, 0xB8, 0xE0, 0xB3, 0x86, 0xB9, 0x86, 0xBA, /* 0x90-0x93 */
+	0xE0, 0xB8, 0x86, 0xBB, 0x86, 0xBC, 0x86, 0xBD, /* 0x94-0x97 */
+	0xB4, 0xAD, 0xE0, 0xB9, 0x86, 0xBE, 0x86, 0xBF, /* 0x98-0x9B */
+	0xCF, 0xB2, 0xBA, 0xC8, 0x86, 0xC0, 0xE0, 0xB0, /* 0x9C-0x9F */
+	0x86, 0xC1, 0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, /* 0xA0-0xA3 */
+	0x86, 0xC5, 0x86, 0xC6, 0x86, 0xC7, 0xD0, 0xFA, /* 0xA4-0xA7 */
+	0x86, 0xC8, 0x86, 0xC9, 0x86, 0xCA, 0x86, 0xCB, /* 0xA8-0xAB */
+	0x86, 0xCC, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0xAC-0xAF */
+	0x86, 0xD0, 0xE0, 0xAC, 0x86, 0xD1, 0xD4, 0xFB, /* 0xB0-0xB3 */
+	0x86, 0xD2, 0xDF, 0xF7, 0x86, 0xD3, 0xC5, 0xE7, /* 0xB4-0xB7 */
+	0x86, 0xD4, 0xE0, 0xAD, 0x86, 0xD5, 0xD3, 0xF7, /* 0xB8-0xBB */
+	0x86, 0xD6, 0xE0, 0xB6, 0xE0, 0xB7, 0x86, 0xD7, /* 0xBC-0xBF */
+	0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, 0x86, 0xDB, /* 0xC0-0xC3 */
+	0xE0, 0xC4, 0xD0, 0xE1, 0x86, 0xDC, 0x86, 0xDD, /* 0xC4-0xC7 */
+	0x86, 0xDE, 0xE0, 0xBC, 0x86, 0xDF, 0x86, 0xE0, /* 0xC8-0xCB */
+	0xE0, 0xC9, 0xE0, 0xCA, 0x86, 0xE1, 0x86, 0xE2, /* 0xCC-0xCF */
+	0x86, 0xE3, 0xE0, 0xBE, 0xE0, 0xAA, 0xC9, 0xA4, /* 0xD0-0xD3 */
+	0xE0, 0xC1, 0x86, 0xE4, 0xE0, 0xB2, 0x86, 0xE5, /* 0xD4-0xD7 */
+	0x86, 0xE6, 0x86, 0xE7, 0x86, 0xE8, 0x86, 0xE9, /* 0xD8-0xDB */
+	0xCA, 0xC8, 0xE0, 0xC3, 0x86, 0xEA, 0xE0, 0xB5, /* 0xDC-0xDF */
+	0x86, 0xEB, 0xCE, 0xCB, 0x86, 0xEC, 0xCB, 0xC3, /* 0xE0-0xE3 */
+	0xE0, 0xCD, 0xE0, 0xC6, 0xE0, 0xC2, 0x86, 0xED, /* 0xE4-0xE7 */
+	0xE0, 0xCB, 0x86, 0xEE, 0xE0, 0xBA, 0xE0, 0xBF, /* 0xE8-0xEB */
+	0xE0, 0xC0, 0x86, 0xEF, 0x86, 0xF0, 0xE0, 0xC5, /* 0xEC-0xEF */
+	0x86, 0xF1, 0x86, 0xF2, 0xE0, 0xC7, 0xE0, 0xC8, /* 0xF0-0xF3 */
+	0x86, 0xF3, 0xE0, 0xCC, 0x86, 0xF4, 0xE0, 0xBB, /* 0xF4-0xF7 */
+	0x86, 0xF5, 0x86, 0xF6, 0x86, 0xF7, 0x86, 0xF8, /* 0xF8-0xFB */
+	0x86, 0xF9, 0xCB, 0xD4, 0xE0, 0xD5, 0x86, 0xFA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_56[512] = {
+	0xE0, 0xD6, 0xE0, 0xD2, 0x86, 0xFB, 0x86, 0xFC, /* 0x00-0x03 */
+	0x86, 0xFD, 0x86, 0xFE, 0x87, 0x40, 0x87, 0x41, /* 0x04-0x07 */
+	0xE0, 0xD0, 0xBC, 0xCE, 0x87, 0x42, 0x87, 0x43, /* 0x08-0x0B */
+	0xE0, 0xD1, 0x87, 0x44, 0xB8, 0xC2, 0xD8, 0xC5, /* 0x0C-0x0F */
+	0x87, 0x45, 0x87, 0x46, 0x87, 0x47, 0x87, 0x48, /* 0x10-0x13 */
+	0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, 0x87, 0x4C, /* 0x14-0x17 */
+	0xD0, 0xEA, 0x87, 0x4D, 0x87, 0x4E, 0xC2, 0xEF, /* 0x18-0x1B */
+	0x87, 0x4F, 0x87, 0x50, 0xE0, 0xCF, 0xE0, 0xBD, /* 0x1C-0x1F */
+	0x87, 0x51, 0x87, 0x52, 0x87, 0x53, 0xE0, 0xD4, /* 0x20-0x23 */
+	0xE0, 0xD3, 0x87, 0x54, 0x87, 0x55, 0xE0, 0xD7, /* 0x24-0x27 */
+	0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0x28-0x2B */
+	0xE0, 0xDC, 0xE0, 0xD8, 0x87, 0x5A, 0x87, 0x5B, /* 0x2C-0x2F */
+	0x87, 0x5C, 0xD6, 0xF6, 0xB3, 0xB0, 0x87, 0x5D, /* 0x30-0x33 */
+	0xD7, 0xEC, 0x87, 0x5E, 0xCB, 0xBB, 0x87, 0x5F, /* 0x34-0x37 */
+	0x87, 0x60, 0xE0, 0xDA, 0x87, 0x61, 0xCE, 0xFB, /* 0x38-0x3B */
+	0x87, 0x62, 0x87, 0x63, 0x87, 0x64, 0xBA, 0xD9, /* 0x3C-0x3F */
+	0x87, 0x65, 0x87, 0x66, 0x87, 0x67, 0x87, 0x68, /* 0x40-0x43 */
+	0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, 0x87, 0x6C, /* 0x44-0x47 */
+	0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, 0x87, 0x70, /* 0x48-0x4B */
+	0xE0, 0xE1, 0xE0, 0xDD, 0xD2, 0xAD, 0x87, 0x71, /* 0x4C-0x4F */
+	0x87, 0x72, 0x87, 0x73, 0x87, 0x74, 0x87, 0x75, /* 0x50-0x53 */
+	0xE0, 0xE2, 0x87, 0x76, 0x87, 0x77, 0xE0, 0xDB, /* 0x54-0x57 */
+	0xE0, 0xD9, 0xE0, 0xDF, 0x87, 0x78, 0x87, 0x79, /* 0x58-0x5B */
+	0xE0, 0xE0, 0x87, 0x7A, 0x87, 0x7B, 0x87, 0x7C, /* 0x5C-0x5F */
+	0x87, 0x7D, 0x87, 0x7E, 0xE0, 0xDE, 0x87, 0x80, /* 0x60-0x63 */
+	0xE0, 0xE4, 0x87, 0x81, 0x87, 0x82, 0x87, 0x83, /* 0x64-0x67 */
+	0xC6, 0xF7, 0xD8, 0xAC, 0xD4, 0xEB, 0xE0, 0xE6, /* 0x68-0x6B */
+	0xCA, 0xC9, 0x87, 0x84, 0x87, 0x85, 0x87, 0x86, /* 0x6C-0x6F */
+	0x87, 0x87, 0xE0, 0xE5, 0x87, 0x88, 0x87, 0x89, /* 0x70-0x73 */
+	0x87, 0x8A, 0x87, 0x8B, 0xB8, 0xC1, 0x87, 0x8C, /* 0x74-0x77 */
+	0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, 0xE0, 0xE7, /* 0x78-0x7B */
+	0xE0, 0xE8, 0x87, 0x90, 0x87, 0x91, 0x87, 0x92, /* 0x7C-0x7F */
+	
+	0x87, 0x93, 0x87, 0x94, 0x87, 0x95, 0x87, 0x96, /* 0x80-0x83 */
+	0x87, 0x97, 0xE0, 0xE9, 0xE0, 0xE3, 0x87, 0x98, /* 0x84-0x87 */
+	0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, 0x87, 0x9C, /* 0x88-0x8B */
+	0x87, 0x9D, 0x87, 0x9E, 0xBA, 0xBF, 0xCC, 0xE7, /* 0x8C-0x8F */
+	0x87, 0x9F, 0x87, 0xA0, 0x87, 0xA1, 0xE0, 0xEA, /* 0x90-0x93 */
+	0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, 0x87, 0xA5, /* 0x94-0x97 */
+	0x87, 0xA6, 0x87, 0xA7, 0x87, 0xA8, 0x87, 0xA9, /* 0x98-0x9B */
+	0x87, 0xAA, 0x87, 0xAB, 0x87, 0xAC, 0x87, 0xAD, /* 0x9C-0x9F */
+	0x87, 0xAE, 0x87, 0xAF, 0x87, 0xB0, 0xCF, 0xF9, /* 0xA0-0xA3 */
+	0x87, 0xB1, 0x87, 0xB2, 0x87, 0xB3, 0x87, 0xB4, /* 0xA4-0xA7 */
+	0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, 0x87, 0xB8, /* 0xA8-0xAB */
+	0x87, 0xB9, 0x87, 0xBA, 0x87, 0xBB, 0xE0, 0xEB, /* 0xAC-0xAF */
+	0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, 0x87, 0xBF, /* 0xB0-0xB3 */
+	0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, 0xC8, 0xC2, /* 0xB4-0xB7 */
+	0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0xB8-0xBB */
+	0xBD, 0xC0, 0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, /* 0xBC-0xBF */
+	0x87, 0xCA, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0xC0-0xC3 */
+	0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0xC4-0xC7 */
+	0x87, 0xD2, 0x87, 0xD3, 0xC4, 0xD2, 0x87, 0xD4, /* 0xC8-0xCB */
+	0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0xCC-0xCF */
+	0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0xD0-0xD3 */
+	0xE0, 0xEC, 0x87, 0xDD, 0x87, 0xDE, 0xE0, 0xED, /* 0xD4-0xD7 */
+	0x87, 0xDF, 0x87, 0xE0, 0xC7, 0xF4, 0xCB, 0xC4, /* 0xD8-0xDB */
+	0x87, 0xE1, 0xE0, 0xEE, 0xBB, 0xD8, 0xD8, 0xB6, /* 0xDC-0xDF */
+	0xD2, 0xF2, 0xE0, 0xEF, 0xCD, 0xC5, 0x87, 0xE2, /* 0xE0-0xE3 */
+	0xB6, 0xDA, 0x87, 0xE3, 0x87, 0xE4, 0x87, 0xE5, /* 0xE4-0xE7 */
+	0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, 0xE0, 0xF1, /* 0xE8-0xEB */
+	0x87, 0xE9, 0xD4, 0xB0, 0x87, 0xEA, 0x87, 0xEB, /* 0xEC-0xEF */
+	0xC0, 0xA7, 0xB4, 0xD1, 0x87, 0xEC, 0x87, 0xED, /* 0xF0-0xF3 */
+	0xCE, 0xA7, 0xE0, 0xF0, 0x87, 0xEE, 0x87, 0xEF, /* 0xF4-0xF7 */
+	0x87, 0xF0, 0xE0, 0xF2, 0xB9, 0xCC, 0x87, 0xF1, /* 0xF8-0xFB */
+	0x87, 0xF2, 0xB9, 0xFA, 0xCD, 0xBC, 0xE0, 0xF3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_57[512] = {
+	0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, 0xC6, 0xD4, /* 0x00-0x03 */
+	0xE0, 0xF4, 0x87, 0xF6, 0xD4, 0xB2, 0x87, 0xF7, /* 0x04-0x07 */
+	0xC8, 0xA6, 0xE0, 0xF6, 0xE0, 0xF5, 0x87, 0xF8, /* 0x08-0x0B */
+	0x87, 0xF9, 0x87, 0xFA, 0x87, 0xFB, 0x87, 0xFC, /* 0x0C-0x0F */
+	0x87, 0xFD, 0x87, 0xFE, 0x88, 0x40, 0x88, 0x41, /* 0x10-0x13 */
+	0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x14-0x17 */
+	0x88, 0x46, 0x88, 0x47, 0x88, 0x48, 0x88, 0x49, /* 0x18-0x1B */
+	0xE0, 0xF7, 0x88, 0x4A, 0x88, 0x4B, 0xCD, 0xC1, /* 0x1C-0x1F */
+	0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, 0xCA, 0xA5, /* 0x20-0x23 */
+	0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x24-0x27 */
+	0xD4, 0xDA, 0xDB, 0xD7, 0xDB, 0xD9, 0x88, 0x53, /* 0x28-0x2B */
+	0xDB, 0xD8, 0xB9, 0xE7, 0xDB, 0xDC, 0xDB, 0xDD, /* 0x2C-0x2F */
+	0xB5, 0xD8, 0x88, 0x54, 0x88, 0x55, 0xDB, 0xDA, /* 0x30-0x33 */
+	0x88, 0x56, 0x88, 0x57, 0x88, 0x58, 0x88, 0x59, /* 0x34-0x37 */
+	0x88, 0x5A, 0xDB, 0xDB, 0xB3, 0xA1, 0xDB, 0xDF, /* 0x38-0x3B */
+	0x88, 0x5B, 0x88, 0x5C, 0xBB, 0xF8, 0x88, 0x5D, /* 0x3C-0x3F */
+	0xD6, 0xB7, 0x88, 0x5E, 0xDB, 0xE0, 0x88, 0x5F, /* 0x40-0x43 */
+	0x88, 0x60, 0x88, 0x61, 0x88, 0x62, 0xBE, 0xF9, /* 0x44-0x47 */
+	0x88, 0x63, 0x88, 0x64, 0xB7, 0xBB, 0x88, 0x65, /* 0x48-0x4B */
+	0xDB, 0xD0, 0xCC, 0xAE, 0xBF, 0xB2, 0xBB, 0xB5, /* 0x4C-0x4F */
+	0xD7, 0xF8, 0xBF, 0xD3, 0x88, 0x66, 0x88, 0x67, /* 0x50-0x53 */
+	0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0xBF, 0xE9, /* 0x54-0x57 */
+	0x88, 0x6B, 0x88, 0x6C, 0xBC, 0xE1, 0xCC, 0xB3, /* 0x58-0x5B */
+	0xDB, 0xDE, 0xB0, 0xD3, 0xCE, 0xEB, 0xB7, 0xD8, /* 0x5C-0x5F */
+	0xD7, 0xB9, 0xC6, 0xC2, 0x88, 0x6D, 0x88, 0x6E, /* 0x60-0x63 */
+	0xC0, 0xA4, 0x88, 0x6F, 0xCC, 0xB9, 0x88, 0x70, /* 0x64-0x67 */
+	0xDB, 0xE7, 0xDB, 0xE1, 0xC6, 0xBA, 0xDB, 0xE3, /* 0x68-0x6B */
+	0x88, 0x71, 0xDB, 0xE8, 0x88, 0x72, 0xC5, 0xF7, /* 0x6C-0x6F */
+	0x88, 0x73, 0x88, 0x74, 0x88, 0x75, 0xDB, 0xEA, /* 0x70-0x73 */
+	0x88, 0x76, 0x88, 0x77, 0xDB, 0xE9, 0xBF, 0xC0, /* 0x74-0x77 */
+	0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, 0xDB, 0xE6, /* 0x78-0x7B */
+	0xDB, 0xE5, 0x88, 0x7B, 0x88, 0x7C, 0x88, 0x7D, /* 0x7C-0x7F */
+	
+	0x88, 0x7E, 0x88, 0x80, 0xB4, 0xB9, 0xC0, 0xAC, /* 0x80-0x83 */
+	0xC2, 0xA2, 0xDB, 0xE2, 0xDB, 0xE4, 0x88, 0x81, /* 0x84-0x87 */
+	0x88, 0x82, 0x88, 0x83, 0x88, 0x84, 0xD0, 0xCD, /* 0x88-0x8B */
+	0xDB, 0xED, 0x88, 0x85, 0x88, 0x86, 0x88, 0x87, /* 0x8C-0x8F */
+	0x88, 0x88, 0x88, 0x89, 0xC0, 0xDD, 0xDB, 0xF2, /* 0x90-0x93 */
+	0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, 0x88, 0x8D, /* 0x94-0x97 */
+	0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, 0xB6, 0xE2, /* 0x98-0x9B */
+	0x88, 0x91, 0x88, 0x92, 0x88, 0x93, 0x88, 0x94, /* 0x9C-0x9F */
+	0xDB, 0xF3, 0xDB, 0xD2, 0xB9, 0xB8, 0xD4, 0xAB, /* 0xA0-0xA3 */
+	0xDB, 0xEC, 0x88, 0x95, 0xBF, 0xD1, 0xDB, 0xF0, /* 0xA4-0xA7 */
+	0x88, 0x96, 0xDB, 0xD1, 0x88, 0x97, 0xB5, 0xE6, /* 0xA8-0xAB */
+	0x88, 0x98, 0xDB, 0xEB, 0xBF, 0xE5, 0x88, 0x99, /* 0xAC-0xAF */
+	0x88, 0x9A, 0x88, 0x9B, 0xDB, 0xEE, 0x88, 0x9C, /* 0xB0-0xB3 */
+	0xDB, 0xF1, 0x88, 0x9D, 0x88, 0x9E, 0x88, 0x9F, /* 0xB4-0xB7 */
+	0xDB, 0xF9, 0x88, 0xA0, 0x88, 0xA1, 0x88, 0xA2, /* 0xB8-0xBB */
+	0x88, 0xA3, 0x88, 0xA4, 0x88, 0xA5, 0x88, 0xA6, /* 0xBC-0xBF */
+	0x88, 0xA7, 0x88, 0xA8, 0xB9, 0xA1, 0xB0, 0xA3, /* 0xC0-0xC3 */
+	0x88, 0xA9, 0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, /* 0xC4-0xC7 */
+	0x88, 0xAD, 0x88, 0xAE, 0x88, 0xAF, 0xC2, 0xF1, /* 0xC8-0xCB */
+	0x88, 0xB0, 0x88, 0xB1, 0xB3, 0xC7, 0xDB, 0xEF, /* 0xCC-0xCF */
+	0x88, 0xB2, 0x88, 0xB3, 0xDB, 0xF8, 0x88, 0xB4, /* 0xD0-0xD3 */
+	0xC6, 0xD2, 0xDB, 0xF4, 0x88, 0xB5, 0x88, 0xB6, /* 0xD4-0xD7 */
+	0xDB, 0xF5, 0xDB, 0xF7, 0xDB, 0xF6, 0x88, 0xB7, /* 0xD8-0xDB */
+	0x88, 0xB8, 0xDB, 0xFE, 0x88, 0xB9, 0xD3, 0xF2, /* 0xDC-0xDF */
+	0xB2, 0xBA, 0x88, 0xBA, 0x88, 0xBB, 0x88, 0xBC, /* 0xE0-0xE3 */
+	0xDB, 0xFD, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0xE4-0xE7 */
+	0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, 0x88, 0xC3, /* 0xE8-0xEB */
+	0x88, 0xC4, 0xDC, 0xA4, 0x88, 0xC5, 0xDB, 0xFB, /* 0xEC-0xEF */
+	0x88, 0xC6, 0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, /* 0xF0-0xF3 */
+	0xDB, 0xFA, 0x88, 0xCA, 0x88, 0xCB, 0x88, 0xCC, /* 0xF4-0xF7 */
+	0xDB, 0xFC, 0xC5, 0xE0, 0xBB, 0xF9, 0x88, 0xCD, /* 0xF8-0xFB */
+	0x88, 0xCE, 0xDC, 0xA3, 0x88, 0xCF, 0x88, 0xD0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_58[512] = {
+	0xDC, 0xA5, 0x88, 0xD1, 0xCC, 0xC3, 0x88, 0xD2, /* 0x00-0x03 */
+	0x88, 0xD3, 0x88, 0xD4, 0xB6, 0xD1, 0xDD, 0xC0, /* 0x04-0x07 */
+	0x88, 0xD5, 0x88, 0xD6, 0x88, 0xD7, 0xDC, 0xA1, /* 0x08-0x0B */
+	0x88, 0xD8, 0xDC, 0xA2, 0x88, 0xD9, 0x88, 0xDA, /* 0x0C-0x0F */
+	0x88, 0xDB, 0xC7, 0xB5, 0x88, 0xDC, 0x88, 0xDD, /* 0x10-0x13 */
+	0x88, 0xDE, 0xB6, 0xE9, 0x88, 0xDF, 0x88, 0xE0, /* 0x14-0x17 */
+	0x88, 0xE1, 0xDC, 0xA7, 0x88, 0xE2, 0x88, 0xE3, /* 0x18-0x1B */
+	0x88, 0xE4, 0x88, 0xE5, 0xDC, 0xA6, 0x88, 0xE6, /* 0x1C-0x1F */
+	0xDC, 0xA9, 0xB1, 0xA4, 0x88, 0xE7, 0x88, 0xE8, /* 0x20-0x23 */
+	0xB5, 0xCC, 0x88, 0xE9, 0x88, 0xEA, 0x88, 0xEB, /* 0x24-0x27 */
+	0x88, 0xEC, 0x88, 0xED, 0xBF, 0xB0, 0x88, 0xEE, /* 0x28-0x2B */
+	0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x2C-0x2F */
+	0xD1, 0xDF, 0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, /* 0x30-0x33 */
+	0x88, 0xF6, 0xB6, 0xC2, 0x88, 0xF7, 0x88, 0xF8, /* 0x34-0x37 */
+	0x88, 0xF9, 0x88, 0xFA, 0x88, 0xFB, 0x88, 0xFC, /* 0x38-0x3B */
+	0x88, 0xFD, 0x88, 0xFE, 0x89, 0x40, 0x89, 0x41, /* 0x3C-0x3F */
+	0x89, 0x42, 0x89, 0x43, 0x89, 0x44, 0x89, 0x45, /* 0x40-0x43 */
+	0xDC, 0xA8, 0x89, 0x46, 0x89, 0x47, 0x89, 0x48, /* 0x44-0x47 */
+	0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, 0x89, 0x4C, /* 0x48-0x4B */
+	0xCB, 0xFA, 0xEB, 0xF3, 0x89, 0x4D, 0x89, 0x4E, /* 0x4C-0x4F */
+	0x89, 0x4F, 0xCB, 0xDC, 0x89, 0x50, 0x89, 0x51, /* 0x50-0x53 */
+	0xCB, 0xFE, 0x89, 0x52, 0x89, 0x53, 0x89, 0x54, /* 0x54-0x57 */
+	0xCC, 0xC1, 0x89, 0x55, 0x89, 0x56, 0x89, 0x57, /* 0x58-0x5B */
+	0x89, 0x58, 0x89, 0x59, 0xC8, 0xFB, 0x89, 0x5A, /* 0x5C-0x5F */
+	0x89, 0x5B, 0x89, 0x5C, 0x89, 0x5D, 0x89, 0x5E, /* 0x60-0x63 */
+	0x89, 0x5F, 0xDC, 0xAA, 0x89, 0x60, 0x89, 0x61, /* 0x64-0x67 */
+	0x89, 0x62, 0x89, 0x63, 0x89, 0x64, 0xCC, 0xEE, /* 0x68-0x6B */
+	0xDC, 0xAB, 0x89, 0x65, 0x89, 0x66, 0x89, 0x67, /* 0x6C-0x6F */
+	0x89, 0x68, 0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, /* 0x70-0x73 */
+	0x89, 0x6C, 0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, /* 0x74-0x77 */
+	0x89, 0x70, 0x89, 0x71, 0x89, 0x72, 0x89, 0x73, /* 0x78-0x7B */
+	0x89, 0x74, 0x89, 0x75, 0xDB, 0xD3, 0x89, 0x76, /* 0x7C-0x7F */
+	
+	0xDC, 0xAF, 0xDC, 0xAC, 0x89, 0x77, 0xBE, 0xB3, /* 0x80-0x83 */
+	0x89, 0x78, 0xCA, 0xFB, 0x89, 0x79, 0x89, 0x7A, /* 0x84-0x87 */
+	0x89, 0x7B, 0xDC, 0xAD, 0x89, 0x7C, 0x89, 0x7D, /* 0x88-0x8B */
+	0x89, 0x7E, 0x89, 0x80, 0x89, 0x81, 0x89, 0x82, /* 0x8C-0x8F */
+	0x89, 0x83, 0x89, 0x84, 0xC9, 0xCA, 0xC4, 0xB9, /* 0x90-0x93 */
+	0x89, 0x85, 0x89, 0x86, 0x89, 0x87, 0x89, 0x88, /* 0x94-0x97 */
+	0x89, 0x89, 0xC7, 0xBD, 0xDC, 0xAE, 0x89, 0x8A, /* 0x98-0x9B */
+	0x89, 0x8B, 0x89, 0x8C, 0xD4, 0xF6, 0xD0, 0xE6, /* 0x9C-0x9F */
+	0x89, 0x8D, 0x89, 0x8E, 0x89, 0x8F, 0x89, 0x90, /* 0xA0-0xA3 */
+	0x89, 0x91, 0x89, 0x92, 0x89, 0x93, 0x89, 0x94, /* 0xA4-0xA7 */
+	0xC4, 0xAB, 0xB6, 0xD5, 0x89, 0x95, 0x89, 0x96, /* 0xA8-0xAB */
+	0x89, 0x97, 0x89, 0x98, 0x89, 0x99, 0x89, 0x9A, /* 0xAC-0xAF */
+	0x89, 0x9B, 0x89, 0x9C, 0x89, 0x9D, 0x89, 0x9E, /* 0xB0-0xB3 */
+	0x89, 0x9F, 0x89, 0xA0, 0x89, 0xA1, 0x89, 0xA2, /* 0xB4-0xB7 */
+	0x89, 0xA3, 0x89, 0xA4, 0x89, 0xA5, 0x89, 0xA6, /* 0xB8-0xBB */
+	0xDB, 0xD4, 0x89, 0xA7, 0x89, 0xA8, 0x89, 0xA9, /* 0xBC-0xBF */
+	0x89, 0xAA, 0xB1, 0xDA, 0x89, 0xAB, 0x89, 0xAC, /* 0xC0-0xC3 */
+	0x89, 0xAD, 0xDB, 0xD5, 0x89, 0xAE, 0x89, 0xAF, /* 0xC4-0xC7 */
+	0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, 0x89, 0xB3, /* 0xC8-0xCB */
+	0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, 0x89, 0xB7, /* 0xCC-0xCF */
+	0x89, 0xB8, 0xDB, 0xD6, 0x89, 0xB9, 0x89, 0xBA, /* 0xD0-0xD3 */
+	0x89, 0xBB, 0xBA, 0xBE, 0x89, 0xBC, 0x89, 0xBD, /* 0xD4-0xD7 */
+	0x89, 0xBE, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xD8-0xDB */
+	0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0xDC-0xDF */
+	0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0xE0-0xE3 */
+	0xC8, 0xC0, 0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, /* 0xE4-0xE7 */
+	0x89, 0xCD, 0x89, 0xCE, 0x89, 0xCF, 0xCA, 0xBF, /* 0xE8-0xEB */
+	0xC8, 0xC9, 0x89, 0xD0, 0xD7, 0xB3, 0x89, 0xD1, /* 0xEC-0xEF */
+	0xC9, 0xF9, 0x89, 0xD2, 0x89, 0xD3, 0xBF, 0xC7, /* 0xF0-0xF3 */
+	0x89, 0xD4, 0x89, 0xD5, 0xBA, 0xF8, 0x89, 0xD6, /* 0xF4-0xF7 */
+	0x89, 0xD7, 0xD2, 0xBC, 0x89, 0xD8, 0x89, 0xD9, /* 0xF8-0xFB */
+	0x89, 0xDA, 0x89, 0xDB, 0x89, 0xDC, 0x89, 0xDD, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_59[512] = {
+	0x89, 0xDE, 0x89, 0xDF, 0xE2, 0xBA, 0x89, 0xE0, /* 0x00-0x03 */
+	0xB4, 0xA6, 0x89, 0xE1, 0x89, 0xE2, 0xB1, 0xB8, /* 0x04-0x07 */
+	0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, 0x89, 0xE6, /* 0x08-0x0B */
+	0x89, 0xE7, 0xB8, 0xB4, 0x89, 0xE8, 0xCF, 0xC4, /* 0x0C-0x0F */
+	0x89, 0xE9, 0x89, 0xEA, 0x89, 0xEB, 0x89, 0xEC, /* 0x10-0x13 */
+	0xD9, 0xE7, 0xCF, 0xA6, 0xCD, 0xE2, 0x89, 0xED, /* 0x14-0x17 */
+	0x89, 0xEE, 0xD9, 0xED, 0xB6, 0xE0, 0x89, 0xEF, /* 0x18-0x1B */
+	0xD2, 0xB9, 0x89, 0xF0, 0x89, 0xF1, 0xB9, 0xBB, /* 0x1C-0x1F */
+	0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x20-0x23 */
+	0xE2, 0xB9, 0xE2, 0xB7, 0x89, 0xF6, 0xB4, 0xF3, /* 0x24-0x27 */
+	0x89, 0xF7, 0xCC, 0xEC, 0xCC, 0xAB, 0xB7, 0xF2, /* 0x28-0x2B */
+	0x89, 0xF8, 0xD8, 0xB2, 0xD1, 0xEB, 0xBA, 0xBB, /* 0x2C-0x2F */
+	0x89, 0xF9, 0xCA, 0xA7, 0x89, 0xFA, 0x89, 0xFB, /* 0x30-0x33 */
+	0xCD, 0xB7, 0x89, 0xFC, 0x89, 0xFD, 0xD2, 0xC4, /* 0x34-0x37 */
+	0xBF, 0xE4, 0xBC, 0xD0, 0xB6, 0xE1, 0x89, 0xFE, /* 0x38-0x3B */
+	0xDE, 0xC5, 0x8A, 0x40, 0x8A, 0x41, 0x8A, 0x42, /* 0x3C-0x3F */
+	0x8A, 0x43, 0xDE, 0xC6, 0xDB, 0xBC, 0x8A, 0x44, /* 0x40-0x43 */
+	0xD1, 0xD9, 0x8A, 0x45, 0x8A, 0x46, 0xC6, 0xE6, /* 0x44-0x47 */
+	0xC4, 0xCE, 0xB7, 0xEE, 0x8A, 0x47, 0xB7, 0xDC, /* 0x48-0x4B */
+	0x8A, 0x48, 0x8A, 0x49, 0xBF, 0xFC, 0xD7, 0xE0, /* 0x4C-0x4F */
+	0x8A, 0x4A, 0xC6, 0xF5, 0x8A, 0x4B, 0x8A, 0x4C, /* 0x50-0x53 */
+	0xB1, 0xBC, 0xDE, 0xC8, 0xBD, 0xB1, 0xCC, 0xD7, /* 0x54-0x57 */
+	0xDE, 0xCA, 0x8A, 0x4D, 0xDE, 0xC9, 0x8A, 0x4E, /* 0x58-0x5B */
+	0x8A, 0x4F, 0x8A, 0x50, 0x8A, 0x51, 0x8A, 0x52, /* 0x5C-0x5F */
+	0xB5, 0xEC, 0x8A, 0x53, 0xC9, 0xDD, 0x8A, 0x54, /* 0x60-0x63 */
+	0x8A, 0x55, 0xB0, 0xC2, 0x8A, 0x56, 0x8A, 0x57, /* 0x64-0x67 */
+	0x8A, 0x58, 0x8A, 0x59, 0x8A, 0x5A, 0x8A, 0x5B, /* 0x68-0x6B */
+	0x8A, 0x5C, 0x8A, 0x5D, 0x8A, 0x5E, 0x8A, 0x5F, /* 0x6C-0x6F */
+	0x8A, 0x60, 0x8A, 0x61, 0x8A, 0x62, 0xC5, 0xAE, /* 0x70-0x73 */
+	0xC5, 0xAB, 0x8A, 0x63, 0xC4, 0xCC, 0x8A, 0x64, /* 0x74-0x77 */
+	0xBC, 0xE9, 0xCB, 0xFD, 0x8A, 0x65, 0x8A, 0x66, /* 0x78-0x7B */
+	0x8A, 0x67, 0xBA, 0xC3, 0x8A, 0x68, 0x8A, 0x69, /* 0x7C-0x7F */
+	
+	0x8A, 0x6A, 0xE5, 0xF9, 0xC8, 0xE7, 0xE5, 0xFA, /* 0x80-0x83 */
+	0xCD, 0xFD, 0x8A, 0x6B, 0xD7, 0xB1, 0xB8, 0xBE, /* 0x84-0x87 */
+	0xC2, 0xE8, 0x8A, 0x6C, 0xC8, 0xD1, 0x8A, 0x6D, /* 0x88-0x8B */
+	0x8A, 0x6E, 0xE5, 0xFB, 0x8A, 0x6F, 0x8A, 0x70, /* 0x8C-0x8F */
+	0x8A, 0x71, 0x8A, 0x72, 0xB6, 0xCA, 0xBC, 0xCB, /* 0x90-0x93 */
+	0x8A, 0x73, 0x8A, 0x74, 0xD1, 0xFD, 0xE6, 0xA1, /* 0x94-0x97 */
+	0x8A, 0x75, 0xC3, 0xEE, 0x8A, 0x76, 0x8A, 0x77, /* 0x98-0x9B */
+	0x8A, 0x78, 0x8A, 0x79, 0xE6, 0xA4, 0x8A, 0x7A, /* 0x9C-0x9F */
+	0x8A, 0x7B, 0x8A, 0x7C, 0x8A, 0x7D, 0xE5, 0xFE, /* 0xA0-0xA3 */
+	0xE6, 0xA5, 0xCD, 0xD7, 0x8A, 0x7E, 0x8A, 0x80, /* 0xA4-0xA7 */
+	0xB7, 0xC1, 0xE5, 0xFC, 0xE5, 0xFD, 0xE6, 0xA3, /* 0xA8-0xAB */
+	0x8A, 0x81, 0x8A, 0x82, 0xC4, 0xDD, 0xE6, 0xA8, /* 0xAC-0xAF */
+	0x8A, 0x83, 0x8A, 0x84, 0xE6, 0xA7, 0x8A, 0x85, /* 0xB0-0xB3 */
+	0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, 0x8A, 0x89, /* 0xB4-0xB7 */
+	0x8A, 0x8A, 0xC3, 0xC3, 0x8A, 0x8B, 0xC6, 0xDE, /* 0xB8-0xBB */
+	0x8A, 0x8C, 0x8A, 0x8D, 0xE6, 0xAA, 0x8A, 0x8E, /* 0xBC-0xBF */
+	0x8A, 0x8F, 0x8A, 0x90, 0x8A, 0x91, 0x8A, 0x92, /* 0xC0-0xC3 */
+	0x8A, 0x93, 0x8A, 0x94, 0xC4, 0xB7, 0x8A, 0x95, /* 0xC4-0xC7 */
+	0x8A, 0x96, 0x8A, 0x97, 0xE6, 0xA2, 0xCA, 0xBC, /* 0xC8-0xCB */
+	0x8A, 0x98, 0x8A, 0x99, 0x8A, 0x9A, 0x8A, 0x9B, /* 0xCC-0xCF */
+	0xBD, 0xE3, 0xB9, 0xC3, 0xE6, 0xA6, 0xD0, 0xD5, /* 0xD0-0xD3 */
+	0xCE, 0xAF, 0x8A, 0x9C, 0x8A, 0x9D, 0xE6, 0xA9, /* 0xD4-0xD7 */
+	0xE6, 0xB0, 0x8A, 0x9E, 0xD2, 0xA6, 0x8A, 0x9F, /* 0xD8-0xDB */
+	0xBD, 0xAA, 0xE6, 0xAD, 0x8A, 0xA0, 0x8A, 0xA1, /* 0xDC-0xDF */
+	0x8A, 0xA2, 0x8A, 0xA3, 0x8A, 0xA4, 0xE6, 0xAF, /* 0xE0-0xE3 */
+	0x8A, 0xA5, 0xC0, 0xD1, 0x8A, 0xA6, 0x8A, 0xA7, /* 0xE4-0xE7 */
+	0xD2, 0xCC, 0x8A, 0xA8, 0x8A, 0xA9, 0x8A, 0xAA, /* 0xE8-0xEB */
+	0xBC, 0xA7, 0x8A, 0xAB, 0x8A, 0xAC, 0x8A, 0xAD, /* 0xEC-0xEF */
+	0x8A, 0xAE, 0x8A, 0xAF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xF0-0xF3 */
+	0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xF4-0xF7 */
+	0x8A, 0xB6, 0xE6, 0xB1, 0x8A, 0xB7, 0xD2, 0xF6, /* 0xF8-0xFB */
+	0x8A, 0xB8, 0x8A, 0xB9, 0x8A, 0xBA, 0xD7, 0xCB, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5A[512] = {
+	0x8A, 0xBB, 0xCD, 0xFE, 0x8A, 0xBC, 0xCD, 0xDE, /* 0x00-0x03 */
+	0xC2, 0xA6, 0xE6, 0xAB, 0xE6, 0xAC, 0xBD, 0xBF, /* 0x04-0x07 */
+	0xE6, 0xAE, 0xE6, 0xB3, 0x8A, 0xBD, 0x8A, 0xBE, /* 0x08-0x0B */
+	0xE6, 0xB2, 0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, /* 0x0C-0x0F */
+	0x8A, 0xC2, 0xE6, 0xB6, 0x8A, 0xC3, 0xE6, 0xB8, /* 0x10-0x13 */
+	0x8A, 0xC4, 0x8A, 0xC5, 0x8A, 0xC6, 0x8A, 0xC7, /* 0x14-0x17 */
+	0xC4, 0xEF, 0x8A, 0xC8, 0x8A, 0xC9, 0x8A, 0xCA, /* 0x18-0x1B */
+	0xC4, 0xC8, 0x8A, 0xCB, 0x8A, 0xCC, 0xBE, 0xEA, /* 0x1C-0x1F */
+	0xC9, 0xEF, 0x8A, 0xCD, 0x8A, 0xCE, 0xE6, 0xB7, /* 0x20-0x23 */
+	0x8A, 0xCF, 0xB6, 0xF0, 0x8A, 0xD0, 0x8A, 0xD1, /* 0x24-0x27 */
+	0x8A, 0xD2, 0xC3, 0xE4, 0x8A, 0xD3, 0x8A, 0xD4, /* 0x28-0x2B */
+	0x8A, 0xD5, 0x8A, 0xD6, 0x8A, 0xD7, 0x8A, 0xD8, /* 0x2C-0x2F */
+	0x8A, 0xD9, 0xD3, 0xE9, 0xE6, 0xB4, 0x8A, 0xDA, /* 0x30-0x33 */
+	0xE6, 0xB5, 0x8A, 0xDB, 0xC8, 0xA2, 0x8A, 0xDC, /* 0x34-0x37 */
+	0x8A, 0xDD, 0x8A, 0xDE, 0x8A, 0xDF, 0x8A, 0xE0, /* 0x38-0x3B */
+	0xE6, 0xBD, 0x8A, 0xE1, 0x8A, 0xE2, 0x8A, 0xE3, /* 0x3C-0x3F */
+	0xE6, 0xB9, 0x8A, 0xE4, 0x8A, 0xE5, 0x8A, 0xE6, /* 0x40-0x43 */
+	0x8A, 0xE7, 0x8A, 0xE8, 0xC6, 0xC5, 0x8A, 0xE9, /* 0x44-0x47 */
+	0x8A, 0xEA, 0xCD, 0xF1, 0xE6, 0xBB, 0x8A, 0xEB, /* 0x48-0x4B */
+	0x8A, 0xEC, 0x8A, 0xED, 0x8A, 0xEE, 0x8A, 0xEF, /* 0x4C-0x4F */
+	0x8A, 0xF0, 0x8A, 0xF1, 0x8A, 0xF2, 0x8A, 0xF3, /* 0x50-0x53 */
+	0x8A, 0xF4, 0xE6, 0xBC, 0x8A, 0xF5, 0x8A, 0xF6, /* 0x54-0x57 */
+	0x8A, 0xF7, 0x8A, 0xF8, 0xBB, 0xE9, 0x8A, 0xF9, /* 0x58-0x5B */
+	0x8A, 0xFA, 0x8A, 0xFB, 0x8A, 0xFC, 0x8A, 0xFD, /* 0x5C-0x5F */
+	0x8A, 0xFE, 0x8B, 0x40, 0xE6, 0xBE, 0x8B, 0x41, /* 0x60-0x63 */
+	0x8B, 0x42, 0x8B, 0x43, 0x8B, 0x44, 0xE6, 0xBA, /* 0x64-0x67 */
+	0x8B, 0x45, 0x8B, 0x46, 0xC0, 0xB7, 0x8B, 0x47, /* 0x68-0x6B */
+	0x8B, 0x48, 0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, /* 0x6C-0x6F */
+	0x8B, 0x4C, 0x8B, 0x4D, 0x8B, 0x4E, 0x8B, 0x4F, /* 0x70-0x73 */
+	0xD3, 0xA4, 0xE6, 0xBF, 0xC9, 0xF4, 0xE6, 0xC3, /* 0x74-0x77 */
+	0x8B, 0x50, 0x8B, 0x51, 0xE6, 0xC4, 0x8B, 0x52, /* 0x78-0x7B */
+	0x8B, 0x53, 0x8B, 0x54, 0x8B, 0x55, 0xD0, 0xF6, /* 0x7C-0x7F */
+	
+	0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, 0x8B, 0x59, /* 0x80-0x83 */
+	0x8B, 0x5A, 0x8B, 0x5B, 0x8B, 0x5C, 0x8B, 0x5D, /* 0x84-0x87 */
+	0x8B, 0x5E, 0x8B, 0x5F, 0x8B, 0x60, 0x8B, 0x61, /* 0x88-0x8B */
+	0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0x8B, 0x65, /* 0x8C-0x8F */
+	0x8B, 0x66, 0x8B, 0x67, 0xC3, 0xBD, 0x8B, 0x68, /* 0x90-0x93 */
+	0x8B, 0x69, 0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x94-0x97 */
+	0x8B, 0x6D, 0x8B, 0x6E, 0xC3, 0xC4, 0xE6, 0xC2, /* 0x98-0x9B */
+	0x8B, 0x6F, 0x8B, 0x70, 0x8B, 0x71, 0x8B, 0x72, /* 0x9C-0x9F */
+	0x8B, 0x73, 0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, /* 0xA0-0xA3 */
+	0x8B, 0x77, 0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, /* 0xA4-0xA7 */
+	0x8B, 0x7B, 0x8B, 0x7C, 0xE6, 0xC1, 0x8B, 0x7D, /* 0xA8-0xAB */
+	0x8B, 0x7E, 0x8B, 0x80, 0x8B, 0x81, 0x8B, 0x82, /* 0xAC-0xAF */
+	0x8B, 0x83, 0x8B, 0x84, 0xE6, 0xC7, 0xCF, 0xB1, /* 0xB0-0xB3 */
+	0x8B, 0x85, 0xEB, 0xF4, 0x8B, 0x86, 0x8B, 0x87, /* 0xB4-0xB7 */
+	0xE6, 0xCA, 0x8B, 0x88, 0x8B, 0x89, 0x8B, 0x8A, /* 0xB8-0xBB */
+	0x8B, 0x8B, 0x8B, 0x8C, 0xE6, 0xC5, 0x8B, 0x8D, /* 0xBC-0xBF */
+	0x8B, 0x8E, 0xBC, 0xDE, 0xC9, 0xA9, 0x8B, 0x8F, /* 0xC0-0xC3 */
+	0x8B, 0x90, 0x8B, 0x91, 0x8B, 0x92, 0x8B, 0x93, /* 0xC4-0xC7 */
+	0x8B, 0x94, 0xBC, 0xB5, 0x8B, 0x95, 0x8B, 0x96, /* 0xC8-0xCB */
+	0xCF, 0xD3, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0xCC-0xCF */
+	0x8B, 0x9A, 0x8B, 0x9B, 0xE6, 0xC8, 0x8B, 0x9C, /* 0xD0-0xD3 */
+	0xE6, 0xC9, 0x8B, 0x9D, 0xE6, 0xCE, 0x8B, 0x9E, /* 0xD4-0xD7 */
+	0xE6, 0xD0, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0xD8-0xDB */
+	0xE6, 0xD1, 0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, /* 0xDC-0xDF */
+	0xE6, 0xCB, 0xB5, 0xD5, 0x8B, 0xA5, 0xE6, 0xCC, /* 0xE0-0xE3 */
+	0x8B, 0xA6, 0x8B, 0xA7, 0xE6, 0xCF, 0x8B, 0xA8, /* 0xE4-0xE7 */
+	0x8B, 0xA9, 0xC4, 0xDB, 0x8B, 0xAA, 0xE6, 0xC6, /* 0xE8-0xEB */
+	0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, 0x8B, 0xAE, /* 0xEC-0xEF */
+	0x8B, 0xAF, 0xE6, 0xCD, 0x8B, 0xB0, 0x8B, 0xB1, /* 0xF0-0xF3 */
+	0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0xF4-0xF7 */
+	0x8B, 0xB6, 0x8B, 0xB7, 0x8B, 0xB8, 0x8B, 0xB9, /* 0xF8-0xFB */
+	0x8B, 0xBA, 0x8B, 0xBB, 0x8B, 0xBC, 0x8B, 0xBD, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5B[512] = {
+	0x8B, 0xBE, 0x8B, 0xBF, 0x8B, 0xC0, 0x8B, 0xC1, /* 0x00-0x03 */
+	0x8B, 0xC2, 0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, /* 0x04-0x07 */
+	0x8B, 0xC6, 0xE6, 0xD2, 0x8B, 0xC7, 0x8B, 0xC8, /* 0x08-0x0B */
+	0x8B, 0xC9, 0x8B, 0xCA, 0x8B, 0xCB, 0x8B, 0xCC, /* 0x0C-0x0F */
+	0x8B, 0xCD, 0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, /* 0x10-0x13 */
+	0x8B, 0xD1, 0x8B, 0xD2, 0xE6, 0xD4, 0xE6, 0xD3, /* 0x14-0x17 */
+	0x8B, 0xD3, 0x8B, 0xD4, 0x8B, 0xD5, 0x8B, 0xD6, /* 0x18-0x1B */
+	0x8B, 0xD7, 0x8B, 0xD8, 0x8B, 0xD9, 0x8B, 0xDA, /* 0x1C-0x1F */
+	0x8B, 0xDB, 0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, /* 0x20-0x23 */
+	0x8B, 0xDF, 0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, /* 0x24-0x27 */
+	0x8B, 0xE3, 0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, /* 0x28-0x2B */
+	0x8B, 0xE7, 0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, /* 0x2C-0x2F */
+	0x8B, 0xEB, 0x8B, 0xEC, 0xE6, 0xD5, 0x8B, 0xED, /* 0x30-0x33 */
+	0xD9, 0xF8, 0x8B, 0xEE, 0x8B, 0xEF, 0xE6, 0xD6, /* 0x34-0x37 */
+	0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, 0x8B, 0xF3, /* 0x38-0x3B */
+	0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, 0x8B, 0xF7, /* 0x3C-0x3F */
+	0xE6, 0xD7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0x40-0x43 */
+	0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0x44-0x47 */
+	0x8C, 0x40, 0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, /* 0x48-0x4B */
+	0x8C, 0x44, 0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, /* 0x4C-0x4F */
+	0xD7, 0xD3, 0xE6, 0xDD, 0x8C, 0x48, 0xE6, 0xDE, /* 0x50-0x53 */
+	0xBF, 0xD7, 0xD4, 0xD0, 0x8C, 0x49, 0xD7, 0xD6, /* 0x54-0x57 */
+	0xB4, 0xE6, 0xCB, 0xEF, 0xE6, 0xDA, 0xD8, 0xC3, /* 0x58-0x5B */
+	0xD7, 0xCE, 0xD0, 0xA2, 0x8C, 0x4A, 0xC3, 0xCF, /* 0x5C-0x5F */
+	0x8C, 0x4B, 0x8C, 0x4C, 0xE6, 0xDF, 0xBC, 0xBE, /* 0x60-0x63 */
+	0xB9, 0xC2, 0xE6, 0xDB, 0xD1, 0xA7, 0x8C, 0x4D, /* 0x64-0x67 */
+	0x8C, 0x4E, 0xBA, 0xA2, 0xC2, 0xCF, 0x8C, 0x4F, /* 0x68-0x6B */
+	0xD8, 0xAB, 0x8C, 0x50, 0x8C, 0x51, 0x8C, 0x52, /* 0x6C-0x6F */
+	0xCA, 0xEB, 0xE5, 0xEE, 0x8C, 0x53, 0xE6, 0xDC, /* 0x70-0x73 */
+	0x8C, 0x54, 0xB7, 0xF5, 0x8C, 0x55, 0x8C, 0x56, /* 0x74-0x77 */
+	0x8C, 0x57, 0x8C, 0x58, 0xC8, 0xE6, 0x8C, 0x59, /* 0x78-0x7B */
+	0x8C, 0x5A, 0xC4, 0xF5, 0x8C, 0x5B, 0x8C, 0x5C, /* 0x7C-0x7F */
+	
+	0xE5, 0xB2, 0xC4, 0xFE, 0x8C, 0x5D, 0xCB, 0xFC, /* 0x80-0x83 */
+	0xE5, 0xB3, 0xD5, 0xAC, 0x8C, 0x5E, 0xD3, 0xEE, /* 0x84-0x87 */
+	0xCA, 0xD8, 0xB0, 0xB2, 0x8C, 0x5F, 0xCB, 0xCE, /* 0x88-0x8B */
+	0xCD, 0xEA, 0x8C, 0x60, 0x8C, 0x61, 0xBA, 0xEA, /* 0x8C-0x8F */
+	0x8C, 0x62, 0x8C, 0x63, 0x8C, 0x64, 0xE5, 0xB5, /* 0x90-0x93 */
+	0x8C, 0x65, 0xE5, 0xB4, 0x8C, 0x66, 0xD7, 0xDA, /* 0x94-0x97 */
+	0xB9, 0xD9, 0xD6, 0xE6, 0xB6, 0xA8, 0xCD, 0xF0, /* 0x98-0x9B */
+	0xD2, 0xCB, 0xB1, 0xA6, 0xCA, 0xB5, 0x8C, 0x67, /* 0x9C-0x9F */
+	0xB3, 0xE8, 0xC9, 0xF3, 0xBF, 0xCD, 0xD0, 0xFB, /* 0xA0-0xA3 */
+	0xCA, 0xD2, 0xE5, 0xB6, 0xBB, 0xC2, 0x8C, 0x68, /* 0xA4-0xA7 */
+	0x8C, 0x69, 0x8C, 0x6A, 0xCF, 0xDC, 0xB9, 0xAC, /* 0xA8-0xAB */
+	0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, 0x8C, 0x6E, /* 0xAC-0xAF */
+	0xD4, 0xD7, 0x8C, 0x6F, 0x8C, 0x70, 0xBA, 0xA6, /* 0xB0-0xB3 */
+	0xD1, 0xE7, 0xCF, 0xFC, 0xBC, 0xD2, 0x8C, 0x71, /* 0xB4-0xB7 */
+	0xE5, 0xB7, 0xC8, 0xDD, 0x8C, 0x72, 0x8C, 0x73, /* 0xB8-0xBB */
+	0x8C, 0x74, 0xBF, 0xED, 0xB1, 0xF6, 0xCB, 0xDE, /* 0xBC-0xBF */
+	0x8C, 0x75, 0x8C, 0x76, 0xBC, 0xC5, 0x8C, 0x77, /* 0xC0-0xC3 */
+	0xBC, 0xC4, 0xD2, 0xFA, 0xC3, 0xDC, 0xBF, 0xDC, /* 0xC4-0xC7 */
+	0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x7B, /* 0xC8-0xCB */
+	0xB8, 0xBB, 0x8C, 0x7C, 0x8C, 0x7D, 0x8C, 0x7E, /* 0xCC-0xCF */
+	0xC3, 0xC2, 0x8C, 0x80, 0xBA, 0xAE, 0xD4, 0xA2, /* 0xD0-0xD3 */
+	0x8C, 0x81, 0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, /* 0xD4-0xD7 */
+	0x8C, 0x85, 0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, /* 0xD8-0xDB */
+	0x8C, 0x89, 0xC7, 0xDE, 0xC4, 0xAF, 0xB2, 0xEC, /* 0xDC-0xDF */
+	0x8C, 0x8A, 0xB9, 0xD1, 0x8C, 0x8B, 0x8C, 0x8C, /* 0xE0-0xE3 */
+	0xE5, 0xBB, 0xC1, 0xC8, 0x8C, 0x8D, 0x8C, 0x8E, /* 0xE4-0xE7 */
+	0xD5, 0xAF, 0x8C, 0x8F, 0x8C, 0x90, 0x8C, 0x91, /* 0xE8-0xEB */
+	0x8C, 0x92, 0x8C, 0x93, 0xE5, 0xBC, 0x8C, 0x94, /* 0xEC-0xEF */
+	0xE5, 0xBE, 0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, /* 0xF0-0xF3 */
+	0x8C, 0x98, 0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, /* 0xF4-0xF7 */
+	0xB4, 0xE7, 0xB6, 0xD4, 0xCB, 0xC2, 0xD1, 0xB0, /* 0xF8-0xFB */
+	0xB5, 0xBC, 0x8C, 0x9C, 0x8C, 0x9D, 0xCA, 0xD9, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5C[512] = {
+	0x8C, 0x9E, 0xB7, 0xE2, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x00-0x03 */
+	0xC9, 0xE4, 0x8C, 0xA1, 0xBD, 0xAB, 0x8C, 0xA2, /* 0x04-0x07 */
+	0x8C, 0xA3, 0xCE, 0xBE, 0xD7, 0xF0, 0x8C, 0xA4, /* 0x08-0x0B */
+	0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0xD0, 0xA1, /* 0x0C-0x0F */
+	0x8C, 0xA8, 0xC9, 0xD9, 0x8C, 0xA9, 0x8C, 0xAA, /* 0x10-0x13 */
+	0xB6, 0xFB, 0xE6, 0xD8, 0xBC, 0xE2, 0x8C, 0xAB, /* 0x14-0x17 */
+	0xB3, 0xBE, 0x8C, 0xAC, 0xC9, 0xD0, 0x8C, 0xAD, /* 0x18-0x1B */
+	0xE6, 0xD9, 0xB3, 0xA2, 0x8C, 0xAE, 0x8C, 0xAF, /* 0x1C-0x1F */
+	0x8C, 0xB0, 0x8C, 0xB1, 0xDE, 0xCC, 0x8C, 0xB2, /* 0x20-0x23 */
+	0xD3, 0xC8, 0xDE, 0xCD, 0x8C, 0xB3, 0xD2, 0xA2, /* 0x24-0x27 */
+	0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, 0x8C, 0xB7, /* 0x28-0x2B */
+	0xDE, 0xCE, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x2C-0x2F */
+	0x8C, 0xBB, 0xBE, 0xCD, 0x8C, 0xBC, 0x8C, 0xBD, /* 0x30-0x33 */
+	0xDE, 0xCF, 0x8C, 0xBE, 0x8C, 0xBF, 0x8C, 0xC0, /* 0x34-0x37 */
+	0xCA, 0xAC, 0xD2, 0xFC, 0xB3, 0xDF, 0xE5, 0xEA, /* 0x38-0x3B */
+	0xC4, 0xE1, 0xBE, 0xA1, 0xCE, 0xB2, 0xC4, 0xF2, /* 0x3C-0x3F */
+	0xBE, 0xD6, 0xC6, 0xA8, 0xB2, 0xE3, 0x8C, 0xC1, /* 0x40-0x43 */
+	0x8C, 0xC2, 0xBE, 0xD3, 0x8C, 0xC3, 0x8C, 0xC4, /* 0x44-0x47 */
+	0xC7, 0xFC, 0xCC, 0xEB, 0xBD, 0xEC, 0xCE, 0xDD, /* 0x48-0x4B */
+	0x8C, 0xC5, 0x8C, 0xC6, 0xCA, 0xBA, 0xC6, 0xC1, /* 0x4C-0x4F */
+	0xE5, 0xEC, 0xD0, 0xBC, 0x8C, 0xC7, 0x8C, 0xC8, /* 0x50-0x53 */
+	0x8C, 0xC9, 0xD5, 0xB9, 0x8C, 0xCA, 0x8C, 0xCB, /* 0x54-0x57 */
+	0x8C, 0xCC, 0xE5, 0xED, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x58-0x5B */
+	0x8C, 0xCF, 0x8C, 0xD0, 0xCA, 0xF4, 0x8C, 0xD1, /* 0x5C-0x5F */
+	0xCD, 0xC0, 0xC2, 0xC5, 0x8C, 0xD2, 0xE5, 0xEF, /* 0x60-0x63 */
+	0x8C, 0xD3, 0xC2, 0xC4, 0xE5, 0xF0, 0x8C, 0xD4, /* 0x64-0x67 */
+	0x8C, 0xD5, 0x8C, 0xD6, 0x8C, 0xD7, 0x8C, 0xD8, /* 0x68-0x6B */
+	0x8C, 0xD9, 0x8C, 0xDA, 0xE5, 0xF8, 0xCD, 0xCD, /* 0x6C-0x6F */
+	0x8C, 0xDB, 0xC9, 0xBD, 0x8C, 0xDC, 0x8C, 0xDD, /* 0x70-0x73 */
+	0x8C, 0xDE, 0x8C, 0xDF, 0x8C, 0xE0, 0x8C, 0xE1, /* 0x74-0x77 */
+	0x8C, 0xE2, 0xD2, 0xD9, 0xE1, 0xA8, 0x8C, 0xE3, /* 0x78-0x7B */
+	0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, 0xD3, 0xEC, /* 0x7C-0x7F */
+	
+	0x8C, 0xE7, 0xCB, 0xEA, 0xC6, 0xF1, 0x8C, 0xE8, /* 0x80-0x83 */
+	0x8C, 0xE9, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0x84-0x87 */
+	0xE1, 0xAC, 0x8C, 0xED, 0x8C, 0xEE, 0x8C, 0xEF, /* 0x88-0x8B */
+	0xE1, 0xA7, 0xE1, 0xA9, 0x8C, 0xF0, 0x8C, 0xF1, /* 0x8C-0x8F */
+	0xE1, 0xAA, 0xE1, 0xAF, 0x8C, 0xF2, 0x8C, 0xF3, /* 0x90-0x93 */
+	0xB2, 0xED, 0x8C, 0xF4, 0xE1, 0xAB, 0xB8, 0xDA, /* 0x94-0x97 */
+	0xE1, 0xAD, 0xE1, 0xAE, 0xE1, 0xB0, 0xB5, 0xBA, /* 0x98-0x9B */
+	0xE1, 0xB1, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0x9C-0x9F */
+	0x8C, 0xF8, 0x8C, 0xF9, 0xE1, 0xB3, 0xE1, 0xB8, /* 0xA0-0xA3 */
+	0x8C, 0xFA, 0x8C, 0xFB, 0x8C, 0xFC, 0x8C, 0xFD, /* 0xA4-0xA7 */
+	0x8C, 0xFE, 0xD1, 0xD2, 0x8D, 0x40, 0xE1, 0xB6, /* 0xA8-0xAB */
+	0xE1, 0xB5, 0xC1, 0xEB, 0x8D, 0x41, 0x8D, 0x42, /* 0xAC-0xAF */
+	0x8D, 0x43, 0xE1, 0xB7, 0x8D, 0x44, 0xD4, 0xC0, /* 0xB0-0xB3 */
+	0x8D, 0x45, 0xE1, 0xB2, 0x8D, 0x46, 0xE1, 0xBA, /* 0xB4-0xB7 */
+	0xB0, 0xB6, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xB8-0xBB */
+	0x8D, 0x4A, 0xE1, 0xB4, 0x8D, 0x4B, 0xBF, 0xF9, /* 0xBC-0xBF */
+	0x8D, 0x4C, 0xE1, 0xB9, 0x8D, 0x4D, 0x8D, 0x4E, /* 0xC0-0xC3 */
+	0xE1, 0xBB, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xC4-0xC7 */
+	0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, 0xE1, 0xBE, /* 0xC8-0xCB */
+	0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xCC-0xCF */
+	0x8D, 0x59, 0x8D, 0x5A, 0xE1, 0xBC, 0x8D, 0x5B, /* 0xD0-0xD3 */
+	0x8D, 0x5C, 0x8D, 0x5D, 0x8D, 0x5E, 0x8D, 0x5F, /* 0xD4-0xD7 */
+	0x8D, 0x60, 0xD6, 0xC5, 0x8D, 0x61, 0x8D, 0x62, /* 0xD8-0xDB */
+	0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xDC-0xDF */
+	0x8D, 0x67, 0xCF, 0xBF, 0x8D, 0x68, 0x8D, 0x69, /* 0xE0-0xE3 */
+	0xE1, 0xBD, 0xE1, 0xBF, 0xC2, 0xCD, 0x8D, 0x6A, /* 0xE4-0xE7 */
+	0xB6, 0xEB, 0x8D, 0x6B, 0xD3, 0xF8, 0x8D, 0x6C, /* 0xE8-0xEB */
+	0x8D, 0x6D, 0xC7, 0xCD, 0x8D, 0x6E, 0x8D, 0x6F, /* 0xEC-0xEF */
+	0xB7, 0xE5, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xF0-0xF3 */
+	0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, 0x8D, 0x76, /* 0xF4-0xF7 */
+	0x8D, 0x77, 0x8D, 0x78, 0x8D, 0x79, 0xBE, 0xFE, /* 0xF8-0xFB */
+	0x8D, 0x7A, 0x8D, 0x7B, 0x8D, 0x7C, 0x8D, 0x7D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5D[512] = {
+	0x8D, 0x7E, 0x8D, 0x80, 0xE1, 0xC0, 0xE1, 0xC1, /* 0x00-0x03 */
+	0x8D, 0x81, 0x8D, 0x82, 0xE1, 0xC7, 0xB3, 0xE7, /* 0x04-0x07 */
+	0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, 0x8D, 0x86, /* 0x08-0x0B */
+	0x8D, 0x87, 0x8D, 0x88, 0xC6, 0xE9, 0x8D, 0x89, /* 0x0C-0x0F */
+	0x8D, 0x8A, 0x8D, 0x8B, 0x8D, 0x8C, 0x8D, 0x8D, /* 0x10-0x13 */
+	0xB4, 0xDE, 0x8D, 0x8E, 0xD1, 0xC2, 0x8D, 0x8F, /* 0x14-0x17 */
+	0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, 0xE1, 0xC8, /* 0x18-0x1B */
+	0x8D, 0x93, 0x8D, 0x94, 0xE1, 0xC6, 0x8D, 0x95, /* 0x1C-0x1F */
+	0x8D, 0x96, 0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, /* 0x20-0x23 */
+	0xE1, 0xC5, 0x8D, 0x9A, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x24-0x27 */
+	0x8D, 0x9B, 0xB1, 0xC0, 0x8D, 0x9C, 0x8D, 0x9D, /* 0x28-0x2B */
+	0x8D, 0x9E, 0xD5, 0xB8, 0xE1, 0xC4, 0x8D, 0x9F, /* 0x2C-0x2F */
+	0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, 0x8D, 0xA3, /* 0x30-0x33 */
+	0xE1, 0xCB, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x34-0x37 */
+	0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x38-0x3B */
+	0x8D, 0xAB, 0xE1, 0xCC, 0xE1, 0xCA, 0x8D, 0xAC, /* 0x3C-0x3F */
+	0x8D, 0xAD, 0x8D, 0xAE, 0x8D, 0xAF, 0x8D, 0xB0, /* 0x40-0x43 */
+	0x8D, 0xB1, 0x8D, 0xB2, 0x8D, 0xB3, 0xEF, 0xFA, /* 0x44-0x47 */
+	0x8D, 0xB4, 0x8D, 0xB5, 0xE1, 0xD3, 0xE1, 0xD2, /* 0x48-0x4B */
+	0xC7, 0xB6, 0x8D, 0xB6, 0x8D, 0xB7, 0x8D, 0xB8, /* 0x4C-0x4F */
+	0x8D, 0xB9, 0x8D, 0xBA, 0x8D, 0xBB, 0x8D, 0xBC, /* 0x50-0x53 */
+	0x8D, 0xBD, 0x8D, 0xBE, 0x8D, 0xBF, 0x8D, 0xC0, /* 0x54-0x57 */
+	0xE1, 0xC9, 0x8D, 0xC1, 0x8D, 0xC2, 0xE1, 0xCE, /* 0x58-0x5B */
+	0x8D, 0xC3, 0xE1, 0xD0, 0x8D, 0xC4, 0x8D, 0xC5, /* 0x5C-0x5F */
+	0x8D, 0xC6, 0x8D, 0xC7, 0x8D, 0xC8, 0x8D, 0xC9, /* 0x60-0x63 */
+	0x8D, 0xCA, 0x8D, 0xCB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x64-0x67 */
+	0x8D, 0xCE, 0xE1, 0xD4, 0x8D, 0xCF, 0xE1, 0xD1, /* 0x68-0x6B */
+	0xE1, 0xCD, 0x8D, 0xD0, 0x8D, 0xD1, 0xE1, 0xCF, /* 0x6C-0x6F */
+	0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x70-0x73 */
+	0xE1, 0xD5, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x74-0x77 */
+	0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, 0x8D, 0xDC, /* 0x78-0x7B */
+	0x8D, 0xDD, 0x8D, 0xDE, 0x8D, 0xDF, 0x8D, 0xE0, /* 0x7C-0x7F */
+	
+	0x8D, 0xE1, 0x8D, 0xE2, 0xE1, 0xD6, 0x8D, 0xE3, /* 0x80-0x83 */
+	0x8D, 0xE4, 0x8D, 0xE5, 0x8D, 0xE6, 0x8D, 0xE7, /* 0x84-0x87 */
+	0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, 0x8D, 0xEB, /* 0x88-0x8B */
+	0x8D, 0xEC, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x8C-0x8F */
+	0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, 0x8D, 0xF3, /* 0x90-0x93 */
+	0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, 0x8D, 0xF7, /* 0x94-0x97 */
+	0x8D, 0xF8, 0xE1, 0xD7, 0x8D, 0xF9, 0x8D, 0xFA, /* 0x98-0x9B */
+	0x8D, 0xFB, 0xE1, 0xD8, 0x8D, 0xFC, 0x8D, 0xFD, /* 0x9C-0x9F */
+	0x8D, 0xFE, 0x8E, 0x40, 0x8E, 0x41, 0x8E, 0x42, /* 0xA0-0xA3 */
+	0x8E, 0x43, 0x8E, 0x44, 0x8E, 0x45, 0x8E, 0x46, /* 0xA4-0xA7 */
+	0x8E, 0x47, 0x8E, 0x48, 0x8E, 0x49, 0x8E, 0x4A, /* 0xA8-0xAB */
+	0x8E, 0x4B, 0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, /* 0xAC-0xAF */
+	0x8E, 0x4F, 0x8E, 0x50, 0x8E, 0x51, 0x8E, 0x52, /* 0xB0-0xB3 */
+	0x8E, 0x53, 0x8E, 0x54, 0x8E, 0x55, 0xE1, 0xDA, /* 0xB4-0xB7 */
+	0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, 0x8E, 0x59, /* 0xB8-0xBB */
+	0x8E, 0x5A, 0x8E, 0x5B, 0x8E, 0x5C, 0x8E, 0x5D, /* 0xBC-0xBF */
+	0x8E, 0x5E, 0x8E, 0x5F, 0x8E, 0x60, 0x8E, 0x61, /* 0xC0-0xC3 */
+	0x8E, 0x62, 0xE1, 0xDB, 0x8E, 0x63, 0x8E, 0x64, /* 0xC4-0xC7 */
+	0x8E, 0x65, 0x8E, 0x66, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */
+	0x8E, 0x69, 0xCE, 0xA1, 0x8E, 0x6A, 0x8E, 0x6B, /* 0xCC-0xCF */
+	0x8E, 0x6C, 0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, /* 0xD0-0xD3 */
+	0x8E, 0x70, 0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, /* 0xD4-0xD7 */
+	0x8E, 0x74, 0x8E, 0x75, 0x8E, 0x76, 0xE7, 0xDD, /* 0xD8-0xDB */
+	0x8E, 0x77, 0xB4, 0xA8, 0xD6, 0xDD, 0x8E, 0x78, /* 0xDC-0xDF */
+	0x8E, 0x79, 0xD1, 0xB2, 0xB3, 0xB2, 0x8E, 0x7A, /* 0xE0-0xE3 */
+	0x8E, 0x7B, 0xB9, 0xA4, 0xD7, 0xF3, 0xC7, 0xC9, /* 0xE4-0xE7 */
+	0xBE, 0xDE, 0xB9, 0xAE, 0x8E, 0x7C, 0xCE, 0xD7, /* 0xE8-0xEB */
+	0x8E, 0x7D, 0x8E, 0x7E, 0xB2, 0xEE, 0xDB, 0xCF, /* 0xEC-0xEF */
+	0x8E, 0x80, 0xBC, 0xBA, 0xD2, 0xD1, 0xCB, 0xC8, /* 0xF0-0xF3 */
+	0xB0, 0xCD, 0x8E, 0x81, 0x8E, 0x82, 0xCF, 0xEF, /* 0xF4-0xF7 */
+	0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xF8-0xFB */
+	0x8E, 0x87, 0xD9, 0xE3, 0xBD, 0xED, 0x8E, 0x88, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5E[512] = {
+	0x8E, 0x89, 0xB1, 0xD2, 0xCA, 0xD0, 0xB2, 0xBC, /* 0x00-0x03 */
+	0x8E, 0x8A, 0xCB, 0xA7, 0xB7, 0xAB, 0x8E, 0x8B, /* 0x04-0x07 */
+	0xCA, 0xA6, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0x08-0x0B */
+	0xCF, 0xA3, 0x8E, 0x8F, 0x8E, 0x90, 0xE0, 0xF8, /* 0x0C-0x0F */
+	0xD5, 0xCA, 0xE0, 0xFB, 0x8E, 0x91, 0x8E, 0x92, /* 0x10-0x13 */
+	0xE0, 0xFA, 0xC5, 0xC1, 0xCC, 0xFB, 0x8E, 0x93, /* 0x14-0x17 */
+	0xC1, 0xB1, 0xE0, 0xF9, 0xD6, 0xE3, 0xB2, 0xAF, /* 0x18-0x1B */
+	0xD6, 0xC4, 0xB5, 0xDB, 0x8E, 0x94, 0x8E, 0x95, /* 0x1C-0x1F */
+	0x8E, 0x96, 0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, /* 0x20-0x23 */
+	0x8E, 0x9A, 0x8E, 0x9B, 0xB4, 0xF8, 0xD6, 0xA1, /* 0x24-0x27 */
+	0x8E, 0x9C, 0x8E, 0x9D, 0x8E, 0x9E, 0x8E, 0x9F, /* 0x28-0x2B */
+	0x8E, 0xA0, 0xCF, 0xAF, 0xB0, 0xEF, 0x8E, 0xA1, /* 0x2C-0x2F */
+	0x8E, 0xA2, 0xE0, 0xFC, 0x8E, 0xA3, 0x8E, 0xA4, /* 0x30-0x33 */
+	0x8E, 0xA5, 0x8E, 0xA6, 0x8E, 0xA7, 0xE1, 0xA1, /* 0x34-0x37 */
+	0xB3, 0xA3, 0x8E, 0xA8, 0x8E, 0xA9, 0xE0, 0xFD, /* 0x38-0x3B */
+	0xE0, 0xFE, 0xC3, 0xB1, 0x8E, 0xAA, 0x8E, 0xAB, /* 0x3C-0x3F */
+	0x8E, 0xAC, 0x8E, 0xAD, 0xC3, 0xDD, 0x8E, 0xAE, /* 0x40-0x43 */
+	0xE1, 0xA2, 0xB7, 0xF9, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x44-0x47 */
+	0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x48-0x4B */
+	0xBB, 0xCF, 0x8E, 0xB5, 0x8E, 0xB6, 0x8E, 0xB7, /* 0x4C-0x4F */
+	0x8E, 0xB8, 0x8E, 0xB9, 0x8E, 0xBA, 0x8E, 0xBB, /* 0x50-0x53 */
+	0xE1, 0xA3, 0xC4, 0xBB, 0x8E, 0xBC, 0x8E, 0xBD, /* 0x54-0x57 */
+	0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, 0xE1, 0xA4, /* 0x58-0x5B */
+	0x8E, 0xC1, 0x8E, 0xC2, 0xE1, 0xA5, 0x8E, 0xC3, /* 0x5C-0x5F */
+	0x8E, 0xC4, 0xE1, 0xA6, 0xB4, 0xB1, 0x8E, 0xC5, /* 0x60-0x63 */
+	0x8E, 0xC6, 0x8E, 0xC7, 0x8E, 0xC8, 0x8E, 0xC9, /* 0x64-0x67 */
+	0x8E, 0xCA, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x68-0x6B */
+	0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x6C-0x6F */
+	0x8E, 0xD2, 0x8E, 0xD3, 0xB8, 0xC9, 0xC6, 0xBD, /* 0x70-0x73 */
+	0xC4, 0xEA, 0x8E, 0xD4, 0xB2, 0xA2, 0x8E, 0xD5, /* 0x74-0x77 */
+	0xD0, 0xD2, 0x8E, 0xD6, 0xE7, 0xDB, 0xBB, 0xC3, /* 0x78-0x7B */
+	0xD3, 0xD7, 0xD3, 0xC4, 0x8E, 0xD7, 0xB9, 0xE3, /* 0x7C-0x7F */
+	
+	0xE2, 0xCF, 0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, /* 0x80-0x83 */
+	0xD7, 0xAF, 0x8E, 0xDB, 0xC7, 0xEC, 0xB1, 0xD3, /* 0x84-0x87 */
+	0x8E, 0xDC, 0x8E, 0xDD, 0xB4, 0xB2, 0xE2, 0xD1, /* 0x88-0x8B */
+	0x8E, 0xDE, 0x8E, 0xDF, 0x8E, 0xE0, 0xD0, 0xF2, /* 0x8C-0x8F */
+	0xC2, 0xAE, 0xE2, 0xD0, 0x8E, 0xE1, 0xBF, 0xE2, /* 0x90-0x93 */
+	0xD3, 0xA6, 0xB5, 0xD7, 0xE2, 0xD2, 0xB5, 0xEA, /* 0x94-0x97 */
+	0x8E, 0xE2, 0xC3, 0xED, 0xB8, 0xFD, 0x8E, 0xE3, /* 0x98-0x9B */
+	0xB8, 0xAE, 0x8E, 0xE4, 0xC5, 0xD3, 0xB7, 0xCF, /* 0x9C-0x9F */
+	0xE2, 0xD4, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0xA0-0xA3 */
+	0x8E, 0xE8, 0xE2, 0xD3, 0xB6, 0xC8, 0xD7, 0xF9, /* 0xA4-0xA7 */
+	0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, 0x8E, 0xEC, /* 0xA8-0xAB */
+	0x8E, 0xED, 0xCD, 0xA5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0xAC-0xAF */
+	0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0xE2, 0xD8, /* 0xB0-0xB3 */
+	0x8E, 0xF3, 0xE2, 0xD6, 0xCA, 0xFC, 0xBF, 0xB5, /* 0xB4-0xB7 */
+	0xD3, 0xB9, 0xE2, 0xD5, 0x8E, 0xF4, 0x8E, 0xF5, /* 0xB8-0xBB */
+	0x8E, 0xF6, 0x8E, 0xF7, 0xE2, 0xD7, 0x8E, 0xF8, /* 0xBC-0xBF */
+	0x8E, 0xF9, 0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, /* 0xC0-0xC3 */
+	0x8E, 0xFD, 0x8E, 0xFE, 0x8F, 0x40, 0x8F, 0x41, /* 0xC4-0xC7 */
+	0x8F, 0x42, 0xC1, 0xAE, 0xC0, 0xC8, 0x8F, 0x43, /* 0xC8-0xCB */
+	0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0xCC-0xCF */
+	0x8F, 0x48, 0xE2, 0xDB, 0xE2, 0xDA, 0xC0, 0xAA, /* 0xD0-0xD3 */
+	0x8F, 0x49, 0x8F, 0x4A, 0xC1, 0xCE, 0x8F, 0x4B, /* 0xD4-0xD7 */
+	0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, 0xE2, 0xDC, /* 0xD8-0xDB */
+	0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0xDC-0xDF */
+	0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0xE0-0xE3 */
+	0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0xE4-0xE7 */
+	0xE2, 0xDD, 0x8F, 0x5B, 0xE2, 0xDE, 0x8F, 0x5C, /* 0xE8-0xEB */
+	0x8F, 0x5D, 0x8F, 0x5E, 0x8F, 0x5F, 0x8F, 0x60, /* 0xEC-0xEF */
+	0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xF0-0xF3 */
+	0xDB, 0xC8, 0x8F, 0x65, 0xD1, 0xD3, 0xCD, 0xA2, /* 0xF4-0xF7 */
+	0x8F, 0x66, 0x8F, 0x67, 0xBD, 0xA8, 0x8F, 0x68, /* 0xF8-0xFB */
+	0x8F, 0x69, 0x8F, 0x6A, 0xDE, 0xC3, 0xD8, 0xA5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5F[512] = {
+	0xBF, 0xAA, 0xDB, 0xCD, 0xD2, 0xEC, 0xC6, 0xFA, /* 0x00-0x03 */
+	0xC5, 0xAA, 0x8F, 0x6B, 0x8F, 0x6C, 0x8F, 0x6D, /* 0x04-0x07 */
+	0xDE, 0xC4, 0x8F, 0x6E, 0xB1, 0xD7, 0xDF, 0xAE, /* 0x08-0x0B */
+	0x8F, 0x6F, 0x8F, 0x70, 0x8F, 0x71, 0xCA, 0xBD, /* 0x0C-0x0F */
+	0x8F, 0x72, 0xDF, 0xB1, 0x8F, 0x73, 0xB9, 0xAD, /* 0x10-0x13 */
+	0x8F, 0x74, 0xD2, 0xFD, 0x8F, 0x75, 0xB8, 0xA5, /* 0x14-0x17 */
+	0xBA, 0xEB, 0x8F, 0x76, 0x8F, 0x77, 0xB3, 0xDA, /* 0x18-0x1B */
+	0x8F, 0x78, 0x8F, 0x79, 0x8F, 0x7A, 0xB5, 0xDC, /* 0x1C-0x1F */
+	0xD5, 0xC5, 0x8F, 0x7B, 0x8F, 0x7C, 0x8F, 0x7D, /* 0x20-0x23 */
+	0x8F, 0x7E, 0xC3, 0xD6, 0xCF, 0xD2, 0xBB, 0xA1, /* 0x24-0x27 */
+	0x8F, 0x80, 0xE5, 0xF3, 0xE5, 0xF2, 0x8F, 0x81, /* 0x28-0x2B */
+	0x8F, 0x82, 0xE5, 0xF4, 0x8F, 0x83, 0xCD, 0xE4, /* 0x2C-0x2F */
+	0x8F, 0x84, 0xC8, 0xF5, 0x8F, 0x85, 0x8F, 0x86, /* 0x30-0x33 */
+	0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0x34-0x37 */
+	0x8F, 0x8B, 0xB5, 0xAF, 0xC7, 0xBF, 0x8F, 0x8C, /* 0x38-0x3B */
+	0xE5, 0xF6, 0x8F, 0x8D, 0x8F, 0x8E, 0x8F, 0x8F, /* 0x3C-0x3F */
+	0xEC, 0xB0, 0x8F, 0x90, 0x8F, 0x91, 0x8F, 0x92, /* 0x40-0x43 */
+	0x8F, 0x93, 0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, /* 0x44-0x47 */
+	0x8F, 0x97, 0x8F, 0x98, 0x8F, 0x99, 0x8F, 0x9A, /* 0x48-0x4B */
+	0x8F, 0x9B, 0x8F, 0x9C, 0x8F, 0x9D, 0x8F, 0x9E, /* 0x4C-0x4F */
+	0xE5, 0xE6, 0x8F, 0x9F, 0xB9, 0xE9, 0xB5, 0xB1, /* 0x50-0x53 */
+	0x8F, 0xA0, 0xC2, 0xBC, 0xE5, 0xE8, 0xE5, 0xE7, /* 0x54-0x57 */
+	0xE5, 0xE9, 0x8F, 0xA1, 0x8F, 0xA2, 0x8F, 0xA3, /* 0x58-0x5B */
+	0x8F, 0xA4, 0xD2, 0xCD, 0x8F, 0xA5, 0x8F, 0xA6, /* 0x5C-0x5F */
+	0x8F, 0xA7, 0xE1, 0xEA, 0xD0, 0xCE, 0x8F, 0xA8, /* 0x60-0x63 */
+	0xCD, 0xAE, 0x8F, 0xA9, 0xD1, 0xE5, 0x8F, 0xAA, /* 0x64-0x67 */
+	0x8F, 0xAB, 0xB2, 0xCA, 0xB1, 0xEB, 0x8F, 0xAC, /* 0x68-0x6B */
+	0xB1, 0xF2, 0xC5, 0xED, 0x8F, 0xAD, 0x8F, 0xAE, /* 0x6C-0x6F */
+	0xD5, 0xC3, 0xD3, 0xB0, 0x8F, 0xAF, 0xE1, 0xDC, /* 0x70-0x73 */
+	0x8F, 0xB0, 0x8F, 0xB1, 0x8F, 0xB2, 0xE1, 0xDD, /* 0x74-0x77 */
+	0x8F, 0xB3, 0xD2, 0xDB, 0x8F, 0xB4, 0xB3, 0xB9, /* 0x78-0x7B */
+	0xB1, 0xCB, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x7C-0x7F */
+	
+	0xCD, 0xF9, 0xD5, 0xF7, 0xE1, 0xDE, 0x8F, 0xB8, /* 0x80-0x83 */
+	0xBE, 0xB6, 0xB4, 0xFD, 0x8F, 0xB9, 0xE1, 0xDF, /* 0x84-0x87 */
+	0xBA, 0xDC, 0xE1, 0xE0, 0xBB, 0xB2, 0xC2, 0xC9, /* 0x88-0x8B */
+	0xE1, 0xE1, 0x8F, 0xBA, 0x8F, 0xBB, 0x8F, 0xBC, /* 0x8C-0x8F */
+	0xD0, 0xEC, 0x8F, 0xBD, 0xCD, 0xBD, 0x8F, 0xBE, /* 0x90-0x93 */
+	0x8F, 0xBF, 0xE1, 0xE2, 0x8F, 0xC0, 0xB5, 0xC3, /* 0x94-0x97 */
+	0xC5, 0xC7, 0xE1, 0xE3, 0x8F, 0xC1, 0x8F, 0xC2, /* 0x98-0x9B */
+	0xE1, 0xE4, 0x8F, 0xC3, 0x8F, 0xC4, 0x8F, 0xC5, /* 0x9C-0x9F */
+	0x8F, 0xC6, 0xD3, 0xF9, 0x8F, 0xC7, 0x8F, 0xC8, /* 0xA0-0xA3 */
+	0x8F, 0xC9, 0x8F, 0xCA, 0x8F, 0xCB, 0x8F, 0xCC, /* 0xA4-0xA7 */
+	0xE1, 0xE5, 0x8F, 0xCD, 0xD1, 0xAD, 0x8F, 0xCE, /* 0xA8-0xAB */
+	0x8F, 0xCF, 0xE1, 0xE6, 0xCE, 0xA2, 0x8F, 0xD0, /* 0xAC-0xAF */
+	0x8F, 0xD1, 0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, /* 0xB0-0xB3 */
+	0x8F, 0xD5, 0xE1, 0xE7, 0x8F, 0xD6, 0xB5, 0xC2, /* 0xB4-0xB7 */
+	0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, 0x8F, 0xDA, /* 0xB8-0xBB */
+	0xE1, 0xE8, 0xBB, 0xD5, 0x8F, 0xDB, 0x8F, 0xDC, /* 0xBC-0xBF */
+	0x8F, 0xDD, 0x8F, 0xDE, 0x8F, 0xDF, 0xD0, 0xC4, /* 0xC0-0xC3 */
+	0xE2, 0xE0, 0xB1, 0xD8, 0xD2, 0xE4, 0x8F, 0xE0, /* 0xC4-0xC7 */
+	0x8F, 0xE1, 0xE2, 0xE1, 0x8F, 0xE2, 0x8F, 0xE3, /* 0xC8-0xCB */
+	0xBC, 0xC9, 0xC8, 0xCC, 0x8F, 0xE4, 0xE2, 0xE3, /* 0xCC-0xCF */
+	0xEC, 0xFE, 0xEC, 0xFD, 0xDF, 0xAF, 0x8F, 0xE5, /* 0xD0-0xD3 */
+	0x8F, 0xE6, 0x8F, 0xE7, 0xE2, 0xE2, 0xD6, 0xBE, /* 0xD4-0xD7 */
+	0xCD, 0xFC, 0xC3, 0xA6, 0x8F, 0xE8, 0x8F, 0xE9, /* 0xD8-0xDB */
+	0x8F, 0xEA, 0xE3, 0xC3, 0x8F, 0xEB, 0x8F, 0xEC, /* 0xDC-0xDF */
+	0xD6, 0xD2, 0xE2, 0xE7, 0x8F, 0xED, 0x8F, 0xEE, /* 0xE0-0xE3 */
+	0xE2, 0xE8, 0x8F, 0xEF, 0x8F, 0xF0, 0xD3, 0xC7, /* 0xE4-0xE7 */
+	0x8F, 0xF1, 0x8F, 0xF2, 0xE2, 0xEC, 0xBF, 0xEC, /* 0xE8-0xEB */
+	0x8F, 0xF3, 0xE2, 0xED, 0xE2, 0xE5, 0x8F, 0xF4, /* 0xEC-0xEF */
+	0x8F, 0xF5, 0xB3, 0xC0, 0x8F, 0xF6, 0x8F, 0xF7, /* 0xF0-0xF3 */
+	0x8F, 0xF8, 0xC4, 0xEE, 0x8F, 0xF9, 0x8F, 0xFA, /* 0xF4-0xF7 */
+	0xE2, 0xEE, 0x8F, 0xFB, 0x8F, 0xFC, 0xD0, 0xC3, /* 0xF8-0xFB */
+	0x8F, 0xFD, 0xBA, 0xF6, 0xE2, 0xE9, 0xB7, 0xDE, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_60[512] = {
+	0xBB, 0xB3, 0xCC, 0xAC, 0xCB, 0xCB, 0xE2, 0xE4, /* 0x00-0x03 */
+	0xE2, 0xE6, 0xE2, 0xEA, 0xE2, 0xEB, 0x8F, 0xFE, /* 0x04-0x07 */
+	0x90, 0x40, 0x90, 0x41, 0xE2, 0xF7, 0x90, 0x42, /* 0x08-0x0B */
+	0x90, 0x43, 0xE2, 0xF4, 0xD4, 0xF5, 0xE2, 0xF3, /* 0x0C-0x0F */
+	0x90, 0x44, 0x90, 0x45, 0xC5, 0xAD, 0x90, 0x46, /* 0x10-0x13 */
+	0xD5, 0xFA, 0xC5, 0xC2, 0xB2, 0xC0, 0x90, 0x47, /* 0x14-0x17 */
+	0x90, 0x48, 0xE2, 0xEF, 0x90, 0x49, 0xE2, 0xF2, /* 0x18-0x1B */
+	0xC1, 0xAF, 0xCB, 0xBC, 0x90, 0x4A, 0x90, 0x4B, /* 0x1C-0x1F */
+	0xB5, 0xA1, 0xE2, 0xF9, 0x90, 0x4C, 0x90, 0x4D, /* 0x20-0x23 */
+	0x90, 0x4E, 0xBC, 0xB1, 0xE2, 0xF1, 0xD0, 0xD4, /* 0x24-0x27 */
+	0xD4, 0xB9, 0xE2, 0xF5, 0xB9, 0xD6, 0xE2, 0xF6, /* 0x28-0x2B */
+	0x90, 0x4F, 0x90, 0x50, 0x90, 0x51, 0xC7, 0xD3, /* 0x2C-0x2F */
+	0x90, 0x52, 0x90, 0x53, 0x90, 0x54, 0x90, 0x55, /* 0x30-0x33 */
+	0x90, 0x56, 0xE2, 0xF0, 0x90, 0x57, 0x90, 0x58, /* 0x34-0x37 */
+	0x90, 0x59, 0x90, 0x5A, 0x90, 0x5B, 0xD7, 0xDC, /* 0x38-0x3B */
+	0xED, 0xA1, 0x90, 0x5C, 0x90, 0x5D, 0xE2, 0xF8, /* 0x3C-0x3F */
+	0x90, 0x5E, 0xED, 0xA5, 0xE2, 0xFE, 0xCA, 0xD1, /* 0x40-0x43 */
+	0x90, 0x5F, 0x90, 0x60, 0x90, 0x61, 0x90, 0x62, /* 0x44-0x47 */
+	0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0xC1, 0xB5, /* 0x48-0x4B */
+	0x90, 0x66, 0xBB, 0xD0, 0x90, 0x67, 0x90, 0x68, /* 0x4C-0x4F */
+	0xBF, 0xD6, 0x90, 0x69, 0xBA, 0xE3, 0x90, 0x6A, /* 0x50-0x53 */
+	0x90, 0x6B, 0xCB, 0xA1, 0x90, 0x6C, 0x90, 0x6D, /* 0x54-0x57 */
+	0x90, 0x6E, 0xED, 0xA6, 0xED, 0xA3, 0x90, 0x6F, /* 0x58-0x5B */
+	0x90, 0x70, 0xED, 0xA2, 0x90, 0x71, 0x90, 0x72, /* 0x5C-0x5F */
+	0x90, 0x73, 0x90, 0x74, 0xBB, 0xD6, 0xED, 0xA7, /* 0x60-0x63 */
+	0xD0, 0xF4, 0x90, 0x75, 0x90, 0x76, 0xED, 0xA4, /* 0x64-0x67 */
+	0xBA, 0xDE, 0xB6, 0xF7, 0xE3, 0xA1, 0xB6, 0xB2, /* 0x68-0x6B */
+	0xCC, 0xF1, 0xB9, 0xA7, 0x90, 0x77, 0xCF, 0xA2, /* 0x6C-0x6F */
+	0xC7, 0xA1, 0x90, 0x78, 0x90, 0x79, 0xBF, 0xD2, /* 0x70-0x73 */
+	0x90, 0x7A, 0x90, 0x7B, 0xB6, 0xF1, 0x90, 0x7C, /* 0x74-0x77 */
+	0xE2, 0xFA, 0xE2, 0xFB, 0xE2, 0xFD, 0xE2, 0xFC, /* 0x78-0x7B */
+	0xC4, 0xD5, 0xE3, 0xA2, 0x90, 0x7D, 0xD3, 0xC1, /* 0x7C-0x7F */
+	
+	0x90, 0x7E, 0x90, 0x80, 0x90, 0x81, 0xE3, 0xA7, /* 0x80-0x83 */
+	0xC7, 0xC4, 0x90, 0x82, 0x90, 0x83, 0x90, 0x84, /* 0x84-0x87 */
+	0x90, 0x85, 0xCF, 0xA4, 0x90, 0x86, 0x90, 0x87, /* 0x88-0x8B */
+	0xE3, 0xA9, 0xBA, 0xB7, 0x90, 0x88, 0x90, 0x89, /* 0x8C-0x8F */
+	0x90, 0x8A, 0x90, 0x8B, 0xE3, 0xA8, 0x90, 0x8C, /* 0x90-0x93 */
+	0xBB, 0xDA, 0x90, 0x8D, 0xE3, 0xA3, 0x90, 0x8E, /* 0x94-0x97 */
+	0x90, 0x8F, 0x90, 0x90, 0xE3, 0xA4, 0xE3, 0xAA, /* 0x98-0x9B */
+	0x90, 0x91, 0xE3, 0xA6, 0x90, 0x92, 0xCE, 0xF2, /* 0x9C-0x9F */
+	0xD3, 0xC6, 0x90, 0x93, 0x90, 0x94, 0xBB, 0xBC, /* 0xA0-0xA3 */
+	0x90, 0x95, 0x90, 0x96, 0xD4, 0xC3, 0x90, 0x97, /* 0xA4-0xA7 */
+	0xC4, 0xFA, 0x90, 0x98, 0x90, 0x99, 0xED, 0xA8, /* 0xA8-0xAB */
+	0xD0, 0xFC, 0xE3, 0xA5, 0x90, 0x9A, 0xC3, 0xF5, /* 0xAC-0xAF */
+	0x90, 0x9B, 0xE3, 0xAD, 0xB1, 0xAF, 0x90, 0x9C, /* 0xB0-0xB3 */
+	0xE3, 0xB2, 0x90, 0x9D, 0x90, 0x9E, 0x90, 0x9F, /* 0xB4-0xB7 */
+	0xBC, 0xC2, 0x90, 0xA0, 0x90, 0xA1, 0xE3, 0xAC, /* 0xB8-0xBB */
+	0xB5, 0xBF, 0x90, 0xA2, 0x90, 0xA3, 0x90, 0xA4, /* 0xBC-0xBF */
+	0x90, 0xA5, 0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, /* 0xC0-0xC3 */
+	0x90, 0xA9, 0xC7, 0xE9, 0xE3, 0xB0, 0x90, 0xAA, /* 0xC4-0xC7 */
+	0x90, 0xAB, 0x90, 0xAC, 0xBE, 0xAA, 0xCD, 0xEF, /* 0xC8-0xCB */
+	0x90, 0xAD, 0x90, 0xAE, 0x90, 0xAF, 0x90, 0xB0, /* 0xCC-0xCF */
+	0x90, 0xB1, 0xBB, 0xF3, 0x90, 0xB2, 0x90, 0xB3, /* 0xD0-0xD3 */
+	0x90, 0xB4, 0xCC, 0xE8, 0x90, 0xB5, 0x90, 0xB6, /* 0xD4-0xD7 */
+	0xE3, 0xAF, 0x90, 0xB7, 0xE3, 0xB1, 0x90, 0xB8, /* 0xD8-0xDB */
+	0xCF, 0xA7, 0xE3, 0xAE, 0x90, 0xB9, 0xCE, 0xA9, /* 0xDC-0xDF */
+	0xBB, 0xDD, 0x90, 0xBA, 0x90, 0xBB, 0x90, 0xBC, /* 0xE0-0xE3 */
+	0x90, 0xBD, 0x90, 0xBE, 0xB5, 0xEB, 0xBE, 0xE5, /* 0xE4-0xE7 */
+	0xB2, 0xD2, 0xB3, 0xCD, 0x90, 0xBF, 0xB1, 0xB9, /* 0xE8-0xEB */
+	0xE3, 0xAB, 0xB2, 0xD1, 0xB5, 0xAC, 0xB9, 0xDF, /* 0xEC-0xEF */
+	0xB6, 0xE8, 0x90, 0xC0, 0x90, 0xC1, 0xCF, 0xEB, /* 0xF0-0xF3 */
+	0xE3, 0xB7, 0x90, 0xC2, 0xBB, 0xCC, 0x90, 0xC3, /* 0xF4-0xF7 */
+	0x90, 0xC4, 0xC8, 0xC7, 0xD0, 0xCA, 0x90, 0xC5, /* 0xF8-0xFB */
+	0x90, 0xC6, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_61[512] = {
+	0xE3, 0xB8, 0xB3, 0xEE, 0x90, 0xCA, 0x90, 0xCB, /* 0x00-0x03 */
+	0x90, 0xCC, 0x90, 0xCD, 0xED, 0xA9, 0x90, 0xCE, /* 0x04-0x07 */
+	0xD3, 0xFA, 0xD3, 0xE4, 0x90, 0xCF, 0x90, 0xD0, /* 0x08-0x0B */
+	0x90, 0xD1, 0xED, 0xAA, 0xE3, 0xB9, 0xD2, 0xE2, /* 0x0C-0x0F */
+	0x90, 0xD2, 0x90, 0xD3, 0x90, 0xD4, 0x90, 0xD5, /* 0x10-0x13 */
+	0x90, 0xD6, 0xE3, 0xB5, 0x90, 0xD7, 0x90, 0xD8, /* 0x14-0x17 */
+	0x90, 0xD9, 0x90, 0xDA, 0xD3, 0xDE, 0x90, 0xDB, /* 0x18-0x1B */
+	0x90, 0xDC, 0x90, 0xDD, 0x90, 0xDE, 0xB8, 0xD0, /* 0x1C-0x1F */
+	0xE3, 0xB3, 0x90, 0xDF, 0x90, 0xE0, 0xE3, 0xB6, /* 0x20-0x23 */
+	0xB7, 0xDF, 0x90, 0xE1, 0xE3, 0xB4, 0xC0, 0xA2, /* 0x24-0x27 */
+	0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, 0xE3, 0xBA, /* 0x28-0x2B */
+	0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x2C-0x2F */
+	0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x30-0x33 */
+	0x90, 0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x34-0x37 */
+	0x90, 0xF1, 0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x38-0x3B */
+	0x90, 0xF5, 0x90, 0xF6, 0x90, 0xF7, 0xD4, 0xB8, /* 0x3C-0x3F */
+	0x90, 0xF8, 0x90, 0xF9, 0x90, 0xFA, 0x90, 0xFB, /* 0x40-0x43 */
+	0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x40, /* 0x44-0x47 */
+	0xB4, 0xC8, 0x91, 0x41, 0xE3, 0xBB, 0x91, 0x42, /* 0x48-0x4B */
+	0xBB, 0xC5, 0x91, 0x43, 0xC9, 0xF7, 0x91, 0x44, /* 0x4C-0x4F */
+	0x91, 0x45, 0xC9, 0xE5, 0x91, 0x46, 0x91, 0x47, /* 0x50-0x53 */
+	0x91, 0x48, 0xC4, 0xBD, 0x91, 0x49, 0x91, 0x4A, /* 0x54-0x57 */
+	0x91, 0x4B, 0x91, 0x4C, 0x91, 0x4D, 0x91, 0x4E, /* 0x58-0x5B */
+	0x91, 0x4F, 0xED, 0xAB, 0x91, 0x50, 0x91, 0x51, /* 0x5C-0x5F */
+	0x91, 0x52, 0x91, 0x53, 0xC2, 0xFD, 0x91, 0x54, /* 0x60-0x63 */
+	0x91, 0x55, 0x91, 0x56, 0x91, 0x57, 0xBB, 0xDB, /* 0x64-0x67 */
+	0xBF, 0xAE, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x68-0x6B */
+	0x91, 0x5B, 0x91, 0x5C, 0x91, 0x5D, 0x91, 0x5E, /* 0x6C-0x6F */
+	0xCE, 0xBF, 0x91, 0x5F, 0x91, 0x60, 0x91, 0x61, /* 0x70-0x73 */
+	0x91, 0x62, 0xE3, 0xBC, 0x91, 0x63, 0xBF, 0xB6, /* 0x74-0x77 */
+	0x91, 0x64, 0x91, 0x65, 0x91, 0x66, 0x91, 0x67, /* 0x78-0x7B */
+	0x91, 0x68, 0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, /* 0x7C-0x7F */
+	
+	0x91, 0x6C, 0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, /* 0x80-0x83 */
+	0x91, 0x70, 0x91, 0x71, 0x91, 0x72, 0x91, 0x73, /* 0x84-0x87 */
+	0x91, 0x74, 0x91, 0x75, 0x91, 0x76, 0xB1, 0xEF, /* 0x88-0x8B */
+	0x91, 0x77, 0x91, 0x78, 0xD4, 0xF7, 0x91, 0x79, /* 0x8C-0x8F */
+	0x91, 0x7A, 0x91, 0x7B, 0x91, 0x7C, 0x91, 0x7D, /* 0x90-0x93 */
+	0xE3, 0xBE, 0x91, 0x7E, 0x91, 0x80, 0x91, 0x81, /* 0x94-0x97 */
+	0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x98-0x9B */
+	0x91, 0x86, 0xED, 0xAD, 0x91, 0x87, 0x91, 0x88, /* 0x9C-0x9F */
+	0x91, 0x89, 0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, /* 0xA0-0xA3 */
+	0x91, 0x8D, 0x91, 0x8E, 0x91, 0x8F, 0xE3, 0xBF, /* 0xA4-0xA7 */
+	0xBA, 0xA9, 0xED, 0xAC, 0x91, 0x90, 0x91, 0x91, /* 0xA8-0xAB */
+	0xE3, 0xBD, 0x91, 0x92, 0x91, 0x93, 0x91, 0x94, /* 0xAC-0xAF */
+	0x91, 0x95, 0x91, 0x96, 0x91, 0x97, 0x91, 0x98, /* 0xB0-0xB3 */
+	0x91, 0x99, 0x91, 0x9A, 0x91, 0x9B, 0xE3, 0xC0, /* 0xB4-0xB7 */
+	0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB8-0xBB */
+	0x91, 0xA0, 0x91, 0xA1, 0xBA, 0xB6, 0x91, 0xA2, /* 0xBC-0xBF */
+	0x91, 0xA3, 0x91, 0xA4, 0xB6, 0xAE, 0x91, 0xA5, /* 0xC0-0xC3 */
+	0x91, 0xA6, 0x91, 0xA7, 0x91, 0xA8, 0x91, 0xA9, /* 0xC4-0xC7 */
+	0xD0, 0xB8, 0x91, 0xAA, 0xB0, 0xC3, 0xED, 0xAE, /* 0xC8-0xCB */
+	0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, 0x91, 0xAE, /* 0xCC-0xCF */
+	0x91, 0xAF, 0xED, 0xAF, 0xC0, 0xC1, 0x91, 0xB0, /* 0xD0-0xD3 */
+	0xE3, 0xC1, 0x91, 0xB1, 0x91, 0xB2, 0x91, 0xB3, /* 0xD4-0xD7 */
+	0x91, 0xB4, 0x91, 0xB5, 0x91, 0xB6, 0x91, 0xB7, /* 0xD8-0xDB */
+	0x91, 0xB8, 0x91, 0xB9, 0x91, 0xBA, 0x91, 0xBB, /* 0xDC-0xDF */
+	0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xE0-0xE3 */
+	0x91, 0xC0, 0x91, 0xC1, 0xC5, 0xB3, 0x91, 0xC2, /* 0xE4-0xE7 */
+	0x91, 0xC3, 0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, /* 0xE8-0xEB */
+	0x91, 0xC7, 0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, /* 0xEC-0xEF */
+	0x91, 0xCB, 0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, /* 0xF0-0xF3 */
+	0x91, 0xCF, 0xE3, 0xC2, 0x91, 0xD0, 0x91, 0xD1, /* 0xF4-0xF7 */
+	0x91, 0xD2, 0x91, 0xD3, 0x91, 0xD4, 0x91, 0xD5, /* 0xF8-0xFB */
+	0x91, 0xD6, 0x91, 0xD7, 0x91, 0xD8, 0xDC, 0xB2, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_62[512] = {
+	0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, 0x91, 0xDC, /* 0x00-0x03 */
+	0x91, 0xDD, 0x91, 0xDE, 0xED, 0xB0, 0x91, 0xDF, /* 0x04-0x07 */
+	0xB8, 0xEA, 0x91, 0xE0, 0xCE, 0xEC, 0xEA, 0xA7, /* 0x08-0x0B */
+	0xD0, 0xE7, 0xCA, 0xF9, 0xC8, 0xD6, 0xCF, 0xB7, /* 0x0C-0x0F */
+	0xB3, 0xC9, 0xCE, 0xD2, 0xBD, 0xE4, 0x91, 0xE1, /* 0x10-0x13 */
+	0x91, 0xE2, 0xE3, 0xDE, 0xBB, 0xF2, 0xEA, 0xA8, /* 0x14-0x17 */
+	0xD5, 0xBD, 0x91, 0xE3, 0xC6, 0xDD, 0xEA, 0xA9, /* 0x18-0x1B */
+	0x91, 0xE4, 0x91, 0xE5, 0x91, 0xE6, 0xEA, 0xAA, /* 0x1C-0x1F */
+	0x91, 0xE7, 0xEA, 0xAC, 0xEA, 0xAB, 0x91, 0xE8, /* 0x20-0x23 */
+	0xEA, 0xAE, 0xEA, 0xAD, 0x91, 0xE9, 0x91, 0xEA, /* 0x24-0x27 */
+	0x91, 0xEB, 0x91, 0xEC, 0xBD, 0xD8, 0x91, 0xED, /* 0x28-0x2B */
+	0xEA, 0xAF, 0x91, 0xEE, 0xC2, 0xBE, 0x91, 0xEF, /* 0x2C-0x2F */
+	0x91, 0xF0, 0x91, 0xF1, 0x91, 0xF2, 0xB4, 0xC1, /* 0x30-0x33 */
+	0xB4, 0xF7, 0x91, 0xF3, 0x91, 0xF4, 0xBB, 0xA7, /* 0x34-0x37 */
+	0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, 0x91, 0xF8, /* 0x38-0x3B */
+	0x91, 0xF9, 0xEC, 0xE6, 0xEC, 0xE5, 0xB7, 0xBF, /* 0x3C-0x3F */
+	0xCB, 0xF9, 0xB1, 0xE2, 0x91, 0xFA, 0xEC, 0xE7, /* 0x40-0x43 */
+	0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0xC9, 0xC8, /* 0x44-0x47 */
+	0xEC, 0xE8, 0xEC, 0xE9, 0x91, 0xFE, 0xCA, 0xD6, /* 0x48-0x4B */
+	0xDE, 0xD0, 0xB2, 0xC5, 0xD4, 0xFA, 0x92, 0x40, /* 0x4C-0x4F */
+	0x92, 0x41, 0xC6, 0xCB, 0xB0, 0xC7, 0xB4, 0xF2, /* 0x50-0x53 */
+	0xC8, 0xD3, 0x92, 0x42, 0x92, 0x43, 0x92, 0x44, /* 0x54-0x57 */
+	0xCD, 0xD0, 0x92, 0x45, 0x92, 0x46, 0xBF, 0xB8, /* 0x58-0x5B */
+	0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x5C-0x5F */
+	0x92, 0x4B, 0x92, 0x4C, 0x92, 0x4D, 0xBF, 0xDB, /* 0x60-0x63 */
+	0x92, 0x4E, 0x92, 0x4F, 0xC7, 0xA4, 0xD6, 0xB4, /* 0x64-0x67 */
+	0x92, 0x50, 0xC0, 0xA9, 0xDE, 0xD1, 0xC9, 0xA8, /* 0x68-0x6B */
+	0xD1, 0xEF, 0xC5, 0xA4, 0xB0, 0xE7, 0xB3, 0xB6, /* 0x6C-0x6F */
+	0xC8, 0xC5, 0x92, 0x51, 0x92, 0x52, 0xB0, 0xE2, /* 0x70-0x73 */
+	0x92, 0x53, 0x92, 0x54, 0xB7, 0xF6, 0x92, 0x55, /* 0x74-0x77 */
+	0x92, 0x56, 0xC5, 0xFA, 0x92, 0x57, 0x92, 0x58, /* 0x78-0x7B */
+	0xB6, 0xF3, 0x92, 0x59, 0xD5, 0xD2, 0xB3, 0xD0, /* 0x7C-0x7F */
+	
+	0xBC, 0xBC, 0x92, 0x5A, 0x92, 0x5B, 0x92, 0x5C, /* 0x80-0x83 */
+	0xB3, 0xAD, 0x92, 0x5D, 0x92, 0x5E, 0x92, 0x5F, /* 0x84-0x87 */
+	0x92, 0x60, 0xBE, 0xF1, 0xB0, 0xD1, 0x92, 0x61, /* 0x88-0x8B */
+	0x92, 0x62, 0x92, 0x63, 0x92, 0x64, 0x92, 0x65, /* 0x8C-0x8F */
+	0x92, 0x66, 0xD2, 0xD6, 0xCA, 0xE3, 0xD7, 0xA5, /* 0x90-0x93 */
+	0x92, 0x67, 0xCD, 0xB6, 0xB6, 0xB6, 0xBF, 0xB9, /* 0x94-0x97 */
+	0xD5, 0xDB, 0x92, 0x68, 0xB8, 0xA7, 0xC5, 0xD7, /* 0x98-0x9B */
+	0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, 0xDE, 0xD2, /* 0x9C-0x9F */
+	0xBF, 0xD9, 0xC2, 0xD5, 0xC7, 0xC0, 0x92, 0x6C, /* 0xA0-0xA3 */
+	0xBB, 0xA4, 0xB1, 0xA8, 0x92, 0x6D, 0x92, 0x6E, /* 0xA4-0xA7 */
+	0xC5, 0xEA, 0x92, 0x6F, 0x92, 0x70, 0xC5, 0xFB, /* 0xA8-0xAB */
+	0xCC, 0xA7, 0x92, 0x71, 0x92, 0x72, 0x92, 0x73, /* 0xAC-0xAF */
+	0x92, 0x74, 0xB1, 0xA7, 0x92, 0x75, 0x92, 0x76, /* 0xB0-0xB3 */
+	0x92, 0x77, 0xB5, 0xD6, 0x92, 0x78, 0x92, 0x79, /* 0xB4-0xB7 */
+	0x92, 0x7A, 0xC4, 0xA8, 0x92, 0x7B, 0xDE, 0xD3, /* 0xB8-0xBB */
+	0xD1, 0xBA, 0xB3, 0xE9, 0x92, 0x7C, 0xC3, 0xF2, /* 0xBC-0xBF */
+	0x92, 0x7D, 0x92, 0x7E, 0xB7, 0xF7, 0x92, 0x80, /* 0xC0-0xC3 */
+	0xD6, 0xF4, 0xB5, 0xA3, 0xB2, 0xF0, 0xC4, 0xB4, /* 0xC4-0xC7 */
+	0xC4, 0xE9, 0xC0, 0xAD, 0xDE, 0xD4, 0x92, 0x81, /* 0xC8-0xCB */
+	0xB0, 0xE8, 0xC5, 0xC4, 0xC1, 0xE0, 0x92, 0x82, /* 0xCC-0xCF */
+	0xB9, 0xD5, 0x92, 0x83, 0xBE, 0xDC, 0xCD, 0xD8, /* 0xD0-0xD3 */
+	0xB0, 0xCE, 0x92, 0x84, 0xCD, 0xCF, 0xDE, 0xD6, /* 0xD4-0xD7 */
+	0xBE, 0xD0, 0xD7, 0xBE, 0xDE, 0xD5, 0xD5, 0xD0, /* 0xD8-0xDB */
+	0xB0, 0xDD, 0x92, 0x85, 0x92, 0x86, 0xC4, 0xE2, /* 0xDC-0xDF */
+	0x92, 0x87, 0x92, 0x88, 0xC2, 0xA3, 0xBC, 0xF0, /* 0xE0-0xE3 */
+	0x92, 0x89, 0xD3, 0xB5, 0xC0, 0xB9, 0xC5, 0xA1, /* 0xE4-0xE7 */
+	0xB2, 0xA6, 0xD4, 0xF1, 0x92, 0x8A, 0x92, 0x8B, /* 0xE8-0xEB */
+	0xC0, 0xA8, 0xCA, 0xC3, 0xDE, 0xD7, 0xD5, 0xFC, /* 0xEC-0xEF */
+	0x92, 0x8C, 0xB9, 0xB0, 0x92, 0x8D, 0xC8, 0xAD, /* 0xF0-0xF3 */
+	0xCB, 0xA9, 0x92, 0x8E, 0xDE, 0xD9, 0xBF, 0xBD, /* 0xF4-0xF7 */
+	0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0xF8-0xFB */
+	0xC6, 0xB4, 0xD7, 0xA7, 0xCA, 0xB0, 0xC4, 0xC3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_63[512] = {
+	0x92, 0x93, 0xB3, 0xD6, 0xB9, 0xD2, 0x92, 0x94, /* 0x00-0x03 */
+	0x92, 0x95, 0x92, 0x96, 0x92, 0x97, 0xD6, 0xB8, /* 0x04-0x07 */
+	0xEA, 0xFC, 0xB0, 0xB4, 0x92, 0x98, 0x92, 0x99, /* 0x08-0x0B */
+	0x92, 0x9A, 0x92, 0x9B, 0xBF, 0xE6, 0x92, 0x9C, /* 0x0C-0x0F */
+	0x92, 0x9D, 0xCC, 0xF4, 0x92, 0x9E, 0x92, 0x9F, /* 0x10-0x13 */
+	0x92, 0xA0, 0x92, 0xA1, 0xCD, 0xDA, 0x92, 0xA2, /* 0x14-0x17 */
+	0x92, 0xA3, 0x92, 0xA4, 0xD6, 0xBF, 0xC2, 0xCE, /* 0x18-0x1B */
+	0x92, 0xA5, 0xCE, 0xCE, 0xCC, 0xA2, 0xD0, 0xAE, /* 0x1C-0x1F */
+	0xC4, 0xD3, 0xB5, 0xB2, 0xDE, 0xD8, 0xD5, 0xF5, /* 0x20-0x23 */
+	0xBC, 0xB7, 0xBB, 0xD3, 0x92, 0xA6, 0x92, 0xA7, /* 0x24-0x27 */
+	0xB0, 0xA4, 0x92, 0xA8, 0xC5, 0xB2, 0xB4, 0xEC, /* 0x28-0x2B */
+	0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, 0xD5, 0xF1, /* 0x2C-0x2F */
+	0x92, 0xAC, 0x92, 0xAD, 0xEA, 0xFD, 0x92, 0xAE, /* 0x30-0x33 */
+	0x92, 0xAF, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0x34-0x37 */
+	0x92, 0xB3, 0xDE, 0xDA, 0xCD, 0xA6, 0x92, 0xB4, /* 0x38-0x3B */
+	0x92, 0xB5, 0xCD, 0xEC, 0x92, 0xB6, 0x92, 0xB7, /* 0x3C-0x3F */
+	0x92, 0xB8, 0x92, 0xB9, 0xCE, 0xE6, 0xDE, 0xDC, /* 0x40-0x43 */
+	0x92, 0xBA, 0xCD, 0xB1, 0xC0, 0xA6, 0x92, 0xBB, /* 0x44-0x47 */
+	0x92, 0xBC, 0xD7, 0xBD, 0x92, 0xBD, 0xDE, 0xDB, /* 0x48-0x4B */
+	0xB0, 0xC6, 0xBA, 0xB4, 0xC9, 0xD3, 0xC4, 0xF3, /* 0x4C-0x4F */
+	0xBE, 0xE8, 0x92, 0xBE, 0x92, 0xBF, 0x92, 0xC0, /* 0x50-0x53 */
+	0x92, 0xC1, 0xB2, 0xB6, 0x92, 0xC2, 0x92, 0xC3, /* 0x54-0x57 */
+	0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, 0x92, 0xC7, /* 0x58-0x5B */
+	0x92, 0xC8, 0x92, 0xC9, 0xC0, 0xCC, 0xCB, 0xF0, /* 0x5C-0x5F */
+	0x92, 0xCA, 0xBC, 0xF1, 0xBB, 0xBB, 0xB5, 0xB7, /* 0x60-0x63 */
+	0x92, 0xCB, 0x92, 0xCC, 0x92, 0xCD, 0xC5, 0xF5, /* 0x64-0x67 */
+	0x92, 0xCE, 0xDE, 0xE6, 0x92, 0xCF, 0x92, 0xD0, /* 0x68-0x6B */
+	0x92, 0xD1, 0xDE, 0xE3, 0xBE, 0xDD, 0x92, 0xD2, /* 0x6C-0x6F */
+	0x92, 0xD3, 0xDE, 0xDF, 0x92, 0xD4, 0x92, 0xD5, /* 0x70-0x73 */
+	0x92, 0xD6, 0x92, 0xD7, 0xB4, 0xB7, 0xBD, 0xDD, /* 0x74-0x77 */
+	0x92, 0xD8, 0x92, 0xD9, 0xDE, 0xE0, 0xC4, 0xED, /* 0x78-0x7B */
+	0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0x7C-0x7F */
+	
+	0xCF, 0xC6, 0x92, 0xDE, 0xB5, 0xE0, 0x92, 0xDF, /* 0x80-0x83 */
+	0x92, 0xE0, 0x92, 0xE1, 0x92, 0xE2, 0xB6, 0xDE, /* 0x84-0x87 */
+	0xCA, 0xDA, 0xB5, 0xF4, 0xDE, 0xE5, 0x92, 0xE3, /* 0x88-0x8B */
+	0xD5, 0xC6, 0x92, 0xE4, 0xDE, 0xE1, 0xCC, 0xCD, /* 0x8C-0x8F */
+	0xC6, 0xFE, 0x92, 0xE5, 0xC5, 0xC5, 0x92, 0xE6, /* 0x90-0x93 */
+	0x92, 0xE7, 0x92, 0xE8, 0xD2, 0xB4, 0x92, 0xE9, /* 0x94-0x97 */
+	0xBE, 0xF2, 0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, /* 0x98-0x9B */
+	0x92, 0xED, 0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, /* 0x9C-0x9F */
+	0xC2, 0xD3, 0x92, 0xF1, 0xCC, 0xBD, 0xB3, 0xB8, /* 0xA0-0xA3 */
+	0x92, 0xF2, 0xBD, 0xD3, 0x92, 0xF3, 0xBF, 0xD8, /* 0xA4-0xA7 */
+	0xCD, 0xC6, 0xD1, 0xDA, 0xB4, 0xEB, 0x92, 0xF4, /* 0xA8-0xAB */
+	0xDE, 0xE4, 0xDE, 0xDD, 0xDE, 0xE7, 0x92, 0xF5, /* 0xAC-0xAF */
+	0xEA, 0xFE, 0x92, 0xF6, 0x92, 0xF7, 0xC2, 0xB0, /* 0xB0-0xB3 */
+	0xDE, 0xE2, 0x92, 0xF8, 0x92, 0xF9, 0xD6, 0xC0, /* 0xB4-0xB7 */
+	0xB5, 0xA7, 0x92, 0xFA, 0xB2, 0xF4, 0x92, 0xFB, /* 0xB8-0xBB */
+	0xDE, 0xE8, 0x92, 0xFC, 0xDE, 0xF2, 0x92, 0xFD, /* 0xBC-0xBF */
+	0x92, 0xFE, 0x93, 0x40, 0x93, 0x41, 0x93, 0x42, /* 0xC0-0xC3 */
+	0xDE, 0xED, 0x93, 0x43, 0xDE, 0xF1, 0x93, 0x44, /* 0xC4-0xC7 */
+	0x93, 0x45, 0xC8, 0xE0, 0x93, 0x46, 0x93, 0x47, /* 0xC8-0xCB */
+	0x93, 0x48, 0xD7, 0xE1, 0xDE, 0xEF, 0xC3, 0xE8, /* 0xCC-0xCF */
+	0xCC, 0xE1, 0x93, 0x49, 0xB2, 0xE5, 0x93, 0x4A, /* 0xD0-0xD3 */
+	0x93, 0x4B, 0x93, 0x4C, 0xD2, 0xBE, 0x93, 0x4D, /* 0xD4-0xD7 */
+	0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, 0x93, 0x51, /* 0xD8-0xDB */
+	0x93, 0x52, 0x93, 0x53, 0xDE, 0xEE, 0x93, 0x54, /* 0xDC-0xDF */
+	0xDE, 0xEB, 0xCE, 0xD5, 0x93, 0x55, 0xB4, 0xA7, /* 0xE0-0xE3 */
+	0x93, 0x56, 0x93, 0x57, 0x93, 0x58, 0x93, 0x59, /* 0xE4-0xE7 */
+	0x93, 0x5A, 0xBF, 0xAB, 0xBE, 0xBE, 0x93, 0x5B, /* 0xE8-0xEB */
+	0x93, 0x5C, 0xBD, 0xD2, 0x93, 0x5D, 0x93, 0x5E, /* 0xEC-0xEF */
+	0x93, 0x5F, 0x93, 0x60, 0xDE, 0xE9, 0x93, 0x61, /* 0xF0-0xF3 */
+	0xD4, 0xAE, 0x93, 0x62, 0xDE, 0xDE, 0x93, 0x63, /* 0xF4-0xF7 */
+	0xDE, 0xEA, 0x93, 0x64, 0x93, 0x65, 0x93, 0x66, /* 0xF8-0xFB */
+	0x93, 0x67, 0xC0, 0xBF, 0x93, 0x68, 0xDE, 0xEC, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_64[512] = {
+	0xB2, 0xF3, 0xB8, 0xE9, 0xC2, 0xA7, 0x93, 0x69, /* 0x00-0x03 */
+	0x93, 0x6A, 0xBD, 0xC1, 0x93, 0x6B, 0x93, 0x6C, /* 0x04-0x07 */
+	0x93, 0x6D, 0x93, 0x6E, 0x93, 0x6F, 0xDE, 0xF5, /* 0x08-0x0B */
+	0xDE, 0xF8, 0x93, 0x70, 0x93, 0x71, 0xB2, 0xAB, /* 0x0C-0x0F */
+	0xB4, 0xA4, 0x93, 0x72, 0x93, 0x73, 0xB4, 0xEA, /* 0x10-0x13 */
+	0xC9, 0xA6, 0x93, 0x74, 0x93, 0x75, 0x93, 0x76, /* 0x14-0x17 */
+	0x93, 0x77, 0x93, 0x78, 0x93, 0x79, 0xDE, 0xF6, /* 0x18-0x1B */
+	0xCB, 0xD1, 0x93, 0x7A, 0xB8, 0xE3, 0x93, 0x7B, /* 0x1C-0x1F */
+	0xDE, 0xF7, 0xDE, 0xFA, 0x93, 0x7C, 0x93, 0x7D, /* 0x20-0x23 */
+	0x93, 0x7E, 0x93, 0x80, 0xDE, 0xF9, 0x93, 0x81, /* 0x24-0x27 */
+	0x93, 0x82, 0x93, 0x83, 0xCC, 0xC2, 0x93, 0x84, /* 0x28-0x2B */
+	0xB0, 0xE1, 0xB4, 0xEE, 0x93, 0x85, 0x93, 0x86, /* 0x2C-0x2F */
+	0x93, 0x87, 0x93, 0x88, 0x93, 0x89, 0x93, 0x8A, /* 0x30-0x33 */
+	0xE5, 0xBA, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x34-0x37 */
+	0x93, 0x8E, 0x93, 0x8F, 0xD0, 0xAF, 0x93, 0x90, /* 0x38-0x3B */
+	0x93, 0x91, 0xB2, 0xEB, 0x93, 0x92, 0xEB, 0xA1, /* 0x3C-0x3F */
+	0x93, 0x93, 0xDE, 0xF4, 0x93, 0x94, 0x93, 0x95, /* 0x40-0x43 */
+	0xC9, 0xE3, 0xDE, 0xF3, 0xB0, 0xDA, 0xD2, 0xA1, /* 0x44-0x47 */
+	0xB1, 0xF7, 0x93, 0x96, 0xCC, 0xAF, 0x93, 0x97, /* 0x48-0x4B */
+	0x93, 0x98, 0x93, 0x99, 0x93, 0x9A, 0x93, 0x9B, /* 0x4C-0x4F */
+	0x93, 0x9C, 0x93, 0x9D, 0xDE, 0xF0, 0x93, 0x9E, /* 0x50-0x53 */
+	0xCB, 0xA4, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x54-0x57 */
+	0xD5, 0xAA, 0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, /* 0x58-0x5B */
+	0x93, 0xA5, 0x93, 0xA6, 0xDE, 0xFB, 0x93, 0xA7, /* 0x5C-0x5F */
+	0x93, 0xA8, 0x93, 0xA9, 0x93, 0xAA, 0x93, 0xAB, /* 0x60-0x63 */
+	0x93, 0xAC, 0x93, 0xAD, 0x93, 0xAE, 0xB4, 0xDD, /* 0x64-0x67 */
+	0x93, 0xAF, 0xC4, 0xA6, 0x93, 0xB0, 0x93, 0xB1, /* 0x68-0x6B */
+	0x93, 0xB2, 0xDE, 0xFD, 0x93, 0xB3, 0x93, 0xB4, /* 0x6C-0x6F */
+	0x93, 0xB5, 0x93, 0xB6, 0x93, 0xB7, 0x93, 0xB8, /* 0x70-0x73 */
+	0x93, 0xB9, 0x93, 0xBA, 0x93, 0xBB, 0x93, 0xBC, /* 0x74-0x77 */
+	0xC3, 0xFE, 0xC4, 0xA1, 0xDF, 0xA1, 0x93, 0xBD, /* 0x78-0x7B */
+	0x93, 0xBE, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0x7C-0x7F */
+	
+	0x93, 0xC2, 0x93, 0xC3, 0xC1, 0xCC, 0x93, 0xC4, /* 0x80-0x83 */
+	0xDE, 0xFC, 0xBE, 0xEF, 0x93, 0xC5, 0xC6, 0xB2, /* 0x84-0x87 */
+	0x93, 0xC6, 0x93, 0xC7, 0x93, 0xC8, 0x93, 0xC9, /* 0x88-0x8B */
+	0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, 0x93, 0xCD, /* 0x8C-0x8F */
+	0x93, 0xCE, 0xB3, 0xC5, 0xC8, 0xF6, 0x93, 0xCF, /* 0x90-0x93 */
+	0x93, 0xD0, 0xCB, 0xBA, 0xDE, 0xFE, 0x93, 0xD1, /* 0x94-0x97 */
+	0x93, 0xD2, 0xDF, 0xA4, 0x93, 0xD3, 0x93, 0xD4, /* 0x98-0x9B */
+	0x93, 0xD5, 0x93, 0xD6, 0xD7, 0xB2, 0x93, 0xD7, /* 0x9C-0x9F */
+	0x93, 0xD8, 0x93, 0xD9, 0x93, 0xDA, 0x93, 0xDB, /* 0xA0-0xA3 */
+	0xB3, 0xB7, 0x93, 0xDC, 0x93, 0xDD, 0x93, 0xDE, /* 0xA4-0xA7 */
+	0x93, 0xDF, 0xC1, 0xC3, 0x93, 0xE0, 0x93, 0xE1, /* 0xA8-0xAB */
+	0xC7, 0xCB, 0xB2, 0xA5, 0xB4, 0xE9, 0x93, 0xE2, /* 0xAC-0xAF */
+	0xD7, 0xAB, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xB0-0xB3 */
+	0x93, 0xE6, 0xC4, 0xEC, 0x93, 0xE7, 0xDF, 0xA2, /* 0xB4-0xB7 */
+	0xDF, 0xA3, 0x93, 0xE8, 0xDF, 0xA5, 0x93, 0xE9, /* 0xB8-0xBB */
+	0xBA, 0xB3, 0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, /* 0xBC-0xBF */
+	0xDF, 0xA6, 0x93, 0xED, 0xC0, 0xDE, 0x93, 0xEE, /* 0xC0-0xC3 */
+	0x93, 0xEF, 0xC9, 0xC3, 0x93, 0xF0, 0x93, 0xF1, /* 0xC4-0xC7 */
+	0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xC8-0xCB */
+	0x93, 0xF6, 0xB2, 0xD9, 0xC7, 0xE6, 0x93, 0xF7, /* 0xCC-0xCF */
+	0xDF, 0xA7, 0x93, 0xF8, 0xC7, 0xDC, 0x93, 0xF9, /* 0xD0-0xD3 */
+	0x93, 0xFA, 0x93, 0xFB, 0x93, 0xFC, 0xDF, 0xA8, /* 0xD4-0xD7 */
+	0xEB, 0xA2, 0x93, 0xFD, 0x93, 0xFE, 0x94, 0x40, /* 0xD8-0xDB */
+	0x94, 0x41, 0x94, 0x42, 0xCB, 0xD3, 0x94, 0x43, /* 0xDC-0xDF */
+	0x94, 0x44, 0x94, 0x45, 0xDF, 0xAA, 0x94, 0x46, /* 0xE0-0xE3 */
+	0xDF, 0xA9, 0x94, 0x47, 0xB2, 0xC1, 0x94, 0x48, /* 0xE4-0xE7 */
+	0x94, 0x49, 0x94, 0x4A, 0x94, 0x4B, 0x94, 0x4C, /* 0xE8-0xEB */
+	0x94, 0x4D, 0x94, 0x4E, 0x94, 0x4F, 0x94, 0x50, /* 0xEC-0xEF */
+	0x94, 0x51, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0xF0-0xF3 */
+	0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0xF4-0xF7 */
+	0x94, 0x59, 0x94, 0x5A, 0x94, 0x5B, 0x94, 0x5C, /* 0xF8-0xFB */
+	0x94, 0x5D, 0x94, 0x5E, 0x94, 0x5F, 0x94, 0x60, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_65[512] = {
+	0xC5, 0xCA, 0x94, 0x61, 0x94, 0x62, 0x94, 0x63, /* 0x00-0x03 */
+	0x94, 0x64, 0x94, 0x65, 0x94, 0x66, 0x94, 0x67, /* 0x04-0x07 */
+	0x94, 0x68, 0xDF, 0xAB, 0x94, 0x69, 0x94, 0x6A, /* 0x08-0x0B */
+	0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, 0x94, 0x6E, /* 0x0C-0x0F */
+	0x94, 0x6F, 0x94, 0x70, 0xD4, 0xDC, 0x94, 0x71, /* 0x10-0x13 */
+	0x94, 0x72, 0x94, 0x73, 0x94, 0x74, 0x94, 0x75, /* 0x14-0x17 */
+	0xC8, 0xC1, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x18-0x1B */
+	0x94, 0x79, 0x94, 0x7A, 0x94, 0x7B, 0x94, 0x7C, /* 0x1C-0x1F */
+	0x94, 0x7D, 0x94, 0x7E, 0x94, 0x80, 0x94, 0x81, /* 0x20-0x23 */
+	0x94, 0x82, 0xDF, 0xAC, 0x94, 0x83, 0x94, 0x84, /* 0x24-0x27 */
+	0x94, 0x85, 0x94, 0x86, 0x94, 0x87, 0xBE, 0xF0, /* 0x28-0x2B */
+	0x94, 0x88, 0x94, 0x89, 0xDF, 0xAD, 0xD6, 0xA7, /* 0x2C-0x2F */
+	0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x30-0x33 */
+	0xEA, 0xB7, 0xEB, 0xB6, 0xCA, 0xD5, 0x94, 0x8E, /* 0x34-0x37 */
+	0xD8, 0xFC, 0xB8, 0xC4, 0x94, 0x8F, 0xB9, 0xA5, /* 0x38-0x3B */
+	0x94, 0x90, 0x94, 0x91, 0xB7, 0xC5, 0xD5, 0xFE, /* 0x3C-0x3F */
+	0x94, 0x92, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x40-0x43 */
+	0x94, 0x96, 0xB9, 0xCA, 0x94, 0x97, 0x94, 0x98, /* 0x44-0x47 */
+	0xD0, 0xA7, 0xF4, 0xCD, 0x94, 0x99, 0x94, 0x9A, /* 0x48-0x4B */
+	0xB5, 0xD0, 0x94, 0x9B, 0x94, 0x9C, 0xC3, 0xF4, /* 0x4C-0x4F */
+	0x94, 0x9D, 0xBE, 0xC8, 0x94, 0x9E, 0x94, 0x9F, /* 0x50-0x53 */
+	0x94, 0xA0, 0xEB, 0xB7, 0xB0, 0xBD, 0x94, 0xA1, /* 0x54-0x57 */
+	0x94, 0xA2, 0xBD, 0xCC, 0x94, 0xA3, 0xC1, 0xB2, /* 0x58-0x5B */
+	0x94, 0xA4, 0xB1, 0xD6, 0xB3, 0xA8, 0x94, 0xA5, /* 0x5C-0x5F */
+	0x94, 0xA6, 0x94, 0xA7, 0xB8, 0xD2, 0xC9, 0xA2, /* 0x60-0x63 */
+	0x94, 0xA8, 0x94, 0xA9, 0xB6, 0xD8, 0x94, 0xAA, /* 0x64-0x67 */
+	0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, 0xEB, 0xB8, /* 0x68-0x6B */
+	0xBE, 0xB4, 0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, /* 0x6C-0x6F */
+	0xCA, 0xFD, 0x94, 0xB1, 0xC7, 0xC3, 0x94, 0xB2, /* 0x70-0x73 */
+	0xD5, 0xFB, 0x94, 0xB3, 0x94, 0xB4, 0xB7, 0xF3, /* 0x74-0x77 */
+	0x94, 0xB5, 0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, /* 0x78-0x7B */
+	0x94, 0xB9, 0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, /* 0x7C-0x7F */
+	
+	0x94, 0xBD, 0x94, 0xBE, 0x94, 0xBF, 0x94, 0xC0, /* 0x80-0x83 */
+	0x94, 0xC1, 0x94, 0xC2, 0x94, 0xC3, 0xCE, 0xC4, /* 0x84-0x87 */
+	0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, 0xD5, 0xAB, /* 0x88-0x8B */
+	0xB1, 0xF3, 0x94, 0xC7, 0x94, 0xC8, 0x94, 0xC9, /* 0x8C-0x8F */
+	0xEC, 0xB3, 0xB0, 0xDF, 0x94, 0xCA, 0xEC, 0xB5, /* 0x90-0x93 */
+	0x94, 0xCB, 0x94, 0xCC, 0x94, 0xCD, 0xB6, 0xB7, /* 0x94-0x97 */
+	0x94, 0xCE, 0xC1, 0xCF, 0x94, 0xCF, 0xF5, 0xFA, /* 0x98-0x9B */
+	0xD0, 0xB1, 0x94, 0xD0, 0x94, 0xD1, 0xD5, 0xE5, /* 0x9C-0x9F */
+	0x94, 0xD2, 0xCE, 0xD3, 0x94, 0xD3, 0x94, 0xD4, /* 0xA0-0xA3 */
+	0xBD, 0xEF, 0xB3, 0xE2, 0x94, 0xD5, 0xB8, 0xAB, /* 0xA4-0xA7 */
+	0x94, 0xD6, 0xD5, 0xB6, 0x94, 0xD7, 0xED, 0xBD, /* 0xA8-0xAB */
+	0x94, 0xD8, 0xB6, 0xCF, 0x94, 0xD9, 0xCB, 0xB9, /* 0xAC-0xAF */
+	0xD0, 0xC2, 0x94, 0xDA, 0x94, 0xDB, 0x94, 0xDC, /* 0xB0-0xB3 */
+	0x94, 0xDD, 0x94, 0xDE, 0x94, 0xDF, 0x94, 0xE0, /* 0xB4-0xB7 */
+	0x94, 0xE1, 0xB7, 0xBD, 0x94, 0xE2, 0x94, 0xE3, /* 0xB8-0xBB */
+	0xEC, 0xB6, 0xCA, 0xA9, 0x94, 0xE4, 0x94, 0xE5, /* 0xBC-0xBF */
+	0x94, 0xE6, 0xC5, 0xD4, 0x94, 0xE7, 0xEC, 0xB9, /* 0xC0-0xC3 */
+	0xEC, 0xB8, 0xC2, 0xC3, 0xEC, 0xB7, 0x94, 0xE8, /* 0xC4-0xC7 */
+	0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0xD0, 0xFD, /* 0xC8-0xCB */
+	0xEC, 0xBA, 0x94, 0xEC, 0xEC, 0xBB, 0xD7, 0xE5, /* 0xCC-0xCF */
+	0x94, 0xED, 0x94, 0xEE, 0xEC, 0xBC, 0x94, 0xEF, /* 0xD0-0xD3 */
+	0x94, 0xF0, 0x94, 0xF1, 0xEC, 0xBD, 0xC6, 0xEC, /* 0xD4-0xD7 */
+	0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, 0x94, 0xF5, /* 0xD8-0xDB */
+	0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, 0x94, 0xF9, /* 0xDC-0xDF */
+	0xCE, 0xDE, 0x94, 0xFA, 0xBC, 0xC8, 0x94, 0xFB, /* 0xE0-0xE3 */
+	0x94, 0xFC, 0xC8, 0xD5, 0xB5, 0xA9, 0xBE, 0xC9, /* 0xE4-0xE7 */
+	0xD6, 0xBC, 0xD4, 0xE7, 0x94, 0xFD, 0x94, 0xFE, /* 0xE8-0xEB */
+	0xD1, 0xAE, 0xD0, 0xF1, 0xEA, 0xB8, 0xEA, 0xB9, /* 0xEC-0xEF */
+	0xEA, 0xBA, 0xBA, 0xB5, 0x95, 0x40, 0x95, 0x41, /* 0xF0-0xF3 */
+	0x95, 0x42, 0x95, 0x43, 0xCA, 0xB1, 0xBF, 0xF5, /* 0xF4-0xF7 */
+	0x95, 0x44, 0x95, 0x45, 0xCD, 0xFA, 0x95, 0x46, /* 0xF8-0xFB */
+	0x95, 0x47, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_66[512] = {
+	0xEA, 0xC0, 0x95, 0x4B, 0xB0, 0xBA, 0xEA, 0xBE, /* 0x00-0x03 */
+	0x95, 0x4C, 0x95, 0x4D, 0xC0, 0xA5, 0x95, 0x4E, /* 0x04-0x07 */
+	0x95, 0x4F, 0x95, 0x50, 0xEA, 0xBB, 0x95, 0x51, /* 0x08-0x0B */
+	0xB2, 0xFD, 0x95, 0x52, 0xC3, 0xF7, 0xBB, 0xE8, /* 0x0C-0x0F */
+	0x95, 0x53, 0x95, 0x54, 0x95, 0x55, 0xD2, 0xD7, /* 0x10-0x13 */
+	0xCE, 0xF4, 0xEA, 0xBF, 0x95, 0x56, 0x95, 0x57, /* 0x14-0x17 */
+	0x95, 0x58, 0xEA, 0xBC, 0x95, 0x59, 0x95, 0x5A, /* 0x18-0x1B */
+	0x95, 0x5B, 0xEA, 0xC3, 0x95, 0x5C, 0xD0, 0xC7, /* 0x1C-0x1F */
+	0xD3, 0xB3, 0x95, 0x5D, 0x95, 0x5E, 0x95, 0x5F, /* 0x20-0x23 */
+	0x95, 0x60, 0xB4, 0xBA, 0x95, 0x61, 0xC3, 0xC1, /* 0x24-0x27 */
+	0xD7, 0xF2, 0x95, 0x62, 0x95, 0x63, 0x95, 0x64, /* 0x28-0x2B */
+	0x95, 0x65, 0xD5, 0xD1, 0x95, 0x66, 0xCA, 0xC7, /* 0x2C-0x2F */
+	0x95, 0x67, 0xEA, 0xC5, 0x95, 0x68, 0x95, 0x69, /* 0x30-0x33 */
+	0xEA, 0xC4, 0xEA, 0xC7, 0xEA, 0xC6, 0x95, 0x6A, /* 0x34-0x37 */
+	0x95, 0x6B, 0x95, 0x6C, 0x95, 0x6D, 0x95, 0x6E, /* 0x38-0x3B */
+	0xD6, 0xE7, 0x95, 0x6F, 0xCF, 0xD4, 0x95, 0x70, /* 0x3C-0x3F */
+	0x95, 0x71, 0xEA, 0xCB, 0x95, 0x72, 0xBB, 0xCE, /* 0x40-0x43 */
+	0x95, 0x73, 0x95, 0x74, 0x95, 0x75, 0x95, 0x76, /* 0x44-0x47 */
+	0x95, 0x77, 0x95, 0x78, 0x95, 0x79, 0xBD, 0xFA, /* 0x48-0x4B */
+	0xC9, 0xCE, 0x95, 0x7A, 0x95, 0x7B, 0xEA, 0xCC, /* 0x4C-0x4F */
+	0x95, 0x7C, 0x95, 0x7D, 0xC9, 0xB9, 0xCF, 0xFE, /* 0x50-0x53 */
+	0xEA, 0xCA, 0xD4, 0xCE, 0xEA, 0xCD, 0xEA, 0xCF, /* 0x54-0x57 */
+	0x95, 0x7E, 0x95, 0x80, 0xCD, 0xED, 0x95, 0x81, /* 0x58-0x5B */
+	0x95, 0x82, 0x95, 0x83, 0x95, 0x84, 0xEA, 0xC9, /* 0x5C-0x5F */
+	0x95, 0x85, 0xEA, 0xCE, 0x95, 0x86, 0x95, 0x87, /* 0x60-0x63 */
+	0xCE, 0xEE, 0x95, 0x88, 0xBB, 0xDE, 0x95, 0x89, /* 0x64-0x67 */
+	0xB3, 0xBF, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x68-0x6B */
+	0x95, 0x8D, 0x95, 0x8E, 0xC6, 0xD5, 0xBE, 0xB0, /* 0x6C-0x6F */
+	0xCE, 0xFA, 0x95, 0x8F, 0x95, 0x90, 0x95, 0x91, /* 0x70-0x73 */
+	0xC7, 0xE7, 0x95, 0x92, 0xBE, 0xA7, 0xEA, 0xD0, /* 0x74-0x77 */
+	0x95, 0x93, 0x95, 0x94, 0xD6, 0xC7, 0x95, 0x95, /* 0x78-0x7B */
+	0x95, 0x96, 0x95, 0x97, 0xC1, 0xC0, 0x95, 0x98, /* 0x7C-0x7F */
+	
+	0x95, 0x99, 0x95, 0x9A, 0xD4, 0xDD, 0x95, 0x9B, /* 0x80-0x83 */
+	0xEA, 0xD1, 0x95, 0x9C, 0x95, 0x9D, 0xCF, 0xBE, /* 0x84-0x87 */
+	0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, 0x95, 0xA1, /* 0x88-0x8B */
+	0xEA, 0xD2, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x8C-0x8F */
+	0x95, 0xA5, 0xCA, 0xEE, 0x95, 0xA6, 0x95, 0xA7, /* 0x90-0x93 */
+	0x95, 0xA8, 0x95, 0xA9, 0xC5, 0xAF, 0xB0, 0xB5, /* 0x94-0x97 */
+	0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, 0x95, 0xAD, /* 0x98-0x9B */
+	0x95, 0xAE, 0xEA, 0xD4, 0x95, 0xAF, 0x95, 0xB0, /* 0x9C-0x9F */
+	0x95, 0xB1, 0x95, 0xB2, 0x95, 0xB3, 0x95, 0xB4, /* 0xA0-0xA3 */
+	0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, 0xEA, 0xD3, /* 0xA4-0xA7 */
+	0xF4, 0xDF, 0x95, 0xB8, 0x95, 0xB9, 0x95, 0xBA, /* 0xA8-0xAB */
+	0x95, 0xBB, 0x95, 0xBC, 0xC4, 0xBA, 0x95, 0xBD, /* 0xAC-0xAF */
+	0x95, 0xBE, 0x95, 0xBF, 0x95, 0xC0, 0x95, 0xC1, /* 0xB0-0xB3 */
+	0xB1, 0xA9, 0x95, 0xC2, 0x95, 0xC3, 0x95, 0xC4, /* 0xB4-0xB7 */
+	0x95, 0xC5, 0xE5, 0xDF, 0x95, 0xC6, 0x95, 0xC7, /* 0xB8-0xBB */
+	0x95, 0xC8, 0x95, 0xC9, 0xEA, 0xD5, 0x95, 0xCA, /* 0xBC-0xBF */
+	0x95, 0xCB, 0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, /* 0xC0-0xC3 */
+	0x95, 0xCF, 0x95, 0xD0, 0x95, 0xD1, 0x95, 0xD2, /* 0xC4-0xC7 */
+	0x95, 0xD3, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0xC8-0xCB */
+	0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0xCC-0xCF */
+	0x95, 0xDB, 0x95, 0xDC, 0x95, 0xDD, 0x95, 0xDE, /* 0xD0-0xD3 */
+	0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, 0x95, 0xE2, /* 0xD4-0xD7 */
+	0x95, 0xE3, 0xCA, 0xEF, 0x95, 0xE4, 0xEA, 0xD6, /* 0xD8-0xDB */
+	0xEA, 0xD7, 0xC6, 0xD8, 0x95, 0xE5, 0x95, 0xE6, /* 0xDC-0xDF */
+	0x95, 0xE7, 0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, /* 0xE0-0xE3 */
+	0x95, 0xEB, 0x95, 0xEC, 0xEA, 0xD8, 0x95, 0xED, /* 0xE4-0xE7 */
+	0x95, 0xEE, 0xEA, 0xD9, 0x95, 0xEF, 0x95, 0xF0, /* 0xE8-0xEB */
+	0x95, 0xF1, 0x95, 0xF2, 0x95, 0xF3, 0x95, 0xF4, /* 0xEC-0xEF */
+	0xD4, 0xBB, 0x95, 0xF5, 0xC7, 0xFA, 0xD2, 0xB7, /* 0xF0-0xF3 */
+	0xB8, 0xFC, 0x95, 0xF6, 0x95, 0xF7, 0xEA, 0xC2, /* 0xF4-0xF7 */
+	0x95, 0xF8, 0xB2, 0xDC, 0x95, 0xF9, 0x95, 0xFA, /* 0xF8-0xFB */
+	0xC2, 0xFC, 0x95, 0xFB, 0xD4, 0xF8, 0xCC, 0xE6, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_67[512] = {
+	0xD7, 0xEE, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0x00-0x03 */
+	0x96, 0x40, 0x96, 0x41, 0x96, 0x42, 0x96, 0x43, /* 0x04-0x07 */
+	0xD4, 0xC2, 0xD3, 0xD0, 0xEB, 0xC3, 0xC5, 0xF3, /* 0x08-0x0B */
+	0x96, 0x44, 0xB7, 0xFE, 0x96, 0x45, 0x96, 0x46, /* 0x0C-0x0F */
+	0xEB, 0xD4, 0x96, 0x47, 0x96, 0x48, 0x96, 0x49, /* 0x10-0x13 */
+	0xCB, 0xB7, 0xEB, 0xDE, 0x96, 0x4A, 0xC0, 0xCA, /* 0x14-0x17 */
+	0x96, 0x4B, 0x96, 0x4C, 0x96, 0x4D, 0xCD, 0xFB, /* 0x18-0x1B */
+	0x96, 0x4E, 0xB3, 0xAF, 0x96, 0x4F, 0xC6, 0xDA, /* 0x1C-0x1F */
+	0x96, 0x50, 0x96, 0x51, 0x96, 0x52, 0x96, 0x53, /* 0x20-0x23 */
+	0x96, 0x54, 0x96, 0x55, 0xEB, 0xFC, 0x96, 0x56, /* 0x24-0x27 */
+	0xC4, 0xBE, 0x96, 0x57, 0xCE, 0xB4, 0xC4, 0xA9, /* 0x28-0x2B */
+	0xB1, 0xBE, 0xD4, 0xFD, 0x96, 0x58, 0xCA, 0xF5, /* 0x2C-0x2F */
+	0x96, 0x59, 0xD6, 0xEC, 0x96, 0x5A, 0x96, 0x5B, /* 0x30-0x33 */
+	0xC6, 0xD3, 0xB6, 0xE4, 0x96, 0x5C, 0x96, 0x5D, /* 0x34-0x37 */
+	0x96, 0x5E, 0x96, 0x5F, 0xBB, 0xFA, 0x96, 0x60, /* 0x38-0x3B */
+	0x96, 0x61, 0xD0, 0xE0, 0x96, 0x62, 0x96, 0x63, /* 0x3C-0x3F */
+	0xC9, 0xB1, 0x96, 0x64, 0xD4, 0xD3, 0xC8, 0xA8, /* 0x40-0x43 */
+	0x96, 0x65, 0x96, 0x66, 0xB8, 0xCB, 0x96, 0x67, /* 0x44-0x47 */
+	0xE8, 0xBE, 0xC9, 0xBC, 0x96, 0x68, 0x96, 0x69, /* 0x48-0x4B */
+	0xE8, 0xBB, 0x96, 0x6A, 0xC0, 0xEE, 0xD0, 0xD3, /* 0x4C-0x4F */
+	0xB2, 0xC4, 0xB4, 0xE5, 0x96, 0x6B, 0xE8, 0xBC, /* 0x50-0x53 */
+	0x96, 0x6C, 0x96, 0x6D, 0xD5, 0xC8, 0x96, 0x6E, /* 0x54-0x57 */
+	0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, 0x96, 0x72, /* 0x58-0x5B */
+	0xB6, 0xC5, 0x96, 0x73, 0xE8, 0xBD, 0xCA, 0xF8, /* 0x5C-0x5F */
+	0xB8, 0xDC, 0xCC, 0xF5, 0x96, 0x74, 0x96, 0x75, /* 0x60-0x63 */
+	0x96, 0x76, 0xC0, 0xB4, 0x96, 0x77, 0x96, 0x78, /* 0x64-0x67 */
+	0xD1, 0xEE, 0xE8, 0xBF, 0xE8, 0xC2, 0x96, 0x79, /* 0x68-0x6B */
+	0x96, 0x7A, 0xBA, 0xBC, 0x96, 0x7B, 0xB1, 0xAD, /* 0x6C-0x6F */
+	0xBD, 0xDC, 0x96, 0x7C, 0xEA, 0xBD, 0xE8, 0xC3, /* 0x70-0x73 */
+	0x96, 0x7D, 0xE8, 0xC6, 0x96, 0x7E, 0xE8, 0xCB, /* 0x74-0x77 */
+	0x96, 0x80, 0x96, 0x81, 0x96, 0x82, 0x96, 0x83, /* 0x78-0x7B */
+	0xE8, 0xCC, 0x96, 0x84, 0xCB, 0xC9, 0xB0, 0xE5, /* 0x7C-0x7F */
+	
+	0x96, 0x85, 0xBC, 0xAB, 0x96, 0x86, 0x96, 0x87, /* 0x80-0x83 */
+	0xB9, 0xB9, 0x96, 0x88, 0x96, 0x89, 0xE8, 0xC1, /* 0x84-0x87 */
+	0x96, 0x8A, 0xCD, 0xF7, 0x96, 0x8B, 0xE8, 0xCA, /* 0x88-0x8B */
+	0x96, 0x8C, 0x96, 0x8D, 0x96, 0x8E, 0x96, 0x8F, /* 0x8C-0x8F */
+	0xCE, 0xF6, 0x96, 0x90, 0x96, 0x91, 0x96, 0x92, /* 0x90-0x93 */
+	0x96, 0x93, 0xD5, 0xED, 0x96, 0x94, 0xC1, 0xD6, /* 0x94-0x97 */
+	0xE8, 0xC4, 0x96, 0x95, 0xC3, 0xB6, 0x96, 0x96, /* 0x98-0x9B */
+	0xB9, 0xFB, 0xD6, 0xA6, 0xE8, 0xC8, 0x96, 0x97, /* 0x9C-0x9F */
+	0x96, 0x98, 0x96, 0x99, 0xCA, 0xE0, 0xD4, 0xE6, /* 0xA0-0xA3 */
+	0x96, 0x9A, 0xE8, 0xC0, 0x96, 0x9B, 0xE8, 0xC5, /* 0xA4-0xA7 */
+	0xE8, 0xC7, 0x96, 0x9C, 0xC7, 0xB9, 0xB7, 0xE3, /* 0xA8-0xAB */
+	0x96, 0x9D, 0xE8, 0xC9, 0x96, 0x9E, 0xBF, 0xDD, /* 0xAC-0xAF */
+	0xE8, 0xD2, 0x96, 0x9F, 0x96, 0xA0, 0xE8, 0xD7, /* 0xB0-0xB3 */
+	0x96, 0xA1, 0xE8, 0xD5, 0xBC, 0xDC, 0xBC, 0xCF, /* 0xB4-0xB7 */
+	0xE8, 0xDB, 0x96, 0xA2, 0x96, 0xA3, 0x96, 0xA4, /* 0xB8-0xBB */
+	0x96, 0xA5, 0x96, 0xA6, 0x96, 0xA7, 0x96, 0xA8, /* 0xBC-0xBF */
+	0x96, 0xA9, 0xE8, 0xDE, 0x96, 0xAA, 0xE8, 0xDA, /* 0xC0-0xC3 */
+	0xB1, 0xFA, 0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, /* 0xC4-0xC7 */
+	0x96, 0xAE, 0x96, 0xAF, 0x96, 0xB0, 0x96, 0xB1, /* 0xC8-0xCB */
+	0x96, 0xB2, 0x96, 0xB3, 0x96, 0xB4, 0xB0, 0xD8, /* 0xCC-0xCF */
+	0xC4, 0xB3, 0xB8, 0xCC, 0xC6, 0xE2, 0xC8, 0xBE, /* 0xD0-0xD3 */
+	0xC8, 0xE1, 0x96, 0xB5, 0x96, 0xB6, 0x96, 0xB7, /* 0xD4-0xD7 */
+	0xE8, 0xCF, 0xE8, 0xD4, 0xE8, 0xD6, 0x96, 0xB8, /* 0xD8-0xDB */
+	0xB9, 0xF1, 0xE8, 0xD8, 0xD7, 0xF5, 0x96, 0xB9, /* 0xDC-0xDF */
+	0xC4, 0xFB, 0x96, 0xBA, 0xE8, 0xDC, 0x96, 0xBB, /* 0xE0-0xE3 */
+	0x96, 0xBC, 0xB2, 0xE9, 0x96, 0xBD, 0x96, 0xBE, /* 0xE4-0xE7 */
+	0x96, 0xBF, 0xE8, 0xD1, 0x96, 0xC0, 0x96, 0xC1, /* 0xE8-0xEB */
+	0xBC, 0xED, 0x96, 0xC2, 0x96, 0xC3, 0xBF, 0xC2, /* 0xEC-0xEF */
+	0xE8, 0xCD, 0xD6, 0xF9, 0x96, 0xC4, 0xC1, 0xF8, /* 0xF0-0xF3 */
+	0xB2, 0xF1, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0xF4-0xF7 */
+	0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, 0x96, 0xCB, /* 0xF8-0xFB */
+	0x96, 0xCC, 0xE8, 0xDF, 0x96, 0xCD, 0xCA, 0xC1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_68[512] = {
+	0xE8, 0xD9, 0x96, 0xCE, 0x96, 0xCF, 0x96, 0xD0, /* 0x00-0x03 */
+	0x96, 0xD1, 0xD5, 0xA4, 0x96, 0xD2, 0xB1, 0xEA, /* 0x04-0x07 */
+	0xD5, 0xBB, 0xE8, 0xCE, 0xE8, 0xD0, 0xB6, 0xB0, /* 0x08-0x0B */
+	0xE8, 0xD3, 0x96, 0xD3, 0xE8, 0xDD, 0xC0, 0xB8, /* 0x0C-0x0F */
+	0x96, 0xD4, 0xCA, 0xF7, 0x96, 0xD5, 0xCB, 0xA8, /* 0x10-0x13 */
+	0x96, 0xD6, 0x96, 0xD7, 0xC6, 0xDC, 0xC0, 0xF5, /* 0x14-0x17 */
+	0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x18-0x1B */
+	0x96, 0xDC, 0xE8, 0xE9, 0x96, 0xDD, 0x96, 0xDE, /* 0x1C-0x1F */
+	0x96, 0xDF, 0xD0, 0xA3, 0x96, 0xE0, 0x96, 0xE1, /* 0x20-0x23 */
+	0x96, 0xE2, 0x96, 0xE3, 0x96, 0xE4, 0x96, 0xE5, /* 0x24-0x27 */
+	0x96, 0xE6, 0xE8, 0xF2, 0xD6, 0xEA, 0x96, 0xE7, /* 0x28-0x2B */
+	0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x2C-0x2F */
+	0x96, 0xEC, 0x96, 0xED, 0xE8, 0xE0, 0xE8, 0xE1, /* 0x30-0x33 */
+	0x96, 0xEE, 0x96, 0xEF, 0x96, 0xF0, 0xD1, 0xF9, /* 0x34-0x37 */
+	0xBA, 0xCB, 0xB8, 0xF9, 0x96, 0xF1, 0x96, 0xF2, /* 0x38-0x3B */
+	0xB8, 0xF1, 0xD4, 0xD4, 0xE8, 0xEF, 0x96, 0xF3, /* 0x3C-0x3F */
+	0xE8, 0xEE, 0xE8, 0xEC, 0xB9, 0xF0, 0xCC, 0xD2, /* 0x40-0x43 */
+	0xE8, 0xE6, 0xCE, 0xA6, 0xBF, 0xF2, 0x96, 0xF4, /* 0x44-0x47 */
+	0xB0, 0xB8, 0xE8, 0xF1, 0xE8, 0xF0, 0x96, 0xF5, /* 0x48-0x4B */
+	0xD7, 0xC0, 0x96, 0xF6, 0xE8, 0xE4, 0x96, 0xF7, /* 0x4C-0x4F */
+	0xCD, 0xA9, 0xC9, 0xA3, 0x96, 0xF8, 0xBB, 0xB8, /* 0x50-0x53 */
+	0xBD, 0xDB, 0xE8, 0xEA, 0x96, 0xF9, 0x96, 0xFA, /* 0x54-0x57 */
+	0x96, 0xFB, 0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, /* 0x58-0x5B */
+	0x97, 0x40, 0x97, 0x41, 0x97, 0x42, 0x97, 0x43, /* 0x5C-0x5F */
+	0xE8, 0xE2, 0xE8, 0xE3, 0xE8, 0xE5, 0xB5, 0xB5, /* 0x60-0x63 */
+	0xE8, 0xE7, 0xC7, 0xC5, 0xE8, 0xEB, 0xE8, 0xED, /* 0x64-0x67 */
+	0xBD, 0xB0, 0xD7, 0xAE, 0x97, 0x44, 0xE8, 0xF8, /* 0x68-0x6B */
+	0x97, 0x45, 0x97, 0x46, 0x97, 0x47, 0x97, 0x48, /* 0x6C-0x6F */
+	0x97, 0x49, 0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, /* 0x70-0x73 */
+	0xE8, 0xF5, 0x97, 0x4D, 0xCD, 0xB0, 0xE8, 0xF6, /* 0x74-0x77 */
+	0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x78-0x7B */
+	0x97, 0x52, 0x97, 0x53, 0x97, 0x54, 0x97, 0x55, /* 0x7C-0x7F */
+	
+	0x97, 0x56, 0xC1, 0xBA, 0x97, 0x57, 0xE8, 0xE8, /* 0x80-0x83 */
+	0x97, 0x58, 0xC3, 0xB7, 0xB0, 0xF0, 0x97, 0x59, /* 0x84-0x87 */
+	0x97, 0x5A, 0x97, 0x5B, 0x97, 0x5C, 0x97, 0x5D, /* 0x88-0x8B */
+	0x97, 0x5E, 0x97, 0x5F, 0x97, 0x60, 0xE8, 0xF4, /* 0x8C-0x8F */
+	0x97, 0x61, 0x97, 0x62, 0x97, 0x63, 0xE8, 0xF7, /* 0x90-0x93 */
+	0x97, 0x64, 0x97, 0x65, 0x97, 0x66, 0xB9, 0xA3, /* 0x94-0x97 */
+	0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0x98-0x9B */
+	0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0x9C-0x9F */
+	0x97, 0x6F, 0x97, 0x70, 0xC9, 0xD2, 0x97, 0x71, /* 0xA0-0xA3 */
+	0x97, 0x72, 0x97, 0x73, 0xC3, 0xCE, 0xCE, 0xE0, /* 0xA4-0xA7 */
+	0xC0, 0xE6, 0x97, 0x74, 0x97, 0x75, 0x97, 0x76, /* 0xA8-0xAB */
+	0x97, 0x77, 0xCB, 0xF3, 0x97, 0x78, 0xCC, 0xDD, /* 0xAC-0xAF */
+	0xD0, 0xB5, 0x97, 0x79, 0x97, 0x7A, 0xCA, 0xE1, /* 0xB0-0xB3 */
+	0x97, 0x7B, 0xE8, 0xF3, 0x97, 0x7C, 0x97, 0x7D, /* 0xB4-0xB7 */
+	0x97, 0x7E, 0x97, 0x80, 0x97, 0x81, 0x97, 0x82, /* 0xB8-0xBB */
+	0x97, 0x83, 0x97, 0x84, 0x97, 0x85, 0x97, 0x86, /* 0xBC-0xBF */
+	0xBC, 0xEC, 0x97, 0x87, 0xE8, 0xF9, 0x97, 0x88, /* 0xC0-0xC3 */
+	0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, 0x97, 0x8C, /* 0xC4-0xC7 */
+	0x97, 0x8D, 0xC3, 0xDE, 0x97, 0x8E, 0xC6, 0xE5, /* 0xC8-0xCB */
+	0x97, 0x8F, 0xB9, 0xF7, 0x97, 0x90, 0x97, 0x91, /* 0xCC-0xCF */
+	0x97, 0x92, 0x97, 0x93, 0xB0, 0xF4, 0x97, 0x94, /* 0xD0-0xD3 */
+	0x97, 0x95, 0xD7, 0xD8, 0x97, 0x96, 0x97, 0x97, /* 0xD4-0xD7 */
+	0xBC, 0xAC, 0x97, 0x98, 0xC5, 0xEF, 0x97, 0x99, /* 0xD8-0xDB */
+	0x97, 0x9A, 0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, /* 0xDC-0xDF */
+	0xCC, 0xC4, 0x97, 0x9E, 0x97, 0x9F, 0xE9, 0xA6, /* 0xE0-0xE3 */
+	0x97, 0xA0, 0x97, 0xA1, 0x97, 0xA2, 0x97, 0xA3, /* 0xE4-0xE7 */
+	0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE8-0xEB */
+	0x97, 0xA8, 0x97, 0xA9, 0xC9, 0xAD, 0x97, 0xAA, /* 0xEC-0xEF */
+	0xE9, 0xA2, 0xC0, 0xE2, 0x97, 0xAB, 0x97, 0xAC, /* 0xF0-0xF3 */
+	0x97, 0xAD, 0xBF, 0xC3, 0x97, 0xAE, 0x97, 0xAF, /* 0xF4-0xF7 */
+	0x97, 0xB0, 0xE8, 0xFE, 0xB9, 0xD7, 0x97, 0xB1, /* 0xF8-0xFB */
+	0xE8, 0xFB, 0x97, 0xB2, 0x97, 0xB3, 0x97, 0xB4, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_69[512] = {
+	0x97, 0xB5, 0xE9, 0xA4, 0x97, 0xB6, 0x97, 0xB7, /* 0x00-0x03 */
+	0x97, 0xB8, 0xD2, 0xCE, 0x97, 0xB9, 0x97, 0xBA, /* 0x04-0x07 */
+	0x97, 0xBB, 0x97, 0xBC, 0x97, 0xBD, 0xE9, 0xA3, /* 0x08-0x0B */
+	0x97, 0xBE, 0xD6, 0xB2, 0xD7, 0xB5, 0x97, 0xBF, /* 0x0C-0x0F */
+	0xE9, 0xA7, 0x97, 0xC0, 0xBD, 0xB7, 0x97, 0xC1, /* 0x10-0x13 */
+	0x97, 0xC2, 0x97, 0xC3, 0x97, 0xC4, 0x97, 0xC5, /* 0x14-0x17 */
+	0x97, 0xC6, 0x97, 0xC7, 0x97, 0xC8, 0x97, 0xC9, /* 0x18-0x1B */
+	0x97, 0xCA, 0x97, 0xCB, 0x97, 0xCC, 0xE8, 0xFC, /* 0x1C-0x1F */
+	0xE8, 0xFD, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x20-0x23 */
+	0xE9, 0xA1, 0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, /* 0x24-0x27 */
+	0x97, 0xD3, 0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, /* 0x28-0x2B */
+	0x97, 0xD7, 0xCD, 0xD6, 0x97, 0xD8, 0x97, 0xD9, /* 0x2C-0x2F */
+	0xD2, 0xAC, 0x97, 0xDA, 0x97, 0xDB, 0x97, 0xDC, /* 0x30-0x33 */
+	0xE9, 0xB2, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x34-0x37 */
+	0x97, 0xE0, 0xE9, 0xA9, 0x97, 0xE1, 0x97, 0xE2, /* 0x38-0x3B */
+	0x97, 0xE3, 0xB4, 0xAA, 0x97, 0xE4, 0xB4, 0xBB, /* 0x3C-0x3F */
+	0x97, 0xE5, 0x97, 0xE6, 0xE9, 0xAB, 0x97, 0xE7, /* 0x40-0x43 */
+	0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x44-0x47 */
+	0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x48-0x4B */
+	0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x4C-0x4F */
+	0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x50-0x53 */
+	0xD0, 0xA8, 0x97, 0xF8, 0x97, 0xF9, 0xE9, 0xA5, /* 0x54-0x57 */
+	0x97, 0xFA, 0x97, 0xFB, 0xB3, 0xFE, 0x97, 0xFC, /* 0x58-0x5B */
+	0x97, 0xFD, 0xE9, 0xAC, 0xC0, 0xE3, 0x97, 0xFE, /* 0x5C-0x5F */
+	0xE9, 0xAA, 0x98, 0x40, 0x98, 0x41, 0xE9, 0xB9, /* 0x60-0x63 */
+	0x98, 0x42, 0x98, 0x43, 0xE9, 0xB8, 0x98, 0x44, /* 0x64-0x67 */
+	0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0xE9, 0xAE, /* 0x68-0x6B */
+	0x98, 0x48, 0x98, 0x49, 0xE8, 0xFA, 0x98, 0x4A, /* 0x6C-0x6F */
+	0x98, 0x4B, 0xE9, 0xA8, 0x98, 0x4C, 0x98, 0x4D, /* 0x70-0x73 */
+	0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, 0xBF, 0xAC, /* 0x74-0x77 */
+	0xE9, 0xB1, 0xE9, 0xBA, 0x98, 0x51, 0x98, 0x52, /* 0x78-0x7B */
+	0xC2, 0xA5, 0x98, 0x53, 0x98, 0x54, 0x98, 0x55, /* 0x7C-0x7F */
+	
+	0xE9, 0xAF, 0x98, 0x56, 0xB8, 0xC5, 0x98, 0x57, /* 0x80-0x83 */
+	0xE9, 0xAD, 0x98, 0x58, 0xD3, 0xDC, 0xE9, 0xB4, /* 0x84-0x87 */
+	0xE9, 0xB5, 0xE9, 0xB7, 0x98, 0x59, 0x98, 0x5A, /* 0x88-0x8B */
+	0x98, 0x5B, 0xE9, 0xC7, 0x98, 0x5C, 0x98, 0x5D, /* 0x8C-0x8F */
+	0x98, 0x5E, 0x98, 0x5F, 0x98, 0x60, 0x98, 0x61, /* 0x90-0x93 */
+	0xC0, 0xC6, 0xE9, 0xC5, 0x98, 0x62, 0x98, 0x63, /* 0x94-0x97 */
+	0xE9, 0xB0, 0x98, 0x64, 0x98, 0x65, 0xE9, 0xBB, /* 0x98-0x9B */
+	0xB0, 0xF1, 0x98, 0x66, 0x98, 0x67, 0x98, 0x68, /* 0x9C-0x9F */
+	0x98, 0x69, 0x98, 0x6A, 0x98, 0x6B, 0x98, 0x6C, /* 0xA0-0xA3 */
+	0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0xE9, 0xBC, /* 0xA4-0xA7 */
+	0xD5, 0xA5, 0x98, 0x70, 0x98, 0x71, 0xE9, 0xBE, /* 0xA8-0xAB */
+	0x98, 0x72, 0xE9, 0xBF, 0x98, 0x73, 0x98, 0x74, /* 0xAC-0xAF */
+	0x98, 0x75, 0xE9, 0xC1, 0x98, 0x76, 0x98, 0x77, /* 0xB0-0xB3 */
+	0xC1, 0xF1, 0x98, 0x78, 0x98, 0x79, 0xC8, 0xB6, /* 0xB4-0xB7 */
+	0x98, 0x7A, 0x98, 0x7B, 0x98, 0x7C, 0xE9, 0xBD, /* 0xB8-0xBB */
+	0x98, 0x7D, 0x98, 0x7E, 0x98, 0x80, 0x98, 0x81, /* 0xBC-0xBF */
+	0x98, 0x82, 0xE9, 0xC2, 0x98, 0x83, 0x98, 0x84, /* 0xC0-0xC3 */
+	0x98, 0x85, 0x98, 0x86, 0x98, 0x87, 0x98, 0x88, /* 0xC4-0xC7 */
+	0x98, 0x89, 0x98, 0x8A, 0xE9, 0xC3, 0x98, 0x8B, /* 0xC8-0xCB */
+	0xE9, 0xB3, 0x98, 0x8C, 0xE9, 0xB6, 0x98, 0x8D, /* 0xCC-0xCF */
+	0xBB, 0xB1, 0x98, 0x8E, 0x98, 0x8F, 0x98, 0x90, /* 0xD0-0xD3 */
+	0xE9, 0xC0, 0x98, 0x91, 0x98, 0x92, 0x98, 0x93, /* 0xD4-0xD7 */
+	0x98, 0x94, 0x98, 0x95, 0x98, 0x96, 0xBC, 0xF7, /* 0xD8-0xDB */
+	0x98, 0x97, 0x98, 0x98, 0x98, 0x99, 0xE9, 0xC4, /* 0xDC-0xDF */
+	0xE9, 0xC6, 0x98, 0x9A, 0x98, 0x9B, 0x98, 0x9C, /* 0xE0-0xE3 */
+	0x98, 0x9D, 0x98, 0x9E, 0x98, 0x9F, 0x98, 0xA0, /* 0xE4-0xE7 */
+	0x98, 0xA1, 0x98, 0xA2, 0x98, 0xA3, 0x98, 0xA4, /* 0xE8-0xEB */
+	0x98, 0xA5, 0xE9, 0xCA, 0x98, 0xA6, 0x98, 0xA7, /* 0xEC-0xEF */
+	0x98, 0xA8, 0x98, 0xA9, 0xE9, 0xCE, 0x98, 0xAA, /* 0xF0-0xF3 */
+	0x98, 0xAB, 0x98, 0xAC, 0x98, 0xAD, 0x98, 0xAE, /* 0xF4-0xF7 */
+	0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xF8-0xFB */
+	0x98, 0xB3, 0xB2, 0xDB, 0x98, 0xB4, 0xE9, 0xC8, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6A[512] = {
+	0x98, 0xB5, 0x98, 0xB6, 0x98, 0xB7, 0x98, 0xB8, /* 0x00-0x03 */
+	0x98, 0xB9, 0x98, 0xBA, 0x98, 0xBB, 0x98, 0xBC, /* 0x04-0x07 */
+	0x98, 0xBD, 0x98, 0xBE, 0xB7, 0xAE, 0x98, 0xBF, /* 0x08-0x0B */
+	0x98, 0xC0, 0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, /* 0x0C-0x0F */
+	0x98, 0xC4, 0x98, 0xC5, 0x98, 0xC6, 0x98, 0xC7, /* 0x10-0x13 */
+	0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0xE9, 0xCB, /* 0x14-0x17 */
+	0xE9, 0xCC, 0x98, 0xCB, 0x98, 0xCC, 0x98, 0xCD, /* 0x18-0x1B */
+	0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, 0xD5, 0xC1, /* 0x1C-0x1F */
+	0x98, 0xD1, 0xC4, 0xA3, 0x98, 0xD2, 0x98, 0xD3, /* 0x20-0x23 */
+	0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0x24-0x27 */
+	0xE9, 0xD8, 0x98, 0xD8, 0xBA, 0xE1, 0x98, 0xD9, /* 0x28-0x2B */
+	0x98, 0xDA, 0x98, 0xDB, 0x98, 0xDC, 0xE9, 0xC9, /* 0x2C-0x2F */
+	0x98, 0xDD, 0xD3, 0xA3, 0x98, 0xDE, 0x98, 0xDF, /* 0x30-0x33 */
+	0x98, 0xE0, 0xE9, 0xD4, 0x98, 0xE1, 0x98, 0xE2, /* 0x34-0x37 */
+	0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, 0x98, 0xE6, /* 0x38-0x3B */
+	0x98, 0xE7, 0xE9, 0xD7, 0xE9, 0xD0, 0x98, 0xE8, /* 0x3C-0x3F */
+	0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x40-0x43 */
+	0xE9, 0xCF, 0x98, 0xED, 0x98, 0xEE, 0xC7, 0xC1, /* 0x44-0x47 */
+	0x98, 0xEF, 0x98, 0xF0, 0x98, 0xF1, 0x98, 0xF2, /* 0x48-0x4B */
+	0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x4C-0x4F */
+	0xE9, 0xD2, 0x98, 0xF7, 0x98, 0xF8, 0x98, 0xF9, /* 0x50-0x53 */
+	0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x54-0x57 */
+	0xE9, 0xD9, 0xB3, 0xC8, 0x98, 0xFE, 0xE9, 0xD3, /* 0x58-0x5B */
+	0x99, 0x40, 0x99, 0x41, 0x99, 0x42, 0x99, 0x43, /* 0x5C-0x5F */
+	0x99, 0x44, 0xCF, 0xF0, 0x99, 0x45, 0x99, 0x46, /* 0x60-0x63 */
+	0x99, 0x47, 0xE9, 0xCD, 0x99, 0x48, 0x99, 0x49, /* 0x64-0x67 */
+	0x99, 0x4A, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x68-0x6B */
+	0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x6C-0x6F */
+	0x99, 0x52, 0xB3, 0xF7, 0x99, 0x53, 0x99, 0x54, /* 0x70-0x73 */
+	0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x74-0x77 */
+	0x99, 0x59, 0xE9, 0xD6, 0x99, 0x5A, 0x99, 0x5B, /* 0x78-0x7B */
+	0xE9, 0xDA, 0x99, 0x5C, 0x99, 0x5D, 0x99, 0x5E, /* 0x7C-0x7F */
+	
+	0xCC, 0xB4, 0x99, 0x5F, 0x99, 0x60, 0x99, 0x61, /* 0x80-0x83 */
+	0xCF, 0xAD, 0x99, 0x62, 0x99, 0x63, 0x99, 0x64, /* 0x84-0x87 */
+	0x99, 0x65, 0x99, 0x66, 0x99, 0x67, 0x99, 0x68, /* 0x88-0x8B */
+	0x99, 0x69, 0x99, 0x6A, 0xE9, 0xD5, 0x99, 0x6B, /* 0x8C-0x8F */
+	0xE9, 0xDC, 0xE9, 0xDB, 0x99, 0x6C, 0x99, 0x6D, /* 0x90-0x93 */
+	0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0xE9, 0xDE, /* 0x94-0x97 */
+	0x99, 0x71, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x98-0x9B */
+	0x99, 0x75, 0x99, 0x76, 0x99, 0x77, 0x99, 0x78, /* 0x9C-0x9F */
+	0xE9, 0xD1, 0x99, 0x79, 0x99, 0x7A, 0x99, 0x7B, /* 0xA0-0xA3 */
+	0x99, 0x7C, 0x99, 0x7D, 0x99, 0x7E, 0x99, 0x80, /* 0xA4-0xA7 */
+	0x99, 0x81, 0xE9, 0xDD, 0x99, 0x82, 0xE9, 0xDF, /* 0xA8-0xAB */
+	0xC3, 0xCA, 0x99, 0x83, 0x99, 0x84, 0x99, 0x85, /* 0xAC-0xAF */
+	0x99, 0x86, 0x99, 0x87, 0x99, 0x88, 0x99, 0x89, /* 0xB0-0xB3 */
+	0x99, 0x8A, 0x99, 0x8B, 0x99, 0x8C, 0x99, 0x8D, /* 0xB4-0xB7 */
+	0x99, 0x8E, 0x99, 0x8F, 0x99, 0x90, 0x99, 0x91, /* 0xB8-0xBB */
+	0x99, 0x92, 0x99, 0x93, 0x99, 0x94, 0x99, 0x95, /* 0xBC-0xBF */
+	0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0xC0-0xC3 */
+	0x99, 0x9A, 0x99, 0x9B, 0x99, 0x9C, 0x99, 0x9D, /* 0xC4-0xC7 */
+	0x99, 0x9E, 0x99, 0x9F, 0x99, 0xA0, 0x99, 0xA1, /* 0xC8-0xCB */
+	0x99, 0xA2, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xCC-0xCF */
+	0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, 0x99, 0xA9, /* 0xD0-0xD3 */
+	0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, 0x99, 0xAD, /* 0xD4-0xD7 */
+	0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, 0x99, 0xB1, /* 0xD8-0xDB */
+	0x99, 0xB2, 0x99, 0xB3, 0x99, 0xB4, 0x99, 0xB5, /* 0xDC-0xDF */
+	0x99, 0xB6, 0x99, 0xB7, 0x99, 0xB8, 0x99, 0xB9, /* 0xE0-0xE3 */
+	0x99, 0xBA, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xE4-0xE7 */
+	0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, 0x99, 0xC1, /* 0xE8-0xEB */
+	0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, 0x99, 0xC5, /* 0xEC-0xEF */
+	0x99, 0xC6, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xF0-0xF3 */
+	0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xF4-0xF7 */
+	0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, 0x99, 0xD1, /* 0xF8-0xFB */
+	0x99, 0xD2, 0x99, 0xD3, 0x99, 0xD4, 0x99, 0xD5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6B[512] = {
+	0x99, 0xD6, 0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, /* 0x00-0x03 */
+	0x99, 0xDA, 0x99, 0xDB, 0x99, 0xDC, 0x99, 0xDD, /* 0x04-0x07 */
+	0x99, 0xDE, 0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE1, /* 0x08-0x0B */
+	0x99, 0xE2, 0x99, 0xE3, 0x99, 0xE4, 0x99, 0xE5, /* 0x0C-0x0F */
+	0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, 0x99, 0xE9, /* 0x10-0x13 */
+	0x99, 0xEA, 0x99, 0xEB, 0x99, 0xEC, 0x99, 0xED, /* 0x14-0x17 */
+	0x99, 0xEE, 0x99, 0xEF, 0x99, 0xF0, 0x99, 0xF1, /* 0x18-0x1B */
+	0x99, 0xF2, 0x99, 0xF3, 0x99, 0xF4, 0x99, 0xF5, /* 0x1C-0x1F */
+	0xC7, 0xB7, 0xB4, 0xCE, 0xBB, 0xB6, 0xD0, 0xC0, /* 0x20-0x23 */
+	0xEC, 0xA3, 0x99, 0xF6, 0x99, 0xF7, 0xC5, 0xB7, /* 0x24-0x27 */
+	0x99, 0xF8, 0x99, 0xF9, 0x99, 0xFA, 0x99, 0xFB, /* 0x28-0x2B */
+	0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, 0x9A, 0x40, /* 0x2C-0x2F */
+	0x9A, 0x41, 0x9A, 0x42, 0xD3, 0xFB, 0x9A, 0x43, /* 0x30-0x33 */
+	0x9A, 0x44, 0x9A, 0x45, 0x9A, 0x46, 0xEC, 0xA4, /* 0x34-0x37 */
+	0x9A, 0x47, 0xEC, 0xA5, 0xC6, 0xDB, 0x9A, 0x48, /* 0x38-0x3B */
+	0x9A, 0x49, 0x9A, 0x4A, 0xBF, 0xEE, 0x9A, 0x4B, /* 0x3C-0x3F */
+	0x9A, 0x4C, 0x9A, 0x4D, 0x9A, 0x4E, 0xEC, 0xA6, /* 0x40-0x43 */
+	0x9A, 0x4F, 0x9A, 0x50, 0xEC, 0xA7, 0xD0, 0xAA, /* 0x44-0x47 */
+	0x9A, 0x51, 0xC7, 0xB8, 0x9A, 0x52, 0x9A, 0x53, /* 0x48-0x4B */
+	0xB8, 0xE8, 0x9A, 0x54, 0x9A, 0x55, 0x9A, 0x56, /* 0x4C-0x4F */
+	0x9A, 0x57, 0x9A, 0x58, 0x9A, 0x59, 0x9A, 0x5A, /* 0x50-0x53 */
+	0x9A, 0x5B, 0x9A, 0x5C, 0x9A, 0x5D, 0x9A, 0x5E, /* 0x54-0x57 */
+	0x9A, 0x5F, 0xEC, 0xA8, 0x9A, 0x60, 0x9A, 0x61, /* 0x58-0x5B */
+	0x9A, 0x62, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x5C-0x5F */
+	0x9A, 0x66, 0x9A, 0x67, 0xD6, 0xB9, 0xD5, 0xFD, /* 0x60-0x63 */
+	0xB4, 0xCB, 0xB2, 0xBD, 0xCE, 0xE4, 0xC6, 0xE7, /* 0x64-0x67 */
+	0x9A, 0x68, 0x9A, 0x69, 0xCD, 0xE1, 0x9A, 0x6A, /* 0x68-0x6B */
+	0x9A, 0x6B, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x6C-0x6F */
+	0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, 0x9A, 0x72, /* 0x70-0x73 */
+	0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, 0x9A, 0x76, /* 0x74-0x77 */
+	0x9A, 0x77, 0xB4, 0xF5, 0x9A, 0x78, 0xCB, 0xC0, /* 0x78-0x7B */
+	0xBC, 0xDF, 0x9A, 0x79, 0x9A, 0x7A, 0x9A, 0x7B, /* 0x7C-0x7F */
+	
+	0x9A, 0x7C, 0xE9, 0xE2, 0xE9, 0xE3, 0xD1, 0xEA, /* 0x80-0x83 */
+	0xE9, 0xE5, 0x9A, 0x7D, 0xB4, 0xF9, 0xE9, 0xE4, /* 0x84-0x87 */
+	0x9A, 0x7E, 0xD1, 0xB3, 0xCA, 0xE2, 0xB2, 0xD0, /* 0x88-0x8B */
+	0x9A, 0x80, 0xE9, 0xE8, 0x9A, 0x81, 0x9A, 0x82, /* 0x8C-0x8F */
+	0x9A, 0x83, 0x9A, 0x84, 0xE9, 0xE6, 0xE9, 0xE7, /* 0x90-0x93 */
+	0x9A, 0x85, 0x9A, 0x86, 0xD6, 0xB3, 0x9A, 0x87, /* 0x94-0x97 */
+	0x9A, 0x88, 0x9A, 0x89, 0xE9, 0xE9, 0xE9, 0xEA, /* 0x98-0x9B */
+	0x9A, 0x8A, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x9C-0x9F */
+	0x9A, 0x8E, 0xE9, 0xEB, 0x9A, 0x8F, 0x9A, 0x90, /* 0xA0-0xA3 */
+	0x9A, 0x91, 0x9A, 0x92, 0x9A, 0x93, 0x9A, 0x94, /* 0xA4-0xA7 */
+	0x9A, 0x95, 0x9A, 0x96, 0xE9, 0xEC, 0x9A, 0x97, /* 0xA8-0xAB */
+	0x9A, 0x98, 0x9A, 0x99, 0x9A, 0x9A, 0x9A, 0x9B, /* 0xAC-0xAF */
+	0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0xEC, 0xAF, /* 0xB0-0xB3 */
+	0xC5, 0xB9, 0xB6, 0xCE, 0x9A, 0x9F, 0xD2, 0xF3, /* 0xB4-0xB7 */
+	0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, 0x9A, 0xA3, /* 0xB8-0xBB */
+	0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, 0xB5, 0xEE, /* 0xBC-0xBF */
+	0x9A, 0xA7, 0xBB, 0xD9, 0xEC, 0xB1, 0x9A, 0xA8, /* 0xC0-0xC3 */
+	0x9A, 0xA9, 0xD2, 0xE3, 0x9A, 0xAA, 0x9A, 0xAB, /* 0xC4-0xC7 */
+	0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0xCE, 0xE3, /* 0xC8-0xCB */
+	0x9A, 0xAF, 0xC4, 0xB8, 0x9A, 0xB0, 0xC3, 0xBF, /* 0xCC-0xCF */
+	0x9A, 0xB1, 0x9A, 0xB2, 0xB6, 0xBE, 0xD8, 0xB9, /* 0xD0-0xD3 */
+	0xB1, 0xC8, 0xB1, 0xCF, 0xB1, 0xD1, 0xC5, 0xFE, /* 0xD4-0xD7 */
+	0x9A, 0xB3, 0xB1, 0xD0, 0x9A, 0xB4, 0xC3, 0xAB, /* 0xD8-0xDB */
+	0x9A, 0xB5, 0x9A, 0xB6, 0x9A, 0xB7, 0x9A, 0xB8, /* 0xDC-0xDF */
+	0x9A, 0xB9, 0xD5, 0xB1, 0x9A, 0xBA, 0x9A, 0xBB, /* 0xE0-0xE3 */
+	0x9A, 0xBC, 0x9A, 0xBD, 0x9A, 0xBE, 0x9A, 0xBF, /* 0xE4-0xE7 */
+	0x9A, 0xC0, 0x9A, 0xC1, 0xEB, 0xA4, 0xBA, 0xC1, /* 0xE8-0xEB */
+	0x9A, 0xC2, 0x9A, 0xC3, 0x9A, 0xC4, 0xCC, 0xBA, /* 0xEC-0xEF */
+	0x9A, 0xC5, 0x9A, 0xC6, 0x9A, 0xC7, 0xEB, 0xA5, /* 0xF0-0xF3 */
+	0x9A, 0xC8, 0xEB, 0xA7, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xF4-0xF7 */
+	0x9A, 0xCB, 0xEB, 0xA8, 0x9A, 0xCC, 0x9A, 0xCD, /* 0xF8-0xFB */
+	0x9A, 0xCE, 0xEB, 0xA6, 0x9A, 0xCF, 0x9A, 0xD0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6C[512] = {
+	0x9A, 0xD1, 0x9A, 0xD2, 0x9A, 0xD3, 0x9A, 0xD4, /* 0x00-0x03 */
+	0x9A, 0xD5, 0xEB, 0xA9, 0xEB, 0xAB, 0xEB, 0xAA, /* 0x04-0x07 */
+	0x9A, 0xD6, 0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, /* 0x08-0x0B */
+	0x9A, 0xDA, 0xEB, 0xAC, 0x9A, 0xDB, 0xCA, 0xCF, /* 0x0C-0x0F */
+	0xD8, 0xB5, 0xC3, 0xF1, 0x9A, 0xDC, 0xC3, 0xA5, /* 0x10-0x13 */
+	0xC6, 0xF8, 0xEB, 0xAD, 0xC4, 0xCA, 0x9A, 0xDD, /* 0x14-0x17 */
+	0xEB, 0xAE, 0xEB, 0xAF, 0xEB, 0xB0, 0xB7, 0xD5, /* 0x18-0x1B */
+	0x9A, 0xDE, 0x9A, 0xDF, 0x9A, 0xE0, 0xB7, 0xFA, /* 0x1C-0x1F */
+	0x9A, 0xE1, 0xEB, 0xB1, 0xC7, 0xE2, 0x9A, 0xE2, /* 0x20-0x23 */
+	0xEB, 0xB3, 0x9A, 0xE3, 0xBA, 0xA4, 0xD1, 0xF5, /* 0x24-0x27 */
+	0xB0, 0xB1, 0xEB, 0xB2, 0xEB, 0xB4, 0x9A, 0xE4, /* 0x28-0x2B */
+	0x9A, 0xE5, 0x9A, 0xE6, 0xB5, 0xAA, 0xC2, 0xC8, /* 0x2C-0x2F */
+	0xC7, 0xE8, 0x9A, 0xE7, 0xEB, 0xB5, 0x9A, 0xE8, /* 0x30-0x33 */
+	0xCB, 0xAE, 0xE3, 0xDF, 0x9A, 0xE9, 0x9A, 0xEA, /* 0x34-0x37 */
+	0xD3, 0xC0, 0x9A, 0xEB, 0x9A, 0xEC, 0x9A, 0xED, /* 0x38-0x3B */
+	0x9A, 0xEE, 0xD9, 0xDB, 0x9A, 0xEF, 0x9A, 0xF0, /* 0x3C-0x3F */
+	0xCD, 0xA1, 0xD6, 0xAD, 0xC7, 0xF3, 0x9A, 0xF1, /* 0x40-0x43 */
+	0x9A, 0xF2, 0x9A, 0xF3, 0xD9, 0xE0, 0xBB, 0xE3, /* 0x44-0x47 */
+	0x9A, 0xF4, 0xBA, 0xBA, 0xE3, 0xE2, 0x9A, 0xF5, /* 0x48-0x4B */
+	0x9A, 0xF6, 0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, /* 0x4C-0x4F */
+	0xCF, 0xAB, 0x9A, 0xFA, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x50-0x53 */
+	0xE3, 0xE0, 0xC9, 0xC7, 0x9A, 0xFD, 0xBA, 0xB9, /* 0x54-0x57 */
+	0x9A, 0xFE, 0x9B, 0x40, 0x9B, 0x41, 0xD1, 0xB4, /* 0x58-0x5B */
+	0xE3, 0xE1, 0xC8, 0xEA, 0xB9, 0xAF, 0xBD, 0xAD, /* 0x5C-0x5F */
+	0xB3, 0xD8, 0xCE, 0xDB, 0x9B, 0x42, 0x9B, 0x43, /* 0x60-0x63 */
+	0xCC, 0xC0, 0x9B, 0x44, 0x9B, 0x45, 0x9B, 0x46, /* 0x64-0x67 */
+	0xE3, 0xE8, 0xE3, 0xE9, 0xCD, 0xF4, 0x9B, 0x47, /* 0x68-0x6B */
+	0x9B, 0x48, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x6C-0x6F */
+	0xCC, 0xAD, 0x9B, 0x4C, 0xBC, 0xB3, 0x9B, 0x4D, /* 0x70-0x73 */
+	0xE3, 0xEA, 0x9B, 0x4E, 0xE3, 0xEB, 0x9B, 0x4F, /* 0x74-0x77 */
+	0x9B, 0x50, 0xD0, 0xDA, 0x9B, 0x51, 0x9B, 0x52, /* 0x78-0x7B */
+	0x9B, 0x53, 0xC6, 0xFB, 0xB7, 0xDA, 0x9B, 0x54, /* 0x7C-0x7F */
+	
+	0x9B, 0x55, 0xC7, 0xDF, 0xD2, 0xCA, 0xCE, 0xD6, /* 0x80-0x83 */
+	0x9B, 0x56, 0xE3, 0xE4, 0xE3, 0xEC, 0x9B, 0x57, /* 0x84-0x87 */
+	0xC9, 0xF2, 0xB3, 0xC1, 0x9B, 0x58, 0x9B, 0x59, /* 0x88-0x8B */
+	0xE3, 0xE7, 0x9B, 0x5A, 0x9B, 0x5B, 0xC6, 0xE3, /* 0x8C-0x8F */
+	0xE3, 0xE5, 0x9B, 0x5C, 0x9B, 0x5D, 0xED, 0xB3, /* 0x90-0x93 */
+	0xE3, 0xE6, 0x9B, 0x5E, 0x9B, 0x5F, 0x9B, 0x60, /* 0x94-0x97 */
+	0x9B, 0x61, 0xC9, 0xB3, 0x9B, 0x62, 0xC5, 0xE6, /* 0x98-0x9B */
+	0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, 0xB9, 0xB5, /* 0x9C-0x9F */
+	0x9B, 0x66, 0xC3, 0xBB, 0x9B, 0x67, 0xE3, 0xE3, /* 0xA0-0xA3 */
+	0xC5, 0xBD, 0xC1, 0xA4, 0xC2, 0xD9, 0xB2, 0xD7, /* 0xA4-0xA7 */
+	0x9B, 0x68, 0xE3, 0xED, 0xBB, 0xA6, 0xC4, 0xAD, /* 0xA8-0xAB */
+	0x9B, 0x69, 0xE3, 0xF0, 0xBE, 0xDA, 0x9B, 0x6A, /* 0xAC-0xAF */
+	0x9B, 0x6B, 0xE3, 0xFB, 0xE3, 0xF5, 0xBA, 0xD3, /* 0xB0-0xB3 */
+	0x9B, 0x6C, 0x9B, 0x6D, 0x9B, 0x6E, 0x9B, 0x6F, /* 0xB4-0xB7 */
+	0xB7, 0xD0, 0xD3, 0xCD, 0x9B, 0x70, 0xD6, 0xCE, /* 0xB8-0xBB */
+	0xD5, 0xD3, 0xB9, 0xC1, 0xD5, 0xB4, 0xD1, 0xD8, /* 0xBC-0xBF */
+	0x9B, 0x71, 0x9B, 0x72, 0x9B, 0x73, 0x9B, 0x74, /* 0xC0-0xC3 */
+	0xD0, 0xB9, 0xC7, 0xF6, 0x9B, 0x75, 0x9B, 0x76, /* 0xC4-0xC7 */
+	0x9B, 0x77, 0xC8, 0xAA, 0xB2, 0xB4, 0x9B, 0x78, /* 0xC8-0xCB */
+	0xC3, 0xDA, 0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x7B, /* 0xCC-0xCF */
+	0xE3, 0xEE, 0x9B, 0x7C, 0x9B, 0x7D, 0xE3, 0xFC, /* 0xD0-0xD3 */
+	0xE3, 0xEF, 0xB7, 0xA8, 0xE3, 0xF7, 0xE3, 0xF4, /* 0xD4-0xD7 */
+	0x9B, 0x7E, 0x9B, 0x80, 0x9B, 0x81, 0xB7, 0xBA, /* 0xD8-0xDB */
+	0x9B, 0x82, 0x9B, 0x83, 0xC5, 0xA2, 0x9B, 0x84, /* 0xDC-0xDF */
+	0xE3, 0xF6, 0xC5, 0xDD, 0xB2, 0xA8, 0xC6, 0xFC, /* 0xE0-0xE3 */
+	0x9B, 0x85, 0xC4, 0xE0, 0x9B, 0x86, 0x9B, 0x87, /* 0xE4-0xE7 */
+	0xD7, 0xA2, 0x9B, 0x88, 0xC0, 0xE1, 0xE3, 0xF9, /* 0xE8-0xEB */
+	0x9B, 0x89, 0x9B, 0x8A, 0xE3, 0xFA, 0xE3, 0xFD, /* 0xEC-0xEF */
+	0xCC, 0xA9, 0xE3, 0xF3, 0x9B, 0x8B, 0xD3, 0xBE, /* 0xF0-0xF3 */
+	0x9B, 0x8C, 0xB1, 0xC3, 0xED, 0xB4, 0xE3, 0xF1, /* 0xF4-0xF7 */
+	0xE3, 0xF2, 0x9B, 0x8D, 0xE3, 0xF8, 0xD0, 0xBA, /* 0xF8-0xFB */
+	0xC6, 0xC3, 0xD4, 0xF3, 0xE3, 0xFE, 0x9B, 0x8E, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6D[512] = {
+	0x9B, 0x8F, 0xBD, 0xE0, 0x9B, 0x90, 0x9B, 0x91, /* 0x00-0x03 */
+	0xE4, 0xA7, 0x9B, 0x92, 0x9B, 0x93, 0xE4, 0xA6, /* 0x04-0x07 */
+	0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, 0xD1, 0xF3, /* 0x08-0x0B */
+	0xE4, 0xA3, 0x9B, 0x97, 0xE4, 0xA9, 0x9B, 0x98, /* 0x0C-0x0F */
+	0x9B, 0x99, 0x9B, 0x9A, 0xC8, 0xF7, 0x9B, 0x9B, /* 0x10-0x13 */
+	0x9B, 0x9C, 0x9B, 0x9D, 0x9B, 0x9E, 0xCF, 0xB4, /* 0x14-0x17 */
+	0x9B, 0x9F, 0xE4, 0xA8, 0xE4, 0xAE, 0xC2, 0xE5, /* 0x18-0x1B */
+	0x9B, 0xA0, 0x9B, 0xA1, 0xB6, 0xB4, 0x9B, 0xA2, /* 0x1C-0x1F */
+	0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, 0x9B, 0xA6, /* 0x20-0x23 */
+	0x9B, 0xA7, 0xBD, 0xF2, 0x9B, 0xA8, 0xE4, 0xA2, /* 0x24-0x27 */
+	0x9B, 0xA9, 0x9B, 0xAA, 0xBA, 0xE9, 0xE4, 0xAA, /* 0x28-0x2B */
+	0x9B, 0xAB, 0x9B, 0xAC, 0xE4, 0xAC, 0x9B, 0xAD, /* 0x2C-0x2F */
+	0x9B, 0xAE, 0xB6, 0xFD, 0xD6, 0xDE, 0xE4, 0xB2, /* 0x30-0x33 */
+	0x9B, 0xAF, 0xE4, 0xAD, 0x9B, 0xB0, 0x9B, 0xB1, /* 0x34-0x37 */
+	0x9B, 0xB2, 0xE4, 0xA1, 0x9B, 0xB3, 0xBB, 0xEE, /* 0x38-0x3B */
+	0xCD, 0xDD, 0xC7, 0xA2, 0xC5, 0xC9, 0x9B, 0xB4, /* 0x3C-0x3F */
+	0x9B, 0xB5, 0xC1, 0xF7, 0x9B, 0xB6, 0xE4, 0xA4, /* 0x40-0x43 */
+	0x9B, 0xB7, 0xC7, 0xB3, 0xBD, 0xAC, 0xBD, 0xBD, /* 0x44-0x47 */
+	0xE4, 0xA5, 0x9B, 0xB8, 0xD7, 0xC7, 0xB2, 0xE2, /* 0x48-0x4B */
+	0x9B, 0xB9, 0xE4, 0xAB, 0xBC, 0xC3, 0xE4, 0xAF, /* 0x4C-0x4F */
+	0x9B, 0xBA, 0xBB, 0xEB, 0xE4, 0xB0, 0xC5, 0xA8, /* 0x50-0x53 */
+	0xE4, 0xB1, 0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, /* 0x54-0x57 */
+	0x9B, 0xBE, 0xD5, 0xE3, 0xBF, 0xA3, 0x9B, 0xBF, /* 0x58-0x5B */
+	0xE4, 0xBA, 0x9B, 0xC0, 0xE4, 0xB7, 0x9B, 0xC1, /* 0x5C-0x5F */
+	0xE4, 0xBB, 0x9B, 0xC2, 0x9B, 0xC3, 0xE4, 0xBD, /* 0x60-0x63 */
+	0x9B, 0xC4, 0x9B, 0xC5, 0xC6, 0xD6, 0x9B, 0xC6, /* 0x64-0x67 */
+	0x9B, 0xC7, 0xBA, 0xC6, 0xC0, 0xCB, 0x9B, 0xC8, /* 0x68-0x6B */
+	0x9B, 0xC9, 0x9B, 0xCA, 0xB8, 0xA1, 0xE4, 0xB4, /* 0x6C-0x6F */
+	0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0x70-0x73 */
+	0xD4, 0xA1, 0x9B, 0xCF, 0x9B, 0xD0, 0xBA, 0xA3, /* 0x74-0x77 */
+	0xBD, 0xFE, 0x9B, 0xD1, 0x9B, 0xD2, 0x9B, 0xD3, /* 0x78-0x7B */
+	0xE4, 0xBC, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0x7C-0x7F */
+	
+	0x9B, 0xD7, 0x9B, 0xD8, 0xCD, 0xBF, 0x9B, 0xD9, /* 0x80-0x83 */
+	0x9B, 0xDA, 0xC4, 0xF9, 0x9B, 0xDB, 0x9B, 0xDC, /* 0x84-0x87 */
+	0xCF, 0xFB, 0xC9, 0xE6, 0x9B, 0xDD, 0x9B, 0xDE, /* 0x88-0x8B */
+	0xD3, 0xBF, 0x9B, 0xDF, 0xCF, 0xD1, 0x9B, 0xE0, /* 0x8C-0x8F */
+	0x9B, 0xE1, 0xE4, 0xB3, 0x9B, 0xE2, 0xE4, 0xB8, /* 0x90-0x93 */
+	0xE4, 0xB9, 0xCC, 0xE9, 0x9B, 0xE3, 0x9B, 0xE4, /* 0x94-0x97 */
+	0x9B, 0xE5, 0x9B, 0xE6, 0x9B, 0xE7, 0xCC, 0xCE, /* 0x98-0x9B */
+	0x9B, 0xE8, 0xC0, 0xD4, 0xE4, 0xB5, 0xC1, 0xB0, /* 0x9C-0x9F */
+	0xE4, 0xB6, 0xCE, 0xD0, 0x9B, 0xE9, 0xBB, 0xC1, /* 0xA0-0xA3 */
+	0xB5, 0xD3, 0x9B, 0xEA, 0xC8, 0xF3, 0xBD, 0xA7, /* 0xA4-0xA7 */
+	0xD5, 0xC7, 0xC9, 0xAC, 0xB8, 0xA2, 0xE4, 0xCA, /* 0xA8-0xAB */
+	0x9B, 0xEB, 0x9B, 0xEC, 0xE4, 0xCC, 0xD1, 0xC4, /* 0xAC-0xAF */
+	0x9B, 0xED, 0x9B, 0xEE, 0xD2, 0xBA, 0x9B, 0xEF, /* 0xB0-0xB3 */
+	0x9B, 0xF0, 0xBA, 0xAD, 0x9B, 0xF1, 0x9B, 0xF2, /* 0xB4-0xB7 */
+	0xBA, 0xD4, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xB8-0xBB */
+	0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0xE4, 0xC3, /* 0xBC-0xBF */
+	0xB5, 0xED, 0x9B, 0xF9, 0x9B, 0xFA, 0x9B, 0xFB, /* 0xC0-0xC3 */
+	0xD7, 0xCD, 0xE4, 0xC0, 0xCF, 0xFD, 0xE4, 0xBF, /* 0xC4-0xC7 */
+	0x9B, 0xFC, 0x9B, 0xFD, 0x9B, 0xFE, 0xC1, 0xDC, /* 0xC8-0xCB */
+	0xCC, 0xCA, 0x9C, 0x40, 0x9C, 0x41, 0x9C, 0x42, /* 0xCC-0xCF */
+	0x9C, 0x43, 0xCA, 0xE7, 0x9C, 0x44, 0x9C, 0x45, /* 0xD0-0xD3 */
+	0x9C, 0x46, 0x9C, 0x47, 0xC4, 0xD7, 0x9C, 0x48, /* 0xD4-0xD7 */
+	0xCC, 0xD4, 0xE4, 0xC8, 0x9C, 0x49, 0x9C, 0x4A, /* 0xD8-0xDB */
+	0x9C, 0x4B, 0xE4, 0xC7, 0xE4, 0xC1, 0x9C, 0x4C, /* 0xDC-0xDF */
+	0xE4, 0xC4, 0xB5, 0xAD, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xE0-0xE3 */
+	0xD3, 0xD9, 0x9C, 0x4F, 0xE4, 0xC6, 0x9C, 0x50, /* 0xE4-0xE7 */
+	0x9C, 0x51, 0x9C, 0x52, 0x9C, 0x53, 0xD2, 0xF9, /* 0xE8-0xEB */
+	0xB4, 0xE3, 0x9C, 0x54, 0xBB, 0xB4, 0x9C, 0x55, /* 0xEC-0xEF */
+	0x9C, 0x56, 0xC9, 0xEE, 0x9C, 0x57, 0xB4, 0xBE, /* 0xF0-0xF3 */
+	0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0xBB, 0xEC, /* 0xF4-0xF7 */
+	0x9C, 0x5B, 0xD1, 0xCD, 0x9C, 0x5C, 0xCC, 0xED, /* 0xF8-0xFB */
+	0xED, 0xB5, 0x9C, 0x5D, 0x9C, 0x5E, 0x9C, 0x5F, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6E[512] = {
+	0x9C, 0x60, 0x9C, 0x61, 0x9C, 0x62, 0x9C, 0x63, /* 0x00-0x03 */
+	0x9C, 0x64, 0xC7, 0xE5, 0x9C, 0x65, 0x9C, 0x66, /* 0x04-0x07 */
+	0x9C, 0x67, 0x9C, 0x68, 0xD4, 0xA8, 0x9C, 0x69, /* 0x08-0x0B */
+	0xE4, 0xCB, 0xD7, 0xD5, 0xE4, 0xC2, 0x9C, 0x6A, /* 0x0C-0x0F */
+	0xBD, 0xA5, 0xE4, 0xC5, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x10-0x13 */
+	0xD3, 0xE6, 0x9C, 0x6D, 0xE4, 0xC9, 0xC9, 0xF8, /* 0x14-0x17 */
+	0x9C, 0x6E, 0x9C, 0x6F, 0xE4, 0xBE, 0x9C, 0x70, /* 0x18-0x1B */
+	0x9C, 0x71, 0xD3, 0xE5, 0x9C, 0x72, 0x9C, 0x73, /* 0x1C-0x1F */
+	0xC7, 0xFE, 0xB6, 0xC9, 0x9C, 0x74, 0xD4, 0xFC, /* 0x20-0x23 */
+	0xB2, 0xB3, 0xE4, 0xD7, 0x9C, 0x75, 0x9C, 0x76, /* 0x24-0x27 */
+	0x9C, 0x77, 0xCE, 0xC2, 0x9C, 0x78, 0xE4, 0xCD, /* 0x28-0x2B */
+	0x9C, 0x79, 0xCE, 0xBC, 0x9C, 0x7A, 0xB8, 0xDB, /* 0x2C-0x2F */
+	0x9C, 0x7B, 0x9C, 0x7C, 0xE4, 0xD6, 0x9C, 0x7D, /* 0x30-0x33 */
+	0xBF, 0xCA, 0x9C, 0x7E, 0x9C, 0x80, 0x9C, 0x81, /* 0x34-0x37 */
+	0xD3, 0xCE, 0x9C, 0x82, 0xC3, 0xEC, 0x9C, 0x83, /* 0x38-0x3B */
+	0x9C, 0x84, 0x9C, 0x85, 0x9C, 0x86, 0x9C, 0x87, /* 0x3C-0x3F */
+	0x9C, 0x88, 0x9C, 0x89, 0x9C, 0x8A, 0xC5, 0xC8, /* 0x40-0x43 */
+	0xE4, 0xD8, 0x9C, 0x8B, 0x9C, 0x8C, 0x9C, 0x8D, /* 0x44-0x47 */
+	0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, 0x9C, 0x91, /* 0x48-0x4B */
+	0x9C, 0x92, 0xCD, 0xC4, 0xE4, 0xCF, 0x9C, 0x93, /* 0x4C-0x4F */
+	0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, 0xE4, 0xD4, /* 0x50-0x53 */
+	0xE4, 0xD5, 0x9C, 0x97, 0xBA, 0xFE, 0x9C, 0x98, /* 0x54-0x57 */
+	0xCF, 0xE6, 0x9C, 0x99, 0x9C, 0x9A, 0xD5, 0xBF, /* 0x58-0x5B */
+	0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, 0xE4, 0xD2, /* 0x5C-0x5F */
+	0x9C, 0x9E, 0x9C, 0x9F, 0x9C, 0xA0, 0x9C, 0xA1, /* 0x60-0x63 */
+	0x9C, 0xA2, 0x9C, 0xA3, 0x9C, 0xA4, 0x9C, 0xA5, /* 0x64-0x67 */
+	0x9C, 0xA6, 0x9C, 0xA7, 0x9C, 0xA8, 0xE4, 0xD0, /* 0x68-0x6B */
+	0x9C, 0xA9, 0x9C, 0xAA, 0xE4, 0xCE, 0x9C, 0xAB, /* 0x6C-0x6F */
+	0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, 0x9C, 0xAF, /* 0x70-0x73 */
+	0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, 0x9C, 0xB3, /* 0x74-0x77 */
+	0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, 0x9C, 0xB7, /* 0x78-0x7B */
+	0x9C, 0xB8, 0x9C, 0xB9, 0xCD, 0xE5, 0xCA, 0xAA, /* 0x7C-0x7F */
+	
+	0x9C, 0xBA, 0x9C, 0xBB, 0x9C, 0xBC, 0xC0, 0xA3, /* 0x80-0x83 */
+	0x9C, 0xBD, 0xBD, 0xA6, 0xE4, 0xD3, 0x9C, 0xBE, /* 0x84-0x87 */
+	0x9C, 0xBF, 0xB8, 0xC8, 0x9C, 0xC0, 0x9C, 0xC1, /* 0x88-0x8B */
+	0x9C, 0xC2, 0x9C, 0xC3, 0x9C, 0xC4, 0xE4, 0xE7, /* 0x8C-0x8F */
+	0xD4, 0xB4, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x90-0x93 */
+	0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, 0x9C, 0xCB, /* 0x94-0x97 */
+	0xE4, 0xDB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x98-0x9B */
+	0xC1, 0xEF, 0x9C, 0xCF, 0x9C, 0xD0, 0xE4, 0xE9, /* 0x9C-0x9F */
+	0x9C, 0xD1, 0x9C, 0xD2, 0xD2, 0xE7, 0x9C, 0xD3, /* 0xA0-0xA3 */
+	0x9C, 0xD4, 0xE4, 0xDF, 0x9C, 0xD5, 0xE4, 0xE0, /* 0xA4-0xA7 */
+	0x9C, 0xD6, 0x9C, 0xD7, 0xCF, 0xAA, 0x9C, 0xD8, /* 0xA8-0xAB */
+	0x9C, 0xD9, 0x9C, 0xDA, 0x9C, 0xDB, 0xCB, 0xDD, /* 0xAC-0xAF */
+	0x9C, 0xDC, 0xE4, 0xDA, 0xE4, 0xD1, 0x9C, 0xDD, /* 0xB0-0xB3 */
+	0xE4, 0xE5, 0x9C, 0xDE, 0xC8, 0xDC, 0xE4, 0xE3, /* 0xB4-0xB7 */
+	0x9C, 0xDF, 0x9C, 0xE0, 0xC4, 0xE7, 0xE4, 0xE2, /* 0xB8-0xBB */
+	0x9C, 0xE1, 0xE4, 0xE1, 0x9C, 0xE2, 0x9C, 0xE3, /* 0xBC-0xBF */
+	0x9C, 0xE4, 0xB3, 0xFC, 0xE4, 0xE8, 0x9C, 0xE5, /* 0xC0-0xC3 */
+	0x9C, 0xE6, 0x9C, 0xE7, 0x9C, 0xE8, 0xB5, 0xE1, /* 0xC4-0xC7 */
+	0x9C, 0xE9, 0x9C, 0xEA, 0x9C, 0xEB, 0xD7, 0xCC, /* 0xC8-0xCB */
+	0x9C, 0xEC, 0x9C, 0xED, 0x9C, 0xEE, 0xE4, 0xE6, /* 0xCC-0xCF */
+	0x9C, 0xEF, 0xBB, 0xAC, 0x9C, 0xF0, 0xD7, 0xD2, /* 0xD0-0xD3 */
+	0xCC, 0xCF, 0xEB, 0xF8, 0x9C, 0xF1, 0xE4, 0xE4, /* 0xD4-0xD7 */
+	0x9C, 0xF2, 0x9C, 0xF3, 0xB9, 0xF6, 0x9C, 0xF4, /* 0xD8-0xDB */
+	0x9C, 0xF5, 0x9C, 0xF6, 0xD6, 0xCD, 0xE4, 0xD9, /* 0xDC-0xDF */
+	0xE4, 0xDC, 0xC2, 0xFA, 0xE4, 0xDE, 0x9C, 0xF7, /* 0xE0-0xE3 */
+	0xC2, 0xCB, 0xC0, 0xC4, 0xC2, 0xD0, 0x9C, 0xF8, /* 0xE4-0xE7 */
+	0xB1, 0xF5, 0xCC, 0xB2, 0x9C, 0xF9, 0x9C, 0xFA, /* 0xE8-0xEB */
+	0x9C, 0xFB, 0x9C, 0xFC, 0x9C, 0xFD, 0x9C, 0xFE, /* 0xEC-0xEF */
+	0x9D, 0x40, 0x9D, 0x41, 0x9D, 0x42, 0x9D, 0x43, /* 0xF0-0xF3 */
+	0xB5, 0xCE, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xF4-0xF7 */
+	0x9D, 0x47, 0xE4, 0xEF, 0x9D, 0x48, 0x9D, 0x49, /* 0xF8-0xFB */
+	0x9D, 0x4A, 0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6F[512] = {
+	0x9D, 0x4E, 0x9D, 0x4F, 0xC6, 0xAF, 0x9D, 0x50, /* 0x00-0x03 */
+	0x9D, 0x51, 0x9D, 0x52, 0xC6, 0xE1, 0x9D, 0x53, /* 0x04-0x07 */
+	0x9D, 0x54, 0xE4, 0xF5, 0x9D, 0x55, 0x9D, 0x56, /* 0x08-0x0B */
+	0x9D, 0x57, 0x9D, 0x58, 0x9D, 0x59, 0xC2, 0xA9, /* 0x0C-0x0F */
+	0x9D, 0x5A, 0x9D, 0x5B, 0x9D, 0x5C, 0xC0, 0xEC, /* 0x10-0x13 */
+	0xD1, 0xDD, 0xE4, 0xEE, 0x9D, 0x5D, 0x9D, 0x5E, /* 0x14-0x17 */
+	0x9D, 0x5F, 0x9D, 0x60, 0x9D, 0x61, 0x9D, 0x62, /* 0x18-0x1B */
+	0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0x1C-0x1F */
+	0xC4, 0xAE, 0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, /* 0x20-0x23 */
+	0xE4, 0xED, 0x9D, 0x6A, 0x9D, 0x6B, 0x9D, 0x6C, /* 0x24-0x27 */
+	0x9D, 0x6D, 0xE4, 0xF6, 0xE4, 0xF4, 0xC2, 0xFE, /* 0x28-0x2B */
+	0x9D, 0x6E, 0xE4, 0xDD, 0x9D, 0x6F, 0xE4, 0xF0, /* 0x2C-0x2F */
+	0x9D, 0x70, 0xCA, 0xFE, 0x9D, 0x71, 0xD5, 0xC4, /* 0x30-0x33 */
+	0x9D, 0x72, 0x9D, 0x73, 0xE4, 0xF1, 0x9D, 0x74, /* 0x34-0x37 */
+	0x9D, 0x75, 0x9D, 0x76, 0x9D, 0x77, 0x9D, 0x78, /* 0x38-0x3B */
+	0x9D, 0x79, 0x9D, 0x7A, 0xD1, 0xFA, 0x9D, 0x7B, /* 0x3C-0x3F */
+	0x9D, 0x7C, 0x9D, 0x7D, 0x9D, 0x7E, 0x9D, 0x80, /* 0x40-0x43 */
+	0x9D, 0x81, 0x9D, 0x82, 0xE4, 0xEB, 0xE4, 0xEC, /* 0x44-0x47 */
+	0x9D, 0x83, 0x9D, 0x84, 0x9D, 0x85, 0xE4, 0xF2, /* 0x48-0x4B */
+	0x9D, 0x86, 0xCE, 0xAB, 0x9D, 0x87, 0x9D, 0x88, /* 0x4C-0x4F */
+	0x9D, 0x89, 0x9D, 0x8A, 0x9D, 0x8B, 0x9D, 0x8C, /* 0x50-0x53 */
+	0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, 0x9D, 0x90, /* 0x54-0x57 */
+	0xC5, 0xCB, 0x9D, 0x91, 0x9D, 0x92, 0x9D, 0x93, /* 0x58-0x5B */
+	0xC7, 0xB1, 0x9D, 0x94, 0xC2, 0xBA, 0x9D, 0x95, /* 0x5C-0x5F */
+	0x9D, 0x96, 0x9D, 0x97, 0xE4, 0xEA, 0x9D, 0x98, /* 0x60-0x63 */
+	0x9D, 0x99, 0x9D, 0x9A, 0xC1, 0xCA, 0x9D, 0x9B, /* 0x64-0x67 */
+	0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x68-0x6B */
+	0x9D, 0xA0, 0xCC, 0xB6, 0xB3, 0xB1, 0x9D, 0xA1, /* 0x6C-0x6F */
+	0x9D, 0xA2, 0x9D, 0xA3, 0xE4, 0xFB, 0x9D, 0xA4, /* 0x70-0x73 */
+	0xE4, 0xF3, 0x9D, 0xA5, 0x9D, 0xA6, 0x9D, 0xA7, /* 0x74-0x77 */
+	0xE4, 0xFA, 0x9D, 0xA8, 0xE4, 0xFD, 0x9D, 0xA9, /* 0x78-0x7B */
+	0xE4, 0xFC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x7C-0x7F */
+	
+	0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x80-0x83 */
+	0xB3, 0xCE, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x84-0x87 */
+	0xB3, 0xBA, 0xE4, 0xF7, 0x9D, 0xB4, 0x9D, 0xB5, /* 0x88-0x8B */
+	0xE4, 0xF9, 0xE4, 0xF8, 0xC5, 0xEC, 0x9D, 0xB6, /* 0x8C-0x8F */
+	0x9D, 0xB7, 0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, /* 0x90-0x93 */
+	0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBD, 0x9D, 0xBE, /* 0x94-0x97 */
+	0x9D, 0xBF, 0x9D, 0xC0, 0x9D, 0xC1, 0x9D, 0xC2, /* 0x98-0x9B */
+	0xC0, 0xBD, 0x9D, 0xC3, 0x9D, 0xC4, 0x9D, 0xC5, /* 0x9C-0x9F */
+	0x9D, 0xC6, 0xD4, 0xE8, 0x9D, 0xC7, 0x9D, 0xC8, /* 0xA0-0xA3 */
+	0x9D, 0xC9, 0x9D, 0xCA, 0x9D, 0xCB, 0xE5, 0xA2, /* 0xA4-0xA7 */
+	0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0xA8-0xAB */
+	0x9D, 0xD0, 0x9D, 0xD1, 0x9D, 0xD2, 0x9D, 0xD3, /* 0xAC-0xAF */
+	0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xB0, 0xC4, /* 0xB0-0xB3 */
+	0x9D, 0xD7, 0x9D, 0xD8, 0xE5, 0xA4, 0x9D, 0xD9, /* 0xB4-0xB7 */
+	0x9D, 0xDA, 0xE5, 0xA3, 0x9D, 0xDB, 0x9D, 0xDC, /* 0xB8-0xBB */
+	0x9D, 0xDD, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0xBC-0xBF */
+	0xBC, 0xA4, 0x9D, 0xE1, 0xE5, 0xA5, 0x9D, 0xE2, /* 0xC0-0xC3 */
+	0x9D, 0xE3, 0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, /* 0xC4-0xC7 */
+	0x9D, 0xE7, 0xE5, 0xA1, 0x9D, 0xE8, 0x9D, 0xE9, /* 0xC8-0xCB */
+	0x9D, 0xEA, 0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, /* 0xCC-0xCF */
+	0x9D, 0xEE, 0xE4, 0xFE, 0xB1, 0xF4, 0x9D, 0xEF, /* 0xD0-0xD3 */
+	0x9D, 0xF0, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0xD4-0xD7 */
+	0x9D, 0xF4, 0x9D, 0xF5, 0x9D, 0xF6, 0x9D, 0xF7, /* 0xD8-0xDB */
+	0x9D, 0xF8, 0x9D, 0xF9, 0xE5, 0xA8, 0x9D, 0xFA, /* 0xDC-0xDF */
+	0xE5, 0xA9, 0xE5, 0xA6, 0x9D, 0xFB, 0x9D, 0xFC, /* 0xE0-0xE3 */
+	0x9D, 0xFD, 0x9D, 0xFE, 0x9E, 0x40, 0x9E, 0x41, /* 0xE4-0xE7 */
+	0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, 0x9E, 0x45, /* 0xE8-0xEB */
+	0x9E, 0x46, 0x9E, 0x47, 0xE5, 0xA7, 0xE5, 0xAA, /* 0xEC-0xEF */
+	0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, 0x9E, 0x4B, /* 0xF0-0xF3 */
+	0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, 0x9E, 0x4F, /* 0xF4-0xF7 */
+	0x9E, 0x50, 0x9E, 0x51, 0x9E, 0x52, 0x9E, 0x53, /* 0xF8-0xFB */
+	0x9E, 0x54, 0x9E, 0x55, 0x9E, 0x56, 0x9E, 0x57, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_70[512] = {
+	0x9E, 0x58, 0x9E, 0x59, 0x9E, 0x5A, 0x9E, 0x5B, /* 0x00-0x03 */
+	0x9E, 0x5C, 0x9E, 0x5D, 0x9E, 0x5E, 0x9E, 0x5F, /* 0x04-0x07 */
+	0x9E, 0x60, 0x9E, 0x61, 0x9E, 0x62, 0x9E, 0x63, /* 0x08-0x0B */
+	0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0x0C-0x0F */
+	0x9E, 0x68, 0xC6, 0xD9, 0x9E, 0x69, 0x9E, 0x6A, /* 0x10-0x13 */
+	0x9E, 0x6B, 0x9E, 0x6C, 0x9E, 0x6D, 0x9E, 0x6E, /* 0x14-0x17 */
+	0x9E, 0x6F, 0x9E, 0x70, 0xE5, 0xAB, 0xE5, 0xAD, /* 0x18-0x1B */
+	0x9E, 0x71, 0x9E, 0x72, 0x9E, 0x73, 0x9E, 0x74, /* 0x1C-0x1F */
+	0x9E, 0x75, 0x9E, 0x76, 0x9E, 0x77, 0xE5, 0xAC, /* 0x20-0x23 */
+	0x9E, 0x78, 0x9E, 0x79, 0x9E, 0x7A, 0x9E, 0x7B, /* 0x24-0x27 */
+	0x9E, 0x7C, 0x9E, 0x7D, 0x9E, 0x7E, 0x9E, 0x80, /* 0x28-0x2B */
+	0x9E, 0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0x2C-0x2F */
+	0x9E, 0x85, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0x30-0x33 */
+	0x9E, 0x89, 0xE5, 0xAF, 0x9E, 0x8A, 0x9E, 0x8B, /* 0x34-0x37 */
+	0x9E, 0x8C, 0xE5, 0xAE, 0x9E, 0x8D, 0x9E, 0x8E, /* 0x38-0x3B */
+	0x9E, 0x8F, 0x9E, 0x90, 0x9E, 0x91, 0x9E, 0x92, /* 0x3C-0x3F */
+	0x9E, 0x93, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x40-0x43 */
+	0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, 0x9E, 0x9A, /* 0x44-0x47 */
+	0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, 0x9E, 0x9E, /* 0x48-0x4B */
+	0xB9, 0xE0, 0x9E, 0x9F, 0x9E, 0xA0, 0xE5, 0xB0, /* 0x4C-0x4F */
+	0x9E, 0xA1, 0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, /* 0x50-0x53 */
+	0x9E, 0xA5, 0x9E, 0xA6, 0x9E, 0xA7, 0x9E, 0xA8, /* 0x54-0x57 */
+	0x9E, 0xA9, 0x9E, 0xAA, 0x9E, 0xAB, 0x9E, 0xAC, /* 0x58-0x5B */
+	0x9E, 0xAD, 0x9E, 0xAE, 0xE5, 0xB1, 0x9E, 0xAF, /* 0x5C-0x5F */
+	0x9E, 0xB0, 0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, /* 0x60-0x63 */
+	0x9E, 0xB4, 0x9E, 0xB5, 0x9E, 0xB6, 0x9E, 0xB7, /* 0x64-0x67 */
+	0x9E, 0xB8, 0x9E, 0xB9, 0x9E, 0xBA, 0xBB, 0xF0, /* 0x68-0x6B */
+	0xEC, 0xE1, 0xC3, 0xF0, 0x9E, 0xBB, 0xB5, 0xC6, /* 0x6C-0x6F */
+	0xBB, 0xD2, 0x9E, 0xBC, 0x9E, 0xBD, 0x9E, 0xBE, /* 0x70-0x73 */
+	0x9E, 0xBF, 0xC1, 0xE9, 0xD4, 0xEE, 0x9E, 0xC0, /* 0x74-0x77 */
+	0xBE, 0xC4, 0x9E, 0xC1, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x78-0x7B */
+	0xD7, 0xC6, 0x9E, 0xC4, 0xD4, 0xD6, 0xB2, 0xD3, /* 0x7C-0x7F */
+	
+	0xEC, 0xBE, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x80-0x83 */
+	0x9E, 0xC8, 0xEA, 0xC1, 0x9E, 0xC9, 0x9E, 0xCA, /* 0x84-0x87 */
+	0x9E, 0xCB, 0xC2, 0xAF, 0xB4, 0xB6, 0x9E, 0xCC, /* 0x88-0x8B */
+	0x9E, 0xCD, 0x9E, 0xCE, 0xD1, 0xD7, 0x9E, 0xCF, /* 0x8C-0x8F */
+	0x9E, 0xD0, 0x9E, 0xD1, 0xB3, 0xB4, 0x9E, 0xD2, /* 0x90-0x93 */
+	0xC8, 0xB2, 0xBF, 0xBB, 0xEC, 0xC0, 0x9E, 0xD3, /* 0x94-0x97 */
+	0x9E, 0xD4, 0xD6, 0xCB, 0x9E, 0xD5, 0x9E, 0xD6, /* 0x98-0x9B */
+	0xEC, 0xBF, 0xEC, 0xC1, 0x9E, 0xD7, 0x9E, 0xD8, /* 0x9C-0x9F */
+	0x9E, 0xD9, 0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, /* 0xA0-0xA3 */
+	0x9E, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, 0x9E, 0xE0, /* 0xA4-0xA7 */
+	0x9E, 0xE1, 0x9E, 0xE2, 0x9E, 0xE3, 0xEC, 0xC5, /* 0xA8-0xAB */
+	0xBE, 0xE6, 0xCC, 0xBF, 0xC5, 0xDA, 0xBE, 0xBC, /* 0xAC-0xAF */
+	0x9E, 0xE4, 0xEC, 0xC6, 0x9E, 0xE5, 0xB1, 0xFE, /* 0xB0-0xB3 */
+	0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0xEC, 0xC4, /* 0xB4-0xB7 */
+	0xD5, 0xA8, 0xB5, 0xE3, 0x9E, 0xE9, 0xEC, 0xC2, /* 0xB8-0xBB */
+	0xC1, 0xB6, 0xB3, 0xE3, 0x9E, 0xEA, 0x9E, 0xEB, /* 0xBC-0xBF */
+	0xEC, 0xC3, 0xCB, 0xB8, 0xC0, 0xC3, 0xCC, 0xFE, /* 0xC0-0xC3 */
+	0x9E, 0xEC, 0x9E, 0xED, 0x9E, 0xEE, 0x9E, 0xEF, /* 0xC4-0xC7 */
+	0xC1, 0xD2, 0x9E, 0xF0, 0xEC, 0xC8, 0x9E, 0xF1, /* 0xC8-0xCB */
+	0x9E, 0xF2, 0x9E, 0xF3, 0x9E, 0xF4, 0x9E, 0xF5, /* 0xCC-0xCF */
+	0x9E, 0xF6, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0xD0-0xD3 */
+	0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xD4-0xD7 */
+	0xBA, 0xE6, 0xC0, 0xD3, 0x9E, 0xFE, 0xD6, 0xF2, /* 0xD8-0xDB */
+	0x9F, 0x40, 0x9F, 0x41, 0x9F, 0x42, 0xD1, 0xCC, /* 0xDC-0xDF */
+	0x9F, 0x43, 0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, /* 0xE0-0xE3 */
+	0xBF, 0xBE, 0x9F, 0x47, 0xB7, 0xB3, 0xC9, 0xD5, /* 0xE4-0xE7 */
+	0xEC, 0xC7, 0xBB, 0xE2, 0x9F, 0x48, 0xCC, 0xCC, /* 0xE8-0xEB */
+	0xBD, 0xFD, 0xC8, 0xC8, 0x9F, 0x49, 0xCF, 0xA9, /* 0xEC-0xEF */
+	0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, 0x9F, 0x4D, /* 0xF0-0xF3 */
+	0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0xCD, 0xE9, /* 0xF4-0xF7 */
+	0x9F, 0x51, 0xC5, 0xEB, 0x9F, 0x52, 0x9F, 0x53, /* 0xF8-0xFB */
+	0x9F, 0x54, 0xB7, 0xE9, 0x9F, 0x55, 0x9F, 0x56, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_71[512] = {
+	0x9F, 0x57, 0x9F, 0x58, 0x9F, 0x59, 0x9F, 0x5A, /* 0x00-0x03 */
+	0x9F, 0x5B, 0x9F, 0x5C, 0x9F, 0x5D, 0x9F, 0x5E, /* 0x04-0x07 */
+	0x9F, 0x5F, 0xD1, 0xC9, 0xBA, 0xB8, 0x9F, 0x60, /* 0x08-0x0B */
+	0x9F, 0x61, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0x0C-0x0F */
+	0xEC, 0xC9, 0x9F, 0x65, 0x9F, 0x66, 0xEC, 0xCA, /* 0x10-0x13 */
+	0x9F, 0x67, 0xBB, 0xC0, 0xEC, 0xCB, 0x9F, 0x68, /* 0x14-0x17 */
+	0xEC, 0xE2, 0xB1, 0xBA, 0xB7, 0xD9, 0x9F, 0x69, /* 0x18-0x1B */
+	0x9F, 0x6A, 0x9F, 0x6B, 0x9F, 0x6C, 0x9F, 0x6D, /* 0x1C-0x1F */
+	0x9F, 0x6E, 0x9F, 0x6F, 0x9F, 0x70, 0x9F, 0x71, /* 0x20-0x23 */
+	0x9F, 0x72, 0x9F, 0x73, 0xBD, 0xB9, 0x9F, 0x74, /* 0x24-0x27 */
+	0x9F, 0x75, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0x28-0x2B */
+	0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x7B, 0xEC, 0xCC, /* 0x2C-0x2F */
+	0xD1, 0xE6, 0xEC, 0xCD, 0x9F, 0x7C, 0x9F, 0x7D, /* 0x30-0x33 */
+	0x9F, 0x7E, 0x9F, 0x80, 0xC8, 0xBB, 0x9F, 0x81, /* 0x34-0x37 */
+	0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0x38-0x3B */
+	0x9F, 0x86, 0x9F, 0x87, 0x9F, 0x88, 0x9F, 0x89, /* 0x3C-0x3F */
+	0x9F, 0x8A, 0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, /* 0x40-0x43 */
+	0x9F, 0x8E, 0xEC, 0xD1, 0x9F, 0x8F, 0x9F, 0x90, /* 0x44-0x47 */
+	0x9F, 0x91, 0x9F, 0x92, 0xEC, 0xD3, 0x9F, 0x93, /* 0x48-0x4B */
+	0xBB, 0xCD, 0x9F, 0x94, 0xBC, 0xE5, 0x9F, 0x95, /* 0x4C-0x4F */
+	0x9F, 0x96, 0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, /* 0x50-0x53 */
+	0x9F, 0x9A, 0x9F, 0x9B, 0x9F, 0x9C, 0x9F, 0x9D, /* 0x54-0x57 */
+	0x9F, 0x9E, 0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, /* 0x58-0x5B */
+	0xEC, 0xCF, 0x9F, 0xA2, 0xC9, 0xB7, 0x9F, 0xA3, /* 0x5C-0x5F */
+	0x9F, 0xA4, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x60-0x63 */
+	0xC3, 0xBA, 0x9F, 0xA8, 0xEC, 0xE3, 0xD5, 0xD5, /* 0x64-0x67 */
+	0xEC, 0xD0, 0x9F, 0xA9, 0x9F, 0xAA, 0x9F, 0xAB, /* 0x68-0x6B */
+	0x9F, 0xAC, 0x9F, 0xAD, 0xD6, 0xF3, 0x9F, 0xAE, /* 0x6C-0x6F */
+	0x9F, 0xAF, 0x9F, 0xB0, 0xEC, 0xD2, 0xEC, 0xCE, /* 0x70-0x73 */
+	0x9F, 0xB1, 0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, /* 0x74-0x77 */
+	0xEC, 0xD4, 0x9F, 0xB5, 0xEC, 0xD5, 0x9F, 0xB6, /* 0x78-0x7B */
+	0x9F, 0xB7, 0xC9, 0xBF, 0x9F, 0xB8, 0x9F, 0xB9, /* 0x7C-0x7F */
+	
+	0x9F, 0xBA, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x80-0x83 */
+	0xCF, 0xA8, 0x9F, 0xBE, 0x9F, 0xBF, 0x9F, 0xC0, /* 0x84-0x87 */
+	0x9F, 0xC1, 0x9F, 0xC2, 0xD0, 0xDC, 0x9F, 0xC3, /* 0x88-0x8B */
+	0x9F, 0xC4, 0x9F, 0xC5, 0x9F, 0xC6, 0xD1, 0xAC, /* 0x8C-0x8F */
+	0x9F, 0xC7, 0x9F, 0xC8, 0x9F, 0xC9, 0x9F, 0xCA, /* 0x90-0x93 */
+	0xC8, 0xDB, 0x9F, 0xCB, 0x9F, 0xCC, 0x9F, 0xCD, /* 0x94-0x97 */
+	0xEC, 0xD6, 0xCE, 0xF5, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x98-0x9B */
+	0x9F, 0xD0, 0x9F, 0xD1, 0x9F, 0xD2, 0xCA, 0xEC, /* 0x9C-0x9F */
+	0xEC, 0xDA, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0xA0-0xA3 */
+	0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0xA4-0xA7 */
+	0xEC, 0xD9, 0x9F, 0xDA, 0x9F, 0xDB, 0x9F, 0xDC, /* 0xA8-0xAB */
+	0xB0, 0xBE, 0x9F, 0xDD, 0x9F, 0xDE, 0x9F, 0xDF, /* 0xAC-0xAF */
+	0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xEC, 0xD7, /* 0xB0-0xB3 */
+	0x9F, 0xE3, 0xEC, 0xD8, 0x9F, 0xE4, 0x9F, 0xE5, /* 0xB4-0xB7 */
+	0x9F, 0xE6, 0xEC, 0xE4, 0x9F, 0xE7, 0x9F, 0xE8, /* 0xB8-0xBB */
+	0x9F, 0xE9, 0x9F, 0xEA, 0x9F, 0xEB, 0x9F, 0xEC, /* 0xBC-0xBF */
+	0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0xC8, 0xBC, /* 0xC0-0xC3 */
+	0x9F, 0xF0, 0x9F, 0xF1, 0x9F, 0xF2, 0x9F, 0xF3, /* 0xC4-0xC7 */
+	0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, 0x9F, 0xF7, /* 0xC8-0xCB */
+	0x9F, 0xF8, 0x9F, 0xF9, 0xC1, 0xC7, 0x9F, 0xFA, /* 0xCC-0xCF */
+	0x9F, 0xFB, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xD0-0xD3 */
+	0xEC, 0xDC, 0xD1, 0xE0, 0xA0, 0x40, 0xA0, 0x41, /* 0xD4-0xD7 */
+	0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, 0xA0, 0x45, /* 0xD8-0xDB */
+	0xA0, 0x46, 0xA0, 0x47, 0xA0, 0x48, 0xA0, 0x49, /* 0xDC-0xDF */
+	0xEC, 0xDB, 0xA0, 0x4A, 0xA0, 0x4B, 0xA0, 0x4C, /* 0xE0-0xE3 */
+	0xA0, 0x4D, 0xD4, 0xEF, 0xA0, 0x4E, 0xEC, 0xDD, /* 0xE4-0xE7 */
+	0xA0, 0x4F, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xE8-0xEB */
+	0xA0, 0x53, 0xA0, 0x54, 0xDB, 0xC6, 0xA0, 0x55, /* 0xEC-0xEF */
+	0xA0, 0x56, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xF0-0xF3 */
+	0xA0, 0x5A, 0xA0, 0x5B, 0xA0, 0x5C, 0xA0, 0x5D, /* 0xF4-0xF7 */
+	0xA0, 0x5E, 0xEC, 0xDE, 0xA0, 0x5F, 0xA0, 0x60, /* 0xF8-0xFB */
+	0xA0, 0x61, 0xA0, 0x62, 0xA0, 0x63, 0xA0, 0x64, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_72[512] = {
+	0xA0, 0x65, 0xA0, 0x66, 0xA0, 0x67, 0xA0, 0x68, /* 0x00-0x03 */
+	0xA0, 0x69, 0xA0, 0x6A, 0xB1, 0xAC, 0xA0, 0x6B, /* 0x04-0x07 */
+	0xA0, 0x6C, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0x08-0x0B */
+	0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0x0C-0x0F */
+	0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0x10-0x13 */
+	0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x7B, /* 0x14-0x17 */
+	0xA0, 0x7C, 0xA0, 0x7D, 0xA0, 0x7E, 0xA0, 0x80, /* 0x18-0x1B */
+	0xA0, 0x81, 0xEC, 0xDF, 0xA0, 0x82, 0xA0, 0x83, /* 0x1C-0x1F */
+	0xA0, 0x84, 0xA0, 0x85, 0xA0, 0x86, 0xA0, 0x87, /* 0x20-0x23 */
+	0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, 0xA0, 0x8B, /* 0x24-0x27 */
+	0xEC, 0xE0, 0xA0, 0x8C, 0xD7, 0xA6, 0xA0, 0x8D, /* 0x28-0x2B */
+	0xC5, 0xC0, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x2C-0x2F */
+	0xEB, 0xBC, 0xB0, 0xAE, 0xA0, 0x91, 0xA0, 0x92, /* 0x30-0x33 */
+	0xA0, 0x93, 0xBE, 0xF4, 0xB8, 0xB8, 0xD2, 0xAF, /* 0x34-0x37 */
+	0xB0, 0xD6, 0xB5, 0xF9, 0xA0, 0x94, 0xD8, 0xB3, /* 0x38-0x3B */
+	0xA0, 0x95, 0xCB, 0xAC, 0xA0, 0x96, 0xE3, 0xDD, /* 0x3C-0x3F */
+	0xA0, 0x97, 0xA0, 0x98, 0xA0, 0x99, 0xA0, 0x9A, /* 0x40-0x43 */
+	0xA0, 0x9B, 0xA0, 0x9C, 0xA0, 0x9D, 0xC6, 0xAC, /* 0x44-0x47 */
+	0xB0, 0xE6, 0xA0, 0x9E, 0xA0, 0x9F, 0xA0, 0xA0, /* 0x48-0x4B */
+	0xC5, 0xC6, 0xEB, 0xB9, 0xA0, 0xA1, 0xA0, 0xA2, /* 0x4C-0x4F */
+	0xA0, 0xA3, 0xA0, 0xA4, 0xEB, 0xBA, 0xA0, 0xA5, /* 0x50-0x53 */
+	0xA0, 0xA6, 0xA0, 0xA7, 0xEB, 0xBB, 0xA0, 0xA8, /* 0x54-0x57 */
+	0xA0, 0xA9, 0xD1, 0xC0, 0xA0, 0xAA, 0xC5, 0xA3, /* 0x58-0x5B */
+	0xA0, 0xAB, 0xEA, 0xF2, 0xA0, 0xAC, 0xC4, 0xB2, /* 0x5C-0x5F */
+	0xA0, 0xAD, 0xC4, 0xB5, 0xC0, 0xCE, 0xA0, 0xAE, /* 0x60-0x63 */
+	0xA0, 0xAF, 0xA0, 0xB0, 0xEA, 0xF3, 0xC4, 0xC1, /* 0x64-0x67 */
+	0xA0, 0xB1, 0xCE, 0xEF, 0xA0, 0xB2, 0xA0, 0xB3, /* 0x68-0x6B */
+	0xA0, 0xB4, 0xA0, 0xB5, 0xEA, 0xF0, 0xEA, 0xF4, /* 0x6C-0x6F */
+	0xA0, 0xB6, 0xA0, 0xB7, 0xC9, 0xFC, 0xA0, 0xB8, /* 0x70-0x73 */
+	0xA0, 0xB9, 0xC7, 0xA3, 0xA0, 0xBA, 0xA0, 0xBB, /* 0x74-0x77 */
+	0xA0, 0xBC, 0xCC, 0xD8, 0xCE, 0xFE, 0xA0, 0xBD, /* 0x78-0x7B */
+	0xA0, 0xBE, 0xA0, 0xBF, 0xEA, 0xF5, 0xEA, 0xF6, /* 0x7C-0x7F */
+	
+	0xCF, 0xAC, 0xC0, 0xE7, 0xA0, 0xC0, 0xA0, 0xC1, /* 0x80-0x83 */
+	0xEA, 0xF7, 0xA0, 0xC2, 0xA0, 0xC3, 0xA0, 0xC4, /* 0x84-0x87 */
+	0xA0, 0xC5, 0xA0, 0xC6, 0xB6, 0xBF, 0xEA, 0xF8, /* 0x88-0x8B */
+	0xA0, 0xC7, 0xEA, 0xF9, 0xA0, 0xC8, 0xEA, 0xFA, /* 0x8C-0x8F */
+	0xA0, 0xC9, 0xA0, 0xCA, 0xEA, 0xFB, 0xA0, 0xCB, /* 0x90-0x93 */
+	0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x94-0x97 */
+	0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x98-0x9B */
+	0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xEA, 0xF1, /* 0x9C-0x9F */
+	0xA0, 0xD7, 0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, /* 0xA0-0xA3 */
+	0xA0, 0xDB, 0xA0, 0xDC, 0xA0, 0xDD, 0xA0, 0xDE, /* 0xA4-0xA7 */
+	0xA0, 0xDF, 0xA0, 0xE0, 0xA0, 0xE1, 0xA0, 0xE2, /* 0xA8-0xAB */
+	0xC8, 0xAE, 0xE1, 0xEB, 0xA0, 0xE3, 0xB7, 0xB8, /* 0xAC-0xAF */
+	0xE1, 0xEC, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0xB0-0xB3 */
+	0xE1, 0xED, 0xA0, 0xE7, 0xD7, 0xB4, 0xE1, 0xEE, /* 0xB4-0xB7 */
+	0xE1, 0xEF, 0xD3, 0xCC, 0xA0, 0xE8, 0xA0, 0xE9, /* 0xB8-0xBB */
+	0xA0, 0xEA, 0xA0, 0xEB, 0xA0, 0xEC, 0xA0, 0xED, /* 0xBC-0xBF */
+	0xA0, 0xEE, 0xE1, 0xF1, 0xBF, 0xF1, 0xE1, 0xF0, /* 0xC0-0xC3 */
+	0xB5, 0xD2, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0xC4-0xC7 */
+	0xB1, 0xB7, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0xC8-0xCB */
+	0xA0, 0xF5, 0xE1, 0xF3, 0xE1, 0xF2, 0xA0, 0xF6, /* 0xCC-0xCF */
+	0xBA, 0xFC, 0xA0, 0xF7, 0xE1, 0xF4, 0xA0, 0xF8, /* 0xD0-0xD3 */
+	0xA0, 0xF9, 0xA0, 0xFA, 0xA0, 0xFB, 0xB9, 0xB7, /* 0xD4-0xD7 */
+	0xA0, 0xFC, 0xBE, 0xD1, 0xA0, 0xFD, 0xA0, 0xFE, /* 0xD8-0xDB */
+	0xAA, 0x40, 0xAA, 0x41, 0xC4, 0xFC, 0xAA, 0x42, /* 0xDC-0xDF */
+	0xBA, 0xDD, 0xBD, 0xC6, 0xAA, 0x43, 0xAA, 0x44, /* 0xE0-0xE3 */
+	0xAA, 0x45, 0xAA, 0x46, 0xAA, 0x47, 0xAA, 0x48, /* 0xE4-0xE7 */
+	0xE1, 0xF5, 0xE1, 0xF7, 0xAA, 0x49, 0xAA, 0x4A, /* 0xE8-0xEB */
+	0xB6, 0xC0, 0xCF, 0xC1, 0xCA, 0xA8, 0xE1, 0xF6, /* 0xEC-0xEF */
+	0xD5, 0xF8, 0xD3, 0xFC, 0xE1, 0xF8, 0xE1, 0xFC, /* 0xF0-0xF3 */
+	0xE1, 0xF9, 0xAA, 0x4B, 0xAA, 0x4C, 0xE1, 0xFA, /* 0xF4-0xF7 */
+	0xC0, 0xEA, 0xAA, 0x4D, 0xE1, 0xFE, 0xE2, 0xA1, /* 0xF8-0xFB */
+	0xC0, 0xC7, 0xAA, 0x4E, 0xAA, 0x4F, 0xAA, 0x50, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_73[512] = {
+	0xAA, 0x51, 0xE1, 0xFB, 0xAA, 0x52, 0xE1, 0xFD, /* 0x00-0x03 */
+	0xAA, 0x53, 0xAA, 0x54, 0xAA, 0x55, 0xAA, 0x56, /* 0x04-0x07 */
+	0xAA, 0x57, 0xAA, 0x58, 0xE2, 0xA5, 0xAA, 0x59, /* 0x08-0x0B */
+	0xAA, 0x5A, 0xAA, 0x5B, 0xC1, 0xD4, 0xAA, 0x5C, /* 0x0C-0x0F */
+	0xAA, 0x5D, 0xAA, 0x5E, 0xAA, 0x5F, 0xE2, 0xA3, /* 0x10-0x13 */
+	0xAA, 0x60, 0xE2, 0xA8, 0xB2, 0xFE, 0xE2, 0xA2, /* 0x14-0x17 */
+	0xAA, 0x61, 0xAA, 0x62, 0xAA, 0x63, 0xC3, 0xCD, /* 0x18-0x1B */
+	0xB2, 0xC2, 0xE2, 0xA7, 0xE2, 0xA6, 0xAA, 0x64, /* 0x1C-0x1F */
+	0xAA, 0x65, 0xE2, 0xA4, 0xE2, 0xA9, 0xAA, 0x66, /* 0x20-0x23 */
+	0xAA, 0x67, 0xE2, 0xAB, 0xAA, 0x68, 0xAA, 0x69, /* 0x24-0x27 */
+	0xAA, 0x6A, 0xD0, 0xC9, 0xD6, 0xED, 0xC3, 0xA8, /* 0x28-0x2B */
+	0xE2, 0xAC, 0xAA, 0x6B, 0xCF, 0xD7, 0xAA, 0x6C, /* 0x2C-0x2F */
+	0xAA, 0x6D, 0xE2, 0xAE, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x30-0x33 */
+	0xBA, 0xEF, 0xAA, 0x70, 0xAA, 0x71, 0xE9, 0xE0, /* 0x34-0x37 */
+	0xE2, 0xAD, 0xE2, 0xAA, 0xAA, 0x72, 0xAA, 0x73, /* 0x38-0x3B */
+	0xAA, 0x74, 0xAA, 0x75, 0xBB, 0xAB, 0xD4, 0xB3, /* 0x3C-0x3F */
+	0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, 0xAA, 0x79, /* 0x40-0x43 */
+	0xAA, 0x7A, 0xAA, 0x7B, 0xAA, 0x7C, 0xAA, 0x7D, /* 0x44-0x47 */
+	0xAA, 0x7E, 0xAA, 0x80, 0xAA, 0x81, 0xAA, 0x82, /* 0x48-0x4B */
+	0xAA, 0x83, 0xE2, 0xB0, 0xAA, 0x84, 0xAA, 0x85, /* 0x4C-0x4F */
+	0xE2, 0xAF, 0xAA, 0x86, 0xE9, 0xE1, 0xAA, 0x87, /* 0x50-0x53 */
+	0xAA, 0x88, 0xAA, 0x89, 0xAA, 0x8A, 0xE2, 0xB1, /* 0x54-0x57 */
+	0xAA, 0x8B, 0xAA, 0x8C, 0xAA, 0x8D, 0xAA, 0x8E, /* 0x58-0x5B */
+	0xAA, 0x8F, 0xAA, 0x90, 0xAA, 0x91, 0xAA, 0x92, /* 0x5C-0x5F */
+	0xE2, 0xB2, 0xAA, 0x93, 0xAA, 0x94, 0xAA, 0x95, /* 0x60-0x63 */
+	0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, 0xAA, 0x99, /* 0x64-0x67 */
+	0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, 0xAA, 0x9D, /* 0x68-0x6B */
+	0xE2, 0xB3, 0xCC, 0xA1, 0xAA, 0x9E, 0xE2, 0xB4, /* 0x6C-0x6F */
+	0xAA, 0x9F, 0xAA, 0xA0, 0xAB, 0x40, 0xAB, 0x41, /* 0x70-0x73 */
+	0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, 0xAB, 0x45, /* 0x74-0x77 */
+	0xAB, 0x46, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x78-0x7B */
+	0xAB, 0x4A, 0xAB, 0x4B, 0xE2, 0xB5, 0xAB, 0x4C, /* 0x7C-0x7F */
+	
+	0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0x80-0x83 */
+	0xD0, 0xFE, 0xAB, 0x51, 0xAB, 0x52, 0xC2, 0xCA, /* 0x84-0x87 */
+	0xAB, 0x53, 0xD3, 0xF1, 0xAB, 0x54, 0xCD, 0xF5, /* 0x88-0x8B */
+	0xAB, 0x55, 0xAB, 0x56, 0xE7, 0xE0, 0xAB, 0x57, /* 0x8C-0x8F */
+	0xAB, 0x58, 0xE7, 0xE1, 0xAB, 0x59, 0xAB, 0x5A, /* 0x90-0x93 */
+	0xAB, 0x5B, 0xAB, 0x5C, 0xBE, 0xC1, 0xAB, 0x5D, /* 0x94-0x97 */
+	0xAB, 0x5E, 0xAB, 0x5F, 0xAB, 0x60, 0xC2, 0xEA, /* 0x98-0x9B */
+	0xAB, 0x61, 0xAB, 0x62, 0xAB, 0x63, 0xE7, 0xE4, /* 0x9C-0x9F */
+	0xAB, 0x64, 0xAB, 0x65, 0xE7, 0xE3, 0xAB, 0x66, /* 0xA0-0xA3 */
+	0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, 0xAB, 0x6A, /* 0xA4-0xA7 */
+	0xAB, 0x6B, 0xCD, 0xE6, 0xAB, 0x6C, 0xC3, 0xB5, /* 0xA8-0xAB */
+	0xAB, 0x6D, 0xAB, 0x6E, 0xE7, 0xE2, 0xBB, 0xB7, /* 0xAC-0xAF */
+	0xCF, 0xD6, 0xAB, 0x6F, 0xC1, 0xE1, 0xE7, 0xE9, /* 0xB0-0xB3 */
+	0xAB, 0x70, 0xAB, 0x71, 0xAB, 0x72, 0xE7, 0xE8, /* 0xB4-0xB7 */
+	0xAB, 0x73, 0xAB, 0x74, 0xE7, 0xF4, 0xB2, 0xA3, /* 0xB8-0xBB */
+	0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, 0xAB, 0x78, /* 0xBC-0xBF */
+	0xE7, 0xEA, 0xAB, 0x79, 0xE7, 0xE6, 0xAB, 0x7A, /* 0xC0-0xC3 */
+	0xAB, 0x7B, 0xAB, 0x7C, 0xAB, 0x7D, 0xAB, 0x7E, /* 0xC4-0xC7 */
+	0xE7, 0xEC, 0xE7, 0xEB, 0xC9, 0xBA, 0xAB, 0x80, /* 0xC8-0xCB */
+	0xAB, 0x81, 0xD5, 0xE4, 0xAB, 0x82, 0xE7, 0xE5, /* 0xCC-0xCF */
+	0xB7, 0xA9, 0xE7, 0xE7, 0xAB, 0x83, 0xAB, 0x84, /* 0xD0-0xD3 */
+	0xAB, 0x85, 0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, /* 0xD4-0xD7 */
+	0xAB, 0x89, 0xE7, 0xEE, 0xAB, 0x8A, 0xAB, 0x8B, /* 0xD8-0xDB */
+	0xAB, 0x8C, 0xAB, 0x8D, 0xE7, 0xF3, 0xAB, 0x8E, /* 0xDC-0xDF */
+	0xD6, 0xE9, 0xAB, 0x8F, 0xAB, 0x90, 0xAB, 0x91, /* 0xE0-0xE3 */
+	0xAB, 0x92, 0xE7, 0xED, 0xAB, 0x93, 0xE7, 0xF2, /* 0xE4-0xE7 */
+	0xAB, 0x94, 0xE7, 0xF1, 0xAB, 0x95, 0xAB, 0x96, /* 0xE8-0xEB */
+	0xAB, 0x97, 0xB0, 0xE0, 0xAB, 0x98, 0xAB, 0x99, /* 0xEC-0xEF */
+	0xAB, 0x9A, 0xAB, 0x9B, 0xE7, 0xF5, 0xAB, 0x9C, /* 0xF0-0xF3 */
+	0xAB, 0x9D, 0xAB, 0x9E, 0xAB, 0x9F, 0xAB, 0xA0, /* 0xF4-0xF7 */
+	0xAC, 0x40, 0xAC, 0x41, 0xAC, 0x42, 0xAC, 0x43, /* 0xF8-0xFB */
+	0xAC, 0x44, 0xAC, 0x45, 0xAC, 0x46, 0xAC, 0x47, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_74[512] = {
+	0xAC, 0x48, 0xAC, 0x49, 0xAC, 0x4A, 0xC7, 0xF2, /* 0x00-0x03 */
+	0xAC, 0x4B, 0xC0, 0xC5, 0xC0, 0xED, 0xAC, 0x4C, /* 0x04-0x07 */
+	0xAC, 0x4D, 0xC1, 0xF0, 0xE7, 0xF0, 0xAC, 0x4E, /* 0x08-0x0B */
+	0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, 0xE7, 0xF6, /* 0x0C-0x0F */
+	0xCB, 0xF6, 0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, /* 0x10-0x13 */
+	0xAC, 0x55, 0xAC, 0x56, 0xAC, 0x57, 0xAC, 0x58, /* 0x14-0x17 */
+	0xAC, 0x59, 0xAC, 0x5A, 0xE8, 0xA2, 0xE8, 0xA1, /* 0x18-0x1B */
+	0xAC, 0x5B, 0xAC, 0x5C, 0xAC, 0x5D, 0xAC, 0x5E, /* 0x1C-0x1F */
+	0xAC, 0x5F, 0xAC, 0x60, 0xD7, 0xC1, 0xAC, 0x61, /* 0x20-0x23 */
+	0xAC, 0x62, 0xE7, 0xFA, 0xE7, 0xF9, 0xAC, 0x63, /* 0x24-0x27 */
+	0xE7, 0xFB, 0xAC, 0x64, 0xE7, 0xF7, 0xAC, 0x65, /* 0x28-0x2B */
+	0xE7, 0xFE, 0xAC, 0x66, 0xE7, 0xFD, 0xAC, 0x67, /* 0x2C-0x2F */
+	0xE7, 0xFC, 0xAC, 0x68, 0xAC, 0x69, 0xC1, 0xD5, /* 0x30-0x33 */
+	0xC7, 0xD9, 0xC5, 0xFD, 0xC5, 0xC3, 0xAC, 0x6A, /* 0x34-0x37 */
+	0xAC, 0x6B, 0xAC, 0x6C, 0xAC, 0x6D, 0xAC, 0x6E, /* 0x38-0x3B */
+	0xC7, 0xED, 0xAC, 0x6F, 0xAC, 0x70, 0xAC, 0x71, /* 0x3C-0x3F */
+	0xAC, 0x72, 0xE8, 0xA3, 0xAC, 0x73, 0xAC, 0x74, /* 0x40-0x43 */
+	0xAC, 0x75, 0xAC, 0x76, 0xAC, 0x77, 0xAC, 0x78, /* 0x44-0x47 */
+	0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x7B, 0xAC, 0x7C, /* 0x48-0x4B */
+	0xAC, 0x7D, 0xAC, 0x7E, 0xAC, 0x80, 0xAC, 0x81, /* 0x4C-0x4F */
+	0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x50-0x53 */
+	0xAC, 0x86, 0xE8, 0xA6, 0xAC, 0x87, 0xE8, 0xA5, /* 0x54-0x57 */
+	0xAC, 0x88, 0xE8, 0xA7, 0xBA, 0xF7, 0xE7, 0xF8, /* 0x58-0x5B */
+	0xE8, 0xA4, 0xAC, 0x89, 0xC8, 0xF0, 0xC9, 0xAA, /* 0x5C-0x5F */
+	0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x60-0x63 */
+	0xAC, 0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x64-0x67 */
+	0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x68-0x6B */
+	0xAC, 0x96, 0xE8, 0xA9, 0xAC, 0x97, 0xAC, 0x98, /* 0x6C-0x6F */
+	0xB9, 0xE5, 0xAC, 0x99, 0xAC, 0x9A, 0xAC, 0x9B, /* 0x70-0x73 */
+	0xAC, 0x9C, 0xAC, 0x9D, 0xD1, 0xFE, 0xE8, 0xA8, /* 0x74-0x77 */
+	0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, 0xAD, 0x40, /* 0x78-0x7B */
+	0xAD, 0x41, 0xAD, 0x42, 0xE8, 0xAA, 0xAD, 0x43, /* 0x7C-0x7F */
+	
+	0xE8, 0xAD, 0xE8, 0xAE, 0xAD, 0x44, 0xC1, 0xA7, /* 0x80-0x83 */
+	0xAD, 0x45, 0xAD, 0x46, 0xAD, 0x47, 0xE8, 0xAF, /* 0x84-0x87 */
+	0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, 0xE8, 0xB0, /* 0x88-0x8B */
+	0xAD, 0x4B, 0xAD, 0x4C, 0xE8, 0xAC, 0xAD, 0x4D, /* 0x8C-0x8F */
+	0xE8, 0xB4, 0xAD, 0x4E, 0xAD, 0x4F, 0xAD, 0x50, /* 0x90-0x93 */
+	0xAD, 0x51, 0xAD, 0x52, 0xAD, 0x53, 0xAD, 0x54, /* 0x94-0x97 */
+	0xAD, 0x55, 0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, /* 0x98-0x9B */
+	0xE8, 0xAB, 0xAD, 0x59, 0xE8, 0xB1, 0xAD, 0x5A, /* 0x9C-0x9F */
+	0xAD, 0x5B, 0xAD, 0x5C, 0xAD, 0x5D, 0xAD, 0x5E, /* 0xA0-0xA3 */
+	0xAD, 0x5F, 0xAD, 0x60, 0xAD, 0x61, 0xE8, 0xB5, /* 0xA4-0xA7 */
+	0xE8, 0xB2, 0xE8, 0xB3, 0xAD, 0x62, 0xAD, 0x63, /* 0xA8-0xAB */
+	0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0xAC-0xAF */
+	0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, 0xAD, 0x6B, /* 0xB0-0xB3 */
+	0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, 0xAD, 0x6F, /* 0xB4-0xB7 */
+	0xAD, 0x70, 0xAD, 0x71, 0xE8, 0xB7, 0xAD, 0x72, /* 0xB8-0xBB */
+	0xAD, 0x73, 0xAD, 0x74, 0xAD, 0x75, 0xAD, 0x76, /* 0xBC-0xBF */
+	0xAD, 0x77, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0xC0-0xC3 */
+	0xAD, 0x7B, 0xAD, 0x7C, 0xAD, 0x7D, 0xAD, 0x7E, /* 0xC4-0xC7 */
+	0xAD, 0x80, 0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, /* 0xC8-0xCB */
+	0xAD, 0x84, 0xAD, 0x85, 0xAD, 0x86, 0xAD, 0x87, /* 0xCC-0xCF */
+	0xAD, 0x88, 0xAD, 0x89, 0xE8, 0xB6, 0xAD, 0x8A, /* 0xD0-0xD3 */
+	0xAD, 0x8B, 0xAD, 0x8C, 0xAD, 0x8D, 0xAD, 0x8E, /* 0xD4-0xD7 */
+	0xAD, 0x8F, 0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, /* 0xD8-0xDB */
+	0xB9, 0xCF, 0xAD, 0x93, 0xF0, 0xAC, 0xAD, 0x94, /* 0xDC-0xDF */
+	0xF0, 0xAD, 0xAD, 0x95, 0xC6, 0xB0, 0xB0, 0xEA, /* 0xE0-0xE3 */
+	0xC8, 0xBF, 0xAD, 0x96, 0xCD, 0xDF, 0xAD, 0x97, /* 0xE4-0xE7 */
+	0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xE8-0xEB */
+	0xAD, 0x9C, 0xAD, 0x9D, 0xCE, 0xCD, 0xEA, 0xB1, /* 0xEC-0xEF */
+	0xAD, 0x9E, 0xAD, 0x9F, 0xAD, 0xA0, 0xAE, 0x40, /* 0xF0-0xF3 */
+	0xEA, 0xB2, 0xAE, 0x41, 0xC6, 0xBF, 0xB4, 0xC9, /* 0xF4-0xF7 */
+	0xAE, 0x42, 0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, /* 0xF8-0xFB */
+	0xAE, 0x46, 0xAE, 0x47, 0xAE, 0x48, 0xEA, 0xB3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_75[512] = {
+	0xAE, 0x49, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0x00-0x03 */
+	0xD5, 0xE7, 0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, /* 0x04-0x07 */
+	0xAE, 0x50, 0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, /* 0x08-0x0B */
+	0xAE, 0x54, 0xDD, 0xF9, 0xAE, 0x55, 0xEA, 0xB4, /* 0x0C-0x0F */
+	0xAE, 0x56, 0xEA, 0xB5, 0xAE, 0x57, 0xEA, 0xB6, /* 0x10-0x13 */
+	0xAE, 0x58, 0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x5B, /* 0x14-0x17 */
+	0xB8, 0xCA, 0xDF, 0xB0, 0xC9, 0xF5, 0xAE, 0x5C, /* 0x18-0x1B */
+	0xCC, 0xF0, 0xAE, 0x5D, 0xAE, 0x5E, 0xC9, 0xFA, /* 0x1C-0x1F */
+	0xAE, 0x5F, 0xAE, 0x60, 0xAE, 0x61, 0xAE, 0x62, /* 0x20-0x23 */
+	0xAE, 0x63, 0xC9, 0xFB, 0xAE, 0x64, 0xAE, 0x65, /* 0x24-0x27 */
+	0xD3, 0xC3, 0xCB, 0xA6, 0xAE, 0x66, 0xB8, 0xA6, /* 0x28-0x2B */
+	0xF0, 0xAE, 0xB1, 0xC2, 0xAE, 0x67, 0xE5, 0xB8, /* 0x2C-0x2F */
+	0xCC, 0xEF, 0xD3, 0xC9, 0xBC, 0xD7, 0xC9, 0xEA, /* 0x30-0x33 */
+	0xAE, 0x68, 0xB5, 0xE7, 0xAE, 0x69, 0xC4, 0xD0, /* 0x34-0x37 */
+	0xB5, 0xE9, 0xAE, 0x6A, 0xEE, 0xAE, 0xBB, 0xAD, /* 0x38-0x3B */
+	0xAE, 0x6B, 0xAE, 0x6C, 0xE7, 0xDE, 0xAE, 0x6D, /* 0x3C-0x3F */
+	0xEE, 0xAF, 0xAE, 0x6E, 0xAE, 0x6F, 0xAE, 0x70, /* 0x40-0x43 */
+	0xAE, 0x71, 0xB3, 0xA9, 0xAE, 0x72, 0xAE, 0x73, /* 0x44-0x47 */
+	0xEE, 0xB2, 0xAE, 0x74, 0xAE, 0x75, 0xEE, 0xB1, /* 0x48-0x4B */
+	0xBD, 0xE7, 0xAE, 0x76, 0xEE, 0xB0, 0xCE, 0xB7, /* 0x4C-0x4F */
+	0xAE, 0x77, 0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, /* 0x50-0x53 */
+	0xC5, 0xCF, 0xAE, 0x7B, 0xAE, 0x7C, 0xAE, 0x7D, /* 0x54-0x57 */
+	0xAE, 0x7E, 0xC1, 0xF4, 0xDB, 0xCE, 0xEE, 0xB3, /* 0x58-0x5B */
+	0xD0, 0xF3, 0xAE, 0x80, 0xAE, 0x81, 0xAE, 0x82, /* 0x5C-0x5F */
+	0xAE, 0x83, 0xAE, 0x84, 0xAE, 0x85, 0xAE, 0x86, /* 0x60-0x63 */
+	0xAE, 0x87, 0xC2, 0xD4, 0xC6, 0xE8, 0xAE, 0x88, /* 0x64-0x67 */
+	0xAE, 0x89, 0xAE, 0x8A, 0xB7, 0xAC, 0xAE, 0x8B, /* 0x68-0x6B */
+	0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, 0xAE, 0x8F, /* 0x6C-0x6F */
+	0xAE, 0x90, 0xAE, 0x91, 0xEE, 0xB4, 0xAE, 0x92, /* 0x70-0x73 */
+	0xB3, 0xEB, 0xAE, 0x93, 0xAE, 0x94, 0xAE, 0x95, /* 0x74-0x77 */
+	0xBB, 0xFB, 0xEE, 0xB5, 0xAE, 0x96, 0xAE, 0x97, /* 0x78-0x7B */
+	0xAE, 0x98, 0xAE, 0x99, 0xAE, 0x9A, 0xE7, 0xDC, /* 0x7C-0x7F */
+	
+	0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, 0xEE, 0xB6, /* 0x80-0x83 */
+	0xAE, 0x9E, 0xAE, 0x9F, 0xBD, 0xAE, 0xAE, 0xA0, /* 0x84-0x87 */
+	0xAF, 0x40, 0xAF, 0x41, 0xAF, 0x42, 0xF1, 0xE2, /* 0x88-0x8B */
+	0xAF, 0x43, 0xAF, 0x44, 0xAF, 0x45, 0xCA, 0xE8, /* 0x8C-0x8F */
+	0xAF, 0x46, 0xD2, 0xC9, 0xF0, 0xDA, 0xAF, 0x47, /* 0x90-0x93 */
+	0xF0, 0xDB, 0xAF, 0x48, 0xF0, 0xDC, 0xC1, 0xC6, /* 0x94-0x97 */
+	0xAF, 0x49, 0xB8, 0xED, 0xBE, 0xCE, 0xAF, 0x4A, /* 0x98-0x9B */
+	0xAF, 0x4B, 0xF0, 0xDE, 0xAF, 0x4C, 0xC5, 0xB1, /* 0x9C-0x9F */
+	0xF0, 0xDD, 0xD1, 0xF1, 0xAF, 0x4D, 0xF0, 0xE0, /* 0xA0-0xA3 */
+	0xB0, 0xCC, 0xBD, 0xEA, 0xAF, 0x4E, 0xAF, 0x4F, /* 0xA4-0xA7 */
+	0xAF, 0x50, 0xAF, 0x51, 0xAF, 0x52, 0xD2, 0xDF, /* 0xA8-0xAB */
+	0xF0, 0xDF, 0xAF, 0x53, 0xB4, 0xAF, 0xB7, 0xE8, /* 0xAC-0xAF */
+	0xF0, 0xE6, 0xF0, 0xE5, 0xC6, 0xA3, 0xF0, 0xE1, /* 0xB0-0xB3 */
+	0xF0, 0xE2, 0xB4, 0xC3, 0xAF, 0x54, 0xAF, 0x55, /* 0xB4-0xB7 */
+	0xF0, 0xE3, 0xD5, 0xEE, 0xAF, 0x56, 0xAF, 0x57, /* 0xB8-0xBB */
+	0xCC, 0xDB, 0xBE, 0xD2, 0xBC, 0xB2, 0xAF, 0x58, /* 0xBC-0xBF */
+	0xAF, 0x59, 0xAF, 0x5A, 0xF0, 0xE8, 0xF0, 0xE7, /* 0xC0-0xC3 */
+	0xF0, 0xE4, 0xB2, 0xA1, 0xAF, 0x5B, 0xD6, 0xA2, /* 0xC4-0xC7 */
+	0xD3, 0xB8, 0xBE, 0xB7, 0xC8, 0xAC, 0xAF, 0x5C, /* 0xC8-0xCB */
+	0xAF, 0x5D, 0xF0, 0xEA, 0xAF, 0x5E, 0xAF, 0x5F, /* 0xCC-0xCF */
+	0xAF, 0x60, 0xAF, 0x61, 0xD1, 0xF7, 0xAF, 0x62, /* 0xD0-0xD3 */
+	0xD6, 0xCC, 0xBA, 0xDB, 0xF0, 0xE9, 0xAF, 0x63, /* 0xD4-0xD7 */
+	0xB6, 0xBB, 0xAF, 0x64, 0xAF, 0x65, 0xCD, 0xB4, /* 0xD8-0xDB */
+	0xAF, 0x66, 0xAF, 0x67, 0xC6, 0xA6, 0xAF, 0x68, /* 0xDC-0xDF */
+	0xAF, 0x69, 0xAF, 0x6A, 0xC1, 0xA1, 0xF0, 0xEB, /* 0xE0-0xE3 */
+	0xF0, 0xEE, 0xAF, 0x6B, 0xF0, 0xED, 0xF0, 0xF0, /* 0xE4-0xE7 */
+	0xF0, 0xEC, 0xAF, 0x6C, 0xBB, 0xBE, 0xF0, 0xEF, /* 0xE8-0xEB */
+	0xAF, 0x6D, 0xAF, 0x6E, 0xAF, 0x6F, 0xAF, 0x70, /* 0xEC-0xEF */
+	0xCC, 0xB5, 0xF0, 0xF2, 0xAF, 0x71, 0xAF, 0x72, /* 0xF0-0xF3 */
+	0xB3, 0xD5, 0xAF, 0x73, 0xAF, 0x74, 0xAF, 0x75, /* 0xF4-0xF7 */
+	0xAF, 0x76, 0xB1, 0xD4, 0xAF, 0x77, 0xAF, 0x78, /* 0xF8-0xFB */
+	0xF0, 0xF3, 0xAF, 0x79, 0xAF, 0x7A, 0xF0, 0xF4, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_76[512] = {
+	0xF0, 0xF6, 0xB4, 0xE1, 0xAF, 0x7B, 0xF0, 0xF1, /* 0x00-0x03 */
+	0xAF, 0x7C, 0xF0, 0xF7, 0xAF, 0x7D, 0xAF, 0x7E, /* 0x04-0x07 */
+	0xAF, 0x80, 0xAF, 0x81, 0xF0, 0xFA, 0xAF, 0x82, /* 0x08-0x0B */
+	0xF0, 0xF8, 0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, /* 0x0C-0x0F */
+	0xF0, 0xF5, 0xAF, 0x86, 0xAF, 0x87, 0xAF, 0x88, /* 0x10-0x13 */
+	0xAF, 0x89, 0xF0, 0xFD, 0xAF, 0x8A, 0xF0, 0xF9, /* 0x14-0x17 */
+	0xF0, 0xFC, 0xF0, 0xFE, 0xAF, 0x8B, 0xF1, 0xA1, /* 0x18-0x1B */
+	0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, 0xCE, 0xC1, /* 0x1C-0x1F */
+	0xF1, 0xA4, 0xAF, 0x8F, 0xF1, 0xA3, 0xAF, 0x90, /* 0x20-0x23 */
+	0xC1, 0xF6, 0xF0, 0xFB, 0xCA, 0xDD, 0xAF, 0x91, /* 0x24-0x27 */
+	0xAF, 0x92, 0xB4, 0xF1, 0xB1, 0xF1, 0xCC, 0xB1, /* 0x28-0x2B */
+	0xAF, 0x93, 0xF1, 0xA6, 0xAF, 0x94, 0xAF, 0x95, /* 0x2C-0x2F */
+	0xF1, 0xA7, 0xAF, 0x96, 0xAF, 0x97, 0xF1, 0xAC, /* 0x30-0x33 */
+	0xD5, 0xCE, 0xF1, 0xA9, 0xAF, 0x98, 0xAF, 0x99, /* 0x34-0x37 */
+	0xC8, 0xB3, 0xAF, 0x9A, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x38-0x3B */
+	0xF1, 0xA2, 0xAF, 0x9D, 0xF1, 0xAB, 0xF1, 0xA8, /* 0x3C-0x3F */
+	0xF1, 0xA5, 0xAF, 0x9E, 0xAF, 0x9F, 0xF1, 0xAA, /* 0x40-0x43 */
+	0xAF, 0xA0, 0xB0, 0x40, 0xB0, 0x41, 0xB0, 0x42, /* 0x44-0x47 */
+	0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x48-0x4B */
+	0xB0, 0xA9, 0xF1, 0xAD, 0xB0, 0x47, 0xB0, 0x48, /* 0x4C-0x4F */
+	0xB0, 0x49, 0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, /* 0x50-0x53 */
+	0xF1, 0xAF, 0xB0, 0x4D, 0xF1, 0xB1, 0xB0, 0x4E, /* 0x54-0x57 */
+	0xB0, 0x4F, 0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, /* 0x58-0x5B */
+	0xF1, 0xB0, 0xB0, 0x53, 0xF1, 0xAE, 0xB0, 0x54, /* 0x5C-0x5F */
+	0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, 0xD1, 0xA2, /* 0x60-0x63 */
+	0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x5B, /* 0x64-0x67 */
+	0xB0, 0x5C, 0xB0, 0x5D, 0xB0, 0x5E, 0xF1, 0xB2, /* 0x68-0x6B */
+	0xB0, 0x5F, 0xB0, 0x60, 0xB0, 0x61, 0xF1, 0xB3, /* 0x6C-0x6F */
+	0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0x70-0x73 */
+	0xB0, 0x66, 0xB0, 0x67, 0xB0, 0x68, 0xB0, 0x69, /* 0x74-0x77 */
+	0xB9, 0xEF, 0xB0, 0x6A, 0xB0, 0x6B, 0xB5, 0xC7, /* 0x78-0x7B */
+	0xB0, 0x6C, 0xB0, 0xD7, 0xB0, 0xD9, 0xB0, 0x6D, /* 0x7C-0x7F */
+	
+	0xB0, 0x6E, 0xB0, 0x6F, 0xD4, 0xED, 0xB0, 0x70, /* 0x80-0x83 */
+	0xB5, 0xC4, 0xB0, 0x71, 0xBD, 0xD4, 0xBB, 0xCA, /* 0x84-0x87 */
+	0xF0, 0xA7, 0xB0, 0x72, 0xB0, 0x73, 0xB8, 0xDE, /* 0x88-0x8B */
+	0xB0, 0x74, 0xB0, 0x75, 0xF0, 0xA8, 0xB0, 0x76, /* 0x8C-0x8F */
+	0xB0, 0x77, 0xB0, 0xA8, 0xB0, 0x78, 0xF0, 0xA9, /* 0x90-0x93 */
+	0xB0, 0x79, 0xB0, 0x7A, 0xCD, 0xEE, 0xB0, 0x7B, /* 0x94-0x97 */
+	0xB0, 0x7C, 0xF0, 0xAA, 0xB0, 0x7D, 0xB0, 0x7E, /* 0x98-0x9B */
+	0xB0, 0x80, 0xB0, 0x81, 0xB0, 0x82, 0xB0, 0x83, /* 0x9C-0x9F */
+	0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, 0xB0, 0x87, /* 0xA0-0xA3 */
+	0xF0, 0xAB, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xA4-0xA7 */
+	0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xA8-0xAB */
+	0xB0, 0x8F, 0xB0, 0x90, 0xC6, 0xA4, 0xB0, 0x91, /* 0xAC-0xAF */
+	0xB0, 0x92, 0xD6, 0xE5, 0xF1, 0xE4, 0xB0, 0x93, /* 0xB0-0xB3 */
+	0xF1, 0xE5, 0xB0, 0x94, 0xB0, 0x95, 0xB0, 0x96, /* 0xB4-0xB7 */
+	0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, 0xB0, 0x9A, /* 0xB8-0xBB */
+	0xB0, 0x9B, 0xB0, 0x9C, 0xB0, 0x9D, 0xC3, 0xF3, /* 0xBC-0xBF */
+	0xB0, 0x9E, 0xB0, 0x9F, 0xD3, 0xDB, 0xB0, 0xA0, /* 0xC0-0xC3 */
+	0xB1, 0x40, 0xD6, 0xD1, 0xC5, 0xE8, 0xB1, 0x41, /* 0xC4-0xC7 */
+	0xD3, 0xAF, 0xB1, 0x42, 0xD2, 0xE6, 0xB1, 0x43, /* 0xC8-0xCB */
+	0xB1, 0x44, 0xEE, 0xC1, 0xB0, 0xBB, 0xD5, 0xB5, /* 0xCC-0xCF */
+	0xD1, 0xCE, 0xBC, 0xE0, 0xBA, 0xD0, 0xB1, 0x45, /* 0xD0-0xD3 */
+	0xBF, 0xF8, 0xB1, 0x46, 0xB8, 0xC7, 0xB5, 0xC1, /* 0xD4-0xD7 */
+	0xC5, 0xCC, 0xB1, 0x47, 0xB1, 0x48, 0xCA, 0xA2, /* 0xD8-0xDB */
+	0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xC3, 0xCB, /* 0xDC-0xDF */
+	0xB1, 0x4C, 0xB1, 0x4D, 0xB1, 0x4E, 0xB1, 0x4F, /* 0xE0-0xE3 */
+	0xB1, 0x50, 0xEE, 0xC2, 0xB1, 0x51, 0xB1, 0x52, /* 0xE4-0xE7 */
+	0xB1, 0x53, 0xB1, 0x54, 0xB1, 0x55, 0xB1, 0x56, /* 0xE8-0xEB */
+	0xB1, 0x57, 0xB1, 0x58, 0xC4, 0xBF, 0xB6, 0xA2, /* 0xEC-0xEF */
+	0xB1, 0x59, 0xED, 0xEC, 0xC3, 0xA4, 0xB1, 0x5A, /* 0xF0-0xF3 */
+	0xD6, 0xB1, 0xB1, 0x5B, 0xB1, 0x5C, 0xB1, 0x5D, /* 0xF4-0xF7 */
+	0xCF, 0xE0, 0xED, 0xEF, 0xB1, 0x5E, 0xB1, 0x5F, /* 0xF8-0xFB */
+	0xC5, 0xCE, 0xB1, 0x60, 0xB6, 0xDC, 0xB1, 0x61, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_77[512] = {
+	0xB1, 0x62, 0xCA, 0xA1, 0xB1, 0x63, 0xB1, 0x64, /* 0x00-0x03 */
+	0xED, 0xED, 0xB1, 0x65, 0xB1, 0x66, 0xED, 0xF0, /* 0x04-0x07 */
+	0xED, 0xF1, 0xC3, 0xBC, 0xB1, 0x67, 0xBF, 0xB4, /* 0x08-0x0B */
+	0xB1, 0x68, 0xED, 0xEE, 0xB1, 0x69, 0xB1, 0x6A, /* 0x0C-0x0F */
+	0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x10-0x13 */
+	0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, 0xB1, 0x72, /* 0x14-0x17 */
+	0xB1, 0x73, 0xED, 0xF4, 0xED, 0xF2, 0xB1, 0x74, /* 0x18-0x1B */
+	0xB1, 0x75, 0xB1, 0x76, 0xB1, 0x77, 0xD5, 0xE6, /* 0x1C-0x1F */
+	0xC3, 0xDF, 0xB1, 0x78, 0xED, 0xF3, 0xB1, 0x79, /* 0x20-0x23 */
+	0xB1, 0x7A, 0xB1, 0x7B, 0xED, 0xF6, 0xB1, 0x7C, /* 0x24-0x27 */
+	0xD5, 0xA3, 0xD1, 0xA3, 0xB1, 0x7D, 0xB1, 0x7E, /* 0x28-0x2B */
+	0xB1, 0x80, 0xED, 0xF5, 0xB1, 0x81, 0xC3, 0xD0, /* 0x2C-0x2F */
+	0xB1, 0x82, 0xB1, 0x83, 0xB1, 0x84, 0xB1, 0x85, /* 0x30-0x33 */
+	0xB1, 0x86, 0xED, 0xF7, 0xBF, 0xF4, 0xBE, 0xEC, /* 0x34-0x37 */
+	0xED, 0xF8, 0xB1, 0x87, 0xCC, 0xF7, 0xB1, 0x88, /* 0x38-0x3B */
+	0xD1, 0xDB, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x3C-0x3F */
+	0xD7, 0xC5, 0xD5, 0xF6, 0xB1, 0x8C, 0xED, 0xFC, /* 0x40-0x43 */
+	0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, 0xED, 0xFB, /* 0x44-0x47 */
+	0xB1, 0x90, 0xB1, 0x91, 0xB1, 0x92, 0xB1, 0x93, /* 0x48-0x4B */
+	0xB1, 0x94, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x4C-0x4F */
+	0xED, 0xF9, 0xED, 0xFA, 0xB1, 0x98, 0xB1, 0x99, /* 0x50-0x53 */
+	0xB1, 0x9A, 0xB1, 0x9B, 0xB1, 0x9C, 0xB1, 0x9D, /* 0x54-0x57 */
+	0xB1, 0x9E, 0xB1, 0x9F, 0xED, 0xFD, 0xBE, 0xA6, /* 0x58-0x5B */
+	0xB1, 0xA0, 0xB2, 0x40, 0xB2, 0x41, 0xB2, 0x42, /* 0x5C-0x5F */
+	0xB2, 0x43, 0xCB, 0xAF, 0xEE, 0xA1, 0xB6, 0xBD, /* 0x60-0x63 */
+	0xB2, 0x44, 0xEE, 0xA2, 0xC4, 0xC0, 0xB2, 0x45, /* 0x64-0x67 */
+	0xED, 0xFE, 0xB2, 0x46, 0xB2, 0x47, 0xBD, 0xDE, /* 0x68-0x6B */
+	0xB2, 0xC7, 0xB2, 0x48, 0xB2, 0x49, 0xB2, 0x4A, /* 0x6C-0x6F */
+	0xB2, 0x4B, 0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, /* 0x70-0x73 */
+	0xB2, 0x4F, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x74-0x77 */
+	0xB2, 0x53, 0xB6, 0xC3, 0xB2, 0x54, 0xB2, 0x55, /* 0x78-0x7B */
+	0xB2, 0x56, 0xEE, 0xA5, 0xD8, 0xBA, 0xEE, 0xA3, /* 0x7C-0x7F */
+	
+	0xEE, 0xA6, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x80-0x83 */
+	0xC3, 0xE9, 0xB3, 0xF2, 0xB2, 0x5A, 0xB2, 0x5B, /* 0x84-0x87 */
+	0xB2, 0x5C, 0xB2, 0x5D, 0xB2, 0x5E, 0xB2, 0x5F, /* 0x88-0x8B */
+	0xEE, 0xA7, 0xEE, 0xA4, 0xCF, 0xB9, 0xB2, 0x60, /* 0x8C-0x8F */
+	0xB2, 0x61, 0xEE, 0xA8, 0xC2, 0xF7, 0xB2, 0x62, /* 0x90-0x93 */
+	0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x94-0x97 */
+	0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x98-0x9B */
+	0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xEE, 0xA9, /* 0x9C-0x9F */
+	0xEE, 0xAA, 0xB2, 0x6E, 0xDE, 0xAB, 0xB2, 0x6F, /* 0xA0-0xA3 */
+	0xB2, 0x70, 0xC6, 0xB3, 0xB2, 0x71, 0xC7, 0xC6, /* 0xA4-0xA7 */
+	0xB2, 0x72, 0xD6, 0xF5, 0xB5, 0xC9, 0xB2, 0x73, /* 0xA8-0xAB */
+	0xCB, 0xB2, 0xB2, 0x74, 0xB2, 0x75, 0xB2, 0x76, /* 0xAC-0xAF */
+	0xEE, 0xAB, 0xB2, 0x77, 0xB2, 0x78, 0xCD, 0xAB, /* 0xB0-0xB3 */
+	0xB2, 0x79, 0xEE, 0xAC, 0xB2, 0x7A, 0xB2, 0x7B, /* 0xB4-0xB7 */
+	0xB2, 0x7C, 0xB2, 0x7D, 0xB2, 0x7E, 0xD5, 0xB0, /* 0xB8-0xBB */
+	0xB2, 0x80, 0xEE, 0xAD, 0xB2, 0x81, 0xF6, 0xC4, /* 0xBC-0xBF */
+	0xB2, 0x82, 0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, /* 0xC0-0xC3 */
+	0xB2, 0x86, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xC4-0xC7 */
+	0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xC8-0xCB */
+	0xB2, 0x8E, 0xDB, 0xC7, 0xB2, 0x8F, 0xB2, 0x90, /* 0xCC-0xCF */
+	0xB2, 0x91, 0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, /* 0xD0-0xD3 */
+	0xB2, 0x95, 0xB2, 0x96, 0xB2, 0x97, 0xB4, 0xA3, /* 0xD4-0xD7 */
+	0xB2, 0x98, 0xB2, 0x99, 0xB2, 0x9A, 0xC3, 0xAC, /* 0xD8-0xDB */
+	0xF1, 0xE6, 0xB2, 0x9B, 0xB2, 0x9C, 0xB2, 0x9D, /* 0xDC-0xDF */
+	0xB2, 0x9E, 0xB2, 0x9F, 0xCA, 0xB8, 0xD2, 0xD3, /* 0xE0-0xE3 */
+	0xB2, 0xA0, 0xD6, 0xAA, 0xB3, 0x40, 0xEF, 0xF2, /* 0xE4-0xE7 */
+	0xB3, 0x41, 0xBE, 0xD8, 0xB3, 0x42, 0xBD, 0xC3, /* 0xE8-0xEB */
+	0xEF, 0xF3, 0xB6, 0xCC, 0xB0, 0xAB, 0xB3, 0x43, /* 0xEC-0xEF */
+	0xB3, 0x44, 0xB3, 0x45, 0xB3, 0x46, 0xCA, 0xAF, /* 0xF0-0xF3 */
+	0xB3, 0x47, 0xB3, 0x48, 0xED, 0xB6, 0xB3, 0x49, /* 0xF4-0xF7 */
+	0xED, 0xB7, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xF8-0xFB */
+	0xB3, 0x4D, 0xCE, 0xF9, 0xB7, 0xAF, 0xBF, 0xF3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_78[512] = {
+	0xED, 0xB8, 0xC2, 0xEB, 0xC9, 0xB0, 0xB3, 0x4E, /* 0x00-0x03 */
+	0xB3, 0x4F, 0xB3, 0x50, 0xB3, 0x51, 0xB3, 0x52, /* 0x04-0x07 */
+	0xB3, 0x53, 0xED, 0xB9, 0xB3, 0x54, 0xB3, 0x55, /* 0x08-0x0B */
+	0xC6, 0xF6, 0xBF, 0xB3, 0xB3, 0x56, 0xB3, 0x57, /* 0x0C-0x0F */
+	0xB3, 0x58, 0xED, 0xBC, 0xC5, 0xF8, 0xB3, 0x59, /* 0x10-0x13 */
+	0xD1, 0xD0, 0xB3, 0x5A, 0xD7, 0xA9, 0xED, 0xBA, /* 0x14-0x17 */
+	0xED, 0xBB, 0xB3, 0x5B, 0xD1, 0xE2, 0xB3, 0x5C, /* 0x18-0x1B */
+	0xED, 0xBF, 0xED, 0xC0, 0xB3, 0x5D, 0xED, 0xC4, /* 0x1C-0x1F */
+	0xB3, 0x5E, 0xB3, 0x5F, 0xB3, 0x60, 0xED, 0xC8, /* 0x20-0x23 */
+	0xB3, 0x61, 0xED, 0xC6, 0xED, 0xCE, 0xD5, 0xE8, /* 0x24-0x27 */
+	0xB3, 0x62, 0xED, 0xC9, 0xB3, 0x63, 0xB3, 0x64, /* 0x28-0x2B */
+	0xED, 0xC7, 0xED, 0xBE, 0xB3, 0x65, 0xB3, 0x66, /* 0x2C-0x2F */
+	0xC5, 0xE9, 0xB3, 0x67, 0xB3, 0x68, 0xB3, 0x69, /* 0x30-0x33 */
+	0xC6, 0xC6, 0xB3, 0x6A, 0xB3, 0x6B, 0xC9, 0xE9, /* 0x34-0x37 */
+	0xD4, 0xD2, 0xED, 0xC1, 0xED, 0xC2, 0xED, 0xC3, /* 0x38-0x3B */
+	0xED, 0xC5, 0xB3, 0x6C, 0xC0, 0xF9, 0xB3, 0x6D, /* 0x3C-0x3F */
+	0xB4, 0xA1, 0xB3, 0x6E, 0xB3, 0x6F, 0xB3, 0x70, /* 0x40-0x43 */
+	0xB3, 0x71, 0xB9, 0xE8, 0xB3, 0x72, 0xED, 0xD0, /* 0x44-0x47 */
+	0xB3, 0x73, 0xB3, 0x74, 0xB3, 0x75, 0xB3, 0x76, /* 0x48-0x4B */
+	0xED, 0xD1, 0xB3, 0x77, 0xED, 0xCA, 0xB3, 0x78, /* 0x4C-0x4F */
+	0xED, 0xCF, 0xB3, 0x79, 0xCE, 0xF8, 0xB3, 0x7A, /* 0x50-0x53 */
+	0xB3, 0x7B, 0xCB, 0xB6, 0xED, 0xCC, 0xED, 0xCD, /* 0x54-0x57 */
+	0xB3, 0x7C, 0xB3, 0x7D, 0xB3, 0x7E, 0xB3, 0x80, /* 0x58-0x5B */
+	0xB3, 0x81, 0xCF, 0xF5, 0xB3, 0x82, 0xB3, 0x83, /* 0x5C-0x5F */
+	0xB3, 0x84, 0xB3, 0x85, 0xB3, 0x86, 0xB3, 0x87, /* 0x60-0x63 */
+	0xB3, 0x88, 0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, /* 0x64-0x67 */
+	0xB3, 0x8C, 0xB3, 0x8D, 0xED, 0xD2, 0xC1, 0xF2, /* 0x68-0x6B */
+	0xD3, 0xB2, 0xED, 0xCB, 0xC8, 0xB7, 0xB3, 0x8E, /* 0x6C-0x6F */
+	0xB3, 0x8F, 0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, /* 0x70-0x73 */
+	0xB3, 0x93, 0xB3, 0x94, 0xB3, 0x95, 0xBC, 0xEF, /* 0x74-0x77 */
+	0xB3, 0x96, 0xB3, 0x97, 0xB3, 0x98, 0xB3, 0x99, /* 0x78-0x7B */
+	0xC5, 0xF0, 0xB3, 0x9A, 0xB3, 0x9B, 0xB3, 0x9C, /* 0x7C-0x7F */
+	
+	0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, 0xB3, 0xA0, /* 0x80-0x83 */
+	0xB4, 0x40, 0xB4, 0x41, 0xB4, 0x42, 0xED, 0xD6, /* 0x84-0x87 */
+	0xB4, 0x43, 0xB5, 0xEF, 0xB4, 0x44, 0xB4, 0x45, /* 0x88-0x8B */
+	0xC2, 0xB5, 0xB0, 0xAD, 0xCB, 0xE9, 0xB4, 0x46, /* 0x8C-0x8F */
+	0xB4, 0x47, 0xB1, 0xAE, 0xB4, 0x48, 0xED, 0xD4, /* 0x90-0x93 */
+	0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, 0xCD, 0xEB, /* 0x94-0x97 */
+	0xB5, 0xE2, 0xB4, 0x4C, 0xED, 0xD5, 0xED, 0xD3, /* 0x98-0x9B */
+	0xED, 0xD7, 0xB4, 0x4D, 0xB4, 0x4E, 0xB5, 0xFA, /* 0x9C-0x9F */
+	0xB4, 0x4F, 0xED, 0xD8, 0xB4, 0x50, 0xED, 0xD9, /* 0xA0-0xA3 */
+	0xB4, 0x51, 0xED, 0xDC, 0xB4, 0x52, 0xB1, 0xCC, /* 0xA4-0xA7 */
+	0xB4, 0x53, 0xB4, 0x54, 0xB4, 0x55, 0xB4, 0x56, /* 0xA8-0xAB */
+	0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0xAC-0xAF */
+	0xC5, 0xF6, 0xBC, 0xEE, 0xED, 0xDA, 0xCC, 0xBC, /* 0xB0-0xB3 */
+	0xB2, 0xEA, 0xB4, 0x5B, 0xB4, 0x5C, 0xB4, 0x5D, /* 0xB4-0xB7 */
+	0xB4, 0x5E, 0xED, 0xDB, 0xB4, 0x5F, 0xB4, 0x60, /* 0xB8-0xBB */
+	0xB4, 0x61, 0xB4, 0x62, 0xC4, 0xEB, 0xB4, 0x63, /* 0xBC-0xBF */
+	0xB4, 0x64, 0xB4, 0xC5, 0xB4, 0x65, 0xB4, 0x66, /* 0xC0-0xC3 */
+	0xB4, 0x67, 0xB0, 0xF5, 0xB4, 0x68, 0xB4, 0x69, /* 0xC4-0xC7 */
+	0xB4, 0x6A, 0xED, 0xDF, 0xC0, 0xDA, 0xB4, 0xE8, /* 0xC8-0xCB */
+	0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, 0xB4, 0x6E, /* 0xCC-0xCF */
+	0xC5, 0xCD, 0xB4, 0x6F, 0xB4, 0x70, 0xB4, 0x71, /* 0xD0-0xD3 */
+	0xED, 0xDD, 0xBF, 0xC4, 0xB4, 0x72, 0xB4, 0x73, /* 0xD4-0xD7 */
+	0xB4, 0x74, 0xED, 0xDE, 0xB4, 0x75, 0xB4, 0x76, /* 0xD8-0xDB */
+	0xB4, 0x77, 0xB4, 0x78, 0xB4, 0x79, 0xB4, 0x7A, /* 0xDC-0xDF */
+	0xB4, 0x7B, 0xB4, 0x7C, 0xB4, 0x7D, 0xB4, 0x7E, /* 0xE0-0xE3 */
+	0xB4, 0x80, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0xE4-0xE7 */
+	0xC4, 0xA5, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0xE8-0xEB */
+	0xED, 0xE0, 0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, /* 0xEC-0xEF */
+	0xB4, 0x8A, 0xB4, 0x8B, 0xED, 0xE1, 0xB4, 0x8C, /* 0xF0-0xF3 */
+	0xED, 0xE3, 0xB4, 0x8D, 0xB4, 0x8E, 0xC1, 0xD7, /* 0xF4-0xF7 */
+	0xB4, 0x8F, 0xB4, 0x90, 0xBB, 0xC7, 0xB4, 0x91, /* 0xF8-0xFB */
+	0xB4, 0x92, 0xB4, 0x93, 0xB4, 0x94, 0xB4, 0x95, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_79[512] = {
+	0xB4, 0x96, 0xBD, 0xB8, 0xB4, 0x97, 0xB4, 0x98, /* 0x00-0x03 */
+	0xB4, 0x99, 0xED, 0xE2, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x04-0x07 */
+	0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x08-0x0B */
+	0xB4, 0xA0, 0xB5, 0x40, 0xB5, 0x41, 0xB5, 0x42, /* 0x0C-0x0F */
+	0xB5, 0x43, 0xB5, 0x44, 0xB5, 0x45, 0xED, 0xE4, /* 0x10-0x13 */
+	0xB5, 0x46, 0xB5, 0x47, 0xB5, 0x48, 0xB5, 0x49, /* 0x14-0x17 */
+	0xB5, 0x4A, 0xB5, 0x4B, 0xB5, 0x4C, 0xB5, 0x4D, /* 0x18-0x1B */
+	0xB5, 0x4E, 0xB5, 0x4F, 0xED, 0xE6, 0xB5, 0x50, /* 0x1C-0x1F */
+	0xB5, 0x51, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0x20-0x23 */
+	0xED, 0xE5, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0x24-0x27 */
+	0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x5B, /* 0x28-0x2B */
+	0xB5, 0x5C, 0xB5, 0x5D, 0xB5, 0x5E, 0xB5, 0x5F, /* 0x2C-0x2F */
+	0xB5, 0x60, 0xB5, 0x61, 0xB5, 0x62, 0xB5, 0x63, /* 0x30-0x33 */
+	0xED, 0xE7, 0xB5, 0x64, 0xB5, 0x65, 0xB5, 0x66, /* 0x34-0x37 */
+	0xB5, 0x67, 0xB5, 0x68, 0xCA, 0xBE, 0xEC, 0xEA, /* 0x38-0x3B */
+	0xC0, 0xF1, 0xB5, 0x69, 0xC9, 0xE7, 0xB5, 0x6A, /* 0x3C-0x3F */
+	0xEC, 0xEB, 0xC6, 0xEE, 0xB5, 0x6B, 0xB5, 0x6C, /* 0x40-0x43 */
+	0xB5, 0x6D, 0xB5, 0x6E, 0xEC, 0xEC, 0xB5, 0x6F, /* 0x44-0x47 */
+	0xC6, 0xED, 0xEC, 0xED, 0xB5, 0x70, 0xB5, 0x71, /* 0x48-0x4B */
+	0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, 0xB5, 0x75, /* 0x4C-0x4F */
+	0xB5, 0x76, 0xB5, 0x77, 0xB5, 0x78, 0xEC, 0xF0, /* 0x50-0x53 */
+	0xB5, 0x79, 0xB5, 0x7A, 0xD7, 0xE6, 0xEC, 0xF3, /* 0x54-0x57 */
+	0xB5, 0x7B, 0xB5, 0x7C, 0xEC, 0xF1, 0xEC, 0xEE, /* 0x58-0x5B */
+	0xEC, 0xEF, 0xD7, 0xA3, 0xC9, 0xF1, 0xCB, 0xEE, /* 0x5C-0x5F */
+	0xEC, 0xF4, 0xB5, 0x7D, 0xEC, 0xF2, 0xB5, 0x7E, /* 0x60-0x63 */
+	0xB5, 0x80, 0xCF, 0xE9, 0xB5, 0x81, 0xEC, 0xF6, /* 0x64-0x67 */
+	0xC6, 0xB1, 0xB5, 0x82, 0xB5, 0x83, 0xB5, 0x84, /* 0x68-0x6B */
+	0xB5, 0x85, 0xBC, 0xC0, 0xB5, 0x86, 0xEC, 0xF5, /* 0x6C-0x6F */
+	0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, 0xB5, 0x8A, /* 0x70-0x73 */
+	0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, 0xB5, 0xBB, /* 0x74-0x77 */
+	0xBB, 0xF6, 0xB5, 0x8E, 0xEC, 0xF7, 0xB5, 0x8F, /* 0x78-0x7B */
+	0xB5, 0x90, 0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, /* 0x7C-0x7F */
+	
+	0xD9, 0xF7, 0xBD, 0xFB, 0xB5, 0x94, 0xB5, 0x95, /* 0x80-0x83 */
+	0xC2, 0xBB, 0xEC, 0xF8, 0xB5, 0x96, 0xB5, 0x97, /* 0x84-0x87 */
+	0xB5, 0x98, 0xB5, 0x99, 0xEC, 0xF9, 0xB5, 0x9A, /* 0x88-0x8B */
+	0xB5, 0x9B, 0xB5, 0x9C, 0xB5, 0x9D, 0xB8, 0xA3, /* 0x8C-0x8F */
+	0xB5, 0x9E, 0xB5, 0x9F, 0xB5, 0xA0, 0xB6, 0x40, /* 0x90-0x93 */
+	0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, 0xB6, 0x44, /* 0x94-0x97 */
+	0xB6, 0x45, 0xB6, 0x46, 0xEC, 0xFA, 0xB6, 0x47, /* 0x98-0x9B */
+	0xB6, 0x48, 0xB6, 0x49, 0xB6, 0x4A, 0xB6, 0x4B, /* 0x9C-0x9F */
+	0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, 0xB6, 0x4F, /* 0xA0-0xA3 */
+	0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, 0xEC, 0xFB, /* 0xA4-0xA7 */
+	0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0xA8-0xAB */
+	0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0xAC-0xAF */
+	0xB6, 0x5B, 0xB6, 0x5C, 0xB6, 0x5D, 0xEC, 0xFC, /* 0xB0-0xB3 */
+	0xB6, 0x5E, 0xB6, 0x5F, 0xB6, 0x60, 0xB6, 0x61, /* 0xB4-0xB7 */
+	0xB6, 0x62, 0xD3, 0xED, 0xD8, 0xAE, 0xC0, 0xEB, /* 0xB8-0xBB */
+	0xB6, 0x63, 0xC7, 0xDD, 0xBA, 0xCC, 0xB6, 0x64, /* 0xBC-0xBF */
+	0xD0, 0xE3, 0xCB, 0xBD, 0xB6, 0x65, 0xCD, 0xBA, /* 0xC0-0xC3 */
+	0xB6, 0x66, 0xB6, 0x67, 0xB8, 0xD1, 0xB6, 0x68, /* 0xC4-0xC7 */
+	0xB6, 0x69, 0xB1, 0xFC, 0xB6, 0x6A, 0xC7, 0xEF, /* 0xC8-0xCB */
+	0xB6, 0x6B, 0xD6, 0xD6, 0xB6, 0x6C, 0xB6, 0x6D, /* 0xCC-0xCF */
+	0xB6, 0x6E, 0xBF, 0xC6, 0xC3, 0xEB, 0xB6, 0x6F, /* 0xD0-0xD3 */
+	0xB6, 0x70, 0xEF, 0xF5, 0xB6, 0x71, 0xB6, 0x72, /* 0xD4-0xD7 */
+	0xC3, 0xD8, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0xD8-0xDB */
+	0xB6, 0x76, 0xB6, 0x77, 0xB6, 0x78, 0xD7, 0xE2, /* 0xDC-0xDF */
+	0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x7B, 0xEF, 0xF7, /* 0xE0-0xE3 */
+	0xB3, 0xD3, 0xB6, 0x7C, 0xC7, 0xD8, 0xD1, 0xED, /* 0xE4-0xE7 */
+	0xB6, 0x7D, 0xD6, 0xC8, 0xB6, 0x7E, 0xEF, 0xF8, /* 0xE8-0xEB */
+	0xB6, 0x80, 0xEF, 0xF6, 0xB6, 0x81, 0xBB, 0xFD, /* 0xEC-0xEF */
+	0xB3, 0xC6, 0xB6, 0x82, 0xB6, 0x83, 0xB6, 0x84, /* 0xF0-0xF3 */
+	0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0xF4-0xF7 */
+	0xBD, 0xD5, 0xB6, 0x89, 0xB6, 0x8A, 0xD2, 0xC6, /* 0xF8-0xFB */
+	0xB6, 0x8B, 0xBB, 0xE0, 0xB6, 0x8C, 0xB6, 0x8D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7A[512] = {
+	0xCF, 0xA1, 0xB6, 0x8E, 0xEF, 0xFC, 0xEF, 0xFB, /* 0x00-0x03 */
+	0xB6, 0x8F, 0xB6, 0x90, 0xEF, 0xF9, 0xB6, 0x91, /* 0x04-0x07 */
+	0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, 0xB3, 0xCC, /* 0x08-0x0B */
+	0xB6, 0x95, 0xC9, 0xD4, 0xCB, 0xB0, 0xB6, 0x96, /* 0x0C-0x0F */
+	0xB6, 0x97, 0xB6, 0x98, 0xB6, 0x99, 0xB6, 0x9A, /* 0x10-0x13 */
+	0xEF, 0xFE, 0xB6, 0x9B, 0xB6, 0x9C, 0xB0, 0xDE, /* 0x14-0x17 */
+	0xB6, 0x9D, 0xB6, 0x9E, 0xD6, 0xC9, 0xB6, 0x9F, /* 0x18-0x1B */
+	0xB6, 0xA0, 0xB7, 0x40, 0xEF, 0xFD, 0xB7, 0x41, /* 0x1C-0x1F */
+	0xB3, 0xED, 0xB7, 0x42, 0xB7, 0x43, 0xF6, 0xD5, /* 0x20-0x23 */
+	0xB7, 0x44, 0xB7, 0x45, 0xB7, 0x46, 0xB7, 0x47, /* 0x24-0x27 */
+	0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, 0xB7, 0x4B, /* 0x28-0x2B */
+	0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, 0xB7, 0x4F, /* 0x2C-0x2F */
+	0xB7, 0x50, 0xB7, 0x51, 0xB7, 0x52, 0xCE, 0xC8, /* 0x30-0x33 */
+	0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, 0xF0, 0xA2, /* 0x34-0x37 */
+	0xB7, 0x56, 0xF0, 0xA1, 0xB7, 0x57, 0xB5, 0xBE, /* 0x38-0x3B */
+	0xBC, 0xDA, 0xBB, 0xFC, 0xB7, 0x58, 0xB8, 0xE5, /* 0x3C-0x3F */
+	0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x5B, 0xB7, 0x5C, /* 0x40-0x43 */
+	0xB7, 0x5D, 0xB7, 0x5E, 0xC4, 0xC2, 0xB7, 0x5F, /* 0x44-0x47 */
+	0xB7, 0x60, 0xB7, 0x61, 0xB7, 0x62, 0xB7, 0x63, /* 0x48-0x4B */
+	0xB7, 0x64, 0xB7, 0x65, 0xB7, 0x66, 0xB7, 0x67, /* 0x4C-0x4F */
+	0xB7, 0x68, 0xF0, 0xA3, 0xB7, 0x69, 0xB7, 0x6A, /* 0x50-0x53 */
+	0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, 0xCB, 0xEB, /* 0x54-0x57 */
+	0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x58-0x5B */
+	0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x5C-0x5F */
+	0xB7, 0x76, 0xB7, 0x77, 0xB7, 0x78, 0xB7, 0x79, /* 0x60-0x63 */
+	0xB7, 0x7A, 0xB7, 0x7B, 0xB7, 0x7C, 0xB7, 0x7D, /* 0x64-0x67 */
+	0xB7, 0x7E, 0xB7, 0x80, 0xB7, 0x81, 0xB7, 0x82, /* 0x68-0x6B */
+	0xB7, 0x83, 0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, /* 0x6C-0x6F */
+	0xF0, 0xA6, 0xB7, 0x87, 0xB7, 0x88, 0xB7, 0x89, /* 0x70-0x73 */
+	0xD1, 0xA8, 0xB7, 0x8A, 0xBE, 0xBF, 0xC7, 0xEE, /* 0x74-0x77 */
+	0xF1, 0xB6, 0xF1, 0xB7, 0xBF, 0xD5, 0xB7, 0x8B, /* 0x78-0x7B */
+	0xB7, 0x8C, 0xB7, 0x8D, 0xB7, 0x8E, 0xB4, 0xA9, /* 0x7C-0x7F */
+	
+	0xF1, 0xB8, 0xCD, 0xBB, 0xB7, 0x8F, 0xC7, 0xD4, /* 0x80-0x83 */
+	0xD5, 0xAD, 0xB7, 0x90, 0xF1, 0xB9, 0xB7, 0x91, /* 0x84-0x87 */
+	0xF1, 0xBA, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0x88-0x8B */
+	0xB7, 0x95, 0xC7, 0xCF, 0xB7, 0x96, 0xB7, 0x97, /* 0x8C-0x8F */
+	0xB7, 0x98, 0xD2, 0xA4, 0xD6, 0xCF, 0xB7, 0x99, /* 0x90-0x93 */
+	0xB7, 0x9A, 0xF1, 0xBB, 0xBD, 0xD1, 0xB4, 0xB0, /* 0x94-0x97 */
+	0xBE, 0xBD, 0xB7, 0x9B, 0xB7, 0x9C, 0xB7, 0x9D, /* 0x98-0x9B */
+	0xB4, 0xDC, 0xCE, 0xD1, 0xB7, 0x9E, 0xBF, 0xDF, /* 0x9C-0x9F */
+	0xF1, 0xBD, 0xB7, 0x9F, 0xB7, 0xA0, 0xB8, 0x40, /* 0xA0-0xA3 */
+	0xB8, 0x41, 0xBF, 0xFA, 0xF1, 0xBC, 0xB8, 0x42, /* 0xA4-0xA7 */
+	0xF1, 0xBF, 0xB8, 0x43, 0xB8, 0x44, 0xB8, 0x45, /* 0xA8-0xAB */
+	0xF1, 0xBE, 0xF1, 0xC0, 0xB8, 0x46, 0xB8, 0x47, /* 0xAC-0xAF */
+	0xB8, 0x48, 0xB8, 0x49, 0xB8, 0x4A, 0xF1, 0xC1, /* 0xB0-0xB3 */
+	0xB8, 0x4B, 0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, /* 0xB4-0xB7 */
+	0xB8, 0x4F, 0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, /* 0xB8-0xBB */
+	0xB8, 0x53, 0xB8, 0x54, 0xB8, 0x55, 0xC1, 0xFE, /* 0xBC-0xBF */
+	0xB8, 0x56, 0xB8, 0x57, 0xB8, 0x58, 0xB8, 0x59, /* 0xC0-0xC3 */
+	0xB8, 0x5A, 0xB8, 0x5B, 0xB8, 0x5C, 0xB8, 0x5D, /* 0xC4-0xC7 */
+	0xB8, 0x5E, 0xB8, 0x5F, 0xB8, 0x60, 0xC1, 0xA2, /* 0xC8-0xCB */
+	0xB8, 0x61, 0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, /* 0xCC-0xCF */
+	0xB8, 0x65, 0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, /* 0xD0-0xD3 */
+	0xB8, 0x69, 0xB8, 0x6A, 0xCA, 0xFA, 0xB8, 0x6B, /* 0xD4-0xD7 */
+	0xB8, 0x6C, 0xD5, 0xBE, 0xB8, 0x6D, 0xB8, 0x6E, /* 0xD8-0xDB */
+	0xB8, 0x6F, 0xB8, 0x70, 0xBE, 0xBA, 0xBE, 0xB9, /* 0xDC-0xDF */
+	0xD5, 0xC2, 0xB8, 0x71, 0xB8, 0x72, 0xBF, 0xA2, /* 0xE0-0xE3 */
+	0xB8, 0x73, 0xCD, 0xAF, 0xF1, 0xB5, 0xB8, 0x74, /* 0xE4-0xE7 */
+	0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, 0xB8, 0x78, /* 0xE8-0xEB */
+	0xB8, 0x79, 0xBD, 0xDF, 0xB8, 0x7A, 0xB6, 0xCB, /* 0xEC-0xEF */
+	0xB8, 0x7B, 0xB8, 0x7C, 0xB8, 0x7D, 0xB8, 0x7E, /* 0xF0-0xF3 */
+	0xB8, 0x80, 0xB8, 0x81, 0xB8, 0x82, 0xB8, 0x83, /* 0xF4-0xF7 */
+	0xB8, 0x84, 0xD6, 0xF1, 0xF3, 0xC3, 0xB8, 0x85, /* 0xF8-0xFB */
+	0xB8, 0x86, 0xF3, 0xC4, 0xB8, 0x87, 0xB8, 0xCD, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7B[512] = {
+	0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, 0xF3, 0xC6, /* 0x00-0x03 */
+	0xF3, 0xC7, 0xB8, 0x8B, 0xB0, 0xCA, 0xB8, 0x8C, /* 0x04-0x07 */
+	0xF3, 0xC5, 0xB8, 0x8D, 0xF3, 0xC9, 0xCB, 0xF1, /* 0x08-0x0B */
+	0xB8, 0x8E, 0xB8, 0x8F, 0xB8, 0x90, 0xF3, 0xCB, /* 0x0C-0x0F */
+	0xB8, 0x91, 0xD0, 0xA6, 0xB8, 0x92, 0xB8, 0x93, /* 0x10-0x13 */
+	0xB1, 0xCA, 0xF3, 0xC8, 0xB8, 0x94, 0xB8, 0x95, /* 0x14-0x17 */
+	0xB8, 0x96, 0xF3, 0xCF, 0xB8, 0x97, 0xB5, 0xD1, /* 0x18-0x1B */
+	0xB8, 0x98, 0xB8, 0x99, 0xF3, 0xD7, 0xB8, 0x9A, /* 0x1C-0x1F */
+	0xF3, 0xD2, 0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, /* 0x20-0x23 */
+	0xF3, 0xD4, 0xF3, 0xD3, 0xB7, 0xFB, 0xB8, 0x9E, /* 0x24-0x27 */
+	0xB1, 0xBF, 0xB8, 0x9F, 0xF3, 0xCE, 0xF3, 0xCA, /* 0x28-0x2B */
+	0xB5, 0xDA, 0xB8, 0xA0, 0xF3, 0xD0, 0xB9, 0x40, /* 0x2C-0x2F */
+	0xB9, 0x41, 0xF3, 0xD1, 0xB9, 0x42, 0xF3, 0xD5, /* 0x30-0x33 */
+	0xB9, 0x43, 0xB9, 0x44, 0xB9, 0x45, 0xB9, 0x46, /* 0x34-0x37 */
+	0xF3, 0xCD, 0xB9, 0x47, 0xBC, 0xE3, 0xB9, 0x48, /* 0x38-0x3B */
+	0xC1, 0xFD, 0xB9, 0x49, 0xF3, 0xD6, 0xB9, 0x4A, /* 0x3C-0x3F */
+	0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x40-0x43 */
+	0xB9, 0x4F, 0xF3, 0xDA, 0xB9, 0x50, 0xF3, 0xCC, /* 0x44-0x47 */
+	0xB9, 0x51, 0xB5, 0xC8, 0xB9, 0x52, 0xBD, 0xEE, /* 0x48-0x4B */
+	0xF3, 0xDC, 0xB9, 0x53, 0xB9, 0x54, 0xB7, 0xA4, /* 0x4C-0x4F */
+	0xBF, 0xF0, 0xD6, 0xFE, 0xCD, 0xB2, 0xB9, 0x55, /* 0x50-0x53 */
+	0xB4, 0xF0, 0xB9, 0x56, 0xB2, 0xDF, 0xB9, 0x57, /* 0x54-0x57 */
+	0xF3, 0xD8, 0xB9, 0x58, 0xF3, 0xD9, 0xC9, 0xB8, /* 0x58-0x5B */
+	0xB9, 0x59, 0xF3, 0xDD, 0xB9, 0x5A, 0xB9, 0x5B, /* 0x5C-0x5F */
+	0xF3, 0xDE, 0xB9, 0x5C, 0xF3, 0xE1, 0xB9, 0x5D, /* 0x60-0x63 */
+	0xB9, 0x5E, 0xB9, 0x5F, 0xB9, 0x60, 0xB9, 0x61, /* 0x64-0x67 */
+	0xB9, 0x62, 0xB9, 0x63, 0xB9, 0x64, 0xB9, 0x65, /* 0x68-0x6B */
+	0xB9, 0x66, 0xB9, 0x67, 0xF3, 0xDF, 0xB9, 0x68, /* 0x6C-0x6F */
+	0xB9, 0x69, 0xF3, 0xE3, 0xF3, 0xE2, 0xB9, 0x6A, /* 0x70-0x73 */
+	0xB9, 0x6B, 0xF3, 0xDB, 0xB9, 0x6C, 0xBF, 0xEA, /* 0x74-0x77 */
+	0xB9, 0x6D, 0xB3, 0xEF, 0xB9, 0x6E, 0xF3, 0xE0, /* 0x78-0x7B */
+	0xB9, 0x6F, 0xB9, 0x70, 0xC7, 0xA9, 0xB9, 0x71, /* 0x7C-0x7F */
+	
+	0xBC, 0xF2, 0xB9, 0x72, 0xB9, 0x73, 0xB9, 0x74, /* 0x80-0x83 */
+	0xB9, 0x75, 0xF3, 0xEB, 0xB9, 0x76, 0xB9, 0x77, /* 0x84-0x87 */
+	0xB9, 0x78, 0xB9, 0x79, 0xB9, 0x7A, 0xB9, 0x7B, /* 0x88-0x8B */
+	0xB9, 0x7C, 0xB9, 0xBF, 0xB9, 0x7D, 0xB9, 0x7E, /* 0x8C-0x8F */
+	0xF3, 0xE4, 0xB9, 0x80, 0xB9, 0x81, 0xB9, 0x82, /* 0x90-0x93 */
+	0xB2, 0xAD, 0xBB, 0xFE, 0xB9, 0x83, 0xCB, 0xE3, /* 0x94-0x97 */
+	0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x98-0x9B */
+	0xF3, 0xED, 0xF3, 0xE9, 0xB9, 0x88, 0xB9, 0x89, /* 0x9C-0x9F */
+	0xB9, 0x8A, 0xB9, 0xDC, 0xF3, 0xEE, 0xB9, 0x8B, /* 0xA0-0xA3 */
+	0xB9, 0x8C, 0xB9, 0x8D, 0xF3, 0xE5, 0xF3, 0xE6, /* 0xA4-0xA7 */
+	0xF3, 0xEA, 0xC2, 0xE1, 0xF3, 0xEC, 0xF3, 0xEF, /* 0xA8-0xAB */
+	0xF3, 0xE8, 0xBC, 0xFD, 0xB9, 0x8E, 0xB9, 0x8F, /* 0xAC-0xAF */
+	0xB9, 0x90, 0xCF, 0xE4, 0xB9, 0x91, 0xB9, 0x92, /* 0xB0-0xB3 */
+	0xF3, 0xF0, 0xB9, 0x93, 0xB9, 0x94, 0xB9, 0x95, /* 0xB4-0xB7 */
+	0xF3, 0xE7, 0xB9, 0x96, 0xB9, 0x97, 0xB9, 0x98, /* 0xB8-0xBB */
+	0xB9, 0x99, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0xBC-0xBF */
+	0xB9, 0x9D, 0xF3, 0xF2, 0xB9, 0x9E, 0xB9, 0x9F, /* 0xC0-0xC3 */
+	0xB9, 0xA0, 0xBA, 0x40, 0xD7, 0xAD, 0xC6, 0xAA, /* 0xC4-0xC7 */
+	0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, 0xBA, 0x44, /* 0xC8-0xCB */
+	0xF3, 0xF3, 0xBA, 0x45, 0xBA, 0x46, 0xBA, 0x47, /* 0xCC-0xCF */
+	0xBA, 0x48, 0xF3, 0xF1, 0xBA, 0x49, 0xC2, 0xA8, /* 0xD0-0xD3 */
+	0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, 0xBA, 0x4D, /* 0xD4-0xD7 */
+	0xBA, 0x4E, 0xB8, 0xDD, 0xF3, 0xF5, 0xBA, 0x4F, /* 0xD8-0xDB */
+	0xBA, 0x50, 0xF3, 0xF4, 0xBA, 0x51, 0xBA, 0x52, /* 0xDC-0xDF */
+	0xBA, 0x53, 0xB4, 0xDB, 0xBA, 0x54, 0xBA, 0x55, /* 0xE0-0xE3 */
+	0xBA, 0x56, 0xF3, 0xF6, 0xF3, 0xF7, 0xBA, 0x57, /* 0xE4-0xE7 */
+	0xBA, 0x58, 0xBA, 0x59, 0xF3, 0xF8, 0xBA, 0x5A, /* 0xE8-0xEB */
+	0xBA, 0x5B, 0xBA, 0x5C, 0xC0, 0xBA, 0xBA, 0x5D, /* 0xEC-0xEF */
+	0xBA, 0x5E, 0xC0, 0xE9, 0xBA, 0x5F, 0xBA, 0x60, /* 0xF0-0xF3 */
+	0xBA, 0x61, 0xBA, 0x62, 0xBA, 0x63, 0xC5, 0xF1, /* 0xF4-0xF7 */
+	0xBA, 0x64, 0xBA, 0x65, 0xBA, 0x66, 0xBA, 0x67, /* 0xF8-0xFB */
+	0xF3, 0xFB, 0xBA, 0x68, 0xF3, 0xFA, 0xBA, 0x69, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7C[512] = {
+	0xBA, 0x6A, 0xBA, 0x6B, 0xBA, 0x6C, 0xBA, 0x6D, /* 0x00-0x03 */
+	0xBA, 0x6E, 0xBA, 0x6F, 0xBA, 0x70, 0xB4, 0xD8, /* 0x04-0x07 */
+	0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, 0xF3, 0xFE, /* 0x08-0x0B */
+	0xF3, 0xF9, 0xBA, 0x74, 0xBA, 0x75, 0xF3, 0xFC, /* 0x0C-0x0F */
+	0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, 0xBA, 0x79, /* 0x10-0x13 */
+	0xBA, 0x7A, 0xBA, 0x7B, 0xF3, 0xFD, 0xBA, 0x7C, /* 0x14-0x17 */
+	0xBA, 0x7D, 0xBA, 0x7E, 0xBA, 0x80, 0xBA, 0x81, /* 0x18-0x1B */
+	0xBA, 0x82, 0xBA, 0x83, 0xBA, 0x84, 0xF4, 0xA1, /* 0x1C-0x1F */
+	0xBA, 0x85, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0x20-0x23 */
+	0xBA, 0x89, 0xBA, 0x8A, 0xF4, 0xA3, 0xBB, 0xC9, /* 0x24-0x27 */
+	0xBA, 0x8B, 0xBA, 0x8C, 0xF4, 0xA2, 0xBA, 0x8D, /* 0x28-0x2B */
+	0xBA, 0x8E, 0xBA, 0x8F, 0xBA, 0x90, 0xBA, 0x91, /* 0x2C-0x2F */
+	0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0x30-0x33 */
+	0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0x34-0x37 */
+	0xF4, 0xA4, 0xBA, 0x9A, 0xBA, 0x9B, 0xBA, 0x9C, /* 0x38-0x3B */
+	0xBA, 0x9D, 0xBA, 0x9E, 0xBA, 0x9F, 0xB2, 0xBE, /* 0x3C-0x3F */
+	0xF4, 0xA6, 0xF4, 0xA5, 0xBA, 0xA0, 0xBB, 0x40, /* 0x40-0x43 */
+	0xBB, 0x41, 0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, /* 0x44-0x47 */
+	0xBB, 0x45, 0xBB, 0x46, 0xBB, 0x47, 0xBB, 0x48, /* 0x48-0x4B */
+	0xBB, 0x49, 0xBC, 0xAE, 0xBB, 0x4A, 0xBB, 0x4B, /* 0x4C-0x4F */
+	0xBB, 0x4C, 0xBB, 0x4D, 0xBB, 0x4E, 0xBB, 0x4F, /* 0x50-0x53 */
+	0xBB, 0x50, 0xBB, 0x51, 0xBB, 0x52, 0xBB, 0x53, /* 0x54-0x57 */
+	0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x58-0x5B */
+	0xBB, 0x58, 0xBB, 0x59, 0xBB, 0x5A, 0xBB, 0x5B, /* 0x5C-0x5F */
+	0xBB, 0x5C, 0xBB, 0x5D, 0xBB, 0x5E, 0xBB, 0x5F, /* 0x60-0x63 */
+	0xBB, 0x60, 0xBB, 0x61, 0xBB, 0x62, 0xBB, 0x63, /* 0x64-0x67 */
+	0xBB, 0x64, 0xBB, 0x65, 0xBB, 0x66, 0xBB, 0x67, /* 0x68-0x6B */
+	0xBB, 0x68, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x6C-0x6F */
+	0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xC3, 0xD7, /* 0x70-0x73 */
+	0xD9, 0xE1, 0xBB, 0x6F, 0xBB, 0x70, 0xBB, 0x71, /* 0x74-0x77 */
+	0xBB, 0x72, 0xBB, 0x73, 0xBB, 0x74, 0xC0, 0xE0, /* 0x78-0x7B */
+	0xF4, 0xCC, 0xD7, 0xD1, 0xBB, 0x75, 0xBB, 0x76, /* 0x7C-0x7F */
+	
+	0xBB, 0x77, 0xBB, 0x78, 0xBB, 0x79, 0xBB, 0x7A, /* 0x80-0x83 */
+	0xBB, 0x7B, 0xBB, 0x7C, 0xBB, 0x7D, 0xBB, 0x7E, /* 0x84-0x87 */
+	0xBB, 0x80, 0xB7, 0xDB, 0xBB, 0x81, 0xBB, 0x82, /* 0x88-0x8B */
+	0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x8C-0x8F */
+	0xBB, 0x87, 0xF4, 0xCE, 0xC1, 0xA3, 0xBB, 0x88, /* 0x90-0x93 */
+	0xBB, 0x89, 0xC6, 0xC9, 0xBB, 0x8A, 0xB4, 0xD6, /* 0x94-0x97 */
+	0xD5, 0xB3, 0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, /* 0x98-0x9B */
+	0xF4, 0xD0, 0xF4, 0xCF, 0xF4, 0xD1, 0xCB, 0xDA, /* 0x9C-0x9F */
+	0xBB, 0x8E, 0xBB, 0x8F, 0xF4, 0xD2, 0xBB, 0x90, /* 0xA0-0xA3 */
+	0xD4, 0xC1, 0xD6, 0xE0, 0xBB, 0x91, 0xBB, 0x92, /* 0xA4-0xA7 */
+	0xBB, 0x93, 0xBB, 0x94, 0xB7, 0xE0, 0xBB, 0x95, /* 0xA8-0xAB */
+	0xBB, 0x96, 0xBB, 0x97, 0xC1, 0xB8, 0xBB, 0x98, /* 0xAC-0xAF */
+	0xBB, 0x99, 0xC1, 0xBB, 0xF4, 0xD3, 0xBE, 0xAC, /* 0xB0-0xB3 */
+	0xBB, 0x9A, 0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, /* 0xB4-0xB7 */
+	0xBB, 0x9E, 0xB4, 0xE2, 0xBB, 0x9F, 0xBB, 0xA0, /* 0xB8-0xBB */
+	0xF4, 0xD4, 0xF4, 0xD5, 0xBE, 0xAB, 0xBC, 0x40, /* 0xBC-0xBF */
+	0xBC, 0x41, 0xF4, 0xD6, 0xBC, 0x42, 0xBC, 0x43, /* 0xC0-0xC3 */
+	0xBC, 0x44, 0xF4, 0xDB, 0xBC, 0x45, 0xF4, 0xD7, /* 0xC4-0xC7 */
+	0xF4, 0xDA, 0xBC, 0x46, 0xBA, 0xFD, 0xBC, 0x47, /* 0xC8-0xCB */
+	0xF4, 0xD8, 0xF4, 0xD9, 0xBC, 0x48, 0xBC, 0x49, /* 0xCC-0xCF */
+	0xBC, 0x4A, 0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, /* 0xD0-0xD3 */
+	0xBC, 0x4E, 0xB8, 0xE2, 0xCC, 0xC7, 0xF4, 0xDC, /* 0xD4-0xD7 */
+	0xBC, 0x4F, 0xB2, 0xDA, 0xBC, 0x50, 0xBC, 0x51, /* 0xD8-0xDB */
+	0xC3, 0xD3, 0xBC, 0x52, 0xBC, 0x53, 0xD4, 0xE3, /* 0xDC-0xDF */
+	0xBF, 0xB7, 0xBC, 0x54, 0xBC, 0x55, 0xBC, 0x56, /* 0xE0-0xE3 */
+	0xBC, 0x57, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0xE4-0xE7 */
+	0xF4, 0xDD, 0xBC, 0x5B, 0xBC, 0x5C, 0xBC, 0x5D, /* 0xE8-0xEB */
+	0xBC, 0x5E, 0xBC, 0x5F, 0xBC, 0x60, 0xC5, 0xB4, /* 0xEC-0xEF */
+	0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0xF0-0xF3 */
+	0xBC, 0x65, 0xBC, 0x66, 0xBC, 0x67, 0xBC, 0x68, /* 0xF4-0xF7 */
+	0xF4, 0xE9, 0xBC, 0x69, 0xBC, 0x6A, 0xCF, 0xB5, /* 0xF8-0xFB */
+	0xBC, 0x6B, 0xBC, 0x6C, 0xBC, 0x6D, 0xBC, 0x6E, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7D[512] = {
+	0xBC, 0x6F, 0xBC, 0x70, 0xBC, 0x71, 0xBC, 0x72, /* 0x00-0x03 */
+	0xBC, 0x73, 0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, /* 0x04-0x07 */
+	0xBC, 0x77, 0xBC, 0x78, 0xCE, 0xC9, 0xBC, 0x79, /* 0x08-0x0B */
+	0xBC, 0x7A, 0xBC, 0x7B, 0xBC, 0x7C, 0xBC, 0x7D, /* 0x0C-0x0F */
+	0xBC, 0x7E, 0xBC, 0x80, 0xBC, 0x81, 0xBC, 0x82, /* 0x10-0x13 */
+	0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, 0xBC, 0x86, /* 0x14-0x17 */
+	0xBC, 0x87, 0xBC, 0x88, 0xBC, 0x89, 0xBC, 0x8A, /* 0x18-0x1B */
+	0xBC, 0x8B, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0x1C-0x1F */
+	0xCB, 0xD8, 0xBC, 0x8F, 0xCB, 0xF7, 0xBC, 0x90, /* 0x20-0x23 */
+	0xBC, 0x91, 0xBC, 0x92, 0xBC, 0x93, 0xBD, 0xF4, /* 0x24-0x27 */
+	0xBC, 0x94, 0xBC, 0x95, 0xBC, 0x96, 0xD7, 0xCF, /* 0x28-0x2B */
+	0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xC0, 0xDB, /* 0x2C-0x2F */
+	0xBC, 0x9A, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0x30-0x33 */
+	0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x40, /* 0x34-0x37 */
+	0xBD, 0x41, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0x38-0x3B */
+	0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0x3C-0x3F */
+	0xBD, 0x49, 0xBD, 0x4A, 0xBD, 0x4B, 0xBD, 0x4C, /* 0x40-0x43 */
+	0xBD, 0x4D, 0xBD, 0x4E, 0xBD, 0x4F, 0xBD, 0x50, /* 0x44-0x47 */
+	0xBD, 0x51, 0xBD, 0x52, 0xBD, 0x53, 0xBD, 0x54, /* 0x48-0x4B */
+	0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, 0xBD, 0x58, /* 0x4C-0x4F */
+	0xBD, 0x59, 0xBD, 0x5A, 0xBD, 0x5B, 0xBD, 0x5C, /* 0x50-0x53 */
+	0xBD, 0x5D, 0xBD, 0x5E, 0xBD, 0x5F, 0xBD, 0x60, /* 0x54-0x57 */
+	0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0x58-0x5B */
+	0xBD, 0x65, 0xBD, 0x66, 0xBD, 0x67, 0xBD, 0x68, /* 0x5C-0x5F */
+	0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x60-0x63 */
+	0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, 0xBD, 0x70, /* 0x64-0x67 */
+	0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, 0xBD, 0x74, /* 0x68-0x6B */
+	0xBD, 0x75, 0xBD, 0x76, 0xD0, 0xF5, 0xBD, 0x77, /* 0x6C-0x6F */
+	0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x7B, /* 0x70-0x73 */
+	0xBD, 0x7C, 0xBD, 0x7D, 0xBD, 0x7E, 0xF4, 0xEA, /* 0x74-0x77 */
+	0xBD, 0x80, 0xBD, 0x81, 0xBD, 0x82, 0xBD, 0x83, /* 0x78-0x7B */
+	0xBD, 0x84, 0xBD, 0x85, 0xBD, 0x86, 0xBD, 0x87, /* 0x7C-0x7F */
+	
+	0xBD, 0x88, 0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, /* 0x80-0x83 */
+	0xBD, 0x8C, 0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, /* 0x84-0x87 */
+	0xBD, 0x90, 0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, /* 0x88-0x8B */
+	0xBD, 0x94, 0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, /* 0x8C-0x8F */
+	0xBD, 0x98, 0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, /* 0x90-0x93 */
+	0xBD, 0x9C, 0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, /* 0x94-0x97 */
+	0xBD, 0xA0, 0xBE, 0x40, 0xBE, 0x41, 0xBE, 0x42, /* 0x98-0x9B */
+	0xBE, 0x43, 0xBE, 0x44, 0xBE, 0x45, 0xBE, 0x46, /* 0x9C-0x9F */
+	0xBE, 0x47, 0xBE, 0x48, 0xBE, 0x49, 0xBE, 0x4A, /* 0xA0-0xA3 */
+	0xBE, 0x4B, 0xBE, 0x4C, 0xF4, 0xEB, 0xBE, 0x4D, /* 0xA4-0xA7 */
+	0xBE, 0x4E, 0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, /* 0xA8-0xAB */
+	0xBE, 0x52, 0xBE, 0x53, 0xF4, 0xEC, 0xBE, 0x54, /* 0xAC-0xAF */
+	0xBE, 0x55, 0xBE, 0x56, 0xBE, 0x57, 0xBE, 0x58, /* 0xB0-0xB3 */
+	0xBE, 0x59, 0xBE, 0x5A, 0xBE, 0x5B, 0xBE, 0x5C, /* 0xB4-0xB7 */
+	0xBE, 0x5D, 0xBE, 0x5E, 0xBE, 0x5F, 0xBE, 0x60, /* 0xB8-0xBB */
+	0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0xBC-0xBF */
+	0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0xC0-0xC3 */
+	0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, 0xBE, 0x6C, /* 0xC4-0xC7 */
+	0xBE, 0x6D, 0xBE, 0x6E, 0xBE, 0x6F, 0xBE, 0x70, /* 0xC8-0xCB */
+	0xBE, 0x71, 0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, /* 0xCC-0xCF */
+	0xBE, 0x75, 0xBE, 0x76, 0xBE, 0x77, 0xBE, 0x78, /* 0xD0-0xD3 */
+	0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x7B, 0xBE, 0x7C, /* 0xD4-0xD7 */
+	0xBE, 0x7D, 0xBE, 0x7E, 0xBE, 0x80, 0xBE, 0x81, /* 0xD8-0xDB */
+	0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0xDC-0xDF */
+	0xBE, 0x86, 0xBE, 0x87, 0xBE, 0x88, 0xBE, 0x89, /* 0xE0-0xE3 */
+	0xBE, 0x8A, 0xBE, 0x8B, 0xBE, 0x8C, 0xBE, 0x8D, /* 0xE4-0xE7 */
+	0xBE, 0x8E, 0xBE, 0x8F, 0xBE, 0x90, 0xBE, 0x91, /* 0xE8-0xEB */
+	0xBE, 0x92, 0xBE, 0x93, 0xBE, 0x94, 0xBE, 0x95, /* 0xEC-0xEF */
+	0xBE, 0x96, 0xBE, 0x97, 0xBE, 0x98, 0xBE, 0x99, /* 0xF0-0xF3 */
+	0xBE, 0x9A, 0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, /* 0xF4-0xF7 */
+	0xBE, 0x9E, 0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x40, /* 0xF8-0xFB */
+	0xBF, 0x41, 0xBF, 0x42, 0xBF, 0x43, 0xBF, 0x44, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7E[512] = {
+	0xBF, 0x45, 0xBF, 0x46, 0xBF, 0x47, 0xBF, 0x48, /* 0x00-0x03 */
+	0xBF, 0x49, 0xBF, 0x4A, 0xBF, 0x4B, 0xBF, 0x4C, /* 0x04-0x07 */
+	0xBF, 0x4D, 0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, /* 0x08-0x0B */
+	0xBF, 0x51, 0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, /* 0x0C-0x0F */
+	0xBF, 0x55, 0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, /* 0x10-0x13 */
+	0xBF, 0x59, 0xBF, 0x5A, 0xBF, 0x5B, 0xBF, 0x5C, /* 0x14-0x17 */
+	0xBF, 0x5D, 0xBF, 0x5E, 0xBF, 0x5F, 0xBF, 0x60, /* 0x18-0x1B */
+	0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, 0xBF, 0x64, /* 0x1C-0x1F */
+	0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, 0xBF, 0x68, /* 0x20-0x23 */
+	0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, 0xBF, 0x6C, /* 0x24-0x27 */
+	0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, 0xBF, 0x70, /* 0x28-0x2B */
+	0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, 0xBF, 0x74, /* 0x2C-0x2F */
+	0xBF, 0x75, 0xBF, 0x76, 0xBF, 0x77, 0xBF, 0x78, /* 0x30-0x33 */
+	0xBF, 0x79, 0xBF, 0x7A, 0xBF, 0x7B, 0xBF, 0x7C, /* 0x34-0x37 */
+	0xBF, 0x7D, 0xBF, 0x7E, 0xBF, 0x80, 0xF7, 0xE3, /* 0x38-0x3B */
+	0xBF, 0x81, 0xBF, 0x82, 0xBF, 0x83, 0xBF, 0x84, /* 0x3C-0x3F */
+	0xBF, 0x85, 0xB7, 0xB1, 0xBF, 0x86, 0xBF, 0x87, /* 0x40-0x43 */
+	0xBF, 0x88, 0xBF, 0x89, 0xBF, 0x8A, 0xF4, 0xED, /* 0x44-0x47 */
+	0xBF, 0x8B, 0xBF, 0x8C, 0xBF, 0x8D, 0xBF, 0x8E, /* 0x48-0x4B */
+	0xBF, 0x8F, 0xBF, 0x90, 0xBF, 0x91, 0xBF, 0x92, /* 0x4C-0x4F */
+	0xBF, 0x93, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0x50-0x53 */
+	0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, 0xBF, 0x9A, /* 0x54-0x57 */
+	0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, 0xBF, 0x9E, /* 0x58-0x5B */
+	0xBF, 0x9F, 0xBF, 0xA0, 0xC0, 0x40, 0xC0, 0x41, /* 0x5C-0x5F */
+	0xC0, 0x42, 0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, /* 0x60-0x63 */
+	0xC0, 0x46, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x64-0x67 */
+	0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, 0xC0, 0x4D, /* 0x68-0x6B */
+	0xC0, 0x4E, 0xC0, 0x4F, 0xC0, 0x50, 0xC0, 0x51, /* 0x6C-0x6F */
+	0xC0, 0x52, 0xC0, 0x53, 0xC0, 0x54, 0xC0, 0x55, /* 0x70-0x73 */
+	0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, 0xC0, 0x59, /* 0x74-0x77 */
+	0xC0, 0x5A, 0xC0, 0x5B, 0xC0, 0x5C, 0xC0, 0x5D, /* 0x78-0x7B */
+	0xC0, 0x5E, 0xC0, 0x5F, 0xC0, 0x60, 0xC0, 0x61, /* 0x7C-0x7F */
+	
+	0xC0, 0x62, 0xC0, 0x63, 0xD7, 0xEB, 0xC0, 0x64, /* 0x80-0x83 */
+	0xC0, 0x65, 0xC0, 0x66, 0xC0, 0x67, 0xC0, 0x68, /* 0x84-0x87 */
+	0xC0, 0x69, 0xC0, 0x6A, 0xC0, 0x6B, 0xC0, 0x6C, /* 0x88-0x8B */
+	0xC0, 0x6D, 0xC0, 0x6E, 0xC0, 0x6F, 0xC0, 0x70, /* 0x8C-0x8F */
+	0xC0, 0x71, 0xC0, 0x72, 0xC0, 0x73, 0xC0, 0x74, /* 0x90-0x93 */
+	0xC0, 0x75, 0xC0, 0x76, 0xC0, 0x77, 0xC0, 0x78, /* 0x94-0x97 */
+	0xC0, 0x79, 0xC0, 0x7A, 0xC0, 0x7B, 0xF4, 0xEE, /* 0x98-0x9B */
+	0xC0, 0x7C, 0xC0, 0x7D, 0xC0, 0x7E, 0xE6, 0xF9, /* 0x9C-0x9F */
+	0xBE, 0xC0, 0xE6, 0xFA, 0xBA, 0xEC, 0xE6, 0xFB, /* 0xA0-0xA3 */
+	0xCF, 0xCB, 0xE6, 0xFC, 0xD4, 0xBC, 0xBC, 0xB6, /* 0xA4-0xA7 */
+	0xE6, 0xFD, 0xE6, 0xFE, 0xBC, 0xCD, 0xC8, 0xD2, /* 0xA8-0xAB */
+	0xCE, 0xB3, 0xE7, 0xA1, 0xC0, 0x80, 0xB4, 0xBF, /* 0xAC-0xAF */
+	0xE7, 0xA2, 0xC9, 0xB4, 0xB8, 0xD9, 0xC4, 0xC9, /* 0xB0-0xB3 */
+	0xC0, 0x81, 0xD7, 0xDD, 0xC2, 0xDA, 0xB7, 0xD7, /* 0xB4-0xB7 */
+	0xD6, 0xBD, 0xCE, 0xC6, 0xB7, 0xC4, 0xC0, 0x82, /* 0xB8-0xBB */
+	0xC0, 0x83, 0xC5, 0xA6, 0xE7, 0xA3, 0xCF, 0xDF, /* 0xBC-0xBF */
+	0xE7, 0xA4, 0xE7, 0xA5, 0xE7, 0xA6, 0xC1, 0xB7, /* 0xC0-0xC3 */
+	0xD7, 0xE9, 0xC9, 0xF0, 0xCF, 0xB8, 0xD6, 0xAF, /* 0xC4-0xC7 */
+	0xD6, 0xD5, 0xE7, 0xA7, 0xB0, 0xED, 0xE7, 0xA8, /* 0xC8-0xCB */
+	0xE7, 0xA9, 0xC9, 0xDC, 0xD2, 0xEF, 0xBE, 0xAD, /* 0xCC-0xCF */
+	0xE7, 0xAA, 0xB0, 0xF3, 0xC8, 0xDE, 0xBD, 0xE1, /* 0xD0-0xD3 */
+	0xE7, 0xAB, 0xC8, 0xC6, 0xC0, 0x84, 0xE7, 0xAC, /* 0xD4-0xD7 */
+	0xBB, 0xE6, 0xB8, 0xF8, 0xD1, 0xA4, 0xE7, 0xAD, /* 0xD8-0xDB */
+	0xC2, 0xE7, 0xBE, 0xF8, 0xBD, 0xCA, 0xCD, 0xB3, /* 0xDC-0xDF */
+	0xE7, 0xAE, 0xE7, 0xAF, 0xBE, 0xEE, 0xD0, 0xE5, /* 0xE0-0xE3 */
+	0xC0, 0x85, 0xCB, 0xE7, 0xCC, 0xD0, 0xBC, 0xCC, /* 0xE4-0xE7 */
+	0xE7, 0xB0, 0xBC, 0xA8, 0xD0, 0xF7, 0xE7, 0xB1, /* 0xE8-0xEB */
+	0xC0, 0x86, 0xD0, 0xF8, 0xE7, 0xB2, 0xE7, 0xB3, /* 0xEC-0xEF */
+	0xB4, 0xC2, 0xE7, 0xB4, 0xE7, 0xB5, 0xC9, 0xFE, /* 0xF0-0xF3 */
+	0xCE, 0xAC, 0xC3, 0xE0, 0xE7, 0xB7, 0xB1, 0xC1, /* 0xF4-0xF7 */
+	0xB3, 0xF1, 0xC0, 0x87, 0xE7, 0xB8, 0xE7, 0xB9, /* 0xF8-0xFB */
+	0xD7, 0xDB, 0xD5, 0xC0, 0xE7, 0xBA, 0xC2, 0xCC, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7F[512] = {
+	0xD7, 0xBA, 0xE7, 0xBB, 0xE7, 0xBC, 0xE7, 0xBD, /* 0x00-0x03 */
+	0xBC, 0xEA, 0xC3, 0xE5, 0xC0, 0xC2, 0xE7, 0xBE, /* 0x04-0x07 */
+	0xE7, 0xBF, 0xBC, 0xA9, 0xC0, 0x88, 0xE7, 0xC0, /* 0x08-0x0B */
+	0xE7, 0xC1, 0xE7, 0xB6, 0xB6, 0xD0, 0xE7, 0xC2, /* 0x0C-0x0F */
+	0xC0, 0x89, 0xE7, 0xC3, 0xE7, 0xC4, 0xBB, 0xBA, /* 0x10-0x13 */
+	0xB5, 0xDE, 0xC2, 0xC6, 0xB1, 0xE0, 0xE7, 0xC5, /* 0x14-0x17 */
+	0xD4, 0xB5, 0xE7, 0xC6, 0xB8, 0xBF, 0xE7, 0xC8, /* 0x18-0x1B */
+	0xE7, 0xC7, 0xB7, 0xEC, 0xC0, 0x8A, 0xE7, 0xC9, /* 0x1C-0x1F */
+	0xB2, 0xF8, 0xE7, 0xCA, 0xE7, 0xCB, 0xE7, 0xCC, /* 0x20-0x23 */
+	0xE7, 0xCD, 0xE7, 0xCE, 0xE7, 0xCF, 0xE7, 0xD0, /* 0x24-0x27 */
+	0xD3, 0xA7, 0xCB, 0xF5, 0xE7, 0xD1, 0xE7, 0xD2, /* 0x28-0x2B */
+	0xE7, 0xD3, 0xE7, 0xD4, 0xC9, 0xC9, 0xE7, 0xD5, /* 0x2C-0x2F */
+	0xE7, 0xD6, 0xE7, 0xD7, 0xE7, 0xD8, 0xE7, 0xD9, /* 0x30-0x33 */
+	0xBD, 0xC9, 0xE7, 0xDA, 0xF3, 0xBE, 0xC0, 0x8B, /* 0x34-0x37 */
+	0xB8, 0xD7, 0xC0, 0x8C, 0xC8, 0xB1, 0xC0, 0x8D, /* 0x38-0x3B */
+	0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, 0xC0, 0x91, /* 0x3C-0x3F */
+	0xC0, 0x92, 0xC0, 0x93, 0xF3, 0xBF, 0xC0, 0x94, /* 0x40-0x43 */
+	0xF3, 0xC0, 0xF3, 0xC1, 0xC0, 0x95, 0xC0, 0x96, /* 0x44-0x47 */
+	0xC0, 0x97, 0xC0, 0x98, 0xC0, 0x99, 0xC0, 0x9A, /* 0x48-0x4B */
+	0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, 0xC0, 0x9E, /* 0x4C-0x4F */
+	0xB9, 0xDE, 0xCD, 0xF8, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x50-0x53 */
+	0xD8, 0xE8, 0xBA, 0xB1, 0xC1, 0x40, 0xC2, 0xDE, /* 0x54-0x57 */
+	0xEE, 0xB7, 0xC1, 0x41, 0xB7, 0xA3, 0xC1, 0x42, /* 0x58-0x5B */
+	0xC1, 0x43, 0xC1, 0x44, 0xC1, 0x45, 0xEE, 0xB9, /* 0x5C-0x5F */
+	0xC1, 0x46, 0xEE, 0xB8, 0xB0, 0xD5, 0xC1, 0x47, /* 0x60-0x63 */
+	0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x64-0x67 */
+	0xEE, 0xBB, 0xD5, 0xD6, 0xD7, 0xEF, 0xC1, 0x4C, /* 0x68-0x6B */
+	0xC1, 0x4D, 0xC1, 0x4E, 0xD6, 0xC3, 0xC1, 0x4F, /* 0x6C-0x6F */
+	0xC1, 0x50, 0xEE, 0xBD, 0xCA, 0xF0, 0xC1, 0x51, /* 0x70-0x73 */
+	0xEE, 0xBC, 0xC1, 0x52, 0xC1, 0x53, 0xC1, 0x54, /* 0x74-0x77 */
+	0xC1, 0x55, 0xEE, 0xBE, 0xC1, 0x56, 0xC1, 0x57, /* 0x78-0x7B */
+	0xC1, 0x58, 0xC1, 0x59, 0xEE, 0xC0, 0xC1, 0x5A, /* 0x7C-0x7F */
+	
+	0xC1, 0x5B, 0xEE, 0xBF, 0xC1, 0x5C, 0xC1, 0x5D, /* 0x80-0x83 */
+	0xC1, 0x5E, 0xC1, 0x5F, 0xC1, 0x60, 0xC1, 0x61, /* 0x84-0x87 */
+	0xC1, 0x62, 0xC1, 0x63, 0xD1, 0xF2, 0xC1, 0x64, /* 0x88-0x8B */
+	0xC7, 0xBC, 0xC1, 0x65, 0xC3, 0xC0, 0xC1, 0x66, /* 0x8C-0x8F */
+	0xC1, 0x67, 0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, /* 0x90-0x93 */
+	0xB8, 0xE1, 0xC1, 0x6B, 0xC1, 0x6C, 0xC1, 0x6D, /* 0x94-0x97 */
+	0xC1, 0x6E, 0xC1, 0x6F, 0xC1, 0xE7, 0xC1, 0x70, /* 0x98-0x9B */
+	0xC1, 0x71, 0xF4, 0xC6, 0xD0, 0xDF, 0xF4, 0xC7, /* 0x9C-0x9F */
+	0xC1, 0x72, 0xCF, 0xDB, 0xC1, 0x73, 0xC1, 0x74, /* 0xA0-0xA3 */
+	0xC8, 0xBA, 0xC1, 0x75, 0xC1, 0x76, 0xF4, 0xC8, /* 0xA4-0xA7 */
+	0xC1, 0x77, 0xC1, 0x78, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA8-0xAB */
+	0xC1, 0x7B, 0xC1, 0x7C, 0xC1, 0x7D, 0xF4, 0xC9, /* 0xAC-0xAF */
+	0xF4, 0xCA, 0xC1, 0x7E, 0xF4, 0xCB, 0xC1, 0x80, /* 0xB0-0xB3 */
+	0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xB4-0xB7 */
+	0xD9, 0xFA, 0xB8, 0xFE, 0xC1, 0x85, 0xC1, 0x86, /* 0xB8-0xBB */
+	0xE5, 0xF1, 0xD3, 0xF0, 0xC1, 0x87, 0xF4, 0xE0, /* 0xBC-0xBF */
+	0xC1, 0x88, 0xCE, 0xCC, 0xC1, 0x89, 0xC1, 0x8A, /* 0xC0-0xC3 */
+	0xC1, 0x8B, 0xB3, 0xE1, 0xC1, 0x8C, 0xC1, 0x8D, /* 0xC4-0xC7 */
+	0xC1, 0x8E, 0xC1, 0x8F, 0xF1, 0xB4, 0xC1, 0x90, /* 0xC8-0xCB */
+	0xD2, 0xEE, 0xC1, 0x91, 0xF4, 0xE1, 0xC1, 0x92, /* 0xCC-0xCF */
+	0xC1, 0x93, 0xC1, 0x94, 0xC1, 0x95, 0xC1, 0x96, /* 0xD0-0xD3 */
+	0xCF, 0xE8, 0xF4, 0xE2, 0xC1, 0x97, 0xC1, 0x98, /* 0xD4-0xD7 */
+	0xC7, 0xCC, 0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, /* 0xD8-0xDB */
+	0xC1, 0x9C, 0xC1, 0x9D, 0xC1, 0x9E, 0xB5, 0xD4, /* 0xDC-0xDF */
+	0xB4, 0xE4, 0xF4, 0xE4, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xE0-0xE3 */
+	0xC2, 0x40, 0xF4, 0xE3, 0xF4, 0xE5, 0xC2, 0x41, /* 0xE4-0xE7 */
+	0xC2, 0x42, 0xF4, 0xE6, 0xC2, 0x43, 0xC2, 0x44, /* 0xE8-0xEB */
+	0xC2, 0x45, 0xC2, 0x46, 0xF4, 0xE7, 0xC2, 0x47, /* 0xEC-0xEF */
+	0xBA, 0xB2, 0xB0, 0xBF, 0xC2, 0x48, 0xF4, 0xE8, /* 0xF0-0xF3 */
+	0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x4C, /* 0xF4-0xF7 */
+	0xC2, 0x4D, 0xC2, 0x4E, 0xC2, 0x4F, 0xB7, 0xAD, /* 0xF8-0xFB */
+	0xD2, 0xED, 0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_80[512] = {
+	0xD2, 0xAB, 0xC0, 0xCF, 0xC2, 0x53, 0xBF, 0xBC, /* 0x00-0x03 */
+	0xEB, 0xA3, 0xD5, 0xDF, 0xEA, 0xC8, 0xC2, 0x54, /* 0x04-0x07 */
+	0xC2, 0x55, 0xC2, 0x56, 0xC2, 0x57, 0xF1, 0xF3, /* 0x08-0x0B */
+	0xB6, 0xF8, 0xCB, 0xA3, 0xC2, 0x58, 0xC2, 0x59, /* 0x0C-0x0F */
+	0xC4, 0xCD, 0xC2, 0x5A, 0xF1, 0xE7, 0xC2, 0x5B, /* 0x10-0x13 */
+	0xF1, 0xE8, 0xB8, 0xFB, 0xF1, 0xE9, 0xBA, 0xC4, /* 0x14-0x17 */
+	0xD4, 0xC5, 0xB0, 0xD2, 0xC2, 0x5C, 0xC2, 0x5D, /* 0x18-0x1B */
+	0xF1, 0xEA, 0xC2, 0x5E, 0xC2, 0x5F, 0xC2, 0x60, /* 0x1C-0x1F */
+	0xF1, 0xEB, 0xC2, 0x61, 0xF1, 0xEC, 0xC2, 0x62, /* 0x20-0x23 */
+	0xC2, 0x63, 0xF1, 0xED, 0xF1, 0xEE, 0xF1, 0xEF, /* 0x24-0x27 */
+	0xF1, 0xF1, 0xF1, 0xF0, 0xC5, 0xD5, 0xC2, 0x64, /* 0x28-0x2B */
+	0xC2, 0x65, 0xC2, 0x66, 0xC2, 0x67, 0xC2, 0x68, /* 0x2C-0x2F */
+	0xC2, 0x69, 0xF1, 0xF2, 0xC2, 0x6A, 0xB6, 0xFA, /* 0x30-0x33 */
+	0xC2, 0x6B, 0xF1, 0xF4, 0xD2, 0xAE, 0xDE, 0xC7, /* 0x34-0x37 */
+	0xCB, 0xCA, 0xC2, 0x6C, 0xC2, 0x6D, 0xB3, 0xDC, /* 0x38-0x3B */
+	0xC2, 0x6E, 0xB5, 0xA2, 0xC2, 0x6F, 0xB9, 0xA2, /* 0x3C-0x3F */
+	0xC2, 0x70, 0xC2, 0x71, 0xC4, 0xF4, 0xF1, 0xF5, /* 0x40-0x43 */
+	0xC2, 0x72, 0xC2, 0x73, 0xF1, 0xF6, 0xC2, 0x74, /* 0x44-0x47 */
+	0xC2, 0x75, 0xC2, 0x76, 0xC1, 0xC4, 0xC1, 0xFB, /* 0x48-0x4B */
+	0xD6, 0xB0, 0xF1, 0xF7, 0xC2, 0x77, 0xC2, 0x78, /* 0x4C-0x4F */
+	0xC2, 0x79, 0xC2, 0x7A, 0xF1, 0xF8, 0xC2, 0x7B, /* 0x50-0x53 */
+	0xC1, 0xAA, 0xC2, 0x7C, 0xC2, 0x7D, 0xC2, 0x7E, /* 0x54-0x57 */
+	0xC6, 0xB8, 0xC2, 0x80, 0xBE, 0xDB, 0xC2, 0x81, /* 0x58-0x5B */
+	0xC2, 0x82, 0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, /* 0x5C-0x5F */
+	0xC2, 0x86, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x60-0x63 */
+	0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, 0xC2, 0x8D, /* 0x64-0x67 */
+	0xC2, 0x8E, 0xF1, 0xF9, 0xB4, 0xCF, 0xC2, 0x8F, /* 0x68-0x6B */
+	0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x6C-0x6F */
+	0xC2, 0x94, 0xF1, 0xFA, 0xC2, 0x95, 0xC2, 0x96, /* 0x70-0x73 */
+	0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x74-0x77 */
+	0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x78-0x7B */
+	0xC2, 0x9F, 0xC2, 0xA0, 0xC3, 0x40, 0xED, 0xB2, /* 0x7C-0x7F */
+	
+	0xED, 0xB1, 0xC3, 0x41, 0xC3, 0x42, 0xCB, 0xE0, /* 0x80-0x83 */
+	0xD2, 0xDE, 0xC3, 0x43, 0xCB, 0xC1, 0xD5, 0xD8, /* 0x84-0x87 */
+	0xC3, 0x44, 0xC8, 0xE2, 0xC3, 0x45, 0xC0, 0xDF, /* 0x88-0x8B */
+	0xBC, 0xA1, 0xC3, 0x46, 0xC3, 0x47, 0xC3, 0x48, /* 0x8C-0x8F */
+	0xC3, 0x49, 0xC3, 0x4A, 0xC3, 0x4B, 0xEB, 0xC1, /* 0x90-0x93 */
+	0xC3, 0x4C, 0xC3, 0x4D, 0xD0, 0xA4, 0xC3, 0x4E, /* 0x94-0x97 */
+	0xD6, 0xE2, 0xC3, 0x4F, 0xB6, 0xC7, 0xB8, 0xD8, /* 0x98-0x9B */
+	0xEB, 0xC0, 0xB8, 0xCE, 0xC3, 0x50, 0xEB, 0xBF, /* 0x9C-0x9F */
+	0xB3, 0xA6, 0xB9, 0xC9, 0xD6, 0xAB, 0xC3, 0x51, /* 0xA0-0xA3 */
+	0xB7, 0xF4, 0xB7, 0xCA, 0xC3, 0x52, 0xC3, 0x53, /* 0xA4-0xA7 */
+	0xC3, 0x54, 0xBC, 0xE7, 0xB7, 0xBE, 0xEB, 0xC6, /* 0xA8-0xAB */
+	0xC3, 0x55, 0xEB, 0xC7, 0xB0, 0xB9, 0xBF, 0xCF, /* 0xAC-0xAF */
+	0xC3, 0x56, 0xEB, 0xC5, 0xD3, 0xFD, 0xC3, 0x57, /* 0xB0-0xB3 */
+	0xEB, 0xC8, 0xC3, 0x58, 0xC3, 0x59, 0xEB, 0xC9, /* 0xB4-0xB7 */
+	0xC3, 0x5A, 0xC3, 0x5B, 0xB7, 0xCE, 0xC3, 0x5C, /* 0xB8-0xBB */
+	0xEB, 0xC2, 0xEB, 0xC4, 0xC9, 0xF6, 0xD6, 0xD7, /* 0xBC-0xBF */
+	0xD5, 0xCD, 0xD0, 0xB2, 0xEB, 0xCF, 0xCE, 0xB8, /* 0xC0-0xC3 */
+	0xEB, 0xD0, 0xC3, 0x5D, 0xB5, 0xA8, 0xC3, 0x5E, /* 0xC4-0xC7 */
+	0xC3, 0x5F, 0xC3, 0x60, 0xC3, 0x61, 0xC3, 0x62, /* 0xC8-0xCB */
+	0xB1, 0xB3, 0xEB, 0xD2, 0xCC, 0xA5, 0xC3, 0x63, /* 0xCC-0xCF */
+	0xC3, 0x64, 0xC3, 0x65, 0xC3, 0x66, 0xC3, 0x67, /* 0xD0-0xD3 */
+	0xC3, 0x68, 0xC3, 0x69, 0xC5, 0xD6, 0xEB, 0xD3, /* 0xD4-0xD7 */
+	0xC3, 0x6A, 0xEB, 0xD1, 0xC5, 0xDF, 0xEB, 0xCE, /* 0xD8-0xDB */
+	0xCA, 0xA4, 0xEB, 0xD5, 0xB0, 0xFB, 0xC3, 0x6B, /* 0xDC-0xDF */
+	0xC3, 0x6C, 0xBA, 0xFA, 0xC3, 0x6D, 0xC3, 0x6E, /* 0xE0-0xE3 */
+	0xD8, 0xB7, 0xF1, 0xE3, 0xC3, 0x6F, 0xEB, 0xCA, /* 0xE4-0xE7 */
+	0xEB, 0xCB, 0xEB, 0xCC, 0xEB, 0xCD, 0xEB, 0xD6, /* 0xE8-0xEB */
+	0xE6, 0xC0, 0xEB, 0xD9, 0xC3, 0x70, 0xBF, 0xE8, /* 0xEC-0xEF */
+	0xD2, 0xC8, 0xEB, 0xD7, 0xEB, 0xDC, 0xB8, 0xEC, /* 0xF0-0xF3 */
+	0xEB, 0xD8, 0xC3, 0x71, 0xBD, 0xBA, 0xC3, 0x72, /* 0xF4-0xF7 */
+	0xD0, 0xD8, 0xC3, 0x73, 0xB0, 0xB7, 0xC3, 0x74, /* 0xF8-0xFB */
+	0xEB, 0xDD, 0xC4, 0xDC, 0xC3, 0x75, 0xC3, 0x76, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_81[512] = {
+	0xC3, 0x77, 0xC3, 0x78, 0xD6, 0xAC, 0xC3, 0x79, /* 0x00-0x03 */
+	0xC3, 0x7A, 0xC3, 0x7B, 0xB4, 0xE0, 0xC3, 0x7C, /* 0x04-0x07 */
+	0xC3, 0x7D, 0xC2, 0xF6, 0xBC, 0xB9, 0xC3, 0x7E, /* 0x08-0x0B */
+	0xC3, 0x80, 0xEB, 0xDA, 0xEB, 0xDB, 0xD4, 0xE0, /* 0x0C-0x0F */
+	0xC6, 0xEA, 0xC4, 0xD4, 0xEB, 0xDF, 0xC5, 0xA7, /* 0x10-0x13 */
+	0xD9, 0xF5, 0xC3, 0x81, 0xB2, 0xB1, 0xC3, 0x82, /* 0x14-0x17 */
+	0xEB, 0xE4, 0xC3, 0x83, 0xBD, 0xC5, 0xC3, 0x84, /* 0x18-0x1B */
+	0xC3, 0x85, 0xC3, 0x86, 0xEB, 0xE2, 0xC3, 0x87, /* 0x1C-0x1F */
+	0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x20-0x23 */
+	0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, 0xC3, 0x8F, /* 0x24-0x27 */
+	0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, 0xC3, 0x93, /* 0x28-0x2B */
+	0xEB, 0xE3, 0xC3, 0x94, 0xC3, 0x95, 0xB8, 0xAC, /* 0x2C-0x2F */
+	0xC3, 0x96, 0xCD, 0xD1, 0xEB, 0xE5, 0xC3, 0x97, /* 0x30-0x33 */
+	0xC3, 0x98, 0xC3, 0x99, 0xEB, 0xE1, 0xC3, 0x9A, /* 0x34-0x37 */
+	0xC1, 0xB3, 0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, /* 0x38-0x3B */
+	0xC3, 0x9E, 0xC3, 0x9F, 0xC6, 0xA2, 0xC3, 0xA0, /* 0x3C-0x3F */
+	0xC4, 0x40, 0xC4, 0x41, 0xC4, 0x42, 0xC4, 0x43, /* 0x40-0x43 */
+	0xC4, 0x44, 0xC4, 0x45, 0xCC, 0xF3, 0xC4, 0x46, /* 0x44-0x47 */
+	0xEB, 0xE6, 0xC4, 0x47, 0xC0, 0xB0, 0xD2, 0xB8, /* 0x48-0x4B */
+	0xEB, 0xE7, 0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, /* 0x4C-0x4F */
+	0xB8, 0xAF, 0xB8, 0xAD, 0xC4, 0x4B, 0xEB, 0xE8, /* 0x50-0x53 */
+	0xC7, 0xBB, 0xCD, 0xF3, 0xC4, 0x4C, 0xC4, 0x4D, /* 0x54-0x57 */
+	0xC4, 0x4E, 0xEB, 0xEA, 0xEB, 0xEB, 0xC4, 0x4F, /* 0x58-0x5B */
+	0xC4, 0x50, 0xC4, 0x51, 0xC4, 0x52, 0xC4, 0x53, /* 0x5C-0x5F */
+	0xEB, 0xED, 0xC4, 0x54, 0xC4, 0x55, 0xC4, 0x56, /* 0x60-0x63 */
+	0xC4, 0x57, 0xD0, 0xC8, 0xC4, 0x58, 0xEB, 0xF2, /* 0x64-0x67 */
+	0xC4, 0x59, 0xEB, 0xEE, 0xC4, 0x5A, 0xC4, 0x5B, /* 0x68-0x6B */
+	0xC4, 0x5C, 0xEB, 0xF1, 0xC8, 0xF9, 0xC4, 0x5D, /* 0x6C-0x6F */
+	0xD1, 0xFC, 0xEB, 0xEC, 0xC4, 0x5E, 0xC4, 0x5F, /* 0x70-0x73 */
+	0xEB, 0xE9, 0xC4, 0x60, 0xC4, 0x61, 0xC4, 0x62, /* 0x74-0x77 */
+	0xC4, 0x63, 0xB8, 0xB9, 0xCF, 0xD9, 0xC4, 0xE5, /* 0x78-0x7B */
+	0xEB, 0xEF, 0xEB, 0xF0, 0xCC, 0xDA, 0xCD, 0xC8, /* 0x7C-0x7F */
+	
+	0xB0, 0xF2, 0xC4, 0x64, 0xEB, 0xF6, 0xC4, 0x65, /* 0x80-0x83 */
+	0xC4, 0x66, 0xC4, 0x67, 0xC4, 0x68, 0xC4, 0x69, /* 0x84-0x87 */
+	0xEB, 0xF5, 0xC4, 0x6A, 0xB2, 0xB2, 0xC4, 0x6B, /* 0x88-0x8B */
+	0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xB8, 0xE0, /* 0x8C-0x8F */
+	0xC4, 0x6F, 0xEB, 0xF7, 0xC4, 0x70, 0xC4, 0x71, /* 0x90-0x93 */
+	0xC4, 0x72, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0x94-0x97 */
+	0xB1, 0xEC, 0xC4, 0x76, 0xC4, 0x77, 0xCC, 0xC5, /* 0x98-0x9B */
+	0xC4, 0xA4, 0xCF, 0xA5, 0xC4, 0x78, 0xC4, 0x79, /* 0x9C-0x9F */
+	0xC4, 0x7A, 0xC4, 0x7B, 0xC4, 0x7C, 0xEB, 0xF9, /* 0xA0-0xA3 */
+	0xC4, 0x7D, 0xC4, 0x7E, 0xEC, 0xA2, 0xC4, 0x80, /* 0xA4-0xA7 */
+	0xC5, 0xF2, 0xC4, 0x81, 0xEB, 0xFA, 0xC4, 0x82, /* 0xA8-0xAB */
+	0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, 0xC4, 0x86, /* 0xAC-0xAF */
+	0xC4, 0x87, 0xC4, 0x88, 0xC4, 0x89, 0xC9, 0xC5, /* 0xB0-0xB3 */
+	0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, 0xC4, 0x8D, /* 0xB4-0xB7 */
+	0xC4, 0x8E, 0xC4, 0x8F, 0xE2, 0xDF, 0xEB, 0xFE, /* 0xB8-0xBB */
+	0xC4, 0x90, 0xC4, 0x91, 0xC4, 0x92, 0xC4, 0x93, /* 0xBC-0xBF */
+	0xCD, 0xCE, 0xEC, 0xA1, 0xB1, 0xDB, 0xD3, 0xB7, /* 0xC0-0xC3 */
+	0xC4, 0x94, 0xC4, 0x95, 0xD2, 0xDC, 0xC4, 0x96, /* 0xC4-0xC7 */
+	0xC4, 0x97, 0xC4, 0x98, 0xEB, 0xFD, 0xC4, 0x99, /* 0xC8-0xCB */
+	0xEB, 0xFB, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0xCC-0xCF */
+	0xC4, 0x9D, 0xC4, 0x9E, 0xC4, 0x9F, 0xC4, 0xA0, /* 0xD0-0xD3 */
+	0xC5, 0x40, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0xD4-0xD7 */
+	0xC5, 0x44, 0xC5, 0x45, 0xC5, 0x46, 0xC5, 0x47, /* 0xD8-0xDB */
+	0xC5, 0x48, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0xDC-0xDF */
+	0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xB3, 0xBC, /* 0xE0-0xE3 */
+	0xC5, 0x4F, 0xC5, 0x50, 0xC5, 0x51, 0xEA, 0xB0, /* 0xE4-0xE7 */
+	0xC5, 0x52, 0xC5, 0x53, 0xD7, 0xD4, 0xC5, 0x54, /* 0xE8-0xEB */
+	0xF4, 0xAB, 0xB3, 0xF4, 0xC5, 0x55, 0xC5, 0x56, /* 0xEC-0xEF */
+	0xC5, 0x57, 0xC5, 0x58, 0xC5, 0x59, 0xD6, 0xC1, /* 0xF0-0xF3 */
+	0xD6, 0xC2, 0xC5, 0x5A, 0xC5, 0x5B, 0xC5, 0x5C, /* 0xF4-0xF7 */
+	0xC5, 0x5D, 0xC5, 0x5E, 0xC5, 0x5F, 0xD5, 0xE9, /* 0xF8-0xFB */
+	0xBE, 0xCA, 0xC5, 0x60, 0xF4, 0xA7, 0xC5, 0x61, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_82[512] = {
+	0xD2, 0xA8, 0xF4, 0xA8, 0xF4, 0xA9, 0xC5, 0x62, /* 0x00-0x03 */
+	0xF4, 0xAA, 0xBE, 0xCB, 0xD3, 0xDF, 0xC5, 0x63, /* 0x04-0x07 */
+	0xC5, 0x64, 0xC5, 0x65, 0xC5, 0x66, 0xC5, 0x67, /* 0x08-0x0B */
+	0xC9, 0xE0, 0xC9, 0xE1, 0xC5, 0x68, 0xC5, 0x69, /* 0x0C-0x0F */
+	0xF3, 0xC2, 0xC5, 0x6A, 0xCA, 0xE6, 0xC5, 0x6B, /* 0x10-0x13 */
+	0xCC, 0xF2, 0xC5, 0x6C, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x14-0x17 */
+	0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xE2, 0xB6, /* 0x18-0x1B */
+	0xCB, 0xB4, 0xC5, 0x72, 0xCE, 0xE8, 0xD6, 0xDB, /* 0x1C-0x1F */
+	0xC5, 0x73, 0xF4, 0xAD, 0xF4, 0xAE, 0xF4, 0xAF, /* 0x20-0x23 */
+	0xC5, 0x74, 0xC5, 0x75, 0xC5, 0x76, 0xC5, 0x77, /* 0x24-0x27 */
+	0xF4, 0xB2, 0xC5, 0x78, 0xBA, 0xBD, 0xF4, 0xB3, /* 0x28-0x2B */
+	0xB0, 0xE3, 0xF4, 0xB0, 0xC5, 0x79, 0xF4, 0xB1, /* 0x2C-0x2F */
+	0xBD, 0xA2, 0xB2, 0xD5, 0xC5, 0x7A, 0xF4, 0xB6, /* 0x30-0x33 */
+	0xF4, 0xB7, 0xB6, 0xE6, 0xB2, 0xB0, 0xCF, 0xCF, /* 0x34-0x37 */
+	0xF4, 0xB4, 0xB4, 0xAC, 0xC5, 0x7B, 0xF4, 0xB5, /* 0x38-0x3B */
+	0xC5, 0x7C, 0xC5, 0x7D, 0xF4, 0xB8, 0xC5, 0x7E, /* 0x3C-0x3F */
+	0xC5, 0x80, 0xC5, 0x81, 0xC5, 0x82, 0xC5, 0x83, /* 0x40-0x43 */
+	0xF4, 0xB9, 0xC5, 0x84, 0xC5, 0x85, 0xCD, 0xA7, /* 0x44-0x47 */
+	0xC5, 0x86, 0xF4, 0xBA, 0xC5, 0x87, 0xF4, 0xBB, /* 0x48-0x4B */
+	0xC5, 0x88, 0xC5, 0x89, 0xC5, 0x8A, 0xF4, 0xBC, /* 0x4C-0x4F */
+	0xC5, 0x8B, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x50-0x53 */
+	0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, 0xC5, 0x92, /* 0x54-0x57 */
+	0xCB, 0xD2, 0xC5, 0x93, 0xF4, 0xBD, 0xC5, 0x94, /* 0x58-0x5B */
+	0xC5, 0x95, 0xC5, 0x96, 0xC5, 0x97, 0xF4, 0xBE, /* 0x5C-0x5F */
+	0xC5, 0x98, 0xC5, 0x99, 0xC5, 0x9A, 0xC5, 0x9B, /* 0x60-0x63 */
+	0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, 0xC5, 0x9F, /* 0x64-0x67 */
+	0xF4, 0xBF, 0xC5, 0xA0, 0xC6, 0x40, 0xC6, 0x41, /* 0x68-0x6B */
+	0xC6, 0x42, 0xC6, 0x43, 0xF4, 0xDE, 0xC1, 0xBC, /* 0x6C-0x6F */
+	0xBC, 0xE8, 0xC6, 0x44, 0xC9, 0xAB, 0xD1, 0xDE, /* 0x70-0x73 */
+	0xE5, 0xF5, 0xC6, 0x45, 0xC6, 0x46, 0xC6, 0x47, /* 0x74-0x77 */
+	0xC6, 0x48, 0xDC, 0xB3, 0xD2, 0xD5, 0xC6, 0x49, /* 0x78-0x7B */
+	0xC6, 0x4A, 0xDC, 0xB4, 0xB0, 0xAC, 0xDC, 0xB5, /* 0x7C-0x7F */
+	
+	0xC6, 0x4B, 0xC6, 0x4C, 0xBD, 0xDA, 0xC6, 0x4D, /* 0x80-0x83 */
+	0xDC, 0xB9, 0xC6, 0x4E, 0xC6, 0x4F, 0xC6, 0x50, /* 0x84-0x87 */
+	0xD8, 0xC2, 0xC6, 0x51, 0xDC, 0xB7, 0xD3, 0xF3, /* 0x88-0x8B */
+	0xC6, 0x52, 0xC9, 0xD6, 0xDC, 0xBA, 0xDC, 0xB6, /* 0x8C-0x8F */
+	0xC6, 0x53, 0xDC, 0xBB, 0xC3, 0xA2, 0xC6, 0x54, /* 0x90-0x93 */
+	0xC6, 0x55, 0xC6, 0x56, 0xC6, 0x57, 0xDC, 0xBC, /* 0x94-0x97 */
+	0xDC, 0xC5, 0xDC, 0xBD, 0xC6, 0x58, 0xC6, 0x59, /* 0x98-0x9B */
+	0xCE, 0xDF, 0xD6, 0xA5, 0xC6, 0x5A, 0xDC, 0xCF, /* 0x9C-0x9F */
+	0xC6, 0x5B, 0xDC, 0xCD, 0xC6, 0x5C, 0xC6, 0x5D, /* 0xA0-0xA3 */
+	0xDC, 0xD2, 0xBD, 0xE6, 0xC2, 0xAB, 0xC6, 0x5E, /* 0xA4-0xA7 */
+	0xDC, 0xB8, 0xDC, 0xCB, 0xDC, 0xCE, 0xDC, 0xBE, /* 0xA8-0xAB */
+	0xB7, 0xD2, 0xB0, 0xC5, 0xDC, 0xC7, 0xD0, 0xBE, /* 0xAC-0xAF */
+	0xDC, 0xC1, 0xBB, 0xA8, 0xC6, 0x5F, 0xB7, 0xBC, /* 0xB0-0xB3 */
+	0xDC, 0xCC, 0xC6, 0x60, 0xC6, 0x61, 0xDC, 0xC6, /* 0xB4-0xB7 */
+	0xDC, 0xBF, 0xC7, 0xDB, 0xC6, 0x62, 0xC6, 0x63, /* 0xB8-0xBB */
+	0xC6, 0x64, 0xD1, 0xBF, 0xDC, 0xC0, 0xC6, 0x65, /* 0xBC-0xBF */
+	0xC6, 0x66, 0xDC, 0xCA, 0xC6, 0x67, 0xC6, 0x68, /* 0xC0-0xC3 */
+	0xDC, 0xD0, 0xC6, 0x69, 0xC6, 0x6A, 0xCE, 0xAD, /* 0xC4-0xC7 */
+	0xDC, 0xC2, 0xC6, 0x6B, 0xDC, 0xC3, 0xDC, 0xC8, /* 0xC8-0xCB */
+	0xDC, 0xC9, 0xB2, 0xD4, 0xDC, 0xD1, 0xCB, 0xD5, /* 0xCC-0xCF */
+	0xC6, 0x6C, 0xD4, 0xB7, 0xDC, 0xDB, 0xDC, 0xDF, /* 0xD0-0xD3 */
+	0xCC, 0xA6, 0xDC, 0xE6, 0xC6, 0x6D, 0xC3, 0xE7, /* 0xD4-0xD7 */
+	0xDC, 0xDC, 0xC6, 0x6E, 0xC6, 0x6F, 0xBF, 0xC1, /* 0xD8-0xDB */
+	0xDC, 0xD9, 0xC6, 0x70, 0xB0, 0xFA, 0xB9, 0xB6, /* 0xDC-0xDF */
+	0xDC, 0xE5, 0xDC, 0xD3, 0xC6, 0x71, 0xDC, 0xC4, /* 0xE0-0xE3 */
+	0xDC, 0xD6, 0xC8, 0xF4, 0xBF, 0xE0, 0xC6, 0x72, /* 0xE4-0xE7 */
+	0xC6, 0x73, 0xC6, 0x74, 0xC6, 0x75, 0xC9, 0xBB, /* 0xE8-0xEB */
+	0xC6, 0x76, 0xC6, 0x77, 0xC6, 0x78, 0xB1, 0xBD, /* 0xEC-0xEF */
+	0xC6, 0x79, 0xD3, 0xA2, 0xC6, 0x7A, 0xC6, 0x7B, /* 0xF0-0xF3 */
+	0xDC, 0xDA, 0xC6, 0x7C, 0xC6, 0x7D, 0xDC, 0xD5, /* 0xF4-0xF7 */
+	0xC6, 0x7E, 0xC6, 0xBB, 0xC6, 0x80, 0xDC, 0xDE, /* 0xF8-0xFB */
+	0xC6, 0x81, 0xC6, 0x82, 0xC6, 0x83, 0xC6, 0x84, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_83[512] = {
+	0xC6, 0x85, 0xD7, 0xC2, 0xC3, 0xAF, 0xB7, 0xB6, /* 0x00-0x03 */
+	0xC7, 0xD1, 0xC3, 0xA9, 0xDC, 0xE2, 0xDC, 0xD8, /* 0x04-0x07 */
+	0xDC, 0xEB, 0xDC, 0xD4, 0xC6, 0x86, 0xC6, 0x87, /* 0x08-0x0B */
+	0xDC, 0xDD, 0xC6, 0x88, 0xBE, 0xA5, 0xDC, 0xD7, /* 0x0C-0x0F */
+	0xC6, 0x89, 0xDC, 0xE0, 0xC6, 0x8A, 0xC6, 0x8B, /* 0x10-0x13 */
+	0xDC, 0xE3, 0xDC, 0xE4, 0xC6, 0x8C, 0xDC, 0xF8, /* 0x14-0x17 */
+	0xC6, 0x8D, 0xC6, 0x8E, 0xDC, 0xE1, 0xDD, 0xA2, /* 0x18-0x1B */
+	0xDC, 0xE7, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x91, /* 0x1C-0x1F */
+	0xC6, 0x92, 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x95, /* 0x20-0x23 */
+	0xC6, 0x96, 0xC6, 0x97, 0xC6, 0x98, 0xBC, 0xEB, /* 0x24-0x27 */
+	0xB4, 0xC4, 0xC6, 0x99, 0xC6, 0x9A, 0xC3, 0xA3, /* 0x28-0x2B */
+	0xB2, 0xE7, 0xDC, 0xFA, 0xC6, 0x9B, 0xDC, 0xF2, /* 0x2C-0x2F */
+	0xC6, 0x9C, 0xDC, 0xEF, 0xC6, 0x9D, 0xDC, 0xFC, /* 0x30-0x33 */
+	0xDC, 0xEE, 0xD2, 0xF0, 0xB2, 0xE8, 0xC6, 0x9E, /* 0x34-0x37 */
+	0xC8, 0xD7, 0xC8, 0xE3, 0xDC, 0xFB, 0xC6, 0x9F, /* 0x38-0x3B */
+	0xDC, 0xED, 0xC6, 0xA0, 0xC7, 0x40, 0xC7, 0x41, /* 0x3C-0x3F */
+	0xDC, 0xF7, 0xC7, 0x42, 0xC7, 0x43, 0xDC, 0xF5, /* 0x40-0x43 */
+	0xC7, 0x44, 0xC7, 0x45, 0xBE, 0xA3, 0xDC, 0xF4, /* 0x44-0x47 */
+	0xC7, 0x46, 0xB2, 0xDD, 0xC7, 0x47, 0xC7, 0x48, /* 0x48-0x4B */
+	0xC7, 0x49, 0xC7, 0x4A, 0xC7, 0x4B, 0xDC, 0xF3, /* 0x4C-0x4F */
+	0xBC, 0xF6, 0xDC, 0xE8, 0xBB, 0xC4, 0xC7, 0x4C, /* 0x50-0x53 */
+	0xC0, 0xF3, 0xC7, 0x4D, 0xC7, 0x4E, 0xC7, 0x4F, /* 0x54-0x57 */
+	0xC7, 0x50, 0xC7, 0x51, 0xBC, 0xD4, 0xDC, 0xE9, /* 0x58-0x5B */
+	0xDC, 0xEA, 0xC7, 0x52, 0xDC, 0xF1, 0xDC, 0xF6, /* 0x5C-0x5F */
+	0xDC, 0xF9, 0xB5, 0xB4, 0xC7, 0x53, 0xC8, 0xD9, /* 0x60-0x63 */
+	0xBB, 0xE7, 0xDC, 0xFE, 0xDC, 0xFD, 0xD3, 0xAB, /* 0x64-0x67 */
+	0xDD, 0xA1, 0xDD, 0xA3, 0xDD, 0xA5, 0xD2, 0xF1, /* 0x68-0x6B */
+	0xDD, 0xA4, 0xDD, 0xA6, 0xDD, 0xA7, 0xD2, 0xA9, /* 0x6C-0x6F */
+	0xC7, 0x54, 0xC7, 0x55, 0xC7, 0x56, 0xC7, 0x57, /* 0x70-0x73 */
+	0xC7, 0x58, 0xC7, 0x59, 0xC7, 0x5A, 0xBA, 0xC9, /* 0x74-0x77 */
+	0xDD, 0xA9, 0xC7, 0x5B, 0xC7, 0x5C, 0xDD, 0xB6, /* 0x78-0x7B */
+	0xDD, 0xB1, 0xDD, 0xB4, 0xC7, 0x5D, 0xC7, 0x5E, /* 0x7C-0x7F */
+	
+	0xC7, 0x5F, 0xC7, 0x60, 0xC7, 0x61, 0xC7, 0x62, /* 0x80-0x83 */
+	0xC7, 0x63, 0xDD, 0xB0, 0xC6, 0xCE, 0xC7, 0x64, /* 0x84-0x87 */
+	0xC7, 0x65, 0xC0, 0xF2, 0xC7, 0x66, 0xC7, 0x67, /* 0x88-0x8B */
+	0xC7, 0x68, 0xC7, 0x69, 0xC9, 0xAF, 0xC7, 0x6A, /* 0x8C-0x8F */
+	0xC7, 0x6B, 0xC7, 0x6C, 0xDC, 0xEC, 0xDD, 0xAE, /* 0x90-0x93 */
+	0xC7, 0x6D, 0xC7, 0x6E, 0xC7, 0x6F, 0xC7, 0x70, /* 0x94-0x97 */
+	0xDD, 0xB7, 0xC7, 0x71, 0xC7, 0x72, 0xDC, 0xF0, /* 0x98-0x9B */
+	0xDD, 0xAF, 0xC7, 0x73, 0xDD, 0xB8, 0xC7, 0x74, /* 0x9C-0x9F */
+	0xDD, 0xAC, 0xC7, 0x75, 0xC7, 0x76, 0xC7, 0x77, /* 0xA0-0xA3 */
+	0xC7, 0x78, 0xC7, 0x79, 0xC7, 0x7A, 0xC7, 0x7B, /* 0xA4-0xA7 */
+	0xDD, 0xB9, 0xDD, 0xB3, 0xDD, 0xAD, 0xC4, 0xAA, /* 0xA8-0xAB */
+	0xC7, 0x7C, 0xC7, 0x7D, 0xC7, 0x7E, 0xC7, 0x80, /* 0xAC-0xAF */
+	0xDD, 0xA8, 0xC0, 0xB3, 0xC1, 0xAB, 0xDD, 0xAA, /* 0xB0-0xB3 */
+	0xDD, 0xAB, 0xC7, 0x81, 0xDD, 0xB2, 0xBB, 0xF1, /* 0xB4-0xB7 */
+	0xDD, 0xB5, 0xD3, 0xA8, 0xDD, 0xBA, 0xC7, 0x82, /* 0xB8-0xBB */
+	0xDD, 0xBB, 0xC3, 0xA7, 0xC7, 0x83, 0xC7, 0x84, /* 0xBC-0xBF */
+	0xDD, 0xD2, 0xDD, 0xBC, 0xC7, 0x85, 0xC7, 0x86, /* 0xC0-0xC3 */
+	0xC7, 0x87, 0xDD, 0xD1, 0xC7, 0x88, 0xB9, 0xBD, /* 0xC4-0xC7 */
+	0xC7, 0x89, 0xC7, 0x8A, 0xBE, 0xD5, 0xC7, 0x8B, /* 0xC8-0xCB */
+	0xBE, 0xFA, 0xC7, 0x8C, 0xC7, 0x8D, 0xBA, 0xCA, /* 0xCC-0xCF */
+	0xC7, 0x8E, 0xC7, 0x8F, 0xC7, 0x90, 0xC7, 0x91, /* 0xD0-0xD3 */
+	0xDD, 0xCA, 0xC7, 0x92, 0xDD, 0xC5, 0xC7, 0x93, /* 0xD4-0xD7 */
+	0xDD, 0xBF, 0xC7, 0x94, 0xC7, 0x95, 0xC7, 0x96, /* 0xD8-0xDB */
+	0xB2, 0xCB, 0xDD, 0xC3, 0xC7, 0x97, 0xDD, 0xCB, /* 0xDC-0xDF */
+	0xB2, 0xA4, 0xDD, 0xD5, 0xC7, 0x98, 0xC7, 0x99, /* 0xE0-0xE3 */
+	0xC7, 0x9A, 0xDD, 0xBE, 0xC7, 0x9B, 0xC7, 0x9C, /* 0xE4-0xE7 */
+	0xC7, 0x9D, 0xC6, 0xD0, 0xDD, 0xD0, 0xC7, 0x9E, /* 0xE8-0xEB */
+	0xC7, 0x9F, 0xC7, 0xA0, 0xC8, 0x40, 0xC8, 0x41, /* 0xEC-0xEF */
+	0xDD, 0xD4, 0xC1, 0xE2, 0xB7, 0xC6, 0xC8, 0x42, /* 0xF0-0xF3 */
+	0xC8, 0x43, 0xC8, 0x44, 0xC8, 0x45, 0xC8, 0x46, /* 0xF4-0xF7 */
+	0xDD, 0xCE, 0xDD, 0xCF, 0xC8, 0x47, 0xC8, 0x48, /* 0xF8-0xFB */
+	0xC8, 0x49, 0xDD, 0xC4, 0xC8, 0x4A, 0xC8, 0x4B, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_84[512] = {
+	0xC8, 0x4C, 0xDD, 0xBD, 0xC8, 0x4D, 0xDD, 0xCD, /* 0x00-0x03 */
+	0xCC, 0xD1, 0xC8, 0x4E, 0xDD, 0xC9, 0xC8, 0x4F, /* 0x04-0x07 */
+	0xC8, 0x50, 0xC8, 0x51, 0xC8, 0x52, 0xDD, 0xC2, /* 0x08-0x0B */
+	0xC3, 0xC8, 0xC6, 0xBC, 0xCE, 0xAE, 0xDD, 0xCC, /* 0x0C-0x0F */
+	0xC8, 0x53, 0xDD, 0xC8, 0xC8, 0x54, 0xC8, 0x55, /* 0x10-0x13 */
+	0xC8, 0x56, 0xC8, 0x57, 0xC8, 0x58, 0xC8, 0x59, /* 0x14-0x17 */
+	0xDD, 0xC1, 0xC8, 0x5A, 0xC8, 0x5B, 0xC8, 0x5C, /* 0x18-0x1B */
+	0xDD, 0xC6, 0xC2, 0xDC, 0xC8, 0x5D, 0xC8, 0x5E, /* 0x1C-0x1F */
+	0xC8, 0x5F, 0xC8, 0x60, 0xC8, 0x61, 0xC8, 0x62, /* 0x20-0x23 */
+	0xD3, 0xA9, 0xD3, 0xAA, 0xDD, 0xD3, 0xCF, 0xF4, /* 0x24-0x27 */
+	0xC8, 0xF8, 0xC8, 0x63, 0xC8, 0x64, 0xC8, 0x65, /* 0x28-0x2B */
+	0xC8, 0x66, 0xC8, 0x67, 0xC8, 0x68, 0xC8, 0x69, /* 0x2C-0x2F */
+	0xC8, 0x6A, 0xDD, 0xE6, 0xC8, 0x6B, 0xC8, 0x6C, /* 0x30-0x33 */
+	0xC8, 0x6D, 0xC8, 0x6E, 0xC8, 0x6F, 0xC8, 0x70, /* 0x34-0x37 */
+	0xDD, 0xC7, 0xC8, 0x71, 0xC8, 0x72, 0xC8, 0x73, /* 0x38-0x3B */
+	0xDD, 0xE0, 0xC2, 0xE4, 0xC8, 0x74, 0xC8, 0x75, /* 0x3C-0x3F */
+	0xC8, 0x76, 0xC8, 0x77, 0xC8, 0x78, 0xC8, 0x79, /* 0x40-0x43 */
+	0xC8, 0x7A, 0xC8, 0x7B, 0xDD, 0xE1, 0xC8, 0x7C, /* 0x44-0x47 */
+	0xC8, 0x7D, 0xC8, 0x7E, 0xC8, 0x80, 0xC8, 0x81, /* 0x48-0x4B */
+	0xC8, 0x82, 0xC8, 0x83, 0xC8, 0x84, 0xC8, 0x85, /* 0x4C-0x4F */
+	0xC8, 0x86, 0xDD, 0xD7, 0xC8, 0x87, 0xC8, 0x88, /* 0x50-0x53 */
+	0xC8, 0x89, 0xC8, 0x8A, 0xC8, 0x8B, 0xD6, 0xF8, /* 0x54-0x57 */
+	0xC8, 0x8C, 0xDD, 0xD9, 0xDD, 0xD8, 0xB8, 0xF0, /* 0x58-0x5B */
+	0xDD, 0xD6, 0xC8, 0x8D, 0xC8, 0x8E, 0xC8, 0x8F, /* 0x5C-0x5F */
+	0xC8, 0x90, 0xC6, 0xCF, 0xC8, 0x91, 0xB6, 0xAD, /* 0x60-0x63 */
+	0xC8, 0x92, 0xC8, 0x93, 0xC8, 0x94, 0xC8, 0x95, /* 0x64-0x67 */
+	0xC8, 0x96, 0xDD, 0xE2, 0xC8, 0x97, 0xBA, 0xF9, /* 0x68-0x6B */
+	0xD4, 0xE1, 0xDD, 0xE7, 0xC8, 0x98, 0xC8, 0x99, /* 0x6C-0x6F */
+	0xC8, 0x9A, 0xB4, 0xD0, 0xC8, 0x9B, 0xDD, 0xDA, /* 0x70-0x73 */
+	0xC8, 0x9C, 0xBF, 0xFB, 0xDD, 0xE3, 0xC8, 0x9D, /* 0x74-0x77 */
+	0xDD, 0xDF, 0xC8, 0x9E, 0xDD, 0xDD, 0xC8, 0x9F, /* 0x78-0x7B */
+	0xC8, 0xA0, 0xC9, 0x40, 0xC9, 0x41, 0xC9, 0x42, /* 0x7C-0x7F */
+	
+	0xC9, 0x43, 0xC9, 0x44, 0xB5, 0xD9, 0xC9, 0x45, /* 0x80-0x83 */
+	0xC9, 0x46, 0xC9, 0x47, 0xC9, 0x48, 0xDD, 0xDB, /* 0x84-0x87 */
+	0xDD, 0xDC, 0xDD, 0xDE, 0xC9, 0x49, 0xBD, 0xAF, /* 0x88-0x8B */
+	0xDD, 0xE4, 0xC9, 0x4A, 0xDD, 0xE5, 0xC9, 0x4B, /* 0x8C-0x8F */
+	0xC9, 0x4C, 0xC9, 0x4D, 0xC9, 0x4E, 0xC9, 0x4F, /* 0x90-0x93 */
+	0xC9, 0x50, 0xC9, 0x51, 0xC9, 0x52, 0xDD, 0xF5, /* 0x94-0x97 */
+	0xC9, 0x53, 0xC3, 0xC9, 0xC9, 0x54, 0xC9, 0x55, /* 0x98-0x9B */
+	0xCB, 0xE2, 0xC9, 0x56, 0xC9, 0x57, 0xC9, 0x58, /* 0x9C-0x9F */
+	0xC9, 0x59, 0xDD, 0xF2, 0xC9, 0x5A, 0xC9, 0x5B, /* 0xA0-0xA3 */
+	0xC9, 0x5C, 0xC9, 0x5D, 0xC9, 0x5E, 0xC9, 0x5F, /* 0xA4-0xA7 */
+	0xC9, 0x60, 0xC9, 0x61, 0xC9, 0x62, 0xC9, 0x63, /* 0xA8-0xAB */
+	0xC9, 0x64, 0xC9, 0x65, 0xC9, 0x66, 0xD8, 0xE1, /* 0xAC-0xAF */
+	0xC9, 0x67, 0xC9, 0x68, 0xC6, 0xD1, 0xC9, 0x69, /* 0xB0-0xB3 */
+	0xDD, 0xF4, 0xC9, 0x6A, 0xC9, 0x6B, 0xC9, 0x6C, /* 0xB4-0xB7 */
+	0xD5, 0xF4, 0xDD, 0xF3, 0xDD, 0xF0, 0xC9, 0x6D, /* 0xB8-0xBB */
+	0xC9, 0x6E, 0xDD, 0xEC, 0xC9, 0x6F, 0xDD, 0xEF, /* 0xBC-0xBF */
+	0xC9, 0x70, 0xDD, 0xE8, 0xC9, 0x71, 0xC9, 0x72, /* 0xC0-0xC3 */
+	0xD0, 0xEE, 0xC9, 0x73, 0xC9, 0x74, 0xC9, 0x75, /* 0xC4-0xC7 */
+	0xC9, 0x76, 0xC8, 0xD8, 0xDD, 0xEE, 0xC9, 0x77, /* 0xC8-0xCB */
+	0xC9, 0x78, 0xDD, 0xE9, 0xC9, 0x79, 0xC9, 0x7A, /* 0xCC-0xCF */
+	0xDD, 0xEA, 0xCB, 0xF2, 0xC9, 0x7B, 0xDD, 0xED, /* 0xD0-0xD3 */
+	0xC9, 0x7C, 0xC9, 0x7D, 0xB1, 0xCD, 0xC9, 0x7E, /* 0xD4-0xD7 */
+	0xC9, 0x80, 0xC9, 0x81, 0xC9, 0x82, 0xC9, 0x83, /* 0xD8-0xDB */
+	0xC9, 0x84, 0xC0, 0xB6, 0xC9, 0x85, 0xBC, 0xBB, /* 0xDC-0xDF */
+	0xDD, 0xF1, 0xC9, 0x86, 0xC9, 0x87, 0xDD, 0xF7, /* 0xE0-0xE3 */
+	0xC9, 0x88, 0xDD, 0xF6, 0xDD, 0xEB, 0xC9, 0x89, /* 0xE4-0xE7 */
+	0xC9, 0x8A, 0xC9, 0x8B, 0xC9, 0x8C, 0xC9, 0x8D, /* 0xE8-0xEB */
+	0xC5, 0xEE, 0xC9, 0x8E, 0xC9, 0x8F, 0xC9, 0x90, /* 0xEC-0xEF */
+	0xDD, 0xFB, 0xC9, 0x91, 0xC9, 0x92, 0xC9, 0x93, /* 0xF0-0xF3 */
+	0xC9, 0x94, 0xC9, 0x95, 0xC9, 0x96, 0xC9, 0x97, /* 0xF4-0xF7 */
+	0xC9, 0x98, 0xC9, 0x99, 0xC9, 0x9A, 0xC9, 0x9B, /* 0xF8-0xFB */
+	0xDE, 0xA4, 0xC9, 0x9C, 0xC9, 0x9D, 0xDE, 0xA3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_85[512] = {
+	0xC9, 0x9E, 0xC9, 0x9F, 0xC9, 0xA0, 0xCA, 0x40, /* 0x00-0x03 */
+	0xCA, 0x41, 0xCA, 0x42, 0xCA, 0x43, 0xCA, 0x44, /* 0x04-0x07 */
+	0xCA, 0x45, 0xCA, 0x46, 0xCA, 0x47, 0xCA, 0x48, /* 0x08-0x0B */
+	0xDD, 0xF8, 0xCA, 0x49, 0xCA, 0x4A, 0xCA, 0x4B, /* 0x0C-0x0F */
+	0xCA, 0x4C, 0xC3, 0xEF, 0xCA, 0x4D, 0xC2, 0xFB, /* 0x10-0x13 */
+	0xCA, 0x4E, 0xCA, 0x4F, 0xCA, 0x50, 0xD5, 0xE1, /* 0x14-0x17 */
+	0xCA, 0x51, 0xCA, 0x52, 0xCE, 0xB5, 0xCA, 0x53, /* 0x18-0x1B */
+	0xCA, 0x54, 0xCA, 0x55, 0xCA, 0x56, 0xDD, 0xFD, /* 0x1C-0x1F */
+	0xCA, 0x57, 0xB2, 0xCC, 0xCA, 0x58, 0xCA, 0x59, /* 0x20-0x23 */
+	0xCA, 0x5A, 0xCA, 0x5B, 0xCA, 0x5C, 0xCA, 0x5D, /* 0x24-0x27 */
+	0xCA, 0x5E, 0xCA, 0x5F, 0xCA, 0x60, 0xC4, 0xE8, /* 0x28-0x2B */
+	0xCA, 0xDF, 0xCA, 0x61, 0xCA, 0x62, 0xCA, 0x63, /* 0x2C-0x2F */
+	0xCA, 0x64, 0xCA, 0x65, 0xCA, 0x66, 0xCA, 0x67, /* 0x30-0x33 */
+	0xCA, 0x68, 0xCA, 0x69, 0xCA, 0x6A, 0xC7, 0xBE, /* 0x34-0x37 */
+	0xDD, 0xFA, 0xDD, 0xFC, 0xDD, 0xFE, 0xDE, 0xA2, /* 0x38-0x3B */
+	0xB0, 0xAA, 0xB1, 0xCE, 0xCA, 0x6B, 0xCA, 0x6C, /* 0x3C-0x3F */
+	0xCA, 0x6D, 0xCA, 0x6E, 0xCA, 0x6F, 0xDE, 0xAC, /* 0x40-0x43 */
+	0xCA, 0x70, 0xCA, 0x71, 0xCA, 0x72, 0xCA, 0x73, /* 0x44-0x47 */
+	0xDE, 0xA6, 0xBD, 0xB6, 0xC8, 0xEF, 0xCA, 0x74, /* 0x48-0x4B */
+	0xCA, 0x75, 0xCA, 0x76, 0xCA, 0x77, 0xCA, 0x78, /* 0x4C-0x4F */
+	0xCA, 0x79, 0xCA, 0x7A, 0xCA, 0x7B, 0xCA, 0x7C, /* 0x50-0x53 */
+	0xCA, 0x7D, 0xCA, 0x7E, 0xDE, 0xA1, 0xCA, 0x80, /* 0x54-0x57 */
+	0xCA, 0x81, 0xDE, 0xA5, 0xCA, 0x82, 0xCA, 0x83, /* 0x58-0x5B */
+	0xCA, 0x84, 0xCA, 0x85, 0xDE, 0xA9, 0xCA, 0x86, /* 0x5C-0x5F */
+	0xCA, 0x87, 0xCA, 0x88, 0xCA, 0x89, 0xCA, 0x8A, /* 0x60-0x63 */
+	0xDE, 0xA8, 0xCA, 0x8B, 0xCA, 0x8C, 0xCA, 0x8D, /* 0x64-0x67 */
+	0xDE, 0xA7, 0xCA, 0x8E, 0xCA, 0x8F, 0xCA, 0x90, /* 0x68-0x6B */
+	0xCA, 0x91, 0xCA, 0x92, 0xCA, 0x93, 0xCA, 0x94, /* 0x6C-0x6F */
+	0xCA, 0x95, 0xCA, 0x96, 0xDE, 0xAD, 0xCA, 0x97, /* 0x70-0x73 */
+	0xD4, 0xCC, 0xCA, 0x98, 0xCA, 0x99, 0xCA, 0x9A, /* 0x74-0x77 */
+	0xCA, 0x9B, 0xDE, 0xB3, 0xDE, 0xAA, 0xDE, 0xAE, /* 0x78-0x7B */
+	0xCA, 0x9C, 0xCA, 0x9D, 0xC0, 0xD9, 0xCA, 0x9E, /* 0x7C-0x7F */
+	
+	0xCA, 0x9F, 0xCA, 0xA0, 0xCB, 0x40, 0xCB, 0x41, /* 0x80-0x83 */
+	0xB1, 0xA1, 0xDE, 0xB6, 0xCB, 0x42, 0xDE, 0xB1, /* 0x84-0x87 */
+	0xCB, 0x43, 0xCB, 0x44, 0xCB, 0x45, 0xCB, 0x46, /* 0x88-0x8B */
+	0xCB, 0x47, 0xCB, 0x48, 0xCB, 0x49, 0xDE, 0xB2, /* 0x8C-0x8F */
+	0xCB, 0x4A, 0xCB, 0x4B, 0xCB, 0x4C, 0xCB, 0x4D, /* 0x90-0x93 */
+	0xCB, 0x4E, 0xCB, 0x4F, 0xCB, 0x50, 0xCB, 0x51, /* 0x94-0x97 */
+	0xCB, 0x52, 0xCB, 0x53, 0xCB, 0x54, 0xD1, 0xA6, /* 0x98-0x9B */
+	0xDE, 0xB5, 0xCB, 0x55, 0xCB, 0x56, 0xCB, 0x57, /* 0x9C-0x9F */
+	0xCB, 0x58, 0xCB, 0x59, 0xCB, 0x5A, 0xCB, 0x5B, /* 0xA0-0xA3 */
+	0xDE, 0xAF, 0xCB, 0x5C, 0xCB, 0x5D, 0xCB, 0x5E, /* 0xA4-0xA7 */
+	0xDE, 0xB0, 0xCB, 0x5F, 0xD0, 0xBD, 0xCB, 0x60, /* 0xA8-0xAB */
+	0xCB, 0x61, 0xCB, 0x62, 0xDE, 0xB4, 0xCA, 0xED, /* 0xAC-0xAF */
+	0xDE, 0xB9, 0xCB, 0x63, 0xCB, 0x64, 0xCB, 0x65, /* 0xB0-0xB3 */
+	0xCB, 0x66, 0xCB, 0x67, 0xCB, 0x68, 0xDE, 0xB8, /* 0xB4-0xB7 */
+	0xCB, 0x69, 0xDE, 0xB7, 0xCB, 0x6A, 0xCB, 0x6B, /* 0xB8-0xBB */
+	0xCB, 0x6C, 0xCB, 0x6D, 0xCB, 0x6E, 0xCB, 0x6F, /* 0xBC-0xBF */
+	0xCB, 0x70, 0xDE, 0xBB, 0xCB, 0x71, 0xCB, 0x72, /* 0xC0-0xC3 */
+	0xCB, 0x73, 0xCB, 0x74, 0xCB, 0x75, 0xCB, 0x76, /* 0xC4-0xC7 */
+	0xCB, 0x77, 0xBD, 0xE5, 0xCB, 0x78, 0xCB, 0x79, /* 0xC8-0xCB */
+	0xCB, 0x7A, 0xCB, 0x7B, 0xCB, 0x7C, 0xB2, 0xD8, /* 0xCC-0xCF */
+	0xC3, 0xEA, 0xCB, 0x7D, 0xCB, 0x7E, 0xDE, 0xBA, /* 0xD0-0xD3 */
+	0xCB, 0x80, 0xC5, 0xBA, 0xCB, 0x81, 0xCB, 0x82, /* 0xD4-0xD7 */
+	0xCB, 0x83, 0xCB, 0x84, 0xCB, 0x85, 0xCB, 0x86, /* 0xD8-0xDB */
+	0xDE, 0xBC, 0xCB, 0x87, 0xCB, 0x88, 0xCB, 0x89, /* 0xDC-0xDF */
+	0xCB, 0x8A, 0xCB, 0x8B, 0xCB, 0x8C, 0xCB, 0x8D, /* 0xE0-0xE3 */
+	0xCC, 0xD9, 0xCB, 0x8E, 0xCB, 0x8F, 0xCB, 0x90, /* 0xE4-0xE7 */
+	0xCB, 0x91, 0xB7, 0xAA, 0xCB, 0x92, 0xCB, 0x93, /* 0xE8-0xEB */
+	0xCB, 0x94, 0xCB, 0x95, 0xCB, 0x96, 0xCB, 0x97, /* 0xEC-0xEF */
+	0xCB, 0x98, 0xCB, 0x99, 0xCB, 0x9A, 0xCB, 0x9B, /* 0xF0-0xF3 */
+	0xCB, 0x9C, 0xCB, 0x9D, 0xCB, 0x9E, 0xCB, 0x9F, /* 0xF4-0xF7 */
+	0xCB, 0xA0, 0xCC, 0x40, 0xCC, 0x41, 0xD4, 0xE5, /* 0xF8-0xFB */
+	0xCC, 0x42, 0xCC, 0x43, 0xCC, 0x44, 0xDE, 0xBD, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_86[512] = {
+	0xCC, 0x45, 0xCC, 0x46, 0xCC, 0x47, 0xCC, 0x48, /* 0x00-0x03 */
+	0xCC, 0x49, 0xDE, 0xBF, 0xCC, 0x4A, 0xCC, 0x4B, /* 0x04-0x07 */
+	0xCC, 0x4C, 0xCC, 0x4D, 0xCC, 0x4E, 0xCC, 0x4F, /* 0x08-0x0B */
+	0xCC, 0x50, 0xCC, 0x51, 0xCC, 0x52, 0xCC, 0x53, /* 0x0C-0x0F */
+	0xCC, 0x54, 0xC4, 0xA2, 0xCC, 0x55, 0xCC, 0x56, /* 0x10-0x13 */
+	0xCC, 0x57, 0xCC, 0x58, 0xDE, 0xC1, 0xCC, 0x59, /* 0x14-0x17 */
+	0xCC, 0x5A, 0xCC, 0x5B, 0xCC, 0x5C, 0xCC, 0x5D, /* 0x18-0x1B */
+	0xCC, 0x5E, 0xCC, 0x5F, 0xCC, 0x60, 0xCC, 0x61, /* 0x1C-0x1F */
+	0xCC, 0x62, 0xCC, 0x63, 0xCC, 0x64, 0xCC, 0x65, /* 0x20-0x23 */
+	0xCC, 0x66, 0xCC, 0x67, 0xCC, 0x68, 0xDE, 0xBE, /* 0x24-0x27 */
+	0xCC, 0x69, 0xDE, 0xC0, 0xCC, 0x6A, 0xCC, 0x6B, /* 0x28-0x2B */
+	0xCC, 0x6C, 0xCC, 0x6D, 0xCC, 0x6E, 0xCC, 0x6F, /* 0x2C-0x2F */
+	0xCC, 0x70, 0xCC, 0x71, 0xCC, 0x72, 0xCC, 0x73, /* 0x30-0x33 */
+	0xCC, 0x74, 0xCC, 0x75, 0xCC, 0x76, 0xCC, 0x77, /* 0x34-0x37 */
+	0xD5, 0xBA, 0xCC, 0x78, 0xCC, 0x79, 0xCC, 0x7A, /* 0x38-0x3B */
+	0xDE, 0xC2, 0xCC, 0x7B, 0xCC, 0x7C, 0xCC, 0x7D, /* 0x3C-0x3F */
+	0xCC, 0x7E, 0xCC, 0x80, 0xCC, 0x81, 0xCC, 0x82, /* 0x40-0x43 */
+	0xCC, 0x83, 0xCC, 0x84, 0xCC, 0x85, 0xCC, 0x86, /* 0x44-0x47 */
+	0xCC, 0x87, 0xCC, 0x88, 0xCC, 0x89, 0xCC, 0x8A, /* 0x48-0x4B */
+	0xCC, 0x8B, 0xF2, 0xAE, 0xBB, 0xA2, 0xC2, 0xB2, /* 0x4C-0x4F */
+	0xC5, 0xB0, 0xC2, 0xC7, 0xCC, 0x8C, 0xCC, 0x8D, /* 0x50-0x53 */
+	0xF2, 0xAF, 0xCC, 0x8E, 0xCC, 0x8F, 0xCC, 0x90, /* 0x54-0x57 */
+	0xCC, 0x91, 0xCC, 0x92, 0xD0, 0xE9, 0xCC, 0x93, /* 0x58-0x5B */
+	0xCC, 0x94, 0xCC, 0x95, 0xD3, 0xDD, 0xCC, 0x96, /* 0x5C-0x5F */
+	0xCC, 0x97, 0xCC, 0x98, 0xEB, 0xBD, 0xCC, 0x99, /* 0x60-0x63 */
+	0xCC, 0x9A, 0xCC, 0x9B, 0xCC, 0x9C, 0xCC, 0x9D, /* 0x64-0x67 */
+	0xCC, 0x9E, 0xCC, 0x9F, 0xCC, 0xA0, 0xB3, 0xE6, /* 0x68-0x6B */
+	0xF2, 0xB0, 0xCD, 0x40, 0xF2, 0xB1, 0xCD, 0x41, /* 0x6C-0x6F */
+	0xCD, 0x42, 0xCA, 0xAD, 0xCD, 0x43, 0xCD, 0x44, /* 0x70-0x73 */
+	0xCD, 0x45, 0xCD, 0x46, 0xCD, 0x47, 0xCD, 0x48, /* 0x74-0x77 */
+	0xCD, 0x49, 0xBA, 0xE7, 0xF2, 0xB3, 0xF2, 0xB5, /* 0x78-0x7B */
+	0xF2, 0xB4, 0xCB, 0xE4, 0xCF, 0xBA, 0xF2, 0xB2, /* 0x7C-0x7F */
+	
+	0xCA, 0xB4, 0xD2, 0xCF, 0xC2, 0xEC, 0xCD, 0x4A, /* 0x80-0x83 */
+	0xCD, 0x4B, 0xCD, 0x4C, 0xCD, 0x4D, 0xCD, 0x4E, /* 0x84-0x87 */
+	0xCD, 0x4F, 0xCD, 0x50, 0xCE, 0xC3, 0xF2, 0xB8, /* 0x88-0x8B */
+	0xB0, 0xF6, 0xF2, 0xB7, 0xCD, 0x51, 0xCD, 0x52, /* 0x8C-0x8F */
+	0xCD, 0x53, 0xCD, 0x54, 0xCD, 0x55, 0xF2, 0xBE, /* 0x90-0x93 */
+	0xCD, 0x56, 0xB2, 0xCF, 0xCD, 0x57, 0xCD, 0x58, /* 0x94-0x97 */
+	0xCD, 0x59, 0xCD, 0x5A, 0xCD, 0x5B, 0xCD, 0x5C, /* 0x98-0x9B */
+	0xD1, 0xC1, 0xF2, 0xBA, 0xCD, 0x5D, 0xCD, 0x5E, /* 0x9C-0x9F */
+	0xCD, 0x5F, 0xCD, 0x60, 0xCD, 0x61, 0xF2, 0xBC, /* 0xA0-0xA3 */
+	0xD4, 0xE9, 0xCD, 0x62, 0xCD, 0x63, 0xF2, 0xBB, /* 0xA4-0xA7 */
+	0xF2, 0xB6, 0xF2, 0xBF, 0xF2, 0xBD, 0xCD, 0x64, /* 0xA8-0xAB */
+	0xF2, 0xB9, 0xCD, 0x65, 0xCD, 0x66, 0xF2, 0xC7, /* 0xAC-0xAF */
+	0xF2, 0xC4, 0xF2, 0xC6, 0xCD, 0x67, 0xCD, 0x68, /* 0xB0-0xB3 */
+	0xF2, 0xCA, 0xF2, 0xC2, 0xF2, 0xC0, 0xCD, 0x69, /* 0xB4-0xB7 */
+	0xCD, 0x6A, 0xCD, 0x6B, 0xF2, 0xC5, 0xCD, 0x6C, /* 0xB8-0xBB */
+	0xCD, 0x6D, 0xCD, 0x6E, 0xCD, 0x6F, 0xCD, 0x70, /* 0xBC-0xBF */
+	0xD6, 0xFB, 0xCD, 0x71, 0xCD, 0x72, 0xCD, 0x73, /* 0xC0-0xC3 */
+	0xF2, 0xC1, 0xCD, 0x74, 0xC7, 0xF9, 0xC9, 0xDF, /* 0xC4-0xC7 */
+	0xCD, 0x75, 0xF2, 0xC8, 0xB9, 0xC6, 0xB5, 0xB0, /* 0xC8-0xCB */
+	0xCD, 0x76, 0xCD, 0x77, 0xF2, 0xC3, 0xF2, 0xC9, /* 0xCC-0xCF */
+	0xF2, 0xD0, 0xF2, 0xD6, 0xCD, 0x78, 0xCD, 0x79, /* 0xD0-0xD3 */
+	0xBB, 0xD7, 0xCD, 0x7A, 0xCD, 0x7B, 0xCD, 0x7C, /* 0xD4-0xD7 */
+	0xF2, 0xD5, 0xCD, 0xDC, 0xCD, 0x7D, 0xD6, 0xEB, /* 0xD8-0xDB */
+	0xCD, 0x7E, 0xCD, 0x80, 0xF2, 0xD2, 0xF2, 0xD4, /* 0xDC-0xDF */
+	0xCD, 0x81, 0xCD, 0x82, 0xCD, 0x83, 0xCD, 0x84, /* 0xE0-0xE3 */
+	0xB8, 0xF2, 0xCD, 0x85, 0xCD, 0x86, 0xCD, 0x87, /* 0xE4-0xE7 */
+	0xCD, 0x88, 0xF2, 0xCB, 0xCD, 0x89, 0xCD, 0x8A, /* 0xE8-0xEB */
+	0xCD, 0x8B, 0xF2, 0xCE, 0xC2, 0xF9, 0xCD, 0x8C, /* 0xEC-0xEF */
+	0xD5, 0xDD, 0xF2, 0xCC, 0xF2, 0xCD, 0xF2, 0xCF, /* 0xF0-0xF3 */
+	0xF2, 0xD3, 0xCD, 0x8D, 0xCD, 0x8E, 0xCD, 0x8F, /* 0xF4-0xF7 */
+	0xF2, 0xD9, 0xD3, 0xBC, 0xCD, 0x90, 0xCD, 0x91, /* 0xF8-0xFB */
+	0xCD, 0x92, 0xCD, 0x93, 0xB6, 0xEA, 0xCD, 0x94, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_87[512] = {
+	0xCA, 0xF1, 0xCD, 0x95, 0xB7, 0xE4, 0xF2, 0xD7, /* 0x00-0x03 */
+	0xCD, 0x96, 0xCD, 0x97, 0xCD, 0x98, 0xF2, 0xD8, /* 0x04-0x07 */
+	0xF2, 0xDA, 0xF2, 0xDD, 0xF2, 0xDB, 0xCD, 0x99, /* 0x08-0x0B */
+	0xCD, 0x9A, 0xF2, 0xDC, 0xCD, 0x9B, 0xCD, 0x9C, /* 0x0C-0x0F */
+	0xCD, 0x9D, 0xCD, 0x9E, 0xD1, 0xD1, 0xF2, 0xD1, /* 0x10-0x13 */
+	0xCD, 0x9F, 0xCD, 0xC9, 0xCD, 0xA0, 0xCE, 0xCF, /* 0x14-0x17 */
+	0xD6, 0xA9, 0xCE, 0x40, 0xF2, 0xE3, 0xCE, 0x41, /* 0x18-0x1B */
+	0xC3, 0xDB, 0xCE, 0x42, 0xF2, 0xE0, 0xCE, 0x43, /* 0x1C-0x1F */
+	0xCE, 0x44, 0xC0, 0xAF, 0xF2, 0xEC, 0xF2, 0xDE, /* 0x20-0x23 */
+	0xCE, 0x45, 0xF2, 0xE1, 0xCE, 0x46, 0xCE, 0x47, /* 0x24-0x27 */
+	0xCE, 0x48, 0xF2, 0xE8, 0xCE, 0x49, 0xCE, 0x4A, /* 0x28-0x2B */
+	0xCE, 0x4B, 0xCE, 0x4C, 0xF2, 0xE2, 0xCE, 0x4D, /* 0x2C-0x2F */
+	0xCE, 0x4E, 0xF2, 0xE7, 0xCE, 0x4F, 0xCE, 0x50, /* 0x30-0x33 */
+	0xF2, 0xE6, 0xCE, 0x51, 0xCE, 0x52, 0xF2, 0xE9, /* 0x34-0x37 */
+	0xCE, 0x53, 0xCE, 0x54, 0xCE, 0x55, 0xF2, 0xDF, /* 0x38-0x3B */
+	0xCE, 0x56, 0xCE, 0x57, 0xF2, 0xE4, 0xF2, 0xEA, /* 0x3C-0x3F */
+	0xCE, 0x58, 0xCE, 0x59, 0xCE, 0x5A, 0xCE, 0x5B, /* 0x40-0x43 */
+	0xCE, 0x5C, 0xCE, 0x5D, 0xCE, 0x5E, 0xD3, 0xAC, /* 0x44-0x47 */
+	0xF2, 0xE5, 0xB2, 0xF5, 0xCE, 0x5F, 0xCE, 0x60, /* 0x48-0x4B */
+	0xF2, 0xF2, 0xCE, 0x61, 0xD0, 0xAB, 0xCE, 0x62, /* 0x4C-0x4F */
+	0xCE, 0x63, 0xCE, 0x64, 0xCE, 0x65, 0xF2, 0xF5, /* 0x50-0x53 */
+	0xCE, 0x66, 0xCE, 0x67, 0xCE, 0x68, 0xBB, 0xC8, /* 0x54-0x57 */
+	0xCE, 0x69, 0xF2, 0xF9, 0xCE, 0x6A, 0xCE, 0x6B, /* 0x58-0x5B */
+	0xCE, 0x6C, 0xCE, 0x6D, 0xCE, 0x6E, 0xCE, 0x6F, /* 0x5C-0x5F */
+	0xF2, 0xF0, 0xCE, 0x70, 0xCE, 0x71, 0xF2, 0xF6, /* 0x60-0x63 */
+	0xF2, 0xF8, 0xF2, 0xFA, 0xCE, 0x72, 0xCE, 0x73, /* 0x64-0x67 */
+	0xCE, 0x74, 0xCE, 0x75, 0xCE, 0x76, 0xCE, 0x77, /* 0x68-0x6B */
+	0xCE, 0x78, 0xCE, 0x79, 0xF2, 0xF3, 0xCE, 0x7A, /* 0x6C-0x6F */
+	0xF2, 0xF1, 0xCE, 0x7B, 0xCE, 0x7C, 0xCE, 0x7D, /* 0x70-0x73 */
+	0xBA, 0xFB, 0xCE, 0x7E, 0xB5, 0xFB, 0xCE, 0x80, /* 0x74-0x77 */
+	0xCE, 0x81, 0xCE, 0x82, 0xCE, 0x83, 0xF2, 0xEF, /* 0x78-0x7B */
+	0xF2, 0xF7, 0xF2, 0xED, 0xF2, 0xEE, 0xCE, 0x84, /* 0x7C-0x7F */
+	
+	0xCE, 0x85, 0xCE, 0x86, 0xF2, 0xEB, 0xF3, 0xA6, /* 0x80-0x83 */
+	0xCE, 0x87, 0xF3, 0xA3, 0xCE, 0x88, 0xCE, 0x89, /* 0x84-0x87 */
+	0xF3, 0xA2, 0xCE, 0x8A, 0xCE, 0x8B, 0xF2, 0xF4, /* 0x88-0x8B */
+	0xCE, 0x8C, 0xC8, 0xDA, 0xCE, 0x8D, 0xCE, 0x8E, /* 0x8C-0x8F */
+	0xCE, 0x8F, 0xCE, 0x90, 0xCE, 0x91, 0xF2, 0xFB, /* 0x90-0x93 */
+	0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xF3, 0xA5, /* 0x94-0x97 */
+	0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, /* 0x98-0x9B */
+	0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xC3, 0xF8, /* 0x9C-0x9F */
+	0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, /* 0xA0-0xA3 */
+	0xCE, 0xA0, 0xCF, 0x40, 0xCF, 0x41, 0xCF, 0x42, /* 0xA4-0xA7 */
+	0xF2, 0xFD, 0xCF, 0x43, 0xCF, 0x44, 0xF3, 0xA7, /* 0xA8-0xAB */
+	0xF3, 0xA9, 0xF3, 0xA4, 0xCF, 0x45, 0xF2, 0xFC, /* 0xAC-0xAF */
+	0xCF, 0x46, 0xCF, 0x47, 0xCF, 0x48, 0xF3, 0xAB, /* 0xB0-0xB3 */
+	0xCF, 0x49, 0xF3, 0xAA, 0xCF, 0x4A, 0xCF, 0x4B, /* 0xB4-0xB7 */
+	0xCF, 0x4C, 0xCF, 0x4D, 0xC2, 0xDD, 0xCF, 0x4E, /* 0xB8-0xBB */
+	0xCF, 0x4F, 0xF3, 0xAE, 0xCF, 0x50, 0xCF, 0x51, /* 0xBC-0xBF */
+	0xF3, 0xB0, 0xCF, 0x52, 0xCF, 0x53, 0xCF, 0x54, /* 0xC0-0xC3 */
+	0xCF, 0x55, 0xCF, 0x56, 0xF3, 0xA1, 0xCF, 0x57, /* 0xC4-0xC7 */
+	0xCF, 0x58, 0xCF, 0x59, 0xF3, 0xB1, 0xF3, 0xAC, /* 0xC8-0xCB */
+	0xCF, 0x5A, 0xCF, 0x5B, 0xCF, 0x5C, 0xCF, 0x5D, /* 0xCC-0xCF */
+	0xCF, 0x5E, 0xF3, 0xAF, 0xF2, 0xFE, 0xF3, 0xAD, /* 0xD0-0xD3 */
+	0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x61, 0xCF, 0x62, /* 0xD4-0xD7 */
+	0xCF, 0x63, 0xCF, 0x64, 0xCF, 0x65, 0xF3, 0xB2, /* 0xD8-0xDB */
+	0xCF, 0x66, 0xCF, 0x67, 0xCF, 0x68, 0xCF, 0x69, /* 0xDC-0xDF */
+	0xF3, 0xB4, 0xCF, 0x6A, 0xCF, 0x6B, 0xCF, 0x6C, /* 0xE0-0xE3 */
+	0xCF, 0x6D, 0xF3, 0xA8, 0xCF, 0x6E, 0xCF, 0x6F, /* 0xE4-0xE7 */
+	0xCF, 0x70, 0xCF, 0x71, 0xF3, 0xB3, 0xCF, 0x72, /* 0xE8-0xEB */
+	0xCF, 0x73, 0xCF, 0x74, 0xF3, 0xB5, 0xCF, 0x75, /* 0xEC-0xEF */
+	0xCF, 0x76, 0xCF, 0x77, 0xCF, 0x78, 0xCF, 0x79, /* 0xF0-0xF3 */
+	0xCF, 0x7A, 0xCF, 0x7B, 0xCF, 0x7C, 0xCF, 0x7D, /* 0xF4-0xF7 */
+	0xCF, 0x7E, 0xD0, 0xB7, 0xCF, 0x80, 0xCF, 0x81, /* 0xF8-0xFB */
+	0xCF, 0x82, 0xCF, 0x83, 0xF3, 0xB8, 0xCF, 0x84, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_88[512] = {
+	0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xD9, 0xF9, /* 0x00-0x03 */
+	0xCF, 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, /* 0x04-0x07 */
+	0xCF, 0x8C, 0xCF, 0x8D, 0xF3, 0xB9, 0xCF, 0x8E, /* 0x08-0x0B */
+	0xCF, 0x8F, 0xCF, 0x90, 0xCF, 0x91, 0xCF, 0x92, /* 0x0C-0x0F */
+	0xCF, 0x93, 0xCF, 0x94, 0xCF, 0x95, 0xF3, 0xB7, /* 0x10-0x13 */
+	0xCF, 0x96, 0xC8, 0xE4, 0xF3, 0xB6, 0xCF, 0x97, /* 0x14-0x17 */
+	0xCF, 0x98, 0xCF, 0x99, 0xCF, 0x9A, 0xF3, 0xBA, /* 0x18-0x1B */
+	0xCF, 0x9B, 0xCF, 0x9C, 0xCF, 0x9D, 0xCF, 0x9E, /* 0x1C-0x1F */
+	0xCF, 0x9F, 0xF3, 0xBB, 0xB4, 0xC0, 0xCF, 0xA0, /* 0x20-0x23 */
+	0xD0, 0x40, 0xD0, 0x41, 0xD0, 0x42, 0xD0, 0x43, /* 0x24-0x27 */
+	0xD0, 0x44, 0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x47, /* 0x28-0x2B */
+	0xD0, 0x48, 0xD0, 0x49, 0xD0, 0x4A, 0xD0, 0x4B, /* 0x2C-0x2F */
+	0xD0, 0x4C, 0xD0, 0x4D, 0xEE, 0xC3, 0xD0, 0x4E, /* 0x30-0x33 */
+	0xD0, 0x4F, 0xD0, 0x50, 0xD0, 0x51, 0xD0, 0x52, /* 0x34-0x37 */
+	0xD0, 0x53, 0xF3, 0xBC, 0xD0, 0x54, 0xD0, 0x55, /* 0x38-0x3B */
+	0xF3, 0xBD, 0xD0, 0x56, 0xD0, 0x57, 0xD0, 0x58, /* 0x3C-0x3F */
+	0xD1, 0xAA, 0xD0, 0x59, 0xD0, 0x5A, 0xD0, 0x5B, /* 0x40-0x43 */
+	0xF4, 0xAC, 0xD0, 0xC6, 0xD0, 0x5C, 0xD0, 0x5D, /* 0x44-0x47 */
+	0xD0, 0x5E, 0xD0, 0x5F, 0xD0, 0x60, 0xD0, 0x61, /* 0x48-0x4B */
+	0xD0, 0xD0, 0xD1, 0xDC, 0xD0, 0x62, 0xD0, 0x63, /* 0x4C-0x4F */
+	0xD0, 0x64, 0xD0, 0x65, 0xD0, 0x66, 0xD0, 0x67, /* 0x50-0x53 */
+	0xCF, 0xCE, 0xD0, 0x68, 0xD0, 0x69, 0xBD, 0xD6, /* 0x54-0x57 */
+	0xD0, 0x6A, 0xD1, 0xC3, 0xD0, 0x6B, 0xD0, 0x6C, /* 0x58-0x5B */
+	0xD0, 0x6D, 0xD0, 0x6E, 0xD0, 0x6F, 0xD0, 0x70, /* 0x5C-0x5F */
+	0xD0, 0x71, 0xBA, 0xE2, 0xE1, 0xE9, 0xD2, 0xC2, /* 0x60-0x63 */
+	0xF1, 0xC2, 0xB2, 0xB9, 0xD0, 0x72, 0xD0, 0x73, /* 0x64-0x67 */
+	0xB1, 0xED, 0xF1, 0xC3, 0xD0, 0x74, 0xC9, 0xC0, /* 0x68-0x6B */
+	0xB3, 0xC4, 0xD0, 0x75, 0xD9, 0xF2, 0xD0, 0x76, /* 0x6C-0x6F */
+	0xCB, 0xA5, 0xD0, 0x77, 0xF1, 0xC4, 0xD0, 0x78, /* 0x70-0x73 */
+	0xD0, 0x79, 0xD0, 0x7A, 0xD0, 0x7B, 0xD6, 0xD4, /* 0x74-0x77 */
+	0xD0, 0x7C, 0xD0, 0x7D, 0xD0, 0x7E, 0xD0, 0x80, /* 0x78-0x7B */
+	0xD0, 0x81, 0xF1, 0xC5, 0xF4, 0xC0, 0xF1, 0xC6, /* 0x7C-0x7F */
+	
+	0xD0, 0x82, 0xD4, 0xAC, 0xF1, 0xC7, 0xD0, 0x83, /* 0x80-0x83 */
+	0xB0, 0xC0, 0xF4, 0xC1, 0xD0, 0x84, 0xD0, 0x85, /* 0x84-0x87 */
+	0xF4, 0xC2, 0xD0, 0x86, 0xD0, 0x87, 0xB4, 0xFC, /* 0x88-0x8B */
+	0xD0, 0x88, 0xC5, 0xDB, 0xD0, 0x89, 0xD0, 0x8A, /* 0x8C-0x8F */
+	0xD0, 0x8B, 0xD0, 0x8C, 0xCC, 0xBB, 0xD0, 0x8D, /* 0x90-0x93 */
+	0xD0, 0x8E, 0xD0, 0x8F, 0xD0, 0xE4, 0xD0, 0x90, /* 0x94-0x97 */
+	0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, /* 0x98-0x9B */
+	0xCD, 0xE0, 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, /* 0x9C-0x9F */
+	0xD0, 0x98, 0xD0, 0x99, 0xF1, 0xC8, 0xD0, 0x9A, /* 0xA0-0xA3 */
+	0xD9, 0xF3, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D, /* 0xA4-0xA7 */
+	0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xB1, 0xBB, /* 0xA8-0xAB */
+	0xD1, 0x40, 0xCF, 0xAE, 0xD1, 0x41, 0xD1, 0x42, /* 0xAC-0xAF */
+	0xD1, 0x43, 0xB8, 0xA4, 0xD1, 0x44, 0xD1, 0x45, /* 0xB0-0xB3 */
+	0xD1, 0x46, 0xD1, 0x47, 0xD1, 0x48, 0xF1, 0xCA, /* 0xB4-0xB7 */
+	0xD1, 0x49, 0xD1, 0x4A, 0xD1, 0x4B, 0xD1, 0x4C, /* 0xB8-0xBB */
+	0xF1, 0xCB, 0xD1, 0x4D, 0xD1, 0x4E, 0xD1, 0x4F, /* 0xBC-0xBF */
+	0xD1, 0x50, 0xB2, 0xC3, 0xC1, 0xD1, 0xD1, 0x51, /* 0xC0-0xC3 */
+	0xD1, 0x52, 0xD7, 0xB0, 0xF1, 0xC9, 0xD1, 0x53, /* 0xC4-0xC7 */
+	0xD1, 0x54, 0xF1, 0xCC, 0xD1, 0x55, 0xD1, 0x56, /* 0xC8-0xCB */
+	0xD1, 0x57, 0xD1, 0x58, 0xF1, 0xCE, 0xD1, 0x59, /* 0xCC-0xCF */
+	0xD1, 0x5A, 0xD1, 0x5B, 0xD9, 0xF6, 0xD1, 0x5C, /* 0xD0-0xD3 */
+	0xD2, 0xE1, 0xD4, 0xA3, 0xD1, 0x5D, 0xD1, 0x5E, /* 0xD4-0xD7 */
+	0xF4, 0xC3, 0xC8, 0xB9, 0xD1, 0x5F, 0xD1, 0x60, /* 0xD8-0xDB */
+	0xD1, 0x61, 0xD1, 0x62, 0xD1, 0x63, 0xF4, 0xC4, /* 0xDC-0xDF */
+	0xD1, 0x64, 0xD1, 0x65, 0xF1, 0xCD, 0xF1, 0xCF, /* 0xE0-0xE3 */
+	0xBF, 0xE3, 0xF1, 0xD0, 0xD1, 0x66, 0xD1, 0x67, /* 0xE4-0xE7 */
+	0xF1, 0xD4, 0xD1, 0x68, 0xD1, 0x69, 0xD1, 0x6A, /* 0xE8-0xEB */
+	0xD1, 0x6B, 0xD1, 0x6C, 0xD1, 0x6D, 0xD1, 0x6E, /* 0xEC-0xEF */
+	0xF1, 0xD6, 0xF1, 0xD1, 0xD1, 0x6F, 0xC9, 0xD1, /* 0xF0-0xF3 */
+	0xC5, 0xE1, 0xD1, 0x70, 0xD1, 0x71, 0xD1, 0x72, /* 0xF4-0xF7 */
+	0xC2, 0xE3, 0xB9, 0xFC, 0xD1, 0x73, 0xD1, 0x74, /* 0xF8-0xFB */
+	0xF1, 0xD3, 0xD1, 0x75, 0xF1, 0xD5, 0xD1, 0x76, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_89[512] = {
+	0xD1, 0x77, 0xD1, 0x78, 0xB9, 0xD3, 0xD1, 0x79, /* 0x00-0x03 */
+	0xD1, 0x7A, 0xD1, 0x7B, 0xD1, 0x7C, 0xD1, 0x7D, /* 0x04-0x07 */
+	0xD1, 0x7E, 0xD1, 0x80, 0xF1, 0xDB, 0xD1, 0x81, /* 0x08-0x0B */
+	0xD1, 0x82, 0xD1, 0x83, 0xD1, 0x84, 0xD1, 0x85, /* 0x0C-0x0F */
+	0xBA, 0xD6, 0xD1, 0x86, 0xB0, 0xFD, 0xF1, 0xD9, /* 0x10-0x13 */
+	0xD1, 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, /* 0x14-0x17 */
+	0xD1, 0x8B, 0xF1, 0xD8, 0xF1, 0xD2, 0xF1, 0xDA, /* 0x18-0x1B */
+	0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, 0x8F, /* 0x1C-0x1F */
+	0xD1, 0x90, 0xF1, 0xD7, 0xD1, 0x91, 0xD1, 0x92, /* 0x20-0x23 */
+	0xD1, 0x93, 0xC8, 0xEC, 0xD1, 0x94, 0xD1, 0x95, /* 0x24-0x27 */
+	0xD1, 0x96, 0xD1, 0x97, 0xCD, 0xCA, 0xF1, 0xDD, /* 0x28-0x2B */
+	0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, 0x9B, /* 0x2C-0x2F */
+	0xE5, 0xBD, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, /* 0x30-0x33 */
+	0xF1, 0xDC, 0xD1, 0x9F, 0xF1, 0xDE, 0xD1, 0xA0, /* 0x34-0x37 */
+	0xD2, 0x40, 0xD2, 0x41, 0xD2, 0x42, 0xD2, 0x43, /* 0x38-0x3B */
+	0xD2, 0x44, 0xD2, 0x45, 0xD2, 0x46, 0xD2, 0x47, /* 0x3C-0x3F */
+	0xD2, 0x48, 0xF1, 0xDF, 0xD2, 0x49, 0xD2, 0x4A, /* 0x40-0x43 */
+	0xCF, 0xE5, 0xD2, 0x4B, 0xD2, 0x4C, 0xD2, 0x4D, /* 0x44-0x47 */
+	0xD2, 0x4E, 0xD2, 0x4F, 0xD2, 0x50, 0xD2, 0x51, /* 0x48-0x4B */
+	0xD2, 0x52, 0xD2, 0x53, 0xD2, 0x54, 0xD2, 0x55, /* 0x4C-0x4F */
+	0xD2, 0x56, 0xD2, 0x57, 0xD2, 0x58, 0xD2, 0x59, /* 0x50-0x53 */
+	0xD2, 0x5A, 0xD2, 0x5B, 0xD2, 0x5C, 0xD2, 0x5D, /* 0x54-0x57 */
+	0xD2, 0x5E, 0xD2, 0x5F, 0xD2, 0x60, 0xD2, 0x61, /* 0x58-0x5B */
+	0xD2, 0x62, 0xD2, 0x63, 0xF4, 0xC5, 0xBD, 0xF3, /* 0x5C-0x5F */
+	0xD2, 0x64, 0xD2, 0x65, 0xD2, 0x66, 0xD2, 0x67, /* 0x60-0x63 */
+	0xD2, 0x68, 0xD2, 0x69, 0xF1, 0xE0, 0xD2, 0x6A, /* 0x64-0x67 */
+	0xD2, 0x6B, 0xD2, 0x6C, 0xD2, 0x6D, 0xD2, 0x6E, /* 0x68-0x6B */
+	0xD2, 0x6F, 0xD2, 0x70, 0xD2, 0x71, 0xD2, 0x72, /* 0x6C-0x6F */
+	0xD2, 0x73, 0xD2, 0x74, 0xD2, 0x75, 0xD2, 0x76, /* 0x70-0x73 */
+	0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, 0xD2, 0x7A, /* 0x74-0x77 */
+	0xD2, 0x7B, 0xD2, 0x7C, 0xD2, 0x7D, 0xF1, 0xE1, /* 0x78-0x7B */
+	0xD2, 0x7E, 0xD2, 0x80, 0xD2, 0x81, 0xCE, 0xF7, /* 0x7C-0x7F */
+	
+	0xD2, 0x82, 0xD2, 0xAA, 0xD2, 0x83, 0xF1, 0xFB, /* 0x80-0x83 */
+	0xD2, 0x84, 0xD2, 0x85, 0xB8, 0xB2, 0xD2, 0x86, /* 0x84-0x87 */
+	0xD2, 0x87, 0xD2, 0x88, 0xD2, 0x89, 0xD2, 0x8A, /* 0x88-0x8B */
+	0xD2, 0x8B, 0xD2, 0x8C, 0xD2, 0x8D, 0xD2, 0x8E, /* 0x8C-0x8F */
+	0xD2, 0x8F, 0xD2, 0x90, 0xD2, 0x91, 0xD2, 0x92, /* 0x90-0x93 */
+	0xD2, 0x93, 0xD2, 0x94, 0xD2, 0x95, 0xD2, 0x96, /* 0x94-0x97 */
+	0xD2, 0x97, 0xD2, 0x98, 0xD2, 0x99, 0xD2, 0x9A, /* 0x98-0x9B */
+	0xD2, 0x9B, 0xD2, 0x9C, 0xD2, 0x9D, 0xD2, 0x9E, /* 0x9C-0x9F */
+	0xD2, 0x9F, 0xD2, 0xA0, 0xD3, 0x40, 0xD3, 0x41, /* 0xA0-0xA3 */
+	0xD3, 0x42, 0xD3, 0x43, 0xD3, 0x44, 0xD3, 0x45, /* 0xA4-0xA7 */
+	0xD3, 0x46, 0xD3, 0x47, 0xD3, 0x48, 0xD3, 0x49, /* 0xA8-0xAB */
+	0xD3, 0x4A, 0xD3, 0x4B, 0xD3, 0x4C, 0xD3, 0x4D, /* 0xAC-0xAF */
+	0xD3, 0x4E, 0xD3, 0x4F, 0xD3, 0x50, 0xD3, 0x51, /* 0xB0-0xB3 */
+	0xD3, 0x52, 0xD3, 0x53, 0xD3, 0x54, 0xD3, 0x55, /* 0xB4-0xB7 */
+	0xD3, 0x56, 0xD3, 0x57, 0xD3, 0x58, 0xD3, 0x59, /* 0xB8-0xBB */
+	0xD3, 0x5A, 0xD3, 0x5B, 0xD3, 0x5C, 0xD3, 0x5D, /* 0xBC-0xBF */
+	0xD3, 0x5E, 0xBC, 0xFB, 0xB9, 0xDB, 0xD3, 0x5F, /* 0xC0-0xC3 */
+	0xB9, 0xE6, 0xC3, 0xD9, 0xCA, 0xD3, 0xEA, 0xE8, /* 0xC4-0xC7 */
+	0xC0, 0xC0, 0xBE, 0xF5, 0xEA, 0xE9, 0xEA, 0xEA, /* 0xC8-0xCB */
+	0xEA, 0xEB, 0xD3, 0x60, 0xEA, 0xEC, 0xEA, 0xED, /* 0xCC-0xCF */
+	0xEA, 0xEE, 0xEA, 0xEF, 0xBD, 0xC7, 0xD3, 0x61, /* 0xD0-0xD3 */
+	0xD3, 0x62, 0xD3, 0x63, 0xF5, 0xFB, 0xD3, 0x64, /* 0xD4-0xD7 */
+	0xD3, 0x65, 0xD3, 0x66, 0xF5, 0xFD, 0xD3, 0x67, /* 0xD8-0xDB */
+	0xF5, 0xFE, 0xD3, 0x68, 0xF5, 0xFC, 0xD3, 0x69, /* 0xDC-0xDF */
+	0xD3, 0x6A, 0xD3, 0x6B, 0xD3, 0x6C, 0xBD, 0xE2, /* 0xE0-0xE3 */
+	0xD3, 0x6D, 0xF6, 0xA1, 0xB4, 0xA5, 0xD3, 0x6E, /* 0xE4-0xE7 */
+	0xD3, 0x6F, 0xD3, 0x70, 0xD3, 0x71, 0xF6, 0xA2, /* 0xE8-0xEB */
+	0xD3, 0x72, 0xD3, 0x73, 0xD3, 0x74, 0xF6, 0xA3, /* 0xEC-0xEF */
+	0xD3, 0x75, 0xD3, 0x76, 0xD3, 0x77, 0xEC, 0xB2, /* 0xF0-0xF3 */
+	0xD3, 0x78, 0xD3, 0x79, 0xD3, 0x7A, 0xD3, 0x7B, /* 0xF4-0xF7 */
+	0xD3, 0x7C, 0xD3, 0x7D, 0xD3, 0x7E, 0xD3, 0x80, /* 0xF8-0xFB */
+	0xD3, 0x81, 0xD3, 0x82, 0xD3, 0x83, 0xD3, 0x84, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8A[512] = {
+	0xD1, 0xD4, 0xD3, 0x85, 0xD3, 0x86, 0xD3, 0x87, /* 0x00-0x03 */
+	0xD3, 0x88, 0xD3, 0x89, 0xD3, 0x8A, 0xD9, 0xEA, /* 0x04-0x07 */
+	0xD3, 0x8B, 0xD3, 0x8C, 0xD3, 0x8D, 0xD3, 0x8E, /* 0x08-0x0B */
+	0xD3, 0x8F, 0xD3, 0x90, 0xD3, 0x91, 0xD3, 0x92, /* 0x0C-0x0F */
+	0xD3, 0x93, 0xD3, 0x94, 0xD3, 0x95, 0xD3, 0x96, /* 0x10-0x13 */
+	0xD3, 0x97, 0xD3, 0x98, 0xD3, 0x99, 0xD3, 0x9A, /* 0x14-0x17 */
+	0xD3, 0x9B, 0xD3, 0x9C, 0xD3, 0x9D, 0xD3, 0x9E, /* 0x18-0x1B */
+	0xD3, 0x9F, 0xD3, 0xA0, 0xD4, 0x40, 0xD4, 0x41, /* 0x1C-0x1F */
+	0xD4, 0x42, 0xD4, 0x43, 0xD4, 0x44, 0xD4, 0x45, /* 0x20-0x23 */
+	0xD4, 0x46, 0xD4, 0x47, 0xD4, 0x48, 0xD4, 0x49, /* 0x24-0x27 */
+	0xD4, 0x4A, 0xD4, 0x4B, 0xD4, 0x4C, 0xD4, 0x4D, /* 0x28-0x2B */
+	0xD4, 0x4E, 0xD4, 0x4F, 0xD4, 0x50, 0xD4, 0x51, /* 0x2C-0x2F */
+	0xD4, 0x52, 0xD4, 0x53, 0xD4, 0x54, 0xD4, 0x55, /* 0x30-0x33 */
+	0xD4, 0x56, 0xD4, 0x57, 0xD4, 0x58, 0xD4, 0x59, /* 0x34-0x37 */
+	0xD4, 0x5A, 0xD4, 0x5B, 0xD4, 0x5C, 0xD4, 0x5D, /* 0x38-0x3B */
+	0xD4, 0x5E, 0xD4, 0x5F, 0xF6, 0xA4, 0xD4, 0x60, /* 0x3C-0x3F */
+	0xD4, 0x61, 0xD4, 0x62, 0xD4, 0x63, 0xD4, 0x64, /* 0x40-0x43 */
+	0xD4, 0x65, 0xD4, 0x66, 0xD4, 0x67, 0xD4, 0x68, /* 0x44-0x47 */
+	0xEE, 0xBA, 0xD4, 0x69, 0xD4, 0x6A, 0xD4, 0x6B, /* 0x48-0x4B */
+	0xD4, 0x6C, 0xD4, 0x6D, 0xD4, 0x6E, 0xD4, 0x6F, /* 0x4C-0x4F */
+	0xD4, 0x70, 0xD4, 0x71, 0xD4, 0x72, 0xD4, 0x73, /* 0x50-0x53 */
+	0xD4, 0x74, 0xD4, 0x75, 0xD4, 0x76, 0xD4, 0x77, /* 0x54-0x57 */
+	0xD4, 0x78, 0xD4, 0x79, 0xD4, 0x7A, 0xD4, 0x7B, /* 0x58-0x5B */
+	0xD4, 0x7C, 0xD4, 0x7D, 0xD4, 0x7E, 0xD4, 0x80, /* 0x5C-0x5F */
+	0xD4, 0x81, 0xD4, 0x82, 0xD4, 0x83, 0xD4, 0x84, /* 0x60-0x63 */
+	0xD4, 0x85, 0xD4, 0x86, 0xD4, 0x87, 0xD4, 0x88, /* 0x64-0x67 */
+	0xD4, 0x89, 0xD4, 0x8A, 0xD4, 0x8B, 0xD4, 0x8C, /* 0x68-0x6B */
+	0xD4, 0x8D, 0xD4, 0x8E, 0xD4, 0x8F, 0xD4, 0x90, /* 0x6C-0x6F */
+	0xD4, 0x91, 0xD4, 0x92, 0xD4, 0x93, 0xD4, 0x94, /* 0x70-0x73 */
+	0xD4, 0x95, 0xD4, 0x96, 0xD4, 0x97, 0xD4, 0x98, /* 0x74-0x77 */
+	0xD4, 0x99, 0xD5, 0xB2, 0xD4, 0x9A, 0xD4, 0x9B, /* 0x78-0x7B */
+	0xD4, 0x9C, 0xD4, 0x9D, 0xD4, 0x9E, 0xD4, 0x9F, /* 0x7C-0x7F */
+	
+	0xD4, 0xA0, 0xD5, 0x40, 0xD5, 0x41, 0xD5, 0x42, /* 0x80-0x83 */
+	0xD5, 0x43, 0xD5, 0x44, 0xD5, 0x45, 0xD5, 0x46, /* 0x84-0x87 */
+	0xD5, 0x47, 0xD3, 0xFE, 0xCC, 0xDC, 0xD5, 0x48, /* 0x88-0x8B */
+	0xD5, 0x49, 0xD5, 0x4A, 0xD5, 0x4B, 0xD5, 0x4C, /* 0x8C-0x8F */
+	0xD5, 0x4D, 0xD5, 0x4E, 0xD5, 0x4F, 0xCA, 0xC4, /* 0x90-0x93 */
+	0xD5, 0x50, 0xD5, 0x51, 0xD5, 0x52, 0xD5, 0x53, /* 0x94-0x97 */
+	0xD5, 0x54, 0xD5, 0x55, 0xD5, 0x56, 0xD5, 0x57, /* 0x98-0x9B */
+	0xD5, 0x58, 0xD5, 0x59, 0xD5, 0x5A, 0xD5, 0x5B, /* 0x9C-0x9F */
+	0xD5, 0x5C, 0xD5, 0x5D, 0xD5, 0x5E, 0xD5, 0x5F, /* 0xA0-0xA3 */
+	0xD5, 0x60, 0xD5, 0x61, 0xD5, 0x62, 0xD5, 0x63, /* 0xA4-0xA7 */
+	0xD5, 0x64, 0xD5, 0x65, 0xD5, 0x66, 0xD5, 0x67, /* 0xA8-0xAB */
+	0xD5, 0x68, 0xD5, 0x69, 0xD5, 0x6A, 0xD5, 0x6B, /* 0xAC-0xAF */
+	0xD5, 0x6C, 0xD5, 0x6D, 0xD5, 0x6E, 0xD5, 0x6F, /* 0xB0-0xB3 */
+	0xD5, 0x70, 0xD5, 0x71, 0xD5, 0x72, 0xD5, 0x73, /* 0xB4-0xB7 */
+	0xD5, 0x74, 0xD5, 0x75, 0xD5, 0x76, 0xD5, 0x77, /* 0xB8-0xBB */
+	0xD5, 0x78, 0xD5, 0x79, 0xD5, 0x7A, 0xD5, 0x7B, /* 0xBC-0xBF */
+	0xD5, 0x7C, 0xD5, 0x7D, 0xD5, 0x7E, 0xD5, 0x80, /* 0xC0-0xC3 */
+	0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84, /* 0xC4-0xC7 */
+	0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88, /* 0xC8-0xCB */
+	0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C, /* 0xCC-0xCF */
+	0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90, /* 0xD0-0xD3 */
+	0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94, /* 0xD4-0xD7 */
+	0xD5, 0x95, 0xD5, 0x96, 0xD5, 0x97, 0xD5, 0x98, /* 0xD8-0xDB */
+	0xD5, 0x99, 0xD5, 0x9A, 0xD5, 0x9B, 0xD5, 0x9C, /* 0xDC-0xDF */
+	0xD5, 0x9D, 0xD5, 0x9E, 0xD5, 0x9F, 0xD5, 0xA0, /* 0xE0-0xE3 */
+	0xD6, 0x40, 0xD6, 0x41, 0xD6, 0x42, 0xD6, 0x43, /* 0xE4-0xE7 */
+	0xD6, 0x44, 0xD6, 0x45, 0xD6, 0x46, 0xD6, 0x47, /* 0xE8-0xEB */
+	0xD6, 0x48, 0xD6, 0x49, 0xD6, 0x4A, 0xD6, 0x4B, /* 0xEC-0xEF */
+	0xD6, 0x4C, 0xD6, 0x4D, 0xD6, 0x4E, 0xD6, 0x4F, /* 0xF0-0xF3 */
+	0xD6, 0x50, 0xD6, 0x51, 0xD6, 0x52, 0xD6, 0x53, /* 0xF4-0xF7 */
+	0xD6, 0x54, 0xD6, 0x55, 0xD6, 0x56, 0xD6, 0x57, /* 0xF8-0xFB */
+	0xD6, 0x58, 0xD6, 0x59, 0xD6, 0x5A, 0xD6, 0x5B, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8B[512] = {
+	0xD6, 0x5C, 0xD6, 0x5D, 0xD6, 0x5E, 0xD6, 0x5F, /* 0x00-0x03 */
+	0xD6, 0x60, 0xD6, 0x61, 0xD6, 0x62, 0xE5, 0xC0, /* 0x04-0x07 */
+	0xD6, 0x63, 0xD6, 0x64, 0xD6, 0x65, 0xD6, 0x66, /* 0x08-0x0B */
+	0xD6, 0x67, 0xD6, 0x68, 0xD6, 0x69, 0xD6, 0x6A, /* 0x0C-0x0F */
+	0xD6, 0x6B, 0xD6, 0x6C, 0xD6, 0x6D, 0xD6, 0x6E, /* 0x10-0x13 */
+	0xD6, 0x6F, 0xD6, 0x70, 0xD6, 0x71, 0xD6, 0x72, /* 0x14-0x17 */
+	0xD6, 0x73, 0xD6, 0x74, 0xD6, 0x75, 0xD6, 0x76, /* 0x18-0x1B */
+	0xD6, 0x77, 0xD6, 0x78, 0xD6, 0x79, 0xD6, 0x7A, /* 0x1C-0x1F */
+	0xD6, 0x7B, 0xD6, 0x7C, 0xD6, 0x7D, 0xD6, 0x7E, /* 0x20-0x23 */
+	0xD6, 0x80, 0xD6, 0x81, 0xF6, 0xA5, 0xD6, 0x82, /* 0x24-0x27 */
+	0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6, 0x86, /* 0x28-0x2B */
+	0xD6, 0x87, 0xD6, 0x88, 0xD6, 0x89, 0xD6, 0x8A, /* 0x2C-0x2F */
+	0xD6, 0x8B, 0xD6, 0x8C, 0xD6, 0x8D, 0xD6, 0x8E, /* 0x30-0x33 */
+	0xD6, 0x8F, 0xD6, 0x90, 0xD6, 0x91, 0xD6, 0x92, /* 0x34-0x37 */
+	0xD6, 0x93, 0xD6, 0x94, 0xD6, 0x95, 0xD6, 0x96, /* 0x38-0x3B */
+	0xD6, 0x97, 0xD6, 0x98, 0xD6, 0x99, 0xD6, 0x9A, /* 0x3C-0x3F */
+	0xD6, 0x9B, 0xD6, 0x9C, 0xD6, 0x9D, 0xD6, 0x9E, /* 0x40-0x43 */
+	0xD6, 0x9F, 0xD6, 0xA0, 0xD7, 0x40, 0xD7, 0x41, /* 0x44-0x47 */
+	0xD7, 0x42, 0xD7, 0x43, 0xD7, 0x44, 0xD7, 0x45, /* 0x48-0x4B */
+	0xD7, 0x46, 0xD7, 0x47, 0xD7, 0x48, 0xD7, 0x49, /* 0x4C-0x4F */
+	0xD7, 0x4A, 0xD7, 0x4B, 0xD7, 0x4C, 0xD7, 0x4D, /* 0x50-0x53 */
+	0xD7, 0x4E, 0xD7, 0x4F, 0xD7, 0x50, 0xD7, 0x51, /* 0x54-0x57 */
+	0xD7, 0x52, 0xD7, 0x53, 0xD7, 0x54, 0xD7, 0x55, /* 0x58-0x5B */
+	0xD7, 0x56, 0xD7, 0x57, 0xD7, 0x58, 0xD7, 0x59, /* 0x5C-0x5F */
+	0xD7, 0x5A, 0xD7, 0x5B, 0xD7, 0x5C, 0xD7, 0x5D, /* 0x60-0x63 */
+	0xD7, 0x5E, 0xD7, 0x5F, 0xBE, 0xAF, 0xD7, 0x60, /* 0x64-0x67 */
+	0xD7, 0x61, 0xD7, 0x62, 0xD7, 0x63, 0xD7, 0x64, /* 0x68-0x6B */
+	0xC6, 0xA9, 0xD7, 0x65, 0xD7, 0x66, 0xD7, 0x67, /* 0x6C-0x6F */
+	0xD7, 0x68, 0xD7, 0x69, 0xD7, 0x6A, 0xD7, 0x6B, /* 0x70-0x73 */
+	0xD7, 0x6C, 0xD7, 0x6D, 0xD7, 0x6E, 0xD7, 0x6F, /* 0x74-0x77 */
+	0xD7, 0x70, 0xD7, 0x71, 0xD7, 0x72, 0xD7, 0x73, /* 0x78-0x7B */
+	0xD7, 0x74, 0xD7, 0x75, 0xD7, 0x76, 0xD7, 0x77, /* 0x7C-0x7F */
+	
+	0xD7, 0x78, 0xD7, 0x79, 0xD7, 0x7A, 0xD7, 0x7B, /* 0x80-0x83 */
+	0xD7, 0x7C, 0xD7, 0x7D, 0xD7, 0x7E, 0xD7, 0x80, /* 0x84-0x87 */
+	0xD7, 0x81, 0xD7, 0x82, 0xD7, 0x83, 0xD7, 0x84, /* 0x88-0x8B */
+	0xD7, 0x85, 0xD7, 0x86, 0xD7, 0x87, 0xD7, 0x88, /* 0x8C-0x8F */
+	0xD7, 0x89, 0xD7, 0x8A, 0xD7, 0x8B, 0xD7, 0x8C, /* 0x90-0x93 */
+	0xD7, 0x8D, 0xD7, 0x8E, 0xD7, 0x8F, 0xD7, 0x90, /* 0x94-0x97 */
+	0xD7, 0x91, 0xD7, 0x92, 0xD7, 0x93, 0xD7, 0x94, /* 0x98-0x9B */
+	0xD7, 0x95, 0xD7, 0x96, 0xD7, 0x97, 0xD7, 0x98, /* 0x9C-0x9F */
+	0xDA, 0xA5, 0xBC, 0xC6, 0xB6, 0xA9, 0xB8, 0xBC, /* 0xA0-0xA3 */
+	0xC8, 0xCF, 0xBC, 0xA5, 0xDA, 0xA6, 0xDA, 0xA7, /* 0xA4-0xA7 */
+	0xCC, 0xD6, 0xC8, 0xC3, 0xDA, 0xA8, 0xC6, 0xFD, /* 0xA8-0xAB */
+	0xD7, 0x99, 0xD1, 0xB5, 0xD2, 0xE9, 0xD1, 0xB6, /* 0xAC-0xAF */
+	0xBC, 0xC7, 0xD7, 0x9A, 0xBD, 0xB2, 0xBB, 0xE4, /* 0xB0-0xB3 */
+	0xDA, 0xA9, 0xDA, 0xAA, 0xD1, 0xC8, 0xDA, 0xAB, /* 0xB4-0xB7 */
+	0xD0, 0xED, 0xB6, 0xEF, 0xC2, 0xDB, 0xD7, 0x9B, /* 0xB8-0xBB */
+	0xCB, 0xCF, 0xB7, 0xED, 0xC9, 0xE8, 0xB7, 0xC3, /* 0xBC-0xBF */
+	0xBE, 0xF7, 0xD6, 0xA4, 0xDA, 0xAC, 0xDA, 0xAD, /* 0xC0-0xC3 */
+	0xC6, 0xC0, 0xD7, 0xE7, 0xCA, 0xB6, 0xD7, 0x9C, /* 0xC4-0xC7 */
+	0xD5, 0xA9, 0xCB, 0xDF, 0xD5, 0xEF, 0xDA, 0xAE, /* 0xC8-0xCB */
+	0xD6, 0xDF, 0xB4, 0xCA, 0xDA, 0xB0, 0xDA, 0xAF, /* 0xCC-0xCF */
+	0xD7, 0x9D, 0xD2, 0xEB, 0xDA, 0xB1, 0xDA, 0xB2, /* 0xD0-0xD3 */
+	0xDA, 0xB3, 0xCA, 0xD4, 0xDA, 0xB4, 0xCA, 0xAB, /* 0xD4-0xD7 */
+	0xDA, 0xB5, 0xDA, 0xB6, 0xB3, 0xCF, 0xD6, 0xEF, /* 0xD8-0xDB */
+	0xDA, 0xB7, 0xBB, 0xB0, 0xB5, 0xAE, 0xDA, 0xB8, /* 0xDC-0xDF */
+	0xDA, 0xB9, 0xB9, 0xEE, 0xD1, 0xAF, 0xD2, 0xE8, /* 0xE0-0xE3 */
+	0xDA, 0xBA, 0xB8, 0xC3, 0xCF, 0xEA, 0xB2, 0xEF, /* 0xE4-0xE7 */
+	0xDA, 0xBB, 0xDA, 0xBC, 0xD7, 0x9E, 0xBD, 0xEB, /* 0xE8-0xEB */
+	0xCE, 0xDC, 0xD3, 0xEF, 0xDA, 0xBD, 0xCE, 0xF3, /* 0xEC-0xEF */
+	0xDA, 0xBE, 0xD3, 0xD5, 0xBB, 0xE5, 0xDA, 0xBF, /* 0xF0-0xF3 */
+	0xCB, 0xB5, 0xCB, 0xD0, 0xDA, 0xC0, 0xC7, 0xEB, /* 0xF4-0xF7 */
+	0xD6, 0xEE, 0xDA, 0xC1, 0xC5, 0xB5, 0xB6, 0xC1, /* 0xF8-0xFB */
+	0xDA, 0xC2, 0xB7, 0xCC, 0xBF, 0xCE, 0xDA, 0xC3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8C[512] = {
+	0xDA, 0xC4, 0xCB, 0xAD, 0xDA, 0xC5, 0xB5, 0xF7, /* 0x00-0x03 */
+	0xDA, 0xC6, 0xC1, 0xC2, 0xD7, 0xBB, 0xDA, 0xC7, /* 0x04-0x07 */
+	0xCC, 0xB8, 0xD7, 0x9F, 0xD2, 0xEA, 0xC4, 0xB1, /* 0x08-0x0B */
+	0xDA, 0xC8, 0xB5, 0xFD, 0xBB, 0xD1, 0xDA, 0xC9, /* 0x0C-0x0F */
+	0xD0, 0xB3, 0xDA, 0xCA, 0xDA, 0xCB, 0xCE, 0xBD, /* 0x10-0x13 */
+	0xDA, 0xCC, 0xDA, 0xCD, 0xDA, 0xCE, 0xB2, 0xF7, /* 0x14-0x17 */
+	0xDA, 0xD1, 0xDA, 0xCF, 0xD1, 0xE8, 0xDA, 0xD0, /* 0x18-0x1B */
+	0xC3, 0xD5, 0xDA, 0xD2, 0xD7, 0xA0, 0xDA, 0xD3, /* 0x1C-0x1F */
+	0xDA, 0xD4, 0xDA, 0xD5, 0xD0, 0xBB, 0xD2, 0xA5, /* 0x20-0x23 */
+	0xB0, 0xF9, 0xDA, 0xD6, 0xC7, 0xAB, 0xDA, 0xD7, /* 0x24-0x27 */
+	0xBD, 0xF7, 0xC3, 0xA1, 0xDA, 0xD8, 0xDA, 0xD9, /* 0x28-0x2B */
+	0xC3, 0xFD, 0xCC, 0xB7, 0xDA, 0xDA, 0xDA, 0xDB, /* 0x2C-0x2F */
+	0xC0, 0xBE, 0xC6, 0xD7, 0xDA, 0xDC, 0xDA, 0xDD, /* 0x30-0x33 */
+	0xC7, 0xB4, 0xDA, 0xDE, 0xDA, 0xDF, 0xB9, 0xC8, /* 0x34-0x37 */
+	0xD8, 0x40, 0xD8, 0x41, 0xD8, 0x42, 0xD8, 0x43, /* 0x38-0x3B */
+	0xD8, 0x44, 0xD8, 0x45, 0xD8, 0x46, 0xD8, 0x47, /* 0x3C-0x3F */
+	0xD8, 0x48, 0xBB, 0xED, 0xD8, 0x49, 0xD8, 0x4A, /* 0x40-0x43 */
+	0xD8, 0x4B, 0xD8, 0x4C, 0xB6, 0xB9, 0xF4, 0xF8, /* 0x44-0x47 */
+	0xD8, 0x4D, 0xF4, 0xF9, 0xD8, 0x4E, 0xD8, 0x4F, /* 0x48-0x4B */
+	0xCD, 0xE3, 0xD8, 0x50, 0xD8, 0x51, 0xD8, 0x52, /* 0x4C-0x4F */
+	0xD8, 0x53, 0xD8, 0x54, 0xD8, 0x55, 0xD8, 0x56, /* 0x50-0x53 */
+	0xD8, 0x57, 0xF5, 0xB9, 0xD8, 0x58, 0xD8, 0x59, /* 0x54-0x57 */
+	0xD8, 0x5A, 0xD8, 0x5B, 0xEB, 0xE0, 0xD8, 0x5C, /* 0x58-0x5B */
+	0xD8, 0x5D, 0xD8, 0x5E, 0xD8, 0x5F, 0xD8, 0x60, /* 0x5C-0x5F */
+	0xD8, 0x61, 0xCF, 0xF3, 0xBB, 0xBF, 0xD8, 0x62, /* 0x60-0x63 */
+	0xD8, 0x63, 0xD8, 0x64, 0xD8, 0x65, 0xD8, 0x66, /* 0x64-0x67 */
+	0xD8, 0x67, 0xD8, 0x68, 0xBA, 0xC0, 0xD4, 0xA5, /* 0x68-0x6B */
+	0xD8, 0x69, 0xD8, 0x6A, 0xD8, 0x6B, 0xD8, 0x6C, /* 0x6C-0x6F */
+	0xD8, 0x6D, 0xD8, 0x6E, 0xD8, 0x6F, 0xE1, 0xD9, /* 0x70-0x73 */
+	0xD8, 0x70, 0xD8, 0x71, 0xD8, 0x72, 0xD8, 0x73, /* 0x74-0x77 */
+	0xF5, 0xF4, 0xB1, 0xAA, 0xB2, 0xF2, 0xD8, 0x74, /* 0x78-0x7B */
+	0xD8, 0x75, 0xD8, 0x76, 0xD8, 0x77, 0xD8, 0x78, /* 0x7C-0x7F */
+	
+	0xD8, 0x79, 0xD8, 0x7A, 0xF5, 0xF5, 0xD8, 0x7B, /* 0x80-0x83 */
+	0xD8, 0x7C, 0xF5, 0xF7, 0xD8, 0x7D, 0xD8, 0x7E, /* 0x84-0x87 */
+	0xD8, 0x80, 0xBA, 0xD1, 0xF5, 0xF6, 0xD8, 0x81, /* 0x88-0x8B */
+	0xC3, 0xB2, 0xD8, 0x82, 0xD8, 0x83, 0xD8, 0x84, /* 0x8C-0x8F */
+	0xD8, 0x85, 0xD8, 0x86, 0xD8, 0x87, 0xD8, 0x88, /* 0x90-0x93 */
+	0xF5, 0xF9, 0xD8, 0x89, 0xD8, 0x8A, 0xD8, 0x8B, /* 0x94-0x97 */
+	0xF5, 0xF8, 0xD8, 0x8C, 0xD8, 0x8D, 0xD8, 0x8E, /* 0x98-0x9B */
+	0xD8, 0x8F, 0xD8, 0x90, 0xD8, 0x91, 0xD8, 0x92, /* 0x9C-0x9F */
+	0xD8, 0x93, 0xD8, 0x94, 0xD8, 0x95, 0xD8, 0x96, /* 0xA0-0xA3 */
+	0xD8, 0x97, 0xD8, 0x98, 0xD8, 0x99, 0xD8, 0x9A, /* 0xA4-0xA7 */
+	0xD8, 0x9B, 0xD8, 0x9C, 0xD8, 0x9D, 0xD8, 0x9E, /* 0xA8-0xAB */
+	0xD8, 0x9F, 0xD8, 0xA0, 0xD9, 0x40, 0xD9, 0x41, /* 0xAC-0xAF */
+	0xD9, 0x42, 0xD9, 0x43, 0xD9, 0x44, 0xD9, 0x45, /* 0xB0-0xB3 */
+	0xD9, 0x46, 0xD9, 0x47, 0xD9, 0x48, 0xD9, 0x49, /* 0xB4-0xB7 */
+	0xD9, 0x4A, 0xD9, 0x4B, 0xD9, 0x4C, 0xD9, 0x4D, /* 0xB8-0xBB */
+	0xD9, 0x4E, 0xD9, 0x4F, 0xD9, 0x50, 0xD9, 0x51, /* 0xBC-0xBF */
+	0xD9, 0x52, 0xD9, 0x53, 0xD9, 0x54, 0xD9, 0x55, /* 0xC0-0xC3 */
+	0xD9, 0x56, 0xD9, 0x57, 0xD9, 0x58, 0xD9, 0x59, /* 0xC4-0xC7 */
+	0xD9, 0x5A, 0xD9, 0x5B, 0xD9, 0x5C, 0xD9, 0x5D, /* 0xC8-0xCB */
+	0xD9, 0x5E, 0xD9, 0x5F, 0xD9, 0x60, 0xD9, 0x61, /* 0xCC-0xCF */
+	0xD9, 0x62, 0xD9, 0x63, 0xD9, 0x64, 0xD9, 0x65, /* 0xD0-0xD3 */
+	0xD9, 0x66, 0xD9, 0x67, 0xD9, 0x68, 0xD9, 0x69, /* 0xD4-0xD7 */
+	0xD9, 0x6A, 0xD9, 0x6B, 0xD9, 0x6C, 0xD9, 0x6D, /* 0xD8-0xDB */
+	0xD9, 0x6E, 0xD9, 0x6F, 0xD9, 0x70, 0xD9, 0x71, /* 0xDC-0xDF */
+	0xD9, 0x72, 0xD9, 0x73, 0xD9, 0x74, 0xD9, 0x75, /* 0xE0-0xE3 */
+	0xD9, 0x76, 0xD9, 0x77, 0xD9, 0x78, 0xD9, 0x79, /* 0xE4-0xE7 */
+	0xD9, 0x7A, 0xD9, 0x7B, 0xD9, 0x7C, 0xD9, 0x7D, /* 0xE8-0xEB */
+	0xD9, 0x7E, 0xD9, 0x80, 0xD9, 0x81, 0xD9, 0x82, /* 0xEC-0xEF */
+	0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86, /* 0xF0-0xF3 */
+	0xD9, 0x87, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x8A, /* 0xF4-0xF7 */
+	0xD9, 0x8B, 0xD9, 0x8C, 0xD9, 0x8D, 0xD9, 0x8E, /* 0xF8-0xFB */
+	0xD9, 0x8F, 0xD9, 0x90, 0xD9, 0x91, 0xD9, 0x92, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8D[512] = {
+	0xD9, 0x93, 0xD9, 0x94, 0xD9, 0x95, 0xD9, 0x96, /* 0x00-0x03 */
+	0xD9, 0x97, 0xD9, 0x98, 0xD9, 0x99, 0xD9, 0x9A, /* 0x04-0x07 */
+	0xD9, 0x9B, 0xD9, 0x9C, 0xD9, 0x9D, 0xD9, 0x9E, /* 0x08-0x0B */
+	0xD9, 0x9F, 0xD9, 0xA0, 0xDA, 0x40, 0xDA, 0x41, /* 0x0C-0x0F */
+	0xDA, 0x42, 0xDA, 0x43, 0xDA, 0x44, 0xDA, 0x45, /* 0x10-0x13 */
+	0xDA, 0x46, 0xDA, 0x47, 0xDA, 0x48, 0xDA, 0x49, /* 0x14-0x17 */
+	0xDA, 0x4A, 0xDA, 0x4B, 0xDA, 0x4C, 0xDA, 0x4D, /* 0x18-0x1B */
+	0xDA, 0x4E, 0xB1, 0xB4, 0xD5, 0xEA, 0xB8, 0xBA, /* 0x1C-0x1F */
+	0xDA, 0x4F, 0xB9, 0xB1, 0xB2, 0xC6, 0xD4, 0xF0, /* 0x20-0x23 */
+	0xCF, 0xCD, 0xB0, 0xDC, 0xD5, 0xCB, 0xBB, 0xF5, /* 0x24-0x27 */
+	0xD6, 0xCA, 0xB7, 0xB7, 0xCC, 0xB0, 0xC6, 0xB6, /* 0x28-0x2B */
+	0xB1, 0xE1, 0xB9, 0xBA, 0xD6, 0xFC, 0xB9, 0xE1, /* 0x2C-0x2F */
+	0xB7, 0xA1, 0xBC, 0xFA, 0xEA, 0xDA, 0xEA, 0xDB, /* 0x30-0x33 */
+	0xCC, 0xF9, 0xB9, 0xF3, 0xEA, 0xDC, 0xB4, 0xFB, /* 0x34-0x37 */
+	0xC3, 0xB3, 0xB7, 0xD1, 0xBA, 0xD8, 0xEA, 0xDD, /* 0x38-0x3B */
+	0xD4, 0xF4, 0xEA, 0xDE, 0xBC, 0xD6, 0xBB, 0xDF, /* 0x3C-0x3F */
+	0xEA, 0xDF, 0xC1, 0xDE, 0xC2, 0xB8, 0xD4, 0xDF, /* 0x40-0x43 */
+	0xD7, 0xCA, 0xEA, 0xE0, 0xEA, 0xE1, 0xEA, 0xE4, /* 0x44-0x47 */
+	0xEA, 0xE2, 0xEA, 0xE3, 0xC9, 0xDE, 0xB8, 0xB3, /* 0x48-0x4B */
+	0xB6, 0xC4, 0xEA, 0xE5, 0xCA, 0xEA, 0xC9, 0xCD, /* 0x4C-0x4F */
+	0xB4, 0xCD, 0xDA, 0x50, 0xDA, 0x51, 0xE2, 0xD9, /* 0x50-0x53 */
+	0xC5, 0xE2, 0xEA, 0xE6, 0xC0, 0xB5, 0xDA, 0x52, /* 0x54-0x57 */
+	0xD7, 0xB8, 0xEA, 0xE7, 0xD7, 0xAC, 0xC8, 0xFC, /* 0x58-0x5B */
+	0xD8, 0xD3, 0xD8, 0xCD, 0xD4, 0xDE, 0xDA, 0x53, /* 0x5C-0x5F */
+	0xD4, 0xF9, 0xC9, 0xC4, 0xD3, 0xAE, 0xB8, 0xD3, /* 0x60-0x63 */
+	0xB3, 0xE0, 0xDA, 0x54, 0xC9, 0xE2, 0xF4, 0xF6, /* 0x64-0x67 */
+	0xDA, 0x55, 0xDA, 0x56, 0xDA, 0x57, 0xBA, 0xD5, /* 0x68-0x6B */
+	0xDA, 0x58, 0xF4, 0xF7, 0xDA, 0x59, 0xDA, 0x5A, /* 0x6C-0x6F */
+	0xD7, 0xDF, 0xDA, 0x5B, 0xDA, 0x5C, 0xF4, 0xF1, /* 0x70-0x73 */
+	0xB8, 0xB0, 0xD5, 0xD4, 0xB8, 0xCF, 0xC6, 0xF0, /* 0x74-0x77 */
+	0xDA, 0x5D, 0xDA, 0x5E, 0xDA, 0x5F, 0xDA, 0x60, /* 0x78-0x7B */
+	0xDA, 0x61, 0xDA, 0x62, 0xDA, 0x63, 0xDA, 0x64, /* 0x7C-0x7F */
+	
+	0xDA, 0x65, 0xB3, 0xC3, 0xDA, 0x66, 0xDA, 0x67, /* 0x80-0x83 */
+	0xF4, 0xF2, 0xB3, 0xAC, 0xDA, 0x68, 0xDA, 0x69, /* 0x84-0x87 */
+	0xDA, 0x6A, 0xDA, 0x6B, 0xD4, 0xBD, 0xC7, 0xF7, /* 0x88-0x8B */
+	0xDA, 0x6C, 0xDA, 0x6D, 0xDA, 0x6E, 0xDA, 0x6F, /* 0x8C-0x8F */
+	0xDA, 0x70, 0xF4, 0xF4, 0xDA, 0x71, 0xDA, 0x72, /* 0x90-0x93 */
+	0xF4, 0xF3, 0xDA, 0x73, 0xDA, 0x74, 0xDA, 0x75, /* 0x94-0x97 */
+	0xDA, 0x76, 0xDA, 0x77, 0xDA, 0x78, 0xDA, 0x79, /* 0x98-0x9B */
+	0xDA, 0x7A, 0xDA, 0x7B, 0xDA, 0x7C, 0xCC, 0xCB, /* 0x9C-0x9F */
+	0xDA, 0x7D, 0xDA, 0x7E, 0xDA, 0x80, 0xC8, 0xA4, /* 0xA0-0xA3 */
+	0xDA, 0x81, 0xDA, 0x82, 0xDA, 0x83, 0xDA, 0x84, /* 0xA4-0xA7 */
+	0xDA, 0x85, 0xDA, 0x86, 0xDA, 0x87, 0xDA, 0x88, /* 0xA8-0xAB */
+	0xDA, 0x89, 0xDA, 0x8A, 0xDA, 0x8B, 0xDA, 0x8C, /* 0xAC-0xAF */
+	0xDA, 0x8D, 0xF4, 0xF5, 0xDA, 0x8E, 0xD7, 0xE3, /* 0xB0-0xB3 */
+	0xC5, 0xBF, 0xF5, 0xC0, 0xDA, 0x8F, 0xDA, 0x90, /* 0xB4-0xB7 */
+	0xF5, 0xBB, 0xDA, 0x91, 0xF5, 0xC3, 0xDA, 0x92, /* 0xB8-0xBB */
+	0xF5, 0xC2, 0xDA, 0x93, 0xD6, 0xBA, 0xF5, 0xC1, /* 0xBC-0xBF */
+	0xDA, 0x94, 0xDA, 0x95, 0xDA, 0x96, 0xD4, 0xBE, /* 0xC0-0xC3 */
+	0xF5, 0xC4, 0xDA, 0x97, 0xF5, 0xCC, 0xDA, 0x98, /* 0xC4-0xC7 */
+	0xDA, 0x99, 0xDA, 0x9A, 0xDA, 0x9B, 0xB0, 0xCF, /* 0xC8-0xCB */
+	0xB5, 0xF8, 0xDA, 0x9C, 0xF5, 0xC9, 0xF5, 0xCA, /* 0xCC-0xCF */
+	0xDA, 0x9D, 0xC5, 0xDC, 0xDA, 0x9E, 0xDA, 0x9F, /* 0xD0-0xD3 */
+	0xDA, 0xA0, 0xDB, 0x40, 0xF5, 0xC5, 0xF5, 0xC6, /* 0xD4-0xD7 */
+	0xDB, 0x41, 0xDB, 0x42, 0xF5, 0xC7, 0xF5, 0xCB, /* 0xD8-0xDB */
+	0xDB, 0x43, 0xBE, 0xE0, 0xF5, 0xC8, 0xB8, 0xFA, /* 0xDC-0xDF */
+	0xDB, 0x44, 0xDB, 0x45, 0xDB, 0x46, 0xF5, 0xD0, /* 0xE0-0xE3 */
+	0xF5, 0xD3, 0xDB, 0x47, 0xDB, 0x48, 0xDB, 0x49, /* 0xE4-0xE7 */
+	0xBF, 0xE7, 0xDB, 0x4A, 0xB9, 0xF2, 0xF5, 0xBC, /* 0xE8-0xEB */
+	0xF5, 0xCD, 0xDB, 0x4B, 0xDB, 0x4C, 0xC2, 0xB7, /* 0xEC-0xEF */
+	0xDB, 0x4D, 0xDB, 0x4E, 0xDB, 0x4F, 0xCC, 0xF8, /* 0xF0-0xF3 */
+	0xDB, 0x50, 0xBC, 0xF9, 0xDB, 0x51, 0xF5, 0xCE, /* 0xF4-0xF7 */
+	0xF5, 0xCF, 0xF5, 0xD1, 0xB6, 0xE5, 0xF5, 0xD2, /* 0xF8-0xFB */
+	0xDB, 0x52, 0xF5, 0xD5, 0xDB, 0x53, 0xDB, 0x54, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8E[512] = {
+	0xDB, 0x55, 0xDB, 0x56, 0xDB, 0x57, 0xDB, 0x58, /* 0x00-0x03 */
+	0xDB, 0x59, 0xF5, 0xBD, 0xDB, 0x5A, 0xDB, 0x5B, /* 0x04-0x07 */
+	0xDB, 0x5C, 0xF5, 0xD4, 0xD3, 0xBB, 0xDB, 0x5D, /* 0x08-0x0B */
+	0xB3, 0xEC, 0xDB, 0x5E, 0xDB, 0x5F, 0xCC, 0xA4, /* 0x0C-0x0F */
+	0xDB, 0x60, 0xDB, 0x61, 0xDB, 0x62, 0xDB, 0x63, /* 0x10-0x13 */
+	0xF5, 0xD6, 0xDB, 0x64, 0xDB, 0x65, 0xDB, 0x66, /* 0x14-0x17 */
+	0xDB, 0x67, 0xDB, 0x68, 0xDB, 0x69, 0xDB, 0x6A, /* 0x18-0x1B */
+	0xDB, 0x6B, 0xF5, 0xD7, 0xBE, 0xE1, 0xF5, 0xD8, /* 0x1C-0x1F */
+	0xDB, 0x6C, 0xDB, 0x6D, 0xCC, 0xDF, 0xF5, 0xDB, /* 0x20-0x23 */
+	0xDB, 0x6E, 0xDB, 0x6F, 0xDB, 0x70, 0xDB, 0x71, /* 0x24-0x27 */
+	0xDB, 0x72, 0xB2, 0xC8, 0xD7, 0xD9, 0xDB, 0x73, /* 0x28-0x2B */
+	0xF5, 0xD9, 0xDB, 0x74, 0xF5, 0xDA, 0xF5, 0xDC, /* 0x2C-0x2F */
+	0xDB, 0x75, 0xF5, 0xE2, 0xDB, 0x76, 0xDB, 0x77, /* 0x30-0x33 */
+	0xDB, 0x78, 0xF5, 0xE0, 0xDB, 0x79, 0xDB, 0x7A, /* 0x34-0x37 */
+	0xDB, 0x7B, 0xF5, 0xDF, 0xF5, 0xDD, 0xDB, 0x7C, /* 0x38-0x3B */
+	0xDB, 0x7D, 0xF5, 0xE1, 0xDB, 0x7E, 0xDB, 0x80, /* 0x3C-0x3F */
+	0xF5, 0xDE, 0xF5, 0xE4, 0xF5, 0xE5, 0xDB, 0x81, /* 0x40-0x43 */
+	0xCC, 0xE3, 0xDB, 0x82, 0xDB, 0x83, 0xE5, 0xBF, /* 0x44-0x47 */
+	0xB5, 0xB8, 0xF5, 0xE3, 0xF5, 0xE8, 0xCC, 0xA3, /* 0x48-0x4B */
+	0xDB, 0x84, 0xDB, 0x85, 0xDB, 0x86, 0xDB, 0x87, /* 0x4C-0x4F */
+	0xDB, 0x88, 0xF5, 0xE6, 0xF5, 0xE7, 0xDB, 0x89, /* 0x50-0x53 */
+	0xDB, 0x8A, 0xDB, 0x8B, 0xDB, 0x8C, 0xDB, 0x8D, /* 0x54-0x57 */
+	0xDB, 0x8E, 0xF5, 0xBE, 0xDB, 0x8F, 0xDB, 0x90, /* 0x58-0x5B */
+	0xDB, 0x91, 0xDB, 0x92, 0xDB, 0x93, 0xDB, 0x94, /* 0x5C-0x5F */
+	0xDB, 0x95, 0xDB, 0x96, 0xDB, 0x97, 0xDB, 0x98, /* 0x60-0x63 */
+	0xDB, 0x99, 0xDB, 0x9A, 0xB1, 0xC4, 0xDB, 0x9B, /* 0x64-0x67 */
+	0xDB, 0x9C, 0xF5, 0xBF, 0xDB, 0x9D, 0xDB, 0x9E, /* 0x68-0x6B */
+	0xB5, 0xC5, 0xB2, 0xE4, 0xDB, 0x9F, 0xF5, 0xEC, /* 0x6C-0x6F */
+	0xF5, 0xE9, 0xDB, 0xA0, 0xB6, 0xD7, 0xDC, 0x40, /* 0x70-0x73 */
+	0xF5, 0xED, 0xDC, 0x41, 0xF5, 0xEA, 0xDC, 0x42, /* 0x74-0x77 */
+	0xDC, 0x43, 0xDC, 0x44, 0xDC, 0x45, 0xDC, 0x46, /* 0x78-0x7B */
+	0xF5, 0xEB, 0xDC, 0x47, 0xDC, 0x48, 0xB4, 0xDA, /* 0x7C-0x7F */
+	
+	0xDC, 0x49, 0xD4, 0xEA, 0xDC, 0x4A, 0xDC, 0x4B, /* 0x80-0x83 */
+	0xDC, 0x4C, 0xF5, 0xEE, 0xDC, 0x4D, 0xB3, 0xF9, /* 0x84-0x87 */
+	0xDC, 0x4E, 0xDC, 0x4F, 0xDC, 0x50, 0xDC, 0x51, /* 0x88-0x8B */
+	0xDC, 0x52, 0xDC, 0x53, 0xDC, 0x54, 0xF5, 0xEF, /* 0x8C-0x8F */
+	0xF5, 0xF1, 0xDC, 0x55, 0xDC, 0x56, 0xDC, 0x57, /* 0x90-0x93 */
+	0xF5, 0xF0, 0xDC, 0x58, 0xDC, 0x59, 0xDC, 0x5A, /* 0x94-0x97 */
+	0xDC, 0x5B, 0xDC, 0x5C, 0xDC, 0x5D, 0xDC, 0x5E, /* 0x98-0x9B */
+	0xF5, 0xF2, 0xDC, 0x5F, 0xF5, 0xF3, 0xDC, 0x60, /* 0x9C-0x9F */
+	0xDC, 0x61, 0xDC, 0x62, 0xDC, 0x63, 0xDC, 0x64, /* 0xA0-0xA3 */
+	0xDC, 0x65, 0xDC, 0x66, 0xDC, 0x67, 0xDC, 0x68, /* 0xA4-0xA7 */
+	0xDC, 0x69, 0xDC, 0x6A, 0xDC, 0x6B, 0xC9, 0xED, /* 0xA8-0xAB */
+	0xB9, 0xAA, 0xDC, 0x6C, 0xDC, 0x6D, 0xC7, 0xFB, /* 0xAC-0xAF */
+	0xDC, 0x6E, 0xDC, 0x6F, 0xB6, 0xE3, 0xDC, 0x70, /* 0xB0-0xB3 */
+	0xDC, 0x71, 0xDC, 0x72, 0xDC, 0x73, 0xDC, 0x74, /* 0xB4-0xB7 */
+	0xDC, 0x75, 0xDC, 0x76, 0xCC, 0xC9, 0xDC, 0x77, /* 0xB8-0xBB */
+	0xDC, 0x78, 0xDC, 0x79, 0xDC, 0x7A, 0xDC, 0x7B, /* 0xBC-0xBF */
+	0xDC, 0x7C, 0xDC, 0x7D, 0xDC, 0x7E, 0xDC, 0x80, /* 0xC0-0xC3 */
+	0xDC, 0x81, 0xDC, 0x82, 0xDC, 0x83, 0xDC, 0x84, /* 0xC4-0xC7 */
+	0xDC, 0x85, 0xDC, 0x86, 0xDC, 0x87, 0xDC, 0x88, /* 0xC8-0xCB */
+	0xDC, 0x89, 0xDC, 0x8A, 0xEA, 0xA6, 0xDC, 0x8B, /* 0xCC-0xCF */
+	0xDC, 0x8C, 0xDC, 0x8D, 0xDC, 0x8E, 0xDC, 0x8F, /* 0xD0-0xD3 */
+	0xDC, 0x90, 0xDC, 0x91, 0xDC, 0x92, 0xDC, 0x93, /* 0xD4-0xD7 */
+	0xDC, 0x94, 0xDC, 0x95, 0xDC, 0x96, 0xDC, 0x97, /* 0xD8-0xDB */
+	0xDC, 0x98, 0xDC, 0x99, 0xDC, 0x9A, 0xDC, 0x9B, /* 0xDC-0xDF */
+	0xDC, 0x9C, 0xDC, 0x9D, 0xDC, 0x9E, 0xDC, 0x9F, /* 0xE0-0xE3 */
+	0xDC, 0xA0, 0xDD, 0x40, 0xDD, 0x41, 0xDD, 0x42, /* 0xE4-0xE7 */
+	0xDD, 0x43, 0xDD, 0x44, 0xDD, 0x45, 0xDD, 0x46, /* 0xE8-0xEB */
+	0xDD, 0x47, 0xDD, 0x48, 0xDD, 0x49, 0xDD, 0x4A, /* 0xEC-0xEF */
+	0xDD, 0x4B, 0xDD, 0x4C, 0xDD, 0x4D, 0xDD, 0x4E, /* 0xF0-0xF3 */
+	0xDD, 0x4F, 0xDD, 0x50, 0xDD, 0x51, 0xDD, 0x52, /* 0xF4-0xF7 */
+	0xDD, 0x53, 0xDD, 0x54, 0xDD, 0x55, 0xDD, 0x56, /* 0xF8-0xFB */
+	0xDD, 0x57, 0xDD, 0x58, 0xDD, 0x59, 0xDD, 0x5A, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8F[512] = {
+	0xDD, 0x5B, 0xDD, 0x5C, 0xDD, 0x5D, 0xDD, 0x5E, /* 0x00-0x03 */
+	0xDD, 0x5F, 0xDD, 0x60, 0xDD, 0x61, 0xDD, 0x62, /* 0x04-0x07 */
+	0xDD, 0x63, 0xDD, 0x64, 0xDD, 0x65, 0xDD, 0x66, /* 0x08-0x0B */
+	0xDD, 0x67, 0xDD, 0x68, 0xDD, 0x69, 0xDD, 0x6A, /* 0x0C-0x0F */
+	0xDD, 0x6B, 0xDD, 0x6C, 0xDD, 0x6D, 0xDD, 0x6E, /* 0x10-0x13 */
+	0xDD, 0x6F, 0xDD, 0x70, 0xDD, 0x71, 0xDD, 0x72, /* 0x14-0x17 */
+	0xDD, 0x73, 0xDD, 0x74, 0xDD, 0x75, 0xDD, 0x76, /* 0x18-0x1B */
+	0xDD, 0x77, 0xDD, 0x78, 0xDD, 0x79, 0xDD, 0x7A, /* 0x1C-0x1F */
+	0xDD, 0x7B, 0xDD, 0x7C, 0xDD, 0x7D, 0xDD, 0x7E, /* 0x20-0x23 */
+	0xDD, 0x80, 0xDD, 0x81, 0xDD, 0x82, 0xDD, 0x83, /* 0x24-0x27 */
+	0xDD, 0x84, 0xDD, 0x85, 0xDD, 0x86, 0xDD, 0x87, /* 0x28-0x2B */
+	0xDD, 0x88, 0xDD, 0x89, 0xDD, 0x8A, 0xDD, 0x8B, /* 0x2C-0x2F */
+	0xDD, 0x8C, 0xDD, 0x8D, 0xDD, 0x8E, 0xDD, 0x8F, /* 0x30-0x33 */
+	0xDD, 0x90, 0xDD, 0x91, 0xDD, 0x92, 0xDD, 0x93, /* 0x34-0x37 */
+	0xDD, 0x94, 0xDD, 0x95, 0xDD, 0x96, 0xDD, 0x97, /* 0x38-0x3B */
+	0xDD, 0x98, 0xDD, 0x99, 0xDD, 0x9A, 0xDD, 0x9B, /* 0x3C-0x3F */
+	0xDD, 0x9C, 0xDD, 0x9D, 0xDD, 0x9E, 0xDD, 0x9F, /* 0x40-0x43 */
+	0xDD, 0xA0, 0xDE, 0x40, 0xDE, 0x41, 0xDE, 0x42, /* 0x44-0x47 */
+	0xDE, 0x43, 0xDE, 0x44, 0xDE, 0x45, 0xDE, 0x46, /* 0x48-0x4B */
+	0xDE, 0x47, 0xDE, 0x48, 0xDE, 0x49, 0xDE, 0x4A, /* 0x4C-0x4F */
+	0xDE, 0x4B, 0xDE, 0x4C, 0xDE, 0x4D, 0xDE, 0x4E, /* 0x50-0x53 */
+	0xDE, 0x4F, 0xDE, 0x50, 0xDE, 0x51, 0xDE, 0x52, /* 0x54-0x57 */
+	0xDE, 0x53, 0xDE, 0x54, 0xDE, 0x55, 0xDE, 0x56, /* 0x58-0x5B */
+	0xDE, 0x57, 0xDE, 0x58, 0xDE, 0x59, 0xDE, 0x5A, /* 0x5C-0x5F */
+	0xDE, 0x5B, 0xDE, 0x5C, 0xDE, 0x5D, 0xDE, 0x5E, /* 0x60-0x63 */
+	0xDE, 0x5F, 0xDE, 0x60, 0xB3, 0xB5, 0xD4, 0xFE, /* 0x64-0x67 */
+	0xB9, 0xEC, 0xD0, 0xF9, 0xDE, 0x61, 0xE9, 0xED, /* 0x68-0x6B */
+	0xD7, 0xAA, 0xE9, 0xEE, 0xC2, 0xD6, 0xC8, 0xED, /* 0x6C-0x6F */
+	0xBA, 0xE4, 0xE9, 0xEF, 0xE9, 0xF0, 0xE9, 0xF1, /* 0x70-0x73 */
+	0xD6, 0xE1, 0xE9, 0xF2, 0xE9, 0xF3, 0xE9, 0xF5, /* 0x74-0x77 */
+	0xE9, 0xF4, 0xE9, 0xF6, 0xE9, 0xF7, 0xC7, 0xE1, /* 0x78-0x7B */
+	0xE9, 0xF8, 0xD4, 0xD8, 0xE9, 0xF9, 0xBD, 0xCE, /* 0x7C-0x7F */
+	
+	0xDE, 0x62, 0xE9, 0xFA, 0xE9, 0xFB, 0xBD, 0xCF, /* 0x80-0x83 */
+	0xE9, 0xFC, 0xB8, 0xA8, 0xC1, 0xBE, 0xE9, 0xFD, /* 0x84-0x87 */
+	0xB1, 0xB2, 0xBB, 0xD4, 0xB9, 0xF5, 0xE9, 0xFE, /* 0x88-0x8B */
+	0xDE, 0x63, 0xEA, 0xA1, 0xEA, 0xA2, 0xEA, 0xA3, /* 0x8C-0x8F */
+	0xB7, 0xF8, 0xBC, 0xAD, 0xDE, 0x64, 0xCA, 0xE4, /* 0x90-0x93 */
+	0xE0, 0xCE, 0xD4, 0xAF, 0xCF, 0xBD, 0xD5, 0xB7, /* 0x94-0x97 */
+	0xEA, 0xA4, 0xD5, 0xDE, 0xEA, 0xA5, 0xD0, 0xC1, /* 0x98-0x9B */
+	0xB9, 0xBC, 0xDE, 0x65, 0xB4, 0xC7, 0xB1, 0xD9, /* 0x9C-0x9F */
+	0xDE, 0x66, 0xDE, 0x67, 0xDE, 0x68, 0xC0, 0xB1, /* 0xA0-0xA3 */
+	0xDE, 0x69, 0xDE, 0x6A, 0xDE, 0x6B, 0xDE, 0x6C, /* 0xA4-0xA7 */
+	0xB1, 0xE6, 0xB1, 0xE7, 0xDE, 0x6D, 0xB1, 0xE8, /* 0xA8-0xAB */
+	0xDE, 0x6E, 0xDE, 0x6F, 0xDE, 0x70, 0xDE, 0x71, /* 0xAC-0xAF */
+	0xB3, 0xBD, 0xC8, 0xE8, 0xDE, 0x72, 0xDE, 0x73, /* 0xB0-0xB3 */
+	0xDE, 0x74, 0xDE, 0x75, 0xE5, 0xC1, 0xDE, 0x76, /* 0xB4-0xB7 */
+	0xDE, 0x77, 0xB1, 0xDF, 0xDE, 0x78, 0xDE, 0x79, /* 0xB8-0xBB */
+	0xDE, 0x7A, 0xC1, 0xC9, 0xB4, 0xEF, 0xDE, 0x7B, /* 0xBC-0xBF */
+	0xDE, 0x7C, 0xC7, 0xA8, 0xD3, 0xD8, 0xDE, 0x7D, /* 0xC0-0xC3 */
+	0xC6, 0xF9, 0xD1, 0xB8, 0xDE, 0x7E, 0xB9, 0xFD, /* 0xC4-0xC7 */
+	0xC2, 0xF5, 0xDE, 0x80, 0xDE, 0x81, 0xDE, 0x82, /* 0xC8-0xCB */
+	0xDE, 0x83, 0xDE, 0x84, 0xD3, 0xAD, 0xDE, 0x85, /* 0xCC-0xCF */
+	0xD4, 0xCB, 0xBD, 0xFC, 0xDE, 0x86, 0xE5, 0xC2, /* 0xD0-0xD3 */
+	0xB7, 0xB5, 0xE5, 0xC3, 0xDE, 0x87, 0xDE, 0x88, /* 0xD4-0xD7 */
+	0xBB, 0xB9, 0xD5, 0xE2, 0xDE, 0x89, 0xBD, 0xF8, /* 0xD8-0xDB */
+	0xD4, 0xB6, 0xCE, 0xA5, 0xC1, 0xAC, 0xB3, 0xD9, /* 0xDC-0xDF */
+	0xDE, 0x8A, 0xDE, 0x8B, 0xCC, 0xF6, 0xDE, 0x8C, /* 0xE0-0xE3 */
+	0xE5, 0xC6, 0xE5, 0xC4, 0xE5, 0xC8, 0xDE, 0x8D, /* 0xE4-0xE7 */
+	0xE5, 0xCA, 0xE5, 0xC7, 0xB5, 0xCF, 0xC6, 0xC8, /* 0xE8-0xEB */
+	0xDE, 0x8E, 0xB5, 0xFC, 0xE5, 0xC5, 0xDE, 0x8F, /* 0xEC-0xEF */
+	0xCA, 0xF6, 0xDE, 0x90, 0xDE, 0x91, 0xE5, 0xC9, /* 0xF0-0xF3 */
+	0xDE, 0x92, 0xDE, 0x93, 0xDE, 0x94, 0xC3, 0xD4, /* 0xF4-0xF7 */
+	0xB1, 0xC5, 0xBC, 0xA3, 0xDE, 0x95, 0xDE, 0x96, /* 0xF8-0xFB */
+	0xDE, 0x97, 0xD7, 0xB7, 0xDE, 0x98, 0xDE, 0x99, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_90[512] = {
+	0xCD, 0xCB, 0xCB, 0xCD, 0xCA, 0xCA, 0xCC, 0xD3, /* 0x00-0x03 */
+	0xE5, 0xCC, 0xE5, 0xCB, 0xC4, 0xE6, 0xDE, 0x9A, /* 0x04-0x07 */
+	0xDE, 0x9B, 0xD1, 0xA1, 0xD1, 0xB7, 0xE5, 0xCD, /* 0x08-0x0B */
+	0xDE, 0x9C, 0xE5, 0xD0, 0xDE, 0x9D, 0xCD, 0xB8, /* 0x0C-0x0F */
+	0xD6, 0xF0, 0xE5, 0xCF, 0xB5, 0xDD, 0xDE, 0x9E, /* 0x10-0x13 */
+	0xCD, 0xBE, 0xDE, 0x9F, 0xE5, 0xD1, 0xB6, 0xBA, /* 0x14-0x17 */
+	0xDE, 0xA0, 0xDF, 0x40, 0xCD, 0xA8, 0xB9, 0xE4, /* 0x18-0x1B */
+	0xDF, 0x41, 0xCA, 0xC5, 0xB3, 0xD1, 0xCB, 0xD9, /* 0x1C-0x1F */
+	0xD4, 0xEC, 0xE5, 0xD2, 0xB7, 0xEA, 0xDF, 0x42, /* 0x20-0x23 */
+	0xDF, 0x43, 0xDF, 0x44, 0xE5, 0xCE, 0xDF, 0x45, /* 0x24-0x27 */
+	0xDF, 0x46, 0xDF, 0x47, 0xDF, 0x48, 0xDF, 0x49, /* 0x28-0x2B */
+	0xDF, 0x4A, 0xE5, 0xD5, 0xB4, 0xFE, 0xE5, 0xD6, /* 0x2C-0x2F */
+	0xDF, 0x4B, 0xDF, 0x4C, 0xDF, 0x4D, 0xDF, 0x4E, /* 0x30-0x33 */
+	0xDF, 0x4F, 0xE5, 0xD3, 0xE5, 0xD4, 0xDF, 0x50, /* 0x34-0x37 */
+	0xD2, 0xDD, 0xDF, 0x51, 0xDF, 0x52, 0xC2, 0xDF, /* 0x38-0x3B */
+	0xB1, 0xC6, 0xDF, 0x53, 0xD3, 0xE2, 0xDF, 0x54, /* 0x3C-0x3F */
+	0xDF, 0x55, 0xB6, 0xDD, 0xCB, 0xEC, 0xDF, 0x56, /* 0x40-0x43 */
+	0xE5, 0xD7, 0xDF, 0x57, 0xDF, 0x58, 0xD3, 0xF6, /* 0x44-0x47 */
+	0xDF, 0x59, 0xDF, 0x5A, 0xDF, 0x5B, 0xDF, 0x5C, /* 0x48-0x4B */
+	0xDF, 0x5D, 0xB1, 0xE9, 0xDF, 0x5E, 0xB6, 0xF4, /* 0x4C-0x4F */
+	0xE5, 0xDA, 0xE5, 0xD8, 0xE5, 0xD9, 0xB5, 0xC0, /* 0x50-0x53 */
+	0xDF, 0x5F, 0xDF, 0x60, 0xDF, 0x61, 0xD2, 0xC5, /* 0x54-0x57 */
+	0xE5, 0xDC, 0xDF, 0x62, 0xDF, 0x63, 0xE5, 0xDE, /* 0x58-0x5B */
+	0xDF, 0x64, 0xDF, 0x65, 0xDF, 0x66, 0xDF, 0x67, /* 0x5C-0x5F */
+	0xDF, 0x68, 0xDF, 0x69, 0xE5, 0xDD, 0xC7, 0xB2, /* 0x60-0x63 */
+	0xDF, 0x6A, 0xD2, 0xA3, 0xDF, 0x6B, 0xDF, 0x6C, /* 0x64-0x67 */
+	0xE5, 0xDB, 0xDF, 0x6D, 0xDF, 0x6E, 0xDF, 0x6F, /* 0x68-0x6B */
+	0xDF, 0x70, 0xD4, 0xE2, 0xD5, 0xDA, 0xDF, 0x71, /* 0x6C-0x6F */
+	0xDF, 0x72, 0xDF, 0x73, 0xDF, 0x74, 0xDF, 0x75, /* 0x70-0x73 */
+	0xE5, 0xE0, 0xD7, 0xF1, 0xDF, 0x76, 0xDF, 0x77, /* 0x74-0x77 */
+	0xDF, 0x78, 0xDF, 0x79, 0xDF, 0x7A, 0xDF, 0x7B, /* 0x78-0x7B */
+	0xDF, 0x7C, 0xE5, 0xE1, 0xDF, 0x7D, 0xB1, 0xDC, /* 0x7C-0x7F */
+	
+	0xD1, 0xFB, 0xDF, 0x7E, 0xE5, 0xE2, 0xE5, 0xE4, /* 0x80-0x83 */
+	0xDF, 0x80, 0xDF, 0x81, 0xDF, 0x82, 0xDF, 0x83, /* 0x84-0x87 */
+	0xE5, 0xE3, 0xDF, 0x84, 0xDF, 0x85, 0xE5, 0xE5, /* 0x88-0x8B */
+	0xDF, 0x86, 0xDF, 0x87, 0xDF, 0x88, 0xDF, 0x89, /* 0x8C-0x8F */
+	0xDF, 0x8A, 0xD2, 0xD8, 0xDF, 0x8B, 0xB5, 0xCB, /* 0x90-0x93 */
+	0xDF, 0x8C, 0xE7, 0xDF, 0xDF, 0x8D, 0xDA, 0xF5, /* 0x94-0x97 */
+	0xDF, 0x8E, 0xDA, 0xF8, 0xDF, 0x8F, 0xDA, 0xF6, /* 0x98-0x9B */
+	0xDF, 0x90, 0xDA, 0xF7, 0xDF, 0x91, 0xDF, 0x92, /* 0x9C-0x9F */
+	0xDF, 0x93, 0xDA, 0xFA, 0xD0, 0xCF, 0xC4, 0xC7, /* 0xA0-0xA3 */
+	0xDF, 0x94, 0xDF, 0x95, 0xB0, 0xEE, 0xDF, 0x96, /* 0xA4-0xA7 */
+	0xDF, 0x97, 0xDF, 0x98, 0xD0, 0xB0, 0xDF, 0x99, /* 0xA8-0xAB */
+	0xDA, 0xF9, 0xDF, 0x9A, 0xD3, 0xCA, 0xBA, 0xAA, /* 0xAC-0xAF */
+	0xDB, 0xA2, 0xC7, 0xF1, 0xDF, 0x9B, 0xDA, 0xFC, /* 0xB0-0xB3 */
+	0xDA, 0xFB, 0xC9, 0xDB, 0xDA, 0xFD, 0xDF, 0x9C, /* 0xB4-0xB7 */
+	0xDB, 0xA1, 0xD7, 0xDE, 0xDA, 0xFE, 0xC1, 0xDA, /* 0xB8-0xBB */
+	0xDF, 0x9D, 0xDF, 0x9E, 0xDB, 0xA5, 0xDF, 0x9F, /* 0xBC-0xBF */
+	0xDF, 0xA0, 0xD3, 0xF4, 0xE0, 0x40, 0xE0, 0x41, /* 0xC0-0xC3 */
+	0xDB, 0xA7, 0xDB, 0xA4, 0xE0, 0x42, 0xDB, 0xA8, /* 0xC4-0xC7 */
+	0xE0, 0x43, 0xE0, 0x44, 0xBD, 0xBC, 0xE0, 0x45, /* 0xC8-0xCB */
+	0xE0, 0x46, 0xE0, 0x47, 0xC0, 0xC9, 0xDB, 0xA3, /* 0xCC-0xCF */
+	0xDB, 0xA6, 0xD6, 0xA3, 0xE0, 0x48, 0xDB, 0xA9, /* 0xD0-0xD3 */
+	0xE0, 0x49, 0xE0, 0x4A, 0xE0, 0x4B, 0xDB, 0xAD, /* 0xD4-0xD7 */
+	0xE0, 0x4C, 0xE0, 0x4D, 0xE0, 0x4E, 0xDB, 0xAE, /* 0xD8-0xDB */
+	0xDB, 0xAC, 0xBA, 0xC2, 0xE0, 0x4F, 0xE0, 0x50, /* 0xDC-0xDF */
+	0xE0, 0x51, 0xBF, 0xA4, 0xDB, 0xAB, 0xE0, 0x52, /* 0xE0-0xE3 */
+	0xE0, 0x53, 0xE0, 0x54, 0xDB, 0xAA, 0xD4, 0xC7, /* 0xE4-0xE7 */
+	0xB2, 0xBF, 0xE0, 0x55, 0xE0, 0x56, 0xDB, 0xAF, /* 0xE8-0xEB */
+	0xE0, 0x57, 0xB9, 0xF9, 0xE0, 0x58, 0xDB, 0xB0, /* 0xEC-0xEF */
+	0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x5B, 0xE0, 0x5C, /* 0xF0-0xF3 */
+	0xB3, 0xBB, 0xE0, 0x5D, 0xE0, 0x5E, 0xE0, 0x5F, /* 0xF4-0xF7 */
+	0xB5, 0xA6, 0xE0, 0x60, 0xE0, 0x61, 0xE0, 0x62, /* 0xF8-0xFB */
+	0xE0, 0x63, 0xB6, 0xBC, 0xDB, 0xB1, 0xE0, 0x64, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_91[512] = {
+	0xE0, 0x65, 0xE0, 0x66, 0xB6, 0xF5, 0xE0, 0x67, /* 0x00-0x03 */
+	0xDB, 0xB2, 0xE0, 0x68, 0xE0, 0x69, 0xE0, 0x6A, /* 0x04-0x07 */
+	0xE0, 0x6B, 0xE0, 0x6C, 0xE0, 0x6D, 0xE0, 0x6E, /* 0x08-0x0B */
+	0xE0, 0x6F, 0xE0, 0x70, 0xE0, 0x71, 0xE0, 0x72, /* 0x0C-0x0F */
+	0xE0, 0x73, 0xE0, 0x74, 0xE0, 0x75, 0xE0, 0x76, /* 0x10-0x13 */
+	0xE0, 0x77, 0xE0, 0x78, 0xE0, 0x79, 0xE0, 0x7A, /* 0x14-0x17 */
+	0xE0, 0x7B, 0xB1, 0xC9, 0xE0, 0x7C, 0xE0, 0x7D, /* 0x18-0x1B */
+	0xE0, 0x7E, 0xE0, 0x80, 0xDB, 0xB4, 0xE0, 0x81, /* 0x1C-0x1F */
+	0xE0, 0x82, 0xE0, 0x83, 0xDB, 0xB3, 0xDB, 0xB5, /* 0x20-0x23 */
+	0xE0, 0x84, 0xE0, 0x85, 0xE0, 0x86, 0xE0, 0x87, /* 0x24-0x27 */
+	0xE0, 0x88, 0xE0, 0x89, 0xE0, 0x8A, 0xE0, 0x8B, /* 0x28-0x2B */
+	0xE0, 0x8C, 0xE0, 0x8D, 0xE0, 0x8E, 0xDB, 0xB7, /* 0x2C-0x2F */
+	0xE0, 0x8F, 0xDB, 0xB6, 0xE0, 0x90, 0xE0, 0x91, /* 0x30-0x33 */
+	0xE0, 0x92, 0xE0, 0x93, 0xE0, 0x94, 0xE0, 0x95, /* 0x34-0x37 */
+	0xE0, 0x96, 0xDB, 0xB8, 0xE0, 0x97, 0xE0, 0x98, /* 0x38-0x3B */
+	0xE0, 0x99, 0xE0, 0x9A, 0xE0, 0x9B, 0xE0, 0x9C, /* 0x3C-0x3F */
+	0xE0, 0x9D, 0xE0, 0x9E, 0xE0, 0x9F, 0xDB, 0xB9, /* 0x40-0x43 */
+	0xE0, 0xA0, 0xE1, 0x40, 0xDB, 0xBA, 0xE1, 0x41, /* 0x44-0x47 */
+	0xE1, 0x42, 0xD3, 0xCF, 0xF4, 0xFA, 0xC7, 0xF5, /* 0x48-0x4B */
+	0xD7, 0xC3, 0xC5, 0xE4, 0xF4, 0xFC, 0xF4, 0xFD, /* 0x4C-0x4F */
+	0xF4, 0xFB, 0xE1, 0x43, 0xBE, 0xC6, 0xE1, 0x44, /* 0x50-0x53 */
+	0xE1, 0x45, 0xE1, 0x46, 0xE1, 0x47, 0xD0, 0xEF, /* 0x54-0x57 */
+	0xE1, 0x48, 0xE1, 0x49, 0xB7, 0xD3, 0xE1, 0x4A, /* 0x58-0x5B */
+	0xE1, 0x4B, 0xD4, 0xCD, 0xCC, 0xAA, 0xE1, 0x4C, /* 0x5C-0x5F */
+	0xE1, 0x4D, 0xF5, 0xA2, 0xF5, 0xA1, 0xBA, 0xA8, /* 0x60-0x63 */
+	0xF4, 0xFE, 0xCB, 0xD6, 0xE1, 0x4E, 0xE1, 0x4F, /* 0x64-0x67 */
+	0xE1, 0x50, 0xF5, 0xA4, 0xC0, 0xD2, 0xE1, 0x51, /* 0x68-0x6B */
+	0xB3, 0xEA, 0xE1, 0x52, 0xCD, 0xAA, 0xF5, 0xA5, /* 0x6C-0x6F */
+	0xF5, 0xA3, 0xBD, 0xB4, 0xF5, 0xA8, 0xE1, 0x53, /* 0x70-0x73 */
+	0xF5, 0xA9, 0xBD, 0xCD, 0xC3, 0xB8, 0xBF, 0xE1, /* 0x74-0x77 */
+	0xCB, 0xE1, 0xF5, 0xAA, 0xE1, 0x54, 0xE1, 0x55, /* 0x78-0x7B */
+	0xE1, 0x56, 0xF5, 0xA6, 0xF5, 0xA7, 0xC4, 0xF0, /* 0x7C-0x7F */
+	
+	0xE1, 0x57, 0xE1, 0x58, 0xE1, 0x59, 0xE1, 0x5A, /* 0x80-0x83 */
+	0xE1, 0x5B, 0xF5, 0xAC, 0xE1, 0x5C, 0xB4, 0xBC, /* 0x84-0x87 */
+	0xE1, 0x5D, 0xD7, 0xED, 0xE1, 0x5E, 0xB4, 0xD7, /* 0x88-0x8B */
+	0xF5, 0xAB, 0xF5, 0xAE, 0xE1, 0x5F, 0xE1, 0x60, /* 0x8C-0x8F */
+	0xF5, 0xAD, 0xF5, 0xAF, 0xD0, 0xD1, 0xE1, 0x61, /* 0x90-0x93 */
+	0xE1, 0x62, 0xE1, 0x63, 0xE1, 0x64, 0xE1, 0x65, /* 0x94-0x97 */
+	0xE1, 0x66, 0xE1, 0x67, 0xC3, 0xD1, 0xC8, 0xA9, /* 0x98-0x9B */
+	0xE1, 0x68, 0xE1, 0x69, 0xE1, 0x6A, 0xE1, 0x6B, /* 0x9C-0x9F */
+	0xE1, 0x6C, 0xE1, 0x6D, 0xF5, 0xB0, 0xF5, 0xB1, /* 0xA0-0xA3 */
+	0xE1, 0x6E, 0xE1, 0x6F, 0xE1, 0x70, 0xE1, 0x71, /* 0xA4-0xA7 */
+	0xE1, 0x72, 0xE1, 0x73, 0xF5, 0xB2, 0xE1, 0x74, /* 0xA8-0xAB */
+	0xE1, 0x75, 0xF5, 0xB3, 0xF5, 0xB4, 0xF5, 0xB5, /* 0xAC-0xAF */
+	0xE1, 0x76, 0xE1, 0x77, 0xE1, 0x78, 0xE1, 0x79, /* 0xB0-0xB3 */
+	0xF5, 0xB7, 0xF5, 0xB6, 0xE1, 0x7A, 0xE1, 0x7B, /* 0xB4-0xB7 */
+	0xE1, 0x7C, 0xE1, 0x7D, 0xF5, 0xB8, 0xE1, 0x7E, /* 0xB8-0xBB */
+	0xE1, 0x80, 0xE1, 0x81, 0xE1, 0x82, 0xE1, 0x83, /* 0xBC-0xBF */
+	0xE1, 0x84, 0xE1, 0x85, 0xE1, 0x86, 0xE1, 0x87, /* 0xC0-0xC3 */
+	0xE1, 0x88, 0xE1, 0x89, 0xE1, 0x8A, 0xB2, 0xC9, /* 0xC4-0xC7 */
+	0xE1, 0x8B, 0xD3, 0xD4, 0xCA, 0xCD, 0xE1, 0x8C, /* 0xC8-0xCB */
+	0xC0, 0xEF, 0xD6, 0xD8, 0xD2, 0xB0, 0xC1, 0xBF, /* 0xCC-0xCF */
+	0xE1, 0x8D, 0xBD, 0xF0, 0xE1, 0x8E, 0xE1, 0x8F, /* 0xD0-0xD3 */
+	0xE1, 0x90, 0xE1, 0x91, 0xE1, 0x92, 0xE1, 0x93, /* 0xD4-0xD7 */
+	0xE1, 0x94, 0xE1, 0x95, 0xE1, 0x96, 0xE1, 0x97, /* 0xD8-0xDB */
+	0xB8, 0xAA, 0xE1, 0x98, 0xE1, 0x99, 0xE1, 0x9A, /* 0xDC-0xDF */
+	0xE1, 0x9B, 0xE1, 0x9C, 0xE1, 0x9D, 0xE1, 0x9E, /* 0xE0-0xE3 */
+	0xE1, 0x9F, 0xE1, 0xA0, 0xE2, 0x40, 0xE2, 0x41, /* 0xE4-0xE7 */
+	0xE2, 0x42, 0xE2, 0x43, 0xE2, 0x44, 0xE2, 0x45, /* 0xE8-0xEB */
+	0xE2, 0x46, 0xE2, 0x47, 0xE2, 0x48, 0xE2, 0x49, /* 0xEC-0xEF */
+	0xE2, 0x4A, 0xE2, 0x4B, 0xE2, 0x4C, 0xE2, 0x4D, /* 0xF0-0xF3 */
+	0xE2, 0x4E, 0xE2, 0x4F, 0xE2, 0x50, 0xE2, 0x51, /* 0xF4-0xF7 */
+	0xE2, 0x52, 0xE2, 0x53, 0xE2, 0x54, 0xE2, 0x55, /* 0xF8-0xFB */
+	0xE2, 0x56, 0xE2, 0x57, 0xE2, 0x58, 0xE2, 0x59, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_92[512] = {
+	0xE2, 0x5A, 0xE2, 0x5B, 0xE2, 0x5C, 0xE2, 0x5D, /* 0x00-0x03 */
+	0xE2, 0x5E, 0xE2, 0x5F, 0xE2, 0x60, 0xE2, 0x61, /* 0x04-0x07 */
+	0xE2, 0x62, 0xE2, 0x63, 0xE2, 0x64, 0xE2, 0x65, /* 0x08-0x0B */
+	0xE2, 0x66, 0xE2, 0x67, 0xE2, 0x68, 0xE2, 0x69, /* 0x0C-0x0F */
+	0xE2, 0x6A, 0xE2, 0x6B, 0xE2, 0x6C, 0xE2, 0x6D, /* 0x10-0x13 */
+	0xE2, 0x6E, 0xE2, 0x6F, 0xE2, 0x70, 0xE2, 0x71, /* 0x14-0x17 */
+	0xE2, 0x72, 0xE2, 0x73, 0xE2, 0x74, 0xE2, 0x75, /* 0x18-0x1B */
+	0xE2, 0x76, 0xE2, 0x77, 0xE2, 0x78, 0xE2, 0x79, /* 0x1C-0x1F */
+	0xE2, 0x7A, 0xE2, 0x7B, 0xE2, 0x7C, 0xE2, 0x7D, /* 0x20-0x23 */
+	0xE2, 0x7E, 0xE2, 0x80, 0xE2, 0x81, 0xE2, 0x82, /* 0x24-0x27 */
+	0xE2, 0x83, 0xE2, 0x84, 0xE2, 0x85, 0xE2, 0x86, /* 0x28-0x2B */
+	0xE2, 0x87, 0xE2, 0x88, 0xE2, 0x89, 0xE2, 0x8A, /* 0x2C-0x2F */
+	0xE2, 0x8B, 0xE2, 0x8C, 0xE2, 0x8D, 0xE2, 0x8E, /* 0x30-0x33 */
+	0xE2, 0x8F, 0xE2, 0x90, 0xE2, 0x91, 0xE2, 0x92, /* 0x34-0x37 */
+	0xE2, 0x93, 0xE2, 0x94, 0xE2, 0x95, 0xE2, 0x96, /* 0x38-0x3B */
+	0xE2, 0x97, 0xE2, 0x98, 0xE2, 0x99, 0xE2, 0x9A, /* 0x3C-0x3F */
+	0xE2, 0x9B, 0xE2, 0x9C, 0xE2, 0x9D, 0xE2, 0x9E, /* 0x40-0x43 */
+	0xE2, 0x9F, 0xE2, 0xA0, 0xE3, 0x40, 0xE3, 0x41, /* 0x44-0x47 */
+	0xE3, 0x42, 0xE3, 0x43, 0xE3, 0x44, 0xE3, 0x45, /* 0x48-0x4B */
+	0xE3, 0x46, 0xE3, 0x47, 0xE3, 0x48, 0xE3, 0x49, /* 0x4C-0x4F */
+	0xE3, 0x4A, 0xE3, 0x4B, 0xE3, 0x4C, 0xE3, 0x4D, /* 0x50-0x53 */
+	0xE3, 0x4E, 0xE3, 0x4F, 0xE3, 0x50, 0xE3, 0x51, /* 0x54-0x57 */
+	0xE3, 0x52, 0xE3, 0x53, 0xE3, 0x54, 0xE3, 0x55, /* 0x58-0x5B */
+	0xE3, 0x56, 0xE3, 0x57, 0xE3, 0x58, 0xE3, 0x59, /* 0x5C-0x5F */
+	0xE3, 0x5A, 0xE3, 0x5B, 0xE3, 0x5C, 0xE3, 0x5D, /* 0x60-0x63 */
+	0xE3, 0x5E, 0xE3, 0x5F, 0xE3, 0x60, 0xE3, 0x61, /* 0x64-0x67 */
+	0xE3, 0x62, 0xE3, 0x63, 0xE3, 0x64, 0xE3, 0x65, /* 0x68-0x6B */
+	0xE3, 0x66, 0xE3, 0x67, 0xE3, 0x68, 0xE3, 0x69, /* 0x6C-0x6F */
+	0xE3, 0x6A, 0xE3, 0x6B, 0xE3, 0x6C, 0xE3, 0x6D, /* 0x70-0x73 */
+	0xBC, 0xF8, 0xE3, 0x6E, 0xE3, 0x6F, 0xE3, 0x70, /* 0x74-0x77 */
+	0xE3, 0x71, 0xE3, 0x72, 0xE3, 0x73, 0xE3, 0x74, /* 0x78-0x7B */
+	0xE3, 0x75, 0xE3, 0x76, 0xE3, 0x77, 0xE3, 0x78, /* 0x7C-0x7F */
+	
+	0xE3, 0x79, 0xE3, 0x7A, 0xE3, 0x7B, 0xE3, 0x7C, /* 0x80-0x83 */
+	0xE3, 0x7D, 0xE3, 0x7E, 0xE3, 0x80, 0xE3, 0x81, /* 0x84-0x87 */
+	0xE3, 0x82, 0xE3, 0x83, 0xE3, 0x84, 0xE3, 0x85, /* 0x88-0x8B */
+	0xE3, 0x86, 0xE3, 0x87, 0xF6, 0xC6, 0xE3, 0x88, /* 0x8C-0x8F */
+	0xE3, 0x89, 0xE3, 0x8A, 0xE3, 0x8B, 0xE3, 0x8C, /* 0x90-0x93 */
+	0xE3, 0x8D, 0xE3, 0x8E, 0xE3, 0x8F, 0xE3, 0x90, /* 0x94-0x97 */
+	0xE3, 0x91, 0xE3, 0x92, 0xE3, 0x93, 0xE3, 0x94, /* 0x98-0x9B */
+	0xE3, 0x95, 0xE3, 0x96, 0xE3, 0x97, 0xE3, 0x98, /* 0x9C-0x9F */
+	0xE3, 0x99, 0xE3, 0x9A, 0xE3, 0x9B, 0xE3, 0x9C, /* 0xA0-0xA3 */
+	0xE3, 0x9D, 0xE3, 0x9E, 0xE3, 0x9F, 0xE3, 0xA0, /* 0xA4-0xA7 */
+	0xE4, 0x40, 0xE4, 0x41, 0xE4, 0x42, 0xE4, 0x43, /* 0xA8-0xAB */
+	0xE4, 0x44, 0xE4, 0x45, 0xF6, 0xC7, 0xE4, 0x46, /* 0xAC-0xAF */
+	0xE4, 0x47, 0xE4, 0x48, 0xE4, 0x49, 0xE4, 0x4A, /* 0xB0-0xB3 */
+	0xE4, 0x4B, 0xE4, 0x4C, 0xE4, 0x4D, 0xE4, 0x4E, /* 0xB4-0xB7 */
+	0xE4, 0x4F, 0xE4, 0x50, 0xE4, 0x51, 0xE4, 0x52, /* 0xB8-0xBB */
+	0xE4, 0x53, 0xE4, 0x54, 0xE4, 0x55, 0xE4, 0x56, /* 0xBC-0xBF */
+	0xE4, 0x57, 0xE4, 0x58, 0xE4, 0x59, 0xE4, 0x5A, /* 0xC0-0xC3 */
+	0xE4, 0x5B, 0xE4, 0x5C, 0xE4, 0x5D, 0xE4, 0x5E, /* 0xC4-0xC7 */
+	0xF6, 0xC8, 0xE4, 0x5F, 0xE4, 0x60, 0xE4, 0x61, /* 0xC8-0xCB */
+	0xE4, 0x62, 0xE4, 0x63, 0xE4, 0x64, 0xE4, 0x65, /* 0xCC-0xCF */
+	0xE4, 0x66, 0xE4, 0x67, 0xE4, 0x68, 0xE4, 0x69, /* 0xD0-0xD3 */
+	0xE4, 0x6A, 0xE4, 0x6B, 0xE4, 0x6C, 0xE4, 0x6D, /* 0xD4-0xD7 */
+	0xE4, 0x6E, 0xE4, 0x6F, 0xE4, 0x70, 0xE4, 0x71, /* 0xD8-0xDB */
+	0xE4, 0x72, 0xE4, 0x73, 0xE4, 0x74, 0xE4, 0x75, /* 0xDC-0xDF */
+	0xE4, 0x76, 0xE4, 0x77, 0xE4, 0x78, 0xE4, 0x79, /* 0xE0-0xE3 */
+	0xE4, 0x7A, 0xE4, 0x7B, 0xE4, 0x7C, 0xE4, 0x7D, /* 0xE4-0xE7 */
+	0xE4, 0x7E, 0xE4, 0x80, 0xE4, 0x81, 0xE4, 0x82, /* 0xE8-0xEB */
+	0xE4, 0x83, 0xE4, 0x84, 0xE4, 0x85, 0xE4, 0x86, /* 0xEC-0xEF */
+	0xE4, 0x87, 0xE4, 0x88, 0xE4, 0x89, 0xE4, 0x8A, /* 0xF0-0xF3 */
+	0xE4, 0x8B, 0xE4, 0x8C, 0xE4, 0x8D, 0xE4, 0x8E, /* 0xF4-0xF7 */
+	0xE4, 0x8F, 0xE4, 0x90, 0xE4, 0x91, 0xE4, 0x92, /* 0xF8-0xFB */
+	0xE4, 0x93, 0xE4, 0x94, 0xE4, 0x95, 0xE4, 0x96, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_93[512] = {
+	0xE4, 0x97, 0xE4, 0x98, 0xE4, 0x99, 0xE4, 0x9A, /* 0x00-0x03 */
+	0xE4, 0x9B, 0xE4, 0x9C, 0xE4, 0x9D, 0xE4, 0x9E, /* 0x04-0x07 */
+	0xE4, 0x9F, 0xE4, 0xA0, 0xE5, 0x40, 0xE5, 0x41, /* 0x08-0x0B */
+	0xE5, 0x42, 0xE5, 0x43, 0xE5, 0x44, 0xE5, 0x45, /* 0x0C-0x0F */
+	0xE5, 0x46, 0xE5, 0x47, 0xE5, 0x48, 0xE5, 0x49, /* 0x10-0x13 */
+	0xE5, 0x4A, 0xE5, 0x4B, 0xE5, 0x4C, 0xE5, 0x4D, /* 0x14-0x17 */
+	0xE5, 0x4E, 0xE5, 0x4F, 0xE5, 0x50, 0xE5, 0x51, /* 0x18-0x1B */
+	0xE5, 0x52, 0xE5, 0x53, 0xE5, 0x54, 0xE5, 0x55, /* 0x1C-0x1F */
+	0xE5, 0x56, 0xE5, 0x57, 0xE5, 0x58, 0xE5, 0x59, /* 0x20-0x23 */
+	0xE5, 0x5A, 0xE5, 0x5B, 0xE5, 0x5C, 0xE5, 0x5D, /* 0x24-0x27 */
+	0xE5, 0x5E, 0xE5, 0x5F, 0xE5, 0x60, 0xE5, 0x61, /* 0x28-0x2B */
+	0xE5, 0x62, 0xE5, 0x63, 0xE5, 0x64, 0xE5, 0x65, /* 0x2C-0x2F */
+	0xE5, 0x66, 0xE5, 0x67, 0xE5, 0x68, 0xE5, 0x69, /* 0x30-0x33 */
+	0xE5, 0x6A, 0xE5, 0x6B, 0xE5, 0x6C, 0xE5, 0x6D, /* 0x34-0x37 */
+	0xE5, 0x6E, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x71, /* 0x38-0x3B */
+	0xE5, 0x72, 0xE5, 0x73, 0xF6, 0xC9, 0xE5, 0x74, /* 0x3C-0x3F */
+	0xE5, 0x75, 0xE5, 0x76, 0xE5, 0x77, 0xE5, 0x78, /* 0x40-0x43 */
+	0xE5, 0x79, 0xE5, 0x7A, 0xE5, 0x7B, 0xE5, 0x7C, /* 0x44-0x47 */
+	0xE5, 0x7D, 0xE5, 0x7E, 0xE5, 0x80, 0xE5, 0x81, /* 0x48-0x4B */
+	0xE5, 0x82, 0xE5, 0x83, 0xE5, 0x84, 0xE5, 0x85, /* 0x4C-0x4F */
+	0xE5, 0x86, 0xE5, 0x87, 0xE5, 0x88, 0xE5, 0x89, /* 0x50-0x53 */
+	0xE5, 0x8A, 0xE5, 0x8B, 0xE5, 0x8C, 0xE5, 0x8D, /* 0x54-0x57 */
+	0xE5, 0x8E, 0xE5, 0x8F, 0xE5, 0x90, 0xE5, 0x91, /* 0x58-0x5B */
+	0xE5, 0x92, 0xE5, 0x93, 0xE5, 0x94, 0xE5, 0x95, /* 0x5C-0x5F */
+	0xE5, 0x96, 0xE5, 0x97, 0xE5, 0x98, 0xE5, 0x99, /* 0x60-0x63 */
+	0xE5, 0x9A, 0xE5, 0x9B, 0xE5, 0x9C, 0xE5, 0x9D, /* 0x64-0x67 */
+	0xE5, 0x9E, 0xE5, 0x9F, 0xF6, 0xCA, 0xE5, 0xA0, /* 0x68-0x6B */
+	0xE6, 0x40, 0xE6, 0x41, 0xE6, 0x42, 0xE6, 0x43, /* 0x6C-0x6F */
+	0xE6, 0x44, 0xE6, 0x45, 0xE6, 0x46, 0xE6, 0x47, /* 0x70-0x73 */
+	0xE6, 0x48, 0xE6, 0x49, 0xE6, 0x4A, 0xE6, 0x4B, /* 0x74-0x77 */
+	0xE6, 0x4C, 0xE6, 0x4D, 0xE6, 0x4E, 0xE6, 0x4F, /* 0x78-0x7B */
+	0xE6, 0x50, 0xE6, 0x51, 0xE6, 0x52, 0xE6, 0x53, /* 0x7C-0x7F */
+	
+	0xE6, 0x54, 0xE6, 0x55, 0xE6, 0x56, 0xE6, 0x57, /* 0x80-0x83 */
+	0xE6, 0x58, 0xE6, 0x59, 0xE6, 0x5A, 0xE6, 0x5B, /* 0x84-0x87 */
+	0xE6, 0x5C, 0xE6, 0x5D, 0xE6, 0x5E, 0xE6, 0x5F, /* 0x88-0x8B */
+	0xE6, 0x60, 0xE6, 0x61, 0xE6, 0x62, 0xF6, 0xCC, /* 0x8C-0x8F */
+	0xE6, 0x63, 0xE6, 0x64, 0xE6, 0x65, 0xE6, 0x66, /* 0x90-0x93 */
+	0xE6, 0x67, 0xE6, 0x68, 0xE6, 0x69, 0xE6, 0x6A, /* 0x94-0x97 */
+	0xE6, 0x6B, 0xE6, 0x6C, 0xE6, 0x6D, 0xE6, 0x6E, /* 0x98-0x9B */
+	0xE6, 0x6F, 0xE6, 0x70, 0xE6, 0x71, 0xE6, 0x72, /* 0x9C-0x9F */
+	0xE6, 0x73, 0xE6, 0x74, 0xE6, 0x75, 0xE6, 0x76, /* 0xA0-0xA3 */
+	0xE6, 0x77, 0xE6, 0x78, 0xE6, 0x79, 0xE6, 0x7A, /* 0xA4-0xA7 */
+	0xE6, 0x7B, 0xE6, 0x7C, 0xE6, 0x7D, 0xE6, 0x7E, /* 0xA8-0xAB */
+	0xE6, 0x80, 0xE6, 0x81, 0xE6, 0x82, 0xE6, 0x83, /* 0xAC-0xAF */
+	0xE6, 0x84, 0xE6, 0x85, 0xE6, 0x86, 0xE6, 0x87, /* 0xB0-0xB3 */
+	0xE6, 0x88, 0xE6, 0x89, 0xE6, 0x8A, 0xE6, 0x8B, /* 0xB4-0xB7 */
+	0xE6, 0x8C, 0xE6, 0x8D, 0xE6, 0x8E, 0xE6, 0x8F, /* 0xB8-0xBB */
+	0xE6, 0x90, 0xE6, 0x91, 0xE6, 0x92, 0xE6, 0x93, /* 0xBC-0xBF */
+	0xE6, 0x94, 0xE6, 0x95, 0xE6, 0x96, 0xE6, 0x97, /* 0xC0-0xC3 */
+	0xE6, 0x98, 0xE6, 0x99, 0xE6, 0x9A, 0xE6, 0x9B, /* 0xC4-0xC7 */
+	0xE6, 0x9C, 0xE6, 0x9D, 0xF6, 0xCB, 0xE6, 0x9E, /* 0xC8-0xCB */
+	0xE6, 0x9F, 0xE6, 0xA0, 0xE7, 0x40, 0xE7, 0x41, /* 0xCC-0xCF */
+	0xE7, 0x42, 0xE7, 0x43, 0xE7, 0x44, 0xE7, 0x45, /* 0xD0-0xD3 */
+	0xE7, 0x46, 0xE7, 0x47, 0xF7, 0xE9, 0xE7, 0x48, /* 0xD4-0xD7 */
+	0xE7, 0x49, 0xE7, 0x4A, 0xE7, 0x4B, 0xE7, 0x4C, /* 0xD8-0xDB */
+	0xE7, 0x4D, 0xE7, 0x4E, 0xE7, 0x4F, 0xE7, 0x50, /* 0xDC-0xDF */
+	0xE7, 0x51, 0xE7, 0x52, 0xE7, 0x53, 0xE7, 0x54, /* 0xE0-0xE3 */
+	0xE7, 0x55, 0xE7, 0x56, 0xE7, 0x57, 0xE7, 0x58, /* 0xE4-0xE7 */
+	0xE7, 0x59, 0xE7, 0x5A, 0xE7, 0x5B, 0xE7, 0x5C, /* 0xE8-0xEB */
+	0xE7, 0x5D, 0xE7, 0x5E, 0xE7, 0x5F, 0xE7, 0x60, /* 0xEC-0xEF */
+	0xE7, 0x61, 0xE7, 0x62, 0xE7, 0x63, 0xE7, 0x64, /* 0xF0-0xF3 */
+	0xE7, 0x65, 0xE7, 0x66, 0xE7, 0x67, 0xE7, 0x68, /* 0xF4-0xF7 */
+	0xE7, 0x69, 0xE7, 0x6A, 0xE7, 0x6B, 0xE7, 0x6C, /* 0xF8-0xFB */
+	0xE7, 0x6D, 0xE7, 0x6E, 0xE7, 0x6F, 0xE7, 0x70, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_94[512] = {
+	0xE7, 0x71, 0xE7, 0x72, 0xE7, 0x73, 0xE7, 0x74, /* 0x00-0x03 */
+	0xE7, 0x75, 0xE7, 0x76, 0xE7, 0x77, 0xE7, 0x78, /* 0x04-0x07 */
+	0xE7, 0x79, 0xE7, 0x7A, 0xE7, 0x7B, 0xE7, 0x7C, /* 0x08-0x0B */
+	0xE7, 0x7D, 0xE7, 0x7E, 0xE7, 0x80, 0xE7, 0x81, /* 0x0C-0x0F */
+	0xE7, 0x82, 0xE7, 0x83, 0xE7, 0x84, 0xE7, 0x85, /* 0x10-0x13 */
+	0xE7, 0x86, 0xE7, 0x87, 0xE7, 0x88, 0xE7, 0x89, /* 0x14-0x17 */
+	0xE7, 0x8A, 0xE7, 0x8B, 0xE7, 0x8C, 0xE7, 0x8D, /* 0x18-0x1B */
+	0xE7, 0x8E, 0xE7, 0x8F, 0xE7, 0x90, 0xE7, 0x91, /* 0x1C-0x1F */
+	0xE7, 0x92, 0xE7, 0x93, 0xE7, 0x94, 0xE7, 0x95, /* 0x20-0x23 */
+	0xE7, 0x96, 0xE7, 0x97, 0xE7, 0x98, 0xE7, 0x99, /* 0x24-0x27 */
+	0xE7, 0x9A, 0xE7, 0x9B, 0xE7, 0x9C, 0xE7, 0x9D, /* 0x28-0x2B */
+	0xE7, 0x9E, 0xE7, 0x9F, 0xE7, 0xA0, 0xE8, 0x40, /* 0x2C-0x2F */
+	0xE8, 0x41, 0xE8, 0x42, 0xE8, 0x43, 0xE8, 0x44, /* 0x30-0x33 */
+	0xE8, 0x45, 0xE8, 0x46, 0xE8, 0x47, 0xE8, 0x48, /* 0x34-0x37 */
+	0xE8, 0x49, 0xE8, 0x4A, 0xE8, 0x4B, 0xE8, 0x4C, /* 0x38-0x3B */
+	0xE8, 0x4D, 0xE8, 0x4E, 0xF6, 0xCD, 0xE8, 0x4F, /* 0x3C-0x3F */
+	0xE8, 0x50, 0xE8, 0x51, 0xE8, 0x52, 0xE8, 0x53, /* 0x40-0x43 */
+	0xE8, 0x54, 0xE8, 0x55, 0xE8, 0x56, 0xE8, 0x57, /* 0x44-0x47 */
+	0xE8, 0x58, 0xE8, 0x59, 0xE8, 0x5A, 0xE8, 0x5B, /* 0x48-0x4B */
+	0xE8, 0x5C, 0xE8, 0x5D, 0xE8, 0x5E, 0xE8, 0x5F, /* 0x4C-0x4F */
+	0xE8, 0x60, 0xE8, 0x61, 0xE8, 0x62, 0xE8, 0x63, /* 0x50-0x53 */
+	0xE8, 0x64, 0xE8, 0x65, 0xE8, 0x66, 0xE8, 0x67, /* 0x54-0x57 */
+	0xE8, 0x68, 0xE8, 0x69, 0xE8, 0x6A, 0xE8, 0x6B, /* 0x58-0x5B */
+	0xE8, 0x6C, 0xE8, 0x6D, 0xE8, 0x6E, 0xE8, 0x6F, /* 0x5C-0x5F */
+	0xE8, 0x70, 0xE8, 0x71, 0xE8, 0x72, 0xE8, 0x73, /* 0x60-0x63 */
+	0xE8, 0x74, 0xE8, 0x75, 0xE8, 0x76, 0xE8, 0x77, /* 0x64-0x67 */
+	0xE8, 0x78, 0xE8, 0x79, 0xE8, 0x7A, 0xF6, 0xCE, /* 0x68-0x6B */
+	0xE8, 0x7B, 0xE8, 0x7C, 0xE8, 0x7D, 0xE8, 0x7E, /* 0x6C-0x6F */
+	0xE8, 0x80, 0xE8, 0x81, 0xE8, 0x82, 0xE8, 0x83, /* 0x70-0x73 */
+	0xE8, 0x84, 0xE8, 0x85, 0xE8, 0x86, 0xE8, 0x87, /* 0x74-0x77 */
+	0xE8, 0x88, 0xE8, 0x89, 0xE8, 0x8A, 0xE8, 0x8B, /* 0x78-0x7B */
+	0xE8, 0x8C, 0xE8, 0x8D, 0xE8, 0x8E, 0xE8, 0x8F, /* 0x7C-0x7F */
+	
+	0xE8, 0x90, 0xE8, 0x91, 0xE8, 0x92, 0xE8, 0x93, /* 0x80-0x83 */
+	0xE8, 0x94, 0xEE, 0xC4, 0xEE, 0xC5, 0xEE, 0xC6, /* 0x84-0x87 */
+	0xD5, 0xEB, 0xB6, 0xA4, 0xEE, 0xC8, 0xEE, 0xC7, /* 0x88-0x8B */
+	0xEE, 0xC9, 0xEE, 0xCA, 0xC7, 0xA5, 0xEE, 0xCB, /* 0x8C-0x8F */
+	0xEE, 0xCC, 0xE8, 0x95, 0xB7, 0xB0, 0xB5, 0xF6, /* 0x90-0x93 */
+	0xEE, 0xCD, 0xEE, 0xCF, 0xE8, 0x96, 0xEE, 0xCE, /* 0x94-0x97 */
+	0xE8, 0x97, 0xB8, 0xC6, 0xEE, 0xD0, 0xEE, 0xD1, /* 0x98-0x9B */
+	0xEE, 0xD2, 0xB6, 0xDB, 0xB3, 0xAE, 0xD6, 0xD3, /* 0x9C-0x9F */
+	0xC4, 0xC6, 0xB1, 0xB5, 0xB8, 0xD6, 0xEE, 0xD3, /* 0xA0-0xA3 */
+	0xEE, 0xD4, 0xD4, 0xBF, 0xC7, 0xD5, 0xBE, 0xFB, /* 0xA4-0xA7 */
+	0xCE, 0xD9, 0xB9, 0xB3, 0xEE, 0xD6, 0xEE, 0xD5, /* 0xA8-0xAB */
+	0xEE, 0xD8, 0xEE, 0xD7, 0xC5, 0xA5, 0xEE, 0xD9, /* 0xAC-0xAF */
+	0xEE, 0xDA, 0xC7, 0xAE, 0xEE, 0xDB, 0xC7, 0xAF, /* 0xB0-0xB3 */
+	0xEE, 0xDC, 0xB2, 0xA7, 0xEE, 0xDD, 0xEE, 0xDE, /* 0xB4-0xB7 */
+	0xEE, 0xDF, 0xEE, 0xE0, 0xEE, 0xE1, 0xD7, 0xEA, /* 0xB8-0xBB */
+	0xEE, 0xE2, 0xEE, 0xE3, 0xBC, 0xD8, 0xEE, 0xE4, /* 0xBC-0xBF */
+	0xD3, 0xCB, 0xCC, 0xFA, 0xB2, 0xAC, 0xC1, 0xE5, /* 0xC0-0xC3 */
+	0xEE, 0xE5, 0xC7, 0xA6, 0xC3, 0xAD, 0xE8, 0x98, /* 0xC4-0xC7 */
+	0xEE, 0xE6, 0xEE, 0xE7, 0xEE, 0xE8, 0xEE, 0xE9, /* 0xC8-0xCB */
+	0xEE, 0xEA, 0xEE, 0xEB, 0xEE, 0xEC, 0xE8, 0x99, /* 0xCC-0xCF */
+	0xEE, 0xED, 0xEE, 0xEE, 0xEE, 0xEF, 0xE8, 0x9A, /* 0xD0-0xD3 */
+	0xE8, 0x9B, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0xD4-0xD7 */
+	0xEE, 0xF4, 0xEE, 0xF3, 0xE8, 0x9C, 0xEE, 0xF5, /* 0xD8-0xDB */
+	0xCD, 0xAD, 0xC2, 0xC1, 0xEE, 0xF6, 0xEE, 0xF7, /* 0xDC-0xDF */
+	0xEE, 0xF8, 0xD5, 0xA1, 0xEE, 0xF9, 0xCF, 0xB3, /* 0xE0-0xE3 */
+	0xEE, 0xFA, 0xEE, 0xFB, 0xE8, 0x9D, 0xEE, 0xFC, /* 0xE4-0xE7 */
+	0xEE, 0xFD, 0xEF, 0xA1, 0xEE, 0xFE, 0xEF, 0xA2, /* 0xE8-0xEB */
+	0xB8, 0xF5, 0xC3, 0xFA, 0xEF, 0xA3, 0xEF, 0xA4, /* 0xEC-0xEF */
+	0xBD, 0xC2, 0xD2, 0xBF, 0xB2, 0xF9, 0xEF, 0xA5, /* 0xF0-0xF3 */
+	0xEF, 0xA6, 0xEF, 0xA7, 0xD2, 0xF8, 0xEF, 0xA8, /* 0xF4-0xF7 */
+	0xD6, 0xFD, 0xEF, 0xA9, 0xC6, 0xCC, 0xE8, 0x9E, /* 0xF8-0xFB */
+	0xEF, 0xAA, 0xEF, 0xAB, 0xC1, 0xB4, 0xEF, 0xAC, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_95[512] = {
+	0xCF, 0xFA, 0xCB, 0xF8, 0xEF, 0xAE, 0xEF, 0xAD, /* 0x00-0x03 */
+	0xB3, 0xFA, 0xB9, 0xF8, 0xEF, 0xAF, 0xEF, 0xB0, /* 0x04-0x07 */
+	0xD0, 0xE2, 0xEF, 0xB1, 0xEF, 0xB2, 0xB7, 0xE6, /* 0x08-0x0B */
+	0xD0, 0xBF, 0xEF, 0xB3, 0xEF, 0xB4, 0xEF, 0xB5, /* 0x0C-0x0F */
+	0xC8, 0xF1, 0xCC, 0xE0, 0xEF, 0xB6, 0xEF, 0xB7, /* 0x10-0x13 */
+	0xEF, 0xB8, 0xEF, 0xB9, 0xEF, 0xBA, 0xD5, 0xE0, /* 0x14-0x17 */
+	0xEF, 0xBB, 0xB4, 0xED, 0xC3, 0xAA, 0xEF, 0xBC, /* 0x18-0x1B */
+	0xE8, 0x9F, 0xEF, 0xBD, 0xEF, 0xBE, 0xEF, 0xBF, /* 0x1C-0x1F */
+	0xE8, 0xA0, 0xCE, 0xFD, 0xEF, 0xC0, 0xC2, 0xE0, /* 0x20-0x23 */
+	0xB4, 0xB8, 0xD7, 0xB6, 0xBD, 0xF5, 0xE9, 0x40, /* 0x24-0x27 */
+	0xCF, 0xC7, 0xEF, 0xC3, 0xEF, 0xC1, 0xEF, 0xC2, /* 0x28-0x2B */
+	0xEF, 0xC4, 0xB6, 0xA7, 0xBC, 0xFC, 0xBE, 0xE2, /* 0x2C-0x2F */
+	0xC3, 0xCC, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x41, /* 0x30-0x33 */
+	0xEF, 0xC7, 0xEF, 0xCF, 0xEF, 0xC8, 0xEF, 0xC9, /* 0x34-0x37 */
+	0xEF, 0xCA, 0xC7, 0xC2, 0xEF, 0xF1, 0xB6, 0xCD, /* 0x38-0x3B */
+	0xEF, 0xCB, 0xE9, 0x42, 0xEF, 0xCC, 0xEF, 0xCD, /* 0x3C-0x3F */
+	0xB6, 0xC6, 0xC3, 0xBE, 0xEF, 0xCE, 0xE9, 0x43, /* 0x40-0x43 */
+	0xEF, 0xD0, 0xEF, 0xD1, 0xEF, 0xD2, 0xD5, 0xF2, /* 0x44-0x47 */
+	0xE9, 0x44, 0xEF, 0xD3, 0xC4, 0xF7, 0xE9, 0x45, /* 0x48-0x4B */
+	0xEF, 0xD4, 0xC4, 0xF8, 0xEF, 0xD5, 0xEF, 0xD6, /* 0x4C-0x4F */
+	0xB8, 0xE4, 0xB0, 0xF7, 0xEF, 0xD7, 0xEF, 0xD8, /* 0x50-0x53 */
+	0xEF, 0xD9, 0xE9, 0x46, 0xEF, 0xDA, 0xEF, 0xDB, /* 0x54-0x57 */
+	0xEF, 0xDC, 0xEF, 0xDD, 0xE9, 0x47, 0xEF, 0xDE, /* 0x58-0x5B */
+	0xBE, 0xB5, 0xEF, 0xE1, 0xEF, 0xDF, 0xEF, 0xE0, /* 0x5C-0x5F */
+	0xE9, 0x48, 0xEF, 0xE2, 0xEF, 0xE3, 0xC1, 0xCD, /* 0x60-0x63 */
+	0xEF, 0xE4, 0xEF, 0xE5, 0xEF, 0xE6, 0xEF, 0xE7, /* 0x64-0x67 */
+	0xEF, 0xE8, 0xEF, 0xE9, 0xEF, 0xEA, 0xEF, 0xEB, /* 0x68-0x6B */
+	0xEF, 0xEC, 0xC0, 0xD8, 0xE9, 0x49, 0xEF, 0xED, /* 0x6C-0x6F */
+	0xC1, 0xAD, 0xEF, 0xEE, 0xEF, 0xEF, 0xEF, 0xF0, /* 0x70-0x73 */
+	0xE9, 0x4A, 0xE9, 0x4B, 0xCF, 0xE2, 0xE9, 0x4C, /* 0x74-0x77 */
+	0xE9, 0x4D, 0xE9, 0x4E, 0xE9, 0x4F, 0xE9, 0x50, /* 0x78-0x7B */
+	0xE9, 0x51, 0xE9, 0x52, 0xE9, 0x53, 0xB3, 0xA4, /* 0x7C-0x7F */
+	
+	0xE9, 0x54, 0xE9, 0x55, 0xE9, 0x56, 0xE9, 0x57, /* 0x80-0x83 */
+	0xE9, 0x58, 0xE9, 0x59, 0xE9, 0x5A, 0xE9, 0x5B, /* 0x84-0x87 */
+	0xE9, 0x5C, 0xE9, 0x5D, 0xE9, 0x5E, 0xE9, 0x5F, /* 0x88-0x8B */
+	0xE9, 0x60, 0xE9, 0x61, 0xE9, 0x62, 0xE9, 0x63, /* 0x8C-0x8F */
+	0xE9, 0x64, 0xE9, 0x65, 0xE9, 0x66, 0xE9, 0x67, /* 0x90-0x93 */
+	0xE9, 0x68, 0xE9, 0x69, 0xE9, 0x6A, 0xE9, 0x6B, /* 0x94-0x97 */
+	0xE9, 0x6C, 0xE9, 0x6D, 0xE9, 0x6E, 0xE9, 0x6F, /* 0x98-0x9B */
+	0xE9, 0x70, 0xE9, 0x71, 0xE9, 0x72, 0xE9, 0x73, /* 0x9C-0x9F */
+	0xE9, 0x74, 0xE9, 0x75, 0xE9, 0x76, 0xE9, 0x77, /* 0xA0-0xA3 */
+	0xE9, 0x78, 0xE9, 0x79, 0xE9, 0x7A, 0xE9, 0x7B, /* 0xA4-0xA7 */
+	0xE9, 0x7C, 0xE9, 0x7D, 0xE9, 0x7E, 0xE9, 0x80, /* 0xA8-0xAB */
+	0xE9, 0x81, 0xE9, 0x82, 0xE9, 0x83, 0xE9, 0x84, /* 0xAC-0xAF */
+	0xE9, 0x85, 0xE9, 0x86, 0xE9, 0x87, 0xE9, 0x88, /* 0xB0-0xB3 */
+	0xE9, 0x89, 0xE9, 0x8A, 0xE9, 0x8B, 0xE9, 0x8C, /* 0xB4-0xB7 */
+	0xE9, 0x8D, 0xE9, 0x8E, 0xE9, 0x8F, 0xE9, 0x90, /* 0xB8-0xBB */
+	0xE9, 0x91, 0xE9, 0x92, 0xE9, 0x93, 0xE9, 0x94, /* 0xBC-0xBF */
+	0xE9, 0x95, 0xE9, 0x96, 0xE9, 0x97, 0xE9, 0x98, /* 0xC0-0xC3 */
+	0xE9, 0x99, 0xE9, 0x9A, 0xE9, 0x9B, 0xE9, 0x9C, /* 0xC4-0xC7 */
+	0xE9, 0x9D, 0xE9, 0x9E, 0xE9, 0x9F, 0xE9, 0xA0, /* 0xC8-0xCB */
+	0xEA, 0x40, 0xEA, 0x41, 0xEA, 0x42, 0xEA, 0x43, /* 0xCC-0xCF */
+	0xEA, 0x44, 0xEA, 0x45, 0xEA, 0x46, 0xEA, 0x47, /* 0xD0-0xD3 */
+	0xEA, 0x48, 0xEA, 0x49, 0xEA, 0x4A, 0xEA, 0x4B, /* 0xD4-0xD7 */
+	0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x4E, 0xEA, 0x4F, /* 0xD8-0xDB */
+	0xEA, 0x50, 0xEA, 0x51, 0xEA, 0x52, 0xEA, 0x53, /* 0xDC-0xDF */
+	0xEA, 0x54, 0xEA, 0x55, 0xEA, 0x56, 0xEA, 0x57, /* 0xE0-0xE3 */
+	0xEA, 0x58, 0xEA, 0x59, 0xEA, 0x5A, 0xEA, 0x5B, /* 0xE4-0xE7 */
+	0xC3, 0xC5, 0xE3, 0xC5, 0xC9, 0xC1, 0xE3, 0xC6, /* 0xE8-0xEB */
+	0xEA, 0x5C, 0xB1, 0xD5, 0xCE, 0xCA, 0xB4, 0xB3, /* 0xEC-0xEF */
+	0xC8, 0xF2, 0xE3, 0xC7, 0xCF, 0xD0, 0xE3, 0xC8, /* 0xF0-0xF3 */
+	0xBC, 0xE4, 0xE3, 0xC9, 0xE3, 0xCA, 0xC3, 0xC6, /* 0xF4-0xF7 */
+	0xD5, 0xA2, 0xC4, 0xD6, 0xB9, 0xEB, 0xCE, 0xC5, /* 0xF8-0xFB */
+	0xE3, 0xCB, 0xC3, 0xF6, 0xE3, 0xCC, 0xEA, 0x5D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_96[512] = {
+	0xB7, 0xA7, 0xB8, 0xF3, 0xBA, 0xD2, 0xE3, 0xCD, /* 0x00-0x03 */
+	0xE3, 0xCE, 0xD4, 0xC4, 0xE3, 0xCF, 0xEA, 0x5E, /* 0x04-0x07 */
+	0xE3, 0xD0, 0xD1, 0xCB, 0xE3, 0xD1, 0xE3, 0xD2, /* 0x08-0x0B */
+	0xE3, 0xD3, 0xE3, 0xD4, 0xD1, 0xD6, 0xE3, 0xD5, /* 0x0C-0x0F */
+	0xB2, 0xFB, 0xC0, 0xBB, 0xE3, 0xD6, 0xEA, 0x5F, /* 0x10-0x13 */
+	0xC0, 0xAB, 0xE3, 0xD7, 0xE3, 0xD8, 0xE3, 0xD9, /* 0x14-0x17 */
+	0xEA, 0x60, 0xE3, 0xDA, 0xE3, 0xDB, 0xEA, 0x61, /* 0x18-0x1B */
+	0xB8, 0xB7, 0xDA, 0xE2, 0xEA, 0x62, 0xB6, 0xD3, /* 0x1C-0x1F */
+	0xEA, 0x63, 0xDA, 0xE4, 0xDA, 0xE3, 0xEA, 0x64, /* 0x20-0x23 */
+	0xEA, 0x65, 0xEA, 0x66, 0xEA, 0x67, 0xEA, 0x68, /* 0x24-0x27 */
+	0xEA, 0x69, 0xEA, 0x6A, 0xDA, 0xE6, 0xEA, 0x6B, /* 0x28-0x2B */
+	0xEA, 0x6C, 0xEA, 0x6D, 0xC8, 0xEE, 0xEA, 0x6E, /* 0x2C-0x2F */
+	0xEA, 0x6F, 0xDA, 0xE5, 0xB7, 0xC0, 0xD1, 0xF4, /* 0x30-0x33 */
+	0xD2, 0xF5, 0xD5, 0xF3, 0xBD, 0xD7, 0xEA, 0x70, /* 0x34-0x37 */
+	0xEA, 0x71, 0xEA, 0x72, 0xEA, 0x73, 0xD7, 0xE8, /* 0x38-0x3B */
+	0xDA, 0xE8, 0xDA, 0xE7, 0xEA, 0x74, 0xB0, 0xA2, /* 0x3C-0x3F */
+	0xCD, 0xD3, 0xEA, 0x75, 0xDA, 0xE9, 0xEA, 0x76, /* 0x40-0x43 */
+	0xB8, 0xBD, 0xBC, 0xCA, 0xC2, 0xBD, 0xC2, 0xA4, /* 0x44-0x47 */
+	0xB3, 0xC2, 0xDA, 0xEA, 0xEA, 0x77, 0xC2, 0xAA, /* 0x48-0x4B */
+	0xC4, 0xB0, 0xBD, 0xB5, 0xEA, 0x78, 0xEA, 0x79, /* 0x4C-0x4F */
+	0xCF, 0xDE, 0xEA, 0x7A, 0xEA, 0x7B, 0xEA, 0x7C, /* 0x50-0x53 */
+	0xDA, 0xEB, 0xC9, 0xC2, 0xEA, 0x7D, 0xEA, 0x7E, /* 0x54-0x57 */
+	0xEA, 0x80, 0xEA, 0x81, 0xEA, 0x82, 0xB1, 0xDD, /* 0x58-0x5B */
+	0xEA, 0x83, 0xEA, 0x84, 0xEA, 0x85, 0xDA, 0xEC, /* 0x5C-0x5F */
+	0xEA, 0x86, 0xB6, 0xB8, 0xD4, 0xBA, 0xEA, 0x87, /* 0x60-0x63 */
+	0xB3, 0xFD, 0xEA, 0x88, 0xEA, 0x89, 0xDA, 0xED, /* 0x64-0x67 */
+	0xD4, 0xC9, 0xCF, 0xD5, 0xC5, 0xE3, 0xEA, 0x8A, /* 0x68-0x6B */
+	0xDA, 0xEE, 0xEA, 0x8B, 0xEA, 0x8C, 0xEA, 0x8D, /* 0x6C-0x6F */
+	0xEA, 0x8E, 0xEA, 0x8F, 0xDA, 0xEF, 0xEA, 0x90, /* 0x70-0x73 */
+	0xDA, 0xF0, 0xC1, 0xEA, 0xCC, 0xD5, 0xCF, 0xDD, /* 0x74-0x77 */
+	0xEA, 0x91, 0xEA, 0x92, 0xEA, 0x93, 0xEA, 0x94, /* 0x78-0x7B */
+	0xEA, 0x95, 0xEA, 0x96, 0xEA, 0x97, 0xEA, 0x98, /* 0x7C-0x7F */
+	
+	0xEA, 0x99, 0xEA, 0x9A, 0xEA, 0x9B, 0xEA, 0x9C, /* 0x80-0x83 */
+	0xEA, 0x9D, 0xD3, 0xE7, 0xC2, 0xA1, 0xEA, 0x9E, /* 0x84-0x87 */
+	0xDA, 0xF1, 0xEA, 0x9F, 0xEA, 0xA0, 0xCB, 0xE5, /* 0x88-0x8B */
+	0xEB, 0x40, 0xDA, 0xF2, 0xEB, 0x41, 0xCB, 0xE6, /* 0x8C-0x8F */
+	0xD2, 0xFE, 0xEB, 0x42, 0xEB, 0x43, 0xEB, 0x44, /* 0x90-0x93 */
+	0xB8, 0xF4, 0xEB, 0x45, 0xEB, 0x46, 0xDA, 0xF3, /* 0x94-0x97 */
+	0xB0, 0xAF, 0xCF, 0xB6, 0xEB, 0x47, 0xEB, 0x48, /* 0x98-0x9B */
+	0xD5, 0xCF, 0xEB, 0x49, 0xEB, 0x4A, 0xEB, 0x4B, /* 0x9C-0x9F */
+	0xEB, 0x4C, 0xEB, 0x4D, 0xEB, 0x4E, 0xEB, 0x4F, /* 0xA0-0xA3 */
+	0xEB, 0x50, 0xEB, 0x51, 0xEB, 0x52, 0xCB, 0xED, /* 0xA4-0xA7 */
+	0xEB, 0x53, 0xEB, 0x54, 0xEB, 0x55, 0xEB, 0x56, /* 0xA8-0xAB */
+	0xEB, 0x57, 0xEB, 0x58, 0xEB, 0x59, 0xEB, 0x5A, /* 0xAC-0xAF */
+	0xDA, 0xF4, 0xEB, 0x5B, 0xEB, 0x5C, 0xE3, 0xC4, /* 0xB0-0xB3 */
+	0xEB, 0x5D, 0xEB, 0x5E, 0xC1, 0xA5, 0xEB, 0x5F, /* 0xB4-0xB7 */
+	0xEB, 0x60, 0xF6, 0xBF, 0xEB, 0x61, 0xEB, 0x62, /* 0xB8-0xBB */
+	0xF6, 0xC0, 0xF6, 0xC1, 0xC4, 0xD1, 0xEB, 0x63, /* 0xBC-0xBF */
+	0xC8, 0xB8, 0xD1, 0xE3, 0xEB, 0x64, 0xEB, 0x65, /* 0xC0-0xC3 */
+	0xD0, 0xDB, 0xD1, 0xC5, 0xBC, 0xAF, 0xB9, 0xCD, /* 0xC4-0xC7 */
+	0xEB, 0x66, 0xEF, 0xF4, 0xEB, 0x67, 0xEB, 0x68, /* 0xC8-0xCB */
+	0xB4, 0xC6, 0xD3, 0xBA, 0xF6, 0xC2, 0xB3, 0xFB, /* 0xCC-0xCF */
+	0xEB, 0x69, 0xEB, 0x6A, 0xF6, 0xC3, 0xEB, 0x6B, /* 0xD0-0xD3 */
+	0xEB, 0x6C, 0xB5, 0xF1, 0xEB, 0x6D, 0xEB, 0x6E, /* 0xD4-0xD7 */
+	0xEB, 0x6F, 0xEB, 0x70, 0xEB, 0x71, 0xEB, 0x72, /* 0xD8-0xDB */
+	0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x75, 0xEB, 0x76, /* 0xDC-0xDF */
+	0xF6, 0xC5, 0xEB, 0x77, 0xEB, 0x78, 0xEB, 0x79, /* 0xE0-0xE3 */
+	0xEB, 0x7A, 0xEB, 0x7B, 0xEB, 0x7C, 0xEB, 0x7D, /* 0xE4-0xE7 */
+	0xD3, 0xEA, 0xF6, 0xA7, 0xD1, 0xA9, 0xEB, 0x7E, /* 0xE8-0xEB */
+	0xEB, 0x80, 0xEB, 0x81, 0xEB, 0x82, 0xF6, 0xA9, /* 0xEC-0xEF */
+	0xEB, 0x83, 0xEB, 0x84, 0xEB, 0x85, 0xF6, 0xA8, /* 0xF0-0xF3 */
+	0xEB, 0x86, 0xEB, 0x87, 0xC1, 0xE3, 0xC0, 0xD7, /* 0xF4-0xF7 */
+	0xEB, 0x88, 0xB1, 0xA2, 0xEB, 0x89, 0xEB, 0x8A, /* 0xF8-0xFB */
+	0xEB, 0x8B, 0xEB, 0x8C, 0xCE, 0xED, 0xEB, 0x8D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_97[512] = {
+	0xD0, 0xE8, 0xF6, 0xAB, 0xEB, 0x8E, 0xEB, 0x8F, /* 0x00-0x03 */
+	0xCF, 0xF6, 0xEB, 0x90, 0xF6, 0xAA, 0xD5, 0xF0, /* 0x04-0x07 */
+	0xF6, 0xAC, 0xC3, 0xB9, 0xEB, 0x91, 0xEB, 0x92, /* 0x08-0x0B */
+	0xEB, 0x93, 0xBB, 0xF4, 0xF6, 0xAE, 0xF6, 0xAD, /* 0x0C-0x0F */
+	0xEB, 0x94, 0xEB, 0x95, 0xEB, 0x96, 0xC4, 0xDE, /* 0x10-0x13 */
+	0xEB, 0x97, 0xEB, 0x98, 0xC1, 0xD8, 0xEB, 0x99, /* 0x14-0x17 */
+	0xEB, 0x9A, 0xEB, 0x9B, 0xEB, 0x9C, 0xEB, 0x9D, /* 0x18-0x1B */
+	0xCB, 0xAA, 0xEB, 0x9E, 0xCF, 0xBC, 0xEB, 0x9F, /* 0x1C-0x1F */
+	0xEB, 0xA0, 0xEC, 0x40, 0xEC, 0x41, 0xEC, 0x42, /* 0x20-0x23 */
+	0xEC, 0x43, 0xEC, 0x44, 0xEC, 0x45, 0xEC, 0x46, /* 0x24-0x27 */
+	0xEC, 0x47, 0xEC, 0x48, 0xF6, 0xAF, 0xEC, 0x49, /* 0x28-0x2B */
+	0xEC, 0x4A, 0xF6, 0xB0, 0xEC, 0x4B, 0xEC, 0x4C, /* 0x2C-0x2F */
+	0xF6, 0xB1, 0xEC, 0x4D, 0xC2, 0xB6, 0xEC, 0x4E, /* 0x30-0x33 */
+	0xEC, 0x4F, 0xEC, 0x50, 0xEC, 0x51, 0xEC, 0x52, /* 0x34-0x37 */
+	0xB0, 0xD4, 0xC5, 0xF9, 0xEC, 0x53, 0xEC, 0x54, /* 0x38-0x3B */
+	0xEC, 0x55, 0xEC, 0x56, 0xF6, 0xB2, 0xEC, 0x57, /* 0x3C-0x3F */
+	0xEC, 0x58, 0xEC, 0x59, 0xEC, 0x5A, 0xEC, 0x5B, /* 0x40-0x43 */
+	0xEC, 0x5C, 0xEC, 0x5D, 0xEC, 0x5E, 0xEC, 0x5F, /* 0x44-0x47 */
+	0xEC, 0x60, 0xEC, 0x61, 0xEC, 0x62, 0xEC, 0x63, /* 0x48-0x4B */
+	0xEC, 0x64, 0xEC, 0x65, 0xEC, 0x66, 0xEC, 0x67, /* 0x4C-0x4F */
+	0xEC, 0x68, 0xEC, 0x69, 0xC7, 0xE0, 0xF6, 0xA6, /* 0x50-0x53 */
+	0xEC, 0x6A, 0xEC, 0x6B, 0xBE, 0xB8, 0xEC, 0x6C, /* 0x54-0x57 */
+	0xEC, 0x6D, 0xBE, 0xB2, 0xEC, 0x6E, 0xB5, 0xE5, /* 0x58-0x5B */
+	0xEC, 0x6F, 0xEC, 0x70, 0xB7, 0xC7, 0xEC, 0x71, /* 0x5C-0x5F */
+	0xBF, 0xBF, 0xC3, 0xD2, 0xC3, 0xE6, 0xEC, 0x72, /* 0x60-0x63 */
+	0xEC, 0x73, 0xD8, 0xCC, 0xEC, 0x74, 0xEC, 0x75, /* 0x64-0x67 */
+	0xEC, 0x76, 0xB8, 0xEF, 0xEC, 0x77, 0xEC, 0x78, /* 0x68-0x6B */
+	0xEC, 0x79, 0xEC, 0x7A, 0xEC, 0x7B, 0xEC, 0x7C, /* 0x6C-0x6F */
+	0xEC, 0x7D, 0xEC, 0x7E, 0xEC, 0x80, 0xBD, 0xF9, /* 0x70-0x73 */
+	0xD1, 0xA5, 0xEC, 0x81, 0xB0, 0xD0, 0xEC, 0x82, /* 0x74-0x77 */
+	0xEC, 0x83, 0xEC, 0x84, 0xEC, 0x85, 0xEC, 0x86, /* 0x78-0x7B */
+	0xF7, 0xB0, 0xEC, 0x87, 0xEC, 0x88, 0xEC, 0x89, /* 0x7C-0x7F */
+	
+	0xEC, 0x8A, 0xEC, 0x8B, 0xEC, 0x8C, 0xEC, 0x8D, /* 0x80-0x83 */
+	0xEC, 0x8E, 0xF7, 0xB1, 0xEC, 0x8F, 0xEC, 0x90, /* 0x84-0x87 */
+	0xEC, 0x91, 0xEC, 0x92, 0xEC, 0x93, 0xD0, 0xAC, /* 0x88-0x8B */
+	0xEC, 0x94, 0xB0, 0xB0, 0xEC, 0x95, 0xEC, 0x96, /* 0x8C-0x8F */
+	0xEC, 0x97, 0xF7, 0xB2, 0xF7, 0xB3, 0xEC, 0x98, /* 0x90-0x93 */
+	0xF7, 0xB4, 0xEC, 0x99, 0xEC, 0x9A, 0xEC, 0x9B, /* 0x94-0x97 */
+	0xC7, 0xCA, 0xEC, 0x9C, 0xEC, 0x9D, 0xEC, 0x9E, /* 0x98-0x9B */
+	0xEC, 0x9F, 0xEC, 0xA0, 0xED, 0x40, 0xED, 0x41, /* 0x9C-0x9F */
+	0xBE, 0xCF, 0xED, 0x42, 0xED, 0x43, 0xF7, 0xB7, /* 0xA0-0xA3 */
+	0xED, 0x44, 0xED, 0x45, 0xED, 0x46, 0xED, 0x47, /* 0xA4-0xA7 */
+	0xED, 0x48, 0xED, 0x49, 0xED, 0x4A, 0xF7, 0xB6, /* 0xA8-0xAB */
+	0xED, 0x4B, 0xB1, 0xDE, 0xED, 0x4C, 0xF7, 0xB5, /* 0xAC-0xAF */
+	0xED, 0x4D, 0xED, 0x4E, 0xF7, 0xB8, 0xED, 0x4F, /* 0xB0-0xB3 */
+	0xF7, 0xB9, 0xED, 0x50, 0xED, 0x51, 0xED, 0x52, /* 0xB4-0xB7 */
+	0xED, 0x53, 0xED, 0x54, 0xED, 0x55, 0xED, 0x56, /* 0xB8-0xBB */
+	0xED, 0x57, 0xED, 0x58, 0xED, 0x59, 0xED, 0x5A, /* 0xBC-0xBF */
+	0xED, 0x5B, 0xED, 0x5C, 0xED, 0x5D, 0xED, 0x5E, /* 0xC0-0xC3 */
+	0xED, 0x5F, 0xED, 0x60, 0xED, 0x61, 0xED, 0x62, /* 0xC4-0xC7 */
+	0xED, 0x63, 0xED, 0x64, 0xED, 0x65, 0xED, 0x66, /* 0xC8-0xCB */
+	0xED, 0x67, 0xED, 0x68, 0xED, 0x69, 0xED, 0x6A, /* 0xCC-0xCF */
+	0xED, 0x6B, 0xED, 0x6C, 0xED, 0x6D, 0xED, 0x6E, /* 0xD0-0xD3 */
+	0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xED, 0x72, /* 0xD4-0xD7 */
+	0xED, 0x73, 0xED, 0x74, 0xED, 0x75, 0xED, 0x76, /* 0xD8-0xDB */
+	0xED, 0x77, 0xED, 0x78, 0xED, 0x79, 0xED, 0x7A, /* 0xDC-0xDF */
+	0xED, 0x7B, 0xED, 0x7C, 0xED, 0x7D, 0xED, 0x7E, /* 0xE0-0xE3 */
+	0xED, 0x80, 0xED, 0x81, 0xCE, 0xA4, 0xC8, 0xCD, /* 0xE4-0xE7 */
+	0xED, 0x82, 0xBA, 0xAB, 0xE8, 0xB8, 0xE8, 0xB9, /* 0xE8-0xEB */
+	0xE8, 0xBA, 0xBE, 0xC2, 0xED, 0x83, 0xED, 0x84, /* 0xEC-0xEF */
+	0xED, 0x85, 0xED, 0x86, 0xED, 0x87, 0xD2, 0xF4, /* 0xF0-0xF3 */
+	0xED, 0x88, 0xD4, 0xCF, 0xC9, 0xD8, 0xED, 0x89, /* 0xF4-0xF7 */
+	0xED, 0x8A, 0xED, 0x8B, 0xED, 0x8C, 0xED, 0x8D, /* 0xF8-0xFB */
+	0xED, 0x8E, 0xED, 0x8F, 0xED, 0x90, 0xED, 0x91, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_98[512] = {
+	0xED, 0x92, 0xED, 0x93, 0xED, 0x94, 0xED, 0x95, /* 0x00-0x03 */
+	0xED, 0x96, 0xED, 0x97, 0xED, 0x98, 0xED, 0x99, /* 0x04-0x07 */
+	0xED, 0x9A, 0xED, 0x9B, 0xED, 0x9C, 0xED, 0x9D, /* 0x08-0x0B */
+	0xED, 0x9E, 0xED, 0x9F, 0xED, 0xA0, 0xEE, 0x40, /* 0x0C-0x0F */
+	0xEE, 0x41, 0xEE, 0x42, 0xEE, 0x43, 0xEE, 0x44, /* 0x10-0x13 */
+	0xEE, 0x45, 0xEE, 0x46, 0xEE, 0x47, 0xEE, 0x48, /* 0x14-0x17 */
+	0xEE, 0x49, 0xEE, 0x4A, 0xEE, 0x4B, 0xEE, 0x4C, /* 0x18-0x1B */
+	0xEE, 0x4D, 0xEE, 0x4E, 0xEE, 0x4F, 0xEE, 0x50, /* 0x1C-0x1F */
+	0xEE, 0x51, 0xEE, 0x52, 0xEE, 0x53, 0xEE, 0x54, /* 0x20-0x23 */
+	0xEE, 0x55, 0xEE, 0x56, 0xEE, 0x57, 0xEE, 0x58, /* 0x24-0x27 */
+	0xEE, 0x59, 0xEE, 0x5A, 0xEE, 0x5B, 0xEE, 0x5C, /* 0x28-0x2B */
+	0xEE, 0x5D, 0xEE, 0x5E, 0xEE, 0x5F, 0xEE, 0x60, /* 0x2C-0x2F */
+	0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x64, /* 0x30-0x33 */
+	0xEE, 0x65, 0xEE, 0x66, 0xEE, 0x67, 0xEE, 0x68, /* 0x34-0x37 */
+	0xEE, 0x69, 0xEE, 0x6A, 0xEE, 0x6B, 0xEE, 0x6C, /* 0x38-0x3B */
+	0xEE, 0x6D, 0xEE, 0x6E, 0xEE, 0x6F, 0xEE, 0x70, /* 0x3C-0x3F */
+	0xEE, 0x71, 0xEE, 0x72, 0xEE, 0x73, 0xEE, 0x74, /* 0x40-0x43 */
+	0xEE, 0x75, 0xEE, 0x76, 0xEE, 0x77, 0xEE, 0x78, /* 0x44-0x47 */
+	0xEE, 0x79, 0xEE, 0x7A, 0xEE, 0x7B, 0xEE, 0x7C, /* 0x48-0x4B */
+	0xEE, 0x7D, 0xEE, 0x7E, 0xEE, 0x80, 0xEE, 0x81, /* 0x4C-0x4F */
+	0xEE, 0x82, 0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x85, /* 0x50-0x53 */
+	0xEE, 0x86, 0xEE, 0x87, 0xEE, 0x88, 0xEE, 0x89, /* 0x54-0x57 */
+	0xEE, 0x8A, 0xEE, 0x8B, 0xEE, 0x8C, 0xEE, 0x8D, /* 0x58-0x5B */
+	0xEE, 0x8E, 0xEE, 0x8F, 0xEE, 0x90, 0xEE, 0x91, /* 0x5C-0x5F */
+	0xEE, 0x92, 0xEE, 0x93, 0xEE, 0x94, 0xEE, 0x95, /* 0x60-0x63 */
+	0xEE, 0x96, 0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x99, /* 0x64-0x67 */
+	0xEE, 0x9A, 0xEE, 0x9B, 0xEE, 0x9C, 0xEE, 0x9D, /* 0x68-0x6B */
+	0xEE, 0x9E, 0xEE, 0x9F, 0xEE, 0xA0, 0xEF, 0x40, /* 0x6C-0x6F */
+	0xEF, 0x41, 0xEF, 0x42, 0xEF, 0x43, 0xEF, 0x44, /* 0x70-0x73 */
+	0xEF, 0x45, 0xD2, 0xB3, 0xB6, 0xA5, 0xC7, 0xEA, /* 0x74-0x77 */
+	0xF1, 0xFC, 0xCF, 0xEE, 0xCB, 0xB3, 0xD0, 0xEB, /* 0x78-0x7B */
+	0xE7, 0xEF, 0xCD, 0xE7, 0xB9, 0xCB, 0xB6, 0xD9, /* 0x7C-0x7F */
+	
+	0xF1, 0xFD, 0xB0, 0xE4, 0xCB, 0xCC, 0xF1, 0xFE, /* 0x80-0x83 */
+	0xD4, 0xA4, 0xC2, 0xAD, 0xC1, 0xEC, 0xC6, 0xC4, /* 0x84-0x87 */
+	0xBE, 0xB1, 0xF2, 0xA1, 0xBC, 0xD5, 0xEF, 0x46, /* 0x88-0x8B */
+	0xF2, 0xA2, 0xF2, 0xA3, 0xEF, 0x47, 0xF2, 0xA4, /* 0x8C-0x8F */
+	0xD2, 0xC3, 0xC6, 0xB5, 0xEF, 0x48, 0xCD, 0xC7, /* 0x90-0x93 */
+	0xF2, 0xA5, 0xEF, 0x49, 0xD3, 0xB1, 0xBF, 0xC5, /* 0x94-0x97 */
+	0xCC, 0xE2, 0xEF, 0x4A, 0xF2, 0xA6, 0xF2, 0xA7, /* 0x98-0x9B */
+	0xD1, 0xD5, 0xB6, 0xEE, 0xF2, 0xA8, 0xF2, 0xA9, /* 0x9C-0x9F */
+	0xB5, 0xDF, 0xF2, 0xAA, 0xF2, 0xAB, 0xEF, 0x4B, /* 0xA0-0xA3 */
+	0xB2, 0xFC, 0xF2, 0xAC, 0xF2, 0xAD, 0xC8, 0xA7, /* 0xA4-0xA7 */
+	0xEF, 0x4C, 0xEF, 0x4D, 0xEF, 0x4E, 0xEF, 0x4F, /* 0xA8-0xAB */
+	0xEF, 0x50, 0xEF, 0x51, 0xEF, 0x52, 0xEF, 0x53, /* 0xAC-0xAF */
+	0xEF, 0x54, 0xEF, 0x55, 0xEF, 0x56, 0xEF, 0x57, /* 0xB0-0xB3 */
+	0xEF, 0x58, 0xEF, 0x59, 0xEF, 0x5A, 0xEF, 0x5B, /* 0xB4-0xB7 */
+	0xEF, 0x5C, 0xEF, 0x5D, 0xEF, 0x5E, 0xEF, 0x5F, /* 0xB8-0xBB */
+	0xEF, 0x60, 0xEF, 0x61, 0xEF, 0x62, 0xEF, 0x63, /* 0xBC-0xBF */
+	0xEF, 0x64, 0xEF, 0x65, 0xEF, 0x66, 0xEF, 0x67, /* 0xC0-0xC3 */
+	0xEF, 0x68, 0xEF, 0x69, 0xEF, 0x6A, 0xEF, 0x6B, /* 0xC4-0xC7 */
+	0xEF, 0x6C, 0xEF, 0x6D, 0xEF, 0x6E, 0xEF, 0x6F, /* 0xC8-0xCB */
+	0xEF, 0x70, 0xEF, 0x71, 0xB7, 0xE7, 0xEF, 0x72, /* 0xCC-0xCF */
+	0xEF, 0x73, 0xEC, 0xA9, 0xEC, 0xAA, 0xEC, 0xAB, /* 0xD0-0xD3 */
+	0xEF, 0x74, 0xEC, 0xAC, 0xEF, 0x75, 0xEF, 0x76, /* 0xD4-0xD7 */
+	0xC6, 0xAE, 0xEC, 0xAD, 0xEC, 0xAE, 0xEF, 0x77, /* 0xD8-0xDB */
+	0xEF, 0x78, 0xEF, 0x79, 0xB7, 0xC9, 0xCA, 0xB3, /* 0xDC-0xDF */
+	0xEF, 0x7A, 0xEF, 0x7B, 0xEF, 0x7C, 0xEF, 0x7D, /* 0xE0-0xE3 */
+	0xEF, 0x7E, 0xEF, 0x80, 0xEF, 0x81, 0xE2, 0xB8, /* 0xE4-0xE7 */
+	0xF7, 0xCF, 0xEF, 0x82, 0xEF, 0x83, 0xEF, 0x84, /* 0xE8-0xEB */
+	0xEF, 0x85, 0xEF, 0x86, 0xEF, 0x87, 0xEF, 0x88, /* 0xEC-0xEF */
+	0xEF, 0x89, 0xEF, 0x8A, 0xEF, 0x8B, 0xEF, 0x8C, /* 0xF0-0xF3 */
+	0xEF, 0x8D, 0xEF, 0x8E, 0xEF, 0x8F, 0xEF, 0x90, /* 0xF4-0xF7 */
+	0xEF, 0x91, 0xEF, 0x92, 0xEF, 0x93, 0xEF, 0x94, /* 0xF8-0xFB */
+	0xEF, 0x95, 0xEF, 0x96, 0xEF, 0x97, 0xEF, 0x98, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_99[512] = {
+	0xEF, 0x99, 0xEF, 0x9A, 0xEF, 0x9B, 0xEF, 0x9C, /* 0x00-0x03 */
+	0xEF, 0x9D, 0xEF, 0x9E, 0xEF, 0x9F, 0xEF, 0xA0, /* 0x04-0x07 */
+	0xF0, 0x40, 0xF0, 0x41, 0xF0, 0x42, 0xF0, 0x43, /* 0x08-0x0B */
+	0xF0, 0x44, 0xF7, 0xD0, 0xF0, 0x45, 0xF0, 0x46, /* 0x0C-0x0F */
+	0xB2, 0xCD, 0xF0, 0x47, 0xF0, 0x48, 0xF0, 0x49, /* 0x10-0x13 */
+	0xF0, 0x4A, 0xF0, 0x4B, 0xF0, 0x4C, 0xF0, 0x4D, /* 0x14-0x17 */
+	0xF0, 0x4E, 0xF0, 0x4F, 0xF0, 0x50, 0xF0, 0x51, /* 0x18-0x1B */
+	0xF0, 0x52, 0xF0, 0x53, 0xF0, 0x54, 0xF0, 0x55, /* 0x1C-0x1F */
+	0xF0, 0x56, 0xF0, 0x57, 0xF0, 0x58, 0xF0, 0x59, /* 0x20-0x23 */
+	0xF0, 0x5A, 0xF0, 0x5B, 0xF0, 0x5C, 0xF0, 0x5D, /* 0x24-0x27 */
+	0xF0, 0x5E, 0xF0, 0x5F, 0xF0, 0x60, 0xF0, 0x61, /* 0x28-0x2B */
+	0xF0, 0x62, 0xF0, 0x63, 0xF7, 0xD1, 0xF0, 0x64, /* 0x2C-0x2F */
+	0xF0, 0x65, 0xF0, 0x66, 0xF0, 0x67, 0xF0, 0x68, /* 0x30-0x33 */
+	0xF0, 0x69, 0xF0, 0x6A, 0xF0, 0x6B, 0xF0, 0x6C, /* 0x34-0x37 */
+	0xF0, 0x6D, 0xF0, 0x6E, 0xF0, 0x6F, 0xF0, 0x70, /* 0x38-0x3B */
+	0xF0, 0x71, 0xF0, 0x72, 0xF0, 0x73, 0xF0, 0x74, /* 0x3C-0x3F */
+	0xF0, 0x75, 0xF0, 0x76, 0xF0, 0x77, 0xF0, 0x78, /* 0x40-0x43 */
+	0xF0, 0x79, 0xF0, 0x7A, 0xF0, 0x7B, 0xF0, 0x7C, /* 0x44-0x47 */
+	0xF0, 0x7D, 0xF0, 0x7E, 0xF0, 0x80, 0xF0, 0x81, /* 0x48-0x4B */
+	0xF0, 0x82, 0xF0, 0x83, 0xF0, 0x84, 0xF0, 0x85, /* 0x4C-0x4F */
+	0xF0, 0x86, 0xF0, 0x87, 0xF0, 0x88, 0xF0, 0x89, /* 0x50-0x53 */
+	0xF7, 0xD3, 0xF7, 0xD2, 0xF0, 0x8A, 0xF0, 0x8B, /* 0x54-0x57 */
+	0xF0, 0x8C, 0xF0, 0x8D, 0xF0, 0x8E, 0xF0, 0x8F, /* 0x58-0x5B */
+	0xF0, 0x90, 0xF0, 0x91, 0xF0, 0x92, 0xF0, 0x93, /* 0x5C-0x5F */
+	0xF0, 0x94, 0xF0, 0x95, 0xF0, 0x96, 0xE2, 0xBB, /* 0x60-0x63 */
+	0xF0, 0x97, 0xBC, 0xA2, 0xF0, 0x98, 0xE2, 0xBC, /* 0x64-0x67 */
+	0xE2, 0xBD, 0xE2, 0xBE, 0xE2, 0xBF, 0xE2, 0xC0, /* 0x68-0x6B */
+	0xE2, 0xC1, 0xB7, 0xB9, 0xD2, 0xFB, 0xBD, 0xA4, /* 0x6C-0x6F */
+	0xCA, 0xCE, 0xB1, 0xA5, 0xCB, 0xC7, 0xF0, 0x99, /* 0x70-0x73 */
+	0xE2, 0xC2, 0xB6, 0xFC, 0xC8, 0xC4, 0xE2, 0xC3, /* 0x74-0x77 */
+	0xF0, 0x9A, 0xF0, 0x9B, 0xBD, 0xC8, 0xF0, 0x9C, /* 0x78-0x7B */
+	0xB1, 0xFD, 0xE2, 0xC4, 0xF0, 0x9D, 0xB6, 0xF6, /* 0x7C-0x7F */
+	
+	0xE2, 0xC5, 0xC4, 0xD9, 0xF0, 0x9E, 0xF0, 0x9F, /* 0x80-0x83 */
+	0xE2, 0xC6, 0xCF, 0xDA, 0xB9, 0xDD, 0xE2, 0xC7, /* 0x84-0x87 */
+	0xC0, 0xA1, 0xF0, 0xA0, 0xE2, 0xC8, 0xB2, 0xF6, /* 0x88-0x8B */
+	0xF1, 0x40, 0xE2, 0xC9, 0xF1, 0x41, 0xC1, 0xF3, /* 0x8C-0x8F */
+	0xE2, 0xCA, 0xE2, 0xCB, 0xC2, 0xF8, 0xE2, 0xCC, /* 0x90-0x93 */
+	0xE2, 0xCD, 0xE2, 0xCE, 0xCA, 0xD7, 0xD8, 0xB8, /* 0x94-0x97 */
+	0xD9, 0xE5, 0xCF, 0xE3, 0xF1, 0x42, 0xF1, 0x43, /* 0x98-0x9B */
+	0xF1, 0x44, 0xF1, 0x45, 0xF1, 0x46, 0xF1, 0x47, /* 0x9C-0x9F */
+	0xF1, 0x48, 0xF1, 0x49, 0xF1, 0x4A, 0xF1, 0x4B, /* 0xA0-0xA3 */
+	0xF1, 0x4C, 0xF0, 0xA5, 0xF1, 0x4D, 0xF1, 0x4E, /* 0xA4-0xA7 */
+	0xDC, 0xB0, 0xF1, 0x4F, 0xF1, 0x50, 0xF1, 0x51, /* 0xA8-0xAB */
+	0xF1, 0x52, 0xF1, 0x53, 0xF1, 0x54, 0xF1, 0x55, /* 0xAC-0xAF */
+	0xF1, 0x56, 0xF1, 0x57, 0xF1, 0x58, 0xF1, 0x59, /* 0xB0-0xB3 */
+	0xF1, 0x5A, 0xF1, 0x5B, 0xF1, 0x5C, 0xF1, 0x5D, /* 0xB4-0xB7 */
+	0xF1, 0x5E, 0xF1, 0x5F, 0xF1, 0x60, 0xF1, 0x61, /* 0xB8-0xBB */
+	0xF1, 0x62, 0xF1, 0x63, 0xF1, 0x64, 0xF1, 0x65, /* 0xBC-0xBF */
+	0xF1, 0x66, 0xF1, 0x67, 0xF1, 0x68, 0xF1, 0x69, /* 0xC0-0xC3 */
+	0xF1, 0x6A, 0xF1, 0x6B, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xC4-0xC7 */
+	0xF1, 0x6E, 0xF1, 0x6F, 0xF1, 0x70, 0xF1, 0x71, /* 0xC8-0xCB */
+	0xF1, 0x72, 0xF1, 0x73, 0xF1, 0x74, 0xF1, 0x75, /* 0xCC-0xCF */
+	0xF1, 0x76, 0xF1, 0x77, 0xF1, 0x78, 0xF1, 0x79, /* 0xD0-0xD3 */
+	0xF1, 0x7A, 0xF1, 0x7B, 0xF1, 0x7C, 0xF1, 0x7D, /* 0xD4-0xD7 */
+	0xF1, 0x7E, 0xF1, 0x80, 0xF1, 0x81, 0xF1, 0x82, /* 0xD8-0xDB */
+	0xF1, 0x83, 0xF1, 0x84, 0xF1, 0x85, 0xF1, 0x86, /* 0xDC-0xDF */
+	0xF1, 0x87, 0xF1, 0x88, 0xF1, 0x89, 0xF1, 0x8A, /* 0xE0-0xE3 */
+	0xF1, 0x8B, 0xF1, 0x8C, 0xF1, 0x8D, 0xF1, 0x8E, /* 0xE4-0xE7 */
+	0xF1, 0x8F, 0xF1, 0x90, 0xF1, 0x91, 0xF1, 0x92, /* 0xE8-0xEB */
+	0xF1, 0x93, 0xF1, 0x94, 0xF1, 0x95, 0xF1, 0x96, /* 0xEC-0xEF */
+	0xF1, 0x97, 0xF1, 0x98, 0xF1, 0x99, 0xF1, 0x9A, /* 0xF0-0xF3 */
+	0xF1, 0x9B, 0xF1, 0x9C, 0xF1, 0x9D, 0xF1, 0x9E, /* 0xF4-0xF7 */
+	0xF1, 0x9F, 0xF1, 0xA0, 0xF2, 0x40, 0xF2, 0x41, /* 0xF8-0xFB */
+	0xF2, 0x42, 0xF2, 0x43, 0xF2, 0x44, 0xF2, 0x45, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9A[512] = {
+	0xF2, 0x46, 0xF2, 0x47, 0xF2, 0x48, 0xF2, 0x49, /* 0x00-0x03 */
+	0xF2, 0x4A, 0xF2, 0x4B, 0xF2, 0x4C, 0xF2, 0x4D, /* 0x04-0x07 */
+	0xF2, 0x4E, 0xF2, 0x4F, 0xF2, 0x50, 0xF2, 0x51, /* 0x08-0x0B */
+	0xF2, 0x52, 0xF2, 0x53, 0xF2, 0x54, 0xF2, 0x55, /* 0x0C-0x0F */
+	0xF2, 0x56, 0xF2, 0x57, 0xF2, 0x58, 0xF2, 0x59, /* 0x10-0x13 */
+	0xF2, 0x5A, 0xF2, 0x5B, 0xF2, 0x5C, 0xF2, 0x5D, /* 0x14-0x17 */
+	0xF2, 0x5E, 0xF2, 0x5F, 0xF2, 0x60, 0xF2, 0x61, /* 0x18-0x1B */
+	0xF2, 0x62, 0xF2, 0x63, 0xF2, 0x64, 0xF2, 0x65, /* 0x1C-0x1F */
+	0xF2, 0x66, 0xF2, 0x67, 0xF2, 0x68, 0xF2, 0x69, /* 0x20-0x23 */
+	0xF2, 0x6A, 0xF2, 0x6B, 0xF2, 0x6C, 0xF2, 0x6D, /* 0x24-0x27 */
+	0xF2, 0x6E, 0xF2, 0x6F, 0xF2, 0x70, 0xF2, 0x71, /* 0x28-0x2B */
+	0xF2, 0x72, 0xF2, 0x73, 0xF2, 0x74, 0xF2, 0x75, /* 0x2C-0x2F */
+	0xF2, 0x76, 0xF2, 0x77, 0xF2, 0x78, 0xF2, 0x79, /* 0x30-0x33 */
+	0xF2, 0x7A, 0xF2, 0x7B, 0xF2, 0x7C, 0xF2, 0x7D, /* 0x34-0x37 */
+	0xF2, 0x7E, 0xF2, 0x80, 0xF2, 0x81, 0xF2, 0x82, /* 0x38-0x3B */
+	0xF2, 0x83, 0xF2, 0x84, 0xF2, 0x85, 0xF2, 0x86, /* 0x3C-0x3F */
+	0xF2, 0x87, 0xF2, 0x88, 0xF2, 0x89, 0xF2, 0x8A, /* 0x40-0x43 */
+	0xF2, 0x8B, 0xF2, 0x8C, 0xF2, 0x8D, 0xF2, 0x8E, /* 0x44-0x47 */
+	0xF2, 0x8F, 0xF2, 0x90, 0xF2, 0x91, 0xF2, 0x92, /* 0x48-0x4B */
+	0xF2, 0x93, 0xF2, 0x94, 0xF2, 0x95, 0xF2, 0x96, /* 0x4C-0x4F */
+	0xF2, 0x97, 0xF2, 0x98, 0xF2, 0x99, 0xF2, 0x9A, /* 0x50-0x53 */
+	0xF2, 0x9B, 0xF2, 0x9C, 0xF2, 0x9D, 0xF2, 0x9E, /* 0x54-0x57 */
+	0xF2, 0x9F, 0xF2, 0xA0, 0xF3, 0x40, 0xF3, 0x41, /* 0x58-0x5B */
+	0xF3, 0x42, 0xF3, 0x43, 0xF3, 0x44, 0xF3, 0x45, /* 0x5C-0x5F */
+	0xF3, 0x46, 0xF3, 0x47, 0xF3, 0x48, 0xF3, 0x49, /* 0x60-0x63 */
+	0xF3, 0x4A, 0xF3, 0x4B, 0xF3, 0x4C, 0xF3, 0x4D, /* 0x64-0x67 */
+	0xF3, 0x4E, 0xF3, 0x4F, 0xF3, 0x50, 0xF3, 0x51, /* 0x68-0x6B */
+	0xC2, 0xED, 0xD4, 0xA6, 0xCD, 0xD4, 0xD1, 0xB1, /* 0x6C-0x6F */
+	0xB3, 0xDB, 0xC7, 0xFD, 0xF3, 0x52, 0xB2, 0xB5, /* 0x70-0x73 */
+	0xC2, 0xBF, 0xE6, 0xE0, 0xCA, 0xBB, 0xE6, 0xE1, /* 0x74-0x77 */
+	0xE6, 0xE2, 0xBE, 0xD4, 0xE6, 0xE3, 0xD7, 0xA4, /* 0x78-0x7B */
+	0xCD, 0xD5, 0xE6, 0xE5, 0xBC, 0xDD, 0xE6, 0xE4, /* 0x7C-0x7F */
+	
+	0xE6, 0xE6, 0xE6, 0xE7, 0xC2, 0xEE, 0xF3, 0x53, /* 0x80-0x83 */
+	0xBD, 0xBE, 0xE6, 0xE8, 0xC2, 0xE6, 0xBA, 0xA7, /* 0x84-0x87 */
+	0xE6, 0xE9, 0xF3, 0x54, 0xE6, 0xEA, 0xB3, 0xD2, /* 0x88-0x8B */
+	0xD1, 0xE9, 0xF3, 0x55, 0xF3, 0x56, 0xBF, 0xA5, /* 0x8C-0x8F */
+	0xE6, 0xEB, 0xC6, 0xEF, 0xE6, 0xEC, 0xE6, 0xED, /* 0x90-0x93 */
+	0xF3, 0x57, 0xF3, 0x58, 0xE6, 0xEE, 0xC6, 0xAD, /* 0x94-0x97 */
+	0xE6, 0xEF, 0xF3, 0x59, 0xC9, 0xA7, 0xE6, 0xF0, /* 0x98-0x9B */
+	0xE6, 0xF1, 0xE6, 0xF2, 0xE5, 0xB9, 0xE6, 0xF3, /* 0x9C-0x9F */
+	0xE6, 0xF4, 0xC2, 0xE2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */
+	0xD6, 0xE8, 0xE6, 0xF7, 0xF3, 0x5A, 0xE6, 0xF8, /* 0xA4-0xA7 */
+	0xB9, 0xC7, 0xF3, 0x5B, 0xF3, 0x5C, 0xF3, 0x5D, /* 0xA8-0xAB */
+	0xF3, 0x5E, 0xF3, 0x5F, 0xF3, 0x60, 0xF3, 0x61, /* 0xAC-0xAF */
+	0xF7, 0xBB, 0xF7, 0xBA, 0xF3, 0x62, 0xF3, 0x63, /* 0xB0-0xB3 */
+	0xF3, 0x64, 0xF3, 0x65, 0xF7, 0xBE, 0xF7, 0xBC, /* 0xB4-0xB7 */
+	0xBA, 0xA1, 0xF3, 0x66, 0xF7, 0xBF, 0xF3, 0x67, /* 0xB8-0xBB */
+	0xF7, 0xC0, 0xF3, 0x68, 0xF3, 0x69, 0xF3, 0x6A, /* 0xBC-0xBF */
+	0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xC4, 0xF3, 0x6B, /* 0xC0-0xC3 */
+	0xF3, 0x6C, 0xF7, 0xC3, 0xF3, 0x6D, 0xF3, 0x6E, /* 0xC4-0xC7 */
+	0xF3, 0x6F, 0xF3, 0x70, 0xF3, 0x71, 0xF7, 0xC5, /* 0xC8-0xCB */
+	0xF7, 0xC6, 0xF3, 0x72, 0xF3, 0x73, 0xF3, 0x74, /* 0xCC-0xCF */
+	0xF3, 0x75, 0xF7, 0xC7, 0xF3, 0x76, 0xCB, 0xE8, /* 0xD0-0xD3 */
+	0xF3, 0x77, 0xF3, 0x78, 0xF3, 0x79, 0xF3, 0x7A, /* 0xD4-0xD7 */
+	0xB8, 0xDF, 0xF3, 0x7B, 0xF3, 0x7C, 0xF3, 0x7D, /* 0xD8-0xDB */
+	0xF3, 0x7E, 0xF3, 0x80, 0xF3, 0x81, 0xF7, 0xD4, /* 0xDC-0xDF */
+	0xF3, 0x82, 0xF7, 0xD5, 0xF3, 0x83, 0xF3, 0x84, /* 0xE0-0xE3 */
+	0xF3, 0x85, 0xF3, 0x86, 0xF7, 0xD6, 0xF3, 0x87, /* 0xE4-0xE7 */
+	0xF3, 0x88, 0xF3, 0x89, 0xF3, 0x8A, 0xF7, 0xD8, /* 0xE8-0xEB */
+	0xF3, 0x8B, 0xF7, 0xDA, 0xF3, 0x8C, 0xF7, 0xD7, /* 0xEC-0xEF */
+	0xF3, 0x8D, 0xF3, 0x8E, 0xF3, 0x8F, 0xF3, 0x90, /* 0xF0-0xF3 */
+	0xF3, 0x91, 0xF3, 0x92, 0xF3, 0x93, 0xF3, 0x94, /* 0xF4-0xF7 */
+	0xF3, 0x95, 0xF7, 0xDB, 0xF3, 0x96, 0xF7, 0xD9, /* 0xF8-0xFB */
+	0xF3, 0x97, 0xF3, 0x98, 0xF3, 0x99, 0xF3, 0x9A, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9B[512] = {
+	0xF3, 0x9B, 0xF3, 0x9C, 0xF3, 0x9D, 0xD7, 0xD7, /* 0x00-0x03 */
+	0xF3, 0x9E, 0xF3, 0x9F, 0xF3, 0xA0, 0xF4, 0x40, /* 0x04-0x07 */
+	0xF7, 0xDC, 0xF4, 0x41, 0xF4, 0x42, 0xF4, 0x43, /* 0x08-0x0B */
+	0xF4, 0x44, 0xF4, 0x45, 0xF4, 0x46, 0xF7, 0xDD, /* 0x0C-0x0F */
+	0xF4, 0x47, 0xF4, 0x48, 0xF4, 0x49, 0xF7, 0xDE, /* 0x10-0x13 */
+	0xF4, 0x4A, 0xF4, 0x4B, 0xF4, 0x4C, 0xF4, 0x4D, /* 0x14-0x17 */
+	0xF4, 0x4E, 0xF4, 0x4F, 0xF4, 0x50, 0xF4, 0x51, /* 0x18-0x1B */
+	0xF4, 0x52, 0xF4, 0x53, 0xF4, 0x54, 0xF7, 0xDF, /* 0x1C-0x1F */
+	0xF4, 0x55, 0xF4, 0x56, 0xF4, 0x57, 0xF7, 0xE0, /* 0x20-0x23 */
+	0xF4, 0x58, 0xF4, 0x59, 0xF4, 0x5A, 0xF4, 0x5B, /* 0x24-0x27 */
+	0xF4, 0x5C, 0xF4, 0x5D, 0xF4, 0x5E, 0xF4, 0x5F, /* 0x28-0x2B */
+	0xF4, 0x60, 0xF4, 0x61, 0xF4, 0x62, 0xDB, 0xCB, /* 0x2C-0x2F */
+	0xF4, 0x63, 0xF4, 0x64, 0xD8, 0xAA, 0xF4, 0x65, /* 0x30-0x33 */
+	0xF4, 0x66, 0xF4, 0x67, 0xF4, 0x68, 0xF4, 0x69, /* 0x34-0x37 */
+	0xF4, 0x6A, 0xF4, 0x6B, 0xF4, 0x6C, 0xE5, 0xF7, /* 0x38-0x3B */
+	0xB9, 0xED, 0xF4, 0x6D, 0xF4, 0x6E, 0xF4, 0x6F, /* 0x3C-0x3F */
+	0xF4, 0x70, 0xBF, 0xFD, 0xBB, 0xEA, 0xF7, 0xC9, /* 0x40-0x43 */
+	0xC6, 0xC7, 0xF7, 0xC8, 0xF4, 0x71, 0xF7, 0xCA, /* 0x44-0x47 */
+	0xF7, 0xCC, 0xF7, 0xCB, 0xF4, 0x72, 0xF4, 0x73, /* 0x48-0x4B */
+	0xF4, 0x74, 0xF7, 0xCD, 0xF4, 0x75, 0xCE, 0xBA, /* 0x4C-0x4F */
+	0xF4, 0x76, 0xF7, 0xCE, 0xF4, 0x77, 0xF4, 0x78, /* 0x50-0x53 */
+	0xC4, 0xA7, 0xF4, 0x79, 0xF4, 0x7A, 0xF4, 0x7B, /* 0x54-0x57 */
+	0xF4, 0x7C, 0xF4, 0x7D, 0xF4, 0x7E, 0xF4, 0x80, /* 0x58-0x5B */
+	0xF4, 0x81, 0xF4, 0x82, 0xF4, 0x83, 0xF4, 0x84, /* 0x5C-0x5F */
+	0xF4, 0x85, 0xF4, 0x86, 0xF4, 0x87, 0xF4, 0x88, /* 0x60-0x63 */
+	0xF4, 0x89, 0xF4, 0x8A, 0xF4, 0x8B, 0xF4, 0x8C, /* 0x64-0x67 */
+	0xF4, 0x8D, 0xF4, 0x8E, 0xF4, 0x8F, 0xF4, 0x90, /* 0x68-0x6B */
+	0xF4, 0x91, 0xF4, 0x92, 0xF4, 0x93, 0xF4, 0x94, /* 0x6C-0x6F */
+	0xF4, 0x95, 0xF4, 0x96, 0xF4, 0x97, 0xF4, 0x98, /* 0x70-0x73 */
+	0xF4, 0x99, 0xF4, 0x9A, 0xF4, 0x9B, 0xF4, 0x9C, /* 0x74-0x77 */
+	0xF4, 0x9D, 0xF4, 0x9E, 0xF4, 0x9F, 0xF4, 0xA0, /* 0x78-0x7B */
+	0xF5, 0x40, 0xF5, 0x41, 0xF5, 0x42, 0xF5, 0x43, /* 0x7C-0x7F */
+	
+	0xF5, 0x44, 0xF5, 0x45, 0xF5, 0x46, 0xF5, 0x47, /* 0x80-0x83 */
+	0xF5, 0x48, 0xF5, 0x49, 0xF5, 0x4A, 0xF5, 0x4B, /* 0x84-0x87 */
+	0xF5, 0x4C, 0xF5, 0x4D, 0xF5, 0x4E, 0xF5, 0x4F, /* 0x88-0x8B */
+	0xF5, 0x50, 0xF5, 0x51, 0xF5, 0x52, 0xF5, 0x53, /* 0x8C-0x8F */
+	0xF5, 0x54, 0xF5, 0x55, 0xF5, 0x56, 0xF5, 0x57, /* 0x90-0x93 */
+	0xF5, 0x58, 0xF5, 0x59, 0xF5, 0x5A, 0xF5, 0x5B, /* 0x94-0x97 */
+	0xF5, 0x5C, 0xF5, 0x5D, 0xF5, 0x5E, 0xF5, 0x5F, /* 0x98-0x9B */
+	0xF5, 0x60, 0xF5, 0x61, 0xF5, 0x62, 0xF5, 0x63, /* 0x9C-0x9F */
+	0xF5, 0x64, 0xF5, 0x65, 0xF5, 0x66, 0xF5, 0x67, /* 0xA0-0xA3 */
+	0xF5, 0x68, 0xF5, 0x69, 0xF5, 0x6A, 0xF5, 0x6B, /* 0xA4-0xA7 */
+	0xF5, 0x6C, 0xF5, 0x6D, 0xF5, 0x6E, 0xF5, 0x6F, /* 0xA8-0xAB */
+	0xF5, 0x70, 0xF5, 0x71, 0xF5, 0x72, 0xF5, 0x73, /* 0xAC-0xAF */
+	0xF5, 0x74, 0xF5, 0x75, 0xF5, 0x76, 0xF5, 0x77, /* 0xB0-0xB3 */
+	0xF5, 0x78, 0xF5, 0x79, 0xF5, 0x7A, 0xF5, 0x7B, /* 0xB4-0xB7 */
+	0xF5, 0x7C, 0xF5, 0x7D, 0xF5, 0x7E, 0xF5, 0x80, /* 0xB8-0xBB */
+	0xF5, 0x81, 0xF5, 0x82, 0xF5, 0x83, 0xF5, 0x84, /* 0xBC-0xBF */
+	0xF5, 0x85, 0xF5, 0x86, 0xF5, 0x87, 0xF5, 0x88, /* 0xC0-0xC3 */
+	0xF5, 0x89, 0xF5, 0x8A, 0xF5, 0x8B, 0xF5, 0x8C, /* 0xC4-0xC7 */
+	0xF5, 0x8D, 0xF5, 0x8E, 0xF5, 0x8F, 0xF5, 0x90, /* 0xC8-0xCB */
+	0xF5, 0x91, 0xF5, 0x92, 0xF5, 0x93, 0xF5, 0x94, /* 0xCC-0xCF */
+	0xF5, 0x95, 0xF5, 0x96, 0xF5, 0x97, 0xF5, 0x98, /* 0xD0-0xD3 */
+	0xF5, 0x99, 0xF5, 0x9A, 0xF5, 0x9B, 0xF5, 0x9C, /* 0xD4-0xD7 */
+	0xF5, 0x9D, 0xF5, 0x9E, 0xF5, 0x9F, 0xF5, 0xA0, /* 0xD8-0xDB */
+	0xF6, 0x40, 0xF6, 0x41, 0xF6, 0x42, 0xF6, 0x43, /* 0xDC-0xDF */
+	0xF6, 0x44, 0xF6, 0x45, 0xF6, 0x46, 0xF6, 0x47, /* 0xE0-0xE3 */
+	0xF6, 0x48, 0xF6, 0x49, 0xF6, 0x4A, 0xF6, 0x4B, /* 0xE4-0xE7 */
+	0xF6, 0x4C, 0xF6, 0x4D, 0xF6, 0x4E, 0xF6, 0x4F, /* 0xE8-0xEB */
+	0xF6, 0x50, 0xF6, 0x51, 0xF6, 0x52, 0xF6, 0x53, /* 0xEC-0xEF */
+	0xF6, 0x54, 0xF6, 0x55, 0xF6, 0x56, 0xF6, 0x57, /* 0xF0-0xF3 */
+	0xF6, 0x58, 0xF6, 0x59, 0xF6, 0x5A, 0xF6, 0x5B, /* 0xF4-0xF7 */
+	0xF6, 0x5C, 0xF6, 0x5D, 0xF6, 0x5E, 0xF6, 0x5F, /* 0xF8-0xFB */
+	0xF6, 0x60, 0xF6, 0x61, 0xF6, 0x62, 0xF6, 0x63, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9C[512] = {
+	0xF6, 0x64, 0xF6, 0x65, 0xF6, 0x66, 0xF6, 0x67, /* 0x00-0x03 */
+	0xF6, 0x68, 0xF6, 0x69, 0xF6, 0x6A, 0xF6, 0x6B, /* 0x04-0x07 */
+	0xF6, 0x6C, 0xF6, 0x6D, 0xF6, 0x6E, 0xF6, 0x6F, /* 0x08-0x0B */
+	0xF6, 0x70, 0xF6, 0x71, 0xF6, 0x72, 0xF6, 0x73, /* 0x0C-0x0F */
+	0xF6, 0x74, 0xF6, 0x75, 0xF6, 0x76, 0xF6, 0x77, /* 0x10-0x13 */
+	0xF6, 0x78, 0xF6, 0x79, 0xF6, 0x7A, 0xF6, 0x7B, /* 0x14-0x17 */
+	0xF6, 0x7C, 0xF6, 0x7D, 0xF6, 0x7E, 0xF6, 0x80, /* 0x18-0x1B */
+	0xF6, 0x81, 0xF6, 0x82, 0xF6, 0x83, 0xF6, 0x84, /* 0x1C-0x1F */
+	0xF6, 0x85, 0xF6, 0x86, 0xF6, 0x87, 0xF6, 0x88, /* 0x20-0x23 */
+	0xF6, 0x89, 0xF6, 0x8A, 0xF6, 0x8B, 0xF6, 0x8C, /* 0x24-0x27 */
+	0xF6, 0x8D, 0xF6, 0x8E, 0xF6, 0x8F, 0xF6, 0x90, /* 0x28-0x2B */
+	0xF6, 0x91, 0xF6, 0x92, 0xF6, 0x93, 0xF6, 0x94, /* 0x2C-0x2F */
+	0xF6, 0x95, 0xF6, 0x96, 0xF6, 0x97, 0xF6, 0x98, /* 0x30-0x33 */
+	0xF6, 0x99, 0xF6, 0x9A, 0xF6, 0x9B, 0xF6, 0x9C, /* 0x34-0x37 */
+	0xF6, 0x9D, 0xF6, 0x9E, 0xF6, 0x9F, 0xF6, 0xA0, /* 0x38-0x3B */
+	0xF7, 0x40, 0xF7, 0x41, 0xF7, 0x42, 0xF7, 0x43, /* 0x3C-0x3F */
+	0xF7, 0x44, 0xF7, 0x45, 0xF7, 0x46, 0xF7, 0x47, /* 0x40-0x43 */
+	0xF7, 0x48, 0xF7, 0x49, 0xF7, 0x4A, 0xF7, 0x4B, /* 0x44-0x47 */
+	0xF7, 0x4C, 0xF7, 0x4D, 0xF7, 0x4E, 0xF7, 0x4F, /* 0x48-0x4B */
+	0xF7, 0x50, 0xF7, 0x51, 0xF7, 0x52, 0xF7, 0x53, /* 0x4C-0x4F */
+	0xF7, 0x54, 0xF7, 0x55, 0xF7, 0x56, 0xF7, 0x57, /* 0x50-0x53 */
+	0xF7, 0x58, 0xF7, 0x59, 0xF7, 0x5A, 0xF7, 0x5B, /* 0x54-0x57 */
+	0xF7, 0x5C, 0xF7, 0x5D, 0xF7, 0x5E, 0xF7, 0x5F, /* 0x58-0x5B */
+	0xF7, 0x60, 0xF7, 0x61, 0xF7, 0x62, 0xF7, 0x63, /* 0x5C-0x5F */
+	0xF7, 0x64, 0xF7, 0x65, 0xF7, 0x66, 0xF7, 0x67, /* 0x60-0x63 */
+	0xF7, 0x68, 0xF7, 0x69, 0xF7, 0x6A, 0xF7, 0x6B, /* 0x64-0x67 */
+	0xF7, 0x6C, 0xF7, 0x6D, 0xF7, 0x6E, 0xF7, 0x6F, /* 0x68-0x6B */
+	0xF7, 0x70, 0xF7, 0x71, 0xF7, 0x72, 0xF7, 0x73, /* 0x6C-0x6F */
+	0xF7, 0x74, 0xF7, 0x75, 0xF7, 0x76, 0xF7, 0x77, /* 0x70-0x73 */
+	0xF7, 0x78, 0xF7, 0x79, 0xF7, 0x7A, 0xF7, 0x7B, /* 0x74-0x77 */
+	0xF7, 0x7C, 0xF7, 0x7D, 0xF7, 0x7E, 0xF7, 0x80, /* 0x78-0x7B */
+	0xD3, 0xE3, 0xF7, 0x81, 0xF7, 0x82, 0xF6, 0xCF, /* 0x7C-0x7F */
+	
+	0xF7, 0x83, 0xC2, 0xB3, 0xF6, 0xD0, 0xF7, 0x84, /* 0x80-0x83 */
+	0xF7, 0x85, 0xF6, 0xD1, 0xF6, 0xD2, 0xF6, 0xD3, /* 0x84-0x87 */
+	0xF6, 0xD4, 0xF7, 0x86, 0xF7, 0x87, 0xF6, 0xD6, /* 0x88-0x8B */
+	0xF7, 0x88, 0xB1, 0xAB, 0xF6, 0xD7, 0xF7, 0x89, /* 0x8C-0x8F */
+	0xF6, 0xD8, 0xF6, 0xD9, 0xF6, 0xDA, 0xF7, 0x8A, /* 0x90-0x93 */
+	0xF6, 0xDB, 0xF6, 0xDC, 0xF7, 0x8B, 0xF7, 0x8C, /* 0x94-0x97 */
+	0xF7, 0x8D, 0xF7, 0x8E, 0xF6, 0xDD, 0xF6, 0xDE, /* 0x98-0x9B */
+	0xCF, 0xCA, 0xF7, 0x8F, 0xF6, 0xDF, 0xF6, 0xE0, /* 0x9C-0x9F */
+	0xF6, 0xE1, 0xF6, 0xE2, 0xF6, 0xE3, 0xF6, 0xE4, /* 0xA0-0xA3 */
+	0xC0, 0xF0, 0xF6, 0xE5, 0xF6, 0xE6, 0xF6, 0xE7, /* 0xA4-0xA7 */
+	0xF6, 0xE8, 0xF6, 0xE9, 0xF7, 0x90, 0xF6, 0xEA, /* 0xA8-0xAB */
+	0xF7, 0x91, 0xF6, 0xEB, 0xF6, 0xEC, 0xF7, 0x92, /* 0xAC-0xAF */
+	0xF6, 0xED, 0xF6, 0xEE, 0xF6, 0xEF, 0xF6, 0xF0, /* 0xB0-0xB3 */
+	0xF6, 0xF1, 0xF6, 0xF2, 0xF6, 0xF3, 0xF6, 0xF4, /* 0xB4-0xB7 */
+	0xBE, 0xA8, 0xF7, 0x93, 0xF6, 0xF5, 0xF6, 0xF6, /* 0xB8-0xBB */
+	0xF6, 0xF7, 0xF6, 0xF8, 0xF7, 0x94, 0xF7, 0x95, /* 0xBC-0xBF */
+	0xF7, 0x96, 0xF7, 0x97, 0xF7, 0x98, 0xC8, 0xFA, /* 0xC0-0xC3 */
+	0xF6, 0xF9, 0xF6, 0xFA, 0xF6, 0xFB, 0xF6, 0xFC, /* 0xC4-0xC7 */
+	0xF7, 0x99, 0xF7, 0x9A, 0xF6, 0xFD, 0xF6, 0xFE, /* 0xC8-0xCB */
+	0xF7, 0xA1, 0xF7, 0xA2, 0xF7, 0xA3, 0xF7, 0xA4, /* 0xCC-0xCF */
+	0xF7, 0xA5, 0xF7, 0x9B, 0xF7, 0x9C, 0xF7, 0xA6, /* 0xD0-0xD3 */
+	0xF7, 0xA7, 0xF7, 0xA8, 0xB1, 0xEE, 0xF7, 0xA9, /* 0xD4-0xD7 */
+	0xF7, 0xAA, 0xF7, 0xAB, 0xF7, 0x9D, 0xF7, 0x9E, /* 0xD8-0xDB */
+	0xF7, 0xAC, 0xF7, 0xAD, 0xC1, 0xDB, 0xF7, 0xAE, /* 0xDC-0xDF */
+	0xF7, 0x9F, 0xF7, 0xA0, 0xF7, 0xAF, 0xF8, 0x40, /* 0xE0-0xE3 */
+	0xF8, 0x41, 0xF8, 0x42, 0xF8, 0x43, 0xF8, 0x44, /* 0xE4-0xE7 */
+	0xF8, 0x45, 0xF8, 0x46, 0xF8, 0x47, 0xF8, 0x48, /* 0xE8-0xEB */
+	0xF8, 0x49, 0xF8, 0x4A, 0xF8, 0x4B, 0xF8, 0x4C, /* 0xEC-0xEF */
+	0xF8, 0x4D, 0xF8, 0x4E, 0xF8, 0x4F, 0xF8, 0x50, /* 0xF0-0xF3 */
+	0xF8, 0x51, 0xF8, 0x52, 0xF8, 0x53, 0xF8, 0x54, /* 0xF4-0xF7 */
+	0xF8, 0x55, 0xF8, 0x56, 0xF8, 0x57, 0xF8, 0x58, /* 0xF8-0xFB */
+	0xF8, 0x59, 0xF8, 0x5A, 0xF8, 0x5B, 0xF8, 0x5C, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9D[512] = {
+	0xF8, 0x5D, 0xF8, 0x5E, 0xF8, 0x5F, 0xF8, 0x60, /* 0x00-0x03 */
+	0xF8, 0x61, 0xF8, 0x62, 0xF8, 0x63, 0xF8, 0x64, /* 0x04-0x07 */
+	0xF8, 0x65, 0xF8, 0x66, 0xF8, 0x67, 0xF8, 0x68, /* 0x08-0x0B */
+	0xF8, 0x69, 0xF8, 0x6A, 0xF8, 0x6B, 0xF8, 0x6C, /* 0x0C-0x0F */
+	0xF8, 0x6D, 0xF8, 0x6E, 0xF8, 0x6F, 0xF8, 0x70, /* 0x10-0x13 */
+	0xF8, 0x71, 0xF8, 0x72, 0xF8, 0x73, 0xF8, 0x74, /* 0x14-0x17 */
+	0xF8, 0x75, 0xF8, 0x76, 0xF8, 0x77, 0xF8, 0x78, /* 0x18-0x1B */
+	0xF8, 0x79, 0xF8, 0x7A, 0xF8, 0x7B, 0xF8, 0x7C, /* 0x1C-0x1F */
+	0xF8, 0x7D, 0xF8, 0x7E, 0xF8, 0x80, 0xF8, 0x81, /* 0x20-0x23 */
+	0xF8, 0x82, 0xF8, 0x83, 0xF8, 0x84, 0xF8, 0x85, /* 0x24-0x27 */
+	0xF8, 0x86, 0xF8, 0x87, 0xF8, 0x88, 0xF8, 0x89, /* 0x28-0x2B */
+	0xF8, 0x8A, 0xF8, 0x8B, 0xF8, 0x8C, 0xF8, 0x8D, /* 0x2C-0x2F */
+	0xF8, 0x8E, 0xF8, 0x8F, 0xF8, 0x90, 0xF8, 0x91, /* 0x30-0x33 */
+	0xF8, 0x92, 0xF8, 0x93, 0xF8, 0x94, 0xF8, 0x95, /* 0x34-0x37 */
+	0xF8, 0x96, 0xF8, 0x97, 0xF8, 0x98, 0xF8, 0x99, /* 0x38-0x3B */
+	0xF8, 0x9A, 0xF8, 0x9B, 0xF8, 0x9C, 0xF8, 0x9D, /* 0x3C-0x3F */
+	0xF8, 0x9E, 0xF8, 0x9F, 0xF8, 0xA0, 0xF9, 0x40, /* 0x40-0x43 */
+	0xF9, 0x41, 0xF9, 0x42, 0xF9, 0x43, 0xF9, 0x44, /* 0x44-0x47 */
+	0xF9, 0x45, 0xF9, 0x46, 0xF9, 0x47, 0xF9, 0x48, /* 0x48-0x4B */
+	0xF9, 0x49, 0xF9, 0x4A, 0xF9, 0x4B, 0xF9, 0x4C, /* 0x4C-0x4F */
+	0xF9, 0x4D, 0xF9, 0x4E, 0xF9, 0x4F, 0xF9, 0x50, /* 0x50-0x53 */
+	0xF9, 0x51, 0xF9, 0x52, 0xF9, 0x53, 0xF9, 0x54, /* 0x54-0x57 */
+	0xF9, 0x55, 0xF9, 0x56, 0xF9, 0x57, 0xF9, 0x58, /* 0x58-0x5B */
+	0xF9, 0x59, 0xF9, 0x5A, 0xF9, 0x5B, 0xF9, 0x5C, /* 0x5C-0x5F */
+	0xF9, 0x5D, 0xF9, 0x5E, 0xF9, 0x5F, 0xF9, 0x60, /* 0x60-0x63 */
+	0xF9, 0x61, 0xF9, 0x62, 0xF9, 0x63, 0xF9, 0x64, /* 0x64-0x67 */
+	0xF9, 0x65, 0xF9, 0x66, 0xF9, 0x67, 0xF9, 0x68, /* 0x68-0x6B */
+	0xF9, 0x69, 0xF9, 0x6A, 0xF9, 0x6B, 0xF9, 0x6C, /* 0x6C-0x6F */
+	0xF9, 0x6D, 0xF9, 0x6E, 0xF9, 0x6F, 0xF9, 0x70, /* 0x70-0x73 */
+	0xF9, 0x71, 0xF9, 0x72, 0xF9, 0x73, 0xF9, 0x74, /* 0x74-0x77 */
+	0xF9, 0x75, 0xF9, 0x76, 0xF9, 0x77, 0xF9, 0x78, /* 0x78-0x7B */
+	0xF9, 0x79, 0xF9, 0x7A, 0xF9, 0x7B, 0xF9, 0x7C, /* 0x7C-0x7F */
+	
+	0xF9, 0x7D, 0xF9, 0x7E, 0xF9, 0x80, 0xF9, 0x81, /* 0x80-0x83 */
+	0xF9, 0x82, 0xF9, 0x83, 0xF9, 0x84, 0xF9, 0x85, /* 0x84-0x87 */
+	0xF9, 0x86, 0xF9, 0x87, 0xF9, 0x88, 0xF9, 0x89, /* 0x88-0x8B */
+	0xF9, 0x8A, 0xF9, 0x8B, 0xF9, 0x8C, 0xF9, 0x8D, /* 0x8C-0x8F */
+	0xF9, 0x8E, 0xF9, 0x8F, 0xF9, 0x90, 0xF9, 0x91, /* 0x90-0x93 */
+	0xF9, 0x92, 0xF9, 0x93, 0xF9, 0x94, 0xF9, 0x95, /* 0x94-0x97 */
+	0xF9, 0x96, 0xF9, 0x97, 0xF9, 0x98, 0xF9, 0x99, /* 0x98-0x9B */
+	0xF9, 0x9A, 0xF9, 0x9B, 0xF9, 0x9C, 0xF9, 0x9D, /* 0x9C-0x9F */
+	0xF9, 0x9E, 0xF9, 0x9F, 0xF9, 0xA0, 0xFA, 0x40, /* 0xA0-0xA3 */
+	0xFA, 0x41, 0xFA, 0x42, 0xFA, 0x43, 0xFA, 0x44, /* 0xA4-0xA7 */
+	0xFA, 0x45, 0xFA, 0x46, 0xFA, 0x47, 0xFA, 0x48, /* 0xA8-0xAB */
+	0xFA, 0x49, 0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, /* 0xAC-0xAF */
+	0xFA, 0x4D, 0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, /* 0xB0-0xB3 */
+	0xFA, 0x51, 0xFA, 0x52, 0xFA, 0x53, 0xFA, 0x54, /* 0xB4-0xB7 */
+	0xFA, 0x55, 0xFA, 0x56, 0xFA, 0x57, 0xFA, 0x58, /* 0xB8-0xBB */
+	0xFA, 0x59, 0xFA, 0x5A, 0xFA, 0x5B, 0xFA, 0x5C, /* 0xBC-0xBF */
+	0xFA, 0x5D, 0xFA, 0x5E, 0xFA, 0x5F, 0xFA, 0x60, /* 0xC0-0xC3 */
+	0xFA, 0x61, 0xFA, 0x62, 0xFA, 0x63, 0xFA, 0x64, /* 0xC4-0xC7 */
+	0xFA, 0x65, 0xFA, 0x66, 0xFA, 0x67, 0xFA, 0x68, /* 0xC8-0xCB */
+	0xFA, 0x69, 0xFA, 0x6A, 0xFA, 0x6B, 0xFA, 0x6C, /* 0xCC-0xCF */
+	0xFA, 0x6D, 0xFA, 0x6E, 0xFA, 0x6F, 0xFA, 0x70, /* 0xD0-0xD3 */
+	0xFA, 0x71, 0xFA, 0x72, 0xFA, 0x73, 0xFA, 0x74, /* 0xD4-0xD7 */
+	0xFA, 0x75, 0xFA, 0x76, 0xFA, 0x77, 0xFA, 0x78, /* 0xD8-0xDB */
+	0xFA, 0x79, 0xFA, 0x7A, 0xFA, 0x7B, 0xFA, 0x7C, /* 0xDC-0xDF */
+	0xFA, 0x7D, 0xFA, 0x7E, 0xFA, 0x80, 0xFA, 0x81, /* 0xE0-0xE3 */
+	0xFA, 0x82, 0xFA, 0x83, 0xFA, 0x84, 0xFA, 0x85, /* 0xE4-0xE7 */
+	0xFA, 0x86, 0xFA, 0x87, 0xFA, 0x88, 0xFA, 0x89, /* 0xE8-0xEB */
+	0xFA, 0x8A, 0xFA, 0x8B, 0xFA, 0x8C, 0xFA, 0x8D, /* 0xEC-0xEF */
+	0xFA, 0x8E, 0xFA, 0x8F, 0xFA, 0x90, 0xFA, 0x91, /* 0xF0-0xF3 */
+	0xFA, 0x92, 0xFA, 0x93, 0xFA, 0x94, 0xFA, 0x95, /* 0xF4-0xF7 */
+	0xFA, 0x96, 0xFA, 0x97, 0xFA, 0x98, 0xFA, 0x99, /* 0xF8-0xFB */
+	0xFA, 0x9A, 0xFA, 0x9B, 0xFA, 0x9C, 0xFA, 0x9D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9E[512] = {
+	0xFA, 0x9E, 0xFA, 0x9F, 0xFA, 0xA0, 0xFB, 0x40, /* 0x00-0x03 */
+	0xFB, 0x41, 0xFB, 0x42, 0xFB, 0x43, 0xFB, 0x44, /* 0x04-0x07 */
+	0xFB, 0x45, 0xFB, 0x46, 0xFB, 0x47, 0xFB, 0x48, /* 0x08-0x0B */
+	0xFB, 0x49, 0xFB, 0x4A, 0xFB, 0x4B, 0xFB, 0x4C, /* 0x0C-0x0F */
+	0xFB, 0x4D, 0xFB, 0x4E, 0xFB, 0x4F, 0xFB, 0x50, /* 0x10-0x13 */
+	0xFB, 0x51, 0xFB, 0x52, 0xFB, 0x53, 0xFB, 0x54, /* 0x14-0x17 */
+	0xFB, 0x55, 0xFB, 0x56, 0xFB, 0x57, 0xFB, 0x58, /* 0x18-0x1B */
+	0xFB, 0x59, 0xFB, 0x5A, 0xFB, 0x5B, 0xC4, 0xF1, /* 0x1C-0x1F */
+	0xF0, 0xAF, 0xBC, 0xA6, 0xF0, 0xB0, 0xC3, 0xF9, /* 0x20-0x23 */
+	0xFB, 0x5C, 0xC5, 0xB8, 0xD1, 0xBB, 0xFB, 0x5D, /* 0x24-0x27 */
+	0xF0, 0xB1, 0xF0, 0xB2, 0xF0, 0xB3, 0xF0, 0xB4, /* 0x28-0x2B */
+	0xF0, 0xB5, 0xD1, 0xBC, 0xFB, 0x5E, 0xD1, 0xEC, /* 0x2C-0x2F */
+	0xFB, 0x5F, 0xF0, 0xB7, 0xF0, 0xB6, 0xD4, 0xA7, /* 0x30-0x33 */
+	0xFB, 0x60, 0xCD, 0xD2, 0xF0, 0xB8, 0xF0, 0xBA, /* 0x34-0x37 */
+	0xF0, 0xB9, 0xF0, 0xBB, 0xF0, 0xBC, 0xFB, 0x61, /* 0x38-0x3B */
+	0xFB, 0x62, 0xB8, 0xEB, 0xF0, 0xBD, 0xBA, 0xE8, /* 0x3C-0x3F */
+	0xFB, 0x63, 0xF0, 0xBE, 0xF0, 0xBF, 0xBE, 0xE9, /* 0x40-0x43 */
+	0xF0, 0xC0, 0xB6, 0xEC, 0xF0, 0xC1, 0xF0, 0xC2, /* 0x44-0x47 */
+	0xF0, 0xC3, 0xF0, 0xC4, 0xC8, 0xB5, 0xF0, 0xC5, /* 0x48-0x4B */
+	0xF0, 0xC6, 0xFB, 0x64, 0xF0, 0xC7, 0xC5, 0xF4, /* 0x4C-0x4F */
+	0xFB, 0x65, 0xF0, 0xC8, 0xFB, 0x66, 0xFB, 0x67, /* 0x50-0x53 */
+	0xFB, 0x68, 0xF0, 0xC9, 0xFB, 0x69, 0xF0, 0xCA, /* 0x54-0x57 */
+	0xF7, 0xBD, 0xFB, 0x6A, 0xF0, 0xCB, 0xF0, 0xCC, /* 0x58-0x5B */
+	0xF0, 0xCD, 0xFB, 0x6B, 0xF0, 0xCE, 0xFB, 0x6C, /* 0x5C-0x5F */
+	0xFB, 0x6D, 0xFB, 0x6E, 0xFB, 0x6F, 0xF0, 0xCF, /* 0x60-0x63 */
+	0xBA, 0xD7, 0xFB, 0x70, 0xF0, 0xD0, 0xF0, 0xD1, /* 0x64-0x67 */
+	0xF0, 0xD2, 0xF0, 0xD3, 0xF0, 0xD4, 0xF0, 0xD5, /* 0x68-0x6B */
+	0xF0, 0xD6, 0xF0, 0xD8, 0xFB, 0x71, 0xFB, 0x72, /* 0x6C-0x6F */
+	0xD3, 0xA5, 0xF0, 0xD7, 0xFB, 0x73, 0xF0, 0xD9, /* 0x70-0x73 */
+	0xFB, 0x74, 0xFB, 0x75, 0xFB, 0x76, 0xFB, 0x77, /* 0x74-0x77 */
+	0xFB, 0x78, 0xFB, 0x79, 0xFB, 0x7A, 0xFB, 0x7B, /* 0x78-0x7B */
+	0xFB, 0x7C, 0xFB, 0x7D, 0xF5, 0xBA, 0xC2, 0xB9, /* 0x7C-0x7F */
+	
+	0xFB, 0x7E, 0xFB, 0x80, 0xF7, 0xE4, 0xFB, 0x81, /* 0x80-0x83 */
+	0xFB, 0x82, 0xFB, 0x83, 0xFB, 0x84, 0xF7, 0xE5, /* 0x84-0x87 */
+	0xF7, 0xE6, 0xFB, 0x85, 0xFB, 0x86, 0xF7, 0xE7, /* 0x88-0x8B */
+	0xFB, 0x87, 0xFB, 0x88, 0xFB, 0x89, 0xFB, 0x8A, /* 0x8C-0x8F */
+	0xFB, 0x8B, 0xFB, 0x8C, 0xF7, 0xE8, 0xC2, 0xB4, /* 0x90-0x93 */
+	0xFB, 0x8D, 0xFB, 0x8E, 0xFB, 0x8F, 0xFB, 0x90, /* 0x94-0x97 */
+	0xFB, 0x91, 0xFB, 0x92, 0xFB, 0x93, 0xFB, 0x94, /* 0x98-0x9B */
+	0xFB, 0x95, 0xF7, 0xEA, 0xFB, 0x96, 0xF7, 0xEB, /* 0x9C-0x9F */
+	0xFB, 0x97, 0xFB, 0x98, 0xFB, 0x99, 0xFB, 0x9A, /* 0xA0-0xA3 */
+	0xFB, 0x9B, 0xFB, 0x9C, 0xC2, 0xF3, 0xFB, 0x9D, /* 0xA4-0xA7 */
+	0xFB, 0x9E, 0xFB, 0x9F, 0xFB, 0xA0, 0xFC, 0x40, /* 0xA8-0xAB */
+	0xFC, 0x41, 0xFC, 0x42, 0xFC, 0x43, 0xFC, 0x44, /* 0xAC-0xAF */
+	0xFC, 0x45, 0xFC, 0x46, 0xFC, 0x47, 0xFC, 0x48, /* 0xB0-0xB3 */
+	0xF4, 0xF0, 0xFC, 0x49, 0xFC, 0x4A, 0xFC, 0x4B, /* 0xB4-0xB7 */
+	0xF4, 0xEF, 0xFC, 0x4C, 0xFC, 0x4D, 0xC2, 0xE9, /* 0xB8-0xBB */
+	0xFC, 0x4E, 0xF7, 0xE1, 0xF7, 0xE2, 0xFC, 0x4F, /* 0xBC-0xBF */
+	0xFC, 0x50, 0xFC, 0x51, 0xFC, 0x52, 0xFC, 0x53, /* 0xC0-0xC3 */
+	0xBB, 0xC6, 0xFC, 0x54, 0xFC, 0x55, 0xFC, 0x56, /* 0xC4-0xC7 */
+	0xFC, 0x57, 0xD9, 0xE4, 0xFC, 0x58, 0xFC, 0x59, /* 0xC8-0xCB */
+	0xFC, 0x5A, 0xCA, 0xF2, 0xC0, 0xE8, 0xF0, 0xA4, /* 0xCC-0xCF */
+	0xFC, 0x5B, 0xBA, 0xDA, 0xFC, 0x5C, 0xFC, 0x5D, /* 0xD0-0xD3 */
+	0xC7, 0xAD, 0xFC, 0x5E, 0xFC, 0x5F, 0xFC, 0x60, /* 0xD4-0xD7 */
+	0xC4, 0xAC, 0xFC, 0x61, 0xFC, 0x62, 0xF7, 0xEC, /* 0xD8-0xDB */
+	0xF7, 0xED, 0xF7, 0xEE, 0xFC, 0x63, 0xF7, 0xF0, /* 0xDC-0xDF */
+	0xF7, 0xEF, 0xFC, 0x64, 0xF7, 0xF1, 0xFC, 0x65, /* 0xE0-0xE3 */
+	0xFC, 0x66, 0xF7, 0xF4, 0xFC, 0x67, 0xF7, 0xF3, /* 0xE4-0xE7 */
+	0xFC, 0x68, 0xF7, 0xF2, 0xF7, 0xF5, 0xFC, 0x69, /* 0xE8-0xEB */
+	0xFC, 0x6A, 0xFC, 0x6B, 0xFC, 0x6C, 0xF7, 0xF6, /* 0xEC-0xEF */
+	0xFC, 0x6D, 0xFC, 0x6E, 0xFC, 0x6F, 0xFC, 0x70, /* 0xF0-0xF3 */
+	0xFC, 0x71, 0xFC, 0x72, 0xFC, 0x73, 0xFC, 0x74, /* 0xF4-0xF7 */
+	0xFC, 0x75, 0xED, 0xE9, 0xFC, 0x76, 0xED, 0xEA, /* 0xF8-0xFB */
+	0xED, 0xEB, 0xFC, 0x77, 0xF6, 0xBC, 0xFC, 0x78, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9F[512] = {
+	0xFC, 0x79, 0xFC, 0x7A, 0xFC, 0x7B, 0xFC, 0x7C, /* 0x00-0x03 */
+	0xFC, 0x7D, 0xFC, 0x7E, 0xFC, 0x80, 0xFC, 0x81, /* 0x04-0x07 */
+	0xFC, 0x82, 0xFC, 0x83, 0xFC, 0x84, 0xF6, 0xBD, /* 0x08-0x0B */
+	0xFC, 0x85, 0xF6, 0xBE, 0xB6, 0xA6, 0xFC, 0x86, /* 0x0C-0x0F */
+	0xD8, 0xBE, 0xFC, 0x87, 0xFC, 0x88, 0xB9, 0xC4, /* 0x10-0x13 */
+	0xFC, 0x89, 0xFC, 0x8A, 0xFC, 0x8B, 0xD8, 0xBB, /* 0x14-0x17 */
+	0xFC, 0x8C, 0xDC, 0xB1, 0xFC, 0x8D, 0xFC, 0x8E, /* 0x18-0x1B */
+	0xFC, 0x8F, 0xFC, 0x90, 0xFC, 0x91, 0xFC, 0x92, /* 0x1C-0x1F */
+	0xCA, 0xF3, 0xFC, 0x93, 0xF7, 0xF7, 0xFC, 0x94, /* 0x20-0x23 */
+	0xFC, 0x95, 0xFC, 0x96, 0xFC, 0x97, 0xFC, 0x98, /* 0x24-0x27 */
+	0xFC, 0x99, 0xFC, 0x9A, 0xFC, 0x9B, 0xFC, 0x9C, /* 0x28-0x2B */
+	0xF7, 0xF8, 0xFC, 0x9D, 0xFC, 0x9E, 0xF7, 0xF9, /* 0x2C-0x2F */
+	0xFC, 0x9F, 0xFC, 0xA0, 0xFD, 0x40, 0xFD, 0x41, /* 0x30-0x33 */
+	0xFD, 0x42, 0xFD, 0x43, 0xFD, 0x44, 0xF7, 0xFB, /* 0x34-0x37 */
+	0xFD, 0x45, 0xF7, 0xFA, 0xFD, 0x46, 0xB1, 0xC7, /* 0x38-0x3B */
+	0xFD, 0x47, 0xF7, 0xFC, 0xF7, 0xFD, 0xFD, 0x48, /* 0x3C-0x3F */
+	0xFD, 0x49, 0xFD, 0x4A, 0xFD, 0x4B, 0xFD, 0x4C, /* 0x40-0x43 */
+	0xF7, 0xFE, 0xFD, 0x4D, 0xFD, 0x4E, 0xFD, 0x4F, /* 0x44-0x47 */
+	0xFD, 0x50, 0xFD, 0x51, 0xFD, 0x52, 0xFD, 0x53, /* 0x48-0x4B */
+	0xFD, 0x54, 0xFD, 0x55, 0xFD, 0x56, 0xFD, 0x57, /* 0x4C-0x4F */
+	0xC6, 0xEB, 0xEC, 0xB4, 0xFD, 0x58, 0xFD, 0x59, /* 0x50-0x53 */
+	0xFD, 0x5A, 0xFD, 0x5B, 0xFD, 0x5C, 0xFD, 0x5D, /* 0x54-0x57 */
+	0xFD, 0x5E, 0xFD, 0x5F, 0xFD, 0x60, 0xFD, 0x61, /* 0x58-0x5B */
+	0xFD, 0x62, 0xFD, 0x63, 0xFD, 0x64, 0xFD, 0x65, /* 0x5C-0x5F */
+	0xFD, 0x66, 0xFD, 0x67, 0xFD, 0x68, 0xFD, 0x69, /* 0x60-0x63 */
+	0xFD, 0x6A, 0xFD, 0x6B, 0xFD, 0x6C, 0xFD, 0x6D, /* 0x64-0x67 */
+	0xFD, 0x6E, 0xFD, 0x6F, 0xFD, 0x70, 0xFD, 0x71, /* 0x68-0x6B */
+	0xFD, 0x72, 0xFD, 0x73, 0xFD, 0x74, 0xFD, 0x75, /* 0x6C-0x6F */
+	0xFD, 0x76, 0xFD, 0x77, 0xFD, 0x78, 0xFD, 0x79, /* 0x70-0x73 */
+	0xFD, 0x7A, 0xFD, 0x7B, 0xFD, 0x7C, 0xFD, 0x7D, /* 0x74-0x77 */
+	0xFD, 0x7E, 0xFD, 0x80, 0xFD, 0x81, 0xFD, 0x82, /* 0x78-0x7B */
+	0xFD, 0x83, 0xFD, 0x84, 0xFD, 0x85, 0xB3, 0xDD, /* 0x7C-0x7F */
+	
+	0xF6, 0xB3, 0xFD, 0x86, 0xFD, 0x87, 0xF6, 0xB4, /* 0x80-0x83 */
+	0xC1, 0xE4, 0xF6, 0xB5, 0xF6, 0xB6, 0xF6, 0xB7, /* 0x84-0x87 */
+	0xF6, 0xB8, 0xF6, 0xB9, 0xF6, 0xBA, 0xC8, 0xA3, /* 0x88-0x8B */
+	0xF6, 0xBB, 0xFD, 0x88, 0xFD, 0x89, 0xFD, 0x8A, /* 0x8C-0x8F */
+	0xFD, 0x8B, 0xFD, 0x8C, 0xFD, 0x8D, 0xFD, 0x8E, /* 0x90-0x93 */
+	0xFD, 0x8F, 0xFD, 0x90, 0xFD, 0x91, 0xFD, 0x92, /* 0x94-0x97 */
+	0xFD, 0x93, 0xC1, 0xFA, 0xB9, 0xA8, 0xED, 0xE8, /* 0x98-0x9B */
+	0xFD, 0x94, 0xFD, 0x95, 0xFD, 0x96, 0xB9, 0xEA, /* 0x9C-0x9F */
+	0xD9, 0xDF, 0xFD, 0x97, 0xFD, 0x98, 0xFD, 0x99, /* 0xA0-0xA3 */
+	0xFD, 0x9A, 0xFD, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+};
+
+static const unsigned char u2c_DC[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+};
+
+static const unsigned char u2c_F9[512] = {
+	0xD8, 0x4D, 0xB8, 0xFC, 0xDC, 0x87, 0xD9, 0x5A, /* 0x00-0x03 */
+	0xBB, 0xAC, 0xB4, 0xAE, 0xBE, 0xE4, 0xFD, 0x94, /* 0x04-0x07 */
+	0xFD, 0x94, 0xC6, 0xF5, 0xBD, 0xF0, 0xC0, 0xAE, /* 0x08-0x0B */
+	0xC4, 0xCE, 0x91, 0xD0, 0xB0, 0x5D, 0xC1, 0x5F, /* 0x0C-0x0F */
+	0xCC, 0x7D, 0xC2, 0xDD, 0xC2, 0xE3, 0xDF, 0x89, /* 0x10-0x13 */
+	0x98, 0xB7, 0xC2, 0xE5, 0xC0, 0xD3, 0xE7, 0xF3, /* 0x14-0x17 */
+	0xC2, 0xE4, 0xC0, 0xD2, 0xF1, 0x98, 0x81, 0x79, /* 0x18-0x1B */
+	0xC2, 0xD1, 0x99, 0xDA, 0xA0, 0x80, 0xCC, 0x6D, /* 0x1C-0x1F */
+	0xFB, 0x5B, 0x8D, 0xB9, 0x9E, 0x45, 0xCB, 0x7B, /* 0x20-0x23 */
+	0xD2, 0x68, 0xC0, 0xAD, 0xC5, 0x44, 0xCF, 0x9E, /* 0x24-0x27 */
+	0xC0, 0xC8, 0xC0, 0xCA, 0xC0, 0xCB, 0xC0, 0xC7, /* 0x28-0x2B */
+	0xFD, 0x9C, 0x81, 0xED, 0xC0, 0xE4, 0x84, 0xDA, /* 0x2C-0x2F */
+	0x93, 0xEF, 0x99, 0xA9, 0xA0, 0x74, 0xB1, 0x52, /* 0x30-0x33 */
+	0xC0, 0xCF, 0xCC, 0x4A, 0xCC, 0x94, 0xC2, 0xB7, /* 0x34-0x37 */
+	0xC2, 0xB6, 0xF4, 0x94, 0xFA, 0x98, 0xC2, 0xB5, /* 0x38-0x3B */
+	0xB5, 0x93, 0xBE, 0x47, 0xC7, 0x8A, 0xE4, 0x9B, /* 0x3C-0x3F */
+	0xC2, 0xB9, 0xD5, 0x93, 0x89, 0xC5, 0xC5, 0xAA, /* 0x40-0x43 */
+	0xBB, 0x5C, 0xC3, 0x40, 0xC0, 0xCE, 0xC0, 0xDA, /* 0x44-0x47 */
+	0xD9, 0x54, 0xC0, 0xD7, 0x89, 0xBE, 0x8C, 0xD2, /* 0x48-0x4B */
+	0x98, 0xC7, 0x9C, 0x49, 0xC2, 0xA9, 0xC0, 0xDB, /* 0x4C-0x4F */
+	0xBF, 0x7C, 0xC2, 0xAA, 0xC0, 0xD5, 0xC0, 0xDF, /* 0x50-0x53 */
+	0x84, 0x43, 0xC1, 0xE8, 0xB6, 0xA0, 0xBE, 0x63, /* 0x54-0x57 */
+	0xC1, 0xE2, 0xC1, 0xEA, 0xD7, 0x78, 0x92, 0x82, /* 0x58-0x5B */
+	0x98, 0xB7, 0xD6, 0x5A, 0xB5, 0xA4, 0x8C, 0x8E, /* 0x5C-0x5F */
+	0xC5, 0xAD, 0xC2, 0xCA, 0xAE, 0x90, 0xB1, 0xB1, /* 0x60-0x63 */
+	0xB4, 0x91, 0xB1, 0xE3, 0x8F, 0xCD, 0xB2, 0xBB, /* 0x64-0x67 */
+	0xC3, 0xDA, 0x94, 0xB5, 0xCB, 0xF7, 0x85, 0xA2, /* 0x68-0x6B */
+	0xC8, 0xFB, 0xCA, 0xA1, 0xC8, 0x7E, 0xD5, 0x66, /* 0x6C-0x6F */
+	0x9A, 0xA2, 0xB3, 0xBD, 0xC9, 0xF2, 0xCA, 0xB0, /* 0x70-0x73 */
+	0xC8, 0xF4, 0xC2, 0xD3, 0xC2, 0xD4, 0xC1, 0xC1, /* 0x74-0x77 */
+	0x83, 0xC9, 0xFD, 0x9D, 0xC1, 0xBA, 0xBC, 0x5A, /* 0x78-0x7B */
+	0xC1, 0xBC, 0xD5, 0x8F, 0xC1, 0xBF, 0x84, 0xEE, /* 0x7C-0x7F */
+	
+	0x85, 0xCE, 0xC5, 0xAE, 0x8F, 0x5D, 0xC2, 0xC3, /* 0x80-0x83 */
+	0x9E, 0x56, 0xB5, 0x5A, 0xE9, 0x82, 0xF3, 0x50, /* 0x84-0x87 */
+	0xFB, 0x90, 0xC0, 0xE8, 0xC1, 0xA6, 0x95, 0xD1, /* 0x88-0x8B */
+	0x9A, 0x76, 0xDE, 0x5D, 0xC4, 0xEA, 0x91, 0x7A, /* 0x8C-0x8F */
+	0x91, 0xD9, 0x93, 0xD3, 0x9D, 0x69, 0x9F, 0x92, /* 0x90-0x93 */
+	0xAD, 0x49, 0xFD, 0x9E, 0xBE, 0x9A, 0xC2, 0x93, /* 0x94-0x97 */
+	0xDD, 0x82, 0xC9, 0x8F, 0xDF, 0x42, 0xE5, 0x80, /* 0x98-0x9B */
+	0xC1, 0xD0, 0xC1, 0xD3, 0xD1, 0xCA, 0xC1, 0xD2, /* 0x9C-0x9F */
+	0xC1, 0xD1, 0xD5, 0x66, 0xC1, 0xAE, 0xC4, 0xEE, /* 0xA0-0xA3 */
+	0xC4, 0xED, 0x9A, 0x9A, 0xBA, 0x9F, 0xAB, 0x43, /* 0xA4-0xA7 */
+	0xC1, 0xEE, 0xE0, 0xF2, 0x8C, 0x8E, 0x8E, 0x58, /* 0xA8-0xAB */
+	0xC1, 0xAF, 0xC1, 0xE1, 0xAC, 0x93, 0xC1, 0xE7, /* 0xAC-0xAF */
+	0xF1, 0xF6, 0xE2, 0x8F, 0xC1, 0xE3, 0xEC, 0x60, /* 0xB0-0xB3 */
+	0xEE, 0x49, 0xC0, 0xFD, 0xB6, 0x59, 0xF5, 0xB7, /* 0xB4-0xB7 */
+	0xEB, 0x60, 0x90, 0xBA, 0xC1, 0xCB, 0xC1, 0xC5, /* 0xB8-0xBB */
+	0xE5, 0xBC, 0xC4, 0xF2, 0xC1, 0xCF, 0x98, 0xB7, /* 0xBC-0xBF */
+	0xC1, 0xC7, 0xAF, 0x9F, 0xDE, 0xA4, 0xDF, 0x7C, /* 0xC0-0xC3 */
+	0xFD, 0x88, 0x95, 0x9E, 0xC8, 0xEE, 0x84, 0xA2, /* 0xC4-0xC7 */
+	0x96, 0x83, 0xC1, 0xF8, 0xC1, 0xF7, 0xC1, 0xEF, /* 0xC8-0xCB */
+	0xC1, 0xF0, 0xC1, 0xF4, 0xC1, 0xF2, 0xBC, 0x7E, /* 0xCC-0xCF */
+	0xEE, 0x90, 0xC1, 0xF9, 0xC2, 0xBE, 0xEA, 0x91, /* 0xD0-0xD3 */
+	0x82, 0x90, 0x8D, 0x91, 0x9C, 0x53, 0xDD, 0x86, /* 0xD4-0xD7 */
+	0xC2, 0xC9, 0x90, 0xFC, 0xC0, 0xF5, 0xC2, 0xCA, /* 0xD8-0xDB */
+	0xC2, 0xA1, 0xC0, 0xFB, 0xC0, 0xF4, 0xC2, 0xC4, /* 0xDC-0xDF */
+	0xD2, 0xD7, 0xC0, 0xEE, 0xC0, 0xE6, 0xC4, 0xE0, /* 0xE0-0xE3 */
+	0xC0, 0xED, 0xC1, 0xA1, 0xEE, 0xBE, 0xFD, 0x9F, /* 0xE4-0xE7 */
+	0xD1, 0x65, 0xC0, 0xEF, 0xEB, 0x78, 0xC4, 0xE4, /* 0xE8-0xEB */
+	0xC4, 0xE7, 0xC1, 0xDF, 0x9F, 0xFB, 0xAD, 0x55, /* 0xEC-0xEF */
+	0xCC, 0x41, 0xFD, 0xA0, 0xF7, 0x5B, 0xF7, 0xEB, /* 0xF0-0xF3 */
+	0xC1, 0xD6, 0xC1, 0xDC, 0xC5, 0x52, 0xC1, 0xA2, /* 0xF4-0xF7 */
+	0xF3, 0xD2, 0xC1, 0xA3, 0xA0, 0xEE, 0xD6, 0xCB, /* 0xF8-0xFB */
+	0xD7, 0x52, 0xCA, 0xB2, 0xB2, 0xE8, 0xB4, 0xCC, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_FA[512] = {
+	0xC7, 0xD0, 0xB6, 0xC8, 0xCD, 0xD8, 0xCC, 0xC7, /* 0x00-0x03 */
+	0xD5, 0xAC, 0xB6, 0xB4, 0xB1, 0xA9, 0xDD, 0x97, /* 0x04-0x07 */
+	0xD0, 0xD0, 0xBD, 0xB5, 0xD2, 0x8A, 0xC0, 0xAA, /* 0x08-0x0B */
+	0xFE, 0x40, 0xFE, 0x41, 0xFE, 0x42, 0xFE, 0x43, /* 0x0C-0x0F */
+	0x89, 0x56, 0xFE, 0x44, 0xC7, 0xE7, 0xFE, 0x45, /* 0x10-0x13 */
+	0xFE, 0x46, 0x84, 0x44, 0xD8, 0x69, 0xD2, 0xE6, /* 0x14-0x17 */
+	0xFE, 0x47, 0xC9, 0xF1, 0xCF, 0xE9, 0xB8, 0xA3, /* 0x18-0x1B */
+	0xBE, 0xB8, 0xBE, 0xAB, 0xD3, 0xF0, 0xFE, 0x48, /* 0x1C-0x1F */
+	0xFE, 0x49, 0xFE, 0x4A, 0xD6, 0x54, 0xFE, 0x4B, /* 0x20-0x23 */
+	0xFE, 0x4C, 0xD2, 0xDD, 0xB6, 0xBC, 0xFE, 0x4D, /* 0x24-0x27 */
+	0xFE, 0x4E, 0xFE, 0x4F, 0xEF, 0x88, 0xEF, 0x95, /* 0x28-0x2B */
+	0xF0, 0x5E, 0xFA, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+};
+
+static const unsigned char u2c_FE[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xA9, 0x55, 0xA6, 0xF2, 0x00, 0x00, 0xA6, 0xF4, /* 0x30-0x33 */
+	0xA6, 0xF5, 0xA6, 0xE0, 0xA6, 0xE1, 0xA6, 0xF0, /* 0x34-0x37 */
+	0xA6, 0xF1, 0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xEE, /* 0x38-0x3B */
+	0xA6, 0xEF, 0xA6, 0xE6, 0xA6, 0xE7, 0xA6, 0xE4, /* 0x3C-0x3F */
+	0xA6, 0xE5, 0xA6, 0xE8, 0xA6, 0xE9, 0xA6, 0xEA, /* 0x40-0x43 */
+	0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xA9, 0x68, 0xA9, 0x69, 0xA9, 0x6A, /* 0x48-0x4B */
+	0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, 0xA9, 0x6E, /* 0x4C-0x4F */
+	0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, 0x00, 0x00, /* 0x50-0x53 */
+	0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0x54-0x57 */
+	0x00, 0x00, 0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, /* 0x58-0x5B */
+	0xA9, 0x79, 0xA9, 0x7A, 0xA9, 0x7B, 0xA9, 0x7C, /* 0x5C-0x5F */
+	0xA9, 0x7D, 0xA9, 0x7E, 0xA9, 0x80, 0xA9, 0x81, /* 0x60-0x63 */
+	0xA9, 0x82, 0xA9, 0x83, 0xA9, 0x84, 0x00, 0x00, /* 0x64-0x67 */
+	0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, 0xA9, 0x88, /* 0x68-0x6B */
+};
+
+static const unsigned char u2c_FF[512] = {
+	0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */
+	0xA1, 0xE7, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */
+	0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */
+	0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, /* 0x0C-0x0F */
+	0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */
+	0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */
+	0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */
+	0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */
+	0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */
+	0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */
+	0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */
+	0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */
+	0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */
+	0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */
+	0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */
+	0xA3, 0xDC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */
+	0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */
+	0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */
+	0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */
+	0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */
+	0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */
+	0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */
+	0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */
+	0xA3, 0xFC, 0xA3, 0xFD, 0xA1, 0xAB, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xA1, 0xE9, 0xA1, 0xEA, 0xA9, 0x56, 0xA3, 0xFE, /* 0xE0-0xE3 */
+	0xA9, 0x57, 0xA3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	u2c_00, u2c_01, u2c_02, u2c_03, u2c_04, NULL,   NULL,   NULL,
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	u2c_30, u2c_31, u2c_32, u2c_33, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   u2c_4E, u2c_4F, 
+	u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, 
+	u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, 
+	u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, 
+	u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, 
+	u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, 
+	u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, 
+	u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, 
+	u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, 
+	u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, 
+	u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, 
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   u2c_DC, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   u2c_F9, u2c_FA, NULL,   NULL,   NULL,   u2c_FE, u2c_FF, };
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(const wchar_t uni,
+			unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni&0xFF;
+	unsigned char ch = (uni>>8)&0xFF;
+	unsigned char out0,out1;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (uni == 0x20ac) {/* Euro symbol.The only exception with a non-ascii unicode */
+		out[0] = 0x80;
+		return 1;
+	}
+
+	if (ch == 0) { /* handle the U00 plane*/
+		/* if (cl == 0) return -EINVAL;*/ /*U0000 is legal in cp936*/
+		out0 = u2c_00[cl*2];
+		out1 = u2c_00[cl*2+1];
+		if (out0 == 0x00 && out1 == 0x00) {
+			if (cl<0x80) {
+				out[0] = cl;
+				return 1;
+			}
+			return -EINVAL;
+		} else {
+			if (boundlen <= 1)
+				return -ENAMETOOLONG;
+			out[0] = out0;
+			out[1] = out1;
+			return 2;
+		}
+	}
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset) {
+		if (boundlen <= 1)
+			return -ENAMETOOLONG;
+		out[0] = uni2charset[cl*2];
+		out[1] = uni2charset[cl*2+1];
+		if (out[0] == 0x00 && out[1] == 0x00)
+			return -EINVAL;
+		return 2;
+	}
+	else
+		return -EINVAL;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen,
+			wchar_t *uni)
+{
+	unsigned char ch, cl;
+	const wchar_t *charset2uni;
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (boundlen == 1) {
+		if (rawstring[0]==0x80) { /* Euro symbol.The only exception with a non-ascii unicode */
+			*uni = 0x20ac;
+		} else {
+			*uni = rawstring[0];
+		}
+		return 1;
+	}
+
+	ch = rawstring[0];
+	cl = rawstring[1];
+
+	charset2uni = page_charset2uni[ch];
+	if (charset2uni && cl) {
+		*uni = charset2uni[cl];
+		if (*uni == 0x0000)
+			return -EINVAL;
+		n = 2;
+	} else{
+		if (ch==0x80) {/* Euro symbol.The only exception with a non-ascii unicode */
+			*uni = 0x20ac;
+		} else {
+			*uni = ch;
+		}
+		n = 1;
+	}
+	return n;
+}
+
+static struct nls_table table = {
+	.charset	= "cp936",
+	.alias		= "gb2312",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp936(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp936(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp936)
+module_exit(exit_nls_cp936)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NLS(gb2312);
diff --git a/kernel/fs/nls/nls_cp949.c b/kernel/fs/nls/nls_cp949.c
new file mode 100644
index 000000000..199171e97
--- /dev/null
+++ b/kernel/fs/nls/nls_cp949.c
@@ -0,0 +1,13946 @@
+/*
+ * linux/fs/nls/nls_cp949.c
+ *
+ * Charset cp949 translation tables.
+ * This translation table was generated automatically, the
+ * original table can be download from the Microsoft website.
+ * (http://www.microsoft.com/typography/unicode/unicodecp.htm)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t c2u_81[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xAC02,0xAC03,0xAC05,0xAC06,0xAC0B,0xAC0C,0xAC0D,/* 0x40-0x47 */
+	0xAC0E,0xAC0F,0xAC18,0xAC1E,0xAC1F,0xAC21,0xAC22,0xAC23,/* 0x48-0x4F */
+	0xAC25,0xAC26,0xAC27,0xAC28,0xAC29,0xAC2A,0xAC2B,0xAC2E,/* 0x50-0x57 */
+	0xAC32,0xAC33,0xAC34,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xAC35,0xAC36,0xAC37,0xAC3A,0xAC3B,0xAC3D,0xAC3E,/* 0x60-0x67 */
+	0xAC3F,0xAC41,0xAC42,0xAC43,0xAC44,0xAC45,0xAC46,0xAC47,/* 0x68-0x6F */
+	0xAC48,0xAC49,0xAC4A,0xAC4C,0xAC4E,0xAC4F,0xAC50,0xAC51,/* 0x70-0x77 */
+	0xAC52,0xAC53,0xAC55,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xAC56,0xAC57,0xAC59,0xAC5A,0xAC5B,0xAC5D,0xAC5E,/* 0x80-0x87 */
+	0xAC5F,0xAC60,0xAC61,0xAC62,0xAC63,0xAC64,0xAC65,0xAC66,/* 0x88-0x8F */
+	0xAC67,0xAC68,0xAC69,0xAC6A,0xAC6B,0xAC6C,0xAC6D,0xAC6E,/* 0x90-0x97 */
+	0xAC6F,0xAC72,0xAC73,0xAC75,0xAC76,0xAC79,0xAC7B,0xAC7C,/* 0x98-0x9F */
+	0xAC7D,0xAC7E,0xAC7F,0xAC82,0xAC87,0xAC88,0xAC8D,0xAC8E,/* 0xA0-0xA7 */
+	0xAC8F,0xAC91,0xAC92,0xAC93,0xAC95,0xAC96,0xAC97,0xAC98,/* 0xA8-0xAF */
+	0xAC99,0xAC9A,0xAC9B,0xAC9E,0xACA2,0xACA3,0xACA4,0xACA5,/* 0xB0-0xB7 */
+	0xACA6,0xACA7,0xACAB,0xACAD,0xACAE,0xACB1,0xACB2,0xACB3,/* 0xB8-0xBF */
+	0xACB4,0xACB5,0xACB6,0xACB7,0xACBA,0xACBE,0xACBF,0xACC0,/* 0xC0-0xC7 */
+	0xACC2,0xACC3,0xACC5,0xACC6,0xACC7,0xACC9,0xACCA,0xACCB,/* 0xC8-0xCF */
+	0xACCD,0xACCE,0xACCF,0xACD0,0xACD1,0xACD2,0xACD3,0xACD4,/* 0xD0-0xD7 */
+	0xACD6,0xACD8,0xACD9,0xACDA,0xACDB,0xACDC,0xACDD,0xACDE,/* 0xD8-0xDF */
+	0xACDF,0xACE2,0xACE3,0xACE5,0xACE6,0xACE9,0xACEB,0xACED,/* 0xE0-0xE7 */
+	0xACEE,0xACF2,0xACF4,0xACF7,0xACF8,0xACF9,0xACFA,0xACFB,/* 0xE8-0xEF */
+	0xACFE,0xACFF,0xAD01,0xAD02,0xAD03,0xAD05,0xAD07,0xAD08,/* 0xF0-0xF7 */
+	0xAD09,0xAD0A,0xAD0B,0xAD0E,0xAD10,0xAD12,0xAD13,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_82[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xAD14,0xAD15,0xAD16,0xAD17,0xAD19,0xAD1A,0xAD1B,/* 0x40-0x47 */
+	0xAD1D,0xAD1E,0xAD1F,0xAD21,0xAD22,0xAD23,0xAD24,0xAD25,/* 0x48-0x4F */
+	0xAD26,0xAD27,0xAD28,0xAD2A,0xAD2B,0xAD2E,0xAD2F,0xAD30,/* 0x50-0x57 */
+	0xAD31,0xAD32,0xAD33,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xAD36,0xAD37,0xAD39,0xAD3A,0xAD3B,0xAD3D,0xAD3E,/* 0x60-0x67 */
+	0xAD3F,0xAD40,0xAD41,0xAD42,0xAD43,0xAD46,0xAD48,0xAD4A,/* 0x68-0x6F */
+	0xAD4B,0xAD4C,0xAD4D,0xAD4E,0xAD4F,0xAD51,0xAD52,0xAD53,/* 0x70-0x77 */
+	0xAD55,0xAD56,0xAD57,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xAD59,0xAD5A,0xAD5B,0xAD5C,0xAD5D,0xAD5E,0xAD5F,/* 0x80-0x87 */
+	0xAD60,0xAD62,0xAD64,0xAD65,0xAD66,0xAD67,0xAD68,0xAD69,/* 0x88-0x8F */
+	0xAD6A,0xAD6B,0xAD6E,0xAD6F,0xAD71,0xAD72,0xAD77,0xAD78,/* 0x90-0x97 */
+	0xAD79,0xAD7A,0xAD7E,0xAD80,0xAD83,0xAD84,0xAD85,0xAD86,/* 0x98-0x9F */
+	0xAD87,0xAD8A,0xAD8B,0xAD8D,0xAD8E,0xAD8F,0xAD91,0xAD92,/* 0xA0-0xA7 */
+	0xAD93,0xAD94,0xAD95,0xAD96,0xAD97,0xAD98,0xAD99,0xAD9A,/* 0xA8-0xAF */
+	0xAD9B,0xAD9E,0xAD9F,0xADA0,0xADA1,0xADA2,0xADA3,0xADA5,/* 0xB0-0xB7 */
+	0xADA6,0xADA7,0xADA8,0xADA9,0xADAA,0xADAB,0xADAC,0xADAD,/* 0xB8-0xBF */
+	0xADAE,0xADAF,0xADB0,0xADB1,0xADB2,0xADB3,0xADB4,0xADB5,/* 0xC0-0xC7 */
+	0xADB6,0xADB8,0xADB9,0xADBA,0xADBB,0xADBC,0xADBD,0xADBE,/* 0xC8-0xCF */
+	0xADBF,0xADC2,0xADC3,0xADC5,0xADC6,0xADC7,0xADC9,0xADCA,/* 0xD0-0xD7 */
+	0xADCB,0xADCC,0xADCD,0xADCE,0xADCF,0xADD2,0xADD4,0xADD5,/* 0xD8-0xDF */
+	0xADD6,0xADD7,0xADD8,0xADD9,0xADDA,0xADDB,0xADDD,0xADDE,/* 0xE0-0xE7 */
+	0xADDF,0xADE1,0xADE2,0xADE3,0xADE5,0xADE6,0xADE7,0xADE8,/* 0xE8-0xEF */
+	0xADE9,0xADEA,0xADEB,0xADEC,0xADED,0xADEE,0xADEF,0xADF0,/* 0xF0-0xF7 */
+	0xADF1,0xADF2,0xADF3,0xADF4,0xADF5,0xADF6,0xADF7,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_83[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xADFA,0xADFB,0xADFD,0xADFE,0xAE02,0xAE03,0xAE04,/* 0x40-0x47 */
+	0xAE05,0xAE06,0xAE07,0xAE0A,0xAE0C,0xAE0E,0xAE0F,0xAE10,/* 0x48-0x4F */
+	0xAE11,0xAE12,0xAE13,0xAE15,0xAE16,0xAE17,0xAE18,0xAE19,/* 0x50-0x57 */
+	0xAE1A,0xAE1B,0xAE1C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xAE1D,0xAE1E,0xAE1F,0xAE20,0xAE21,0xAE22,0xAE23,/* 0x60-0x67 */
+	0xAE24,0xAE25,0xAE26,0xAE27,0xAE28,0xAE29,0xAE2A,0xAE2B,/* 0x68-0x6F */
+	0xAE2C,0xAE2D,0xAE2E,0xAE2F,0xAE32,0xAE33,0xAE35,0xAE36,/* 0x70-0x77 */
+	0xAE39,0xAE3B,0xAE3C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xAE3D,0xAE3E,0xAE3F,0xAE42,0xAE44,0xAE47,0xAE48,/* 0x80-0x87 */
+	0xAE49,0xAE4B,0xAE4F,0xAE51,0xAE52,0xAE53,0xAE55,0xAE57,/* 0x88-0x8F */
+	0xAE58,0xAE59,0xAE5A,0xAE5B,0xAE5E,0xAE62,0xAE63,0xAE64,/* 0x90-0x97 */
+	0xAE66,0xAE67,0xAE6A,0xAE6B,0xAE6D,0xAE6E,0xAE6F,0xAE71,/* 0x98-0x9F */
+	0xAE72,0xAE73,0xAE74,0xAE75,0xAE76,0xAE77,0xAE7A,0xAE7E,/* 0xA0-0xA7 */
+	0xAE7F,0xAE80,0xAE81,0xAE82,0xAE83,0xAE86,0xAE87,0xAE88,/* 0xA8-0xAF */
+	0xAE89,0xAE8A,0xAE8B,0xAE8D,0xAE8E,0xAE8F,0xAE90,0xAE91,/* 0xB0-0xB7 */
+	0xAE92,0xAE93,0xAE94,0xAE95,0xAE96,0xAE97,0xAE98,0xAE99,/* 0xB8-0xBF */
+	0xAE9A,0xAE9B,0xAE9C,0xAE9D,0xAE9E,0xAE9F,0xAEA0,0xAEA1,/* 0xC0-0xC7 */
+	0xAEA2,0xAEA3,0xAEA4,0xAEA5,0xAEA6,0xAEA7,0xAEA8,0xAEA9,/* 0xC8-0xCF */
+	0xAEAA,0xAEAB,0xAEAC,0xAEAD,0xAEAE,0xAEAF,0xAEB0,0xAEB1,/* 0xD0-0xD7 */
+	0xAEB2,0xAEB3,0xAEB4,0xAEB5,0xAEB6,0xAEB7,0xAEB8,0xAEB9,/* 0xD8-0xDF */
+	0xAEBA,0xAEBB,0xAEBF,0xAEC1,0xAEC2,0xAEC3,0xAEC5,0xAEC6,/* 0xE0-0xE7 */
+	0xAEC7,0xAEC8,0xAEC9,0xAECA,0xAECB,0xAECE,0xAED2,0xAED3,/* 0xE8-0xEF */
+	0xAED4,0xAED5,0xAED6,0xAED7,0xAEDA,0xAEDB,0xAEDD,0xAEDE,/* 0xF0-0xF7 */
+	0xAEDF,0xAEE0,0xAEE1,0xAEE2,0xAEE3,0xAEE4,0xAEE5,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_84[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xAEE6,0xAEE7,0xAEE9,0xAEEA,0xAEEC,0xAEEE,0xAEEF,/* 0x40-0x47 */
+	0xAEF0,0xAEF1,0xAEF2,0xAEF3,0xAEF5,0xAEF6,0xAEF7,0xAEF9,/* 0x48-0x4F */
+	0xAEFA,0xAEFB,0xAEFD,0xAEFE,0xAEFF,0xAF00,0xAF01,0xAF02,/* 0x50-0x57 */
+	0xAF03,0xAF04,0xAF05,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xAF06,0xAF09,0xAF0A,0xAF0B,0xAF0C,0xAF0E,0xAF0F,/* 0x60-0x67 */
+	0xAF11,0xAF12,0xAF13,0xAF14,0xAF15,0xAF16,0xAF17,0xAF18,/* 0x68-0x6F */
+	0xAF19,0xAF1A,0xAF1B,0xAF1C,0xAF1D,0xAF1E,0xAF1F,0xAF20,/* 0x70-0x77 */
+	0xAF21,0xAF22,0xAF23,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xAF24,0xAF25,0xAF26,0xAF27,0xAF28,0xAF29,0xAF2A,/* 0x80-0x87 */
+	0xAF2B,0xAF2E,0xAF2F,0xAF31,0xAF33,0xAF35,0xAF36,0xAF37,/* 0x88-0x8F */
+	0xAF38,0xAF39,0xAF3A,0xAF3B,0xAF3E,0xAF40,0xAF44,0xAF45,/* 0x90-0x97 */
+	0xAF46,0xAF47,0xAF4A,0xAF4B,0xAF4C,0xAF4D,0xAF4E,0xAF4F,/* 0x98-0x9F */
+	0xAF51,0xAF52,0xAF53,0xAF54,0xAF55,0xAF56,0xAF57,0xAF58,/* 0xA0-0xA7 */
+	0xAF59,0xAF5A,0xAF5B,0xAF5E,0xAF5F,0xAF60,0xAF61,0xAF62,/* 0xA8-0xAF */
+	0xAF63,0xAF66,0xAF67,0xAF68,0xAF69,0xAF6A,0xAF6B,0xAF6C,/* 0xB0-0xB7 */
+	0xAF6D,0xAF6E,0xAF6F,0xAF70,0xAF71,0xAF72,0xAF73,0xAF74,/* 0xB8-0xBF */
+	0xAF75,0xAF76,0xAF77,0xAF78,0xAF7A,0xAF7B,0xAF7C,0xAF7D,/* 0xC0-0xC7 */
+	0xAF7E,0xAF7F,0xAF81,0xAF82,0xAF83,0xAF85,0xAF86,0xAF87,/* 0xC8-0xCF */
+	0xAF89,0xAF8A,0xAF8B,0xAF8C,0xAF8D,0xAF8E,0xAF8F,0xAF92,/* 0xD0-0xD7 */
+	0xAF93,0xAF94,0xAF96,0xAF97,0xAF98,0xAF99,0xAF9A,0xAF9B,/* 0xD8-0xDF */
+	0xAF9D,0xAF9E,0xAF9F,0xAFA0,0xAFA1,0xAFA2,0xAFA3,0xAFA4,/* 0xE0-0xE7 */
+	0xAFA5,0xAFA6,0xAFA7,0xAFA8,0xAFA9,0xAFAA,0xAFAB,0xAFAC,/* 0xE8-0xEF */
+	0xAFAD,0xAFAE,0xAFAF,0xAFB0,0xAFB1,0xAFB2,0xAFB3,0xAFB4,/* 0xF0-0xF7 */
+	0xAFB5,0xAFB6,0xAFB7,0xAFBA,0xAFBB,0xAFBD,0xAFBE,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_85[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xAFBF,0xAFC1,0xAFC2,0xAFC3,0xAFC4,0xAFC5,0xAFC6,/* 0x40-0x47 */
+	0xAFCA,0xAFCC,0xAFCF,0xAFD0,0xAFD1,0xAFD2,0xAFD3,0xAFD5,/* 0x48-0x4F */
+	0xAFD6,0xAFD7,0xAFD8,0xAFD9,0xAFDA,0xAFDB,0xAFDD,0xAFDE,/* 0x50-0x57 */
+	0xAFDF,0xAFE0,0xAFE1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xAFE2,0xAFE3,0xAFE4,0xAFE5,0xAFE6,0xAFE7,0xAFEA,/* 0x60-0x67 */
+	0xAFEB,0xAFEC,0xAFED,0xAFEE,0xAFEF,0xAFF2,0xAFF3,0xAFF5,/* 0x68-0x6F */
+	0xAFF6,0xAFF7,0xAFF9,0xAFFA,0xAFFB,0xAFFC,0xAFFD,0xAFFE,/* 0x70-0x77 */
+	0xAFFF,0xB002,0xB003,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB005,0xB006,0xB007,0xB008,0xB009,0xB00A,0xB00B,/* 0x80-0x87 */
+	0xB00D,0xB00E,0xB00F,0xB011,0xB012,0xB013,0xB015,0xB016,/* 0x88-0x8F */
+	0xB017,0xB018,0xB019,0xB01A,0xB01B,0xB01E,0xB01F,0xB020,/* 0x90-0x97 */
+	0xB021,0xB022,0xB023,0xB024,0xB025,0xB026,0xB027,0xB029,/* 0x98-0x9F */
+	0xB02A,0xB02B,0xB02C,0xB02D,0xB02E,0xB02F,0xB030,0xB031,/* 0xA0-0xA7 */
+	0xB032,0xB033,0xB034,0xB035,0xB036,0xB037,0xB038,0xB039,/* 0xA8-0xAF */
+	0xB03A,0xB03B,0xB03C,0xB03D,0xB03E,0xB03F,0xB040,0xB041,/* 0xB0-0xB7 */
+	0xB042,0xB043,0xB046,0xB047,0xB049,0xB04B,0xB04D,0xB04F,/* 0xB8-0xBF */
+	0xB050,0xB051,0xB052,0xB056,0xB058,0xB05A,0xB05B,0xB05C,/* 0xC0-0xC7 */
+	0xB05E,0xB05F,0xB060,0xB061,0xB062,0xB063,0xB064,0xB065,/* 0xC8-0xCF */
+	0xB066,0xB067,0xB068,0xB069,0xB06A,0xB06B,0xB06C,0xB06D,/* 0xD0-0xD7 */
+	0xB06E,0xB06F,0xB070,0xB071,0xB072,0xB073,0xB074,0xB075,/* 0xD8-0xDF */
+	0xB076,0xB077,0xB078,0xB079,0xB07A,0xB07B,0xB07E,0xB07F,/* 0xE0-0xE7 */
+	0xB081,0xB082,0xB083,0xB085,0xB086,0xB087,0xB088,0xB089,/* 0xE8-0xEF */
+	0xB08A,0xB08B,0xB08E,0xB090,0xB092,0xB093,0xB094,0xB095,/* 0xF0-0xF7 */
+	0xB096,0xB097,0xB09B,0xB09D,0xB09E,0xB0A3,0xB0A4,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_86[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB0A5,0xB0A6,0xB0A7,0xB0AA,0xB0B0,0xB0B2,0xB0B6,/* 0x40-0x47 */
+	0xB0B7,0xB0B9,0xB0BA,0xB0BB,0xB0BD,0xB0BE,0xB0BF,0xB0C0,/* 0x48-0x4F */
+	0xB0C1,0xB0C2,0xB0C3,0xB0C6,0xB0CA,0xB0CB,0xB0CC,0xB0CD,/* 0x50-0x57 */
+	0xB0CE,0xB0CF,0xB0D2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB0D3,0xB0D5,0xB0D6,0xB0D7,0xB0D9,0xB0DA,0xB0DB,/* 0x60-0x67 */
+	0xB0DC,0xB0DD,0xB0DE,0xB0DF,0xB0E1,0xB0E2,0xB0E3,0xB0E4,/* 0x68-0x6F */
+	0xB0E6,0xB0E7,0xB0E8,0xB0E9,0xB0EA,0xB0EB,0xB0EC,0xB0ED,/* 0x70-0x77 */
+	0xB0EE,0xB0EF,0xB0F0,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB0F1,0xB0F2,0xB0F3,0xB0F4,0xB0F5,0xB0F6,0xB0F7,/* 0x80-0x87 */
+	0xB0F8,0xB0F9,0xB0FA,0xB0FB,0xB0FC,0xB0FD,0xB0FE,0xB0FF,/* 0x88-0x8F */
+	0xB100,0xB101,0xB102,0xB103,0xB104,0xB105,0xB106,0xB107,/* 0x90-0x97 */
+	0xB10A,0xB10D,0xB10E,0xB10F,0xB111,0xB114,0xB115,0xB116,/* 0x98-0x9F */
+	0xB117,0xB11A,0xB11E,0xB11F,0xB120,0xB121,0xB122,0xB126,/* 0xA0-0xA7 */
+	0xB127,0xB129,0xB12A,0xB12B,0xB12D,0xB12E,0xB12F,0xB130,/* 0xA8-0xAF */
+	0xB131,0xB132,0xB133,0xB136,0xB13A,0xB13B,0xB13C,0xB13D,/* 0xB0-0xB7 */
+	0xB13E,0xB13F,0xB142,0xB143,0xB145,0xB146,0xB147,0xB149,/* 0xB8-0xBF */
+	0xB14A,0xB14B,0xB14C,0xB14D,0xB14E,0xB14F,0xB152,0xB153,/* 0xC0-0xC7 */
+	0xB156,0xB157,0xB159,0xB15A,0xB15B,0xB15D,0xB15E,0xB15F,/* 0xC8-0xCF */
+	0xB161,0xB162,0xB163,0xB164,0xB165,0xB166,0xB167,0xB168,/* 0xD0-0xD7 */
+	0xB169,0xB16A,0xB16B,0xB16C,0xB16D,0xB16E,0xB16F,0xB170,/* 0xD8-0xDF */
+	0xB171,0xB172,0xB173,0xB174,0xB175,0xB176,0xB177,0xB17A,/* 0xE0-0xE7 */
+	0xB17B,0xB17D,0xB17E,0xB17F,0xB181,0xB183,0xB184,0xB185,/* 0xE8-0xEF */
+	0xB186,0xB187,0xB18A,0xB18C,0xB18E,0xB18F,0xB190,0xB191,/* 0xF0-0xF7 */
+	0xB195,0xB196,0xB197,0xB199,0xB19A,0xB19B,0xB19D,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_87[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB19E,0xB19F,0xB1A0,0xB1A1,0xB1A2,0xB1A3,0xB1A4,/* 0x40-0x47 */
+	0xB1A5,0xB1A6,0xB1A7,0xB1A9,0xB1AA,0xB1AB,0xB1AC,0xB1AD,/* 0x48-0x4F */
+	0xB1AE,0xB1AF,0xB1B0,0xB1B1,0xB1B2,0xB1B3,0xB1B4,0xB1B5,/* 0x50-0x57 */
+	0xB1B6,0xB1B7,0xB1B8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB1B9,0xB1BA,0xB1BB,0xB1BC,0xB1BD,0xB1BE,0xB1BF,/* 0x60-0x67 */
+	0xB1C0,0xB1C1,0xB1C2,0xB1C3,0xB1C4,0xB1C5,0xB1C6,0xB1C7,/* 0x68-0x6F */
+	0xB1C8,0xB1C9,0xB1CA,0xB1CB,0xB1CD,0xB1CE,0xB1CF,0xB1D1,/* 0x70-0x77 */
+	0xB1D2,0xB1D3,0xB1D5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB1D6,0xB1D7,0xB1D8,0xB1D9,0xB1DA,0xB1DB,0xB1DE,/* 0x80-0x87 */
+	0xB1E0,0xB1E1,0xB1E2,0xB1E3,0xB1E4,0xB1E5,0xB1E6,0xB1E7,/* 0x88-0x8F */
+	0xB1EA,0xB1EB,0xB1ED,0xB1EE,0xB1EF,0xB1F1,0xB1F2,0xB1F3,/* 0x90-0x97 */
+	0xB1F4,0xB1F5,0xB1F6,0xB1F7,0xB1F8,0xB1FA,0xB1FC,0xB1FE,/* 0x98-0x9F */
+	0xB1FF,0xB200,0xB201,0xB202,0xB203,0xB206,0xB207,0xB209,/* 0xA0-0xA7 */
+	0xB20A,0xB20D,0xB20E,0xB20F,0xB210,0xB211,0xB212,0xB213,/* 0xA8-0xAF */
+	0xB216,0xB218,0xB21A,0xB21B,0xB21C,0xB21D,0xB21E,0xB21F,/* 0xB0-0xB7 */
+	0xB221,0xB222,0xB223,0xB224,0xB225,0xB226,0xB227,0xB228,/* 0xB8-0xBF */
+	0xB229,0xB22A,0xB22B,0xB22C,0xB22D,0xB22E,0xB22F,0xB230,/* 0xC0-0xC7 */
+	0xB231,0xB232,0xB233,0xB235,0xB236,0xB237,0xB238,0xB239,/* 0xC8-0xCF */
+	0xB23A,0xB23B,0xB23D,0xB23E,0xB23F,0xB240,0xB241,0xB242,/* 0xD0-0xD7 */
+	0xB243,0xB244,0xB245,0xB246,0xB247,0xB248,0xB249,0xB24A,/* 0xD8-0xDF */
+	0xB24B,0xB24C,0xB24D,0xB24E,0xB24F,0xB250,0xB251,0xB252,/* 0xE0-0xE7 */
+	0xB253,0xB254,0xB255,0xB256,0xB257,0xB259,0xB25A,0xB25B,/* 0xE8-0xEF */
+	0xB25D,0xB25E,0xB25F,0xB261,0xB262,0xB263,0xB264,0xB265,/* 0xF0-0xF7 */
+	0xB266,0xB267,0xB26A,0xB26B,0xB26C,0xB26D,0xB26E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_88[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB26F,0xB270,0xB271,0xB272,0xB273,0xB276,0xB277,/* 0x40-0x47 */
+	0xB278,0xB279,0xB27A,0xB27B,0xB27D,0xB27E,0xB27F,0xB280,/* 0x48-0x4F */
+	0xB281,0xB282,0xB283,0xB286,0xB287,0xB288,0xB28A,0xB28B,/* 0x50-0x57 */
+	0xB28C,0xB28D,0xB28E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB28F,0xB292,0xB293,0xB295,0xB296,0xB297,0xB29B,/* 0x60-0x67 */
+	0xB29C,0xB29D,0xB29E,0xB29F,0xB2A2,0xB2A4,0xB2A7,0xB2A8,/* 0x68-0x6F */
+	0xB2A9,0xB2AB,0xB2AD,0xB2AE,0xB2AF,0xB2B1,0xB2B2,0xB2B3,/* 0x70-0x77 */
+	0xB2B5,0xB2B6,0xB2B7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB2B8,0xB2B9,0xB2BA,0xB2BB,0xB2BC,0xB2BD,0xB2BE,/* 0x80-0x87 */
+	0xB2BF,0xB2C0,0xB2C1,0xB2C2,0xB2C3,0xB2C4,0xB2C5,0xB2C6,/* 0x88-0x8F */
+	0xB2C7,0xB2CA,0xB2CB,0xB2CD,0xB2CE,0xB2CF,0xB2D1,0xB2D3,/* 0x90-0x97 */
+	0xB2D4,0xB2D5,0xB2D6,0xB2D7,0xB2DA,0xB2DC,0xB2DE,0xB2DF,/* 0x98-0x9F */
+	0xB2E0,0xB2E1,0xB2E3,0xB2E7,0xB2E9,0xB2EA,0xB2F0,0xB2F1,/* 0xA0-0xA7 */
+	0xB2F2,0xB2F6,0xB2FC,0xB2FD,0xB2FE,0xB302,0xB303,0xB305,/* 0xA8-0xAF */
+	0xB306,0xB307,0xB309,0xB30A,0xB30B,0xB30C,0xB30D,0xB30E,/* 0xB0-0xB7 */
+	0xB30F,0xB312,0xB316,0xB317,0xB318,0xB319,0xB31A,0xB31B,/* 0xB8-0xBF */
+	0xB31D,0xB31E,0xB31F,0xB320,0xB321,0xB322,0xB323,0xB324,/* 0xC0-0xC7 */
+	0xB325,0xB326,0xB327,0xB328,0xB329,0xB32A,0xB32B,0xB32C,/* 0xC8-0xCF */
+	0xB32D,0xB32E,0xB32F,0xB330,0xB331,0xB332,0xB333,0xB334,/* 0xD0-0xD7 */
+	0xB335,0xB336,0xB337,0xB338,0xB339,0xB33A,0xB33B,0xB33C,/* 0xD8-0xDF */
+	0xB33D,0xB33E,0xB33F,0xB340,0xB341,0xB342,0xB343,0xB344,/* 0xE0-0xE7 */
+	0xB345,0xB346,0xB347,0xB348,0xB349,0xB34A,0xB34B,0xB34C,/* 0xE8-0xEF */
+	0xB34D,0xB34E,0xB34F,0xB350,0xB351,0xB352,0xB353,0xB357,/* 0xF0-0xF7 */
+	0xB359,0xB35A,0xB35D,0xB360,0xB361,0xB362,0xB363,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_89[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB366,0xB368,0xB36A,0xB36C,0xB36D,0xB36F,0xB372,/* 0x40-0x47 */
+	0xB373,0xB375,0xB376,0xB377,0xB379,0xB37A,0xB37B,0xB37C,/* 0x48-0x4F */
+	0xB37D,0xB37E,0xB37F,0xB382,0xB386,0xB387,0xB388,0xB389,/* 0x50-0x57 */
+	0xB38A,0xB38B,0xB38D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB38E,0xB38F,0xB391,0xB392,0xB393,0xB395,0xB396,/* 0x60-0x67 */
+	0xB397,0xB398,0xB399,0xB39A,0xB39B,0xB39C,0xB39D,0xB39E,/* 0x68-0x6F */
+	0xB39F,0xB3A2,0xB3A3,0xB3A4,0xB3A5,0xB3A6,0xB3A7,0xB3A9,/* 0x70-0x77 */
+	0xB3AA,0xB3AB,0xB3AD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB3AE,0xB3AF,0xB3B0,0xB3B1,0xB3B2,0xB3B3,0xB3B4,/* 0x80-0x87 */
+	0xB3B5,0xB3B6,0xB3B7,0xB3B8,0xB3B9,0xB3BA,0xB3BB,0xB3BC,/* 0x88-0x8F */
+	0xB3BD,0xB3BE,0xB3BF,0xB3C0,0xB3C1,0xB3C2,0xB3C3,0xB3C6,/* 0x90-0x97 */
+	0xB3C7,0xB3C9,0xB3CA,0xB3CD,0xB3CF,0xB3D1,0xB3D2,0xB3D3,/* 0x98-0x9F */
+	0xB3D6,0xB3D8,0xB3DA,0xB3DC,0xB3DE,0xB3DF,0xB3E1,0xB3E2,/* 0xA0-0xA7 */
+	0xB3E3,0xB3E5,0xB3E6,0xB3E7,0xB3E9,0xB3EA,0xB3EB,0xB3EC,/* 0xA8-0xAF */
+	0xB3ED,0xB3EE,0xB3EF,0xB3F0,0xB3F1,0xB3F2,0xB3F3,0xB3F4,/* 0xB0-0xB7 */
+	0xB3F5,0xB3F6,0xB3F7,0xB3F8,0xB3F9,0xB3FA,0xB3FB,0xB3FD,/* 0xB8-0xBF */
+	0xB3FE,0xB3FF,0xB400,0xB401,0xB402,0xB403,0xB404,0xB405,/* 0xC0-0xC7 */
+	0xB406,0xB407,0xB408,0xB409,0xB40A,0xB40B,0xB40C,0xB40D,/* 0xC8-0xCF */
+	0xB40E,0xB40F,0xB411,0xB412,0xB413,0xB414,0xB415,0xB416,/* 0xD0-0xD7 */
+	0xB417,0xB419,0xB41A,0xB41B,0xB41D,0xB41E,0xB41F,0xB421,/* 0xD8-0xDF */
+	0xB422,0xB423,0xB424,0xB425,0xB426,0xB427,0xB42A,0xB42C,/* 0xE0-0xE7 */
+	0xB42D,0xB42E,0xB42F,0xB430,0xB431,0xB432,0xB433,0xB435,/* 0xE8-0xEF */
+	0xB436,0xB437,0xB438,0xB439,0xB43A,0xB43B,0xB43C,0xB43D,/* 0xF0-0xF7 */
+	0xB43E,0xB43F,0xB440,0xB441,0xB442,0xB443,0xB444,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8A[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB445,0xB446,0xB447,0xB448,0xB449,0xB44A,0xB44B,/* 0x40-0x47 */
+	0xB44C,0xB44D,0xB44E,0xB44F,0xB452,0xB453,0xB455,0xB456,/* 0x48-0x4F */
+	0xB457,0xB459,0xB45A,0xB45B,0xB45C,0xB45D,0xB45E,0xB45F,/* 0x50-0x57 */
+	0xB462,0xB464,0xB466,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB467,0xB468,0xB469,0xB46A,0xB46B,0xB46D,0xB46E,/* 0x60-0x67 */
+	0xB46F,0xB470,0xB471,0xB472,0xB473,0xB474,0xB475,0xB476,/* 0x68-0x6F */
+	0xB477,0xB478,0xB479,0xB47A,0xB47B,0xB47C,0xB47D,0xB47E,/* 0x70-0x77 */
+	0xB47F,0xB481,0xB482,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB483,0xB484,0xB485,0xB486,0xB487,0xB489,0xB48A,/* 0x80-0x87 */
+	0xB48B,0xB48C,0xB48D,0xB48E,0xB48F,0xB490,0xB491,0xB492,/* 0x88-0x8F */
+	0xB493,0xB494,0xB495,0xB496,0xB497,0xB498,0xB499,0xB49A,/* 0x90-0x97 */
+	0xB49B,0xB49C,0xB49E,0xB49F,0xB4A0,0xB4A1,0xB4A2,0xB4A3,/* 0x98-0x9F */
+	0xB4A5,0xB4A6,0xB4A7,0xB4A9,0xB4AA,0xB4AB,0xB4AD,0xB4AE,/* 0xA0-0xA7 */
+	0xB4AF,0xB4B0,0xB4B1,0xB4B2,0xB4B3,0xB4B4,0xB4B6,0xB4B8,/* 0xA8-0xAF */
+	0xB4BA,0xB4BB,0xB4BC,0xB4BD,0xB4BE,0xB4BF,0xB4C1,0xB4C2,/* 0xB0-0xB7 */
+	0xB4C3,0xB4C5,0xB4C6,0xB4C7,0xB4C9,0xB4CA,0xB4CB,0xB4CC,/* 0xB8-0xBF */
+	0xB4CD,0xB4CE,0xB4CF,0xB4D1,0xB4D2,0xB4D3,0xB4D4,0xB4D6,/* 0xC0-0xC7 */
+	0xB4D7,0xB4D8,0xB4D9,0xB4DA,0xB4DB,0xB4DE,0xB4DF,0xB4E1,/* 0xC8-0xCF */
+	0xB4E2,0xB4E5,0xB4E7,0xB4E8,0xB4E9,0xB4EA,0xB4EB,0xB4EE,/* 0xD0-0xD7 */
+	0xB4F0,0xB4F2,0xB4F3,0xB4F4,0xB4F5,0xB4F6,0xB4F7,0xB4F9,/* 0xD8-0xDF */
+	0xB4FA,0xB4FB,0xB4FC,0xB4FD,0xB4FE,0xB4FF,0xB500,0xB501,/* 0xE0-0xE7 */
+	0xB502,0xB503,0xB504,0xB505,0xB506,0xB507,0xB508,0xB509,/* 0xE8-0xEF */
+	0xB50A,0xB50B,0xB50C,0xB50D,0xB50E,0xB50F,0xB510,0xB511,/* 0xF0-0xF7 */
+	0xB512,0xB513,0xB516,0xB517,0xB519,0xB51A,0xB51D,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8B[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB51E,0xB51F,0xB520,0xB521,0xB522,0xB523,0xB526,/* 0x40-0x47 */
+	0xB52B,0xB52C,0xB52D,0xB52E,0xB52F,0xB532,0xB533,0xB535,/* 0x48-0x4F */
+	0xB536,0xB537,0xB539,0xB53A,0xB53B,0xB53C,0xB53D,0xB53E,/* 0x50-0x57 */
+	0xB53F,0xB542,0xB546,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB547,0xB548,0xB549,0xB54A,0xB54E,0xB54F,0xB551,/* 0x60-0x67 */
+	0xB552,0xB553,0xB555,0xB556,0xB557,0xB558,0xB559,0xB55A,/* 0x68-0x6F */
+	0xB55B,0xB55E,0xB562,0xB563,0xB564,0xB565,0xB566,0xB567,/* 0x70-0x77 */
+	0xB568,0xB569,0xB56A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB56B,0xB56C,0xB56D,0xB56E,0xB56F,0xB570,0xB571,/* 0x80-0x87 */
+	0xB572,0xB573,0xB574,0xB575,0xB576,0xB577,0xB578,0xB579,/* 0x88-0x8F */
+	0xB57A,0xB57B,0xB57C,0xB57D,0xB57E,0xB57F,0xB580,0xB581,/* 0x90-0x97 */
+	0xB582,0xB583,0xB584,0xB585,0xB586,0xB587,0xB588,0xB589,/* 0x98-0x9F */
+	0xB58A,0xB58B,0xB58C,0xB58D,0xB58E,0xB58F,0xB590,0xB591,/* 0xA0-0xA7 */
+	0xB592,0xB593,0xB594,0xB595,0xB596,0xB597,0xB598,0xB599,/* 0xA8-0xAF */
+	0xB59A,0xB59B,0xB59C,0xB59D,0xB59E,0xB59F,0xB5A2,0xB5A3,/* 0xB0-0xB7 */
+	0xB5A5,0xB5A6,0xB5A7,0xB5A9,0xB5AC,0xB5AD,0xB5AE,0xB5AF,/* 0xB8-0xBF */
+	0xB5B2,0xB5B6,0xB5B7,0xB5B8,0xB5B9,0xB5BA,0xB5BE,0xB5BF,/* 0xC0-0xC7 */
+	0xB5C1,0xB5C2,0xB5C3,0xB5C5,0xB5C6,0xB5C7,0xB5C8,0xB5C9,/* 0xC8-0xCF */
+	0xB5CA,0xB5CB,0xB5CE,0xB5D2,0xB5D3,0xB5D4,0xB5D5,0xB5D6,/* 0xD0-0xD7 */
+	0xB5D7,0xB5D9,0xB5DA,0xB5DB,0xB5DC,0xB5DD,0xB5DE,0xB5DF,/* 0xD8-0xDF */
+	0xB5E0,0xB5E1,0xB5E2,0xB5E3,0xB5E4,0xB5E5,0xB5E6,0xB5E7,/* 0xE0-0xE7 */
+	0xB5E8,0xB5E9,0xB5EA,0xB5EB,0xB5ED,0xB5EE,0xB5EF,0xB5F0,/* 0xE8-0xEF */
+	0xB5F1,0xB5F2,0xB5F3,0xB5F4,0xB5F5,0xB5F6,0xB5F7,0xB5F8,/* 0xF0-0xF7 */
+	0xB5F9,0xB5FA,0xB5FB,0xB5FC,0xB5FD,0xB5FE,0xB5FF,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8C[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB600,0xB601,0xB602,0xB603,0xB604,0xB605,0xB606,/* 0x40-0x47 */
+	0xB607,0xB608,0xB609,0xB60A,0xB60B,0xB60C,0xB60D,0xB60E,/* 0x48-0x4F */
+	0xB60F,0xB612,0xB613,0xB615,0xB616,0xB617,0xB619,0xB61A,/* 0x50-0x57 */
+	0xB61B,0xB61C,0xB61D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB61E,0xB61F,0xB620,0xB621,0xB622,0xB623,0xB624,/* 0x60-0x67 */
+	0xB626,0xB627,0xB628,0xB629,0xB62A,0xB62B,0xB62D,0xB62E,/* 0x68-0x6F */
+	0xB62F,0xB630,0xB631,0xB632,0xB633,0xB635,0xB636,0xB637,/* 0x70-0x77 */
+	0xB638,0xB639,0xB63A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB63B,0xB63C,0xB63D,0xB63E,0xB63F,0xB640,0xB641,/* 0x80-0x87 */
+	0xB642,0xB643,0xB644,0xB645,0xB646,0xB647,0xB649,0xB64A,/* 0x88-0x8F */
+	0xB64B,0xB64C,0xB64D,0xB64E,0xB64F,0xB650,0xB651,0xB652,/* 0x90-0x97 */
+	0xB653,0xB654,0xB655,0xB656,0xB657,0xB658,0xB659,0xB65A,/* 0x98-0x9F */
+	0xB65B,0xB65C,0xB65D,0xB65E,0xB65F,0xB660,0xB661,0xB662,/* 0xA0-0xA7 */
+	0xB663,0xB665,0xB666,0xB667,0xB669,0xB66A,0xB66B,0xB66C,/* 0xA8-0xAF */
+	0xB66D,0xB66E,0xB66F,0xB670,0xB671,0xB672,0xB673,0xB674,/* 0xB0-0xB7 */
+	0xB675,0xB676,0xB677,0xB678,0xB679,0xB67A,0xB67B,0xB67C,/* 0xB8-0xBF */
+	0xB67D,0xB67E,0xB67F,0xB680,0xB681,0xB682,0xB683,0xB684,/* 0xC0-0xC7 */
+	0xB685,0xB686,0xB687,0xB688,0xB689,0xB68A,0xB68B,0xB68C,/* 0xC8-0xCF */
+	0xB68D,0xB68E,0xB68F,0xB690,0xB691,0xB692,0xB693,0xB694,/* 0xD0-0xD7 */
+	0xB695,0xB696,0xB697,0xB698,0xB699,0xB69A,0xB69B,0xB69E,/* 0xD8-0xDF */
+	0xB69F,0xB6A1,0xB6A2,0xB6A3,0xB6A5,0xB6A6,0xB6A7,0xB6A8,/* 0xE0-0xE7 */
+	0xB6A9,0xB6AA,0xB6AD,0xB6AE,0xB6AF,0xB6B0,0xB6B2,0xB6B3,/* 0xE8-0xEF */
+	0xB6B4,0xB6B5,0xB6B6,0xB6B7,0xB6B8,0xB6B9,0xB6BA,0xB6BB,/* 0xF0-0xF7 */
+	0xB6BC,0xB6BD,0xB6BE,0xB6BF,0xB6C0,0xB6C1,0xB6C2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8D[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB6C3,0xB6C4,0xB6C5,0xB6C6,0xB6C7,0xB6C8,0xB6C9,/* 0x40-0x47 */
+	0xB6CA,0xB6CB,0xB6CC,0xB6CD,0xB6CE,0xB6CF,0xB6D0,0xB6D1,/* 0x48-0x4F */
+	0xB6D2,0xB6D3,0xB6D5,0xB6D6,0xB6D7,0xB6D8,0xB6D9,0xB6DA,/* 0x50-0x57 */
+	0xB6DB,0xB6DC,0xB6DD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB6DE,0xB6DF,0xB6E0,0xB6E1,0xB6E2,0xB6E3,0xB6E4,/* 0x60-0x67 */
+	0xB6E5,0xB6E6,0xB6E7,0xB6E8,0xB6E9,0xB6EA,0xB6EB,0xB6EC,/* 0x68-0x6F */
+	0xB6ED,0xB6EE,0xB6EF,0xB6F1,0xB6F2,0xB6F3,0xB6F5,0xB6F6,/* 0x70-0x77 */
+	0xB6F7,0xB6F9,0xB6FA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB6FB,0xB6FC,0xB6FD,0xB6FE,0xB6FF,0xB702,0xB703,/* 0x80-0x87 */
+	0xB704,0xB706,0xB707,0xB708,0xB709,0xB70A,0xB70B,0xB70C,/* 0x88-0x8F */
+	0xB70D,0xB70E,0xB70F,0xB710,0xB711,0xB712,0xB713,0xB714,/* 0x90-0x97 */
+	0xB715,0xB716,0xB717,0xB718,0xB719,0xB71A,0xB71B,0xB71C,/* 0x98-0x9F */
+	0xB71D,0xB71E,0xB71F,0xB720,0xB721,0xB722,0xB723,0xB724,/* 0xA0-0xA7 */
+	0xB725,0xB726,0xB727,0xB72A,0xB72B,0xB72D,0xB72E,0xB731,/* 0xA8-0xAF */
+	0xB732,0xB733,0xB734,0xB735,0xB736,0xB737,0xB73A,0xB73C,/* 0xB0-0xB7 */
+	0xB73D,0xB73E,0xB73F,0xB740,0xB741,0xB742,0xB743,0xB745,/* 0xB8-0xBF */
+	0xB746,0xB747,0xB749,0xB74A,0xB74B,0xB74D,0xB74E,0xB74F,/* 0xC0-0xC7 */
+	0xB750,0xB751,0xB752,0xB753,0xB756,0xB757,0xB758,0xB759,/* 0xC8-0xCF */
+	0xB75A,0xB75B,0xB75C,0xB75D,0xB75E,0xB75F,0xB761,0xB762,/* 0xD0-0xD7 */
+	0xB763,0xB765,0xB766,0xB767,0xB769,0xB76A,0xB76B,0xB76C,/* 0xD8-0xDF */
+	0xB76D,0xB76E,0xB76F,0xB772,0xB774,0xB776,0xB777,0xB778,/* 0xE0-0xE7 */
+	0xB779,0xB77A,0xB77B,0xB77E,0xB77F,0xB781,0xB782,0xB783,/* 0xE8-0xEF */
+	0xB785,0xB786,0xB787,0xB788,0xB789,0xB78A,0xB78B,0xB78E,/* 0xF0-0xF7 */
+	0xB793,0xB794,0xB795,0xB79A,0xB79B,0xB79D,0xB79E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8E[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB79F,0xB7A1,0xB7A2,0xB7A3,0xB7A4,0xB7A5,0xB7A6,/* 0x40-0x47 */
+	0xB7A7,0xB7AA,0xB7AE,0xB7AF,0xB7B0,0xB7B1,0xB7B2,0xB7B3,/* 0x48-0x4F */
+	0xB7B6,0xB7B7,0xB7B9,0xB7BA,0xB7BB,0xB7BC,0xB7BD,0xB7BE,/* 0x50-0x57 */
+	0xB7BF,0xB7C0,0xB7C1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB7C2,0xB7C3,0xB7C4,0xB7C5,0xB7C6,0xB7C8,0xB7CA,/* 0x60-0x67 */
+	0xB7CB,0xB7CC,0xB7CD,0xB7CE,0xB7CF,0xB7D0,0xB7D1,0xB7D2,/* 0x68-0x6F */
+	0xB7D3,0xB7D4,0xB7D5,0xB7D6,0xB7D7,0xB7D8,0xB7D9,0xB7DA,/* 0x70-0x77 */
+	0xB7DB,0xB7DC,0xB7DD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB7DE,0xB7DF,0xB7E0,0xB7E1,0xB7E2,0xB7E3,0xB7E4,/* 0x80-0x87 */
+	0xB7E5,0xB7E6,0xB7E7,0xB7E8,0xB7E9,0xB7EA,0xB7EB,0xB7EE,/* 0x88-0x8F */
+	0xB7EF,0xB7F1,0xB7F2,0xB7F3,0xB7F5,0xB7F6,0xB7F7,0xB7F8,/* 0x90-0x97 */
+	0xB7F9,0xB7FA,0xB7FB,0xB7FE,0xB802,0xB803,0xB804,0xB805,/* 0x98-0x9F */
+	0xB806,0xB80A,0xB80B,0xB80D,0xB80E,0xB80F,0xB811,0xB812,/* 0xA0-0xA7 */
+	0xB813,0xB814,0xB815,0xB816,0xB817,0xB81A,0xB81C,0xB81E,/* 0xA8-0xAF */
+	0xB81F,0xB820,0xB821,0xB822,0xB823,0xB826,0xB827,0xB829,/* 0xB0-0xB7 */
+	0xB82A,0xB82B,0xB82D,0xB82E,0xB82F,0xB830,0xB831,0xB832,/* 0xB8-0xBF */
+	0xB833,0xB836,0xB83A,0xB83B,0xB83C,0xB83D,0xB83E,0xB83F,/* 0xC0-0xC7 */
+	0xB841,0xB842,0xB843,0xB845,0xB846,0xB847,0xB848,0xB849,/* 0xC8-0xCF */
+	0xB84A,0xB84B,0xB84C,0xB84D,0xB84E,0xB84F,0xB850,0xB852,/* 0xD0-0xD7 */
+	0xB854,0xB855,0xB856,0xB857,0xB858,0xB859,0xB85A,0xB85B,/* 0xD8-0xDF */
+	0xB85E,0xB85F,0xB861,0xB862,0xB863,0xB865,0xB866,0xB867,/* 0xE0-0xE7 */
+	0xB868,0xB869,0xB86A,0xB86B,0xB86E,0xB870,0xB872,0xB873,/* 0xE8-0xEF */
+	0xB874,0xB875,0xB876,0xB877,0xB879,0xB87A,0xB87B,0xB87D,/* 0xF0-0xF7 */
+	0xB87E,0xB87F,0xB880,0xB881,0xB882,0xB883,0xB884,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_8F[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB885,0xB886,0xB887,0xB888,0xB889,0xB88A,0xB88B,/* 0x40-0x47 */
+	0xB88C,0xB88E,0xB88F,0xB890,0xB891,0xB892,0xB893,0xB894,/* 0x48-0x4F */
+	0xB895,0xB896,0xB897,0xB898,0xB899,0xB89A,0xB89B,0xB89C,/* 0x50-0x57 */
+	0xB89D,0xB89E,0xB89F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB8A0,0xB8A1,0xB8A2,0xB8A3,0xB8A4,0xB8A5,0xB8A6,/* 0x60-0x67 */
+	0xB8A7,0xB8A9,0xB8AA,0xB8AB,0xB8AC,0xB8AD,0xB8AE,0xB8AF,/* 0x68-0x6F */
+	0xB8B1,0xB8B2,0xB8B3,0xB8B5,0xB8B6,0xB8B7,0xB8B9,0xB8BA,/* 0x70-0x77 */
+	0xB8BB,0xB8BC,0xB8BD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB8BE,0xB8BF,0xB8C2,0xB8C4,0xB8C6,0xB8C7,0xB8C8,/* 0x80-0x87 */
+	0xB8C9,0xB8CA,0xB8CB,0xB8CD,0xB8CE,0xB8CF,0xB8D1,0xB8D2,/* 0x88-0x8F */
+	0xB8D3,0xB8D5,0xB8D6,0xB8D7,0xB8D8,0xB8D9,0xB8DA,0xB8DB,/* 0x90-0x97 */
+	0xB8DC,0xB8DE,0xB8E0,0xB8E2,0xB8E3,0xB8E4,0xB8E5,0xB8E6,/* 0x98-0x9F */
+	0xB8E7,0xB8EA,0xB8EB,0xB8ED,0xB8EE,0xB8EF,0xB8F1,0xB8F2,/* 0xA0-0xA7 */
+	0xB8F3,0xB8F4,0xB8F5,0xB8F6,0xB8F7,0xB8FA,0xB8FC,0xB8FE,/* 0xA8-0xAF */
+	0xB8FF,0xB900,0xB901,0xB902,0xB903,0xB905,0xB906,0xB907,/* 0xB0-0xB7 */
+	0xB908,0xB909,0xB90A,0xB90B,0xB90C,0xB90D,0xB90E,0xB90F,/* 0xB8-0xBF */
+	0xB910,0xB911,0xB912,0xB913,0xB914,0xB915,0xB916,0xB917,/* 0xC0-0xC7 */
+	0xB919,0xB91A,0xB91B,0xB91C,0xB91D,0xB91E,0xB91F,0xB921,/* 0xC8-0xCF */
+	0xB922,0xB923,0xB924,0xB925,0xB926,0xB927,0xB928,0xB929,/* 0xD0-0xD7 */
+	0xB92A,0xB92B,0xB92C,0xB92D,0xB92E,0xB92F,0xB930,0xB931,/* 0xD8-0xDF */
+	0xB932,0xB933,0xB934,0xB935,0xB936,0xB937,0xB938,0xB939,/* 0xE0-0xE7 */
+	0xB93A,0xB93B,0xB93E,0xB93F,0xB941,0xB942,0xB943,0xB945,/* 0xE8-0xEF */
+	0xB946,0xB947,0xB948,0xB949,0xB94A,0xB94B,0xB94D,0xB94E,/* 0xF0-0xF7 */
+	0xB950,0xB952,0xB953,0xB954,0xB955,0xB956,0xB957,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_90[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xB95A,0xB95B,0xB95D,0xB95E,0xB95F,0xB961,0xB962,/* 0x40-0x47 */
+	0xB963,0xB964,0xB965,0xB966,0xB967,0xB96A,0xB96C,0xB96E,/* 0x48-0x4F */
+	0xB96F,0xB970,0xB971,0xB972,0xB973,0xB976,0xB977,0xB979,/* 0x50-0x57 */
+	0xB97A,0xB97B,0xB97D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xB97E,0xB97F,0xB980,0xB981,0xB982,0xB983,0xB986,/* 0x60-0x67 */
+	0xB988,0xB98B,0xB98C,0xB98F,0xB990,0xB991,0xB992,0xB993,/* 0x68-0x6F */
+	0xB994,0xB995,0xB996,0xB997,0xB998,0xB999,0xB99A,0xB99B,/* 0x70-0x77 */
+	0xB99C,0xB99D,0xB99E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xB99F,0xB9A0,0xB9A1,0xB9A2,0xB9A3,0xB9A4,0xB9A5,/* 0x80-0x87 */
+	0xB9A6,0xB9A7,0xB9A8,0xB9A9,0xB9AA,0xB9AB,0xB9AE,0xB9AF,/* 0x88-0x8F */
+	0xB9B1,0xB9B2,0xB9B3,0xB9B5,0xB9B6,0xB9B7,0xB9B8,0xB9B9,/* 0x90-0x97 */
+	0xB9BA,0xB9BB,0xB9BE,0xB9C0,0xB9C2,0xB9C3,0xB9C4,0xB9C5,/* 0x98-0x9F */
+	0xB9C6,0xB9C7,0xB9CA,0xB9CB,0xB9CD,0xB9D3,0xB9D4,0xB9D5,/* 0xA0-0xA7 */
+	0xB9D6,0xB9D7,0xB9DA,0xB9DC,0xB9DF,0xB9E0,0xB9E2,0xB9E6,/* 0xA8-0xAF */
+	0xB9E7,0xB9E9,0xB9EA,0xB9EB,0xB9ED,0xB9EE,0xB9EF,0xB9F0,/* 0xB0-0xB7 */
+	0xB9F1,0xB9F2,0xB9F3,0xB9F6,0xB9FB,0xB9FC,0xB9FD,0xB9FE,/* 0xB8-0xBF */
+	0xB9FF,0xBA02,0xBA03,0xBA04,0xBA05,0xBA06,0xBA07,0xBA09,/* 0xC0-0xC7 */
+	0xBA0A,0xBA0B,0xBA0C,0xBA0D,0xBA0E,0xBA0F,0xBA10,0xBA11,/* 0xC8-0xCF */
+	0xBA12,0xBA13,0xBA14,0xBA16,0xBA17,0xBA18,0xBA19,0xBA1A,/* 0xD0-0xD7 */
+	0xBA1B,0xBA1C,0xBA1D,0xBA1E,0xBA1F,0xBA20,0xBA21,0xBA22,/* 0xD8-0xDF */
+	0xBA23,0xBA24,0xBA25,0xBA26,0xBA27,0xBA28,0xBA29,0xBA2A,/* 0xE0-0xE7 */
+	0xBA2B,0xBA2C,0xBA2D,0xBA2E,0xBA2F,0xBA30,0xBA31,0xBA32,/* 0xE8-0xEF */
+	0xBA33,0xBA34,0xBA35,0xBA36,0xBA37,0xBA3A,0xBA3B,0xBA3D,/* 0xF0-0xF7 */
+	0xBA3E,0xBA3F,0xBA41,0xBA43,0xBA44,0xBA45,0xBA46,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_91[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xBA47,0xBA4A,0xBA4C,0xBA4F,0xBA50,0xBA51,0xBA52,/* 0x40-0x47 */
+	0xBA56,0xBA57,0xBA59,0xBA5A,0xBA5B,0xBA5D,0xBA5E,0xBA5F,/* 0x48-0x4F */
+	0xBA60,0xBA61,0xBA62,0xBA63,0xBA66,0xBA6A,0xBA6B,0xBA6C,/* 0x50-0x57 */
+	0xBA6D,0xBA6E,0xBA6F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xBA72,0xBA73,0xBA75,0xBA76,0xBA77,0xBA79,0xBA7A,/* 0x60-0x67 */
+	0xBA7B,0xBA7C,0xBA7D,0xBA7E,0xBA7F,0xBA80,0xBA81,0xBA82,/* 0x68-0x6F */
+	0xBA86,0xBA88,0xBA89,0xBA8A,0xBA8B,0xBA8D,0xBA8E,0xBA8F,/* 0x70-0x77 */
+	0xBA90,0xBA91,0xBA92,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xBA93,0xBA94,0xBA95,0xBA96,0xBA97,0xBA98,0xBA99,/* 0x80-0x87 */
+	0xBA9A,0xBA9B,0xBA9C,0xBA9D,0xBA9E,0xBA9F,0xBAA0,0xBAA1,/* 0x88-0x8F */
+	0xBAA2,0xBAA3,0xBAA4,0xBAA5,0xBAA6,0xBAA7,0xBAAA,0xBAAD,/* 0x90-0x97 */
+	0xBAAE,0xBAAF,0xBAB1,0xBAB3,0xBAB4,0xBAB5,0xBAB6,0xBAB7,/* 0x98-0x9F */
+	0xBABA,0xBABC,0xBABE,0xBABF,0xBAC0,0xBAC1,0xBAC2,0xBAC3,/* 0xA0-0xA7 */
+	0xBAC5,0xBAC6,0xBAC7,0xBAC9,0xBACA,0xBACB,0xBACC,0xBACD,/* 0xA8-0xAF */
+	0xBACE,0xBACF,0xBAD0,0xBAD1,0xBAD2,0xBAD3,0xBAD4,0xBAD5,/* 0xB0-0xB7 */
+	0xBAD6,0xBAD7,0xBADA,0xBADB,0xBADC,0xBADD,0xBADE,0xBADF,/* 0xB8-0xBF */
+	0xBAE0,0xBAE1,0xBAE2,0xBAE3,0xBAE4,0xBAE5,0xBAE6,0xBAE7,/* 0xC0-0xC7 */
+	0xBAE8,0xBAE9,0xBAEA,0xBAEB,0xBAEC,0xBAED,0xBAEE,0xBAEF,/* 0xC8-0xCF */
+	0xBAF0,0xBAF1,0xBAF2,0xBAF3,0xBAF4,0xBAF5,0xBAF6,0xBAF7,/* 0xD0-0xD7 */
+	0xBAF8,0xBAF9,0xBAFA,0xBAFB,0xBAFD,0xBAFE,0xBAFF,0xBB01,/* 0xD8-0xDF */
+	0xBB02,0xBB03,0xBB05,0xBB06,0xBB07,0xBB08,0xBB09,0xBB0A,/* 0xE0-0xE7 */
+	0xBB0B,0xBB0C,0xBB0E,0xBB10,0xBB12,0xBB13,0xBB14,0xBB15,/* 0xE8-0xEF */
+	0xBB16,0xBB17,0xBB19,0xBB1A,0xBB1B,0xBB1D,0xBB1E,0xBB1F,/* 0xF0-0xF7 */
+	0xBB21,0xBB22,0xBB23,0xBB24,0xBB25,0xBB26,0xBB27,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_92[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xBB28,0xBB2A,0xBB2C,0xBB2D,0xBB2E,0xBB2F,0xBB30,/* 0x40-0x47 */
+	0xBB31,0xBB32,0xBB33,0xBB37,0xBB39,0xBB3A,0xBB3F,0xBB40,/* 0x48-0x4F */
+	0xBB41,0xBB42,0xBB43,0xBB46,0xBB48,0xBB4A,0xBB4B,0xBB4C,/* 0x50-0x57 */
+	0xBB4E,0xBB51,0xBB52,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xBB53,0xBB55,0xBB56,0xBB57,0xBB59,0xBB5A,0xBB5B,/* 0x60-0x67 */
+	0xBB5C,0xBB5D,0xBB5E,0xBB5F,0xBB60,0xBB62,0xBB64,0xBB65,/* 0x68-0x6F */
+	0xBB66,0xBB67,0xBB68,0xBB69,0xBB6A,0xBB6B,0xBB6D,0xBB6E,/* 0x70-0x77 */
+	0xBB6F,0xBB70,0xBB71,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xBB72,0xBB73,0xBB74,0xBB75,0xBB76,0xBB77,0xBB78,/* 0x80-0x87 */
+	0xBB79,0xBB7A,0xBB7B,0xBB7C,0xBB7D,0xBB7E,0xBB7F,0xBB80,/* 0x88-0x8F */
+	0xBB81,0xBB82,0xBB83,0xBB84,0xBB85,0xBB86,0xBB87,0xBB89,/* 0x90-0x97 */
+	0xBB8A,0xBB8B,0xBB8D,0xBB8E,0xBB8F,0xBB91,0xBB92,0xBB93,/* 0x98-0x9F */
+	0xBB94,0xBB95,0xBB96,0xBB97,0xBB98,0xBB99,0xBB9A,0xBB9B,/* 0xA0-0xA7 */
+	0xBB9C,0xBB9D,0xBB9E,0xBB9F,0xBBA0,0xBBA1,0xBBA2,0xBBA3,/* 0xA8-0xAF */
+	0xBBA5,0xBBA6,0xBBA7,0xBBA9,0xBBAA,0xBBAB,0xBBAD,0xBBAE,/* 0xB0-0xB7 */
+	0xBBAF,0xBBB0,0xBBB1,0xBBB2,0xBBB3,0xBBB5,0xBBB6,0xBBB8,/* 0xB8-0xBF */
+	0xBBB9,0xBBBA,0xBBBB,0xBBBC,0xBBBD,0xBBBE,0xBBBF,0xBBC1,/* 0xC0-0xC7 */
+	0xBBC2,0xBBC3,0xBBC5,0xBBC6,0xBBC7,0xBBC9,0xBBCA,0xBBCB,/* 0xC8-0xCF */
+	0xBBCC,0xBBCD,0xBBCE,0xBBCF,0xBBD1,0xBBD2,0xBBD4,0xBBD5,/* 0xD0-0xD7 */
+	0xBBD6,0xBBD7,0xBBD8,0xBBD9,0xBBDA,0xBBDB,0xBBDC,0xBBDD,/* 0xD8-0xDF */
+	0xBBDE,0xBBDF,0xBBE0,0xBBE1,0xBBE2,0xBBE3,0xBBE4,0xBBE5,/* 0xE0-0xE7 */
+	0xBBE6,0xBBE7,0xBBE8,0xBBE9,0xBBEA,0xBBEB,0xBBEC,0xBBED,/* 0xE8-0xEF */
+	0xBBEE,0xBBEF,0xBBF0,0xBBF1,0xBBF2,0xBBF3,0xBBF4,0xBBF5,/* 0xF0-0xF7 */
+	0xBBF6,0xBBF7,0xBBFA,0xBBFB,0xBBFD,0xBBFE,0xBC01,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_93[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xBC03,0xBC04,0xBC05,0xBC06,0xBC07,0xBC0A,0xBC0E,/* 0x40-0x47 */
+	0xBC10,0xBC12,0xBC13,0xBC19,0xBC1A,0xBC20,0xBC21,0xBC22,/* 0x48-0x4F */
+	0xBC23,0xBC26,0xBC28,0xBC2A,0xBC2B,0xBC2C,0xBC2E,0xBC2F,/* 0x50-0x57 */
+	0xBC32,0xBC33,0xBC35,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xBC36,0xBC37,0xBC39,0xBC3A,0xBC3B,0xBC3C,0xBC3D,/* 0x60-0x67 */
+	0xBC3E,0xBC3F,0xBC42,0xBC46,0xBC47,0xBC48,0xBC4A,0xBC4B,/* 0x68-0x6F */
+	0xBC4E,0xBC4F,0xBC51,0xBC52,0xBC53,0xBC54,0xBC55,0xBC56,/* 0x70-0x77 */
+	0xBC57,0xBC58,0xBC59,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xBC5A,0xBC5B,0xBC5C,0xBC5E,0xBC5F,0xBC60,0xBC61,/* 0x80-0x87 */
+	0xBC62,0xBC63,0xBC64,0xBC65,0xBC66,0xBC67,0xBC68,0xBC69,/* 0x88-0x8F */
+	0xBC6A,0xBC6B,0xBC6C,0xBC6D,0xBC6E,0xBC6F,0xBC70,0xBC71,/* 0x90-0x97 */
+	0xBC72,0xBC73,0xBC74,0xBC75,0xBC76,0xBC77,0xBC78,0xBC79,/* 0x98-0x9F */
+	0xBC7A,0xBC7B,0xBC7C,0xBC7D,0xBC7E,0xBC7F,0xBC80,0xBC81,/* 0xA0-0xA7 */
+	0xBC82,0xBC83,0xBC86,0xBC87,0xBC89,0xBC8A,0xBC8D,0xBC8F,/* 0xA8-0xAF */
+	0xBC90,0xBC91,0xBC92,0xBC93,0xBC96,0xBC98,0xBC9B,0xBC9C,/* 0xB0-0xB7 */
+	0xBC9D,0xBC9E,0xBC9F,0xBCA2,0xBCA3,0xBCA5,0xBCA6,0xBCA9,/* 0xB8-0xBF */
+	0xBCAA,0xBCAB,0xBCAC,0xBCAD,0xBCAE,0xBCAF,0xBCB2,0xBCB6,/* 0xC0-0xC7 */
+	0xBCB7,0xBCB8,0xBCB9,0xBCBA,0xBCBB,0xBCBE,0xBCBF,0xBCC1,/* 0xC8-0xCF */
+	0xBCC2,0xBCC3,0xBCC5,0xBCC6,0xBCC7,0xBCC8,0xBCC9,0xBCCA,/* 0xD0-0xD7 */
+	0xBCCB,0xBCCC,0xBCCE,0xBCD2,0xBCD3,0xBCD4,0xBCD6,0xBCD7,/* 0xD8-0xDF */
+	0xBCD9,0xBCDA,0xBCDB,0xBCDD,0xBCDE,0xBCDF,0xBCE0,0xBCE1,/* 0xE0-0xE7 */
+	0xBCE2,0xBCE3,0xBCE4,0xBCE5,0xBCE6,0xBCE7,0xBCE8,0xBCE9,/* 0xE8-0xEF */
+	0xBCEA,0xBCEB,0xBCEC,0xBCED,0xBCEE,0xBCEF,0xBCF0,0xBCF1,/* 0xF0-0xF7 */
+	0xBCF2,0xBCF3,0xBCF7,0xBCF9,0xBCFA,0xBCFB,0xBCFD,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_94[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xBCFE,0xBCFF,0xBD00,0xBD01,0xBD02,0xBD03,0xBD06,/* 0x40-0x47 */
+	0xBD08,0xBD0A,0xBD0B,0xBD0C,0xBD0D,0xBD0E,0xBD0F,0xBD11,/* 0x48-0x4F */
+	0xBD12,0xBD13,0xBD15,0xBD16,0xBD17,0xBD18,0xBD19,0xBD1A,/* 0x50-0x57 */
+	0xBD1B,0xBD1C,0xBD1D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xBD1E,0xBD1F,0xBD20,0xBD21,0xBD22,0xBD23,0xBD25,/* 0x60-0x67 */
+	0xBD26,0xBD27,0xBD28,0xBD29,0xBD2A,0xBD2B,0xBD2D,0xBD2E,/* 0x68-0x6F */
+	0xBD2F,0xBD30,0xBD31,0xBD32,0xBD33,0xBD34,0xBD35,0xBD36,/* 0x70-0x77 */
+	0xBD37,0xBD38,0xBD39,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xBD3A,0xBD3B,0xBD3C,0xBD3D,0xBD3E,0xBD3F,0xBD41,/* 0x80-0x87 */
+	0xBD42,0xBD43,0xBD44,0xBD45,0xBD46,0xBD47,0xBD4A,0xBD4B,/* 0x88-0x8F */
+	0xBD4D,0xBD4E,0xBD4F,0xBD51,0xBD52,0xBD53,0xBD54,0xBD55,/* 0x90-0x97 */
+	0xBD56,0xBD57,0xBD5A,0xBD5B,0xBD5C,0xBD5D,0xBD5E,0xBD5F,/* 0x98-0x9F */
+	0xBD60,0xBD61,0xBD62,0xBD63,0xBD65,0xBD66,0xBD67,0xBD69,/* 0xA0-0xA7 */
+	0xBD6A,0xBD6B,0xBD6C,0xBD6D,0xBD6E,0xBD6F,0xBD70,0xBD71,/* 0xA8-0xAF */
+	0xBD72,0xBD73,0xBD74,0xBD75,0xBD76,0xBD77,0xBD78,0xBD79,/* 0xB0-0xB7 */
+	0xBD7A,0xBD7B,0xBD7C,0xBD7D,0xBD7E,0xBD7F,0xBD82,0xBD83,/* 0xB8-0xBF */
+	0xBD85,0xBD86,0xBD8B,0xBD8C,0xBD8D,0xBD8E,0xBD8F,0xBD92,/* 0xC0-0xC7 */
+	0xBD94,0xBD96,0xBD97,0xBD98,0xBD9B,0xBD9D,0xBD9E,0xBD9F,/* 0xC8-0xCF */
+	0xBDA0,0xBDA1,0xBDA2,0xBDA3,0xBDA5,0xBDA6,0xBDA7,0xBDA8,/* 0xD0-0xD7 */
+	0xBDA9,0xBDAA,0xBDAB,0xBDAC,0xBDAD,0xBDAE,0xBDAF,0xBDB1,/* 0xD8-0xDF */
+	0xBDB2,0xBDB3,0xBDB4,0xBDB5,0xBDB6,0xBDB7,0xBDB9,0xBDBA,/* 0xE0-0xE7 */
+	0xBDBB,0xBDBC,0xBDBD,0xBDBE,0xBDBF,0xBDC0,0xBDC1,0xBDC2,/* 0xE8-0xEF */
+	0xBDC3,0xBDC4,0xBDC5,0xBDC6,0xBDC7,0xBDC8,0xBDC9,0xBDCA,/* 0xF0-0xF7 */
+	0xBDCB,0xBDCC,0xBDCD,0xBDCE,0xBDCF,0xBDD0,0xBDD1,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_95[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xBDD2,0xBDD3,0xBDD6,0xBDD7,0xBDD9,0xBDDA,0xBDDB,/* 0x40-0x47 */
+	0xBDDD,0xBDDE,0xBDDF,0xBDE0,0xBDE1,0xBDE2,0xBDE3,0xBDE4,/* 0x48-0x4F */
+	0xBDE5,0xBDE6,0xBDE7,0xBDE8,0xBDEA,0xBDEB,0xBDEC,0xBDED,/* 0x50-0x57 */
+	0xBDEE,0xBDEF,0xBDF1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xBDF2,0xBDF3,0xBDF5,0xBDF6,0xBDF7,0xBDF9,0xBDFA,/* 0x60-0x67 */
+	0xBDFB,0xBDFC,0xBDFD,0xBDFE,0xBDFF,0xBE01,0xBE02,0xBE04,/* 0x68-0x6F */
+	0xBE06,0xBE07,0xBE08,0xBE09,0xBE0A,0xBE0B,0xBE0E,0xBE0F,/* 0x70-0x77 */
+	0xBE11,0xBE12,0xBE13,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xBE15,0xBE16,0xBE17,0xBE18,0xBE19,0xBE1A,0xBE1B,/* 0x80-0x87 */
+	0xBE1E,0xBE20,0xBE21,0xBE22,0xBE23,0xBE24,0xBE25,0xBE26,/* 0x88-0x8F */
+	0xBE27,0xBE28,0xBE29,0xBE2A,0xBE2B,0xBE2C,0xBE2D,0xBE2E,/* 0x90-0x97 */
+	0xBE2F,0xBE30,0xBE31,0xBE32,0xBE33,0xBE34,0xBE35,0xBE36,/* 0x98-0x9F */
+	0xBE37,0xBE38,0xBE39,0xBE3A,0xBE3B,0xBE3C,0xBE3D,0xBE3E,/* 0xA0-0xA7 */
+	0xBE3F,0xBE40,0xBE41,0xBE42,0xBE43,0xBE46,0xBE47,0xBE49,/* 0xA8-0xAF */
+	0xBE4A,0xBE4B,0xBE4D,0xBE4F,0xBE50,0xBE51,0xBE52,0xBE53,/* 0xB0-0xB7 */
+	0xBE56,0xBE58,0xBE5C,0xBE5D,0xBE5E,0xBE5F,0xBE62,0xBE63,/* 0xB8-0xBF */
+	0xBE65,0xBE66,0xBE67,0xBE69,0xBE6B,0xBE6C,0xBE6D,0xBE6E,/* 0xC0-0xC7 */
+	0xBE6F,0xBE72,0xBE76,0xBE77,0xBE78,0xBE79,0xBE7A,0xBE7E,/* 0xC8-0xCF */
+	0xBE7F,0xBE81,0xBE82,0xBE83,0xBE85,0xBE86,0xBE87,0xBE88,/* 0xD0-0xD7 */
+	0xBE89,0xBE8A,0xBE8B,0xBE8E,0xBE92,0xBE93,0xBE94,0xBE95,/* 0xD8-0xDF */
+	0xBE96,0xBE97,0xBE9A,0xBE9B,0xBE9C,0xBE9D,0xBE9E,0xBE9F,/* 0xE0-0xE7 */
+	0xBEA0,0xBEA1,0xBEA2,0xBEA3,0xBEA4,0xBEA5,0xBEA6,0xBEA7,/* 0xE8-0xEF */
+	0xBEA9,0xBEAA,0xBEAB,0xBEAC,0xBEAD,0xBEAE,0xBEAF,0xBEB0,/* 0xF0-0xF7 */
+	0xBEB1,0xBEB2,0xBEB3,0xBEB4,0xBEB5,0xBEB6,0xBEB7,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_96[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xBEB8,0xBEB9,0xBEBA,0xBEBB,0xBEBC,0xBEBD,0xBEBE,/* 0x40-0x47 */
+	0xBEBF,0xBEC0,0xBEC1,0xBEC2,0xBEC3,0xBEC4,0xBEC5,0xBEC6,/* 0x48-0x4F */
+	0xBEC7,0xBEC8,0xBEC9,0xBECA,0xBECB,0xBECC,0xBECD,0xBECE,/* 0x50-0x57 */
+	0xBECF,0xBED2,0xBED3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xBED5,0xBED6,0xBED9,0xBEDA,0xBEDB,0xBEDC,0xBEDD,/* 0x60-0x67 */
+	0xBEDE,0xBEDF,0xBEE1,0xBEE2,0xBEE6,0xBEE7,0xBEE8,0xBEE9,/* 0x68-0x6F */
+	0xBEEA,0xBEEB,0xBEED,0xBEEE,0xBEEF,0xBEF0,0xBEF1,0xBEF2,/* 0x70-0x77 */
+	0xBEF3,0xBEF4,0xBEF5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xBEF6,0xBEF7,0xBEF8,0xBEF9,0xBEFA,0xBEFB,0xBEFC,/* 0x80-0x87 */
+	0xBEFD,0xBEFE,0xBEFF,0xBF00,0xBF02,0xBF03,0xBF04,0xBF05,/* 0x88-0x8F */
+	0xBF06,0xBF07,0xBF0A,0xBF0B,0xBF0C,0xBF0D,0xBF0E,0xBF0F,/* 0x90-0x97 */
+	0xBF10,0xBF11,0xBF12,0xBF13,0xBF14,0xBF15,0xBF16,0xBF17,/* 0x98-0x9F */
+	0xBF1A,0xBF1E,0xBF1F,0xBF20,0xBF21,0xBF22,0xBF23,0xBF24,/* 0xA0-0xA7 */
+	0xBF25,0xBF26,0xBF27,0xBF28,0xBF29,0xBF2A,0xBF2B,0xBF2C,/* 0xA8-0xAF */
+	0xBF2D,0xBF2E,0xBF2F,0xBF30,0xBF31,0xBF32,0xBF33,0xBF34,/* 0xB0-0xB7 */
+	0xBF35,0xBF36,0xBF37,0xBF38,0xBF39,0xBF3A,0xBF3B,0xBF3C,/* 0xB8-0xBF */
+	0xBF3D,0xBF3E,0xBF3F,0xBF42,0xBF43,0xBF45,0xBF46,0xBF47,/* 0xC0-0xC7 */
+	0xBF49,0xBF4A,0xBF4B,0xBF4C,0xBF4D,0xBF4E,0xBF4F,0xBF52,/* 0xC8-0xCF */
+	0xBF53,0xBF54,0xBF56,0xBF57,0xBF58,0xBF59,0xBF5A,0xBF5B,/* 0xD0-0xD7 */
+	0xBF5C,0xBF5D,0xBF5E,0xBF5F,0xBF60,0xBF61,0xBF62,0xBF63,/* 0xD8-0xDF */
+	0xBF64,0xBF65,0xBF66,0xBF67,0xBF68,0xBF69,0xBF6A,0xBF6B,/* 0xE0-0xE7 */
+	0xBF6C,0xBF6D,0xBF6E,0xBF6F,0xBF70,0xBF71,0xBF72,0xBF73,/* 0xE8-0xEF */
+	0xBF74,0xBF75,0xBF76,0xBF77,0xBF78,0xBF79,0xBF7A,0xBF7B,/* 0xF0-0xF7 */
+	0xBF7C,0xBF7D,0xBF7E,0xBF7F,0xBF80,0xBF81,0xBF82,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_97[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xBF83,0xBF84,0xBF85,0xBF86,0xBF87,0xBF88,0xBF89,/* 0x40-0x47 */
+	0xBF8A,0xBF8B,0xBF8C,0xBF8D,0xBF8E,0xBF8F,0xBF90,0xBF91,/* 0x48-0x4F */
+	0xBF92,0xBF93,0xBF95,0xBF96,0xBF97,0xBF98,0xBF99,0xBF9A,/* 0x50-0x57 */
+	0xBF9B,0xBF9C,0xBF9D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xBF9E,0xBF9F,0xBFA0,0xBFA1,0xBFA2,0xBFA3,0xBFA4,/* 0x60-0x67 */
+	0xBFA5,0xBFA6,0xBFA7,0xBFA8,0xBFA9,0xBFAA,0xBFAB,0xBFAC,/* 0x68-0x6F */
+	0xBFAD,0xBFAE,0xBFAF,0xBFB1,0xBFB2,0xBFB3,0xBFB4,0xBFB5,/* 0x70-0x77 */
+	0xBFB6,0xBFB7,0xBFB8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xBFB9,0xBFBA,0xBFBB,0xBFBC,0xBFBD,0xBFBE,0xBFBF,/* 0x80-0x87 */
+	0xBFC0,0xBFC1,0xBFC2,0xBFC3,0xBFC4,0xBFC6,0xBFC7,0xBFC8,/* 0x88-0x8F */
+	0xBFC9,0xBFCA,0xBFCB,0xBFCE,0xBFCF,0xBFD1,0xBFD2,0xBFD3,/* 0x90-0x97 */
+	0xBFD5,0xBFD6,0xBFD7,0xBFD8,0xBFD9,0xBFDA,0xBFDB,0xBFDD,/* 0x98-0x9F */
+	0xBFDE,0xBFE0,0xBFE2,0xBFE3,0xBFE4,0xBFE5,0xBFE6,0xBFE7,/* 0xA0-0xA7 */
+	0xBFE8,0xBFE9,0xBFEA,0xBFEB,0xBFEC,0xBFED,0xBFEE,0xBFEF,/* 0xA8-0xAF */
+	0xBFF0,0xBFF1,0xBFF2,0xBFF3,0xBFF4,0xBFF5,0xBFF6,0xBFF7,/* 0xB0-0xB7 */
+	0xBFF8,0xBFF9,0xBFFA,0xBFFB,0xBFFC,0xBFFD,0xBFFE,0xBFFF,/* 0xB8-0xBF */
+	0xC000,0xC001,0xC002,0xC003,0xC004,0xC005,0xC006,0xC007,/* 0xC0-0xC7 */
+	0xC008,0xC009,0xC00A,0xC00B,0xC00C,0xC00D,0xC00E,0xC00F,/* 0xC8-0xCF */
+	0xC010,0xC011,0xC012,0xC013,0xC014,0xC015,0xC016,0xC017,/* 0xD0-0xD7 */
+	0xC018,0xC019,0xC01A,0xC01B,0xC01C,0xC01D,0xC01E,0xC01F,/* 0xD8-0xDF */
+	0xC020,0xC021,0xC022,0xC023,0xC024,0xC025,0xC026,0xC027,/* 0xE0-0xE7 */
+	0xC028,0xC029,0xC02A,0xC02B,0xC02C,0xC02D,0xC02E,0xC02F,/* 0xE8-0xEF */
+	0xC030,0xC031,0xC032,0xC033,0xC034,0xC035,0xC036,0xC037,/* 0xF0-0xF7 */
+	0xC038,0xC039,0xC03A,0xC03B,0xC03D,0xC03E,0xC03F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_98[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC040,0xC041,0xC042,0xC043,0xC044,0xC045,0xC046,/* 0x40-0x47 */
+	0xC047,0xC048,0xC049,0xC04A,0xC04B,0xC04C,0xC04D,0xC04E,/* 0x48-0x4F */
+	0xC04F,0xC050,0xC052,0xC053,0xC054,0xC055,0xC056,0xC057,/* 0x50-0x57 */
+	0xC059,0xC05A,0xC05B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC05D,0xC05E,0xC05F,0xC061,0xC062,0xC063,0xC064,/* 0x60-0x67 */
+	0xC065,0xC066,0xC067,0xC06A,0xC06B,0xC06C,0xC06D,0xC06E,/* 0x68-0x6F */
+	0xC06F,0xC070,0xC071,0xC072,0xC073,0xC074,0xC075,0xC076,/* 0x70-0x77 */
+	0xC077,0xC078,0xC079,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC07A,0xC07B,0xC07C,0xC07D,0xC07E,0xC07F,0xC080,/* 0x80-0x87 */
+	0xC081,0xC082,0xC083,0xC084,0xC085,0xC086,0xC087,0xC088,/* 0x88-0x8F */
+	0xC089,0xC08A,0xC08B,0xC08C,0xC08D,0xC08E,0xC08F,0xC092,/* 0x90-0x97 */
+	0xC093,0xC095,0xC096,0xC097,0xC099,0xC09A,0xC09B,0xC09C,/* 0x98-0x9F */
+	0xC09D,0xC09E,0xC09F,0xC0A2,0xC0A4,0xC0A6,0xC0A7,0xC0A8,/* 0xA0-0xA7 */
+	0xC0A9,0xC0AA,0xC0AB,0xC0AE,0xC0B1,0xC0B2,0xC0B7,0xC0B8,/* 0xA8-0xAF */
+	0xC0B9,0xC0BA,0xC0BB,0xC0BE,0xC0C2,0xC0C3,0xC0C4,0xC0C6,/* 0xB0-0xB7 */
+	0xC0C7,0xC0CA,0xC0CB,0xC0CD,0xC0CE,0xC0CF,0xC0D1,0xC0D2,/* 0xB8-0xBF */
+	0xC0D3,0xC0D4,0xC0D5,0xC0D6,0xC0D7,0xC0DA,0xC0DE,0xC0DF,/* 0xC0-0xC7 */
+	0xC0E0,0xC0E1,0xC0E2,0xC0E3,0xC0E6,0xC0E7,0xC0E9,0xC0EA,/* 0xC8-0xCF */
+	0xC0EB,0xC0ED,0xC0EE,0xC0EF,0xC0F0,0xC0F1,0xC0F2,0xC0F3,/* 0xD0-0xD7 */
+	0xC0F6,0xC0F8,0xC0FA,0xC0FB,0xC0FC,0xC0FD,0xC0FE,0xC0FF,/* 0xD8-0xDF */
+	0xC101,0xC102,0xC103,0xC105,0xC106,0xC107,0xC109,0xC10A,/* 0xE0-0xE7 */
+	0xC10B,0xC10C,0xC10D,0xC10E,0xC10F,0xC111,0xC112,0xC113,/* 0xE8-0xEF */
+	0xC114,0xC116,0xC117,0xC118,0xC119,0xC11A,0xC11B,0xC121,/* 0xF0-0xF7 */
+	0xC122,0xC125,0xC128,0xC129,0xC12A,0xC12B,0xC12E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_99[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC132,0xC133,0xC134,0xC135,0xC137,0xC13A,0xC13B,/* 0x40-0x47 */
+	0xC13D,0xC13E,0xC13F,0xC141,0xC142,0xC143,0xC144,0xC145,/* 0x48-0x4F */
+	0xC146,0xC147,0xC14A,0xC14E,0xC14F,0xC150,0xC151,0xC152,/* 0x50-0x57 */
+	0xC153,0xC156,0xC157,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC159,0xC15A,0xC15B,0xC15D,0xC15E,0xC15F,0xC160,/* 0x60-0x67 */
+	0xC161,0xC162,0xC163,0xC166,0xC16A,0xC16B,0xC16C,0xC16D,/* 0x68-0x6F */
+	0xC16E,0xC16F,0xC171,0xC172,0xC173,0xC175,0xC176,0xC177,/* 0x70-0x77 */
+	0xC179,0xC17A,0xC17B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC17C,0xC17D,0xC17E,0xC17F,0xC180,0xC181,0xC182,/* 0x80-0x87 */
+	0xC183,0xC184,0xC186,0xC187,0xC188,0xC189,0xC18A,0xC18B,/* 0x88-0x8F */
+	0xC18F,0xC191,0xC192,0xC193,0xC195,0xC197,0xC198,0xC199,/* 0x90-0x97 */
+	0xC19A,0xC19B,0xC19E,0xC1A0,0xC1A2,0xC1A3,0xC1A4,0xC1A6,/* 0x98-0x9F */
+	0xC1A7,0xC1AA,0xC1AB,0xC1AD,0xC1AE,0xC1AF,0xC1B1,0xC1B2,/* 0xA0-0xA7 */
+	0xC1B3,0xC1B4,0xC1B5,0xC1B6,0xC1B7,0xC1B8,0xC1B9,0xC1BA,/* 0xA8-0xAF */
+	0xC1BB,0xC1BC,0xC1BE,0xC1BF,0xC1C0,0xC1C1,0xC1C2,0xC1C3,/* 0xB0-0xB7 */
+	0xC1C5,0xC1C6,0xC1C7,0xC1C9,0xC1CA,0xC1CB,0xC1CD,0xC1CE,/* 0xB8-0xBF */
+	0xC1CF,0xC1D0,0xC1D1,0xC1D2,0xC1D3,0xC1D5,0xC1D6,0xC1D9,/* 0xC0-0xC7 */
+	0xC1DA,0xC1DB,0xC1DC,0xC1DD,0xC1DE,0xC1DF,0xC1E1,0xC1E2,/* 0xC8-0xCF */
+	0xC1E3,0xC1E5,0xC1E6,0xC1E7,0xC1E9,0xC1EA,0xC1EB,0xC1EC,/* 0xD0-0xD7 */
+	0xC1ED,0xC1EE,0xC1EF,0xC1F2,0xC1F4,0xC1F5,0xC1F6,0xC1F7,/* 0xD8-0xDF */
+	0xC1F8,0xC1F9,0xC1FA,0xC1FB,0xC1FE,0xC1FF,0xC201,0xC202,/* 0xE0-0xE7 */
+	0xC203,0xC205,0xC206,0xC207,0xC208,0xC209,0xC20A,0xC20B,/* 0xE8-0xEF */
+	0xC20E,0xC210,0xC212,0xC213,0xC214,0xC215,0xC216,0xC217,/* 0xF0-0xF7 */
+	0xC21A,0xC21B,0xC21D,0xC21E,0xC221,0xC222,0xC223,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9A[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC224,0xC225,0xC226,0xC227,0xC22A,0xC22C,0xC22E,/* 0x40-0x47 */
+	0xC230,0xC233,0xC235,0xC236,0xC237,0xC238,0xC239,0xC23A,/* 0x48-0x4F */
+	0xC23B,0xC23C,0xC23D,0xC23E,0xC23F,0xC240,0xC241,0xC242,/* 0x50-0x57 */
+	0xC243,0xC244,0xC245,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC246,0xC247,0xC249,0xC24A,0xC24B,0xC24C,0xC24D,/* 0x60-0x67 */
+	0xC24E,0xC24F,0xC252,0xC253,0xC255,0xC256,0xC257,0xC259,/* 0x68-0x6F */
+	0xC25A,0xC25B,0xC25C,0xC25D,0xC25E,0xC25F,0xC261,0xC262,/* 0x70-0x77 */
+	0xC263,0xC264,0xC266,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC267,0xC268,0xC269,0xC26A,0xC26B,0xC26E,0xC26F,/* 0x80-0x87 */
+	0xC271,0xC272,0xC273,0xC275,0xC276,0xC277,0xC278,0xC279,/* 0x88-0x8F */
+	0xC27A,0xC27B,0xC27E,0xC280,0xC282,0xC283,0xC284,0xC285,/* 0x90-0x97 */
+	0xC286,0xC287,0xC28A,0xC28B,0xC28C,0xC28D,0xC28E,0xC28F,/* 0x98-0x9F */
+	0xC291,0xC292,0xC293,0xC294,0xC295,0xC296,0xC297,0xC299,/* 0xA0-0xA7 */
+	0xC29A,0xC29C,0xC29E,0xC29F,0xC2A0,0xC2A1,0xC2A2,0xC2A3,/* 0xA8-0xAF */
+	0xC2A6,0xC2A7,0xC2A9,0xC2AA,0xC2AB,0xC2AE,0xC2AF,0xC2B0,/* 0xB0-0xB7 */
+	0xC2B1,0xC2B2,0xC2B3,0xC2B6,0xC2B8,0xC2BA,0xC2BB,0xC2BC,/* 0xB8-0xBF */
+	0xC2BD,0xC2BE,0xC2BF,0xC2C0,0xC2C1,0xC2C2,0xC2C3,0xC2C4,/* 0xC0-0xC7 */
+	0xC2C5,0xC2C6,0xC2C7,0xC2C8,0xC2C9,0xC2CA,0xC2CB,0xC2CC,/* 0xC8-0xCF */
+	0xC2CD,0xC2CE,0xC2CF,0xC2D0,0xC2D1,0xC2D2,0xC2D3,0xC2D4,/* 0xD0-0xD7 */
+	0xC2D5,0xC2D6,0xC2D7,0xC2D8,0xC2D9,0xC2DA,0xC2DB,0xC2DE,/* 0xD8-0xDF */
+	0xC2DF,0xC2E1,0xC2E2,0xC2E5,0xC2E6,0xC2E7,0xC2E8,0xC2E9,/* 0xE0-0xE7 */
+	0xC2EA,0xC2EE,0xC2F0,0xC2F2,0xC2F3,0xC2F4,0xC2F5,0xC2F7,/* 0xE8-0xEF */
+	0xC2FA,0xC2FD,0xC2FE,0xC2FF,0xC301,0xC302,0xC303,0xC304,/* 0xF0-0xF7 */
+	0xC305,0xC306,0xC307,0xC30A,0xC30B,0xC30E,0xC30F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9B[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC310,0xC311,0xC312,0xC316,0xC317,0xC319,0xC31A,/* 0x40-0x47 */
+	0xC31B,0xC31D,0xC31E,0xC31F,0xC320,0xC321,0xC322,0xC323,/* 0x48-0x4F */
+	0xC326,0xC327,0xC32A,0xC32B,0xC32C,0xC32D,0xC32E,0xC32F,/* 0x50-0x57 */
+	0xC330,0xC331,0xC332,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC333,0xC334,0xC335,0xC336,0xC337,0xC338,0xC339,/* 0x60-0x67 */
+	0xC33A,0xC33B,0xC33C,0xC33D,0xC33E,0xC33F,0xC340,0xC341,/* 0x68-0x6F */
+	0xC342,0xC343,0xC344,0xC346,0xC347,0xC348,0xC349,0xC34A,/* 0x70-0x77 */
+	0xC34B,0xC34C,0xC34D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC34E,0xC34F,0xC350,0xC351,0xC352,0xC353,0xC354,/* 0x80-0x87 */
+	0xC355,0xC356,0xC357,0xC358,0xC359,0xC35A,0xC35B,0xC35C,/* 0x88-0x8F */
+	0xC35D,0xC35E,0xC35F,0xC360,0xC361,0xC362,0xC363,0xC364,/* 0x90-0x97 */
+	0xC365,0xC366,0xC367,0xC36A,0xC36B,0xC36D,0xC36E,0xC36F,/* 0x98-0x9F */
+	0xC371,0xC373,0xC374,0xC375,0xC376,0xC377,0xC37A,0xC37B,/* 0xA0-0xA7 */
+	0xC37E,0xC37F,0xC380,0xC381,0xC382,0xC383,0xC385,0xC386,/* 0xA8-0xAF */
+	0xC387,0xC389,0xC38A,0xC38B,0xC38D,0xC38E,0xC38F,0xC390,/* 0xB0-0xB7 */
+	0xC391,0xC392,0xC393,0xC394,0xC395,0xC396,0xC397,0xC398,/* 0xB8-0xBF */
+	0xC399,0xC39A,0xC39B,0xC39C,0xC39D,0xC39E,0xC39F,0xC3A0,/* 0xC0-0xC7 */
+	0xC3A1,0xC3A2,0xC3A3,0xC3A4,0xC3A5,0xC3A6,0xC3A7,0xC3A8,/* 0xC8-0xCF */
+	0xC3A9,0xC3AA,0xC3AB,0xC3AC,0xC3AD,0xC3AE,0xC3AF,0xC3B0,/* 0xD0-0xD7 */
+	0xC3B1,0xC3B2,0xC3B3,0xC3B4,0xC3B5,0xC3B6,0xC3B7,0xC3B8,/* 0xD8-0xDF */
+	0xC3B9,0xC3BA,0xC3BB,0xC3BC,0xC3BD,0xC3BE,0xC3BF,0xC3C1,/* 0xE0-0xE7 */
+	0xC3C2,0xC3C3,0xC3C4,0xC3C5,0xC3C6,0xC3C7,0xC3C8,0xC3C9,/* 0xE8-0xEF */
+	0xC3CA,0xC3CB,0xC3CC,0xC3CD,0xC3CE,0xC3CF,0xC3D0,0xC3D1,/* 0xF0-0xF7 */
+	0xC3D2,0xC3D3,0xC3D4,0xC3D5,0xC3D6,0xC3D7,0xC3DA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9C[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC3DB,0xC3DD,0xC3DE,0xC3E1,0xC3E3,0xC3E4,0xC3E5,/* 0x40-0x47 */
+	0xC3E6,0xC3E7,0xC3EA,0xC3EB,0xC3EC,0xC3EE,0xC3EF,0xC3F0,/* 0x48-0x4F */
+	0xC3F1,0xC3F2,0xC3F3,0xC3F6,0xC3F7,0xC3F9,0xC3FA,0xC3FB,/* 0x50-0x57 */
+	0xC3FC,0xC3FD,0xC3FE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC3FF,0xC400,0xC401,0xC402,0xC403,0xC404,0xC405,/* 0x60-0x67 */
+	0xC406,0xC407,0xC409,0xC40A,0xC40B,0xC40C,0xC40D,0xC40E,/* 0x68-0x6F */
+	0xC40F,0xC411,0xC412,0xC413,0xC414,0xC415,0xC416,0xC417,/* 0x70-0x77 */
+	0xC418,0xC419,0xC41A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC41B,0xC41C,0xC41D,0xC41E,0xC41F,0xC420,0xC421,/* 0x80-0x87 */
+	0xC422,0xC423,0xC425,0xC426,0xC427,0xC428,0xC429,0xC42A,/* 0x88-0x8F */
+	0xC42B,0xC42D,0xC42E,0xC42F,0xC431,0xC432,0xC433,0xC435,/* 0x90-0x97 */
+	0xC436,0xC437,0xC438,0xC439,0xC43A,0xC43B,0xC43E,0xC43F,/* 0x98-0x9F */
+	0xC440,0xC441,0xC442,0xC443,0xC444,0xC445,0xC446,0xC447,/* 0xA0-0xA7 */
+	0xC449,0xC44A,0xC44B,0xC44C,0xC44D,0xC44E,0xC44F,0xC450,/* 0xA8-0xAF */
+	0xC451,0xC452,0xC453,0xC454,0xC455,0xC456,0xC457,0xC458,/* 0xB0-0xB7 */
+	0xC459,0xC45A,0xC45B,0xC45C,0xC45D,0xC45E,0xC45F,0xC460,/* 0xB8-0xBF */
+	0xC461,0xC462,0xC463,0xC466,0xC467,0xC469,0xC46A,0xC46B,/* 0xC0-0xC7 */
+	0xC46D,0xC46E,0xC46F,0xC470,0xC471,0xC472,0xC473,0xC476,/* 0xC8-0xCF */
+	0xC477,0xC478,0xC47A,0xC47B,0xC47C,0xC47D,0xC47E,0xC47F,/* 0xD0-0xD7 */
+	0xC481,0xC482,0xC483,0xC484,0xC485,0xC486,0xC487,0xC488,/* 0xD8-0xDF */
+	0xC489,0xC48A,0xC48B,0xC48C,0xC48D,0xC48E,0xC48F,0xC490,/* 0xE0-0xE7 */
+	0xC491,0xC492,0xC493,0xC495,0xC496,0xC497,0xC498,0xC499,/* 0xE8-0xEF */
+	0xC49A,0xC49B,0xC49D,0xC49E,0xC49F,0xC4A0,0xC4A1,0xC4A2,/* 0xF0-0xF7 */
+	0xC4A3,0xC4A4,0xC4A5,0xC4A6,0xC4A7,0xC4A8,0xC4A9,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9D[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC4AA,0xC4AB,0xC4AC,0xC4AD,0xC4AE,0xC4AF,0xC4B0,/* 0x40-0x47 */
+	0xC4B1,0xC4B2,0xC4B3,0xC4B4,0xC4B5,0xC4B6,0xC4B7,0xC4B9,/* 0x48-0x4F */
+	0xC4BA,0xC4BB,0xC4BD,0xC4BE,0xC4BF,0xC4C0,0xC4C1,0xC4C2,/* 0x50-0x57 */
+	0xC4C3,0xC4C4,0xC4C5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC4C6,0xC4C7,0xC4C8,0xC4C9,0xC4CA,0xC4CB,0xC4CC,/* 0x60-0x67 */
+	0xC4CD,0xC4CE,0xC4CF,0xC4D0,0xC4D1,0xC4D2,0xC4D3,0xC4D4,/* 0x68-0x6F */
+	0xC4D5,0xC4D6,0xC4D7,0xC4D8,0xC4D9,0xC4DA,0xC4DB,0xC4DC,/* 0x70-0x77 */
+	0xC4DD,0xC4DE,0xC4DF,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC4E0,0xC4E1,0xC4E2,0xC4E3,0xC4E4,0xC4E5,0xC4E6,/* 0x80-0x87 */
+	0xC4E7,0xC4E8,0xC4EA,0xC4EB,0xC4EC,0xC4ED,0xC4EE,0xC4EF,/* 0x88-0x8F */
+	0xC4F2,0xC4F3,0xC4F5,0xC4F6,0xC4F7,0xC4F9,0xC4FB,0xC4FC,/* 0x90-0x97 */
+	0xC4FD,0xC4FE,0xC502,0xC503,0xC504,0xC505,0xC506,0xC507,/* 0x98-0x9F */
+	0xC508,0xC509,0xC50A,0xC50B,0xC50D,0xC50E,0xC50F,0xC511,/* 0xA0-0xA7 */
+	0xC512,0xC513,0xC515,0xC516,0xC517,0xC518,0xC519,0xC51A,/* 0xA8-0xAF */
+	0xC51B,0xC51D,0xC51E,0xC51F,0xC520,0xC521,0xC522,0xC523,/* 0xB0-0xB7 */
+	0xC524,0xC525,0xC526,0xC527,0xC52A,0xC52B,0xC52D,0xC52E,/* 0xB8-0xBF */
+	0xC52F,0xC531,0xC532,0xC533,0xC534,0xC535,0xC536,0xC537,/* 0xC0-0xC7 */
+	0xC53A,0xC53C,0xC53E,0xC53F,0xC540,0xC541,0xC542,0xC543,/* 0xC8-0xCF */
+	0xC546,0xC547,0xC54B,0xC54F,0xC550,0xC551,0xC552,0xC556,/* 0xD0-0xD7 */
+	0xC55A,0xC55B,0xC55C,0xC55F,0xC562,0xC563,0xC565,0xC566,/* 0xD8-0xDF */
+	0xC567,0xC569,0xC56A,0xC56B,0xC56C,0xC56D,0xC56E,0xC56F,/* 0xE0-0xE7 */
+	0xC572,0xC576,0xC577,0xC578,0xC579,0xC57A,0xC57B,0xC57E,/* 0xE8-0xEF */
+	0xC57F,0xC581,0xC582,0xC583,0xC585,0xC586,0xC588,0xC589,/* 0xF0-0xF7 */
+	0xC58A,0xC58B,0xC58E,0xC590,0xC592,0xC593,0xC594,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9E[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC596,0xC599,0xC59A,0xC59B,0xC59D,0xC59E,0xC59F,/* 0x40-0x47 */
+	0xC5A1,0xC5A2,0xC5A3,0xC5A4,0xC5A5,0xC5A6,0xC5A7,0xC5A8,/* 0x48-0x4F */
+	0xC5AA,0xC5AB,0xC5AC,0xC5AD,0xC5AE,0xC5AF,0xC5B0,0xC5B1,/* 0x50-0x57 */
+	0xC5B2,0xC5B3,0xC5B6,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC5B7,0xC5BA,0xC5BF,0xC5C0,0xC5C1,0xC5C2,0xC5C3,/* 0x60-0x67 */
+	0xC5CB,0xC5CD,0xC5CF,0xC5D2,0xC5D3,0xC5D5,0xC5D6,0xC5D7,/* 0x68-0x6F */
+	0xC5D9,0xC5DA,0xC5DB,0xC5DC,0xC5DD,0xC5DE,0xC5DF,0xC5E2,/* 0x70-0x77 */
+	0xC5E4,0xC5E6,0xC5E7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC5E8,0xC5E9,0xC5EA,0xC5EB,0xC5EF,0xC5F1,0xC5F2,/* 0x80-0x87 */
+	0xC5F3,0xC5F5,0xC5F8,0xC5F9,0xC5FA,0xC5FB,0xC602,0xC603,/* 0x88-0x8F */
+	0xC604,0xC609,0xC60A,0xC60B,0xC60D,0xC60E,0xC60F,0xC611,/* 0x90-0x97 */
+	0xC612,0xC613,0xC614,0xC615,0xC616,0xC617,0xC61A,0xC61D,/* 0x98-0x9F */
+	0xC61E,0xC61F,0xC620,0xC621,0xC622,0xC623,0xC626,0xC627,/* 0xA0-0xA7 */
+	0xC629,0xC62A,0xC62B,0xC62F,0xC631,0xC632,0xC636,0xC638,/* 0xA8-0xAF */
+	0xC63A,0xC63C,0xC63D,0xC63E,0xC63F,0xC642,0xC643,0xC645,/* 0xB0-0xB7 */
+	0xC646,0xC647,0xC649,0xC64A,0xC64B,0xC64C,0xC64D,0xC64E,/* 0xB8-0xBF */
+	0xC64F,0xC652,0xC656,0xC657,0xC658,0xC659,0xC65A,0xC65B,/* 0xC0-0xC7 */
+	0xC65E,0xC65F,0xC661,0xC662,0xC663,0xC664,0xC665,0xC666,/* 0xC8-0xCF */
+	0xC667,0xC668,0xC669,0xC66A,0xC66B,0xC66D,0xC66E,0xC670,/* 0xD0-0xD7 */
+	0xC672,0xC673,0xC674,0xC675,0xC676,0xC677,0xC67A,0xC67B,/* 0xD8-0xDF */
+	0xC67D,0xC67E,0xC67F,0xC681,0xC682,0xC683,0xC684,0xC685,/* 0xE0-0xE7 */
+	0xC686,0xC687,0xC68A,0xC68C,0xC68E,0xC68F,0xC690,0xC691,/* 0xE8-0xEF */
+	0xC692,0xC693,0xC696,0xC697,0xC699,0xC69A,0xC69B,0xC69D,/* 0xF0-0xF7 */
+	0xC69E,0xC69F,0xC6A0,0xC6A1,0xC6A2,0xC6A3,0xC6A6,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_9F[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC6A8,0xC6AA,0xC6AB,0xC6AC,0xC6AD,0xC6AE,0xC6AF,/* 0x40-0x47 */
+	0xC6B2,0xC6B3,0xC6B5,0xC6B6,0xC6B7,0xC6BB,0xC6BC,0xC6BD,/* 0x48-0x4F */
+	0xC6BE,0xC6BF,0xC6C2,0xC6C4,0xC6C6,0xC6C7,0xC6C8,0xC6C9,/* 0x50-0x57 */
+	0xC6CA,0xC6CB,0xC6CE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC6CF,0xC6D1,0xC6D2,0xC6D3,0xC6D5,0xC6D6,0xC6D7,/* 0x60-0x67 */
+	0xC6D8,0xC6D9,0xC6DA,0xC6DB,0xC6DE,0xC6DF,0xC6E2,0xC6E3,/* 0x68-0x6F */
+	0xC6E4,0xC6E5,0xC6E6,0xC6E7,0xC6EA,0xC6EB,0xC6ED,0xC6EE,/* 0x70-0x77 */
+	0xC6EF,0xC6F1,0xC6F2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC6F3,0xC6F4,0xC6F5,0xC6F6,0xC6F7,0xC6FA,0xC6FB,/* 0x80-0x87 */
+	0xC6FC,0xC6FE,0xC6FF,0xC700,0xC701,0xC702,0xC703,0xC706,/* 0x88-0x8F */
+	0xC707,0xC709,0xC70A,0xC70B,0xC70D,0xC70E,0xC70F,0xC710,/* 0x90-0x97 */
+	0xC711,0xC712,0xC713,0xC716,0xC718,0xC71A,0xC71B,0xC71C,/* 0x98-0x9F */
+	0xC71D,0xC71E,0xC71F,0xC722,0xC723,0xC725,0xC726,0xC727,/* 0xA0-0xA7 */
+	0xC729,0xC72A,0xC72B,0xC72C,0xC72D,0xC72E,0xC72F,0xC732,/* 0xA8-0xAF */
+	0xC734,0xC736,0xC738,0xC739,0xC73A,0xC73B,0xC73E,0xC73F,/* 0xB0-0xB7 */
+	0xC741,0xC742,0xC743,0xC745,0xC746,0xC747,0xC748,0xC749,/* 0xB8-0xBF */
+	0xC74B,0xC74E,0xC750,0xC759,0xC75A,0xC75B,0xC75D,0xC75E,/* 0xC0-0xC7 */
+	0xC75F,0xC761,0xC762,0xC763,0xC764,0xC765,0xC766,0xC767,/* 0xC8-0xCF */
+	0xC769,0xC76A,0xC76C,0xC76D,0xC76E,0xC76F,0xC770,0xC771,/* 0xD0-0xD7 */
+	0xC772,0xC773,0xC776,0xC777,0xC779,0xC77A,0xC77B,0xC77F,/* 0xD8-0xDF */
+	0xC780,0xC781,0xC782,0xC786,0xC78B,0xC78C,0xC78D,0xC78F,/* 0xE0-0xE7 */
+	0xC792,0xC793,0xC795,0xC799,0xC79B,0xC79C,0xC79D,0xC79E,/* 0xE8-0xEF */
+	0xC79F,0xC7A2,0xC7A7,0xC7A8,0xC7A9,0xC7AA,0xC7AB,0xC7AE,/* 0xF0-0xF7 */
+	0xC7AF,0xC7B1,0xC7B2,0xC7B3,0xC7B5,0xC7B6,0xC7B7,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC7B8,0xC7B9,0xC7BA,0xC7BB,0xC7BE,0xC7C2,0xC7C3,/* 0x40-0x47 */
+	0xC7C4,0xC7C5,0xC7C6,0xC7C7,0xC7CA,0xC7CB,0xC7CD,0xC7CF,/* 0x48-0x4F */
+	0xC7D1,0xC7D2,0xC7D3,0xC7D4,0xC7D5,0xC7D6,0xC7D7,0xC7D9,/* 0x50-0x57 */
+	0xC7DA,0xC7DB,0xC7DC,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC7DE,0xC7DF,0xC7E0,0xC7E1,0xC7E2,0xC7E3,0xC7E5,/* 0x60-0x67 */
+	0xC7E6,0xC7E7,0xC7E9,0xC7EA,0xC7EB,0xC7ED,0xC7EE,0xC7EF,/* 0x68-0x6F */
+	0xC7F0,0xC7F1,0xC7F2,0xC7F3,0xC7F4,0xC7F5,0xC7F6,0xC7F7,/* 0x70-0x77 */
+	0xC7F8,0xC7F9,0xC7FA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC7FB,0xC7FC,0xC7FD,0xC7FE,0xC7FF,0xC802,0xC803,/* 0x80-0x87 */
+	0xC805,0xC806,0xC807,0xC809,0xC80B,0xC80C,0xC80D,0xC80E,/* 0x88-0x8F */
+	0xC80F,0xC812,0xC814,0xC817,0xC818,0xC819,0xC81A,0xC81B,/* 0x90-0x97 */
+	0xC81E,0xC81F,0xC821,0xC822,0xC823,0xC825,0xC826,0xC827,/* 0x98-0x9F */
+	0xC828,0xC829,0xC82A,0xC82B,0xC82E,0xC830,0xC832,0xC833,/* 0xA0-0xA7 */
+	0xC834,0xC835,0xC836,0xC837,0xC839,0xC83A,0xC83B,0xC83D,/* 0xA8-0xAF */
+	0xC83E,0xC83F,0xC841,0xC842,0xC843,0xC844,0xC845,0xC846,/* 0xB0-0xB7 */
+	0xC847,0xC84A,0xC84B,0xC84E,0xC84F,0xC850,0xC851,0xC852,/* 0xB8-0xBF */
+	0xC853,0xC855,0xC856,0xC857,0xC858,0xC859,0xC85A,0xC85B,/* 0xC0-0xC7 */
+	0xC85C,0xC85D,0xC85E,0xC85F,0xC860,0xC861,0xC862,0xC863,/* 0xC8-0xCF */
+	0xC864,0xC865,0xC866,0xC867,0xC868,0xC869,0xC86A,0xC86B,/* 0xD0-0xD7 */
+	0xC86C,0xC86D,0xC86E,0xC86F,0xC872,0xC873,0xC875,0xC876,/* 0xD8-0xDF */
+	0xC877,0xC879,0xC87B,0xC87C,0xC87D,0xC87E,0xC87F,0xC882,/* 0xE0-0xE7 */
+	0xC884,0xC888,0xC889,0xC88A,0xC88E,0xC88F,0xC890,0xC891,/* 0xE8-0xEF */
+	0xC892,0xC893,0xC895,0xC896,0xC897,0xC898,0xC899,0xC89A,/* 0xF0-0xF7 */
+	0xC89B,0xC89C,0xC89E,0xC8A0,0xC8A2,0xC8A3,0xC8A4,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC8A5,0xC8A6,0xC8A7,0xC8A9,0xC8AA,0xC8AB,0xC8AC,/* 0x40-0x47 */
+	0xC8AD,0xC8AE,0xC8AF,0xC8B0,0xC8B1,0xC8B2,0xC8B3,0xC8B4,/* 0x48-0x4F */
+	0xC8B5,0xC8B6,0xC8B7,0xC8B8,0xC8B9,0xC8BA,0xC8BB,0xC8BE,/* 0x50-0x57 */
+	0xC8BF,0xC8C0,0xC8C1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC8C2,0xC8C3,0xC8C5,0xC8C6,0xC8C7,0xC8C9,0xC8CA,/* 0x60-0x67 */
+	0xC8CB,0xC8CD,0xC8CE,0xC8CF,0xC8D0,0xC8D1,0xC8D2,0xC8D3,/* 0x68-0x6F */
+	0xC8D6,0xC8D8,0xC8DA,0xC8DB,0xC8DC,0xC8DD,0xC8DE,0xC8DF,/* 0x70-0x77 */
+	0xC8E2,0xC8E3,0xC8E5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC8E6,0xC8E7,0xC8E8,0xC8E9,0xC8EA,0xC8EB,0xC8EC,/* 0x80-0x87 */
+	0xC8ED,0xC8EE,0xC8EF,0xC8F0,0xC8F1,0xC8F2,0xC8F3,0xC8F4,/* 0x88-0x8F */
+	0xC8F6,0xC8F7,0xC8F8,0xC8F9,0xC8FA,0xC8FB,0xC8FE,0xC8FF,/* 0x90-0x97 */
+	0xC901,0xC902,0xC903,0xC907,0xC908,0xC909,0xC90A,0xC90B,/* 0x98-0x9F */
+	0xC90E,0x3000,0x3001,0x3002,0x00B7,0x2025,0x2026,0x00A8,/* 0xA0-0xA7 */
+	0x3003,0x00AD,0x2015,0x2225,0xFF3C,0x223C,0x2018,0x2019,/* 0xA8-0xAF */
+	0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */
+	0x300C,0x300D,0x300E,0x300F,0x3010,0x3011,0x00B1,0x00D7,/* 0xB8-0xBF */
+	0x00F7,0x2260,0x2264,0x2265,0x221E,0x2234,0x00B0,0x2032,/* 0xC0-0xC7 */
+	0x2033,0x2103,0x212B,0xFFE0,0xFFE1,0xFFE5,0x2642,0x2640,/* 0xC8-0xCF */
+	0x2220,0x22A5,0x2312,0x2202,0x2207,0x2261,0x2252,0x00A7,/* 0xD0-0xD7 */
+	0x203B,0x2606,0x2605,0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,/* 0xD8-0xDF */
+	0x25A1,0x25A0,0x25B3,0x25B2,0x25BD,0x25BC,0x2192,0x2190,/* 0xE0-0xE7 */
+	0x2191,0x2193,0x2194,0x3013,0x226A,0x226B,0x221A,0x223D,/* 0xE8-0xEF */
+	0x221D,0x2235,0x222B,0x222C,0x2208,0x220B,0x2286,0x2287,/* 0xF0-0xF7 */
+	0x2282,0x2283,0x222A,0x2229,0x2227,0x2228,0xFFE2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC910,0xC912,0xC913,0xC914,0xC915,0xC916,0xC917,/* 0x40-0x47 */
+	0xC919,0xC91A,0xC91B,0xC91C,0xC91D,0xC91E,0xC91F,0xC920,/* 0x48-0x4F */
+	0xC921,0xC922,0xC923,0xC924,0xC925,0xC926,0xC927,0xC928,/* 0x50-0x57 */
+	0xC929,0xC92A,0xC92B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC92D,0xC92E,0xC92F,0xC930,0xC931,0xC932,0xC933,/* 0x60-0x67 */
+	0xC935,0xC936,0xC937,0xC938,0xC939,0xC93A,0xC93B,0xC93C,/* 0x68-0x6F */
+	0xC93D,0xC93E,0xC93F,0xC940,0xC941,0xC942,0xC943,0xC944,/* 0x70-0x77 */
+	0xC945,0xC946,0xC947,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC948,0xC949,0xC94A,0xC94B,0xC94C,0xC94D,0xC94E,/* 0x80-0x87 */
+	0xC94F,0xC952,0xC953,0xC955,0xC956,0xC957,0xC959,0xC95A,/* 0x88-0x8F */
+	0xC95B,0xC95C,0xC95D,0xC95E,0xC95F,0xC962,0xC964,0xC965,/* 0x90-0x97 */
+	0xC966,0xC967,0xC968,0xC969,0xC96A,0xC96B,0xC96D,0xC96E,/* 0x98-0x9F */
+	0xC96F,0x21D2,0x21D4,0x2200,0x2203,0x00B4,0xFF5E,0x02C7,/* 0xA0-0xA7 */
+	0x02D8,0x02DD,0x02DA,0x02D9,0x00B8,0x02DB,0x00A1,0x00BF,/* 0xA8-0xAF */
+	0x02D0,0x222E,0x2211,0x220F,0x00A4,0x2109,0x2030,0x25C1,/* 0xB0-0xB7 */
+	0x25C0,0x25B7,0x25B6,0x2664,0x2660,0x2661,0x2665,0x2667,/* 0xB8-0xBF */
+	0x2663,0x2299,0x25C8,0x25A3,0x25D0,0x25D1,0x2592,0x25A4,/* 0xC0-0xC7 */
+	0x25A5,0x25A8,0x25A7,0x25A6,0x25A9,0x2668,0x260F,0x260E,/* 0xC8-0xCF */
+	0x261C,0x261E,0x00B6,0x2020,0x2021,0x2195,0x2197,0x2199,/* 0xD0-0xD7 */
+	0x2196,0x2198,0x266D,0x2669,0x266A,0x266C,0x327F,0x321C,/* 0xD8-0xDF */
+	0x2116,0x33C7,0x2122,0x33C2,0x33D8,0x2121,0x20AC,0x00AE,/* 0xE0-0xE7 */
+};
+
+static const wchar_t c2u_A3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC971,0xC972,0xC973,0xC975,0xC976,0xC977,0xC978,/* 0x40-0x47 */
+	0xC979,0xC97A,0xC97B,0xC97D,0xC97E,0xC97F,0xC980,0xC981,/* 0x48-0x4F */
+	0xC982,0xC983,0xC984,0xC985,0xC986,0xC987,0xC98A,0xC98B,/* 0x50-0x57 */
+	0xC98D,0xC98E,0xC98F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xC991,0xC992,0xC993,0xC994,0xC995,0xC996,0xC997,/* 0x60-0x67 */
+	0xC99A,0xC99C,0xC99E,0xC99F,0xC9A0,0xC9A1,0xC9A2,0xC9A3,/* 0x68-0x6F */
+	0xC9A4,0xC9A5,0xC9A6,0xC9A7,0xC9A8,0xC9A9,0xC9AA,0xC9AB,/* 0x70-0x77 */
+	0xC9AC,0xC9AD,0xC9AE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xC9AF,0xC9B0,0xC9B1,0xC9B2,0xC9B3,0xC9B4,0xC9B5,/* 0x80-0x87 */
+	0xC9B6,0xC9B7,0xC9B8,0xC9B9,0xC9BA,0xC9BB,0xC9BC,0xC9BD,/* 0x88-0x8F */
+	0xC9BE,0xC9BF,0xC9C2,0xC9C3,0xC9C5,0xC9C6,0xC9C9,0xC9CB,/* 0x90-0x97 */
+	0xC9CC,0xC9CD,0xC9CE,0xC9CF,0xC9D2,0xC9D4,0xC9D7,0xC9D8,/* 0x98-0x9F */
+	0xC9DB,0xFF01,0xFF02,0xFF03,0xFF04,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */
+	0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */
+	0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */
+	0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */
+	0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */
+	0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */
+	0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */
+	0xFF38,0xFF39,0xFF3A,0xFF3B,0xFFE6,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */
+	0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */
+	0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */
+	0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */
+	0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xC9DE,0xC9DF,0xC9E1,0xC9E3,0xC9E5,0xC9E6,0xC9E8,/* 0x40-0x47 */
+	0xC9E9,0xC9EA,0xC9EB,0xC9EE,0xC9F2,0xC9F3,0xC9F4,0xC9F5,/* 0x48-0x4F */
+	0xC9F6,0xC9F7,0xC9FA,0xC9FB,0xC9FD,0xC9FE,0xC9FF,0xCA01,/* 0x50-0x57 */
+	0xCA02,0xCA03,0xCA04,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCA05,0xCA06,0xCA07,0xCA0A,0xCA0E,0xCA0F,0xCA10,/* 0x60-0x67 */
+	0xCA11,0xCA12,0xCA13,0xCA15,0xCA16,0xCA17,0xCA19,0xCA1A,/* 0x68-0x6F */
+	0xCA1B,0xCA1C,0xCA1D,0xCA1E,0xCA1F,0xCA20,0xCA21,0xCA22,/* 0x70-0x77 */
+	0xCA23,0xCA24,0xCA25,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCA26,0xCA27,0xCA28,0xCA2A,0xCA2B,0xCA2C,0xCA2D,/* 0x80-0x87 */
+	0xCA2E,0xCA2F,0xCA30,0xCA31,0xCA32,0xCA33,0xCA34,0xCA35,/* 0x88-0x8F */
+	0xCA36,0xCA37,0xCA38,0xCA39,0xCA3A,0xCA3B,0xCA3C,0xCA3D,/* 0x90-0x97 */
+	0xCA3E,0xCA3F,0xCA40,0xCA41,0xCA42,0xCA43,0xCA44,0xCA45,/* 0x98-0x9F */
+	0xCA46,0x3131,0x3132,0x3133,0x3134,0x3135,0x3136,0x3137,/* 0xA0-0xA7 */
+	0x3138,0x3139,0x313A,0x313B,0x313C,0x313D,0x313E,0x313F,/* 0xA8-0xAF */
+	0x3140,0x3141,0x3142,0x3143,0x3144,0x3145,0x3146,0x3147,/* 0xB0-0xB7 */
+	0x3148,0x3149,0x314A,0x314B,0x314C,0x314D,0x314E,0x314F,/* 0xB8-0xBF */
+	0x3150,0x3151,0x3152,0x3153,0x3154,0x3155,0x3156,0x3157,/* 0xC0-0xC7 */
+	0x3158,0x3159,0x315A,0x315B,0x315C,0x315D,0x315E,0x315F,/* 0xC8-0xCF */
+	0x3160,0x3161,0x3162,0x3163,0x3164,0x3165,0x3166,0x3167,/* 0xD0-0xD7 */
+	0x3168,0x3169,0x316A,0x316B,0x316C,0x316D,0x316E,0x316F,/* 0xD8-0xDF */
+	0x3170,0x3171,0x3172,0x3173,0x3174,0x3175,0x3176,0x3177,/* 0xE0-0xE7 */
+	0x3178,0x3179,0x317A,0x317B,0x317C,0x317D,0x317E,0x317F,/* 0xE8-0xEF */
+	0x3180,0x3181,0x3182,0x3183,0x3184,0x3185,0x3186,0x3187,/* 0xF0-0xF7 */
+	0x3188,0x3189,0x318A,0x318B,0x318C,0x318D,0x318E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCA47,0xCA48,0xCA49,0xCA4A,0xCA4B,0xCA4E,0xCA4F,/* 0x40-0x47 */
+	0xCA51,0xCA52,0xCA53,0xCA55,0xCA56,0xCA57,0xCA58,0xCA59,/* 0x48-0x4F */
+	0xCA5A,0xCA5B,0xCA5E,0xCA62,0xCA63,0xCA64,0xCA65,0xCA66,/* 0x50-0x57 */
+	0xCA67,0xCA69,0xCA6A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCA6B,0xCA6C,0xCA6D,0xCA6E,0xCA6F,0xCA70,0xCA71,/* 0x60-0x67 */
+	0xCA72,0xCA73,0xCA74,0xCA75,0xCA76,0xCA77,0xCA78,0xCA79,/* 0x68-0x6F */
+	0xCA7A,0xCA7B,0xCA7C,0xCA7E,0xCA7F,0xCA80,0xCA81,0xCA82,/* 0x70-0x77 */
+	0xCA83,0xCA85,0xCA86,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCA87,0xCA88,0xCA89,0xCA8A,0xCA8B,0xCA8C,0xCA8D,/* 0x80-0x87 */
+	0xCA8E,0xCA8F,0xCA90,0xCA91,0xCA92,0xCA93,0xCA94,0xCA95,/* 0x88-0x8F */
+	0xCA96,0xCA97,0xCA99,0xCA9A,0xCA9B,0xCA9C,0xCA9D,0xCA9E,/* 0x90-0x97 */
+	0xCA9F,0xCAA0,0xCAA1,0xCAA2,0xCAA3,0xCAA4,0xCAA5,0xCAA6,/* 0x98-0x9F */
+	0xCAA7,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */
+	0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */
+	0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,0x2167,/* 0xB0-0xB7 */
+	0x2168,0x2169,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */
+	0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xC0-0xC7 */
+	0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xC8-0xCF */
+	0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xD0-0xD7 */
+	0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */
+	0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xE0-0xE7 */
+	0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xE8-0xEF */
+	0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xF0-0xF7 */
+	0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCAA8,0xCAA9,0xCAAA,0xCAAB,0xCAAC,0xCAAD,0xCAAE,/* 0x40-0x47 */
+	0xCAAF,0xCAB0,0xCAB1,0xCAB2,0xCAB3,0xCAB4,0xCAB5,0xCAB6,/* 0x48-0x4F */
+	0xCAB7,0xCAB8,0xCAB9,0xCABA,0xCABB,0xCABE,0xCABF,0xCAC1,/* 0x50-0x57 */
+	0xCAC2,0xCAC3,0xCAC5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCAC6,0xCAC7,0xCAC8,0xCAC9,0xCACA,0xCACB,0xCACE,/* 0x60-0x67 */
+	0xCAD0,0xCAD2,0xCAD4,0xCAD5,0xCAD6,0xCAD7,0xCADA,0xCADB,/* 0x68-0x6F */
+	0xCADC,0xCADD,0xCADE,0xCADF,0xCAE1,0xCAE2,0xCAE3,0xCAE4,/* 0x70-0x77 */
+	0xCAE5,0xCAE6,0xCAE7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCAE8,0xCAE9,0xCAEA,0xCAEB,0xCAED,0xCAEE,0xCAEF,/* 0x80-0x87 */
+	0xCAF0,0xCAF1,0xCAF2,0xCAF3,0xCAF5,0xCAF6,0xCAF7,0xCAF8,/* 0x88-0x8F */
+	0xCAF9,0xCAFA,0xCAFB,0xCAFC,0xCAFD,0xCAFE,0xCAFF,0xCB00,/* 0x90-0x97 */
+	0xCB01,0xCB02,0xCB03,0xCB04,0xCB05,0xCB06,0xCB07,0xCB09,/* 0x98-0x9F */
+	0xCB0A,0x2500,0x2502,0x250C,0x2510,0x2518,0x2514,0x251C,/* 0xA0-0xA7 */
+	0x252C,0x2524,0x2534,0x253C,0x2501,0x2503,0x250F,0x2513,/* 0xA8-0xAF */
+	0x251B,0x2517,0x2523,0x2533,0x252B,0x253B,0x254B,0x2520,/* 0xB0-0xB7 */
+	0x252F,0x2528,0x2537,0x253F,0x251D,0x2530,0x2525,0x2538,/* 0xB8-0xBF */
+	0x2542,0x2512,0x2511,0x251A,0x2519,0x2516,0x2515,0x250E,/* 0xC0-0xC7 */
+	0x250D,0x251E,0x251F,0x2521,0x2522,0x2526,0x2527,0x2529,/* 0xC8-0xCF */
+	0x252A,0x252D,0x252E,0x2531,0x2532,0x2535,0x2536,0x2539,/* 0xD0-0xD7 */
+	0x253A,0x253D,0x253E,0x2540,0x2541,0x2543,0x2544,0x2545,/* 0xD8-0xDF */
+	0x2546,0x2547,0x2548,0x2549,0x254A,0x0000,0x0000,0x0000,/* 0xE0-0xE7 */
+};
+
+static const wchar_t c2u_A7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCB0B,0xCB0C,0xCB0D,0xCB0E,0xCB0F,0xCB11,0xCB12,/* 0x40-0x47 */
+	0xCB13,0xCB15,0xCB16,0xCB17,0xCB19,0xCB1A,0xCB1B,0xCB1C,/* 0x48-0x4F */
+	0xCB1D,0xCB1E,0xCB1F,0xCB22,0xCB23,0xCB24,0xCB25,0xCB26,/* 0x50-0x57 */
+	0xCB27,0xCB28,0xCB29,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCB2A,0xCB2B,0xCB2C,0xCB2D,0xCB2E,0xCB2F,0xCB30,/* 0x60-0x67 */
+	0xCB31,0xCB32,0xCB33,0xCB34,0xCB35,0xCB36,0xCB37,0xCB38,/* 0x68-0x6F */
+	0xCB39,0xCB3A,0xCB3B,0xCB3C,0xCB3D,0xCB3E,0xCB3F,0xCB40,/* 0x70-0x77 */
+	0xCB42,0xCB43,0xCB44,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCB45,0xCB46,0xCB47,0xCB4A,0xCB4B,0xCB4D,0xCB4E,/* 0x80-0x87 */
+	0xCB4F,0xCB51,0xCB52,0xCB53,0xCB54,0xCB55,0xCB56,0xCB57,/* 0x88-0x8F */
+	0xCB5A,0xCB5B,0xCB5C,0xCB5E,0xCB5F,0xCB60,0xCB61,0xCB62,/* 0x90-0x97 */
+	0xCB63,0xCB65,0xCB66,0xCB67,0xCB68,0xCB69,0xCB6A,0xCB6B,/* 0x98-0x9F */
+	0xCB6C,0x3395,0x3396,0x3397,0x2113,0x3398,0x33C4,0x33A3,/* 0xA0-0xA7 */
+	0x33A4,0x33A5,0x33A6,0x3399,0x339A,0x339B,0x339C,0x339D,/* 0xA8-0xAF */
+	0x339E,0x339F,0x33A0,0x33A1,0x33A2,0x33CA,0x338D,0x338E,/* 0xB0-0xB7 */
+	0x338F,0x33CF,0x3388,0x3389,0x33C8,0x33A7,0x33A8,0x33B0,/* 0xB8-0xBF */
+	0x33B1,0x33B2,0x33B3,0x33B4,0x33B5,0x33B6,0x33B7,0x33B8,/* 0xC0-0xC7 */
+	0x33B9,0x3380,0x3381,0x3382,0x3383,0x3384,0x33BA,0x33BB,/* 0xC8-0xCF */
+	0x33BC,0x33BD,0x33BE,0x33BF,0x3390,0x3391,0x3392,0x3393,/* 0xD0-0xD7 */
+	0x3394,0x2126,0x33C0,0x33C1,0x338A,0x338B,0x338C,0x33D6,/* 0xD8-0xDF */
+	0x33C5,0x33AD,0x33AE,0x33AF,0x33DB,0x33A9,0x33AA,0x33AB,/* 0xE0-0xE7 */
+	0x33AC,0x33DD,0x33D0,0x33D3,0x33C3,0x33C9,0x33DC,0x33C6,/* 0xE8-0xEF */
+};
+
+static const wchar_t c2u_A8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCB6D,0xCB6E,0xCB6F,0xCB70,0xCB71,0xCB72,0xCB73,/* 0x40-0x47 */
+	0xCB74,0xCB75,0xCB76,0xCB77,0xCB7A,0xCB7B,0xCB7C,0xCB7D,/* 0x48-0x4F */
+	0xCB7E,0xCB7F,0xCB80,0xCB81,0xCB82,0xCB83,0xCB84,0xCB85,/* 0x50-0x57 */
+	0xCB86,0xCB87,0xCB88,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCB89,0xCB8A,0xCB8B,0xCB8C,0xCB8D,0xCB8E,0xCB8F,/* 0x60-0x67 */
+	0xCB90,0xCB91,0xCB92,0xCB93,0xCB94,0xCB95,0xCB96,0xCB97,/* 0x68-0x6F */
+	0xCB98,0xCB99,0xCB9A,0xCB9B,0xCB9D,0xCB9E,0xCB9F,0xCBA0,/* 0x70-0x77 */
+	0xCBA1,0xCBA2,0xCBA3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCBA4,0xCBA5,0xCBA6,0xCBA7,0xCBA8,0xCBA9,0xCBAA,/* 0x80-0x87 */
+	0xCBAB,0xCBAC,0xCBAD,0xCBAE,0xCBAF,0xCBB0,0xCBB1,0xCBB2,/* 0x88-0x8F */
+	0xCBB3,0xCBB4,0xCBB5,0xCBB6,0xCBB7,0xCBB9,0xCBBA,0xCBBB,/* 0x90-0x97 */
+	0xCBBC,0xCBBD,0xCBBE,0xCBBF,0xCBC0,0xCBC1,0xCBC2,0xCBC3,/* 0x98-0x9F */
+	0xCBC4,0x00C6,0x00D0,0x00AA,0x0126,0x0000,0x0132,0x0000,/* 0xA0-0xA7 */
+	0x013F,0x0141,0x00D8,0x0152,0x00BA,0x00DE,0x0166,0x014A,/* 0xA8-0xAF */
+	0x0000,0x3260,0x3261,0x3262,0x3263,0x3264,0x3265,0x3266,/* 0xB0-0xB7 */
+	0x3267,0x3268,0x3269,0x326A,0x326B,0x326C,0x326D,0x326E,/* 0xB8-0xBF */
+	0x326F,0x3270,0x3271,0x3272,0x3273,0x3274,0x3275,0x3276,/* 0xC0-0xC7 */
+	0x3277,0x3278,0x3279,0x327A,0x327B,0x24D0,0x24D1,0x24D2,/* 0xC8-0xCF */
+	0x24D3,0x24D4,0x24D5,0x24D6,0x24D7,0x24D8,0x24D9,0x24DA,/* 0xD0-0xD7 */
+	0x24DB,0x24DC,0x24DD,0x24DE,0x24DF,0x24E0,0x24E1,0x24E2,/* 0xD8-0xDF */
+	0x24E3,0x24E4,0x24E5,0x24E6,0x24E7,0x24E8,0x24E9,0x2460,/* 0xE0-0xE7 */
+	0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,0x2467,0x2468,/* 0xE8-0xEF */
+	0x2469,0x246A,0x246B,0x246C,0x246D,0x246E,0x00BD,0x2153,/* 0xF0-0xF7 */
+	0x2154,0x00BC,0x00BE,0x215B,0x215C,0x215D,0x215E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCBC5,0xCBC6,0xCBC7,0xCBC8,0xCBC9,0xCBCA,0xCBCB,/* 0x40-0x47 */
+	0xCBCC,0xCBCD,0xCBCE,0xCBCF,0xCBD0,0xCBD1,0xCBD2,0xCBD3,/* 0x48-0x4F */
+	0xCBD5,0xCBD6,0xCBD7,0xCBD8,0xCBD9,0xCBDA,0xCBDB,0xCBDC,/* 0x50-0x57 */
+	0xCBDD,0xCBDE,0xCBDF,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCBE0,0xCBE1,0xCBE2,0xCBE3,0xCBE5,0xCBE6,0xCBE8,/* 0x60-0x67 */
+	0xCBEA,0xCBEB,0xCBEC,0xCBED,0xCBEE,0xCBEF,0xCBF0,0xCBF1,/* 0x68-0x6F */
+	0xCBF2,0xCBF3,0xCBF4,0xCBF5,0xCBF6,0xCBF7,0xCBF8,0xCBF9,/* 0x70-0x77 */
+	0xCBFA,0xCBFB,0xCBFC,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCBFD,0xCBFE,0xCBFF,0xCC00,0xCC01,0xCC02,0xCC03,/* 0x80-0x87 */
+	0xCC04,0xCC05,0xCC06,0xCC07,0xCC08,0xCC09,0xCC0A,0xCC0B,/* 0x88-0x8F */
+	0xCC0E,0xCC0F,0xCC11,0xCC12,0xCC13,0xCC15,0xCC16,0xCC17,/* 0x90-0x97 */
+	0xCC18,0xCC19,0xCC1A,0xCC1B,0xCC1E,0xCC1F,0xCC20,0xCC23,/* 0x98-0x9F */
+	0xCC24,0x00E6,0x0111,0x00F0,0x0127,0x0131,0x0133,0x0138,/* 0xA0-0xA7 */
+	0x0140,0x0142,0x00F8,0x0153,0x00DF,0x00FE,0x0167,0x014B,/* 0xA8-0xAF */
+	0x0149,0x3200,0x3201,0x3202,0x3203,0x3204,0x3205,0x3206,/* 0xB0-0xB7 */
+	0x3207,0x3208,0x3209,0x320A,0x320B,0x320C,0x320D,0x320E,/* 0xB8-0xBF */
+	0x320F,0x3210,0x3211,0x3212,0x3213,0x3214,0x3215,0x3216,/* 0xC0-0xC7 */
+	0x3217,0x3218,0x3219,0x321A,0x321B,0x249C,0x249D,0x249E,/* 0xC8-0xCF */
+	0x249F,0x24A0,0x24A1,0x24A2,0x24A3,0x24A4,0x24A5,0x24A6,/* 0xD0-0xD7 */
+	0x24A7,0x24A8,0x24A9,0x24AA,0x24AB,0x24AC,0x24AD,0x24AE,/* 0xD8-0xDF */
+	0x24AF,0x24B0,0x24B1,0x24B2,0x24B3,0x24B4,0x24B5,0x2474,/* 0xE0-0xE7 */
+	0x2475,0x2476,0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,/* 0xE8-0xEF */
+	0x247D,0x247E,0x247F,0x2480,0x2481,0x2482,0x00B9,0x00B2,/* 0xF0-0xF7 */
+	0x00B3,0x2074,0x207F,0x2081,0x2082,0x2083,0x2084,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_AA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCC25,0xCC26,0xCC2A,0xCC2B,0xCC2D,0xCC2F,0xCC31,/* 0x40-0x47 */
+	0xCC32,0xCC33,0xCC34,0xCC35,0xCC36,0xCC37,0xCC3A,0xCC3F,/* 0x48-0x4F */
+	0xCC40,0xCC41,0xCC42,0xCC43,0xCC46,0xCC47,0xCC49,0xCC4A,/* 0x50-0x57 */
+	0xCC4B,0xCC4D,0xCC4E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCC4F,0xCC50,0xCC51,0xCC52,0xCC53,0xCC56,0xCC5A,/* 0x60-0x67 */
+	0xCC5B,0xCC5C,0xCC5D,0xCC5E,0xCC5F,0xCC61,0xCC62,0xCC63,/* 0x68-0x6F */
+	0xCC65,0xCC67,0xCC69,0xCC6A,0xCC6B,0xCC6C,0xCC6D,0xCC6E,/* 0x70-0x77 */
+	0xCC6F,0xCC71,0xCC72,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCC73,0xCC74,0xCC76,0xCC77,0xCC78,0xCC79,0xCC7A,/* 0x80-0x87 */
+	0xCC7B,0xCC7C,0xCC7D,0xCC7E,0xCC7F,0xCC80,0xCC81,0xCC82,/* 0x88-0x8F */
+	0xCC83,0xCC84,0xCC85,0xCC86,0xCC87,0xCC88,0xCC89,0xCC8A,/* 0x90-0x97 */
+	0xCC8B,0xCC8C,0xCC8D,0xCC8E,0xCC8F,0xCC90,0xCC91,0xCC92,/* 0x98-0x9F */
+	0xCC93,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */
+	0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */
+	0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */
+	0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */
+	0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */
+	0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */
+	0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */
+	0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */
+	0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */
+	0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */
+	0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */
+};
+
+static const wchar_t c2u_AB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCC94,0xCC95,0xCC96,0xCC97,0xCC9A,0xCC9B,0xCC9D,/* 0x40-0x47 */
+	0xCC9E,0xCC9F,0xCCA1,0xCCA2,0xCCA3,0xCCA4,0xCCA5,0xCCA6,/* 0x48-0x4F */
+	0xCCA7,0xCCAA,0xCCAE,0xCCAF,0xCCB0,0xCCB1,0xCCB2,0xCCB3,/* 0x50-0x57 */
+	0xCCB6,0xCCB7,0xCCB9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCCBA,0xCCBB,0xCCBD,0xCCBE,0xCCBF,0xCCC0,0xCCC1,/* 0x60-0x67 */
+	0xCCC2,0xCCC3,0xCCC6,0xCCC8,0xCCCA,0xCCCB,0xCCCC,0xCCCD,/* 0x68-0x6F */
+	0xCCCE,0xCCCF,0xCCD1,0xCCD2,0xCCD3,0xCCD5,0xCCD6,0xCCD7,/* 0x70-0x77 */
+	0xCCD8,0xCCD9,0xCCDA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCCDB,0xCCDC,0xCCDD,0xCCDE,0xCCDF,0xCCE0,0xCCE1,/* 0x80-0x87 */
+	0xCCE2,0xCCE3,0xCCE5,0xCCE6,0xCCE7,0xCCE8,0xCCE9,0xCCEA,/* 0x88-0x8F */
+	0xCCEB,0xCCED,0xCCEE,0xCCEF,0xCCF1,0xCCF2,0xCCF3,0xCCF4,/* 0x90-0x97 */
+	0xCCF5,0xCCF6,0xCCF7,0xCCF8,0xCCF9,0xCCFA,0xCCFB,0xCCFC,/* 0x98-0x9F */
+	0xCCFD,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */
+	0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */
+	0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */
+	0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */
+	0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */
+	0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */
+	0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */
+	0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */
+	0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */
+	0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */
+	0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */
+};
+
+static const wchar_t c2u_AC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCCFE,0xCCFF,0xCD00,0xCD02,0xCD03,0xCD04,0xCD05,/* 0x40-0x47 */
+	0xCD06,0xCD07,0xCD0A,0xCD0B,0xCD0D,0xCD0E,0xCD0F,0xCD11,/* 0x48-0x4F */
+	0xCD12,0xCD13,0xCD14,0xCD15,0xCD16,0xCD17,0xCD1A,0xCD1C,/* 0x50-0x57 */
+	0xCD1E,0xCD1F,0xCD20,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCD21,0xCD22,0xCD23,0xCD25,0xCD26,0xCD27,0xCD29,/* 0x60-0x67 */
+	0xCD2A,0xCD2B,0xCD2D,0xCD2E,0xCD2F,0xCD30,0xCD31,0xCD32,/* 0x68-0x6F */
+	0xCD33,0xCD34,0xCD35,0xCD36,0xCD37,0xCD38,0xCD3A,0xCD3B,/* 0x70-0x77 */
+	0xCD3C,0xCD3D,0xCD3E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCD3F,0xCD40,0xCD41,0xCD42,0xCD43,0xCD44,0xCD45,/* 0x80-0x87 */
+	0xCD46,0xCD47,0xCD48,0xCD49,0xCD4A,0xCD4B,0xCD4C,0xCD4D,/* 0x88-0x8F */
+	0xCD4E,0xCD4F,0xCD50,0xCD51,0xCD52,0xCD53,0xCD54,0xCD55,/* 0x90-0x97 */
+	0xCD56,0xCD57,0xCD58,0xCD59,0xCD5A,0xCD5B,0xCD5D,0xCD5E,/* 0x98-0x9F */
+	0xCD5F,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */
+	0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */
+	0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */
+	0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */
+	0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */
+	0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */
+	0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */
+	0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */
+	0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */
+	0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */
+};
+
+static const wchar_t c2u_AD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCD61,0xCD62,0xCD63,0xCD65,0xCD66,0xCD67,0xCD68,/* 0x40-0x47 */
+	0xCD69,0xCD6A,0xCD6B,0xCD6E,0xCD70,0xCD72,0xCD73,0xCD74,/* 0x48-0x4F */
+	0xCD75,0xCD76,0xCD77,0xCD79,0xCD7A,0xCD7B,0xCD7C,0xCD7D,/* 0x50-0x57 */
+	0xCD7E,0xCD7F,0xCD80,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCD81,0xCD82,0xCD83,0xCD84,0xCD85,0xCD86,0xCD87,/* 0x60-0x67 */
+	0xCD89,0xCD8A,0xCD8B,0xCD8C,0xCD8D,0xCD8E,0xCD8F,0xCD90,/* 0x68-0x6F */
+	0xCD91,0xCD92,0xCD93,0xCD96,0xCD97,0xCD99,0xCD9A,0xCD9B,/* 0x70-0x77 */
+	0xCD9D,0xCD9E,0xCD9F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCDA0,0xCDA1,0xCDA2,0xCDA3,0xCDA6,0xCDA8,0xCDAA,/* 0x80-0x87 */
+	0xCDAB,0xCDAC,0xCDAD,0xCDAE,0xCDAF,0xCDB1,0xCDB2,0xCDB3,/* 0x88-0x8F */
+	0xCDB4,0xCDB5,0xCDB6,0xCDB7,0xCDB8,0xCDB9,0xCDBA,0xCDBB,/* 0x90-0x97 */
+	0xCDBC,0xCDBD,0xCDBE,0xCDBF,0xCDC0,0xCDC1,0xCDC2,0xCDC3,/* 0x98-0x9F */
+	0xCDC5,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_AE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCDC6,0xCDC7,0xCDC8,0xCDC9,0xCDCA,0xCDCB,0xCDCD,/* 0x40-0x47 */
+	0xCDCE,0xCDCF,0xCDD1,0xCDD2,0xCDD3,0xCDD4,0xCDD5,0xCDD6,/* 0x48-0x4F */
+	0xCDD7,0xCDD8,0xCDD9,0xCDDA,0xCDDB,0xCDDC,0xCDDD,0xCDDE,/* 0x50-0x57 */
+	0xCDDF,0xCDE0,0xCDE1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCDE2,0xCDE3,0xCDE4,0xCDE5,0xCDE6,0xCDE7,0xCDE9,/* 0x60-0x67 */
+	0xCDEA,0xCDEB,0xCDED,0xCDEE,0xCDEF,0xCDF1,0xCDF2,0xCDF3,/* 0x68-0x6F */
+	0xCDF4,0xCDF5,0xCDF6,0xCDF7,0xCDFA,0xCDFC,0xCDFE,0xCDFF,/* 0x70-0x77 */
+	0xCE00,0xCE01,0xCE02,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCE03,0xCE05,0xCE06,0xCE07,0xCE09,0xCE0A,0xCE0B,/* 0x80-0x87 */
+	0xCE0D,0xCE0E,0xCE0F,0xCE10,0xCE11,0xCE12,0xCE13,0xCE15,/* 0x88-0x8F */
+	0xCE16,0xCE17,0xCE18,0xCE1A,0xCE1B,0xCE1C,0xCE1D,0xCE1E,/* 0x90-0x97 */
+	0xCE1F,0xCE22,0xCE23,0xCE25,0xCE26,0xCE27,0xCE29,0xCE2A,/* 0x98-0x9F */
+	0xCE2B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_AF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCE2C,0xCE2D,0xCE2E,0xCE2F,0xCE32,0xCE34,0xCE36,/* 0x40-0x47 */
+	0xCE37,0xCE38,0xCE39,0xCE3A,0xCE3B,0xCE3C,0xCE3D,0xCE3E,/* 0x48-0x4F */
+	0xCE3F,0xCE40,0xCE41,0xCE42,0xCE43,0xCE44,0xCE45,0xCE46,/* 0x50-0x57 */
+	0xCE47,0xCE48,0xCE49,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCE4A,0xCE4B,0xCE4C,0xCE4D,0xCE4E,0xCE4F,0xCE50,/* 0x60-0x67 */
+	0xCE51,0xCE52,0xCE53,0xCE54,0xCE55,0xCE56,0xCE57,0xCE5A,/* 0x68-0x6F */
+	0xCE5B,0xCE5D,0xCE5E,0xCE62,0xCE63,0xCE64,0xCE65,0xCE66,/* 0x70-0x77 */
+	0xCE67,0xCE6A,0xCE6C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCE6E,0xCE6F,0xCE70,0xCE71,0xCE72,0xCE73,0xCE76,/* 0x80-0x87 */
+	0xCE77,0xCE79,0xCE7A,0xCE7B,0xCE7D,0xCE7E,0xCE7F,0xCE80,/* 0x88-0x8F */
+	0xCE81,0xCE82,0xCE83,0xCE86,0xCE88,0xCE8A,0xCE8B,0xCE8C,/* 0x90-0x97 */
+	0xCE8D,0xCE8E,0xCE8F,0xCE92,0xCE93,0xCE95,0xCE96,0xCE97,/* 0x98-0x9F */
+	0xCE99,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
+};
+
+static const wchar_t c2u_B0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCE9A,0xCE9B,0xCE9C,0xCE9D,0xCE9E,0xCE9F,0xCEA2,/* 0x40-0x47 */
+	0xCEA6,0xCEA7,0xCEA8,0xCEA9,0xCEAA,0xCEAB,0xCEAE,0xCEAF,/* 0x48-0x4F */
+	0xCEB0,0xCEB1,0xCEB2,0xCEB3,0xCEB4,0xCEB5,0xCEB6,0xCEB7,/* 0x50-0x57 */
+	0xCEB8,0xCEB9,0xCEBA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCEBB,0xCEBC,0xCEBD,0xCEBE,0xCEBF,0xCEC0,0xCEC2,/* 0x60-0x67 */
+	0xCEC3,0xCEC4,0xCEC5,0xCEC6,0xCEC7,0xCEC8,0xCEC9,0xCECA,/* 0x68-0x6F */
+	0xCECB,0xCECC,0xCECD,0xCECE,0xCECF,0xCED0,0xCED1,0xCED2,/* 0x70-0x77 */
+	0xCED3,0xCED4,0xCED5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCED6,0xCED7,0xCED8,0xCED9,0xCEDA,0xCEDB,0xCEDC,/* 0x80-0x87 */
+	0xCEDD,0xCEDE,0xCEDF,0xCEE0,0xCEE1,0xCEE2,0xCEE3,0xCEE6,/* 0x88-0x8F */
+	0xCEE7,0xCEE9,0xCEEA,0xCEED,0xCEEE,0xCEEF,0xCEF0,0xCEF1,/* 0x90-0x97 */
+	0xCEF2,0xCEF3,0xCEF6,0xCEFA,0xCEFB,0xCEFC,0xCEFD,0xCEFE,/* 0x98-0x9F */
+	0xCEFF,0xAC00,0xAC01,0xAC04,0xAC07,0xAC08,0xAC09,0xAC0A,/* 0xA0-0xA7 */
+	0xAC10,0xAC11,0xAC12,0xAC13,0xAC14,0xAC15,0xAC16,0xAC17,/* 0xA8-0xAF */
+	0xAC19,0xAC1A,0xAC1B,0xAC1C,0xAC1D,0xAC20,0xAC24,0xAC2C,/* 0xB0-0xB7 */
+	0xAC2D,0xAC2F,0xAC30,0xAC31,0xAC38,0xAC39,0xAC3C,0xAC40,/* 0xB8-0xBF */
+	0xAC4B,0xAC4D,0xAC54,0xAC58,0xAC5C,0xAC70,0xAC71,0xAC74,/* 0xC0-0xC7 */
+	0xAC77,0xAC78,0xAC7A,0xAC80,0xAC81,0xAC83,0xAC84,0xAC85,/* 0xC8-0xCF */
+	0xAC86,0xAC89,0xAC8A,0xAC8B,0xAC8C,0xAC90,0xAC94,0xAC9C,/* 0xD0-0xD7 */
+	0xAC9D,0xAC9F,0xACA0,0xACA1,0xACA8,0xACA9,0xACAA,0xACAC,/* 0xD8-0xDF */
+	0xACAF,0xACB0,0xACB8,0xACB9,0xACBB,0xACBC,0xACBD,0xACC1,/* 0xE0-0xE7 */
+	0xACC4,0xACC8,0xACCC,0xACD5,0xACD7,0xACE0,0xACE1,0xACE4,/* 0xE8-0xEF */
+	0xACE7,0xACE8,0xACEA,0xACEC,0xACEF,0xACF0,0xACF1,0xACF3,/* 0xF0-0xF7 */
+	0xACF5,0xACF6,0xACFC,0xACFD,0xAD00,0xAD04,0xAD06,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCF02,0xCF03,0xCF05,0xCF06,0xCF07,0xCF09,0xCF0A,/* 0x40-0x47 */
+	0xCF0B,0xCF0C,0xCF0D,0xCF0E,0xCF0F,0xCF12,0xCF14,0xCF16,/* 0x48-0x4F */
+	0xCF17,0xCF18,0xCF19,0xCF1A,0xCF1B,0xCF1D,0xCF1E,0xCF1F,/* 0x50-0x57 */
+	0xCF21,0xCF22,0xCF23,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCF25,0xCF26,0xCF27,0xCF28,0xCF29,0xCF2A,0xCF2B,/* 0x60-0x67 */
+	0xCF2E,0xCF32,0xCF33,0xCF34,0xCF35,0xCF36,0xCF37,0xCF39,/* 0x68-0x6F */
+	0xCF3A,0xCF3B,0xCF3C,0xCF3D,0xCF3E,0xCF3F,0xCF40,0xCF41,/* 0x70-0x77 */
+	0xCF42,0xCF43,0xCF44,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCF45,0xCF46,0xCF47,0xCF48,0xCF49,0xCF4A,0xCF4B,/* 0x80-0x87 */
+	0xCF4C,0xCF4D,0xCF4E,0xCF4F,0xCF50,0xCF51,0xCF52,0xCF53,/* 0x88-0x8F */
+	0xCF56,0xCF57,0xCF59,0xCF5A,0xCF5B,0xCF5D,0xCF5E,0xCF5F,/* 0x90-0x97 */
+	0xCF60,0xCF61,0xCF62,0xCF63,0xCF66,0xCF68,0xCF6A,0xCF6B,/* 0x98-0x9F */
+	0xCF6C,0xAD0C,0xAD0D,0xAD0F,0xAD11,0xAD18,0xAD1C,0xAD20,/* 0xA0-0xA7 */
+	0xAD29,0xAD2C,0xAD2D,0xAD34,0xAD35,0xAD38,0xAD3C,0xAD44,/* 0xA8-0xAF */
+	0xAD45,0xAD47,0xAD49,0xAD50,0xAD54,0xAD58,0xAD61,0xAD63,/* 0xB0-0xB7 */
+	0xAD6C,0xAD6D,0xAD70,0xAD73,0xAD74,0xAD75,0xAD76,0xAD7B,/* 0xB8-0xBF */
+	0xAD7C,0xAD7D,0xAD7F,0xAD81,0xAD82,0xAD88,0xAD89,0xAD8C,/* 0xC0-0xC7 */
+	0xAD90,0xAD9C,0xAD9D,0xADA4,0xADB7,0xADC0,0xADC1,0xADC4,/* 0xC8-0xCF */
+	0xADC8,0xADD0,0xADD1,0xADD3,0xADDC,0xADE0,0xADE4,0xADF8,/* 0xD0-0xD7 */
+	0xADF9,0xADFC,0xADFF,0xAE00,0xAE01,0xAE08,0xAE09,0xAE0B,/* 0xD8-0xDF */
+	0xAE0D,0xAE14,0xAE30,0xAE31,0xAE34,0xAE37,0xAE38,0xAE3A,/* 0xE0-0xE7 */
+	0xAE40,0xAE41,0xAE43,0xAE45,0xAE46,0xAE4A,0xAE4C,0xAE4D,/* 0xE8-0xEF */
+	0xAE4E,0xAE50,0xAE54,0xAE56,0xAE5C,0xAE5D,0xAE5F,0xAE60,/* 0xF0-0xF7 */
+	0xAE61,0xAE65,0xAE68,0xAE69,0xAE6C,0xAE70,0xAE78,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCF6D,0xCF6E,0xCF6F,0xCF72,0xCF73,0xCF75,0xCF76,/* 0x40-0x47 */
+	0xCF77,0xCF79,0xCF7A,0xCF7B,0xCF7C,0xCF7D,0xCF7E,0xCF7F,/* 0x48-0x4F */
+	0xCF81,0xCF82,0xCF83,0xCF84,0xCF86,0xCF87,0xCF88,0xCF89,/* 0x50-0x57 */
+	0xCF8A,0xCF8B,0xCF8D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCF8E,0xCF8F,0xCF90,0xCF91,0xCF92,0xCF93,0xCF94,/* 0x60-0x67 */
+	0xCF95,0xCF96,0xCF97,0xCF98,0xCF99,0xCF9A,0xCF9B,0xCF9C,/* 0x68-0x6F */
+	0xCF9D,0xCF9E,0xCF9F,0xCFA0,0xCFA2,0xCFA3,0xCFA4,0xCFA5,/* 0x70-0x77 */
+	0xCFA6,0xCFA7,0xCFA9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xCFAA,0xCFAB,0xCFAC,0xCFAD,0xCFAE,0xCFAF,0xCFB1,/* 0x80-0x87 */
+	0xCFB2,0xCFB3,0xCFB4,0xCFB5,0xCFB6,0xCFB7,0xCFB8,0xCFB9,/* 0x88-0x8F */
+	0xCFBA,0xCFBB,0xCFBC,0xCFBD,0xCFBE,0xCFBF,0xCFC0,0xCFC1,/* 0x90-0x97 */
+	0xCFC2,0xCFC3,0xCFC5,0xCFC6,0xCFC7,0xCFC8,0xCFC9,0xCFCA,/* 0x98-0x9F */
+	0xCFCB,0xAE79,0xAE7B,0xAE7C,0xAE7D,0xAE84,0xAE85,0xAE8C,/* 0xA0-0xA7 */
+	0xAEBC,0xAEBD,0xAEBE,0xAEC0,0xAEC4,0xAECC,0xAECD,0xAECF,/* 0xA8-0xAF */
+	0xAED0,0xAED1,0xAED8,0xAED9,0xAEDC,0xAEE8,0xAEEB,0xAEED,/* 0xB0-0xB7 */
+	0xAEF4,0xAEF8,0xAEFC,0xAF07,0xAF08,0xAF0D,0xAF10,0xAF2C,/* 0xB8-0xBF */
+	0xAF2D,0xAF30,0xAF32,0xAF34,0xAF3C,0xAF3D,0xAF3F,0xAF41,/* 0xC0-0xC7 */
+	0xAF42,0xAF43,0xAF48,0xAF49,0xAF50,0xAF5C,0xAF5D,0xAF64,/* 0xC8-0xCF */
+	0xAF65,0xAF79,0xAF80,0xAF84,0xAF88,0xAF90,0xAF91,0xAF95,/* 0xD0-0xD7 */
+	0xAF9C,0xAFB8,0xAFB9,0xAFBC,0xAFC0,0xAFC7,0xAFC8,0xAFC9,/* 0xD8-0xDF */
+	0xAFCB,0xAFCD,0xAFCE,0xAFD4,0xAFDC,0xAFE8,0xAFE9,0xAFF0,/* 0xE0-0xE7 */
+	0xAFF1,0xAFF4,0xAFF8,0xB000,0xB001,0xB004,0xB00C,0xB010,/* 0xE8-0xEF */
+	0xB014,0xB01C,0xB01D,0xB028,0xB044,0xB045,0xB048,0xB04A,/* 0xF0-0xF7 */
+	0xB04C,0xB04E,0xB053,0xB054,0xB055,0xB057,0xB059,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xCFCC,0xCFCD,0xCFCE,0xCFCF,0xCFD0,0xCFD1,0xCFD2,/* 0x40-0x47 */
+	0xCFD3,0xCFD4,0xCFD5,0xCFD6,0xCFD7,0xCFD8,0xCFD9,0xCFDA,/* 0x48-0x4F */
+	0xCFDB,0xCFDC,0xCFDD,0xCFDE,0xCFDF,0xCFE2,0xCFE3,0xCFE5,/* 0x50-0x57 */
+	0xCFE6,0xCFE7,0xCFE9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xCFEA,0xCFEB,0xCFEC,0xCFED,0xCFEE,0xCFEF,0xCFF2,/* 0x60-0x67 */
+	0xCFF4,0xCFF6,0xCFF7,0xCFF8,0xCFF9,0xCFFA,0xCFFB,0xCFFD,/* 0x68-0x6F */
+	0xCFFE,0xCFFF,0xD001,0xD002,0xD003,0xD005,0xD006,0xD007,/* 0x70-0x77 */
+	0xD008,0xD009,0xD00A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD00B,0xD00C,0xD00D,0xD00E,0xD00F,0xD010,0xD012,/* 0x80-0x87 */
+	0xD013,0xD014,0xD015,0xD016,0xD017,0xD019,0xD01A,0xD01B,/* 0x88-0x8F */
+	0xD01C,0xD01D,0xD01E,0xD01F,0xD020,0xD021,0xD022,0xD023,/* 0x90-0x97 */
+	0xD024,0xD025,0xD026,0xD027,0xD028,0xD029,0xD02A,0xD02B,/* 0x98-0x9F */
+	0xD02C,0xB05D,0xB07C,0xB07D,0xB080,0xB084,0xB08C,0xB08D,/* 0xA0-0xA7 */
+	0xB08F,0xB091,0xB098,0xB099,0xB09A,0xB09C,0xB09F,0xB0A0,/* 0xA8-0xAF */
+	0xB0A1,0xB0A2,0xB0A8,0xB0A9,0xB0AB,0xB0AC,0xB0AD,0xB0AE,/* 0xB0-0xB7 */
+	0xB0AF,0xB0B1,0xB0B3,0xB0B4,0xB0B5,0xB0B8,0xB0BC,0xB0C4,/* 0xB8-0xBF */
+	0xB0C5,0xB0C7,0xB0C8,0xB0C9,0xB0D0,0xB0D1,0xB0D4,0xB0D8,/* 0xC0-0xC7 */
+	0xB0E0,0xB0E5,0xB108,0xB109,0xB10B,0xB10C,0xB110,0xB112,/* 0xC8-0xCF */
+	0xB113,0xB118,0xB119,0xB11B,0xB11C,0xB11D,0xB123,0xB124,/* 0xD0-0xD7 */
+	0xB125,0xB128,0xB12C,0xB134,0xB135,0xB137,0xB138,0xB139,/* 0xD8-0xDF */
+	0xB140,0xB141,0xB144,0xB148,0xB150,0xB151,0xB154,0xB155,/* 0xE0-0xE7 */
+	0xB158,0xB15C,0xB160,0xB178,0xB179,0xB17C,0xB180,0xB182,/* 0xE8-0xEF */
+	0xB188,0xB189,0xB18B,0xB18D,0xB192,0xB193,0xB194,0xB198,/* 0xF0-0xF7 */
+	0xB19C,0xB1A8,0xB1CC,0xB1D0,0xB1D4,0xB1DC,0xB1DD,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD02E,0xD02F,0xD030,0xD031,0xD032,0xD033,0xD036,/* 0x40-0x47 */
+	0xD037,0xD039,0xD03A,0xD03B,0xD03D,0xD03E,0xD03F,0xD040,/* 0x48-0x4F */
+	0xD041,0xD042,0xD043,0xD046,0xD048,0xD04A,0xD04B,0xD04C,/* 0x50-0x57 */
+	0xD04D,0xD04E,0xD04F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD051,0xD052,0xD053,0xD055,0xD056,0xD057,0xD059,/* 0x60-0x67 */
+	0xD05A,0xD05B,0xD05C,0xD05D,0xD05E,0xD05F,0xD061,0xD062,/* 0x68-0x6F */
+	0xD063,0xD064,0xD065,0xD066,0xD067,0xD068,0xD069,0xD06A,/* 0x70-0x77 */
+	0xD06B,0xD06E,0xD06F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD071,0xD072,0xD073,0xD075,0xD076,0xD077,0xD078,/* 0x80-0x87 */
+	0xD079,0xD07A,0xD07B,0xD07E,0xD07F,0xD080,0xD082,0xD083,/* 0x88-0x8F */
+	0xD084,0xD085,0xD086,0xD087,0xD088,0xD089,0xD08A,0xD08B,/* 0x90-0x97 */
+	0xD08C,0xD08D,0xD08E,0xD08F,0xD090,0xD091,0xD092,0xD093,/* 0x98-0x9F */
+	0xD094,0xB1DF,0xB1E8,0xB1E9,0xB1EC,0xB1F0,0xB1F9,0xB1FB,/* 0xA0-0xA7 */
+	0xB1FD,0xB204,0xB205,0xB208,0xB20B,0xB20C,0xB214,0xB215,/* 0xA8-0xAF */
+	0xB217,0xB219,0xB220,0xB234,0xB23C,0xB258,0xB25C,0xB260,/* 0xB0-0xB7 */
+	0xB268,0xB269,0xB274,0xB275,0xB27C,0xB284,0xB285,0xB289,/* 0xB8-0xBF */
+	0xB290,0xB291,0xB294,0xB298,0xB299,0xB29A,0xB2A0,0xB2A1,/* 0xC0-0xC7 */
+	0xB2A3,0xB2A5,0xB2A6,0xB2AA,0xB2AC,0xB2B0,0xB2B4,0xB2C8,/* 0xC8-0xCF */
+	0xB2C9,0xB2CC,0xB2D0,0xB2D2,0xB2D8,0xB2D9,0xB2DB,0xB2DD,/* 0xD0-0xD7 */
+	0xB2E2,0xB2E4,0xB2E5,0xB2E6,0xB2E8,0xB2EB,0xB2EC,0xB2ED,/* 0xD8-0xDF */
+	0xB2EE,0xB2EF,0xB2F3,0xB2F4,0xB2F5,0xB2F7,0xB2F8,0xB2F9,/* 0xE0-0xE7 */
+	0xB2FA,0xB2FB,0xB2FF,0xB300,0xB301,0xB304,0xB308,0xB310,/* 0xE8-0xEF */
+	0xB311,0xB313,0xB314,0xB315,0xB31C,0xB354,0xB355,0xB356,/* 0xF0-0xF7 */
+	0xB358,0xB35B,0xB35C,0xB35E,0xB35F,0xB364,0xB365,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD095,0xD096,0xD097,0xD098,0xD099,0xD09A,0xD09B,/* 0x40-0x47 */
+	0xD09C,0xD09D,0xD09E,0xD09F,0xD0A0,0xD0A1,0xD0A2,0xD0A3,/* 0x48-0x4F */
+	0xD0A6,0xD0A7,0xD0A9,0xD0AA,0xD0AB,0xD0AD,0xD0AE,0xD0AF,/* 0x50-0x57 */
+	0xD0B0,0xD0B1,0xD0B2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD0B3,0xD0B6,0xD0B8,0xD0BA,0xD0BB,0xD0BC,0xD0BD,/* 0x60-0x67 */
+	0xD0BE,0xD0BF,0xD0C2,0xD0C3,0xD0C5,0xD0C6,0xD0C7,0xD0CA,/* 0x68-0x6F */
+	0xD0CB,0xD0CC,0xD0CD,0xD0CE,0xD0CF,0xD0D2,0xD0D6,0xD0D7,/* 0x70-0x77 */
+	0xD0D8,0xD0D9,0xD0DA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD0DB,0xD0DE,0xD0DF,0xD0E1,0xD0E2,0xD0E3,0xD0E5,/* 0x80-0x87 */
+	0xD0E6,0xD0E7,0xD0E8,0xD0E9,0xD0EA,0xD0EB,0xD0EE,0xD0F2,/* 0x88-0x8F */
+	0xD0F3,0xD0F4,0xD0F5,0xD0F6,0xD0F7,0xD0F9,0xD0FA,0xD0FB,/* 0x90-0x97 */
+	0xD0FC,0xD0FD,0xD0FE,0xD0FF,0xD100,0xD101,0xD102,0xD103,/* 0x98-0x9F */
+	0xD104,0xB367,0xB369,0xB36B,0xB36E,0xB370,0xB371,0xB374,/* 0xA0-0xA7 */
+	0xB378,0xB380,0xB381,0xB383,0xB384,0xB385,0xB38C,0xB390,/* 0xA8-0xAF */
+	0xB394,0xB3A0,0xB3A1,0xB3A8,0xB3AC,0xB3C4,0xB3C5,0xB3C8,/* 0xB0-0xB7 */
+	0xB3CB,0xB3CC,0xB3CE,0xB3D0,0xB3D4,0xB3D5,0xB3D7,0xB3D9,/* 0xB8-0xBF */
+	0xB3DB,0xB3DD,0xB3E0,0xB3E4,0xB3E8,0xB3FC,0xB410,0xB418,/* 0xC0-0xC7 */
+	0xB41C,0xB420,0xB428,0xB429,0xB42B,0xB434,0xB450,0xB451,/* 0xC8-0xCF */
+	0xB454,0xB458,0xB460,0xB461,0xB463,0xB465,0xB46C,0xB480,/* 0xD0-0xD7 */
+	0xB488,0xB49D,0xB4A4,0xB4A8,0xB4AC,0xB4B5,0xB4B7,0xB4B9,/* 0xD8-0xDF */
+	0xB4C0,0xB4C4,0xB4C8,0xB4D0,0xB4D5,0xB4DC,0xB4DD,0xB4E0,/* 0xE0-0xE7 */
+	0xB4E3,0xB4E4,0xB4E6,0xB4EC,0xB4ED,0xB4EF,0xB4F1,0xB4F8,/* 0xE8-0xEF */
+	0xB514,0xB515,0xB518,0xB51B,0xB51C,0xB524,0xB525,0xB527,/* 0xF0-0xF7 */
+	0xB528,0xB529,0xB52A,0xB530,0xB531,0xB534,0xB538,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD105,0xD106,0xD107,0xD108,0xD109,0xD10A,0xD10B,/* 0x40-0x47 */
+	0xD10C,0xD10E,0xD10F,0xD110,0xD111,0xD112,0xD113,0xD114,/* 0x48-0x4F */
+	0xD115,0xD116,0xD117,0xD118,0xD119,0xD11A,0xD11B,0xD11C,/* 0x50-0x57 */
+	0xD11D,0xD11E,0xD11F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD120,0xD121,0xD122,0xD123,0xD124,0xD125,0xD126,/* 0x60-0x67 */
+	0xD127,0xD128,0xD129,0xD12A,0xD12B,0xD12C,0xD12D,0xD12E,/* 0x68-0x6F */
+	0xD12F,0xD132,0xD133,0xD135,0xD136,0xD137,0xD139,0xD13B,/* 0x70-0x77 */
+	0xD13C,0xD13D,0xD13E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD13F,0xD142,0xD146,0xD147,0xD148,0xD149,0xD14A,/* 0x80-0x87 */
+	0xD14B,0xD14E,0xD14F,0xD151,0xD152,0xD153,0xD155,0xD156,/* 0x88-0x8F */
+	0xD157,0xD158,0xD159,0xD15A,0xD15B,0xD15E,0xD160,0xD162,/* 0x90-0x97 */
+	0xD163,0xD164,0xD165,0xD166,0xD167,0xD169,0xD16A,0xD16B,/* 0x98-0x9F */
+	0xD16D,0xB540,0xB541,0xB543,0xB544,0xB545,0xB54B,0xB54C,/* 0xA0-0xA7 */
+	0xB54D,0xB550,0xB554,0xB55C,0xB55D,0xB55F,0xB560,0xB561,/* 0xA8-0xAF */
+	0xB5A0,0xB5A1,0xB5A4,0xB5A8,0xB5AA,0xB5AB,0xB5B0,0xB5B1,/* 0xB0-0xB7 */
+	0xB5B3,0xB5B4,0xB5B5,0xB5BB,0xB5BC,0xB5BD,0xB5C0,0xB5C4,/* 0xB8-0xBF */
+	0xB5CC,0xB5CD,0xB5CF,0xB5D0,0xB5D1,0xB5D8,0xB5EC,0xB610,/* 0xC0-0xC7 */
+	0xB611,0xB614,0xB618,0xB625,0xB62C,0xB634,0xB648,0xB664,/* 0xC8-0xCF */
+	0xB668,0xB69C,0xB69D,0xB6A0,0xB6A4,0xB6AB,0xB6AC,0xB6B1,/* 0xD0-0xD7 */
+	0xB6D4,0xB6F0,0xB6F4,0xB6F8,0xB700,0xB701,0xB705,0xB728,/* 0xD8-0xDF */
+	0xB729,0xB72C,0xB72F,0xB730,0xB738,0xB739,0xB73B,0xB744,/* 0xE0-0xE7 */
+	0xB748,0xB74C,0xB754,0xB755,0xB760,0xB764,0xB768,0xB770,/* 0xE8-0xEF */
+	0xB771,0xB773,0xB775,0xB77C,0xB77D,0xB780,0xB784,0xB78C,/* 0xF0-0xF7 */
+	0xB78D,0xB78F,0xB790,0xB791,0xB792,0xB796,0xB797,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD16E,0xD16F,0xD170,0xD171,0xD172,0xD173,0xD174,/* 0x40-0x47 */
+	0xD175,0xD176,0xD177,0xD178,0xD179,0xD17A,0xD17B,0xD17D,/* 0x48-0x4F */
+	0xD17E,0xD17F,0xD180,0xD181,0xD182,0xD183,0xD185,0xD186,/* 0x50-0x57 */
+	0xD187,0xD189,0xD18A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD18B,0xD18C,0xD18D,0xD18E,0xD18F,0xD190,0xD191,/* 0x60-0x67 */
+	0xD192,0xD193,0xD194,0xD195,0xD196,0xD197,0xD198,0xD199,/* 0x68-0x6F */
+	0xD19A,0xD19B,0xD19C,0xD19D,0xD19E,0xD19F,0xD1A2,0xD1A3,/* 0x70-0x77 */
+	0xD1A5,0xD1A6,0xD1A7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD1A9,0xD1AA,0xD1AB,0xD1AC,0xD1AD,0xD1AE,0xD1AF,/* 0x80-0x87 */
+	0xD1B2,0xD1B4,0xD1B6,0xD1B7,0xD1B8,0xD1B9,0xD1BB,0xD1BD,/* 0x88-0x8F */
+	0xD1BE,0xD1BF,0xD1C1,0xD1C2,0xD1C3,0xD1C4,0xD1C5,0xD1C6,/* 0x90-0x97 */
+	0xD1C7,0xD1C8,0xD1C9,0xD1CA,0xD1CB,0xD1CC,0xD1CD,0xD1CE,/* 0x98-0x9F */
+	0xD1CF,0xB798,0xB799,0xB79C,0xB7A0,0xB7A8,0xB7A9,0xB7AB,/* 0xA0-0xA7 */
+	0xB7AC,0xB7AD,0xB7B4,0xB7B5,0xB7B8,0xB7C7,0xB7C9,0xB7EC,/* 0xA8-0xAF */
+	0xB7ED,0xB7F0,0xB7F4,0xB7FC,0xB7FD,0xB7FF,0xB800,0xB801,/* 0xB0-0xB7 */
+	0xB807,0xB808,0xB809,0xB80C,0xB810,0xB818,0xB819,0xB81B,/* 0xB8-0xBF */
+	0xB81D,0xB824,0xB825,0xB828,0xB82C,0xB834,0xB835,0xB837,/* 0xC0-0xC7 */
+	0xB838,0xB839,0xB840,0xB844,0xB851,0xB853,0xB85C,0xB85D,/* 0xC8-0xCF */
+	0xB860,0xB864,0xB86C,0xB86D,0xB86F,0xB871,0xB878,0xB87C,/* 0xD0-0xD7 */
+	0xB88D,0xB8A8,0xB8B0,0xB8B4,0xB8B8,0xB8C0,0xB8C1,0xB8C3,/* 0xD8-0xDF */
+	0xB8C5,0xB8CC,0xB8D0,0xB8D4,0xB8DD,0xB8DF,0xB8E1,0xB8E8,/* 0xE0-0xE7 */
+	0xB8E9,0xB8EC,0xB8F0,0xB8F8,0xB8F9,0xB8FB,0xB8FD,0xB904,/* 0xE8-0xEF */
+	0xB918,0xB920,0xB93C,0xB93D,0xB940,0xB944,0xB94C,0xB94F,/* 0xF0-0xF7 */
+	0xB951,0xB958,0xB959,0xB95C,0xB960,0xB968,0xB969,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD1D0,0xD1D1,0xD1D2,0xD1D3,0xD1D4,0xD1D5,0xD1D6,/* 0x40-0x47 */
+	0xD1D7,0xD1D9,0xD1DA,0xD1DB,0xD1DC,0xD1DD,0xD1DE,0xD1DF,/* 0x48-0x4F */
+	0xD1E0,0xD1E1,0xD1E2,0xD1E3,0xD1E4,0xD1E5,0xD1E6,0xD1E7,/* 0x50-0x57 */
+	0xD1E8,0xD1E9,0xD1EA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD1EB,0xD1EC,0xD1ED,0xD1EE,0xD1EF,0xD1F0,0xD1F1,/* 0x60-0x67 */
+	0xD1F2,0xD1F3,0xD1F5,0xD1F6,0xD1F7,0xD1F9,0xD1FA,0xD1FB,/* 0x68-0x6F */
+	0xD1FC,0xD1FD,0xD1FE,0xD1FF,0xD200,0xD201,0xD202,0xD203,/* 0x70-0x77 */
+	0xD204,0xD205,0xD206,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD208,0xD20A,0xD20B,0xD20C,0xD20D,0xD20E,0xD20F,/* 0x80-0x87 */
+	0xD211,0xD212,0xD213,0xD214,0xD215,0xD216,0xD217,0xD218,/* 0x88-0x8F */
+	0xD219,0xD21A,0xD21B,0xD21C,0xD21D,0xD21E,0xD21F,0xD220,/* 0x90-0x97 */
+	0xD221,0xD222,0xD223,0xD224,0xD225,0xD226,0xD227,0xD228,/* 0x98-0x9F */
+	0xD229,0xB96B,0xB96D,0xB974,0xB975,0xB978,0xB97C,0xB984,/* 0xA0-0xA7 */
+	0xB985,0xB987,0xB989,0xB98A,0xB98D,0xB98E,0xB9AC,0xB9AD,/* 0xA8-0xAF */
+	0xB9B0,0xB9B4,0xB9BC,0xB9BD,0xB9BF,0xB9C1,0xB9C8,0xB9C9,/* 0xB0-0xB7 */
+	0xB9CC,0xB9CE,0xB9CF,0xB9D0,0xB9D1,0xB9D2,0xB9D8,0xB9D9,/* 0xB8-0xBF */
+	0xB9DB,0xB9DD,0xB9DE,0xB9E1,0xB9E3,0xB9E4,0xB9E5,0xB9E8,/* 0xC0-0xC7 */
+	0xB9EC,0xB9F4,0xB9F5,0xB9F7,0xB9F8,0xB9F9,0xB9FA,0xBA00,/* 0xC8-0xCF */
+	0xBA01,0xBA08,0xBA15,0xBA38,0xBA39,0xBA3C,0xBA40,0xBA42,/* 0xD0-0xD7 */
+	0xBA48,0xBA49,0xBA4B,0xBA4D,0xBA4E,0xBA53,0xBA54,0xBA55,/* 0xD8-0xDF */
+	0xBA58,0xBA5C,0xBA64,0xBA65,0xBA67,0xBA68,0xBA69,0xBA70,/* 0xE0-0xE7 */
+	0xBA71,0xBA74,0xBA78,0xBA83,0xBA84,0xBA85,0xBA87,0xBA8C,/* 0xE8-0xEF */
+	0xBAA8,0xBAA9,0xBAAB,0xBAAC,0xBAB0,0xBAB2,0xBAB8,0xBAB9,/* 0xF0-0xF7 */
+	0xBABB,0xBABD,0xBAC4,0xBAC8,0xBAD8,0xBAD9,0xBAFC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD22A,0xD22B,0xD22E,0xD22F,0xD231,0xD232,0xD233,/* 0x40-0x47 */
+	0xD235,0xD236,0xD237,0xD238,0xD239,0xD23A,0xD23B,0xD23E,/* 0x48-0x4F */
+	0xD240,0xD242,0xD243,0xD244,0xD245,0xD246,0xD247,0xD249,/* 0x50-0x57 */
+	0xD24A,0xD24B,0xD24C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD24D,0xD24E,0xD24F,0xD250,0xD251,0xD252,0xD253,/* 0x60-0x67 */
+	0xD254,0xD255,0xD256,0xD257,0xD258,0xD259,0xD25A,0xD25B,/* 0x68-0x6F */
+	0xD25D,0xD25E,0xD25F,0xD260,0xD261,0xD262,0xD263,0xD265,/* 0x70-0x77 */
+	0xD266,0xD267,0xD268,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD269,0xD26A,0xD26B,0xD26C,0xD26D,0xD26E,0xD26F,/* 0x80-0x87 */
+	0xD270,0xD271,0xD272,0xD273,0xD274,0xD275,0xD276,0xD277,/* 0x88-0x8F */
+	0xD278,0xD279,0xD27A,0xD27B,0xD27C,0xD27D,0xD27E,0xD27F,/* 0x90-0x97 */
+	0xD282,0xD283,0xD285,0xD286,0xD287,0xD289,0xD28A,0xD28B,/* 0x98-0x9F */
+	0xD28C,0xBB00,0xBB04,0xBB0D,0xBB0F,0xBB11,0xBB18,0xBB1C,/* 0xA0-0xA7 */
+	0xBB20,0xBB29,0xBB2B,0xBB34,0xBB35,0xBB36,0xBB38,0xBB3B,/* 0xA8-0xAF */
+	0xBB3C,0xBB3D,0xBB3E,0xBB44,0xBB45,0xBB47,0xBB49,0xBB4D,/* 0xB0-0xB7 */
+	0xBB4F,0xBB50,0xBB54,0xBB58,0xBB61,0xBB63,0xBB6C,0xBB88,/* 0xB8-0xBF */
+	0xBB8C,0xBB90,0xBBA4,0xBBA8,0xBBAC,0xBBB4,0xBBB7,0xBBC0,/* 0xC0-0xC7 */
+	0xBBC4,0xBBC8,0xBBD0,0xBBD3,0xBBF8,0xBBF9,0xBBFC,0xBBFF,/* 0xC8-0xCF */
+	0xBC00,0xBC02,0xBC08,0xBC09,0xBC0B,0xBC0C,0xBC0D,0xBC0F,/* 0xD0-0xD7 */
+	0xBC11,0xBC14,0xBC15,0xBC16,0xBC17,0xBC18,0xBC1B,0xBC1C,/* 0xD8-0xDF */
+	0xBC1D,0xBC1E,0xBC1F,0xBC24,0xBC25,0xBC27,0xBC29,0xBC2D,/* 0xE0-0xE7 */
+	0xBC30,0xBC31,0xBC34,0xBC38,0xBC40,0xBC41,0xBC43,0xBC44,/* 0xE8-0xEF */
+	0xBC45,0xBC49,0xBC4C,0xBC4D,0xBC50,0xBC5D,0xBC84,0xBC85,/* 0xF0-0xF7 */
+	0xBC88,0xBC8B,0xBC8C,0xBC8E,0xBC94,0xBC95,0xBC97,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD28D,0xD28E,0xD28F,0xD292,0xD293,0xD294,0xD296,/* 0x40-0x47 */
+	0xD297,0xD298,0xD299,0xD29A,0xD29B,0xD29D,0xD29E,0xD29F,/* 0x48-0x4F */
+	0xD2A1,0xD2A2,0xD2A3,0xD2A5,0xD2A6,0xD2A7,0xD2A8,0xD2A9,/* 0x50-0x57 */
+	0xD2AA,0xD2AB,0xD2AD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD2AE,0xD2AF,0xD2B0,0xD2B2,0xD2B3,0xD2B4,0xD2B5,/* 0x60-0x67 */
+	0xD2B6,0xD2B7,0xD2BA,0xD2BB,0xD2BD,0xD2BE,0xD2C1,0xD2C3,/* 0x68-0x6F */
+	0xD2C4,0xD2C5,0xD2C6,0xD2C7,0xD2CA,0xD2CC,0xD2CD,0xD2CE,/* 0x70-0x77 */
+	0xD2CF,0xD2D0,0xD2D1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD2D2,0xD2D3,0xD2D5,0xD2D6,0xD2D7,0xD2D9,0xD2DA,/* 0x80-0x87 */
+	0xD2DB,0xD2DD,0xD2DE,0xD2DF,0xD2E0,0xD2E1,0xD2E2,0xD2E3,/* 0x88-0x8F */
+	0xD2E6,0xD2E7,0xD2E8,0xD2E9,0xD2EA,0xD2EB,0xD2EC,0xD2ED,/* 0x90-0x97 */
+	0xD2EE,0xD2EF,0xD2F2,0xD2F3,0xD2F5,0xD2F6,0xD2F7,0xD2F9,/* 0x98-0x9F */
+	0xD2FA,0xBC99,0xBC9A,0xBCA0,0xBCA1,0xBCA4,0xBCA7,0xBCA8,/* 0xA0-0xA7 */
+	0xBCB0,0xBCB1,0xBCB3,0xBCB4,0xBCB5,0xBCBC,0xBCBD,0xBCC0,/* 0xA8-0xAF */
+	0xBCC4,0xBCCD,0xBCCF,0xBCD0,0xBCD1,0xBCD5,0xBCD8,0xBCDC,/* 0xB0-0xB7 */
+	0xBCF4,0xBCF5,0xBCF6,0xBCF8,0xBCFC,0xBD04,0xBD05,0xBD07,/* 0xB8-0xBF */
+	0xBD09,0xBD10,0xBD14,0xBD24,0xBD2C,0xBD40,0xBD48,0xBD49,/* 0xC0-0xC7 */
+	0xBD4C,0xBD50,0xBD58,0xBD59,0xBD64,0xBD68,0xBD80,0xBD81,/* 0xC8-0xCF */
+	0xBD84,0xBD87,0xBD88,0xBD89,0xBD8A,0xBD90,0xBD91,0xBD93,/* 0xD0-0xD7 */
+	0xBD95,0xBD99,0xBD9A,0xBD9C,0xBDA4,0xBDB0,0xBDB8,0xBDD4,/* 0xD8-0xDF */
+	0xBDD5,0xBDD8,0xBDDC,0xBDE9,0xBDF0,0xBDF4,0xBDF8,0xBE00,/* 0xE0-0xE7 */
+	0xBE03,0xBE05,0xBE0C,0xBE0D,0xBE10,0xBE14,0xBE1C,0xBE1D,/* 0xE8-0xEF */
+	0xBE1F,0xBE44,0xBE45,0xBE48,0xBE4C,0xBE4E,0xBE54,0xBE55,/* 0xF0-0xF7 */
+	0xBE57,0xBE59,0xBE5A,0xBE5B,0xBE60,0xBE61,0xBE64,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD2FB,0xD2FC,0xD2FD,0xD2FE,0xD2FF,0xD302,0xD304,/* 0x40-0x47 */
+	0xD306,0xD307,0xD308,0xD309,0xD30A,0xD30B,0xD30F,0xD311,/* 0x48-0x4F */
+	0xD312,0xD313,0xD315,0xD317,0xD318,0xD319,0xD31A,0xD31B,/* 0x50-0x57 */
+	0xD31E,0xD322,0xD323,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD324,0xD326,0xD327,0xD32A,0xD32B,0xD32D,0xD32E,/* 0x60-0x67 */
+	0xD32F,0xD331,0xD332,0xD333,0xD334,0xD335,0xD336,0xD337,/* 0x68-0x6F */
+	0xD33A,0xD33E,0xD33F,0xD340,0xD341,0xD342,0xD343,0xD346,/* 0x70-0x77 */
+	0xD347,0xD348,0xD349,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD34A,0xD34B,0xD34C,0xD34D,0xD34E,0xD34F,0xD350,/* 0x80-0x87 */
+	0xD351,0xD352,0xD353,0xD354,0xD355,0xD356,0xD357,0xD358,/* 0x88-0x8F */
+	0xD359,0xD35A,0xD35B,0xD35C,0xD35D,0xD35E,0xD35F,0xD360,/* 0x90-0x97 */
+	0xD361,0xD362,0xD363,0xD364,0xD365,0xD366,0xD367,0xD368,/* 0x98-0x9F */
+	0xD369,0xBE68,0xBE6A,0xBE70,0xBE71,0xBE73,0xBE74,0xBE75,/* 0xA0-0xA7 */
+	0xBE7B,0xBE7C,0xBE7D,0xBE80,0xBE84,0xBE8C,0xBE8D,0xBE8F,/* 0xA8-0xAF */
+	0xBE90,0xBE91,0xBE98,0xBE99,0xBEA8,0xBED0,0xBED1,0xBED4,/* 0xB0-0xB7 */
+	0xBED7,0xBED8,0xBEE0,0xBEE3,0xBEE4,0xBEE5,0xBEEC,0xBF01,/* 0xB8-0xBF */
+	0xBF08,0xBF09,0xBF18,0xBF19,0xBF1B,0xBF1C,0xBF1D,0xBF40,/* 0xC0-0xC7 */
+	0xBF41,0xBF44,0xBF48,0xBF50,0xBF51,0xBF55,0xBF94,0xBFB0,/* 0xC8-0xCF */
+	0xBFC5,0xBFCC,0xBFCD,0xBFD0,0xBFD4,0xBFDC,0xBFDF,0xBFE1,/* 0xD0-0xD7 */
+	0xC03C,0xC051,0xC058,0xC05C,0xC060,0xC068,0xC069,0xC090,/* 0xD8-0xDF */
+	0xC091,0xC094,0xC098,0xC0A0,0xC0A1,0xC0A3,0xC0A5,0xC0AC,/* 0xE0-0xE7 */
+	0xC0AD,0xC0AF,0xC0B0,0xC0B3,0xC0B4,0xC0B5,0xC0B6,0xC0BC,/* 0xE8-0xEF */
+	0xC0BD,0xC0BF,0xC0C0,0xC0C1,0xC0C5,0xC0C8,0xC0C9,0xC0CC,/* 0xF0-0xF7 */
+	0xC0D0,0xC0D8,0xC0D9,0xC0DB,0xC0DC,0xC0DD,0xC0E4,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD36A,0xD36B,0xD36C,0xD36D,0xD36E,0xD36F,0xD370,/* 0x40-0x47 */
+	0xD371,0xD372,0xD373,0xD374,0xD375,0xD376,0xD377,0xD378,/* 0x48-0x4F */
+	0xD379,0xD37A,0xD37B,0xD37E,0xD37F,0xD381,0xD382,0xD383,/* 0x50-0x57 */
+	0xD385,0xD386,0xD387,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD388,0xD389,0xD38A,0xD38B,0xD38E,0xD392,0xD393,/* 0x60-0x67 */
+	0xD394,0xD395,0xD396,0xD397,0xD39A,0xD39B,0xD39D,0xD39E,/* 0x68-0x6F */
+	0xD39F,0xD3A1,0xD3A2,0xD3A3,0xD3A4,0xD3A5,0xD3A6,0xD3A7,/* 0x70-0x77 */
+	0xD3AA,0xD3AC,0xD3AE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD3AF,0xD3B0,0xD3B1,0xD3B2,0xD3B3,0xD3B5,0xD3B6,/* 0x80-0x87 */
+	0xD3B7,0xD3B9,0xD3BA,0xD3BB,0xD3BD,0xD3BE,0xD3BF,0xD3C0,/* 0x88-0x8F */
+	0xD3C1,0xD3C2,0xD3C3,0xD3C6,0xD3C7,0xD3CA,0xD3CB,0xD3CC,/* 0x90-0x97 */
+	0xD3CD,0xD3CE,0xD3CF,0xD3D1,0xD3D2,0xD3D3,0xD3D4,0xD3D5,/* 0x98-0x9F */
+	0xD3D6,0xC0E5,0xC0E8,0xC0EC,0xC0F4,0xC0F5,0xC0F7,0xC0F9,/* 0xA0-0xA7 */
+	0xC100,0xC104,0xC108,0xC110,0xC115,0xC11C,0xC11D,0xC11E,/* 0xA8-0xAF */
+	0xC11F,0xC120,0xC123,0xC124,0xC126,0xC127,0xC12C,0xC12D,/* 0xB0-0xB7 */
+	0xC12F,0xC130,0xC131,0xC136,0xC138,0xC139,0xC13C,0xC140,/* 0xB8-0xBF */
+	0xC148,0xC149,0xC14B,0xC14C,0xC14D,0xC154,0xC155,0xC158,/* 0xC0-0xC7 */
+	0xC15C,0xC164,0xC165,0xC167,0xC168,0xC169,0xC170,0xC174,/* 0xC8-0xCF */
+	0xC178,0xC185,0xC18C,0xC18D,0xC18E,0xC190,0xC194,0xC196,/* 0xD0-0xD7 */
+	0xC19C,0xC19D,0xC19F,0xC1A1,0xC1A5,0xC1A8,0xC1A9,0xC1AC,/* 0xD8-0xDF */
+	0xC1B0,0xC1BD,0xC1C4,0xC1C8,0xC1CC,0xC1D4,0xC1D7,0xC1D8,/* 0xE0-0xE7 */
+	0xC1E0,0xC1E4,0xC1E8,0xC1F0,0xC1F1,0xC1F3,0xC1FC,0xC1FD,/* 0xE8-0xEF */
+	0xC200,0xC204,0xC20C,0xC20D,0xC20F,0xC211,0xC218,0xC219,/* 0xF0-0xF7 */
+	0xC21C,0xC21F,0xC220,0xC228,0xC229,0xC22B,0xC22D,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD3D7,0xD3D9,0xD3DA,0xD3DB,0xD3DC,0xD3DD,0xD3DE,/* 0x40-0x47 */
+	0xD3DF,0xD3E0,0xD3E2,0xD3E4,0xD3E5,0xD3E6,0xD3E7,0xD3E8,/* 0x48-0x4F */
+	0xD3E9,0xD3EA,0xD3EB,0xD3EE,0xD3EF,0xD3F1,0xD3F2,0xD3F3,/* 0x50-0x57 */
+	0xD3F5,0xD3F6,0xD3F7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD3F8,0xD3F9,0xD3FA,0xD3FB,0xD3FE,0xD400,0xD402,/* 0x60-0x67 */
+	0xD403,0xD404,0xD405,0xD406,0xD407,0xD409,0xD40A,0xD40B,/* 0x68-0x6F */
+	0xD40C,0xD40D,0xD40E,0xD40F,0xD410,0xD411,0xD412,0xD413,/* 0x70-0x77 */
+	0xD414,0xD415,0xD416,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD417,0xD418,0xD419,0xD41A,0xD41B,0xD41C,0xD41E,/* 0x80-0x87 */
+	0xD41F,0xD420,0xD421,0xD422,0xD423,0xD424,0xD425,0xD426,/* 0x88-0x8F */
+	0xD427,0xD428,0xD429,0xD42A,0xD42B,0xD42C,0xD42D,0xD42E,/* 0x90-0x97 */
+	0xD42F,0xD430,0xD431,0xD432,0xD433,0xD434,0xD435,0xD436,/* 0x98-0x9F */
+	0xD437,0xC22F,0xC231,0xC232,0xC234,0xC248,0xC250,0xC251,/* 0xA0-0xA7 */
+	0xC254,0xC258,0xC260,0xC265,0xC26C,0xC26D,0xC270,0xC274,/* 0xA8-0xAF */
+	0xC27C,0xC27D,0xC27F,0xC281,0xC288,0xC289,0xC290,0xC298,/* 0xB0-0xB7 */
+	0xC29B,0xC29D,0xC2A4,0xC2A5,0xC2A8,0xC2AC,0xC2AD,0xC2B4,/* 0xB8-0xBF */
+	0xC2B5,0xC2B7,0xC2B9,0xC2DC,0xC2DD,0xC2E0,0xC2E3,0xC2E4,/* 0xC0-0xC7 */
+	0xC2EB,0xC2EC,0xC2ED,0xC2EF,0xC2F1,0xC2F6,0xC2F8,0xC2F9,/* 0xC8-0xCF */
+	0xC2FB,0xC2FC,0xC300,0xC308,0xC309,0xC30C,0xC30D,0xC313,/* 0xD0-0xD7 */
+	0xC314,0xC315,0xC318,0xC31C,0xC324,0xC325,0xC328,0xC329,/* 0xD8-0xDF */
+	0xC345,0xC368,0xC369,0xC36C,0xC370,0xC372,0xC378,0xC379,/* 0xE0-0xE7 */
+	0xC37C,0xC37D,0xC384,0xC388,0xC38C,0xC3C0,0xC3D8,0xC3D9,/* 0xE8-0xEF */
+	0xC3DC,0xC3DF,0xC3E0,0xC3E2,0xC3E8,0xC3E9,0xC3ED,0xC3F4,/* 0xF0-0xF7 */
+	0xC3F5,0xC3F8,0xC408,0xC410,0xC424,0xC42C,0xC430,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD438,0xD439,0xD43A,0xD43B,0xD43C,0xD43D,0xD43E,/* 0x40-0x47 */
+	0xD43F,0xD441,0xD442,0xD443,0xD445,0xD446,0xD447,0xD448,/* 0x48-0x4F */
+	0xD449,0xD44A,0xD44B,0xD44C,0xD44D,0xD44E,0xD44F,0xD450,/* 0x50-0x57 */
+	0xD451,0xD452,0xD453,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD454,0xD455,0xD456,0xD457,0xD458,0xD459,0xD45A,/* 0x60-0x67 */
+	0xD45B,0xD45D,0xD45E,0xD45F,0xD461,0xD462,0xD463,0xD465,/* 0x68-0x6F */
+	0xD466,0xD467,0xD468,0xD469,0xD46A,0xD46B,0xD46C,0xD46E,/* 0x70-0x77 */
+	0xD470,0xD471,0xD472,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD473,0xD474,0xD475,0xD476,0xD477,0xD47A,0xD47B,/* 0x80-0x87 */
+	0xD47D,0xD47E,0xD481,0xD483,0xD484,0xD485,0xD486,0xD487,/* 0x88-0x8F */
+	0xD48A,0xD48C,0xD48E,0xD48F,0xD490,0xD491,0xD492,0xD493,/* 0x90-0x97 */
+	0xD495,0xD496,0xD497,0xD498,0xD499,0xD49A,0xD49B,0xD49C,/* 0x98-0x9F */
+	0xD49D,0xC434,0xC43C,0xC43D,0xC448,0xC464,0xC465,0xC468,/* 0xA0-0xA7 */
+	0xC46C,0xC474,0xC475,0xC479,0xC480,0xC494,0xC49C,0xC4B8,/* 0xA8-0xAF */
+	0xC4BC,0xC4E9,0xC4F0,0xC4F1,0xC4F4,0xC4F8,0xC4FA,0xC4FF,/* 0xB0-0xB7 */
+	0xC500,0xC501,0xC50C,0xC510,0xC514,0xC51C,0xC528,0xC529,/* 0xB8-0xBF */
+	0xC52C,0xC530,0xC538,0xC539,0xC53B,0xC53D,0xC544,0xC545,/* 0xC0-0xC7 */
+	0xC548,0xC549,0xC54A,0xC54C,0xC54D,0xC54E,0xC553,0xC554,/* 0xC8-0xCF */
+	0xC555,0xC557,0xC558,0xC559,0xC55D,0xC55E,0xC560,0xC561,/* 0xD0-0xD7 */
+	0xC564,0xC568,0xC570,0xC571,0xC573,0xC574,0xC575,0xC57C,/* 0xD8-0xDF */
+	0xC57D,0xC580,0xC584,0xC587,0xC58C,0xC58D,0xC58F,0xC591,/* 0xE0-0xE7 */
+	0xC595,0xC597,0xC598,0xC59C,0xC5A0,0xC5A9,0xC5B4,0xC5B5,/* 0xE8-0xEF */
+	0xC5B8,0xC5B9,0xC5BB,0xC5BC,0xC5BD,0xC5BE,0xC5C4,0xC5C5,/* 0xF0-0xF7 */
+	0xC5C6,0xC5C7,0xC5C8,0xC5C9,0xC5CA,0xC5CC,0xC5CE,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD49E,0xD49F,0xD4A0,0xD4A1,0xD4A2,0xD4A3,0xD4A4,/* 0x40-0x47 */
+	0xD4A5,0xD4A6,0xD4A7,0xD4A8,0xD4AA,0xD4AB,0xD4AC,0xD4AD,/* 0x48-0x4F */
+	0xD4AE,0xD4AF,0xD4B0,0xD4B1,0xD4B2,0xD4B3,0xD4B4,0xD4B5,/* 0x50-0x57 */
+	0xD4B6,0xD4B7,0xD4B8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD4B9,0xD4BA,0xD4BB,0xD4BC,0xD4BD,0xD4BE,0xD4BF,/* 0x60-0x67 */
+	0xD4C0,0xD4C1,0xD4C2,0xD4C3,0xD4C4,0xD4C5,0xD4C6,0xD4C7,/* 0x68-0x6F */
+	0xD4C8,0xD4C9,0xD4CA,0xD4CB,0xD4CD,0xD4CE,0xD4CF,0xD4D1,/* 0x70-0x77 */
+	0xD4D2,0xD4D3,0xD4D5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD4D6,0xD4D7,0xD4D8,0xD4D9,0xD4DA,0xD4DB,0xD4DD,/* 0x80-0x87 */
+	0xD4DE,0xD4E0,0xD4E1,0xD4E2,0xD4E3,0xD4E4,0xD4E5,0xD4E6,/* 0x88-0x8F */
+	0xD4E7,0xD4E9,0xD4EA,0xD4EB,0xD4ED,0xD4EE,0xD4EF,0xD4F1,/* 0x90-0x97 */
+	0xD4F2,0xD4F3,0xD4F4,0xD4F5,0xD4F6,0xD4F7,0xD4F9,0xD4FA,/* 0x98-0x9F */
+	0xD4FC,0xC5D0,0xC5D1,0xC5D4,0xC5D8,0xC5E0,0xC5E1,0xC5E3,/* 0xA0-0xA7 */
+	0xC5E5,0xC5EC,0xC5ED,0xC5EE,0xC5F0,0xC5F4,0xC5F6,0xC5F7,/* 0xA8-0xAF */
+	0xC5FC,0xC5FD,0xC5FE,0xC5FF,0xC600,0xC601,0xC605,0xC606,/* 0xB0-0xB7 */
+	0xC607,0xC608,0xC60C,0xC610,0xC618,0xC619,0xC61B,0xC61C,/* 0xB8-0xBF */
+	0xC624,0xC625,0xC628,0xC62C,0xC62D,0xC62E,0xC630,0xC633,/* 0xC0-0xC7 */
+	0xC634,0xC635,0xC637,0xC639,0xC63B,0xC640,0xC641,0xC644,/* 0xC8-0xCF */
+	0xC648,0xC650,0xC651,0xC653,0xC654,0xC655,0xC65C,0xC65D,/* 0xD0-0xD7 */
+	0xC660,0xC66C,0xC66F,0xC671,0xC678,0xC679,0xC67C,0xC680,/* 0xD8-0xDF */
+	0xC688,0xC689,0xC68B,0xC68D,0xC694,0xC695,0xC698,0xC69C,/* 0xE0-0xE7 */
+	0xC6A4,0xC6A5,0xC6A7,0xC6A9,0xC6B0,0xC6B1,0xC6B4,0xC6B8,/* 0xE8-0xEF */
+	0xC6B9,0xC6BA,0xC6C0,0xC6C1,0xC6C3,0xC6C5,0xC6CC,0xC6CD,/* 0xF0-0xF7 */
+	0xC6D0,0xC6D4,0xC6DC,0xC6DD,0xC6E0,0xC6E1,0xC6E8,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD4FE,0xD4FF,0xD500,0xD501,0xD502,0xD503,0xD505,/* 0x40-0x47 */
+	0xD506,0xD507,0xD509,0xD50A,0xD50B,0xD50D,0xD50E,0xD50F,/* 0x48-0x4F */
+	0xD510,0xD511,0xD512,0xD513,0xD516,0xD518,0xD519,0xD51A,/* 0x50-0x57 */
+	0xD51B,0xD51C,0xD51D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD51E,0xD51F,0xD520,0xD521,0xD522,0xD523,0xD524,/* 0x60-0x67 */
+	0xD525,0xD526,0xD527,0xD528,0xD529,0xD52A,0xD52B,0xD52C,/* 0x68-0x6F */
+	0xD52D,0xD52E,0xD52F,0xD530,0xD531,0xD532,0xD533,0xD534,/* 0x70-0x77 */
+	0xD535,0xD536,0xD537,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD538,0xD539,0xD53A,0xD53B,0xD53E,0xD53F,0xD541,/* 0x80-0x87 */
+	0xD542,0xD543,0xD545,0xD546,0xD547,0xD548,0xD549,0xD54A,/* 0x88-0x8F */
+	0xD54B,0xD54E,0xD550,0xD552,0xD553,0xD554,0xD555,0xD556,/* 0x90-0x97 */
+	0xD557,0xD55A,0xD55B,0xD55D,0xD55E,0xD55F,0xD561,0xD562,/* 0x98-0x9F */
+	0xD563,0xC6E9,0xC6EC,0xC6F0,0xC6F8,0xC6F9,0xC6FD,0xC704,/* 0xA0-0xA7 */
+	0xC705,0xC708,0xC70C,0xC714,0xC715,0xC717,0xC719,0xC720,/* 0xA8-0xAF */
+	0xC721,0xC724,0xC728,0xC730,0xC731,0xC733,0xC735,0xC737,/* 0xB0-0xB7 */
+	0xC73C,0xC73D,0xC740,0xC744,0xC74A,0xC74C,0xC74D,0xC74F,/* 0xB8-0xBF */
+	0xC751,0xC752,0xC753,0xC754,0xC755,0xC756,0xC757,0xC758,/* 0xC0-0xC7 */
+	0xC75C,0xC760,0xC768,0xC76B,0xC774,0xC775,0xC778,0xC77C,/* 0xC8-0xCF */
+	0xC77D,0xC77E,0xC783,0xC784,0xC785,0xC787,0xC788,0xC789,/* 0xD0-0xD7 */
+	0xC78A,0xC78E,0xC790,0xC791,0xC794,0xC796,0xC797,0xC798,/* 0xD8-0xDF */
+	0xC79A,0xC7A0,0xC7A1,0xC7A3,0xC7A4,0xC7A5,0xC7A6,0xC7AC,/* 0xE0-0xE7 */
+	0xC7AD,0xC7B0,0xC7B4,0xC7BC,0xC7BD,0xC7BF,0xC7C0,0xC7C1,/* 0xE8-0xEF */
+	0xC7C8,0xC7C9,0xC7CC,0xC7CE,0xC7D0,0xC7D8,0xC7DD,0xC7E4,/* 0xF0-0xF7 */
+	0xC7E8,0xC7EC,0xC800,0xC801,0xC804,0xC808,0xC80A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD564,0xD566,0xD567,0xD56A,0xD56C,0xD56E,0xD56F,/* 0x40-0x47 */
+	0xD570,0xD571,0xD572,0xD573,0xD576,0xD577,0xD579,0xD57A,/* 0x48-0x4F */
+	0xD57B,0xD57D,0xD57E,0xD57F,0xD580,0xD581,0xD582,0xD583,/* 0x50-0x57 */
+	0xD586,0xD58A,0xD58B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD58C,0xD58D,0xD58E,0xD58F,0xD591,0xD592,0xD593,/* 0x60-0x67 */
+	0xD594,0xD595,0xD596,0xD597,0xD598,0xD599,0xD59A,0xD59B,/* 0x68-0x6F */
+	0xD59C,0xD59D,0xD59E,0xD59F,0xD5A0,0xD5A1,0xD5A2,0xD5A3,/* 0x70-0x77 */
+	0xD5A4,0xD5A6,0xD5A7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD5A8,0xD5A9,0xD5AA,0xD5AB,0xD5AC,0xD5AD,0xD5AE,/* 0x80-0x87 */
+	0xD5AF,0xD5B0,0xD5B1,0xD5B2,0xD5B3,0xD5B4,0xD5B5,0xD5B6,/* 0x88-0x8F */
+	0xD5B7,0xD5B8,0xD5B9,0xD5BA,0xD5BB,0xD5BC,0xD5BD,0xD5BE,/* 0x90-0x97 */
+	0xD5BF,0xD5C0,0xD5C1,0xD5C2,0xD5C3,0xD5C4,0xD5C5,0xD5C6,/* 0x98-0x9F */
+	0xD5C7,0xC810,0xC811,0xC813,0xC815,0xC816,0xC81C,0xC81D,/* 0xA0-0xA7 */
+	0xC820,0xC824,0xC82C,0xC82D,0xC82F,0xC831,0xC838,0xC83C,/* 0xA8-0xAF */
+	0xC840,0xC848,0xC849,0xC84C,0xC84D,0xC854,0xC870,0xC871,/* 0xB0-0xB7 */
+	0xC874,0xC878,0xC87A,0xC880,0xC881,0xC883,0xC885,0xC886,/* 0xB8-0xBF */
+	0xC887,0xC88B,0xC88C,0xC88D,0xC894,0xC89D,0xC89F,0xC8A1,/* 0xC0-0xC7 */
+	0xC8A8,0xC8BC,0xC8BD,0xC8C4,0xC8C8,0xC8CC,0xC8D4,0xC8D5,/* 0xC8-0xCF */
+	0xC8D7,0xC8D9,0xC8E0,0xC8E1,0xC8E4,0xC8F5,0xC8FC,0xC8FD,/* 0xD0-0xD7 */
+	0xC900,0xC904,0xC905,0xC906,0xC90C,0xC90D,0xC90F,0xC911,/* 0xD8-0xDF */
+	0xC918,0xC92C,0xC934,0xC950,0xC951,0xC954,0xC958,0xC960,/* 0xE0-0xE7 */
+	0xC961,0xC963,0xC96C,0xC970,0xC974,0xC97C,0xC988,0xC989,/* 0xE8-0xEF */
+	0xC98C,0xC990,0xC998,0xC999,0xC99B,0xC99D,0xC9C0,0xC9C1,/* 0xF0-0xF7 */
+	0xC9C4,0xC9C7,0xC9C8,0xC9CA,0xC9D0,0xC9D1,0xC9D3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD5CA,0xD5CB,0xD5CD,0xD5CE,0xD5CF,0xD5D1,0xD5D3,/* 0x40-0x47 */
+	0xD5D4,0xD5D5,0xD5D6,0xD5D7,0xD5DA,0xD5DC,0xD5DE,0xD5DF,/* 0x48-0x4F */
+	0xD5E0,0xD5E1,0xD5E2,0xD5E3,0xD5E6,0xD5E7,0xD5E9,0xD5EA,/* 0x50-0x57 */
+	0xD5EB,0xD5ED,0xD5EE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD5EF,0xD5F0,0xD5F1,0xD5F2,0xD5F3,0xD5F6,0xD5F8,/* 0x60-0x67 */
+	0xD5FA,0xD5FB,0xD5FC,0xD5FD,0xD5FE,0xD5FF,0xD602,0xD603,/* 0x68-0x6F */
+	0xD605,0xD606,0xD607,0xD609,0xD60A,0xD60B,0xD60C,0xD60D,/* 0x70-0x77 */
+	0xD60E,0xD60F,0xD612,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD616,0xD617,0xD618,0xD619,0xD61A,0xD61B,0xD61D,/* 0x80-0x87 */
+	0xD61E,0xD61F,0xD621,0xD622,0xD623,0xD625,0xD626,0xD627,/* 0x88-0x8F */
+	0xD628,0xD629,0xD62A,0xD62B,0xD62C,0xD62E,0xD62F,0xD630,/* 0x90-0x97 */
+	0xD631,0xD632,0xD633,0xD634,0xD635,0xD636,0xD637,0xD63A,/* 0x98-0x9F */
+	0xD63B,0xC9D5,0xC9D6,0xC9D9,0xC9DA,0xC9DC,0xC9DD,0xC9E0,/* 0xA0-0xA7 */
+	0xC9E2,0xC9E4,0xC9E7,0xC9EC,0xC9ED,0xC9EF,0xC9F0,0xC9F1,/* 0xA8-0xAF */
+	0xC9F8,0xC9F9,0xC9FC,0xCA00,0xCA08,0xCA09,0xCA0B,0xCA0C,/* 0xB0-0xB7 */
+	0xCA0D,0xCA14,0xCA18,0xCA29,0xCA4C,0xCA4D,0xCA50,0xCA54,/* 0xB8-0xBF */
+	0xCA5C,0xCA5D,0xCA5F,0xCA60,0xCA61,0xCA68,0xCA7D,0xCA84,/* 0xC0-0xC7 */
+	0xCA98,0xCABC,0xCABD,0xCAC0,0xCAC4,0xCACC,0xCACD,0xCACF,/* 0xC8-0xCF */
+	0xCAD1,0xCAD3,0xCAD8,0xCAD9,0xCAE0,0xCAEC,0xCAF4,0xCB08,/* 0xD0-0xD7 */
+	0xCB10,0xCB14,0xCB18,0xCB20,0xCB21,0xCB41,0xCB48,0xCB49,/* 0xD8-0xDF */
+	0xCB4C,0xCB50,0xCB58,0xCB59,0xCB5D,0xCB64,0xCB78,0xCB79,/* 0xE0-0xE7 */
+	0xCB9C,0xCBB8,0xCBD4,0xCBE4,0xCBE7,0xCBE9,0xCC0C,0xCC0D,/* 0xE8-0xEF */
+	0xCC10,0xCC14,0xCC1C,0xCC1D,0xCC21,0xCC22,0xCC27,0xCC28,/* 0xF0-0xF7 */
+	0xCC29,0xCC2C,0xCC2E,0xCC30,0xCC38,0xCC39,0xCC3B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD63D,0xD63E,0xD63F,0xD641,0xD642,0xD643,0xD644,/* 0x40-0x47 */
+	0xD646,0xD647,0xD64A,0xD64C,0xD64E,0xD64F,0xD650,0xD652,/* 0x48-0x4F */
+	0xD653,0xD656,0xD657,0xD659,0xD65A,0xD65B,0xD65D,0xD65E,/* 0x50-0x57 */
+	0xD65F,0xD660,0xD661,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD662,0xD663,0xD664,0xD665,0xD666,0xD668,0xD66A,/* 0x60-0x67 */
+	0xD66B,0xD66C,0xD66D,0xD66E,0xD66F,0xD672,0xD673,0xD675,/* 0x68-0x6F */
+	0xD676,0xD677,0xD678,0xD679,0xD67A,0xD67B,0xD67C,0xD67D,/* 0x70-0x77 */
+	0xD67E,0xD67F,0xD680,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD681,0xD682,0xD684,0xD686,0xD687,0xD688,0xD689,/* 0x80-0x87 */
+	0xD68A,0xD68B,0xD68E,0xD68F,0xD691,0xD692,0xD693,0xD695,/* 0x88-0x8F */
+	0xD696,0xD697,0xD698,0xD699,0xD69A,0xD69B,0xD69C,0xD69E,/* 0x90-0x97 */
+	0xD6A0,0xD6A2,0xD6A3,0xD6A4,0xD6A5,0xD6A6,0xD6A7,0xD6A9,/* 0x98-0x9F */
+	0xD6AA,0xCC3C,0xCC3D,0xCC3E,0xCC44,0xCC45,0xCC48,0xCC4C,/* 0xA0-0xA7 */
+	0xCC54,0xCC55,0xCC57,0xCC58,0xCC59,0xCC60,0xCC64,0xCC66,/* 0xA8-0xAF */
+	0xCC68,0xCC70,0xCC75,0xCC98,0xCC99,0xCC9C,0xCCA0,0xCCA8,/* 0xB0-0xB7 */
+	0xCCA9,0xCCAB,0xCCAC,0xCCAD,0xCCB4,0xCCB5,0xCCB8,0xCCBC,/* 0xB8-0xBF */
+	0xCCC4,0xCCC5,0xCCC7,0xCCC9,0xCCD0,0xCCD4,0xCCE4,0xCCEC,/* 0xC0-0xC7 */
+	0xCCF0,0xCD01,0xCD08,0xCD09,0xCD0C,0xCD10,0xCD18,0xCD19,/* 0xC8-0xCF */
+	0xCD1B,0xCD1D,0xCD24,0xCD28,0xCD2C,0xCD39,0xCD5C,0xCD60,/* 0xD0-0xD7 */
+	0xCD64,0xCD6C,0xCD6D,0xCD6F,0xCD71,0xCD78,0xCD88,0xCD94,/* 0xD8-0xDF */
+	0xCD95,0xCD98,0xCD9C,0xCDA4,0xCDA5,0xCDA7,0xCDA9,0xCDB0,/* 0xE0-0xE7 */
+	0xCDC4,0xCDCC,0xCDD0,0xCDE8,0xCDEC,0xCDF0,0xCDF8,0xCDF9,/* 0xE8-0xEF */
+	0xCDFB,0xCDFD,0xCE04,0xCE08,0xCE0C,0xCE14,0xCE19,0xCE20,/* 0xF0-0xF7 */
+	0xCE21,0xCE24,0xCE28,0xCE30,0xCE31,0xCE33,0xCE35,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD6AB,0xD6AD,0xD6AE,0xD6AF,0xD6B1,0xD6B2,0xD6B3,/* 0x40-0x47 */
+	0xD6B4,0xD6B5,0xD6B6,0xD6B7,0xD6B8,0xD6BA,0xD6BC,0xD6BD,/* 0x48-0x4F */
+	0xD6BE,0xD6BF,0xD6C0,0xD6C1,0xD6C2,0xD6C3,0xD6C6,0xD6C7,/* 0x50-0x57 */
+	0xD6C9,0xD6CA,0xD6CB,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD6CD,0xD6CE,0xD6CF,0xD6D0,0xD6D2,0xD6D3,0xD6D5,/* 0x60-0x67 */
+	0xD6D6,0xD6D8,0xD6DA,0xD6DB,0xD6DC,0xD6DD,0xD6DE,0xD6DF,/* 0x68-0x6F */
+	0xD6E1,0xD6E2,0xD6E3,0xD6E5,0xD6E6,0xD6E7,0xD6E9,0xD6EA,/* 0x70-0x77 */
+	0xD6EB,0xD6EC,0xD6ED,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD6EE,0xD6EF,0xD6F1,0xD6F2,0xD6F3,0xD6F4,0xD6F6,/* 0x80-0x87 */
+	0xD6F7,0xD6F8,0xD6F9,0xD6FA,0xD6FB,0xD6FE,0xD6FF,0xD701,/* 0x88-0x8F */
+	0xD702,0xD703,0xD705,0xD706,0xD707,0xD708,0xD709,0xD70A,/* 0x90-0x97 */
+	0xD70B,0xD70C,0xD70D,0xD70E,0xD70F,0xD710,0xD712,0xD713,/* 0x98-0x9F */
+	0xD714,0xCE58,0xCE59,0xCE5C,0xCE5F,0xCE60,0xCE61,0xCE68,/* 0xA0-0xA7 */
+	0xCE69,0xCE6B,0xCE6D,0xCE74,0xCE75,0xCE78,0xCE7C,0xCE84,/* 0xA8-0xAF */
+	0xCE85,0xCE87,0xCE89,0xCE90,0xCE91,0xCE94,0xCE98,0xCEA0,/* 0xB0-0xB7 */
+	0xCEA1,0xCEA3,0xCEA4,0xCEA5,0xCEAC,0xCEAD,0xCEC1,0xCEE4,/* 0xB8-0xBF */
+	0xCEE5,0xCEE8,0xCEEB,0xCEEC,0xCEF4,0xCEF5,0xCEF7,0xCEF8,/* 0xC0-0xC7 */
+	0xCEF9,0xCF00,0xCF01,0xCF04,0xCF08,0xCF10,0xCF11,0xCF13,/* 0xC8-0xCF */
+	0xCF15,0xCF1C,0xCF20,0xCF24,0xCF2C,0xCF2D,0xCF2F,0xCF30,/* 0xD0-0xD7 */
+	0xCF31,0xCF38,0xCF54,0xCF55,0xCF58,0xCF5C,0xCF64,0xCF65,/* 0xD8-0xDF */
+	0xCF67,0xCF69,0xCF70,0xCF71,0xCF74,0xCF78,0xCF80,0xCF85,/* 0xE0-0xE7 */
+	0xCF8C,0xCFA1,0xCFA8,0xCFB0,0xCFC4,0xCFE0,0xCFE1,0xCFE4,/* 0xE8-0xEF */
+	0xCFE8,0xCFF0,0xCFF1,0xCFF3,0xCFF5,0xCFFC,0xD000,0xD004,/* 0xF0-0xF7 */
+	0xD011,0xD018,0xD02D,0xD034,0xD035,0xD038,0xD03C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD715,0xD716,0xD717,0xD71A,0xD71B,0xD71D,0xD71E,/* 0x40-0x47 */
+	0xD71F,0xD721,0xD722,0xD723,0xD724,0xD725,0xD726,0xD727,/* 0x48-0x4F */
+	0xD72A,0xD72C,0xD72E,0xD72F,0xD730,0xD731,0xD732,0xD733,/* 0x50-0x57 */
+	0xD736,0xD737,0xD739,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0xD73A,0xD73B,0xD73D,0xD73E,0xD73F,0xD740,0xD741,/* 0x60-0x67 */
+	0xD742,0xD743,0xD745,0xD746,0xD748,0xD74A,0xD74B,0xD74C,/* 0x68-0x6F */
+	0xD74D,0xD74E,0xD74F,0xD752,0xD753,0xD755,0xD75A,0xD75B,/* 0x70-0x77 */
+	0xD75C,0xD75D,0xD75E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0xD75F,0xD762,0xD764,0xD766,0xD767,0xD768,0xD76A,/* 0x80-0x87 */
+	0xD76B,0xD76D,0xD76E,0xD76F,0xD771,0xD772,0xD773,0xD775,/* 0x88-0x8F */
+	0xD776,0xD777,0xD778,0xD779,0xD77A,0xD77B,0xD77E,0xD77F,/* 0x90-0x97 */
+	0xD780,0xD782,0xD783,0xD784,0xD785,0xD786,0xD787,0xD78A,/* 0x98-0x9F */
+	0xD78B,0xD044,0xD045,0xD047,0xD049,0xD050,0xD054,0xD058,/* 0xA0-0xA7 */
+	0xD060,0xD06C,0xD06D,0xD070,0xD074,0xD07C,0xD07D,0xD081,/* 0xA8-0xAF */
+	0xD0A4,0xD0A5,0xD0A8,0xD0AC,0xD0B4,0xD0B5,0xD0B7,0xD0B9,/* 0xB0-0xB7 */
+	0xD0C0,0xD0C1,0xD0C4,0xD0C8,0xD0C9,0xD0D0,0xD0D1,0xD0D3,/* 0xB8-0xBF */
+	0xD0D4,0xD0D5,0xD0DC,0xD0DD,0xD0E0,0xD0E4,0xD0EC,0xD0ED,/* 0xC0-0xC7 */
+	0xD0EF,0xD0F0,0xD0F1,0xD0F8,0xD10D,0xD130,0xD131,0xD134,/* 0xC8-0xCF */
+	0xD138,0xD13A,0xD140,0xD141,0xD143,0xD144,0xD145,0xD14C,/* 0xD0-0xD7 */
+	0xD14D,0xD150,0xD154,0xD15C,0xD15D,0xD15F,0xD161,0xD168,/* 0xD8-0xDF */
+	0xD16C,0xD17C,0xD184,0xD188,0xD1A0,0xD1A1,0xD1A4,0xD1A8,/* 0xE0-0xE7 */
+	0xD1B0,0xD1B1,0xD1B3,0xD1B5,0xD1BA,0xD1BC,0xD1C0,0xD1D8,/* 0xE8-0xEF */
+	0xD1F4,0xD1F8,0xD207,0xD209,0xD210,0xD22C,0xD22D,0xD230,/* 0xF0-0xF7 */
+	0xD234,0xD23C,0xD23D,0xD23F,0xD241,0xD248,0xD25C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0xD78D,0xD78E,0xD78F,0xD791,0xD792,0xD793,0xD794,/* 0x40-0x47 */
+	0xD795,0xD796,0xD797,0xD79A,0xD79C,0xD79E,0xD79F,0xD7A0,/* 0x48-0x4F */
+	0xD7A1,0xD7A2,0xD7A3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xD264,0xD280,0xD281,0xD284,0xD288,0xD290,0xD291,/* 0xA0-0xA7 */
+	0xD295,0xD29C,0xD2A0,0xD2A4,0xD2AC,0xD2B1,0xD2B8,0xD2B9,/* 0xA8-0xAF */
+	0xD2BC,0xD2BF,0xD2C0,0xD2C2,0xD2C8,0xD2C9,0xD2CB,0xD2D4,/* 0xB0-0xB7 */
+	0xD2D8,0xD2DC,0xD2E4,0xD2E5,0xD2F0,0xD2F1,0xD2F4,0xD2F8,/* 0xB8-0xBF */
+	0xD300,0xD301,0xD303,0xD305,0xD30C,0xD30D,0xD30E,0xD310,/* 0xC0-0xC7 */
+	0xD314,0xD316,0xD31C,0xD31D,0xD31F,0xD320,0xD321,0xD325,/* 0xC8-0xCF */
+	0xD328,0xD329,0xD32C,0xD330,0xD338,0xD339,0xD33B,0xD33C,/* 0xD0-0xD7 */
+	0xD33D,0xD344,0xD345,0xD37C,0xD37D,0xD380,0xD384,0xD38C,/* 0xD8-0xDF */
+	0xD38D,0xD38F,0xD390,0xD391,0xD398,0xD399,0xD39C,0xD3A0,/* 0xE0-0xE7 */
+	0xD3A8,0xD3A9,0xD3AB,0xD3AD,0xD3B4,0xD3B8,0xD3BC,0xD3C4,/* 0xE8-0xEF */
+	0xD3C5,0xD3C8,0xD3C9,0xD3D0,0xD3D8,0xD3E1,0xD3E3,0xD3EC,/* 0xF0-0xF7 */
+	0xD3ED,0xD3F0,0xD3F4,0xD3FC,0xD3FD,0xD3FF,0xD401,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xD408,0xD41D,0xD440,0xD444,0xD45C,0xD460,0xD464,/* 0xA0-0xA7 */
+	0xD46D,0xD46F,0xD478,0xD479,0xD47C,0xD47F,0xD480,0xD482,/* 0xA8-0xAF */
+	0xD488,0xD489,0xD48B,0xD48D,0xD494,0xD4A9,0xD4CC,0xD4D0,/* 0xB0-0xB7 */
+	0xD4D4,0xD4DC,0xD4DF,0xD4E8,0xD4EC,0xD4F0,0xD4F8,0xD4FB,/* 0xB8-0xBF */
+	0xD4FD,0xD504,0xD508,0xD50C,0xD514,0xD515,0xD517,0xD53C,/* 0xC0-0xC7 */
+	0xD53D,0xD540,0xD544,0xD54C,0xD54D,0xD54F,0xD551,0xD558,/* 0xC8-0xCF */
+	0xD559,0xD55C,0xD560,0xD565,0xD568,0xD569,0xD56B,0xD56D,/* 0xD0-0xD7 */
+	0xD574,0xD575,0xD578,0xD57C,0xD584,0xD585,0xD587,0xD588,/* 0xD8-0xDF */
+	0xD589,0xD590,0xD5A5,0xD5C8,0xD5C9,0xD5CC,0xD5D0,0xD5D2,/* 0xE0-0xE7 */
+	0xD5D8,0xD5D9,0xD5DB,0xD5DD,0xD5E4,0xD5E5,0xD5E8,0xD5EC,/* 0xE8-0xEF */
+	0xD5F4,0xD5F5,0xD5F7,0xD5F9,0xD600,0xD601,0xD604,0xD608,/* 0xF0-0xF7 */
+	0xD610,0xD611,0xD613,0xD614,0xD615,0xD61C,0xD620,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xD624,0xD62D,0xD638,0xD639,0xD63C,0xD640,0xD645,/* 0xA0-0xA7 */
+	0xD648,0xD649,0xD64B,0xD64D,0xD651,0xD654,0xD655,0xD658,/* 0xA8-0xAF */
+	0xD65C,0xD667,0xD669,0xD670,0xD671,0xD674,0xD683,0xD685,/* 0xB0-0xB7 */
+	0xD68C,0xD68D,0xD690,0xD694,0xD69D,0xD69F,0xD6A1,0xD6A8,/* 0xB8-0xBF */
+	0xD6AC,0xD6B0,0xD6B9,0xD6BB,0xD6C4,0xD6C5,0xD6C8,0xD6CC,/* 0xC0-0xC7 */
+	0xD6D1,0xD6D4,0xD6D7,0xD6D9,0xD6E0,0xD6E4,0xD6E8,0xD6F0,/* 0xC8-0xCF */
+	0xD6F5,0xD6FC,0xD6FD,0xD700,0xD704,0xD711,0xD718,0xD719,/* 0xD0-0xD7 */
+	0xD71C,0xD720,0xD728,0xD729,0xD72B,0xD72D,0xD734,0xD735,/* 0xD8-0xDF */
+	0xD738,0xD73C,0xD744,0xD747,0xD749,0xD750,0xD751,0xD754,/* 0xE0-0xE7 */
+	0xD756,0xD757,0xD758,0xD759,0xD760,0xD761,0xD763,0xD765,/* 0xE8-0xEF */
+	0xD769,0xD76C,0xD770,0xD774,0xD77C,0xD77D,0xD781,0xD788,/* 0xF0-0xF7 */
+	0xD789,0xD78C,0xD790,0xD798,0xD799,0xD79B,0xD79D,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x4F3D,0x4F73,0x5047,0x50F9,0x52A0,0x53EF,0x5475,/* 0xA0-0xA7 */
+	0x54E5,0x5609,0x5AC1,0x5BB6,0x6687,0x67B6,0x67B7,0x67EF,/* 0xA8-0xAF */
+	0x6B4C,0x73C2,0x75C2,0x7A3C,0x82DB,0x8304,0x8857,0x8888,/* 0xB0-0xB7 */
+	0x8A36,0x8CC8,0x8DCF,0x8EFB,0x8FE6,0x99D5,0x523B,0x5374,/* 0xB8-0xBF */
+	0x5404,0x606A,0x6164,0x6BBC,0x73CF,0x811A,0x89BA,0x89D2,/* 0xC0-0xC7 */
+	0x95A3,0x4F83,0x520A,0x58BE,0x5978,0x59E6,0x5E72,0x5E79,/* 0xC8-0xCF */
+	0x61C7,0x63C0,0x6746,0x67EC,0x687F,0x6F97,0x764E,0x770B,/* 0xD0-0xD7 */
+	0x78F5,0x7A08,0x7AFF,0x7C21,0x809D,0x826E,0x8271,0x8AEB,/* 0xD8-0xDF */
+	0x9593,0x4E6B,0x559D,0x66F7,0x6E34,0x78A3,0x7AED,0x845B,/* 0xE0-0xE7 */
+	0x8910,0x874E,0x97A8,0x52D8,0x574E,0x582A,0x5D4C,0x611F,/* 0xE8-0xEF */
+	0x61BE,0x6221,0x6562,0x67D1,0x6A44,0x6E1B,0x7518,0x75B3,/* 0xF0-0xF7 */
+	0x76E3,0x77B0,0x7D3A,0x90AF,0x9451,0x9452,0x9F95,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5323,0x5CAC,0x7532,0x80DB,0x9240,0x9598,0x525B,/* 0xA0-0xA7 */
+	0x5808,0x59DC,0x5CA1,0x5D17,0x5EB7,0x5F3A,0x5F4A,0x6177,/* 0xA8-0xAF */
+	0x6C5F,0x757A,0x7586,0x7CE0,0x7D73,0x7DB1,0x7F8C,0x8154,/* 0xB0-0xB7 */
+	0x8221,0x8591,0x8941,0x8B1B,0x92FC,0x964D,0x9C47,0x4ECB,/* 0xB8-0xBF */
+	0x4EF7,0x500B,0x51F1,0x584F,0x6137,0x613E,0x6168,0x6539,/* 0xC0-0xC7 */
+	0x69EA,0x6F11,0x75A5,0x7686,0x76D6,0x7B87,0x82A5,0x84CB,/* 0xC8-0xCF */
+	0xF900,0x93A7,0x958B,0x5580,0x5BA2,0x5751,0xF901,0x7CB3,/* 0xD0-0xD7 */
+	0x7FB9,0x91B5,0x5028,0x53BB,0x5C45,0x5DE8,0x62D2,0x636E,/* 0xD8-0xDF */
+	0x64DA,0x64E7,0x6E20,0x70AC,0x795B,0x8DDD,0x8E1E,0xF902,/* 0xE0-0xE7 */
+	0x907D,0x9245,0x92F8,0x4E7E,0x4EF6,0x5065,0x5DFE,0x5EFA,/* 0xE8-0xEF */
+	0x6106,0x6957,0x8171,0x8654,0x8E47,0x9375,0x9A2B,0x4E5E,/* 0xF0-0xF7 */
+	0x5091,0x6770,0x6840,0x5109,0x528D,0x5292,0x6AA2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x77BC,0x9210,0x9ED4,0x52AB,0x602F,0x8FF2,0x5048,/* 0xA0-0xA7 */
+	0x61A9,0x63ED,0x64CA,0x683C,0x6A84,0x6FC0,0x8188,0x89A1,/* 0xA8-0xAF */
+	0x9694,0x5805,0x727D,0x72AC,0x7504,0x7D79,0x7E6D,0x80A9,/* 0xB0-0xB7 */
+	0x898B,0x8B74,0x9063,0x9D51,0x6289,0x6C7A,0x6F54,0x7D50,/* 0xB8-0xBF */
+	0x7F3A,0x8A23,0x517C,0x614A,0x7B9D,0x8B19,0x9257,0x938C,/* 0xC0-0xC7 */
+	0x4EAC,0x4FD3,0x501E,0x50BE,0x5106,0x52C1,0x52CD,0x537F,/* 0xC8-0xCF */
+	0x5770,0x5883,0x5E9A,0x5F91,0x6176,0x61AC,0x64CE,0x656C,/* 0xD0-0xD7 */
+	0x666F,0x66BB,0x66F4,0x6897,0x6D87,0x7085,0x70F1,0x749F,/* 0xD8-0xDF */
+	0x74A5,0x74CA,0x75D9,0x786C,0x78EC,0x7ADF,0x7AF6,0x7D45,/* 0xE0-0xE7 */
+	0x7D93,0x8015,0x803F,0x811B,0x8396,0x8B66,0x8F15,0x9015,/* 0xE8-0xEF */
+	0x93E1,0x9803,0x9838,0x9A5A,0x9BE8,0x4FC2,0x5553,0x583A,/* 0xF0-0xF7 */
+	0x5951,0x5B63,0x5C46,0x60B8,0x6212,0x6842,0x68B0,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x68E8,0x6EAA,0x754C,0x7678,0x78CE,0x7A3D,0x7CFB,/* 0xA0-0xA7 */
+	0x7E6B,0x7E7C,0x8A08,0x8AA1,0x8C3F,0x968E,0x9DC4,0x53E4,/* 0xA8-0xAF */
+	0x53E9,0x544A,0x5471,0x56FA,0x59D1,0x5B64,0x5C3B,0x5EAB,/* 0xB0-0xB7 */
+	0x62F7,0x6537,0x6545,0x6572,0x66A0,0x67AF,0x69C1,0x6CBD,/* 0xB8-0xBF */
+	0x75FC,0x7690,0x777E,0x7A3F,0x7F94,0x8003,0x80A1,0x818F,/* 0xC0-0xC7 */
+	0x82E6,0x82FD,0x83F0,0x85C1,0x8831,0x88B4,0x8AA5,0xF903,/* 0xC8-0xCF */
+	0x8F9C,0x932E,0x96C7,0x9867,0x9AD8,0x9F13,0x54ED,0x659B,/* 0xD0-0xD7 */
+	0x66F2,0x688F,0x7A40,0x8C37,0x9D60,0x56F0,0x5764,0x5D11,/* 0xD8-0xDF */
+	0x6606,0x68B1,0x68CD,0x6EFE,0x7428,0x889E,0x9BE4,0x6C68,/* 0xE0-0xE7 */
+	0xF904,0x9AA8,0x4F9B,0x516C,0x5171,0x529F,0x5B54,0x5DE5,/* 0xE8-0xEF */
+	0x6050,0x606D,0x62F1,0x63A7,0x653B,0x73D9,0x7A7A,0x86A3,/* 0xF0-0xF7 */
+	0x8CA2,0x978F,0x4E32,0x5BE1,0x6208,0x679C,0x74DC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x79D1,0x83D3,0x8A87,0x8AB2,0x8DE8,0x904E,0x934B,/* 0xA0-0xA7 */
+	0x9846,0x5ED3,0x69E8,0x85FF,0x90ED,0xF905,0x51A0,0x5B98,/* 0xA8-0xAF */
+	0x5BEC,0x6163,0x68FA,0x6B3E,0x704C,0x742F,0x74D8,0x7BA1,/* 0xB0-0xB7 */
+	0x7F50,0x83C5,0x89C0,0x8CAB,0x95DC,0x9928,0x522E,0x605D,/* 0xB8-0xBF */
+	0x62EC,0x9002,0x4F8A,0x5149,0x5321,0x58D9,0x5EE3,0x66E0,/* 0xC0-0xC7 */
+	0x6D38,0x709A,0x72C2,0x73D6,0x7B50,0x80F1,0x945B,0x5366,/* 0xC8-0xCF */
+	0x639B,0x7F6B,0x4E56,0x5080,0x584A,0x58DE,0x602A,0x6127,/* 0xD0-0xD7 */
+	0x62D0,0x69D0,0x9B41,0x5B8F,0x7D18,0x80B1,0x8F5F,0x4EA4,/* 0xD8-0xDF */
+	0x50D1,0x54AC,0x55AC,0x5B0C,0x5DA0,0x5DE7,0x652A,0x654E,/* 0xE0-0xE7 */
+	0x6821,0x6A4B,0x72E1,0x768E,0x77EF,0x7D5E,0x7FF9,0x81A0,/* 0xE8-0xEF */
+	0x854E,0x86DF,0x8F03,0x8F4E,0x90CA,0x9903,0x9A55,0x9BAB,/* 0xF0-0xF7 */
+	0x4E18,0x4E45,0x4E5D,0x4EC7,0x4FF1,0x5177,0x52FE,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5340,0x53E3,0x53E5,0x548E,0x5614,0x5775,0x57A2,/* 0xA0-0xA7 */
+	0x5BC7,0x5D87,0x5ED0,0x61FC,0x62D8,0x6551,0x67B8,0x67E9,/* 0xA8-0xAF */
+	0x69CB,0x6B50,0x6BC6,0x6BEC,0x6C42,0x6E9D,0x7078,0x72D7,/* 0xB0-0xB7 */
+	0x7396,0x7403,0x77BF,0x77E9,0x7A76,0x7D7F,0x8009,0x81FC,/* 0xB8-0xBF */
+	0x8205,0x820A,0x82DF,0x8862,0x8B33,0x8CFC,0x8EC0,0x9011,/* 0xC0-0xC7 */
+	0x90B1,0x9264,0x92B6,0x99D2,0x9A45,0x9CE9,0x9DD7,0x9F9C,/* 0xC8-0xCF */
+	0x570B,0x5C40,0x83CA,0x97A0,0x97AB,0x9EB4,0x541B,0x7A98,/* 0xD0-0xD7 */
+	0x7FA4,0x88D9,0x8ECD,0x90E1,0x5800,0x5C48,0x6398,0x7A9F,/* 0xD8-0xDF */
+	0x5BAE,0x5F13,0x7A79,0x7AAE,0x828E,0x8EAC,0x5026,0x5238,/* 0xE0-0xE7 */
+	0x52F8,0x5377,0x5708,0x62F3,0x6372,0x6B0A,0x6DC3,0x7737,/* 0xE8-0xEF */
+	0x53A5,0x7357,0x8568,0x8E76,0x95D5,0x673A,0x6AC3,0x6F70,/* 0xF0-0xF7 */
+	0x8A6D,0x8ECC,0x994B,0xF906,0x6677,0x6B78,0x8CB4,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9B3C,0xF907,0x53EB,0x572D,0x594E,0x63C6,0x69FB,/* 0xA0-0xA7 */
+	0x73EA,0x7845,0x7ABA,0x7AC5,0x7CFE,0x8475,0x898F,0x8D73,/* 0xA8-0xAF */
+	0x9035,0x95A8,0x52FB,0x5747,0x7547,0x7B60,0x83CC,0x921E,/* 0xB0-0xB7 */
+	0xF908,0x6A58,0x514B,0x524B,0x5287,0x621F,0x68D8,0x6975,/* 0xB8-0xBF */
+	0x9699,0x50C5,0x52A4,0x52E4,0x61C3,0x65A4,0x6839,0x69FF,/* 0xC0-0xC7 */
+	0x747E,0x7B4B,0x82B9,0x83EB,0x89B2,0x8B39,0x8FD1,0x9949,/* 0xC8-0xCF */
+	0xF909,0x4ECA,0x5997,0x64D2,0x6611,0x6A8E,0x7434,0x7981,/* 0xD0-0xD7 */
+	0x79BD,0x82A9,0x887E,0x887F,0x895F,0xF90A,0x9326,0x4F0B,/* 0xD8-0xDF */
+	0x53CA,0x6025,0x6271,0x6C72,0x7D1A,0x7D66,0x4E98,0x5162,/* 0xE0-0xE7 */
+	0x77DC,0x80AF,0x4F01,0x4F0E,0x5176,0x5180,0x55DC,0x5668,/* 0xE8-0xEF */
+	0x573B,0x57FA,0x57FC,0x5914,0x5947,0x5993,0x5BC4,0x5C90,/* 0xF0-0xF7 */
+	0x5D0E,0x5DF1,0x5E7E,0x5FCC,0x6280,0x65D7,0x65E3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x671E,0x671F,0x675E,0x68CB,0x68C4,0x6A5F,0x6B3A,/* 0xA0-0xA7 */
+	0x6C23,0x6C7D,0x6C82,0x6DC7,0x7398,0x7426,0x742A,0x7482,/* 0xA8-0xAF */
+	0x74A3,0x7578,0x757F,0x7881,0x78EF,0x7941,0x7947,0x7948,/* 0xB0-0xB7 */
+	0x797A,0x7B95,0x7D00,0x7DBA,0x7F88,0x8006,0x802D,0x808C,/* 0xB8-0xBF */
+	0x8A18,0x8B4F,0x8C48,0x8D77,0x9321,0x9324,0x98E2,0x9951,/* 0xC0-0xC7 */
+	0x9A0E,0x9A0F,0x9A65,0x9E92,0x7DCA,0x4F76,0x5409,0x62EE,/* 0xC8-0xCF */
+	0x6854,0x91D1,0x55AB,0x513A,0xF90B,0xF90C,0x5A1C,0x61E6,/* 0xD0-0xD7 */
+	0xF90D,0x62CF,0x62FF,0xF90E,0xF90F,0xF910,0xF911,0xF912,/* 0xD8-0xDF */
+	0xF913,0x90A3,0xF914,0xF915,0xF916,0xF917,0xF918,0x8AFE,/* 0xE0-0xE7 */
+	0xF919,0xF91A,0xF91B,0xF91C,0x6696,0xF91D,0x7156,0xF91E,/* 0xE8-0xEF */
+	0xF91F,0x96E3,0xF920,0x634F,0x637A,0x5357,0xF921,0x678F,/* 0xF0-0xF7 */
+	0x6960,0x6E73,0xF922,0x7537,0xF923,0xF924,0xF925,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7D0D,0xF926,0xF927,0x8872,0x56CA,0x5A18,0xF928,/* 0xA0-0xA7 */
+	0xF929,0xF92A,0xF92B,0xF92C,0x4E43,0xF92D,0x5167,0x5948,/* 0xA8-0xAF */
+	0x67F0,0x8010,0xF92E,0x5973,0x5E74,0x649A,0x79CA,0x5FF5,/* 0xB0-0xB7 */
+	0x606C,0x62C8,0x637B,0x5BE7,0x5BD7,0x52AA,0xF92F,0x5974,/* 0xB8-0xBF */
+	0x5F29,0x6012,0xF930,0xF931,0xF932,0x7459,0xF933,0xF934,/* 0xC0-0xC7 */
+	0xF935,0xF936,0xF937,0xF938,0x99D1,0xF939,0xF93A,0xF93B,/* 0xC8-0xCF */
+	0xF93C,0xF93D,0xF93E,0xF93F,0xF940,0xF941,0xF942,0xF943,/* 0xD0-0xD7 */
+	0x6FC3,0xF944,0xF945,0x81BF,0x8FB2,0x60F1,0xF946,0xF947,/* 0xD8-0xDF */
+	0x8166,0xF948,0xF949,0x5C3F,0xF94A,0xF94B,0xF94C,0xF94D,/* 0xE0-0xE7 */
+	0xF94E,0xF94F,0xF950,0xF951,0x5AE9,0x8A25,0x677B,0x7D10,/* 0xE8-0xEF */
+	0xF952,0xF953,0xF954,0xF955,0xF956,0xF957,0x80FD,0xF958,/* 0xF0-0xF7 */
+	0xF959,0x5C3C,0x6CE5,0x533F,0x6EBA,0x591A,0x8336,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x4E39,0x4EB6,0x4F46,0x55AE,0x5718,0x58C7,0x5F56,/* 0xA0-0xA7 */
+	0x65B7,0x65E6,0x6A80,0x6BB5,0x6E4D,0x77ED,0x7AEF,0x7C1E,/* 0xA8-0xAF */
+	0x7DDE,0x86CB,0x8892,0x9132,0x935B,0x64BB,0x6FBE,0x737A,/* 0xB0-0xB7 */
+	0x75B8,0x9054,0x5556,0x574D,0x61BA,0x64D4,0x66C7,0x6DE1,/* 0xB8-0xBF */
+	0x6E5B,0x6F6D,0x6FB9,0x75F0,0x8043,0x81BD,0x8541,0x8983,/* 0xC0-0xC7 */
+	0x8AC7,0x8B5A,0x931F,0x6C93,0x7553,0x7B54,0x8E0F,0x905D,/* 0xC8-0xCF */
+	0x5510,0x5802,0x5858,0x5E62,0x6207,0x649E,0x68E0,0x7576,/* 0xD0-0xD7 */
+	0x7CD6,0x87B3,0x9EE8,0x4EE3,0x5788,0x576E,0x5927,0x5C0D,/* 0xD8-0xDF */
+	0x5CB1,0x5E36,0x5F85,0x6234,0x64E1,0x73B3,0x81FA,0x888B,/* 0xE0-0xE7 */
+	0x8CB8,0x968A,0x9EDB,0x5B85,0x5FB7,0x60B3,0x5012,0x5200,/* 0xE8-0xEF */
+	0x5230,0x5716,0x5835,0x5857,0x5C0E,0x5C60,0x5CF6,0x5D8B,/* 0xF0-0xF7 */
+	0x5EA6,0x5F92,0x60BC,0x6311,0x6389,0x6417,0x6843,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x68F9,0x6AC2,0x6DD8,0x6E21,0x6ED4,0x6FE4,0x71FE,/* 0xA0-0xA7 */
+	0x76DC,0x7779,0x79B1,0x7A3B,0x8404,0x89A9,0x8CED,0x8DF3,/* 0xA8-0xAF */
+	0x8E48,0x9003,0x9014,0x9053,0x90FD,0x934D,0x9676,0x97DC,/* 0xB0-0xB7 */
+	0x6BD2,0x7006,0x7258,0x72A2,0x7368,0x7763,0x79BF,0x7BE4,/* 0xB8-0xBF */
+	0x7E9B,0x8B80,0x58A9,0x60C7,0x6566,0x65FD,0x66BE,0x6C8C,/* 0xC0-0xC7 */
+	0x711E,0x71C9,0x8C5A,0x9813,0x4E6D,0x7A81,0x4EDD,0x51AC,/* 0xC8-0xCF */
+	0x51CD,0x52D5,0x540C,0x61A7,0x6771,0x6850,0x68DF,0x6D1E,/* 0xD0-0xD7 */
+	0x6F7C,0x75BC,0x77B3,0x7AE5,0x80F4,0x8463,0x9285,0x515C,/* 0xD8-0xDF */
+	0x6597,0x675C,0x6793,0x75D8,0x7AC7,0x8373,0xF95A,0x8C46,/* 0xE0-0xE7 */
+	0x9017,0x982D,0x5C6F,0x81C0,0x829A,0x9041,0x906F,0x920D,/* 0xE8-0xEF */
+	0x5F97,0x5D9D,0x6A59,0x71C8,0x767B,0x7B49,0x85E4,0x8B04,/* 0xF0-0xF7 */
+	0x9127,0x9A30,0x5587,0x61F6,0xF95B,0x7669,0x7F85,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x863F,0x87BA,0x88F8,0x908F,0xF95C,0x6D1B,0x70D9,/* 0xA0-0xA7 */
+	0x73DE,0x7D61,0x843D,0xF95D,0x916A,0x99F1,0xF95E,0x4E82,/* 0xA8-0xAF */
+	0x5375,0x6B04,0x6B12,0x703E,0x721B,0x862D,0x9E1E,0x524C,/* 0xB0-0xB7 */
+	0x8FA3,0x5D50,0x64E5,0x652C,0x6B16,0x6FEB,0x7C43,0x7E9C,/* 0xB8-0xBF */
+	0x85CD,0x8964,0x89BD,0x62C9,0x81D8,0x881F,0x5ECA,0x6717,/* 0xC0-0xC7 */
+	0x6D6A,0x72FC,0x7405,0x746F,0x8782,0x90DE,0x4F86,0x5D0D,/* 0xC8-0xCF */
+	0x5FA0,0x840A,0x51B7,0x63A0,0x7565,0x4EAE,0x5006,0x5169,/* 0xD0-0xD7 */
+	0x51C9,0x6881,0x6A11,0x7CAE,0x7CB1,0x7CE7,0x826F,0x8AD2,/* 0xD8-0xDF */
+	0x8F1B,0x91CF,0x4FB6,0x5137,0x52F5,0x5442,0x5EEC,0x616E,/* 0xE0-0xE7 */
+	0x623E,0x65C5,0x6ADA,0x6FFE,0x792A,0x85DC,0x8823,0x95AD,/* 0xE8-0xEF */
+	0x9A62,0x9A6A,0x9E97,0x9ECE,0x529B,0x66C6,0x6B77,0x701D,/* 0xF0-0xF7 */
+	0x792B,0x8F62,0x9742,0x6190,0x6200,0x6523,0x6F23,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7149,0x7489,0x7DF4,0x806F,0x84EE,0x8F26,0x9023,/* 0xA0-0xA7 */
+	0x934A,0x51BD,0x5217,0x52A3,0x6D0C,0x70C8,0x88C2,0x5EC9,/* 0xA8-0xAF */
+	0x6582,0x6BAE,0x6FC2,0x7C3E,0x7375,0x4EE4,0x4F36,0x56F9,/* 0xB0-0xB7 */
+	0xF95F,0x5CBA,0x5DBA,0x601C,0x73B2,0x7B2D,0x7F9A,0x7FCE,/* 0xB8-0xBF */
+	0x8046,0x901E,0x9234,0x96F6,0x9748,0x9818,0x9F61,0x4F8B,/* 0xC0-0xC7 */
+	0x6FA7,0x79AE,0x91B4,0x96B7,0x52DE,0xF960,0x6488,0x64C4,/* 0xC8-0xCF */
+	0x6AD3,0x6F5E,0x7018,0x7210,0x76E7,0x8001,0x8606,0x865C,/* 0xD0-0xD7 */
+	0x8DEF,0x8F05,0x9732,0x9B6F,0x9DFA,0x9E75,0x788C,0x797F,/* 0xD8-0xDF */
+	0x7DA0,0x83C9,0x9304,0x9E7F,0x9E93,0x8AD6,0x58DF,0x5F04,/* 0xE0-0xE7 */
+	0x6727,0x7027,0x74CF,0x7C60,0x807E,0x5121,0x7028,0x7262,/* 0xE8-0xEF */
+	0x78CA,0x8CC2,0x8CDA,0x8CF4,0x96F7,0x4E86,0x50DA,0x5BEE,/* 0xF0-0xF7 */
+	0x5ED6,0x6599,0x71CE,0x7642,0x77AD,0x804A,0x84FC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x907C,0x9B27,0x9F8D,0x58D8,0x5A41,0x5C62,0x6A13,/* 0xA0-0xA7 */
+	0x6DDA,0x6F0F,0x763B,0x7D2F,0x7E37,0x851E,0x8938,0x93E4,/* 0xA8-0xAF */
+	0x964B,0x5289,0x65D2,0x67F3,0x69B4,0x6D41,0x6E9C,0x700F,/* 0xB0-0xB7 */
+	0x7409,0x7460,0x7559,0x7624,0x786B,0x8B2C,0x985E,0x516D,/* 0xB8-0xBF */
+	0x622E,0x9678,0x4F96,0x502B,0x5D19,0x6DEA,0x7DB8,0x8F2A,/* 0xC0-0xC7 */
+	0x5F8B,0x6144,0x6817,0xF961,0x9686,0x52D2,0x808B,0x51DC,/* 0xC8-0xCF */
+	0x51CC,0x695E,0x7A1C,0x7DBE,0x83F1,0x9675,0x4FDA,0x5229,/* 0xD0-0xD7 */
+	0x5398,0x540F,0x550E,0x5C65,0x60A7,0x674E,0x68A8,0x6D6C,/* 0xD8-0xDF */
+	0x7281,0x72F8,0x7406,0x7483,0xF962,0x75E2,0x7C6C,0x7F79,/* 0xE0-0xE7 */
+	0x7FB8,0x8389,0x88CF,0x88E1,0x91CC,0x91D0,0x96E2,0x9BC9,/* 0xE8-0xEF */
+	0x541D,0x6F7E,0x71D0,0x7498,0x85FA,0x8EAA,0x96A3,0x9C57,/* 0xF0-0xF7 */
+	0x9E9F,0x6797,0x6DCB,0x7433,0x81E8,0x9716,0x782C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7ACB,0x7B20,0x7C92,0x6469,0x746A,0x75F2,0x78BC,/* 0xA0-0xA7 */
+	0x78E8,0x99AC,0x9B54,0x9EBB,0x5BDE,0x5E55,0x6F20,0x819C,/* 0xA8-0xAF */
+	0x83AB,0x9088,0x4E07,0x534D,0x5A29,0x5DD2,0x5F4E,0x6162,/* 0xB0-0xB7 */
+	0x633D,0x6669,0x66FC,0x6EFF,0x6F2B,0x7063,0x779E,0x842C,/* 0xB8-0xBF */
+	0x8513,0x883B,0x8F13,0x9945,0x9C3B,0x551C,0x62B9,0x672B,/* 0xC0-0xC7 */
+	0x6CAB,0x8309,0x896A,0x977A,0x4EA1,0x5984,0x5FD8,0x5FD9,/* 0xC8-0xCF */
+	0x671B,0x7DB2,0x7F54,0x8292,0x832B,0x83BD,0x8F1E,0x9099,/* 0xD0-0xD7 */
+	0x57CB,0x59B9,0x5A92,0x5BD0,0x6627,0x679A,0x6885,0x6BCF,/* 0xD8-0xDF */
+	0x7164,0x7F75,0x8CB7,0x8CE3,0x9081,0x9B45,0x8108,0x8C8A,/* 0xE0-0xE7 */
+	0x964C,0x9A40,0x9EA5,0x5B5F,0x6C13,0x731B,0x76F2,0x76DF,/* 0xE8-0xEF */
+	0x840C,0x51AA,0x8993,0x514D,0x5195,0x52C9,0x68C9,0x6C94,/* 0xF0-0xF7 */
+	0x7704,0x7720,0x7DBF,0x7DEC,0x9762,0x9EB5,0x6EC5,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8511,0x51A5,0x540D,0x547D,0x660E,0x669D,0x6927,/* 0xA0-0xA7 */
+	0x6E9F,0x76BF,0x7791,0x8317,0x84C2,0x879F,0x9169,0x9298,/* 0xA8-0xAF */
+	0x9CF4,0x8882,0x4FAE,0x5192,0x52DF,0x59C6,0x5E3D,0x6155,/* 0xB0-0xB7 */
+	0x6478,0x6479,0x66AE,0x67D0,0x6A21,0x6BCD,0x6BDB,0x725F,/* 0xB8-0xBF */
+	0x7261,0x7441,0x7738,0x77DB,0x8017,0x82BC,0x8305,0x8B00,/* 0xC0-0xC7 */
+	0x8B28,0x8C8C,0x6728,0x6C90,0x7267,0x76EE,0x7766,0x7A46,/* 0xC8-0xCF */
+	0x9DA9,0x6B7F,0x6C92,0x5922,0x6726,0x8499,0x536F,0x5893,/* 0xD0-0xD7 */
+	0x5999,0x5EDF,0x63CF,0x6634,0x6773,0x6E3A,0x732B,0x7AD7,/* 0xD8-0xDF */
+	0x82D7,0x9328,0x52D9,0x5DEB,0x61AE,0x61CB,0x620A,0x62C7,/* 0xE0-0xE7 */
+	0x64AB,0x65E0,0x6959,0x6B66,0x6BCB,0x7121,0x73F7,0x755D,/* 0xE8-0xEF */
+	0x7E46,0x821E,0x8302,0x856A,0x8AA3,0x8CBF,0x9727,0x9D61,/* 0xF0-0xF7 */
+	0x58A8,0x9ED8,0x5011,0x520E,0x543B,0x554F,0x6587,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6C76,0x7D0A,0x7D0B,0x805E,0x868A,0x9580,0x96EF,/* 0xA0-0xA7 */
+	0x52FF,0x6C95,0x7269,0x5473,0x5A9A,0x5C3E,0x5D4B,0x5F4C,/* 0xA8-0xAF */
+	0x5FAE,0x672A,0x68B6,0x6963,0x6E3C,0x6E44,0x7709,0x7C73,/* 0xB0-0xB7 */
+	0x7F8E,0x8587,0x8B0E,0x8FF7,0x9761,0x9EF4,0x5CB7,0x60B6,/* 0xB8-0xBF */
+	0x610D,0x61AB,0x654F,0x65FB,0x65FC,0x6C11,0x6CEF,0x739F,/* 0xC0-0xC7 */
+	0x73C9,0x7DE1,0x9594,0x5BC6,0x871C,0x8B10,0x525D,0x535A,/* 0xC8-0xCF */
+	0x62CD,0x640F,0x64B2,0x6734,0x6A38,0x6CCA,0x73C0,0x749E,/* 0xD0-0xD7 */
+	0x7B94,0x7C95,0x7E1B,0x818A,0x8236,0x8584,0x8FEB,0x96F9,/* 0xD8-0xDF */
+	0x99C1,0x4F34,0x534A,0x53CD,0x53DB,0x62CC,0x642C,0x6500,/* 0xE0-0xE7 */
+	0x6591,0x69C3,0x6CEE,0x6F58,0x73ED,0x7554,0x7622,0x76E4,/* 0xE8-0xEF */
+	0x76FC,0x78D0,0x78FB,0x792C,0x7D46,0x822C,0x87E0,0x8FD4,/* 0xF0-0xF7 */
+	0x9812,0x98EF,0x52C3,0x62D4,0x64A5,0x6E24,0x6F51,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x767C,0x8DCB,0x91B1,0x9262,0x9AEE,0x9B43,0x5023,/* 0xA0-0xA7 */
+	0x508D,0x574A,0x59A8,0x5C28,0x5E47,0x5F77,0x623F,0x653E,/* 0xA8-0xAF */
+	0x65B9,0x65C1,0x6609,0x678B,0x699C,0x6EC2,0x78C5,0x7D21,/* 0xB0-0xB7 */
+	0x80AA,0x8180,0x822B,0x82B3,0x84A1,0x868C,0x8A2A,0x8B17,/* 0xB8-0xBF */
+	0x90A6,0x9632,0x9F90,0x500D,0x4FF3,0xF963,0x57F9,0x5F98,/* 0xC0-0xC7 */
+	0x62DC,0x6392,0x676F,0x6E43,0x7119,0x76C3,0x80CC,0x80DA,/* 0xC8-0xCF */
+	0x88F4,0x88F5,0x8919,0x8CE0,0x8F29,0x914D,0x966A,0x4F2F,/* 0xD0-0xD7 */
+	0x4F70,0x5E1B,0x67CF,0x6822,0x767D,0x767E,0x9B44,0x5E61,/* 0xD8-0xDF */
+	0x6A0A,0x7169,0x71D4,0x756A,0xF964,0x7E41,0x8543,0x85E9,/* 0xE0-0xE7 */
+	0x98DC,0x4F10,0x7B4F,0x7F70,0x95A5,0x51E1,0x5E06,0x68B5,/* 0xE8-0xEF */
+	0x6C3E,0x6C4E,0x6CDB,0x72AF,0x7BC4,0x8303,0x6CD5,0x743A,/* 0xF0-0xF7 */
+	0x50FB,0x5288,0x58C1,0x64D8,0x6A97,0x74A7,0x7656,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x78A7,0x8617,0x95E2,0x9739,0xF965,0x535E,0x5F01,/* 0xA0-0xA7 */
+	0x8B8A,0x8FA8,0x8FAF,0x908A,0x5225,0x77A5,0x9C49,0x9F08,/* 0xA8-0xAF */
+	0x4E19,0x5002,0x5175,0x5C5B,0x5E77,0x661E,0x663A,0x67C4,/* 0xB0-0xB7 */
+	0x68C5,0x70B3,0x7501,0x75C5,0x79C9,0x7ADD,0x8F27,0x9920,/* 0xB8-0xBF */
+	0x9A08,0x4FDD,0x5821,0x5831,0x5BF6,0x666E,0x6B65,0x6D11,/* 0xC0-0xC7 */
+	0x6E7A,0x6F7D,0x73E4,0x752B,0x83E9,0x88DC,0x8913,0x8B5C,/* 0xC8-0xCF */
+	0x8F14,0x4F0F,0x50D5,0x5310,0x535C,0x5B93,0x5FA9,0x670D,/* 0xD0-0xD7 */
+	0x798F,0x8179,0x832F,0x8514,0x8907,0x8986,0x8F39,0x8F3B,/* 0xD8-0xDF */
+	0x99A5,0x9C12,0x672C,0x4E76,0x4FF8,0x5949,0x5C01,0x5CEF,/* 0xE0-0xE7 */
+	0x5CF0,0x6367,0x68D2,0x70FD,0x71A2,0x742B,0x7E2B,0x84EC,/* 0xE8-0xEF */
+	0x8702,0x9022,0x92D2,0x9CF3,0x4E0D,0x4ED8,0x4FEF,0x5085,/* 0xF0-0xF7 */
+	0x5256,0x526F,0x5426,0x5490,0x57E0,0x592B,0x5A66,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5B5A,0x5B75,0x5BCC,0x5E9C,0xF966,0x6276,0x6577,/* 0xA0-0xA7 */
+	0x65A7,0x6D6E,0x6EA5,0x7236,0x7B26,0x7C3F,0x7F36,0x8150,/* 0xA8-0xAF */
+	0x8151,0x819A,0x8240,0x8299,0x83A9,0x8A03,0x8CA0,0x8CE6,/* 0xB0-0xB7 */
+	0x8CFB,0x8D74,0x8DBA,0x90E8,0x91DC,0x961C,0x9644,0x99D9,/* 0xB8-0xBF */
+	0x9CE7,0x5317,0x5206,0x5429,0x5674,0x58B3,0x5954,0x596E,/* 0xC0-0xC7 */
+	0x5FFF,0x61A4,0x626E,0x6610,0x6C7E,0x711A,0x76C6,0x7C89,/* 0xC8-0xCF */
+	0x7CDE,0x7D1B,0x82AC,0x8CC1,0x96F0,0xF967,0x4F5B,0x5F17,/* 0xD0-0xD7 */
+	0x5F7F,0x62C2,0x5D29,0x670B,0x68DA,0x787C,0x7E43,0x9D6C,/* 0xD8-0xDF */
+	0x4E15,0x5099,0x5315,0x532A,0x5351,0x5983,0x5A62,0x5E87,/* 0xE0-0xE7 */
+	0x60B2,0x618A,0x6249,0x6279,0x6590,0x6787,0x69A7,0x6BD4,/* 0xE8-0xEF */
+	0x6BD6,0x6BD7,0x6BD8,0x6CB8,0xF968,0x7435,0x75FA,0x7812,/* 0xF0-0xF7 */
+	0x7891,0x79D5,0x79D8,0x7C83,0x7DCB,0x7FE1,0x80A5,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x813E,0x81C2,0x83F2,0x871A,0x88E8,0x8AB9,0x8B6C,/* 0xA0-0xA7 */
+	0x8CBB,0x9119,0x975E,0x98DB,0x9F3B,0x56AC,0x5B2A,0x5F6C,/* 0xA8-0xAF */
+	0x658C,0x6AB3,0x6BAF,0x6D5C,0x6FF1,0x7015,0x725D,0x73AD,/* 0xB0-0xB7 */
+	0x8CA7,0x8CD3,0x983B,0x6191,0x6C37,0x8058,0x9A01,0x4E4D,/* 0xB8-0xBF */
+	0x4E8B,0x4E9B,0x4ED5,0x4F3A,0x4F3C,0x4F7F,0x4FDF,0x50FF,/* 0xC0-0xC7 */
+	0x53F2,0x53F8,0x5506,0x55E3,0x56DB,0x58EB,0x5962,0x5A11,/* 0xC8-0xCF */
+	0x5BEB,0x5BFA,0x5C04,0x5DF3,0x5E2B,0x5F99,0x601D,0x6368,/* 0xD0-0xD7 */
+	0x659C,0x65AF,0x67F6,0x67FB,0x68AD,0x6B7B,0x6C99,0x6CD7,/* 0xD8-0xDF */
+	0x6E23,0x7009,0x7345,0x7802,0x793E,0x7940,0x7960,0x79C1,/* 0xE0-0xE7 */
+	0x7BE9,0x7D17,0x7D72,0x8086,0x820D,0x838E,0x84D1,0x86C7,/* 0xE8-0xEF */
+	0x88DF,0x8A50,0x8A5E,0x8B1D,0x8CDC,0x8D66,0x8FAD,0x90AA,/* 0xF0-0xF7 */
+	0x98FC,0x99DF,0x9E9D,0x524A,0xF969,0x6714,0xF96A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5098,0x522A,0x5C71,0x6563,0x6C55,0x73CA,0x7523,/* 0xA0-0xA7 */
+	0x759D,0x7B97,0x849C,0x9178,0x9730,0x4E77,0x6492,0x6BBA,/* 0xA8-0xAF */
+	0x715E,0x85A9,0x4E09,0xF96B,0x6749,0x68EE,0x6E17,0x829F,/* 0xB0-0xB7 */
+	0x8518,0x886B,0x63F7,0x6F81,0x9212,0x98AF,0x4E0A,0x50B7,/* 0xB8-0xBF */
+	0x50CF,0x511F,0x5546,0x55AA,0x5617,0x5B40,0x5C19,0x5CE0,/* 0xC0-0xC7 */
+	0x5E38,0x5E8A,0x5EA0,0x5EC2,0x60F3,0x6851,0x6A61,0x6E58,/* 0xC8-0xCF */
+	0x723D,0x7240,0x72C0,0x76F8,0x7965,0x7BB1,0x7FD4,0x88F3,/* 0xD0-0xD7 */
+	0x89F4,0x8A73,0x8C61,0x8CDE,0x971C,0x585E,0x74BD,0x8CFD,/* 0xD8-0xDF */
+	0x55C7,0xF96C,0x7A61,0x7D22,0x8272,0x7272,0x751F,0x7525,/* 0xE0-0xE7 */
+	0xF96D,0x7B19,0x5885,0x58FB,0x5DBC,0x5E8F,0x5EB6,0x5F90,/* 0xE8-0xEF */
+	0x6055,0x6292,0x637F,0x654D,0x6691,0x66D9,0x66F8,0x6816,/* 0xF0-0xF7 */
+	0x68F2,0x7280,0x745E,0x7B6E,0x7D6E,0x7DD6,0x7F72,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x80E5,0x8212,0x85AF,0x897F,0x8A93,0x901D,0x92E4,/* 0xA0-0xA7 */
+	0x9ECD,0x9F20,0x5915,0x596D,0x5E2D,0x60DC,0x6614,0x6673,/* 0xA8-0xAF */
+	0x6790,0x6C50,0x6DC5,0x6F5F,0x77F3,0x78A9,0x84C6,0x91CB,/* 0xB0-0xB7 */
+	0x932B,0x4ED9,0x50CA,0x5148,0x5584,0x5B0B,0x5BA3,0x6247,/* 0xB8-0xBF */
+	0x657E,0x65CB,0x6E32,0x717D,0x7401,0x7444,0x7487,0x74BF,/* 0xC0-0xC7 */
+	0x766C,0x79AA,0x7DDA,0x7E55,0x7FA8,0x817A,0x81B3,0x8239,/* 0xC8-0xCF */
+	0x861A,0x87EC,0x8A75,0x8DE3,0x9078,0x9291,0x9425,0x994D,/* 0xD0-0xD7 */
+	0x9BAE,0x5368,0x5C51,0x6954,0x6CC4,0x6D29,0x6E2B,0x820C,/* 0xD8-0xDF */
+	0x859B,0x893B,0x8A2D,0x8AAA,0x96EA,0x9F67,0x5261,0x66B9,/* 0xE0-0xE7 */
+	0x6BB2,0x7E96,0x87FE,0x8D0D,0x9583,0x965D,0x651D,0x6D89,/* 0xE8-0xEF */
+	0x71EE,0xF96E,0x57CE,0x59D3,0x5BAC,0x6027,0x60FA,0x6210,/* 0xF0-0xF7 */
+	0x661F,0x665F,0x7329,0x73F9,0x76DB,0x7701,0x7B6C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8056,0x8072,0x8165,0x8AA0,0x9192,0x4E16,0x52E2,/* 0xA0-0xA7 */
+	0x6B72,0x6D17,0x7A05,0x7B39,0x7D30,0xF96F,0x8CB0,0x53EC,/* 0xA8-0xAF */
+	0x562F,0x5851,0x5BB5,0x5C0F,0x5C11,0x5DE2,0x6240,0x6383,/* 0xB0-0xB7 */
+	0x6414,0x662D,0x68B3,0x6CBC,0x6D88,0x6EAF,0x701F,0x70A4,/* 0xB8-0xBF */
+	0x71D2,0x7526,0x758F,0x758E,0x7619,0x7B11,0x7BE0,0x7C2B,/* 0xC0-0xC7 */
+	0x7D20,0x7D39,0x852C,0x856D,0x8607,0x8A34,0x900D,0x9061,/* 0xC8-0xCF */
+	0x90B5,0x92B7,0x97F6,0x9A37,0x4FD7,0x5C6C,0x675F,0x6D91,/* 0xD0-0xD7 */
+	0x7C9F,0x7E8C,0x8B16,0x8D16,0x901F,0x5B6B,0x5DFD,0x640D,/* 0xD8-0xDF */
+	0x84C0,0x905C,0x98E1,0x7387,0x5B8B,0x609A,0x677E,0x6DDE,/* 0xE0-0xE7 */
+	0x8A1F,0x8AA6,0x9001,0x980C,0x5237,0xF970,0x7051,0x788E,/* 0xE8-0xEF */
+	0x9396,0x8870,0x91D7,0x4FEE,0x53D7,0x55FD,0x56DA,0x5782,/* 0xF0-0xF7 */
+	0x58FD,0x5AC2,0x5B88,0x5CAB,0x5CC0,0x5E25,0x6101,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x620D,0x624B,0x6388,0x641C,0x6536,0x6578,0x6A39,/* 0xA0-0xA7 */
+	0x6B8A,0x6C34,0x6D19,0x6F31,0x71E7,0x72E9,0x7378,0x7407,/* 0xA8-0xAF */
+	0x74B2,0x7626,0x7761,0x79C0,0x7A57,0x7AEA,0x7CB9,0x7D8F,/* 0xB0-0xB7 */
+	0x7DAC,0x7E61,0x7F9E,0x8129,0x8331,0x8490,0x84DA,0x85EA,/* 0xB8-0xBF */
+	0x8896,0x8AB0,0x8B90,0x8F38,0x9042,0x9083,0x916C,0x9296,/* 0xC0-0xC7 */
+	0x92B9,0x968B,0x96A7,0x96A8,0x96D6,0x9700,0x9808,0x9996,/* 0xC8-0xCF */
+	0x9AD3,0x9B1A,0x53D4,0x587E,0x5919,0x5B70,0x5BBF,0x6DD1,/* 0xD0-0xD7 */
+	0x6F5A,0x719F,0x7421,0x74B9,0x8085,0x83FD,0x5DE1,0x5F87,/* 0xD8-0xDF */
+	0x5FAA,0x6042,0x65EC,0x6812,0x696F,0x6A53,0x6B89,0x6D35,/* 0xE0-0xE7 */
+	0x6DF3,0x73E3,0x76FE,0x77AC,0x7B4D,0x7D14,0x8123,0x821C,/* 0xE8-0xEF */
+	0x8340,0x84F4,0x8563,0x8A62,0x8AC4,0x9187,0x931E,0x9806,/* 0xF0-0xF7 */
+	0x99B4,0x620C,0x8853,0x8FF0,0x9265,0x5D07,0x5D27,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5D69,0x745F,0x819D,0x8768,0x6FD5,0x62FE,0x7FD2,/* 0xA0-0xA7 */
+	0x8936,0x8972,0x4E1E,0x4E58,0x50E7,0x52DD,0x5347,0x627F,/* 0xA8-0xAF */
+	0x6607,0x7E69,0x8805,0x965E,0x4F8D,0x5319,0x5636,0x59CB,/* 0xB0-0xB7 */
+	0x5AA4,0x5C38,0x5C4E,0x5C4D,0x5E02,0x5F11,0x6043,0x65BD,/* 0xB8-0xBF */
+	0x662F,0x6642,0x67BE,0x67F4,0x731C,0x77E2,0x793A,0x7FC5,/* 0xC0-0xC7 */
+	0x8494,0x84CD,0x8996,0x8A66,0x8A69,0x8AE1,0x8C55,0x8C7A,/* 0xC8-0xCF */
+	0x57F4,0x5BD4,0x5F0F,0x606F,0x62ED,0x690D,0x6B96,0x6E5C,/* 0xD0-0xD7 */
+	0x7184,0x7BD2,0x8755,0x8B58,0x8EFE,0x98DF,0x98FE,0x4F38,/* 0xD8-0xDF */
+	0x4F81,0x4FE1,0x547B,0x5A20,0x5BB8,0x613C,0x65B0,0x6668,/* 0xE0-0xE7 */
+	0x71FC,0x7533,0x795E,0x7D33,0x814E,0x81E3,0x8398,0x85AA,/* 0xE8-0xEF */
+	0x85CE,0x8703,0x8A0A,0x8EAB,0x8F9B,0xF971,0x8FC5,0x5931,/* 0xF0-0xF7 */
+	0x5BA4,0x5BE6,0x6089,0x5BE9,0x5C0B,0x5FC3,0x6C81,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xF972,0x6DF1,0x700B,0x751A,0x82AF,0x8AF6,0x4EC0,/* 0xA0-0xA7 */
+	0x5341,0xF973,0x96D9,0x6C0F,0x4E9E,0x4FC4,0x5152,0x555E,/* 0xA8-0xAF */
+	0x5A25,0x5CE8,0x6211,0x7259,0x82BD,0x83AA,0x86FE,0x8859,/* 0xB0-0xB7 */
+	0x8A1D,0x963F,0x96C5,0x9913,0x9D09,0x9D5D,0x580A,0x5CB3,/* 0xB8-0xBF */
+	0x5DBD,0x5E44,0x60E1,0x6115,0x63E1,0x6A02,0x6E25,0x9102,/* 0xC0-0xC7 */
+	0x9354,0x984E,0x9C10,0x9F77,0x5B89,0x5CB8,0x6309,0x664F,/* 0xC8-0xCF */
+	0x6848,0x773C,0x96C1,0x978D,0x9854,0x9B9F,0x65A1,0x8B01,/* 0xD0-0xD7 */
+	0x8ECB,0x95BC,0x5535,0x5CA9,0x5DD6,0x5EB5,0x6697,0x764C,/* 0xD8-0xDF */
+	0x83F4,0x95C7,0x58D3,0x62BC,0x72CE,0x9D28,0x4EF0,0x592E,/* 0xE0-0xE7 */
+	0x600F,0x663B,0x6B83,0x79E7,0x9D26,0x5393,0x54C0,0x57C3,/* 0xE8-0xEF */
+	0x5D16,0x611B,0x66D6,0x6DAF,0x788D,0x827E,0x9698,0x9744,/* 0xF0-0xF7 */
+	0x5384,0x627C,0x6396,0x6DB2,0x7E0A,0x814B,0x984D,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6AFB,0x7F4C,0x9DAF,0x9E1A,0x4E5F,0x503B,0x51B6,/* 0xA0-0xA7 */
+	0x591C,0x60F9,0x63F6,0x6930,0x723A,0x8036,0xF974,0x91CE,/* 0xA8-0xAF */
+	0x5F31,0xF975,0xF976,0x7D04,0x82E5,0x846F,0x84BB,0x85E5,/* 0xB0-0xB7 */
+	0x8E8D,0xF977,0x4F6F,0xF978,0xF979,0x58E4,0x5B43,0x6059,/* 0xB8-0xBF */
+	0x63DA,0x6518,0x656D,0x6698,0xF97A,0x694A,0x6A23,0x6D0B,/* 0xC0-0xC7 */
+	0x7001,0x716C,0x75D2,0x760D,0x79B3,0x7A70,0xF97B,0x7F8A,/* 0xC8-0xCF */
+	0xF97C,0x8944,0xF97D,0x8B93,0x91C0,0x967D,0xF97E,0x990A,/* 0xD0-0xD7 */
+	0x5704,0x5FA1,0x65BC,0x6F01,0x7600,0x79A6,0x8A9E,0x99AD,/* 0xD8-0xDF */
+	0x9B5A,0x9F6C,0x5104,0x61B6,0x6291,0x6A8D,0x81C6,0x5043,/* 0xE0-0xE7 */
+	0x5830,0x5F66,0x7109,0x8A00,0x8AFA,0x5B7C,0x8616,0x4FFA,/* 0xE8-0xEF */
+	0x513C,0x56B4,0x5944,0x63A9,0x6DF9,0x5DAA,0x696D,0x5186,/* 0xF0-0xF7 */
+	0x4E88,0x4F59,0xF97F,0xF980,0xF981,0x5982,0xF982,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xF983,0x6B5F,0x6C5D,0xF984,0x74B5,0x7916,0xF985,/* 0xA0-0xA7 */
+	0x8207,0x8245,0x8339,0x8F3F,0x8F5D,0xF986,0x9918,0xF987,/* 0xA8-0xAF */
+	0xF988,0xF989,0x4EA6,0xF98A,0x57DF,0x5F79,0x6613,0xF98B,/* 0xB0-0xB7 */
+	0xF98C,0x75AB,0x7E79,0x8B6F,0xF98D,0x9006,0x9A5B,0x56A5,/* 0xB8-0xBF */
+	0x5827,0x59F8,0x5A1F,0x5BB4,0xF98E,0x5EF6,0xF98F,0xF990,/* 0xC0-0xC7 */
+	0x6350,0x633B,0xF991,0x693D,0x6C87,0x6CBF,0x6D8E,0x6D93,/* 0xC8-0xCF */
+	0x6DF5,0x6F14,0xF992,0x70DF,0x7136,0x7159,0xF993,0x71C3,/* 0xD0-0xD7 */
+	0x71D5,0xF994,0x784F,0x786F,0xF995,0x7B75,0x7DE3,0xF996,/* 0xD8-0xDF */
+	0x7E2F,0xF997,0x884D,0x8EDF,0xF998,0xF999,0xF99A,0x925B,/* 0xE0-0xE7 */
+	0xF99B,0x9CF6,0xF99C,0xF99D,0xF99E,0x6085,0x6D85,0xF99F,/* 0xE8-0xEF */
+	0x71B1,0xF9A0,0xF9A1,0x95B1,0x53AD,0xF9A2,0xF9A3,0xF9A4,/* 0xF0-0xF7 */
+	0x67D3,0xF9A5,0x708E,0x7130,0x7430,0x8276,0x82D2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xF9A6,0x95BB,0x9AE5,0x9E7D,0x66C4,0xF9A7,0x71C1,/* 0xA0-0xA7 */
+	0x8449,0xF9A8,0xF9A9,0x584B,0xF9AA,0xF9AB,0x5DB8,0x5F71,/* 0xA8-0xAF */
+	0xF9AC,0x6620,0x668E,0x6979,0x69AE,0x6C38,0x6CF3,0x6E36,/* 0xB0-0xB7 */
+	0x6F41,0x6FDA,0x701B,0x702F,0x7150,0x71DF,0x7370,0xF9AD,/* 0xB8-0xBF */
+	0x745B,0xF9AE,0x74D4,0x76C8,0x7A4E,0x7E93,0xF9AF,0xF9B0,/* 0xC0-0xC7 */
+	0x82F1,0x8A60,0x8FCE,0xF9B1,0x9348,0xF9B2,0x9719,0xF9B3,/* 0xC8-0xCF */
+	0xF9B4,0x4E42,0x502A,0xF9B5,0x5208,0x53E1,0x66F3,0x6C6D,/* 0xD0-0xD7 */
+	0x6FCA,0x730A,0x777F,0x7A62,0x82AE,0x85DD,0x8602,0xF9B6,/* 0xD8-0xDF */
+	0x88D4,0x8A63,0x8B7D,0x8C6B,0xF9B7,0x92B3,0xF9B8,0x9713,/* 0xE0-0xE7 */
+	0x9810,0x4E94,0x4F0D,0x4FC9,0x50B2,0x5348,0x543E,0x5433,/* 0xE8-0xEF */
+	0x55DA,0x5862,0x58BA,0x5967,0x5A1B,0x5BE4,0x609F,0xF9B9,/* 0xF0-0xF7 */
+	0x61CA,0x6556,0x65FF,0x6664,0x68A7,0x6C5A,0x6FB3,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x70CF,0x71AC,0x7352,0x7B7D,0x8708,0x8AA4,0x9C32,/* 0xA0-0xA7 */
+	0x9F07,0x5C4B,0x6C83,0x7344,0x7389,0x923A,0x6EAB,0x7465,/* 0xA8-0xAF */
+	0x761F,0x7A69,0x7E15,0x860A,0x5140,0x58C5,0x64C1,0x74EE,/* 0xB0-0xB7 */
+	0x7515,0x7670,0x7FC1,0x9095,0x96CD,0x9954,0x6E26,0x74E6,/* 0xB8-0xBF */
+	0x7AA9,0x7AAA,0x81E5,0x86D9,0x8778,0x8A1B,0x5A49,0x5B8C,/* 0xC0-0xC7 */
+	0x5B9B,0x68A1,0x6900,0x6D63,0x73A9,0x7413,0x742C,0x7897,/* 0xC8-0xCF */
+	0x7DE9,0x7FEB,0x8118,0x8155,0x839E,0x8C4C,0x962E,0x9811,/* 0xD0-0xD7 */
+	0x66F0,0x5F80,0x65FA,0x6789,0x6C6A,0x738B,0x502D,0x5A03,/* 0xD8-0xDF */
+	0x6B6A,0x77EE,0x5916,0x5D6C,0x5DCD,0x7325,0x754F,0xF9BA,/* 0xE0-0xE7 */
+	0xF9BB,0x50E5,0x51F9,0x582F,0x592D,0x5996,0x59DA,0x5BE5,/* 0xE8-0xEF */
+	0xF9BC,0xF9BD,0x5DA2,0x62D7,0x6416,0x6493,0x64FE,0xF9BE,/* 0xF0-0xF7 */
+	0x66DC,0xF9BF,0x6A48,0xF9C0,0x71FF,0x7464,0xF9C1,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7A88,0x7AAF,0x7E47,0x7E5E,0x8000,0x8170,0xF9C2,/* 0xA0-0xA7 */
+	0x87EF,0x8981,0x8B20,0x9059,0xF9C3,0x9080,0x9952,0x617E,/* 0xA8-0xAF */
+	0x6B32,0x6D74,0x7E1F,0x8925,0x8FB1,0x4FD1,0x50AD,0x5197,/* 0xB0-0xB7 */
+	0x52C7,0x57C7,0x5889,0x5BB9,0x5EB8,0x6142,0x6995,0x6D8C,/* 0xB8-0xBF */
+	0x6E67,0x6EB6,0x7194,0x7462,0x7528,0x752C,0x8073,0x8338,/* 0xC0-0xC7 */
+	0x84C9,0x8E0A,0x9394,0x93DE,0xF9C4,0x4E8E,0x4F51,0x5076,/* 0xC8-0xCF */
+	0x512A,0x53C8,0x53CB,0x53F3,0x5B87,0x5BD3,0x5C24,0x611A,/* 0xD0-0xD7 */
+	0x6182,0x65F4,0x725B,0x7397,0x7440,0x76C2,0x7950,0x7991,/* 0xD8-0xDF */
+	0x79B9,0x7D06,0x7FBD,0x828B,0x85D5,0x865E,0x8FC2,0x9047,/* 0xE0-0xE7 */
+	0x90F5,0x91EA,0x9685,0x96E8,0x96E9,0x52D6,0x5F67,0x65ED,/* 0xE8-0xEF */
+	0x6631,0x682F,0x715C,0x7A36,0x90C1,0x980A,0x4E91,0xF9C5,/* 0xF0-0xF7 */
+	0x6A52,0x6B9E,0x6F90,0x7189,0x8018,0x82B8,0x8553,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x904B,0x9695,0x96F2,0x97FB,0x851A,0x9B31,0x4E90,/* 0xA0-0xA7 */
+	0x718A,0x96C4,0x5143,0x539F,0x54E1,0x5713,0x5712,0x57A3,/* 0xA8-0xAF */
+	0x5A9B,0x5AC4,0x5BC3,0x6028,0x613F,0x63F4,0x6C85,0x6D39,/* 0xB0-0xB7 */
+	0x6E72,0x6E90,0x7230,0x733F,0x7457,0x82D1,0x8881,0x8F45,/* 0xB8-0xBF */
+	0x9060,0xF9C6,0x9662,0x9858,0x9D1B,0x6708,0x8D8A,0x925E,/* 0xC0-0xC7 */
+	0x4F4D,0x5049,0x50DE,0x5371,0x570D,0x59D4,0x5A01,0x5C09,/* 0xC8-0xCF */
+	0x6170,0x6690,0x6E2D,0x7232,0x744B,0x7DEF,0x80C3,0x840E,/* 0xD0-0xD7 */
+	0x8466,0x853F,0x875F,0x885B,0x8918,0x8B02,0x9055,0x97CB,/* 0xD8-0xDF */
+	0x9B4F,0x4E73,0x4F91,0x5112,0x516A,0xF9C7,0x552F,0x55A9,/* 0xE0-0xE7 */
+	0x5B7A,0x5BA5,0x5E7C,0x5E7D,0x5EBE,0x60A0,0x60DF,0x6108,/* 0xE8-0xEF */
+	0x6109,0x63C4,0x6538,0x6709,0xF9C8,0x67D4,0x67DA,0xF9C9,/* 0xF0-0xF7 */
+	0x6961,0x6962,0x6CB9,0x6D27,0xF9CA,0x6E38,0xF9CB,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6FE1,0x7336,0x7337,0xF9CC,0x745C,0x7531,0xF9CD,/* 0xA0-0xA7 */
+	0x7652,0xF9CE,0xF9CF,0x7DAD,0x81FE,0x8438,0x88D5,0x8A98,/* 0xA8-0xAF */
+	0x8ADB,0x8AED,0x8E30,0x8E42,0x904A,0x903E,0x907A,0x9149,/* 0xB0-0xB7 */
+	0x91C9,0x936E,0xF9D0,0xF9D1,0x5809,0xF9D2,0x6BD3,0x8089,/* 0xB8-0xBF */
+	0x80B2,0xF9D3,0xF9D4,0x5141,0x596B,0x5C39,0xF9D5,0xF9D6,/* 0xC0-0xC7 */
+	0x6F64,0x73A7,0x80E4,0x8D07,0xF9D7,0x9217,0x958F,0xF9D8,/* 0xC8-0xCF */
+	0xF9D9,0xF9DA,0xF9DB,0x807F,0x620E,0x701C,0x7D68,0x878D,/* 0xD0-0xD7 */
+	0xF9DC,0x57A0,0x6069,0x6147,0x6BB7,0x8ABE,0x9280,0x96B1,/* 0xD8-0xDF */
+	0x4E59,0x541F,0x6DEB,0x852D,0x9670,0x97F3,0x98EE,0x63D6,/* 0xE0-0xE7 */
+	0x6CE3,0x9091,0x51DD,0x61C9,0x81BA,0x9DF9,0x4F9D,0x501A,/* 0xE8-0xEF */
+	0x5100,0x5B9C,0x610F,0x61FF,0x64EC,0x6905,0x6BC5,0x7591,/* 0xF0-0xF7 */
+	0x77E3,0x7FA9,0x8264,0x858F,0x87FB,0x8863,0x8ABC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8B70,0x91AB,0x4E8C,0x4EE5,0x4F0A,0xF9DD,0xF9DE,/* 0xA0-0xA7 */
+	0x5937,0x59E8,0xF9DF,0x5DF2,0x5F1B,0x5F5B,0x6021,0xF9E0,/* 0xA8-0xAF */
+	0xF9E1,0xF9E2,0xF9E3,0x723E,0x73E5,0xF9E4,0x7570,0x75CD,/* 0xB0-0xB7 */
+	0xF9E5,0x79FB,0xF9E6,0x800C,0x8033,0x8084,0x82E1,0x8351,/* 0xB8-0xBF */
+	0xF9E7,0xF9E8,0x8CBD,0x8CB3,0x9087,0xF9E9,0xF9EA,0x98F4,/* 0xC0-0xC7 */
+	0x990C,0xF9EB,0xF9EC,0x7037,0x76CA,0x7FCA,0x7FCC,0x7FFC,/* 0xC8-0xCF */
+	0x8B1A,0x4EBA,0x4EC1,0x5203,0x5370,0xF9ED,0x54BD,0x56E0,/* 0xD0-0xD7 */
+	0x59FB,0x5BC5,0x5F15,0x5FCD,0x6E6E,0xF9EE,0xF9EF,0x7D6A,/* 0xD8-0xDF */
+	0x8335,0xF9F0,0x8693,0x8A8D,0xF9F1,0x976D,0x9777,0xF9F2,/* 0xE0-0xE7 */
+	0xF9F3,0x4E00,0x4F5A,0x4F7E,0x58F9,0x65E5,0x6EA2,0x9038,/* 0xE8-0xEF */
+	0x93B0,0x99B9,0x4EFB,0x58EC,0x598A,0x59D9,0x6041,0xF9F4,/* 0xF0-0xF7 */
+	0xF9F5,0x7A14,0xF9F6,0x834F,0x8CC3,0x5165,0x5344,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_ED[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xF9F7,0xF9F8,0xF9F9,0x4ECD,0x5269,0x5B55,0x82BF,/* 0xA0-0xA7 */
+	0x4ED4,0x523A,0x54A8,0x59C9,0x59FF,0x5B50,0x5B57,0x5B5C,/* 0xA8-0xAF */
+	0x6063,0x6148,0x6ECB,0x7099,0x716E,0x7386,0x74F7,0x75B5,/* 0xB0-0xB7 */
+	0x78C1,0x7D2B,0x8005,0x81EA,0x8328,0x8517,0x85C9,0x8AEE,/* 0xB8-0xBF */
+	0x8CC7,0x96CC,0x4F5C,0x52FA,0x56BC,0x65AB,0x6628,0x707C,/* 0xC0-0xC7 */
+	0x70B8,0x7235,0x7DBD,0x828D,0x914C,0x96C0,0x9D72,0x5B71,/* 0xC8-0xCF */
+	0x68E7,0x6B98,0x6F7A,0x76DE,0x5C91,0x66AB,0x6F5B,0x7BB4,/* 0xD0-0xD7 */
+	0x7C2A,0x8836,0x96DC,0x4E08,0x4ED7,0x5320,0x5834,0x58BB,/* 0xD8-0xDF */
+	0x58EF,0x596C,0x5C07,0x5E33,0x5E84,0x5F35,0x638C,0x66B2,/* 0xE0-0xE7 */
+	0x6756,0x6A1F,0x6AA3,0x6B0C,0x6F3F,0x7246,0xF9FA,0x7350,/* 0xE8-0xEF */
+	0x748B,0x7AE0,0x7CA7,0x8178,0x81DF,0x81E7,0x838A,0x846C,/* 0xF0-0xF7 */
+	0x8523,0x8594,0x85CF,0x88DD,0x8D13,0x91AC,0x9577,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x969C,0x518D,0x54C9,0x5728,0x5BB0,0x624D,0x6750,/* 0xA0-0xA7 */
+	0x683D,0x6893,0x6E3D,0x6ED3,0x707D,0x7E21,0x88C1,0x8CA1,/* 0xA8-0xAF */
+	0x8F09,0x9F4B,0x9F4E,0x722D,0x7B8F,0x8ACD,0x931A,0x4F47,/* 0xB0-0xB7 */
+	0x4F4E,0x5132,0x5480,0x59D0,0x5E95,0x62B5,0x6775,0x696E,/* 0xB8-0xBF */
+	0x6A17,0x6CAE,0x6E1A,0x72D9,0x732A,0x75BD,0x7BB8,0x7D35,/* 0xC0-0xC7 */
+	0x82E7,0x83F9,0x8457,0x85F7,0x8A5B,0x8CAF,0x8E87,0x9019,/* 0xC8-0xCF */
+	0x90B8,0x96CE,0x9F5F,0x52E3,0x540A,0x5AE1,0x5BC2,0x6458,/* 0xD0-0xD7 */
+	0x6575,0x6EF4,0x72C4,0xF9FB,0x7684,0x7A4D,0x7B1B,0x7C4D,/* 0xD8-0xDF */
+	0x7E3E,0x7FDF,0x837B,0x8B2B,0x8CCA,0x8D64,0x8DE1,0x8E5F,/* 0xE0-0xE7 */
+	0x8FEA,0x8FF9,0x9069,0x93D1,0x4F43,0x4F7A,0x50B3,0x5168,/* 0xE8-0xEF */
+	0x5178,0x524D,0x526A,0x5861,0x587C,0x5960,0x5C08,0x5C55,/* 0xF0-0xF7 */
+	0x5EDB,0x609B,0x6230,0x6813,0x6BBF,0x6C08,0x6FB1,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x714E,0x7420,0x7530,0x7538,0x7551,0x7672,0x7B4C,/* 0xA0-0xA7 */
+	0x7B8B,0x7BAD,0x7BC6,0x7E8F,0x8A6E,0x8F3E,0x8F49,0x923F,/* 0xA8-0xAF */
+	0x9293,0x9322,0x942B,0x96FB,0x985A,0x986B,0x991E,0x5207,/* 0xB0-0xB7 */
+	0x622A,0x6298,0x6D59,0x7664,0x7ACA,0x7BC0,0x7D76,0x5360,/* 0xB8-0xBF */
+	0x5CBE,0x5E97,0x6F38,0x70B9,0x7C98,0x9711,0x9B8E,0x9EDE,/* 0xC0-0xC7 */
+	0x63A5,0x647A,0x8776,0x4E01,0x4E95,0x4EAD,0x505C,0x5075,/* 0xC8-0xCF */
+	0x5448,0x59C3,0x5B9A,0x5E40,0x5EAD,0x5EF7,0x5F81,0x60C5,/* 0xD0-0xD7 */
+	0x633A,0x653F,0x6574,0x65CC,0x6676,0x6678,0x67FE,0x6968,/* 0xD8-0xDF */
+	0x6A89,0x6B63,0x6C40,0x6DC0,0x6DE8,0x6E1F,0x6E5E,0x701E,/* 0xE0-0xE7 */
+	0x70A1,0x738E,0x73FD,0x753A,0x775B,0x7887,0x798E,0x7A0B,/* 0xE8-0xEF */
+	0x7A7D,0x7CBE,0x7D8E,0x8247,0x8A02,0x8AEA,0x8C9E,0x912D,/* 0xF0-0xF7 */
+	0x914A,0x91D8,0x9266,0x92CC,0x9320,0x9706,0x9756,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x975C,0x9802,0x9F0E,0x5236,0x5291,0x557C,0x5824,/* 0xA0-0xA7 */
+	0x5E1D,0x5F1F,0x608C,0x63D0,0x68AF,0x6FDF,0x796D,0x7B2C,/* 0xA8-0xAF */
+	0x81CD,0x85BA,0x88FD,0x8AF8,0x8E44,0x918D,0x9664,0x969B,/* 0xB0-0xB7 */
+	0x973D,0x984C,0x9F4A,0x4FCE,0x5146,0x51CB,0x52A9,0x5632,/* 0xB8-0xBF */
+	0x5F14,0x5F6B,0x63AA,0x64CD,0x65E9,0x6641,0x66FA,0x66F9,/* 0xC0-0xC7 */
+	0x671D,0x689D,0x68D7,0x69FD,0x6F15,0x6F6E,0x7167,0x71E5,/* 0xC8-0xCF */
+	0x722A,0x74AA,0x773A,0x7956,0x795A,0x79DF,0x7A20,0x7A95,/* 0xD0-0xD7 */
+	0x7C97,0x7CDF,0x7D44,0x7E70,0x8087,0x85FB,0x86A4,0x8A54,/* 0xD8-0xDF */
+	0x8ABF,0x8D99,0x8E81,0x9020,0x906D,0x91E3,0x963B,0x96D5,/* 0xE0-0xE7 */
+	0x9CE5,0x65CF,0x7C07,0x8DB3,0x93C3,0x5B58,0x5C0A,0x5352,/* 0xE8-0xEF */
+	0x62D9,0x731D,0x5027,0x5B97,0x5F9E,0x60B0,0x616B,0x68D5,/* 0xF0-0xF7 */
+	0x6DD9,0x742E,0x7A2E,0x7D42,0x7D9C,0x7E31,0x816B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8E2A,0x8E35,0x937E,0x9418,0x4F50,0x5750,0x5DE6,/* 0xA0-0xA7 */
+	0x5EA7,0x632B,0x7F6A,0x4E3B,0x4F4F,0x4F8F,0x505A,0x59DD,/* 0xA8-0xAF */
+	0x80C4,0x546A,0x5468,0x55FE,0x594F,0x5B99,0x5DDE,0x5EDA,/* 0xB0-0xB7 */
+	0x665D,0x6731,0x67F1,0x682A,0x6CE8,0x6D32,0x6E4A,0x6F8D,/* 0xB8-0xBF */
+	0x70B7,0x73E0,0x7587,0x7C4C,0x7D02,0x7D2C,0x7DA2,0x821F,/* 0xC0-0xC7 */
+	0x86DB,0x8A3B,0x8A85,0x8D70,0x8E8A,0x8F33,0x9031,0x914E,/* 0xC8-0xCF */
+	0x9152,0x9444,0x99D0,0x7AF9,0x7CA5,0x4FCA,0x5101,0x51C6,/* 0xD0-0xD7 */
+	0x57C8,0x5BEF,0x5CFB,0x6659,0x6A3D,0x6D5A,0x6E96,0x6FEC,/* 0xD8-0xDF */
+	0x710C,0x756F,0x7AE3,0x8822,0x9021,0x9075,0x96CB,0x99FF,/* 0xE0-0xE7 */
+	0x8301,0x4E2D,0x4EF2,0x8846,0x91CD,0x537D,0x6ADB,0x696B,/* 0xE8-0xEF */
+	0x6C41,0x847A,0x589E,0x618E,0x66FE,0x62EF,0x70DD,0x7511,/* 0xF0-0xF7 */
+	0x75C7,0x7E52,0x84B8,0x8B49,0x8D08,0x4E4B,0x53EA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x54AB,0x5730,0x5740,0x5FD7,0x6301,0x6307,0x646F,/* 0xA0-0xA7 */
+	0x652F,0x65E8,0x667A,0x679D,0x67B3,0x6B62,0x6C60,0x6C9A,/* 0xA8-0xAF */
+	0x6F2C,0x77E5,0x7825,0x7949,0x7957,0x7D19,0x80A2,0x8102,/* 0xB0-0xB7 */
+	0x81F3,0x829D,0x82B7,0x8718,0x8A8C,0xF9FC,0x8D04,0x8DBE,/* 0xB8-0xBF */
+	0x9072,0x76F4,0x7A19,0x7A37,0x7E54,0x8077,0x5507,0x55D4,/* 0xC0-0xC7 */
+	0x5875,0x632F,0x6422,0x6649,0x664B,0x686D,0x699B,0x6B84,/* 0xC8-0xCF */
+	0x6D25,0x6EB1,0x73CD,0x7468,0x74A1,0x755B,0x75B9,0x76E1,/* 0xD0-0xD7 */
+	0x771E,0x778B,0x79E6,0x7E09,0x7E1D,0x81FB,0x852F,0x8897,/* 0xD8-0xDF */
+	0x8A3A,0x8CD1,0x8EEB,0x8FB0,0x9032,0x93AD,0x9663,0x9673,/* 0xE0-0xE7 */
+	0x9707,0x4F84,0x53F1,0x59EA,0x5AC9,0x5E19,0x684E,0x74C6,/* 0xE8-0xEF */
+	0x75BE,0x79E9,0x7A92,0x81A3,0x86ED,0x8CEA,0x8DCC,0x8FED,/* 0xF0-0xF7 */
+	0x659F,0x6715,0xF9FD,0x57F7,0x6F57,0x7DDD,0x8F2F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x93F6,0x96C6,0x5FB5,0x61F2,0x6F84,0x4E14,0x4F98,/* 0xA0-0xA7 */
+	0x501F,0x53C9,0x55DF,0x5D6F,0x5DEE,0x6B21,0x6B64,0x78CB,/* 0xA8-0xAF */
+	0x7B9A,0xF9FE,0x8E49,0x8ECA,0x906E,0x6349,0x643E,0x7740,/* 0xB0-0xB7 */
+	0x7A84,0x932F,0x947F,0x9F6A,0x64B0,0x6FAF,0x71E6,0x74A8,/* 0xB8-0xBF */
+	0x74DA,0x7AC4,0x7C12,0x7E82,0x7CB2,0x7E98,0x8B9A,0x8D0A,/* 0xC0-0xC7 */
+	0x947D,0x9910,0x994C,0x5239,0x5BDF,0x64E6,0x672D,0x7D2E,/* 0xC8-0xCF */
+	0x50ED,0x53C3,0x5879,0x6158,0x6159,0x61FA,0x65AC,0x7AD9,/* 0xD0-0xD7 */
+	0x8B92,0x8B96,0x5009,0x5021,0x5275,0x5531,0x5A3C,0x5EE0,/* 0xD8-0xDF */
+	0x5F70,0x6134,0x655E,0x660C,0x6636,0x66A2,0x69CD,0x6EC4,/* 0xE0-0xE7 */
+	0x6F32,0x7316,0x7621,0x7A93,0x8139,0x8259,0x83D6,0x84BC,/* 0xE8-0xEF */
+	0x50B5,0x57F0,0x5BC0,0x5BE8,0x5F69,0x63A1,0x7826,0x7DB5,/* 0xF0-0xF7 */
+	0x83DC,0x8521,0x91C7,0x91F5,0x518A,0x67F5,0x7B56,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8CAC,0x51C4,0x59BB,0x60BD,0x8655,0x501C,0xF9FF,/* 0xA0-0xA7 */
+	0x5254,0x5C3A,0x617D,0x621A,0x62D3,0x64F2,0x65A5,0x6ECC,/* 0xA8-0xAF */
+	0x7620,0x810A,0x8E60,0x965F,0x96BB,0x4EDF,0x5343,0x5598,/* 0xB0-0xB7 */
+	0x5929,0x5DDD,0x64C5,0x6CC9,0x6DFA,0x7394,0x7A7F,0x821B,/* 0xB8-0xBF */
+	0x85A6,0x8CE4,0x8E10,0x9077,0x91E7,0x95E1,0x9621,0x97C6,/* 0xC0-0xC7 */
+	0x51F8,0x54F2,0x5586,0x5FB9,0x64A4,0x6F88,0x7DB4,0x8F1F,/* 0xC8-0xCF */
+	0x8F4D,0x9435,0x50C9,0x5C16,0x6CBE,0x6DFB,0x751B,0x77BB,/* 0xD0-0xD7 */
+	0x7C3D,0x7C64,0x8A79,0x8AC2,0x581E,0x59BE,0x5E16,0x6377,/* 0xD8-0xDF */
+	0x7252,0x758A,0x776B,0x8ADC,0x8CBC,0x8F12,0x5EF3,0x6674,/* 0xE0-0xE7 */
+	0x6DF8,0x807D,0x83C1,0x8ACB,0x9751,0x9BD6,0xFA00,0x5243,/* 0xE8-0xEF */
+	0x66FF,0x6D95,0x6EEF,0x7DE0,0x8AE6,0x902E,0x905E,0x9AD4,/* 0xF0-0xF7 */
+	0x521D,0x527F,0x54E8,0x6194,0x6284,0x62DB,0x68A2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6912,0x695A,0x6A35,0x7092,0x7126,0x785D,0x7901,/* 0xA0-0xA7 */
+	0x790E,0x79D2,0x7A0D,0x8096,0x8278,0x82D5,0x8349,0x8549,/* 0xA8-0xAF */
+	0x8C82,0x8D85,0x9162,0x918B,0x91AE,0x4FC3,0x56D1,0x71ED,/* 0xB0-0xB7 */
+	0x77D7,0x8700,0x89F8,0x5BF8,0x5FD6,0x6751,0x90A8,0x53E2,/* 0xB8-0xBF */
+	0x585A,0x5BF5,0x60A4,0x6181,0x6460,0x7E3D,0x8070,0x8525,/* 0xC0-0xC7 */
+	0x9283,0x64AE,0x50AC,0x5D14,0x6700,0x589C,0x62BD,0x63A8,/* 0xC8-0xCF */
+	0x690E,0x6978,0x6A1E,0x6E6B,0x76BA,0x79CB,0x82BB,0x8429,/* 0xD0-0xD7 */
+	0x8ACF,0x8DA8,0x8FFD,0x9112,0x914B,0x919C,0x9310,0x9318,/* 0xD8-0xDF */
+	0x939A,0x96DB,0x9A36,0x9C0D,0x4E11,0x755C,0x795D,0x7AFA,/* 0xE0-0xE7 */
+	0x7B51,0x7BC9,0x7E2E,0x84C4,0x8E59,0x8E74,0x8EF8,0x9010,/* 0xE8-0xEF */
+	0x6625,0x693F,0x7443,0x51FA,0x672E,0x9EDC,0x5145,0x5FE0,/* 0xF0-0xF7 */
+	0x6C96,0x87F2,0x885D,0x8877,0x60B4,0x81B5,0x8403,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8D05,0x53D6,0x5439,0x5634,0x5A36,0x5C31,0x708A,/* 0xA0-0xA7 */
+	0x7FE0,0x805A,0x8106,0x81ED,0x8DA3,0x9189,0x9A5F,0x9DF2,/* 0xA8-0xAF */
+	0x5074,0x4EC4,0x53A0,0x60FB,0x6E2C,0x5C64,0x4F88,0x5024,/* 0xB0-0xB7 */
+	0x55E4,0x5CD9,0x5E5F,0x6065,0x6894,0x6CBB,0x6DC4,0x71BE,/* 0xB8-0xBF */
+	0x75D4,0x75F4,0x7661,0x7A1A,0x7A49,0x7DC7,0x7DFB,0x7F6E,/* 0xC0-0xC7 */
+	0x81F4,0x86A9,0x8F1C,0x96C9,0x99B3,0x9F52,0x5247,0x52C5,/* 0xC8-0xCF */
+	0x98ED,0x89AA,0x4E03,0x67D2,0x6F06,0x4FB5,0x5BE2,0x6795,/* 0xD0-0xD7 */
+	0x6C88,0x6D78,0x741B,0x7827,0x91DD,0x937C,0x87C4,0x79E4,/* 0xD8-0xDF */
+	0x7A31,0x5FEB,0x4ED6,0x54A4,0x553E,0x58AE,0x59A5,0x60F0,/* 0xE0-0xE7 */
+	0x6253,0x62D6,0x6736,0x6955,0x8235,0x9640,0x99B1,0x99DD,/* 0xE8-0xEF */
+	0x502C,0x5353,0x5544,0x577C,0xFA01,0x6258,0xFA02,0x64E2,/* 0xF0-0xF7 */
+	0x666B,0x67DD,0x6FC1,0x6FEF,0x7422,0x7438,0x8A17,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9438,0x5451,0x5606,0x5766,0x5F48,0x619A,0x6B4E,/* 0xA0-0xA7 */
+	0x7058,0x70AD,0x7DBB,0x8A95,0x596A,0x812B,0x63A2,0x7708,/* 0xA8-0xAF */
+	0x803D,0x8CAA,0x5854,0x642D,0x69BB,0x5B95,0x5E11,0x6E6F,/* 0xB0-0xB7 */
+	0xFA03,0x8569,0x514C,0x53F0,0x592A,0x6020,0x614B,0x6B86,/* 0xB8-0xBF */
+	0x6C70,0x6CF0,0x7B1E,0x80CE,0x82D4,0x8DC6,0x90B0,0x98B1,/* 0xC0-0xC7 */
+	0xFA04,0x64C7,0x6FA4,0x6491,0x6504,0x514E,0x5410,0x571F,/* 0xC8-0xCF */
+	0x8A0E,0x615F,0x6876,0xFA05,0x75DB,0x7B52,0x7D71,0x901A,/* 0xD0-0xD7 */
+	0x5806,0x69CC,0x817F,0x892A,0x9000,0x9839,0x5078,0x5957,/* 0xD8-0xDF */
+	0x59AC,0x6295,0x900F,0x9B2A,0x615D,0x7279,0x95D6,0x5761,/* 0xE0-0xE7 */
+	0x5A46,0x5DF4,0x628A,0x64AD,0x64FA,0x6777,0x6CE2,0x6D3E,/* 0xE8-0xEF */
+	0x722C,0x7436,0x7834,0x7F77,0x82AD,0x8DDB,0x9817,0x5224,/* 0xF0-0xF7 */
+	0x5742,0x677F,0x7248,0x74E3,0x8CA9,0x8FA6,0x9211,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x962A,0x516B,0x53ED,0x634C,0x4F69,0x5504,0x6096,/* 0xA0-0xA7 */
+	0x6557,0x6C9B,0x6D7F,0x724C,0x72FD,0x7A17,0x8987,0x8C9D,/* 0xA8-0xAF */
+	0x5F6D,0x6F8E,0x70F9,0x81A8,0x610E,0x4FBF,0x504F,0x6241,/* 0xB0-0xB7 */
+	0x7247,0x7BC7,0x7DE8,0x7FE9,0x904D,0x97AD,0x9A19,0x8CB6,/* 0xB8-0xBF */
+	0x576A,0x5E73,0x67B0,0x840D,0x8A55,0x5420,0x5B16,0x5E63,/* 0xC0-0xC7 */
+	0x5EE2,0x5F0A,0x6583,0x80BA,0x853D,0x9589,0x965B,0x4F48,/* 0xC8-0xCF */
+	0x5305,0x530D,0x530F,0x5486,0x54FA,0x5703,0x5E03,0x6016,/* 0xD0-0xD7 */
+	0x629B,0x62B1,0x6355,0xFA06,0x6CE1,0x6D66,0x75B1,0x7832,/* 0xD8-0xDF */
+	0x80DE,0x812F,0x82DE,0x8461,0x84B2,0x888D,0x8912,0x900B,/* 0xE0-0xE7 */
+	0x92EA,0x98FD,0x9B91,0x5E45,0x66B4,0x66DD,0x7011,0x7206,/* 0xE8-0xEF */
+	0xFA07,0x4FF5,0x527D,0x5F6A,0x6153,0x6753,0x6A19,0x6F02,/* 0xF0-0xF7 */
+	0x74E2,0x7968,0x8868,0x8C79,0x98C7,0x98C4,0x9A43,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x54C1,0x7A1F,0x6953,0x8AF7,0x8C4A,0x98A8,0x99AE,/* 0xA0-0xA7 */
+	0x5F7C,0x62AB,0x75B2,0x76AE,0x88AB,0x907F,0x9642,0x5339,/* 0xA8-0xAF */
+	0x5F3C,0x5FC5,0x6CCC,0x73CC,0x7562,0x758B,0x7B46,0x82FE,/* 0xB0-0xB7 */
+	0x999D,0x4E4F,0x903C,0x4E0B,0x4F55,0x53A6,0x590F,0x5EC8,/* 0xB8-0xBF */
+	0x6630,0x6CB3,0x7455,0x8377,0x8766,0x8CC0,0x9050,0x971E,/* 0xC0-0xC7 */
+	0x9C15,0x58D1,0x5B78,0x8650,0x8B14,0x9DB4,0x5BD2,0x6068,/* 0xC8-0xCF */
+	0x608D,0x65F1,0x6C57,0x6F22,0x6FA3,0x701A,0x7F55,0x7FF0,/* 0xD0-0xD7 */
+	0x9591,0x9592,0x9650,0x97D3,0x5272,0x8F44,0x51FD,0x542B,/* 0xD8-0xDF */
+	0x54B8,0x5563,0x558A,0x6ABB,0x6DB5,0x7DD8,0x8266,0x929C,/* 0xE0-0xE7 */
+	0x9677,0x9E79,0x5408,0x54C8,0x76D2,0x86E4,0x95A4,0x95D4,/* 0xE8-0xEF */
+	0x965C,0x4EA2,0x4F09,0x59EE,0x5AE6,0x5DF7,0x6052,0x6297,/* 0xF0-0xF7 */
+	0x676D,0x6841,0x6C86,0x6E2F,0x7F38,0x809B,0x822A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_FA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xFA08,0xFA09,0x9805,0x4EA5,0x5055,0x54B3,0x5793,/* 0xA0-0xA7 */
+	0x595A,0x5B69,0x5BB3,0x61C8,0x6977,0x6D77,0x7023,0x87F9,/* 0xA8-0xAF */
+	0x89E3,0x8A72,0x8AE7,0x9082,0x99ED,0x9AB8,0x52BE,0x6838,/* 0xB0-0xB7 */
+	0x5016,0x5E78,0x674F,0x8347,0x884C,0x4EAB,0x5411,0x56AE,/* 0xB8-0xBF */
+	0x73E6,0x9115,0x97FF,0x9909,0x9957,0x9999,0x5653,0x589F,/* 0xC0-0xC7 */
+	0x865B,0x8A31,0x61B2,0x6AF6,0x737B,0x8ED2,0x6B47,0x96AA,/* 0xC8-0xCF */
+	0x9A57,0x5955,0x7200,0x8D6B,0x9769,0x4FD4,0x5CF4,0x5F26,/* 0xD0-0xD7 */
+	0x61F8,0x665B,0x6CEB,0x70AB,0x7384,0x73B9,0x73FE,0x7729,/* 0xD8-0xDF */
+	0x774D,0x7D43,0x7D62,0x7E23,0x8237,0x8852,0xFA0A,0x8CE2,/* 0xE0-0xE7 */
+	0x9249,0x986F,0x5B51,0x7A74,0x8840,0x9801,0x5ACC,0x4FE0,/* 0xE8-0xEF */
+	0x5354,0x593E,0x5CFD,0x633E,0x6D79,0x72F9,0x8105,0x8107,/* 0xF0-0xF7 */
+	0x83A2,0x92CF,0x9830,0x4EA8,0x5144,0x5211,0x578B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_FB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5F62,0x6CC2,0x6ECE,0x7005,0x7050,0x70AF,0x7192,/* 0xA0-0xA7 */
+	0x73E9,0x7469,0x834A,0x87A2,0x8861,0x9008,0x90A2,0x93A3,/* 0xA8-0xAF */
+	0x99A8,0x516E,0x5F57,0x60E0,0x6167,0x66B3,0x8559,0x8E4A,/* 0xB0-0xB7 */
+	0x91AF,0x978B,0x4E4E,0x4E92,0x547C,0x58D5,0x58FA,0x597D,/* 0xB8-0xBF */
+	0x5CB5,0x5F27,0x6236,0x6248,0x660A,0x6667,0x6BEB,0x6D69,/* 0xC0-0xC7 */
+	0x6DCF,0x6E56,0x6EF8,0x6F94,0x6FE0,0x6FE9,0x705D,0x72D0,/* 0xC8-0xCF */
+	0x7425,0x745A,0x74E0,0x7693,0x795C,0x7CCA,0x7E1E,0x80E1,/* 0xD0-0xD7 */
+	0x82A6,0x846B,0x84BF,0x864E,0x865F,0x8774,0x8B77,0x8C6A,/* 0xD8-0xDF */
+	0x93AC,0x9800,0x9865,0x60D1,0x6216,0x9177,0x5A5A,0x660F,/* 0xE0-0xE7 */
+	0x6DF7,0x6E3E,0x743F,0x9B42,0x5FFD,0x60DA,0x7B0F,0x54C4,/* 0xE8-0xEF */
+	0x5F18,0x6C5E,0x6CD3,0x6D2A,0x70D8,0x7D05,0x8679,0x8A0C,/* 0xF0-0xF7 */
+	0x9D3B,0x5316,0x548C,0x5B05,0x6A3A,0x706B,0x7575,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_FC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x798D,0x79BE,0x82B1,0x83EF,0x8A71,0x8B41,0x8CA8,/* 0xA0-0xA7 */
+	0x9774,0xFA0B,0x64F4,0x652B,0x78BA,0x78BB,0x7A6B,0x4E38,/* 0xA8-0xAF */
+	0x559A,0x5950,0x5BA6,0x5E7B,0x60A3,0x63DB,0x6B61,0x6665,/* 0xB0-0xB7 */
+	0x6853,0x6E19,0x7165,0x74B0,0x7D08,0x9084,0x9A69,0x9C25,/* 0xB8-0xBF */
+	0x6D3B,0x6ED1,0x733E,0x8C41,0x95CA,0x51F0,0x5E4C,0x5FA8,/* 0xC0-0xC7 */
+	0x604D,0x60F6,0x6130,0x614C,0x6643,0x6644,0x69A5,0x6CC1,/* 0xC8-0xCF */
+	0x6E5F,0x6EC9,0x6F62,0x714C,0x749C,0x7687,0x7BC1,0x7C27,/* 0xD0-0xD7 */
+	0x8352,0x8757,0x9051,0x968D,0x9EC3,0x532F,0x56DE,0x5EFB,/* 0xD8-0xDF */
+	0x5F8A,0x6062,0x6094,0x61F7,0x6666,0x6703,0x6A9C,0x6DEE,/* 0xE0-0xE7 */
+	0x6FAE,0x7070,0x736A,0x7E6A,0x81BE,0x8334,0x86D4,0x8AA8,/* 0xE8-0xEF */
+	0x8CC4,0x5283,0x7372,0x5B96,0x6A6B,0x9404,0x54EE,0x5686,/* 0xF0-0xF7 */
+	0x5B5D,0x6548,0x6585,0x66C9,0x689F,0x6D8D,0x6DC6,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_FD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x723B,0x80B4,0x9175,0x9A4D,0x4FAF,0x5019,0x539A,/* 0xA0-0xA7 */
+	0x540E,0x543C,0x5589,0x55C5,0x5E3F,0x5F8C,0x673D,0x7166,/* 0xA8-0xAF */
+	0x73DD,0x9005,0x52DB,0x52F3,0x5864,0x58CE,0x7104,0x718F,/* 0xB0-0xB7 */
+	0x71FB,0x85B0,0x8A13,0x6688,0x85A8,0x55A7,0x6684,0x714A,/* 0xB8-0xBF */
+	0x8431,0x5349,0x5599,0x6BC1,0x5F59,0x5FBD,0x63EE,0x6689,/* 0xC0-0xC7 */
+	0x7147,0x8AF1,0x8F1D,0x9EBE,0x4F11,0x643A,0x70CB,0x7566,/* 0xC8-0xCF */
+	0x8667,0x6064,0x8B4E,0x9DF8,0x5147,0x51F6,0x5308,0x6D36,/* 0xD0-0xD7 */
+	0x80F8,0x9ED1,0x6615,0x6B23,0x7098,0x75D5,0x5403,0x5C79,/* 0xD8-0xDF */
+	0x7D07,0x8A16,0x6B20,0x6B3D,0x6B46,0x5438,0x6070,0x6D3D,/* 0xE0-0xE7 */
+	0x7FD5,0x8208,0x50D6,0x51DE,0x559C,0x566B,0x56CD,0x59EC,/* 0xE8-0xEF */
+	0x5B09,0x5E0C,0x6199,0x6198,0x6231,0x665E,0x66E6,0x7199,/* 0xF0-0xF7 */
+	0x71B9,0x71BA,0x72A7,0x79A7,0x7A00,0x7FB2,0x8A70,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t *page_charset2uni[256] = {
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, 
+	c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, 
+	c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, 
+	c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, 
+	c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, 
+	c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, 
+	c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, 
+	c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, 
+	c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, 
+	c2u_C8, NULL,   c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, 
+	c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, 
+	c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, 
+	c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, 
+	c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, 
+	c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, 
+	c2u_F8, c2u_F9, c2u_FA, c2u_FB, c2u_FC, c2u_FD, NULL,   NULL,   
+};
+
+static const unsigned char u2c_01[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xA9, 0xA2, 0xA9, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xA4, 0xA9, 0xA4, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xA9, 0xA5, 0xA8, 0xA6, 0xA9, 0xA6, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xA9, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA8, /* 0x3C-0x3F */
+	0xA9, 0xA8, 0xA8, 0xA9, 0xA9, 0xA9, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xA9, 0xB0, 0xA8, 0xAF, 0xA9, 0xAF, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xAB, 0xA9, 0xAB, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xAE, 0xA9, 0xAE, /* 0x64-0x67 */
+};
+
+static const unsigned char u2c_02[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA7, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xA2, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xA2, 0xA8, 0xA2, 0xAB, 0xA2, 0xAA, 0xA2, 0xAD, /* 0xD8-0xDB */
+	0x00, 0x00, 0xA2, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+};
+
+static const unsigned char u2c_03[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0x90-0x93 */
+	0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0x94-0x97 */
+	0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0x98-0x9B */
+	0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0x9C-0x9F */
+	0xA5, 0xD0, 0xA5, 0xD1, 0x00, 0x00, 0xA5, 0xD2, /* 0xA0-0xA3 */
+	0xA5, 0xD3, 0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, /* 0xA4-0xA7 */
+	0xA5, 0xD7, 0xA5, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xB0-0xB3 */
+	0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xB4-0xB7 */
+	0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xB8-0xBB */
+	0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xBC-0xBF */
+	0xA5, 0xF0, 0xA5, 0xF1, 0x00, 0x00, 0xA5, 0xF2, /* 0xC0-0xC3 */
+	0xA5, 0xF3, 0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, /* 0xC4-0xC7 */
+	0xA5, 0xF7, 0xA5, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+};
+
+static const unsigned char u2c_04[512] = {
+	0x00, 0x00, 0xAC, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xAC, 0xA1, 0xAC, 0xA2, 0xAC, 0xA3, 0xAC, 0xA4, /* 0x10-0x13 */
+	0xAC, 0xA5, 0xAC, 0xA6, 0xAC, 0xA8, 0xAC, 0xA9, /* 0x14-0x17 */
+	0xAC, 0xAA, 0xAC, 0xAB, 0xAC, 0xAC, 0xAC, 0xAD, /* 0x18-0x1B */
+	0xAC, 0xAE, 0xAC, 0xAF, 0xAC, 0xB0, 0xAC, 0xB1, /* 0x1C-0x1F */
+	0xAC, 0xB2, 0xAC, 0xB3, 0xAC, 0xB4, 0xAC, 0xB5, /* 0x20-0x23 */
+	0xAC, 0xB6, 0xAC, 0xB7, 0xAC, 0xB8, 0xAC, 0xB9, /* 0x24-0x27 */
+	0xAC, 0xBA, 0xAC, 0xBB, 0xAC, 0xBC, 0xAC, 0xBD, /* 0x28-0x2B */
+	0xAC, 0xBE, 0xAC, 0xBF, 0xAC, 0xC0, 0xAC, 0xC1, /* 0x2C-0x2F */
+	0xAC, 0xD1, 0xAC, 0xD2, 0xAC, 0xD3, 0xAC, 0xD4, /* 0x30-0x33 */
+	0xAC, 0xD5, 0xAC, 0xD6, 0xAC, 0xD8, 0xAC, 0xD9, /* 0x34-0x37 */
+	0xAC, 0xDA, 0xAC, 0xDB, 0xAC, 0xDC, 0xAC, 0xDD, /* 0x38-0x3B */
+	0xAC, 0xDE, 0xAC, 0xDF, 0xAC, 0xE0, 0xAC, 0xE1, /* 0x3C-0x3F */
+	0xAC, 0xE2, 0xAC, 0xE3, 0xAC, 0xE4, 0xAC, 0xE5, /* 0x40-0x43 */
+	0xAC, 0xE6, 0xAC, 0xE7, 0xAC, 0xE8, 0xAC, 0xE9, /* 0x44-0x47 */
+	0xAC, 0xEA, 0xAC, 0xEB, 0xAC, 0xEC, 0xAC, 0xED, /* 0x48-0x4B */
+	0xAC, 0xEE, 0xAC, 0xEF, 0xAC, 0xF0, 0xAC, 0xF1, /* 0x4C-0x4F */
+	0x00, 0x00, 0xAC, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+};
+
+static const unsigned char u2c_11[512] = {
+	0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA4, 0xA4, 0xA7, /* 0x00-0x03 */
+	0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xB1, 0xA4, 0xB2, /* 0x04-0x07 */
+	0xA4, 0xB3, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x08-0x0B */
+	0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x0C-0x0F */
+	0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0x10-0x13 */
+	0xA4, 0xD5, 0xA4, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xDD, 0x00, 0x00, /* 0x18-0x1B */
+	0xA4, 0xDE, 0xA4, 0xE1, 0xA4, 0xE2, 0x00, 0x00, /* 0x1C-0x1F */
+	0xA4, 0xE3, 0xA4, 0xB4, 0xA4, 0xE4, 0xA4, 0xE5, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE6, /* 0x24-0x27 */
+	0x00, 0x00, 0xA4, 0xE7, 0x00, 0x00, 0xA4, 0xE8, /* 0x28-0x2B */
+	0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, 0xA4, 0xEC, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xED, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xEE, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB5, 0xA4, 0xB6, /* 0x3C-0x3F */
+	0xA4, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xA4, 0xF2, 0xA4, 0xF3, 0xA4, 0xF0, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xA4, 0xB7, 0x00, 0x00, 0xA4, 0xB8, 0xA4, 0xB9, /* 0x4C-0x4F */
+	0xA4, 0xB8, 0xA4, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xA4, 0xBA, 0xA4, 0xBA, 0x00, 0x00, 0xA4, 0xF4, /* 0x54-0x57 */
+	0xA4, 0xF5, 0xA4, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD4, /* 0x5C-0x5F */
+	0x00, 0x00, 0xA4, 0xBF, 0xA4, 0xC0, 0xA4, 0xC1, /* 0x60-0x63 */
+	0xA4, 0xC2, 0xA4, 0xC3, 0xA4, 0xC4, 0xA4, 0xC5, /* 0x64-0x67 */
+	0xA4, 0xC6, 0xA4, 0xC7, 0xA4, 0xC8, 0xA4, 0xC9, /* 0x68-0x6B */
+	0xA4, 0xCA, 0xA4, 0xCB, 0xA4, 0xCC, 0xA4, 0xCD, /* 0x6C-0x6F */
+	0xA4, 0xCE, 0xA4, 0xCF, 0xA4, 0xD0, 0xA4, 0xD1, /* 0x70-0x73 */
+	0xA4, 0xD2, 0xA4, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xA4, 0xF7, 0xA4, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xA4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xA4, 0xFA, 0xA4, 0xFB, 0x00, 0x00, /* 0x90-0x93 */
+	0xA4, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xFD, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xA4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, 0xA4, 0xA4, /* 0xA8-0xAB */
+	0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, 0xA4, 0xA9, /* 0xAC-0xAF */
+	0xA4, 0xAA, 0xA4, 0xAB, 0xA4, 0xAC, 0xA4, 0xAD, /* 0xB0-0xB3 */
+	0xA4, 0xAE, 0xA4, 0xAF, 0xA4, 0xB0, 0xA4, 0xB1, /* 0xB4-0xB7 */
+	0xA4, 0xB2, 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, /* 0xB8-0xBB */
+	0xA4, 0xB7, 0xA4, 0xB8, 0xA4, 0xBA, 0xA4, 0xBB, /* 0xBC-0xBF */
+	0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xD6, 0xA4, 0xD7, /* 0xC4-0xC7 */
+	0xA4, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xA4, 0xD9, 0x00, 0x00, 0xA4, 0xDA, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDB, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDC, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xA4, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xA4, 0xDE, 0xA4, 0xDF, 0x00, 0x00, 0xA4, 0xE0, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xE8, 0xA4, 0xEA, /* 0xE4-0xE7 */
+	0xA4, 0xEC, 0x00, 0x00, 0xA4, 0xED, 0xA4, 0xEF, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */
+	0xA4, 0xB7, 0xA4, 0xF2, 0xA4, 0xF3, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xA4, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xA4, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_20[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xA1, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xA2, 0xD3, 0xA2, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xA1, 0xA5, 0xA1, 0xA6, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xA2, 0xB6, 0x00, 0x00, 0xA1, 0xC7, 0xA1, 0xC8, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD8, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xA9, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0xFA, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xA9, 0xFB, 0xA9, 0xFC, 0xA9, 0xFD, /* 0x80-0x83 */
+	0xA9, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+};
+
+static const unsigned char u2c_21[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC9, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xA2, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA4, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xA2, 0xE0, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xA2, 0xE5, 0xA2, 0xE2, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0xD9, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xCA, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF7, /* 0x50-0x53 */
+	0xA8, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xFB, /* 0x58-0x5B */
+	0xA8, 0xFC, 0xA8, 0xFD, 0xA8, 0xFE, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0x60-0x63 */
+	0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0x64-0x67 */
+	0xA5, 0xB8, 0xA5, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, 0xA5, 0xA4, /* 0x70-0x73 */
+	0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, 0xA5, 0xA8, /* 0x74-0x77 */
+	0xA5, 0xA9, 0xA5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xA1, 0xE7, 0xA1, 0xE8, 0xA1, 0xE6, 0xA1, 0xE9, /* 0x90-0x93 */
+	0xA1, 0xEA, 0xA2, 0xD5, 0xA2, 0xD8, 0xA2, 0xD6, /* 0x94-0x97 */
+	0xA2, 0xD9, 0xA2, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xA2, 0xA1, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xA2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+};
+
+static const unsigned char u2c_22[512] = {
+	0xA2, 0xA3, 0x00, 0x00, 0xA1, 0xD3, 0xA2, 0xA4, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD4, /* 0x04-0x07 */
+	0xA1, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF5, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xB3, /* 0x0C-0x0F */
+	0x00, 0x00, 0xA2, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xEE, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xA1, 0xF0, 0xA1, 0xC4, 0x00, 0x00, /* 0x1C-0x1F */
+	0xA1, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xA1, 0xAB, 0x00, 0x00, 0xA1, 0xFC, /* 0x24-0x27 */
+	0xA1, 0xFD, 0xA1, 0xFB, 0xA1, 0xFA, 0xA1, 0xF2, /* 0x28-0x2B */
+	0xA1, 0xF3, 0x00, 0x00, 0xA2, 0xB1, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xA1, 0xC5, 0xA1, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xA1, 0xAD, 0xA1, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xD6, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA1, 0xC1, 0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xA1, 0xC2, 0xA1, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, 0xA1, 0xED, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF9, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xF6, 0xA1, 0xF7, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xA2, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+};
+
+static const unsigned char u2c_23[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xD2, 0x00, 0x00, /* 0x10-0x13 */
+};
+
+static const unsigned char u2c_24[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA8, 0xE7, 0xA8, 0xE8, 0xA8, 0xE9, 0xA8, 0xEA, /* 0x60-0x63 */
+	0xA8, 0xEB, 0xA8, 0xEC, 0xA8, 0xED, 0xA8, 0xEE, /* 0x64-0x67 */
+	0xA8, 0xEF, 0xA8, 0xF0, 0xA8, 0xF1, 0xA8, 0xF2, /* 0x68-0x6B */
+	0xA8, 0xF3, 0xA8, 0xF4, 0xA8, 0xF5, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xA9, 0xE7, 0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, /* 0x74-0x77 */
+	0xA9, 0xEB, 0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, /* 0x78-0x7B */
+	0xA9, 0xEF, 0xA9, 0xF0, 0xA9, 0xF1, 0xA9, 0xF2, /* 0x7C-0x7F */
+	
+	0xA9, 0xF3, 0xA9, 0xF4, 0xA9, 0xF5, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, 0xA9, 0xD0, /* 0x9C-0x9F */
+	0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, 0xA9, 0xD4, /* 0xA0-0xA3 */
+	0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, 0xA9, 0xD8, /* 0xA4-0xA7 */
+	0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, 0xA9, 0xDC, /* 0xA8-0xAB */
+	0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, 0xA9, 0xE0, /* 0xAC-0xAF */
+	0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, 0xA9, 0xE4, /* 0xB0-0xB3 */
+	0xA9, 0xE5, 0xA9, 0xE6, 0xA8, 0xCD, 0xA8, 0xCE, /* 0xB4-0xB7 */
+	0xA8, 0xCF, 0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, /* 0xB8-0xBB */
+	0xA8, 0xD3, 0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, /* 0xBC-0xBF */
+	0xA8, 0xD7, 0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, /* 0xC0-0xC3 */
+	0xA8, 0xDB, 0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, /* 0xC4-0xC7 */
+	0xA8, 0xDF, 0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, /* 0xC8-0xCB */
+	0xA8, 0xE3, 0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, /* 0xCC-0xCF */
+	0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, 0xA8, 0xD0, /* 0xD0-0xD3 */
+	0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, 0xA8, 0xD4, /* 0xD4-0xD7 */
+	0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, 0xA8, 0xD8, /* 0xD8-0xDB */
+	0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, 0xA8, 0xDC, /* 0xDC-0xDF */
+	0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, 0xA8, 0xE0, /* 0xE0-0xE3 */
+	0xA8, 0xE1, 0xA8, 0xE2, 0xA8, 0xE3, 0xA8, 0xE4, /* 0xE4-0xE7 */
+	0xA8, 0xE5, 0xA8, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+};
+
+static const unsigned char u2c_25[512] = {
+	0xA6, 0xA1, 0xA6, 0xAC, 0xA6, 0xA2, 0xA6, 0xAD, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xA6, 0xA3, 0xA6, 0xC8, 0xA6, 0xC7, 0xA6, 0xAE, /* 0x0C-0x0F */
+	0xA6, 0xA4, 0xA6, 0xC2, 0xA6, 0xC1, 0xA6, 0xAF, /* 0x10-0x13 */
+	0xA6, 0xA6, 0xA6, 0xC6, 0xA6, 0xC5, 0xA6, 0xB1, /* 0x14-0x17 */
+	0xA6, 0xA5, 0xA6, 0xC4, 0xA6, 0xC3, 0xA6, 0xB0, /* 0x18-0x1B */
+	0xA6, 0xA7, 0xA6, 0xBC, 0xA6, 0xC9, 0xA6, 0xCA, /* 0x1C-0x1F */
+	0xA6, 0xB7, 0xA6, 0xCB, 0xA6, 0xCC, 0xA6, 0xB2, /* 0x20-0x23 */
+	0xA6, 0xA9, 0xA6, 0xBE, 0xA6, 0xCD, 0xA6, 0xCE, /* 0x24-0x27 */
+	0xA6, 0xB9, 0xA6, 0xCF, 0xA6, 0xD0, 0xA6, 0xB4, /* 0x28-0x2B */
+	0xA6, 0xA8, 0xA6, 0xD1, 0xA6, 0xD2, 0xA6, 0xB8, /* 0x2C-0x2F */
+	0xA6, 0xBD, 0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xB3, /* 0x30-0x33 */
+	0xA6, 0xAA, 0xA6, 0xD5, 0xA6, 0xD6, 0xA6, 0xBA, /* 0x34-0x37 */
+	0xA6, 0xBF, 0xA6, 0xD7, 0xA6, 0xD8, 0xA6, 0xB5, /* 0x38-0x3B */
+	0xA6, 0xAB, 0xA6, 0xD9, 0xA6, 0xDA, 0xA6, 0xBB, /* 0x3C-0x3F */
+	0xA6, 0xDB, 0xA6, 0xDC, 0xA6, 0xC0, 0xA6, 0xDD, /* 0x40-0x43 */
+	0xA6, 0xDE, 0xA6, 0xDF, 0xA6, 0xE0, 0xA6, 0xE1, /* 0x44-0x47 */
+	0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xE4, 0xA6, 0xB6, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xA2, 0xC6, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xA1, 0xE1, 0xA1, 0xE0, 0x00, 0x00, 0xA2, 0xC3, /* 0xA0-0xA3 */
+	0xA2, 0xC7, 0xA2, 0xC8, 0xA2, 0xCB, 0xA2, 0xCA, /* 0xA4-0xA7 */
+	0xA2, 0xC9, 0xA2, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xE3, 0xA1, 0xE2, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xA2, 0xBA, 0xA2, 0xB9, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xA1, 0xE5, 0xA1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xA2, 0xB8, 0xA2, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xDF, 0xA1, 0xDE, /* 0xC4-0xC7 */
+	0xA2, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDB, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xDD, 0xA1, 0xDC, /* 0xCC-0xCF */
+	0xA2, 0xC4, 0xA2, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+};
+
+static const unsigned char u2c_26[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xD9, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xA2, 0xCF, 0xA2, 0xCE, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xA2, 0xD0, 0x00, 0x00, 0xA2, 0xD1, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xA1, 0xCF, 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA2, 0xBC, 0xA2, 0xBD, 0x00, 0x00, 0xA2, 0xC0, /* 0x60-0x63 */
+	0xA2, 0xBB, 0xA2, 0xBE, 0x00, 0x00, 0xA2, 0xBF, /* 0x64-0x67 */
+	0xA2, 0xCD, 0xA2, 0xDB, 0xA2, 0xDC, 0x00, 0x00, /* 0x68-0x6B */
+	0xA2, 0xDD, 0xA2, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+};
+
+static const unsigned char u2c_30[512] = {
+	0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */
+	0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */
+	0xA1, 0xBC, 0xA1, 0xBD, 0x00, 0x00, 0xA1, 0xEB, /* 0x10-0x13 */
+	0xA1, 0xB2, 0xA1, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xAA, 0xA1, 0xAA, 0xA2, 0xAA, 0xA3, /* 0x40-0x43 */
+	0xAA, 0xA4, 0xAA, 0xA5, 0xAA, 0xA6, 0xAA, 0xA7, /* 0x44-0x47 */
+	0xAA, 0xA8, 0xAA, 0xA9, 0xAA, 0xAA, 0xAA, 0xAB, /* 0x48-0x4B */
+	0xAA, 0xAC, 0xAA, 0xAD, 0xAA, 0xAE, 0xAA, 0xAF, /* 0x4C-0x4F */
+	0xAA, 0xB0, 0xAA, 0xB1, 0xAA, 0xB2, 0xAA, 0xB3, /* 0x50-0x53 */
+	0xAA, 0xB4, 0xAA, 0xB5, 0xAA, 0xB6, 0xAA, 0xB7, /* 0x54-0x57 */
+	0xAA, 0xB8, 0xAA, 0xB9, 0xAA, 0xBA, 0xAA, 0xBB, /* 0x58-0x5B */
+	0xAA, 0xBC, 0xAA, 0xBD, 0xAA, 0xBE, 0xAA, 0xBF, /* 0x5C-0x5F */
+	0xAA, 0xC0, 0xAA, 0xC1, 0xAA, 0xC2, 0xAA, 0xC3, /* 0x60-0x63 */
+	0xAA, 0xC4, 0xAA, 0xC5, 0xAA, 0xC6, 0xAA, 0xC7, /* 0x64-0x67 */
+	0xAA, 0xC8, 0xAA, 0xC9, 0xAA, 0xCA, 0xAA, 0xCB, /* 0x68-0x6B */
+	0xAA, 0xCC, 0xAA, 0xCD, 0xAA, 0xCE, 0xAA, 0xCF, /* 0x6C-0x6F */
+	0xAA, 0xD0, 0xAA, 0xD1, 0xAA, 0xD2, 0xAA, 0xD3, /* 0x70-0x73 */
+	0xAA, 0xD4, 0xAA, 0xD5, 0xAA, 0xD6, 0xAA, 0xD7, /* 0x74-0x77 */
+	0xAA, 0xD8, 0xAA, 0xD9, 0xAA, 0xDA, 0xAA, 0xDB, /* 0x78-0x7B */
+	0xAA, 0xDC, 0xAA, 0xDD, 0xAA, 0xDE, 0xAA, 0xDF, /* 0x7C-0x7F */
+	
+	0xAA, 0xE0, 0xAA, 0xE1, 0xAA, 0xE2, 0xAA, 0xE3, /* 0x80-0x83 */
+	0xAA, 0xE4, 0xAA, 0xE5, 0xAA, 0xE6, 0xAA, 0xE7, /* 0x84-0x87 */
+	0xAA, 0xE8, 0xAA, 0xE9, 0xAA, 0xEA, 0xAA, 0xEB, /* 0x88-0x8B */
+	0xAA, 0xEC, 0xAA, 0xED, 0xAA, 0xEE, 0xAA, 0xEF, /* 0x8C-0x8F */
+	0xAA, 0xF0, 0xAA, 0xF1, 0xAA, 0xF2, 0xAA, 0xF3, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xAB, 0xA1, 0xAB, 0xA2, 0xAB, 0xA3, /* 0xA0-0xA3 */
+	0xAB, 0xA4, 0xAB, 0xA5, 0xAB, 0xA6, 0xAB, 0xA7, /* 0xA4-0xA7 */
+	0xAB, 0xA8, 0xAB, 0xA9, 0xAB, 0xAA, 0xAB, 0xAB, /* 0xA8-0xAB */
+	0xAB, 0xAC, 0xAB, 0xAD, 0xAB, 0xAE, 0xAB, 0xAF, /* 0xAC-0xAF */
+	0xAB, 0xB0, 0xAB, 0xB1, 0xAB, 0xB2, 0xAB, 0xB3, /* 0xB0-0xB3 */
+	0xAB, 0xB4, 0xAB, 0xB5, 0xAB, 0xB6, 0xAB, 0xB7, /* 0xB4-0xB7 */
+	0xAB, 0xB8, 0xAB, 0xB9, 0xAB, 0xBA, 0xAB, 0xBB, /* 0xB8-0xBB */
+	0xAB, 0xBC, 0xAB, 0xBD, 0xAB, 0xBE, 0xAB, 0xBF, /* 0xBC-0xBF */
+	0xAB, 0xC0, 0xAB, 0xC1, 0xAB, 0xC2, 0xAB, 0xC3, /* 0xC0-0xC3 */
+	0xAB, 0xC4, 0xAB, 0xC5, 0xAB, 0xC6, 0xAB, 0xC7, /* 0xC4-0xC7 */
+	0xAB, 0xC8, 0xAB, 0xC9, 0xAB, 0xCA, 0xAB, 0xCB, /* 0xC8-0xCB */
+	0xAB, 0xCC, 0xAB, 0xCD, 0xAB, 0xCE, 0xAB, 0xCF, /* 0xCC-0xCF */
+	0xAB, 0xD0, 0xAB, 0xD1, 0xAB, 0xD2, 0xAB, 0xD3, /* 0xD0-0xD3 */
+	0xAB, 0xD4, 0xAB, 0xD5, 0xAB, 0xD6, 0xAB, 0xD7, /* 0xD4-0xD7 */
+	0xAB, 0xD8, 0xAB, 0xD9, 0xAB, 0xDA, 0xAB, 0xDB, /* 0xD8-0xDB */
+	0xAB, 0xDC, 0xAB, 0xDD, 0xAB, 0xDE, 0xAB, 0xDF, /* 0xDC-0xDF */
+	0xAB, 0xE0, 0xAB, 0xE1, 0xAB, 0xE2, 0xAB, 0xE3, /* 0xE0-0xE3 */
+	0xAB, 0xE4, 0xAB, 0xE5, 0xAB, 0xE6, 0xAB, 0xE7, /* 0xE4-0xE7 */
+	0xAB, 0xE8, 0xAB, 0xE9, 0xAB, 0xEA, 0xAB, 0xEB, /* 0xE8-0xEB */
+	0xAB, 0xEC, 0xAB, 0xED, 0xAB, 0xEE, 0xAB, 0xEF, /* 0xEC-0xEF */
+	0xAB, 0xF0, 0xAB, 0xF1, 0xAB, 0xF2, 0xAB, 0xF3, /* 0xF0-0xF3 */
+	0xAB, 0xF4, 0xAB, 0xF5, 0xAB, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_31[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x30-0x33 */
+	0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0x34-0x37 */
+	0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x38-0x3B */
+	0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x3C-0x3F */
+	0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x40-0x43 */
+	0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x44-0x47 */
+	0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x48-0x4B */
+	0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x4C-0x4F */
+	0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x50-0x53 */
+	0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x54-0x57 */
+	0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x58-0x5B */
+	0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x5C-0x5F */
+	0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x60-0x63 */
+	0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x64-0x67 */
+	0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x68-0x6B */
+	0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x6C-0x6F */
+	0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x70-0x73 */
+	0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x74-0x77 */
+	0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x78-0x7B */
+	0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x7C-0x7F */
+	
+	0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x80-0x83 */
+	0xA4, 0xF4, 0xA4, 0xF5, 0xA4, 0xF6, 0xA4, 0xF7, /* 0x84-0x87 */
+	0xA4, 0xF8, 0xA4, 0xF9, 0xA4, 0xFA, 0xA4, 0xFB, /* 0x88-0x8B */
+	0xA4, 0xFC, 0xA4, 0xFD, 0xA4, 0xFE, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xE9, 0xEC, 0xA3, /* 0x90-0x93 */
+	0xDF, 0xB2, 0xDE, 0xCC, 0xDF, 0xBE, 0xF1, 0xE9, /* 0x94-0x97 */
+	0xF9, 0xBB, 0xCB, 0xA3, 0xEB, 0xE0, 0xDC, 0xB0, /* 0x98-0x9B */
+	0xEF, 0xCB, 0xF4, 0xB8, 0xF2, 0xA2, 0xEC, 0xD1, /* 0x9C-0x9F */
+};
+
+static const unsigned char u2c_32[512] = {
+	0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, 0xA9, 0xB4, /* 0x00-0x03 */
+	0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, 0xA9, 0xB8, /* 0x04-0x07 */
+	0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, 0xA9, 0xBC, /* 0x08-0x0B */
+	0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, 0xA9, 0xC0, /* 0x0C-0x0F */
+	0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, 0xA9, 0xC4, /* 0x10-0x13 */
+	0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, 0xA9, 0xC8, /* 0x14-0x17 */
+	0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, 0xA9, 0xCC, /* 0x18-0x1B */
+	0xA2, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xEC, 0xE9, 0xEC, 0xA3, 0xDF, 0xB2, 0xDE, 0xCC, /* 0x20-0x23 */
+	0xE7, 0xE9, 0xD7, 0xBF, 0xF6, 0xD2, 0xF8, 0xA2, /* 0x24-0x27 */
+	0xCE, 0xFA, 0xE4, 0xA8, 0xEA, 0xC5, 0xFB, 0xFD, /* 0x28-0x2B */
+	0xE2, 0xA9, 0xD9, 0xCA, 0xD1, 0xD1, 0xF7, 0xCF, /* 0x2C-0x2F */
+	0xEC, 0xED, 0xF1, 0xBB, 0xEA, 0xF3, 0xDE, 0xE4, /* 0x30-0x33 */
+	0xD9, 0xA3, 0xF7, 0xE5, 0xEE, 0xAF, 0xF5, 0xE6, /* 0x34-0x37 */
+	0xD6, 0xCC, 0xD3, 0xDB, 0xFB, 0xBC, 0xF9, 0xCA, /* 0x38-0x3B */
+	0xCA, 0xF8, 0xD0, 0xEA, 0xED, 0xC0, 0xFA, 0xF0, /* 0x3C-0x3F */
+	0xF0, 0xAE, 0xFD, 0xCC, 0xED, 0xBB, 0xF2, 0xB8, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA8, 0xB1, 0xA8, 0xB2, 0xA8, 0xB3, 0xA8, 0xB4, /* 0x60-0x63 */
+	0xA8, 0xB5, 0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB8, /* 0x64-0x67 */
+	0xA8, 0xB9, 0xA8, 0xBA, 0xA8, 0xBB, 0xA8, 0xBC, /* 0x68-0x6B */
+	0xA8, 0xBD, 0xA8, 0xBE, 0xA8, 0xBF, 0xA8, 0xC0, /* 0x6C-0x6F */
+	0xA8, 0xC1, 0xA8, 0xC2, 0xA8, 0xC3, 0xA8, 0xC4, /* 0x70-0x73 */
+	0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, 0xA8, 0xC8, /* 0x74-0x77 */
+	0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, 0xA8, 0xCC, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xDE, /* 0x7C-0x7F */
+	
+	0xEC, 0xE9, 0xEC, 0xA3, 0xDF, 0xB2, 0xDE, 0xCC, /* 0x80-0x83 */
+	0xE7, 0xE9, 0xD7, 0xBF, 0xF6, 0xD2, 0xF8, 0xA2, /* 0x84-0x87 */
+	0xCE, 0xFA, 0xE4, 0xA8, 0xEA, 0xC5, 0xFB, 0xFD, /* 0x88-0x8B */
+	0xE2, 0xA9, 0xD9, 0xCA, 0xD1, 0xD1, 0xF7, 0xCF, /* 0x8C-0x8F */
+	0xEC, 0xED, 0xF1, 0xBB, 0xEA, 0xF3, 0xDE, 0xE4, /* 0x90-0x93 */
+	0xD9, 0xA3, 0xF7, 0xE5, 0xEE, 0xAF, 0xF5, 0xE6, /* 0x94-0x97 */
+	0xD6, 0xCC, 0xDD, 0xFA, 0xD1, 0xFB, 0xD2, 0xB3, /* 0x98-0x9B */
+	0xEE, 0xEA, 0xE9, 0xD0, 0xEC, 0xD4, 0xF1, 0xBC, /* 0x9C-0x9F */
+	0xFA, 0xA3, 0xFD, 0xCC, 0xDE, 0xD0, 0xEF, 0xE1, /* 0xA0-0xA3 */
+	0xDF, 0xBE, 0xF1, 0xE9, 0xF9, 0xBB, 0xF1, 0xA7, /* 0xA4-0xA7 */
+	0xE9, 0xD3, 0xEC, 0xA2, 0xF0, 0xF3, 0xF9, 0xCA, /* 0xA8-0xAB */
+	0xCA, 0xF8, 0xD0, 0xEA, 0xED, 0xC0, 0xFA, 0xF0, /* 0xAC-0xAF */
+	0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+};
+
+static const unsigned char u2c_33[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xA7, 0xC9, 0xA7, 0xCA, 0xA7, 0xCB, 0xA7, 0xCC, /* 0x80-0x83 */
+	0xA7, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xA7, 0xBA, 0xA7, 0xBB, 0xA7, 0xDC, 0xA7, 0xDD, /* 0x88-0x8B */
+	0xA7, 0xDE, 0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, /* 0x8C-0x8F */
+	0xA7, 0xD4, 0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD7, /* 0x90-0x93 */
+	0xA7, 0xD8, 0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, /* 0x94-0x97 */
+	0xA7, 0xA5, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x98-0x9B */
+	0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x9C-0x9F */
+	0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xA7, /* 0xA0-0xA3 */
+	0xA7, 0xA8, 0xA7, 0xA9, 0xA7, 0xAA, 0xA7, 0xBD, /* 0xA4-0xA7 */
+	0xA7, 0xBE, 0xA7, 0xE5, 0xA7, 0xE6, 0xA7, 0xE7, /* 0xA8-0xAB */
+	0xA7, 0xE8, 0xA7, 0xE1, 0xA7, 0xE2, 0xA7, 0xE3, /* 0xAC-0xAF */
+	0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, 0xA7, 0xC2, /* 0xB0-0xB3 */
+	0xA7, 0xC3, 0xA7, 0xC4, 0xA7, 0xC5, 0xA7, 0xC6, /* 0xB4-0xB7 */
+	0xA7, 0xC7, 0xA7, 0xC8, 0xA7, 0xCE, 0xA7, 0xCF, /* 0xB8-0xBB */
+	0xA7, 0xD0, 0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, /* 0xBC-0xBF */
+	0xA7, 0xDA, 0xA7, 0xDB, 0xA2, 0xE3, 0xA7, 0xEC, /* 0xC0-0xC3 */
+	0xA7, 0xA6, 0xA7, 0xE0, 0xA7, 0xEF, 0xA2, 0xE1, /* 0xC4-0xC7 */
+	0xA7, 0xBC, 0xA7, 0xED, 0xA7, 0xB5, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xB9, /* 0xCC-0xCF */
+	0xA7, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xEB, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0xDF, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xA2, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xE4, /* 0xD8-0xDB */
+	0xA7, 0xEE, 0xA7, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+};
+
+static const unsigned char u2c_4E[512] = {
+	0xEC, 0xE9, 0xEF, 0xCB, 0x00, 0x00, 0xF6, 0xD2, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB2, /* 0x04-0x07 */
+	0xED, 0xDB, 0xDF, 0xB2, 0xDF, 0xBE, 0xF9, 0xBB, /* 0x08-0x0B */
+	0x00, 0x00, 0xDC, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xF5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xF3, 0xA6, 0xDD, 0xE0, 0xE1, 0xA6, 0x00, 0x00, /* 0x14-0x17 */
+	0xCE, 0xF8, 0xDC, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xF1, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xFA, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xFC, 0xAF, 0xD3, 0xA1, 0x00, 0x00, 0xF1, 0xAB, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xD1, 0xD2, 0xAC, /* 0x40-0x43 */
+	0x00, 0x00, 0xCE, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xFD, /* 0x48-0x4B */
+	0x00, 0x00, 0xDE, 0xBF, 0xFB, 0xBA, 0xF9, 0xB9, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xD2, 0x00, 0x00, /* 0x54-0x57 */
+	0xE3, 0xAB, 0xEB, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xCE, 0xFA, 0xCB, 0xF7, 0xE5, 0xA5, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE1, /* 0x68-0x6B */
+	0x00, 0x00, 0xD4, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE1, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xE3, 0xDF, 0xAD, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xEB, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xAF, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xF5, 0x00, 0x00, /* 0x84-0x87 */
+	0xE5, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC0, /* 0x88-0x8B */
+	0xEC, 0xA3, 0x00, 0x00, 0xE9, 0xCD, 0x00, 0x00, /* 0x8C-0x8F */
+	0xEA, 0xA7, 0xE9, 0xF6, 0xFB, 0xBB, 0x00, 0x00, /* 0x90-0x93 */
+	0xE7, 0xE9, 0xEF, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xD0, 0xE6, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC1, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xAC, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xD8, 0xCC, 0xF9, 0xF1, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xCE, 0xDF, 0xFA, 0xA4, 0xE6, 0xB2, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xFA, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xBD, /* 0xA8-0xAB */
+	0xCC, 0xC8, 0xEF, 0xCD, 0xD5, 0xD5, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xA2, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xD1, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE4, 0xA7, 0xEC, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xF6, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xFB, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xD1, 0xCB, 0xBF, /* 0xC8-0xCB */
+	0x00, 0x00, 0xED, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xED, 0xA8, 0xDE, 0xC2, 0xF6, 0xE2, 0xED, 0xDC, /* 0xD4-0xD7 */
+	0xDC, 0xF5, 0xE0, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xD4, 0xCE, 0x00, 0x00, 0xF4, 0xB5, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDB, /* 0xE0-0xE3 */
+	0xD6, 0xB5, 0xEC, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xE4, 0xE6, 0x00, 0x00, 0xF1, 0xEA, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xEC, 0xCB, 0xC0, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF2, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_4F[512] = {
+	0x00, 0x00, 0xD0, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xF9, 0xF2, 0xEC, 0xA5, 0xD0, 0xDF, /* 0x08-0x0B */
+	0x00, 0x00, 0xE7, 0xEA, 0xD0, 0xEB, 0xDC, 0xD1, /* 0x0C-0x0F */
+	0xDB, 0xE9, 0xFD, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xD7, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xDA, 0xE1, 0x00, 0x00, 0xD6, 0xB6, 0x00, 0x00, /* 0x34-0x37 */
+	0xE3, 0xDF, 0x00, 0x00, 0xDE, 0xC3, 0x00, 0x00, /* 0x38-0x3B */
+	0xDE, 0xC4, 0xCA, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xEC, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xA3, 0xEE, 0xB7, /* 0x44-0x47 */
+	0xF8, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xEA, 0xC8, 0xEE, 0xB8, 0xF1, 0xAC, /* 0x4C-0x4F */
+	0xF1, 0xA5, 0xE9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xF9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xE5, 0xF9, 0xEC, 0xEA, 0xDD, 0xD6, /* 0x58-0x5B */
+	0xED, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xF8, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xBA, /* 0x6C-0x6F */
+	0xDB, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA2, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xCD, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xED, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xEB, 0xDE, 0xC5, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE3, 0xE0, 0x00, 0x00, 0xCA, 0xC9, /* 0x80-0x83 */
+	0xF2, 0xE9, 0x00, 0x00, 0xD5, 0xCE, 0x00, 0x00, /* 0x84-0x87 */
+	0xF6, 0xB6, 0x00, 0x00, 0xCE, 0xC2, 0xD6, 0xC7, /* 0x88-0x8B */
+	0x00, 0x00, 0xE3, 0xB4, 0x00, 0x00, 0xF1, 0xAD, /* 0x8C-0x8F */
+	0x00, 0x00, 0xEA, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xC2, 0x00, 0x00, /* 0x94-0x97 */
+	0xF3, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xEA, /* 0x98-0x9B */
+	0x00, 0x00, 0xEB, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xB2, 0xFD, 0xA5, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xF6, 0xD5, 0xD5, 0xE2, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xB5, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xF5, 0xF5, 0xB5, /* 0xC0-0xC3 */
+	0xE4, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE7, 0xEB, 0xF1, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xBB, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xE9, 0xB5, 0x00, 0x00, 0xCC, 0xC9, /* 0xD0-0xD3 */
+	0xFA, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD4, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xD6, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xDC, 0xC1, 0x00, 0x00, 0xDE, 0xC6, /* 0xDC-0xDF */
+	0xFA, 0xEF, 0xE3, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xF3, 0xDC, 0xF6, /* 0xEC-0xEF */
+	0x00, 0x00, 0xCE, 0xFC, 0x00, 0x00, 0xDB, 0xC4, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xF8, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xDC, 0xE4, 0x00, 0x00, 0xE5, 0xEF, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_50[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xB1, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xD6, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xF3, 0xDA, 0x00, 0x00, 0xCB, 0xC1, /* 0x08-0x0B */
+	0x00, 0x00, 0xDB, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xD9, 0xFA, 0xD3, 0xEE, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xB8, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xFD, 0xA6, 0xEB, 0xEF, 0x00, 0x00, /* 0x18-0x1B */
+	0xF4, 0xA6, 0x00, 0x00, 0xCC, 0xCA, 0xF3, 0xA8, /* 0x1C-0x1F */
+	0x00, 0x00, 0xF3, 0xDB, 0x00, 0x00, 0xDB, 0xA7, /* 0x20-0x23 */
+	0xF6, 0xB7, 0x00, 0x00, 0xCF, 0xE6, 0xF0, 0xF2, /* 0x24-0x27 */
+	0xCB, 0xDA, 0x00, 0x00, 0xE7, 0xD2, 0xD7, 0xC3, /* 0x28-0x2B */
+	0xF6, 0xF0, 0xE8, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA6, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE7, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA3, /* 0x44-0x47 */
+	0xCC, 0xA7, 0xEA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xB6, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xFA, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xAE, 0x00, 0x00, /* 0x58-0x5B */
+	0xEF, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xCB, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xF6, 0xB0, 0xEF, 0xCF, 0xE9, 0xCF, 0x00, 0x00, /* 0x74-0x77 */
+	0xF7, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xCE, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xDC, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xDB, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xCB, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xDF, 0xA1, 0xDD, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xF5, 0xCA, 0xE9, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xEC, 0xEE, 0xEE, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xF3, 0xF0, 0x00, 0x00, 0xDF, 0xBF, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xD0, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xF4, 0xD2, 0xE0, 0xBA, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC0, /* 0xCC-0xCF */
+	0x00, 0x00, 0xCE, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xDC, 0xD2, 0xFD, 0xEA, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xF6, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xCA, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE8, 0xE9, 0x00, 0x00, 0xE3, 0xAC, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xF3, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xCA, 0xA4, 0x00, 0x00, 0xDB, 0xF8, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC7, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_51[512] = {
+	0xEB, 0xF0, 0xF1, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xE5, 0xE2, 0x00, 0x00, 0xCC, 0xCC, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xCB, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xE3, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC1, /* 0x1C-0x1F */
+	0x00, 0x00, 0xD6, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xD0, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xB9, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE3, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xD3, 0x00, 0x00, /* 0x38-0x3B */
+	0xE5, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xE8, 0xB4, 0xEB, 0xC3, 0x00, 0x00, 0xEA, 0xAA, /* 0x40-0x43 */
+	0xFA, 0xFC, 0xF5, 0xF6, 0xF0, 0xBC, 0xFD, 0xD4, /* 0x44-0x47 */
+	0xE0, 0xBB, 0xCE, 0xC3, 0x00, 0x00, 0xD0, 0xBA, /* 0x48-0x4B */
+	0xF7, 0xBA, 0xD8, 0xF3, 0xF7, 0xCD, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xAE, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xD4, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xE7, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xEC, 0xFD, 0x00, 0x00, 0xD2, 0xAE, /* 0x64-0x67 */
+	0xEE, 0xEF, 0xD5, 0xD7, 0xEA, 0xE4, 0xF8, 0xA2, /* 0x68-0x6B */
+	0xCD, 0xEB, 0xD7, 0xBF, 0xFB, 0xB1, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xCD, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xDC, 0xB2, 0xD0, 0xEC, 0xCE, 0xFD, /* 0x74-0x77 */
+	0xEE, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xCC, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xD0, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xF7, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xFC, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xEE, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xB3, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xD8, 0xF4, 0x00, 0x00, 0xE9, 0xB7, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xCE, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xD9, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xF1, 0x00, 0x00, /* 0xA8-0xAB */
+	0xD4, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xA7, 0xD5, 0xD2, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xD6, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xF4, 0xA2, 0x00, 0x00, 0xF1, 0xD7, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xD5, 0xD8, 0x00, 0x00, 0xF0, 0xBD, /* 0xC8-0xCB */
+	0xD7, 0xD0, 0xD4, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xD7, 0xCF, 0xEB, 0xEA, 0xFD, 0xEB, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xDB, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xFC, 0xC5, 0xCB, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xD5, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xF4, 0xC8, 0xE8, 0xEA, 0xF5, 0xF3, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xF9, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_52[512] = {
+	0xD3, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD3, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xC2, 0xEF, 0xB7, /* 0x04-0x07 */
+	0xE7, 0xD4, 0x00, 0x00, 0xCA, 0xCA, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xFB, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xFA, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xAA, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xF4, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xF7, 0xF7, 0xDC, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xD7, 0xD7, 0xDF, 0xA2, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xBE, 0x00, 0x00, /* 0x2C-0x2F */
+	0xD3, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xA4, 0xE1, 0xEC, /* 0x34-0x37 */
+	0xCF, 0xE7, 0xF3, 0xCB, 0xED, 0xA9, 0xCA, 0xBE, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xEF, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCE, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xFB, 0xD0, 0xBB, /* 0x48-0x4B */
+	0xD5, 0xB7, 0xEE, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xF4, 0xA8, 0x00, 0x00, 0xDC, 0xF8, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA7, /* 0x58-0x5B */
+	0x00, 0x00, 0xDA, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xE0, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xED, 0xA5, 0xEE, 0xF2, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF9, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xDC, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xF3, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xF8, 0xF2, 0x00, 0x00, 0xF4, 0xF9, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF1, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xBC, /* 0x84-0x87 */
+	0xDB, 0xF9, 0xD7, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xCB, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xF0, 0xA5, 0xCB, 0xFD, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF4, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xED, /* 0x9C-0x9F */
+	0xCA, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xAB, /* 0xA0-0xA3 */
+	0xD0, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xF0, 0xBE, 0xD2, 0xBD, 0xCC, 0xA4, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xCC, 0xCD, 0x00, 0x00, 0xDA, 0xFA, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xF6, 0xCF, 0x00, 0x00, 0xE9, 0xB8, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xD8, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xCC, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xCD, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xD4, 0xD1, 0xE9, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xCA, 0xEB, 0xD9, 0xE2, 0x00, 0x00, 0xFD, 0xB2, /* 0xD8-0xDB */
+	0x00, 0x00, 0xE3, 0xAD, 0xD6, 0xCC, 0xD9, 0xB4, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xA7, 0xEE, 0xD3, /* 0xE0-0xE3 */
+	0xD0, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB3, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xD5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xCF, 0xE8, 0x00, 0x00, 0xED, 0xC3, 0xD0, 0xB2, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xFE, 0xDA, 0xA8, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_53[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xF8, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xFD, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xF8, 0xD1, 0x00, 0x00, 0xF8, 0xD2, /* 0x0C-0x0F */
+	0xDC, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xDD, 0xE2, 0xFB, 0xF9, 0xDD, 0xC1, /* 0x14-0x17 */
+	0x00, 0x00, 0xE3, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xED, 0xDD, 0xCE, 0xC4, 0x00, 0x00, 0xCB, 0xA1, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xE3, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDD, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xF9, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xFB, /* 0x3C-0x3F */
+	0xCF, 0xA1, 0xE4, 0xA8, 0x00, 0x00, 0xF4, 0xB6, /* 0x40-0x43 */
+	0xEC, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAE, /* 0x44-0x47 */
+	0xE7, 0xED, 0xFD, 0xC1, 0xDA, 0xE2, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xD8, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xDD, 0xE4, 0xF0, 0xEF, 0xF6, 0xF1, /* 0x50-0x53 */
+	0xFA, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF5, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xCF, 0x00, 0x00, /* 0x58-0x5B */
+	0xDC, 0xD4, 0x00, 0x00, 0xDC, 0xA6, 0x00, 0x00, /* 0x5C-0x5F */
+	0xEF, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xCF, 0x00, 0x00, /* 0x64-0x67 */
+	0xE0, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD6, /* 0x6C-0x6F */
+	0xEC, 0xD4, 0xEA, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xCA, 0xBF, 0xD5, 0xB0, 0x00, 0x00, 0xCF, 0xE9, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xF1, 0xED, 0x00, 0x00, 0xCC, 0xCF, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xE4, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xED, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xD7, 0xD8, 0x00, 0x00, 0xFD, 0xA7, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xAB, /* 0x9C-0x9F */
+	0xF6, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xCF, 0xF0, 0xF9, 0xBD, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xE6, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xDB, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xD1, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE9, 0xD1, 0xF3, 0xA9, 0xD0, 0xE0, 0xE9, 0xD2, /* 0xC8-0xCB */
+	0x00, 0x00, 0xDA, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE2, 0xD2, 0x00, 0x00, 0xF6, 0xA2, 0xE1, 0xF4, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xE4, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE7, 0xD5, 0xF5, 0xBF, 0xCF, 0xA2, /* 0xE0-0xE3 */
+	0xCD, 0xAF, 0xCF, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xCD, 0xB0, 0xF1, 0xFE, 0xD0, 0xA3, /* 0xE8-0xEB */
+	0xE1, 0xAF, 0xF8, 0xA3, 0x00, 0x00, 0xCA, 0xA6, /* 0xEC-0xEF */
+	0xF7, 0xBB, 0xF2, 0xEA, 0xDE, 0xC8, 0xE9, 0xD3, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xDE, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_54[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xDE, /* 0x00-0x03 */
+	0xCA, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xF9, 0xEA, 0xD1, 0xCE, 0xEE, 0xD4, 0x00, 0x00, /* 0x08-0x0B */
+	0xD4, 0xD2, 0xD9, 0xA3, 0xFD, 0xA8, 0xD7, 0xD9, /* 0x0C-0x0F */
+	0xF7, 0xCE, 0xFA, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD6, /* 0x18-0x1B */
+	0x00, 0x00, 0xD7, 0xF0, 0x00, 0x00, 0xEB, 0xE1, /* 0x1C-0x1F */
+	0xF8, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xFA, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xDD, 0xC3, 0x00, 0x00, 0xF9, 0xDF, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xEF, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xFD, 0xE5, 0xF6, 0xA3, 0x00, 0x00, 0xD9, 0xFC, /* 0x38-0x3B */
+	0xFD, 0xA9, 0x00, 0x00, 0xE7, 0xEE, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xE5, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xEF, 0xD0, 0x00, 0x00, 0xCD, 0xB1, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xF7, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xF1, 0xB2, 0x00, 0x00, 0xF1, 0xB1, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xCD, 0xB2, 0x00, 0x00, 0xDA, 0xAB, /* 0x70-0x73 */
+	0x00, 0x00, 0xCA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE2, /* 0x78-0x7B */
+	0xFB, 0xBC, 0xD9, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xEE, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xD3, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xFB, 0xFA, 0x00, 0x00, 0xCF, 0xA4, 0x00, 0x00, /* 0x8C-0x8F */
+	0xDC, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xF6, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xED, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA1, /* 0xA8-0xAB */
+	0xCE, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA6, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xF9, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xEC, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE4, 0xEE, 0xF9, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xFB, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xF9, 0xEB, 0xEE, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xEA, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xCA, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xF4, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xCD, 0xD6, 0xFC, 0xF6, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xC9, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xD4, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_55[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xF8, 0xA6, 0x00, 0x00, 0xDE, 0xCA, 0xF2, 0xC6, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xDA, 0x00, 0x00, /* 0x0C-0x0F */
+	0xD3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xD8, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE6, /* 0x2C-0x2F */
+	0x00, 0x00, 0xF3, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xE4, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xF6, 0xF2, 0x00, 0x00, 0xDF, 0xC2, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFD, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF6, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xBA, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xAF, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xE1, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xF0, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xCB, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xE0, 0xBC, 0x00, 0x00, 0xF4, 0xCA, 0xD4, 0xFA, /* 0x84-0x87 */
+	0x00, 0x00, 0xFD, 0xAA, 0xF9, 0xE2, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xF4, 0xB7, 0xFD, 0xC2, 0xFC, 0xB0, 0x00, 0x00, /* 0x98-0x9B */
+	0xFD, 0xEC, 0xCA, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xBD, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xEA, 0xE7, 0xDF, 0xC3, 0xD1, 0xD2, /* 0xA8-0xAB */
+	0xCE, 0xE2, 0x00, 0x00, 0xD3, 0xA4, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xFD, 0xAB, 0x00, 0x00, 0xDF, 0xE0, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xF2, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xF0, 0x00, 0x00, /* 0xD8-0xDB */
+	0xD0, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAA, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xCB, /* 0xE0-0xE3 */
+	0xF6, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE1, 0xF5, 0xF1, 0xB3, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_56[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xA3, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xCA, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xCF, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC4, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB0, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xBF, 0x00, 0x00, /* 0x30-0x33 */
+	0xF6, 0xA4, 0x00, 0x00, 0xE3, 0xB6, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC6, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xD0, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xED, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xDD, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xF7, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xE6, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xDE, 0xAD, 0x00, 0x00, 0xFA, 0xBF, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xE5, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xED, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xD2, 0xA5, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xFD, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xF5, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xF6, 0xDE, 0xCC, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xDE, 0x00, 0x00, /* 0xDC-0xDF */
+	0xEC, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xCD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xD6, 0xB7, 0xCD, 0xB3, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_57[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD5, /* 0x00-0x03 */
+	0xE5, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xCF, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD0, /* 0x08-0x0B */
+	0x00, 0x00, 0xEA, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xAE, 0xEA, 0xAD, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xF1, 0x00, 0x00, /* 0x14-0x17 */
+	0xD3, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xCF, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xEE, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xD0, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xF2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF0, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xF2, 0xA3, 0x00, 0x00, 0xF7, 0xF8, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB3, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xA9, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xD3, 0xBB, 0xCA, 0xEC, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF1, 0xA6, 0xCB, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xF7, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xCD, 0xDE, 0x00, 0x00, 0xF7, 0xA4, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xC0, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xDD, 0x00, 0x00, /* 0x6C-0x6F */
+	0xCC, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xCF, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xF6, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xF7, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xD3, 0xDC, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xFE, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA7, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xEB, 0xD9, 0x00, 0x00, 0xCF, 0xA7, 0xEA, 0xAF, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEF, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB9, /* 0xC4-0xC7 */
+	0xF1, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD8, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xF2, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB4, /* 0xDC-0xDF */
+	0xDC, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xF3, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFB, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xDB, 0xC6, 0xD0, 0xF1, 0x00, 0x00, /* 0xF8-0xFB */
+	0xD0, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_58[512] = {
+	0xCF, 0xDC, 0x00, 0x00, 0xD3, 0xD1, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xCC, 0xB1, 0xF7, 0xD8, 0x00, 0x00, /* 0x04-0x07 */
+	0xCB, 0xA8, 0xEB, 0xBC, 0xE4, 0xBE, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xDC, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xDC, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xF0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC0, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xED, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xEB, /* 0x2C-0x2F */
+	0xE5, 0xE8, 0xDC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xED, 0xDE, 0xD3, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xF7, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xD4, 0xE7, 0xAB, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xC3, /* 0x4C-0x4F */
+	0x00, 0x00, 0xE1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xF7, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF3, /* 0x54-0x57 */
+	0xD3, 0xD2, 0x00, 0x00, 0xF5, 0xC0, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xDD, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xEE, 0xF3, 0xE7, 0xF1, 0x00, 0x00, /* 0x60-0x63 */
+	0xFD, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xF2, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xF3, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xEE, 0xF4, 0x00, 0x00, 0xE2, 0xD3, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD1, /* 0x80-0x83 */
+	0x00, 0x00, 0xDF, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xE9, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD7, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xF5, 0xCD, 0x00, 0x00, 0xF1, 0xF2, 0xFA, 0xC7, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xD9, 0xF8, 0xD4, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xE5, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xC5, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xF2, 0xED, 0xDF, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xDB, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE8, 0xB5, 0x00, 0x00, 0xD3, 0xA6, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xB5, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xF9, 0xC9, 0x00, 0x00, 0xE4, 0xE2, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xFB, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xD7, 0xA4, 0xCE, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xD5, 0xD6, 0xE6, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xE5, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xCD, /* 0xE8-0xEB */
+	0xEC, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE0, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xEC, 0xEC, 0xFB, 0xBE, 0xDF, 0xEB, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE1, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_59[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBE, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xD0, 0xF3, 0xE0, 0xAA, 0xE8, 0xE2, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE2, 0xD4, 0xD2, 0xFD, 0x00, 0x00, /* 0x18-0x1B */
+	0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xD3, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDE, /* 0x24-0x27 */
+	0x00, 0x00, 0xF4, 0xB8, 0xF7, 0xBC, 0xDC, 0xFD, /* 0x28-0x2B */
+	0x00, 0x00, 0xE8, 0xEC, 0xE4, 0xE7, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xE3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xA8, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xF1, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xE5, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF4, /* 0x44-0x47 */
+	0xD2, 0xAF, 0xDC, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xA5, 0xF1, 0xB4, /* 0x4C-0x4F */
+	0xFC, 0xB1, 0xCC, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xDD, 0xC6, 0xFA, 0xD1, 0x00, 0x00, 0xF7, 0xDF, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xA8, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xEE, 0xF5, 0x00, 0x00, 0xDE, 0xCE, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF3, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xAC, 0xEB, 0xC4, /* 0x68-0x6B */
+	0xED, 0xE1, 0xE0, 0xAB, 0xDD, 0xC7, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB3, /* 0x70-0x73 */
+	0xD2, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xCA, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xFB, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xFD, 0xDD, 0xE5, /* 0x80-0x83 */
+	0xD8, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xF4, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF5, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xED, 0xD0, 0xD2, /* 0x94-0x97 */
+	0x00, 0x00, 0xD9, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xF6, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xDB, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xF7, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xD8, 0xD9, 0x00, 0x00, 0xF4, 0xA3, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xDD, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xD1, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xB5, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xED, 0xAB, 0x00, 0x00, 0xE3, 0xB7, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xEE, 0xBB, 0xCD, 0xB4, 0x00, 0x00, 0xE0, 0xF3, /* 0xD0-0xD3 */
+	0xEA, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xEC, 0xF5, 0xE8, 0xEE, 0x00, 0x00, /* 0xD8-0xDB */
+	0xCB, 0xA9, 0xF1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xCD, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xEC, 0xA9, 0x00, 0x00, 0xF2, 0xEB, 0x00, 0x00, /* 0xE8-0xEB */
+	0xFD, 0xEF, 0x00, 0x00, 0xF9, 0xF3, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xE6, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD8, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xAC, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5A[512] = {
+	0x00, 0x00, 0xEA, 0xCE, 0x00, 0x00, 0xE8, 0xDF, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xDE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xD2, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF4, /* 0x18-0x1B */
+	0xD1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC2, /* 0x1C-0x1F */
+	0xE3, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xE4, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xD8, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xA5, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xF3, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xD7, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xE8, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xE8, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xE6, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xE6, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xFE, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xDA, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xAC, 0xEA, 0xB0, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE3, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xCA, 0xAA, 0xE1, 0xF9, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xEA, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xF2, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xFA, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xEE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xF4, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xD2, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+};
+
+static const unsigned char u2c_5B[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xFB, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xFD, 0xF0, 0x00, 0x00, 0xE0, 0xBD, /* 0x08-0x0B */
+	0xCE, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xC6, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xAE, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xDF, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xBE, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xED, 0xAD, 0xFA, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xCD, 0xEE, 0xED, 0xA6, 0x00, 0x00, 0xED, 0xAE, /* 0x54-0x57 */
+	0xF0, 0xED, 0x00, 0x00, 0xDD, 0xA1, 0x00, 0x00, /* 0x58-0x5B */
+	0xED, 0xAF, 0xFC, 0xF8, 0x00, 0x00, 0xD8, 0xEB, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF9, /* 0x60-0x63 */
+	0xCD, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xFA, 0xA9, 0x00, 0x00, 0xE1, 0xDD, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE2, 0xD5, 0xED, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xDD, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xF9, 0xCA, 0x00, 0x00, 0xEA, 0xE8, 0x00, 0x00, /* 0x78-0x7B */
+	0xE5, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xD3, 0xEB, 0x00, 0x00, 0xE9, 0xD4, /* 0x84-0x87 */
+	0xE1, 0xFA, 0xE4, 0xCC, 0x00, 0x00, 0xE1, 0xE4, /* 0x88-0x8B */
+	0xE8, 0xC7, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xDB, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xD5, /* 0x90-0x93 */
+	0x00, 0x00, 0xF7, 0xB5, 0xFC, 0xF3, 0xF0, 0xF3, /* 0x94-0x97 */
+	0xCE, 0xAF, 0xF1, 0xB5, 0xEF, 0xD2, 0xE8, 0xC8, /* 0x98-0x9B */
+	0xEB, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xD4, 0xE0, 0xBE, /* 0xA0-0xA3 */
+	0xE3, 0xF8, 0xEA, 0xE9, 0xFC, 0xB2, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE0, 0xF4, 0x00, 0x00, 0xCF, 0xE0, 0x00, 0x00, /* 0xAC-0xAF */
+	0xEE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAA, /* 0xB0-0xB3 */
+	0xE6, 0xC3, 0xE1, 0xB2, 0xCA, 0xAB, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE3, 0xE4, 0xE9, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD6, /* 0xBC-0xBF */
+	0xF3, 0xF2, 0x00, 0x00, 0xEE, 0xD6, 0xEA, 0xB2, /* 0xC0-0xC3 */
+	0xD0, 0xF6, 0xEC, 0xD9, 0xDA, 0xCB, 0xCF, 0xA8, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xDD, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xD8, 0xDB, 0x00, 0x00, 0xF9, 0xCE, 0xE9, 0xD5, /* 0xD0-0xD3 */
+	0xE3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xBC, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xAC, 0xF3, 0xCC, /* 0xDC-0xDF */
+	0x00, 0x00, 0xCD, 0xFB, 0xF6, 0xD6, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xE7, 0xF5, 0xE8, 0xEF, 0xE3, 0xF9, 0xD2, 0xBB, /* 0xE4-0xE7 */
+	0xF3, 0xF3, 0xE3, 0xFB, 0x00, 0x00, 0xDE, 0xD0, /* 0xE8-0xEB */
+	0xCE, 0xB0, 0x00, 0x00, 0xD6, 0xF7, 0xF1, 0xD9, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xF5, 0xC1, 0xDC, 0xC4, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xF5, 0xBB, 0x00, 0x00, 0xDE, 0xD1, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_5C[512] = {
+	0x00, 0x00, 0xDC, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xDE, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE2, /* 0x04-0x07 */
+	0xEE, 0xF6, 0xEA, 0xCF, 0xF0, 0xEE, 0xE3, 0xFC, /* 0x08-0x0B */
+	0x00, 0x00, 0xD3, 0xDF, 0xD3, 0xF4, 0xE1, 0xB3, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE1, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xD3, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xDF, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xE9, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xDB, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xF6, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xE3, 0xB9, 0xEB, 0xC5, 0xF4, 0xA9, 0xCD, 0xB6, /* 0x38-0x3B */
+	0xD2, 0xF9, 0x00, 0x00, 0xDA, 0xAD, 0xD2, 0xE3, /* 0x3C-0x3F */
+	0xCF, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xCB, 0xDC, 0xCC, 0xFA, 0x00, 0x00, /* 0x44-0x47 */
+	0xCF, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA9, /* 0x48-0x4B */
+	0x00, 0x00, 0xE3, 0xBB, 0xE3, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xE0, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xEE, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB3, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xD3, 0xF5, 0x00, 0x00, 0xD7, 0xA6, 0x00, 0x00, /* 0x60-0x63 */
+	0xF6, 0xB5, 0xD7, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xEA, /* 0x6C-0x6F */
+	0x00, 0x00, 0xDF, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xFD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xD0, 0xF7, 0xED, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xCB, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xE4, 0xDB, 0x00, 0x00, 0xE1, 0xFB, /* 0xA8-0xAB */
+	0xCB, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xD3, 0xE0, 0x00, 0x00, 0xE4, 0xBF, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xFB, 0xC0, 0x00, 0x00, 0xDA, 0xBE, /* 0xB4-0xB7 */
+	0xE4, 0xCD, 0x00, 0x00, 0xD6, 0xB9, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xC0, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE1, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xF6, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xDF, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xE4, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE7, /* 0xEC-0xEF */
+	0xDC, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xFA, 0xD6, 0x00, 0x00, 0xD3, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xDA, /* 0xF8-0xFB */
+	0x00, 0x00, 0xFA, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5D[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xFD, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xD5, 0xCF, 0xD0, 0xF8, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xCD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xF5, 0xCB, 0x00, 0x00, 0xE4, 0xF0, 0xCB, 0xAB, /* 0x14-0x17 */
+	0x00, 0x00, 0xD7, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xFE, /* 0x24-0x27 */
+	0x00, 0x00, 0xDD, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAE, /* 0x48-0x4B */
+	0xCA, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xD5, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE3, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE8, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAB, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xA9, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF7, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xD4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xCE, 0xE4, 0x00, 0x00, 0xE8, 0xF2, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xF5, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE7, 0xAE, 0x00, 0x00, 0xD6, 0xBA, 0x00, 0x00, /* 0xB8-0xBB */
+	0xDF, 0xEC, 0xE4, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE8, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xB5, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xDC, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xF4, 0xB9, 0xF1, 0xB6, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE2, 0xDE, 0xE1, 0xB5, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xCD, 0xEF, 0xF1, 0xA7, 0xCE, 0xE5, /* 0xE4-0xE7 */
+	0xCB, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE3, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xAC, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xD0, 0xF9, 0xEC, 0xAB, 0xDE, 0xD3, /* 0xF0-0xF3 */
+	0xF7, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xF5, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE1, 0xDE, 0xCB, 0xEE, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xBC, 0xF8, 0xD6, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xEE, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xFD, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xF7, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xDE, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xF2, 0xED, 0x00, 0x00, 0xDB, 0xD9, /* 0x18-0x1B */
+	0x00, 0x00, 0xF0, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xE1, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xD4, /* 0x28-0x2B */
+	0x00, 0x00, 0xE0, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE3, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xE1, 0x00, 0x00, /* 0x34-0x37 */
+	0xDF, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xD9, 0xB6, 0x00, 0x00, 0xFD, 0xAC, /* 0x3C-0x3F */
+	0xEF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xE4, 0xC1, 0xF8, 0xEB, 0x00, 0x00, 0xDB, 0xAC, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xFC, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xD8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xBA, /* 0x5C-0x5F */
+	0x00, 0x00, 0xDB, 0xDF, 0xD3, 0xD3, 0xF8, 0xC7, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xCE, 0xF8, 0xC1, /* 0x70-0x73 */
+	0xD2, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB4, /* 0x74-0x77 */
+	0xFA, 0xB9, 0xCA, 0xCF, 0x00, 0x00, 0xFC, 0xB3, /* 0x78-0x7B */
+	0xEA, 0xEA, 0xEA, 0xEB, 0xD0, 0xFA, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xED, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE7, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xC9, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xED, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xEE, 0xBC, 0x00, 0x00, 0xEF, 0xC1, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xD2, 0x00, 0x00, /* 0x98-0x9B */
+	0xDD, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xDF, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xF8, 0xF1, 0xA8, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xB7, /* 0xA8-0xAB */
+	0x00, 0x00, 0xEF, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE4, 0xDD, 0xDF, 0xEE, 0xCB, 0xAC, /* 0xB4-0xB7 */
+	0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xEC, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xCB, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xF9, 0xBF, 0xD6, 0xAF, 0xD5, 0xC6, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xCF, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA9, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xF8, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xB7, 0xEE, 0xF8, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD9, /* 0xDC-0xDF */
+	0xF3, 0xDF, 0x00, 0x00, 0xF8, 0xC8, 0xCE, 0xC6, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xD5, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE6, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xC5, 0xEF, 0xD5, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xEF, 0xFC, 0xDF, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_5F[512] = {
+	0x00, 0x00, 0xDC, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xD6, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xC9, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD2, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE3, 0xBD, 0x00, 0x00, 0xCF, 0xE1, /* 0x10-0x13 */
+	0xF0, 0xC0, 0xEC, 0xDA, 0x00, 0x00, 0xDD, 0xD7, /* 0x14-0x17 */
+	0xFB, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xAC, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA9, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xD7, 0xFB, 0xC1, /* 0x24-0x27 */
+	0x00, 0x00, 0xD2, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xE5, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xED, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xAD, 0x00, 0x00, /* 0x38-0x3B */
+	0xF9, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xF7, 0xA5, 0x00, 0x00, 0xCB, 0xAE, 0x00, 0x00, /* 0x48-0x4B */
+	0xDA, 0xAF, 0x00, 0x00, 0xD8, 0xB6, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xA7, 0xFB, 0xB2, /* 0x54-0x57 */
+	0x00, 0x00, 0xFD, 0xC4, 0x00, 0x00, 0xEC, 0xAD, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xA1, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xE9, 0xE9, 0xEE, /* 0x64-0x67 */
+	0x00, 0x00, 0xF3, 0xF4, 0xF8, 0xF3, 0xF0, 0xC1, /* 0x68-0x6B */
+	0xDE, 0xAF, 0xF8, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xF3, 0xE0, 0xE7, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAD, /* 0x74-0x77 */
+	0x00, 0x00, 0xE6, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xF9, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD8, /* 0x7C-0x7F */
+	
+	0xE8, 0xD9, 0xEF, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xD3, 0xE2, 0x00, 0x00, 0xE2, 0xDF, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xE0, 0xD7, 0xC8, /* 0x88-0x8B */
+	0xFD, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xDF, 0xEF, 0xCC, 0xD3, 0xD3, 0xF9, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF0, /* 0x94-0x97 */
+	0xDB, 0xC7, 0xDE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xF4, 0x00, 0x00, /* 0x9C-0x9F */
+	0xD5, 0xD0, 0xE5, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xFC, 0xC7, 0xDC, 0xD6, 0xE2, 0xE0, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xB0, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xF3, 0xA3, 0x00, 0x00, 0xD3, 0xEC, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xF4, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xFD, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFD, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xF9, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xD0, 0xFB, 0xEC, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xBC, 0xF2, 0xA4, /* 0xD4-0xD7 */
+	0xD8, 0xCE, 0xD8, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xF5, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE1, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xD2, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xFB, 0xEC, 0x00, 0x00, 0xDD, 0xC8, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_60[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE8, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xD2, 0xC1, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xD7, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xD6, 0xBB, 0xDE, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xF7, 0xBD, 0xEC, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xD0, 0xE1, 0x00, 0x00, 0xE0, 0xF5, /* 0x24-0x27 */
+	0xEA, 0xB3, 0x00, 0x00, 0xCE, 0xD6, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xA5, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xEC, 0xF6, 0xE2, 0xE1, 0xE3, 0xBE, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xFC, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xCD, 0xF0, 0x00, 0x00, 0xF9, 0xF6, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xDF, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xE5, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xCE, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xE1, 0xED, 0xB0, /* 0x60-0x63 */
+	0xFD, 0xD1, 0xF6, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xF9, 0xCF, 0xEB, 0xDA, 0xCA, 0xC1, 0x00, 0x00, /* 0x68-0x6B */
+	0xD2, 0xB8, 0xCD, 0xF1, 0x00, 0x00, 0xE3, 0xD3, /* 0x6C-0x6F */
+	0xFD, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xE6, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xE3, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xF0, 0xAA, 0xF9, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xFC, 0xE2, 0x00, 0x00, 0xF8, 0xA7, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xE5, 0xEE, 0xF9, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF6, /* 0x9C-0x9F */
+	0xEA, 0xED, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xB4, /* 0xA0-0xA3 */
+	0xF5, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDC, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xF0, 0xF5, 0x00, 0x00, 0xDD, 0xE8, 0xD3, 0xED, /* 0xB0-0xB3 */
+	0xF5, 0xFC, 0x00, 0x00, 0xDA, 0xBF, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xCC, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xD3, 0xFA, 0xF4, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xEF, 0xD7, 0x00, 0x00, 0xD4, 0xC3, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xFB, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xED, 0x00, 0x00, /* 0xD8-0xDB */
+	0xE0, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xEE, /* 0xDC-0xDF */
+	0xFB, 0xB3, 0xE4, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xF6, 0xE7, 0xD2, 0xDD, 0x00, 0x00, 0xDF, 0xCC, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xC9, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE5, 0xA9, 0xE0, 0xF6, 0xF6, 0xB3, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_61[512] = {
+	0x00, 0x00, 0xE1, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xF0, 0x00, 0x00, /* 0x04-0x07 */
+	0xEA, 0xEF, 0xEA, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xDA, 0xC0, 0xF8, 0xB4, 0xEB, 0xF2, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xE4, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xD7, 0xE4, 0xF1, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xEF, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD7, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xFC, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xF3, 0xE1, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xC4, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xE3, 0xE5, 0x00, 0x00, 0xCB, 0xC5, 0xEA, 0xB4, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xBD, 0x00, 0x00, /* 0x40-0x43 */
+	0xD7, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xDB, /* 0x44-0x47 */
+	0xED, 0xB1, 0x00, 0x00, 0xCC, 0xC3, 0xF7, 0xBE, /* 0x48-0x4B */
+	0xFC, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xF4, /* 0x50-0x53 */
+	0x00, 0x00, 0xD9, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xF3, 0xD3, 0xF3, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xF7, 0xE4, 0x00, 0x00, 0xF7, 0xD1, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xB7, 0xCE, 0xB1, /* 0x60-0x63 */
+	0xCA, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xB4, /* 0x64-0x67 */
+	0xCB, 0xC6, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xF6, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xE7, 0x00, 0x00, /* 0x6C-0x6F */
+	0xEA, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xD4, 0xCB, 0xAF, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xF4, 0xAA, 0xE9, 0xAF, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xF5, 0xC3, 0xE9, 0xD8, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xE9, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xF3, 0x00, 0x00, /* 0x8C-0x8F */
+	0xD5, 0xFB, 0xDE, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xF4, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xFD, 0xF3, 0xFD, 0xF2, 0xF7, 0xA6, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xDD, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD3, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xCC, 0xA8, 0x00, 0x00, 0xDA, 0xC1, /* 0xA8-0xAB */
+	0xCC, 0xD5, 0x00, 0x00, 0xD9, 0xE4, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xCA, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xE3, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xBC, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC4, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD0, /* 0xC4-0xC7 */
+	0xFA, 0xAB, 0xEB, 0xEB, 0xE7, 0xF8, 0xD9, 0xE5, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xD7, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xA4, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xFB, 0xFC, 0xE3, /* 0xF4-0xF7 */
+	0xFA, 0xD8, 0x00, 0x00, 0xF3, 0xD5, 0x00, 0x00, /* 0xF8-0xFB */
+	0xCF, 0xAB, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xF3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_62[512] = {
+	0xD5, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD4, /* 0x04-0x07 */
+	0xCD, 0xFC, 0x00, 0x00, 0xD9, 0xE6, 0x00, 0x00, /* 0x08-0x0B */
+	0xE2, 0xF9, 0xE2, 0xA1, 0xEB, 0xD4, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE0, 0xF7, 0xE4, 0xB2, 0xCC, 0xFC, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xE4, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xAB, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xBD, /* 0x1C-0x1F */
+	0x00, 0x00, 0xCA, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xB8, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xC0, 0x00, 0x00, /* 0x2C-0x2F */
+	0xEE, 0xFA, 0xFD, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xD3, 0xE3, 0x00, 0x00, 0xFB, 0xC2, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xE8, 0xDB, 0xAE, /* 0x3C-0x3F */
+	0xE1, 0xB6, 0xF8, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xBF, /* 0x44-0x47 */
+	0xFB, 0xC3, 0xDD, 0xEA, 0x00, 0x00, 0xE2, 0xA2, /* 0x48-0x4B */
+	0x00, 0x00, 0xEE, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE8, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xF6, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xCA, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xD0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xA6, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xDD, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xE4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAF, /* 0x7C-0x7F */
+	
+	0xD0, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xF4, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xCC, 0xBC, 0xF7, 0xEA, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE5, 0xE4, 0xDF, 0xF1, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xF7, 0xE1, 0x00, 0x00, 0xF9, 0xF7, /* 0x94-0x97 */
+	0xEF, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD8, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA9, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xF8, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xEE, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xD8, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xE4, 0xE3, 0xF5, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xD9, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE7, /* 0xC4-0xC7 */
+	0xD2, 0xB9, 0xD5, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xDA, 0xE5, 0xDA, 0xD0, 0x00, 0x00, 0xD1, 0xD9, /* 0xCC-0xCF */
+	0xCE, 0xD8, 0x00, 0x00, 0xCB, 0xDE, 0xF4, 0xAC, /* 0xD0-0xD3 */
+	0xDA, 0xFB, 0x00, 0x00, 0xF6, 0xE9, 0xE8, 0xF3, /* 0xD4-0xD7 */
+	0xCF, 0xAC, 0xF0, 0xF0, 0x00, 0x00, 0xF4, 0xFD, /* 0xD8-0xDB */
+	0xDB, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xCE, 0xC0, 0xE3, 0xD4, 0xD1, 0xCF, 0xF1, 0xF5, /* 0xEC-0xEF */
+	0x00, 0x00, 0xCD, 0xF2, 0x00, 0x00, 0xCF, 0xEB, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xB8, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xA6, 0xD1, 0xDA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_63[512] = {
+	0x00, 0x00, 0xF2, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA6, /* 0x04-0x07 */
+	0x00, 0x00, 0xE4, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xD3, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xA9, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC9, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xD8, 0xE6, 0xC9, /* 0x38-0x3B */
+	0x00, 0x00, 0xD8, 0xB8, 0xFA, 0xF3, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xF3, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xF8, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF3, /* 0x4C-0x4F */
+	0xE6, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xF8, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE9, /* 0x64-0x67 */
+	0xDE, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xDF, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xCF, 0xEC, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDF, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xF4, 0xD2, 0xBA, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF2, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB7, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xE2, 0xA3, 0xD3, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xED, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xC9, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xFA, 0x00, 0x00, /* 0x94-0x97 */
+	0xCF, 0xDE, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD0, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xD5, 0xD3, 0xF3, 0xF5, 0xF7, 0xAE, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xEF, 0xC8, 0x00, 0x00, 0xCD, 0xF3, /* 0xA4-0xA7 */
+	0xF5, 0xCF, 0xE5, 0xF3, 0xF0, 0xC2, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xCA, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xEA, 0xF1, 0x00, 0x00, 0xD0, 0xA6, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xDA, /* 0xCC-0xCF */
+	0xF0, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xE7, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xC0, 0xFC, 0xB5, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE4, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xCC, 0xA9, 0xFD, 0xC6, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xEA, 0xB5, 0x00, 0x00, 0xE5, 0xAA, 0xDF, 0xBA, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_64[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE1, 0xDF, 0x00, 0x00, 0xDA, 0xD1, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xE1, 0xB8, 0x00, 0x00, 0xE8, 0xF4, 0xD3, 0xFD, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xE2, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xCA, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xDA, 0xE6, 0xF7, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xCD, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xB6, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xEE, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xF5, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xD8, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA7, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xD9, 0xB8, 0xD9, 0xB9, 0xEF, 0xC9, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xD6, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xF7, 0xCB, 0xDF, 0xAE, 0xE8, 0xF5, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xD2, 0xB5, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xD5, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xF4, 0xCC, 0xDA, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE8, /* 0xA8-0xAB */
+	0x00, 0x00, 0xF7, 0xEB, 0xF5, 0xC9, 0x00, 0x00, /* 0xAC-0xAF */
+	0xF3, 0xBC, 0x00, 0x00, 0xDA, 0xD2, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB5, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xE8, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xD6, 0xCF, 0xF4, 0xBA, 0x00, 0x00, 0xF7, 0xC9, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xAA, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xF0, 0xC3, 0xCC, 0xD6, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xD3, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xD3, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xDB, 0xFB, 0x00, 0x00, 0xCB, 0xE0, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xD3, 0xE4, 0xF6, 0xF7, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xD5, 0xBA, 0xF3, 0xCD, 0xCB, 0xE1, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xEB, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xAD, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xFC, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xEC, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xF6, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_65[512] = {
+	0xDA, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xF7, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xE5, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xE0, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xFD, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xE6, 0xFC, 0xAB, /* 0x28-0x2B */
+	0xD5, 0xBB, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA8, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xA5, 0xCD, 0xB9, /* 0x34-0x37 */
+	0xEA, 0xF2, 0xCB, 0xC7, 0x00, 0x00, 0xCD, 0xF4, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xAF, 0xEF, 0xD9, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xCD, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xFC, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xDF, 0xF3, 0xCE, 0xE7, 0xDA, 0xC2, /* 0x4C-0x4F */
+	0x00, 0x00, 0xCF, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xF9, 0xF8, 0xA8, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xE2, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xF2, 0xDF, 0xA4, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xC4, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xCC, 0xD7, 0xE5, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xBB, 0x00, 0x00, /* 0x70-0x73 */
+	0xEF, 0xDA, 0xEE, 0xD8, 0x00, 0x00, 0xDD, 0xA7, /* 0x74-0x77 */
+	0xE2, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xC0, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xB0, 0xF8, 0xCA, /* 0x80-0x83 */
+	0x00, 0x00, 0xFC, 0xFA, 0x00, 0x00, 0xD9, 0xFE, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xDE, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xDD, 0xEC, 0xDA, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE0, /* 0x94-0x97 */
+	0x00, 0x00, 0xD6, 0xF9, 0x00, 0x00, 0xCD, 0xD7, /* 0x98-0x9B */
+	0xDE, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF8, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE4, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xD0, 0xC5, 0xF4, 0xAE, 0x00, 0x00, 0xDD, 0xA8, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xC5, /* 0xA8-0xAB */
+	0xF3, 0xD6, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xD9, /* 0xAC-0xAF */
+	0xE3, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA8, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xDB, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xE5, 0xDA, 0xE3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xDB, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xD5, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC1, /* 0xC8-0xCB */
+	0xEF, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE9, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xB2, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFD, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xD9, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFE, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xEC, 0xED, 0xD3, 0xA9, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xF2, 0xA9, 0xF0, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xE2, 0xE2, 0xE9, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xF9, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE9, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xDA, 0xDA, 0xC3, /* 0xF8-0xFB */
+	0xDA, 0xC4, 0xD4, 0xC5, 0x00, 0x00, 0xE7, 0xFA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_66[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xE0, 0xE3, 0xB0, /* 0x04-0x07 */
+	0x00, 0x00, 0xDB, 0xB2, 0xFB, 0xC4, 0x00, 0x00, /* 0x08-0x0B */
+	0xF3, 0xE3, 0x00, 0x00, 0xD9, 0xA5, 0xFB, 0xE7, /* 0x0C-0x0F */
+	0xDD, 0xCB, 0xD0, 0xD4, 0x00, 0x00, 0xE6, 0xB6, /* 0x10-0x13 */
+	0xE0, 0xAE, 0xFD, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xB5, 0xE0, 0xF8, /* 0x1C-0x1F */
+	0xE7, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xF5, 0xF0, 0x00, 0x00, 0xD8, 0xDC, /* 0x24-0x27 */
+	0xED, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xE1, 0xB9, 0x00, 0x00, 0xE3, 0xC0, /* 0x2C-0x2F */
+	0xF9, 0xC0, 0xE9, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xD9, 0xDB, 0x00, 0x00, 0xF3, 0xE4, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xB6, 0xE4, 0xE9, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xF0, 0xC5, 0xE3, 0xC1, 0xFC, 0xCC, /* 0x40-0x43 */
+	0xFC, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xF2, 0xCB, 0x00, 0x00, 0xF2, 0xCC, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCF, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xF1, 0xDB, 0x00, 0x00, 0xFA, 0xD9, /* 0x58-0x5B */
+	0x00, 0x00, 0xF1, 0xB8, 0xFD, 0xF5, 0xE0, 0xF9, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xE7, 0xFB, 0xFC, 0xB7, 0xFC, 0xE4, 0xFB, 0xC5, /* 0x64-0x67 */
+	0xE3, 0xE7, 0xD8, 0xB9, 0x00, 0x00, 0xF6, 0xF8, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xC5, 0xCC, 0xD8, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAF, /* 0x70-0x73 */
+	0xF4, 0xE7, 0x00, 0x00, 0xEF, 0xDC, 0xCF, 0xFC, /* 0x74-0x77 */
+	0xEF, 0xDD, 0x00, 0x00, 0xF2, 0xAA, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xFD, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAC, /* 0x84-0x87 */
+	0xFD, 0xBB, 0xFD, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xB2, 0x00, 0x00, /* 0x8C-0x8F */
+	0xEA, 0xD1, 0xDF, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xEC, 0xE4, 0xDE, /* 0x94-0x97 */
+	0xE5, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xD9, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xCD, 0xBC, 0x00, 0x00, 0xF3, 0xE5, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD5, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xBA, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xE7, 0xFB, 0xB5, /* 0xB0-0xB3 */
+	0xF8, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE0, 0xE7, 0x00, 0x00, 0xCC, 0xD9, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xC6, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xE7, 0xA5, 0x00, 0x00, 0xD5, 0xF5, 0xD3, 0xBE, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xFC, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xF2, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xDF, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xE8, 0xF8, 0xF8, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xCE, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xF6, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xE8, 0xD8, 0x00, 0x00, 0xCD, 0xD8, 0xE7, 0xD6, /* 0xF0-0xF3 */
+	0xCC, 0xDA, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE3, /* 0xF4-0xF7 */
+	0xDF, 0xF6, 0xF0, 0xC7, 0xF0, 0xC6, 0x00, 0x00, /* 0xF8-0xFB */
+	0xD8, 0xBA, 0x00, 0x00, 0xF1, 0xF4, 0xF4, 0xF0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_67[512] = {
+	0xF5, 0xCC, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE5, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xEA, 0xC5, 0xEA, 0xF3, 0x00, 0x00, 0xDD, 0xDB, /* 0x08-0x0B */
+	0x00, 0x00, 0xDC, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xDE, 0xFD, 0xF2, 0xF9, 0x00, 0x00, 0xD5, 0xC7, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD0, /* 0x18-0x1B */
+	0x00, 0x00, 0xF0, 0xC8, 0xD1, 0xA1, 0xD1, 0xA2, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xD4, 0xD6, 0xE8, /* 0x24-0x27 */
+	0xD9, 0xCA, 0x00, 0x00, 0xDA, 0xB1, 0xD8, 0xC7, /* 0x28-0x2B */
+	0xDC, 0xE2, 0xF3, 0xCE, 0xF5, 0xF4, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xF1, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xDA, 0xD3, 0x00, 0x00, 0xF6, 0xEA, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xCF, 0xF5, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xFD, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xD2, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xDF, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xDD, 0xFA, 0xBA, /* 0x4C-0x4F */
+	0xEE, 0xA7, 0xF5, 0xBD, 0x00, 0x00, 0xF8, 0xF5, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xE8, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xD4, 0xE1, 0x00, 0x00, 0xD1, 0xA3, 0xE1, 0xD6, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xF9, 0xF8, 0x00, 0x00, 0xDB, 0xCA, /* 0x6C-0x6F */
+	0xCB, 0xF9, 0xD4, 0xD4, 0x00, 0x00, 0xD9, 0xDC, /* 0x70-0x73 */
+	0x00, 0x00, 0xEE, 0xBE, 0x00, 0x00, 0xF7, 0xED, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xEE, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0xF7, 0xF9, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xED, /* 0x84-0x87 */
+	0x00, 0x00, 0xE8, 0xDB, 0x00, 0x00, 0xDB, 0xB3, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF7, /* 0x8C-0x8F */
+	0xE0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE2, /* 0x90-0x93 */
+	0x00, 0x00, 0xF6, 0xD7, 0x00, 0x00, 0xD7, 0xF9, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xDD, 0x00, 0x00, /* 0x98-0x9B */
+	0xCD, 0xFD, 0xF2, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xBD, /* 0xAC-0xAF */
+	0xF8, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xAC, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xAD, 0xCA, 0xAE, /* 0xB4-0xB7 */
+	0xCF, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xC2, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xDC, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xDA, /* 0xCC-0xCF */
+	0xD9, 0xBB, 0xCA, 0xF3, 0xF6, 0xD3, 0xE6, 0xF8, /* 0xD0-0xD3 */
+	0xEA, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xF6, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xF6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xCF, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xCA, 0xD3, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAF, /* 0xEC-0xEF */
+	0xD2, 0xB0, 0xF1, 0xBA, 0x00, 0x00, 0xD7, 0xB3, /* 0xF0-0xF3 */
+	0xE3, 0xC3, 0xF3, 0xFD, 0xDE, 0xDA, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDB, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_68[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xE3, 0xEE, 0xFB, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xF7, 0xD7, 0xCA, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xCE, 0xE8, 0xDB, 0xDB, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xBB, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF1, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xFA, 0xB7, 0xD0, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xCC, 0xAB, 0xEE, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xCB, 0xFA, 0xF9, 0xF9, 0xCC, 0xFD, 0xD3, 0xFE, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xE4, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xEE, 0x00, 0x00, /* 0x4C-0x4F */
+	0xD4, 0xD5, 0xDF, 0xCD, 0x00, 0x00, 0xFC, 0xB8, /* 0x50-0x53 */
+	0xD1, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xF2, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xD2, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD4, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xD5, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xD8, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD9, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA9, /* 0x90-0x93 */
+	0xF6, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xDB, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xF0, 0xC9, 0x00, 0x00, 0xFC, 0xFC, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE8, 0xC9, 0xF4, 0xFE, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xFC, /* 0xA4-0xA7 */
+	0xD7, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xDE, 0xDC, 0x00, 0x00, 0xF0, 0xAC, /* 0xAC-0xAF */
+	0xCC, 0xFE, 0xCD, 0xE1, 0x00, 0x00, 0xE1, 0xBA, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xDB, 0xEF, 0xDA, 0xB2, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xD1, 0xA5, 0xDC, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xD8, 0xF6, 0x00, 0x00, 0xD1, 0xA4, /* 0xC8-0xCB */
+	0x00, 0x00, 0xCD, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xEA, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xF0, 0xF7, 0x00, 0x00, 0xF0, 0xCA, /* 0xD4-0xD7 */
+	0xD0, 0xBE, 0x00, 0x00, 0xDD, 0xDC, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD6, /* 0xDC-0xDF */
+	0xD3, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD0, /* 0xE4-0xE7 */
+	0xCD, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xD4, 0xA1, 0xCE, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_69[512] = {
+	0xE8, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xEB, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE3, 0xD5, 0xF5, 0xD0, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xA1, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xA7, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE5, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xE6, 0xCB, 0x00, 0x00, 0xF5, 0xF1, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xC5, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA3, /* 0x50-0x53 */
+	0xE0, 0xDB, 0xF6, 0xEB, 0x00, 0x00, 0xCB, 0xF1, /* 0x54-0x57 */
+	0x00, 0x00, 0xD9, 0xEA, 0xF5, 0xA2, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xD1, 0x00, 0x00, /* 0x5C-0x5F */
+	0xD1, 0xF8, 0xEA, 0xF8, 0xEA, 0xF9, 0xDA, 0xB3, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xEF, 0xDF, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xEF, /* 0x68-0x6B */
+	0x00, 0x00, 0xE5, 0xF6, 0xEE, 0xBF, 0xE2, 0xE4, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xD0, 0xBF, 0x00, 0x00, 0xFA, 0xAC, /* 0x74-0x77 */
+	0xF5, 0xD1, 0xE7, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xE9, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xCE, /* 0x98-0x9B */
+	0xDB, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xFC, 0xCE, 0x00, 0x00, 0xDD, 0xEE, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xB4, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xD7, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xB4, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xCD, 0xBE, 0x00, 0x00, 0xDA, 0xE9, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB0, /* 0xC8-0xCB */
+	0xF7, 0xD9, 0xF3, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xCE, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xCE, 0xAA, 0x00, 0x00, 0xCB, 0xC8, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA7, /* 0xF8-0xFB */
+	0x00, 0x00, 0xF0, 0xCB, 0x00, 0x00, 0xD0, 0xC7, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6A[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xC5, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xE0, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xD5, 0xDA, 0x00, 0x00, 0xD7, 0xA7, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC0, /* 0x14-0x17 */
+	0x00, 0x00, 0xF8, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xD2, 0xED, 0xE9, /* 0x1C-0x1F */
+	0x00, 0x00, 0xD9, 0xBC, 0x00, 0x00, 0xE5, 0xC6, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xF5, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xDA, 0xD4, 0xE2, 0xA7, 0xFB, 0xFC, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xF1, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xCA, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xE8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xE9, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xF8, 0xE2, 0xE5, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xD0, 0xB9, 0xD4, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA6, /* 0x5C-0x5F */
+	0x00, 0x00, 0xDF, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF4, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xD3, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xCC, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xEF, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xE5, 0xE5, 0xD0, 0xD5, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xFC, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xFC, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xFE, 0xED, 0xEA, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB1, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xE3, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xA2, 0xCF, 0xF6, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD0, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xEA, 0xF1, 0xEE, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xCB, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA1, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_6B[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xD5, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xCF, 0xED, 0x00, 0x00, /* 0x08-0x0B */
+	0xED, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xB2, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xBC, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xFD, 0xE2, 0xF3, 0xAD, 0x00, 0x00, 0xFD, 0xDB, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xB0, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xA7, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xFD, 0xE3, 0xCE, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xE4, 0xFA, 0xCE, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xCA, 0xB0, 0x00, 0x00, 0xF7, 0xA7, 0x00, 0x00, /* 0x4C-0x4F */
+	0xCF, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA2, /* 0x5C-0x5F */
+	0x00, 0x00, 0xFC, 0xB6, 0xF2, 0xAD, 0xEF, 0xE1, /* 0x60-0x63 */
+	0xF3, 0xAE, 0xDC, 0xC6, 0xD9, 0xEB, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xE0, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xA8, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF6, /* 0x74-0x77 */
+	0xCF, 0xFD, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDD, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD1, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEA, /* 0x80-0x83 */
+	0xF2, 0xCF, 0x00, 0x00, 0xF7, 0xBF, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xE2, 0xE6, 0xE2, 0xA8, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xD6, 0x00, 0x00, /* 0x94-0x97 */
+	0xED, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xF9, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xB1, 0xDE, 0xB2, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xE8, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xD3, 0xAB, 0x00, 0x00, 0xEB, 0xDC, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xAF, 0x00, 0x00, /* 0xB8-0xBB */
+	0xCA, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xFC, /* 0xBC-0xBF */
+	0x00, 0x00, 0xFD, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xEB, 0xF6, 0xCF, 0xB2, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xEC, /* 0xC8-0xCB */
+	0x00, 0x00, 0xD9, 0xBD, 0x00, 0x00, 0xD8, 0xDF, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xB8, 0xEB, 0xBE, /* 0xD0-0xD3 */
+	0xDD, 0xEF, 0x00, 0x00, 0xDD, 0xF0, 0xDD, 0xF1, /* 0xD4-0xD7 */
+	0xDD, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xBE, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC6, /* 0xE8-0xEB */
+	0xCF, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+};
+
+static const unsigned char u2c_6C[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xEE, 0xFD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAB, /* 0x0C-0x0F */
+	0x00, 0x00, 0xDA, 0xC5, 0x00, 0x00, 0xD8, 0xEC, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA8, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xE2, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xBC, /* 0x34-0x37 */
+	0xE7, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xF0, 0x00, 0x00, /* 0x3C-0x3F */
+	0xEF, 0xE2, 0xF1, 0xF0, 0xCF, 0xB4, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xF1, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE0, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xDF, 0xA5, 0x00, 0x00, 0xF9, 0xD2, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xFD, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xE6, 0xA3, 0xFB, 0xF1, 0xCB, 0xB0, /* 0x5C-0x5F */
+	0xF2, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xCD, 0xE7, 0x00, 0x00, 0xE8, 0xDC, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xE7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xF7, 0xC0, 0x00, 0x00, 0xD0, 0xE3, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xA1, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xBD, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xD1, 0xA9, 0xDD, 0xCC, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE3, 0xFE, 0xD1, 0xAA, 0xE8, 0xAA, /* 0x80-0x83 */
+	0x00, 0x00, 0xEA, 0xB6, 0xF9, 0xFA, 0xE6, 0xCC, /* 0x84-0x87 */
+	0xF6, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xD4, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xD9, 0xCB, 0x00, 0x00, 0xD9, 0xD2, 0xD3, 0xCB, /* 0x90-0x93 */
+	0xD8, 0xF7, 0xDA, 0xA9, 0xF5, 0xF8, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xDE, 0xDE, 0xF2, 0xAF, 0xF8, 0xA9, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC8, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xC1, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC1, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xDD, 0xF3, 0xEA, 0xFA, 0x00, 0x00, 0xF6, 0xBD, /* 0xB8-0xBB */
+	0xE1, 0xBB, 0xCD, 0xBF, 0xF4, 0xD4, 0xE6, 0xCD, /* 0xBC-0xBF */
+	0x00, 0x00, 0xFC, 0xCF, 0xFB, 0xA2, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xE0, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xF4, 0xBB, 0xDA, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */
+	0xF9, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xF2, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xDB, 0xF6, 0x00, 0x00, 0xDE, 0xDF, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF2, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xF8, 0xDC, 0xF7, 0xEE, 0xEB, 0xE8, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xD2, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xF1, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xDA, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xEA, 0xDA, 0xC6, /* 0xEC-0xEF */
+	0xF7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB6, /* 0xF0-0xF3 */
+};
+
+static const unsigned char u2c_6D[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC7, /* 0x08-0x0B */
+	0xD6, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xDC, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x14-0x17 */
+	0x00, 0x00, 0xE2, 0xAA, 0x00, 0x00, 0xD5, 0xA6, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xD7, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xF2, 0xD0, 0x00, 0x00, 0xEA, 0xFB, /* 0x24-0x27 */
+	0x00, 0x00, 0xE0, 0xDD, 0xFB, 0xF3, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xBD, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xE2, 0xE7, 0xFD, 0xD7, 0x00, 0x00, /* 0x34-0x37 */
+	0xCE, 0xC8, 0xEA, 0xB7, 0x00, 0x00, 0xFC, 0xC0, /* 0x38-0x3B */
+	0x00, 0x00, 0xFD, 0xE7, 0xF7, 0xEF, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xD7, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xEF, 0xBA, 0xF1, 0xDD, 0x00, 0x00, /* 0x58-0x5B */
+	0xDE, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCB, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xDD, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xFB, 0xC7, 0xD5, 0xC8, 0x00, 0x00, /* 0x68-0x6B */
+	0xD7, 0xDF, 0x00, 0x00, 0xDD, 0xA9, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xE9, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAD, /* 0x74-0x77 */
+	0xF6, 0xD9, 0xFA, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xAA, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xE6, 0xEE, 0x00, 0x00, 0xCC, 0xDC, /* 0x84-0x87 */
+	0xE1, 0xBC, 0xE0, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xE9, 0xBF, 0xFC, 0xFD, 0xE6, 0xCE, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE1, 0xD7, 0x00, 0x00, 0xE6, 0xCF, /* 0x90-0x93 */
+	0x00, 0x00, 0xF4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF3, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xFB, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xF9, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xEF, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEE, /* 0xC0-0xC3 */
+	0xF6, 0xBE, 0xE0, 0xB2, 0xFC, 0xFE, 0xD1, 0xAB, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFA, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC8, /* 0xCC-0xCF */
+	0x00, 0x00, 0xE2, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xD4, 0xA3, 0xF0, 0xF8, 0xD7, 0xA8, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xE7, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xD3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xEF, 0xE4, 0x00, 0x00, 0xD7, 0xC5, 0xEB, 0xE2, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xE7, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE4, 0xA2, 0x00, 0x00, 0xE2, 0xE8, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xE6, 0xD0, 0x00, 0x00, 0xFB, 0xE8, /* 0xF4-0xF7 */
+	0xF4, 0xE8, 0xE5, 0xF4, 0xF4, 0xBC, 0xF4, 0xD5, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_6E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB6, /* 0x14-0x17 */
+	0x00, 0x00, 0xFC, 0xB9, 0xEE, 0xC2, 0xCA, 0xF5, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xE5, /* 0x1C-0x1F */
+	0xCB, 0xE2, 0xD4, 0xA4, 0x00, 0x00, 0xDE, 0xE0, /* 0x20-0x23 */
+	0xDA, 0xFD, 0xE4, 0xC6, 0xE8, 0xBE, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xDE, /* 0x28-0x2B */
+	0xF6, 0xB4, 0xEA, 0xD2, 0x00, 0x00, 0xF9, 0xFB, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xC2, 0x00, 0x00, /* 0x30-0x33 */
+	0xCA, 0xE4, 0x00, 0x00, 0xE7, 0xB7, 0x00, 0x00, /* 0x34-0x37 */
+	0xEA, 0xFD, 0x00, 0x00, 0xD9, 0xDD, 0x00, 0x00, /* 0x38-0x3B */
+	0xDA, 0xB4, 0xEE, 0xAA, 0xFB, 0xE9, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xCB, /* 0x40-0x43 */
+	0xDA, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xBE, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xD3, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xC9, 0x00, 0x00, /* 0x54-0x57 */
+	0xDF, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC0, /* 0x58-0x5B */
+	0xE3, 0xD7, 0x00, 0x00, 0xEF, 0xE6, 0xFC, 0xD0, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC0, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD3, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xDC, 0xF7, 0xB7, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xB8, 0xD1, 0xF9, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xC8, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xEA, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xDE, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xD7, 0xB6, 0xCF, 0xB5, 0x00, 0x00, 0xD9, 0xA8, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xEE, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xDD, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xA2, 0xE8, 0xAE, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBD, /* 0xAC-0xAF */
+	0x00, 0x00, 0xF2, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xC1, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xD2, 0xFC, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xB5, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xF3, 0xE7, 0xD8, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xFC, 0xD1, 0x00, 0x00, 0xED, 0xB2, /* 0xC8-0xCB */
+	0xF4, 0xAF, 0x00, 0x00, 0xFB, 0xA3, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xFC, 0xC1, 0x00, 0x00, 0xEE, 0xAB, /* 0xD0-0xD3 */
+	0xD4, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF2, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xEE, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xFB, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xE3, 0xD8, 0xBB, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6F[512] = {
+	0x00, 0x00, 0xE5, 0xDB, 0xF8, 0xF7, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xD4, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xA9, /* 0x0C-0x0F */
+	0x00, 0x00, 0xCB, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xE6, 0xD1, 0xF0, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xD8, 0xAE, 0x00, 0x00, 0xF9, 0xD3, 0xD5, 0xFE, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBC, /* 0x28-0x2B */
+	0xF2, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xE2, 0xAB, 0xF3, 0xE8, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xEF, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xEC, /* 0x3C-0x3F */
+	0x00, 0x00, 0xE7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xDA, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xCC, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFC, /* 0x54-0x57 */
+	0xDA, 0xEB, 0x00, 0x00, 0xE2, 0xD8, 0xED, 0xD6, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xD1, 0xE0, 0xB3, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xD2, 0x00, 0x00, /* 0x60-0x63 */
+	0xEB, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xD3, 0xC1, 0xF0, 0xCD, 0x00, 0x00, /* 0x6C-0x6F */
+	0xCF, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xD2, 0x00, 0x00, /* 0x78-0x7B */
+	0xD4, 0xD8, 0xDC, 0xC9, 0xD7, 0xF1, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xDF, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xF3, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xF4, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xF1, 0xBF, 0xF8, 0xB1, 0x00, 0x00, /* 0x8C-0x8F */
+	0xE9, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xFB, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD5, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xD4, /* 0xA0-0xA3 */
+	0xF7, 0xCA, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC8, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xE8, 0xF3, 0xBD, /* 0xAC-0xAF */
+	0x00, 0x00, 0xEE, 0xFE, 0x00, 0x00, 0xE7, 0xFE, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xD3, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */
+	0xCC, 0xAD, 0xF6, 0xFA, 0xD6, 0xB2, 0xD2, 0xD8, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xD8, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE3, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xB9, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xAD, /* 0xDC-0xDF */
+	0xFB, 0xCC, 0xEB, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xD4, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xFB, 0xCD, 0x00, 0x00, 0xD5, 0xBD, /* 0xE8-0xEB */
+	0xF1, 0xDF, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xFB, /* 0xEC-0xEF */
+	0x00, 0x00, 0xDE, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xEB, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_70[512] = {
+	0x00, 0x00, 0xE5, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xFB, 0xA4, 0xD4, 0xB9, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xDE, 0xE1, 0x00, 0x00, 0xE4, 0xA3, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB7, /* 0x0C-0x0F */
+	0x00, 0x00, 0xF8, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xDE, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xD6, 0xD2, 0x00, 0x00, 0xF9, 0xD5, 0xE7, 0xBA, /* 0x18-0x1B */
+	0xEB, 0xD5, 0xD5, 0xF7, 0xEF, 0xE7, 0xE1, 0xBE, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAE, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xE9, /* 0x24-0x27 */
+	0xD6, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBB, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xCB, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xCE, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xFB, 0xA5, 0xE1, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xF7, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xFB, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBD, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xFD, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xFC, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xCF, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xED, 0xC7, 0xEE, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xCC, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xA7, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xFA, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xA4, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xFD, 0xDC, 0xED, 0xB3, 0xCE, 0xC9, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xEF, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE1, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xDB, /* 0xA8-0xAB */
+	0xCB, 0xE3, 0xF7, 0xA9, 0x00, 0x00, 0xFB, 0xA6, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB9, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xC0, /* 0xB4-0xB7 */
+	0xED, 0xC8, 0xEF, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xD6, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCE, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA1, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xFB, 0xF4, 0xD5, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xF1, 0xF6, 0x00, 0x00, 0xE6, 0xD3, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xCC, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xF8, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xDC, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_71[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xFD, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xE5, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xF1, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xDB, 0xCC, 0xDD, 0xCD, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xC8, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xD9, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xA5, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xD4, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xC8, /* 0x44-0x47 */
+	0x00, 0x00, 0xD6, 0xA1, 0xFD, 0xBF, 0x00, 0x00, /* 0x48-0x4B */
+	0xFC, 0xD3, 0x00, 0x00, 0xEF, 0xA1, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE7, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xEE, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xE6, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xE9, 0xF2, 0x00, 0x00, 0xDF, 0xB0, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xD8, 0xE0, 0xFC, 0xBA, 0xFD, 0xAF, 0xF0, 0xCE, /* 0x64-0x67 */
+	0x00, 0x00, 0xDB, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE5, 0xC9, 0x00, 0x00, 0xED, 0xB4, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE0, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xE3, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xE9, 0xFB, 0xEA, 0xA8, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB7, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xA7, 0x00, 0x00, /* 0x90-0x93 */
+	0xE9, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xFD, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD9, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xEC, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE8, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xE6, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xFD, 0xF8, 0xFD, 0xF9, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xBF, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xE7, 0xA7, 0x00, 0x00, 0xE6, 0xD7, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xD4, 0xF3, 0xD4, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xFA, 0x00, 0x00, /* 0xCC-0xCF */
+	0xD7, 0xF2, 0x00, 0x00, 0xE1, 0xC0, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xDB, 0xE2, 0xE6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBD, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xF0, 0xCF, 0xF3, 0xBE, 0xE2, 0xAC, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xF5, 0xB7, 0xE0, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB8, /* 0xF8-0xFB */
+	0xE3, 0xE8, 0x00, 0x00, 0xD4, 0xA7, 0xE8, 0xFC, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_72[512] = {
+	0xFA, 0xD2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xEF, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xD6, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB4, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xD0, 0x00, 0x00, /* 0x28-0x2B */
+	0xF7, 0xF0, 0xEE, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xEA, 0xBA, 0x00, 0x00, 0xEA, 0xD3, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xED, 0xC9, 0xDD, 0xAB, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xAC, 0xFD, 0xA1, /* 0x38-0x3B */
+	0x00, 0x00, 0xDF, 0xD0, 0xEC, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */
+	0xDF, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xED, 0xF8, 0xB8, /* 0x44-0x47 */
+	0xF7, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xF8, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xE0, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xD4, 0xBA, 0xE4, 0xB3, 0x00, 0x00, 0xE9, 0xDA, /* 0x58-0x5B */
+	0x00, 0x00, 0xDE, 0xB6, 0x00, 0x00, 0xD9, 0xBF, /* 0x5C-0x5F */
+	0x00, 0x00, 0xD9, 0xC0, 0xD6, 0xEF, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCC, /* 0x64-0x67 */
+	0x00, 0x00, 0xDA, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xE5, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xF7, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xCC, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xDF, 0xF9, 0xD7, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xBB, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xFA, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xCC, 0xB3, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF3, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xDF, 0xD2, 0x00, 0x00, 0xCE, 0xCA, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xEE, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xE4, 0x00, 0x00, /* 0xCC-0xCF */
+	0xFB, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB7, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xEE, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xCE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xE2, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xD7, 0xE1, 0xFA, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xD5, 0xC9, 0xF8, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_73[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xD9, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xE9, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xED, /* 0x18-0x1B */
+	0xE3, 0xC4, 0xF0, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xE8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xE0, 0xFA, 0xEE, 0xC4, 0xD9, 0xDE, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xA2, 0xEB, 0xA3, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xC2, 0xEA, 0xBB, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xE8, 0xAB, 0xDE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xED, 0xEF, 0x00, 0x00, 0xE8, 0xA3, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xF1, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xD4, 0xBC, 0x00, 0x00, 0xFC, 0xEA, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE7, 0xBE, 0x00, 0x00, 0xFC, 0xF2, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xD6, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xE2, 0xAE, 0x00, 0x00, 0xD3, 0xB7, 0xFA, 0xCC, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xFA, 0xDC, 0x00, 0x00, 0xED, 0xB5, 0xE1, 0xE3, /* 0x84-0x87 */
+	0x00, 0x00, 0xE8, 0xAC, 0x00, 0x00, 0xE8, 0xDD, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xE9, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xF4, 0xBD, 0x00, 0x00, 0xCF, 0xB8, 0xE9, 0xDB, /* 0x94-0x97 */
+	0xD1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xC7, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xC9, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xE8, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xDE, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xBC, 0xD3, 0xE5, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xFA, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xDA, 0xD6, 0x00, 0x00, 0xCA, 0xB1, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xDA, 0xC8, 0xDF, 0xA6, 0x00, 0x00, /* 0xC8-0xCB */
+	0xF9, 0xB3, 0xF2, 0xD2, 0x00, 0x00, 0xCA, 0xC4, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xCB, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xCD, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xFD, 0xB0, 0xD5, 0xA8, 0x00, 0x00, /* 0xDC-0xDF */
+	0xF1, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE9, /* 0xE0-0xE3 */
+	0xDC, 0xCA, 0xEC, 0xB4, 0xFA, 0xC0, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xFB, 0xA8, 0xD0, 0xA8, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xDA, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xEE, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE0, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xEF, 0xEA, 0xFA, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_74[512] = {
+	0x00, 0x00, 0xE0, 0xC4, 0x00, 0x00, 0xCF, 0xB9, /* 0x00-0x03 */
+	0x00, 0x00, 0xD5, 0xCA, 0xD7, 0xE2, 0xE2, 0xAF, /* 0x04-0x07 */
+	0x00, 0x00, 0xD7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCD, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xDA, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xEF, 0xA2, 0xE2, 0xDA, 0xF6, 0xFC, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xFB, 0xD0, 0xD1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */
+	0xCD, 0xE4, 0x00, 0x00, 0xD1, 0xAE, 0xDC, 0xED, /* 0x28-0x2B */
+	0xE8, 0xCE, 0x00, 0x00, 0xF0, 0xF9, 0xCE, 0xB5, /* 0x2C-0x2F */
+	0xE6, 0xFC, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFB, /* 0x30-0x33 */
+	0xD0, 0xD6, 0xDD, 0xF5, 0xF7, 0xF1, 0x00, 0x00, /* 0x34-0x37 */
+	0xF6, 0xFD, 0x00, 0x00, 0xDB, 0xF7, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xEA, /* 0x3C-0x3F */
+	0xE9, 0xDC, 0xD9, 0xC1, 0x00, 0x00, 0xF5, 0xF2, /* 0x40-0x43 */
+	0xE0, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD4, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xF9, 0xC2, 0x00, 0x00, 0xEA, 0xBC, /* 0x54-0x57 */
+	0x00, 0x00, 0xD2, 0xC5, 0xFB, 0xD1, 0xE7, 0xC0, /* 0x58-0x5B */
+	0xEB, 0xA5, 0x00, 0x00, 0xDF, 0xFA, 0xE3, 0xA2, /* 0x5C-0x5F */
+	0xD7, 0xB9, 0x00, 0x00, 0xE9, 0xC3, 0x00, 0x00, /* 0x60-0x63 */
+	0xE8, 0xFD, 0xE8, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xF2, 0xD3, 0xFB, 0xA9, 0xD8, 0xA5, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCB, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xC8, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xAF, 0xD7, 0xE3, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC6, /* 0x84-0x87 */
+	0x00, 0x00, 0xD6, 0xA2, 0x00, 0x00, 0xED, 0xF0, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xD7, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xFC, 0xD4, 0x00, 0x00, 0xDA, 0xD7, 0xCC, 0xDF, /* 0x9C-0x9F */
+	0x00, 0x00, 0xF2, 0xD4, 0x00, 0x00, 0xD1, 0xB0, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xCC, 0xE0, 0x00, 0x00, 0xDB, 0xFD, /* 0xA4-0xA7 */
+	0xF3, 0xBF, 0x00, 0x00, 0xF0, 0xD1, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xFC, 0xBB, 0x00, 0x00, 0xE2, 0xB0, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE6, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE2, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xDF, 0xDE, 0x00, 0x00, 0xE0, 0xC7, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xEF, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xE1, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xEA, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE7, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xCE, 0xB6, 0x00, 0x00, 0xF3, 0xC0, 0x00, 0x00, /* 0xD8-0xDB */
+	0xCD, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xFB, 0xD2, 0x00, 0x00, 0xF8, 0xF8, 0xF7, 0xFB, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xBF, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xB7, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB6, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_75[512] = {
+	0x00, 0x00, 0xDC, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xCC, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xF1, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xE8, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xCA, 0xF6, 0x00, 0x00, 0xE4, 0xA4, 0xF4, 0xD6, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xE6, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xA7, /* 0x20-0x23 */
+	0x00, 0x00, 0xDF, 0xE7, 0xE1, 0xC1, 0x00, 0x00, /* 0x24-0x27 */
+	0xE9, 0xC4, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xCB, /* 0x28-0x2B */
+	0xE9, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xEF, 0xA3, 0xEB, 0xA6, 0xCB, 0xA3, 0xE3, 0xE9, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xFB, /* 0x34-0x37 */
+	0xEF, 0xA4, 0x00, 0x00, 0xEF, 0xEB, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB4, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xCD, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE6, /* 0x4C-0x4F */
+	0x00, 0x00, 0xEF, 0xA5, 0x00, 0x00, 0xD3, 0xCC, /* 0x50-0x53 */
+	0xDA, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xD7, 0xBA, 0x00, 0x00, 0xF2, 0xD5, /* 0x58-0x5B */
+	0xF5, 0xE5, 0xD9, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xB4, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xD5, 0xD4, 0xFD, 0xCF, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xE3, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE1, /* 0x6C-0x6F */
+	0xEC, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xFB, 0xFE, 0xD3, 0xD7, 0x00, 0x00, /* 0x74-0x77 */
+	0xD1, 0xB1, 0x00, 0x00, 0xCB, 0xB1, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB2, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xB2, 0xF1, 0xC2, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xE1, 0xF9, 0xB5, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x8C-0x8F */
+	0x00, 0x00, 0xEB, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xDF, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xCB, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB9, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xF8, 0xDE, 0xF9, 0xAA, 0xCA, 0xF7, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xED, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xD3, 0xB8, 0xF2, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xD4, 0xD9, 0xEE, 0xC5, 0xF2, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xB2, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xDC, 0xBB, 0x00, 0x00, 0xF1, 0xF8, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xEC, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xCA, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xF6, 0xC0, 0xFD, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xD4, 0xE3, 0xCC, 0xE2, 0x00, 0x00, 0xF7, 0xD4, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xE5, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xD3, 0xC3, 0x00, 0x00, 0xD8, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xF6, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xF6, 0x00, 0x00, /* 0xF8-0xFB */
+	0xCD, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_76[512] = {
+	0xE5, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE5, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE1, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB0, /* 0x1C-0x1F */
+	0xF4, 0xB0, 0xF3, 0xEA, 0xDA, 0xEE, 0x00, 0x00, /* 0x20-0x23 */
+	0xD7, 0xBB, 0x00, 0x00, 0xE2, 0xB1, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAA, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xFB, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xE4, 0xDF, 0x00, 0x00, 0xCA, 0xD6, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xA8, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xFE, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xF6, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xEF, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xD4, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE0, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE8, 0xB9, 0x00, 0x00, 0xEF, 0xA6, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xCD, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF4, /* 0x78-0x7B */
+	0xDB, 0xA1, 0xDB, 0xDC, 0xDB, 0xDD, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xEE, 0xDC, 0x00, 0x00, 0xCB, 0xCB, 0xFC, 0xD5, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xEB, 0x00, 0x00, /* 0x8C-0x8F */
+	0xCD, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD3, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xAB, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xD4, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xA9, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xDD, 0xDB, 0xCD, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xCE, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE7, 0xC3, 0x00, 0x00, 0xEC, 0xCC, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xEC, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xCC, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFC, /* 0xD8-0xDB */
+	0xD4, 0xA8, 0x00, 0x00, 0xED, 0xD3, 0xD8, 0xEF, /* 0xDC-0xDF */
+	0x00, 0x00, 0xF2, 0xD7, 0x00, 0x00, 0xCA, 0xF8, /* 0xE0-0xE3 */
+	0xDA, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD4, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xCD, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xEE, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xF2, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xDF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xDA, 0xF0, 0x00, 0x00, 0xE2, 0xEA, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_77[512] = {
+	0x00, 0x00, 0xE0, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xD8, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xF7, 0xAF, 0xDA, 0xB6, 0x00, 0x00, 0xCA, 0xD7, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xD8, 0x00, 0x00, /* 0x1C-0x1F */
+	0xD8, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xFA, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEF, /* 0x34-0x37 */
+	0xD9, 0xC2, 0x00, 0x00, 0xF0, 0xD2, 0x00, 0x00, /* 0x38-0x3B */
+	0xE4, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xF3, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xFA, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xEC, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xE2, 0xB2, 0x00, 0x00, 0xD4, 0xBD, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xCE, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE2, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xD4, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xC2, 0xE7, 0xDA, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xD9, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xD9, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xBE, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xDC, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE2, 0xEB, 0xD6, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xCA, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xDA, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD7, /* 0xB8-0xBB */
+	0xCC, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xBA, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB8, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xC3, /* 0xD8-0xDB */
+	0xD0, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xC5, 0xEB, 0xF8, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xF2, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xCF, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xD3, 0xAD, 0xE8, 0xE1, 0xCE, 0xEC, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB4, /* 0xF0-0xF3 */
+};
+
+static const unsigned char u2c_78[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xE3, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xF7, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xF2, 0xB2, 0xF3, 0xF6, 0xF6, 0xDB, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xD7, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xDF, 0x00, 0x00, /* 0x30-0x33 */
+	0xF7, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xD0, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xDA, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xF5, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBC, /* 0x68-0x6B */
+	0xCC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xDB, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xDD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xD1, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xED, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xD6, 0xDE, 0xE4, 0xF4, 0xE1, 0xEF, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xDD, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCF, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE5, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA1, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xAC, 0xFC, 0xAD, /* 0xB8-0xBB */
+	0xD8, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xED, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xDB, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xF0, 0xF3, 0xAF, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xA5, 0x00, 0x00, /* 0xCC-0xCF */
+	0xDA, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xD8, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xCC, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB4, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xCA, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xF2, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_79[512] = {
+	0x00, 0x00, 0xF5, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xA8, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xA6, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xEC, 0xD5, 0xF8, /* 0x28-0x2B */
+	0xDA, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xC6, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */
+	0xDE, 0xE5, 0xD1, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB6, /* 0x44-0x47 */
+	0xD1, 0xB7, 0xF2, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE9, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xD3, 0xF2, 0xB4, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xD4, 0xCB, 0xE4, /* 0x58-0x5B */
+	0xFB, 0xD4, 0xF5, 0xE6, 0xE3, 0xEA, 0x00, 0x00, /* 0x5C-0x5F */
+	0xDE, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xDF, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xF8, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xF0, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xB8, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xDF, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xD0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xFC, 0xA1, 0xEF, 0xEE, 0xDC, 0xD8, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE9, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xDD, 0xFD, 0xFB, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xC9, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xC9, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xD4, 0xAA, 0x00, 0x00, 0xE5, 0xCC, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE9, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xD0, 0xD8, 0xFC, 0xA2, 0xD4, 0xBE, /* 0xBC-0xBF */
+	0xE2, 0xB3, 0xDE, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xDC, 0xBC, 0xD2, 0xB6, 0xF5, 0xD5, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xCE, 0xA1, 0xF5, 0xA9, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xDD, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xDD, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD5, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xF6, 0xDF, 0x00, 0x00, 0xF2, 0xDA, 0xE4, 0xEB, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xF2, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xB9, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_7A[512] = {
+	0xFD, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xE1, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xCA, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xEF, /* 0x08-0x0B */
+	0x00, 0x00, 0xF5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xEC, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xAD, /* 0x14-0x17 */
+	0x00, 0x00, 0xF2, 0xC2, 0xF6, 0xC3, 0x00, 0x00, /* 0x18-0x1B */
+	0xD7, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA2, /* 0x1C-0x1F */
+	0xF0, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xFA, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xF6, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, 0xF2, 0xC3, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAB, /* 0x38-0x3B */
+	0xCA, 0xB3, 0xCD, 0xA6, 0x00, 0x00, 0xCD, 0xC3, /* 0x3C-0x3F */
+	0xCD, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xCF, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xF6, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xEE, 0xDD, 0xE7, 0xC4, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB4, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xDF, 0xE2, 0xE7, 0xDB, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE8, 0xB1, 0x00, 0x00, 0xFC, 0xAE, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE5, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xFA, 0xEB, 0x00, 0x00, 0xCF, 0xBC, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xCF, 0xE2, 0xCD, 0xF6, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xEF, 0xF0, 0x00, 0x00, 0xF4, 0xBE, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xD4, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xF3, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xE9, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xF2, 0xF3, 0xEB, /* 0x90-0x93 */
+	0x00, 0x00, 0xF0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xCF, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDF, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xE8, 0xC0, 0xE8, 0xC1, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xCF, 0xE3, 0xE9, 0xA2, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xAA, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xF3, 0xC1, 0xD0, 0xAB, 0x00, 0x00, 0xD4, 0xE4, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xBC, 0xD8, 0xA1, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xDF, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xF3, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xDC, 0xBD, 0x00, 0x00, 0xCC, 0xE5, /* 0xDC-0xDF */
+	0xED, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE2, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xD4, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xB5, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xCA, 0xE6, 0x00, 0x00, 0xD3, 0xAE, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xE6, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xF1, 0xD3, 0xF5, 0xE7, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xDA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7B[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xEE, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE1, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xDF, 0xE9, 0x00, 0x00, 0xEE, 0xDE, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xC2, 0x00, 0x00, /* 0x1C-0x1F */
+	0xD8, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xAC, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xF0, 0xAF, 0xD6, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xE1, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xB6, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xD4, 0xF5, 0x00, 0x00, 0xD0, 0xC9, /* 0x48-0x4B */
+	0xEF, 0xA7, 0xE2, 0xEC, 0x00, 0x00, 0xDB, 0xEA, /* 0x4C-0x4F */
+	0xCE, 0xCC, 0xF5, 0xE8, 0xF7, 0xD5, 0x00, 0x00, /* 0x50-0x53 */
+	0xD3, 0xCD, 0x00, 0x00, 0xF3, 0xFE, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xD0, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE0, 0xFE, 0x00, 0x00, 0xDF, 0xFB, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xE6, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE8, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCD, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xA8, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB4, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xDA, 0xD8, 0xD1, 0xB9, 0x00, 0x00, 0xDF, 0xA9, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xB0, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xCC, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xCE, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xEF, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xDF, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xEE, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xEF, 0xBD, 0xFC, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xDB, 0xF4, 0x00, 0x00, 0xEF, 0xAA, 0xF8, 0xB9, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xF5, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xD9, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xE1, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xD4, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xDE, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+};
+
+static const unsigned char u2c_7C[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEA, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xC2, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xAF, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xCA, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xD7, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xD8, 0xE1, 0xC7, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xF4, 0xD8, 0xD6, 0xB3, 0xDD, 0xAD, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xBE, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xF1, 0xC3, 0xEE, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xD6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xF4, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xD7, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB7, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xFB, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xDD, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xA3, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xDA, 0xD9, 0x00, 0x00, 0xF0, 0xD8, /* 0x94-0x97 */
+	0xEF, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD8, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xF1, 0xD4, 0x00, 0x00, 0xED, 0xF2, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xDB, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xD5, 0xDC, 0xF3, 0xC4, 0xCB, 0xD7, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE2, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xF1, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xD8, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xD0, 0xF0, 0xD9, /* 0xDC-0xDF */
+	0xCB, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDD, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA7, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xAC, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7D[512] = {
+	0xD1, 0xBA, 0x00, 0x00, 0xF1, 0xC4, 0x00, 0x00, /* 0x00-0x03 */
+	0xE5, 0xB3, 0xFB, 0xF5, 0xE9, 0xE1, 0xFD, 0xE0, /* 0x04-0x07 */
+	0xFC, 0xBC, 0x00, 0x00, 0xDA, 0xA2, 0xDA, 0xA3, /* 0x08-0x0B */
+	0x00, 0x00, 0xD2, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xD2, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE9, /* 0x14-0x17 */
+	0xCE, 0xDC, 0xF2, 0xB5, 0xD0, 0xE4, 0xDD, 0xD1, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE1, 0xC8, 0xDB, 0xB7, 0xDF, 0xE3, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB9, /* 0x28-0x2B */
+	0xF1, 0xC5, 0x00, 0x00, 0xF3, 0xCF, 0xD7, 0xAB, /* 0x2C-0x2F */
+	0xE1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xEB, /* 0x30-0x33 */
+	0x00, 0x00, 0xEE, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xE1, 0xC9, 0xCA, 0xFA, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xFB, 0xFA, 0xE1, /* 0x40-0x43 */
+	0xF0, 0xDA, 0xCC, 0xE7, 0xDA, 0xF4, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xCC, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xED, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xD5, 0xA9, 0xFA, 0xE2, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xE5, 0x00, 0x00, /* 0x64-0x67 */
+	0xEB, 0xD6, 0x00, 0x00, 0xEC, 0xDF, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xFC, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xF7, 0xD6, 0xDE, 0xEA, 0xCB, 0xB4, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xBE, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xCC, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xBD, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xF2, 0xE2, 0xB7, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE8, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xF0, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xD6, 0xE0, 0x00, 0x00, 0xF1, 0xC6, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE2, 0xB8, 0xEB, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xCB, 0xB5, 0xD8, 0xD1, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xF4, 0xCE, 0xF3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xD7, 0xC6, 0x00, 0x00, 0xD1, 0xBB, 0xF7, 0xAA, /* 0xB8-0xBB */
+	0x00, 0x00, 0xED, 0xCA, 0xD7, 0xD3, 0xD8, 0xFA, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC5, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xCC, 0xDD, 0xFC, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xFD, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xF9, 0xE5, 0x00, 0x00, 0xE0, 0xCA, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xF2, 0xFD, 0xD3, 0xB0, 0x00, 0x00, /* 0xDC-0xDF */
+	0xF4, 0xF3, 0xDA, 0xC9, 0x00, 0x00, 0xE6, 0xDE, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xF8, 0xBA, 0xE8, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xD8, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD5, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xD6, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC6, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_7E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xF2, 0xDB, 0xE4, 0xFC, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xE8, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xDA, /* 0x18-0x1B */
+	0x00, 0x00, 0xF2, 0xDC, 0xFB, 0xD6, 0xE9, 0xB2, /* 0x1C-0x1F */
+	0x00, 0x00, 0xEE, 0xAD, 0x00, 0x00, 0xFA, 0xE3, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEE, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xEA, 0xE6, 0xE0, /* 0x2C-0x2F */
+	0x00, 0x00, 0xF0, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAC, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xF5, 0xC5, 0xEE, 0xE0, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xDB, 0xE5, 0x00, 0x00, 0xDD, 0xDE, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xF0, 0xE9, 0xA3, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xF9, 0x00, 0x00, /* 0x50-0x53 */
+	0xF2, 0xC4, 0xE0, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xA4, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xE2, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE3, 0xB1, 0xFC, 0xEB, 0xCD, 0xA8, /* 0x68-0x6B */
+	0x00, 0x00, 0xCC, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xF0, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xE6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xCD, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xC3, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xE1, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAB, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xC5, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, 0x00, 0x00, /* 0x94-0x97 */
+	0xF3, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC0, /* 0x98-0x9B */
+	0xD5, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+};
+
+static const unsigned char u2c_7F[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xAE, 0x00, 0x00, /* 0x34-0x37 */
+	0xF9, 0xFC, 0x00, 0x00, 0xCC, 0xC0, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xE5, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xCE, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xD8, 0xD2, 0xF9, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xAA, 0xCE, 0xD1, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xC7, 0x00, 0x00, /* 0x6C-0x6F */
+	0xDB, 0xEB, 0x00, 0x00, 0xDF, 0xFE, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xD8, 0xE1, 0x00, 0x00, 0xF7, 0xF3, /* 0x74-0x77 */
+	0x00, 0x00, 0xD7, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xD4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xD1, 0xBC, 0x00, 0x00, 0xE5, 0xCF, 0x00, 0x00, /* 0x88-0x8B */
+	0xCB, 0xB6, 0x00, 0x00, 0xDA, 0xB8, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xCD, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xBE, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xBA, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xCF, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xE0, 0xCC, 0xEB, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xFD, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xD7, 0xE8, 0xCB, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE9, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xE8, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE3, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xCD, 0x00, 0x00, /* 0xC8-0xCB */
+	0xEC, 0xCE, 0x00, 0x00, 0xD6, 0xBF, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xA7, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xDF, 0xD6, 0xFD, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE1, /* 0xDC-0xDF */
+	0xF6, 0xA8, 0xDD, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xF8, 0xBB, 0x00, 0x00, 0xE8, 0xD1, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xF9, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xCE, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xEC, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_80[512] = {
+	0xE9, 0xA5, 0xD6, 0xD5, 0x00, 0x00, 0xCD, 0xC5, /* 0x00-0x03 */
+	0x00, 0x00, 0xED, 0xBA, 0xD1, 0xBD, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xCF, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xEC, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xD2, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xCC, 0xE9, 0x00, 0x00, 0xD9, 0xC4, /* 0x14-0x17 */
+	0xE9, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xD1, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xBC, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xAD, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xF7, 0xB0, 0x00, 0x00, 0xCC, 0xEA, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC4, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xC0, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xFD, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xA1, 0x00, 0x00, /* 0x54-0x57 */
+	0xDE, 0xBD, 0x00, 0x00, 0xF6, 0xA9, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xA4, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xA4, /* 0x6C-0x6F */
+	0xF5, 0xC6, 0x00, 0x00, 0xE1, 0xA2, 0xE9, 0xC6, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC5, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xF4, 0xE9, 0xD6, 0xEC, 0xEB, 0xD3, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xEC, 0xBD, 0xE2, 0xDC, 0xDE, 0xEB, 0xF0, 0xDC, /* 0x84-0x87 */
+	0x00, 0x00, 0xEB, 0xBF, 0x00, 0x00, 0xD7, 0xCE, /* 0x88-0x8B */
+	0xD1, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xAB, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFD, /* 0x98-0x9B */
+	0x00, 0x00, 0xCA, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xCD, 0xC6, 0xF2, 0xB6, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xDD, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xCC, 0xB7, 0xDB, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE9, /* 0xAC-0xAF */
+	0x00, 0x00, 0xCE, 0xDD, 0xEB, 0xC0, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xFD, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xCB, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD6, /* 0xC0-0xC3 */
+	0xF1, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xDB, 0xCE, 0x00, 0x00, 0xF7, 0xC3, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xCF, 0xCB, 0xA4, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xE0, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xFB, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xEB, 0xCA, 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xCE, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xD4, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xFD, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xD2, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_81[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xB7, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xFA, 0xF6, 0xF6, 0xAA, 0xFA, 0xF7, /* 0x04-0x07 */
+	0xD8, 0xE6, 0x00, 0x00, 0xF4, 0xB1, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xE8, 0xD2, 0x00, 0x00, 0xCA, 0xC5, 0xCC, 0xEB, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xEE, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xE2, 0xBB, 0x00, 0x00, 0xF7, 0xAD, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE1, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xF3, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xA1, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFD, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xEC, 0x00, 0x00, /* 0x4C-0x4F */
+	0xDD, 0xAF, 0xDD, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xCB, 0xB7, 0xE8, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xE1, 0xA3, 0xD2, 0xE0, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFE, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE9, 0xA6, 0xCB, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xED, 0xF3, 0xDC, 0xD9, 0xE0, 0xCD, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xDA, /* 0x7C-0x7F */
+	
+	0xDB, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xCC, 0xAE, 0x00, 0x00, 0xDA, 0xDB, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xC7, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xB1, 0x00, 0x00, /* 0x98-0x9B */
+	0xD8, 0xAF, 0xE3, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xCE, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF3, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xF8, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xCE, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xF5, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xEC, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xD3, 0xC5, 0xFC, 0xEC, 0xD2, 0xDB, /* 0xBC-0xBF */
+	0xD4, 0xEB, 0x00, 0x00, 0xDE, 0xA2, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xE6, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xF0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xD5, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF4, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xED, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE8, 0xC2, 0x00, 0x00, 0xED, 0xF5, /* 0xE4-0xE7 */
+	0xD7, 0xFC, 0x00, 0x00, 0xED, 0xBB, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xF6, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xB8, /* 0xF0-0xF3 */
+	0xF6, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xE6, 0xF2, 0xDD, /* 0xF8-0xFB */
+	0xCF, 0xBF, 0x00, 0x00, 0xEB, 0xAC, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_82[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xCF, 0xC0, 0x00, 0x00, 0xE6, 0xA8, /* 0x04-0x07 */
+	0xFD, 0xE9, 0x00, 0x00, 0xCF, 0xC1, 0x00, 0x00, /* 0x08-0x0B */
+	0xE0, 0xDF, 0xDE, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xA2, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xBF, /* 0x18-0x1B */
+	0xE2, 0xEF, 0x00, 0x00, 0xD9, 0xF1, 0xF1, 0xC7, /* 0x1C-0x1F */
+	0x00, 0x00, 0xCB, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xFE, 0xDB, 0xBA, /* 0x28-0x2B */
+	0xDA, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xF6, 0xEC, 0xDA, 0xDC, 0xFA, 0xE4, /* 0x34-0x37 */
+	0x00, 0x00, 0xE0, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xDD, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xE6, 0xA9, 0x00, 0x00, 0xEF, 0xF3, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xF3, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xEB, 0xFA, 0x00, 0x00, 0xF9, 0xE6, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xDD, 0xD5, 0xDE, /* 0x6C-0x6F */
+	0x00, 0x00, 0xCA, 0xDE, 0xDF, 0xE4, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xFD, 0x00, 0x00, /* 0x74-0x77 */
+	0xF5, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE3, /* 0x88-0x8B */
+	0x00, 0x00, 0xED, 0xCB, 0xCF, 0xE4, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xD3, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xDD, 0xB3, 0xD4, 0xEC, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xF2, 0xB9, 0x00, 0x00, 0xDF, 0xB7, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xCB, 0xCE, 0xFB, 0xD8, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xD0, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xDD, 0xD2, 0xF7, 0xF4, 0xE7, 0xDC, 0xE4, 0xA5, /* 0xAC-0xAF */
+	0x00, 0x00, 0xFC, 0xA3, 0x00, 0x00, 0xDB, 0xBB, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBA, /* 0xB4-0xB7 */
+	0xE9, 0xFD, 0xD0, 0xCA, 0x00, 0x00, 0xF5, 0xD6, /* 0xB8-0xBB */
+	0xD9, 0xC5, 0xE4, 0xB4, 0x00, 0x00, 0xED, 0xA7, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xEA, 0xBD, 0xE6, 0xFE, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xF7, 0xC4, 0xF5, 0xAD, 0x00, 0x00, 0xD9, 0xE0, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB4, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xE2, 0xCF, 0xC2, /* 0xDC-0xDF */
+	0x00, 0x00, 0xEC, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE5, 0xB4, 0xCD, 0xC8, 0xEE, 0xC8, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE7, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xCD, 0xC9, 0xF9, 0xB7, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_83[512] = {
+	0x00, 0x00, 0xF1, 0xE8, 0xD9, 0xF2, 0xDB, 0xF5, /* 0x00-0x03 */
+	0xCA, 0xB5, 0xD9, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xD8, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAB, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xED, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD4, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDA, /* 0x2C-0x2F */
+	0x00, 0x00, 0xE2, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xFC, 0xED, 0xEC, 0xE0, 0xD2, 0xFE, 0x00, 0x00, /* 0x34-0x37 */
+	0xE9, 0xC7, 0xE6, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xE2, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xBB, /* 0x44-0x47 */
+	0x00, 0x00, 0xF5, 0xAE, 0xFB, 0xAA, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xFB, /* 0x4C-0x4F */
+	0x00, 0x00, 0xEC, 0xBF, 0xFC, 0xD8, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE5, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC3, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE2, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xD7, 0xE9, 0xED, 0xF6, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xED, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xEC, 0x00, 0x00, /* 0x94-0x97 */
+	0xE3, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xD4, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xF8, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xDD, 0xB4, 0xE4, 0xB5, 0xD8, 0xB0, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xD8, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xF4, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xCE, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xD6, 0xE1, 0xCF, 0xD2, 0x00, 0x00, /* 0xC8-0xCB */
+	0xD0, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA2, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xEE, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xF3, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xDC, 0xCC, 0x00, 0x00, 0xD0, 0xCB, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xA4, /* 0xEC-0xEF */
+	0xCD, 0xCA, 0xD7, 0xD4, 0xDE, 0xA3, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xEE, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE2, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_84[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xFE, /* 0x00-0x03 */
+	0xD4, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xD1, 0x00, 0x00, /* 0x08-0x0B */
+	0xD8, 0xF0, 0xF8, 0xC3, 0xEA, 0xD7, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xF5, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xD8, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xFD, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xEB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xD5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xE7, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCA, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE7, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xF8, 0xE3, 0x00, 0x00, 0xD4, 0xDD, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xD8, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD9, /* 0x68-0x6B */
+	0xED, 0xF7, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB5, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xD0, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xF1, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xE2, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xE3, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xD9, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xDF, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xDB, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xE4, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xF1, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB6, /* 0xB8-0xBB */
+	0xF3, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDA, /* 0xBC-0xBF */
+	0xE1, 0xE0, 0x00, 0x00, 0xD9, 0xAC, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xF5, 0xEB, 0x00, 0x00, 0xE0, 0xB6, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE9, 0xC8, 0x00, 0x00, 0xCB, 0xCF, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE3, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xDE, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xBE, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xDC, 0xEF, 0x00, 0x00, 0xD6, 0xA5, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xE2, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xD6, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_85[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xD9, 0xA1, 0x00, 0x00, 0xD8, 0xC0, /* 0x10-0x13 */
+	0xDC, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBD, /* 0x14-0x17 */
+	0xDF, 0xB8, 0x00, 0x00, 0xEA, 0xA5, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xAD, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xF3, 0xF9, 0x00, 0x00, 0xED, 0xF8, /* 0x20-0x23 */
+	0x00, 0x00, 0xF5, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xE1, 0xCA, 0xEB, 0xE3, 0x00, 0x00, 0xF2, 0xDE, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xF8, 0xCC, 0x00, 0x00, 0xEA, 0xD9, /* 0x3C-0x3F */
+	0x00, 0x00, 0xD3, 0xC6, 0x00, 0x00, 0xDB, 0xE6, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xF5, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xF0, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xFE, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xFB, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF2, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xCF, 0xF2, 0xF7, 0xB9, 0xD9, 0xF3, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xE1, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xDA, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB9, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xFB, /* 0x8C-0x8F */
+	0x00, 0x00, 0xCB, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xED, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE0, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xFD, 0xBC, 0xDF, 0xB1, 0xE3, 0xEF, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA3, /* 0xAC-0xAF */
+	0xFD, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xB1, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xCD, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xED, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xD5, 0xC0, 0xE3, 0xF0, 0xED, 0xFA, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE9, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xD5, 0xED, 0xE7, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xD4, 0xF6, 0xE5, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xDB, 0xE7, 0xE2, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCB, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xF4, 0xF0, 0xDD, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xAB, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_86[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xDE, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xD6, 0xE1, 0xCC, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xB3, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xEE, 0xDC, 0xA2, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xD5, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA1, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xDB, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF9, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xCB, 0xF3, 0xF4, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC8, /* 0x58-0x5B */
+	0xD6, 0xD7, 0x00, 0x00, 0xE9, 0xE5, 0xFB, 0xDC, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD0, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xFB, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xA5, 0x00, 0x00, /* 0x88-0x8B */
+	0xDB, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE2, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xF7, /* 0xA0-0xA3 */
+	0xF0, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xF6, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xEF, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB1, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xFC, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE8, 0xC3, 0x00, 0x00, 0xF1, 0xC8, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF1, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xF9, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xF2, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xB6, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_87[512] = {
+	0xF5, 0xB9, 0x00, 0x00, 0xDC, 0xF0, 0xE3, 0xF1, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xE8, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xF2, 0xBB, 0x00, 0x00, 0xDE, 0xA4, 0x00, 0x00, /* 0x18-0x1B */
+	0xDA, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xE9, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE3, 0xDA, 0x00, 0x00, 0xFC, 0xD9, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDA, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xC4, 0x00, 0x00, /* 0x64-0x67 */
+	0xE3, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xFB, 0xDD, 0x00, 0x00, 0xEF, 0xCA, 0x00, 0x00, /* 0x74-0x77 */
+	0xE8, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xCC, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xEB, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAD, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xAB, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD9, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xA2, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xF6, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xDA, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xE0, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA8, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xF9, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xFA, 0xAF, 0x00, 0x00, 0xEB, 0xFC, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xEA, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_88[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xE3, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC5, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xE3, 0xD5, 0xEE, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xCD, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xD9, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC1, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xFA, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xEB, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xFA, 0xBC, 0xE6, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xE5, 0xE2, 0xFA, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB6, /* 0x54-0x57 */
+	0x00, 0x00, 0xE4, 0xB7, 0x00, 0x00, 0xEA, 0xDB, /* 0x58-0x5B */
+	0x00, 0x00, 0xF5, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xFB, 0xAC, 0xCF, 0xC3, 0xEB, 0xFD, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xF8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB9, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xE1, 0xF1, 0x00, 0x00, 0xD2, 0xA4, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xFB, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xDA, 0xD0, 0xDB, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xEA, 0xBE, 0xD9, 0xB1, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xCA, 0xB7, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE7, /* 0x88-0x8B */
+	0x00, 0x00, 0xF8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xB2, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xC0, 0xF2, 0xDF, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xE5, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xAC, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xCD, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xEE, 0xAE, 0xD6, 0xAE, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xEA, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE7, 0xE0, 0xEB, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xCF, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xDC, 0xCD, 0xED, 0xFB, 0x00, 0x00, 0xDE, 0xF0, /* 0xDC-0xDF */
+	0x00, 0x00, 0xD7, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xDE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xD7, /* 0xF0-0xF3 */
+	0xDB, 0xD0, 0xDB, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xD5, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xF0, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_89[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDC, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xCA, 0xE8, 0x00, 0x00, 0xF8, 0xE6, 0xDC, 0xCE, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xEA, 0xDC, 0xDB, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xE9, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xDB, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xA8, 0x00, 0x00, /* 0x34-0x37 */
+	0xD7, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE1, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xCB, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xE5, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xDC, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xD5, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xCA, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xA9, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA4, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE9, 0xA9, 0x00, 0x00, 0xD3, 0xC7, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xDD, 0xF8, 0xAE, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xB8, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAE, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xF2, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xCA, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xCC, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xD4, 0xAD, 0xF6, 0xD1, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xCC, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xC6, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xD5, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xCE, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xC7, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB0, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xDF, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xF5, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_8A[512] = {
+	0xE5, 0xEB, 0x00, 0x00, 0xEF, 0xF4, 0xDD, 0xB5, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xCD, 0xAA, 0x00, 0x00, 0xE3, 0xF2, 0x00, 0x00, /* 0x08-0x0B */
+	0xFB, 0xF7, 0x00, 0x00, 0xF7, 0xD0, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xBA, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xE1, 0xF6, 0xFE, /* 0x14-0x17 */
+	0xD1, 0xC0, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC5, /* 0x18-0x1B */
+	0x00, 0x00, 0xE4, 0xB8, 0x00, 0x00, 0xE1, 0xE8, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xC1, /* 0x20-0x23 */
+	0x00, 0x00, 0xD2, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xBE, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xFA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xE1, 0xCD, 0x00, 0x00, 0xCA, 0xB8, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xE0, 0xF1, 0xC9, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xDE, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xF0, 0xDF, 0xF8, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCC, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xF2, 0x00, 0x00, /* 0x5C-0x5F */
+	0xE7, 0xC9, 0x00, 0x00, 0xE2, 0xF3, 0xE7, 0xE1, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xCB, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xE3, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xCF, 0xF8, 0xEF, 0xAC, 0x00, 0x00, /* 0x6C-0x6F */
+	0xFD, 0xFE, 0xFC, 0xA5, 0xFA, 0xB1, 0xDF, 0xD9, /* 0x70-0x73 */
+	0x00, 0x00, 0xE0, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xF4, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xF1, 0xCA, 0x00, 0x00, 0xCE, 0xA3, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xF2, 0xBC, 0xEC, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA5, /* 0x90-0x93 */
+	0x00, 0x00, 0xF7, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xEB, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xDE, 0x00, 0x00, /* 0x9C-0x9F */
+	0xE1, 0xA4, 0xCD, 0xAB, 0x00, 0x00, 0xD9, 0xF4, /* 0xA0-0xA3 */
+	0xE8, 0xA6, 0xCD, 0xCE, 0xE1, 0xE9, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xFC, 0xEF, 0x00, 0x00, 0xE0, 0xE3, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xE2, 0xC1, 0x00, 0x00, 0xCE, 0xA4, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xDE, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xEB, 0xFE, 0x00, 0x00, 0xEB, 0xDD, 0xF0, 0xE0, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xDB, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xE2, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC8, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xEB, /* 0xC8-0xCB */
+	0x00, 0x00, 0xEE, 0xB5, 0x00, 0x00, 0xF5, 0xD8, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xDF, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xE5, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB0, /* 0xD8-0xDB */
+	0xF4, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE3, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xF4, 0xFA, 0xB2, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xF5, 0xCA, 0xDF, /* 0xE8-0xEB */
+	0x00, 0x00, 0xEB, 0xB1, 0xED, 0xBF, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xFD, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xA6, 0xF9, 0xA4, /* 0xF4-0xF7 */
+	0xF0, 0xB3, 0x00, 0x00, 0xE5, 0xEC, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xE7, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8B[512] = {
+	0xD9, 0xC7, 0xE4, 0xD7, 0xEA, 0xDD, 0x00, 0x00, /* 0x00-0x03 */
+	0xD4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xBA, 0x00, 0x00, /* 0x0C-0x0F */
+	0xDA, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xF9, 0xCC, 0x00, 0x00, 0xE1, 0xDA, 0xDB, 0xBF, /* 0x14-0x17 */
+	0x00, 0x00, 0xCC, 0xC5, 0xEC, 0xD0, 0xCB, 0xBB, /* 0x18-0x1B */
+	0x00, 0x00, 0xDE, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE9, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xD9, 0xC8, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE3, /* 0x28-0x2B */
+	0xD7, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xC4, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xD0, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xFC, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xF1, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xD2, 0xD1, 0xC1, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xE3, 0xDB, 0x00, 0x00, 0xD3, 0xC9, 0x00, 0x00, /* 0x58-0x5B */
+	0xDC, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xED, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xDE, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xBB, /* 0x6C-0x6F */
+	0xEC, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xCC, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDE, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE7, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xD4, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xA8, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xE2, 0xC2, 0x00, 0x00, 0xF3, 0xD8, 0xE5, 0xD3, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xD9, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xC6, 0x00, 0x00, /* 0x98-0x9B */
+};
+
+static const unsigned char u2c_8C[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xDB, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xAC, /* 0x3C-0x3F */
+	0x00, 0x00, 0xFC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xE7, 0x00, 0x00, /* 0x44-0x47 */
+	0xD1, 0xC2, 0x00, 0x00, 0xF9, 0xA5, 0x00, 0x00, /* 0x48-0x4B */
+	0xE8, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xE3, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0xCA, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xDF, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xDF, 0xE7, 0xE3, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xF8, 0xFB, 0xE3, 0xCF, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xB0, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xE7, 0x00, 0x00, /* 0x88-0x8B */
+	0xD9, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xF8, 0xAF, 0xEF, 0xF6, 0x00, 0x00, /* 0x9C-0x9F */
+	0xDD, 0xB6, 0xEE, 0xAF, 0xCD, 0xF8, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB8, /* 0xA4-0xA7 */
+	0xFC, 0xA7, 0xF7, 0xFC, 0xF7, 0xB1, 0xCE, 0xBB, /* 0xA8-0xAB */
+	0xF4, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCD, /* 0xAC-0xAF */
+	0xE1, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC3, /* 0xB0-0xB3 */
+	0xCF, 0xFE, 0x00, 0x00, 0xF8, 0xBF, 0xD8, 0xE2, /* 0xB4-0xB7 */
+	0xD3, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA8, /* 0xB8-0xBB */
+	0xF4, 0xE4, 0xEC, 0xC2, 0x00, 0x00, 0xD9, 0xF5, /* 0xBC-0xBF */
+	0xF9, 0xC5, 0xDD, 0xD3, 0xD6, 0xF1, 0xEC, 0xFC, /* 0xC0-0xC3 */
+	0xFC, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xED, 0xC0, /* 0xC4-0xC7 */
+	0xCA, 0xB9, 0x00, 0x00, 0xEE, 0xE4, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xF2, 0xE1, 0x00, 0x00, 0xDE, 0xB9, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xF2, 0x00, 0x00, /* 0xD8-0xDB */
+	0xDE, 0xF4, 0x00, 0x00, 0xDF, 0xDB, 0x00, 0x00, /* 0xDC-0xDF */
+	0xDB, 0xD3, 0x00, 0x00, 0xFA, 0xE7, 0xD8, 0xE3, /* 0xE0-0xE3 */
+	0xF4, 0xC1, 0x00, 0x00, 0xDD, 0xB7, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xF5, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xD4, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xD6, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xB8, /* 0xF8-0xFB */
+	0xCF, 0xC5, 0xDF, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8D[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xF2, 0xBE, 0xF6, 0xA1, 0x00, 0x00, 0xEB, 0xCB, /* 0x04-0x07 */
+	0xF1, 0xFC, 0x00, 0x00, 0xF3, 0xC7, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE0, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xFC, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xDB, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xEE, 0xE5, 0x00, 0x00, 0xDE, 0xF5, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xD3, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xF1, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAF, /* 0x70-0x73 */
+	0xDD, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xC3, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xF5, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xC6, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xF0, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAC, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xF5, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEB, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0xBA, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xBF, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xC5, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA2, /* 0xC8-0xCB */
+	0xF2, 0xF6, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBA, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xF5, /* 0xD8-0xDB */
+	0x00, 0x00, 0xCB, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xEE, 0xE6, 0x00, 0x00, 0xE0, 0xD3, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xCE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD8, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAF, /* 0xF0-0xF3 */
+};
+
+static const unsigned char u2c_8E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xC9, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xCE, /* 0x0C-0x0F */
+	0xF4, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xE6, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xA1, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xEB, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xF1, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xB3, 0x00, 0x00, /* 0x40-0x43 */
+	0xF0, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF4, /* 0x44-0x47 */
+	0xD4, 0xB0, 0xF3, 0xB2, 0xFB, 0xB7, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xF5, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE7, /* 0x5C-0x5F */
+	0xF4, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xF5, 0xED, 0x00, 0x00, 0xCF, 0xF3, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xF0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCE, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xCC, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xE5, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xF5, 0xE3, 0xF3, /* 0xA8-0xAB */
+	0xCF, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xCF, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xB3, 0xE4, 0xD8, /* 0xC8-0xCB */
+	0xCF, 0xF9, 0xCF, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xFA, 0xCD, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE3, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE2, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xF5, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBB, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xDC, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8F[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF2, /* 0x00-0x03 */
+	0x00, 0x00, 0xD6, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xEE, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xE5, 0xD8, 0xC2, /* 0x10-0x13 */
+	0xDC, 0xD0, 0xCC, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE0, /* 0x18-0x1B */
+	0xF6, 0xCA, 0xFD, 0xCA, 0xD8, 0xD6, 0xF4, 0xCF, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xA6, 0xDC, 0xBE, /* 0x24-0x27 */
+	0x00, 0x00, 0xDB, 0xD4, 0xD7, 0xC7, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFE, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCD, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xE2, 0xC3, 0xDC, 0xDE, 0x00, 0x00, 0xDC, 0xDF, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xAD, 0xE6, 0xAB, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xF9, 0xDD, 0xEA, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xEF, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xF4, 0xD0, 0xCE, 0xF3, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xE6, 0xAC, 0x00, 0x00, 0xCE, 0xDE, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xF9, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF4, /* 0x98-0x9B */
+	0xCD, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB8, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xFD, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xDC, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xDE, 0xF6, 0x00, 0x00, 0xDC, 0xAA, /* 0xAC-0xAF */
+	0xF2, 0xE3, 0xE9, 0xB4, 0xD2, 0xDC, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xE6, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE3, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xCA, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xD0, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xDA, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xBC, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xE8, 0xDA, 0xDE, /* 0xE8-0xEB */
+	0x00, 0x00, 0xF2, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xE2, 0xFB, 0x00, 0x00, 0xCC, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBB, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xEE, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xF5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_90[512] = {
+	0xF7, 0xDC, 0xE1, 0xEA, 0xCE, 0xC1, 0xD4, 0xB1, /* 0x00-0x03 */
+	0x00, 0x00, 0xFD, 0xB1, 0xE6, 0xBD, 0x00, 0x00, /* 0x04-0x07 */
+	0xFB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE7, /* 0x08-0x0B */
+	0x00, 0x00, 0xE1, 0xCE, 0x00, 0x00, 0xF7, 0xE2, /* 0x0C-0x0F */
+	0xF5, 0xEF, 0xCF, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xD4, 0xB2, 0xCC, 0xEF, 0x00, 0x00, 0xD4, 0xE8, /* 0x14-0x17 */
+	0x00, 0x00, 0xEE, 0xCF, 0xF7, 0xD7, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xE0, 0xA6, 0xD6, 0xC1, 0xE1, 0xDC, /* 0x1C-0x1F */
+	0xF0, 0xE3, 0xF1, 0xE4, 0xDC, 0xF1, 0xD6, 0xA7, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xF5, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xF1, 0xCE, 0xF2, 0xE4, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xD0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xEC, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xF9, 0xBA, 0x00, 0x00, 0xEB, 0xB5, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xD4, 0xED, 0xE2, 0xC4, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE7, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xB4, 0xEA, 0xA1, /* 0x48-0x4B */
+	0x00, 0x00, 0xF8, 0xBC, 0xCE, 0xA6, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF9, 0xC6, 0xFC, 0xDA, 0x00, 0x00, 0xD4, 0xB3, /* 0x50-0x53 */
+	0xD3, 0xB9, 0xEA, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xE9, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xE1, 0xE1, 0xD3, 0xCF, 0xF4, 0xF6, 0x00, 0x00, /* 0x5C-0x5F */
+	0xEA, 0xC0, 0xE1, 0xCF, 0x00, 0x00, 0xCC, 0xBA, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xEE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xF0, 0xE4, 0xF3, 0xB4, 0xD4, 0xEE, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xC0, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xF1, 0xE5, 0x00, 0x00, 0xF4, 0xC3, /* 0x74-0x77 */
+	0xE0, 0xD4, 0x00, 0x00, 0xEB, 0xB6, 0x00, 0x00, /* 0x78-0x7B */
+	0xD7, 0xA1, 0xCB, 0xE8, 0x00, 0x00, 0xF9, 0xAD, /* 0x7C-0x7F */
+	
+	0xE9, 0xAD, 0xD8, 0xE4, 0xFA, 0xB3, 0xE2, 0xC5, /* 0x80-0x83 */
+	0xFC, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC4, /* 0x84-0x87 */
+	0xD8, 0xB1, 0x00, 0x00, 0xDC, 0xAB, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA4, /* 0x8C-0x8F */
+	0x00, 0x00, 0xEB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xE8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xD8, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xFB, 0xAE, 0xD1, 0xE1, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xF5, 0xBE, 0x00, 0x00, 0xDE, 0xF7, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xFB, /* 0xAC-0xAF */
+	0xF7, 0xC6, 0xCF, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE1, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xEE, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xE9, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xF4, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xCD, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xCF, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xDD, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xCE, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xE9, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xD4, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_91[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xC7, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xDB, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xFA, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xDE, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF8, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xEF, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xB3, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xEB, 0xB7, 0xEF, 0xF8, 0xF5, 0xDC, /* 0x48-0x4B */
+	0xED, 0xCC, 0xDB, 0xD5, 0xF1, 0xCF, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xD0, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xB2, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xD9, 0xAE, 0xD5, 0xAC, 0x00, 0x00, /* 0x68-0x6B */
+	0xE2, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xFD, 0xA3, 0x00, 0x00, 0xFB, 0xE5, /* 0x74-0x77 */
+	0xDF, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF5, /* 0x84-0x87 */
+	0x00, 0x00, 0xF6, 0xAD, 0x00, 0x00, 0xF5, 0xB3, /* 0x88-0x8B */
+	0x00, 0x00, 0xF0, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xA5, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xF5, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xA2, /* 0xA8-0xAB */
+	0xED, 0xFD, 0x00, 0x00, 0xF5, 0xB4, 0xFB, 0xB8, /* 0xAC-0xAF */
+	0x00, 0x00, 0xDB, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xD6, 0xCA, 0xCB, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE5, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFA, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xEB, 0xB8, 0x00, 0x00, 0xE0, 0xB7, /* 0xC8-0xCB */
+	0xD7, 0xEC, 0xF1, 0xEC, 0xE5, 0xAF, 0xD5, 0xE1, /* 0xCC-0xCF */
+	0xD7, 0xED, 0xD1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF2, /* 0xD4-0xD7 */
+	0xEF, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xDD, 0xBC, 0xF6, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE5, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC4, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xE9, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xF3, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_92[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xD4, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xCC, 0xA2, 0xF7, 0xFE, 0xDF, 0xBC, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCD, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xB7, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xD6, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xAD, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAF, /* 0x3C-0x3F */
+	0xCB, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xCB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xFA, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xC6, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xC7, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xA4, 0x00, 0x00, /* 0x60-0x63 */
+	0xCF, 0xC9, 0xE2, 0xFC, 0xEF, 0xFA, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xEB, 0xDE, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xC8, /* 0x80-0x83 */
+	0x00, 0x00, 0xD4, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE0, 0xD5, 0x00, 0x00, 0xEF, 0xB0, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xC7, 0x00, 0x00, /* 0x94-0x97 */
+	0xD9, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xF9, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE5, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xCF, 0xCA, 0xE1, 0xD1, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE2, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xEF, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF9, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xF2, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xE0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xE8, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xCB, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xCB, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_93[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xD6, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xF5, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xF5, 0xDF, 0x00, 0x00, 0xEE, 0xB6, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xF6, 0xD3, 0xCA, /* 0x1C-0x1F */
+	0xEF, 0xFC, 0xD1, 0xC4, 0xEF, 0xB1, 0x00, 0x00, /* 0x20-0x23 */
+	0xD1, 0xC5, 0x00, 0x00, 0xD0, 0xDE, 0x00, 0x00, /* 0x24-0x27 */
+	0xD9, 0xE1, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB8, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xD1, 0xF3, 0xB9, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xE7, 0xCC, 0x00, 0x00, 0xD6, 0xA8, 0xCE, 0xA7, /* 0x48-0x4B */
+	0x00, 0x00, 0xD4, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xE4, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB4, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xB9, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xCB, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xF6, 0xDD, 0x00, 0x00, 0xF1, 0xA3, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xCC, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xE9, 0xCA, 0x00, 0x00, 0xE1, 0xF0, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xE0, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAF, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD1, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xFB, 0xE0, 0xF2, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xEC, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEC, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xEE, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xCB, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xCC, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xD7, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xA1, 0x00, 0x00, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_94[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xFC, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xF1, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xE0, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB2, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xF4, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xF7, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xF1, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xCA, 0xFC, 0xCA, 0xFD, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCE, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xF3, 0xC8, 0x00, 0x00, 0xF3, 0xBA, /* 0x7C-0x7F */
+};
+
+static const unsigned char u2c_95[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xFE, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xDA, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEC, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xF8, 0xCD, 0x00, 0x00, 0xCB, 0xD2, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCE, /* 0x8C-0x8F */
+	0x00, 0x00, 0xF9, 0xD8, 0xF9, 0xD9, 0xCA, 0xE0, /* 0x90-0x93 */
+	0xDA, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xCB, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC8, /* 0xA0-0xA3 */
+	0xF9, 0xEE, 0xDB, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xD0, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xD5, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xE6, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xA2, /* 0xB8-0xBB */
+	0xE4, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE1, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xFC, 0xC4, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xF9, 0xEF, 0xCF, 0xF4, 0xF7, 0xE6, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xCE, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0xF4, 0xC5, 0xDC, 0xA3, 0x00, 0x00, /* 0xE0-0xE3 */
+};
+
+static const unsigned char u2c_96[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xDD, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xF4, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xA1, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xD6, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xC1, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE6, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB9, /* 0x3C-0x3F */
+	0xF6, 0xED, 0x00, 0x00, 0xF9, 0xAE, 0x00, 0x00, /* 0x40-0x43 */
+	0xDD, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB0, /* 0x48-0x4B */
+	0xD8, 0xE8, 0xCB, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF9, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xCE, /* 0x58-0x5B */
+	0xF9, 0xF0, 0xE0, 0xED, 0xE3, 0xB3, 0xF4, 0xB3, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xC2, 0xF2, 0xE6, /* 0x60-0x63 */
+	0xF0, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xD6, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xEB, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE7, /* 0x70-0x73 */
+	0x00, 0x00, 0xD7, 0xD5, 0xD4, 0xB6, 0xF9, 0xE8, /* 0x74-0x77 */
+	0xD7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE5, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xE9, 0xEA, 0xD7, 0xCC, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xE9, 0xE2, 0xC9, /* 0x88-0x8B */
+	0x00, 0x00, 0xFC, 0xDB, 0xCD, 0xAD, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xCC, 0xB0, 0xEA, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xE4, 0xF6, 0xD0, 0xC0, 0x00, 0x00, 0xF0, 0xB7, /* 0x98-0x9B */
+	0xEE, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF6, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCA, /* 0xA4-0xA7 */
+	0xE2, 0xCB, 0x00, 0x00, 0xFA, 0xCF, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xEB, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xCB, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xB4, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xED, 0xCD, 0xE4, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xEA, 0xA9, 0xE4, 0xBA, 0xF3, 0xA2, 0xCD, 0xD2, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xF6, 0xCB, 0x00, 0x00, 0xF1, 0xE6, /* 0xC8-0xCB */
+	0xED, 0xC1, 0xE8, 0xBC, 0xEE, 0xD1, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xF0, 0xE7, 0xE2, 0xCC, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE4, 0xAA, 0x00, 0x00, 0xF5, 0xE1, /* 0xD8-0xDB */
+	0xED, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xEE, 0xD1, 0xF1, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xE9, 0xEB, 0xE9, 0xEC, 0xE0, 0xE4, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA7, /* 0xEC-0xEF */
+	0xDD, 0xD4, 0x00, 0x00, 0xEA, 0xA3, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xC3, 0xD6, 0xF4, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xDA, 0xDF, 0x00, 0x00, 0xEF, 0xB3, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_97[512] = {
+	0xE2, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xFD, 0xF2, 0xE8, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xEF, 0xC5, 0x00, 0x00, 0xE7, 0xE7, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xFD, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE7, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xDF, 0xDC, 0x00, 0x00, 0xF9, 0xC7, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF6, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xDF, 0xAC, 0x00, 0x00, 0xD6, 0xDA, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xDC, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xF0, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xFA, 0x00, 0x00, /* 0x40-0x43 */
+	0xE4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xD6, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xF4, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xFE, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xF0, 0xA1, 0x00, 0x00, 0xDE, 0xAA, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xDA, 0xBC, 0xD8, 0xFC, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xFA, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xEC, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xFC, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE6, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xCB, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xB9, /* 0x88-0x8B */
+	0x00, 0x00, 0xE4, 0xD3, 0x00, 0x00, 0xCD, 0xF9, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xCF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xCA, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD4, /* 0xA8-0xAB */
+	0x00, 0x00, 0xF8, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xC7, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDF, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xDB, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xD4, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xE5, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xD2, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xA4, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC2, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_98[512] = {
+	0xFB, 0xE1, 0xFA, 0xED, 0xF0, 0xA2, 0xCC, 0xF1, /* 0x00-0x03 */
+	0x00, 0x00, 0xFA, 0xA3, 0xE2, 0xF7, 0x00, 0x00, /* 0x04-0x07 */
+	0xE2, 0xCE, 0x00, 0x00, 0xE9, 0xF5, 0x00, 0x00, /* 0x08-0x0B */
+	0xE1, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE7, 0xE8, 0xE8, 0xD7, 0xDA, 0xF8, 0xD4, 0xCB, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xF6, /* 0x14-0x17 */
+	0xD6, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xD4, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xFA, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xCC, 0xF2, 0xF7, 0xDD, 0x00, 0x00, 0xDE, 0xBA, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xA8, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xF0, 0xB9, 0xE4, 0xFE, 0xE4, 0xC9, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xE4, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xEA, 0xC3, 0x00, 0x00, 0xEF, 0xB4, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xBE, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xFB, 0xE2, 0x00, 0x00, 0xCD, 0xD3, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB5, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xE9, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xF9, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xBD, /* 0xAC-0xAF */
+	0x00, 0x00, 0xF7, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xF8, 0xFD, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xFC, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAB, /* 0xD8-0xDB */
+	0xDB, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xDD, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE1, 0xE2, 0xD1, 0xC6, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xF6, 0xD0, 0xEB, 0xE6, 0xDA, 0xF9, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xEC, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xDE, 0xF8, 0xF8, 0xE9, 0xE3, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_99[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF5, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xFA, 0xC3, 0xE5, 0xD7, 0x00, 0x00, /* 0x08-0x0B */
+	0xEC, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xF3, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xBB, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xE6, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */
+	0xDC, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xCE, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xD8, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xD0, 0xCF, 0x00, 0x00, 0xCF, 0xFA, /* 0x48-0x4B */
+	0xF3, 0xCA, 0xE0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xD1, 0xC7, 0xE9, 0xAE, 0x00, 0x00, /* 0x50-0x53 */
+	0xE8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC4, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xCF, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xFA, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xF9, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xDC, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xFB, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xD8, 0xA9, 0xE5, 0xDF, 0xF9, 0xA7, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xF6, 0xEE, 0x00, 0x00, 0xF6, 0xCC, /* 0xB0-0xB3 */
+	0xE2, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xEC, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xDA, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xF1, 0xD2, 0xD2, 0xCC, 0xCF, 0xCB, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xCA, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xDD, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xF6, 0xEF, 0x00, 0x00, 0xDE, 0xF9, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xFA, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xD5, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE7, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9A[512] = {
+	0x00, 0x00, 0xDE, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xDC, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xC8, 0xD1, 0xC9, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xF8, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF6, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xD4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xE2, 0xE1, 0xD3, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xD8, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xFE, /* 0x40-0x43 */
+	0x00, 0x00, 0xCF, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xFD, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xCE, 0xF6, 0x00, 0x00, 0xFA, 0xD0, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xF3, 0xE6, 0xBE, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAE, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xF0, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xD1, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xFC, 0xBE, 0xD5, 0xF1, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xCD, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xFA, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD0, /* 0xD0-0xD3 */
+	0xF4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xCD, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE7, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xA5, 0x00, 0x00, /* 0xEC-0xEF */
+};
+
+static const unsigned char u2c_9B[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xD1, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xA2, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xE3, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xEA, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xD0, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xCE, 0xDA, 0xFB, 0xEB, 0xDB, 0xA6, /* 0x40-0x43 */
+	0xDB, 0xDE, 0xD8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE0, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xD8, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xE0, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xDB, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xC6, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xF8, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD5, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF7, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xD8, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xD7, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xCD, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xCC, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+};
+
+static const unsigned char u2c_9C[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xF5, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE4, 0xCA, 0x00, 0x00, 0xDC, 0xE1, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xF9, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xFC, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xE8, 0xA7, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC4, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xBE, /* 0x44-0x47 */
+	0x00, 0x00, 0xDC, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF7, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xF0, 0xE8, 0x00, 0x00, 0xDD, 0xC0, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xCF, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF3, /* 0xF0-0xF3 */
+	0xD9, 0xB0, 0x00, 0x00, 0xE6, 0xE9, 0x00, 0x00, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_9D[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xE4, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC4, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xEC, 0x00, 0x00, /* 0x24-0x27 */
+	0xE4, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xF8, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xCC, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xE4, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xCD, 0xDC, 0xD9, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xDD, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xCE, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xD9, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA3, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xF9, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xCD, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xCE, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xAF, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xFD, 0xD3, 0xEB, 0xED, 0xD6, 0xDC, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_9E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xA4, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xD6, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xF9, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE7, 0xA4, 0x00, 0x00, 0xD6, 0xE3, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xCB, 0xD6, 0xE4, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF2, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xDE, 0xFA, 0x00, 0x00, 0xD7, 0xF8, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xD8, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xCF, 0xD5, 0xD8, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xAB, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xFD, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDC, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xE0, 0xA8, 0xD5, 0xF3, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xFD, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xCC, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xD9, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xEA, /* 0xD8-0xDB */
+	0xF5, 0xF5, 0x00, 0x00, 0xEF, 0xC7, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xD3, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xDA, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+};
+
+static const unsigned char u2c_9F[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA8, /* 0x04-0x07 */
+	0xDC, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xA3, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD5, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE0, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAC, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xBA, 0xEE, 0xB1, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xB2, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xCD, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xD2, /* 0x5C-0x5F */
+	0x00, 0x00, 0xD6, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE5, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xBB, 0x00, 0x00, /* 0x68-0x6B */
+	0xE5, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCB, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xD7, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xDB, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xCA, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xCF, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+};
+
+static const unsigned char u2c_AC[512] = {
+	0xB0, 0xA1, 0xB0, 0xA2, 0x81, 0x41, 0x81, 0x42, /* 0x00-0x03 */
+	0xB0, 0xA3, 0x81, 0x43, 0x81, 0x44, 0xB0, 0xA4, /* 0x04-0x07 */
+	0xB0, 0xA5, 0xB0, 0xA6, 0xB0, 0xA7, 0x81, 0x45, /* 0x08-0x0B */
+	0x81, 0x46, 0x81, 0x47, 0x81, 0x48, 0x81, 0x49, /* 0x0C-0x0F */
+	0xB0, 0xA8, 0xB0, 0xA9, 0xB0, 0xAA, 0xB0, 0xAB, /* 0x10-0x13 */
+	0xB0, 0xAC, 0xB0, 0xAD, 0xB0, 0xAE, 0xB0, 0xAF, /* 0x14-0x17 */
+	0x81, 0x4A, 0xB0, 0xB0, 0xB0, 0xB1, 0xB0, 0xB2, /* 0x18-0x1B */
+	0xB0, 0xB3, 0xB0, 0xB4, 0x81, 0x4B, 0x81, 0x4C, /* 0x1C-0x1F */
+	0xB0, 0xB5, 0x81, 0x4D, 0x81, 0x4E, 0x81, 0x4F, /* 0x20-0x23 */
+	0xB0, 0xB6, 0x81, 0x50, 0x81, 0x51, 0x81, 0x52, /* 0x24-0x27 */
+	0x81, 0x53, 0x81, 0x54, 0x81, 0x55, 0x81, 0x56, /* 0x28-0x2B */
+	0xB0, 0xB7, 0xB0, 0xB8, 0x81, 0x57, 0xB0, 0xB9, /* 0x2C-0x2F */
+	0xB0, 0xBA, 0xB0, 0xBB, 0x81, 0x58, 0x81, 0x59, /* 0x30-0x33 */
+	0x81, 0x5A, 0x81, 0x61, 0x81, 0x62, 0x81, 0x63, /* 0x34-0x37 */
+	0xB0, 0xBC, 0xB0, 0xBD, 0x81, 0x64, 0x81, 0x65, /* 0x38-0x3B */
+	0xB0, 0xBE, 0x81, 0x66, 0x81, 0x67, 0x81, 0x68, /* 0x3C-0x3F */
+	0xB0, 0xBF, 0x81, 0x69, 0x81, 0x6A, 0x81, 0x6B, /* 0x40-0x43 */
+	0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x44-0x47 */
+	0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0xB0, 0xC0, /* 0x48-0x4B */
+	0x81, 0x73, 0xB0, 0xC1, 0x81, 0x74, 0x81, 0x75, /* 0x4C-0x4F */
+	0x81, 0x76, 0x81, 0x77, 0x81, 0x78, 0x81, 0x79, /* 0x50-0x53 */
+	0xB0, 0xC2, 0x81, 0x7A, 0x81, 0x81, 0x81, 0x82, /* 0x54-0x57 */
+	0xB0, 0xC3, 0x81, 0x83, 0x81, 0x84, 0x81, 0x85, /* 0x58-0x5B */
+	0xB0, 0xC4, 0x81, 0x86, 0x81, 0x87, 0x81, 0x88, /* 0x5C-0x5F */
+	0x81, 0x89, 0x81, 0x8A, 0x81, 0x8B, 0x81, 0x8C, /* 0x60-0x63 */
+	0x81, 0x8D, 0x81, 0x8E, 0x81, 0x8F, 0x81, 0x90, /* 0x64-0x67 */
+	0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0x81, 0x94, /* 0x68-0x6B */
+	0x81, 0x95, 0x81, 0x96, 0x81, 0x97, 0x81, 0x98, /* 0x6C-0x6F */
+	0xB0, 0xC5, 0xB0, 0xC6, 0x81, 0x99, 0x81, 0x9A, /* 0x70-0x73 */
+	0xB0, 0xC7, 0x81, 0x9B, 0x81, 0x9C, 0xB0, 0xC8, /* 0x74-0x77 */
+	0xB0, 0xC9, 0x81, 0x9D, 0xB0, 0xCA, 0x81, 0x9E, /* 0x78-0x7B */
+	0x81, 0x9F, 0x81, 0xA0, 0x81, 0xA1, 0x81, 0xA2, /* 0x7C-0x7F */
+	
+	0xB0, 0xCB, 0xB0, 0xCC, 0x81, 0xA3, 0xB0, 0xCD, /* 0x80-0x83 */
+	0xB0, 0xCE, 0xB0, 0xCF, 0xB0, 0xD0, 0x81, 0xA4, /* 0x84-0x87 */
+	0x81, 0xA5, 0xB0, 0xD1, 0xB0, 0xD2, 0xB0, 0xD3, /* 0x88-0x8B */
+	0xB0, 0xD4, 0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, /* 0x8C-0x8F */
+	0xB0, 0xD5, 0x81, 0xA9, 0x81, 0xAA, 0x81, 0xAB, /* 0x90-0x93 */
+	0xB0, 0xD6, 0x81, 0xAC, 0x81, 0xAD, 0x81, 0xAE, /* 0x94-0x97 */
+	0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, 0x81, 0xB2, /* 0x98-0x9B */
+	0xB0, 0xD7, 0xB0, 0xD8, 0x81, 0xB3, 0xB0, 0xD9, /* 0x9C-0x9F */
+	0xB0, 0xDA, 0xB0, 0xDB, 0x81, 0xB4, 0x81, 0xB5, /* 0xA0-0xA3 */
+	0x81, 0xB6, 0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, /* 0xA4-0xA7 */
+	0xB0, 0xDC, 0xB0, 0xDD, 0xB0, 0xDE, 0x81, 0xBA, /* 0xA8-0xAB */
+	0xB0, 0xDF, 0x81, 0xBB, 0x81, 0xBC, 0xB0, 0xE0, /* 0xAC-0xAF */
+	0xB0, 0xE1, 0x81, 0xBD, 0x81, 0xBE, 0x81, 0xBF, /* 0xB0-0xB3 */
+	0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0x81, 0xC3, /* 0xB4-0xB7 */
+	0xB0, 0xE2, 0xB0, 0xE3, 0x81, 0xC4, 0xB0, 0xE4, /* 0xB8-0xBB */
+	0xB0, 0xE5, 0xB0, 0xE6, 0x81, 0xC5, 0x81, 0xC6, /* 0xBC-0xBF */
+	0x81, 0xC7, 0xB0, 0xE7, 0x81, 0xC8, 0x81, 0xC9, /* 0xC0-0xC3 */
+	0xB0, 0xE8, 0x81, 0xCA, 0x81, 0xCB, 0x81, 0xCC, /* 0xC4-0xC7 */
+	0xB0, 0xE9, 0x81, 0xCD, 0x81, 0xCE, 0x81, 0xCF, /* 0xC8-0xCB */
+	0xB0, 0xEA, 0x81, 0xD0, 0x81, 0xD1, 0x81, 0xD2, /* 0xCC-0xCF */
+	0x81, 0xD3, 0x81, 0xD4, 0x81, 0xD5, 0x81, 0xD6, /* 0xD0-0xD3 */
+	0x81, 0xD7, 0xB0, 0xEB, 0x81, 0xD8, 0xB0, 0xEC, /* 0xD4-0xD7 */
+	0x81, 0xD9, 0x81, 0xDA, 0x81, 0xDB, 0x81, 0xDC, /* 0xD8-0xDB */
+	0x81, 0xDD, 0x81, 0xDE, 0x81, 0xDF, 0x81, 0xE0, /* 0xDC-0xDF */
+	0xB0, 0xED, 0xB0, 0xEE, 0x81, 0xE1, 0x81, 0xE2, /* 0xE0-0xE3 */
+	0xB0, 0xEF, 0x81, 0xE3, 0x81, 0xE4, 0xB0, 0xF0, /* 0xE4-0xE7 */
+	0xB0, 0xF1, 0x81, 0xE5, 0xB0, 0xF2, 0x81, 0xE6, /* 0xE8-0xEB */
+	0xB0, 0xF3, 0x81, 0xE7, 0x81, 0xE8, 0xB0, 0xF4, /* 0xEC-0xEF */
+	0xB0, 0xF5, 0xB0, 0xF6, 0x81, 0xE9, 0xB0, 0xF7, /* 0xF0-0xF3 */
+	0x81, 0xEA, 0xB0, 0xF8, 0xB0, 0xF9, 0x81, 0xEB, /* 0xF4-0xF7 */
+	0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, 0x81, 0xEF, /* 0xF8-0xFB */
+	0xB0, 0xFA, 0xB0, 0xFB, 0x81, 0xF0, 0x81, 0xF1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_AD[512] = {
+	0xB0, 0xFC, 0x81, 0xF2, 0x81, 0xF3, 0x81, 0xF4, /* 0x00-0x03 */
+	0xB0, 0xFD, 0x81, 0xF5, 0xB0, 0xFE, 0x81, 0xF6, /* 0x04-0x07 */
+	0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0x81, 0xFA, /* 0x08-0x0B */
+	0xB1, 0xA1, 0xB1, 0xA2, 0x81, 0xFB, 0xB1, 0xA3, /* 0x0C-0x0F */
+	0x81, 0xFC, 0xB1, 0xA4, 0x81, 0xFD, 0x81, 0xFE, /* 0x10-0x13 */
+	0x82, 0x41, 0x82, 0x42, 0x82, 0x43, 0x82, 0x44, /* 0x14-0x17 */
+	0xB1, 0xA5, 0x82, 0x45, 0x82, 0x46, 0x82, 0x47, /* 0x18-0x1B */
+	0xB1, 0xA6, 0x82, 0x48, 0x82, 0x49, 0x82, 0x4A, /* 0x1C-0x1F */
+	0xB1, 0xA7, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0x20-0x23 */
+	0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0x82, 0x51, /* 0x24-0x27 */
+	0x82, 0x52, 0xB1, 0xA8, 0x82, 0x53, 0x82, 0x54, /* 0x28-0x2B */
+	0xB1, 0xA9, 0xB1, 0xAA, 0x82, 0x55, 0x82, 0x56, /* 0x2C-0x2F */
+	0x82, 0x57, 0x82, 0x58, 0x82, 0x59, 0x82, 0x5A, /* 0x30-0x33 */
+	0xB1, 0xAB, 0xB1, 0xAC, 0x82, 0x61, 0x82, 0x62, /* 0x34-0x37 */
+	0xB1, 0xAD, 0x82, 0x63, 0x82, 0x64, 0x82, 0x65, /* 0x38-0x3B */
+	0xB1, 0xAE, 0x82, 0x66, 0x82, 0x67, 0x82, 0x68, /* 0x3C-0x3F */
+	0x82, 0x69, 0x82, 0x6A, 0x82, 0x6B, 0x82, 0x6C, /* 0x40-0x43 */
+	0xB1, 0xAF, 0xB1, 0xB0, 0x82, 0x6D, 0xB1, 0xB1, /* 0x44-0x47 */
+	0x82, 0x6E, 0xB1, 0xB2, 0x82, 0x6F, 0x82, 0x70, /* 0x48-0x4B */
+	0x82, 0x71, 0x82, 0x72, 0x82, 0x73, 0x82, 0x74, /* 0x4C-0x4F */
+	0xB1, 0xB3, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x50-0x53 */
+	0xB1, 0xB4, 0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, /* 0x54-0x57 */
+	0xB1, 0xB5, 0x82, 0x81, 0x82, 0x82, 0x82, 0x83, /* 0x58-0x5B */
+	0x82, 0x84, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x5C-0x5F */
+	0x82, 0x88, 0xB1, 0xB6, 0x82, 0x89, 0xB1, 0xB7, /* 0x60-0x63 */
+	0x82, 0x8A, 0x82, 0x8B, 0x82, 0x8C, 0x82, 0x8D, /* 0x64-0x67 */
+	0x82, 0x8E, 0x82, 0x8F, 0x82, 0x90, 0x82, 0x91, /* 0x68-0x6B */
+	0xB1, 0xB8, 0xB1, 0xB9, 0x82, 0x92, 0x82, 0x93, /* 0x6C-0x6F */
+	0xB1, 0xBA, 0x82, 0x94, 0x82, 0x95, 0xB1, 0xBB, /* 0x70-0x73 */
+	0xB1, 0xBC, 0xB1, 0xBD, 0xB1, 0xBE, 0x82, 0x96, /* 0x74-0x77 */
+	0x82, 0x97, 0x82, 0x98, 0x82, 0x99, 0xB1, 0xBF, /* 0x78-0x7B */
+	0xB1, 0xC0, 0xB1, 0xC1, 0x82, 0x9A, 0xB1, 0xC2, /* 0x7C-0x7F */
+	
+	0x82, 0x9B, 0xB1, 0xC3, 0xB1, 0xC4, 0x82, 0x9C, /* 0x80-0x83 */
+	0x82, 0x9D, 0x82, 0x9E, 0x82, 0x9F, 0x82, 0xA0, /* 0x84-0x87 */
+	0xB1, 0xC5, 0xB1, 0xC6, 0x82, 0xA1, 0x82, 0xA2, /* 0x88-0x8B */
+	0xB1, 0xC7, 0x82, 0xA3, 0x82, 0xA4, 0x82, 0xA5, /* 0x8C-0x8F */
+	0xB1, 0xC8, 0x82, 0xA6, 0x82, 0xA7, 0x82, 0xA8, /* 0x90-0x93 */
+	0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x94-0x97 */
+	0x82, 0xAD, 0x82, 0xAE, 0x82, 0xAF, 0x82, 0xB0, /* 0x98-0x9B */
+	0xB1, 0xC9, 0xB1, 0xCA, 0x82, 0xB1, 0x82, 0xB2, /* 0x9C-0x9F */
+	0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, 0x82, 0xB6, /* 0xA0-0xA3 */
+	0xB1, 0xCB, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0xA4-0xA7 */
+	0x82, 0xBA, 0x82, 0xBB, 0x82, 0xBC, 0x82, 0xBD, /* 0xA8-0xAB */
+	0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, 0x82, 0xC1, /* 0xAC-0xAF */
+	0x82, 0xC2, 0x82, 0xC3, 0x82, 0xC4, 0x82, 0xC5, /* 0xB0-0xB3 */
+	0x82, 0xC6, 0x82, 0xC7, 0x82, 0xC8, 0xB1, 0xCC, /* 0xB4-0xB7 */
+	0x82, 0xC9, 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, /* 0xB8-0xBB */
+	0x82, 0xCD, 0x82, 0xCE, 0x82, 0xCF, 0x82, 0xD0, /* 0xBC-0xBF */
+	0xB1, 0xCD, 0xB1, 0xCE, 0x82, 0xD1, 0x82, 0xD2, /* 0xC0-0xC3 */
+	0xB1, 0xCF, 0x82, 0xD3, 0x82, 0xD4, 0x82, 0xD5, /* 0xC4-0xC7 */
+	0xB1, 0xD0, 0x82, 0xD6, 0x82, 0xD7, 0x82, 0xD8, /* 0xC8-0xCB */
+	0x82, 0xD9, 0x82, 0xDA, 0x82, 0xDB, 0x82, 0xDC, /* 0xCC-0xCF */
+	0xB1, 0xD1, 0xB1, 0xD2, 0x82, 0xDD, 0xB1, 0xD3, /* 0xD0-0xD3 */
+	0x82, 0xDE, 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, /* 0xD4-0xD7 */
+	0x82, 0xE2, 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, /* 0xD8-0xDB */
+	0xB1, 0xD4, 0x82, 0xE6, 0x82, 0xE7, 0x82, 0xE8, /* 0xDC-0xDF */
+	0xB1, 0xD5, 0x82, 0xE9, 0x82, 0xEA, 0x82, 0xEB, /* 0xE0-0xE3 */
+	0xB1, 0xD6, 0x82, 0xEC, 0x82, 0xED, 0x82, 0xEE, /* 0xE4-0xE7 */
+	0x82, 0xEF, 0x82, 0xF0, 0x82, 0xF1, 0x82, 0xF2, /* 0xE8-0xEB */
+	0x82, 0xF3, 0x82, 0xF4, 0x82, 0xF5, 0x82, 0xF6, /* 0xEC-0xEF */
+	0x82, 0xF7, 0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, /* 0xF0-0xF3 */
+	0x82, 0xFB, 0x82, 0xFC, 0x82, 0xFD, 0x82, 0xFE, /* 0xF4-0xF7 */
+	0xB1, 0xD7, 0xB1, 0xD8, 0x83, 0x41, 0x83, 0x42, /* 0xF8-0xFB */
+	0xB1, 0xD9, 0x83, 0x43, 0x83, 0x44, 0xB1, 0xDA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_AE[512] = {
+	0xB1, 0xDB, 0xB1, 0xDC, 0x83, 0x45, 0x83, 0x46, /* 0x00-0x03 */
+	0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0x04-0x07 */
+	0xB1, 0xDD, 0xB1, 0xDE, 0x83, 0x4B, 0xB1, 0xDF, /* 0x08-0x0B */
+	0x83, 0x4C, 0xB1, 0xE0, 0x83, 0x4D, 0x83, 0x4E, /* 0x0C-0x0F */
+	0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0x83, 0x52, /* 0x10-0x13 */
+	0xB1, 0xE1, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0x14-0x17 */
+	0x83, 0x56, 0x83, 0x57, 0x83, 0x58, 0x83, 0x59, /* 0x18-0x1B */
+	0x83, 0x5A, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0x1C-0x1F */
+	0x83, 0x64, 0x83, 0x65, 0x83, 0x66, 0x83, 0x67, /* 0x20-0x23 */
+	0x83, 0x68, 0x83, 0x69, 0x83, 0x6A, 0x83, 0x6B, /* 0x24-0x27 */
+	0x83, 0x6C, 0x83, 0x6D, 0x83, 0x6E, 0x83, 0x6F, /* 0x28-0x2B */
+	0x83, 0x70, 0x83, 0x71, 0x83, 0x72, 0x83, 0x73, /* 0x2C-0x2F */
+	0xB1, 0xE2, 0xB1, 0xE3, 0x83, 0x74, 0x83, 0x75, /* 0x30-0x33 */
+	0xB1, 0xE4, 0x83, 0x76, 0x83, 0x77, 0xB1, 0xE5, /* 0x34-0x37 */
+	0xB1, 0xE6, 0x83, 0x78, 0xB1, 0xE7, 0x83, 0x79, /* 0x38-0x3B */
+	0x83, 0x7A, 0x83, 0x81, 0x83, 0x82, 0x83, 0x83, /* 0x3C-0x3F */
+	0xB1, 0xE8, 0xB1, 0xE9, 0x83, 0x84, 0xB1, 0xEA, /* 0x40-0x43 */
+	0x83, 0x85, 0xB1, 0xEB, 0xB1, 0xEC, 0x83, 0x86, /* 0x44-0x47 */
+	0x83, 0x87, 0x83, 0x88, 0xB1, 0xED, 0x83, 0x89, /* 0x48-0x4B */
+	0xB1, 0xEE, 0xB1, 0xEF, 0xB1, 0xF0, 0x83, 0x8A, /* 0x4C-0x4F */
+	0xB1, 0xF1, 0x83, 0x8B, 0x83, 0x8C, 0x83, 0x8D, /* 0x50-0x53 */
+	0xB1, 0xF2, 0x83, 0x8E, 0xB1, 0xF3, 0x83, 0x8F, /* 0x54-0x57 */
+	0x83, 0x90, 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, /* 0x58-0x5B */
+	0xB1, 0xF4, 0xB1, 0xF5, 0x83, 0x94, 0xB1, 0xF6, /* 0x5C-0x5F */
+	0xB1, 0xF7, 0xB1, 0xF8, 0x83, 0x95, 0x83, 0x96, /* 0x60-0x63 */
+	0x83, 0x97, 0xB1, 0xF9, 0x83, 0x98, 0x83, 0x99, /* 0x64-0x67 */
+	0xB1, 0xFA, 0xB1, 0xFB, 0x83, 0x9A, 0x83, 0x9B, /* 0x68-0x6B */
+	0xB1, 0xFC, 0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, /* 0x6C-0x6F */
+	0xB1, 0xFD, 0x83, 0x9F, 0x83, 0xA0, 0x83, 0xA1, /* 0x70-0x73 */
+	0x83, 0xA2, 0x83, 0xA3, 0x83, 0xA4, 0x83, 0xA5, /* 0x74-0x77 */
+	0xB1, 0xFE, 0xB2, 0xA1, 0x83, 0xA6, 0xB2, 0xA2, /* 0x78-0x7B */
+	0xB2, 0xA3, 0xB2, 0xA4, 0x83, 0xA7, 0x83, 0xA8, /* 0x7C-0x7F */
+	
+	0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, 0x83, 0xAC, /* 0x80-0x83 */
+	0xB2, 0xA5, 0xB2, 0xA6, 0x83, 0xAD, 0x83, 0xAE, /* 0x84-0x87 */
+	0x83, 0xAF, 0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, /* 0x88-0x8B */
+	0xB2, 0xA7, 0x83, 0xB3, 0x83, 0xB4, 0x83, 0xB5, /* 0x8C-0x8F */
+	0x83, 0xB6, 0x83, 0xB7, 0x83, 0xB8, 0x83, 0xB9, /* 0x90-0x93 */
+	0x83, 0xBA, 0x83, 0xBB, 0x83, 0xBC, 0x83, 0xBD, /* 0x94-0x97 */
+	0x83, 0xBE, 0x83, 0xBF, 0x83, 0xC0, 0x83, 0xC1, /* 0x98-0x9B */
+	0x83, 0xC2, 0x83, 0xC3, 0x83, 0xC4, 0x83, 0xC5, /* 0x9C-0x9F */
+	0x83, 0xC6, 0x83, 0xC7, 0x83, 0xC8, 0x83, 0xC9, /* 0xA0-0xA3 */
+	0x83, 0xCA, 0x83, 0xCB, 0x83, 0xCC, 0x83, 0xCD, /* 0xA4-0xA7 */
+	0x83, 0xCE, 0x83, 0xCF, 0x83, 0xD0, 0x83, 0xD1, /* 0xA8-0xAB */
+	0x83, 0xD2, 0x83, 0xD3, 0x83, 0xD4, 0x83, 0xD5, /* 0xAC-0xAF */
+	0x83, 0xD6, 0x83, 0xD7, 0x83, 0xD8, 0x83, 0xD9, /* 0xB0-0xB3 */
+	0x83, 0xDA, 0x83, 0xDB, 0x83, 0xDC, 0x83, 0xDD, /* 0xB4-0xB7 */
+	0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, 0x83, 0xE1, /* 0xB8-0xBB */
+	0xB2, 0xA8, 0xB2, 0xA9, 0xB2, 0xAA, 0x83, 0xE2, /* 0xBC-0xBF */
+	0xB2, 0xAB, 0x83, 0xE3, 0x83, 0xE4, 0x83, 0xE5, /* 0xC0-0xC3 */
+	0xB2, 0xAC, 0x83, 0xE6, 0x83, 0xE7, 0x83, 0xE8, /* 0xC4-0xC7 */
+	0x83, 0xE9, 0x83, 0xEA, 0x83, 0xEB, 0x83, 0xEC, /* 0xC8-0xCB */
+	0xB2, 0xAD, 0xB2, 0xAE, 0x83, 0xED, 0xB2, 0xAF, /* 0xCC-0xCF */
+	0xB2, 0xB0, 0xB2, 0xB1, 0x83, 0xEE, 0x83, 0xEF, /* 0xD0-0xD3 */
+	0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, 0x83, 0xF3, /* 0xD4-0xD7 */
+	0xB2, 0xB2, 0xB2, 0xB3, 0x83, 0xF4, 0x83, 0xF5, /* 0xD8-0xDB */
+	0xB2, 0xB4, 0x83, 0xF6, 0x83, 0xF7, 0x83, 0xF8, /* 0xDC-0xDF */
+	0x83, 0xF9, 0x83, 0xFA, 0x83, 0xFB, 0x83, 0xFC, /* 0xE0-0xE3 */
+	0x83, 0xFD, 0x83, 0xFE, 0x84, 0x41, 0x84, 0x42, /* 0xE4-0xE7 */
+	0xB2, 0xB5, 0x84, 0x43, 0x84, 0x44, 0xB2, 0xB6, /* 0xE8-0xEB */
+	0x84, 0x45, 0xB2, 0xB7, 0x84, 0x46, 0x84, 0x47, /* 0xEC-0xEF */
+	0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, 0x84, 0x4B, /* 0xF0-0xF3 */
+	0xB2, 0xB8, 0x84, 0x4C, 0x84, 0x4D, 0x84, 0x4E, /* 0xF4-0xF7 */
+	0xB2, 0xB9, 0x84, 0x4F, 0x84, 0x50, 0x84, 0x51, /* 0xF8-0xFB */
+	0xB2, 0xBA, 0x84, 0x52, 0x84, 0x53, 0x84, 0x54, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_AF[512] = {
+	0x84, 0x55, 0x84, 0x56, 0x84, 0x57, 0x84, 0x58, /* 0x00-0x03 */
+	0x84, 0x59, 0x84, 0x5A, 0x84, 0x61, 0xB2, 0xBB, /* 0x04-0x07 */
+	0xB2, 0xBC, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x08-0x0B */
+	0x84, 0x65, 0xB2, 0xBD, 0x84, 0x66, 0x84, 0x67, /* 0x0C-0x0F */
+	0xB2, 0xBE, 0x84, 0x68, 0x84, 0x69, 0x84, 0x6A, /* 0x10-0x13 */
+	0x84, 0x6B, 0x84, 0x6C, 0x84, 0x6D, 0x84, 0x6E, /* 0x14-0x17 */
+	0x84, 0x6F, 0x84, 0x70, 0x84, 0x71, 0x84, 0x72, /* 0x18-0x1B */
+	0x84, 0x73, 0x84, 0x74, 0x84, 0x75, 0x84, 0x76, /* 0x1C-0x1F */
+	0x84, 0x77, 0x84, 0x78, 0x84, 0x79, 0x84, 0x7A, /* 0x20-0x23 */
+	0x84, 0x81, 0x84, 0x82, 0x84, 0x83, 0x84, 0x84, /* 0x24-0x27 */
+	0x84, 0x85, 0x84, 0x86, 0x84, 0x87, 0x84, 0x88, /* 0x28-0x2B */
+	0xB2, 0xBF, 0xB2, 0xC0, 0x84, 0x89, 0x84, 0x8A, /* 0x2C-0x2F */
+	0xB2, 0xC1, 0x84, 0x8B, 0xB2, 0xC2, 0x84, 0x8C, /* 0x30-0x33 */
+	0xB2, 0xC3, 0x84, 0x8D, 0x84, 0x8E, 0x84, 0x8F, /* 0x34-0x37 */
+	0x84, 0x90, 0x84, 0x91, 0x84, 0x92, 0x84, 0x93, /* 0x38-0x3B */
+	0xB2, 0xC4, 0xB2, 0xC5, 0x84, 0x94, 0xB2, 0xC6, /* 0x3C-0x3F */
+	0x84, 0x95, 0xB2, 0xC7, 0xB2, 0xC8, 0xB2, 0xC9, /* 0x40-0x43 */
+	0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x44-0x47 */
+	0xB2, 0xCA, 0xB2, 0xCB, 0x84, 0x9A, 0x84, 0x9B, /* 0x48-0x4B */
+	0x84, 0x9C, 0x84, 0x9D, 0x84, 0x9E, 0x84, 0x9F, /* 0x4C-0x4F */
+	0xB2, 0xCC, 0x84, 0xA0, 0x84, 0xA1, 0x84, 0xA2, /* 0x50-0x53 */
+	0x84, 0xA3, 0x84, 0xA4, 0x84, 0xA5, 0x84, 0xA6, /* 0x54-0x57 */
+	0x84, 0xA7, 0x84, 0xA8, 0x84, 0xA9, 0x84, 0xAA, /* 0x58-0x5B */
+	0xB2, 0xCD, 0xB2, 0xCE, 0x84, 0xAB, 0x84, 0xAC, /* 0x5C-0x5F */
+	0x84, 0xAD, 0x84, 0xAE, 0x84, 0xAF, 0x84, 0xB0, /* 0x60-0x63 */
+	0xB2, 0xCF, 0xB2, 0xD0, 0x84, 0xB1, 0x84, 0xB2, /* 0x64-0x67 */
+	0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0x68-0x6B */
+	0x84, 0xB7, 0x84, 0xB8, 0x84, 0xB9, 0x84, 0xBA, /* 0x6C-0x6F */
+	0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, 0x84, 0xBE, /* 0x70-0x73 */
+	0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, 0x84, 0xC2, /* 0x74-0x77 */
+	0x84, 0xC3, 0xB2, 0xD1, 0x84, 0xC4, 0x84, 0xC5, /* 0x78-0x7B */
+	0x84, 0xC6, 0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, /* 0x7C-0x7F */
+	
+	0xB2, 0xD2, 0x84, 0xCA, 0x84, 0xCB, 0x84, 0xCC, /* 0x80-0x83 */
+	0xB2, 0xD3, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0x84-0x87 */
+	0xB2, 0xD4, 0x84, 0xD0, 0x84, 0xD1, 0x84, 0xD2, /* 0x88-0x8B */
+	0x84, 0xD3, 0x84, 0xD4, 0x84, 0xD5, 0x84, 0xD6, /* 0x8C-0x8F */
+	0xB2, 0xD5, 0xB2, 0xD6, 0x84, 0xD7, 0x84, 0xD8, /* 0x90-0x93 */
+	0x84, 0xD9, 0xB2, 0xD7, 0x84, 0xDA, 0x84, 0xDB, /* 0x94-0x97 */
+	0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, 0x84, 0xDF, /* 0x98-0x9B */
+	0xB2, 0xD8, 0x84, 0xE0, 0x84, 0xE1, 0x84, 0xE2, /* 0x9C-0x9F */
+	0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, 0x84, 0xE6, /* 0xA0-0xA3 */
+	0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, 0x84, 0xEA, /* 0xA4-0xA7 */
+	0x84, 0xEB, 0x84, 0xEC, 0x84, 0xED, 0x84, 0xEE, /* 0xA8-0xAB */
+	0x84, 0xEF, 0x84, 0xF0, 0x84, 0xF1, 0x84, 0xF2, /* 0xAC-0xAF */
+	0x84, 0xF3, 0x84, 0xF4, 0x84, 0xF5, 0x84, 0xF6, /* 0xB0-0xB3 */
+	0x84, 0xF7, 0x84, 0xF8, 0x84, 0xF9, 0x84, 0xFA, /* 0xB4-0xB7 */
+	0xB2, 0xD9, 0xB2, 0xDA, 0x84, 0xFB, 0x84, 0xFC, /* 0xB8-0xBB */
+	0xB2, 0xDB, 0x84, 0xFD, 0x84, 0xFE, 0x85, 0x41, /* 0xBC-0xBF */
+	0xB2, 0xDC, 0x85, 0x42, 0x85, 0x43, 0x85, 0x44, /* 0xC0-0xC3 */
+	0x85, 0x45, 0x85, 0x46, 0x85, 0x47, 0xB2, 0xDD, /* 0xC4-0xC7 */
+	0xB2, 0xDE, 0xB2, 0xDF, 0x85, 0x48, 0xB2, 0xE0, /* 0xC8-0xCB */
+	0x85, 0x49, 0xB2, 0xE1, 0xB2, 0xE2, 0x85, 0x4A, /* 0xCC-0xCF */
+	0x85, 0x4B, 0x85, 0x4C, 0x85, 0x4D, 0x85, 0x4E, /* 0xD0-0xD3 */
+	0xB2, 0xE3, 0x85, 0x4F, 0x85, 0x50, 0x85, 0x51, /* 0xD4-0xD7 */
+	0x85, 0x52, 0x85, 0x53, 0x85, 0x54, 0x85, 0x55, /* 0xD8-0xDB */
+	0xB2, 0xE4, 0x85, 0x56, 0x85, 0x57, 0x85, 0x58, /* 0xDC-0xDF */
+	0x85, 0x59, 0x85, 0x5A, 0x85, 0x61, 0x85, 0x62, /* 0xE0-0xE3 */
+	0x85, 0x63, 0x85, 0x64, 0x85, 0x65, 0x85, 0x66, /* 0xE4-0xE7 */
+	0xB2, 0xE5, 0xB2, 0xE6, 0x85, 0x67, 0x85, 0x68, /* 0xE8-0xEB */
+	0x85, 0x69, 0x85, 0x6A, 0x85, 0x6B, 0x85, 0x6C, /* 0xEC-0xEF */
+	0xB2, 0xE7, 0xB2, 0xE8, 0x85, 0x6D, 0x85, 0x6E, /* 0xF0-0xF3 */
+	0xB2, 0xE9, 0x85, 0x6F, 0x85, 0x70, 0x85, 0x71, /* 0xF4-0xF7 */
+	0xB2, 0xEA, 0x85, 0x72, 0x85, 0x73, 0x85, 0x74, /* 0xF8-0xFB */
+	0x85, 0x75, 0x85, 0x76, 0x85, 0x77, 0x85, 0x78, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B0[512] = {
+	0xB2, 0xEB, 0xB2, 0xEC, 0x85, 0x79, 0x85, 0x7A, /* 0x00-0x03 */
+	0xB2, 0xED, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x04-0x07 */
+	0x85, 0x84, 0x85, 0x85, 0x85, 0x86, 0x85, 0x87, /* 0x08-0x0B */
+	0xB2, 0xEE, 0x85, 0x88, 0x85, 0x89, 0x85, 0x8A, /* 0x0C-0x0F */
+	0xB2, 0xEF, 0x85, 0x8B, 0x85, 0x8C, 0x85, 0x8D, /* 0x10-0x13 */
+	0xB2, 0xF0, 0x85, 0x8E, 0x85, 0x8F, 0x85, 0x90, /* 0x14-0x17 */
+	0x85, 0x91, 0x85, 0x92, 0x85, 0x93, 0x85, 0x94, /* 0x18-0x1B */
+	0xB2, 0xF1, 0xB2, 0xF2, 0x85, 0x95, 0x85, 0x96, /* 0x1C-0x1F */
+	0x85, 0x97, 0x85, 0x98, 0x85, 0x99, 0x85, 0x9A, /* 0x20-0x23 */
+	0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0x85, 0x9E, /* 0x24-0x27 */
+	0xB2, 0xF3, 0x85, 0x9F, 0x85, 0xA0, 0x85, 0xA1, /* 0x28-0x2B */
+	0x85, 0xA2, 0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, /* 0x2C-0x2F */
+	0x85, 0xA6, 0x85, 0xA7, 0x85, 0xA8, 0x85, 0xA9, /* 0x30-0x33 */
+	0x85, 0xAA, 0x85, 0xAB, 0x85, 0xAC, 0x85, 0xAD, /* 0x34-0x37 */
+	0x85, 0xAE, 0x85, 0xAF, 0x85, 0xB0, 0x85, 0xB1, /* 0x38-0x3B */
+	0x85, 0xB2, 0x85, 0xB3, 0x85, 0xB4, 0x85, 0xB5, /* 0x3C-0x3F */
+	0x85, 0xB6, 0x85, 0xB7, 0x85, 0xB8, 0x85, 0xB9, /* 0x40-0x43 */
+	0xB2, 0xF4, 0xB2, 0xF5, 0x85, 0xBA, 0x85, 0xBB, /* 0x44-0x47 */
+	0xB2, 0xF6, 0x85, 0xBC, 0xB2, 0xF7, 0x85, 0xBD, /* 0x48-0x4B */
+	0xB2, 0xF8, 0x85, 0xBE, 0xB2, 0xF9, 0x85, 0xBF, /* 0x4C-0x4F */
+	0x85, 0xC0, 0x85, 0xC1, 0x85, 0xC2, 0xB2, 0xFA, /* 0x50-0x53 */
+	0xB2, 0xFB, 0xB2, 0xFC, 0x85, 0xC3, 0xB2, 0xFD, /* 0x54-0x57 */
+	0x85, 0xC4, 0xB2, 0xFE, 0x85, 0xC5, 0x85, 0xC6, /* 0x58-0x5B */
+	0x85, 0xC7, 0xB3, 0xA1, 0x85, 0xC8, 0x85, 0xC9, /* 0x5C-0x5F */
+	0x85, 0xCA, 0x85, 0xCB, 0x85, 0xCC, 0x85, 0xCD, /* 0x60-0x63 */
+	0x85, 0xCE, 0x85, 0xCF, 0x85, 0xD0, 0x85, 0xD1, /* 0x64-0x67 */
+	0x85, 0xD2, 0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, /* 0x68-0x6B */
+	0x85, 0xD6, 0x85, 0xD7, 0x85, 0xD8, 0x85, 0xD9, /* 0x6C-0x6F */
+	0x85, 0xDA, 0x85, 0xDB, 0x85, 0xDC, 0x85, 0xDD, /* 0x70-0x73 */
+	0x85, 0xDE, 0x85, 0xDF, 0x85, 0xE0, 0x85, 0xE1, /* 0x74-0x77 */
+	0x85, 0xE2, 0x85, 0xE3, 0x85, 0xE4, 0x85, 0xE5, /* 0x78-0x7B */
+	0xB3, 0xA2, 0xB3, 0xA3, 0x85, 0xE6, 0x85, 0xE7, /* 0x7C-0x7F */
+	
+	0xB3, 0xA4, 0x85, 0xE8, 0x85, 0xE9, 0x85, 0xEA, /* 0x80-0x83 */
+	0xB3, 0xA5, 0x85, 0xEB, 0x85, 0xEC, 0x85, 0xED, /* 0x84-0x87 */
+	0x85, 0xEE, 0x85, 0xEF, 0x85, 0xF0, 0x85, 0xF1, /* 0x88-0x8B */
+	0xB3, 0xA6, 0xB3, 0xA7, 0x85, 0xF2, 0xB3, 0xA8, /* 0x8C-0x8F */
+	0x85, 0xF3, 0xB3, 0xA9, 0x85, 0xF4, 0x85, 0xF5, /* 0x90-0x93 */
+	0x85, 0xF6, 0x85, 0xF7, 0x85, 0xF8, 0x85, 0xF9, /* 0x94-0x97 */
+	0xB3, 0xAA, 0xB3, 0xAB, 0xB3, 0xAC, 0x85, 0xFA, /* 0x98-0x9B */
+	0xB3, 0xAD, 0x85, 0xFB, 0x85, 0xFC, 0xB3, 0xAE, /* 0x9C-0x9F */
+	0xB3, 0xAF, 0xB3, 0xB0, 0xB3, 0xB1, 0x85, 0xFD, /* 0xA0-0xA3 */
+	0x85, 0xFE, 0x86, 0x41, 0x86, 0x42, 0x86, 0x43, /* 0xA4-0xA7 */
+	0xB3, 0xB2, 0xB3, 0xB3, 0x86, 0x44, 0xB3, 0xB4, /* 0xA8-0xAB */
+	0xB3, 0xB5, 0xB3, 0xB6, 0xB3, 0xB7, 0xB3, 0xB8, /* 0xAC-0xAF */
+	0x86, 0x45, 0xB3, 0xB9, 0x86, 0x46, 0xB3, 0xBA, /* 0xB0-0xB3 */
+	0xB3, 0xBB, 0xB3, 0xBC, 0x86, 0x47, 0x86, 0x48, /* 0xB4-0xB7 */
+	0xB3, 0xBD, 0x86, 0x49, 0x86, 0x4A, 0x86, 0x4B, /* 0xB8-0xBB */
+	0xB3, 0xBE, 0x86, 0x4C, 0x86, 0x4D, 0x86, 0x4E, /* 0xBC-0xBF */
+	0x86, 0x4F, 0x86, 0x50, 0x86, 0x51, 0x86, 0x52, /* 0xC0-0xC3 */
+	0xB3, 0xBF, 0xB3, 0xC0, 0x86, 0x53, 0xB3, 0xC1, /* 0xC4-0xC7 */
+	0xB3, 0xC2, 0xB3, 0xC3, 0x86, 0x54, 0x86, 0x55, /* 0xC8-0xCB */
+	0x86, 0x56, 0x86, 0x57, 0x86, 0x58, 0x86, 0x59, /* 0xCC-0xCF */
+	0xB3, 0xC4, 0xB3, 0xC5, 0x86, 0x5A, 0x86, 0x61, /* 0xD0-0xD3 */
+	0xB3, 0xC6, 0x86, 0x62, 0x86, 0x63, 0x86, 0x64, /* 0xD4-0xD7 */
+	0xB3, 0xC7, 0x86, 0x65, 0x86, 0x66, 0x86, 0x67, /* 0xD8-0xDB */
+	0x86, 0x68, 0x86, 0x69, 0x86, 0x6A, 0x86, 0x6B, /* 0xDC-0xDF */
+	0xB3, 0xC8, 0x86, 0x6C, 0x86, 0x6D, 0x86, 0x6E, /* 0xE0-0xE3 */
+	0x86, 0x6F, 0xB3, 0xC9, 0x86, 0x70, 0x86, 0x71, /* 0xE4-0xE7 */
+	0x86, 0x72, 0x86, 0x73, 0x86, 0x74, 0x86, 0x75, /* 0xE8-0xEB */
+	0x86, 0x76, 0x86, 0x77, 0x86, 0x78, 0x86, 0x79, /* 0xEC-0xEF */
+	0x86, 0x7A, 0x86, 0x81, 0x86, 0x82, 0x86, 0x83, /* 0xF0-0xF3 */
+	0x86, 0x84, 0x86, 0x85, 0x86, 0x86, 0x86, 0x87, /* 0xF4-0xF7 */
+	0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0xF8-0xFB */
+	0x86, 0x8C, 0x86, 0x8D, 0x86, 0x8E, 0x86, 0x8F, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B1[512] = {
+	0x86, 0x90, 0x86, 0x91, 0x86, 0x92, 0x86, 0x93, /* 0x00-0x03 */
+	0x86, 0x94, 0x86, 0x95, 0x86, 0x96, 0x86, 0x97, /* 0x04-0x07 */
+	0xB3, 0xCA, 0xB3, 0xCB, 0x86, 0x98, 0xB3, 0xCC, /* 0x08-0x0B */
+	0xB3, 0xCD, 0x86, 0x99, 0x86, 0x9A, 0x86, 0x9B, /* 0x0C-0x0F */
+	0xB3, 0xCE, 0x86, 0x9C, 0xB3, 0xCF, 0xB3, 0xD0, /* 0x10-0x13 */
+	0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, 0x86, 0xA0, /* 0x14-0x17 */
+	0xB3, 0xD1, 0xB3, 0xD2, 0x86, 0xA1, 0xB3, 0xD3, /* 0x18-0x1B */
+	0xB3, 0xD4, 0xB3, 0xD5, 0x86, 0xA2, 0x86, 0xA3, /* 0x1C-0x1F */
+	0x86, 0xA4, 0x86, 0xA5, 0x86, 0xA6, 0xB3, 0xD6, /* 0x20-0x23 */
+	0xB3, 0xD7, 0xB3, 0xD8, 0x86, 0xA7, 0x86, 0xA8, /* 0x24-0x27 */
+	0xB3, 0xD9, 0x86, 0xA9, 0x86, 0xAA, 0x86, 0xAB, /* 0x28-0x2B */
+	0xB3, 0xDA, 0x86, 0xAC, 0x86, 0xAD, 0x86, 0xAE, /* 0x2C-0x2F */
+	0x86, 0xAF, 0x86, 0xB0, 0x86, 0xB1, 0x86, 0xB2, /* 0x30-0x33 */
+	0xB3, 0xDB, 0xB3, 0xDC, 0x86, 0xB3, 0xB3, 0xDD, /* 0x34-0x37 */
+	0xB3, 0xDE, 0xB3, 0xDF, 0x86, 0xB4, 0x86, 0xB5, /* 0x38-0x3B */
+	0x86, 0xB6, 0x86, 0xB7, 0x86, 0xB8, 0x86, 0xB9, /* 0x3C-0x3F */
+	0xB3, 0xE0, 0xB3, 0xE1, 0x86, 0xBA, 0x86, 0xBB, /* 0x40-0x43 */
+	0xB3, 0xE2, 0x86, 0xBC, 0x86, 0xBD, 0x86, 0xBE, /* 0x44-0x47 */
+	0xB3, 0xE3, 0x86, 0xBF, 0x86, 0xC0, 0x86, 0xC1, /* 0x48-0x4B */
+	0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, 0x86, 0xC5, /* 0x4C-0x4F */
+	0xB3, 0xE4, 0xB3, 0xE5, 0x86, 0xC6, 0x86, 0xC7, /* 0x50-0x53 */
+	0xB3, 0xE6, 0xB3, 0xE7, 0x86, 0xC8, 0x86, 0xC9, /* 0x54-0x57 */
+	0xB3, 0xE8, 0x86, 0xCA, 0x86, 0xCB, 0x86, 0xCC, /* 0x58-0x5B */
+	0xB3, 0xE9, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0x5C-0x5F */
+	0xB3, 0xEA, 0x86, 0xD0, 0x86, 0xD1, 0x86, 0xD2, /* 0x60-0x63 */
+	0x86, 0xD3, 0x86, 0xD4, 0x86, 0xD5, 0x86, 0xD6, /* 0x64-0x67 */
+	0x86, 0xD7, 0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, /* 0x68-0x6B */
+	0x86, 0xDB, 0x86, 0xDC, 0x86, 0xDD, 0x86, 0xDE, /* 0x6C-0x6F */
+	0x86, 0xDF, 0x86, 0xE0, 0x86, 0xE1, 0x86, 0xE2, /* 0x70-0x73 */
+	0x86, 0xE3, 0x86, 0xE4, 0x86, 0xE5, 0x86, 0xE6, /* 0x74-0x77 */
+	0xB3, 0xEB, 0xB3, 0xEC, 0x86, 0xE7, 0x86, 0xE8, /* 0x78-0x7B */
+	0xB3, 0xED, 0x86, 0xE9, 0x86, 0xEA, 0x86, 0xEB, /* 0x7C-0x7F */
+	
+	0xB3, 0xEE, 0x86, 0xEC, 0xB3, 0xEF, 0x86, 0xED, /* 0x80-0x83 */
+	0x86, 0xEE, 0x86, 0xEF, 0x86, 0xF0, 0x86, 0xF1, /* 0x84-0x87 */
+	0xB3, 0xF0, 0xB3, 0xF1, 0x86, 0xF2, 0xB3, 0xF2, /* 0x88-0x8B */
+	0x86, 0xF3, 0xB3, 0xF3, 0x86, 0xF4, 0x86, 0xF5, /* 0x8C-0x8F */
+	0x86, 0xF6, 0x86, 0xF7, 0xB3, 0xF4, 0xB3, 0xF5, /* 0x90-0x93 */
+	0xB3, 0xF6, 0x86, 0xF8, 0x86, 0xF9, 0x86, 0xFA, /* 0x94-0x97 */
+	0xB3, 0xF7, 0x86, 0xFB, 0x86, 0xFC, 0x86, 0xFD, /* 0x98-0x9B */
+	0xB3, 0xF8, 0x86, 0xFE, 0x87, 0x41, 0x87, 0x42, /* 0x9C-0x9F */
+	0x87, 0x43, 0x87, 0x44, 0x87, 0x45, 0x87, 0x46, /* 0xA0-0xA3 */
+	0x87, 0x47, 0x87, 0x48, 0x87, 0x49, 0x87, 0x4A, /* 0xA4-0xA7 */
+	0xB3, 0xF9, 0x87, 0x4B, 0x87, 0x4C, 0x87, 0x4D, /* 0xA8-0xAB */
+	0x87, 0x4E, 0x87, 0x4F, 0x87, 0x50, 0x87, 0x51, /* 0xAC-0xAF */
+	0x87, 0x52, 0x87, 0x53, 0x87, 0x54, 0x87, 0x55, /* 0xB0-0xB3 */
+	0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0xB4-0xB7 */
+	0x87, 0x5A, 0x87, 0x61, 0x87, 0x62, 0x87, 0x63, /* 0xB8-0xBB */
+	0x87, 0x64, 0x87, 0x65, 0x87, 0x66, 0x87, 0x67, /* 0xBC-0xBF */
+	0x87, 0x68, 0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, /* 0xC0-0xC3 */
+	0x87, 0x6C, 0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, /* 0xC4-0xC7 */
+	0x87, 0x70, 0x87, 0x71, 0x87, 0x72, 0x87, 0x73, /* 0xC8-0xCB */
+	0xB3, 0xFA, 0x87, 0x74, 0x87, 0x75, 0x87, 0x76, /* 0xCC-0xCF */
+	0xB3, 0xFB, 0x87, 0x77, 0x87, 0x78, 0x87, 0x79, /* 0xD0-0xD3 */
+	0xB3, 0xFC, 0x87, 0x7A, 0x87, 0x81, 0x87, 0x82, /* 0xD4-0xD7 */
+	0x87, 0x83, 0x87, 0x84, 0x87, 0x85, 0x87, 0x86, /* 0xD8-0xDB */
+	0xB3, 0xFD, 0xB3, 0xFE, 0x87, 0x87, 0xB4, 0xA1, /* 0xDC-0xDF */
+	0x87, 0x88, 0x87, 0x89, 0x87, 0x8A, 0x87, 0x8B, /* 0xE0-0xE3 */
+	0x87, 0x8C, 0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, /* 0xE4-0xE7 */
+	0xB4, 0xA2, 0xB4, 0xA3, 0x87, 0x90, 0x87, 0x91, /* 0xE8-0xEB */
+	0xB4, 0xA4, 0x87, 0x92, 0x87, 0x93, 0x87, 0x94, /* 0xEC-0xEF */
+	0xB4, 0xA5, 0x87, 0x95, 0x87, 0x96, 0x87, 0x97, /* 0xF0-0xF3 */
+	0x87, 0x98, 0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, /* 0xF4-0xF7 */
+	0x87, 0x9C, 0xB4, 0xA6, 0x87, 0x9D, 0xB4, 0xA7, /* 0xF8-0xFB */
+	0x87, 0x9E, 0xB4, 0xA8, 0x87, 0x9F, 0x87, 0xA0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B2[512] = {
+	0x87, 0xA1, 0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, /* 0x00-0x03 */
+	0xB4, 0xA9, 0xB4, 0xAA, 0x87, 0xA5, 0x87, 0xA6, /* 0x04-0x07 */
+	0xB4, 0xAB, 0x87, 0xA7, 0x87, 0xA8, 0xB4, 0xAC, /* 0x08-0x0B */
+	0xB4, 0xAD, 0x87, 0xA9, 0x87, 0xAA, 0x87, 0xAB, /* 0x0C-0x0F */
+	0x87, 0xAC, 0x87, 0xAD, 0x87, 0xAE, 0x87, 0xAF, /* 0x10-0x13 */
+	0xB4, 0xAE, 0xB4, 0xAF, 0x87, 0xB0, 0xB4, 0xB0, /* 0x14-0x17 */
+	0x87, 0xB1, 0xB4, 0xB1, 0x87, 0xB2, 0x87, 0xB3, /* 0x18-0x1B */
+	0x87, 0xB4, 0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, /* 0x1C-0x1F */
+	0xB4, 0xB2, 0x87, 0xB8, 0x87, 0xB9, 0x87, 0xBA, /* 0x20-0x23 */
+	0x87, 0xBB, 0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, /* 0x24-0x27 */
+	0x87, 0xBF, 0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, /* 0x28-0x2B */
+	0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0x2C-0x2F */
+	0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, 0x87, 0xCA, /* 0x30-0x33 */
+	0xB4, 0xB3, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0x34-0x37 */
+	0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0x38-0x3B */
+	0xB4, 0xB4, 0x87, 0xD2, 0x87, 0xD3, 0x87, 0xD4, /* 0x3C-0x3F */
+	0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0x40-0x43 */
+	0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0x44-0x47 */
+	0x87, 0xDD, 0x87, 0xDE, 0x87, 0xDF, 0x87, 0xE0, /* 0x48-0x4B */
+	0x87, 0xE1, 0x87, 0xE2, 0x87, 0xE3, 0x87, 0xE4, /* 0x4C-0x4F */
+	0x87, 0xE5, 0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, /* 0x50-0x53 */
+	0x87, 0xE9, 0x87, 0xEA, 0x87, 0xEB, 0x87, 0xEC, /* 0x54-0x57 */
+	0xB4, 0xB5, 0x87, 0xED, 0x87, 0xEE, 0x87, 0xEF, /* 0x58-0x5B */
+	0xB4, 0xB6, 0x87, 0xF0, 0x87, 0xF1, 0x87, 0xF2, /* 0x5C-0x5F */
+	0xB4, 0xB7, 0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, /* 0x60-0x63 */
+	0x87, 0xF6, 0x87, 0xF7, 0x87, 0xF8, 0x87, 0xF9, /* 0x64-0x67 */
+	0xB4, 0xB8, 0xB4, 0xB9, 0x87, 0xFA, 0x87, 0xFB, /* 0x68-0x6B */
+	0x87, 0xFC, 0x87, 0xFD, 0x87, 0xFE, 0x88, 0x41, /* 0x6C-0x6F */
+	0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x70-0x73 */
+	0xB4, 0xBA, 0xB4, 0xBB, 0x88, 0x46, 0x88, 0x47, /* 0x74-0x77 */
+	0x88, 0x48, 0x88, 0x49, 0x88, 0x4A, 0x88, 0x4B, /* 0x78-0x7B */
+	0xB4, 0xBC, 0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, /* 0x7C-0x7F */
+	
+	0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x80-0x83 */
+	0xB4, 0xBD, 0xB4, 0xBE, 0x88, 0x53, 0x88, 0x54, /* 0x84-0x87 */
+	0x88, 0x55, 0xB4, 0xBF, 0x88, 0x56, 0x88, 0x57, /* 0x88-0x8B */
+	0x88, 0x58, 0x88, 0x59, 0x88, 0x5A, 0x88, 0x61, /* 0x8C-0x8F */
+	0xB4, 0xC0, 0xB4, 0xC1, 0x88, 0x62, 0x88, 0x63, /* 0x90-0x93 */
+	0xB4, 0xC2, 0x88, 0x64, 0x88, 0x65, 0x88, 0x66, /* 0x94-0x97 */
+	0xB4, 0xC3, 0xB4, 0xC4, 0xB4, 0xC5, 0x88, 0x67, /* 0x98-0x9B */
+	0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0x88, 0x6B, /* 0x9C-0x9F */
+	0xB4, 0xC6, 0xB4, 0xC7, 0x88, 0x6C, 0xB4, 0xC8, /* 0xA0-0xA3 */
+	0x88, 0x6D, 0xB4, 0xC9, 0xB4, 0xCA, 0x88, 0x6E, /* 0xA4-0xA7 */
+	0x88, 0x6F, 0x88, 0x70, 0xB4, 0xCB, 0x88, 0x71, /* 0xA8-0xAB */
+	0xB4, 0xCC, 0x88, 0x72, 0x88, 0x73, 0x88, 0x74, /* 0xAC-0xAF */
+	0xB4, 0xCD, 0x88, 0x75, 0x88, 0x76, 0x88, 0x77, /* 0xB0-0xB3 */
+	0xB4, 0xCE, 0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, /* 0xB4-0xB7 */
+	0x88, 0x81, 0x88, 0x82, 0x88, 0x83, 0x88, 0x84, /* 0xB8-0xBB */
+	0x88, 0x85, 0x88, 0x86, 0x88, 0x87, 0x88, 0x88, /* 0xBC-0xBF */
+	0x88, 0x89, 0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, /* 0xC0-0xC3 */
+	0x88, 0x8D, 0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, /* 0xC4-0xC7 */
+	0xB4, 0xCF, 0xB4, 0xD0, 0x88, 0x91, 0x88, 0x92, /* 0xC8-0xCB */
+	0xB4, 0xD1, 0x88, 0x93, 0x88, 0x94, 0x88, 0x95, /* 0xCC-0xCF */
+	0xB4, 0xD2, 0x88, 0x96, 0xB4, 0xD3, 0x88, 0x97, /* 0xD0-0xD3 */
+	0x88, 0x98, 0x88, 0x99, 0x88, 0x9A, 0x88, 0x9B, /* 0xD4-0xD7 */
+	0xB4, 0xD4, 0xB4, 0xD5, 0x88, 0x9C, 0xB4, 0xD6, /* 0xD8-0xDB */
+	0x88, 0x9D, 0xB4, 0xD7, 0x88, 0x9E, 0x88, 0x9F, /* 0xDC-0xDF */
+	0x88, 0xA0, 0x88, 0xA1, 0xB4, 0xD8, 0x88, 0xA2, /* 0xE0-0xE3 */
+	0xB4, 0xD9, 0xB4, 0xDA, 0xB4, 0xDB, 0x88, 0xA3, /* 0xE4-0xE7 */
+	0xB4, 0xDC, 0x88, 0xA4, 0x88, 0xA5, 0xB4, 0xDD, /* 0xE8-0xEB */
+	0xB4, 0xDE, 0xB4, 0xDF, 0xB4, 0xE0, 0xB4, 0xE1, /* 0xEC-0xEF */
+	0x88, 0xA6, 0x88, 0xA7, 0x88, 0xA8, 0xB4, 0xE2, /* 0xF0-0xF3 */
+	0xB4, 0xE3, 0xB4, 0xE4, 0x88, 0xA9, 0xB4, 0xE5, /* 0xF4-0xF7 */
+	0xB4, 0xE6, 0xB4, 0xE7, 0xB4, 0xE8, 0xB4, 0xE9, /* 0xF8-0xFB */
+	0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, 0xB4, 0xEA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B3[512] = {
+	0xB4, 0xEB, 0xB4, 0xEC, 0x88, 0xAD, 0x88, 0xAE, /* 0x00-0x03 */
+	0xB4, 0xED, 0x88, 0xAF, 0x88, 0xB0, 0x88, 0xB1, /* 0x04-0x07 */
+	0xB4, 0xEE, 0x88, 0xB2, 0x88, 0xB3, 0x88, 0xB4, /* 0x08-0x0B */
+	0x88, 0xB5, 0x88, 0xB6, 0x88, 0xB7, 0x88, 0xB8, /* 0x0C-0x0F */
+	0xB4, 0xEF, 0xB4, 0xF0, 0x88, 0xB9, 0xB4, 0xF1, /* 0x10-0x13 */
+	0xB4, 0xF2, 0xB4, 0xF3, 0x88, 0xBA, 0x88, 0xBB, /* 0x14-0x17 */
+	0x88, 0xBC, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0x18-0x1B */
+	0xB4, 0xF4, 0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, /* 0x1C-0x1F */
+	0x88, 0xC3, 0x88, 0xC4, 0x88, 0xC5, 0x88, 0xC6, /* 0x20-0x23 */
+	0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, 0x88, 0xCA, /* 0x24-0x27 */
+	0x88, 0xCB, 0x88, 0xCC, 0x88, 0xCD, 0x88, 0xCE, /* 0x28-0x2B */
+	0x88, 0xCF, 0x88, 0xD0, 0x88, 0xD1, 0x88, 0xD2, /* 0x2C-0x2F */
+	0x88, 0xD3, 0x88, 0xD4, 0x88, 0xD5, 0x88, 0xD6, /* 0x30-0x33 */
+	0x88, 0xD7, 0x88, 0xD8, 0x88, 0xD9, 0x88, 0xDA, /* 0x34-0x37 */
+	0x88, 0xDB, 0x88, 0xDC, 0x88, 0xDD, 0x88, 0xDE, /* 0x38-0x3B */
+	0x88, 0xDF, 0x88, 0xE0, 0x88, 0xE1, 0x88, 0xE2, /* 0x3C-0x3F */
+	0x88, 0xE3, 0x88, 0xE4, 0x88, 0xE5, 0x88, 0xE6, /* 0x40-0x43 */
+	0x88, 0xE7, 0x88, 0xE8, 0x88, 0xE9, 0x88, 0xEA, /* 0x44-0x47 */
+	0x88, 0xEB, 0x88, 0xEC, 0x88, 0xED, 0x88, 0xEE, /* 0x48-0x4B */
+	0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x4C-0x4F */
+	0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, 0x88, 0xF6, /* 0x50-0x53 */
+	0xB4, 0xF5, 0xB4, 0xF6, 0xB4, 0xF7, 0x88, 0xF7, /* 0x54-0x57 */
+	0xB4, 0xF8, 0x88, 0xF8, 0x88, 0xF9, 0xB4, 0xF9, /* 0x58-0x5B */
+	0xB4, 0xFA, 0x88, 0xFA, 0xB4, 0xFB, 0xB4, 0xFC, /* 0x5C-0x5F */
+	0x88, 0xFB, 0x88, 0xFC, 0x88, 0xFD, 0x88, 0xFE, /* 0x60-0x63 */
+	0xB4, 0xFD, 0xB4, 0xFE, 0x89, 0x41, 0xB5, 0xA1, /* 0x64-0x67 */
+	0x89, 0x42, 0xB5, 0xA2, 0x89, 0x43, 0xB5, 0xA3, /* 0x68-0x6B */
+	0x89, 0x44, 0x89, 0x45, 0xB5, 0xA4, 0x89, 0x46, /* 0x6C-0x6F */
+	0xB5, 0xA5, 0xB5, 0xA6, 0x89, 0x47, 0x89, 0x48, /* 0x70-0x73 */
+	0xB5, 0xA7, 0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, /* 0x74-0x77 */
+	0xB5, 0xA8, 0x89, 0x4C, 0x89, 0x4D, 0x89, 0x4E, /* 0x78-0x7B */
+	0x89, 0x4F, 0x89, 0x50, 0x89, 0x51, 0x89, 0x52, /* 0x7C-0x7F */
+	
+	0xB5, 0xA9, 0xB5, 0xAA, 0x89, 0x53, 0xB5, 0xAB, /* 0x80-0x83 */
+	0xB5, 0xAC, 0xB5, 0xAD, 0x89, 0x54, 0x89, 0x55, /* 0x84-0x87 */
+	0x89, 0x56, 0x89, 0x57, 0x89, 0x58, 0x89, 0x59, /* 0x88-0x8B */
+	0xB5, 0xAE, 0x89, 0x5A, 0x89, 0x61, 0x89, 0x62, /* 0x8C-0x8F */
+	0xB5, 0xAF, 0x89, 0x63, 0x89, 0x64, 0x89, 0x65, /* 0x90-0x93 */
+	0xB5, 0xB0, 0x89, 0x66, 0x89, 0x67, 0x89, 0x68, /* 0x94-0x97 */
+	0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, 0x89, 0x6C, /* 0x98-0x9B */
+	0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, 0x89, 0x70, /* 0x9C-0x9F */
+	0xB5, 0xB1, 0xB5, 0xB2, 0x89, 0x71, 0x89, 0x72, /* 0xA0-0xA3 */
+	0x89, 0x73, 0x89, 0x74, 0x89, 0x75, 0x89, 0x76, /* 0xA4-0xA7 */
+	0xB5, 0xB3, 0x89, 0x77, 0x89, 0x78, 0x89, 0x79, /* 0xA8-0xAB */
+	0xB5, 0xB4, 0x89, 0x7A, 0x89, 0x81, 0x89, 0x82, /* 0xAC-0xAF */
+	0x89, 0x83, 0x89, 0x84, 0x89, 0x85, 0x89, 0x86, /* 0xB0-0xB3 */
+	0x89, 0x87, 0x89, 0x88, 0x89, 0x89, 0x89, 0x8A, /* 0xB4-0xB7 */
+	0x89, 0x8B, 0x89, 0x8C, 0x89, 0x8D, 0x89, 0x8E, /* 0xB8-0xBB */
+	0x89, 0x8F, 0x89, 0x90, 0x89, 0x91, 0x89, 0x92, /* 0xBC-0xBF */
+	0x89, 0x93, 0x89, 0x94, 0x89, 0x95, 0x89, 0x96, /* 0xC0-0xC3 */
+	0xB5, 0xB5, 0xB5, 0xB6, 0x89, 0x97, 0x89, 0x98, /* 0xC4-0xC7 */
+	0xB5, 0xB7, 0x89, 0x99, 0x89, 0x9A, 0xB5, 0xB8, /* 0xC8-0xCB */
+	0xB5, 0xB9, 0x89, 0x9B, 0xB5, 0xBA, 0x89, 0x9C, /* 0xCC-0xCF */
+	0xB5, 0xBB, 0x89, 0x9D, 0x89, 0x9E, 0x89, 0x9F, /* 0xD0-0xD3 */
+	0xB5, 0xBC, 0xB5, 0xBD, 0x89, 0xA0, 0xB5, 0xBE, /* 0xD4-0xD7 */
+	0x89, 0xA1, 0xB5, 0xBF, 0x89, 0xA2, 0xB5, 0xC0, /* 0xD8-0xDB */
+	0x89, 0xA3, 0xB5, 0xC1, 0x89, 0xA4, 0x89, 0xA5, /* 0xDC-0xDF */
+	0xB5, 0xC2, 0x89, 0xA6, 0x89, 0xA7, 0x89, 0xA8, /* 0xE0-0xE3 */
+	0xB5, 0xC3, 0x89, 0xA9, 0x89, 0xAA, 0x89, 0xAB, /* 0xE4-0xE7 */
+	0xB5, 0xC4, 0x89, 0xAC, 0x89, 0xAD, 0x89, 0xAE, /* 0xE8-0xEB */
+	0x89, 0xAF, 0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, /* 0xEC-0xEF */
+	0x89, 0xB3, 0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, /* 0xF0-0xF3 */
+	0x89, 0xB7, 0x89, 0xB8, 0x89, 0xB9, 0x89, 0xBA, /* 0xF4-0xF7 */
+	0x89, 0xBB, 0x89, 0xBC, 0x89, 0xBD, 0x89, 0xBE, /* 0xF8-0xFB */
+	0xB5, 0xC5, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B4[512] = {
+	0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0x00-0x03 */
+	0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0x04-0x07 */
+	0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, 0x89, 0xCD, /* 0x08-0x0B */
+	0x89, 0xCE, 0x89, 0xCF, 0x89, 0xD0, 0x89, 0xD1, /* 0x0C-0x0F */
+	0xB5, 0xC6, 0x89, 0xD2, 0x89, 0xD3, 0x89, 0xD4, /* 0x10-0x13 */
+	0x89, 0xD5, 0x89, 0xD6, 0x89, 0xD7, 0x89, 0xD8, /* 0x14-0x17 */
+	0xB5, 0xC7, 0x89, 0xD9, 0x89, 0xDA, 0x89, 0xDB, /* 0x18-0x1B */
+	0xB5, 0xC8, 0x89, 0xDC, 0x89, 0xDD, 0x89, 0xDE, /* 0x1C-0x1F */
+	0xB5, 0xC9, 0x89, 0xDF, 0x89, 0xE0, 0x89, 0xE1, /* 0x20-0x23 */
+	0x89, 0xE2, 0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, /* 0x24-0x27 */
+	0xB5, 0xCA, 0xB5, 0xCB, 0x89, 0xE6, 0xB5, 0xCC, /* 0x28-0x2B */
+	0x89, 0xE7, 0x89, 0xE8, 0x89, 0xE9, 0x89, 0xEA, /* 0x2C-0x2F */
+	0x89, 0xEB, 0x89, 0xEC, 0x89, 0xED, 0x89, 0xEE, /* 0x30-0x33 */
+	0xB5, 0xCD, 0x89, 0xEF, 0x89, 0xF0, 0x89, 0xF1, /* 0x34-0x37 */
+	0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x38-0x3B */
+	0x89, 0xF6, 0x89, 0xF7, 0x89, 0xF8, 0x89, 0xF9, /* 0x3C-0x3F */
+	0x89, 0xFA, 0x89, 0xFB, 0x89, 0xFC, 0x89, 0xFD, /* 0x40-0x43 */
+	0x89, 0xFE, 0x8A, 0x41, 0x8A, 0x42, 0x8A, 0x43, /* 0x44-0x47 */
+	0x8A, 0x44, 0x8A, 0x45, 0x8A, 0x46, 0x8A, 0x47, /* 0x48-0x4B */
+	0x8A, 0x48, 0x8A, 0x49, 0x8A, 0x4A, 0x8A, 0x4B, /* 0x4C-0x4F */
+	0xB5, 0xCE, 0xB5, 0xCF, 0x8A, 0x4C, 0x8A, 0x4D, /* 0x50-0x53 */
+	0xB5, 0xD0, 0x8A, 0x4E, 0x8A, 0x4F, 0x8A, 0x50, /* 0x54-0x57 */
+	0xB5, 0xD1, 0x8A, 0x51, 0x8A, 0x52, 0x8A, 0x53, /* 0x58-0x5B */
+	0x8A, 0x54, 0x8A, 0x55, 0x8A, 0x56, 0x8A, 0x57, /* 0x5C-0x5F */
+	0xB5, 0xD2, 0xB5, 0xD3, 0x8A, 0x58, 0xB5, 0xD4, /* 0x60-0x63 */
+	0x8A, 0x59, 0xB5, 0xD5, 0x8A, 0x5A, 0x8A, 0x61, /* 0x64-0x67 */
+	0x8A, 0x62, 0x8A, 0x63, 0x8A, 0x64, 0x8A, 0x65, /* 0x68-0x6B */
+	0xB5, 0xD6, 0x8A, 0x66, 0x8A, 0x67, 0x8A, 0x68, /* 0x6C-0x6F */
+	0x8A, 0x69, 0x8A, 0x6A, 0x8A, 0x6B, 0x8A, 0x6C, /* 0x70-0x73 */
+	0x8A, 0x6D, 0x8A, 0x6E, 0x8A, 0x6F, 0x8A, 0x70, /* 0x74-0x77 */
+	0x8A, 0x71, 0x8A, 0x72, 0x8A, 0x73, 0x8A, 0x74, /* 0x78-0x7B */
+	0x8A, 0x75, 0x8A, 0x76, 0x8A, 0x77, 0x8A, 0x78, /* 0x7C-0x7F */
+	
+	0xB5, 0xD7, 0x8A, 0x79, 0x8A, 0x7A, 0x8A, 0x81, /* 0x80-0x83 */
+	0x8A, 0x82, 0x8A, 0x83, 0x8A, 0x84, 0x8A, 0x85, /* 0x84-0x87 */
+	0xB5, 0xD8, 0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, /* 0x88-0x8B */
+	0x8A, 0x89, 0x8A, 0x8A, 0x8A, 0x8B, 0x8A, 0x8C, /* 0x8C-0x8F */
+	0x8A, 0x8D, 0x8A, 0x8E, 0x8A, 0x8F, 0x8A, 0x90, /* 0x90-0x93 */
+	0x8A, 0x91, 0x8A, 0x92, 0x8A, 0x93, 0x8A, 0x94, /* 0x94-0x97 */
+	0x8A, 0x95, 0x8A, 0x96, 0x8A, 0x97, 0x8A, 0x98, /* 0x98-0x9B */
+	0x8A, 0x99, 0xB5, 0xD9, 0x8A, 0x9A, 0x8A, 0x9B, /* 0x9C-0x9F */
+	0x8A, 0x9C, 0x8A, 0x9D, 0x8A, 0x9E, 0x8A, 0x9F, /* 0xA0-0xA3 */
+	0xB5, 0xDA, 0x8A, 0xA0, 0x8A, 0xA1, 0x8A, 0xA2, /* 0xA4-0xA7 */
+	0xB5, 0xDB, 0x8A, 0xA3, 0x8A, 0xA4, 0x8A, 0xA5, /* 0xA8-0xAB */
+	0xB5, 0xDC, 0x8A, 0xA6, 0x8A, 0xA7, 0x8A, 0xA8, /* 0xAC-0xAF */
+	0x8A, 0xA9, 0x8A, 0xAA, 0x8A, 0xAB, 0x8A, 0xAC, /* 0xB0-0xB3 */
+	0x8A, 0xAD, 0xB5, 0xDD, 0x8A, 0xAE, 0xB5, 0xDE, /* 0xB4-0xB7 */
+	0x8A, 0xAF, 0xB5, 0xDF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xB8-0xBB */
+	0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xBC-0xBF */
+	0xB5, 0xE0, 0x8A, 0xB6, 0x8A, 0xB7, 0x8A, 0xB8, /* 0xC0-0xC3 */
+	0xB5, 0xE1, 0x8A, 0xB9, 0x8A, 0xBA, 0x8A, 0xBB, /* 0xC4-0xC7 */
+	0xB5, 0xE2, 0x8A, 0xBC, 0x8A, 0xBD, 0x8A, 0xBE, /* 0xC8-0xCB */
+	0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, 0x8A, 0xC2, /* 0xCC-0xCF */
+	0xB5, 0xE3, 0x8A, 0xC3, 0x8A, 0xC4, 0x8A, 0xC5, /* 0xD0-0xD3 */
+	0x8A, 0xC6, 0xB5, 0xE4, 0x8A, 0xC7, 0x8A, 0xC8, /* 0xD4-0xD7 */
+	0x8A, 0xC9, 0x8A, 0xCA, 0x8A, 0xCB, 0x8A, 0xCC, /* 0xD8-0xDB */
+	0xB5, 0xE5, 0xB5, 0xE6, 0x8A, 0xCD, 0x8A, 0xCE, /* 0xDC-0xDF */
+	0xB5, 0xE7, 0x8A, 0xCF, 0x8A, 0xD0, 0xB5, 0xE8, /* 0xE0-0xE3 */
+	0xB5, 0xE9, 0x8A, 0xD1, 0xB5, 0xEA, 0x8A, 0xD2, /* 0xE4-0xE7 */
+	0x8A, 0xD3, 0x8A, 0xD4, 0x8A, 0xD5, 0x8A, 0xD6, /* 0xE8-0xEB */
+	0xB5, 0xEB, 0xB5, 0xEC, 0x8A, 0xD7, 0xB5, 0xED, /* 0xEC-0xEF */
+	0x8A, 0xD8, 0xB5, 0xEE, 0x8A, 0xD9, 0x8A, 0xDA, /* 0xF0-0xF3 */
+	0x8A, 0xDB, 0x8A, 0xDC, 0x8A, 0xDD, 0x8A, 0xDE, /* 0xF4-0xF7 */
+	0xB5, 0xEF, 0x8A, 0xDF, 0x8A, 0xE0, 0x8A, 0xE1, /* 0xF8-0xFB */
+	0x8A, 0xE2, 0x8A, 0xE3, 0x8A, 0xE4, 0x8A, 0xE5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B5[512] = {
+	0x8A, 0xE6, 0x8A, 0xE7, 0x8A, 0xE8, 0x8A, 0xE9, /* 0x00-0x03 */
+	0x8A, 0xEA, 0x8A, 0xEB, 0x8A, 0xEC, 0x8A, 0xED, /* 0x04-0x07 */
+	0x8A, 0xEE, 0x8A, 0xEF, 0x8A, 0xF0, 0x8A, 0xF1, /* 0x08-0x0B */
+	0x8A, 0xF2, 0x8A, 0xF3, 0x8A, 0xF4, 0x8A, 0xF5, /* 0x0C-0x0F */
+	0x8A, 0xF6, 0x8A, 0xF7, 0x8A, 0xF8, 0x8A, 0xF9, /* 0x10-0x13 */
+	0xB5, 0xF0, 0xB5, 0xF1, 0x8A, 0xFA, 0x8A, 0xFB, /* 0x14-0x17 */
+	0xB5, 0xF2, 0x8A, 0xFC, 0x8A, 0xFD, 0xB5, 0xF3, /* 0x18-0x1B */
+	0xB5, 0xF4, 0x8A, 0xFE, 0x8B, 0x41, 0x8B, 0x42, /* 0x1C-0x1F */
+	0x8B, 0x43, 0x8B, 0x44, 0x8B, 0x45, 0x8B, 0x46, /* 0x20-0x23 */
+	0xB5, 0xF5, 0xB5, 0xF6, 0x8B, 0x47, 0xB5, 0xF7, /* 0x24-0x27 */
+	0xB5, 0xF8, 0xB5, 0xF9, 0xB5, 0xFA, 0x8B, 0x48, /* 0x28-0x2B */
+	0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, 0x8B, 0x4C, /* 0x2C-0x2F */
+	0xB5, 0xFB, 0xB5, 0xFC, 0x8B, 0x4D, 0x8B, 0x4E, /* 0x30-0x33 */
+	0xB5, 0xFD, 0x8B, 0x4F, 0x8B, 0x50, 0x8B, 0x51, /* 0x34-0x37 */
+	0xB5, 0xFE, 0x8B, 0x52, 0x8B, 0x53, 0x8B, 0x54, /* 0x38-0x3B */
+	0x8B, 0x55, 0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, /* 0x3C-0x3F */
+	0xB6, 0xA1, 0xB6, 0xA2, 0x8B, 0x59, 0xB6, 0xA3, /* 0x40-0x43 */
+	0xB6, 0xA4, 0xB6, 0xA5, 0x8B, 0x5A, 0x8B, 0x61, /* 0x44-0x47 */
+	0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0xB6, 0xA6, /* 0x48-0x4B */
+	0xB6, 0xA7, 0xB6, 0xA8, 0x8B, 0x65, 0x8B, 0x66, /* 0x4C-0x4F */
+	0xB6, 0xA9, 0x8B, 0x67, 0x8B, 0x68, 0x8B, 0x69, /* 0x50-0x53 */
+	0xB6, 0xAA, 0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x54-0x57 */
+	0x8B, 0x6D, 0x8B, 0x6E, 0x8B, 0x6F, 0x8B, 0x70, /* 0x58-0x5B */
+	0xB6, 0xAB, 0xB6, 0xAC, 0x8B, 0x71, 0xB6, 0xAD, /* 0x5C-0x5F */
+	0xB6, 0xAE, 0xB6, 0xAF, 0x8B, 0x72, 0x8B, 0x73, /* 0x60-0x63 */
+	0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, 0x8B, 0x77, /* 0x64-0x67 */
+	0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, 0x8B, 0x81, /* 0x68-0x6B */
+	0x8B, 0x82, 0x8B, 0x83, 0x8B, 0x84, 0x8B, 0x85, /* 0x6C-0x6F */
+	0x8B, 0x86, 0x8B, 0x87, 0x8B, 0x88, 0x8B, 0x89, /* 0x70-0x73 */
+	0x8B, 0x8A, 0x8B, 0x8B, 0x8B, 0x8C, 0x8B, 0x8D, /* 0x74-0x77 */
+	0x8B, 0x8E, 0x8B, 0x8F, 0x8B, 0x90, 0x8B, 0x91, /* 0x78-0x7B */
+	0x8B, 0x92, 0x8B, 0x93, 0x8B, 0x94, 0x8B, 0x95, /* 0x7C-0x7F */
+	
+	0x8B, 0x96, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0x80-0x83 */
+	0x8B, 0x9A, 0x8B, 0x9B, 0x8B, 0x9C, 0x8B, 0x9D, /* 0x84-0x87 */
+	0x8B, 0x9E, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0x88-0x8B */
+	0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, 0x8B, 0xA5, /* 0x8C-0x8F */
+	0x8B, 0xA6, 0x8B, 0xA7, 0x8B, 0xA8, 0x8B, 0xA9, /* 0x90-0x93 */
+	0x8B, 0xAA, 0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, /* 0x94-0x97 */
+	0x8B, 0xAE, 0x8B, 0xAF, 0x8B, 0xB0, 0x8B, 0xB1, /* 0x98-0x9B */
+	0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0x9C-0x9F */
+	0xB6, 0xB0, 0xB6, 0xB1, 0x8B, 0xB6, 0x8B, 0xB7, /* 0xA0-0xA3 */
+	0xB6, 0xB2, 0x8B, 0xB8, 0x8B, 0xB9, 0x8B, 0xBA, /* 0xA4-0xA7 */
+	0xB6, 0xB3, 0x8B, 0xBB, 0xB6, 0xB4, 0xB6, 0xB5, /* 0xA8-0xAB */
+	0x8B, 0xBC, 0x8B, 0xBD, 0x8B, 0xBE, 0x8B, 0xBF, /* 0xAC-0xAF */
+	0xB6, 0xB6, 0xB6, 0xB7, 0x8B, 0xC0, 0xB6, 0xB8, /* 0xB0-0xB3 */
+	0xB6, 0xB9, 0xB6, 0xBA, 0x8B, 0xC1, 0x8B, 0xC2, /* 0xB4-0xB7 */
+	0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, 0xB6, 0xBB, /* 0xB8-0xBB */
+	0xB6, 0xBC, 0xB6, 0xBD, 0x8B, 0xC6, 0x8B, 0xC7, /* 0xBC-0xBF */
+	0xB6, 0xBE, 0x8B, 0xC8, 0x8B, 0xC9, 0x8B, 0xCA, /* 0xC0-0xC3 */
+	0xB6, 0xBF, 0x8B, 0xCB, 0x8B, 0xCC, 0x8B, 0xCD, /* 0xC4-0xC7 */
+	0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, 0x8B, 0xD1, /* 0xC8-0xCB */
+	0xB6, 0xC0, 0xB6, 0xC1, 0x8B, 0xD2, 0xB6, 0xC2, /* 0xCC-0xCF */
+	0xB6, 0xC3, 0xB6, 0xC4, 0x8B, 0xD3, 0x8B, 0xD4, /* 0xD0-0xD3 */
+	0x8B, 0xD5, 0x8B, 0xD6, 0x8B, 0xD7, 0x8B, 0xD8, /* 0xD4-0xD7 */
+	0xB6, 0xC5, 0x8B, 0xD9, 0x8B, 0xDA, 0x8B, 0xDB, /* 0xD8-0xDB */
+	0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, 0x8B, 0xDF, /* 0xDC-0xDF */
+	0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, 0x8B, 0xE3, /* 0xE0-0xE3 */
+	0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, 0x8B, 0xE7, /* 0xE4-0xE7 */
+	0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, 0x8B, 0xEB, /* 0xE8-0xEB */
+	0xB6, 0xC6, 0x8B, 0xEC, 0x8B, 0xED, 0x8B, 0xEE, /* 0xEC-0xEF */
+	0x8B, 0xEF, 0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, /* 0xF0-0xF3 */
+	0x8B, 0xF3, 0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, /* 0xF4-0xF7 */
+	0x8B, 0xF7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0xF8-0xFB */
+	0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B6[512] = {
+	0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, 0x8C, 0x44, /* 0x00-0x03 */
+	0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, 0x8C, 0x48, /* 0x04-0x07 */
+	0x8C, 0x49, 0x8C, 0x4A, 0x8C, 0x4B, 0x8C, 0x4C, /* 0x08-0x0B */
+	0x8C, 0x4D, 0x8C, 0x4E, 0x8C, 0x4F, 0x8C, 0x50, /* 0x0C-0x0F */
+	0xB6, 0xC7, 0xB6, 0xC8, 0x8C, 0x51, 0x8C, 0x52, /* 0x10-0x13 */
+	0xB6, 0xC9, 0x8C, 0x53, 0x8C, 0x54, 0x8C, 0x55, /* 0x14-0x17 */
+	0xB6, 0xCA, 0x8C, 0x56, 0x8C, 0x57, 0x8C, 0x58, /* 0x18-0x1B */
+	0x8C, 0x59, 0x8C, 0x5A, 0x8C, 0x61, 0x8C, 0x62, /* 0x1C-0x1F */
+	0x8C, 0x63, 0x8C, 0x64, 0x8C, 0x65, 0x8C, 0x66, /* 0x20-0x23 */
+	0x8C, 0x67, 0xB6, 0xCB, 0x8C, 0x68, 0x8C, 0x69, /* 0x24-0x27 */
+	0x8C, 0x6A, 0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, /* 0x28-0x2B */
+	0xB6, 0xCC, 0x8C, 0x6E, 0x8C, 0x6F, 0x8C, 0x70, /* 0x2C-0x2F */
+	0x8C, 0x71, 0x8C, 0x72, 0x8C, 0x73, 0x8C, 0x74, /* 0x30-0x33 */
+	0xB6, 0xCD, 0x8C, 0x75, 0x8C, 0x76, 0x8C, 0x77, /* 0x34-0x37 */
+	0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x81, /* 0x38-0x3B */
+	0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, 0x8C, 0x85, /* 0x3C-0x3F */
+	0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, 0x8C, 0x89, /* 0x40-0x43 */
+	0x8C, 0x8A, 0x8C, 0x8B, 0x8C, 0x8C, 0x8C, 0x8D, /* 0x44-0x47 */
+	0xB6, 0xCE, 0x8C, 0x8E, 0x8C, 0x8F, 0x8C, 0x90, /* 0x48-0x4B */
+	0x8C, 0x91, 0x8C, 0x92, 0x8C, 0x93, 0x8C, 0x94, /* 0x4C-0x4F */
+	0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, 0x8C, 0x98, /* 0x50-0x53 */
+	0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, 0x8C, 0x9C, /* 0x54-0x57 */
+	0x8C, 0x9D, 0x8C, 0x9E, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x58-0x5B */
+	0x8C, 0xA1, 0x8C, 0xA2, 0x8C, 0xA3, 0x8C, 0xA4, /* 0x5C-0x5F */
+	0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0x8C, 0xA8, /* 0x60-0x63 */
+	0xB6, 0xCF, 0x8C, 0xA9, 0x8C, 0xAA, 0x8C, 0xAB, /* 0x64-0x67 */
+	0xB6, 0xD0, 0x8C, 0xAC, 0x8C, 0xAD, 0x8C, 0xAE, /* 0x68-0x6B */
+	0x8C, 0xAF, 0x8C, 0xB0, 0x8C, 0xB1, 0x8C, 0xB2, /* 0x6C-0x6F */
+	0x8C, 0xB3, 0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, /* 0x70-0x73 */
+	0x8C, 0xB7, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x74-0x77 */
+	0x8C, 0xBB, 0x8C, 0xBC, 0x8C, 0xBD, 0x8C, 0xBE, /* 0x78-0x7B */
+	0x8C, 0xBF, 0x8C, 0xC0, 0x8C, 0xC1, 0x8C, 0xC2, /* 0x7C-0x7F */
+	
+	0x8C, 0xC3, 0x8C, 0xC4, 0x8C, 0xC5, 0x8C, 0xC6, /* 0x80-0x83 */
+	0x8C, 0xC7, 0x8C, 0xC8, 0x8C, 0xC9, 0x8C, 0xCA, /* 0x84-0x87 */
+	0x8C, 0xCB, 0x8C, 0xCC, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x88-0x8B */
+	0x8C, 0xCF, 0x8C, 0xD0, 0x8C, 0xD1, 0x8C, 0xD2, /* 0x8C-0x8F */
+	0x8C, 0xD3, 0x8C, 0xD4, 0x8C, 0xD5, 0x8C, 0xD6, /* 0x90-0x93 */
+	0x8C, 0xD7, 0x8C, 0xD8, 0x8C, 0xD9, 0x8C, 0xDA, /* 0x94-0x97 */
+	0x8C, 0xDB, 0x8C, 0xDC, 0x8C, 0xDD, 0x8C, 0xDE, /* 0x98-0x9B */
+	0xB6, 0xD1, 0xB6, 0xD2, 0x8C, 0xDF, 0x8C, 0xE0, /* 0x9C-0x9F */
+	0xB6, 0xD3, 0x8C, 0xE1, 0x8C, 0xE2, 0x8C, 0xE3, /* 0xA0-0xA3 */
+	0xB6, 0xD4, 0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, /* 0xA4-0xA7 */
+	0x8C, 0xE7, 0x8C, 0xE8, 0x8C, 0xE9, 0xB6, 0xD5, /* 0xA8-0xAB */
+	0xB6, 0xD6, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0xAC-0xAF */
+	0x8C, 0xED, 0xB6, 0xD7, 0x8C, 0xEE, 0x8C, 0xEF, /* 0xB0-0xB3 */
+	0x8C, 0xF0, 0x8C, 0xF1, 0x8C, 0xF2, 0x8C, 0xF3, /* 0xB4-0xB7 */
+	0x8C, 0xF4, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0xB8-0xBB */
+	0x8C, 0xF8, 0x8C, 0xF9, 0x8C, 0xFA, 0x8C, 0xFB, /* 0xBC-0xBF */
+	0x8C, 0xFC, 0x8C, 0xFD, 0x8C, 0xFE, 0x8D, 0x41, /* 0xC0-0xC3 */
+	0x8D, 0x42, 0x8D, 0x43, 0x8D, 0x44, 0x8D, 0x45, /* 0xC4-0xC7 */
+	0x8D, 0x46, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xC8-0xCB */
+	0x8D, 0x4A, 0x8D, 0x4B, 0x8D, 0x4C, 0x8D, 0x4D, /* 0xCC-0xCF */
+	0x8D, 0x4E, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xD0-0xD3 */
+	0xB6, 0xD8, 0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, /* 0xD4-0xD7 */
+	0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xD8-0xDB */
+	0x8D, 0x59, 0x8D, 0x5A, 0x8D, 0x61, 0x8D, 0x62, /* 0xDC-0xDF */
+	0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xE0-0xE3 */
+	0x8D, 0x67, 0x8D, 0x68, 0x8D, 0x69, 0x8D, 0x6A, /* 0xE4-0xE7 */
+	0x8D, 0x6B, 0x8D, 0x6C, 0x8D, 0x6D, 0x8D, 0x6E, /* 0xE8-0xEB */
+	0x8D, 0x6F, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xEC-0xEF */
+	0xB6, 0xD9, 0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, /* 0xF0-0xF3 */
+	0xB6, 0xDA, 0x8D, 0x76, 0x8D, 0x77, 0x8D, 0x78, /* 0xF4-0xF7 */
+	0xB6, 0xDB, 0x8D, 0x79, 0x8D, 0x7A, 0x8D, 0x81, /* 0xF8-0xFB */
+	0x8D, 0x82, 0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B7[512] = {
+	0xB6, 0xDC, 0xB6, 0xDD, 0x8D, 0x86, 0x8D, 0x87, /* 0x00-0x03 */
+	0x8D, 0x88, 0xB6, 0xDE, 0x8D, 0x89, 0x8D, 0x8A, /* 0x04-0x07 */
+	0x8D, 0x8B, 0x8D, 0x8C, 0x8D, 0x8D, 0x8D, 0x8E, /* 0x08-0x0B */
+	0x8D, 0x8F, 0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, /* 0x0C-0x0F */
+	0x8D, 0x93, 0x8D, 0x94, 0x8D, 0x95, 0x8D, 0x96, /* 0x10-0x13 */
+	0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, 0x8D, 0x9A, /* 0x14-0x17 */
+	0x8D, 0x9B, 0x8D, 0x9C, 0x8D, 0x9D, 0x8D, 0x9E, /* 0x18-0x1B */
+	0x8D, 0x9F, 0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, /* 0x1C-0x1F */
+	0x8D, 0xA3, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x20-0x23 */
+	0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x24-0x27 */
+	0xB6, 0xDF, 0xB6, 0xE0, 0x8D, 0xAB, 0x8D, 0xAC, /* 0x28-0x2B */
+	0xB6, 0xE1, 0x8D, 0xAD, 0x8D, 0xAE, 0xB6, 0xE2, /* 0x2C-0x2F */
+	0xB6, 0xE3, 0x8D, 0xAF, 0x8D, 0xB0, 0x8D, 0xB1, /* 0x30-0x33 */
+	0x8D, 0xB2, 0x8D, 0xB3, 0x8D, 0xB4, 0x8D, 0xB5, /* 0x34-0x37 */
+	0xB6, 0xE4, 0xB6, 0xE5, 0x8D, 0xB6, 0xB6, 0xE6, /* 0x38-0x3B */
+	0x8D, 0xB7, 0x8D, 0xB8, 0x8D, 0xB9, 0x8D, 0xBA, /* 0x3C-0x3F */
+	0x8D, 0xBB, 0x8D, 0xBC, 0x8D, 0xBD, 0x8D, 0xBE, /* 0x40-0x43 */
+	0xB6, 0xE7, 0x8D, 0xBF, 0x8D, 0xC0, 0x8D, 0xC1, /* 0x44-0x47 */
+	0xB6, 0xE8, 0x8D, 0xC2, 0x8D, 0xC3, 0x8D, 0xC4, /* 0x48-0x4B */
+	0xB6, 0xE9, 0x8D, 0xC5, 0x8D, 0xC6, 0x8D, 0xC7, /* 0x4C-0x4F */
+	0x8D, 0xC8, 0x8D, 0xC9, 0x8D, 0xCA, 0x8D, 0xCB, /* 0x50-0x53 */
+	0xB6, 0xEA, 0xB6, 0xEB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x54-0x57 */
+	0x8D, 0xCE, 0x8D, 0xCF, 0x8D, 0xD0, 0x8D, 0xD1, /* 0x58-0x5B */
+	0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x5C-0x5F */
+	0xB6, 0xEC, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x60-0x63 */
+	0xB6, 0xED, 0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, /* 0x64-0x67 */
+	0xB6, 0xEE, 0x8D, 0xDC, 0x8D, 0xDD, 0x8D, 0xDE, /* 0x68-0x6B */
+	0x8D, 0xDF, 0x8D, 0xE0, 0x8D, 0xE1, 0x8D, 0xE2, /* 0x6C-0x6F */
+	0xB6, 0xEF, 0xB6, 0xF0, 0x8D, 0xE3, 0xB6, 0xF1, /* 0x70-0x73 */
+	0x8D, 0xE4, 0xB6, 0xF2, 0x8D, 0xE5, 0x8D, 0xE6, /* 0x74-0x77 */
+	0x8D, 0xE7, 0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, /* 0x78-0x7B */
+	0xB6, 0xF3, 0xB6, 0xF4, 0x8D, 0xEB, 0x8D, 0xEC, /* 0x7C-0x7F */
+	
+	0xB6, 0xF5, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x80-0x83 */
+	0xB6, 0xF6, 0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, /* 0x84-0x87 */
+	0x8D, 0xF3, 0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, /* 0x88-0x8B */
+	0xB6, 0xF7, 0xB6, 0xF8, 0x8D, 0xF7, 0xB6, 0xF9, /* 0x8C-0x8F */
+	0xB6, 0xFA, 0xB6, 0xFB, 0xB6, 0xFC, 0x8D, 0xF8, /* 0x90-0x93 */
+	0x8D, 0xF9, 0x8D, 0xFA, 0xB6, 0xFD, 0xB6, 0xFE, /* 0x94-0x97 */
+	0xB7, 0xA1, 0xB7, 0xA2, 0x8D, 0xFB, 0x8D, 0xFC, /* 0x98-0x9B */
+	0xB7, 0xA3, 0x8D, 0xFD, 0x8D, 0xFE, 0x8E, 0x41, /* 0x9C-0x9F */
+	0xB7, 0xA4, 0x8E, 0x42, 0x8E, 0x43, 0x8E, 0x44, /* 0xA0-0xA3 */
+	0x8E, 0x45, 0x8E, 0x46, 0x8E, 0x47, 0x8E, 0x48, /* 0xA4-0xA7 */
+	0xB7, 0xA5, 0xB7, 0xA6, 0x8E, 0x49, 0xB7, 0xA7, /* 0xA8-0xAB */
+	0xB7, 0xA8, 0xB7, 0xA9, 0x8E, 0x4A, 0x8E, 0x4B, /* 0xAC-0xAF */
+	0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, 0x8E, 0x4F, /* 0xB0-0xB3 */
+	0xB7, 0xAA, 0xB7, 0xAB, 0x8E, 0x50, 0x8E, 0x51, /* 0xB4-0xB7 */
+	0xB7, 0xAC, 0x8E, 0x52, 0x8E, 0x53, 0x8E, 0x54, /* 0xB8-0xBB */
+	0x8E, 0x55, 0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, /* 0xBC-0xBF */
+	0x8E, 0x59, 0x8E, 0x5A, 0x8E, 0x61, 0x8E, 0x62, /* 0xC0-0xC3 */
+	0x8E, 0x63, 0x8E, 0x64, 0x8E, 0x65, 0xB7, 0xAD, /* 0xC4-0xC7 */
+	0x8E, 0x66, 0xB7, 0xAE, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */
+	0x8E, 0x69, 0x8E, 0x6A, 0x8E, 0x6B, 0x8E, 0x6C, /* 0xCC-0xCF */
+	0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, 0x8E, 0x70, /* 0xD0-0xD3 */
+	0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, 0x8E, 0x74, /* 0xD4-0xD7 */
+	0x8E, 0x75, 0x8E, 0x76, 0x8E, 0x77, 0x8E, 0x78, /* 0xD8-0xDB */
+	0x8E, 0x79, 0x8E, 0x7A, 0x8E, 0x81, 0x8E, 0x82, /* 0xDC-0xDF */
+	0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xE0-0xE3 */
+	0x8E, 0x87, 0x8E, 0x88, 0x8E, 0x89, 0x8E, 0x8A, /* 0xE4-0xE7 */
+	0x8E, 0x8B, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0xE8-0xEB */
+	0xB7, 0xAF, 0xB7, 0xB0, 0x8E, 0x8F, 0x8E, 0x90, /* 0xEC-0xEF */
+	0xB7, 0xB1, 0x8E, 0x91, 0x8E, 0x92, 0x8E, 0x93, /* 0xF0-0xF3 */
+	0xB7, 0xB2, 0x8E, 0x94, 0x8E, 0x95, 0x8E, 0x96, /* 0xF4-0xF7 */
+	0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, 0x8E, 0x9A, /* 0xF8-0xFB */
+	0xB7, 0xB3, 0xB7, 0xB4, 0x8E, 0x9B, 0xB7, 0xB5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B8[512] = {
+	0xB7, 0xB6, 0xB7, 0xB7, 0x8E, 0x9C, 0x8E, 0x9D, /* 0x00-0x03 */
+	0x8E, 0x9E, 0x8E, 0x9F, 0x8E, 0xA0, 0xB7, 0xB8, /* 0x04-0x07 */
+	0xB7, 0xB9, 0xB7, 0xBA, 0x8E, 0xA1, 0x8E, 0xA2, /* 0x08-0x0B */
+	0xB7, 0xBB, 0x8E, 0xA3, 0x8E, 0xA4, 0x8E, 0xA5, /* 0x0C-0x0F */
+	0xB7, 0xBC, 0x8E, 0xA6, 0x8E, 0xA7, 0x8E, 0xA8, /* 0x10-0x13 */
+	0x8E, 0xA9, 0x8E, 0xAA, 0x8E, 0xAB, 0x8E, 0xAC, /* 0x14-0x17 */
+	0xB7, 0xBD, 0xB7, 0xBE, 0x8E, 0xAD, 0xB7, 0xBF, /* 0x18-0x1B */
+	0x8E, 0xAE, 0xB7, 0xC0, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x1C-0x1F */
+	0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x20-0x23 */
+	0xB7, 0xC1, 0xB7, 0xC2, 0x8E, 0xB5, 0x8E, 0xB6, /* 0x24-0x27 */
+	0xB7, 0xC3, 0x8E, 0xB7, 0x8E, 0xB8, 0x8E, 0xB9, /* 0x28-0x2B */
+	0xB7, 0xC4, 0x8E, 0xBA, 0x8E, 0xBB, 0x8E, 0xBC, /* 0x2C-0x2F */
+	0x8E, 0xBD, 0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, /* 0x30-0x33 */
+	0xB7, 0xC5, 0xB7, 0xC6, 0x8E, 0xC1, 0xB7, 0xC7, /* 0x34-0x37 */
+	0xB7, 0xC8, 0xB7, 0xC9, 0x8E, 0xC2, 0x8E, 0xC3, /* 0x38-0x3B */
+	0x8E, 0xC4, 0x8E, 0xC5, 0x8E, 0xC6, 0x8E, 0xC7, /* 0x3C-0x3F */
+	0xB7, 0xCA, 0x8E, 0xC8, 0x8E, 0xC9, 0x8E, 0xCA, /* 0x40-0x43 */
+	0xB7, 0xCB, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x44-0x47 */
+	0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x48-0x4B */
+	0x8E, 0xD2, 0x8E, 0xD3, 0x8E, 0xD4, 0x8E, 0xD5, /* 0x4C-0x4F */
+	0x8E, 0xD6, 0xB7, 0xCC, 0x8E, 0xD7, 0xB7, 0xCD, /* 0x50-0x53 */
+	0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, 0x8E, 0xDB, /* 0x54-0x57 */
+	0x8E, 0xDC, 0x8E, 0xDD, 0x8E, 0xDE, 0x8E, 0xDF, /* 0x58-0x5B */
+	0xB7, 0xCE, 0xB7, 0xCF, 0x8E, 0xE0, 0x8E, 0xE1, /* 0x5C-0x5F */
+	0xB7, 0xD0, 0x8E, 0xE2, 0x8E, 0xE3, 0x8E, 0xE4, /* 0x60-0x63 */
+	0xB7, 0xD1, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0x64-0x67 */
+	0x8E, 0xE8, 0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, /* 0x68-0x6B */
+	0xB7, 0xD2, 0xB7, 0xD3, 0x8E, 0xEC, 0xB7, 0xD4, /* 0x6C-0x6F */
+	0x8E, 0xED, 0xB7, 0xD5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0x70-0x73 */
+	0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0x8E, 0xF3, /* 0x74-0x77 */
+	0xB7, 0xD6, 0x8E, 0xF4, 0x8E, 0xF5, 0x8E, 0xF6, /* 0x78-0x7B */
+	0xB7, 0xD7, 0x8E, 0xF7, 0x8E, 0xF8, 0x8E, 0xF9, /* 0x7C-0x7F */
+	
+	0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, 0x8E, 0xFD, /* 0x80-0x83 */
+	0x8E, 0xFE, 0x8F, 0x41, 0x8F, 0x42, 0x8F, 0x43, /* 0x84-0x87 */
+	0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0x88-0x8B */
+	0x8F, 0x48, 0xB7, 0xD8, 0x8F, 0x49, 0x8F, 0x4A, /* 0x8C-0x8F */
+	0x8F, 0x4B, 0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, /* 0x90-0x93 */
+	0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0x94-0x97 */
+	0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0x98-0x9B */
+	0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0x9C-0x9F */
+	0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xA0-0xA3 */
+	0x8F, 0x65, 0x8F, 0x66, 0x8F, 0x67, 0x8F, 0x68, /* 0xA4-0xA7 */
+	0xB7, 0xD9, 0x8F, 0x69, 0x8F, 0x6A, 0x8F, 0x6B, /* 0xA8-0xAB */
+	0x8F, 0x6C, 0x8F, 0x6D, 0x8F, 0x6E, 0x8F, 0x6F, /* 0xAC-0xAF */
+	0xB7, 0xDA, 0x8F, 0x70, 0x8F, 0x71, 0x8F, 0x72, /* 0xB0-0xB3 */
+	0xB7, 0xDB, 0x8F, 0x73, 0x8F, 0x74, 0x8F, 0x75, /* 0xB4-0xB7 */
+	0xB7, 0xDC, 0x8F, 0x76, 0x8F, 0x77, 0x8F, 0x78, /* 0xB8-0xBB */
+	0x8F, 0x79, 0x8F, 0x7A, 0x8F, 0x81, 0x8F, 0x82, /* 0xBC-0xBF */
+	0xB7, 0xDD, 0xB7, 0xDE, 0x8F, 0x83, 0xB7, 0xDF, /* 0xC0-0xC3 */
+	0x8F, 0x84, 0xB7, 0xE0, 0x8F, 0x85, 0x8F, 0x86, /* 0xC4-0xC7 */
+	0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0xC8-0xCB */
+	0xB7, 0xE1, 0x8F, 0x8B, 0x8F, 0x8C, 0x8F, 0x8D, /* 0xCC-0xCF */
+	0xB7, 0xE2, 0x8F, 0x8E, 0x8F, 0x8F, 0x8F, 0x90, /* 0xD0-0xD3 */
+	0xB7, 0xE3, 0x8F, 0x91, 0x8F, 0x92, 0x8F, 0x93, /* 0xD4-0xD7 */
+	0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, 0x8F, 0x97, /* 0xD8-0xDB */
+	0x8F, 0x98, 0xB7, 0xE4, 0x8F, 0x99, 0xB7, 0xE5, /* 0xDC-0xDF */
+	0x8F, 0x9A, 0xB7, 0xE6, 0x8F, 0x9B, 0x8F, 0x9C, /* 0xE0-0xE3 */
+	0x8F, 0x9D, 0x8F, 0x9E, 0x8F, 0x9F, 0x8F, 0xA0, /* 0xE4-0xE7 */
+	0xB7, 0xE7, 0xB7, 0xE8, 0x8F, 0xA1, 0x8F, 0xA2, /* 0xE8-0xEB */
+	0xB7, 0xE9, 0x8F, 0xA3, 0x8F, 0xA4, 0x8F, 0xA5, /* 0xEC-0xEF */
+	0xB7, 0xEA, 0x8F, 0xA6, 0x8F, 0xA7, 0x8F, 0xA8, /* 0xF0-0xF3 */
+	0x8F, 0xA9, 0x8F, 0xAA, 0x8F, 0xAB, 0x8F, 0xAC, /* 0xF4-0xF7 */
+	0xB7, 0xEB, 0xB7, 0xEC, 0x8F, 0xAD, 0xB7, 0xED, /* 0xF8-0xFB */
+	0x8F, 0xAE, 0xB7, 0xEE, 0x8F, 0xAF, 0x8F, 0xB0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_B9[512] = {
+	0x8F, 0xB1, 0x8F, 0xB2, 0x8F, 0xB3, 0x8F, 0xB4, /* 0x00-0x03 */
+	0xB7, 0xEF, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x04-0x07 */
+	0x8F, 0xB8, 0x8F, 0xB9, 0x8F, 0xBA, 0x8F, 0xBB, /* 0x08-0x0B */
+	0x8F, 0xBC, 0x8F, 0xBD, 0x8F, 0xBE, 0x8F, 0xBF, /* 0x0C-0x0F */
+	0x8F, 0xC0, 0x8F, 0xC1, 0x8F, 0xC2, 0x8F, 0xC3, /* 0x10-0x13 */
+	0x8F, 0xC4, 0x8F, 0xC5, 0x8F, 0xC6, 0x8F, 0xC7, /* 0x14-0x17 */
+	0xB7, 0xF0, 0x8F, 0xC8, 0x8F, 0xC9, 0x8F, 0xCA, /* 0x18-0x1B */
+	0x8F, 0xCB, 0x8F, 0xCC, 0x8F, 0xCD, 0x8F, 0xCE, /* 0x1C-0x1F */
+	0xB7, 0xF1, 0x8F, 0xCF, 0x8F, 0xD0, 0x8F, 0xD1, /* 0x20-0x23 */
+	0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, 0x8F, 0xD5, /* 0x24-0x27 */
+	0x8F, 0xD6, 0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, /* 0x28-0x2B */
+	0x8F, 0xDA, 0x8F, 0xDB, 0x8F, 0xDC, 0x8F, 0xDD, /* 0x2C-0x2F */
+	0x8F, 0xDE, 0x8F, 0xDF, 0x8F, 0xE0, 0x8F, 0xE1, /* 0x30-0x33 */
+	0x8F, 0xE2, 0x8F, 0xE3, 0x8F, 0xE4, 0x8F, 0xE5, /* 0x34-0x37 */
+	0x8F, 0xE6, 0x8F, 0xE7, 0x8F, 0xE8, 0x8F, 0xE9, /* 0x38-0x3B */
+	0xB7, 0xF2, 0xB7, 0xF3, 0x8F, 0xEA, 0x8F, 0xEB, /* 0x3C-0x3F */
+	0xB7, 0xF4, 0x8F, 0xEC, 0x8F, 0xED, 0x8F, 0xEE, /* 0x40-0x43 */
+	0xB7, 0xF5, 0x8F, 0xEF, 0x8F, 0xF0, 0x8F, 0xF1, /* 0x44-0x47 */
+	0x8F, 0xF2, 0x8F, 0xF3, 0x8F, 0xF4, 0x8F, 0xF5, /* 0x48-0x4B */
+	0xB7, 0xF6, 0x8F, 0xF6, 0x8F, 0xF7, 0xB7, 0xF7, /* 0x4C-0x4F */
+	0x8F, 0xF8, 0xB7, 0xF8, 0x8F, 0xF9, 0x8F, 0xFA, /* 0x50-0x53 */
+	0x8F, 0xFB, 0x8F, 0xFC, 0x8F, 0xFD, 0x8F, 0xFE, /* 0x54-0x57 */
+	0xB7, 0xF9, 0xB7, 0xFA, 0x90, 0x41, 0x90, 0x42, /* 0x58-0x5B */
+	0xB7, 0xFB, 0x90, 0x43, 0x90, 0x44, 0x90, 0x45, /* 0x5C-0x5F */
+	0xB7, 0xFC, 0x90, 0x46, 0x90, 0x47, 0x90, 0x48, /* 0x60-0x63 */
+	0x90, 0x49, 0x90, 0x4A, 0x90, 0x4B, 0x90, 0x4C, /* 0x64-0x67 */
+	0xB7, 0xFD, 0xB7, 0xFE, 0x90, 0x4D, 0xB8, 0xA1, /* 0x68-0x6B */
+	0x90, 0x4E, 0xB8, 0xA2, 0x90, 0x4F, 0x90, 0x50, /* 0x6C-0x6F */
+	0x90, 0x51, 0x90, 0x52, 0x90, 0x53, 0x90, 0x54, /* 0x70-0x73 */
+	0xB8, 0xA3, 0xB8, 0xA4, 0x90, 0x55, 0x90, 0x56, /* 0x74-0x77 */
+	0xB8, 0xA5, 0x90, 0x57, 0x90, 0x58, 0x90, 0x59, /* 0x78-0x7B */
+	0xB8, 0xA6, 0x90, 0x5A, 0x90, 0x61, 0x90, 0x62, /* 0x7C-0x7F */
+	
+	0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0x90, 0x66, /* 0x80-0x83 */
+	0xB8, 0xA7, 0xB8, 0xA8, 0x90, 0x67, 0xB8, 0xA9, /* 0x84-0x87 */
+	0x90, 0x68, 0xB8, 0xAA, 0xB8, 0xAB, 0x90, 0x69, /* 0x88-0x8B */
+	0x90, 0x6A, 0xB8, 0xAC, 0xB8, 0xAD, 0x90, 0x6B, /* 0x8C-0x8F */
+	0x90, 0x6C, 0x90, 0x6D, 0x90, 0x6E, 0x90, 0x6F, /* 0x90-0x93 */
+	0x90, 0x70, 0x90, 0x71, 0x90, 0x72, 0x90, 0x73, /* 0x94-0x97 */
+	0x90, 0x74, 0x90, 0x75, 0x90, 0x76, 0x90, 0x77, /* 0x98-0x9B */
+	0x90, 0x78, 0x90, 0x79, 0x90, 0x7A, 0x90, 0x81, /* 0x9C-0x9F */
+	0x90, 0x82, 0x90, 0x83, 0x90, 0x84, 0x90, 0x85, /* 0xA0-0xA3 */
+	0x90, 0x86, 0x90, 0x87, 0x90, 0x88, 0x90, 0x89, /* 0xA4-0xA7 */
+	0x90, 0x8A, 0x90, 0x8B, 0x90, 0x8C, 0x90, 0x8D, /* 0xA8-0xAB */
+	0xB8, 0xAE, 0xB8, 0xAF, 0x90, 0x8E, 0x90, 0x8F, /* 0xAC-0xAF */
+	0xB8, 0xB0, 0x90, 0x90, 0x90, 0x91, 0x90, 0x92, /* 0xB0-0xB3 */
+	0xB8, 0xB1, 0x90, 0x93, 0x90, 0x94, 0x90, 0x95, /* 0xB4-0xB7 */
+	0x90, 0x96, 0x90, 0x97, 0x90, 0x98, 0x90, 0x99, /* 0xB8-0xBB */
+	0xB8, 0xB2, 0xB8, 0xB3, 0x90, 0x9A, 0xB8, 0xB4, /* 0xBC-0xBF */
+	0x90, 0x9B, 0xB8, 0xB5, 0x90, 0x9C, 0x90, 0x9D, /* 0xC0-0xC3 */
+	0x90, 0x9E, 0x90, 0x9F, 0x90, 0xA0, 0x90, 0xA1, /* 0xC4-0xC7 */
+	0xB8, 0xB6, 0xB8, 0xB7, 0x90, 0xA2, 0x90, 0xA3, /* 0xC8-0xCB */
+	0xB8, 0xB8, 0x90, 0xA4, 0xB8, 0xB9, 0xB8, 0xBA, /* 0xCC-0xCF */
+	0xB8, 0xBB, 0xB8, 0xBC, 0xB8, 0xBD, 0x90, 0xA5, /* 0xD0-0xD3 */
+	0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, 0x90, 0xA9, /* 0xD4-0xD7 */
+	0xB8, 0xBE, 0xB8, 0xBF, 0x90, 0xAA, 0xB8, 0xC0, /* 0xD8-0xDB */
+	0x90, 0xAB, 0xB8, 0xC1, 0xB8, 0xC2, 0x90, 0xAC, /* 0xDC-0xDF */
+	0x90, 0xAD, 0xB8, 0xC3, 0x90, 0xAE, 0xB8, 0xC4, /* 0xE0-0xE3 */
+	0xB8, 0xC5, 0xB8, 0xC6, 0x90, 0xAF, 0x90, 0xB0, /* 0xE4-0xE7 */
+	0xB8, 0xC7, 0x90, 0xB1, 0x90, 0xB2, 0x90, 0xB3, /* 0xE8-0xEB */
+	0xB8, 0xC8, 0x90, 0xB4, 0x90, 0xB5, 0x90, 0xB6, /* 0xEC-0xEF */
+	0x90, 0xB7, 0x90, 0xB8, 0x90, 0xB9, 0x90, 0xBA, /* 0xF0-0xF3 */
+	0xB8, 0xC9, 0xB8, 0xCA, 0x90, 0xBB, 0xB8, 0xCB, /* 0xF4-0xF7 */
+	0xB8, 0xCC, 0xB8, 0xCD, 0xB8, 0xCE, 0x90, 0xBC, /* 0xF8-0xFB */
+	0x90, 0xBD, 0x90, 0xBE, 0x90, 0xBF, 0x90, 0xC0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_BA[512] = {
+	0xB8, 0xCF, 0xB8, 0xD0, 0x90, 0xC1, 0x90, 0xC2, /* 0x00-0x03 */
+	0x90, 0xC3, 0x90, 0xC4, 0x90, 0xC5, 0x90, 0xC6, /* 0x04-0x07 */
+	0xB8, 0xD1, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0x08-0x0B */
+	0x90, 0xCA, 0x90, 0xCB, 0x90, 0xCC, 0x90, 0xCD, /* 0x0C-0x0F */
+	0x90, 0xCE, 0x90, 0xCF, 0x90, 0xD0, 0x90, 0xD1, /* 0x10-0x13 */
+	0x90, 0xD2, 0xB8, 0xD2, 0x90, 0xD3, 0x90, 0xD4, /* 0x14-0x17 */
+	0x90, 0xD5, 0x90, 0xD6, 0x90, 0xD7, 0x90, 0xD8, /* 0x18-0x1B */
+	0x90, 0xD9, 0x90, 0xDA, 0x90, 0xDB, 0x90, 0xDC, /* 0x1C-0x1F */
+	0x90, 0xDD, 0x90, 0xDE, 0x90, 0xDF, 0x90, 0xE0, /* 0x20-0x23 */
+	0x90, 0xE1, 0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, /* 0x24-0x27 */
+	0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x28-0x2B */
+	0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x2C-0x2F */
+	0x90, 0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x30-0x33 */
+	0x90, 0xF1, 0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x34-0x37 */
+	0xB8, 0xD3, 0xB8, 0xD4, 0x90, 0xF5, 0x90, 0xF6, /* 0x38-0x3B */
+	0xB8, 0xD5, 0x90, 0xF7, 0x90, 0xF8, 0x90, 0xF9, /* 0x3C-0x3F */
+	0xB8, 0xD6, 0x90, 0xFA, 0xB8, 0xD7, 0x90, 0xFB, /* 0x40-0x43 */
+	0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x41, /* 0x44-0x47 */
+	0xB8, 0xD8, 0xB8, 0xD9, 0x91, 0x42, 0xB8, 0xDA, /* 0x48-0x4B */
+	0x91, 0x43, 0xB8, 0xDB, 0xB8, 0xDC, 0x91, 0x44, /* 0x4C-0x4F */
+	0x91, 0x45, 0x91, 0x46, 0x91, 0x47, 0xB8, 0xDD, /* 0x50-0x53 */
+	0xB8, 0xDE, 0xB8, 0xDF, 0x91, 0x48, 0x91, 0x49, /* 0x54-0x57 */
+	0xB8, 0xE0, 0x91, 0x4A, 0x91, 0x4B, 0x91, 0x4C, /* 0x58-0x5B */
+	0xB8, 0xE1, 0x91, 0x4D, 0x91, 0x4E, 0x91, 0x4F, /* 0x5C-0x5F */
+	0x91, 0x50, 0x91, 0x51, 0x91, 0x52, 0x91, 0x53, /* 0x60-0x63 */
+	0xB8, 0xE2, 0xB8, 0xE3, 0x91, 0x54, 0xB8, 0xE4, /* 0x64-0x67 */
+	0xB8, 0xE5, 0xB8, 0xE6, 0x91, 0x55, 0x91, 0x56, /* 0x68-0x6B */
+	0x91, 0x57, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x6C-0x6F */
+	0xB8, 0xE7, 0xB8, 0xE8, 0x91, 0x61, 0x91, 0x62, /* 0x70-0x73 */
+	0xB8, 0xE9, 0x91, 0x63, 0x91, 0x64, 0x91, 0x65, /* 0x74-0x77 */
+	0xB8, 0xEA, 0x91, 0x66, 0x91, 0x67, 0x91, 0x68, /* 0x78-0x7B */
+	0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, 0x91, 0x6C, /* 0x7C-0x7F */
+	
+	0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, 0xB8, 0xEB, /* 0x80-0x83 */
+	0xB8, 0xEC, 0xB8, 0xED, 0x91, 0x70, 0xB8, 0xEE, /* 0x84-0x87 */
+	0x91, 0x71, 0x91, 0x72, 0x91, 0x73, 0x91, 0x74, /* 0x88-0x8B */
+	0xB8, 0xEF, 0x91, 0x75, 0x91, 0x76, 0x91, 0x77, /* 0x8C-0x8F */
+	0x91, 0x78, 0x91, 0x79, 0x91, 0x7A, 0x91, 0x81, /* 0x90-0x93 */
+	0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x94-0x97 */
+	0x91, 0x86, 0x91, 0x87, 0x91, 0x88, 0x91, 0x89, /* 0x98-0x9B */
+	0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, 0x91, 0x8D, /* 0x9C-0x9F */
+	0x91, 0x8E, 0x91, 0x8F, 0x91, 0x90, 0x91, 0x91, /* 0xA0-0xA3 */
+	0x91, 0x92, 0x91, 0x93, 0x91, 0x94, 0x91, 0x95, /* 0xA4-0xA7 */
+	0xB8, 0xF0, 0xB8, 0xF1, 0x91, 0x96, 0xB8, 0xF2, /* 0xA8-0xAB */
+	0xB8, 0xF3, 0x91, 0x97, 0x91, 0x98, 0x91, 0x99, /* 0xAC-0xAF */
+	0xB8, 0xF4, 0x91, 0x9A, 0xB8, 0xF5, 0x91, 0x9B, /* 0xB0-0xB3 */
+	0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB4-0xB7 */
+	0xB8, 0xF6, 0xB8, 0xF7, 0x91, 0xA0, 0xB8, 0xF8, /* 0xB8-0xBB */
+	0x91, 0xA1, 0xB8, 0xF9, 0x91, 0xA2, 0x91, 0xA3, /* 0xBC-0xBF */
+	0x91, 0xA4, 0x91, 0xA5, 0x91, 0xA6, 0x91, 0xA7, /* 0xC0-0xC3 */
+	0xB8, 0xFA, 0x91, 0xA8, 0x91, 0xA9, 0x91, 0xAA, /* 0xC4-0xC7 */
+	0xB8, 0xFB, 0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, /* 0xC8-0xCB */
+	0x91, 0xAE, 0x91, 0xAF, 0x91, 0xB0, 0x91, 0xB1, /* 0xCC-0xCF */
+	0x91, 0xB2, 0x91, 0xB3, 0x91, 0xB4, 0x91, 0xB5, /* 0xD0-0xD3 */
+	0x91, 0xB6, 0x91, 0xB7, 0x91, 0xB8, 0x91, 0xB9, /* 0xD4-0xD7 */
+	0xB8, 0xFC, 0xB8, 0xFD, 0x91, 0xBA, 0x91, 0xBB, /* 0xD8-0xDB */
+	0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xDC-0xDF */
+	0x91, 0xC0, 0x91, 0xC1, 0x91, 0xC2, 0x91, 0xC3, /* 0xE0-0xE3 */
+	0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, 0x91, 0xC7, /* 0xE4-0xE7 */
+	0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, 0x91, 0xCB, /* 0xE8-0xEB */
+	0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, 0x91, 0xCF, /* 0xEC-0xEF */
+	0x91, 0xD0, 0x91, 0xD1, 0x91, 0xD2, 0x91, 0xD3, /* 0xF0-0xF3 */
+	0x91, 0xD4, 0x91, 0xD5, 0x91, 0xD6, 0x91, 0xD7, /* 0xF4-0xF7 */
+	0x91, 0xD8, 0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, /* 0xF8-0xFB */
+	0xB8, 0xFE, 0x91, 0xDC, 0x91, 0xDD, 0x91, 0xDE, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_BB[512] = {
+	0xB9, 0xA1, 0x91, 0xDF, 0x91, 0xE0, 0x91, 0xE1, /* 0x00-0x03 */
+	0xB9, 0xA2, 0x91, 0xE2, 0x91, 0xE3, 0x91, 0xE4, /* 0x04-0x07 */
+	0x91, 0xE5, 0x91, 0xE6, 0x91, 0xE7, 0x91, 0xE8, /* 0x08-0x0B */
+	0x91, 0xE9, 0xB9, 0xA3, 0x91, 0xEA, 0xB9, 0xA4, /* 0x0C-0x0F */
+	0x91, 0xEB, 0xB9, 0xA5, 0x91, 0xEC, 0x91, 0xED, /* 0x10-0x13 */
+	0x91, 0xEE, 0x91, 0xEF, 0x91, 0xF0, 0x91, 0xF1, /* 0x14-0x17 */
+	0xB9, 0xA6, 0x91, 0xF2, 0x91, 0xF3, 0x91, 0xF4, /* 0x18-0x1B */
+	0xB9, 0xA7, 0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, /* 0x1C-0x1F */
+	0xB9, 0xA8, 0x91, 0xF8, 0x91, 0xF9, 0x91, 0xFA, /* 0x20-0x23 */
+	0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0x91, 0xFE, /* 0x24-0x27 */
+	0x92, 0x41, 0xB9, 0xA9, 0x92, 0x42, 0xB9, 0xAA, /* 0x28-0x2B */
+	0x92, 0x43, 0x92, 0x44, 0x92, 0x45, 0x92, 0x46, /* 0x2C-0x2F */
+	0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x30-0x33 */
+	0xB9, 0xAB, 0xB9, 0xAC, 0xB9, 0xAD, 0x92, 0x4B, /* 0x34-0x37 */
+	0xB9, 0xAE, 0x92, 0x4C, 0x92, 0x4D, 0xB9, 0xAF, /* 0x38-0x3B */
+	0xB9, 0xB0, 0xB9, 0xB1, 0xB9, 0xB2, 0x92, 0x4E, /* 0x3C-0x3F */
+	0x92, 0x4F, 0x92, 0x50, 0x92, 0x51, 0x92, 0x52, /* 0x40-0x43 */
+	0xB9, 0xB3, 0xB9, 0xB4, 0x92, 0x53, 0xB9, 0xB5, /* 0x44-0x47 */
+	0x92, 0x54, 0xB9, 0xB6, 0x92, 0x55, 0x92, 0x56, /* 0x48-0x4B */
+	0x92, 0x57, 0xB9, 0xB7, 0x92, 0x58, 0xB9, 0xB8, /* 0x4C-0x4F */
+	0xB9, 0xB9, 0x92, 0x59, 0x92, 0x5A, 0x92, 0x61, /* 0x50-0x53 */
+	0xB9, 0xBA, 0x92, 0x62, 0x92, 0x63, 0x92, 0x64, /* 0x54-0x57 */
+	0xB9, 0xBB, 0x92, 0x65, 0x92, 0x66, 0x92, 0x67, /* 0x58-0x5B */
+	0x92, 0x68, 0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, /* 0x5C-0x5F */
+	0x92, 0x6C, 0xB9, 0xBC, 0x92, 0x6D, 0xB9, 0xBD, /* 0x60-0x63 */
+	0x92, 0x6E, 0x92, 0x6F, 0x92, 0x70, 0x92, 0x71, /* 0x64-0x67 */
+	0x92, 0x72, 0x92, 0x73, 0x92, 0x74, 0x92, 0x75, /* 0x68-0x6B */
+	0xB9, 0xBE, 0x92, 0x76, 0x92, 0x77, 0x92, 0x78, /* 0x6C-0x6F */
+	0x92, 0x79, 0x92, 0x7A, 0x92, 0x81, 0x92, 0x82, /* 0x70-0x73 */
+	0x92, 0x83, 0x92, 0x84, 0x92, 0x85, 0x92, 0x86, /* 0x74-0x77 */
+	0x92, 0x87, 0x92, 0x88, 0x92, 0x89, 0x92, 0x8A, /* 0x78-0x7B */
+	0x92, 0x8B, 0x92, 0x8C, 0x92, 0x8D, 0x92, 0x8E, /* 0x7C-0x7F */
+	
+	0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0x80-0x83 */
+	0x92, 0x93, 0x92, 0x94, 0x92, 0x95, 0x92, 0x96, /* 0x84-0x87 */
+	0xB9, 0xBF, 0x92, 0x97, 0x92, 0x98, 0x92, 0x99, /* 0x88-0x8B */
+	0xB9, 0xC0, 0x92, 0x9A, 0x92, 0x9B, 0x92, 0x9C, /* 0x8C-0x8F */
+	0xB9, 0xC1, 0x92, 0x9D, 0x92, 0x9E, 0x92, 0x9F, /* 0x90-0x93 */
+	0x92, 0xA0, 0x92, 0xA1, 0x92, 0xA2, 0x92, 0xA3, /* 0x94-0x97 */
+	0x92, 0xA4, 0x92, 0xA5, 0x92, 0xA6, 0x92, 0xA7, /* 0x98-0x9B */
+	0x92, 0xA8, 0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, /* 0x9C-0x9F */
+	0x92, 0xAC, 0x92, 0xAD, 0x92, 0xAE, 0x92, 0xAF, /* 0xA0-0xA3 */
+	0xB9, 0xC2, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0xA4-0xA7 */
+	0xB9, 0xC3, 0x92, 0xB3, 0x92, 0xB4, 0x92, 0xB5, /* 0xA8-0xAB */
+	0xB9, 0xC4, 0x92, 0xB6, 0x92, 0xB7, 0x92, 0xB8, /* 0xAC-0xAF */
+	0x92, 0xB9, 0x92, 0xBA, 0x92, 0xBB, 0x92, 0xBC, /* 0xB0-0xB3 */
+	0xB9, 0xC5, 0x92, 0xBD, 0x92, 0xBE, 0xB9, 0xC6, /* 0xB4-0xB7 */
+	0x92, 0xBF, 0x92, 0xC0, 0x92, 0xC1, 0x92, 0xC2, /* 0xB8-0xBB */
+	0x92, 0xC3, 0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, /* 0xBC-0xBF */
+	0xB9, 0xC7, 0x92, 0xC7, 0x92, 0xC8, 0x92, 0xC9, /* 0xC0-0xC3 */
+	0xB9, 0xC8, 0x92, 0xCA, 0x92, 0xCB, 0x92, 0xCC, /* 0xC4-0xC7 */
+	0xB9, 0xC9, 0x92, 0xCD, 0x92, 0xCE, 0x92, 0xCF, /* 0xC8-0xCB */
+	0x92, 0xD0, 0x92, 0xD1, 0x92, 0xD2, 0x92, 0xD3, /* 0xCC-0xCF */
+	0xB9, 0xCA, 0x92, 0xD4, 0x92, 0xD5, 0xB9, 0xCB, /* 0xD0-0xD3 */
+	0x92, 0xD6, 0x92, 0xD7, 0x92, 0xD8, 0x92, 0xD9, /* 0xD4-0xD7 */
+	0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0xD8-0xDB */
+	0x92, 0xDE, 0x92, 0xDF, 0x92, 0xE0, 0x92, 0xE1, /* 0xDC-0xDF */
+	0x92, 0xE2, 0x92, 0xE3, 0x92, 0xE4, 0x92, 0xE5, /* 0xE0-0xE3 */
+	0x92, 0xE6, 0x92, 0xE7, 0x92, 0xE8, 0x92, 0xE9, /* 0xE4-0xE7 */
+	0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, 0x92, 0xED, /* 0xE8-0xEB */
+	0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, 0x92, 0xF1, /* 0xEC-0xEF */
+	0x92, 0xF2, 0x92, 0xF3, 0x92, 0xF4, 0x92, 0xF5, /* 0xF0-0xF3 */
+	0x92, 0xF6, 0x92, 0xF7, 0x92, 0xF8, 0x92, 0xF9, /* 0xF4-0xF7 */
+	0xB9, 0xCC, 0xB9, 0xCD, 0x92, 0xFA, 0x92, 0xFB, /* 0xF8-0xFB */
+	0xB9, 0xCE, 0x92, 0xFC, 0x92, 0xFD, 0xB9, 0xCF, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_BC[512] = {
+	0xB9, 0xD0, 0x92, 0xFE, 0xB9, 0xD1, 0x93, 0x41, /* 0x00-0x03 */
+	0x93, 0x42, 0x93, 0x43, 0x93, 0x44, 0x93, 0x45, /* 0x04-0x07 */
+	0xB9, 0xD2, 0xB9, 0xD3, 0x93, 0x46, 0xB9, 0xD4, /* 0x08-0x0B */
+	0xB9, 0xD5, 0xB9, 0xD6, 0x93, 0x47, 0xB9, 0xD7, /* 0x0C-0x0F */
+	0x93, 0x48, 0xB9, 0xD8, 0x93, 0x49, 0x93, 0x4A, /* 0x10-0x13 */
+	0xB9, 0xD9, 0xB9, 0xDA, 0xB9, 0xDB, 0xB9, 0xDC, /* 0x14-0x17 */
+	0xB9, 0xDD, 0x93, 0x4B, 0x93, 0x4C, 0xB9, 0xDE, /* 0x18-0x1B */
+	0xB9, 0xDF, 0xB9, 0xE0, 0xB9, 0xE1, 0xB9, 0xE2, /* 0x1C-0x1F */
+	0x93, 0x4D, 0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, /* 0x20-0x23 */
+	0xB9, 0xE3, 0xB9, 0xE4, 0x93, 0x51, 0xB9, 0xE5, /* 0x24-0x27 */
+	0x93, 0x52, 0xB9, 0xE6, 0x93, 0x53, 0x93, 0x54, /* 0x28-0x2B */
+	0x93, 0x55, 0xB9, 0xE7, 0x93, 0x56, 0x93, 0x57, /* 0x2C-0x2F */
+	0xB9, 0xE8, 0xB9, 0xE9, 0x93, 0x58, 0x93, 0x59, /* 0x30-0x33 */
+	0xB9, 0xEA, 0x93, 0x5A, 0x93, 0x61, 0x93, 0x62, /* 0x34-0x37 */
+	0xB9, 0xEB, 0x93, 0x63, 0x93, 0x64, 0x93, 0x65, /* 0x38-0x3B */
+	0x93, 0x66, 0x93, 0x67, 0x93, 0x68, 0x93, 0x69, /* 0x3C-0x3F */
+	0xB9, 0xEC, 0xB9, 0xED, 0x93, 0x6A, 0xB9, 0xEE, /* 0x40-0x43 */
+	0xB9, 0xEF, 0xB9, 0xF0, 0x93, 0x6B, 0x93, 0x6C, /* 0x44-0x47 */
+	0x93, 0x6D, 0xB9, 0xF1, 0x93, 0x6E, 0x93, 0x6F, /* 0x48-0x4B */
+	0xB9, 0xF2, 0xB9, 0xF3, 0x93, 0x70, 0x93, 0x71, /* 0x4C-0x4F */
+	0xB9, 0xF4, 0x93, 0x72, 0x93, 0x73, 0x93, 0x74, /* 0x50-0x53 */
+	0x93, 0x75, 0x93, 0x76, 0x93, 0x77, 0x93, 0x78, /* 0x54-0x57 */
+	0x93, 0x79, 0x93, 0x7A, 0x93, 0x81, 0x93, 0x82, /* 0x58-0x5B */
+	0x93, 0x83, 0xB9, 0xF5, 0x93, 0x84, 0x93, 0x85, /* 0x5C-0x5F */
+	0x93, 0x86, 0x93, 0x87, 0x93, 0x88, 0x93, 0x89, /* 0x60-0x63 */
+	0x93, 0x8A, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x64-0x67 */
+	0x93, 0x8E, 0x93, 0x8F, 0x93, 0x90, 0x93, 0x91, /* 0x68-0x6B */
+	0x93, 0x92, 0x93, 0x93, 0x93, 0x94, 0x93, 0x95, /* 0x6C-0x6F */
+	0x93, 0x96, 0x93, 0x97, 0x93, 0x98, 0x93, 0x99, /* 0x70-0x73 */
+	0x93, 0x9A, 0x93, 0x9B, 0x93, 0x9C, 0x93, 0x9D, /* 0x74-0x77 */
+	0x93, 0x9E, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x78-0x7B */
+	0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, 0x93, 0xA5, /* 0x7C-0x7F */
+	
+	0x93, 0xA6, 0x93, 0xA7, 0x93, 0xA8, 0x93, 0xA9, /* 0x80-0x83 */
+	0xB9, 0xF6, 0xB9, 0xF7, 0x93, 0xAA, 0x93, 0xAB, /* 0x84-0x87 */
+	0xB9, 0xF8, 0x93, 0xAC, 0x93, 0xAD, 0xB9, 0xF9, /* 0x88-0x8B */
+	0xB9, 0xFA, 0x93, 0xAE, 0xB9, 0xFB, 0x93, 0xAF, /* 0x8C-0x8F */
+	0x93, 0xB0, 0x93, 0xB1, 0x93, 0xB2, 0x93, 0xB3, /* 0x90-0x93 */
+	0xB9, 0xFC, 0xB9, 0xFD, 0x93, 0xB4, 0xB9, 0xFE, /* 0x94-0x97 */
+	0x93, 0xB5, 0xBA, 0xA1, 0xBA, 0xA2, 0x93, 0xB6, /* 0x98-0x9B */
+	0x93, 0xB7, 0x93, 0xB8, 0x93, 0xB9, 0x93, 0xBA, /* 0x9C-0x9F */
+	0xBA, 0xA3, 0xBA, 0xA4, 0x93, 0xBB, 0x93, 0xBC, /* 0xA0-0xA3 */
+	0xBA, 0xA5, 0x93, 0xBD, 0x93, 0xBE, 0xBA, 0xA6, /* 0xA4-0xA7 */
+	0xBA, 0xA7, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0xA8-0xAB */
+	0x93, 0xC2, 0x93, 0xC3, 0x93, 0xC4, 0x93, 0xC5, /* 0xAC-0xAF */
+	0xBA, 0xA8, 0xBA, 0xA9, 0x93, 0xC6, 0xBA, 0xAA, /* 0xB0-0xB3 */
+	0xBA, 0xAB, 0xBA, 0xAC, 0x93, 0xC7, 0x93, 0xC8, /* 0xB4-0xB7 */
+	0x93, 0xC9, 0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, /* 0xB8-0xBB */
+	0xBA, 0xAD, 0xBA, 0xAE, 0x93, 0xCD, 0x93, 0xCE, /* 0xBC-0xBF */
+	0xBA, 0xAF, 0x93, 0xCF, 0x93, 0xD0, 0x93, 0xD1, /* 0xC0-0xC3 */
+	0xBA, 0xB0, 0x93, 0xD2, 0x93, 0xD3, 0x93, 0xD4, /* 0xC4-0xC7 */
+	0x93, 0xD5, 0x93, 0xD6, 0x93, 0xD7, 0x93, 0xD8, /* 0xC8-0xCB */
+	0x93, 0xD9, 0xBA, 0xB1, 0x93, 0xDA, 0xBA, 0xB2, /* 0xCC-0xCF */
+	0xBA, 0xB3, 0xBA, 0xB4, 0x93, 0xDB, 0x93, 0xDC, /* 0xD0-0xD3 */
+	0x93, 0xDD, 0xBA, 0xB5, 0x93, 0xDE, 0x93, 0xDF, /* 0xD4-0xD7 */
+	0xBA, 0xB6, 0x93, 0xE0, 0x93, 0xE1, 0x93, 0xE2, /* 0xD8-0xDB */
+	0xBA, 0xB7, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xDC-0xDF */
+	0x93, 0xE6, 0x93, 0xE7, 0x93, 0xE8, 0x93, 0xE9, /* 0xE0-0xE3 */
+	0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, 0x93, 0xED, /* 0xE4-0xE7 */
+	0x93, 0xEE, 0x93, 0xEF, 0x93, 0xF0, 0x93, 0xF1, /* 0xE8-0xEB */
+	0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xEC-0xEF */
+	0x93, 0xF6, 0x93, 0xF7, 0x93, 0xF8, 0x93, 0xF9, /* 0xF0-0xF3 */
+	0xBA, 0xB8, 0xBA, 0xB9, 0xBA, 0xBA, 0x93, 0xFA, /* 0xF4-0xF7 */
+	0xBA, 0xBB, 0x93, 0xFB, 0x93, 0xFC, 0x93, 0xFD, /* 0xF8-0xFB */
+	0xBA, 0xBC, 0x93, 0xFE, 0x94, 0x41, 0x94, 0x42, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_BD[512] = {
+	0x94, 0x43, 0x94, 0x44, 0x94, 0x45, 0x94, 0x46, /* 0x00-0x03 */
+	0xBA, 0xBD, 0xBA, 0xBE, 0x94, 0x47, 0xBA, 0xBF, /* 0x04-0x07 */
+	0x94, 0x48, 0xBA, 0xC0, 0x94, 0x49, 0x94, 0x4A, /* 0x08-0x0B */
+	0x94, 0x4B, 0x94, 0x4C, 0x94, 0x4D, 0x94, 0x4E, /* 0x0C-0x0F */
+	0xBA, 0xC1, 0x94, 0x4F, 0x94, 0x50, 0x94, 0x51, /* 0x10-0x13 */
+	0xBA, 0xC2, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0x14-0x17 */
+	0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0x18-0x1B */
+	0x94, 0x59, 0x94, 0x5A, 0x94, 0x61, 0x94, 0x62, /* 0x1C-0x1F */
+	0x94, 0x63, 0x94, 0x64, 0x94, 0x65, 0x94, 0x66, /* 0x20-0x23 */
+	0xBA, 0xC3, 0x94, 0x67, 0x94, 0x68, 0x94, 0x69, /* 0x24-0x27 */
+	0x94, 0x6A, 0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, /* 0x28-0x2B */
+	0xBA, 0xC4, 0x94, 0x6E, 0x94, 0x6F, 0x94, 0x70, /* 0x2C-0x2F */
+	0x94, 0x71, 0x94, 0x72, 0x94, 0x73, 0x94, 0x74, /* 0x30-0x33 */
+	0x94, 0x75, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x34-0x37 */
+	0x94, 0x79, 0x94, 0x7A, 0x94, 0x81, 0x94, 0x82, /* 0x38-0x3B */
+	0x94, 0x83, 0x94, 0x84, 0x94, 0x85, 0x94, 0x86, /* 0x3C-0x3F */
+	0xBA, 0xC5, 0x94, 0x87, 0x94, 0x88, 0x94, 0x89, /* 0x40-0x43 */
+	0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x44-0x47 */
+	0xBA, 0xC6, 0xBA, 0xC7, 0x94, 0x8E, 0x94, 0x8F, /* 0x48-0x4B */
+	0xBA, 0xC8, 0x94, 0x90, 0x94, 0x91, 0x94, 0x92, /* 0x4C-0x4F */
+	0xBA, 0xC9, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x50-0x53 */
+	0x94, 0x96, 0x94, 0x97, 0x94, 0x98, 0x94, 0x99, /* 0x54-0x57 */
+	0xBA, 0xCA, 0xBA, 0xCB, 0x94, 0x9A, 0x94, 0x9B, /* 0x58-0x5B */
+	0x94, 0x9C, 0x94, 0x9D, 0x94, 0x9E, 0x94, 0x9F, /* 0x5C-0x5F */
+	0x94, 0xA0, 0x94, 0xA1, 0x94, 0xA2, 0x94, 0xA3, /* 0x60-0x63 */
+	0xBA, 0xCC, 0x94, 0xA4, 0x94, 0xA5, 0x94, 0xA6, /* 0x64-0x67 */
+	0xBA, 0xCD, 0x94, 0xA7, 0x94, 0xA8, 0x94, 0xA9, /* 0x68-0x6B */
+	0x94, 0xAA, 0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, /* 0x6C-0x6F */
+	0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, 0x94, 0xB1, /* 0x70-0x73 */
+	0x94, 0xB2, 0x94, 0xB3, 0x94, 0xB4, 0x94, 0xB5, /* 0x74-0x77 */
+	0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, 0x94, 0xB9, /* 0x78-0x7B */
+	0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, 0x94, 0xBD, /* 0x7C-0x7F */
+	
+	0xBA, 0xCE, 0xBA, 0xCF, 0x94, 0xBE, 0x94, 0xBF, /* 0x80-0x83 */
+	0xBA, 0xD0, 0x94, 0xC0, 0x94, 0xC1, 0xBA, 0xD1, /* 0x84-0x87 */
+	0xBA, 0xD2, 0xBA, 0xD3, 0xBA, 0xD4, 0x94, 0xC2, /* 0x88-0x8B */
+	0x94, 0xC3, 0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, /* 0x8C-0x8F */
+	0xBA, 0xD5, 0xBA, 0xD6, 0x94, 0xC7, 0xBA, 0xD7, /* 0x90-0x93 */
+	0x94, 0xC8, 0xBA, 0xD8, 0x94, 0xC9, 0x94, 0xCA, /* 0x94-0x97 */
+	0x94, 0xCB, 0xBA, 0xD9, 0xBA, 0xDA, 0x94, 0xCC, /* 0x98-0x9B */
+	0xBA, 0xDB, 0x94, 0xCD, 0x94, 0xCE, 0x94, 0xCF, /* 0x9C-0x9F */
+	0x94, 0xD0, 0x94, 0xD1, 0x94, 0xD2, 0x94, 0xD3, /* 0xA0-0xA3 */
+	0xBA, 0xDC, 0x94, 0xD4, 0x94, 0xD5, 0x94, 0xD6, /* 0xA4-0xA7 */
+	0x94, 0xD7, 0x94, 0xD8, 0x94, 0xD9, 0x94, 0xDA, /* 0xA8-0xAB */
+	0x94, 0xDB, 0x94, 0xDC, 0x94, 0xDD, 0x94, 0xDE, /* 0xAC-0xAF */
+	0xBA, 0xDD, 0x94, 0xDF, 0x94, 0xE0, 0x94, 0xE1, /* 0xB0-0xB3 */
+	0x94, 0xE2, 0x94, 0xE3, 0x94, 0xE4, 0x94, 0xE5, /* 0xB4-0xB7 */
+	0xBA, 0xDE, 0x94, 0xE6, 0x94, 0xE7, 0x94, 0xE8, /* 0xB8-0xBB */
+	0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0x94, 0xEC, /* 0xBC-0xBF */
+	0x94, 0xED, 0x94, 0xEE, 0x94, 0xEF, 0x94, 0xF0, /* 0xC0-0xC3 */
+	0x94, 0xF1, 0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, /* 0xC4-0xC7 */
+	0x94, 0xF5, 0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, /* 0xC8-0xCB */
+	0x94, 0xF9, 0x94, 0xFA, 0x94, 0xFB, 0x94, 0xFC, /* 0xCC-0xCF */
+	0x94, 0xFD, 0x94, 0xFE, 0x95, 0x41, 0x95, 0x42, /* 0xD0-0xD3 */
+	0xBA, 0xDF, 0xBA, 0xE0, 0x95, 0x43, 0x95, 0x44, /* 0xD4-0xD7 */
+	0xBA, 0xE1, 0x95, 0x45, 0x95, 0x46, 0x95, 0x47, /* 0xD8-0xDB */
+	0xBA, 0xE2, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xDC-0xDF */
+	0x95, 0x4B, 0x95, 0x4C, 0x95, 0x4D, 0x95, 0x4E, /* 0xE0-0xE3 */
+	0x95, 0x4F, 0x95, 0x50, 0x95, 0x51, 0x95, 0x52, /* 0xE4-0xE7 */
+	0x95, 0x53, 0xBA, 0xE3, 0x95, 0x54, 0x95, 0x55, /* 0xE8-0xEB */
+	0x95, 0x56, 0x95, 0x57, 0x95, 0x58, 0x95, 0x59, /* 0xEC-0xEF */
+	0xBA, 0xE4, 0x95, 0x5A, 0x95, 0x61, 0x95, 0x62, /* 0xF0-0xF3 */
+	0xBA, 0xE5, 0x95, 0x63, 0x95, 0x64, 0x95, 0x65, /* 0xF4-0xF7 */
+	0xBA, 0xE6, 0x95, 0x66, 0x95, 0x67, 0x95, 0x68, /* 0xF8-0xFB */
+	0x95, 0x69, 0x95, 0x6A, 0x95, 0x6B, 0x95, 0x6C, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_BE[512] = {
+	0xBA, 0xE7, 0x95, 0x6D, 0x95, 0x6E, 0xBA, 0xE8, /* 0x00-0x03 */
+	0x95, 0x6F, 0xBA, 0xE9, 0x95, 0x70, 0x95, 0x71, /* 0x04-0x07 */
+	0x95, 0x72, 0x95, 0x73, 0x95, 0x74, 0x95, 0x75, /* 0x08-0x0B */
+	0xBA, 0xEA, 0xBA, 0xEB, 0x95, 0x76, 0x95, 0x77, /* 0x0C-0x0F */
+	0xBA, 0xEC, 0x95, 0x78, 0x95, 0x79, 0x95, 0x7A, /* 0x10-0x13 */
+	0xBA, 0xED, 0x95, 0x81, 0x95, 0x82, 0x95, 0x83, /* 0x14-0x17 */
+	0x95, 0x84, 0x95, 0x85, 0x95, 0x86, 0x95, 0x87, /* 0x18-0x1B */
+	0xBA, 0xEE, 0xBA, 0xEF, 0x95, 0x88, 0xBA, 0xF0, /* 0x1C-0x1F */
+	0x95, 0x89, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x20-0x23 */
+	0x95, 0x8D, 0x95, 0x8E, 0x95, 0x8F, 0x95, 0x90, /* 0x24-0x27 */
+	0x95, 0x91, 0x95, 0x92, 0x95, 0x93, 0x95, 0x94, /* 0x28-0x2B */
+	0x95, 0x95, 0x95, 0x96, 0x95, 0x97, 0x95, 0x98, /* 0x2C-0x2F */
+	0x95, 0x99, 0x95, 0x9A, 0x95, 0x9B, 0x95, 0x9C, /* 0x30-0x33 */
+	0x95, 0x9D, 0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, /* 0x34-0x37 */
+	0x95, 0xA1, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x38-0x3B */
+	0x95, 0xA5, 0x95, 0xA6, 0x95, 0xA7, 0x95, 0xA8, /* 0x3C-0x3F */
+	0x95, 0xA9, 0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, /* 0x40-0x43 */
+	0xBA, 0xF1, 0xBA, 0xF2, 0x95, 0xAD, 0x95, 0xAE, /* 0x44-0x47 */
+	0xBA, 0xF3, 0x95, 0xAF, 0x95, 0xB0, 0x95, 0xB1, /* 0x48-0x4B */
+	0xBA, 0xF4, 0x95, 0xB2, 0xBA, 0xF5, 0x95, 0xB3, /* 0x4C-0x4F */
+	0x95, 0xB4, 0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, /* 0x50-0x53 */
+	0xBA, 0xF6, 0xBA, 0xF7, 0x95, 0xB8, 0xBA, 0xF8, /* 0x54-0x57 */
+	0x95, 0xB9, 0xBA, 0xF9, 0xBA, 0xFA, 0xBA, 0xFB, /* 0x58-0x5B */
+	0x95, 0xBA, 0x95, 0xBB, 0x95, 0xBC, 0x95, 0xBD, /* 0x5C-0x5F */
+	0xBA, 0xFC, 0xBA, 0xFD, 0x95, 0xBE, 0x95, 0xBF, /* 0x60-0x63 */
+	0xBA, 0xFE, 0x95, 0xC0, 0x95, 0xC1, 0x95, 0xC2, /* 0x64-0x67 */
+	0xBB, 0xA1, 0x95, 0xC3, 0xBB, 0xA2, 0x95, 0xC4, /* 0x68-0x6B */
+	0x95, 0xC5, 0x95, 0xC6, 0x95, 0xC7, 0x95, 0xC8, /* 0x6C-0x6F */
+	0xBB, 0xA3, 0xBB, 0xA4, 0x95, 0xC9, 0xBB, 0xA5, /* 0x70-0x73 */
+	0xBB, 0xA6, 0xBB, 0xA7, 0x95, 0xCA, 0x95, 0xCB, /* 0x74-0x77 */
+	0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, 0xBB, 0xA8, /* 0x78-0x7B */
+	0xBB, 0xA9, 0xBB, 0xAA, 0x95, 0xCF, 0x95, 0xD0, /* 0x7C-0x7F */
+	
+	0xBB, 0xAB, 0x95, 0xD1, 0x95, 0xD2, 0x95, 0xD3, /* 0x80-0x83 */
+	0xBB, 0xAC, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0x84-0x87 */
+	0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0x88-0x8B */
+	0xBB, 0xAD, 0xBB, 0xAE, 0x95, 0xDB, 0xBB, 0xAF, /* 0x8C-0x8F */
+	0xBB, 0xB0, 0xBB, 0xB1, 0x95, 0xDC, 0x95, 0xDD, /* 0x90-0x93 */
+	0x95, 0xDE, 0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, /* 0x94-0x97 */
+	0xBB, 0xB2, 0xBB, 0xB3, 0x95, 0xE2, 0x95, 0xE3, /* 0x98-0x9B */
+	0x95, 0xE4, 0x95, 0xE5, 0x95, 0xE6, 0x95, 0xE7, /* 0x9C-0x9F */
+	0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, 0x95, 0xEB, /* 0xA0-0xA3 */
+	0x95, 0xEC, 0x95, 0xED, 0x95, 0xEE, 0x95, 0xEF, /* 0xA4-0xA7 */
+	0xBB, 0xB4, 0x95, 0xF0, 0x95, 0xF1, 0x95, 0xF2, /* 0xA8-0xAB */
+	0x95, 0xF3, 0x95, 0xF4, 0x95, 0xF5, 0x95, 0xF6, /* 0xAC-0xAF */
+	0x95, 0xF7, 0x95, 0xF8, 0x95, 0xF9, 0x95, 0xFA, /* 0xB0-0xB3 */
+	0x95, 0xFB, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0xB4-0xB7 */
+	0x96, 0x41, 0x96, 0x42, 0x96, 0x43, 0x96, 0x44, /* 0xB8-0xBB */
+	0x96, 0x45, 0x96, 0x46, 0x96, 0x47, 0x96, 0x48, /* 0xBC-0xBF */
+	0x96, 0x49, 0x96, 0x4A, 0x96, 0x4B, 0x96, 0x4C, /* 0xC0-0xC3 */
+	0x96, 0x4D, 0x96, 0x4E, 0x96, 0x4F, 0x96, 0x50, /* 0xC4-0xC7 */
+	0x96, 0x51, 0x96, 0x52, 0x96, 0x53, 0x96, 0x54, /* 0xC8-0xCB */
+	0x96, 0x55, 0x96, 0x56, 0x96, 0x57, 0x96, 0x58, /* 0xCC-0xCF */
+	0xBB, 0xB5, 0xBB, 0xB6, 0x96, 0x59, 0x96, 0x5A, /* 0xD0-0xD3 */
+	0xBB, 0xB7, 0x96, 0x61, 0x96, 0x62, 0xBB, 0xB8, /* 0xD4-0xD7 */
+	0xBB, 0xB9, 0x96, 0x63, 0x96, 0x64, 0x96, 0x65, /* 0xD8-0xDB */
+	0x96, 0x66, 0x96, 0x67, 0x96, 0x68, 0x96, 0x69, /* 0xDC-0xDF */
+	0xBB, 0xBA, 0x96, 0x6A, 0x96, 0x6B, 0xBB, 0xBB, /* 0xE0-0xE3 */
+	0xBB, 0xBC, 0xBB, 0xBD, 0x96, 0x6C, 0x96, 0x6D, /* 0xE4-0xE7 */
+	0x96, 0x6E, 0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, /* 0xE8-0xEB */
+	0xBB, 0xBE, 0x96, 0x72, 0x96, 0x73, 0x96, 0x74, /* 0xEC-0xEF */
+	0x96, 0x75, 0x96, 0x76, 0x96, 0x77, 0x96, 0x78, /* 0xF0-0xF3 */
+	0x96, 0x79, 0x96, 0x7A, 0x96, 0x81, 0x96, 0x82, /* 0xF4-0xF7 */
+	0x96, 0x83, 0x96, 0x84, 0x96, 0x85, 0x96, 0x86, /* 0xF8-0xFB */
+	0x96, 0x87, 0x96, 0x88, 0x96, 0x89, 0x96, 0x8A, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_BF[512] = {
+	0x96, 0x8B, 0xBB, 0xBF, 0x96, 0x8C, 0x96, 0x8D, /* 0x00-0x03 */
+	0x96, 0x8E, 0x96, 0x8F, 0x96, 0x90, 0x96, 0x91, /* 0x04-0x07 */
+	0xBB, 0xC0, 0xBB, 0xC1, 0x96, 0x92, 0x96, 0x93, /* 0x08-0x0B */
+	0x96, 0x94, 0x96, 0x95, 0x96, 0x96, 0x96, 0x97, /* 0x0C-0x0F */
+	0x96, 0x98, 0x96, 0x99, 0x96, 0x9A, 0x96, 0x9B, /* 0x10-0x13 */
+	0x96, 0x9C, 0x96, 0x9D, 0x96, 0x9E, 0x96, 0x9F, /* 0x14-0x17 */
+	0xBB, 0xC2, 0xBB, 0xC3, 0x96, 0xA0, 0xBB, 0xC4, /* 0x18-0x1B */
+	0xBB, 0xC5, 0xBB, 0xC6, 0x96, 0xA1, 0x96, 0xA2, /* 0x1C-0x1F */
+	0x96, 0xA3, 0x96, 0xA4, 0x96, 0xA5, 0x96, 0xA6, /* 0x20-0x23 */
+	0x96, 0xA7, 0x96, 0xA8, 0x96, 0xA9, 0x96, 0xAA, /* 0x24-0x27 */
+	0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, 0x96, 0xAE, /* 0x28-0x2B */
+	0x96, 0xAF, 0x96, 0xB0, 0x96, 0xB1, 0x96, 0xB2, /* 0x2C-0x2F */
+	0x96, 0xB3, 0x96, 0xB4, 0x96, 0xB5, 0x96, 0xB6, /* 0x30-0x33 */
+	0x96, 0xB7, 0x96, 0xB8, 0x96, 0xB9, 0x96, 0xBA, /* 0x34-0x37 */
+	0x96, 0xBB, 0x96, 0xBC, 0x96, 0xBD, 0x96, 0xBE, /* 0x38-0x3B */
+	0x96, 0xBF, 0x96, 0xC0, 0x96, 0xC1, 0x96, 0xC2, /* 0x3C-0x3F */
+	0xBB, 0xC7, 0xBB, 0xC8, 0x96, 0xC3, 0x96, 0xC4, /* 0x40-0x43 */
+	0xBB, 0xC9, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0x44-0x47 */
+	0xBB, 0xCA, 0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, /* 0x48-0x4B */
+	0x96, 0xCB, 0x96, 0xCC, 0x96, 0xCD, 0x96, 0xCE, /* 0x4C-0x4F */
+	0xBB, 0xCB, 0xBB, 0xCC, 0x96, 0xCF, 0x96, 0xD0, /* 0x50-0x53 */
+	0x96, 0xD1, 0xBB, 0xCD, 0x96, 0xD2, 0x96, 0xD3, /* 0x54-0x57 */
+	0x96, 0xD4, 0x96, 0xD5, 0x96, 0xD6, 0x96, 0xD7, /* 0x58-0x5B */
+	0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x5C-0x5F */
+	0x96, 0xDC, 0x96, 0xDD, 0x96, 0xDE, 0x96, 0xDF, /* 0x60-0x63 */
+	0x96, 0xE0, 0x96, 0xE1, 0x96, 0xE2, 0x96, 0xE3, /* 0x64-0x67 */
+	0x96, 0xE4, 0x96, 0xE5, 0x96, 0xE6, 0x96, 0xE7, /* 0x68-0x6B */
+	0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x6C-0x6F */
+	0x96, 0xEC, 0x96, 0xED, 0x96, 0xEE, 0x96, 0xEF, /* 0x70-0x73 */
+	0x96, 0xF0, 0x96, 0xF1, 0x96, 0xF2, 0x96, 0xF3, /* 0x74-0x77 */
+	0x96, 0xF4, 0x96, 0xF5, 0x96, 0xF6, 0x96, 0xF7, /* 0x78-0x7B */
+	0x96, 0xF8, 0x96, 0xF9, 0x96, 0xFA, 0x96, 0xFB, /* 0x7C-0x7F */
+	
+	0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, 0x97, 0x41, /* 0x80-0x83 */
+	0x97, 0x42, 0x97, 0x43, 0x97, 0x44, 0x97, 0x45, /* 0x84-0x87 */
+	0x97, 0x46, 0x97, 0x47, 0x97, 0x48, 0x97, 0x49, /* 0x88-0x8B */
+	0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, 0x97, 0x4D, /* 0x8C-0x8F */
+	0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x90-0x93 */
+	0xBB, 0xCE, 0x97, 0x52, 0x97, 0x53, 0x97, 0x54, /* 0x94-0x97 */
+	0x97, 0x55, 0x97, 0x56, 0x97, 0x57, 0x97, 0x58, /* 0x98-0x9B */
+	0x97, 0x59, 0x97, 0x5A, 0x97, 0x61, 0x97, 0x62, /* 0x9C-0x9F */
+	0x97, 0x63, 0x97, 0x64, 0x97, 0x65, 0x97, 0x66, /* 0xA0-0xA3 */
+	0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0xA4-0xA7 */
+	0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0xA8-0xAB */
+	0x97, 0x6F, 0x97, 0x70, 0x97, 0x71, 0x97, 0x72, /* 0xAC-0xAF */
+	0xBB, 0xCF, 0x97, 0x73, 0x97, 0x74, 0x97, 0x75, /* 0xB0-0xB3 */
+	0x97, 0x76, 0x97, 0x77, 0x97, 0x78, 0x97, 0x79, /* 0xB4-0xB7 */
+	0x97, 0x7A, 0x97, 0x81, 0x97, 0x82, 0x97, 0x83, /* 0xB8-0xBB */
+	0x97, 0x84, 0x97, 0x85, 0x97, 0x86, 0x97, 0x87, /* 0xBC-0xBF */
+	0x97, 0x88, 0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, /* 0xC0-0xC3 */
+	0x97, 0x8C, 0xBB, 0xD0, 0x97, 0x8D, 0x97, 0x8E, /* 0xC4-0xC7 */
+	0x97, 0x8F, 0x97, 0x90, 0x97, 0x91, 0x97, 0x92, /* 0xC8-0xCB */
+	0xBB, 0xD1, 0xBB, 0xD2, 0x97, 0x93, 0x97, 0x94, /* 0xCC-0xCF */
+	0xBB, 0xD3, 0x97, 0x95, 0x97, 0x96, 0x97, 0x97, /* 0xD0-0xD3 */
+	0xBB, 0xD4, 0x97, 0x98, 0x97, 0x99, 0x97, 0x9A, /* 0xD4-0xD7 */
+	0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, 0x97, 0x9E, /* 0xD8-0xDB */
+	0xBB, 0xD5, 0x97, 0x9F, 0x97, 0xA0, 0xBB, 0xD6, /* 0xDC-0xDF */
+	0x97, 0xA1, 0xBB, 0xD7, 0x97, 0xA2, 0x97, 0xA3, /* 0xE0-0xE3 */
+	0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE4-0xE7 */
+	0x97, 0xA8, 0x97, 0xA9, 0x97, 0xAA, 0x97, 0xAB, /* 0xE8-0xEB */
+	0x97, 0xAC, 0x97, 0xAD, 0x97, 0xAE, 0x97, 0xAF, /* 0xEC-0xEF */
+	0x97, 0xB0, 0x97, 0xB1, 0x97, 0xB2, 0x97, 0xB3, /* 0xF0-0xF3 */
+	0x97, 0xB4, 0x97, 0xB5, 0x97, 0xB6, 0x97, 0xB7, /* 0xF4-0xF7 */
+	0x97, 0xB8, 0x97, 0xB9, 0x97, 0xBA, 0x97, 0xBB, /* 0xF8-0xFB */
+	0x97, 0xBC, 0x97, 0xBD, 0x97, 0xBE, 0x97, 0xBF, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C0[512] = {
+	0x97, 0xC0, 0x97, 0xC1, 0x97, 0xC2, 0x97, 0xC3, /* 0x00-0x03 */
+	0x97, 0xC4, 0x97, 0xC5, 0x97, 0xC6, 0x97, 0xC7, /* 0x04-0x07 */
+	0x97, 0xC8, 0x97, 0xC9, 0x97, 0xCA, 0x97, 0xCB, /* 0x08-0x0B */
+	0x97, 0xCC, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x0C-0x0F */
+	0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, 0x97, 0xD3, /* 0x10-0x13 */
+	0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, 0x97, 0xD7, /* 0x14-0x17 */
+	0x97, 0xD8, 0x97, 0xD9, 0x97, 0xDA, 0x97, 0xDB, /* 0x18-0x1B */
+	0x97, 0xDC, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x1C-0x1F */
+	0x97, 0xE0, 0x97, 0xE1, 0x97, 0xE2, 0x97, 0xE3, /* 0x20-0x23 */
+	0x97, 0xE4, 0x97, 0xE5, 0x97, 0xE6, 0x97, 0xE7, /* 0x24-0x27 */
+	0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x28-0x2B */
+	0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x2C-0x2F */
+	0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x30-0x33 */
+	0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x34-0x37 */
+	0x97, 0xF8, 0x97, 0xF9, 0x97, 0xFA, 0x97, 0xFB, /* 0x38-0x3B */
+	0xBB, 0xD8, 0x97, 0xFC, 0x97, 0xFD, 0x97, 0xFE, /* 0x3C-0x3F */
+	0x98, 0x41, 0x98, 0x42, 0x98, 0x43, 0x98, 0x44, /* 0x40-0x43 */
+	0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0x98, 0x48, /* 0x44-0x47 */
+	0x98, 0x49, 0x98, 0x4A, 0x98, 0x4B, 0x98, 0x4C, /* 0x48-0x4B */
+	0x98, 0x4D, 0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, /* 0x4C-0x4F */
+	0x98, 0x51, 0xBB, 0xD9, 0x98, 0x52, 0x98, 0x53, /* 0x50-0x53 */
+	0x98, 0x54, 0x98, 0x55, 0x98, 0x56, 0x98, 0x57, /* 0x54-0x57 */
+	0xBB, 0xDA, 0x98, 0x58, 0x98, 0x59, 0x98, 0x5A, /* 0x58-0x5B */
+	0xBB, 0xDB, 0x98, 0x61, 0x98, 0x62, 0x98, 0x63, /* 0x5C-0x5F */
+	0xBB, 0xDC, 0x98, 0x64, 0x98, 0x65, 0x98, 0x66, /* 0x60-0x63 */
+	0x98, 0x67, 0x98, 0x68, 0x98, 0x69, 0x98, 0x6A, /* 0x64-0x67 */
+	0xBB, 0xDD, 0xBB, 0xDE, 0x98, 0x6B, 0x98, 0x6C, /* 0x68-0x6B */
+	0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0x98, 0x70, /* 0x6C-0x6F */
+	0x98, 0x71, 0x98, 0x72, 0x98, 0x73, 0x98, 0x74, /* 0x70-0x73 */
+	0x98, 0x75, 0x98, 0x76, 0x98, 0x77, 0x98, 0x78, /* 0x74-0x77 */
+	0x98, 0x79, 0x98, 0x7A, 0x98, 0x81, 0x98, 0x82, /* 0x78-0x7B */
+	0x98, 0x83, 0x98, 0x84, 0x98, 0x85, 0x98, 0x86, /* 0x7C-0x7F */
+	
+	0x98, 0x87, 0x98, 0x88, 0x98, 0x89, 0x98, 0x8A, /* 0x80-0x83 */
+	0x98, 0x8B, 0x98, 0x8C, 0x98, 0x8D, 0x98, 0x8E, /* 0x84-0x87 */
+	0x98, 0x8F, 0x98, 0x90, 0x98, 0x91, 0x98, 0x92, /* 0x88-0x8B */
+	0x98, 0x93, 0x98, 0x94, 0x98, 0x95, 0x98, 0x96, /* 0x8C-0x8F */
+	0xBB, 0xDF, 0xBB, 0xE0, 0x98, 0x97, 0x98, 0x98, /* 0x90-0x93 */
+	0xBB, 0xE1, 0x98, 0x99, 0x98, 0x9A, 0x98, 0x9B, /* 0x94-0x97 */
+	0xBB, 0xE2, 0x98, 0x9C, 0x98, 0x9D, 0x98, 0x9E, /* 0x98-0x9B */
+	0x98, 0x9F, 0x98, 0xA0, 0x98, 0xA1, 0x98, 0xA2, /* 0x9C-0x9F */
+	0xBB, 0xE3, 0xBB, 0xE4, 0x98, 0xA3, 0xBB, 0xE5, /* 0xA0-0xA3 */
+	0x98, 0xA4, 0xBB, 0xE6, 0x98, 0xA5, 0x98, 0xA6, /* 0xA4-0xA7 */
+	0x98, 0xA7, 0x98, 0xA8, 0x98, 0xA9, 0x98, 0xAA, /* 0xA8-0xAB */
+	0xBB, 0xE7, 0xBB, 0xE8, 0x98, 0xAB, 0xBB, 0xE9, /* 0xAC-0xAF */
+	0xBB, 0xEA, 0x98, 0xAC, 0x98, 0xAD, 0xBB, 0xEB, /* 0xB0-0xB3 */
+	0xBB, 0xEC, 0xBB, 0xED, 0xBB, 0xEE, 0x98, 0xAE, /* 0xB4-0xB7 */
+	0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xB8-0xBB */
+	0xBB, 0xEF, 0xBB, 0xF0, 0x98, 0xB3, 0xBB, 0xF1, /* 0xBC-0xBF */
+	0xBB, 0xF2, 0xBB, 0xF3, 0x98, 0xB4, 0x98, 0xB5, /* 0xC0-0xC3 */
+	0x98, 0xB6, 0xBB, 0xF4, 0x98, 0xB7, 0x98, 0xB8, /* 0xC4-0xC7 */
+	0xBB, 0xF5, 0xBB, 0xF6, 0x98, 0xB9, 0x98, 0xBA, /* 0xC8-0xCB */
+	0xBB, 0xF7, 0x98, 0xBB, 0x98, 0xBC, 0x98, 0xBD, /* 0xCC-0xCF */
+	0xBB, 0xF8, 0x98, 0xBE, 0x98, 0xBF, 0x98, 0xC0, /* 0xD0-0xD3 */
+	0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, 0x98, 0xC4, /* 0xD4-0xD7 */
+	0xBB, 0xF9, 0xBB, 0xFA, 0x98, 0xC5, 0xBB, 0xFB, /* 0xD8-0xDB */
+	0xBB, 0xFC, 0xBB, 0xFD, 0x98, 0xC6, 0x98, 0xC7, /* 0xDC-0xDF */
+	0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0x98, 0xCB, /* 0xE0-0xE3 */
+	0xBB, 0xFE, 0xBC, 0xA1, 0x98, 0xCC, 0x98, 0xCD, /* 0xE4-0xE7 */
+	0xBC, 0xA2, 0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, /* 0xE8-0xEB */
+	0xBC, 0xA3, 0x98, 0xD1, 0x98, 0xD2, 0x98, 0xD3, /* 0xEC-0xEF */
+	0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0xF0-0xF3 */
+	0xBC, 0xA4, 0xBC, 0xA5, 0x98, 0xD8, 0xBC, 0xA6, /* 0xF4-0xF7 */
+	0x98, 0xD9, 0xBC, 0xA7, 0x98, 0xDA, 0x98, 0xDB, /* 0xF8-0xFB */
+	0x98, 0xDC, 0x98, 0xDD, 0x98, 0xDE, 0x98, 0xDF, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C1[512] = {
+	0xBC, 0xA8, 0x98, 0xE0, 0x98, 0xE1, 0x98, 0xE2, /* 0x00-0x03 */
+	0xBC, 0xA9, 0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, /* 0x04-0x07 */
+	0xBC, 0xAA, 0x98, 0xE6, 0x98, 0xE7, 0x98, 0xE8, /* 0x08-0x0B */
+	0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x0C-0x0F */
+	0xBC, 0xAB, 0x98, 0xED, 0x98, 0xEE, 0x98, 0xEF, /* 0x10-0x13 */
+	0x98, 0xF0, 0xBC, 0xAC, 0x98, 0xF1, 0x98, 0xF2, /* 0x14-0x17 */
+	0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x18-0x1B */
+	0xBC, 0xAD, 0xBC, 0xAE, 0xBC, 0xAF, 0xBC, 0xB0, /* 0x1C-0x1F */
+	0xBC, 0xB1, 0x98, 0xF7, 0x98, 0xF8, 0xBC, 0xB2, /* 0x20-0x23 */
+	0xBC, 0xB3, 0x98, 0xF9, 0xBC, 0xB4, 0xBC, 0xB5, /* 0x24-0x27 */
+	0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x28-0x2B */
+	0xBC, 0xB6, 0xBC, 0xB7, 0x98, 0xFE, 0xBC, 0xB8, /* 0x2C-0x2F */
+	0xBC, 0xB9, 0xBC, 0xBA, 0x99, 0x41, 0x99, 0x42, /* 0x30-0x33 */
+	0x99, 0x43, 0x99, 0x44, 0xBC, 0xBB, 0x99, 0x45, /* 0x34-0x37 */
+	0xBC, 0xBC, 0xBC, 0xBD, 0x99, 0x46, 0x99, 0x47, /* 0x38-0x3B */
+	0xBC, 0xBE, 0x99, 0x48, 0x99, 0x49, 0x99, 0x4A, /* 0x3C-0x3F */
+	0xBC, 0xBF, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x40-0x43 */
+	0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x44-0x47 */
+	0xBC, 0xC0, 0xBC, 0xC1, 0x99, 0x52, 0xBC, 0xC2, /* 0x48-0x4B */
+	0xBC, 0xC3, 0xBC, 0xC4, 0x99, 0x53, 0x99, 0x54, /* 0x4C-0x4F */
+	0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x50-0x53 */
+	0xBC, 0xC5, 0xBC, 0xC6, 0x99, 0x59, 0x99, 0x5A, /* 0x54-0x57 */
+	0xBC, 0xC7, 0x99, 0x61, 0x99, 0x62, 0x99, 0x63, /* 0x58-0x5B */
+	0xBC, 0xC8, 0x99, 0x64, 0x99, 0x65, 0x99, 0x66, /* 0x5C-0x5F */
+	0x99, 0x67, 0x99, 0x68, 0x99, 0x69, 0x99, 0x6A, /* 0x60-0x63 */
+	0xBC, 0xC9, 0xBC, 0xCA, 0x99, 0x6B, 0xBC, 0xCB, /* 0x64-0x67 */
+	0xBC, 0xCC, 0xBC, 0xCD, 0x99, 0x6C, 0x99, 0x6D, /* 0x68-0x6B */
+	0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0x99, 0x71, /* 0x6C-0x6F */
+	0xBC, 0xCE, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x70-0x73 */
+	0xBC, 0xCF, 0x99, 0x75, 0x99, 0x76, 0x99, 0x77, /* 0x74-0x77 */
+	0xBC, 0xD0, 0x99, 0x78, 0x99, 0x79, 0x99, 0x7A, /* 0x78-0x7B */
+	0x99, 0x81, 0x99, 0x82, 0x99, 0x83, 0x99, 0x84, /* 0x7C-0x7F */
+	
+	0x99, 0x85, 0x99, 0x86, 0x99, 0x87, 0x99, 0x88, /* 0x80-0x83 */
+	0x99, 0x89, 0xBC, 0xD1, 0x99, 0x8A, 0x99, 0x8B, /* 0x84-0x87 */
+	0x99, 0x8C, 0x99, 0x8D, 0x99, 0x8E, 0x99, 0x8F, /* 0x88-0x8B */
+	0xBC, 0xD2, 0xBC, 0xD3, 0xBC, 0xD4, 0x99, 0x90, /* 0x8C-0x8F */
+	0xBC, 0xD5, 0x99, 0x91, 0x99, 0x92, 0x99, 0x93, /* 0x90-0x93 */
+	0xBC, 0xD6, 0x99, 0x94, 0xBC, 0xD7, 0x99, 0x95, /* 0x94-0x97 */
+	0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0x98-0x9B */
+	0xBC, 0xD8, 0xBC, 0xD9, 0x99, 0x9A, 0xBC, 0xDA, /* 0x9C-0x9F */
+	0x99, 0x9B, 0xBC, 0xDB, 0x99, 0x9C, 0x99, 0x9D, /* 0xA0-0xA3 */
+	0x99, 0x9E, 0xBC, 0xDC, 0x99, 0x9F, 0x99, 0xA0, /* 0xA4-0xA7 */
+	0xBC, 0xDD, 0xBC, 0xDE, 0x99, 0xA1, 0x99, 0xA2, /* 0xA8-0xAB */
+	0xBC, 0xDF, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xAC-0xAF */
+	0xBC, 0xE0, 0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, /* 0xB0-0xB3 */
+	0x99, 0xA9, 0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, /* 0xB4-0xB7 */
+	0x99, 0xAD, 0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, /* 0xB8-0xBB */
+	0x99, 0xB1, 0xBC, 0xE1, 0x99, 0xB2, 0x99, 0xB3, /* 0xBC-0xBF */
+	0x99, 0xB4, 0x99, 0xB5, 0x99, 0xB6, 0x99, 0xB7, /* 0xC0-0xC3 */
+	0xBC, 0xE2, 0x99, 0xB8, 0x99, 0xB9, 0x99, 0xBA, /* 0xC4-0xC7 */
+	0xBC, 0xE3, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xC8-0xCB */
+	0xBC, 0xE4, 0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, /* 0xCC-0xCF */
+	0x99, 0xC1, 0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, /* 0xD0-0xD3 */
+	0xBC, 0xE5, 0x99, 0xC5, 0x99, 0xC6, 0xBC, 0xE6, /* 0xD4-0xD7 */
+	0xBC, 0xE7, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xD8-0xDB */
+	0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xDC-0xDF */
+	0xBC, 0xE8, 0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, /* 0xE0-0xE3 */
+	0xBC, 0xE9, 0x99, 0xD1, 0x99, 0xD2, 0x99, 0xD3, /* 0xE4-0xE7 */
+	0xBC, 0xEA, 0x99, 0xD4, 0x99, 0xD5, 0x99, 0xD6, /* 0xE8-0xEB */
+	0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, 0x99, 0xDA, /* 0xEC-0xEF */
+	0xBC, 0xEB, 0xBC, 0xEC, 0x99, 0xDB, 0xBC, 0xED, /* 0xF0-0xF3 */
+	0x99, 0xDC, 0x99, 0xDD, 0x99, 0xDE, 0x99, 0xDF, /* 0xF4-0xF7 */
+	0x99, 0xE0, 0x99, 0xE1, 0x99, 0xE2, 0x99, 0xE3, /* 0xF8-0xFB */
+	0xBC, 0xEE, 0xBC, 0xEF, 0x99, 0xE4, 0x99, 0xE5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C2[512] = {
+	0xBC, 0xF0, 0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, /* 0x00-0x03 */
+	0xBC, 0xF1, 0x99, 0xE9, 0x99, 0xEA, 0x99, 0xEB, /* 0x04-0x07 */
+	0x99, 0xEC, 0x99, 0xED, 0x99, 0xEE, 0x99, 0xEF, /* 0x08-0x0B */
+	0xBC, 0xF2, 0xBC, 0xF3, 0x99, 0xF0, 0xBC, 0xF4, /* 0x0C-0x0F */
+	0x99, 0xF1, 0xBC, 0xF5, 0x99, 0xF2, 0x99, 0xF3, /* 0x10-0x13 */
+	0x99, 0xF4, 0x99, 0xF5, 0x99, 0xF6, 0x99, 0xF7, /* 0x14-0x17 */
+	0xBC, 0xF6, 0xBC, 0xF7, 0x99, 0xF8, 0x99, 0xF9, /* 0x18-0x1B */
+	0xBC, 0xF8, 0x99, 0xFA, 0x99, 0xFB, 0xBC, 0xF9, /* 0x1C-0x1F */
+	0xBC, 0xFA, 0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, /* 0x20-0x23 */
+	0x9A, 0x41, 0x9A, 0x42, 0x9A, 0x43, 0x9A, 0x44, /* 0x24-0x27 */
+	0xBC, 0xFB, 0xBC, 0xFC, 0x9A, 0x45, 0xBC, 0xFD, /* 0x28-0x2B */
+	0x9A, 0x46, 0xBC, 0xFE, 0x9A, 0x47, 0xBD, 0xA1, /* 0x2C-0x2F */
+	0x9A, 0x48, 0xBD, 0xA2, 0xBD, 0xA3, 0x9A, 0x49, /* 0x30-0x33 */
+	0xBD, 0xA4, 0x9A, 0x4A, 0x9A, 0x4B, 0x9A, 0x4C, /* 0x34-0x37 */
+	0x9A, 0x4D, 0x9A, 0x4E, 0x9A, 0x4F, 0x9A, 0x50, /* 0x38-0x3B */
+	0x9A, 0x51, 0x9A, 0x52, 0x9A, 0x53, 0x9A, 0x54, /* 0x3C-0x3F */
+	0x9A, 0x55, 0x9A, 0x56, 0x9A, 0x57, 0x9A, 0x58, /* 0x40-0x43 */
+	0x9A, 0x59, 0x9A, 0x5A, 0x9A, 0x61, 0x9A, 0x62, /* 0x44-0x47 */
+	0xBD, 0xA5, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x48-0x4B */
+	0x9A, 0x66, 0x9A, 0x67, 0x9A, 0x68, 0x9A, 0x69, /* 0x4C-0x4F */
+	0xBD, 0xA6, 0xBD, 0xA7, 0x9A, 0x6A, 0x9A, 0x6B, /* 0x50-0x53 */
+	0xBD, 0xA8, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x54-0x57 */
+	0xBD, 0xA9, 0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, /* 0x58-0x5B */
+	0x9A, 0x72, 0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, /* 0x5C-0x5F */
+	0xBD, 0xAA, 0x9A, 0x76, 0x9A, 0x77, 0x9A, 0x78, /* 0x60-0x63 */
+	0x9A, 0x79, 0xBD, 0xAB, 0x9A, 0x7A, 0x9A, 0x81, /* 0x64-0x67 */
+	0x9A, 0x82, 0x9A, 0x83, 0x9A, 0x84, 0x9A, 0x85, /* 0x68-0x6B */
+	0xBD, 0xAC, 0xBD, 0xAD, 0x9A, 0x86, 0x9A, 0x87, /* 0x6C-0x6F */
+	0xBD, 0xAE, 0x9A, 0x88, 0x9A, 0x89, 0x9A, 0x8A, /* 0x70-0x73 */
+	0xBD, 0xAF, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x74-0x77 */
+	0x9A, 0x8E, 0x9A, 0x8F, 0x9A, 0x90, 0x9A, 0x91, /* 0x78-0x7B */
+	0xBD, 0xB0, 0xBD, 0xB1, 0x9A, 0x92, 0xBD, 0xB2, /* 0x7C-0x7F */
+	
+	0x9A, 0x93, 0xBD, 0xB3, 0x9A, 0x94, 0x9A, 0x95, /* 0x80-0x83 */
+	0x9A, 0x96, 0x9A, 0x97, 0x9A, 0x98, 0x9A, 0x99, /* 0x84-0x87 */
+	0xBD, 0xB4, 0xBD, 0xB5, 0x9A, 0x9A, 0x9A, 0x9B, /* 0x88-0x8B */
+	0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0x9A, 0x9F, /* 0x8C-0x8F */
+	0xBD, 0xB6, 0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, /* 0x90-0x93 */
+	0x9A, 0xA3, 0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, /* 0x94-0x97 */
+	0xBD, 0xB7, 0x9A, 0xA7, 0x9A, 0xA8, 0xBD, 0xB8, /* 0x98-0x9B */
+	0x9A, 0xA9, 0xBD, 0xB9, 0x9A, 0xAA, 0x9A, 0xAB, /* 0x9C-0x9F */
+	0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0x9A, 0xAF, /* 0xA0-0xA3 */
+	0xBD, 0xBA, 0xBD, 0xBB, 0x9A, 0xB0, 0x9A, 0xB1, /* 0xA4-0xA7 */
+	0xBD, 0xBC, 0x9A, 0xB2, 0x9A, 0xB3, 0x9A, 0xB4, /* 0xA8-0xAB */
+	0xBD, 0xBD, 0xBD, 0xBE, 0x9A, 0xB5, 0x9A, 0xB6, /* 0xAC-0xAF */
+	0x9A, 0xB7, 0x9A, 0xB8, 0x9A, 0xB9, 0x9A, 0xBA, /* 0xB0-0xB3 */
+	0xBD, 0xBF, 0xBD, 0xC0, 0x9A, 0xBB, 0xBD, 0xC1, /* 0xB4-0xB7 */
+	0x9A, 0xBC, 0xBD, 0xC2, 0x9A, 0xBD, 0x9A, 0xBE, /* 0xB8-0xBB */
+	0x9A, 0xBF, 0x9A, 0xC0, 0x9A, 0xC1, 0x9A, 0xC2, /* 0xBC-0xBF */
+	0x9A, 0xC3, 0x9A, 0xC4, 0x9A, 0xC5, 0x9A, 0xC6, /* 0xC0-0xC3 */
+	0x9A, 0xC7, 0x9A, 0xC8, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xC4-0xC7 */
+	0x9A, 0xCB, 0x9A, 0xCC, 0x9A, 0xCD, 0x9A, 0xCE, /* 0xC8-0xCB */
+	0x9A, 0xCF, 0x9A, 0xD0, 0x9A, 0xD1, 0x9A, 0xD2, /* 0xCC-0xCF */
+	0x9A, 0xD3, 0x9A, 0xD4, 0x9A, 0xD5, 0x9A, 0xD6, /* 0xD0-0xD3 */
+	0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, 0x9A, 0xDA, /* 0xD4-0xD7 */
+	0x9A, 0xDB, 0x9A, 0xDC, 0x9A, 0xDD, 0x9A, 0xDE, /* 0xD8-0xDB */
+	0xBD, 0xC3, 0xBD, 0xC4, 0x9A, 0xDF, 0x9A, 0xE0, /* 0xDC-0xDF */
+	0xBD, 0xC5, 0x9A, 0xE1, 0x9A, 0xE2, 0xBD, 0xC6, /* 0xE0-0xE3 */
+	0xBD, 0xC7, 0x9A, 0xE3, 0x9A, 0xE4, 0x9A, 0xE5, /* 0xE4-0xE7 */
+	0x9A, 0xE6, 0x9A, 0xE7, 0x9A, 0xE8, 0xBD, 0xC8, /* 0xE8-0xEB */
+	0xBD, 0xC9, 0xBD, 0xCA, 0x9A, 0xE9, 0xBD, 0xCB, /* 0xEC-0xEF */
+	0x9A, 0xEA, 0xBD, 0xCC, 0x9A, 0xEB, 0x9A, 0xEC, /* 0xF0-0xF3 */
+	0x9A, 0xED, 0x9A, 0xEE, 0xBD, 0xCD, 0x9A, 0xEF, /* 0xF4-0xF7 */
+	0xBD, 0xCE, 0xBD, 0xCF, 0x9A, 0xF0, 0xBD, 0xD0, /* 0xF8-0xFB */
+	0xBD, 0xD1, 0x9A, 0xF1, 0x9A, 0xF2, 0x9A, 0xF3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C3[512] = {
+	0xBD, 0xD2, 0x9A, 0xF4, 0x9A, 0xF5, 0x9A, 0xF6, /* 0x00-0x03 */
+	0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, 0x9A, 0xFA, /* 0x04-0x07 */
+	0xBD, 0xD3, 0xBD, 0xD4, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x08-0x0B */
+	0xBD, 0xD5, 0xBD, 0xD6, 0x9A, 0xFD, 0x9A, 0xFE, /* 0x0C-0x0F */
+	0x9B, 0x41, 0x9B, 0x42, 0x9B, 0x43, 0xBD, 0xD7, /* 0x10-0x13 */
+	0xBD, 0xD8, 0xBD, 0xD9, 0x9B, 0x44, 0x9B, 0x45, /* 0x14-0x17 */
+	0xBD, 0xDA, 0x9B, 0x46, 0x9B, 0x47, 0x9B, 0x48, /* 0x18-0x1B */
+	0xBD, 0xDB, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x1C-0x1F */
+	0x9B, 0x4C, 0x9B, 0x4D, 0x9B, 0x4E, 0x9B, 0x4F, /* 0x20-0x23 */
+	0xBD, 0xDC, 0xBD, 0xDD, 0x9B, 0x50, 0x9B, 0x51, /* 0x24-0x27 */
+	0xBD, 0xDE, 0xBD, 0xDF, 0x9B, 0x52, 0x9B, 0x53, /* 0x28-0x2B */
+	0x9B, 0x54, 0x9B, 0x55, 0x9B, 0x56, 0x9B, 0x57, /* 0x2C-0x2F */
+	0x9B, 0x58, 0x9B, 0x59, 0x9B, 0x5A, 0x9B, 0x61, /* 0x30-0x33 */
+	0x9B, 0x62, 0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, /* 0x34-0x37 */
+	0x9B, 0x66, 0x9B, 0x67, 0x9B, 0x68, 0x9B, 0x69, /* 0x38-0x3B */
+	0x9B, 0x6A, 0x9B, 0x6B, 0x9B, 0x6C, 0x9B, 0x6D, /* 0x3C-0x3F */
+	0x9B, 0x6E, 0x9B, 0x6F, 0x9B, 0x70, 0x9B, 0x71, /* 0x40-0x43 */
+	0x9B, 0x72, 0xBD, 0xE0, 0x9B, 0x73, 0x9B, 0x74, /* 0x44-0x47 */
+	0x9B, 0x75, 0x9B, 0x76, 0x9B, 0x77, 0x9B, 0x78, /* 0x48-0x4B */
+	0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x81, 0x9B, 0x82, /* 0x4C-0x4F */
+	0x9B, 0x83, 0x9B, 0x84, 0x9B, 0x85, 0x9B, 0x86, /* 0x50-0x53 */
+	0x9B, 0x87, 0x9B, 0x88, 0x9B, 0x89, 0x9B, 0x8A, /* 0x54-0x57 */
+	0x9B, 0x8B, 0x9B, 0x8C, 0x9B, 0x8D, 0x9B, 0x8E, /* 0x58-0x5B */
+	0x9B, 0x8F, 0x9B, 0x90, 0x9B, 0x91, 0x9B, 0x92, /* 0x5C-0x5F */
+	0x9B, 0x93, 0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, /* 0x60-0x63 */
+	0x9B, 0x97, 0x9B, 0x98, 0x9B, 0x99, 0x9B, 0x9A, /* 0x64-0x67 */
+	0xBD, 0xE1, 0xBD, 0xE2, 0x9B, 0x9B, 0x9B, 0x9C, /* 0x68-0x6B */
+	0xBD, 0xE3, 0x9B, 0x9D, 0x9B, 0x9E, 0x9B, 0x9F, /* 0x6C-0x6F */
+	0xBD, 0xE4, 0x9B, 0xA0, 0xBD, 0xE5, 0x9B, 0xA1, /* 0x70-0x73 */
+	0x9B, 0xA2, 0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, /* 0x74-0x77 */
+	0xBD, 0xE6, 0xBD, 0xE7, 0x9B, 0xA6, 0x9B, 0xA7, /* 0x78-0x7B */
+	0xBD, 0xE8, 0xBD, 0xE9, 0x9B, 0xA8, 0x9B, 0xA9, /* 0x7C-0x7F */
+	
+	0x9B, 0xAA, 0x9B, 0xAB, 0x9B, 0xAC, 0x9B, 0xAD, /* 0x80-0x83 */
+	0xBD, 0xEA, 0x9B, 0xAE, 0x9B, 0xAF, 0x9B, 0xB0, /* 0x84-0x87 */
+	0xBD, 0xEB, 0x9B, 0xB1, 0x9B, 0xB2, 0x9B, 0xB3, /* 0x88-0x8B */
+	0xBD, 0xEC, 0x9B, 0xB4, 0x9B, 0xB5, 0x9B, 0xB6, /* 0x8C-0x8F */
+	0x9B, 0xB7, 0x9B, 0xB8, 0x9B, 0xB9, 0x9B, 0xBA, /* 0x90-0x93 */
+	0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, 0x9B, 0xBE, /* 0x94-0x97 */
+	0x9B, 0xBF, 0x9B, 0xC0, 0x9B, 0xC1, 0x9B, 0xC2, /* 0x98-0x9B */
+	0x9B, 0xC3, 0x9B, 0xC4, 0x9B, 0xC5, 0x9B, 0xC6, /* 0x9C-0x9F */
+	0x9B, 0xC7, 0x9B, 0xC8, 0x9B, 0xC9, 0x9B, 0xCA, /* 0xA0-0xA3 */
+	0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0xA4-0xA7 */
+	0x9B, 0xCF, 0x9B, 0xD0, 0x9B, 0xD1, 0x9B, 0xD2, /* 0xA8-0xAB */
+	0x9B, 0xD3, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0xAC-0xAF */
+	0x9B, 0xD7, 0x9B, 0xD8, 0x9B, 0xD9, 0x9B, 0xDA, /* 0xB0-0xB3 */
+	0x9B, 0xDB, 0x9B, 0xDC, 0x9B, 0xDD, 0x9B, 0xDE, /* 0xB4-0xB7 */
+	0x9B, 0xDF, 0x9B, 0xE0, 0x9B, 0xE1, 0x9B, 0xE2, /* 0xB8-0xBB */
+	0x9B, 0xE3, 0x9B, 0xE4, 0x9B, 0xE5, 0x9B, 0xE6, /* 0xBC-0xBF */
+	0xBD, 0xED, 0x9B, 0xE7, 0x9B, 0xE8, 0x9B, 0xE9, /* 0xC0-0xC3 */
+	0x9B, 0xEA, 0x9B, 0xEB, 0x9B, 0xEC, 0x9B, 0xED, /* 0xC4-0xC7 */
+	0x9B, 0xEE, 0x9B, 0xEF, 0x9B, 0xF0, 0x9B, 0xF1, /* 0xC8-0xCB */
+	0x9B, 0xF2, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xCC-0xCF */
+	0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0x9B, 0xF9, /* 0xD0-0xD3 */
+	0x9B, 0xFA, 0x9B, 0xFB, 0x9B, 0xFC, 0x9B, 0xFD, /* 0xD4-0xD7 */
+	0xBD, 0xEE, 0xBD, 0xEF, 0x9B, 0xFE, 0x9C, 0x41, /* 0xD8-0xDB */
+	0xBD, 0xF0, 0x9C, 0x42, 0x9C, 0x43, 0xBD, 0xF1, /* 0xDC-0xDF */
+	0xBD, 0xF2, 0x9C, 0x44, 0xBD, 0xF3, 0x9C, 0x45, /* 0xE0-0xE3 */
+	0x9C, 0x46, 0x9C, 0x47, 0x9C, 0x48, 0x9C, 0x49, /* 0xE4-0xE7 */
+	0xBD, 0xF4, 0xBD, 0xF5, 0x9C, 0x4A, 0x9C, 0x4B, /* 0xE8-0xEB */
+	0x9C, 0x4C, 0xBD, 0xF6, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xEC-0xEF */
+	0x9C, 0x4F, 0x9C, 0x50, 0x9C, 0x51, 0x9C, 0x52, /* 0xF0-0xF3 */
+	0xBD, 0xF7, 0xBD, 0xF8, 0x9C, 0x53, 0x9C, 0x54, /* 0xF4-0xF7 */
+	0xBD, 0xF9, 0x9C, 0x55, 0x9C, 0x56, 0x9C, 0x57, /* 0xF8-0xFB */
+	0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0x9C, 0x61, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C4[512] = {
+	0x9C, 0x62, 0x9C, 0x63, 0x9C, 0x64, 0x9C, 0x65, /* 0x00-0x03 */
+	0x9C, 0x66, 0x9C, 0x67, 0x9C, 0x68, 0x9C, 0x69, /* 0x04-0x07 */
+	0xBD, 0xFA, 0x9C, 0x6A, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x08-0x0B */
+	0x9C, 0x6D, 0x9C, 0x6E, 0x9C, 0x6F, 0x9C, 0x70, /* 0x0C-0x0F */
+	0xBD, 0xFB, 0x9C, 0x71, 0x9C, 0x72, 0x9C, 0x73, /* 0x10-0x13 */
+	0x9C, 0x74, 0x9C, 0x75, 0x9C, 0x76, 0x9C, 0x77, /* 0x14-0x17 */
+	0x9C, 0x78, 0x9C, 0x79, 0x9C, 0x7A, 0x9C, 0x81, /* 0x18-0x1B */
+	0x9C, 0x82, 0x9C, 0x83, 0x9C, 0x84, 0x9C, 0x85, /* 0x1C-0x1F */
+	0x9C, 0x86, 0x9C, 0x87, 0x9C, 0x88, 0x9C, 0x89, /* 0x20-0x23 */
+	0xBD, 0xFC, 0x9C, 0x8A, 0x9C, 0x8B, 0x9C, 0x8C, /* 0x24-0x27 */
+	0x9C, 0x8D, 0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, /* 0x28-0x2B */
+	0xBD, 0xFD, 0x9C, 0x91, 0x9C, 0x92, 0x9C, 0x93, /* 0x2C-0x2F */
+	0xBD, 0xFE, 0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, /* 0x30-0x33 */
+	0xBE, 0xA1, 0x9C, 0x97, 0x9C, 0x98, 0x9C, 0x99, /* 0x34-0x37 */
+	0x9C, 0x9A, 0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, /* 0x38-0x3B */
+	0xBE, 0xA2, 0xBE, 0xA3, 0x9C, 0x9E, 0x9C, 0x9F, /* 0x3C-0x3F */
+	0x9C, 0xA0, 0x9C, 0xA1, 0x9C, 0xA2, 0x9C, 0xA3, /* 0x40-0x43 */
+	0x9C, 0xA4, 0x9C, 0xA5, 0x9C, 0xA6, 0x9C, 0xA7, /* 0x44-0x47 */
+	0xBE, 0xA4, 0x9C, 0xA8, 0x9C, 0xA9, 0x9C, 0xAA, /* 0x48-0x4B */
+	0x9C, 0xAB, 0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, /* 0x4C-0x4F */
+	0x9C, 0xAF, 0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, /* 0x50-0x53 */
+	0x9C, 0xB3, 0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, /* 0x54-0x57 */
+	0x9C, 0xB7, 0x9C, 0xB8, 0x9C, 0xB9, 0x9C, 0xBA, /* 0x58-0x5B */
+	0x9C, 0xBB, 0x9C, 0xBC, 0x9C, 0xBD, 0x9C, 0xBE, /* 0x5C-0x5F */
+	0x9C, 0xBF, 0x9C, 0xC0, 0x9C, 0xC1, 0x9C, 0xC2, /* 0x60-0x63 */
+	0xBE, 0xA5, 0xBE, 0xA6, 0x9C, 0xC3, 0x9C, 0xC4, /* 0x64-0x67 */
+	0xBE, 0xA7, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x68-0x6B */
+	0xBE, 0xA8, 0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, /* 0x6C-0x6F */
+	0x9C, 0xCB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x70-0x73 */
+	0xBE, 0xA9, 0xBE, 0xAA, 0x9C, 0xCF, 0x9C, 0xD0, /* 0x74-0x77 */
+	0x9C, 0xD1, 0xBE, 0xAB, 0x9C, 0xD2, 0x9C, 0xD3, /* 0x78-0x7B */
+	0x9C, 0xD4, 0x9C, 0xD5, 0x9C, 0xD6, 0x9C, 0xD7, /* 0x7C-0x7F */
+	
+	0xBE, 0xAC, 0x9C, 0xD8, 0x9C, 0xD9, 0x9C, 0xDA, /* 0x80-0x83 */
+	0x9C, 0xDB, 0x9C, 0xDC, 0x9C, 0xDD, 0x9C, 0xDE, /* 0x84-0x87 */
+	0x9C, 0xDF, 0x9C, 0xE0, 0x9C, 0xE1, 0x9C, 0xE2, /* 0x88-0x8B */
+	0x9C, 0xE3, 0x9C, 0xE4, 0x9C, 0xE5, 0x9C, 0xE6, /* 0x8C-0x8F */
+	0x9C, 0xE7, 0x9C, 0xE8, 0x9C, 0xE9, 0x9C, 0xEA, /* 0x90-0x93 */
+	0xBE, 0xAD, 0x9C, 0xEB, 0x9C, 0xEC, 0x9C, 0xED, /* 0x94-0x97 */
+	0x9C, 0xEE, 0x9C, 0xEF, 0x9C, 0xF0, 0x9C, 0xF1, /* 0x98-0x9B */
+	0xBE, 0xAE, 0x9C, 0xF2, 0x9C, 0xF3, 0x9C, 0xF4, /* 0x9C-0x9F */
+	0x9C, 0xF5, 0x9C, 0xF6, 0x9C, 0xF7, 0x9C, 0xF8, /* 0xA0-0xA3 */
+	0x9C, 0xF9, 0x9C, 0xFA, 0x9C, 0xFB, 0x9C, 0xFC, /* 0xA4-0xA7 */
+	0x9C, 0xFD, 0x9C, 0xFE, 0x9D, 0x41, 0x9D, 0x42, /* 0xA8-0xAB */
+	0x9D, 0x43, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xAC-0xAF */
+	0x9D, 0x47, 0x9D, 0x48, 0x9D, 0x49, 0x9D, 0x4A, /* 0xB0-0xB3 */
+	0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, 0x9D, 0x4E, /* 0xB4-0xB7 */
+	0xBE, 0xAF, 0x9D, 0x4F, 0x9D, 0x50, 0x9D, 0x51, /* 0xB8-0xBB */
+	0xBE, 0xB0, 0x9D, 0x52, 0x9D, 0x53, 0x9D, 0x54, /* 0xBC-0xBF */
+	0x9D, 0x55, 0x9D, 0x56, 0x9D, 0x57, 0x9D, 0x58, /* 0xC0-0xC3 */
+	0x9D, 0x59, 0x9D, 0x5A, 0x9D, 0x61, 0x9D, 0x62, /* 0xC4-0xC7 */
+	0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0xC8-0xCB */
+	0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, 0x9D, 0x6A, /* 0xCC-0xCF */
+	0x9D, 0x6B, 0x9D, 0x6C, 0x9D, 0x6D, 0x9D, 0x6E, /* 0xD0-0xD3 */
+	0x9D, 0x6F, 0x9D, 0x70, 0x9D, 0x71, 0x9D, 0x72, /* 0xD4-0xD7 */
+	0x9D, 0x73, 0x9D, 0x74, 0x9D, 0x75, 0x9D, 0x76, /* 0xD8-0xDB */
+	0x9D, 0x77, 0x9D, 0x78, 0x9D, 0x79, 0x9D, 0x7A, /* 0xDC-0xDF */
+	0x9D, 0x81, 0x9D, 0x82, 0x9D, 0x83, 0x9D, 0x84, /* 0xE0-0xE3 */
+	0x9D, 0x85, 0x9D, 0x86, 0x9D, 0x87, 0x9D, 0x88, /* 0xE4-0xE7 */
+	0x9D, 0x89, 0xBE, 0xB1, 0x9D, 0x8A, 0x9D, 0x8B, /* 0xE8-0xEB */
+	0x9D, 0x8C, 0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, /* 0xEC-0xEF */
+	0xBE, 0xB2, 0xBE, 0xB3, 0x9D, 0x90, 0x9D, 0x91, /* 0xF0-0xF3 */
+	0xBE, 0xB4, 0x9D, 0x92, 0x9D, 0x93, 0x9D, 0x94, /* 0xF4-0xF7 */
+	0xBE, 0xB5, 0x9D, 0x95, 0xBE, 0xB6, 0x9D, 0x96, /* 0xF8-0xFB */
+	0x9D, 0x97, 0x9D, 0x98, 0x9D, 0x99, 0xBE, 0xB7, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C5[512] = {
+	0xBE, 0xB8, 0xBE, 0xB9, 0x9D, 0x9A, 0x9D, 0x9B, /* 0x00-0x03 */
+	0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x04-0x07 */
+	0x9D, 0xA0, 0x9D, 0xA1, 0x9D, 0xA2, 0x9D, 0xA3, /* 0x08-0x0B */
+	0xBE, 0xBA, 0x9D, 0xA4, 0x9D, 0xA5, 0x9D, 0xA6, /* 0x0C-0x0F */
+	0xBE, 0xBB, 0x9D, 0xA7, 0x9D, 0xA8, 0x9D, 0xA9, /* 0x10-0x13 */
+	0xBE, 0xBC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x14-0x17 */
+	0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x18-0x1B */
+	0xBE, 0xBD, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x1C-0x1F */
+	0x9D, 0xB4, 0x9D, 0xB5, 0x9D, 0xB6, 0x9D, 0xB7, /* 0x20-0x23 */
+	0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, 0x9D, 0xBB, /* 0x24-0x27 */
+	0xBE, 0xBE, 0xBE, 0xBF, 0x9D, 0xBC, 0x9D, 0xBD, /* 0x28-0x2B */
+	0xBE, 0xC0, 0x9D, 0xBE, 0x9D, 0xBF, 0x9D, 0xC0, /* 0x2C-0x2F */
+	0xBE, 0xC1, 0x9D, 0xC1, 0x9D, 0xC2, 0x9D, 0xC3, /* 0x30-0x33 */
+	0x9D, 0xC4, 0x9D, 0xC5, 0x9D, 0xC6, 0x9D, 0xC7, /* 0x34-0x37 */
+	0xBE, 0xC2, 0xBE, 0xC3, 0x9D, 0xC8, 0xBE, 0xC4, /* 0x38-0x3B */
+	0x9D, 0xC9, 0xBE, 0xC5, 0x9D, 0xCA, 0x9D, 0xCB, /* 0x3C-0x3F */
+	0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0x40-0x43 */
+	0xBE, 0xC6, 0xBE, 0xC7, 0x9D, 0xD0, 0x9D, 0xD1, /* 0x44-0x47 */
+	0xBE, 0xC8, 0xBE, 0xC9, 0xBE, 0xCA, 0x9D, 0xD2, /* 0x48-0x4B */
+	0xBE, 0xCB, 0xBE, 0xCC, 0xBE, 0xCD, 0x9D, 0xD3, /* 0x4C-0x4F */
+	0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xBE, 0xCE, /* 0x50-0x53 */
+	0xBE, 0xCF, 0xBE, 0xD0, 0x9D, 0xD7, 0xBE, 0xD1, /* 0x54-0x57 */
+	0xBE, 0xD2, 0xBE, 0xD3, 0x9D, 0xD8, 0x9D, 0xD9, /* 0x58-0x5B */
+	0x9D, 0xDA, 0xBE, 0xD4, 0xBE, 0xD5, 0x9D, 0xDB, /* 0x5C-0x5F */
+	0xBE, 0xD6, 0xBE, 0xD7, 0x9D, 0xDC, 0x9D, 0xDD, /* 0x60-0x63 */
+	0xBE, 0xD8, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0x64-0x67 */
+	0xBE, 0xD9, 0x9D, 0xE1, 0x9D, 0xE2, 0x9D, 0xE3, /* 0x68-0x6B */
+	0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, 0x9D, 0xE7, /* 0x6C-0x6F */
+	0xBE, 0xDA, 0xBE, 0xDB, 0x9D, 0xE8, 0xBE, 0xDC, /* 0x70-0x73 */
+	0xBE, 0xDD, 0xBE, 0xDE, 0x9D, 0xE9, 0x9D, 0xEA, /* 0x74-0x77 */
+	0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, 0x9D, 0xEE, /* 0x78-0x7B */
+	0xBE, 0xDF, 0xBE, 0xE0, 0x9D, 0xEF, 0x9D, 0xF0, /* 0x7C-0x7F */
+	
+	0xBE, 0xE1, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0x80-0x83 */
+	0xBE, 0xE2, 0x9D, 0xF4, 0x9D, 0xF5, 0xBE, 0xE3, /* 0x84-0x87 */
+	0x9D, 0xF6, 0x9D, 0xF7, 0x9D, 0xF8, 0x9D, 0xF9, /* 0x88-0x8B */
+	0xBE, 0xE4, 0xBE, 0xE5, 0x9D, 0xFA, 0xBE, 0xE6, /* 0x8C-0x8F */
+	0x9D, 0xFB, 0xBE, 0xE7, 0x9D, 0xFC, 0x9D, 0xFD, /* 0x90-0x93 */
+	0x9D, 0xFE, 0xBE, 0xE8, 0x9E, 0x41, 0xBE, 0xE9, /* 0x94-0x97 */
+	0xBE, 0xEA, 0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, /* 0x98-0x9B */
+	0xBE, 0xEB, 0x9E, 0x45, 0x9E, 0x46, 0x9E, 0x47, /* 0x9C-0x9F */
+	0xBE, 0xEC, 0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, /* 0xA0-0xA3 */
+	0x9E, 0x4B, 0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, /* 0xA4-0xA7 */
+	0x9E, 0x4F, 0xBE, 0xED, 0x9E, 0x50, 0x9E, 0x51, /* 0xA8-0xAB */
+	0x9E, 0x52, 0x9E, 0x53, 0x9E, 0x54, 0x9E, 0x55, /* 0xAC-0xAF */
+	0x9E, 0x56, 0x9E, 0x57, 0x9E, 0x58, 0x9E, 0x59, /* 0xB0-0xB3 */
+	0xBE, 0xEE, 0xBE, 0xEF, 0x9E, 0x5A, 0x9E, 0x61, /* 0xB4-0xB7 */
+	0xBE, 0xF0, 0xBE, 0xF1, 0x9E, 0x62, 0xBE, 0xF2, /* 0xB8-0xBB */
+	0xBE, 0xF3, 0xBE, 0xF4, 0xBE, 0xF5, 0x9E, 0x63, /* 0xBC-0xBF */
+	0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0xC0-0xC3 */
+	0xBE, 0xF6, 0xBE, 0xF7, 0xBE, 0xF8, 0xBE, 0xF9, /* 0xC4-0xC7 */
+	0xBE, 0xFA, 0xBE, 0xFB, 0xBE, 0xFC, 0x9E, 0x68, /* 0xC8-0xCB */
+	0xBE, 0xFD, 0x9E, 0x69, 0xBE, 0xFE, 0x9E, 0x6A, /* 0xCC-0xCF */
+	0xBF, 0xA1, 0xBF, 0xA2, 0x9E, 0x6B, 0x9E, 0x6C, /* 0xD0-0xD3 */
+	0xBF, 0xA3, 0x9E, 0x6D, 0x9E, 0x6E, 0x9E, 0x6F, /* 0xD4-0xD7 */
+	0xBF, 0xA4, 0x9E, 0x70, 0x9E, 0x71, 0x9E, 0x72, /* 0xD8-0xDB */
+	0x9E, 0x73, 0x9E, 0x74, 0x9E, 0x75, 0x9E, 0x76, /* 0xDC-0xDF */
+	0xBF, 0xA5, 0xBF, 0xA6, 0x9E, 0x77, 0xBF, 0xA7, /* 0xE0-0xE3 */
+	0x9E, 0x78, 0xBF, 0xA8, 0x9E, 0x79, 0x9E, 0x7A, /* 0xE4-0xE7 */
+	0x9E, 0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0xE8-0xEB */
+	0xBF, 0xA9, 0xBF, 0xAA, 0xBF, 0xAB, 0x9E, 0x85, /* 0xEC-0xEF */
+	0xBF, 0xAC, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0xF0-0xF3 */
+	0xBF, 0xAD, 0x9E, 0x89, 0xBF, 0xAE, 0xBF, 0xAF, /* 0xF4-0xF7 */
+	0x9E, 0x8A, 0x9E, 0x8B, 0x9E, 0x8C, 0x9E, 0x8D, /* 0xF8-0xFB */
+	0xBF, 0xB0, 0xBF, 0xB1, 0xBF, 0xB2, 0xBF, 0xB3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C6[512] = {
+	0xBF, 0xB4, 0xBF, 0xB5, 0x9E, 0x8E, 0x9E, 0x8F, /* 0x00-0x03 */
+	0x9E, 0x90, 0xBF, 0xB6, 0xBF, 0xB7, 0xBF, 0xB8, /* 0x04-0x07 */
+	0xBF, 0xB9, 0x9E, 0x91, 0x9E, 0x92, 0x9E, 0x93, /* 0x08-0x0B */
+	0xBF, 0xBA, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x0C-0x0F */
+	0xBF, 0xBB, 0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, /* 0x10-0x13 */
+	0x9E, 0x9A, 0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, /* 0x14-0x17 */
+	0xBF, 0xBC, 0xBF, 0xBD, 0x9E, 0x9E, 0xBF, 0xBE, /* 0x18-0x1B */
+	0xBF, 0xBF, 0x9E, 0x9F, 0x9E, 0xA0, 0x9E, 0xA1, /* 0x1C-0x1F */
+	0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, 0x9E, 0xA5, /* 0x20-0x23 */
+	0xBF, 0xC0, 0xBF, 0xC1, 0x9E, 0xA6, 0x9E, 0xA7, /* 0x24-0x27 */
+	0xBF, 0xC2, 0x9E, 0xA8, 0x9E, 0xA9, 0x9E, 0xAA, /* 0x28-0x2B */
+	0xBF, 0xC3, 0xBF, 0xC4, 0xBF, 0xC5, 0x9E, 0xAB, /* 0x2C-0x2F */
+	0xBF, 0xC6, 0x9E, 0xAC, 0x9E, 0xAD, 0xBF, 0xC7, /* 0x30-0x33 */
+	0xBF, 0xC8, 0xBF, 0xC9, 0x9E, 0xAE, 0xBF, 0xCA, /* 0x34-0x37 */
+	0x9E, 0xAF, 0xBF, 0xCB, 0x9E, 0xB0, 0xBF, 0xCC, /* 0x38-0x3B */
+	0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, 0x9E, 0xB4, /* 0x3C-0x3F */
+	0xBF, 0xCD, 0xBF, 0xCE, 0x9E, 0xB5, 0x9E, 0xB6, /* 0x40-0x43 */
+	0xBF, 0xCF, 0x9E, 0xB7, 0x9E, 0xB8, 0x9E, 0xB9, /* 0x44-0x47 */
+	0xBF, 0xD0, 0x9E, 0xBA, 0x9E, 0xBB, 0x9E, 0xBC, /* 0x48-0x4B */
+	0x9E, 0xBD, 0x9E, 0xBE, 0x9E, 0xBF, 0x9E, 0xC0, /* 0x4C-0x4F */
+	0xBF, 0xD1, 0xBF, 0xD2, 0x9E, 0xC1, 0xBF, 0xD3, /* 0x50-0x53 */
+	0xBF, 0xD4, 0xBF, 0xD5, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x54-0x57 */
+	0x9E, 0xC4, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x58-0x5B */
+	0xBF, 0xD6, 0xBF, 0xD7, 0x9E, 0xC8, 0x9E, 0xC9, /* 0x5C-0x5F */
+	0xBF, 0xD8, 0x9E, 0xCA, 0x9E, 0xCB, 0x9E, 0xCC, /* 0x60-0x63 */
+	0x9E, 0xCD, 0x9E, 0xCE, 0x9E, 0xCF, 0x9E, 0xD0, /* 0x64-0x67 */
+	0x9E, 0xD1, 0x9E, 0xD2, 0x9E, 0xD3, 0x9E, 0xD4, /* 0x68-0x6B */
+	0xBF, 0xD9, 0x9E, 0xD5, 0x9E, 0xD6, 0xBF, 0xDA, /* 0x6C-0x6F */
+	0x9E, 0xD7, 0xBF, 0xDB, 0x9E, 0xD8, 0x9E, 0xD9, /* 0x70-0x73 */
+	0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, 0x9E, 0xDD, /* 0x74-0x77 */
+	0xBF, 0xDC, 0xBF, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, /* 0x78-0x7B */
+	0xBF, 0xDE, 0x9E, 0xE0, 0x9E, 0xE1, 0x9E, 0xE2, /* 0x7C-0x7F */
+	
+	0xBF, 0xDF, 0x9E, 0xE3, 0x9E, 0xE4, 0x9E, 0xE5, /* 0x80-0x83 */
+	0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0x9E, 0xE9, /* 0x84-0x87 */
+	0xBF, 0xE0, 0xBF, 0xE1, 0x9E, 0xEA, 0xBF, 0xE2, /* 0x88-0x8B */
+	0x9E, 0xEB, 0xBF, 0xE3, 0x9E, 0xEC, 0x9E, 0xED, /* 0x8C-0x8F */
+	0x9E, 0xEE, 0x9E, 0xEF, 0x9E, 0xF0, 0x9E, 0xF1, /* 0x90-0x93 */
+	0xBF, 0xE4, 0xBF, 0xE5, 0x9E, 0xF2, 0x9E, 0xF3, /* 0x94-0x97 */
+	0xBF, 0xE6, 0x9E, 0xF4, 0x9E, 0xF5, 0x9E, 0xF6, /* 0x98-0x9B */
+	0xBF, 0xE7, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0x9C-0x9F */
+	0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xA0-0xA3 */
+	0xBF, 0xE8, 0xBF, 0xE9, 0x9E, 0xFE, 0xBF, 0xEA, /* 0xA4-0xA7 */
+	0x9F, 0x41, 0xBF, 0xEB, 0x9F, 0x42, 0x9F, 0x43, /* 0xA8-0xAB */
+	0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, 0x9F, 0x47, /* 0xAC-0xAF */
+	0xBF, 0xEC, 0xBF, 0xED, 0x9F, 0x48, 0x9F, 0x49, /* 0xB0-0xB3 */
+	0xBF, 0xEE, 0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, /* 0xB4-0xB7 */
+	0xBF, 0xEF, 0xBF, 0xF0, 0xBF, 0xF1, 0x9F, 0x4D, /* 0xB8-0xBB */
+	0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0x9F, 0x51, /* 0xBC-0xBF */
+	0xBF, 0xF2, 0xBF, 0xF3, 0x9F, 0x52, 0xBF, 0xF4, /* 0xC0-0xC3 */
+	0x9F, 0x53, 0xBF, 0xF5, 0x9F, 0x54, 0x9F, 0x55, /* 0xC4-0xC7 */
+	0x9F, 0x56, 0x9F, 0x57, 0x9F, 0x58, 0x9F, 0x59, /* 0xC8-0xCB */
+	0xBF, 0xF6, 0xBF, 0xF7, 0x9F, 0x5A, 0x9F, 0x61, /* 0xCC-0xCF */
+	0xBF, 0xF8, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0xD0-0xD3 */
+	0xBF, 0xF9, 0x9F, 0x65, 0x9F, 0x66, 0x9F, 0x67, /* 0xD4-0xD7 */
+	0x9F, 0x68, 0x9F, 0x69, 0x9F, 0x6A, 0x9F, 0x6B, /* 0xD8-0xDB */
+	0xBF, 0xFA, 0xBF, 0xFB, 0x9F, 0x6C, 0x9F, 0x6D, /* 0xDC-0xDF */
+	0xBF, 0xFC, 0xBF, 0xFD, 0x9F, 0x6E, 0x9F, 0x6F, /* 0xE0-0xE3 */
+	0x9F, 0x70, 0x9F, 0x71, 0x9F, 0x72, 0x9F, 0x73, /* 0xE4-0xE7 */
+	0xBF, 0xFE, 0xC0, 0xA1, 0x9F, 0x74, 0x9F, 0x75, /* 0xE8-0xEB */
+	0xC0, 0xA2, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0xEC-0xEF */
+	0xC0, 0xA3, 0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x81, /* 0xF0-0xF3 */
+	0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0xF4-0xF7 */
+	0xC0, 0xA4, 0xC0, 0xA5, 0x9F, 0x86, 0x9F, 0x87, /* 0xF8-0xFB */
+	0x9F, 0x88, 0xC0, 0xA6, 0x9F, 0x89, 0x9F, 0x8A, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C7[512] = {
+	0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, 0x9F, 0x8E, /* 0x00-0x03 */
+	0xC0, 0xA7, 0xC0, 0xA8, 0x9F, 0x8F, 0x9F, 0x90, /* 0x04-0x07 */
+	0xC0, 0xA9, 0x9F, 0x91, 0x9F, 0x92, 0x9F, 0x93, /* 0x08-0x0B */
+	0xC0, 0xAA, 0x9F, 0x94, 0x9F, 0x95, 0x9F, 0x96, /* 0x0C-0x0F */
+	0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, 0x9F, 0x9A, /* 0x10-0x13 */
+	0xC0, 0xAB, 0xC0, 0xAC, 0x9F, 0x9B, 0xC0, 0xAD, /* 0x14-0x17 */
+	0x9F, 0x9C, 0xC0, 0xAE, 0x9F, 0x9D, 0x9F, 0x9E, /* 0x18-0x1B */
+	0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, 0x9F, 0xA2, /* 0x1C-0x1F */
+	0xC0, 0xAF, 0xC0, 0xB0, 0x9F, 0xA3, 0x9F, 0xA4, /* 0x20-0x23 */
+	0xC0, 0xB1, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x24-0x27 */
+	0xC0, 0xB2, 0x9F, 0xA8, 0x9F, 0xA9, 0x9F, 0xAA, /* 0x28-0x2B */
+	0x9F, 0xAB, 0x9F, 0xAC, 0x9F, 0xAD, 0x9F, 0xAE, /* 0x2C-0x2F */
+	0xC0, 0xB3, 0xC0, 0xB4, 0x9F, 0xAF, 0xC0, 0xB5, /* 0x30-0x33 */
+	0x9F, 0xB0, 0xC0, 0xB6, 0x9F, 0xB1, 0xC0, 0xB7, /* 0x34-0x37 */
+	0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, 0x9F, 0xB5, /* 0x38-0x3B */
+	0xC0, 0xB8, 0xC0, 0xB9, 0x9F, 0xB6, 0x9F, 0xB7, /* 0x3C-0x3F */
+	0xC0, 0xBA, 0x9F, 0xB8, 0x9F, 0xB9, 0x9F, 0xBA, /* 0x40-0x43 */
+	0xC0, 0xBB, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x44-0x47 */
+	0x9F, 0xBE, 0x9F, 0xBF, 0xC0, 0xBC, 0x9F, 0xC0, /* 0x48-0x4B */
+	0xC0, 0xBD, 0xC0, 0xBE, 0x9F, 0xC1, 0xC0, 0xBF, /* 0x4C-0x4F */
+	0x9F, 0xC2, 0xC0, 0xC0, 0xC0, 0xC1, 0xC0, 0xC2, /* 0x50-0x53 */
+	0xC0, 0xC3, 0xC0, 0xC4, 0xC0, 0xC5, 0xC0, 0xC6, /* 0x54-0x57 */
+	0xC0, 0xC7, 0x9F, 0xC3, 0x9F, 0xC4, 0x9F, 0xC5, /* 0x58-0x5B */
+	0xC0, 0xC8, 0x9F, 0xC6, 0x9F, 0xC7, 0x9F, 0xC8, /* 0x5C-0x5F */
+	0xC0, 0xC9, 0x9F, 0xC9, 0x9F, 0xCA, 0x9F, 0xCB, /* 0x60-0x63 */
+	0x9F, 0xCC, 0x9F, 0xCD, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x64-0x67 */
+	0xC0, 0xCA, 0x9F, 0xD0, 0x9F, 0xD1, 0xC0, 0xCB, /* 0x68-0x6B */
+	0x9F, 0xD2, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0x6C-0x6F */
+	0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0x70-0x73 */
+	0xC0, 0xCC, 0xC0, 0xCD, 0x9F, 0xDA, 0x9F, 0xDB, /* 0x74-0x77 */
+	0xC0, 0xCE, 0x9F, 0xDC, 0x9F, 0xDD, 0x9F, 0xDE, /* 0x78-0x7B */
+	0xC0, 0xCF, 0xC0, 0xD0, 0xC0, 0xD1, 0x9F, 0xDF, /* 0x7C-0x7F */
+	
+	0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xC0, 0xD2, /* 0x80-0x83 */
+	0xC0, 0xD3, 0xC0, 0xD4, 0x9F, 0xE3, 0xC0, 0xD5, /* 0x84-0x87 */
+	0xC0, 0xD6, 0xC0, 0xD7, 0xC0, 0xD8, 0x9F, 0xE4, /* 0x88-0x8B */
+	0x9F, 0xE5, 0x9F, 0xE6, 0xC0, 0xD9, 0x9F, 0xE7, /* 0x8C-0x8F */
+	0xC0, 0xDA, 0xC0, 0xDB, 0x9F, 0xE8, 0x9F, 0xE9, /* 0x90-0x93 */
+	0xC0, 0xDC, 0x9F, 0xEA, 0xC0, 0xDD, 0xC0, 0xDE, /* 0x94-0x97 */
+	0xC0, 0xDF, 0x9F, 0xEB, 0xC0, 0xE0, 0x9F, 0xEC, /* 0x98-0x9B */
+	0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0x9F, 0xF0, /* 0x9C-0x9F */
+	0xC0, 0xE1, 0xC0, 0xE2, 0x9F, 0xF1, 0xC0, 0xE3, /* 0xA0-0xA3 */
+	0xC0, 0xE4, 0xC0, 0xE5, 0xC0, 0xE6, 0x9F, 0xF2, /* 0xA4-0xA7 */
+	0x9F, 0xF3, 0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, /* 0xA8-0xAB */
+	0xC0, 0xE7, 0xC0, 0xE8, 0x9F, 0xF7, 0x9F, 0xF8, /* 0xAC-0xAF */
+	0xC0, 0xE9, 0x9F, 0xF9, 0x9F, 0xFA, 0x9F, 0xFB, /* 0xB0-0xB3 */
+	0xC0, 0xEA, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xB4-0xB7 */
+	0xA0, 0x41, 0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, /* 0xB8-0xBB */
+	0xC0, 0xEB, 0xC0, 0xEC, 0xA0, 0x45, 0xC0, 0xED, /* 0xBC-0xBF */
+	0xC0, 0xEE, 0xC0, 0xEF, 0xA0, 0x46, 0xA0, 0x47, /* 0xC0-0xC3 */
+	0xA0, 0x48, 0xA0, 0x49, 0xA0, 0x4A, 0xA0, 0x4B, /* 0xC4-0xC7 */
+	0xC0, 0xF0, 0xC0, 0xF1, 0xA0, 0x4C, 0xA0, 0x4D, /* 0xC8-0xCB */
+	0xC0, 0xF2, 0xA0, 0x4E, 0xC0, 0xF3, 0xA0, 0x4F, /* 0xCC-0xCF */
+	0xC0, 0xF4, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xD0-0xD3 */
+	0xA0, 0x53, 0xA0, 0x54, 0xA0, 0x55, 0xA0, 0x56, /* 0xD4-0xD7 */
+	0xC0, 0xF5, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xD8-0xDB */
+	0xA0, 0x5A, 0xC0, 0xF6, 0xA0, 0x61, 0xA0, 0x62, /* 0xDC-0xDF */
+	0xA0, 0x63, 0xA0, 0x64, 0xA0, 0x65, 0xA0, 0x66, /* 0xE0-0xE3 */
+	0xC0, 0xF7, 0xA0, 0x67, 0xA0, 0x68, 0xA0, 0x69, /* 0xE4-0xE7 */
+	0xC0, 0xF8, 0xA0, 0x6A, 0xA0, 0x6B, 0xA0, 0x6C, /* 0xE8-0xEB */
+	0xC0, 0xF9, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0xEC-0xEF */
+	0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0xF0-0xF3 */
+	0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0xF4-0xF7 */
+	0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x81, /* 0xF8-0xFB */
+	0xA0, 0x82, 0xA0, 0x83, 0xA0, 0x84, 0xA0, 0x85, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C8[512] = {
+	0xC0, 0xFA, 0xC0, 0xFB, 0xA0, 0x86, 0xA0, 0x87, /* 0x00-0x03 */
+	0xC0, 0xFC, 0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, /* 0x04-0x07 */
+	0xC0, 0xFD, 0xA0, 0x8B, 0xC0, 0xFE, 0xA0, 0x8C, /* 0x08-0x0B */
+	0xA0, 0x8D, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x0C-0x0F */
+	0xC1, 0xA1, 0xC1, 0xA2, 0xA0, 0x91, 0xC1, 0xA3, /* 0x10-0x13 */
+	0xA0, 0x92, 0xC1, 0xA4, 0xC1, 0xA5, 0xA0, 0x93, /* 0x14-0x17 */
+	0xA0, 0x94, 0xA0, 0x95, 0xA0, 0x96, 0xA0, 0x97, /* 0x18-0x1B */
+	0xC1, 0xA6, 0xC1, 0xA7, 0xA0, 0x98, 0xA0, 0x99, /* 0x1C-0x1F */
+	0xC1, 0xA8, 0xA0, 0x9A, 0xA0, 0x9B, 0xA0, 0x9C, /* 0x20-0x23 */
+	0xC1, 0xA9, 0xA0, 0x9D, 0xA0, 0x9E, 0xA0, 0x9F, /* 0x24-0x27 */
+	0xA0, 0xA0, 0xA0, 0xA1, 0xA0, 0xA2, 0xA0, 0xA3, /* 0x28-0x2B */
+	0xC1, 0xAA, 0xC1, 0xAB, 0xA0, 0xA4, 0xC1, 0xAC, /* 0x2C-0x2F */
+	0xA0, 0xA5, 0xC1, 0xAD, 0xA0, 0xA6, 0xA0, 0xA7, /* 0x30-0x33 */
+	0xA0, 0xA8, 0xA0, 0xA9, 0xA0, 0xAA, 0xA0, 0xAB, /* 0x34-0x37 */
+	0xC1, 0xAE, 0xA0, 0xAC, 0xA0, 0xAD, 0xA0, 0xAE, /* 0x38-0x3B */
+	0xC1, 0xAF, 0xA0, 0xAF, 0xA0, 0xB0, 0xA0, 0xB1, /* 0x3C-0x3F */
+	0xC1, 0xB0, 0xA0, 0xB2, 0xA0, 0xB3, 0xA0, 0xB4, /* 0x40-0x43 */
+	0xA0, 0xB5, 0xA0, 0xB6, 0xA0, 0xB7, 0xA0, 0xB8, /* 0x44-0x47 */
+	0xC1, 0xB1, 0xC1, 0xB2, 0xA0, 0xB9, 0xA0, 0xBA, /* 0x48-0x4B */
+	0xC1, 0xB3, 0xC1, 0xB4, 0xA0, 0xBB, 0xA0, 0xBC, /* 0x4C-0x4F */
+	0xA0, 0xBD, 0xA0, 0xBE, 0xA0, 0xBF, 0xA0, 0xC0, /* 0x50-0x53 */
+	0xC1, 0xB5, 0xA0, 0xC1, 0xA0, 0xC2, 0xA0, 0xC3, /* 0x54-0x57 */
+	0xA0, 0xC4, 0xA0, 0xC5, 0xA0, 0xC6, 0xA0, 0xC7, /* 0x58-0x5B */
+	0xA0, 0xC8, 0xA0, 0xC9, 0xA0, 0xCA, 0xA0, 0xCB, /* 0x5C-0x5F */
+	0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x60-0x63 */
+	0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x64-0x67 */
+	0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xA0, 0xD7, /* 0x68-0x6B */
+	0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, 0xA0, 0xDB, /* 0x6C-0x6F */
+	0xC1, 0xB6, 0xC1, 0xB7, 0xA0, 0xDC, 0xA0, 0xDD, /* 0x70-0x73 */
+	0xC1, 0xB8, 0xA0, 0xDE, 0xA0, 0xDF, 0xA0, 0xE0, /* 0x74-0x77 */
+	0xC1, 0xB9, 0xA0, 0xE1, 0xC1, 0xBA, 0xA0, 0xE2, /* 0x78-0x7B */
+	0xA0, 0xE3, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0x7C-0x7F */
+	
+	0xC1, 0xBB, 0xC1, 0xBC, 0xA0, 0xE7, 0xC1, 0xBD, /* 0x80-0x83 */
+	0xA0, 0xE8, 0xC1, 0xBE, 0xC1, 0xBF, 0xC1, 0xC0, /* 0x84-0x87 */
+	0xA0, 0xE9, 0xA0, 0xEA, 0xA0, 0xEB, 0xC1, 0xC1, /* 0x88-0x8B */
+	0xC1, 0xC2, 0xC1, 0xC3, 0xA0, 0xEC, 0xA0, 0xED, /* 0x8C-0x8F */
+	0xA0, 0xEE, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0x90-0x93 */
+	0xC1, 0xC4, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0x94-0x97 */
+	0xA0, 0xF5, 0xA0, 0xF6, 0xA0, 0xF7, 0xA0, 0xF8, /* 0x98-0x9B */
+	0xA0, 0xF9, 0xC1, 0xC5, 0xA0, 0xFA, 0xC1, 0xC6, /* 0x9C-0x9F */
+	0xA0, 0xFB, 0xC1, 0xC7, 0xA0, 0xFC, 0xA0, 0xFD, /* 0xA0-0xA3 */
+	0xA0, 0xFE, 0xA1, 0x41, 0xA1, 0x42, 0xA1, 0x43, /* 0xA4-0xA7 */
+	0xC1, 0xC8, 0xA1, 0x44, 0xA1, 0x45, 0xA1, 0x46, /* 0xA8-0xAB */
+	0xA1, 0x47, 0xA1, 0x48, 0xA1, 0x49, 0xA1, 0x4A, /* 0xAC-0xAF */
+	0xA1, 0x4B, 0xA1, 0x4C, 0xA1, 0x4D, 0xA1, 0x4E, /* 0xB0-0xB3 */
+	0xA1, 0x4F, 0xA1, 0x50, 0xA1, 0x51, 0xA1, 0x52, /* 0xB4-0xB7 */
+	0xA1, 0x53, 0xA1, 0x54, 0xA1, 0x55, 0xA1, 0x56, /* 0xB8-0xBB */
+	0xC1, 0xC9, 0xC1, 0xCA, 0xA1, 0x57, 0xA1, 0x58, /* 0xBC-0xBF */
+	0xA1, 0x59, 0xA1, 0x5A, 0xA1, 0x61, 0xA1, 0x62, /* 0xC0-0xC3 */
+	0xC1, 0xCB, 0xA1, 0x63, 0xA1, 0x64, 0xA1, 0x65, /* 0xC4-0xC7 */
+	0xC1, 0xCC, 0xA1, 0x66, 0xA1, 0x67, 0xA1, 0x68, /* 0xC8-0xCB */
+	0xC1, 0xCD, 0xA1, 0x69, 0xA1, 0x6A, 0xA1, 0x6B, /* 0xCC-0xCF */
+	0xA1, 0x6C, 0xA1, 0x6D, 0xA1, 0x6E, 0xA1, 0x6F, /* 0xD0-0xD3 */
+	0xC1, 0xCE, 0xC1, 0xCF, 0xA1, 0x70, 0xC1, 0xD0, /* 0xD4-0xD7 */
+	0xA1, 0x71, 0xC1, 0xD1, 0xA1, 0x72, 0xA1, 0x73, /* 0xD8-0xDB */
+	0xA1, 0x74, 0xA1, 0x75, 0xA1, 0x76, 0xA1, 0x77, /* 0xDC-0xDF */
+	0xC1, 0xD2, 0xC1, 0xD3, 0xA1, 0x78, 0xA1, 0x79, /* 0xE0-0xE3 */
+	0xC1, 0xD4, 0xA1, 0x7A, 0xA1, 0x81, 0xA1, 0x82, /* 0xE4-0xE7 */
+	0xA1, 0x83, 0xA1, 0x84, 0xA1, 0x85, 0xA1, 0x86, /* 0xE8-0xEB */
+	0xA1, 0x87, 0xA1, 0x88, 0xA1, 0x89, 0xA1, 0x8A, /* 0xEC-0xEF */
+	0xA1, 0x8B, 0xA1, 0x8C, 0xA1, 0x8D, 0xA1, 0x8E, /* 0xF0-0xF3 */
+	0xA1, 0x8F, 0xC1, 0xD5, 0xA1, 0x90, 0xA1, 0x91, /* 0xF4-0xF7 */
+	0xA1, 0x92, 0xA1, 0x93, 0xA1, 0x94, 0xA1, 0x95, /* 0xF8-0xFB */
+	0xC1, 0xD6, 0xC1, 0xD7, 0xA1, 0x96, 0xA1, 0x97, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_C9[512] = {
+	0xC1, 0xD8, 0xA1, 0x98, 0xA1, 0x99, 0xA1, 0x9A, /* 0x00-0x03 */
+	0xC1, 0xD9, 0xC1, 0xDA, 0xC1, 0xDB, 0xA1, 0x9B, /* 0x04-0x07 */
+	0xA1, 0x9C, 0xA1, 0x9D, 0xA1, 0x9E, 0xA1, 0x9F, /* 0x08-0x0B */
+	0xC1, 0xDC, 0xC1, 0xDD, 0xA1, 0xA0, 0xC1, 0xDE, /* 0x0C-0x0F */
+	0xA2, 0x41, 0xC1, 0xDF, 0xA2, 0x42, 0xA2, 0x43, /* 0x10-0x13 */
+	0xA2, 0x44, 0xA2, 0x45, 0xA2, 0x46, 0xA2, 0x47, /* 0x14-0x17 */
+	0xC1, 0xE0, 0xA2, 0x48, 0xA2, 0x49, 0xA2, 0x4A, /* 0x18-0x1B */
+	0xA2, 0x4B, 0xA2, 0x4C, 0xA2, 0x4D, 0xA2, 0x4E, /* 0x1C-0x1F */
+	0xA2, 0x4F, 0xA2, 0x50, 0xA2, 0x51, 0xA2, 0x52, /* 0x20-0x23 */
+	0xA2, 0x53, 0xA2, 0x54, 0xA2, 0x55, 0xA2, 0x56, /* 0x24-0x27 */
+	0xA2, 0x57, 0xA2, 0x58, 0xA2, 0x59, 0xA2, 0x5A, /* 0x28-0x2B */
+	0xC1, 0xE1, 0xA2, 0x61, 0xA2, 0x62, 0xA2, 0x63, /* 0x2C-0x2F */
+	0xA2, 0x64, 0xA2, 0x65, 0xA2, 0x66, 0xA2, 0x67, /* 0x30-0x33 */
+	0xC1, 0xE2, 0xA2, 0x68, 0xA2, 0x69, 0xA2, 0x6A, /* 0x34-0x37 */
+	0xA2, 0x6B, 0xA2, 0x6C, 0xA2, 0x6D, 0xA2, 0x6E, /* 0x38-0x3B */
+	0xA2, 0x6F, 0xA2, 0x70, 0xA2, 0x71, 0xA2, 0x72, /* 0x3C-0x3F */
+	0xA2, 0x73, 0xA2, 0x74, 0xA2, 0x75, 0xA2, 0x76, /* 0x40-0x43 */
+	0xA2, 0x77, 0xA2, 0x78, 0xA2, 0x79, 0xA2, 0x7A, /* 0x44-0x47 */
+	0xA2, 0x81, 0xA2, 0x82, 0xA2, 0x83, 0xA2, 0x84, /* 0x48-0x4B */
+	0xA2, 0x85, 0xA2, 0x86, 0xA2, 0x87, 0xA2, 0x88, /* 0x4C-0x4F */
+	0xC1, 0xE3, 0xC1, 0xE4, 0xA2, 0x89, 0xA2, 0x8A, /* 0x50-0x53 */
+	0xC1, 0xE5, 0xA2, 0x8B, 0xA2, 0x8C, 0xA2, 0x8D, /* 0x54-0x57 */
+	0xC1, 0xE6, 0xA2, 0x8E, 0xA2, 0x8F, 0xA2, 0x90, /* 0x58-0x5B */
+	0xA2, 0x91, 0xA2, 0x92, 0xA2, 0x93, 0xA2, 0x94, /* 0x5C-0x5F */
+	0xC1, 0xE7, 0xC1, 0xE8, 0xA2, 0x95, 0xC1, 0xE9, /* 0x60-0x63 */
+	0xA2, 0x96, 0xA2, 0x97, 0xA2, 0x98, 0xA2, 0x99, /* 0x64-0x67 */
+	0xA2, 0x9A, 0xA2, 0x9B, 0xA2, 0x9C, 0xA2, 0x9D, /* 0x68-0x6B */
+	0xC1, 0xEA, 0xA2, 0x9E, 0xA2, 0x9F, 0xA2, 0xA0, /* 0x6C-0x6F */
+	0xC1, 0xEB, 0xA3, 0x41, 0xA3, 0x42, 0xA3, 0x43, /* 0x70-0x73 */
+	0xC1, 0xEC, 0xA3, 0x44, 0xA3, 0x45, 0xA3, 0x46, /* 0x74-0x77 */
+	0xA3, 0x47, 0xA3, 0x48, 0xA3, 0x49, 0xA3, 0x4A, /* 0x78-0x7B */
+	0xC1, 0xED, 0xA3, 0x4B, 0xA3, 0x4C, 0xA3, 0x4D, /* 0x7C-0x7F */
+	
+	0xA3, 0x4E, 0xA3, 0x4F, 0xA3, 0x50, 0xA3, 0x51, /* 0x80-0x83 */
+	0xA3, 0x52, 0xA3, 0x53, 0xA3, 0x54, 0xA3, 0x55, /* 0x84-0x87 */
+	0xC1, 0xEE, 0xC1, 0xEF, 0xA3, 0x56, 0xA3, 0x57, /* 0x88-0x8B */
+	0xC1, 0xF0, 0xA3, 0x58, 0xA3, 0x59, 0xA3, 0x5A, /* 0x8C-0x8F */
+	0xC1, 0xF1, 0xA3, 0x61, 0xA3, 0x62, 0xA3, 0x63, /* 0x90-0x93 */
+	0xA3, 0x64, 0xA3, 0x65, 0xA3, 0x66, 0xA3, 0x67, /* 0x94-0x97 */
+	0xC1, 0xF2, 0xC1, 0xF3, 0xA3, 0x68, 0xC1, 0xF4, /* 0x98-0x9B */
+	0xA3, 0x69, 0xC1, 0xF5, 0xA3, 0x6A, 0xA3, 0x6B, /* 0x9C-0x9F */
+	0xA3, 0x6C, 0xA3, 0x6D, 0xA3, 0x6E, 0xA3, 0x6F, /* 0xA0-0xA3 */
+	0xA3, 0x70, 0xA3, 0x71, 0xA3, 0x72, 0xA3, 0x73, /* 0xA4-0xA7 */
+	0xA3, 0x74, 0xA3, 0x75, 0xA3, 0x76, 0xA3, 0x77, /* 0xA8-0xAB */
+	0xA3, 0x78, 0xA3, 0x79, 0xA3, 0x7A, 0xA3, 0x81, /* 0xAC-0xAF */
+	0xA3, 0x82, 0xA3, 0x83, 0xA3, 0x84, 0xA3, 0x85, /* 0xB0-0xB3 */
+	0xA3, 0x86, 0xA3, 0x87, 0xA3, 0x88, 0xA3, 0x89, /* 0xB4-0xB7 */
+	0xA3, 0x8A, 0xA3, 0x8B, 0xA3, 0x8C, 0xA3, 0x8D, /* 0xB8-0xBB */
+	0xA3, 0x8E, 0xA3, 0x8F, 0xA3, 0x90, 0xA3, 0x91, /* 0xBC-0xBF */
+	0xC1, 0xF6, 0xC1, 0xF7, 0xA3, 0x92, 0xA3, 0x93, /* 0xC0-0xC3 */
+	0xC1, 0xF8, 0xA3, 0x94, 0xA3, 0x95, 0xC1, 0xF9, /* 0xC4-0xC7 */
+	0xC1, 0xFA, 0xA3, 0x96, 0xC1, 0xFB, 0xA3, 0x97, /* 0xC8-0xCB */
+	0xA3, 0x98, 0xA3, 0x99, 0xA3, 0x9A, 0xA3, 0x9B, /* 0xCC-0xCF */
+	0xC1, 0xFC, 0xC1, 0xFD, 0xA3, 0x9C, 0xC1, 0xFE, /* 0xD0-0xD3 */
+	0xA3, 0x9D, 0xC2, 0xA1, 0xC2, 0xA2, 0xA3, 0x9E, /* 0xD4-0xD7 */
+	0xA3, 0x9F, 0xC2, 0xA3, 0xC2, 0xA4, 0xA3, 0xA0, /* 0xD8-0xDB */
+	0xC2, 0xA5, 0xC2, 0xA6, 0xA4, 0x41, 0xA4, 0x42, /* 0xDC-0xDF */
+	0xC2, 0xA7, 0xA4, 0x43, 0xC2, 0xA8, 0xA4, 0x44, /* 0xE0-0xE3 */
+	0xC2, 0xA9, 0xA4, 0x45, 0xA4, 0x46, 0xC2, 0xAA, /* 0xE4-0xE7 */
+	0xA4, 0x47, 0xA4, 0x48, 0xA4, 0x49, 0xA4, 0x4A, /* 0xE8-0xEB */
+	0xC2, 0xAB, 0xC2, 0xAC, 0xA4, 0x4B, 0xC2, 0xAD, /* 0xEC-0xEF */
+	0xC2, 0xAE, 0xC2, 0xAF, 0xA4, 0x4C, 0xA4, 0x4D, /* 0xF0-0xF3 */
+	0xA4, 0x4E, 0xA4, 0x4F, 0xA4, 0x50, 0xA4, 0x51, /* 0xF4-0xF7 */
+	0xC2, 0xB0, 0xC2, 0xB1, 0xA4, 0x52, 0xA4, 0x53, /* 0xF8-0xFB */
+	0xC2, 0xB2, 0xA4, 0x54, 0xA4, 0x55, 0xA4, 0x56, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_CA[512] = {
+	0xC2, 0xB3, 0xA4, 0x57, 0xA4, 0x58, 0xA4, 0x59, /* 0x00-0x03 */
+	0xA4, 0x5A, 0xA4, 0x61, 0xA4, 0x62, 0xA4, 0x63, /* 0x04-0x07 */
+	0xC2, 0xB4, 0xC2, 0xB5, 0xA4, 0x64, 0xC2, 0xB6, /* 0x08-0x0B */
+	0xC2, 0xB7, 0xC2, 0xB8, 0xA4, 0x65, 0xA4, 0x66, /* 0x0C-0x0F */
+	0xA4, 0x67, 0xA4, 0x68, 0xA4, 0x69, 0xA4, 0x6A, /* 0x10-0x13 */
+	0xC2, 0xB9, 0xA4, 0x6B, 0xA4, 0x6C, 0xA4, 0x6D, /* 0x14-0x17 */
+	0xC2, 0xBA, 0xA4, 0x6E, 0xA4, 0x6F, 0xA4, 0x70, /* 0x18-0x1B */
+	0xA4, 0x71, 0xA4, 0x72, 0xA4, 0x73, 0xA4, 0x74, /* 0x1C-0x1F */
+	0xA4, 0x75, 0xA4, 0x76, 0xA4, 0x77, 0xA4, 0x78, /* 0x20-0x23 */
+	0xA4, 0x79, 0xA4, 0x7A, 0xA4, 0x81, 0xA4, 0x82, /* 0x24-0x27 */
+	0xA4, 0x83, 0xC2, 0xBB, 0xA4, 0x84, 0xA4, 0x85, /* 0x28-0x2B */
+	0xA4, 0x86, 0xA4, 0x87, 0xA4, 0x88, 0xA4, 0x89, /* 0x2C-0x2F */
+	0xA4, 0x8A, 0xA4, 0x8B, 0xA4, 0x8C, 0xA4, 0x8D, /* 0x30-0x33 */
+	0xA4, 0x8E, 0xA4, 0x8F, 0xA4, 0x90, 0xA4, 0x91, /* 0x34-0x37 */
+	0xA4, 0x92, 0xA4, 0x93, 0xA4, 0x94, 0xA4, 0x95, /* 0x38-0x3B */
+	0xA4, 0x96, 0xA4, 0x97, 0xA4, 0x98, 0xA4, 0x99, /* 0x3C-0x3F */
+	0xA4, 0x9A, 0xA4, 0x9B, 0xA4, 0x9C, 0xA4, 0x9D, /* 0x40-0x43 */
+	0xA4, 0x9E, 0xA4, 0x9F, 0xA4, 0xA0, 0xA5, 0x41, /* 0x44-0x47 */
+	0xA5, 0x42, 0xA5, 0x43, 0xA5, 0x44, 0xA5, 0x45, /* 0x48-0x4B */
+	0xC2, 0xBC, 0xC2, 0xBD, 0xA5, 0x46, 0xA5, 0x47, /* 0x4C-0x4F */
+	0xC2, 0xBE, 0xA5, 0x48, 0xA5, 0x49, 0xA5, 0x4A, /* 0x50-0x53 */
+	0xC2, 0xBF, 0xA5, 0x4B, 0xA5, 0x4C, 0xA5, 0x4D, /* 0x54-0x57 */
+	0xA5, 0x4E, 0xA5, 0x4F, 0xA5, 0x50, 0xA5, 0x51, /* 0x58-0x5B */
+	0xC2, 0xC0, 0xC2, 0xC1, 0xA5, 0x52, 0xC2, 0xC2, /* 0x5C-0x5F */
+	0xC2, 0xC3, 0xC2, 0xC4, 0xA5, 0x53, 0xA5, 0x54, /* 0x60-0x63 */
+	0xA5, 0x55, 0xA5, 0x56, 0xA5, 0x57, 0xA5, 0x58, /* 0x64-0x67 */
+	0xC2, 0xC5, 0xA5, 0x59, 0xA5, 0x5A, 0xA5, 0x61, /* 0x68-0x6B */
+	0xA5, 0x62, 0xA5, 0x63, 0xA5, 0x64, 0xA5, 0x65, /* 0x6C-0x6F */
+	0xA5, 0x66, 0xA5, 0x67, 0xA5, 0x68, 0xA5, 0x69, /* 0x70-0x73 */
+	0xA5, 0x6A, 0xA5, 0x6B, 0xA5, 0x6C, 0xA5, 0x6D, /* 0x74-0x77 */
+	0xA5, 0x6E, 0xA5, 0x6F, 0xA5, 0x70, 0xA5, 0x71, /* 0x78-0x7B */
+	0xA5, 0x72, 0xC2, 0xC6, 0xA5, 0x73, 0xA5, 0x74, /* 0x7C-0x7F */
+	
+	0xA5, 0x75, 0xA5, 0x76, 0xA5, 0x77, 0xA5, 0x78, /* 0x80-0x83 */
+	0xC2, 0xC7, 0xA5, 0x79, 0xA5, 0x7A, 0xA5, 0x81, /* 0x84-0x87 */
+	0xA5, 0x82, 0xA5, 0x83, 0xA5, 0x84, 0xA5, 0x85, /* 0x88-0x8B */
+	0xA5, 0x86, 0xA5, 0x87, 0xA5, 0x88, 0xA5, 0x89, /* 0x8C-0x8F */
+	0xA5, 0x8A, 0xA5, 0x8B, 0xA5, 0x8C, 0xA5, 0x8D, /* 0x90-0x93 */
+	0xA5, 0x8E, 0xA5, 0x8F, 0xA5, 0x90, 0xA5, 0x91, /* 0x94-0x97 */
+	0xC2, 0xC8, 0xA5, 0x92, 0xA5, 0x93, 0xA5, 0x94, /* 0x98-0x9B */
+	0xA5, 0x95, 0xA5, 0x96, 0xA5, 0x97, 0xA5, 0x98, /* 0x9C-0x9F */
+	0xA5, 0x99, 0xA5, 0x9A, 0xA5, 0x9B, 0xA5, 0x9C, /* 0xA0-0xA3 */
+	0xA5, 0x9D, 0xA5, 0x9E, 0xA5, 0x9F, 0xA5, 0xA0, /* 0xA4-0xA7 */
+	0xA6, 0x41, 0xA6, 0x42, 0xA6, 0x43, 0xA6, 0x44, /* 0xA8-0xAB */
+	0xA6, 0x45, 0xA6, 0x46, 0xA6, 0x47, 0xA6, 0x48, /* 0xAC-0xAF */
+	0xA6, 0x49, 0xA6, 0x4A, 0xA6, 0x4B, 0xA6, 0x4C, /* 0xB0-0xB3 */
+	0xA6, 0x4D, 0xA6, 0x4E, 0xA6, 0x4F, 0xA6, 0x50, /* 0xB4-0xB7 */
+	0xA6, 0x51, 0xA6, 0x52, 0xA6, 0x53, 0xA6, 0x54, /* 0xB8-0xBB */
+	0xC2, 0xC9, 0xC2, 0xCA, 0xA6, 0x55, 0xA6, 0x56, /* 0xBC-0xBF */
+	0xC2, 0xCB, 0xA6, 0x57, 0xA6, 0x58, 0xA6, 0x59, /* 0xC0-0xC3 */
+	0xC2, 0xCC, 0xA6, 0x5A, 0xA6, 0x61, 0xA6, 0x62, /* 0xC4-0xC7 */
+	0xA6, 0x63, 0xA6, 0x64, 0xA6, 0x65, 0xA6, 0x66, /* 0xC8-0xCB */
+	0xC2, 0xCD, 0xC2, 0xCE, 0xA6, 0x67, 0xC2, 0xCF, /* 0xCC-0xCF */
+	0xA6, 0x68, 0xC2, 0xD0, 0xA6, 0x69, 0xC2, 0xD1, /* 0xD0-0xD3 */
+	0xA6, 0x6A, 0xA6, 0x6B, 0xA6, 0x6C, 0xA6, 0x6D, /* 0xD4-0xD7 */
+	0xC2, 0xD2, 0xC2, 0xD3, 0xA6, 0x6E, 0xA6, 0x6F, /* 0xD8-0xDB */
+	0xA6, 0x70, 0xA6, 0x71, 0xA6, 0x72, 0xA6, 0x73, /* 0xDC-0xDF */
+	0xC2, 0xD4, 0xA6, 0x74, 0xA6, 0x75, 0xA6, 0x76, /* 0xE0-0xE3 */
+	0xA6, 0x77, 0xA6, 0x78, 0xA6, 0x79, 0xA6, 0x7A, /* 0xE4-0xE7 */
+	0xA6, 0x81, 0xA6, 0x82, 0xA6, 0x83, 0xA6, 0x84, /* 0xE8-0xEB */
+	0xC2, 0xD5, 0xA6, 0x85, 0xA6, 0x86, 0xA6, 0x87, /* 0xEC-0xEF */
+	0xA6, 0x88, 0xA6, 0x89, 0xA6, 0x8A, 0xA6, 0x8B, /* 0xF0-0xF3 */
+	0xC2, 0xD6, 0xA6, 0x8C, 0xA6, 0x8D, 0xA6, 0x8E, /* 0xF4-0xF7 */
+	0xA6, 0x8F, 0xA6, 0x90, 0xA6, 0x91, 0xA6, 0x92, /* 0xF8-0xFB */
+	0xA6, 0x93, 0xA6, 0x94, 0xA6, 0x95, 0xA6, 0x96, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_CB[512] = {
+	0xA6, 0x97, 0xA6, 0x98, 0xA6, 0x99, 0xA6, 0x9A, /* 0x00-0x03 */
+	0xA6, 0x9B, 0xA6, 0x9C, 0xA6, 0x9D, 0xA6, 0x9E, /* 0x04-0x07 */
+	0xC2, 0xD7, 0xA6, 0x9F, 0xA6, 0xA0, 0xA7, 0x41, /* 0x08-0x0B */
+	0xA7, 0x42, 0xA7, 0x43, 0xA7, 0x44, 0xA7, 0x45, /* 0x0C-0x0F */
+	0xC2, 0xD8, 0xA7, 0x46, 0xA7, 0x47, 0xA7, 0x48, /* 0x10-0x13 */
+	0xC2, 0xD9, 0xA7, 0x49, 0xA7, 0x4A, 0xA7, 0x4B, /* 0x14-0x17 */
+	0xC2, 0xDA, 0xA7, 0x4C, 0xA7, 0x4D, 0xA7, 0x4E, /* 0x18-0x1B */
+	0xA7, 0x4F, 0xA7, 0x50, 0xA7, 0x51, 0xA7, 0x52, /* 0x1C-0x1F */
+	0xC2, 0xDB, 0xC2, 0xDC, 0xA7, 0x53, 0xA7, 0x54, /* 0x20-0x23 */
+	0xA7, 0x55, 0xA7, 0x56, 0xA7, 0x57, 0xA7, 0x58, /* 0x24-0x27 */
+	0xA7, 0x59, 0xA7, 0x5A, 0xA7, 0x61, 0xA7, 0x62, /* 0x28-0x2B */
+	0xA7, 0x63, 0xA7, 0x64, 0xA7, 0x65, 0xA7, 0x66, /* 0x2C-0x2F */
+	0xA7, 0x67, 0xA7, 0x68, 0xA7, 0x69, 0xA7, 0x6A, /* 0x30-0x33 */
+	0xA7, 0x6B, 0xA7, 0x6C, 0xA7, 0x6D, 0xA7, 0x6E, /* 0x34-0x37 */
+	0xA7, 0x6F, 0xA7, 0x70, 0xA7, 0x71, 0xA7, 0x72, /* 0x38-0x3B */
+	0xA7, 0x73, 0xA7, 0x74, 0xA7, 0x75, 0xA7, 0x76, /* 0x3C-0x3F */
+	0xA7, 0x77, 0xC2, 0xDD, 0xA7, 0x78, 0xA7, 0x79, /* 0x40-0x43 */
+	0xA7, 0x7A, 0xA7, 0x81, 0xA7, 0x82, 0xA7, 0x83, /* 0x44-0x47 */
+	0xC2, 0xDE, 0xC2, 0xDF, 0xA7, 0x84, 0xA7, 0x85, /* 0x48-0x4B */
+	0xC2, 0xE0, 0xA7, 0x86, 0xA7, 0x87, 0xA7, 0x88, /* 0x4C-0x4F */
+	0xC2, 0xE1, 0xA7, 0x89, 0xA7, 0x8A, 0xA7, 0x8B, /* 0x50-0x53 */
+	0xA7, 0x8C, 0xA7, 0x8D, 0xA7, 0x8E, 0xA7, 0x8F, /* 0x54-0x57 */
+	0xC2, 0xE2, 0xC2, 0xE3, 0xA7, 0x90, 0xA7, 0x91, /* 0x58-0x5B */
+	0xA7, 0x92, 0xC2, 0xE4, 0xA7, 0x93, 0xA7, 0x94, /* 0x5C-0x5F */
+	0xA7, 0x95, 0xA7, 0x96, 0xA7, 0x97, 0xA7, 0x98, /* 0x60-0x63 */
+	0xC2, 0xE5, 0xA7, 0x99, 0xA7, 0x9A, 0xA7, 0x9B, /* 0x64-0x67 */
+	0xA7, 0x9C, 0xA7, 0x9D, 0xA7, 0x9E, 0xA7, 0x9F, /* 0x68-0x6B */
+	0xA7, 0xA0, 0xA8, 0x41, 0xA8, 0x42, 0xA8, 0x43, /* 0x6C-0x6F */
+	0xA8, 0x44, 0xA8, 0x45, 0xA8, 0x46, 0xA8, 0x47, /* 0x70-0x73 */
+	0xA8, 0x48, 0xA8, 0x49, 0xA8, 0x4A, 0xA8, 0x4B, /* 0x74-0x77 */
+	0xC2, 0xE6, 0xC2, 0xE7, 0xA8, 0x4C, 0xA8, 0x4D, /* 0x78-0x7B */
+	0xA8, 0x4E, 0xA8, 0x4F, 0xA8, 0x50, 0xA8, 0x51, /* 0x7C-0x7F */
+	
+	0xA8, 0x52, 0xA8, 0x53, 0xA8, 0x54, 0xA8, 0x55, /* 0x80-0x83 */
+	0xA8, 0x56, 0xA8, 0x57, 0xA8, 0x58, 0xA8, 0x59, /* 0x84-0x87 */
+	0xA8, 0x5A, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x88-0x8B */
+	0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x8C-0x8F */
+	0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x90-0x93 */
+	0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x94-0x97 */
+	0xA8, 0x70, 0xA8, 0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x98-0x9B */
+	0xC2, 0xE8, 0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, /* 0x9C-0x9F */
+	0xA8, 0x77, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0xA0-0xA3 */
+	0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, 0xA8, 0x84, /* 0xA4-0xA7 */
+	0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, 0xA8, 0x88, /* 0xA8-0xAB */
+	0xA8, 0x89, 0xA8, 0x8A, 0xA8, 0x8B, 0xA8, 0x8C, /* 0xAC-0xAF */
+	0xA8, 0x8D, 0xA8, 0x8E, 0xA8, 0x8F, 0xA8, 0x90, /* 0xB0-0xB3 */
+	0xA8, 0x91, 0xA8, 0x92, 0xA8, 0x93, 0xA8, 0x94, /* 0xB4-0xB7 */
+	0xC2, 0xE9, 0xA8, 0x95, 0xA8, 0x96, 0xA8, 0x97, /* 0xB8-0xBB */
+	0xA8, 0x98, 0xA8, 0x99, 0xA8, 0x9A, 0xA8, 0x9B, /* 0xBC-0xBF */
+	0xA8, 0x9C, 0xA8, 0x9D, 0xA8, 0x9E, 0xA8, 0x9F, /* 0xC0-0xC3 */
+	0xA8, 0xA0, 0xA9, 0x41, 0xA9, 0x42, 0xA9, 0x43, /* 0xC4-0xC7 */
+	0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, 0xA9, 0x47, /* 0xC8-0xCB */
+	0xA9, 0x48, 0xA9, 0x49, 0xA9, 0x4A, 0xA9, 0x4B, /* 0xCC-0xCF */
+	0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0xA9, 0x4F, /* 0xD0-0xD3 */
+	0xC2, 0xEA, 0xA9, 0x50, 0xA9, 0x51, 0xA9, 0x52, /* 0xD4-0xD7 */
+	0xA9, 0x53, 0xA9, 0x54, 0xA9, 0x55, 0xA9, 0x56, /* 0xD8-0xDB */
+	0xA9, 0x57, 0xA9, 0x58, 0xA9, 0x59, 0xA9, 0x5A, /* 0xDC-0xDF */
+	0xA9, 0x61, 0xA9, 0x62, 0xA9, 0x63, 0xA9, 0x64, /* 0xE0-0xE3 */
+	0xC2, 0xEB, 0xA9, 0x65, 0xA9, 0x66, 0xC2, 0xEC, /* 0xE4-0xE7 */
+	0xA9, 0x67, 0xC2, 0xED, 0xA9, 0x68, 0xA9, 0x69, /* 0xE8-0xEB */
+	0xA9, 0x6A, 0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, /* 0xEC-0xEF */
+	0xA9, 0x6E, 0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, /* 0xF0-0xF3 */
+	0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0xF4-0xF7 */
+	0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, 0xA9, 0x79, /* 0xF8-0xFB */
+	0xA9, 0x7A, 0xA9, 0x81, 0xA9, 0x82, 0xA9, 0x83, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_CC[512] = {
+	0xA9, 0x84, 0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, /* 0x00-0x03 */
+	0xA9, 0x88, 0xA9, 0x89, 0xA9, 0x8A, 0xA9, 0x8B, /* 0x04-0x07 */
+	0xA9, 0x8C, 0xA9, 0x8D, 0xA9, 0x8E, 0xA9, 0x8F, /* 0x08-0x0B */
+	0xC2, 0xEE, 0xC2, 0xEF, 0xA9, 0x90, 0xA9, 0x91, /* 0x0C-0x0F */
+	0xC2, 0xF0, 0xA9, 0x92, 0xA9, 0x93, 0xA9, 0x94, /* 0x10-0x13 */
+	0xC2, 0xF1, 0xA9, 0x95, 0xA9, 0x96, 0xA9, 0x97, /* 0x14-0x17 */
+	0xA9, 0x98, 0xA9, 0x99, 0xA9, 0x9A, 0xA9, 0x9B, /* 0x18-0x1B */
+	0xC2, 0xF2, 0xC2, 0xF3, 0xA9, 0x9C, 0xA9, 0x9D, /* 0x1C-0x1F */
+	0xA9, 0x9E, 0xC2, 0xF4, 0xC2, 0xF5, 0xA9, 0x9F, /* 0x20-0x23 */
+	0xA9, 0xA0, 0xAA, 0x41, 0xAA, 0x42, 0xC2, 0xF6, /* 0x24-0x27 */
+	0xC2, 0xF7, 0xC2, 0xF8, 0xAA, 0x43, 0xAA, 0x44, /* 0x28-0x2B */
+	0xC2, 0xF9, 0xAA, 0x45, 0xC2, 0xFA, 0xAA, 0x46, /* 0x2C-0x2F */
+	0xC2, 0xFB, 0xAA, 0x47, 0xAA, 0x48, 0xAA, 0x49, /* 0x30-0x33 */
+	0xAA, 0x4A, 0xAA, 0x4B, 0xAA, 0x4C, 0xAA, 0x4D, /* 0x34-0x37 */
+	0xC2, 0xFC, 0xC2, 0xFD, 0xAA, 0x4E, 0xC2, 0xFE, /* 0x38-0x3B */
+	0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, 0xAA, 0x4F, /* 0x3C-0x3F */
+	0xAA, 0x50, 0xAA, 0x51, 0xAA, 0x52, 0xAA, 0x53, /* 0x40-0x43 */
+	0xC3, 0xA4, 0xC3, 0xA5, 0xAA, 0x54, 0xAA, 0x55, /* 0x44-0x47 */
+	0xC3, 0xA6, 0xAA, 0x56, 0xAA, 0x57, 0xAA, 0x58, /* 0x48-0x4B */
+	0xC3, 0xA7, 0xAA, 0x59, 0xAA, 0x5A, 0xAA, 0x61, /* 0x4C-0x4F */
+	0xAA, 0x62, 0xAA, 0x63, 0xAA, 0x64, 0xAA, 0x65, /* 0x50-0x53 */
+	0xC3, 0xA8, 0xC3, 0xA9, 0xAA, 0x66, 0xC3, 0xAA, /* 0x54-0x57 */
+	0xC3, 0xAB, 0xC3, 0xAC, 0xAA, 0x67, 0xAA, 0x68, /* 0x58-0x5B */
+	0xAA, 0x69, 0xAA, 0x6A, 0xAA, 0x6B, 0xAA, 0x6C, /* 0x5C-0x5F */
+	0xC3, 0xAD, 0xAA, 0x6D, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x60-0x63 */
+	0xC3, 0xAE, 0xAA, 0x70, 0xC3, 0xAF, 0xAA, 0x71, /* 0x64-0x67 */
+	0xC3, 0xB0, 0xAA, 0x72, 0xAA, 0x73, 0xAA, 0x74, /* 0x68-0x6B */
+	0xAA, 0x75, 0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, /* 0x6C-0x6F */
+	0xC3, 0xB1, 0xAA, 0x79, 0xAA, 0x7A, 0xAA, 0x81, /* 0x70-0x73 */
+	0xAA, 0x82, 0xC3, 0xB2, 0xAA, 0x83, 0xAA, 0x84, /* 0x74-0x77 */
+	0xAA, 0x85, 0xAA, 0x86, 0xAA, 0x87, 0xAA, 0x88, /* 0x78-0x7B */
+	0xAA, 0x89, 0xAA, 0x8A, 0xAA, 0x8B, 0xAA, 0x8C, /* 0x7C-0x7F */
+	
+	0xAA, 0x8D, 0xAA, 0x8E, 0xAA, 0x8F, 0xAA, 0x90, /* 0x80-0x83 */
+	0xAA, 0x91, 0xAA, 0x92, 0xAA, 0x93, 0xAA, 0x94, /* 0x84-0x87 */
+	0xAA, 0x95, 0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, /* 0x88-0x8B */
+	0xAA, 0x99, 0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, /* 0x8C-0x8F */
+	0xAA, 0x9D, 0xAA, 0x9E, 0xAA, 0x9F, 0xAA, 0xA0, /* 0x90-0x93 */
+	0xAB, 0x41, 0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, /* 0x94-0x97 */
+	0xC3, 0xB3, 0xC3, 0xB4, 0xAB, 0x45, 0xAB, 0x46, /* 0x98-0x9B */
+	0xC3, 0xB5, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x9C-0x9F */
+	0xC3, 0xB6, 0xAB, 0x4A, 0xAB, 0x4B, 0xAB, 0x4C, /* 0xA0-0xA3 */
+	0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0xA4-0xA7 */
+	0xC3, 0xB7, 0xC3, 0xB8, 0xAB, 0x51, 0xC3, 0xB9, /* 0xA8-0xAB */
+	0xC3, 0xBA, 0xC3, 0xBB, 0xAB, 0x52, 0xAB, 0x53, /* 0xAC-0xAF */
+	0xAB, 0x54, 0xAB, 0x55, 0xAB, 0x56, 0xAB, 0x57, /* 0xB0-0xB3 */
+	0xC3, 0xBC, 0xC3, 0xBD, 0xAB, 0x58, 0xAB, 0x59, /* 0xB4-0xB7 */
+	0xC3, 0xBE, 0xAB, 0x5A, 0xAB, 0x61, 0xAB, 0x62, /* 0xB8-0xBB */
+	0xC3, 0xBF, 0xAB, 0x63, 0xAB, 0x64, 0xAB, 0x65, /* 0xBC-0xBF */
+	0xAB, 0x66, 0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, /* 0xC0-0xC3 */
+	0xC3, 0xC0, 0xC3, 0xC1, 0xAB, 0x6A, 0xC3, 0xC2, /* 0xC4-0xC7 */
+	0xAB, 0x6B, 0xC3, 0xC3, 0xAB, 0x6C, 0xAB, 0x6D, /* 0xC8-0xCB */
+	0xAB, 0x6E, 0xAB, 0x6F, 0xAB, 0x70, 0xAB, 0x71, /* 0xCC-0xCF */
+	0xC3, 0xC4, 0xAB, 0x72, 0xAB, 0x73, 0xAB, 0x74, /* 0xD0-0xD3 */
+	0xC3, 0xC5, 0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, /* 0xD4-0xD7 */
+	0xAB, 0x78, 0xAB, 0x79, 0xAB, 0x7A, 0xAB, 0x81, /* 0xD8-0xDB */
+	0xAB, 0x82, 0xAB, 0x83, 0xAB, 0x84, 0xAB, 0x85, /* 0xDC-0xDF */
+	0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, 0xAB, 0x89, /* 0xE0-0xE3 */
+	0xC3, 0xC6, 0xAB, 0x8A, 0xAB, 0x8B, 0xAB, 0x8C, /* 0xE4-0xE7 */
+	0xAB, 0x8D, 0xAB, 0x8E, 0xAB, 0x8F, 0xAB, 0x90, /* 0xE8-0xEB */
+	0xC3, 0xC7, 0xAB, 0x91, 0xAB, 0x92, 0xAB, 0x93, /* 0xEC-0xEF */
+	0xC3, 0xC8, 0xAB, 0x94, 0xAB, 0x95, 0xAB, 0x96, /* 0xF0-0xF3 */
+	0xAB, 0x97, 0xAB, 0x98, 0xAB, 0x99, 0xAB, 0x9A, /* 0xF4-0xF7 */
+	0xAB, 0x9B, 0xAB, 0x9C, 0xAB, 0x9D, 0xAB, 0x9E, /* 0xF8-0xFB */
+	0xAB, 0x9F, 0xAB, 0xA0, 0xAC, 0x41, 0xAC, 0x42, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_CD[512] = {
+	0xAC, 0x43, 0xC3, 0xC9, 0xAC, 0x44, 0xAC, 0x45, /* 0x00-0x03 */
+	0xAC, 0x46, 0xAC, 0x47, 0xAC, 0x48, 0xAC, 0x49, /* 0x04-0x07 */
+	0xC3, 0xCA, 0xC3, 0xCB, 0xAC, 0x4A, 0xAC, 0x4B, /* 0x08-0x0B */
+	0xC3, 0xCC, 0xAC, 0x4C, 0xAC, 0x4D, 0xAC, 0x4E, /* 0x0C-0x0F */
+	0xC3, 0xCD, 0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, /* 0x10-0x13 */
+	0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, 0xAC, 0x55, /* 0x14-0x17 */
+	0xC3, 0xCE, 0xC3, 0xCF, 0xAC, 0x56, 0xC3, 0xD0, /* 0x18-0x1B */
+	0xAC, 0x57, 0xC3, 0xD1, 0xAC, 0x58, 0xAC, 0x59, /* 0x1C-0x1F */
+	0xAC, 0x5A, 0xAC, 0x61, 0xAC, 0x62, 0xAC, 0x63, /* 0x20-0x23 */
+	0xC3, 0xD2, 0xAC, 0x64, 0xAC, 0x65, 0xAC, 0x66, /* 0x24-0x27 */
+	0xC3, 0xD3, 0xAC, 0x67, 0xAC, 0x68, 0xAC, 0x69, /* 0x28-0x2B */
+	0xC3, 0xD4, 0xAC, 0x6A, 0xAC, 0x6B, 0xAC, 0x6C, /* 0x2C-0x2F */
+	0xAC, 0x6D, 0xAC, 0x6E, 0xAC, 0x6F, 0xAC, 0x70, /* 0x30-0x33 */
+	0xAC, 0x71, 0xAC, 0x72, 0xAC, 0x73, 0xAC, 0x74, /* 0x34-0x37 */
+	0xAC, 0x75, 0xC3, 0xD5, 0xAC, 0x76, 0xAC, 0x77, /* 0x38-0x3B */
+	0xAC, 0x78, 0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x81, /* 0x3C-0x3F */
+	0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x40-0x43 */
+	0xAC, 0x86, 0xAC, 0x87, 0xAC, 0x88, 0xAC, 0x89, /* 0x44-0x47 */
+	0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x48-0x4B */
+	0xAC, 0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x4C-0x4F */
+	0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x50-0x53 */
+	0xAC, 0x96, 0xAC, 0x97, 0xAC, 0x98, 0xAC, 0x99, /* 0x54-0x57 */
+	0xAC, 0x9A, 0xAC, 0x9B, 0xAC, 0x9C, 0xAC, 0x9D, /* 0x58-0x5B */
+	0xC3, 0xD6, 0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, /* 0x5C-0x5F */
+	0xC3, 0xD7, 0xAD, 0x41, 0xAD, 0x42, 0xAD, 0x43, /* 0x60-0x63 */
+	0xC3, 0xD8, 0xAD, 0x44, 0xAD, 0x45, 0xAD, 0x46, /* 0x64-0x67 */
+	0xAD, 0x47, 0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, /* 0x68-0x6B */
+	0xC3, 0xD9, 0xC3, 0xDA, 0xAD, 0x4B, 0xC3, 0xDB, /* 0x6C-0x6F */
+	0xAD, 0x4C, 0xC3, 0xDC, 0xAD, 0x4D, 0xAD, 0x4E, /* 0x70-0x73 */
+	0xAD, 0x4F, 0xAD, 0x50, 0xAD, 0x51, 0xAD, 0x52, /* 0x74-0x77 */
+	0xC3, 0xDD, 0xAD, 0x53, 0xAD, 0x54, 0xAD, 0x55, /* 0x78-0x7B */
+	0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, 0xAD, 0x59, /* 0x7C-0x7F */
+	
+	0xAD, 0x5A, 0xAD, 0x61, 0xAD, 0x62, 0xAD, 0x63, /* 0x80-0x83 */
+	0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0x84-0x87 */
+	0xC3, 0xDE, 0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, /* 0x88-0x8B */
+	0xAD, 0x6B, 0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, /* 0x8C-0x8F */
+	0xAD, 0x6F, 0xAD, 0x70, 0xAD, 0x71, 0xAD, 0x72, /* 0x90-0x93 */
+	0xC3, 0xDF, 0xC3, 0xE0, 0xAD, 0x73, 0xAD, 0x74, /* 0x94-0x97 */
+	0xC3, 0xE1, 0xAD, 0x75, 0xAD, 0x76, 0xAD, 0x77, /* 0x98-0x9B */
+	0xC3, 0xE2, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0x9C-0x9F */
+	0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, 0xAD, 0x84, /* 0xA0-0xA3 */
+	0xC3, 0xE3, 0xC3, 0xE4, 0xAD, 0x85, 0xC3, 0xE5, /* 0xA4-0xA7 */
+	0xAD, 0x86, 0xC3, 0xE6, 0xAD, 0x87, 0xAD, 0x88, /* 0xA8-0xAB */
+	0xAD, 0x89, 0xAD, 0x8A, 0xAD, 0x8B, 0xAD, 0x8C, /* 0xAC-0xAF */
+	0xC3, 0xE7, 0xAD, 0x8D, 0xAD, 0x8E, 0xAD, 0x8F, /* 0xB0-0xB3 */
+	0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, 0xAD, 0x93, /* 0xB4-0xB7 */
+	0xAD, 0x94, 0xAD, 0x95, 0xAD, 0x96, 0xAD, 0x97, /* 0xB8-0xBB */
+	0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xBC-0xBF */
+	0xAD, 0x9C, 0xAD, 0x9D, 0xAD, 0x9E, 0xAD, 0x9F, /* 0xC0-0xC3 */
+	0xC3, 0xE8, 0xAD, 0xA0, 0xAE, 0x41, 0xAE, 0x42, /* 0xC4-0xC7 */
+	0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, 0xAE, 0x46, /* 0xC8-0xCB */
+	0xC3, 0xE9, 0xAE, 0x47, 0xAE, 0x48, 0xAE, 0x49, /* 0xCC-0xCF */
+	0xC3, 0xEA, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0xD0-0xD3 */
+	0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, 0xAE, 0x50, /* 0xD4-0xD7 */
+	0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, 0xAE, 0x54, /* 0xD8-0xDB */
+	0xAE, 0x55, 0xAE, 0x56, 0xAE, 0x57, 0xAE, 0x58, /* 0xDC-0xDF */
+	0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x61, 0xAE, 0x62, /* 0xE0-0xE3 */
+	0xAE, 0x63, 0xAE, 0x64, 0xAE, 0x65, 0xAE, 0x66, /* 0xE4-0xE7 */
+	0xC3, 0xEB, 0xAE, 0x67, 0xAE, 0x68, 0xAE, 0x69, /* 0xE8-0xEB */
+	0xC3, 0xEC, 0xAE, 0x6A, 0xAE, 0x6B, 0xAE, 0x6C, /* 0xEC-0xEF */
+	0xC3, 0xED, 0xAE, 0x6D, 0xAE, 0x6E, 0xAE, 0x6F, /* 0xF0-0xF3 */
+	0xAE, 0x70, 0xAE, 0x71, 0xAE, 0x72, 0xAE, 0x73, /* 0xF4-0xF7 */
+	0xC3, 0xEE, 0xC3, 0xEF, 0xAE, 0x74, 0xC3, 0xF0, /* 0xF8-0xFB */
+	0xAE, 0x75, 0xC3, 0xF1, 0xAE, 0x76, 0xAE, 0x77, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_CE[512] = {
+	0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, 0xAE, 0x81, /* 0x00-0x03 */
+	0xC3, 0xF2, 0xAE, 0x82, 0xAE, 0x83, 0xAE, 0x84, /* 0x04-0x07 */
+	0xC3, 0xF3, 0xAE, 0x85, 0xAE, 0x86, 0xAE, 0x87, /* 0x08-0x0B */
+	0xC3, 0xF4, 0xAE, 0x88, 0xAE, 0x89, 0xAE, 0x8A, /* 0x0C-0x0F */
+	0xAE, 0x8B, 0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, /* 0x10-0x13 */
+	0xC3, 0xF5, 0xAE, 0x8F, 0xAE, 0x90, 0xAE, 0x91, /* 0x14-0x17 */
+	0xAE, 0x92, 0xC3, 0xF6, 0xAE, 0x93, 0xAE, 0x94, /* 0x18-0x1B */
+	0xAE, 0x95, 0xAE, 0x96, 0xAE, 0x97, 0xAE, 0x98, /* 0x1C-0x1F */
+	0xC3, 0xF7, 0xC3, 0xF8, 0xAE, 0x99, 0xAE, 0x9A, /* 0x20-0x23 */
+	0xC3, 0xF9, 0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, /* 0x24-0x27 */
+	0xC3, 0xFA, 0xAE, 0x9E, 0xAE, 0x9F, 0xAE, 0xA0, /* 0x28-0x2B */
+	0xAF, 0x41, 0xAF, 0x42, 0xAF, 0x43, 0xAF, 0x44, /* 0x2C-0x2F */
+	0xC3, 0xFB, 0xC3, 0xFC, 0xAF, 0x45, 0xC3, 0xFD, /* 0x30-0x33 */
+	0xAF, 0x46, 0xC3, 0xFE, 0xAF, 0x47, 0xAF, 0x48, /* 0x34-0x37 */
+	0xAF, 0x49, 0xAF, 0x4A, 0xAF, 0x4B, 0xAF, 0x4C, /* 0x38-0x3B */
+	0xAF, 0x4D, 0xAF, 0x4E, 0xAF, 0x4F, 0xAF, 0x50, /* 0x3C-0x3F */
+	0xAF, 0x51, 0xAF, 0x52, 0xAF, 0x53, 0xAF, 0x54, /* 0x40-0x43 */
+	0xAF, 0x55, 0xAF, 0x56, 0xAF, 0x57, 0xAF, 0x58, /* 0x44-0x47 */
+	0xAF, 0x59, 0xAF, 0x5A, 0xAF, 0x61, 0xAF, 0x62, /* 0x48-0x4B */
+	0xAF, 0x63, 0xAF, 0x64, 0xAF, 0x65, 0xAF, 0x66, /* 0x4C-0x4F */
+	0xAF, 0x67, 0xAF, 0x68, 0xAF, 0x69, 0xAF, 0x6A, /* 0x50-0x53 */
+	0xAF, 0x6B, 0xAF, 0x6C, 0xAF, 0x6D, 0xAF, 0x6E, /* 0x54-0x57 */
+	0xC4, 0xA1, 0xC4, 0xA2, 0xAF, 0x6F, 0xAF, 0x70, /* 0x58-0x5B */
+	0xC4, 0xA3, 0xAF, 0x71, 0xAF, 0x72, 0xC4, 0xA4, /* 0x5C-0x5F */
+	0xC4, 0xA5, 0xC4, 0xA6, 0xAF, 0x73, 0xAF, 0x74, /* 0x60-0x63 */
+	0xAF, 0x75, 0xAF, 0x76, 0xAF, 0x77, 0xAF, 0x78, /* 0x64-0x67 */
+	0xC4, 0xA7, 0xC4, 0xA8, 0xAF, 0x79, 0xC4, 0xA9, /* 0x68-0x6B */
+	0xAF, 0x7A, 0xC4, 0xAA, 0xAF, 0x81, 0xAF, 0x82, /* 0x6C-0x6F */
+	0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, 0xAF, 0x86, /* 0x70-0x73 */
+	0xC4, 0xAB, 0xC4, 0xAC, 0xAF, 0x87, 0xAF, 0x88, /* 0x74-0x77 */
+	0xC4, 0xAD, 0xAF, 0x89, 0xAF, 0x8A, 0xAF, 0x8B, /* 0x78-0x7B */
+	0xC4, 0xAE, 0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, /* 0x7C-0x7F */
+	
+	0xAF, 0x8F, 0xAF, 0x90, 0xAF, 0x91, 0xAF, 0x92, /* 0x80-0x83 */
+	0xC4, 0xAF, 0xC4, 0xB0, 0xAF, 0x93, 0xC4, 0xB1, /* 0x84-0x87 */
+	0xAF, 0x94, 0xC4, 0xB2, 0xAF, 0x95, 0xAF, 0x96, /* 0x88-0x8B */
+	0xAF, 0x97, 0xAF, 0x98, 0xAF, 0x99, 0xAF, 0x9A, /* 0x8C-0x8F */
+	0xC4, 0xB3, 0xC4, 0xB4, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x90-0x93 */
+	0xC4, 0xB5, 0xAF, 0x9D, 0xAF, 0x9E, 0xAF, 0x9F, /* 0x94-0x97 */
+	0xC4, 0xB6, 0xAF, 0xA0, 0xB0, 0x41, 0xB0, 0x42, /* 0x98-0x9B */
+	0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x9C-0x9F */
+	0xC4, 0xB7, 0xC4, 0xB8, 0xB0, 0x47, 0xC4, 0xB9, /* 0xA0-0xA3 */
+	0xC4, 0xBA, 0xC4, 0xBB, 0xB0, 0x48, 0xB0, 0x49, /* 0xA4-0xA7 */
+	0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, 0xB0, 0x4D, /* 0xA8-0xAB */
+	0xC4, 0xBC, 0xC4, 0xBD, 0xB0, 0x4E, 0xB0, 0x4F, /* 0xAC-0xAF */
+	0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, 0xB0, 0x53, /* 0xB0-0xB3 */
+	0xB0, 0x54, 0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, /* 0xB4-0xB7 */
+	0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x61, /* 0xB8-0xBB */
+	0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0xBC-0xBF */
+	0xB0, 0x66, 0xC4, 0xBE, 0xB0, 0x67, 0xB0, 0x68, /* 0xC0-0xC3 */
+	0xB0, 0x69, 0xB0, 0x6A, 0xB0, 0x6B, 0xB0, 0x6C, /* 0xC4-0xC7 */
+	0xB0, 0x6D, 0xB0, 0x6E, 0xB0, 0x6F, 0xB0, 0x70, /* 0xC8-0xCB */
+	0xB0, 0x71, 0xB0, 0x72, 0xB0, 0x73, 0xB0, 0x74, /* 0xCC-0xCF */
+	0xB0, 0x75, 0xB0, 0x76, 0xB0, 0x77, 0xB0, 0x78, /* 0xD0-0xD3 */
+	0xB0, 0x79, 0xB0, 0x7A, 0xB0, 0x81, 0xB0, 0x82, /* 0xD4-0xD7 */
+	0xB0, 0x83, 0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, /* 0xD8-0xDB */
+	0xB0, 0x87, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xDC-0xDF */
+	0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xE0-0xE3 */
+	0xC4, 0xBF, 0xC4, 0xC0, 0xB0, 0x8F, 0xB0, 0x90, /* 0xE4-0xE7 */
+	0xC4, 0xC1, 0xB0, 0x91, 0xB0, 0x92, 0xC4, 0xC2, /* 0xE8-0xEB */
+	0xC4, 0xC3, 0xB0, 0x93, 0xB0, 0x94, 0xB0, 0x95, /* 0xEC-0xEF */
+	0xB0, 0x96, 0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, /* 0xF0-0xF3 */
+	0xC4, 0xC4, 0xC4, 0xC5, 0xB0, 0x9A, 0xC4, 0xC6, /* 0xF4-0xF7 */
+	0xC4, 0xC7, 0xC4, 0xC8, 0xB0, 0x9B, 0xB0, 0x9C, /* 0xF8-0xFB */
+	0xB0, 0x9D, 0xB0, 0x9E, 0xB0, 0x9F, 0xB0, 0xA0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_CF[512] = {
+	0xC4, 0xC9, 0xC4, 0xCA, 0xB1, 0x41, 0xB1, 0x42, /* 0x00-0x03 */
+	0xC4, 0xCB, 0xB1, 0x43, 0xB1, 0x44, 0xB1, 0x45, /* 0x04-0x07 */
+	0xC4, 0xCC, 0xB1, 0x46, 0xB1, 0x47, 0xB1, 0x48, /* 0x08-0x0B */
+	0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xB1, 0x4C, /* 0x0C-0x0F */
+	0xC4, 0xCD, 0xC4, 0xCE, 0xB1, 0x4D, 0xC4, 0xCF, /* 0x10-0x13 */
+	0xB1, 0x4E, 0xC4, 0xD0, 0xB1, 0x4F, 0xB1, 0x50, /* 0x14-0x17 */
+	0xB1, 0x51, 0xB1, 0x52, 0xB1, 0x53, 0xB1, 0x54, /* 0x18-0x1B */
+	0xC4, 0xD1, 0xB1, 0x55, 0xB1, 0x56, 0xB1, 0x57, /* 0x1C-0x1F */
+	0xC4, 0xD2, 0xB1, 0x58, 0xB1, 0x59, 0xB1, 0x5A, /* 0x20-0x23 */
+	0xC4, 0xD3, 0xB1, 0x61, 0xB1, 0x62, 0xB1, 0x63, /* 0x24-0x27 */
+	0xB1, 0x64, 0xB1, 0x65, 0xB1, 0x66, 0xB1, 0x67, /* 0x28-0x2B */
+	0xC4, 0xD4, 0xC4, 0xD5, 0xB1, 0x68, 0xC4, 0xD6, /* 0x2C-0x2F */
+	0xC4, 0xD7, 0xC4, 0xD8, 0xB1, 0x69, 0xB1, 0x6A, /* 0x30-0x33 */
+	0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x34-0x37 */
+	0xC4, 0xD9, 0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, /* 0x38-0x3B */
+	0xB1, 0x72, 0xB1, 0x73, 0xB1, 0x74, 0xB1, 0x75, /* 0x3C-0x3F */
+	0xB1, 0x76, 0xB1, 0x77, 0xB1, 0x78, 0xB1, 0x79, /* 0x40-0x43 */
+	0xB1, 0x7A, 0xB1, 0x81, 0xB1, 0x82, 0xB1, 0x83, /* 0x44-0x47 */
+	0xB1, 0x84, 0xB1, 0x85, 0xB1, 0x86, 0xB1, 0x87, /* 0x48-0x4B */
+	0xB1, 0x88, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x4C-0x4F */
+	0xB1, 0x8C, 0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, /* 0x50-0x53 */
+	0xC4, 0xDA, 0xC4, 0xDB, 0xB1, 0x90, 0xB1, 0x91, /* 0x54-0x57 */
+	0xC4, 0xDC, 0xB1, 0x92, 0xB1, 0x93, 0xB1, 0x94, /* 0x58-0x5B */
+	0xC4, 0xDD, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x5C-0x5F */
+	0xB1, 0x98, 0xB1, 0x99, 0xB1, 0x9A, 0xB1, 0x9B, /* 0x60-0x63 */
+	0xC4, 0xDE, 0xC4, 0xDF, 0xB1, 0x9C, 0xC4, 0xE0, /* 0x64-0x67 */
+	0xB1, 0x9D, 0xC4, 0xE1, 0xB1, 0x9E, 0xB1, 0x9F, /* 0x68-0x6B */
+	0xB1, 0xA0, 0xB2, 0x41, 0xB2, 0x42, 0xB2, 0x43, /* 0x6C-0x6F */
+	0xC4, 0xE2, 0xC4, 0xE3, 0xB2, 0x44, 0xB2, 0x45, /* 0x70-0x73 */
+	0xC4, 0xE4, 0xB2, 0x46, 0xB2, 0x47, 0xB2, 0x48, /* 0x74-0x77 */
+	0xC4, 0xE5, 0xB2, 0x49, 0xB2, 0x4A, 0xB2, 0x4B, /* 0x78-0x7B */
+	0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, 0xB2, 0x4F, /* 0x7C-0x7F */
+	
+	0xC4, 0xE6, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x80-0x83 */
+	0xB2, 0x53, 0xC4, 0xE7, 0xB2, 0x54, 0xB2, 0x55, /* 0x84-0x87 */
+	0xB2, 0x56, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x88-0x8B */
+	0xC4, 0xE8, 0xB2, 0x5A, 0xB2, 0x61, 0xB2, 0x62, /* 0x8C-0x8F */
+	0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x90-0x93 */
+	0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x94-0x97 */
+	0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xB2, 0x6E, /* 0x98-0x9B */
+	0xB2, 0x6F, 0xB2, 0x70, 0xB2, 0x71, 0xB2, 0x72, /* 0x9C-0x9F */
+	0xB2, 0x73, 0xC4, 0xE9, 0xB2, 0x74, 0xB2, 0x75, /* 0xA0-0xA3 */
+	0xB2, 0x76, 0xB2, 0x77, 0xB2, 0x78, 0xB2, 0x79, /* 0xA4-0xA7 */
+	0xC4, 0xEA, 0xB2, 0x7A, 0xB2, 0x81, 0xB2, 0x82, /* 0xA8-0xAB */
+	0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, 0xB2, 0x86, /* 0xAC-0xAF */
+	0xC4, 0xEB, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xB0-0xB3 */
+	0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xB4-0xB7 */
+	0xB2, 0x8E, 0xB2, 0x8F, 0xB2, 0x90, 0xB2, 0x91, /* 0xB8-0xBB */
+	0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, 0xB2, 0x95, /* 0xBC-0xBF */
+	0xB2, 0x96, 0xB2, 0x97, 0xB2, 0x98, 0xB2, 0x99, /* 0xC0-0xC3 */
+	0xC4, 0xEC, 0xB2, 0x9A, 0xB2, 0x9B, 0xB2, 0x9C, /* 0xC4-0xC7 */
+	0xB2, 0x9D, 0xB2, 0x9E, 0xB2, 0x9F, 0xB2, 0xA0, /* 0xC8-0xCB */
+	0xB3, 0x41, 0xB3, 0x42, 0xB3, 0x43, 0xB3, 0x44, /* 0xCC-0xCF */
+	0xB3, 0x45, 0xB3, 0x46, 0xB3, 0x47, 0xB3, 0x48, /* 0xD0-0xD3 */
+	0xB3, 0x49, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xD4-0xD7 */
+	0xB3, 0x4D, 0xB3, 0x4E, 0xB3, 0x4F, 0xB3, 0x50, /* 0xD8-0xDB */
+	0xB3, 0x51, 0xB3, 0x52, 0xB3, 0x53, 0xB3, 0x54, /* 0xDC-0xDF */
+	0xC4, 0xED, 0xC4, 0xEE, 0xB3, 0x55, 0xB3, 0x56, /* 0xE0-0xE3 */
+	0xC4, 0xEF, 0xB3, 0x57, 0xB3, 0x58, 0xB3, 0x59, /* 0xE4-0xE7 */
+	0xC4, 0xF0, 0xB3, 0x5A, 0xB3, 0x61, 0xB3, 0x62, /* 0xE8-0xEB */
+	0xB3, 0x63, 0xB3, 0x64, 0xB3, 0x65, 0xB3, 0x66, /* 0xEC-0xEF */
+	0xC4, 0xF1, 0xC4, 0xF2, 0xB3, 0x67, 0xC4, 0xF3, /* 0xF0-0xF3 */
+	0xB3, 0x68, 0xC4, 0xF4, 0xB3, 0x69, 0xB3, 0x6A, /* 0xF4-0xF7 */
+	0xB3, 0x6B, 0xB3, 0x6C, 0xB3, 0x6D, 0xB3, 0x6E, /* 0xF8-0xFB */
+	0xC4, 0xF5, 0xB3, 0x6F, 0xB3, 0x70, 0xB3, 0x71, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_D0[512] = {
+	0xC4, 0xF6, 0xB3, 0x72, 0xB3, 0x73, 0xB3, 0x74, /* 0x00-0x03 */
+	0xC4, 0xF7, 0xB3, 0x75, 0xB3, 0x76, 0xB3, 0x77, /* 0x04-0x07 */
+	0xB3, 0x78, 0xB3, 0x79, 0xB3, 0x7A, 0xB3, 0x81, /* 0x08-0x0B */
+	0xB3, 0x82, 0xB3, 0x83, 0xB3, 0x84, 0xB3, 0x85, /* 0x0C-0x0F */
+	0xB3, 0x86, 0xC4, 0xF8, 0xB3, 0x87, 0xB3, 0x88, /* 0x10-0x13 */
+	0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, 0xB3, 0x8C, /* 0x14-0x17 */
+	0xC4, 0xF9, 0xB3, 0x8D, 0xB3, 0x8E, 0xB3, 0x8F, /* 0x18-0x1B */
+	0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, 0xB3, 0x93, /* 0x1C-0x1F */
+	0xB3, 0x94, 0xB3, 0x95, 0xB3, 0x96, 0xB3, 0x97, /* 0x20-0x23 */
+	0xB3, 0x98, 0xB3, 0x99, 0xB3, 0x9A, 0xB3, 0x9B, /* 0x24-0x27 */
+	0xB3, 0x9C, 0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, /* 0x28-0x2B */
+	0xB3, 0xA0, 0xC4, 0xFA, 0xB4, 0x41, 0xB4, 0x42, /* 0x2C-0x2F */
+	0xB4, 0x43, 0xB4, 0x44, 0xB4, 0x45, 0xB4, 0x46, /* 0x30-0x33 */
+	0xC4, 0xFB, 0xC4, 0xFC, 0xB4, 0x47, 0xB4, 0x48, /* 0x34-0x37 */
+	0xC4, 0xFD, 0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, /* 0x38-0x3B */
+	0xC4, 0xFE, 0xB4, 0x4C, 0xB4, 0x4D, 0xB4, 0x4E, /* 0x3C-0x3F */
+	0xB4, 0x4F, 0xB4, 0x50, 0xB4, 0x51, 0xB4, 0x52, /* 0x40-0x43 */
+	0xC5, 0xA1, 0xC5, 0xA2, 0xB4, 0x53, 0xC5, 0xA3, /* 0x44-0x47 */
+	0xB4, 0x54, 0xC5, 0xA4, 0xB4, 0x55, 0xB4, 0x56, /* 0x48-0x4B */
+	0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0x4C-0x4F */
+	0xC5, 0xA5, 0xB4, 0x61, 0xB4, 0x62, 0xB4, 0x63, /* 0x50-0x53 */
+	0xC5, 0xA6, 0xB4, 0x64, 0xB4, 0x65, 0xB4, 0x66, /* 0x54-0x57 */
+	0xC5, 0xA7, 0xB4, 0x67, 0xB4, 0x68, 0xB4, 0x69, /* 0x58-0x5B */
+	0xB4, 0x6A, 0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, /* 0x5C-0x5F */
+	0xC5, 0xA8, 0xB4, 0x6E, 0xB4, 0x6F, 0xB4, 0x70, /* 0x60-0x63 */
+	0xB4, 0x71, 0xB4, 0x72, 0xB4, 0x73, 0xB4, 0x74, /* 0x64-0x67 */
+	0xB4, 0x75, 0xB4, 0x76, 0xB4, 0x77, 0xB4, 0x78, /* 0x68-0x6B */
+	0xC5, 0xA9, 0xC5, 0xAA, 0xB4, 0x79, 0xB4, 0x7A, /* 0x6C-0x6F */
+	0xC5, 0xAB, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0x70-0x73 */
+	0xC5, 0xAC, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0x74-0x77 */
+	0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, 0xB4, 0x8A, /* 0x78-0x7B */
+	0xC5, 0xAD, 0xC5, 0xAE, 0xB4, 0x8B, 0xB4, 0x8C, /* 0x7C-0x7F */
+	
+	0xB4, 0x8D, 0xC5, 0xAF, 0xB4, 0x8E, 0xB4, 0x8F, /* 0x80-0x83 */
+	0xB4, 0x90, 0xB4, 0x91, 0xB4, 0x92, 0xB4, 0x93, /* 0x84-0x87 */
+	0xB4, 0x94, 0xB4, 0x95, 0xB4, 0x96, 0xB4, 0x97, /* 0x88-0x8B */
+	0xB4, 0x98, 0xB4, 0x99, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x8C-0x8F */
+	0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x90-0x93 */
+	0xB4, 0xA0, 0xB5, 0x41, 0xB5, 0x42, 0xB5, 0x43, /* 0x94-0x97 */
+	0xB5, 0x44, 0xB5, 0x45, 0xB5, 0x46, 0xB5, 0x47, /* 0x98-0x9B */
+	0xB5, 0x48, 0xB5, 0x49, 0xB5, 0x4A, 0xB5, 0x4B, /* 0x9C-0x9F */
+	0xB5, 0x4C, 0xB5, 0x4D, 0xB5, 0x4E, 0xB5, 0x4F, /* 0xA0-0xA3 */
+	0xC5, 0xB0, 0xC5, 0xB1, 0xB5, 0x50, 0xB5, 0x51, /* 0xA4-0xA7 */
+	0xC5, 0xB2, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0xA8-0xAB */
+	0xC5, 0xB3, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0xAC-0xAF */
+	0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x61, /* 0xB0-0xB3 */
+	0xC5, 0xB4, 0xC5, 0xB5, 0xB5, 0x62, 0xC5, 0xB6, /* 0xB4-0xB7 */
+	0xB5, 0x63, 0xC5, 0xB7, 0xB5, 0x64, 0xB5, 0x65, /* 0xB8-0xBB */
+	0xB5, 0x66, 0xB5, 0x67, 0xB5, 0x68, 0xB5, 0x69, /* 0xBC-0xBF */
+	0xC5, 0xB8, 0xC5, 0xB9, 0xB5, 0x6A, 0xB5, 0x6B, /* 0xC0-0xC3 */
+	0xC5, 0xBA, 0xB5, 0x6C, 0xB5, 0x6D, 0xB5, 0x6E, /* 0xC4-0xC7 */
+	0xC5, 0xBB, 0xC5, 0xBC, 0xB5, 0x6F, 0xB5, 0x70, /* 0xC8-0xCB */
+	0xB5, 0x71, 0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, /* 0xCC-0xCF */
+	0xC5, 0xBD, 0xC5, 0xBE, 0xB5, 0x75, 0xC5, 0xBF, /* 0xD0-0xD3 */
+	0xC5, 0xC0, 0xC5, 0xC1, 0xB5, 0x76, 0xB5, 0x77, /* 0xD4-0xD7 */
+	0xB5, 0x78, 0xB5, 0x79, 0xB5, 0x7A, 0xB5, 0x81, /* 0xD8-0xDB */
+	0xC5, 0xC2, 0xC5, 0xC3, 0xB5, 0x82, 0xB5, 0x83, /* 0xDC-0xDF */
+	0xC5, 0xC4, 0xB5, 0x84, 0xB5, 0x85, 0xB5, 0x86, /* 0xE0-0xE3 */
+	0xC5, 0xC5, 0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, /* 0xE4-0xE7 */
+	0xB5, 0x8A, 0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, /* 0xE8-0xEB */
+	0xC5, 0xC6, 0xC5, 0xC7, 0xB5, 0x8E, 0xC5, 0xC8, /* 0xEC-0xEF */
+	0xC5, 0xC9, 0xC5, 0xCA, 0xB5, 0x8F, 0xB5, 0x90, /* 0xF0-0xF3 */
+	0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, 0xB5, 0x94, /* 0xF4-0xF7 */
+	0xC5, 0xCB, 0xB5, 0x95, 0xB5, 0x96, 0xB5, 0x97, /* 0xF8-0xFB */
+	0xB5, 0x98, 0xB5, 0x99, 0xB5, 0x9A, 0xB5, 0x9B, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_D1[512] = {
+	0xB5, 0x9C, 0xB5, 0x9D, 0xB5, 0x9E, 0xB5, 0x9F, /* 0x00-0x03 */
+	0xB5, 0xA0, 0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, /* 0x04-0x07 */
+	0xB6, 0x44, 0xB6, 0x45, 0xB6, 0x46, 0xB6, 0x47, /* 0x08-0x0B */
+	0xB6, 0x48, 0xC5, 0xCC, 0xB6, 0x49, 0xB6, 0x4A, /* 0x0C-0x0F */
+	0xB6, 0x4B, 0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, /* 0x10-0x13 */
+	0xB6, 0x4F, 0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, /* 0x14-0x17 */
+	0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0x18-0x1B */
+	0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0x1C-0x1F */
+	0xB6, 0x61, 0xB6, 0x62, 0xB6, 0x63, 0xB6, 0x64, /* 0x20-0x23 */
+	0xB6, 0x65, 0xB6, 0x66, 0xB6, 0x67, 0xB6, 0x68, /* 0x24-0x27 */
+	0xB6, 0x69, 0xB6, 0x6A, 0xB6, 0x6B, 0xB6, 0x6C, /* 0x28-0x2B */
+	0xB6, 0x6D, 0xB6, 0x6E, 0xB6, 0x6F, 0xB6, 0x70, /* 0x2C-0x2F */
+	0xC5, 0xCD, 0xC5, 0xCE, 0xB6, 0x71, 0xB6, 0x72, /* 0x30-0x33 */
+	0xC5, 0xCF, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0x34-0x37 */
+	0xC5, 0xD0, 0xB6, 0x76, 0xC5, 0xD1, 0xB6, 0x77, /* 0x38-0x3B */
+	0xB6, 0x78, 0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x81, /* 0x3C-0x3F */
+	0xC5, 0xD2, 0xC5, 0xD3, 0xB6, 0x82, 0xC5, 0xD4, /* 0x40-0x43 */
+	0xC5, 0xD5, 0xC5, 0xD6, 0xB6, 0x83, 0xB6, 0x84, /* 0x44-0x47 */
+	0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0x48-0x4B */
+	0xC5, 0xD7, 0xC5, 0xD8, 0xB6, 0x89, 0xB6, 0x8A, /* 0x4C-0x4F */
+	0xC5, 0xD9, 0xB6, 0x8B, 0xB6, 0x8C, 0xB6, 0x8D, /* 0x50-0x53 */
+	0xC5, 0xDA, 0xB6, 0x8E, 0xB6, 0x8F, 0xB6, 0x90, /* 0x54-0x57 */
+	0xB6, 0x91, 0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, /* 0x58-0x5B */
+	0xC5, 0xDB, 0xC5, 0xDC, 0xB6, 0x95, 0xC5, 0xDD, /* 0x5C-0x5F */
+	0xB6, 0x96, 0xC5, 0xDE, 0xB6, 0x97, 0xB6, 0x98, /* 0x60-0x63 */
+	0xB6, 0x99, 0xB6, 0x9A, 0xB6, 0x9B, 0xB6, 0x9C, /* 0x64-0x67 */
+	0xC5, 0xDF, 0xB6, 0x9D, 0xB6, 0x9E, 0xB6, 0x9F, /* 0x68-0x6B */
+	0xC5, 0xE0, 0xB6, 0xA0, 0xB7, 0x41, 0xB7, 0x42, /* 0x6C-0x6F */
+	0xB7, 0x43, 0xB7, 0x44, 0xB7, 0x45, 0xB7, 0x46, /* 0x70-0x73 */
+	0xB7, 0x47, 0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, /* 0x74-0x77 */
+	0xB7, 0x4B, 0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, /* 0x78-0x7B */
+	0xC5, 0xE1, 0xB7, 0x4F, 0xB7, 0x50, 0xB7, 0x51, /* 0x7C-0x7F */
+	
+	0xB7, 0x52, 0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, /* 0x80-0x83 */
+	0xC5, 0xE2, 0xB7, 0x56, 0xB7, 0x57, 0xB7, 0x58, /* 0x84-0x87 */
+	0xC5, 0xE3, 0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x61, /* 0x88-0x8B */
+	0xB7, 0x62, 0xB7, 0x63, 0xB7, 0x64, 0xB7, 0x65, /* 0x8C-0x8F */
+	0xB7, 0x66, 0xB7, 0x67, 0xB7, 0x68, 0xB7, 0x69, /* 0x90-0x93 */
+	0xB7, 0x6A, 0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, /* 0x94-0x97 */
+	0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x98-0x9B */
+	0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x9C-0x9F */
+	0xC5, 0xE4, 0xC5, 0xE5, 0xB7, 0x76, 0xB7, 0x77, /* 0xA0-0xA3 */
+	0xC5, 0xE6, 0xB7, 0x78, 0xB7, 0x79, 0xB7, 0x7A, /* 0xA4-0xA7 */
+	0xC5, 0xE7, 0xB7, 0x81, 0xB7, 0x82, 0xB7, 0x83, /* 0xA8-0xAB */
+	0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, 0xB7, 0x87, /* 0xAC-0xAF */
+	0xC5, 0xE8, 0xC5, 0xE9, 0xB7, 0x88, 0xC5, 0xEA, /* 0xB0-0xB3 */
+	0xB7, 0x89, 0xC5, 0xEB, 0xB7, 0x8A, 0xB7, 0x8B, /* 0xB4-0xB7 */
+	0xB7, 0x8C, 0xB7, 0x8D, 0xC5, 0xEC, 0xB7, 0x8E, /* 0xB8-0xBB */
+	0xC5, 0xED, 0xB7, 0x8F, 0xB7, 0x90, 0xB7, 0x91, /* 0xBC-0xBF */
+	0xC5, 0xEE, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0xC0-0xC3 */
+	0xB7, 0x95, 0xB7, 0x96, 0xB7, 0x97, 0xB7, 0x98, /* 0xC4-0xC7 */
+	0xB7, 0x99, 0xB7, 0x9A, 0xB7, 0x9B, 0xB7, 0x9C, /* 0xC8-0xCB */
+	0xB7, 0x9D, 0xB7, 0x9E, 0xB7, 0x9F, 0xB7, 0xA0, /* 0xCC-0xCF */
+	0xB8, 0x41, 0xB8, 0x42, 0xB8, 0x43, 0xB8, 0x44, /* 0xD0-0xD3 */
+	0xB8, 0x45, 0xB8, 0x46, 0xB8, 0x47, 0xB8, 0x48, /* 0xD4-0xD7 */
+	0xC5, 0xEF, 0xB8, 0x49, 0xB8, 0x4A, 0xB8, 0x4B, /* 0xD8-0xDB */
+	0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, 0xB8, 0x4F, /* 0xDC-0xDF */
+	0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, 0xB8, 0x53, /* 0xE0-0xE3 */
+	0xB8, 0x54, 0xB8, 0x55, 0xB8, 0x56, 0xB8, 0x57, /* 0xE4-0xE7 */
+	0xB8, 0x58, 0xB8, 0x59, 0xB8, 0x5A, 0xB8, 0x61, /* 0xE8-0xEB */
+	0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, 0xB8, 0x65, /* 0xEC-0xEF */
+	0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, 0xB8, 0x69, /* 0xF0-0xF3 */
+	0xC5, 0xF0, 0xB8, 0x6A, 0xB8, 0x6B, 0xB8, 0x6C, /* 0xF4-0xF7 */
+	0xC5, 0xF1, 0xB8, 0x6D, 0xB8, 0x6E, 0xB8, 0x6F, /* 0xF8-0xFB */
+	0xB8, 0x70, 0xB8, 0x71, 0xB8, 0x72, 0xB8, 0x73, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_D2[512] = {
+	0xB8, 0x74, 0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, /* 0x00-0x03 */
+	0xB8, 0x78, 0xB8, 0x79, 0xB8, 0x7A, 0xC5, 0xF2, /* 0x04-0x07 */
+	0xB8, 0x81, 0xC5, 0xF3, 0xB8, 0x82, 0xB8, 0x83, /* 0x08-0x0B */
+	0xB8, 0x84, 0xB8, 0x85, 0xB8, 0x86, 0xB8, 0x87, /* 0x0C-0x0F */
+	0xC5, 0xF4, 0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, /* 0x10-0x13 */
+	0xB8, 0x8B, 0xB8, 0x8C, 0xB8, 0x8D, 0xB8, 0x8E, /* 0x14-0x17 */
+	0xB8, 0x8F, 0xB8, 0x90, 0xB8, 0x91, 0xB8, 0x92, /* 0x18-0x1B */
+	0xB8, 0x93, 0xB8, 0x94, 0xB8, 0x95, 0xB8, 0x96, /* 0x1C-0x1F */
+	0xB8, 0x97, 0xB8, 0x98, 0xB8, 0x99, 0xB8, 0x9A, /* 0x20-0x23 */
+	0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, 0xB8, 0x9E, /* 0x24-0x27 */
+	0xB8, 0x9F, 0xB8, 0xA0, 0xB9, 0x41, 0xB9, 0x42, /* 0x28-0x2B */
+	0xC5, 0xF5, 0xC5, 0xF6, 0xB9, 0x43, 0xB9, 0x44, /* 0x2C-0x2F */
+	0xC5, 0xF7, 0xB9, 0x45, 0xB9, 0x46, 0xB9, 0x47, /* 0x30-0x33 */
+	0xC5, 0xF8, 0xB9, 0x48, 0xB9, 0x49, 0xB9, 0x4A, /* 0x34-0x37 */
+	0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x38-0x3B */
+	0xC5, 0xF9, 0xC5, 0xFA, 0xB9, 0x4F, 0xC5, 0xFB, /* 0x3C-0x3F */
+	0xB9, 0x50, 0xC5, 0xFC, 0xB9, 0x51, 0xB9, 0x52, /* 0x40-0x43 */
+	0xB9, 0x53, 0xB9, 0x54, 0xB9, 0x55, 0xB9, 0x56, /* 0x44-0x47 */
+	0xC5, 0xFD, 0xB9, 0x57, 0xB9, 0x58, 0xB9, 0x59, /* 0x48-0x4B */
+	0xB9, 0x5A, 0xB9, 0x61, 0xB9, 0x62, 0xB9, 0x63, /* 0x4C-0x4F */
+	0xB9, 0x64, 0xB9, 0x65, 0xB9, 0x66, 0xB9, 0x67, /* 0x50-0x53 */
+	0xB9, 0x68, 0xB9, 0x69, 0xB9, 0x6A, 0xB9, 0x6B, /* 0x54-0x57 */
+	0xB9, 0x6C, 0xB9, 0x6D, 0xB9, 0x6E, 0xB9, 0x6F, /* 0x58-0x5B */
+	0xC5, 0xFE, 0xB9, 0x70, 0xB9, 0x71, 0xB9, 0x72, /* 0x5C-0x5F */
+	0xB9, 0x73, 0xB9, 0x74, 0xB9, 0x75, 0xB9, 0x76, /* 0x60-0x63 */
+	0xC6, 0xA1, 0xB9, 0x77, 0xB9, 0x78, 0xB9, 0x79, /* 0x64-0x67 */
+	0xB9, 0x7A, 0xB9, 0x81, 0xB9, 0x82, 0xB9, 0x83, /* 0x68-0x6B */
+	0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x6C-0x6F */
+	0xB9, 0x88, 0xB9, 0x89, 0xB9, 0x8A, 0xB9, 0x8B, /* 0x70-0x73 */
+	0xB9, 0x8C, 0xB9, 0x8D, 0xB9, 0x8E, 0xB9, 0x8F, /* 0x74-0x77 */
+	0xB9, 0x90, 0xB9, 0x91, 0xB9, 0x92, 0xB9, 0x93, /* 0x78-0x7B */
+	0xB9, 0x94, 0xB9, 0x95, 0xB9, 0x96, 0xB9, 0x97, /* 0x7C-0x7F */
+	
+	0xC6, 0xA2, 0xC6, 0xA3, 0xB9, 0x98, 0xB9, 0x99, /* 0x80-0x83 */
+	0xC6, 0xA4, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0x84-0x87 */
+	0xC6, 0xA5, 0xB9, 0x9D, 0xB9, 0x9E, 0xB9, 0x9F, /* 0x88-0x8B */
+	0xB9, 0xA0, 0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, /* 0x8C-0x8F */
+	0xC6, 0xA6, 0xC6, 0xA7, 0xBA, 0x44, 0xBA, 0x45, /* 0x90-0x93 */
+	0xBA, 0x46, 0xC6, 0xA8, 0xBA, 0x47, 0xBA, 0x48, /* 0x94-0x97 */
+	0xBA, 0x49, 0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, /* 0x98-0x9B */
+	0xC6, 0xA9, 0xBA, 0x4D, 0xBA, 0x4E, 0xBA, 0x4F, /* 0x9C-0x9F */
+	0xC6, 0xAA, 0xBA, 0x50, 0xBA, 0x51, 0xBA, 0x52, /* 0xA0-0xA3 */
+	0xC6, 0xAB, 0xBA, 0x53, 0xBA, 0x54, 0xBA, 0x55, /* 0xA4-0xA7 */
+	0xBA, 0x56, 0xBA, 0x57, 0xBA, 0x58, 0xBA, 0x59, /* 0xA8-0xAB */
+	0xC6, 0xAC, 0xBA, 0x5A, 0xBA, 0x61, 0xBA, 0x62, /* 0xAC-0xAF */
+	0xBA, 0x63, 0xC6, 0xAD, 0xBA, 0x64, 0xBA, 0x65, /* 0xB0-0xB3 */
+	0xBA, 0x66, 0xBA, 0x67, 0xBA, 0x68, 0xBA, 0x69, /* 0xB4-0xB7 */
+	0xC6, 0xAE, 0xC6, 0xAF, 0xBA, 0x6A, 0xBA, 0x6B, /* 0xB8-0xBB */
+	0xC6, 0xB0, 0xBA, 0x6C, 0xBA, 0x6D, 0xC6, 0xB1, /* 0xBC-0xBF */
+	0xC6, 0xB2, 0xBA, 0x6E, 0xC6, 0xB3, 0xBA, 0x6F, /* 0xC0-0xC3 */
+	0xBA, 0x70, 0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, /* 0xC4-0xC7 */
+	0xC6, 0xB4, 0xC6, 0xB5, 0xBA, 0x74, 0xC6, 0xB6, /* 0xC8-0xCB */
+	0xBA, 0x75, 0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, /* 0xCC-0xCF */
+	0xBA, 0x79, 0xBA, 0x7A, 0xBA, 0x81, 0xBA, 0x82, /* 0xD0-0xD3 */
+	0xC6, 0xB7, 0xBA, 0x83, 0xBA, 0x84, 0xBA, 0x85, /* 0xD4-0xD7 */
+	0xC6, 0xB8, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0xD8-0xDB */
+	0xC6, 0xB9, 0xBA, 0x89, 0xBA, 0x8A, 0xBA, 0x8B, /* 0xDC-0xDF */
+	0xBA, 0x8C, 0xBA, 0x8D, 0xBA, 0x8E, 0xBA, 0x8F, /* 0xE0-0xE3 */
+	0xC6, 0xBA, 0xC6, 0xBB, 0xBA, 0x90, 0xBA, 0x91, /* 0xE4-0xE7 */
+	0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0xE8-0xEB */
+	0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0xEC-0xEF */
+	0xC6, 0xBC, 0xC6, 0xBD, 0xBA, 0x9A, 0xBA, 0x9B, /* 0xF0-0xF3 */
+	0xC6, 0xBE, 0xBA, 0x9C, 0xBA, 0x9D, 0xBA, 0x9E, /* 0xF4-0xF7 */
+	0xC6, 0xBF, 0xBA, 0x9F, 0xBA, 0xA0, 0xBB, 0x41, /* 0xF8-0xFB */
+	0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, 0xBB, 0x45, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_D3[512] = {
+	0xC6, 0xC0, 0xC6, 0xC1, 0xBB, 0x46, 0xC6, 0xC2, /* 0x00-0x03 */
+	0xBB, 0x47, 0xC6, 0xC3, 0xBB, 0x48, 0xBB, 0x49, /* 0x04-0x07 */
+	0xBB, 0x4A, 0xBB, 0x4B, 0xBB, 0x4C, 0xBB, 0x4D, /* 0x08-0x0B */
+	0xC6, 0xC4, 0xC6, 0xC5, 0xC6, 0xC6, 0xBB, 0x4E, /* 0x0C-0x0F */
+	0xC6, 0xC7, 0xBB, 0x4F, 0xBB, 0x50, 0xBB, 0x51, /* 0x10-0x13 */
+	0xC6, 0xC8, 0xBB, 0x52, 0xC6, 0xC9, 0xBB, 0x53, /* 0x14-0x17 */
+	0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x18-0x1B */
+	0xC6, 0xCA, 0xC6, 0xCB, 0xBB, 0x58, 0xC6, 0xCC, /* 0x1C-0x1F */
+	0xC6, 0xCD, 0xC6, 0xCE, 0xBB, 0x59, 0xBB, 0x5A, /* 0x20-0x23 */
+	0xBB, 0x61, 0xC6, 0xCF, 0xBB, 0x62, 0xBB, 0x63, /* 0x24-0x27 */
+	0xC6, 0xD0, 0xC6, 0xD1, 0xBB, 0x64, 0xBB, 0x65, /* 0x28-0x2B */
+	0xC6, 0xD2, 0xBB, 0x66, 0xBB, 0x67, 0xBB, 0x68, /* 0x2C-0x2F */
+	0xC6, 0xD3, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x30-0x33 */
+	0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xBB, 0x6F, /* 0x34-0x37 */
+	0xC6, 0xD4, 0xC6, 0xD5, 0xBB, 0x70, 0xC6, 0xD6, /* 0x38-0x3B */
+	0xC6, 0xD7, 0xC6, 0xD8, 0xBB, 0x71, 0xBB, 0x72, /* 0x3C-0x3F */
+	0xBB, 0x73, 0xBB, 0x74, 0xBB, 0x75, 0xBB, 0x76, /* 0x40-0x43 */
+	0xC6, 0xD9, 0xC6, 0xDA, 0xBB, 0x77, 0xBB, 0x78, /* 0x44-0x47 */
+	0xBB, 0x79, 0xBB, 0x7A, 0xBB, 0x81, 0xBB, 0x82, /* 0x48-0x4B */
+	0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x4C-0x4F */
+	0xBB, 0x87, 0xBB, 0x88, 0xBB, 0x89, 0xBB, 0x8A, /* 0x50-0x53 */
+	0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, 0xBB, 0x8E, /* 0x54-0x57 */
+	0xBB, 0x8F, 0xBB, 0x90, 0xBB, 0x91, 0xBB, 0x92, /* 0x58-0x5B */
+	0xBB, 0x93, 0xBB, 0x94, 0xBB, 0x95, 0xBB, 0x96, /* 0x5C-0x5F */
+	0xBB, 0x97, 0xBB, 0x98, 0xBB, 0x99, 0xBB, 0x9A, /* 0x60-0x63 */
+	0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, 0xBB, 0x9E, /* 0x64-0x67 */
+	0xBB, 0x9F, 0xBB, 0xA0, 0xBC, 0x41, 0xBC, 0x42, /* 0x68-0x6B */
+	0xBC, 0x43, 0xBC, 0x44, 0xBC, 0x45, 0xBC, 0x46, /* 0x6C-0x6F */
+	0xBC, 0x47, 0xBC, 0x48, 0xBC, 0x49, 0xBC, 0x4A, /* 0x70-0x73 */
+	0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, 0xBC, 0x4E, /* 0x74-0x77 */
+	0xBC, 0x4F, 0xBC, 0x50, 0xBC, 0x51, 0xBC, 0x52, /* 0x78-0x7B */
+	0xC6, 0xDB, 0xC6, 0xDC, 0xBC, 0x53, 0xBC, 0x54, /* 0x7C-0x7F */
+	
+	0xC6, 0xDD, 0xBC, 0x55, 0xBC, 0x56, 0xBC, 0x57, /* 0x80-0x83 */
+	0xC6, 0xDE, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0x84-0x87 */
+	0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0x88-0x8B */
+	0xC6, 0xDF, 0xC6, 0xE0, 0xBC, 0x65, 0xC6, 0xE1, /* 0x8C-0x8F */
+	0xC6, 0xE2, 0xC6, 0xE3, 0xBC, 0x66, 0xBC, 0x67, /* 0x90-0x93 */
+	0xBC, 0x68, 0xBC, 0x69, 0xBC, 0x6A, 0xBC, 0x6B, /* 0x94-0x97 */
+	0xC6, 0xE4, 0xC6, 0xE5, 0xBC, 0x6C, 0xBC, 0x6D, /* 0x98-0x9B */
+	0xC6, 0xE6, 0xBC, 0x6E, 0xBC, 0x6F, 0xBC, 0x70, /* 0x9C-0x9F */
+	0xC6, 0xE7, 0xBC, 0x71, 0xBC, 0x72, 0xBC, 0x73, /* 0xA0-0xA3 */
+	0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, 0xBC, 0x77, /* 0xA4-0xA7 */
+	0xC6, 0xE8, 0xC6, 0xE9, 0xBC, 0x78, 0xC6, 0xEA, /* 0xA8-0xAB */
+	0xBC, 0x79, 0xC6, 0xEB, 0xBC, 0x7A, 0xBC, 0x81, /* 0xAC-0xAF */
+	0xBC, 0x82, 0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, /* 0xB0-0xB3 */
+	0xC6, 0xEC, 0xBC, 0x86, 0xBC, 0x87, 0xBC, 0x88, /* 0xB4-0xB7 */
+	0xC6, 0xED, 0xBC, 0x89, 0xBC, 0x8A, 0xBC, 0x8B, /* 0xB8-0xBB */
+	0xC6, 0xEE, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0xBC-0xBF */
+	0xBC, 0x8F, 0xBC, 0x90, 0xBC, 0x91, 0xBC, 0x92, /* 0xC0-0xC3 */
+	0xC6, 0xEF, 0xC6, 0xF0, 0xBC, 0x93, 0xBC, 0x94, /* 0xC4-0xC7 */
+	0xC6, 0xF1, 0xC6, 0xF2, 0xBC, 0x95, 0xBC, 0x96, /* 0xC8-0xCB */
+	0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xBC, 0x9A, /* 0xCC-0xCF */
+	0xC6, 0xF3, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0xD0-0xD3 */
+	0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x41, /* 0xD4-0xD7 */
+	0xC6, 0xF4, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0xD8-0xDB */
+	0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0xDC-0xDF */
+	0xBD, 0x49, 0xC6, 0xF5, 0xBD, 0x4A, 0xC6, 0xF6, /* 0xE0-0xE3 */
+	0xBD, 0x4B, 0xBD, 0x4C, 0xBD, 0x4D, 0xBD, 0x4E, /* 0xE4-0xE7 */
+	0xBD, 0x4F, 0xBD, 0x50, 0xBD, 0x51, 0xBD, 0x52, /* 0xE8-0xEB */
+	0xC6, 0xF7, 0xC6, 0xF8, 0xBD, 0x53, 0xBD, 0x54, /* 0xEC-0xEF */
+	0xC6, 0xF9, 0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, /* 0xF0-0xF3 */
+	0xC6, 0xFA, 0xBD, 0x58, 0xBD, 0x59, 0xBD, 0x5A, /* 0xF4-0xF7 */
+	0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0xF8-0xFB */
+	0xC6, 0xFB, 0xC6, 0xFC, 0xBD, 0x65, 0xC6, 0xFD, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_D4[512] = {
+	0xBD, 0x66, 0xC6, 0xFE, 0xBD, 0x67, 0xBD, 0x68, /* 0x00-0x03 */
+	0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x04-0x07 */
+	0xC7, 0xA1, 0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, /* 0x08-0x0B */
+	0xBD, 0x70, 0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, /* 0x0C-0x0F */
+	0xBD, 0x74, 0xBD, 0x75, 0xBD, 0x76, 0xBD, 0x77, /* 0x10-0x13 */
+	0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x81, /* 0x14-0x17 */
+	0xBD, 0x82, 0xBD, 0x83, 0xBD, 0x84, 0xBD, 0x85, /* 0x18-0x1B */
+	0xBD, 0x86, 0xC7, 0xA2, 0xBD, 0x87, 0xBD, 0x88, /* 0x1C-0x1F */
+	0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, 0xBD, 0x8C, /* 0x20-0x23 */
+	0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, 0xBD, 0x90, /* 0x24-0x27 */
+	0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, 0xBD, 0x94, /* 0x28-0x2B */
+	0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, 0xBD, 0x98, /* 0x2C-0x2F */
+	0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, 0xBD, 0x9C, /* 0x30-0x33 */
+	0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, 0xBD, 0xA0, /* 0x34-0x37 */
+	0xBE, 0x41, 0xBE, 0x42, 0xBE, 0x43, 0xBE, 0x44, /* 0x38-0x3B */
+	0xBE, 0x45, 0xBE, 0x46, 0xBE, 0x47, 0xBE, 0x48, /* 0x3C-0x3F */
+	0xC7, 0xA3, 0xBE, 0x49, 0xBE, 0x4A, 0xBE, 0x4B, /* 0x40-0x43 */
+	0xC7, 0xA4, 0xBE, 0x4C, 0xBE, 0x4D, 0xBE, 0x4E, /* 0x44-0x47 */
+	0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, 0xBE, 0x52, /* 0x48-0x4B */
+	0xBE, 0x53, 0xBE, 0x54, 0xBE, 0x55, 0xBE, 0x56, /* 0x4C-0x4F */
+	0xBE, 0x57, 0xBE, 0x58, 0xBE, 0x59, 0xBE, 0x5A, /* 0x50-0x53 */
+	0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0x54-0x57 */
+	0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0x58-0x5B */
+	0xC7, 0xA5, 0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, /* 0x5C-0x5F */
+	0xC7, 0xA6, 0xBE, 0x6C, 0xBE, 0x6D, 0xBE, 0x6E, /* 0x60-0x63 */
+	0xC7, 0xA7, 0xBE, 0x6F, 0xBE, 0x70, 0xBE, 0x71, /* 0x64-0x67 */
+	0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, 0xBE, 0x75, /* 0x68-0x6B */
+	0xBE, 0x76, 0xC7, 0xA8, 0xBE, 0x77, 0xC7, 0xA9, /* 0x6C-0x6F */
+	0xBE, 0x78, 0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x81, /* 0x70-0x73 */
+	0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0x74-0x77 */
+	0xC7, 0xAA, 0xC7, 0xAB, 0xBE, 0x86, 0xBE, 0x87, /* 0x78-0x7B */
+	0xC7, 0xAC, 0xBE, 0x88, 0xBE, 0x89, 0xC7, 0xAD, /* 0x7C-0x7F */
+	
+	0xC7, 0xAE, 0xBE, 0x8A, 0xC7, 0xAF, 0xBE, 0x8B, /* 0x80-0x83 */
+	0xBE, 0x8C, 0xBE, 0x8D, 0xBE, 0x8E, 0xBE, 0x8F, /* 0x84-0x87 */
+	0xC7, 0xB0, 0xC7, 0xB1, 0xBE, 0x90, 0xC7, 0xB2, /* 0x88-0x8B */
+	0xBE, 0x91, 0xC7, 0xB3, 0xBE, 0x92, 0xBE, 0x93, /* 0x8C-0x8F */
+	0xBE, 0x94, 0xBE, 0x95, 0xBE, 0x96, 0xBE, 0x97, /* 0x90-0x93 */
+	0xC7, 0xB4, 0xBE, 0x98, 0xBE, 0x99, 0xBE, 0x9A, /* 0x94-0x97 */
+	0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, 0xBE, 0x9E, /* 0x98-0x9B */
+	0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x41, 0xBF, 0x42, /* 0x9C-0x9F */
+	0xBF, 0x43, 0xBF, 0x44, 0xBF, 0x45, 0xBF, 0x46, /* 0xA0-0xA3 */
+	0xBF, 0x47, 0xBF, 0x48, 0xBF, 0x49, 0xBF, 0x4A, /* 0xA4-0xA7 */
+	0xBF, 0x4B, 0xC7, 0xB5, 0xBF, 0x4C, 0xBF, 0x4D, /* 0xA8-0xAB */
+	0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, 0xBF, 0x51, /* 0xAC-0xAF */
+	0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, 0xBF, 0x55, /* 0xB0-0xB3 */
+	0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, 0xBF, 0x59, /* 0xB4-0xB7 */
+	0xBF, 0x5A, 0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, /* 0xB8-0xBB */
+	0xBF, 0x64, 0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, /* 0xBC-0xBF */
+	0xBF, 0x68, 0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, /* 0xC0-0xC3 */
+	0xBF, 0x6C, 0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, /* 0xC4-0xC7 */
+	0xBF, 0x70, 0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, /* 0xC8-0xCB */
+	0xC7, 0xB6, 0xBF, 0x74, 0xBF, 0x75, 0xBF, 0x76, /* 0xCC-0xCF */
+	0xC7, 0xB7, 0xBF, 0x77, 0xBF, 0x78, 0xBF, 0x79, /* 0xD0-0xD3 */
+	0xC7, 0xB8, 0xBF, 0x7A, 0xBF, 0x81, 0xBF, 0x82, /* 0xD4-0xD7 */
+	0xBF, 0x83, 0xBF, 0x84, 0xBF, 0x85, 0xBF, 0x86, /* 0xD8-0xDB */
+	0xC7, 0xB9, 0xBF, 0x87, 0xBF, 0x88, 0xC7, 0xBA, /* 0xDC-0xDF */
+	0xBF, 0x89, 0xBF, 0x8A, 0xBF, 0x8B, 0xBF, 0x8C, /* 0xE0-0xE3 */
+	0xBF, 0x8D, 0xBF, 0x8E, 0xBF, 0x8F, 0xBF, 0x90, /* 0xE4-0xE7 */
+	0xC7, 0xBB, 0xBF, 0x91, 0xBF, 0x92, 0xBF, 0x93, /* 0xE8-0xEB */
+	0xC7, 0xBC, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0xEC-0xEF */
+	0xC7, 0xBD, 0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, /* 0xF0-0xF3 */
+	0xBF, 0x9A, 0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, /* 0xF4-0xF7 */
+	0xC7, 0xBE, 0xBF, 0x9E, 0xBF, 0x9F, 0xC7, 0xBF, /* 0xF8-0xFB */
+	0xBF, 0xA0, 0xC7, 0xC0, 0xC0, 0x41, 0xC0, 0x42, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_D5[512] = {
+	0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, 0xC0, 0x46, /* 0x00-0x03 */
+	0xC7, 0xC1, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x04-0x07 */
+	0xC7, 0xC2, 0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, /* 0x08-0x0B */
+	0xC7, 0xC3, 0xC0, 0x4D, 0xC0, 0x4E, 0xC0, 0x4F, /* 0x0C-0x0F */
+	0xC0, 0x50, 0xC0, 0x51, 0xC0, 0x52, 0xC0, 0x53, /* 0x10-0x13 */
+	0xC7, 0xC4, 0xC7, 0xC5, 0xC0, 0x54, 0xC7, 0xC6, /* 0x14-0x17 */
+	0xC0, 0x55, 0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, /* 0x18-0x1B */
+	0xC0, 0x59, 0xC0, 0x5A, 0xC0, 0x61, 0xC0, 0x62, /* 0x1C-0x1F */
+	0xC0, 0x63, 0xC0, 0x64, 0xC0, 0x65, 0xC0, 0x66, /* 0x20-0x23 */
+	0xC0, 0x67, 0xC0, 0x68, 0xC0, 0x69, 0xC0, 0x6A, /* 0x24-0x27 */
+	0xC0, 0x6B, 0xC0, 0x6C, 0xC0, 0x6D, 0xC0, 0x6E, /* 0x28-0x2B */
+	0xC0, 0x6F, 0xC0, 0x70, 0xC0, 0x71, 0xC0, 0x72, /* 0x2C-0x2F */
+	0xC0, 0x73, 0xC0, 0x74, 0xC0, 0x75, 0xC0, 0x76, /* 0x30-0x33 */
+	0xC0, 0x77, 0xC0, 0x78, 0xC0, 0x79, 0xC0, 0x7A, /* 0x34-0x37 */
+	0xC0, 0x81, 0xC0, 0x82, 0xC0, 0x83, 0xC0, 0x84, /* 0x38-0x3B */
+	0xC7, 0xC7, 0xC7, 0xC8, 0xC0, 0x85, 0xC0, 0x86, /* 0x3C-0x3F */
+	0xC7, 0xC9, 0xC0, 0x87, 0xC0, 0x88, 0xC0, 0x89, /* 0x40-0x43 */
+	0xC7, 0xCA, 0xC0, 0x8A, 0xC0, 0x8B, 0xC0, 0x8C, /* 0x44-0x47 */
+	0xC0, 0x8D, 0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, /* 0x48-0x4B */
+	0xC7, 0xCB, 0xC7, 0xCC, 0xC0, 0x91, 0xC7, 0xCD, /* 0x4C-0x4F */
+	0xC0, 0x92, 0xC7, 0xCE, 0xC0, 0x93, 0xC0, 0x94, /* 0x50-0x53 */
+	0xC0, 0x95, 0xC0, 0x96, 0xC0, 0x97, 0xC0, 0x98, /* 0x54-0x57 */
+	0xC7, 0xCF, 0xC7, 0xD0, 0xC0, 0x99, 0xC0, 0x9A, /* 0x58-0x5B */
+	0xC7, 0xD1, 0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, /* 0x5C-0x5F */
+	0xC7, 0xD2, 0xC0, 0x9E, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x60-0x63 */
+	0xC1, 0x41, 0xC7, 0xD3, 0xC1, 0x42, 0xC1, 0x43, /* 0x64-0x67 */
+	0xC7, 0xD4, 0xC7, 0xD5, 0xC1, 0x44, 0xC7, 0xD6, /* 0x68-0x6B */
+	0xC1, 0x45, 0xC7, 0xD7, 0xC1, 0x46, 0xC1, 0x47, /* 0x6C-0x6F */
+	0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x70-0x73 */
+	0xC7, 0xD8, 0xC7, 0xD9, 0xC1, 0x4C, 0xC1, 0x4D, /* 0x74-0x77 */
+	0xC7, 0xDA, 0xC1, 0x4E, 0xC1, 0x4F, 0xC1, 0x50, /* 0x78-0x7B */
+	0xC7, 0xDB, 0xC1, 0x51, 0xC1, 0x52, 0xC1, 0x53, /* 0x7C-0x7F */
+	
+	0xC1, 0x54, 0xC1, 0x55, 0xC1, 0x56, 0xC1, 0x57, /* 0x80-0x83 */
+	0xC7, 0xDC, 0xC7, 0xDD, 0xC1, 0x58, 0xC7, 0xDE, /* 0x84-0x87 */
+	0xC7, 0xDF, 0xC7, 0xE0, 0xC1, 0x59, 0xC1, 0x5A, /* 0x88-0x8B */
+	0xC1, 0x61, 0xC1, 0x62, 0xC1, 0x63, 0xC1, 0x64, /* 0x8C-0x8F */
+	0xC7, 0xE1, 0xC1, 0x65, 0xC1, 0x66, 0xC1, 0x67, /* 0x90-0x93 */
+	0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, 0xC1, 0x6B, /* 0x94-0x97 */
+	0xC1, 0x6C, 0xC1, 0x6D, 0xC1, 0x6E, 0xC1, 0x6F, /* 0x98-0x9B */
+	0xC1, 0x70, 0xC1, 0x71, 0xC1, 0x72, 0xC1, 0x73, /* 0x9C-0x9F */
+	0xC1, 0x74, 0xC1, 0x75, 0xC1, 0x76, 0xC1, 0x77, /* 0xA0-0xA3 */
+	0xC1, 0x78, 0xC7, 0xE2, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA4-0xA7 */
+	0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xA8-0xAB */
+	0xC1, 0x85, 0xC1, 0x86, 0xC1, 0x87, 0xC1, 0x88, /* 0xAC-0xAF */
+	0xC1, 0x89, 0xC1, 0x8A, 0xC1, 0x8B, 0xC1, 0x8C, /* 0xB0-0xB3 */
+	0xC1, 0x8D, 0xC1, 0x8E, 0xC1, 0x8F, 0xC1, 0x90, /* 0xB4-0xB7 */
+	0xC1, 0x91, 0xC1, 0x92, 0xC1, 0x93, 0xC1, 0x94, /* 0xB8-0xBB */
+	0xC1, 0x95, 0xC1, 0x96, 0xC1, 0x97, 0xC1, 0x98, /* 0xBC-0xBF */
+	0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, 0xC1, 0x9C, /* 0xC0-0xC3 */
+	0xC1, 0x9D, 0xC1, 0x9E, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xC4-0xC7 */
+	0xC7, 0xE3, 0xC7, 0xE4, 0xC2, 0x41, 0xC2, 0x42, /* 0xC8-0xCB */
+	0xC7, 0xE5, 0xC2, 0x43, 0xC2, 0x44, 0xC2, 0x45, /* 0xCC-0xCF */
+	0xC7, 0xE6, 0xC2, 0x46, 0xC7, 0xE7, 0xC2, 0x47, /* 0xD0-0xD3 */
+	0xC2, 0x48, 0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, /* 0xD4-0xD7 */
+	0xC7, 0xE8, 0xC7, 0xE9, 0xC2, 0x4C, 0xC7, 0xEA, /* 0xD8-0xDB */
+	0xC2, 0x4D, 0xC7, 0xEB, 0xC2, 0x4E, 0xC2, 0x4F, /* 0xDC-0xDF */
+	0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, 0xC2, 0x53, /* 0xE0-0xE3 */
+	0xC7, 0xEC, 0xC7, 0xED, 0xC2, 0x54, 0xC2, 0x55, /* 0xE4-0xE7 */
+	0xC7, 0xEE, 0xC2, 0x56, 0xC2, 0x57, 0xC2, 0x58, /* 0xE8-0xEB */
+	0xC7, 0xEF, 0xC2, 0x59, 0xC2, 0x5A, 0xC2, 0x61, /* 0xEC-0xEF */
+	0xC2, 0x62, 0xC2, 0x63, 0xC2, 0x64, 0xC2, 0x65, /* 0xF0-0xF3 */
+	0xC7, 0xF0, 0xC7, 0xF1, 0xC2, 0x66, 0xC7, 0xF2, /* 0xF4-0xF7 */
+	0xC2, 0x67, 0xC7, 0xF3, 0xC2, 0x68, 0xC2, 0x69, /* 0xF8-0xFB */
+	0xC2, 0x6A, 0xC2, 0x6B, 0xC2, 0x6C, 0xC2, 0x6D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_D6[512] = {
+	0xC7, 0xF4, 0xC7, 0xF5, 0xC2, 0x6E, 0xC2, 0x6F, /* 0x00-0x03 */
+	0xC7, 0xF6, 0xC2, 0x70, 0xC2, 0x71, 0xC2, 0x72, /* 0x04-0x07 */
+	0xC7, 0xF7, 0xC2, 0x73, 0xC2, 0x74, 0xC2, 0x75, /* 0x08-0x0B */
+	0xC2, 0x76, 0xC2, 0x77, 0xC2, 0x78, 0xC2, 0x79, /* 0x0C-0x0F */
+	0xC7, 0xF8, 0xC7, 0xF9, 0xC2, 0x7A, 0xC7, 0xFA, /* 0x10-0x13 */
+	0xC7, 0xFB, 0xC7, 0xFC, 0xC2, 0x81, 0xC2, 0x82, /* 0x14-0x17 */
+	0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, 0xC2, 0x86, /* 0x18-0x1B */
+	0xC7, 0xFD, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x1C-0x1F */
+	0xC7, 0xFE, 0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, /* 0x20-0x23 */
+	0xC8, 0xA1, 0xC2, 0x8D, 0xC2, 0x8E, 0xC2, 0x8F, /* 0x24-0x27 */
+	0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x28-0x2B */
+	0xC2, 0x94, 0xC8, 0xA2, 0xC2, 0x95, 0xC2, 0x96, /* 0x2C-0x2F */
+	0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x30-0x33 */
+	0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x34-0x37 */
+	0xC8, 0xA3, 0xC8, 0xA4, 0xC2, 0x9F, 0xC2, 0xA0, /* 0x38-0x3B */
+	0xC8, 0xA5, 0xC3, 0x41, 0xC3, 0x42, 0xC3, 0x43, /* 0x3C-0x3F */
+	0xC8, 0xA6, 0xC3, 0x44, 0xC3, 0x45, 0xC3, 0x46, /* 0x40-0x43 */
+	0xC3, 0x47, 0xC8, 0xA7, 0xC3, 0x48, 0xC3, 0x49, /* 0x44-0x47 */
+	0xC8, 0xA8, 0xC8, 0xA9, 0xC3, 0x4A, 0xC8, 0xAA, /* 0x48-0x4B */
+	0xC3, 0x4B, 0xC8, 0xAB, 0xC3, 0x4C, 0xC3, 0x4D, /* 0x4C-0x4F */
+	0xC3, 0x4E, 0xC8, 0xAC, 0xC3, 0x4F, 0xC3, 0x50, /* 0x50-0x53 */
+	0xC8, 0xAD, 0xC8, 0xAE, 0xC3, 0x51, 0xC3, 0x52, /* 0x54-0x57 */
+	0xC8, 0xAF, 0xC3, 0x53, 0xC3, 0x54, 0xC3, 0x55, /* 0x58-0x5B */
+	0xC8, 0xB0, 0xC3, 0x56, 0xC3, 0x57, 0xC3, 0x58, /* 0x5C-0x5F */
+	0xC3, 0x59, 0xC3, 0x5A, 0xC3, 0x61, 0xC3, 0x62, /* 0x60-0x63 */
+	0xC3, 0x63, 0xC3, 0x64, 0xC3, 0x65, 0xC8, 0xB1, /* 0x64-0x67 */
+	0xC3, 0x66, 0xC8, 0xB2, 0xC3, 0x67, 0xC3, 0x68, /* 0x68-0x6B */
+	0xC3, 0x69, 0xC3, 0x6A, 0xC3, 0x6B, 0xC3, 0x6C, /* 0x6C-0x6F */
+	0xC8, 0xB3, 0xC8, 0xB4, 0xC3, 0x6D, 0xC3, 0x6E, /* 0x70-0x73 */
+	0xC8, 0xB5, 0xC3, 0x6F, 0xC3, 0x70, 0xC3, 0x71, /* 0x74-0x77 */
+	0xC3, 0x72, 0xC3, 0x73, 0xC3, 0x74, 0xC3, 0x75, /* 0x78-0x7B */
+	0xC3, 0x76, 0xC3, 0x77, 0xC3, 0x78, 0xC3, 0x79, /* 0x7C-0x7F */
+	
+	0xC3, 0x7A, 0xC3, 0x81, 0xC3, 0x82, 0xC8, 0xB6, /* 0x80-0x83 */
+	0xC3, 0x83, 0xC8, 0xB7, 0xC3, 0x84, 0xC3, 0x85, /* 0x84-0x87 */
+	0xC3, 0x86, 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, /* 0x88-0x8B */
+	0xC8, 0xB8, 0xC8, 0xB9, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x8C-0x8F */
+	0xC8, 0xBA, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, /* 0x90-0x93 */
+	0xC8, 0xBB, 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, /* 0x94-0x97 */
+	0xC3, 0x92, 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, /* 0x98-0x9B */
+	0xC3, 0x96, 0xC8, 0xBC, 0xC3, 0x97, 0xC8, 0xBD, /* 0x9C-0x9F */
+	0xC3, 0x98, 0xC8, 0xBE, 0xC3, 0x99, 0xC3, 0x9A, /* 0xA0-0xA3 */
+	0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, /* 0xA4-0xA7 */
+	0xC8, 0xBF, 0xC3, 0x9F, 0xC3, 0xA0, 0xC4, 0x41, /* 0xA8-0xAB */
+	0xC8, 0xC0, 0xC4, 0x42, 0xC4, 0x43, 0xC4, 0x44, /* 0xAC-0xAF */
+	0xC8, 0xC1, 0xC4, 0x45, 0xC4, 0x46, 0xC4, 0x47, /* 0xB0-0xB3 */
+	0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, 0xC4, 0x4B, /* 0xB4-0xB7 */
+	0xC4, 0x4C, 0xC8, 0xC2, 0xC4, 0x4D, 0xC8, 0xC3, /* 0xB8-0xBB */
+	0xC4, 0x4E, 0xC4, 0x4F, 0xC4, 0x50, 0xC4, 0x51, /* 0xBC-0xBF */
+	0xC4, 0x52, 0xC4, 0x53, 0xC4, 0x54, 0xC4, 0x55, /* 0xC0-0xC3 */
+	0xC8, 0xC4, 0xC8, 0xC5, 0xC4, 0x56, 0xC4, 0x57, /* 0xC4-0xC7 */
+	0xC8, 0xC6, 0xC4, 0x58, 0xC4, 0x59, 0xC4, 0x5A, /* 0xC8-0xCB */
+	0xC8, 0xC7, 0xC4, 0x61, 0xC4, 0x62, 0xC4, 0x63, /* 0xCC-0xCF */
+	0xC4, 0x64, 0xC8, 0xC8, 0xC4, 0x65, 0xC4, 0x66, /* 0xD0-0xD3 */
+	0xC8, 0xC9, 0xC4, 0x67, 0xC4, 0x68, 0xC8, 0xCA, /* 0xD4-0xD7 */
+	0xC4, 0x69, 0xC8, 0xCB, 0xC4, 0x6A, 0xC4, 0x6B, /* 0xD8-0xDB */
+	0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xC4, 0x6F, /* 0xDC-0xDF */
+	0xC8, 0xCC, 0xC4, 0x70, 0xC4, 0x71, 0xC4, 0x72, /* 0xE0-0xE3 */
+	0xC8, 0xCD, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0xE4-0xE7 */
+	0xC8, 0xCE, 0xC4, 0x76, 0xC4, 0x77, 0xC4, 0x78, /* 0xE8-0xEB */
+	0xC4, 0x79, 0xC4, 0x7A, 0xC4, 0x81, 0xC4, 0x82, /* 0xEC-0xEF */
+	0xC8, 0xCF, 0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, /* 0xF0-0xF3 */
+	0xC4, 0x86, 0xC8, 0xD0, 0xC4, 0x87, 0xC4, 0x88, /* 0xF4-0xF7 */
+	0xC4, 0x89, 0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, /* 0xF8-0xFB */
+	0xC8, 0xD1, 0xC8, 0xD2, 0xC4, 0x8D, 0xC4, 0x8E, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_D7[512] = {
+	0xC8, 0xD3, 0xC4, 0x8F, 0xC4, 0x90, 0xC4, 0x91, /* 0x00-0x03 */
+	0xC8, 0xD4, 0xC4, 0x92, 0xC4, 0x93, 0xC4, 0x94, /* 0x04-0x07 */
+	0xC4, 0x95, 0xC4, 0x96, 0xC4, 0x97, 0xC4, 0x98, /* 0x08-0x0B */
+	0xC4, 0x99, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0x0C-0x0F */
+	0xC4, 0x9D, 0xC8, 0xD5, 0xC4, 0x9E, 0xC4, 0x9F, /* 0x10-0x13 */
+	0xC4, 0xA0, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0x14-0x17 */
+	0xC8, 0xD6, 0xC8, 0xD7, 0xC5, 0x44, 0xC5, 0x45, /* 0x18-0x1B */
+	0xC8, 0xD8, 0xC5, 0x46, 0xC5, 0x47, 0xC5, 0x48, /* 0x1C-0x1F */
+	0xC8, 0xD9, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0x20-0x23 */
+	0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xC5, 0x4F, /* 0x24-0x27 */
+	0xC8, 0xDA, 0xC8, 0xDB, 0xC5, 0x50, 0xC8, 0xDC, /* 0x28-0x2B */
+	0xC5, 0x51, 0xC8, 0xDD, 0xC5, 0x52, 0xC5, 0x53, /* 0x2C-0x2F */
+	0xC5, 0x54, 0xC5, 0x55, 0xC5, 0x56, 0xC5, 0x57, /* 0x30-0x33 */
+	0xC8, 0xDE, 0xC8, 0xDF, 0xC5, 0x58, 0xC5, 0x59, /* 0x34-0x37 */
+	0xC8, 0xE0, 0xC5, 0x5A, 0xC5, 0x61, 0xC5, 0x62, /* 0x38-0x3B */
+	0xC8, 0xE1, 0xC5, 0x63, 0xC5, 0x64, 0xC5, 0x65, /* 0x3C-0x3F */
+	0xC5, 0x66, 0xC5, 0x67, 0xC5, 0x68, 0xC5, 0x69, /* 0x40-0x43 */
+	0xC8, 0xE2, 0xC5, 0x6A, 0xC5, 0x6B, 0xC8, 0xE3, /* 0x44-0x47 */
+	0xC5, 0x6C, 0xC8, 0xE4, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x48-0x4B */
+	0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xC5, 0x72, /* 0x4C-0x4F */
+	0xC8, 0xE5, 0xC8, 0xE6, 0xC5, 0x73, 0xC5, 0x74, /* 0x50-0x53 */
+	0xC8, 0xE7, 0xC5, 0x75, 0xC8, 0xE8, 0xC8, 0xE9, /* 0x54-0x57 */
+	0xC8, 0xEA, 0xC8, 0xEB, 0xC5, 0x76, 0xC5, 0x77, /* 0x58-0x5B */
+	0xC5, 0x78, 0xC5, 0x79, 0xC5, 0x7A, 0xC5, 0x81, /* 0x5C-0x5F */
+	0xC8, 0xEC, 0xC8, 0xED, 0xC5, 0x82, 0xC8, 0xEE, /* 0x60-0x63 */
+	0xC5, 0x83, 0xC8, 0xEF, 0xC5, 0x84, 0xC5, 0x85, /* 0x64-0x67 */
+	0xC5, 0x86, 0xC8, 0xF0, 0xC5, 0x87, 0xC5, 0x88, /* 0x68-0x6B */
+	0xC8, 0xF1, 0xC5, 0x89, 0xC5, 0x8A, 0xC5, 0x8B, /* 0x6C-0x6F */
+	0xC8, 0xF2, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x70-0x73 */
+	0xC8, 0xF3, 0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, /* 0x74-0x77 */
+	0xC5, 0x92, 0xC5, 0x93, 0xC5, 0x94, 0xC5, 0x95, /* 0x78-0x7B */
+	0xC8, 0xF4, 0xC8, 0xF5, 0xC5, 0x96, 0xC5, 0x97, /* 0x7C-0x7F */
+	
+	0xC5, 0x98, 0xC8, 0xF6, 0xC5, 0x99, 0xC5, 0x9A, /* 0x80-0x83 */
+	0xC5, 0x9B, 0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, /* 0x84-0x87 */
+	0xC8, 0xF7, 0xC8, 0xF8, 0xC5, 0x9F, 0xC5, 0xA0, /* 0x88-0x8B */
+	0xC8, 0xF9, 0xC6, 0x41, 0xC6, 0x42, 0xC6, 0x43, /* 0x8C-0x8F */
+	0xC8, 0xFA, 0xC6, 0x44, 0xC6, 0x45, 0xC6, 0x46, /* 0x90-0x93 */
+	0xC6, 0x47, 0xC6, 0x48, 0xC6, 0x49, 0xC6, 0x4A, /* 0x94-0x97 */
+	0xC8, 0xFB, 0xC8, 0xFC, 0xC6, 0x4B, 0xC8, 0xFD, /* 0x98-0x9B */
+	0xC6, 0x4C, 0xC8, 0xFE, 0xC6, 0x4D, 0xC6, 0x4E, /* 0x9C-0x9F */
+	0xC6, 0x4F, 0xC6, 0x50, 0xC6, 0x51, 0xC6, 0x52, /* 0xA0-0xA3 */
+};
+
+static const unsigned char u2c_DC[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+};
+
+static const unsigned char u2c_F9[512] = {
+	0xCB, 0xD0, 0xCB, 0xD6, 0xCB, 0xE7, 0xCD, 0xCF, /* 0x00-0x03 */
+	0xCD, 0xE8, 0xCE, 0xAD, 0xCF, 0xFB, 0xD0, 0xA2, /* 0x04-0x07 */
+	0xD0, 0xB8, 0xD0, 0xD0, 0xD0, 0xDD, 0xD1, 0xD4, /* 0x08-0x0B */
+	0xD1, 0xD5, 0xD1, 0xD8, 0xD1, 0xDB, 0xD1, 0xDC, /* 0x0C-0x0F */
+	0xD1, 0xDD, 0xD1, 0xDE, 0xD1, 0xDF, 0xD1, 0xE0, /* 0x10-0x13 */
+	0xD1, 0xE2, 0xD1, 0xE3, 0xD1, 0xE4, 0xD1, 0xE5, /* 0x14-0x17 */
+	0xD1, 0xE6, 0xD1, 0xE8, 0xD1, 0xE9, 0xD1, 0xEA, /* 0x18-0x1B */
+	0xD1, 0xEB, 0xD1, 0xED, 0xD1, 0xEF, 0xD1, 0xF0, /* 0x1C-0x1F */
+	0xD1, 0xF2, 0xD1, 0xF6, 0xD1, 0xFA, 0xD1, 0xFC, /* 0x20-0x23 */
+	0xD1, 0xFD, 0xD1, 0xFE, 0xD2, 0xA2, 0xD2, 0xA3, /* 0x24-0x27 */
+	0xD2, 0xA7, 0xD2, 0xA8, 0xD2, 0xA9, 0xD2, 0xAA, /* 0x28-0x2B */
+	0xD2, 0xAB, 0xD2, 0xAD, 0xD2, 0xB2, 0xD2, 0xBE, /* 0x2C-0x2F */
+	0xD2, 0xC2, 0xD2, 0xC3, 0xD2, 0xC4, 0xD2, 0xC6, /* 0x30-0x33 */
+	0xD2, 0xC7, 0xD2, 0xC8, 0xD2, 0xC9, 0xD2, 0xCA, /* 0x34-0x37 */
+	0xD2, 0xCB, 0xD2, 0xCD, 0xD2, 0xCE, 0xD2, 0xCF, /* 0x38-0x3B */
+	0xD2, 0xD0, 0xD2, 0xD1, 0xD2, 0xD2, 0xD2, 0xD3, /* 0x3C-0x3F */
+	0xD2, 0xD4, 0xD2, 0xD5, 0xD2, 0xD6, 0xD2, 0xD7, /* 0x40-0x43 */
+	0xD2, 0xD9, 0xD2, 0xDA, 0xD2, 0xDE, 0xD2, 0xDF, /* 0x44-0x47 */
+	0xD2, 0xE1, 0xD2, 0xE2, 0xD2, 0xE4, 0xD2, 0xE5, /* 0x48-0x4B */
+	0xD2, 0xE6, 0xD2, 0xE7, 0xD2, 0xE8, 0xD2, 0xE9, /* 0x4C-0x4F */
+	0xD2, 0xEA, 0xD2, 0xEB, 0xD2, 0xF0, 0xD2, 0xF1, /* 0x50-0x53 */
+	0xD2, 0xF2, 0xD2, 0xF3, 0xD2, 0xF4, 0xD2, 0xF5, /* 0x54-0x57 */
+	0xD2, 0xF7, 0xD2, 0xF8, 0xD4, 0xE6, 0xD4, 0xFC, /* 0x58-0x5B */
+	0xD5, 0xA5, 0xD5, 0xAB, 0xD5, 0xAE, 0xD6, 0xB8, /* 0x5C-0x5F */
+	0xD6, 0xCD, 0xD7, 0xCB, 0xD7, 0xE4, 0xDB, 0xC5, /* 0x60-0x63 */
+	0xDB, 0xE4, 0xDC, 0xA5, 0xDD, 0xA5, 0xDD, 0xD5, /* 0x64-0x67 */
+	0xDD, 0xF4, 0xDE, 0xFC, 0xDE, 0xFE, 0xDF, 0xB3, /* 0x68-0x6B */
+	0xDF, 0xE1, 0xDF, 0xE8, 0xE0, 0xF1, 0xE1, 0xAD, /* 0x6C-0x6F */
+	0xE1, 0xED, 0xE3, 0xF5, 0xE4, 0xA1, 0xE4, 0xA9, /* 0x70-0x73 */
+	0xE5, 0xAE, 0xE5, 0xB1, 0xE5, 0xB2, 0xE5, 0xB9, /* 0x74-0x77 */
+	0xE5, 0xBB, 0xE5, 0xBC, 0xE5, 0xC4, 0xE5, 0xCE, /* 0x78-0x7B */
+	0xE5, 0xD0, 0xE5, 0xD2, 0xE5, 0xD6, 0xE5, 0xFA, /* 0x7C-0x7F */
+	
+	0xE5, 0xFB, 0xE5, 0xFC, 0xE5, 0xFE, 0xE6, 0xA1, /* 0x80-0x83 */
+	0xE6, 0xA4, 0xE6, 0xA7, 0xE6, 0xAD, 0xE6, 0xAF, /* 0x84-0x87 */
+	0xE6, 0xB0, 0xE6, 0xB1, 0xE6, 0xB3, 0xE6, 0xB7, /* 0x88-0x8B */
+	0xE6, 0xB8, 0xE6, 0xBC, 0xE6, 0xC4, 0xE6, 0xC6, /* 0x8C-0x8F */
+	0xE6, 0xC7, 0xE6, 0xCA, 0xE6, 0xD2, 0xE6, 0xD6, /* 0x90-0x93 */
+	0xE6, 0xD9, 0xE6, 0xDC, 0xE6, 0xDF, 0xE6, 0xE1, /* 0x94-0x97 */
+	0xE6, 0xE4, 0xE6, 0xE5, 0xE6, 0xE6, 0xE6, 0xE8, /* 0x98-0x9B */
+	0xE6, 0xEA, 0xE6, 0xEB, 0xE6, 0xEC, 0xE6, 0xEF, /* 0x9C-0x9F */
+	0xE6, 0xF1, 0xE6, 0xF2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */
+	0xE6, 0xF7, 0xE6, 0xF9, 0xE7, 0xA1, 0xE7, 0xA6, /* 0xA4-0xA7 */
+	0xE7, 0xA9, 0xE7, 0xAA, 0xE7, 0xAC, 0xE7, 0xAD, /* 0xA8-0xAB */
+	0xE7, 0xB0, 0xE7, 0xBF, 0xE7, 0xC1, 0xE7, 0xC6, /* 0xAC-0xAF */
+	0xE7, 0xC7, 0xE7, 0xCB, 0xE7, 0xCD, 0xE7, 0xCF, /* 0xB0-0xB3 */
+	0xE7, 0xD0, 0xE7, 0xD3, 0xE7, 0xDF, 0xE7, 0xE4, /* 0xB4-0xB7 */
+	0xE7, 0xE6, 0xE7, 0xF7, 0xE8, 0xE7, 0xE8, 0xE8, /* 0xB8-0xBB */
+	0xE8, 0xF0, 0xE8, 0xF1, 0xE8, 0xF7, 0xE8, 0xF9, /* 0xBC-0xBF */
+	0xE8, 0xFB, 0xE8, 0xFE, 0xE9, 0xA7, 0xE9, 0xAC, /* 0xC0-0xC3 */
+	0xE9, 0xCC, 0xE9, 0xF7, 0xEA, 0xC1, 0xEA, 0xE5, /* 0xC4-0xC7 */
+	0xEA, 0xF4, 0xEA, 0xF7, 0xEA, 0xFC, 0xEA, 0xFE, /* 0xC8-0xCB */
+	0xEB, 0xA4, 0xEB, 0xA7, 0xEB, 0xA9, 0xEB, 0xAA, /* 0xCC-0xCF */
+	0xEB, 0xBA, 0xEB, 0xBB, 0xEB, 0xBD, 0xEB, 0xC1, /* 0xD0-0xD3 */
+	0xEB, 0xC2, 0xEB, 0xC6, 0xEB, 0xC7, 0xEB, 0xCC, /* 0xD4-0xD7 */
+	0xEB, 0xCF, 0xEB, 0xD0, 0xEB, 0xD1, 0xEB, 0xD2, /* 0xD8-0xDB */
+	0xEB, 0xD8, 0xEC, 0xA6, 0xEC, 0xA7, 0xEC, 0xAA, /* 0xDC-0xDF */
+	0xEC, 0xAF, 0xEC, 0xB0, 0xEC, 0xB1, 0xEC, 0xB2, /* 0xE0-0xE3 */
+	0xEC, 0xB5, 0xEC, 0xB8, 0xEC, 0xBA, 0xEC, 0xC0, /* 0xE4-0xE7 */
+	0xEC, 0xC1, 0xEC, 0xC5, 0xEC, 0xC6, 0xEC, 0xC9, /* 0xE8-0xEB */
+	0xEC, 0xCA, 0xEC, 0xD5, 0xEC, 0xDD, 0xEC, 0xDE, /* 0xEC-0xEF */
+	0xEC, 0xE1, 0xEC, 0xE4, 0xEC, 0xE7, 0xEC, 0xE8, /* 0xF0-0xF3 */
+	0xEC, 0xF7, 0xEC, 0xF8, 0xEC, 0xFA, 0xED, 0xA1, /* 0xF4-0xF7 */
+	0xED, 0xA2, 0xED, 0xA3, 0xED, 0xEE, 0xEE, 0xDB, /* 0xF8-0xFB */
+	0xF2, 0xBD, 0xF2, 0xFA, 0xF3, 0xB1, 0xF4, 0xA7, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_FA[512] = {
+	0xF4, 0xEE, 0xF6, 0xF4, 0xF6, 0xF6, 0xF7, 0xB8, /* 0x00-0x03 */
+	0xF7, 0xC8, 0xF7, 0xD3, 0xF8, 0xDB, 0xF8, 0xF0, /* 0x04-0x07 */
+	0xFA, 0xA1, 0xFA, 0xA2, 0xFA, 0xE6, 0xFC, 0xA9, /* 0x08-0x0B */
+	0xE8, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xF5, 0xC0, 0x00, 0x00, 0xF4, 0xE7, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xFD, 0xEB, 0x00, 0x00, 0xEC, 0xCC, /* 0x14-0x17 */
+	0x00, 0x00, 0xE3, 0xEA, 0xDF, 0xD4, 0xDC, 0xD8, /* 0x18-0x1B */
+	0xEF, 0xFE, 0xEF, 0xF1, 0xE9, 0xE2, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0xB3, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xEC, 0xEF, 0xD4, 0xB4, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xF9, 0xDE, 0xF8, /* 0x28-0x2B */
+	0xCE, 0xBD, 0xF9, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+};
+
+static const unsigned char u2c_FF[512] = {
+	0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */
+	0xA3, 0xA4, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */
+	0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */
+	0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, /* 0x0C-0x0F */
+	0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */
+	0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */
+	0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */
+	0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */
+	0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */
+	0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */
+	0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */
+	0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */
+	0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */
+	0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */
+	0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */
+	0xA1, 0xAC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */
+	0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */
+	0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */
+	0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */
+	0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */
+	0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */
+	0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */
+	0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */
+	0xA3, 0xFC, 0xA3, 0xFD, 0xA2, 0xA6, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xA4, 0xD4, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0xA0-0xA3 */
+	0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0xA4-0xA7 */
+	0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0xA8-0xAB */
+	0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0xAC-0xAF */
+	0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0xB0-0xB3 */
+	0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0xB4-0xB7 */
+	0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0xB8-0xBB */
+	0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xBF, 0xA4, 0xC0, /* 0xC0-0xC3 */
+	0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, 0xA4, 0xC4, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xC5, 0xA4, 0xC6, /* 0xC8-0xCB */
+	0xA4, 0xC7, 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xCB, 0xA4, 0xCC, /* 0xD0-0xD3 */
+	0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, 0xA4, 0xD0, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xD1, 0xA4, 0xD2, /* 0xD8-0xDB */
+	0xA4, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xA1, 0xCB, 0xA1, 0xCC, 0xA1, 0xFE, 0xA3, 0xFE, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xA1, 0xCD, 0xA3, 0xDC, 0x00, 0x00, /* 0xE4-0xE7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	NULL,   u2c_01, u2c_02, u2c_03, u2c_04, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   u2c_11, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	u2c_30, u2c_31, u2c_32, u2c_33, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   u2c_4E, u2c_4F, 
+	u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, 
+	u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, 
+	u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, 
+	u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, 
+	u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, 
+	u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, 
+	u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, 
+	u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, 
+	u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, 
+	u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, 
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   u2c_AC, u2c_AD, u2c_AE, u2c_AF, 
+	u2c_B0, u2c_B1, u2c_B2, u2c_B3, u2c_B4, u2c_B5, u2c_B6, u2c_B7, 
+	u2c_B8, u2c_B9, u2c_BA, u2c_BB, u2c_BC, u2c_BD, u2c_BE, u2c_BF, 
+	u2c_C0, u2c_C1, u2c_C2, u2c_C3, u2c_C4, u2c_C5, u2c_C6, u2c_C7, 
+	u2c_C8, u2c_C9, u2c_CA, u2c_CB, u2c_CC, u2c_CD, u2c_CE, u2c_CF, 
+	u2c_D0, u2c_D1, u2c_D2, u2c_D3, u2c_D4, u2c_D5, u2c_D6, u2c_D7, 
+	NULL,   NULL,   NULL,   NULL,   u2c_DC, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   u2c_F9, u2c_FA, NULL,   NULL,   NULL,   NULL,   u2c_FF, };
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(const wchar_t uni,
+			unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni&0xFF;
+	unsigned char ch = (uni>>8)&0xFF;
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset) {
+		if (boundlen <= 1)
+			return -ENAMETOOLONG;
+		out[0] = uni2charset[cl*2];
+		out[1] = uni2charset[cl*2+1];
+		if (out[0] == 0x00 && out[1] == 0x00)
+			return -EINVAL;
+		n = 2;
+	} else if (ch==0 && cl) {
+		out[0] = cl;
+		n = 1;
+	}
+	else
+		return -EINVAL;
+
+	return n;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen,
+			wchar_t *uni)
+{
+	unsigned char ch, cl;
+	const wchar_t *charset2uni;
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (boundlen == 1) {
+		*uni = rawstring[0];
+		return 1;
+	}
+
+	ch = rawstring[0];
+	cl = rawstring[1];
+
+	charset2uni = page_charset2uni[ch];
+	if (charset2uni && cl) {
+		*uni = charset2uni[cl];
+		if (*uni == 0x0000)
+			return -EINVAL;
+		n = 2;
+	} else{
+		*uni = ch;
+		n = 1;
+	}
+	return n;
+}
+
+static struct nls_table table = {
+	.charset	= "cp949",
+	.alias		= "euc-kr",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp949(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp949(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp949)
+module_exit(exit_nls_cp949)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NLS(euc-kr);
diff --git a/kernel/fs/nls/nls_cp950.c b/kernel/fs/nls/nls_cp950.c
new file mode 100644
index 000000000..8e1418708
--- /dev/null
+++ b/kernel/fs/nls/nls_cp950.c
@@ -0,0 +1,9482 @@
+/*
+ * linux/fs/nls/nls_cp950.c
+ *
+ * Charset cp950 translation tables.
+ * This translation table was generated automatically, the
+ * original table can be download from the Microsoft website.
+ * (http://www.microsoft.com/typography/unicode/unicodecp.htm)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t c2u_A1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x3000,0xFF0C,0x3001,0x3002,0xFF0E,0x2027,0xFF1B,0xFF1A,/* 0x40-0x47 */
+	0xFF1F,0xFF01,0xFE30,0x2026,0x2025,0xFE50,0xFE51,0xFE52,/* 0x48-0x4F */
+	0x00B7,0xFE54,0xFE55,0xFE56,0xFE57,0xFF5C,0x2013,0xFE31,/* 0x50-0x57 */
+	0x2014,0xFE33,0x2574,0xFE34,0xFE4F,0xFF08,0xFF09,0xFE35,/* 0x58-0x5F */
+	0xFE36,0xFF5B,0xFF5D,0xFE37,0xFE38,0x3014,0x3015,0xFE39,/* 0x60-0x67 */
+	0xFE3A,0x3010,0x3011,0xFE3B,0xFE3C,0x300A,0x300B,0xFE3D,/* 0x68-0x6F */
+	0xFE3E,0x3008,0x3009,0xFE3F,0xFE40,0x300C,0x300D,0xFE41,/* 0x70-0x77 */
+	0xFE42,0x300E,0x300F,0xFE43,0xFE44,0xFE59,0xFE5A,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0xFE5B,0xFE5C,0xFE5D,0xFE5E,0x2018,0x2019,0x201C,/* 0xA0-0xA7 */
+	0x201D,0x301D,0x301E,0x2035,0x2032,0xFF03,0xFF06,0xFF0A,/* 0xA8-0xAF */
+	0x203B,0x00A7,0x3003,0x25CB,0x25CF,0x25B3,0x25B2,0x25CE,/* 0xB0-0xB7 */
+	0x2606,0x2605,0x25C7,0x25C6,0x25A1,0x25A0,0x25BD,0x25BC,/* 0xB8-0xBF */
+	0x32A3,0x2105,0x00AF,0xFFE3,0xFF3F,0x02CD,0xFE49,0xFE4A,/* 0xC0-0xC7 */
+	0xFE4D,0xFE4E,0xFE4B,0xFE4C,0xFE5F,0xFE60,0xFE61,0xFF0B,/* 0xC8-0xCF */
+	0xFF0D,0x00D7,0x00F7,0x00B1,0x221A,0xFF1C,0xFF1E,0xFF1D,/* 0xD0-0xD7 */
+	0x2266,0x2267,0x2260,0x221E,0x2252,0x2261,0xFE62,0xFE63,/* 0xD8-0xDF */
+	0xFE64,0xFE65,0xFE66,0xFF5E,0x2229,0x222A,0x22A5,0x2220,/* 0xE0-0xE7 */
+	0x221F,0x22BF,0x33D2,0x33D1,0x222B,0x222E,0x2235,0x2234,/* 0xE8-0xEF */
+	0x2640,0x2642,0x2295,0x2299,0x2191,0x2193,0x2190,0x2192,/* 0xF0-0xF7 */
+	0x2196,0x2197,0x2199,0x2198,0x2225,0x2223,0xFF0F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0xFF3C,0x2215,0xFE68,0xFF04,0xFFE5,0x3012,0xFFE0,0xFFE1,/* 0x40-0x47 */
+	0xFF05,0xFF20,0x2103,0x2109,0xFE69,0xFE6A,0xFE6B,0x33D5,/* 0x48-0x4F */
+	0x339C,0x339D,0x339E,0x33CE,0x33A1,0x338E,0x338F,0x33C4,/* 0x50-0x57 */
+	0x00B0,0x5159,0x515B,0x515E,0x515D,0x5161,0x5163,0x55E7,/* 0x58-0x5F */
+	0x74E9,0x7CCE,0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,/* 0x60-0x67 */
+	0x2587,0x2588,0x258F,0x258E,0x258D,0x258C,0x258B,0x258A,/* 0x68-0x6F */
+	0x2589,0x253C,0x2534,0x252C,0x2524,0x251C,0x2594,0x2500,/* 0x70-0x77 */
+	0x2502,0x2595,0x250C,0x2510,0x2514,0x2518,0x256D,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x256E,0x2570,0x256F,0x2550,0x255E,0x256A,0x2561,/* 0xA0-0xA7 */
+	0x25E2,0x25E3,0x25E5,0x25E4,0x2571,0x2572,0x2573,0xFF10,/* 0xA8-0xAF */
+	0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,0xFF18,/* 0xB0-0xB7 */
+	0xFF19,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xB8-0xBF */
+	0x2167,0x2168,0x2169,0x3021,0x3022,0x3023,0x3024,0x3025,/* 0xC0-0xC7 */
+	0x3026,0x3027,0x3028,0x3029,0x5341,0x5344,0x5345,0xFF21,/* 0xC8-0xCF */
+	0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,0xFF28,0xFF29,/* 0xD0-0xD7 */
+	0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,0xFF30,0xFF31,/* 0xD8-0xDF */
+	0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,0xFF38,0xFF39,/* 0xE0-0xE7 */
+	0xFF3A,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE8-0xEF */
+	0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xF0-0xF7 */
+	0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0xFF57,0xFF58,0xFF59,0xFF5A,0x0391,0x0392,0x0393,0x0394,/* 0x40-0x47 */
+	0x0395,0x0396,0x0397,0x0398,0x0399,0x039A,0x039B,0x039C,/* 0x48-0x4F */
+	0x039D,0x039E,0x039F,0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,/* 0x50-0x57 */
+	0x03A6,0x03A7,0x03A8,0x03A9,0x03B1,0x03B2,0x03B3,0x03B4,/* 0x58-0x5F */
+	0x03B5,0x03B6,0x03B7,0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,/* 0x60-0x67 */
+	0x03BD,0x03BE,0x03BF,0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,/* 0x68-0x6F */
+	0x03C6,0x03C7,0x03C8,0x03C9,0x3105,0x3106,0x3107,0x3108,/* 0x70-0x77 */
+	0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,/* 0xA0-0xA7 */
+	0x3117,0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,/* 0xA8-0xAF */
+	0x311F,0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,/* 0xB0-0xB7 */
+	0x3127,0x3128,0x3129,0x02D9,0x02C9,0x02CA,0x02C7,0x02CB,/* 0xB8-0xBF */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD0-0xD7 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */
+	0x0000,0x20AC,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE0-0xE7 */
+};
+
+static const wchar_t c2u_A4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x4E00,0x4E59,0x4E01,0x4E03,0x4E43,0x4E5D,0x4E86,0x4E8C,/* 0x40-0x47 */
+	0x4EBA,0x513F,0x5165,0x516B,0x51E0,0x5200,0x5201,0x529B,/* 0x48-0x4F */
+	0x5315,0x5341,0x535C,0x53C8,0x4E09,0x4E0B,0x4E08,0x4E0A,/* 0x50-0x57 */
+	0x4E2B,0x4E38,0x51E1,0x4E45,0x4E48,0x4E5F,0x4E5E,0x4E8E,/* 0x58-0x5F */
+	0x4EA1,0x5140,0x5203,0x52FA,0x5343,0x53C9,0x53E3,0x571F,/* 0x60-0x67 */
+	0x58EB,0x5915,0x5927,0x5973,0x5B50,0x5B51,0x5B53,0x5BF8,/* 0x68-0x6F */
+	0x5C0F,0x5C22,0x5C38,0x5C71,0x5DDD,0x5DE5,0x5DF1,0x5DF2,/* 0x70-0x77 */
+	0x5DF3,0x5DFE,0x5E72,0x5EFE,0x5F0B,0x5F13,0x624D,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x4E11,0x4E10,0x4E0D,0x4E2D,0x4E30,0x4E39,0x4E4B,/* 0xA0-0xA7 */
+	0x5C39,0x4E88,0x4E91,0x4E95,0x4E92,0x4E94,0x4EA2,0x4EC1,/* 0xA8-0xAF */
+	0x4EC0,0x4EC3,0x4EC6,0x4EC7,0x4ECD,0x4ECA,0x4ECB,0x4EC4,/* 0xB0-0xB7 */
+	0x5143,0x5141,0x5167,0x516D,0x516E,0x516C,0x5197,0x51F6,/* 0xB8-0xBF */
+	0x5206,0x5207,0x5208,0x52FB,0x52FE,0x52FF,0x5316,0x5339,/* 0xC0-0xC7 */
+	0x5348,0x5347,0x5345,0x535E,0x5384,0x53CB,0x53CA,0x53CD,/* 0xC8-0xCF */
+	0x58EC,0x5929,0x592B,0x592A,0x592D,0x5B54,0x5C11,0x5C24,/* 0xD0-0xD7 */
+	0x5C3A,0x5C6F,0x5DF4,0x5E7B,0x5EFF,0x5F14,0x5F15,0x5FC3,/* 0xD8-0xDF */
+	0x6208,0x6236,0x624B,0x624E,0x652F,0x6587,0x6597,0x65A4,/* 0xE0-0xE7 */
+	0x65B9,0x65E5,0x66F0,0x6708,0x6728,0x6B20,0x6B62,0x6B79,/* 0xE8-0xEF */
+	0x6BCB,0x6BD4,0x6BDB,0x6C0F,0x6C34,0x706B,0x722A,0x7236,/* 0xF0-0xF7 */
+	0x723B,0x7247,0x7259,0x725B,0x72AC,0x738B,0x4E19,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x4E16,0x4E15,0x4E14,0x4E18,0x4E3B,0x4E4D,0x4E4F,0x4E4E,/* 0x40-0x47 */
+	0x4EE5,0x4ED8,0x4ED4,0x4ED5,0x4ED6,0x4ED7,0x4EE3,0x4EE4,/* 0x48-0x4F */
+	0x4ED9,0x4EDE,0x5145,0x5144,0x5189,0x518A,0x51AC,0x51F9,/* 0x50-0x57 */
+	0x51FA,0x51F8,0x520A,0x52A0,0x529F,0x5305,0x5306,0x5317,/* 0x58-0x5F */
+	0x531D,0x4EDF,0x534A,0x5349,0x5361,0x5360,0x536F,0x536E,/* 0x60-0x67 */
+	0x53BB,0x53EF,0x53E4,0x53F3,0x53EC,0x53EE,0x53E9,0x53E8,/* 0x68-0x6F */
+	0x53FC,0x53F8,0x53F5,0x53EB,0x53E6,0x53EA,0x53F2,0x53F1,/* 0x70-0x77 */
+	0x53F0,0x53E5,0x53ED,0x53FB,0x56DB,0x56DA,0x5916,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x592E,0x5931,0x5974,0x5976,0x5B55,0x5B83,0x5C3C,/* 0xA0-0xA7 */
+	0x5DE8,0x5DE7,0x5DE6,0x5E02,0x5E03,0x5E73,0x5E7C,0x5F01,/* 0xA8-0xAF */
+	0x5F18,0x5F17,0x5FC5,0x620A,0x6253,0x6254,0x6252,0x6251,/* 0xB0-0xB7 */
+	0x65A5,0x65E6,0x672E,0x672C,0x672A,0x672B,0x672D,0x6B63,/* 0xB8-0xBF */
+	0x6BCD,0x6C11,0x6C10,0x6C38,0x6C41,0x6C40,0x6C3E,0x72AF,/* 0xC0-0xC7 */
+	0x7384,0x7389,0x74DC,0x74E6,0x7518,0x751F,0x7528,0x7529,/* 0xC8-0xCF */
+	0x7530,0x7531,0x7532,0x7533,0x758B,0x767D,0x76AE,0x76BF,/* 0xD0-0xD7 */
+	0x76EE,0x77DB,0x77E2,0x77F3,0x793A,0x79BE,0x7A74,0x7ACB,/* 0xD8-0xDF */
+	0x4E1E,0x4E1F,0x4E52,0x4E53,0x4E69,0x4E99,0x4EA4,0x4EA6,/* 0xE0-0xE7 */
+	0x4EA5,0x4EFF,0x4F09,0x4F19,0x4F0A,0x4F15,0x4F0D,0x4F10,/* 0xE8-0xEF */
+	0x4F11,0x4F0F,0x4EF2,0x4EF6,0x4EFB,0x4EF0,0x4EF3,0x4EFD,/* 0xF0-0xF7 */
+	0x4F01,0x4F0B,0x5149,0x5147,0x5146,0x5148,0x5168,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5171,0x518D,0x51B0,0x5217,0x5211,0x5212,0x520E,0x5216,/* 0x40-0x47 */
+	0x52A3,0x5308,0x5321,0x5320,0x5370,0x5371,0x5409,0x540F,/* 0x48-0x4F */
+	0x540C,0x540A,0x5410,0x5401,0x540B,0x5404,0x5411,0x540D,/* 0x50-0x57 */
+	0x5408,0x5403,0x540E,0x5406,0x5412,0x56E0,0x56DE,0x56DD,/* 0x58-0x5F */
+	0x5733,0x5730,0x5728,0x572D,0x572C,0x572F,0x5729,0x5919,/* 0x60-0x67 */
+	0x591A,0x5937,0x5938,0x5984,0x5978,0x5983,0x597D,0x5979,/* 0x68-0x6F */
+	0x5982,0x5981,0x5B57,0x5B58,0x5B87,0x5B88,0x5B85,0x5B89,/* 0x70-0x77 */
+	0x5BFA,0x5C16,0x5C79,0x5DDE,0x5E06,0x5E76,0x5E74,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5F0F,0x5F1B,0x5FD9,0x5FD6,0x620E,0x620C,0x620D,/* 0xA0-0xA7 */
+	0x6210,0x6263,0x625B,0x6258,0x6536,0x65E9,0x65E8,0x65EC,/* 0xA8-0xAF */
+	0x65ED,0x66F2,0x66F3,0x6709,0x673D,0x6734,0x6731,0x6735,/* 0xB0-0xB7 */
+	0x6B21,0x6B64,0x6B7B,0x6C16,0x6C5D,0x6C57,0x6C59,0x6C5F,/* 0xB8-0xBF */
+	0x6C60,0x6C50,0x6C55,0x6C61,0x6C5B,0x6C4D,0x6C4E,0x7070,/* 0xC0-0xC7 */
+	0x725F,0x725D,0x767E,0x7AF9,0x7C73,0x7CF8,0x7F36,0x7F8A,/* 0xC8-0xCF */
+	0x7FBD,0x8001,0x8003,0x800C,0x8012,0x8033,0x807F,0x8089,/* 0xD0-0xD7 */
+	0x808B,0x808C,0x81E3,0x81EA,0x81F3,0x81FC,0x820C,0x821B,/* 0xD8-0xDF */
+	0x821F,0x826E,0x8272,0x827E,0x866B,0x8840,0x884C,0x8863,/* 0xE0-0xE7 */
+	0x897F,0x9621,0x4E32,0x4EA8,0x4F4D,0x4F4F,0x4F47,0x4F57,/* 0xE8-0xEF */
+	0x4F5E,0x4F34,0x4F5B,0x4F55,0x4F30,0x4F50,0x4F51,0x4F3D,/* 0xF0-0xF7 */
+	0x4F3A,0x4F38,0x4F43,0x4F54,0x4F3C,0x4F46,0x4F63,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x4F5C,0x4F60,0x4F2F,0x4F4E,0x4F36,0x4F59,0x4F5D,0x4F48,/* 0x40-0x47 */
+	0x4F5A,0x514C,0x514B,0x514D,0x5175,0x51B6,0x51B7,0x5225,/* 0x48-0x4F */
+	0x5224,0x5229,0x522A,0x5228,0x52AB,0x52A9,0x52AA,0x52AC,/* 0x50-0x57 */
+	0x5323,0x5373,0x5375,0x541D,0x542D,0x541E,0x543E,0x5426,/* 0x58-0x5F */
+	0x544E,0x5427,0x5446,0x5443,0x5433,0x5448,0x5442,0x541B,/* 0x60-0x67 */
+	0x5429,0x544A,0x5439,0x543B,0x5438,0x542E,0x5435,0x5436,/* 0x68-0x6F */
+	0x5420,0x543C,0x5440,0x5431,0x542B,0x541F,0x542C,0x56EA,/* 0x70-0x77 */
+	0x56F0,0x56E4,0x56EB,0x574A,0x5751,0x5740,0x574D,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5747,0x574E,0x573E,0x5750,0x574F,0x573B,0x58EF,/* 0xA0-0xA7 */
+	0x593E,0x599D,0x5992,0x59A8,0x599E,0x59A3,0x5999,0x5996,/* 0xA8-0xAF */
+	0x598D,0x59A4,0x5993,0x598A,0x59A5,0x5B5D,0x5B5C,0x5B5A,/* 0xB0-0xB7 */
+	0x5B5B,0x5B8C,0x5B8B,0x5B8F,0x5C2C,0x5C40,0x5C41,0x5C3F,/* 0xB8-0xBF */
+	0x5C3E,0x5C90,0x5C91,0x5C94,0x5C8C,0x5DEB,0x5E0C,0x5E8F,/* 0xC0-0xC7 */
+	0x5E87,0x5E8A,0x5EF7,0x5F04,0x5F1F,0x5F64,0x5F62,0x5F77,/* 0xC8-0xCF */
+	0x5F79,0x5FD8,0x5FCC,0x5FD7,0x5FCD,0x5FF1,0x5FEB,0x5FF8,/* 0xD0-0xD7 */
+	0x5FEA,0x6212,0x6211,0x6284,0x6297,0x6296,0x6280,0x6276,/* 0xD8-0xDF */
+	0x6289,0x626D,0x628A,0x627C,0x627E,0x6279,0x6273,0x6292,/* 0xE0-0xE7 */
+	0x626F,0x6298,0x626E,0x6295,0x6293,0x6291,0x6286,0x6539,/* 0xE8-0xEF */
+	0x653B,0x6538,0x65F1,0x66F4,0x675F,0x674E,0x674F,0x6750,/* 0xF0-0xF7 */
+	0x6751,0x675C,0x6756,0x675E,0x6749,0x6746,0x6760,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6753,0x6757,0x6B65,0x6BCF,0x6C42,0x6C5E,0x6C99,0x6C81,/* 0x40-0x47 */
+	0x6C88,0x6C89,0x6C85,0x6C9B,0x6C6A,0x6C7A,0x6C90,0x6C70,/* 0x48-0x4F */
+	0x6C8C,0x6C68,0x6C96,0x6C92,0x6C7D,0x6C83,0x6C72,0x6C7E,/* 0x50-0x57 */
+	0x6C74,0x6C86,0x6C76,0x6C8D,0x6C94,0x6C98,0x6C82,0x7076,/* 0x58-0x5F */
+	0x707C,0x707D,0x7078,0x7262,0x7261,0x7260,0x72C4,0x72C2,/* 0x60-0x67 */
+	0x7396,0x752C,0x752B,0x7537,0x7538,0x7682,0x76EF,0x77E3,/* 0x68-0x6F */
+	0x79C1,0x79C0,0x79BF,0x7A76,0x7CFB,0x7F55,0x8096,0x8093,/* 0x70-0x77 */
+	0x809D,0x8098,0x809B,0x809A,0x80B2,0x826F,0x8292,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x828B,0x828D,0x898B,0x89D2,0x8A00,0x8C37,0x8C46,/* 0xA0-0xA7 */
+	0x8C55,0x8C9D,0x8D64,0x8D70,0x8DB3,0x8EAB,0x8ECA,0x8F9B,/* 0xA8-0xAF */
+	0x8FB0,0x8FC2,0x8FC6,0x8FC5,0x8FC4,0x5DE1,0x9091,0x90A2,/* 0xB0-0xB7 */
+	0x90AA,0x90A6,0x90A3,0x9149,0x91C6,0x91CC,0x9632,0x962E,/* 0xB8-0xBF */
+	0x9631,0x962A,0x962C,0x4E26,0x4E56,0x4E73,0x4E8B,0x4E9B,/* 0xC0-0xC7 */
+	0x4E9E,0x4EAB,0x4EAC,0x4F6F,0x4F9D,0x4F8D,0x4F73,0x4F7F,/* 0xC8-0xCF */
+	0x4F6C,0x4F9B,0x4F8B,0x4F86,0x4F83,0x4F70,0x4F75,0x4F88,/* 0xD0-0xD7 */
+	0x4F69,0x4F7B,0x4F96,0x4F7E,0x4F8F,0x4F91,0x4F7A,0x5154,/* 0xD8-0xDF */
+	0x5152,0x5155,0x5169,0x5177,0x5176,0x5178,0x51BD,0x51FD,/* 0xE0-0xE7 */
+	0x523B,0x5238,0x5237,0x523A,0x5230,0x522E,0x5236,0x5241,/* 0xE8-0xEF */
+	0x52BE,0x52BB,0x5352,0x5354,0x5353,0x5351,0x5366,0x5377,/* 0xF0-0xF7 */
+	0x5378,0x5379,0x53D6,0x53D4,0x53D7,0x5473,0x5475,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_A9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5496,0x5478,0x5495,0x5480,0x547B,0x5477,0x5484,0x5492,/* 0x40-0x47 */
+	0x5486,0x547C,0x5490,0x5471,0x5476,0x548C,0x549A,0x5462,/* 0x48-0x4F */
+	0x5468,0x548B,0x547D,0x548E,0x56FA,0x5783,0x5777,0x576A,/* 0x50-0x57 */
+	0x5769,0x5761,0x5766,0x5764,0x577C,0x591C,0x5949,0x5947,/* 0x58-0x5F */
+	0x5948,0x5944,0x5954,0x59BE,0x59BB,0x59D4,0x59B9,0x59AE,/* 0x60-0x67 */
+	0x59D1,0x59C6,0x59D0,0x59CD,0x59CB,0x59D3,0x59CA,0x59AF,/* 0x68-0x6F */
+	0x59B3,0x59D2,0x59C5,0x5B5F,0x5B64,0x5B63,0x5B97,0x5B9A,/* 0x70-0x77 */
+	0x5B98,0x5B9C,0x5B99,0x5B9B,0x5C1A,0x5C48,0x5C45,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5C46,0x5CB7,0x5CA1,0x5CB8,0x5CA9,0x5CAB,0x5CB1,/* 0xA0-0xA7 */
+	0x5CB3,0x5E18,0x5E1A,0x5E16,0x5E15,0x5E1B,0x5E11,0x5E78,/* 0xA8-0xAF */
+	0x5E9A,0x5E97,0x5E9C,0x5E95,0x5E96,0x5EF6,0x5F26,0x5F27,/* 0xB0-0xB7 */
+	0x5F29,0x5F80,0x5F81,0x5F7F,0x5F7C,0x5FDD,0x5FE0,0x5FFD,/* 0xB8-0xBF */
+	0x5FF5,0x5FFF,0x600F,0x6014,0x602F,0x6035,0x6016,0x602A,/* 0xC0-0xC7 */
+	0x6015,0x6021,0x6027,0x6029,0x602B,0x601B,0x6216,0x6215,/* 0xC8-0xCF */
+	0x623F,0x623E,0x6240,0x627F,0x62C9,0x62CC,0x62C4,0x62BF,/* 0xD0-0xD7 */
+	0x62C2,0x62B9,0x62D2,0x62DB,0x62AB,0x62D3,0x62D4,0x62CB,/* 0xD8-0xDF */
+	0x62C8,0x62A8,0x62BD,0x62BC,0x62D0,0x62D9,0x62C7,0x62CD,/* 0xE0-0xE7 */
+	0x62B5,0x62DA,0x62B1,0x62D8,0x62D6,0x62D7,0x62C6,0x62AC,/* 0xE8-0xEF */
+	0x62CE,0x653E,0x65A7,0x65BC,0x65FA,0x6614,0x6613,0x660C,/* 0xF0-0xF7 */
+	0x6606,0x6602,0x660E,0x6600,0x660F,0x6615,0x660A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_AA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6607,0x670D,0x670B,0x676D,0x678B,0x6795,0x6771,0x679C,/* 0x40-0x47 */
+	0x6773,0x6777,0x6787,0x679D,0x6797,0x676F,0x6770,0x677F,/* 0x48-0x4F */
+	0x6789,0x677E,0x6790,0x6775,0x679A,0x6793,0x677C,0x676A,/* 0x50-0x57 */
+	0x6772,0x6B23,0x6B66,0x6B67,0x6B7F,0x6C13,0x6C1B,0x6CE3,/* 0x58-0x5F */
+	0x6CE8,0x6CF3,0x6CB1,0x6CCC,0x6CE5,0x6CB3,0x6CBD,0x6CBE,/* 0x60-0x67 */
+	0x6CBC,0x6CE2,0x6CAB,0x6CD5,0x6CD3,0x6CB8,0x6CC4,0x6CB9,/* 0x68-0x6F */
+	0x6CC1,0x6CAE,0x6CD7,0x6CC5,0x6CF1,0x6CBF,0x6CBB,0x6CE1,/* 0x70-0x77 */
+	0x6CDB,0x6CCA,0x6CAC,0x6CEF,0x6CDC,0x6CD6,0x6CE0,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7095,0x708E,0x7092,0x708A,0x7099,0x722C,0x722D,/* 0xA0-0xA7 */
+	0x7238,0x7248,0x7267,0x7269,0x72C0,0x72CE,0x72D9,0x72D7,/* 0xA8-0xAF */
+	0x72D0,0x73A9,0x73A8,0x739F,0x73AB,0x73A5,0x753D,0x759D,/* 0xB0-0xB7 */
+	0x7599,0x759A,0x7684,0x76C2,0x76F2,0x76F4,0x77E5,0x77FD,/* 0xB8-0xBF */
+	0x793E,0x7940,0x7941,0x79C9,0x79C8,0x7A7A,0x7A79,0x7AFA,/* 0xC0-0xC7 */
+	0x7CFE,0x7F54,0x7F8C,0x7F8B,0x8005,0x80BA,0x80A5,0x80A2,/* 0xC8-0xCF */
+	0x80B1,0x80A1,0x80AB,0x80A9,0x80B4,0x80AA,0x80AF,0x81E5,/* 0xD0-0xD7 */
+	0x81FE,0x820D,0x82B3,0x829D,0x8299,0x82AD,0x82BD,0x829F,/* 0xD8-0xDF */
+	0x82B9,0x82B1,0x82AC,0x82A5,0x82AF,0x82B8,0x82A3,0x82B0,/* 0xE0-0xE7 */
+	0x82BE,0x82B7,0x864E,0x8671,0x521D,0x8868,0x8ECB,0x8FCE,/* 0xE8-0xEF */
+	0x8FD4,0x8FD1,0x90B5,0x90B8,0x90B1,0x90B6,0x91C7,0x91D1,/* 0xF0-0xF7 */
+	0x9577,0x9580,0x961C,0x9640,0x963F,0x963B,0x9644,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_AB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9642,0x96B9,0x96E8,0x9752,0x975E,0x4E9F,0x4EAD,0x4EAE,/* 0x40-0x47 */
+	0x4FE1,0x4FB5,0x4FAF,0x4FBF,0x4FE0,0x4FD1,0x4FCF,0x4FDD,/* 0x48-0x4F */
+	0x4FC3,0x4FB6,0x4FD8,0x4FDF,0x4FCA,0x4FD7,0x4FAE,0x4FD0,/* 0x50-0x57 */
+	0x4FC4,0x4FC2,0x4FDA,0x4FCE,0x4FDE,0x4FB7,0x5157,0x5192,/* 0x58-0x5F */
+	0x5191,0x51A0,0x524E,0x5243,0x524A,0x524D,0x524C,0x524B,/* 0x60-0x67 */
+	0x5247,0x52C7,0x52C9,0x52C3,0x52C1,0x530D,0x5357,0x537B,/* 0x68-0x6F */
+	0x539A,0x53DB,0x54AC,0x54C0,0x54A8,0x54CE,0x54C9,0x54B8,/* 0x70-0x77 */
+	0x54A6,0x54B3,0x54C7,0x54C2,0x54BD,0x54AA,0x54C1,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x54C4,0x54C8,0x54AF,0x54AB,0x54B1,0x54BB,0x54A9,/* 0xA0-0xA7 */
+	0x54A7,0x54BF,0x56FF,0x5782,0x578B,0x57A0,0x57A3,0x57A2,/* 0xA8-0xAF */
+	0x57CE,0x57AE,0x5793,0x5955,0x5951,0x594F,0x594E,0x5950,/* 0xB0-0xB7 */
+	0x59DC,0x59D8,0x59FF,0x59E3,0x59E8,0x5A03,0x59E5,0x59EA,/* 0xB8-0xBF */
+	0x59DA,0x59E6,0x5A01,0x59FB,0x5B69,0x5BA3,0x5BA6,0x5BA4,/* 0xC0-0xC7 */
+	0x5BA2,0x5BA5,0x5C01,0x5C4E,0x5C4F,0x5C4D,0x5C4B,0x5CD9,/* 0xC8-0xCF */
+	0x5CD2,0x5DF7,0x5E1D,0x5E25,0x5E1F,0x5E7D,0x5EA0,0x5EA6,/* 0xD0-0xD7 */
+	0x5EFA,0x5F08,0x5F2D,0x5F65,0x5F88,0x5F85,0x5F8A,0x5F8B,/* 0xD8-0xDF */
+	0x5F87,0x5F8C,0x5F89,0x6012,0x601D,0x6020,0x6025,0x600E,/* 0xE0-0xE7 */
+	0x6028,0x604D,0x6070,0x6068,0x6062,0x6046,0x6043,0x606C,/* 0xE8-0xEF */
+	0x606B,0x606A,0x6064,0x6241,0x62DC,0x6316,0x6309,0x62FC,/* 0xF0-0xF7 */
+	0x62ED,0x6301,0x62EE,0x62FD,0x6307,0x62F1,0x62F7,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_AC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x62EF,0x62EC,0x62FE,0x62F4,0x6311,0x6302,0x653F,0x6545,/* 0x40-0x47 */
+	0x65AB,0x65BD,0x65E2,0x6625,0x662D,0x6620,0x6627,0x662F,/* 0x48-0x4F */
+	0x661F,0x6628,0x6631,0x6624,0x66F7,0x67FF,0x67D3,0x67F1,/* 0x50-0x57 */
+	0x67D4,0x67D0,0x67EC,0x67B6,0x67AF,0x67F5,0x67E9,0x67EF,/* 0x58-0x5F */
+	0x67C4,0x67D1,0x67B4,0x67DA,0x67E5,0x67B8,0x67CF,0x67DE,/* 0x60-0x67 */
+	0x67F3,0x67B0,0x67D9,0x67E2,0x67DD,0x67D2,0x6B6A,0x6B83,/* 0x68-0x6F */
+	0x6B86,0x6BB5,0x6BD2,0x6BD7,0x6C1F,0x6CC9,0x6D0B,0x6D32,/* 0x70-0x77 */
+	0x6D2A,0x6D41,0x6D25,0x6D0C,0x6D31,0x6D1E,0x6D17,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6D3B,0x6D3D,0x6D3E,0x6D36,0x6D1B,0x6CF5,0x6D39,/* 0xA0-0xA7 */
+	0x6D27,0x6D38,0x6D29,0x6D2E,0x6D35,0x6D0E,0x6D2B,0x70AB,/* 0xA8-0xAF */
+	0x70BA,0x70B3,0x70AC,0x70AF,0x70AD,0x70B8,0x70AE,0x70A4,/* 0xB0-0xB7 */
+	0x7230,0x7272,0x726F,0x7274,0x72E9,0x72E0,0x72E1,0x73B7,/* 0xB8-0xBF */
+	0x73CA,0x73BB,0x73B2,0x73CD,0x73C0,0x73B3,0x751A,0x752D,/* 0xC0-0xC7 */
+	0x754F,0x754C,0x754E,0x754B,0x75AB,0x75A4,0x75A5,0x75A2,/* 0xC8-0xCF */
+	0x75A3,0x7678,0x7686,0x7687,0x7688,0x76C8,0x76C6,0x76C3,/* 0xD0-0xD7 */
+	0x76C5,0x7701,0x76F9,0x76F8,0x7709,0x770B,0x76FE,0x76FC,/* 0xD8-0xDF */
+	0x7707,0x77DC,0x7802,0x7814,0x780C,0x780D,0x7946,0x7949,/* 0xE0-0xE7 */
+	0x7948,0x7947,0x79B9,0x79BA,0x79D1,0x79D2,0x79CB,0x7A7F,/* 0xE8-0xEF */
+	0x7A81,0x7AFF,0x7AFD,0x7C7D,0x7D02,0x7D05,0x7D00,0x7D09,/* 0xF0-0xF7 */
+	0x7D07,0x7D04,0x7D06,0x7F38,0x7F8E,0x7FBF,0x8004,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_AD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8010,0x800D,0x8011,0x8036,0x80D6,0x80E5,0x80DA,0x80C3,/* 0x40-0x47 */
+	0x80C4,0x80CC,0x80E1,0x80DB,0x80CE,0x80DE,0x80E4,0x80DD,/* 0x48-0x4F */
+	0x81F4,0x8222,0x82E7,0x8303,0x8305,0x82E3,0x82DB,0x82E6,/* 0x50-0x57 */
+	0x8304,0x82E5,0x8302,0x8309,0x82D2,0x82D7,0x82F1,0x8301,/* 0x58-0x5F */
+	0x82DC,0x82D4,0x82D1,0x82DE,0x82D3,0x82DF,0x82EF,0x8306,/* 0x60-0x67 */
+	0x8650,0x8679,0x867B,0x867A,0x884D,0x886B,0x8981,0x89D4,/* 0x68-0x6F */
+	0x8A08,0x8A02,0x8A03,0x8C9E,0x8CA0,0x8D74,0x8D73,0x8DB4,/* 0x70-0x77 */
+	0x8ECD,0x8ECC,0x8FF0,0x8FE6,0x8FE2,0x8FEA,0x8FE5,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8FED,0x8FEB,0x8FE4,0x8FE8,0x90CA,0x90CE,0x90C1,/* 0xA0-0xA7 */
+	0x90C3,0x914B,0x914A,0x91CD,0x9582,0x9650,0x964B,0x964C,/* 0xA8-0xAF */
+	0x964D,0x9762,0x9769,0x97CB,0x97ED,0x97F3,0x9801,0x98A8,/* 0xB0-0xB7 */
+	0x98DB,0x98DF,0x9996,0x9999,0x4E58,0x4EB3,0x500C,0x500D,/* 0xB8-0xBF */
+	0x5023,0x4FEF,0x5026,0x5025,0x4FF8,0x5029,0x5016,0x5006,/* 0xC0-0xC7 */
+	0x503C,0x501F,0x501A,0x5012,0x5011,0x4FFA,0x5000,0x5014,/* 0xC8-0xCF */
+	0x5028,0x4FF1,0x5021,0x500B,0x5019,0x5018,0x4FF3,0x4FEE,/* 0xD0-0xD7 */
+	0x502D,0x502A,0x4FFE,0x502B,0x5009,0x517C,0x51A4,0x51A5,/* 0xD8-0xDF */
+	0x51A2,0x51CD,0x51CC,0x51C6,0x51CB,0x5256,0x525C,0x5254,/* 0xE0-0xE7 */
+	0x525B,0x525D,0x532A,0x537F,0x539F,0x539D,0x53DF,0x54E8,/* 0xE8-0xEF */
+	0x5510,0x5501,0x5537,0x54FC,0x54E5,0x54F2,0x5506,0x54FA,/* 0xF0-0xF7 */
+	0x5514,0x54E9,0x54ED,0x54E1,0x5509,0x54EE,0x54EA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_AE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x54E6,0x5527,0x5507,0x54FD,0x550F,0x5703,0x5704,0x57C2,/* 0x40-0x47 */
+	0x57D4,0x57CB,0x57C3,0x5809,0x590F,0x5957,0x5958,0x595A,/* 0x48-0x4F */
+	0x5A11,0x5A18,0x5A1C,0x5A1F,0x5A1B,0x5A13,0x59EC,0x5A20,/* 0x50-0x57 */
+	0x5A23,0x5A29,0x5A25,0x5A0C,0x5A09,0x5B6B,0x5C58,0x5BB0,/* 0x58-0x5F */
+	0x5BB3,0x5BB6,0x5BB4,0x5BAE,0x5BB5,0x5BB9,0x5BB8,0x5C04,/* 0x60-0x67 */
+	0x5C51,0x5C55,0x5C50,0x5CED,0x5CFD,0x5CFB,0x5CEA,0x5CE8,/* 0x68-0x6F */
+	0x5CF0,0x5CF6,0x5D01,0x5CF4,0x5DEE,0x5E2D,0x5E2B,0x5EAB,/* 0x70-0x77 */
+	0x5EAD,0x5EA7,0x5F31,0x5F92,0x5F91,0x5F90,0x6059,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6063,0x6065,0x6050,0x6055,0x606D,0x6069,0x606F,/* 0xA0-0xA7 */
+	0x6084,0x609F,0x609A,0x608D,0x6094,0x608C,0x6085,0x6096,/* 0xA8-0xAF */
+	0x6247,0x62F3,0x6308,0x62FF,0x634E,0x633E,0x632F,0x6355,/* 0xB0-0xB7 */
+	0x6342,0x6346,0x634F,0x6349,0x633A,0x6350,0x633D,0x632A,/* 0xB8-0xBF */
+	0x632B,0x6328,0x634D,0x634C,0x6548,0x6549,0x6599,0x65C1,/* 0xC0-0xC7 */
+	0x65C5,0x6642,0x6649,0x664F,0x6643,0x6652,0x664C,0x6645,/* 0xC8-0xCF */
+	0x6641,0x66F8,0x6714,0x6715,0x6717,0x6821,0x6838,0x6848,/* 0xD0-0xD7 */
+	0x6846,0x6853,0x6839,0x6842,0x6854,0x6829,0x68B3,0x6817,/* 0xD8-0xDF */
+	0x684C,0x6851,0x683D,0x67F4,0x6850,0x6840,0x683C,0x6843,/* 0xE0-0xE7 */
+	0x682A,0x6845,0x6813,0x6818,0x6841,0x6B8A,0x6B89,0x6BB7,/* 0xE8-0xEF */
+	0x6C23,0x6C27,0x6C28,0x6C26,0x6C24,0x6CF0,0x6D6A,0x6D95,/* 0xF0-0xF7 */
+	0x6D88,0x6D87,0x6D66,0x6D78,0x6D77,0x6D59,0x6D93,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_AF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6D6C,0x6D89,0x6D6E,0x6D5A,0x6D74,0x6D69,0x6D8C,0x6D8A,/* 0x40-0x47 */
+	0x6D79,0x6D85,0x6D65,0x6D94,0x70CA,0x70D8,0x70E4,0x70D9,/* 0x48-0x4F */
+	0x70C8,0x70CF,0x7239,0x7279,0x72FC,0x72F9,0x72FD,0x72F8,/* 0x50-0x57 */
+	0x72F7,0x7386,0x73ED,0x7409,0x73EE,0x73E0,0x73EA,0x73DE,/* 0x58-0x5F */
+	0x7554,0x755D,0x755C,0x755A,0x7559,0x75BE,0x75C5,0x75C7,/* 0x60-0x67 */
+	0x75B2,0x75B3,0x75BD,0x75BC,0x75B9,0x75C2,0x75B8,0x768B,/* 0x68-0x6F */
+	0x76B0,0x76CA,0x76CD,0x76CE,0x7729,0x771F,0x7720,0x7728,/* 0x70-0x77 */
+	0x77E9,0x7830,0x7827,0x7838,0x781D,0x7834,0x7837,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7825,0x782D,0x7820,0x781F,0x7832,0x7955,0x7950,/* 0xA0-0xA7 */
+	0x7960,0x795F,0x7956,0x795E,0x795D,0x7957,0x795A,0x79E4,/* 0xA8-0xAF */
+	0x79E3,0x79E7,0x79DF,0x79E6,0x79E9,0x79D8,0x7A84,0x7A88,/* 0xB0-0xB7 */
+	0x7AD9,0x7B06,0x7B11,0x7C89,0x7D21,0x7D17,0x7D0B,0x7D0A,/* 0xB8-0xBF */
+	0x7D20,0x7D22,0x7D14,0x7D10,0x7D15,0x7D1A,0x7D1C,0x7D0D,/* 0xC0-0xC7 */
+	0x7D19,0x7D1B,0x7F3A,0x7F5F,0x7F94,0x7FC5,0x7FC1,0x8006,/* 0xC8-0xCF */
+	0x8018,0x8015,0x8019,0x8017,0x803D,0x803F,0x80F1,0x8102,/* 0xD0-0xD7 */
+	0x80F0,0x8105,0x80ED,0x80F4,0x8106,0x80F8,0x80F3,0x8108,/* 0xD8-0xDF */
+	0x80FD,0x810A,0x80FC,0x80EF,0x81ED,0x81EC,0x8200,0x8210,/* 0xE0-0xE7 */
+	0x822A,0x822B,0x8228,0x822C,0x82BB,0x832B,0x8352,0x8354,/* 0xE8-0xEF */
+	0x834A,0x8338,0x8350,0x8349,0x8335,0x8334,0x834F,0x8332,/* 0xF0-0xF7 */
+	0x8339,0x8336,0x8317,0x8340,0x8331,0x8328,0x8343,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8654,0x868A,0x86AA,0x8693,0x86A4,0x86A9,0x868C,0x86A3,/* 0x40-0x47 */
+	0x869C,0x8870,0x8877,0x8881,0x8882,0x887D,0x8879,0x8A18,/* 0x48-0x4F */
+	0x8A10,0x8A0E,0x8A0C,0x8A15,0x8A0A,0x8A17,0x8A13,0x8A16,/* 0x50-0x57 */
+	0x8A0F,0x8A11,0x8C48,0x8C7A,0x8C79,0x8CA1,0x8CA2,0x8D77,/* 0x58-0x5F */
+	0x8EAC,0x8ED2,0x8ED4,0x8ECF,0x8FB1,0x9001,0x9006,0x8FF7,/* 0x60-0x67 */
+	0x9000,0x8FFA,0x8FF4,0x9003,0x8FFD,0x9005,0x8FF8,0x9095,/* 0x68-0x6F */
+	0x90E1,0x90DD,0x90E2,0x9152,0x914D,0x914C,0x91D8,0x91DD,/* 0x70-0x77 */
+	0x91D7,0x91DC,0x91D9,0x9583,0x9662,0x9663,0x9661,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x965B,0x965D,0x9664,0x9658,0x965E,0x96BB,0x98E2,/* 0xA0-0xA7 */
+	0x99AC,0x9AA8,0x9AD8,0x9B25,0x9B32,0x9B3C,0x4E7E,0x507A,/* 0xA8-0xAF */
+	0x507D,0x505C,0x5047,0x5043,0x504C,0x505A,0x5049,0x5065,/* 0xB0-0xB7 */
+	0x5076,0x504E,0x5055,0x5075,0x5074,0x5077,0x504F,0x500F,/* 0xB8-0xBF */
+	0x506F,0x506D,0x515C,0x5195,0x51F0,0x526A,0x526F,0x52D2,/* 0xC0-0xC7 */
+	0x52D9,0x52D8,0x52D5,0x5310,0x530F,0x5319,0x533F,0x5340,/* 0xC8-0xCF */
+	0x533E,0x53C3,0x66FC,0x5546,0x556A,0x5566,0x5544,0x555E,/* 0xD0-0xD7 */
+	0x5561,0x5543,0x554A,0x5531,0x5556,0x554F,0x5555,0x552F,/* 0xD8-0xDF */
+	0x5564,0x5538,0x552E,0x555C,0x552C,0x5563,0x5533,0x5541,/* 0xE0-0xE7 */
+	0x5557,0x5708,0x570B,0x5709,0x57DF,0x5805,0x580A,0x5806,/* 0xE8-0xEF */
+	0x57E0,0x57E4,0x57FA,0x5802,0x5835,0x57F7,0x57F9,0x5920,/* 0xF0-0xF7 */
+	0x5962,0x5A36,0x5A41,0x5A49,0x5A66,0x5A6A,0x5A40,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5A3C,0x5A62,0x5A5A,0x5A46,0x5A4A,0x5B70,0x5BC7,0x5BC5,/* 0x40-0x47 */
+	0x5BC4,0x5BC2,0x5BBF,0x5BC6,0x5C09,0x5C08,0x5C07,0x5C60,/* 0x48-0x4F */
+	0x5C5C,0x5C5D,0x5D07,0x5D06,0x5D0E,0x5D1B,0x5D16,0x5D22,/* 0x50-0x57 */
+	0x5D11,0x5D29,0x5D14,0x5D19,0x5D24,0x5D27,0x5D17,0x5DE2,/* 0x58-0x5F */
+	0x5E38,0x5E36,0x5E33,0x5E37,0x5EB7,0x5EB8,0x5EB6,0x5EB5,/* 0x60-0x67 */
+	0x5EBE,0x5F35,0x5F37,0x5F57,0x5F6C,0x5F69,0x5F6B,0x5F97,/* 0x68-0x6F */
+	0x5F99,0x5F9E,0x5F98,0x5FA1,0x5FA0,0x5F9C,0x607F,0x60A3,/* 0x70-0x77 */
+	0x6089,0x60A0,0x60A8,0x60CB,0x60B4,0x60E6,0x60BD,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x60C5,0x60BB,0x60B5,0x60DC,0x60BC,0x60D8,0x60D5,/* 0xA0-0xA7 */
+	0x60C6,0x60DF,0x60B8,0x60DA,0x60C7,0x621A,0x621B,0x6248,/* 0xA8-0xAF */
+	0x63A0,0x63A7,0x6372,0x6396,0x63A2,0x63A5,0x6377,0x6367,/* 0xB0-0xB7 */
+	0x6398,0x63AA,0x6371,0x63A9,0x6389,0x6383,0x639B,0x636B,/* 0xB8-0xBF */
+	0x63A8,0x6384,0x6388,0x6399,0x63A1,0x63AC,0x6392,0x638F,/* 0xC0-0xC7 */
+	0x6380,0x637B,0x6369,0x6368,0x637A,0x655D,0x6556,0x6551,/* 0xC8-0xCF */
+	0x6559,0x6557,0x555F,0x654F,0x6558,0x6555,0x6554,0x659C,/* 0xD0-0xD7 */
+	0x659B,0x65AC,0x65CF,0x65CB,0x65CC,0x65CE,0x665D,0x665A,/* 0xD8-0xDF */
+	0x6664,0x6668,0x6666,0x665E,0x66F9,0x52D7,0x671B,0x6881,/* 0xE0-0xE7 */
+	0x68AF,0x68A2,0x6893,0x68B5,0x687F,0x6876,0x68B1,0x68A7,/* 0xE8-0xEF */
+	0x6897,0x68B0,0x6883,0x68C4,0x68AD,0x6886,0x6885,0x6894,/* 0xF0-0xF7 */
+	0x689D,0x68A8,0x689F,0x68A1,0x6882,0x6B32,0x6BBA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6BEB,0x6BEC,0x6C2B,0x6D8E,0x6DBC,0x6DF3,0x6DD9,0x6DB2,/* 0x40-0x47 */
+	0x6DE1,0x6DCC,0x6DE4,0x6DFB,0x6DFA,0x6E05,0x6DC7,0x6DCB,/* 0x48-0x4F */
+	0x6DAF,0x6DD1,0x6DAE,0x6DDE,0x6DF9,0x6DB8,0x6DF7,0x6DF5,/* 0x50-0x57 */
+	0x6DC5,0x6DD2,0x6E1A,0x6DB5,0x6DDA,0x6DEB,0x6DD8,0x6DEA,/* 0x58-0x5F */
+	0x6DF1,0x6DEE,0x6DE8,0x6DC6,0x6DC4,0x6DAA,0x6DEC,0x6DBF,/* 0x60-0x67 */
+	0x6DE6,0x70F9,0x7109,0x710A,0x70FD,0x70EF,0x723D,0x727D,/* 0x68-0x6F */
+	0x7281,0x731C,0x731B,0x7316,0x7313,0x7319,0x7387,0x7405,/* 0x70-0x77 */
+	0x740A,0x7403,0x7406,0x73FE,0x740D,0x74E0,0x74F6,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x74F7,0x751C,0x7522,0x7565,0x7566,0x7562,0x7570,/* 0xA0-0xA7 */
+	0x758F,0x75D4,0x75D5,0x75B5,0x75CA,0x75CD,0x768E,0x76D4,/* 0xA8-0xAF */
+	0x76D2,0x76DB,0x7737,0x773E,0x773C,0x7736,0x7738,0x773A,/* 0xB0-0xB7 */
+	0x786B,0x7843,0x784E,0x7965,0x7968,0x796D,0x79FB,0x7A92,/* 0xB8-0xBF */
+	0x7A95,0x7B20,0x7B28,0x7B1B,0x7B2C,0x7B26,0x7B19,0x7B1E,/* 0xC0-0xC7 */
+	0x7B2E,0x7C92,0x7C97,0x7C95,0x7D46,0x7D43,0x7D71,0x7D2E,/* 0xC8-0xCF */
+	0x7D39,0x7D3C,0x7D40,0x7D30,0x7D33,0x7D44,0x7D2F,0x7D42,/* 0xD0-0xD7 */
+	0x7D32,0x7D31,0x7F3D,0x7F9E,0x7F9A,0x7FCC,0x7FCE,0x7FD2,/* 0xD8-0xDF */
+	0x801C,0x804A,0x8046,0x812F,0x8116,0x8123,0x812B,0x8129,/* 0xE0-0xE7 */
+	0x8130,0x8124,0x8202,0x8235,0x8237,0x8236,0x8239,0x838E,/* 0xE8-0xEF */
+	0x839E,0x8398,0x8378,0x83A2,0x8396,0x83BD,0x83AB,0x8392,/* 0xF0-0xF7 */
+	0x838A,0x8393,0x8389,0x83A0,0x8377,0x837B,0x837C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8386,0x83A7,0x8655,0x5F6A,0x86C7,0x86C0,0x86B6,0x86C4,/* 0x40-0x47 */
+	0x86B5,0x86C6,0x86CB,0x86B1,0x86AF,0x86C9,0x8853,0x889E,/* 0x48-0x4F */
+	0x8888,0x88AB,0x8892,0x8896,0x888D,0x888B,0x8993,0x898F,/* 0x50-0x57 */
+	0x8A2A,0x8A1D,0x8A23,0x8A25,0x8A31,0x8A2D,0x8A1F,0x8A1B,/* 0x58-0x5F */
+	0x8A22,0x8C49,0x8C5A,0x8CA9,0x8CAC,0x8CAB,0x8CA8,0x8CAA,/* 0x60-0x67 */
+	0x8CA7,0x8D67,0x8D66,0x8DBE,0x8DBA,0x8EDB,0x8EDF,0x9019,/* 0x68-0x6F */
+	0x900D,0x901A,0x9017,0x9023,0x901F,0x901D,0x9010,0x9015,/* 0x70-0x77 */
+	0x901E,0x9020,0x900F,0x9022,0x9016,0x901B,0x9014,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x90E8,0x90ED,0x90FD,0x9157,0x91CE,0x91F5,0x91E6,/* 0xA0-0xA7 */
+	0x91E3,0x91E7,0x91ED,0x91E9,0x9589,0x966A,0x9675,0x9673,/* 0xA8-0xAF */
+	0x9678,0x9670,0x9674,0x9676,0x9677,0x966C,0x96C0,0x96EA,/* 0xB0-0xB7 */
+	0x96E9,0x7AE0,0x7ADF,0x9802,0x9803,0x9B5A,0x9CE5,0x9E75,/* 0xB8-0xBF */
+	0x9E7F,0x9EA5,0x9EBB,0x50A2,0x508D,0x5085,0x5099,0x5091,/* 0xC0-0xC7 */
+	0x5080,0x5096,0x5098,0x509A,0x6700,0x51F1,0x5272,0x5274,/* 0xC8-0xCF */
+	0x5275,0x5269,0x52DE,0x52DD,0x52DB,0x535A,0x53A5,0x557B,/* 0xD0-0xD7 */
+	0x5580,0x55A7,0x557C,0x558A,0x559D,0x5598,0x5582,0x559C,/* 0xD8-0xDF */
+	0x55AA,0x5594,0x5587,0x558B,0x5583,0x55B3,0x55AE,0x559F,/* 0xE0-0xE7 */
+	0x553E,0x55B2,0x559A,0x55BB,0x55AC,0x55B1,0x557E,0x5589,/* 0xE8-0xEF */
+	0x55AB,0x5599,0x570D,0x582F,0x582A,0x5834,0x5824,0x5830,/* 0xF0-0xF7 */
+	0x5831,0x5821,0x581D,0x5820,0x58F9,0x58FA,0x5960,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5A77,0x5A9A,0x5A7F,0x5A92,0x5A9B,0x5AA7,0x5B73,0x5B71,/* 0x40-0x47 */
+	0x5BD2,0x5BCC,0x5BD3,0x5BD0,0x5C0A,0x5C0B,0x5C31,0x5D4C,/* 0x48-0x4F */
+	0x5D50,0x5D34,0x5D47,0x5DFD,0x5E45,0x5E3D,0x5E40,0x5E43,/* 0x50-0x57 */
+	0x5E7E,0x5ECA,0x5EC1,0x5EC2,0x5EC4,0x5F3C,0x5F6D,0x5FA9,/* 0x58-0x5F */
+	0x5FAA,0x5FA8,0x60D1,0x60E1,0x60B2,0x60B6,0x60E0,0x611C,/* 0x60-0x67 */
+	0x6123,0x60FA,0x6115,0x60F0,0x60FB,0x60F4,0x6168,0x60F1,/* 0x68-0x6F */
+	0x610E,0x60F6,0x6109,0x6100,0x6112,0x621F,0x6249,0x63A3,/* 0x70-0x77 */
+	0x638C,0x63CF,0x63C0,0x63E9,0x63C9,0x63C6,0x63CD,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x63D2,0x63E3,0x63D0,0x63E1,0x63D6,0x63ED,0x63EE,/* 0xA0-0xA7 */
+	0x6376,0x63F4,0x63EA,0x63DB,0x6452,0x63DA,0x63F9,0x655E,/* 0xA8-0xAF */
+	0x6566,0x6562,0x6563,0x6591,0x6590,0x65AF,0x666E,0x6670,/* 0xB0-0xB7 */
+	0x6674,0x6676,0x666F,0x6691,0x667A,0x667E,0x6677,0x66FE,/* 0xB8-0xBF */
+	0x66FF,0x671F,0x671D,0x68FA,0x68D5,0x68E0,0x68D8,0x68D7,/* 0xC0-0xC7 */
+	0x6905,0x68DF,0x68F5,0x68EE,0x68E7,0x68F9,0x68D2,0x68F2,/* 0xC8-0xCF */
+	0x68E3,0x68CB,0x68CD,0x690D,0x6912,0x690E,0x68C9,0x68DA,/* 0xD0-0xD7 */
+	0x696E,0x68FB,0x6B3E,0x6B3A,0x6B3D,0x6B98,0x6B96,0x6BBC,/* 0xD8-0xDF */
+	0x6BEF,0x6C2E,0x6C2F,0x6C2C,0x6E2F,0x6E38,0x6E54,0x6E21,/* 0xE0-0xE7 */
+	0x6E32,0x6E67,0x6E4A,0x6E20,0x6E25,0x6E23,0x6E1B,0x6E5B,/* 0xE8-0xEF */
+	0x6E58,0x6E24,0x6E56,0x6E6E,0x6E2D,0x6E26,0x6E6F,0x6E34,/* 0xF0-0xF7 */
+	0x6E4D,0x6E3A,0x6E2C,0x6E43,0x6E1D,0x6E3E,0x6ECB,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6E89,0x6E19,0x6E4E,0x6E63,0x6E44,0x6E72,0x6E69,0x6E5F,/* 0x40-0x47 */
+	0x7119,0x711A,0x7126,0x7130,0x7121,0x7136,0x716E,0x711C,/* 0x48-0x4F */
+	0x724C,0x7284,0x7280,0x7336,0x7325,0x7334,0x7329,0x743A,/* 0x50-0x57 */
+	0x742A,0x7433,0x7422,0x7425,0x7435,0x7436,0x7434,0x742F,/* 0x58-0x5F */
+	0x741B,0x7426,0x7428,0x7525,0x7526,0x756B,0x756A,0x75E2,/* 0x60-0x67 */
+	0x75DB,0x75E3,0x75D9,0x75D8,0x75DE,0x75E0,0x767B,0x767C,/* 0x68-0x6F */
+	0x7696,0x7693,0x76B4,0x76DC,0x774F,0x77ED,0x785D,0x786C,/* 0x70-0x77 */
+	0x786F,0x7A0D,0x7A08,0x7A0B,0x7A05,0x7A00,0x7A98,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7A97,0x7A96,0x7AE5,0x7AE3,0x7B49,0x7B56,0x7B46,/* 0xA0-0xA7 */
+	0x7B50,0x7B52,0x7B54,0x7B4D,0x7B4B,0x7B4F,0x7B51,0x7C9F,/* 0xA8-0xAF */
+	0x7CA5,0x7D5E,0x7D50,0x7D68,0x7D55,0x7D2B,0x7D6E,0x7D72,/* 0xB0-0xB7 */
+	0x7D61,0x7D66,0x7D62,0x7D70,0x7D73,0x5584,0x7FD4,0x7FD5,/* 0xB8-0xBF */
+	0x800B,0x8052,0x8085,0x8155,0x8154,0x814B,0x8151,0x814E,/* 0xC0-0xC7 */
+	0x8139,0x8146,0x813E,0x814C,0x8153,0x8174,0x8212,0x821C,/* 0xC8-0xCF */
+	0x83E9,0x8403,0x83F8,0x840D,0x83E0,0x83C5,0x840B,0x83C1,/* 0xD0-0xD7 */
+	0x83EF,0x83F1,0x83F4,0x8457,0x840A,0x83F0,0x840C,0x83CC,/* 0xD8-0xDF */
+	0x83FD,0x83F2,0x83CA,0x8438,0x840E,0x8404,0x83DC,0x8407,/* 0xE0-0xE7 */
+	0x83D4,0x83DF,0x865B,0x86DF,0x86D9,0x86ED,0x86D4,0x86DB,/* 0xE8-0xEF */
+	0x86E4,0x86D0,0x86DE,0x8857,0x88C1,0x88C2,0x88B1,0x8983,/* 0xF0-0xF7 */
+	0x8996,0x8A3B,0x8A60,0x8A55,0x8A5E,0x8A3C,0x8A41,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8A54,0x8A5B,0x8A50,0x8A46,0x8A34,0x8A3A,0x8A36,0x8A56,/* 0x40-0x47 */
+	0x8C61,0x8C82,0x8CAF,0x8CBC,0x8CB3,0x8CBD,0x8CC1,0x8CBB,/* 0x48-0x4F */
+	0x8CC0,0x8CB4,0x8CB7,0x8CB6,0x8CBF,0x8CB8,0x8D8A,0x8D85,/* 0x50-0x57 */
+	0x8D81,0x8DCE,0x8DDD,0x8DCB,0x8DDA,0x8DD1,0x8DCC,0x8DDB,/* 0x58-0x5F */
+	0x8DC6,0x8EFB,0x8EF8,0x8EFC,0x8F9C,0x902E,0x9035,0x9031,/* 0x60-0x67 */
+	0x9038,0x9032,0x9036,0x9102,0x90F5,0x9109,0x90FE,0x9163,/* 0x68-0x6F */
+	0x9165,0x91CF,0x9214,0x9215,0x9223,0x9209,0x921E,0x920D,/* 0x70-0x77 */
+	0x9210,0x9207,0x9211,0x9594,0x958F,0x958B,0x9591,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9593,0x9592,0x958E,0x968A,0x968E,0x968B,0x967D,/* 0xA0-0xA7 */
+	0x9685,0x9686,0x968D,0x9672,0x9684,0x96C1,0x96C5,0x96C4,/* 0xA8-0xAF */
+	0x96C6,0x96C7,0x96EF,0x96F2,0x97CC,0x9805,0x9806,0x9808,/* 0xB0-0xB7 */
+	0x98E7,0x98EA,0x98EF,0x98E9,0x98F2,0x98ED,0x99AE,0x99AD,/* 0xB8-0xBF */
+	0x9EC3,0x9ECD,0x9ED1,0x4E82,0x50AD,0x50B5,0x50B2,0x50B3,/* 0xC0-0xC7 */
+	0x50C5,0x50BE,0x50AC,0x50B7,0x50BB,0x50AF,0x50C7,0x527F,/* 0xC8-0xCF */
+	0x5277,0x527D,0x52DF,0x52E6,0x52E4,0x52E2,0x52E3,0x532F,/* 0xD0-0xD7 */
+	0x55DF,0x55E8,0x55D3,0x55E6,0x55CE,0x55DC,0x55C7,0x55D1,/* 0xD8-0xDF */
+	0x55E3,0x55E4,0x55EF,0x55DA,0x55E1,0x55C5,0x55C6,0x55E5,/* 0xE0-0xE7 */
+	0x55C9,0x5712,0x5713,0x585E,0x5851,0x5858,0x5857,0x585A,/* 0xE8-0xEF */
+	0x5854,0x586B,0x584C,0x586D,0x584A,0x5862,0x5852,0x584B,/* 0xF0-0xF7 */
+	0x5967,0x5AC1,0x5AC9,0x5ACC,0x5ABE,0x5ABD,0x5ABC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5AB3,0x5AC2,0x5AB2,0x5D69,0x5D6F,0x5E4C,0x5E79,0x5EC9,/* 0x40-0x47 */
+	0x5EC8,0x5F12,0x5F59,0x5FAC,0x5FAE,0x611A,0x610F,0x6148,/* 0x48-0x4F */
+	0x611F,0x60F3,0x611B,0x60F9,0x6101,0x6108,0x614E,0x614C,/* 0x50-0x57 */
+	0x6144,0x614D,0x613E,0x6134,0x6127,0x610D,0x6106,0x6137,/* 0x58-0x5F */
+	0x6221,0x6222,0x6413,0x643E,0x641E,0x642A,0x642D,0x643D,/* 0x60-0x67 */
+	0x642C,0x640F,0x641C,0x6414,0x640D,0x6436,0x6416,0x6417,/* 0x68-0x6F */
+	0x6406,0x656C,0x659F,0x65B0,0x6697,0x6689,0x6687,0x6688,/* 0x70-0x77 */
+	0x6696,0x6684,0x6698,0x668D,0x6703,0x6994,0x696D,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x695A,0x6977,0x6960,0x6954,0x6975,0x6930,0x6982,/* 0xA0-0xA7 */
+	0x694A,0x6968,0x696B,0x695E,0x6953,0x6979,0x6986,0x695D,/* 0xA8-0xAF */
+	0x6963,0x695B,0x6B47,0x6B72,0x6BC0,0x6BBF,0x6BD3,0x6BFD,/* 0xB0-0xB7 */
+	0x6EA2,0x6EAF,0x6ED3,0x6EB6,0x6EC2,0x6E90,0x6E9D,0x6EC7,/* 0xB8-0xBF */
+	0x6EC5,0x6EA5,0x6E98,0x6EBC,0x6EBA,0x6EAB,0x6ED1,0x6E96,/* 0xC0-0xC7 */
+	0x6E9C,0x6EC4,0x6ED4,0x6EAA,0x6EA7,0x6EB4,0x714E,0x7159,/* 0xC8-0xCF */
+	0x7169,0x7164,0x7149,0x7167,0x715C,0x716C,0x7166,0x714C,/* 0xD0-0xD7 */
+	0x7165,0x715E,0x7146,0x7168,0x7156,0x723A,0x7252,0x7337,/* 0xD8-0xDF */
+	0x7345,0x733F,0x733E,0x746F,0x745A,0x7455,0x745F,0x745E,/* 0xE0-0xE7 */
+	0x7441,0x743F,0x7459,0x745B,0x745C,0x7576,0x7578,0x7600,/* 0xE8-0xEF */
+	0x75F0,0x7601,0x75F2,0x75F1,0x75FA,0x75FF,0x75F4,0x75F3,/* 0xF0-0xF7 */
+	0x76DE,0x76DF,0x775B,0x776B,0x7766,0x775E,0x7763,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7779,0x776A,0x776C,0x775C,0x7765,0x7768,0x7762,0x77EE,/* 0x40-0x47 */
+	0x788E,0x78B0,0x7897,0x7898,0x788C,0x7889,0x787C,0x7891,/* 0x48-0x4F */
+	0x7893,0x787F,0x797A,0x797F,0x7981,0x842C,0x79BD,0x7A1C,/* 0x50-0x57 */
+	0x7A1A,0x7A20,0x7A14,0x7A1F,0x7A1E,0x7A9F,0x7AA0,0x7B77,/* 0x58-0x5F */
+	0x7BC0,0x7B60,0x7B6E,0x7B67,0x7CB1,0x7CB3,0x7CB5,0x7D93,/* 0x60-0x67 */
+	0x7D79,0x7D91,0x7D81,0x7D8F,0x7D5B,0x7F6E,0x7F69,0x7F6A,/* 0x68-0x6F */
+	0x7F72,0x7FA9,0x7FA8,0x7FA4,0x8056,0x8058,0x8086,0x8084,/* 0x70-0x77 */
+	0x8171,0x8170,0x8178,0x8165,0x816E,0x8173,0x816B,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8179,0x817A,0x8166,0x8205,0x8247,0x8482,0x8477,/* 0xA0-0xA7 */
+	0x843D,0x8431,0x8475,0x8466,0x846B,0x8449,0x846C,0x845B,/* 0xA8-0xAF */
+	0x843C,0x8435,0x8461,0x8463,0x8469,0x846D,0x8446,0x865E,/* 0xB0-0xB7 */
+	0x865C,0x865F,0x86F9,0x8713,0x8708,0x8707,0x8700,0x86FE,/* 0xB8-0xBF */
+	0x86FB,0x8702,0x8703,0x8706,0x870A,0x8859,0x88DF,0x88D4,/* 0xC0-0xC7 */
+	0x88D9,0x88DC,0x88D8,0x88DD,0x88E1,0x88CA,0x88D5,0x88D2,/* 0xC8-0xCF */
+	0x899C,0x89E3,0x8A6B,0x8A72,0x8A73,0x8A66,0x8A69,0x8A70,/* 0xD0-0xD7 */
+	0x8A87,0x8A7C,0x8A63,0x8AA0,0x8A71,0x8A85,0x8A6D,0x8A62,/* 0xD8-0xDF */
+	0x8A6E,0x8A6C,0x8A79,0x8A7B,0x8A3E,0x8A68,0x8C62,0x8C8A,/* 0xE0-0xE7 */
+	0x8C89,0x8CCA,0x8CC7,0x8CC8,0x8CC4,0x8CB2,0x8CC3,0x8CC2,/* 0xE8-0xEF */
+	0x8CC5,0x8DE1,0x8DDF,0x8DE8,0x8DEF,0x8DF3,0x8DFA,0x8DEA,/* 0xF0-0xF7 */
+	0x8DE4,0x8DE6,0x8EB2,0x8F03,0x8F09,0x8EFE,0x8F0A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_B9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8F9F,0x8FB2,0x904B,0x904A,0x9053,0x9042,0x9054,0x903C,/* 0x40-0x47 */
+	0x9055,0x9050,0x9047,0x904F,0x904E,0x904D,0x9051,0x903E,/* 0x48-0x4F */
+	0x9041,0x9112,0x9117,0x916C,0x916A,0x9169,0x91C9,0x9237,/* 0x50-0x57 */
+	0x9257,0x9238,0x923D,0x9240,0x923E,0x925B,0x924B,0x9264,/* 0x58-0x5F */
+	0x9251,0x9234,0x9249,0x924D,0x9245,0x9239,0x923F,0x925A,/* 0x60-0x67 */
+	0x9598,0x9698,0x9694,0x9695,0x96CD,0x96CB,0x96C9,0x96CA,/* 0x68-0x6F */
+	0x96F7,0x96FB,0x96F9,0x96F6,0x9756,0x9774,0x9776,0x9810,/* 0x70-0x77 */
+	0x9811,0x9813,0x980A,0x9812,0x980C,0x98FC,0x98F4,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x98FD,0x98FE,0x99B3,0x99B1,0x99B4,0x9AE1,0x9CE9,/* 0xA0-0xA7 */
+	0x9E82,0x9F0E,0x9F13,0x9F20,0x50E7,0x50EE,0x50E5,0x50D6,/* 0xA8-0xAF */
+	0x50ED,0x50DA,0x50D5,0x50CF,0x50D1,0x50F1,0x50CE,0x50E9,/* 0xB0-0xB7 */
+	0x5162,0x51F3,0x5283,0x5282,0x5331,0x53AD,0x55FE,0x5600,/* 0xB8-0xBF */
+	0x561B,0x5617,0x55FD,0x5614,0x5606,0x5609,0x560D,0x560E,/* 0xC0-0xC7 */
+	0x55F7,0x5616,0x561F,0x5608,0x5610,0x55F6,0x5718,0x5716,/* 0xC8-0xCF */
+	0x5875,0x587E,0x5883,0x5893,0x588A,0x5879,0x5885,0x587D,/* 0xD0-0xD7 */
+	0x58FD,0x5925,0x5922,0x5924,0x596A,0x5969,0x5AE1,0x5AE6,/* 0xD8-0xDF */
+	0x5AE9,0x5AD7,0x5AD6,0x5AD8,0x5AE3,0x5B75,0x5BDE,0x5BE7,/* 0xE0-0xE7 */
+	0x5BE1,0x5BE5,0x5BE6,0x5BE8,0x5BE2,0x5BE4,0x5BDF,0x5C0D,/* 0xE8-0xEF */
+	0x5C62,0x5D84,0x5D87,0x5E5B,0x5E63,0x5E55,0x5E57,0x5E54,/* 0xF0-0xF7 */
+	0x5ED3,0x5ED6,0x5F0A,0x5F46,0x5F70,0x5FB9,0x6147,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x613F,0x614B,0x6177,0x6162,0x6163,0x615F,0x615A,0x6158,/* 0x40-0x47 */
+	0x6175,0x622A,0x6487,0x6458,0x6454,0x64A4,0x6478,0x645F,/* 0x48-0x4F */
+	0x647A,0x6451,0x6467,0x6434,0x646D,0x647B,0x6572,0x65A1,/* 0x50-0x57 */
+	0x65D7,0x65D6,0x66A2,0x66A8,0x669D,0x699C,0x69A8,0x6995,/* 0x58-0x5F */
+	0x69C1,0x69AE,0x69D3,0x69CB,0x699B,0x69B7,0x69BB,0x69AB,/* 0x60-0x67 */
+	0x69B4,0x69D0,0x69CD,0x69AD,0x69CC,0x69A6,0x69C3,0x69A3,/* 0x68-0x6F */
+	0x6B49,0x6B4C,0x6C33,0x6F33,0x6F14,0x6EFE,0x6F13,0x6EF4,/* 0x70-0x77 */
+	0x6F29,0x6F3E,0x6F20,0x6F2C,0x6F0F,0x6F02,0x6F22,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6EFF,0x6EEF,0x6F06,0x6F31,0x6F38,0x6F32,0x6F23,/* 0xA0-0xA7 */
+	0x6F15,0x6F2B,0x6F2F,0x6F88,0x6F2A,0x6EEC,0x6F01,0x6EF2,/* 0xA8-0xAF */
+	0x6ECC,0x6EF7,0x7194,0x7199,0x717D,0x718A,0x7184,0x7192,/* 0xB0-0xB7 */
+	0x723E,0x7292,0x7296,0x7344,0x7350,0x7464,0x7463,0x746A,/* 0xB8-0xBF */
+	0x7470,0x746D,0x7504,0x7591,0x7627,0x760D,0x760B,0x7609,/* 0xC0-0xC7 */
+	0x7613,0x76E1,0x76E3,0x7784,0x777D,0x777F,0x7761,0x78C1,/* 0xC8-0xCF */
+	0x789F,0x78A7,0x78B3,0x78A9,0x78A3,0x798E,0x798F,0x798D,/* 0xD0-0xD7 */
+	0x7A2E,0x7A31,0x7AAA,0x7AA9,0x7AED,0x7AEF,0x7BA1,0x7B95,/* 0xD8-0xDF */
+	0x7B8B,0x7B75,0x7B97,0x7B9D,0x7B94,0x7B8F,0x7BB8,0x7B87,/* 0xE0-0xE7 */
+	0x7B84,0x7CB9,0x7CBD,0x7CBE,0x7DBB,0x7DB0,0x7D9C,0x7DBD,/* 0xE8-0xEF */
+	0x7DBE,0x7DA0,0x7DCA,0x7DB4,0x7DB2,0x7DB1,0x7DBA,0x7DA2,/* 0xF0-0xF7 */
+	0x7DBF,0x7DB5,0x7DB8,0x7DAD,0x7DD2,0x7DC7,0x7DAC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7F70,0x7FE0,0x7FE1,0x7FDF,0x805E,0x805A,0x8087,0x8150,/* 0x40-0x47 */
+	0x8180,0x818F,0x8188,0x818A,0x817F,0x8182,0x81E7,0x81FA,/* 0x48-0x4F */
+	0x8207,0x8214,0x821E,0x824B,0x84C9,0x84BF,0x84C6,0x84C4,/* 0x50-0x57 */
+	0x8499,0x849E,0x84B2,0x849C,0x84CB,0x84B8,0x84C0,0x84D3,/* 0x58-0x5F */
+	0x8490,0x84BC,0x84D1,0x84CA,0x873F,0x871C,0x873B,0x8722,/* 0x60-0x67 */
+	0x8725,0x8734,0x8718,0x8755,0x8737,0x8729,0x88F3,0x8902,/* 0x68-0x6F */
+	0x88F4,0x88F9,0x88F8,0x88FD,0x88E8,0x891A,0x88EF,0x8AA6,/* 0x70-0x77 */
+	0x8A8C,0x8A9E,0x8AA3,0x8A8D,0x8AA1,0x8A93,0x8AA4,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8AAA,0x8AA5,0x8AA8,0x8A98,0x8A91,0x8A9A,0x8AA7,/* 0xA0-0xA7 */
+	0x8C6A,0x8C8D,0x8C8C,0x8CD3,0x8CD1,0x8CD2,0x8D6B,0x8D99,/* 0xA8-0xAF */
+	0x8D95,0x8DFC,0x8F14,0x8F12,0x8F15,0x8F13,0x8FA3,0x9060,/* 0xB0-0xB7 */
+	0x9058,0x905C,0x9063,0x9059,0x905E,0x9062,0x905D,0x905B,/* 0xB8-0xBF */
+	0x9119,0x9118,0x911E,0x9175,0x9178,0x9177,0x9174,0x9278,/* 0xC0-0xC7 */
+	0x9280,0x9285,0x9298,0x9296,0x927B,0x9293,0x929C,0x92A8,/* 0xC8-0xCF */
+	0x927C,0x9291,0x95A1,0x95A8,0x95A9,0x95A3,0x95A5,0x95A4,/* 0xD0-0xD7 */
+	0x9699,0x969C,0x969B,0x96CC,0x96D2,0x9700,0x977C,0x9785,/* 0xD8-0xDF */
+	0x97F6,0x9817,0x9818,0x98AF,0x98B1,0x9903,0x9905,0x990C,/* 0xE0-0xE7 */
+	0x9909,0x99C1,0x9AAF,0x9AB0,0x9AE6,0x9B41,0x9B42,0x9CF4,/* 0xE8-0xEF */
+	0x9CF6,0x9CF3,0x9EBC,0x9F3B,0x9F4A,0x5104,0x5100,0x50FB,/* 0xF0-0xF7 */
+	0x50F5,0x50F9,0x5102,0x5108,0x5109,0x5105,0x51DC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5287,0x5288,0x5289,0x528D,0x528A,0x52F0,0x53B2,0x562E,/* 0x40-0x47 */
+	0x563B,0x5639,0x5632,0x563F,0x5634,0x5629,0x5653,0x564E,/* 0x48-0x4F */
+	0x5657,0x5674,0x5636,0x562F,0x5630,0x5880,0x589F,0x589E,/* 0x50-0x57 */
+	0x58B3,0x589C,0x58AE,0x58A9,0x58A6,0x596D,0x5B09,0x5AFB,/* 0x58-0x5F */
+	0x5B0B,0x5AF5,0x5B0C,0x5B08,0x5BEE,0x5BEC,0x5BE9,0x5BEB,/* 0x60-0x67 */
+	0x5C64,0x5C65,0x5D9D,0x5D94,0x5E62,0x5E5F,0x5E61,0x5EE2,/* 0x68-0x6F */
+	0x5EDA,0x5EDF,0x5EDD,0x5EE3,0x5EE0,0x5F48,0x5F71,0x5FB7,/* 0x70-0x77 */
+	0x5FB5,0x6176,0x6167,0x616E,0x615D,0x6155,0x6182,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x617C,0x6170,0x616B,0x617E,0x61A7,0x6190,0x61AB,/* 0xA0-0xA7 */
+	0x618E,0x61AC,0x619A,0x61A4,0x6194,0x61AE,0x622E,0x6469,/* 0xA8-0xAF */
+	0x646F,0x6479,0x649E,0x64B2,0x6488,0x6490,0x64B0,0x64A5,/* 0xB0-0xB7 */
+	0x6493,0x6495,0x64A9,0x6492,0x64AE,0x64AD,0x64AB,0x649A,/* 0xB8-0xBF */
+	0x64AC,0x6499,0x64A2,0x64B3,0x6575,0x6577,0x6578,0x66AE,/* 0xC0-0xC7 */
+	0x66AB,0x66B4,0x66B1,0x6A23,0x6A1F,0x69E8,0x6A01,0x6A1E,/* 0xC8-0xCF */
+	0x6A19,0x69FD,0x6A21,0x6A13,0x6A0A,0x69F3,0x6A02,0x6A05,/* 0xD0-0xD7 */
+	0x69ED,0x6A11,0x6B50,0x6B4E,0x6BA4,0x6BC5,0x6BC6,0x6F3F,/* 0xD8-0xDF */
+	0x6F7C,0x6F84,0x6F51,0x6F66,0x6F54,0x6F86,0x6F6D,0x6F5B,/* 0xE0-0xE7 */
+	0x6F78,0x6F6E,0x6F8E,0x6F7A,0x6F70,0x6F64,0x6F97,0x6F58,/* 0xE8-0xEF */
+	0x6ED5,0x6F6F,0x6F60,0x6F5F,0x719F,0x71AC,0x71B1,0x71A8,/* 0xF0-0xF7 */
+	0x7256,0x729B,0x734E,0x7357,0x7469,0x748B,0x7483,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x747E,0x7480,0x757F,0x7620,0x7629,0x761F,0x7624,0x7626,/* 0x40-0x47 */
+	0x7621,0x7622,0x769A,0x76BA,0x76E4,0x778E,0x7787,0x778C,/* 0x48-0x4F */
+	0x7791,0x778B,0x78CB,0x78C5,0x78BA,0x78CA,0x78BE,0x78D5,/* 0x50-0x57 */
+	0x78BC,0x78D0,0x7A3F,0x7A3C,0x7A40,0x7A3D,0x7A37,0x7A3B,/* 0x58-0x5F */
+	0x7AAF,0x7AAE,0x7BAD,0x7BB1,0x7BC4,0x7BB4,0x7BC6,0x7BC7,/* 0x60-0x67 */
+	0x7BC1,0x7BA0,0x7BCC,0x7CCA,0x7DE0,0x7DF4,0x7DEF,0x7DFB,/* 0x68-0x6F */
+	0x7DD8,0x7DEC,0x7DDD,0x7DE8,0x7DE3,0x7DDA,0x7DDE,0x7DE9,/* 0x70-0x77 */
+	0x7D9E,0x7DD9,0x7DF2,0x7DF9,0x7F75,0x7F77,0x7FAF,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7FE9,0x8026,0x819B,0x819C,0x819D,0x81A0,0x819A,/* 0xA0-0xA7 */
+	0x8198,0x8517,0x853D,0x851A,0x84EE,0x852C,0x852D,0x8513,/* 0xA8-0xAF */
+	0x8511,0x8523,0x8521,0x8514,0x84EC,0x8525,0x84FF,0x8506,/* 0xB0-0xB7 */
+	0x8782,0x8774,0x8776,0x8760,0x8766,0x8778,0x8768,0x8759,/* 0xB8-0xBF */
+	0x8757,0x874C,0x8753,0x885B,0x885D,0x8910,0x8907,0x8912,/* 0xC0-0xC7 */
+	0x8913,0x8915,0x890A,0x8ABC,0x8AD2,0x8AC7,0x8AC4,0x8A95,/* 0xC8-0xCF */
+	0x8ACB,0x8AF8,0x8AB2,0x8AC9,0x8AC2,0x8ABF,0x8AB0,0x8AD6,/* 0xD0-0xD7 */
+	0x8ACD,0x8AB6,0x8AB9,0x8ADB,0x8C4C,0x8C4E,0x8C6C,0x8CE0,/* 0xD8-0xDF */
+	0x8CDE,0x8CE6,0x8CE4,0x8CEC,0x8CED,0x8CE2,0x8CE3,0x8CDC,/* 0xE0-0xE7 */
+	0x8CEA,0x8CE1,0x8D6D,0x8D9F,0x8DA3,0x8E2B,0x8E10,0x8E1D,/* 0xE8-0xEF */
+	0x8E22,0x8E0F,0x8E29,0x8E1F,0x8E21,0x8E1E,0x8EBA,0x8F1D,/* 0xF0-0xF7 */
+	0x8F1B,0x8F1F,0x8F29,0x8F26,0x8F2A,0x8F1C,0x8F1E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8F25,0x9069,0x906E,0x9068,0x906D,0x9077,0x9130,0x912D,/* 0x40-0x47 */
+	0x9127,0x9131,0x9187,0x9189,0x918B,0x9183,0x92C5,0x92BB,/* 0x48-0x4F */
+	0x92B7,0x92EA,0x92AC,0x92E4,0x92C1,0x92B3,0x92BC,0x92D2,/* 0x50-0x57 */
+	0x92C7,0x92F0,0x92B2,0x95AD,0x95B1,0x9704,0x9706,0x9707,/* 0x58-0x5F */
+	0x9709,0x9760,0x978D,0x978B,0x978F,0x9821,0x982B,0x981C,/* 0x60-0x67 */
+	0x98B3,0x990A,0x9913,0x9912,0x9918,0x99DD,0x99D0,0x99DF,/* 0x68-0x6F */
+	0x99DB,0x99D1,0x99D5,0x99D2,0x99D9,0x9AB7,0x9AEE,0x9AEF,/* 0x70-0x77 */
+	0x9B27,0x9B45,0x9B44,0x9B77,0x9B6F,0x9D06,0x9D09,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9D03,0x9EA9,0x9EBE,0x9ECE,0x58A8,0x9F52,0x5112,/* 0xA0-0xA7 */
+	0x5118,0x5114,0x5110,0x5115,0x5180,0x51AA,0x51DD,0x5291,/* 0xA8-0xAF */
+	0x5293,0x52F3,0x5659,0x566B,0x5679,0x5669,0x5664,0x5678,/* 0xB0-0xB7 */
+	0x566A,0x5668,0x5665,0x5671,0x566F,0x566C,0x5662,0x5676,/* 0xB8-0xBF */
+	0x58C1,0x58BE,0x58C7,0x58C5,0x596E,0x5B1D,0x5B34,0x5B78,/* 0xC0-0xC7 */
+	0x5BF0,0x5C0E,0x5F4A,0x61B2,0x6191,0x61A9,0x618A,0x61CD,/* 0xC8-0xCF */
+	0x61B6,0x61BE,0x61CA,0x61C8,0x6230,0x64C5,0x64C1,0x64CB,/* 0xD0-0xD7 */
+	0x64BB,0x64BC,0x64DA,0x64C4,0x64C7,0x64C2,0x64CD,0x64BF,/* 0xD8-0xDF */
+	0x64D2,0x64D4,0x64BE,0x6574,0x66C6,0x66C9,0x66B9,0x66C4,/* 0xE0-0xE7 */
+	0x66C7,0x66B8,0x6A3D,0x6A38,0x6A3A,0x6A59,0x6A6B,0x6A58,/* 0xE8-0xEF */
+	0x6A39,0x6A44,0x6A62,0x6A61,0x6A4B,0x6A47,0x6A35,0x6A5F,/* 0xF0-0xF7 */
+	0x6A48,0x6B59,0x6B77,0x6C05,0x6FC2,0x6FB1,0x6FA1,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_BF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6FC3,0x6FA4,0x6FC1,0x6FA7,0x6FB3,0x6FC0,0x6FB9,0x6FB6,/* 0x40-0x47 */
+	0x6FA6,0x6FA0,0x6FB4,0x71BE,0x71C9,0x71D0,0x71D2,0x71C8,/* 0x48-0x4F */
+	0x71D5,0x71B9,0x71CE,0x71D9,0x71DC,0x71C3,0x71C4,0x7368,/* 0x50-0x57 */
+	0x749C,0x74A3,0x7498,0x749F,0x749E,0x74E2,0x750C,0x750D,/* 0x58-0x5F */
+	0x7634,0x7638,0x763A,0x76E7,0x76E5,0x77A0,0x779E,0x779F,/* 0x60-0x67 */
+	0x77A5,0x78E8,0x78DA,0x78EC,0x78E7,0x79A6,0x7A4D,0x7A4E,/* 0x68-0x6F */
+	0x7A46,0x7A4C,0x7A4B,0x7ABA,0x7BD9,0x7C11,0x7BC9,0x7BE4,/* 0x70-0x77 */
+	0x7BDB,0x7BE1,0x7BE9,0x7BE6,0x7CD5,0x7CD6,0x7E0A,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7E11,0x7E08,0x7E1B,0x7E23,0x7E1E,0x7E1D,0x7E09,/* 0xA0-0xA7 */
+	0x7E10,0x7F79,0x7FB2,0x7FF0,0x7FF1,0x7FEE,0x8028,0x81B3,/* 0xA8-0xAF */
+	0x81A9,0x81A8,0x81FB,0x8208,0x8258,0x8259,0x854A,0x8559,/* 0xB0-0xB7 */
+	0x8548,0x8568,0x8569,0x8543,0x8549,0x856D,0x856A,0x855E,/* 0xB8-0xBF */
+	0x8783,0x879F,0x879E,0x87A2,0x878D,0x8861,0x892A,0x8932,/* 0xC0-0xC7 */
+	0x8925,0x892B,0x8921,0x89AA,0x89A6,0x8AE6,0x8AFA,0x8AEB,/* 0xC8-0xCF */
+	0x8AF1,0x8B00,0x8ADC,0x8AE7,0x8AEE,0x8AFE,0x8B01,0x8B02,/* 0xD0-0xD7 */
+	0x8AF7,0x8AED,0x8AF3,0x8AF6,0x8AFC,0x8C6B,0x8C6D,0x8C93,/* 0xD8-0xDF */
+	0x8CF4,0x8E44,0x8E31,0x8E34,0x8E42,0x8E39,0x8E35,0x8F3B,/* 0xE0-0xE7 */
+	0x8F2F,0x8F38,0x8F33,0x8FA8,0x8FA6,0x9075,0x9074,0x9078,/* 0xE8-0xEF */
+	0x9072,0x907C,0x907A,0x9134,0x9192,0x9320,0x9336,0x92F8,/* 0xF0-0xF7 */
+	0x9333,0x932F,0x9322,0x92FC,0x932B,0x9304,0x931A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9310,0x9326,0x9321,0x9315,0x932E,0x9319,0x95BB,0x96A7,/* 0x40-0x47 */
+	0x96A8,0x96AA,0x96D5,0x970E,0x9711,0x9716,0x970D,0x9713,/* 0x48-0x4F */
+	0x970F,0x975B,0x975C,0x9766,0x9798,0x9830,0x9838,0x983B,/* 0x50-0x57 */
+	0x9837,0x982D,0x9839,0x9824,0x9910,0x9928,0x991E,0x991B,/* 0x58-0x5F */
+	0x9921,0x991A,0x99ED,0x99E2,0x99F1,0x9AB8,0x9ABC,0x9AFB,/* 0x60-0x67 */
+	0x9AED,0x9B28,0x9B91,0x9D15,0x9D23,0x9D26,0x9D28,0x9D12,/* 0x68-0x6F */
+	0x9D1B,0x9ED8,0x9ED4,0x9F8D,0x9F9C,0x512A,0x511F,0x5121,/* 0x70-0x77 */
+	0x5132,0x52F5,0x568E,0x5680,0x5690,0x5685,0x5687,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x568F,0x58D5,0x58D3,0x58D1,0x58CE,0x5B30,0x5B2A,/* 0xA0-0xA7 */
+	0x5B24,0x5B7A,0x5C37,0x5C68,0x5DBC,0x5DBA,0x5DBD,0x5DB8,/* 0xA8-0xAF */
+	0x5E6B,0x5F4C,0x5FBD,0x61C9,0x61C2,0x61C7,0x61E6,0x61CB,/* 0xB0-0xB7 */
+	0x6232,0x6234,0x64CE,0x64CA,0x64D8,0x64E0,0x64F0,0x64E6,/* 0xB8-0xBF */
+	0x64EC,0x64F1,0x64E2,0x64ED,0x6582,0x6583,0x66D9,0x66D6,/* 0xC0-0xC7 */
+	0x6A80,0x6A94,0x6A84,0x6AA2,0x6A9C,0x6ADB,0x6AA3,0x6A7E,/* 0xC8-0xCF */
+	0x6A97,0x6A90,0x6AA0,0x6B5C,0x6BAE,0x6BDA,0x6C08,0x6FD8,/* 0xD0-0xD7 */
+	0x6FF1,0x6FDF,0x6FE0,0x6FDB,0x6FE4,0x6FEB,0x6FEF,0x6F80,/* 0xD8-0xDF */
+	0x6FEC,0x6FE1,0x6FE9,0x6FD5,0x6FEE,0x6FF0,0x71E7,0x71DF,/* 0xE0-0xE7 */
+	0x71EE,0x71E6,0x71E5,0x71ED,0x71EC,0x71F4,0x71E0,0x7235,/* 0xE8-0xEF */
+	0x7246,0x7370,0x7372,0x74A9,0x74B0,0x74A6,0x74A8,0x7646,/* 0xF0-0xF7 */
+	0x7642,0x764C,0x76EA,0x77B3,0x77AA,0x77B0,0x77AC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x77A7,0x77AD,0x77EF,0x78F7,0x78FA,0x78F4,0x78EF,0x7901,/* 0x40-0x47 */
+	0x79A7,0x79AA,0x7A57,0x7ABF,0x7C07,0x7C0D,0x7BFE,0x7BF7,/* 0x48-0x4F */
+	0x7C0C,0x7BE0,0x7CE0,0x7CDC,0x7CDE,0x7CE2,0x7CDF,0x7CD9,/* 0x50-0x57 */
+	0x7CDD,0x7E2E,0x7E3E,0x7E46,0x7E37,0x7E32,0x7E43,0x7E2B,/* 0x58-0x5F */
+	0x7E3D,0x7E31,0x7E45,0x7E41,0x7E34,0x7E39,0x7E48,0x7E35,/* 0x60-0x67 */
+	0x7E3F,0x7E2F,0x7F44,0x7FF3,0x7FFC,0x8071,0x8072,0x8070,/* 0x68-0x6F */
+	0x806F,0x8073,0x81C6,0x81C3,0x81BA,0x81C2,0x81C0,0x81BF,/* 0x70-0x77 */
+	0x81BD,0x81C9,0x81BE,0x81E8,0x8209,0x8271,0x85AA,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8584,0x857E,0x859C,0x8591,0x8594,0x85AF,0x859B,/* 0xA0-0xA7 */
+	0x8587,0x85A8,0x858A,0x8667,0x87C0,0x87D1,0x87B3,0x87D2,/* 0xA8-0xAF */
+	0x87C6,0x87AB,0x87BB,0x87BA,0x87C8,0x87CB,0x893B,0x8936,/* 0xB0-0xB7 */
+	0x8944,0x8938,0x893D,0x89AC,0x8B0E,0x8B17,0x8B19,0x8B1B,/* 0xB8-0xBF */
+	0x8B0A,0x8B20,0x8B1D,0x8B04,0x8B10,0x8C41,0x8C3F,0x8C73,/* 0xC0-0xC7 */
+	0x8CFA,0x8CFD,0x8CFC,0x8CF8,0x8CFB,0x8DA8,0x8E49,0x8E4B,/* 0xC8-0xCF */
+	0x8E48,0x8E4A,0x8F44,0x8F3E,0x8F42,0x8F45,0x8F3F,0x907F,/* 0xD0-0xD7 */
+	0x907D,0x9084,0x9081,0x9082,0x9080,0x9139,0x91A3,0x919E,/* 0xD8-0xDF */
+	0x919C,0x934D,0x9382,0x9328,0x9375,0x934A,0x9365,0x934B,/* 0xE0-0xE7 */
+	0x9318,0x937E,0x936C,0x935B,0x9370,0x935A,0x9354,0x95CA,/* 0xE8-0xEF */
+	0x95CB,0x95CC,0x95C8,0x95C6,0x96B1,0x96B8,0x96D6,0x971C,/* 0xF0-0xF7 */
+	0x971E,0x97A0,0x97D3,0x9846,0x98B6,0x9935,0x9A01,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x99FF,0x9BAE,0x9BAB,0x9BAA,0x9BAD,0x9D3B,0x9D3F,0x9E8B,/* 0x40-0x47 */
+	0x9ECF,0x9EDE,0x9EDC,0x9EDD,0x9EDB,0x9F3E,0x9F4B,0x53E2,/* 0x48-0x4F */
+	0x5695,0x56AE,0x58D9,0x58D8,0x5B38,0x5F5D,0x61E3,0x6233,/* 0x50-0x57 */
+	0x64F4,0x64F2,0x64FE,0x6506,0x64FA,0x64FB,0x64F7,0x65B7,/* 0x58-0x5F */
+	0x66DC,0x6726,0x6AB3,0x6AAC,0x6AC3,0x6ABB,0x6AB8,0x6AC2,/* 0x60-0x67 */
+	0x6AAE,0x6AAF,0x6B5F,0x6B78,0x6BAF,0x7009,0x700B,0x6FFE,/* 0x68-0x6F */
+	0x7006,0x6FFA,0x7011,0x700F,0x71FB,0x71FC,0x71FE,0x71F8,/* 0x70-0x77 */
+	0x7377,0x7375,0x74A7,0x74BF,0x7515,0x7656,0x7658,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7652,0x77BD,0x77BF,0x77BB,0x77BC,0x790E,0x79AE,/* 0xA0-0xA7 */
+	0x7A61,0x7A62,0x7A60,0x7AC4,0x7AC5,0x7C2B,0x7C27,0x7C2A,/* 0xA8-0xAF */
+	0x7C1E,0x7C23,0x7C21,0x7CE7,0x7E54,0x7E55,0x7E5E,0x7E5A,/* 0xB0-0xB7 */
+	0x7E61,0x7E52,0x7E59,0x7F48,0x7FF9,0x7FFB,0x8077,0x8076,/* 0xB8-0xBF */
+	0x81CD,0x81CF,0x820A,0x85CF,0x85A9,0x85CD,0x85D0,0x85C9,/* 0xC0-0xC7 */
+	0x85B0,0x85BA,0x85B9,0x85A6,0x87EF,0x87EC,0x87F2,0x87E0,/* 0xC8-0xCF */
+	0x8986,0x89B2,0x89F4,0x8B28,0x8B39,0x8B2C,0x8B2B,0x8C50,/* 0xD0-0xD7 */
+	0x8D05,0x8E59,0x8E63,0x8E66,0x8E64,0x8E5F,0x8E55,0x8EC0,/* 0xD8-0xDF */
+	0x8F49,0x8F4D,0x9087,0x9083,0x9088,0x91AB,0x91AC,0x91D0,/* 0xE0-0xE7 */
+	0x9394,0x938A,0x9396,0x93A2,0x93B3,0x93AE,0x93AC,0x93B0,/* 0xE8-0xEF */
+	0x9398,0x939A,0x9397,0x95D4,0x95D6,0x95D0,0x95D5,0x96E2,/* 0xF0-0xF7 */
+	0x96DC,0x96D9,0x96DB,0x96DE,0x9724,0x97A3,0x97A6,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x97AD,0x97F9,0x984D,0x984F,0x984C,0x984E,0x9853,0x98BA,/* 0x40-0x47 */
+	0x993E,0x993F,0x993D,0x992E,0x99A5,0x9A0E,0x9AC1,0x9B03,/* 0x48-0x4F */
+	0x9B06,0x9B4F,0x9B4E,0x9B4D,0x9BCA,0x9BC9,0x9BFD,0x9BC8,/* 0x50-0x57 */
+	0x9BC0,0x9D51,0x9D5D,0x9D60,0x9EE0,0x9F15,0x9F2C,0x5133,/* 0x58-0x5F */
+	0x56A5,0x58DE,0x58DF,0x58E2,0x5BF5,0x9F90,0x5EEC,0x61F2,/* 0x60-0x67 */
+	0x61F7,0x61F6,0x61F5,0x6500,0x650F,0x66E0,0x66DD,0x6AE5,/* 0x68-0x6F */
+	0x6ADD,0x6ADA,0x6AD3,0x701B,0x701F,0x7028,0x701A,0x701D,/* 0x70-0x77 */
+	0x7015,0x7018,0x7206,0x720D,0x7258,0x72A2,0x7378,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x737A,0x74BD,0x74CA,0x74E3,0x7587,0x7586,0x765F,/* 0xA0-0xA7 */
+	0x7661,0x77C7,0x7919,0x79B1,0x7A6B,0x7A69,0x7C3E,0x7C3F,/* 0xA8-0xAF */
+	0x7C38,0x7C3D,0x7C37,0x7C40,0x7E6B,0x7E6D,0x7E79,0x7E69,/* 0xB0-0xB7 */
+	0x7E6A,0x7F85,0x7E73,0x7FB6,0x7FB9,0x7FB8,0x81D8,0x85E9,/* 0xB8-0xBF */
+	0x85DD,0x85EA,0x85D5,0x85E4,0x85E5,0x85F7,0x87FB,0x8805,/* 0xC0-0xC7 */
+	0x880D,0x87F9,0x87FE,0x8960,0x895F,0x8956,0x895E,0x8B41,/* 0xC8-0xCF */
+	0x8B5C,0x8B58,0x8B49,0x8B5A,0x8B4E,0x8B4F,0x8B46,0x8B59,/* 0xD0-0xD7 */
+	0x8D08,0x8D0A,0x8E7C,0x8E72,0x8E87,0x8E76,0x8E6C,0x8E7A,/* 0xD8-0xDF */
+	0x8E74,0x8F54,0x8F4E,0x8FAD,0x908A,0x908B,0x91B1,0x91AE,/* 0xE0-0xE7 */
+	0x93E1,0x93D1,0x93DF,0x93C3,0x93C8,0x93DC,0x93DD,0x93D6,/* 0xE8-0xEF */
+	0x93E2,0x93CD,0x93D8,0x93E4,0x93D7,0x93E8,0x95DC,0x96B4,/* 0xF0-0xF7 */
+	0x96E3,0x972A,0x9727,0x9761,0x97DC,0x97FB,0x985E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x9858,0x985B,0x98BC,0x9945,0x9949,0x9A16,0x9A19,0x9B0D,/* 0x40-0x47 */
+	0x9BE8,0x9BE7,0x9BD6,0x9BDB,0x9D89,0x9D61,0x9D72,0x9D6A,/* 0x48-0x4F */
+	0x9D6C,0x9E92,0x9E97,0x9E93,0x9EB4,0x52F8,0x56A8,0x56B7,/* 0x50-0x57 */
+	0x56B6,0x56B4,0x56BC,0x58E4,0x5B40,0x5B43,0x5B7D,0x5BF6,/* 0x58-0x5F */
+	0x5DC9,0x61F8,0x61FA,0x6518,0x6514,0x6519,0x66E6,0x6727,/* 0x60-0x67 */
+	0x6AEC,0x703E,0x7030,0x7032,0x7210,0x737B,0x74CF,0x7662,/* 0x68-0x6F */
+	0x7665,0x7926,0x792A,0x792C,0x792B,0x7AC7,0x7AF6,0x7C4C,/* 0x70-0x77 */
+	0x7C43,0x7C4D,0x7CEF,0x7CF0,0x8FAE,0x7E7D,0x7E7C,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7E82,0x7F4C,0x8000,0x81DA,0x8266,0x85FB,0x85F9,/* 0xA0-0xA7 */
+	0x8611,0x85FA,0x8606,0x860B,0x8607,0x860A,0x8814,0x8815,/* 0xA8-0xAF */
+	0x8964,0x89BA,0x89F8,0x8B70,0x8B6C,0x8B66,0x8B6F,0x8B5F,/* 0xB0-0xB7 */
+	0x8B6B,0x8D0F,0x8D0D,0x8E89,0x8E81,0x8E85,0x8E82,0x91B4,/* 0xB8-0xBF */
+	0x91CB,0x9418,0x9403,0x93FD,0x95E1,0x9730,0x98C4,0x9952,/* 0xC0-0xC7 */
+	0x9951,0x99A8,0x9A2B,0x9A30,0x9A37,0x9A35,0x9C13,0x9C0D,/* 0xC8-0xCF */
+	0x9E79,0x9EB5,0x9EE8,0x9F2F,0x9F5F,0x9F63,0x9F61,0x5137,/* 0xD0-0xD7 */
+	0x5138,0x56C1,0x56C0,0x56C2,0x5914,0x5C6C,0x5DCD,0x61FC,/* 0xD8-0xDF */
+	0x61FE,0x651D,0x651C,0x6595,0x66E9,0x6AFB,0x6B04,0x6AFA,/* 0xE0-0xE7 */
+	0x6BB2,0x704C,0x721B,0x72A7,0x74D6,0x74D4,0x7669,0x77D3,/* 0xE8-0xEF */
+	0x7C50,0x7E8F,0x7E8C,0x7FBC,0x8617,0x862D,0x861A,0x8823,/* 0xF0-0xF7 */
+	0x8822,0x8821,0x881F,0x896A,0x896C,0x89BD,0x8B74,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8B77,0x8B7D,0x8D13,0x8E8A,0x8E8D,0x8E8B,0x8F5F,0x8FAF,/* 0x40-0x47 */
+	0x91BA,0x942E,0x9433,0x9435,0x943A,0x9438,0x9432,0x942B,/* 0x48-0x4F */
+	0x95E2,0x9738,0x9739,0x9732,0x97FF,0x9867,0x9865,0x9957,/* 0x50-0x57 */
+	0x9A45,0x9A43,0x9A40,0x9A3E,0x9ACF,0x9B54,0x9B51,0x9C2D,/* 0x58-0x5F */
+	0x9C25,0x9DAF,0x9DB4,0x9DC2,0x9DB8,0x9E9D,0x9EEF,0x9F19,/* 0x60-0x67 */
+	0x9F5C,0x9F66,0x9F67,0x513C,0x513B,0x56C8,0x56CA,0x56C9,/* 0x68-0x6F */
+	0x5B7F,0x5DD4,0x5DD2,0x5F4E,0x61FF,0x6524,0x6B0A,0x6B61,/* 0x70-0x77 */
+	0x7051,0x7058,0x7380,0x74E4,0x758A,0x766E,0x766C,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x79B3,0x7C60,0x7C5F,0x807E,0x807D,0x81DF,0x8972,/* 0xA0-0xA7 */
+	0x896F,0x89FC,0x8B80,0x8D16,0x8D17,0x8E91,0x8E93,0x8F61,/* 0xA8-0xAF */
+	0x9148,0x9444,0x9451,0x9452,0x973D,0x973E,0x97C3,0x97C1,/* 0xB0-0xB7 */
+	0x986B,0x9955,0x9A55,0x9A4D,0x9AD2,0x9B1A,0x9C49,0x9C31,/* 0xB8-0xBF */
+	0x9C3E,0x9C3B,0x9DD3,0x9DD7,0x9F34,0x9F6C,0x9F6A,0x9F94,/* 0xC0-0xC7 */
+	0x56CC,0x5DD6,0x6200,0x6523,0x652B,0x652A,0x66EC,0x6B10,/* 0xC8-0xCF */
+	0x74DA,0x7ACA,0x7C64,0x7C63,0x7C65,0x7E93,0x7E96,0x7E94,/* 0xD0-0xD7 */
+	0x81E2,0x8638,0x863F,0x8831,0x8B8A,0x9090,0x908F,0x9463,/* 0xD8-0xDF */
+	0x9460,0x9464,0x9768,0x986F,0x995C,0x9A5A,0x9A5B,0x9A57,/* 0xE0-0xE7 */
+	0x9AD3,0x9AD4,0x9AD1,0x9C54,0x9C57,0x9C56,0x9DE5,0x9E9F,/* 0xE8-0xEF */
+	0x9EF4,0x56D1,0x58E9,0x652C,0x705E,0x7671,0x7672,0x77D7,/* 0xF0-0xF7 */
+	0x7F50,0x7F88,0x8836,0x8839,0x8862,0x8B93,0x8B92,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_C6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8B96,0x8277,0x8D1B,0x91C0,0x946A,0x9742,0x9748,0x9744,/* 0x40-0x47 */
+	0x97C6,0x9870,0x9A5F,0x9B22,0x9B58,0x9C5F,0x9DF9,0x9DFA,/* 0x48-0x4F */
+	0x9E7C,0x9E7D,0x9F07,0x9F77,0x9F72,0x5EF3,0x6B16,0x7063,/* 0x50-0x57 */
+	0x7C6C,0x7C6E,0x883B,0x89C0,0x8EA1,0x91C1,0x9472,0x9470,/* 0x58-0x5F */
+	0x9871,0x995E,0x9AD6,0x9B23,0x9ECC,0x7064,0x77DA,0x8B9A,/* 0x60-0x67 */
+	0x9477,0x97C9,0x9A62,0x9A65,0x7E9C,0x8B9C,0x8EAA,0x91C5,/* 0x68-0x6F */
+	0x947D,0x947E,0x947C,0x9C77,0x9C78,0x9EF7,0x8C54,0x947F,/* 0x70-0x77 */
+	0x9E1A,0x7228,0x9A6A,0x9B31,0x9E1B,0x9E1E,0x7C72,0x0000,/* 0x78-0x7F */
+};
+
+static const wchar_t c2u_C9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x4E42,0x4E5C,0x51F5,0x531A,0x5382,0x4E07,0x4E0C,0x4E47,/* 0x40-0x47 */
+	0x4E8D,0x56D7,0xFA0C,0x5C6E,0x5F73,0x4E0F,0x5187,0x4E0E,/* 0x48-0x4F */
+	0x4E2E,0x4E93,0x4EC2,0x4EC9,0x4EC8,0x5198,0x52FC,0x536C,/* 0x50-0x57 */
+	0x53B9,0x5720,0x5903,0x592C,0x5C10,0x5DFF,0x65E1,0x6BB3,/* 0x58-0x5F */
+	0x6BCC,0x6C14,0x723F,0x4E31,0x4E3C,0x4EE8,0x4EDC,0x4EE9,/* 0x60-0x67 */
+	0x4EE1,0x4EDD,0x4EDA,0x520C,0x531C,0x534C,0x5722,0x5723,/* 0x68-0x6F */
+	0x5917,0x592F,0x5B81,0x5B84,0x5C12,0x5C3B,0x5C74,0x5C73,/* 0x70-0x77 */
+	0x5E04,0x5E80,0x5E82,0x5FC9,0x6209,0x6250,0x6C15,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6C36,0x6C43,0x6C3F,0x6C3B,0x72AE,0x72B0,0x738A,/* 0xA0-0xA7 */
+	0x79B8,0x808A,0x961E,0x4F0E,0x4F18,0x4F2C,0x4EF5,0x4F14,/* 0xA8-0xAF */
+	0x4EF1,0x4F00,0x4EF7,0x4F08,0x4F1D,0x4F02,0x4F05,0x4F22,/* 0xB0-0xB7 */
+	0x4F13,0x4F04,0x4EF4,0x4F12,0x51B1,0x5213,0x5209,0x5210,/* 0xB8-0xBF */
+	0x52A6,0x5322,0x531F,0x534D,0x538A,0x5407,0x56E1,0x56DF,/* 0xC0-0xC7 */
+	0x572E,0x572A,0x5734,0x593C,0x5980,0x597C,0x5985,0x597B,/* 0xC8-0xCF */
+	0x597E,0x5977,0x597F,0x5B56,0x5C15,0x5C25,0x5C7C,0x5C7A,/* 0xD0-0xD7 */
+	0x5C7B,0x5C7E,0x5DDF,0x5E75,0x5E84,0x5F02,0x5F1A,0x5F74,/* 0xD8-0xDF */
+	0x5FD5,0x5FD4,0x5FCF,0x625C,0x625E,0x6264,0x6261,0x6266,/* 0xE0-0xE7 */
+	0x6262,0x6259,0x6260,0x625A,0x6265,0x65EF,0x65EE,0x673E,/* 0xE8-0xEF */
+	0x6739,0x6738,0x673B,0x673A,0x673F,0x673C,0x6733,0x6C18,/* 0xF0-0xF7 */
+	0x6C46,0x6C52,0x6C5C,0x6C4F,0x6C4A,0x6C54,0x6C4B,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6C4C,0x7071,0x725E,0x72B4,0x72B5,0x738E,0x752A,0x767F,/* 0x40-0x47 */
+	0x7A75,0x7F51,0x8278,0x827C,0x8280,0x827D,0x827F,0x864D,/* 0x48-0x4F */
+	0x897E,0x9099,0x9097,0x9098,0x909B,0x9094,0x9622,0x9624,/* 0x50-0x57 */
+	0x9620,0x9623,0x4F56,0x4F3B,0x4F62,0x4F49,0x4F53,0x4F64,/* 0x58-0x5F */
+	0x4F3E,0x4F67,0x4F52,0x4F5F,0x4F41,0x4F58,0x4F2D,0x4F33,/* 0x60-0x67 */
+	0x4F3F,0x4F61,0x518F,0x51B9,0x521C,0x521E,0x5221,0x52AD,/* 0x68-0x6F */
+	0x52AE,0x5309,0x5363,0x5372,0x538E,0x538F,0x5430,0x5437,/* 0x70-0x77 */
+	0x542A,0x5454,0x5445,0x5419,0x541C,0x5425,0x5418,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x543D,0x544F,0x5441,0x5428,0x5424,0x5447,0x56EE,/* 0xA0-0xA7 */
+	0x56E7,0x56E5,0x5741,0x5745,0x574C,0x5749,0x574B,0x5752,/* 0xA8-0xAF */
+	0x5906,0x5940,0x59A6,0x5998,0x59A0,0x5997,0x598E,0x59A2,/* 0xB0-0xB7 */
+	0x5990,0x598F,0x59A7,0x59A1,0x5B8E,0x5B92,0x5C28,0x5C2A,/* 0xB8-0xBF */
+	0x5C8D,0x5C8F,0x5C88,0x5C8B,0x5C89,0x5C92,0x5C8A,0x5C86,/* 0xC0-0xC7 */
+	0x5C93,0x5C95,0x5DE0,0x5E0A,0x5E0E,0x5E8B,0x5E89,0x5E8C,/* 0xC8-0xCF */
+	0x5E88,0x5E8D,0x5F05,0x5F1D,0x5F78,0x5F76,0x5FD2,0x5FD1,/* 0xD0-0xD7 */
+	0x5FD0,0x5FED,0x5FE8,0x5FEE,0x5FF3,0x5FE1,0x5FE4,0x5FE3,/* 0xD8-0xDF */
+	0x5FFA,0x5FEF,0x5FF7,0x5FFB,0x6000,0x5FF4,0x623A,0x6283,/* 0xE0-0xE7 */
+	0x628C,0x628E,0x628F,0x6294,0x6287,0x6271,0x627B,0x627A,/* 0xE8-0xEF */
+	0x6270,0x6281,0x6288,0x6277,0x627D,0x6272,0x6274,0x6537,/* 0xF0-0xF7 */
+	0x65F0,0x65F4,0x65F3,0x65F2,0x65F5,0x6745,0x6747,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6759,0x6755,0x674C,0x6748,0x675D,0x674D,0x675A,0x674B,/* 0x40-0x47 */
+	0x6BD0,0x6C19,0x6C1A,0x6C78,0x6C67,0x6C6B,0x6C84,0x6C8B,/* 0x48-0x4F */
+	0x6C8F,0x6C71,0x6C6F,0x6C69,0x6C9A,0x6C6D,0x6C87,0x6C95,/* 0x50-0x57 */
+	0x6C9C,0x6C66,0x6C73,0x6C65,0x6C7B,0x6C8E,0x7074,0x707A,/* 0x58-0x5F */
+	0x7263,0x72BF,0x72BD,0x72C3,0x72C6,0x72C1,0x72BA,0x72C5,/* 0x60-0x67 */
+	0x7395,0x7397,0x7393,0x7394,0x7392,0x753A,0x7539,0x7594,/* 0x68-0x6F */
+	0x7595,0x7681,0x793D,0x8034,0x8095,0x8099,0x8090,0x8092,/* 0x70-0x77 */
+	0x809C,0x8290,0x828F,0x8285,0x828E,0x8291,0x8293,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x828A,0x8283,0x8284,0x8C78,0x8FC9,0x8FBF,0x909F,/* 0xA0-0xA7 */
+	0x90A1,0x90A5,0x909E,0x90A7,0x90A0,0x9630,0x9628,0x962F,/* 0xA8-0xAF */
+	0x962D,0x4E33,0x4F98,0x4F7C,0x4F85,0x4F7D,0x4F80,0x4F87,/* 0xB0-0xB7 */
+	0x4F76,0x4F74,0x4F89,0x4F84,0x4F77,0x4F4C,0x4F97,0x4F6A,/* 0xB8-0xBF */
+	0x4F9A,0x4F79,0x4F81,0x4F78,0x4F90,0x4F9C,0x4F94,0x4F9E,/* 0xC0-0xC7 */
+	0x4F92,0x4F82,0x4F95,0x4F6B,0x4F6E,0x519E,0x51BC,0x51BE,/* 0xC8-0xCF */
+	0x5235,0x5232,0x5233,0x5246,0x5231,0x52BC,0x530A,0x530B,/* 0xD0-0xD7 */
+	0x533C,0x5392,0x5394,0x5487,0x547F,0x5481,0x5491,0x5482,/* 0xD8-0xDF */
+	0x5488,0x546B,0x547A,0x547E,0x5465,0x546C,0x5474,0x5466,/* 0xE0-0xE7 */
+	0x548D,0x546F,0x5461,0x5460,0x5498,0x5463,0x5467,0x5464,/* 0xE8-0xEF */
+	0x56F7,0x56F9,0x576F,0x5772,0x576D,0x576B,0x5771,0x5770,/* 0xF0-0xF7 */
+	0x5776,0x5780,0x5775,0x577B,0x5773,0x5774,0x5762,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5768,0x577D,0x590C,0x5945,0x59B5,0x59BA,0x59CF,0x59CE,/* 0x40-0x47 */
+	0x59B2,0x59CC,0x59C1,0x59B6,0x59BC,0x59C3,0x59D6,0x59B1,/* 0x48-0x4F */
+	0x59BD,0x59C0,0x59C8,0x59B4,0x59C7,0x5B62,0x5B65,0x5B93,/* 0x50-0x57 */
+	0x5B95,0x5C44,0x5C47,0x5CAE,0x5CA4,0x5CA0,0x5CB5,0x5CAF,/* 0x58-0x5F */
+	0x5CA8,0x5CAC,0x5C9F,0x5CA3,0x5CAD,0x5CA2,0x5CAA,0x5CA7,/* 0x60-0x67 */
+	0x5C9D,0x5CA5,0x5CB6,0x5CB0,0x5CA6,0x5E17,0x5E14,0x5E19,/* 0x68-0x6F */
+	0x5F28,0x5F22,0x5F23,0x5F24,0x5F54,0x5F82,0x5F7E,0x5F7D,/* 0x70-0x77 */
+	0x5FDE,0x5FE5,0x602D,0x6026,0x6019,0x6032,0x600B,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6034,0x600A,0x6017,0x6033,0x601A,0x601E,0x602C,/* 0xA0-0xA7 */
+	0x6022,0x600D,0x6010,0x602E,0x6013,0x6011,0x600C,0x6009,/* 0xA8-0xAF */
+	0x601C,0x6214,0x623D,0x62AD,0x62B4,0x62D1,0x62BE,0x62AA,/* 0xB0-0xB7 */
+	0x62B6,0x62CA,0x62AE,0x62B3,0x62AF,0x62BB,0x62A9,0x62B0,/* 0xB8-0xBF */
+	0x62B8,0x653D,0x65A8,0x65BB,0x6609,0x65FC,0x6604,0x6612,/* 0xC0-0xC7 */
+	0x6608,0x65FB,0x6603,0x660B,0x660D,0x6605,0x65FD,0x6611,/* 0xC8-0xCF */
+	0x6610,0x66F6,0x670A,0x6785,0x676C,0x678E,0x6792,0x6776,/* 0xD0-0xD7 */
+	0x677B,0x6798,0x6786,0x6784,0x6774,0x678D,0x678C,0x677A,/* 0xD8-0xDF */
+	0x679F,0x6791,0x6799,0x6783,0x677D,0x6781,0x6778,0x6779,/* 0xE0-0xE7 */
+	0x6794,0x6B25,0x6B80,0x6B7E,0x6BDE,0x6C1D,0x6C93,0x6CEC,/* 0xE8-0xEF */
+	0x6CEB,0x6CEE,0x6CD9,0x6CB6,0x6CD4,0x6CAD,0x6CE7,0x6CB7,/* 0xF0-0xF7 */
+	0x6CD0,0x6CC2,0x6CBA,0x6CC3,0x6CC6,0x6CED,0x6CF2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6CD2,0x6CDD,0x6CB4,0x6C8A,0x6C9D,0x6C80,0x6CDE,0x6CC0,/* 0x40-0x47 */
+	0x6D30,0x6CCD,0x6CC7,0x6CB0,0x6CF9,0x6CCF,0x6CE9,0x6CD1,/* 0x48-0x4F */
+	0x7094,0x7098,0x7085,0x7093,0x7086,0x7084,0x7091,0x7096,/* 0x50-0x57 */
+	0x7082,0x709A,0x7083,0x726A,0x72D6,0x72CB,0x72D8,0x72C9,/* 0x58-0x5F */
+	0x72DC,0x72D2,0x72D4,0x72DA,0x72CC,0x72D1,0x73A4,0x73A1,/* 0x60-0x67 */
+	0x73AD,0x73A6,0x73A2,0x73A0,0x73AC,0x739D,0x74DD,0x74E8,/* 0x68-0x6F */
+	0x753F,0x7540,0x753E,0x758C,0x7598,0x76AF,0x76F3,0x76F1,/* 0x70-0x77 */
+	0x76F0,0x76F5,0x77F8,0x77FC,0x77F9,0x77FB,0x77FA,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x77F7,0x7942,0x793F,0x79C5,0x7A78,0x7A7B,0x7AFB,/* 0xA0-0xA7 */
+	0x7C75,0x7CFD,0x8035,0x808F,0x80AE,0x80A3,0x80B8,0x80B5,/* 0xA8-0xAF */
+	0x80AD,0x8220,0x82A0,0x82C0,0x82AB,0x829A,0x8298,0x829B,/* 0xB0-0xB7 */
+	0x82B5,0x82A7,0x82AE,0x82BC,0x829E,0x82BA,0x82B4,0x82A8,/* 0xB8-0xBF */
+	0x82A1,0x82A9,0x82C2,0x82A4,0x82C3,0x82B6,0x82A2,0x8670,/* 0xC0-0xC7 */
+	0x866F,0x866D,0x866E,0x8C56,0x8FD2,0x8FCB,0x8FD3,0x8FCD,/* 0xC8-0xCF */
+	0x8FD6,0x8FD5,0x8FD7,0x90B2,0x90B4,0x90AF,0x90B3,0x90B0,/* 0xD0-0xD7 */
+	0x9639,0x963D,0x963C,0x963A,0x9643,0x4FCD,0x4FC5,0x4FD3,/* 0xD8-0xDF */
+	0x4FB2,0x4FC9,0x4FCB,0x4FC1,0x4FD4,0x4FDC,0x4FD9,0x4FBB,/* 0xE0-0xE7 */
+	0x4FB3,0x4FDB,0x4FC7,0x4FD6,0x4FBA,0x4FC0,0x4FB9,0x4FEC,/* 0xE8-0xEF */
+	0x5244,0x5249,0x52C0,0x52C2,0x533D,0x537C,0x5397,0x5396,/* 0xF0-0xF7 */
+	0x5399,0x5398,0x54BA,0x54A1,0x54AD,0x54A5,0x54CF,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x54C3,0x830D,0x54B7,0x54AE,0x54D6,0x54B6,0x54C5,0x54C6,/* 0x40-0x47 */
+	0x54A0,0x5470,0x54BC,0x54A2,0x54BE,0x5472,0x54DE,0x54B0,/* 0x48-0x4F */
+	0x57B5,0x579E,0x579F,0x57A4,0x578C,0x5797,0x579D,0x579B,/* 0x50-0x57 */
+	0x5794,0x5798,0x578F,0x5799,0x57A5,0x579A,0x5795,0x58F4,/* 0x58-0x5F */
+	0x590D,0x5953,0x59E1,0x59DE,0x59EE,0x5A00,0x59F1,0x59DD,/* 0x60-0x67 */
+	0x59FA,0x59FD,0x59FC,0x59F6,0x59E4,0x59F2,0x59F7,0x59DB,/* 0x68-0x6F */
+	0x59E9,0x59F3,0x59F5,0x59E0,0x59FE,0x59F4,0x59ED,0x5BA8,/* 0x70-0x77 */
+	0x5C4C,0x5CD0,0x5CD8,0x5CCC,0x5CD7,0x5CCB,0x5CDB,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5CDE,0x5CDA,0x5CC9,0x5CC7,0x5CCA,0x5CD6,0x5CD3,/* 0xA0-0xA7 */
+	0x5CD4,0x5CCF,0x5CC8,0x5CC6,0x5CCE,0x5CDF,0x5CF8,0x5DF9,/* 0xA8-0xAF */
+	0x5E21,0x5E22,0x5E23,0x5E20,0x5E24,0x5EB0,0x5EA4,0x5EA2,/* 0xB0-0xB7 */
+	0x5E9B,0x5EA3,0x5EA5,0x5F07,0x5F2E,0x5F56,0x5F86,0x6037,/* 0xB8-0xBF */
+	0x6039,0x6054,0x6072,0x605E,0x6045,0x6053,0x6047,0x6049,/* 0xC0-0xC7 */
+	0x605B,0x604C,0x6040,0x6042,0x605F,0x6024,0x6044,0x6058,/* 0xC8-0xCF */
+	0x6066,0x606E,0x6242,0x6243,0x62CF,0x630D,0x630B,0x62F5,/* 0xD0-0xD7 */
+	0x630E,0x6303,0x62EB,0x62F9,0x630F,0x630C,0x62F8,0x62F6,/* 0xD8-0xDF */
+	0x6300,0x6313,0x6314,0x62FA,0x6315,0x62FB,0x62F0,0x6541,/* 0xE0-0xE7 */
+	0x6543,0x65AA,0x65BF,0x6636,0x6621,0x6632,0x6635,0x661C,/* 0xE8-0xEF */
+	0x6626,0x6622,0x6633,0x662B,0x663A,0x661D,0x6634,0x6639,/* 0xF0-0xF7 */
+	0x662E,0x670F,0x6710,0x67C1,0x67F2,0x67C8,0x67BA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_CF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x67DC,0x67BB,0x67F8,0x67D8,0x67C0,0x67B7,0x67C5,0x67EB,/* 0x40-0x47 */
+	0x67E4,0x67DF,0x67B5,0x67CD,0x67B3,0x67F7,0x67F6,0x67EE,/* 0x48-0x4F */
+	0x67E3,0x67C2,0x67B9,0x67CE,0x67E7,0x67F0,0x67B2,0x67FC,/* 0x50-0x57 */
+	0x67C6,0x67ED,0x67CC,0x67AE,0x67E6,0x67DB,0x67FA,0x67C9,/* 0x58-0x5F */
+	0x67CA,0x67C3,0x67EA,0x67CB,0x6B28,0x6B82,0x6B84,0x6BB6,/* 0x60-0x67 */
+	0x6BD6,0x6BD8,0x6BE0,0x6C20,0x6C21,0x6D28,0x6D34,0x6D2D,/* 0x68-0x6F */
+	0x6D1F,0x6D3C,0x6D3F,0x6D12,0x6D0A,0x6CDA,0x6D33,0x6D04,/* 0x70-0x77 */
+	0x6D19,0x6D3A,0x6D1A,0x6D11,0x6D00,0x6D1D,0x6D42,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6D01,0x6D18,0x6D37,0x6D03,0x6D0F,0x6D40,0x6D07,/* 0xA0-0xA7 */
+	0x6D20,0x6D2C,0x6D08,0x6D22,0x6D09,0x6D10,0x70B7,0x709F,/* 0xA8-0xAF */
+	0x70BE,0x70B1,0x70B0,0x70A1,0x70B4,0x70B5,0x70A9,0x7241,/* 0xB0-0xB7 */
+	0x7249,0x724A,0x726C,0x7270,0x7273,0x726E,0x72CA,0x72E4,/* 0xB8-0xBF */
+	0x72E8,0x72EB,0x72DF,0x72EA,0x72E6,0x72E3,0x7385,0x73CC,/* 0xC0-0xC7 */
+	0x73C2,0x73C8,0x73C5,0x73B9,0x73B6,0x73B5,0x73B4,0x73EB,/* 0xC8-0xCF */
+	0x73BF,0x73C7,0x73BE,0x73C3,0x73C6,0x73B8,0x73CB,0x74EC,/* 0xD0-0xD7 */
+	0x74EE,0x752E,0x7547,0x7548,0x75A7,0x75AA,0x7679,0x76C4,/* 0xD8-0xDF */
+	0x7708,0x7703,0x7704,0x7705,0x770A,0x76F7,0x76FB,0x76FA,/* 0xE0-0xE7 */
+	0x77E7,0x77E8,0x7806,0x7811,0x7812,0x7805,0x7810,0x780F,/* 0xE8-0xEF */
+	0x780E,0x7809,0x7803,0x7813,0x794A,0x794C,0x794B,0x7945,/* 0xF0-0xF7 */
+	0x7944,0x79D5,0x79CD,0x79CF,0x79D6,0x79CE,0x7A80,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7A7E,0x7AD1,0x7B00,0x7B01,0x7C7A,0x7C78,0x7C79,0x7C7F,/* 0x40-0x47 */
+	0x7C80,0x7C81,0x7D03,0x7D08,0x7D01,0x7F58,0x7F91,0x7F8D,/* 0x48-0x4F */
+	0x7FBE,0x8007,0x800E,0x800F,0x8014,0x8037,0x80D8,0x80C7,/* 0x50-0x57 */
+	0x80E0,0x80D1,0x80C8,0x80C2,0x80D0,0x80C5,0x80E3,0x80D9,/* 0x58-0x5F */
+	0x80DC,0x80CA,0x80D5,0x80C9,0x80CF,0x80D7,0x80E6,0x80CD,/* 0x60-0x67 */
+	0x81FF,0x8221,0x8294,0x82D9,0x82FE,0x82F9,0x8307,0x82E8,/* 0x68-0x6F */
+	0x8300,0x82D5,0x833A,0x82EB,0x82D6,0x82F4,0x82EC,0x82E1,/* 0x70-0x77 */
+	0x82F2,0x82F5,0x830C,0x82FB,0x82F6,0x82F0,0x82EA,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x82E4,0x82E0,0x82FA,0x82F3,0x82ED,0x8677,0x8674,/* 0xA0-0xA7 */
+	0x867C,0x8673,0x8841,0x884E,0x8867,0x886A,0x8869,0x89D3,/* 0xA8-0xAF */
+	0x8A04,0x8A07,0x8D72,0x8FE3,0x8FE1,0x8FEE,0x8FE0,0x90F1,/* 0xB0-0xB7 */
+	0x90BD,0x90BF,0x90D5,0x90C5,0x90BE,0x90C7,0x90CB,0x90C8,/* 0xB8-0xBF */
+	0x91D4,0x91D3,0x9654,0x964F,0x9651,0x9653,0x964A,0x964E,/* 0xC0-0xC7 */
+	0x501E,0x5005,0x5007,0x5013,0x5022,0x5030,0x501B,0x4FF5,/* 0xC8-0xCF */
+	0x4FF4,0x5033,0x5037,0x502C,0x4FF6,0x4FF7,0x5017,0x501C,/* 0xD0-0xD7 */
+	0x5020,0x5027,0x5035,0x502F,0x5031,0x500E,0x515A,0x5194,/* 0xD8-0xDF */
+	0x5193,0x51CA,0x51C4,0x51C5,0x51C8,0x51CE,0x5261,0x525A,/* 0xE0-0xE7 */
+	0x5252,0x525E,0x525F,0x5255,0x5262,0x52CD,0x530E,0x539E,/* 0xE8-0xEF */
+	0x5526,0x54E2,0x5517,0x5512,0x54E7,0x54F3,0x54E4,0x551A,/* 0xF0-0xF7 */
+	0x54FF,0x5504,0x5508,0x54EB,0x5511,0x5505,0x54F1,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x550A,0x54FB,0x54F7,0x54F8,0x54E0,0x550E,0x5503,0x550B,/* 0x40-0x47 */
+	0x5701,0x5702,0x57CC,0x5832,0x57D5,0x57D2,0x57BA,0x57C6,/* 0x48-0x4F */
+	0x57BD,0x57BC,0x57B8,0x57B6,0x57BF,0x57C7,0x57D0,0x57B9,/* 0x50-0x57 */
+	0x57C1,0x590E,0x594A,0x5A19,0x5A16,0x5A2D,0x5A2E,0x5A15,/* 0x58-0x5F */
+	0x5A0F,0x5A17,0x5A0A,0x5A1E,0x5A33,0x5B6C,0x5BA7,0x5BAD,/* 0x60-0x67 */
+	0x5BAC,0x5C03,0x5C56,0x5C54,0x5CEC,0x5CFF,0x5CEE,0x5CF1,/* 0x68-0x6F */
+	0x5CF7,0x5D00,0x5CF9,0x5E29,0x5E28,0x5EA8,0x5EAE,0x5EAA,/* 0x70-0x77 */
+	0x5EAC,0x5F33,0x5F30,0x5F67,0x605D,0x605A,0x6067,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6041,0x60A2,0x6088,0x6080,0x6092,0x6081,0x609D,/* 0xA0-0xA7 */
+	0x6083,0x6095,0x609B,0x6097,0x6087,0x609C,0x608E,0x6219,/* 0xA8-0xAF */
+	0x6246,0x62F2,0x6310,0x6356,0x632C,0x6344,0x6345,0x6336,/* 0xB0-0xB7 */
+	0x6343,0x63E4,0x6339,0x634B,0x634A,0x633C,0x6329,0x6341,/* 0xB8-0xBF */
+	0x6334,0x6358,0x6354,0x6359,0x632D,0x6347,0x6333,0x635A,/* 0xC0-0xC7 */
+	0x6351,0x6338,0x6357,0x6340,0x6348,0x654A,0x6546,0x65C6,/* 0xC8-0xCF */
+	0x65C3,0x65C4,0x65C2,0x664A,0x665F,0x6647,0x6651,0x6712,/* 0xD0-0xD7 */
+	0x6713,0x681F,0x681A,0x6849,0x6832,0x6833,0x683B,0x684B,/* 0xD8-0xDF */
+	0x684F,0x6816,0x6831,0x681C,0x6835,0x682B,0x682D,0x682F,/* 0xE0-0xE7 */
+	0x684E,0x6844,0x6834,0x681D,0x6812,0x6814,0x6826,0x6828,/* 0xE8-0xEF */
+	0x682E,0x684D,0x683A,0x6825,0x6820,0x6B2C,0x6B2F,0x6B2D,/* 0xF0-0xF7 */
+	0x6B31,0x6B34,0x6B6D,0x8082,0x6B88,0x6BE6,0x6BE4,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6BE8,0x6BE3,0x6BE2,0x6BE7,0x6C25,0x6D7A,0x6D63,0x6D64,/* 0x40-0x47 */
+	0x6D76,0x6D0D,0x6D61,0x6D92,0x6D58,0x6D62,0x6D6D,0x6D6F,/* 0x48-0x4F */
+	0x6D91,0x6D8D,0x6DEF,0x6D7F,0x6D86,0x6D5E,0x6D67,0x6D60,/* 0x50-0x57 */
+	0x6D97,0x6D70,0x6D7C,0x6D5F,0x6D82,0x6D98,0x6D2F,0x6D68,/* 0x58-0x5F */
+	0x6D8B,0x6D7E,0x6D80,0x6D84,0x6D16,0x6D83,0x6D7B,0x6D7D,/* 0x60-0x67 */
+	0x6D75,0x6D90,0x70DC,0x70D3,0x70D1,0x70DD,0x70CB,0x7F39,/* 0x68-0x6F */
+	0x70E2,0x70D7,0x70D2,0x70DE,0x70E0,0x70D4,0x70CD,0x70C5,/* 0x70-0x77 */
+	0x70C6,0x70C7,0x70DA,0x70CE,0x70E1,0x7242,0x7278,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7277,0x7276,0x7300,0x72FA,0x72F4,0x72FE,0x72F6,/* 0xA0-0xA7 */
+	0x72F3,0x72FB,0x7301,0x73D3,0x73D9,0x73E5,0x73D6,0x73BC,/* 0xA8-0xAF */
+	0x73E7,0x73E3,0x73E9,0x73DC,0x73D2,0x73DB,0x73D4,0x73DD,/* 0xB0-0xB7 */
+	0x73DA,0x73D7,0x73D8,0x73E8,0x74DE,0x74DF,0x74F4,0x74F5,/* 0xB8-0xBF */
+	0x7521,0x755B,0x755F,0x75B0,0x75C1,0x75BB,0x75C4,0x75C0,/* 0xC0-0xC7 */
+	0x75BF,0x75B6,0x75BA,0x768A,0x76C9,0x771D,0x771B,0x7710,/* 0xC8-0xCF */
+	0x7713,0x7712,0x7723,0x7711,0x7715,0x7719,0x771A,0x7722,/* 0xD0-0xD7 */
+	0x7727,0x7823,0x782C,0x7822,0x7835,0x782F,0x7828,0x782E,/* 0xD8-0xDF */
+	0x782B,0x7821,0x7829,0x7833,0x782A,0x7831,0x7954,0x795B,/* 0xE0-0xE7 */
+	0x794F,0x795C,0x7953,0x7952,0x7951,0x79EB,0x79EC,0x79E0,/* 0xE8-0xEF */
+	0x79EE,0x79ED,0x79EA,0x79DC,0x79DE,0x79DD,0x7A86,0x7A89,/* 0xF0-0xF7 */
+	0x7A85,0x7A8B,0x7A8C,0x7A8A,0x7A87,0x7AD8,0x7B10,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7B04,0x7B13,0x7B05,0x7B0F,0x7B08,0x7B0A,0x7B0E,0x7B09,/* 0x40-0x47 */
+	0x7B12,0x7C84,0x7C91,0x7C8A,0x7C8C,0x7C88,0x7C8D,0x7C85,/* 0x48-0x4F */
+	0x7D1E,0x7D1D,0x7D11,0x7D0E,0x7D18,0x7D16,0x7D13,0x7D1F,/* 0x50-0x57 */
+	0x7D12,0x7D0F,0x7D0C,0x7F5C,0x7F61,0x7F5E,0x7F60,0x7F5D,/* 0x58-0x5F */
+	0x7F5B,0x7F96,0x7F92,0x7FC3,0x7FC2,0x7FC0,0x8016,0x803E,/* 0x60-0x67 */
+	0x8039,0x80FA,0x80F2,0x80F9,0x80F5,0x8101,0x80FB,0x8100,/* 0x68-0x6F */
+	0x8201,0x822F,0x8225,0x8333,0x832D,0x8344,0x8319,0x8351,/* 0x70-0x77 */
+	0x8325,0x8356,0x833F,0x8341,0x8326,0x831C,0x8322,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8342,0x834E,0x831B,0x832A,0x8308,0x833C,0x834D,/* 0xA0-0xA7 */
+	0x8316,0x8324,0x8320,0x8337,0x832F,0x8329,0x8347,0x8345,/* 0xA8-0xAF */
+	0x834C,0x8353,0x831E,0x832C,0x834B,0x8327,0x8348,0x8653,/* 0xB0-0xB7 */
+	0x8652,0x86A2,0x86A8,0x8696,0x868D,0x8691,0x869E,0x8687,/* 0xB8-0xBF */
+	0x8697,0x8686,0x868B,0x869A,0x8685,0x86A5,0x8699,0x86A1,/* 0xC0-0xC7 */
+	0x86A7,0x8695,0x8698,0x868E,0x869D,0x8690,0x8694,0x8843,/* 0xC8-0xCF */
+	0x8844,0x886D,0x8875,0x8876,0x8872,0x8880,0x8871,0x887F,/* 0xD0-0xD7 */
+	0x886F,0x8883,0x887E,0x8874,0x887C,0x8A12,0x8C47,0x8C57,/* 0xD8-0xDF */
+	0x8C7B,0x8CA4,0x8CA3,0x8D76,0x8D78,0x8DB5,0x8DB7,0x8DB6,/* 0xE0-0xE7 */
+	0x8ED1,0x8ED3,0x8FFE,0x8FF5,0x9002,0x8FFF,0x8FFB,0x9004,/* 0xE8-0xEF */
+	0x8FFC,0x8FF6,0x90D6,0x90E0,0x90D9,0x90DA,0x90E3,0x90DF,/* 0xF0-0xF7 */
+	0x90E5,0x90D8,0x90DB,0x90D7,0x90DC,0x90E4,0x9150,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x914E,0x914F,0x91D5,0x91E2,0x91DA,0x965C,0x965F,0x96BC,/* 0x40-0x47 */
+	0x98E3,0x9ADF,0x9B2F,0x4E7F,0x5070,0x506A,0x5061,0x505E,/* 0x48-0x4F */
+	0x5060,0x5053,0x504B,0x505D,0x5072,0x5048,0x504D,0x5041,/* 0x50-0x57 */
+	0x505B,0x504A,0x5062,0x5015,0x5045,0x505F,0x5069,0x506B,/* 0x58-0x5F */
+	0x5063,0x5064,0x5046,0x5040,0x506E,0x5073,0x5057,0x5051,/* 0x60-0x67 */
+	0x51D0,0x526B,0x526D,0x526C,0x526E,0x52D6,0x52D3,0x532D,/* 0x68-0x6F */
+	0x539C,0x5575,0x5576,0x553C,0x554D,0x5550,0x5534,0x552A,/* 0x70-0x77 */
+	0x5551,0x5562,0x5536,0x5535,0x5530,0x5552,0x5545,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x550C,0x5532,0x5565,0x554E,0x5539,0x5548,0x552D,/* 0xA0-0xA7 */
+	0x553B,0x5540,0x554B,0x570A,0x5707,0x57FB,0x5814,0x57E2,/* 0xA8-0xAF */
+	0x57F6,0x57DC,0x57F4,0x5800,0x57ED,0x57FD,0x5808,0x57F8,/* 0xB0-0xB7 */
+	0x580B,0x57F3,0x57CF,0x5807,0x57EE,0x57E3,0x57F2,0x57E5,/* 0xB8-0xBF */
+	0x57EC,0x57E1,0x580E,0x57FC,0x5810,0x57E7,0x5801,0x580C,/* 0xC0-0xC7 */
+	0x57F1,0x57E9,0x57F0,0x580D,0x5804,0x595C,0x5A60,0x5A58,/* 0xC8-0xCF */
+	0x5A55,0x5A67,0x5A5E,0x5A38,0x5A35,0x5A6D,0x5A50,0x5A5F,/* 0xD0-0xD7 */
+	0x5A65,0x5A6C,0x5A53,0x5A64,0x5A57,0x5A43,0x5A5D,0x5A52,/* 0xD8-0xDF */
+	0x5A44,0x5A5B,0x5A48,0x5A8E,0x5A3E,0x5A4D,0x5A39,0x5A4C,/* 0xE0-0xE7 */
+	0x5A70,0x5A69,0x5A47,0x5A51,0x5A56,0x5A42,0x5A5C,0x5B72,/* 0xE8-0xEF */
+	0x5B6E,0x5BC1,0x5BC0,0x5C59,0x5D1E,0x5D0B,0x5D1D,0x5D1A,/* 0xF0-0xF7 */
+	0x5D20,0x5D0C,0x5D28,0x5D0D,0x5D26,0x5D25,0x5D0F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5D30,0x5D12,0x5D23,0x5D1F,0x5D2E,0x5E3E,0x5E34,0x5EB1,/* 0x40-0x47 */
+	0x5EB4,0x5EB9,0x5EB2,0x5EB3,0x5F36,0x5F38,0x5F9B,0x5F96,/* 0x48-0x4F */
+	0x5F9F,0x608A,0x6090,0x6086,0x60BE,0x60B0,0x60BA,0x60D3,/* 0x50-0x57 */
+	0x60D4,0x60CF,0x60E4,0x60D9,0x60DD,0x60C8,0x60B1,0x60DB,/* 0x58-0x5F */
+	0x60B7,0x60CA,0x60BF,0x60C3,0x60CD,0x60C0,0x6332,0x6365,/* 0x60-0x67 */
+	0x638A,0x6382,0x637D,0x63BD,0x639E,0x63AD,0x639D,0x6397,/* 0x68-0x6F */
+	0x63AB,0x638E,0x636F,0x6387,0x6390,0x636E,0x63AF,0x6375,/* 0x70-0x77 */
+	0x639C,0x636D,0x63AE,0x637C,0x63A4,0x633B,0x639F,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6378,0x6385,0x6381,0x6391,0x638D,0x6370,0x6553,/* 0xA0-0xA7 */
+	0x65CD,0x6665,0x6661,0x665B,0x6659,0x665C,0x6662,0x6718,/* 0xA8-0xAF */
+	0x6879,0x6887,0x6890,0x689C,0x686D,0x686E,0x68AE,0x68AB,/* 0xB0-0xB7 */
+	0x6956,0x686F,0x68A3,0x68AC,0x68A9,0x6875,0x6874,0x68B2,/* 0xB8-0xBF */
+	0x688F,0x6877,0x6892,0x687C,0x686B,0x6872,0x68AA,0x6880,/* 0xC0-0xC7 */
+	0x6871,0x687E,0x689B,0x6896,0x688B,0x68A0,0x6889,0x68A4,/* 0xC8-0xCF */
+	0x6878,0x687B,0x6891,0x688C,0x688A,0x687D,0x6B36,0x6B33,/* 0xD0-0xD7 */
+	0x6B37,0x6B38,0x6B91,0x6B8F,0x6B8D,0x6B8E,0x6B8C,0x6C2A,/* 0xD8-0xDF */
+	0x6DC0,0x6DAB,0x6DB4,0x6DB3,0x6E74,0x6DAC,0x6DE9,0x6DE2,/* 0xE0-0xE7 */
+	0x6DB7,0x6DF6,0x6DD4,0x6E00,0x6DC8,0x6DE0,0x6DDF,0x6DD6,/* 0xE8-0xEF */
+	0x6DBE,0x6DE5,0x6DDC,0x6DDD,0x6DDB,0x6DF4,0x6DCA,0x6DBD,/* 0xF0-0xF7 */
+	0x6DED,0x6DF0,0x6DBA,0x6DD5,0x6DC2,0x6DCF,0x6DC9,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6DD0,0x6DF2,0x6DD3,0x6DFD,0x6DD7,0x6DCD,0x6DE3,0x6DBB,/* 0x40-0x47 */
+	0x70FA,0x710D,0x70F7,0x7117,0x70F4,0x710C,0x70F0,0x7104,/* 0x48-0x4F */
+	0x70F3,0x7110,0x70FC,0x70FF,0x7106,0x7113,0x7100,0x70F8,/* 0x50-0x57 */
+	0x70F6,0x710B,0x7102,0x710E,0x727E,0x727B,0x727C,0x727F,/* 0x58-0x5F */
+	0x731D,0x7317,0x7307,0x7311,0x7318,0x730A,0x7308,0x72FF,/* 0x60-0x67 */
+	0x730F,0x731E,0x7388,0x73F6,0x73F8,0x73F5,0x7404,0x7401,/* 0x68-0x6F */
+	0x73FD,0x7407,0x7400,0x73FA,0x73FC,0x73FF,0x740C,0x740B,/* 0x70-0x77 */
+	0x73F4,0x7408,0x7564,0x7563,0x75CE,0x75D2,0x75CF,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x75CB,0x75CC,0x75D1,0x75D0,0x768F,0x7689,0x76D3,/* 0xA0-0xA7 */
+	0x7739,0x772F,0x772D,0x7731,0x7732,0x7734,0x7733,0x773D,/* 0xA8-0xAF */
+	0x7725,0x773B,0x7735,0x7848,0x7852,0x7849,0x784D,0x784A,/* 0xB0-0xB7 */
+	0x784C,0x7826,0x7845,0x7850,0x7964,0x7967,0x7969,0x796A,/* 0xB8-0xBF */
+	0x7963,0x796B,0x7961,0x79BB,0x79FA,0x79F8,0x79F6,0x79F7,/* 0xC0-0xC7 */
+	0x7A8F,0x7A94,0x7A90,0x7B35,0x7B47,0x7B34,0x7B25,0x7B30,/* 0xC8-0xCF */
+	0x7B22,0x7B24,0x7B33,0x7B18,0x7B2A,0x7B1D,0x7B31,0x7B2B,/* 0xD0-0xD7 */
+	0x7B2D,0x7B2F,0x7B32,0x7B38,0x7B1A,0x7B23,0x7C94,0x7C98,/* 0xD8-0xDF */
+	0x7C96,0x7CA3,0x7D35,0x7D3D,0x7D38,0x7D36,0x7D3A,0x7D45,/* 0xE0-0xE7 */
+	0x7D2C,0x7D29,0x7D41,0x7D47,0x7D3E,0x7D3F,0x7D4A,0x7D3B,/* 0xE8-0xEF */
+	0x7D28,0x7F63,0x7F95,0x7F9C,0x7F9D,0x7F9B,0x7FCA,0x7FCB,/* 0xF0-0xF7 */
+	0x7FCD,0x7FD0,0x7FD1,0x7FC7,0x7FCF,0x7FC9,0x801F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x801E,0x801B,0x8047,0x8043,0x8048,0x8118,0x8125,0x8119,/* 0x40-0x47 */
+	0x811B,0x812D,0x811F,0x812C,0x811E,0x8121,0x8115,0x8127,/* 0x48-0x4F */
+	0x811D,0x8122,0x8211,0x8238,0x8233,0x823A,0x8234,0x8232,/* 0x50-0x57 */
+	0x8274,0x8390,0x83A3,0x83A8,0x838D,0x837A,0x8373,0x83A4,/* 0x58-0x5F */
+	0x8374,0x838F,0x8381,0x8395,0x8399,0x8375,0x8394,0x83A9,/* 0x60-0x67 */
+	0x837D,0x8383,0x838C,0x839D,0x839B,0x83AA,0x838B,0x837E,/* 0x68-0x6F */
+	0x83A5,0x83AF,0x8388,0x8397,0x83B0,0x837F,0x83A6,0x8387,/* 0x70-0x77 */
+	0x83AE,0x8376,0x839A,0x8659,0x8656,0x86BF,0x86B7,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x86C2,0x86C1,0x86C5,0x86BA,0x86B0,0x86C8,0x86B9,/* 0xA0-0xA7 */
+	0x86B3,0x86B8,0x86CC,0x86B4,0x86BB,0x86BC,0x86C3,0x86BD,/* 0xA8-0xAF */
+	0x86BE,0x8852,0x8889,0x8895,0x88A8,0x88A2,0x88AA,0x889A,/* 0xB0-0xB7 */
+	0x8891,0x88A1,0x889F,0x8898,0x88A7,0x8899,0x889B,0x8897,/* 0xB8-0xBF */
+	0x88A4,0x88AC,0x888C,0x8893,0x888E,0x8982,0x89D6,0x89D9,/* 0xC0-0xC7 */
+	0x89D5,0x8A30,0x8A27,0x8A2C,0x8A1E,0x8C39,0x8C3B,0x8C5C,/* 0xC8-0xCF */
+	0x8C5D,0x8C7D,0x8CA5,0x8D7D,0x8D7B,0x8D79,0x8DBC,0x8DC2,/* 0xD0-0xD7 */
+	0x8DB9,0x8DBF,0x8DC1,0x8ED8,0x8EDE,0x8EDD,0x8EDC,0x8ED7,/* 0xD8-0xDF */
+	0x8EE0,0x8EE1,0x9024,0x900B,0x9011,0x901C,0x900C,0x9021,/* 0xE0-0xE7 */
+	0x90EF,0x90EA,0x90F0,0x90F4,0x90F2,0x90F3,0x90D4,0x90EB,/* 0xE8-0xEF */
+	0x90EC,0x90E9,0x9156,0x9158,0x915A,0x9153,0x9155,0x91EC,/* 0xF0-0xF7 */
+	0x91F4,0x91F1,0x91F3,0x91F8,0x91E4,0x91F9,0x91EA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x91EB,0x91F7,0x91E8,0x91EE,0x957A,0x9586,0x9588,0x967C,/* 0x40-0x47 */
+	0x966D,0x966B,0x9671,0x966F,0x96BF,0x976A,0x9804,0x98E5,/* 0x48-0x4F */
+	0x9997,0x509B,0x5095,0x5094,0x509E,0x508B,0x50A3,0x5083,/* 0x50-0x57 */
+	0x508C,0x508E,0x509D,0x5068,0x509C,0x5092,0x5082,0x5087,/* 0x58-0x5F */
+	0x515F,0x51D4,0x5312,0x5311,0x53A4,0x53A7,0x5591,0x55A8,/* 0x60-0x67 */
+	0x55A5,0x55AD,0x5577,0x5645,0x55A2,0x5593,0x5588,0x558F,/* 0x68-0x6F */
+	0x55B5,0x5581,0x55A3,0x5592,0x55A4,0x557D,0x558C,0x55A6,/* 0x70-0x77 */
+	0x557F,0x5595,0x55A1,0x558E,0x570C,0x5829,0x5837,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5819,0x581E,0x5827,0x5823,0x5828,0x57F5,0x5848,/* 0xA0-0xA7 */
+	0x5825,0x581C,0x581B,0x5833,0x583F,0x5836,0x582E,0x5839,/* 0xA8-0xAF */
+	0x5838,0x582D,0x582C,0x583B,0x5961,0x5AAF,0x5A94,0x5A9F,/* 0xB0-0xB7 */
+	0x5A7A,0x5AA2,0x5A9E,0x5A78,0x5AA6,0x5A7C,0x5AA5,0x5AAC,/* 0xB8-0xBF */
+	0x5A95,0x5AAE,0x5A37,0x5A84,0x5A8A,0x5A97,0x5A83,0x5A8B,/* 0xC0-0xC7 */
+	0x5AA9,0x5A7B,0x5A7D,0x5A8C,0x5A9C,0x5A8F,0x5A93,0x5A9D,/* 0xC8-0xCF */
+	0x5BEA,0x5BCD,0x5BCB,0x5BD4,0x5BD1,0x5BCA,0x5BCE,0x5C0C,/* 0xD0-0xD7 */
+	0x5C30,0x5D37,0x5D43,0x5D6B,0x5D41,0x5D4B,0x5D3F,0x5D35,/* 0xD8-0xDF */
+	0x5D51,0x5D4E,0x5D55,0x5D33,0x5D3A,0x5D52,0x5D3D,0x5D31,/* 0xE0-0xE7 */
+	0x5D59,0x5D42,0x5D39,0x5D49,0x5D38,0x5D3C,0x5D32,0x5D36,/* 0xE8-0xEF */
+	0x5D40,0x5D45,0x5E44,0x5E41,0x5F58,0x5FA6,0x5FA5,0x5FAB,/* 0xF0-0xF7 */
+	0x60C9,0x60B9,0x60CC,0x60E2,0x60CE,0x60C4,0x6114,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_D9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x60F2,0x610A,0x6116,0x6105,0x60F5,0x6113,0x60F8,0x60FC,/* 0x40-0x47 */
+	0x60FE,0x60C1,0x6103,0x6118,0x611D,0x6110,0x60FF,0x6104,/* 0x48-0x4F */
+	0x610B,0x624A,0x6394,0x63B1,0x63B0,0x63CE,0x63E5,0x63E8,/* 0x50-0x57 */
+	0x63EF,0x63C3,0x649D,0x63F3,0x63CA,0x63E0,0x63F6,0x63D5,/* 0x58-0x5F */
+	0x63F2,0x63F5,0x6461,0x63DF,0x63BE,0x63DD,0x63DC,0x63C4,/* 0x60-0x67 */
+	0x63D8,0x63D3,0x63C2,0x63C7,0x63CC,0x63CB,0x63C8,0x63F0,/* 0x68-0x6F */
+	0x63D7,0x63D9,0x6532,0x6567,0x656A,0x6564,0x655C,0x6568,/* 0x70-0x77 */
+	0x6565,0x658C,0x659D,0x659E,0x65AE,0x65D0,0x65D2,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x667C,0x666C,0x667B,0x6680,0x6671,0x6679,0x666A,/* 0xA0-0xA7 */
+	0x6672,0x6701,0x690C,0x68D3,0x6904,0x68DC,0x692A,0x68EC,/* 0xA8-0xAF */
+	0x68EA,0x68F1,0x690F,0x68D6,0x68F7,0x68EB,0x68E4,0x68F6,/* 0xB0-0xB7 */
+	0x6913,0x6910,0x68F3,0x68E1,0x6907,0x68CC,0x6908,0x6970,/* 0xB8-0xBF */
+	0x68B4,0x6911,0x68EF,0x68C6,0x6914,0x68F8,0x68D0,0x68FD,/* 0xC0-0xC7 */
+	0x68FC,0x68E8,0x690B,0x690A,0x6917,0x68CE,0x68C8,0x68DD,/* 0xC8-0xCF */
+	0x68DE,0x68E6,0x68F4,0x68D1,0x6906,0x68D4,0x68E9,0x6915,/* 0xD0-0xD7 */
+	0x6925,0x68C7,0x6B39,0x6B3B,0x6B3F,0x6B3C,0x6B94,0x6B97,/* 0xD8-0xDF */
+	0x6B99,0x6B95,0x6BBD,0x6BF0,0x6BF2,0x6BF3,0x6C30,0x6DFC,/* 0xE0-0xE7 */
+	0x6E46,0x6E47,0x6E1F,0x6E49,0x6E88,0x6E3C,0x6E3D,0x6E45,/* 0xE8-0xEF */
+	0x6E62,0x6E2B,0x6E3F,0x6E41,0x6E5D,0x6E73,0x6E1C,0x6E33,/* 0xF0-0xF7 */
+	0x6E4B,0x6E40,0x6E51,0x6E3B,0x6E03,0x6E2E,0x6E5E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6E68,0x6E5C,0x6E61,0x6E31,0x6E28,0x6E60,0x6E71,0x6E6B,/* 0x40-0x47 */
+	0x6E39,0x6E22,0x6E30,0x6E53,0x6E65,0x6E27,0x6E78,0x6E64,/* 0x48-0x4F */
+	0x6E77,0x6E55,0x6E79,0x6E52,0x6E66,0x6E35,0x6E36,0x6E5A,/* 0x50-0x57 */
+	0x7120,0x711E,0x712F,0x70FB,0x712E,0x7131,0x7123,0x7125,/* 0x58-0x5F */
+	0x7122,0x7132,0x711F,0x7128,0x713A,0x711B,0x724B,0x725A,/* 0x60-0x67 */
+	0x7288,0x7289,0x7286,0x7285,0x728B,0x7312,0x730B,0x7330,/* 0x68-0x6F */
+	0x7322,0x7331,0x7333,0x7327,0x7332,0x732D,0x7326,0x7323,/* 0x70-0x77 */
+	0x7335,0x730C,0x742E,0x742C,0x7430,0x742B,0x7416,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x741A,0x7421,0x742D,0x7431,0x7424,0x7423,0x741D,/* 0xA0-0xA7 */
+	0x7429,0x7420,0x7432,0x74FB,0x752F,0x756F,0x756C,0x75E7,/* 0xA8-0xAF */
+	0x75DA,0x75E1,0x75E6,0x75DD,0x75DF,0x75E4,0x75D7,0x7695,/* 0xB0-0xB7 */
+	0x7692,0x76DA,0x7746,0x7747,0x7744,0x774D,0x7745,0x774A,/* 0xB8-0xBF */
+	0x774E,0x774B,0x774C,0x77DE,0x77EC,0x7860,0x7864,0x7865,/* 0xC0-0xC7 */
+	0x785C,0x786D,0x7871,0x786A,0x786E,0x7870,0x7869,0x7868,/* 0xC8-0xCF */
+	0x785E,0x7862,0x7974,0x7973,0x7972,0x7970,0x7A02,0x7A0A,/* 0xD0-0xD7 */
+	0x7A03,0x7A0C,0x7A04,0x7A99,0x7AE6,0x7AE4,0x7B4A,0x7B3B,/* 0xD8-0xDF */
+	0x7B44,0x7B48,0x7B4C,0x7B4E,0x7B40,0x7B58,0x7B45,0x7CA2,/* 0xE0-0xE7 */
+	0x7C9E,0x7CA8,0x7CA1,0x7D58,0x7D6F,0x7D63,0x7D53,0x7D56,/* 0xE8-0xEF */
+	0x7D67,0x7D6A,0x7D4F,0x7D6D,0x7D5C,0x7D6B,0x7D52,0x7D54,/* 0xF0-0xF7 */
+	0x7D69,0x7D51,0x7D5F,0x7D4E,0x7F3E,0x7F3F,0x7F65,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7F66,0x7FA2,0x7FA0,0x7FA1,0x7FD7,0x8051,0x804F,0x8050,/* 0x40-0x47 */
+	0x80FE,0x80D4,0x8143,0x814A,0x8152,0x814F,0x8147,0x813D,/* 0x48-0x4F */
+	0x814D,0x813A,0x81E6,0x81EE,0x81F7,0x81F8,0x81F9,0x8204,/* 0x50-0x57 */
+	0x823C,0x823D,0x823F,0x8275,0x833B,0x83CF,0x83F9,0x8423,/* 0x58-0x5F */
+	0x83C0,0x83E8,0x8412,0x83E7,0x83E4,0x83FC,0x83F6,0x8410,/* 0x60-0x67 */
+	0x83C6,0x83C8,0x83EB,0x83E3,0x83BF,0x8401,0x83DD,0x83E5,/* 0x68-0x6F */
+	0x83D8,0x83FF,0x83E1,0x83CB,0x83CE,0x83D6,0x83F5,0x83C9,/* 0x70-0x77 */
+	0x8409,0x840F,0x83DE,0x8411,0x8406,0x83C2,0x83F3,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x83D5,0x83FA,0x83C7,0x83D1,0x83EA,0x8413,0x83C3,/* 0xA0-0xA7 */
+	0x83EC,0x83EE,0x83C4,0x83FB,0x83D7,0x83E2,0x841B,0x83DB,/* 0xA8-0xAF */
+	0x83FE,0x86D8,0x86E2,0x86E6,0x86D3,0x86E3,0x86DA,0x86EA,/* 0xB0-0xB7 */
+	0x86DD,0x86EB,0x86DC,0x86EC,0x86E9,0x86D7,0x86E8,0x86D1,/* 0xB8-0xBF */
+	0x8848,0x8856,0x8855,0x88BA,0x88D7,0x88B9,0x88B8,0x88C0,/* 0xC0-0xC7 */
+	0x88BE,0x88B6,0x88BC,0x88B7,0x88BD,0x88B2,0x8901,0x88C9,/* 0xC8-0xCF */
+	0x8995,0x8998,0x8997,0x89DD,0x89DA,0x89DB,0x8A4E,0x8A4D,/* 0xD0-0xD7 */
+	0x8A39,0x8A59,0x8A40,0x8A57,0x8A58,0x8A44,0x8A45,0x8A52,/* 0xD8-0xDF */
+	0x8A48,0x8A51,0x8A4A,0x8A4C,0x8A4F,0x8C5F,0x8C81,0x8C80,/* 0xE0-0xE7 */
+	0x8CBA,0x8CBE,0x8CB0,0x8CB9,0x8CB5,0x8D84,0x8D80,0x8D89,/* 0xE8-0xEF */
+	0x8DD8,0x8DD3,0x8DCD,0x8DC7,0x8DD6,0x8DDC,0x8DCF,0x8DD5,/* 0xF0-0xF7 */
+	0x8DD9,0x8DC8,0x8DD7,0x8DC5,0x8EEF,0x8EF7,0x8EFA,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8EF9,0x8EE6,0x8EEE,0x8EE5,0x8EF5,0x8EE7,0x8EE8,0x8EF6,/* 0x40-0x47 */
+	0x8EEB,0x8EF1,0x8EEC,0x8EF4,0x8EE9,0x902D,0x9034,0x902F,/* 0x48-0x4F */
+	0x9106,0x912C,0x9104,0x90FF,0x90FC,0x9108,0x90F9,0x90FB,/* 0x50-0x57 */
+	0x9101,0x9100,0x9107,0x9105,0x9103,0x9161,0x9164,0x915F,/* 0x58-0x5F */
+	0x9162,0x9160,0x9201,0x920A,0x9225,0x9203,0x921A,0x9226,/* 0x60-0x67 */
+	0x920F,0x920C,0x9200,0x9212,0x91FF,0x91FD,0x9206,0x9204,/* 0x68-0x6F */
+	0x9227,0x9202,0x921C,0x9224,0x9219,0x9217,0x9205,0x9216,/* 0x70-0x77 */
+	0x957B,0x958D,0x958C,0x9590,0x9687,0x967E,0x9688,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9689,0x9683,0x9680,0x96C2,0x96C8,0x96C3,0x96F1,/* 0xA0-0xA7 */
+	0x96F0,0x976C,0x9770,0x976E,0x9807,0x98A9,0x98EB,0x9CE6,/* 0xA8-0xAF */
+	0x9EF9,0x4E83,0x4E84,0x4EB6,0x50BD,0x50BF,0x50C6,0x50AE,/* 0xB0-0xB7 */
+	0x50C4,0x50CA,0x50B4,0x50C8,0x50C2,0x50B0,0x50C1,0x50BA,/* 0xB8-0xBF */
+	0x50B1,0x50CB,0x50C9,0x50B6,0x50B8,0x51D7,0x527A,0x5278,/* 0xC0-0xC7 */
+	0x527B,0x527C,0x55C3,0x55DB,0x55CC,0x55D0,0x55CB,0x55CA,/* 0xC8-0xCF */
+	0x55DD,0x55C0,0x55D4,0x55C4,0x55E9,0x55BF,0x55D2,0x558D,/* 0xD0-0xD7 */
+	0x55CF,0x55D5,0x55E2,0x55D6,0x55C8,0x55F2,0x55CD,0x55D9,/* 0xD8-0xDF */
+	0x55C2,0x5714,0x5853,0x5868,0x5864,0x584F,0x584D,0x5849,/* 0xE0-0xE7 */
+	0x586F,0x5855,0x584E,0x585D,0x5859,0x5865,0x585B,0x583D,/* 0xE8-0xEF */
+	0x5863,0x5871,0x58FC,0x5AC7,0x5AC4,0x5ACB,0x5ABA,0x5AB8,/* 0xF0-0xF7 */
+	0x5AB1,0x5AB5,0x5AB0,0x5ABF,0x5AC8,0x5ABB,0x5AC6,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DD[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5AB7,0x5AC0,0x5ACA,0x5AB4,0x5AB6,0x5ACD,0x5AB9,0x5A90,/* 0x40-0x47 */
+	0x5BD6,0x5BD8,0x5BD9,0x5C1F,0x5C33,0x5D71,0x5D63,0x5D4A,/* 0x48-0x4F */
+	0x5D65,0x5D72,0x5D6C,0x5D5E,0x5D68,0x5D67,0x5D62,0x5DF0,/* 0x50-0x57 */
+	0x5E4F,0x5E4E,0x5E4A,0x5E4D,0x5E4B,0x5EC5,0x5ECC,0x5EC6,/* 0x58-0x5F */
+	0x5ECB,0x5EC7,0x5F40,0x5FAF,0x5FAD,0x60F7,0x6149,0x614A,/* 0x60-0x67 */
+	0x612B,0x6145,0x6136,0x6132,0x612E,0x6146,0x612F,0x614F,/* 0x68-0x6F */
+	0x6129,0x6140,0x6220,0x9168,0x6223,0x6225,0x6224,0x63C5,/* 0x70-0x77 */
+	0x63F1,0x63EB,0x6410,0x6412,0x6409,0x6420,0x6424,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6433,0x6443,0x641F,0x6415,0x6418,0x6439,0x6437,/* 0xA0-0xA7 */
+	0x6422,0x6423,0x640C,0x6426,0x6430,0x6428,0x6441,0x6435,/* 0xA8-0xAF */
+	0x642F,0x640A,0x641A,0x6440,0x6425,0x6427,0x640B,0x63E7,/* 0xB0-0xB7 */
+	0x641B,0x642E,0x6421,0x640E,0x656F,0x6592,0x65D3,0x6686,/* 0xB8-0xBF */
+	0x668C,0x6695,0x6690,0x668B,0x668A,0x6699,0x6694,0x6678,/* 0xC0-0xC7 */
+	0x6720,0x6966,0x695F,0x6938,0x694E,0x6962,0x6971,0x693F,/* 0xC8-0xCF */
+	0x6945,0x696A,0x6939,0x6942,0x6957,0x6959,0x697A,0x6948,/* 0xD0-0xD7 */
+	0x6949,0x6935,0x696C,0x6933,0x693D,0x6965,0x68F0,0x6978,/* 0xD8-0xDF */
+	0x6934,0x6969,0x6940,0x696F,0x6944,0x6976,0x6958,0x6941,/* 0xE0-0xE7 */
+	0x6974,0x694C,0x693B,0x694B,0x6937,0x695C,0x694F,0x6951,/* 0xE8-0xEF */
+	0x6932,0x6952,0x692F,0x697B,0x693C,0x6B46,0x6B45,0x6B43,/* 0xF0-0xF7 */
+	0x6B42,0x6B48,0x6B41,0x6B9B,0xFA0D,0x6BFB,0x6BFC,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6BF9,0x6BF7,0x6BF8,0x6E9B,0x6ED6,0x6EC8,0x6E8F,0x6EC0,/* 0x40-0x47 */
+	0x6E9F,0x6E93,0x6E94,0x6EA0,0x6EB1,0x6EB9,0x6EC6,0x6ED2,/* 0x48-0x4F */
+	0x6EBD,0x6EC1,0x6E9E,0x6EC9,0x6EB7,0x6EB0,0x6ECD,0x6EA6,/* 0x50-0x57 */
+	0x6ECF,0x6EB2,0x6EBE,0x6EC3,0x6EDC,0x6ED8,0x6E99,0x6E92,/* 0x58-0x5F */
+	0x6E8E,0x6E8D,0x6EA4,0x6EA1,0x6EBF,0x6EB3,0x6ED0,0x6ECA,/* 0x60-0x67 */
+	0x6E97,0x6EAE,0x6EA3,0x7147,0x7154,0x7152,0x7163,0x7160,/* 0x68-0x6F */
+	0x7141,0x715D,0x7162,0x7172,0x7178,0x716A,0x7161,0x7142,/* 0x70-0x77 */
+	0x7158,0x7143,0x714B,0x7170,0x715F,0x7150,0x7153,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7144,0x714D,0x715A,0x724F,0x728D,0x728C,0x7291,/* 0xA0-0xA7 */
+	0x7290,0x728E,0x733C,0x7342,0x733B,0x733A,0x7340,0x734A,/* 0xA8-0xAF */
+	0x7349,0x7444,0x744A,0x744B,0x7452,0x7451,0x7457,0x7440,/* 0xB0-0xB7 */
+	0x744F,0x7450,0x744E,0x7442,0x7446,0x744D,0x7454,0x74E1,/* 0xB8-0xBF */
+	0x74FF,0x74FE,0x74FD,0x751D,0x7579,0x7577,0x6983,0x75EF,/* 0xC0-0xC7 */
+	0x760F,0x7603,0x75F7,0x75FE,0x75FC,0x75F9,0x75F8,0x7610,/* 0xC8-0xCF */
+	0x75FB,0x75F6,0x75ED,0x75F5,0x75FD,0x7699,0x76B5,0x76DD,/* 0xD0-0xD7 */
+	0x7755,0x775F,0x7760,0x7752,0x7756,0x775A,0x7769,0x7767,/* 0xD8-0xDF */
+	0x7754,0x7759,0x776D,0x77E0,0x7887,0x789A,0x7894,0x788F,/* 0xE0-0xE7 */
+	0x7884,0x7895,0x7885,0x7886,0x78A1,0x7883,0x7879,0x7899,/* 0xE8-0xEF */
+	0x7880,0x7896,0x787B,0x797C,0x7982,0x797D,0x7979,0x7A11,/* 0xF0-0xF7 */
+	0x7A18,0x7A19,0x7A12,0x7A17,0x7A15,0x7A22,0x7A13,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_DF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7A1B,0x7A10,0x7AA3,0x7AA2,0x7A9E,0x7AEB,0x7B66,0x7B64,/* 0x40-0x47 */
+	0x7B6D,0x7B74,0x7B69,0x7B72,0x7B65,0x7B73,0x7B71,0x7B70,/* 0x48-0x4F */
+	0x7B61,0x7B78,0x7B76,0x7B63,0x7CB2,0x7CB4,0x7CAF,0x7D88,/* 0x50-0x57 */
+	0x7D86,0x7D80,0x7D8D,0x7D7F,0x7D85,0x7D7A,0x7D8E,0x7D7B,/* 0x58-0x5F */
+	0x7D83,0x7D7C,0x7D8C,0x7D94,0x7D84,0x7D7D,0x7D92,0x7F6D,/* 0x60-0x67 */
+	0x7F6B,0x7F67,0x7F68,0x7F6C,0x7FA6,0x7FA5,0x7FA7,0x7FDB,/* 0x68-0x6F */
+	0x7FDC,0x8021,0x8164,0x8160,0x8177,0x815C,0x8169,0x815B,/* 0x70-0x77 */
+	0x8162,0x8172,0x6721,0x815E,0x8176,0x8167,0x816F,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8144,0x8161,0x821D,0x8249,0x8244,0x8240,0x8242,/* 0xA0-0xA7 */
+	0x8245,0x84F1,0x843F,0x8456,0x8476,0x8479,0x848F,0x848D,/* 0xA8-0xAF */
+	0x8465,0x8451,0x8440,0x8486,0x8467,0x8430,0x844D,0x847D,/* 0xB0-0xB7 */
+	0x845A,0x8459,0x8474,0x8473,0x845D,0x8507,0x845E,0x8437,/* 0xB8-0xBF */
+	0x843A,0x8434,0x847A,0x8443,0x8478,0x8432,0x8445,0x8429,/* 0xC0-0xC7 */
+	0x83D9,0x844B,0x842F,0x8442,0x842D,0x845F,0x8470,0x8439,/* 0xC8-0xCF */
+	0x844E,0x844C,0x8452,0x846F,0x84C5,0x848E,0x843B,0x8447,/* 0xD0-0xD7 */
+	0x8436,0x8433,0x8468,0x847E,0x8444,0x842B,0x8460,0x8454,/* 0xD8-0xDF */
+	0x846E,0x8450,0x870B,0x8704,0x86F7,0x870C,0x86FA,0x86D6,/* 0xE0-0xE7 */
+	0x86F5,0x874D,0x86F8,0x870E,0x8709,0x8701,0x86F6,0x870D,/* 0xE8-0xEF */
+	0x8705,0x88D6,0x88CB,0x88CD,0x88CE,0x88DE,0x88DB,0x88DA,/* 0xF0-0xF7 */
+	0x88CC,0x88D0,0x8985,0x899B,0x89DF,0x89E5,0x89E4,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x89E1,0x89E0,0x89E2,0x89DC,0x89E6,0x8A76,0x8A86,0x8A7F,/* 0x40-0x47 */
+	0x8A61,0x8A3F,0x8A77,0x8A82,0x8A84,0x8A75,0x8A83,0x8A81,/* 0x48-0x4F */
+	0x8A74,0x8A7A,0x8C3C,0x8C4B,0x8C4A,0x8C65,0x8C64,0x8C66,/* 0x50-0x57 */
+	0x8C86,0x8C84,0x8C85,0x8CCC,0x8D68,0x8D69,0x8D91,0x8D8C,/* 0x58-0x5F */
+	0x8D8E,0x8D8F,0x8D8D,0x8D93,0x8D94,0x8D90,0x8D92,0x8DF0,/* 0x60-0x67 */
+	0x8DE0,0x8DEC,0x8DF1,0x8DEE,0x8DD0,0x8DE9,0x8DE3,0x8DE2,/* 0x68-0x6F */
+	0x8DE7,0x8DF2,0x8DEB,0x8DF4,0x8F06,0x8EFF,0x8F01,0x8F00,/* 0x70-0x77 */
+	0x8F05,0x8F07,0x8F08,0x8F02,0x8F0B,0x9052,0x903F,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9044,0x9049,0x903D,0x9110,0x910D,0x910F,0x9111,/* 0xA0-0xA7 */
+	0x9116,0x9114,0x910B,0x910E,0x916E,0x916F,0x9248,0x9252,/* 0xA8-0xAF */
+	0x9230,0x923A,0x9266,0x9233,0x9265,0x925E,0x9283,0x922E,/* 0xB0-0xB7 */
+	0x924A,0x9246,0x926D,0x926C,0x924F,0x9260,0x9267,0x926F,/* 0xB8-0xBF */
+	0x9236,0x9261,0x9270,0x9231,0x9254,0x9263,0x9250,0x9272,/* 0xC0-0xC7 */
+	0x924E,0x9253,0x924C,0x9256,0x9232,0x959F,0x959C,0x959E,/* 0xC8-0xCF */
+	0x959B,0x9692,0x9693,0x9691,0x9697,0x96CE,0x96FA,0x96FD,/* 0xD0-0xD7 */
+	0x96F8,0x96F5,0x9773,0x9777,0x9778,0x9772,0x980F,0x980D,/* 0xD8-0xDF */
+	0x980E,0x98AC,0x98F6,0x98F9,0x99AF,0x99B2,0x99B0,0x99B5,/* 0xE0-0xE7 */
+	0x9AAD,0x9AAB,0x9B5B,0x9CEA,0x9CED,0x9CE7,0x9E80,0x9EFD,/* 0xE8-0xEF */
+	0x50E6,0x50D4,0x50D7,0x50E8,0x50F3,0x50DB,0x50EA,0x50DD,/* 0xF0-0xF7 */
+	0x50E4,0x50D3,0x50EC,0x50F0,0x50EF,0x50E3,0x50E0,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x51D8,0x5280,0x5281,0x52E9,0x52EB,0x5330,0x53AC,0x5627,/* 0x40-0x47 */
+	0x5615,0x560C,0x5612,0x55FC,0x560F,0x561C,0x5601,0x5613,/* 0x48-0x4F */
+	0x5602,0x55FA,0x561D,0x5604,0x55FF,0x55F9,0x5889,0x587C,/* 0x50-0x57 */
+	0x5890,0x5898,0x5886,0x5881,0x587F,0x5874,0x588B,0x587A,/* 0x58-0x5F */
+	0x5887,0x5891,0x588E,0x5876,0x5882,0x5888,0x587B,0x5894,/* 0x60-0x67 */
+	0x588F,0x58FE,0x596B,0x5ADC,0x5AEE,0x5AE5,0x5AD5,0x5AEA,/* 0x68-0x6F */
+	0x5ADA,0x5AED,0x5AEB,0x5AF3,0x5AE2,0x5AE0,0x5ADB,0x5AEC,/* 0x70-0x77 */
+	0x5ADE,0x5ADD,0x5AD9,0x5AE8,0x5ADF,0x5B77,0x5BE0,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x5BE3,0x5C63,0x5D82,0x5D80,0x5D7D,0x5D86,0x5D7A,/* 0xA0-0xA7 */
+	0x5D81,0x5D77,0x5D8A,0x5D89,0x5D88,0x5D7E,0x5D7C,0x5D8D,/* 0xA8-0xAF */
+	0x5D79,0x5D7F,0x5E58,0x5E59,0x5E53,0x5ED8,0x5ED1,0x5ED7,/* 0xB0-0xB7 */
+	0x5ECE,0x5EDC,0x5ED5,0x5ED9,0x5ED2,0x5ED4,0x5F44,0x5F43,/* 0xB8-0xBF */
+	0x5F6F,0x5FB6,0x612C,0x6128,0x6141,0x615E,0x6171,0x6173,/* 0xC0-0xC7 */
+	0x6152,0x6153,0x6172,0x616C,0x6180,0x6174,0x6154,0x617A,/* 0xC8-0xCF */
+	0x615B,0x6165,0x613B,0x616A,0x6161,0x6156,0x6229,0x6227,/* 0xD0-0xD7 */
+	0x622B,0x642B,0x644D,0x645B,0x645D,0x6474,0x6476,0x6472,/* 0xD8-0xDF */
+	0x6473,0x647D,0x6475,0x6466,0x64A6,0x644E,0x6482,0x645E,/* 0xE0-0xE7 */
+	0x645C,0x644B,0x6453,0x6460,0x6450,0x647F,0x643F,0x646C,/* 0xE8-0xEF */
+	0x646B,0x6459,0x6465,0x6477,0x6573,0x65A0,0x66A1,0x66A0,/* 0xF0-0xF7 */
+	0x669F,0x6705,0x6704,0x6722,0x69B1,0x69B6,0x69C9,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x69A0,0x69CE,0x6996,0x69B0,0x69AC,0x69BC,0x6991,0x6999,/* 0x40-0x47 */
+	0x698E,0x69A7,0x698D,0x69A9,0x69BE,0x69AF,0x69BF,0x69C4,/* 0x48-0x4F */
+	0x69BD,0x69A4,0x69D4,0x69B9,0x69CA,0x699A,0x69CF,0x69B3,/* 0x50-0x57 */
+	0x6993,0x69AA,0x69A1,0x699E,0x69D9,0x6997,0x6990,0x69C2,/* 0x58-0x5F */
+	0x69B5,0x69A5,0x69C6,0x6B4A,0x6B4D,0x6B4B,0x6B9E,0x6B9F,/* 0x60-0x67 */
+	0x6BA0,0x6BC3,0x6BC4,0x6BFE,0x6ECE,0x6EF5,0x6EF1,0x6F03,/* 0x68-0x6F */
+	0x6F25,0x6EF8,0x6F37,0x6EFB,0x6F2E,0x6F09,0x6F4E,0x6F19,/* 0x70-0x77 */
+	0x6F1A,0x6F27,0x6F18,0x6F3B,0x6F12,0x6EED,0x6F0A,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x6F36,0x6F73,0x6EF9,0x6EEE,0x6F2D,0x6F40,0x6F30,/* 0xA0-0xA7 */
+	0x6F3C,0x6F35,0x6EEB,0x6F07,0x6F0E,0x6F43,0x6F05,0x6EFD,/* 0xA8-0xAF */
+	0x6EF6,0x6F39,0x6F1C,0x6EFC,0x6F3A,0x6F1F,0x6F0D,0x6F1E,/* 0xB0-0xB7 */
+	0x6F08,0x6F21,0x7187,0x7190,0x7189,0x7180,0x7185,0x7182,/* 0xB8-0xBF */
+	0x718F,0x717B,0x7186,0x7181,0x7197,0x7244,0x7253,0x7297,/* 0xC0-0xC7 */
+	0x7295,0x7293,0x7343,0x734D,0x7351,0x734C,0x7462,0x7473,/* 0xC8-0xCF */
+	0x7471,0x7475,0x7472,0x7467,0x746E,0x7500,0x7502,0x7503,/* 0xD0-0xD7 */
+	0x757D,0x7590,0x7616,0x7608,0x760C,0x7615,0x7611,0x760A,/* 0xD8-0xDF */
+	0x7614,0x76B8,0x7781,0x777C,0x7785,0x7782,0x776E,0x7780,/* 0xE0-0xE7 */
+	0x776F,0x777E,0x7783,0x78B2,0x78AA,0x78B4,0x78AD,0x78A8,/* 0xE8-0xEF */
+	0x787E,0x78AB,0x789E,0x78A5,0x78A0,0x78AC,0x78A2,0x78A4,/* 0xF0-0xF7 */
+	0x7998,0x798A,0x798B,0x7996,0x7995,0x7994,0x7993,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7997,0x7988,0x7992,0x7990,0x7A2B,0x7A4A,0x7A30,0x7A2F,/* 0x40-0x47 */
+	0x7A28,0x7A26,0x7AA8,0x7AAB,0x7AAC,0x7AEE,0x7B88,0x7B9C,/* 0x48-0x4F */
+	0x7B8A,0x7B91,0x7B90,0x7B96,0x7B8D,0x7B8C,0x7B9B,0x7B8E,/* 0x50-0x57 */
+	0x7B85,0x7B98,0x5284,0x7B99,0x7BA4,0x7B82,0x7CBB,0x7CBF,/* 0x58-0x5F */
+	0x7CBC,0x7CBA,0x7DA7,0x7DB7,0x7DC2,0x7DA3,0x7DAA,0x7DC1,/* 0x60-0x67 */
+	0x7DC0,0x7DC5,0x7D9D,0x7DCE,0x7DC4,0x7DC6,0x7DCB,0x7DCC,/* 0x68-0x6F */
+	0x7DAF,0x7DB9,0x7D96,0x7DBC,0x7D9F,0x7DA6,0x7DAE,0x7DA9,/* 0x70-0x77 */
+	0x7DA1,0x7DC9,0x7F73,0x7FE2,0x7FE3,0x7FE5,0x7FDE,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8024,0x805D,0x805C,0x8189,0x8186,0x8183,0x8187,/* 0xA0-0xA7 */
+	0x818D,0x818C,0x818B,0x8215,0x8497,0x84A4,0x84A1,0x849F,/* 0xA8-0xAF */
+	0x84BA,0x84CE,0x84C2,0x84AC,0x84AE,0x84AB,0x84B9,0x84B4,/* 0xB0-0xB7 */
+	0x84C1,0x84CD,0x84AA,0x849A,0x84B1,0x84D0,0x849D,0x84A7,/* 0xB8-0xBF */
+	0x84BB,0x84A2,0x8494,0x84C7,0x84CC,0x849B,0x84A9,0x84AF,/* 0xC0-0xC7 */
+	0x84A8,0x84D6,0x8498,0x84B6,0x84CF,0x84A0,0x84D7,0x84D4,/* 0xC8-0xCF */
+	0x84D2,0x84DB,0x84B0,0x8491,0x8661,0x8733,0x8723,0x8728,/* 0xD0-0xD7 */
+	0x876B,0x8740,0x872E,0x871E,0x8721,0x8719,0x871B,0x8743,/* 0xD8-0xDF */
+	0x872C,0x8741,0x873E,0x8746,0x8720,0x8732,0x872A,0x872D,/* 0xE0-0xE7 */
+	0x873C,0x8712,0x873A,0x8731,0x8735,0x8742,0x8726,0x8727,/* 0xE8-0xEF */
+	0x8738,0x8724,0x871A,0x8730,0x8711,0x88F7,0x88E7,0x88F1,/* 0xF0-0xF7 */
+	0x88F2,0x88FA,0x88FE,0x88EE,0x88FC,0x88F6,0x88FB,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x88F0,0x88EC,0x88EB,0x899D,0x89A1,0x899F,0x899E,0x89E9,/* 0x40-0x47 */
+	0x89EB,0x89E8,0x8AAB,0x8A99,0x8A8B,0x8A92,0x8A8F,0x8A96,/* 0x48-0x4F */
+	0x8C3D,0x8C68,0x8C69,0x8CD5,0x8CCF,0x8CD7,0x8D96,0x8E09,/* 0x50-0x57 */
+	0x8E02,0x8DFF,0x8E0D,0x8DFD,0x8E0A,0x8E03,0x8E07,0x8E06,/* 0x58-0x5F */
+	0x8E05,0x8DFE,0x8E00,0x8E04,0x8F10,0x8F11,0x8F0E,0x8F0D,/* 0x60-0x67 */
+	0x9123,0x911C,0x9120,0x9122,0x911F,0x911D,0x911A,0x9124,/* 0x68-0x6F */
+	0x9121,0x911B,0x917A,0x9172,0x9179,0x9173,0x92A5,0x92A4,/* 0x70-0x77 */
+	0x9276,0x929B,0x927A,0x92A0,0x9294,0x92AA,0x928D,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x92A6,0x929A,0x92AB,0x9279,0x9297,0x927F,0x92A3,/* 0xA0-0xA7 */
+	0x92EE,0x928E,0x9282,0x9295,0x92A2,0x927D,0x9288,0x92A1,/* 0xA8-0xAF */
+	0x928A,0x9286,0x928C,0x9299,0x92A7,0x927E,0x9287,0x92A9,/* 0xB0-0xB7 */
+	0x929D,0x928B,0x922D,0x969E,0x96A1,0x96FF,0x9758,0x977D,/* 0xB8-0xBF */
+	0x977A,0x977E,0x9783,0x9780,0x9782,0x977B,0x9784,0x9781,/* 0xC0-0xC7 */
+	0x977F,0x97CE,0x97CD,0x9816,0x98AD,0x98AE,0x9902,0x9900,/* 0xC8-0xCF */
+	0x9907,0x999D,0x999C,0x99C3,0x99B9,0x99BB,0x99BA,0x99C2,/* 0xD0-0xD7 */
+	0x99BD,0x99C7,0x9AB1,0x9AE3,0x9AE7,0x9B3E,0x9B3F,0x9B60,/* 0xD8-0xDF */
+	0x9B61,0x9B5F,0x9CF1,0x9CF2,0x9CF5,0x9EA7,0x50FF,0x5103,/* 0xE0-0xE7 */
+	0x5130,0x50F8,0x5106,0x5107,0x50F6,0x50FE,0x510B,0x510C,/* 0xE8-0xEF */
+	0x50FD,0x510A,0x528B,0x528C,0x52F1,0x52EF,0x5648,0x5642,/* 0xF0-0xF7 */
+	0x564C,0x5635,0x5641,0x564A,0x5649,0x5646,0x5658,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x565A,0x5640,0x5633,0x563D,0x562C,0x563E,0x5638,0x562A,/* 0x40-0x47 */
+	0x563A,0x571A,0x58AB,0x589D,0x58B1,0x58A0,0x58A3,0x58AF,/* 0x48-0x4F */
+	0x58AC,0x58A5,0x58A1,0x58FF,0x5AFF,0x5AF4,0x5AFD,0x5AF7,/* 0x50-0x57 */
+	0x5AF6,0x5B03,0x5AF8,0x5B02,0x5AF9,0x5B01,0x5B07,0x5B05,/* 0x58-0x5F */
+	0x5B0F,0x5C67,0x5D99,0x5D97,0x5D9F,0x5D92,0x5DA2,0x5D93,/* 0x60-0x67 */
+	0x5D95,0x5DA0,0x5D9C,0x5DA1,0x5D9A,0x5D9E,0x5E69,0x5E5D,/* 0x68-0x6F */
+	0x5E60,0x5E5C,0x7DF3,0x5EDB,0x5EDE,0x5EE1,0x5F49,0x5FB2,/* 0x70-0x77 */
+	0x618B,0x6183,0x6179,0x61B1,0x61B0,0x61A2,0x6189,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x619B,0x6193,0x61AF,0x61AD,0x619F,0x6192,0x61AA,/* 0xA0-0xA7 */
+	0x61A1,0x618D,0x6166,0x61B3,0x622D,0x646E,0x6470,0x6496,/* 0xA8-0xAF */
+	0x64A0,0x6485,0x6497,0x649C,0x648F,0x648B,0x648A,0x648C,/* 0xB0-0xB7 */
+	0x64A3,0x649F,0x6468,0x64B1,0x6498,0x6576,0x657A,0x6579,/* 0xB8-0xBF */
+	0x657B,0x65B2,0x65B3,0x66B5,0x66B0,0x66A9,0x66B2,0x66B7,/* 0xC0-0xC7 */
+	0x66AA,0x66AF,0x6A00,0x6A06,0x6A17,0x69E5,0x69F8,0x6A15,/* 0xC8-0xCF */
+	0x69F1,0x69E4,0x6A20,0x69FF,0x69EC,0x69E2,0x6A1B,0x6A1D,/* 0xD0-0xD7 */
+	0x69FE,0x6A27,0x69F2,0x69EE,0x6A14,0x69F7,0x69E7,0x6A40,/* 0xD8-0xDF */
+	0x6A08,0x69E6,0x69FB,0x6A0D,0x69FC,0x69EB,0x6A09,0x6A04,/* 0xE0-0xE7 */
+	0x6A18,0x6A25,0x6A0F,0x69F6,0x6A26,0x6A07,0x69F4,0x6A16,/* 0xE8-0xEF */
+	0x6B51,0x6BA5,0x6BA3,0x6BA2,0x6BA6,0x6C01,0x6C00,0x6BFF,/* 0xF0-0xF7 */
+	0x6C02,0x6F41,0x6F26,0x6F7E,0x6F87,0x6FC6,0x6F92,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6F8D,0x6F89,0x6F8C,0x6F62,0x6F4F,0x6F85,0x6F5A,0x6F96,/* 0x40-0x47 */
+	0x6F76,0x6F6C,0x6F82,0x6F55,0x6F72,0x6F52,0x6F50,0x6F57,/* 0x48-0x4F */
+	0x6F94,0x6F93,0x6F5D,0x6F00,0x6F61,0x6F6B,0x6F7D,0x6F67,/* 0x50-0x57 */
+	0x6F90,0x6F53,0x6F8B,0x6F69,0x6F7F,0x6F95,0x6F63,0x6F77,/* 0x58-0x5F */
+	0x6F6A,0x6F7B,0x71B2,0x71AF,0x719B,0x71B0,0x71A0,0x719A,/* 0x60-0x67 */
+	0x71A9,0x71B5,0x719D,0x71A5,0x719E,0x71A4,0x71A1,0x71AA,/* 0x68-0x6F */
+	0x719C,0x71A7,0x71B3,0x7298,0x729A,0x7358,0x7352,0x735E,/* 0x70-0x77 */
+	0x735F,0x7360,0x735D,0x735B,0x7361,0x735A,0x7359,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7362,0x7487,0x7489,0x748A,0x7486,0x7481,0x747D,/* 0xA0-0xA7 */
+	0x7485,0x7488,0x747C,0x7479,0x7508,0x7507,0x757E,0x7625,/* 0xA8-0xAF */
+	0x761E,0x7619,0x761D,0x761C,0x7623,0x761A,0x7628,0x761B,/* 0xB0-0xB7 */
+	0x769C,0x769D,0x769E,0x769B,0x778D,0x778F,0x7789,0x7788,/* 0xB8-0xBF */
+	0x78CD,0x78BB,0x78CF,0x78CC,0x78D1,0x78CE,0x78D4,0x78C8,/* 0xC0-0xC7 */
+	0x78C3,0x78C4,0x78C9,0x799A,0x79A1,0x79A0,0x799C,0x79A2,/* 0xC8-0xCF */
+	0x799B,0x6B76,0x7A39,0x7AB2,0x7AB4,0x7AB3,0x7BB7,0x7BCB,/* 0xD0-0xD7 */
+	0x7BBE,0x7BAC,0x7BCE,0x7BAF,0x7BB9,0x7BCA,0x7BB5,0x7CC5,/* 0xD8-0xDF */
+	0x7CC8,0x7CCC,0x7CCB,0x7DF7,0x7DDB,0x7DEA,0x7DE7,0x7DD7,/* 0xE0-0xE7 */
+	0x7DE1,0x7E03,0x7DFA,0x7DE6,0x7DF6,0x7DF1,0x7DF0,0x7DEE,/* 0xE8-0xEF */
+	0x7DDF,0x7F76,0x7FAC,0x7FB0,0x7FAD,0x7FED,0x7FEB,0x7FEA,/* 0xF0-0xF7 */
+	0x7FEC,0x7FE6,0x7FE8,0x8064,0x8067,0x81A3,0x819F,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x819E,0x8195,0x81A2,0x8199,0x8197,0x8216,0x824F,0x8253,/* 0x40-0x47 */
+	0x8252,0x8250,0x824E,0x8251,0x8524,0x853B,0x850F,0x8500,/* 0x48-0x4F */
+	0x8529,0x850E,0x8509,0x850D,0x851F,0x850A,0x8527,0x851C,/* 0x50-0x57 */
+	0x84FB,0x852B,0x84FA,0x8508,0x850C,0x84F4,0x852A,0x84F2,/* 0x58-0x5F */
+	0x8515,0x84F7,0x84EB,0x84F3,0x84FC,0x8512,0x84EA,0x84E9,/* 0x60-0x67 */
+	0x8516,0x84FE,0x8528,0x851D,0x852E,0x8502,0x84FD,0x851E,/* 0x68-0x6F */
+	0x84F6,0x8531,0x8526,0x84E7,0x84E8,0x84F0,0x84EF,0x84F9,/* 0x70-0x77 */
+	0x8518,0x8520,0x8530,0x850B,0x8519,0x852F,0x8662,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8756,0x8763,0x8764,0x8777,0x87E1,0x8773,0x8758,/* 0xA0-0xA7 */
+	0x8754,0x875B,0x8752,0x8761,0x875A,0x8751,0x875E,0x876D,/* 0xA8-0xAF */
+	0x876A,0x8750,0x874E,0x875F,0x875D,0x876F,0x876C,0x877A,/* 0xB0-0xB7 */
+	0x876E,0x875C,0x8765,0x874F,0x877B,0x8775,0x8762,0x8767,/* 0xB8-0xBF */
+	0x8769,0x885A,0x8905,0x890C,0x8914,0x890B,0x8917,0x8918,/* 0xC0-0xC7 */
+	0x8919,0x8906,0x8916,0x8911,0x890E,0x8909,0x89A2,0x89A4,/* 0xC8-0xCF */
+	0x89A3,0x89ED,0x89F0,0x89EC,0x8ACF,0x8AC6,0x8AB8,0x8AD3,/* 0xD0-0xD7 */
+	0x8AD1,0x8AD4,0x8AD5,0x8ABB,0x8AD7,0x8ABE,0x8AC0,0x8AC5,/* 0xD8-0xDF */
+	0x8AD8,0x8AC3,0x8ABA,0x8ABD,0x8AD9,0x8C3E,0x8C4D,0x8C8F,/* 0xE0-0xE7 */
+	0x8CE5,0x8CDF,0x8CD9,0x8CE8,0x8CDA,0x8CDD,0x8CE7,0x8DA0,/* 0xE8-0xEF */
+	0x8D9C,0x8DA1,0x8D9B,0x8E20,0x8E23,0x8E25,0x8E24,0x8E2E,/* 0xF0-0xF7 */
+	0x8E15,0x8E1B,0x8E16,0x8E11,0x8E19,0x8E26,0x8E27,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8E14,0x8E12,0x8E18,0x8E13,0x8E1C,0x8E17,0x8E1A,0x8F2C,/* 0x40-0x47 */
+	0x8F24,0x8F18,0x8F1A,0x8F20,0x8F23,0x8F16,0x8F17,0x9073,/* 0x48-0x4F */
+	0x9070,0x906F,0x9067,0x906B,0x912F,0x912B,0x9129,0x912A,/* 0x50-0x57 */
+	0x9132,0x9126,0x912E,0x9185,0x9186,0x918A,0x9181,0x9182,/* 0x58-0x5F */
+	0x9184,0x9180,0x92D0,0x92C3,0x92C4,0x92C0,0x92D9,0x92B6,/* 0x60-0x67 */
+	0x92CF,0x92F1,0x92DF,0x92D8,0x92E9,0x92D7,0x92DD,0x92CC,/* 0x68-0x6F */
+	0x92EF,0x92C2,0x92E8,0x92CA,0x92C8,0x92CE,0x92E6,0x92CD,/* 0x70-0x77 */
+	0x92D5,0x92C9,0x92E0,0x92DE,0x92E7,0x92D1,0x92D3,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x92B5,0x92E1,0x92C6,0x92B4,0x957C,0x95AC,0x95AB,/* 0xA0-0xA7 */
+	0x95AE,0x95B0,0x96A4,0x96A2,0x96D3,0x9705,0x9708,0x9702,/* 0xA8-0xAF */
+	0x975A,0x978A,0x978E,0x9788,0x97D0,0x97CF,0x981E,0x981D,/* 0xB0-0xB7 */
+	0x9826,0x9829,0x9828,0x9820,0x981B,0x9827,0x98B2,0x9908,/* 0xB8-0xBF */
+	0x98FA,0x9911,0x9914,0x9916,0x9917,0x9915,0x99DC,0x99CD,/* 0xC0-0xC7 */
+	0x99CF,0x99D3,0x99D4,0x99CE,0x99C9,0x99D6,0x99D8,0x99CB,/* 0xC8-0xCF */
+	0x99D7,0x99CC,0x9AB3,0x9AEC,0x9AEB,0x9AF3,0x9AF2,0x9AF1,/* 0xD0-0xD7 */
+	0x9B46,0x9B43,0x9B67,0x9B74,0x9B71,0x9B66,0x9B76,0x9B75,/* 0xD8-0xDF */
+	0x9B70,0x9B68,0x9B64,0x9B6C,0x9CFC,0x9CFA,0x9CFD,0x9CFF,/* 0xE0-0xE7 */
+	0x9CF7,0x9D07,0x9D00,0x9CF9,0x9CFB,0x9D08,0x9D05,0x9D04,/* 0xE8-0xEF */
+	0x9E83,0x9ED3,0x9F0F,0x9F10,0x511C,0x5113,0x5117,0x511A,/* 0xF0-0xF7 */
+	0x5111,0x51DE,0x5334,0x53E1,0x5670,0x5660,0x566E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_E9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5673,0x5666,0x5663,0x566D,0x5672,0x565E,0x5677,0x571C,/* 0x40-0x47 */
+	0x571B,0x58C8,0x58BD,0x58C9,0x58BF,0x58BA,0x58C2,0x58BC,/* 0x48-0x4F */
+	0x58C6,0x5B17,0x5B19,0x5B1B,0x5B21,0x5B14,0x5B13,0x5B10,/* 0x50-0x57 */
+	0x5B16,0x5B28,0x5B1A,0x5B20,0x5B1E,0x5BEF,0x5DAC,0x5DB1,/* 0x58-0x5F */
+	0x5DA9,0x5DA7,0x5DB5,0x5DB0,0x5DAE,0x5DAA,0x5DA8,0x5DB2,/* 0x60-0x67 */
+	0x5DAD,0x5DAF,0x5DB4,0x5E67,0x5E68,0x5E66,0x5E6F,0x5EE9,/* 0x68-0x6F */
+	0x5EE7,0x5EE6,0x5EE8,0x5EE5,0x5F4B,0x5FBC,0x619D,0x61A8,/* 0x70-0x77 */
+	0x6196,0x61C5,0x61B4,0x61C6,0x61C1,0x61CC,0x61BA,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x61BF,0x61B8,0x618C,0x64D7,0x64D6,0x64D0,0x64CF,/* 0xA0-0xA7 */
+	0x64C9,0x64BD,0x6489,0x64C3,0x64DB,0x64F3,0x64D9,0x6533,/* 0xA8-0xAF */
+	0x657F,0x657C,0x65A2,0x66C8,0x66BE,0x66C0,0x66CA,0x66CB,/* 0xB0-0xB7 */
+	0x66CF,0x66BD,0x66BB,0x66BA,0x66CC,0x6723,0x6A34,0x6A66,/* 0xB8-0xBF */
+	0x6A49,0x6A67,0x6A32,0x6A68,0x6A3E,0x6A5D,0x6A6D,0x6A76,/* 0xC0-0xC7 */
+	0x6A5B,0x6A51,0x6A28,0x6A5A,0x6A3B,0x6A3F,0x6A41,0x6A6A,/* 0xC8-0xCF */
+	0x6A64,0x6A50,0x6A4F,0x6A54,0x6A6F,0x6A69,0x6A60,0x6A3C,/* 0xD0-0xD7 */
+	0x6A5E,0x6A56,0x6A55,0x6A4D,0x6A4E,0x6A46,0x6B55,0x6B54,/* 0xD8-0xDF */
+	0x6B56,0x6BA7,0x6BAA,0x6BAB,0x6BC8,0x6BC7,0x6C04,0x6C03,/* 0xE0-0xE7 */
+	0x6C06,0x6FAD,0x6FCB,0x6FA3,0x6FC7,0x6FBC,0x6FCE,0x6FC8,/* 0xE8-0xEF */
+	0x6F5E,0x6FC4,0x6FBD,0x6F9E,0x6FCA,0x6FA8,0x7004,0x6FA5,/* 0xF0-0xF7 */
+	0x6FAE,0x6FBA,0x6FAC,0x6FAA,0x6FCF,0x6FBF,0x6FB8,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EA[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6FA2,0x6FC9,0x6FAB,0x6FCD,0x6FAF,0x6FB2,0x6FB0,0x71C5,/* 0x40-0x47 */
+	0x71C2,0x71BF,0x71B8,0x71D6,0x71C0,0x71C1,0x71CB,0x71D4,/* 0x48-0x4F */
+	0x71CA,0x71C7,0x71CF,0x71BD,0x71D8,0x71BC,0x71C6,0x71DA,/* 0x50-0x57 */
+	0x71DB,0x729D,0x729E,0x7369,0x7366,0x7367,0x736C,0x7365,/* 0x58-0x5F */
+	0x736B,0x736A,0x747F,0x749A,0x74A0,0x7494,0x7492,0x7495,/* 0x60-0x67 */
+	0x74A1,0x750B,0x7580,0x762F,0x762D,0x7631,0x763D,0x7633,/* 0x68-0x6F */
+	0x763C,0x7635,0x7632,0x7630,0x76BB,0x76E6,0x779A,0x779D,/* 0x70-0x77 */
+	0x77A1,0x779C,0x779B,0x77A2,0x77A3,0x7795,0x7799,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7797,0x78DD,0x78E9,0x78E5,0x78EA,0x78DE,0x78E3,/* 0xA0-0xA7 */
+	0x78DB,0x78E1,0x78E2,0x78ED,0x78DF,0x78E0,0x79A4,0x7A44,/* 0xA8-0xAF */
+	0x7A48,0x7A47,0x7AB6,0x7AB8,0x7AB5,0x7AB1,0x7AB7,0x7BDE,/* 0xB0-0xB7 */
+	0x7BE3,0x7BE7,0x7BDD,0x7BD5,0x7BE5,0x7BDA,0x7BE8,0x7BF9,/* 0xB8-0xBF */
+	0x7BD4,0x7BEA,0x7BE2,0x7BDC,0x7BEB,0x7BD8,0x7BDF,0x7CD2,/* 0xC0-0xC7 */
+	0x7CD4,0x7CD7,0x7CD0,0x7CD1,0x7E12,0x7E21,0x7E17,0x7E0C,/* 0xC8-0xCF */
+	0x7E1F,0x7E20,0x7E13,0x7E0E,0x7E1C,0x7E15,0x7E1A,0x7E22,/* 0xD0-0xD7 */
+	0x7E0B,0x7E0F,0x7E16,0x7E0D,0x7E14,0x7E25,0x7E24,0x7F43,/* 0xD8-0xDF */
+	0x7F7B,0x7F7C,0x7F7A,0x7FB1,0x7FEF,0x802A,0x8029,0x806C,/* 0xE0-0xE7 */
+	0x81B1,0x81A6,0x81AE,0x81B9,0x81B5,0x81AB,0x81B0,0x81AC,/* 0xE8-0xEF */
+	0x81B4,0x81B2,0x81B7,0x81A7,0x81F2,0x8255,0x8256,0x8257,/* 0xF0-0xF7 */
+	0x8556,0x8545,0x856B,0x854D,0x8553,0x8561,0x8558,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EB[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8540,0x8546,0x8564,0x8541,0x8562,0x8544,0x8551,0x8547,/* 0x40-0x47 */
+	0x8563,0x853E,0x855B,0x8571,0x854E,0x856E,0x8575,0x8555,/* 0x48-0x4F */
+	0x8567,0x8560,0x858C,0x8566,0x855D,0x8554,0x8565,0x856C,/* 0x50-0x57 */
+	0x8663,0x8665,0x8664,0x879B,0x878F,0x8797,0x8793,0x8792,/* 0x58-0x5F */
+	0x8788,0x8781,0x8796,0x8798,0x8779,0x8787,0x87A3,0x8785,/* 0x60-0x67 */
+	0x8790,0x8791,0x879D,0x8784,0x8794,0x879C,0x879A,0x8789,/* 0x68-0x6F */
+	0x891E,0x8926,0x8930,0x892D,0x892E,0x8927,0x8931,0x8922,/* 0x70-0x77 */
+	0x8929,0x8923,0x892F,0x892C,0x891F,0x89F1,0x8AE0,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8AE2,0x8AF2,0x8AF4,0x8AF5,0x8ADD,0x8B14,0x8AE4,/* 0xA0-0xA7 */
+	0x8ADF,0x8AF0,0x8AC8,0x8ADE,0x8AE1,0x8AE8,0x8AFF,0x8AEF,/* 0xA8-0xAF */
+	0x8AFB,0x8C91,0x8C92,0x8C90,0x8CF5,0x8CEE,0x8CF1,0x8CF0,/* 0xB0-0xB7 */
+	0x8CF3,0x8D6C,0x8D6E,0x8DA5,0x8DA7,0x8E33,0x8E3E,0x8E38,/* 0xB8-0xBF */
+	0x8E40,0x8E45,0x8E36,0x8E3C,0x8E3D,0x8E41,0x8E30,0x8E3F,/* 0xC0-0xC7 */
+	0x8EBD,0x8F36,0x8F2E,0x8F35,0x8F32,0x8F39,0x8F37,0x8F34,/* 0xC8-0xCF */
+	0x9076,0x9079,0x907B,0x9086,0x90FA,0x9133,0x9135,0x9136,/* 0xD0-0xD7 */
+	0x9193,0x9190,0x9191,0x918D,0x918F,0x9327,0x931E,0x9308,/* 0xD8-0xDF */
+	0x931F,0x9306,0x930F,0x937A,0x9338,0x933C,0x931B,0x9323,/* 0xE0-0xE7 */
+	0x9312,0x9301,0x9346,0x932D,0x930E,0x930D,0x92CB,0x931D,/* 0xE8-0xEF */
+	0x92FA,0x9325,0x9313,0x92F9,0x92F7,0x9334,0x9302,0x9324,/* 0xF0-0xF7 */
+	0x92FF,0x9329,0x9339,0x9335,0x932A,0x9314,0x930C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EC[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x930B,0x92FE,0x9309,0x9300,0x92FB,0x9316,0x95BC,0x95CD,/* 0x40-0x47 */
+	0x95BE,0x95B9,0x95BA,0x95B6,0x95BF,0x95B5,0x95BD,0x96A9,/* 0x48-0x4F */
+	0x96D4,0x970B,0x9712,0x9710,0x9799,0x9797,0x9794,0x97F0,/* 0x50-0x57 */
+	0x97F8,0x9835,0x982F,0x9832,0x9924,0x991F,0x9927,0x9929,/* 0x58-0x5F */
+	0x999E,0x99EE,0x99EC,0x99E5,0x99E4,0x99F0,0x99E3,0x99EA,/* 0x60-0x67 */
+	0x99E9,0x99E7,0x9AB9,0x9ABF,0x9AB4,0x9ABB,0x9AF6,0x9AFA,/* 0x68-0x6F */
+	0x9AF9,0x9AF7,0x9B33,0x9B80,0x9B85,0x9B87,0x9B7C,0x9B7E,/* 0x70-0x77 */
+	0x9B7B,0x9B82,0x9B93,0x9B92,0x9B90,0x9B7A,0x9B95,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9B7D,0x9B88,0x9D25,0x9D17,0x9D20,0x9D1E,0x9D14,/* 0xA0-0xA7 */
+	0x9D29,0x9D1D,0x9D18,0x9D22,0x9D10,0x9D19,0x9D1F,0x9E88,/* 0xA8-0xAF */
+	0x9E86,0x9E87,0x9EAE,0x9EAD,0x9ED5,0x9ED6,0x9EFA,0x9F12,/* 0xB0-0xB7 */
+	0x9F3D,0x5126,0x5125,0x5122,0x5124,0x5120,0x5129,0x52F4,/* 0xB8-0xBF */
+	0x5693,0x568C,0x568D,0x5686,0x5684,0x5683,0x567E,0x5682,/* 0xC0-0xC7 */
+	0x567F,0x5681,0x58D6,0x58D4,0x58CF,0x58D2,0x5B2D,0x5B25,/* 0xC8-0xCF */
+	0x5B32,0x5B23,0x5B2C,0x5B27,0x5B26,0x5B2F,0x5B2E,0x5B7B,/* 0xD0-0xD7 */
+	0x5BF1,0x5BF2,0x5DB7,0x5E6C,0x5E6A,0x5FBE,0x5FBB,0x61C3,/* 0xD8-0xDF */
+	0x61B5,0x61BC,0x61E7,0x61E0,0x61E5,0x61E4,0x61E8,0x61DE,/* 0xE0-0xE7 */
+	0x64EF,0x64E9,0x64E3,0x64EB,0x64E4,0x64E8,0x6581,0x6580,/* 0xE8-0xEF */
+	0x65B6,0x65DA,0x66D2,0x6A8D,0x6A96,0x6A81,0x6AA5,0x6A89,/* 0xF0-0xF7 */
+	0x6A9F,0x6A9B,0x6AA1,0x6A9E,0x6A87,0x6A93,0x6A8E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_ED[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x6A95,0x6A83,0x6AA8,0x6AA4,0x6A91,0x6A7F,0x6AA6,0x6A9A,/* 0x40-0x47 */
+	0x6A85,0x6A8C,0x6A92,0x6B5B,0x6BAD,0x6C09,0x6FCC,0x6FA9,/* 0x48-0x4F */
+	0x6FF4,0x6FD4,0x6FE3,0x6FDC,0x6FED,0x6FE7,0x6FE6,0x6FDE,/* 0x50-0x57 */
+	0x6FF2,0x6FDD,0x6FE2,0x6FE8,0x71E1,0x71F1,0x71E8,0x71F2,/* 0x58-0x5F */
+	0x71E4,0x71F0,0x71E2,0x7373,0x736E,0x736F,0x7497,0x74B2,/* 0x60-0x67 */
+	0x74AB,0x7490,0x74AA,0x74AD,0x74B1,0x74A5,0x74AF,0x7510,/* 0x68-0x6F */
+	0x7511,0x7512,0x750F,0x7584,0x7643,0x7648,0x7649,0x7647,/* 0x70-0x77 */
+	0x76A4,0x76E9,0x77B5,0x77AB,0x77B2,0x77B7,0x77B6,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x77B4,0x77B1,0x77A8,0x77F0,0x78F3,0x78FD,0x7902,/* 0xA0-0xA7 */
+	0x78FB,0x78FC,0x78F2,0x7905,0x78F9,0x78FE,0x7904,0x79AB,/* 0xA8-0xAF */
+	0x79A8,0x7A5C,0x7A5B,0x7A56,0x7A58,0x7A54,0x7A5A,0x7ABE,/* 0xB0-0xB7 */
+	0x7AC0,0x7AC1,0x7C05,0x7C0F,0x7BF2,0x7C00,0x7BFF,0x7BFB,/* 0xB8-0xBF */
+	0x7C0E,0x7BF4,0x7C0B,0x7BF3,0x7C02,0x7C09,0x7C03,0x7C01,/* 0xC0-0xC7 */
+	0x7BF8,0x7BFD,0x7C06,0x7BF0,0x7BF1,0x7C10,0x7C0A,0x7CE8,/* 0xC8-0xCF */
+	0x7E2D,0x7E3C,0x7E42,0x7E33,0x9848,0x7E38,0x7E2A,0x7E49,/* 0xD0-0xD7 */
+	0x7E40,0x7E47,0x7E29,0x7E4C,0x7E30,0x7E3B,0x7E36,0x7E44,/* 0xD8-0xDF */
+	0x7E3A,0x7F45,0x7F7F,0x7F7E,0x7F7D,0x7FF4,0x7FF2,0x802C,/* 0xE0-0xE7 */
+	0x81BB,0x81C4,0x81CC,0x81CA,0x81C5,0x81C7,0x81BC,0x81E9,/* 0xE8-0xEF */
+	0x825B,0x825A,0x825C,0x8583,0x8580,0x858F,0x85A7,0x8595,/* 0xF0-0xF7 */
+	0x85A0,0x858B,0x85A3,0x857B,0x85A4,0x859A,0x859E,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EE[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8577,0x857C,0x8589,0x85A1,0x857A,0x8578,0x8557,0x858E,/* 0x40-0x47 */
+	0x8596,0x8586,0x858D,0x8599,0x859D,0x8581,0x85A2,0x8582,/* 0x48-0x4F */
+	0x8588,0x8585,0x8579,0x8576,0x8598,0x8590,0x859F,0x8668,/* 0x50-0x57 */
+	0x87BE,0x87AA,0x87AD,0x87C5,0x87B0,0x87AC,0x87B9,0x87B5,/* 0x58-0x5F */
+	0x87BC,0x87AE,0x87C9,0x87C3,0x87C2,0x87CC,0x87B7,0x87AF,/* 0x60-0x67 */
+	0x87C4,0x87CA,0x87B4,0x87B6,0x87BF,0x87B8,0x87BD,0x87DE,/* 0x68-0x6F */
+	0x87B2,0x8935,0x8933,0x893C,0x893E,0x8941,0x8952,0x8937,/* 0x70-0x77 */
+	0x8942,0x89AD,0x89AF,0x89AE,0x89F2,0x89F3,0x8B1E,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x8B18,0x8B16,0x8B11,0x8B05,0x8B0B,0x8B22,0x8B0F,/* 0xA0-0xA7 */
+	0x8B12,0x8B15,0x8B07,0x8B0D,0x8B08,0x8B06,0x8B1C,0x8B13,/* 0xA8-0xAF */
+	0x8B1A,0x8C4F,0x8C70,0x8C72,0x8C71,0x8C6F,0x8C95,0x8C94,/* 0xB0-0xB7 */
+	0x8CF9,0x8D6F,0x8E4E,0x8E4D,0x8E53,0x8E50,0x8E4C,0x8E47,/* 0xB8-0xBF */
+	0x8F43,0x8F40,0x9085,0x907E,0x9138,0x919A,0x91A2,0x919B,/* 0xC0-0xC7 */
+	0x9199,0x919F,0x91A1,0x919D,0x91A0,0x93A1,0x9383,0x93AF,/* 0xC8-0xCF */
+	0x9364,0x9356,0x9347,0x937C,0x9358,0x935C,0x9376,0x9349,/* 0xD0-0xD7 */
+	0x9350,0x9351,0x9360,0x936D,0x938F,0x934C,0x936A,0x9379,/* 0xD8-0xDF */
+	0x9357,0x9355,0x9352,0x934F,0x9371,0x9377,0x937B,0x9361,/* 0xE0-0xE7 */
+	0x935E,0x9363,0x9367,0x9380,0x934E,0x9359,0x95C7,0x95C0,/* 0xE8-0xEF */
+	0x95C9,0x95C3,0x95C5,0x95B7,0x96AE,0x96B0,0x96AC,0x9720,/* 0xF0-0xF7 */
+	0x971F,0x9718,0x971D,0x9719,0x979A,0x97A1,0x979C,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_EF[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x979E,0x979D,0x97D5,0x97D4,0x97F1,0x9841,0x9844,0x984A,/* 0x40-0x47 */
+	0x9849,0x9845,0x9843,0x9925,0x992B,0x992C,0x992A,0x9933,/* 0x48-0x4F */
+	0x9932,0x992F,0x992D,0x9931,0x9930,0x9998,0x99A3,0x99A1,/* 0x50-0x57 */
+	0x9A02,0x99FA,0x99F4,0x99F7,0x99F9,0x99F8,0x99F6,0x99FB,/* 0x58-0x5F */
+	0x99FD,0x99FE,0x99FC,0x9A03,0x9ABE,0x9AFE,0x9AFD,0x9B01,/* 0x60-0x67 */
+	0x9AFC,0x9B48,0x9B9A,0x9BA8,0x9B9E,0x9B9B,0x9BA6,0x9BA1,/* 0x68-0x6F */
+	0x9BA5,0x9BA4,0x9B86,0x9BA2,0x9BA0,0x9BAF,0x9D33,0x9D41,/* 0x70-0x77 */
+	0x9D67,0x9D36,0x9D2E,0x9D2F,0x9D31,0x9D38,0x9D30,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9D45,0x9D42,0x9D43,0x9D3E,0x9D37,0x9D40,0x9D3D,/* 0xA0-0xA7 */
+	0x7FF5,0x9D2D,0x9E8A,0x9E89,0x9E8D,0x9EB0,0x9EC8,0x9EDA,/* 0xA8-0xAF */
+	0x9EFB,0x9EFF,0x9F24,0x9F23,0x9F22,0x9F54,0x9FA0,0x5131,/* 0xB0-0xB7 */
+	0x512D,0x512E,0x5698,0x569C,0x5697,0x569A,0x569D,0x5699,/* 0xB8-0xBF */
+	0x5970,0x5B3C,0x5C69,0x5C6A,0x5DC0,0x5E6D,0x5E6E,0x61D8,/* 0xC0-0xC7 */
+	0x61DF,0x61ED,0x61EE,0x61F1,0x61EA,0x61F0,0x61EB,0x61D6,/* 0xC8-0xCF */
+	0x61E9,0x64FF,0x6504,0x64FD,0x64F8,0x6501,0x6503,0x64FC,/* 0xD0-0xD7 */
+	0x6594,0x65DB,0x66DA,0x66DB,0x66D8,0x6AC5,0x6AB9,0x6ABD,/* 0xD8-0xDF */
+	0x6AE1,0x6AC6,0x6ABA,0x6AB6,0x6AB7,0x6AC7,0x6AB4,0x6AAD,/* 0xE0-0xE7 */
+	0x6B5E,0x6BC9,0x6C0B,0x7007,0x700C,0x700D,0x7001,0x7005,/* 0xE8-0xEF */
+	0x7014,0x700E,0x6FFF,0x7000,0x6FFB,0x7026,0x6FFC,0x6FF7,/* 0xF0-0xF7 */
+	0x700A,0x7201,0x71FF,0x71F9,0x7203,0x71FD,0x7376,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F0[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x74B8,0x74C0,0x74B5,0x74C1,0x74BE,0x74B6,0x74BB,0x74C2,/* 0x40-0x47 */
+	0x7514,0x7513,0x765C,0x7664,0x7659,0x7650,0x7653,0x7657,/* 0x48-0x4F */
+	0x765A,0x76A6,0x76BD,0x76EC,0x77C2,0x77BA,0x78FF,0x790C,/* 0x50-0x57 */
+	0x7913,0x7914,0x7909,0x7910,0x7912,0x7911,0x79AD,0x79AC,/* 0x58-0x5F */
+	0x7A5F,0x7C1C,0x7C29,0x7C19,0x7C20,0x7C1F,0x7C2D,0x7C1D,/* 0x60-0x67 */
+	0x7C26,0x7C28,0x7C22,0x7C25,0x7C30,0x7E5C,0x7E50,0x7E56,/* 0x68-0x6F */
+	0x7E63,0x7E58,0x7E62,0x7E5F,0x7E51,0x7E60,0x7E57,0x7E53,/* 0x70-0x77 */
+	0x7FB5,0x7FB3,0x7FF7,0x7FF8,0x8075,0x81D1,0x81D2,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x81D0,0x825F,0x825E,0x85B4,0x85C6,0x85C0,0x85C3,/* 0xA0-0xA7 */
+	0x85C2,0x85B3,0x85B5,0x85BD,0x85C7,0x85C4,0x85BF,0x85CB,/* 0xA8-0xAF */
+	0x85CE,0x85C8,0x85C5,0x85B1,0x85B6,0x85D2,0x8624,0x85B8,/* 0xB0-0xB7 */
+	0x85B7,0x85BE,0x8669,0x87E7,0x87E6,0x87E2,0x87DB,0x87EB,/* 0xB8-0xBF */
+	0x87EA,0x87E5,0x87DF,0x87F3,0x87E4,0x87D4,0x87DC,0x87D3,/* 0xC0-0xC7 */
+	0x87ED,0x87D8,0x87E3,0x87A4,0x87D7,0x87D9,0x8801,0x87F4,/* 0xC8-0xCF */
+	0x87E8,0x87DD,0x8953,0x894B,0x894F,0x894C,0x8946,0x8950,/* 0xD0-0xD7 */
+	0x8951,0x8949,0x8B2A,0x8B27,0x8B23,0x8B33,0x8B30,0x8B35,/* 0xD8-0xDF */
+	0x8B47,0x8B2F,0x8B3C,0x8B3E,0x8B31,0x8B25,0x8B37,0x8B26,/* 0xE0-0xE7 */
+	0x8B36,0x8B2E,0x8B24,0x8B3B,0x8B3D,0x8B3A,0x8C42,0x8C75,/* 0xE8-0xEF */
+	0x8C99,0x8C98,0x8C97,0x8CFE,0x8D04,0x8D02,0x8D00,0x8E5C,/* 0xF0-0xF7 */
+	0x8E62,0x8E60,0x8E57,0x8E56,0x8E5E,0x8E65,0x8E67,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F1[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8E5B,0x8E5A,0x8E61,0x8E5D,0x8E69,0x8E54,0x8F46,0x8F47,/* 0x40-0x47 */
+	0x8F48,0x8F4B,0x9128,0x913A,0x913B,0x913E,0x91A8,0x91A5,/* 0x48-0x4F */
+	0x91A7,0x91AF,0x91AA,0x93B5,0x938C,0x9392,0x93B7,0x939B,/* 0x50-0x57 */
+	0x939D,0x9389,0x93A7,0x938E,0x93AA,0x939E,0x93A6,0x9395,/* 0x58-0x5F */
+	0x9388,0x9399,0x939F,0x938D,0x93B1,0x9391,0x93B2,0x93A4,/* 0x60-0x67 */
+	0x93A8,0x93B4,0x93A3,0x93A5,0x95D2,0x95D3,0x95D1,0x96B3,/* 0x68-0x6F */
+	0x96D7,0x96DA,0x5DC2,0x96DF,0x96D8,0x96DD,0x9723,0x9722,/* 0x70-0x77 */
+	0x9725,0x97AC,0x97AE,0x97A8,0x97AB,0x97A4,0x97AA,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x97A2,0x97A5,0x97D7,0x97D9,0x97D6,0x97D8,0x97FA,/* 0xA0-0xA7 */
+	0x9850,0x9851,0x9852,0x98B8,0x9941,0x993C,0x993A,0x9A0F,/* 0xA8-0xAF */
+	0x9A0B,0x9A09,0x9A0D,0x9A04,0x9A11,0x9A0A,0x9A05,0x9A07,/* 0xB0-0xB7 */
+	0x9A06,0x9AC0,0x9ADC,0x9B08,0x9B04,0x9B05,0x9B29,0x9B35,/* 0xB8-0xBF */
+	0x9B4A,0x9B4C,0x9B4B,0x9BC7,0x9BC6,0x9BC3,0x9BBF,0x9BC1,/* 0xC0-0xC7 */
+	0x9BB5,0x9BB8,0x9BD3,0x9BB6,0x9BC4,0x9BB9,0x9BBD,0x9D5C,/* 0xC8-0xCF */
+	0x9D53,0x9D4F,0x9D4A,0x9D5B,0x9D4B,0x9D59,0x9D56,0x9D4C,/* 0xD0-0xD7 */
+	0x9D57,0x9D52,0x9D54,0x9D5F,0x9D58,0x9D5A,0x9E8E,0x9E8C,/* 0xD8-0xDF */
+	0x9EDF,0x9F01,0x9F00,0x9F16,0x9F25,0x9F2B,0x9F2A,0x9F29,/* 0xE0-0xE7 */
+	0x9F28,0x9F4C,0x9F55,0x5134,0x5135,0x5296,0x52F7,0x53B4,/* 0xE8-0xEF */
+	0x56AB,0x56AD,0x56A6,0x56A7,0x56AA,0x56AC,0x58DA,0x58DD,/* 0xF0-0xF7 */
+	0x58DB,0x5912,0x5B3D,0x5B3E,0x5B3F,0x5DC3,0x5E70,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F2[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x5FBF,0x61FB,0x6507,0x6510,0x650D,0x6509,0x650C,0x650E,/* 0x40-0x47 */
+	0x6584,0x65DE,0x65DD,0x66DE,0x6AE7,0x6AE0,0x6ACC,0x6AD1,/* 0x48-0x4F */
+	0x6AD9,0x6ACB,0x6ADF,0x6ADC,0x6AD0,0x6AEB,0x6ACF,0x6ACD,/* 0x50-0x57 */
+	0x6ADE,0x6B60,0x6BB0,0x6C0C,0x7019,0x7027,0x7020,0x7016,/* 0x58-0x5F */
+	0x702B,0x7021,0x7022,0x7023,0x7029,0x7017,0x7024,0x701C,/* 0x60-0x67 */
+	0x702A,0x720C,0x720A,0x7207,0x7202,0x7205,0x72A5,0x72A6,/* 0x68-0x6F */
+	0x72A4,0x72A3,0x72A1,0x74CB,0x74C5,0x74B7,0x74C3,0x7516,/* 0x70-0x77 */
+	0x7660,0x77C9,0x77CA,0x77C4,0x77F1,0x791D,0x791B,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x7921,0x791C,0x7917,0x791E,0x79B0,0x7A67,0x7A68,/* 0xA0-0xA7 */
+	0x7C33,0x7C3C,0x7C39,0x7C2C,0x7C3B,0x7CEC,0x7CEA,0x7E76,/* 0xA8-0xAF */
+	0x7E75,0x7E78,0x7E70,0x7E77,0x7E6F,0x7E7A,0x7E72,0x7E74,/* 0xB0-0xB7 */
+	0x7E68,0x7F4B,0x7F4A,0x7F83,0x7F86,0x7FB7,0x7FFD,0x7FFE,/* 0xB8-0xBF */
+	0x8078,0x81D7,0x81D5,0x8264,0x8261,0x8263,0x85EB,0x85F1,/* 0xC0-0xC7 */
+	0x85ED,0x85D9,0x85E1,0x85E8,0x85DA,0x85D7,0x85EC,0x85F2,/* 0xC8-0xCF */
+	0x85F8,0x85D8,0x85DF,0x85E3,0x85DC,0x85D1,0x85F0,0x85E6,/* 0xD0-0xD7 */
+	0x85EF,0x85DE,0x85E2,0x8800,0x87FA,0x8803,0x87F6,0x87F7,/* 0xD8-0xDF */
+	0x8809,0x880C,0x880B,0x8806,0x87FC,0x8808,0x87FF,0x880A,/* 0xE0-0xE7 */
+	0x8802,0x8962,0x895A,0x895B,0x8957,0x8961,0x895C,0x8958,/* 0xE8-0xEF */
+	0x895D,0x8959,0x8988,0x89B7,0x89B6,0x89F6,0x8B50,0x8B48,/* 0xF0-0xF7 */
+	0x8B4A,0x8B40,0x8B53,0x8B56,0x8B54,0x8B4B,0x8B55,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F3[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8B51,0x8B42,0x8B52,0x8B57,0x8C43,0x8C77,0x8C76,0x8C9A,/* 0x40-0x47 */
+	0x8D06,0x8D07,0x8D09,0x8DAC,0x8DAA,0x8DAD,0x8DAB,0x8E6D,/* 0x48-0x4F */
+	0x8E78,0x8E73,0x8E6A,0x8E6F,0x8E7B,0x8EC2,0x8F52,0x8F51,/* 0x50-0x57 */
+	0x8F4F,0x8F50,0x8F53,0x8FB4,0x9140,0x913F,0x91B0,0x91AD,/* 0x58-0x5F */
+	0x93DE,0x93C7,0x93CF,0x93C2,0x93DA,0x93D0,0x93F9,0x93EC,/* 0x60-0x67 */
+	0x93CC,0x93D9,0x93A9,0x93E6,0x93CA,0x93D4,0x93EE,0x93E3,/* 0x68-0x6F */
+	0x93D5,0x93C4,0x93CE,0x93C0,0x93D2,0x93E7,0x957D,0x95DA,/* 0x70-0x77 */
+	0x95DB,0x96E1,0x9729,0x972B,0x972C,0x9728,0x9726,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x97B3,0x97B7,0x97B6,0x97DD,0x97DE,0x97DF,0x985C,/* 0xA0-0xA7 */
+	0x9859,0x985D,0x9857,0x98BF,0x98BD,0x98BB,0x98BE,0x9948,/* 0xA8-0xAF */
+	0x9947,0x9943,0x99A6,0x99A7,0x9A1A,0x9A15,0x9A25,0x9A1D,/* 0xB0-0xB7 */
+	0x9A24,0x9A1B,0x9A22,0x9A20,0x9A27,0x9A23,0x9A1E,0x9A1C,/* 0xB8-0xBF */
+	0x9A14,0x9AC2,0x9B0B,0x9B0A,0x9B0E,0x9B0C,0x9B37,0x9BEA,/* 0xC0-0xC7 */
+	0x9BEB,0x9BE0,0x9BDE,0x9BE4,0x9BE6,0x9BE2,0x9BF0,0x9BD4,/* 0xC8-0xCF */
+	0x9BD7,0x9BEC,0x9BDC,0x9BD9,0x9BE5,0x9BD5,0x9BE1,0x9BDA,/* 0xD0-0xD7 */
+	0x9D77,0x9D81,0x9D8A,0x9D84,0x9D88,0x9D71,0x9D80,0x9D78,/* 0xD8-0xDF */
+	0x9D86,0x9D8B,0x9D8C,0x9D7D,0x9D6B,0x9D74,0x9D75,0x9D70,/* 0xE0-0xE7 */
+	0x9D69,0x9D85,0x9D73,0x9D7B,0x9D82,0x9D6F,0x9D79,0x9D7F,/* 0xE8-0xEF */
+	0x9D87,0x9D68,0x9E94,0x9E91,0x9EC0,0x9EFC,0x9F2D,0x9F40,/* 0xF0-0xF7 */
+	0x9F41,0x9F4D,0x9F56,0x9F57,0x9F58,0x5337,0x56B2,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F4[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x56B5,0x56B3,0x58E3,0x5B45,0x5DC6,0x5DC7,0x5EEE,0x5EEF,/* 0x40-0x47 */
+	0x5FC0,0x5FC1,0x61F9,0x6517,0x6516,0x6515,0x6513,0x65DF,/* 0x48-0x4F */
+	0x66E8,0x66E3,0x66E4,0x6AF3,0x6AF0,0x6AEA,0x6AE8,0x6AF9,/* 0x50-0x57 */
+	0x6AF1,0x6AEE,0x6AEF,0x703C,0x7035,0x702F,0x7037,0x7034,/* 0x58-0x5F */
+	0x7031,0x7042,0x7038,0x703F,0x703A,0x7039,0x7040,0x703B,/* 0x60-0x67 */
+	0x7033,0x7041,0x7213,0x7214,0x72A8,0x737D,0x737C,0x74BA,/* 0x68-0x6F */
+	0x76AB,0x76AA,0x76BE,0x76ED,0x77CC,0x77CE,0x77CF,0x77CD,/* 0x70-0x77 */
+	0x77F2,0x7925,0x7923,0x7927,0x7928,0x7924,0x7929,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x79B2,0x7A6E,0x7A6C,0x7A6D,0x7AF7,0x7C49,0x7C48,/* 0xA0-0xA7 */
+	0x7C4A,0x7C47,0x7C45,0x7CEE,0x7E7B,0x7E7E,0x7E81,0x7E80,/* 0xA8-0xAF */
+	0x7FBA,0x7FFF,0x8079,0x81DB,0x81D9,0x820B,0x8268,0x8269,/* 0xB0-0xB7 */
+	0x8622,0x85FF,0x8601,0x85FE,0x861B,0x8600,0x85F6,0x8604,/* 0xB8-0xBF */
+	0x8609,0x8605,0x860C,0x85FD,0x8819,0x8810,0x8811,0x8817,/* 0xC0-0xC7 */
+	0x8813,0x8816,0x8963,0x8966,0x89B9,0x89F7,0x8B60,0x8B6A,/* 0xC8-0xCF */
+	0x8B5D,0x8B68,0x8B63,0x8B65,0x8B67,0x8B6D,0x8DAE,0x8E86,/* 0xD0-0xD7 */
+	0x8E88,0x8E84,0x8F59,0x8F56,0x8F57,0x8F55,0x8F58,0x8F5A,/* 0xD8-0xDF */
+	0x908D,0x9143,0x9141,0x91B7,0x91B5,0x91B2,0x91B3,0x940B,/* 0xE0-0xE7 */
+	0x9413,0x93FB,0x9420,0x940F,0x9414,0x93FE,0x9415,0x9410,/* 0xE8-0xEF */
+	0x9428,0x9419,0x940D,0x93F5,0x9400,0x93F7,0x9407,0x940E,/* 0xF0-0xF7 */
+	0x9416,0x9412,0x93FA,0x9409,0x93F8,0x940A,0x93FF,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F5[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x93FC,0x940C,0x93F6,0x9411,0x9406,0x95DE,0x95E0,0x95DF,/* 0x40-0x47 */
+	0x972E,0x972F,0x97B9,0x97BB,0x97FD,0x97FE,0x9860,0x9862,/* 0x48-0x4F */
+	0x9863,0x985F,0x98C1,0x98C2,0x9950,0x994E,0x9959,0x994C,/* 0x50-0x57 */
+	0x994B,0x9953,0x9A32,0x9A34,0x9A31,0x9A2C,0x9A2A,0x9A36,/* 0x58-0x5F */
+	0x9A29,0x9A2E,0x9A38,0x9A2D,0x9AC7,0x9ACA,0x9AC6,0x9B10,/* 0x60-0x67 */
+	0x9B12,0x9B11,0x9C0B,0x9C08,0x9BF7,0x9C05,0x9C12,0x9BF8,/* 0x68-0x6F */
+	0x9C40,0x9C07,0x9C0E,0x9C06,0x9C17,0x9C14,0x9C09,0x9D9F,/* 0x70-0x77 */
+	0x9D99,0x9DA4,0x9D9D,0x9D92,0x9D98,0x9D90,0x9D9B,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9DA0,0x9D94,0x9D9C,0x9DAA,0x9D97,0x9DA1,0x9D9A,/* 0xA0-0xA7 */
+	0x9DA2,0x9DA8,0x9D9E,0x9DA3,0x9DBF,0x9DA9,0x9D96,0x9DA6,/* 0xA8-0xAF */
+	0x9DA7,0x9E99,0x9E9B,0x9E9A,0x9EE5,0x9EE4,0x9EE7,0x9EE6,/* 0xB0-0xB7 */
+	0x9F30,0x9F2E,0x9F5B,0x9F60,0x9F5E,0x9F5D,0x9F59,0x9F91,/* 0xB8-0xBF */
+	0x513A,0x5139,0x5298,0x5297,0x56C3,0x56BD,0x56BE,0x5B48,/* 0xC0-0xC7 */
+	0x5B47,0x5DCB,0x5DCF,0x5EF1,0x61FD,0x651B,0x6B02,0x6AFC,/* 0xC8-0xCF */
+	0x6B03,0x6AF8,0x6B00,0x7043,0x7044,0x704A,0x7048,0x7049,/* 0xD0-0xD7 */
+	0x7045,0x7046,0x721D,0x721A,0x7219,0x737E,0x7517,0x766A,/* 0xD8-0xDF */
+	0x77D0,0x792D,0x7931,0x792F,0x7C54,0x7C53,0x7CF2,0x7E8A,/* 0xE0-0xE7 */
+	0x7E87,0x7E88,0x7E8B,0x7E86,0x7E8D,0x7F4D,0x7FBB,0x8030,/* 0xE8-0xEF */
+	0x81DD,0x8618,0x862A,0x8626,0x861F,0x8623,0x861C,0x8619,/* 0xF0-0xF7 */
+	0x8627,0x862E,0x8621,0x8620,0x8629,0x861E,0x8625,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F6[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8829,0x881D,0x881B,0x8820,0x8824,0x881C,0x882B,0x884A,/* 0x40-0x47 */
+	0x896D,0x8969,0x896E,0x896B,0x89FA,0x8B79,0x8B78,0x8B45,/* 0x48-0x4F */
+	0x8B7A,0x8B7B,0x8D10,0x8D14,0x8DAF,0x8E8E,0x8E8C,0x8F5E,/* 0x50-0x57 */
+	0x8F5B,0x8F5D,0x9146,0x9144,0x9145,0x91B9,0x943F,0x943B,/* 0x58-0x5F */
+	0x9436,0x9429,0x943D,0x943C,0x9430,0x9439,0x942A,0x9437,/* 0x60-0x67 */
+	0x942C,0x9440,0x9431,0x95E5,0x95E4,0x95E3,0x9735,0x973A,/* 0x68-0x6F */
+	0x97BF,0x97E1,0x9864,0x98C9,0x98C6,0x98C0,0x9958,0x9956,/* 0x70-0x77 */
+	0x9A39,0x9A3D,0x9A46,0x9A44,0x9A42,0x9A41,0x9A3A,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9A3F,0x9ACD,0x9B15,0x9B17,0x9B18,0x9B16,0x9B3A,/* 0xA0-0xA7 */
+	0x9B52,0x9C2B,0x9C1D,0x9C1C,0x9C2C,0x9C23,0x9C28,0x9C29,/* 0xA8-0xAF */
+	0x9C24,0x9C21,0x9DB7,0x9DB6,0x9DBC,0x9DC1,0x9DC7,0x9DCA,/* 0xB0-0xB7 */
+	0x9DCF,0x9DBE,0x9DC5,0x9DC3,0x9DBB,0x9DB5,0x9DCE,0x9DB9,/* 0xB8-0xBF */
+	0x9DBA,0x9DAC,0x9DC8,0x9DB1,0x9DAD,0x9DCC,0x9DB3,0x9DCD,/* 0xC0-0xC7 */
+	0x9DB2,0x9E7A,0x9E9C,0x9EEB,0x9EEE,0x9EED,0x9F1B,0x9F18,/* 0xC8-0xCF */
+	0x9F1A,0x9F31,0x9F4E,0x9F65,0x9F64,0x9F92,0x4EB9,0x56C6,/* 0xD0-0xD7 */
+	0x56C5,0x56CB,0x5971,0x5B4B,0x5B4C,0x5DD5,0x5DD1,0x5EF2,/* 0xD8-0xDF */
+	0x6521,0x6520,0x6526,0x6522,0x6B0B,0x6B08,0x6B09,0x6C0D,/* 0xE0-0xE7 */
+	0x7055,0x7056,0x7057,0x7052,0x721E,0x721F,0x72A9,0x737F,/* 0xE8-0xEF */
+	0x74D8,0x74D5,0x74D9,0x74D7,0x766D,0x76AD,0x7935,0x79B4,/* 0xF0-0xF7 */
+	0x7A70,0x7A71,0x7C57,0x7C5C,0x7C59,0x7C5B,0x7C5A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F7[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7CF4,0x7CF1,0x7E91,0x7F4F,0x7F87,0x81DE,0x826B,0x8634,/* 0x40-0x47 */
+	0x8635,0x8633,0x862C,0x8632,0x8636,0x882C,0x8828,0x8826,/* 0x48-0x4F */
+	0x882A,0x8825,0x8971,0x89BF,0x89BE,0x89FB,0x8B7E,0x8B84,/* 0x50-0x57 */
+	0x8B82,0x8B86,0x8B85,0x8B7F,0x8D15,0x8E95,0x8E94,0x8E9A,/* 0x58-0x5F */
+	0x8E92,0x8E90,0x8E96,0x8E97,0x8F60,0x8F62,0x9147,0x944C,/* 0x60-0x67 */
+	0x9450,0x944A,0x944B,0x944F,0x9447,0x9445,0x9448,0x9449,/* 0x68-0x6F */
+	0x9446,0x973F,0x97E3,0x986A,0x9869,0x98CB,0x9954,0x995B,/* 0x70-0x77 */
+	0x9A4E,0x9A53,0x9A54,0x9A4C,0x9A4F,0x9A48,0x9A4A,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9A49,0x9A52,0x9A50,0x9AD0,0x9B19,0x9B2B,0x9B3B,/* 0xA0-0xA7 */
+	0x9B56,0x9B55,0x9C46,0x9C48,0x9C3F,0x9C44,0x9C39,0x9C33,/* 0xA8-0xAF */
+	0x9C41,0x9C3C,0x9C37,0x9C34,0x9C32,0x9C3D,0x9C36,0x9DDB,/* 0xB0-0xB7 */
+	0x9DD2,0x9DDE,0x9DDA,0x9DCB,0x9DD0,0x9DDC,0x9DD1,0x9DDF,/* 0xB8-0xBF */
+	0x9DE9,0x9DD9,0x9DD8,0x9DD6,0x9DF5,0x9DD5,0x9DDD,0x9EB6,/* 0xC0-0xC7 */
+	0x9EF0,0x9F35,0x9F33,0x9F32,0x9F42,0x9F6B,0x9F95,0x9FA2,/* 0xC8-0xCF */
+	0x513D,0x5299,0x58E8,0x58E7,0x5972,0x5B4D,0x5DD8,0x882F,/* 0xD0-0xD7 */
+	0x5F4F,0x6201,0x6203,0x6204,0x6529,0x6525,0x6596,0x66EB,/* 0xD8-0xDF */
+	0x6B11,0x6B12,0x6B0F,0x6BCA,0x705B,0x705A,0x7222,0x7382,/* 0xE0-0xE7 */
+	0x7381,0x7383,0x7670,0x77D4,0x7C67,0x7C66,0x7E95,0x826C,/* 0xE8-0xEF */
+	0x863A,0x8640,0x8639,0x863C,0x8631,0x863B,0x863E,0x8830,/* 0xF0-0xF7 */
+	0x8832,0x882E,0x8833,0x8976,0x8974,0x8973,0x89FE,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F8[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x8B8C,0x8B8E,0x8B8B,0x8B88,0x8C45,0x8D19,0x8E98,0x8F64,/* 0x40-0x47 */
+	0x8F63,0x91BC,0x9462,0x9455,0x945D,0x9457,0x945E,0x97C4,/* 0x48-0x4F */
+	0x97C5,0x9800,0x9A56,0x9A59,0x9B1E,0x9B1F,0x9B20,0x9C52,/* 0x50-0x57 */
+	0x9C58,0x9C50,0x9C4A,0x9C4D,0x9C4B,0x9C55,0x9C59,0x9C4C,/* 0x58-0x5F */
+	0x9C4E,0x9DFB,0x9DF7,0x9DEF,0x9DE3,0x9DEB,0x9DF8,0x9DE4,/* 0x60-0x67 */
+	0x9DF6,0x9DE1,0x9DEE,0x9DE6,0x9DF2,0x9DF0,0x9DE2,0x9DEC,/* 0x68-0x6F */
+	0x9DF4,0x9DF3,0x9DE8,0x9DED,0x9EC2,0x9ED0,0x9EF2,0x9EF3,/* 0x70-0x77 */
+	0x9F06,0x9F1C,0x9F38,0x9F37,0x9F36,0x9F43,0x9F4F,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9F71,0x9F70,0x9F6E,0x9F6F,0x56D3,0x56CD,0x5B4E,/* 0xA0-0xA7 */
+	0x5C6D,0x652D,0x66ED,0x66EE,0x6B13,0x705F,0x7061,0x705D,/* 0xA8-0xAF */
+	0x7060,0x7223,0x74DB,0x74E5,0x77D5,0x7938,0x79B7,0x79B6,/* 0xB0-0xB7 */
+	0x7C6A,0x7E97,0x7F89,0x826D,0x8643,0x8838,0x8837,0x8835,/* 0xB8-0xBF */
+	0x884B,0x8B94,0x8B95,0x8E9E,0x8E9F,0x8EA0,0x8E9D,0x91BE,/* 0xC0-0xC7 */
+	0x91BD,0x91C2,0x946B,0x9468,0x9469,0x96E5,0x9746,0x9743,/* 0xC8-0xCF */
+	0x9747,0x97C7,0x97E5,0x9A5E,0x9AD5,0x9B59,0x9C63,0x9C67,/* 0xD0-0xD7 */
+	0x9C66,0x9C62,0x9C5E,0x9C60,0x9E02,0x9DFE,0x9E07,0x9E03,/* 0xD8-0xDF */
+	0x9E06,0x9E05,0x9E00,0x9E01,0x9E09,0x9DFF,0x9DFD,0x9E04,/* 0xE0-0xE7 */
+	0x9EA0,0x9F1E,0x9F46,0x9F74,0x9F75,0x9F76,0x56D4,0x652E,/* 0xE8-0xEF */
+	0x65B8,0x6B18,0x6B19,0x6B17,0x6B1A,0x7062,0x7226,0x72AA,/* 0xF0-0xF7 */
+	0x77D8,0x77D9,0x7939,0x7C69,0x7C6B,0x7CF6,0x7E9A,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t c2u_F9[256] = {
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
+	0x7E98,0x7E9B,0x7E99,0x81E0,0x81E1,0x8646,0x8647,0x8648,/* 0x40-0x47 */
+	0x8979,0x897A,0x897C,0x897B,0x89FF,0x8B98,0x8B99,0x8EA5,/* 0x48-0x4F */
+	0x8EA4,0x8EA3,0x946E,0x946D,0x946F,0x9471,0x9473,0x9749,/* 0x50-0x57 */
+	0x9872,0x995F,0x9C68,0x9C6E,0x9C6D,0x9E0B,0x9E0D,0x9E10,/* 0x58-0x5F */
+	0x9E0F,0x9E12,0x9E11,0x9EA1,0x9EF5,0x9F09,0x9F47,0x9F78,/* 0x60-0x67 */
+	0x9F7B,0x9F7A,0x9F79,0x571E,0x7066,0x7C6F,0x883C,0x8DB2,/* 0x68-0x6F */
+	0x8EA6,0x91C3,0x9474,0x9478,0x9476,0x9475,0x9A60,0x9C74,/* 0x70-0x77 */
+	0x9C73,0x9C71,0x9C75,0x9E14,0x9E13,0x9EF6,0x9F0A,0x0000,/* 0x78-0x7F */
+
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
+	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
+	0x0000,0x9FA4,0x7068,0x7065,0x7CF7,0x866A,0x883E,0x883D,/* 0xA0-0xA7 */
+	0x883F,0x8B9E,0x8C9C,0x8EA9,0x8EC9,0x974B,0x9873,0x9874,/* 0xA8-0xAF */
+	0x98CC,0x9961,0x99AB,0x9A64,0x9A66,0x9A67,0x9B24,0x9E15,/* 0xB0-0xB7 */
+	0x9E17,0x9F48,0x6207,0x6B1E,0x7227,0x864C,0x8EA8,0x9482,/* 0xB8-0xBF */
+	0x9480,0x9481,0x9A69,0x9A68,0x9B2E,0x9E19,0x7229,0x864B,/* 0xC0-0xC7 */
+	0x8B9F,0x9483,0x9C79,0x9EB7,0x7675,0x9A6B,0x9C7A,0x9E1D,/* 0xC8-0xCF */
+	0x7069,0x706A,0x9EA4,0x9F7E,0x9F49,0x9F98,0x7881,0x92B9,/* 0xD0-0xD7 */
+	0x88CF,0x58BB,0x6052,0x7CA7,0x5AFA,0x2554,0x2566,0x2557,/* 0xD8-0xDF */
+	0x2560,0x256C,0x2563,0x255A,0x2569,0x255D,0x2552,0x2564,/* 0xE0-0xE7 */
+	0x2555,0x255E,0x256A,0x2561,0x2558,0x2567,0x255B,0x2553,/* 0xE8-0xEF */
+	0x2565,0x2556,0x255F,0x256B,0x2562,0x2559,0x2568,0x255C,/* 0xF0-0xF7 */
+	0x2551,0x2550,0x256D,0x256E,0x2570,0x256F,0x2593,0x0000,/* 0xF8-0xFF */
+};
+
+static const wchar_t *page_charset2uni[256] = {
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, 
+	c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, 
+	c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, 
+	c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, 
+	c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, NULL,   
+	NULL,   c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, 
+	c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, 
+	c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, 
+	c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, 
+	c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, 
+	c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, 
+	c2u_F8, c2u_F9, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char u2c_02[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA3, 0xBE, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBF, /* 0xC8-0xCB */
+	0x00, 0x00, 0xA1, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xA3, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+};
+
+static const unsigned char u2c_03[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xA1, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xA3, 0x44, 0xA3, 0x45, 0xA3, 0x46, /* 0x90-0x93 */
+	0xA3, 0x47, 0xA3, 0x48, 0xA3, 0x49, 0xA3, 0x4A, /* 0x94-0x97 */
+	0xA3, 0x4B, 0xA3, 0x4C, 0xA3, 0x4D, 0xA3, 0x4E, /* 0x98-0x9B */
+	0xA3, 0x4F, 0xA3, 0x50, 0xA3, 0x51, 0xA3, 0x52, /* 0x9C-0x9F */
+	0xA3, 0x53, 0xA3, 0x54, 0x00, 0x00, 0xA3, 0x55, /* 0xA0-0xA3 */
+	0xA3, 0x56, 0xA3, 0x57, 0xA3, 0x58, 0xA3, 0x59, /* 0xA4-0xA7 */
+	0xA3, 0x5A, 0xA3, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xA3, 0x5C, 0xA3, 0x5D, 0xA3, 0x5E, /* 0xB0-0xB3 */
+	0xA3, 0x5F, 0xA3, 0x60, 0xA3, 0x61, 0xA3, 0x62, /* 0xB4-0xB7 */
+	0xA3, 0x63, 0xA3, 0x64, 0xA3, 0x65, 0xA3, 0x66, /* 0xB8-0xBB */
+	0xA3, 0x67, 0xA3, 0x68, 0xA3, 0x69, 0xA3, 0x6A, /* 0xBC-0xBF */
+	0xA3, 0x6B, 0xA3, 0x6C, 0x00, 0x00, 0xA3, 0x6D, /* 0xC0-0xC3 */
+	0xA3, 0x6E, 0xA3, 0x6F, 0xA3, 0x70, 0xA3, 0x71, /* 0xC4-0xC7 */
+	0xA3, 0x72, 0xA3, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+};
+
+static const unsigned char u2c_20[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x56, /* 0x10-0x13 */
+	0xA1, 0x58, 0xA2, 0x77, 0xA1, 0xFC, 0x00, 0x00, /* 0x14-0x17 */
+	0xA1, 0xA5, 0xA1, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xA1, 0xA7, 0xA1, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0x45, 0x00, 0x00, /* 0x20-0x23 */
+	0xA3, 0xBB, 0xA1, 0x4C, 0xA1, 0x4B, 0xA1, 0x45, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xAC, 0xA1, 0xB2, /* 0x30-0x33 */
+	0x00, 0x00, 0xA1, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB0, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xC3, 0x00, 0x00, /* 0x3C-0x3F */
+};
+
+static const unsigned char u2c_21[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0x4A, /* 0x00-0x03 */
+	0x00, 0x00, 0xA1, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xA2, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x60-0x63 */
+	0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x64-0x67 */
+	0xA2, 0xC1, 0xA2, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xA1, 0xF6, 0xA1, 0xF4, 0xA1, 0xF7, 0xA1, 0xF5, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF9, /* 0x94-0x97 */
+	0xA1, 0xFB, 0xA1, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+};
+
+static const unsigned char u2c_22[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xA2, 0x41, 0xA2, 0x42, 0x00, 0x00, /* 0x14-0x17 */
+	0xA2, 0x58, 0x00, 0x00, 0xA1, 0xD4, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xDB, 0xA1, 0xE8, /* 0x1C-0x1F */
+	0xA1, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xFD, /* 0x20-0x23 */
+	0x00, 0x00, 0xA1, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, 0xA1, 0xEC, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xED, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xDC, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA1, 0xDA, 0xA1, 0xDD, 0x00, 0x00, 0xA1, 0xDD, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xD9, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xA1, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xA1, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xA1, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE9, /* 0xBC-0xBF */
+};
+
+static const unsigned char u2c_23[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x5B, /* 0x04-0x07 */
+};
+
+static const unsigned char u2c_25[512] = {
+	0xA2, 0x77, 0x00, 0x00, 0xA2, 0x78, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xA2, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xA2, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xA2, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xA2, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xA2, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xA2, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xA2, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xA2, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xA2, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF9, 0xF9, 0xF9, 0xF8, 0xF9, 0xE6, 0xF9, 0xEF, /* 0x50-0x53 */
+	0xF9, 0xDD, 0xF9, 0xE8, 0xF9, 0xF1, 0xF9, 0xDF, /* 0x54-0x57 */
+	0xF9, 0xEC, 0xF9, 0xF5, 0xF9, 0xE3, 0xF9, 0xEE, /* 0x58-0x5B */
+	0xF9, 0xF7, 0xF9, 0xE5, 0xF9, 0xE9, 0xF9, 0xF2, /* 0x5C-0x5F */
+	0xF9, 0xE0, 0xF9, 0xEB, 0xF9, 0xF4, 0xF9, 0xE2, /* 0x60-0x63 */
+	0xF9, 0xE7, 0xF9, 0xF0, 0xF9, 0xDE, 0xF9, 0xED, /* 0x64-0x67 */
+	0xF9, 0xF6, 0xF9, 0xE4, 0xF9, 0xEA, 0xF9, 0xF3, /* 0x68-0x6B */
+	0xF9, 0xE1, 0xA2, 0x7E, 0xA2, 0xA1, 0xA2, 0xA3, /* 0x6C-0x6F */
+	0xA2, 0xA2, 0xA2, 0xAC, 0xA2, 0xAD, 0xA2, 0xAE, /* 0x70-0x73 */
+	0xA1, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xA2, 0x62, 0xA2, 0x63, 0xA2, 0x64, /* 0x80-0x83 */
+	0xA2, 0x65, 0xA2, 0x66, 0xA2, 0x67, 0xA2, 0x68, /* 0x84-0x87 */
+	0xA2, 0x69, 0xA2, 0x70, 0xA2, 0x6F, 0xA2, 0x6E, /* 0x88-0x8B */
+	0xA2, 0x6D, 0xA2, 0x6C, 0xA2, 0x6B, 0xA2, 0x6A, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFE, /* 0x90-0x93 */
+	0xA2, 0x76, 0xA2, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xA1, 0xBD, 0xA1, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xB6, 0xA1, 0xB5, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xA1, 0xBF, 0xA1, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xBB, 0xA1, 0xBA, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB3, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xA1, 0xB7, 0xA1, 0xB4, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xA2, 0xA8, 0xA2, 0xA9, /* 0xE0-0xE3 */
+	0xA2, 0xAB, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+};
+
+static const unsigned char u2c_26[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xA1, 0xB9, 0xA1, 0xB8, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xA1, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xA1, 0xF0, 0xA1, 0xF2, 0xA1, 0xF1, 0x00, 0x00, /* 0x40-0x43 */
+};
+
+static const unsigned char u2c_30[512] = {
+	0xA1, 0x40, 0xA1, 0x42, 0xA1, 0x43, 0xA1, 0xB2, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xA1, 0x71, 0xA1, 0x72, 0xA1, 0x6D, 0xA1, 0x6E, /* 0x08-0x0B */
+	0xA1, 0x75, 0xA1, 0x76, 0xA1, 0x79, 0xA1, 0x7A, /* 0x0C-0x0F */
+	0xA1, 0x69, 0xA1, 0x6A, 0xA2, 0x45, 0x00, 0x00, /* 0x10-0x13 */
+	0xA1, 0x65, 0xA1, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xA1, 0xE3, 0xA1, 0xA9, 0xA1, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xA2, 0xC3, 0xA2, 0xC4, 0xA2, 0xC5, /* 0x20-0x23 */
+	0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, 0xA2, 0xC9, /* 0x24-0x27 */
+	0xA2, 0xCA, 0xA2, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+};
+
+static const unsigned char u2c_31[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xA3, 0x74, 0xA3, 0x75, 0xA3, 0x76, /* 0x04-0x07 */
+	0xA3, 0x77, 0xA3, 0x78, 0xA3, 0x79, 0xA3, 0x7A, /* 0x08-0x0B */
+	0xA3, 0x7B, 0xA3, 0x7C, 0xA3, 0x7D, 0xA3, 0x7E, /* 0x0C-0x0F */
+	0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, 0xA3, 0xA4, /* 0x10-0x13 */
+	0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, 0xA3, 0xA8, /* 0x14-0x17 */
+	0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, 0xA3, 0xAC, /* 0x18-0x1B */
+	0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, 0xA3, 0xB0, /* 0x1C-0x1F */
+	0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, 0xA3, 0xB4, /* 0x20-0x23 */
+	0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, 0xA3, 0xB8, /* 0x24-0x27 */
+	0xA3, 0xB9, 0xA3, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0x40, 0xA4, 0x47, /* 0x90-0x93 */
+	0xA4, 0x54, 0xA5, 0x7C, 0xA4, 0x57, 0xA4, 0xA4, /* 0x94-0x97 */
+	0xA4, 0x55, 0xA5, 0xD2, 0xA4, 0x41, 0xA4, 0xFE, /* 0x98-0x9B */
+	0xA4, 0x42, 0xA4, 0xD1, 0xA6, 0x61, 0xA4, 0x48, /* 0x9C-0x9F */
+};
+
+static const unsigned char u2c_32[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xA4, 0x40, 0xA4, 0x47, 0xA4, 0x54, 0xA5, 0x7C, /* 0x20-0x23 */
+	0xA4, 0xAD, 0xA4, 0xBB, 0xA4, 0x43, 0xA4, 0x4B, /* 0x24-0x27 */
+	0xA4, 0x45, 0xA4, 0x51, 0xA4, 0xEB, 0xA4, 0xF5, /* 0x28-0x2B */
+	0xA4, 0xF4, 0xA4, 0xEC, 0xAA, 0xF7, 0xA4, 0x67, /* 0x2C-0x2F */
+	0xA4, 0xE9, 0xAE, 0xE8, 0xA6, 0xB3, 0xAA, 0xC0, /* 0x30-0x33 */
+	0xA6, 0x57, 0xAF, 0x53, 0xB0, 0x5D, 0xAF, 0xAC, /* 0x34-0x37 */
+	0xB3, 0xD2, 0xA5, 0x4E, 0xA9, 0x49, 0xBE, 0xC7, /* 0x38-0x3B */
+	0xBA, 0xCA, 0xA5, 0xF8, 0xB8, 0xEA, 0xA8, 0xF3, /* 0x3C-0x3F */
+	0xB2, 0xBD, 0xA5, 0xF0, 0xA6, 0xDB, 0xA6, 0xDC, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xA4, 0x40, 0xA4, 0x47, 0xA4, 0x54, 0xA5, 0x7C, /* 0x80-0x83 */
+	0xA4, 0xAD, 0xA4, 0xBB, 0xA4, 0x43, 0xA4, 0x4B, /* 0x84-0x87 */
+	0xA4, 0x45, 0xA4, 0x51, 0xA4, 0xEB, 0xA4, 0xF5, /* 0x88-0x8B */
+	0xA4, 0xF4, 0xA4, 0xEC, 0xAA, 0xF7, 0xA4, 0x67, /* 0x8C-0x8F */
+	0xA4, 0xE9, 0xAE, 0xE8, 0xA6, 0xB3, 0xAA, 0xC0, /* 0x90-0x93 */
+	0xA6, 0x57, 0xAF, 0x53, 0xB0, 0x5D, 0xAF, 0xAC, /* 0x94-0x97 */
+	0xB3, 0xD2, 0xAF, 0xB5, 0xA8, 0x6B, 0xA4, 0x6B, /* 0x98-0x9B */
+	0xBE, 0x41, 0xC0, 0x75, 0xA6, 0x4C, 0xAA, 0x60, /* 0x9C-0x9F */
+	0xB6, 0xB5, 0xA5, 0xF0, 0xBC, 0x67, 0xA1, 0xC0, /* 0xA0-0xA3 */
+	0xA4, 0x57, 0xA4, 0xA4, 0xA4, 0x55, 0xA5, 0xAA, /* 0xA4-0xA7 */
+	0xA5, 0x6B, 0xC2, 0xE5, 0xA9, 0x76, 0xBE, 0xC7, /* 0xA8-0xAB */
+	0xBA, 0xCA, 0xA5, 0xF8, 0xB8, 0xEA, 0xA8, 0xF3, /* 0xAC-0xAF */
+	0xA9, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+};
+
+static const unsigned char u2c_33[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xA2, 0x55, 0xA2, 0x56, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xA2, 0x50, 0xA2, 0x51, 0xA2, 0x52, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xA2, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xA2, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xA2, 0x53, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xA1, 0xEB, 0xA1, 0xEA, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xA2, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+};
+
+static const unsigned char u2c_4E[512] = {
+	0xA4, 0x40, 0xA4, 0x42, 0x00, 0x00, 0xA4, 0x43, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x45, /* 0x04-0x07 */
+	0xA4, 0x56, 0xA4, 0x54, 0xA4, 0x57, 0xA4, 0x55, /* 0x08-0x0B */
+	0xC9, 0x46, 0xA4, 0xA3, 0xC9, 0x4F, 0xC9, 0x4D, /* 0x0C-0x0F */
+	0xA4, 0xA2, 0xA4, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xA5, 0x42, 0xA5, 0x41, 0xA5, 0x40, 0x00, 0x00, /* 0x14-0x17 */
+	0xA5, 0x43, 0xA4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xA5, 0xE0, 0xA5, 0xE1, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xC3, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0x58, /* 0x28-0x2B */
+	0x00, 0x00, 0xA4, 0xA4, 0xC9, 0x50, 0x00, 0x00, /* 0x2C-0x2F */
+	0xA4, 0xA5, 0xC9, 0x63, 0xA6, 0xEA, 0xCB, 0xB1, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xA4, 0x59, 0xA4, 0xA6, 0x00, 0x00, 0xA5, 0x44, /* 0x38-0x3B */
+	0xC9, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xC9, 0x40, 0xA4, 0x44, /* 0x40-0x43 */
+	0x00, 0x00, 0xA4, 0x5B, 0x00, 0x00, 0xC9, 0x47, /* 0x44-0x47 */
+	0xA4, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xA7, /* 0x48-0x4B */
+	0x00, 0x00, 0xA5, 0x45, 0xA5, 0x47, 0xA5, 0x46, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xA5, 0xE2, 0xA5, 0xE3, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xC4, 0x00, 0x00, /* 0x54-0x57 */
+	0xAD, 0xBC, 0xA4, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xC9, 0x41, 0xA4, 0x45, 0xA4, 0x5E, 0xA4, 0x5D, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xA5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC5, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xB0, 0xAE, 0xD4, 0x4B, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xB6, 0xC3, 0xDC, 0xB1, /* 0x80-0x83 */
+	0xDC, 0xB2, 0x00, 0x00, 0xA4, 0x46, 0x00, 0x00, /* 0x84-0x87 */
+	0xA4, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC6, /* 0x88-0x8B */
+	0xA4, 0x47, 0xC9, 0x48, 0xA4, 0x5F, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xA4, 0xAA, 0xA4, 0xAC, 0xC9, 0x51, /* 0x90-0x93 */
+	0xA4, 0xAD, 0xA4, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xA5, 0xE5, 0x00, 0x00, 0xA8, 0xC7, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xC8, 0xAB, 0x45, /* 0x9C-0x9F */
+	0x00, 0x00, 0xA4, 0x60, 0xA4, 0xAE, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xA5, 0xE6, 0xA5, 0xE8, 0xA5, 0xE7, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC9, /* 0xA8-0xAB */
+	0xA8, 0xCA, 0xAB, 0x46, 0xAB, 0x47, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xBD, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xB3, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xF6, 0xD6, 0xA4, 0x48, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xA4, 0xB0, 0xA4, 0xAF, 0xC9, 0x52, 0xA4, 0xB1, /* 0xC0-0xC3 */
+	0xA4, 0xB7, 0x00, 0x00, 0xA4, 0xB2, 0xA4, 0xB3, /* 0xC4-0xC7 */
+	0xC9, 0x54, 0xC9, 0x53, 0xA4, 0xB5, 0xA4, 0xB6, /* 0xC8-0xCB */
+	0x00, 0x00, 0xA4, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xA5, 0x4A, 0xA5, 0x4B, 0xA5, 0x4C, 0xA5, 0x4D, /* 0xD4-0xD7 */
+	0xA5, 0x49, 0xA5, 0x50, 0xC9, 0x6A, 0x00, 0x00, /* 0xD8-0xDB */
+	0xC9, 0x66, 0xC9, 0x69, 0xA5, 0x51, 0xA5, 0x61, /* 0xDC-0xDF */
+	0x00, 0x00, 0xC9, 0x68, 0x00, 0x00, 0xA5, 0x4E, /* 0xE0-0xE3 */
+	0xA5, 0x4F, 0xA5, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xC9, 0x65, 0xC9, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xA5, 0xF5, 0xC9, 0xB0, 0xA5, 0xF2, 0xA5, 0xF6, /* 0xF0-0xF3 */
+	0xC9, 0xBA, 0xC9, 0xAE, 0xA5, 0xF3, 0xC9, 0xB2, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA5, 0xF4, /* 0xF8-0xFB */
+	0x00, 0x00, 0xA5, 0xF7, 0x00, 0x00, 0xA5, 0xE9, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_4F[512] = {
+	0xC9, 0xB1, 0xA5, 0xF8, 0xC9, 0xB5, 0x00, 0x00, /* 0x00-0x03 */
+	0xC9, 0xB9, 0xC9, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xC9, 0xB3, 0xA5, 0xEA, 0xA5, 0xEC, 0xA5, 0xF9, /* 0x08-0x0B */
+	0x00, 0x00, 0xA5, 0xEE, 0xC9, 0xAB, 0xA5, 0xF1, /* 0x0C-0x0F */
+	0xA5, 0xEF, 0xA5, 0xF0, 0xC9, 0xBB, 0xC9, 0xB8, /* 0x10-0x13 */
+	0xC9, 0xAF, 0xA5, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xC9, 0xAC, 0xA5, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xC9, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xC9, 0xB7, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xC9, 0xAD, 0xCA, 0x66, 0x00, 0x00, 0xA7, 0x42, /* 0x2C-0x2F */
+	0xA6, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xCA, 0x67, /* 0x30-0x33 */
+	0xA6, 0xF1, 0x00, 0x00, 0xA7, 0x44, 0x00, 0x00, /* 0x34-0x37 */
+	0xA6, 0xF9, 0x00, 0x00, 0xA6, 0xF8, 0xCA, 0x5B, /* 0x38-0x3B */
+	0xA6, 0xFC, 0xA6, 0xF7, 0xCA, 0x60, 0xCA, 0x68, /* 0x3C-0x3F */
+	0x00, 0x00, 0xCA, 0x64, 0x00, 0x00, 0xA6, 0xFA, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xA6, 0xFD, 0xA6, 0xEE, /* 0x44-0x47 */
+	0xA7, 0x47, 0xCA, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xCB, 0xBD, 0xA6, 0xEC, 0xA7, 0x43, 0xA6, 0xED, /* 0x4C-0x4F */
+	0xA6, 0xF5, 0xA6, 0xF6, 0xCA, 0x62, 0xCA, 0x5E, /* 0x50-0x53 */
+	0xA6, 0xFB, 0xA6, 0xF3, 0xCA, 0x5A, 0xA6, 0xEF, /* 0x54-0x57 */
+	0xCA, 0x65, 0xA7, 0x45, 0xA7, 0x48, 0xA6, 0xF2, /* 0x58-0x5B */
+	0xA7, 0x40, 0xA7, 0x46, 0xA6, 0xF0, 0xCA, 0x63, /* 0x5C-0x5F */
+	0xA7, 0x41, 0xCA, 0x69, 0xCA, 0x5C, 0xA6, 0xFE, /* 0x60-0x63 */
+	0xCA, 0x5F, 0x00, 0x00, 0x00, 0x00, 0xCA, 0x61, /* 0x64-0x67 */
+	0x00, 0x00, 0xA8, 0xD8, 0xCB, 0xBF, 0xCB, 0xCB, /* 0x68-0x6B */
+	0xA8, 0xD0, 0x00, 0x00, 0xCB, 0xCC, 0xA8, 0xCB, /* 0x6C-0x6F */
+	0xA8, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xCE, /* 0x70-0x73 */
+	0xCB, 0xB9, 0xA8, 0xD6, 0xCB, 0xB8, 0xCB, 0xBC, /* 0x74-0x77 */
+	0xCB, 0xC3, 0xCB, 0xC1, 0xA8, 0xDE, 0xA8, 0xD9, /* 0x78-0x7B */
+	0xCB, 0xB3, 0xCB, 0xB5, 0xA8, 0xDB, 0xA8, 0xCF, /* 0x7C-0x7F */
+	
+	0xCB, 0xB6, 0xCB, 0xC2, 0xCB, 0xC9, 0xA8, 0xD4, /* 0x80-0x83 */
+	0xCB, 0xBB, 0xCB, 0xB4, 0xA8, 0xD3, 0xCB, 0xB7, /* 0x84-0x87 */
+	0xA8, 0xD7, 0xCB, 0xBA, 0x00, 0x00, 0xA8, 0xD2, /* 0x88-0x8B */
+	0x00, 0x00, 0xA8, 0xCD, 0x00, 0x00, 0xA8, 0xDC, /* 0x8C-0x8F */
+	0xCB, 0xC4, 0xA8, 0xDD, 0xCB, 0xC8, 0x00, 0x00, /* 0x90-0x93 */
+	0xCB, 0xC6, 0xCB, 0xCA, 0xA8, 0xDA, 0xCB, 0xBE, /* 0x94-0x97 */
+	0xCB, 0xB2, 0x00, 0x00, 0xCB, 0xC0, 0xA8, 0xD1, /* 0x98-0x9B */
+	0xCB, 0xC5, 0xA8, 0xCC, 0xCB, 0xC7, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xAB, 0x56, 0xAB, 0x4A, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0xE0, 0xCD, 0xE8, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xAB, 0x49, 0xAB, 0x51, 0xAB, 0x5D, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xCD, 0xEE, 0xCD, 0xEC, 0xCD, 0xE7, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x4B, /* 0xBC-0xBF */
+	0xCD, 0xED, 0xCD, 0xE3, 0xAB, 0x59, 0xAB, 0x50, /* 0xC0-0xC3 */
+	0xAB, 0x58, 0xCD, 0xDE, 0x00, 0x00, 0xCD, 0xEA, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xCD, 0xE1, 0xAB, 0x54, 0xCD, 0xE2, /* 0xC8-0xCB */
+	0x00, 0x00, 0xCD, 0xDD, 0xAB, 0x5B, 0xAB, 0x4E, /* 0xCC-0xCF */
+	0xAB, 0x57, 0xAB, 0x4D, 0x00, 0x00, 0xCD, 0xDF, /* 0xD0-0xD3 */
+	0xCD, 0xE4, 0x00, 0x00, 0xCD, 0xEB, 0xAB, 0x55, /* 0xD4-0xD7 */
+	0xAB, 0x52, 0xCD, 0xE6, 0xAB, 0x5A, 0xCD, 0xE9, /* 0xD8-0xDB */
+	0xCD, 0xE5, 0xAB, 0x4F, 0xAB, 0x5C, 0xAB, 0x53, /* 0xDC-0xDF */
+	0xAB, 0x4C, 0xAB, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xCD, 0xEF, 0x00, 0x00, 0xAD, 0xD7, 0xAD, 0xC1, /* 0xEC-0xEF */
+	0x00, 0x00, 0xAD, 0xD1, 0x00, 0x00, 0xAD, 0xD6, /* 0xF0-0xF3 */
+	0xD0, 0xD0, 0xD0, 0xCF, 0xD0, 0xD4, 0xD0, 0xD5, /* 0xF4-0xF7 */
+	0xAD, 0xC4, 0x00, 0x00, 0xAD, 0xCD, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xAD, 0xDA, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_50[512] = {
+	0xAD, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xD0, 0xC9, 0xAD, 0xC7, 0xD0, 0xCA, /* 0x04-0x07 */
+	0x00, 0x00, 0xAD, 0xDC, 0x00, 0x00, 0xAD, 0xD3, /* 0x08-0x0B */
+	0xAD, 0xBE, 0xAD, 0xBF, 0xD0, 0xDD, 0xB0, 0xBF, /* 0x0C-0x0F */
+	0x00, 0x00, 0xAD, 0xCC, 0xAD, 0xCB, 0xD0, 0xCB, /* 0x10-0x13 */
+	0xAD, 0xCF, 0xD4, 0x5B, 0xAD, 0xC6, 0xD0, 0xD6, /* 0x14-0x17 */
+	0xAD, 0xD5, 0xAD, 0xD4, 0xAD, 0xCA, 0xD0, 0xCE, /* 0x18-0x1B */
+	0xD0, 0xD7, 0x00, 0x00, 0xD0, 0xC8, 0xAD, 0xC9, /* 0x1C-0x1F */
+	0xD0, 0xD8, 0xAD, 0xD2, 0xD0, 0xCC, 0xAD, 0xC0, /* 0x20-0x23 */
+	0x00, 0x00, 0xAD, 0xC3, 0xAD, 0xC2, 0xD0, 0xD9, /* 0x24-0x27 */
+	0xAD, 0xD0, 0xAD, 0xC5, 0xAD, 0xD9, 0xAD, 0xDB, /* 0x28-0x2B */
+	0xD0, 0xD3, 0xAD, 0xD8, 0x00, 0x00, 0xD0, 0xDB, /* 0x2C-0x2F */
+	0xD0, 0xCD, 0xD0, 0xDC, 0x00, 0x00, 0xD0, 0xD1, /* 0x30-0x33 */
+	0x00, 0x00, 0xD0, 0xDA, 0x00, 0x00, 0xD0, 0xD2, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xAD, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xD4, 0x63, 0xD4, 0x57, 0x00, 0x00, 0xB0, 0xB3, /* 0x40-0x43 */
+	0x00, 0x00, 0xD4, 0x5C, 0xD4, 0x62, 0xB0, 0xB2, /* 0x44-0x47 */
+	0xD4, 0x55, 0xB0, 0xB6, 0xD4, 0x59, 0xD4, 0x52, /* 0x48-0x4B */
+	0xB0, 0xB4, 0xD4, 0x56, 0xB0, 0xB9, 0xB0, 0xBE, /* 0x4C-0x4F */
+	0x00, 0x00, 0xD4, 0x67, 0x00, 0x00, 0xD4, 0x51, /* 0x50-0x53 */
+	0x00, 0x00, 0xB0, 0xBA, 0x00, 0x00, 0xD4, 0x66, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xB0, 0xB5, 0xD4, 0x58, /* 0x58-0x5B */
+	0xB0, 0xB1, 0xD4, 0x53, 0xD4, 0x4F, 0xD4, 0x5D, /* 0x5C-0x5F */
+	0xD4, 0x50, 0xD4, 0x4E, 0xD4, 0x5A, 0xD4, 0x60, /* 0x60-0x63 */
+	0xD4, 0x61, 0xB0, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xD8, 0x5B, 0xD4, 0x5E, 0xD4, 0x4D, 0xD4, 0x5F, /* 0x68-0x6B */
+	0x00, 0x00, 0xB0, 0xC1, 0xD4, 0x64, 0xB0, 0xC0, /* 0x6C-0x6F */
+	0xD4, 0x4C, 0x00, 0x00, 0xD4, 0x54, 0xD4, 0x65, /* 0x70-0x73 */
+	0xB0, 0xBC, 0xB0, 0xBB, 0xB0, 0xB8, 0xB0, 0xBD, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xB0, 0xAF, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xB0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xB3, 0xC8, 0x00, 0x00, 0xD8, 0x5E, 0xD8, 0x57, /* 0x80-0x83 */
+	0x00, 0x00, 0xB3, 0xC5, 0x00, 0x00, 0xD8, 0x5F, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x55, /* 0x88-0x8B */
+	0xD8, 0x58, 0xB3, 0xC4, 0xD8, 0x59, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xB3, 0xC7, 0xD8, 0x5D, 0x00, 0x00, /* 0x90-0x93 */
+	0xD8, 0x53, 0xD8, 0x52, 0xB3, 0xC9, 0x00, 0x00, /* 0x94-0x97 */
+	0xB3, 0xCA, 0xB3, 0xC6, 0xB3, 0xCB, 0xD8, 0x51, /* 0x98-0x9B */
+	0xD8, 0x5C, 0xD8, 0x5A, 0xD8, 0x54, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0xC3, 0xD8, 0x56, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xB6, 0xCA, 0xB6, 0xC4, 0xDC, 0xB7, 0xB6, 0xCD, /* 0xAC-0xAF */
+	0xDC, 0xBD, 0xDC, 0xC0, 0xB6, 0xC6, 0xB6, 0xC7, /* 0xB0-0xB3 */
+	0xDC, 0xBA, 0xB6, 0xC5, 0xDC, 0xC3, 0xB6, 0xCB, /* 0xB4-0xB7 */
+	0xDC, 0xC4, 0x00, 0x00, 0xDC, 0xBF, 0xB6, 0xCC, /* 0xB8-0xBB */
+	0x00, 0x00, 0xDC, 0xB4, 0xB6, 0xC9, 0xDC, 0xB5, /* 0xBC-0xBF */
+	0x00, 0x00, 0xDC, 0xBE, 0xDC, 0xBC, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xDC, 0xB8, 0xB6, 0xC8, 0xDC, 0xB6, 0xB6, 0xCE, /* 0xC4-0xC7 */
+	0xDC, 0xBB, 0xDC, 0xC2, 0xDC, 0xB9, 0xDC, 0xC1, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xB9, 0xB6, 0xB9, 0xB3, /* 0xCC-0xCF */
+	0x00, 0x00, 0xB9, 0xB4, 0x00, 0x00, 0xE0, 0xF9, /* 0xD0-0xD3 */
+	0xE0, 0xF1, 0xB9, 0xB2, 0xB9, 0xAF, 0xE0, 0xF2, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xB9, 0xB1, 0xE0, 0xF5, /* 0xD8-0xDB */
+	0x00, 0x00, 0xE0, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xE0, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFD, /* 0xE0-0xE3 */
+	0xE0, 0xF8, 0xB9, 0xAE, 0xE0, 0xF0, 0xB9, 0xAC, /* 0xE4-0xE7 */
+	0xE0, 0xF3, 0xB9, 0xB7, 0xE0, 0xF6, 0x00, 0x00, /* 0xE8-0xEB */
+	0xE0, 0xFA, 0xB9, 0xB0, 0xB9, 0xAD, 0xE0, 0xFC, /* 0xEC-0xEF */
+	0xE0, 0xFB, 0xB9, 0xB5, 0x00, 0x00, 0xE0, 0xF4, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xBB, 0xF8, 0xE4, 0xEC, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xE4, 0xE9, 0xBB, 0xF9, 0x00, 0x00, 0xBB, 0xF7, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE4, 0xF0, 0xE4, 0xED, 0xE4, 0xE6, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_51[512] = {
+	0xBB, 0xF6, 0x00, 0x00, 0xBB, 0xFA, 0xE4, 0xE7, /* 0x00-0x03 */
+	0xBB, 0xF5, 0xBB, 0xFD, 0xE4, 0xEA, 0xE4, 0xEB, /* 0x04-0x07 */
+	0xBB, 0xFB, 0xBB, 0xFC, 0xE4, 0xF1, 0xE4, 0xEE, /* 0x08-0x0B */
+	0xE4, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xBE, 0xAA, 0xE8, 0xF8, 0xBE, 0xA7, 0xE8, 0xF5, /* 0x10-0x13 */
+	0xBE, 0xA9, 0xBE, 0xAB, 0x00, 0x00, 0xE8, 0xF6, /* 0x14-0x17 */
+	0xBE, 0xA8, 0x00, 0x00, 0xE8, 0xF7, 0x00, 0x00, /* 0x18-0x1B */
+	0xE8, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x76, /* 0x1C-0x1F */
+	0xEC, 0xBD, 0xC0, 0x77, 0xEC, 0xBB, 0x00, 0x00, /* 0x20-0x23 */
+	0xEC, 0xBC, 0xEC, 0xBA, 0xEC, 0xB9, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xEC, 0xBE, 0xC0, 0x75, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xEF, 0xB8, 0xEF, 0xB9, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE4, 0xE8, 0xEF, 0xB7, 0xC0, 0x78, 0xC3, 0x5F, /* 0x30-0x33 */
+	0xF1, 0xEB, 0xF1, 0xEC, 0x00, 0x00, 0xC4, 0xD7, /* 0x34-0x37 */
+	0xC4, 0xD8, 0xF5, 0xC1, 0xF5, 0xC0, 0xC5, 0x6C, /* 0x38-0x3B */
+	0xC5, 0x6B, 0xF7, 0xD0, 0x00, 0x00, 0xA4, 0x49, /* 0x3C-0x3F */
+	0xA4, 0x61, 0xA4, 0xB9, 0x00, 0x00, 0xA4, 0xB8, /* 0x40-0x43 */
+	0xA5, 0x53, 0xA5, 0x52, 0xA5, 0xFC, 0xA5, 0xFB, /* 0x44-0x47 */
+	0xA5, 0xFD, 0xA5, 0xFA, 0x00, 0x00, 0xA7, 0x4A, /* 0x48-0x4B */
+	0xA7, 0x49, 0xA7, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xE0, 0x00, 0x00, /* 0x50-0x53 */
+	0xA8, 0xDF, 0xA8, 0xE1, 0x00, 0x00, 0xAB, 0x5E, /* 0x54-0x57 */
+	0x00, 0x00, 0xA2, 0x59, 0xD0, 0xDE, 0xA2, 0x5A, /* 0x58-0x5B */
+	0xB0, 0xC2, 0xA2, 0x5C, 0xA2, 0x5B, 0xD8, 0x60, /* 0x5C-0x5F */
+	0x00, 0x00, 0xA2, 0x5D, 0xB9, 0xB8, 0xA2, 0x5E, /* 0x60-0x63 */
+	0x00, 0x00, 0xA4, 0x4A, 0x00, 0x00, 0xA4, 0xBA, /* 0x64-0x67 */
+	0xA5, 0xFE, 0xA8, 0xE2, 0x00, 0x00, 0xA4, 0x4B, /* 0x68-0x6B */
+	0xA4, 0xBD, 0xA4, 0xBB, 0xA4, 0xBC, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xA6, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xA7, 0x4C, 0xA8, 0xE4, 0xA8, 0xE3, /* 0x74-0x77 */
+	0xA8, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xAD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xBE, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x4E, /* 0x84-0x87 */
+	0x00, 0x00, 0xA5, 0x54, 0xA5, 0x55, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xA6, 0x41, 0x00, 0x00, 0xCA, 0x6A, /* 0x8C-0x8F */
+	0x00, 0x00, 0xAB, 0x60, 0xAB, 0x5F, 0xD0, 0xE0, /* 0x90-0x93 */
+	0xD0, 0xDF, 0xB0, 0xC3, 0x00, 0x00, 0xA4, 0xBE, /* 0x94-0x97 */
+	0xC9, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xCD, 0x00, 0x00, /* 0x9C-0x9F */
+	0xAB, 0x61, 0x00, 0x00, 0xAD, 0xE0, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xAD, 0xDE, 0xAD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xBE, 0xAD, 0x00, 0x00, /* 0xA8-0xAB */
+	0xA5, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xA6, 0x42, 0xC9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0x4D, 0xA7, 0x4E, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xCA, 0x6B, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xCB, 0xCE, 0xA8, 0xE6, 0xCB, 0xCF, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xD0, 0xE2, 0xD0, 0xE3, 0xAD, 0xE3, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xD0, 0xE4, 0x00, 0x00, 0xD0, 0xE1, 0xAD, 0xE4, /* 0xC8-0xCB */
+	0xAD, 0xE2, 0xAD, 0xE1, 0xD0, 0xE5, 0x00, 0x00, /* 0xCC-0xCF */
+	0xD4, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xD8, 0x61, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC5, /* 0xD4-0xD7 */
+	0xE1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xBB, 0xFE, 0xBE, 0xAE, 0xE8, 0xF9, 0x00, 0x00, /* 0xDC-0xDF */
+	0xA4, 0x4C, 0xA4, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xB0, 0xC4, 0xB3, 0xCD, 0x00, 0x00, 0xB9, 0xB9, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xC9, 0x42, 0xA4, 0xBF, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xA5, 0x59, 0xA5, 0x57, 0xA5, 0x58, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xA8, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_52[512] = {
+	0xA4, 0x4D, 0xA4, 0x4E, 0x00, 0x00, 0xA4, 0x62, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0xC0, 0xA4, 0xC1, /* 0x04-0x07 */
+	0xA4, 0xC2, 0xC9, 0xBE, 0xA5, 0x5A, 0x00, 0x00, /* 0x08-0x0B */
+	0xC9, 0x6B, 0x00, 0x00, 0xA6, 0x46, 0x00, 0x00, /* 0x0C-0x0F */
+	0xC9, 0xBF, 0xA6, 0x44, 0xA6, 0x45, 0xC9, 0xBD, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xA6, 0x47, 0xA6, 0x43, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xCA, 0x6C, 0xAA, 0xEC, 0xCA, 0x6D, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xCA, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xA7, 0x50, 0xA7, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xA7, 0x53, 0xA7, 0x51, 0xA7, 0x52, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xED, 0x00, 0x00, /* 0x2C-0x2F */
+	0xA8, 0xEC, 0xCB, 0xD4, 0xCB, 0xD1, 0xCB, 0xD2, /* 0x30-0x33 */
+	0x00, 0x00, 0xCB, 0xD0, 0xA8, 0xEE, 0xA8, 0xEA, /* 0x34-0x37 */
+	0xA8, 0xE9, 0x00, 0x00, 0xA8, 0xEB, 0xA8, 0xE8, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xA8, 0xEF, 0x00, 0x00, 0xAB, 0x63, /* 0x40-0x43 */
+	0xCD, 0xF0, 0x00, 0x00, 0xCB, 0xD3, 0xAB, 0x68, /* 0x44-0x47 */
+	0x00, 0x00, 0xCD, 0xF1, 0xAB, 0x64, 0xAB, 0x67, /* 0x48-0x4B */
+	0xAB, 0x66, 0xAB, 0x65, 0xAB, 0x62, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xE8, 0x00, 0x00, /* 0x50-0x53 */
+	0xAD, 0xE7, 0xD0, 0xEB, 0xAD, 0xE5, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xE7, 0xAD, 0xE8, /* 0x58-0x5B */
+	0xAD, 0xE6, 0xAD, 0xE9, 0xD0, 0xE9, 0xD0, 0xEA, /* 0x5C-0x5F */
+	0x00, 0x00, 0xD0, 0xE6, 0xD0, 0xEC, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xB3, 0xD1, 0xB0, 0xC5, 0xD4, 0x69, /* 0x68-0x6B */
+	0xD4, 0x6B, 0xD4, 0x6A, 0xD4, 0x6C, 0xB0, 0xC6, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0xCE, 0x00, 0x00, /* 0x70-0x73 */
+	0xB3, 0xCF, 0xB3, 0xD0, 0x00, 0x00, 0xB6, 0xD0, /* 0x74-0x77 */
+	0xDC, 0xC7, 0x00, 0x00, 0xDC, 0xC6, 0xDC, 0xC8, /* 0x78-0x7B */
+	0xDC, 0xC9, 0xB6, 0xD1, 0x00, 0x00, 0xB6, 0xCF, /* 0x7C-0x7F */
+	
+	0xE1, 0x41, 0xE1, 0x42, 0xB9, 0xBB, 0xB9, 0xBA, /* 0x80-0x83 */
+	0xE3, 0x5A, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x40, /* 0x84-0x87 */
+	0xBC, 0x41, 0xBC, 0x42, 0xBC, 0x44, 0xE4, 0xF2, /* 0x88-0x8B */
+	0xE4, 0xF3, 0xBC, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xBE, 0xAF, 0x00, 0x00, 0xBE, 0xB0, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xED, 0xF5, 0xC3, /* 0x94-0x97 */
+	0xF5, 0xC2, 0xF7, 0xD1, 0x00, 0x00, 0xA4, 0x4F, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA5, 0x5C, /* 0x9C-0x9F */
+	0xA5, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x48, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xC9, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xA7, 0x55, 0xA7, 0x56, 0xA7, 0x54, /* 0xA8-0xAB */
+	0xA7, 0x57, 0xCA, 0x6F, 0xCA, 0x70, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF1, /* 0xB8-0xBB */
+	0xCB, 0xD5, 0x00, 0x00, 0xA8, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */
+	0xCD, 0xF2, 0xAB, 0x6C, 0xCD, 0xF3, 0xAB, 0x6B, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x69, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xAB, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xD0, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xB0, 0xC7, 0xD4, 0x6E, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xB0, 0xCA, 0xD4, 0x6D, 0xB1, 0xE5, /* 0xD4-0xD7 */
+	0xB0, 0xC9, 0xB0, 0xC8, 0x00, 0x00, 0xB3, 0xD4, /* 0xD8-0xDB */
+	0x00, 0x00, 0xB3, 0xD3, 0xB3, 0xD2, 0xB6, 0xD2, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xB6, 0xD5, 0xB6, 0xD6, /* 0xE0-0xE3 */
+	0xB6, 0xD4, 0x00, 0x00, 0xB6, 0xD3, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xE1, 0x43, 0x00, 0x00, 0xE1, 0x44, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, /* 0xEC-0xEF */
+	0xBC, 0x45, 0xE4, 0xF4, 0x00, 0x00, 0xBE, 0xB1, /* 0xF0-0xF3 */
+	0xEC, 0xBF, 0xC0, 0x79, 0x00, 0x00, 0xF1, 0xEE, /* 0xF4-0xF7 */
+	0xC4, 0x55, 0x00, 0x00, 0xA4, 0x63, 0xA4, 0xC3, /* 0xF8-0xFB */
+	0xC9, 0x56, 0x00, 0x00, 0xA4, 0xC4, 0xA4, 0xC5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_53[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xA5, 0x5D, 0xA5, 0x5E, 0x00, 0x00, /* 0x04-0x07 */
+	0xA6, 0x49, 0xCA, 0x71, 0xCB, 0xD6, 0xCB, 0xD7, /* 0x08-0x0B */
+	0x00, 0x00, 0xAB, 0x6D, 0xD0, 0xEE, 0xB0, 0xCC, /* 0x0C-0x0F */
+	0xB0, 0xCB, 0xD8, 0x63, 0xD8, 0x62, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xA4, 0x50, 0xA4, 0xC6, 0xA5, 0x5F, /* 0x14-0x17 */
+	0x00, 0x00, 0xB0, 0xCD, 0xC9, 0x43, 0x00, 0x00, /* 0x18-0x1B */
+	0xC9, 0x6C, 0xA5, 0x60, 0x00, 0x00, 0xC9, 0xC2, /* 0x1C-0x1F */
+	0xA6, 0x4B, 0xA6, 0x4A, 0xC9, 0xC1, 0xA7, 0x58, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xAD, 0xEA, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xD4, 0x6F, 0x00, 0x00, 0xB6, 0xD7, /* 0x2C-0x2F */
+	0xE1, 0x45, 0xB9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xE8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFD, /* 0x34-0x37 */
+	0x00, 0x00, 0xA4, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xCB, 0xD8, 0xCD, 0xF4, 0xB0, 0xD0, 0xB0, 0xCE, /* 0x3C-0x3F */
+	0xB0, 0xCF, 0xA4, 0x51, 0x00, 0x00, 0xA4, 0x64, /* 0x40-0x43 */
+	0xA2, 0xCD, 0xA4, 0xCA, 0x00, 0x00, 0xA4, 0xC9, /* 0x44-0x47 */
+	0xA4, 0xC8, 0xA5, 0x63, 0xA5, 0x62, 0x00, 0x00, /* 0x48-0x4B */
+	0xC9, 0x6D, 0xC9, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xA8, 0xF5, 0xA8, 0xF2, 0xA8, 0xF4, /* 0x50-0x53 */
+	0xA8, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x6E, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0xD5, 0x00, 0x00, /* 0x58-0x5B */
+	0xA4, 0x52, 0x00, 0x00, 0xA4, 0xCB, 0x00, 0x00, /* 0x5C-0x5F */
+	0xA5, 0x65, 0xA5, 0x64, 0x00, 0x00, 0xCA, 0x72, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xF6, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xC9, 0x57, 0x00, 0x00, 0xA5, 0x67, 0xA5, 0x66, /* 0x6C-0x6F */
+	0xA6, 0x4C, 0xA6, 0x4D, 0xCA, 0x73, 0xA7, 0x59, /* 0x70-0x73 */
+	0x00, 0x00, 0xA7, 0x5A, 0x00, 0x00, 0xA8, 0xF7, /* 0x74-0x77 */
+	0xA8, 0xF8, 0xA8, 0xF9, 0x00, 0x00, 0xAB, 0x6F, /* 0x78-0x7B */
+	0xCD, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEB, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xC9, 0x44, 0x00, 0x00, /* 0x80-0x83 */
+	0xA4, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xC9, 0xC4, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0x74, 0xCA, 0x75, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xD9, 0x00, 0x00, /* 0x90-0x93 */
+	0xCB, 0xDA, 0x00, 0x00, 0xCD, 0xF7, 0xCD, 0xF6, /* 0x94-0x97 */
+	0xCD, 0xF9, 0xCD, 0xF8, 0xAB, 0x70, 0x00, 0x00, /* 0x98-0x9B */
+	0xD4, 0x70, 0xAD, 0xED, 0xD0, 0xEF, 0xAD, 0xEC, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xD8, 0x64, 0xB3, 0xD6, 0x00, 0x00, 0xD8, 0x65, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE1, 0x46, 0xB9, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xBC, 0x46, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xF1, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xC9, 0x58, 0x00, 0x00, 0xA5, 0x68, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xD1, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xA4, 0x53, 0xA4, 0x65, 0xA4, 0xCE, 0xA4, 0xCD, /* 0xC8-0xCB */
+	0x00, 0x00, 0xA4, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xA8, 0xFB, 0x00, 0x00, 0xA8, 0xFA, 0xA8, 0xFC, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x71, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEE, /* 0xDC-0xDF */
+	0x00, 0x00, 0xE8, 0xFB, 0xC2, 0x4F, 0xA4, 0x66, /* 0xE0-0xE3 */
+	0xA5, 0x6A, 0xA5, 0x79, 0xA5, 0x74, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xA5, 0x6F, 0xA5, 0x6E, 0xA5, 0x75, 0xA5, 0x73, /* 0xE8-0xEB */
+	0xA5, 0x6C, 0xA5, 0x7A, 0xA5, 0x6D, 0xA5, 0x69, /* 0xEC-0xEF */
+	0xA5, 0x78, 0xA5, 0x77, 0xA5, 0x76, 0xA5, 0x6B, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xA5, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xA5, 0x71, 0x00, 0x00, 0x00, 0x00, 0xA5, 0x7B, /* 0xF8-0xFB */
+	0xA5, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_54[512] = {
+	0x00, 0x00, 0xA6, 0x53, 0x00, 0x00, 0xA6, 0x59, /* 0x00-0x03 */
+	0xA6, 0x55, 0x00, 0x00, 0xA6, 0x5B, 0xC9, 0xC5, /* 0x04-0x07 */
+	0xA6, 0x58, 0xA6, 0x4E, 0xA6, 0x51, 0xA6, 0x54, /* 0x08-0x0B */
+	0xA6, 0x50, 0xA6, 0x57, 0xA6, 0x5A, 0xA6, 0x4F, /* 0x0C-0x0F */
+	0xA6, 0x52, 0xA6, 0x56, 0xA6, 0x5C, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xCA, 0x7E, 0xCA, 0x7B, 0x00, 0x00, 0xA7, 0x67, /* 0x18-0x1B */
+	0xCA, 0x7C, 0xA7, 0x5B, 0xA7, 0x5D, 0xA7, 0x75, /* 0x1C-0x1F */
+	0xA7, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xCA, 0xA5, 0xCA, 0x7D, 0xA7, 0x5F, 0xA7, 0x61, /* 0x24-0x27 */
+	0xCA, 0xA4, 0xA7, 0x68, 0xCA, 0x78, 0xA7, 0x74, /* 0x28-0x2B */
+	0xA7, 0x76, 0xA7, 0x5C, 0xA7, 0x6D, 0x00, 0x00, /* 0x2C-0x2F */
+	0xCA, 0x76, 0xA7, 0x73, 0x00, 0x00, 0xA7, 0x64, /* 0x30-0x33 */
+	0x00, 0x00, 0xA7, 0x6E, 0xA7, 0x6F, 0xCA, 0x77, /* 0x34-0x37 */
+	0xA7, 0x6C, 0xA7, 0x6A, 0x00, 0x00, 0xA7, 0x6B, /* 0x38-0x3B */
+	0xA7, 0x71, 0xCA, 0xA1, 0xA7, 0x5E, 0x00, 0x00, /* 0x3C-0x3F */
+	0xA7, 0x72, 0xCA, 0xA3, 0xA7, 0x66, 0xA7, 0x63, /* 0x40-0x43 */
+	0x00, 0x00, 0xCA, 0x7A, 0xA7, 0x62, 0xCA, 0xA6, /* 0x44-0x47 */
+	0xA7, 0x65, 0x00, 0x00, 0xA7, 0x69, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0x60, 0xCA, 0xA2, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xCA, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xCB, 0xEB, 0xCB, 0xEA, 0xA9, 0x4F, 0xCB, 0xED, /* 0x60-0x63 */
+	0xCB, 0xEF, 0xCB, 0xE4, 0xCB, 0xE7, 0xCB, 0xEE, /* 0x64-0x67 */
+	0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE1, /* 0x68-0x6B */
+	0xCB, 0xE5, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE9, /* 0x6C-0x6F */
+	0xCE, 0x49, 0xA9, 0x4B, 0xCE, 0x4D, 0xA8, 0xFD, /* 0x70-0x73 */
+	0xCB, 0xE6, 0xA8, 0xFE, 0xA9, 0x4C, 0xA9, 0x45, /* 0x74-0x77 */
+	0xA9, 0x41, 0x00, 0x00, 0xCB, 0xE2, 0xA9, 0x44, /* 0x78-0x7B */
+	0xA9, 0x49, 0xA9, 0x52, 0xCB, 0xE3, 0xCB, 0xDC, /* 0x7C-0x7F */
+	
+	0xA9, 0x43, 0xCB, 0xDD, 0xCB, 0xDF, 0x00, 0x00, /* 0x80-0x83 */
+	0xA9, 0x46, 0x00, 0x00, 0xA9, 0x48, 0xCB, 0xDB, /* 0x84-0x87 */
+	0xCB, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, /* 0x88-0x8B */
+	0xA9, 0x4D, 0xCB, 0xE8, 0xA9, 0x53, 0x00, 0x00, /* 0x8C-0x8F */
+	0xA9, 0x4A, 0xCB, 0xDE, 0xA9, 0x47, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xA9, 0x42, 0xA9, 0x40, 0x00, 0x00, /* 0x94-0x97 */
+	0xCB, 0xEC, 0x00, 0x00, 0xA9, 0x4E, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xCE, 0x48, 0xCD, 0xFB, 0xCE, 0x4B, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xCD, 0xFD, 0xAB, 0x78, 0xAB, 0xA8, /* 0xA4-0xA7 */
+	0xAB, 0x74, 0xAB, 0xA7, 0xAB, 0x7D, 0xAB, 0xA4, /* 0xA8-0xAB */
+	0xAB, 0x72, 0xCD, 0xFC, 0xCE, 0x43, 0xAB, 0xA3, /* 0xAC-0xAF */
+	0xCE, 0x4F, 0xAB, 0xA5, 0x00, 0x00, 0xAB, 0x79, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0x45, 0xCE, 0x42, /* 0xB4-0xB7 */
+	0xAB, 0x77, 0x00, 0x00, 0xCD, 0xFA, 0xAB, 0xA6, /* 0xB8-0xBB */
+	0xCE, 0x4A, 0xAB, 0x7C, 0xCE, 0x4C, 0xAB, 0xA9, /* 0xBC-0xBF */
+	0xAB, 0x73, 0xAB, 0x7E, 0xAB, 0x7B, 0xCE, 0x40, /* 0xC0-0xC3 */
+	0xAB, 0xA1, 0xCE, 0x46, 0xCE, 0x47, 0xAB, 0x7A, /* 0xC4-0xC7 */
+	0xAB, 0xA2, 0xAB, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xAB, 0x75, 0xCD, 0xFE, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0x4E, 0x00, 0x00, /* 0xDC-0xDF */
+	0xD1, 0x44, 0xAD, 0xFB, 0xD0, 0xF1, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xD0, 0xF6, 0xAD, 0xF4, 0xAE, 0x40, 0xD0, 0xF4, /* 0xE4-0xE7 */
+	0xAD, 0xEF, 0xAD, 0xF9, 0xAD, 0xFE, 0xD0, 0xFB, /* 0xE8-0xEB */
+	0x00, 0x00, 0xAD, 0xFA, 0xAD, 0xFD, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xD0, 0xFE, 0xAD, 0xF5, 0xD0, 0xF5, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x42, /* 0xF4-0xF7 */
+	0xD1, 0x43, 0x00, 0x00, 0xAD, 0xF7, 0xD1, 0x41, /* 0xF8-0xFB */
+	0xAD, 0xF3, 0xAE, 0x43, 0x00, 0x00, 0xD0, 0xF8, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_55[512] = {
+	0x00, 0x00, 0xAD, 0xF1, 0x00, 0x00, 0xD1, 0x46, /* 0x00-0x03 */
+	0xD0, 0xF9, 0xD0, 0xFD, 0xAD, 0xF6, 0xAE, 0x42, /* 0x04-0x07 */
+	0xD0, 0xFA, 0xAD, 0xFC, 0xD1, 0x40, 0xD1, 0x47, /* 0x08-0x0B */
+	0xD4, 0xA1, 0x00, 0x00, 0xD1, 0x45, 0xAE, 0x44, /* 0x0C-0x0F */
+	0xAD, 0xF0, 0xD0, 0xFC, 0xD0, 0xF3, 0x00, 0x00, /* 0x10-0x13 */
+	0xAD, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF2, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xF7, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xF0, 0xAE, 0x41, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0x77, 0x00, 0x00, /* 0x28-0x2B */
+	0xB0, 0xE4, 0xD4, 0xA7, 0xB0, 0xE2, 0xB0, 0xDF, /* 0x2C-0x2F */
+	0xD4, 0x7C, 0xB0, 0xDB, 0xD4, 0xA2, 0xB0, 0xE6, /* 0x30-0x33 */
+	0xD4, 0x76, 0xD4, 0x7B, 0xD4, 0x7A, 0xAD, 0xF2, /* 0x34-0x37 */
+	0xB0, 0xE1, 0xD4, 0xA5, 0x00, 0x00, 0xD4, 0xA8, /* 0x38-0x3B */
+	0xD4, 0x73, 0x00, 0x00, 0xB3, 0xE8, 0x00, 0x00, /* 0x3C-0x3F */
+	0xD4, 0xA9, 0xB0, 0xE7, 0x00, 0x00, 0xB0, 0xD9, /* 0x40-0x43 */
+	0xB0, 0xD6, 0xD4, 0x7E, 0xB0, 0xD3, 0x00, 0x00, /* 0x44-0x47 */
+	0xD4, 0xA6, 0x00, 0x00, 0xB0, 0xDA, 0xD4, 0xAA, /* 0x48-0x4B */
+	0x00, 0x00, 0xD4, 0x74, 0xD4, 0xA4, 0xB0, 0xDD, /* 0x4C-0x4F */
+	0xD4, 0x75, 0xD4, 0x78, 0xD4, 0x7D, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xB0, 0xDE, 0xB0, 0xDC, 0xB0, 0xE8, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xB0, 0xE3, 0x00, 0x00, 0xB0, 0xD7, 0xB1, 0xD2, /* 0x5C-0x5F */
+	0x00, 0x00, 0xB0, 0xD8, 0xD4, 0x79, 0xB0, 0xE5, /* 0x60-0x63 */
+	0xB0, 0xE0, 0xD4, 0xA3, 0xB0, 0xD5, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xB0, 0xD4, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xD4, 0x71, 0xD4, 0x72, 0xD8, 0x6A, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xD7, /* 0x78-0x7B */
+	0xB3, 0xDA, 0xD8, 0x75, 0xB3, 0xEE, 0xD8, 0x78, /* 0x7C-0x7F */
+	
+	0xB3, 0xD8, 0xD8, 0x71, 0xB3, 0xDE, 0xB3, 0xE4, /* 0x80-0x83 */
+	0xB5, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xE2, /* 0x84-0x87 */
+	0xD8, 0x6E, 0xB3, 0xEF, 0xB3, 0xDB, 0xB3, 0xE3, /* 0x88-0x8B */
+	0xD8, 0x76, 0xDC, 0xD7, 0xD8, 0x7B, 0xD8, 0x6F, /* 0x8C-0x8F */
+	0x00, 0x00, 0xD8, 0x66, 0xD8, 0x73, 0xD8, 0x6D, /* 0x90-0x93 */
+	0xB3, 0xE1, 0xD8, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xB3, 0xDD, 0xB3, 0xF1, 0xB3, 0xEA, 0x00, 0x00, /* 0x98-0x9B */
+	0xB3, 0xDF, 0xB3, 0xDC, 0x00, 0x00, 0xB3, 0xE7, /* 0x9C-0x9F */
+	0x00, 0x00, 0xD8, 0x7A, 0xD8, 0x6C, 0xD8, 0x72, /* 0xA0-0xA3 */
+	0xD8, 0x74, 0xD8, 0x68, 0xD8, 0x77, 0xB3, 0xD9, /* 0xA4-0xA7 */
+	0xD8, 0x67, 0x00, 0x00, 0xB3, 0xE0, 0xB3, 0xF0, /* 0xA8-0xAB */
+	0xB3, 0xEC, 0xD8, 0x69, 0xB3, 0xE6, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xB3, 0xED, 0xB3, 0xE9, 0xB3, 0xE5, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xD8, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xEB, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xD5, /* 0xBC-0xBF */
+	0xDC, 0xD1, 0x00, 0x00, 0xDC, 0xE0, 0xDC, 0xCA, /* 0xC0-0xC3 */
+	0xDC, 0xD3, 0xB6, 0xE5, 0xB6, 0xE6, 0xB6, 0xDE, /* 0xC4-0xC7 */
+	0xDC, 0xDC, 0xB6, 0xE8, 0xDC, 0xCF, 0xDC, 0xCE, /* 0xC8-0xCB */
+	0xDC, 0xCC, 0xDC, 0xDE, 0xB6, 0xDC, 0xDC, 0xD8, /* 0xCC-0xCF */
+	0xDC, 0xCD, 0xB6, 0xDF, 0xDC, 0xD6, 0xB6, 0xDA, /* 0xD0-0xD3 */
+	0xDC, 0xD2, 0xDC, 0xD9, 0xDC, 0xDB, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xDC, 0xDF, 0xB6, 0xE3, 0xDC, 0xCB, /* 0xD8-0xDB */
+	0xB6, 0xDD, 0xDC, 0xD0, 0x00, 0x00, 0xB6, 0xD8, /* 0xDC-0xDF */
+	0x00, 0x00, 0xB6, 0xE4, 0xDC, 0xDA, 0xB6, 0xE0, /* 0xE0-0xE3 */
+	0xB6, 0xE1, 0xB6, 0xE7, 0xB6, 0xDB, 0xA2, 0x5F, /* 0xE4-0xE7 */
+	0xB6, 0xD9, 0xDC, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xE2, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xDC, 0xDD, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xB9, 0xCD, 0xB9, 0xC8, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE1, 0x55, 0xE1, 0x51, 0x00, 0x00, /* 0xF8-0xFB */
+	0xE1, 0x4B, 0xB9, 0xC2, 0xB9, 0xBE, 0xE1, 0x54, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_56[512] = {
+	0xB9, 0xBF, 0xE1, 0x4E, 0xE1, 0x50, 0x00, 0x00, /* 0x00-0x03 */
+	0xE1, 0x53, 0x00, 0x00, 0xB9, 0xC4, 0x00, 0x00, /* 0x04-0x07 */
+	0xB9, 0xCB, 0xB9, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xE1, 0x49, 0xB9, 0xC6, 0xB9, 0xC7, 0xE1, 0x4C, /* 0x0C-0x0F */
+	0xB9, 0xCC, 0x00, 0x00, 0xE1, 0x4A, 0xE1, 0x4F, /* 0x10-0x13 */
+	0xB9, 0xC3, 0xE1, 0x48, 0xB9, 0xC9, 0xB9, 0xC1, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB9, 0xC0, /* 0x18-0x1B */
+	0xE1, 0x4D, 0xE1, 0x52, 0x00, 0x00, 0xB9, 0xCA, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x47, /* 0x24-0x27 */
+	0x00, 0x00, 0xBC, 0x4D, 0xE5, 0x47, 0x00, 0x00, /* 0x28-0x2B */
+	0xE5, 0x44, 0x00, 0x00, 0xBC, 0x47, 0xBC, 0x53, /* 0x2C-0x2F */
+	0xBC, 0x54, 0x00, 0x00, 0xBC, 0x4A, 0xE5, 0x42, /* 0x30-0x33 */
+	0xBC, 0x4C, 0xE4, 0xF9, 0xBC, 0x52, 0x00, 0x00, /* 0x34-0x37 */
+	0xE5, 0x46, 0xBC, 0x49, 0xE5, 0x48, 0xBC, 0x48, /* 0x38-0x3B */
+	0x00, 0x00, 0xE5, 0x43, 0xE5, 0x45, 0xBC, 0x4B, /* 0x3C-0x3F */
+	0xE5, 0x41, 0xE4, 0xFA, 0xE4, 0xF7, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xD8, 0x6B, 0xE4, 0xFD, 0x00, 0x00, /* 0x44-0x47 */
+	0xE4, 0xF6, 0xE4, 0xFC, 0xE4, 0xFB, 0x00, 0x00, /* 0x48-0x4B */
+	0xE4, 0xF8, 0x00, 0x00, 0xBC, 0x4F, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x4E, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x50, /* 0x54-0x57 */
+	0xE4, 0xFE, 0xBE, 0xB2, 0xE5, 0x40, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x45, 0x00, 0x00, /* 0x5C-0x5F */
+	0xE8, 0xFD, 0x00, 0x00, 0xBE, 0xBE, 0xE9, 0x42, /* 0x60-0x63 */
+	0xBE, 0xB6, 0xBE, 0xBA, 0xE9, 0x41, 0x00, 0x00, /* 0x64-0x67 */
+	0xBE, 0xB9, 0xBE, 0xB5, 0xBE, 0xB8, 0xBE, 0xB3, /* 0x68-0x6B */
+	0xBE, 0xBD, 0xE9, 0x43, 0xE8, 0xFE, 0xBE, 0xBC, /* 0x6C-0x6F */
+	0xE8, 0xFC, 0xBE, 0xBB, 0xE9, 0x44, 0xE9, 0x40, /* 0x70-0x73 */
+	0xBC, 0x51, 0x00, 0x00, 0xBE, 0xBF, 0xE9, 0x46, /* 0x74-0x77 */
+	0xBE, 0xB7, 0xBE, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xC6, 0xEC, 0xC8, /* 0x7C-0x7F */
+	
+	0xC0, 0x7B, 0xEC, 0xC9, 0xEC, 0xC7, 0xEC, 0xC5, /* 0x80-0x83 */
+	0xEC, 0xC4, 0xC0, 0x7D, 0xEC, 0xC3, 0xC0, 0x7E, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xEC, 0xC1, 0xEC, 0xC2, 0xC0, 0x7A, 0xC0, 0xA1, /* 0x8C-0x8F */
+	0xC0, 0x7C, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC0, /* 0x90-0x93 */
+	0x00, 0x00, 0xC2, 0x50, 0x00, 0x00, 0xEF, 0xBC, /* 0x94-0x97 */
+	0xEF, 0xBA, 0xEF, 0xBF, 0xEF, 0xBD, 0x00, 0x00, /* 0x98-0x9B */
+	0xEF, 0xBB, 0xEF, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xC3, 0x60, 0xF1, 0xF2, 0xF1, 0xF3, /* 0xA4-0xA7 */
+	0xC4, 0x56, 0x00, 0x00, 0xF1, 0xF4, 0xF1, 0xF0, /* 0xA8-0xAB */
+	0xF1, 0xF5, 0xF1, 0xF1, 0xC2, 0x51, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xFE, 0xF4, 0x41, /* 0xB0-0xB3 */
+	0xC4, 0x59, 0xF4, 0x40, 0xC4, 0x58, 0xC4, 0x57, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xC4, 0x5A, 0xF5, 0xC5, 0xF5, 0xC6, 0x00, 0x00, /* 0xBC-0xBF */
+	0xC4, 0xDA, 0xC4, 0xD9, 0xC4, 0xDB, 0xF5, 0xC4, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xF6, 0xD8, 0xF6, 0xD7, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xC5, 0x6D, 0xC5, 0x6F, 0xC5, 0x6E, 0xF6, 0xD9, /* 0xC8-0xCB */
+	0xC5, 0xC8, 0xF8, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xC5, 0xF1, 0x00, 0x00, 0xF8, 0xA5, /* 0xD0-0xD3 */
+	0xF8, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x49, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xA5, 0x7D, 0xA5, 0x7C, /* 0xD8-0xDB */
+	0x00, 0x00, 0xA6, 0x5F, 0xA6, 0x5E, 0xC9, 0xC7, /* 0xDC-0xDF */
+	0xA6, 0x5D, 0xC9, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xA7, 0x79, 0xCA, 0xA9, 0x00, 0x00, 0xCA, 0xA8, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0x77, 0xA7, 0x7A, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xA7, 0x00, 0x00, /* 0xEC-0xEF */
+	0xA7, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF0, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xCB, 0xF1, 0xA9, 0x54, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xAA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_57[512] = {
+	0x00, 0x00, 0xD1, 0x48, 0xD1, 0x49, 0xAE, 0x45, /* 0x00-0x03 */
+	0xAE, 0x46, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAC, /* 0x04-0x07 */
+	0xB0, 0xE9, 0xB0, 0xEB, 0xD4, 0xAB, 0xB0, 0xEA, /* 0x08-0x0B */
+	0xD8, 0x7C, 0xB3, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xB6, 0xE9, 0xB6, 0xEA, /* 0x10-0x13 */
+	0xDC, 0xE1, 0x00, 0x00, 0xB9, 0xCF, 0x00, 0x00, /* 0x14-0x17 */
+	0xB9, 0xCE, 0x00, 0x00, 0xE5, 0x49, 0xE9, 0x48, /* 0x18-0x1B */
+	0xE9, 0x47, 0x00, 0x00, 0xF9, 0x6B, 0xA4, 0x67, /* 0x1C-0x1F */
+	0xC9, 0x59, 0x00, 0x00, 0xC9, 0x6E, 0xC9, 0x6F, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xA6, 0x62, 0xA6, 0x66, 0xC9, 0xC9, 0x00, 0x00, /* 0x28-0x2B */
+	0xA6, 0x64, 0xA6, 0x63, 0xC9, 0xC8, 0xA6, 0x65, /* 0x2C-0x2F */
+	0xA6, 0x61, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x60, /* 0x30-0x33 */
+	0xC9, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA6, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0xA3, 0x00, 0x00, /* 0x3C-0x3F */
+	0xA7, 0x7D, 0xCA, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xCA, 0xAB, 0x00, 0x00, 0xA7, 0xA1, /* 0x44-0x47 */
+	0x00, 0x00, 0xCA, 0xAD, 0xA7, 0x7B, 0xCA, 0xAE, /* 0x48-0x4B */
+	0xCA, 0xAC, 0xA7, 0x7E, 0xA7, 0xA2, 0xA7, 0xA5, /* 0x4C-0x4F */
+	0xA7, 0xA4, 0xA7, 0x7C, 0xCA, 0xAF, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xA9, 0x59, 0xCB, 0xFE, 0x00, 0x00, /* 0x60-0x63 */
+	0xA9, 0x5B, 0x00, 0x00, 0xA9, 0x5A, 0x00, 0x00, /* 0x64-0x67 */
+	0xCC, 0x40, 0xA9, 0x58, 0xA9, 0x57, 0xCB, 0xF5, /* 0x68-0x6B */
+	0x00, 0x00, 0xCB, 0xF4, 0x00, 0x00, 0xCB, 0xF2, /* 0x6C-0x6F */
+	0xCB, 0xF7, 0xCB, 0xF6, 0xCB, 0xF3, 0xCB, 0xFC, /* 0x70-0x73 */
+	0xCB, 0xFD, 0xCB, 0xFA, 0xCB, 0xF8, 0xA9, 0x56, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xFB, /* 0x78-0x7B */
+	0xA9, 0x5C, 0xCC, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xCB, 0xF9, 0x00, 0x00, 0xAB, 0xAB, 0xA9, 0x55, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xAC, /* 0x88-0x8B */
+	0xCE, 0x54, 0x00, 0x00, 0x00, 0x00, 0xCE, 0x5A, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB2, /* 0x90-0x93 */
+	0xCE, 0x58, 0xCE, 0x5E, 0x00, 0x00, 0xCE, 0x55, /* 0x94-0x97 */
+	0xCE, 0x59, 0xCE, 0x5B, 0xCE, 0x5D, 0xCE, 0x57, /* 0x98-0x9B */
+	0x00, 0x00, 0xCE, 0x56, 0xCE, 0x51, 0xCE, 0x52, /* 0x9C-0x9F */
+	0xAB, 0xAD, 0x00, 0x00, 0xAB, 0xAF, 0xAB, 0xAE, /* 0xA0-0xA3 */
+	0xCE, 0x53, 0xCE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xAB, 0xB1, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xCE, 0x50, 0xD1, 0x53, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xD1, 0x52, 0xD1, 0x57, 0xD1, 0x4E, 0x00, 0x00, /* 0xB8-0xBB */
+	0xD1, 0x51, 0xD1, 0x50, 0x00, 0x00, 0xD1, 0x54, /* 0xBC-0xBF */
+	0x00, 0x00, 0xD1, 0x58, 0xAE, 0x47, 0xAE, 0x4A, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0x4F, 0xD1, 0x55, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x49, /* 0xC8-0xCB */
+	0xD1, 0x4A, 0x00, 0x00, 0xAB, 0xB0, 0xD4, 0xBA, /* 0xCC-0xCF */
+	0xD1, 0x56, 0x00, 0x00, 0xD1, 0x4D, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xAE, 0x48, 0xD1, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xD4, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xEC, /* 0xDC-0xDF */
+	0xB0, 0xF0, 0xD4, 0xC1, 0xD4, 0xAF, 0xD4, 0xBD, /* 0xE0-0xE3 */
+	0xB0, 0xF1, 0xD4, 0xBF, 0x00, 0x00, 0xD4, 0xC5, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xD4, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xD4, 0xC0, 0xD4, 0xB4, 0xD4, 0xBC, 0x00, 0x00, /* 0xEC-0xEF */
+	0xD4, 0xCA, 0xD4, 0xC8, 0xD4, 0xBE, 0xD4, 0xB9, /* 0xF0-0xF3 */
+	0xD4, 0xB2, 0xD8, 0xA6, 0xD4, 0xB0, 0xB0, 0xF5, /* 0xF4-0xF7 */
+	0xD4, 0xB7, 0xB0, 0xF6, 0xB0, 0xF2, 0xD4, 0xAD, /* 0xF8-0xFB */
+	0xD4, 0xC3, 0xD4, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_58[512] = {
+	0xD4, 0xB3, 0xD4, 0xC6, 0xB0, 0xF3, 0x00, 0x00, /* 0x00-0x03 */
+	0xD4, 0xCC, 0xB0, 0xED, 0xB0, 0xEF, 0xD4, 0xBB, /* 0x04-0x07 */
+	0xD4, 0xB6, 0xAE, 0x4B, 0xB0, 0xEE, 0xD4, 0xB8, /* 0x08-0x0B */
+	0xD4, 0xC7, 0xD4, 0xCB, 0xD4, 0xC2, 0x00, 0x00, /* 0x0C-0x0F */
+	0xD4, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xD4, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xD8, 0xA1, 0x00, 0x00, 0xD8, 0xAA, /* 0x18-0x1B */
+	0xD8, 0xA9, 0xB3, 0xFA, 0xD8, 0xA2, 0x00, 0x00, /* 0x1C-0x1F */
+	0xB3, 0xFB, 0xB3, 0xF9, 0x00, 0x00, 0xD8, 0xA4, /* 0x20-0x23 */
+	0xB3, 0xF6, 0xD8, 0xA8, 0x00, 0x00, 0xD8, 0xA3, /* 0x24-0x27 */
+	0xD8, 0xA5, 0xD8, 0x7D, 0xB3, 0xF4, 0x00, 0x00, /* 0x28-0x2B */
+	0xD8, 0xB2, 0xD8, 0xB1, 0xD8, 0xAE, 0xB3, 0xF3, /* 0x2C-0x2F */
+	0xB3, 0xF7, 0xB3, 0xF8, 0xD1, 0x4B, 0xD8, 0xAB, /* 0x30-0x33 */
+	0xB3, 0xF5, 0xB0, 0xF4, 0xD8, 0xAD, 0xD8, 0x7E, /* 0x34-0x37 */
+	0xD8, 0xB0, 0xD8, 0xAF, 0x00, 0x00, 0xD8, 0xB3, /* 0x38-0x3B */
+	0x00, 0x00, 0xDC, 0xEF, 0x00, 0x00, 0xD8, 0xAC, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xD8, 0xA7, 0xDC, 0xE7, 0xB6, 0xF4, 0xB6, 0xF7, /* 0x48-0x4B */
+	0xB6, 0xF2, 0xDC, 0xE6, 0xDC, 0xEA, 0xDC, 0xE5, /* 0x4C-0x4F */
+	0x00, 0x00, 0xB6, 0xEC, 0xB6, 0xF6, 0xDC, 0xE2, /* 0x50-0x53 */
+	0xB6, 0xF0, 0xDC, 0xE9, 0x00, 0x00, 0xB6, 0xEE, /* 0x54-0x57 */
+	0xB6, 0xED, 0xDC, 0xEC, 0xB6, 0xEF, 0xDC, 0xEE, /* 0x58-0x5B */
+	0x00, 0x00, 0xDC, 0xEB, 0xB6, 0xEB, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xB6, 0xF5, 0xDC, 0xF0, /* 0x60-0x63 */
+	0xDC, 0xE4, 0xDC, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xDC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF1, /* 0x68-0x6B */
+	0x00, 0x00, 0xB6, 0xF3, 0x00, 0x00, 0xDC, 0xE8, /* 0x6C-0x6F */
+	0x00, 0x00, 0xDC, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xE1, 0x5D, 0xB9, 0xD0, 0xE1, 0x63, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xB9, 0xD5, 0xE1, 0x5F, 0xE1, 0x66, /* 0x78-0x7B */
+	0xE1, 0x57, 0xB9, 0xD7, 0xB9, 0xD1, 0xE1, 0x5C, /* 0x7C-0x7F */
+	
+	0xBC, 0x55, 0xE1, 0x5B, 0xE1, 0x64, 0xB9, 0xD2, /* 0x80-0x83 */
+	0x00, 0x00, 0xB9, 0xD6, 0xE1, 0x5A, 0xE1, 0x60, /* 0x84-0x87 */
+	0xE1, 0x65, 0xE1, 0x56, 0xB9, 0xD4, 0xE1, 0x5E, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0x62, 0xE1, 0x68, /* 0x8C-0x8F */
+	0xE1, 0x58, 0xE1, 0x61, 0x00, 0x00, 0xB9, 0xD3, /* 0x90-0x93 */
+	0xE1, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xE1, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xBC, 0x59, 0xE5, 0x4B, 0xBC, 0x57, 0xBC, 0x56, /* 0x9C-0x9F */
+	0xE5, 0x4D, 0xE5, 0x52, 0x00, 0x00, 0xE5, 0x4E, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xE5, 0x51, 0xBC, 0x5C, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xBE, 0xA5, 0xBC, 0x5B, 0x00, 0x00, 0xE5, 0x4A, /* 0xA8-0xAB */
+	0xE5, 0x50, 0x00, 0x00, 0xBC, 0x5A, 0xE5, 0x4F, /* 0xAC-0xAF */
+	0x00, 0x00, 0xE5, 0x4C, 0x00, 0x00, 0xBC, 0x58, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x4D, 0xF9, 0xD9, /* 0xB8-0xBB */
+	0xE9, 0x4F, 0xE9, 0x4A, 0xBE, 0xC1, 0xE9, 0x4C, /* 0xBC-0xBF */
+	0x00, 0x00, 0xBE, 0xC0, 0xE9, 0x4E, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xBE, 0xC3, 0xE9, 0x50, 0xBE, 0xC2, /* 0xC4-0xC7 */
+	0xE9, 0x49, 0xE9, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0xA5, 0xEC, 0xCC, /* 0xCC-0xCF */
+	0x00, 0x00, 0xC0, 0xA4, 0xEC, 0xCD, 0xC0, 0xA3, /* 0xD0-0xD3 */
+	0xEC, 0xCB, 0xC0, 0xA2, 0xEC, 0xCA, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xC2, 0x53, 0xC2, 0x52, 0xF1, 0xF6, 0xF1, 0xF8, /* 0xD8-0xDB */
+	0x00, 0x00, 0xF1, 0xF7, 0xC3, 0x61, 0xC3, 0x62, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xC3, 0x63, 0xF4, 0x42, /* 0xE0-0xE3 */
+	0xC4, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xD3, /* 0xE4-0xE7 */
+	0xF7, 0xD2, 0xC5, 0xF2, 0x00, 0x00, 0xA4, 0x68, /* 0xE8-0xEB */
+	0xA4, 0xD0, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA7, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xCE, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xB3, 0xFC, 0xB3, 0xFD, 0x00, 0x00, /* 0xF8-0xFB */
+	0xDC, 0xF2, 0xB9, 0xD8, 0xE1, 0x69, 0xE5, 0x53, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_59[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x5A, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xB0, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xCC, 0x42, 0xCE, 0x60, 0xD1, 0x59, 0xAE, 0x4C, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xF9, 0x00, 0x00, /* 0x10-0x13 */
+	0xC4, 0xDC, 0xA4, 0x69, 0xA5, 0x7E, 0xC9, 0x70, /* 0x14-0x17 */
+	0x00, 0x00, 0xA6, 0x67, 0xA6, 0x68, 0x00, 0x00, /* 0x18-0x1B */
+	0xA9, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xB0, 0xF7, 0x00, 0x00, 0xB9, 0xDA, 0x00, 0x00, /* 0x20-0x23 */
+	0xB9, 0xDB, 0xB9, 0xD9, 0x00, 0x00, 0xA4, 0x6A, /* 0x24-0x27 */
+	0x00, 0x00, 0xA4, 0xD1, 0xA4, 0xD3, 0xA4, 0xD2, /* 0x28-0x2B */
+	0xC9, 0x5B, 0xA4, 0xD4, 0xA5, 0xA1, 0xC9, 0x71, /* 0x2C-0x2F */
+	0x00, 0x00, 0xA5, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x69, /* 0x34-0x37 */
+	0xA6, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xC9, 0xCB, 0x00, 0x00, 0xA7, 0xA8, 0x00, 0x00, /* 0x3C-0x3F */
+	0xCA, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xA9, 0x61, 0xCC, 0x43, 0x00, 0x00, 0xA9, 0x5F, /* 0x44-0x47 */
+	0xA9, 0x60, 0xA9, 0x5E, 0xD1, 0x5A, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xAB, 0xB6, 0xAB, 0xB5, /* 0x4C-0x4F */
+	0xAB, 0xB7, 0xAB, 0xB4, 0x00, 0x00, 0xCE, 0x61, /* 0x50-0x53 */
+	0xA9, 0x62, 0xAB, 0xB3, 0x00, 0x00, 0xAE, 0x4D, /* 0x54-0x57 */
+	0xAE, 0x4E, 0x00, 0x00, 0xAE, 0x4F, 0x00, 0x00, /* 0x58-0x5B */
+	0xD4, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xB3, 0xFE, 0xD8, 0xB4, 0xB0, 0xF8, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF8, /* 0x64-0x67 */
+	0x00, 0x00, 0xB9, 0xDD, 0xB9, 0xDC, 0xE1, 0x6A, /* 0x68-0x6B */
+	0x00, 0x00, 0xBC, 0x5D, 0xBE, 0xC4, 0x00, 0x00, /* 0x6C-0x6F */
+	0xEF, 0xC0, 0xF6, 0xDA, 0xF7, 0xD4, 0xA4, 0x6B, /* 0x70-0x73 */
+	0xA5, 0xA3, 0x00, 0x00, 0xA5, 0xA4, 0xC9, 0xD1, /* 0x74-0x77 */
+	0xA6, 0x6C, 0xA6, 0x6F, 0x00, 0x00, 0xC9, 0xCF, /* 0x78-0x7B */
+	0xC9, 0xCD, 0xA6, 0x6E, 0xC9, 0xD0, 0xC9, 0xD2, /* 0x7C-0x7F */
+	
+	0xC9, 0xCC, 0xA6, 0x71, 0xA6, 0x70, 0xA6, 0x6D, /* 0x80-0x83 */
+	0xA6, 0x6B, 0xC9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0xB3, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xA7, 0xB0, 0xCA, 0xB6, 0xCA, 0xB9, /* 0x8C-0x8F */
+	0xCA, 0xB8, 0x00, 0x00, 0xA7, 0xAA, 0xA7, 0xB2, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0xAF, 0xCA, 0xB5, /* 0x94-0x97 */
+	0xCA, 0xB3, 0xA7, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xA7, 0xA9, 0xA7, 0xAC, 0x00, 0x00, /* 0x9C-0x9F */
+	0xCA, 0xB4, 0xCA, 0xBB, 0xCA, 0xB7, 0xA7, 0xAD, /* 0xA0-0xA3 */
+	0xA7, 0xB1, 0xA7, 0xB4, 0xCA, 0xB2, 0xCA, 0xBA, /* 0xA4-0xA7 */
+	0xA7, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xA9, 0x67, 0xA9, 0x6F, /* 0xAC-0xAF */
+	0x00, 0x00, 0xCC, 0x4F, 0xCC, 0x48, 0xA9, 0x70, /* 0xB0-0xB3 */
+	0xCC, 0x53, 0xCC, 0x44, 0xCC, 0x4B, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xA9, 0x66, 0xCC, 0x45, 0xA9, 0x64, /* 0xB8-0xBB */
+	0xCC, 0x4C, 0xCC, 0x50, 0xA9, 0x63, 0x00, 0x00, /* 0xBC-0xBF */
+	0xCC, 0x51, 0xCC, 0x4A, 0x00, 0x00, 0xCC, 0x4D, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xA9, 0x72, 0xA9, 0x69, 0xCC, 0x54, /* 0xC4-0xC7 */
+	0xCC, 0x52, 0x00, 0x00, 0xA9, 0x6E, 0xA9, 0x6C, /* 0xC8-0xCB */
+	0xCC, 0x49, 0xA9, 0x6B, 0xCC, 0x47, 0xCC, 0x46, /* 0xCC-0xCF */
+	0xA9, 0x6A, 0xA9, 0x68, 0xA9, 0x71, 0xA9, 0x6D, /* 0xD0-0xD3 */
+	0xA9, 0x65, 0x00, 0x00, 0xCC, 0x4E, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xAB, 0xB9, 0x00, 0x00, 0xAB, 0xC0, 0xCE, 0x6F, /* 0xD8-0xDB */
+	0xAB, 0xB8, 0xCE, 0x67, 0xCE, 0x63, 0x00, 0x00, /* 0xDC-0xDF */
+	0xCE, 0x73, 0xCE, 0x62, 0x00, 0x00, 0xAB, 0xBB, /* 0xE0-0xE3 */
+	0xCE, 0x6C, 0xAB, 0xBE, 0xAB, 0xC1, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xAB, 0xBC, 0xCE, 0x70, 0xAB, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */
+	0xAE, 0x56, 0xCE, 0x76, 0xCE, 0x64, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xCE, 0x66, 0xCE, 0x6D, 0xCE, 0x71, /* 0xF0-0xF3 */
+	0xCE, 0x75, 0xCE, 0x72, 0xCE, 0x6B, 0xCE, 0x6E, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0x68, 0xAB, 0xC3, /* 0xF8-0xFB */
+	0xCE, 0x6A, 0xCE, 0x69, 0xCE, 0x74, 0xAB, 0xBA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5A[512] = {
+	0xCE, 0x65, 0xAB, 0xC2, 0x00, 0x00, 0xAB, 0xBD, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xAE, 0x5C, 0xD1, 0x62, 0x00, 0x00, /* 0x08-0x0B */
+	0xAE, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x60, /* 0x0C-0x0F */
+	0x00, 0x00, 0xAE, 0x50, 0x00, 0x00, 0xAE, 0x55, /* 0x10-0x13 */
+	0x00, 0x00, 0xD1, 0x5F, 0xD1, 0x5C, 0xD1, 0x61, /* 0x14-0x17 */
+	0xAE, 0x51, 0xD1, 0x5B, 0x00, 0x00, 0xAE, 0x54, /* 0x18-0x1B */
+	0xAE, 0x52, 0x00, 0x00, 0xD1, 0x63, 0xAE, 0x53, /* 0x1C-0x1F */
+	0xAE, 0x57, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x58, /* 0x20-0x23 */
+	0x00, 0x00, 0xAE, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xAE, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xD1, 0x5D, 0xD1, 0x5E, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x64, /* 0x30-0x33 */
+	0x00, 0x00, 0xD4, 0xD4, 0xB0, 0xF9, 0xD8, 0xC2, /* 0x34-0x37 */
+	0xD4, 0xD3, 0xD4, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xB1, 0x40, 0x00, 0x00, 0xD4, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */
+	0xB0, 0xFE, 0xB0, 0xFA, 0xD4, 0xED, 0xD4, 0xDD, /* 0x40-0x43 */
+	0xD4, 0xE0, 0x00, 0x00, 0xB1, 0x43, 0xD4, 0xEA, /* 0x44-0x47 */
+	0xD4, 0xE2, 0xB0, 0xFB, 0xB1, 0x44, 0x00, 0x00, /* 0x48-0x4B */
+	0xD4, 0xE7, 0xD4, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xD4, 0xD6, 0xD4, 0xEB, 0xD4, 0xDF, 0xD4, 0xDA, /* 0x50-0x53 */
+	0x00, 0x00, 0xD4, 0xD0, 0xD4, 0xEC, 0xD4, 0xDC, /* 0x54-0x57 */
+	0xD4, 0xCF, 0x00, 0x00, 0xB1, 0x42, 0xD4, 0xE1, /* 0x58-0x5B */
+	0xD4, 0xEE, 0xD4, 0xDE, 0xD4, 0xD2, 0xD4, 0xD7, /* 0x5C-0x5F */
+	0xD4, 0xCE, 0x00, 0x00, 0xB1, 0x41, 0x00, 0x00, /* 0x60-0x63 */
+	0xD4, 0xDB, 0xD4, 0xD8, 0xB0, 0xFC, 0xD4, 0xD1, /* 0x64-0x67 */
+	0x00, 0x00, 0xD4, 0xE9, 0xB0, 0xFD, 0x00, 0x00, /* 0x68-0x6B */
+	0xD4, 0xD9, 0xD4, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xD4, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x40, /* 0x74-0x77 */
+	0xD8, 0xBB, 0x00, 0x00, 0xD8, 0xB8, 0xD8, 0xC9, /* 0x78-0x7B */
+	0xD8, 0xBD, 0xD8, 0xCA, 0x00, 0x00, 0xB4, 0x42, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC6, /* 0x80-0x83 */
+	0xD8, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xC4, 0xD8, 0xC7, /* 0x88-0x8B */
+	0xD8, 0xCB, 0x00, 0x00, 0xD4, 0xE3, 0xD8, 0xCD, /* 0x8C-0x8F */
+	0xDD, 0x47, 0x00, 0x00, 0xB4, 0x43, 0xD8, 0xCE, /* 0x90-0x93 */
+	0xD8, 0xB6, 0xD8, 0xC0, 0x00, 0x00, 0xD8, 0xC5, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xB4, 0x41, 0xB4, 0x44, /* 0x98-0x9B */
+	0xD8, 0xCC, 0xD8, 0xCF, 0xD8, 0xBA, 0xD8, 0xB7, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xB9, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xD8, 0xBE, 0xD8, 0xBC, 0xB4, 0x45, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xD8, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xD8, 0xBF, 0x00, 0x00, 0xD8, 0xC1, 0xD8, 0xB5, /* 0xAC-0xAF */
+	0xDC, 0xFA, 0xDC, 0xF8, 0xB7, 0x42, 0xB7, 0x40, /* 0xB0-0xB3 */
+	0xDD, 0x43, 0xDC, 0xF9, 0xDD, 0x44, 0xDD, 0x40, /* 0xB4-0xB7 */
+	0xDC, 0xF7, 0xDD, 0x46, 0xDC, 0xF6, 0xDC, 0xFD, /* 0xB8-0xBB */
+	0xB6, 0xFE, 0xB6, 0xFD, 0xB6, 0xFC, 0xDC, 0xFB, /* 0xBC-0xBF */
+	0xDD, 0x41, 0xB6, 0xF9, 0xB7, 0x41, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xDC, 0xF4, 0x00, 0x00, 0xDC, 0xFE, 0xDC, 0xF3, /* 0xC4-0xC7 */
+	0xDC, 0xFC, 0xB6, 0xFA, 0xDD, 0x42, 0xDC, 0xF5, /* 0xC8-0xCB */
+	0xB6, 0xFB, 0xDD, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE1, 0x6E, 0xB9, 0xE2, 0xB9, 0xE1, /* 0xD4-0xD7 */
+	0xB9, 0xE3, 0xE1, 0x7A, 0xE1, 0x70, 0xE1, 0x76, /* 0xD8-0xDB */
+	0xE1, 0x6B, 0xE1, 0x79, 0xE1, 0x78, 0xE1, 0x7C, /* 0xDC-0xDF */
+	0xE1, 0x75, 0xB9, 0xDE, 0xE1, 0x74, 0xB9, 0xE4, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE1, 0x6D, 0xB9, 0xDF, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xE1, 0x7B, 0xB9, 0xE0, 0xE1, 0x6F, 0xE1, 0x72, /* 0xE8-0xEB */
+	0xE1, 0x77, 0xE1, 0x71, 0xE1, 0x6C, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x73, /* 0xF0-0xF3 */
+	0xE5, 0x55, 0xBC, 0x61, 0xE5, 0x58, 0xE5, 0x57, /* 0xF4-0xF7 */
+	0xE5, 0x5A, 0xE5, 0x5C, 0xF9, 0xDC, 0xBC, 0x5F, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE5, 0x56, 0x00, 0x00, 0xE5, 0x54, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5B[512] = {
+	0x00, 0x00, 0xE5, 0x5D, 0xE5, 0x5B, 0xE5, 0x59, /* 0x00-0x03 */
+	0x00, 0x00, 0xE5, 0x5F, 0x00, 0x00, 0xE5, 0x5E, /* 0x04-0x07 */
+	0xBC, 0x63, 0xBC, 0x5E, 0x00, 0x00, 0xBC, 0x60, /* 0x08-0x0B */
+	0xBC, 0x62, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x60, /* 0x0C-0x0F */
+	0xE9, 0x57, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x56, /* 0x10-0x13 */
+	0xE9, 0x55, 0x00, 0x00, 0xE9, 0x58, 0xE9, 0x51, /* 0x14-0x17 */
+	0x00, 0x00, 0xE9, 0x52, 0xE9, 0x5A, 0xE9, 0x53, /* 0x18-0x1B */
+	0x00, 0x00, 0xBE, 0xC5, 0xE9, 0x5C, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE9, 0x5B, 0xE9, 0x54, 0x00, 0x00, 0xEC, 0xD1, /* 0x20-0x23 */
+	0xC0, 0xA8, 0xEC, 0xCF, 0xEC, 0xD4, 0xEC, 0xD3, /* 0x24-0x27 */
+	0xE9, 0x59, 0x00, 0x00, 0xC0, 0xA7, 0x00, 0x00, /* 0x28-0x2B */
+	0xEC, 0xD2, 0xEC, 0xCE, 0xEC, 0xD6, 0xEC, 0xD5, /* 0x2C-0x2F */
+	0xC0, 0xA6, 0x00, 0x00, 0xEC, 0xD0, 0x00, 0x00, /* 0x30-0x33 */
+	0xBE, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xC2, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xEF, 0xC1, 0xF1, 0xFA, 0xF1, 0xFB, 0xF1, 0xFC, /* 0x3C-0x3F */
+	0xC4, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x5D, /* 0x40-0x43 */
+	0x00, 0x00, 0xF4, 0x43, 0x00, 0x00, 0xF5, 0xC8, /* 0x44-0x47 */
+	0xF5, 0xC7, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xDB, /* 0x48-0x4B */
+	0xF6, 0xDC, 0xF7, 0xD5, 0xF8, 0xA7, 0x00, 0x00, /* 0x4C-0x4F */
+	0xA4, 0x6C, 0xA4, 0x6D, 0x00, 0x00, 0xA4, 0x6E, /* 0x50-0x53 */
+	0xA4, 0xD5, 0xA5, 0xA5, 0xC9, 0xD3, 0xA6, 0x72, /* 0x54-0x57 */
+	0xA6, 0x73, 0x00, 0x00, 0xA7, 0xB7, 0xA7, 0xB8, /* 0x58-0x5B */
+	0xA7, 0xB6, 0xA7, 0xB5, 0x00, 0x00, 0xA9, 0x73, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0x55, 0xA9, 0x75, /* 0x60-0x63 */
+	0xA9, 0x74, 0xCC, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xAB, 0xC4, 0x00, 0x00, 0xAE, 0x5D, /* 0x68-0x6B */
+	0xD1, 0x65, 0x00, 0x00, 0xD4, 0xF0, 0x00, 0x00, /* 0x6C-0x6F */
+	0xB1, 0x45, 0xB4, 0x47, 0xD4, 0xEF, 0xB4, 0x46, /* 0x70-0x73 */
+	0x00, 0x00, 0xB9, 0xE5, 0x00, 0x00, 0xE1, 0x7D, /* 0x74-0x77 */
+	0xBE, 0xC7, 0x00, 0x00, 0xC0, 0xA9, 0xEC, 0xD7, /* 0x78-0x7B */
+	0x00, 0x00, 0xC4, 0x5E, 0x00, 0x00, 0xC5, 0x70, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xC9, 0x72, 0x00, 0x00, 0xA5, 0xA6, /* 0x80-0x83 */
+	0xC9, 0x73, 0xA6, 0x76, 0x00, 0x00, 0xA6, 0x74, /* 0x84-0x87 */
+	0xA6, 0x75, 0xA6, 0x77, 0x00, 0x00, 0xA7, 0xBA, /* 0x88-0x8B */
+	0xA7, 0xB9, 0x00, 0x00, 0xCA, 0xBC, 0xA7, 0xBB, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xBD, 0xCC, 0x57, /* 0x90-0x93 */
+	0x00, 0x00, 0xCC, 0x58, 0x00, 0x00, 0xA9, 0x76, /* 0x94-0x97 */
+	0xA9, 0x78, 0xA9, 0x7A, 0xA9, 0x77, 0xA9, 0x7B, /* 0x98-0x9B */
+	0xA9, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xAB, 0xC8, 0xAB, 0xC5, /* 0xA0-0xA3 */
+	0xAB, 0xC7, 0xAB, 0xC9, 0xAB, 0xC6, 0xD1, 0x66, /* 0xA4-0xA7 */
+	0xCE, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xD1, 0x68, 0xD1, 0x67, 0xAE, 0x63, 0x00, 0x00, /* 0xAC-0xAF */
+	0xAE, 0x5F, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x60, /* 0xB0-0xB3 */
+	0xAE, 0x62, 0xAE, 0x64, 0xAE, 0x61, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xAE, 0x66, 0xAE, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x4A, /* 0xBC-0xBF */
+	0xD4, 0xF2, 0xD4, 0xF1, 0xB1, 0x49, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xB1, 0x48, 0xB1, 0x47, 0xB1, 0x4B, 0xB1, 0x46, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0xD5, 0xD8, 0xD2, /* 0xC8-0xCB */
+	0xB4, 0x49, 0xD8, 0xD1, 0xD8, 0xD6, 0x00, 0x00, /* 0xCC-0xCF */
+	0xB4, 0x4B, 0xD8, 0xD4, 0xB4, 0x48, 0xB4, 0x4A, /* 0xD0-0xD3 */
+	0xD8, 0xD3, 0x00, 0x00, 0xDD, 0x48, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xDD, 0x49, 0xDD, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xB9, 0xE6, 0xB9, 0xEE, /* 0xDC-0xDF */
+	0xE1, 0x7E, 0xB9, 0xE8, 0xB9, 0xEC, 0xE1, 0xA1, /* 0xE0-0xE3 */
+	0xB9, 0xED, 0xB9, 0xE9, 0xB9, 0xEA, 0xB9, 0xE7, /* 0xE4-0xE7 */
+	0xB9, 0xEB, 0xBC, 0x66, 0xD8, 0xD0, 0xBC, 0x67, /* 0xE8-0xEB */
+	0xBC, 0x65, 0x00, 0x00, 0xBC, 0x64, 0xE9, 0x5D, /* 0xEC-0xEF */
+	0xBE, 0xC8, 0xEC, 0xD8, 0xEC, 0xD9, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xC3, 0x64, 0xC4, 0x5F, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xA4, 0x6F, 0x00, 0x00, 0xA6, 0x78, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5C[512] = {
+	0x00, 0x00, 0xAB, 0xCA, 0x00, 0x00, 0xD1, 0x69, /* 0x00-0x03 */
+	0xAE, 0x67, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x4E, /* 0x04-0x07 */
+	0xB1, 0x4D, 0xB1, 0x4C, 0xB4, 0x4C, 0xB4, 0x4D, /* 0x08-0x0B */
+	0xD8, 0xD7, 0xB9, 0xEF, 0xBE, 0xC9, 0xA4, 0x70, /* 0x0C-0x0F */
+	0xC9, 0x5C, 0xA4, 0xD6, 0xC9, 0x74, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xC9, 0xD4, 0xA6, 0x79, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xA9, 0x7C, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0x4B, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0x71, 0x00, 0x00, /* 0x20-0x23 */
+	0xA4, 0xD7, 0xC9, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xCA, 0xBE, 0x00, 0x00, 0xCA, 0xBF, 0x00, 0x00, /* 0x28-0x2B */
+	0xA7, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xD8, 0xD8, 0xB4, 0x4E, 0x00, 0x00, 0xDD, 0x4C, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xAA, /* 0x34-0x37 */
+	0xA4, 0x72, 0xA4, 0xA8, 0xA4, 0xD8, 0xC9, 0x75, /* 0x38-0x3B */
+	0xA5, 0xA7, 0x00, 0x00, 0xA7, 0xC0, 0xA7, 0xBF, /* 0x3C-0x3F */
+	0xA7, 0xBD, 0xA7, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xCC, 0x59, 0xA9, 0x7E, 0xA9, 0xA1, 0xCC, 0x5A, /* 0x44-0x47 */
+	0xA9, 0x7D, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xCE, /* 0x48-0x4B */
+	0xCE, 0x78, 0xAB, 0xCD, 0xAB, 0xCB, 0xAB, 0xCC, /* 0x4C-0x4F */
+	0xAE, 0x6A, 0xAE, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xD1, 0x6B, 0xAE, 0x69, 0xD1, 0x6A, 0x00, 0x00, /* 0x54-0x57 */
+	0xAE, 0x5E, 0xD4, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xB1, 0x50, 0xB1, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xB1, 0x4F, 0x00, 0x00, 0xB9, 0xF0, 0xE1, 0xA2, /* 0x60-0x63 */
+	0xBC, 0x68, 0xBC, 0x69, 0x00, 0x00, 0xE5, 0x61, /* 0x64-0x67 */
+	0xC0, 0xAB, 0xEF, 0xC2, 0xEF, 0xC3, 0x00, 0x00, /* 0x68-0x6B */
+	0xC4, 0xDD, 0xF8, 0xA8, 0xC9, 0x4B, 0xA4, 0xD9, /* 0x6C-0x6F */
+	0x00, 0x00, 0xA4, 0x73, 0x00, 0x00, 0xC9, 0x77, /* 0x70-0x73 */
+	0xC9, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xA6, 0x7A, 0xC9, 0xD7, 0xC9, 0xD8, /* 0x78-0x7B */
+	0xC9, 0xD6, 0x00, 0x00, 0xC9, 0xD9, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xC7, 0x00, 0x00, /* 0x84-0x87 */
+	0xCA, 0xC2, 0xCA, 0xC4, 0xCA, 0xC6, 0xCA, 0xC3, /* 0x88-0x8B */
+	0xA7, 0xC4, 0xCA, 0xC0, 0x00, 0x00, 0xCA, 0xC1, /* 0x8C-0x8F */
+	0xA7, 0xC1, 0xA7, 0xC2, 0xCA, 0xC5, 0xCA, 0xC8, /* 0x90-0x93 */
+	0xA7, 0xC3, 0xCA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xCC, 0x68, 0x00, 0x00, 0xCC, 0x62, /* 0x9C-0x9F */
+	0xCC, 0x5D, 0xA9, 0xA3, 0xCC, 0x65, 0xCC, 0x63, /* 0xA0-0xA3 */
+	0xCC, 0x5C, 0xCC, 0x69, 0xCC, 0x6C, 0xCC, 0x67, /* 0xA4-0xA7 */
+	0xCC, 0x60, 0xA9, 0xA5, 0xCC, 0x66, 0xA9, 0xA6, /* 0xA8-0xAB */
+	0xCC, 0x61, 0xCC, 0x64, 0xCC, 0x5B, 0xCC, 0x5F, /* 0xAC-0xAF */
+	0xCC, 0x6B, 0xA9, 0xA7, 0x00, 0x00, 0xA9, 0xA8, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xCC, 0x5E, 0xCC, 0x6A, 0xA9, 0xA2, /* 0xB4-0xB7 */
+	0xA9, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xAB, 0xCE, 0xA4, /* 0xC4-0xC7 */
+	0xCE, 0xAA, 0xCE, 0xA3, 0xCE, 0xA5, 0xCE, 0x7D, /* 0xC8-0xCB */
+	0xCE, 0x7B, 0x00, 0x00, 0xCE, 0xAC, 0xCE, 0xA9, /* 0xCC-0xCF */
+	0xCE, 0x79, 0x00, 0x00, 0xAB, 0xD0, 0xCE, 0xA7, /* 0xD0-0xD3 */
+	0xCE, 0xA8, 0x00, 0x00, 0xCE, 0xA6, 0xCE, 0x7C, /* 0xD4-0xD7 */
+	0xCE, 0x7A, 0xAB, 0xCF, 0xCE, 0xA2, 0xCE, 0x7E, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xCE, 0xA1, 0xCE, 0xAD, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xAE, 0x6F, 0x00, 0x00, 0xAE, 0x6E, 0x00, 0x00, /* 0xE8-0xEB */
+	0xD1, 0x6C, 0xAE, 0x6B, 0xD1, 0x6E, 0x00, 0x00, /* 0xEC-0xEF */
+	0xAE, 0x70, 0xD1, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xAE, 0x73, 0x00, 0x00, 0xAE, 0x71, 0xD1, 0x70, /* 0xF4-0xF7 */
+	0xCE, 0xAE, 0xD1, 0x72, 0x00, 0x00, 0xAE, 0x6D, /* 0xF8-0xFB */
+	0x00, 0x00, 0xAE, 0x6C, 0x00, 0x00, 0xD1, 0x6D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5D[512] = {
+	0xD1, 0x71, 0xAE, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xB1, 0x53, 0xB1, 0x52, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF5, /* 0x08-0x0B */
+	0xD4, 0xF9, 0xD4, 0xFB, 0xB1, 0x54, 0xD4, 0xFE, /* 0x0C-0x0F */
+	0x00, 0x00, 0xB1, 0x58, 0xD5, 0x41, 0x00, 0x00, /* 0x10-0x13 */
+	0xB1, 0x5A, 0x00, 0x00, 0xB1, 0x56, 0xB1, 0x5E, /* 0x14-0x17 */
+	0x00, 0x00, 0xB1, 0x5B, 0xD4, 0xF7, 0xB1, 0x55, /* 0x18-0x1B */
+	0x00, 0x00, 0xD4, 0xF6, 0xD4, 0xF4, 0xD5, 0x43, /* 0x1C-0x1F */
+	0xD4, 0xF8, 0x00, 0x00, 0xB1, 0x57, 0xD5, 0x42, /* 0x20-0x23 */
+	0xB1, 0x5C, 0xD4, 0xFD, 0xD4, 0xFC, 0xB1, 0x5D, /* 0x24-0x27 */
+	0xD4, 0xFA, 0xB1, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0x44, 0x00, 0x00, /* 0x2C-0x2F */
+	0xD5, 0x40, 0xD8, 0xE7, 0xD8, 0xEE, 0xD8, 0xE3, /* 0x30-0x33 */
+	0xB4, 0x51, 0xD8, 0xDF, 0xD8, 0xEF, 0xD8, 0xD9, /* 0x34-0x37 */
+	0xD8, 0xEC, 0xD8, 0xEA, 0xD8, 0xE4, 0x00, 0x00, /* 0x38-0x3B */
+	0xD8, 0xED, 0xD8, 0xE6, 0x00, 0x00, 0xD8, 0xDE, /* 0x3C-0x3F */
+	0xD8, 0xF0, 0xD8, 0xDC, 0xD8, 0xE9, 0xD8, 0xDA, /* 0x40-0x43 */
+	0x00, 0x00, 0xD8, 0xF1, 0x00, 0x00, 0xB4, 0x52, /* 0x44-0x47 */
+	0x00, 0x00, 0xD8, 0xEB, 0xDD, 0x4F, 0xD8, 0xDD, /* 0x48-0x4B */
+	0xB4, 0x4F, 0x00, 0x00, 0xD8, 0xE1, 0x00, 0x00, /* 0x4C-0x4F */
+	0xB4, 0x50, 0xD8, 0xE0, 0xD8, 0xE5, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xD8, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xD8, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0x53, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0x56, 0xDD, 0x4E, /* 0x60-0x63 */
+	0x00, 0x00, 0xDD, 0x50, 0x00, 0x00, 0xDD, 0x55, /* 0x64-0x67 */
+	0xDD, 0x54, 0xB7, 0x43, 0x00, 0x00, 0xD8, 0xDB, /* 0x68-0x6B */
+	0xDD, 0x52, 0x00, 0x00, 0x00, 0x00, 0xB7, 0x44, /* 0x6C-0x6F */
+	0x00, 0x00, 0xDD, 0x4D, 0xDD, 0x51, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x74-0x77 */
+	0x00, 0x00, 0xE1, 0xB0, 0xE1, 0xA7, 0x00, 0x00, /* 0x78-0x7B */
+	0xE1, 0xAE, 0xE1, 0xA5, 0xE1, 0xAD, 0xE1, 0xB1, /* 0x7C-0x7F */
+	
+	0xE1, 0xA4, 0xE1, 0xA8, 0xE1, 0xA3, 0x00, 0x00, /* 0x80-0x83 */
+	0xB9, 0xF1, 0x00, 0x00, 0xE1, 0xA6, 0xB9, 0xF2, /* 0x84-0x87 */
+	0xE1, 0xAC, 0xE1, 0xAB, 0xE1, 0xAA, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xE1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x65, 0xE5, 0x67, /* 0x90-0x93 */
+	0xBC, 0x6B, 0xE5, 0x68, 0x00, 0x00, 0xE5, 0x63, /* 0x94-0x97 */
+	0x00, 0x00, 0xE5, 0x62, 0xE5, 0x6C, 0x00, 0x00, /* 0x98-0x9B */
+	0xE5, 0x6A, 0xBC, 0x6A, 0xE5, 0x6D, 0xE5, 0x64, /* 0x9C-0x9F */
+	0xE5, 0x69, 0xE5, 0x6B, 0xE5, 0x66, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x61, /* 0xA4-0xA7 */
+	0xE9, 0x66, 0xE9, 0x60, 0xE9, 0x65, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE9, 0x5E, 0xE9, 0x68, 0xE9, 0x64, 0xE9, 0x69, /* 0xAC-0xAF */
+	0xE9, 0x63, 0xE9, 0x5F, 0xE9, 0x67, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xE9, 0x6A, 0xE9, 0x62, 0x00, 0x00, 0xEC, 0xDA, /* 0xB4-0xB7 */
+	0xC0, 0xAF, 0x00, 0x00, 0xC0, 0xAD, 0x00, 0x00, /* 0xB8-0xBB */
+	0xC0, 0xAC, 0xC0, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xEF, 0xC4, 0x00, 0x00, 0xF1, 0x72, 0xF1, 0xFD, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0x44, 0xF4, 0x45, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xC4, 0x60, 0x00, 0x00, 0xF5, 0xC9, /* 0xC8-0xCB */
+	0x00, 0x00, 0xC4, 0xDE, 0x00, 0x00, 0xF5, 0xCA, /* 0xCC-0xCF */
+	0x00, 0x00, 0xF6, 0xDE, 0xC5, 0x72, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xC5, 0x71, 0xF6, 0xDD, 0xC5, 0xC9, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xF7, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xA4, 0x74, 0xA6, 0x7B, 0xC9, 0xDA, /* 0xDC-0xDF */
+	0xCA, 0xCA, 0xA8, 0xB5, 0xB1, 0x5F, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xA4, 0x75, 0xA5, 0xAA, 0xA5, 0xA9, /* 0xE4-0xE7 */
+	0xA5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xC5, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xAE, 0x74, 0x00, 0x00, /* 0xEC-0xEF */
+	0xDD, 0x57, 0xA4, 0x76, 0xA4, 0x77, 0xA4, 0x78, /* 0xF0-0xF3 */
+	0xA4, 0xDA, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xD1, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xCE, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xB4, 0x53, 0xA4, 0x79, 0xC9, 0x5D, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xA5, 0xAB, 0xA5, 0xAC, /* 0x00-0x03 */
+	0xC9, 0x78, 0x00, 0x00, 0xA6, 0x7C, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xCB, 0x00, 0x00, /* 0x08-0x0B */
+	0xA7, 0xC6, 0x00, 0x00, 0xCA, 0xCC, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xA9, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xCC, 0x6E, 0xA9, 0xAC, 0xA9, 0xAB, 0xCC, 0x6D, /* 0x14-0x17 */
+	0xA9, 0xA9, 0xCC, 0x6F, 0xA9, 0xAA, 0xA9, 0xAD, /* 0x18-0x1B */
+	0x00, 0x00, 0xAB, 0xD2, 0x00, 0x00, 0xAB, 0xD4, /* 0x1C-0x1F */
+	0xCE, 0xB3, 0xCE, 0xB0, 0xCE, 0xB1, 0xCE, 0xB2, /* 0x20-0x23 */
+	0xCE, 0xB4, 0xAB, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xD1, 0x74, 0xD1, 0x73, 0x00, 0x00, 0xAE, 0x76, /* 0x28-0x2B */
+	0x00, 0x00, 0xAE, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x62, /* 0x30-0x33 */
+	0xD5, 0x46, 0x00, 0x00, 0xB1, 0x61, 0xB1, 0x63, /* 0x34-0x37 */
+	0xB1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xB4, 0x55, 0xD5, 0x45, 0x00, 0x00, /* 0x3C-0x3F */
+	0xB4, 0x56, 0xD8, 0xF3, 0x00, 0x00, 0xB4, 0x57, /* 0x40-0x43 */
+	0xD8, 0xF2, 0xB4, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0x5A, 0xDD, 0x5C, /* 0x48-0x4B */
+	0xB7, 0x45, 0xDD, 0x5B, 0xDD, 0x59, 0xDD, 0x58, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB4, /* 0x50-0x53 */
+	0xB9, 0xF7, 0xB9, 0xF5, 0x00, 0x00, 0xB9, 0xF6, /* 0x54-0x57 */
+	0xE1, 0xB2, 0xE1, 0xB3, 0x00, 0x00, 0xB9, 0xF3, /* 0x58-0x5B */
+	0xE5, 0x71, 0xE5, 0x6F, 0x00, 0x00, 0xBC, 0x6D, /* 0x5C-0x5F */
+	0xE5, 0x70, 0xBC, 0x6E, 0xBC, 0x6C, 0xB9, 0xF4, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0x6D, 0xE9, 0x6B, /* 0x64-0x67 */
+	0xE9, 0x6C, 0xE5, 0x6E, 0xEC, 0xDC, 0xC0, 0xB0, /* 0x68-0x6B */
+	0xEC, 0xDB, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x6E, /* 0x6C-0x6F */
+	0xF1, 0xFE, 0x00, 0x00, 0xA4, 0x7A, 0xA5, 0xAD, /* 0x70-0x73 */
+	0xA6, 0x7E, 0xC9, 0xDB, 0xA6, 0x7D, 0x00, 0x00, /* 0x74-0x77 */
+	0xA9, 0xAF, 0xB7, 0x46, 0x00, 0x00, 0xA4, 0xDB, /* 0x78-0x7B */
+	0xA5, 0xAE, 0xAB, 0xD5, 0xB4, 0x58, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xC9, 0x79, 0x00, 0x00, 0xC9, 0x7A, 0x00, 0x00, /* 0x80-0x83 */
+	0xC9, 0xDC, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xC8, /* 0x84-0x87 */
+	0xCA, 0xD0, 0xCA, 0xCE, 0xA7, 0xC9, 0xCA, 0xCD, /* 0x88-0x8B */
+	0xCA, 0xCF, 0xCA, 0xD1, 0x00, 0x00, 0xA7, 0xC7, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xA9, 0xB3, 0xA9, 0xB4, 0xA9, 0xB1, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xA9, 0xB0, 0xCE, 0xB8, /* 0x98-0x9B */
+	0xA9, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xAB, 0xD6, 0x00, 0x00, 0xCE, 0xB7, 0xCE, 0xB9, /* 0xA0-0xA3 */
+	0xCE, 0xB6, 0xCE, 0xBA, 0xAB, 0xD7, 0xAE, 0x79, /* 0xA4-0xA7 */
+	0xD1, 0x75, 0x00, 0x00, 0xD1, 0x77, 0xAE, 0x77, /* 0xA8-0xAB */
+	0xD1, 0x78, 0xAE, 0x78, 0xD1, 0x76, 0x00, 0x00, /* 0xAC-0xAF */
+	0xCE, 0xB5, 0xD5, 0x47, 0xD5, 0x4A, 0xD5, 0x4B, /* 0xB0-0xB3 */
+	0xD5, 0x48, 0xB1, 0x67, 0xB1, 0x66, 0xB1, 0x64, /* 0xB4-0xB7 */
+	0xB1, 0x65, 0xD5, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xB1, 0x68, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xB4, 0x5A, 0xB4, 0x5B, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xB4, 0x5C, 0xDD, 0x5D, 0xDD, 0x5F, 0xDD, 0x61, /* 0xC4-0xC7 */
+	0xB7, 0x48, 0xB7, 0x47, 0xB4, 0x59, 0xDD, 0x60, /* 0xC8-0xCB */
+	0xDD, 0x5E, 0x00, 0x00, 0xE1, 0xB8, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xE1, 0xB6, 0xE1, 0xBC, 0xB9, 0xF8, /* 0xD0-0xD3 */
+	0xE1, 0xBD, 0xE1, 0xBA, 0xB9, 0xF9, 0xE1, 0xB7, /* 0xD4-0xD7 */
+	0xE1, 0xB5, 0xE1, 0xBB, 0xBC, 0x70, 0xE5, 0x73, /* 0xD8-0xDB */
+	0xE1, 0xB9, 0xBC, 0x72, 0xE5, 0x74, 0xBC, 0x71, /* 0xDC-0xDF */
+	0xBC, 0x74, 0xE5, 0x75, 0xBC, 0x6F, 0xBC, 0x73, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE9, 0x73, 0xE9, 0x71, 0xE9, 0x70, /* 0xE4-0xE7 */
+	0xE9, 0x72, 0xE9, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xC3, 0x66, 0x00, 0x00, 0xF4, 0x46, 0xF4, 0x47, /* 0xEC-0xEF */
+	0x00, 0x00, 0xF5, 0xCB, 0xF6, 0xDF, 0xC6, 0x55, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xA9, 0xB5, 0xA7, 0xCA, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xAB, 0xD8, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xA4, 0x7B, 0xA4, 0xDC, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_5F[512] = {
+	0x00, 0x00, 0xA5, 0xAF, 0xC9, 0xDD, 0x00, 0x00, /* 0x00-0x03 */
+	0xA7, 0xCB, 0xCA, 0xD2, 0x00, 0x00, 0xCE, 0xBB, /* 0x04-0x07 */
+	0xAB, 0xD9, 0x00, 0x00, 0xB9, 0xFA, 0xA4, 0x7C, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xA1, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0x49, 0xA4, 0x7D, /* 0x10-0x13 */
+	0xA4, 0xDD, 0xA4, 0xDE, 0x00, 0x00, 0xA5, 0xB1, /* 0x14-0x17 */
+	0xA5, 0xB0, 0x00, 0x00, 0xC9, 0xDE, 0xA6, 0xA2, /* 0x18-0x1B */
+	0x00, 0x00, 0xCA, 0xD3, 0x00, 0x00, 0xA7, 0xCC, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0x71, 0xCC, 0x72, /* 0x20-0x23 */
+	0xCC, 0x73, 0x00, 0x00, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x24-0x27 */
+	0xCC, 0x70, 0xA9, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xAB, 0xDA, 0xCE, 0xBC, 0x00, 0x00, /* 0x2C-0x2F */
+	0xD1, 0x7A, 0xAE, 0x7A, 0x00, 0x00, 0xD1, 0x79, /* 0x30-0x33 */
+	0x00, 0x00, 0xB1, 0x69, 0xD5, 0x4C, 0xB1, 0x6A, /* 0x34-0x37 */
+	0xD5, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xB4, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xDD, 0x62, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBF, /* 0x40-0x43 */
+	0xE1, 0xBE, 0x00, 0x00, 0xB9, 0xFB, 0x00, 0x00, /* 0x44-0x47 */
+	0xBC, 0x75, 0xE5, 0x76, 0xBE, 0xCA, 0xE9, 0x74, /* 0x48-0x4B */
+	0xC0, 0xB1, 0x00, 0x00, 0xC5, 0x73, 0xF7, 0xD8, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xCC, 0x74, 0x00, 0x00, 0xCE, 0xBD, 0xB1, 0x6B, /* 0x54-0x57 */
+	0xD8, 0xF4, 0xB7, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xC2, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xA7, 0xCE, 0x00, 0x00, /* 0x60-0x63 */
+	0xA7, 0xCD, 0xAB, 0xDB, 0x00, 0x00, 0xD1, 0x7B, /* 0x64-0x67 */
+	0x00, 0x00, 0xB1, 0x6D, 0xB3, 0x43, 0xB1, 0x6E, /* 0x68-0x6B */
+	0xB1, 0x6C, 0xB4, 0x5E, 0x00, 0x00, 0xE1, 0xC0, /* 0x6C-0x6F */
+	0xB9, 0xFC, 0xBC, 0x76, 0x00, 0x00, 0xC9, 0x4C, /* 0x70-0x73 */
+	0xC9, 0xDF, 0x00, 0x00, 0xCA, 0xD5, 0xA7, 0xCF, /* 0x74-0x77 */
+	0xCA, 0xD4, 0xA7, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xA9, 0xBC, 0xCC, 0x77, 0xCC, 0x76, 0xA9, 0xBB, /* 0x7C-0x7F */
+	
+	0xA9, 0xB9, 0xA9, 0xBA, 0xCC, 0x75, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xAB, 0xDD, 0xCE, 0xBE, 0xAB, 0xE0, /* 0x84-0x87 */
+	0xAB, 0xDC, 0xAB, 0xE2, 0xAB, 0xDE, 0xAB, 0xDF, /* 0x88-0x8B */
+	0xAB, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xAE, 0x7D, 0xAE, 0x7C, 0xAE, 0x7B, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0x4F, 0xB1, 0x6F, /* 0x94-0x97 */
+	0xB1, 0x72, 0xB1, 0x70, 0x00, 0x00, 0xD5, 0x4E, /* 0x98-0x9B */
+	0xB1, 0x75, 0x00, 0x00, 0xB1, 0x71, 0xD5, 0x50, /* 0x9C-0x9F */
+	0xB1, 0x74, 0xB1, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xD8, 0xF6, 0xD8, 0xF5, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xB4, 0x61, 0xB4, 0x5F, 0xB4, 0x60, 0xD8, 0xF7, /* 0xA8-0xAB */
+	0xB7, 0x4B, 0xDD, 0x64, 0xB7, 0x4C, 0xDD, 0x63, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0x77, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xBC, 0x78, 0xE1, 0xC1, 0xBC, 0x77, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xB9, 0xFD, 0x00, 0x00, 0xEC, 0xDE, /* 0xB8-0xBB */
+	0xE9, 0x75, 0xC0, 0xB2, 0xEC, 0xDD, 0xF2, 0x40, /* 0xBC-0xBF */
+	0xF4, 0x48, 0xF4, 0x49, 0x00, 0x00, 0xA4, 0xDF, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xA5, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xC9, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xA7, 0xD2, 0xA7, 0xD4, 0x00, 0x00, 0xC9, 0xE2, /* 0xCC-0xCF */
+	0xCA, 0xD8, 0xCA, 0xD7, 0xCA, 0xD6, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xC9, 0xE1, 0xC9, 0xE0, 0xA6, 0xA4, 0xA7, 0xD3, /* 0xD4-0xD7 */
+	0xA7, 0xD1, 0xA6, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xA9, 0xBD, 0xCC, 0x78, 0x00, 0x00, /* 0xDC-0xDF */
+	0xA9, 0xBE, 0xCA, 0xDD, 0x00, 0x00, 0xCA, 0xDF, /* 0xE0-0xE3 */
+	0xCA, 0xDE, 0xCC, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xCA, 0xDA, 0x00, 0x00, 0xA7, 0xD8, 0xA7, 0xD6, /* 0xE8-0xEB */
+	0x00, 0x00, 0xCA, 0xD9, 0xCA, 0xDB, 0xCA, 0xE1, /* 0xEC-0xEF */
+	0x00, 0x00, 0xA7, 0xD5, 0x00, 0x00, 0xCA, 0xDC, /* 0xF0-0xF3 */
+	0xCA, 0xE5, 0xA9, 0xC0, 0x00, 0x00, 0xCA, 0xE2, /* 0xF4-0xF7 */
+	0xA7, 0xD7, 0x00, 0x00, 0xCA, 0xE0, 0xCA, 0xE3, /* 0xF8-0xFB */
+	0x00, 0x00, 0xA9, 0xBF, 0x00, 0x00, 0xA9, 0xC1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_60[512] = {
+	0xCA, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xCC, 0xAF, 0xCC, 0xA2, 0xCC, 0x7E, /* 0x08-0x0B */
+	0xCC, 0xAE, 0xCC, 0xA9, 0xAB, 0xE7, 0xA9, 0xC2, /* 0x0C-0x0F */
+	0xCC, 0xAA, 0xCC, 0xAD, 0xAB, 0xE3, 0xCC, 0xAC, /* 0x10-0x13 */
+	0xA9, 0xC3, 0xA9, 0xC8, 0xA9, 0xC6, 0xCC, 0xA3, /* 0x14-0x17 */
+	0x00, 0x00, 0xCC, 0x7C, 0xCC, 0xA5, 0xA9, 0xCD, /* 0x18-0x1B */
+	0xCC, 0xB0, 0xAB, 0xE4, 0xCC, 0xA6, 0x00, 0x00, /* 0x1C-0x1F */
+	0xAB, 0xE5, 0xA9, 0xC9, 0xCC, 0xA8, 0x00, 0x00, /* 0x20-0x23 */
+	0xCE, 0xCD, 0xAB, 0xE6, 0xCC, 0x7B, 0xA9, 0xCA, /* 0x24-0x27 */
+	0xAB, 0xE8, 0xA9, 0xCB, 0xA9, 0xC7, 0xA9, 0xCC, /* 0x28-0x2B */
+	0xCC, 0xA7, 0xCC, 0x7A, 0xCC, 0xAB, 0xA9, 0xC4, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0x7D, 0xCC, 0xA4, /* 0x30-0x33 */
+	0xCC, 0xA1, 0xA9, 0xC5, 0x00, 0x00, 0xCE, 0xBF, /* 0x34-0x37 */
+	0x00, 0x00, 0xCE, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xCE, 0xCA, 0xD1, 0xA1, 0xCE, 0xCB, 0xAB, 0xEE, /* 0x40-0x43 */
+	0xCE, 0xCE, 0xCE, 0xC4, 0xAB, 0xED, 0xCE, 0xC6, /* 0x44-0x47 */
+	0x00, 0x00, 0xCE, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xCE, 0xC9, 0xAB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xAE, 0xA3, 0x00, 0x00, 0xF9, 0xDA, 0xCE, 0xC5, /* 0x50-0x53 */
+	0xCE, 0xC1, 0xAE, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xCE, 0xCF, 0xAE, 0x7E, 0xD1, 0x7D, 0xCE, 0xC8, /* 0x58-0x5B */
+	0x00, 0x00, 0xD1, 0x7C, 0xCE, 0xC3, 0xCE, 0xCC, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xAB, 0xEC, 0xAE, 0xA1, /* 0x60-0x63 */
+	0xAB, 0xF2, 0xAE, 0xA2, 0xCE, 0xD0, 0xD1, 0x7E, /* 0x64-0x67 */
+	0xAB, 0xEB, 0xAE, 0xA6, 0xAB, 0xF1, 0xAB, 0xF0, /* 0x68-0x6B */
+	0xAB, 0xEF, 0xAE, 0xA5, 0xCE, 0xD1, 0xAE, 0xA7, /* 0x6C-0x6F */
+	0xAB, 0xEA, 0x00, 0x00, 0xCE, 0xC2, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x76, /* 0x7C-0x7F */
+	
+	0xD1, 0xA4, 0xD1, 0xA6, 0x00, 0x00, 0xD1, 0xA8, /* 0x80-0x83 */
+	0xAE, 0xA8, 0xAE, 0xAE, 0xD5, 0x53, 0xD1, 0xAC, /* 0x84-0x87 */
+	0xD1, 0xA3, 0xB1, 0x78, 0xD5, 0x51, 0x00, 0x00, /* 0x88-0x8B */
+	0xAE, 0xAD, 0xAE, 0xAB, 0xD1, 0xAE, 0x00, 0x00, /* 0x8C-0x8F */
+	0xD5, 0x52, 0x00, 0x00, 0xD1, 0xA5, 0x00, 0x00, /* 0x90-0x93 */
+	0xAE, 0xAC, 0xD1, 0xA9, 0xAE, 0xAF, 0xD1, 0xAB, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xAE, 0xAA, 0xD1, 0xAA, /* 0x98-0x9B */
+	0xD1, 0xAD, 0xD1, 0xA7, 0x00, 0x00, 0xAE, 0xA9, /* 0x9C-0x9F */
+	0xB1, 0x79, 0x00, 0x00, 0xD1, 0xA2, 0xB1, 0x77, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xB1, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xD5, 0x55, 0xD5, 0x5E, 0xB4, 0x64, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xB1, 0x7C, 0xB1, 0xA3, 0xB4, 0x65, 0xD5, 0x60, /* 0xB4-0xB7 */
+	0xB1, 0xAA, 0xD8, 0xF9, 0xD5, 0x56, 0xB1, 0xA2, /* 0xB8-0xBB */
+	0xB1, 0xA5, 0xB1, 0x7E, 0xD5, 0x54, 0xD5, 0x62, /* 0xBC-0xBF */
+	0xD5, 0x65, 0xD9, 0x49, 0x00, 0x00, 0xD5, 0x63, /* 0xC0-0xC3 */
+	0xD8, 0xFD, 0xB1, 0xA1, 0xB1, 0xA8, 0xB1, 0xAC, /* 0xC4-0xC7 */
+	0xD5, 0x5D, 0xD8, 0xF8, 0xD5, 0x61, 0xB1, 0x7B, /* 0xC8-0xCB */
+	0xD8, 0xFA, 0xD5, 0x64, 0xD8, 0xFC, 0xD5, 0x59, /* 0xCC-0xCF */
+	0x00, 0x00, 0xB4, 0x62, 0x00, 0x00, 0xD5, 0x57, /* 0xD0-0xD3 */
+	0xD5, 0x58, 0xB1, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xB1, 0xA6, 0xD5, 0x5B, 0xB1, 0xAB, 0xD5, 0x5F, /* 0xD8-0xDB */
+	0xB1, 0xA4, 0xD5, 0x5C, 0x00, 0x00, 0xB1, 0xA9, /* 0xDC-0xDF */
+	0xB4, 0x66, 0xB4, 0x63, 0xD8, 0xFB, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xD5, 0x5A, 0x00, 0x00, 0xB1, 0x7D, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xB4, 0x6B, 0xB4, 0x6F, 0xD9, 0x40, 0xB7, 0x51, /* 0xF0-0xF3 */
+	0xB4, 0x6D, 0xD9, 0x44, 0xB4, 0x71, 0xDD, 0x65, /* 0xF4-0xF7 */
+	0xD9, 0x46, 0xB7, 0x53, 0xB4, 0x69, 0xB4, 0x6C, /* 0xF8-0xFB */
+	0xD9, 0x47, 0x00, 0x00, 0xD9, 0x48, 0xD9, 0x4E, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_61[512] = {
+	0xB4, 0x73, 0xB7, 0x54, 0x00, 0x00, 0xD9, 0x4A, /* 0x00-0x03 */
+	0xD9, 0x4F, 0xD9, 0x43, 0xB7, 0x5E, 0x00, 0x00, /* 0x04-0x07 */
+	0xB7, 0x55, 0xB4, 0x72, 0xD9, 0x41, 0xD9, 0x50, /* 0x08-0x0B */
+	0x00, 0x00, 0xB7, 0x5D, 0xB4, 0x70, 0xB7, 0x4E, /* 0x0C-0x0F */
+	0xD9, 0x4D, 0x00, 0x00, 0xB4, 0x74, 0xD9, 0x45, /* 0x10-0x13 */
+	0xD8, 0xFE, 0xB4, 0x6A, 0xD9, 0x42, 0x00, 0x00, /* 0x14-0x17 */
+	0xD9, 0x4B, 0x00, 0x00, 0xB7, 0x4D, 0xB7, 0x52, /* 0x18-0x1B */
+	0xB4, 0x67, 0xD9, 0x4C, 0x00, 0x00, 0xB7, 0x50, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x68, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB7, 0x5C, /* 0x24-0x27 */
+	0xE1, 0xC3, 0xDD, 0x70, 0x00, 0x00, 0xDD, 0x68, /* 0x28-0x2B */
+	0xE1, 0xC2, 0x00, 0x00, 0xDD, 0x6C, 0xDD, 0x6E, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xDD, 0x6B, 0x00, 0x00, /* 0x30-0x33 */
+	0xB7, 0x5B, 0x00, 0x00, 0xDD, 0x6A, 0xB7, 0x5F, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD2, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0x5A, 0xBA, 0x40, /* 0x3C-0x3F */
+	0xDD, 0x71, 0xE1, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xB7, 0x58, 0xDD, 0x69, 0xDD, 0x6D, 0xB9, 0xFE, /* 0x44-0x47 */
+	0xB7, 0x4F, 0xDD, 0x66, 0xDD, 0x67, 0xBA, 0x41, /* 0x48-0x4B */
+	0xB7, 0x57, 0xB7, 0x59, 0xB7, 0x56, 0xDD, 0x6F, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xC8, 0xE1, 0xC9, /* 0x50-0x53 */
+	0xE1, 0xCE, 0xBC, 0x7D, 0xE1, 0xD5, 0x00, 0x00, /* 0x54-0x57 */
+	0xBA, 0x47, 0x00, 0x00, 0xBA, 0x46, 0xE1, 0xD0, /* 0x58-0x5B */
+	0x00, 0x00, 0xBC, 0x7C, 0xE1, 0xC5, 0xBA, 0x45, /* 0x5C-0x5F */
+	0x00, 0x00, 0xE1, 0xD4, 0xBA, 0x43, 0xBA, 0x44, /* 0x60-0x63 */
+	0x00, 0x00, 0xE1, 0xD1, 0xE5, 0xAA, 0xBC, 0x7A, /* 0x64-0x67 */
+	0xB4, 0x6E, 0x00, 0x00, 0xE1, 0xD3, 0xBC, 0xA3, /* 0x68-0x6B */
+	0xE1, 0xCB, 0x00, 0x00, 0xBC, 0x7B, 0x00, 0x00, /* 0x6C-0x6F */
+	0xBC, 0xA2, 0xE1, 0xC6, 0xE1, 0xCA, 0xE1, 0xC7, /* 0x70-0x73 */
+	0xE1, 0xCD, 0xBA, 0x48, 0xBC, 0x79, 0xBA, 0x42, /* 0x74-0x77 */
+	0x00, 0x00, 0xE5, 0x7A, 0xE1, 0xCF, 0x00, 0x00, /* 0x78-0x7B */
+	0xBC, 0xA1, 0x00, 0x00, 0xBC, 0xA4, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xE1, 0xCC, 0x00, 0x00, 0xBC, 0x7E, 0xE5, 0x79, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xE5, 0x7E, 0xBE, 0xCE, 0xE5, 0x78, /* 0x88-0x8B */
+	0xE9, 0xA3, 0xE5, 0xA9, 0xBC, 0xA8, 0x00, 0x00, /* 0x8C-0x8F */
+	0xBC, 0xA6, 0xBE, 0xCC, 0xE5, 0xA6, 0xE5, 0xA2, /* 0x90-0x93 */
+	0xBC, 0xAC, 0x00, 0x00, 0xE9, 0x78, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xBC, 0xAA, 0xE5, 0xA1, /* 0x98-0x9B */
+	0x00, 0x00, 0xE9, 0x76, 0x00, 0x00, 0xE5, 0xA5, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE5, 0xA8, 0xE5, 0x7D, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xBC, 0xAB, 0x00, 0x00, 0x00, 0x00, 0xBC, 0xA5, /* 0xA4-0xA7 */
+	0xE9, 0x77, 0xBE, 0xCD, 0xE5, 0xA7, 0xBC, 0xA7, /* 0xA8-0xAB */
+	0xBC, 0xA9, 0xE5, 0xA4, 0xBC, 0xAD, 0xE5, 0xA3, /* 0xAC-0xAF */
+	0xE5, 0x7C, 0xE5, 0x7B, 0xBE, 0xCB, 0xE5, 0xAB, /* 0xB0-0xB3 */
+	0xE9, 0x7A, 0xEC, 0xE0, 0xBE, 0xD0, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE9, 0xA2, 0x00, 0x00, 0xE9, 0x7E, 0x00, 0x00, /* 0xB8-0xBB */
+	0xEC, 0xE1, 0x00, 0x00, 0xBE, 0xD1, 0xE9, 0xA1, /* 0xBC-0xBF */
+	0x00, 0x00, 0xE9, 0x7C, 0xC0, 0xB4, 0xEC, 0xDF, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE9, 0x79, 0xE9, 0x7B, 0xC0, 0xB5, /* 0xC4-0xC7 */
+	0xBE, 0xD3, 0xC0, 0xB3, 0xBE, 0xD2, 0xC0, 0xB7, /* 0xC8-0xCB */
+	0xE9, 0x7D, 0xBE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0xCF, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xEF, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xE7, 0xEF, 0xC8, /* 0xDC-0xDF */
+	0xEC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x56, /* 0xE0-0xE3 */
+	0xEC, 0xE5, 0xEC, 0xE4, 0xC0, 0xB6, 0xEC, 0xE2, /* 0xE4-0xE7 */
+	0xEC, 0xE6, 0xEF, 0xD0, 0xEF, 0xCC, 0xEF, 0xCE, /* 0xE8-0xEB */
+	0x00, 0x00, 0xEF, 0xC9, 0xEF, 0xCA, 0x00, 0x00, /* 0xEC-0xEF */
+	0xEF, 0xCD, 0xEF, 0xCB, 0xC3, 0x67, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xC3, 0x6A, 0xC3, 0x69, 0xC3, 0x68, /* 0xF4-0xF7 */
+	0xC4, 0x61, 0xF4, 0x4A, 0xC4, 0x62, 0xF2, 0x41, /* 0xF8-0xFB */
+	0xC4, 0xDF, 0xF5, 0xCC, 0xC4, 0xE0, 0xC5, 0x74, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_62[512] = {
+	0xC5, 0xCA, 0xF7, 0xD9, 0x00, 0x00, 0xF7, 0xDA, /* 0x00-0x03 */
+	0xF7, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBA, /* 0x04-0x07 */
+	0xA4, 0xE0, 0xC9, 0x7C, 0xA5, 0xB3, 0x00, 0x00, /* 0x08-0x0B */
+	0xA6, 0xA6, 0xA6, 0xA7, 0xA6, 0xA5, 0x00, 0x00, /* 0x0C-0x0F */
+	0xA6, 0xA8, 0xA7, 0xDA, 0xA7, 0xD9, 0x00, 0x00, /* 0x10-0x13 */
+	0xCC, 0xB1, 0xA9, 0xCF, 0xA9, 0xCE, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xD1, 0xAF, 0xB1, 0xAD, 0xB1, 0xAE, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x75, /* 0x1C-0x1F */
+	0xDD, 0x72, 0xB7, 0x60, 0xB7, 0x61, 0xDD, 0x74, /* 0x20-0x23 */
+	0xDD, 0x76, 0xDD, 0x75, 0x00, 0x00, 0xE1, 0xD7, /* 0x24-0x27 */
+	0x00, 0x00, 0xE1, 0xD6, 0xBA, 0x49, 0xE1, 0xD8, /* 0x28-0x2B */
+	0x00, 0x00, 0xE5, 0xAC, 0xBC, 0xAE, 0x00, 0x00, /* 0x2C-0x2F */
+	0xBE, 0xD4, 0x00, 0x00, 0xC0, 0xB8, 0xC2, 0x57, /* 0x30-0x33 */
+	0xC0, 0xB9, 0x00, 0x00, 0xA4, 0xE1, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0xE6, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xCC, 0xB2, 0xA9, 0xD1, 0xA9, 0xD0, /* 0x3C-0x3F */
+	0xA9, 0xD2, 0xAB, 0xF3, 0xCE, 0xD2, 0xCE, 0xD3, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xB0, 0xAE, 0xB0, /* 0x44-0x47 */
+	0xB1, 0xAF, 0xB4, 0x76, 0xD9, 0x51, 0xA4, 0xE2, /* 0x48-0x4B */
+	0x00, 0x00, 0xA4, 0x7E, 0xA4, 0xE3, 0x00, 0x00, /* 0x4C-0x4F */
+	0xC9, 0x7D, 0xA5, 0xB7, 0xA5, 0xB6, 0xA5, 0xB4, /* 0x50-0x53 */
+	0xA5, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xA6, 0xAB, 0xC9, 0xE9, 0xC9, 0xEB, 0xA6, 0xAA, /* 0x58-0x5B */
+	0xC9, 0xE3, 0x00, 0x00, 0xC9, 0xE4, 0x00, 0x00, /* 0x5C-0x5F */
+	0xC9, 0xEA, 0xC9, 0xE6, 0xC9, 0xE8, 0xA6, 0xA9, /* 0x60-0x63 */
+	0xC9, 0xE5, 0xC9, 0xEC, 0xC9, 0xE7, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xA7, 0xE1, 0xA7, 0xEA, 0xA7, 0xE8, /* 0x6C-0x6F */
+	0xCA, 0xF0, 0xCA, 0xED, 0xCA, 0xF5, 0xA7, 0xE6, /* 0x70-0x73 */
+	0xCA, 0xF6, 0x00, 0x00, 0xA7, 0xDF, 0xCA, 0xF3, /* 0x74-0x77 */
+	0x00, 0x00, 0xA7, 0xE5, 0xCA, 0xEF, 0xCA, 0xEE, /* 0x78-0x7B */
+	0xA7, 0xE3, 0xCA, 0xF4, 0xA7, 0xE4, 0xA9, 0xD3, /* 0x7C-0x7F */
+	
+	0xA7, 0xDE, 0xCA, 0xF1, 0x00, 0x00, 0xCA, 0xE7, /* 0x80-0x83 */
+	0xA7, 0xDB, 0x00, 0x00, 0xA7, 0xEE, 0xCA, 0xEC, /* 0x84-0x87 */
+	0xCA, 0xF2, 0xA7, 0xE0, 0xA7, 0xE2, 0x00, 0x00, /* 0x88-0x8B */
+	0xCA, 0xE8, 0x00, 0x00, 0xCA, 0xE9, 0xCA, 0xEA, /* 0x8C-0x8F */
+	0x00, 0x00, 0xA7, 0xED, 0xA7, 0xE7, 0xA7, 0xEC, /* 0x90-0x93 */
+	0xCA, 0xEB, 0xA7, 0xEB, 0xA7, 0xDD, 0xA7, 0xDC, /* 0x94-0x97 */
+	0xA7, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xA9, 0xE1, 0xCC, 0xBE, 0xCC, 0xB7, 0xA9, 0xDC, /* 0xA8-0xAB */
+	0xA9, 0xEF, 0xCC, 0xB3, 0xCC, 0xBA, 0xCC, 0xBC, /* 0xAC-0xAF */
+	0xCC, 0xBF, 0xA9, 0xEA, 0x00, 0x00, 0xCC, 0xBB, /* 0xB0-0xB3 */
+	0xCC, 0xB4, 0xA9, 0xE8, 0xCC, 0xB8, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xCC, 0xC0, 0xA9, 0xD9, 0x00, 0x00, 0xCC, 0xBD, /* 0xB8-0xBB */
+	0xA9, 0xE3, 0xA9, 0xE2, 0xCC, 0xB6, 0xA9, 0xD7, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xA9, 0xD8, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xA9, 0xD6, 0x00, 0x00, 0xA9, 0xEE, 0xA9, 0xE6, /* 0xC4-0xC7 */
+	0xA9, 0xE0, 0xA9, 0xD4, 0xCC, 0xB9, 0xA9, 0xDF, /* 0xC8-0xCB */
+	0xA9, 0xD5, 0xA9, 0xE7, 0xA9, 0xF0, 0xCE, 0xD4, /* 0xCC-0xCF */
+	0xA9, 0xE4, 0xCC, 0xB5, 0xA9, 0xDA, 0xA9, 0xDD, /* 0xD0-0xD3 */
+	0xA9, 0xDE, 0x00, 0x00, 0xA9, 0xEC, 0xA9, 0xED, /* 0xD4-0xD7 */
+	0xA9, 0xEB, 0xA9, 0xE5, 0xA9, 0xE9, 0xA9, 0xDB, /* 0xD8-0xDB */
+	0xAB, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xDA, /* 0xE8-0xEB */
+	0xAC, 0x41, 0xAB, 0xF8, 0xAB, 0xFA, 0xAC, 0x40, /* 0xEC-0xEF */
+	0xCE, 0xE6, 0xAB, 0xFD, 0xD1, 0xB1, 0xAE, 0xB1, /* 0xF0-0xF3 */
+	0xAC, 0x43, 0xCE, 0xD7, 0xCE, 0xDF, 0xAB, 0xFE, /* 0xF4-0xF7 */
+	0xCE, 0xDE, 0xCE, 0xDB, 0xCE, 0xE3, 0xCE, 0xE5, /* 0xF8-0xFB */
+	0xAB, 0xF7, 0xAB, 0xFB, 0xAC, 0x42, 0xAE, 0xB3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_63[512] = {
+	0xCE, 0xE0, 0xAB, 0xF9, 0xAC, 0x45, 0xCE, 0xD9, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xFC, /* 0x04-0x07 */
+	0xAE, 0xB2, 0xAB, 0xF6, 0x00, 0x00, 0xCE, 0xD6, /* 0x08-0x0B */
+	0xCE, 0xDD, 0xCE, 0xD5, 0xCE, 0xD8, 0xCE, 0xDC, /* 0x0C-0x0F */
+	0xD1, 0xB2, 0xAC, 0x44, 0x00, 0x00, 0xCE, 0xE1, /* 0x10-0x13 */
+	0xCE, 0xE2, 0xCE, 0xE4, 0xAB, 0xF5, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xAE, 0xC1, 0xD1, 0xBE, 0xAE, 0xBF, 0xAE, 0xC0, /* 0x28-0x2B */
+	0xD1, 0xB4, 0xD1, 0xC4, 0x00, 0x00, 0xAE, 0xB6, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xD5, 0x66, 0xD1, 0xC6, /* 0x30-0x33 */
+	0xD1, 0xC0, 0x00, 0x00, 0xD1, 0xB7, 0x00, 0x00, /* 0x34-0x37 */
+	0xD1, 0xC9, 0xD1, 0xBA, 0xAE, 0xBC, 0xD5, 0x7D, /* 0x38-0x3B */
+	0xD1, 0xBD, 0xAE, 0xBE, 0xAE, 0xB5, 0x00, 0x00, /* 0x3C-0x3F */
+	0xD1, 0xCB, 0xD1, 0xBF, 0xAE, 0xB8, 0xD1, 0xB8, /* 0x40-0x43 */
+	0xD1, 0xB5, 0xD1, 0xB6, 0xAE, 0xB9, 0xD1, 0xC5, /* 0x44-0x47 */
+	0xD1, 0xCC, 0xAE, 0xBB, 0xD1, 0xBC, 0xD1, 0xBB, /* 0x48-0x4B */
+	0xAE, 0xC3, 0xAE, 0xC2, 0xAE, 0xB4, 0xAE, 0xBA, /* 0x4C-0x4F */
+	0xAE, 0xBD, 0xD1, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xD1, 0xC2, 0xAE, 0xB7, 0xD1, 0xB3, 0xD1, 0xCA, /* 0x54-0x57 */
+	0xD1, 0xC1, 0xD1, 0xC3, 0xD1, 0xC7, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xD5, 0x67, 0x00, 0x00, 0xB1, 0xB7, /* 0x64-0x67 */
+	0xB1, 0xCB, 0xB1, 0xCA, 0x00, 0x00, 0xB1, 0xBF, /* 0x68-0x6B */
+	0x00, 0x00, 0xD5, 0x79, 0xD5, 0x75, 0xD5, 0x72, /* 0x6C-0x6F */
+	0xD5, 0xA6, 0xB1, 0xBA, 0xB1, 0xB2, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xD5, 0x77, 0xB4, 0xA8, 0xB1, 0xB6, /* 0x74-0x77 */
+	0xD5, 0xA1, 0x00, 0x00, 0xB1, 0xCC, 0xB1, 0xC9, /* 0x78-0x7B */
+	0xD5, 0x7B, 0xD5, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xB1, 0xC8, 0xD5, 0xA3, 0xD5, 0x69, 0xB1, 0xBD, /* 0x80-0x83 */
+	0xB1, 0xC1, 0xD5, 0xA2, 0x00, 0x00, 0xD5, 0x73, /* 0x84-0x87 */
+	0xB1, 0xC2, 0xB1, 0xBC, 0xD5, 0x68, 0x00, 0x00, /* 0x88-0x8B */
+	0xB4, 0x78, 0xD5, 0xA5, 0xD5, 0x71, 0xB1, 0xC7, /* 0x8C-0x8F */
+	0xD5, 0x74, 0xD5, 0xA4, 0xB1, 0xC6, 0x00, 0x00, /* 0x90-0x93 */
+	0xD9, 0x52, 0x00, 0x00, 0xB1, 0xB3, 0xD5, 0x6F, /* 0x94-0x97 */
+	0xB1, 0xB8, 0xB1, 0xC3, 0x00, 0x00, 0xB1, 0xBE, /* 0x98-0x9B */
+	0xD5, 0x78, 0xD5, 0x6E, 0xD5, 0x6C, 0xD5, 0x7E, /* 0x9C-0x9F */
+	0xB1, 0xB0, 0xB1, 0xC4, 0xB1, 0xB4, 0xB4, 0x77, /* 0xA0-0xA3 */
+	0xD5, 0x7C, 0xB1, 0xB5, 0x00, 0x00, 0xB1, 0xB1, /* 0xA4-0xA7 */
+	0xB1, 0xC0, 0xB1, 0xBB, 0xB1, 0xB9, 0xD5, 0x70, /* 0xA8-0xAB */
+	0xB1, 0xC5, 0xD5, 0x6D, 0xD5, 0x7A, 0xD5, 0x76, /* 0xAC-0xAF */
+	0xD9, 0x54, 0xD9, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xD5, 0x6B, 0xD9, 0x64, 0x00, 0x00, /* 0xBC-0xBF */
+	0xB4, 0x7A, 0x00, 0x00, 0xD9, 0x6A, 0xD9, 0x59, /* 0xC0-0xC3 */
+	0xD9, 0x67, 0xDD, 0x77, 0xB4, 0x7D, 0xD9, 0x6B, /* 0xC4-0xC7 */
+	0xD9, 0x6E, 0xB4, 0x7C, 0xD9, 0x5C, 0xD9, 0x6D, /* 0xC8-0xCB */
+	0xD9, 0x6C, 0xB4, 0x7E, 0xD9, 0x55, 0xB4, 0x79, /* 0xCC-0xCF */
+	0xB4, 0xA3, 0x00, 0x00, 0xB4, 0xA1, 0xD9, 0x69, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xD9, 0x5F, 0xB4, 0xA5, 0xD9, 0x70, /* 0xD4-0xD7 */
+	0xD9, 0x68, 0xD9, 0x71, 0xB4, 0xAD, 0xB4, 0xAB, /* 0xD8-0xDB */
+	0xD9, 0x66, 0xD9, 0x65, 0x00, 0x00, 0xD9, 0x63, /* 0xDC-0xDF */
+	0xD9, 0x5D, 0xB4, 0xA4, 0x00, 0x00, 0xB4, 0xA2, /* 0xE0-0xE3 */
+	0xD1, 0xB9, 0xD9, 0x56, 0x00, 0x00, 0xDD, 0xB7, /* 0xE4-0xE7 */
+	0xD9, 0x57, 0xB4, 0x7B, 0xB4, 0xAA, 0xDD, 0x79, /* 0xE8-0xEB */
+	0x00, 0x00, 0xB4, 0xA6, 0xB4, 0xA7, 0xD9, 0x58, /* 0xEC-0xEF */
+	0xD9, 0x6F, 0xDD, 0x78, 0xD9, 0x60, 0xD9, 0x5B, /* 0xF0-0xF3 */
+	0xB4, 0xA9, 0xD9, 0x61, 0xD9, 0x5E, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xB4, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_64[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0x70, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xDD, 0x7C, 0xDD, 0xB1, 0xDD, 0xB6, /* 0x08-0x0B */
+	0xDD, 0xAA, 0xB7, 0x6C, 0xDD, 0xBB, 0xB7, 0x69, /* 0x0C-0x0F */
+	0xDD, 0x7A, 0x00, 0x00, 0xDD, 0x7B, 0xB7, 0x62, /* 0x10-0x13 */
+	0xB7, 0x6B, 0xDD, 0xA4, 0xB7, 0x6E, 0xB7, 0x6F, /* 0x14-0x17 */
+	0xDD, 0xA5, 0x00, 0x00, 0xDD, 0xB2, 0xDD, 0xB8, /* 0x18-0x1B */
+	0xB7, 0x6A, 0x00, 0x00, 0xB7, 0x64, 0xDD, 0xA3, /* 0x1C-0x1F */
+	0xDD, 0x7D, 0xDD, 0xBA, 0xDD, 0xA8, 0xDD, 0xA9, /* 0x20-0x23 */
+	0xDD, 0x7E, 0xDD, 0xB4, 0xDD, 0xAB, 0xDD, 0xB5, /* 0x24-0x27 */
+	0xDD, 0xAD, 0x00, 0x00, 0xB7, 0x65, 0xE1, 0xD9, /* 0x28-0x2B */
+	0xB7, 0x68, 0xB7, 0x66, 0xDD, 0xB9, 0xDD, 0xB0, /* 0x2C-0x2F */
+	0xDD, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xA1, /* 0x30-0x33 */
+	0xBA, 0x53, 0xDD, 0xAF, 0xB7, 0x6D, 0xDD, 0xA7, /* 0x34-0x37 */
+	0x00, 0x00, 0xDD, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xB7, 0x67, 0xB7, 0x63, 0xE1, 0xEE, /* 0x3C-0x3F */
+	0xDD, 0xB3, 0xDD, 0xAE, 0x00, 0x00, 0xDD, 0xA2, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE9, /* 0x48-0x4B */
+	0x00, 0x00, 0xE1, 0xDA, 0xE1, 0xE5, 0x00, 0x00, /* 0x4C-0x4F */
+	0xE1, 0xEC, 0xBA, 0x51, 0xB4, 0xAC, 0xE1, 0xEA, /* 0x50-0x53 */
+	0xBA, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xBA, 0x4B, 0xE1, 0xF1, 0x00, 0x00, 0xE1, 0xDB, /* 0x58-0x5B */
+	0xE1, 0xE8, 0xE1, 0xDC, 0xE1, 0xE7, 0xBA, 0x4F, /* 0x5C-0x5F */
+	0xE1, 0xEB, 0xD9, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xE1, 0xF2, 0xE1, 0xE3, 0xBA, 0x52, /* 0x64-0x67 */
+	0xE5, 0xBA, 0xBC, 0xAF, 0x00, 0x00, 0xE1, 0xF0, /* 0x68-0x6B */
+	0xE1, 0xEF, 0xBA, 0x54, 0xE5, 0xAD, 0xBC, 0xB0, /* 0x6C-0x6F */
+	0xE5, 0xAE, 0x00, 0x00, 0xE1, 0xDF, 0xE1, 0xE0, /* 0x70-0x73 */
+	0xE1, 0xDD, 0xE1, 0xE2, 0xE1, 0xDE, 0xE1, 0xF3, /* 0x74-0x77 */
+	0xBA, 0x4E, 0xBC, 0xB1, 0xBA, 0x50, 0xBA, 0x55, /* 0x78-0x7B */
+	0x00, 0x00, 0xE1, 0xE1, 0x00, 0x00, 0xE1, 0xED, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xE5, 0xB1, 0x00, 0x00, 0xBA, 0x4A, /* 0x84-0x87 */
+	0xBC, 0xB4, 0xE9, 0xAA, 0xE5, 0xB6, 0xE5, 0xB5, /* 0x88-0x8B */
+	0xE5, 0xB7, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB4, /* 0x8C-0x8F */
+	0xBC, 0xB5, 0x00, 0x00, 0xBC, 0xBB, 0xBC, 0xB8, /* 0x90-0x93 */
+	0x00, 0x00, 0xBC, 0xB9, 0xE5, 0xAF, 0xE5, 0xB2, /* 0x94-0x97 */
+	0xE5, 0xBC, 0xBC, 0xC1, 0xBC, 0xBF, 0x00, 0x00, /* 0x98-0x9B */
+	0xE5, 0xB3, 0xD9, 0x5A, 0xBC, 0xB2, 0xE5, 0xB9, /* 0x9C-0x9F */
+	0xE5, 0xB0, 0x00, 0x00, 0xBC, 0xC2, 0xE5, 0xB8, /* 0xA0-0xA3 */
+	0xBA, 0x4D, 0xBC, 0xB7, 0xE1, 0xE4, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xBC, 0xBA, 0x00, 0x00, 0xBC, 0xBE, /* 0xA8-0xAB */
+	0xBC, 0xC0, 0xBC, 0xBD, 0xBC, 0xBC, 0x00, 0x00, /* 0xAC-0xAF */
+	0xBC, 0xB6, 0xE5, 0xBB, 0xBC, 0xB3, 0xBC, 0xC3, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBE, 0xD8, /* 0xB8-0xBB */
+	0xBE, 0xD9, 0xE9, 0xA9, 0xBE, 0xE2, 0xBE, 0xDF, /* 0xBC-0xBF */
+	0x00, 0x00, 0xBE, 0xD6, 0xBE, 0xDD, 0xE9, 0xAB, /* 0xC0-0xC3 */
+	0xBE, 0xDB, 0xBE, 0xD5, 0x00, 0x00, 0xBE, 0xDC, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE9, 0xA8, 0xC0, 0xBB, 0xBE, 0xD7, /* 0xC8-0xCB */
+	0x00, 0x00, 0xBE, 0xDE, 0xC0, 0xBA, 0xE9, 0xA7, /* 0xCC-0xCF */
+	0xE9, 0xA6, 0x00, 0x00, 0xBE, 0xE0, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xBE, 0xE1, 0x00, 0x00, 0xE9, 0xA5, 0xE9, 0xA4, /* 0xD4-0xD7 */
+	0xC0, 0xBC, 0xE9, 0xAE, 0xBE, 0xDA, 0xE9, 0xAC, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xC0, 0xBD, 0x00, 0x00, 0xC0, 0xC2, 0xEC, 0xEA, /* 0xE0-0xE3 */
+	0xEC, 0xEC, 0x00, 0x00, 0xC0, 0xBF, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xEC, 0xED, 0xEC, 0xE9, 0x00, 0x00, 0xEC, 0xEB, /* 0xE8-0xEB */
+	0xC0, 0xC0, 0xC0, 0xC3, 0x00, 0x00, 0xEC, 0xE8, /* 0xEC-0xEF */
+	0xC0, 0xBE, 0xC0, 0xC1, 0xC2, 0x59, 0xE9, 0xAD, /* 0xF0-0xF3 */
+	0xC2, 0x58, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x5E, /* 0xF4-0xF7 */
+	0xEF, 0xD4, 0x00, 0x00, 0xC2, 0x5C, 0xC2, 0x5D, /* 0xF8-0xFB */
+	0xEF, 0xD7, 0xEF, 0xD3, 0xC2, 0x5A, 0xEF, 0xD1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_65[512] = {
+	0xC3, 0x6B, 0xEF, 0xD5, 0x00, 0x00, 0xEF, 0xD6, /* 0x00-0x03 */
+	0xEF, 0xD2, 0x00, 0x00, 0xC2, 0x5B, 0xF2, 0x42, /* 0x04-0x07 */
+	0x00, 0x00, 0xF2, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xF2, 0x46, 0xF2, 0x44, 0xF2, 0x47, 0xC3, 0x6C, /* 0x0C-0x0F */
+	0xF2, 0x43, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x4E, /* 0x10-0x13 */
+	0xC4, 0x64, 0xF4, 0x4D, 0xF4, 0x4C, 0xF4, 0x4B, /* 0x14-0x17 */
+	0xC4, 0x63, 0xC4, 0x65, 0x00, 0x00, 0xF5, 0xCD, /* 0x18-0x1B */
+	0xC4, 0xE2, 0xC4, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xF6, 0xE1, 0xF6, 0xE0, 0xF6, 0xE3, 0xC5, 0xCB, /* 0x20-0x23 */
+	0xC5, 0x75, 0xF7, 0xDD, 0xF6, 0xE2, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xF7, 0xDC, 0xC5, 0xCD, 0xC5, 0xCC, /* 0x28-0x2B */
+	0xC5, 0xF3, 0xF8, 0xA9, 0xF8, 0xEF, 0xA4, 0xE4, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0x72, 0xE9, 0xAF, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xA6, 0xAC, 0xCA, 0xF7, /* 0x34-0x37 */
+	0xA7, 0xF1, 0xA7, 0xEF, 0x00, 0x00, 0xA7, 0xF0, /* 0x38-0x3B */
+	0x00, 0x00, 0xCC, 0xC1, 0xA9, 0xF1, 0xAC, 0x46, /* 0x3C-0x3F */
+	0x00, 0x00, 0xCE, 0xE7, 0x00, 0x00, 0xCE, 0xE8, /* 0x40-0x43 */
+	0x00, 0x00, 0xAC, 0x47, 0xD1, 0xCE, 0x00, 0x00, /* 0x44-0x47 */
+	0xAE, 0xC4, 0xAE, 0xC5, 0xD1, 0xCD, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xD3, /* 0x4C-0x4F */
+	0x00, 0x00, 0xB1, 0xCF, 0x00, 0x00, 0xD5, 0xA7, /* 0x50-0x53 */
+	0xB1, 0xD6, 0xB1, 0xD5, 0xB1, 0xCE, 0xB1, 0xD1, /* 0x54-0x57 */
+	0xB1, 0xD4, 0xB1, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xD9, 0x76, 0xB1, 0xCD, 0xB4, 0xAF, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xB4, 0xB1, 0xB4, 0xB2, /* 0x60-0x63 */
+	0xD9, 0x75, 0xD9, 0x78, 0xB4, 0xB0, 0xD9, 0x73, /* 0x64-0x67 */
+	0xD9, 0x77, 0x00, 0x00, 0xD9, 0x74, 0x00, 0x00, /* 0x68-0x6B */
+	0xB7, 0x71, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xBC, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xBA, 0x56, 0xE1, 0xF4, /* 0x70-0x73 */
+	0xBE, 0xE3, 0xBC, 0xC4, 0xE5, 0xBD, 0xBC, 0xC5, /* 0x74-0x77 */
+	0xBC, 0xC6, 0xE5, 0xBF, 0xE5, 0xBE, 0xE5, 0xC0, /* 0x78-0x7B */
+	0xE9, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB0, /* 0x7C-0x7F */
+	
+	0xEC, 0xEF, 0xEC, 0xEE, 0xC0, 0xC4, 0xC0, 0xC5, /* 0x80-0x83 */
+	0xF2, 0x48, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE5, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xD9, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xB4, 0xB4, 0xB4, 0xB3, 0xDD, 0xBD, 0x00, 0x00, /* 0x90-0x93 */
+	0xEF, 0xD8, 0xC4, 0xE3, 0xF7, 0xDE, 0xA4, 0xE6, /* 0x94-0x97 */
+	0x00, 0x00, 0xAE, 0xC6, 0x00, 0x00, 0xB1, 0xD8, /* 0x98-0x9B */
+	0xB1, 0xD7, 0xD9, 0x7A, 0xD9, 0x7B, 0xB7, 0x72, /* 0x9C-0x9F */
+	0xE1, 0xF5, 0xBA, 0x57, 0xE9, 0xB2, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xA4, 0xE7, 0xA5, 0xB8, 0x00, 0x00, 0xA9, 0xF2, /* 0xA4-0xA7 */
+	0xCC, 0xC2, 0x00, 0x00, 0xCE, 0xE9, 0xAC, 0x48, /* 0xA8-0xAB */
+	0xB1, 0xD9, 0x00, 0x00, 0xD9, 0x7C, 0xB4, 0xB5, /* 0xAC-0xAF */
+	0xB7, 0x73, 0x00, 0x00, 0xE5, 0xC1, 0xE5, 0xC2, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xF0, 0xC2, 0x5F, /* 0xB4-0xB7 */
+	0xF8, 0xF0, 0xA4, 0xE8, 0x00, 0x00, 0xCC, 0xC3, /* 0xB8-0xBB */
+	0xA9, 0xF3, 0xAC, 0x49, 0x00, 0x00, 0xCE, 0xEA, /* 0xBC-0xBF */
+	0x00, 0x00, 0xAE, 0xC7, 0xD1, 0xD2, 0xD1, 0xD0, /* 0xC0-0xC3 */
+	0xD1, 0xD1, 0xAE, 0xC8, 0xD1, 0xCF, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xDB, /* 0xC8-0xCB */
+	0xB1, 0xDC, 0xD5, 0xA8, 0xB1, 0xDD, 0xB1, 0xDA, /* 0xCC-0xCF */
+	0xD9, 0x7D, 0x00, 0x00, 0xD9, 0x7E, 0xDD, 0xBE, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xBA, 0x59, 0xBA, 0x58, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xF1, 0xEF, 0xD9, /* 0xD8-0xDB */
+	0x00, 0x00, 0xF2, 0x4A, 0xF2, 0x49, 0xF4, 0x4F, /* 0xDC-0xDF */
+	0x00, 0x00, 0xC9, 0x5E, 0xAC, 0x4A, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xA4, 0xE9, 0xA5, 0xB9, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xA6, 0xAE, 0xA6, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xA6, 0xAF, 0xA6, 0xB0, 0xC9, 0xEE, 0xC9, 0xED, /* 0xEC-0xEF */
+	0xCA, 0xF8, 0xA7, 0xF2, 0xCA, 0xFB, 0xCA, 0xFA, /* 0xF0-0xF3 */
+	0xCA, 0xF9, 0xCA, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xA9, 0xF4, 0xCC, 0xC9, /* 0xF8-0xFB */
+	0xCC, 0xC5, 0xCC, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_66[512] = {
+	0xA9, 0xFB, 0x00, 0x00, 0xA9, 0xF9, 0xCC, 0xCA, /* 0x00-0x03 */
+	0xCC, 0xC6, 0xCC, 0xCD, 0xA9, 0xF8, 0xAA, 0x40, /* 0x04-0x07 */
+	0xCC, 0xC8, 0xCC, 0xC4, 0xA9, 0xFE, 0xCC, 0xCB, /* 0x08-0x0B */
+	0xA9, 0xF7, 0xCC, 0xCC, 0xA9, 0xFA, 0xA9, 0xFC, /* 0x0C-0x0F */
+	0xCC, 0xD0, 0xCC, 0xCF, 0xCC, 0xC7, 0xA9, 0xF6, /* 0x10-0x13 */
+	0xA9, 0xF5, 0xA9, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xCE, 0xEF, 0xCE, 0xF5, 0x00, 0x00, 0xAC, 0x50, /* 0x1C-0x1F */
+	0xAC, 0x4D, 0xCE, 0xEC, 0xCE, 0xF1, 0x00, 0x00, /* 0x20-0x23 */
+	0xAC, 0x53, 0xAC, 0x4B, 0xCE, 0xF0, 0xAC, 0x4E, /* 0x24-0x27 */
+	0xAC, 0x51, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF3, /* 0x28-0x2B */
+	0x00, 0x00, 0xAC, 0x4C, 0xCE, 0xF8, 0xAC, 0x4F, /* 0x2C-0x2F */
+	0x00, 0x00, 0xAC, 0x52, 0xCE, 0xED, 0xCE, 0xF2, /* 0x30-0x33 */
+	0xCE, 0xF6, 0xCE, 0xEE, 0xCE, 0xEB, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xCE, 0xF7, 0xCE, 0xF4, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xAE, 0xD0, 0xAE, 0xC9, 0xAE, 0xCC, /* 0x40-0x43 */
+	0x00, 0x00, 0xAE, 0xCF, 0x00, 0x00, 0xD1, 0xD5, /* 0x44-0x47 */
+	0x00, 0x00, 0xAE, 0xCA, 0xD1, 0xD3, 0x00, 0x00, /* 0x48-0x4B */
+	0xAE, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xAE, 0xCB, /* 0x4C-0x4F */
+	0x00, 0x00, 0xD1, 0xD6, 0xAE, 0xCD, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xD5, 0xAC, 0xB1, 0xDF, 0xD5, 0xAB, /* 0x58-0x5B */
+	0xD5, 0xAD, 0xB1, 0xDE, 0xB1, 0xE3, 0xD1, 0xD4, /* 0x5C-0x5F */
+	0x00, 0x00, 0xD5, 0xAA, 0xD5, 0xAE, 0x00, 0x00, /* 0x60-0x63 */
+	0xB1, 0xE0, 0xD5, 0xA9, 0xB1, 0xE2, 0x00, 0x00, /* 0x64-0x67 */
+	0xB1, 0xE1, 0x00, 0x00, 0xD9, 0xA7, 0x00, 0x00, /* 0x68-0x6B */
+	0xD9, 0xA2, 0x00, 0x00, 0xB4, 0xB6, 0xB4, 0xBA, /* 0x6C-0x6F */
+	0xB4, 0xB7, 0xD9, 0xA5, 0xD9, 0xA8, 0x00, 0x00, /* 0x70-0x73 */
+	0xB4, 0xB8, 0x00, 0x00, 0xB4, 0xB9, 0xB4, 0xBE, /* 0x74-0x77 */
+	0xDD, 0xC7, 0xD9, 0xA6, 0xB4, 0xBC, 0xD9, 0xA3, /* 0x78-0x7B */
+	0xD9, 0xA1, 0x00, 0x00, 0xB4, 0xBD, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xD9, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xB7, 0x79, 0x00, 0x00, 0xDD, 0xBF, 0xB7, 0x76, /* 0x84-0x87 */
+	0xB7, 0x77, 0xB7, 0x75, 0xDD, 0xC4, 0xDD, 0xC3, /* 0x88-0x8B */
+	0xDD, 0xC0, 0xB7, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xDD, 0xC2, 0xB4, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xDD, 0xC6, 0xDD, 0xC1, 0xB7, 0x78, 0xB7, 0x74, /* 0x94-0x97 */
+	0xB7, 0x7A, 0xDD, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xBA, 0x5C, 0x00, 0x00, 0xE1, 0xF8, /* 0x9C-0x9F */
+	0xE1, 0xF7, 0xE1, 0xF6, 0xBA, 0x5A, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xBA, 0x5B, 0xE5, 0xC5, 0xE5, 0xC8, 0xBC, 0xC8, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xBC, 0xC7, 0xE5, 0xC9, /* 0xAC-0xAF */
+	0xE5, 0xC4, 0xBC, 0xCA, 0xE5, 0xC6, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xBC, 0xC9, 0xE5, 0xC3, 0x00, 0x00, 0xE5, 0xC7, /* 0xB4-0xB7 */
+	0xBE, 0xE9, 0xBE, 0xE6, 0xE9, 0xBB, 0xE9, 0xBA, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE9, 0xB9, 0xE9, 0xB4, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE9, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xBE, 0xE7, 0x00, 0x00, 0xBE, 0xE4, 0xBE, 0xE8, /* 0xC4-0xC7 */
+	0xE9, 0xB3, 0xBE, 0xE5, 0xE9, 0xB6, 0xE9, 0xB7, /* 0xC8-0xCB */
+	0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB8, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xF2, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0xC7, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xEF, 0xDC, 0xC0, 0xC6, 0xEF, 0xDA, 0xEF, 0xDB, /* 0xD8-0xDB */
+	0xC2, 0x60, 0xC3, 0x6E, 0xF2, 0x4B, 0x00, 0x00, /* 0xDC-0xDF */
+	0xC3, 0x6D, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x51, /* 0xE0-0xE3 */
+	0xF4, 0x52, 0x00, 0x00, 0xC4, 0x66, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xF4, 0x50, 0xC4, 0xE4, 0x00, 0x00, 0xF7, 0xDF, /* 0xE8-0xEB */
+	0xC5, 0xCE, 0xF8, 0xAA, 0xF8, 0xAB, 0x00, 0x00, /* 0xEC-0xEF */
+	0xA4, 0xEA, 0x00, 0x00, 0xA6, 0xB1, 0xA6, 0xB2, /* 0xF0-0xF3 */
+	0xA7, 0xF3, 0x00, 0x00, 0xCC, 0xD1, 0xAC, 0x54, /* 0xF4-0xF7 */
+	0xAE, 0xD1, 0xB1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0xB0, 0xD2, 0x00, 0x00, 0xB4, 0xBF, 0xB4, 0xC0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_67[512] = {
+	0xB3, 0xCC, 0xD9, 0xA9, 0x00, 0x00, 0xB7, 0x7C, /* 0x00-0x03 */
+	0xE1, 0xFA, 0xE1, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xA4, 0xEB, 0xA6, 0xB3, 0xCC, 0xD2, 0xAA, 0x42, /* 0x08-0x0B */
+	0x00, 0x00, 0xAA, 0x41, 0x00, 0x00, 0xCE, 0xF9, /* 0x0C-0x0F */
+	0xCE, 0xFA, 0x00, 0x00, 0xD1, 0xD7, 0xD1, 0xD8, /* 0x10-0x13 */
+	0xAE, 0xD2, 0xAE, 0xD3, 0x00, 0x00, 0xAE, 0xD4, /* 0x14-0x17 */
+	0xD5, 0xAF, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xE6, /* 0x18-0x1B */
+	0x00, 0x00, 0xB4, 0xC2, 0x00, 0x00, 0xB4, 0xC1, /* 0x1C-0x1F */
+	0xDD, 0xC8, 0xDF, 0x7A, 0xE1, 0xFB, 0xE9, 0xBD, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xC2, 0x61, 0xC4, 0x67, /* 0x24-0x27 */
+	0xA4, 0xEC, 0x00, 0x00, 0xA5, 0xBC, 0xA5, 0xBD, /* 0x28-0x2B */
+	0xA5, 0xBB, 0xA5, 0xBE, 0xA5, 0xBA, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xA6, 0xB6, 0x00, 0x00, 0xC9, 0xF6, /* 0x30-0x33 */
+	0xA6, 0xB5, 0xA6, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xC9, 0xF1, 0xC9, 0xF0, 0xC9, 0xF3, 0xC9, 0xF2, /* 0x38-0x3B */
+	0xC9, 0xF5, 0xA6, 0xB4, 0xC9, 0xEF, 0xC9, 0xF4, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xCA, 0xFD, 0xA7, 0xFD, 0xCA, 0xFE, /* 0x44-0x47 */
+	0xCB, 0x43, 0xA7, 0xFC, 0x00, 0x00, 0xCB, 0x47, /* 0x48-0x4B */
+	0xCB, 0x42, 0xCB, 0x45, 0xA7, 0xF5, 0xA7, 0xF6, /* 0x4C-0x4F */
+	0xA7, 0xF7, 0xA7, 0xF8, 0x00, 0x00, 0xA8, 0x40, /* 0x50-0x53 */
+	0x00, 0x00, 0xCB, 0x41, 0xA7, 0xFA, 0xA8, 0x41, /* 0x54-0x57 */
+	0x00, 0x00, 0xCB, 0x40, 0xCB, 0x46, 0x00, 0x00, /* 0x58-0x5B */
+	0xA7, 0xF9, 0xCB, 0x44, 0xA7, 0xFB, 0xA7, 0xF4, /* 0x5C-0x5F */
+	0xA7, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xAA, 0x57, 0x00, 0x00, /* 0x68-0x6B */
+	0xCC, 0xD4, 0xAA, 0x43, 0x00, 0x00, 0xAA, 0x4D, /* 0x6C-0x6F */
+	0xAA, 0x4E, 0xAA, 0x46, 0xAA, 0x58, 0xAA, 0x48, /* 0x70-0x73 */
+	0xCC, 0xDC, 0xAA, 0x53, 0xCC, 0xD7, 0xAA, 0x49, /* 0x74-0x77 */
+	0xCC, 0xE6, 0xCC, 0xE7, 0xCC, 0xDF, 0xCC, 0xD8, /* 0x78-0x7B */
+	0xAA, 0x56, 0xCC, 0xE4, 0xAA, 0x51, 0xAA, 0x4F, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xCC, 0xE5, 0x00, 0x00, 0xCC, 0xE3, /* 0x80-0x83 */
+	0xCC, 0xDB, 0xCC, 0xD3, 0xCC, 0xDA, 0xAA, 0x4A, /* 0x84-0x87 */
+	0x00, 0x00, 0xAA, 0x50, 0x00, 0x00, 0xAA, 0x44, /* 0x88-0x8B */
+	0xCC, 0xDE, 0xCC, 0xDD, 0xCC, 0xD5, 0x00, 0x00, /* 0x8C-0x8F */
+	0xAA, 0x52, 0xCC, 0xE1, 0xCC, 0xD6, 0xAA, 0x55, /* 0x90-0x93 */
+	0xCC, 0xE8, 0xAA, 0x45, 0x00, 0x00, 0xAA, 0x4C, /* 0x94-0x97 */
+	0xCC, 0xD9, 0xCC, 0xE2, 0xAA, 0x54, 0x00, 0x00, /* 0x98-0x9B */
+	0xAA, 0x47, 0xAA, 0x4B, 0x00, 0x00, 0xCC, 0xE0, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xCF, 0x5B, 0xAC, 0x5C, /* 0xAC-0xAF */
+	0xAC, 0x69, 0x00, 0x00, 0xCF, 0x56, 0xCF, 0x4C, /* 0xB0-0xB3 */
+	0xAC, 0x62, 0xCF, 0x4A, 0xAC, 0x5B, 0xCF, 0x45, /* 0xB4-0xB7 */
+	0xAC, 0x65, 0xCF, 0x52, 0xCE, 0xFE, 0xCF, 0x41, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xCF, 0x44, 0xCE, 0xFB, 0xCF, 0x51, 0xCF, 0x61, /* 0xC0-0xC3 */
+	0xAC, 0x60, 0xCF, 0x46, 0xCF, 0x58, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xCE, 0xFD, 0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x63, /* 0xC8-0xCB */
+	0xCF, 0x5A, 0xCF, 0x4B, 0xCF, 0x53, 0xAC, 0x66, /* 0xCC-0xCF */
+	0xAC, 0x59, 0xAC, 0x61, 0xAC, 0x6D, 0xAC, 0x56, /* 0xD0-0xD3 */
+	0xAC, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xCF, 0x43, 0xAC, 0x6A, 0xAC, 0x63, 0xCF, 0x5D, /* 0xD8-0xDB */
+	0xCF, 0x40, 0xAC, 0x6C, 0xAC, 0x67, 0xCF, 0x49, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xAC, 0x6B, 0xCF, 0x50, /* 0xE0-0xE3 */
+	0xCF, 0x48, 0xAC, 0x64, 0xCF, 0x5C, 0xCF, 0x54, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xAC, 0x5E, 0xCF, 0x62, 0xCF, 0x47, /* 0xE8-0xEB */
+	0xAC, 0x5A, 0xCF, 0x59, 0xCF, 0x4F, 0xAC, 0x5F, /* 0xEC-0xEF */
+	0xCF, 0x55, 0xAC, 0x57, 0xCE, 0xFC, 0xAC, 0x68, /* 0xF0-0xF3 */
+	0xAE, 0xE3, 0xAC, 0x5D, 0xCF, 0x4E, 0xCF, 0x4D, /* 0xF4-0xF7 */
+	0xCF, 0x42, 0x00, 0x00, 0xCF, 0x5E, 0x00, 0x00, /* 0xF8-0xFB */
+	0xCF, 0x57, 0x00, 0x00, 0x00, 0x00, 0xAC, 0x55, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_68[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xEC, 0xAE, 0xEA, /* 0x10-0x13 */
+	0xD1, 0xED, 0x00, 0x00, 0xD1, 0xE1, 0xAE, 0xDF, /* 0x14-0x17 */
+	0xAE, 0xEB, 0x00, 0x00, 0xD1, 0xDA, 0x00, 0x00, /* 0x18-0x1B */
+	0xD1, 0xE3, 0xD1, 0xEB, 0x00, 0x00, 0xD1, 0xD9, /* 0x1C-0x1F */
+	0xD1, 0xF4, 0xAE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xD1, 0xF3, 0xD1, 0xEE, 0x00, 0x00, /* 0x24-0x27 */
+	0xD1, 0xEF, 0xAE, 0xDD, 0xAE, 0xE8, 0xD1, 0xE5, /* 0x28-0x2B */
+	0x00, 0x00, 0xD1, 0xE6, 0xD1, 0xF0, 0xD1, 0xE7, /* 0x2C-0x2F */
+	0x00, 0x00, 0xD1, 0xE2, 0xD1, 0xDC, 0xD1, 0xDD, /* 0x30-0x33 */
+	0xD1, 0xEA, 0xD1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xAE, 0xD6, 0xAE, 0xDA, 0xD1, 0xF2, 0xD1, 0xDE, /* 0x38-0x3B */
+	0xAE, 0xE6, 0xAE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xAE, 0xE5, 0xAE, 0xEC, 0xAE, 0xDB, 0xAE, 0xE7, /* 0x40-0x43 */
+	0xD1, 0xE9, 0xAE, 0xE9, 0xAE, 0xD8, 0x00, 0x00, /* 0x44-0x47 */
+	0xAE, 0xD7, 0xD1, 0xDB, 0x00, 0x00, 0xD1, 0xDF, /* 0x48-0x4B */
+	0xAE, 0xE0, 0xD1, 0xF1, 0xD1, 0xE8, 0xD1, 0xE0, /* 0x4C-0x4F */
+	0xAE, 0xE4, 0xAE, 0xE1, 0x00, 0x00, 0xAE, 0xD9, /* 0x50-0x53 */
+	0xAE, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC4, /* 0x68-0x6B */
+	0x00, 0x00, 0xD5, 0xB4, 0xD5, 0xB5, 0xD5, 0xB9, /* 0x6C-0x6F */
+	0x00, 0x00, 0xD5, 0xC8, 0xD5, 0xC5, 0x00, 0x00, /* 0x70-0x73 */
+	0xD5, 0xBE, 0xD5, 0xBD, 0xB1, 0xED, 0xD5, 0xC1, /* 0x74-0x77 */
+	0xD5, 0xD0, 0xD5, 0xB0, 0x00, 0x00, 0xD5, 0xD1, /* 0x78-0x7B */
+	0xD5, 0xC3, 0xD5, 0xD5, 0xD5, 0xC9, 0xB1, 0xEC, /* 0x7C-0x7F */
+	
+	0xD5, 0xC7, 0xB1, 0xE7, 0xB1, 0xFC, 0xB1, 0xF2, /* 0x80-0x83 */
+	0x00, 0x00, 0xB1, 0xF6, 0xB1, 0xF5, 0xD5, 0xB1, /* 0x84-0x87 */
+	0x00, 0x00, 0xD5, 0xCE, 0xD5, 0xD4, 0xD5, 0xCC, /* 0x88-0x8B */
+	0xD5, 0xD3, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC0, /* 0x8C-0x8F */
+	0xD5, 0xB2, 0xD5, 0xD2, 0xD5, 0xC2, 0xB1, 0xEA, /* 0x90-0x93 */
+	0xB1, 0xF7, 0x00, 0x00, 0xD5, 0xCB, 0xB1, 0xF0, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCA, /* 0x98-0x9B */
+	0xD5, 0xB3, 0xB1, 0xF8, 0x00, 0x00, 0xB1, 0xFA, /* 0x9C-0x9F */
+	0xD5, 0xCD, 0xB1, 0xFB, 0xB1, 0xE9, 0xD5, 0xBA, /* 0xA0-0xA3 */
+	0xD5, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xEF, /* 0xA4-0xA7 */
+	0xB1, 0xF9, 0xD5, 0xBC, 0xD5, 0xC6, 0xD5, 0xB7, /* 0xA8-0xAB */
+	0xD5, 0xBB, 0xB1, 0xF4, 0xD5, 0xB6, 0xB1, 0xE8, /* 0xAC-0xAF */
+	0xB1, 0xF1, 0xB1, 0xEE, 0xD5, 0xBF, 0xAE, 0xDE, /* 0xB0-0xB3 */
+	0xD9, 0xC0, 0xB1, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xB1, 0xF3, 0x00, 0x00, 0xD9, 0xC3, 0xD9, 0xD9, /* 0xC4-0xC7 */
+	0xD9, 0xCE, 0xB4, 0xD6, 0x00, 0x00, 0xB4, 0xD1, /* 0xC8-0xCB */
+	0xD9, 0xBD, 0xB4, 0xD2, 0xD9, 0xCD, 0x00, 0x00, /* 0xCC-0xCF */
+	0xD9, 0xC6, 0xD9, 0xD3, 0xB4, 0xCE, 0xD9, 0xAB, /* 0xD0-0xD3 */
+	0xD9, 0xD5, 0xB4, 0xC4, 0xD9, 0xB3, 0xB4, 0xC7, /* 0xD4-0xD7 */
+	0xB4, 0xC6, 0x00, 0x00, 0xB4, 0xD7, 0x00, 0x00, /* 0xD8-0xDB */
+	0xD9, 0xAD, 0xD9, 0xCF, 0xD9, 0xD0, 0xB4, 0xC9, /* 0xDC-0xDF */
+	0xB4, 0xC5, 0xD9, 0xBB, 0x00, 0x00, 0xB4, 0xD0, /* 0xE0-0xE3 */
+	0xD9, 0xB6, 0x00, 0x00, 0xD9, 0xD1, 0xB4, 0xCC, /* 0xE4-0xE7 */
+	0xD9, 0xC9, 0xD9, 0xD6, 0xD9, 0xB0, 0xD9, 0xB5, /* 0xE8-0xEB */
+	0xD9, 0xAF, 0x00, 0x00, 0xB4, 0xCB, 0xD9, 0xC2, /* 0xEC-0xEF */
+	0xDD, 0xDE, 0xD9, 0xB1, 0xB4, 0xCF, 0xD9, 0xBA, /* 0xF0-0xF3 */
+	0xD9, 0xD2, 0xB4, 0xCA, 0xD9, 0xB7, 0xD9, 0xB4, /* 0xF4-0xF7 */
+	0xD9, 0xC5, 0xB4, 0xCD, 0xB4, 0xC3, 0xB4, 0xD9, /* 0xF8-0xFB */
+	0xD9, 0xC8, 0xD9, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_69[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xD9, 0xAC, 0xB4, 0xC8, 0xD9, 0xD4, 0xD9, 0xBC, /* 0x04-0x07 */
+	0xD9, 0xBE, 0x00, 0x00, 0xD9, 0xCB, 0xD9, 0xCA, /* 0x08-0x0B */
+	0xD9, 0xAA, 0xB4, 0xD3, 0xB4, 0xD5, 0xD9, 0xB2, /* 0x0C-0x0F */
+	0xD9, 0xB9, 0xD9, 0xC1, 0xB4, 0xD4, 0xD9, 0xB8, /* 0x10-0x13 */
+	0xD9, 0xC4, 0xD9, 0xD7, 0x00, 0x00, 0xD9, 0xCC, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xD9, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xD9, 0xAE, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF2, /* 0x2C-0x2F */
+	0xB7, 0xA6, 0x00, 0x00, 0xDD, 0xF0, 0xDD, 0xDB, /* 0x30-0x33 */
+	0xDD, 0xE0, 0xDD, 0xD9, 0x00, 0x00, 0xDD, 0xEC, /* 0x34-0x37 */
+	0xDD, 0xCB, 0xDD, 0xD2, 0x00, 0x00, 0xDD, 0xEA, /* 0x38-0x3B */
+	0xDD, 0xF4, 0xDD, 0xDC, 0x00, 0x00, 0xDD, 0xCF, /* 0x3C-0x3F */
+	0xDD, 0xE2, 0xDD, 0xE7, 0xDD, 0xD3, 0x00, 0x00, /* 0x40-0x43 */
+	0xDD, 0xE4, 0xDD, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xDD, 0xD7, 0xDD, 0xD8, 0xB7, 0xA8, 0xDD, 0xEB, /* 0x48-0x4B */
+	0xDD, 0xE9, 0x00, 0x00, 0xDD, 0xCC, 0xDD, 0xEE, /* 0x4C-0x4F */
+	0x00, 0x00, 0xDD, 0xEF, 0xDD, 0xF1, 0xB7, 0xAC, /* 0x50-0x53 */
+	0xB7, 0xA4, 0x00, 0x00, 0xD5, 0xB8, 0xDD, 0xD4, /* 0x54-0x57 */
+	0xDD, 0xE6, 0xDD, 0xD5, 0xB7, 0xA1, 0xB7, 0xB1, /* 0x58-0x5B */
+	0xDD, 0xED, 0xB7, 0xAF, 0xB7, 0xAB, 0xDD, 0xCA, /* 0x5C-0x5F */
+	0xB7, 0xA3, 0x00, 0x00, 0xDD, 0xCD, 0xB7, 0xB0, /* 0x60-0x63 */
+	0x00, 0x00, 0xDD, 0xDD, 0xDD, 0xC9, 0x00, 0x00, /* 0x64-0x67 */
+	0xB7, 0xA9, 0xDD, 0xE1, 0xDD, 0xD1, 0xB7, 0xAA, /* 0x68-0x6B */
+	0xDD, 0xDA, 0xB7, 0x7E, 0xB4, 0xD8, 0xDD, 0xE3, /* 0x6C-0x6F */
+	0xD9, 0xBF, 0xDD, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xDD, 0xE8, 0xB7, 0xA5, 0xDD, 0xE5, 0xB7, 0xA2, /* 0x74-0x77 */
+	0xDD, 0xDF, 0xB7, 0xAD, 0xDD, 0xD6, 0xDD, 0xF3, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0xA7, 0xDE, 0xC6, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0xAE, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xE2, 0x4A, 0xE2, 0x48, 0x00, 0x00, /* 0x8C-0x8F */
+	0xE2, 0x5E, 0xE2, 0x46, 0x00, 0x00, 0xE2, 0x58, /* 0x90-0x93 */
+	0xB7, 0x7D, 0xBA, 0x5F, 0xE2, 0x42, 0xE2, 0x5D, /* 0x94-0x97 */
+	0x00, 0x00, 0xE2, 0x47, 0xE2, 0x55, 0xBA, 0x64, /* 0x98-0x9B */
+	0xBA, 0x5D, 0x00, 0x00, 0xE2, 0x5B, 0x00, 0x00, /* 0x9C-0x9F */
+	0xE2, 0x40, 0xE2, 0x5A, 0x00, 0x00, 0xBA, 0x6F, /* 0xA0-0xA3 */
+	0xE2, 0x51, 0xE2, 0x61, 0xBA, 0x6D, 0xE2, 0x49, /* 0xA4-0xA7 */
+	0xBA, 0x5E, 0xE2, 0x4B, 0xE2, 0x59, 0xBA, 0x67, /* 0xA8-0xAB */
+	0xE2, 0x44, 0xBA, 0x6B, 0xBA, 0x61, 0xE2, 0x4D, /* 0xAC-0xAF */
+	0xE2, 0x43, 0xE1, 0xFC, 0x00, 0x00, 0xE2, 0x57, /* 0xB0-0xB3 */
+	0xBA, 0x68, 0xE2, 0x60, 0xE1, 0xFD, 0xBA, 0x65, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE2, 0x53, 0x00, 0x00, 0xBA, 0x66, /* 0xB8-0xBB */
+	0xE2, 0x45, 0xE2, 0x50, 0xE2, 0x4C, 0xE2, 0x4E, /* 0xBC-0xBF */
+	0x00, 0x00, 0xBA, 0x60, 0xE2, 0x5F, 0xBA, 0x6E, /* 0xC0-0xC3 */
+	0xE2, 0x4F, 0x00, 0x00, 0xE2, 0x62, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE1, 0xFE, 0xE2, 0x54, 0xBA, 0x63, /* 0xC8-0xCB */
+	0xBA, 0x6C, 0xBA, 0x6A, 0xE2, 0x41, 0xE2, 0x56, /* 0xCC-0xCF */
+	0xBA, 0x69, 0x00, 0x00, 0x00, 0x00, 0xBA, 0x62, /* 0xD0-0xD3 */
+	0xE2, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE2, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xE5, 0xD5, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xE5, 0xD1, 0xE5, 0xCD, 0xE5, 0xE1, 0xE5, 0xDE, /* 0xE4-0xE7 */
+	0xBC, 0xCD, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE5, /* 0xE8-0xEB */
+	0xE5, 0xD4, 0xBC, 0xD8, 0xE5, 0xDB, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE5, 0xD0, 0xE5, 0xDA, 0xBC, 0xD5, /* 0xF0-0xF3 */
+	0xE5, 0xEE, 0x00, 0x00, 0xE5, 0xEB, 0xE5, 0xDD, /* 0xF4-0xF7 */
+	0xE5, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE2, /* 0xF8-0xFB */
+	0xE5, 0xE4, 0xBC, 0xD1, 0xE5, 0xD8, 0xE5, 0xD3, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6A[512] = {
+	0xE5, 0xCA, 0xBC, 0xCE, 0xBC, 0xD6, 0x00, 0x00, /* 0x00-0x03 */
+	0xE5, 0xE7, 0xBC, 0xD7, 0xE5, 0xCB, 0xE5, 0xED, /* 0x04-0x07 */
+	0xE5, 0xE0, 0xE5, 0xE6, 0xBC, 0xD4, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE5, 0xE3, 0x00, 0x00, 0xE5, 0xEA, /* 0x0C-0x0F */
+	0x00, 0x00, 0xBC, 0xD9, 0x00, 0x00, 0xBC, 0xD3, /* 0x10-0x13 */
+	0xE5, 0xDC, 0xE5, 0xCF, 0xE5, 0xEF, 0xE5, 0xCC, /* 0x14-0x17 */
+	0xE5, 0xE8, 0xBC, 0xD0, 0x00, 0x00, 0xE5, 0xD6, /* 0x18-0x1B */
+	0x00, 0x00, 0xE5, 0xD7, 0xBC, 0xCF, 0xBC, 0xCC, /* 0x1C-0x1F */
+	0xE5, 0xD2, 0xBC, 0xD2, 0x00, 0x00, 0xBC, 0xCB, /* 0x20-0x23 */
+	0x00, 0x00, 0xE5, 0xE9, 0xE5, 0xEC, 0xE5, 0xD9, /* 0x24-0x27 */
+	0xE9, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xC2, 0x00, 0x00, /* 0x30-0x33 */
+	0xE9, 0xBE, 0xBE, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xBE, 0xEB, 0xBE, 0xF0, 0xBE, 0xEC, 0xE9, 0xCC, /* 0x38-0x3B */
+	0xE9, 0xD7, 0xBE, 0xEA, 0xE9, 0xC4, 0xE9, 0xCD, /* 0x3C-0x3F */
+	0xE5, 0xDF, 0xE9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xBE, 0xF1, 0x00, 0x00, 0xE9, 0xDD, 0xBE, 0xF5, /* 0x44-0x47 */
+	0xBE, 0xF8, 0xE9, 0xC0, 0x00, 0x00, 0xBE, 0xF4, /* 0x48-0x4B */
+	0x00, 0x00, 0xE9, 0xDB, 0xE9, 0xDC, 0xE9, 0xD2, /* 0x4C-0x4F */
+	0xE9, 0xD1, 0xE9, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xE9, 0xD3, 0xE9, 0xDA, 0xE9, 0xD9, 0x00, 0x00, /* 0x54-0x57 */
+	0xBE, 0xEF, 0xBE, 0xED, 0xE9, 0xCB, 0xE9, 0xC8, /* 0x58-0x5B */
+	0x00, 0x00, 0xE9, 0xC5, 0xE9, 0xD8, 0xBE, 0xF7, /* 0x5C-0x5F */
+	0xE9, 0xD6, 0xBE, 0xF3, 0xBE, 0xF2, 0x00, 0x00, /* 0x60-0x63 */
+	0xE9, 0xD0, 0x00, 0x00, 0xE9, 0xBF, 0xE9, 0xC1, /* 0x64-0x67 */
+	0xE9, 0xC3, 0xE9, 0xD5, 0xE9, 0xCF, 0xBE, 0xEE, /* 0x68-0x6B */
+	0x00, 0x00, 0xE9, 0xC6, 0x00, 0x00, 0xE9, 0xD4, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xC7, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0xCF, 0xED, 0x45, /* 0x7C-0x7F */
+	
+	0xC0, 0xC8, 0xEC, 0xF5, 0x00, 0x00, 0xED, 0x41, /* 0x80-0x83 */
+	0xC0, 0xCA, 0xED, 0x48, 0x00, 0x00, 0xEC, 0xFC, /* 0x84-0x87 */
+	0x00, 0x00, 0xEC, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xED, 0x49, 0xEC, 0xF3, 0xEC, 0xFE, 0x00, 0x00, /* 0x8C-0x8F */
+	0xC0, 0xD1, 0xED, 0x44, 0xED, 0x4A, 0xEC, 0xFD, /* 0x90-0x93 */
+	0xC0, 0xC9, 0xED, 0x40, 0xEC, 0xF4, 0xC0, 0xD0, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0x47, 0xEC, 0xF9, /* 0x98-0x9B */
+	0xC0, 0xCC, 0x00, 0x00, 0xEC, 0xFB, 0xEC, 0xF8, /* 0x9C-0x9F */
+	0xC0, 0xD2, 0xEC, 0xFA, 0xC0, 0xCB, 0xC0, 0xCE, /* 0xA0-0xA3 */
+	0xED, 0x43, 0xEC, 0xF6, 0xED, 0x46, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xED, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xC2, 0x63, 0xEF, 0xE7, 0xC2, 0x68, 0xC2, 0x69, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x62, /* 0xB0-0xB3 */
+	0xEF, 0xE6, 0x00, 0x00, 0xEF, 0xE3, 0xEF, 0xE4, /* 0xB4-0xB7 */
+	0xC2, 0x66, 0xEF, 0xDE, 0xEF, 0xE2, 0xC2, 0x65, /* 0xB8-0xBB */
+	0x00, 0x00, 0xEF, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xC2, 0x67, 0xC2, 0x64, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xEF, 0xDD, 0xEF, 0xE1, 0xEF, 0xE5, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0x51, /* 0xC8-0xCB */
+	0xF2, 0x4E, 0xF2, 0x57, 0x00, 0x00, 0xF2, 0x56, /* 0xCC-0xCF */
+	0xF2, 0x54, 0xF2, 0x4F, 0x00, 0x00, 0xC3, 0x72, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xF2, 0x50, 0xC3, 0x71, 0xC0, 0xCD, /* 0xD8-0xDB */
+	0xF2, 0x53, 0xC3, 0x70, 0xF2, 0x58, 0xF2, 0x52, /* 0xDC-0xDF */
+	0xF2, 0x4D, 0xEF, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xC3, 0x6F, 0x00, 0x00, 0xF2, 0x4C, /* 0xE4-0xE7 */
+	0xF4, 0x56, 0x00, 0x00, 0xF4, 0x55, 0xF2, 0x55, /* 0xE8-0xEB */
+	0xC4, 0x68, 0x00, 0x00, 0xF4, 0x59, 0xF4, 0x5A, /* 0xEC-0xEF */
+	0xF4, 0x54, 0xF4, 0x58, 0x00, 0x00, 0xF4, 0x53, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xF5, 0xD1, 0xF4, 0x57, 0xC4, 0xE7, 0xC4, 0xE5, /* 0xF8-0xFB */
+	0xF5, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6B[512] = {
+	0xF5, 0xD2, 0x00, 0x00, 0xF5, 0xCE, 0xF5, 0xD0, /* 0x00-0x03 */
+	0xC4, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xF6, 0xE5, 0xF6, 0xE6, 0xC5, 0x76, 0xF6, 0xE4, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE2, /* 0x0C-0x0F */
+	0xC5, 0xCF, 0xF7, 0xE0, 0xF7, 0xE1, 0xF8, 0xAC, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xC6, 0x56, 0xF8, 0xF3, /* 0x14-0x17 */
+	0xF8, 0xF1, 0xF8, 0xF2, 0xF8, 0xF4, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xBB, 0x00, 0x00, /* 0x1C-0x1F */
+	0xA4, 0xED, 0xA6, 0xB8, 0x00, 0x00, 0xAA, 0x59, /* 0x20-0x23 */
+	0x00, 0x00, 0xCC, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xCF, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0xD1, 0xF5, 0xD1, 0xF7, 0x00, 0x00, 0xD1, 0xF6, /* 0x2C-0x2F */
+	0x00, 0x00, 0xD1, 0xF8, 0xB1, 0xFD, 0xD5, 0xD7, /* 0x30-0x33 */
+	0xD1, 0xF9, 0x00, 0x00, 0xD5, 0xD6, 0xD5, 0xD8, /* 0x34-0x37 */
+	0xD5, 0xD9, 0xD9, 0xDA, 0xB4, 0xDB, 0xD9, 0xDB, /* 0x38-0x3B */
+	0xD9, 0xDD, 0xB4, 0xDC, 0xB4, 0xDA, 0xD9, 0xDC, /* 0x3C-0x3F */
+	0x00, 0x00, 0xDD, 0xFA, 0xDD, 0xF8, 0xDD, 0xF7, /* 0x40-0x43 */
+	0x00, 0x00, 0xDD, 0xF6, 0xDD, 0xF5, 0xB7, 0xB2, /* 0x44-0x47 */
+	0xDD, 0xF9, 0xBA, 0x70, 0xE2, 0x63, 0xE2, 0x65, /* 0x48-0x4B */
+	0xBA, 0x71, 0xE2, 0x64, 0xBC, 0xDB, 0x00, 0x00, /* 0x4C-0x4F */
+	0xBC, 0xDA, 0xE5, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xE9, 0xDF, 0xE9, 0xDE, 0xE9, 0xE0, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xBE, 0xF9, 0x00, 0x00, 0xED, 0x4B, /* 0x58-0x5B */
+	0xC0, 0xD3, 0x00, 0x00, 0xEF, 0xE8, 0xC2, 0x6A, /* 0x5C-0x5F */
+	0xF2, 0x59, 0xC5, 0x77, 0xA4, 0xEE, 0xA5, 0xBF, /* 0x60-0x63 */
+	0xA6, 0xB9, 0xA8, 0x42, 0xAA, 0x5A, 0xAA, 0x5B, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xAC, 0x6E, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xD1, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0xB3, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xD1, 0xBE, 0xFA, /* 0x74-0x77 */
+	0xC2, 0x6B, 0xA4, 0xEF, 0x00, 0x00, 0xA6, 0xBA, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xEB, 0xAA, 0x5C, /* 0x7C-0x7F */
+	
+	0xCC, 0xEA, 0x00, 0x00, 0xCF, 0x65, 0xAC, 0x6F, /* 0x80-0x83 */
+	0xCF, 0x66, 0x00, 0x00, 0xAC, 0x70, 0x00, 0x00, /* 0x84-0x87 */
+	0xD1, 0xFC, 0xAE, 0xEE, 0xAE, 0xED, 0x00, 0x00, /* 0x88-0x8B */
+	0xD5, 0xDE, 0xD5, 0xDC, 0xD5, 0xDD, 0xD5, 0xDB, /* 0x8C-0x8F */
+	0x00, 0x00, 0xD5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xD9, 0xDE, 0xD9, 0xE1, 0xB4, 0xDE, 0xD9, 0xDF, /* 0x94-0x97 */
+	0xB4, 0xDD, 0xD9, 0xE0, 0x00, 0x00, 0xDD, 0xFB, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x66, 0xE2, 0x67, /* 0x9C-0x9F */
+	0xE2, 0x68, 0x00, 0x00, 0xE5, 0xF3, 0xE5, 0xF2, /* 0xA0-0xA3 */
+	0xBC, 0xDC, 0xE5, 0xF1, 0xE5, 0xF4, 0xE9, 0xE1, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xE2, 0xE9, 0xE3, /* 0xA8-0xAB */
+	0x00, 0x00, 0xED, 0x4C, 0xC0, 0xD4, 0xC2, 0x6C, /* 0xAC-0xAF */
+	0xF2, 0x5A, 0x00, 0x00, 0xC4, 0xE8, 0xC9, 0x5F, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xAC, 0x71, 0xCF, 0x67, 0xAE, 0xEF, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xB1, 0xFE, 0x00, 0x00, /* 0xB8-0xBB */
+	0xB4, 0xDF, 0xD9, 0xE2, 0x00, 0x00, 0xB7, 0xB5, /* 0xBC-0xBF */
+	0xB7, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x69, /* 0xC0-0xC3 */
+	0xE2, 0x6A, 0xBC, 0xDD, 0xBC, 0xDE, 0xE9, 0xE5, /* 0xC4-0xC7 */
+	0xE9, 0xE4, 0xEF, 0xE9, 0xF7, 0xE3, 0xA4, 0xF0, /* 0xC8-0xCB */
+	0xC9, 0x60, 0xA5, 0xC0, 0x00, 0x00, 0xA8, 0x43, /* 0xCC-0xCF */
+	0xCB, 0x48, 0x00, 0x00, 0xAC, 0x72, 0xB7, 0xB6, /* 0xD0-0xD3 */
+	0xA4, 0xF1, 0x00, 0x00, 0xCF, 0x68, 0xAC, 0x73, /* 0xD4-0xD7 */
+	0xCF, 0x69, 0x00, 0x00, 0xC0, 0xD5, 0xA4, 0xF2, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0xCC, 0xEC, 0x00, 0x00, /* 0xDC-0xDF */
+	0xCF, 0x6A, 0x00, 0x00, 0xD2, 0x42, 0xD2, 0x41, /* 0xE0-0xE3 */
+	0xD1, 0xFE, 0x00, 0x00, 0xD1, 0xFD, 0xD2, 0x43, /* 0xE4-0xE7 */
+	0xD2, 0x40, 0x00, 0x00, 0x00, 0x00, 0xB2, 0x40, /* 0xE8-0xEB */
+	0xB2, 0x41, 0x00, 0x00, 0x00, 0x00, 0xB4, 0xE0, /* 0xEC-0xEF */
+	0xD9, 0xE3, 0x00, 0x00, 0xD9, 0xE4, 0xD9, 0xE5, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0x41, /* 0xF4-0xF7 */
+	0xDE, 0x42, 0xDE, 0x40, 0x00, 0x00, 0xDD, 0xFD, /* 0xF8-0xFB */
+	0xDD, 0xFE, 0xB7, 0xB7, 0xE2, 0x6B, 0xE5, 0xF7, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6C[512] = {
+	0xE5, 0xF6, 0xE5, 0xF5, 0xE5, 0xF8, 0xE9, 0xE7, /* 0x00-0x03 */
+	0xE9, 0xE6, 0xBE, 0xFB, 0xE9, 0xE8, 0x00, 0x00, /* 0x04-0x07 */
+	0xC0, 0xD6, 0xED, 0x4D, 0x00, 0x00, 0xEF, 0xEA, /* 0x08-0x0B */
+	0xF2, 0x5B, 0xF6, 0xE7, 0x00, 0x00, 0xA4, 0xF3, /* 0x0C-0x0F */
+	0xA5, 0xC2, 0xA5, 0xC1, 0x00, 0x00, 0xAA, 0x5D, /* 0x10-0x13 */
+	0xC9, 0x61, 0xC9, 0x7E, 0xA6, 0xBB, 0x00, 0x00, /* 0x14-0x17 */
+	0xC9, 0xF7, 0xCB, 0x49, 0xCB, 0x4A, 0xAA, 0x5E, /* 0x18-0x1B */
+	0x00, 0x00, 0xCC, 0xED, 0x00, 0x00, 0xAC, 0x74, /* 0x1C-0x1F */
+	0xCF, 0x6B, 0xCF, 0x6C, 0x00, 0x00, 0xAE, 0xF0, /* 0x20-0x23 */
+	0xAE, 0xF4, 0xD2, 0x44, 0xAE, 0xF3, 0xAE, 0xF1, /* 0x24-0x27 */
+	0xAE, 0xF2, 0x00, 0x00, 0xD5, 0xDF, 0xB2, 0x42, /* 0x28-0x2B */
+	0xB4, 0xE3, 0x00, 0x00, 0xB4, 0xE1, 0xB4, 0xE2, /* 0x2C-0x2F */
+	0xD9, 0xE6, 0x00, 0x00, 0x00, 0x00, 0xBA, 0x72, /* 0x30-0x33 */
+	0xA4, 0xF4, 0x00, 0x00, 0xC9, 0xA1, 0x00, 0x00, /* 0x34-0x37 */
+	0xA5, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xC9, 0xA4, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xA5, 0xC6, 0xC9, 0xA3, /* 0x3C-0x3F */
+	0xA5, 0xC5, 0xA5, 0xC4, 0xA8, 0x44, 0xC9, 0xA2, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xC9, 0xF8, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xC9, 0xFC, 0xC9, 0xFE, /* 0x48-0x4B */
+	0xCA, 0x40, 0xA6, 0xC5, 0xA6, 0xC6, 0xC9, 0xFB, /* 0x4C-0x4F */
+	0xA6, 0xC1, 0x00, 0x00, 0xC9, 0xF9, 0x00, 0x00, /* 0x50-0x53 */
+	0xC9, 0xFD, 0xA6, 0xC2, 0x00, 0x00, 0xA6, 0xBD, /* 0x54-0x57 */
+	0x00, 0x00, 0xA6, 0xBE, 0x00, 0x00, 0xA6, 0xC4, /* 0x58-0x5B */
+	0xC9, 0xFA, 0xA6, 0xBC, 0xA8, 0x45, 0xA6, 0xBF, /* 0x5C-0x5F */
+	0xA6, 0xC0, 0xA6, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xCB, 0x5B, 0xCB, 0x59, 0xCB, 0x4C, /* 0x64-0x67 */
+	0xA8, 0x51, 0xCB, 0x53, 0xA8, 0x4C, 0xCB, 0x4D, /* 0x68-0x6B */
+	0x00, 0x00, 0xCB, 0x55, 0x00, 0x00, 0xCB, 0x52, /* 0x6C-0x6F */
+	0xA8, 0x4F, 0xCB, 0x51, 0xA8, 0x56, 0xCB, 0x5A, /* 0x70-0x73 */
+	0xA8, 0x58, 0x00, 0x00, 0xA8, 0x5A, 0x00, 0x00, /* 0x74-0x77 */
+	0xCB, 0x4B, 0x00, 0x00, 0xA8, 0x4D, 0xCB, 0x5C, /* 0x78-0x7B */
+	0x00, 0x00, 0xA8, 0x54, 0xA8, 0x57, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xCD, 0x45, 0xA8, 0x47, 0xA8, 0x5E, 0xA8, 0x55, /* 0x80-0x83 */
+	0xCB, 0x4E, 0xA8, 0x4A, 0xA8, 0x59, 0xCB, 0x56, /* 0x84-0x87 */
+	0xA8, 0x48, 0xA8, 0x49, 0xCD, 0x43, 0xCB, 0x4F, /* 0x88-0x8B */
+	0xA8, 0x50, 0xA8, 0x5B, 0xCB, 0x5D, 0xCB, 0x50, /* 0x8C-0x8F */
+	0xA8, 0x4E, 0x00, 0x00, 0xA8, 0x53, 0xCC, 0xEE, /* 0x90-0x93 */
+	0xA8, 0x5C, 0xCB, 0x57, 0xA8, 0x52, 0x00, 0x00, /* 0x94-0x97 */
+	0xA8, 0x5D, 0xA8, 0x46, 0xCB, 0x54, 0xA8, 0x4B, /* 0x98-0x9B */
+	0xCB, 0x58, 0xCD, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0x6A, /* 0xA8-0xAB */
+	0xAA, 0x7A, 0xCC, 0xF5, 0xAA, 0x71, 0x00, 0x00, /* 0xAC-0xAF */
+	0xCD, 0x4B, 0xAA, 0x62, 0x00, 0x00, 0xAA, 0x65, /* 0xB0-0xB3 */
+	0xCD, 0x42, 0x00, 0x00, 0xCC, 0xF3, 0xCC, 0xF7, /* 0xB4-0xB7 */
+	0xAA, 0x6D, 0xAA, 0x6F, 0xCC, 0xFA, 0xAA, 0x76, /* 0xB8-0xBB */
+	0xAA, 0x68, 0xAA, 0x66, 0xAA, 0x67, 0xAA, 0x75, /* 0xBC-0xBF */
+	0xCD, 0x47, 0xAA, 0x70, 0xCC, 0xF9, 0xCC, 0xFB, /* 0xC0-0xC3 */
+	0xAA, 0x6E, 0xAA, 0x73, 0xCC, 0xFC, 0xCD, 0x4A, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xAC, 0x75, 0xAA, 0x79, 0x00, 0x00, /* 0xC8-0xCB */
+	0xAA, 0x63, 0xCD, 0x49, 0x00, 0x00, 0xCD, 0x4D, /* 0xCC-0xCF */
+	0xCC, 0xF8, 0xCD, 0x4F, 0xCD, 0x40, 0xAA, 0x6C, /* 0xD0-0xD3 */
+	0xCC, 0xF4, 0xAA, 0x6B, 0xAA, 0x7D, 0xAA, 0x72, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xCC, 0xF2, 0xCF, 0x75, 0xAA, 0x78, /* 0xD8-0xDB */
+	0xAA, 0x7C, 0xCD, 0x41, 0xCD, 0x46, 0x00, 0x00, /* 0xDC-0xDF */
+	0xAA, 0x7E, 0xAA, 0x77, 0xAA, 0x69, 0xAA, 0x5F, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xAA, 0x64, 0x00, 0x00, 0xCC, 0xF6, /* 0xE4-0xE7 */
+	0xAA, 0x60, 0xCD, 0x4E, 0x00, 0x00, 0xCC, 0xF0, /* 0xE8-0xEB */
+	0xCC, 0xEF, 0xCC, 0xFD, 0xCC, 0xF1, 0xAA, 0x7B, /* 0xEC-0xEF */
+	0xAE, 0xF5, 0xAA, 0x74, 0xCC, 0xFE, 0xAA, 0x61, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xAC, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xCD, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_6D[512] = {
+	0xCF, 0x7C, 0xCF, 0xA1, 0x00, 0x00, 0xCF, 0xA4, /* 0x00-0x03 */
+	0xCF, 0x77, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xA7, /* 0x04-0x07 */
+	0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0x74, 0xAC, 0x76, /* 0x08-0x0B */
+	0xAC, 0x7B, 0xD2, 0x49, 0xAC, 0xAD, 0xCF, 0xA5, /* 0x0C-0x0F */
+	0xCF, 0xAD, 0xCF, 0x7B, 0xCF, 0x73, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xD2, 0x64, 0xAC, 0x7E, /* 0x14-0x17 */
+	0xCF, 0xA2, 0xCF, 0x78, 0xCF, 0x7A, 0xAC, 0xA5, /* 0x18-0x1B */
+	0x00, 0x00, 0xCF, 0x7D, 0xAC, 0x7D, 0xCF, 0x70, /* 0x1C-0x1F */
+	0xCF, 0xA8, 0x00, 0x00, 0xCF, 0xAB, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xAC, 0x7A, 0x00, 0x00, 0xAC, 0xA8, /* 0x24-0x27 */
+	0xCF, 0x6D, 0xAC, 0xAA, 0xAC, 0x78, 0xAC, 0xAE, /* 0x28-0x2B */
+	0xCF, 0xA9, 0xCF, 0x6F, 0xAC, 0xAB, 0xD2, 0x5E, /* 0x2C-0x2F */
+	0xCD, 0x48, 0xAC, 0x7C, 0xAC, 0x77, 0xCF, 0x76, /* 0x30-0x33 */
+	0xCF, 0x6E, 0xAC, 0xAC, 0xAC, 0xA4, 0xCF, 0xA3, /* 0x34-0x37 */
+	0xAC, 0xA9, 0xAC, 0xA7, 0xCF, 0x79, 0xAC, 0xA1, /* 0x38-0x3B */
+	0xCF, 0x71, 0xAC, 0xA2, 0xAC, 0xA3, 0xCF, 0x72, /* 0x3C-0x3F */
+	0xCF, 0xA6, 0xAC, 0x79, 0xCF, 0x7E, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xD2, 0x4C, 0xAE, 0xFD, 0xAF, 0x43, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xD2, 0x55, 0xD2, 0x5B, /* 0x5C-0x5F */
+	0xD2, 0x57, 0xD2, 0x4A, 0xD2, 0x4D, 0xD2, 0x46, /* 0x60-0x63 */
+	0xD2, 0x47, 0xAF, 0x4A, 0xAE, 0xFA, 0xD2, 0x56, /* 0x64-0x67 */
+	0xD2, 0x5F, 0xAF, 0x45, 0xAE, 0xF6, 0x00, 0x00, /* 0x68-0x6B */
+	0xAF, 0x40, 0xD2, 0x4E, 0xAF, 0x42, 0xD2, 0x4F, /* 0x6C-0x6F */
+	0xD2, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xAF, 0x44, 0xD2, 0x68, 0xD2, 0x48, 0xAE, 0xFC, /* 0x74-0x77 */
+	0xAE, 0xFB, 0xAF, 0x48, 0xD2, 0x45, 0xD2, 0x66, /* 0x78-0x7B */
+	0xD2, 0x5A, 0xD2, 0x67, 0xD2, 0x61, 0xD2, 0x53, /* 0x7C-0x7F */
+	
+	0xD2, 0x62, 0x00, 0x00, 0xD2, 0x5C, 0xD2, 0x65, /* 0x80-0x83 */
+	0xD2, 0x63, 0xAF, 0x49, 0xD2, 0x54, 0xAE, 0xF9, /* 0x84-0x87 */
+	0xAE, 0xF8, 0xAF, 0x41, 0xAF, 0x47, 0xD2, 0x60, /* 0x88-0x8B */
+	0xAF, 0x46, 0xD2, 0x51, 0xB2, 0x43, 0x00, 0x00, /* 0x8C-0x8F */
+	0xD2, 0x69, 0xD2, 0x50, 0xD2, 0x4B, 0xAE, 0xFE, /* 0x90-0x93 */
+	0xAF, 0x4B, 0xAE, 0xF7, 0x00, 0x00, 0xD2, 0x58, /* 0x94-0x97 */
+	0xD2, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0x65, 0xD5, 0xE1, /* 0xA8-0xAB */
+	0xD5, 0xE5, 0x00, 0x00, 0xB2, 0x52, 0xB2, 0x50, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0x47, 0xD5, 0xE3, /* 0xB0-0xB3 */
+	0xD5, 0xE2, 0xB2, 0x5B, 0x00, 0x00, 0xD5, 0xE8, /* 0xB4-0xB7 */
+	0xB2, 0x55, 0x00, 0x00, 0xD5, 0xFA, 0xD6, 0x47, /* 0xB8-0xBB */
+	0xB2, 0x44, 0xD5, 0xF7, 0xD5, 0xF0, 0xB2, 0x67, /* 0xBC-0xBF */
+	0xD5, 0xE0, 0x00, 0x00, 0xD5, 0xFC, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xB2, 0x64, 0xB2, 0x58, 0xB2, 0x63, 0xB2, 0x4E, /* 0xC4-0xC7 */
+	0xD5, 0xEC, 0xD5, 0xFE, 0xD5, 0xF6, 0xB2, 0x4F, /* 0xC8-0xCB */
+	0xB2, 0x49, 0xD6, 0x45, 0x00, 0x00, 0xD5, 0xFD, /* 0xCC-0xCF */
+	0xD6, 0x40, 0xB2, 0x51, 0xB2, 0x59, 0xD6, 0x42, /* 0xD0-0xD3 */
+	0xD5, 0xEA, 0xD5, 0xFB, 0xD5, 0xEF, 0xD6, 0x44, /* 0xD4-0xD7 */
+	0xB2, 0x5E, 0xB2, 0x46, 0xB2, 0x5C, 0xD5, 0xF4, /* 0xD8-0xDB */
+	0xD5, 0xF2, 0xD5, 0xF3, 0xB2, 0x53, 0xD5, 0xEE, /* 0xDC-0xDF */
+	0xD5, 0xED, 0xB2, 0x48, 0xD5, 0xE7, 0xD6, 0x46, /* 0xE0-0xE3 */
+	0xB2, 0x4A, 0xD5, 0xF1, 0xB2, 0x68, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xB2, 0x62, 0xD5, 0xE6, 0xB2, 0x5F, 0xB2, 0x5D, /* 0xE8-0xEB */
+	0xB2, 0x66, 0xD5, 0xF8, 0xB2, 0x61, 0xD2, 0x52, /* 0xEC-0xEF */
+	0xD5, 0xF9, 0xB2, 0x60, 0xD6, 0x41, 0xB2, 0x45, /* 0xF0-0xF3 */
+	0xD5, 0xF5, 0xB2, 0x57, 0xD5, 0xE9, 0xB2, 0x56, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xB2, 0x54, 0xB2, 0x4C, 0xB2, 0x4B, /* 0xF8-0xFB */
+	0xD9, 0xE7, 0xD6, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6E[512] = {
+	0xD5, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFC, /* 0x00-0x03 */
+	0x00, 0x00, 0xB2, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xB5, 0x41, 0xB2, 0x5A, 0xB4, 0xEE, /* 0x18-0x1B */
+	0xD9, 0xF6, 0xB4, 0xFC, 0x00, 0x00, 0xD9, 0xEA, /* 0x1C-0x1F */
+	0xB4, 0xEB, 0xB4, 0xE7, 0xDA, 0x49, 0xB4, 0xED, /* 0x20-0x23 */
+	0xB4, 0xF1, 0xB4, 0xEC, 0xB4, 0xF5, 0xDA, 0x4D, /* 0x24-0x27 */
+	0xDA, 0x44, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF1, /* 0x28-0x2B */
+	0xB4, 0xFA, 0xB4, 0xF4, 0xD9, 0xFD, 0xB4, 0xE4, /* 0x2C-0x2F */
+	0xDA, 0x4A, 0xDA, 0x43, 0xB4, 0xE8, 0xD9, 0xF7, /* 0x30-0x33 */
+	0xB4, 0xF7, 0xDA, 0x55, 0xDA, 0x56, 0x00, 0x00, /* 0x34-0x37 */
+	0xB4, 0xE5, 0xDA, 0x48, 0xB4, 0xF9, 0xD9, 0xFB, /* 0x38-0x3B */
+	0xD9, 0xED, 0xD9, 0xEE, 0xB4, 0xFD, 0xD9, 0xF2, /* 0x3C-0x3F */
+	0xD9, 0xF9, 0xD9, 0xF3, 0x00, 0x00, 0xB4, 0xFB, /* 0x40-0x43 */
+	0xB5, 0x44, 0xD9, 0xEF, 0xD9, 0xE8, 0xD9, 0xE9, /* 0x44-0x47 */
+	0x00, 0x00, 0xD9, 0xEB, 0xB4, 0xEA, 0xD9, 0xF8, /* 0x48-0x4B */
+	0x00, 0x00, 0xB4, 0xF8, 0xB5, 0x42, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xD9, 0xFA, 0xDA, 0x53, 0xDA, 0x4B, /* 0x50-0x53 */
+	0xB4, 0xE6, 0xDA, 0x51, 0xB4, 0xF2, 0x00, 0x00, /* 0x54-0x57 */
+	0xB4, 0xF0, 0x00, 0x00, 0xDA, 0x57, 0xB4, 0xEF, /* 0x58-0x5B */
+	0xDA, 0x41, 0xD9, 0xF4, 0xD9, 0xFE, 0xB5, 0x47, /* 0x5C-0x5F */
+	0xDA, 0x45, 0xDA, 0x42, 0xD9, 0xF0, 0xB5, 0x43, /* 0x60-0x63 */
+	0xDA, 0x4F, 0xDA, 0x4C, 0xDA, 0x54, 0xB4, 0xE9, /* 0x64-0x67 */
+	0xDA, 0x40, 0xB5, 0x46, 0x00, 0x00, 0xDA, 0x47, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xB4, 0xF3, 0xB4, 0xF6, /* 0x6C-0x6F */
+	0x00, 0x00, 0xDA, 0x46, 0xB5, 0x45, 0xD9, 0xF5, /* 0x70-0x73 */
+	0xD5, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xDA, 0x50, /* 0x74-0x77 */
+	0xDA, 0x4E, 0xDA, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xD9, 0xEC, 0xB5, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xDE, 0x61, 0xDE, 0x60, 0xDE, 0x46, /* 0x8C-0x8F */
+	0xB7, 0xBD, 0x00, 0x00, 0xDE, 0x5F, 0xDE, 0x49, /* 0x90-0x93 */
+	0xDE, 0x4A, 0x00, 0x00, 0xB7, 0xC7, 0xDE, 0x68, /* 0x94-0x97 */
+	0xB7, 0xC2, 0xDE, 0x5E, 0x00, 0x00, 0xDE, 0x43, /* 0x98-0x9B */
+	0xB7, 0xC8, 0xB7, 0xBE, 0xDE, 0x52, 0xDE, 0x48, /* 0x9C-0x9F */
+	0xDE, 0x4B, 0xDE, 0x63, 0xB7, 0xB8, 0xDE, 0x6A, /* 0xA0-0xA3 */
+	0xDE, 0x62, 0xB7, 0xC1, 0xDE, 0x57, 0xB7, 0xCC, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0xCB, 0xB7, 0xC5, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0x69, 0xB7, 0xB9, /* 0xAC-0xAF */
+	0xDE, 0x55, 0xDE, 0x4C, 0xDE, 0x59, 0xDE, 0x65, /* 0xB0-0xB3 */
+	0xB7, 0xCD, 0x00, 0x00, 0xB7, 0xBB, 0xDE, 0x54, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xDE, 0x4D, 0xB7, 0xC4, 0x00, 0x00, /* 0xB8-0xBB */
+	0xB7, 0xC3, 0xDE, 0x50, 0xDE, 0x5A, 0xDE, 0x64, /* 0xBC-0xBF */
+	0xDE, 0x47, 0xDE, 0x51, 0xB7, 0xBC, 0xDE, 0x5B, /* 0xC0-0xC3 */
+	0xB7, 0xC9, 0xB7, 0xC0, 0xDE, 0x4E, 0xB7, 0xBF, /* 0xC4-0xC7 */
+	0xDE, 0x45, 0xDE, 0x53, 0xDE, 0x67, 0xB4, 0xFE, /* 0xC8-0xCB */
+	0xBA, 0xB0, 0xDE, 0x56, 0xE2, 0x6C, 0xDE, 0x58, /* 0xCC-0xCF */
+	0xDE, 0x66, 0xB7, 0xC6, 0xDE, 0x4F, 0xB7, 0xBA, /* 0xD0-0xD3 */
+	0xB7, 0xCA, 0xBC, 0xF0, 0xDE, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xDE, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xDE, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xAA, /* 0xE8-0xEB */
+	0xBA, 0xAD, 0xE2, 0x7D, 0xE2, 0xA4, 0xBA, 0xA2, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE2, 0x6E, 0xBA, 0xAF, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xBA, 0x77, 0xE2, 0x6D, 0xE2, 0xB0, 0xBA, 0xB1, /* 0xF4-0xF7 */
+	0xE2, 0x71, 0xE2, 0xA3, 0x00, 0x00, 0xE2, 0x73, /* 0xF8-0xFB */
+	0xE2, 0xB3, 0xE2, 0xAF, 0xBA, 0x75, 0xBA, 0xA1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_6F[512] = {
+	0xE6, 0x53, 0xBA, 0xAE, 0xBA, 0x7D, 0xE2, 0x6F, /* 0x00-0x03 */
+	0x00, 0x00, 0xE2, 0xAE, 0xBA, 0xA3, 0xE2, 0xAB, /* 0x04-0x07 */
+	0xE2, 0xB8, 0xE2, 0x75, 0xE2, 0x7E, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE2, 0xB6, 0xE2, 0xAC, 0xBA, 0x7C, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x7C, 0xBA, 0x76, /* 0x10-0x13 */
+	0xBA, 0x74, 0xBA, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xE2, 0x7A, 0xE2, 0x77, 0xE2, 0x78, 0x00, 0x00, /* 0x18-0x1B */
+	0xE2, 0xB2, 0x00, 0x00, 0xE2, 0xB7, 0xE2, 0xB5, /* 0x1C-0x1F */
+	0xBA, 0x7A, 0xE2, 0xB9, 0xBA, 0x7E, 0xBA, 0xA7, /* 0x20-0x23 */
+	0x00, 0x00, 0xE2, 0x70, 0xE5, 0xFA, 0xE2, 0x79, /* 0x24-0x27 */
+	0x00, 0x00, 0xBA, 0x78, 0xBA, 0xAC, 0xBA, 0xA9, /* 0x28-0x2B */
+	0xBA, 0x7B, 0xE2, 0xA5, 0xE2, 0x74, 0xBA, 0xAA, /* 0x2C-0x2F */
+	0xE2, 0xA7, 0xBA, 0xA4, 0xBA, 0xA6, 0xBA, 0x73, /* 0x30-0x33 */
+	0x00, 0x00, 0xE2, 0xA9, 0xE2, 0xA1, 0xE2, 0x72, /* 0x34-0x37 */
+	0xBA, 0xA5, 0xE2, 0xB1, 0xE2, 0xB4, 0xE2, 0x7B, /* 0x38-0x3B */
+	0xE2, 0xA8, 0x00, 0x00, 0xBA, 0x79, 0xBC, 0xDF, /* 0x3C-0x3F */
+	0xE2, 0xA6, 0xE5, 0xF9, 0x00, 0x00, 0xE2, 0xAD, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0x76, 0xE6, 0x44, /* 0x4C-0x4F */
+	0xE6, 0x4E, 0xBC, 0xE2, 0xE6, 0x4D, 0xE6, 0x59, /* 0x50-0x53 */
+	0xBC, 0xE4, 0xE6, 0x4B, 0x00, 0x00, 0xE6, 0x4F, /* 0x54-0x57 */
+	0xBC, 0xEF, 0x00, 0x00, 0xE6, 0x46, 0xBC, 0xE7, /* 0x58-0x5B */
+	0x00, 0x00, 0xE6, 0x52, 0xE9, 0xF0, 0xBC, 0xF3, /* 0x5C-0x5F */
+	0xBC, 0xF2, 0xE6, 0x54, 0xE6, 0x43, 0xE6, 0x5E, /* 0x60-0x63 */
+	0xBC, 0xED, 0x00, 0x00, 0xBC, 0xE3, 0xE6, 0x57, /* 0x64-0x67 */
+	0x00, 0x00, 0xE6, 0x5B, 0xE6, 0x60, 0xE6, 0x55, /* 0x68-0x6B */
+	0xE6, 0x49, 0xBC, 0xE6, 0xBC, 0xE9, 0xBC, 0xF1, /* 0x6C-0x6F */
+	0xBC, 0xEC, 0x00, 0x00, 0xE6, 0x4C, 0xE2, 0xA2, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0x48, 0xE6, 0x5F, /* 0x74-0x77 */
+	0xBC, 0xE8, 0x00, 0x00, 0xBC, 0xEB, 0xE6, 0x61, /* 0x78-0x7B */
+	0xBC, 0xE0, 0xE6, 0x56, 0xE5, 0xFB, 0xE6, 0x5C, /* 0x7C-0x7F */
+	
+	0xC0, 0xDF, 0x00, 0x00, 0xE6, 0x4A, 0x00, 0x00, /* 0x80-0x83 */
+	0xBC, 0xE1, 0xE6, 0x45, 0xBC, 0xE5, 0xE5, 0xFC, /* 0x84-0x87 */
+	0xBA, 0xAB, 0xE6, 0x41, 0x00, 0x00, 0xE6, 0x5A, /* 0x88-0x8B */
+	0xE6, 0x42, 0xE6, 0x40, 0xBC, 0xEA, 0x00, 0x00, /* 0x8C-0x8F */
+	0xE6, 0x58, 0x00, 0x00, 0xE5, 0xFE, 0xE6, 0x51, /* 0x90-0x93 */
+	0xE6, 0x50, 0xE6, 0x5D, 0xE6, 0x47, 0xBC, 0xEE, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, 0x00, 0x00, /* 0x9C-0x9F */
+	0xBF, 0x49, 0xBE, 0xFE, 0xEA, 0x40, 0xE9, 0xEB, /* 0xA0-0xA3 */
+	0xBF, 0x41, 0xE9, 0xF7, 0xBF, 0x48, 0xBF, 0x43, /* 0xA4-0xA7 */
+	0xE9, 0xF5, 0xED, 0x4F, 0xE9, 0xFB, 0xEA, 0x42, /* 0xA8-0xAB */
+	0xE9, 0xFA, 0xE9, 0xE9, 0xE9, 0xF8, 0xEA, 0x44, /* 0xAC-0xAF */
+	0xEA, 0x46, 0xBE, 0xFD, 0xEA, 0x45, 0xBF, 0x44, /* 0xB0-0xB3 */
+	0xBF, 0x4A, 0x00, 0x00, 0xBF, 0x47, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE9, 0xFE, 0xBF, 0x46, 0xE9, 0xF9, 0x00, 0x00, /* 0xB8-0xBB */
+	0xE9, 0xED, 0xE9, 0xF2, 0x00, 0x00, 0xE9, 0xFD, /* 0xBC-0xBF */
+	0xBF, 0x45, 0xBF, 0x42, 0xBE, 0xFC, 0xBF, 0x40, /* 0xC0-0xC3 */
+	0xE9, 0xF1, 0x00, 0x00, 0xE5, 0xFD, 0xE9, 0xEC, /* 0xC4-0xC7 */
+	0xE9, 0xEF, 0xEA, 0x41, 0xE9, 0xF4, 0xE9, 0xEA, /* 0xC8-0xCB */
+	0xED, 0x4E, 0xEA, 0x43, 0xE9, 0xEE, 0xE9, 0xFC, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xED, 0x51, 0xC0, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xC0, 0xD7, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xDB, /* 0xD8-0xDB */
+	0xED, 0x53, 0xED, 0x59, 0xED, 0x57, 0xC0, 0xD9, /* 0xDC-0xDF */
+	0xC0, 0xDA, 0xC0, 0xE1, 0xED, 0x5A, 0xED, 0x52, /* 0xE0-0xE3 */
+	0xC0, 0xDC, 0x00, 0x00, 0xED, 0x56, 0xED, 0x55, /* 0xE4-0xE7 */
+	0xED, 0x5B, 0xC0, 0xE2, 0x00, 0x00, 0xC0, 0xDD, /* 0xE8-0xEB */
+	0xC0, 0xE0, 0xED, 0x54, 0xC0, 0xE4, 0xC0, 0xDE, /* 0xEC-0xEF */
+	0xC0, 0xE5, 0xC0, 0xD8, 0xED, 0x58, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xED, 0x50, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF7, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xC2, 0x71, 0xEF, 0xF4, /* 0xF8-0xFB */
+	0xEF, 0xF6, 0x00, 0x00, 0xC2, 0x6F, 0xEF, 0xF2, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_70[512] = {
+	0xEF, 0xF3, 0xEF, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xE9, 0xF6, 0xEF, 0xEF, 0xC2, 0x70, 0xEF, 0xEB, /* 0x04-0x07 */
+	0x00, 0x00, 0xC2, 0x6D, 0xEF, 0xF8, 0xC2, 0x6E, /* 0x08-0x0B */
+	0xEF, 0xEC, 0xEF, 0xED, 0xEF, 0xF1, 0xC2, 0x73, /* 0x0C-0x0F */
+	0x00, 0x00, 0xC2, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xEF, 0xF0, 0xC3, 0x78, 0xF2, 0x5F, 0xF2, 0x65, /* 0x14-0x17 */
+	0xC3, 0x79, 0xF2, 0x5C, 0xC3, 0x76, 0xC3, 0x73, /* 0x18-0x1B */
+	0xF2, 0x67, 0xC3, 0x77, 0x00, 0x00, 0xC3, 0x74, /* 0x1C-0x1F */
+	0xF2, 0x5E, 0xF2, 0x61, 0xF2, 0x62, 0xF2, 0x63, /* 0x20-0x23 */
+	0xF2, 0x66, 0x00, 0x00, 0xEF, 0xF5, 0xF2, 0x5D, /* 0x24-0x27 */
+	0xC3, 0x75, 0xF2, 0x64, 0xF2, 0x68, 0xF2, 0x60, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x5D, /* 0x2C-0x2F */
+	0xC4, 0x6A, 0xF4, 0x60, 0xC4, 0x6B, 0xF4, 0x68, /* 0x30-0x33 */
+	0xF4, 0x5F, 0xF4, 0x5C, 0x00, 0x00, 0xF4, 0x5E, /* 0x34-0x37 */
+	0xF4, 0x62, 0xF4, 0x65, 0xF4, 0x64, 0xF4, 0x67, /* 0x38-0x3B */
+	0xF4, 0x5B, 0x00, 0x00, 0xC4, 0x69, 0xF4, 0x63, /* 0x3C-0x3F */
+	0xF4, 0x66, 0xF4, 0x69, 0xF4, 0x61, 0xF5, 0xD3, /* 0x40-0x43 */
+	0xF5, 0xD4, 0xF5, 0xD8, 0xF5, 0xD9, 0x00, 0x00, /* 0x44-0x47 */
+	0xF5, 0xD6, 0xF5, 0xD7, 0xF5, 0xD5, 0x00, 0x00, /* 0x48-0x4B */
+	0xC4, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xC5, 0x78, 0xF6, 0xEB, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xF6, 0xE8, 0xF6, 0xE9, 0xF6, 0xEA, /* 0x54-0x57 */
+	0xC5, 0x79, 0x00, 0x00, 0xF7, 0xE5, 0xF7, 0xE4, /* 0x58-0x5B */
+	0x00, 0x00, 0xF8, 0xAF, 0xC5, 0xF4, 0xF8, 0xAD, /* 0x5C-0x5F */
+	0xF8, 0xB0, 0xF8, 0xAE, 0xF8, 0xF5, 0xC6, 0x57, /* 0x60-0x63 */
+	0xC6, 0x65, 0xF9, 0xA3, 0xF9, 0x6C, 0x00, 0x00, /* 0x64-0x67 */
+	0xF9, 0xA2, 0xF9, 0xD0, 0xF9, 0xD1, 0xA4, 0xF5, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xA6, 0xC7, 0xCA, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xCB, 0x5E, 0x00, 0x00, 0xA8, 0x5F, 0x00, 0x00, /* 0x74-0x77 */
+	0xA8, 0x62, 0x00, 0x00, 0xCB, 0x5F, 0x00, 0x00, /* 0x78-0x7B */
+	0xA8, 0x60, 0xA8, 0x61, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xCD, 0x58, 0xCD, 0x5A, /* 0x80-0x83 */
+	0xCD, 0x55, 0xCD, 0x52, 0xCD, 0x54, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xAA, 0xA4, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xAA, 0xA2, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xCD, 0x56, 0xAA, 0xA3, 0xCD, 0x53, /* 0x90-0x93 */
+	0xCD, 0x50, 0xAA, 0xA1, 0xCD, 0x57, 0x00, 0x00, /* 0x94-0x97 */
+	0xCD, 0x51, 0xAA, 0xA5, 0xCD, 0x59, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xAF, /* 0x9C-0x9F */
+	0x00, 0x00, 0xCF, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xAC, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xCF, 0xB6, 0x00, 0x00, 0xAC, 0xAF, /* 0xA8-0xAB */
+	0xAC, 0xB2, 0xAC, 0xB4, 0xAC, 0xB6, 0xAC, 0xB3, /* 0xAC-0xAF */
+	0xCF, 0xB2, 0xCF, 0xB1, 0x00, 0x00, 0xAC, 0xB1, /* 0xB0-0xB3 */
+	0xCF, 0xB4, 0xCF, 0xB5, 0x00, 0x00, 0xCF, 0xAE, /* 0xB4-0xB7 */
+	0xAC, 0xB5, 0x00, 0x00, 0xAC, 0xB0, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xCF, 0xB0, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, /* 0xC4-0xC7 */
+	0xAF, 0x50, 0x00, 0x00, 0xAF, 0x4C, 0xD2, 0x6E, /* 0xC8-0xCB */
+	0x00, 0x00, 0xD2, 0x76, 0xD2, 0x7B, 0xAF, 0x51, /* 0xCC-0xCF */
+	0x00, 0x00, 0xD2, 0x6C, 0xD2, 0x72, 0xD2, 0x6B, /* 0xD0-0xD3 */
+	0xD2, 0x75, 0x00, 0x00, 0x00, 0x00, 0xD2, 0x71, /* 0xD4-0xD7 */
+	0xAF, 0x4D, 0xAF, 0x4F, 0xD2, 0x7A, 0x00, 0x00, /* 0xD8-0xDB */
+	0xD2, 0x6A, 0xD2, 0x6D, 0xD2, 0x73, 0x00, 0x00, /* 0xDC-0xDF */
+	0xD2, 0x74, 0xD2, 0x7C, 0xD2, 0x70, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xAF, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB2, 0x6D, /* 0xEC-0xEF */
+	0xD6, 0x4E, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x50, /* 0xF0-0xF3 */
+	0xD6, 0x4C, 0x00, 0x00, 0xD6, 0x58, 0xD6, 0x4A, /* 0xF4-0xF7 */
+	0xD6, 0x57, 0xB2, 0x69, 0xD6, 0x48, 0xDA, 0x5B, /* 0xF8-0xFB */
+	0xD6, 0x52, 0xB2, 0x6C, 0x00, 0x00, 0xD6, 0x53, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_71[512] = {
+	0xD6, 0x56, 0x00, 0x00, 0xD6, 0x5A, 0x00, 0x00, /* 0x00-0x03 */
+	0xD6, 0x4F, 0x00, 0x00, 0xD6, 0x54, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xB2, 0x6A, 0xB2, 0x6B, 0xD6, 0x59, /* 0x08-0x0B */
+	0xD6, 0x4D, 0xD6, 0x49, 0xD6, 0x5B, 0x00, 0x00, /* 0x0C-0x0F */
+	0xD6, 0x51, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x55, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x4B, /* 0x14-0x17 */
+	0x00, 0x00, 0xB5, 0x48, 0xB5, 0x49, 0xDA, 0x65, /* 0x18-0x1B */
+	0xB5, 0x4F, 0x00, 0x00, 0xDA, 0x59, 0xDA, 0x62, /* 0x1C-0x1F */
+	0xDA, 0x58, 0xB5, 0x4C, 0xDA, 0x60, 0xDA, 0x5E, /* 0x20-0x23 */
+	0x00, 0x00, 0xDA, 0x5F, 0xB5, 0x4A, 0x00, 0x00, /* 0x24-0x27 */
+	0xDA, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0x5C, 0xDA, 0x5A, /* 0x2C-0x2F */
+	0xB5, 0x4B, 0xDA, 0x5D, 0xDA, 0x61, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xB5, 0x4D, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0x64, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xDE, 0x70, 0xDE, 0x77, 0xDE, 0x79, /* 0x40-0x43 */
+	0xDE, 0xA1, 0x00, 0x00, 0xB7, 0xDA, 0xDE, 0x6B, /* 0x44-0x47 */
+	0x00, 0x00, 0xB7, 0xD2, 0x00, 0x00, 0xDE, 0x7A, /* 0x48-0x4B */
+	0xB7, 0xD7, 0xDE, 0xA2, 0xB7, 0xCE, 0x00, 0x00, /* 0x4C-0x4F */
+	0xDE, 0x7D, 0x00, 0x00, 0xDE, 0x6D, 0xDE, 0x7E, /* 0x50-0x53 */
+	0xDE, 0x6C, 0x00, 0x00, 0xB7, 0xDC, 0x00, 0x00, /* 0x54-0x57 */
+	0xDE, 0x78, 0xB7, 0xCF, 0xDE, 0xA3, 0x00, 0x00, /* 0x58-0x5B */
+	0xB7, 0xD4, 0xDE, 0x71, 0xB7, 0xD9, 0xDE, 0x7C, /* 0x5C-0x5F */
+	0xDE, 0x6F, 0xDE, 0x76, 0xDE, 0x72, 0xDE, 0x6E, /* 0x60-0x63 */
+	0xB7, 0xD1, 0xB7, 0xD8, 0xB7, 0xD6, 0xB7, 0xD3, /* 0x64-0x67 */
+	0xB7, 0xDB, 0xB7, 0xD0, 0xDE, 0x75, 0x00, 0x00, /* 0x68-0x6B */
+	0xB7, 0xD5, 0x00, 0x00, 0xB5, 0x4E, 0x00, 0x00, /* 0x6C-0x6F */
+	0xDE, 0x7B, 0x00, 0x00, 0xDE, 0x73, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xDE, 0x74, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC1, /* 0x78-0x7B */
+	0x00, 0x00, 0xBA, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xE2, 0xBD, 0xE2, 0xC3, 0xE2, 0xBF, 0x00, 0x00, /* 0x80-0x83 */
+	0xBA, 0xB6, 0xE2, 0xBE, 0xE2, 0xC2, 0xE2, 0xBA, /* 0x84-0x87 */
+	0x00, 0x00, 0xE2, 0xBC, 0xBA, 0xB5, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC0, /* 0x8C-0x8F */
+	0xE2, 0xBB, 0x00, 0x00, 0xBA, 0xB7, 0x00, 0x00, /* 0x90-0x93 */
+	0xBA, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC4, /* 0x94-0x97 */
+	0x00, 0x00, 0xBA, 0xB3, 0xE6, 0x67, 0xE6, 0x64, /* 0x98-0x9B */
+	0xE6, 0x70, 0xE6, 0x6A, 0xE6, 0x6C, 0xBC, 0xF4, /* 0x9C-0x9F */
+	0xE6, 0x66, 0xE6, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE6, 0x6D, 0xE6, 0x6B, 0x00, 0x00, 0xE6, 0x71, /* 0xA4-0xA7 */
+	0xBC, 0xF7, 0xE6, 0x68, 0xE6, 0x6F, 0x00, 0x00, /* 0xA8-0xAB */
+	0xBC, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x63, /* 0xAC-0xAF */
+	0xE6, 0x65, 0xBC, 0xF6, 0xE6, 0x62, 0xE6, 0x72, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xE6, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xEA, 0x4A, 0xBF, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0xEA, 0x55, 0xEA, 0x53, 0xBF, 0x4B, 0xEA, 0x49, /* 0xBC-0xBF */
+	0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x48, 0xBF, 0x55, /* 0xC0-0xC3 */
+	0xBF, 0x56, 0xEA, 0x47, 0xEA, 0x56, 0xEA, 0x51, /* 0xC4-0xC7 */
+	0xBF, 0x4F, 0xBF, 0x4C, 0xEA, 0x50, 0xEA, 0x4E, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xBF, 0x52, 0xEA, 0x52, /* 0xCC-0xCF */
+	0xBF, 0x4D, 0x00, 0x00, 0xBF, 0x4E, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xEA, 0x4F, 0xBF, 0x50, 0xEA, 0x4B, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xEA, 0x54, 0xBF, 0x53, 0xEA, 0x57, 0xEA, 0x58, /* 0xD8-0xDB */
+	0xBF, 0x54, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xE7, /* 0xDC-0xDF */
+	0xC0, 0xEE, 0xED, 0x5C, 0xED, 0x62, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xED, 0x60, 0xC0, 0xEA, 0xC0, 0xE9, 0xC0, 0xE6, /* 0xE4-0xE7 */
+	0xED, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xC0, 0xEC, 0xC0, 0xEB, 0xC0, 0xE8, 0x00, 0x00, /* 0xEC-0xEF */
+	0xED, 0x61, 0xED, 0x5D, 0xED, 0x5F, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xC0, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xC2, 0x77, 0xEF, 0xFB, 0x00, 0x00, 0xC2, 0x74, /* 0xF8-0xFB */
+	0xC2, 0x75, 0xEF, 0xFD, 0xC2, 0x76, 0xEF, 0xFA, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_72[512] = {
+	0x00, 0x00, 0xEF, 0xF9, 0xF2, 0x6C, 0xEF, 0xFC, /* 0x00-0x03 */
+	0x00, 0x00, 0xF2, 0x6D, 0xC3, 0x7A, 0xF2, 0x6B, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0x6A, 0x00, 0x00, /* 0x08-0x0B */
+	0xF2, 0x69, 0xC3, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xC4, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x6A, /* 0x10-0x13 */
+	0xF4, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xF5, 0xDC, 0xF5, 0xDB, 0xC4, 0xEA, /* 0x18-0x1B */
+	0x00, 0x00, 0xF5, 0xDA, 0xF6, 0xEC, 0xF6, 0xED, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xF7, 0xE6, 0xF8, 0xB1, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xF6, 0xF9, 0xBC, /* 0x24-0x27 */
+	0xC6, 0x79, 0xF9, 0xC6, 0xA4, 0xF6, 0x00, 0x00, /* 0x28-0x2B */
+	0xAA, 0xA6, 0xAA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xAC, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xC0, 0xEF, 0xA4, 0xF7, 0x00, 0x00, /* 0x34-0x37 */
+	0xAA, 0xA8, 0xAF, 0x52, 0xB7, 0xDD, 0xA4, 0xF8, /* 0x38-0x3B */
+	0x00, 0x00, 0xB2, 0x6E, 0xBA, 0xB8, 0xC9, 0x62, /* 0x3C-0x3F */
+	0x00, 0x00, 0xCF, 0xB7, 0xD2, 0x7D, 0x00, 0x00, /* 0x40-0x43 */
+	0xE2, 0xC5, 0x00, 0x00, 0xC0, 0xF0, 0xA4, 0xF9, /* 0x44-0x47 */
+	0xAA, 0xA9, 0xCF, 0xB8, 0xCF, 0xB9, 0xDA, 0x66, /* 0x48-0x4B */
+	0xB5, 0x50, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA4, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0xDE, 0xE2, 0xC6, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xBC, 0xF8, 0x00, 0x00, /* 0x54-0x57 */
+	0xC3, 0x7C, 0xA4, 0xFA, 0xDA, 0x67, 0xA4, 0xFB, /* 0x58-0x5B */
+	0x00, 0x00, 0xA6, 0xC9, 0xCA, 0x42, 0xA6, 0xC8, /* 0x5C-0x5F */
+	0xA8, 0x65, 0xA8, 0x64, 0xA8, 0x63, 0xCB, 0x60, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xAA, /* 0x64-0x67 */
+	0x00, 0x00, 0xAA, 0xAB, 0xCD, 0x5B, 0x00, 0x00, /* 0x68-0x6B */
+	0xCF, 0xBA, 0x00, 0x00, 0xCF, 0xBD, 0xAC, 0xBA, /* 0x6C-0x6F */
+	0xCF, 0xBB, 0x00, 0x00, 0xAC, 0xB9, 0xCF, 0xBC, /* 0x70-0x73 */
+	0xAC, 0xBB, 0x00, 0x00, 0xD2, 0xA2, 0xD2, 0xA1, /* 0x74-0x77 */
+	0xD2, 0x7E, 0xAF, 0x53, 0x00, 0x00, 0xD6, 0x5D, /* 0x78-0x7B */
+	0xD6, 0x5E, 0xB2, 0x6F, 0xD6, 0x5C, 0xD6, 0x5F, /* 0x7C-0x7F */
+	
+	0xB5, 0x52, 0xB2, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xB5, 0x51, 0xDA, 0x6B, 0xDA, 0x6A, 0x00, 0x00, /* 0x84-0x87 */
+	0xDA, 0x68, 0xDA, 0x69, 0x00, 0x00, 0xDA, 0x6C, /* 0x88-0x8B */
+	0xDE, 0xA6, 0xDE, 0xA5, 0xDE, 0xA9, 0x00, 0x00, /* 0x8C-0x8F */
+	0xDE, 0xA8, 0xDE, 0xA7, 0xBA, 0xB9, 0xE2, 0xC9, /* 0x90-0x93 */
+	0x00, 0x00, 0xE2, 0xC8, 0xBA, 0xBA, 0xE2, 0xC7, /* 0x94-0x97 */
+	0xE6, 0x73, 0x00, 0x00, 0xE6, 0x74, 0xBC, 0xF9, /* 0x98-0x9B */
+	0x00, 0x00, 0xEA, 0x59, 0xEA, 0x5A, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xF2, 0x72, 0xC3, 0x7D, 0xF2, 0x71, /* 0xA0-0xA3 */
+	0xF2, 0x70, 0xF2, 0x6E, 0xF2, 0x6F, 0xC4, 0xEB, /* 0xA4-0xA7 */
+	0xF4, 0x6C, 0xF6, 0xEE, 0xF8, 0xF7, 0x00, 0x00, /* 0xA8-0xAB */
+	0xA4, 0xFC, 0x00, 0x00, 0xC9, 0xA5, 0xA5, 0xC7, /* 0xAC-0xAF */
+	0xC9, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xCA, 0x43, 0xCA, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0x66, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xCB, 0x62, 0x00, 0x00, 0xCB, 0x61, /* 0xBC-0xBF */
+	0xAA, 0xAC, 0xCB, 0x65, 0xA8, 0x67, 0xCB, 0x63, /* 0xC0-0xC3 */
+	0xA8, 0x66, 0xCB, 0x67, 0xCB, 0x64, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xCD, 0x5F, 0xCF, 0xBE, 0xCD, 0x5D, /* 0xC8-0xCB */
+	0xCD, 0x64, 0x00, 0x00, 0xAA, 0xAD, 0x00, 0x00, /* 0xCC-0xCF */
+	0xAA, 0xB0, 0xCD, 0x65, 0xCD, 0x61, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xCD, 0x62, 0x00, 0x00, 0xCD, 0x5C, 0xAA, 0xAF, /* 0xD4-0xD7 */
+	0xCD, 0x5E, 0xAA, 0xAE, 0xCD, 0x63, 0x00, 0x00, /* 0xD8-0xDB */
+	0xCD, 0x60, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xC2, /* 0xDC-0xDF */
+	0xAC, 0xBD, 0xAC, 0xBE, 0x00, 0x00, 0xCF, 0xC5, /* 0xE0-0xE3 */
+	0xCF, 0xBF, 0x00, 0x00, 0xCF, 0xC4, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xCF, 0xC0, 0xAC, 0xBC, 0xCF, 0xC3, 0xCF, 0xC1, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xA8, /* 0xF0-0xF3 */
+	0xD2, 0xA5, 0x00, 0x00, 0xD2, 0xA7, 0xAF, 0x58, /* 0xF4-0xF7 */
+	0xAF, 0x57, 0xAF, 0x55, 0xD2, 0xA4, 0xD2, 0xA9, /* 0xF8-0xFB */
+	0xAF, 0x54, 0xAF, 0x56, 0xD2, 0xA6, 0xD6, 0x67, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_73[512] = {
+	0xD2, 0xA3, 0xD2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x62, /* 0x04-0x07 */
+	0xD6, 0x66, 0x00, 0x00, 0xD6, 0x65, 0xDA, 0x6E, /* 0x08-0x0B */
+	0xDA, 0x79, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x68, /* 0x0C-0x0F */
+	0x00, 0x00, 0xD6, 0x63, 0xDA, 0x6D, 0xB2, 0x74, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0x73, 0xD6, 0x61, /* 0x14-0x17 */
+	0xD6, 0x64, 0xB2, 0x75, 0x00, 0x00, 0xB2, 0x72, /* 0x18-0x1B */
+	0xB2, 0x71, 0xD6, 0x60, 0xD6, 0x69, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0x70, 0xDA, 0x77, /* 0x20-0x23 */
+	0x00, 0x00, 0xB5, 0x54, 0xDA, 0x76, 0xDA, 0x73, /* 0x24-0x27 */
+	0x00, 0x00, 0xB5, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xDA, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xDA, 0x6F, 0xDA, 0x71, 0xDA, 0x74, 0xDA, 0x72, /* 0x30-0x33 */
+	0xB5, 0x55, 0xDA, 0x78, 0xB5, 0x53, 0xB7, 0xDF, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xDE, 0xAC, /* 0x38-0x3B */
+	0xDE, 0xAA, 0x00, 0x00, 0xB7, 0xE2, 0xB7, 0xE1, /* 0x3C-0x3F */
+	0xDE, 0xAE, 0x00, 0x00, 0xDE, 0xAB, 0xE2, 0xCA, /* 0x40-0x43 */
+	0xBA, 0xBB, 0xB7, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xDE, 0xB0, 0xDE, 0xAF, 0x00, 0x00, /* 0x48-0x4B */
+	0xE2, 0xCD, 0xE2, 0xCB, 0xBC, 0xFA, 0x00, 0x00, /* 0x4C-0x4F */
+	0xBA, 0xBC, 0xE2, 0xCC, 0xE6, 0x76, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0xFB, /* 0x54-0x57 */
+	0xE6, 0x75, 0xE6, 0x7E, 0xE6, 0x7D, 0xE6, 0x7B, /* 0x58-0x5B */
+	0x00, 0x00, 0xE6, 0x7A, 0xE6, 0x77, 0xE6, 0x78, /* 0x5C-0x5F */
+	0xE6, 0x79, 0xE6, 0x7C, 0xE6, 0xA1, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0xEA, 0x5F, 0xEA, 0x5C, 0xEA, 0x5D, /* 0x64-0x67 */
+	0xBF, 0x57, 0xEA, 0x5B, 0xEA, 0x61, 0xEA, 0x60, /* 0x68-0x6B */
+	0xEA, 0x5E, 0x00, 0x00, 0xED, 0x64, 0xED, 0x65, /* 0x6C-0x6F */
+	0xC0, 0xF1, 0x00, 0x00, 0xC0, 0xF2, 0xED, 0x63, /* 0x70-0x73 */
+	0x00, 0x00, 0xC2, 0x79, 0xEF, 0xFE, 0xC2, 0x78, /* 0x74-0x77 */
+	0xC3, 0x7E, 0x00, 0x00, 0xC3, 0xA1, 0xC4, 0x6D, /* 0x78-0x7B */
+	0xF4, 0x6E, 0xF4, 0x6D, 0xF5, 0xDD, 0xF6, 0xEF, /* 0x7C-0x7F */
+	
+	0xC5, 0x7A, 0xF7, 0xE8, 0xF7, 0xE7, 0xF7, 0xE9, /* 0x80-0x83 */
+	0xA5, 0xC8, 0xCF, 0xC6, 0xAF, 0x59, 0xB2, 0x76, /* 0x84-0x87 */
+	0xD6, 0x6A, 0xA5, 0xC9, 0xC9, 0xA7, 0xA4, 0xFD, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xCA, 0x45, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0x6C, 0xCB, 0x6A, /* 0x90-0x93 */
+	0xCB, 0x6B, 0xCB, 0x68, 0xA8, 0x68, 0xCB, 0x69, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xCD, 0x6D, 0x00, 0x00, 0xAA, 0xB3, /* 0x9C-0x9F */
+	0xCD, 0x6B, 0xCD, 0x67, 0xCD, 0x6A, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xCD, 0x66, 0xAA, 0xB5, 0xCD, 0x69, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xAA, 0xB2, 0xAA, 0xB1, 0x00, 0x00, 0xAA, 0xB4, /* 0xA8-0xAB */
+	0xCD, 0x6C, 0xCD, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xAC, 0xC2, 0xAC, 0xC5, /* 0xB0-0xB3 */
+	0xCF, 0xCE, 0xCF, 0xCD, 0xCF, 0xCC, 0xAC, 0xBF, /* 0xB4-0xB7 */
+	0xCF, 0xD5, 0xCF, 0xCB, 0x00, 0x00, 0xAC, 0xC1, /* 0xB8-0xBB */
+	0xD2, 0xAF, 0x00, 0x00, 0xCF, 0xD2, 0xCF, 0xD0, /* 0xBC-0xBF */
+	0xAC, 0xC4, 0x00, 0x00, 0xCF, 0xC8, 0xCF, 0xD3, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xCF, 0xCA, 0xCF, 0xD4, 0xCF, 0xD1, /* 0xC4-0xC7 */
+	0xCF, 0xC9, 0x00, 0x00, 0xAC, 0xC0, 0xCF, 0xD6, /* 0xC8-0xCB */
+	0xCF, 0xC7, 0xAC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xD2, 0xB4, 0xD2, 0xAB, /* 0xD0-0xD3 */
+	0xD2, 0xB6, 0x00, 0x00, 0xD2, 0xAE, 0xD2, 0xB9, /* 0xD4-0xD7 */
+	0xD2, 0xBA, 0xD2, 0xAC, 0xD2, 0xB8, 0xD2, 0xB5, /* 0xD8-0xDB */
+	0xD2, 0xB3, 0xD2, 0xB7, 0xAF, 0x5F, 0x00, 0x00, /* 0xDC-0xDF */
+	0xAF, 0x5D, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB1, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xD2, 0xAD, 0x00, 0x00, 0xD2, 0xB0, /* 0xE4-0xE7 */
+	0xD2, 0xBB, 0xD2, 0xB2, 0xAF, 0x5E, 0xCF, 0xCF, /* 0xE8-0xEB */
+	0x00, 0x00, 0xAF, 0x5A, 0xAF, 0x5C, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xD6, 0x78, 0xD6, 0x6D, 0xD6, 0x6B, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xD6, 0x6C, 0x00, 0x00, 0xD6, 0x73, 0x00, 0x00, /* 0xF8-0xFB */
+	0xD6, 0x74, 0xD6, 0x70, 0xB2, 0x7B, 0xD6, 0x75, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_74[512] = {
+	0xD6, 0x72, 0xD6, 0x6F, 0x00, 0x00, 0xB2, 0x79, /* 0x00-0x03 */
+	0xD6, 0x6E, 0xB2, 0x77, 0xB2, 0x7A, 0xD6, 0x71, /* 0x04-0x07 */
+	0xD6, 0x79, 0xAF, 0x5B, 0xB2, 0x78, 0xD6, 0x77, /* 0x08-0x0B */
+	0xD6, 0x76, 0xB2, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0x7E, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xA1, 0xB5, 0x60, /* 0x18-0x1B */
+	0x00, 0x00, 0xDA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xDA, 0xA9, 0xDA, 0xA2, 0xB5, 0x5A, 0xDA, 0xA6, /* 0x20-0x23 */
+	0xDA, 0xA5, 0xB5, 0x5B, 0xB5, 0x61, 0x00, 0x00, /* 0x24-0x27 */
+	0xB5, 0x62, 0xDA, 0xA8, 0xB5, 0x58, 0xDA, 0x7D, /* 0x28-0x2B */
+	0xDA, 0x7B, 0xDA, 0xA3, 0xDA, 0x7A, 0xB5, 0x5F, /* 0x2C-0x2F */
+	0xDA, 0x7C, 0xDA, 0xA4, 0xDA, 0xAA, 0xB5, 0x59, /* 0x30-0x33 */
+	0xB5, 0x5E, 0xB5, 0x5C, 0xB5, 0x5D, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xB5, 0x57, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB7, 0xE9, /* 0x3C-0x3F */
+	0xDE, 0xB7, 0xB7, 0xE8, 0xDE, 0xBB, 0x00, 0x00, /* 0x40-0x43 */
+	0xDE, 0xB1, 0x00, 0x00, 0xDE, 0xBC, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xB2, 0xDE, 0xB3, /* 0x48-0x4B */
+	0x00, 0x00, 0xDE, 0xBD, 0xDE, 0xBA, 0xDE, 0xB8, /* 0x4C-0x4F */
+	0xDE, 0xB9, 0xDE, 0xB5, 0xDE, 0xB4, 0x00, 0x00, /* 0x50-0x53 */
+	0xDE, 0xBE, 0xB7, 0xE5, 0x00, 0x00, 0xDE, 0xB6, /* 0x54-0x57 */
+	0x00, 0x00, 0xB7, 0xEA, 0xB7, 0xE4, 0xB7, 0xEB, /* 0x58-0x5B */
+	0xB7, 0xEC, 0x00, 0x00, 0xB7, 0xE7, 0xB7, 0xE6, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xCE, 0xBA, 0xBE, /* 0x60-0x63 */
+	0xBA, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD3, /* 0x64-0x67 */
+	0x00, 0x00, 0xBC, 0xFC, 0xBA, 0xBF, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xBA, 0xC1, 0xE2, 0xD4, 0xB7, 0xE3, /* 0x6C-0x6F */
+	0xBA, 0xC0, 0xE2, 0xD0, 0xE2, 0xD2, 0xE2, 0xCF, /* 0x70-0x73 */
+	0x00, 0x00, 0xE2, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xE6, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xE6, 0xAA, 0xE6, 0xA7, 0xBD, 0x40, 0xEA, 0x62, /* 0x7C-0x7F */
+	
+	0xBD, 0x41, 0xE6, 0xA6, 0x00, 0x00, 0xBC, 0xFE, /* 0x80-0x83 */
+	0x00, 0x00, 0xE6, 0xA8, 0xE6, 0xA5, 0xE6, 0xA2, /* 0x84-0x87 */
+	0xE6, 0xA9, 0xE6, 0xA3, 0xE6, 0xA4, 0xBC, 0xFD, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xED, 0x69, 0x00, 0x00, 0xEA, 0x66, 0x00, 0x00, /* 0x90-0x93 */
+	0xEA, 0x65, 0xEA, 0x67, 0x00, 0x00, 0xED, 0x66, /* 0x94-0x97 */
+	0xBF, 0x5A, 0x00, 0x00, 0xEA, 0x63, 0x00, 0x00, /* 0x98-0x9B */
+	0xBF, 0x58, 0x00, 0x00, 0xBF, 0x5C, 0xBF, 0x5B, /* 0x9C-0x9F */
+	0xEA, 0x64, 0xEA, 0x68, 0x00, 0x00, 0xBF, 0x59, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xED, 0x6D, 0xC0, 0xF5, 0xC2, 0x7A, /* 0xA4-0xA7 */
+	0xC0, 0xF6, 0xC0, 0xF3, 0xED, 0x6A, 0xED, 0x68, /* 0xA8-0xAB */
+	0x00, 0x00, 0xED, 0x6B, 0x00, 0x00, 0xED, 0x6E, /* 0xAC-0xAF */
+	0xC0, 0xF4, 0xED, 0x6C, 0xED, 0x67, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xF0, 0x42, 0xF0, 0x45, 0xF2, 0x75, /* 0xB4-0xB7 */
+	0xF0, 0x40, 0x00, 0x00, 0xF4, 0x6F, 0xF0, 0x46, /* 0xB8-0xBB */
+	0x00, 0x00, 0xC3, 0xA2, 0xF0, 0x44, 0xC2, 0x7B, /* 0xBC-0xBF */
+	0xF0, 0x41, 0xF0, 0x43, 0xF0, 0x47, 0xF2, 0x76, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xF2, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xC3, 0xA3, 0xF2, 0x73, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x6E, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xC4, 0xED, 0xF6, 0xF1, 0xC4, 0xEC, 0xF6, 0xF3, /* 0xD4-0xD7 */
+	0xF6, 0xF0, 0xF6, 0xF2, 0xC5, 0xD0, 0xF8, 0xB2, /* 0xD8-0xDB */
+	0xA5, 0xCA, 0xCD, 0x6E, 0xD2, 0xBC, 0xD2, 0xBD, /* 0xDC-0xDF */
+	0xB2, 0x7D, 0xDE, 0xBF, 0xBF, 0x5D, 0xC3, 0xA4, /* 0xE0-0xE3 */
+	0xC5, 0x7B, 0xF8, 0xB3, 0xA5, 0xCB, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xCD, 0x6F, 0xA2, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xCF, 0xD7, 0x00, 0x00, 0xCF, 0xD8, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xD2, 0xBE, 0xD2, 0xBF, 0xB2, 0x7E, 0xB2, 0xA1, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAB, /* 0xF8-0xFB */
+	0x00, 0x00, 0xDE, 0xC2, 0xDE, 0xC1, 0xDE, 0xC0, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_75[512] = {
+	0xE2, 0xD5, 0x00, 0x00, 0xE2, 0xD6, 0xE2, 0xD7, /* 0x00-0x03 */
+	0xBA, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAD, /* 0x04-0x07 */
+	0xE6, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x69, /* 0x08-0x0B */
+	0xBF, 0x5E, 0xBF, 0x5F, 0x00, 0x00, 0xED, 0x72, /* 0x0C-0x0F */
+	0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xF0, 0x49, /* 0x10-0x13 */
+	0xF0, 0x48, 0xC2, 0x7C, 0xF2, 0x77, 0xF5, 0xDE, /* 0x14-0x17 */
+	0xA5, 0xCC, 0x00, 0x00, 0xAC, 0xC6, 0x00, 0x00, /* 0x18-0x1B */
+	0xB2, 0xA2, 0xDE, 0xC3, 0x00, 0x00, 0xA5, 0xCD, /* 0x1C-0x1F */
+	0x00, 0x00, 0xD2, 0xC0, 0xB2, 0xA3, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xB5, 0x63, 0xB5, 0x64, 0x00, 0x00, /* 0x24-0x27 */
+	0xA5, 0xCE, 0xA5, 0xCF, 0xCA, 0x46, 0xA8, 0x6A, /* 0x28-0x2B */
+	0xA8, 0x69, 0xAC, 0xC7, 0xCF, 0xD9, 0xDA, 0xAC, /* 0x2C-0x2F */
+	0xA5, 0xD0, 0xA5, 0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x6B, /* 0x34-0x37 */
+	0xA8, 0x6C, 0xCB, 0x6E, 0xCB, 0x6D, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xAA, 0xB6, 0xCD, 0x72, 0xCD, 0x70, /* 0x3C-0x3F */
+	0xCD, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDA, /* 0x44-0x47 */
+	0xCF, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xAC, 0xCB, /* 0x48-0x4B */
+	0xAC, 0xC9, 0x00, 0x00, 0xAC, 0xCA, 0xAC, 0xC8, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xAF, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xAF, 0x64, 0xAF, 0x63, 0xD2, 0xC1, /* 0x58-0x5B */
+	0xAF, 0x62, 0xAF, 0x61, 0x00, 0x00, 0xD2, 0xC2, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0xA6, 0xD6, 0x7B, /* 0x60-0x63 */
+	0xD6, 0x7A, 0xB2, 0xA4, 0xB2, 0xA5, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xB5, 0x66, 0xB5, 0x65, /* 0x68-0x6B */
+	0xDA, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAD, /* 0x6C-0x6F */
+	0xB2, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xB7, 0xED, 0xDE, 0xC5, /* 0x74-0x77 */
+	0xB7, 0xEE, 0xDE, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xE2, 0xD8, 0xE6, 0xAE, 0xBD, 0x42, /* 0x7C-0x7F */
+	
+	0xEA, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xED, 0x73, 0x00, 0x00, 0xC3, 0xA6, 0xC3, 0xA5, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xC5, 0x7C, 0xA5, 0xD4, /* 0x88-0x8B */
+	0xCD, 0x73, 0x00, 0x00, 0x00, 0x00, 0xB2, 0xA8, /* 0x8C-0x8F */
+	0xE2, 0xD9, 0xBA, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xCB, 0x6F, 0xCB, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xCD, 0x74, 0xAA, 0xB8, 0xAA, 0xB9, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xAA, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xAC, 0xCF, 0xAC, 0xD0, /* 0xA0-0xA3 */
+	0xAC, 0xCD, 0xAC, 0xCE, 0x00, 0x00, 0xCF, 0xDC, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xCF, 0xDD, 0xAC, 0xCC, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xD2, 0xC3, 0x00, 0x00, 0xAF, 0x68, 0xAF, 0x69, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xB2, 0xAB, 0xD2, 0xC9, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xAF, 0x6E, 0xAF, 0x6C, 0xD2, 0xCA, 0xD2, 0xC5, /* 0xB8-0xBB */
+	0xAF, 0x6B, 0xAF, 0x6A, 0xAF, 0x65, 0xD2, 0xC8, /* 0xBC-0xBF */
+	0xD2, 0xC7, 0xD2, 0xC4, 0xAF, 0x6D, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xD2, 0xC6, 0xAF, 0x66, 0x00, 0x00, 0xAF, 0x67, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0xAC, 0xD6, 0xA1, /* 0xC8-0xCB */
+	0xD6, 0xA2, 0xB2, 0xAD, 0xD6, 0x7C, 0xD6, 0x7E, /* 0xCC-0xCF */
+	0xD6, 0xA4, 0xD6, 0xA3, 0xD6, 0x7D, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xB2, 0xA9, 0xB2, 0xAA, 0x00, 0x00, 0xDA, 0xB6, /* 0xD4-0xD7 */
+	0xB5, 0x6B, 0xB5, 0x6A, 0xDA, 0xB0, 0xB5, 0x68, /* 0xD8-0xDB */
+	0x00, 0x00, 0xDA, 0xB3, 0xB5, 0x6C, 0xDA, 0xB4, /* 0xDC-0xDF */
+	0xB5, 0x6D, 0xDA, 0xB1, 0xB5, 0x67, 0xB5, 0x69, /* 0xE0-0xE3 */
+	0xDA, 0xB5, 0x00, 0x00, 0xDA, 0xB2, 0xDA, 0xAF, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xDE, 0xD2, 0x00, 0x00, 0xDE, 0xC7, /* 0xEC-0xEF */
+	0xB7, 0xF0, 0xB7, 0xF3, 0xB7, 0xF2, 0xB7, 0xF7, /* 0xF0-0xF3 */
+	0xB7, 0xF6, 0xDE, 0xD3, 0xDE, 0xD1, 0xDE, 0xCA, /* 0xF4-0xF7 */
+	0xDE, 0xCE, 0xDE, 0xCD, 0xB7, 0xF4, 0xDE, 0xD0, /* 0xF8-0xFB */
+	0xDE, 0xCC, 0xDE, 0xD4, 0xDE, 0xCB, 0xB7, 0xF5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_76[512] = {
+	0xB7, 0xEF, 0xB7, 0xF1, 0x00, 0x00, 0xDE, 0xC9, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xE2, 0xDB, 0xBA, 0xC7, 0xE2, 0xDF, 0xBA, 0xC6, /* 0x08-0x0B */
+	0xE2, 0xDC, 0xBA, 0xC5, 0x00, 0x00, 0xDE, 0xC8, /* 0x0C-0x0F */
+	0xDE, 0xCF, 0xE2, 0xDE, 0x00, 0x00, 0xBA, 0xC8, /* 0x10-0x13 */
+	0xE2, 0xE0, 0xE2, 0xDD, 0xE2, 0xDA, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xE6, 0xB1, 0xE6, 0xB5, 0xE6, 0xB7, /* 0x18-0x1B */
+	0xE6, 0xB3, 0xE6, 0xB2, 0xE6, 0xB0, 0xBD, 0x45, /* 0x1C-0x1F */
+	0xBD, 0x43, 0xBD, 0x48, 0xBD, 0x49, 0xE6, 0xB4, /* 0x20-0x23 */
+	0xBD, 0x46, 0xE6, 0xAF, 0xBD, 0x47, 0xBA, 0xC4, /* 0x24-0x27 */
+	0xE6, 0xB6, 0xBD, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xEA, 0x6C, 0x00, 0x00, 0xEA, 0x6B, /* 0x2C-0x2F */
+	0xEA, 0x73, 0xEA, 0x6D, 0xEA, 0x72, 0xEA, 0x6F, /* 0x30-0x33 */
+	0xBF, 0x60, 0xEA, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xBF, 0x61, 0x00, 0x00, 0xBF, 0x62, 0x00, 0x00, /* 0x38-0x3B */
+	0xEA, 0x70, 0xEA, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0xF8, 0xED, 0x74, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0xF7, 0xED, 0x77, /* 0x44-0x47 */
+	0xED, 0x75, 0xED, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xC0, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF0, 0x4D, 0x00, 0x00, 0xC2, 0xA1, 0xF0, 0x4E, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xC2, 0x7D, 0xF0, 0x4F, /* 0x54-0x57 */
+	0xC2, 0x7E, 0xF0, 0x4C, 0xF0, 0x50, 0x00, 0x00, /* 0x58-0x5B */
+	0xF0, 0x4A, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA7, /* 0x5C-0x5F */
+	0xF2, 0x78, 0xC3, 0xA8, 0xC4, 0x6F, 0x00, 0x00, /* 0x60-0x63 */
+	0xF0, 0x4B, 0xC4, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xC4, 0xEE, 0xF5, 0xDF, 0x00, 0x00, /* 0x68-0x6B */
+	0xC5, 0x7E, 0xF6, 0xF4, 0xC5, 0x7D, 0x00, 0x00, /* 0x6C-0x6F */
+	0xF7, 0xEA, 0xC5, 0xF5, 0xC5, 0xF6, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xF9, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xAC, 0xD1, 0xCF, 0xDE, 0x00, 0x00, 0xB5, 0x6E, /* 0x78-0x7B */
+	0xB5, 0x6F, 0xA5, 0xD5, 0xA6, 0xCA, 0xCA, 0x47, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xCB, 0x71, 0xA8, 0x6D, 0x00, 0x00, /* 0x80-0x83 */
+	0xAA, 0xBA, 0x00, 0x00, 0xAC, 0xD2, 0xAC, 0xD3, /* 0x84-0x87 */
+	0xAC, 0xD4, 0xD6, 0xA6, 0xD2, 0xCB, 0xAF, 0x6F, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0xAE, 0xD6, 0xA5, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xB8, 0xB5, 0x71, /* 0x90-0x93 */
+	0x00, 0x00, 0xDA, 0xB7, 0xB5, 0x70, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xDE, 0xD5, 0xBD, 0x4A, 0xE6, 0xBB, /* 0x98-0x9B */
+	0xE6, 0xB8, 0xE6, 0xB9, 0xE6, 0xBA, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xED, 0x78, 0x00, 0x00, 0xF0, 0x51, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0x71, 0xF4, 0x70, /* 0xA8-0xAB */
+	0x00, 0x00, 0xF6, 0xF5, 0xA5, 0xD6, 0xCD, 0x75, /* 0xAC-0xAF */
+	0xAF, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xB5, 0x72, 0xDE, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE2, 0xE1, 0x00, 0x00, 0xBD, 0x4B, 0xEA, 0x74, /* 0xB8-0xBB */
+	0x00, 0x00, 0xF0, 0x52, 0xF4, 0x72, 0xA5, 0xD7, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xAA, 0xBB, 0xAC, 0xD7, /* 0xC0-0xC3 */
+	0xCF, 0xDF, 0xAC, 0xD8, 0xAC, 0xD6, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xAC, 0xD5, 0xD2, 0xCC, 0xAF, 0x71, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xAF, 0x72, 0xAF, 0x73, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0xB0, 0xD6, 0xA7, /* 0xD0-0xD3 */
+	0xB2, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xB9, 0xB2, 0xB1, /* 0xD8-0xDB */
+	0xB5, 0x73, 0xDE, 0xD7, 0xB7, 0xF8, 0xB7, 0xF9, /* 0xDC-0xDF */
+	0x00, 0x00, 0xBA, 0xC9, 0x00, 0x00, 0xBA, 0xCA, /* 0xE0-0xE3 */
+	0xBD, 0x4C, 0xBF, 0x64, 0xEA, 0x75, 0xBF, 0x63, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xED, 0x79, 0xC0, 0xFA, 0x00, 0x00, /* 0xE8-0xEB */
+	0xF0, 0x53, 0xF4, 0x73, 0xA5, 0xD8, 0xA8, 0x6E, /* 0xEC-0xEF */
+	0xCD, 0x78, 0xCD, 0x77, 0xAA, 0xBC, 0xCD, 0x76, /* 0xF0-0xF3 */
+	0xAA, 0xBD, 0xCD, 0x79, 0x00, 0x00, 0xCF, 0xE5, /* 0xF4-0xF7 */
+	0xAC, 0xDB, 0xAC, 0xDA, 0xCF, 0xE7, 0xCF, 0xE6, /* 0xF8-0xFB */
+	0xAC, 0xDF, 0x00, 0x00, 0xAC, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_77[512] = {
+	0x00, 0x00, 0xAC, 0xD9, 0x00, 0x00, 0xCF, 0xE1, /* 0x00-0x03 */
+	0xCF, 0xE2, 0xCF, 0xE3, 0x00, 0x00, 0xAC, 0xE0, /* 0x04-0x07 */
+	0xCF, 0xE0, 0xAC, 0xDC, 0xCF, 0xE4, 0xAC, 0xDD, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xD2, 0xCF, 0xD2, 0xD3, 0xD2, 0xD1, 0xD2, 0xD0, /* 0x10-0x13 */
+	0x00, 0x00, 0xD2, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xD2, 0xD5, 0xD2, 0xD6, 0xD2, 0xCE, /* 0x18-0x1B */
+	0x00, 0x00, 0xD2, 0xCD, 0x00, 0x00, 0xAF, 0x75, /* 0x1C-0x1F */
+	0xAF, 0x76, 0x00, 0x00, 0xD2, 0xD7, 0xD2, 0xD2, /* 0x20-0x23 */
+	0x00, 0x00, 0xD6, 0xB0, 0x00, 0x00, 0xD2, 0xD8, /* 0x24-0x27 */
+	0xAF, 0x77, 0xAF, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xD6, 0xAA, 0x00, 0x00, 0xD6, 0xA9, /* 0x2C-0x2F */
+	0x00, 0x00, 0xD6, 0xAB, 0xD6, 0xAC, 0xD6, 0xAE, /* 0x30-0x33 */
+	0xD6, 0xAD, 0xD6, 0xB2, 0xB2, 0xB5, 0xB2, 0xB2, /* 0x34-0x37 */
+	0xB2, 0xB6, 0xD6, 0xA8, 0xB2, 0xB7, 0xD6, 0xB1, /* 0x38-0x3B */
+	0xB2, 0xB4, 0xD6, 0xAF, 0xB2, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xDA, 0xBC, 0xDA, 0xBE, 0xDA, 0xBA, 0xDA, 0xBB, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xBF, 0xDA, 0xC1, /* 0x48-0x4B */
+	0xDA, 0xC2, 0xDA, 0xBD, 0xDA, 0xC0, 0xB5, 0x74, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xDE, 0xDB, 0x00, 0x00, /* 0x50-0x53 */
+	0xDE, 0xE0, 0xDE, 0xD8, 0xDE, 0xDC, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xDE, 0xE1, 0xDE, 0xDD, 0xB7, 0xFA, /* 0x58-0x5B */
+	0xB8, 0x43, 0x00, 0x00, 0xB7, 0xFD, 0xDE, 0xD9, /* 0x5C-0x5F */
+	0xDE, 0xDA, 0xBA, 0xCE, 0xB8, 0x46, 0xB7, 0xFE, /* 0x60-0x63 */
+	0x00, 0x00, 0xB8, 0x44, 0xB7, 0xFC, 0xDE, 0xDF, /* 0x64-0x67 */
+	0xB8, 0x45, 0xDE, 0xDE, 0xB8, 0x41, 0xB7, 0xFB, /* 0x68-0x6B */
+	0xB8, 0x42, 0xDE, 0xE2, 0xE2, 0xE6, 0xE2, 0xE8, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xB8, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xE2, 0xE3, 0xBA, 0xCC, 0xE2, 0xE9, 0xBA, 0xCD, /* 0x7C-0x7F */
+	
+	0xE2, 0xE7, 0xE2, 0xE2, 0xE2, 0xE5, 0xE2, 0xEA, /* 0x80-0x83 */
+	0xBA, 0xCB, 0xE2, 0xE4, 0x00, 0x00, 0xBD, 0x4E, /* 0x84-0x87 */
+	0xE6, 0xBF, 0xE6, 0xBE, 0x00, 0x00, 0xBD, 0x51, /* 0x88-0x8B */
+	0xBD, 0x4F, 0xE6, 0xBC, 0xBD, 0x4D, 0xE6, 0xBD, /* 0x8C-0x8F */
+	0x00, 0x00, 0xBD, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xEA, 0x7D, 0x00, 0x00, 0xEA, 0xA1, /* 0x94-0x97 */
+	0x00, 0x00, 0xEA, 0x7E, 0xEA, 0x76, 0xEA, 0x7A, /* 0x98-0x9B */
+	0xEA, 0x79, 0xEA, 0x77, 0xBF, 0x66, 0xBF, 0x67, /* 0x9C-0x9F */
+	0xBF, 0x65, 0xEA, 0x78, 0xEA, 0x7B, 0xEA, 0x7C, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xBF, 0x68, 0x00, 0x00, 0xC1, 0x40, /* 0xA4-0xA7 */
+	0xED, 0xA3, 0x00, 0x00, 0xC0, 0xFC, 0xED, 0x7B, /* 0xA8-0xAB */
+	0xC0, 0xFE, 0xC1, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xC0, 0xFD, 0xED, 0xA2, 0xED, 0x7C, 0xC0, 0xFB, /* 0xB0-0xB3 */
+	0xED, 0xA1, 0xED, 0x7A, 0xED, 0x7E, 0xED, 0x7D, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0x55, 0xC2, 0xA4, /* 0xB8-0xBB */
+	0xC2, 0xA5, 0xC2, 0xA2, 0x00, 0x00, 0xC2, 0xA3, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xF0, 0x54, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xF2, 0x7B, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA9, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xF2, 0x79, 0xF2, 0x7A, 0x00, 0x00, /* 0xC8-0xCB */
+	0xF4, 0x74, 0xF4, 0x77, 0xF4, 0x75, 0xF4, 0x76, /* 0xCC-0xCF */
+	0xF5, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xEF, /* 0xD0-0xD3 */
+	0xF7, 0xEB, 0xF8, 0xB4, 0x00, 0x00, 0xC5, 0xF7, /* 0xD4-0xD7 */
+	0xF8, 0xF8, 0xF8, 0xF9, 0xC6, 0x66, 0xA5, 0xD9, /* 0xD8-0xDB */
+	0xAC, 0xE1, 0x00, 0x00, 0xDA, 0xC3, 0x00, 0x00, /* 0xDC-0xDF */
+	0xDE, 0xE3, 0x00, 0x00, 0xA5, 0xDA, 0xA8, 0x6F, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xAA, 0xBE, 0x00, 0x00, 0xCF, 0xE8, /* 0xE4-0xE7 */
+	0xCF, 0xE9, 0xAF, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xDA, 0xC4, 0xB5, 0x75, 0xB8, 0x47, 0xC1, 0x42, /* 0xEC-0xEF */
+	0xED, 0xA4, 0xF2, 0x7C, 0xF4, 0x78, 0xA5, 0xDB, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA1, /* 0xF4-0xF7 */
+	0xCD, 0x7A, 0xCD, 0x7C, 0xCD, 0x7E, 0xCD, 0x7D, /* 0xF8-0xFB */
+	0xCD, 0x7B, 0xAA, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_78[512] = {
+	0x00, 0x00, 0x00, 0x00, 0xAC, 0xE2, 0xCF, 0xF2, /* 0x00-0x03 */
+	0x00, 0x00, 0xCF, 0xED, 0xCF, 0xEA, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xCF, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xAC, 0xE4, 0xAC, 0xE5, 0xCF, 0xF0, 0xCF, 0xEF, /* 0x0C-0x0F */
+	0xCF, 0xEE, 0xCF, 0xEB, 0xCF, 0xEC, 0xCF, 0xF3, /* 0x10-0x13 */
+	0xAC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0xAF, 0x7C, 0x00, 0x00, 0xAF, 0xA4, /* 0x1C-0x1F */
+	0xAF, 0xA3, 0xD2, 0xE1, 0xD2, 0xDB, 0xD2, 0xD9, /* 0x20-0x23 */
+	0x00, 0x00, 0xAF, 0xA1, 0xD6, 0xB9, 0xAF, 0x7A, /* 0x24-0x27 */
+	0xD2, 0xDE, 0xD2, 0xE2, 0xD2, 0xE4, 0xD2, 0xE0, /* 0x28-0x2B */
+	0xD2, 0xDA, 0xAF, 0xA2, 0xD2, 0xDF, 0xD2, 0xDD, /* 0x2C-0x2F */
+	0xAF, 0x79, 0xD2, 0xE5, 0xAF, 0xA5, 0xD2, 0xE3, /* 0x30-0x33 */
+	0xAF, 0x7D, 0xD2, 0xDC, 0x00, 0x00, 0xAF, 0x7E, /* 0x34-0x37 */
+	0xAF, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB2, 0xB9, /* 0x40-0x43 */
+	0x00, 0x00, 0xD6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xD6, 0xB3, 0xD6, 0xB5, 0xD6, 0xB7, 0x00, 0x00, /* 0x48-0x4B */
+	0xD6, 0xB8, 0xD6, 0xB6, 0xB2, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */
+	0xD6, 0xBB, 0x00, 0x00, 0xD6, 0xB4, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0xDA, 0xC8, 0xB5, 0x76, 0xDA, 0xD0, 0x00, 0x00, /* 0x5C-0x5F */
+	0xDA, 0xC5, 0x00, 0x00, 0xDA, 0xD1, 0x00, 0x00, /* 0x60-0x63 */
+	0xDA, 0xC6, 0xDA, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xDA, 0xCF, 0xDA, 0xCE, 0xDA, 0xCB, 0xB2, 0xB8, /* 0x68-0x6B */
+	0xB5, 0x77, 0xDA, 0xC9, 0xDA, 0xCC, 0xB5, 0x78, /* 0x6C-0x6F */
+	0xDA, 0xCD, 0xDA, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xDE, 0xEE, 0x00, 0x00, 0xDE, 0xF2, /* 0x78-0x7B */
+	0xB8, 0x4E, 0x00, 0x00, 0xE2, 0xF0, 0xB8, 0x51, /* 0x7C-0x7F */
+	
+	0xDE, 0xF0, 0xF9, 0xD6, 0x00, 0x00, 0xDE, 0xED, /* 0x80-0x83 */
+	0xDE, 0xE8, 0xDE, 0xEA, 0xDE, 0xEB, 0xDE, 0xE4, /* 0x84-0x87 */
+	0x00, 0x00, 0xB8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xB8, 0x4C, 0x00, 0x00, 0xB8, 0x48, 0xDE, 0xE7, /* 0x8C-0x8F */
+	0x00, 0x00, 0xB8, 0x4F, 0x00, 0x00, 0xB8, 0x50, /* 0x90-0x93 */
+	0xDE, 0xE6, 0xDE, 0xE9, 0xDE, 0xF1, 0xB8, 0x4A, /* 0x94-0x97 */
+	0xB8, 0x4B, 0xDE, 0xEF, 0xDE, 0xE5, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xE2, 0xF2, 0xBA, 0xD0, /* 0x9C-0x9F */
+	0xE2, 0xF4, 0xDE, 0xEC, 0xE2, 0xF6, 0xBA, 0xD4, /* 0xA0-0xA3 */
+	0xE2, 0xF7, 0xE2, 0xF3, 0x00, 0x00, 0xBA, 0xD1, /* 0xA4-0xA7 */
+	0xE2, 0xEF, 0xBA, 0xD3, 0xE2, 0xEC, 0xE2, 0xF1, /* 0xA8-0xAB */
+	0xE2, 0xF5, 0xE2, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xB8, 0x49, 0x00, 0x00, 0xE2, 0xEB, 0xBA, 0xD2, /* 0xB0-0xB3 */
+	0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xBD, 0x54, 0xE6, 0xC1, /* 0xB8-0xBB */
+	0xBD, 0x58, 0x00, 0x00, 0xBD, 0x56, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xBA, 0xCF, 0x00, 0x00, 0xE6, 0xC8, /* 0xC0-0xC3 */
+	0xE6, 0xC9, 0xBD, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE6, 0xC7, 0xE6, 0xCA, 0xBD, 0x55, 0xBD, 0x52, /* 0xC8-0xCB */
+	0xE6, 0xC3, 0xE6, 0xC0, 0xE6, 0xC5, 0xE6, 0xC2, /* 0xCC-0xCF */
+	0xBD, 0x59, 0xE6, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xE6, 0xC6, 0xBD, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xBF, 0x6A, 0xEA, 0xA8, /* 0xD8-0xDB */
+	0x00, 0x00, 0xEA, 0xA2, 0xEA, 0xA6, 0xEA, 0xAC, /* 0xDC-0xDF */
+	0xEA, 0xAD, 0xEA, 0xA9, 0xEA, 0xAA, 0xEA, 0xA7, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xEA, 0xA4, 0x00, 0x00, 0xBF, 0x6C, /* 0xE4-0xE7 */
+	0xBF, 0x69, 0xEA, 0xA3, 0xEA, 0xA5, 0x00, 0x00, /* 0xE8-0xEB */
+	0xBF, 0x6B, 0xEA, 0xAB, 0x00, 0x00, 0xC1, 0x46, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xAA, 0xED, 0xA5, /* 0xF0-0xF3 */
+	0xC1, 0x45, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x43, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xED, 0xAC, 0xC1, 0x44, 0xED, 0xA8, /* 0xF8-0xFB */
+	0xED, 0xA9, 0xED, 0xA6, 0xED, 0xAD, 0xF0, 0x56, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_79[512] = {
+	0x00, 0x00, 0xC1, 0x47, 0xED, 0xA7, 0x00, 0x00, /* 0x00-0x03 */
+	0xED, 0xAE, 0xED, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0xF0, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xF0, 0x57, 0x00, 0x00, 0xC2, 0xA6, 0x00, 0x00, /* 0x0C-0x0F */
+	0xF0, 0x5B, 0xF0, 0x5D, 0xF0, 0x5C, 0xF0, 0x58, /* 0x10-0x13 */
+	0xF0, 0x59, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA3, /* 0x14-0x17 */
+	0x00, 0x00, 0xC3, 0xAA, 0x00, 0x00, 0xF2, 0x7E, /* 0x18-0x1B */
+	0xF2, 0xA2, 0xF2, 0x7D, 0xF2, 0xA4, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xF2, 0xA1, 0x00, 0x00, 0xF4, 0x7A, /* 0x20-0x23 */
+	0xF4, 0x7D, 0xF4, 0x79, 0xC4, 0x71, 0xF4, 0x7B, /* 0x24-0x27 */
+	0xF4, 0x7C, 0xF4, 0x7E, 0xC4, 0x72, 0xC4, 0x74, /* 0x28-0x2B */
+	0xC4, 0x73, 0xF5, 0xE1, 0x00, 0x00, 0xF5, 0xE3, /* 0x2C-0x2F */
+	0x00, 0x00, 0xF5, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xF6, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xF8, 0xB5, 0xF8, 0xFA, 0xA5, 0xDC, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xCB, 0x72, 0xAA, 0xC0, 0xCD, 0xA3, /* 0x3C-0x3F */
+	0xAA, 0xC1, 0xAA, 0xC2, 0xCD, 0xA2, 0x00, 0x00, /* 0x40-0x43 */
+	0xCF, 0xF8, 0xCF, 0xF7, 0xAC, 0xE6, 0xAC, 0xE9, /* 0x44-0x47 */
+	0xAC, 0xE8, 0xAC, 0xE7, 0xCF, 0xF4, 0xCF, 0xF6, /* 0x48-0x4B */
+	0xCF, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xE8, /* 0x4C-0x4F */
+	0xAF, 0xA7, 0xD2, 0xEC, 0xD2, 0xEB, 0xD2, 0xEA, /* 0x50-0x53 */
+	0xD2, 0xE6, 0xAF, 0xA6, 0xAF, 0xAA, 0xAF, 0xAD, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xAF, 0xAE, 0xD2, 0xE7, /* 0x58-0x5B */
+	0xD2, 0xE9, 0xAF, 0xAC, 0xAF, 0xAB, 0xAF, 0xA9, /* 0x5C-0x5F */
+	0xAF, 0xA8, 0xD6, 0xC2, 0x00, 0x00, 0xD6, 0xC0, /* 0x60-0x63 */
+	0xD6, 0xBC, 0xB2, 0xBB, 0x00, 0x00, 0xD6, 0xBD, /* 0x64-0x67 */
+	0xB2, 0xBC, 0xD6, 0xBE, 0xD6, 0xBF, 0xD6, 0xC1, /* 0x68-0x6B */
+	0x00, 0x00, 0xB2, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xDA, 0xD5, 0x00, 0x00, 0xDA, 0xD4, 0xDA, 0xD3, /* 0x70-0x73 */
+	0xDA, 0xD2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xDE, 0xF6, 0xB8, 0x52, 0x00, 0x00, /* 0x78-0x7B */
+	0xDE, 0xF3, 0xDE, 0xF5, 0x00, 0x00, 0xB8, 0x53, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xB8, 0x54, 0xDE, 0xF4, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xE3, 0x41, 0x00, 0x00, 0xE2, 0xF9, 0xE2, 0xFA, /* 0x88-0x8B */
+	0x00, 0x00, 0xBA, 0xD7, 0xBA, 0xD5, 0xBA, 0xD6, /* 0x8C-0x8F */
+	0xE3, 0x43, 0x00, 0x00, 0xE3, 0x42, 0xE2, 0xFE, /* 0x90-0x93 */
+	0xE2, 0xFD, 0xE2, 0xFC, 0xE2, 0xFB, 0xE3, 0x40, /* 0x94-0x97 */
+	0xE2, 0xF8, 0x00, 0x00, 0xE6, 0xCB, 0xE6, 0xD0, /* 0x98-0x9B */
+	0xE6, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xE6, 0xCD, 0xE6, 0xCC, 0xE6, 0xCF, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xEA, 0xAE, 0x00, 0x00, 0xBF, 0x6D, 0xC1, 0x48, /* 0xA4-0xA7 */
+	0xED, 0xB0, 0x00, 0x00, 0xC1, 0x49, 0xED, 0xAF, /* 0xA8-0xAB */
+	0xF0, 0x5F, 0xF0, 0x5E, 0xC2, 0xA7, 0x00, 0x00, /* 0xAC-0xAF */
+	0xF2, 0xA5, 0xC3, 0xAB, 0xF4, 0xA1, 0xC5, 0xA1, /* 0xB0-0xB3 */
+	0xF6, 0xF7, 0x00, 0x00, 0xF8, 0xB7, 0xF8, 0xB6, /* 0xB4-0xB7 */
+	0xC9, 0xA8, 0xAC, 0xEA, 0xAC, 0xEB, 0xD6, 0xC3, /* 0xB8-0xBB */
+	0x00, 0x00, 0xB8, 0x56, 0xA5, 0xDD, 0xA8, 0x72, /* 0xBC-0xBF */
+	0xA8, 0x71, 0xA8, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xCD, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xAA, 0xC4, 0xAA, 0xC3, 0x00, 0x00, 0xAC, 0xEE, /* 0xC8-0xCB */
+	0x00, 0x00, 0xCF, 0xFA, 0xCF, 0xFD, 0xCF, 0xFB, /* 0xCC-0xCF */
+	0x00, 0x00, 0xAC, 0xEC, 0xAC, 0xED, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xCF, 0xF9, 0xCF, 0xFC, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xAF, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xD2, 0xF3, 0xD2, 0xF5, 0xD2, 0xF4, 0xAF, 0xB2, /* 0xDC-0xDF */
+	0xD2, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xAF, 0xB0, /* 0xE0-0xE3 */
+	0xAF, 0xAF, 0x00, 0x00, 0xAF, 0xB3, 0xAF, 0xB1, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xAF, 0xB4, 0xD2, 0xF2, 0xD2, 0xED, /* 0xE8-0xEB */
+	0xD2, 0xEE, 0xD2, 0xF1, 0xD2, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xC6, 0xD6, 0xC7, /* 0xF4-0xF7 */
+	0xD6, 0xC5, 0x00, 0x00, 0xD6, 0xC4, 0xB2, 0xBE, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7A[512] = {
+	0xB5, 0x7D, 0x00, 0x00, 0xDA, 0xD6, 0xDA, 0xD8, /* 0x00-0x03 */
+	0xDA, 0xDA, 0xB5, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xB5, 0x7A, 0x00, 0x00, 0xDA, 0xD7, 0xB5, 0x7B, /* 0x08-0x0B */
+	0xDA, 0xD9, 0xB5, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xDF, 0x41, 0xDE, 0xF7, 0xDE, 0xFA, 0xDE, 0xFE, /* 0x10-0x13 */
+	0xB8, 0x5A, 0xDE, 0xFC, 0x00, 0x00, 0xDE, 0xFB, /* 0x14-0x17 */
+	0xDE, 0xF8, 0xDE, 0xF9, 0xB8, 0x58, 0xDF, 0x40, /* 0x18-0x1B */
+	0xB8, 0x57, 0x00, 0x00, 0xB8, 0x5C, 0xB8, 0x5B, /* 0x1C-0x1F */
+	0xB8, 0x59, 0x00, 0x00, 0xDE, 0xFD, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x49, 0x00, 0x00, /* 0x24-0x27 */
+	0xE3, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x44, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xBA, 0xD8, 0xE3, 0x47, /* 0x2C-0x2F */
+	0xE3, 0x46, 0xBA, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBD, 0x5E, /* 0x34-0x37 */
+	0x00, 0x00, 0xE6, 0xD2, 0x00, 0x00, 0xBD, 0x5F, /* 0x38-0x3B */
+	0xBD, 0x5B, 0xBD, 0x5D, 0x00, 0x00, 0xBD, 0x5A, /* 0x3C-0x3F */
+	0xBD, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xEA, 0xAF, 0x00, 0x00, 0xBF, 0x70, 0xEA, 0xB1, /* 0x44-0x47 */
+	0xEA, 0xB0, 0x00, 0x00, 0xE3, 0x45, 0xBF, 0x72, /* 0x48-0x4B */
+	0xBF, 0x71, 0xBF, 0x6E, 0xBF, 0x6F, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xED, 0xB5, 0x00, 0x00, 0xED, 0xB3, 0xC1, 0x4A, /* 0x54-0x57 */
+	0xED, 0xB4, 0x00, 0x00, 0xED, 0xB6, 0xED, 0xB2, /* 0x58-0x5B */
+	0xED, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x60, /* 0x5C-0x5F */
+	0xC2, 0xAA, 0xC2, 0xA8, 0xC2, 0xA9, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA6, /* 0x64-0x67 */
+	0xF2, 0xA7, 0xC3, 0xAD, 0x00, 0x00, 0xC3, 0xAC, /* 0x68-0x6B */
+	0xF4, 0xA3, 0xF4, 0xA4, 0xF4, 0xA2, 0x00, 0x00, /* 0x6C-0x6F */
+	0xF6, 0xF8, 0xF6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xA5, 0xDE, 0xCA, 0x48, 0xA8, 0x73, 0x00, 0x00, /* 0x74-0x77 */
+	0xCD, 0xA5, 0xAA, 0xC6, 0xAA, 0xC5, 0xCD, 0xA6, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0x40, 0xAC, 0xEF, /* 0x7C-0x7F */
+	
+	0xCF, 0xFE, 0xAC, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xAF, 0xB6, 0xD2, 0xF8, 0xD2, 0xF6, 0xD2, 0xFC, /* 0x84-0x87 */
+	0xAF, 0xB7, 0xD2, 0xF7, 0xD2, 0xFB, 0xD2, 0xF9, /* 0x88-0x8B */
+	0xD2, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC8, /* 0x8C-0x8F */
+	0xD6, 0xCA, 0x00, 0x00, 0xB2, 0xBF, 0x00, 0x00, /* 0x90-0x93 */
+	0xD6, 0xC9, 0xB2, 0xC0, 0xB5, 0xA2, 0xB5, 0xA1, /* 0x94-0x97 */
+	0xB5, 0x7E, 0xDA, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0x44, 0xB8, 0x5D, /* 0x9C-0x9F */
+	0xB8, 0x5E, 0x00, 0x00, 0xDF, 0x43, 0xDF, 0x42, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xE3, 0x4A, 0xBA, 0xDB, 0xBA, 0xDA, 0xE3, 0x4B, /* 0xA8-0xAB */
+	0xE3, 0x4C, 0x00, 0x00, 0xBD, 0x61, 0xBD, 0x60, /* 0xAC-0xAF */
+	0x00, 0x00, 0xEA, 0xB5, 0xE6, 0xD3, 0xE6, 0xD5, /* 0xB0-0xB3 */
+	0xE6, 0xD4, 0xEA, 0xB4, 0xEA, 0xB2, 0xEA, 0xB6, /* 0xB4-0xB7 */
+	0xEA, 0xB3, 0x00, 0x00, 0xBF, 0x73, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xED, 0xB7, 0xC1, 0x4B, /* 0xBC-0xBF */
+	0xED, 0xB8, 0xED, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xC2, 0xAB, 0xC2, 0xAC, 0x00, 0x00, 0xC4, 0x75, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xC5, 0xD1, 0xA5, 0xDF, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xD0, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xD2, 0xFD, 0xAF, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xBA, /* 0xDC-0xDF */
+	0xB3, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xB5, 0xA4, /* 0xE0-0xE3 */
+	0xDA, 0xDD, 0xB5, 0xA3, 0xDA, 0xDC, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x45, /* 0xE8-0xEB */
+	0x00, 0x00, 0xBA, 0xDC, 0xE3, 0x4D, 0xBA, 0xDD, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xC4, 0x76, 0xF4, 0xA5, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xA6, 0xCB, 0xAA, 0xC7, 0xCD, 0xA7, /* 0xF8-0xFB */
+	0x00, 0x00, 0xAC, 0xF2, 0x00, 0x00, 0xAC, 0xF1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7B[512] = {
+	0xD0, 0x42, 0xD0, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xD3, 0x40, 0xD3, 0x42, 0xAF, 0xB9, 0x00, 0x00, /* 0x04-0x07 */
+	0xD3, 0x44, 0xD3, 0x47, 0xD3, 0x45, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0x46, 0xD3, 0x43, /* 0x0C-0x0F */
+	0xD2, 0xFE, 0xAF, 0xBA, 0xD3, 0x48, 0xD3, 0x41, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xD6, 0xD3, 0xB2, 0xC6, 0xD6, 0xDC, 0xB2, 0xC3, /* 0x18-0x1B */
+	0x00, 0x00, 0xD6, 0xD5, 0xB2, 0xC7, 0x00, 0x00, /* 0x1C-0x1F */
+	0xB2, 0xC1, 0x00, 0x00, 0xD6, 0xD0, 0xD6, 0xDD, /* 0x20-0x23 */
+	0xD6, 0xD1, 0xD6, 0xCE, 0xB2, 0xC5, 0x00, 0x00, /* 0x24-0x27 */
+	0xB2, 0xC2, 0x00, 0x00, 0xD6, 0xD4, 0xD6, 0xD7, /* 0x28-0x2B */
+	0xB2, 0xC4, 0xD6, 0xD8, 0xB2, 0xC8, 0xD6, 0xD9, /* 0x2C-0x2F */
+	0xD6, 0xCF, 0xD6, 0xD6, 0xD6, 0xDA, 0xD6, 0xD2, /* 0x30-0x33 */
+	0xD6, 0xCD, 0xD6, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xD6, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xDF, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0xDA, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xDA, 0xE0, 0xDA, 0xE6, 0xB5, 0xA7, 0xD6, 0xCC, /* 0x44-0x47 */
+	0xDA, 0xE1, 0xB5, 0xA5, 0xDA, 0xDE, 0xB5, 0xAC, /* 0x48-0x4B */
+	0xDA, 0xE2, 0xB5, 0xAB, 0xDA, 0xE3, 0xB5, 0xAD, /* 0x4C-0x4F */
+	0xB5, 0xA8, 0xB5, 0xAE, 0xB5, 0xA9, 0x00, 0x00, /* 0x50-0x53 */
+	0xB5, 0xAA, 0x00, 0x00, 0xB5, 0xA6, 0x00, 0x00, /* 0x54-0x57 */
+	0xDA, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0xB8, 0x61, 0xDF, 0x50, 0x00, 0x00, 0xDF, 0x53, /* 0x60-0x63 */
+	0xDF, 0x47, 0xDF, 0x4C, 0xDF, 0x46, 0xB8, 0x63, /* 0x64-0x67 */
+	0x00, 0x00, 0xDF, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xDF, 0x48, 0xB8, 0x62, 0x00, 0x00, /* 0x6C-0x6F */
+	0xDF, 0x4F, 0xDF, 0x4E, 0xDF, 0x4B, 0xDF, 0x4D, /* 0x70-0x73 */
+	0xDF, 0x49, 0xBA, 0xE1, 0xDF, 0x52, 0xB8, 0x5F, /* 0x74-0x77 */
+	0xDF, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x5D, 0x00, 0x00, /* 0x80-0x83 */
+	0xBA, 0xE8, 0xE3, 0x58, 0x00, 0x00, 0xBA, 0xE7, /* 0x84-0x87 */
+	0xE3, 0x4E, 0x00, 0x00, 0xE3, 0x50, 0xBA, 0xE0, /* 0x88-0x8B */
+	0xE3, 0x55, 0xE3, 0x54, 0xE3, 0x57, 0xBA, 0xE5, /* 0x8C-0x8F */
+	0xE3, 0x52, 0xE3, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xBA, 0xE4, 0xBA, 0xDF, 0xE3, 0x53, 0xBA, 0xE2, /* 0x94-0x97 */
+	0xE3, 0x59, 0xE3, 0x5B, 0x00, 0x00, 0xE3, 0x56, /* 0x98-0x9B */
+	0xE3, 0x4F, 0xBA, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xBD, 0x69, 0xBA, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE3, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE6, 0xD9, 0xBD, 0x62, 0x00, 0x00, 0xE6, 0xDB, /* 0xAC-0xAF */
+	0x00, 0x00, 0xBD, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xBD, 0x65, 0xE6, 0xDE, 0x00, 0x00, 0xE6, 0xD6, /* 0xB4-0xB7 */
+	0xBA, 0xE6, 0xE6, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xD8, 0x00, 0x00, /* 0xBC-0xBF */
+	0xB8, 0x60, 0xBD, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xBD, 0x64, 0x00, 0x00, 0xBD, 0x66, 0xBD, 0x67, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xBF, 0x76, 0xE6, 0xDD, 0xE6, 0xD7, /* 0xC8-0xCB */
+	0xBD, 0x6A, 0x00, 0x00, 0xE6, 0xDA, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xEA, 0xC0, 0xEA, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xEA, 0xC5, 0xBF, 0x74, 0xEA, 0xBD, 0xBF, 0x78, /* 0xD8-0xDB */
+	0xEA, 0xC3, 0xEA, 0xBA, 0xEA, 0xB7, 0xEA, 0xC6, /* 0xDC-0xDF */
+	0xC1, 0x51, 0xBF, 0x79, 0xEA, 0xC2, 0xEA, 0xB8, /* 0xE0-0xE3 */
+	0xBF, 0x77, 0xEA, 0xBC, 0xBF, 0x7B, 0xEA, 0xB9, /* 0xE4-0xE7 */
+	0xEA, 0xBE, 0xBF, 0x7A, 0xEA, 0xC1, 0xEA, 0xC4, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xED, 0xCB, 0xED, 0xCC, 0xED, 0xBC, 0xED, 0xC3, /* 0xF0-0xF3 */
+	0xED, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x4F, /* 0xF4-0xF7 */
+	0xED, 0xC8, 0xEA, 0xBF, 0x00, 0x00, 0xED, 0xBF, /* 0xF8-0xFB */
+	0x00, 0x00, 0xED, 0xC9, 0xC1, 0x4E, 0xED, 0xBE, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7C[512] = {
+	0xED, 0xBD, 0xED, 0xC7, 0xED, 0xC4, 0xED, 0xC6, /* 0x00-0x03 */
+	0x00, 0x00, 0xED, 0xBA, 0xED, 0xCA, 0xC1, 0x4C, /* 0x04-0x07 */
+	0x00, 0x00, 0xED, 0xC5, 0xED, 0xCE, 0xED, 0xC2, /* 0x08-0x0B */
+	0xC1, 0x50, 0xC1, 0x4D, 0xED, 0xC0, 0xED, 0xBB, /* 0x0C-0x0F */
+	0xED, 0xCD, 0xBF, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xF0, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xF0, 0x61, 0xF0, 0x67, 0xC2, 0xB0, 0xF0, 0x65, /* 0x1C-0x1F */
+	0xF0, 0x64, 0xC2, 0xB2, 0xF0, 0x6A, 0xC2, 0xB1, /* 0x20-0x23 */
+	0x00, 0x00, 0xF0, 0x6B, 0xF0, 0x68, 0xC2, 0xAE, /* 0x24-0x27 */
+	0xF0, 0x69, 0xF0, 0x62, 0xC2, 0xAF, 0xC2, 0xAD, /* 0x28-0x2B */
+	0xF2, 0xAB, 0xF0, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xF0, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA8, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xB2, /* 0x34-0x37 */
+	0xC3, 0xB0, 0xF2, 0xAA, 0x00, 0x00, 0xF2, 0xAC, /* 0x38-0x3B */
+	0xF2, 0xA9, 0xC3, 0xB1, 0xC3, 0xAE, 0xC3, 0xAF, /* 0x3C-0x3F */
+	0xC3, 0xB3, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x78, /* 0x40-0x43 */
+	0x00, 0x00, 0xF4, 0xAA, 0x00, 0x00, 0xF4, 0xA9, /* 0x44-0x47 */
+	0xF4, 0xA7, 0xF4, 0xA6, 0xF4, 0xA8, 0x00, 0x00, /* 0x48-0x4B */
+	0xC4, 0x77, 0xC4, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xC4, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE5, /* 0x50-0x53 */
+	0xF5, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xFA, /* 0x54-0x57 */
+	0x00, 0x00, 0xF6, 0xFC, 0xF6, 0xFE, 0xF6, 0xFD, /* 0x58-0x5B */
+	0xF6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xA3, /* 0x5C-0x5F */
+	0xC5, 0xA2, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xD3, /* 0x60-0x63 */
+	0xC5, 0xD2, 0xC5, 0xD4, 0xF7, 0xED, 0xF7, 0xEC, /* 0x64-0x67 */
+	0x00, 0x00, 0xF8, 0xFB, 0xF8, 0xB8, 0xF8, 0xFC, /* 0x68-0x6B */
+	0xC6, 0x58, 0x00, 0x00, 0xC6, 0x59, 0xF9, 0x6D, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xC6, 0x7E, 0xA6, 0xCC, /* 0x70-0x73 */
+	0x00, 0x00, 0xCD, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x44, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xAC, 0xF3, 0x00, 0x00, 0xD0, 0x47, /* 0x7C-0x7F */
+	
+	0xD0, 0x48, 0xD0, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xD3, 0x49, 0xD3, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xD3, 0x4D, 0xAF, 0xBB, 0xD3, 0x4B, 0x00, 0x00, /* 0x88-0x8B */
+	0xD3, 0x4C, 0xD3, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xD3, 0x4A, 0xB2, 0xC9, 0x00, 0x00, /* 0x90-0x93 */
+	0xD6, 0xDE, 0xB2, 0xCB, 0xD6, 0xE0, 0xB2, 0xCA, /* 0x94-0x97 */
+	0xD6, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xE8, 0xB5, 0xAF, /* 0x9C-0x9F */
+	0x00, 0x00, 0xDA, 0xEA, 0xDA, 0xE7, 0xD6, 0xE1, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xB5, 0xB0, 0x00, 0x00, 0xF9, 0xDB, /* 0xA4-0xA7 */
+	0xDA, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x56, /* 0xAC-0xAF */
+	0x00, 0x00, 0xB8, 0x64, 0xDF, 0x54, 0xB8, 0x65, /* 0xB0-0xB3 */
+	0xDF, 0x55, 0xB8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xBA, 0xE9, 0xE3, 0x61, 0xE3, 0x5E, /* 0xB8-0xBB */
+	0xE3, 0x60, 0xBA, 0xEA, 0xBA, 0xEB, 0xE3, 0x5F, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xE6, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xE6, 0xE0, 0x00, 0x00, 0xBD, 0x6B, 0xE6, 0xE2, /* 0xC8-0xCB */
+	0xE6, 0xE1, 0x00, 0x00, 0xA2, 0x61, 0x00, 0x00, /* 0xCC-0xCF */
+	0xEA, 0xCA, 0xEA, 0xCB, 0xEA, 0xC7, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xEA, 0xC8, 0xBF, 0x7C, 0xBF, 0x7D, 0xEA, 0xC9, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xC1, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xC1, 0x53, 0xC1, 0x58, 0xC1, 0x54, 0xC1, 0x56, /* 0xDC-0xDF */
+	0xC1, 0x52, 0x00, 0x00, 0xC1, 0x55, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC2, 0xB3, /* 0xE4-0xE7 */
+	0xED, 0xCF, 0x00, 0x00, 0xF2, 0xAE, 0x00, 0x00, /* 0xE8-0xEB */
+	0xF2, 0xAD, 0x00, 0x00, 0xF4, 0xAB, 0xC4, 0x7A, /* 0xEC-0xEF */
+	0xC4, 0x7B, 0xF7, 0x41, 0xF5, 0xE6, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xF7, 0x40, 0x00, 0x00, 0xF8, 0xFD, 0xF9, 0xA4, /* 0xF4-0xF7 */
+	0xA6, 0xCD, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x74, /* 0xF8-0xFB */
+	0x00, 0x00, 0xCD, 0xA9, 0xAA, 0xC8, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_7D[512] = {
+	0xAC, 0xF6, 0xD0, 0x4C, 0xAC, 0xF4, 0xD0, 0x4A, /* 0x00-0x03 */
+	0xAC, 0xF9, 0xAC, 0xF5, 0xAC, 0xFA, 0xAC, 0xF8, /* 0x04-0x07 */
+	0xD0, 0x4B, 0xAC, 0xF7, 0xAF, 0xBF, 0xAF, 0xBE, /* 0x08-0x0B */
+	0xD3, 0x5A, 0xAF, 0xC7, 0xD3, 0x53, 0xD3, 0x59, /* 0x0C-0x0F */
+	0xAF, 0xC3, 0xD3, 0x52, 0xD3, 0x58, 0xD3, 0x56, /* 0x10-0x13 */
+	0xAF, 0xC2, 0xAF, 0xC4, 0xD3, 0x55, 0xAF, 0xBD, /* 0x14-0x17 */
+	0xD3, 0x54, 0xAF, 0xC8, 0xAF, 0xC5, 0xAF, 0xC9, /* 0x18-0x1B */
+	0xAF, 0xC6, 0xD3, 0x51, 0xD3, 0x50, 0xD3, 0x57, /* 0x1C-0x1F */
+	0xAF, 0xC0, 0xAF, 0xBC, 0xAF, 0xC1, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xD6, 0xF0, 0xD6, 0xE9, 0x00, 0x00, 0xB5, 0xB5, /* 0x28-0x2B */
+	0xD6, 0xE8, 0x00, 0x00, 0xB2, 0xCF, 0xB2, 0xD6, /* 0x2C-0x2F */
+	0xB2, 0xD3, 0xB2, 0xD9, 0xB2, 0xD8, 0xB2, 0xD4, /* 0x30-0x33 */
+	0x00, 0x00, 0xD6, 0xE2, 0xD6, 0xE5, 0x00, 0x00, /* 0x34-0x37 */
+	0xD6, 0xE4, 0xB2, 0xD0, 0xD6, 0xE6, 0xD6, 0xEF, /* 0x38-0x3B */
+	0xB2, 0xD1, 0xD6, 0xE3, 0xD6, 0xEC, 0xD6, 0xED, /* 0x3C-0x3F */
+	0xB2, 0xD2, 0xD6, 0xEA, 0xB2, 0xD7, 0xB2, 0xCD, /* 0x40-0x43 */
+	0xB2, 0xD5, 0xD6, 0xE7, 0xB2, 0xCC, 0xD6, 0xEB, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xD6, 0xEE, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xDA, 0xFB, 0xDA, 0xF2, /* 0x4C-0x4F */
+	0xB5, 0xB2, 0xDA, 0xF9, 0xDA, 0xF6, 0xDA, 0xEE, /* 0x50-0x53 */
+	0xDA, 0xF7, 0xB5, 0xB4, 0xDA, 0xEF, 0x00, 0x00, /* 0x54-0x57 */
+	0xDA, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xB8, 0x6C, /* 0x58-0x5B */
+	0xDA, 0xF4, 0x00, 0x00, 0xB5, 0xB1, 0xDA, 0xFA, /* 0x5C-0x5F */
+	0x00, 0x00, 0xB5, 0xB8, 0xB5, 0xBA, 0xDA, 0xED, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xB5, 0xB9, 0xDA, 0xF0, /* 0x64-0x67 */
+	0xB5, 0xB3, 0xDA, 0xF8, 0xDA, 0xF1, 0xDA, 0xF5, /* 0x68-0x6B */
+	0x00, 0x00, 0xDA, 0xF3, 0xB5, 0xB6, 0xDA, 0xEC, /* 0x6C-0x6F */
+	0xB5, 0xBB, 0xB2, 0xCE, 0xB5, 0xB7, 0xB5, 0xBC, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xB8, 0x68, 0xDF, 0x5D, 0xDF, 0x5F, /* 0x78-0x7B */
+	0xDF, 0x61, 0xDF, 0x65, 0x00, 0x00, 0xDF, 0x5B, /* 0x7C-0x7F */
+	
+	0xDF, 0x59, 0xB8, 0x6A, 0x00, 0x00, 0xDF, 0x60, /* 0x80-0x83 */
+	0xDF, 0x64, 0xDF, 0x5C, 0xDF, 0x58, 0x00, 0x00, /* 0x84-0x87 */
+	0xDF, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0xDF, 0x62, 0xDF, 0x5A, 0xDF, 0x5E, 0xB8, 0x6B, /* 0x8C-0x8F */
+	0x00, 0x00, 0xB8, 0x69, 0xDF, 0x66, 0xB8, 0x67, /* 0x90-0x93 */
+	0xDF, 0x63, 0x00, 0x00, 0xE3, 0x72, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xBA, 0xEE, 0xE3, 0x6A, 0xBD, 0x78, 0xE3, 0x74, /* 0x9C-0x9F */
+	0xBA, 0xF1, 0xE3, 0x78, 0xBA, 0xF7, 0xE3, 0x65, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0x75, 0xE3, 0x62, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xE3, 0x77, 0xE3, 0x66, 0x00, 0x00, /* 0xA8-0xAB */
+	0xBA, 0xFE, 0xBA, 0xFB, 0xE3, 0x76, 0xE3, 0x70, /* 0xAC-0xAF */
+	0xBA, 0xED, 0xBA, 0xF5, 0xBA, 0xF4, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xBA, 0xF3, 0xBA, 0xF9, 0x00, 0x00, 0xE3, 0x63, /* 0xB4-0xB7 */
+	0xBA, 0xFA, 0xE3, 0x71, 0xBA, 0xF6, 0xBA, 0xEC, /* 0xB8-0xBB */
+	0xE3, 0x73, 0xBA, 0xEF, 0xBA, 0xF0, 0xBA, 0xF8, /* 0xBC-0xBF */
+	0xE3, 0x68, 0xE3, 0x67, 0xE3, 0x64, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xE3, 0x6C, 0xE3, 0x69, 0xE3, 0x6D, 0xBA, 0xFD, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE3, 0x79, 0xBA, 0xF2, 0xE3, 0x6E, /* 0xC8-0xCB */
+	0xE3, 0x6F, 0x00, 0x00, 0xE3, 0x6B, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xBA, 0xFC, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, /* 0xD4-0xD7 */
+	0xBD, 0x70, 0xBD, 0x79, 0xBD, 0x75, 0xE6, 0xE4, /* 0xD8-0xDB */
+	0x00, 0x00, 0xBD, 0x72, 0xBD, 0x76, 0xE6, 0xF0, /* 0xDC-0xDF */
+	0xBD, 0x6C, 0xE6, 0xE8, 0x00, 0x00, 0xBD, 0x74, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xE6, 0xEB, 0xE6, 0xE6, /* 0xE4-0xE7 */
+	0xBD, 0x73, 0xBD, 0x77, 0xE6, 0xE5, 0x00, 0x00, /* 0xE8-0xEB */
+	0xBD, 0x71, 0x00, 0x00, 0xE6, 0xEF, 0xBD, 0x6E, /* 0xEC-0xEF */
+	0xE6, 0xEE, 0xE6, 0xED, 0xBD, 0x7A, 0xE5, 0x72, /* 0xF0-0xF3 */
+	0xBD, 0x6D, 0x00, 0x00, 0xE6, 0xEC, 0xE6, 0xE3, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xBD, 0x7B, 0xE6, 0xEA, 0xBD, 0x6F, /* 0xF8-0xFB */
+};
+
+static const unsigned char u2c_7E[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE9, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0xBF, 0xA2, 0xBF, 0xA7, 0xBF, 0x7E, 0xEA, 0xD8, /* 0x08-0x0B */
+	0xEA, 0xCF, 0xEA, 0xDB, 0xEA, 0xD3, 0xEA, 0xD9, /* 0x0C-0x0F */
+	0xBF, 0xA8, 0xBF, 0xA1, 0xEA, 0xCC, 0xEA, 0xD2, /* 0x10-0x13 */
+	0xEA, 0xDC, 0xEA, 0xD5, 0xEA, 0xDA, 0xEA, 0xCE, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xD6, 0xBF, 0xA3, /* 0x18-0x1B */
+	0xEA, 0xD4, 0xBF, 0xA6, 0xBF, 0xA5, 0xEA, 0xD0, /* 0x1C-0x1F */
+	0xEA, 0xD1, 0xEA, 0xCD, 0xEA, 0xD7, 0xBF, 0xA4, /* 0x20-0x23 */
+	0xEA, 0xDE, 0xEA, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xED, 0xDA, 0xED, 0xD6, 0xC1, 0x5F, /* 0x28-0x2B */
+	0x00, 0x00, 0xED, 0xD0, 0xC1, 0x59, 0xC1, 0x69, /* 0x2C-0x2F */
+	0xED, 0xDC, 0xC1, 0x61, 0xC1, 0x5D, 0xED, 0xD3, /* 0x30-0x33 */
+	0xC1, 0x64, 0xC1, 0x67, 0xED, 0xDE, 0xC1, 0x5C, /* 0x34-0x37 */
+	0xED, 0xD5, 0xC1, 0x65, 0xED, 0xE0, 0xED, 0xDD, /* 0x38-0x3B */
+	0xED, 0xD1, 0xC1, 0x60, 0xC1, 0x5A, 0xC1, 0x68, /* 0x3C-0x3F */
+	0xED, 0xD8, 0xC1, 0x63, 0xED, 0xD2, 0xC1, 0x5E, /* 0x40-0x43 */
+	0xED, 0xDF, 0xC1, 0x62, 0xC1, 0x5B, 0xED, 0xD9, /* 0x44-0x47 */
+	0xC1, 0x66, 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xED, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF0, 0x6E, 0xF0, 0x74, 0xC2, 0xB9, 0xF0, 0x77, /* 0x50-0x53 */
+	0xC2, 0xB4, 0xC2, 0xB5, 0xF0, 0x6F, 0xF0, 0x76, /* 0x54-0x57 */
+	0xF0, 0x71, 0xC2, 0xBA, 0xC2, 0xB7, 0x00, 0x00, /* 0x58-0x5B */
+	0xF0, 0x6D, 0x00, 0x00, 0xC2, 0xB6, 0xF0, 0x73, /* 0x5C-0x5F */
+	0xF0, 0x75, 0xC2, 0xB8, 0xF0, 0x72, 0xF0, 0x70, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xF2, 0xB8, 0xC3, 0xB7, 0xC3, 0xB8, 0xC3, 0xB4, /* 0x68-0x6B */
+	0x00, 0x00, 0xC3, 0xB5, 0x00, 0x00, 0xF2, 0xB4, /* 0x6C-0x6F */
+	0xF2, 0xB2, 0x00, 0x00, 0xF2, 0xB6, 0xC3, 0xBA, /* 0x70-0x73 */
+	0xF2, 0xB7, 0xF2, 0xB0, 0xF2, 0xAF, 0xF2, 0xB3, /* 0x74-0x77 */
+	0xF2, 0xB1, 0xC3, 0xB6, 0xF2, 0xB5, 0xF4, 0xAC, /* 0x78-0x7B */
+	0xC4, 0x7E, 0xC4, 0x7D, 0xF4, 0xAD, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xF4, 0xAF, 0xF4, 0xAE, 0xC4, 0xA1, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0xEB, 0xF5, 0xE8, /* 0x84-0x87 */
+	0xF5, 0xE9, 0x00, 0x00, 0xF5, 0xE7, 0xF5, 0xEA, /* 0x88-0x8B */
+	0xC4, 0xF2, 0xF5, 0xEC, 0x00, 0x00, 0xC4, 0xF1, /* 0x8C-0x8F */
+	0x00, 0x00, 0xF7, 0x42, 0x00, 0x00, 0xC5, 0xD5, /* 0x90-0x93 */
+	0xC5, 0xD7, 0xF7, 0xEE, 0xC5, 0xD6, 0xF8, 0xB9, /* 0x94-0x97 */
+	0xF9, 0x40, 0xF9, 0x42, 0xF8, 0xFE, 0xF9, 0x41, /* 0x98-0x9B */
+	0xC6, 0x6C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+};
+
+static const unsigned char u2c_7F[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xA6, 0xCE, 0x00, 0x00, /* 0x34-0x37 */
+	0xAC, 0xFB, 0xD2, 0x6F, 0xAF, 0xCA, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xB2, 0xDA, 0xDA, 0xFC, 0xDA, 0xFD, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDF, /* 0x40-0x43 */
+	0xC1, 0x6A, 0xED, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xC2, 0xBB, 0x00, 0x00, 0xF2, 0xBA, 0xF2, 0xB9, /* 0x48-0x4B */
+	0xC4, 0xA2, 0xF5, 0xED, 0x00, 0x00, 0xF7, 0x43, /* 0x4C-0x4F */
+	0xC5, 0xF8, 0xCA, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xAA, 0xC9, 0xA8, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xD0, 0x4D, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x60, /* 0x58-0x5B */
+	0xD3, 0x5B, 0xD3, 0x5F, 0xD3, 0x5D, 0xAF, 0xCB, /* 0x5C-0x5F */
+	0xD3, 0x5E, 0xD3, 0x5C, 0x00, 0x00, 0xD6, 0xF1, /* 0x60-0x63 */
+	0x00, 0x00, 0xDA, 0xFE, 0xDB, 0x40, 0xDF, 0x69, /* 0x64-0x67 */
+	0xDF, 0x6A, 0xB8, 0x6E, 0xB8, 0x6F, 0xDF, 0x68, /* 0x68-0x6B */
+	0xDF, 0x6B, 0xDF, 0x67, 0xB8, 0x6D, 0x00, 0x00, /* 0x6C-0x6F */
+	0xBB, 0x40, 0x00, 0x00, 0xB8, 0x70, 0xE3, 0x7A, /* 0x70-0x73 */
+	0x00, 0x00, 0xBD, 0x7C, 0xE6, 0xF1, 0xBD, 0x7D, /* 0x74-0x77 */
+	0x00, 0x00, 0xBF, 0xA9, 0xEA, 0xE2, 0xEA, 0xE0, /* 0x78-0x7B */
+	0xEA, 0xE1, 0xED, 0xE4, 0xED, 0xE3, 0xED, 0xE2, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBB, /* 0x80-0x83 */
+	0x00, 0x00, 0xC3, 0xB9, 0xF2, 0xBC, 0xF7, 0x44, /* 0x84-0x87 */
+	0xC5, 0xF9, 0xF8, 0xBA, 0xA6, 0xCF, 0xAA, 0xCB, /* 0x88-0x8B */
+	0xAA, 0xCA, 0xD0, 0x4F, 0xAC, 0xFC, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xD0, 0x4E, 0xD3, 0x62, 0x00, 0x00, /* 0x90-0x93 */
+	0xAF, 0xCC, 0xD6, 0xF2, 0xD3, 0x61, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0xDC, 0xD6, 0xF5, /* 0x98-0x9B */
+	0xD6, 0xF3, 0xD6, 0xF4, 0xB2, 0xDB, 0x00, 0x00, /* 0x9C-0x9F */
+	0xDB, 0x42, 0xDB, 0x43, 0xDB, 0x41, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xB8, 0x73, 0xDF, 0x6D, 0xDF, 0x6C, 0xDF, 0x6E, /* 0xA4-0xA7 */
+	0xB8, 0x72, 0xB8, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE6, 0xF2, 0xE6, 0xF4, 0x00, 0x00, 0xBD, 0x7E, /* 0xAC-0xAF */
+	0xE6, 0xF3, 0xEA, 0xE3, 0xBF, 0xAA, 0xF0, 0x79, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xF0, 0x78, 0xC3, 0xBB, 0xF2, 0xBD, /* 0xB4-0xB7 */
+	0xC3, 0xBD, 0xC3, 0xBC, 0xF4, 0xB0, 0xF5, 0xEE, /* 0xB8-0xBB */
+	0xC4, 0xF3, 0xA6, 0xD0, 0xD0, 0x50, 0xAC, 0xFD, /* 0xBC-0xBF */
+	0xD3, 0x65, 0xAF, 0xCE, 0xD3, 0x64, 0xD3, 0x63, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xAF, 0xCD, 0x00, 0x00, 0xD6, 0xFB, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xD6, 0xFD, 0xD6, 0xF6, 0xD6, 0xF7, /* 0xC8-0xCB */
+	0xB2, 0xDD, 0xD6, 0xF8, 0xB2, 0xDE, 0xD6, 0xFC, /* 0xCC-0xCF */
+	0xD6, 0xF9, 0xD6, 0xFA, 0xB2, 0xDF, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xB5, 0xBE, 0xB5, 0xBF, 0x00, 0x00, 0xDB, 0x44, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x6F, /* 0xD8-0xDB */
+	0xDF, 0x70, 0x00, 0x00, 0xE3, 0x7E, 0xBB, 0x43, /* 0xDC-0xDF */
+	0xBB, 0x41, 0xBB, 0x42, 0xE3, 0x7B, 0xE3, 0x7C, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xE3, 0x7D, 0xE6, 0xF9, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xE6, 0xFA, 0xBD, 0xA1, 0xE6, 0xF7, 0xE6, 0xF6, /* 0xE8-0xEB */
+	0xE6, 0xF8, 0xE6, 0xF5, 0xBF, 0xAD, 0xEA, 0xE4, /* 0xEC-0xEF */
+	0xBF, 0xAB, 0xBF, 0xAC, 0xED, 0xE6, 0xC1, 0x6B, /* 0xF0-0xF3 */
+	0xED, 0xE5, 0xEF, 0xA8, 0x00, 0x00, 0xF0, 0x7A, /* 0xF4-0xF7 */
+	0xF0, 0x7B, 0xC2, 0xBC, 0x00, 0x00, 0xC2, 0xBD, /* 0xF8-0xFB */
+	0xC1, 0x6C, 0xF2, 0xBE, 0xF2, 0xBF, 0xF4, 0xB1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_80[512] = {
+	0xC4, 0xA3, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0x00-0x03 */
+	0xAC, 0xFE, 0xAA, 0xCC, 0xAF, 0xCF, 0xD0, 0x51, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB5, 0xC0, /* 0x08-0x0B */
+	0xA6, 0xD3, 0xAD, 0x41, 0xD0, 0x52, 0xD0, 0x53, /* 0x0C-0x0F */
+	0xAD, 0x40, 0xAD, 0x42, 0xA6, 0xD4, 0x00, 0x00, /* 0x10-0x13 */
+	0xD0, 0x54, 0xAF, 0xD1, 0xD3, 0x66, 0xAF, 0xD3, /* 0x14-0x17 */
+	0xAF, 0xD0, 0xAF, 0xD2, 0x00, 0x00, 0xD7, 0x41, /* 0x18-0x1B */
+	0xB2, 0xE0, 0x00, 0x00, 0xD7, 0x40, 0xD6, 0xFE, /* 0x1C-0x1F */
+	0x00, 0x00, 0xDF, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xE3, 0xA1, 0x00, 0x00, 0xBD, 0xA2, 0x00, 0x00, /* 0x24-0x27 */
+	0xBF, 0xAE, 0xEA, 0xE6, 0xEA, 0xE5, 0x00, 0x00, /* 0x28-0x2B */
+	0xED, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xF5, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xD5, /* 0x30-0x33 */
+	0xCB, 0x73, 0xCD, 0xAA, 0xAD, 0x43, 0xD0, 0x55, /* 0x34-0x37 */
+	0x00, 0x00, 0xD3, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xAF, 0xD4, 0xD3, 0x67, 0xAF, 0xD5, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0x43, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xB2, 0xE2, 0xD7, 0x42, /* 0x44-0x47 */
+	0xD7, 0x44, 0x00, 0x00, 0xB2, 0xE1, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x46, /* 0x4C-0x4F */
+	0xDB, 0x47, 0xDB, 0x45, 0xB5, 0xC1, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xB8, 0x74, 0x00, 0x00, /* 0x54-0x57 */
+	0xB8, 0x75, 0x00, 0x00, 0xBB, 0x45, 0x00, 0x00, /* 0x58-0x5B */
+	0xE3, 0xA3, 0xE3, 0xA2, 0xBB, 0x44, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xE6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFC, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xEA, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x70, /* 0x6C-0x6F */
+	0xC1, 0x6F, 0xC1, 0x6D, 0xC1, 0x6E, 0xC1, 0x71, /* 0x70-0x73 */
+	0x00, 0x00, 0xF0, 0x7C, 0xC2, 0xBF, 0xC2, 0xBE, /* 0x74-0x77 */
+	0xF2, 0xC0, 0xF4, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xC5, 0xA5, 0xC5, 0xA4, 0xA6, 0xD6, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xD1, 0xFB, 0x00, 0x00, /* 0x80-0x83 */
+	0xB8, 0x77, 0xB5, 0xC2, 0xB8, 0x76, 0xBB, 0x46, /* 0x84-0x87 */
+	0x00, 0x00, 0xA6, 0xD7, 0xC9, 0xA9, 0xA6, 0xD8, /* 0x88-0x8B */
+	0xA6, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xAB, /* 0x8C-0x8F */
+	0xCB, 0x76, 0x00, 0x00, 0xCB, 0x77, 0xA8, 0x77, /* 0x90-0x93 */
+	0x00, 0x00, 0xCB, 0x74, 0xA8, 0x76, 0x00, 0x00, /* 0x94-0x97 */
+	0xA8, 0x79, 0xCB, 0x75, 0xA8, 0x7B, 0xA8, 0x7A, /* 0x98-0x9B */
+	0xCB, 0x78, 0xA8, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xAA, 0xD1, 0xAA, 0xCF, 0xCD, 0xAD, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xAA, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xAA, 0xD3, 0xAA, 0xD5, 0xAA, 0xD2, /* 0xA8-0xAB */
+	0x00, 0x00, 0xCD, 0xB0, 0xCD, 0xAC, 0xAA, 0xD6, /* 0xAC-0xAF */
+	0x00, 0x00, 0xAA, 0xD0, 0xA8, 0x7C, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xAA, 0xD4, 0xCD, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xCD, 0xAE, 0x00, 0x00, 0xAA, 0xCD, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0x5B, 0xAD, 0x47, /* 0xC0-0xC3 */
+	0xAD, 0x48, 0xD0, 0x5D, 0x00, 0x00, 0xD0, 0x57, /* 0xC4-0xC7 */
+	0xD0, 0x5A, 0xD0, 0x63, 0xD0, 0x61, 0x00, 0x00, /* 0xC8-0xCB */
+	0xAD, 0x49, 0xD0, 0x67, 0xAD, 0x4C, 0xD0, 0x64, /* 0xCC-0xCF */
+	0xD0, 0x5C, 0xD0, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xDB, 0x49, 0xD0, 0x62, 0xAD, 0x44, 0xD0, 0x65, /* 0xD4-0xD7 */
+	0xD0, 0x56, 0xD0, 0x5F, 0xAD, 0x46, 0xAD, 0x4B, /* 0xD8-0xDB */
+	0xD0, 0x60, 0xAD, 0x4F, 0xAD, 0x4D, 0x00, 0x00, /* 0xDC-0xDF */
+	0xD0, 0x58, 0xAD, 0x4A, 0x00, 0x00, 0xD0, 0x5E, /* 0xE0-0xE3 */
+	0xAD, 0x4E, 0xAD, 0x45, 0xD0, 0x66, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xAF, 0xDA, 0x00, 0x00, 0xAF, 0xE3, /* 0xEC-0xEF */
+	0xAF, 0xD8, 0xAF, 0xD6, 0xD3, 0x6A, 0xAF, 0xDE, /* 0xF0-0xF3 */
+	0xAF, 0xDB, 0xD3, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xAF, 0xDD, 0xD3, 0x6B, 0xD3, 0x69, 0xD3, 0x6E, /* 0xF8-0xFB */
+	0xAF, 0xE2, 0xAF, 0xE0, 0xDB, 0x48, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_81[512] = {
+	0xD3, 0x6F, 0xD3, 0x6D, 0xAF, 0xD7, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xAF, 0xD9, 0xAF, 0xDC, 0x00, 0x00, /* 0x04-0x07 */
+	0xAF, 0xDF, 0x00, 0x00, 0xAF, 0xE1, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xD7, 0x4E, 0xB2, 0xE4, 0x00, 0x00, /* 0x14-0x17 */
+	0xD7, 0x45, 0xD7, 0x47, 0x00, 0x00, 0xD7, 0x48, /* 0x18-0x1B */
+	0x00, 0x00, 0xD7, 0x50, 0xD7, 0x4C, 0xD7, 0x4A, /* 0x1C-0x1F */
+	0x00, 0x00, 0xD7, 0x4D, 0xD7, 0x51, 0xB2, 0xE5, /* 0x20-0x23 */
+	0xB2, 0xE9, 0xD7, 0x46, 0x00, 0x00, 0xD7, 0x4F, /* 0x24-0x27 */
+	0x00, 0x00, 0xB2, 0xE7, 0x00, 0x00, 0xB2, 0xE6, /* 0x28-0x2B */
+	0xD7, 0x4B, 0xD7, 0x49, 0x00, 0x00, 0xB2, 0xE3, /* 0x2C-0x2F */
+	0xB2, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xB5, 0xC8, 0xDB, 0x51, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xDB, 0x4F, 0xB5, 0xCA, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x4A, /* 0x40-0x43 */
+	0xDF, 0xA1, 0x00, 0x00, 0xB5, 0xC9, 0xDB, 0x4E, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0x4B, 0xB5, 0xC5, /* 0x48-0x4B */
+	0xB5, 0xCB, 0xDB, 0x50, 0xB5, 0xC7, 0xDB, 0x4D, /* 0x4C-0x4F */
+	0xBB, 0x47, 0xB5, 0xC6, 0xDB, 0x4C, 0xB5, 0xCC, /* 0x50-0x53 */
+	0xB5, 0xC4, 0xB5, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x77, /* 0x58-0x5B */
+	0xDF, 0x75, 0x00, 0x00, 0xDF, 0x7B, 0x00, 0x00, /* 0x5C-0x5F */
+	0xDF, 0x73, 0xDF, 0xA2, 0xDF, 0x78, 0x00, 0x00, /* 0x60-0x63 */
+	0xDF, 0x72, 0xB8, 0x7B, 0xB8, 0xA3, 0xDF, 0x7D, /* 0x64-0x67 */
+	0x00, 0x00, 0xDF, 0x76, 0x00, 0x00, 0xB8, 0x7E, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0xB8, 0x7C, 0xDF, 0x7E, /* 0x6C-0x6F */
+	0xB8, 0x79, 0xB8, 0x78, 0xDF, 0x79, 0xB8, 0x7D, /* 0x70-0x73 */
+	0xB5, 0xCD, 0x00, 0x00, 0xDF, 0x7C, 0xDF, 0x74, /* 0x74-0x77 */
+	0xB8, 0x7A, 0xB8, 0xA1, 0xB8, 0xA2, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBB, 0x4C, /* 0x7C-0x7F */
+	
+	0xBB, 0x48, 0x00, 0x00, 0xBB, 0x4D, 0xE3, 0xA6, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xA5, 0xE3, 0xA7, /* 0x84-0x87 */
+	0xBB, 0x4A, 0xE3, 0xA4, 0xBB, 0x4B, 0xE3, 0xAA, /* 0x88-0x8B */
+	0xE3, 0xA9, 0xE3, 0xA8, 0x00, 0x00, 0xBB, 0x49, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0xE7, 0x41, 0x00, 0x00, 0xE7, 0x44, /* 0x94-0x97 */
+	0xBD, 0xA8, 0xE7, 0x43, 0xBD, 0xA7, 0xBD, 0xA3, /* 0x98-0x9B */
+	0xBD, 0xA4, 0xBD, 0xA5, 0xE7, 0x40, 0xE6, 0xFE, /* 0x9C-0x9F */
+	0xBD, 0xA6, 0x00, 0x00, 0xE7, 0x42, 0xE6, 0xFD, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xE9, 0xEA, 0xF3, /* 0xA4-0xA7 */
+	0xBF, 0xB1, 0xBF, 0xB0, 0x00, 0x00, 0xEA, 0xED, /* 0xA8-0xAB */
+	0xEA, 0xEF, 0x00, 0x00, 0xEA, 0xEA, 0x00, 0x00, /* 0xAC-0xAF */
+	0xEA, 0xEE, 0xEA, 0xE8, 0xEA, 0xF1, 0xBF, 0xAF, /* 0xB0-0xB3 */
+	0xEA, 0xF0, 0xEA, 0xEC, 0x00, 0x00, 0xEA, 0xF2, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xEA, 0xEB, 0xC1, 0x74, 0xED, 0xE8, /* 0xB8-0xBB */
+	0xED, 0xEE, 0xC1, 0x78, 0xC1, 0x7A, 0xC1, 0x77, /* 0xBC-0xBF */
+	0xC1, 0x76, 0x00, 0x00, 0xC1, 0x75, 0xC1, 0x73, /* 0xC0-0xC3 */
+	0xED, 0xE9, 0xED, 0xEC, 0xC1, 0x72, 0xED, 0xED, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xC1, 0x79, 0xED, 0xEB, 0x00, 0x00, /* 0xC8-0xCB */
+	0xED, 0xEA, 0xC2, 0xC0, 0x00, 0x00, 0xC2, 0xC1, /* 0xCC-0xCF */
+	0xF0, 0xA1, 0xF0, 0x7D, 0xF0, 0x7E, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xF2, 0xC2, 0x00, 0x00, 0xF2, 0xC1, /* 0xD4-0xD7 */
+	0xC3, 0xBE, 0xF4, 0xB4, 0xC4, 0xA4, 0xF4, 0xB3, /* 0xD8-0xDB */
+	0x00, 0x00, 0xF5, 0xF0, 0xF7, 0x45, 0xC5, 0xA6, /* 0xDC-0xDF */
+	0xF9, 0x43, 0xF9, 0x44, 0xC5, 0xD8, 0xA6, 0xDA, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xAA, 0xD7, 0xDB, 0x52, 0xBB, 0x4E, /* 0xE4-0xE7 */
+	0xC1, 0x7B, 0xED, 0xEF, 0xA6, 0xDB, 0x00, 0x00, /* 0xE8-0xEB */
+	0xAF, 0xE5, 0xAF, 0xE4, 0xDB, 0x53, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xEA, 0xF4, 0xA6, 0xDC, /* 0xF0-0xF3 */
+	0xAD, 0x50, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x54, /* 0xF4-0xF7 */
+	0xDB, 0x55, 0xDB, 0x56, 0xBB, 0x4F, 0xBF, 0xB2, /* 0xF8-0xFB */
+	0xA6, 0xDD, 0x00, 0x00, 0xAA, 0xD8, 0xD0, 0x68, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_82[512] = {
+	0xAF, 0xE6, 0xD3, 0x70, 0xB2, 0xEA, 0x00, 0x00, /* 0x00-0x03 */
+	0xDB, 0x57, 0xB8, 0xA4, 0x00, 0x00, 0xBB, 0x50, /* 0x04-0x07 */
+	0xBF, 0xB3, 0xC1, 0x7C, 0xC2, 0xC2, 0xF4, 0xB5, /* 0x08-0x0B */
+	0xA6, 0xDE, 0xAA, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xAF, 0xE7, 0xD7, 0x52, 0xB5, 0xCE, 0x00, 0x00, /* 0x10-0x13 */
+	0xBB, 0x51, 0xE3, 0xAB, 0xE7, 0x45, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xDF, /* 0x18-0x1B */
+	0xB5, 0xCF, 0xDF, 0xA3, 0xBB, 0x52, 0xA6, 0xE0, /* 0x1C-0x1F */
+	0xCD, 0xB1, 0xD0, 0x69, 0xAD, 0x51, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xD3, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xAF, 0xEA, 0x00, 0x00, 0xAF, 0xE8, 0xAF, 0xE9, /* 0x28-0x2B */
+	0xAF, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x71, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0x57, 0xD7, 0x54, /* 0x30-0x33 */
+	0xD7, 0x56, 0xB2, 0xEB, 0xB2, 0xED, 0xB2, 0xEC, /* 0x34-0x37 */
+	0xD7, 0x53, 0xB2, 0xEE, 0xD7, 0x55, 0x00, 0x00, /* 0x38-0x3B */
+	0xDB, 0x58, 0xDB, 0x59, 0x00, 0x00, 0xDB, 0x5A, /* 0x3C-0x3F */
+	0xDF, 0xA6, 0x00, 0x00, 0xDF, 0xA7, 0x00, 0x00, /* 0x40-0x43 */
+	0xDF, 0xA5, 0xDF, 0xA8, 0x00, 0x00, 0xB8, 0xA5, /* 0x44-0x47 */
+	0x00, 0x00, 0xDF, 0xA4, 0x00, 0x00, 0xBB, 0x53, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0x4A, 0xE7, 0x46, /* 0x4C-0x4F */
+	0xE7, 0x49, 0xE7, 0x4B, 0xE7, 0x48, 0xE7, 0x47, /* 0x50-0x53 */
+	0x00, 0x00, 0xEA, 0xF5, 0xEA, 0xF6, 0xEA, 0xF7, /* 0x54-0x57 */
+	0xBF, 0xB4, 0xBF, 0xB5, 0xED, 0xF1, 0xED, 0xF0, /* 0x58-0x5B */
+	0xED, 0xF2, 0x00, 0x00, 0xF0, 0xA3, 0xF0, 0xA2, /* 0x5C-0x5F */
+	0x00, 0x00, 0xF2, 0xC4, 0x00, 0x00, 0xF2, 0xC5, /* 0x60-0x63 */
+	0xF2, 0xC3, 0x00, 0x00, 0xC4, 0xA5, 0x00, 0x00, /* 0x64-0x67 */
+	0xF4, 0xB6, 0xF4, 0xB7, 0x00, 0x00, 0xF7, 0x46, /* 0x68-0x6B */
+	0xF7, 0xEF, 0xF8, 0xBB, 0xA6, 0xE1, 0xA8, 0x7D, /* 0x6C-0x6F */
+	0x00, 0x00, 0xC1, 0x7D, 0xA6, 0xE2, 0x00, 0x00, /* 0x70-0x73 */
+	0xD7, 0x58, 0xDB, 0x5B, 0x00, 0x00, 0xC6, 0x41, /* 0x74-0x77 */
+	0xCA, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xCA, 0x4B, 0xCA, 0x4D, 0xA6, 0xE3, 0xCA, 0x4E, /* 0x7C-0x7F */
+	
+	0xCA, 0x4C, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA2, /* 0x80-0x83 */
+	0xCB, 0xA3, 0xCB, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xA1, 0xA8, 0xA1, /* 0x88-0x8B */
+	0x00, 0x00, 0xA8, 0xA2, 0xCB, 0x7C, 0xCB, 0x7A, /* 0x8C-0x8F */
+	0xCB, 0x79, 0xCB, 0x7D, 0xA8, 0x7E, 0xCB, 0x7E, /* 0x90-0x93 */
+	0xD0, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xCD, 0xB6, 0xAA, 0xDC, 0xCD, 0xB5, 0xCD, 0xB7, /* 0x98-0x9B */
+	0x00, 0x00, 0xAA, 0xDB, 0xCD, 0xBC, 0xAA, 0xDF, /* 0x9C-0x9F */
+	0xCD, 0xB2, 0xCD, 0xC0, 0xCD, 0xC6, 0xAA, 0xE6, /* 0xA0-0xA3 */
+	0xCD, 0xC3, 0xAA, 0xE3, 0x00, 0x00, 0xCD, 0xB9, /* 0xA4-0xA7 */
+	0xCD, 0xBF, 0xCD, 0xC1, 0x00, 0x00, 0xCD, 0xB4, /* 0xA8-0xAB */
+	0xAA, 0xE2, 0xAA, 0xDD, 0xCD, 0xBA, 0xAA, 0xE4, /* 0xAC-0xAF */
+	0xAA, 0xE7, 0xAA, 0xE1, 0x00, 0x00, 0xAA, 0xDA, /* 0xB0-0xB3 */
+	0xCD, 0xBE, 0xCD, 0xB8, 0xCD, 0xC5, 0xAA, 0xE9, /* 0xB4-0xB7 */
+	0xAA, 0xE5, 0xAA, 0xE0, 0xCD, 0xBD, 0xAF, 0xEC, /* 0xB8-0xBB */
+	0xCD, 0xBB, 0xAA, 0xDE, 0xAA, 0xE8, 0x00, 0x00, /* 0xBC-0xBF */
+	0xCD, 0xB3, 0x00, 0x00, 0xCD, 0xC2, 0xCD, 0xC4, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xAD, 0x62, 0xAD, 0x5C, 0xAD, 0x64, /* 0xD0-0xD3 */
+	0xAD, 0x61, 0xD0, 0x71, 0xD0, 0x74, 0xAD, 0x5D, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xD0, 0x6B, 0x00, 0x00, 0xAD, 0x56, /* 0xD8-0xDB */
+	0xAD, 0x60, 0x00, 0x00, 0xAD, 0x63, 0xAD, 0x65, /* 0xDC-0xDF */
+	0xD0, 0xA2, 0xD0, 0x77, 0x00, 0x00, 0xAD, 0x55, /* 0xE0-0xE3 */
+	0xD0, 0xA1, 0xAD, 0x59, 0xAD, 0x57, 0xAD, 0x52, /* 0xE4-0xE7 */
+	0xD0, 0x6F, 0x00, 0x00, 0xD0, 0x7E, 0xD0, 0x73, /* 0xE8-0xEB */
+	0xD0, 0x76, 0xD0, 0xA5, 0x00, 0x00, 0xAD, 0x66, /* 0xEC-0xEF */
+	0xD0, 0x7D, 0xAD, 0x5E, 0xD0, 0x78, 0xD0, 0xA4, /* 0xF0-0xF3 */
+	0xD0, 0x75, 0xD0, 0x79, 0xD0, 0x7C, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xD0, 0x6D, 0xD0, 0xA3, 0xD0, 0x7B, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0x6C, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_83[512] = {
+	0xD0, 0x70, 0xAD, 0x5F, 0xAD, 0x5A, 0xAD, 0x53, /* 0x00-0x03 */
+	0xAD, 0x58, 0xAD, 0x54, 0xAD, 0x67, 0xD0, 0x6E, /* 0x04-0x07 */
+	0xD3, 0xA5, 0xAD, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0xD0, 0x7A, 0xCE, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xD3, 0xA8, 0xAF, 0xFA, /* 0x14-0x17 */
+	0x00, 0x00, 0xD3, 0x76, 0x00, 0x00, 0xD3, 0xA3, /* 0x18-0x1B */
+	0xD3, 0x7D, 0x00, 0x00, 0xD3, 0xB2, 0x00, 0x00, /* 0x1C-0x1F */
+	0xD3, 0xAA, 0x00, 0x00, 0xD3, 0x7E, 0x00, 0x00, /* 0x20-0x23 */
+	0xD3, 0xA9, 0xD3, 0x78, 0xD3, 0x7C, 0xD3, 0xB5, /* 0x24-0x27 */
+	0xAF, 0xFD, 0xD3, 0xAD, 0xD3, 0xA4, 0xAF, 0xED, /* 0x28-0x2B */
+	0xD3, 0xB3, 0xD3, 0x74, 0x00, 0x00, 0xD3, 0xAC, /* 0x2C-0x2F */
+	0x00, 0x00, 0xAF, 0xFC, 0xAF, 0xF7, 0xD3, 0x73, /* 0x30-0x33 */
+	0xAF, 0xF5, 0xAF, 0xF4, 0xAF, 0xF9, 0xD3, 0xAB, /* 0x34-0x37 */
+	0xAF, 0xF1, 0xAF, 0xF8, 0xD0, 0x72, 0xDB, 0x5C, /* 0x38-0x3B */
+	0xD3, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x7A, /* 0x3C-0x3F */
+	0xAF, 0xFB, 0xD3, 0x7B, 0xD3, 0xA1, 0xAF, 0xFE, /* 0x40-0x43 */
+	0xD3, 0x75, 0xD3, 0xAF, 0x00, 0x00, 0xD3, 0xAE, /* 0x44-0x47 */
+	0xD3, 0xB6, 0xAF, 0xF3, 0xAF, 0xF0, 0xD3, 0xB4, /* 0x48-0x4B */
+	0xD3, 0xB0, 0xD3, 0xA7, 0xD3, 0xA2, 0xAF, 0xF6, /* 0x4C-0x4F */
+	0xAF, 0xF2, 0xD3, 0x77, 0xAF, 0xEE, 0xD3, 0xB1, /* 0x50-0x53 */
+	0xAF, 0xEF, 0x00, 0x00, 0xD3, 0x79, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0x5E, /* 0x70-0x73 */
+	0xD7, 0x60, 0xD7, 0x65, 0xD7, 0x79, 0xB2, 0xFC, /* 0x74-0x77 */
+	0xB2, 0xF2, 0x00, 0x00, 0xD7, 0x5D, 0xB2, 0xFD, /* 0x78-0x7B */
+	0xB2, 0xFE, 0xD7, 0x68, 0xD7, 0x6F, 0xD7, 0x75, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xD7, 0x62, 0x00, 0x00, 0xD7, 0x69, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0x40, 0xD7, 0x77, /* 0x84-0x87 */
+	0xD7, 0x72, 0xB2, 0xFA, 0xB2, 0xF8, 0xD7, 0x6E, /* 0x88-0x8B */
+	0xD7, 0x6A, 0xD7, 0x5C, 0xB2, 0xEF, 0xD7, 0x61, /* 0x8C-0x8F */
+	0xD7, 0x59, 0x00, 0x00, 0xB2, 0xF7, 0xB2, 0xF9, /* 0x90-0x93 */
+	0xD7, 0x66, 0xD7, 0x63, 0xB2, 0xF4, 0xD7, 0x73, /* 0x94-0x97 */
+	0xB2, 0xF1, 0xD7, 0x64, 0xD7, 0x7A, 0xD7, 0x6C, /* 0x98-0x9B */
+	0x00, 0x00, 0xD7, 0x6B, 0xB2, 0xF0, 0x00, 0x00, /* 0x9C-0x9F */
+	0xB2, 0xFB, 0x00, 0x00, 0xB2, 0xF3, 0xD7, 0x5A, /* 0xA0-0xA3 */
+	0xD7, 0x5F, 0xD7, 0x70, 0xD7, 0x76, 0xB3, 0x41, /* 0xA4-0xA7 */
+	0xD7, 0x5B, 0xD7, 0x67, 0xD7, 0x6D, 0xB2, 0xF6, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0x78, 0xD7, 0x71, /* 0xAC-0xAF */
+	0xD7, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xB2, 0xF5, 0x00, 0x00, 0xDB, 0x6C, /* 0xBC-0xBF */
+	0xDB, 0x60, 0xB5, 0xD7, 0xDB, 0x7D, 0xDB, 0xA7, /* 0xC0-0xC3 */
+	0xDB, 0xAA, 0xB5, 0xD5, 0xDB, 0x68, 0xDB, 0xA3, /* 0xC4-0xC7 */
+	0xDB, 0x69, 0xDB, 0x77, 0xB5, 0xE2, 0xDB, 0x73, /* 0xC8-0xCB */
+	0xB5, 0xDF, 0x00, 0x00, 0xDB, 0x74, 0xDB, 0x5D, /* 0xCC-0xCF */
+	0x00, 0x00, 0xDB, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xB5, 0xE8, 0xDB, 0xA1, 0xDB, 0x75, 0xDB, 0xAC, /* 0xD4-0xD7 */
+	0xDB, 0x70, 0xDF, 0xC8, 0x00, 0x00, 0xDB, 0xAF, /* 0xD8-0xDB */
+	0xB5, 0xE6, 0xDB, 0x6E, 0xDB, 0x7A, 0xB5, 0xE9, /* 0xDC-0xDF */
+	0xB5, 0xD4, 0xDB, 0x72, 0xDB, 0xAD, 0xDB, 0x6B, /* 0xE0-0xE3 */
+	0xDB, 0x64, 0xDB, 0x6F, 0x00, 0x00, 0xDB, 0x63, /* 0xE4-0xE7 */
+	0xDB, 0x61, 0xB5, 0xD0, 0xDB, 0xA5, 0xDB, 0x6A, /* 0xE8-0xEB */
+	0xDB, 0xA8, 0x00, 0x00, 0xDB, 0xA9, 0xB5, 0xD8, /* 0xEC-0xEF */
+	0xB5, 0xDD, 0xB5, 0xD9, 0xB5, 0xE1, 0xDB, 0x7E, /* 0xF0-0xF3 */
+	0xB5, 0xDA, 0xDB, 0x76, 0xDB, 0x66, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xB5, 0xD2, 0xDB, 0x5E, 0xDB, 0xA2, 0xDB, 0xAB, /* 0xF8-0xFB */
+	0xDB, 0x65, 0xB5, 0xE0, 0xDB, 0xB0, 0xDB, 0x71, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_84[512] = {
+	0x00, 0x00, 0xDB, 0x6D, 0x00, 0x00, 0xB5, 0xD1, /* 0x00-0x03 */
+	0xB5, 0xE5, 0x00, 0x00, 0xDB, 0x7C, 0xB5, 0xE7, /* 0x04-0x07 */
+	0x00, 0x00, 0xDB, 0x78, 0xB5, 0xDC, 0xB5, 0xD6, /* 0x08-0x0B */
+	0xB5, 0xDE, 0xB5, 0xD3, 0xB5, 0xE4, 0xDB, 0x79, /* 0x0C-0x0F */
+	0xDB, 0x67, 0xDB, 0x7B, 0xDB, 0x62, 0xDB, 0xA6, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAE, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x5F, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xDF, 0xC7, 0x00, 0x00, 0xDF, 0xDD, /* 0x28-0x2B */
+	0xB8, 0x55, 0xDF, 0xCC, 0x00, 0x00, 0xDF, 0xCA, /* 0x2C-0x2F */
+	0xDF, 0xB5, 0xB8, 0xA9, 0xDF, 0xC5, 0xDF, 0xD9, /* 0x30-0x33 */
+	0xDF, 0xC1, 0xB8, 0xB1, 0xDF, 0xD8, 0xDF, 0xBF, /* 0x34-0x37 */
+	0xB5, 0xE3, 0xDF, 0xCF, 0xDF, 0xC0, 0xDF, 0xD6, /* 0x38-0x3B */
+	0xB8, 0xB0, 0xB8, 0xA8, 0x00, 0x00, 0xDF, 0xAA, /* 0x3C-0x3F */
+	0xDF, 0xB2, 0x00, 0x00, 0xDF, 0xCB, 0xDF, 0xC3, /* 0x40-0x43 */
+	0xDF, 0xDC, 0xDF, 0xC6, 0xB8, 0xB6, 0xDF, 0xD7, /* 0x44-0x47 */
+	0x00, 0x00, 0xB8, 0xAD, 0x00, 0x00, 0xDF, 0xC9, /* 0x48-0x4B */
+	0xDF, 0xD1, 0xDF, 0xB6, 0xDF, 0xD0, 0x00, 0x00, /* 0x4C-0x4F */
+	0xDF, 0xE1, 0xDF, 0xB1, 0xDF, 0xD2, 0x00, 0x00, /* 0x50-0x53 */
+	0xDF, 0xDF, 0x00, 0x00, 0xDF, 0xAB, 0xB5, 0xDB, /* 0x54-0x57 */
+	0x00, 0x00, 0xDF, 0xB9, 0xDF, 0xB8, 0xB8, 0xAF, /* 0x58-0x5B */
+	0x00, 0x00, 0xDF, 0xBC, 0xDF, 0xBE, 0xDF, 0xCD, /* 0x5C-0x5F */
+	0xDF, 0xDE, 0xB8, 0xB2, 0x00, 0x00, 0xB8, 0xB3, /* 0x60-0x63 */
+	0x00, 0x00, 0xDF, 0xB0, 0xB8, 0xAB, 0xDF, 0xB4, /* 0x64-0x67 */
+	0xDF, 0xDA, 0xB8, 0xB4, 0x00, 0x00, 0xB8, 0xAC, /* 0x68-0x6B */
+	0xB8, 0xAE, 0xB8, 0xB5, 0xDF, 0xE0, 0xDF, 0xD3, /* 0x6C-0x6F */
+	0xDF, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xBB, /* 0x70-0x73 */
+	0xDF, 0xBA, 0xB8, 0xAA, 0xDF, 0xAC, 0xB8, 0xA7, /* 0x74-0x77 */
+	0xDF, 0xC4, 0xDF, 0xAD, 0xDF, 0xC2, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0xDF, 0xB7, 0xDF, 0xDB, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0xB8, 0xA6, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xDF, 0xB3, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xDF, 0xAF, 0xDF, 0xD5, 0xDF, 0xAE, /* 0x8C-0x8F */
+	0xBB, 0x60, 0xE3, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xE3, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAC, /* 0x94-0x97 */
+	0xE3, 0xCA, 0xBB, 0x58, 0xE3, 0xBB, 0xE3, 0xC5, /* 0x98-0x9B */
+	0xBB, 0x5B, 0xE3, 0xBE, 0xBB, 0x59, 0xE3, 0xAF, /* 0x9C-0x9F */
+	0xE3, 0xCD, 0xE3, 0xAE, 0xE3, 0xC1, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE3, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBF, /* 0xA4-0xA7 */
+	0xE3, 0xC8, 0xE3, 0xC6, 0xE3, 0xBA, 0xE3, 0xB5, /* 0xA8-0xAB */
+	0xE3, 0xB3, 0x00, 0x00, 0xE3, 0xB4, 0xE3, 0xC7, /* 0xAC-0xAF */
+	0xE3, 0xD2, 0xE3, 0xBC, 0xBB, 0x5A, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xE3, 0xB7, 0x00, 0x00, 0xE3, 0xCB, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xBB, 0x5D, 0xE3, 0xB6, 0xE3, 0xB0, 0xE3, 0xC0, /* 0xB8-0xBB */
+	0xBB, 0x61, 0x00, 0x00, 0x00, 0x00, 0xBB, 0x55, /* 0xBC-0xBF */
+	0xBB, 0x5E, 0xE3, 0xB8, 0xE3, 0xB2, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xBB, 0x57, 0xDF, 0xD4, 0xBB, 0x56, 0xE3, 0xC3, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xBB, 0x54, 0xBB, 0x63, 0xBB, 0x5C, /* 0xC8-0xCB */
+	0xE3, 0xC4, 0xE3, 0xB9, 0xE3, 0xB1, 0xE3, 0xCC, /* 0xCC-0xCF */
+	0xE3, 0xBD, 0xBB, 0x62, 0xE3, 0xD0, 0xBB, 0x5F, /* 0xD0-0xD3 */
+	0xE3, 0xCF, 0x00, 0x00, 0xE3, 0xC9, 0xE3, 0xCE, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD1, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x73, /* 0xE4-0xE7 */
+	0xE7, 0x74, 0xE7, 0x67, 0xE7, 0x66, 0xE7, 0x62, /* 0xE8-0xEB */
+	0xBD, 0xB4, 0x00, 0x00, 0xBD, 0xAC, 0xE7, 0x76, /* 0xEC-0xEF */
+	0xE7, 0x75, 0xDF, 0xA9, 0xE7, 0x5F, 0xE7, 0x63, /* 0xF0-0xF3 */
+	0xE7, 0x5D, 0x00, 0x00, 0xE7, 0x70, 0xE7, 0x61, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE7, 0x77, 0xE7, 0x5A, 0xE7, 0x58, /* 0xF8-0xFB */
+	0xE7, 0x64, 0xE7, 0x6E, 0xE7, 0x69, 0xBD, 0xB6, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_85[512] = {
+	0xE7, 0x4F, 0x00, 0x00, 0xE7, 0x6D, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xBD, 0xB7, 0xDF, 0xBD, /* 0x04-0x07 */
+	0xE7, 0x5B, 0xE7, 0x52, 0xE7, 0x55, 0xE7, 0x7B, /* 0x08-0x0B */
+	0xE7, 0x5C, 0xE7, 0x53, 0xE7, 0x51, 0xE7, 0x4E, /* 0x0C-0x0F */
+	0x00, 0x00, 0xBD, 0xB0, 0xE7, 0x65, 0xBD, 0xAF, /* 0x10-0x13 */
+	0xBD, 0xB3, 0xE7, 0x60, 0xE7, 0x68, 0xBD, 0xA9, /* 0x14-0x17 */
+	0xE7, 0x78, 0xE7, 0x7C, 0xBD, 0xAB, 0x00, 0x00, /* 0x18-0x1B */
+	0xE7, 0x57, 0xE7, 0x6B, 0xE7, 0x6F, 0xE7, 0x54, /* 0x1C-0x1F */
+	0xE7, 0x79, 0xBD, 0xB2, 0x00, 0x00, 0xBD, 0xB1, /* 0x20-0x23 */
+	0xE7, 0x4C, 0xBD, 0xB5, 0xE7, 0x72, 0xE7, 0x56, /* 0x24-0x27 */
+	0xE7, 0x6A, 0xE7, 0x50, 0xE7, 0x5E, 0xE7, 0x59, /* 0x28-0x2B */
+	0xBD, 0xAD, 0xBD, 0xAE, 0xE7, 0x6C, 0xE7, 0x7D, /* 0x2C-0x2F */
+	0xE7, 0x7A, 0xE7, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4D, /* 0x38-0x3B */
+	0x00, 0x00, 0xBD, 0xAA, 0xEB, 0x49, 0x00, 0x00, /* 0x3C-0x3F */
+	0xEB, 0x40, 0xEB, 0x43, 0x00, 0x00, 0xBF, 0xBB, /* 0x40-0x43 */
+	0xEB, 0x45, 0xEA, 0xF9, 0xEB, 0x41, 0xEB, 0x47, /* 0x44-0x47 */
+	0xBF, 0xB8, 0xBF, 0xBC, 0xBF, 0xB6, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0xEA, 0xFB, 0xEB, 0x4C, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0xEB, 0x46, 0x00, 0x00, 0xEA, 0xFC, /* 0x50-0x53 */
+	0xEB, 0x55, 0xEB, 0x4F, 0xEA, 0xF8, 0xEE, 0x46, /* 0x54-0x57 */
+	0xEA, 0xFE, 0xBF, 0xB7, 0x00, 0x00, 0xEB, 0x4A, /* 0x58-0x5B */
+	0x00, 0x00, 0xEB, 0x54, 0xBF, 0xBF, 0x00, 0x00, /* 0x5C-0x5F */
+	0xEB, 0x51, 0xEA, 0xFD, 0xEB, 0x44, 0xEB, 0x48, /* 0x60-0x63 */
+	0xEB, 0x42, 0xEB, 0x56, 0xEB, 0x53, 0xEB, 0x50, /* 0x64-0x67 */
+	0xBF, 0xB9, 0xBF, 0xBA, 0xBF, 0xBE, 0xEA, 0xFA, /* 0x68-0x6B */
+	0xEB, 0x57, 0xBF, 0xBD, 0xEB, 0x4D, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xEB, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xEB, 0x4E, 0xEE, 0x53, 0xEE, 0x40, /* 0x74-0x77 */
+	0xEE, 0x45, 0xEE, 0x52, 0xEE, 0x44, 0xED, 0xFB, /* 0x78-0x7B */
+	0xEE, 0x41, 0x00, 0x00, 0xC1, 0xA2, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xED, 0xF4, 0xEE, 0x4D, 0xEE, 0x4F, 0xED, 0xF3, /* 0x80-0x83 */
+	0xC1, 0xA1, 0xEE, 0x51, 0xEE, 0x49, 0xC1, 0xA8, /* 0x84-0x87 */
+	0xEE, 0x50, 0xEE, 0x42, 0xC1, 0xAA, 0xED, 0xF9, /* 0x88-0x8B */
+	0xEB, 0x52, 0xEE, 0x4A, 0xEE, 0x47, 0xED, 0xF5, /* 0x8C-0x8F */
+	0xEE, 0x55, 0xC1, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xC1, 0xA5, 0xED, 0xF7, 0xEE, 0x48, 0x00, 0x00, /* 0x94-0x97 */
+	0xEE, 0x54, 0xEE, 0x4B, 0xED, 0xFD, 0xC1, 0xA7, /* 0x98-0x9B */
+	0xC1, 0xA3, 0xEE, 0x4C, 0xED, 0xFE, 0xEE, 0x56, /* 0x9C-0x9F */
+	0xED, 0xF8, 0xEE, 0x43, 0xEE, 0x4E, 0xED, 0xFA, /* 0xA0-0xA3 */
+	0xED, 0xFC, 0x00, 0x00, 0xC2, 0xCB, 0xED, 0xF6, /* 0xA4-0xA7 */
+	0xC1, 0xA9, 0xC2, 0xC4, 0xC1, 0x7E, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xA6, /* 0xAC-0xAF */
+	0xC2, 0xC8, 0xF0, 0xB3, 0x00, 0x00, 0xF0, 0xA9, /* 0xB0-0xB3 */
+	0xF0, 0xA4, 0xF0, 0xAA, 0xF0, 0xB4, 0xF0, 0xB8, /* 0xB4-0xB7 */
+	0xF0, 0xB7, 0xC2, 0xCA, 0xC2, 0xC9, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xF0, 0xAB, 0xF0, 0xB9, 0xF0, 0xAE, /* 0xBC-0xBF */
+	0xF0, 0xA6, 0x00, 0x00, 0xF0, 0xA8, 0xF0, 0xA7, /* 0xC0-0xC3 */
+	0xF0, 0xAD, 0xF0, 0xB2, 0xF0, 0xA5, 0xF0, 0xAC, /* 0xC4-0xC7 */
+	0xF0, 0xB1, 0xC2, 0xC7, 0x00, 0x00, 0xF0, 0xAF, /* 0xC8-0xCB */
+	0x00, 0x00, 0xC2, 0xC5, 0xF0, 0xB0, 0xC2, 0xC3, /* 0xCC-0xCF */
+	0xC2, 0xC6, 0xF2, 0xD5, 0xF0, 0xB5, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xC3, 0xC2, 0x00, 0x00, 0xF2, 0xCD, /* 0xD4-0xD7 */
+	0xF2, 0xD1, 0xF2, 0xC9, 0xF2, 0xCC, 0x00, 0x00, /* 0xD8-0xDB */
+	0xF2, 0xD4, 0xC3, 0xC0, 0xF2, 0xD9, 0xF2, 0xD2, /* 0xDC-0xDF */
+	0x00, 0x00, 0xF2, 0xCA, 0xF2, 0xDA, 0xF2, 0xD3, /* 0xE0-0xE3 */
+	0xC3, 0xC3, 0xC3, 0xC4, 0xF2, 0xD7, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xF2, 0xCB, 0xC3, 0xBF, 0xC3, 0xC1, 0xF2, 0xC6, /* 0xE8-0xEB */
+	0xF2, 0xCE, 0xF2, 0xC8, 0x00, 0x00, 0xF2, 0xD8, /* 0xEC-0xEF */
+	0xF2, 0xD6, 0xF2, 0xC7, 0xF2, 0xCF, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xF4, 0xBE, 0xC3, 0xC5, /* 0xF4-0xF7 */
+	0xF2, 0xD0, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xA6, /* 0xF8-0xFB */
+	0x00, 0x00, 0xF4, 0xC3, 0xF4, 0xBB, 0xF4, 0xB9, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_86[512] = {
+	0xF4, 0xBD, 0xF4, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0xF4, 0xBF, 0xF4, 0xC1, 0xC4, 0xAA, 0xC4, 0xAC, /* 0x04-0x07 */
+	0x00, 0x00, 0xF4, 0xC0, 0xC4, 0xAD, 0xC4, 0xAB, /* 0x08-0x0B */
+	0xF4, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xC4, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xF4, /* 0x14-0x17 */
+	0xF5, 0xF1, 0xF5, 0xF7, 0xC4, 0xF6, 0xF4, 0xBC, /* 0x18-0x1B */
+	0xF5, 0xF6, 0x00, 0x00, 0xF5, 0xFD, 0xF5, 0xF4, /* 0x1C-0x1F */
+	0xF5, 0xFB, 0xF5, 0xFA, 0xF4, 0xB8, 0xF5, 0xF5, /* 0x20-0x23 */
+	0xF0, 0xB6, 0xF5, 0xFE, 0xF5, 0xF3, 0xF5, 0xF8, /* 0x24-0x27 */
+	0x00, 0x00, 0xF5, 0xFC, 0xF5, 0xF2, 0x00, 0x00, /* 0x28-0x2B */
+	0xF7, 0x4A, 0xC4, 0xF5, 0xF5, 0xF9, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xF7, 0xF4, 0xF7, 0x4B, 0xF7, 0x49, /* 0x30-0x33 */
+	0xF7, 0x47, 0xF7, 0x48, 0xF7, 0x4C, 0x00, 0x00, /* 0x34-0x37 */
+	0xC5, 0xD9, 0xF7, 0xF2, 0xF7, 0xF0, 0xF7, 0xF5, /* 0x38-0x3B */
+	0xF7, 0xF3, 0x00, 0x00, 0xF7, 0xF6, 0xC5, 0xDA, /* 0x3C-0x3F */
+	0xF7, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xBC, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0x45, 0xF9, 0x46, /* 0x44-0x47 */
+	0xF9, 0x47, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC7, /* 0x48-0x4B */
+	0xF9, 0xBD, 0xCA, 0x4F, 0xAA, 0xEA, 0x00, 0x00, /* 0x4C-0x4F */
+	0xAD, 0x68, 0x00, 0x00, 0xD3, 0xB8, 0xD3, 0xB7, /* 0x50-0x53 */
+	0xB0, 0x40, 0xB3, 0x42, 0xD7, 0x7C, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0xD7, 0x7B, 0x00, 0x00, 0xB5, 0xEA, /* 0x58-0x5B */
+	0xB8, 0xB8, 0x00, 0x00, 0xB8, 0xB7, 0xB8, 0xB9, /* 0x5C-0x5F */
+	0x00, 0x00, 0xE3, 0xD4, 0xE7, 0x7E, 0xEB, 0x58, /* 0x60-0x63 */
+	0xEB, 0x5A, 0xEB, 0x59, 0x00, 0x00, 0xC1, 0xAB, /* 0x64-0x67 */
+	0xEE, 0x57, 0xF0, 0xBA, 0xF9, 0xA5, 0xA6, 0xE4, /* 0x68-0x6B */
+	0x00, 0x00, 0xCD, 0xC9, 0xCD, 0xCA, 0xCD, 0xC8, /* 0x6C-0x6F */
+	0xCD, 0xC7, 0xAA, 0xEB, 0x00, 0x00, 0xD0, 0xA9, /* 0x70-0x73 */
+	0xD0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA6, /* 0x74-0x77 */
+	0x00, 0x00, 0xAD, 0x69, 0xAD, 0x6B, 0xAD, 0x6A, /* 0x78-0x7B */
+	0xD0, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xD3, 0xC4, 0xD3, 0xC1, 0xD3, 0xBF, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0xB0, 0x41, 0xD3, 0xC2, /* 0x88-0x8B */
+	0xB0, 0x46, 0xD3, 0xBC, 0xD3, 0xCB, 0x00, 0x00, /* 0x8C-0x8F */
+	0xD3, 0xCD, 0xD3, 0xBD, 0x00, 0x00, 0xB0, 0x43, /* 0x90-0x93 */
+	0xD3, 0xCE, 0xD3, 0xC9, 0xD3, 0xBB, 0xD3, 0xC0, /* 0x94-0x97 */
+	0xD3, 0xCA, 0xD3, 0xC6, 0xD3, 0xC3, 0x00, 0x00, /* 0x98-0x9B */
+	0xB0, 0x48, 0xD3, 0xCC, 0xD3, 0xBE, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xD3, 0xC7, 0xD3, 0xB9, 0xB0, 0x47, /* 0xA0-0xA3 */
+	0xB0, 0x44, 0xD3, 0xC5, 0x00, 0x00, 0xD3, 0xC8, /* 0xA4-0xA7 */
+	0xD3, 0xBA, 0xB0, 0x45, 0xB0, 0x42, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x4C, /* 0xAC-0xAF */
+	0xD7, 0xA5, 0xB3, 0x4B, 0x00, 0x00, 0xD7, 0xA8, /* 0xB0-0xB3 */
+	0xD7, 0xAB, 0xB3, 0x48, 0xB3, 0x46, 0xD7, 0x7E, /* 0xB4-0xB7 */
+	0xD7, 0xA9, 0xD7, 0xA7, 0xD7, 0xA4, 0xD7, 0xAC, /* 0xB8-0xBB */
+	0xD7, 0xAD, 0xD7, 0xAF, 0xD7, 0xB0, 0xD7, 0x7D, /* 0xBC-0xBF */
+	0xB3, 0x45, 0xD7, 0xA2, 0xD7, 0xA1, 0xD7, 0xAE, /* 0xC0-0xC3 */
+	0xB3, 0x47, 0xD7, 0xA3, 0xB3, 0x49, 0xB3, 0x44, /* 0xC4-0xC7 */
+	0xD7, 0xA6, 0xB3, 0x4D, 0x00, 0x00, 0xB3, 0x4A, /* 0xC8-0xCB */
+	0xD7, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xB5, 0xF1, 0xDB, 0xBF, 0x00, 0x00, 0xDB, 0xB4, /* 0xD0-0xD3 */
+	0xB5, 0xEE, 0x00, 0x00, 0xDF, 0xE7, 0xDB, 0xBD, /* 0xD4-0xD7 */
+	0xDB, 0xB1, 0xB5, 0xEC, 0xDB, 0xB6, 0xB5, 0xEF, /* 0xD8-0xDB */
+	0xDB, 0xBA, 0xDB, 0xB8, 0xB5, 0xF2, 0xB5, 0xEB, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xB2, 0xDB, 0xB5, /* 0xE0-0xE3 */
+	0xB5, 0xF0, 0x00, 0x00, 0xDB, 0xB3, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xDB, 0xBE, 0xDB, 0xBC, 0xDB, 0xB7, 0xDB, 0xB9, /* 0xE8-0xEB */
+	0xDB, 0xBB, 0xB5, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xDF, 0xE8, 0xDF, 0xEE, 0xDF, 0xE4, /* 0xF4-0xF7 */
+	0xDF, 0xEA, 0xB8, 0xBA, 0xDF, 0xE6, 0xB8, 0xC0, /* 0xF8-0xFB */
+	0x00, 0x00, 0x00, 0x00, 0xB8, 0xBF, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_87[512] = {
+	0xB8, 0xBE, 0xDF, 0xED, 0xB8, 0xC1, 0xB8, 0xC2, /* 0x00-0x03 */
+	0xDF, 0xE3, 0xDF, 0xF0, 0xB8, 0xC3, 0xB8, 0xBD, /* 0x04-0x07 */
+	0xB8, 0xBC, 0xDF, 0xEC, 0xB8, 0xC4, 0xDF, 0xE2, /* 0x08-0x0B */
+	0xDF, 0xE5, 0xDF, 0xEF, 0xDF, 0xEB, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0xE3, 0xF4, 0xE3, 0xE9, 0xB8, 0xBB, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0xBB, 0x6A, 0xE3, 0xDD, 0xE3, 0xF2, 0xE3, 0xDE, /* 0x18-0x1B */
+	0xBB, 0x65, 0x00, 0x00, 0xE3, 0xDB, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE3, 0xE4, 0xE3, 0xDC, 0xBB, 0x67, 0xE3, 0xD6, /* 0x20-0x23 */
+	0xE3, 0xF1, 0xBB, 0x68, 0xE3, 0xEE, 0xE3, 0xEF, /* 0x24-0x27 */
+	0xE3, 0xD7, 0xBB, 0x6D, 0xE3, 0xE6, 0x00, 0x00, /* 0x28-0x2B */
+	0xE3, 0xE0, 0xE3, 0xE7, 0xE3, 0xDA, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE3, 0xF3, 0xE3, 0xEB, 0xE3, 0xE5, 0xE3, 0xD5, /* 0x30-0x33 */
+	0xBB, 0x69, 0xE3, 0xEC, 0x00, 0x00, 0xBB, 0x6C, /* 0x34-0x37 */
+	0xE3, 0xF0, 0x00, 0x00, 0xE3, 0xEA, 0xBB, 0x66, /* 0x38-0x3B */
+	0xE3, 0xE8, 0x00, 0x00, 0xE3, 0xE2, 0xBB, 0x64, /* 0x3C-0x3F */
+	0xE3, 0xD9, 0xE3, 0xE1, 0xE3, 0xED, 0xE3, 0xDF, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xE3, 0xE3, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0xBD, 0xC1, 0xDF, 0xE9, 0xE7, 0xB2, 0xE7, 0xBB, /* 0x4C-0x4F */
+	0xE7, 0xB1, 0xE7, 0xAD, 0xE7, 0xAA, 0xBD, 0xC2, /* 0x50-0x53 */
+	0xE7, 0xA8, 0xBB, 0x6B, 0xE7, 0xA1, 0xBD, 0xC0, /* 0x54-0x57 */
+	0xE7, 0xA7, 0xBD, 0xBF, 0xE7, 0xAC, 0xE7, 0xA9, /* 0x58-0x5B */
+	0xE7, 0xB9, 0xE7, 0xB4, 0xE7, 0xAE, 0xE7, 0xB3, /* 0x5C-0x5F */
+	0xBD, 0xBB, 0xE7, 0xAB, 0xE7, 0xBE, 0xE7, 0xA2, /* 0x60-0x63 */
+	0xE7, 0xA3, 0xE7, 0xBA, 0xBD, 0xBC, 0xE7, 0xBF, /* 0x64-0x67 */
+	0xBD, 0xBE, 0xE7, 0xC0, 0xE7, 0xB0, 0xE3, 0xD8, /* 0x68-0x6B */
+	0xE7, 0xB6, 0xE7, 0xAF, 0xE7, 0xB8, 0xE7, 0xB5, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xA6, /* 0x70-0x73 */
+	0xBD, 0xB9, 0xE7, 0xBD, 0xBD, 0xBA, 0xE7, 0xA4, /* 0x74-0x77 */
+	0xBD, 0xBD, 0xEB, 0x64, 0xE7, 0xB7, 0xE7, 0xBC, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xEB, 0x61, 0xBD, 0xB8, 0xBF, 0xC0, /* 0x80-0x83 */
+	0xEB, 0x6B, 0xEB, 0x67, 0x00, 0x00, 0xEB, 0x65, /* 0x84-0x87 */
+	0xEB, 0x60, 0xEB, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xBF, 0xC4, 0x00, 0x00, 0xEB, 0x5C, /* 0x8C-0x8F */
+	0xEB, 0x68, 0xEB, 0x69, 0xEB, 0x5F, 0xEB, 0x5E, /* 0x90-0x93 */
+	0xEB, 0x6C, 0x00, 0x00, 0xEB, 0x62, 0xEB, 0x5D, /* 0x94-0x97 */
+	0xEB, 0x63, 0x00, 0x00, 0xEB, 0x6E, 0xEB, 0x5B, /* 0x98-0x9B */
+	0xEB, 0x6D, 0xEB, 0x6A, 0xBF, 0xC2, 0xBF, 0xC1, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0xBF, 0xC3, 0xEB, 0x66, /* 0xA0-0xA3 */
+	0xF0, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0x59, 0xC1, 0xB1, /* 0xA8-0xAB */
+	0xEE, 0x5D, 0xEE, 0x5A, 0xEE, 0x61, 0xEE, 0x67, /* 0xAC-0xAF */
+	0xEE, 0x5C, 0x00, 0x00, 0xEE, 0x70, 0xC1, 0xAE, /* 0xB0-0xB3 */
+	0xEE, 0x6A, 0xEE, 0x5F, 0xEE, 0x6B, 0xEE, 0x66, /* 0xB4-0xB7 */
+	0xEE, 0x6D, 0xEE, 0x5E, 0xC1, 0xB3, 0xC1, 0xB2, /* 0xB8-0xBB */
+	0xEE, 0x60, 0xEE, 0x6E, 0xEE, 0x58, 0xEE, 0x6C, /* 0xBC-0xBF */
+	0xC1, 0xAC, 0x00, 0x00, 0xEE, 0x64, 0xEE, 0x63, /* 0xC0-0xC3 */
+	0xEE, 0x68, 0xEE, 0x5B, 0xC1, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xC1, 0xB4, 0xEE, 0x62, 0xEE, 0x69, 0xC1, 0xB5, /* 0xC8-0xCB */
+	0xEE, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xC1, 0xAD, 0xC1, 0xAF, 0xF0, 0xC7, /* 0xD0-0xD3 */
+	0xF0, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xCC, /* 0xD4-0xD7 */
+	0xF0, 0xC9, 0xF0, 0xCD, 0x00, 0x00, 0xF0, 0xBE, /* 0xD8-0xDB */
+	0xF0, 0xC6, 0xF0, 0xD1, 0xEE, 0x6F, 0xF0, 0xC2, /* 0xDC-0xDF */
+	0xC2, 0xCF, 0xE7, 0xA5, 0xF0, 0xBD, 0xF0, 0xCA, /* 0xE0-0xE3 */
+	0xF0, 0xC4, 0xF0, 0xC1, 0xF0, 0xBC, 0xF0, 0xBB, /* 0xE4-0xE7 */
+	0xF0, 0xD0, 0x00, 0x00, 0xF0, 0xC0, 0xF0, 0xBF, /* 0xE8-0xEB */
+	0xC2, 0xCD, 0xF0, 0xC8, 0x00, 0x00, 0xC2, 0xCC, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xC2, 0xCE, 0xF0, 0xC3, /* 0xF0-0xF3 */
+	0xF0, 0xCF, 0x00, 0x00, 0xF2, 0xDE, 0xF2, 0xDF, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xC3, 0xC9, 0xF2, 0xDC, 0xC3, 0xC6, /* 0xF8-0xFB */
+	0xF2, 0xE4, 0x00, 0x00, 0xC3, 0xCA, 0xF2, 0xE6, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_88[512] = {
+	0xF2, 0xDB, 0xF0, 0xCE, 0xF2, 0xE8, 0xF2, 0xDD, /* 0x00-0x03 */
+	0x00, 0x00, 0xC3, 0xC7, 0xF2, 0xE3, 0x00, 0x00, /* 0x04-0x07 */
+	0xF2, 0xE5, 0xF2, 0xE0, 0xF2, 0xE7, 0xF2, 0xE2, /* 0x08-0x0B */
+	0xF2, 0xE1, 0xC3, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xF4, 0xC5, 0xF4, 0xC6, 0x00, 0x00, 0xF4, 0xC8, /* 0x10-0x13 */
+	0xC4, 0xAE, 0xC4, 0xAF, 0xF4, 0xC9, 0xF4, 0xC7, /* 0x14-0x17 */
+	0x00, 0x00, 0xF4, 0xC4, 0x00, 0x00, 0xF6, 0x42, /* 0x18-0x1B */
+	0xF6, 0x45, 0xF6, 0x41, 0x00, 0x00, 0xC4, 0xFA, /* 0x1C-0x1F */
+	0xF6, 0x43, 0xC4, 0xF9, 0xC4, 0xF8, 0xC4, 0xF7, /* 0x20-0x23 */
+	0xF6, 0x44, 0xF7, 0x51, 0xF7, 0x4F, 0x00, 0x00, /* 0x24-0x27 */
+	0xF7, 0x4E, 0xF6, 0x40, 0xF7, 0x50, 0xF6, 0x46, /* 0x28-0x2B */
+	0xF7, 0x4D, 0x00, 0x00, 0xF7, 0xF9, 0xF7, 0xD7, /* 0x2C-0x2F */
+	0xF7, 0xF7, 0xC5, 0xDB, 0xF7, 0xF8, 0xF7, 0xFA, /* 0x30-0x33 */
+	0x00, 0x00, 0xF8, 0xBF, 0xC5, 0xFA, 0xF8, 0xBE, /* 0x34-0x37 */
+	0xF8, 0xBD, 0xC5, 0xFB, 0x00, 0x00, 0xC6, 0x5A, /* 0x38-0x3B */
+	0xF9, 0x6E, 0xF9, 0xA7, 0xF9, 0xA6, 0xF9, 0xA8, /* 0x3C-0x3F */
+	0xA6, 0xE5, 0xD0, 0xAA, 0x00, 0x00, 0xD3, 0xCF, /* 0x40-0x43 */
+	0xD3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0xDB, 0xC0, 0x00, 0x00, 0xF6, 0x47, 0xF8, 0xC0, /* 0x48-0x4B */
+	0xA6, 0xE6, 0xAD, 0x6C, 0xD0, 0xAB, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xD7, 0xB1, 0xB3, 0x4E, /* 0x50-0x53 */
+	0x00, 0x00, 0xDB, 0xC2, 0xDB, 0xC1, 0xB5, 0xF3, /* 0x54-0x57 */
+	0x00, 0x00, 0xB8, 0xC5, 0xE7, 0xC1, 0xBD, 0xC3, /* 0x58-0x5B */
+	0x00, 0x00, 0xBD, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0xBF, 0xC5, 0xC5, 0xFC, 0xA6, 0xE7, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAC, /* 0x64-0x67 */
+	0xAA, 0xED, 0xD0, 0xAE, 0xD0, 0xAD, 0xAD, 0x6D, /* 0x68-0x6B */
+	0x00, 0x00, 0xD3, 0xD1, 0x00, 0x00, 0xD3, 0xD8, /* 0x6C-0x6F */
+	0xB0, 0x49, 0xD3, 0xD6, 0xD3, 0xD4, 0x00, 0x00, /* 0x70-0x73 */
+	0xD3, 0xDB, 0xD3, 0xD2, 0xD3, 0xD3, 0xB0, 0x4A, /* 0x74-0x77 */
+	0x00, 0x00, 0xB0, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xD3, 0xDC, 0xB0, 0x4D, 0xD3, 0xDA, 0xD3, 0xD7, /* 0x7C-0x7F */
+	
+	0xD3, 0xD5, 0xB0, 0x4B, 0xB0, 0x4C, 0xD3, 0xD9, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xB3, 0x50, 0xD7, 0xB2, 0x00, 0x00, 0xB3, 0x55, /* 0x88-0x8B */
+	0xD7, 0xC2, 0xB3, 0x54, 0xD7, 0xC4, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xD7, 0xB8, 0xB3, 0x52, 0xD7, 0xC3, /* 0x90-0x93 */
+	0x00, 0x00, 0xD7, 0xB3, 0xB3, 0x53, 0xD7, 0xBF, /* 0x94-0x97 */
+	0xD7, 0xBB, 0xD7, 0xBD, 0xD7, 0xB7, 0xD7, 0xBE, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0x4F, 0xD7, 0xBA, /* 0x9C-0x9F */
+	0x00, 0x00, 0xD7, 0xB9, 0xD7, 0xB5, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xD7, 0xC0, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBC, /* 0xA4-0xA7 */
+	0xD7, 0xB4, 0x00, 0x00, 0xD7, 0xB6, 0xB3, 0x51, /* 0xA8-0xAB */
+	0xD7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0xB5, 0xF6, 0xDB, 0xCD, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xDB, 0xC9, 0xDB, 0xCB, /* 0xB4-0xB7 */
+	0xDB, 0xC6, 0xDB, 0xC5, 0xDB, 0xC3, 0x00, 0x00, /* 0xB8-0xBB */
+	0xDB, 0xCA, 0xDB, 0xCC, 0xDB, 0xC8, 0x00, 0x00, /* 0xBC-0xBF */
+	0xDB, 0xC7, 0xB5, 0xF4, 0xB5, 0xF5, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xDB, 0xCF, 0xB8, 0xCD, 0xDF, 0xF2, /* 0xC8-0xCB */
+	0xDF, 0xF8, 0xDF, 0xF3, 0xDF, 0xF4, 0xF9, 0xD8, /* 0xCC-0xCF */
+	0xDF, 0xF9, 0x00, 0x00, 0xB8, 0xCF, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xB8, 0xC7, 0xB8, 0xCE, 0xDF, 0xF1, 0xDB, 0xC4, /* 0xD4-0xD7 */
+	0xB8, 0xCA, 0xB8, 0xC8, 0xDF, 0xF7, 0xDF, 0xF6, /* 0xD8-0xDB */
+	0xB8, 0xC9, 0xB8, 0xCB, 0xDF, 0xF5, 0xB8, 0xC6, /* 0xDC-0xDF */
+	0x00, 0x00, 0xB8, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF6, /* 0xE4-0xE7 */
+	0xBB, 0x74, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x42, /* 0xE8-0xEB */
+	0xE4, 0x41, 0x00, 0x00, 0xE3, 0xFB, 0xBB, 0x76, /* 0xEC-0xEF */
+	0xE4, 0x40, 0xE3, 0xF7, 0xE3, 0xF8, 0xBB, 0x6E, /* 0xF0-0xF3 */
+	0xBB, 0x70, 0x00, 0x00, 0xE3, 0xFD, 0xE3, 0xF5, /* 0xF4-0xF7 */
+	0xBB, 0x72, 0xBB, 0x71, 0xE3, 0xF9, 0xE3, 0xFE, /* 0xF8-0xFB */
+	0xE3, 0xFC, 0xBB, 0x73, 0xE3, 0xFA, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_89[512] = {
+	0x00, 0x00, 0xDB, 0xCE, 0xBB, 0x6F, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xE7, 0xC2, 0xE7, 0xC9, 0xBD, 0xC6, /* 0x04-0x07 */
+	0x00, 0x00, 0xE7, 0xCD, 0xBD, 0xCA, 0xE7, 0xC5, /* 0x08-0x0B */
+	0xE7, 0xC3, 0x00, 0x00, 0xE7, 0xCC, 0x00, 0x00, /* 0x0C-0x0F */
+	0xBD, 0xC5, 0xE7, 0xCB, 0xBD, 0xC7, 0xBD, 0xC8, /* 0x10-0x13 */
+	0xE7, 0xC4, 0xBD, 0xC9, 0xE7, 0xCA, 0xE7, 0xC6, /* 0x14-0x17 */
+	0xE7, 0xC7, 0xE7, 0xC8, 0xBB, 0x75, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0x70, 0xEB, 0x7C, /* 0x1C-0x1F */
+	0x00, 0x00, 0xBF, 0xCA, 0xEB, 0x77, 0xEB, 0x79, /* 0x20-0x23 */
+	0x00, 0x00, 0xBF, 0xC8, 0xEB, 0x71, 0xEB, 0x75, /* 0x24-0x27 */
+	0x00, 0x00, 0xEB, 0x78, 0xBF, 0xC6, 0xBF, 0xC9, /* 0x28-0x2B */
+	0xEB, 0x7B, 0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x7A, /* 0x2C-0x2F */
+	0xEB, 0x72, 0xEB, 0x76, 0xBF, 0xC7, 0xEE, 0x72, /* 0x30-0x33 */
+	0x00, 0x00, 0xEE, 0x71, 0xC1, 0xB7, 0xEE, 0x77, /* 0x34-0x37 */
+	0xC1, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xB6, /* 0x38-0x3B */
+	0xEE, 0x73, 0xC1, 0xBA, 0xEE, 0x74, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xEE, 0x75, 0xEE, 0x78, 0x00, 0x00, /* 0x40-0x43 */
+	0xC1, 0xB8, 0x00, 0x00, 0xF0, 0xD6, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xF0, 0xD9, 0x00, 0x00, 0xF0, 0xD3, /* 0x48-0x4B */
+	0xF0, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD4, /* 0x4C-0x4F */
+	0xF0, 0xD7, 0xF0, 0xD8, 0xEE, 0x76, 0xF0, 0xD2, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xC3, 0xCD, 0xF2, 0xEC, /* 0x54-0x57 */
+	0xF2, 0xEF, 0xF2, 0xF1, 0xF2, 0xEA, 0xF2, 0xEB, /* 0x58-0x5B */
+	0xF2, 0xEE, 0xF2, 0xF0, 0xC3, 0xCE, 0xC3, 0xCC, /* 0x5C-0x5F */
+	0xC3, 0xCB, 0xF2, 0xED, 0xF2, 0xE9, 0xF4, 0xCA, /* 0x60-0x63 */
+	0xC4, 0xB0, 0x00, 0x00, 0xF4, 0xCB, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0xF6, 0x49, 0xC4, 0xFB, 0xF6, 0x4B, /* 0x68-0x6B */
+	0xC4, 0xFC, 0xF6, 0x48, 0xF6, 0x4A, 0xC5, 0xA8, /* 0x6C-0x6F */
+	0x00, 0x00, 0xF7, 0x52, 0xC5, 0xA7, 0xF7, 0xFD, /* 0x70-0x73 */
+	0xF7, 0xFC, 0x00, 0x00, 0xF7, 0xFB, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xF9, 0x48, 0xF9, 0x49, 0xF9, 0x4B, /* 0x78-0x7B */
+	0xF9, 0x4A, 0x00, 0x00, 0xCA, 0x50, 0xA6, 0xE8, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xAD, 0x6E, 0xD7, 0xC5, 0xB5, 0xF7, /* 0x80-0x83 */
+	0x00, 0x00, 0xDF, 0xFA, 0xC2, 0xD0, 0x00, 0x00, /* 0x84-0x87 */
+	0xF2, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA3, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x57, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x56, /* 0x90-0x93 */
+	0x00, 0x00, 0xDB, 0xD0, 0xB5, 0xF8, 0xDB, 0xD2, /* 0x94-0x97 */
+	0xDB, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFB, /* 0x98-0x9B */
+	0xB8, 0xD0, 0xE4, 0x43, 0xE4, 0x46, 0xE4, 0x45, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE4, 0x44, 0xE7, 0xCE, 0xE7, 0xD0, /* 0xA0-0xA3 */
+	0xE7, 0xCF, 0x00, 0x00, 0xBF, 0xCC, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xBF, 0xCB, 0x00, 0x00, /* 0xA8-0xAB */
+	0xC1, 0xBB, 0xEE, 0x79, 0xEE, 0x7B, 0xEE, 0x7A, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xC2, 0xD1, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xF2, 0xF4, 0xF2, 0xF3, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xF4, 0xCC, 0xC4, 0xB1, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xC4, 0xFD, 0xF7, 0x54, 0xF7, 0x53, /* 0xBC-0xBF */
+	0xC6, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xA4, 0xD0, 0xAF, /* 0xD0-0xD3 */
+	0xAD, 0x6F, 0xD7, 0xC8, 0xD7, 0xC6, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xD7, 0xC7, 0xDB, 0xD4, 0xDB, 0xD5, /* 0xD8-0xDB */
+	0xE0, 0x43, 0xDB, 0xD3, 0x00, 0x00, 0xDF, 0xFC, /* 0xDC-0xDF */
+	0xE0, 0x41, 0xE0, 0x40, 0xE0, 0x42, 0xB8, 0xD1, /* 0xE0-0xE3 */
+	0xDF, 0xFE, 0xDF, 0xFD, 0xE0, 0x44, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xE4, 0x49, 0xE4, 0x47, 0x00, 0x00, 0xE4, 0x48, /* 0xE8-0xEB */
+	0xE7, 0xD3, 0xE7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xE7, 0xD2, 0xEB, 0x7D, 0xEE, 0x7C, 0xEE, 0x7D, /* 0xF0-0xF3 */
+	0xC2, 0xD2, 0x00, 0x00, 0xF2, 0xF5, 0xF4, 0xCD, /* 0xF4-0xF7 */
+	0xC4, 0xB2, 0x00, 0x00, 0xF6, 0x4C, 0xF7, 0x55, /* 0xF8-0xFB */
+	0xC5, 0xA9, 0x00, 0x00, 0xF7, 0xFE, 0xF9, 0x4C, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8A[512] = {
+	0xA8, 0xA5, 0x00, 0x00, 0xAD, 0x71, 0xAD, 0x72, /* 0x00-0x03 */
+	0xD0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB1, /* 0x04-0x07 */
+	0xAD, 0x70, 0x00, 0x00, 0xB0, 0x54, 0x00, 0x00, /* 0x08-0x0B */
+	0xB0, 0x52, 0x00, 0x00, 0xB0, 0x51, 0xB0, 0x58, /* 0x0C-0x0F */
+	0xB0, 0x50, 0xB0, 0x59, 0xD3, 0xDD, 0xB0, 0x56, /* 0x10-0x13 */
+	0x00, 0x00, 0xB0, 0x53, 0xB0, 0x57, 0xB0, 0x55, /* 0x14-0x17 */
+	0xB0, 0x4F, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x5F, /* 0x18-0x1B */
+	0x00, 0x00, 0xB3, 0x59, 0xD7, 0xCC, 0xB3, 0x5E, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0x60, 0xB3, 0x5A, /* 0x20-0x23 */
+	0x00, 0x00, 0xB3, 0x5B, 0x00, 0x00, 0xD7, 0xCA, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0x58, 0x00, 0x00, /* 0x28-0x2B */
+	0xD7, 0xCB, 0xB3, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xD7, 0xC9, 0xB3, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0xB6, 0x44, 0x00, 0x00, 0xB6, 0x46, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xDB, 0xD8, 0xB6, 0x45, 0xB5, 0xF9, /* 0x38-0x3B */
+	0xB5, 0xFD, 0x00, 0x00, 0xB8, 0xE4, 0xE0, 0x49, /* 0x3C-0x3F */
+	0xDB, 0xDA, 0xB5, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xDB, 0xDD, 0xDB, 0xDE, 0xB6, 0x43, 0x00, 0x00, /* 0x44-0x47 */
+	0xDB, 0xE0, 0x00, 0x00, 0xDB, 0xE2, 0x00, 0x00, /* 0x48-0x4B */
+	0xDB, 0xE3, 0xDB, 0xD7, 0xDB, 0xD6, 0xDB, 0xE4, /* 0x4C-0x4F */
+	0xB6, 0x42, 0xDB, 0xE1, 0xDB, 0xDF, 0x00, 0x00, /* 0x50-0x53 */
+	0xB6, 0x40, 0xB5, 0xFB, 0xB6, 0x47, 0xDB, 0xDB, /* 0x54-0x57 */
+	0xDB, 0xDC, 0xDB, 0xD9, 0x00, 0x00, 0xB6, 0x41, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xB5, 0xFC, 0x00, 0x00, /* 0x5C-0x5F */
+	0xB5, 0xFA, 0xE0, 0x48, 0xB8, 0xDF, 0xB8, 0xDA, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xB8, 0xD5, 0x00, 0x00, /* 0x64-0x67 */
+	0xB8, 0xE5, 0xB8, 0xD6, 0x00, 0x00, 0xB8, 0xD2, /* 0x68-0x6B */
+	0xB8, 0xE1, 0xB8, 0xDE, 0xB8, 0xE0, 0x00, 0x00, /* 0x6C-0x6F */
+	0xB8, 0xD7, 0xB8, 0xDC, 0xB8, 0xD3, 0xB8, 0xD4, /* 0x70-0x73 */
+	0xE0, 0x50, 0xE0, 0x4D, 0xE0, 0x45, 0xE0, 0x4A, /* 0x74-0x77 */
+	0x00, 0x00, 0xB8, 0xE2, 0xE0, 0x51, 0xB8, 0xE3, /* 0x78-0x7B */
+	0xB8, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x47, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xE0, 0x4F, 0xE0, 0x4B, 0xE0, 0x4E, /* 0x80-0x83 */
+	0xE0, 0x4C, 0xB8, 0xDD, 0xE0, 0x46, 0xB8, 0xD8, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x4C, /* 0x88-0x8B */
+	0xBB, 0x78, 0xBB, 0x7B, 0x00, 0x00, 0xE4, 0x4E, /* 0x8C-0x8F */
+	0x00, 0x00, 0xBB, 0xA5, 0xE4, 0x4D, 0xBB, 0x7D, /* 0x90-0x93 */
+	0x00, 0x00, 0xBD, 0xCF, 0xE4, 0x4F, 0x00, 0x00, /* 0x94-0x97 */
+	0xBB, 0xA4, 0xE4, 0x4B, 0xBB, 0xA6, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xBB, 0x79, 0x00, 0x00, /* 0x9C-0x9F */
+	0xB8, 0xDB, 0xBB, 0x7C, 0x00, 0x00, 0xBB, 0x7A, /* 0xA0-0xA3 */
+	0xBB, 0x7E, 0xBB, 0xA2, 0xBB, 0x77, 0xBB, 0xA7, /* 0xA4-0xA7 */
+	0xBB, 0xA3, 0x00, 0x00, 0xBB, 0xA1, 0xE4, 0x4A, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xBD, 0xD6, 0x00, 0x00, 0xBD, 0xD2, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xBD, 0xD9, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xE7, 0xD6, 0xBD, 0xDA, 0xE7, 0xE2, 0xE7, 0xDB, /* 0xB8-0xBB */
+	0xBD, 0xCB, 0xE7, 0xE3, 0xE7, 0xDD, 0xBD, 0xD5, /* 0xBC-0xBF */
+	0xE7, 0xDE, 0x00, 0x00, 0xBD, 0xD4, 0xE7, 0xE1, /* 0xC0-0xC3 */
+	0xBD, 0xCE, 0xE7, 0xDF, 0xE7, 0xD5, 0xBD, 0xCD, /* 0xC4-0xC7 */
+	0xEB, 0xAA, 0xBD, 0xD3, 0x00, 0x00, 0xBD, 0xD0, /* 0xC8-0xCB */
+	0x00, 0x00, 0xBD, 0xD8, 0x00, 0x00, 0xE7, 0xD4, /* 0xCC-0xCF */
+	0x00, 0x00, 0xE7, 0xD8, 0xBD, 0xCC, 0xE7, 0xD7, /* 0xD0-0xD3 */
+	0xE7, 0xD9, 0xE7, 0xDA, 0xBD, 0xD7, 0xE7, 0xDC, /* 0xD4-0xD7 */
+	0xE7, 0xE0, 0xE7, 0xE4, 0x00, 0x00, 0xBD, 0xDB, /* 0xD8-0xDB */
+	0xBF, 0xD2, 0xEB, 0xA5, 0xEB, 0xAB, 0xEB, 0xA8, /* 0xDC-0xDF */
+	0xEB, 0x7E, 0xEB, 0xAC, 0xEB, 0xA1, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xEB, 0xA7, 0x00, 0x00, 0xBF, 0xCD, 0xBF, 0xD3, /* 0xE4-0xE7 */
+	0xEB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xBF, 0xCF, /* 0xE8-0xEB */
+	0x00, 0x00, 0xBF, 0xD9, 0xBF, 0xD4, 0xEB, 0xAF, /* 0xEC-0xEF */
+	0xEB, 0xA9, 0xBF, 0xD0, 0xEB, 0xA2, 0xBF, 0xDA, /* 0xF0-0xF3 */
+	0xEB, 0xA3, 0xEB, 0xA4, 0xBF, 0xDB, 0xBF, 0xD8, /* 0xF4-0xF7 */
+	0xBD, 0xD1, 0x00, 0x00, 0xBF, 0xCE, 0xEB, 0xB0, /* 0xF8-0xFB */
+	0xBF, 0xDC, 0x00, 0x00, 0xBF, 0xD5, 0xEB, 0xAE, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8B[512] = {
+	0xBF, 0xD1, 0xBF, 0xD6, 0xBF, 0xD7, 0x00, 0x00, /* 0x00-0x03 */
+	0xC1, 0xC3, 0xEE, 0xA4, 0xEE, 0xAD, 0xEE, 0xAA, /* 0x04-0x07 */
+	0xEE, 0xAC, 0x00, 0x00, 0xC1, 0xC0, 0xEE, 0xA5, /* 0x08-0x0B */
+	0x00, 0x00, 0xEE, 0xAB, 0xC1, 0xBC, 0xEE, 0xA7, /* 0x0C-0x0F */
+	0xC1, 0xC4, 0xEE, 0xA3, 0xEE, 0xA8, 0xEE, 0xAF, /* 0x10-0x13 */
+	0xEB, 0xA6, 0xEE, 0xA9, 0xEE, 0xA2, 0xC1, 0xBD, /* 0x14-0x17 */
+	0xEE, 0xA1, 0xC1, 0xBE, 0xEE, 0xB0, 0xC1, 0xBF, /* 0x18-0x1B */
+	0xEE, 0xAE, 0xC1, 0xC2, 0xEE, 0x7E, 0x00, 0x00, /* 0x1C-0x1F */
+	0xC1, 0xC1, 0x00, 0x00, 0xEE, 0xA6, 0xF0, 0xDC, /* 0x20-0x23 */
+	0xF0, 0xEA, 0xF0, 0xE5, 0xF0, 0xE7, 0xF0, 0xDB, /* 0x24-0x27 */
+	0xC2, 0xD3, 0x00, 0x00, 0xF0, 0xDA, 0xC2, 0xD6, /* 0x28-0x2B */
+	0xC2, 0xD5, 0x00, 0x00, 0xF0, 0xE9, 0xF0, 0xE1, /* 0x2C-0x2F */
+	0xF0, 0xDE, 0xF0, 0xE4, 0x00, 0x00, 0xF0, 0xDD, /* 0x30-0x33 */
+	0x00, 0x00, 0xF0, 0xDF, 0xF0, 0xE8, 0xF0, 0xE6, /* 0x34-0x37 */
+	0x00, 0x00, 0xC2, 0xD4, 0xF0, 0xED, 0xF0, 0xEB, /* 0x38-0x3B */
+	0xF0, 0xE2, 0xF0, 0xEC, 0xF0, 0xE3, 0x00, 0x00, /* 0x3C-0x3F */
+	0xF2, 0xF9, 0xC3, 0xCF, 0xF3, 0x41, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xF6, 0x4F, 0xC3, 0xD6, 0xF0, 0xE0, /* 0x44-0x47 */
+	0xF2, 0xF7, 0xC3, 0xD2, 0xF2, 0xF8, 0xF2, 0xFD, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0xC3, 0xD4, 0xC3, 0xD5, /* 0x4C-0x4F */
+	0xF2, 0xF6, 0xF3, 0x40, 0xF3, 0x42, 0xF2, 0xFA, /* 0x50-0x53 */
+	0xF2, 0xFC, 0xF2, 0xFE, 0xF2, 0xFB, 0xF3, 0x43, /* 0x54-0x57 */
+	0xC3, 0xD1, 0xC3, 0xD7, 0xC3, 0xD3, 0x00, 0x00, /* 0x58-0x5B */
+	0xC3, 0xD0, 0xF4, 0xD0, 0x00, 0x00, 0xC4, 0xB7, /* 0x5C-0x5F */
+	0xF4, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD2, /* 0x60-0x63 */
+	0x00, 0x00, 0xF4, 0xD3, 0xC4, 0xB5, 0xF4, 0xD4, /* 0x64-0x67 */
+	0xF4, 0xD1, 0x00, 0x00, 0xF4, 0xCF, 0xC4, 0xB8, /* 0x68-0x6B */
+	0xC4, 0xB4, 0xF4, 0xD5, 0x00, 0x00, 0xC4, 0xB6, /* 0x6C-0x6F */
+	0xC4, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xC4, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xC5, 0x40, /* 0x74-0x77 */
+	0xF6, 0x4E, 0xF6, 0x4D, 0xF6, 0x50, 0xF6, 0x51, /* 0x78-0x7B */
+	0x00, 0x00, 0xC5, 0x41, 0xF7, 0x56, 0xF7, 0x5B, /* 0x7C-0x7F */
+	
+	0xC5, 0xAA, 0x00, 0x00, 0xF7, 0x58, 0x00, 0x00, /* 0x80-0x83 */
+	0xF7, 0x57, 0xF7, 0x5A, 0xF7, 0x59, 0x00, 0x00, /* 0x84-0x87 */
+	0xF8, 0x43, 0x00, 0x00, 0xC5, 0xDC, 0xF8, 0x42, /* 0x88-0x8B */
+	0xF8, 0x40, 0x00, 0x00, 0xF8, 0x41, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0xC5, 0xFE, 0xC5, 0xFD, /* 0x90-0x93 */
+	0xF8, 0xC1, 0xF8, 0xC2, 0xC6, 0x40, 0x00, 0x00, /* 0x94-0x97 */
+	0xF9, 0x4D, 0xF9, 0x4E, 0xC6, 0x67, 0x00, 0x00, /* 0x98-0x9B */
+	0xC6, 0x6D, 0x00, 0x00, 0xF9, 0xA9, 0xF9, 0xC8, /* 0x9C-0x9F */
+};
+
+static const unsigned char u2c_8C[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA6, /* 0x34-0x37 */
+	0x00, 0x00, 0xD7, 0xCD, 0x00, 0x00, 0xD7, 0xCE, /* 0x38-0x3B */
+	0xE0, 0x52, 0xE4, 0x50, 0xE7, 0xE5, 0xC1, 0xC6, /* 0x3C-0x3F */
+	0x00, 0x00, 0xC1, 0xC5, 0xF0, 0xEE, 0xF3, 0x44, /* 0x40-0x43 */
+	0x00, 0x00, 0xF8, 0x44, 0xA8, 0xA7, 0xD3, 0xDE, /* 0x44-0x47 */
+	0xB0, 0x5A, 0xB3, 0x61, 0xE0, 0x54, 0xE0, 0x53, /* 0x48-0x4B */
+	0xBD, 0xDC, 0xE7, 0xE6, 0xBD, 0xDD, 0xEE, 0xB1, /* 0x4C-0x4F */
+	0xC2, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0xC6, 0x76, 0xA8, 0xA8, 0xCD, 0xCB, 0xD3, 0xDF, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0x62, 0x00, 0x00, /* 0x58-0x5B */
+	0xD7, 0xCF, 0xD7, 0xD0, 0x00, 0x00, 0xDB, 0xE5, /* 0x5C-0x5F */
+	0x00, 0x00, 0xB6, 0x48, 0xB8, 0xE6, 0x00, 0x00, /* 0x60-0x63 */
+	0xE0, 0x56, 0xE0, 0x55, 0xE0, 0x57, 0x00, 0x00, /* 0x64-0x67 */
+	0xE4, 0x51, 0xE4, 0x52, 0xBB, 0xA8, 0xBF, 0xDD, /* 0x68-0x6B */
+	0xBD, 0xDE, 0xBF, 0xDE, 0x00, 0x00, 0xEE, 0xB5, /* 0x6C-0x6F */
+	0xEE, 0xB2, 0xEE, 0xB4, 0xEE, 0xB3, 0xC1, 0xC7, /* 0x70-0x73 */
+	0x00, 0x00, 0xF0, 0xEF, 0xF3, 0x46, 0xF3, 0x45, /* 0x74-0x77 */
+	0xCB, 0xA4, 0xB0, 0x5C, 0xB0, 0x5B, 0xD3, 0xE0, /* 0x78-0x7B */
+	0x00, 0x00, 0xD7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xDB, 0xE7, 0xDB, 0xE6, 0xB6, 0x49, 0x00, 0x00, /* 0x80-0x83 */
+	0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x58, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xB8, 0xE8, 0xB8, 0xE7, 0x00, 0x00, /* 0x88-0x8B */
+	0xBB, 0xAA, 0xBB, 0xA9, 0x00, 0x00, 0xE7, 0xE7, /* 0x8C-0x8F */
+	0xEB, 0xB3, 0xEB, 0xB1, 0xEB, 0xB2, 0xBF, 0xDF, /* 0x90-0x93 */
+	0xEE, 0xB7, 0xEE, 0xB6, 0x00, 0x00, 0xF0, 0xF2, /* 0x94-0x97 */
+	0xF0, 0xF1, 0xF0, 0xF0, 0xF3, 0x47, 0x00, 0x00, /* 0x98-0x9B */
+	0xF9, 0xAA, 0xA8, 0xA9, 0xAD, 0x73, 0x00, 0x00, /* 0x9C-0x9F */
+	0xAD, 0x74, 0xB0, 0x5D, 0xB0, 0x5E, 0xD3, 0xE2, /* 0xA0-0xA3 */
+	0xD3, 0xE1, 0xD7, 0xD2, 0x00, 0x00, 0xB3, 0x68, /* 0xA4-0xA7 */
+	0xB3, 0x66, 0xB3, 0x63, 0xB3, 0x67, 0xB3, 0x65, /* 0xA8-0xAB */
+	0xB3, 0x64, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x4A, /* 0xAC-0xAF */
+	0xDB, 0xEA, 0x00, 0x00, 0xB8, 0xED, 0xB6, 0x4C, /* 0xB0-0xB3 */
+	0xB6, 0x51, 0xDB, 0xEC, 0xB6, 0x53, 0xB6, 0x52, /* 0xB4-0xB7 */
+	0xB6, 0x55, 0xDB, 0xEB, 0xDB, 0xE8, 0xB6, 0x4F, /* 0xB8-0xBB */
+	0xB6, 0x4B, 0xB6, 0x4D, 0xDB, 0xE9, 0xB6, 0x54, /* 0xBC-0xBF */
+	0xB6, 0x50, 0xB6, 0x4E, 0xB8, 0xEF, 0xB8, 0xEE, /* 0xC0-0xC3 */
+	0xB8, 0xEC, 0xB8, 0xF0, 0x00, 0x00, 0xB8, 0xEA, /* 0xC4-0xC7 */
+	0xB8, 0xEB, 0x00, 0x00, 0xB8, 0xE9, 0x00, 0x00, /* 0xC8-0xCB */
+	0xE0, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x54, /* 0xCC-0xCF */
+	0x00, 0x00, 0xBB, 0xAC, 0xBB, 0xAD, 0xBB, 0xAB, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE4, 0x53, 0x00, 0x00, 0xE4, 0x55, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xE7, 0xEA, 0xE7, 0xEC, 0x00, 0x00, /* 0xD8-0xDB */
+	0xBD, 0xE7, 0xE7, 0xED, 0xBD, 0xE0, 0xE7, 0xE9, /* 0xDC-0xDF */
+	0xBD, 0xDF, 0xBD, 0xE9, 0xBD, 0xE5, 0xBD, 0xE6, /* 0xE0-0xE3 */
+	0xBD, 0xE2, 0xE7, 0xE8, 0xBD, 0xE1, 0xE7, 0xEE, /* 0xE4-0xE7 */
+	0xE7, 0xEB, 0x00, 0x00, 0xBD, 0xE8, 0x00, 0x00, /* 0xE8-0xEB */
+	0xBD, 0xE3, 0xBD, 0xE4, 0xEB, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */
+	0xEB, 0xB7, 0xEB, 0xB6, 0x00, 0x00, 0xEB, 0xB8, /* 0xF0-0xF3 */
+	0xBF, 0xE0, 0xEB, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xC1, 0xCB, 0xEE, 0xB8, 0xC1, 0xC8, 0xC1, 0xCC, /* 0xF8-0xFB */
+	0xC1, 0xCA, 0xC1, 0xC9, 0xF0, 0xF3, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8D[512] = {
+	0xF0, 0xF6, 0x00, 0x00, 0xF0, 0xF5, 0x00, 0x00, /* 0x00-0x03 */
+	0xF0, 0xF4, 0xC2, 0xD8, 0xF3, 0x48, 0xF3, 0x49, /* 0x04-0x07 */
+	0xC3, 0xD8, 0xF3, 0x4A, 0xC3, 0xD9, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xC4, 0xBA, 0x00, 0x00, 0xC4, 0xB9, /* 0x0C-0x0F */
+	0xF6, 0x52, 0x00, 0x00, 0x00, 0x00, 0xC5, 0x42, /* 0x10-0x13 */
+	0xF6, 0x53, 0xF7, 0x5C, 0xC5, 0xAB, 0xC5, 0xAC, /* 0x14-0x17 */
+	0x00, 0x00, 0xF8, 0x45, 0x00, 0x00, 0xC6, 0x42, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xA8, 0xAA, 0x00, 0x00, 0xB3, 0x6A, 0xB3, 0x69, /* 0x64-0x67 */
+	0xE0, 0x5C, 0xE0, 0x5D, 0x00, 0x00, 0xBB, 0xAE, /* 0x68-0x6B */
+	0xEB, 0xB9, 0xBD, 0xEA, 0xEB, 0xBA, 0xEE, 0xB9, /* 0x6C-0x6F */
+	0xA8, 0xAB, 0x00, 0x00, 0xD0, 0xB2, 0xAD, 0x76, /* 0x70-0x73 */
+	0xAD, 0x75, 0x00, 0x00, 0xD3, 0xE3, 0xB0, 0x5F, /* 0x74-0x77 */
+	0xD3, 0xE4, 0xD7, 0xD5, 0x00, 0x00, 0xD7, 0xD4, /* 0x78-0x7B */
+	0x00, 0x00, 0xD7, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xDB, 0xEE, 0xB6, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0xDB, 0xED, 0xB6, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0xDB, 0xEF, 0xB6, 0x56, 0x00, 0x00, /* 0x88-0x8B */
+	0xE0, 0x5F, 0xE0, 0x62, 0xE0, 0x60, 0xE0, 0x61, /* 0x8C-0x8F */
+	0xE0, 0x65, 0xE0, 0x5E, 0xE0, 0x66, 0xE0, 0x63, /* 0x90-0x93 */
+	0xE0, 0x64, 0xBB, 0xB0, 0xE4, 0x56, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xBB, 0xAF, 0x00, 0x00, 0xE7, 0xF2, /* 0x98-0x9B */
+	0xE7, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xBD, 0xEB, /* 0x9C-0x9F */
+	0xE7, 0xEF, 0xE7, 0xF1, 0x00, 0x00, 0xBD, 0xEC, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xEB, 0xBB, 0x00, 0x00, 0xEB, 0xBC, /* 0xA4-0xA7 */
+	0xC1, 0xCD, 0x00, 0x00, 0xF3, 0x4C, 0xF3, 0x4E, /* 0xA8-0xAB */
+	0xF3, 0x4B, 0xF3, 0x4D, 0xF4, 0xD6, 0xF6, 0x54, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0x6F, 0xA8, 0xAC, /* 0xB0-0xB3 */
+	0xAD, 0x77, 0xD3, 0xE5, 0xD3, 0xE7, 0xD3, 0xE6, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xD7, 0xD8, 0xB3, 0x6C, 0x00, 0x00, /* 0xB8-0xBB */
+	0xD7, 0xD6, 0x00, 0x00, 0xB3, 0x6B, 0xD7, 0xD9, /* 0xBC-0xBF */
+	0x00, 0x00, 0xD7, 0xDA, 0xD7, 0xD7, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xDB, 0xFB, 0xB6, 0x60, 0xDB, 0xF3, /* 0xC4-0xC7 */
+	0xDB, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x5B, /* 0xC8-0xCB */
+	0xB6, 0x5E, 0xDB, 0xF2, 0xB6, 0x59, 0xDB, 0xF6, /* 0xCC-0xCF */
+	0xE0, 0x6C, 0xB6, 0x5D, 0x00, 0x00, 0xDB, 0xF1, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xDB, 0xF7, 0xDB, 0xF4, 0xDB, 0xFA, /* 0xD4-0xD7 */
+	0xDB, 0xF0, 0xDB, 0xF8, 0xB6, 0x5C, 0xB6, 0x5F, /* 0xD8-0xDB */
+	0xDB, 0xF5, 0xB6, 0x5A, 0x00, 0x00, 0xB8, 0xF2, /* 0xDC-0xDF */
+	0xE0, 0x68, 0xB8, 0xF1, 0xE0, 0x6F, 0xE0, 0x6E, /* 0xE0-0xE3 */
+	0xB8, 0xF8, 0x00, 0x00, 0xB8, 0xF9, 0xE0, 0x70, /* 0xE4-0xE7 */
+	0xB8, 0xF3, 0xE0, 0x6D, 0xB8, 0xF7, 0xE0, 0x72, /* 0xE8-0xEB */
+	0xE0, 0x69, 0x00, 0x00, 0xE0, 0x6B, 0xB8, 0xF4, /* 0xEC-0xEF */
+	0xE0, 0x67, 0xE0, 0x6A, 0xE0, 0x71, 0xB8, 0xF5, /* 0xF0-0xF3 */
+	0xE0, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0x00, 0x00, 0xB8, 0xF6, 0x00, 0x00, /* 0xF8-0xFB */
+	0xBB, 0xB1, 0xE4, 0x5B, 0xE4, 0x61, 0xE4, 0x59, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8E[512] = {
+	0xE4, 0x62, 0x00, 0x00, 0xE4, 0x58, 0xE4, 0x5D, /* 0x00-0x03 */
+	0xE4, 0x63, 0xE4, 0x60, 0xE4, 0x5F, 0xE4, 0x5E, /* 0x04-0x07 */
+	0x00, 0x00, 0xE4, 0x57, 0xE4, 0x5C, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0xE4, 0x5A, 0x00, 0x00, 0xBD, 0xF1, /* 0x0C-0x0F */
+	0xBD, 0xEE, 0xE7, 0xFB, 0xE8, 0x41, 0xE8, 0x43, /* 0x10-0x13 */
+	0xE8, 0x40, 0xE7, 0xF8, 0xE7, 0xFA, 0xE8, 0x45, /* 0x14-0x17 */
+	0xE8, 0x42, 0xE7, 0xFC, 0xE8, 0x46, 0xE7, 0xF9, /* 0x18-0x1B */
+	0xE8, 0x44, 0xBD, 0xEF, 0xBD, 0xF5, 0xBD, 0xF3, /* 0x1C-0x1F */
+	0xE7, 0xF3, 0xBD, 0xF4, 0xBD, 0xF0, 0xE7, 0xF4, /* 0x20-0x23 */
+	0xE7, 0xF6, 0xE7, 0xF5, 0xE7, 0xFD, 0xE7, 0xFE, /* 0x24-0x27 */
+	0x00, 0x00, 0xBD, 0xF2, 0x00, 0x00, 0xBD, 0xED, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xE7, 0xF7, 0x00, 0x00, /* 0x2C-0x2F */
+	0xEB, 0xC6, 0xBF, 0xE2, 0x00, 0x00, 0xEB, 0xBD, /* 0x30-0x33 */
+	0xBF, 0xE3, 0xBF, 0xE6, 0xEB, 0xC2, 0x00, 0x00, /* 0x34-0x37 */
+	0xEB, 0xBF, 0xBF, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xEB, 0xC3, 0xEB, 0xC4, 0xEB, 0xBE, 0xEB, 0xC7, /* 0x3C-0x3F */
+	0xEB, 0xC0, 0xEB, 0xC5, 0xBF, 0xE4, 0x00, 0x00, /* 0x40-0x43 */
+	0xBF, 0xE1, 0xEB, 0xC1, 0x00, 0x00, 0xEE, 0xBF, /* 0x44-0x47 */
+	0xC1, 0xD0, 0xC1, 0xCE, 0xC1, 0xD1, 0xC1, 0xCF, /* 0x48-0x4B */
+	0xEE, 0xBE, 0xEE, 0xBB, 0xEE, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */
+	0xEE, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xBC, /* 0x50-0x53 */
+	0xF1, 0x45, 0xC2, 0xDE, 0xF0, 0xFB, 0xF0, 0xFA, /* 0x54-0x57 */
+	0x00, 0x00, 0xC2, 0xD9, 0xF1, 0x41, 0xF1, 0x40, /* 0x58-0x5B */
+	0xF0, 0xF7, 0xF1, 0x43, 0xF0, 0xFC, 0xC2, 0xDD, /* 0x5C-0x5F */
+	0xF0, 0xF9, 0xF1, 0x42, 0xF0, 0xF8, 0xC2, 0xDA, /* 0x60-0x63 */
+	0xC2, 0xDC, 0xF0, 0xFD, 0xC2, 0xDB, 0xF0, 0xFE, /* 0x64-0x67 */
+	0x00, 0x00, 0xF1, 0x44, 0xF3, 0x52, 0x00, 0x00, /* 0x68-0x6B */
+	0xC3, 0xDE, 0xF3, 0x4F, 0x00, 0x00, 0xF3, 0x53, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xC3, 0xDB, 0xF3, 0x51, /* 0x70-0x73 */
+	0xC3, 0xE0, 0x00, 0x00, 0xC3, 0xDD, 0x00, 0x00, /* 0x74-0x77 */
+	0xF3, 0x50, 0x00, 0x00, 0xC3, 0xDF, 0xF3, 0x54, /* 0x78-0x7B */
+	0xC3, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0xC4, 0xBC, 0xC4, 0xBE, 0x00, 0x00, /* 0x80-0x83 */
+	0xF4, 0xD9, 0xC4, 0xBD, 0xF4, 0xD7, 0xC3, 0xDC, /* 0x84-0x87 */
+	0xF4, 0xD8, 0xC4, 0xBB, 0xC5, 0x43, 0xC5, 0x45, /* 0x88-0x8B */
+	0xF6, 0x56, 0xC5, 0x44, 0xF6, 0x55, 0x00, 0x00, /* 0x8C-0x8F */
+	0xF7, 0x61, 0xC5, 0xAD, 0xF7, 0x60, 0xC5, 0xAE, /* 0x90-0x93 */
+	0xF7, 0x5E, 0xF7, 0x5D, 0xF7, 0x62, 0xF7, 0x63, /* 0x94-0x97 */
+	0xF8, 0x46, 0x00, 0x00, 0xF7, 0x5F, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0xF8, 0xC6, 0xF8, 0xC3, 0xF8, 0xC4, /* 0x9C-0x9F */
+	0xF8, 0xC5, 0xC6, 0x5C, 0x00, 0x00, 0xF9, 0x51, /* 0xA0-0xA3 */
+	0xF9, 0x50, 0xF9, 0x4F, 0xF9, 0x70, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xF9, 0xBE, 0xF9, 0xAB, 0xC6, 0x6E, 0xA8, 0xAD, /* 0xA8-0xAB */
+	0xB0, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xB8, 0xFA, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0xBD, 0xF6, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xEB, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xC2, 0xDF, 0x00, 0x00, 0xF3, 0x55, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xF9, 0xAC, 0xA8, 0xAE, 0xAA, 0xEE, /* 0xC8-0xCB */
+	0xAD, 0x79, 0xAD, 0x78, 0x00, 0x00, 0xB0, 0x63, /* 0xCC-0xCF */
+	0x00, 0x00, 0xD3, 0xE8, 0xB0, 0x61, 0xD3, 0xE9, /* 0xD0-0xD3 */
+	0xB0, 0x62, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDF, /* 0xD4-0xD7 */
+	0xD7, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x6D, /* 0xD8-0xDB */
+	0xD7, 0xDE, 0xD7, 0xDD, 0xD7, 0xDC, 0xB3, 0x6E, /* 0xDC-0xDF */
+	0xD7, 0xE0, 0xD7, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xDC, 0x43, 0xDC, 0x41, 0xDC, 0x45, /* 0xE4-0xE7 */
+	0xDC, 0x46, 0xDC, 0x4C, 0x00, 0x00, 0xDC, 0x48, /* 0xE8-0xEB */
+	0xDC, 0x4A, 0x00, 0x00, 0xDC, 0x42, 0xDB, 0xFC, /* 0xEC-0xEF */
+	0x00, 0x00, 0xDC, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xDC, 0x4B, 0xDC, 0x44, 0xDC, 0x47, 0xDB, 0xFD, /* 0xF4-0xF7 */
+	0xB6, 0x62, 0xDC, 0x40, 0xDB, 0xFE, 0xB6, 0x61, /* 0xF8-0xFB */
+	0xB6, 0x63, 0x00, 0x00, 0xB8, 0xFD, 0xE0, 0x75, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_8F[512] = {
+	0xE0, 0x77, 0xE0, 0x76, 0xE0, 0x7B, 0xB8, 0xFB, /* 0x00-0x03 */
+	0x00, 0x00, 0xE0, 0x78, 0xE0, 0x74, 0xE0, 0x79, /* 0x04-0x07 */
+	0xE0, 0x7A, 0xB8, 0xFC, 0xB8, 0xFE, 0xE0, 0x7C, /* 0x08-0x0B */
+	0x00, 0x00, 0xE4, 0x67, 0xE4, 0x66, 0x00, 0x00, /* 0x0C-0x0F */
+	0xE4, 0x64, 0xE4, 0x65, 0xBB, 0xB3, 0xBB, 0xB5, /* 0x10-0x13 */
+	0xBB, 0xB2, 0xBB, 0xB4, 0xE8, 0x4D, 0xE8, 0x4E, /* 0x14-0x17 */
+	0xE8, 0x49, 0x00, 0x00, 0xE8, 0x4A, 0xBD, 0xF8, /* 0x18-0x1B */
+	0xBD, 0xFD, 0xBD, 0xF7, 0xBD, 0xFE, 0xBD, 0xF9, /* 0x1C-0x1F */
+	0xE8, 0x4B, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x4C, /* 0x20-0x23 */
+	0xE8, 0x48, 0xBE, 0x40, 0xBD, 0xFB, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0xBD, 0xFA, 0xBD, 0xFC, 0x00, 0x00, /* 0x28-0x2B */
+	0xE8, 0x47, 0x00, 0x00, 0xEB, 0xCA, 0xBF, 0xE8, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xCC, 0xBF, 0xEA, /* 0x30-0x33 */
+	0xEB, 0xCF, 0xEB, 0xCB, 0xEB, 0xC9, 0xEB, 0xCE, /* 0x34-0x37 */
+	0xBF, 0xE9, 0xEB, 0xCD, 0x00, 0x00, 0xBF, 0xE7, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xC1, 0xD3, 0xC1, 0xD6, /* 0x3C-0x3F */
+	0xEE, 0xC1, 0x00, 0x00, 0xC1, 0xD4, 0xEE, 0xC0, /* 0x40-0x43 */
+	0xC1, 0xD2, 0xC1, 0xD5, 0xF1, 0x46, 0xF1, 0x47, /* 0x44-0x47 */
+	0xF1, 0x48, 0xC2, 0xE0, 0x00, 0x00, 0xF1, 0x49, /* 0x48-0x4B */
+	0x00, 0x00, 0xC2, 0xE1, 0xC3, 0xE2, 0xF3, 0x58, /* 0x4C-0x4F */
+	0xF3, 0x59, 0xF3, 0x57, 0xF3, 0x56, 0xF3, 0x5A, /* 0x50-0x53 */
+	0xC3, 0xE1, 0xF4, 0xDD, 0xF4, 0xDB, 0xF4, 0xDC, /* 0x54-0x57 */
+	0xF4, 0xDE, 0xF4, 0xDA, 0xF4, 0xDF, 0xF6, 0x58, /* 0x58-0x5B */
+	0x00, 0x00, 0xF6, 0x59, 0xF6, 0x57, 0xC5, 0x46, /* 0x5C-0x5F */
+	0xF7, 0x64, 0xC5, 0xAF, 0xF7, 0x65, 0xF8, 0x48, /* 0x60-0x63 */
+	0xF8, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAF, /* 0x98-0x9B */
+	0xB6, 0x64, 0x00, 0x00, 0x00, 0x00, 0xB9, 0x40, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBB, 0xB6, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0xBF, 0xEC, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xBF, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xC3, 0xE3, 0xC4, 0x7C, 0xC5, 0x47, /* 0xAC-0xAF */
+	0xA8, 0xB0, 0xB0, 0x64, 0xB9, 0x41, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xF3, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA6, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xA8, 0xB4, 0xA8, 0xB3, 0xA8, 0xB2, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xCB, 0xA5, 0x00, 0x00, 0xCD, 0xCD, /* 0xC8-0xCB */
+	0x00, 0x00, 0xCD, 0xCF, 0xAA, 0xEF, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0xAA, 0xF1, 0xCD, 0xCC, 0xCD, 0xCE, /* 0xD0-0xD3 */
+	0xAA, 0xF0, 0xCD, 0xD1, 0xCD, 0xD0, 0xCD, 0xD2, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xD0, 0xB6, 0xD0, 0xB4, 0xAD, 0x7C, 0xD0, 0xB3, /* 0xE0-0xE3 */
+	0xAD, 0xA3, 0xAD, 0x7E, 0xAD, 0x7B, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xAD, 0xA4, 0x00, 0x00, 0xAD, 0x7D, 0xAD, 0xA2, /* 0xE8-0xEB */
+	0x00, 0x00, 0xAD, 0xA1, 0xD0, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */
+	0xAD, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xB0, 0x6A, 0xD3, 0xEB, 0xD3, 0xF1, 0xB0, 0x67, /* 0xF4-0xF7 */
+	0xB0, 0x6E, 0x00, 0x00, 0xB0, 0x69, 0xD3, 0xEE, /* 0xF8-0xFB */
+	0xD3, 0xF0, 0xB0, 0x6C, 0xD3, 0xEA, 0xD3, 0xED, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_90[512] = {
+	0xB0, 0x68, 0xB0, 0x65, 0xD3, 0xEC, 0xB0, 0x6B, /* 0x00-0x03 */
+	0xD3, 0xEF, 0xB0, 0x6D, 0xB0, 0x66, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xE3, /* 0x08-0x0B */
+	0xD7, 0xE6, 0xB3, 0x70, 0x00, 0x00, 0xB3, 0x7A, /* 0x0C-0x0F */
+	0xB3, 0x76, 0xD7, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xB3, 0x7E, 0xB3, 0x77, 0xB3, 0x7C, 0xB3, 0x72, /* 0x14-0x17 */
+	0x00, 0x00, 0xB3, 0x6F, 0xB3, 0x71, 0xB3, 0x7D, /* 0x18-0x1B */
+	0xD7, 0xE5, 0xB3, 0x75, 0xB3, 0x78, 0xB3, 0x74, /* 0x1C-0x1F */
+	0xB3, 0x79, 0xD7, 0xE7, 0xB3, 0x7B, 0xB3, 0x73, /* 0x20-0x23 */
+	0xD7, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xDC, 0x4D, 0xB6, 0x65, 0xDC, 0x4F, /* 0x2C-0x2F */
+	0x00, 0x00, 0xB6, 0x67, 0xB6, 0x69, 0x00, 0x00, /* 0x30-0x33 */
+	0xDC, 0x4E, 0xB6, 0x66, 0xB6, 0x6A, 0x00, 0x00, /* 0x34-0x37 */
+	0xB6, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xB9, 0x47, 0xE0, 0xA3, 0xB9, 0x4F, 0xE0, 0x7E, /* 0x3C-0x3F */
+	0x00, 0x00, 0xB9, 0x50, 0xB9, 0x45, 0x00, 0x00, /* 0x40-0x43 */
+	0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xB9, 0x4A, /* 0x44-0x47 */
+	0x00, 0x00, 0xE0, 0xA2, 0xB9, 0x43, 0xB9, 0x42, /* 0x48-0x4B */
+	0x00, 0x00, 0xB9, 0x4D, 0xB9, 0x4C, 0xB9, 0x4B, /* 0x4C-0x4F */
+	0xB9, 0x49, 0xB9, 0x4E, 0xE0, 0x7D, 0xB9, 0x44, /* 0x50-0x53 */
+	0xB9, 0x46, 0xB9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xBB, 0xB8, 0xBB, 0xBB, 0x00, 0x00, 0xBB, 0xBF, /* 0x58-0x5B */
+	0xBB, 0xB9, 0xBB, 0xBE, 0xBB, 0xBC, 0x00, 0x00, /* 0x5C-0x5F */
+	0xBB, 0xB7, 0x00, 0x00, 0xBB, 0xBD, 0xBB, 0xBA, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x52, /* 0x64-0x67 */
+	0xBE, 0x43, 0xBE, 0x41, 0x00, 0x00, 0xE8, 0x53, /* 0x68-0x6B */
+	0x00, 0x00, 0xBE, 0x44, 0xBE, 0x42, 0xE8, 0x51, /* 0x6C-0x6F */
+	0xE8, 0x50, 0x00, 0x00, 0xBF, 0xF0, 0xE8, 0x4F, /* 0x70-0x73 */
+	0xBF, 0xEE, 0xBF, 0xED, 0xEB, 0xD0, 0xBE, 0x45, /* 0x74-0x77 */
+	0xBF, 0xEF, 0xEB, 0xD1, 0xBF, 0xF2, 0xEB, 0xD2, /* 0x78-0x7B */
+	0xBF, 0xF1, 0xC1, 0xD8, 0xEE, 0xC3, 0xC1, 0xD7, /* 0x7C-0x7F */
+	
+	0xC1, 0xDC, 0xC1, 0xDA, 0xC1, 0xDB, 0xC2, 0xE3, /* 0x80-0x83 */
+	0xC1, 0xD9, 0xEE, 0xC2, 0xEB, 0xD3, 0xC2, 0xE2, /* 0x84-0x87 */
+	0xC2, 0xE4, 0x00, 0x00, 0xC3, 0xE4, 0xC3, 0xE5, /* 0x88-0x8B */
+	0x00, 0x00, 0xF4, 0xE0, 0x00, 0x00, 0xC5, 0xDE, /* 0x8C-0x8F */
+	0xC5, 0xDD, 0xA8, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xCA, 0x55, 0xB0, 0x6F, 0x00, 0x00, 0xCA, 0x52, /* 0x94-0x97 */
+	0xCA, 0x53, 0xCA, 0x51, 0x00, 0x00, 0xCA, 0x54, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xCB, 0xAA, 0xCB, 0xA7, /* 0x9C-0x9F */
+	0xCB, 0xAC, 0xCB, 0xA8, 0xA8, 0xB7, 0xA8, 0xBA, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xCB, 0xA9, 0xA8, 0xB9, 0xCB, 0xAB, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD5, /* 0xAC-0xAF */
+	0xCD, 0xD7, 0xAA, 0xF4, 0xCD, 0xD3, 0xCD, 0xD6, /* 0xB0-0xB3 */
+	0xCD, 0xD4, 0xAA, 0xF2, 0xAA, 0xF5, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xAA, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xD0, 0xB8, 0xD0, 0xBC, 0xD0, 0xB9, /* 0xBC-0xBF */
+	0x00, 0x00, 0xAD, 0xA7, 0x00, 0x00, 0xAD, 0xA8, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xD0, 0xBB, 0x00, 0x00, 0xD0, 0xBD, /* 0xC4-0xC7 */
+	0xD0, 0xBF, 0x00, 0x00, 0xAD, 0xA5, 0xD0, 0xBE, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0xAD, 0xA6, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xD7, 0xEE, 0xD0, 0xBA, 0xD3, 0xF2, 0xD3, 0xFB, /* 0xD4-0xD7 */
+	0xD3, 0xF9, 0xD3, 0xF4, 0xD3, 0xF5, 0xD3, 0xFA, /* 0xD8-0xDB */
+	0xD3, 0xFC, 0xB0, 0x71, 0x00, 0x00, 0xD3, 0xF7, /* 0xDC-0xDF */
+	0xD3, 0xF3, 0xB0, 0x70, 0xB0, 0x72, 0xD3, 0xF6, /* 0xE0-0xE3 */
+	0xD3, 0xFD, 0xD3, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xB3, 0xA1, 0xD7, 0xF1, 0xD7, 0xE9, 0xD7, 0xEF, /* 0xE8-0xEB */
+	0xD7, 0xF0, 0xB3, 0xA2, 0x00, 0x00, 0xD7, 0xE8, /* 0xEC-0xEF */
+	0xD7, 0xEA, 0xD0, 0xB7, 0xD7, 0xEC, 0xD7, 0xED, /* 0xF0-0xF3 */
+	0xD7, 0xEB, 0xB6, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xDC, 0x56, 0xEB, 0xD4, 0xDC, 0x57, /* 0xF8-0xFB */
+	0xDC, 0x54, 0xB3, 0xA3, 0xB6, 0x6E, 0xDC, 0x53, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_91[512] = {
+	0xDC, 0x59, 0xDC, 0x58, 0xB6, 0x6B, 0xDC, 0x5C, /* 0x00-0x03 */
+	0xDC, 0x52, 0xDC, 0x5B, 0xDC, 0x50, 0xDC, 0x5A, /* 0x04-0x07 */
+	0xDC, 0x55, 0xB6, 0x6D, 0x00, 0x00, 0xE0, 0xAA, /* 0x08-0x0B */
+	0x00, 0x00, 0xE0, 0xA5, 0xE0, 0xAB, 0xE0, 0xA6, /* 0x0C-0x0F */
+	0xE0, 0xA4, 0xE0, 0xA7, 0xB9, 0x51, 0x00, 0x00, /* 0x10-0x13 */
+	0xE0, 0xA9, 0x00, 0x00, 0xE0, 0xA8, 0xB9, 0x52, /* 0x14-0x17 */
+	0xBB, 0xC1, 0xBB, 0xC0, 0xE4, 0x6E, 0xE4, 0x71, /* 0x18-0x1B */
+	0xE4, 0x69, 0xE4, 0x6D, 0xBB, 0xC2, 0xE4, 0x6C, /* 0x1C-0x1F */
+	0xE4, 0x6A, 0xE4, 0x70, 0xE4, 0x6B, 0xE4, 0x68, /* 0x20-0x23 */
+	0xE4, 0x6F, 0x00, 0x00, 0xE8, 0x59, 0xBE, 0x48, /* 0x24-0x27 */
+	0xF1, 0x4A, 0xE8, 0x56, 0xE8, 0x57, 0xE8, 0x55, /* 0x28-0x2B */
+	0xDC, 0x51, 0xBE, 0x47, 0xE8, 0x5A, 0xE8, 0x54, /* 0x2C-0x2F */
+	0xBE, 0x46, 0xBE, 0x49, 0xE8, 0x58, 0xEB, 0xD5, /* 0x30-0x33 */
+	0xBF, 0xF3, 0xEB, 0xD6, 0xEB, 0xD7, 0x00, 0x00, /* 0x34-0x37 */
+	0xEE, 0xC4, 0xC1, 0xDD, 0xF1, 0x4B, 0xF1, 0x4C, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0x4D, 0xF3, 0x5D, /* 0x3C-0x3F */
+	0xF3, 0x5C, 0xF4, 0xE2, 0x00, 0x00, 0xF4, 0xE1, /* 0x40-0x43 */
+	0xF6, 0x5B, 0xF6, 0x5C, 0xF6, 0x5A, 0xF7, 0x66, /* 0x44-0x47 */
+	0xC5, 0xB0, 0xA8, 0xBB, 0xAD, 0xAA, 0xAD, 0xA9, /* 0x48-0x4B */
+	0xB0, 0x75, 0xB0, 0x74, 0xD4, 0x40, 0xD4, 0x41, /* 0x4C-0x4F */
+	0xD3, 0xFE, 0x00, 0x00, 0xB0, 0x73, 0xD7, 0xF5, /* 0x50-0x53 */
+	0x00, 0x00, 0xD7, 0xF6, 0xD7, 0xF2, 0xB3, 0xA4, /* 0x54-0x57 */
+	0xD7, 0xF3, 0x00, 0x00, 0xD7, 0xF4, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0x5F, /* 0x5C-0x5F */
+	0xDC, 0x61, 0xDC, 0x5D, 0xDC, 0x60, 0xB6, 0x6F, /* 0x60-0x63 */
+	0xDC, 0x5E, 0xB6, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xDD, 0x73, 0xB9, 0x55, 0xB9, 0x54, 0x00, 0x00, /* 0x68-0x6B */
+	0xB9, 0x53, 0x00, 0x00, 0xE0, 0xAC, 0xE0, 0xAD, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x73, 0xE4, 0x75, /* 0x70-0x73 */
+	0xBB, 0xC6, 0xBB, 0xC3, 0x00, 0x00, 0xBB, 0xC5, /* 0x74-0x77 */
+	0xBB, 0xC4, 0xE4, 0x74, 0xE4, 0x72, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xE8, 0x61, 0xE8, 0x5E, 0xE8, 0x5F, 0xBE, 0x4D, /* 0x80-0x83 */
+	0xE8, 0x60, 0xE8, 0x5B, 0xE8, 0x5C, 0xBE, 0x4A, /* 0x84-0x87 */
+	0x00, 0x00, 0xBE, 0x4B, 0xE8, 0x5D, 0xBE, 0x4C, /* 0x88-0x8B */
+	0x00, 0x00, 0xEB, 0xDB, 0x00, 0x00, 0xEB, 0xDC, /* 0x8C-0x8F */
+	0xEB, 0xD9, 0xEB, 0xDA, 0xBF, 0xF4, 0xEB, 0xD8, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0xEE, 0xC8, 0xEE, 0xC5, 0xEE, 0xC7, /* 0x98-0x9B */
+	0xC1, 0xE0, 0xEE, 0xCB, 0xC1, 0xDF, 0xEE, 0xC9, /* 0x9C-0x9F */
+	0xEE, 0xCC, 0xEE, 0xCA, 0xEE, 0xC6, 0xC1, 0xDE, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xF1, 0x4F, 0x00, 0x00, 0xF1, 0x50, /* 0xA4-0xA7 */
+	0xF1, 0x4E, 0x00, 0x00, 0xF1, 0x52, 0xC2, 0xE5, /* 0xA8-0xAB */
+	0xC2, 0xE6, 0xF3, 0x5F, 0xC3, 0xE7, 0xF1, 0x51, /* 0xAC-0xAF */
+	0xF3, 0x5E, 0xC3, 0xE6, 0xF4, 0xE5, 0xF4, 0xE6, /* 0xB0-0xB3 */
+	0xC4, 0xBF, 0xF4, 0xE4, 0x00, 0x00, 0xF4, 0xE3, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xF6, 0x5D, 0xC5, 0x48, 0x00, 0x00, /* 0xB8-0xBB */
+	0xF8, 0x49, 0xF8, 0xC8, 0xF8, 0xC7, 0x00, 0x00, /* 0xBC-0xBF */
+	0xC6, 0x43, 0xC6, 0x5D, 0xF8, 0xC9, 0xF9, 0x71, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xC6, 0x6F, 0xA8, 0xBC, 0xAA, 0xF6, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xB9, 0x56, 0x00, 0x00, 0xC4, 0xC0, /* 0xC8-0xCB */
+	0xA8, 0xBD, 0xAD, 0xAB, 0xB3, 0xA5, 0xB6, 0x71, /* 0xCC-0xCF */
+	0xC2, 0xE7, 0xAA, 0xF7, 0x00, 0x00, 0xD0, 0xC1, /* 0xD0-0xD3 */
+	0xD0, 0xC0, 0xD4, 0x42, 0x00, 0x00, 0xB0, 0x78, /* 0xD4-0xD7 */
+	0xB0, 0x76, 0xB0, 0x7A, 0xD4, 0x44, 0x00, 0x00, /* 0xD8-0xDB */
+	0xB0, 0x79, 0xB0, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xD4, 0x43, 0xB3, 0xA8, /* 0xE0-0xE3 */
+	0xD7, 0xFC, 0x00, 0x00, 0xB3, 0xA7, 0xB3, 0xA9, /* 0xE4-0xE7 */
+	0xD8, 0x42, 0xB3, 0xAB, 0xD7, 0xFE, 0xD8, 0x40, /* 0xE8-0xEB */
+	0xD7, 0xF7, 0xB3, 0xAA, 0xD8, 0x43, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xD7, 0xF9, 0x00, 0x00, 0xD7, 0xFA, /* 0xF0-0xF3 */
+	0xD7, 0xF8, 0xB3, 0xA6, 0x00, 0x00, 0xD8, 0x41, /* 0xF4-0xF7 */
+	0xD7, 0xFB, 0xD7, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xDC, 0x6D, 0x00, 0x00, 0xDC, 0x6C, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_92[512] = {
+	0xDC, 0x6A, 0xDC, 0x62, 0xDC, 0x71, 0xDC, 0x65, /* 0x00-0x03 */
+	0xDC, 0x6F, 0xDC, 0x76, 0xDC, 0x6E, 0xB6, 0x79, /* 0x04-0x07 */
+	0x00, 0x00, 0xB6, 0x75, 0xDC, 0x63, 0x00, 0x00, /* 0x08-0x0B */
+	0xDC, 0x69, 0xB6, 0x77, 0x00, 0x00, 0xDC, 0x68, /* 0x0C-0x0F */
+	0xB6, 0x78, 0xB6, 0x7A, 0xDC, 0x6B, 0x00, 0x00, /* 0x10-0x13 */
+	0xB6, 0x72, 0xB6, 0x73, 0xDC, 0x77, 0xDC, 0x75, /* 0x14-0x17 */
+	0x00, 0x00, 0xDC, 0x74, 0xDC, 0x66, 0x00, 0x00, /* 0x18-0x1B */
+	0xDC, 0x72, 0x00, 0x00, 0xB6, 0x76, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x74, /* 0x20-0x23 */
+	0xDC, 0x73, 0xDC, 0x64, 0xDC, 0x67, 0xDC, 0x70, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xE4, 0xBA, 0xE0, 0xB7, 0x00, 0x00, /* 0x2C-0x2F */
+	0xE0, 0xB0, 0xE0, 0xC3, 0xE0, 0xCC, 0xE0, 0xB3, /* 0x30-0x33 */
+	0xB9, 0x61, 0x00, 0x00, 0xE0, 0xC0, 0xB9, 0x57, /* 0x34-0x37 */
+	0xB9, 0x59, 0xB9, 0x65, 0xE0, 0xB1, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xB9, 0x5A, 0xB9, 0x5C, 0xB9, 0x66, /* 0x3C-0x3F */
+	0xB9, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0xB9, 0x64, 0xE0, 0xB9, 0x00, 0x00, /* 0x44-0x47 */
+	0xE0, 0xAE, 0xB9, 0x62, 0xE0, 0xB8, 0xB9, 0x5E, /* 0x48-0x4B */
+	0xE0, 0xCA, 0xB9, 0x63, 0xE0, 0xC8, 0xE0, 0xBC, /* 0x4C-0x4F */
+	0xE0, 0xC6, 0xB9, 0x60, 0xE0, 0xAF, 0xE0, 0xC9, /* 0x50-0x53 */
+	0xE0, 0xC4, 0x00, 0x00, 0xE0, 0xCB, 0xB9, 0x58, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xB9, 0x67, 0xB9, 0x5D, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, /* 0x5C-0x5F */
+	0xE0, 0xBD, 0xE0, 0xC1, 0x00, 0x00, 0xE0, 0xC5, /* 0x60-0x63 */
+	0xB9, 0x5F, 0xE0, 0xB4, 0xE0, 0xB2, 0xE0, 0xBE, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE0, 0xBB, 0xE0, 0xBA, 0x00, 0x00, 0xE0, 0xBF, /* 0x6C-0x6F */
+	0xE0, 0xC2, 0x00, 0x00, 0xE0, 0xC7, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0x78, 0x00, 0x00, /* 0x74-0x77 */
+	0xBB, 0xC7, 0xE4, 0xA4, 0xE4, 0x7A, 0xBB, 0xCC, /* 0x78-0x7B */
+	0xBB, 0xD0, 0xE4, 0xAD, 0xE4, 0xB5, 0xE4, 0xA6, /* 0x7C-0x7F */
+	
+	0xBB, 0xC8, 0x00, 0x00, 0xE4, 0xAA, 0xE0, 0xB6, /* 0x80-0x83 */
+	0x00, 0x00, 0xBB, 0xC9, 0xE4, 0xB1, 0xE4, 0xB6, /* 0x84-0x87 */
+	0xE4, 0xAE, 0x00, 0x00, 0xE4, 0xB0, 0xE4, 0xB9, /* 0x88-0x8B */
+	0xE4, 0xB2, 0xE4, 0x7E, 0xE4, 0xA9, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xBB, 0xD1, 0x00, 0x00, 0xBB, 0xCD, /* 0x90-0x93 */
+	0xE4, 0x7C, 0xE4, 0xAB, 0xBB, 0xCB, 0xE4, 0xA5, /* 0x94-0x97 */
+	0xBB, 0xCA, 0xE4, 0xB3, 0xE4, 0xA2, 0xE4, 0x79, /* 0x98-0x9B */
+	0xBB, 0xCE, 0xE4, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xE4, 0x7B, 0xE4, 0xAF, 0xE4, 0xAC, 0xE4, 0xA7, /* 0xA0-0xA3 */
+	0xE4, 0x77, 0xE4, 0x76, 0xE4, 0xA1, 0xE4, 0xB4, /* 0xA4-0xA7 */
+	0xBB, 0xCF, 0xE4, 0xB7, 0xE4, 0x7D, 0xE4, 0xA3, /* 0xA8-0xAB */
+	0xBE, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0xBE, 0x5A, 0xBE, 0x55, /* 0xB0-0xB3 */
+	0xE8, 0xA4, 0xE8, 0xA1, 0xE8, 0x67, 0xBE, 0x50, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xF9, 0xD7, 0x00, 0x00, 0xBE, 0x4F, /* 0xB8-0xBB */
+	0xBE, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xE8, 0x65, 0xBE, 0x54, 0xE8, 0x71, 0xE8, 0x63, /* 0xC0-0xC3 */
+	0xE8, 0x64, 0xBE, 0x4E, 0xE8, 0xA3, 0xBE, 0x58, /* 0xC4-0xC7 */
+	0xE8, 0x74, 0xE8, 0x79, 0xE8, 0x73, 0xEB, 0xEE, /* 0xC8-0xCB */
+	0xE8, 0x6F, 0xE8, 0x77, 0xE8, 0x75, 0xE8, 0x68, /* 0xCC-0xCF */
+	0xE8, 0x62, 0xE8, 0x7D, 0xBE, 0x57, 0xE8, 0x7E, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xE8, 0x78, 0x00, 0x00, 0xE8, 0x6D, /* 0xD4-0xD7 */
+	0xE8, 0x6B, 0xE8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0xE8, 0x6E, 0xE8, 0x7B, 0xE8, 0x6A, /* 0xDC-0xDF */
+	0xE8, 0x7A, 0xE8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xBE, 0x53, 0x00, 0x00, 0xE8, 0x76, 0xE8, 0x7C, /* 0xE4-0xE7 */
+	0xE8, 0x72, 0xE8, 0x6C, 0xBE, 0x51, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xA8, 0xE8, 0x70, /* 0xEC-0xEF */
+	0xBE, 0x59, 0xE8, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xF4, /* 0xF4-0xF7 */
+	0xBF, 0xF7, 0xEB, 0xF3, 0xEB, 0xF0, 0xEC, 0x44, /* 0xF8-0xFB */
+	0xBF, 0xFB, 0x00, 0x00, 0xEC, 0x41, 0xEB, 0xF8, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_93[512] = {
+	0xEC, 0x43, 0xEB, 0xE9, 0xEB, 0xF6, 0x00, 0x00, /* 0x00-0x03 */
+	0xBF, 0xFD, 0x00, 0x00, 0xEB, 0xE1, 0x00, 0x00, /* 0x04-0x07 */
+	0xEB, 0xDF, 0xEC, 0x42, 0x00, 0x00, 0xEC, 0x40, /* 0x08-0x0B */
+	0xEB, 0xFE, 0xEB, 0xED, 0xEB, 0xEC, 0xEB, 0xE2, /* 0x0C-0x0F */
+	0xC0, 0x40, 0x00, 0x00, 0xEB, 0xE8, 0xEB, 0xF2, /* 0x10-0x13 */
+	0xEB, 0xFD, 0xC0, 0x43, 0xEC, 0x45, 0x00, 0x00, /* 0x14-0x17 */
+	0xC1, 0xE8, 0xC0, 0x45, 0xBF, 0xFE, 0xEB, 0xE6, /* 0x18-0x1B */
+	0x00, 0x00, 0xEB, 0xEF, 0xEB, 0xDE, 0xEB, 0xE0, /* 0x1C-0x1F */
+	0xBF, 0xF5, 0xC0, 0x42, 0xBF, 0xFA, 0xEB, 0xE7, /* 0x20-0x23 */
+	0xEB, 0xF7, 0xEB, 0xF1, 0xC0, 0x41, 0xEB, 0xDD, /* 0x24-0x27 */
+	0xC1, 0xE3, 0xEB, 0xF9, 0xEB, 0xFC, 0xBF, 0xFC, /* 0x28-0x2B */
+	0x00, 0x00, 0xEB, 0xEB, 0xC0, 0x44, 0xBF, 0xF9, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBF, 0xF8, /* 0x30-0x33 */
+	0xEB, 0xF5, 0xEB, 0xFB, 0xBF, 0xF6, 0x00, 0x00, /* 0x34-0x37 */
+	0xEB, 0xE4, 0xEB, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0xEB, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xEB, 0xEA, 0xEE, 0xD2, /* 0x44-0x47 */
+	0x00, 0x00, 0xEE, 0xD7, 0xC1, 0xE5, 0xC1, 0xE7, /* 0x48-0x4B */
+	0xEE, 0xDD, 0xC1, 0xE1, 0xEE, 0xEC, 0xEE, 0xE3, /* 0x4C-0x4F */
+	0xEE, 0xD8, 0xEE, 0xD9, 0xEE, 0xE2, 0x00, 0x00, /* 0x50-0x53 */
+	0xC1, 0xEE, 0xEE, 0xE1, 0xEE, 0xD1, 0xEE, 0xE0, /* 0x54-0x57 */
+	0xEE, 0xD4, 0xEE, 0xED, 0xC1, 0xED, 0xC1, 0xEB, /* 0x58-0x5B */
+	0xEE, 0xD5, 0x00, 0x00, 0xEE, 0xE8, 0x00, 0x00, /* 0x5C-0x5F */
+	0xEE, 0xDA, 0xEE, 0xE7, 0x00, 0x00, 0xEE, 0xE9, /* 0x60-0x63 */
+	0xEE, 0xD0, 0xC1, 0xE6, 0x00, 0x00, 0xEE, 0xEA, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xEE, 0xDE, 0x00, 0x00, /* 0x68-0x6B */
+	0xC1, 0xEA, 0xEE, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0xC1, 0xEC, 0xEE, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xC1, 0xE4, 0xEE, 0xD6, 0xEE, 0xE5, /* 0x74-0x77 */
+	0x00, 0x00, 0xEE, 0xDF, 0xEB, 0xE3, 0xEE, 0xE6, /* 0x78-0x7B */
+	0xEE, 0xD3, 0x00, 0x00, 0xC1, 0xE9, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xEE, 0xEB, 0x00, 0x00, 0xC1, 0xE2, 0xEE, 0xCE, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xF1, 0x60, 0xF1, 0x59, 0xC2, 0xE9, 0x00, 0x00, /* 0x88-0x8B */
+	0xF1, 0x54, 0xF1, 0x63, 0xF1, 0x5B, 0xEE, 0xDC, /* 0x8C-0x8F */
+	0x00, 0x00, 0xF1, 0x65, 0xF1, 0x55, 0x00, 0x00, /* 0x90-0x93 */
+	0xC2, 0xE8, 0xF1, 0x5F, 0xC2, 0xEA, 0xC2, 0xF2, /* 0x94-0x97 */
+	0xC2, 0xF0, 0xF1, 0x61, 0xC2, 0xF1, 0xF1, 0x57, /* 0x98-0x9B */
+	0x00, 0x00, 0xF1, 0x58, 0xF1, 0x5D, 0xF1, 0x62, /* 0x9C-0x9F */
+	0x00, 0x00, 0xEE, 0xCD, 0xC2, 0xEB, 0xF1, 0x6A, /* 0xA0-0xA3 */
+	0xF1, 0x67, 0xF1, 0x6B, 0xF1, 0x5E, 0xF1, 0x5A, /* 0xA4-0xA7 */
+	0xF1, 0x68, 0xF3, 0x6A, 0xF1, 0x5C, 0x00, 0x00, /* 0xA8-0xAB */
+	0xC2, 0xEE, 0x00, 0x00, 0xC2, 0xED, 0xEE, 0xCF, /* 0xAC-0xAF */
+	0xC2, 0xEF, 0xF1, 0x64, 0xF1, 0x66, 0xC2, 0xEC, /* 0xB0-0xB3 */
+	0xF1, 0x69, 0xF1, 0x53, 0x00, 0x00, 0xF1, 0x56, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0xF3, 0x73, 0x00, 0x00, 0xF3, 0x63, 0xC3, 0xEB, /* 0xC0-0xC3 */
+	0xF3, 0x71, 0x00, 0x00, 0x00, 0x00, 0xF3, 0x61, /* 0xC4-0xC7 */
+	0xC3, 0xEC, 0x00, 0x00, 0xF3, 0x6C, 0x00, 0x00, /* 0xC8-0xCB */
+	0xF3, 0x68, 0xC3, 0xF1, 0xF3, 0x72, 0xF3, 0x62, /* 0xCC-0xCF */
+	0xF3, 0x65, 0xC3, 0xE9, 0xF3, 0x74, 0x00, 0x00, /* 0xD0-0xD3 */
+	0xF3, 0x6D, 0xF3, 0x70, 0xC3, 0xEF, 0xC3, 0xF4, /* 0xD4-0xD7 */
+	0xC3, 0xF2, 0xF3, 0x69, 0xF3, 0x64, 0x00, 0x00, /* 0xD8-0xDB */
+	0xC3, 0xED, 0xC3, 0xEE, 0xF3, 0x60, 0xC3, 0xEA, /* 0xDC-0xDF */
+	0x00, 0x00, 0xC3, 0xE8, 0xC3, 0xF0, 0xF3, 0x6F, /* 0xE0-0xE3 */
+	0xC3, 0xF3, 0x00, 0x00, 0xF3, 0x6B, 0xF3, 0x75, /* 0xE4-0xE7 */
+	0xC3, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0xF3, 0x67, 0x00, 0x00, 0xF3, 0x6E, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xF4, 0xF3, 0xF5, 0x42, 0xF4, 0xF5, /* 0xF4-0xF7 */
+	0xF4, 0xFC, 0xF3, 0x66, 0xF4, 0xFA, 0xF4, 0xE9, /* 0xF8-0xFB */
+	0xF5, 0x40, 0xC4, 0xC3, 0xF4, 0xED, 0xF4, 0xFE, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_94[512] = {
+	0xF4, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xC2, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0x44, 0xF4, 0xF6, /* 0x04-0x07 */
+	0x00, 0x00, 0xF4, 0xFB, 0xF4, 0xFD, 0xF4, 0xE7, /* 0x08-0x0B */
+	0xF5, 0x41, 0xF4, 0xF2, 0xF4, 0xF7, 0xF4, 0xEB, /* 0x0C-0x0F */
+	0xF4, 0xEF, 0xF5, 0x43, 0xF4, 0xF9, 0xF4, 0xE8, /* 0x10-0x13 */
+	0xF4, 0xEC, 0xF4, 0xEE, 0xF4, 0xF8, 0x00, 0x00, /* 0x14-0x17 */
+	0xC4, 0xC1, 0xF4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0xF4, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xF4, 0xF0, 0xF6, 0x61, 0xF6, 0x66, 0xC5, 0x4F, /* 0x28-0x2B */
+	0xF6, 0x68, 0x00, 0x00, 0xC5, 0x49, 0x00, 0x00, /* 0x2C-0x2F */
+	0xF6, 0x64, 0xF6, 0x6A, 0xC5, 0x4E, 0xC5, 0x4A, /* 0x30-0x33 */
+	0x00, 0x00, 0xC5, 0x4B, 0xF6, 0x60, 0xF6, 0x67, /* 0x34-0x37 */
+	0xC5, 0x4D, 0xF6, 0x65, 0xC5, 0x4C, 0xF6, 0x5F, /* 0x38-0x3B */
+	0xF6, 0x63, 0xF6, 0x62, 0x00, 0x00, 0xF6, 0x5E, /* 0x3C-0x3F */
+	0xF6, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xC5, 0xB1, 0xF7, 0x6D, 0xF7, 0x70, 0xF7, 0x6C, /* 0x44-0x47 */
+	0xF7, 0x6E, 0xF7, 0x6F, 0xF7, 0x69, 0xF7, 0x6A, /* 0x48-0x4B */
+	0xF7, 0x67, 0x00, 0x00, 0x00, 0x00, 0xF7, 0x6B, /* 0x4C-0x4F */
+	0xF7, 0x68, 0xC5, 0xB2, 0xC5, 0xB3, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0xF8, 0x4B, 0x00, 0x00, 0xF8, 0x4D, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0xF8, 0x4C, 0xF8, 0x4E, 0x00, 0x00, /* 0x5C-0x5F */
+	0xC5, 0xE0, 0x00, 0x00, 0xF8, 0x4A, 0xC5, 0xDF, /* 0x60-0x63 */
+	0xC5, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0xF8, 0xCB, 0xF8, 0xCC, 0xC6, 0x44, 0xF8, 0xCA, /* 0x68-0x6B */
+	0x00, 0x00, 0xF9, 0x53, 0xF9, 0x52, 0xF9, 0x54, /* 0x6C-0x6F */
+	0xC6, 0x5F, 0xF9, 0x55, 0xC6, 0x5E, 0xF9, 0x56, /* 0x70-0x73 */
+	0xF9, 0x72, 0xF9, 0x75, 0xF9, 0x74, 0xC6, 0x68, /* 0x74-0x77 */
+	0xF9, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xC6, 0x72, 0xC6, 0x70, 0xC6, 0x71, 0xC6, 0x77, /* 0x7C-0x7F */
+	
+	0xF9, 0xC0, 0xF9, 0xC1, 0xF9, 0xBF, 0xF9, 0xC9, /* 0x80-0x83 */
+};
+
+static const unsigned char u2c_95[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xF8, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0x44, 0xDC, 0x78, /* 0x78-0x7B */
+	0xE8, 0xA5, 0xF3, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xAA, 0xF9, 0x00, 0x00, 0xAD, 0xAC, 0xB0, 0x7B, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xD8, 0x45, 0x00, 0x00, /* 0x84-0x87 */
+	0xD8, 0x46, 0xB3, 0xAC, 0x00, 0x00, 0xB6, 0x7D, /* 0x88-0x8B */
+	0xDC, 0x7A, 0xDC, 0x79, 0xB6, 0xA3, 0xB6, 0x7C, /* 0x8C-0x8F */
+	0xDC, 0x7B, 0xB6, 0x7E, 0xB6, 0xA2, 0xB6, 0xA1, /* 0x90-0x93 */
+	0xB6, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xB9, 0x68, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, /* 0x98-0x9B */
+	0xE0, 0xCE, 0x00, 0x00, 0xE0, 0xCF, 0xE0, 0xCD, /* 0x9C-0x9F */
+	0x00, 0x00, 0xBB, 0xD2, 0x00, 0x00, 0xBB, 0xD5, /* 0xA0-0xA3 */
+	0xBB, 0xD7, 0xBB, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xBB, 0xD3, 0xBB, 0xD4, 0x00, 0x00, 0xE8, 0xA7, /* 0xA8-0xAB */
+	0xE8, 0xA6, 0xBE, 0x5B, 0xE8, 0xA8, 0x00, 0x00, /* 0xAC-0xAF */
+	0xE8, 0xA9, 0xBE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xEC, 0x4D, 0xEC, 0x4B, 0xEE, 0xF3, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xEC, 0x49, 0xEC, 0x4A, 0xC0, 0x46, /* 0xB8-0xBB */
+	0xEC, 0x46, 0xEC, 0x4E, 0xEC, 0x48, 0xEC, 0x4C, /* 0xBC-0xBF */
+	0xEE, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xF1, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xEE, 0xF2, 0xC1, 0xF3, 0xEE, 0xEE, /* 0xC4-0xC7 */
+	0xC1, 0xF2, 0xEE, 0xF0, 0xC1, 0xEF, 0xC1, 0xF0, /* 0xC8-0xCB */
+	0xC1, 0xF1, 0xEC, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0xC2, 0xF5, 0xF1, 0x6E, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xD0-0xD3 */
+	0xC2, 0xF3, 0xC2, 0xF6, 0xC2, 0xF4, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0x77, 0xF3, 0x78, /* 0xD8-0xDB */
+	0xC3, 0xF6, 0x00, 0x00, 0xF5, 0x45, 0xF5, 0x47, /* 0xDC-0xDF */
+	0xF5, 0x46, 0xC4, 0xC4, 0xC5, 0x50, 0xF6, 0x6D, /* 0xE0-0xE3 */
+	0xF6, 0x6C, 0xF6, 0x6B, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+};
+
+static const unsigned char u2c_96[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xAA, 0xFA, 0x00, 0x00, 0xC9, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */
+	0xCA, 0x58, 0xA6, 0xE9, 0xCA, 0x56, 0xCA, 0x59, /* 0x20-0x23 */
+	0xCA, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xCB, 0xAE, 0x00, 0x00, 0xA8, 0xC1, 0x00, 0x00, /* 0x28-0x2B */
+	0xA8, 0xC2, 0xCB, 0xB0, 0xA8, 0xBF, 0xCB, 0xAF, /* 0x2C-0x2F */
+	0xCB, 0xAD, 0xA8, 0xC0, 0xA8, 0xBE, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0xCD, 0xD8, 0xCD, 0xDB, 0xAA, 0xFD, /* 0x38-0x3B */
+	0xCD, 0xDA, 0xCD, 0xD9, 0x00, 0x00, 0xAA, 0xFC, /* 0x3C-0x3F */
+	0xAA, 0xFB, 0x00, 0x00, 0xAB, 0x40, 0xCD, 0xDC, /* 0x40-0x43 */
+	0xAA, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xD0, 0xC6, 0xAD, 0xAE, /* 0x48-0x4B */
+	0xAD, 0xAF, 0xAD, 0xB0, 0xD0, 0xC7, 0xD0, 0xC3, /* 0x4C-0x4F */
+	0xAD, 0xAD, 0xD0, 0xC4, 0x00, 0x00, 0xD0, 0xC5, /* 0x50-0x53 */
+	0xD0, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0xB0, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xA1, /* 0x58-0x5B */
+	0xD4, 0x45, 0xB0, 0xA2, 0xB0, 0xA5, 0xD4, 0x46, /* 0x5C-0x5F */
+	0x00, 0x00, 0xB0, 0x7E, 0xB0, 0x7C, 0xB0, 0x7D, /* 0x60-0x63 */
+	0xB0, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xB3, 0xAD, 0xD8, 0x49, /* 0x68-0x6B */
+	0xB3, 0xB5, 0xD8, 0x48, 0x00, 0x00, 0xD8, 0x4B, /* 0x6C-0x6F */
+	0xB3, 0xB1, 0xD8, 0x4A, 0xB6, 0xAB, 0xB3, 0xAF, /* 0x70-0x73 */
+	0xB3, 0xB2, 0xB3, 0xAE, 0xB3, 0xB3, 0xB3, 0xB4, /* 0x74-0x77 */
+	0xB3, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0xD8, 0x47, 0xB6, 0xA7, 0xDC, 0x7D, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xDC, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA2, /* 0x80-0x83 */
+	0xB6, 0xAC, 0xB6, 0xA8, 0xB6, 0xA9, 0xDC, 0x7C, /* 0x84-0x87 */
+	0xDC, 0x7E, 0xDC, 0xA1, 0xB6, 0xA4, 0xB6, 0xA6, /* 0x88-0x8B */
+	0x00, 0x00, 0xB6, 0xAA, 0xB6, 0xA5, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xE0, 0xD3, 0xE0, 0xD1, 0xE0, 0xD2, /* 0x90-0x93 */
+	0xB9, 0x6A, 0xB9, 0x6B, 0x00, 0x00, 0xE0, 0xD4, /* 0x94-0x97 */
+	0xB9, 0x69, 0xBB, 0xD8, 0x00, 0x00, 0xBB, 0xDA, /* 0x98-0x9B */
+	0xBB, 0xD9, 0x00, 0x00, 0xE4, 0xBB, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xE4, 0xBC, 0xE8, 0xAB, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xE8, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x47, /* 0xA4-0xA7 */
+	0xC0, 0x48, 0xEC, 0x4F, 0xC0, 0x49, 0x00, 0x00, /* 0xA8-0xAB */
+	0xEE, 0xF6, 0x00, 0x00, 0xEE, 0xF4, 0x00, 0x00, /* 0xAC-0xAF */
+	0xEE, 0xF5, 0xC1, 0xF4, 0x00, 0x00, 0xF1, 0x6F, /* 0xB0-0xB3 */
+	0xC3, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xC1, 0xF5, 0xAB, 0x41, 0x00, 0x00, 0xB0, 0xA6, /* 0xB8-0xBB */
+	0xD4, 0x47, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x4C, /* 0xBC-0xBF */
+	0xB3, 0xB6, 0xB6, 0xAD, 0xDC, 0xA4, 0xDC, 0xA6, /* 0xC0-0xC3 */
+	0xB6, 0xAF, 0xB6, 0xAE, 0xB6, 0xB0, 0xB6, 0xB1, /* 0xC4-0xC7 */
+	0xDC, 0xA5, 0xB9, 0x6E, 0xB9, 0x6F, 0xB9, 0x6D, /* 0xC8-0xCB */
+	0xBB, 0xDB, 0xB9, 0x6C, 0xE0, 0xD5, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0xBB, 0xDC, 0xE8, 0xAC, /* 0xD0-0xD3 */
+	0xEC, 0x50, 0xC0, 0x4A, 0xC1, 0xF6, 0xF1, 0x70, /* 0xD4-0xD7 */
+	0xF1, 0x74, 0xC2, 0xF9, 0xF1, 0x71, 0xC2, 0xFA, /* 0xD8-0xDB */
+	0xC2, 0xF8, 0xF1, 0x75, 0xC2, 0xFB, 0xF1, 0x73, /* 0xDC-0xDF */
+	0x00, 0x00, 0xF3, 0x79, 0xC2, 0xF7, 0xC3, 0xF8, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xF8, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xAB, 0x42, 0xB3, 0xB8, 0xB3, 0xB7, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xB2, /* 0xEC-0xEF */
+	0xDC, 0xA8, 0xDC, 0xA7, 0xB6, 0xB3, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0xE0, 0xD9, 0xB9, 0x73, 0xB9, 0x70, /* 0xF4-0xF7 */
+	0xE0, 0xD8, 0xB9, 0x72, 0xE0, 0xD6, 0xB9, 0x71, /* 0xF8-0xFB */
+	0x00, 0x00, 0xE0, 0xD7, 0x00, 0x00, 0xE4, 0xBD, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_97[512] = {
+	0xBB, 0xDD, 0x00, 0x00, 0xE8, 0xAF, 0x00, 0x00, /* 0x00-0x03 */
+	0xBE, 0x5D, 0xE8, 0xAD, 0xBE, 0x5E, 0xBE, 0x5F, /* 0x04-0x07 */
+	0xE8, 0xAE, 0xBE, 0x60, 0x00, 0x00, 0xEC, 0x51, /* 0x08-0x0B */
+	0x00, 0x00, 0xC0, 0x4E, 0xC0, 0x4B, 0xC0, 0x50, /* 0x0C-0x0F */
+	0xEC, 0x53, 0xC0, 0x4C, 0xEC, 0x52, 0xC0, 0x4F, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0x4D, 0x00, 0x00, /* 0x14-0x17 */
+	0xEE, 0xF9, 0xEE, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xC1, 0xF7, 0xEE, 0xFA, 0xC1, 0xF8, 0xEE, 0xF8, /* 0x1C-0x1F */
+	0xEE, 0xF7, 0x00, 0x00, 0xF1, 0x77, 0xF1, 0x76, /* 0x20-0x23 */
+	0xC2, 0xFC, 0xF1, 0x78, 0xF3, 0x7E, 0xC3, 0xFA, /* 0x24-0x27 */
+	0xF3, 0x7D, 0xF3, 0x7A, 0xC3, 0xF9, 0xF3, 0x7B, /* 0x28-0x2B */
+	0xF3, 0x7C, 0x00, 0x00, 0xF5, 0x48, 0xF5, 0x49, /* 0x2C-0x2F */
+	0xC4, 0xC5, 0x00, 0x00, 0xC5, 0x53, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xF6, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0xC5, 0x51, 0xC5, 0x52, 0xF6, 0x6F, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xC5, 0xB4, 0xC5, 0xB5, 0xF7, 0x71, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0xC6, 0x45, 0xF8, 0xCF, /* 0x40-0x43 */
+	0xC6, 0x47, 0x00, 0x00, 0xF8, 0xCE, 0xF8, 0xD0, /* 0x44-0x47 */
+	0xC6, 0x46, 0xF9, 0x57, 0x00, 0x00, 0xF9, 0xAD, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xAB, 0x43, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0xB9, 0x74, 0x00, 0x00, /* 0x54-0x57 */
+	0xE4, 0xBE, 0x00, 0x00, 0xE8, 0xB0, 0xC0, 0x51, /* 0x58-0x5B */
+	0xC0, 0x52, 0x00, 0x00, 0xAB, 0x44, 0x00, 0x00, /* 0x5C-0x5F */
+	0xBE, 0x61, 0xC3, 0xFB, 0xAD, 0xB1, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0x53, 0x00, 0x00, /* 0x64-0x67 */
+	0xC5, 0xE2, 0xAD, 0xB2, 0xD8, 0x4D, 0x00, 0x00, /* 0x68-0x6B */
+	0xDC, 0xA9, 0x00, 0x00, 0xDC, 0xAB, 0x00, 0x00, /* 0x6C-0x6F */
+	0xDC, 0xAA, 0x00, 0x00, 0xE0, 0xDD, 0xE0, 0xDA, /* 0x70-0x73 */
+	0xB9, 0x75, 0x00, 0x00, 0xB9, 0x76, 0xE0, 0xDB, /* 0x74-0x77 */
+	0xE0, 0xDC, 0x00, 0x00, 0xE4, 0xC0, 0xE4, 0xC5, /* 0x78-0x7B */
+	0xBB, 0xDE, 0xE4, 0xBF, 0xE4, 0xC1, 0xE4, 0xC8, /* 0x7C-0x7F */
+	
+	0xE4, 0xC3, 0xE4, 0xC7, 0xE4, 0xC4, 0xE4, 0xC2, /* 0x80-0x83 */
+	0xE4, 0xC6, 0xBB, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0xE8, 0xB3, 0x00, 0x00, 0xE8, 0xB1, 0xBE, 0x63, /* 0x88-0x8B */
+	0x00, 0x00, 0xBE, 0x62, 0xE8, 0xB2, 0xBE, 0x64, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0xEC, 0x56, 0x00, 0x00, 0x00, 0x00, 0xEC, 0x55, /* 0x94-0x97 */
+	0xC0, 0x54, 0xEC, 0x54, 0xEE, 0xFC, 0x00, 0x00, /* 0x98-0x9B */
+	0xEE, 0xFE, 0xEF, 0x41, 0xEF, 0x40, 0x00, 0x00, /* 0x9C-0x9F */
+	0xC1, 0xF9, 0xEE, 0xFD, 0xF1, 0xA1, 0xC2, 0xFD, /* 0xA0-0xA3 */
+	0xF1, 0x7D, 0xF1, 0xA2, 0xC2, 0xFE, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xF1, 0x7B, 0x00, 0x00, 0xF1, 0x7E, 0xF1, 0x7C, /* 0xA8-0xAB */
+	0xF1, 0x79, 0xC3, 0x40, 0xF1, 0x7A, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA1, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xF3, 0xA3, 0xF3, 0xA2, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xF5, 0x4A, 0x00, 0x00, 0xF5, 0x4B, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0x70, /* 0xBC-0xBF */
+	0x00, 0x00, 0xC5, 0xB7, 0x00, 0x00, 0xC5, 0xB6, /* 0xC0-0xC3 */
+	0xF8, 0x4F, 0xF8, 0x50, 0xC6, 0x48, 0xF8, 0xD1, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xC6, 0x69, 0x00, 0x00, 0xAD, 0xB3, /* 0xC8-0xCB */
+	0xB6, 0xB4, 0xE4, 0xCA, 0xE4, 0xC9, 0xE8, 0xB5, /* 0xCC-0xCF */
+	0xE8, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xFA, /* 0xD0-0xD3 */
+	0xEF, 0x43, 0xEF, 0x42, 0xF1, 0xA5, 0xF1, 0xA3, /* 0xD4-0xD7 */
+	0xF1, 0xA6, 0xF1, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xC3, 0xFC, 0xF3, 0xA4, 0xF3, 0xA5, 0xF3, 0xA6, /* 0xDC-0xDF */
+	0x00, 0x00, 0xF6, 0x71, 0x00, 0x00, 0xF7, 0x72, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xF8, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xAD, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xEC, 0x57, 0xEF, 0x44, 0x00, 0x00, 0xAD, 0xB5, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xBB, 0xE0, 0x00, 0x00, /* 0xF4-0xF7 */
+	0xEC, 0x58, 0xC3, 0x41, 0xF1, 0xA7, 0xC3, 0xFD, /* 0xF8-0xFB */
+	0x00, 0x00, 0xF5, 0x4C, 0xF5, 0x4D, 0xC5, 0x54, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_98[512] = {
+	0xF8, 0x51, 0xAD, 0xB6, 0xB3, 0xBB, 0xB3, 0xBC, /* 0x00-0x03 */
+	0xD8, 0x4E, 0xB6, 0xB5, 0xB6, 0xB6, 0xDC, 0xAC, /* 0x04-0x07 */
+	0xB6, 0xB7, 0x00, 0x00, 0xB9, 0x7A, 0x00, 0x00, /* 0x08-0x0B */
+	0xB9, 0x7C, 0xE0, 0xDF, 0xE0, 0xE0, 0xE0, 0xDE, /* 0x0C-0x0F */
+	0xB9, 0x77, 0xB9, 0x78, 0xB9, 0x7B, 0xB9, 0x79, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0xE4, 0xCB, 0xBB, 0xE1, /* 0x14-0x17 */
+	0xBB, 0xE2, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xBC, /* 0x18-0x1B */
+	0xBE, 0x67, 0xE8, 0xB7, 0xE8, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */
+	0xE8, 0xBB, 0xBE, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xC0, 0x5B, 0x00, 0x00, 0xE8, 0xB8, 0xE8, 0xBD, /* 0x24-0x27 */
+	0xE8, 0xBA, 0xE8, 0xB9, 0x00, 0x00, 0xBE, 0x66, /* 0x28-0x2B */
+	0x00, 0x00, 0xC0, 0x59, 0x00, 0x00, 0xEC, 0x5A, /* 0x2C-0x2F */
+	0xC0, 0x55, 0x00, 0x00, 0xEC, 0x5B, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0xEC, 0x59, 0x00, 0x00, 0xC0, 0x58, /* 0x34-0x37 */
+	0xC0, 0x56, 0xC0, 0x5A, 0x00, 0x00, 0xC0, 0x57, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0xEF, 0x45, 0x00, 0x00, 0xEF, 0x4A, /* 0x40-0x43 */
+	0xEF, 0x46, 0xEF, 0x49, 0xC1, 0xFB, 0x00, 0x00, /* 0x44-0x47 */
+	0xED, 0xD4, 0xEF, 0x48, 0xEF, 0x47, 0x00, 0x00, /* 0x48-0x4B */
+	0xC3, 0x44, 0xC3, 0x42, 0xC3, 0x45, 0xC3, 0x43, /* 0x4C-0x4F */
+	0xF1, 0xA8, 0xF1, 0xA9, 0xF1, 0xAA, 0xC3, 0x46, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAA, /* 0x54-0x57 */
+	0xC4, 0x40, 0xF3, 0xA8, 0x00, 0x00, 0xC4, 0x41, /* 0x58-0x5B */
+	0xF3, 0xA7, 0xF3, 0xA9, 0xC3, 0xFE, 0xF5, 0x51, /* 0x5C-0x5F */
+	0xF5, 0x4E, 0x00, 0x00, 0xF5, 0x4F, 0xF5, 0x50, /* 0x60-0x63 */
+	0xF6, 0x72, 0xC5, 0x56, 0x00, 0x00, 0xC5, 0x55, /* 0x64-0x67 */
+	0x00, 0x00, 0xF7, 0x74, 0xF7, 0x73, 0xC5, 0xB8, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xE3, /* 0x6C-0x6F */
+	0xC6, 0x49, 0xC6, 0x60, 0xF9, 0x58, 0xF9, 0xAE, /* 0x70-0x73 */
+	0xF9, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xAD, 0xB7, 0xDC, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0xE0, 0xE1, 0xE4, 0xCC, 0xE4, 0xCD, 0xBB, 0xE3, /* 0xAC-0xAF */
+	0x00, 0x00, 0xBB, 0xE4, 0xE8, 0xBE, 0xBE, 0x68, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0xC1, 0xFC, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xF1, 0xAB, 0x00, 0x00, 0xC3, 0x47, 0xF3, 0xAD, /* 0xB8-0xBB */
+	0xC4, 0x42, 0xF3, 0xAC, 0xF3, 0xAE, 0xF3, 0xAB, /* 0xBC-0xBF */
+	0xF6, 0x75, 0xF5, 0x52, 0xF5, 0x53, 0x00, 0x00, /* 0xC0-0xC3 */
+	0xC4, 0xC6, 0x00, 0x00, 0xF6, 0x74, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xF6, 0x73, 0x00, 0x00, 0xF7, 0x75, /* 0xC8-0xCB */
+	0xF9, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xB8, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xB9, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xB0, 0xA7, 0xD4, 0x48, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xD8, 0x4F, 0x00, 0x00, 0xB6, 0xB8, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xB6, 0xBB, 0xB6, 0xB9, 0xDC, 0xAE, /* 0xE8-0xEB */
+	0x00, 0x00, 0xB6, 0xBD, 0x00, 0x00, 0xB6, 0xBA, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xB6, 0xBC, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xB9, 0x7E, 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE0, 0xE3, 0xE8, 0xC0, 0x00, 0x00, /* 0xF8-0xFB */
+	0xB9, 0x7D, 0xB9, 0xA1, 0xB9, 0xA2, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_99[512] = {
+	0xE4, 0xCF, 0x00, 0x00, 0xE4, 0xCE, 0xBB, 0xE5, /* 0x00-0x03 */
+	0x00, 0x00, 0xBB, 0xE6, 0x00, 0x00, 0xE4, 0xD0, /* 0x04-0x07 */
+	0xE8, 0xBF, 0xBB, 0xE8, 0xBE, 0x69, 0x00, 0x00, /* 0x08-0x0B */
+	0xBB, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xC0, 0x5C, 0xE8, 0xC1, 0xBE, 0x6B, 0xBE, 0x6A, /* 0x10-0x13 */
+	0xE8, 0xC2, 0xE8, 0xC5, 0xE8, 0xC3, 0xE8, 0xC4, /* 0x14-0x17 */
+	0xBE, 0x6C, 0x00, 0x00, 0xC0, 0x61, 0xC0, 0x5F, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0x5E, 0xEC, 0x5D, /* 0x1C-0x1F */
+	0x00, 0x00, 0xC0, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0xEC, 0x5C, 0xEF, 0x4B, 0x00, 0x00, 0xEC, 0x5E, /* 0x24-0x27 */
+	0xC0, 0x5D, 0xEC, 0x5F, 0xEF, 0x4E, 0xEF, 0x4C, /* 0x28-0x2B */
+	0xEF, 0x4D, 0xEF, 0x52, 0xC3, 0x4B, 0xEF, 0x51, /* 0x2C-0x2F */
+	0xEF, 0x54, 0xEF, 0x53, 0xEF, 0x50, 0xEF, 0x4F, /* 0x30-0x33 */
+	0x00, 0x00, 0xC1, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xAE, 0x00, 0x00, /* 0x38-0x3B */
+	0xF1, 0xAD, 0xC3, 0x4A, 0xC3, 0x48, 0xC3, 0x49, /* 0x3C-0x3F */
+	0x00, 0x00, 0xF1, 0xAC, 0x00, 0x00, 0xF3, 0xB1, /* 0x40-0x43 */
+	0x00, 0x00, 0xC4, 0x43, 0x00, 0x00, 0xF3, 0xB0, /* 0x44-0x47 */
+	0xF3, 0xAF, 0xC4, 0x44, 0x00, 0x00, 0xF5, 0x58, /* 0x48-0x4B */
+	0xF5, 0x57, 0x00, 0x00, 0xF5, 0x55, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF5, 0x54, 0xC4, 0xC8, 0xC4, 0xC7, 0xF5, 0x59, /* 0x50-0x53 */
+	0xF7, 0x76, 0xC5, 0xB9, 0xF6, 0x77, 0xC5, 0x57, /* 0x54-0x57 */
+	0xF6, 0x76, 0xF5, 0x56, 0x00, 0x00, 0xF7, 0x77, /* 0x58-0x5B */
+	0xC5, 0xE4, 0x00, 0x00, 0xC6, 0x61, 0xF9, 0x59, /* 0x5C-0x5F */
+	0x00, 0x00, 0xF9, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0xAD, 0xBA, 0xD8, 0x50, /* 0x94-0x97 */
+	0xEF, 0x55, 0xAD, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xE4, 0xD2, 0xE4, 0xD1, 0xEC, 0x60, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0xEF, 0x57, 0x00, 0x00, 0xEF, 0x56, /* 0xA0-0xA3 */
+	0x00, 0x00, 0xC3, 0x4C, 0xF3, 0xB2, 0xF3, 0xB3, /* 0xA4-0xA7 */
+	0xC4, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB2, /* 0xA8-0xAB */
+	0xB0, 0xA8, 0xB6, 0xBF, 0xB6, 0xBE, 0xE0, 0xE4, /* 0xAC-0xAF */
+	0xE0, 0xE6, 0xB9, 0xA4, 0xE0, 0xE5, 0xB9, 0xA3, /* 0xB0-0xB3 */
+	0xB9, 0xA5, 0xE0, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0xE4, 0xD4, 0xE4, 0xD6, 0xE4, 0xD5, /* 0xB8-0xBB */
+	0x00, 0x00, 0xE4, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0xBB, 0xE9, 0xE4, 0xD7, 0xE4, 0xD3, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD9, /* 0xC4-0xC7 */
+	0x00, 0x00, 0xE8, 0xCC, 0x00, 0x00, 0xE8, 0xCF, /* 0xC8-0xCB */
+	0xE8, 0xD1, 0xE8, 0xC7, 0xE8, 0xCB, 0xE8, 0xC8, /* 0xCC-0xCF */
+	0xBE, 0x6E, 0xBE, 0x71, 0xBE, 0x73, 0xE8, 0xC9, /* 0xD0-0xD3 */
+	0xE8, 0xCA, 0xBE, 0x72, 0xE8, 0xCD, 0xE8, 0xD0, /* 0xD4-0xD7 */
+	0xE8, 0xCE, 0xBE, 0x74, 0x00, 0x00, 0xBE, 0x70, /* 0xD8-0xDB */
+	0xE8, 0xC6, 0xBE, 0x6D, 0x00, 0x00, 0xBE, 0x6F, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0xC0, 0x63, 0xEC, 0x66, /* 0xE0-0xE3 */
+	0xEC, 0x64, 0xEC, 0x63, 0x00, 0x00, 0xEC, 0x69, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xEC, 0x68, 0xEC, 0x67, 0x00, 0x00, /* 0xE8-0xEB */
+	0xEC, 0x62, 0xC0, 0x62, 0xEC, 0x61, 0x00, 0x00, /* 0xEC-0xEF */
+	0xEC, 0x65, 0xC0, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0xEF, 0x5A, 0x00, 0x00, 0xEF, 0x5E, 0xEF, 0x5B, /* 0xF4-0xF7 */
+	0xEF, 0x5D, 0xEF, 0x5C, 0xEF, 0x59, 0xEF, 0x5F, /* 0xF8-0xFB */
+	0xEF, 0x62, 0xEF, 0x60, 0xEF, 0x61, 0xC2, 0x40, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9A[512] = {
+	0x00, 0x00, 0xC1, 0xFE, 0xEF, 0x58, 0xEF, 0x63, /* 0x00-0x03 */
+	0xF1, 0xB3, 0xF1, 0xB6, 0xF1, 0xB8, 0xF1, 0xB7, /* 0x04-0x07 */
+	0x00, 0x00, 0xF1, 0xB1, 0xF1, 0xB5, 0xF1, 0xB0, /* 0x08-0x0B */
+	0x00, 0x00, 0xF1, 0xB2, 0xC3, 0x4D, 0xF1, 0xAF, /* 0x0C-0x0F */
+	0x00, 0x00, 0xF1, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0xF3, 0xC0, 0xF3, 0xB5, 0xC4, 0x45, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0xC4, 0x46, 0xF3, 0xB4, 0xF3, 0xB9, /* 0x18-0x1B */
+	0xF3, 0xBF, 0xF3, 0xB7, 0xF3, 0xBE, 0x00, 0x00, /* 0x1C-0x1F */
+	0xF3, 0xBB, 0x00, 0x00, 0xF3, 0xBA, 0xF3, 0xBD, /* 0x20-0x23 */
+	0xF3, 0xB8, 0xF3, 0xB6, 0x00, 0x00, 0xF3, 0xBC, /* 0x24-0x27 */
+	0x00, 0x00, 0xF5, 0x60, 0xF5, 0x5E, 0xC4, 0xCA, /* 0x28-0x2B */
+	0xF5, 0x5D, 0xF5, 0x63, 0xF5, 0x61, 0x00, 0x00, /* 0x2C-0x2F */
+	0xC4, 0xCB, 0xF5, 0x5C, 0xF5, 0x5A, 0x00, 0x00, /* 0x30-0x33 */
+	0xF5, 0x5B, 0xC4, 0xCD, 0xF5, 0x5F, 0xC4, 0xCC, /* 0x34-0x37 */
+	0xF5, 0x62, 0xF6, 0x78, 0xF6, 0x7E, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0xF6, 0x79, 0xC5, 0x5B, 0xF6, 0xA1, /* 0x3C-0x3F */
+	0xC5, 0x5A, 0xF6, 0x7D, 0xF6, 0x7C, 0xC5, 0x59, /* 0x40-0x43 */
+	0xF6, 0x7B, 0xC5, 0x58, 0xF6, 0x7A, 0x00, 0x00, /* 0x44-0x47 */
+	0xF7, 0x7D, 0xF7, 0xA1, 0xF7, 0x7E, 0x00, 0x00, /* 0x48-0x4B */
+	0xF7, 0x7B, 0xC5, 0xBB, 0xF7, 0x78, 0xF7, 0x7C, /* 0x4C-0x4F */
+	0xF7, 0xA3, 0x00, 0x00, 0xF7, 0xA2, 0xF7, 0x79, /* 0x50-0x53 */
+	0xF7, 0x7A, 0xC5, 0xBA, 0xF8, 0x52, 0xC5, 0xE7, /* 0x54-0x57 */
+	0x00, 0x00, 0xF8, 0x53, 0xC5, 0xE5, 0xC5, 0xE6, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xD3, 0xC6, 0x4A, /* 0x5C-0x5F */
+	0xF9, 0x76, 0x00, 0x00, 0xC6, 0x6A, 0x00, 0x00, /* 0x60-0x63 */
+	0xF9, 0xB3, 0xC6, 0x6B, 0xF9, 0xB4, 0xF9, 0xB5, /* 0x64-0x67 */
+	0xF9, 0xC3, 0xF9, 0xC2, 0xC6, 0x7A, 0xF9, 0xCD, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xB0, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, /* 0xA8-0xAB */
+	0x00, 0x00, 0xE0, 0xE8, 0x00, 0x00, 0xBB, 0xEA, /* 0xAC-0xAF */
+	0xBB, 0xEB, 0xE4, 0xDA, 0x00, 0x00, 0xE8, 0xD2, /* 0xB0-0xB3 */
+	0xEC, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xBE, 0x75, /* 0xB4-0xB7 */
+	0xC0, 0x65, 0xEC, 0x6A, 0x00, 0x00, 0xEC, 0x6D, /* 0xB8-0xBB */
+	0xC0, 0x66, 0x00, 0x00, 0xEF, 0x64, 0xEC, 0x6B, /* 0xBC-0xBF */
+	0xF1, 0xB9, 0xC3, 0x4E, 0xF3, 0xC1, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0x66, 0xF5, 0x64, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0x65, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0xF6, 0xA2, 0x00, 0x00, 0xC5, 0x5C, /* 0xCC-0xCF */
+	0xF7, 0xA4, 0xC5, 0xEA, 0xC5, 0xBC, 0xC5, 0xE8, /* 0xD0-0xD3 */
+	0xC5, 0xE9, 0xF8, 0xD4, 0xC6, 0x62, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xB0, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0xF1, 0xBA, 0x00, 0x00, 0x00, 0x00, 0xD4, 0x49, /* 0xDC-0xDF */
+	0x00, 0x00, 0xB9, 0xA6, 0x00, 0x00, 0xE4, 0xDB, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0xBB, 0xEC, 0xE4, 0xDC, /* 0xE4-0xE7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD4, /* 0xE8-0xEB */
+	0xE8, 0xD3, 0xC0, 0x68, 0xBE, 0x76, 0xBE, 0x77, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE8, 0xD7, 0xE8, 0xD6, 0xE8, 0xD5, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0x6E, 0xEC, 0x71, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xEC, 0x70, 0xEC, 0x6F, 0xC0, 0x67, /* 0xF8-0xFB */
+	0xEF, 0x68, 0xEF, 0x66, 0xEF, 0x65, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9B[512] = {
+	0x00, 0x00, 0xEF, 0x67, 0x00, 0x00, 0xC3, 0x4F, /* 0x00-0x03 */
+	0xF1, 0xBC, 0xF1, 0xBD, 0xC3, 0x50, 0x00, 0x00, /* 0x04-0x07 */
+	0xF1, 0xBB, 0x00, 0x00, 0xF3, 0xC3, 0xF3, 0xC2, /* 0x08-0x0B */
+	0xF3, 0xC5, 0xC4, 0x47, 0xF3, 0xC4, 0x00, 0x00, /* 0x0C-0x0F */
+	0xF5, 0x67, 0xF5, 0x69, 0xF5, 0x68, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xF6, 0xA3, 0xF6, 0xA6, 0xF6, 0xA4, /* 0x14-0x17 */
+	0xF6, 0xA5, 0xF7, 0xA5, 0xC5, 0xBD, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0x54, 0xF8, 0x55, /* 0x1C-0x1F */
+	0xF8, 0x56, 0x00, 0x00, 0xC6, 0x4B, 0xC6, 0x63, /* 0x20-0x23 */
+	0xF9, 0xB6, 0xB0, 0xAB, 0x00, 0x00, 0xBE, 0x78, /* 0x24-0x27 */
+	0xC0, 0x69, 0xF1, 0xBE, 0x00, 0x00, 0xF7, 0xA6, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xC4, 0xD4, 0x4A, /* 0x2C-0x2F */
+	0x00, 0x00, 0xC6, 0x7B, 0xB0, 0xAC, 0xEC, 0x72, /* 0x30-0x33 */
+	0x00, 0x00, 0xF1, 0xBF, 0x00, 0x00, 0xF3, 0xC6, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0xF6, 0xA7, 0xF7, 0xA7, /* 0x38-0x3B */
+	0xB0, 0xAD, 0x00, 0x00, 0xE4, 0xDD, 0xE4, 0xDE, /* 0x3C-0x3F */
+	0x00, 0x00, 0xBB, 0xED, 0xBB, 0xEE, 0xE8, 0xD9, /* 0x40-0x43 */
+	0xBE, 0x7A, 0xBE, 0x79, 0xE8, 0xD8, 0x00, 0x00, /* 0x44-0x47 */
+	0xEF, 0x69, 0x00, 0x00, 0xF1, 0xC0, 0xF1, 0xC2, /* 0x48-0x4B */
+	0xF1, 0xC1, 0xC3, 0x53, 0xC3, 0x52, 0xC3, 0x51, /* 0x4C-0x4F */
+	0x00, 0x00, 0xC5, 0x5E, 0xF6, 0xA8, 0x00, 0x00, /* 0x50-0x53 */
+	0xC5, 0x5D, 0xF7, 0xA9, 0xF7, 0xA8, 0x00, 0x00, /* 0x54-0x57 */
+	0xC6, 0x4C, 0xF8, 0xD5, 0xB3, 0xBD, 0xE0, 0xEA, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE1, /* 0x5C-0x5F */
+	0xE4, 0xDF, 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xE8, 0xE2, 0x00, 0x00, 0xE8, 0xDD, 0xE8, 0xDA, /* 0x64-0x67 */
+	0xE8, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0xE8, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xBE, 0x7C, /* 0x6C-0x6F */
+	0xE8, 0xE0, 0xE8, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0xE8, 0xDB, 0xE8, 0xDF, 0xE8, 0xDE, 0xBE, 0x7B, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0x7D, 0xEC, 0x78, /* 0x78-0x7B */
+	0xEC, 0x76, 0xEC, 0xA1, 0xEC, 0x77, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0xEC, 0x73, 0x00, 0x00, 0xEC, 0x79, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0xEC, 0x74, 0xEF, 0x72, 0xEC, 0x75, /* 0x84-0x87 */
+	0xEC, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xEC, 0x7C, 0xC0, 0x6A, 0xEC, 0x7B, 0xEC, 0x7A, /* 0x90-0x93 */
+	0x00, 0x00, 0xEC, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0x6A, 0xEF, 0x6D, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0x6C, 0x00, 0x00, /* 0x9C-0x9F */
+	0xEF, 0x74, 0xEF, 0x6F, 0xEF, 0x73, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xEF, 0x71, 0xEF, 0x70, 0xEF, 0x6E, 0x00, 0x00, /* 0xA4-0xA7 */
+	0xEF, 0x6B, 0x00, 0x00, 0xC2, 0x43, 0xC2, 0x42, /* 0xA8-0xAB */
+	0x00, 0x00, 0xC2, 0x44, 0xC2, 0x41, 0xEF, 0x75, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0xF1, 0xC8, 0xF1, 0xCB, 0x00, 0x00, /* 0xB4-0xB7 */
+	0xF1, 0xC9, 0xF1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0xF1, 0xCE, 0x00, 0x00, 0xF1, 0xC6, /* 0xBC-0xBF */
+	0xC3, 0x58, 0xF1, 0xC7, 0x00, 0x00, 0xF1, 0xC5, /* 0xC0-0xC3 */
+	0xF1, 0xCC, 0x00, 0x00, 0xF1, 0xC4, 0xF1, 0xC3, /* 0xC4-0xC7 */
+	0xC3, 0x57, 0xC3, 0x55, 0xC3, 0x54, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCA, /* 0xD0-0xD3 */
+	0xF3, 0xCF, 0xF3, 0xD5, 0xC4, 0x4A, 0xF3, 0xD0, /* 0xD4-0xD7 */
+	0x00, 0x00, 0xF3, 0xD3, 0xF3, 0xD7, 0xC4, 0x4B, /* 0xD8-0xDB */
+	0xF3, 0xD2, 0x00, 0x00, 0xF3, 0xCA, 0x00, 0x00, /* 0xDC-0xDF */
+	0xF3, 0xC9, 0xF3, 0xD6, 0xF3, 0xCD, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xF3, 0xCB, 0xF3, 0xD4, 0xF3, 0xCC, 0xC4, 0x49, /* 0xE4-0xE7 */
+	0xC4, 0x48, 0x00, 0x00, 0xF3, 0xC7, 0xF3, 0xC8, /* 0xE8-0xEB */
+	0xF3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0xF3, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0x6C, /* 0xF4-0xF7 */
+	0xF5, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
+	0x00, 0x00, 0xC3, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9C[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0xF5, 0x6D, 0xF5, 0x73, 0xF5, 0x71, /* 0x04-0x07 */
+	0xF5, 0x6B, 0xF5, 0x76, 0x00, 0x00, 0xF5, 0x6A, /* 0x08-0x0B */
+	0x00, 0x00, 0xC4, 0xCF, 0xF5, 0x72, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0xF5, 0x6E, 0xC4, 0xCE, /* 0x10-0x13 */
+	0xF5, 0x75, 0x00, 0x00, 0x00, 0x00, 0xF5, 0x74, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0xF6, 0xAB, 0xF6, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0xF6, 0xB1, 0x00, 0x00, 0xF6, 0xAD, /* 0x20-0x23 */
+	0xF6, 0xB0, 0xC5, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xF6, 0xAE, 0xF6, 0xAF, 0x00, 0x00, 0xF6, 0xA9, /* 0x28-0x2B */
+	0xF6, 0xAC, 0xC5, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0xC5, 0xBF, 0xF7, 0xB4, 0xF7, 0xAF, /* 0x30-0x33 */
+	0xF7, 0xB3, 0x00, 0x00, 0xF7, 0xB6, 0xF7, 0xB2, /* 0x34-0x37 */
+	0x00, 0x00, 0xF7, 0xAE, 0x00, 0x00, 0xC5, 0xC1, /* 0x38-0x3B */
+	0xF7, 0xB1, 0xF7, 0xB5, 0xC5, 0xC0, 0xF7, 0xAC, /* 0x3C-0x3F */
+	0xF5, 0x70, 0xF7, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0xF7, 0xAD, 0x00, 0x00, 0xF7, 0xAA, 0x00, 0x00, /* 0x44-0x47 */
+	0xF7, 0xAB, 0xC5, 0xBE, 0xF8, 0x5A, 0xF8, 0x5C, /* 0x48-0x4B */
+	0xF8, 0x5F, 0xF8, 0x5B, 0xF8, 0x60, 0x00, 0x00, /* 0x4C-0x4F */
+	0xF8, 0x59, 0x00, 0x00, 0xF8, 0x57, 0x00, 0x00, /* 0x50-0x53 */
+	0xC5, 0xEB, 0xF8, 0x5D, 0xC5, 0xED, 0xC5, 0xEC, /* 0x54-0x57 */
+	0xF8, 0x58, 0xF8, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xDA, 0xC6, 0x4D, /* 0x5C-0x5F */
+	0xF8, 0xDB, 0x00, 0x00, 0xF8, 0xD9, 0xF8, 0xD6, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xD8, 0xF8, 0xD7, /* 0x64-0x67 */
+	0xF9, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0xF9, 0x5C, 0xF9, 0x5B, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0xF9, 0x79, 0x00, 0x00, 0xF9, 0x78, /* 0x70-0x73 */
+	0xF9, 0x77, 0xF9, 0x7A, 0x00, 0x00, 0xC6, 0x73, /* 0x74-0x77 */
+	0xC6, 0x74, 0xF9, 0xCA, 0xF9, 0xCE, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xB3, 0xBE, 0xDC, 0xAF, 0xE0, 0xED, /* 0xE4-0xE7 */
+	0x00, 0x00, 0xB9, 0xA7, 0xE0, 0xEB, 0x00, 0x00, /* 0xE8-0xEB */
+	0x00, 0x00, 0xE0, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0xE4, 0xE2, 0xE4, 0xE3, 0xBB, 0xF1, /* 0xF0-0xF3 */
+	0xBB, 0xEF, 0xE4, 0xE4, 0xBB, 0xF0, 0xE8, 0xE8, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xE8, 0xEB, 0xE8, 0xE5, 0xE8, 0xEC, /* 0xF8-0xFB */
+	0xE8, 0xE4, 0xE8, 0xE6, 0x00, 0x00, 0xE8, 0xE7, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9D[512] = {
+	0xE8, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xBE, 0xA1, /* 0x00-0x03 */
+	0xE8, 0xEF, 0xE8, 0xEE, 0xBE, 0x7D, 0xE8, 0xE9, /* 0x04-0x07 */
+	0xE8, 0xED, 0xBE, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xEC, 0xAC, 0x00, 0x00, 0xC0, 0x6F, 0x00, 0x00, /* 0x10-0x13 */
+	0xEC, 0xA7, 0xC0, 0x6B, 0x00, 0x00, 0xEC, 0xA4, /* 0x14-0x17 */
+	0xEC, 0xAA, 0xEC, 0xAD, 0x00, 0x00, 0xC0, 0x70, /* 0x18-0x1B */
+	0x00, 0x00, 0xEC, 0xA9, 0xEC, 0xA6, 0xEC, 0xAE, /* 0x1C-0x1F */
+	0xEC, 0xA5, 0x00, 0x00, 0xEC, 0xAB, 0xC0, 0x6C, /* 0x20-0x23 */
+	0x00, 0x00, 0xEC, 0xA3, 0xC0, 0x6D, 0x00, 0x00, /* 0x24-0x27 */
+	0xC0, 0x6E, 0xEC, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0xEF, 0xA9, 0xEF, 0x7A, 0xEF, 0x7B, /* 0x2C-0x2F */
+	0xEF, 0x7E, 0xEF, 0x7C, 0x00, 0x00, 0xEF, 0x76, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0xEF, 0x79, 0xEF, 0xA5, /* 0x34-0x37 */
+	0xEF, 0x7D, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x45, /* 0x38-0x3B */
+	0x00, 0x00, 0xEF, 0xA7, 0xEF, 0xA4, 0xC2, 0x46, /* 0x3C-0x3F */
+	0xEF, 0xA6, 0xEF, 0x77, 0xEF, 0xA2, 0xEF, 0xA3, /* 0x40-0x43 */
+	0x00, 0x00, 0xEF, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xF1, 0xD2, 0xF1, 0xD4, /* 0x48-0x4B */
+	0xF1, 0xD7, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD1, /* 0x4C-0x4F */
+	0x00, 0x00, 0xC3, 0x59, 0xF1, 0xD9, 0xF1, 0xD0, /* 0x50-0x53 */
+	0xF1, 0xDA, 0x00, 0x00, 0xF1, 0xD6, 0xF1, 0xD8, /* 0x54-0x57 */
+	0xF1, 0xDC, 0xF1, 0xD5, 0xF1, 0xDD, 0xF1, 0xD3, /* 0x58-0x5B */
+	0xF1, 0xCF, 0xC3, 0x5A, 0x00, 0x00, 0xF1, 0xDB, /* 0x5C-0x5F */
+	0xC3, 0x5B, 0xC4, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0x78, /* 0x64-0x67 */
+	0xF3, 0xF1, 0xF3, 0xE8, 0xC4, 0x4F, 0xF3, 0xE4, /* 0x68-0x6B */
+	0xC4, 0x50, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xED, /* 0x6C-0x6F */
+	0xF3, 0xE7, 0xF3, 0xDD, 0xC4, 0x4E, 0xF3, 0xEA, /* 0x70-0x73 */
+	0xF3, 0xE5, 0xF3, 0xE6, 0x00, 0x00, 0xF3, 0xD8, /* 0x74-0x77 */
+	0xF3, 0xDF, 0xF3, 0xEE, 0x00, 0x00, 0xF3, 0xEB, /* 0x78-0x7B */
+	0x00, 0x00, 0xF3, 0xE3, 0x00, 0x00, 0xF3, 0xEF, /* 0x7C-0x7F */
+	
+	0xF3, 0xDE, 0xF3, 0xD9, 0xF3, 0xEC, 0x00, 0x00, /* 0x80-0x83 */
+	0xF3, 0xDB, 0xF3, 0xE9, 0xF3, 0xE0, 0xF3, 0xF0, /* 0x84-0x87 */
+	0xF3, 0xDC, 0xC4, 0x4C, 0xF3, 0xDA, 0xF3, 0xE1, /* 0x88-0x8B */
+	0xF3, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xF5, 0x7D, 0x00, 0x00, 0xF5, 0x7B, 0x00, 0x00, /* 0x90-0x93 */
+	0xF5, 0xA2, 0x00, 0x00, 0xF5, 0xAE, 0xF5, 0xA5, /* 0x94-0x97 */
+	0xF5, 0x7C, 0xF5, 0x78, 0xF5, 0xA7, 0xF5, 0x7E, /* 0x98-0x9B */
+	0xF5, 0xA3, 0xF5, 0x7A, 0xF5, 0xAA, 0xF5, 0x77, /* 0x9C-0x9F */
+	0xF5, 0xA1, 0xF5, 0xA6, 0xF5, 0xA8, 0xF5, 0xAB, /* 0xA0-0xA3 */
+	0xF5, 0x79, 0x00, 0x00, 0xF5, 0xAF, 0xF5, 0xB0, /* 0xA4-0xA7 */
+	0xF5, 0xA9, 0xF5, 0xAD, 0xF5, 0xA4, 0x00, 0x00, /* 0xA8-0xAB */
+	0xF6, 0xC1, 0xF6, 0xC4, 0x00, 0x00, 0xC5, 0x61, /* 0xAC-0xAF */
+	0x00, 0x00, 0xF6, 0xC3, 0xF6, 0xC8, 0xF6, 0xC6, /* 0xB0-0xB3 */
+	0xC5, 0x62, 0xF6, 0xBD, 0xF6, 0xB3, 0xF6, 0xB2, /* 0xB4-0xB7 */
+	0xC5, 0x64, 0xF6, 0xBF, 0xF6, 0xC0, 0xF6, 0xBC, /* 0xB8-0xBB */
+	0xF6, 0xB4, 0x00, 0x00, 0xF6, 0xB9, 0xF5, 0xAC, /* 0xBC-0xBF */
+	0x00, 0x00, 0xF6, 0xB5, 0xC5, 0x63, 0xF6, 0xBB, /* 0xC0-0xC3 */
+	0x00, 0x00, 0xF6, 0xBA, 0x00, 0x00, 0xF6, 0xB6, /* 0xC4-0xC7 */
+	0xF6, 0xC2, 0x00, 0x00, 0xF6, 0xB7, 0xF7, 0xBB, /* 0xC8-0xCB */
+	0xF6, 0xC5, 0xF6, 0xC7, 0xF6, 0xBE, 0xF6, 0xB8, /* 0xCC-0xCF */
+	0xF7, 0xBC, 0xF7, 0xBE, 0xF7, 0xB8, 0xC5, 0xC2, /* 0xD0-0xD3 */
+	0x00, 0x00, 0xF7, 0xC5, 0xF7, 0xC3, 0xC5, 0xC3, /* 0xD4-0xD7 */
+	0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xBA, 0xF7, 0xB7, /* 0xD8-0xDB */
+	0xF7, 0xBD, 0xF7, 0xC6, 0xF7, 0xB9, 0xF7, 0xBF, /* 0xDC-0xDF */
+	0x00, 0x00, 0xF8, 0x69, 0xF8, 0x6E, 0xF8, 0x64, /* 0xE0-0xE3 */
+	0xF8, 0x67, 0xC5, 0xEE, 0xF8, 0x6B, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xF8, 0x72, 0xF7, 0xC0, 0x00, 0x00, 0xF8, 0x65, /* 0xE8-0xEB */
+	0xF8, 0x6F, 0xF8, 0x73, 0xF8, 0x6A, 0xF8, 0x63, /* 0xEC-0xEF */
+	0xF8, 0x6D, 0x00, 0x00, 0xF8, 0x6C, 0xF8, 0x71, /* 0xF0-0xF3 */
+	0xF8, 0x70, 0xF7, 0xC4, 0xF8, 0x68, 0xF8, 0x62, /* 0xF4-0xF7 */
+	0xF8, 0x66, 0xC6, 0x4E, 0xC6, 0x4F, 0xF8, 0x61, /* 0xF8-0xFB */
+	0x00, 0x00, 0xF8, 0xE6, 0xF8, 0xDD, 0xF8, 0xE5, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9E[512] = {
+	0xF8, 0xE2, 0xF8, 0xE3, 0xF8, 0xDC, 0xF8, 0xDF, /* 0x00-0x03 */
+	0xF8, 0xE7, 0xF8, 0xE1, 0xF8, 0xE0, 0xF8, 0xDE, /* 0x04-0x07 */
+	0x00, 0x00, 0xF8, 0xE4, 0x00, 0x00, 0xF9, 0x5D, /* 0x08-0x0B */
+	0x00, 0x00, 0xF9, 0x5E, 0x00, 0x00, 0xF9, 0x60, /* 0x0C-0x0F */
+	0xF9, 0x5F, 0xF9, 0x62, 0xF9, 0x61, 0xF9, 0x7C, /* 0x10-0x13 */
+	0xF9, 0x7B, 0xF9, 0xB7, 0x00, 0x00, 0xF9, 0xB8, /* 0x14-0x17 */
+	0x00, 0x00, 0xF9, 0xC5, 0xC6, 0x78, 0xC6, 0x7C, /* 0x18-0x1B */
+	0x00, 0x00, 0xF9, 0xCF, 0xC6, 0x7D, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0xB3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0xC4, 0xD0, 0xF6, 0xC9, 0x00, 0x00, /* 0x78-0x7B */
+	0xC6, 0x50, 0xC6, 0x51, 0x00, 0x00, 0xB3, 0xC0, /* 0x7C-0x7F */
+	
+	0xE0, 0xEE, 0x00, 0x00, 0xB9, 0xA8, 0xE8, 0xF0, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0xEC, 0xB0, 0xEC, 0xB1, /* 0x84-0x87 */
+	0xEC, 0xAF, 0xEF, 0xAB, 0xEF, 0xAA, 0xC2, 0x47, /* 0x88-0x8B */
+	0xF1, 0xDF, 0xEF, 0xAC, 0xF1, 0xDE, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0xF3, 0xF3, 0xC4, 0x51, 0xC4, 0x53, /* 0x90-0x93 */
+	0xF3, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x52, /* 0x94-0x97 */
+	0x00, 0x00, 0xF5, 0xB1, 0xF5, 0xB3, 0xF5, 0xB2, /* 0x98-0x9B */
+	0xF6, 0xCA, 0xC5, 0x65, 0x00, 0x00, 0xC5, 0xEF, /* 0x9C-0x9F */
+	0xF8, 0xE8, 0xF9, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xF9, 0xD2, 0xB3, 0xC1, 0x00, 0x00, 0xE4, 0xE5, /* 0xA4-0xA7 */
+	0x00, 0x00, 0xBE, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0xEC, 0xB3, 0xEC, 0xB2, 0x00, 0x00, /* 0xAC-0xAF */
+	0xEF, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0xC4, 0x54, 0xC4, 0xD1, 0xF7, 0xC7, 0xF9, 0xCB, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xC2, /* 0xB8-0xBB */
+	0xBB, 0xF2, 0x00, 0x00, 0xBE, 0xA3, 0x00, 0x00, /* 0xBC-0xBF */
+	0xF3, 0xF4, 0x00, 0x00, 0xF8, 0x74, 0xB6, 0xC0, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0xEF, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0xC6, 0x64, 0xB6, 0xC1, 0xBE, 0xA4, 0xC2, 0x48, /* 0xCC-0xCF */
+	0xF8, 0x75, 0xB6, 0xC2, 0x00, 0x00, 0xE8, 0xF1, /* 0xD0-0xD3 */
+	0xC0, 0x72, 0xEC, 0xB4, 0xEC, 0xB5, 0x00, 0x00, /* 0xD4-0xD7 */
+	0xC0, 0x71, 0x00, 0x00, 0xEF, 0xAF, 0xC2, 0x4C, /* 0xD8-0xDB */
+	0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x49, 0xF1, 0xE0, /* 0xDC-0xDF */
+	0xC3, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0xF5, 0xB5, 0xF5, 0xB4, 0xF5, 0xB7, 0xF5, 0xB6, /* 0xE4-0xE7 */
+	0xC4, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCB, /* 0xE8-0xEB */
+	0x00, 0x00, 0xF6, 0xCD, 0xF6, 0xCC, 0xC5, 0x66, /* 0xEC-0xEF */
+	0xF7, 0xC8, 0x00, 0x00, 0xF8, 0x76, 0xF8, 0x77, /* 0xF0-0xF3 */
+	0xC5, 0xF0, 0xF9, 0x64, 0xF9, 0x7D, 0xC6, 0x75, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xDC, 0xB0, 0xEC, 0xB6, 0xEF, 0xB0, /* 0xF8-0xFB */
+	0xF3, 0xF5, 0xE0, 0xEF, 0x00, 0x00, 0xEF, 0xB1, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_9F[512] = {
+	0xF1, 0xE2, 0xF1, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0x78, 0xC6, 0x52, /* 0x04-0x07 */
+	0x00, 0x00, 0xF9, 0x65, 0xF9, 0x7E, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0xB9, 0xA9, 0xE8, 0xF2, /* 0x0C-0x0F */
+	0xE8, 0xF3, 0x00, 0x00, 0xEC, 0xB7, 0xB9, 0xAA, /* 0x10-0x13 */
+	0x00, 0x00, 0xC3, 0x5D, 0xF1, 0xE3, 0x00, 0x00, /* 0x14-0x17 */
+	0xF6, 0xCF, 0xC5, 0x67, 0xF6, 0xD0, 0xF6, 0xCE, /* 0x18-0x1B */
+	0xF8, 0x79, 0x00, 0x00, 0xF8, 0xE9, 0x00, 0x00, /* 0x1C-0x1F */
+	0xB9, 0xAB, 0x00, 0x00, 0xEF, 0xB4, 0xEF, 0xB3, /* 0x20-0x23 */
+	0xEF, 0xB2, 0xF1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0xF1, 0xE8, 0xF1, 0xE7, 0xF1, 0xE6, 0xF1, 0xE5, /* 0x28-0x2B */
+	0xC3, 0x5E, 0xF3, 0xF6, 0xF5, 0xB9, 0xC4, 0xD3, /* 0x2C-0x2F */
+	0xF5, 0xB8, 0xF6, 0xD1, 0xF7, 0xCB, 0xF7, 0xCA, /* 0x30-0x33 */
+	0xC5, 0xC4, 0xF7, 0xC9, 0xF8, 0x7C, 0xF8, 0x7B, /* 0x34-0x37 */
+	0xF8, 0x7A, 0x00, 0x00, 0x00, 0x00, 0xBB, 0xF3, /* 0x38-0x3B */
+	0x00, 0x00, 0xEC, 0xB8, 0xC2, 0x4D, 0x00, 0x00, /* 0x3C-0x3F */
+	0xF3, 0xF7, 0xF3, 0xF8, 0xF7, 0xCC, 0xF8, 0x7D, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0xF8, 0xEA, 0xF9, 0x66, /* 0x44-0x47 */
+	0xF9, 0xB9, 0xF9, 0xD4, 0xBB, 0xF4, 0xC2, 0x4E, /* 0x48-0x4B */
+	0xF1, 0xE9, 0xF3, 0xF9, 0xF6, 0xD2, 0xF8, 0x7E, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0xBE, 0xA6, 0x00, 0x00, /* 0x50-0x53 */
+	0xEF, 0xB5, 0xF1, 0xEA, 0xF3, 0xFA, 0xF3, 0xFB, /* 0x54-0x57 */
+	0xF3, 0xFC, 0xF5, 0xBE, 0x00, 0x00, 0xF5, 0xBA, /* 0x58-0x5B */
+	0xC5, 0x68, 0xF5, 0xBD, 0xF5, 0xBC, 0xC4, 0xD4, /* 0x5C-0x5F */
+	0xF5, 0xBB, 0xC4, 0xD6, 0x00, 0x00, 0xC4, 0xD5, /* 0x60-0x63 */
+	0xF6, 0xD4, 0xF6, 0xD3, 0xC5, 0x69, 0xC5, 0x6A, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xC5, 0xC6, 0xF7, 0xCD, /* 0x68-0x6B */
+	0xC5, 0xC5, 0x00, 0x00, 0xF8, 0xA3, 0xF8, 0xA4, /* 0x6C-0x6F */
+	0xF8, 0xA2, 0xF8, 0xA1, 0xC6, 0x54, 0x00, 0x00, /* 0x70-0x73 */
+	0xF8, 0xEB, 0xF8, 0xEC, 0xF8, 0xED, 0xC6, 0x53, /* 0x74-0x77 */
+	0xF9, 0x67, 0xF9, 0x6A, 0xF9, 0x69, 0xF9, 0x68, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0xF9, 0xD3, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0xC0, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0xC3, 0x65, 0xF5, 0xBF, 0xF6, 0xD5, 0x00, 0x00, /* 0x90-0x93 */
+	0xC5, 0xC7, 0xF7, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0xF9, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0xC0, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0xEF, 0xB6, 0x00, 0x00, 0xF7, 0xCF, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xF9, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+};
+
+static const unsigned char u2c_DC[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+};
+
+static const unsigned char u2c_F9[512] = {
+	0xB0, 0x5A, 0xA7, 0xF3, 0xA8, 0xAE, 0xB8, 0xEB, /* 0x00-0x03 */
+	0xB7, 0xC6, 0xA6, 0xEA, 0xA5, 0x79, 0xC0, 0x74, /* 0x04-0x07 */
+	0xC0, 0x74, 0xAB, 0xB4, 0xAA, 0xF7, 0xB3, 0xE2, /* 0x08-0x0B */
+	0xA9, 0x60, 0xC3, 0x69, 0xC4, 0xEE, 0xC3, 0xB9, /* 0x0C-0x0F */
+	0xC5, 0xDA, 0xC1, 0xB3, 0xBB, 0x72, 0xC5, 0xDE, /* 0x10-0x13 */
+	0xBC, 0xD6, 0xAC, 0xA5, 0xAF, 0x4F, 0xAF, 0x5F, /* 0x14-0x17 */
+	0xB8, 0xA8, 0xB9, 0x54, 0xC0, 0x64, 0xB6, 0xC3, /* 0x18-0x1B */
+	0xA7, 0x5A, 0xC4, 0xE6, 0xC4, 0xEA, 0xC4, 0xF5, /* 0x1C-0x1F */
+	0xC6, 0x7D, 0xB4, 0x50, 0xC0, 0xDD, 0xC2, 0xC5, /* 0x20-0x23 */
+	0xC4, 0xB0, 0xA9, 0xD4, 0xC3, 0xBE, 0xC4, 0xFA, /* 0x24-0x27 */
+	0xB4, 0x59, 0xAE, 0xD4, 0xAE, 0xF6, 0xAF, 0x54, /* 0x28-0x2B */
+	0x00, 0x00, 0xA8, 0xD3, 0xA7, 0x4E, 0xB3, 0xD2, /* 0x2C-0x2F */
+	0xBE, 0xDB, 0xC3, 0x72, 0xC4, 0x6C, 0xBF, 0x63, /* 0x30-0x33 */
+	0xA6, 0xD1, 0xC4, 0xAA, 0xB8, 0xB8, 0xB8, 0xF4, /* 0x34-0x37 */
+	0xC5, 0x53, 0xBE, 0x7C, 0xC6, 0x4F, 0xB8, 0x4C, /* 0x38-0x3B */
+	0xB8, 0x53, 0xBA, 0xF1, 0xDB, 0x77, 0xBF, 0xFD, /* 0x3C-0x3F */
+	0xB3, 0xC0, 0xBD, 0xD7, 0xC3, 0x62, 0xA7, 0xCB, /* 0x40-0x43 */
+	0xC5, 0xA2, 0xC5, 0xA4, 0xA8, 0x63, 0xBD, 0x55, /* 0x44-0x47 */
+	0xB8, 0xEF, 0xB9, 0x70, 0xC2, 0x53, 0xB9, 0xF0, /* 0x48-0x4B */
+	0xBC, 0xD3, 0xB2, 0x5C, 0xBA, 0x7C, 0xB2, 0xD6, /* 0x4C-0x4F */
+	0xC1, 0x5C, 0xAD, 0xAE, 0xB0, 0xC7, 0xA6, 0xD8, /* 0x50-0x53 */
+	0xBB, 0xFE, 0xAD, 0xE2, 0xB8, 0x57, 0xBA, 0xF0, /* 0x54-0x57 */
+	0xB5, 0xD9, 0xB3, 0xAE, 0xC5, 0xAA, 0xCE, 0xD4, /* 0x58-0x5B */
+	0xBC, 0xD6, 0xBF, 0xD5, 0xA4, 0xA6, 0xB9, 0xE7, /* 0x5C-0x5F */
+	0xAB, 0xE3, 0xB2, 0x76, 0xB2, 0xA7, 0xA5, 0x5F, /* 0x60-0x63 */
+	0xED, 0xA8, 0xAB, 0x4B, 0xB4, 0x5F, 0xA4, 0xA3, /* 0x64-0x67 */
+	0xAA, 0x63, 0xBC, 0xC6, 0xAF, 0xC1, 0xB0, 0xD1, /* 0x68-0x6B */
+	0xB6, 0xEB, 0xAC, 0xD9, 0xB8, 0xAD, 0xBB, 0xA1, /* 0x6C-0x6F */
+	0xB1, 0xFE, 0xA8, 0xB0, 0xA8, 0x48, 0xAC, 0x42, /* 0x70-0x73 */
+	0xAD, 0x59, 0xB1, 0xB0, 0xB2, 0xA4, 0xAB, 0x47, /* 0x74-0x77 */
+	0xA8, 0xE2, 0x00, 0x00, 0xB1, 0xE7, 0xC2, 0xB3, /* 0x78-0x7B */
+	0xA8, 0x7D, 0xBD, 0xCC, 0xB6, 0x71, 0xC0, 0x79, /* 0x7C-0x7F */
+	
+	0xA7, 0x66, 0xA4, 0x6B, 0xC3, 0x66, 0xAE, 0xC8, /* 0x80-0x83 */
+	0xC2, 0x6F, 0xC4, 0x72, 0xBE, 0x5B, 0xC6, 0x7A, /* 0x84-0x87 */
+	0xC4, 0x52, 0xBE, 0xA4, 0xA4, 0x4F, 0xBE, 0xE4, /* 0x88-0x8B */
+	0xBE, 0xFA, 0xF7, 0x65, 0xA6, 0x7E, 0xBC, 0xA6, /* 0x8C-0x8F */
+	0xC5, 0xCA, 0xBC, 0xBF, 0xBA, 0xA7, 0xB7, 0xD2, /* 0x90-0x93 */
+	0xE6, 0xA3, 0x00, 0x00, 0xBD, 0x6D, 0xC1, 0x70, /* 0x94-0x97 */
+	0xBD, 0xFB, 0xBD, 0xAC, 0xB3, 0x73, 0xC1, 0xE5, /* 0x98-0x9B */
+	0xA6, 0x43, 0xA6, 0x48, 0xAB, 0x7C, 0xAF, 0x50, /* 0x9C-0x9F */
+	0xB5, 0xF5, 0xBB, 0xA1, 0xB7, 0x47, 0xA9, 0xC0, /* 0xA0-0xA3 */
+	0xB1, 0xC9, 0xC0, 0xD4, 0xC3, 0xAE, 0xC2, 0x79, /* 0xA4-0xA7 */
+	0xA5, 0x4F, 0xCB, 0xF1, 0xB9, 0xE7, 0xC0, 0xAD, /* 0xA8-0xAB */
+	0xCC, 0xB0, 0xAC, 0xC2, 0xBC, 0xFC, 0xB2, 0xDC, /* 0xAC-0xAF */
+	0xB2, 0xE2, 0xB9, 0x61, 0xB9, 0x73, 0xC6, 0x46, /* 0xB0-0xB3 */
+	0xBB, 0xE2, 0xA8, 0xD2, 0xC2, 0xA7, 0xC4, 0xBF, /* 0xB4-0xB7 */
+	0xC1, 0xF5, 0xB4, 0x63, 0xA4, 0x46, 0xB9, 0xB1, /* 0xB8-0xBB */
+	0xBC, 0x64, 0xA7, 0xBF, 0xAE, 0xC6, 0xBC, 0xD6, /* 0xBC-0xBF */
+	0xBF, 0x52, 0xC0, 0xF8, 0xE7, 0x64, 0xBF, 0xF1, /* 0xC0-0xC3 */
+	0xC0, 0x73, 0xB7, 0x77, 0xA8, 0xBF, 0xBC, 0x42, /* 0xC4-0xC7 */
+	0xCC, 0xD8, 0xAC, 0x68, 0xAC, 0x79, 0xB7, 0xC8, /* 0xC8-0xCB */
+	0xAF, 0x5B, 0xAF, 0x64, 0xB2, 0xB8, 0xAF, 0xC3, /* 0xCC-0xCF */
+	0xC3, 0xFE, 0xA4, 0xBB, 0xBC, 0xAE, 0xB3, 0xB0, /* 0xD0-0xD3 */
+	0xAD, 0xDB, 0xB1, 0x5B, 0xB2, 0x5F, 0xBD, 0xFC, /* 0xD4-0xD7 */
+	0xAB, 0xDF, 0xB7, 0x58, 0xAE, 0xDF, 0xB2, 0x76, /* 0xD8-0xDB */
+	0xB6, 0xA9, 0xA7, 0x51, 0xA6, 0x4F, 0xBC, 0x69, /* 0xDC-0xDF */
+	0xA9, 0xF6, 0xA7, 0xF5, 0xB1, 0xF9, 0xAA, 0x64, /* 0xE0-0xE3 */
+	0xB2, 0x7A, 0xB5, 0x67, 0xBF, 0xA9, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xB8, 0xCC, 0xA8, 0xBD, 0xC2, 0xF7, 0xB0, 0xCE, /* 0xE8-0xEB */
+	0xB7, 0xC4, 0xA7, 0x5B, 0xBF, 0x4D, 0xBF, 0x5A, /* 0xEC-0xEF */
+	0xC4, 0xA9, 0x00, 0x00, 0xC5, 0xEC, 0xC5, 0xEF, /* 0xF0-0xF3 */
+	0xAA, 0x4C, 0xB2, 0x4F, 0xC1, 0x7B, 0xA5, 0xDF, /* 0xF4-0xF7 */
+	0xB2, 0xC1, 0xB2, 0xC9, 0xAA, 0xAC, 0xAA, 0xA5, /* 0xF8-0xFB */
+	0xC3, 0xD1, 0xA4, 0xB0, 0xAF, 0xF9, 0xA8, 0xEB, /* 0xFC-0xFF */
+};
+
+static const unsigned char u2c_FA[512] = {
+	0xA4, 0xC1, 0xAB, 0xD7, 0xA9, 0xDD, 0xBF, 0x7D, /* 0x00-0x03 */
+	0xA6, 0x76, 0xAC, 0x7D, 0xBC, 0xC9, 0xBF, 0xE7, /* 0x04-0x07 */
+	0xA6, 0xE6, 0xAD, 0xB0, 0xA8, 0xA3, 0xB9, 0xF8, /* 0x08-0x0B */
+	0xC9, 0x4A, 0xDD, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0xB6, 0xEF, 0x00, 0x00, 0xB4, 0xB8, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0xE8, 0xF9, 0xBD, 0xDE, 0xAF, 0x71, /* 0x14-0x17 */
+	0x00, 0x00, 0xAF, 0xAB, 0xB2, 0xBB, 0xBA, 0xD6, /* 0x18-0x1B */
+	0xB9, 0x74, 0xBA, 0xEB, 0xA6, 0xD0, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0xBD, 0xD1, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0xB6, 0x68, 0xB3, 0xA3, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0xB6, 0xBA, 0xB9, 0x7D, /* 0x28-0x2B */
+	0xC0, 0x5D, 0xC5, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+};
+
+static const unsigned char u2c_FE[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0xA1, 0x4A, 0xA1, 0x57, 0x00, 0x00, 0xA1, 0x59, /* 0x30-0x33 */
+	0xA1, 0x5B, 0xA1, 0x5F, 0xA1, 0x60, 0xA1, 0x63, /* 0x34-0x37 */
+	0xA1, 0x64, 0xA1, 0x67, 0xA1, 0x68, 0xA1, 0x6B, /* 0x38-0x3B */
+	0xA1, 0x6C, 0xA1, 0x6F, 0xA1, 0x70, 0xA1, 0x73, /* 0x3C-0x3F */
+	0xA1, 0x74, 0xA1, 0x77, 0xA1, 0x78, 0xA1, 0x7B, /* 0x40-0x43 */
+	0xA1, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0xA1, 0xC6, 0xA1, 0xC7, 0xA1, 0xCA, /* 0x48-0x4B */
+	0xA1, 0xCB, 0xA1, 0xC8, 0xA1, 0xC9, 0xA1, 0x5C, /* 0x4C-0x4F */
+	0xA1, 0x4D, 0xA1, 0x4E, 0xA1, 0x4F, 0x00, 0x00, /* 0x50-0x53 */
+	0xA1, 0x51, 0xA1, 0x52, 0xA1, 0x53, 0xA1, 0x54, /* 0x54-0x57 */
+	0x00, 0x00, 0xA1, 0x7D, 0xA1, 0x7E, 0xA1, 0xA1, /* 0x58-0x5B */
+	0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA4, 0xA1, 0xCC, /* 0x5C-0x5F */
+	0xA1, 0xCD, 0xA1, 0xCE, 0xA1, 0xDE, 0xA1, 0xDF, /* 0x60-0x63 */
+	0xA1, 0xE0, 0xA1, 0xE1, 0xA1, 0xE2, 0x00, 0x00, /* 0x64-0x67 */
+	0xA2, 0x42, 0xA2, 0x4C, 0xA2, 0x4D, 0xA2, 0x4E, /* 0x68-0x6B */
+};
+
+static const unsigned char u2c_FF[512] = {
+	0x00, 0x00, 0xA1, 0x49, 0xA1, 0xA8, 0xA1, 0xAD, /* 0x00-0x03 */
+	0xA2, 0x43, 0xA2, 0x48, 0xA1, 0xAE, 0xA1, 0xA6, /* 0x04-0x07 */
+	0xA1, 0x5D, 0xA1, 0x5E, 0xA1, 0xAF, 0xA1, 0xCF, /* 0x08-0x0B */
+	0xA1, 0x41, 0xA1, 0xD0, 0xA1, 0x44, 0xA1, 0xFE, /* 0x0C-0x0F */
+	0xA2, 0xAF, 0xA2, 0xB0, 0xA2, 0xB1, 0xA2, 0xB2, /* 0x10-0x13 */
+	0xA2, 0xB3, 0xA2, 0xB4, 0xA2, 0xB5, 0xA2, 0xB6, /* 0x14-0x17 */
+	0xA2, 0xB7, 0xA2, 0xB8, 0xA1, 0x47, 0xA1, 0x46, /* 0x18-0x1B */
+	0xA1, 0xD5, 0xA1, 0xD7, 0xA1, 0xD6, 0xA1, 0x48, /* 0x1C-0x1F */
+	0xA2, 0x49, 0xA2, 0xCF, 0xA2, 0xD0, 0xA2, 0xD1, /* 0x20-0x23 */
+	0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, 0xA2, 0xD5, /* 0x24-0x27 */
+	0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, 0xA2, 0xD9, /* 0x28-0x2B */
+	0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, 0xA2, 0xDD, /* 0x2C-0x2F */
+	0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, 0xA2, 0xE1, /* 0x30-0x33 */
+	0xA2, 0xE2, 0xA2, 0xE3, 0xA2, 0xE4, 0xA2, 0xE5, /* 0x34-0x37 */
+	0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, 0xA1, 0x65, /* 0x38-0x3B */
+	0xA2, 0x40, 0xA1, 0x66, 0xA1, 0x73, 0xA1, 0xC4, /* 0x3C-0x3F */
+	0xA1, 0xA5, 0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, /* 0x40-0x43 */
+	0xA2, 0xEC, 0xA2, 0xED, 0xA2, 0xEE, 0xA2, 0xEF, /* 0x44-0x47 */
+	0xA2, 0xF0, 0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, /* 0x48-0x4B */
+	0xA2, 0xF4, 0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, /* 0x4C-0x4F */
+	0xA2, 0xF8, 0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, /* 0x50-0x53 */
+	0xA2, 0xFC, 0xA2, 0xFD, 0xA2, 0xFE, 0xA3, 0x40, /* 0x54-0x57 */
+	0xA3, 0x41, 0xA3, 0x42, 0xA3, 0x43, 0xA1, 0x61, /* 0x58-0x5B */
+	0xA1, 0x55, 0xA1, 0x62, 0xA1, 0xE3, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0xA1, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xA2, 0x46, 0xA2, 0x47, 0x00, 0x00, 0xA1, 0xC3, /* 0xE0-0xE3 */
+	0x00, 0x00, 0xA2, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	NULL,   NULL,   u2c_02, u2c_03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	u2c_20, u2c_21, u2c_22, u2c_23, NULL,   u2c_25, u2c_26, NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	u2c_30, u2c_31, u2c_32, u2c_33, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   u2c_4E, u2c_4F, 
+	u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, 
+	u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, 
+	u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, 
+	u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, 
+	u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, 
+	u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, 
+	u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, 
+	u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, 
+	u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, 
+	u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, 
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   u2c_DC, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   u2c_F9, u2c_FA, NULL,   NULL,   NULL,   u2c_FE, u2c_FF, };
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(const wchar_t uni,
+			unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni&0xFF;
+	unsigned char ch = (uni>>8)&0xFF;
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset) {
+		if (boundlen <= 1)
+			return -ENAMETOOLONG;
+		out[0] = uni2charset[cl*2];
+		out[1] = uni2charset[cl*2+1];
+		if (out[0] == 0x00 && out[1] == 0x00)
+			return -EINVAL;
+		n = 2;
+	} else if (ch==0 && cl) {
+		out[0] = cl;
+		n = 1;
+	}
+	else
+		return -EINVAL;
+
+	return n;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen,
+			wchar_t *uni)
+{
+	unsigned char ch, cl;
+	const wchar_t *charset2uni;
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (boundlen == 1) {
+		*uni = rawstring[0];
+		return 1;
+	}
+
+	ch = rawstring[0];
+	cl = rawstring[1];
+
+	charset2uni = page_charset2uni[ch];
+	if (charset2uni && cl) {
+		*uni = charset2uni[cl];
+		if (*uni == 0x0000)
+			return -EINVAL;
+		n = 2;
+	} else{
+		*uni = ch;
+		n = 1;
+	}
+	return n;
+}
+
+static struct nls_table table = {
+	.charset	= "cp950",
+	.alias		= "big5",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_cp950(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp950(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp950)
+module_exit(exit_nls_cp950)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NLS(big5);
diff --git a/kernel/fs/nls/nls_euc-jp.c b/kernel/fs/nls/nls_euc-jp.c
new file mode 100644
index 000000000..162b3f160
--- /dev/null
+++ b/kernel/fs/nls/nls_euc-jp.c
@@ -0,0 +1,580 @@
+/*
+ * linux/fs/nls/nls_euc-jp.c
+ *
+ * Added `OSF/JVC Recommended Code Set Conversion Specification
+ * between Japanese EUC and Shift-JIS' support: <hirofumi@mail.parknet.co.jp>
+ * (http://www.opengroup.or.jp/jvc/cde/sjis-euc-e.html)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static struct nls_table *p_nls;
+
+#define IS_SJIS_LOW_BYTE(l)	((0x40 <= (l)) && ((l) <= 0xFC) && ((l) != 0x7F))
+/* JIS X 0208 (include NEC spesial characters) */
+#define IS_SJIS_JISX0208(h, l)	((((0x81 <= (h)) && ((h) <= 0x9F))	\
+				 || ((0xE0 <= (h)) && ((h) <= 0xEA)))	\
+				 && IS_SJIS_LOW_BYTE(l))
+#define IS_SJIS_JISX0201KANA(c)	((0xA1 <= (c)) && ((c) <= 0xDF))
+#define IS_SJIS_UDC_LOW(h, l)	(((0xF0 <= (h)) && ((h) <= 0xF4))	\
+				 && IS_SJIS_LOW_BYTE(l))
+#define IS_SJIS_UDC_HI(h, l)	(((0xF5 <= (h)) && ((h) <= 0xF9))	\
+				 && IS_SJIS_LOW_BYTE(l))
+#define IS_SJIS_IBM(h, l)	(((0xFA <= (h)) && ((h) <= 0xFC))	\
+				 && IS_SJIS_LOW_BYTE(l))
+#define IS_SJIS_NECIBM(h, l)	(((0xED <= (h)) && ((h) <= 0xEE))	\
+				 && IS_SJIS_LOW_BYTE(l))
+#define MAP_SJIS2EUC(sjis_hi, sjis_lo, sjis_p, euc_hi, euc_lo, euc_p) {		\
+	if ((sjis_lo) >= 0x9F) {						\
+		(euc_hi) = (sjis_hi) * 2 - (((sjis_p) * 2 - (euc_p)) - 1);	\
+		(euc_lo) = (sjis_lo) + 2;					\
+	} else {								\
+		(euc_hi) = (sjis_hi) * 2 - ((sjis_p) * 2 - (euc_p));		\
+		(euc_lo) = (sjis_lo) + ((sjis_lo) >= 0x7F ? 0x60 : 0x61);	\
+	}									\
+} while(0)
+
+#define SS2		(0x8E)		/* Single Shift 2 */
+#define SS3		(0x8F)		/* Single Shift 3 */
+#define IS_EUC_BYTE(c)		((0xA1 <= (c)) && ((c) <= 0xFE))
+#define IS_EUC_JISX0208(h, l)	(IS_EUC_BYTE(h) && IS_EUC_BYTE(l))
+#define IS_EUC_JISX0201KANA(h, l)	(((h) == SS2) && (0xA1 <= (l) && (l) <= 0xDF))
+#define IS_EUC_UDC_LOW(h, l)	(((0xF5 <= (h)) && ((h) <= 0xFE))	\
+				 && IS_EUC_BYTE(l))
+#define IS_EUC_UDC_HI(h, l)	IS_EUC_UDC_LOW(h, l) /* G3 block */
+#define MAP_EUC2SJIS(euc_hi, euc_lo, euc_p, sjis_hi, sjis_lo, sjis_p) {		\
+	if ((euc_hi) & 1) {							\
+		(sjis_hi) = (euc_hi) / 2 + ((sjis_p) - (euc_p) / 2);		\
+		(sjis_lo) = (euc_lo) - ((euc_lo) >= 0xE0 ? 0x60 : 0x61);	\
+	} else {								\
+		(sjis_hi) = (euc_hi) / 2 + (((sjis_p) - (euc_p) / 2) - 1);	\
+		(sjis_lo) = (euc_lo) - 2;					\
+	}									\
+} while(0)
+
+/* SJIS IBM extended characters to EUC map */
+static const unsigned char sjisibm2euc_map[][2] = {
+	{0xF3, 0xF3}, {0xF3, 0xF4}, {0xF3, 0xF5}, {0xF3, 0xF6}, {0xF3, 0xF7},
+	{0xF3, 0xF8}, {0xF3, 0xF9}, {0xF3, 0xFA}, {0xF3, 0xFB}, {0xF3, 0xFC},
+	{0xF3, 0xFD}, {0xF3, 0xFE}, {0xF4, 0xA1}, {0xF4, 0xA2}, {0xF4, 0xA3},
+	{0xF4, 0xA4}, {0xF4, 0xA5}, {0xF4, 0xA6}, {0xF4, 0xA7}, {0xF4, 0xA8},
+	{0xA2, 0xCC}, {0xA2, 0xC3}, {0xF4, 0xA9}, {0xF4, 0xAA}, {0xF4, 0xAB},
+	{0xF4, 0xAC}, {0xF4, 0xAD}, {0xA2, 0xE8}, {0xD4, 0xE3}, {0xDC, 0xDF},
+	{0xE4, 0xE9}, {0xE3, 0xF8}, {0xD9, 0xA1}, {0xB1, 0xBB}, {0xF4, 0xAE},
+	{0xC2, 0xAD}, {0xC3, 0xFC}, {0xE4, 0xD0}, {0xC2, 0xBF}, {0xBC, 0xF4},
+	{0xB0, 0xA9}, {0xB0, 0xC8}, {0xF4, 0xAF}, {0xB0, 0xD2}, {0xB0, 0xD4},
+	{0xB0, 0xE3}, {0xB0, 0xEE}, {0xB1, 0xA7}, {0xB1, 0xA3}, {0xB1, 0xAC},
+	{0xB1, 0xA9}, {0xB1, 0xBE}, {0xB1, 0xDF}, {0xB1, 0xD8}, {0xB1, 0xC8},
+	{0xB1, 0xD7}, {0xB1, 0xE3}, {0xB1, 0xF4}, {0xB1, 0xE1}, {0xB2, 0xA3},
+	{0xF4, 0xB0}, {0xB2, 0xBB}, {0xB2, 0xE6}, {0x00, 0x00}, {0xB2, 0xED},
+	{0xB2, 0xF5}, {0xB2, 0xFC}, {0xF4, 0xB1}, {0xB3, 0xB5}, {0xB3, 0xD8},
+	{0xB3, 0xDB}, {0xB3, 0xE5}, {0xB3, 0xEE}, {0xB3, 0xFB}, {0xF4, 0xB2},
+	{0xF4, 0xB3}, {0xB4, 0xC0}, {0xB4, 0xC7}, {0xB4, 0xD0}, {0xB4, 0xDE},
+	{0xF4, 0xB4}, {0xB5, 0xAA}, {0xF4, 0xB5}, {0xB5, 0xAF}, {0xB5, 0xC4},
+	{0xB5, 0xE8}, {0xF4, 0xB6}, {0xB7, 0xC2}, {0xB7, 0xE4}, {0xB7, 0xE8},
+	{0xB7, 0xE7}, {0xF4, 0xB7}, {0xF4, 0xB8}, {0xF4, 0xB9}, {0xB8, 0xCE},
+	{0xB8, 0xE1}, {0xB8, 0xF5}, {0xB8, 0xF7}, {0xB8, 0xF8}, {0xB8, 0xFC},
+	{0xB9, 0xAF}, {0xB9, 0xB7}, {0xBA, 0xBE}, {0xBA, 0xDB}, {0xCD, 0xAA},
+	{0xBA, 0xE1}, {0xF4, 0xBA}, {0xBA, 0xEB}, {0xBB, 0xB3}, {0xBB, 0xB8},
+	{0xF4, 0xBB}, {0xBB, 0xCA}, {0xF4, 0xBC}, {0xF4, 0xBD}, {0xBB, 0xD0},
+	{0xBB, 0xDE}, {0xBB, 0xF4}, {0xBB, 0xF5}, {0xBB, 0xF9}, {0xBC, 0xE4},
+	{0xBC, 0xED}, {0xBC, 0xFE}, {0xF4, 0xBE}, {0xBD, 0xC2}, {0xBD, 0xE7},
+	{0xF4, 0xBF}, {0xBD, 0xF0}, {0xBE, 0xB0}, {0xBE, 0xAC}, {0xF4, 0xC0},
+	{0xBE, 0xB3}, {0xBE, 0xBD}, {0xBE, 0xCD}, {0xBE, 0xC9}, {0xBE, 0xE4},
+	{0xBF, 0xA8}, {0xBF, 0xC9}, {0xC0, 0xC4}, {0xC0, 0xE4}, {0xC0, 0xF4},
+	{0xC1, 0xA6}, {0xF4, 0xC1}, {0xC1, 0xF5}, {0xC1, 0xFC}, {0xF4, 0xC2},
+	{0xC1, 0xF8}, {0xC2, 0xAB}, {0xC2, 0xA1}, {0xC2, 0xA5}, {0xF4, 0xC3},
+	{0xC2, 0xB8}, {0xC2, 0xBA}, {0xF4, 0xC4}, {0xC2, 0xC4}, {0xC2, 0xD2},
+	{0xC2, 0xD7}, {0xC2, 0xDB}, {0xC2, 0xDE}, {0xC2, 0xED}, {0xC2, 0xF0},
+	{0xF4, 0xC5}, {0xC3, 0xA1}, {0xC3, 0xB5}, {0xC3, 0xC9}, {0xC3, 0xB9},
+	{0xF4, 0xC6}, {0xC3, 0xD8}, {0xC3, 0xFE}, {0xF4, 0xC7}, {0xC4, 0xCC},
+	{0xF4, 0xC8}, {0xC4, 0xD9}, {0xC4, 0xEA}, {0xC4, 0xFD}, {0xF4, 0xC9},
+	{0xC5, 0xA7}, {0xC5, 0xB5}, {0xC5, 0xB6}, {0xF4, 0xCA}, {0xC5, 0xD5},
+	{0xC6, 0xB8}, {0xC6, 0xD7}, {0xC6, 0xE0}, {0xC6, 0xEA}, {0xC6, 0xE3},
+	{0xC7, 0xA1}, {0xC7, 0xAB}, {0xC7, 0xC7}, {0xC7, 0xC3}, {0xC7, 0xCB},
+	{0xC7, 0xCF}, {0xC7, 0xD9}, {0xF4, 0xCB}, {0xF4, 0xCC}, {0xC7, 0xE6},
+	{0xC7, 0xEE}, {0xC7, 0xFC}, {0xC7, 0xEB}, {0xC7, 0xF0}, {0xC8, 0xB1},
+	{0xC8, 0xE5}, {0xC8, 0xF8}, {0xC9, 0xA6}, {0xC9, 0xAB}, {0xC9, 0xAD},
+	{0xF4, 0xCD}, {0xC9, 0xCA}, {0xC9, 0xD3}, {0xC9, 0xE9}, {0xC9, 0xE3},
+	{0xC9, 0xFC}, {0xC9, 0xF4}, {0xC9, 0xF5}, {0xF4, 0xCE}, {0xCA, 0xB3},
+	{0xCA, 0xBD}, {0xCA, 0xEF}, {0xCA, 0xF1}, {0xCB, 0xAE}, {0xF4, 0xCF},
+	{0xCB, 0xCA}, {0xCB, 0xE6}, {0xCB, 0xEA}, {0xCB, 0xF0}, {0xCB, 0xF4},
+	{0xCB, 0xEE}, {0xCC, 0xA5}, {0xCB, 0xF9}, {0xCC, 0xAB}, {0xCC, 0xAE},
+	{0xCC, 0xAD}, {0xCC, 0xB2}, {0xCC, 0xC2}, {0xCC, 0xD0}, {0xCC, 0xD9},
+	{0xF4, 0xD0}, {0xCD, 0xBB}, {0xF4, 0xD1}, {0xCE, 0xBB}, {0xF4, 0xD2},
+	{0xCE, 0xBA}, {0xCE, 0xC3}, {0xF4, 0xD3}, {0xCE, 0xF2}, {0xB3, 0xDD},
+	{0xCF, 0xD5}, {0xCF, 0xE2}, {0xCF, 0xE9}, {0xCF, 0xED}, {0xF4, 0xD4},
+	{0xF4, 0xD5}, {0xF4, 0xD6}, {0x00, 0x00}, {0xF4, 0xD7}, {0xD0, 0xE5},
+	{0xF4, 0xD8}, {0xD0, 0xE9}, {0xD1, 0xE8}, {0xF4, 0xD9}, {0xF4, 0xDA},
+	{0xD1, 0xEC}, {0xD2, 0xBB}, {0xF4, 0xDB}, {0xD3, 0xE1}, {0xD3, 0xE8},
+	{0xD4, 0xA7}, {0xF4, 0xDC}, {0xF4, 0xDD}, {0xD4, 0xD4}, {0xD4, 0xF2},
+	{0xD5, 0xAE}, {0xF4, 0xDE}, {0xD7, 0xDE}, {0xF4, 0xDF}, {0xD8, 0xA2},
+	{0xD8, 0xB7}, {0xD8, 0xC1}, {0xD8, 0xD1}, {0xD8, 0xF4}, {0xD9, 0xC6},
+	{0xD9, 0xC8}, {0xD9, 0xD1}, {0xF4, 0xE0}, {0xF4, 0xE1}, {0xF4, 0xE2},
+	{0xF4, 0xE3}, {0xF4, 0xE4}, {0xDC, 0xD3}, {0xDD, 0xC8}, {0xDD, 0xD4},
+	{0xDD, 0xEA}, {0xDD, 0xFA}, {0xDE, 0xA4}, {0xDE, 0xB0}, {0xF4, 0xE5},
+	{0xDE, 0xB5}, {0xDE, 0xCB}, {0xF4, 0xE6}, {0xDF, 0xB9}, {0xF4, 0xE7},
+	{0xDF, 0xC3}, {0xF4, 0xE8}, {0xF4, 0xE9}, {0xE0, 0xD9}, {0xF4, 0xEA},
+	{0xF4, 0xEB}, {0xE1, 0xE2}, {0xF4, 0xEC}, {0xF4, 0xED}, {0xF4, 0xEE},
+	{0xE2, 0xC7}, {0xE3, 0xA8}, {0xE3, 0xA6}, {0xE3, 0xA9}, {0xE3, 0xAF},
+	{0xE3, 0xB0}, {0xE3, 0xAA}, {0xE3, 0xAB}, {0xE3, 0xBC}, {0xE3, 0xC1},
+	{0xE3, 0xBF}, {0xE3, 0xD5}, {0xE3, 0xD8}, {0xE3, 0xD6}, {0xE3, 0xDF},
+	{0xE3, 0xE3}, {0xE3, 0xE1}, {0xE3, 0xD4}, {0xE3, 0xE9}, {0xE4, 0xA6},
+	{0xE3, 0xF1}, {0xE3, 0xF2}, {0xE4, 0xCB}, {0xE4, 0xC1}, {0xE4, 0xC3},
+	{0xE4, 0xBE}, {0xF4, 0xEF}, {0xE4, 0xC0}, {0xE4, 0xC7}, {0xE4, 0xBF},
+	{0xE4, 0xE0}, {0xE4, 0xDE}, {0xE4, 0xD1}, {0xF4, 0xF0}, {0xE4, 0xDC},
+	{0xE4, 0xD2}, {0xE4, 0xDB}, {0xE4, 0xD4}, {0xE4, 0xFA}, {0xE4, 0xEF},
+	{0xE5, 0xB3}, {0xE5, 0xBF}, {0xE5, 0xC9}, {0xE5, 0xD0}, {0xE5, 0xE2},
+	{0xE5, 0xEA}, {0xE5, 0xEB}, {0xF4, 0xF1}, {0xF4, 0xF2}, {0xF4, 0xF3},
+	{0xE6, 0xE8}, {0xE6, 0xEF}, {0xE7, 0xAC}, {0xF4, 0xF4}, {0xE7, 0xAE},
+	{0xF4, 0xF5}, {0xE7, 0xB1}, {0xF4, 0xF6}, {0xE7, 0xB2}, {0xE8, 0xB1},
+	{0xE8, 0xB6}, {0xF4, 0xF7}, {0xF4, 0xF8}, {0xE8, 0xDD}, {0xF4, 0xF9},
+	{0xF4, 0xFA}, {0xE9, 0xD1}, {0xF4, 0xFB}, {0xE9, 0xED}, {0xEA, 0xCD},
+	{0xF4, 0xFC}, {0xEA, 0xDB}, {0xEA, 0xE6}, {0xEA, 0xEA}, {0xEB, 0xA5},
+	{0xEB, 0xFB}, {0xEB, 0xFA}, {0xF4, 0xFD}, {0xEC, 0xD6}, {0xF4, 0xFE},
+};
+
+#define IS_EUC_IBM2JISX0208(h, l) \
+		(((h) == 0xA2 && (l) == 0xCC) || ((h) == 0xA2 && (l) == 0xE8))
+
+/* EUC to SJIS IBM extended characters map (G3 JIS X 0212 block) */
+static struct {
+	unsigned short euc;
+	unsigned char sjis[2];
+} euc2sjisibm_jisx0212_map[] = {
+	{0xA2C3, {0xFA, 0x55}}, {0xB0A9, {0xFA, 0x68}}, {0xB0C8, {0xFA, 0x69}},
+	{0xB0D2, {0xFA, 0x6B}}, {0xB0D4, {0xFA, 0x6C}}, {0xB0E3, {0xFA, 0x6D}},
+	{0xB0EE, {0xFA, 0x6E}}, {0xB1A3, {0xFA, 0x70}}, {0xB1A7, {0xFA, 0x6F}},
+	{0xB1A9, {0xFA, 0x72}}, {0xB1AC, {0xFA, 0x71}}, {0xB1BB, {0xFA, 0x61}},
+	{0xB1BE, {0xFA, 0x73}}, {0xB1C8, {0xFA, 0x76}}, {0xB1D7, {0xFA, 0x77}},
+	{0xB1D8, {0xFA, 0x75}}, {0xB1DF, {0xFA, 0x74}}, {0xB1E1, {0xFA, 0x7A}},
+	{0xB1E3, {0xFA, 0x78}}, {0xB1F4, {0xFA, 0x79}}, {0xB2A3, {0xFA, 0x7B}},
+	{0xB2BB, {0xFA, 0x7D}}, {0xB2E6, {0xFA, 0x7E}}, {0xB2ED, {0xFA, 0x80}},
+	{0xB2F5, {0xFA, 0x81}}, {0xB2FC, {0xFA, 0x82}}, {0xB3B5, {0xFA, 0x84}},
+	{0xB3D8, {0xFA, 0x85}}, {0xB3DB, {0xFA, 0x86}}, {0xB3DD, {0xFB, 0x77}},
+	{0xB3E5, {0xFA, 0x87}}, {0xB3EE, {0xFA, 0x88}}, {0xB3FB, {0xFA, 0x89}},
+	{0xB4C0, {0xFA, 0x8C}}, {0xB4C7, {0xFA, 0x8D}}, {0xB4D0, {0xFA, 0x8E}},
+	{0xB4DE, {0xFA, 0x8F}}, {0xB5AA, {0xFA, 0x91}}, {0xB5AF, {0xFA, 0x93}},
+	{0xB5C4, {0xFA, 0x94}}, {0xB5E8, {0xFA, 0x95}}, {0xB7C2, {0xFA, 0x97}},
+	{0xB7E4, {0xFA, 0x98}}, {0xB7E7, {0xFA, 0x9A}}, {0xB7E8, {0xFA, 0x99}},
+	{0xB8CE, {0xFA, 0x9E}}, {0xB8E1, {0xFA, 0x9F}}, {0xB8F5, {0xFA, 0xA0}},
+	{0xB8F7, {0xFA, 0xA1}}, {0xB8F8, {0xFA, 0xA2}}, {0xB8FC, {0xFA, 0xA3}},
+	{0xB9AF, {0xFA, 0xA4}}, {0xB9B7, {0xFA, 0xA5}}, {0xBABE, {0xFA, 0xA6}},
+	{0xBADB, {0xFA, 0xA7}}, {0xBAE1, {0xFA, 0xA9}}, {0xBAEB, {0xFA, 0xAB}},
+	{0xBBB3, {0xFA, 0xAC}}, {0xBBB8, {0xFA, 0xAD}}, {0xBBCA, {0xFA, 0xAF}},
+	{0xBBD0, {0xFA, 0xB2}}, {0xBBDE, {0xFA, 0xB3}}, {0xBBF4, {0xFA, 0xB4}},
+	{0xBBF5, {0xFA, 0xB5}}, {0xBBF9, {0xFA, 0xB6}}, {0xBCE4, {0xFA, 0xB7}},
+	{0xBCED, {0xFA, 0xB8}}, {0xBCF4, {0xFA, 0x67}}, {0xBCFE, {0xFA, 0xB9}},
+	{0xBDC2, {0xFA, 0xBB}}, {0xBDE7, {0xFA, 0xBC}}, {0xBDF0, {0xFA, 0xBE}},
+	{0xBEAC, {0xFA, 0xC0}}, {0xBEB0, {0xFA, 0xBF}}, {0xBEB3, {0xFA, 0xC2}},
+	{0xBEBD, {0xFA, 0xC3}}, {0xBEC9, {0xFA, 0xC5}}, {0xBECD, {0xFA, 0xC4}},
+	{0xBEE4, {0xFA, 0xC6}}, {0xBFA8, {0xFA, 0xC7}}, {0xBFC9, {0xFA, 0xC8}},
+	{0xC0C4, {0xFA, 0xC9}}, {0xC0E4, {0xFA, 0xCA}}, {0xC0F4, {0xFA, 0xCB}},
+	{0xC1A6, {0xFA, 0xCC}}, {0xC1F5, {0xFA, 0xCE}}, {0xC1F8, {0xFA, 0xD1}},
+	{0xC1FC, {0xFA, 0xCF}}, {0xC2A1, {0xFA, 0xD3}}, {0xC2A5, {0xFA, 0xD4}},
+	{0xC2AB, {0xFA, 0xD2}}, {0xC2AD, {0xFA, 0x63}}, {0xC2B8, {0xFA, 0xD6}},
+	{0xC2BA, {0xFA, 0xD7}}, {0xC2BF, {0xFA, 0x66}}, {0xC2C4, {0xFA, 0xD9}},
+	{0xC2D2, {0xFA, 0xDA}}, {0xC2D7, {0xFA, 0xDB}}, {0xC2DB, {0xFA, 0xDC}},
+	{0xC2DE, {0xFA, 0xDD}}, {0xC2ED, {0xFA, 0xDE}}, {0xC2F0, {0xFA, 0xDF}},
+	{0xC3A1, {0xFA, 0xE1}}, {0xC3B5, {0xFA, 0xE2}}, {0xC3B9, {0xFA, 0xE4}},
+	{0xC3C9, {0xFA, 0xE3}}, {0xC3D8, {0xFA, 0xE6}}, {0xC3FC, {0xFA, 0x64}},
+	{0xC3FE, {0xFA, 0xE7}}, {0xC4CC, {0xFA, 0xE9}}, {0xC4D9, {0xFA, 0xEB}},
+	{0xC4EA, {0xFA, 0xEC}}, {0xC4FD, {0xFA, 0xED}}, {0xC5A7, {0xFA, 0xEF}},
+	{0xC5B5, {0xFA, 0xF0}}, {0xC5B6, {0xFA, 0xF1}}, {0xC5D5, {0xFA, 0xF3}},
+	{0xC6B8, {0xFA, 0xF4}}, {0xC6D7, {0xFA, 0xF5}}, {0xC6E0, {0xFA, 0xF6}},
+	{0xC6E3, {0xFA, 0xF8}}, {0xC6EA, {0xFA, 0xF7}}, {0xC7A1, {0xFA, 0xF9}},
+	{0xC7AB, {0xFA, 0xFA}}, {0xC7C3, {0xFA, 0xFC}}, {0xC7C7, {0xFA, 0xFB}},
+	{0xC7CB, {0xFB, 0x40}}, {0xC7CF, {0xFB, 0x41}}, {0xC7D9, {0xFB, 0x42}},
+	{0xC7E6, {0xFB, 0x45}}, {0xC7EB, {0xFB, 0x48}}, {0xC7EE, {0xFB, 0x46}},
+	{0xC7F0, {0xFB, 0x49}}, {0xC7FC, {0xFB, 0x47}}, {0xC8B1, {0xFB, 0x4A}},
+	{0xC8E5, {0xFB, 0x4B}}, {0xC8F8, {0xFB, 0x4C}}, {0xC9A6, {0xFB, 0x4D}},
+	{0xC9AB, {0xFB, 0x4E}}, {0xC9AD, {0xFB, 0x4F}}, {0xC9CA, {0xFB, 0x51}},
+	{0xC9D3, {0xFB, 0x52}}, {0xC9E3, {0xFB, 0x54}}, {0xC9E9, {0xFB, 0x53}},
+	{0xC9F4, {0xFB, 0x56}}, {0xC9F5, {0xFB, 0x57}}, {0xC9FC, {0xFB, 0x55}},
+	{0xCAB3, {0xFB, 0x59}}, {0xCABD, {0xFB, 0x5A}}, {0xCAEF, {0xFB, 0x5B}},
+	{0xCAF1, {0xFB, 0x5C}}, {0xCBAE, {0xFB, 0x5D}}, {0xCBCA, {0xFB, 0x5F}},
+	{0xCBE6, {0xFB, 0x60}}, {0xCBEA, {0xFB, 0x61}}, {0xCBEE, {0xFB, 0x64}},
+	{0xCBF0, {0xFB, 0x62}}, {0xCBF4, {0xFB, 0x63}}, {0xCBF9, {0xFB, 0x66}},
+	{0xCCA5, {0xFB, 0x65}}, {0xCCAB, {0xFB, 0x67}}, {0xCCAD, {0xFB, 0x69}},
+	{0xCCAE, {0xFB, 0x68}}, {0xCCB2, {0xFB, 0x6A}}, {0xCCC2, {0xFB, 0x6B}},
+	{0xCCD0, {0xFB, 0x6C}}, {0xCCD9, {0xFB, 0x6D}}, {0xCDAA, {0xFA, 0xA8}},
+	{0xCDBB, {0xFB, 0x6F}}, {0xCEBA, {0xFB, 0x73}}, {0xCEBB, {0xFB, 0x71}},
+	{0xCEC3, {0xFB, 0x74}}, {0xCEF2, {0xFB, 0x76}}, {0xCFD5, {0xFB, 0x78}},
+	{0xCFE2, {0xFB, 0x79}}, {0xCFE9, {0xFB, 0x7A}}, {0xCFED, {0xFB, 0x7B}},
+	{0xD0E5, {0xFB, 0x81}}, {0xD0E9, {0xFB, 0x83}}, {0xD1E8, {0xFB, 0x84}},
+	{0xD1EC, {0xFB, 0x87}}, {0xD2BB, {0xFB, 0x88}}, {0xD3E1, {0xFB, 0x8A}},
+	{0xD3E8, {0xFB, 0x8B}}, {0xD4A7, {0xFB, 0x8C}}, {0xD4D4, {0xFB, 0x8F}},
+	{0xD4E3, {0xFA, 0x5C}}, {0xD4F2, {0xFB, 0x90}}, {0xD5AE, {0xFB, 0x91}},
+	{0xD7DE, {0xFB, 0x93}}, {0xD8A2, {0xFB, 0x95}}, {0xD8B7, {0xFB, 0x96}},
+	{0xD8C1, {0xFB, 0x97}}, {0xD8D1, {0xFB, 0x98}}, {0xD8F4, {0xFB, 0x99}},
+	{0xD9A1, {0xFA, 0x60}}, {0xD9C6, {0xFB, 0x9A}}, {0xD9C8, {0xFB, 0x9B}},
+	{0xD9D1, {0xFB, 0x9C}}, {0xDCD3, {0xFB, 0xA2}}, {0xDCDF, {0xFA, 0x5D}},
+	{0xDDC8, {0xFB, 0xA3}}, {0xDDD4, {0xFB, 0xA4}}, {0xDDEA, {0xFB, 0xA5}},
+	{0xDDFA, {0xFB, 0xA6}}, {0xDEA4, {0xFB, 0xA7}}, {0xDEB0, {0xFB, 0xA8}},
+	{0xDEB5, {0xFB, 0xAA}}, {0xDECB, {0xFB, 0xAB}}, {0xDFB9, {0xFB, 0xAD}},
+	{0xDFC3, {0xFB, 0xAF}}, {0xE0D9, {0xFB, 0xB2}}, {0xE1E2, {0xFB, 0xB5}},
+	{0xE2C7, {0xFB, 0xB9}}, {0xE3A6, {0xFB, 0xBB}}, {0xE3A8, {0xFB, 0xBA}},
+	{0xE3A9, {0xFB, 0xBC}}, {0xE3AA, {0xFB, 0xBF}}, {0xE3AB, {0xFB, 0xC0}},
+	{0xE3AF, {0xFB, 0xBD}}, {0xE3B0, {0xFB, 0xBE}}, {0xE3BC, {0xFB, 0xC1}},
+	{0xE3BF, {0xFB, 0xC3}}, {0xE3C1, {0xFB, 0xC2}}, {0xE3D4, {0xFB, 0xCA}},
+	{0xE3D5, {0xFB, 0xC4}}, {0xE3D6, {0xFB, 0xC6}}, {0xE3D8, {0xFB, 0xC5}},
+	{0xE3DF, {0xFB, 0xC7}}, {0xE3E1, {0xFB, 0xC9}}, {0xE3E3, {0xFB, 0xC8}},
+	{0xE3E9, {0xFB, 0xCB}}, {0xE3F1, {0xFB, 0xCD}}, {0xE3F2, {0xFB, 0xCE}},
+	{0xE3F8, {0xFA, 0x5F}}, {0xE4A6, {0xFB, 0xCC}}, {0xE4BE, {0xFB, 0xD2}},
+	{0xE4BF, {0xFB, 0xD6}}, {0xE4C0, {0xFB, 0xD4}}, {0xE4C1, {0xFB, 0xD0}},
+	{0xE4C3, {0xFB, 0xD1}}, {0xE4C7, {0xFB, 0xD5}}, {0xE4CB, {0xFB, 0xCF}},
+	{0xE4D0, {0xFA, 0x65}}, {0xE4D1, {0xFB, 0xD9}}, {0xE4D2, {0xFB, 0xDC}},
+	{0xE4D4, {0xFB, 0xDE}}, {0xE4DB, {0xFB, 0xDD}}, {0xE4DC, {0xFB, 0xDB}},
+	{0xE4DE, {0xFB, 0xD8}}, {0xE4E0, {0xFB, 0xD7}}, {0xE4E9, {0xFA, 0x5E}},
+	{0xE4EF, {0xFB, 0xE0}}, {0xE4FA, {0xFB, 0xDF}}, {0xE5B3, {0xFB, 0xE1}},
+	{0xE5BF, {0xFB, 0xE2}}, {0xE5C9, {0xFB, 0xE3}}, {0xE5D0, {0xFB, 0xE4}},
+	{0xE5E2, {0xFB, 0xE5}}, {0xE5EA, {0xFB, 0xE6}}, {0xE5EB, {0xFB, 0xE7}},
+	{0xE6E8, {0xFB, 0xEB}}, {0xE6EF, {0xFB, 0xEC}}, {0xE7AC, {0xFB, 0xED}},
+	{0xE7AE, {0xFB, 0xEF}}, {0xE7B1, {0xFB, 0xF1}}, {0xE7B2, {0xFB, 0xF3}},
+	{0xE8B1, {0xFB, 0xF4}}, {0xE8B6, {0xFB, 0xF5}}, {0xE8DD, {0xFB, 0xF8}},
+	{0xE9D1, {0xFB, 0xFB}}, {0xE9ED, {0xFC, 0x40}}, {0xEACD, {0xFC, 0x41}},
+	{0xEADB, {0xFC, 0x43}}, {0xEAE6, {0xFC, 0x44}}, {0xEAEA, {0xFC, 0x45}},
+	{0xEBA5, {0xFC, 0x46}}, {0xEBFA, {0xFC, 0x48}}, {0xEBFB, {0xFC, 0x47}},
+	{0xECD6, {0xFC, 0x4A}},
+};
+
+/* EUC to SJIS IBM extended characters map (G3 Upper block) */
+static const unsigned char euc2sjisibm_g3upper_map[][2] = {
+	{0xFA, 0x40}, {0xFA, 0x41}, {0xFA, 0x42}, {0xFA, 0x43}, {0xFA, 0x44},
+	{0xFA, 0x45}, {0xFA, 0x46}, {0xFA, 0x47}, {0xFA, 0x48}, {0xFA, 0x49},
+	{0xFA, 0x4A}, {0xFA, 0x4B}, {0xFA, 0x4C}, {0xFA, 0x4D}, {0xFA, 0x4E},
+	{0xFA, 0x4F}, {0xFA, 0x50}, {0xFA, 0x51}, {0xFA, 0x52}, {0xFA, 0x53},
+	{0xFA, 0x56}, {0xFA, 0x57}, {0xFA, 0x58}, {0xFA, 0x59}, {0xFA, 0x5A},
+	{0xFA, 0x62}, {0xFA, 0x6A}, {0xFA, 0x7C}, {0xFA, 0x83}, {0xFA, 0x8A},
+	{0xFA, 0x8B}, {0xFA, 0x90}, {0xFA, 0x92}, {0xFA, 0x96}, {0xFA, 0x9B},
+	{0xFA, 0x9C}, {0xFA, 0x9D}, {0xFA, 0xAA}, {0xFA, 0xAE}, {0xFA, 0xB0},
+	{0xFA, 0xB1}, {0xFA, 0xBA}, {0xFA, 0xBD}, {0xFA, 0xC1}, {0xFA, 0xCD},
+	{0xFA, 0xD0}, {0xFA, 0xD5}, {0xFA, 0xD8}, {0xFA, 0xE0}, {0xFA, 0xE5},
+	{0xFA, 0xE8}, {0xFA, 0xEA}, {0xFA, 0xEE}, {0xFA, 0xF2}, {0xFB, 0x43},
+	{0xFB, 0x44}, {0xFB, 0x50}, {0xFB, 0x58}, {0xFB, 0x5E}, {0xFB, 0x6E},
+	{0xFB, 0x70}, {0xFB, 0x72}, {0xFB, 0x75}, {0xFB, 0x7C}, {0xFB, 0x7D},
+	{0xFB, 0x7E}, {0xFB, 0x80}, {0xFB, 0x82}, {0xFB, 0x85}, {0xFB, 0x86},
+	{0xFB, 0x89}, {0xFB, 0x8D}, {0xFB, 0x8E}, {0xFB, 0x92}, {0xFB, 0x94},
+	{0xFB, 0x9D}, {0xFB, 0x9E}, {0xFB, 0x9F}, {0xFB, 0xA0}, {0xFB, 0xA1},
+	{0xFB, 0xA9}, {0xFB, 0xAC}, {0xFB, 0xAE}, {0xFB, 0xB0}, {0xFB, 0xB1},
+	{0xFB, 0xB3}, {0xFB, 0xB4}, {0xFB, 0xB6}, {0xFB, 0xB7}, {0xFB, 0xB8},
+	{0xFB, 0xD3}, {0xFB, 0xDA}, {0xFB, 0xE8}, {0xFB, 0xE9}, {0xFB, 0xEA},
+	{0xFB, 0xEE}, {0xFB, 0xF0}, {0xFB, 0xF2}, {0xFB, 0xF6}, {0xFB, 0xF7},
+	{0xFB, 0xF9}, {0xFB, 0xFA}, {0xFB, 0xFC}, {0xFC, 0x42}, {0xFC, 0x49},
+	{0xFC, 0x4B},
+};
+
+static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi,
+			      const unsigned char sjis_lo);
+static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi,
+				       const unsigned char euc_lo);
+static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi,
+				      const unsigned char euc_lo);
+static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi,
+			      const unsigned char euc_lo);
+static inline int sjisnec2sjisibm(unsigned char *sjisibm,
+				  const unsigned char sjisnec_hi,
+				  const unsigned char sjisnec_lo);
+
+/* SJIS IBM extended characters to EUC */
+static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi,
+			      const unsigned char sjis_lo)
+{
+	int index;
+
+	index = ((sjis_hi - 0xFA) * (0xFD - 0x40)) + (sjis_lo - 0x40);
+	if (IS_EUC_IBM2JISX0208(sjisibm2euc_map[index][0],
+				sjisibm2euc_map[index][1])) {
+		euc[0] = sjisibm2euc_map[index][0];
+		euc[1] = sjisibm2euc_map[index][1];
+		return 2;
+	} else {
+		euc[0] = SS3;
+		euc[1] = sjisibm2euc_map[index][0];
+		euc[2] = sjisibm2euc_map[index][1];
+		return 3;
+	}
+}
+
+/* EUC to SJIS IBM extended characters (G3 JIS X 0212 block) */
+static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi,
+				       const unsigned char euc_lo)
+{
+	int index, min_index, max_index;
+	unsigned short euc;
+
+	min_index = 0;
+	max_index = ARRAY_SIZE(euc2sjisibm_jisx0212_map) - 1;
+	euc = (euc_hi << 8) | euc_lo;
+
+	while (min_index <= max_index) {
+		index = (min_index + max_index) / 2;
+		if (euc < euc2sjisibm_jisx0212_map[index].euc)
+			max_index = index - 1;
+		else
+			min_index = index + 1;
+		if (euc == euc2sjisibm_jisx0212_map[index].euc) {
+			sjis[0] = euc2sjisibm_jisx0212_map[index].sjis[0];
+			sjis[1] = euc2sjisibm_jisx0212_map[index].sjis[1];
+			return 3;
+		}
+	}
+	return 0;
+}
+
+/* EUC to SJIS IBM extended characters (G3 Upper block) */
+static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi,
+				      const unsigned char euc_lo)
+{
+	int index;
+
+	if (euc_hi == 0xF3)
+		index = ((euc_hi << 8) | euc_lo) - 0xF3F3;
+	else
+		index = ((euc_hi << 8) | euc_lo) - 0xF4A1 + 12;
+
+	if ((index < 0) || (index >= ARRAY_SIZE(euc2sjisibm_g3upper_map)))
+		return 0;
+
+	sjis[0] = euc2sjisibm_g3upper_map[index][0];
+	sjis[1] = euc2sjisibm_g3upper_map[index][1];
+
+	return 3;
+}
+
+/* EUC to SJIS IBM extended characters (G3 block) */
+static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi,
+			      const unsigned char euc_lo)
+{
+	int n;
+
+#if 0
+	if ((euc_hi == 0xA2) && (euc_lo == 0xCC)) {
+		sjis[0] = 0xFA;
+		sjis[1] = 0x54;
+		return 2;
+	} else if ((euc_hi == 0xA2) && (euc_lo == 0xE8)) {
+		sjis[0] = 0xFA;
+		sjis[1] = 0x5B;
+		return 2;
+	}
+#endif
+	if ((n = euc2sjisibm_g3upper(sjis, euc_hi, euc_lo))) {
+		return n;
+	} else if ((n = euc2sjisibm_jisx0212(sjis, euc_hi, euc_lo))) {
+		return n;
+	}
+
+	return 0;
+}
+
+/* NEC/IBM extended characters to IBM extended characters */
+static inline int sjisnec2sjisibm(unsigned char *sjisibm,
+				  const unsigned char sjisnec_hi,
+				  const unsigned char sjisnec_lo)
+{
+	int count;
+
+	if (! IS_SJIS_NECIBM(sjisnec_hi, sjisnec_lo))
+		return 0;
+
+	if ((sjisnec_hi == 0xEE) && (sjisnec_lo == 0xF9)) {
+		sjisibm[0] = 0x81;
+		sjisibm[1] = 0xCA;
+		return 2;
+	}
+
+	if ((sjisnec_hi == 0xEE) && (sjisnec_lo >= 0xEF)) {
+		count = (sjisnec_hi << 8 | sjisnec_lo)
+			- (sjisnec_lo <= 0xF9 ? 0xEEEF : (0xEEEF - 10));
+	} else {
+		count = (sjisnec_hi - 0xED) * (0xFC - 0x40)
+			+ (sjisnec_lo - 0x40) + (0x5C - 0x40);
+		if (sjisnec_lo >= 0x7F)
+			count--;
+	}
+
+	sjisibm[0] = 0xFA + (count / (0xFC - 0x40));
+	sjisibm[1] = 0x40 + (count % (0xFC - 0x40));
+	if (sjisibm[1] >= 0x7F)
+		sjisibm[1]++;
+
+	return 2;
+}
+
+static int uni2char(const wchar_t uni,
+		    unsigned char *out, int boundlen)
+{
+	int n;
+
+	if (!p_nls)
+		return -EINVAL;
+	if ((n = p_nls->uni2char(uni, out, boundlen)) < 0)
+		return n;
+
+	/* translate SJIS into EUC-JP */
+	if (n == 1) {
+		if (IS_SJIS_JISX0201KANA(out[0])) {
+			/* JIS X 0201 KANA */
+			if (boundlen < 2)
+				return -ENAMETOOLONG;
+
+			out[1] = out[0];
+			out[0] = SS2;
+			return 2;
+		}
+	} else if (n == 2) {
+		/* NEC/IBM extended characters to IBM extended characters */
+		sjisnec2sjisibm(out, out[0], out[1]);
+
+		if (IS_SJIS_UDC_LOW(out[0], out[1])) {
+			/* User defined characters half low */
+			MAP_SJIS2EUC(out[0], out[1], 0xF0, out[0], out[1], 0xF5);
+		} else if (IS_SJIS_UDC_HI(out[0], out[1])) {
+			/* User defined characters half high */
+			unsigned char ch, cl;
+
+			if (boundlen < 3)
+				return -ENAMETOOLONG;
+
+			n = 3; ch = out[0]; cl = out[1];
+			out[0] = SS3;
+			MAP_SJIS2EUC(ch, cl, 0xF5, out[1], out[2], 0xF5);
+		} else if (IS_SJIS_IBM(out[0], out[1])) {
+			/* IBM extended characters */
+			unsigned char euc[3], i;
+
+			n = sjisibm2euc(euc, out[0], out[1]);
+			if (boundlen < n)
+				return -ENAMETOOLONG;
+			for (i = 0; i < n; i++)
+				out[i] = euc[i];
+		} else if (IS_SJIS_JISX0208(out[0], out[1])) {
+			/* JIS X 0208 (include NEC special characters) */
+			out[0] = (out[0]^0xA0)*2 + 0x5F;
+			if (out[1] > 0x9E)
+				out[0]++;
+
+			if (out[1] < 0x7F)
+				out[1] = out[1] + 0x61;
+			else if (out[1] < 0x9F)
+				out[1] = out[1] + 0x60;
+			else
+				out[1] = out[1] + 0x02;
+		} else {
+			/* Invalid characters */
+			return -EINVAL;
+		}
+	}
+	else
+		return -EINVAL;
+
+	return n;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen,
+		    wchar_t *uni)
+{
+	unsigned char sjis_temp[2];
+	int euc_offset, n;
+
+	if ( !p_nls )
+		return -EINVAL;
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	/* translate EUC-JP into SJIS */
+	if (rawstring[0] > 0x7F) {
+		if (rawstring[0] == SS3) {
+			if (boundlen < 3)
+				return -EINVAL;
+			euc_offset = 3;
+
+			if (IS_EUC_UDC_HI(rawstring[1], rawstring[2])) {
+				/* User defined characters half high */
+				MAP_EUC2SJIS(rawstring[1], rawstring[2], 0xF5,
+					     sjis_temp[0], sjis_temp[1], 0xF5);
+			} else if (euc2sjisibm(sjis_temp,rawstring[1],rawstring[2])) {
+				/* IBM extended characters */
+			} else {
+				/* JIS X 0212 and Invalid characters*/
+				return -EINVAL;
+
+				/* 'GETA' with SJIS coding */
+				/* sjis_temp[0] = 0x81; */
+				/* sjis_temp[1] = 0xAC; */
+			}
+		} else {
+			if (boundlen < 2)
+				return -EINVAL;
+			euc_offset = 2;
+
+			if (IS_EUC_JISX0201KANA(rawstring[0], rawstring[1])) {
+				/* JIS X 0201 KANA */
+				sjis_temp[0] = rawstring[1];
+				sjis_temp[1] = 0x00;
+			} else if (IS_EUC_UDC_LOW(rawstring[0], rawstring[1])) {
+				/* User defined characters half low */
+				MAP_EUC2SJIS(rawstring[0], rawstring[1], 0xF5,
+					     sjis_temp[0], sjis_temp[1], 0xF0);
+			} else if (IS_EUC_JISX0208(rawstring[0], rawstring[1])) {
+				/* JIS X 0208 (include NEC spesial characters) */
+				sjis_temp[0] = ((rawstring[0]-0x5f)/2) ^ 0xA0;
+				if (!(rawstring[0] & 1))
+					sjis_temp[1] = rawstring[1] - 0x02;
+				else if (rawstring[1] < 0xE0)
+					sjis_temp[1] = rawstring[1] - 0x61;
+				else
+					sjis_temp[1] = rawstring[1] - 0x60;
+			} else {
+				/* Invalid characters */
+				return -EINVAL;
+			}
+		}
+	} else {
+		euc_offset = 1;
+
+		/* JIS X 0201 ROMAJI */
+		sjis_temp[0] = rawstring[0];
+		sjis_temp[1] = 0x00;
+	}
+
+	if ( (n = p_nls->char2uni(sjis_temp, sizeof(sjis_temp), uni)) < 0)
+		return n;
+
+	return euc_offset;
+}
+
+static struct nls_table table = {
+	.charset	= "euc-jp",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+};
+
+static int __init init_nls_euc_jp(void)
+{
+	p_nls = load_nls("cp932");
+
+	if (p_nls) {
+		table.charset2upper = p_nls->charset2upper;
+		table.charset2lower = p_nls->charset2lower;
+		return register_nls(&table);
+	}
+
+	return -EINVAL;
+}
+
+static void __exit exit_nls_euc_jp(void)
+{
+	unregister_nls(&table);
+	unload_nls(p_nls);
+}
+
+module_init(init_nls_euc_jp)
+module_exit(exit_nls_euc_jp)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-1.c b/kernel/fs/nls/nls_iso8859-1.c
new file mode 100644
index 000000000..69ac020d4
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-1.c
@@ -0,0 +1,257 @@
+/*
+ * linux/fs/nls/nls_iso8859-1.c
+ *
+ * Charset iso8859-1 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x00a1, 0x00a2, 0x00a3,
+	0x00a4, 0x00a5, 0x00a6, 0x00a7,
+	0x00a8, 0x00a9, 0x00aa, 0x00ab,
+	0x00ac, 0x00ad, 0x00ae, 0x00af,
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x00b2, 0x00b3,
+	0x00b4, 0x00b5, 0x00b6, 0x00b7,
+	0x00b8, 0x00b9, 0x00ba, 0x00bb,
+	0x00bc, 0x00bd, 0x00be, 0x00bf,
+	/* 0xc0*/
+	0x00c0, 0x00c1, 0x00c2, 0x00c3,
+	0x00c4, 0x00c5, 0x00c6, 0x00c7,
+	0x00c8, 0x00c9, 0x00ca, 0x00cb,
+	0x00cc, 0x00cd, 0x00ce, 0x00cf,
+	/* 0xd0*/
+	0x00d0, 0x00d1, 0x00d2, 0x00d3,
+	0x00d4, 0x00d5, 0x00d6, 0x00d7,
+	0x00d8, 0x00d9, 0x00da, 0x00db,
+	0x00dc, 0x00dd, 0x00de, 0x00df,
+	/* 0xe0*/
+	0x00e0, 0x00e1, 0x00e2, 0x00e3,
+	0x00e4, 0x00e5, 0x00e6, 0x00e7,
+	0x00e8, 0x00e9, 0x00ea, 0x00eb,
+	0x00ec, 0x00ed, 0x00ee, 0x00ef,
+	/* 0xf0*/
+	0x00f0, 0x00f1, 0x00f2, 0x00f3,
+	0x00f4, 0x00f5, 0x00f6, 0x00f7,
+	0x00f8, 0x00f9, 0x00fa, 0x00fb,
+	0x00fc, 0x00fd, 0x00fe, 0x00ff,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-1",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_1(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_1(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_1)
+module_exit(exit_nls_iso8859_1)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-13.c b/kernel/fs/nls/nls_iso8859-13.c
new file mode 100644
index 000000000..afb3f8f27
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-13.c
@@ -0,0 +1,285 @@
+/*
+ * linux/fs/nls/nls_iso8859-13.c
+ *
+ * Charset iso8859-13 translation tables.
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x201d, 0x00a2, 0x00a3,
+	0x00a4, 0x201e, 0x00a6, 0x00a7,
+	0x00d8, 0x00a9, 0x0156, 0x00ab,
+	0x00ac, 0x00ad, 0x00ae, 0x00c6,
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x00b2, 0x00b3,
+	0x201c, 0x00b5, 0x00b6, 0x00b7,
+	0x00f8, 0x00b9, 0x0157, 0x00bb,
+	0x00bc, 0x00bd, 0x00be, 0x00e6,
+	/* 0xc0*/
+	0x0104, 0x012e, 0x0100, 0x0106,
+	0x00c4, 0x00c5, 0x0118, 0x0112,
+	0x010c, 0x00c9, 0x0179, 0x0116,
+	0x0122, 0x0136, 0x012a, 0x013b,
+	/* 0xd0*/
+	0x0160, 0x0143, 0x0145, 0x00d3,
+	0x014c, 0x00d5, 0x00d6, 0x00d7,
+	0x0172, 0x0141, 0x015a, 0x016a,
+	0x00dc, 0x017b, 0x017d, 0x00df,
+	/* 0xe0*/
+	0x0105, 0x012f, 0x0101, 0x0107,
+	0x00e4, 0x00e5, 0x0119, 0x0113,
+	0x010d, 0x00e9, 0x017a, 0x0117,
+	0x0123, 0x0137, 0x012b, 0x013c,
+	/* 0xf0*/
+	0x0161, 0x0144, 0x0146, 0x00f3,
+	0x014d, 0x00f5, 0x00f6, 0x00f7,
+	0x0173, 0x0142, 0x015b, 0x016b,
+	0x00fc, 0x017c, 0x017e, 0x2019,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0x00, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0xc4, 0xc5, 0xaf, 0x00, /* 0xc0-0xc7 */
+	0x00, 0xc9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0xd3, 0x00, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xa8, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0xe4, 0xe5, 0xbf, 0x00, /* 0xe0-0xe7 */
+	0x00, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0xf3, 0x00, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xb8, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0xc2, 0xe2, 0x00, 0x00, 0xc0, 0xe0, 0xc3, 0xe3, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0xc7, 0xe7, 0x00, 0x00, 0xcb, 0xeb, /* 0x10-0x17 */
+	0xc6, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0xce, 0xee, 0x00, 0x00, 0xc1, 0xe1, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xcd, 0xed, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0xcf, 0xef, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0xd9, 0xf9, 0xd1, 0xf1, 0xd2, 0xf2, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0xd4, 0xf4, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0xba, /* 0x50-0x57 */
+	0x00, 0x00, 0xda, 0xfa, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0xd8, 0xf8, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0xca, 0xea, 0xdd, 0xfd, 0xde, 0xfe, 0x00, /* 0x78-0x7f */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0xff, 0x00, 0x00, 0xb4, 0xa1, 0xa5, 0x00, /* 0x18-0x1f */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	  NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	  NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	  NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-13",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_13(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_13(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_13)
+module_exit(exit_nls_iso8859_13)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-14.c b/kernel/fs/nls/nls_iso8859-14.c
new file mode 100644
index 000000000..046370f0b
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-14.c
@@ -0,0 +1,341 @@
+/*
+ * linux/fs/nls/nls_iso8859-14.c
+ *
+ * Charset iso8859-14 translation tables.
+ *
+ * Generated automatically from the Unicode and charset table
+ * provided by the Unicode Organisation at
+ * http://www.unicode.org/
+ * The Unicode to charset table has only exact mappings.
+ *
+ * Rhys Jones, Swansea University Computer Society
+ * rhys@sucs.swan.ac.uk
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003, 
+	0x0004, 0x0005, 0x0006, 0x0007, 
+	0x0008, 0x0009, 0x000a, 0x000b, 
+	0x000c, 0x000d, 0x000e, 0x000f, 
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013, 
+	0x0014, 0x0015, 0x0016, 0x0017, 
+	0x0018, 0x0019, 0x001a, 0x001b, 
+	0x001c, 0x001d, 0x001e, 0x001f, 
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023, 
+	0x0024, 0x0025, 0x0026, 0x0027, 
+	0x0028, 0x0029, 0x002a, 0x002b, 
+	0x002c, 0x002d, 0x002e, 0x002f, 
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033, 
+	0x0034, 0x0035, 0x0036, 0x0037, 
+	0x0038, 0x0039, 0x003a, 0x003b, 
+	0x003c, 0x003d, 0x003e, 0x003f, 
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043, 
+	0x0044, 0x0045, 0x0046, 0x0047, 
+	0x0048, 0x0049, 0x004a, 0x004b, 
+	0x004c, 0x004d, 0x004e, 0x004f, 
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053, 
+	0x0054, 0x0055, 0x0056, 0x0057, 
+	0x0058, 0x0059, 0x005a, 0x005b, 
+	0x005c, 0x005d, 0x005e, 0x005f, 
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063, 
+	0x0064, 0x0065, 0x0066, 0x0067, 
+	0x0068, 0x0069, 0x006a, 0x006b, 
+	0x006c, 0x006d, 0x006e, 0x006f, 
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073, 
+	0x0074, 0x0075, 0x0076, 0x0077, 
+	0x0078, 0x0079, 0x007a, 0x007b, 
+	0x007c, 0x007d, 0x007e, 0x007f, 
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x1e02, 0x1e03, 0x00a3, 
+	0x010a, 0x010b, 0x1e0a, 0x00a7, 
+	0x1e80, 0x00a9, 0x1e82, 0x1e0b, 
+	0x1ef2, 0x00ad, 0x00ae, 0x0178, 
+	/* 0xb0*/
+	0x1e1e, 0x1e1f, 0x0120, 0x0121, 
+	0x1e40, 0x1e41, 0x00b6, 0x1e56, 
+	0x1e81, 0x1e57, 0x1e83, 0x1e60, 
+	0x1ef3, 0x1e84, 0x1e85, 0x1e61, 
+	/* 0xc0*/
+	0x00c0, 0x00c1, 0x00c2, 0x00c3, 
+	0x00c4, 0x00c5, 0x00c6, 0x00c7, 
+	0x00c8, 0x00c9, 0x00ca, 0x00cb, 
+	0x00cc, 0x00cd, 0x00ce, 0x00cf, 
+	/* 0xd0*/
+	0x0174, 0x00d1, 0x00d2, 0x00d3, 
+	0x00d4, 0x00d5, 0x00d6, 0x1e6a, 
+	0x00d8, 0x00d9, 0x00da, 0x00db, 
+	0x00dc, 0x00dd, 0x0176, 0x00df, 
+	/* 0xe0*/
+	0x00e0, 0x00e1, 0x00e2, 0x00e3, 
+	0x00e4, 0x00e5, 0x00e6, 0x00e7, 
+	0x00e8, 0x00e9, 0x00ea, 0x00eb, 
+	0x00ec, 0x00ed, 0x00ee, 0x00ef, 
+	/* 0xf0*/
+	0x0175, 0x00f1, 0x00f2, 0x00f3, 
+	0x00f4, 0x00f5, 0x00f6, 0x1e6b, 
+	0x00f8, 0x00f9, 0x00fa, 0x00fb, 
+	0x00fc, 0x00fd, 0x0177, 0x00ff, 
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */
+	0x00, 0xa9, 0x00, 0x00, 0x00, 0xad, 0xae, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb6, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0x00, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x00, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0x00, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0xa1, 0xa2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0xa6, 0xab, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0xb1, /* 0x18-0x1f */
+	0xb2, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0xb4, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, 0xb9, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xbb, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0xd7, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, 0xde, 0xfe, /* 0x70-0x77 */
+	0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+	
+	0xa8, 0xb8, 0xaa, 0xba, 0xbd, 0xbe, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page1e[256] = {
+	0x00, 0x00, 0xa1, 0xa2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0xa6, 0xab, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0xb1, /* 0x18-0x1f */
+	0xb2, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0xb4, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, 0xb9, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xbb, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0xd7, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, 0xde, 0xfe, /* 0x70-0x77 */
+	0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0xa8, 0xb8, 0xaa, 0xba, 0xbd, 0xbe, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00,	page01, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	page1e,	NULL,
+
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa2, 0xa2, 0xa3, 0xab, 0xab, 0xab, 0xa7, /* 0xa0-0xa7 */
+	0xb8, 0xa9, 0xba, 0xab, 0xbc, 0xad, 0xae, 0xff, /* 0xa8-0xaf */
+	0xb1, 0xb1, 0xb3, 0xb3, 0xb5, 0xb5, 0xb6, 0xb9, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbf, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa1, 0xa3, 0xa6, 0xa6, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xa6, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb0, 0xb2, 0xb2, 0xb4, 0xb4, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xa8, 0xb7, 0xaa, 0xbb, 0xac, 0xbd, 0xbd, 0xbb, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xaf, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-14",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_14(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_14(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_14)
+module_exit(exit_nls_iso8859_14)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-15.c b/kernel/fs/nls/nls_iso8859-15.c
new file mode 100644
index 000000000..7e34a841a
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-15.c
@@ -0,0 +1,307 @@
+/*
+ * linux/fs/nls/nls_iso8859-15.c
+ *
+ * Charset iso8859-15 translation tables.
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x00a1, 0x00a2, 0x00a3,
+	0x20ac, 0x00a5, 0x0160, 0x00a7,
+	0x0161, 0x00a9, 0x00aa, 0x00ab,
+	0x00ac, 0x00ad, 0x00ae, 0x00af,
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x00b2, 0x00b3,
+	0x017d, 0x00b5, 0x00b6, 0x00b7,
+	0x017e, 0x00b9, 0x00ba, 0x00bb,
+	0x0152, 0x0153, 0x0178, 0x00bf,
+	/* 0xc0*/
+	0x00c0, 0x00c1, 0x00c2, 0x00c3,
+	0x00c4, 0x00c5, 0x00c6, 0x00c7,
+	0x00c8, 0x00c9, 0x00ca, 0x00cb,
+	0x00cc, 0x00cd, 0x00ce, 0x00cf,
+	/* 0xd0*/
+	0x00d0, 0x00d1, 0x00d2, 0x00d3,
+	0x00d4, 0x00d5, 0x00d6, 0x00d7,
+	0x00d8, 0x00d9, 0x00da, 0x00db,
+	0x00dc, 0x00dd, 0x00de, 0x00df,
+	/* 0xe0*/
+	0x00e0, 0x00e1, 0x00e2, 0x00e3,
+	0x00e4, 0x00e5, 0x00e6, 0x00e7,
+	0x00e8, 0x00e9, 0x00ea, 0x00eb,
+	0x00ec, 0x00ed, 0x00ee, 0x00ef,
+	/* 0xf0*/
+	0x00f0, 0x00f1, 0x00f2, 0x00f3,
+	0x00f4, 0x00f5, 0x00f6, 0x00f7,
+	0x00f8, 0x00f9, 0x00fa, 0x00fb,
+	0x00fc, 0x00fd, 0x00fe, 0x00ff,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0xa5, 0x00, 0xa7, /* 0xa0-0xa7 */
+	0x00, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0x00, 0xb9, 0xba, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0xbc, 0xbd, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xa6, 0xa8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0xbe, 0x00, 0x00, 0x00, 0x00, 0xb4, 0xb8, 0x00, /* 0x78-0x7f */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01,	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+
+	page20,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb8, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbd, 0xbd, 0xff, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa6, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb4, 0xb9, 0xba, 0xbb, 0xbc, 0xbc, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xbe, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-15",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_15(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_15(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_15)
+module_exit(exit_nls_iso8859_15)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-2.c b/kernel/fs/nls/nls_iso8859-2.c
new file mode 100644
index 000000000..7dd571181
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-2.c
@@ -0,0 +1,308 @@
+/*
+ * linux/fs/nls/nls_iso8859-2.c
+ *
+ * Charset iso8859-2 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x0104, 0x02d8, 0x0141,
+	0x00a4, 0x013d, 0x015a, 0x00a7,
+	0x00a8, 0x0160, 0x015e, 0x0164,
+	0x0179, 0x00ad, 0x017d, 0x017b,
+	/* 0xb0*/
+	0x00b0, 0x0105, 0x02db, 0x0142,
+	0x00b4, 0x013e, 0x015b, 0x02c7,
+	0x00b8, 0x0161, 0x015f, 0x0165,
+	0x017a, 0x02dd, 0x017e, 0x017c,
+	/* 0xc0*/
+	0x0154, 0x00c1, 0x00c2, 0x0102,
+	0x00c4, 0x0139, 0x0106, 0x00c7,
+	0x010c, 0x00c9, 0x0118, 0x00cb,
+	0x011a, 0x00cd, 0x00ce, 0x010e,
+	/* 0xd0*/
+	0x0110, 0x0143, 0x0147, 0x00d3,
+	0x00d4, 0x0150, 0x00d6, 0x00d7,
+	0x0158, 0x016e, 0x00da, 0x0170,
+	0x00dc, 0x00dd, 0x0162, 0x00df,
+	/* 0xe0*/
+	0x0155, 0x00e1, 0x00e2, 0x0103,
+	0x00e4, 0x013a, 0x0107, 0x00e7,
+	0x010d, 0x00e9, 0x0119, 0x00eb,
+	0x011b, 0x00ed, 0x00ee, 0x010f,
+	/* 0xf0*/
+	0x0111, 0x0144, 0x0148, 0x00f3,
+	0x00f4, 0x0151, 0x00f6, 0x00f7,
+	0x0159, 0x016f, 0x00fa, 0x0171,
+	0x00fc, 0x00fd, 0x0163, 0x02d9,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
+	0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */
+	0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */
+	0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0xc3, 0xe3, 0xa1, 0xb1, 0xc6, 0xe6, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */
+	0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0xc5, 0xe5, 0x00, 0x00, 0xa5, 0xb5, 0x00, /* 0x38-0x3f */
+	0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */
+	0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */
+	0xd8, 0xf8, 0xa6, 0xb6, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */
+	0xa9, 0xb9, 0xde, 0xfe, 0xab, 0xbb, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */
+	0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0xac, 0xbc, 0xaf, 0xbf, 0xae, 0xbe, 0x00, /* 0x78-0x7f */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-2",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_2(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_2(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_2)
+module_exit(exit_nls_iso8859_2)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-3.c b/kernel/fs/nls/nls_iso8859-3.c
new file mode 100644
index 000000000..740b75ec4
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-3.c
@@ -0,0 +1,308 @@
+/*
+ * linux/fs/nls/nls_iso8859-3.c
+ *
+ * Charset iso8859-3 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x0126, 0x02d8, 0x00a3,
+	0x00a4, 0x0000, 0x0124, 0x00a7,
+	0x00a8, 0x0130, 0x015e, 0x011e,
+	0x0134, 0x00ad, 0x0000, 0x017b,
+	/* 0xb0*/
+	0x00b0, 0x0127, 0x00b2, 0x00b3,
+	0x00b4, 0x00b5, 0x0125, 0x00b7,
+	0x00b8, 0x0131, 0x015f, 0x011f,
+	0x0135, 0x00bd, 0x0000, 0x017c,
+	/* 0xc0*/
+	0x00c0, 0x00c1, 0x00c2, 0x0000,
+	0x00c4, 0x010a, 0x0108, 0x00c7,
+	0x00c8, 0x00c9, 0x00ca, 0x00cb,
+	0x00cc, 0x00cd, 0x00ce, 0x00cf,
+	/* 0xd0*/
+	0x0000, 0x00d1, 0x00d2, 0x00d3,
+	0x00d4, 0x0120, 0x00d6, 0x00d7,
+	0x011c, 0x00d9, 0x00da, 0x00db,
+	0x00dc, 0x016c, 0x015c, 0x00df,
+	/* 0xe0*/
+	0x00e0, 0x00e1, 0x00e2, 0x0000,
+	0x00e4, 0x010b, 0x0109, 0x00e7,
+	0x00e8, 0x00e9, 0x00ea, 0x00eb,
+	0x00ec, 0x00ed, 0x00ee, 0x00ef,
+	/* 0xf0*/
+	0x0000, 0x00f1, 0x00f2, 0x00f3,
+	0x00f4, 0x0121, 0x00f6, 0x00f7,
+	0x011d, 0x00f9, 0x00fa, 0x00fb,
+	0x00fc, 0x016d, 0x015d, 0x02d9,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0xa3, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
+	0xb0, 0x00, 0xb2, 0xb3, 0xb4, 0xb5, 0x00, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0x00, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0x00, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0xc6, 0xe6, 0xc5, 0xe5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0xd8, 0xf8, 0xab, 0xbb, /* 0x18-0x1f */
+	0xd5, 0xf5, 0x00, 0x00, 0xa6, 0xb6, 0xa1, 0xb1, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xa9, 0xb9, 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xaa, 0xba, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0xdd, 0xfd, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0xaf, 0xbf, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0xa2, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xb1, 0xa2, 0xa3, 0xa4, 0x00, 0xb6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0x69, 0xba, 0xbb, 0xbc, 0xad, 0x00, 0xbf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0x00, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xa1, 0xb2, 0xb3, 0xb4, 0x00, 0xa6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0x49, 0xaa, 0xab, 0xac, 0xbd, 0x00, 0xaf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-3",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_3(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_3(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_3)
+module_exit(exit_nls_iso8859_3)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-4.c b/kernel/fs/nls/nls_iso8859-4.c
new file mode 100644
index 000000000..8826021e3
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-4.c
@@ -0,0 +1,308 @@
+/*
+ * linux/fs/nls/nls_iso8859-4.c
+ *
+ * Charset iso8859-4 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x0104, 0x0138, 0x0156,
+	0x00a4, 0x0128, 0x013b, 0x00a7,
+	0x00a8, 0x0160, 0x0112, 0x0122,
+	0x0166, 0x00ad, 0x017d, 0x00af,
+	/* 0xb0*/
+	0x00b0, 0x0105, 0x02db, 0x0157,
+	0x00b4, 0x0129, 0x013c, 0x02c7,
+	0x00b8, 0x0161, 0x0113, 0x0123,
+	0x0167, 0x014a, 0x017e, 0x014b,
+	/* 0xc0*/
+	0x0100, 0x00c1, 0x00c2, 0x00c3,
+	0x00c4, 0x00c5, 0x00c6, 0x012e,
+	0x010c, 0x00c9, 0x0118, 0x00cb,
+	0x0116, 0x00cd, 0x00ce, 0x012a,
+	/* 0xd0*/
+	0x0110, 0x0145, 0x014c, 0x0136,
+	0x00d4, 0x00d5, 0x00d6, 0x00d7,
+	0x00d8, 0x0172, 0x00da, 0x00db,
+	0x00dc, 0x0168, 0x016a, 0x00df,
+	/* 0xe0*/
+	0x0101, 0x00e1, 0x00e2, 0x00e3,
+	0x00e4, 0x00e5, 0x00e6, 0x012f,
+	0x010d, 0x00e9, 0x0119, 0x00eb,
+	0x0117, 0x00ed, 0x00ee, 0x012b,
+	/* 0xf0*/
+	0x0111, 0x0146, 0x014d, 0x0137,
+	0x00f4, 0x00f5, 0x00f6, 0x00f7,
+	0x00f8, 0x0173, 0x00fa, 0x00fb,
+	0x00fc, 0x0169, 0x016b, 0x02d9,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0x00, /* 0xc0-0xc7 */
+	0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0x00, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */
+	0x00, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0x00, /* 0xe0-0xe7 */
+	0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0x00, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0xc0, 0xe0, 0x00, 0x00, 0xa1, 0xb1, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */
+	0xd0, 0xf0, 0xaa, 0xba, 0x00, 0x00, 0xcc, 0xec, /* 0x10-0x17 */
+	0xca, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0xab, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0xa5, 0xb5, 0xcf, 0xef, 0x00, 0x00, 0xc7, 0xe7, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd3, 0xf3, /* 0x30-0x37 */
+	0xa2, 0x00, 0x00, 0xa6, 0xb6, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xd1, 0xf1, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0xbd, 0xbf, 0xd2, 0xf2, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa3, 0xb3, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0xa9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0xac, 0xbc, /* 0x60-0x67 */
+	0xdd, 0xfd, 0xde, 0xfe, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0xd9, 0xf9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xae, 0xbe, 0x00, /* 0x78-0x7f */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0xff, 0x00, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, page02, NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-4",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_4(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_4(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_4)
+module_exit(exit_nls_iso8859_4)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-5.c b/kernel/fs/nls/nls_iso8859-5.c
new file mode 100644
index 000000000..7c04057a1
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-5.c
@@ -0,0 +1,272 @@
+/*
+ * linux/fs/nls/nls_iso8859-5.c
+ *
+ * Charset iso8859-5 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x0401, 0x0402, 0x0403,
+	0x0404, 0x0405, 0x0406, 0x0407,
+	0x0408, 0x0409, 0x040a, 0x040b,
+	0x040c, 0x00ad, 0x040e, 0x040f,
+	/* 0xb0*/
+	0x0410, 0x0411, 0x0412, 0x0413,
+	0x0414, 0x0415, 0x0416, 0x0417,
+	0x0418, 0x0419, 0x041a, 0x041b,
+	0x041c, 0x041d, 0x041e, 0x041f,
+	/* 0xc0*/
+	0x0420, 0x0421, 0x0422, 0x0423,
+	0x0424, 0x0425, 0x0426, 0x0427,
+	0x0428, 0x0429, 0x042a, 0x042b,
+	0x042c, 0x042d, 0x042e, 0x042f,
+	/* 0xd0*/
+	0x0430, 0x0431, 0x0432, 0x0433,
+	0x0434, 0x0435, 0x0436, 0x0437,
+	0x0438, 0x0439, 0x043a, 0x043b,
+	0x043c, 0x043d, 0x043e, 0x043f,
+	/* 0xe0*/
+	0x0440, 0x0441, 0x0442, 0x0443,
+	0x0444, 0x0445, 0x0446, 0x0447,
+	0x0448, 0x0449, 0x044a, 0x044b,
+	0x044c, 0x044d, 0x044e, 0x044f,
+	/* 0xf0*/
+	0x2116, 0x0451, 0x0452, 0x0453,
+	0x0454, 0x0455, 0x0456, 0x0457,
+	0x0458, 0x0459, 0x045a, 0x045b,
+	0x045c, 0x00a7, 0x045e, 0x045f,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
+};
+
+static const unsigned char page04[256] = {
+	0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, /* 0x08-0x0f */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x38-0x3f */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */
+	0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0xfe, 0xff, /* 0x58-0x5f */
+};
+
+static const unsigned char page21[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, /* 0x10-0x17 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   page04, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   page21, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xa0-0xa7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xad, 0xfe, 0xff, /* 0xa8-0xaf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xb0-0xb7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xd0-0xd7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xf0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xf0-0xf7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xfd, 0xae, 0xaf, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-5",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_5(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_5(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_5)
+module_exit(exit_nls_iso8859_5)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-6.c b/kernel/fs/nls/nls_iso8859-6.c
new file mode 100644
index 000000000..d4a881400
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-6.c
@@ -0,0 +1,263 @@
+/*
+ * linux/fs/nls/nls_iso8859-6.c
+ *
+ * Charset iso8859-6 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0660, 0x0661, 0x0662, 0x0663,
+	0x0664, 0x0665, 0x0666, 0x0667,
+	0x0668, 0x0669, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x0000, 0x0000, 0x0000,
+	0x00a4, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x060c, 0x00ad, 0x0000, 0x0000,
+	/* 0xb0*/
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x061b,
+	0x0000, 0x0000, 0x0000, 0x061f,
+	/* 0xc0*/
+	0x0000, 0x0621, 0x0622, 0x0623,
+	0x0624, 0x0625, 0x0626, 0x0627,
+	0x0628, 0x0629, 0x062a, 0x062b,
+	0x062c, 0x062d, 0x062e, 0x062f,
+	/* 0xd0*/
+	0x0630, 0x0631, 0x0632, 0x0633,
+	0x0634, 0x0635, 0x0636, 0x0637,
+	0x0638, 0x0639, 0x063a, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	/* 0xe0*/
+	0x0640, 0x0641, 0x0642, 0x0643,
+	0x0644, 0x0645, 0x0646, 0x0647,
+	0x0648, 0x0649, 0x064a, 0x064b,
+	0x064c, 0x064d, 0x064e, 0x064f,
+	/* 0xf0*/
+	0x0650, 0x0651, 0x0652, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
+};
+
+static const unsigned char page06[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */
+	0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */
+	0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */
+	0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x60-0x67 */
+	0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   NULL,   NULL,   page06, NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */
+	0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */
+	0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-6",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_6(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_6(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_6)
+module_exit(exit_nls_iso8859_6)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-7.c b/kernel/fs/nls/nls_iso8859-7.c
new file mode 100644
index 000000000..37b75d825
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-7.c
@@ -0,0 +1,317 @@
+/*
+ * linux/fs/nls/nls_iso8859-7.c
+ *
+ * Charset iso8859-7 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x02bd, 0x02bc, 0x00a3,
+	0x0000, 0x0000, 0x00a6, 0x00a7,
+	0x00a8, 0x00a9, 0x0000, 0x00ab,
+	0x00ac, 0x00ad, 0x0000, 0x2015,
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x00b2, 0x00b3,
+	0x0384, 0x0385, 0x0386, 0x00b7,
+	0x0388, 0x0389, 0x038a, 0x00bb,
+	0x038c, 0x00bd, 0x038e, 0x038f,
+	/* 0xc0*/
+	0x0390, 0x0391, 0x0392, 0x0393,
+	0x0394, 0x0395, 0x0396, 0x0397,
+	0x0398, 0x0399, 0x039a, 0x039b,
+	0x039c, 0x039d, 0x039e, 0x039f,
+	/* 0xd0*/
+	0x03a0, 0x03a1, 0x0000, 0x03a3,
+	0x03a4, 0x03a5, 0x03a6, 0x03a7,
+	0x03a8, 0x03a9, 0x03aa, 0x03ab,
+	0x03ac, 0x03ad, 0x03ae, 0x03af,
+	/* 0xe0*/
+	0x03b0, 0x03b1, 0x03b2, 0x03b3,
+	0x03b4, 0x03b5, 0x03b6, 0x03b7,
+	0x03b8, 0x03b9, 0x03ba, 0x03bb,
+	0x03bc, 0x03bd, 0x03be, 0x03bf,
+	/* 0xf0*/
+	0x03c0, 0x03c1, 0x03c2, 0x03c3,
+	0x03c4, 0x03c5, 0x03c6, 0x03c7,
+	0x03c8, 0x03c9, 0x03ca, 0x03cb,
+	0x03cc, 0x03cd, 0x03ce, 0x0000,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0x00, 0x00, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0x00, 0x00, 0xb7, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0xbb, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */
+};
+
+static const unsigned char page02[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0xa2, 0xa1, 0x00, 0x00, /* 0xb8-0xbf */
+};
+
+static const unsigned char page03[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0x00, /* 0x80-0x87 */
+	0xb8, 0xb9, 0xba, 0x00, 0xbc, 0x00, 0xbe, 0xbf, /* 0x88-0x8f */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x90-0x97 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x98-0x9f */
+	0xd0, 0xd1, 0x00, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xa0-0xa7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xa8-0xaf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xb0-0xb7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xb8-0xbf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xc0-0xc7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xc8-0xcf */
+};
+
+static const unsigned char page20[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, /* 0x10-0x17 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   page02, page03, NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	page20, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xdc, 0xb7, /* 0xb0-0xb7 */
+	0xdd, 0xde, 0xdf, 0xbb, 0xfc, 0xbd, 0xfd, 0xfe, /* 0xb8-0xbf */
+	0xc0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0x00, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xb6, 0xb8, 0xb9, 0xba, /* 0xd8-0xdf */
+	0xe0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd3, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xbc, 0xbe, 0xbf, 0x00, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-7",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_7(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_7(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_7)
+module_exit(exit_nls_iso8859_7)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_iso8859-9.c b/kernel/fs/nls/nls_iso8859-9.c
new file mode 100644
index 000000000..557b98250
--- /dev/null
+++ b/kernel/fs/nls/nls_iso8859-9.c
@@ -0,0 +1,272 @@
+/*
+ * linux/fs/nls/nls_iso8859-9.c
+ *
+ * Charset iso8859-9 translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x0080, 0x0081, 0x0082, 0x0083,
+	0x0084, 0x0085, 0x0086, 0x0087,
+	0x0088, 0x0089, 0x008a, 0x008b,
+	0x008c, 0x008d, 0x008e, 0x008f,
+	/* 0x90*/
+	0x0090, 0x0091, 0x0092, 0x0093,
+	0x0094, 0x0095, 0x0096, 0x0097,
+	0x0098, 0x0099, 0x009a, 0x009b,
+	0x009c, 0x009d, 0x009e, 0x009f,
+	/* 0xa0*/
+	0x00a0, 0x00a1, 0x00a2, 0x00a3,
+	0x00a4, 0x00a5, 0x00a6, 0x00a7,
+	0x00a8, 0x00a9, 0x00aa, 0x00ab,
+	0x00ac, 0x00ad, 0x00ae, 0x00af,
+	/* 0xb0*/
+	0x00b0, 0x00b1, 0x00b2, 0x00b3,
+	0x00b4, 0x00b5, 0x00b6, 0x00b7,
+	0x00b8, 0x00b9, 0x00ba, 0x00bb,
+	0x00bc, 0x00bd, 0x00be, 0x00bf,
+	/* 0xc0*/
+	0x00c0, 0x00c1, 0x00c2, 0x00c3,
+	0x00c4, 0x00c5, 0x00c6, 0x00c7,
+	0x00c8, 0x00c9, 0x00ca, 0x00cb,
+	0x00cc, 0x00cd, 0x00ce, 0x00cf,
+	/* 0xd0*/
+	0x011e, 0x00d1, 0x00d2, 0x00d3,
+	0x00d4, 0x00d5, 0x00d6, 0x00d7,
+	0x00d8, 0x00d9, 0x00da, 0x00db,
+	0x00dc, 0x0130, 0x015e, 0x00df,
+	/* 0xe0*/
+	0x00e0, 0x00e1, 0x00e2, 0x00e3,
+	0x00e4, 0x00e5, 0x00e6, 0x00e7,
+	0x00e8, 0x00e9, 0x00ea, 0x00eb,
+	0x00ec, 0x00ed, 0x00ee, 0x00ef,
+	/* 0xf0*/
+	0x011f, 0x00f1, 0x00f2, 0x00f3,
+	0x00f4, 0x00f5, 0x00f6, 0x00f7,
+	0x00f8, 0x00f9, 0x00fa, 0x00fb,
+	0x00fc, 0x0131, 0x015f, 0x00ff,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char page01[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0xdd, 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, /* 0x58-0x5f */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, page01, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x69, 0xfe, 0xdf, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x49, 0xde, 0x00, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "iso8859-9",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_iso8859_9(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_iso8859_9(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_iso8859_9)
+module_exit(exit_nls_iso8859_9)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_koi8-r.c b/kernel/fs/nls/nls_koi8-r.c
new file mode 100644
index 000000000..811f232fc
--- /dev/null
+++ b/kernel/fs/nls/nls_koi8-r.c
@@ -0,0 +1,323 @@
+/*
+ * linux/fs/nls/nls_koi8-r.c
+ *
+ * Charset koi8-r translation tables.
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x2500, 0x2502, 0x250c, 0x2510,
+	0x2514, 0x2518, 0x251c, 0x2524,
+	0x252c, 0x2534, 0x253c, 0x2580,
+	0x2584, 0x2588, 0x258c, 0x2590,
+	/* 0x90*/
+	0x2591, 0x2592, 0x2593, 0x2320,
+	0x25a0, 0x2219, 0x221a, 0x2248,
+	0x2264, 0x2265, 0x00a0, 0x2321,
+	0x00b0, 0x00b2, 0x00b7, 0x00f7,
+	/* 0xa0*/
+	0x2550, 0x2551, 0x2552, 0x0451,
+	0x2553, 0x2554, 0x2555, 0x2556,
+	0x2557, 0x2558, 0x2559, 0x255a,
+	0x255b, 0x255c, 0x255d, 0x255e,
+	/* 0xb0*/
+	0x255f, 0x2560, 0x2561, 0x0401,
+	0x2562, 0x2563, 0x2564, 0x2565,
+	0x2566, 0x2567, 0x2568, 0x2569,
+	0x256a, 0x256b, 0x256c, 0x00a9,
+	/* 0xc0*/
+	0x044e, 0x0430, 0x0431, 0x0446,
+	0x0434, 0x0435, 0x0444, 0x0433,
+	0x0445, 0x0438, 0x0439, 0x043a,
+	0x043b, 0x043c, 0x043d, 0x043e,
+	/* 0xd0*/
+	0x043f, 0x044f, 0x0440, 0x0441,
+	0x0442, 0x0443, 0x0436, 0x0432,
+	0x044c, 0x044b, 0x0437, 0x0448,
+	0x044d, 0x0449, 0x0447, 0x044a,
+	/* 0xe0*/
+	0x042e, 0x0410, 0x0411, 0x0426,
+	0x0414, 0x0415, 0x0424, 0x0413,
+	0x0425, 0x0418, 0x0419, 0x041a,
+	0x041b, 0x041c, 0x041d, 0x041e,
+	/* 0xf0*/
+	0x041f, 0x042f, 0x0420, 0x0421,
+	0x0422, 0x0423, 0x0416, 0x0412,
+	0x042c, 0x042b, 0x0417, 0x0428,
+	0x042d, 0x0429, 0x0427, 0x042a,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */
+};
+
+static const unsigned char page04[256] = {
+	0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */
+	0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */
+	0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */
+	0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */
+	0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */
+	0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */
+	0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */
+	0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */
+	0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page23[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char page25[256] = {
+	0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xa0, 0xa1, 0xa2, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, /* 0x50-0x57 */
+	0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */
+	0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, /* 0x60-0x67 */
+	0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   page04, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   page22, page23, NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "koi8-r",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_koi8_r(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_koi8_r(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_koi8_r)
+module_exit(exit_nls_koi8_r)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_koi8-ru.c b/kernel/fs/nls/nls_koi8-ru.c
new file mode 100644
index 000000000..a80a741a8
--- /dev/null
+++ b/kernel/fs/nls/nls_koi8-ru.c
@@ -0,0 +1,82 @@
+/*
+ * linux/fs/nls/nls_koi8-ru.c
+ *
+ * Charset koi8-ru translation based on charset koi8-u.
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static struct nls_table *p_nls;
+
+static int uni2char(const wchar_t uni,
+		    unsigned char *out, int boundlen)
+{
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if ((uni & 0xffaf) == 0x040e || (uni & 0xffce) == 0x254c) {
+		/* koi8-ru and koi8-u differ only on two characters */
+		if (uni == 0x040e)
+			out[0] = 0xbe;
+		else if (uni == 0x045e)
+			out[0] = 0xae;
+		else if (uni == 0x255d || uni == 0x256c)
+			return 0;
+		else
+			return p_nls->uni2char(uni, out, boundlen);
+		return 1;
+	}
+	else
+		/* fast path */
+		return p_nls->uni2char(uni, out, boundlen);
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen,
+		    wchar_t *uni)
+{
+	int n;
+
+	if ((*rawstring & 0xef) != 0xae) {
+		/* koi8-ru and koi8-u differ only on two characters */
+		*uni = (*rawstring & 0x10) ? 0x040e : 0x045e;
+		return 1;
+	}
+
+	n = p_nls->char2uni(rawstring, boundlen, uni);
+	return n;
+}
+
+static struct nls_table table = {
+	.charset	= "koi8-ru",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+};
+
+static int __init init_nls_koi8_ru(void)
+{
+	p_nls = load_nls("koi8-u");
+
+	if (p_nls) {
+		table.charset2upper = p_nls->charset2upper;
+		table.charset2lower = p_nls->charset2lower;
+		return register_nls(&table);
+	}
+
+	return -EINVAL;
+}
+
+static void __exit exit_nls_koi8_ru(void)
+{
+	unregister_nls(&table);
+	unload_nls(p_nls);
+}
+
+module_init(init_nls_koi8_ru)
+module_exit(exit_nls_koi8_ru)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_koi8-u.c b/kernel/fs/nls/nls_koi8-u.c
new file mode 100644
index 000000000..7e029e4c1
--- /dev/null
+++ b/kernel/fs/nls/nls_koi8-u.c
@@ -0,0 +1,330 @@
+/*
+ * linux/fs/nls/nls_koi8-u.c
+ *
+ * Charset koi8-u translation tables.
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+	/* 0x00*/
+	0x0000, 0x0001, 0x0002, 0x0003,
+	0x0004, 0x0005, 0x0006, 0x0007,
+	0x0008, 0x0009, 0x000a, 0x000b,
+	0x000c, 0x000d, 0x000e, 0x000f,
+	/* 0x10*/
+	0x0010, 0x0011, 0x0012, 0x0013,
+	0x0014, 0x0015, 0x0016, 0x0017,
+	0x0018, 0x0019, 0x001a, 0x001b,
+	0x001c, 0x001d, 0x001e, 0x001f,
+	/* 0x20*/
+	0x0020, 0x0021, 0x0022, 0x0023,
+	0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002a, 0x002b,
+	0x002c, 0x002d, 0x002e, 0x002f,
+	/* 0x30*/
+	0x0030, 0x0031, 0x0032, 0x0033,
+	0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003a, 0x003b,
+	0x003c, 0x003d, 0x003e, 0x003f,
+	/* 0x40*/
+	0x0040, 0x0041, 0x0042, 0x0043,
+	0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b,
+	0x004c, 0x004d, 0x004e, 0x004f,
+	/* 0x50*/
+	0x0050, 0x0051, 0x0052, 0x0053,
+	0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x005b,
+	0x005c, 0x005d, 0x005e, 0x005f,
+	/* 0x60*/
+	0x0060, 0x0061, 0x0062, 0x0063,
+	0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006a, 0x006b,
+	0x006c, 0x006d, 0x006e, 0x006f,
+	/* 0x70*/
+	0x0070, 0x0071, 0x0072, 0x0073,
+	0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007a, 0x007b,
+	0x007c, 0x007d, 0x007e, 0x007f,
+	/* 0x80*/
+	0x2500, 0x2502, 0x250c, 0x2510,
+	0x2514, 0x2518, 0x251c, 0x2524,
+	0x252c, 0x2534, 0x253c, 0x2580,
+	0x2584, 0x2588, 0x258c, 0x2590,
+	/* 0x90*/
+	0x2591, 0x2592, 0x2593, 0x2320,
+	0x25a0, 0x2219, 0x221a, 0x2248,
+	0x2264, 0x2265, 0x00a0, 0x2321,
+	0x00b0, 0x00b2, 0x00b7, 0x00f7,
+	/* 0xa0*/
+	0x2550, 0x2551, 0x2552, 0x0451,
+	0x0454, 0x2554, 0x0456, 0x0457,
+	0x2557, 0x2558, 0x2559, 0x255a,
+	0x255b, 0x0491, 0x255d, 0x255e,
+	/* 0xb0*/
+	0x255f, 0x2560, 0x2561, 0x0401,
+	0x0404, 0x2563, 0x0406, 0x0407,
+	0x2566, 0x2567, 0x2568, 0x2569,
+	0x256a, 0x0490, 0x256c, 0x00a9,
+	/* 0xc0*/
+	0x044e, 0x0430, 0x0431, 0x0446,
+	0x0434, 0x0435, 0x0444, 0x0433,
+	0x0445, 0x0438, 0x0439, 0x043a,
+	0x043b, 0x043c, 0x043d, 0x043e,
+	/* 0xd0*/
+	0x043f, 0x044f, 0x0440, 0x0441,
+	0x0442, 0x0443, 0x0436, 0x0432,
+	0x044c, 0x044b, 0x0437, 0x0448,
+	0x044d, 0x0449, 0x0447, 0x044a,
+	/* 0xe0*/
+	0x042e, 0x0410, 0x0411, 0x0426,
+	0x0414, 0x0415, 0x0424, 0x0413,
+	0x0425, 0x0418, 0x0419, 0x041a,
+	0x041b, 0x041c, 0x041d, 0x041e,
+	/* 0xf0*/
+	0x041f, 0x042f, 0x0420, 0x0421,
+	0x0422, 0x0423, 0x0416, 0x0412,
+	0x042c, 0x042b, 0x0417, 0x0428,
+	0x042d, 0x0429, 0x0427, 0x042a,
+};
+
+static const unsigned char page00[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+	0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+	0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */
+};
+
+static const unsigned char page04[256] = {
+	0x00, 0xb3, 0x00, 0x00, 0xb4, 0x00, 0xb6, 0xb7, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */
+	0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */
+	0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */
+	0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */
+	0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */
+	0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */
+	0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */
+	0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */
+	0x00, 0xa3, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0xbd, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+};
+
+static const unsigned char page22[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+	0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */
+};
+
+static const unsigned char page23[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+};
+
+static const unsigned char page25[256] = {
+	0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+	0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+	0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+	0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+	0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+	0xa0, 0xa1, 0xa2, 0x00, 0xa5, 0x00, 0x00, 0xa8, /* 0x50-0x57 */
+	0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */
+	0xb1, 0xb2, 0x00, 0xb5, 0x00, 0x00, 0xb8, 0xb9, /* 0x60-0x67 */
+	0xba, 0xbb, 0xbc, 0x00, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+	0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+	0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+	0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+	0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+};
+
+static const unsigned char *const page_uni2charset[256] = {
+	page00, NULL,   NULL,   NULL,   page04, NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
+	NULL,   NULL,   page22, page23, NULL,   page25, NULL,   NULL,   
+};
+
+static const unsigned char charset2lower[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
+	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xa3, 0xa4, 0xb5, 0xa6, 0xa7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
+	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
+	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
+	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */
+	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */
+};
+
+static const unsigned char charset2upper[256] = {
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
+	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
+	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
+	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
+	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
+	0xa0, 0xa1, 0xa2, 0xb3, 0xb4, 0xa5, 0xb6, 0xb7, /* 0xa0-0xa7 */
+	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xa8-0xaf */
+	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
+	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */
+	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
+	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
+	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
+	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
+};
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni & 0x00ff;
+	unsigned char ch = (uni & 0xff00) >> 8;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset && uni2charset[cl])
+		out[0] = uni2charset[cl];
+	else
+		return -EINVAL;
+	return 1;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	*uni = charset2uni[*rawstring];
+	if (*uni == 0x0000)
+		return -EINVAL;
+	return 1;
+}
+
+static struct nls_table table = {
+	.charset	= "koi8-u",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= charset2lower,
+	.charset2upper	= charset2upper,
+};
+
+static int __init init_nls_koi8_u(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_koi8_u(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_koi8_u)
+module_exit(exit_nls_koi8_u)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/nls/nls_utf8.c b/kernel/fs/nls/nls_utf8.c
new file mode 100644
index 000000000..afcfbc4a1
--- /dev/null
+++ b/kernel/fs/nls/nls_utf8.c
@@ -0,0 +1,67 @@
+/*
+ * Module for handling utf8 just like any other charset.
+ * By Urban Widmark 2000
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static unsigned char identity[256];
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	n = utf32_to_utf8(uni, out, boundlen);
+	if (n < 0) {
+		*out = '?';
+		return -EINVAL;
+	}
+	return n;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	int n;
+	unicode_t u;
+
+	n = utf8_to_utf32(rawstring, boundlen, &u);
+	if (n < 0 || u > MAX_WCHAR_T) {
+		*uni = 0x003f;	/* ? */
+		return -EINVAL;
+	}
+	*uni = (wchar_t) u;
+	return n;
+}
+
+static struct nls_table table = {
+	.charset	= "utf8",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+	.charset2lower	= identity,	/* no conversion */
+	.charset2upper	= identity,
+};
+
+static int __init init_nls_utf8(void)
+{
+	int i;
+	for (i=0; i<256; i++)
+		identity[i] = i;
+
+        return register_nls(&table);
+}
+
+static void __exit exit_nls_utf8(void)
+{
+        unregister_nls(&table);
+}
+
+module_init(init_nls_utf8)
+module_exit(exit_nls_utf8)
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kernel/fs/no-block.c b/kernel/fs/no-block.c
new file mode 100644
index 000000000..6e40e42a4
--- /dev/null
+++ b/kernel/fs/no-block.c
@@ -0,0 +1,23 @@
+/* no-block.c: implementation of routines required for non-BLOCK configuration
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+
+static int no_blkdev_open(struct inode * inode, struct file * filp)
+{
+	return -ENODEV;
+}
+
+const struct file_operations def_blk_fops = {
+	.open		= no_blkdev_open,
+	.llseek		= noop_llseek,
+};
diff --git a/kernel/fs/notify/Kconfig b/kernel/fs/notify/Kconfig
new file mode 100644
index 000000000..2a24249b3
--- /dev/null
+++ b/kernel/fs/notify/Kconfig
@@ -0,0 +1,7 @@
+config FSNOTIFY
+	def_bool n
+	select SRCU
+
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/kernel/fs/notify/Makefile b/kernel/fs/notify/Makefile
new file mode 100644
index 000000000..96d3420d0
--- /dev/null
+++ b/kernel/fs/notify/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o \
+				   mark.o vfsmount_mark.o fdinfo.o
+
+obj-y			+= dnotify/
+obj-y			+= inotify/
+obj-y			+= fanotify/
diff --git a/kernel/fs/notify/dnotify/Kconfig b/kernel/fs/notify/dnotify/Kconfig
new file mode 100644
index 000000000..f9c1ca139
--- /dev/null
+++ b/kernel/fs/notify/dnotify/Kconfig
@@ -0,0 +1,11 @@
+config DNOTIFY
+	bool "Dnotify support"
+	select FSNOTIFY
+	default y
+	help
+	  Dnotify is a directory-based per-fd file change notification system
+	  that uses signals to communicate events to user-space.  There exist
+	  superior alternatives, but some applications may still rely on
+	  dnotify.
+
+	  If unsure, say Y.
diff --git a/kernel/fs/notify/dnotify/Makefile b/kernel/fs/notify/dnotify/Makefile
new file mode 100644
index 000000000..f145251dc
--- /dev/null
+++ b/kernel/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DNOTIFY)		+= dnotify.o
diff --git a/kernel/fs/notify/dnotify/dnotify.c b/kernel/fs/notify/dnotify/dnotify.c
new file mode 100644
index 000000000..44523f4a6
--- /dev/null
+++ b/kernel/fs/notify/dnotify/dnotify.c
@@ -0,0 +1,388 @@
+/*
+ * Directory notifications for Linux.
+ *
+ * Copyright (C) 2000,2001,2002 Stephen Rothwell
+ *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * dnotify was largly rewritten to use the new fsnotify infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/dnotify.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
+
+int dir_notify_enable __read_mostly = 1;
+
+static struct kmem_cache *dnotify_struct_cache __read_mostly;
+static struct kmem_cache *dnotify_mark_cache __read_mostly;
+static struct fsnotify_group *dnotify_group __read_mostly;
+
+/*
+ * dnotify will attach one of these to each inode (i_fsnotify_marks) which
+ * is being watched by dnotify.  If multiple userspace applications are watching
+ * the same directory with dnotify their information is chained in dn
+ */
+struct dnotify_mark {
+	struct fsnotify_mark fsn_mark;
+	struct dnotify_struct *dn;
+};
+
+/*
+ * When a process starts or stops watching an inode the set of events which
+ * dnotify cares about for that inode may change.  This function runs the
+ * list of everything receiving dnotify events about this directory and calculates
+ * the set of all those events.  After it updates what dnotify is interested in
+ * it calls the fsnotify function so it can update the set of all events relevant
+ * to this inode.
+ */
+static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
+{
+	__u32 new_mask, old_mask;
+	struct dnotify_struct *dn;
+	struct dnotify_mark *dn_mark  = container_of(fsn_mark,
+						     struct dnotify_mark,
+						     fsn_mark);
+
+	assert_spin_locked(&fsn_mark->lock);
+
+	old_mask = fsn_mark->mask;
+	new_mask = 0;
+	for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
+		new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
+	fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
+
+	if (old_mask == new_mask)
+		return;
+
+	if (fsn_mark->inode)
+		fsnotify_recalc_inode_mask(fsn_mark->inode);
+}
+
+/*
+ * Mains fsnotify call where events are delivered to dnotify.
+ * Find the dnotify mark on the relevant inode, run the list of dnotify structs
+ * on that mark and determine which of them has expressed interest in receiving
+ * events of this type.  When found send the correct process and signal and
+ * destroy the dnotify struct if it was not registered to receive multiple
+ * events.
+ */
+static int dnotify_handle_event(struct fsnotify_group *group,
+				struct inode *inode,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
+				u32 mask, void *data, int data_type,
+				const unsigned char *file_name, u32 cookie)
+{
+	struct dnotify_mark *dn_mark;
+	struct dnotify_struct *dn;
+	struct dnotify_struct **prev;
+	struct fown_struct *fown;
+	__u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
+
+	/* not a dir, dnotify doesn't care */
+	if (!S_ISDIR(inode->i_mode))
+		return 0;
+
+	BUG_ON(vfsmount_mark);
+
+	dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
+
+	spin_lock(&inode_mark->lock);
+	prev = &dn_mark->dn;
+	while ((dn = *prev) != NULL) {
+		if ((dn->dn_mask & test_mask) == 0) {
+			prev = &dn->dn_next;
+			continue;
+		}
+		fown = &dn->dn_filp->f_owner;
+		send_sigio(fown, dn->dn_fd, POLL_MSG);
+		if (dn->dn_mask & FS_DN_MULTISHOT)
+			prev = &dn->dn_next;
+		else {
+			*prev = dn->dn_next;
+			kmem_cache_free(dnotify_struct_cache, dn);
+			dnotify_recalc_inode_mask(inode_mark);
+		}
+	}
+
+	spin_unlock(&inode_mark->lock);
+
+	return 0;
+}
+
+static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+	struct dnotify_mark *dn_mark = container_of(fsn_mark,
+						    struct dnotify_mark,
+						    fsn_mark);
+
+	BUG_ON(dn_mark->dn);
+
+	kmem_cache_free(dnotify_mark_cache, dn_mark);
+}
+
+static struct fsnotify_ops dnotify_fsnotify_ops = {
+	.handle_event = dnotify_handle_event,
+};
+
+/*
+ * Called every time a file is closed.  Looks first for a dnotify mark on the
+ * inode.  If one is found run all of the ->dn structures attached to that
+ * mark for one relevant to this process closing the file and remove that
+ * dnotify_struct.  If that was the last dnotify_struct also remove the
+ * fsnotify_mark.
+ */
+void dnotify_flush(struct file *filp, fl_owner_t id)
+{
+	struct fsnotify_mark *fsn_mark;
+	struct dnotify_mark *dn_mark;
+	struct dnotify_struct *dn;
+	struct dnotify_struct **prev;
+	struct inode *inode;
+
+	inode = file_inode(filp);
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	if (!fsn_mark)
+		return;
+	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
+
+	mutex_lock(&dnotify_group->mark_mutex);
+
+	spin_lock(&fsn_mark->lock);
+	prev = &dn_mark->dn;
+	while ((dn = *prev) != NULL) {
+		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
+			*prev = dn->dn_next;
+			kmem_cache_free(dnotify_struct_cache, dn);
+			dnotify_recalc_inode_mask(fsn_mark);
+			break;
+		}
+		prev = &dn->dn_next;
+	}
+
+	spin_unlock(&fsn_mark->lock);
+
+	/* nothing else could have found us thanks to the dnotify_groups
+	   mark_mutex */
+	if (dn_mark->dn == NULL)
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
+
+	mutex_unlock(&dnotify_group->mark_mutex);
+
+	fsnotify_put_mark(fsn_mark);
+}
+
+/* this conversion is done only at watch creation */
+static __u32 convert_arg(unsigned long arg)
+{
+	__u32 new_mask = FS_EVENT_ON_CHILD;
+
+	if (arg & DN_MULTISHOT)
+		new_mask |= FS_DN_MULTISHOT;
+	if (arg & DN_DELETE)
+		new_mask |= (FS_DELETE | FS_MOVED_FROM);
+	if (arg & DN_MODIFY)
+		new_mask |= FS_MODIFY;
+	if (arg & DN_ACCESS)
+		new_mask |= FS_ACCESS;
+	if (arg & DN_ATTRIB)
+		new_mask |= FS_ATTRIB;
+	if (arg & DN_RENAME)
+		new_mask |= FS_DN_RENAME;
+	if (arg & DN_CREATE)
+		new_mask |= (FS_CREATE | FS_MOVED_TO);
+
+	return new_mask;
+}
+
+/*
+ * If multiple processes watch the same inode with dnotify there is only one
+ * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
+ * onto that mark.  This function either attaches the new dnotify_struct onto
+ * that list, or it |= the mask onto an existing dnofiy_struct.
+ */
+static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
+		     fl_owner_t id, int fd, struct file *filp, __u32 mask)
+{
+	struct dnotify_struct *odn;
+
+	odn = dn_mark->dn;
+	while (odn != NULL) {
+		/* adding more events to existing dnofiy_struct? */
+		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
+			odn->dn_fd = fd;
+			odn->dn_mask |= mask;
+			return -EEXIST;
+		}
+		odn = odn->dn_next;
+	}
+
+	dn->dn_mask = mask;
+	dn->dn_fd = fd;
+	dn->dn_filp = filp;
+	dn->dn_owner = id;
+	dn->dn_next = dn_mark->dn;
+	dn_mark->dn = dn;
+
+	return 0;
+}
+
+/*
+ * When a process calls fcntl to attach a dnotify watch to a directory it ends
+ * up here.  Allocate both a mark for fsnotify to add and a dnotify_struct to be
+ * attached to the fsnotify_mark.
+ */
+int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+{
+	struct dnotify_mark *new_dn_mark, *dn_mark;
+	struct fsnotify_mark *new_fsn_mark, *fsn_mark;
+	struct dnotify_struct *dn;
+	struct inode *inode;
+	fl_owner_t id = current->files;
+	struct file *f;
+	int destroy = 0, error = 0;
+	__u32 mask;
+
+	/* we use these to tell if we need to kfree */
+	new_fsn_mark = NULL;
+	dn = NULL;
+
+	if (!dir_notify_enable) {
+		error = -EINVAL;
+		goto out_err;
+	}
+
+	/* a 0 mask means we are explicitly removing the watch */
+	if ((arg & ~DN_MULTISHOT) == 0) {
+		dnotify_flush(filp, id);
+		error = 0;
+		goto out_err;
+	}
+
+	/* dnotify only works on directories */
+	inode = file_inode(filp);
+	if (!S_ISDIR(inode->i_mode)) {
+		error = -ENOTDIR;
+		goto out_err;
+	}
+
+	/* expect most fcntl to add new rather than augment old */
+	dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
+	if (!dn) {
+		error = -ENOMEM;
+		goto out_err;
+	}
+
+	/* new fsnotify mark, we expect most fcntl calls to add a new mark */
+	new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
+	if (!new_dn_mark) {
+		error = -ENOMEM;
+		goto out_err;
+	}
+
+	/* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
+	mask = convert_arg(arg);
+
+	/* set up the new_fsn_mark and new_dn_mark */
+	new_fsn_mark = &new_dn_mark->fsn_mark;
+	fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
+	new_fsn_mark->mask = mask;
+	new_dn_mark->dn = NULL;
+
+	/* this is needed to prevent the fcntl/close race described below */
+	mutex_lock(&dnotify_group->mark_mutex);
+
+	/* add the new_fsn_mark or find an old one. */
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	if (fsn_mark) {
+		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
+		spin_lock(&fsn_mark->lock);
+	} else {
+		fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
+					 NULL, 0);
+		spin_lock(&new_fsn_mark->lock);
+		fsn_mark = new_fsn_mark;
+		dn_mark = new_dn_mark;
+		/* we used new_fsn_mark, so don't free it */
+		new_fsn_mark = NULL;
+	}
+
+	rcu_read_lock();
+	f = fcheck(fd);
+	rcu_read_unlock();
+
+	/* if (f != filp) means that we lost a race and another task/thread
+	 * actually closed the fd we are still playing with before we grabbed
+	 * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
+	 * fd is the only time we clean up the marks we need to get our mark
+	 * off the list. */
+	if (f != filp) {
+		/* if we added ourselves, shoot ourselves, it's possible that
+		 * the flush actually did shoot this fsn_mark.  That's fine too
+		 * since multiple calls to destroy_mark is perfectly safe, if
+		 * we found a dn_mark already attached to the inode, just sod
+		 * off silently as the flush at close time dealt with it.
+		 */
+		if (dn_mark == new_dn_mark)
+			destroy = 1;
+		goto out;
+	}
+
+	__f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+
+	error = attach_dn(dn, dn_mark, id, fd, filp, mask);
+	/* !error means that we attached the dn to the dn_mark, so don't free it */
+	if (!error)
+		dn = NULL;
+	/* -EEXIST means that we didn't add this new dn and used an old one.
+	 * that isn't an error (and the unused dn should be freed) */
+	else if (error == -EEXIST)
+		error = 0;
+
+	dnotify_recalc_inode_mask(fsn_mark);
+out:
+	spin_unlock(&fsn_mark->lock);
+
+	if (destroy)
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
+
+	mutex_unlock(&dnotify_group->mark_mutex);
+	fsnotify_put_mark(fsn_mark);
+out_err:
+	if (new_fsn_mark)
+		fsnotify_put_mark(new_fsn_mark);
+	if (dn)
+		kmem_cache_free(dnotify_struct_cache, dn);
+	return error;
+}
+
+static int __init dnotify_init(void)
+{
+	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
+	dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
+
+	dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
+	if (IS_ERR(dnotify_group))
+		panic("unable to allocate fsnotify group for dnotify\n");
+	return 0;
+}
+
+module_init(dnotify_init)
diff --git a/kernel/fs/notify/fanotify/Kconfig b/kernel/fs/notify/fanotify/Kconfig
new file mode 100644
index 000000000..e5f911bd8
--- /dev/null
+++ b/kernel/fs/notify/fanotify/Kconfig
@@ -0,0 +1,26 @@
+config FANOTIFY
+	bool "Filesystem wide access notification"
+	select FSNOTIFY
+	select ANON_INODES
+	default n
+	---help---
+	   Say Y here to enable fanotify support.  fanotify is a file access
+	   notification system which differs from inotify in that it sends
+	   an open file descriptor to the userspace listener along with
+	   the event.
+
+	   If unsure, say Y.
+
+config FANOTIFY_ACCESS_PERMISSIONS
+	bool "fanotify permissions checking"
+	depends on FANOTIFY
+	depends on SECURITY
+	default n
+	---help---
+	   Say Y here is you want fanotify listeners to be able to make permissions
+	   decisions concerning filesystem events.  This is used by some fanotify
+	   listeners which need to scan files before allowing the system access to
+	   use those files.  This is used by some anti-malware vendors and by some
+	   hierarchical storage managent systems.
+
+	   If unsure, say N.
diff --git a/kernel/fs/notify/fanotify/Makefile b/kernel/fs/notify/fanotify/Makefile
new file mode 100644
index 000000000..0999213e7
--- /dev/null
+++ b/kernel/fs/notify/fanotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_FANOTIFY)		+= fanotify.o fanotify_user.o
diff --git a/kernel/fs/notify/fanotify/fanotify.c b/kernel/fs/notify/fanotify/fanotify.c
new file mode 100644
index 000000000..d2f97ecca
--- /dev/null
+++ b/kernel/fs/notify/fanotify/fanotify.c
@@ -0,0 +1,270 @@
+#include <linux/fanotify.h>
+#include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h> /* UINT_MAX */
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include "fanotify.h"
+
+static bool should_merge(struct fsnotify_event *old_fsn,
+			 struct fsnotify_event *new_fsn)
+{
+	struct fanotify_event_info *old, *new;
+
+	pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
+	old = FANOTIFY_E(old_fsn);
+	new = FANOTIFY_E(new_fsn);
+
+	if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
+	    old->path.mnt == new->path.mnt &&
+	    old->path.dentry == new->path.dentry)
+		return true;
+	return false;
+}
+
+/* and the list better be locked by something too! */
+static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
+{
+	struct fsnotify_event *test_event;
+	bool do_merge = false;
+
+	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	/*
+	 * Don't merge a permission event with any other event so that we know
+	 * the event structure we have created in fanotify_handle_event() is the
+	 * one we should check for permission response.
+	 */
+	if (event->mask & FAN_ALL_PERM_EVENTS)
+		return 0;
+#endif
+
+	list_for_each_entry_reverse(test_event, list, list) {
+		if (should_merge(test_event, event)) {
+			do_merge = true;
+			break;
+		}
+	}
+
+	if (!do_merge)
+		return 0;
+
+	test_event->mask |= event->mask;
+	return 1;
+}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static int fanotify_get_response(struct fsnotify_group *group,
+				 struct fanotify_perm_event_info *event)
+{
+	int ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	wait_event(group->fanotify_data.access_waitq, event->response ||
+				atomic_read(&group->fanotify_data.bypass_perm));
+
+	if (!event->response) {	/* bypass_perm set */
+		/*
+		 * Event was canceled because group is being destroyed. Remove
+		 * it from group's event list because we are responsible for
+		 * freeing the permission event.
+		 */
+		fsnotify_remove_event(group, &event->fae.fse);
+		return 0;
+	}
+
+	/* userspace responded, convert to something usable */
+	switch (event->response) {
+	case FAN_ALLOW:
+		ret = 0;
+		break;
+	case FAN_DENY:
+	default:
+		ret = -EPERM;
+	}
+	event->response = 0;
+
+	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
+		 group, event, ret);
+	
+	return ret;
+}
+#endif
+
+static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
+				       struct fsnotify_mark *vfsmnt_mark,
+				       u32 event_mask,
+				       void *data, int data_type)
+{
+	__u32 marks_mask, marks_ignored_mask;
+	struct path *path = data;
+
+	pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
+		 " data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
+		 event_mask, data, data_type);
+
+	/* if we don't have enough info to send an event to userspace say no */
+	if (data_type != FSNOTIFY_EVENT_PATH)
+		return false;
+
+	/* sorry, fanotify only gives a damn about files and dirs */
+	if (!d_is_reg(path->dentry) &&
+	    !d_can_lookup(path->dentry))
+		return false;
+
+	if (inode_mark && vfsmnt_mark) {
+		marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
+		marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
+	} else if (inode_mark) {
+		/*
+		 * if the event is for a child and this inode doesn't care about
+		 * events on the child, don't send it!
+		 */
+		if ((event_mask & FS_EVENT_ON_CHILD) &&
+		    !(inode_mark->mask & FS_EVENT_ON_CHILD))
+			return false;
+		marks_mask = inode_mark->mask;
+		marks_ignored_mask = inode_mark->ignored_mask;
+	} else if (vfsmnt_mark) {
+		marks_mask = vfsmnt_mark->mask;
+		marks_ignored_mask = vfsmnt_mark->ignored_mask;
+	} else {
+		BUG();
+	}
+
+	if (d_is_dir(path->dentry) &&
+	    !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
+		return false;
+
+	if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
+				 ~marks_ignored_mask)
+		return true;
+
+	return false;
+}
+
+struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
+						 struct path *path)
+{
+	struct fanotify_event_info *event;
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (mask & FAN_ALL_PERM_EVENTS) {
+		struct fanotify_perm_event_info *pevent;
+
+		pevent = kmem_cache_alloc(fanotify_perm_event_cachep,
+					  GFP_KERNEL);
+		if (!pevent)
+			return NULL;
+		event = &pevent->fae;
+		pevent->response = 0;
+		goto init;
+	}
+#endif
+	event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
+	if (!event)
+		return NULL;
+init: __maybe_unused
+	fsnotify_init_event(&event->fse, inode, mask);
+	event->tgid = get_pid(task_tgid(current));
+	if (path) {
+		event->path = *path;
+		path_get(&event->path);
+	} else {
+		event->path.mnt = NULL;
+		event->path.dentry = NULL;
+	}
+	return event;
+}
+
+static int fanotify_handle_event(struct fsnotify_group *group,
+				 struct inode *inode,
+				 struct fsnotify_mark *inode_mark,
+				 struct fsnotify_mark *fanotify_mark,
+				 u32 mask, void *data, int data_type,
+				 const unsigned char *file_name, u32 cookie)
+{
+	int ret = 0;
+	struct fanotify_event_info *event;
+	struct fsnotify_event *fsn_event;
+
+	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
+	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
+	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
+	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
+	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+
+	if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
+					data_type))
+		return 0;
+
+	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+		 mask);
+
+	event = fanotify_alloc_event(inode, mask, data);
+	if (unlikely(!event))
+		return -ENOMEM;
+
+	fsn_event = &event->fse;
+	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
+	if (ret) {
+		/* Permission events shouldn't be merged */
+		BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
+		/* Our event wasn't used in the end. Free it. */
+		fsnotify_destroy_event(group, fsn_event);
+
+		return 0;
+	}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (mask & FAN_ALL_PERM_EVENTS) {
+		ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
+		fsnotify_destroy_event(group, fsn_event);
+	}
+#endif
+	return ret;
+}
+
+static void fanotify_free_group_priv(struct fsnotify_group *group)
+{
+	struct user_struct *user;
+
+	user = group->fanotify_data.user;
+	atomic_dec(&user->fanotify_listeners);
+	free_uid(user);
+}
+
+static void fanotify_free_event(struct fsnotify_event *fsn_event)
+{
+	struct fanotify_event_info *event;
+
+	event = FANOTIFY_E(fsn_event);
+	path_put(&event->path);
+	put_pid(event->tgid);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (fsn_event->mask & FAN_ALL_PERM_EVENTS) {
+		kmem_cache_free(fanotify_perm_event_cachep,
+				FANOTIFY_PE(fsn_event));
+		return;
+	}
+#endif
+	kmem_cache_free(fanotify_event_cachep, event);
+}
+
+const struct fsnotify_ops fanotify_fsnotify_ops = {
+	.handle_event = fanotify_handle_event,
+	.free_group_priv = fanotify_free_group_priv,
+	.free_event = fanotify_free_event,
+};
diff --git a/kernel/fs/notify/fanotify/fanotify.h b/kernel/fs/notify/fanotify/fanotify.h
new file mode 100644
index 000000000..2a5fb1411
--- /dev/null
+++ b/kernel/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,50 @@
+#include <linux/fsnotify_backend.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+
+extern struct kmem_cache *fanotify_event_cachep;
+extern struct kmem_cache *fanotify_perm_event_cachep;
+
+/*
+ * Structure for normal fanotify events. It gets allocated in
+ * fanotify_handle_event() and freed when the information is retrieved by
+ * userspace
+ */
+struct fanotify_event_info {
+	struct fsnotify_event fse;
+	/*
+	 * We hold ref to this path so it may be dereferenced at any point
+	 * during this object's lifetime
+	 */
+	struct path path;
+	struct pid *tgid;
+};
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+/*
+ * Structure for permission fanotify events. It gets allocated and freed in
+ * fanotify_handle_event() since we wait there for user response. When the
+ * information is retrieved by userspace the structure is moved from
+ * group->notification_list to group->fanotify_data.access_list to wait for
+ * user response.
+ */
+struct fanotify_perm_event_info {
+	struct fanotify_event_info fae;
+	int response;	/* userspace answer to question */
+	int fd;		/* fd we passed to userspace for this event */
+};
+
+static inline struct fanotify_perm_event_info *
+FANOTIFY_PE(struct fsnotify_event *fse)
+{
+	return container_of(fse, struct fanotify_perm_event_info, fae.fse);
+}
+#endif
+
+static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+{
+	return container_of(fse, struct fanotify_event_info, fse);
+}
+
+struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
+						 struct path *path);
diff --git a/kernel/fs/notify/fanotify/fanotify_user.c b/kernel/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 000000000..cf275500a
--- /dev/null
+++ b/kernel/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,940 @@
+#include <linux/fanotify.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/compat.h>
+
+#include <asm/ioctls.h>
+
+#include "../../mount.h"
+#include "../fdinfo.h"
+#include "fanotify.h"
+
+#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
+#define FANOTIFY_DEFAULT_MAX_MARKS	8192
+#define FANOTIFY_DEFAULT_MAX_LISTENERS	128
+
+/*
+ * All flags that may be specified in parameter event_f_flags of fanotify_init.
+ *
+ * Internal and external open flags are stored together in field f_flags of
+ * struct file. Only external open flags shall be allowed in event_f_flags.
+ * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
+ * excluded.
+ */
+#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
+		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
+		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
+		O_LARGEFILE	| O_NOATIME	)
+
+extern const struct fsnotify_ops fanotify_fsnotify_ops;
+
+static struct kmem_cache *fanotify_mark_cache __read_mostly;
+struct kmem_cache *fanotify_event_cachep __read_mostly;
+struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+
+/*
+ * Get an fsnotify notification event if one exists and is small
+ * enough to fit in "count". Return an error pointer if the count
+ * is not large enough.
+ *
+ * Called with the group->notification_mutex held.
+ */
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+					    size_t count)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+
+	if (fsnotify_notify_queue_is_empty(group))
+		return NULL;
+
+	if (FAN_EVENT_METADATA_LEN > count)
+		return ERR_PTR(-EINVAL);
+
+	/* held the notification_mutex the whole time, so this is the
+	 * same event we peeked above */
+	return fsnotify_remove_first_event(group);
+}
+
+static int create_fd(struct fsnotify_group *group,
+		     struct fanotify_event_info *event,
+		     struct file **file)
+{
+	int client_fd;
+	struct file *new_file;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
+	if (client_fd < 0)
+		return client_fd;
+
+	/*
+	 * we need a new file handle for the userspace program so it can read even if it was
+	 * originally opened O_WRONLY.
+	 */
+	/* it's possible this event was an overflow event.  in that case dentry and mnt
+	 * are NULL;  That's fine, just don't call dentry open */
+	if (event->path.dentry && event->path.mnt)
+		new_file = dentry_open(&event->path,
+				       group->fanotify_data.f_flags | FMODE_NONOTIFY,
+				       current_cred());
+	else
+		new_file = ERR_PTR(-EOVERFLOW);
+	if (IS_ERR(new_file)) {
+		/*
+		 * we still send an event even if we can't open the file.  this
+		 * can happen when say tasks are gone and we try to open their
+		 * /proc files or we try to open a WRONLY file like in sysfs
+		 * we just send the errno to userspace since there isn't much
+		 * else we can do.
+		 */
+		put_unused_fd(client_fd);
+		client_fd = PTR_ERR(new_file);
+	} else {
+		*file = new_file;
+	}
+
+	return client_fd;
+}
+
+static int fill_event_metadata(struct fsnotify_group *group,
+			       struct fanotify_event_metadata *metadata,
+			       struct fsnotify_event *fsn_event,
+			       struct file **file)
+{
+	int ret = 0;
+	struct fanotify_event_info *event;
+
+	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
+		 group, metadata, fsn_event);
+
+	*file = NULL;
+	event = container_of(fsn_event, struct fanotify_event_info, fse);
+	metadata->event_len = FAN_EVENT_METADATA_LEN;
+	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
+	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->reserved = 0;
+	metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
+	metadata->pid = pid_vnr(event->tgid);
+	if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
+		metadata->fd = FAN_NOFD;
+	else {
+		metadata->fd = create_fd(group, event, file);
+		if (metadata->fd < 0)
+			ret = metadata->fd;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static struct fanotify_perm_event_info *dequeue_event(
+				struct fsnotify_group *group, int fd)
+{
+	struct fanotify_perm_event_info *event, *return_e = NULL;
+
+	spin_lock(&group->fanotify_data.access_lock);
+	list_for_each_entry(event, &group->fanotify_data.access_list,
+			    fae.fse.list) {
+		if (event->fd != fd)
+			continue;
+
+		list_del_init(&event->fae.fse.list);
+		return_e = event;
+		break;
+	}
+	spin_unlock(&group->fanotify_data.access_lock);
+
+	pr_debug("%s: found return_re=%p\n", __func__, return_e);
+
+	return return_e;
+}
+
+static int process_access_response(struct fsnotify_group *group,
+				   struct fanotify_response *response_struct)
+{
+	struct fanotify_perm_event_info *event;
+	int fd = response_struct->fd;
+	int response = response_struct->response;
+
+	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
+		 fd, response);
+	/*
+	 * make sure the response is valid, if invalid we do nothing and either
+	 * userspace can send a valid response or we will clean it up after the
+	 * timeout
+	 */
+	switch (response) {
+	case FAN_ALLOW:
+	case FAN_DENY:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (fd < 0)
+		return -EINVAL;
+
+	event = dequeue_event(group, fd);
+	if (!event)
+		return -ENOENT;
+
+	event->response = response;
+	wake_up(&group->fanotify_data.access_waitq);
+
+	return 0;
+}
+#endif
+
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+				  struct fsnotify_event *event,
+				  char __user *buf)
+{
+	struct fanotify_event_metadata fanotify_event_metadata;
+	struct file *f;
+	int fd, ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
+	if (ret < 0)
+		return ret;
+
+	fd = fanotify_event_metadata.fd;
+	ret = -EFAULT;
+	if (copy_to_user(buf, &fanotify_event_metadata,
+			 fanotify_event_metadata.event_len))
+		goto out_close_fd;
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (event->mask & FAN_ALL_PERM_EVENTS)
+		FANOTIFY_PE(event)->fd = fd;
+#endif
+
+	if (fd != FAN_NOFD)
+		fd_install(fd, f);
+	return fanotify_event_metadata.event_len;
+
+out_close_fd:
+	if (fd != FAN_NOFD) {
+		put_unused_fd(fd);
+		fput(f);
+	}
+	return ret;
+}
+
+/* intofiy userspace file descriptor functions */
+static unsigned int fanotify_poll(struct file *file, poll_table *wait)
+{
+	struct fsnotify_group *group = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &group->notification_waitq, wait);
+	mutex_lock(&group->notification_mutex);
+	if (!fsnotify_notify_queue_is_empty(group))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&group->notification_mutex);
+
+	return ret;
+}
+
+static ssize_t fanotify_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *kevent;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	start = buf;
+	group = file->private_data;
+
+	pr_debug("%s: group=%p\n", __func__, group);
+
+	add_wait_queue(&group->notification_waitq, &wait);
+	while (1) {
+		mutex_lock(&group->notification_mutex);
+		kevent = get_one_event(group, count);
+		mutex_unlock(&group->notification_mutex);
+
+		if (IS_ERR(kevent)) {
+			ret = PTR_ERR(kevent);
+			break;
+		}
+
+		if (!kevent) {
+			ret = -EAGAIN;
+			if (file->f_flags & O_NONBLOCK)
+				break;
+
+			ret = -ERESTARTSYS;
+			if (signal_pending(current))
+				break;
+
+			if (start != buf)
+				break;
+
+			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+			continue;
+		}
+
+		ret = copy_event_to_user(group, kevent, buf);
+		/*
+		 * Permission events get queued to wait for response.  Other
+		 * events can be destroyed now.
+		 */
+		if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
+			fsnotify_destroy_event(group, kevent);
+			if (ret < 0)
+				break;
+		} else {
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+			if (ret < 0) {
+				FANOTIFY_PE(kevent)->response = FAN_DENY;
+				wake_up(&group->fanotify_data.access_waitq);
+				break;
+			}
+			spin_lock(&group->fanotify_data.access_lock);
+			list_add_tail(&kevent->list,
+				      &group->fanotify_data.access_list);
+			spin_unlock(&group->fanotify_data.access_lock);
+#endif
+		}
+		buf += ret;
+		count -= ret;
+	}
+	remove_wait_queue(&group->notification_waitq, &wait);
+
+	if (start != buf && ret != -EFAULT)
+		ret = buf - start;
+	return ret;
+}
+
+static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
+{
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	struct fanotify_response response = { .fd = -1, .response = -1 };
+	struct fsnotify_group *group;
+	int ret;
+
+	group = file->private_data;
+
+	if (count > sizeof(response))
+		count = sizeof(response);
+
+	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
+
+	if (copy_from_user(&response, buf, count))
+		return -EFAULT;
+
+	ret = process_access_response(group, &response);
+	if (ret < 0)
+		count = ret;
+
+	return count;
+#else
+	return -EINVAL;
+#endif
+}
+
+static int fanotify_release(struct inode *ignored, struct file *file)
+{
+	struct fsnotify_group *group = file->private_data;
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	struct fanotify_perm_event_info *event, *next;
+
+	/*
+	 * There may be still new events arriving in the notification queue
+	 * but since userspace cannot use fanotify fd anymore, no event can
+	 * enter or leave access_list by now.
+	 */
+	spin_lock(&group->fanotify_data.access_lock);
+
+	atomic_inc(&group->fanotify_data.bypass_perm);
+
+	list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
+				 fae.fse.list) {
+		pr_debug("%s: found group=%p event=%p\n", __func__, group,
+			 event);
+
+		list_del_init(&event->fae.fse.list);
+		event->response = FAN_ALLOW;
+	}
+	spin_unlock(&group->fanotify_data.access_lock);
+
+	/*
+	 * Since bypass_perm is set, newly queued events will not wait for
+	 * access response. Wake up the already sleeping ones now.
+	 * synchronize_srcu() in fsnotify_destroy_group() will wait for all
+	 * processes sleeping in fanotify_handle_event() waiting for access
+	 * response and thus also for all permission events to be freed.
+	 */
+	wake_up(&group->fanotify_data.access_waitq);
+#endif
+
+	/* matches the fanotify_init->fsnotify_alloc_group */
+	fsnotify_destroy_group(group);
+
+	return 0;
+}
+
+static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *fsn_event;
+	void __user *p;
+	int ret = -ENOTTY;
+	size_t send_len = 0;
+
+	group = file->private_data;
+
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		mutex_lock(&group->notification_mutex);
+		list_for_each_entry(fsn_event, &group->notification_list, list)
+			send_len += FAN_EVENT_METADATA_LEN;
+		mutex_unlock(&group->notification_mutex);
+		ret = put_user(send_len, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations fanotify_fops = {
+	.show_fdinfo	= fanotify_show_fdinfo,
+	.poll		= fanotify_poll,
+	.read		= fanotify_read,
+	.write		= fanotify_write,
+	.fasync		= NULL,
+	.release	= fanotify_release,
+	.unlocked_ioctl	= fanotify_ioctl,
+	.compat_ioctl	= fanotify_ioctl,
+	.llseek		= noop_llseek,
+};
+
+static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+	kmem_cache_free(fanotify_mark_cache, fsn_mark);
+}
+
+static int fanotify_find_path(int dfd, const char __user *filename,
+			      struct path *path, unsigned int flags)
+{
+	int ret;
+
+	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
+		 dfd, filename, flags);
+
+	if (filename == NULL) {
+		struct fd f = fdget(dfd);
+
+		ret = -EBADF;
+		if (!f.file)
+			goto out;
+
+		ret = -ENOTDIR;
+		if ((flags & FAN_MARK_ONLYDIR) &&
+		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
+			fdput(f);
+			goto out;
+		}
+
+		*path = f.file->f_path;
+		path_get(path);
+		fdput(f);
+	} else {
+		unsigned int lookup_flags = 0;
+
+		if (!(flags & FAN_MARK_DONT_FOLLOW))
+			lookup_flags |= LOOKUP_FOLLOW;
+		if (flags & FAN_MARK_ONLYDIR)
+			lookup_flags |= LOOKUP_DIRECTORY;
+
+		ret = user_path_at(dfd, filename, lookup_flags, path);
+		if (ret)
+			goto out;
+	}
+
+	/* you can only watch an inode if you have read permissions on it */
+	ret = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (ret)
+		path_put(path);
+out:
+	return ret;
+}
+
+static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
+					    __u32 mask,
+					    unsigned int flags,
+					    int *destroy)
+{
+	__u32 oldmask = 0;
+
+	spin_lock(&fsn_mark->lock);
+	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		__u32 tmask = fsn_mark->mask & ~mask;
+
+		if (flags & FAN_MARK_ONDIR)
+			tmask &= ~FAN_ONDIR;
+
+		oldmask = fsn_mark->mask;
+		fsnotify_set_mark_mask_locked(fsn_mark, tmask);
+	} else {
+		__u32 tmask = fsn_mark->ignored_mask & ~mask;
+		if (flags & FAN_MARK_ONDIR)
+			tmask &= ~FAN_ONDIR;
+
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+	}
+	*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
+	spin_unlock(&fsn_mark->lock);
+
+	return mask & oldmask;
+}
+
+static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
+					 struct vfsmount *mnt, __u32 mask,
+					 unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark = NULL;
+	__u32 removed;
+	int destroy_mark;
+
+	mutex_lock(&group->mark_mutex);
+	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
+		return -ENOENT;
+	}
+
+	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
+						 &destroy_mark);
+	if (destroy_mark)
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
+
+	fsnotify_put_mark(fsn_mark);
+	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
+		fsnotify_recalc_vfsmount_mask(mnt);
+
+	return 0;
+}
+
+static int fanotify_remove_inode_mark(struct fsnotify_group *group,
+				      struct inode *inode, __u32 mask,
+				      unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark = NULL;
+	__u32 removed;
+	int destroy_mark;
+
+	mutex_lock(&group->mark_mutex);
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
+		return -ENOENT;
+	}
+
+	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
+						 &destroy_mark);
+	if (destroy_mark)
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
+
+	/* matches the fsnotify_find_inode_mark() */
+	fsnotify_put_mark(fsn_mark);
+	if (removed & inode->i_fsnotify_mask)
+		fsnotify_recalc_inode_mask(inode);
+
+	return 0;
+}
+
+static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
+				       __u32 mask,
+				       unsigned int flags)
+{
+	__u32 oldmask = -1;
+
+	spin_lock(&fsn_mark->lock);
+	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		__u32 tmask = fsn_mark->mask | mask;
+
+		if (flags & FAN_MARK_ONDIR)
+			tmask |= FAN_ONDIR;
+
+		oldmask = fsn_mark->mask;
+		fsnotify_set_mark_mask_locked(fsn_mark, tmask);
+	} else {
+		__u32 tmask = fsn_mark->ignored_mask | mask;
+		if (flags & FAN_MARK_ONDIR)
+			tmask |= FAN_ONDIR;
+
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
+			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+	}
+	spin_unlock(&fsn_mark->lock);
+
+	return mask & ~oldmask;
+}
+
+static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+						   struct inode *inode,
+						   struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	int ret;
+
+	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+		return ERR_PTR(-ENOSPC);
+
+	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+	if (!mark)
+		return ERR_PTR(-ENOMEM);
+
+	fsnotify_init_mark(mark, fanotify_free_mark);
+	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+	if (ret) {
+		fsnotify_put_mark(mark);
+		return ERR_PTR(ret);
+	}
+
+	return mark;
+}
+
+
+static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
+				      struct vfsmount *mnt, __u32 mask,
+				      unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark;
+	__u32 added;
+
+	mutex_lock(&group->mark_mutex);
+	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	if (!fsn_mark) {
+		fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
+		if (IS_ERR(fsn_mark)) {
+			mutex_unlock(&group->mark_mutex);
+			return PTR_ERR(fsn_mark);
+		}
+	}
+	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
+
+	if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
+		fsnotify_recalc_vfsmount_mask(mnt);
+
+	fsnotify_put_mark(fsn_mark);
+	return 0;
+}
+
+static int fanotify_add_inode_mark(struct fsnotify_group *group,
+				   struct inode *inode, __u32 mask,
+				   unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark;
+	__u32 added;
+
+	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
+
+	/*
+	 * If some other task has this inode open for write we should not add
+	 * an ignored mark, unless that ignored mark is supposed to survive
+	 * modification changes anyway.
+	 */
+	if ((flags & FAN_MARK_IGNORED_MASK) &&
+	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+	    (atomic_read(&inode->i_writecount) > 0))
+		return 0;
+
+	mutex_lock(&group->mark_mutex);
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	if (!fsn_mark) {
+		fsn_mark = fanotify_add_new_mark(group, inode, NULL);
+		if (IS_ERR(fsn_mark)) {
+			mutex_unlock(&group->mark_mutex);
+			return PTR_ERR(fsn_mark);
+		}
+	}
+	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
+
+	if (added & ~inode->i_fsnotify_mask)
+		fsnotify_recalc_inode_mask(inode);
+
+	fsnotify_put_mark(fsn_mark);
+	return 0;
+}
+
+/* fanotify syscalls */
+SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
+{
+	struct fsnotify_group *group;
+	int f_flags, fd;
+	struct user_struct *user;
+	struct fanotify_event_info *oevent;
+
+	pr_debug("%s: flags=%d event_f_flags=%d\n",
+		__func__, flags, event_f_flags);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (flags & ~FAN_ALL_INIT_FLAGS)
+		return -EINVAL;
+
+	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
+		return -EINVAL;
+
+	switch (event_f_flags & O_ACCMODE) {
+	case O_RDONLY:
+	case O_RDWR:
+	case O_WRONLY:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	user = get_current_user();
+	if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
+		free_uid(user);
+		return -EMFILE;
+	}
+
+	f_flags = O_RDWR | FMODE_NONOTIFY;
+	if (flags & FAN_CLOEXEC)
+		f_flags |= O_CLOEXEC;
+	if (flags & FAN_NONBLOCK)
+		f_flags |= O_NONBLOCK;
+
+	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
+	group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
+	if (IS_ERR(group)) {
+		free_uid(user);
+		return PTR_ERR(group);
+	}
+
+	group->fanotify_data.user = user;
+	atomic_inc(&user->fanotify_listeners);
+
+	oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL);
+	if (unlikely(!oevent)) {
+		fd = -ENOMEM;
+		goto out_destroy_group;
+	}
+	group->overflow_event = &oevent->fse;
+
+	if (force_o_largefile())
+		event_f_flags |= O_LARGEFILE;
+	group->fanotify_data.f_flags = event_f_flags;
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	spin_lock_init(&group->fanotify_data.access_lock);
+	init_waitqueue_head(&group->fanotify_data.access_waitq);
+	INIT_LIST_HEAD(&group->fanotify_data.access_list);
+	atomic_set(&group->fanotify_data.bypass_perm, 0);
+#endif
+	switch (flags & FAN_ALL_CLASS_BITS) {
+	case FAN_CLASS_NOTIF:
+		group->priority = FS_PRIO_0;
+		break;
+	case FAN_CLASS_CONTENT:
+		group->priority = FS_PRIO_1;
+		break;
+	case FAN_CLASS_PRE_CONTENT:
+		group->priority = FS_PRIO_2;
+		break;
+	default:
+		fd = -EINVAL;
+		goto out_destroy_group;
+	}
+
+	if (flags & FAN_UNLIMITED_QUEUE) {
+		fd = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out_destroy_group;
+		group->max_events = UINT_MAX;
+	} else {
+		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+	}
+
+	if (flags & FAN_UNLIMITED_MARKS) {
+		fd = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out_destroy_group;
+		group->fanotify_data.max_marks = UINT_MAX;
+	} else {
+		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
+	}
+
+	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
+	if (fd < 0)
+		goto out_destroy_group;
+
+	return fd;
+
+out_destroy_group:
+	fsnotify_destroy_group(group);
+	return fd;
+}
+
+SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
+			      __u64, mask, int, dfd,
+			      const char  __user *, pathname)
+{
+	struct inode *inode = NULL;
+	struct vfsmount *mnt = NULL;
+	struct fsnotify_group *group;
+	struct fd f;
+	struct path path;
+	int ret;
+
+	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
+		 __func__, fanotify_fd, flags, dfd, pathname, mask);
+
+	/* we only use the lower 32 bits as of right now. */
+	if (mask & ((__u64)0xffffffff << 32))
+		return -EINVAL;
+
+	if (flags & ~FAN_ALL_MARK_FLAGS)
+		return -EINVAL;
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+	case FAN_MARK_ADD:		/* fallthrough */
+	case FAN_MARK_REMOVE:
+		if (!mask)
+			return -EINVAL;
+		break;
+	case FAN_MARK_FLUSH:
+		if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH))
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (mask & FAN_ONDIR) {
+		flags |= FAN_MARK_ONDIR;
+		mask &= ~FAN_ONDIR;
+	}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
+#else
+	if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
+#endif
+		return -EINVAL;
+
+	f = fdget(fanotify_fd);
+	if (unlikely(!f.file))
+		return -EBADF;
+
+	/* verify that this is indeed an fanotify instance */
+	ret = -EINVAL;
+	if (unlikely(f.file->f_op != &fanotify_fops))
+		goto fput_and_out;
+	group = f.file->private_data;
+
+	/*
+	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
+	 * allowed to set permissions events.
+	 */
+	ret = -EINVAL;
+	if (mask & FAN_ALL_PERM_EVENTS &&
+	    group->priority == FS_PRIO_0)
+		goto fput_and_out;
+
+	if (flags & FAN_MARK_FLUSH) {
+		ret = 0;
+		if (flags & FAN_MARK_MOUNT)
+			fsnotify_clear_vfsmount_marks_by_group(group);
+		else
+			fsnotify_clear_inode_marks_by_group(group);
+		goto fput_and_out;
+	}
+
+	ret = fanotify_find_path(dfd, pathname, &path, flags);
+	if (ret)
+		goto fput_and_out;
+
+	/* inode held in place by reference to path; group by fget on fd */
+	if (!(flags & FAN_MARK_MOUNT))
+		inode = path.dentry->d_inode;
+	else
+		mnt = path.mnt;
+
+	/* create/update an inode mark */
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
+	case FAN_MARK_ADD:
+		if (flags & FAN_MARK_MOUNT)
+			ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+		else
+			ret = fanotify_add_inode_mark(group, inode, mask, flags);
+		break;
+	case FAN_MARK_REMOVE:
+		if (flags & FAN_MARK_MOUNT)
+			ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+		else
+			ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	path_put(&path);
+fput_and_out:
+	fdput(f);
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(fanotify_mark,
+				int, fanotify_fd, unsigned int, flags,
+				__u32, mask0, __u32, mask1, int, dfd,
+				const char  __user *, pathname)
+{
+	return sys_fanotify_mark(fanotify_fd, flags,
+#ifdef __BIG_ENDIAN
+				((__u64)mask0 << 32) | mask1,
+#else
+				((__u64)mask1 << 32) | mask0,
+#endif
+				 dfd, pathname);
+}
+#endif
+
+/*
+ * fanotify_user_setup - Our initialization function.  Note that we cannot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init fanotify_user_setup(void)
+{
+	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+	fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info,
+						SLAB_PANIC);
+#endif
+
+	return 0;
+}
+device_initcall(fanotify_user_setup);
diff --git a/kernel/fs/notify/fdinfo.c b/kernel/fs/notify/fdinfo.c
new file mode 100644
index 000000000..58b7cdb63
--- /dev/null
+++ b/kernel/fs/notify/fdinfo.c
@@ -0,0 +1,165 @@
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/inotify.h>
+#include <linux/fanotify.h>
+#include <linux/kernel.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/exportfs.h>
+
+#include "inotify/inotify.h"
+#include "../fs/mount.h"
+
+#if defined(CONFIG_PROC_FS)
+
+#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY)
+
+static void show_fdinfo(struct seq_file *m, struct file *f,
+			void (*show)(struct seq_file *m,
+				     struct fsnotify_mark *mark))
+{
+	struct fsnotify_group *group = f->private_data;
+	struct fsnotify_mark *mark;
+
+	mutex_lock(&group->mark_mutex);
+	list_for_each_entry(mark, &group->marks_list, g_list) {
+		show(m, mark);
+		if (seq_has_overflowed(m))
+			break;
+	}
+	mutex_unlock(&group->mark_mutex);
+}
+
+#if defined(CONFIG_EXPORTFS)
+static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
+{
+	struct {
+		struct file_handle handle;
+		u8 pad[MAX_HANDLE_SZ];
+	} f;
+	int size, ret, i;
+
+	f.handle.handle_bytes = sizeof(f.pad);
+	size = f.handle.handle_bytes >> 2;
+
+	ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
+	if ((ret == FILEID_INVALID) || (ret < 0)) {
+		WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
+		return;
+	}
+
+	f.handle.handle_type = ret;
+	f.handle.handle_bytes = size * sizeof(u32);
+
+	seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
+		   f.handle.handle_bytes, f.handle.handle_type);
+
+	for (i = 0; i < f.handle.handle_bytes; i++)
+		seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
+}
+#else
+static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
+{
+}
+#endif
+
+#ifdef CONFIG_INOTIFY_USER
+
+static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
+{
+	struct inotify_inode_mark *inode_mark;
+	struct inode *inode;
+
+	if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
+		return;
+
+	inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
+	inode = igrab(mark->inode);
+	if (inode) {
+		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
+			   inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
+			   mark->mask, mark->ignored_mask);
+		show_mark_fhandle(m, inode);
+		seq_putc(m, '\n');
+		iput(inode);
+	}
+}
+
+void inotify_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	show_fdinfo(m, f, inotify_fdinfo);
+}
+
+#endif /* CONFIG_INOTIFY_USER */
+
+#ifdef CONFIG_FANOTIFY
+
+static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
+{
+	unsigned int mflags = 0;
+	struct inode *inode;
+
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
+		return;
+
+	if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+		mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
+
+	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
+		inode = igrab(mark->inode);
+		if (!inode)
+			return;
+		seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
+			   inode->i_ino, inode->i_sb->s_dev,
+			   mflags, mark->mask, mark->ignored_mask);
+		show_mark_fhandle(m, inode);
+		seq_putc(m, '\n');
+		iput(inode);
+	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
+		struct mount *mnt = real_mount(mark->mnt);
+
+		seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
+			   mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
+	}
+}
+
+void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct fsnotify_group *group = f->private_data;
+	unsigned int flags = 0;
+
+	switch (group->priority) {
+	case FS_PRIO_0:
+		flags |= FAN_CLASS_NOTIF;
+		break;
+	case FS_PRIO_1:
+		flags |= FAN_CLASS_CONTENT;
+		break;
+	case FS_PRIO_2:
+		flags |= FAN_CLASS_PRE_CONTENT;
+		break;
+	}
+
+	if (group->max_events == UINT_MAX)
+		flags |= FAN_UNLIMITED_QUEUE;
+
+	if (group->fanotify_data.max_marks == UINT_MAX)
+		flags |= FAN_UNLIMITED_MARKS;
+
+	seq_printf(m, "fanotify flags:%x event-flags:%x\n",
+		   flags, group->fanotify_data.f_flags);
+
+	show_fdinfo(m, f, fanotify_fdinfo);
+}
+
+#endif /* CONFIG_FANOTIFY */
+
+#endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/fs/notify/fdinfo.h b/kernel/fs/notify/fdinfo.h
new file mode 100644
index 000000000..9664c4904
--- /dev/null
+++ b/kernel/fs/notify/fdinfo.h
@@ -0,0 +1,27 @@
+#ifndef __FSNOTIFY_FDINFO_H__
+#define __FSNOTIFY_FDINFO_H__
+
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+
+struct seq_file;
+struct file;
+
+#ifdef CONFIG_PROC_FS
+
+#ifdef CONFIG_INOTIFY_USER
+void inotify_show_fdinfo(struct seq_file *m, struct file *f);
+#endif
+
+#ifdef CONFIG_FANOTIFY
+void fanotify_show_fdinfo(struct seq_file *m, struct file *f);
+#endif
+
+#else /* CONFIG_PROC_FS */
+
+#define inotify_show_fdinfo	NULL
+#define fanotify_show_fdinfo	NULL
+
+#endif /* CONFIG_PROC_FS */
+
+#endif /* __FSNOTIFY_FDINFO_H__ */
diff --git a/kernel/fs/notify/fsnotify.c b/kernel/fs/notify/fsnotify.c
new file mode 100644
index 000000000..dd3fb0b17
--- /dev/null
+++ b/kernel/fs/notify/fsnotify.c
@@ -0,0 +1,299 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/srcu.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+#include "../mount.h"
+
+/*
+ * Clear all of the marks on an inode when it is being evicted from core
+ */
+void __fsnotify_inode_delete(struct inode *inode)
+{
+	fsnotify_clear_marks_by_inode(inode);
+}
+EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
+
+void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
+{
+	fsnotify_clear_marks_by_mount(mnt);
+}
+
+/*
+ * Given an inode, first check if we care what happens to our children.  Inotify
+ * and dnotify both tell their parents about events.  If we care about any event
+ * on a child we run all of our children and set a dentry flag saying that the
+ * parent cares.  Thus when an event happens on a child it can quickly tell if
+ * if there is a need to find a parent and send the event to the parent.
+ */
+void __fsnotify_update_child_dentry_flags(struct inode *inode)
+{
+	struct dentry *alias;
+	int watched;
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	/* determine if the children should tell inode about their events */
+	watched = fsnotify_inode_watches_children(inode);
+
+	spin_lock(&inode->i_lock);
+	/* run all of the dentries associated with this inode.  Since this is a
+	 * directory, there damn well better only be one item on this list */
+	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+		struct dentry *child;
+
+		/* run all of the children of the original inode and fix their
+		 * d_flags to indicate parental interest (their parent is the
+		 * original inode) */
+		spin_lock(&alias->d_lock);
+		list_for_each_entry(child, &alias->d_subdirs, d_child) {
+			if (!child->d_inode)
+				continue;
+
+			spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+			if (watched)
+				child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
+			else
+				child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+			spin_unlock(&child->d_lock);
+		}
+		spin_unlock(&alias->d_lock);
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+/* Notify this dentry's parent about a child's events. */
+int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+{
+	struct dentry *parent;
+	struct inode *p_inode;
+	int ret = 0;
+
+	if (!dentry)
+		dentry = path->dentry;
+
+	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
+		return 0;
+
+	parent = dget_parent(dentry);
+	p_inode = parent->d_inode;
+
+	if (unlikely(!fsnotify_inode_watches_children(p_inode)))
+		__fsnotify_update_child_dentry_flags(p_inode);
+	else if (p_inode->i_fsnotify_mask & mask) {
+		/* we are notifying a parent so come up with the new mask which
+		 * specifies these are events which came from a child. */
+		mask |= FS_EVENT_ON_CHILD;
+
+		if (path)
+			ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
+				       dentry->d_name.name, 0);
+		else
+			ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+				       dentry->d_name.name, 0);
+	}
+
+	dput(parent);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__fsnotify_parent);
+
+static int send_to_group(struct inode *to_tell,
+			 struct fsnotify_mark *inode_mark,
+			 struct fsnotify_mark *vfsmount_mark,
+			 __u32 mask, void *data,
+			 int data_is, u32 cookie,
+			 const unsigned char *file_name)
+{
+	struct fsnotify_group *group = NULL;
+	__u32 inode_test_mask = 0;
+	__u32 vfsmount_test_mask = 0;
+
+	if (unlikely(!inode_mark && !vfsmount_mark)) {
+		BUG();
+		return 0;
+	}
+
+	/* clear ignored on inode modification */
+	if (mask & FS_MODIFY) {
+		if (inode_mark &&
+		    !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+			inode_mark->ignored_mask = 0;
+		if (vfsmount_mark &&
+		    !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+			vfsmount_mark->ignored_mask = 0;
+	}
+
+	/* does the inode mark tell us to do something? */
+	if (inode_mark) {
+		group = inode_mark->group;
+		inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+		inode_test_mask &= inode_mark->mask;
+		inode_test_mask &= ~inode_mark->ignored_mask;
+	}
+
+	/* does the vfsmount_mark tell us to do something? */
+	if (vfsmount_mark) {
+		vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+		group = vfsmount_mark->group;
+		vfsmount_test_mask &= vfsmount_mark->mask;
+		vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
+		if (inode_mark)
+			vfsmount_test_mask &= ~inode_mark->ignored_mask;
+	}
+
+	pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
+		 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+		 " data=%p data_is=%d cookie=%d\n",
+		 __func__, group, to_tell, mask, inode_mark,
+		 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+		 data_is, cookie);
+
+	if (!inode_test_mask && !vfsmount_test_mask)
+		return 0;
+
+	return group->ops->handle_event(group, to_tell, inode_mark,
+					vfsmount_mark, mask, data, data_is,
+					file_name, cookie);
+}
+
+/*
+ * This is the main call to fsnotify.  The VFS calls into hook specific functions
+ * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
+ * out to all of the registered fsnotify_group.  Those groups can then use the
+ * notification event in whatever means they feel necessary.
+ */
+int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+	     const unsigned char *file_name, u32 cookie)
+{
+	struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
+	struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
+	struct fsnotify_group *inode_group, *vfsmount_group;
+	struct mount *mnt;
+	int idx, ret = 0;
+	/* global tests shouldn't care about events on child only the specific event */
+	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+
+	if (data_is == FSNOTIFY_EVENT_PATH)
+		mnt = real_mount(((struct path *)data)->mnt);
+	else
+		mnt = NULL;
+
+	/*
+	 * if this is a modify event we may need to clear the ignored masks
+	 * otherwise return if neither the inode nor the vfsmount care about
+	 * this type of event.
+	 */
+	if (!(mask & FS_MODIFY) &&
+	    !(test_mask & to_tell->i_fsnotify_mask) &&
+	    !(mnt && test_mask & mnt->mnt_fsnotify_mask))
+		return 0;
+
+	idx = srcu_read_lock(&fsnotify_mark_srcu);
+
+	if ((mask & FS_MODIFY) ||
+	    (test_mask & to_tell->i_fsnotify_mask))
+		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+					      &fsnotify_mark_srcu);
+
+	if (mnt && ((mask & FS_MODIFY) ||
+		    (test_mask & mnt->mnt_fsnotify_mask))) {
+		vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+						 &fsnotify_mark_srcu);
+		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+					      &fsnotify_mark_srcu);
+	}
+
+	/*
+	 * We need to merge inode & vfsmount mark lists so that inode mark
+	 * ignore masks are properly reflected for mount mark notifications.
+	 * That's why this traversal is so complicated...
+	 */
+	while (inode_node || vfsmount_node) {
+		inode_group = NULL;
+		inode_mark = NULL;
+		vfsmount_group = NULL;
+		vfsmount_mark = NULL;
+
+		if (inode_node) {
+			inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
+						 struct fsnotify_mark, obj_list);
+			inode_group = inode_mark->group;
+		}
+
+		if (vfsmount_node) {
+			vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
+						    struct fsnotify_mark, obj_list);
+			vfsmount_group = vfsmount_mark->group;
+		}
+
+		if (inode_group && vfsmount_group) {
+			int cmp = fsnotify_compare_groups(inode_group,
+							  vfsmount_group);
+			if (cmp > 0) {
+				inode_group = NULL;
+				inode_mark = NULL;
+			} else if (cmp < 0) {
+				vfsmount_group = NULL;
+				vfsmount_mark = NULL;
+			}
+		}
+		ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
+				    data, data_is, cookie, file_name);
+
+		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
+			goto out;
+
+		if (inode_group)
+			inode_node = srcu_dereference(inode_node->next,
+						      &fsnotify_mark_srcu);
+		if (vfsmount_group)
+			vfsmount_node = srcu_dereference(vfsmount_node->next,
+							 &fsnotify_mark_srcu);
+	}
+	ret = 0;
+out:
+	srcu_read_unlock(&fsnotify_mark_srcu, idx);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(fsnotify);
+
+static __init int fsnotify_init(void)
+{
+	int ret;
+
+	BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
+
+	ret = init_srcu_struct(&fsnotify_mark_srcu);
+	if (ret)
+		panic("initializing fsnotify_mark_srcu");
+
+	return 0;
+}
+core_initcall(fsnotify_init);
diff --git a/kernel/fs/notify/fsnotify.h b/kernel/fs/notify/fsnotify.h
new file mode 100644
index 000000000..13a00be51
--- /dev/null
+++ b/kernel/fs/notify/fsnotify.h
@@ -0,0 +1,60 @@
+#ifndef __FS_NOTIFY_FSNOTIFY_H_
+#define __FS_NOTIFY_FSNOTIFY_H_
+
+#include <linux/list.h>
+#include <linux/fsnotify.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+
+/* destroy all events sitting in this groups notification queue */
+extern void fsnotify_flush_notify(struct fsnotify_group *group);
+
+/* protects reads of inode and vfsmount marks list */
+extern struct srcu_struct fsnotify_mark_srcu;
+
+/* Calculate mask of events for a list of marks */
+extern u32 fsnotify_recalc_mask(struct hlist_head *head);
+
+/* compare two groups for sorting of marks lists */
+extern int fsnotify_compare_groups(struct fsnotify_group *a,
+				   struct fsnotify_group *b);
+
+extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
+						__u32 mask);
+/* Add mark to a proper place in mark list */
+extern int fsnotify_add_mark_list(struct hlist_head *head,
+				  struct fsnotify_mark *mark,
+				  int allow_dups);
+/* add a mark to an inode */
+extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
+				   struct fsnotify_group *group, struct inode *inode,
+				   int allow_dups);
+/* add a mark to a vfsmount */
+extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
+				      struct fsnotify_group *group, struct vfsmount *mnt,
+				      int allow_dups);
+
+/* vfsmount specific destruction of a mark */
+extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
+/* inode specific destruction of a mark */
+extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
+/* Destroy all marks in the given list */
+extern void fsnotify_destroy_marks(struct list_head *to_free);
+/* Find mark belonging to given group in the list of marks */
+extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
+						struct fsnotify_group *group);
+/* run the list of all marks associated with inode and flag them to be freed */
+extern void fsnotify_clear_marks_by_inode(struct inode *inode);
+/* run the list of all marks associated with vfsmount and flag them to be freed */
+extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
+/*
+ * update the dentry->d_flags of all of inode's children to indicate if inode cares
+ * about events that happen to its children.
+ */
+extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
+
+/* allocate and destroy and event holder to attach events to notification/access queues */
+extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
+extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
+
+#endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/kernel/fs/notify/group.c b/kernel/fs/notify/group.c
new file mode 100644
index 000000000..d16b62cb2
--- /dev/null
+++ b/kernel/fs/notify/group.c
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/rculist.h>
+#include <linux/wait.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+#include <linux/atomic.h>
+
+/*
+ * Final freeing of a group
+ */
+static void fsnotify_final_destroy_group(struct fsnotify_group *group)
+{
+	if (group->ops->free_group_priv)
+		group->ops->free_group_priv(group);
+
+	kfree(group);
+}
+
+/*
+ * Trying to get rid of a group. Remove all marks, flush all events and release
+ * the group reference.
+ * Note that another thread calling fsnotify_clear_marks_by_group() may still
+ * hold a ref to the group.
+ */
+void fsnotify_destroy_group(struct fsnotify_group *group)
+{
+	/* clear all inode marks for this group */
+	fsnotify_clear_marks_by_group(group);
+
+	synchronize_srcu(&fsnotify_mark_srcu);
+
+	/* clear the notification queue of all events */
+	fsnotify_flush_notify(group);
+
+	/*
+	 * Destroy overflow event (we cannot use fsnotify_destroy_event() as
+	 * that deliberately ignores overflow events.
+	 */
+	if (group->overflow_event)
+		group->ops->free_event(group->overflow_event);
+
+	fsnotify_put_group(group);
+}
+
+/*
+ * Get reference to a group.
+ */
+void fsnotify_get_group(struct fsnotify_group *group)
+{
+	atomic_inc(&group->refcnt);
+}
+
+/*
+ * Drop a reference to a group.  Free it if it's through.
+ */
+void fsnotify_put_group(struct fsnotify_group *group)
+{
+	if (atomic_dec_and_test(&group->refcnt))
+		fsnotify_final_destroy_group(group);
+}
+
+/*
+ * Create a new fsnotify_group and hold a reference for the group returned.
+ */
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+{
+	struct fsnotify_group *group;
+
+	group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	/* set to 0 when there a no external references to this group */
+	atomic_set(&group->refcnt, 1);
+	atomic_set(&group->num_marks, 0);
+
+	mutex_init(&group->notification_mutex);
+	INIT_LIST_HEAD(&group->notification_list);
+	init_waitqueue_head(&group->notification_waitq);
+	group->max_events = UINT_MAX;
+
+	mutex_init(&group->mark_mutex);
+	INIT_LIST_HEAD(&group->marks_list);
+
+	group->ops = ops;
+
+	return group;
+}
+
+int fsnotify_fasync(int fd, struct file *file, int on)
+{
+	struct fsnotify_group *group = file->private_data;
+
+	return fasync_helper(fd, file, on, &group->fsn_fa) >= 0 ? 0 : -EIO;
+}
diff --git a/kernel/fs/notify/inode_mark.c b/kernel/fs/notify/inode_mark.c
new file mode 100644
index 000000000..3daf513ee
--- /dev/null
+++ b/kernel/fs/notify/inode_mark.c
@@ -0,0 +1,247 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#include <linux/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+#include "../internal.h"
+
+/*
+ * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
+ * any notifier is interested in hearing for this inode.
+ */
+void fsnotify_recalc_inode_mask(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
+	spin_unlock(&inode->i_lock);
+
+	__fsnotify_update_child_dentry_flags(inode);
+}
+
+void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
+{
+	struct inode *inode = mark->inode;
+
+	BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
+	assert_spin_locked(&mark->lock);
+
+	spin_lock(&inode->i_lock);
+
+	hlist_del_init_rcu(&mark->obj_list);
+	mark->inode = NULL;
+
+	/*
+	 * this mark is now off the inode->i_fsnotify_marks list and we
+	 * hold the inode->i_lock, so this is the perfect time to update the
+	 * inode->i_fsnotify_mask
+	 */
+	inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Given an inode, destroy all of the marks associated with that inode.
+ */
+void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+	struct fsnotify_mark *mark;
+	struct hlist_node *n;
+	LIST_HEAD(free_list);
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
+		list_add(&mark->free_list, &free_list);
+		hlist_del_init_rcu(&mark->obj_list);
+		fsnotify_get_mark(mark);
+	}
+	spin_unlock(&inode->i_lock);
+
+	fsnotify_destroy_marks(&free_list);
+}
+
+/*
+ * Given a group clear all of the inode marks associated with that group.
+ */
+void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
+}
+
+/*
+ * given a group and inode, find the mark associated with that combination.
+ * if found take a reference to that mark and return it, else return NULL
+ */
+struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
+					       struct inode *inode)
+{
+	struct fsnotify_mark *mark;
+
+	spin_lock(&inode->i_lock);
+	mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
+	spin_unlock(&inode->i_lock);
+
+	return mark;
+}
+
+/*
+ * If we are setting a mark mask on an inode mark we should pin the inode
+ * in memory.
+ */
+void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
+					 __u32 mask)
+{
+	struct inode *inode;
+
+	assert_spin_locked(&mark->lock);
+
+	if (mask &&
+	    mark->inode &&
+	    !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
+		mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
+		inode = igrab(mark->inode);
+		/*
+		 * we shouldn't be able to get here if the inode wasn't
+		 * already safely held in memory.  But bug in case it
+		 * ever is wrong.
+		 */
+		BUG_ON(!inode);
+	}
+}
+
+/*
+ * Attach an initialized mark to a given inode.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which group and for which inodes.  These
+ * marks are ordered according to priority, highest number first, and then by
+ * the group's location in memory.
+ */
+int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
+			    struct fsnotify_group *group, struct inode *inode,
+			    int allow_dups)
+{
+	int ret;
+
+	mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
+
+	BUG_ON(!mutex_is_locked(&group->mark_mutex));
+	assert_spin_locked(&mark->lock);
+
+	spin_lock(&inode->i_lock);
+	mark->inode = inode;
+	ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
+				     allow_dups);
+	inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
+	spin_unlock(&inode->i_lock);
+
+	return ret;
+}
+
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @list: list of inodes being unmounted (sb->s_inodes)
+ *
+ * Called during unmount with no locks held, so needs to be safe against
+ * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
+ */
+void fsnotify_unmount_inodes(struct list_head *list)
+{
+	struct inode *inode, *next_i, *need_iput = NULL;
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+		struct inode *need_iput_tmp;
+
+		/*
+		 * We cannot __iget() an inode in state I_FREEING,
+		 * I_WILL_FREE, or I_NEW which is fine because by that point
+		 * the inode cannot have any associated watches.
+		 */
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
+		/*
+		 * If i_count is zero, the inode cannot have any watches and
+		 * doing an __iget/iput with MS_ACTIVE clear would actually
+		 * evict all inodes with zero i_count from icache which is
+		 * unnecessarily violent and may in fact be illegal to do.
+		 */
+		if (!atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
+		need_iput_tmp = need_iput;
+		need_iput = NULL;
+
+		/* In case fsnotify_inode_delete() drops a reference. */
+		if (inode != need_iput_tmp)
+			__iget(inode);
+		else
+			need_iput_tmp = NULL;
+		spin_unlock(&inode->i_lock);
+
+		/* In case the dropping of a reference would nuke next_i. */
+		while (&next_i->i_sb_list != list) {
+			spin_lock(&next_i->i_lock);
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
+						atomic_read(&next_i->i_count)) {
+				__iget(next_i);
+				need_iput = next_i;
+				spin_unlock(&next_i->i_lock);
+				break;
+			}
+			spin_unlock(&next_i->i_lock);
+			next_i = list_entry(next_i->i_sb_list.next,
+						struct inode, i_sb_list);
+		}
+
+		/*
+		 * We can safely drop inode_sb_list_lock here because either
+		 * we actually hold references on both inode and next_i or
+		 * end of list.  Also no new inodes will be added since the
+		 * umount has begun.
+		 */
+		spin_unlock(&inode_sb_list_lock);
+
+		if (need_iput_tmp)
+			iput(need_iput_tmp);
+
+		/* for each watch, send FS_UNMOUNT and then remove it */
+		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+
+		fsnotify_inode_delete(inode);
+
+		iput(inode);
+
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+}
diff --git a/kernel/fs/notify/inotify/Kconfig b/kernel/fs/notify/inotify/Kconfig
new file mode 100644
index 000000000..b981fc0c8
--- /dev/null
+++ b/kernel/fs/notify/inotify/Kconfig
@@ -0,0 +1,17 @@
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	select ANON_INODES
+	select FSNOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+	  Inotify fixes numerous shortcomings in dnotify and introduces several
+	  new features including multiple file events, one-shot support, and
+	  unmount notification.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
diff --git a/kernel/fs/notify/inotify/Makefile b/kernel/fs/notify/inotify/Makefile
new file mode 100644
index 000000000..a380dabe0
--- /dev/null
+++ b/kernel/fs/notify/inotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_fsnotify.o inotify_user.o
diff --git a/kernel/fs/notify/inotify/inotify.h b/kernel/fs/notify/inotify/inotify.h
new file mode 100644
index 000000000..ed855ef6f
--- /dev/null
+++ b/kernel/fs/notify/inotify/inotify.h
@@ -0,0 +1,32 @@
+#include <linux/fsnotify_backend.h>
+#include <linux/inotify.h>
+#include <linux/slab.h> /* struct kmem_cache */
+
+struct inotify_event_info {
+	struct fsnotify_event fse;
+	int wd;
+	u32 sync_cookie;
+	int name_len;
+	char name[];
+};
+
+struct inotify_inode_mark {
+	struct fsnotify_mark fsn_mark;
+	int wd;
+};
+
+static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
+{
+	return container_of(fse, struct inotify_event_info, fse);
+}
+
+extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
+					   struct fsnotify_group *group);
+extern int inotify_handle_event(struct fsnotify_group *group,
+				struct inode *inode,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
+				u32 mask, void *data, int data_type,
+				const unsigned char *file_name, u32 cookie);
+
+extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/kernel/fs/notify/inotify/inotify_fsnotify.c b/kernel/fs/notify/inotify/inotify_fsnotify.c
new file mode 100644
index 000000000..2cd900c2c
--- /dev/null
+++ b/kernel/fs/notify/inotify/inotify_fsnotify.c
@@ -0,0 +1,184 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * inotify was largely rewriten to make use of the fsnotify infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/dcache.h> /* d_unlinked */
+#include <linux/fs.h> /* struct inode */
+#include <linux/fsnotify_backend.h>
+#include <linux/inotify.h>
+#include <linux/path.h> /* struct path */
+#include <linux/slab.h> /* kmem_* */
+#include <linux/types.h>
+#include <linux/sched.h>
+
+#include "inotify.h"
+
+/*
+ * Check if 2 events contain the same information.
+ */
+static bool event_compare(struct fsnotify_event *old_fsn,
+			  struct fsnotify_event *new_fsn)
+{
+	struct inotify_event_info *old, *new;
+
+	if (old_fsn->mask & FS_IN_IGNORED)
+		return false;
+	old = INOTIFY_E(old_fsn);
+	new = INOTIFY_E(new_fsn);
+	if ((old_fsn->mask == new_fsn->mask) &&
+	    (old_fsn->inode == new_fsn->inode) &&
+	    (old->name_len == new->name_len) &&
+	    (!old->name_len || !strcmp(old->name, new->name)))
+		return true;
+	return false;
+}
+
+static int inotify_merge(struct list_head *list,
+			  struct fsnotify_event *event)
+{
+	struct fsnotify_event *last_event;
+
+	last_event = list_entry(list->prev, struct fsnotify_event, list);
+	return event_compare(last_event, event);
+}
+
+int inotify_handle_event(struct fsnotify_group *group,
+			 struct inode *inode,
+			 struct fsnotify_mark *inode_mark,
+			 struct fsnotify_mark *vfsmount_mark,
+			 u32 mask, void *data, int data_type,
+			 const unsigned char *file_name, u32 cookie)
+{
+	struct inotify_inode_mark *i_mark;
+	struct inotify_event_info *event;
+	struct fsnotify_event *fsn_event;
+	int ret;
+	int len = 0;
+	int alloc_len = sizeof(struct inotify_event_info);
+
+	BUG_ON(vfsmount_mark);
+
+	if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+	    (data_type == FSNOTIFY_EVENT_PATH)) {
+		struct path *path = data;
+
+		if (d_unlinked(path->dentry))
+			return 0;
+	}
+	if (file_name) {
+		len = strlen(file_name);
+		alloc_len += len + 1;
+	}
+
+	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+		 mask);
+
+	i_mark = container_of(inode_mark, struct inotify_inode_mark,
+			      fsn_mark);
+
+	event = kmalloc(alloc_len, GFP_KERNEL);
+	if (unlikely(!event))
+		return -ENOMEM;
+
+	fsn_event = &event->fse;
+	fsnotify_init_event(fsn_event, inode, mask);
+	event->wd = i_mark->wd;
+	event->sync_cookie = cookie;
+	event->name_len = len;
+	if (len)
+		strcpy(event->name, file_name);
+
+	ret = fsnotify_add_event(group, fsn_event, inotify_merge);
+	if (ret) {
+		/* Our event wasn't used in the end. Free it. */
+		fsnotify_destroy_event(group, fsn_event);
+	}
+
+	if (inode_mark->mask & IN_ONESHOT)
+		fsnotify_destroy_mark(inode_mark, group);
+
+	return 0;
+}
+
+static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
+{
+	inotify_ignored_and_remove_idr(fsn_mark, group);
+}
+
+/*
+ * This is NEVER supposed to be called.  Inotify marks should either have been
+ * removed from the idr when the watch was removed or in the
+ * fsnotify_destroy_mark_by_group() call when the inotify instance was being
+ * torn down.  This is only called if the idr is about to be freed but there
+ * are still marks in it.
+ */
+static int idr_callback(int id, void *p, void *data)
+{
+	struct fsnotify_mark *fsn_mark;
+	struct inotify_inode_mark *i_mark;
+	static bool warned = false;
+
+	if (warned)
+		return 0;
+
+	warned = true;
+	fsn_mark = p;
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+
+	WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
+		"idr.  Probably leaking memory\n", id, p, data);
+
+	/*
+	 * I'm taking the liberty of assuming that the mark in question is a
+	 * valid address and I'm dereferencing it.  This might help to figure
+	 * out why we got here and the panic is no worse than the original
+	 * BUG() that was here.
+	 */
+	if (fsn_mark)
+		printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
+			fsn_mark->group, fsn_mark->inode, i_mark->wd);
+	return 0;
+}
+
+static void inotify_free_group_priv(struct fsnotify_group *group)
+{
+	/* ideally the idr is empty and we won't hit the BUG in the callback */
+	idr_for_each(&group->inotify_data.idr, idr_callback, group);
+	idr_destroy(&group->inotify_data.idr);
+	if (group->inotify_data.user) {
+		atomic_dec(&group->inotify_data.user->inotify_devs);
+		free_uid(group->inotify_data.user);
+	}
+}
+
+static void inotify_free_event(struct fsnotify_event *fsn_event)
+{
+	kfree(INOTIFY_E(fsn_event));
+}
+
+const struct fsnotify_ops inotify_fsnotify_ops = {
+	.handle_event = inotify_handle_event,
+	.free_group_priv = inotify_free_group_priv,
+	.free_event = inotify_free_event,
+	.freeing_mark = inotify_freeing_mark,
+};
diff --git a/kernel/fs/notify/inotify/inotify_user.c b/kernel/fs/notify/inotify/inotify_user.c
new file mode 100644
index 000000000..450648697
--- /dev/null
+++ b/kernel/fs/notify/inotify/inotify_user.c
@@ -0,0 +1,815 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * inotify was largely rewriten to make use of the fsnotify infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h> /* struct inode */
+#include <linux/fsnotify_backend.h>
+#include <linux/idr.h>
+#include <linux/init.h> /* module_init */
+#include <linux/inotify.h>
+#include <linux/kernel.h> /* roundup() */
+#include <linux/namei.h> /* LOOKUP_FOLLOW */
+#include <linux/sched.h> /* struct user */
+#include <linux/slab.h> /* struct kmem_cache */
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/anon_inodes.h>
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+#include <linux/wait.h>
+
+#include "inotify.h"
+#include "../fdinfo.h"
+
+#include <asm/ioctls.h>
+
+/* these are configurable via /proc/sys/fs/inotify/ */
+static int inotify_max_user_instances __read_mostly;
+static int inotify_max_queued_events __read_mostly;
+static int inotify_max_user_watches __read_mostly;
+
+static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+struct ctl_table inotify_table[] = {
+	{
+		.procname	= "max_user_instances",
+		.data		= &inotify_max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "max_user_watches",
+		.data		= &inotify_max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "max_queued_events",
+		.data		= &inotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero
+	},
+	{ }
+};
+#endif /* CONFIG_SYSCTL */
+
+static inline __u32 inotify_arg_to_mask(u32 arg)
+{
+	__u32 mask;
+
+	/*
+	 * everything should accept their own ignored, cares about children,
+	 * and should receive events when the inode is unmounted
+	 */
+	mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
+
+	/* mask off the flags used to open the fd */
+	mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
+
+	return mask;
+}
+
+static inline u32 inotify_mask_to_arg(__u32 mask)
+{
+	return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
+		       IN_Q_OVERFLOW);
+}
+
+/* intofiy userspace file descriptor functions */
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+	struct fsnotify_group *group = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &group->notification_waitq, wait);
+	mutex_lock(&group->notification_mutex);
+	if (!fsnotify_notify_queue_is_empty(group))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&group->notification_mutex);
+
+	return ret;
+}
+
+static int round_event_name_len(struct fsnotify_event *fsn_event)
+{
+	struct inotify_event_info *event;
+
+	event = INOTIFY_E(fsn_event);
+	if (!event->name_len)
+		return 0;
+	return roundup(event->name_len + 1, sizeof(struct inotify_event));
+}
+
+/*
+ * Get an inotify_kernel_event if one exists and is small
+ * enough to fit in "count". Return an error pointer if
+ * not large enough.
+ *
+ * Called with the group->notification_mutex held.
+ */
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+					    size_t count)
+{
+	size_t event_size = sizeof(struct inotify_event);
+	struct fsnotify_event *event;
+
+	if (fsnotify_notify_queue_is_empty(group))
+		return NULL;
+
+	event = fsnotify_peek_first_event(group);
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	event_size += round_event_name_len(event);
+	if (event_size > count)
+		return ERR_PTR(-EINVAL);
+
+	/* held the notification_mutex the whole time, so this is the
+	 * same event we peeked above */
+	fsnotify_remove_first_event(group);
+
+	return event;
+}
+
+/*
+ * Copy an event to user space, returning how much we copied.
+ *
+ * We already checked that the event size is smaller than the
+ * buffer we had in "get_one_event()" above.
+ */
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+				  struct fsnotify_event *fsn_event,
+				  char __user *buf)
+{
+	struct inotify_event inotify_event;
+	struct inotify_event_info *event;
+	size_t event_size = sizeof(struct inotify_event);
+	size_t name_len;
+	size_t pad_name_len;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+
+	event = INOTIFY_E(fsn_event);
+	name_len = event->name_len;
+	/*
+	 * round up name length so it is a multiple of event_size
+	 * plus an extra byte for the terminating '\0'.
+	 */
+	pad_name_len = round_event_name_len(fsn_event);
+	inotify_event.len = pad_name_len;
+	inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+	inotify_event.wd = event->wd;
+	inotify_event.cookie = event->sync_cookie;
+
+	/* send the main event */
+	if (copy_to_user(buf, &inotify_event, event_size))
+		return -EFAULT;
+
+	buf += event_size;
+
+	/*
+	 * fsnotify only stores the pathname, so here we have to send the pathname
+	 * and then pad that pathname out to a multiple of sizeof(inotify_event)
+	 * with zeros.
+	 */
+	if (pad_name_len) {
+		/* copy the path name */
+		if (copy_to_user(buf, event->name, name_len))
+			return -EFAULT;
+		buf += name_len;
+
+		/* fill userspace with 0's */
+		if (clear_user(buf, pad_name_len - name_len))
+			return -EFAULT;
+		event_size += pad_name_len;
+	}
+
+	return event_size;
+}
+
+static ssize_t inotify_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *kevent;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	start = buf;
+	group = file->private_data;
+
+	add_wait_queue(&group->notification_waitq, &wait);
+	while (1) {
+		mutex_lock(&group->notification_mutex);
+		kevent = get_one_event(group, count);
+		mutex_unlock(&group->notification_mutex);
+
+		pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
+
+		if (kevent) {
+			ret = PTR_ERR(kevent);
+			if (IS_ERR(kevent))
+				break;
+			ret = copy_event_to_user(group, kevent, buf);
+			fsnotify_destroy_event(group, kevent);
+			if (ret < 0)
+				break;
+			buf += ret;
+			count -= ret;
+			continue;
+		}
+
+		ret = -EAGAIN;
+		if (file->f_flags & O_NONBLOCK)
+			break;
+		ret = -ERESTARTSYS;
+		if (signal_pending(current))
+			break;
+
+		if (start != buf)
+			break;
+
+		wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+	}
+	remove_wait_queue(&group->notification_waitq, &wait);
+
+	if (start != buf && ret != -EFAULT)
+		ret = buf - start;
+	return ret;
+}
+
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+	struct fsnotify_group *group = file->private_data;
+
+	pr_debug("%s: group=%p\n", __func__, group);
+
+	/* free this group, matching get was inotify_init->fsnotify_obtain_group */
+	fsnotify_destroy_group(group);
+
+	return 0;
+}
+
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *fsn_event;
+	void __user *p;
+	int ret = -ENOTTY;
+	size_t send_len = 0;
+
+	group = file->private_data;
+	p = (void __user *) arg;
+
+	pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
+
+	switch (cmd) {
+	case FIONREAD:
+		mutex_lock(&group->notification_mutex);
+		list_for_each_entry(fsn_event, &group->notification_list,
+				    list) {
+			send_len += sizeof(struct inotify_event);
+			send_len += round_event_name_len(fsn_event);
+		}
+		mutex_unlock(&group->notification_mutex);
+		ret = put_user(send_len, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations inotify_fops = {
+	.show_fdinfo	= inotify_show_fdinfo,
+	.poll		= inotify_poll,
+	.read		= inotify_read,
+	.fasync		= fsnotify_fasync,
+	.release	= inotify_release,
+	.unlocked_ioctl	= inotify_ioctl,
+	.compat_ioctl	= inotify_ioctl,
+	.llseek		= noop_llseek,
+};
+
+
+/*
+ * find_inode - resolve a user-given path to a specific inode
+ */
+static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
+{
+	int error;
+
+	error = user_path_at(AT_FDCWD, dirname, flags, path);
+	if (error)
+		return error;
+	/* you can only watch an inode if you have read permissions on it */
+	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (error)
+		path_put(path);
+	return error;
+}
+
+static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
+			      struct inotify_inode_mark *i_mark)
+{
+	int ret;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(idr_lock);
+
+	ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT);
+	if (ret >= 0) {
+		/* we added the mark to the idr, take a reference */
+		i_mark->wd = ret;
+		fsnotify_get_mark(&i_mark->fsn_mark);
+	}
+
+	spin_unlock(idr_lock);
+	idr_preload_end();
+	return ret < 0 ? ret : 0;
+}
+
+static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
+								int wd)
+{
+	struct idr *idr = &group->inotify_data.idr;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+	struct inotify_inode_mark *i_mark;
+
+	assert_spin_locked(idr_lock);
+
+	i_mark = idr_find(idr, wd);
+	if (i_mark) {
+		struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark;
+
+		fsnotify_get_mark(fsn_mark);
+		/* One ref for being in the idr, one ref we just took */
+		BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
+	}
+
+	return i_mark;
+}
+
+static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
+							 int wd)
+{
+	struct inotify_inode_mark *i_mark;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+
+	spin_lock(idr_lock);
+	i_mark = inotify_idr_find_locked(group, wd);
+	spin_unlock(idr_lock);
+
+	return i_mark;
+}
+
+static void do_inotify_remove_from_idr(struct fsnotify_group *group,
+				       struct inotify_inode_mark *i_mark)
+{
+	struct idr *idr = &group->inotify_data.idr;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+	int wd = i_mark->wd;
+
+	assert_spin_locked(idr_lock);
+
+	idr_remove(idr, wd);
+
+	/* removed from the idr, drop that ref */
+	fsnotify_put_mark(&i_mark->fsn_mark);
+}
+
+/*
+ * Remove the mark from the idr (if present) and drop the reference
+ * on the mark because it was in the idr.
+ */
+static void inotify_remove_from_idr(struct fsnotify_group *group,
+				    struct inotify_inode_mark *i_mark)
+{
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+	struct inotify_inode_mark *found_i_mark = NULL;
+	int wd;
+
+	spin_lock(idr_lock);
+	wd = i_mark->wd;
+
+	/*
+	 * does this i_mark think it is in the idr?  we shouldn't get called
+	 * if it wasn't....
+	 */
+	if (wd == -1) {
+		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
+			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
+			i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+		goto out;
+	}
+
+	/* Lets look in the idr to see if we find it */
+	found_i_mark = inotify_idr_find_locked(group, wd);
+	if (unlikely(!found_i_mark)) {
+		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
+			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
+			i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+		goto out;
+	}
+
+	/*
+	 * We found an mark in the idr at the right wd, but it's
+	 * not the mark we were told to remove.  eparis seriously
+	 * fucked up somewhere.
+	 */
+	if (unlikely(found_i_mark != i_mark)) {
+		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
+			"mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
+			"found_i_mark->group=%p found_i_mark->inode=%p\n",
+			__func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
+			i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd,
+			found_i_mark->fsn_mark.group,
+			found_i_mark->fsn_mark.inode);
+		goto out;
+	}
+
+	/*
+	 * One ref for being in the idr
+	 * one ref held by the caller trying to kill us
+	 * one ref grabbed by inotify_idr_find
+	 */
+	if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
+		printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
+			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
+			i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+		/* we can't really recover with bad ref cnting.. */
+		BUG();
+	}
+
+	do_inotify_remove_from_idr(group, i_mark);
+out:
+	/* match the ref taken by inotify_idr_find_locked() */
+	if (found_i_mark)
+		fsnotify_put_mark(&found_i_mark->fsn_mark);
+	i_mark->wd = -1;
+	spin_unlock(idr_lock);
+}
+
+/*
+ * Send IN_IGNORED for this wd, remove this wd from the idr.
+ */
+void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
+				    struct fsnotify_group *group)
+{
+	struct inotify_inode_mark *i_mark;
+
+	/* Queue ignore event for the watch */
+	inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
+			     NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
+
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+	/* remove this mark from the idr */
+	inotify_remove_from_idr(group, i_mark);
+
+	atomic_dec(&group->inotify_data.user->inotify_watches);
+}
+
+/* ding dong the mark is dead */
+static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+	struct inotify_inode_mark *i_mark;
+
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+
+	kmem_cache_free(inotify_inode_mark_cachep, i_mark);
+}
+
+static int inotify_update_existing_watch(struct fsnotify_group *group,
+					 struct inode *inode,
+					 u32 arg)
+{
+	struct fsnotify_mark *fsn_mark;
+	struct inotify_inode_mark *i_mark;
+	__u32 old_mask, new_mask;
+	__u32 mask;
+	int add = (arg & IN_MASK_ADD);
+	int ret;
+
+	mask = inotify_arg_to_mask(arg);
+
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	if (!fsn_mark)
+		return -ENOENT;
+
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+
+	spin_lock(&fsn_mark->lock);
+
+	old_mask = fsn_mark->mask;
+	if (add)
+		fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
+	else
+		fsnotify_set_mark_mask_locked(fsn_mark, mask);
+	new_mask = fsn_mark->mask;
+
+	spin_unlock(&fsn_mark->lock);
+
+	if (old_mask != new_mask) {
+		/* more bits in old than in new? */
+		int dropped = (old_mask & ~new_mask);
+		/* more bits in this fsn_mark than the inode's mask? */
+		int do_inode = (new_mask & ~inode->i_fsnotify_mask);
+
+		/* update the inode with this new fsn_mark */
+		if (dropped || do_inode)
+			fsnotify_recalc_inode_mask(inode);
+
+	}
+
+	/* return the wd */
+	ret = i_mark->wd;
+
+	/* match the get from fsnotify_find_mark() */
+	fsnotify_put_mark(fsn_mark);
+
+	return ret;
+}
+
+static int inotify_new_watch(struct fsnotify_group *group,
+			     struct inode *inode,
+			     u32 arg)
+{
+	struct inotify_inode_mark *tmp_i_mark;
+	__u32 mask;
+	int ret;
+	struct idr *idr = &group->inotify_data.idr;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+
+	mask = inotify_arg_to_mask(arg);
+
+	tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!tmp_i_mark))
+		return -ENOMEM;
+
+	fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
+	tmp_i_mark->fsn_mark.mask = mask;
+	tmp_i_mark->wd = -1;
+
+	ret = -ENOSPC;
+	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
+		goto out_err;
+
+	ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
+	if (ret)
+		goto out_err;
+
+	/* we are on the idr, now get on the inode */
+	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
+				       NULL, 0);
+	if (ret) {
+		/* we failed to get on the inode, get off the idr */
+		inotify_remove_from_idr(group, tmp_i_mark);
+		goto out_err;
+	}
+
+	/* increment the number of watches the user has */
+	atomic_inc(&group->inotify_data.user->inotify_watches);
+
+	/* return the watch descriptor for this new mark */
+	ret = tmp_i_mark->wd;
+
+out_err:
+	/* match the ref from fsnotify_init_mark() */
+	fsnotify_put_mark(&tmp_i_mark->fsn_mark);
+
+	return ret;
+}
+
+static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+{
+	int ret = 0;
+
+	mutex_lock(&group->mark_mutex);
+	/* try to update and existing watch with the new arg */
+	ret = inotify_update_existing_watch(group, inode, arg);
+	/* no mark present, try to add a new one */
+	if (ret == -ENOENT)
+		ret = inotify_new_watch(group, inode, arg);
+	mutex_unlock(&group->mark_mutex);
+
+	return ret;
+}
+
+static struct fsnotify_group *inotify_new_group(unsigned int max_events)
+{
+	struct fsnotify_group *group;
+	struct inotify_event_info *oevent;
+
+	group = fsnotify_alloc_group(&inotify_fsnotify_ops);
+	if (IS_ERR(group))
+		return group;
+
+	oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+	if (unlikely(!oevent)) {
+		fsnotify_destroy_group(group);
+		return ERR_PTR(-ENOMEM);
+	}
+	group->overflow_event = &oevent->fse;
+	fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+	oevent->wd = -1;
+	oevent->sync_cookie = 0;
+	oevent->name_len = 0;
+
+	group->max_events = max_events;
+
+	spin_lock_init(&group->inotify_data.idr_lock);
+	idr_init(&group->inotify_data.idr);
+	group->inotify_data.user = get_current_user();
+
+	if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
+	    inotify_max_user_instances) {
+		fsnotify_destroy_group(group);
+		return ERR_PTR(-EMFILE);
+	}
+
+	return group;
+}
+
+
+/* inotify syscalls */
+SYSCALL_DEFINE1(inotify_init1, int, flags)
+{
+	struct fsnotify_group *group;
+	int ret;
+
+	/* Check the IN_* constants for consistency.  */
+	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
+		return -EINVAL;
+
+	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
+	group = inotify_new_group(inotify_max_queued_events);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	ret = anon_inode_getfd("inotify", &inotify_fops, group,
+				  O_RDONLY | flags);
+	if (ret < 0)
+		fsnotify_destroy_group(group);
+
+	return ret;
+}
+
+SYSCALL_DEFINE0(inotify_init)
+{
+	return sys_inotify_init1(0);
+}
+
+SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
+		u32, mask)
+{
+	struct fsnotify_group *group;
+	struct inode *inode;
+	struct path path;
+	struct fd f;
+	int ret;
+	unsigned flags = 0;
+
+	/* don't allow invalid bits: we don't want flags set */
+	if (unlikely(!(mask & ALL_INOTIFY_BITS)))
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (unlikely(!f.file))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(f.file->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto fput_and_out;
+	}
+
+	if (!(mask & IN_DONT_FOLLOW))
+		flags |= LOOKUP_FOLLOW;
+	if (mask & IN_ONLYDIR)
+		flags |= LOOKUP_DIRECTORY;
+
+	ret = inotify_find_inode(pathname, &path, flags);
+	if (ret)
+		goto fput_and_out;
+
+	/* inode held in place by reference to path; group by fget on fd */
+	inode = path.dentry->d_inode;
+	group = f.file->private_data;
+
+	/* create/update an inode mark */
+	ret = inotify_update_watch(group, inode, mask);
+	path_put(&path);
+fput_and_out:
+	fdput(f);
+	return ret;
+}
+
+SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
+{
+	struct fsnotify_group *group;
+	struct inotify_inode_mark *i_mark;
+	struct fd f;
+	int ret = 0;
+
+	f = fdget(fd);
+	if (unlikely(!f.file))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	ret = -EINVAL;
+	if (unlikely(f.file->f_op != &inotify_fops))
+		goto out;
+
+	group = f.file->private_data;
+
+	ret = -EINVAL;
+	i_mark = inotify_idr_find(group, wd);
+	if (unlikely(!i_mark))
+		goto out;
+
+	ret = 0;
+
+	fsnotify_destroy_mark(&i_mark->fsn_mark, group);
+
+	/* match ref taken by inotify_idr_find */
+	fsnotify_put_mark(&i_mark->fsn_mark);
+
+out:
+	fdput(f);
+	return ret;
+}
+
+/*
+ * inotify_user_setup - Our initialization function.  Note that we cannot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init inotify_user_setup(void)
+{
+	BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
+	BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(IN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
+	BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
+	BUILD_BUG_ON(IN_CREATE != FS_CREATE);
+	BUILD_BUG_ON(IN_DELETE != FS_DELETE);
+	BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
+	BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
+	BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
+	BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
+	BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
+	BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
+	BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
+	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
+
+	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
+
+	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
+
+	inotify_max_queued_events = 16384;
+	inotify_max_user_instances = 128;
+	inotify_max_user_watches = 8192;
+
+	return 0;
+}
+module_init(inotify_user_setup);
diff --git a/kernel/fs/notify/mark.c b/kernel/fs/notify/mark.c
new file mode 100644
index 000000000..92e48c70f
--- /dev/null
+++ b/kernel/fs/notify/mark.c
@@ -0,0 +1,494 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * fsnotify inode mark locking/lifetime/and refcnting
+ *
+ * REFCNT:
+ * The group->recnt and mark->refcnt tell how many "things" in the kernel
+ * currently are referencing the objects. Both kind of objects typically will
+ * live inside the kernel with a refcnt of 2, one for its creation and one for
+ * the reference a group and a mark hold to each other.
+ * If you are holding the appropriate locks, you can take a reference and the
+ * object itself is guaranteed to survive until the reference is dropped.
+ *
+ * LOCKING:
+ * There are 3 locks involved with fsnotify inode marks and they MUST be taken
+ * in order as follows:
+ *
+ * group->mark_mutex
+ * mark->lock
+ * inode->i_lock
+ *
+ * group->mark_mutex protects the marks_list anchored inside a given group and
+ * each mark is hooked via the g_list.  It also protects the groups private
+ * data (i.e group limits).
+
+ * mark->lock protects the marks attributes like its masks and flags.
+ * Furthermore it protects the access to a reference of the group that the mark
+ * is assigned to as well as the access to a reference of the inode/vfsmount
+ * that is being watched by the mark.
+ *
+ * inode->i_lock protects the i_fsnotify_marks list anchored inside a
+ * given inode and each mark is hooked via the i_list. (and sorta the
+ * free_i_list)
+ *
+ *
+ * LIFETIME:
+ * Inode marks survive between when they are added to an inode and when their
+ * refcnt==0.
+ *
+ * The inode mark can be cleared for a number of different reasons including:
+ * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
+ * - The inode is being evicted from cache. (fsnotify_inode_delete)
+ * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
+ * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
+ * - The fsnotify_group associated with the mark is going away and all such marks
+ *   need to be cleaned up. (fsnotify_clear_marks_by_group)
+ *
+ * Worst case we are given an inode and need to clean up all the marks on that
+ * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
+ * mark on the list we take a reference (so the mark can't disappear under us).
+ * We remove that mark form the inode's list of marks and we add this mark to a
+ * private list anchored on the stack using i_free_list; we walk i_free_list
+ * and before we destroy the mark we make sure that we dont race with a
+ * concurrent destroy_group by getting a ref to the marks group and taking the
+ * groups mutex.
+
+ * Very similarly for freeing by group, except we use free_g_list.
+ *
+ * This has the very interesting property of being able to run concurrently with
+ * any (or all) other directions.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+
+#include <linux/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+struct srcu_struct fsnotify_mark_srcu;
+static DEFINE_SPINLOCK(destroy_lock);
+static LIST_HEAD(destroy_list);
+static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
+
+void fsnotify_get_mark(struct fsnotify_mark *mark)
+{
+	atomic_inc(&mark->refcnt);
+}
+
+void fsnotify_put_mark(struct fsnotify_mark *mark)
+{
+	if (atomic_dec_and_test(&mark->refcnt)) {
+		if (mark->group)
+			fsnotify_put_group(mark->group);
+		mark->free_mark(mark);
+	}
+}
+
+/* Calculate mask of events for a list of marks */
+u32 fsnotify_recalc_mask(struct hlist_head *head)
+{
+	u32 new_mask = 0;
+	struct fsnotify_mark *mark;
+
+	hlist_for_each_entry(mark, head, obj_list)
+		new_mask |= mark->mask;
+	return new_mask;
+}
+
+/*
+ * Any time a mark is getting freed we end up here.
+ * The caller had better be holding a reference to this mark so we don't actually
+ * do the final put under the mark->lock
+ */
+void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
+				  struct fsnotify_group *group)
+{
+	struct inode *inode = NULL;
+
+	BUG_ON(!mutex_is_locked(&group->mark_mutex));
+
+	spin_lock(&mark->lock);
+
+	/* something else already called this function on this mark */
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+		spin_unlock(&mark->lock);
+		return;
+	}
+
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+
+	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
+		inode = mark->inode;
+		fsnotify_destroy_inode_mark(mark);
+	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
+		fsnotify_destroy_vfsmount_mark(mark);
+	else
+		BUG();
+
+	list_del_init(&mark->g_list);
+
+	spin_unlock(&mark->lock);
+
+	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
+		iput(inode);
+	/* release lock temporarily */
+	mutex_unlock(&group->mark_mutex);
+
+	spin_lock(&destroy_lock);
+	list_add(&mark->g_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	wake_up(&destroy_waitq);
+	/*
+	 * We don't necessarily have a ref on mark from caller so the above destroy
+	 * may have actually freed it, unless this group provides a 'freeing_mark'
+	 * function which must be holding a reference.
+	 */
+
+	/*
+	 * Some groups like to know that marks are being freed.  This is a
+	 * callback to the group function to let it know that this mark
+	 * is being freed.
+	 */
+	if (group->ops->freeing_mark)
+		group->ops->freeing_mark(mark, group);
+
+	/*
+	 * __fsnotify_update_child_dentry_flags(inode);
+	 *
+	 * I really want to call that, but we can't, we have no idea if the inode
+	 * still exists the second we drop the mark->lock.
+	 *
+	 * The next time an event arrive to this inode from one of it's children
+	 * __fsnotify_parent will see that the inode doesn't care about it's
+	 * children and will update all of these flags then.  So really this
+	 * is just a lazy update (and could be a perf win...)
+	 */
+
+	atomic_dec(&group->num_marks);
+
+	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+}
+
+void fsnotify_destroy_mark(struct fsnotify_mark *mark,
+			   struct fsnotify_group *group)
+{
+	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+	fsnotify_destroy_mark_locked(mark, group);
+	mutex_unlock(&group->mark_mutex);
+}
+
+/*
+ * Destroy all marks in the given list. The marks must be already detached from
+ * the original inode / vfsmount.
+ */
+void fsnotify_destroy_marks(struct list_head *to_free)
+{
+	struct fsnotify_mark *mark, *lmark;
+	struct fsnotify_group *group;
+
+	list_for_each_entry_safe(mark, lmark, to_free, free_list) {
+		spin_lock(&mark->lock);
+		fsnotify_get_group(mark->group);
+		group = mark->group;
+		spin_unlock(&mark->lock);
+
+		fsnotify_destroy_mark(mark, group);
+		fsnotify_put_mark(mark);
+		fsnotify_put_group(group);
+	}
+}
+
+void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
+{
+	assert_spin_locked(&mark->lock);
+
+	mark->mask = mask;
+
+	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
+		fsnotify_set_inode_mark_mask_locked(mark, mask);
+}
+
+void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
+{
+	assert_spin_locked(&mark->lock);
+
+	mark->ignored_mask = mask;
+}
+
+/*
+ * Sorting function for lists of fsnotify marks.
+ *
+ * Fanotify supports different notification classes (reflected as priority of
+ * notification group). Events shall be passed to notification groups in
+ * decreasing priority order. To achieve this marks in notification lists for
+ * inodes and vfsmounts are sorted so that priorities of corresponding groups
+ * are descending.
+ *
+ * Furthermore correct handling of the ignore mask requires processing inode
+ * and vfsmount marks of each group together. Using the group address as
+ * further sort criterion provides a unique sorting order and thus we can
+ * merge inode and vfsmount lists of marks in linear time and find groups
+ * present in both lists.
+ *
+ * A return value of 1 signifies that b has priority over a.
+ * A return value of 0 signifies that the two marks have to be handled together.
+ * A return value of -1 signifies that a has priority over b.
+ */
+int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
+{
+	if (a == b)
+		return 0;
+	if (!a)
+		return 1;
+	if (!b)
+		return -1;
+	if (a->priority < b->priority)
+		return 1;
+	if (a->priority > b->priority)
+		return -1;
+	if (a < b)
+		return 1;
+	return -1;
+}
+
+/* Add mark into proper place in given list of marks */
+int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
+			   int allow_dups)
+{
+	struct fsnotify_mark *lmark, *last = NULL;
+	int cmp;
+
+	/* is mark the first mark? */
+	if (hlist_empty(head)) {
+		hlist_add_head_rcu(&mark->obj_list, head);
+		return 0;
+	}
+
+	/* should mark be in the middle of the current list? */
+	hlist_for_each_entry(lmark, head, obj_list) {
+		last = lmark;
+
+		if ((lmark->group == mark->group) && !allow_dups)
+			return -EEXIST;
+
+		cmp = fsnotify_compare_groups(lmark->group, mark->group);
+		if (cmp >= 0) {
+			hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
+			return 0;
+		}
+	}
+
+	BUG_ON(last == NULL);
+	/* mark should be the last entry.  last is the current last entry */
+	hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
+	return 0;
+}
+
+/*
+ * Attach an initialized mark to a given group and fs object.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which group.
+ */
+int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
+			     struct fsnotify_group *group, struct inode *inode,
+			     struct vfsmount *mnt, int allow_dups)
+{
+	int ret = 0;
+
+	BUG_ON(inode && mnt);
+	BUG_ON(!inode && !mnt);
+	BUG_ON(!mutex_is_locked(&group->mark_mutex));
+
+	/*
+	 * LOCKING ORDER!!!!
+	 * group->mark_mutex
+	 * mark->lock
+	 * inode->i_lock
+	 */
+	spin_lock(&mark->lock);
+	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+
+	fsnotify_get_group(group);
+	mark->group = group;
+	list_add(&mark->g_list, &group->marks_list);
+	atomic_inc(&group->num_marks);
+	fsnotify_get_mark(mark); /* for i_list and g_list */
+
+	if (inode) {
+		ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
+		if (ret)
+			goto err;
+	} else if (mnt) {
+		ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
+		if (ret)
+			goto err;
+	} else {
+		BUG();
+	}
+
+	/* this will pin the object if appropriate */
+	fsnotify_set_mark_mask_locked(mark, mark->mask);
+	spin_unlock(&mark->lock);
+
+	if (inode)
+		__fsnotify_update_child_dentry_flags(inode);
+
+	return ret;
+err:
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+	list_del_init(&mark->g_list);
+	fsnotify_put_group(group);
+	mark->group = NULL;
+	atomic_dec(&group->num_marks);
+
+	spin_unlock(&mark->lock);
+
+	spin_lock(&destroy_lock);
+	list_add(&mark->g_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	wake_up(&destroy_waitq);
+
+	return ret;
+}
+
+int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
+		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
+{
+	int ret;
+	mutex_lock(&group->mark_mutex);
+	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
+	mutex_unlock(&group->mark_mutex);
+	return ret;
+}
+
+/*
+ * Given a list of marks, find the mark associated with given group. If found
+ * take a reference to that mark and return it, else return NULL.
+ */
+struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
+					 struct fsnotify_group *group)
+{
+	struct fsnotify_mark *mark;
+
+	hlist_for_each_entry(mark, head, obj_list) {
+		if (mark->group == group) {
+			fsnotify_get_mark(mark);
+			return mark;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * clear any marks in a group in which mark->flags & flags is true
+ */
+void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
+					 unsigned int flags)
+{
+	struct fsnotify_mark *lmark, *mark;
+
+	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
+		if (mark->flags & flags) {
+			fsnotify_get_mark(mark);
+			fsnotify_destroy_mark_locked(mark, group);
+			fsnotify_put_mark(mark);
+		}
+	}
+	mutex_unlock(&group->mark_mutex);
+}
+
+/*
+ * Given a group, destroy all of the marks associated with that group.
+ */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+}
+
+void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
+{
+	assert_spin_locked(&old->lock);
+	new->inode = old->inode;
+	new->mnt = old->mnt;
+	if (old->group)
+		fsnotify_get_group(old->group);
+	new->group = old->group;
+	new->mask = old->mask;
+	new->free_mark = old->free_mark;
+}
+
+/*
+ * Nothing fancy, just initialize lists and locks and counters.
+ */
+void fsnotify_init_mark(struct fsnotify_mark *mark,
+			void (*free_mark)(struct fsnotify_mark *mark))
+{
+	memset(mark, 0, sizeof(*mark));
+	spin_lock_init(&mark->lock);
+	atomic_set(&mark->refcnt, 1);
+	mark->free_mark = free_mark;
+}
+
+static int fsnotify_mark_destroy(void *ignored)
+{
+	struct fsnotify_mark *mark, *next;
+	struct list_head private_destroy_list;
+
+	for (;;) {
+		spin_lock(&destroy_lock);
+		/* exchange the list head */
+		list_replace_init(&destroy_list, &private_destroy_list);
+		spin_unlock(&destroy_lock);
+
+		synchronize_srcu(&fsnotify_mark_srcu);
+
+		list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
+			list_del_init(&mark->g_list);
+			fsnotify_put_mark(mark);
+		}
+
+		wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
+	}
+
+	return 0;
+}
+
+static int __init fsnotify_mark_init(void)
+{
+	struct task_struct *thread;
+
+	thread = kthread_run(fsnotify_mark_destroy, NULL,
+			     "fsnotify_mark");
+	if (IS_ERR(thread))
+		panic("unable to start fsnotify mark destruction thread.");
+
+	return 0;
+}
+device_initcall(fsnotify_mark_init);
diff --git a/kernel/fs/notify/notification.c b/kernel/fs/notify/notification.c
new file mode 100644
index 000000000..a95d8e037
--- /dev/null
+++ b/kernel/fs/notify/notification.c
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Basic idea behind the notification queue: An fsnotify group (like inotify)
+ * sends the userspace notification about events asynchronously some time after
+ * the event happened.  When inotify gets an event it will need to add that
+ * event to the group notify queue.  Since a single event might need to be on
+ * multiple group's notification queues we can't add the event directly to each
+ * queue and instead add a small "event_holder" to each queue.  This event_holder
+ * has a pointer back to the original event.  Since the majority of events are
+ * going to end up on one, and only one, notification queue we embed one
+ * event_holder into each event.  This means we have a single allocation instead
+ * of always needing two.  If the embedded event_holder is already in use by
+ * another group a new event_holder (from fsnotify_event_holder_cachep) will be
+ * allocated and used.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <linux/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
+
+/**
+ * fsnotify_get_cookie - return a unique cookie for use in synchronizing events.
+ * Called from fsnotify_move, which is inlined into filesystem modules.
+ */
+u32 fsnotify_get_cookie(void)
+{
+	return atomic_inc_return(&fsnotify_sync_cookie);
+}
+EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
+
+/* return true if the notify queue is empty, false otherwise */
+bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+	return list_empty(&group->notification_list) ? true : false;
+}
+
+void fsnotify_destroy_event(struct fsnotify_group *group,
+			    struct fsnotify_event *event)
+{
+	/* Overflow events are per-group and we don't want to free them */
+	if (!event || event->mask == FS_Q_OVERFLOW)
+		return;
+	/* If the event is still queued, we have a problem... */
+	WARN_ON(!list_empty(&event->list));
+	group->ops->free_event(event);
+}
+
+/*
+ * Add an event to the group notification queue.  The group can later pull this
+ * event off the queue to deal with.  The function returns 0 if the event was
+ * added to the queue, 1 if the event was merged with some other queued event,
+ * 2 if the queue of events has overflown.
+ */
+int fsnotify_add_event(struct fsnotify_group *group,
+		       struct fsnotify_event *event,
+		       int (*merge)(struct list_head *,
+				    struct fsnotify_event *))
+{
+	int ret = 0;
+	struct list_head *list = &group->notification_list;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	mutex_lock(&group->notification_mutex);
+
+	if (group->q_len >= group->max_events) {
+		ret = 2;
+		/* Queue overflow event only if it isn't already queued */
+		if (!list_empty(&group->overflow_event->list)) {
+			mutex_unlock(&group->notification_mutex);
+			return ret;
+		}
+		event = group->overflow_event;
+		goto queue;
+	}
+
+	if (!list_empty(list) && merge) {
+		ret = merge(list, event);
+		if (ret) {
+			mutex_unlock(&group->notification_mutex);
+			return ret;
+		}
+	}
+
+queue:
+	group->q_len++;
+	list_add_tail(&event->list, list);
+	mutex_unlock(&group->notification_mutex);
+
+	wake_up(&group->notification_waitq);
+	kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
+	return ret;
+}
+
+/*
+ * Remove @event from group's notification queue. It is the responsibility of
+ * the caller to destroy the event.
+ */
+void fsnotify_remove_event(struct fsnotify_group *group,
+			   struct fsnotify_event *event)
+{
+	mutex_lock(&group->notification_mutex);
+	if (!list_empty(&event->list)) {
+		list_del_init(&event->list);
+		group->q_len--;
+	}
+	mutex_unlock(&group->notification_mutex);
+}
+
+/*
+ * Remove and return the first event from the notification list.  It is the
+ * responsibility of the caller to destroy the obtained event
+ */
+struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
+{
+	struct fsnotify_event *event;
+
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	pr_debug("%s: group=%p\n", __func__, group);
+
+	event = list_first_entry(&group->notification_list,
+				 struct fsnotify_event, list);
+	/*
+	 * We need to init list head for the case of overflow event so that
+	 * check in fsnotify_add_event() works
+	 */
+	list_del_init(&event->list);
+	group->q_len--;
+
+	return event;
+}
+
+/*
+ * This will not remove the event, that must be done with
+ * fsnotify_remove_first_event()
+ */
+struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	return list_first_entry(&group->notification_list,
+				struct fsnotify_event, list);
+}
+
+/*
+ * Called when a group is being torn down to clean up any outstanding
+ * event notifications.
+ */
+void fsnotify_flush_notify(struct fsnotify_group *group)
+{
+	struct fsnotify_event *event;
+
+	mutex_lock(&group->notification_mutex);
+	while (!fsnotify_notify_queue_is_empty(group)) {
+		event = fsnotify_remove_first_event(group);
+		fsnotify_destroy_event(group, event);
+	}
+	mutex_unlock(&group->notification_mutex);
+}
+
+/*
+ * fsnotify_create_event - Allocate a new event which will be sent to each
+ * group's handle_event function if the group was interested in this
+ * particular event.
+ *
+ * @inode the inode which is supposed to receive the event (sometimes a
+ *	parent of the inode to which the event happened.
+ * @mask what actually happened.
+ * @data pointer to the object which was actually affected
+ * @data_type flag indication if the data is a file, path, inode, nothing...
+ * @name the filename, if available
+ */
+void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
+			 u32 mask)
+{
+	INIT_LIST_HEAD(&event->list);
+	event->inode = inode;
+	event->mask = mask;
+}
diff --git a/kernel/fs/notify/vfsmount_mark.c b/kernel/fs/notify/vfsmount_mark.c
new file mode 100644
index 000000000..326b148e6
--- /dev/null
+++ b/kernel/fs/notify/vfsmount_mark.c
@@ -0,0 +1,127 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#include <linux/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+#include "../mount.h"
+
+void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	struct hlist_node *n;
+	struct mount *m = real_mount(mnt);
+	LIST_HEAD(free_list);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+	hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
+		list_add(&mark->free_list, &free_list);
+		hlist_del_init_rcu(&mark->obj_list);
+		fsnotify_get_mark(mark);
+	}
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	fsnotify_destroy_marks(&free_list);
+}
+
+void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
+}
+
+/*
+ * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
+ * any notifier is interested in hearing for this mount point
+ */
+void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
+{
+	struct mount *m = real_mount(mnt);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+	m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
+	spin_unlock(&mnt->mnt_root->d_lock);
+}
+
+void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
+{
+	struct vfsmount *mnt = mark->mnt;
+	struct mount *m = real_mount(mnt);
+
+	BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
+	assert_spin_locked(&mark->lock);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+
+	hlist_del_init_rcu(&mark->obj_list);
+	mark->mnt = NULL;
+
+	m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
+	spin_unlock(&mnt->mnt_root->d_lock);
+}
+
+/*
+ * given a group and vfsmount, find the mark associated with that combination.
+ * if found take a reference to that mark and return it, else return NULL
+ */
+struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
+						  struct vfsmount *mnt)
+{
+	struct mount *m = real_mount(mnt);
+	struct fsnotify_mark *mark;
+
+	spin_lock(&mnt->mnt_root->d_lock);
+	mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	return mark;
+}
+
+/*
+ * Attach an initialized mark to a given group and vfsmount.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which groups.
+ */
+int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
+			       struct fsnotify_group *group, struct vfsmount *mnt,
+			       int allow_dups)
+{
+	struct mount *m = real_mount(mnt);
+	int ret;
+
+	mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
+
+	BUG_ON(!mutex_is_locked(&group->mark_mutex));
+	assert_spin_locked(&mark->lock);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+	mark->mnt = mnt;
+	ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups);
+	m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	return ret;
+}
diff --git a/kernel/fs/nsfs.c b/kernel/fs/nsfs.c
new file mode 100644
index 000000000..99521e7c4
--- /dev/null
+++ b/kernel/fs/nsfs.c
@@ -0,0 +1,161 @@
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/proc_ns.h>
+#include <linux/magic.h>
+#include <linux/ktime.h>
+
+static struct vfsmount *nsfs_mnt;
+
+static const struct file_operations ns_file_operations = {
+	.llseek		= no_llseek,
+};
+
+static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct inode *inode = d_inode(dentry);
+	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+
+	return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+		ns_ops->name, inode->i_ino);
+}
+
+static void ns_prune_dentry(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	if (inode) {
+		struct ns_common *ns = inode->i_private;
+		atomic_long_set(&ns->stashed, 0);
+	}
+}
+
+const struct dentry_operations ns_dentry_operations =
+{
+	.d_prune	= ns_prune_dentry,
+	.d_delete	= always_delete_dentry,
+	.d_dname	= ns_dname,
+};
+
+static void nsfs_evict(struct inode *inode)
+{
+	struct ns_common *ns = inode->i_private;
+	clear_inode(inode);
+	ns->ops->put(ns);
+}
+
+void *ns_get_path(struct path *path, struct task_struct *task,
+			const struct proc_ns_operations *ns_ops)
+{
+	struct vfsmount *mnt = mntget(nsfs_mnt);
+	struct qstr qname = { .name = "", };
+	struct dentry *dentry;
+	struct inode *inode;
+	struct ns_common *ns;
+	unsigned long d;
+
+again:
+	ns = ns_ops->get(task);
+	if (!ns) {
+		mntput(mnt);
+		return ERR_PTR(-ENOENT);
+	}
+	rcu_read_lock();
+	d = atomic_long_read(&ns->stashed);
+	if (!d)
+		goto slow;
+	dentry = (struct dentry *)d;
+	if (!lockref_get_not_dead(&dentry->d_lockref))
+		goto slow;
+	rcu_read_unlock();
+	ns_ops->put(ns);
+got_it:
+	path->mnt = mnt;
+	path->dentry = dentry;
+	return NULL;
+slow:
+	rcu_read_unlock();
+	inode = new_inode_pseudo(mnt->mnt_sb);
+	if (!inode) {
+		ns_ops->put(ns);
+		mntput(mnt);
+		return ERR_PTR(-ENOMEM);
+	}
+	inode->i_ino = ns->inum;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_flags |= S_IMMUTABLE;
+	inode->i_mode = S_IFREG | S_IRUGO;
+	inode->i_fop = &ns_file_operations;
+	inode->i_private = ns;
+
+	dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
+	if (!dentry) {
+		iput(inode);
+		mntput(mnt);
+		return ERR_PTR(-ENOMEM);
+	}
+	d_instantiate(dentry, inode);
+	dentry->d_fsdata = (void *)ns_ops;
+	d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
+	if (d) {
+		d_delete(dentry);	/* make sure ->d_prune() does nothing */
+		dput(dentry);
+		cpu_relax();
+		goto again;
+	}
+	goto got_it;
+}
+
+int ns_get_name(char *buf, size_t size, struct task_struct *task,
+			const struct proc_ns_operations *ns_ops)
+{
+	struct ns_common *ns;
+	int res = -ENOENT;
+	ns = ns_ops->get(task);
+	if (ns) {
+		res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
+		ns_ops->put(ns);
+	}
+	return res;
+}
+
+struct file *proc_ns_fget(int fd)
+{
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return ERR_PTR(-EBADF);
+
+	if (file->f_op != &ns_file_operations)
+		goto out_invalid;
+
+	return file;
+
+out_invalid:
+	fput(file);
+	return ERR_PTR(-EINVAL);
+}
+
+static const struct super_operations nsfs_ops = {
+	.statfs = simple_statfs,
+	.evict_inode = nsfs_evict,
+};
+static struct dentry *nsfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
+{
+	return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
+			&ns_dentry_operations, NSFS_MAGIC);
+}
+static struct file_system_type nsfs = {
+	.name = "nsfs",
+	.mount = nsfs_mount,
+	.kill_sb = kill_anon_super,
+};
+
+void __init nsfs_init(void)
+{
+	nsfs_mnt = kern_mount(&nsfs);
+	if (IS_ERR(nsfs_mnt))
+		panic("can't set nsfs up\n");
+	nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
+}
diff --git a/kernel/fs/ntfs/Kconfig b/kernel/fs/ntfs/Kconfig
new file mode 100644
index 000000000..f5a868cc9
--- /dev/null
+++ b/kernel/fs/ntfs/Kconfig
@@ -0,0 +1,78 @@
+config NTFS_FS
+	tristate "NTFS file system support"
+	select NLS
+	help
+	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
+
+	  Saying Y or M here enables read support.  There is partial, but
+	  safe, write support available.  For write support you must also
+	  say Y to "NTFS write support" below.
+
+	  There are also a number of user-space tools available, called
+	  ntfsprogs.  These include ntfsundelete and ntfsresize, that work
+	  without NTFS support enabled in the kernel.
+
+	  This is a rewrite from scratch of Linux NTFS support and replaced
+	  the old NTFS code starting with Linux 2.5.11.  A backport to
+	  the Linux 2.4 kernel series is separately available as a patch
+	  from the project web site.
+
+	  For more information see <file:Documentation/filesystems/ntfs.txt>
+	  and <http://www.linux-ntfs.org/>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ntfs.
+
+	  If you are not using Windows NT, 2000, XP or 2003 in addition to
+	  Linux on your computer it is safe to say N.
+
+config NTFS_DEBUG
+	bool "NTFS debugging support"
+	depends on NTFS_FS
+	help
+	  If you are experiencing any problems with the NTFS file system, say
+	  Y here.  This will result in additional consistency checks to be
+	  performed by the driver as well as additional debugging messages to
+	  be written to the system log.  Note that debugging messages are
+	  disabled by default.  To enable them, supply the option debug_msgs=1
+	  at the kernel command line when booting the kernel or as an option
+	  to insmod when loading the ntfs module.  Once the driver is active,
+	  you can enable debugging messages by doing (as root):
+	  echo 1 > /proc/sys/fs/ntfs-debug
+	  Replacing the "1" with "0" would disable debug messages.
+
+	  If you leave debugging messages disabled, this results in little
+	  overhead, but enabling debug messages results in very significant
+	  slowdown of the system.
+
+	  When reporting bugs, please try to have available a full dump of
+	  debugging messages while the misbehaviour was occurring.
+
+config NTFS_RW
+	bool "NTFS write support"
+	depends on NTFS_FS
+	help
+	  This enables the partial, but safe, write support in the NTFS driver.
+
+	  The only supported operation is overwriting existing files, without
+	  changing the file length.  No file or directory creation, deletion or
+	  renaming is possible.  Note only non-resident files can be written to
+	  so you may find that some very small files (<500 bytes or so) cannot
+	  be written to.
+
+	  While we cannot guarantee that it will not damage any data, we have
+	  so far not received a single report where the driver would have
+	  damaged someones data so we assume it is perfectly safe to use.
+
+	  Note:  While write support is safe in this version (a rewrite from
+	  scratch of the NTFS support), it should be noted that the old NTFS
+	  write support, included in Linux 2.5.10 and before (since 1997),
+	  is not safe.
+
+	  This is currently useful with TopologiLinux.  TopologiLinux is run
+	  on top of any DOS/Microsoft Windows system without partitioning your
+	  hard disk.  Unlike other Linux distributions TopologiLinux does not
+	  need its own partition.  For more information see
+	  <http://topologi-linux.sourceforge.net/>
+
+	  It is perfectly safe to say N here.
diff --git a/kernel/fs/ntfs/Makefile b/kernel/fs/ntfs/Makefile
new file mode 100644
index 000000000..2ff263e6d
--- /dev/null
+++ b/kernel/fs/ntfs/Makefile
@@ -0,0 +1,14 @@
+# Rules for making the NTFS driver.
+
+obj-$(CONFIG_NTFS_FS) += ntfs.o
+
+ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
+	  index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
+	  unistr.o upcase.o
+
+ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
+
+ccflags-y := -DNTFS_VERSION=\"2.1.32\"
+ccflags-$(CONFIG_NTFS_DEBUG)	+= -DDEBUG
+ccflags-$(CONFIG_NTFS_RW)	+= -DNTFS_RW
+
diff --git a/kernel/fs/ntfs/aops.c b/kernel/fs/ntfs/aops.c
new file mode 100644
index 000000000..f0de4b6b8
--- /dev/null
+++ b/kernel/fs/ntfs/aops.c
@@ -0,0 +1,1773 @@
+/**
+ * aops.c - NTFS kernel address space operations and page cache handling.
+ *
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+
+#include "aops.h"
+#include "attrib.h"
+#include "debug.h"
+#include "inode.h"
+#include "mft.h"
+#include "runlist.h"
+#include "types.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_end_buffer_async_read - async io completion for reading attributes
+ * @bh:		buffer head on which io is completed
+ * @uptodate:	whether @bh is now uptodate or not
+ *
+ * Asynchronous I/O completion handler for reading pages belonging to the
+ * attribute address space of an inode.  The inodes can either be files or
+ * directories or they can be fake inodes describing some attribute.
+ *
+ * If NInoMstProtected(), perform the post read mst fixups when all IO on the
+ * page has been completed and mark the page uptodate or set the error bit on
+ * the page.  To determine the size of the records that need fixing up, we
+ * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
+ * record size, and index_block_size_bits, to the log(base 2) of the ntfs
+ * record size.
+ */
+static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
+{
+	unsigned long flags;
+	struct buffer_head *first, *tmp;
+	struct page *page;
+	struct inode *vi;
+	ntfs_inode *ni;
+	int page_uptodate = 1;
+
+	page = bh->b_page;
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
+
+	if (likely(uptodate)) {
+		loff_t i_size;
+		s64 file_ofs, init_size;
+
+		set_buffer_uptodate(bh);
+
+		file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
+				bh_offset(bh);
+		read_lock_irqsave(&ni->size_lock, flags);
+		init_size = ni->initialized_size;
+		i_size = i_size_read(vi);
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (unlikely(init_size > i_size)) {
+			/* Race with shrinking truncate. */
+			init_size = i_size;
+		}
+		/* Check for the current buffer head overflowing. */
+		if (unlikely(file_ofs + bh->b_size > init_size)) {
+			int ofs;
+			void *kaddr;
+
+			ofs = 0;
+			if (file_ofs < init_size)
+				ofs = init_size - file_ofs;
+			local_irq_save(flags);
+			kaddr = kmap_atomic(page);
+			memset(kaddr + bh_offset(bh) + ofs, 0,
+					bh->b_size - ofs);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr);
+			local_irq_restore(flags);
+		}
+	} else {
+		clear_buffer_uptodate(bh);
+		SetPageError(page);
+		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
+				"0x%llx.", (unsigned long long)bh->b_blocknr);
+	}
+	first = page_buffers(page);
+	flags = bh_uptodate_lock_irqsave(first);
+	clear_buffer_async_read(bh);
+	unlock_buffer(bh);
+	tmp = bh;
+	do {
+		if (!buffer_uptodate(tmp))
+			page_uptodate = 0;
+		if (buffer_async_read(tmp)) {
+			if (likely(buffer_locked(tmp)))
+				goto still_busy;
+			/* Async buffers must be locked. */
+			BUG();
+		}
+		tmp = tmp->b_this_page;
+	} while (tmp != bh);
+	bh_uptodate_unlock_irqrestore(first, flags);
+	/*
+	 * If none of the buffers had errors then we can set the page uptodate,
+	 * but we first have to perform the post read mst fixups, if the
+	 * attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
+	 * Note we ignore fixup errors as those are detected when
+	 * map_mft_record() is called which gives us per record granularity
+	 * rather than per page granularity.
+	 */
+	if (!NInoMstProtected(ni)) {
+		if (likely(page_uptodate && !PageError(page)))
+			SetPageUptodate(page);
+	} else {
+		u8 *kaddr;
+		unsigned int i, recs;
+		u32 rec_size;
+
+		rec_size = ni->itype.index.block_size;
+		recs = PAGE_CACHE_SIZE / rec_size;
+		/* Should have been verified before we got here... */
+		BUG_ON(!recs);
+		local_irq_save_nort(flags);
+		kaddr = kmap_atomic(page);
+		for (i = 0; i < recs; i++)
+			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
+					i * rec_size), rec_size);
+		kunmap_atomic(kaddr);
+		local_irq_restore_nort(flags);
+		flush_dcache_page(page);
+		if (likely(page_uptodate && !PageError(page)))
+			SetPageUptodate(page);
+	}
+	unlock_page(page);
+	return;
+still_busy:
+	bh_uptodate_unlock_irqrestore(first, flags);
+}
+
+/**
+ * ntfs_read_block - fill a @page of an address space with data
+ * @page:	page cache page to fill with data
+ *
+ * Fill the page @page of the address space belonging to the @page->host inode.
+ * We read each buffer asynchronously and when all buffers are read in, our io
+ * completion handler ntfs_end_buffer_read_async(), if required, automatically
+ * applies the mst fixups to the page before finally marking it uptodate and
+ * unlocking it.
+ *
+ * We only enforce allocated_size limit because i_size is checked for in
+ * generic_file_read().
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Contains an adapted version of fs/buffer.c::block_read_full_page().
+ */
+static int ntfs_read_block(struct page *page)
+{
+	loff_t i_size;
+	VCN vcn;
+	LCN lcn;
+	s64 init_size;
+	struct inode *vi;
+	ntfs_inode *ni;
+	ntfs_volume *vol;
+	runlist_element *rl;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	sector_t iblock, lblock, zblock;
+	unsigned long flags;
+	unsigned int blocksize, vcn_ofs;
+	int i, nr;
+	unsigned char blocksize_bits;
+
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
+	vol = ni->vol;
+
+	/* $MFT/$DATA must have its complete runlist in memory at all times. */
+	BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
+
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, blocksize, 0);
+		if (unlikely(!page_has_buffers(page))) {
+			unlock_page(page);
+			return -ENOMEM;
+		}
+	}
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
+
+	/*
+	 * We may be racing with truncate.  To avoid some of the problems we
+	 * now take a snapshot of the various sizes and use those for the whole
+	 * of the function.  In case of an extending truncate it just means we
+	 * may leave some buffers unmapped which are now allocated.  This is
+	 * not a problem since these buffers will just get mapped when a write
+	 * occurs.  In case of a shrinking truncate, we will detect this later
+	 * on due to the runlist being incomplete and if the page is being
+	 * fully truncated, truncate will throw it away as soon as we unlock
+	 * it so no need to worry what we do with it.
+	 */
+	iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+	read_lock_irqsave(&ni->size_lock, flags);
+	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
+	init_size = ni->initialized_size;
+	i_size = i_size_read(vi);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (unlikely(init_size > i_size)) {
+		/* Race with shrinking truncate. */
+		init_size = i_size;
+	}
+	zblock = (init_size + blocksize - 1) >> blocksize_bits;
+
+	/* Loop through all the buffers in the page. */
+	rl = NULL;
+	nr = i = 0;
+	do {
+		int err = 0;
+
+		if (unlikely(buffer_uptodate(bh)))
+			continue;
+		if (unlikely(buffer_mapped(bh))) {
+			arr[nr++] = bh;
+			continue;
+		}
+		bh->b_bdev = vol->sb->s_bdev;
+		/* Is the block within the allowed limits? */
+		if (iblock < lblock) {
+			bool is_retry = false;
+
+			/* Convert iblock into corresponding vcn and offset. */
+			vcn = (VCN)iblock << blocksize_bits >>
+					vol->cluster_size_bits;
+			vcn_ofs = ((VCN)iblock << blocksize_bits) &
+					vol->cluster_size_mask;
+			if (!rl) {
+lock_retry_remap:
+				down_read(&ni->runlist.lock);
+				rl = ni->runlist.rl;
+			}
+			if (likely(rl != NULL)) {
+				/* Seek to element containing target vcn. */
+				while (rl->length && rl[1].vcn <= vcn)
+					rl++;
+				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+			} else
+				lcn = LCN_RL_NOT_MAPPED;
+			/* Successful remap. */
+			if (lcn >= 0) {
+				/* Setup buffer head to correct block. */
+				bh->b_blocknr = ((lcn << vol->cluster_size_bits)
+						+ vcn_ofs) >> blocksize_bits;
+				set_buffer_mapped(bh);
+				/* Only read initialized data blocks. */
+				if (iblock < zblock) {
+					arr[nr++] = bh;
+					continue;
+				}
+				/* Fully non-initialized data block, zero it. */
+				goto handle_zblock;
+			}
+			/* It is a hole, need to zero it. */
+			if (lcn == LCN_HOLE)
+				goto handle_hole;
+			/* If first try and runlist unmapped, map and retry. */
+			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
+				is_retry = true;
+				/*
+				 * Attempt to map runlist, dropping lock for
+				 * the duration.
+				 */
+				up_read(&ni->runlist.lock);
+				err = ntfs_map_runlist(ni, vcn);
+				if (likely(!err))
+					goto lock_retry_remap;
+				rl = NULL;
+			} else if (!rl)
+				up_read(&ni->runlist.lock);
+			/*
+			 * If buffer is outside the runlist, treat it as a
+			 * hole.  This can happen due to concurrent truncate
+			 * for example.
+			 */
+			if (err == -ENOENT || lcn == LCN_ENOENT) {
+				err = 0;
+				goto handle_hole;
+			}
+			/* Hard error, zero out region. */
+			if (!err)
+				err = -EIO;
+			bh->b_blocknr = -1;
+			SetPageError(page);
+			ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
+					"attribute type 0x%x, vcn 0x%llx, "
+					"offset 0x%x because its location on "
+					"disk could not be determined%s "
+					"(error code %i).", ni->mft_no,
+					ni->type, (unsigned long long)vcn,
+					vcn_ofs, is_retry ? " even after "
+					"retrying" : "", err);
+		}
+		/*
+		 * Either iblock was outside lblock limits or
+		 * ntfs_rl_vcn_to_lcn() returned error.  Just zero that portion
+		 * of the page and set the buffer uptodate.
+		 */
+handle_hole:
+		bh->b_blocknr = -1UL;
+		clear_buffer_mapped(bh);
+handle_zblock:
+		zero_user(page, i * blocksize, blocksize);
+		if (likely(!err))
+			set_buffer_uptodate(bh);
+	} while (i++, iblock++, (bh = bh->b_this_page) != head);
+
+	/* Release the lock if we took it. */
+	if (rl)
+		up_read(&ni->runlist.lock);
+
+	/* Check we have at least one buffer ready for i/o. */
+	if (nr) {
+		struct buffer_head *tbh;
+
+		/* Lock the buffers. */
+		for (i = 0; i < nr; i++) {
+			tbh = arr[i];
+			lock_buffer(tbh);
+			tbh->b_end_io = ntfs_end_buffer_async_read;
+			set_buffer_async_read(tbh);
+		}
+		/* Finally, start i/o on the buffers. */
+		for (i = 0; i < nr; i++) {
+			tbh = arr[i];
+			if (likely(!buffer_uptodate(tbh)))
+				submit_bh(READ, tbh);
+			else
+				ntfs_end_buffer_async_read(tbh, 1);
+		}
+		return 0;
+	}
+	/* No i/o was scheduled on any of the buffers. */
+	if (likely(!PageError(page)))
+		SetPageUptodate(page);
+	else /* Signal synchronous i/o error. */
+		nr = -EIO;
+	unlock_page(page);
+	return nr;
+}
+
+/**
+ * ntfs_readpage - fill a @page of a @file with data from the device
+ * @file:	open file to which the page @page belongs or NULL
+ * @page:	page cache page to fill with data
+ *
+ * For non-resident attributes, ntfs_readpage() fills the @page of the open
+ * file @file by calling the ntfs version of the generic block_read_full_page()
+ * function, ntfs_read_block(), which in turn creates and reads in the buffers
+ * associated with the page asynchronously.
+ *
+ * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
+ * data from the mft record (which at this stage is most likely in memory) and
+ * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
+ * even if the mft record is not cached at this point in time, we need to wait
+ * for it to be read in before we can do the copy.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_readpage(struct file *file, struct page *page)
+{
+	loff_t i_size;
+	struct inode *vi;
+	ntfs_inode *ni, *base_ni;
+	u8 *addr;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *mrec;
+	unsigned long flags;
+	u32 attr_len;
+	int err = 0;
+
+retry_readpage:
+	BUG_ON(!PageLocked(page));
+	vi = page->mapping->host;
+	i_size = i_size_read(vi);
+	/* Is the page fully outside i_size? (truncate in progress) */
+	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
+			PAGE_CACHE_SHIFT)) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		ntfs_debug("Read outside i_size - truncated?");
+		goto done;
+	}
+	/*
+	 * This can potentially happen because we clear PageUptodate() during
+	 * ntfs_writepage() of MstProtected() attributes.
+	 */
+	if (PageUptodate(page)) {
+		unlock_page(page);
+		return 0;
+	}
+	ni = NTFS_I(vi);
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.  Note we need to check for
+	 * AT_INDEX_ALLOCATION since this is the type of both directory and
+	 * index inodes.
+	 */
+	if (ni->type != AT_INDEX_ALLOCATION) {
+		/* If attribute is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			err = -EACCES;
+			goto err_out;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			return ntfs_read_compressed_block(page);
+		}
+	}
+	/* NInoNonResident() == NInoIndexAllocPresent() */
+	if (NInoNonResident(ni)) {
+		/* Normal, non-resident data stream. */
+		return ntfs_read_block(page);
+	}
+	/*
+	 * Attribute is resident, implying it is not compressed or encrypted.
+	 * This also means the attribute is smaller than an mft record and
+	 * hence smaller than a page, so can simply zero out any pages with
+	 * index above 0.  Note the attribute can actually be marked compressed
+	 * but if it is resident the actual data is not compressed so we are
+	 * ok to ignore the compressed flag here.
+	 */
+	if (unlikely(page->index > 0)) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		goto done;
+	}
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/* Map, pin, and lock the mft record. */
+	mrec = map_mft_record(base_ni);
+	if (IS_ERR(mrec)) {
+		err = PTR_ERR(mrec);
+		goto err_out;
+	}
+	/*
+	 * If a parallel write made the attribute non-resident, drop the mft
+	 * record and retry the readpage.
+	 */
+	if (unlikely(NInoNonResident(ni))) {
+		unmap_mft_record(base_ni);
+		goto retry_readpage;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto unm_err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err))
+		goto put_unm_err_out;
+	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
+	read_lock_irqsave(&ni->size_lock, flags);
+	if (unlikely(attr_len > ni->initialized_size))
+		attr_len = ni->initialized_size;
+	i_size = i_size_read(vi);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (unlikely(attr_len > i_size)) {
+		/* Race with shrinking truncate. */
+		attr_len = i_size;
+	}
+	addr = kmap_atomic(page);
+	/* Copy the data to the page. */
+	memcpy(addr, (u8*)ctx->attr +
+			le16_to_cpu(ctx->attr->data.resident.value_offset),
+			attr_len);
+	/* Zero the remainder of the page. */
+	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	flush_dcache_page(page);
+	kunmap_atomic(addr);
+put_unm_err_out:
+	ntfs_attr_put_search_ctx(ctx);
+unm_err_out:
+	unmap_mft_record(base_ni);
+done:
+	SetPageUptodate(page);
+err_out:
+	unlock_page(page);
+	return err;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_write_block - write a @page to the backing store
+ * @page:	page cache page to write out
+ * @wbc:	writeback control structure
+ *
+ * This function is for writing pages belonging to non-resident, non-mst
+ * protected attributes to their backing store.
+ *
+ * For a page with buffers, map and write the dirty buffers asynchronously
+ * under page writeback. For a page without buffers, create buffers for the
+ * page, then proceed as above.
+ *
+ * If a page doesn't have buffers the page dirty state is definitive. If a page
+ * does have buffers, the page dirty state is just a hint, and the buffer dirty
+ * state is definitive. (A hint which has rules: dirty buffers against a clean
+ * page is illegal. Other combinations are legal and need to be handled. In
+ * particular a dirty page containing clean buffers for example.)
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Based on ntfs_read_block() and __block_write_full_page().
+ */
+static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
+{
+	VCN vcn;
+	LCN lcn;
+	s64 initialized_size;
+	loff_t i_size;
+	sector_t block, dblock, iblock;
+	struct inode *vi;
+	ntfs_inode *ni;
+	ntfs_volume *vol;
+	runlist_element *rl;
+	struct buffer_head *bh, *head;
+	unsigned long flags;
+	unsigned int blocksize, vcn_ofs;
+	int err;
+	bool need_end_writeback;
+	unsigned char blocksize_bits;
+
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
+	vol = ni->vol;
+
+	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+			"0x%lx.", ni->mft_no, ni->type, page->index);
+
+	BUG_ON(!NInoNonResident(ni));
+	BUG_ON(NInoMstProtected(ni));
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
+	if (!page_has_buffers(page)) {
+		BUG_ON(!PageUptodate(page));
+		create_empty_buffers(page, blocksize,
+				(1 << BH_Uptodate) | (1 << BH_Dirty));
+		if (unlikely(!page_has_buffers(page))) {
+			ntfs_warning(vol->sb, "Error allocating page "
+					"buffers.  Redirtying page so we try "
+					"again later.");
+			/*
+			 * Put the page back on mapping->dirty_pages, but leave
+			 * its buffers' dirty state as-is.
+			 */
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+	}
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
+
+	/* NOTE: Different naming scheme to ntfs_read_block()! */
+
+	/* The first block in the page. */
+	block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+
+	read_lock_irqsave(&ni->size_lock, flags);
+	i_size = i_size_read(vi);
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+
+	/* The first out of bounds block for the data size. */
+	dblock = (i_size + blocksize - 1) >> blocksize_bits;
+
+	/* The last (fully or partially) initialized block. */
+	iblock = initialized_size >> blocksize_bits;
+
+	/*
+	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
+	 * here, and the (potentially unmapped) buffers may become dirty at
+	 * any time.  If a buffer becomes dirty here after we've inspected it
+	 * then we just miss that fact, and the page stays dirty.
+	 *
+	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+	 * handle that here by just cleaning them.
+	 */
+
+	/*
+	 * Loop through all the buffers in the page, mapping all the dirty
+	 * buffers to disk addresses and handling any aliases from the
+	 * underlying block device's mapping.
+	 */
+	rl = NULL;
+	err = 0;
+	do {
+		bool is_retry = false;
+
+		if (unlikely(block >= dblock)) {
+			/*
+			 * Mapped buffers outside i_size will occur, because
+			 * this page can be outside i_size when there is a
+			 * truncate in progress. The contents of such buffers
+			 * were zeroed by ntfs_writepage().
+			 *
+			 * FIXME: What about the small race window where
+			 * ntfs_writepage() has not done any clearing because
+			 * the page was within i_size but before we get here,
+			 * vmtruncate() modifies i_size?
+			 */
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+			continue;
+		}
+
+		/* Clean buffers are not written out, so no need to map them. */
+		if (!buffer_dirty(bh))
+			continue;
+
+		/* Make sure we have enough initialized size. */
+		if (unlikely((block >= iblock) &&
+				(initialized_size < i_size))) {
+			/*
+			 * If this page is fully outside initialized size, zero
+			 * out all pages between the current initialized size
+			 * and the current page. Just use ntfs_readpage() to do
+			 * the zeroing transparently.
+			 */
+			if (block > iblock) {
+				// TODO:
+				// For each page do:
+				// - read_cache_page()
+				// Again for each page do:
+				// - wait_on_page_locked()
+				// - Check (PageUptodate(page) &&
+				//			!PageError(page))
+				// Update initialized size in the attribute and
+				// in the inode.
+				// Again, for each page do:
+				//	__set_page_dirty_buffers();
+				// page_cache_release()
+				// We don't need to wait on the writes.
+				// Update iblock.
+			}
+			/*
+			 * The current page straddles initialized size. Zero
+			 * all non-uptodate buffers and set them uptodate (and
+			 * dirty?). Note, there aren't any non-uptodate buffers
+			 * if the page is uptodate.
+			 * FIXME: For an uptodate page, the buffers may need to
+			 * be written out because they were not initialized on
+			 * disk before.
+			 */
+			if (!PageUptodate(page)) {
+				// TODO:
+				// Zero any non-uptodate buffers up to i_size.
+				// Set them uptodate and dirty.
+			}
+			// TODO:
+			// Update initialized size in the attribute and in the
+			// inode (up to i_size).
+			// Update iblock.
+			// FIXME: This is inefficient. Try to batch the two
+			// size changes to happen in one go.
+			ntfs_error(vol->sb, "Writing beyond initialized size "
+					"is not supported yet. Sorry.");
+			err = -EOPNOTSUPP;
+			break;
+			// Do NOT set_buffer_new() BUT DO clear buffer range
+			// outside write request range.
+			// set_buffer_uptodate() on complete buffers as well as
+			// set_buffer_dirty().
+		}
+
+		/* No need to map buffers that are already mapped. */
+		if (buffer_mapped(bh))
+			continue;
+
+		/* Unmapped, dirty buffer. Need to map it. */
+		bh->b_bdev = vol->sb->s_bdev;
+
+		/* Convert block into corresponding vcn and offset. */
+		vcn = (VCN)block << blocksize_bits;
+		vcn_ofs = vcn & vol->cluster_size_mask;
+		vcn >>= vol->cluster_size_bits;
+		if (!rl) {
+lock_retry_remap:
+			down_read(&ni->runlist.lock);
+			rl = ni->runlist.rl;
+		}
+		if (likely(rl != NULL)) {
+			/* Seek to element containing target vcn. */
+			while (rl->length && rl[1].vcn <= vcn)
+				rl++;
+			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+		} else
+			lcn = LCN_RL_NOT_MAPPED;
+		/* Successful remap. */
+		if (lcn >= 0) {
+			/* Setup buffer head to point to correct block. */
+			bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
+					vcn_ofs) >> blocksize_bits;
+			set_buffer_mapped(bh);
+			continue;
+		}
+		/* It is a hole, need to instantiate it. */
+		if (lcn == LCN_HOLE) {
+			u8 *kaddr;
+			unsigned long *bpos, *bend;
+
+			/* Check if the buffer is zero. */
+			kaddr = kmap_atomic(page);
+			bpos = (unsigned long *)(kaddr + bh_offset(bh));
+			bend = (unsigned long *)((u8*)bpos + blocksize);
+			do {
+				if (unlikely(*bpos))
+					break;
+			} while (likely(++bpos < bend));
+			kunmap_atomic(kaddr);
+			if (bpos == bend) {
+				/*
+				 * Buffer is zero and sparse, no need to write
+				 * it.
+				 */
+				bh->b_blocknr = -1;
+				clear_buffer_dirty(bh);
+				continue;
+			}
+			// TODO: Instantiate the hole.
+			// clear_buffer_new(bh);
+			// unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+			ntfs_error(vol->sb, "Writing into sparse regions is "
+					"not supported yet. Sorry.");
+			err = -EOPNOTSUPP;
+			break;
+		}
+		/* If first try and runlist unmapped, map and retry. */
+		if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
+			is_retry = true;
+			/*
+			 * Attempt to map runlist, dropping lock for
+			 * the duration.
+			 */
+			up_read(&ni->runlist.lock);
+			err = ntfs_map_runlist(ni, vcn);
+			if (likely(!err))
+				goto lock_retry_remap;
+			rl = NULL;
+		} else if (!rl)
+			up_read(&ni->runlist.lock);
+		/*
+		 * If buffer is outside the runlist, truncate has cut it out
+		 * of the runlist.  Just clean and clear the buffer and set it
+		 * uptodate so it can get discarded by the VM.
+		 */
+		if (err == -ENOENT || lcn == LCN_ENOENT) {
+			bh->b_blocknr = -1;
+			clear_buffer_dirty(bh);
+			zero_user(page, bh_offset(bh), blocksize);
+			set_buffer_uptodate(bh);
+			err = 0;
+			continue;
+		}
+		/* Failed to map the buffer, even after retrying. */
+		if (!err)
+			err = -EIO;
+		bh->b_blocknr = -1;
+		ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
+				"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
+				"because its location on disk could not be "
+				"determined%s (error code %i).", ni->mft_no,
+				ni->type, (unsigned long long)vcn,
+				vcn_ofs, is_retry ? " even after "
+				"retrying" : "", err);
+		break;
+	} while (block++, (bh = bh->b_this_page) != head);
+
+	/* Release the lock if we took it. */
+	if (rl)
+		up_read(&ni->runlist.lock);
+
+	/* For the error case, need to reset bh to the beginning. */
+	bh = head;
+
+	/* Just an optimization, so ->readpage() is not called later. */
+	if (unlikely(!PageUptodate(page))) {
+		int uptodate = 1;
+		do {
+			if (!buffer_uptodate(bh)) {
+				uptodate = 0;
+				bh = head;
+				break;
+			}
+		} while ((bh = bh->b_this_page) != head);
+		if (uptodate)
+			SetPageUptodate(page);
+	}
+
+	/* Setup all mapped, dirty buffers for async write i/o. */
+	do {
+		if (buffer_mapped(bh) && buffer_dirty(bh)) {
+			lock_buffer(bh);
+			if (test_clear_buffer_dirty(bh)) {
+				BUG_ON(!buffer_uptodate(bh));
+				mark_buffer_async_write(bh);
+			} else
+				unlock_buffer(bh);
+		} else if (unlikely(err)) {
+			/*
+			 * For the error case. The buffer may have been set
+			 * dirty during attachment to a dirty page.
+			 */
+			if (err != -ENOMEM)
+				clear_buffer_dirty(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	if (unlikely(err)) {
+		// TODO: Remove the -EOPNOTSUPP check later on...
+		if (unlikely(err == -EOPNOTSUPP))
+			err = 0;
+		else if (err == -ENOMEM) {
+			ntfs_warning(vol->sb, "Error allocating memory. "
+					"Redirtying page so we try again "
+					"later.");
+			/*
+			 * Put the page back on mapping->dirty_pages, but
+			 * leave its buffer's dirty state as-is.
+			 */
+			redirty_page_for_writepage(wbc, page);
+			err = 0;
+		} else
+			SetPageError(page);
+	}
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);	/* Keeps try_to_free_buffers() away. */
+
+	/* Submit the prepared buffers for i/o. */
+	need_end_writeback = true;
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(WRITE, bh);
+			need_end_writeback = false;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+
+	/* If no i/o was started, need to end_page_writeback(). */
+	if (unlikely(need_end_writeback))
+		end_page_writeback(page);
+
+	ntfs_debug("Done.");
+	return err;
+}
+
+/**
+ * ntfs_write_mst_block - write a @page to the backing store
+ * @page:	page cache page to write out
+ * @wbc:	writeback control structure
+ *
+ * This function is for writing pages belonging to non-resident, mst protected
+ * attributes to their backing store.  The only supported attributes are index
+ * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
+ * supported for the index allocation case.
+ *
+ * The page must remain locked for the duration of the write because we apply
+ * the mst fixups, write, and then undo the fixups, so if we were to unlock the
+ * page before undoing the fixups, any other user of the page will see the
+ * page contents as corrupt.
+ *
+ * We clear the page uptodate flag for the duration of the function to ensure
+ * exclusion for the $MFT/$DATA case against someone mapping an mft record we
+ * are about to apply the mst fixups to.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Based on ntfs_write_block(), ntfs_mft_writepage(), and
+ * write_mft_record_nolock().
+ */
+static int ntfs_write_mst_block(struct page *page,
+		struct writeback_control *wbc)
+{
+	sector_t block, dblock, rec_block;
+	struct inode *vi = page->mapping->host;
+	ntfs_inode *ni = NTFS_I(vi);
+	ntfs_volume *vol = ni->vol;
+	u8 *kaddr;
+	unsigned int rec_size = ni->itype.index.block_size;
+	ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
+	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
+	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
+	runlist_element *rl;
+	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
+	unsigned bh_size, rec_size_bits;
+	bool sync, is_mft, page_is_dirty, rec_is_dirty;
+	unsigned char bh_size_bits;
+
+	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+			"0x%lx.", vi->i_ino, ni->type, page->index);
+	BUG_ON(!NInoNonResident(ni));
+	BUG_ON(!NInoMstProtected(ni));
+	is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
+	/*
+	 * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
+	 * in its page cache were to be marked dirty.  However this should
+	 * never happen with the current driver and considering we do not
+	 * handle this case here we do want to BUG(), at least for now.
+	 */
+	BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
+			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
+	bh_size = vol->sb->s_blocksize;
+	bh_size_bits = vol->sb->s_blocksize_bits;
+	max_bhs = PAGE_CACHE_SIZE / bh_size;
+	BUG_ON(!max_bhs);
+	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
+
+	/* Were we called for sync purposes? */
+	sync = (wbc->sync_mode == WB_SYNC_ALL);
+
+	/* Make sure we have mapped buffers. */
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
+
+	rec_size_bits = ni->itype.index.block_size_bits;
+	BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
+	bhs_per_rec = rec_size >> bh_size_bits;
+	BUG_ON(!bhs_per_rec);
+
+	/* The first block in the page. */
+	rec_block = block = (sector_t)page->index <<
+			(PAGE_CACHE_SHIFT - bh_size_bits);
+
+	/* The first out of bounds block for the data size. */
+	dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
+
+	rl = NULL;
+	err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
+	page_is_dirty = rec_is_dirty = false;
+	rec_start_bh = NULL;
+	do {
+		bool is_retry = false;
+
+		if (likely(block < rec_block)) {
+			if (unlikely(block >= dblock)) {
+				clear_buffer_dirty(bh);
+				set_buffer_uptodate(bh);
+				continue;
+			}
+			/*
+			 * This block is not the first one in the record.  We
+			 * ignore the buffer's dirty state because we could
+			 * have raced with a parallel mark_ntfs_record_dirty().
+			 */
+			if (!rec_is_dirty)
+				continue;
+			if (unlikely(err2)) {
+				if (err2 != -ENOMEM)
+					clear_buffer_dirty(bh);
+				continue;
+			}
+		} else /* if (block == rec_block) */ {
+			BUG_ON(block > rec_block);
+			/* This block is the first one in the record. */
+			rec_block += bhs_per_rec;
+			err2 = 0;
+			if (unlikely(block >= dblock)) {
+				clear_buffer_dirty(bh);
+				continue;
+			}
+			if (!buffer_dirty(bh)) {
+				/* Clean records are not written out. */
+				rec_is_dirty = false;
+				continue;
+			}
+			rec_is_dirty = true;
+			rec_start_bh = bh;
+		}
+		/* Need to map the buffer if it is not mapped already. */
+		if (unlikely(!buffer_mapped(bh))) {
+			VCN vcn;
+			LCN lcn;
+			unsigned int vcn_ofs;
+
+			bh->b_bdev = vol->sb->s_bdev;
+			/* Obtain the vcn and offset of the current block. */
+			vcn = (VCN)block << bh_size_bits;
+			vcn_ofs = vcn & vol->cluster_size_mask;
+			vcn >>= vol->cluster_size_bits;
+			if (!rl) {
+lock_retry_remap:
+				down_read(&ni->runlist.lock);
+				rl = ni->runlist.rl;
+			}
+			if (likely(rl != NULL)) {
+				/* Seek to element containing target vcn. */
+				while (rl->length && rl[1].vcn <= vcn)
+					rl++;
+				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+			} else
+				lcn = LCN_RL_NOT_MAPPED;
+			/* Successful remap. */
+			if (likely(lcn >= 0)) {
+				/* Setup buffer head to correct block. */
+				bh->b_blocknr = ((lcn <<
+						vol->cluster_size_bits) +
+						vcn_ofs) >> bh_size_bits;
+				set_buffer_mapped(bh);
+			} else {
+				/*
+				 * Remap failed.  Retry to map the runlist once
+				 * unless we are working on $MFT which always
+				 * has the whole of its runlist in memory.
+				 */
+				if (!is_mft && !is_retry &&
+						lcn == LCN_RL_NOT_MAPPED) {
+					is_retry = true;
+					/*
+					 * Attempt to map runlist, dropping
+					 * lock for the duration.
+					 */
+					up_read(&ni->runlist.lock);
+					err2 = ntfs_map_runlist(ni, vcn);
+					if (likely(!err2))
+						goto lock_retry_remap;
+					if (err2 == -ENOMEM)
+						page_is_dirty = true;
+					lcn = err2;
+				} else {
+					err2 = -EIO;
+					if (!rl)
+						up_read(&ni->runlist.lock);
+				}
+				/* Hard error.  Abort writing this record. */
+				if (!err || err == -ENOMEM)
+					err = err2;
+				bh->b_blocknr = -1;
+				ntfs_error(vol->sb, "Cannot write ntfs record "
+						"0x%llx (inode 0x%lx, "
+						"attribute type 0x%x) because "
+						"its location on disk could "
+						"not be determined (error "
+						"code %lli).",
+						(long long)block <<
+						bh_size_bits >>
+						vol->mft_record_size_bits,
+						ni->mft_no, ni->type,
+						(long long)lcn);
+				/*
+				 * If this is not the first buffer, remove the
+				 * buffers in this record from the list of
+				 * buffers to write and clear their dirty bit
+				 * if not error -ENOMEM.
+				 */
+				if (rec_start_bh != bh) {
+					while (bhs[--nr_bhs] != rec_start_bh)
+						;
+					if (err2 != -ENOMEM) {
+						do {
+							clear_buffer_dirty(
+								rec_start_bh);
+						} while ((rec_start_bh =
+								rec_start_bh->
+								b_this_page) !=
+								bh);
+					}
+				}
+				continue;
+			}
+		}
+		BUG_ON(!buffer_uptodate(bh));
+		BUG_ON(nr_bhs >= max_bhs);
+		bhs[nr_bhs++] = bh;
+	} while (block++, (bh = bh->b_this_page) != head);
+	if (unlikely(rl))
+		up_read(&ni->runlist.lock);
+	/* If there were no dirty buffers, we are done. */
+	if (!nr_bhs)
+		goto done;
+	/* Map the page so we can access its contents. */
+	kaddr = kmap(page);
+	/* Clear the page uptodate flag whilst the mst fixups are applied. */
+	BUG_ON(!PageUptodate(page));
+	ClearPageUptodate(page);
+	for (i = 0; i < nr_bhs; i++) {
+		unsigned int ofs;
+
+		/* Skip buffers which are not at the beginning of records. */
+		if (i % bhs_per_rec)
+			continue;
+		tbh = bhs[i];
+		ofs = bh_offset(tbh);
+		if (is_mft) {
+			ntfs_inode *tni;
+			unsigned long mft_no;
+
+			/* Get the mft record number. */
+			mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+					>> rec_size_bits;
+			/* Check whether to write this mft record. */
+			tni = NULL;
+			if (!ntfs_may_write_mft_record(vol, mft_no,
+					(MFT_RECORD*)(kaddr + ofs), &tni)) {
+				/*
+				 * The record should not be written.  This
+				 * means we need to redirty the page before
+				 * returning.
+				 */
+				page_is_dirty = true;
+				/*
+				 * Remove the buffers in this mft record from
+				 * the list of buffers to write.
+				 */
+				do {
+					bhs[i] = NULL;
+				} while (++i % bhs_per_rec);
+				continue;
+			}
+			/*
+			 * The record should be written.  If a locked ntfs
+			 * inode was returned, add it to the array of locked
+			 * ntfs inodes.
+			 */
+			if (tni)
+				locked_nis[nr_locked_nis++] = tni;
+		}
+		/* Apply the mst protection fixups. */
+		err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
+				rec_size);
+		if (unlikely(err2)) {
+			if (!err || err == -ENOMEM)
+				err = -EIO;
+			ntfs_error(vol->sb, "Failed to apply mst fixups "
+					"(inode 0x%lx, attribute type 0x%x, "
+					"page index 0x%lx, page offset 0x%x)!"
+					"  Unmount and run chkdsk.", vi->i_ino,
+					ni->type, page->index, ofs);
+			/*
+			 * Mark all the buffers in this record clean as we do
+			 * not want to write corrupt data to disk.
+			 */
+			do {
+				clear_buffer_dirty(bhs[i]);
+				bhs[i] = NULL;
+			} while (++i % bhs_per_rec);
+			continue;
+		}
+		nr_recs++;
+	}
+	/* If no records are to be written out, we are done. */
+	if (!nr_recs)
+		goto unm_done;
+	flush_dcache_page(page);
+	/* Lock buffers and start synchronous write i/o on them. */
+	for (i = 0; i < nr_bhs; i++) {
+		tbh = bhs[i];
+		if (!tbh)
+			continue;
+		if (!trylock_buffer(tbh))
+			BUG();
+		/* The buffer dirty state is now irrelevant, just clean it. */
+		clear_buffer_dirty(tbh);
+		BUG_ON(!buffer_uptodate(tbh));
+		BUG_ON(!buffer_mapped(tbh));
+		get_bh(tbh);
+		tbh->b_end_io = end_buffer_write_sync;
+		submit_bh(WRITE, tbh);
+	}
+	/* Synchronize the mft mirror now if not @sync. */
+	if (is_mft && !sync)
+		goto do_mirror;
+do_wait:
+	/* Wait on i/o completion of buffers. */
+	for (i = 0; i < nr_bhs; i++) {
+		tbh = bhs[i];
+		if (!tbh)
+			continue;
+		wait_on_buffer(tbh);
+		if (unlikely(!buffer_uptodate(tbh))) {
+			ntfs_error(vol->sb, "I/O error while writing ntfs "
+					"record buffer (inode 0x%lx, "
+					"attribute type 0x%x, page index "
+					"0x%lx, page offset 0x%lx)!  Unmount "
+					"and run chkdsk.", vi->i_ino, ni->type,
+					page->index, bh_offset(tbh));
+			if (!err || err == -ENOMEM)
+				err = -EIO;
+			/*
+			 * Set the buffer uptodate so the page and buffer
+			 * states do not become out of sync.
+			 */
+			set_buffer_uptodate(tbh);
+		}
+	}
+	/* If @sync, now synchronize the mft mirror. */
+	if (is_mft && sync) {
+do_mirror:
+		for (i = 0; i < nr_bhs; i++) {
+			unsigned long mft_no;
+			unsigned int ofs;
+
+			/*
+			 * Skip buffers which are not at the beginning of
+			 * records.
+			 */
+			if (i % bhs_per_rec)
+				continue;
+			tbh = bhs[i];
+			/* Skip removed buffers (and hence records). */
+			if (!tbh)
+				continue;
+			ofs = bh_offset(tbh);
+			/* Get the mft record number. */
+			mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+					>> rec_size_bits;
+			if (mft_no < vol->mftmirr_size)
+				ntfs_sync_mft_mirror(vol, mft_no,
+						(MFT_RECORD*)(kaddr + ofs),
+						sync);
+		}
+		if (!sync)
+			goto do_wait;
+	}
+	/* Remove the mst protection fixups again. */
+	for (i = 0; i < nr_bhs; i++) {
+		if (!(i % bhs_per_rec)) {
+			tbh = bhs[i];
+			if (!tbh)
+				continue;
+			post_write_mst_fixup((NTFS_RECORD*)(kaddr +
+					bh_offset(tbh)));
+		}
+	}
+	flush_dcache_page(page);
+unm_done:
+	/* Unlock any locked inodes. */
+	while (nr_locked_nis-- > 0) {
+		ntfs_inode *tni, *base_tni;
+		
+		tni = locked_nis[nr_locked_nis];
+		/* Get the base inode. */
+		mutex_lock(&tni->extent_lock);
+		if (tni->nr_extents >= 0)
+			base_tni = tni;
+		else {
+			base_tni = tni->ext.base_ntfs_ino;
+			BUG_ON(!base_tni);
+		}
+		mutex_unlock(&tni->extent_lock);
+		ntfs_debug("Unlocking %s inode 0x%lx.",
+				tni == base_tni ? "base" : "extent",
+				tni->mft_no);
+		mutex_unlock(&tni->mrec_lock);
+		atomic_dec(&tni->count);
+		iput(VFS_I(base_tni));
+	}
+	SetPageUptodate(page);
+	kunmap(page);
+done:
+	if (unlikely(err && err != -ENOMEM)) {
+		/*
+		 * Set page error if there is only one ntfs record in the page.
+		 * Otherwise we would loose per-record granularity.
+		 */
+		if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
+			SetPageError(page);
+		NVolSetErrors(vol);
+	}
+	if (page_is_dirty) {
+		ntfs_debug("Page still contains one or more dirty ntfs "
+				"records.  Redirtying the page starting at "
+				"record 0x%lx.", page->index <<
+				(PAGE_CACHE_SHIFT - rec_size_bits));
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+	} else {
+		/*
+		 * Keep the VM happy.  This must be done otherwise the
+		 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
+		 * the page is clean.
+		 */
+		BUG_ON(PageWriteback(page));
+		set_page_writeback(page);
+		unlock_page(page);
+		end_page_writeback(page);
+	}
+	if (likely(!err))
+		ntfs_debug("Done.");
+	return err;
+}
+
+/**
+ * ntfs_writepage - write a @page to the backing store
+ * @page:	page cache page to write out
+ * @wbc:	writeback control structure
+ *
+ * This is called from the VM when it wants to have a dirty ntfs page cache
+ * page cleaned.  The VM has already locked the page and marked it clean.
+ *
+ * For non-resident attributes, ntfs_writepage() writes the @page by calling
+ * the ntfs version of the generic block_write_full_page() function,
+ * ntfs_write_block(), which in turn if necessary creates and writes the
+ * buffers associated with the page asynchronously.
+ *
+ * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
+ * the data to the mft record (which at this stage is most likely in memory).
+ * The mft record is then marked dirty and written out asynchronously via the
+ * vfs inode dirty code path for the inode the mft record belongs to or via the
+ * vm page dirty code path for the page the mft record is in.
+ *
+ * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	loff_t i_size;
+	struct inode *vi = page->mapping->host;
+	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
+	char *addr;
+	ntfs_attr_search_ctx *ctx = NULL;
+	MFT_RECORD *m = NULL;
+	u32 attr_len;
+	int err;
+
+retry_writepage:
+	BUG_ON(!PageLocked(page));
+	i_size = i_size_read(vi);
+	/* Is the page fully outside i_size? (truncate in progress) */
+	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
+			PAGE_CACHE_SHIFT)) {
+		/*
+		 * The page may have dirty, unmapped buffers.  Make them
+		 * freeable here, so the page does not leak.
+		 */
+		block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		unlock_page(page);
+		ntfs_debug("Write outside i_size - truncated?");
+		return 0;
+	}
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.  Note we need to check for
+	 * AT_INDEX_ALLOCATION since this is the type of both directory and
+	 * index inodes.
+	 */
+	if (ni->type != AT_INDEX_ALLOCATION) {
+		/* If file is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			unlock_page(page);
+			BUG_ON(ni->type != AT_DATA);
+			ntfs_debug("Denying write access to encrypted file.");
+			return -EACCES;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			// TODO: Implement and replace this with
+			// return ntfs_write_compressed_block(page);
+			unlock_page(page);
+			ntfs_error(vi->i_sb, "Writing to compressed files is "
+					"not supported yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+		// TODO: Implement and remove this check.
+		if (NInoNonResident(ni) && NInoSparse(ni)) {
+			unlock_page(page);
+			ntfs_error(vi->i_sb, "Writing to sparse files is not "
+					"supported yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+	}
+	/* NInoNonResident() == NInoIndexAllocPresent() */
+	if (NInoNonResident(ni)) {
+		/* We have to zero every time due to mmap-at-end-of-file. */
+		if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
+			/* The page straddles i_size. */
+			unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
+			zero_user_segment(page, ofs, PAGE_CACHE_SIZE);
+		}
+		/* Handle mst protected attributes. */
+		if (NInoMstProtected(ni))
+			return ntfs_write_mst_block(page, wbc);
+		/* Normal, non-resident data stream. */
+		return ntfs_write_block(page, wbc);
+	}
+	/*
+	 * Attribute is resident, implying it is not compressed, encrypted, or
+	 * mst protected.  This also means the attribute is smaller than an mft
+	 * record and hence smaller than a page, so can simply return error on
+	 * any pages with index above 0.  Note the attribute can actually be
+	 * marked compressed but if it is resident the actual data is not
+	 * compressed so we are ok to ignore the compressed flag here.
+	 */
+	BUG_ON(page_has_buffers(page));
+	BUG_ON(!PageUptodate(page));
+	if (unlikely(page->index > 0)) {
+		ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0.  "
+				"Aborting write.", page->index);
+		BUG_ON(PageWriteback(page));
+		set_page_writeback(page);
+		unlock_page(page);
+		end_page_writeback(page);
+		return -EIO;
+	}
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/* Map, pin, and lock the mft record. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	/*
+	 * If a parallel write made the attribute non-resident, drop the mft
+	 * record and retry the writepage.
+	 */
+	if (unlikely(NInoNonResident(ni))) {
+		unmap_mft_record(base_ni);
+		goto retry_writepage;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err))
+		goto err_out;
+	/*
+	 * Keep the VM happy.  This must be done otherwise the radix-tree tag
+	 * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
+	 */
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
+	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
+	i_size = i_size_read(vi);
+	if (unlikely(attr_len > i_size)) {
+		/* Race with shrinking truncate or a failed truncate. */
+		attr_len = i_size;
+		/*
+		 * If the truncate failed, fix it up now.  If a concurrent
+		 * truncate, we do its job, so it does not have to do anything.
+		 */
+		err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
+				attr_len);
+		/* Shrinking cannot fail. */
+		BUG_ON(err);
+	}
+	addr = kmap_atomic(page);
+	/* Copy the data from the page to the mft record. */
+	memcpy((u8*)ctx->attr +
+			le16_to_cpu(ctx->attr->data.resident.value_offset),
+			addr, attr_len);
+	/* Zero out of bounds area in the page cache page. */
+	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	kunmap_atomic(addr);
+	flush_dcache_page(page);
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	/* We are done with the page. */
+	end_page_writeback(page);
+	/* Finally, mark the mft record dirty, so it gets written back. */
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	return 0;
+err_out:
+	if (err == -ENOMEM) {
+		ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
+				"page so we try again later.");
+		/*
+		 * Put the page back on mapping->dirty_pages, but leave its
+		 * buffers' dirty state as-is.
+		 */
+		redirty_page_for_writepage(wbc, page);
+		err = 0;
+	} else {
+		ntfs_error(vi->i_sb, "Resident attribute write failed with "
+				"error %i.", err);
+		SetPageError(page);
+		NVolSetErrors(ni->vol);
+	}
+	unlock_page(page);
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	return err;
+}
+
+#endif	/* NTFS_RW */
+
+/**
+ * ntfs_bmap - map logical file block to physical device block
+ * @mapping:	address space mapping to which the block to be mapped belongs
+ * @block:	logical block to map to its physical device block
+ *
+ * For regular, non-resident files (i.e. not compressed and not encrypted), map
+ * the logical @block belonging to the file described by the address space
+ * mapping @mapping to its physical device block.
+ *
+ * The size of the block is equal to the @s_blocksize field of the super block
+ * of the mounted file system which is guaranteed to be smaller than or equal
+ * to the cluster size thus the block is guaranteed to fit entirely inside the
+ * cluster which means we do not need to care how many contiguous bytes are
+ * available after the beginning of the block.
+ *
+ * Return the physical device block if the mapping succeeded or 0 if the block
+ * is sparse or there was an error.
+ *
+ * Note: This is a problem if someone tries to run bmap() on $Boot system file
+ * as that really is in block zero but there is nothing we can do.  bmap() is
+ * just broken in that respect (just like it cannot distinguish sparse from
+ * not available or error).
+ */
+static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
+{
+	s64 ofs, size;
+	loff_t i_size;
+	LCN lcn;
+	unsigned long blocksize, flags;
+	ntfs_inode *ni = NTFS_I(mapping->host);
+	ntfs_volume *vol = ni->vol;
+	unsigned delta;
+	unsigned char blocksize_bits, cluster_size_shift;
+
+	ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
+			ni->mft_no, (unsigned long long)block);
+	if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
+		ntfs_error(vol->sb, "BMAP does not make sense for %s "
+				"attributes, returning 0.",
+				(ni->type != AT_DATA) ? "non-data" :
+				(!NInoNonResident(ni) ? "resident" :
+				"encrypted"));
+		return 0;
+	}
+	/* None of these can happen. */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoMstProtected(ni));
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
+	ofs = (s64)block << blocksize_bits;
+	read_lock_irqsave(&ni->size_lock, flags);
+	size = ni->initialized_size;
+	i_size = i_size_read(VFS_I(ni));
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * If the offset is outside the initialized size or the block straddles
+	 * the initialized size then pretend it is a hole unless the
+	 * initialized size equals the file size.
+	 */
+	if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
+		goto hole;
+	cluster_size_shift = vol->cluster_size_bits;
+	down_read(&ni->runlist.lock);
+	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
+	up_read(&ni->runlist.lock);
+	if (unlikely(lcn < LCN_HOLE)) {
+		/*
+		 * Step down to an integer to avoid gcc doing a long long
+		 * comparision in the switch when we know @lcn is between
+		 * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
+		 *
+		 * Otherwise older gcc (at least on some architectures) will
+		 * try to use __cmpdi2() which is of course not available in
+		 * the kernel.
+		 */
+		switch ((int)lcn) {
+		case LCN_ENOENT:
+			/*
+			 * If the offset is out of bounds then pretend it is a
+			 * hole.
+			 */
+			goto hole;
+		case LCN_ENOMEM:
+			ntfs_error(vol->sb, "Not enough memory to complete "
+					"mapping for inode 0x%lx.  "
+					"Returning 0.", ni->mft_no);
+			break;
+		default:
+			ntfs_error(vol->sb, "Failed to complete mapping for "
+					"inode 0x%lx.  Run chkdsk.  "
+					"Returning 0.", ni->mft_no);
+			break;
+		}
+		return 0;
+	}
+	if (lcn < 0) {
+		/* It is a hole. */
+hole:
+		ntfs_debug("Done (returning hole).");
+		return 0;
+	}
+	/*
+	 * The block is really allocated and fullfils all our criteria.
+	 * Convert the cluster to units of block size and return the result.
+	 */
+	delta = ofs & vol->cluster_size_mask;
+	if (unlikely(sizeof(block) < sizeof(lcn))) {
+		block = lcn = ((lcn << cluster_size_shift) + delta) >>
+				blocksize_bits;
+		/* If the block number was truncated return 0. */
+		if (unlikely(block != lcn)) {
+			ntfs_error(vol->sb, "Physical block 0x%llx is too "
+					"large to be returned, returning 0.",
+					(long long)lcn);
+			return 0;
+		}
+	} else
+		block = ((lcn << cluster_size_shift) + delta) >>
+				blocksize_bits;
+	ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
+	return block;
+}
+
+/**
+ * ntfs_normal_aops - address space operations for normal inodes and attributes
+ *
+ * Note these are not used for compressed or mst protected inodes and
+ * attributes.
+ */
+const struct address_space_operations ntfs_normal_aops = {
+	.readpage	= ntfs_readpage,
+#ifdef NTFS_RW
+	.writepage	= ntfs_writepage,
+	.set_page_dirty	= __set_page_dirty_buffers,
+#endif /* NTFS_RW */
+	.bmap		= ntfs_bmap,
+	.migratepage	= buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
+};
+
+/**
+ * ntfs_compressed_aops - address space operations for compressed inodes
+ */
+const struct address_space_operations ntfs_compressed_aops = {
+	.readpage	= ntfs_readpage,
+#ifdef NTFS_RW
+	.writepage	= ntfs_writepage,
+	.set_page_dirty	= __set_page_dirty_buffers,
+#endif /* NTFS_RW */
+	.migratepage	= buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
+};
+
+/**
+ * ntfs_mst_aops - general address space operations for mst protecteed inodes
+ *		   and attributes
+ */
+const struct address_space_operations ntfs_mst_aops = {
+	.readpage	= ntfs_readpage,	/* Fill page with data. */
+#ifdef NTFS_RW
+	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
+	.set_page_dirty	= __set_page_dirty_nobuffers,	/* Set the page dirty
+						   without touching the buffers
+						   belonging to the page. */
+#endif /* NTFS_RW */
+	.migratepage	= buffer_migrate_page,
+	.is_partially_uptodate	= block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
+};
+
+#ifdef NTFS_RW
+
+/**
+ * mark_ntfs_record_dirty - mark an ntfs record dirty
+ * @page:	page containing the ntfs record to mark dirty
+ * @ofs:	byte offset within @page at which the ntfs record begins
+ *
+ * Set the buffers and the page in which the ntfs record is located dirty.
+ *
+ * The latter also marks the vfs inode the ntfs record belongs to dirty
+ * (I_DIRTY_PAGES only).
+ *
+ * If the page does not have buffers, we create them and set them uptodate.
+ * The page may not be locked which is why we need to handle the buffers under
+ * the mapping->private_lock.  Once the buffers are marked dirty we no longer
+ * need the lock since try_to_free_buffers() does not free dirty buffers.
+ */
+void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
+	struct address_space *mapping = page->mapping;
+	ntfs_inode *ni = NTFS_I(mapping->host);
+	struct buffer_head *bh, *head, *buffers_to_free = NULL;
+	unsigned int end, bh_size, bh_ofs;
+
+	BUG_ON(!PageUptodate(page));
+	end = ofs + ni->itype.index.block_size;
+	bh_size = VFS_I(ni)->i_sb->s_blocksize;
+	spin_lock(&mapping->private_lock);
+	if (unlikely(!page_has_buffers(page))) {
+		spin_unlock(&mapping->private_lock);
+		bh = head = alloc_page_buffers(page, bh_size, 1);
+		spin_lock(&mapping->private_lock);
+		if (likely(!page_has_buffers(page))) {
+			struct buffer_head *tail;
+
+			do {
+				set_buffer_uptodate(bh);
+				tail = bh;
+				bh = bh->b_this_page;
+			} while (bh);
+			tail->b_this_page = head;
+			attach_page_buffers(page, head);
+		} else
+			buffers_to_free = bh;
+	}
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
+	do {
+		bh_ofs = bh_offset(bh);
+		if (bh_ofs + bh_size <= ofs)
+			continue;
+		if (unlikely(bh_ofs >= end))
+			break;
+		set_buffer_dirty(bh);
+	} while ((bh = bh->b_this_page) != head);
+	spin_unlock(&mapping->private_lock);
+	__set_page_dirty_nobuffers(page);
+	if (unlikely(buffers_to_free)) {
+		do {
+			bh = buffers_to_free->b_this_page;
+			free_buffer_head(buffers_to_free);
+			buffers_to_free = bh;
+		} while (buffers_to_free);
+	}
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/aops.h b/kernel/fs/ntfs/aops.h
new file mode 100644
index 000000000..caecc58f5
--- /dev/null
+++ b/kernel/fs/ntfs/aops.h
@@ -0,0 +1,107 @@
+/**
+ * aops.h - Defines for NTFS kernel address space operations and page cache
+ *	    handling.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_AOPS_H
+#define _LINUX_NTFS_AOPS_H
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/fs.h>
+
+#include "inode.h"
+
+/**
+ * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
+ * @page:	the page to release
+ *
+ * Unpin, unmap and release a page that was obtained from ntfs_map_page().
+ */
+static inline void ntfs_unmap_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+/**
+ * ntfs_map_page - map a page into accessible memory, reading it if necessary
+ * @mapping:	address space for which to obtain the page
+ * @index:	index into the page cache for @mapping of the page to map
+ *
+ * Read a page from the page cache of the address space @mapping at position
+ * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes.
+ *
+ * If the page is not in memory it is loaded from disk first using the readpage
+ * method defined in the address space operations of @mapping and the page is
+ * added to the page cache of @mapping in the process.
+ *
+ * If the page belongs to an mst protected attribute and it is marked as such
+ * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
+ * error checking is performed.  This means the caller has to verify whether
+ * the ntfs record(s) contained in the page are valid or not using one of the
+ * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are
+ * expecting to see.  (For details of the macros, see fs/ntfs/layout.h.)
+ *
+ * If the page is in high memory it is mapped into memory directly addressible
+ * by the kernel.
+ *
+ * Finally the page count is incremented, thus pinning the page into place.
+ *
+ * The above means that page_address(page) can be used on all pages obtained
+ * with ntfs_map_page() to get the kernel virtual address of the page.
+ *
+ * When finished with the page, the caller has to call ntfs_unmap_page() to
+ * unpin, unmap and release the page.
+ *
+ * Note this does not grant exclusive access. If such is desired, the caller
+ * must provide it independently of the ntfs_{un}map_page() calls by using
+ * a {rw_}semaphore or other means of serialization. A spin lock cannot be
+ * used as ntfs_map_page() can block.
+ *
+ * The unlocked and uptodate page is returned on success or an encoded error
+ * on failure. Caller has to test for error using the IS_ERR() macro on the
+ * return value. If that evaluates to 'true', the negative error code can be
+ * obtained using PTR_ERR() on the return value of ntfs_map_page().
+ */
+static inline struct page *ntfs_map_page(struct address_space *mapping,
+		unsigned long index)
+{
+	struct page *page = read_mapping_page(mapping, index, NULL);
+
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageError(page))
+			return page;
+		ntfs_unmap_page(page);
+		return ERR_PTR(-EIO);
+	}
+	return page;
+}
+
+#ifdef NTFS_RW
+
+extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_AOPS_H */
diff --git a/kernel/fs/ntfs/attrib.c b/kernel/fs/ntfs/attrib.c
new file mode 100644
index 000000000..250ed5b20
--- /dev/null
+++ b/kernel/fs/ntfs/attrib.c
@@ -0,0 +1,2614 @@
+/**
+ * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+
+#include "attrib.h"
+#include "debug.h"
+#include "layout.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "ntfs.h"
+#include "types.h"
+
+/**
+ * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
+ * @ni:		ntfs inode for which to map (part of) a runlist
+ * @vcn:	map runlist part containing this vcn
+ * @ctx:	active attribute search context if present or NULL if not
+ *
+ * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record.  This is needed when ntfs_map_runlist_nolock() encounters unmapped
+ * runlist fragments and allows their mapping.  If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
+ * restores it before returning.  Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry.  However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_map_runlist_nolock(), you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * Return 0 on success and -errno on error.  There is one special error code
+ * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
+ * of bounds of the runlist.
+ *
+ * Note the runlist can be NULL after this function returns if @vcn is zero and
+ * the attribute has zero allocated size, i.e. there simply is no runlist.
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist will be modified.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
+ */
+int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
+{
+	VCN end_vcn;
+	unsigned long flags;
+	ntfs_inode *base_ni;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	runlist_element *rl;
+	struct page *put_this_page = NULL;
+	int err = 0;
+	bool ctx_is_temporary, ctx_needs_reset;
+	ntfs_attr_search_ctx old_ctx = { NULL, };
+
+	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
+			(unsigned long long)vcn);
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	if (!ctx) {
+		ctx_is_temporary = ctx_needs_reset = true;
+		m = map_mft_record(base_ni);
+		if (IS_ERR(m))
+			return PTR_ERR(m);
+		ctx = ntfs_attr_get_search_ctx(base_ni, m);
+		if (unlikely(!ctx)) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+	} else {
+		VCN allocated_size_vcn;
+
+		BUG_ON(IS_ERR(ctx->mrec));
+		a = ctx->attr;
+		BUG_ON(!a->non_resident);
+		ctx_is_temporary = false;
+		end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		read_lock_irqsave(&ni->size_lock, flags);
+		allocated_size_vcn = ni->allocated_size >>
+				ni->vol->cluster_size_bits;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
+			end_vcn = allocated_size_vcn - 1;
+		/*
+		 * If we already have the attribute extent containing @vcn in
+		 * @ctx, no need to look it up again.  We slightly cheat in
+		 * that if vcn exceeds the allocated size, we will refuse to
+		 * map the runlist below, so there is definitely no need to get
+		 * the right attribute extent.
+		 */
+		if (vcn >= allocated_size_vcn || (a->type == ni->type &&
+				a->name_length == ni->name_len &&
+				!memcmp((u8*)a + le16_to_cpu(a->name_offset),
+				ni->name, ni->name_len) &&
+				sle64_to_cpu(a->data.non_resident.lowest_vcn)
+				<= vcn && end_vcn >= vcn))
+			ctx_needs_reset = false;
+		else {
+			/* Save the old search context. */
+			old_ctx = *ctx;
+			/*
+			 * If the currently mapped (extent) inode is not the
+			 * base inode we will unmap it when we reinitialize the
+			 * search context which means we need to get a
+			 * reference to the page containing the mapped mft
+			 * record so we do not accidentally drop changes to the
+			 * mft record when it has not been marked dirty yet.
+			 */
+			if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
+					old_ctx.base_ntfs_ino) {
+				put_this_page = old_ctx.ntfs_ino->page;
+				page_cache_get(put_this_page);
+			}
+			/*
+			 * Reinitialize the search context so we can lookup the
+			 * needed attribute extent.
+			 */
+			ntfs_attr_reinit_search_ctx(ctx);
+			ctx_needs_reset = true;
+		}
+	}
+	if (ctx_needs_reset) {
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, vcn, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT)
+				err = -EIO;
+			goto err_out;
+		}
+		BUG_ON(!ctx->attr->non_resident);
+	}
+	a = ctx->attr;
+	/*
+	 * Only decompress the mapping pairs if @vcn is inside it.  Otherwise
+	 * we get into problems when we try to map an out of bounds vcn because
+	 * we then try to map the already mapped runlist fragment and
+	 * ntfs_mapping_pairs_decompress() fails.
+	 */
+	end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
+	if (unlikely(vcn && vcn >= end_vcn)) {
+		err = -ENOENT;
+		goto err_out;
+	}
+	rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl);
+	if (IS_ERR(rl))
+		err = PTR_ERR(rl);
+	else
+		ni->runlist.rl = rl;
+err_out:
+	if (ctx_is_temporary) {
+		if (likely(ctx))
+			ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+	} else if (ctx_needs_reset) {
+		/*
+		 * If there is no attribute list, restoring the search context
+		 * is accomplished simply by copying the saved context back over
+		 * the caller supplied context.  If there is an attribute list,
+		 * things are more complicated as we need to deal with mapping
+		 * of mft records and resulting potential changes in pointers.
+		 */
+		if (NInoAttrList(base_ni)) {
+			/*
+			 * If the currently mapped (extent) inode is not the
+			 * one we had before, we need to unmap it and map the
+			 * old one.
+			 */
+			if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
+				/*
+				 * If the currently mapped inode is not the
+				 * base inode, unmap it.
+				 */
+				if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
+						ctx->base_ntfs_ino) {
+					unmap_extent_mft_record(ctx->ntfs_ino);
+					ctx->mrec = ctx->base_mrec;
+					BUG_ON(!ctx->mrec);
+				}
+				/*
+				 * If the old mapped inode is not the base
+				 * inode, map it.
+				 */
+				if (old_ctx.base_ntfs_ino &&
+						old_ctx.ntfs_ino !=
+						old_ctx.base_ntfs_ino) {
+retry_map:
+					ctx->mrec = map_mft_record(
+							old_ctx.ntfs_ino);
+					/*
+					 * Something bad has happened.  If out
+					 * of memory retry till it succeeds.
+					 * Any other errors are fatal and we
+					 * return the error code in ctx->mrec.
+					 * Let the caller deal with it...  We
+					 * just need to fudge things so the
+					 * caller can reinit and/or put the
+					 * search context safely.
+					 */
+					if (IS_ERR(ctx->mrec)) {
+						if (PTR_ERR(ctx->mrec) ==
+								-ENOMEM) {
+							schedule();
+							goto retry_map;
+						} else
+							old_ctx.ntfs_ino =
+								old_ctx.
+								base_ntfs_ino;
+					}
+				}
+			}
+			/* Update the changed pointers in the saved context. */
+			if (ctx->mrec != old_ctx.mrec) {
+				if (!IS_ERR(ctx->mrec))
+					old_ctx.attr = (ATTR_RECORD*)(
+							(u8*)ctx->mrec +
+							((u8*)old_ctx.attr -
+							(u8*)old_ctx.mrec));
+				old_ctx.mrec = ctx->mrec;
+			}
+		}
+		/* Restore the search context to the saved one. */
+		*ctx = old_ctx;
+		/*
+		 * We drop the reference on the page we took earlier.  In the
+		 * case that IS_ERR(ctx->mrec) is true this means we might lose
+		 * some changes to the mft record that had been made between
+		 * the last time it was marked dirty/written out and now.  This
+		 * at this stage is not a problem as the mapping error is fatal
+		 * enough that the mft record cannot be written out anyway and
+		 * the caller is very likely to shutdown the whole inode
+		 * immediately and mark the volume dirty for chkdsk to pick up
+		 * the pieces anyway.
+		 */
+		if (put_this_page)
+			page_cache_release(put_this_page);
+	}
+	return err;
+}
+
+/**
+ * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
+ * @ni:		ntfs inode for which to map (part of) a runlist
+ * @vcn:	map runlist part containing this vcn
+ *
+ * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
+ *
+ * Return 0 on success and -errno on error.  There is one special error code
+ * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
+ * of bounds of the runlist.
+ *
+ * Locking: - The runlist must be unlocked on entry and is unlocked on return.
+ *	    - This function takes the runlist lock for writing and may modify
+ *	      the runlist.
+ */
+int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
+{
+	int err = 0;
+
+	down_write(&ni->runlist.lock);
+	/* Make sure someone else didn't do the work while we were sleeping. */
+	if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
+			LCN_RL_NOT_MAPPED))
+		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
+	up_write(&ni->runlist.lock);
+	return err;
+}
+
+/**
+ * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode
+ * @ni:			ntfs inode of the attribute whose runlist to search
+ * @vcn:		vcn to convert
+ * @write_locked:	true if the runlist is locked for writing
+ *
+ * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
+ * described by the ntfs inode @ni and return the corresponding logical cluster
+ * number (lcn).
+ *
+ * If the @vcn is not mapped yet, the attempt is made to map the attribute
+ * extent containing the @vcn and the vcn to lcn conversion is retried.
+ *
+ * If @write_locked is true the caller has locked the runlist for writing and
+ * if false for reading.
+ *
+ * Since lcns must be >= 0, we use negative return codes with special meaning:
+ *
+ * Return code	Meaning / Description
+ * ==========================================
+ *  LCN_HOLE	Hole / not allocated on disk.
+ *  LCN_ENOENT	There is no such vcn in the runlist, i.e. @vcn is out of bounds.
+ *  LCN_ENOMEM	Not enough memory to map runlist.
+ *  LCN_EIO	Critical error (runlist/file is corrupt, i/o error, etc).
+ *
+ * Locking: - The runlist must be locked on entry and is left locked on return.
+ *	    - If @write_locked is 'false', i.e. the runlist is locked for reading,
+ *	      the lock may be dropped inside the function so you cannot rely on
+ *	      the runlist still being the same when this function returns.
+ */
+LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
+		const bool write_locked)
+{
+	LCN lcn;
+	unsigned long flags;
+	bool is_retry = false;
+
+	BUG_ON(!ni);
+	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
+			ni->mft_no, (unsigned long long)vcn,
+			write_locked ? "write" : "read");
+	BUG_ON(!NInoNonResident(ni));
+	BUG_ON(vcn < 0);
+	if (!ni->runlist.rl) {
+		read_lock_irqsave(&ni->size_lock, flags);
+		if (!ni->allocated_size) {
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			return LCN_ENOENT;
+		}
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
+retry_remap:
+	/* Convert vcn to lcn.  If that fails map the runlist and retry once. */
+	lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
+	if (likely(lcn >= LCN_HOLE)) {
+		ntfs_debug("Done, lcn 0x%llx.", (long long)lcn);
+		return lcn;
+	}
+	if (lcn != LCN_RL_NOT_MAPPED) {
+		if (lcn != LCN_ENOENT)
+			lcn = LCN_EIO;
+	} else if (!is_retry) {
+		int err;
+
+		if (!write_locked) {
+			up_read(&ni->runlist.lock);
+			down_write(&ni->runlist.lock);
+			if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
+					LCN_RL_NOT_MAPPED)) {
+				up_write(&ni->runlist.lock);
+				down_read(&ni->runlist.lock);
+				goto retry_remap;
+			}
+		}
+		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
+		if (!write_locked) {
+			up_write(&ni->runlist.lock);
+			down_read(&ni->runlist.lock);
+		}
+		if (likely(!err)) {
+			is_retry = true;
+			goto retry_remap;
+		}
+		if (err == -ENOENT)
+			lcn = LCN_ENOENT;
+		else if (err == -ENOMEM)
+			lcn = LCN_ENOMEM;
+		else
+			lcn = LCN_EIO;
+	}
+	if (lcn != LCN_ENOENT)
+		ntfs_error(ni->vol->sb, "Failed with error code %lli.",
+				(long long)lcn);
+	return lcn;
+}
+
+/**
+ * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
+ * @ni:		ntfs inode describing the runlist to search
+ * @vcn:	vcn to find
+ * @ctx:	active attribute search context if present or NULL if not
+ *
+ * Find the virtual cluster number @vcn in the runlist described by the ntfs
+ * inode @ni and return the address of the runlist element containing the @vcn.
+ *
+ * If the @vcn is not mapped yet, the attempt is made to map the attribute
+ * extent containing the @vcn and the vcn to lcn conversion is retried.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record.  This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
+ * runlist fragments and allows their mapping.  If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
+ * restores it before returning.  Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry.  However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_attr_find_vcn_nolock(), you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ * Note you need to distinguish between the lcn of the returned runlist element
+ * being >= 0 and LCN_HOLE.  In the later case you have to return zeroes on
+ * read and allocate clusters on write.
+ *
+ * Return the runlist element containing the @vcn on success and
+ * ERR_PTR(-errno) on error.  You need to test the return value with IS_ERR()
+ * to decide if the return is success or failure and PTR_ERR() to get to the
+ * error code if IS_ERR() is true.
+ *
+ * The possible error return codes are:
+ *	-ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds.
+ *	-ENOMEM - Not enough memory to map runlist.
+ *	-EIO	- Critical error (runlist/file is corrupt, i/o error, etc).
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist may be modified when
+ *	      needed runlist fragments need to be mapped.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
+ */
+runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
+		ntfs_attr_search_ctx *ctx)
+{
+	unsigned long flags;
+	runlist_element *rl;
+	int err = 0;
+	bool is_retry = false;
+
+	BUG_ON(!ni);
+	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
+			ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
+	BUG_ON(!NInoNonResident(ni));
+	BUG_ON(vcn < 0);
+	if (!ni->runlist.rl) {
+		read_lock_irqsave(&ni->size_lock, flags);
+		if (!ni->allocated_size) {
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			return ERR_PTR(-ENOENT);
+		}
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
+retry_remap:
+	rl = ni->runlist.rl;
+	if (likely(rl && vcn >= rl[0].vcn)) {
+		while (likely(rl->length)) {
+			if (unlikely(vcn < rl[1].vcn)) {
+				if (likely(rl->lcn >= LCN_HOLE)) {
+					ntfs_debug("Done.");
+					return rl;
+				}
+				break;
+			}
+			rl++;
+		}
+		if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) {
+			if (likely(rl->lcn == LCN_ENOENT))
+				err = -ENOENT;
+			else
+				err = -EIO;
+		}
+	}
+	if (!err && !is_retry) {
+		/*
+		 * If the search context is invalid we cannot map the unmapped
+		 * region.
+		 */
+		if (IS_ERR(ctx->mrec))
+			err = PTR_ERR(ctx->mrec);
+		else {
+			/*
+			 * The @vcn is in an unmapped region, map the runlist
+			 * and retry.
+			 */
+			err = ntfs_map_runlist_nolock(ni, vcn, ctx);
+			if (likely(!err)) {
+				is_retry = true;
+				goto retry_remap;
+			}
+		}
+		if (err == -EINVAL)
+			err = -EIO;
+	} else if (!err)
+		err = -EIO;
+	if (err != -ENOENT)
+		ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
+	return ERR_PTR(err);
+}
+
+/**
+ * ntfs_attr_find - find (next) attribute in mft record
+ * @type:	attribute type to find
+ * @name:	attribute name to find (optional, i.e. NULL means don't care)
+ * @name_len:	attribute name length (only needed if @name present)
+ * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
+ * @val:	attribute value to find (optional, resident attributes only)
+ * @val_len:	attribute value length
+ * @ctx:	search context with mft record and attribute to search from
+ *
+ * You should not need to call this function directly.  Use ntfs_attr_lookup()
+ * instead.
+ *
+ * ntfs_attr_find() takes a search context @ctx as parameter and searches the
+ * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an
+ * attribute of @type, optionally @name and @val.
+ *
+ * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will
+ * point to the found attribute.
+ *
+ * If the attribute is not found, ntfs_attr_find() returns -ENOENT and
+ * @ctx->attr will point to the attribute before which the attribute being
+ * searched for would need to be inserted if such an action were to be desired.
+ *
+ * On actual error, ntfs_attr_find() returns -EIO.  In this case @ctx->attr is
+ * undefined and in particular do not rely on it not changing.
+ *
+ * If @ctx->is_first is 'true', the search begins with @ctx->attr itself.  If it
+ * is 'false', the search begins after @ctx->attr.
+ *
+ * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and
+ * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
+ * @ctx->mrec belongs.  This is so we can get at the ntfs volume and hence at
+ * the upcase table.  If @ic is CASE_SENSITIVE, the comparison is case
+ * sensitive.  When @name is present, @name_len is the @name length in Unicode
+ * characters.
+ *
+ * If @name is not present (NULL), we assume that the unnamed attribute is
+ * being searched for.
+ *
+ * Finally, the resident attribute value @val is looked for, if present.  If
+ * @val is not present (NULL), @val_len is ignored.
+ *
+ * ntfs_attr_find() only searches the specified mft record and it ignores the
+ * presence of an attribute list attribute (unless it is the one being searched
+ * for, obviously).  If you need to take attribute lists into consideration,
+ * use ntfs_attr_lookup() instead (see below).  This also means that you cannot
+ * use ntfs_attr_find() to search for extent records of non-resident
+ * attributes, as extents with lowest_vcn != 0 are usually described by the
+ * attribute list attribute only. - Note that it is possible that the first
+ * extent is only in the attribute list while the last extent is in the base
+ * mft record, so do not rely on being able to find the first extent in the
+ * base mft record.
+ *
+ * Warning: Never use @val when looking for attribute types which can be
+ *	    non-resident as this most likely will result in a crash!
+ */
+static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
+		const u32 name_len, const IGNORE_CASE_BOOL ic,
+		const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
+{
+	ATTR_RECORD *a;
+	ntfs_volume *vol = ctx->ntfs_ino->vol;
+	ntfschar *upcase = vol->upcase;
+	u32 upcase_len = vol->upcase_len;
+
+	/*
+	 * Iterate over attributes in mft record starting at @ctx->attr, or the
+	 * attribute following that, if @ctx->is_first is 'true'.
+	 */
+	if (ctx->is_first) {
+		a = ctx->attr;
+		ctx->is_first = false;
+	} else
+		a = (ATTR_RECORD*)((u8*)ctx->attr +
+				le32_to_cpu(ctx->attr->length));
+	for (;;	a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
+		if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
+				le32_to_cpu(ctx->mrec->bytes_allocated))
+			break;
+		ctx->attr = a;
+		if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
+				a->type == AT_END))
+			return -ENOENT;
+		if (unlikely(!a->length))
+			break;
+		if (a->type != type)
+			continue;
+		/*
+		 * If @name is present, compare the two names.  If @name is
+		 * missing, assume we want an unnamed attribute.
+		 */
+		if (!name) {
+			/* The search failed if the found attribute is named. */
+			if (a->name_length)
+				return -ENOENT;
+		} else if (!ntfs_are_names_equal(name, name_len,
+			    (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)),
+			    a->name_length, ic, upcase, upcase_len)) {
+			register int rc;
+
+			rc = ntfs_collate_names(name, name_len,
+					(ntfschar*)((u8*)a +
+					le16_to_cpu(a->name_offset)),
+					a->name_length, 1, IGNORE_CASE,
+					upcase, upcase_len);
+			/*
+			 * If @name collates before a->name, there is no
+			 * matching attribute.
+			 */
+			if (rc == -1)
+				return -ENOENT;
+			/* If the strings are not equal, continue search. */
+			if (rc)
+				continue;
+			rc = ntfs_collate_names(name, name_len,
+					(ntfschar*)((u8*)a +
+					le16_to_cpu(a->name_offset)),
+					a->name_length, 1, CASE_SENSITIVE,
+					upcase, upcase_len);
+			if (rc == -1)
+				return -ENOENT;
+			if (rc)
+				continue;
+		}
+		/*
+		 * The names match or @name not present and attribute is
+		 * unnamed.  If no @val specified, we have found the attribute
+		 * and are done.
+		 */
+		if (!val)
+			return 0;
+		/* @val is present; compare values. */
+		else {
+			register int rc;
+
+			rc = memcmp(val, (u8*)a + le16_to_cpu(
+					a->data.resident.value_offset),
+					min_t(u32, val_len, le32_to_cpu(
+					a->data.resident.value_length)));
+			/*
+			 * If @val collates before the current attribute's
+			 * value, there is no matching attribute.
+			 */
+			if (!rc) {
+				register u32 avl;
+
+				avl = le32_to_cpu(
+						a->data.resident.value_length);
+				if (val_len == avl)
+					return 0;
+				if (val_len < avl)
+					return -ENOENT;
+			} else if (rc < 0)
+				return -ENOENT;
+		}
+	}
+	ntfs_error(vol->sb, "Inode is corrupt.  Run chkdsk.");
+	NVolSetErrors(vol);
+	return -EIO;
+}
+
+/**
+ * load_attribute_list - load an attribute list into memory
+ * @vol:		ntfs volume from which to read
+ * @runlist:		runlist of the attribute list
+ * @al_start:		destination buffer
+ * @size:		size of the destination buffer in bytes
+ * @initialized_size:	initialized size of the attribute list
+ *
+ * Walk the runlist @runlist and load all clusters from it copying them into
+ * the linear buffer @al. The maximum number of bytes copied to @al is @size
+ * bytes. Note, @size does not need to be a multiple of the cluster size. If
+ * @initialized_size is less than @size, the region in @al between
+ * @initialized_size and @size will be zeroed and not read from disk.
+ *
+ * Return 0 on success or -errno on error.
+ */
+int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
+		const s64 size, const s64 initialized_size)
+{
+	LCN lcn;
+	u8 *al = al_start;
+	u8 *al_end = al + initialized_size;
+	runlist_element *rl;
+	struct buffer_head *bh;
+	struct super_block *sb;
+	unsigned long block_size;
+	unsigned long block, max_block;
+	int err = 0;
+	unsigned char block_size_bits;
+
+	ntfs_debug("Entering.");
+	if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
+			initialized_size > size)
+		return -EINVAL;
+	if (!initialized_size) {
+		memset(al, 0, size);
+		return 0;
+	}
+	sb = vol->sb;
+	block_size = sb->s_blocksize;
+	block_size_bits = sb->s_blocksize_bits;
+	down_read(&runlist->lock);
+	rl = runlist->rl;
+	if (!rl) {
+		ntfs_error(sb, "Cannot read attribute list since runlist is "
+				"missing.");
+		goto err_out;	
+	}
+	/* Read all clusters specified by the runlist one run at a time. */
+	while (rl->length) {
+		lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
+		ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
+				(unsigned long long)rl->vcn,
+				(unsigned long long)lcn);
+		/* The attribute list cannot be sparse. */
+		if (lcn < 0) {
+			ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed.  Cannot "
+					"read attribute list.");
+			goto err_out;
+		}
+		block = lcn << vol->cluster_size_bits >> block_size_bits;
+		/* Read the run from device in chunks of block_size bytes. */
+		max_block = block + (rl->length << vol->cluster_size_bits >>
+				block_size_bits);
+		ntfs_debug("max_block = 0x%lx.", max_block);
+		do {
+			ntfs_debug("Reading block = 0x%lx.", block);
+			bh = sb_bread(sb, block);
+			if (!bh) {
+				ntfs_error(sb, "sb_bread() failed. Cannot "
+						"read attribute list.");
+				goto err_out;
+			}
+			if (al + block_size >= al_end)
+				goto do_final;
+			memcpy(al, bh->b_data, block_size);
+			brelse(bh);
+			al += block_size;
+		} while (++block < max_block);
+		rl++;
+	}
+	if (initialized_size < size) {
+initialize:
+		memset(al_start + initialized_size, 0, size - initialized_size);
+	}
+done:
+	up_read(&runlist->lock);
+	return err;
+do_final:
+	if (al < al_end) {
+		/*
+		 * Partial block.
+		 *
+		 * Note: The attribute list can be smaller than its allocation
+		 * by multiple clusters.  This has been encountered by at least
+		 * two people running Windows XP, thus we cannot do any
+		 * truncation sanity checking here. (AIA)
+		 */
+		memcpy(al, bh->b_data, al_end - al);
+		brelse(bh);
+		if (initialized_size < size)
+			goto initialize;
+		goto done;
+	}
+	brelse(bh);
+	/* Real overflow! */
+	ntfs_error(sb, "Attribute list buffer overflow. Read attribute list "
+			"is truncated.");
+err_out:
+	err = -EIO;
+	goto done;
+}
+
+/**
+ * ntfs_external_attr_find - find an attribute in the attribute list of an inode
+ * @type:	attribute type to find
+ * @name:	attribute name to find (optional, i.e. NULL means don't care)
+ * @name_len:	attribute name length (only needed if @name present)
+ * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
+ * @lowest_vcn:	lowest vcn to find (optional, non-resident attributes only)
+ * @val:	attribute value to find (optional, resident attributes only)
+ * @val_len:	attribute value length
+ * @ctx:	search context with mft record and attribute to search from
+ *
+ * You should not need to call this function directly.  Use ntfs_attr_lookup()
+ * instead.
+ *
+ * Find an attribute by searching the attribute list for the corresponding
+ * attribute list entry.  Having found the entry, map the mft record if the
+ * attribute is in a different mft record/inode, ntfs_attr_find() the attribute
+ * in there and return it.
+ *
+ * On first search @ctx->ntfs_ino must be the base mft record and @ctx must
+ * have been obtained from a call to ntfs_attr_get_search_ctx().  On subsequent
+ * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is
+ * then the base inode).
+ *
+ * After finishing with the attribute/mft record you need to call
+ * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
+ * mapped inodes, etc).
+ *
+ * If the attribute is found, ntfs_external_attr_find() returns 0 and
+ * @ctx->attr will point to the found attribute.  @ctx->mrec will point to the
+ * mft record in which @ctx->attr is located and @ctx->al_entry will point to
+ * the attribute list entry for the attribute.
+ *
+ * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and
+ * @ctx->attr will point to the attribute in the base mft record before which
+ * the attribute being searched for would need to be inserted if such an action
+ * were to be desired.  @ctx->mrec will point to the mft record in which
+ * @ctx->attr is located and @ctx->al_entry will point to the attribute list
+ * entry of the attribute before which the attribute being searched for would
+ * need to be inserted if such an action were to be desired.
+ *
+ * Thus to insert the not found attribute, one wants to add the attribute to
+ * @ctx->mrec (the base mft record) and if there is not enough space, the
+ * attribute should be placed in a newly allocated extent mft record.  The
+ * attribute list entry for the inserted attribute should be inserted in the
+ * attribute list attribute at @ctx->al_entry.
+ *
+ * On actual error, ntfs_external_attr_find() returns -EIO.  In this case
+ * @ctx->attr is undefined and in particular do not rely on it not changing.
+ */
+static int ntfs_external_attr_find(const ATTR_TYPE type,
+		const ntfschar *name, const u32 name_len,
+		const IGNORE_CASE_BOOL ic, const VCN lowest_vcn,
+		const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
+{
+	ntfs_inode *base_ni, *ni;
+	ntfs_volume *vol;
+	ATTR_LIST_ENTRY *al_entry, *next_al_entry;
+	u8 *al_start, *al_end;
+	ATTR_RECORD *a;
+	ntfschar *al_name;
+	u32 al_name_len;
+	int err = 0;
+	static const char *es = " Unmount and run chkdsk.";
+
+	ni = ctx->ntfs_ino;
+	base_ni = ctx->base_ntfs_ino;
+	ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type);
+	if (!base_ni) {
+		/* First call happens with the base mft record. */
+		base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino;
+		ctx->base_mrec = ctx->mrec;
+	}
+	if (ni == base_ni)
+		ctx->base_attr = ctx->attr;
+	if (type == AT_END)
+		goto not_found;
+	vol = base_ni->vol;
+	al_start = base_ni->attr_list;
+	al_end = al_start + base_ni->attr_list_size;
+	if (!ctx->al_entry)
+		ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
+	/*
+	 * Iterate over entries in attribute list starting at @ctx->al_entry,
+	 * or the entry following that, if @ctx->is_first is 'true'.
+	 */
+	if (ctx->is_first) {
+		al_entry = ctx->al_entry;
+		ctx->is_first = false;
+	} else
+		al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
+				le16_to_cpu(ctx->al_entry->length));
+	for (;; al_entry = next_al_entry) {
+		/* Out of bounds check. */
+		if ((u8*)al_entry < base_ni->attr_list ||
+				(u8*)al_entry > al_end)
+			break;	/* Inode is corrupt. */
+		ctx->al_entry = al_entry;
+		/* Catch the end of the attribute list. */
+		if ((u8*)al_entry == al_end)
+			goto not_found;
+		if (!al_entry->length)
+			break;
+		if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
+				le16_to_cpu(al_entry->length) > al_end)
+			break;
+		next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
+				le16_to_cpu(al_entry->length));
+		if (le32_to_cpu(al_entry->type) > le32_to_cpu(type))
+			goto not_found;
+		if (type != al_entry->type)
+			continue;
+		/*
+		 * If @name is present, compare the two names.  If @name is
+		 * missing, assume we want an unnamed attribute.
+		 */
+		al_name_len = al_entry->name_length;
+		al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
+		if (!name) {
+			if (al_name_len)
+				goto not_found;
+		} else if (!ntfs_are_names_equal(al_name, al_name_len, name,
+				name_len, ic, vol->upcase, vol->upcase_len)) {
+			register int rc;
+
+			rc = ntfs_collate_names(name, name_len, al_name,
+					al_name_len, 1, IGNORE_CASE,
+					vol->upcase, vol->upcase_len);
+			/*
+			 * If @name collates before al_name, there is no
+			 * matching attribute.
+			 */
+			if (rc == -1)
+				goto not_found;
+			/* If the strings are not equal, continue search. */
+			if (rc)
+				continue;
+			/*
+			 * FIXME: Reverse engineering showed 0, IGNORE_CASE but
+			 * that is inconsistent with ntfs_attr_find().  The
+			 * subsequent rc checks were also different.  Perhaps I
+			 * made a mistake in one of the two.  Need to recheck
+			 * which is correct or at least see what is going on...
+			 * (AIA)
+			 */
+			rc = ntfs_collate_names(name, name_len, al_name,
+					al_name_len, 1, CASE_SENSITIVE,
+					vol->upcase, vol->upcase_len);
+			if (rc == -1)
+				goto not_found;
+			if (rc)
+				continue;
+		}
+		/*
+		 * The names match or @name not present and attribute is
+		 * unnamed.  Now check @lowest_vcn.  Continue search if the
+		 * next attribute list entry still fits @lowest_vcn.  Otherwise
+		 * we have reached the right one or the search has failed.
+		 */
+		if (lowest_vcn && (u8*)next_al_entry >= al_start	    &&
+				(u8*)next_al_entry + 6 < al_end		    &&
+				(u8*)next_al_entry + le16_to_cpu(
+					next_al_entry->length) <= al_end    &&
+				sle64_to_cpu(next_al_entry->lowest_vcn) <=
+					lowest_vcn			    &&
+				next_al_entry->type == al_entry->type	    &&
+				next_al_entry->name_length == al_name_len   &&
+				ntfs_are_names_equal((ntfschar*)((u8*)
+					next_al_entry +
+					next_al_entry->name_offset),
+					next_al_entry->name_length,
+					al_name, al_name_len, CASE_SENSITIVE,
+					vol->upcase, vol->upcase_len))
+			continue;
+		if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
+			if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
+				ntfs_error(vol->sb, "Found stale mft "
+						"reference in attribute list "
+						"of base inode 0x%lx.%s",
+						base_ni->mft_no, es);
+				err = -EIO;
+				break;
+			}
+		} else { /* Mft references do not match. */
+			/* If there is a mapped record unmap it first. */
+			if (ni != base_ni)
+				unmap_extent_mft_record(ni);
+			/* Do we want the base record back? */
+			if (MREF_LE(al_entry->mft_reference) ==
+					base_ni->mft_no) {
+				ni = ctx->ntfs_ino = base_ni;
+				ctx->mrec = ctx->base_mrec;
+			} else {
+				/* We want an extent record. */
+				ctx->mrec = map_extent_mft_record(base_ni,
+						le64_to_cpu(
+						al_entry->mft_reference), &ni);
+				if (IS_ERR(ctx->mrec)) {
+					ntfs_error(vol->sb, "Failed to map "
+							"extent mft record "
+							"0x%lx of base inode "
+							"0x%lx.%s",
+							MREF_LE(al_entry->
+							mft_reference),
+							base_ni->mft_no, es);
+					err = PTR_ERR(ctx->mrec);
+					if (err == -ENOENT)
+						err = -EIO;
+					/* Cause @ctx to be sanitized below. */
+					ni = NULL;
+					break;
+				}
+				ctx->ntfs_ino = ni;
+			}
+			ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+					le16_to_cpu(ctx->mrec->attrs_offset));
+		}
+		/*
+		 * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the
+		 * mft record containing the attribute represented by the
+		 * current al_entry.
+		 */
+		/*
+		 * We could call into ntfs_attr_find() to find the right
+		 * attribute in this mft record but this would be less
+		 * efficient and not quite accurate as ntfs_attr_find() ignores
+		 * the attribute instance numbers for example which become
+		 * important when one plays with attribute lists.  Also,
+		 * because a proper match has been found in the attribute list
+		 * entry above, the comparison can now be optimized.  So it is
+		 * worth re-implementing a simplified ntfs_attr_find() here.
+		 */
+		a = ctx->attr;
+		/*
+		 * Use a manual loop so we can still use break and continue
+		 * with the same meanings as above.
+		 */
+do_next_attr_loop:
+		if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
+				le32_to_cpu(ctx->mrec->bytes_allocated))
+			break;
+		if (a->type == AT_END)
+			break;
+		if (!a->length)
+			break;
+		if (al_entry->instance != a->instance)
+			goto do_next_attr;
+		/*
+		 * If the type and/or the name are mismatched between the
+		 * attribute list entry and the attribute record, there is
+		 * corruption so we break and return error EIO.
+		 */
+		if (al_entry->type != a->type)
+			break;
+		if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
+				le16_to_cpu(a->name_offset)), a->name_length,
+				al_name, al_name_len, CASE_SENSITIVE,
+				vol->upcase, vol->upcase_len))
+			break;
+		ctx->attr = a;
+		/*
+		 * If no @val specified or @val specified and it matches, we
+		 * have found it!
+		 */
+		if (!val || (!a->non_resident && le32_to_cpu(
+				a->data.resident.value_length) == val_len &&
+				!memcmp((u8*)a +
+				le16_to_cpu(a->data.resident.value_offset),
+				val, val_len))) {
+			ntfs_debug("Done, found.");
+			return 0;
+		}
+do_next_attr:
+		/* Proceed to the next attribute in the current mft record. */
+		a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
+		goto do_next_attr_loop;
+	}
+	if (!err) {
+		ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt "
+				"attribute list attribute.%s", base_ni->mft_no,
+				es);
+		err = -EIO;
+	}
+	if (ni != base_ni) {
+		if (ni)
+			unmap_extent_mft_record(ni);
+		ctx->ntfs_ino = base_ni;
+		ctx->mrec = ctx->base_mrec;
+		ctx->attr = ctx->base_attr;
+	}
+	if (err != -ENOMEM)
+		NVolSetErrors(vol);
+	return err;
+not_found:
+	/*
+	 * If we were looking for AT_END, we reset the search context @ctx and
+	 * use ntfs_attr_find() to seek to the end of the base mft record.
+	 */
+	if (type == AT_END) {
+		ntfs_attr_reinit_search_ctx(ctx);
+		return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len,
+				ctx);
+	}
+	/*
+	 * The attribute was not found.  Before we return, we want to ensure
+	 * @ctx->mrec and @ctx->attr indicate the position at which the
+	 * attribute should be inserted in the base mft record.  Since we also
+	 * want to preserve @ctx->al_entry we cannot reinitialize the search
+	 * context using ntfs_attr_reinit_search_ctx() as this would set
+	 * @ctx->al_entry to NULL.  Thus we do the necessary bits manually (see
+	 * ntfs_attr_init_search_ctx() below).  Note, we _only_ preserve
+	 * @ctx->al_entry as the remaining fields (base_*) are identical to
+	 * their non base_ counterparts and we cannot set @ctx->base_attr
+	 * correctly yet as we do not know what @ctx->attr will be set to by
+	 * the call to ntfs_attr_find() below.
+	 */
+	if (ni != base_ni)
+		unmap_extent_mft_record(ni);
+	ctx->mrec = ctx->base_mrec;
+	ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+			le16_to_cpu(ctx->mrec->attrs_offset));
+	ctx->is_first = true;
+	ctx->ntfs_ino = base_ni;
+	ctx->base_ntfs_ino = NULL;
+	ctx->base_mrec = NULL;
+	ctx->base_attr = NULL;
+	/*
+	 * In case there are multiple matches in the base mft record, need to
+	 * keep enumerating until we get an attribute not found response (or
+	 * another error), otherwise we would keep returning the same attribute
+	 * over and over again and all programs using us for enumeration would
+	 * lock up in a tight loop.
+	 */
+	do {
+		err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
+				ctx);
+	} while (!err);
+	ntfs_debug("Done, not found.");
+	return err;
+}
+
+/**
+ * ntfs_attr_lookup - find an attribute in an ntfs inode
+ * @type:	attribute type to find
+ * @name:	attribute name to find (optional, i.e. NULL means don't care)
+ * @name_len:	attribute name length (only needed if @name present)
+ * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
+ * @lowest_vcn:	lowest vcn to find (optional, non-resident attributes only)
+ * @val:	attribute value to find (optional, resident attributes only)
+ * @val_len:	attribute value length
+ * @ctx:	search context with mft record and attribute to search from
+ *
+ * Find an attribute in an ntfs inode.  On first search @ctx->ntfs_ino must
+ * be the base mft record and @ctx must have been obtained from a call to
+ * ntfs_attr_get_search_ctx().
+ *
+ * This function transparently handles attribute lists and @ctx is used to
+ * continue searches where they were left off at.
+ *
+ * After finishing with the attribute/mft record you need to call
+ * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
+ * mapped inodes, etc).
+ *
+ * Return 0 if the search was successful and -errno if not.
+ *
+ * When 0, @ctx->attr is the found attribute and it is in mft record
+ * @ctx->mrec.  If an attribute list attribute is present, @ctx->al_entry is
+ * the attribute list entry of the found attribute.
+ *
+ * When -ENOENT, @ctx->attr is the attribute which collates just after the
+ * attribute being searched for, i.e. if one wants to add the attribute to the
+ * mft record this is the correct place to insert it into.  If an attribute
+ * list attribute is present, @ctx->al_entry is the attribute list entry which
+ * collates just after the attribute list entry of the attribute being searched
+ * for, i.e. if one wants to add the attribute to the mft record this is the
+ * correct place to insert its attribute list entry into.
+ *
+ * When -errno != -ENOENT, an error occurred during the lookup.  @ctx->attr is
+ * then undefined and in particular you should not rely on it not changing.
+ */
+int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
+		const u32 name_len, const IGNORE_CASE_BOOL ic,
+		const VCN lowest_vcn, const u8 *val, const u32 val_len,
+		ntfs_attr_search_ctx *ctx)
+{
+	ntfs_inode *base_ni;
+
+	ntfs_debug("Entering.");
+	BUG_ON(IS_ERR(ctx->mrec));
+	if (ctx->base_ntfs_ino)
+		base_ni = ctx->base_ntfs_ino;
+	else
+		base_ni = ctx->ntfs_ino;
+	/* Sanity check, just for debugging really. */
+	BUG_ON(!base_ni);
+	if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST)
+		return ntfs_attr_find(type, name, name_len, ic, val, val_len,
+				ctx);
+	return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn,
+			val, val_len, ctx);
+}
+
+/**
+ * ntfs_attr_init_search_ctx - initialize an attribute search context
+ * @ctx:	attribute search context to initialize
+ * @ni:		ntfs inode with which to initialize the search context
+ * @mrec:	mft record with which to initialize the search context
+ *
+ * Initialize the attribute search context @ctx with @ni and @mrec.
+ */
+static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
+		ntfs_inode *ni, MFT_RECORD *mrec)
+{
+	*ctx = (ntfs_attr_search_ctx) {
+		.mrec = mrec,
+		/* Sanity checks are performed elsewhere. */
+		.attr = (ATTR_RECORD*)((u8*)mrec +
+				le16_to_cpu(mrec->attrs_offset)),
+		.is_first = true,
+		.ntfs_ino = ni,
+	};
+}
+
+/**
+ * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context
+ * @ctx:	attribute search context to reinitialize
+ *
+ * Reinitialize the attribute search context @ctx, unmapping an associated
+ * extent mft record if present, and initialize the search context again.
+ *
+ * This is used when a search for a new attribute is being started to reset
+ * the search context to the beginning.
+ */
+void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
+{
+	if (likely(!ctx->base_ntfs_ino)) {
+		/* No attribute list. */
+		ctx->is_first = true;
+		/* Sanity checks are performed elsewhere. */
+		ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+				le16_to_cpu(ctx->mrec->attrs_offset));
+		/*
+		 * This needs resetting due to ntfs_external_attr_find() which
+		 * can leave it set despite having zeroed ctx->base_ntfs_ino.
+		 */
+		ctx->al_entry = NULL;
+		return;
+	} /* Attribute list. */
+	if (ctx->ntfs_ino != ctx->base_ntfs_ino)
+		unmap_extent_mft_record(ctx->ntfs_ino);
+	ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec);
+	return;
+}
+
+/**
+ * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context
+ * @ni:		ntfs inode with which to initialize the search context
+ * @mrec:	mft record with which to initialize the search context
+ *
+ * Allocate a new attribute search context, initialize it with @ni and @mrec,
+ * and return it. Return NULL if allocation failed.
+ */
+ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
+{
+	ntfs_attr_search_ctx *ctx;
+
+	ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS);
+	if (ctx)
+		ntfs_attr_init_search_ctx(ctx, ni, mrec);
+	return ctx;
+}
+
+/**
+ * ntfs_attr_put_search_ctx - release an attribute search context
+ * @ctx:	attribute search context to free
+ *
+ * Release the attribute search context @ctx, unmapping an associated extent
+ * mft record if present.
+ */
+void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
+{
+	if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino)
+		unmap_extent_mft_record(ctx->ntfs_ino);
+	kmem_cache_free(ntfs_attr_ctx_cache, ctx);
+	return;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
+ * @vol:	ntfs volume to which the attribute belongs
+ * @type:	attribute type which to find
+ *
+ * Search for the attribute definition record corresponding to the attribute
+ * @type in the $AttrDef system file.
+ *
+ * Return the attribute type definition record if found and NULL if not found.
+ */
+static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
+		const ATTR_TYPE type)
+{
+	ATTR_DEF *ad;
+
+	BUG_ON(!vol->attrdef);
+	BUG_ON(!type);
+	for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef <
+			vol->attrdef_size && ad->type; ++ad) {
+		/* We have not found it yet, carry on searching. */
+		if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type)))
+			continue;
+		/* We found the attribute; return it. */
+		if (likely(ad->type == type))
+			return ad;
+		/* We have gone too far already.  No point in continuing. */
+		break;
+	}
+	/* Attribute not found. */
+	ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
+			le32_to_cpu(type));
+	return NULL;
+}
+
+/**
+ * ntfs_attr_size_bounds_check - check a size of an attribute type for validity
+ * @vol:	ntfs volume to which the attribute belongs
+ * @type:	attribute type which to check
+ * @size:	size which to check
+ *
+ * Check whether the @size in bytes is valid for an attribute of @type on the
+ * ntfs volume @vol.  This information is obtained from $AttrDef system file.
+ *
+ * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not
+ * listed in $AttrDef.
+ */
+int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
+		const s64 size)
+{
+	ATTR_DEF *ad;
+
+	BUG_ON(size < 0);
+	/*
+	 * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
+	 * listed in $AttrDef.
+	 */
+	if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
+		return -ERANGE;
+	/* Get the $AttrDef entry for the attribute @type. */
+	ad = ntfs_attr_find_in_attrdef(vol, type);
+	if (unlikely(!ad))
+		return -ENOENT;
+	/* Do the bounds check. */
+	if (((sle64_to_cpu(ad->min_size) > 0) &&
+			size < sle64_to_cpu(ad->min_size)) ||
+			((sle64_to_cpu(ad->max_size) > 0) && size >
+			sle64_to_cpu(ad->max_size)))
+		return -ERANGE;
+	return 0;
+}
+
+/**
+ * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
+ * @vol:	ntfs volume to which the attribute belongs
+ * @type:	attribute type which to check
+ *
+ * Check whether the attribute of @type on the ntfs volume @vol is allowed to
+ * be non-resident.  This information is obtained from $AttrDef system file.
+ *
+ * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and
+ * -ENOENT if the attribute is not listed in $AttrDef.
+ */
+int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
+{
+	ATTR_DEF *ad;
+
+	/* Find the attribute definition record in $AttrDef. */
+	ad = ntfs_attr_find_in_attrdef(vol, type);
+	if (unlikely(!ad))
+		return -ENOENT;
+	/* Check the flags and return the result. */
+	if (ad->flags & ATTR_DEF_RESIDENT)
+		return -EPERM;
+	return 0;
+}
+
+/**
+ * ntfs_attr_can_be_resident - check if an attribute can be resident
+ * @vol:	ntfs volume to which the attribute belongs
+ * @type:	attribute type which to check
+ *
+ * Check whether the attribute of @type on the ntfs volume @vol is allowed to
+ * be resident.  This information is derived from our ntfs knowledge and may
+ * not be completely accurate, especially when user defined attributes are
+ * present.  Basically we allow everything to be resident except for index
+ * allocation and $EA attributes.
+ *
+ * Return 0 if the attribute is allowed to be non-resident and -EPERM if not.
+ *
+ * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
+ *	    otherwise windows will not boot (blue screen of death)!  We cannot
+ *	    check for this here as we do not know which inode's $Bitmap is
+ *	    being asked about so the caller needs to special case this.
+ */
+int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
+{
+	if (type == AT_INDEX_ALLOCATION)
+		return -EPERM;
+	return 0;
+}
+
+/**
+ * ntfs_attr_record_resize - resize an attribute record
+ * @m:		mft record containing attribute record
+ * @a:		attribute record to resize
+ * @new_size:	new size in bytes to which to resize the attribute record @a
+ *
+ * Resize the attribute record @a, i.e. the resident part of the attribute, in
+ * the mft record @m to @new_size bytes.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make a record smaller without having copied all the data you
+ *	    are interested in the data may be overwritten.
+ */
+int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
+{
+	ntfs_debug("Entering for new_size %u.", new_size);
+	/* Align to 8 bytes if it is not already done. */
+	if (new_size & 7)
+		new_size = (new_size + 7) & ~7;
+	/* If the actual attribute length has changed, move things around. */
+	if (new_size != le32_to_cpu(a->length)) {
+		u32 new_muse = le32_to_cpu(m->bytes_in_use) -
+				le32_to_cpu(a->length) + new_size;
+		/* Not enough space in this mft record. */
+		if (new_muse > le32_to_cpu(m->bytes_allocated))
+			return -ENOSPC;
+		/* Move attributes following @a to their new location. */
+		memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
+				le32_to_cpu(m->bytes_in_use) - ((u8*)a -
+				(u8*)m) - le32_to_cpu(a->length));
+		/* Adjust @m to reflect the change in used space. */
+		m->bytes_in_use = cpu_to_le32(new_muse);
+		/* Adjust @a to reflect the new size. */
+		if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
+			a->length = cpu_to_le32(new_size);
+	}
+	return 0;
+}
+
+/**
+ * ntfs_resident_attr_value_resize - resize the value of a resident attribute
+ * @m:		mft record containing attribute record
+ * @a:		attribute record whose value to resize
+ * @new_size:	new size in bytes to which to resize the attribute value of @a
+ *
+ * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
+ * If the value is made bigger, the newly allocated space is cleared.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make a record smaller without having copied all the data you
+ *	    are interested in the data may be overwritten.
+ */
+int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+		const u32 new_size)
+{
+	u32 old_size;
+
+	/* Resize the resident part of the attribute record. */
+	if (ntfs_attr_record_resize(m, a,
+			le16_to_cpu(a->data.resident.value_offset) + new_size))
+		return -ENOSPC;
+	/*
+	 * The resize succeeded!  If we made the attribute value bigger, clear
+	 * the area between the old size and @new_size.
+	 */
+	old_size = le32_to_cpu(a->data.resident.value_length);
+	if (new_size > old_size)
+		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
+				old_size, 0, new_size - old_size);
+	/* Finally update the length of the attribute value. */
+	a->data.resident.value_length = cpu_to_le32(new_size);
+	return 0;
+}
+
+/**
+ * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
+ * @ni:		ntfs inode describing the attribute to convert
+ * @data_size:	size of the resident data to copy to the non-resident attribute
+ *
+ * Convert the resident ntfs attribute described by the ntfs inode @ni to a
+ * non-resident one.
+ *
+ * @data_size must be equal to the attribute value size.  This is needed since
+ * we need to know the size before we can map the mft record and our callers
+ * always know it.  The reason we cannot simply read the size from the vfs
+ * inode i_size is that this is not necessarily uptodate.  This happens when
+ * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
+ *
+ * Return 0 on success and -errno on error.  The following error return codes
+ * are defined:
+ *	-EPERM	- The attribute is not allowed to be non-resident.
+ *	-ENOMEM	- Not enough memory.
+ *	-ENOSPC	- Not enough disk space.
+ *	-EINVAL	- Attribute not defined on the volume.
+ *	-EIO	- I/o error or other error.
+ * Note that -ENOSPC is also returned in the case that there is not enough
+ * space in the mft record to do the conversion.  This can happen when the mft
+ * record is already very full.  The caller is responsible for trying to make
+ * space in the mft record and trying again.  FIXME: Do we need a separate
+ * error return code for this kind of -ENOSPC or is it always worth trying
+ * again in case the attribute may then fit in a resident state so no need to
+ * make it non-resident at all?  Ho-hum...  (AIA)
+ *
+ * NOTE to self: No changes in the attribute list are required to move from
+ *		 a resident to a non-resident attribute.
+ *
+ * Locking: - The caller must hold i_mutex on the inode.
+ */
+int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
+{
+	s64 new_size;
+	struct inode *vi = VFS_I(ni);
+	ntfs_volume *vol = ni->vol;
+	ntfs_inode *base_ni;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx;
+	struct page *page;
+	runlist_element *rl;
+	u8 *kaddr;
+	unsigned long flags;
+	int mp_size, mp_ofs, name_ofs, arec_size, err, err2;
+	u32 attr_size;
+	u8 old_res_attr_flags;
+
+	/* Check that the attribute is allowed to be non-resident. */
+	err = ntfs_attr_can_be_non_resident(vol, ni->type);
+	if (unlikely(err)) {
+		if (err == -EPERM)
+			ntfs_debug("Attribute is not allowed to be "
+					"non-resident.");
+		else
+			ntfs_debug("Attribute not defined on the NTFS "
+					"volume!");
+		return err;
+	}
+	/*
+	 * FIXME: Compressed and encrypted attributes are not supported when
+	 * writing and we should never have gotten here for them.
+	 */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoEncrypted(ni));
+	/*
+	 * The size needs to be aligned to a cluster boundary for allocation
+	 * purposes.
+	 */
+	new_size = (data_size + vol->cluster_size - 1) &
+			~(vol->cluster_size - 1);
+	if (new_size > 0) {
+		/*
+		 * Will need the page later and since the page lock nests
+		 * outside all ntfs locks, we need to get the page now.
+		 */
+		page = find_or_create_page(vi->i_mapping, 0,
+				mapping_gfp_mask(vi->i_mapping));
+		if (unlikely(!page))
+			return -ENOMEM;
+		/* Start by allocating clusters to hold the attribute value. */
+		rl = ntfs_cluster_alloc(vol, 0, new_size >>
+				vol->cluster_size_bits, -1, DATA_ZONE, true);
+		if (IS_ERR(rl)) {
+			err = PTR_ERR(rl);
+			ntfs_debug("Failed to allocate cluster%s, error code "
+					"%i.", (new_size >>
+					vol->cluster_size_bits) > 1 ? "s" : "",
+					err);
+			goto page_err_out;
+		}
+	} else {
+		rl = NULL;
+		page = NULL;
+	}
+	/* Determine the size of the mapping pairs array. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1);
+	if (unlikely(mp_size < 0)) {
+		err = mp_size;
+		ntfs_debug("Failed to get size for mapping pairs array, error "
+				"code %i.", err);
+		goto rl_err_out;
+	}
+	down_write(&ni->runlist.lock);
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	BUG_ON(NInoNonResident(ni));
+	BUG_ON(a->non_resident);
+	/*
+	 * Calculate new offsets for the name and the mapping pairs array.
+	 */
+	if (NInoSparse(ni) || NInoCompressed(ni))
+		name_ofs = (offsetof(ATTR_REC,
+				data.non_resident.compressed_size) +
+				sizeof(a->data.non_resident.compressed_size) +
+				7) & ~7;
+	else
+		name_ofs = (offsetof(ATTR_REC,
+				data.non_resident.compressed_size) + 7) & ~7;
+	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
+	/*
+	 * Determine the size of the resident part of the now non-resident
+	 * attribute record.
+	 */
+	arec_size = (mp_ofs + mp_size + 7) & ~7;
+	/*
+	 * If the page is not uptodate bring it uptodate by copying from the
+	 * attribute value.
+	 */
+	attr_size = le32_to_cpu(a->data.resident.value_length);
+	BUG_ON(attr_size != data_size);
+	if (page && !PageUptodate(page)) {
+		kaddr = kmap_atomic(page);
+		memcpy(kaddr, (u8*)a +
+				le16_to_cpu(a->data.resident.value_offset),
+				attr_size);
+		memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size);
+		kunmap_atomic(kaddr);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+	/* Backup the attribute flag. */
+	old_res_attr_flags = a->data.resident.flags;
+	/* Resize the resident part of the attribute record. */
+	err = ntfs_attr_record_resize(m, a, arec_size);
+	if (unlikely(err))
+		goto err_out;
+	/*
+	 * Convert the resident part of the attribute record to describe a
+	 * non-resident attribute.
+	 */
+	a->non_resident = 1;
+	/* Move the attribute name if it exists and update the offset. */
+	if (a->name_length)
+		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
+				a->name_length * sizeof(ntfschar));
+	a->name_offset = cpu_to_le16(name_ofs);
+	/* Setup the fields specific to non-resident attributes. */
+	a->data.non_resident.lowest_vcn = 0;
+	a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
+			vol->cluster_size_bits);
+	a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
+	memset(&a->data.non_resident.reserved, 0,
+			sizeof(a->data.non_resident.reserved));
+	a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
+	a->data.non_resident.data_size =
+			a->data.non_resident.initialized_size =
+			cpu_to_sle64(attr_size);
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		a->data.non_resident.compression_unit = 0;
+		if (NInoCompressed(ni) || vol->major_ver < 3)
+			a->data.non_resident.compression_unit = 4;
+		a->data.non_resident.compressed_size =
+				a->data.non_resident.allocated_size;
+	} else
+		a->data.non_resident.compression_unit = 0;
+	/* Generate the mapping pairs array into the attribute record. */
+	err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
+			arec_size - mp_ofs, rl, 0, -1, NULL);
+	if (unlikely(err)) {
+		ntfs_debug("Failed to build mapping pairs, error code %i.",
+				err);
+		goto undo_err_out;
+	}
+	/* Setup the in-memory attribute structure to be non-resident. */
+	ni->runlist.rl = rl;
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = new_size;
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		ni->itype.compressed.size = ni->allocated_size;
+		if (a->data.non_resident.compression_unit) {
+			ni->itype.compressed.block_size = 1U << (a->data.
+					non_resident.compression_unit +
+					vol->cluster_size_bits);
+			ni->itype.compressed.block_size_bits =
+					ffs(ni->itype.compressed.block_size) -
+					1;
+			ni->itype.compressed.block_clusters = 1U <<
+					a->data.non_resident.compression_unit;
+		} else {
+			ni->itype.compressed.block_size = 0;
+			ni->itype.compressed.block_size_bits = 0;
+			ni->itype.compressed.block_clusters = 0;
+		}
+		vi->i_blocks = ni->itype.compressed.size >> 9;
+	} else
+		vi->i_blocks = ni->allocated_size >> 9;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * This needs to be last since the address space operations ->readpage
+	 * and ->writepage can run concurrently with us as they are not
+	 * serialized on i_mutex.  Note, we are not allowed to fail once we flip
+	 * this switch, which is another reason to do this last.
+	 */
+	NInoSetNonResident(ni);
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	if (page) {
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	ntfs_debug("Done.");
+	return 0;
+undo_err_out:
+	/* Convert the attribute back into a resident attribute. */
+	a->non_resident = 0;
+	/* Move the attribute name if it exists and update the offset. */
+	name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) +
+			sizeof(a->data.resident.reserved) + 7) & ~7;
+	if (a->name_length)
+		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
+				a->name_length * sizeof(ntfschar));
+	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
+	a->name_offset = cpu_to_le16(name_ofs);
+	arec_size = (mp_ofs + attr_size + 7) & ~7;
+	/* Resize the resident part of the attribute record. */
+	err2 = ntfs_attr_record_resize(m, a, arec_size);
+	if (unlikely(err2)) {
+		/*
+		 * This cannot happen (well if memory corruption is at work it
+		 * could happen in theory), but deal with it as well as we can.
+		 * If the old size is too small, truncate the attribute,
+		 * otherwise simply give it a larger allocated size.
+		 * FIXME: Should check whether chkdsk complains when the
+		 * allocated size is much bigger than the resident value size.
+		 */
+		arec_size = le32_to_cpu(a->length);
+		if ((mp_ofs + attr_size) > arec_size) {
+			err2 = attr_size;
+			attr_size = arec_size - mp_ofs;
+			ntfs_error(vol->sb, "Failed to undo partial resident "
+					"to non-resident attribute "
+					"conversion.  Truncating inode 0x%lx, "
+					"attribute type 0x%x from %i bytes to "
+					"%i bytes to maintain metadata "
+					"consistency.  THIS MEANS YOU ARE "
+					"LOSING %i BYTES DATA FROM THIS %s.",
+					vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type),
+					err2, attr_size, err2 - attr_size,
+					((ni->type == AT_DATA) &&
+					!ni->name_len) ? "FILE": "ATTRIBUTE");
+			write_lock_irqsave(&ni->size_lock, flags);
+			ni->initialized_size = attr_size;
+			i_size_write(vi, attr_size);
+			write_unlock_irqrestore(&ni->size_lock, flags);
+		}
+	}
+	/* Setup the fields specific to resident attributes. */
+	a->data.resident.value_length = cpu_to_le32(attr_size);
+	a->data.resident.value_offset = cpu_to_le16(mp_ofs);
+	a->data.resident.flags = old_res_attr_flags;
+	memset(&a->data.resident.reserved, 0,
+			sizeof(a->data.resident.reserved));
+	/* Copy the data from the page back to the attribute value. */
+	if (page) {
+		kaddr = kmap_atomic(page);
+		memcpy((u8*)a + mp_ofs, kaddr, attr_size);
+		kunmap_atomic(kaddr);
+	}
+	/* Setup the allocated size in the ntfs inode in case it changed. */
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = arec_size - mp_ofs;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ni->runlist.rl = NULL;
+	up_write(&ni->runlist.lock);
+rl_err_out:
+	if (rl) {
+		if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
+			ntfs_error(vol->sb, "Failed to release allocated "
+					"cluster(s) in error code path.  Run "
+					"chkdsk to recover the lost "
+					"cluster(s).");
+			NVolSetErrors(vol);
+		}
+		ntfs_free(rl);
+page_err_out:
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	if (err == -EINVAL)
+		err = -EIO;
+	return err;
+}
+
+/**
+ * ntfs_attr_extend_allocation - extend the allocated space of an attribute
+ * @ni:			ntfs inode of the attribute whose allocation to extend
+ * @new_alloc_size:	new size in bytes to which to extend the allocation to
+ * @new_data_size:	new size in bytes to which to extend the data to
+ * @data_start:		beginning of region which is required to be non-sparse
+ *
+ * Extend the allocated space of an attribute described by the ntfs inode @ni
+ * to @new_alloc_size bytes.  If @data_start is -1, the whole extension may be
+ * implemented as a hole in the file (as long as both the volume and the ntfs
+ * inode @ni have sparse support enabled).  If @data_start is >= 0, then the
+ * region between the old allocated size and @data_start - 1 may be made sparse
+ * but the regions between @data_start and @new_alloc_size must be backed by
+ * actual clusters.
+ *
+ * If @new_data_size is -1, it is ignored.  If it is >= 0, then the data size
+ * of the attribute is extended to @new_data_size.  Note that the i_size of the
+ * vfs inode is not updated.  Only the data size in the base attribute record
+ * is updated.  The caller has to update i_size separately if this is required.
+ * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
+ * size as well as for @new_data_size to be greater than @new_alloc_size.
+ *
+ * For resident attributes this involves resizing the attribute record and if
+ * necessary moving it and/or other attributes into extent mft records and/or
+ * converting the attribute to a non-resident attribute which in turn involves
+ * extending the allocation of a non-resident attribute as described below.
+ *
+ * For non-resident attributes this involves allocating clusters in the data
+ * zone on the volume (except for regions that are being made sparse) and
+ * extending the run list to describe the allocated clusters as well as
+ * updating the mapping pairs array of the attribute.  This in turn involves
+ * resizing the attribute record and if necessary moving it and/or other
+ * attributes into extent mft records and/or splitting the attribute record
+ * into multiple extent attribute records.
+ *
+ * Also, the attribute list attribute is updated if present and in some of the
+ * above cases (the ones where extent mft records/attributes come into play),
+ * an attribute list attribute is created if not already present.
+ *
+ * Return the new allocated size on success and -errno on error.  In the case
+ * that an error is encountered but a partial extension at least up to
+ * @data_start (if present) is possible, the allocation is partially extended
+ * and this is returned.  This means the caller must check the returned size to
+ * determine if the extension was partial.  If @data_start is -1 then partial
+ * allocations are not performed.
+ *
+ * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
+ *
+ * Locking: This function takes the runlist lock of @ni for writing as well as
+ * locking the mft record of the base ntfs inode.  These locks are maintained
+ * throughout execution of the function.  These locks are required so that the
+ * attribute can be resized safely and so that it can for example be converted
+ * from resident to non-resident safely.
+ *
+ * TODO: At present attribute list attribute handling is not implemented.
+ *
+ * TODO: At present it is not safe to call this function for anything other
+ * than the $DATA attribute(s) of an uncompressed and unencrypted file.
+ */
+s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+		const s64 new_data_size, const s64 data_start)
+{
+	VCN vcn;
+	s64 ll, allocated_size, start = data_start;
+	struct inode *vi = VFS_I(ni);
+	ntfs_volume *vol = ni->vol;
+	ntfs_inode *base_ni;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx;
+	runlist_element *rl, *rl2;
+	unsigned long flags;
+	int err, mp_size;
+	u32 attr_len = 0; /* Silence stupid gcc warning. */
+	bool mp_rebuilt;
+
+#ifdef DEBUG
+	read_lock_irqsave(&ni->size_lock, flags);
+	allocated_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+			"old_allocated_size 0x%llx, "
+			"new_allocated_size 0x%llx, new_data_size 0x%llx, "
+			"data_start 0x%llx.", vi->i_ino,
+			(unsigned)le32_to_cpu(ni->type),
+			(unsigned long long)allocated_size,
+			(unsigned long long)new_alloc_size,
+			(unsigned long long)new_data_size,
+			(unsigned long long)start);
+#endif
+retry_extend:
+	/*
+	 * For non-resident attributes, @start and @new_size need to be aligned
+	 * to cluster boundaries for allocation purposes.
+	 */
+	if (NInoNonResident(ni)) {
+		if (start > 0)
+			start &= ~(s64)vol->cluster_size_mask;
+		new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
+				~(s64)vol->cluster_size_mask;
+	}
+	BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
+	/* Check if new size is allowed in $AttrDef. */
+	err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
+	if (unlikely(err)) {
+		/* Only emit errors when the write will fail completely. */
+		read_lock_irqsave(&ni->size_lock, flags);
+		allocated_size = ni->allocated_size;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (start < 0 || start >= allocated_size) {
+			if (err == -ERANGE) {
+				ntfs_error(vol->sb, "Cannot extend allocation "
+						"of inode 0x%lx, attribute "
+						"type 0x%x, because the new "
+						"allocation would exceed the "
+						"maximum allowed size for "
+						"this attribute type.",
+						vi->i_ino, (unsigned)
+						le32_to_cpu(ni->type));
+			} else {
+				ntfs_error(vol->sb, "Cannot extend allocation "
+						"of inode 0x%lx, attribute "
+						"type 0x%x, because this "
+						"attribute type is not "
+						"defined on the NTFS volume.  "
+						"Possible corruption!  You "
+						"should run chkdsk!",
+						vi->i_ino, (unsigned)
+						le32_to_cpu(ni->type));
+			}
+		}
+		/* Translate error code to be POSIX conformant for write(2). */
+		if (err == -ERANGE)
+			err = -EFBIG;
+		else
+			err = -EIO;
+		return err;
+	}
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/*
+	 * We will be modifying both the runlist (if non-resident) and the mft
+	 * record so lock them both down.
+	 */
+	down_write(&ni->runlist.lock);
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	read_lock_irqsave(&ni->size_lock, flags);
+	allocated_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * If non-resident, seek to the last extent.  If resident, there is
+	 * only one extent, so seek to that.
+	 */
+	vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
+			0;
+	/*
+	 * Abort if someone did the work whilst we waited for the locks.  If we
+	 * just converted the attribute from resident to non-resident it is
+	 * likely that exactly this has happened already.  We cannot quite
+	 * abort if we need to update the data size.
+	 */
+	if (unlikely(new_alloc_size <= allocated_size)) {
+		ntfs_debug("Allocated size already exceeds requested size.");
+		new_alloc_size = allocated_size;
+		if (new_data_size < 0)
+			goto done;
+		/*
+		 * We want the first attribute extent so that we can update the
+		 * data size.
+		 */
+		vcn = 0;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, vcn, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	/* Use goto to reduce indentation. */
+	if (a->non_resident)
+		goto do_non_resident_extend;
+	BUG_ON(NInoNonResident(ni));
+	/* The total length of the attribute value. */
+	attr_len = le32_to_cpu(a->data.resident.value_length);
+	/*
+	 * Extend the attribute record to be able to store the new attribute
+	 * size.  ntfs_attr_record_resize() will not do anything if the size is
+	 * not changing.
+	 */
+	if (new_alloc_size < vol->mft_record_size &&
+			!ntfs_attr_record_resize(m, a,
+			le16_to_cpu(a->data.resident.value_offset) +
+			new_alloc_size)) {
+		/* The resize succeeded! */
+		write_lock_irqsave(&ni->size_lock, flags);
+		ni->allocated_size = le32_to_cpu(a->length) -
+				le16_to_cpu(a->data.resident.value_offset);
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		if (new_data_size >= 0) {
+			BUG_ON(new_data_size < attr_len);
+			a->data.resident.value_length =
+					cpu_to_le32((u32)new_data_size);
+		}
+		goto flush_done;
+	}
+	/*
+	 * We have to drop all the locks so we can call
+	 * ntfs_attr_make_non_resident().  This could be optimised by try-
+	 * locking the first page cache page and only if that fails dropping
+	 * the locks, locking the page, and redoing all the locking and
+	 * lookups.  While this would be a huge optimisation, it is not worth
+	 * it as this is definitely a slow code path.
+	 */
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	/*
+	 * Not enough space in the mft record, try to make the attribute
+	 * non-resident and if successful restart the extension process.
+	 */
+	err = ntfs_attr_make_non_resident(ni, attr_len);
+	if (likely(!err))
+		goto retry_extend;
+	/*
+	 * Could not make non-resident.  If this is due to this not being
+	 * permitted for this attribute type or there not being enough space,
+	 * try to make other attributes non-resident.  Otherwise fail.
+	 */
+	if (unlikely(err != -EPERM && err != -ENOSPC)) {
+		/* Only emit errors when the write will fail completely. */
+		read_lock_irqsave(&ni->size_lock, flags);
+		allocated_size = ni->allocated_size;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because the conversion from resident "
+					"to non-resident attribute failed "
+					"with error code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM)
+			err = -EIO;
+		goto conv_err_out;
+	}
+	/* TODO: Not implemented from here, abort. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	allocated_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (start < 0 || start >= allocated_size) {
+		if (err == -ENOSPC)
+			ntfs_error(vol->sb, "Not enough space in the mft "
+					"record/on disk for the non-resident "
+					"attribute value.  This case is not "
+					"implemented yet.");
+		else /* if (err == -EPERM) */
+			ntfs_error(vol->sb, "This attribute type may not be "
+					"non-resident.  This case is not "
+					"implemented yet.");
+	}
+	err = -EOPNOTSUPP;
+	goto conv_err_out;
+#if 0
+	// TODO: Attempt to make other attributes non-resident.
+	if (!err)
+		goto do_resident_extend;
+	/*
+	 * Both the attribute list attribute and the standard information
+	 * attribute must remain in the base inode.  Thus, if this is one of
+	 * these attributes, we have to try to move other attributes out into
+	 * extent mft records instead.
+	 */
+	if (ni->type == AT_ATTRIBUTE_LIST ||
+			ni->type == AT_STANDARD_INFORMATION) {
+		// TODO: Attempt to move other attributes into extent mft
+		// records.
+		err = -EOPNOTSUPP;
+		if (!err)
+			goto do_resident_extend;
+		goto err_out;
+	}
+	// TODO: Attempt to move this attribute to an extent mft record, but
+	// only if it is not already the only attribute in an mft record in
+	// which case there would be nothing to gain.
+	err = -EOPNOTSUPP;
+	if (!err)
+		goto do_resident_extend;
+	/* There is nothing we can do to make enough space. )-: */
+	goto err_out;
+#endif
+do_non_resident_extend:
+	BUG_ON(!NInoNonResident(ni));
+	if (new_alloc_size == allocated_size) {
+		BUG_ON(vcn);
+		goto alloc_done;
+	}
+	/*
+	 * If the data starts after the end of the old allocation, this is a
+	 * $DATA attribute and sparse attributes are enabled on the volume and
+	 * for this inode, then create a sparse region between the old
+	 * allocated size and the start of the data.  Otherwise simply proceed
+	 * with filling the whole space between the old allocated size and the
+	 * new allocated size with clusters.
+	 */
+	if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
+			!NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
+		goto skip_sparse;
+	// TODO: This is not implemented yet.  We just fill in with real
+	// clusters for now...
+	ntfs_debug("Inserting holes is not-implemented yet.  Falling back to "
+			"allocating real clusters instead.");
+skip_sparse:
+	rl = ni->runlist.rl;
+	if (likely(rl)) {
+		/* Seek to the end of the runlist. */
+		while (rl->length)
+			rl++;
+	}
+	/* If this attribute extent is not mapped, map it now. */
+	if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
+			(rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
+			(rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
+		if (!rl && !allocated_size)
+			goto first_alloc;
+		rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
+		if (IS_ERR(rl)) {
+			err = PTR_ERR(rl);
+			if (start < 0 || start >= allocated_size)
+				ntfs_error(vol->sb, "Cannot extend allocation "
+						"of inode 0x%lx, attribute "
+						"type 0x%x, because the "
+						"mapping of a runlist "
+						"fragment failed with error "
+						"code %i.", vi->i_ino,
+						(unsigned)le32_to_cpu(ni->type),
+						err);
+			if (err != -ENOMEM)
+				err = -EIO;
+			goto err_out;
+		}
+		ni->runlist.rl = rl;
+		/* Seek to the end of the runlist. */
+		while (rl->length)
+			rl++;
+	}
+	/*
+	 * We now know the runlist of the last extent is mapped and @rl is at
+	 * the end of the runlist.  We want to begin allocating clusters
+	 * starting at the last allocated cluster to reduce fragmentation.  If
+	 * there are no valid LCNs in the attribute we let the cluster
+	 * allocator choose the starting cluster.
+	 */
+	/* If the last LCN is a hole or simillar seek back to last real LCN. */
+	while (rl->lcn < 0 && rl > ni->runlist.rl)
+		rl--;
+first_alloc:
+	// FIXME: Need to implement partial allocations so at least part of the
+	// write can be performed when start >= 0.  (Needed for POSIX write(2)
+	// conformance.)
+	rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
+			(new_alloc_size - allocated_size) >>
+			vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
+			rl->lcn + rl->length : -1, DATA_ZONE, true);
+	if (IS_ERR(rl2)) {
+		err = PTR_ERR(rl2);
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because the allocation of clusters "
+					"failed with error code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM && err != -ENOSPC)
+			err = -EIO;
+		goto err_out;
+	}
+	rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+	if (IS_ERR(rl)) {
+		err = PTR_ERR(rl);
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because the runlist merge failed "
+					"with error code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM)
+			err = -EIO;
+		if (ntfs_cluster_free_from_rl(vol, rl2)) {
+			ntfs_error(vol->sb, "Failed to release allocated "
+					"cluster(s) in error code path.  Run "
+					"chkdsk to recover the lost "
+					"cluster(s).");
+			NVolSetErrors(vol);
+		}
+		ntfs_free(rl2);
+		goto err_out;
+	}
+	ni->runlist.rl = rl;
+	ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
+			allocated_size) >> vol->cluster_size_bits);
+	/* Find the runlist element with which the attribute extent starts. */
+	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+	rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
+	BUG_ON(!rl2);
+	BUG_ON(!rl2->length);
+	BUG_ON(rl2->lcn < LCN_HOLE);
+	mp_rebuilt = false;
+	/* Get the size for the new mapping pairs array for this extent. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+	if (unlikely(mp_size <= 0)) {
+		err = mp_size;
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because determining the size for the "
+					"mapping pairs failed with error code "
+					"%i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		err = -EIO;
+		goto undo_alloc;
+	}
+	/* Extend the attribute record to fit the bigger mapping pairs array. */
+	attr_len = le32_to_cpu(a->length);
+	err = ntfs_attr_record_resize(m, a, mp_size +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+	if (unlikely(err)) {
+		BUG_ON(err != -ENOSPC);
+		// TODO: Deal with this by moving this extent to a new mft
+		// record or by starting a new extent in a new mft record,
+		// possibly by extending this extent partially and filling it
+		// and creating a new extent for the remainder, or by making
+		// other attributes non-resident and/or by moving other
+		// attributes out of this mft record.
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Not enough space in the mft "
+					"record for the extended attribute "
+					"record.  This case is not "
+					"implemented yet.");
+		err = -EOPNOTSUPP;
+		goto undo_alloc;
+	}
+	mp_rebuilt = true;
+	/* Generate the mapping pairs array directly into the attr record. */
+	err = ntfs_mapping_pairs_build(vol, (u8*)a +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+			mp_size, rl2, ll, -1, NULL);
+	if (unlikely(err)) {
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because building the mapping pairs "
+					"failed with error code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		err = -EIO;
+		goto undo_alloc;
+	}
+	/* Update the highest_vcn. */
+	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+			vol->cluster_size_bits) - 1);
+	/*
+	 * We now have extended the allocated size of the attribute.  Reflect
+	 * this in the ntfs_inode structure and the attribute record.
+	 */
+	if (a->data.non_resident.lowest_vcn) {
+		/*
+		 * We are not in the first attribute extent, switch to it, but
+		 * first ensure the changes will make it to disk later.
+		 */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		ntfs_attr_reinit_search_ctx(ctx);
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, 0, NULL, 0, ctx);
+		if (unlikely(err))
+			goto restore_undo_alloc;
+		/* @m is not used any more so no need to set it. */
+		a = ctx->attr;
+	}
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = new_alloc_size;
+	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+	/*
+	 * FIXME: This would fail if @ni is a directory, $MFT, or an index,
+	 * since those can have sparse/compressed set.  For example can be
+	 * set compressed even though it is not compressed itself and in that
+	 * case the bit means that files are to be created compressed in the
+	 * directory...  At present this is ok as this code is only called for
+	 * regular files, and only for their $DATA attribute(s).
+	 * FIXME: The calculation is wrong if we created a hole above.  For now
+	 * it does not matter as we never create holes.
+	 */
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		ni->itype.compressed.size += new_alloc_size - allocated_size;
+		a->data.non_resident.compressed_size =
+				cpu_to_sle64(ni->itype.compressed.size);
+		vi->i_blocks = ni->itype.compressed.size >> 9;
+	} else
+		vi->i_blocks = new_alloc_size >> 9;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+alloc_done:
+	if (new_data_size >= 0) {
+		BUG_ON(new_data_size <
+				sle64_to_cpu(a->data.non_resident.data_size));
+		a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
+	}
+flush_done:
+	/* Ensure the changes make it to disk. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+done:
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	ntfs_debug("Done, new_allocated_size 0x%llx.",
+			(unsigned long long)new_alloc_size);
+	return new_alloc_size;
+restore_undo_alloc:
+	if (start < 0 || start >= allocated_size)
+		ntfs_error(vol->sb, "Cannot complete extension of allocation "
+				"of inode 0x%lx, attribute type 0x%x, because "
+				"lookup of first attribute extent failed with "
+				"error code %i.", vi->i_ino,
+				(unsigned)le32_to_cpu(ni->type), err);
+	if (err == -ENOENT)
+		err = -EIO;
+	ntfs_attr_reinit_search_ctx(ctx);
+	if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
+			allocated_size >> vol->cluster_size_bits, NULL, 0,
+			ctx)) {
+		ntfs_error(vol->sb, "Failed to find last attribute extent of "
+				"attribute in error code path.  Run chkdsk to "
+				"recover.");
+		write_lock_irqsave(&ni->size_lock, flags);
+		ni->allocated_size = new_alloc_size;
+		/*
+		 * FIXME: This would fail if @ni is a directory...  See above.
+		 * FIXME: The calculation is wrong if we created a hole above.
+		 * For now it does not matter as we never create holes.
+		 */
+		if (NInoSparse(ni) || NInoCompressed(ni)) {
+			ni->itype.compressed.size += new_alloc_size -
+					allocated_size;
+			vi->i_blocks = ni->itype.compressed.size >> 9;
+		} else
+			vi->i_blocks = new_alloc_size >> 9;
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+		up_write(&ni->runlist.lock);
+		/*
+		 * The only thing that is now wrong is the allocated size of the
+		 * base attribute extent which chkdsk should be able to fix.
+		 */
+		NVolSetErrors(vol);
+		return err;
+	}
+	ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
+			(allocated_size >> vol->cluster_size_bits) - 1);
+undo_alloc:
+	ll = allocated_size >> vol->cluster_size_bits;
+	if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
+		ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
+				"in error code path.  Run chkdsk to recover "
+				"the lost cluster(s).");
+		NVolSetErrors(vol);
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	/*
+	 * If the runlist truncation fails and/or the search context is no
+	 * longer valid, we cannot resize the attribute record or build the
+	 * mapping pairs array thus we mark the inode bad so that no access to
+	 * the freed clusters can happen.
+	 */
+	if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
+		ntfs_error(vol->sb, "Failed to %s in error code path.  Run "
+				"chkdsk to recover.", IS_ERR(m) ?
+				"restore attribute search context" :
+				"truncate attribute runlist");
+		NVolSetErrors(vol);
+	} else if (mp_rebuilt) {
+		if (ntfs_attr_record_resize(m, a, attr_len)) {
+			ntfs_error(vol->sb, "Failed to restore attribute "
+					"record in error code path.  Run "
+					"chkdsk to recover.");
+			NVolSetErrors(vol);
+		} else /* if (success) */ {
+			if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+					a->data.non_resident.
+					mapping_pairs_offset), attr_len -
+					le16_to_cpu(a->data.non_resident.
+					mapping_pairs_offset), rl2, ll, -1,
+					NULL)) {
+				ntfs_error(vol->sb, "Failed to restore "
+						"mapping pairs array in error "
+						"code path.  Run chkdsk to "
+						"recover.");
+				NVolSetErrors(vol);
+			}
+			flush_dcache_mft_record_page(ctx->ntfs_ino);
+			mark_mft_record_dirty(ctx->ntfs_ino);
+		}
+	}
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+conv_err_out:
+	ntfs_debug("Failed.  Returning error code %i.", err);
+	return err;
+}
+
+/**
+ * ntfs_attr_set - fill (a part of) an attribute with a byte
+ * @ni:		ntfs inode describing the attribute to fill
+ * @ofs:	offset inside the attribute at which to start to fill
+ * @cnt:	number of bytes to fill
+ * @val:	the unsigned 8-bit value with which to fill the attribute
+ *
+ * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
+ * byte offset @ofs inside the attribute with the constant byte @val.
+ *
+ * This function is effectively like memset() applied to an ntfs attribute.
+ * Note thie function actually only operates on the page cache pages belonging
+ * to the ntfs attribute and it marks them dirty after doing the memset().
+ * Thus it relies on the vm dirty page write code paths to cause the modified
+ * pages to be written to the mft record/disk.
+ *
+ * Return 0 on success and -errno on error.  An error code of -ESPIPE means
+ * that @ofs + @cnt were outside the end of the attribute and no write was
+ * performed.
+ */
+int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
+{
+	ntfs_volume *vol = ni->vol;
+	struct address_space *mapping;
+	struct page *page;
+	u8 *kaddr;
+	pgoff_t idx, end;
+	unsigned start_ofs, end_ofs, size;
+
+	ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
+			(long long)ofs, (long long)cnt, val);
+	BUG_ON(ofs < 0);
+	BUG_ON(cnt < 0);
+	if (!cnt)
+		goto done;
+	/*
+	 * FIXME: Compressed and encrypted attributes are not supported when
+	 * writing and we should never have gotten here for them.
+	 */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoEncrypted(ni));
+	mapping = VFS_I(ni)->i_mapping;
+	/* Work out the starting index and page offset. */
+	idx = ofs >> PAGE_CACHE_SHIFT;
+	start_ofs = ofs & ~PAGE_CACHE_MASK;
+	/* Work out the ending index and page offset. */
+	end = ofs + cnt;
+	end_ofs = end & ~PAGE_CACHE_MASK;
+	/* If the end is outside the inode size return -ESPIPE. */
+	if (unlikely(end > i_size_read(VFS_I(ni)))) {
+		ntfs_error(vol->sb, "Request exceeds end of attribute.");
+		return -ESPIPE;
+	}
+	end >>= PAGE_CACHE_SHIFT;
+	/* If there is a first partial page, need to do it the slow way. */
+	if (start_ofs) {
+		page = read_mapping_page(mapping, idx, NULL);
+		if (IS_ERR(page)) {
+			ntfs_error(vol->sb, "Failed to read first partial "
+					"page (error, index 0x%lx).", idx);
+			return PTR_ERR(page);
+		}
+		/*
+		 * If the last page is the same as the first page, need to
+		 * limit the write to the end offset.
+		 */
+		size = PAGE_CACHE_SIZE;
+		if (idx == end)
+			size = end_ofs;
+		kaddr = kmap_atomic(page);
+		memset(kaddr + start_ofs, val, size - start_ofs);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr);
+		set_page_dirty(page);
+		page_cache_release(page);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+		if (idx == end)
+			goto done;
+		idx++;
+	}
+	/* Do the whole pages the fast way. */
+	for (; idx < end; idx++) {
+		/* Find or create the current page.  (The page is locked.) */
+		page = grab_cache_page(mapping, idx);
+		if (unlikely(!page)) {
+			ntfs_error(vol->sb, "Insufficient memory to grab "
+					"page (index 0x%lx).", idx);
+			return -ENOMEM;
+		}
+		kaddr = kmap_atomic(page);
+		memset(kaddr, val, PAGE_CACHE_SIZE);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr);
+		/*
+		 * If the page has buffers, mark them uptodate since buffer
+		 * state and not page state is definitive in 2.6 kernels.
+		 */
+		if (page_has_buffers(page)) {
+			struct buffer_head *bh, *head;
+
+			bh = head = page_buffers(page);
+			do {
+				set_buffer_uptodate(bh);
+			} while ((bh = bh->b_this_page) != head);
+		}
+		/* Now that buffers are uptodate, set the page uptodate, too. */
+		SetPageUptodate(page);
+		/*
+		 * Set the page and all its buffers dirty and mark the inode
+		 * dirty, too.  The VM will write the page later on.
+		 */
+		set_page_dirty(page);
+		/* Finally unlock and release the page. */
+		unlock_page(page);
+		page_cache_release(page);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	}
+	/* If there is a last partial page, need to do it the slow way. */
+	if (end_ofs) {
+		page = read_mapping_page(mapping, idx, NULL);
+		if (IS_ERR(page)) {
+			ntfs_error(vol->sb, "Failed to read last partial page "
+					"(error, index 0x%lx).", idx);
+			return PTR_ERR(page);
+		}
+		kaddr = kmap_atomic(page);
+		memset(kaddr, val, end_ofs);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr);
+		set_page_dirty(page);
+		page_cache_release(page);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	}
+done:
+	ntfs_debug("Done.");
+	return 0;
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/attrib.h b/kernel/fs/ntfs/attrib.h
new file mode 100644
index 000000000..3c8b74c99
--- /dev/null
+++ b/kernel/fs/ntfs/attrib.h
@@ -0,0 +1,116 @@
+/*
+ * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
+ *	      Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_ATTRIB_H
+#define _LINUX_NTFS_ATTRIB_H
+
+#include "endian.h"
+#include "types.h"
+#include "layout.h"
+#include "inode.h"
+#include "runlist.h"
+#include "volume.h"
+
+/**
+ * ntfs_attr_search_ctx - used in attribute search functions
+ * @mrec:	buffer containing mft record to search
+ * @attr:	attribute record in @mrec where to begin/continue search
+ * @is_first:	if true ntfs_attr_lookup() begins search with @attr, else after
+ *
+ * Structure must be initialized to zero before the first call to one of the
+ * attribute search functions. Initialize @mrec to point to the mft record to
+ * search, and @attr to point to the first attribute within @mrec (not necessary
+ * if calling the _first() functions), and set @is_first to 'true' (not necessary
+ * if calling the _first() functions).
+ *
+ * If @is_first is 'true', the search begins with @attr. If @is_first is 'false',
+ * the search begins after @attr. This is so that, after the first call to one
+ * of the search attribute functions, we can call the function again, without
+ * any modification of the search context, to automagically get the next
+ * matching attribute.
+ */
+typedef struct {
+	MFT_RECORD *mrec;
+	ATTR_RECORD *attr;
+	bool is_first;
+	ntfs_inode *ntfs_ino;
+	ATTR_LIST_ENTRY *al_entry;
+	ntfs_inode *base_ntfs_ino;
+	MFT_RECORD *base_mrec;
+	ATTR_RECORD *base_attr;
+} ntfs_attr_search_ctx;
+
+extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
+		ntfs_attr_search_ctx *ctx);
+extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
+
+extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
+		const bool write_locked);
+
+extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
+		const VCN vcn, ntfs_attr_search_ctx *ctx);
+
+int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
+		const u32 name_len, const IGNORE_CASE_BOOL ic,
+		const VCN lowest_vcn, const u8 *val, const u32 val_len,
+		ntfs_attr_search_ctx *ctx);
+
+extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
+		const s64 size, const s64 initialized_size);
+
+static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
+{
+	if (!a->non_resident)
+		return (s64)le32_to_cpu(a->data.resident.value_length);
+	return sle64_to_cpu(a->data.non_resident.data_size);
+}
+
+extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
+extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
+		MFT_RECORD *mrec);
+extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
+
+#ifdef NTFS_RW
+
+extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
+		const ATTR_TYPE type, const s64 size);
+extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
+		const ATTR_TYPE type);
+extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
+		const ATTR_TYPE type);
+
+extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+		const u32 new_size);
+
+extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
+
+extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+		const s64 new_data_size, const s64 data_start);
+
+extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
+		const u8 val);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/kernel/fs/ntfs/bitmap.c b/kernel/fs/ntfs/bitmap.c
new file mode 100644
index 000000000..0809cf876
--- /dev/null
+++ b/kernel/fs/ntfs/bitmap.c
@@ -0,0 +1,193 @@
+/*
+ * bitmap.c - NTFS kernel bitmap handling.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/pagemap.h>
+
+#include "bitmap.h"
+#include "debug.h"
+#include "aops.h"
+#include "ntfs.h"
+
+/**
+ * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
+ * @vi:			vfs inode describing the bitmap
+ * @start_bit:		first bit to set
+ * @count:		number of bits to set
+ * @value:		value to set the bits to (i.e. 0 or 1)
+ * @is_rollback:	if 'true' this is a rollback operation
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi to @value, where @value is either 0 or 1.
+ *
+ * @is_rollback should always be 'false', it is for internal use to rollback
+ * errors.  You probably want to use ntfs_bitmap_set_bits_in_run() instead.
+ *
+ * Return 0 on success and -errno on error.
+ */
+int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
+		const s64 count, const u8 value, const bool is_rollback)
+{
+	s64 cnt = count;
+	pgoff_t index, end_index;
+	struct address_space *mapping;
+	struct page *page;
+	u8 *kaddr;
+	int pos, len;
+	u8 bit;
+
+	BUG_ON(!vi);
+	ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, "
+			"value %u.%s", vi->i_ino, (unsigned long long)start_bit,
+			(unsigned long long)cnt, (unsigned int)value,
+			is_rollback ? " (rollback)" : "");
+	BUG_ON(start_bit < 0);
+	BUG_ON(cnt < 0);
+	BUG_ON(value > 1);
+	/*
+	 * Calculate the indices for the pages containing the first and last
+	 * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
+	 */
+	index = start_bit >> (3 + PAGE_CACHE_SHIFT);
+	end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT);
+
+	/* Get the page containing the first bit (@start_bit). */
+	mapping = vi->i_mapping;
+	page = ntfs_map_page(mapping, index);
+	if (IS_ERR(page)) {
+		if (!is_rollback)
+			ntfs_error(vi->i_sb, "Failed to map first page (error "
+					"%li), aborting.", PTR_ERR(page));
+		return PTR_ERR(page);
+	}
+	kaddr = page_address(page);
+
+	/* Set @pos to the position of the byte containing @start_bit. */
+	pos = (start_bit >> 3) & ~PAGE_CACHE_MASK;
+
+	/* Calculate the position of @start_bit in the first byte. */
+	bit = start_bit & 7;
+
+	/* If the first byte is partial, modify the appropriate bits in it. */
+	if (bit) {
+		u8 *byte = kaddr + pos;
+		while ((bit & 7) && cnt) {
+			cnt--;
+			if (value)
+				*byte |= 1 << bit++;
+			else
+				*byte &= ~(1 << bit++);
+		}
+		/* If we are done, unmap the page and return success. */
+		if (!cnt)
+			goto done;
+
+		/* Update @pos to the new position. */
+		pos++;
+	}
+	/*
+	 * Depending on @value, modify all remaining whole bytes in the page up
+	 * to @cnt.
+	 */
+	len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos);
+	memset(kaddr + pos, value ? 0xff : 0, len);
+	cnt -= len << 3;
+
+	/* Update @len to point to the first not-done byte in the page. */
+	if (cnt < 8)
+		len += pos;
+
+	/* If we are not in the last page, deal with all subsequent pages. */
+	while (index < end_index) {
+		BUG_ON(cnt <= 0);
+
+		/* Update @index and get the next page. */
+		flush_dcache_page(page);
+		set_page_dirty(page);
+		ntfs_unmap_page(page);
+		page = ntfs_map_page(mapping, ++index);
+		if (IS_ERR(page))
+			goto rollback;
+		kaddr = page_address(page);
+		/*
+		 * Depending on @value, modify all remaining whole bytes in the
+		 * page up to @cnt.
+		 */
+		len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE);
+		memset(kaddr, value ? 0xff : 0, len);
+		cnt -= len << 3;
+	}
+	/*
+	 * The currently mapped page is the last one.  If the last byte is
+	 * partial, modify the appropriate bits in it.  Note, @len is the
+	 * position of the last byte inside the page.
+	 */
+	if (cnt) {
+		u8 *byte;
+
+		BUG_ON(cnt > 7);
+
+		bit = cnt;
+		byte = kaddr + len;
+		while (bit--) {
+			if (value)
+				*byte |= 1 << bit;
+			else
+				*byte &= ~(1 << bit);
+		}
+	}
+done:
+	/* We are done.  Unmap the page and return success. */
+	flush_dcache_page(page);
+	set_page_dirty(page);
+	ntfs_unmap_page(page);
+	ntfs_debug("Done.");
+	return 0;
+rollback:
+	/*
+	 * Current state:
+	 *	- no pages are mapped
+	 *	- @count - @cnt is the number of bits that have been modified
+	 */
+	if (is_rollback)
+		return PTR_ERR(page);
+	if (count != cnt)
+		pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
+				value ? 0 : 1, true);
+	else
+		pos = 0;
+	if (!pos) {
+		/* Rollback was successful. */
+		ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
+				"%li), aborting.", PTR_ERR(page));
+	} else {
+		/* Rollback failed. */
+		ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
+				"%li) and rollback failed (error %i).  "
+				"Aborting and leaving inconsistent metadata.  "
+				"Unmount and run chkdsk.", PTR_ERR(page), pos);
+		NVolSetErrors(NTFS_SB(vi->i_sb));
+	}
+	return PTR_ERR(page);
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/bitmap.h b/kernel/fs/ntfs/bitmap.h
new file mode 100644
index 000000000..72c9ad8be
--- /dev/null
+++ b/kernel/fs/ntfs/bitmap.h
@@ -0,0 +1,118 @@
+/*
+ * bitmap.h - Defines for NTFS kernel bitmap handling.  Part of the Linux-NTFS
+ *	      project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_BITMAP_H
+#define _LINUX_NTFS_BITMAP_H
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+
+#include "types.h"
+
+extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
+		const s64 count, const u8 value, const bool is_rollback);
+
+/**
+ * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
+ * @vi:			vfs inode describing the bitmap
+ * @start_bit:		first bit to set
+ * @count:		number of bits to set
+ * @value:		value to set the bits to (i.e. 0 or 1)
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi to @value, where @value is either 0 or 1.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
+		const s64 start_bit, const s64 count, const u8 value)
+{
+	return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
+			false);
+}
+
+/**
+ * ntfs_bitmap_set_run - set a run of bits in a bitmap
+ * @vi:		vfs inode describing the bitmap
+ * @start_bit:	first bit to set
+ * @count:	number of bits to set
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
+		const s64 count)
+{
+	return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1);
+}
+
+/**
+ * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
+ * @vi:		vfs inode describing the bitmap
+ * @start_bit:	first bit to clear
+ * @count:	number of bits to clear
+ *
+ * Clear @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
+		const s64 count)
+{
+	return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0);
+}
+
+/**
+ * ntfs_bitmap_set_bit - set a bit in a bitmap
+ * @vi:		vfs inode describing the bitmap
+ * @bit:	bit to set
+ *
+ * Set bit @bit in the bitmap described by the vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
+{
+	return ntfs_bitmap_set_run(vi, bit, 1);
+}
+
+/**
+ * ntfs_bitmap_clear_bit - clear a bit in a bitmap
+ * @vi:		vfs inode describing the bitmap
+ * @bit:	bit to clear
+ *
+ * Clear bit @bit in the bitmap described by the vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
+{
+	return ntfs_bitmap_clear_run(vi, bit, 1);
+}
+
+#endif /* NTFS_RW */
+
+#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/kernel/fs/ntfs/collate.c b/kernel/fs/ntfs/collate.c
new file mode 100644
index 000000000..4a28ab389
--- /dev/null
+++ b/kernel/fs/ntfs/collate.c
@@ -0,0 +1,124 @@
+/*
+ * collate.c - NTFS kernel collation handling.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "collate.h"
+#include "debug.h"
+#include "ntfs.h"
+
+static int ntfs_collate_binary(ntfs_volume *vol,
+		const void *data1, const int data1_len,
+		const void *data2, const int data2_len)
+{
+	int rc;
+
+	ntfs_debug("Entering.");
+	rc = memcmp(data1, data2, min(data1_len, data2_len));
+	if (!rc && (data1_len != data2_len)) {
+		if (data1_len < data2_len)
+			rc = -1;
+		else
+			rc = 1;
+	}
+	ntfs_debug("Done, returning %i", rc);
+	return rc;
+}
+
+static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
+		const void *data1, const int data1_len,
+		const void *data2, const int data2_len)
+{
+	int rc;
+	u32 d1, d2;
+
+	ntfs_debug("Entering.");
+	// FIXME:  We don't really want to bug here.
+	BUG_ON(data1_len != data2_len);
+	BUG_ON(data1_len != 4);
+	d1 = le32_to_cpup(data1);
+	d2 = le32_to_cpup(data2);
+	if (d1 < d2)
+		rc = -1;
+	else {
+		if (d1 == d2)
+			rc = 0;
+		else
+			rc = 1;
+	}
+	ntfs_debug("Done, returning %i", rc);
+	return rc;
+}
+
+typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
+		const void *, const int);
+
+static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
+	ntfs_collate_binary,
+	NULL/*ntfs_collate_file_name*/,
+	NULL/*ntfs_collate_unicode_string*/,
+};
+
+static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
+	ntfs_collate_ntofs_ulong,
+	NULL/*ntfs_collate_ntofs_sid*/,
+	NULL/*ntfs_collate_ntofs_security_hash*/,
+	NULL/*ntfs_collate_ntofs_ulongs*/,
+};
+
+/**
+ * ntfs_collate - collate two data items using a specified collation rule
+ * @vol:	ntfs volume to which the data items belong
+ * @cr:		collation rule to use when comparing the items
+ * @data1:	first data item to collate
+ * @data1_len:	length in bytes of @data1
+ * @data2:	second data item to collate
+ * @data2_len:	length in bytes of @data2
+ *
+ * Collate the two data items @data1 and @data2 using the collation rule @cr
+ * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before,
+ * to match, or to collate after @data2.
+ *
+ * For speed we use the collation rule @cr as an index into two tables of
+ * function pointers to call the appropriate collation function.
+ */
+int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+		const void *data1, const int data1_len,
+		const void *data2, const int data2_len) {
+	int i;
+
+	ntfs_debug("Entering.");
+	/*
+	 * FIXME:  At the moment we only support COLLATION_BINARY and
+	 * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
+	 */
+	BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG);
+	i = le32_to_cpu(cr);
+	BUG_ON(i < 0);
+	if (i <= 0x02)
+		return ntfs_do_collate0x0[i](vol, data1, data1_len,
+				data2, data2_len);
+	BUG_ON(i < 0x10);
+	i -= 0x10;
+	if (likely(i <= 3))
+		return ntfs_do_collate0x1[i](vol, data1, data1_len,
+				data2, data2_len);
+	BUG();
+	return 0;
+}
diff --git a/kernel/fs/ntfs/collate.h b/kernel/fs/ntfs/collate.h
new file mode 100644
index 000000000..aba83347e
--- /dev/null
+++ b/kernel/fs/ntfs/collate.h
@@ -0,0 +1,50 @@
+/*
+ * collate.h - Defines for NTFS kernel collation handling.  Part of the
+ *	       Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_COLLATE_H
+#define _LINUX_NTFS_COLLATE_H
+
+#include "types.h"
+#include "volume.h"
+
+static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
+	int i;
+
+	/*
+	 * FIXME:  At the moment we only support COLLATION_BINARY and
+	 * COLLATION_NTOFS_ULONG, so we return false for everything else for
+	 * now.
+	 */
+	if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
+		return false;
+	i = le32_to_cpu(cr);
+	if (likely(((i >= 0) && (i <= 0x02)) ||
+			((i >= 0x10) && (i <= 0x13))))
+		return true;
+	return false;
+}
+
+extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+		const void *data1, const int data1_len,
+		const void *data2, const int data2_len);
+
+#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/kernel/fs/ntfs/compress.c b/kernel/fs/ntfs/compress.c
new file mode 100644
index 000000000..f82498c35
--- /dev/null
+++ b/kernel/fs/ntfs/compress.c
@@ -0,0 +1,969 @@
+/**
+ * compress.c - NTFS kernel compressed attributes handling.
+ *		Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include "attrib.h"
+#include "inode.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_compression_constants - enum of constants used in the compression code
+ */
+typedef enum {
+	/* Token types and access mask. */
+	NTFS_SYMBOL_TOKEN	=	0,
+	NTFS_PHRASE_TOKEN	=	1,
+	NTFS_TOKEN_MASK		=	1,
+
+	/* Compression sub-block constants. */
+	NTFS_SB_SIZE_MASK	=	0x0fff,
+	NTFS_SB_SIZE		=	0x1000,
+	NTFS_SB_IS_COMPRESSED	=	0x8000,
+
+	/*
+	 * The maximum compression block size is by definition 16 * the cluster
+	 * size, with the maximum supported cluster size being 4kiB. Thus the
+	 * maximum compression buffer size is 64kiB, so we use this when
+	 * initializing the compression buffer.
+	 */
+	NTFS_MAX_CB_SIZE	= 64 * 1024,
+} ntfs_compression_constants;
+
+/**
+ * ntfs_compression_buffer - one buffer for the decompression engine
+ */
+static u8 *ntfs_compression_buffer;
+
+/**
+ * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
+ */
+static DEFINE_SPINLOCK(ntfs_cb_lock);
+
+/**
+ * allocate_compression_buffers - allocate the decompression buffers
+ *
+ * Caller has to hold the ntfs_lock mutex.
+ *
+ * Return 0 on success or -ENOMEM if the allocations failed.
+ */
+int allocate_compression_buffers(void)
+{
+	BUG_ON(ntfs_compression_buffer);
+
+	ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
+	if (!ntfs_compression_buffer)
+		return -ENOMEM;
+	return 0;
+}
+
+/**
+ * free_compression_buffers - free the decompression buffers
+ *
+ * Caller has to hold the ntfs_lock mutex.
+ */
+void free_compression_buffers(void)
+{
+	BUG_ON(!ntfs_compression_buffer);
+	vfree(ntfs_compression_buffer);
+	ntfs_compression_buffer = NULL;
+}
+
+/**
+ * zero_partial_compressed_page - zero out of bounds compressed page region
+ */
+static void zero_partial_compressed_page(struct page *page,
+		const s64 initialized_size)
+{
+	u8 *kp = page_address(page);
+	unsigned int kp_ofs;
+
+	ntfs_debug("Zeroing page region outside initialized size.");
+	if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) {
+		/*
+		 * FIXME: Using clear_page() will become wrong when we get
+		 * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
+		 */
+		clear_page(kp);
+		return;
+	}
+	kp_ofs = initialized_size & ~PAGE_CACHE_MASK;
+	memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs);
+	return;
+}
+
+/**
+ * handle_bounds_compressed_page - test for&handle out of bounds compressed page
+ */
+static inline void handle_bounds_compressed_page(struct page *page,
+		const loff_t i_size, const s64 initialized_size)
+{
+	if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) &&
+			(initialized_size < i_size))
+		zero_partial_compressed_page(page, initialized_size);
+	return;
+}
+
+/**
+ * ntfs_decompress - decompress a compression block into an array of pages
+ * @dest_pages:		destination array of pages
+ * @dest_index:		current index into @dest_pages (IN/OUT)
+ * @dest_ofs:		current offset within @dest_pages[@dest_index] (IN/OUT)
+ * @dest_max_index:	maximum index into @dest_pages (IN)
+ * @dest_max_ofs:	maximum offset within @dest_pages[@dest_max_index] (IN)
+ * @xpage:		the target page (-1 if none) (IN)
+ * @xpage_done:		set to 1 if xpage was completed successfully (IN/OUT)
+ * @cb_start:		compression block to decompress (IN)
+ * @cb_size:		size of compression block @cb_start in bytes (IN)
+ * @i_size:		file size when we started the read (IN)
+ * @initialized_size:	initialized file size when we started the read (IN)
+ *
+ * The caller must have disabled preemption. ntfs_decompress() reenables it when
+ * the critical section is finished.
+ *
+ * This decompresses the compression block @cb_start into the array of
+ * destination pages @dest_pages starting at index @dest_index into @dest_pages
+ * and at offset @dest_pos into the page @dest_pages[@dest_index].
+ *
+ * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
+ * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
+ *
+ * @cb_start is a pointer to the compression block which needs decompressing
+ * and @cb_size is the size of @cb_start in bytes (8-64kiB).
+ *
+ * Return 0 if success or -EOVERFLOW on error in the compressed stream.
+ * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
+ * completed during the decompression of the compression block (@cb_start).
+ *
+ * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up
+ * unpredicatbly! You have been warned!
+ *
+ * Note to hackers: This function may not sleep until it has finished accessing
+ * the compression block @cb_start as it is a per-CPU buffer.
+ */
+static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
+		int *dest_ofs, const int dest_max_index, const int dest_max_ofs,
+		const int xpage, char *xpage_done, u8 *const cb_start,
+		const u32 cb_size, const loff_t i_size,
+		const s64 initialized_size)
+{
+	/*
+	 * Pointers into the compressed data, i.e. the compression block (cb),
+	 * and the therein contained sub-blocks (sb).
+	 */
+	u8 *cb_end = cb_start + cb_size; /* End of cb. */
+	u8 *cb = cb_start;	/* Current position in cb. */
+	u8 *cb_sb_start = cb;	/* Beginning of the current sb in the cb. */
+	u8 *cb_sb_end;		/* End of current sb / beginning of next sb. */
+
+	/* Variables for uncompressed data / destination. */
+	struct page *dp;	/* Current destination page being worked on. */
+	u8 *dp_addr;		/* Current pointer into dp. */
+	u8 *dp_sb_start;	/* Start of current sub-block in dp. */
+	u8 *dp_sb_end;		/* End of current sb in dp (dp_sb_start +
+				   NTFS_SB_SIZE). */
+	u16 do_sb_start;	/* @dest_ofs when starting this sub-block. */
+	u16 do_sb_end;		/* @dest_ofs of end of this sb (do_sb_start +
+				   NTFS_SB_SIZE). */
+
+	/* Variables for tag and token parsing. */
+	u8 tag;			/* Current tag. */
+	int token;		/* Loop counter for the eight tokens in tag. */
+
+	/* Need this because we can't sleep, so need two stages. */
+	int completed_pages[dest_max_index - *dest_index + 1];
+	int nr_completed_pages = 0;
+
+	/* Default error code. */
+	int err = -EOVERFLOW;
+
+	ntfs_debug("Entering, cb_size = 0x%x.", cb_size);
+do_next_sb:
+	ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.",
+			cb - cb_start);
+	/*
+	 * Have we reached the end of the compression block or the end of the
+	 * decompressed data?  The latter can happen for example if the current
+	 * position in the compression block is one byte before its end so the
+	 * first two checks do not detect it.
+	 */
+	if (cb == cb_end || !le16_to_cpup((le16*)cb) ||
+			(*dest_index == dest_max_index &&
+			*dest_ofs == dest_max_ofs)) {
+		int i;
+
+		ntfs_debug("Completed. Returning success (0).");
+		err = 0;
+return_error:
+		/* We can sleep from now on, so we drop lock. */
+		spin_unlock(&ntfs_cb_lock);
+		/* Second stage: finalize completed pages. */
+		if (nr_completed_pages > 0) {
+			for (i = 0; i < nr_completed_pages; i++) {
+				int di = completed_pages[i];
+
+				dp = dest_pages[di];
+				/*
+				 * If we are outside the initialized size, zero
+				 * the out of bounds page range.
+				 */
+				handle_bounds_compressed_page(dp, i_size,
+						initialized_size);
+				flush_dcache_page(dp);
+				kunmap(dp);
+				SetPageUptodate(dp);
+				unlock_page(dp);
+				if (di == xpage)
+					*xpage_done = 1;
+				else
+					page_cache_release(dp);
+				dest_pages[di] = NULL;
+			}
+		}
+		return err;
+	}
+
+	/* Setup offsets for the current sub-block destination. */
+	do_sb_start = *dest_ofs;
+	do_sb_end = do_sb_start + NTFS_SB_SIZE;
+
+	/* Check that we are still within allowed boundaries. */
+	if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs)
+		goto return_overflow;
+
+	/* Does the minimum size of a compressed sb overflow valid range? */
+	if (cb + 6 > cb_end)
+		goto return_overflow;
+
+	/* Setup the current sub-block source pointers and validate range. */
+	cb_sb_start = cb;
+	cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK)
+			+ 3;
+	if (cb_sb_end > cb_end)
+		goto return_overflow;
+
+	/* Get the current destination page. */
+	dp = dest_pages[*dest_index];
+	if (!dp) {
+		/* No page present. Skip decompression of this sub-block. */
+		cb = cb_sb_end;
+
+		/* Advance destination position to next sub-block. */
+		*dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK;
+		if (!*dest_ofs && (++*dest_index > dest_max_index))
+			goto return_overflow;
+		goto do_next_sb;
+	}
+
+	/* We have a valid destination page. Setup the destination pointers. */
+	dp_addr = (u8*)page_address(dp) + do_sb_start;
+
+	/* Now, we are ready to process the current sub-block (sb). */
+	if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
+		ntfs_debug("Found uncompressed sub-block.");
+		/* This sb is not compressed, just copy it into destination. */
+
+		/* Advance source position to first data byte. */
+		cb += 2;
+
+		/* An uncompressed sb must be full size. */
+		if (cb_sb_end - cb != NTFS_SB_SIZE)
+			goto return_overflow;
+
+		/* Copy the block and advance the source position. */
+		memcpy(dp_addr, cb, NTFS_SB_SIZE);
+		cb += NTFS_SB_SIZE;
+
+		/* Advance destination position to next sub-block. */
+		*dest_ofs += NTFS_SB_SIZE;
+		if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) {
+finalize_page:
+			/*
+			 * First stage: add current page index to array of
+			 * completed pages.
+			 */
+			completed_pages[nr_completed_pages++] = *dest_index;
+			if (++*dest_index > dest_max_index)
+				goto return_overflow;
+		}
+		goto do_next_sb;
+	}
+	ntfs_debug("Found compressed sub-block.");
+	/* This sb is compressed, decompress it into destination. */
+
+	/* Setup destination pointers. */
+	dp_sb_start = dp_addr;
+	dp_sb_end = dp_sb_start + NTFS_SB_SIZE;
+
+	/* Forward to the first tag in the sub-block. */
+	cb += 2;
+do_next_tag:
+	if (cb == cb_sb_end) {
+		/* Check if the decompressed sub-block was not full-length. */
+		if (dp_addr < dp_sb_end) {
+			int nr_bytes = do_sb_end - *dest_ofs;
+
+			ntfs_debug("Filling incomplete sub-block with "
+					"zeroes.");
+			/* Zero remainder and update destination position. */
+			memset(dp_addr, 0, nr_bytes);
+			*dest_ofs += nr_bytes;
+		}
+		/* We have finished the current sub-block. */
+		if (!(*dest_ofs &= ~PAGE_CACHE_MASK))
+			goto finalize_page;
+		goto do_next_sb;
+	}
+
+	/* Check we are still in range. */
+	if (cb > cb_sb_end || dp_addr > dp_sb_end)
+		goto return_overflow;
+
+	/* Get the next tag and advance to first token. */
+	tag = *cb++;
+
+	/* Parse the eight tokens described by the tag. */
+	for (token = 0; token < 8; token++, tag >>= 1) {
+		u16 lg, pt, length, max_non_overlap;
+		register u16 i;
+		u8 *dp_back_addr;
+
+		/* Check if we are done / still in range. */
+		if (cb >= cb_sb_end || dp_addr > dp_sb_end)
+			break;
+
+		/* Determine token type and parse appropriately.*/
+		if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
+			/*
+			 * We have a symbol token, copy the symbol across, and
+			 * advance the source and destination positions.
+			 */
+			*dp_addr++ = *cb++;
+			++*dest_ofs;
+
+			/* Continue with the next token. */
+			continue;
+		}
+
+		/*
+		 * We have a phrase token. Make sure it is not the first tag in
+		 * the sb as this is illegal and would confuse the code below.
+		 */
+		if (dp_addr == dp_sb_start)
+			goto return_overflow;
+
+		/*
+		 * Determine the number of bytes to go back (p) and the number
+		 * of bytes to copy (l). We use an optimized algorithm in which
+		 * we first calculate log2(current destination position in sb),
+		 * which allows determination of l and p in O(1) rather than
+		 * O(n). We just need an arch-optimized log2() function now.
+		 */
+		lg = 0;
+		for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1)
+			lg++;
+
+		/* Get the phrase token into i. */
+		pt = le16_to_cpup((le16*)cb);
+
+		/*
+		 * Calculate starting position of the byte sequence in
+		 * the destination using the fact that p = (pt >> (12 - lg)) + 1
+		 * and make sure we don't go too far back.
+		 */
+		dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1;
+		if (dp_back_addr < dp_sb_start)
+			goto return_overflow;
+
+		/* Now calculate the length of the byte sequence. */
+		length = (pt & (0xfff >> lg)) + 3;
+
+		/* Advance destination position and verify it is in range. */
+		*dest_ofs += length;
+		if (*dest_ofs > do_sb_end)
+			goto return_overflow;
+
+		/* The number of non-overlapping bytes. */
+		max_non_overlap = dp_addr - dp_back_addr;
+
+		if (length <= max_non_overlap) {
+			/* The byte sequence doesn't overlap, just copy it. */
+			memcpy(dp_addr, dp_back_addr, length);
+
+			/* Advance destination pointer. */
+			dp_addr += length;
+		} else {
+			/*
+			 * The byte sequence does overlap, copy non-overlapping
+			 * part and then do a slow byte by byte copy for the
+			 * overlapping part. Also, advance the destination
+			 * pointer.
+			 */
+			memcpy(dp_addr, dp_back_addr, max_non_overlap);
+			dp_addr += max_non_overlap;
+			dp_back_addr += max_non_overlap;
+			length -= max_non_overlap;
+			while (length--)
+				*dp_addr++ = *dp_back_addr++;
+		}
+
+		/* Advance source position and continue with the next token. */
+		cb += 2;
+	}
+
+	/* No tokens left in the current tag. Continue with the next tag. */
+	goto do_next_tag;
+
+return_overflow:
+	ntfs_error(NULL, "Failed. Returning -EOVERFLOW.");
+	goto return_error;
+}
+
+/**
+ * ntfs_read_compressed_block - read a compressed block into the page cache
+ * @page:	locked page in the compression block(s) we need to read
+ *
+ * When we are called the page has already been verified to be locked and the
+ * attribute is known to be non-resident, not encrypted, but compressed.
+ *
+ * 1. Determine which compression block(s) @page is in.
+ * 2. Get hold of all pages corresponding to this/these compression block(s).
+ * 3. Read the (first) compression block.
+ * 4. Decompress it into the corresponding pages.
+ * 5. Throw the compressed data away and proceed to 3. for the next compression
+ *    block or return success if no more compression blocks left.
+ *
+ * Warning: We have to be careful what we do about existing pages. They might
+ * have been written to so that we would lose data if we were to just overwrite
+ * them with the out-of-date uncompressed data.
+ *
+ * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at
+ * the end of the file I think. We need to detect this case and zero the out
+ * of bounds remainder of the page in question and mark it as handled. At the
+ * moment we would just return -EIO on such a page. This bug will only become
+ * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte
+ * clusters so is probably not going to be seen by anyone. Still this should
+ * be fixed. (AIA)
+ *
+ * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in
+ * handling sparse and compressed cbs. (AIA)
+ *
+ * FIXME: At the moment we don't do any zeroing out in the case that
+ * initialized_size is less than data_size. This should be safe because of the
+ * nature of the compression algorithm used. Just in case we check and output
+ * an error message in read inode if the two sizes are not equal for a
+ * compressed file. (AIA)
+ */
+int ntfs_read_compressed_block(struct page *page)
+{
+	loff_t i_size;
+	s64 initialized_size;
+	struct address_space *mapping = page->mapping;
+	ntfs_inode *ni = NTFS_I(mapping->host);
+	ntfs_volume *vol = ni->vol;
+	struct super_block *sb = vol->sb;
+	runlist_element *rl;
+	unsigned long flags, block_size = sb->s_blocksize;
+	unsigned char block_size_bits = sb->s_blocksize_bits;
+	u8 *cb, *cb_pos, *cb_end;
+	struct buffer_head **bhs;
+	unsigned long offset, index = page->index;
+	u32 cb_size = ni->itype.compressed.block_size;
+	u64 cb_size_mask = cb_size - 1UL;
+	VCN vcn;
+	LCN lcn;
+	/* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */
+	VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
+			vol->cluster_size_bits;
+	/*
+	 * The first vcn after the last wanted vcn (minimum alignment is again
+	 * PAGE_CACHE_SIZE.
+	 */
+	VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
+			& ~cb_size_mask) >> vol->cluster_size_bits;
+	/* Number of compression blocks (cbs) in the wanted vcn range. */
+	unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
+			>> ni->itype.compressed.block_size_bits;
+	/*
+	 * Number of pages required to store the uncompressed data from all
+	 * compression blocks (cbs) overlapping @page. Due to alignment
+	 * guarantees of start_vcn and end_vcn, no need to round up here.
+	 */
+	unsigned int nr_pages = (end_vcn - start_vcn) <<
+			vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+	unsigned int xpage, max_page, cur_page, cur_ofs, i;
+	unsigned int cb_clusters, cb_max_ofs;
+	int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
+	struct page **pages;
+	unsigned char xpage_done = 0;
+
+	ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
+			"%i.", index, cb_size, nr_pages);
+	/*
+	 * Bad things happen if we get here for anything that is not an
+	 * unnamed $DATA attribute.
+	 */
+	BUG_ON(ni->type != AT_DATA);
+	BUG_ON(ni->name_len);
+
+	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_NOFS);
+
+	/* Allocate memory to store the buffer heads we need. */
+	bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
+	bhs = kmalloc(bhs_size, GFP_NOFS);
+
+	if (unlikely(!pages || !bhs)) {
+		kfree(bhs);
+		kfree(pages);
+		unlock_page(page);
+		ntfs_error(vol->sb, "Failed to allocate internal buffers.");
+		return -ENOMEM;
+	}
+
+	/*
+	 * We have already been given one page, this is the one we must do.
+	 * Once again, the alignment guarantees keep it simple.
+	 */
+	offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+	xpage = index - offset;
+	pages[xpage] = page;
+	/*
+	 * The remaining pages need to be allocated and inserted into the page
+	 * cache, alignment guarantees keep all the below much simpler. (-8
+	 */
+	read_lock_irqsave(&ni->size_lock, flags);
+	i_size = i_size_read(VFS_I(ni));
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+			offset;
+	/* Is the page fully outside i_size? (truncate in progress) */
+	if (xpage >= max_page) {
+		kfree(bhs);
+		kfree(pages);
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		ntfs_debug("Compressed read outside i_size - truncated?");
+		SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
+	if (nr_pages < max_page)
+		max_page = nr_pages;
+	for (i = 0; i < max_page; i++, offset++) {
+		if (i != xpage)
+			pages[i] = grab_cache_page_nowait(mapping, offset);
+		page = pages[i];
+		if (page) {
+			/*
+			 * We only (re)read the page if it isn't already read
+			 * in and/or dirty or we would be losing data or at
+			 * least wasting our time.
+			 */
+			if (!PageDirty(page) && (!PageUptodate(page) ||
+					PageError(page))) {
+				ClearPageError(page);
+				kmap(page);
+				continue;
+			}
+			unlock_page(page);
+			page_cache_release(page);
+			pages[i] = NULL;
+		}
+	}
+
+	/*
+	 * We have the runlist, and all the destination pages we need to fill.
+	 * Now read the first compression block.
+	 */
+	cur_page = 0;
+	cur_ofs = 0;
+	cb_clusters = ni->itype.compressed.block_clusters;
+do_next_cb:
+	nr_cbs--;
+	nr_bhs = 0;
+
+	/* Read all cb buffer heads one cluster at a time. */
+	rl = NULL;
+	for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
+			vcn++) {
+		bool is_retry = false;
+
+		if (!rl) {
+lock_retry_remap:
+			down_read(&ni->runlist.lock);
+			rl = ni->runlist.rl;
+		}
+		if (likely(rl != NULL)) {
+			/* Seek to element containing target vcn. */
+			while (rl->length && rl[1].vcn <= vcn)
+				rl++;
+			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+		} else
+			lcn = LCN_RL_NOT_MAPPED;
+		ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
+				(unsigned long long)vcn,
+				(unsigned long long)lcn);
+		if (lcn < 0) {
+			/*
+			 * When we reach the first sparse cluster we have
+			 * finished with the cb.
+			 */
+			if (lcn == LCN_HOLE)
+				break;
+			if (is_retry || lcn != LCN_RL_NOT_MAPPED)
+				goto rl_err;
+			is_retry = true;
+			/*
+			 * Attempt to map runlist, dropping lock for the
+			 * duration.
+			 */
+			up_read(&ni->runlist.lock);
+			if (!ntfs_map_runlist(ni, vcn))
+				goto lock_retry_remap;
+			goto map_rl_err;
+		}
+		block = lcn << vol->cluster_size_bits >> block_size_bits;
+		/* Read the lcn from device in chunks of block_size bytes. */
+		max_block = block + (vol->cluster_size >> block_size_bits);
+		do {
+			ntfs_debug("block = 0x%x.", block);
+			if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block))))
+				goto getblk_err;
+			nr_bhs++;
+		} while (++block < max_block);
+	}
+
+	/* Release the lock if we took it. */
+	if (rl)
+		up_read(&ni->runlist.lock);
+
+	/* Setup and initiate io on all buffer heads. */
+	for (i = 0; i < nr_bhs; i++) {
+		struct buffer_head *tbh = bhs[i];
+
+		if (!trylock_buffer(tbh))
+			continue;
+		if (unlikely(buffer_uptodate(tbh))) {
+			unlock_buffer(tbh);
+			continue;
+		}
+		get_bh(tbh);
+		tbh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, tbh);
+	}
+
+	/* Wait for io completion on all buffer heads. */
+	for (i = 0; i < nr_bhs; i++) {
+		struct buffer_head *tbh = bhs[i];
+
+		if (buffer_uptodate(tbh))
+			continue;
+		wait_on_buffer(tbh);
+		/*
+		 * We need an optimization barrier here, otherwise we start
+		 * hitting the below fixup code when accessing a loopback
+		 * mounted ntfs partition. This indicates either there is a
+		 * race condition in the loop driver or, more likely, gcc
+		 * overoptimises the code without the barrier and it doesn't
+		 * do the Right Thing(TM).
+		 */
+		barrier();
+		if (unlikely(!buffer_uptodate(tbh))) {
+			ntfs_warning(vol->sb, "Buffer is unlocked but not "
+					"uptodate! Unplugging the disk queue "
+					"and rescheduling.");
+			get_bh(tbh);
+			io_schedule();
+			put_bh(tbh);
+			if (unlikely(!buffer_uptodate(tbh)))
+				goto read_err;
+			ntfs_warning(vol->sb, "Buffer is now uptodate. Good.");
+		}
+	}
+
+	/*
+	 * Get the compression buffer. We must not sleep any more
+	 * until we are finished with it.
+	 */
+	spin_lock(&ntfs_cb_lock);
+	cb = ntfs_compression_buffer;
+
+	BUG_ON(!cb);
+
+	cb_pos = cb;
+	cb_end = cb + cb_size;
+
+	/* Copy the buffer heads into the contiguous buffer. */
+	for (i = 0; i < nr_bhs; i++) {
+		memcpy(cb_pos, bhs[i]->b_data, block_size);
+		cb_pos += block_size;
+	}
+
+	/* Just a precaution. */
+	if (cb_pos + 2 <= cb + cb_size)
+		*(u16*)cb_pos = 0;
+
+	/* Reset cb_pos back to the beginning. */
+	cb_pos = cb;
+
+	/* We now have both source (if present) and destination. */
+	ntfs_debug("Successfully read the compression block.");
+
+	/* The last page and maximum offset within it for the current cb. */
+	cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size;
+	cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK;
+	cb_max_page >>= PAGE_CACHE_SHIFT;
+
+	/* Catch end of file inside a compression block. */
+	if (cb_max_page > max_page)
+		cb_max_page = max_page;
+
+	if (vcn == start_vcn - cb_clusters) {
+		/* Sparse cb, zero out page range overlapping the cb. */
+		ntfs_debug("Found sparse compression block.");
+		/* We can sleep from now on, so we drop lock. */
+		spin_unlock(&ntfs_cb_lock);
+		if (cb_max_ofs)
+			cb_max_page--;
+		for (; cur_page < cb_max_page; cur_page++) {
+			page = pages[cur_page];
+			if (page) {
+				/*
+				 * FIXME: Using clear_page() will become wrong
+				 * when we get PAGE_CACHE_SIZE != PAGE_SIZE but
+				 * for now there is no problem.
+				 */
+				if (likely(!cur_ofs))
+					clear_page(page_address(page));
+				else
+					memset(page_address(page) + cur_ofs, 0,
+							PAGE_CACHE_SIZE -
+							cur_ofs);
+				flush_dcache_page(page);
+				kunmap(page);
+				SetPageUptodate(page);
+				unlock_page(page);
+				if (cur_page == xpage)
+					xpage_done = 1;
+				else
+					page_cache_release(page);
+				pages[cur_page] = NULL;
+			}
+			cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+			cur_ofs = 0;
+			if (cb_pos >= cb_end)
+				break;
+		}
+		/* If we have a partial final page, deal with it now. */
+		if (cb_max_ofs && cb_pos < cb_end) {
+			page = pages[cur_page];
+			if (page)
+				memset(page_address(page) + cur_ofs, 0,
+						cb_max_ofs - cur_ofs);
+			/*
+			 * No need to update cb_pos at this stage:
+			 *	cb_pos += cb_max_ofs - cur_ofs;
+			 */
+			cur_ofs = cb_max_ofs;
+		}
+	} else if (vcn == start_vcn) {
+		/* We can't sleep so we need two stages. */
+		unsigned int cur2_page = cur_page;
+		unsigned int cur_ofs2 = cur_ofs;
+		u8 *cb_pos2 = cb_pos;
+
+		ntfs_debug("Found uncompressed compression block.");
+		/* Uncompressed cb, copy it to the destination pages. */
+		/*
+		 * TODO: As a big optimization, we could detect this case
+		 * before we read all the pages and use block_read_full_page()
+		 * on all full pages instead (we still have to treat partial
+		 * pages especially but at least we are getting rid of the
+		 * synchronous io for the majority of pages.
+		 * Or if we choose not to do the read-ahead/-behind stuff, we
+		 * could just return block_read_full_page(pages[xpage]) as long
+		 * as PAGE_CACHE_SIZE <= cb_size.
+		 */
+		if (cb_max_ofs)
+			cb_max_page--;
+		/* First stage: copy data into destination pages. */
+		for (; cur_page < cb_max_page; cur_page++) {
+			page = pages[cur_page];
+			if (page)
+				memcpy(page_address(page) + cur_ofs, cb_pos,
+						PAGE_CACHE_SIZE - cur_ofs);
+			cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+			cur_ofs = 0;
+			if (cb_pos >= cb_end)
+				break;
+		}
+		/* If we have a partial final page, deal with it now. */
+		if (cb_max_ofs && cb_pos < cb_end) {
+			page = pages[cur_page];
+			if (page)
+				memcpy(page_address(page) + cur_ofs, cb_pos,
+						cb_max_ofs - cur_ofs);
+			cb_pos += cb_max_ofs - cur_ofs;
+			cur_ofs = cb_max_ofs;
+		}
+		/* We can sleep from now on, so drop lock. */
+		spin_unlock(&ntfs_cb_lock);
+		/* Second stage: finalize pages. */
+		for (; cur2_page < cb_max_page; cur2_page++) {
+			page = pages[cur2_page];
+			if (page) {
+				/*
+				 * If we are outside the initialized size, zero
+				 * the out of bounds page range.
+				 */
+				handle_bounds_compressed_page(page, i_size,
+						initialized_size);
+				flush_dcache_page(page);
+				kunmap(page);
+				SetPageUptodate(page);
+				unlock_page(page);
+				if (cur2_page == xpage)
+					xpage_done = 1;
+				else
+					page_cache_release(page);
+				pages[cur2_page] = NULL;
+			}
+			cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2;
+			cur_ofs2 = 0;
+			if (cb_pos2 >= cb_end)
+				break;
+		}
+	} else {
+		/* Compressed cb, decompress it into the destination page(s). */
+		unsigned int prev_cur_page = cur_page;
+
+		ntfs_debug("Found compressed compression block.");
+		err = ntfs_decompress(pages, &cur_page, &cur_ofs,
+				cb_max_page, cb_max_ofs, xpage, &xpage_done,
+				cb_pos,	cb_size - (cb_pos - cb), i_size,
+				initialized_size);
+		/*
+		 * We can sleep from now on, lock already dropped by
+		 * ntfs_decompress().
+		 */
+		if (err) {
+			ntfs_error(vol->sb, "ntfs_decompress() failed in inode "
+					"0x%lx with error code %i. Skipping "
+					"this compression block.",
+					ni->mft_no, -err);
+			/* Release the unfinished pages. */
+			for (; prev_cur_page < cur_page; prev_cur_page++) {
+				page = pages[prev_cur_page];
+				if (page) {
+					flush_dcache_page(page);
+					kunmap(page);
+					unlock_page(page);
+					if (prev_cur_page != xpage)
+						page_cache_release(page);
+					pages[prev_cur_page] = NULL;
+				}
+			}
+		}
+	}
+
+	/* Release the buffer heads. */
+	for (i = 0; i < nr_bhs; i++)
+		brelse(bhs[i]);
+
+	/* Do we have more work to do? */
+	if (nr_cbs)
+		goto do_next_cb;
+
+	/* We no longer need the list of buffer heads. */
+	kfree(bhs);
+
+	/* Clean up if we have any pages left. Should never happen. */
+	for (cur_page = 0; cur_page < max_page; cur_page++) {
+		page = pages[cur_page];
+		if (page) {
+			ntfs_error(vol->sb, "Still have pages left! "
+					"Terminating them with extreme "
+					"prejudice.  Inode 0x%lx, page index "
+					"0x%lx.", ni->mft_no, page->index);
+			flush_dcache_page(page);
+			kunmap(page);
+			unlock_page(page);
+			if (cur_page != xpage)
+				page_cache_release(page);
+			pages[cur_page] = NULL;
+		}
+	}
+
+	/* We no longer need the list of pages. */
+	kfree(pages);
+
+	/* If we have completed the requested page, we return success. */
+	if (likely(xpage_done))
+		return 0;
+
+	ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
+			"EOVERFLOW" : (!err ? "EIO" : "unknown error"));
+	return err < 0 ? err : -EIO;
+
+read_err:
+	ntfs_error(vol->sb, "IO error while reading compressed data.");
+	/* Release the buffer heads. */
+	for (i = 0; i < nr_bhs; i++)
+		brelse(bhs[i]);
+	goto err_out;
+
+map_rl_err:
+	ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read "
+			"compression block.");
+	goto err_out;
+
+rl_err:
+	up_read(&ni->runlist.lock);
+	ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
+			"compression block.");
+	goto err_out;
+
+getblk_err:
+	up_read(&ni->runlist.lock);
+	ntfs_error(vol->sb, "getblk() failed. Cannot read compression block.");
+
+err_out:
+	kfree(bhs);
+	for (i = cur_page; i < max_page; i++) {
+		page = pages[i];
+		if (page) {
+			flush_dcache_page(page);
+			kunmap(page);
+			unlock_page(page);
+			if (i != xpage)
+				page_cache_release(page);
+		}
+	}
+	kfree(pages);
+	return -EIO;
+}
diff --git a/kernel/fs/ntfs/debug.c b/kernel/fs/ntfs/debug.c
new file mode 100644
index 000000000..825a54e8f
--- /dev/null
+++ b/kernel/fs/ntfs/debug.c
@@ -0,0 +1,173 @@
+/*
+ * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include "debug.h"
+
+/**
+ * __ntfs_warning - output a warning to the syslog
+ * @function:	name of function outputting the warning
+ * @sb:		super block of mounted ntfs filesystem
+ * @fmt:	warning string containing format specifications
+ * @...:	a variable number of arguments specified in @fmt
+ *
+ * Outputs a warning to the syslog for the mounted ntfs filesystem described
+ * by @sb.
+ *
+ * @fmt and the corresponding @... is printf style format string containing
+ * the warning string and the corresponding format arguments, respectively.
+ *
+ * @function is the name of the function from which __ntfs_warning is being
+ * called.
+ *
+ * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
+ * as this provides the @function parameter automatically.
+ */
+void __ntfs_warning(const char *function, const struct super_block *sb,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int flen = 0;
+
+#ifndef DEBUG
+	if (!printk_ratelimit())
+		return;
+#endif
+	if (function)
+		flen = strlen(function);
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	if (sb)
+		pr_warn("(device %s): %s(): %pV\n",
+			sb->s_id, flen ? function : "", &vaf);
+	else
+		pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
+	va_end(args);
+}
+
+/**
+ * __ntfs_error - output an error to the syslog
+ * @function:	name of function outputting the error
+ * @sb:		super block of mounted ntfs filesystem
+ * @fmt:	error string containing format specifications
+ * @...:	a variable number of arguments specified in @fmt
+ *
+ * Outputs an error to the syslog for the mounted ntfs filesystem described
+ * by @sb.
+ *
+ * @fmt and the corresponding @... is printf style format string containing
+ * the error string and the corresponding format arguments, respectively.
+ *
+ * @function is the name of the function from which __ntfs_error is being
+ * called.
+ *
+ * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
+ * as this provides the @function parameter automatically.
+ */
+void __ntfs_error(const char *function, const struct super_block *sb,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int flen = 0;
+
+#ifndef DEBUG
+	if (!printk_ratelimit())
+		return;
+#endif
+	if (function)
+		flen = strlen(function);
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	if (sb)
+		pr_err("(device %s): %s(): %pV\n",
+		       sb->s_id, flen ? function : "", &vaf);
+	else
+		pr_err("%s(): %pV\n", flen ? function : "", &vaf);
+	va_end(args);
+}
+
+#ifdef DEBUG
+
+/* If 1, output debug messages, and if 0, don't. */
+int debug_msgs = 0;
+
+void __ntfs_debug(const char *file, int line, const char *function,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int flen = 0;
+
+	if (!debug_msgs)
+		return;
+	if (function)
+		flen = strlen(function);
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
+	va_end(args);
+}
+
+/* Dump a runlist. Caller has to provide synchronisation for @rl. */
+void ntfs_debug_dump_runlist(const runlist_element *rl)
+{
+	int i;
+	const char *lcn_str[5] = { "LCN_HOLE         ", "LCN_RL_NOT_MAPPED",
+				   "LCN_ENOENT       ", "LCN_unknown      " };
+
+	if (!debug_msgs)
+		return;
+	pr_debug("Dumping runlist (values in hex):\n");
+	if (!rl) {
+		pr_debug("Run list not present.\n");
+		return;
+	}
+	pr_debug("VCN              LCN               Run length\n");
+	for (i = 0; ; i++) {
+		LCN lcn = (rl + i)->lcn;
+
+		if (lcn < (LCN)0) {
+			int index = -lcn - 1;
+
+			if (index > -LCN_ENOENT - 1)
+				index = 3;
+			pr_debug("%-16Lx %s %-16Lx%s\n",
+					(long long)(rl + i)->vcn, lcn_str[index],
+					(long long)(rl + i)->length,
+					(rl + i)->length ? "" :
+						" (runlist end)");
+		} else
+			pr_debug("%-16Lx %-16Lx  %-16Lx%s\n",
+					(long long)(rl + i)->vcn,
+					(long long)(rl + i)->lcn,
+					(long long)(rl + i)->length,
+					(rl + i)->length ? "" :
+						" (runlist end)");
+		if (!(rl + i)->length)
+			break;
+	}
+}
+
+#endif
diff --git a/kernel/fs/ntfs/debug.h b/kernel/fs/ntfs/debug.h
new file mode 100644
index 000000000..61bf091e3
--- /dev/null
+++ b/kernel/fs/ntfs/debug.h
@@ -0,0 +1,71 @@
+/*
+ * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_DEBUG_H
+#define _LINUX_NTFS_DEBUG_H
+
+#include <linux/fs.h>
+
+#include "runlist.h"
+
+#ifdef DEBUG
+
+extern int debug_msgs;
+
+extern __printf(4, 5)
+void __ntfs_debug(const char *file, int line, const char *function,
+		  const char *format, ...);
+/**
+ * ntfs_debug - write a debug level message to syslog
+ * @f:		a printf format string containing the message
+ * @...:	the variables to substitute into @f
+ *
+ * ntfs_debug() writes a DEBUG level message to the syslog but only if the
+ * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
+ */
+#define ntfs_debug(f, a...)						\
+	__ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
+
+extern void ntfs_debug_dump_runlist(const runlist_element *rl);
+
+#else	/* !DEBUG */
+
+#define ntfs_debug(fmt, ...)						\
+do {									\
+	if (0)								\
+		no_printk(fmt, ##__VA_ARGS__);				\
+} while (0)
+
+#define ntfs_debug_dump_runlist(rl)	do {} while (0)
+
+#endif	/* !DEBUG */
+
+extern  __printf(3, 4)
+void __ntfs_warning(const char *function, const struct super_block *sb,
+		    const char *fmt, ...);
+#define ntfs_warning(sb, f, a...)	__ntfs_warning(__func__, sb, f, ##a)
+
+extern  __printf(3, 4)
+void __ntfs_error(const char *function, const struct super_block *sb,
+		  const char *fmt, ...);
+#define ntfs_error(sb, f, a...)		__ntfs_error(__func__, sb, f, ##a)
+
+#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/kernel/fs/ntfs/dir.c b/kernel/fs/ntfs/dir.c
new file mode 100644
index 000000000..9e38dafa3
--- /dev/null
+++ b/kernel/fs/ntfs/dir.c
@@ -0,0 +1,1553 @@
+/**
+ * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+
+#include "dir.h"
+#include "aops.h"
+#include "attrib.h"
+#include "mft.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/**
+ * The little endian Unicode string $I30 as a global constant.
+ */
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
+		cpu_to_le16('3'),	cpu_to_le16('0'), 0 };
+
+/**
+ * ntfs_lookup_inode_by_name - find an inode in a directory given its name
+ * @dir_ni:	ntfs inode of the directory in which to search for the name
+ * @uname:	Unicode name for which to search in the directory
+ * @uname_len:	length of the name @uname in Unicode characters
+ * @res:	return the found file name if necessary (see below)
+ *
+ * Look for an inode with name @uname in the directory with inode @dir_ni.
+ * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
+ * the Unicode name. If the name is found in the directory, the corresponding
+ * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
+ * is a 64-bit number containing the sequence number.
+ *
+ * On error, a negative value is returned corresponding to the error code. In
+ * particular if the inode is not found -ENOENT is returned. Note that you
+ * can't just check the return value for being negative, you have to check the
+ * inode number for being negative which you can extract using MREC(return
+ * value).
+ *
+ * Note, @uname_len does not include the (optional) terminating NULL character.
+ *
+ * Note, we look for a case sensitive match first but we also look for a case
+ * insensitive match at the same time. If we find a case insensitive match, we
+ * save that for the case that we don't find an exact match, where we return
+ * the case insensitive match and setup @res (which we allocate!) with the mft
+ * reference, the file name type, length and with a copy of the little endian
+ * Unicode file name itself. If we match a file name which is in the DOS name
+ * space, we only return the mft reference and file name type in @res.
+ * ntfs_lookup() then uses this to find the long file name in the inode itself.
+ * This is to avoid polluting the dcache with short file names. We want them to
+ * work but we don't care for how quickly one can access them. This also fixes
+ * the dcache aliasing issues.
+ *
+ * Locking:  - Caller must hold i_mutex on the directory.
+ *	     - Each page cache page in the index allocation mapping must be
+ *	       locked whilst being accessed otherwise we may find a corrupt
+ *	       page due to it being under ->writepage at the moment which
+ *	       applies the mst protection fixups before writing out and then
+ *	       removes them again after the write is complete after which it 
+ *	       unlocks the page.
+ */
+MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
+		const int uname_len, ntfs_name **res)
+{
+	ntfs_volume *vol = dir_ni->vol;
+	struct super_block *sb = vol->sb;
+	MFT_RECORD *m;
+	INDEX_ROOT *ir;
+	INDEX_ENTRY *ie;
+	INDEX_ALLOCATION *ia;
+	u8 *index_end;
+	u64 mref;
+	ntfs_attr_search_ctx *ctx;
+	int err, rc;
+	VCN vcn, old_vcn;
+	struct address_space *ia_mapping;
+	struct page *page;
+	u8 *kaddr;
+	ntfs_name *name = NULL;
+
+	BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
+	BUG_ON(NInoAttr(dir_ni));
+	/* Get hold of the mft record for the directory. */
+	m = map_mft_record(dir_ni);
+	if (IS_ERR(m)) {
+		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+				-PTR_ERR(m));
+		return ERR_MREF(PTR_ERR(m));
+	}
+	ctx = ntfs_attr_get_search_ctx(dir_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/* Find the index root attribute in the mft record. */
+	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+			0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT) {
+			ntfs_error(sb, "Index root attribute missing in "
+					"directory inode 0x%lx.",
+					dir_ni->mft_no);
+			err = -EIO;
+		}
+		goto err_out;
+	}
+	/* Get to the index root value (it's been verified in read_inode). */
+	ir = (INDEX_ROOT*)((u8*)ctx->attr +
+			le16_to_cpu(ctx->attr->data.resident.value_offset));
+	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+	/* The first index entry. */
+	ie = (INDEX_ENTRY*)((u8*)&ir->index +
+			le32_to_cpu(ir->index.entries_offset));
+	/*
+	 * Loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry.
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		/* Bounds checks. */
+		if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->key_length) >
+				index_end)
+			goto dir_err_out;
+		/*
+		 * The last entry cannot contain a name. It can however contain
+		 * a pointer to a child node in the B+tree so we just break out.
+		 */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/*
+		 * We perform a case sensitive comparison and if that matches
+		 * we are done and return the mft reference of the inode (i.e.
+		 * the inode number together with the sequence number for
+		 * consistency checking). We convert it to cpu format before
+		 * returning.
+		 */
+		if (ntfs_are_names_equal(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length,
+				CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
+found_it:
+			/*
+			 * We have a perfect match, so we don't need to care
+			 * about having matched imperfectly before, so we can
+			 * free name and set *res to NULL.
+			 * However, if the perfect match is a short file name,
+			 * we need to signal this through *res, so that
+			 * ntfs_lookup() can fix dcache aliasing issues.
+			 * As an optimization we just reuse an existing
+			 * allocation of *res.
+			 */
+			if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
+				if (!name) {
+					name = kmalloc(sizeof(ntfs_name),
+							GFP_NOFS);
+					if (!name) {
+						err = -ENOMEM;
+						goto err_out;
+					}
+				}
+				name->mref = le64_to_cpu(
+						ie->data.dir.indexed_file);
+				name->type = FILE_NAME_DOS;
+				name->len = 0;
+				*res = name;
+			} else {
+				kfree(name);
+				*res = NULL;
+			}
+			mref = le64_to_cpu(ie->data.dir.indexed_file);
+			ntfs_attr_put_search_ctx(ctx);
+			unmap_mft_record(dir_ni);
+			return mref;
+		}
+		/*
+		 * For a case insensitive mount, we also perform a case
+		 * insensitive comparison (provided the file name is not in the
+		 * POSIX namespace). If the comparison matches, and the name is
+		 * in the WIN32 namespace, we cache the filename in *res so
+		 * that the caller, ntfs_lookup(), can work on it. If the
+		 * comparison matches, and the name is in the DOS namespace, we
+		 * only cache the mft reference and the file name type (we set
+		 * the name length to zero for simplicity).
+		 */
+		if (!NVolCaseSensitive(vol) &&
+				ie->key.file_name.file_name_type &&
+				ntfs_are_names_equal(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length,
+				IGNORE_CASE, vol->upcase, vol->upcase_len)) {
+			int name_size = sizeof(ntfs_name);
+			u8 type = ie->key.file_name.file_name_type;
+			u8 len = ie->key.file_name.file_name_length;
+
+			/* Only one case insensitive matching name allowed. */
+			if (name) {
+				ntfs_error(sb, "Found already allocated name "
+						"in phase 1. Please run chkdsk "
+						"and if that doesn't find any "
+						"errors please report you saw "
+						"this message to "
+						"linux-ntfs-dev@lists."
+						"sourceforge.net.");
+				goto dir_err_out;
+			}
+
+			if (type != FILE_NAME_DOS)
+				name_size += len * sizeof(ntfschar);
+			name = kmalloc(name_size, GFP_NOFS);
+			if (!name) {
+				err = -ENOMEM;
+				goto err_out;
+			}
+			name->mref = le64_to_cpu(ie->data.dir.indexed_file);
+			name->type = type;
+			if (type != FILE_NAME_DOS) {
+				name->len = len;
+				memcpy(name->name, ie->key.file_name.file_name,
+						len * sizeof(ntfschar));
+			} else
+				name->len = 0;
+			*res = name;
+		}
+		/*
+		 * Not a perfect match, need to do full blown collation so we
+		 * know which way in the B+tree we have to go.
+		 */
+		rc = ntfs_collate_names(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, 1,
+				IGNORE_CASE, vol->upcase, vol->upcase_len);
+		/*
+		 * If uname collates before the name of the current entry, there
+		 * is definitely no such name in this index but we might need to
+		 * descend into the B+tree so we just break out of the loop.
+		 */
+		if (rc == -1)
+			break;
+		/* The names are not equal, continue the search. */
+		if (rc)
+			continue;
+		/*
+		 * Names match with case insensitive comparison, now try the
+		 * case sensitive comparison, which is required for proper
+		 * collation.
+		 */
+		rc = ntfs_collate_names(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, 1,
+				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+		if (rc == -1)
+			break;
+		if (rc)
+			continue;
+		/*
+		 * Perfect match, this will never happen as the
+		 * ntfs_are_names_equal() call will have gotten a match but we
+		 * still treat it correctly.
+		 */
+		goto found_it;
+	}
+	/*
+	 * We have finished with this index without success. Check for the
+	 * presence of a child node and if not present return -ENOENT, unless
+	 * we have got a matching name cached in name in which case return the
+	 * mft reference associated with it.
+	 */
+	if (!(ie->flags & INDEX_ENTRY_NODE)) {
+		if (name) {
+			ntfs_attr_put_search_ctx(ctx);
+			unmap_mft_record(dir_ni);
+			return name->mref;
+		}
+		ntfs_debug("Entry not found.");
+		err = -ENOENT;
+		goto err_out;
+	} /* Child node present, descend into it. */
+	/* Consistency check: Verify that an index allocation exists. */
+	if (!NInoIndexAllocPresent(dir_ni)) {
+		ntfs_error(sb, "No index allocation attribute but index entry "
+				"requires one. Directory inode 0x%lx is "
+				"corrupt or driver bug.", dir_ni->mft_no);
+		goto err_out;
+	}
+	/* Get the starting vcn of the index_block holding the child node. */
+	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+	ia_mapping = VFS_I(dir_ni)->i_mapping;
+	/*
+	 * We are done with the index root and the mft record. Release them,
+	 * otherwise we deadlock with ntfs_map_page().
+	 */
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(dir_ni);
+	m = NULL;
+	ctx = NULL;
+descend_into_child_node:
+	/*
+	 * Convert vcn to index into the index allocation attribute in units
+	 * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+	 * disk if necessary.
+	 */
+	page = ntfs_map_page(ia_mapping, vcn <<
+			dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+	if (IS_ERR(page)) {
+		ntfs_error(sb, "Failed to map directory index page, error %ld.",
+				-PTR_ERR(page));
+		err = PTR_ERR(page);
+		goto err_out;
+	}
+	lock_page(page);
+	kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+	/* Get to the index allocation block. */
+	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+			dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+	/* Bounds checks. */
+	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+				"inode 0x%lx or driver bug.", dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* Catch multi sector transfer fixup errors. */
+	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+				(unsigned long long)vcn, dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+				"different from expected VCN (0x%llx). "
+				"Directory inode 0x%lx is corrupt or driver "
+				"bug.", (unsigned long long)
+				sle64_to_cpu(ia->index_block_vcn),
+				(unsigned long long)vcn, dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+			dir_ni->itype.index.block_size) {
+		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+				"0x%lx has a size (%u) differing from the "
+				"directory specified size (%u). Directory "
+				"inode is corrupt or driver bug.",
+				(unsigned long long)vcn, dir_ni->mft_no,
+				le32_to_cpu(ia->index.allocated_size) + 0x18,
+				dir_ni->itype.index.block_size);
+		goto unm_err_out;
+	}
+	index_end = (u8*)ia + dir_ni->itype.index.block_size;
+	if (index_end > kaddr + PAGE_CACHE_SIZE) {
+		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+				"0x%lx crosses page boundary. Impossible! "
+				"Cannot access! This is probably a bug in the "
+				"driver.", (unsigned long long)vcn,
+				dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+	if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
+		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+				"inode 0x%lx exceeds maximum size.",
+				(unsigned long long)vcn, dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* The first index entry. */
+	ie = (INDEX_ENTRY*)((u8*)&ia->index +
+			le32_to_cpu(ia->index.entries_offset));
+	/*
+	 * Iterate similar to above big loop but applied to index buffer, thus
+	 * loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry.
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		/* Bounds check. */
+		if ((u8*)ie < (u8*)ia || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->key_length) >
+				index_end) {
+			ntfs_error(sb, "Index entry out of bounds in "
+					"directory inode 0x%lx.",
+					dir_ni->mft_no);
+			goto unm_err_out;
+		}
+		/*
+		 * The last entry cannot contain a name. It can however contain
+		 * a pointer to a child node in the B+tree so we just break out.
+		 */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/*
+		 * We perform a case sensitive comparison and if that matches
+		 * we are done and return the mft reference of the inode (i.e.
+		 * the inode number together with the sequence number for
+		 * consistency checking). We convert it to cpu format before
+		 * returning.
+		 */
+		if (ntfs_are_names_equal(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length,
+				CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
+found_it2:
+			/*
+			 * We have a perfect match, so we don't need to care
+			 * about having matched imperfectly before, so we can
+			 * free name and set *res to NULL.
+			 * However, if the perfect match is a short file name,
+			 * we need to signal this through *res, so that
+			 * ntfs_lookup() can fix dcache aliasing issues.
+			 * As an optimization we just reuse an existing
+			 * allocation of *res.
+			 */
+			if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
+				if (!name) {
+					name = kmalloc(sizeof(ntfs_name),
+							GFP_NOFS);
+					if (!name) {
+						err = -ENOMEM;
+						goto unm_err_out;
+					}
+				}
+				name->mref = le64_to_cpu(
+						ie->data.dir.indexed_file);
+				name->type = FILE_NAME_DOS;
+				name->len = 0;
+				*res = name;
+			} else {
+				kfree(name);
+				*res = NULL;
+			}
+			mref = le64_to_cpu(ie->data.dir.indexed_file);
+			unlock_page(page);
+			ntfs_unmap_page(page);
+			return mref;
+		}
+		/*
+		 * For a case insensitive mount, we also perform a case
+		 * insensitive comparison (provided the file name is not in the
+		 * POSIX namespace). If the comparison matches, and the name is
+		 * in the WIN32 namespace, we cache the filename in *res so
+		 * that the caller, ntfs_lookup(), can work on it. If the
+		 * comparison matches, and the name is in the DOS namespace, we
+		 * only cache the mft reference and the file name type (we set
+		 * the name length to zero for simplicity).
+		 */
+		if (!NVolCaseSensitive(vol) &&
+				ie->key.file_name.file_name_type &&
+				ntfs_are_names_equal(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length,
+				IGNORE_CASE, vol->upcase, vol->upcase_len)) {
+			int name_size = sizeof(ntfs_name);
+			u8 type = ie->key.file_name.file_name_type;
+			u8 len = ie->key.file_name.file_name_length;
+
+			/* Only one case insensitive matching name allowed. */
+			if (name) {
+				ntfs_error(sb, "Found already allocated name "
+						"in phase 2. Please run chkdsk "
+						"and if that doesn't find any "
+						"errors please report you saw "
+						"this message to "
+						"linux-ntfs-dev@lists."
+						"sourceforge.net.");
+				unlock_page(page);
+				ntfs_unmap_page(page);
+				goto dir_err_out;
+			}
+
+			if (type != FILE_NAME_DOS)
+				name_size += len * sizeof(ntfschar);
+			name = kmalloc(name_size, GFP_NOFS);
+			if (!name) {
+				err = -ENOMEM;
+				goto unm_err_out;
+			}
+			name->mref = le64_to_cpu(ie->data.dir.indexed_file);
+			name->type = type;
+			if (type != FILE_NAME_DOS) {
+				name->len = len;
+				memcpy(name->name, ie->key.file_name.file_name,
+						len * sizeof(ntfschar));
+			} else
+				name->len = 0;
+			*res = name;
+		}
+		/*
+		 * Not a perfect match, need to do full blown collation so we
+		 * know which way in the B+tree we have to go.
+		 */
+		rc = ntfs_collate_names(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, 1,
+				IGNORE_CASE, vol->upcase, vol->upcase_len);
+		/*
+		 * If uname collates before the name of the current entry, there
+		 * is definitely no such name in this index but we might need to
+		 * descend into the B+tree so we just break out of the loop.
+		 */
+		if (rc == -1)
+			break;
+		/* The names are not equal, continue the search. */
+		if (rc)
+			continue;
+		/*
+		 * Names match with case insensitive comparison, now try the
+		 * case sensitive comparison, which is required for proper
+		 * collation.
+		 */
+		rc = ntfs_collate_names(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, 1,
+				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+		if (rc == -1)
+			break;
+		if (rc)
+			continue;
+		/*
+		 * Perfect match, this will never happen as the
+		 * ntfs_are_names_equal() call will have gotten a match but we
+		 * still treat it correctly.
+		 */
+		goto found_it2;
+	}
+	/*
+	 * We have finished with this index buffer without success. Check for
+	 * the presence of a child node.
+	 */
+	if (ie->flags & INDEX_ENTRY_NODE) {
+		if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+			ntfs_error(sb, "Index entry with child node found in "
+					"a leaf node in directory inode 0x%lx.",
+					dir_ni->mft_no);
+			goto unm_err_out;
+		}
+		/* Child node present, descend into it. */
+		old_vcn = vcn;
+		vcn = sle64_to_cpup((sle64*)((u8*)ie +
+				le16_to_cpu(ie->length) - 8));
+		if (vcn >= 0) {
+			/* If vcn is in the same page cache page as old_vcn we
+			 * recycle the mapped page. */
+			if (old_vcn << vol->cluster_size_bits >>
+					PAGE_CACHE_SHIFT == vcn <<
+					vol->cluster_size_bits >>
+					PAGE_CACHE_SHIFT)
+				goto fast_descend_into_child_node;
+			unlock_page(page);
+			ntfs_unmap_page(page);
+			goto descend_into_child_node;
+		}
+		ntfs_error(sb, "Negative child node vcn in directory inode "
+				"0x%lx.", dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	/*
+	 * No child node present, return -ENOENT, unless we have got a matching
+	 * name cached in name in which case return the mft reference
+	 * associated with it.
+	 */
+	if (name) {
+		unlock_page(page);
+		ntfs_unmap_page(page);
+		return name->mref;
+	}
+	ntfs_debug("Entry not found.");
+	err = -ENOENT;
+unm_err_out:
+	unlock_page(page);
+	ntfs_unmap_page(page);
+err_out:
+	if (!err)
+		err = -EIO;
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(dir_ni);
+	if (name) {
+		kfree(name);
+		*res = NULL;
+	}
+	return ERR_MREF(err);
+dir_err_out:
+	ntfs_error(sb, "Corrupt directory.  Aborting lookup.");
+	goto err_out;
+}
+
+#if 0
+
+// TODO: (AIA)
+// The algorithm embedded in this code will be required for the time when we
+// want to support adding of entries to directories, where we require correct
+// collation of file names in order not to cause corruption of the filesystem.
+
+/**
+ * ntfs_lookup_inode_by_name - find an inode in a directory given its name
+ * @dir_ni:	ntfs inode of the directory in which to search for the name
+ * @uname:	Unicode name for which to search in the directory
+ * @uname_len:	length of the name @uname in Unicode characters
+ *
+ * Look for an inode with name @uname in the directory with inode @dir_ni.
+ * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
+ * the Unicode name. If the name is found in the directory, the corresponding
+ * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
+ * is a 64-bit number containing the sequence number.
+ *
+ * On error, a negative value is returned corresponding to the error code. In
+ * particular if the inode is not found -ENOENT is returned. Note that you
+ * can't just check the return value for being negative, you have to check the
+ * inode number for being negative which you can extract using MREC(return
+ * value).
+ *
+ * Note, @uname_len does not include the (optional) terminating NULL character.
+ */
+u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
+		const int uname_len)
+{
+	ntfs_volume *vol = dir_ni->vol;
+	struct super_block *sb = vol->sb;
+	MFT_RECORD *m;
+	INDEX_ROOT *ir;
+	INDEX_ENTRY *ie;
+	INDEX_ALLOCATION *ia;
+	u8 *index_end;
+	u64 mref;
+	ntfs_attr_search_ctx *ctx;
+	int err, rc;
+	IGNORE_CASE_BOOL ic;
+	VCN vcn, old_vcn;
+	struct address_space *ia_mapping;
+	struct page *page;
+	u8 *kaddr;
+
+	/* Get hold of the mft record for the directory. */
+	m = map_mft_record(dir_ni);
+	if (IS_ERR(m)) {
+		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+				-PTR_ERR(m));
+		return ERR_MREF(PTR_ERR(m));
+	}
+	ctx = ntfs_attr_get_search_ctx(dir_ni, m);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/* Find the index root attribute in the mft record. */
+	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+			0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT) {
+			ntfs_error(sb, "Index root attribute missing in "
+					"directory inode 0x%lx.",
+					dir_ni->mft_no);
+			err = -EIO;
+		}
+		goto err_out;
+	}
+	/* Get to the index root value (it's been verified in read_inode). */
+	ir = (INDEX_ROOT*)((u8*)ctx->attr +
+			le16_to_cpu(ctx->attr->data.resident.value_offset));
+	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+	/* The first index entry. */
+	ie = (INDEX_ENTRY*)((u8*)&ir->index +
+			le32_to_cpu(ir->index.entries_offset));
+	/*
+	 * Loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry.
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		/* Bounds checks. */
+		if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->key_length) >
+				index_end)
+			goto dir_err_out;
+		/*
+		 * The last entry cannot contain a name. It can however contain
+		 * a pointer to a child node in the B+tree so we just break out.
+		 */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/*
+		 * If the current entry has a name type of POSIX, the name is
+		 * case sensitive and not otherwise. This has the effect of us
+		 * not being able to access any POSIX file names which collate
+		 * after the non-POSIX one when they only differ in case, but
+		 * anyone doing screwy stuff like that deserves to burn in
+		 * hell... Doing that kind of stuff on NT4 actually causes
+		 * corruption on the partition even when using SP6a and Linux
+		 * is not involved at all.
+		 */
+		ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
+				CASE_SENSITIVE;
+		/*
+		 * If the names match perfectly, we are done and return the
+		 * mft reference of the inode (i.e. the inode number together
+		 * with the sequence number for consistency checking. We
+		 * convert it to cpu format before returning.
+		 */
+		if (ntfs_are_names_equal(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, ic,
+				vol->upcase, vol->upcase_len)) {
+found_it:
+			mref = le64_to_cpu(ie->data.dir.indexed_file);
+			ntfs_attr_put_search_ctx(ctx);
+			unmap_mft_record(dir_ni);
+			return mref;
+		}
+		/*
+		 * Not a perfect match, need to do full blown collation so we
+		 * know which way in the B+tree we have to go.
+		 */
+		rc = ntfs_collate_names(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, 1,
+				IGNORE_CASE, vol->upcase, vol->upcase_len);
+		/*
+		 * If uname collates before the name of the current entry, there
+		 * is definitely no such name in this index but we might need to
+		 * descend into the B+tree so we just break out of the loop.
+		 */
+		if (rc == -1)
+			break;
+		/* The names are not equal, continue the search. */
+		if (rc)
+			continue;
+		/*
+		 * Names match with case insensitive comparison, now try the
+		 * case sensitive comparison, which is required for proper
+		 * collation.
+		 */
+		rc = ntfs_collate_names(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, 1,
+				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+		if (rc == -1)
+			break;
+		if (rc)
+			continue;
+		/*
+		 * Perfect match, this will never happen as the
+		 * ntfs_are_names_equal() call will have gotten a match but we
+		 * still treat it correctly.
+		 */
+		goto found_it;
+	}
+	/*
+	 * We have finished with this index without success. Check for the
+	 * presence of a child node.
+	 */
+	if (!(ie->flags & INDEX_ENTRY_NODE)) {
+		/* No child node, return -ENOENT. */
+		err = -ENOENT;
+		goto err_out;
+	} /* Child node present, descend into it. */
+	/* Consistency check: Verify that an index allocation exists. */
+	if (!NInoIndexAllocPresent(dir_ni)) {
+		ntfs_error(sb, "No index allocation attribute but index entry "
+				"requires one. Directory inode 0x%lx is "
+				"corrupt or driver bug.", dir_ni->mft_no);
+		goto err_out;
+	}
+	/* Get the starting vcn of the index_block holding the child node. */
+	vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
+	ia_mapping = VFS_I(dir_ni)->i_mapping;
+	/*
+	 * We are done with the index root and the mft record. Release them,
+	 * otherwise we deadlock with ntfs_map_page().
+	 */
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(dir_ni);
+	m = NULL;
+	ctx = NULL;
+descend_into_child_node:
+	/*
+	 * Convert vcn to index into the index allocation attribute in units
+	 * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+	 * disk if necessary.
+	 */
+	page = ntfs_map_page(ia_mapping, vcn <<
+			dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+	if (IS_ERR(page)) {
+		ntfs_error(sb, "Failed to map directory index page, error %ld.",
+				-PTR_ERR(page));
+		err = PTR_ERR(page);
+		goto err_out;
+	}
+	lock_page(page);
+	kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+	/* Get to the index allocation block. */
+	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+			dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+	/* Bounds checks. */
+	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+				"inode 0x%lx or driver bug.", dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* Catch multi sector transfer fixup errors. */
+	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+				(unsigned long long)vcn, dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+				"different from expected VCN (0x%llx). "
+				"Directory inode 0x%lx is corrupt or driver "
+				"bug.", (unsigned long long)
+				sle64_to_cpu(ia->index_block_vcn),
+				(unsigned long long)vcn, dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+			dir_ni->itype.index.block_size) {
+		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+				"0x%lx has a size (%u) differing from the "
+				"directory specified size (%u). Directory "
+				"inode is corrupt or driver bug.",
+				(unsigned long long)vcn, dir_ni->mft_no,
+				le32_to_cpu(ia->index.allocated_size) + 0x18,
+				dir_ni->itype.index.block_size);
+		goto unm_err_out;
+	}
+	index_end = (u8*)ia + dir_ni->itype.index.block_size;
+	if (index_end > kaddr + PAGE_CACHE_SIZE) {
+		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+				"0x%lx crosses page boundary. Impossible! "
+				"Cannot access! This is probably a bug in the "
+				"driver.", (unsigned long long)vcn,
+				dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+	if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
+		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+				"inode 0x%lx exceeds maximum size.",
+				(unsigned long long)vcn, dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* The first index entry. */
+	ie = (INDEX_ENTRY*)((u8*)&ia->index +
+			le32_to_cpu(ia->index.entries_offset));
+	/*
+	 * Iterate similar to above big loop but applied to index buffer, thus
+	 * loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry.
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		/* Bounds check. */
+		if ((u8*)ie < (u8*)ia || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->key_length) >
+				index_end) {
+			ntfs_error(sb, "Index entry out of bounds in "
+					"directory inode 0x%lx.",
+					dir_ni->mft_no);
+			goto unm_err_out;
+		}
+		/*
+		 * The last entry cannot contain a name. It can however contain
+		 * a pointer to a child node in the B+tree so we just break out.
+		 */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/*
+		 * If the current entry has a name type of POSIX, the name is
+		 * case sensitive and not otherwise. This has the effect of us
+		 * not being able to access any POSIX file names which collate
+		 * after the non-POSIX one when they only differ in case, but
+		 * anyone doing screwy stuff like that deserves to burn in
+		 * hell... Doing that kind of stuff on NT4 actually causes
+		 * corruption on the partition even when using SP6a and Linux
+		 * is not involved at all.
+		 */
+		ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
+				CASE_SENSITIVE;
+		/*
+		 * If the names match perfectly, we are done and return the
+		 * mft reference of the inode (i.e. the inode number together
+		 * with the sequence number for consistency checking. We
+		 * convert it to cpu format before returning.
+		 */
+		if (ntfs_are_names_equal(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, ic,
+				vol->upcase, vol->upcase_len)) {
+found_it2:
+			mref = le64_to_cpu(ie->data.dir.indexed_file);
+			unlock_page(page);
+			ntfs_unmap_page(page);
+			return mref;
+		}
+		/*
+		 * Not a perfect match, need to do full blown collation so we
+		 * know which way in the B+tree we have to go.
+		 */
+		rc = ntfs_collate_names(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, 1,
+				IGNORE_CASE, vol->upcase, vol->upcase_len);
+		/*
+		 * If uname collates before the name of the current entry, there
+		 * is definitely no such name in this index but we might need to
+		 * descend into the B+tree so we just break out of the loop.
+		 */
+		if (rc == -1)
+			break;
+		/* The names are not equal, continue the search. */
+		if (rc)
+			continue;
+		/*
+		 * Names match with case insensitive comparison, now try the
+		 * case sensitive comparison, which is required for proper
+		 * collation.
+		 */
+		rc = ntfs_collate_names(uname, uname_len,
+				(ntfschar*)&ie->key.file_name.file_name,
+				ie->key.file_name.file_name_length, 1,
+				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+		if (rc == -1)
+			break;
+		if (rc)
+			continue;
+		/*
+		 * Perfect match, this will never happen as the
+		 * ntfs_are_names_equal() call will have gotten a match but we
+		 * still treat it correctly.
+		 */
+		goto found_it2;
+	}
+	/*
+	 * We have finished with this index buffer without success. Check for
+	 * the presence of a child node.
+	 */
+	if (ie->flags & INDEX_ENTRY_NODE) {
+		if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+			ntfs_error(sb, "Index entry with child node found in "
+					"a leaf node in directory inode 0x%lx.",
+					dir_ni->mft_no);
+			goto unm_err_out;
+		}
+		/* Child node present, descend into it. */
+		old_vcn = vcn;
+		vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
+		if (vcn >= 0) {
+			/* If vcn is in the same page cache page as old_vcn we
+			 * recycle the mapped page. */
+			if (old_vcn << vol->cluster_size_bits >>
+					PAGE_CACHE_SHIFT == vcn <<
+					vol->cluster_size_bits >>
+					PAGE_CACHE_SHIFT)
+				goto fast_descend_into_child_node;
+			unlock_page(page);
+			ntfs_unmap_page(page);
+			goto descend_into_child_node;
+		}
+		ntfs_error(sb, "Negative child node vcn in directory inode "
+				"0x%lx.", dir_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* No child node, return -ENOENT. */
+	ntfs_debug("Entry not found.");
+	err = -ENOENT;
+unm_err_out:
+	unlock_page(page);
+	ntfs_unmap_page(page);
+err_out:
+	if (!err)
+		err = -EIO;
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(dir_ni);
+	return ERR_MREF(err);
+dir_err_out:
+	ntfs_error(sb, "Corrupt directory. Aborting lookup.");
+	goto err_out;
+}
+
+#endif
+
+/**
+ * ntfs_filldir - ntfs specific filldir method
+ * @vol:	current ntfs volume
+ * @ndir:	ntfs inode of current directory
+ * @ia_page:	page in which the index allocation buffer @ie is in resides
+ * @ie:		current index entry
+ * @name:	buffer to use for the converted name
+ * @actor:	what to feed the entries to
+ *
+ * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
+ * callback.
+ *
+ * If @ia_page is not NULL it is the locked page containing the index
+ * allocation block containing the index entry @ie.
+ *
+ * Note, we drop (and then reacquire) the page lock on @ia_page across the
+ * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
+ * since ntfs_lookup() will lock the same page.  As an optimization, we do not
+ * retake the lock if we are returning a non-zero value as ntfs_readdir()
+ * would need to drop the lock immediately anyway.
+ */
+static inline int ntfs_filldir(ntfs_volume *vol,
+		ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
+		u8 *name, struct dir_context *actor)
+{
+	unsigned long mref;
+	int name_len;
+	unsigned dt_type;
+	FILE_NAME_TYPE_FLAGS name_type;
+
+	name_type = ie->key.file_name.file_name_type;
+	if (name_type == FILE_NAME_DOS) {
+		ntfs_debug("Skipping DOS name space entry.");
+		return 0;
+	}
+	if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
+		ntfs_debug("Skipping root directory self reference entry.");
+		return 0;
+	}
+	if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user &&
+			!NVolShowSystemFiles(vol)) {
+		ntfs_debug("Skipping system file.");
+		return 0;
+	}
+	name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
+			ie->key.file_name.file_name_length, &name,
+			NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
+	if (name_len <= 0) {
+		ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
+				(long long)MREF_LE(ie->data.dir.indexed_file));
+		return 0;
+	}
+	if (ie->key.file_name.file_attributes &
+			FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
+		dt_type = DT_DIR;
+	else
+		dt_type = DT_REG;
+	mref = MREF_LE(ie->data.dir.indexed_file);
+	/*
+	 * Drop the page lock otherwise we deadlock with NFS when it calls
+	 * ->lookup since ntfs_lookup() will lock the same page.
+	 */
+	if (ia_page)
+		unlock_page(ia_page);
+	ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
+			"0x%lx, DT_%s.", name, name_len, actor->pos, mref,
+			dt_type == DT_DIR ? "DIR" : "REG");
+	if (!dir_emit(actor, name, name_len, mref, dt_type))
+		return 1;
+	/* Relock the page but not if we are aborting ->readdir. */
+	if (ia_page)
+		lock_page(ia_page);
+	return 0;
+}
+
+/*
+ * We use the same basic approach as the old NTFS driver, i.e. we parse the
+ * index root entries and then the index allocation entries that are marked
+ * as in use in the index bitmap.
+ *
+ * While this will return the names in random order this doesn't matter for
+ * ->readdir but OTOH results in a faster ->readdir.
+ *
+ * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
+ * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
+ * modifications).
+ *
+ * Locking:  - Caller must hold i_mutex on the directory.
+ *	     - Each page cache page in the index allocation mapping must be
+ *	       locked whilst being accessed otherwise we may find a corrupt
+ *	       page due to it being under ->writepage at the moment which
+ *	       applies the mst protection fixups before writing out and then
+ *	       removes them again after the write is complete after which it 
+ *	       unlocks the page.
+ */
+static int ntfs_readdir(struct file *file, struct dir_context *actor)
+{
+	s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
+	loff_t i_size;
+	struct inode *bmp_vi, *vdir = file_inode(file);
+	struct super_block *sb = vdir->i_sb;
+	ntfs_inode *ndir = NTFS_I(vdir);
+	ntfs_volume *vol = NTFS_SB(sb);
+	MFT_RECORD *m;
+	INDEX_ROOT *ir = NULL;
+	INDEX_ENTRY *ie;
+	INDEX_ALLOCATION *ia;
+	u8 *name = NULL;
+	int rc, err, ir_pos, cur_bmp_pos;
+	struct address_space *ia_mapping, *bmp_mapping;
+	struct page *bmp_page = NULL, *ia_page = NULL;
+	u8 *kaddr, *bmp, *index_end;
+	ntfs_attr_search_ctx *ctx;
+
+	ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
+			vdir->i_ino, actor->pos);
+	rc = err = 0;
+	/* Are we at end of dir yet? */
+	i_size = i_size_read(vdir);
+	if (actor->pos >= i_size + vol->mft_record_size)
+		return 0;
+	/* Emulate . and .. for all directories. */
+	if (!dir_emit_dots(file, actor))
+		return 0;
+	m = NULL;
+	ctx = NULL;
+	/*
+	 * Allocate a buffer to store the current name being processed
+	 * converted to format determined by current NLS.
+	 */
+	name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
+	if (unlikely(!name)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/* Are we jumping straight into the index allocation attribute? */
+	if (actor->pos >= vol->mft_record_size)
+		goto skip_index_root;
+	/* Get hold of the mft record for the directory. */
+	m = map_mft_record(ndir);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(ndir, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/* Get the offset into the index root attribute. */
+	ir_pos = (s64)actor->pos;
+	/* Find the index root attribute in the mft record. */
+	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+			0, ctx);
+	if (unlikely(err)) {
+		ntfs_error(sb, "Index root attribute missing in directory "
+				"inode 0x%lx.", vdir->i_ino);
+		goto err_out;
+	}
+	/*
+	 * Copy the index root attribute value to a buffer so that we can put
+	 * the search context and unmap the mft record before calling the
+	 * filldir() callback.  We need to do this because of NFSd which calls
+	 * ->lookup() from its filldir callback() and this causes NTFS to
+	 * deadlock as ntfs_lookup() maps the mft record of the directory and
+	 * we have got it mapped here already.  The only solution is for us to
+	 * unmap the mft record here so that a call to ntfs_lookup() is able to
+	 * map the mft record without deadlocking.
+	 */
+	rc = le32_to_cpu(ctx->attr->data.resident.value_length);
+	ir = kmalloc(rc, GFP_NOFS);
+	if (unlikely(!ir)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/* Copy the index root value (it has been verified in read_inode). */
+	memcpy(ir, (u8*)ctx->attr +
+			le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(ndir);
+	ctx = NULL;
+	m = NULL;
+	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+	/* The first index entry. */
+	ie = (INDEX_ENTRY*)((u8*)&ir->index +
+			le32_to_cpu(ir->index.entries_offset));
+	/*
+	 * Loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry or until filldir tells us it has had enough
+	 * or signals an error (both covered by the rc test).
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
+		/* Bounds checks. */
+		if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->key_length) >
+				index_end))
+			goto err_out;
+		/* The last entry cannot contain a name. */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/* Skip index root entry if continuing previous readdir. */
+		if (ir_pos > (u8*)ie - (u8*)ir)
+			continue;
+		/* Advance the position even if going to skip the entry. */
+		actor->pos = (u8*)ie - (u8*)ir;
+		/* Submit the name to the filldir callback. */
+		rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
+		if (rc) {
+			kfree(ir);
+			goto abort;
+		}
+	}
+	/* We are done with the index root and can free the buffer. */
+	kfree(ir);
+	ir = NULL;
+	/* If there is no index allocation attribute we are finished. */
+	if (!NInoIndexAllocPresent(ndir))
+		goto EOD;
+	/* Advance fpos to the beginning of the index allocation. */
+	actor->pos = vol->mft_record_size;
+skip_index_root:
+	kaddr = NULL;
+	prev_ia_pos = -1LL;
+	/* Get the offset into the index allocation attribute. */
+	ia_pos = (s64)actor->pos - vol->mft_record_size;
+	ia_mapping = vdir->i_mapping;
+	ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
+	bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
+	if (IS_ERR(bmp_vi)) {
+		ntfs_error(sb, "Failed to get bitmap attribute.");
+		err = PTR_ERR(bmp_vi);
+		goto err_out;
+	}
+	bmp_mapping = bmp_vi->i_mapping;
+	/* Get the starting bitmap bit position and sanity check it. */
+	bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
+	if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) {
+		ntfs_error(sb, "Current index allocation position exceeds "
+				"index bitmap size.");
+		goto iput_err_out;
+	}
+	/* Get the starting bit position in the current bitmap page. */
+	cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1);
+	bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1);
+get_next_bmp_page:
+	ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
+			(unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT),
+			(unsigned long long)bmp_pos &
+			(unsigned long long)((PAGE_CACHE_SIZE * 8) - 1));
+	bmp_page = ntfs_map_page(bmp_mapping,
+			bmp_pos >> (3 + PAGE_CACHE_SHIFT));
+	if (IS_ERR(bmp_page)) {
+		ntfs_error(sb, "Reading index bitmap failed.");
+		err = PTR_ERR(bmp_page);
+		bmp_page = NULL;
+		goto iput_err_out;
+	}
+	bmp = (u8*)page_address(bmp_page);
+	/* Find next index block in use. */
+	while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) {
+find_next_index_buffer:
+		cur_bmp_pos++;
+		/*
+		 * If we have reached the end of the bitmap page, get the next
+		 * page, and put away the old one.
+		 */
+		if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) {
+			ntfs_unmap_page(bmp_page);
+			bmp_pos += PAGE_CACHE_SIZE * 8;
+			cur_bmp_pos = 0;
+			goto get_next_bmp_page;
+		}
+		/* If we have reached the end of the bitmap, we are done. */
+		if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size))
+			goto unm_EOD;
+		ia_pos = (bmp_pos + cur_bmp_pos) <<
+				ndir->itype.index.block_size_bits;
+	}
+	ntfs_debug("Handling index buffer 0x%llx.",
+			(unsigned long long)bmp_pos + cur_bmp_pos);
+	/* If the current index buffer is in the same page we reuse the page. */
+	if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) !=
+			(ia_pos & (s64)PAGE_CACHE_MASK)) {
+		prev_ia_pos = ia_pos;
+		if (likely(ia_page != NULL)) {
+			unlock_page(ia_page);
+			ntfs_unmap_page(ia_page);
+		}
+		/*
+		 * Map the page cache page containing the current ia_pos,
+		 * reading it from disk if necessary.
+		 */
+		ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT);
+		if (IS_ERR(ia_page)) {
+			ntfs_error(sb, "Reading index allocation data failed.");
+			err = PTR_ERR(ia_page);
+			ia_page = NULL;
+			goto err_out;
+		}
+		lock_page(ia_page);
+		kaddr = (u8*)page_address(ia_page);
+	}
+	/* Get the current index buffer. */
+	ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK &
+			~(s64)(ndir->itype.index.block_size - 1)));
+	/* Bounds checks. */
+	if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) {
+		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+				"inode 0x%lx or driver bug.", vdir->i_ino);
+		goto err_out;
+	}
+	/* Catch multi sector transfer fixup errors. */
+	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+				(unsigned long long)ia_pos >>
+				ndir->itype.index.vcn_size_bits, vdir->i_ino);
+		goto err_out;
+	}
+	if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
+			~(s64)(ndir->itype.index.block_size - 1)) >>
+			ndir->itype.index.vcn_size_bits)) {
+		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+				"different from expected VCN (0x%llx). "
+				"Directory inode 0x%lx is corrupt or driver "
+				"bug. ", (unsigned long long)
+				sle64_to_cpu(ia->index_block_vcn),
+				(unsigned long long)ia_pos >>
+				ndir->itype.index.vcn_size_bits, vdir->i_ino);
+		goto err_out;
+	}
+	if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+			ndir->itype.index.block_size)) {
+		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+				"0x%lx has a size (%u) differing from the "
+				"directory specified size (%u). Directory "
+				"inode is corrupt or driver bug.",
+				(unsigned long long)ia_pos >>
+				ndir->itype.index.vcn_size_bits, vdir->i_ino,
+				le32_to_cpu(ia->index.allocated_size) + 0x18,
+				ndir->itype.index.block_size);
+		goto err_out;
+	}
+	index_end = (u8*)ia + ndir->itype.index.block_size;
+	if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) {
+		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+				"0x%lx crosses page boundary. Impossible! "
+				"Cannot access! This is probably a bug in the "
+				"driver.", (unsigned long long)ia_pos >>
+				ndir->itype.index.vcn_size_bits, vdir->i_ino);
+		goto err_out;
+	}
+	ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1);
+	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+	if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) {
+		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+				"inode 0x%lx exceeds maximum size.",
+				(unsigned long long)ia_pos >>
+				ndir->itype.index.vcn_size_bits, vdir->i_ino);
+		goto err_out;
+	}
+	/* The first index entry in this index buffer. */
+	ie = (INDEX_ENTRY*)((u8*)&ia->index +
+			le32_to_cpu(ia->index.entries_offset));
+	/*
+	 * Loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry or until filldir tells us it has had enough
+	 * or signals an error (both covered by the rc test).
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		ntfs_debug("In index allocation, offset 0x%llx.",
+				(unsigned long long)ia_start +
+				(unsigned long long)((u8*)ie - (u8*)ia));
+		/* Bounds checks. */
+		if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->key_length) >
+				index_end))
+			goto err_out;
+		/* The last entry cannot contain a name. */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/* Skip index block entry if continuing previous readdir. */
+		if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
+			continue;
+		/* Advance the position even if going to skip the entry. */
+		actor->pos = (u8*)ie - (u8*)ia +
+				(sle64_to_cpu(ia->index_block_vcn) <<
+				ndir->itype.index.vcn_size_bits) +
+				vol->mft_record_size;
+		/*
+		 * Submit the name to the @filldir callback.  Note,
+		 * ntfs_filldir() drops the lock on @ia_page but it retakes it
+		 * before returning, unless a non-zero value is returned in
+		 * which case the page is left unlocked.
+		 */
+		rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
+		if (rc) {
+			/* @ia_page is already unlocked in this case. */
+			ntfs_unmap_page(ia_page);
+			ntfs_unmap_page(bmp_page);
+			iput(bmp_vi);
+			goto abort;
+		}
+	}
+	goto find_next_index_buffer;
+unm_EOD:
+	if (ia_page) {
+		unlock_page(ia_page);
+		ntfs_unmap_page(ia_page);
+	}
+	ntfs_unmap_page(bmp_page);
+	iput(bmp_vi);
+EOD:
+	/* We are finished, set fpos to EOD. */
+	actor->pos = i_size + vol->mft_record_size;
+abort:
+	kfree(name);
+	return 0;
+err_out:
+	if (bmp_page) {
+		ntfs_unmap_page(bmp_page);
+iput_err_out:
+		iput(bmp_vi);
+	}
+	if (ia_page) {
+		unlock_page(ia_page);
+		ntfs_unmap_page(ia_page);
+	}
+	kfree(ir);
+	kfree(name);
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(ndir);
+	if (!err)
+		err = -EIO;
+	ntfs_debug("Failed. Returning error code %i.", -err);
+	return err;
+}
+
+/**
+ * ntfs_dir_open - called when an inode is about to be opened
+ * @vi:		inode to be opened
+ * @filp:	file structure describing the inode
+ *
+ * Limit directory size to the page cache limit on architectures where unsigned
+ * long is 32-bits. This is the most we can do for now without overflowing the
+ * page cache page index. Doing it this way means we don't run into problems
+ * because of existing too large directories. It would be better to allow the
+ * user to read the accessible part of the directory but I doubt very much
+ * anyone is going to hit this check on a 32-bit architecture, so there is no
+ * point in adding the extra complexity required to support this.
+ *
+ * On 64-bit architectures, the check is hopefully optimized away by the
+ * compiler.
+ */
+static int ntfs_dir_open(struct inode *vi, struct file *filp)
+{
+	if (sizeof(unsigned long) < 8) {
+		if (i_size_read(vi) > MAX_LFS_FILESIZE)
+			return -EFBIG;
+	}
+	return 0;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_dir_fsync - sync a directory to disk
+ * @filp:	directory to be synced
+ * @dentry:	dentry describing the directory to sync
+ * @datasync:	if non-zero only flush user data and not metadata
+ *
+ * Data integrity sync of a directory to disk.  Used for fsync, fdatasync, and
+ * msync system calls.  This function is based on file.c::ntfs_file_fsync().
+ *
+ * Write the mft record and all associated extent mft records as well as the
+ * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
+ *
+ * If @datasync is true, we do not wait on the inode(s) to be written out
+ * but we always wait on the page cache pages to be written out.
+ *
+ * Note: In the past @filp could be NULL so we ignore it as we don't need it
+ * anyway.
+ *
+ * Locking: Caller must hold i_mutex on the inode.
+ *
+ * TODO: We should probably also write all attribute/index inodes associated
+ * with this inode but since we have no simple way of getting to them we ignore
+ * this problem for now.  We do write the $BITMAP attribute if it is present
+ * which is the important one for a directory so things are not too bad.
+ */
+static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+			  int datasync)
+{
+	struct inode *bmp_vi, *vi = filp->f_mapping->host;
+	int err, ret;
+	ntfs_attr na;
+
+	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+
+	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&vi->i_mutex);
+
+	BUG_ON(!S_ISDIR(vi->i_mode));
+	/* If the bitmap attribute inode is in memory sync it, too. */
+	na.mft_no = vi->i_ino;
+	na.type = AT_BITMAP;
+	na.name = I30;
+	na.name_len = 4;
+	bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na);
+	if (bmp_vi) {
+ 		write_inode_now(bmp_vi, !datasync);
+		iput(bmp_vi);
+	}
+	ret = __ntfs_write_inode(vi, 1);
+	write_inode_now(vi, !datasync);
+	err = sync_blockdev(vi->i_sb->s_bdev);
+	if (unlikely(err && !ret))
+		ret = err;
+	if (likely(!ret))
+		ntfs_debug("Done.");
+	else
+		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
+				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
+	mutex_unlock(&vi->i_mutex);
+	return ret;
+}
+
+#endif /* NTFS_RW */
+
+const struct file_operations ntfs_dir_ops = {
+	.llseek		= generic_file_llseek,	/* Seek inside directory. */
+	.read		= generic_read_dir,	/* Return -EISDIR. */
+	.iterate	= ntfs_readdir,		/* Read directory contents. */
+#ifdef NTFS_RW
+	.fsync		= ntfs_dir_fsync,	/* Sync a directory to disk. */
+	/*.aio_fsync	= ,*/			/* Sync all outstanding async
+						   i/o operations on a kiocb. */
+#endif /* NTFS_RW */
+	/*.ioctl	= ,*/			/* Perform function on the
+						   mounted filesystem. */
+	.open		= ntfs_dir_open,	/* Open directory. */
+};
diff --git a/kernel/fs/ntfs/dir.h b/kernel/fs/ntfs/dir.h
new file mode 100644
index 000000000..aea7582d5
--- /dev/null
+++ b/kernel/fs/ntfs/dir.h
@@ -0,0 +1,48 @@
+/*
+ * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
+ *	   the Linux-NTFS project.
+ *
+ * Copyright (c) 2002-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_DIR_H
+#define _LINUX_NTFS_DIR_H
+
+#include "layout.h"
+#include "inode.h"
+#include "types.h"
+
+/*
+ * ntfs_name is used to return the file name to the caller of
+ * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup())
+ * to be able to deal with dcache aliasing issues.
+ */
+typedef struct {
+	MFT_REF mref;
+	FILE_NAME_TYPE_FLAGS type;
+	u8 len;
+	ntfschar name[0];
+} __attribute__ ((__packed__)) ntfs_name;
+
+/* The little endian Unicode string $I30 as a global constant. */
+extern ntfschar I30[5];
+
+extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni,
+		const ntfschar *uname, const int uname_len, ntfs_name **res);
+
+#endif /* _LINUX_NTFS_FS_DIR_H */
diff --git a/kernel/fs/ntfs/endian.h b/kernel/fs/ntfs/endian.h
new file mode 100644
index 000000000..927b5bf04
--- /dev/null
+++ b/kernel/fs/ntfs/endian.h
@@ -0,0 +1,93 @@
+/*
+ * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
+ *	      Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_ENDIAN_H
+#define _LINUX_NTFS_ENDIAN_H
+
+#include <asm/byteorder.h>
+#include "types.h"
+
+/*
+ * Signed endianness conversion functions.
+ */
+
+static inline s16 sle16_to_cpu(sle16 x)
+{
+	return le16_to_cpu((__force le16)x);
+}
+
+static inline s32 sle32_to_cpu(sle32 x)
+{
+	return le32_to_cpu((__force le32)x);
+}
+
+static inline s64 sle64_to_cpu(sle64 x)
+{
+	return le64_to_cpu((__force le64)x);
+}
+
+static inline s16 sle16_to_cpup(sle16 *x)
+{
+	return le16_to_cpu(*(__force le16*)x);
+}
+
+static inline s32 sle32_to_cpup(sle32 *x)
+{
+	return le32_to_cpu(*(__force le32*)x);
+}
+
+static inline s64 sle64_to_cpup(sle64 *x)
+{
+	return le64_to_cpu(*(__force le64*)x);
+}
+
+static inline sle16 cpu_to_sle16(s16 x)
+{
+	return (__force sle16)cpu_to_le16(x);
+}
+
+static inline sle32 cpu_to_sle32(s32 x)
+{
+	return (__force sle32)cpu_to_le32(x);
+}
+
+static inline sle64 cpu_to_sle64(s64 x)
+{
+	return (__force sle64)cpu_to_le64(x);
+}
+
+static inline sle16 cpu_to_sle16p(s16 *x)
+{
+	return (__force sle16)cpu_to_le16(*x);
+}
+
+static inline sle32 cpu_to_sle32p(s32 *x)
+{
+	return (__force sle32)cpu_to_le32(*x);
+}
+
+static inline sle64 cpu_to_sle64p(s64 *x)
+{
+	return (__force sle64)cpu_to_le64(*x);
+}
+
+#endif /* _LINUX_NTFS_ENDIAN_H */
diff --git a/kernel/fs/ntfs/file.c b/kernel/fs/ntfs/file.c
new file mode 100644
index 000000000..7bb487e66
--- /dev/null
+++ b/kernel/fs/ntfs/file.c
@@ -0,0 +1,2043 @@
+/*
+ * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+
+#include <asm/page.h>
+#include <asm/uaccess.h>
+
+#include "attrib.h"
+#include "bitmap.h"
+#include "inode.h"
+#include "debug.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_file_open - called when an inode is about to be opened
+ * @vi:		inode to be opened
+ * @filp:	file structure describing the inode
+ *
+ * Limit file size to the page cache limit on architectures where unsigned long
+ * is 32-bits. This is the most we can do for now without overflowing the page
+ * cache page index. Doing it this way means we don't run into problems because
+ * of existing too large files. It would be better to allow the user to read
+ * the beginning of the file but I doubt very much anyone is going to hit this
+ * check on a 32-bit architecture, so there is no point in adding the extra
+ * complexity required to support this.
+ *
+ * On 64-bit architectures, the check is hopefully optimized away by the
+ * compiler.
+ *
+ * After the check passes, just call generic_file_open() to do its work.
+ */
+static int ntfs_file_open(struct inode *vi, struct file *filp)
+{
+	if (sizeof(unsigned long) < 8) {
+		if (i_size_read(vi) > MAX_LFS_FILESIZE)
+			return -EOVERFLOW;
+	}
+	return generic_file_open(vi, filp);
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_attr_extend_initialized - extend the initialized size of an attribute
+ * @ni:			ntfs inode of the attribute to extend
+ * @new_init_size:	requested new initialized size in bytes
+ *
+ * Extend the initialized size of an attribute described by the ntfs inode @ni
+ * to @new_init_size bytes.  This involves zeroing any non-sparse space between
+ * the old initialized size and @new_init_size both in the page cache and on
+ * disk (if relevant complete pages are already uptodate in the page cache then
+ * these are simply marked dirty).
+ *
+ * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
+ * in the resident attribute case, it is tied to the initialized size and, in
+ * the non-resident attribute case, it may not fall below the initialized size.
+ *
+ * Note that if the attribute is resident, we do not need to touch the page
+ * cache at all.  This is because if the page cache page is not uptodate we
+ * bring it uptodate later, when doing the write to the mft record since we
+ * then already have the page mapped.  And if the page is uptodate, the
+ * non-initialized region will already have been zeroed when the page was
+ * brought uptodate and the region may in fact already have been overwritten
+ * with new data via mmap() based writes, so we cannot just zero it.  And since
+ * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
+ * is unspecified, we choose not to do zeroing and thus we do not need to touch
+ * the page at all.  For a more detailed explanation see ntfs_truncate() in
+ * fs/ntfs/inode.c.
+ *
+ * Return 0 on success and -errno on error.  In the case that an error is
+ * encountered it is possible that the initialized size will already have been
+ * incremented some way towards @new_init_size but it is guaranteed that if
+ * this is the case, the necessary zeroing will also have happened and that all
+ * metadata is self-consistent.
+ *
+ * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
+ *	    held by the caller.
+ */
+static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
+{
+	s64 old_init_size;
+	loff_t old_i_size;
+	pgoff_t index, end_index;
+	unsigned long flags;
+	struct inode *vi = VFS_I(ni);
+	ntfs_inode *base_ni;
+	MFT_RECORD *m = NULL;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx = NULL;
+	struct address_space *mapping;
+	struct page *page = NULL;
+	u8 *kattr;
+	int err;
+	u32 attr_len;
+
+	read_lock_irqsave(&ni->size_lock, flags);
+	old_init_size = ni->initialized_size;
+	old_i_size = i_size_read(vi);
+	BUG_ON(new_init_size > ni->allocated_size);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+			"old_initialized_size 0x%llx, "
+			"new_initialized_size 0x%llx, i_size 0x%llx.",
+			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+			(unsigned long long)old_init_size,
+			(unsigned long long)new_init_size, old_i_size);
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/* Use goto to reduce indentation and we need the label below anyway. */
+	if (NInoNonResident(ni))
+		goto do_non_resident_extend;
+	BUG_ON(old_init_size != old_i_size);
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	BUG_ON(a->non_resident);
+	/* The total length of the attribute value. */
+	attr_len = le32_to_cpu(a->data.resident.value_length);
+	BUG_ON(old_i_size != (loff_t)attr_len);
+	/*
+	 * Do the zeroing in the mft record and update the attribute size in
+	 * the mft record.
+	 */
+	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+	memset(kattr + attr_len, 0, new_init_size - attr_len);
+	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
+	/* Finally, update the sizes in the vfs and ntfs inodes. */
+	write_lock_irqsave(&ni->size_lock, flags);
+	i_size_write(vi, new_init_size);
+	ni->initialized_size = new_init_size;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	goto done;
+do_non_resident_extend:
+	/*
+	 * If the new initialized size @new_init_size exceeds the current file
+	 * size (vfs inode->i_size), we need to extend the file size to the
+	 * new initialized size.
+	 */
+	if (new_init_size > old_i_size) {
+		m = map_mft_record(base_ni);
+		if (IS_ERR(m)) {
+			err = PTR_ERR(m);
+			m = NULL;
+			goto err_out;
+		}
+		ctx = ntfs_attr_get_search_ctx(base_ni, m);
+		if (unlikely(!ctx)) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, 0, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT)
+				err = -EIO;
+			goto err_out;
+		}
+		m = ctx->mrec;
+		a = ctx->attr;
+		BUG_ON(!a->non_resident);
+		BUG_ON(old_i_size != (loff_t)
+				sle64_to_cpu(a->data.non_resident.data_size));
+		a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		/* Update the file size in the vfs inode. */
+		i_size_write(vi, new_init_size);
+		ntfs_attr_put_search_ctx(ctx);
+		ctx = NULL;
+		unmap_mft_record(base_ni);
+		m = NULL;
+	}
+	mapping = vi->i_mapping;
+	index = old_init_size >> PAGE_CACHE_SHIFT;
+	end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	do {
+		/*
+		 * Read the page.  If the page is not present, this will zero
+		 * the uninitialized regions for us.
+		 */
+		page = read_mapping_page(mapping, index, NULL);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto init_err_out;
+		}
+		if (unlikely(PageError(page))) {
+			page_cache_release(page);
+			err = -EIO;
+			goto init_err_out;
+		}
+		/*
+		 * Update the initialized size in the ntfs inode.  This is
+		 * enough to make ntfs_writepage() work.
+		 */
+		write_lock_irqsave(&ni->size_lock, flags);
+		ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
+		if (ni->initialized_size > new_init_size)
+			ni->initialized_size = new_init_size;
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		/* Set the page dirty so it gets written out. */
+		set_page_dirty(page);
+		page_cache_release(page);
+		/*
+		 * Play nice with the vm and the rest of the system.  This is
+		 * very much needed as we can potentially be modifying the
+		 * initialised size from a very small value to a really huge
+		 * value, e.g.
+		 *	f = open(somefile, O_TRUNC);
+		 *	truncate(f, 10GiB);
+		 *	seek(f, 10GiB);
+		 *	write(f, 1);
+		 * And this would mean we would be marking dirty hundreds of
+		 * thousands of pages or as in the above example more than
+		 * two and a half million pages!
+		 *
+		 * TODO: For sparse pages could optimize this workload by using
+		 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit.  This
+		 * would be set in readpage for sparse pages and here we would
+		 * not need to mark dirty any pages which have this bit set.
+		 * The only caveat is that we have to clear the bit everywhere
+		 * where we allocate any clusters that lie in the page or that
+		 * contain the page.
+		 *
+		 * TODO: An even greater optimization would be for us to only
+		 * call readpage() on pages which are not in sparse regions as
+		 * determined from the runlist.  This would greatly reduce the
+		 * number of pages we read and make dirty in the case of sparse
+		 * files.
+		 */
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	} while (++index < end_index);
+	read_lock_irqsave(&ni->size_lock, flags);
+	BUG_ON(ni->initialized_size != new_init_size);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/* Now bring in sync the initialized_size in the mft record. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		goto init_err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto init_err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto init_err_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	BUG_ON(!a->non_resident);
+	a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
+done:
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
+			(unsigned long long)new_init_size, i_size_read(vi));
+	return 0;
+init_err_out:
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->initialized_size = old_init_size;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ntfs_debug("Failed.  Returning error code %i.", err);
+	return err;
+}
+
+static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
+		struct iov_iter *from)
+{
+	loff_t pos;
+	s64 end, ll;
+	ssize_t err;
+	unsigned long flags;
+	struct file *file = iocb->ki_filp;
+	struct inode *vi = file_inode(file);
+	ntfs_inode *base_ni, *ni = NTFS_I(vi);
+	ntfs_volume *vol = ni->vol;
+
+	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+			"0x%llx, count 0x%zx.", vi->i_ino,
+			(unsigned)le32_to_cpu(ni->type),
+			(unsigned long long)iocb->ki_pos,
+			iov_iter_count(from));
+	err = generic_write_checks(iocb, from);
+	if (unlikely(err <= 0))
+		goto out;
+	/*
+	 * All checks have passed.  Before we start doing any writing we want
+	 * to abort any totally illegal writes.
+	 */
+	BUG_ON(NInoMstProtected(ni));
+	BUG_ON(ni->type != AT_DATA);
+	/* If file is encrypted, deny access, just like NT4. */
+	if (NInoEncrypted(ni)) {
+		/* Only $DATA attributes can be encrypted. */
+		/*
+		 * Reminder for later: Encrypted files are _always_
+		 * non-resident so that the content can always be encrypted.
+		 */
+		ntfs_debug("Denying write access to encrypted file.");
+		err = -EACCES;
+		goto out;
+	}
+	if (NInoCompressed(ni)) {
+		/* Only unnamed $DATA attribute can be compressed. */
+		BUG_ON(ni->name_len);
+		/*
+		 * Reminder for later: If resident, the data is not actually
+		 * compressed.  Only on the switch to non-resident does
+		 * compression kick in.  This is in contrast to encrypted files
+		 * (see above).
+		 */
+		ntfs_error(vi->i_sb, "Writing to compressed files is not "
+				"implemented yet.  Sorry.");
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+	base_ni = ni;
+	if (NInoAttr(ni))
+		base_ni = ni->ext.base_ntfs_ino;
+	err = file_remove_suid(file);
+	if (unlikely(err))
+		goto out;
+	/*
+	 * Our ->update_time method always succeeds thus file_update_time()
+	 * cannot fail either so there is no need to check the return code.
+	 */
+	file_update_time(file);
+	pos = iocb->ki_pos;
+	/* The first byte after the last cluster being written to. */
+	end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
+			~(u64)vol->cluster_size_mask;
+	/*
+	 * If the write goes beyond the allocated size, extend the allocation
+	 * to cover the whole of the write, rounded up to the nearest cluster.
+	 */
+	read_lock_irqsave(&ni->size_lock, flags);
+	ll = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (end > ll) {
+		/*
+		 * Extend the allocation without changing the data size.
+		 *
+		 * Note we ensure the allocation is big enough to at least
+		 * write some data but we do not require the allocation to be
+		 * complete, i.e. it may be partial.
+		 */
+		ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
+		if (likely(ll >= 0)) {
+			BUG_ON(pos >= ll);
+			/* If the extension was partial truncate the write. */
+			if (end > ll) {
+				ntfs_debug("Truncating write to inode 0x%lx, "
+						"attribute type 0x%x, because "
+						"the allocation was only "
+						"partially extended.",
+						vi->i_ino, (unsigned)
+						le32_to_cpu(ni->type));
+				iov_iter_truncate(from, ll - pos);
+			}
+		} else {
+			err = ll;
+			read_lock_irqsave(&ni->size_lock, flags);
+			ll = ni->allocated_size;
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			/* Perform a partial write if possible or fail. */
+			if (pos < ll) {
+				ntfs_debug("Truncating write to inode 0x%lx "
+						"attribute type 0x%x, because "
+						"extending the allocation "
+						"failed (error %d).",
+						vi->i_ino, (unsigned)
+						le32_to_cpu(ni->type),
+						(int)-err);
+				iov_iter_truncate(from, ll - pos);
+			} else {
+				if (err != -ENOSPC)
+					ntfs_error(vi->i_sb, "Cannot perform "
+							"write to inode "
+							"0x%lx, attribute "
+							"type 0x%x, because "
+							"extending the "
+							"allocation failed "
+							"(error %ld).",
+							vi->i_ino, (unsigned)
+							le32_to_cpu(ni->type),
+							(long)-err);
+				else
+					ntfs_debug("Cannot perform write to "
+							"inode 0x%lx, "
+							"attribute type 0x%x, "
+							"because there is not "
+							"space left.",
+							vi->i_ino, (unsigned)
+							le32_to_cpu(ni->type));
+				goto out;
+			}
+		}
+	}
+	/*
+	 * If the write starts beyond the initialized size, extend it up to the
+	 * beginning of the write and initialize all non-sparse space between
+	 * the old initialized size and the new one.  This automatically also
+	 * increments the vfs inode->i_size to keep it above or equal to the
+	 * initialized_size.
+	 */
+	read_lock_irqsave(&ni->size_lock, flags);
+	ll = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (pos > ll) {
+		/*
+		 * Wait for ongoing direct i/o to complete before proceeding.
+		 * New direct i/o cannot start as we hold i_mutex.
+		 */
+		inode_dio_wait(vi);
+		err = ntfs_attr_extend_initialized(ni, pos);
+		if (unlikely(err < 0))
+			ntfs_error(vi->i_sb, "Cannot perform write to inode "
+					"0x%lx, attribute type 0x%x, because "
+					"extending the initialized size "
+					"failed (error %d).", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type),
+					(int)-err);
+	}
+out:
+	return err;
+}
+
+/**
+ * __ntfs_grab_cache_pages - obtain a number of locked pages
+ * @mapping:	address space mapping from which to obtain page cache pages
+ * @index:	starting index in @mapping at which to begin obtaining pages
+ * @nr_pages:	number of page cache pages to obtain
+ * @pages:	array of pages in which to return the obtained page cache pages
+ * @cached_page: allocated but as yet unused page
+ *
+ * Obtain @nr_pages locked page cache pages from the mapping @mapping and
+ * starting at index @index.
+ *
+ * If a page is newly created, add it to lru list
+ *
+ * Note, the page locks are obtained in ascending page index order.
+ */
+static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
+		pgoff_t index, const unsigned nr_pages, struct page **pages,
+		struct page **cached_page)
+{
+	int err, nr;
+
+	BUG_ON(!nr_pages);
+	err = nr = 0;
+	do {
+		pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
+				FGP_ACCESSED);
+		if (!pages[nr]) {
+			if (!*cached_page) {
+				*cached_page = page_cache_alloc(mapping);
+				if (unlikely(!*cached_page)) {
+					err = -ENOMEM;
+					goto err_out;
+				}
+			}
+			err = add_to_page_cache_lru(*cached_page, mapping,
+					index, GFP_KERNEL);
+			if (unlikely(err)) {
+				if (err == -EEXIST)
+					continue;
+				goto err_out;
+			}
+			pages[nr] = *cached_page;
+			*cached_page = NULL;
+		}
+		index++;
+		nr++;
+	} while (nr < nr_pages);
+out:
+	return err;
+err_out:
+	while (nr > 0) {
+		unlock_page(pages[--nr]);
+		page_cache_release(pages[nr]);
+	}
+	goto out;
+}
+
+static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
+{
+	lock_buffer(bh);
+	get_bh(bh);
+	bh->b_end_io = end_buffer_read_sync;
+	return submit_bh(READ, bh);
+}
+
+/**
+ * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
+ * @pages:	array of destination pages
+ * @nr_pages:	number of pages in @pages
+ * @pos:	byte position in file at which the write begins
+ * @bytes:	number of bytes to be written
+ *
+ * This is called for non-resident attributes from ntfs_file_buffered_write()
+ * with i_mutex held on the inode (@pages[0]->mapping->host).  There are
+ * @nr_pages pages in @pages which are locked but not kmap()ped.  The source
+ * data has not yet been copied into the @pages.
+ * 
+ * Need to fill any holes with actual clusters, allocate buffers if necessary,
+ * ensure all the buffers are mapped, and bring uptodate any buffers that are
+ * only partially being written to.
+ *
+ * If @nr_pages is greater than one, we are guaranteed that the cluster size is
+ * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
+ * the same cluster and that they are the entirety of that cluster, and that
+ * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
+ *
+ * i_size is not to be modified yet.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
+		unsigned nr_pages, s64 pos, size_t bytes)
+{
+	VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
+	LCN lcn;
+	s64 bh_pos, vcn_len, end, initialized_size;
+	sector_t lcn_block;
+	struct page *page;
+	struct inode *vi;
+	ntfs_inode *ni, *base_ni = NULL;
+	ntfs_volume *vol;
+	runlist_element *rl, *rl2;
+	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
+	ntfs_attr_search_ctx *ctx = NULL;
+	MFT_RECORD *m = NULL;
+	ATTR_RECORD *a = NULL;
+	unsigned long flags;
+	u32 attr_rec_len = 0;
+	unsigned blocksize, u;
+	int err, mp_size;
+	bool rl_write_locked, was_hole, is_retry;
+	unsigned char blocksize_bits;
+	struct {
+		u8 runlist_merged:1;
+		u8 mft_attr_mapped:1;
+		u8 mp_rebuilt:1;
+		u8 attr_switched:1;
+	} status = { 0, 0, 0, 0 };
+
+	BUG_ON(!nr_pages);
+	BUG_ON(!pages);
+	BUG_ON(!*pages);
+	vi = pages[0]->mapping->host;
+	ni = NTFS_I(vi);
+	vol = ni->vol;
+	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
+			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
+			vi->i_ino, ni->type, pages[0]->index, nr_pages,
+			(long long)pos, bytes);
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
+	u = 0;
+	do {
+		page = pages[u];
+		BUG_ON(!page);
+		/*
+		 * create_empty_buffers() will create uptodate/dirty buffers if
+		 * the page is uptodate/dirty.
+		 */
+		if (!page_has_buffers(page)) {
+			create_empty_buffers(page, blocksize, 0);
+			if (unlikely(!page_has_buffers(page)))
+				return -ENOMEM;
+		}
+	} while (++u < nr_pages);
+	rl_write_locked = false;
+	rl = NULL;
+	err = 0;
+	vcn = lcn = -1;
+	vcn_len = 0;
+	lcn_block = -1;
+	was_hole = false;
+	cpos = pos >> vol->cluster_size_bits;
+	end = pos + bytes;
+	cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
+	/*
+	 * Loop over each page and for each page over each buffer.  Use goto to
+	 * reduce indentation.
+	 */
+	u = 0;
+do_next_page:
+	page = pages[u];
+	bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+	bh = head = page_buffers(page);
+	do {
+		VCN cdelta;
+		s64 bh_end;
+		unsigned bh_cofs;
+
+		/* Clear buffer_new on all buffers to reinitialise state. */
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+		bh_end = bh_pos + blocksize;
+		bh_cpos = bh_pos >> vol->cluster_size_bits;
+		bh_cofs = bh_pos & vol->cluster_size_mask;
+		if (buffer_mapped(bh)) {
+			/*
+			 * The buffer is already mapped.  If it is uptodate,
+			 * ignore it.
+			 */
+			if (buffer_uptodate(bh))
+				continue;
+			/*
+			 * The buffer is not uptodate.  If the page is uptodate
+			 * set the buffer uptodate and otherwise ignore it.
+			 */
+			if (PageUptodate(page)) {
+				set_buffer_uptodate(bh);
+				continue;
+			}
+			/*
+			 * Neither the page nor the buffer are uptodate.  If
+			 * the buffer is only partially being written to, we
+			 * need to read it in before the write, i.e. now.
+			 */
+			if ((bh_pos < pos && bh_end > pos) ||
+					(bh_pos < end && bh_end > end)) {
+				/*
+				 * If the buffer is fully or partially within
+				 * the initialized size, do an actual read.
+				 * Otherwise, simply zero the buffer.
+				 */
+				read_lock_irqsave(&ni->size_lock, flags);
+				initialized_size = ni->initialized_size;
+				read_unlock_irqrestore(&ni->size_lock, flags);
+				if (bh_pos < initialized_size) {
+					ntfs_submit_bh_for_read(bh);
+					*wait_bh++ = bh;
+				} else {
+					zero_user(page, bh_offset(bh),
+							blocksize);
+					set_buffer_uptodate(bh);
+				}
+			}
+			continue;
+		}
+		/* Unmapped buffer.  Need to map it. */
+		bh->b_bdev = vol->sb->s_bdev;
+		/*
+		 * If the current buffer is in the same clusters as the map
+		 * cache, there is no need to check the runlist again.  The
+		 * map cache is made up of @vcn, which is the first cached file
+		 * cluster, @vcn_len which is the number of cached file
+		 * clusters, @lcn is the device cluster corresponding to @vcn,
+		 * and @lcn_block is the block number corresponding to @lcn.
+		 */
+		cdelta = bh_cpos - vcn;
+		if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
+map_buffer_cached:
+			BUG_ON(lcn < 0);
+			bh->b_blocknr = lcn_block +
+					(cdelta << (vol->cluster_size_bits -
+					blocksize_bits)) +
+					(bh_cofs >> blocksize_bits);
+			set_buffer_mapped(bh);
+			/*
+			 * If the page is uptodate so is the buffer.  If the
+			 * buffer is fully outside the write, we ignore it if
+			 * it was already allocated and we mark it dirty so it
+			 * gets written out if we allocated it.  On the other
+			 * hand, if we allocated the buffer but we are not
+			 * marking it dirty we set buffer_new so we can do
+			 * error recovery.
+			 */
+			if (PageUptodate(page)) {
+				if (!buffer_uptodate(bh))
+					set_buffer_uptodate(bh);
+				if (unlikely(was_hole)) {
+					/* We allocated the buffer. */
+					unmap_underlying_metadata(bh->b_bdev,
+							bh->b_blocknr);
+					if (bh_end <= pos || bh_pos >= end)
+						mark_buffer_dirty(bh);
+					else
+						set_buffer_new(bh);
+				}
+				continue;
+			}
+			/* Page is _not_ uptodate. */
+			if (likely(!was_hole)) {
+				/*
+				 * Buffer was already allocated.  If it is not
+				 * uptodate and is only partially being written
+				 * to, we need to read it in before the write,
+				 * i.e. now.
+				 */
+				if (!buffer_uptodate(bh) && bh_pos < end &&
+						bh_end > pos &&
+						(bh_pos < pos ||
+						bh_end > end)) {
+					/*
+					 * If the buffer is fully or partially
+					 * within the initialized size, do an
+					 * actual read.  Otherwise, simply zero
+					 * the buffer.
+					 */
+					read_lock_irqsave(&ni->size_lock,
+							flags);
+					initialized_size = ni->initialized_size;
+					read_unlock_irqrestore(&ni->size_lock,
+							flags);
+					if (bh_pos < initialized_size) {
+						ntfs_submit_bh_for_read(bh);
+						*wait_bh++ = bh;
+					} else {
+						zero_user(page, bh_offset(bh),
+								blocksize);
+						set_buffer_uptodate(bh);
+					}
+				}
+				continue;
+			}
+			/* We allocated the buffer. */
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+			/*
+			 * If the buffer is fully outside the write, zero it,
+			 * set it uptodate, and mark it dirty so it gets
+			 * written out.  If it is partially being written to,
+			 * zero region surrounding the write but leave it to
+			 * commit write to do anything else.  Finally, if the
+			 * buffer is fully being overwritten, do nothing.
+			 */
+			if (bh_end <= pos || bh_pos >= end) {
+				if (!buffer_uptodate(bh)) {
+					zero_user(page, bh_offset(bh),
+							blocksize);
+					set_buffer_uptodate(bh);
+				}
+				mark_buffer_dirty(bh);
+				continue;
+			}
+			set_buffer_new(bh);
+			if (!buffer_uptodate(bh) &&
+					(bh_pos < pos || bh_end > end)) {
+				u8 *kaddr;
+				unsigned pofs;
+					
+				kaddr = kmap_atomic(page);
+				if (bh_pos < pos) {
+					pofs = bh_pos & ~PAGE_CACHE_MASK;
+					memset(kaddr + pofs, 0, pos - bh_pos);
+				}
+				if (bh_end > end) {
+					pofs = end & ~PAGE_CACHE_MASK;
+					memset(kaddr + pofs, 0, bh_end - end);
+				}
+				kunmap_atomic(kaddr);
+				flush_dcache_page(page);
+			}
+			continue;
+		}
+		/*
+		 * Slow path: this is the first buffer in the cluster.  If it
+		 * is outside allocated size and is not uptodate, zero it and
+		 * set it uptodate.
+		 */
+		read_lock_irqsave(&ni->size_lock, flags);
+		initialized_size = ni->allocated_size;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (bh_pos > initialized_size) {
+			if (PageUptodate(page)) {
+				if (!buffer_uptodate(bh))
+					set_buffer_uptodate(bh);
+			} else if (!buffer_uptodate(bh)) {
+				zero_user(page, bh_offset(bh), blocksize);
+				set_buffer_uptodate(bh);
+			}
+			continue;
+		}
+		is_retry = false;
+		if (!rl) {
+			down_read(&ni->runlist.lock);
+retry_remap:
+			rl = ni->runlist.rl;
+		}
+		if (likely(rl != NULL)) {
+			/* Seek to element containing target cluster. */
+			while (rl->length && rl[1].vcn <= bh_cpos)
+				rl++;
+			lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
+			if (likely(lcn >= 0)) {
+				/*
+				 * Successful remap, setup the map cache and
+				 * use that to deal with the buffer.
+				 */
+				was_hole = false;
+				vcn = bh_cpos;
+				vcn_len = rl[1].vcn - vcn;
+				lcn_block = lcn << (vol->cluster_size_bits -
+						blocksize_bits);
+				cdelta = 0;
+				/*
+				 * If the number of remaining clusters touched
+				 * by the write is smaller or equal to the
+				 * number of cached clusters, unlock the
+				 * runlist as the map cache will be used from
+				 * now on.
+				 */
+				if (likely(vcn + vcn_len >= cend)) {
+					if (rl_write_locked) {
+						up_write(&ni->runlist.lock);
+						rl_write_locked = false;
+					} else
+						up_read(&ni->runlist.lock);
+					rl = NULL;
+				}
+				goto map_buffer_cached;
+			}
+		} else
+			lcn = LCN_RL_NOT_MAPPED;
+		/*
+		 * If it is not a hole and not out of bounds, the runlist is
+		 * probably unmapped so try to map it now.
+		 */
+		if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
+			if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
+				/* Attempt to map runlist. */
+				if (!rl_write_locked) {
+					/*
+					 * We need the runlist locked for
+					 * writing, so if it is locked for
+					 * reading relock it now and retry in
+					 * case it changed whilst we dropped
+					 * the lock.
+					 */
+					up_read(&ni->runlist.lock);
+					down_write(&ni->runlist.lock);
+					rl_write_locked = true;
+					goto retry_remap;
+				}
+				err = ntfs_map_runlist_nolock(ni, bh_cpos,
+						NULL);
+				if (likely(!err)) {
+					is_retry = true;
+					goto retry_remap;
+				}
+				/*
+				 * If @vcn is out of bounds, pretend @lcn is
+				 * LCN_ENOENT.  As long as the buffer is out
+				 * of bounds this will work fine.
+				 */
+				if (err == -ENOENT) {
+					lcn = LCN_ENOENT;
+					err = 0;
+					goto rl_not_mapped_enoent;
+				}
+			} else
+				err = -EIO;
+			/* Failed to map the buffer, even after retrying. */
+			bh->b_blocknr = -1;
+			ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
+					"attribute type 0x%x, vcn 0x%llx, "
+					"vcn offset 0x%x, because its "
+					"location on disk could not be "
+					"determined%s (error code %i).",
+					ni->mft_no, ni->type,
+					(unsigned long long)bh_cpos,
+					(unsigned)bh_pos &
+					vol->cluster_size_mask,
+					is_retry ? " even after retrying" : "",
+					err);
+			break;
+		}
+rl_not_mapped_enoent:
+		/*
+		 * The buffer is in a hole or out of bounds.  We need to fill
+		 * the hole, unless the buffer is in a cluster which is not
+		 * touched by the write, in which case we just leave the buffer
+		 * unmapped.  This can only happen when the cluster size is
+		 * less than the page cache size.
+		 */
+		if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
+			bh_cend = (bh_end + vol->cluster_size - 1) >>
+					vol->cluster_size_bits;
+			if ((bh_cend <= cpos || bh_cpos >= cend)) {
+				bh->b_blocknr = -1;
+				/*
+				 * If the buffer is uptodate we skip it.  If it
+				 * is not but the page is uptodate, we can set
+				 * the buffer uptodate.  If the page is not
+				 * uptodate, we can clear the buffer and set it
+				 * uptodate.  Whether this is worthwhile is
+				 * debatable and this could be removed.
+				 */
+				if (PageUptodate(page)) {
+					if (!buffer_uptodate(bh))
+						set_buffer_uptodate(bh);
+				} else if (!buffer_uptodate(bh)) {
+					zero_user(page, bh_offset(bh),
+						blocksize);
+					set_buffer_uptodate(bh);
+				}
+				continue;
+			}
+		}
+		/*
+		 * Out of bounds buffer is invalid if it was not really out of
+		 * bounds.
+		 */
+		BUG_ON(lcn != LCN_HOLE);
+		/*
+		 * We need the runlist locked for writing, so if it is locked
+		 * for reading relock it now and retry in case it changed
+		 * whilst we dropped the lock.
+		 */
+		BUG_ON(!rl);
+		if (!rl_write_locked) {
+			up_read(&ni->runlist.lock);
+			down_write(&ni->runlist.lock);
+			rl_write_locked = true;
+			goto retry_remap;
+		}
+		/* Find the previous last allocated cluster. */
+		BUG_ON(rl->lcn != LCN_HOLE);
+		lcn = -1;
+		rl2 = rl;
+		while (--rl2 >= ni->runlist.rl) {
+			if (rl2->lcn >= 0) {
+				lcn = rl2->lcn + rl2->length;
+				break;
+			}
+		}
+		rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
+				false);
+		if (IS_ERR(rl2)) {
+			err = PTR_ERR(rl2);
+			ntfs_debug("Failed to allocate cluster, error code %i.",
+					err);
+			break;
+		}
+		lcn = rl2->lcn;
+		rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+		if (IS_ERR(rl)) {
+			err = PTR_ERR(rl);
+			if (err != -ENOMEM)
+				err = -EIO;
+			if (ntfs_cluster_free_from_rl(vol, rl2)) {
+				ntfs_error(vol->sb, "Failed to release "
+						"allocated cluster in error "
+						"code path.  Run chkdsk to "
+						"recover the lost cluster.");
+				NVolSetErrors(vol);
+			}
+			ntfs_free(rl2);
+			break;
+		}
+		ni->runlist.rl = rl;
+		status.runlist_merged = 1;
+		ntfs_debug("Allocated cluster, lcn 0x%llx.",
+				(unsigned long long)lcn);
+		/* Map and lock the mft record and get the attribute record. */
+		if (!NInoAttr(ni))
+			base_ni = ni;
+		else
+			base_ni = ni->ext.base_ntfs_ino;
+		m = map_mft_record(base_ni);
+		if (IS_ERR(m)) {
+			err = PTR_ERR(m);
+			break;
+		}
+		ctx = ntfs_attr_get_search_ctx(base_ni, m);
+		if (unlikely(!ctx)) {
+			err = -ENOMEM;
+			unmap_mft_record(base_ni);
+			break;
+		}
+		status.mft_attr_mapped = 1;
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT)
+				err = -EIO;
+			break;
+		}
+		m = ctx->mrec;
+		a = ctx->attr;
+		/*
+		 * Find the runlist element with which the attribute extent
+		 * starts.  Note, we cannot use the _attr_ version because we
+		 * have mapped the mft record.  That is ok because we know the
+		 * runlist fragment must be mapped already to have ever gotten
+		 * here, so we can just use the _rl_ version.
+		 */
+		vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+		rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
+		BUG_ON(!rl2);
+		BUG_ON(!rl2->length);
+		BUG_ON(rl2->lcn < LCN_HOLE);
+		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		/*
+		 * If @highest_vcn is zero, calculate the real highest_vcn
+		 * (which can really be zero).
+		 */
+		if (!highest_vcn)
+			highest_vcn = (sle64_to_cpu(
+					a->data.non_resident.allocated_size) >>
+					vol->cluster_size_bits) - 1;
+		/*
+		 * Determine the size of the mapping pairs array for the new
+		 * extent, i.e. the old extent with the hole filled.
+		 */
+		mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
+				highest_vcn);
+		if (unlikely(mp_size <= 0)) {
+			if (!(err = mp_size))
+				err = -EIO;
+			ntfs_debug("Failed to get size for mapping pairs "
+					"array, error code %i.", err);
+			break;
+		}
+		/*
+		 * Resize the attribute record to fit the new mapping pairs
+		 * array.
+		 */
+		attr_rec_len = le32_to_cpu(a->length);
+		err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset));
+		if (unlikely(err)) {
+			BUG_ON(err != -ENOSPC);
+			// TODO: Deal with this by using the current attribute
+			// and fill it with as much of the mapping pairs
+			// array as possible.  Then loop over each attribute
+			// extent rewriting the mapping pairs arrays as we go
+			// along and if when we reach the end we have not
+			// enough space, try to resize the last attribute
+			// extent and if even that fails, add a new attribute
+			// extent.
+			// We could also try to resize at each step in the hope
+			// that we will not need to rewrite every single extent.
+			// Note, we may need to decompress some extents to fill
+			// the runlist as we are walking the extents...
+			ntfs_error(vol->sb, "Not enough space in the mft "
+					"record for the extended attribute "
+					"record.  This case is not "
+					"implemented yet.");
+			err = -EOPNOTSUPP;
+			break ;
+		}
+		status.mp_rebuilt = 1;
+		/*
+		 * Generate the mapping pairs array directly into the attribute
+		 * record.
+		 */
+		err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset),
+				mp_size, rl2, vcn, highest_vcn, NULL);
+		if (unlikely(err)) {
+			ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
+					"attribute type 0x%x, because building "
+					"the mapping pairs failed with error "
+					"code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+			err = -EIO;
+			break;
+		}
+		/* Update the highest_vcn but only if it was not set. */
+		if (unlikely(!a->data.non_resident.highest_vcn))
+			a->data.non_resident.highest_vcn =
+					cpu_to_sle64(highest_vcn);
+		/*
+		 * If the attribute is sparse/compressed, update the compressed
+		 * size in the ntfs_inode structure and the attribute record.
+		 */
+		if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
+			/*
+			 * If we are not in the first attribute extent, switch
+			 * to it, but first ensure the changes will make it to
+			 * disk later.
+			 */
+			if (a->data.non_resident.lowest_vcn) {
+				flush_dcache_mft_record_page(ctx->ntfs_ino);
+				mark_mft_record_dirty(ctx->ntfs_ino);
+				ntfs_attr_reinit_search_ctx(ctx);
+				err = ntfs_attr_lookup(ni->type, ni->name,
+						ni->name_len, CASE_SENSITIVE,
+						0, NULL, 0, ctx);
+				if (unlikely(err)) {
+					status.attr_switched = 1;
+					break;
+				}
+				/* @m is not used any more so do not set it. */
+				a = ctx->attr;
+			}
+			write_lock_irqsave(&ni->size_lock, flags);
+			ni->itype.compressed.size += vol->cluster_size;
+			a->data.non_resident.compressed_size =
+					cpu_to_sle64(ni->itype.compressed.size);
+			write_unlock_irqrestore(&ni->size_lock, flags);
+		}
+		/* Ensure the changes make it to disk. */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+		/* Successfully filled the hole. */
+		status.runlist_merged = 0;
+		status.mft_attr_mapped = 0;
+		status.mp_rebuilt = 0;
+		/* Setup the map cache and use that to deal with the buffer. */
+		was_hole = true;
+		vcn = bh_cpos;
+		vcn_len = 1;
+		lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
+		cdelta = 0;
+		/*
+		 * If the number of remaining clusters in the @pages is smaller
+		 * or equal to the number of cached clusters, unlock the
+		 * runlist as the map cache will be used from now on.
+		 */
+		if (likely(vcn + vcn_len >= cend)) {
+			up_write(&ni->runlist.lock);
+			rl_write_locked = false;
+			rl = NULL;
+		}
+		goto map_buffer_cached;
+	} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+	/* If there are no errors, do the next page. */
+	if (likely(!err && ++u < nr_pages))
+		goto do_next_page;
+	/* If there are no errors, release the runlist lock if we took it. */
+	if (likely(!err)) {
+		if (unlikely(rl_write_locked)) {
+			up_write(&ni->runlist.lock);
+			rl_write_locked = false;
+		} else if (unlikely(rl))
+			up_read(&ni->runlist.lock);
+		rl = NULL;
+	}
+	/* If we issued read requests, let them complete. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	while (wait_bh > wait) {
+		bh = *--wait_bh;
+		wait_on_buffer(bh);
+		if (likely(buffer_uptodate(bh))) {
+			page = bh->b_page;
+			bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
+					bh_offset(bh);
+			/*
+			 * If the buffer overflows the initialized size, need
+			 * to zero the overflowing region.
+			 */
+			if (unlikely(bh_pos + blocksize > initialized_size)) {
+				int ofs = 0;
+
+				if (likely(bh_pos < initialized_size))
+					ofs = initialized_size - bh_pos;
+				zero_user_segment(page, bh_offset(bh) + ofs,
+						blocksize);
+			}
+		} else /* if (unlikely(!buffer_uptodate(bh))) */
+			err = -EIO;
+	}
+	if (likely(!err)) {
+		/* Clear buffer_new on all buffers. */
+		u = 0;
+		do {
+			bh = head = page_buffers(pages[u]);
+			do {
+				if (buffer_new(bh))
+					clear_buffer_new(bh);
+			} while ((bh = bh->b_this_page) != head);
+		} while (++u < nr_pages);
+		ntfs_debug("Done.");
+		return err;
+	}
+	if (status.attr_switched) {
+		/* Get back to the attribute extent we modified. */
+		ntfs_attr_reinit_search_ctx(ctx);
+		if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
+			ntfs_error(vol->sb, "Failed to find required "
+					"attribute extent of attribute in "
+					"error code path.  Run chkdsk to "
+					"recover.");
+			write_lock_irqsave(&ni->size_lock, flags);
+			ni->itype.compressed.size += vol->cluster_size;
+			write_unlock_irqrestore(&ni->size_lock, flags);
+			flush_dcache_mft_record_page(ctx->ntfs_ino);
+			mark_mft_record_dirty(ctx->ntfs_ino);
+			/*
+			 * The only thing that is now wrong is the compressed
+			 * size of the base attribute extent which chkdsk
+			 * should be able to fix.
+			 */
+			NVolSetErrors(vol);
+		} else {
+			m = ctx->mrec;
+			a = ctx->attr;
+			status.attr_switched = 0;
+		}
+	}
+	/*
+	 * If the runlist has been modified, need to restore it by punching a
+	 * hole into it and we then need to deallocate the on-disk cluster as
+	 * well.  Note, we only modify the runlist if we are able to generate a
+	 * new mapping pairs array, i.e. only when the mapped attribute extent
+	 * is not switched.
+	 */
+	if (status.runlist_merged && !status.attr_switched) {
+		BUG_ON(!rl_write_locked);
+		/* Make the file cluster we allocated sparse in the runlist. */
+		if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
+			ntfs_error(vol->sb, "Failed to punch hole into "
+					"attribute runlist in error code "
+					"path.  Run chkdsk to recover the "
+					"lost cluster.");
+			NVolSetErrors(vol);
+		} else /* if (success) */ {
+			status.runlist_merged = 0;
+			/*
+			 * Deallocate the on-disk cluster we allocated but only
+			 * if we succeeded in punching its vcn out of the
+			 * runlist.
+			 */
+			down_write(&vol->lcnbmp_lock);
+			if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
+				ntfs_error(vol->sb, "Failed to release "
+						"allocated cluster in error "
+						"code path.  Run chkdsk to "
+						"recover the lost cluster.");
+				NVolSetErrors(vol);
+			}
+			up_write(&vol->lcnbmp_lock);
+		}
+	}
+	/*
+	 * Resize the attribute record to its old size and rebuild the mapping
+	 * pairs array.  Note, we only can do this if the runlist has been
+	 * restored to its old state which also implies that the mapped
+	 * attribute extent is not switched.
+	 */
+	if (status.mp_rebuilt && !status.runlist_merged) {
+		if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
+			ntfs_error(vol->sb, "Failed to restore attribute "
+					"record in error code path.  Run "
+					"chkdsk to recover.");
+			NVolSetErrors(vol);
+		} else /* if (success) */ {
+			if (ntfs_mapping_pairs_build(vol, (u8*)a +
+					le16_to_cpu(a->data.non_resident.
+					mapping_pairs_offset), attr_rec_len -
+					le16_to_cpu(a->data.non_resident.
+					mapping_pairs_offset), ni->runlist.rl,
+					vcn, highest_vcn, NULL)) {
+				ntfs_error(vol->sb, "Failed to restore "
+						"mapping pairs array in error "
+						"code path.  Run chkdsk to "
+						"recover.");
+				NVolSetErrors(vol);
+			}
+			flush_dcache_mft_record_page(ctx->ntfs_ino);
+			mark_mft_record_dirty(ctx->ntfs_ino);
+		}
+	}
+	/* Release the mft record and the attribute. */
+	if (status.mft_attr_mapped) {
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+	}
+	/* Release the runlist lock. */
+	if (rl_write_locked)
+		up_write(&ni->runlist.lock);
+	else if (rl)
+		up_read(&ni->runlist.lock);
+	/*
+	 * Zero out any newly allocated blocks to avoid exposing stale data.
+	 * If BH_New is set, we know that the block was newly allocated above
+	 * and that it has not been fully zeroed and marked dirty yet.
+	 */
+	nr_pages = u;
+	u = 0;
+	end = bh_cpos << vol->cluster_size_bits;
+	do {
+		page = pages[u];
+		bh = head = page_buffers(page);
+		do {
+			if (u == nr_pages &&
+					((s64)page->index << PAGE_CACHE_SHIFT) +
+					bh_offset(bh) >= end)
+				break;
+			if (!buffer_new(bh))
+				continue;
+			clear_buffer_new(bh);
+			if (!buffer_uptodate(bh)) {
+				if (PageUptodate(page))
+					set_buffer_uptodate(bh);
+				else {
+					zero_user(page, bh_offset(bh),
+							blocksize);
+					set_buffer_uptodate(bh);
+				}
+			}
+			mark_buffer_dirty(bh);
+		} while ((bh = bh->b_this_page) != head);
+	} while (++u <= nr_pages);
+	ntfs_error(vol->sb, "Failed.  Returning error code %i.", err);
+	return err;
+}
+
+static inline void ntfs_flush_dcache_pages(struct page **pages,
+		unsigned nr_pages)
+{
+	BUG_ON(!nr_pages);
+	/*
+	 * Warning: Do not do the decrement at the same time as the call to
+	 * flush_dcache_page() because it is a NULL macro on i386 and hence the
+	 * decrement never happens so the loop never terminates.
+	 */
+	do {
+		--nr_pages;
+		flush_dcache_page(pages[nr_pages]);
+	} while (nr_pages > 0);
+}
+
+/**
+ * ntfs_commit_pages_after_non_resident_write - commit the received data
+ * @pages:	array of destination pages
+ * @nr_pages:	number of pages in @pages
+ * @pos:	byte position in file at which the write begins
+ * @bytes:	number of bytes to be written
+ *
+ * See description of ntfs_commit_pages_after_write(), below.
+ */
+static inline int ntfs_commit_pages_after_non_resident_write(
+		struct page **pages, const unsigned nr_pages,
+		s64 pos, size_t bytes)
+{
+	s64 end, initialized_size;
+	struct inode *vi;
+	ntfs_inode *ni, *base_ni;
+	struct buffer_head *bh, *head;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	unsigned long flags;
+	unsigned blocksize, u;
+	int err;
+
+	vi = pages[0]->mapping->host;
+	ni = NTFS_I(vi);
+	blocksize = vi->i_sb->s_blocksize;
+	end = pos + bytes;
+	u = 0;
+	do {
+		s64 bh_pos;
+		struct page *page;
+		bool partial;
+
+		page = pages[u];
+		bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+		bh = head = page_buffers(page);
+		partial = false;
+		do {
+			s64 bh_end;
+
+			bh_end = bh_pos + blocksize;
+			if (bh_end <= pos || bh_pos >= end) {
+				if (!buffer_uptodate(bh))
+					partial = true;
+			} else {
+				set_buffer_uptodate(bh);
+				mark_buffer_dirty(bh);
+			}
+		} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+		/*
+		 * If all buffers are now uptodate but the page is not, set the
+		 * page uptodate.
+		 */
+		if (!partial && !PageUptodate(page))
+			SetPageUptodate(page);
+	} while (++u < nr_pages);
+	/*
+	 * Finally, if we do not need to update initialized_size or i_size we
+	 * are finished.
+	 */
+	read_lock_irqsave(&ni->size_lock, flags);
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (end <= initialized_size) {
+		ntfs_debug("Done.");
+		return 0;
+	}
+	/*
+	 * Update initialized_size/i_size as appropriate, both in the inode and
+	 * the mft record.
+	 */
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/* Map, pin, and lock the mft record. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	BUG_ON(!NInoNonResident(ni));
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	a = ctx->attr;
+	BUG_ON(!a->non_resident);
+	write_lock_irqsave(&ni->size_lock, flags);
+	BUG_ON(end > ni->allocated_size);
+	ni->initialized_size = end;
+	a->data.non_resident.initialized_size = cpu_to_sle64(end);
+	if (end > i_size_read(vi)) {
+		i_size_write(vi, end);
+		a->data.non_resident.data_size =
+				a->data.non_resident.initialized_size;
+	}
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	ntfs_debug("Done.");
+	return 0;
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
+			"code %i).", err);
+	if (err != -ENOMEM)
+		NVolSetErrors(ni->vol);
+	return err;
+}
+
+/**
+ * ntfs_commit_pages_after_write - commit the received data
+ * @pages:	array of destination pages
+ * @nr_pages:	number of pages in @pages
+ * @pos:	byte position in file at which the write begins
+ * @bytes:	number of bytes to be written
+ *
+ * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
+ * (@pages[0]->mapping->host).  There are @nr_pages pages in @pages which are
+ * locked but not kmap()ped.  The source data has already been copied into the
+ * @page.  ntfs_prepare_pages_for_non_resident_write() has been called before
+ * the data was copied (for non-resident attributes only) and it returned
+ * success.
+ *
+ * Need to set uptodate and mark dirty all buffers within the boundary of the
+ * write.  If all buffers in a page are uptodate we set the page uptodate, too.
+ *
+ * Setting the buffers dirty ensures that they get written out later when
+ * ntfs_writepage() is invoked by the VM.
+ *
+ * Finally, we need to update i_size and initialized_size as appropriate both
+ * in the inode and the mft record.
+ *
+ * This is modelled after fs/buffer.c::generic_commit_write(), which marks
+ * buffers uptodate and dirty, sets the page uptodate if all buffers in the
+ * page are uptodate, and updates i_size if the end of io is beyond i_size.  In
+ * that case, it also marks the inode dirty.
+ *
+ * If things have gone as outlined in
+ * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
+ * content modifications here for non-resident attributes.  For resident
+ * attributes we need to do the uptodate bringing here which we combine with
+ * the copying into the mft record which means we save one atomic kmap.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_commit_pages_after_write(struct page **pages,
+		const unsigned nr_pages, s64 pos, size_t bytes)
+{
+	s64 end, initialized_size;
+	loff_t i_size;
+	struct inode *vi;
+	ntfs_inode *ni, *base_ni;
+	struct page *page;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	char *kattr, *kaddr;
+	unsigned long flags;
+	u32 attr_len;
+	int err;
+
+	BUG_ON(!nr_pages);
+	BUG_ON(!pages);
+	page = pages[0];
+	BUG_ON(!page);
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
+	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
+			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
+			vi->i_ino, ni->type, page->index, nr_pages,
+			(long long)pos, bytes);
+	if (NInoNonResident(ni))
+		return ntfs_commit_pages_after_non_resident_write(pages,
+				nr_pages, pos, bytes);
+	BUG_ON(nr_pages > 1);
+	/*
+	 * Attribute is resident, implying it is not compressed, encrypted, or
+	 * sparse.
+	 */
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	BUG_ON(NInoNonResident(ni));
+	/* Map, pin, and lock the mft record. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	a = ctx->attr;
+	BUG_ON(a->non_resident);
+	/* The total length of the attribute value. */
+	attr_len = le32_to_cpu(a->data.resident.value_length);
+	i_size = i_size_read(vi);
+	BUG_ON(attr_len != i_size);
+	BUG_ON(pos > attr_len);
+	end = pos + bytes;
+	BUG_ON(end > le32_to_cpu(a->length) -
+			le16_to_cpu(a->data.resident.value_offset));
+	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+	kaddr = kmap_atomic(page);
+	/* Copy the received data from the page to the mft record. */
+	memcpy(kattr + pos, kaddr + pos, bytes);
+	/* Update the attribute length if necessary. */
+	if (end > attr_len) {
+		attr_len = end;
+		a->data.resident.value_length = cpu_to_le32(attr_len);
+	}
+	/*
+	 * If the page is not uptodate, bring the out of bounds area(s)
+	 * uptodate by copying data from the mft record to the page.
+	 */
+	if (!PageUptodate(page)) {
+		if (pos > 0)
+			memcpy(kaddr, kattr, pos);
+		if (end < attr_len)
+			memcpy(kaddr + end, kattr + end, attr_len - end);
+		/* Zero the region outside the end of the attribute value. */
+		memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+	kunmap_atomic(kaddr);
+	/* Update initialized_size/i_size if necessary. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	initialized_size = ni->initialized_size;
+	BUG_ON(end > ni->allocated_size);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	BUG_ON(initialized_size != i_size);
+	if (end > initialized_size) {
+		write_lock_irqsave(&ni->size_lock, flags);
+		ni->initialized_size = end;
+		i_size_write(vi, end);
+		write_unlock_irqrestore(&ni->size_lock, flags);
+	}
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	ntfs_debug("Done.");
+	return 0;
+err_out:
+	if (err == -ENOMEM) {
+		ntfs_warning(vi->i_sb, "Error allocating memory required to "
+				"commit the write.");
+		if (PageUptodate(page)) {
+			ntfs_warning(vi->i_sb, "Page is uptodate, setting "
+					"dirty so the write will be retried "
+					"later on by the VM.");
+			/*
+			 * Put the page on mapping->dirty_pages, but leave its
+			 * buffers' dirty state as-is.
+			 */
+			__set_page_dirty_nobuffers(page);
+			err = 0;
+		} else
+			ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
+					"data has been lost.");
+	} else {
+		ntfs_error(vi->i_sb, "Resident attribute commit write failed "
+				"with error %i.", err);
+		NVolSetErrors(ni->vol);
+	}
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	return err;
+}
+
+/*
+ * Copy as much as we can into the pages and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then clear the pages
+ * out to (ofs + bytes) and return the number of bytes which were copied.
+ */
+static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
+		unsigned ofs, struct iov_iter *i, size_t bytes)
+{
+	struct page **last_page = pages + nr_pages;
+	size_t total = 0;
+	struct iov_iter data = *i;
+	unsigned len, copied;
+
+	do {
+		len = PAGE_CACHE_SIZE - ofs;
+		if (len > bytes)
+			len = bytes;
+		copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
+				len);
+		total += copied;
+		bytes -= copied;
+		if (!bytes)
+			break;
+		iov_iter_advance(&data, copied);
+		if (copied < len)
+			goto err;
+		ofs = 0;
+	} while (++pages < last_page);
+out:
+	return total;
+err:
+	/* Zero the rest of the target like __copy_from_user(). */
+	len = PAGE_CACHE_SIZE - copied;
+	do {
+		if (len > bytes)
+			len = bytes;
+		zero_user(*pages, copied, len);
+		bytes -= len;
+		copied = 0;
+		len = PAGE_CACHE_SIZE;
+	} while (++pages < last_page);
+	goto out;
+}
+
+/**
+ * ntfs_perform_write - perform buffered write to a file
+ * @file:	file to write to
+ * @i:		iov_iter with data to write
+ * @pos:	byte offset in file at which to begin writing to
+ */
+static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
+		loff_t pos)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *vi = mapping->host;
+	ntfs_inode *ni = NTFS_I(vi);
+	ntfs_volume *vol = ni->vol;
+	struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
+	struct page *cached_page = NULL;
+	VCN last_vcn;
+	LCN lcn;
+	size_t bytes;
+	ssize_t status, written = 0;
+	unsigned nr_pages;
+
+	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+			"0x%llx, count 0x%lx.", vi->i_ino,
+			(unsigned)le32_to_cpu(ni->type),
+			(unsigned long long)pos,
+			(unsigned long)iov_iter_count(i));
+	/*
+	 * If a previous ntfs_truncate() failed, repeat it and abort if it
+	 * fails again.
+	 */
+	if (unlikely(NInoTruncateFailed(ni))) {
+		int err;
+
+		inode_dio_wait(vi);
+		err = ntfs_truncate(vi);
+		if (err || NInoTruncateFailed(ni)) {
+			if (!err)
+				err = -EIO;
+			ntfs_error(vol->sb, "Cannot perform write to inode "
+					"0x%lx, attribute type 0x%x, because "
+					"ntfs_truncate() failed (error code "
+					"%i).", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+			return err;
+		}
+	}
+	/*
+	 * Determine the number of pages per cluster for non-resident
+	 * attributes.
+	 */
+	nr_pages = 1;
+	if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
+		nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
+	last_vcn = -1;
+	do {
+		VCN vcn;
+		pgoff_t idx, start_idx;
+		unsigned ofs, do_pages, u;
+		size_t copied;
+
+		start_idx = idx = pos >> PAGE_CACHE_SHIFT;
+		ofs = pos & ~PAGE_CACHE_MASK;
+		bytes = PAGE_CACHE_SIZE - ofs;
+		do_pages = 1;
+		if (nr_pages > 1) {
+			vcn = pos >> vol->cluster_size_bits;
+			if (vcn != last_vcn) {
+				last_vcn = vcn;
+				/*
+				 * Get the lcn of the vcn the write is in.  If
+				 * it is a hole, need to lock down all pages in
+				 * the cluster.
+				 */
+				down_read(&ni->runlist.lock);
+				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
+						vol->cluster_size_bits, false);
+				up_read(&ni->runlist.lock);
+				if (unlikely(lcn < LCN_HOLE)) {
+					if (lcn == LCN_ENOMEM)
+						status = -ENOMEM;
+					else {
+						status = -EIO;
+						ntfs_error(vol->sb, "Cannot "
+							"perform write to "
+							"inode 0x%lx, "
+							"attribute type 0x%x, "
+							"because the attribute "
+							"is corrupt.",
+							vi->i_ino, (unsigned)
+							le32_to_cpu(ni->type));
+					}
+					break;
+				}
+				if (lcn == LCN_HOLE) {
+					start_idx = (pos & ~(s64)
+							vol->cluster_size_mask)
+							>> PAGE_CACHE_SHIFT;
+					bytes = vol->cluster_size - (pos &
+							vol->cluster_size_mask);
+					do_pages = nr_pages;
+				}
+			}
+		}
+		if (bytes > iov_iter_count(i))
+			bytes = iov_iter_count(i);
+again:
+		/*
+		 * Bring in the user page(s) that we will copy from _first_.
+		 * Otherwise there is a nasty deadlock on copying from the same
+		 * page(s) as we are writing to, without it/them being marked
+		 * up-to-date.  Note, at present there is nothing to stop the
+		 * pages being swapped out between us bringing them into memory
+		 * and doing the actual copying.
+		 */
+		if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
+			status = -EFAULT;
+			break;
+		}
+		/* Get and lock @do_pages starting at index @start_idx. */
+		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
+				pages, &cached_page);
+		if (unlikely(status))
+			break;
+		/*
+		 * For non-resident attributes, we need to fill any holes with
+		 * actual clusters and ensure all bufferes are mapped.  We also
+		 * need to bring uptodate any buffers that are only partially
+		 * being written to.
+		 */
+		if (NInoNonResident(ni)) {
+			status = ntfs_prepare_pages_for_non_resident_write(
+					pages, do_pages, pos, bytes);
+			if (unlikely(status)) {
+				do {
+					unlock_page(pages[--do_pages]);
+					page_cache_release(pages[do_pages]);
+				} while (do_pages);
+				break;
+			}
+		}
+		u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
+		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
+					i, bytes);
+		ntfs_flush_dcache_pages(pages + u, do_pages - u);
+		status = 0;
+		if (likely(copied == bytes)) {
+			status = ntfs_commit_pages_after_write(pages, do_pages,
+					pos, bytes);
+			if (!status)
+				status = bytes;
+		}
+		do {
+			unlock_page(pages[--do_pages]);
+			page_cache_release(pages[do_pages]);
+		} while (do_pages);
+		if (unlikely(status < 0))
+			break;
+		copied = status;
+		cond_resched();
+		if (unlikely(!copied)) {
+			size_t sc;
+
+			/*
+			 * We failed to copy anything.  Fall back to single
+			 * segment length write.
+			 *
+			 * This is needed to avoid possible livelock in the
+			 * case that all segments in the iov cannot be copied
+			 * at once without a pagefault.
+			 */
+			sc = iov_iter_single_seg_count(i);
+			if (bytes > sc)
+				bytes = sc;
+			goto again;
+		}
+		iov_iter_advance(i, copied);
+		pos += copied;
+		written += copied;
+		balance_dirty_pages_ratelimited(mapping);
+		if (fatal_signal_pending(current)) {
+			status = -EINTR;
+			break;
+		}
+	} while (iov_iter_count(i));
+	if (cached_page)
+		page_cache_release(cached_page);
+	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
+			written ? "written" : "status", (unsigned long)written,
+			(long)status);
+	return written ? written : status;
+}
+
+/**
+ * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
+ * @iocb:	IO state structure
+ * @from:	iov_iter with data to write
+ *
+ * Basically the same as generic_file_write_iter() except that it ends up
+ * up calling ntfs_perform_write() instead of generic_perform_write() and that
+ * O_DIRECT is not implemented.
+ */
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *vi = file_inode(file);
+	ssize_t written = 0;
+	ssize_t err;
+
+	mutex_lock(&vi->i_mutex);
+	/* We can write back this queue in page reclaim. */
+	current->backing_dev_info = inode_to_bdi(vi);
+	err = ntfs_prepare_file_for_write(iocb, from);
+	if (iov_iter_count(from) && !err)
+		written = ntfs_perform_write(file, from, iocb->ki_pos);
+	current->backing_dev_info = NULL;
+	mutex_unlock(&vi->i_mutex);
+	if (likely(written > 0)) {
+		err = generic_write_sync(file, iocb->ki_pos, written);
+		if (err < 0)
+			written = 0;
+	}
+	iocb->ki_pos += written;
+	return written ? written : err;
+}
+
+/**
+ * ntfs_file_fsync - sync a file to disk
+ * @filp:	file to be synced
+ * @datasync:	if non-zero only flush user data and not metadata
+ *
+ * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
+ * system calls.  This function is inspired by fs/buffer.c::file_fsync().
+ *
+ * If @datasync is false, write the mft record and all associated extent mft
+ * records as well as the $DATA attribute and then sync the block device.
+ *
+ * If @datasync is true and the attribute is non-resident, we skip the writing
+ * of the mft record and all associated extent mft records (this might still
+ * happen due to the write_inode_now() call).
+ *
+ * Also, if @datasync is true, we do not wait on the inode to be written out
+ * but we always wait on the page cache pages to be written out.
+ *
+ * Locking: Caller must hold i_mutex on the inode.
+ *
+ * TODO: We should probably also write all attribute/index inodes associated
+ * with this inode but since we have no simple way of getting to them we ignore
+ * this problem for now.
+ */
+static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			   int datasync)
+{
+	struct inode *vi = filp->f_mapping->host;
+	int err, ret = 0;
+
+	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+
+	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&vi->i_mutex);
+
+	BUG_ON(S_ISDIR(vi->i_mode));
+	if (!datasync || !NInoNonResident(NTFS_I(vi)))
+		ret = __ntfs_write_inode(vi, 1);
+	write_inode_now(vi, !datasync);
+	/*
+	 * NOTE: If we were to use mapping->private_list (see ext2 and
+	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
+	 * sync_mapping_buffers(vi->i_mapping).
+	 */
+	err = sync_blockdev(vi->i_sb->s_bdev);
+	if (unlikely(err && !ret))
+		ret = err;
+	if (likely(!ret))
+		ntfs_debug("Done.");
+	else
+		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
+				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
+	mutex_unlock(&vi->i_mutex);
+	return ret;
+}
+
+#endif /* NTFS_RW */
+
+const struct file_operations ntfs_file_ops = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+#ifdef NTFS_RW
+	.write_iter	= ntfs_file_write_iter,
+	.fsync		= ntfs_file_fsync,
+#endif /* NTFS_RW */
+	.mmap		= generic_file_mmap,
+	.open		= ntfs_file_open,
+	.splice_read	= generic_file_splice_read,
+};
+
+const struct inode_operations ntfs_file_inode_ops = {
+#ifdef NTFS_RW
+	.setattr	= ntfs_setattr,
+#endif /* NTFS_RW */
+};
+
+const struct file_operations ntfs_empty_file_ops = {};
+
+const struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/kernel/fs/ntfs/index.c b/kernel/fs/ntfs/index.c
new file mode 100644
index 000000000..096c13569
--- /dev/null
+++ b/kernel/fs/ntfs/index.c
@@ -0,0 +1,454 @@
+/*
+ * index.c - NTFS kernel index handling.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/slab.h>
+
+#include "aops.h"
+#include "collate.h"
+#include "debug.h"
+#include "index.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_index_ctx_get - allocate and initialize a new index context
+ * @idx_ni:	ntfs index inode with which to initialize the context
+ *
+ * Allocate a new index context, initialize it with @idx_ni and return it.
+ * Return NULL if allocation failed.
+ *
+ * Locking:  Caller must hold i_mutex on the index inode.
+ */
+ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
+{
+	ntfs_index_context *ictx;
+
+	ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS);
+	if (ictx)
+		*ictx = (ntfs_index_context){ .idx_ni = idx_ni };
+	return ictx;
+}
+
+/**
+ * ntfs_index_ctx_put - release an index context
+ * @ictx:	index context to free
+ *
+ * Release the index context @ictx, releasing all associated resources.
+ *
+ * Locking:  Caller must hold i_mutex on the index inode.
+ */
+void ntfs_index_ctx_put(ntfs_index_context *ictx)
+{
+	if (ictx->entry) {
+		if (ictx->is_in_root) {
+			if (ictx->actx)
+				ntfs_attr_put_search_ctx(ictx->actx);
+			if (ictx->base_ni)
+				unmap_mft_record(ictx->base_ni);
+		} else {
+			struct page *page = ictx->page;
+			if (page) {
+				BUG_ON(!PageLocked(page));
+				unlock_page(page);
+				ntfs_unmap_page(page);
+			}
+		}
+	}
+	kmem_cache_free(ntfs_index_ctx_cache, ictx);
+	return;
+}
+
+/**
+ * ntfs_index_lookup - find a key in an index and return its index entry
+ * @key:	[IN] key for which to search in the index
+ * @key_len:	[IN] length of @key in bytes
+ * @ictx:	[IN/OUT] context describing the index and the returned entry
+ *
+ * Before calling ntfs_index_lookup(), @ictx must have been obtained from a
+ * call to ntfs_index_ctx_get().
+ *
+ * Look for the @key in the index specified by the index lookup context @ictx.
+ * ntfs_index_lookup() walks the contents of the index looking for the @key.
+ *
+ * If the @key is found in the index, 0 is returned and @ictx is setup to
+ * describe the index entry containing the matching @key.  @ictx->entry is the
+ * index entry and @ictx->data and @ictx->data_len are the index entry data and
+ * its length in bytes, respectively.
+ *
+ * If the @key is not found in the index, -ENOENT is returned and @ictx is
+ * setup to describe the index entry whose key collates immediately after the
+ * search @key, i.e. this is the position in the index at which an index entry
+ * with a key of @key would need to be inserted.
+ *
+ * If an error occurs return the negative error code and @ictx is left
+ * untouched.
+ *
+ * When finished with the entry and its data, call ntfs_index_ctx_put() to free
+ * the context and other associated resources.
+ *
+ * If the index entry was modified, call flush_dcache_index_entry_page()
+ * immediately after the modification and either ntfs_index_entry_mark_dirty()
+ * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
+ * ensure that the changes are written to disk.
+ *
+ * Locking:  - Caller must hold i_mutex on the index inode.
+ *	     - Each page cache page in the index allocation mapping must be
+ *	       locked whilst being accessed otherwise we may find a corrupt
+ *	       page due to it being under ->writepage at the moment which
+ *	       applies the mst protection fixups before writing out and then
+ *	       removes them again after the write is complete after which it 
+ *	       unlocks the page.
+ */
+int ntfs_index_lookup(const void *key, const int key_len,
+		ntfs_index_context *ictx)
+{
+	VCN vcn, old_vcn;
+	ntfs_inode *idx_ni = ictx->idx_ni;
+	ntfs_volume *vol = idx_ni->vol;
+	struct super_block *sb = vol->sb;
+	ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino;
+	MFT_RECORD *m;
+	INDEX_ROOT *ir;
+	INDEX_ENTRY *ie;
+	INDEX_ALLOCATION *ia;
+	u8 *index_end, *kaddr;
+	ntfs_attr_search_ctx *actx;
+	struct address_space *ia_mapping;
+	struct page *page;
+	int rc, err = 0;
+
+	ntfs_debug("Entering.");
+	BUG_ON(!NInoAttr(idx_ni));
+	BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION);
+	BUG_ON(idx_ni->nr_extents != -1);
+	BUG_ON(!base_ni);
+	BUG_ON(!key);
+	BUG_ON(key_len <= 0);
+	if (!ntfs_is_collation_rule_supported(
+			idx_ni->itype.index.collation_rule)) {
+		ntfs_error(sb, "Index uses unsupported collation rule 0x%x.  "
+				"Aborting lookup.", le32_to_cpu(
+				idx_ni->itype.index.collation_rule));
+		return -EOPNOTSUPP;
+	}
+	/* Get hold of the mft record for the index inode. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+				-PTR_ERR(m));
+		return PTR_ERR(m);
+	}
+	actx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!actx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/* Find the index root attribute in the mft record. */
+	err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, actx);
+	if (unlikely(err)) {
+		if (err == -ENOENT) {
+			ntfs_error(sb, "Index root attribute missing in inode "
+					"0x%lx.", idx_ni->mft_no);
+			err = -EIO;
+		}
+		goto err_out;
+	}
+	/* Get to the index root value (it has been verified in read_inode). */
+	ir = (INDEX_ROOT*)((u8*)actx->attr +
+			le16_to_cpu(actx->attr->data.resident.value_offset));
+	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+	/* The first index entry. */
+	ie = (INDEX_ENTRY*)((u8*)&ir->index +
+			le32_to_cpu(ir->index.entries_offset));
+	/*
+	 * Loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry.
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		/* Bounds checks. */
+		if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->length) > index_end)
+			goto idx_err_out;
+		/*
+		 * The last entry cannot contain a key.  It can however contain
+		 * a pointer to a child node in the B+tree so we just break out.
+		 */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/* Further bounds checks. */
+		if ((u32)sizeof(INDEX_ENTRY_HEADER) +
+				le16_to_cpu(ie->key_length) >
+				le16_to_cpu(ie->data.vi.data_offset) ||
+				(u32)le16_to_cpu(ie->data.vi.data_offset) +
+				le16_to_cpu(ie->data.vi.data_length) >
+				le16_to_cpu(ie->length))
+			goto idx_err_out;
+		/* If the keys match perfectly, we setup @ictx and return 0. */
+		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
+				&ie->key, key_len)) {
+ir_done:
+			ictx->is_in_root = true;
+			ictx->ir = ir;
+			ictx->actx = actx;
+			ictx->base_ni = base_ni;
+			ictx->ia = NULL;
+			ictx->page = NULL;
+done:
+			ictx->entry = ie;
+			ictx->data = (u8*)ie +
+					le16_to_cpu(ie->data.vi.data_offset);
+			ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
+			ntfs_debug("Done.");
+			return err;
+		}
+		/*
+		 * Not a perfect match, need to do full blown collation so we
+		 * know which way in the B+tree we have to go.
+		 */
+		rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
+				key_len, &ie->key, le16_to_cpu(ie->key_length));
+		/*
+		 * If @key collates before the key of the current entry, there
+		 * is definitely no such key in this index but we might need to
+		 * descend into the B+tree so we just break out of the loop.
+		 */
+		if (rc == -1)
+			break;
+		/*
+		 * A match should never happen as the memcmp() call should have
+		 * cought it, but we still treat it correctly.
+		 */
+		if (!rc)
+			goto ir_done;
+		/* The keys are not equal, continue the search. */
+	}
+	/*
+	 * We have finished with this index without success.  Check for the
+	 * presence of a child node and if not present setup @ictx and return
+	 * -ENOENT.
+	 */
+	if (!(ie->flags & INDEX_ENTRY_NODE)) {
+		ntfs_debug("Entry not found.");
+		err = -ENOENT;
+		goto ir_done;
+	} /* Child node present, descend into it. */
+	/* Consistency check: Verify that an index allocation exists. */
+	if (!NInoIndexAllocPresent(idx_ni)) {
+		ntfs_error(sb, "No index allocation attribute but index entry "
+				"requires one.  Inode 0x%lx is corrupt or "
+				"driver bug.", idx_ni->mft_no);
+		goto err_out;
+	}
+	/* Get the starting vcn of the index_block holding the child node. */
+	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+	ia_mapping = VFS_I(idx_ni)->i_mapping;
+	/*
+	 * We are done with the index root and the mft record.  Release them,
+	 * otherwise we deadlock with ntfs_map_page().
+	 */
+	ntfs_attr_put_search_ctx(actx);
+	unmap_mft_record(base_ni);
+	m = NULL;
+	actx = NULL;
+descend_into_child_node:
+	/*
+	 * Convert vcn to index into the index allocation attribute in units
+	 * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+	 * disk if necessary.
+	 */
+	page = ntfs_map_page(ia_mapping, vcn <<
+			idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+	if (IS_ERR(page)) {
+		ntfs_error(sb, "Failed to map index page, error %ld.",
+				-PTR_ERR(page));
+		err = PTR_ERR(page);
+		goto err_out;
+	}
+	lock_page(page);
+	kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+	/* Get to the index allocation block. */
+	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+			idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+	/* Bounds checks. */
+	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+		ntfs_error(sb, "Out of bounds check failed.  Corrupt inode "
+				"0x%lx or driver bug.", idx_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* Catch multi sector transfer fixup errors. */
+	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+		ntfs_error(sb, "Index record with vcn 0x%llx is corrupt.  "
+				"Corrupt inode 0x%lx.  Run chkdsk.",
+				(long long)vcn, idx_ni->mft_no);
+		goto unm_err_out;
+	}
+	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+				"different from expected VCN (0x%llx).  Inode "
+				"0x%lx is corrupt or driver bug.",
+				(unsigned long long)
+				sle64_to_cpu(ia->index_block_vcn),
+				(unsigned long long)vcn, idx_ni->mft_no);
+		goto unm_err_out;
+	}
+	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+			idx_ni->itype.index.block_size) {
+		ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has "
+				"a size (%u) differing from the index "
+				"specified size (%u).  Inode is corrupt or "
+				"driver bug.", (unsigned long long)vcn,
+				idx_ni->mft_no,
+				le32_to_cpu(ia->index.allocated_size) + 0x18,
+				idx_ni->itype.index.block_size);
+		goto unm_err_out;
+	}
+	index_end = (u8*)ia + idx_ni->itype.index.block_size;
+	if (index_end > kaddr + PAGE_CACHE_SIZE) {
+		ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
+				"crosses page boundary.  Impossible!  Cannot "
+				"access!  This is probably a bug in the "
+				"driver.", (unsigned long long)vcn,
+				idx_ni->mft_no);
+		goto unm_err_out;
+	}
+	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+	if (index_end > (u8*)ia + idx_ni->itype.index.block_size) {
+		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode "
+				"0x%lx exceeds maximum size.",
+				(unsigned long long)vcn, idx_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* The first index entry. */
+	ie = (INDEX_ENTRY*)((u8*)&ia->index +
+			le32_to_cpu(ia->index.entries_offset));
+	/*
+	 * Iterate similar to above big loop but applied to index buffer, thus
+	 * loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry.
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		/* Bounds checks. */
+		if ((u8*)ie < (u8*)ia || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->length) > index_end) {
+			ntfs_error(sb, "Index entry out of bounds in inode "
+					"0x%lx.", idx_ni->mft_no);
+			goto unm_err_out;
+		}
+		/*
+		 * The last entry cannot contain a key.  It can however contain
+		 * a pointer to a child node in the B+tree so we just break out.
+		 */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/* Further bounds checks. */
+		if ((u32)sizeof(INDEX_ENTRY_HEADER) +
+				le16_to_cpu(ie->key_length) >
+				le16_to_cpu(ie->data.vi.data_offset) ||
+				(u32)le16_to_cpu(ie->data.vi.data_offset) +
+				le16_to_cpu(ie->data.vi.data_length) >
+				le16_to_cpu(ie->length)) {
+			ntfs_error(sb, "Index entry out of bounds in inode "
+					"0x%lx.", idx_ni->mft_no);
+			goto unm_err_out;
+		}
+		/* If the keys match perfectly, we setup @ictx and return 0. */
+		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
+				&ie->key, key_len)) {
+ia_done:
+			ictx->is_in_root = false;
+			ictx->actx = NULL;
+			ictx->base_ni = NULL;
+			ictx->ia = ia;
+			ictx->page = page;
+			goto done;
+		}
+		/*
+		 * Not a perfect match, need to do full blown collation so we
+		 * know which way in the B+tree we have to go.
+		 */
+		rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
+				key_len, &ie->key, le16_to_cpu(ie->key_length));
+		/*
+		 * If @key collates before the key of the current entry, there
+		 * is definitely no such key in this index but we might need to
+		 * descend into the B+tree so we just break out of the loop.
+		 */
+		if (rc == -1)
+			break;
+		/*
+		 * A match should never happen as the memcmp() call should have
+		 * cought it, but we still treat it correctly.
+		 */
+		if (!rc)
+			goto ia_done;
+		/* The keys are not equal, continue the search. */
+	}
+	/*
+	 * We have finished with this index buffer without success.  Check for
+	 * the presence of a child node and if not present return -ENOENT.
+	 */
+	if (!(ie->flags & INDEX_ENTRY_NODE)) {
+		ntfs_debug("Entry not found.");
+		err = -ENOENT;
+		goto ia_done;
+	}
+	if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+		ntfs_error(sb, "Index entry with child node found in a leaf "
+				"node in inode 0x%lx.", idx_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* Child node present, descend into it. */
+	old_vcn = vcn;
+	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+	if (vcn >= 0) {
+		/*
+		 * If vcn is in the same page cache page as old_vcn we recycle
+		 * the mapped page.
+		 */
+		if (old_vcn << vol->cluster_size_bits >>
+				PAGE_CACHE_SHIFT == vcn <<
+				vol->cluster_size_bits >>
+				PAGE_CACHE_SHIFT)
+			goto fast_descend_into_child_node;
+		unlock_page(page);
+		ntfs_unmap_page(page);
+		goto descend_into_child_node;
+	}
+	ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
+			idx_ni->mft_no);
+unm_err_out:
+	unlock_page(page);
+	ntfs_unmap_page(page);
+err_out:
+	if (!err)
+		err = -EIO;
+	if (actx)
+		ntfs_attr_put_search_ctx(actx);
+	if (m)
+		unmap_mft_record(base_ni);
+	return err;
+idx_err_out:
+	ntfs_error(sb, "Corrupt index.  Aborting lookup.");
+	goto err_out;
+}
diff --git a/kernel/fs/ntfs/index.h b/kernel/fs/ntfs/index.h
new file mode 100644
index 000000000..8745469c3
--- /dev/null
+++ b/kernel/fs/ntfs/index.h
@@ -0,0 +1,148 @@
+/*
+ * index.h - Defines for NTFS kernel index handling.  Part of the Linux-NTFS
+ *	     project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_INDEX_H
+#define _LINUX_NTFS_INDEX_H
+
+#include <linux/fs.h>
+
+#include "types.h"
+#include "layout.h"
+#include "inode.h"
+#include "attrib.h"
+#include "mft.h"
+#include "aops.h"
+
+/**
+ * @idx_ni:	index inode containing the @entry described by this context
+ * @entry:	index entry (points into @ir or @ia)
+ * @data:	index entry data (points into @entry)
+ * @data_len:	length in bytes of @data
+ * @is_in_root:	'true' if @entry is in @ir and 'false' if it is in @ia
+ * @ir:		index root if @is_in_root and NULL otherwise
+ * @actx:	attribute search context if @is_in_root and NULL otherwise
+ * @base_ni:	base inode if @is_in_root and NULL otherwise
+ * @ia:		index block if @is_in_root is 'false' and NULL otherwise
+ * @page:	page if @is_in_root is 'false' and NULL otherwise
+ *
+ * @idx_ni is the index inode this context belongs to.
+ *
+ * @entry is the index entry described by this context.  @data and @data_len
+ * are the index entry data and its length in bytes, respectively.  @data
+ * simply points into @entry.  This is probably what the user is interested in.
+ *
+ * If @is_in_root is 'true', @entry is in the index root attribute @ir described
+ * by the attribute search context @actx and the base inode @base_ni.  @ia and
+ * @page are NULL in this case.
+ *
+ * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia
+ * and @page point to the index allocation block and the mapped, locked page it
+ * is in, respectively.  @ir, @actx and @base_ni are NULL in this case.
+ *
+ * To obtain a context call ntfs_index_ctx_get().
+ *
+ * We use this context to allow ntfs_index_lookup() to return the found index
+ * @entry and its @data without having to allocate a buffer and copy the @entry
+ * and/or its @data into it.
+ *
+ * When finished with the @entry and its @data, call ntfs_index_ctx_put() to
+ * free the context and other associated resources.
+ *
+ * If the index entry was modified, call flush_dcache_index_entry_page()
+ * immediately after the modification and either ntfs_index_entry_mark_dirty()
+ * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
+ * ensure that the changes are written to disk.
+ */
+typedef struct {
+	ntfs_inode *idx_ni;
+	INDEX_ENTRY *entry;
+	void *data;
+	u16 data_len;
+	bool is_in_root;
+	INDEX_ROOT *ir;
+	ntfs_attr_search_ctx *actx;
+	ntfs_inode *base_ni;
+	INDEX_ALLOCATION *ia;
+	struct page *page;
+} ntfs_index_context;
+
+extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni);
+extern void ntfs_index_ctx_put(ntfs_index_context *ictx);
+
+extern int ntfs_index_lookup(const void *key, const int key_len,
+		ntfs_index_context *ictx);
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries
+ * @ictx:	ntfs index context describing the index entry
+ *
+ * Call flush_dcache_page() for the page in which an index entry resides.
+ *
+ * This must be called every time an index entry is modified, just after the
+ * modification.
+ *
+ * If the index entry is in the index root attribute, simply flush the page
+ * containing the mft record containing the index root attribute.
+ *
+ * If the index entry is in an index block belonging to the index allocation
+ * attribute, simply flush the page cache page containing the index block.
+ */
+static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
+{
+	if (ictx->is_in_root)
+		flush_dcache_mft_record_page(ictx->actx->ntfs_ino);
+	else
+		flush_dcache_page(ictx->page);
+}
+
+/**
+ * ntfs_index_entry_mark_dirty - mark an index entry dirty
+ * @ictx:	ntfs index context describing the index entry
+ *
+ * Mark the index entry described by the index entry context @ictx dirty.
+ *
+ * If the index entry is in the index root attribute, simply mark the mft
+ * record containing the index root attribute dirty.  This ensures the mft
+ * record, and hence the index root attribute, will be written out to disk
+ * later.
+ *
+ * If the index entry is in an index block belonging to the index allocation
+ * attribute, mark the buffers belonging to the index record as well as the
+ * page cache page the index block is in dirty.  This automatically marks the
+ * VFS inode of the ntfs index inode to which the index entry belongs dirty,
+ * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
+ * dirty index block, will be written out to disk later.
+ */
+static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
+{
+	if (ictx->is_in_root)
+		mark_mft_record_dirty(ictx->actx->ntfs_ino);
+	else
+		mark_ntfs_record_dirty(ictx->page,
+				(u8*)ictx->ia - (u8*)page_address(ictx->page));
+}
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_INDEX_H */
diff --git a/kernel/fs/ntfs/inode.c b/kernel/fs/ntfs/inode.c
new file mode 100644
index 000000000..d284f07ed
--- /dev/null
+++ b/kernel/fs/ntfs/inode.c
@@ -0,0 +1,3111 @@
+/**
+ * inode.c - NTFS kernel inode handling.
+ *
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+
+#include "aops.h"
+#include "attrib.h"
+#include "bitmap.h"
+#include "dir.h"
+#include "debug.h"
+#include "inode.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "time.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_test_inode - compare two (possibly fake) inodes for equality
+ * @vi:		vfs inode which to test
+ * @na:		ntfs attribute which is being tested with
+ *
+ * Compare the ntfs attribute embedded in the ntfs specific part of the vfs
+ * inode @vi for equality with the ntfs attribute @na.
+ *
+ * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
+ * @na->name and @na->name_len are then ignored.
+ *
+ * Return 1 if the attributes match and 0 if not.
+ *
+ * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
+ * allowed to sleep.
+ */
+int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
+{
+	ntfs_inode *ni;
+
+	if (vi->i_ino != na->mft_no)
+		return 0;
+	ni = NTFS_I(vi);
+	/* If !NInoAttr(ni), @vi is a normal file or directory inode. */
+	if (likely(!NInoAttr(ni))) {
+		/* If not looking for a normal inode this is a mismatch. */
+		if (unlikely(na->type != AT_UNUSED))
+			return 0;
+	} else {
+		/* A fake inode describing an attribute. */
+		if (ni->type != na->type)
+			return 0;
+		if (ni->name_len != na->name_len)
+			return 0;
+		if (na->name_len && memcmp(ni->name, na->name,
+				na->name_len * sizeof(ntfschar)))
+			return 0;
+	}
+	/* Match! */
+	return 1;
+}
+
+/**
+ * ntfs_init_locked_inode - initialize an inode
+ * @vi:		vfs inode to initialize
+ * @na:		ntfs attribute which to initialize @vi to
+ *
+ * Initialize the vfs inode @vi with the values from the ntfs attribute @na in
+ * order to enable ntfs_test_inode() to do its work.
+ *
+ * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
+ * In that case, @na->name and @na->name_len should be set to NULL and 0,
+ * respectively. Although that is not strictly necessary as
+ * ntfs_read_locked_inode() will fill them in later.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
+ * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
+ */
+static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
+{
+	ntfs_inode *ni = NTFS_I(vi);
+
+	vi->i_ino = na->mft_no;
+
+	ni->type = na->type;
+	if (na->type == AT_INDEX_ALLOCATION)
+		NInoSetMstProtected(ni);
+
+	ni->name = na->name;
+	ni->name_len = na->name_len;
+
+	/* If initializing a normal inode, we are done. */
+	if (likely(na->type == AT_UNUSED)) {
+		BUG_ON(na->name);
+		BUG_ON(na->name_len);
+		return 0;
+	}
+
+	/* It is a fake inode. */
+	NInoSetAttr(ni);
+
+	/*
+	 * We have I30 global constant as an optimization as it is the name
+	 * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC
+	 * allocation but that is ok. And most attributes are unnamed anyway,
+	 * thus the fraction of named attributes with name != I30 is actually
+	 * absolutely tiny.
+	 */
+	if (na->name_len && na->name != I30) {
+		unsigned int i;
+
+		BUG_ON(!na->name);
+		i = na->name_len * sizeof(ntfschar);
+		ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
+		if (!ni->name)
+			return -ENOMEM;
+		memcpy(ni->name, na->name, i);
+		ni->name[na->name_len] = 0;
+	}
+	return 0;
+}
+
+typedef int (*set_t)(struct inode *, void *);
+static int ntfs_read_locked_inode(struct inode *vi);
+static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
+static int ntfs_read_locked_index_inode(struct inode *base_vi,
+		struct inode *vi);
+
+/**
+ * ntfs_iget - obtain a struct inode corresponding to a specific normal inode
+ * @sb:		super block of mounted volume
+ * @mft_no:	mft record number / inode number to obtain
+ *
+ * Obtain the struct inode corresponding to a specific normal inode (i.e. a
+ * file or directory).
+ *
+ * If the inode is in the cache, it is just returned with an increased
+ * reference count. Otherwise, a new struct inode is allocated and initialized,
+ * and finally ntfs_read_locked_inode() is called to read in the inode and
+ * fill in the remainder of the inode structure.
+ *
+ * Return the struct inode on success. Check the return value with IS_ERR() and
+ * if true, the function failed and the error code is obtained from PTR_ERR().
+ */
+struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
+{
+	struct inode *vi;
+	int err;
+	ntfs_attr na;
+
+	na.mft_no = mft_no;
+	na.type = AT_UNUSED;
+	na.name = NULL;
+	na.name_len = 0;
+
+	vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode,
+			(set_t)ntfs_init_locked_inode, &na);
+	if (unlikely(!vi))
+		return ERR_PTR(-ENOMEM);
+
+	err = 0;
+
+	/* If this is a freshly allocated inode, need to read it now. */
+	if (vi->i_state & I_NEW) {
+		err = ntfs_read_locked_inode(vi);
+		unlock_new_inode(vi);
+	}
+	/*
+	 * There is no point in keeping bad inodes around if the failure was
+	 * due to ENOMEM. We want to be able to retry again later.
+	 */
+	if (unlikely(err == -ENOMEM)) {
+		iput(vi);
+		vi = ERR_PTR(err);
+	}
+	return vi;
+}
+
+/**
+ * ntfs_attr_iget - obtain a struct inode corresponding to an attribute
+ * @base_vi:	vfs base inode containing the attribute
+ * @type:	attribute type
+ * @name:	Unicode name of the attribute (NULL if unnamed)
+ * @name_len:	length of @name in Unicode characters (0 if unnamed)
+ *
+ * Obtain the (fake) struct inode corresponding to the attribute specified by
+ * @type, @name, and @name_len, which is present in the base mft record
+ * specified by the vfs inode @base_vi.
+ *
+ * If the attribute inode is in the cache, it is just returned with an
+ * increased reference count. Otherwise, a new struct inode is allocated and
+ * initialized, and finally ntfs_read_locked_attr_inode() is called to read the
+ * attribute and fill in the inode structure.
+ *
+ * Note, for index allocation attributes, you need to use ntfs_index_iget()
+ * instead of ntfs_attr_iget() as working with indices is a lot more complex.
+ *
+ * Return the struct inode of the attribute inode on success. Check the return
+ * value with IS_ERR() and if true, the function failed and the error code is
+ * obtained from PTR_ERR().
+ */
+struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
+		ntfschar *name, u32 name_len)
+{
+	struct inode *vi;
+	int err;
+	ntfs_attr na;
+
+	/* Make sure no one calls ntfs_attr_iget() for indices. */
+	BUG_ON(type == AT_INDEX_ALLOCATION);
+
+	na.mft_no = base_vi->i_ino;
+	na.type = type;
+	na.name = name;
+	na.name_len = name_len;
+
+	vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
+			(set_t)ntfs_init_locked_inode, &na);
+	if (unlikely(!vi))
+		return ERR_PTR(-ENOMEM);
+
+	err = 0;
+
+	/* If this is a freshly allocated inode, need to read it now. */
+	if (vi->i_state & I_NEW) {
+		err = ntfs_read_locked_attr_inode(base_vi, vi);
+		unlock_new_inode(vi);
+	}
+	/*
+	 * There is no point in keeping bad attribute inodes around. This also
+	 * simplifies things in that we never need to check for bad attribute
+	 * inodes elsewhere.
+	 */
+	if (unlikely(err)) {
+		iput(vi);
+		vi = ERR_PTR(err);
+	}
+	return vi;
+}
+
+/**
+ * ntfs_index_iget - obtain a struct inode corresponding to an index
+ * @base_vi:	vfs base inode containing the index related attributes
+ * @name:	Unicode name of the index
+ * @name_len:	length of @name in Unicode characters
+ *
+ * Obtain the (fake) struct inode corresponding to the index specified by @name
+ * and @name_len, which is present in the base mft record specified by the vfs
+ * inode @base_vi.
+ *
+ * If the index inode is in the cache, it is just returned with an increased
+ * reference count.  Otherwise, a new struct inode is allocated and
+ * initialized, and finally ntfs_read_locked_index_inode() is called to read
+ * the index related attributes and fill in the inode structure.
+ *
+ * Return the struct inode of the index inode on success. Check the return
+ * value with IS_ERR() and if true, the function failed and the error code is
+ * obtained from PTR_ERR().
+ */
+struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
+		u32 name_len)
+{
+	struct inode *vi;
+	int err;
+	ntfs_attr na;
+
+	na.mft_no = base_vi->i_ino;
+	na.type = AT_INDEX_ALLOCATION;
+	na.name = name;
+	na.name_len = name_len;
+
+	vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
+			(set_t)ntfs_init_locked_inode, &na);
+	if (unlikely(!vi))
+		return ERR_PTR(-ENOMEM);
+
+	err = 0;
+
+	/* If this is a freshly allocated inode, need to read it now. */
+	if (vi->i_state & I_NEW) {
+		err = ntfs_read_locked_index_inode(base_vi, vi);
+		unlock_new_inode(vi);
+	}
+	/*
+	 * There is no point in keeping bad index inodes around.  This also
+	 * simplifies things in that we never need to check for bad index
+	 * inodes elsewhere.
+	 */
+	if (unlikely(err)) {
+		iput(vi);
+		vi = ERR_PTR(err);
+	}
+	return vi;
+}
+
+struct inode *ntfs_alloc_big_inode(struct super_block *sb)
+{
+	ntfs_inode *ni;
+
+	ntfs_debug("Entering.");
+	ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS);
+	if (likely(ni != NULL)) {
+		ni->state = 0;
+		return VFS_I(ni);
+	}
+	ntfs_error(sb, "Allocation of NTFS big inode structure failed.");
+	return NULL;
+}
+
+static void ntfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+}
+
+void ntfs_destroy_big_inode(struct inode *inode)
+{
+	ntfs_inode *ni = NTFS_I(inode);
+
+	ntfs_debug("Entering.");
+	BUG_ON(ni->page);
+	if (!atomic_dec_and_test(&ni->count))
+		BUG();
+	call_rcu(&inode->i_rcu, ntfs_i_callback);
+}
+
+static inline ntfs_inode *ntfs_alloc_extent_inode(void)
+{
+	ntfs_inode *ni;
+
+	ntfs_debug("Entering.");
+	ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
+	if (likely(ni != NULL)) {
+		ni->state = 0;
+		return ni;
+	}
+	ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
+	return NULL;
+}
+
+static void ntfs_destroy_extent_inode(ntfs_inode *ni)
+{
+	ntfs_debug("Entering.");
+	BUG_ON(ni->page);
+	if (!atomic_dec_and_test(&ni->count))
+		BUG();
+	kmem_cache_free(ntfs_inode_cache, ni);
+}
+
+/*
+ * The attribute runlist lock has separate locking rules from the
+ * normal runlist lock, so split the two lock-classes:
+ */
+static struct lock_class_key attr_list_rl_lock_class;
+
+/**
+ * __ntfs_init_inode - initialize ntfs specific part of an inode
+ * @sb:		super block of mounted volume
+ * @ni:		freshly allocated ntfs inode which to initialize
+ *
+ * Initialize an ntfs inode to defaults.
+ *
+ * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
+ * untouched. Make sure to initialize them elsewhere.
+ *
+ * Return zero on success and -ENOMEM on error.
+ */
+void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
+{
+	ntfs_debug("Entering.");
+	rwlock_init(&ni->size_lock);
+	ni->initialized_size = ni->allocated_size = 0;
+	ni->seq_no = 0;
+	atomic_set(&ni->count, 1);
+	ni->vol = NTFS_SB(sb);
+	ntfs_init_runlist(&ni->runlist);
+	mutex_init(&ni->mrec_lock);
+	ni->page = NULL;
+	ni->page_ofs = 0;
+	ni->attr_list_size = 0;
+	ni->attr_list = NULL;
+	ntfs_init_runlist(&ni->attr_list_rl);
+	lockdep_set_class(&ni->attr_list_rl.lock,
+				&attr_list_rl_lock_class);
+	ni->itype.index.block_size = 0;
+	ni->itype.index.vcn_size = 0;
+	ni->itype.index.collation_rule = 0;
+	ni->itype.index.block_size_bits = 0;
+	ni->itype.index.vcn_size_bits = 0;
+	mutex_init(&ni->extent_lock);
+	ni->nr_extents = 0;
+	ni->ext.base_ntfs_ino = NULL;
+}
+
+/*
+ * Extent inodes get MFT-mapped in a nested way, while the base inode
+ * is still mapped. Teach this nesting to the lock validator by creating
+ * a separate class for nested inode's mrec_lock's:
+ */
+static struct lock_class_key extent_inode_mrec_lock_key;
+
+inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
+		unsigned long mft_no)
+{
+	ntfs_inode *ni = ntfs_alloc_extent_inode();
+
+	ntfs_debug("Entering.");
+	if (likely(ni != NULL)) {
+		__ntfs_init_inode(sb, ni);
+		lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key);
+		ni->mft_no = mft_no;
+		ni->type = AT_UNUSED;
+		ni->name = NULL;
+		ni->name_len = 0;
+	}
+	return ni;
+}
+
+/**
+ * ntfs_is_extended_system_file - check if a file is in the $Extend directory
+ * @ctx:	initialized attribute search context
+ *
+ * Search all file name attributes in the inode described by the attribute
+ * search context @ctx and check if any of the names are in the $Extend system
+ * directory.
+ *
+ * Return values:
+ *	   1: file is in $Extend directory
+ *	   0: file is not in $Extend directory
+ *    -errno: failed to determine if the file is in the $Extend directory
+ */
+static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx)
+{
+	int nr_links, err;
+
+	/* Restart search. */
+	ntfs_attr_reinit_search_ctx(ctx);
+
+	/* Get number of hard links. */
+	nr_links = le16_to_cpu(ctx->mrec->link_count);
+
+	/* Loop through all hard links. */
+	while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0,
+			ctx))) {
+		FILE_NAME_ATTR *file_name_attr;
+		ATTR_RECORD *attr = ctx->attr;
+		u8 *p, *p2;
+
+		nr_links--;
+		/*
+		 * Maximum sanity checking as we are called on an inode that
+		 * we suspect might be corrupt.
+		 */
+		p = (u8*)attr + le32_to_cpu(attr->length);
+		if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec +
+				le32_to_cpu(ctx->mrec->bytes_in_use)) {
+err_corrupt_attr:
+			ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name "
+					"attribute. You should run chkdsk.");
+			return -EIO;
+		}
+		if (attr->non_resident) {
+			ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file "
+					"name. You should run chkdsk.");
+			return -EIO;
+		}
+		if (attr->flags) {
+			ntfs_error(ctx->ntfs_ino->vol->sb, "File name with "
+					"invalid flags. You should run "
+					"chkdsk.");
+			return -EIO;
+		}
+		if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) {
+			ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file "
+					"name. You should run chkdsk.");
+			return -EIO;
+		}
+		file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
+				le16_to_cpu(attr->data.resident.value_offset));
+		p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length);
+		if (p2 < (u8*)attr || p2 > p)
+			goto err_corrupt_attr;
+		/* This attribute is ok, but is it in the $Extend directory? */
+		if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend)
+			return 1;	/* YES, it's an extended system file. */
+	}
+	if (unlikely(err != -ENOENT))
+		return err;
+	if (unlikely(nr_links)) {
+		ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count "
+				"doesn't match number of name attributes. You "
+				"should run chkdsk.");
+		return -EIO;
+	}
+	return 0;	/* NO, it is not an extended system file. */
+}
+
+/**
+ * ntfs_read_locked_inode - read an inode from its device
+ * @vi:		inode to read
+ *
+ * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode
+ * described by @vi into memory from the device.
+ *
+ * The only fields in @vi that we need to/can look at when the function is
+ * called are i_sb, pointing to the mounted device's super block, and i_ino,
+ * the number of the inode to load.
+ *
+ * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino
+ * for reading and sets up the necessary @vi fields as well as initializing
+ * the ntfs inode.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_NEW set, hence the inode is locked, also
+ *    i_count is set to 1, so it is not going to go away
+ *    i_flags is set to 0 and we have no business touching it.  Only an ioctl()
+ *    is allowed to write to them. We should of course be honouring them but
+ *    we need to do that using the IS_* macros defined in include/linux/fs.h.
+ *    In any case ntfs_read_locked_inode() has nothing to do with i_flags.
+ *
+ * Return 0 on success and -errno on error.  In the error case, the inode will
+ * have had make_bad_inode() executed on it.
+ */
+static int ntfs_read_locked_inode(struct inode *vi)
+{
+	ntfs_volume *vol = NTFS_SB(vi->i_sb);
+	ntfs_inode *ni;
+	struct inode *bvi;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	STANDARD_INFORMATION *si;
+	ntfs_attr_search_ctx *ctx;
+	int err = 0;
+
+	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+
+	/* Setup the generic vfs inode parts now. */
+
+	/*
+	 * This is for checking whether an inode has changed w.r.t. a file so
+	 * that the file can be updated if necessary (compare with f_version).
+	 */
+	vi->i_version = 1;
+
+	vi->i_uid = vol->uid;
+	vi->i_gid = vol->gid;
+	vi->i_mode = 0;
+
+	/*
+	 * Initialize the ntfs specific part of @vi special casing
+	 * FILE_MFT which we need to do at mount time.
+	 */
+	if (vi->i_ino != FILE_MFT)
+		ntfs_init_big_inode(vi);
+	ni = NTFS_I(vi);
+
+	m = map_mft_record(ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(ni, m);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto unm_err_out;
+	}
+
+	if (!(m->flags & MFT_RECORD_IN_USE)) {
+		ntfs_error(vi->i_sb, "Inode is not in use!");
+		goto unm_err_out;
+	}
+	if (m->base_mft_record) {
+		ntfs_error(vi->i_sb, "Inode is an extent inode!");
+		goto unm_err_out;
+	}
+
+	/* Transfer information from mft record into vfs and ntfs inodes. */
+	vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+
+	/*
+	 * FIXME: Keep in mind that link_count is two for files which have both
+	 * a long file name and a short file name as separate entries, so if
+	 * we are hiding short file names this will be too high. Either we need
+	 * to account for the short file names by subtracting them or we need
+	 * to make sure we delete files even though i_nlink is not zero which
+	 * might be tricky due to vfs interactions. Need to think about this
+	 * some more when implementing the unlink command.
+	 */
+	set_nlink(vi, le16_to_cpu(m->link_count));
+	/*
+	 * FIXME: Reparse points can have the directory bit set even though
+	 * they would be S_IFLNK. Need to deal with this further below when we
+	 * implement reparse points / symbolic links but it will do for now.
+	 * Also if not a directory, it could be something else, rather than
+	 * a regular file. But again, will do for now.
+	 */
+	/* Everyone gets all permissions. */
+	vi->i_mode |= S_IRWXUGO;
+	/* If read-only, no one gets write permissions. */
+	if (IS_RDONLY(vi))
+		vi->i_mode &= ~S_IWUGO;
+	if (m->flags & MFT_RECORD_IS_DIRECTORY) {
+		vi->i_mode |= S_IFDIR;
+		/*
+		 * Apply the directory permissions mask set in the mount
+		 * options.
+		 */
+		vi->i_mode &= ~vol->dmask;
+		/* Things break without this kludge! */
+		if (vi->i_nlink > 1)
+			set_nlink(vi, 1);
+	} else {
+		vi->i_mode |= S_IFREG;
+		/* Apply the file permissions mask set in the mount options. */
+		vi->i_mode &= ~vol->fmask;
+	}
+	/*
+	 * Find the standard information attribute in the mft record. At this
+	 * stage we haven't setup the attribute list stuff yet, so this could
+	 * in fact fail if the standard information is in an extent record, but
+	 * I don't think this actually ever happens.
+	 */
+	err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+			ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT) {
+			/*
+			 * TODO: We should be performing a hot fix here (if the
+			 * recover mount option is set) by creating a new
+			 * attribute.
+			 */
+			ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute "
+					"is missing.");
+		}
+		goto unm_err_out;
+	}
+	a = ctx->attr;
+	/* Get the standard information attribute value. */
+	si = (STANDARD_INFORMATION*)((u8*)a +
+			le16_to_cpu(a->data.resident.value_offset));
+
+	/* Transfer information from the standard information into vi. */
+	/*
+	 * Note: The i_?times do not quite map perfectly onto the NTFS times,
+	 * but they are close enough, and in the end it doesn't really matter
+	 * that much...
+	 */
+	/*
+	 * mtime is the last change of the data within the file. Not changed
+	 * when only metadata is changed, e.g. a rename doesn't affect mtime.
+	 */
+	vi->i_mtime = ntfs2utc(si->last_data_change_time);
+	/*
+	 * ctime is the last change of the metadata of the file. This obviously
+	 * always changes, when mtime is changed. ctime can be changed on its
+	 * own, mtime is then not changed, e.g. when a file is renamed.
+	 */
+	vi->i_ctime = ntfs2utc(si->last_mft_change_time);
+	/*
+	 * Last access to the data within the file. Not changed during a rename
+	 * for example but changed whenever the file is written to.
+	 */
+	vi->i_atime = ntfs2utc(si->last_access_time);
+
+	/* Find the attribute list attribute if present. */
+	ntfs_attr_reinit_search_ctx(ctx);
+	err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
+	if (err) {
+		if (unlikely(err != -ENOENT)) {
+			ntfs_error(vi->i_sb, "Failed to lookup attribute list "
+					"attribute.");
+			goto unm_err_out;
+		}
+	} else /* if (!err) */ {
+		if (vi->i_ino == FILE_MFT)
+			goto skip_attr_list_load;
+		ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
+		NInoSetAttrList(ni);
+		a = ctx->attr;
+		if (a->flags & ATTR_COMPRESSION_MASK) {
+			ntfs_error(vi->i_sb, "Attribute list attribute is "
+					"compressed.");
+			goto unm_err_out;
+		}
+		if (a->flags & ATTR_IS_ENCRYPTED ||
+				a->flags & ATTR_IS_SPARSE) {
+			if (a->non_resident) {
+				ntfs_error(vi->i_sb, "Non-resident attribute "
+						"list attribute is encrypted/"
+						"sparse.");
+				goto unm_err_out;
+			}
+			ntfs_warning(vi->i_sb, "Resident attribute list "
+					"attribute in inode 0x%lx is marked "
+					"encrypted/sparse which is not true.  "
+					"However, Windows allows this and "
+					"chkdsk does not detect or correct it "
+					"so we will just ignore the invalid "
+					"flags and pretend they are not set.",
+					vi->i_ino);
+		}
+		/* Now allocate memory for the attribute list. */
+		ni->attr_list_size = (u32)ntfs_attr_size(a);
+		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
+		if (!ni->attr_list) {
+			ntfs_error(vi->i_sb, "Not enough memory to allocate "
+					"buffer for attribute list.");
+			err = -ENOMEM;
+			goto unm_err_out;
+		}
+		if (a->non_resident) {
+			NInoSetAttrListNonResident(ni);
+			if (a->data.non_resident.lowest_vcn) {
+				ntfs_error(vi->i_sb, "Attribute list has non "
+						"zero lowest_vcn.");
+				goto unm_err_out;
+			}
+			/*
+			 * Setup the runlist. No need for locking as we have
+			 * exclusive access to the inode at this time.
+			 */
+			ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
+					a, NULL);
+			if (IS_ERR(ni->attr_list_rl.rl)) {
+				err = PTR_ERR(ni->attr_list_rl.rl);
+				ni->attr_list_rl.rl = NULL;
+				ntfs_error(vi->i_sb, "Mapping pairs "
+						"decompression failed.");
+				goto unm_err_out;
+			}
+			/* Now load the attribute list. */
+			if ((err = load_attribute_list(vol, &ni->attr_list_rl,
+					ni->attr_list, ni->attr_list_size,
+					sle64_to_cpu(a->data.non_resident.
+					initialized_size)))) {
+				ntfs_error(vi->i_sb, "Failed to load "
+						"attribute list attribute.");
+				goto unm_err_out;
+			}
+		} else /* if (!a->non_resident) */ {
+			if ((u8*)a + le16_to_cpu(a->data.resident.value_offset)
+					+ le32_to_cpu(
+					a->data.resident.value_length) >
+					(u8*)ctx->mrec + vol->mft_record_size) {
+				ntfs_error(vi->i_sb, "Corrupt attribute list "
+						"in inode.");
+				goto unm_err_out;
+			}
+			/* Now copy the attribute list. */
+			memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
+					a->data.resident.value_offset),
+					le32_to_cpu(
+					a->data.resident.value_length));
+		}
+	}
+skip_attr_list_load:
+	/*
+	 * If an attribute list is present we now have the attribute list value
+	 * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
+	 */
+	if (S_ISDIR(vi->i_mode)) {
+		loff_t bvi_size;
+		ntfs_inode *bni;
+		INDEX_ROOT *ir;
+		u8 *ir_end, *index_end;
+
+		/* It is a directory, find index root attribute. */
+		ntfs_attr_reinit_search_ctx(ctx);
+		err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE,
+				0, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT) {
+				// FIXME: File is corrupt! Hot-fix with empty
+				// index root attribute if recovery option is
+				// set.
+				ntfs_error(vi->i_sb, "$INDEX_ROOT attribute "
+						"is missing.");
+			}
+			goto unm_err_out;
+		}
+		a = ctx->attr;
+		/* Set up the state. */
+		if (unlikely(a->non_resident)) {
+			ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
+					"resident.");
+			goto unm_err_out;
+		}
+		/* Ensure the attribute name is placed before the value. */
+		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+				le16_to_cpu(a->data.resident.value_offset)))) {
+			ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
+					"placed after the attribute value.");
+			goto unm_err_out;
+		}
+		/*
+		 * Compressed/encrypted index root just means that the newly
+		 * created files in that directory should be created compressed/
+		 * encrypted. However index root cannot be both compressed and
+		 * encrypted.
+		 */
+		if (a->flags & ATTR_COMPRESSION_MASK)
+			NInoSetCompressed(ni);
+		if (a->flags & ATTR_IS_ENCRYPTED) {
+			if (a->flags & ATTR_COMPRESSION_MASK) {
+				ntfs_error(vi->i_sb, "Found encrypted and "
+						"compressed attribute.");
+				goto unm_err_out;
+			}
+			NInoSetEncrypted(ni);
+		}
+		if (a->flags & ATTR_IS_SPARSE)
+			NInoSetSparse(ni);
+		ir = (INDEX_ROOT*)((u8*)a +
+				le16_to_cpu(a->data.resident.value_offset));
+		ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
+		if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
+			ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
+					"corrupt.");
+			goto unm_err_out;
+		}
+		index_end = (u8*)&ir->index +
+				le32_to_cpu(ir->index.index_length);
+		if (index_end > ir_end) {
+			ntfs_error(vi->i_sb, "Directory index is corrupt.");
+			goto unm_err_out;
+		}
+		if (ir->type != AT_FILE_NAME) {
+			ntfs_error(vi->i_sb, "Indexed attribute is not "
+					"$FILE_NAME.");
+			goto unm_err_out;
+		}
+		if (ir->collation_rule != COLLATION_FILE_NAME) {
+			ntfs_error(vi->i_sb, "Index collation rule is not "
+					"COLLATION_FILE_NAME.");
+			goto unm_err_out;
+		}
+		ni->itype.index.collation_rule = ir->collation_rule;
+		ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
+		if (ni->itype.index.block_size &
+				(ni->itype.index.block_size - 1)) {
+			ntfs_error(vi->i_sb, "Index block size (%u) is not a "
+					"power of two.",
+					ni->itype.index.block_size);
+			goto unm_err_out;
+		}
+		if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
+			ntfs_error(vi->i_sb, "Index block size (%u) > "
+					"PAGE_CACHE_SIZE (%ld) is not "
+					"supported.  Sorry.",
+					ni->itype.index.block_size,
+					PAGE_CACHE_SIZE);
+			err = -EOPNOTSUPP;
+			goto unm_err_out;
+		}
+		if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
+			ntfs_error(vi->i_sb, "Index block size (%u) < "
+					"NTFS_BLOCK_SIZE (%i) is not "
+					"supported.  Sorry.",
+					ni->itype.index.block_size,
+					NTFS_BLOCK_SIZE);
+			err = -EOPNOTSUPP;
+			goto unm_err_out;
+		}
+		ni->itype.index.block_size_bits =
+				ffs(ni->itype.index.block_size) - 1;
+		/* Determine the size of a vcn in the directory index. */
+		if (vol->cluster_size <= ni->itype.index.block_size) {
+			ni->itype.index.vcn_size = vol->cluster_size;
+			ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
+		} else {
+			ni->itype.index.vcn_size = vol->sector_size;
+			ni->itype.index.vcn_size_bits = vol->sector_size_bits;
+		}
+
+		/* Setup the index allocation attribute, even if not present. */
+		NInoSetMstProtected(ni);
+		ni->type = AT_INDEX_ALLOCATION;
+		ni->name = I30;
+		ni->name_len = 4;
+
+		if (!(ir->index.flags & LARGE_INDEX)) {
+			/* No index allocation. */
+			vi->i_size = ni->initialized_size =
+					ni->allocated_size = 0;
+			/* We are done with the mft record, so we release it. */
+			ntfs_attr_put_search_ctx(ctx);
+			unmap_mft_record(ni);
+			m = NULL;
+			ctx = NULL;
+			goto skip_large_dir_stuff;
+		} /* LARGE_INDEX: Index allocation present. Setup state. */
+		NInoSetIndexAllocPresent(ni);
+		/* Find index allocation attribute. */
+		ntfs_attr_reinit_search_ctx(ctx);
+		err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4,
+				CASE_SENSITIVE, 0, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT)
+				ntfs_error(vi->i_sb, "$INDEX_ALLOCATION "
+						"attribute is not present but "
+						"$INDEX_ROOT indicated it is.");
+			else
+				ntfs_error(vi->i_sb, "Failed to lookup "
+						"$INDEX_ALLOCATION "
+						"attribute.");
+			goto unm_err_out;
+		}
+		a = ctx->attr;
+		if (!a->non_resident) {
+			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+					"is resident.");
+			goto unm_err_out;
+		}
+		/*
+		 * Ensure the attribute name is placed before the mapping pairs
+		 * array.
+		 */
+		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+				le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset)))) {
+			ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
+					"is placed after the mapping pairs "
+					"array.");
+			goto unm_err_out;
+		}
+		if (a->flags & ATTR_IS_ENCRYPTED) {
+			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+					"is encrypted.");
+			goto unm_err_out;
+		}
+		if (a->flags & ATTR_IS_SPARSE) {
+			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+					"is sparse.");
+			goto unm_err_out;
+		}
+		if (a->flags & ATTR_COMPRESSION_MASK) {
+			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+					"is compressed.");
+			goto unm_err_out;
+		}
+		if (a->data.non_resident.lowest_vcn) {
+			ntfs_error(vi->i_sb, "First extent of "
+					"$INDEX_ALLOCATION attribute has non "
+					"zero lowest_vcn.");
+			goto unm_err_out;
+		}
+		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
+		ni->initialized_size = sle64_to_cpu(
+				a->data.non_resident.initialized_size);
+		ni->allocated_size = sle64_to_cpu(
+				a->data.non_resident.allocated_size);
+		/*
+		 * We are done with the mft record, so we release it. Otherwise
+		 * we would deadlock in ntfs_attr_iget().
+		 */
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(ni);
+		m = NULL;
+		ctx = NULL;
+		/* Get the index bitmap attribute inode. */
+		bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
+		if (IS_ERR(bvi)) {
+			ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
+			err = PTR_ERR(bvi);
+			goto unm_err_out;
+		}
+		bni = NTFS_I(bvi);
+		if (NInoCompressed(bni) || NInoEncrypted(bni) ||
+				NInoSparse(bni)) {
+			ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
+					"and/or encrypted and/or sparse.");
+			goto iput_unm_err_out;
+		}
+		/* Consistency check bitmap size vs. index allocation size. */
+		bvi_size = i_size_read(bvi);
+		if ((bvi_size << 3) < (vi->i_size >>
+				ni->itype.index.block_size_bits)) {
+			ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
+					"for index allocation (0x%llx).",
+					bvi_size << 3, vi->i_size);
+			goto iput_unm_err_out;
+		}
+		/* No longer need the bitmap attribute inode. */
+		iput(bvi);
+skip_large_dir_stuff:
+		/* Setup the operations for this inode. */
+		vi->i_op = &ntfs_dir_inode_ops;
+		vi->i_fop = &ntfs_dir_ops;
+		vi->i_mapping->a_ops = &ntfs_mst_aops;
+	} else {
+		/* It is a file. */
+		ntfs_attr_reinit_search_ctx(ctx);
+
+		/* Setup the data attribute, even if not present. */
+		ni->type = AT_DATA;
+		ni->name = NULL;
+		ni->name_len = 0;
+
+		/* Find first extent of the unnamed data attribute. */
+		err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx);
+		if (unlikely(err)) {
+			vi->i_size = ni->initialized_size =
+					ni->allocated_size = 0;
+			if (err != -ENOENT) {
+				ntfs_error(vi->i_sb, "Failed to lookup $DATA "
+						"attribute.");
+				goto unm_err_out;
+			}
+			/*
+			 * FILE_Secure does not have an unnamed $DATA
+			 * attribute, so we special case it here.
+			 */
+			if (vi->i_ino == FILE_Secure)
+				goto no_data_attr_special_case;
+			/*
+			 * Most if not all the system files in the $Extend
+			 * system directory do not have unnamed data
+			 * attributes so we need to check if the parent
+			 * directory of the file is FILE_Extend and if it is
+			 * ignore this error. To do this we need to get the
+			 * name of this inode from the mft record as the name
+			 * contains the back reference to the parent directory.
+			 */
+			if (ntfs_is_extended_system_file(ctx) > 0)
+				goto no_data_attr_special_case;
+			// FIXME: File is corrupt! Hot-fix with empty data
+			// attribute if recovery option is set.
+			ntfs_error(vi->i_sb, "$DATA attribute is missing.");
+			goto unm_err_out;
+		}
+		a = ctx->attr;
+		/* Setup the state. */
+		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+			if (a->flags & ATTR_COMPRESSION_MASK) {
+				NInoSetCompressed(ni);
+				if (vol->cluster_size > 4096) {
+					ntfs_error(vi->i_sb, "Found "
+							"compressed data but "
+							"compression is "
+							"disabled due to "
+							"cluster size (%i) > "
+							"4kiB.",
+							vol->cluster_size);
+					goto unm_err_out;
+				}
+				if ((a->flags & ATTR_COMPRESSION_MASK)
+						!= ATTR_IS_COMPRESSED) {
+					ntfs_error(vi->i_sb, "Found unknown "
+							"compression method "
+							"or corrupt file.");
+					goto unm_err_out;
+				}
+			}
+			if (a->flags & ATTR_IS_SPARSE)
+				NInoSetSparse(ni);
+		}
+		if (a->flags & ATTR_IS_ENCRYPTED) {
+			if (NInoCompressed(ni)) {
+				ntfs_error(vi->i_sb, "Found encrypted and "
+						"compressed data.");
+				goto unm_err_out;
+			}
+			NInoSetEncrypted(ni);
+		}
+		if (a->non_resident) {
+			NInoSetNonResident(ni);
+			if (NInoCompressed(ni) || NInoSparse(ni)) {
+				if (NInoCompressed(ni) && a->data.non_resident.
+						compression_unit != 4) {
+					ntfs_error(vi->i_sb, "Found "
+							"non-standard "
+							"compression unit (%u "
+							"instead of 4).  "
+							"Cannot handle this.",
+							a->data.non_resident.
+							compression_unit);
+					err = -EOPNOTSUPP;
+					goto unm_err_out;
+				}
+				if (a->data.non_resident.compression_unit) {
+					ni->itype.compressed.block_size = 1U <<
+							(a->data.non_resident.
+							compression_unit +
+							vol->cluster_size_bits);
+					ni->itype.compressed.block_size_bits =
+							ffs(ni->itype.
+							compressed.
+							block_size) - 1;
+					ni->itype.compressed.block_clusters =
+							1U << a->data.
+							non_resident.
+							compression_unit;
+				} else {
+					ni->itype.compressed.block_size = 0;
+					ni->itype.compressed.block_size_bits =
+							0;
+					ni->itype.compressed.block_clusters =
+							0;
+				}
+				ni->itype.compressed.size = sle64_to_cpu(
+						a->data.non_resident.
+						compressed_size);
+			}
+			if (a->data.non_resident.lowest_vcn) {
+				ntfs_error(vi->i_sb, "First extent of $DATA "
+						"attribute has non zero "
+						"lowest_vcn.");
+				goto unm_err_out;
+			}
+			vi->i_size = sle64_to_cpu(
+					a->data.non_resident.data_size);
+			ni->initialized_size = sle64_to_cpu(
+					a->data.non_resident.initialized_size);
+			ni->allocated_size = sle64_to_cpu(
+					a->data.non_resident.allocated_size);
+		} else { /* Resident attribute. */
+			vi->i_size = ni->initialized_size = le32_to_cpu(
+					a->data.resident.value_length);
+			ni->allocated_size = le32_to_cpu(a->length) -
+					le16_to_cpu(
+					a->data.resident.value_offset);
+			if (vi->i_size > ni->allocated_size) {
+				ntfs_error(vi->i_sb, "Resident data attribute "
+						"is corrupt (size exceeds "
+						"allocation).");
+				goto unm_err_out;
+			}
+		}
+no_data_attr_special_case:
+		/* We are done with the mft record, so we release it. */
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(ni);
+		m = NULL;
+		ctx = NULL;
+		/* Setup the operations for this inode. */
+		vi->i_op = &ntfs_file_inode_ops;
+		vi->i_fop = &ntfs_file_ops;
+		vi->i_mapping->a_ops = &ntfs_normal_aops;
+		if (NInoMstProtected(ni))
+			vi->i_mapping->a_ops = &ntfs_mst_aops;
+		else if (NInoCompressed(ni))
+			vi->i_mapping->a_ops = &ntfs_compressed_aops;
+	}
+	/*
+	 * The number of 512-byte blocks used on disk (for stat). This is in so
+	 * far inaccurate as it doesn't account for any named streams or other
+	 * special non-resident attributes, but that is how Windows works, too,
+	 * so we are at least consistent with Windows, if not entirely
+	 * consistent with the Linux Way. Doing it the Linux Way would cause a
+	 * significant slowdown as it would involve iterating over all
+	 * attributes in the mft record and adding the allocated/compressed
+	 * sizes of all non-resident attributes present to give us the Linux
+	 * correct size that should go into i_blocks (after division by 512).
+	 */
+	if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni)))
+		vi->i_blocks = ni->itype.compressed.size >> 9;
+	else
+		vi->i_blocks = ni->allocated_size >> 9;
+	ntfs_debug("Done.");
+	return 0;
+iput_unm_err_out:
+	iput(bvi);
+unm_err_out:
+	if (!err)
+		err = -EIO;
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(ni);
+err_out:
+	ntfs_error(vol->sb, "Failed with error code %i.  Marking corrupt "
+			"inode 0x%lx as bad.  Run chkdsk.", err, vi->i_ino);
+	make_bad_inode(vi);
+	if (err != -EOPNOTSUPP && err != -ENOMEM)
+		NVolSetErrors(vol);
+	return err;
+}
+
+/**
+ * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
+ * @base_vi:	base inode
+ * @vi:		attribute inode to read
+ *
+ * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
+ * attribute inode described by @vi into memory from the base mft record
+ * described by @base_ni.
+ *
+ * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
+ * reading and looks up the attribute described by @vi before setting up the
+ * necessary fields in @vi as well as initializing the ntfs inode.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_NEW set, hence the inode is locked, also
+ *    i_count is set to 1, so it is not going to go away
+ *
+ * Return 0 on success and -errno on error.  In the error case, the inode will
+ * have had make_bad_inode() executed on it.
+ *
+ * Note this cannot be called for AT_INDEX_ALLOCATION.
+ */
+static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
+{
+	ntfs_volume *vol = NTFS_SB(vi->i_sb);
+	ntfs_inode *ni, *base_ni;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx;
+	int err = 0;
+
+	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+
+	ntfs_init_big_inode(vi);
+
+	ni	= NTFS_I(vi);
+	base_ni = NTFS_I(base_vi);
+
+	/* Just mirror the values from the base inode. */
+	vi->i_version	= base_vi->i_version;
+	vi->i_uid	= base_vi->i_uid;
+	vi->i_gid	= base_vi->i_gid;
+	set_nlink(vi, base_vi->i_nlink);
+	vi->i_mtime	= base_vi->i_mtime;
+	vi->i_ctime	= base_vi->i_ctime;
+	vi->i_atime	= base_vi->i_atime;
+	vi->i_generation = ni->seq_no = base_ni->seq_no;
+
+	/* Set inode type to zero but preserve permissions. */
+	vi->i_mode	= base_vi->i_mode & ~S_IFMT;
+
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto unm_err_out;
+	}
+	/* Find the attribute. */
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err))
+		goto unm_err_out;
+	a = ctx->attr;
+	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+		if (a->flags & ATTR_COMPRESSION_MASK) {
+			NInoSetCompressed(ni);
+			if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
+					ni->name_len)) {
+				ntfs_error(vi->i_sb, "Found compressed "
+						"non-data or named data "
+						"attribute.  Please report "
+						"you saw this message to "
+						"linux-ntfs-dev@lists."
+						"sourceforge.net");
+				goto unm_err_out;
+			}
+			if (vol->cluster_size > 4096) {
+				ntfs_error(vi->i_sb, "Found compressed "
+						"attribute but compression is "
+						"disabled due to cluster size "
+						"(%i) > 4kiB.",
+						vol->cluster_size);
+				goto unm_err_out;
+			}
+			if ((a->flags & ATTR_COMPRESSION_MASK) !=
+					ATTR_IS_COMPRESSED) {
+				ntfs_error(vi->i_sb, "Found unknown "
+						"compression method.");
+				goto unm_err_out;
+			}
+		}
+		/*
+		 * The compressed/sparse flag set in an index root just means
+		 * to compress all files.
+		 */
+		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is %s.  Please "
+					"report you saw this message to "
+					"linux-ntfs-dev@lists.sourceforge.net",
+					NInoCompressed(ni) ? "compressed" :
+					"sparse");
+			goto unm_err_out;
+		}
+		if (a->flags & ATTR_IS_SPARSE)
+			NInoSetSparse(ni);
+	}
+	if (a->flags & ATTR_IS_ENCRYPTED) {
+		if (NInoCompressed(ni)) {
+			ntfs_error(vi->i_sb, "Found encrypted and compressed "
+					"data.");
+			goto unm_err_out;
+		}
+		/*
+		 * The encryption flag set in an index root just means to
+		 * encrypt all files.
+		 */
+		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is encrypted.  "
+					"Please report you saw this message "
+					"to linux-ntfs-dev@lists.sourceforge."
+					"net");
+			goto unm_err_out;
+		}
+		if (ni->type != AT_DATA) {
+			ntfs_error(vi->i_sb, "Found encrypted non-data "
+					"attribute.");
+			goto unm_err_out;
+		}
+		NInoSetEncrypted(ni);
+	}
+	if (!a->non_resident) {
+		/* Ensure the attribute name is placed before the value. */
+		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+				le16_to_cpu(a->data.resident.value_offset)))) {
+			ntfs_error(vol->sb, "Attribute name is placed after "
+					"the attribute value.");
+			goto unm_err_out;
+		}
+		if (NInoMstProtected(ni)) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is resident.  "
+					"Please report you saw this message to "
+					"linux-ntfs-dev@lists.sourceforge.net");
+			goto unm_err_out;
+		}
+		vi->i_size = ni->initialized_size = le32_to_cpu(
+				a->data.resident.value_length);
+		ni->allocated_size = le32_to_cpu(a->length) -
+				le16_to_cpu(a->data.resident.value_offset);
+		if (vi->i_size > ni->allocated_size) {
+			ntfs_error(vi->i_sb, "Resident attribute is corrupt "
+					"(size exceeds allocation).");
+			goto unm_err_out;
+		}
+	} else {
+		NInoSetNonResident(ni);
+		/*
+		 * Ensure the attribute name is placed before the mapping pairs
+		 * array.
+		 */
+		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+				le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset)))) {
+			ntfs_error(vol->sb, "Attribute name is placed after "
+					"the mapping pairs array.");
+			goto unm_err_out;
+		}
+		if (NInoCompressed(ni) || NInoSparse(ni)) {
+			if (NInoCompressed(ni) && a->data.non_resident.
+					compression_unit != 4) {
+				ntfs_error(vi->i_sb, "Found non-standard "
+						"compression unit (%u instead "
+						"of 4).  Cannot handle this.",
+						a->data.non_resident.
+						compression_unit);
+				err = -EOPNOTSUPP;
+				goto unm_err_out;
+			}
+			if (a->data.non_resident.compression_unit) {
+				ni->itype.compressed.block_size = 1U <<
+						(a->data.non_resident.
+						compression_unit +
+						vol->cluster_size_bits);
+				ni->itype.compressed.block_size_bits =
+						ffs(ni->itype.compressed.
+						block_size) - 1;
+				ni->itype.compressed.block_clusters = 1U <<
+						a->data.non_resident.
+						compression_unit;
+			} else {
+				ni->itype.compressed.block_size = 0;
+				ni->itype.compressed.block_size_bits = 0;
+				ni->itype.compressed.block_clusters = 0;
+			}
+			ni->itype.compressed.size = sle64_to_cpu(
+					a->data.non_resident.compressed_size);
+		}
+		if (a->data.non_resident.lowest_vcn) {
+			ntfs_error(vi->i_sb, "First extent of attribute has "
+					"non-zero lowest_vcn.");
+			goto unm_err_out;
+		}
+		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
+		ni->initialized_size = sle64_to_cpu(
+				a->data.non_resident.initialized_size);
+		ni->allocated_size = sle64_to_cpu(
+				a->data.non_resident.allocated_size);
+	}
+	vi->i_mapping->a_ops = &ntfs_normal_aops;
+	if (NInoMstProtected(ni))
+		vi->i_mapping->a_ops = &ntfs_mst_aops;
+	else if (NInoCompressed(ni))
+		vi->i_mapping->a_ops = &ntfs_compressed_aops;
+	if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
+		vi->i_blocks = ni->itype.compressed.size >> 9;
+	else
+		vi->i_blocks = ni->allocated_size >> 9;
+	/*
+	 * Make sure the base inode does not go away and attach it to the
+	 * attribute inode.
+	 */
+	igrab(base_vi);
+	ni->ext.base_ntfs_ino = base_ni;
+	ni->nr_extents = -1;
+
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+
+	ntfs_debug("Done.");
+	return 0;
+
+unm_err_out:
+	if (!err)
+		err = -EIO;
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+err_out:
+	ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
+			"inode (mft_no 0x%lx, type 0x%x, name_len %i).  "
+			"Marking corrupt inode and base inode 0x%lx as bad.  "
+			"Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
+			base_vi->i_ino);
+	make_bad_inode(vi);
+	if (err != -ENOMEM)
+		NVolSetErrors(vol);
+	return err;
+}
+
+/**
+ * ntfs_read_locked_index_inode - read an index inode from its base inode
+ * @base_vi:	base inode
+ * @vi:		index inode to read
+ *
+ * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
+ * index inode described by @vi into memory from the base mft record described
+ * by @base_ni.
+ *
+ * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
+ * reading and looks up the attributes relating to the index described by @vi
+ * before setting up the necessary fields in @vi as well as initializing the
+ * ntfs inode.
+ *
+ * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
+ * with the attribute type set to AT_INDEX_ALLOCATION.  Apart from that, they
+ * are setup like directory inodes since directories are a special case of
+ * indices ao they need to be treated in much the same way.  Most importantly,
+ * for small indices the index allocation attribute might not actually exist.
+ * However, the index root attribute always exists but this does not need to
+ * have an inode associated with it and this is why we define a new inode type
+ * index.  Also, like for directories, we need to have an attribute inode for
+ * the bitmap attribute corresponding to the index allocation attribute and we
+ * can store this in the appropriate field of the inode, just like we do for
+ * normal directory inodes.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_NEW set, hence the inode is locked, also
+ *    i_count is set to 1, so it is not going to go away
+ *
+ * Return 0 on success and -errno on error.  In the error case, the inode will
+ * have had make_bad_inode() executed on it.
+ */
+static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
+{
+	loff_t bvi_size;
+	ntfs_volume *vol = NTFS_SB(vi->i_sb);
+	ntfs_inode *ni, *base_ni, *bni;
+	struct inode *bvi;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx;
+	INDEX_ROOT *ir;
+	u8 *ir_end, *index_end;
+	int err = 0;
+
+	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+	ntfs_init_big_inode(vi);
+	ni	= NTFS_I(vi);
+	base_ni = NTFS_I(base_vi);
+	/* Just mirror the values from the base inode. */
+	vi->i_version	= base_vi->i_version;
+	vi->i_uid	= base_vi->i_uid;
+	vi->i_gid	= base_vi->i_gid;
+	set_nlink(vi, base_vi->i_nlink);
+	vi->i_mtime	= base_vi->i_mtime;
+	vi->i_ctime	= base_vi->i_ctime;
+	vi->i_atime	= base_vi->i_atime;
+	vi->i_generation = ni->seq_no = base_ni->seq_no;
+	/* Set inode type to zero but preserve permissions. */
+	vi->i_mode	= base_vi->i_mode & ~S_IFMT;
+	/* Map the mft record for the base inode. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto unm_err_out;
+	}
+	/* Find the index root attribute. */
+	err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
+					"missing.");
+		goto unm_err_out;
+	}
+	a = ctx->attr;
+	/* Set up the state. */
+	if (unlikely(a->non_resident)) {
+		ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
+		goto unm_err_out;
+	}
+	/* Ensure the attribute name is placed before the value. */
+	if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+			le16_to_cpu(a->data.resident.value_offset)))) {
+		ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
+				"after the attribute value.");
+		goto unm_err_out;
+	}
+	/*
+	 * Compressed/encrypted/sparse index root is not allowed, except for
+	 * directories of course but those are not dealt with here.
+	 */
+	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
+			ATTR_IS_SPARSE)) {
+		ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
+				"root attribute.");
+		goto unm_err_out;
+	}
+	ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset));
+	ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
+	if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
+		ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
+		goto unm_err_out;
+	}
+	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+	if (index_end > ir_end) {
+		ntfs_error(vi->i_sb, "Index is corrupt.");
+		goto unm_err_out;
+	}
+	if (ir->type) {
+		ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).",
+				le32_to_cpu(ir->type));
+		goto unm_err_out;
+	}
+	ni->itype.index.collation_rule = ir->collation_rule;
+	ntfs_debug("Index collation rule is 0x%x.",
+			le32_to_cpu(ir->collation_rule));
+	ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
+	if (!is_power_of_2(ni->itype.index.block_size)) {
+		ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
+				"two.", ni->itype.index.block_size);
+		goto unm_err_out;
+	}
+	if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
+		ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE "
+				"(%ld) is not supported.  Sorry.",
+				ni->itype.index.block_size, PAGE_CACHE_SIZE);
+		err = -EOPNOTSUPP;
+		goto unm_err_out;
+	}
+	if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
+		ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE "
+				"(%i) is not supported.  Sorry.",
+				ni->itype.index.block_size, NTFS_BLOCK_SIZE);
+		err = -EOPNOTSUPP;
+		goto unm_err_out;
+	}
+	ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1;
+	/* Determine the size of a vcn in the index. */
+	if (vol->cluster_size <= ni->itype.index.block_size) {
+		ni->itype.index.vcn_size = vol->cluster_size;
+		ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
+	} else {
+		ni->itype.index.vcn_size = vol->sector_size;
+		ni->itype.index.vcn_size_bits = vol->sector_size_bits;
+	}
+	/* Check for presence of index allocation attribute. */
+	if (!(ir->index.flags & LARGE_INDEX)) {
+		/* No index allocation. */
+		vi->i_size = ni->initialized_size = ni->allocated_size = 0;
+		/* We are done with the mft record, so we release it. */
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+		m = NULL;
+		ctx = NULL;
+		goto skip_large_index_stuff;
+	} /* LARGE_INDEX:  Index allocation present.  Setup state. */
+	NInoSetIndexAllocPresent(ni);
+	/* Find index allocation attribute. */
+	ntfs_attr_reinit_search_ctx(ctx);
+	err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+					"not present but $INDEX_ROOT "
+					"indicated it is.");
+		else
+			ntfs_error(vi->i_sb, "Failed to lookup "
+					"$INDEX_ALLOCATION attribute.");
+		goto unm_err_out;
+	}
+	a = ctx->attr;
+	if (!a->non_resident) {
+		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+				"resident.");
+		goto unm_err_out;
+	}
+	/*
+	 * Ensure the attribute name is placed before the mapping pairs array.
+	 */
+	if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+			le16_to_cpu(
+			a->data.non_resident.mapping_pairs_offset)))) {
+		ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
+				"placed after the mapping pairs array.");
+		goto unm_err_out;
+	}
+	if (a->flags & ATTR_IS_ENCRYPTED) {
+		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+				"encrypted.");
+		goto unm_err_out;
+	}
+	if (a->flags & ATTR_IS_SPARSE) {
+		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
+		goto unm_err_out;
+	}
+	if (a->flags & ATTR_COMPRESSION_MASK) {
+		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+				"compressed.");
+		goto unm_err_out;
+	}
+	if (a->data.non_resident.lowest_vcn) {
+		ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
+				"attribute has non zero lowest_vcn.");
+		goto unm_err_out;
+	}
+	vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
+	ni->initialized_size = sle64_to_cpu(
+			a->data.non_resident.initialized_size);
+	ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size);
+	/*
+	 * We are done with the mft record, so we release it.  Otherwise
+	 * we would deadlock in ntfs_attr_iget().
+	 */
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	m = NULL;
+	ctx = NULL;
+	/* Get the index bitmap attribute inode. */
+	bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
+	if (IS_ERR(bvi)) {
+		ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
+		err = PTR_ERR(bvi);
+		goto unm_err_out;
+	}
+	bni = NTFS_I(bvi);
+	if (NInoCompressed(bni) || NInoEncrypted(bni) ||
+			NInoSparse(bni)) {
+		ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
+				"encrypted and/or sparse.");
+		goto iput_unm_err_out;
+	}
+	/* Consistency check bitmap size vs. index allocation size. */
+	bvi_size = i_size_read(bvi);
+	if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
+		ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
+				"index allocation (0x%llx).", bvi_size << 3,
+				vi->i_size);
+		goto iput_unm_err_out;
+	}
+	iput(bvi);
+skip_large_index_stuff:
+	/* Setup the operations for this index inode. */
+	vi->i_mapping->a_ops = &ntfs_mst_aops;
+	vi->i_blocks = ni->allocated_size >> 9;
+	/*
+	 * Make sure the base inode doesn't go away and attach it to the
+	 * index inode.
+	 */
+	igrab(base_vi);
+	ni->ext.base_ntfs_ino = base_ni;
+	ni->nr_extents = -1;
+
+	ntfs_debug("Done.");
+	return 0;
+iput_unm_err_out:
+	iput(bvi);
+unm_err_out:
+	if (!err)
+		err = -EIO;
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+err_out:
+	ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
+			"inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino,
+			ni->name_len);
+	make_bad_inode(vi);
+	if (err != -EOPNOTSUPP && err != -ENOMEM)
+		NVolSetErrors(vol);
+	return err;
+}
+
+/*
+ * The MFT inode has special locking, so teach the lock validator
+ * about this by splitting off the locking rules of the MFT from
+ * the locking rules of other inodes. The MFT inode can never be
+ * accessed from the VFS side (or even internally), only by the
+ * map_mft functions.
+ */
+static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
+
+/**
+ * ntfs_read_inode_mount - special read_inode for mount time use only
+ * @vi:		inode to read
+ *
+ * Read inode FILE_MFT at mount time, only called with super_block lock
+ * held from within the read_super() code path.
+ *
+ * This function exists because when it is called the page cache for $MFT/$DATA
+ * is not initialized and hence we cannot get at the contents of mft records
+ * by calling map_mft_record*().
+ *
+ * Further it needs to cope with the circular references problem, i.e. cannot
+ * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
+ * we do not know where the other extent mft records are yet and again, because
+ * we cannot call map_mft_record*() yet.  Obviously this applies only when an
+ * attribute list is actually present in $MFT inode.
+ *
+ * We solve these problems by starting with the $DATA attribute before anything
+ * else and iterating using ntfs_attr_lookup($DATA) over all extents.  As each
+ * extent is found, we ntfs_mapping_pairs_decompress() including the implied
+ * ntfs_runlists_merge().  Each step of the iteration necessarily provides
+ * sufficient information for the next step to complete.
+ *
+ * This should work but there are two possible pit falls (see inline comments
+ * below), but only time will tell if they are real pits or just smoke...
+ */
+int ntfs_read_inode_mount(struct inode *vi)
+{
+	VCN next_vcn, last_vcn, highest_vcn;
+	s64 block;
+	struct super_block *sb = vi->i_sb;
+	ntfs_volume *vol = NTFS_SB(sb);
+	struct buffer_head *bh;
+	ntfs_inode *ni;
+	MFT_RECORD *m = NULL;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx;
+	unsigned int i, nr_blocks;
+	int err;
+
+	ntfs_debug("Entering.");
+
+	/* Initialize the ntfs specific part of @vi. */
+	ntfs_init_big_inode(vi);
+
+	ni = NTFS_I(vi);
+
+	/* Setup the data attribute. It is special as it is mst protected. */
+	NInoSetNonResident(ni);
+	NInoSetMstProtected(ni);
+	NInoSetSparseDisabled(ni);
+	ni->type = AT_DATA;
+	ni->name = NULL;
+	ni->name_len = 0;
+	/*
+	 * This sets up our little cheat allowing us to reuse the async read io
+	 * completion handler for directories.
+	 */
+	ni->itype.index.block_size = vol->mft_record_size;
+	ni->itype.index.block_size_bits = vol->mft_record_size_bits;
+
+	/* Very important! Needed to be able to call map_mft_record*(). */
+	vol->mft_ino = vi;
+
+	/* Allocate enough memory to read the first mft record. */
+	if (vol->mft_record_size > 64 * 1024) {
+		ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).",
+				vol->mft_record_size);
+		goto err_out;
+	}
+	i = vol->mft_record_size;
+	if (i < sb->s_blocksize)
+		i = sb->s_blocksize;
+	m = (MFT_RECORD*)ntfs_malloc_nofs(i);
+	if (!m) {
+		ntfs_error(sb, "Failed to allocate buffer for $MFT record 0.");
+		goto err_out;
+	}
+
+	/* Determine the first block of the $MFT/$DATA attribute. */
+	block = vol->mft_lcn << vol->cluster_size_bits >>
+			sb->s_blocksize_bits;
+	nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits;
+	if (!nr_blocks)
+		nr_blocks = 1;
+
+	/* Load $MFT/$DATA's first mft record. */
+	for (i = 0; i < nr_blocks; i++) {
+		bh = sb_bread(sb, block++);
+		if (!bh) {
+			ntfs_error(sb, "Device read failed.");
+			goto err_out;
+		}
+		memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data,
+				sb->s_blocksize);
+		brelse(bh);
+	}
+
+	/* Apply the mst fixups. */
+	if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
+		/* FIXME: Try to use the $MFTMirr now. */
+		ntfs_error(sb, "MST fixup failed. $MFT is corrupt.");
+		goto err_out;
+	}
+
+	/* Need this to sanity check attribute list references to $MFT. */
+	vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+
+	/* Provides readpage() and sync_page() for map_mft_record(). */
+	vi->i_mapping->a_ops = &ntfs_mst_aops;
+
+	ctx = ntfs_attr_get_search_ctx(ni, m);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	/* Find the attribute list attribute if present. */
+	err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
+	if (err) {
+		if (unlikely(err != -ENOENT)) {
+			ntfs_error(sb, "Failed to lookup attribute list "
+					"attribute. You should run chkdsk.");
+			goto put_err_out;
+		}
+	} else /* if (!err) */ {
+		ATTR_LIST_ENTRY *al_entry, *next_al_entry;
+		u8 *al_end;
+		static const char *es = "  Not allowed.  $MFT is corrupt.  "
+				"You should run chkdsk.";
+
+		ntfs_debug("Attribute list attribute found in $MFT.");
+		NInoSetAttrList(ni);
+		a = ctx->attr;
+		if (a->flags & ATTR_COMPRESSION_MASK) {
+			ntfs_error(sb, "Attribute list attribute is "
+					"compressed.%s", es);
+			goto put_err_out;
+		}
+		if (a->flags & ATTR_IS_ENCRYPTED ||
+				a->flags & ATTR_IS_SPARSE) {
+			if (a->non_resident) {
+				ntfs_error(sb, "Non-resident attribute list "
+						"attribute is encrypted/"
+						"sparse.%s", es);
+				goto put_err_out;
+			}
+			ntfs_warning(sb, "Resident attribute list attribute "
+					"in $MFT system file is marked "
+					"encrypted/sparse which is not true.  "
+					"However, Windows allows this and "
+					"chkdsk does not detect or correct it "
+					"so we will just ignore the invalid "
+					"flags and pretend they are not set.");
+		}
+		/* Now allocate memory for the attribute list. */
+		ni->attr_list_size = (u32)ntfs_attr_size(a);
+		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
+		if (!ni->attr_list) {
+			ntfs_error(sb, "Not enough memory to allocate buffer "
+					"for attribute list.");
+			goto put_err_out;
+		}
+		if (a->non_resident) {
+			NInoSetAttrListNonResident(ni);
+			if (a->data.non_resident.lowest_vcn) {
+				ntfs_error(sb, "Attribute list has non zero "
+						"lowest_vcn. $MFT is corrupt. "
+						"You should run chkdsk.");
+				goto put_err_out;
+			}
+			/* Setup the runlist. */
+			ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
+					a, NULL);
+			if (IS_ERR(ni->attr_list_rl.rl)) {
+				err = PTR_ERR(ni->attr_list_rl.rl);
+				ni->attr_list_rl.rl = NULL;
+				ntfs_error(sb, "Mapping pairs decompression "
+						"failed with error code %i.",
+						-err);
+				goto put_err_out;
+			}
+			/* Now load the attribute list. */
+			if ((err = load_attribute_list(vol, &ni->attr_list_rl,
+					ni->attr_list, ni->attr_list_size,
+					sle64_to_cpu(a->data.
+					non_resident.initialized_size)))) {
+				ntfs_error(sb, "Failed to load attribute list "
+						"attribute with error code %i.",
+						-err);
+				goto put_err_out;
+			}
+		} else /* if (!ctx.attr->non_resident) */ {
+			if ((u8*)a + le16_to_cpu(
+					a->data.resident.value_offset) +
+					le32_to_cpu(
+					a->data.resident.value_length) >
+					(u8*)ctx->mrec + vol->mft_record_size) {
+				ntfs_error(sb, "Corrupt attribute list "
+						"attribute.");
+				goto put_err_out;
+			}
+			/* Now copy the attribute list. */
+			memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
+					a->data.resident.value_offset),
+					le32_to_cpu(
+					a->data.resident.value_length));
+		}
+		/* The attribute list is now setup in memory. */
+		/*
+		 * FIXME: I don't know if this case is actually possible.
+		 * According to logic it is not possible but I have seen too
+		 * many weird things in MS software to rely on logic... Thus we
+		 * perform a manual search and make sure the first $MFT/$DATA
+		 * extent is in the base inode. If it is not we abort with an
+		 * error and if we ever see a report of this error we will need
+		 * to do some magic in order to have the necessary mft record
+		 * loaded and in the right place in the page cache. But
+		 * hopefully logic will prevail and this never happens...
+		 */
+		al_entry = (ATTR_LIST_ENTRY*)ni->attr_list;
+		al_end = (u8*)al_entry + ni->attr_list_size;
+		for (;; al_entry = next_al_entry) {
+			/* Out of bounds check. */
+			if ((u8*)al_entry < ni->attr_list ||
+					(u8*)al_entry > al_end)
+				goto em_put_err_out;
+			/* Catch the end of the attribute list. */
+			if ((u8*)al_entry == al_end)
+				goto em_put_err_out;
+			if (!al_entry->length)
+				goto em_put_err_out;
+			if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
+					le16_to_cpu(al_entry->length) > al_end)
+				goto em_put_err_out;
+			next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
+					le16_to_cpu(al_entry->length));
+			if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
+				goto em_put_err_out;
+			if (AT_DATA != al_entry->type)
+				continue;
+			/* We want an unnamed attribute. */
+			if (al_entry->name_length)
+				goto em_put_err_out;
+			/* Want the first entry, i.e. lowest_vcn == 0. */
+			if (al_entry->lowest_vcn)
+				goto em_put_err_out;
+			/* First entry has to be in the base mft record. */
+			if (MREF_LE(al_entry->mft_reference) != vi->i_ino) {
+				/* MFT references do not match, logic fails. */
+				ntfs_error(sb, "BUG: The first $DATA extent "
+						"of $MFT is not in the base "
+						"mft record. Please report "
+						"you saw this message to "
+						"linux-ntfs-dev@lists."
+						"sourceforge.net");
+				goto put_err_out;
+			} else {
+				/* Sequence numbers must match. */
+				if (MSEQNO_LE(al_entry->mft_reference) !=
+						ni->seq_no)
+					goto em_put_err_out;
+				/* Got it. All is ok. We can stop now. */
+				break;
+			}
+		}
+	}
+
+	ntfs_attr_reinit_search_ctx(ctx);
+
+	/* Now load all attribute extents. */
+	a = NULL;
+	next_vcn = last_vcn = highest_vcn = 0;
+	while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
+			ctx))) {
+		runlist_element *nrl;
+
+		/* Cache the current attribute. */
+		a = ctx->attr;
+		/* $MFT must be non-resident. */
+		if (!a->non_resident) {
+			ntfs_error(sb, "$MFT must be non-resident but a "
+					"resident extent was found. $MFT is "
+					"corrupt. Run chkdsk.");
+			goto put_err_out;
+		}
+		/* $MFT must be uncompressed and unencrypted. */
+		if (a->flags & ATTR_COMPRESSION_MASK ||
+				a->flags & ATTR_IS_ENCRYPTED ||
+				a->flags & ATTR_IS_SPARSE) {
+			ntfs_error(sb, "$MFT must be uncompressed, "
+					"non-sparse, and unencrypted but a "
+					"compressed/sparse/encrypted extent "
+					"was found. $MFT is corrupt. Run "
+					"chkdsk.");
+			goto put_err_out;
+		}
+		/*
+		 * Decompress the mapping pairs array of this extent and merge
+		 * the result into the existing runlist. No need for locking
+		 * as we have exclusive access to the inode at this time and we
+		 * are a mount in progress task, too.
+		 */
+		nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
+		if (IS_ERR(nrl)) {
+			ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
+					"failed with error code %ld.  $MFT is "
+					"corrupt.", PTR_ERR(nrl));
+			goto put_err_out;
+		}
+		ni->runlist.rl = nrl;
+
+		/* Are we in the first extent? */
+		if (!next_vcn) {
+			if (a->data.non_resident.lowest_vcn) {
+				ntfs_error(sb, "First extent of $DATA "
+						"attribute has non zero "
+						"lowest_vcn. $MFT is corrupt. "
+						"You should run chkdsk.");
+				goto put_err_out;
+			}
+			/* Get the last vcn in the $DATA attribute. */
+			last_vcn = sle64_to_cpu(
+					a->data.non_resident.allocated_size)
+					>> vol->cluster_size_bits;
+			/* Fill in the inode size. */
+			vi->i_size = sle64_to_cpu(
+					a->data.non_resident.data_size);
+			ni->initialized_size = sle64_to_cpu(
+					a->data.non_resident.initialized_size);
+			ni->allocated_size = sle64_to_cpu(
+					a->data.non_resident.allocated_size);
+			/*
+			 * Verify the number of mft records does not exceed
+			 * 2^32 - 1.
+			 */
+			if ((vi->i_size >> vol->mft_record_size_bits) >=
+					(1ULL << 32)) {
+				ntfs_error(sb, "$MFT is too big! Aborting.");
+				goto put_err_out;
+			}
+			/*
+			 * We have got the first extent of the runlist for
+			 * $MFT which means it is now relatively safe to call
+			 * the normal ntfs_read_inode() function.
+			 * Complete reading the inode, this will actually
+			 * re-read the mft record for $MFT, this time entering
+			 * it into the page cache with which we complete the
+			 * kick start of the volume. It should be safe to do
+			 * this now as the first extent of $MFT/$DATA is
+			 * already known and we would hope that we don't need
+			 * further extents in order to find the other
+			 * attributes belonging to $MFT. Only time will tell if
+			 * this is really the case. If not we will have to play
+			 * magic at this point, possibly duplicating a lot of
+			 * ntfs_read_inode() at this point. We will need to
+			 * ensure we do enough of its work to be able to call
+			 * ntfs_read_inode() on extents of $MFT/$DATA. But lets
+			 * hope this never happens...
+			 */
+			ntfs_read_locked_inode(vi);
+			if (is_bad_inode(vi)) {
+				ntfs_error(sb, "ntfs_read_inode() of $MFT "
+						"failed. BUG or corrupt $MFT. "
+						"Run chkdsk and if no errors "
+						"are found, please report you "
+						"saw this message to "
+						"linux-ntfs-dev@lists."
+						"sourceforge.net");
+				ntfs_attr_put_search_ctx(ctx);
+				/* Revert to the safe super operations. */
+				ntfs_free(m);
+				return -1;
+			}
+			/*
+			 * Re-initialize some specifics about $MFT's inode as
+			 * ntfs_read_inode() will have set up the default ones.
+			 */
+			/* Set uid and gid to root. */
+			vi->i_uid = GLOBAL_ROOT_UID;
+			vi->i_gid = GLOBAL_ROOT_GID;
+			/* Regular file. No access for anyone. */
+			vi->i_mode = S_IFREG;
+			/* No VFS initiated operations allowed for $MFT. */
+			vi->i_op = &ntfs_empty_inode_ops;
+			vi->i_fop = &ntfs_empty_file_ops;
+		}
+
+		/* Get the lowest vcn for the next extent. */
+		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		next_vcn = highest_vcn + 1;
+
+		/* Only one extent or error, which we catch below. */
+		if (next_vcn <= 0)
+			break;
+
+		/* Avoid endless loops due to corruption. */
+		if (next_vcn < sle64_to_cpu(
+				a->data.non_resident.lowest_vcn)) {
+			ntfs_error(sb, "$MFT has corrupt attribute list "
+					"attribute. Run chkdsk.");
+			goto put_err_out;
+		}
+	}
+	if (err != -ENOENT) {
+		ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
+				"$MFT is corrupt. Run chkdsk.");
+		goto put_err_out;
+	}
+	if (!a) {
+		ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
+				"corrupt. Run chkdsk.");
+		goto put_err_out;
+	}
+	if (highest_vcn && highest_vcn != last_vcn - 1) {
+		ntfs_error(sb, "Failed to load the complete runlist for "
+				"$MFT/$DATA. Driver bug or corrupt $MFT. "
+				"Run chkdsk.");
+		ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
+				(unsigned long long)highest_vcn,
+				(unsigned long long)last_vcn - 1);
+		goto put_err_out;
+	}
+	ntfs_attr_put_search_ctx(ctx);
+	ntfs_debug("Done.");
+	ntfs_free(m);
+
+	/*
+	 * Split the locking rules of the MFT inode from the
+	 * locking rules of other inodes:
+	 */
+	lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
+	lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
+
+	return 0;
+
+em_put_err_out:
+	ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
+			"attribute list. $MFT is corrupt. Run chkdsk.");
+put_err_out:
+	ntfs_attr_put_search_ctx(ctx);
+err_out:
+	ntfs_error(sb, "Failed. Marking inode as bad.");
+	make_bad_inode(vi);
+	ntfs_free(m);
+	return -1;
+}
+
+static void __ntfs_clear_inode(ntfs_inode *ni)
+{
+	/* Free all alocated memory. */
+	down_write(&ni->runlist.lock);
+	if (ni->runlist.rl) {
+		ntfs_free(ni->runlist.rl);
+		ni->runlist.rl = NULL;
+	}
+	up_write(&ni->runlist.lock);
+
+	if (ni->attr_list) {
+		ntfs_free(ni->attr_list);
+		ni->attr_list = NULL;
+	}
+
+	down_write(&ni->attr_list_rl.lock);
+	if (ni->attr_list_rl.rl) {
+		ntfs_free(ni->attr_list_rl.rl);
+		ni->attr_list_rl.rl = NULL;
+	}
+	up_write(&ni->attr_list_rl.lock);
+
+	if (ni->name_len && ni->name != I30) {
+		/* Catch bugs... */
+		BUG_ON(!ni->name);
+		kfree(ni->name);
+	}
+}
+
+void ntfs_clear_extent_inode(ntfs_inode *ni)
+{
+	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+
+	BUG_ON(NInoAttr(ni));
+	BUG_ON(ni->nr_extents != -1);
+
+#ifdef NTFS_RW
+	if (NInoDirty(ni)) {
+		if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
+			ntfs_error(ni->vol->sb, "Clearing dirty extent inode!  "
+					"Losing data!  This is a BUG!!!");
+		// FIXME:  Do something!!!
+	}
+#endif /* NTFS_RW */
+
+	__ntfs_clear_inode(ni);
+
+	/* Bye, bye... */
+	ntfs_destroy_extent_inode(ni);
+}
+
+/**
+ * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
+ * @vi:		vfs inode pending annihilation
+ *
+ * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
+ * is called, which deallocates all memory belonging to the NTFS specific part
+ * of the inode and returns.
+ *
+ * If the MFT record is dirty, we commit it before doing anything else.
+ */
+void ntfs_evict_big_inode(struct inode *vi)
+{
+	ntfs_inode *ni = NTFS_I(vi);
+
+	truncate_inode_pages_final(&vi->i_data);
+	clear_inode(vi);
+
+#ifdef NTFS_RW
+	if (NInoDirty(ni)) {
+		bool was_bad = (is_bad_inode(vi));
+
+		/* Committing the inode also commits all extent inodes. */
+		ntfs_commit_inode(vi);
+
+		if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
+			ntfs_error(vi->i_sb, "Failed to commit dirty inode "
+					"0x%lx.  Losing data!", vi->i_ino);
+			// FIXME:  Do something!!!
+		}
+	}
+#endif /* NTFS_RW */
+
+	/* No need to lock at this stage as no one else has a reference. */
+	if (ni->nr_extents > 0) {
+		int i;
+
+		for (i = 0; i < ni->nr_extents; i++)
+			ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
+		kfree(ni->ext.extent_ntfs_inos);
+	}
+
+	__ntfs_clear_inode(ni);
+
+	if (NInoAttr(ni)) {
+		/* Release the base inode if we are holding it. */
+		if (ni->nr_extents == -1) {
+			iput(VFS_I(ni->ext.base_ntfs_ino));
+			ni->nr_extents = 0;
+			ni->ext.base_ntfs_ino = NULL;
+		}
+	}
+	return;
+}
+
+/**
+ * ntfs_show_options - show mount options in /proc/mounts
+ * @sf:		seq_file in which to write our mount options
+ * @root:	root of the mounted tree whose mount options to display
+ *
+ * Called by the VFS once for each mounted ntfs volume when someone reads
+ * /proc/mounts in order to display the NTFS specific mount options of each
+ * mount. The mount options of fs specified by @root are written to the seq file
+ * @sf and success is returned.
+ */
+int ntfs_show_options(struct seq_file *sf, struct dentry *root)
+{
+	ntfs_volume *vol = NTFS_SB(root->d_sb);
+	int i;
+
+	seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid));
+	seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid));
+	if (vol->fmask == vol->dmask)
+		seq_printf(sf, ",umask=0%o", vol->fmask);
+	else {
+		seq_printf(sf, ",fmask=0%o", vol->fmask);
+		seq_printf(sf, ",dmask=0%o", vol->dmask);
+	}
+	seq_printf(sf, ",nls=%s", vol->nls_map->charset);
+	if (NVolCaseSensitive(vol))
+		seq_printf(sf, ",case_sensitive");
+	if (NVolShowSystemFiles(vol))
+		seq_printf(sf, ",show_sys_files");
+	if (!NVolSparseEnabled(vol))
+		seq_printf(sf, ",disable_sparse");
+	for (i = 0; on_errors_arr[i].val; i++) {
+		if (on_errors_arr[i].val & vol->on_errors)
+			seq_printf(sf, ",errors=%s", on_errors_arr[i].str);
+	}
+	seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier);
+	return 0;
+}
+
+#ifdef NTFS_RW
+
+static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
+		"chkdsk.";
+
+/**
+ * ntfs_truncate - called when the i_size of an ntfs inode is changed
+ * @vi:		inode for which the i_size was changed
+ *
+ * We only support i_size changes for normal files at present, i.e. not
+ * compressed and not encrypted.  This is enforced in ntfs_setattr(), see
+ * below.
+ *
+ * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
+ * that the change is allowed.
+ *
+ * This implies for us that @vi is a file inode rather than a directory, index,
+ * or attribute inode as well as that @vi is a base inode.
+ *
+ * Returns 0 on success or -errno on error.
+ *
+ * Called with ->i_mutex held.
+ */
+int ntfs_truncate(struct inode *vi)
+{
+	s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
+	VCN highest_vcn;
+	unsigned long flags;
+	ntfs_inode *base_ni, *ni = NTFS_I(vi);
+	ntfs_volume *vol = ni->vol;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	const char *te = "  Leaving file length out of sync with i_size.";
+	int err, mp_size, size_change, alloc_change;
+	u32 attr_len;
+
+	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+	BUG_ON(NInoAttr(ni));
+	BUG_ON(S_ISDIR(vi->i_mode));
+	BUG_ON(NInoMstProtected(ni));
+	BUG_ON(ni->nr_extents < 0);
+retry_truncate:
+	/*
+	 * Lock the runlist for writing and map the mft record to ensure it is
+	 * safe to mess with the attribute runlist and sizes.
+	 */
+	down_write(&ni->runlist.lock);
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
+				"(error code %d).%s", vi->i_ino, err, te);
+		ctx = NULL;
+		m = NULL;
+		goto old_bad_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		ntfs_error(vi->i_sb, "Failed to allocate a search context for "
+				"inode 0x%lx (not enough memory).%s",
+				vi->i_ino, te);
+		err = -ENOMEM;
+		goto old_bad_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT) {
+			ntfs_error(vi->i_sb, "Open attribute is missing from "
+					"mft record.  Inode 0x%lx is corrupt.  "
+					"Run chkdsk.%s", vi->i_ino, te);
+			err = -EIO;
+		} else
+			ntfs_error(vi->i_sb, "Failed to lookup attribute in "
+					"inode 0x%lx (error code %d).%s",
+					vi->i_ino, err, te);
+		goto old_bad_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	/*
+	 * The i_size of the vfs inode is the new size for the attribute value.
+	 */
+	new_size = i_size_read(vi);
+	/* The current size of the attribute value is the old size. */
+	old_size = ntfs_attr_size(a);
+	/* Calculate the new allocated size. */
+	if (NInoNonResident(ni))
+		new_alloc_size = (new_size + vol->cluster_size - 1) &
+				~(s64)vol->cluster_size_mask;
+	else
+		new_alloc_size = (new_size + 7) & ~7;
+	/* The current allocated size is the old allocated size. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	old_alloc_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * The change in the file size.  This will be 0 if no change, >0 if the
+	 * size is growing, and <0 if the size is shrinking.
+	 */
+	size_change = -1;
+	if (new_size - old_size >= 0) {
+		size_change = 1;
+		if (new_size == old_size)
+			size_change = 0;
+	}
+	/* As above for the allocated size. */
+	alloc_change = -1;
+	if (new_alloc_size - old_alloc_size >= 0) {
+		alloc_change = 1;
+		if (new_alloc_size == old_alloc_size)
+			alloc_change = 0;
+	}
+	/*
+	 * If neither the size nor the allocation are being changed there is
+	 * nothing to do.
+	 */
+	if (!size_change && !alloc_change)
+		goto unm_done;
+	/* If the size is changing, check if new size is allowed in $AttrDef. */
+	if (size_change) {
+		err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
+		if (unlikely(err)) {
+			if (err == -ERANGE) {
+				ntfs_error(vol->sb, "Truncate would cause the "
+						"inode 0x%lx to %simum size "
+						"for its attribute type "
+						"(0x%x).  Aborting truncate.",
+						vi->i_ino,
+						new_size > old_size ? "exceed "
+						"the max" : "go under the min",
+						le32_to_cpu(ni->type));
+				err = -EFBIG;
+			} else {
+				ntfs_error(vol->sb, "Inode 0x%lx has unknown "
+						"attribute type 0x%x.  "
+						"Aborting truncate.",
+						vi->i_ino,
+						le32_to_cpu(ni->type));
+				err = -EIO;
+			}
+			/* Reset the vfs inode size to the old size. */
+			i_size_write(vi, old_size);
+			goto err_out;
+		}
+	}
+	if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+		ntfs_warning(vi->i_sb, "Changes in inode size are not "
+				"supported yet for %s files, ignoring.",
+				NInoCompressed(ni) ? "compressed" :
+				"encrypted");
+		err = -EOPNOTSUPP;
+		goto bad_out;
+	}
+	if (a->non_resident)
+		goto do_non_resident_truncate;
+	BUG_ON(NInoNonResident(ni));
+	/* Resize the attribute record to best fit the new attribute size. */
+	if (new_size < vol->mft_record_size &&
+			!ntfs_resident_attr_value_resize(m, a, new_size)) {
+		/* The resize succeeded! */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		write_lock_irqsave(&ni->size_lock, flags);
+		/* Update the sizes in the ntfs inode and all is done. */
+		ni->allocated_size = le32_to_cpu(a->length) -
+				le16_to_cpu(a->data.resident.value_offset);
+		/*
+		 * Note ntfs_resident_attr_value_resize() has already done any
+		 * necessary data clearing in the attribute record.  When the
+		 * file is being shrunk vmtruncate() will already have cleared
+		 * the top part of the last partial page, i.e. since this is
+		 * the resident case this is the page with index 0.  However,
+		 * when the file is being expanded, the page cache page data
+		 * between the old data_size, i.e. old_size, and the new_size
+		 * has not been zeroed.  Fortunately, we do not need to zero it
+		 * either since on one hand it will either already be zero due
+		 * to both readpage and writepage clearing partial page data
+		 * beyond i_size in which case there is nothing to do or in the
+		 * case of the file being mmap()ped at the same time, POSIX
+		 * specifies that the behaviour is unspecified thus we do not
+		 * have to do anything.  This means that in our implementation
+		 * in the rare case that the file is mmap()ped and a write
+		 * occurred into the mmap()ped region just beyond the file size
+		 * and writepage has not yet been called to write out the page
+		 * (which would clear the area beyond the file size) and we now
+		 * extend the file size to incorporate this dirty region
+		 * outside the file size, a write of the page would result in
+		 * this data being written to disk instead of being cleared.
+		 * Given both POSIX and the Linux mmap(2) man page specify that
+		 * this corner case is undefined, we choose to leave it like
+		 * that as this is much simpler for us as we cannot lock the
+		 * relevant page now since we are holding too many ntfs locks
+		 * which would result in a lock reversal deadlock.
+		 */
+		ni->initialized_size = new_size;
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		goto unm_done;
+	}
+	/* If the above resize failed, this must be an attribute extension. */
+	BUG_ON(size_change < 0);
+	/*
+	 * We have to drop all the locks so we can call
+	 * ntfs_attr_make_non_resident().  This could be optimised by try-
+	 * locking the first page cache page and only if that fails dropping
+	 * the locks, locking the page, and redoing all the locking and
+	 * lookups.  While this would be a huge optimisation, it is not worth
+	 * it as this is definitely a slow code path as it only ever can happen
+	 * once for any given file.
+	 */
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	/*
+	 * Not enough space in the mft record, try to make the attribute
+	 * non-resident and if successful restart the truncation process.
+	 */
+	err = ntfs_attr_make_non_resident(ni, old_size);
+	if (likely(!err))
+		goto retry_truncate;
+	/*
+	 * Could not make non-resident.  If this is due to this not being
+	 * permitted for this attribute type or there not being enough space,
+	 * try to make other attributes non-resident.  Otherwise fail.
+	 */
+	if (unlikely(err != -EPERM && err != -ENOSPC)) {
+		ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
+				"type 0x%x, because the conversion from "
+				"resident to non-resident attribute failed "
+				"with error code %i.", vi->i_ino,
+				(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM)
+			err = -EIO;
+		goto conv_err_out;
+	}
+	/* TODO: Not implemented from here, abort. */
+	if (err == -ENOSPC)
+		ntfs_error(vol->sb, "Not enough space in the mft record/on "
+				"disk for the non-resident attribute value.  "
+				"This case is not implemented yet.");
+	else /* if (err == -EPERM) */
+		ntfs_error(vol->sb, "This attribute type may not be "
+				"non-resident.  This case is not implemented "
+				"yet.");
+	err = -EOPNOTSUPP;
+	goto conv_err_out;
+#if 0
+	// TODO: Attempt to make other attributes non-resident.
+	if (!err)
+		goto do_resident_extend;
+	/*
+	 * Both the attribute list attribute and the standard information
+	 * attribute must remain in the base inode.  Thus, if this is one of
+	 * these attributes, we have to try to move other attributes out into
+	 * extent mft records instead.
+	 */
+	if (ni->type == AT_ATTRIBUTE_LIST ||
+			ni->type == AT_STANDARD_INFORMATION) {
+		// TODO: Attempt to move other attributes into extent mft
+		// records.
+		err = -EOPNOTSUPP;
+		if (!err)
+			goto do_resident_extend;
+		goto err_out;
+	}
+	// TODO: Attempt to move this attribute to an extent mft record, but
+	// only if it is not already the only attribute in an mft record in
+	// which case there would be nothing to gain.
+	err = -EOPNOTSUPP;
+	if (!err)
+		goto do_resident_extend;
+	/* There is nothing we can do to make enough space. )-: */
+	goto err_out;
+#endif
+do_non_resident_truncate:
+	BUG_ON(!NInoNonResident(ni));
+	if (alloc_change < 0) {
+		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		if (highest_vcn > 0 &&
+				old_alloc_size >> vol->cluster_size_bits >
+				highest_vcn + 1) {
+			/*
+			 * This attribute has multiple extents.  Not yet
+			 * supported.
+			 */
+			ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
+					"attribute type 0x%x, because the "
+					"attribute is highly fragmented (it "
+					"consists of multiple extents) and "
+					"this case is not implemented yet.",
+					vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type));
+			err = -EOPNOTSUPP;
+			goto bad_out;
+		}
+	}
+	/*
+	 * If the size is shrinking, need to reduce the initialized_size and
+	 * the data_size before reducing the allocation.
+	 */
+	if (size_change < 0) {
+		/*
+		 * Make the valid size smaller (i_size is already up-to-date).
+		 */
+		write_lock_irqsave(&ni->size_lock, flags);
+		if (new_size < ni->initialized_size) {
+			ni->initialized_size = new_size;
+			a->data.non_resident.initialized_size =
+					cpu_to_sle64(new_size);
+		}
+		a->data.non_resident.data_size = cpu_to_sle64(new_size);
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		/* If the allocated size is not changing, we are done. */
+		if (!alloc_change)
+			goto unm_done;
+		/*
+		 * If the size is shrinking it makes no sense for the
+		 * allocation to be growing.
+		 */
+		BUG_ON(alloc_change > 0);
+	} else /* if (size_change >= 0) */ {
+		/*
+		 * The file size is growing or staying the same but the
+		 * allocation can be shrinking, growing or staying the same.
+		 */
+		if (alloc_change > 0) {
+			/*
+			 * We need to extend the allocation and possibly update
+			 * the data size.  If we are updating the data size,
+			 * since we are not touching the initialized_size we do
+			 * not need to worry about the actual data on disk.
+			 * And as far as the page cache is concerned, there
+			 * will be no pages beyond the old data size and any
+			 * partial region in the last page between the old and
+			 * new data size (or the end of the page if the new
+			 * data size is outside the page) does not need to be
+			 * modified as explained above for the resident
+			 * attribute truncate case.  To do this, we simply drop
+			 * the locks we hold and leave all the work to our
+			 * friendly helper ntfs_attr_extend_allocation().
+			 */
+			ntfs_attr_put_search_ctx(ctx);
+			unmap_mft_record(base_ni);
+			up_write(&ni->runlist.lock);
+			err = ntfs_attr_extend_allocation(ni, new_size,
+					size_change > 0 ? new_size : -1, -1);
+			/*
+			 * ntfs_attr_extend_allocation() will have done error
+			 * output already.
+			 */
+			goto done;
+		}
+		if (!alloc_change)
+			goto alloc_done;
+	}
+	/* alloc_change < 0 */
+	/* Free the clusters. */
+	nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
+			vol->cluster_size_bits, -1, ctx);
+	m = ctx->mrec;
+	a = ctx->attr;
+	if (unlikely(nr_freed < 0)) {
+		ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
+				"%lli).  Unmount and run chkdsk to recover "
+				"the lost cluster(s).", (long long)nr_freed);
+		NVolSetErrors(vol);
+		nr_freed = 0;
+	}
+	/* Truncate the runlist. */
+	err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
+			new_alloc_size >> vol->cluster_size_bits);
+	/*
+	 * If the runlist truncation failed and/or the search context is no
+	 * longer valid, we cannot resize the attribute record or build the
+	 * mapping pairs array thus we mark the inode bad so that no access to
+	 * the freed clusters can happen.
+	 */
+	if (unlikely(err || IS_ERR(m))) {
+		ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
+				IS_ERR(m) ?
+				"restore attribute search context" :
+				"truncate attribute runlist",
+				IS_ERR(m) ? PTR_ERR(m) : err, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/* Get the size for the shrunk mapping pairs array for the runlist. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
+	if (unlikely(mp_size <= 0)) {
+		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+				"attribute type 0x%x, because determining the "
+				"size for the mapping pairs failed with error "
+				"code %i.%s", vi->i_ino,
+				(unsigned)le32_to_cpu(ni->type), mp_size, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/*
+	 * Shrink the attribute record for the new mapping pairs array.  Note,
+	 * this cannot fail since we are making the attribute smaller thus by
+	 * definition there is enough space to do so.
+	 */
+	attr_len = le32_to_cpu(a->length);
+	err = ntfs_attr_record_resize(m, a, mp_size +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+	BUG_ON(err);
+	/*
+	 * Generate the mapping pairs array directly into the attribute record.
+	 */
+	err = ntfs_mapping_pairs_build(vol, (u8*)a +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+			mp_size, ni->runlist.rl, 0, -1, NULL);
+	if (unlikely(err)) {
+		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+				"attribute type 0x%x, because building the "
+				"mapping pairs failed with error code %i.%s",
+				vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+				err, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/* Update the allocated/compressed size as well as the highest vcn. */
+	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+			vol->cluster_size_bits) - 1);
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = new_alloc_size;
+	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		if (nr_freed) {
+			ni->itype.compressed.size -= nr_freed <<
+					vol->cluster_size_bits;
+			BUG_ON(ni->itype.compressed.size < 0);
+			a->data.non_resident.compressed_size = cpu_to_sle64(
+					ni->itype.compressed.size);
+			vi->i_blocks = ni->itype.compressed.size >> 9;
+		}
+	} else
+		vi->i_blocks = new_alloc_size >> 9;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * We have shrunk the allocation.  If this is a shrinking truncate we
+	 * have already dealt with the initialized_size and the data_size above
+	 * and we are done.  If the truncate is only changing the allocation
+	 * and not the data_size, we are also done.  If this is an extending
+	 * truncate, need to extend the data_size now which is ensured by the
+	 * fact that @size_change is positive.
+	 */
+alloc_done:
+	/*
+	 * If the size is growing, need to update it now.  If it is shrinking,
+	 * we have already updated it above (before the allocation change).
+	 */
+	if (size_change > 0)
+		a->data.non_resident.data_size = cpu_to_sle64(new_size);
+	/* Ensure the modified mft record is written out. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+unm_done:
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+done:
+	/* Update the mtime and ctime on the base inode. */
+	/* normally ->truncate shouldn't update ctime or mtime,
+	 * but ntfs did before so it got a copy & paste version
+	 * of file_update_time.  one day someone should fix this
+	 * for real.
+	 */
+	if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
+		struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb);
+		int sync_it = 0;
+
+		if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+		    !timespec_equal(&VFS_I(base_ni)->i_ctime, &now))
+			sync_it = 1;
+		VFS_I(base_ni)->i_mtime = now;
+		VFS_I(base_ni)->i_ctime = now;
+
+		if (sync_it)
+			mark_inode_dirty_sync(VFS_I(base_ni));
+	}
+
+	if (likely(!err)) {
+		NInoClearTruncateFailed(ni);
+		ntfs_debug("Done.");
+	}
+	return err;
+old_bad_out:
+	old_size = -1;
+bad_out:
+	if (err != -ENOMEM && err != -EOPNOTSUPP)
+		NVolSetErrors(vol);
+	if (err != -EOPNOTSUPP)
+		NInoSetTruncateFailed(ni);
+	else if (old_size >= 0)
+		i_size_write(vi, old_size);
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+out:
+	ntfs_debug("Failed.  Returning error code %i.", err);
+	return err;
+conv_err_out:
+	if (err != -ENOMEM && err != -EOPNOTSUPP)
+		NVolSetErrors(vol);
+	if (err != -EOPNOTSUPP)
+		NInoSetTruncateFailed(ni);
+	else
+		i_size_write(vi, old_size);
+	goto out;
+}
+
+/**
+ * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value
+ * @vi:		inode for which the i_size was changed
+ *
+ * Wrapper for ntfs_truncate() that has no return value.
+ *
+ * See ntfs_truncate() description above for details.
+ */
+#ifdef NTFS_RW
+void ntfs_truncate_vfs(struct inode *vi) {
+	ntfs_truncate(vi);
+}
+#endif
+
+/**
+ * ntfs_setattr - called from notify_change() when an attribute is being changed
+ * @dentry:	dentry whose attributes to change
+ * @attr:	structure describing the attributes and the changes
+ *
+ * We have to trap VFS attempts to truncate the file described by @dentry as
+ * soon as possible, because we do not implement changes in i_size yet.  So we
+ * abort all i_size changes here.
+ *
+ * We also abort all changes of user, group, and mode as we do not implement
+ * the NTFS ACLs yet.
+ *
+ * Called with ->i_mutex held.
+ */
+int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *vi = d_inode(dentry);
+	int err;
+	unsigned int ia_valid = attr->ia_valid;
+
+	err = inode_change_ok(vi, attr);
+	if (err)
+		goto out;
+	/* We do not support NTFS ACLs yet. */
+	if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
+		ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
+				"supported yet, ignoring.");
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+	if (ia_valid & ATTR_SIZE) {
+		if (attr->ia_size != i_size_read(vi)) {
+			ntfs_inode *ni = NTFS_I(vi);
+			/*
+			 * FIXME: For now we do not support resizing of
+			 * compressed or encrypted files yet.
+			 */
+			if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+				ntfs_warning(vi->i_sb, "Changes in inode size "
+						"are not supported yet for "
+						"%s files, ignoring.",
+						NInoCompressed(ni) ?
+						"compressed" : "encrypted");
+				err = -EOPNOTSUPP;
+			} else {
+				truncate_setsize(vi, attr->ia_size);
+				ntfs_truncate_vfs(vi);
+			}
+			if (err || ia_valid == ATTR_SIZE)
+				goto out;
+		} else {
+			/*
+			 * We skipped the truncate but must still update
+			 * timestamps.
+			 */
+			ia_valid |= ATTR_MTIME | ATTR_CTIME;
+		}
+	}
+	if (ia_valid & ATTR_ATIME)
+		vi->i_atime = timespec_trunc(attr->ia_atime,
+				vi->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MTIME)
+		vi->i_mtime = timespec_trunc(attr->ia_mtime,
+				vi->i_sb->s_time_gran);
+	if (ia_valid & ATTR_CTIME)
+		vi->i_ctime = timespec_trunc(attr->ia_ctime,
+				vi->i_sb->s_time_gran);
+	mark_inode_dirty(vi);
+out:
+	return err;
+}
+
+/**
+ * ntfs_write_inode - write out a dirty inode
+ * @vi:		inode to write out
+ * @sync:	if true, write out synchronously
+ *
+ * Write out a dirty inode to disk including any extent inodes if present.
+ *
+ * If @sync is true, commit the inode to disk and wait for io completion.  This
+ * is done using write_mft_record().
+ *
+ * If @sync is false, just schedule the write to happen but do not wait for i/o
+ * completion.  In 2.6 kernels, scheduling usually happens just by virtue of
+ * marking the page (and in this case mft record) dirty but we do not implement
+ * this yet as write_mft_record() largely ignores the @sync parameter and
+ * always performs synchronous writes.
+ *
+ * Return 0 on success and -errno on error.
+ */
+int __ntfs_write_inode(struct inode *vi, int sync)
+{
+	sle64 nt;
+	ntfs_inode *ni = NTFS_I(vi);
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *m;
+	STANDARD_INFORMATION *si;
+	int err = 0;
+	bool modified = false;
+
+	ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
+			vi->i_ino);
+	/*
+	 * Dirty attribute inodes are written via their real inodes so just
+	 * clean them here.  Access time updates are taken care off when the
+	 * real inode is written.
+	 */
+	if (NInoAttr(ni)) {
+		NInoClearDirty(ni);
+		ntfs_debug("Done.");
+		return 0;
+	}
+	/* Map, pin, and lock the mft record belonging to the inode. */
+	m = map_mft_record(ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		goto err_out;
+	}
+	/* Update the access times in the standard information attribute. */
+	ctx = ntfs_attr_get_search_ctx(ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto unm_err_out;
+	}
+	err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		ntfs_attr_put_search_ctx(ctx);
+		goto unm_err_out;
+	}
+	si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
+			le16_to_cpu(ctx->attr->data.resident.value_offset));
+	/* Update the access times if they have changed. */
+	nt = utc2ntfs(vi->i_mtime);
+	if (si->last_data_change_time != nt) {
+		ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
+				"new = 0x%llx", vi->i_ino, (long long)
+				sle64_to_cpu(si->last_data_change_time),
+				(long long)sle64_to_cpu(nt));
+		si->last_data_change_time = nt;
+		modified = true;
+	}
+	nt = utc2ntfs(vi->i_ctime);
+	if (si->last_mft_change_time != nt) {
+		ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
+				"new = 0x%llx", vi->i_ino, (long long)
+				sle64_to_cpu(si->last_mft_change_time),
+				(long long)sle64_to_cpu(nt));
+		si->last_mft_change_time = nt;
+		modified = true;
+	}
+	nt = utc2ntfs(vi->i_atime);
+	if (si->last_access_time != nt) {
+		ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
+				"new = 0x%llx", vi->i_ino,
+				(long long)sle64_to_cpu(si->last_access_time),
+				(long long)sle64_to_cpu(nt));
+		si->last_access_time = nt;
+		modified = true;
+	}
+	/*
+	 * If we just modified the standard information attribute we need to
+	 * mark the mft record it is in dirty.  We do this manually so that
+	 * mark_inode_dirty() is not called which would redirty the inode and
+	 * hence result in an infinite loop of trying to write the inode.
+	 * There is no need to mark the base inode nor the base mft record
+	 * dirty, since we are going to write this mft record below in any case
+	 * and the base mft record may actually not have been modified so it
+	 * might not need to be written out.
+	 * NOTE: It is not a problem when the inode for $MFT itself is being
+	 * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
+	 * on the $MFT inode and hence ntfs_write_inode() will not be
+	 * re-invoked because of it which in turn is ok since the dirtied mft
+	 * record will be cleaned and written out to disk below, i.e. before
+	 * this function returns.
+	 */
+	if (modified) {
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		if (!NInoTestSetDirty(ctx->ntfs_ino))
+			mark_ntfs_record_dirty(ctx->ntfs_ino->page,
+					ctx->ntfs_ino->page_ofs);
+	}
+	ntfs_attr_put_search_ctx(ctx);
+	/* Now the access times are updated, write the base mft record. */
+	if (NInoDirty(ni))
+		err = write_mft_record(ni, m, sync);
+	/* Write all attached extent mft records. */
+	mutex_lock(&ni->extent_lock);
+	if (ni->nr_extents > 0) {
+		ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
+		int i;
+
+		ntfs_debug("Writing %i extent inodes.", ni->nr_extents);
+		for (i = 0; i < ni->nr_extents; i++) {
+			ntfs_inode *tni = extent_nis[i];
+
+			if (NInoDirty(tni)) {
+				MFT_RECORD *tm = map_mft_record(tni);
+				int ret;
+
+				if (IS_ERR(tm)) {
+					if (!err || err == -ENOMEM)
+						err = PTR_ERR(tm);
+					continue;
+				}
+				ret = write_mft_record(tni, tm, sync);
+				unmap_mft_record(tni);
+				if (unlikely(ret)) {
+					if (!err || err == -ENOMEM)
+						err = ret;
+				}
+			}
+		}
+	}
+	mutex_unlock(&ni->extent_lock);
+	unmap_mft_record(ni);
+	if (unlikely(err))
+		goto err_out;
+	ntfs_debug("Done.");
+	return 0;
+unm_err_out:
+	unmap_mft_record(ni);
+err_out:
+	if (err == -ENOMEM) {
+		ntfs_warning(vi->i_sb, "Not enough memory to write inode.  "
+				"Marking the inode dirty again, so the VFS "
+				"retries later.");
+		mark_inode_dirty(vi);
+	} else {
+		ntfs_error(vi->i_sb, "Failed (error %i):  Run chkdsk.", -err);
+		NVolSetErrors(ni->vol);
+	}
+	return err;
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/inode.h b/kernel/fs/ntfs/inode.h
new file mode 100644
index 000000000..76b6cfb57
--- /dev/null
+++ b/kernel/fs/ntfs/inode.h
@@ -0,0 +1,325 @@
+/*
+ * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
+ *	     the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_INODE_H
+#define _LINUX_NTFS_INODE_H
+
+#include <linux/atomic.h>
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+
+#include "layout.h"
+#include "volume.h"
+#include "types.h"
+#include "runlist.h"
+#include "debug.h"
+
+typedef struct _ntfs_inode ntfs_inode;
+
+/*
+ * The NTFS in-memory inode structure. It is just used as an extension to the
+ * fields already provided in the VFS inode.
+ */
+struct _ntfs_inode {
+	rwlock_t size_lock;	/* Lock serializing access to inode sizes. */
+	s64 initialized_size;	/* Copy from the attribute record. */
+	s64 allocated_size;	/* Copy from the attribute record. */
+	unsigned long state;	/* NTFS specific flags describing this inode.
+				   See ntfs_inode_state_bits below. */
+	unsigned long mft_no;	/* Number of the mft record / inode. */
+	u16 seq_no;		/* Sequence number of the mft record. */
+	atomic_t count;		/* Inode reference count for book keeping. */
+	ntfs_volume *vol;	/* Pointer to the ntfs volume of this inode. */
+	/*
+	 * If NInoAttr() is true, the below fields describe the attribute which
+	 * this fake inode belongs to. The actual inode of this attribute is
+	 * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see
+	 * below). For real inodes, we also set the type (AT_DATA for files and
+	 * AT_INDEX_ALLOCATION for directories), with the name = NULL and
+	 * name_len = 0 for files and name = I30 (global constant) and
+	 * name_len = 4 for directories.
+	 */
+	ATTR_TYPE type;	/* Attribute type of this fake inode. */
+	ntfschar *name;		/* Attribute name of this fake inode. */
+	u32 name_len;		/* Attribute name length of this fake inode. */
+	runlist runlist;	/* If state has the NI_NonResident bit set,
+				   the runlist of the unnamed data attribute
+				   (if a file) or of the index allocation
+				   attribute (directory) or of the attribute
+				   described by the fake inode (if NInoAttr()).
+				   If runlist.rl is NULL, the runlist has not
+				   been read in yet or has been unmapped. If
+				   NI_NonResident is clear, the attribute is
+				   resident (file and fake inode) or there is
+				   no $I30 index allocation attribute
+				   (small directory). In the latter case
+				   runlist.rl is always NULL.*/
+	/*
+	 * The following fields are only valid for real inodes and extent
+	 * inodes.
+	 */
+	struct mutex mrec_lock;	/* Lock for serializing access to the
+				   mft record belonging to this inode. */
+	struct page *page;	/* The page containing the mft record of the
+				   inode. This should only be touched by the
+				   (un)map_mft_record*() functions. */
+	int page_ofs;		/* Offset into the page at which the mft record
+				   begins. This should only be touched by the
+				   (un)map_mft_record*() functions. */
+	/*
+	 * Attribute list support (only for use by the attribute lookup
+	 * functions). Setup during read_inode for all inodes with attribute
+	 * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is
+	 * further only valid if NI_AttrListNonResident is set.
+	 */
+	u32 attr_list_size;	/* Length of attribute list value in bytes. */
+	u8 *attr_list;		/* Attribute list value itself. */
+	runlist attr_list_rl;	/* Run list for the attribute list value. */
+	union {
+		struct { /* It is a directory, $MFT, or an index inode. */
+			u32 block_size;		/* Size of an index block. */
+			u32 vcn_size;		/* Size of a vcn in this
+						   index. */
+			COLLATION_RULE collation_rule; /* The collation rule
+						   for the index. */
+			u8 block_size_bits; 	/* Log2 of the above. */
+			u8 vcn_size_bits;	/* Log2 of the above. */
+		} index;
+		struct { /* It is a compressed/sparse file/attribute inode. */
+			s64 size;		/* Copy of compressed_size from
+						   $DATA. */
+			u32 block_size;		/* Size of a compression block
+						   (cb). */
+			u8 block_size_bits;	/* Log2 of the size of a cb. */
+			u8 block_clusters;	/* Number of clusters per cb. */
+		} compressed;
+	} itype;
+	struct mutex extent_lock;	/* Lock for accessing/modifying the
+					   below . */
+	s32 nr_extents;	/* For a base mft record, the number of attached extent
+			   inodes (0 if none), for extent records and for fake
+			   inodes describing an attribute this is -1. */
+	union {		/* This union is only used if nr_extents != 0. */
+		ntfs_inode **extent_ntfs_inos;	/* For nr_extents > 0, array of
+						   the ntfs inodes of the extent
+						   mft records belonging to
+						   this base inode which have
+						   been loaded. */
+		ntfs_inode *base_ntfs_ino;	/* For nr_extents == -1, the
+						   ntfs inode of the base mft
+						   record. For fake inodes, the
+						   real (base) inode to which
+						   the attribute belongs. */
+	} ext;
+};
+
+/*
+ * Defined bits for the state field in the ntfs_inode structure.
+ * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
+ */
+typedef enum {
+	NI_Dirty,		/* 1: Mft record needs to be written to disk. */
+	NI_AttrList,		/* 1: Mft record contains an attribute list. */
+	NI_AttrListNonResident,	/* 1: Attribute list is non-resident. Implies
+				      NI_AttrList is set. */
+
+	NI_Attr,		/* 1: Fake inode for attribute i/o.
+				   0: Real inode or extent inode. */
+
+	NI_MstProtected,	/* 1: Attribute is protected by MST fixups.
+				   0: Attribute is not protected by fixups. */
+	NI_NonResident,		/* 1: Unnamed data attr is non-resident (f).
+				   1: Attribute is non-resident (a). */
+	NI_IndexAllocPresent = NI_NonResident,	/* 1: $I30 index alloc attr is
+						   present (d). */
+	NI_Compressed,		/* 1: Unnamed data attr is compressed (f).
+				   1: Create compressed files by default (d).
+				   1: Attribute is compressed (a). */
+	NI_Encrypted,		/* 1: Unnamed data attr is encrypted (f).
+				   1: Create encrypted files by default (d).
+				   1: Attribute is encrypted (a). */
+	NI_Sparse,		/* 1: Unnamed data attr is sparse (f).
+				   1: Create sparse files by default (d).
+				   1: Attribute is sparse (a). */
+	NI_SparseDisabled,	/* 1: May not create sparse regions. */
+	NI_TruncateFailed,	/* 1: Last ntfs_truncate() call failed. */
+} ntfs_inode_state_bits;
+
+/*
+ * NOTE: We should be adding dirty mft records to a list somewhere and they
+ * should be independent of the (ntfs/vfs) inode structure so that an inode can
+ * be removed but the record can be left dirty for syncing later.
+ */
+
+/*
+ * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
+ * functions.
+ */
+#define NINO_FNS(flag)					\
+static inline int NIno##flag(ntfs_inode *ni)		\
+{							\
+	return test_bit(NI_##flag, &(ni)->state);	\
+}							\
+static inline void NInoSet##flag(ntfs_inode *ni)	\
+{							\
+	set_bit(NI_##flag, &(ni)->state);		\
+}							\
+static inline void NInoClear##flag(ntfs_inode *ni)	\
+{							\
+	clear_bit(NI_##flag, &(ni)->state);		\
+}
+
+/*
+ * As above for NInoTestSetFoo() and NInoTestClearFoo().
+ */
+#define TAS_NINO_FNS(flag)					\
+static inline int NInoTestSet##flag(ntfs_inode *ni)		\
+{								\
+	return test_and_set_bit(NI_##flag, &(ni)->state);	\
+}								\
+static inline int NInoTestClear##flag(ntfs_inode *ni)		\
+{								\
+	return test_and_clear_bit(NI_##flag, &(ni)->state);	\
+}
+
+/* Emit the ntfs inode bitops functions. */
+NINO_FNS(Dirty)
+TAS_NINO_FNS(Dirty)
+NINO_FNS(AttrList)
+NINO_FNS(AttrListNonResident)
+NINO_FNS(Attr)
+NINO_FNS(MstProtected)
+NINO_FNS(NonResident)
+NINO_FNS(IndexAllocPresent)
+NINO_FNS(Compressed)
+NINO_FNS(Encrypted)
+NINO_FNS(Sparse)
+NINO_FNS(SparseDisabled)
+NINO_FNS(TruncateFailed)
+
+/*
+ * The full structure containing a ntfs_inode and a vfs struct inode. Used for
+ * all real and fake inodes but not for extent inodes which lack the vfs struct
+ * inode.
+ */
+typedef struct {
+	ntfs_inode ntfs_inode;
+	struct inode vfs_inode;		/* The vfs inode structure. */
+} big_ntfs_inode;
+
+/**
+ * NTFS_I - return the ntfs inode given a vfs inode
+ * @inode:	VFS inode
+ *
+ * NTFS_I() returns the ntfs inode associated with the VFS @inode.
+ */
+static inline ntfs_inode *NTFS_I(struct inode *inode)
+{
+	return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode);
+}
+
+static inline struct inode *VFS_I(ntfs_inode *ni)
+{
+	return &((big_ntfs_inode *)ni)->vfs_inode;
+}
+
+/**
+ * ntfs_attr - ntfs in memory attribute structure
+ * @mft_no:	mft record number of the base mft record of this attribute
+ * @name:	Unicode name of the attribute (NULL if unnamed)
+ * @name_len:	length of @name in Unicode characters (0 if unnamed)
+ * @type:	attribute type (see layout.h)
+ *
+ * This structure exists only to provide a small structure for the
+ * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism.
+ *
+ * NOTE: Elements are ordered by size to make the structure as compact as
+ * possible on all architectures.
+ */
+typedef struct {
+	unsigned long mft_no;
+	ntfschar *name;
+	u32 name_len;
+	ATTR_TYPE type;
+} ntfs_attr;
+
+typedef int (*test_t)(struct inode *, void *);
+
+extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na);
+
+extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
+extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
+		ntfschar *name, u32 name_len);
+extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
+		u32 name_len);
+
+extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
+extern void ntfs_destroy_big_inode(struct inode *inode);
+extern void ntfs_evict_big_inode(struct inode *vi);
+
+extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
+
+static inline void ntfs_init_big_inode(struct inode *vi)
+{
+	ntfs_inode *ni = NTFS_I(vi);
+
+	ntfs_debug("Entering.");
+	__ntfs_init_inode(vi->i_sb, ni);
+	ni->mft_no = vi->i_ino;
+}
+
+extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
+		unsigned long mft_no);
+extern void ntfs_clear_extent_inode(ntfs_inode *ni);
+
+extern int ntfs_read_inode_mount(struct inode *vi);
+
+extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
+
+#ifdef NTFS_RW
+
+extern int ntfs_truncate(struct inode *vi);
+extern void ntfs_truncate_vfs(struct inode *vi);
+
+extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
+
+extern int __ntfs_write_inode(struct inode *vi, int sync);
+
+static inline void ntfs_commit_inode(struct inode *vi)
+{
+	if (!is_bad_inode(vi))
+		__ntfs_write_inode(vi, 1);
+	return;
+}
+
+#else
+
+static inline void ntfs_truncate_vfs(struct inode *vi) {}
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_INODE_H */
diff --git a/kernel/fs/ntfs/layout.h b/kernel/fs/ntfs/layout.h
new file mode 100644
index 000000000..809c0e6d8
--- /dev/null
+++ b/kernel/fs/ntfs/layout.h
@@ -0,0 +1,2435 @@
+/*
+ * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
+ *	      project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_LAYOUT_H
+#define _LINUX_NTFS_LAYOUT_H
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+
+#include "types.h"
+
+/* The NTFS oem_id "NTFS    " */
+#define magicNTFS	cpu_to_le64(0x202020205346544eULL)
+
+/*
+ * Location of bootsector on partition:
+ *	The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
+ *	On NT4 and above there is one backup copy of the boot sector to
+ *	be found on the last sector of the partition (not normally accessible
+ *	from within Windows as the bootsector contained number of sectors
+ *	value is one less than the actual value!).
+ *	On versions of NT 3.51 and earlier, the backup copy was located at
+ *	number of sectors/2 (integer divide), i.e. in the middle of the volume.
+ */
+
+/*
+ * BIOS parameter block (bpb) structure.
+ */
+typedef struct {
+	le16 bytes_per_sector;		/* Size of a sector in bytes. */
+	u8  sectors_per_cluster;	/* Size of a cluster in sectors. */
+	le16 reserved_sectors;		/* zero */
+	u8  fats;			/* zero */
+	le16 root_entries;		/* zero */
+	le16 sectors;			/* zero */
+	u8  media_type;			/* 0xf8 = hard disk */
+	le16 sectors_per_fat;		/* zero */
+	le16 sectors_per_track;		/* irrelevant */
+	le16 heads;			/* irrelevant */
+	le32 hidden_sectors;		/* zero */
+	le32 large_sectors;		/* zero */
+} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK;
+
+/*
+ * NTFS boot sector structure.
+ */
+typedef struct {
+	u8  jump[3];			/* Irrelevant (jump to boot up code).*/
+	le64 oem_id;			/* Magic "NTFS    ". */
+	BIOS_PARAMETER_BLOCK bpb;	/* See BIOS_PARAMETER_BLOCK. */
+	u8  unused[4];			/* zero, NTFS diskedit.exe states that
+					   this is actually:
+						__u8 physical_drive;	// 0x80
+						__u8 current_head;	// zero
+						__u8 extended_boot_signature;
+									// 0x80
+						__u8 unused;		// zero
+					 */
+/*0x28*/sle64 number_of_sectors;	/* Number of sectors in volume. Gives
+					   maximum volume size of 2^63 sectors.
+					   Assuming standard sector size of 512
+					   bytes, the maximum byte size is
+					   approx. 4.7x10^21 bytes. (-; */
+	sle64 mft_lcn;			/* Cluster location of mft data. */
+	sle64 mftmirr_lcn;		/* Cluster location of copy of mft. */
+	s8  clusters_per_mft_record;	/* Mft record size in clusters. */
+	u8  reserved0[3];		/* zero */
+	s8  clusters_per_index_record;	/* Index block size in clusters. */
+	u8  reserved1[3];		/* zero */
+	le64 volume_serial_number;	/* Irrelevant (serial number). */
+	le32 checksum;			/* Boot sector checksum. */
+/*0x54*/u8  bootstrap[426];		/* Irrelevant (boot up code). */
+	le16 end_of_sector_marker;	/* End of bootsector magic. Always is
+					   0xaa55 in little endian. */
+/* sizeof() = 512 (0x200) bytes */
+} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR;
+
+/*
+ * Magic identifiers present at the beginning of all ntfs record containing
+ * records (like mft records for example).
+ */
+enum {
+	/* Found in $MFT/$DATA. */
+	magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
+	magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
+	magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+
+	/* Found in $LogFile/$DATA. */
+	magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
+	magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
+
+	/* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
+	magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+
+	/* Found in all ntfs record containing records. */
+	magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
+						       transfer was detected. */
+	/*
+	 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
+	 * thus not initialized.  Page must be initialized before using it.
+	 */
+	magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
+};
+
+typedef le32 NTFS_RECORD_TYPE;
+
+/*
+ * Generic magic comparison macros. Finally found a use for the ## preprocessor
+ * operator! (-8
+ */
+
+static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
+{
+	return (x == r);
+}
+#define ntfs_is_magic(x, m)	__ntfs_is_magic(x, magic_##m)
+
+static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
+{
+	return (*p == r);
+}
+#define ntfs_is_magicp(p, m)	__ntfs_is_magicp(p, magic_##m)
+
+/*
+ * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above.
+ */
+#define ntfs_is_file_record(x)		( ntfs_is_magic (x, FILE) )
+#define ntfs_is_file_recordp(p)		( ntfs_is_magicp(p, FILE) )
+#define ntfs_is_mft_record(x)		( ntfs_is_file_record (x) )
+#define ntfs_is_mft_recordp(p)		( ntfs_is_file_recordp(p) )
+#define ntfs_is_indx_record(x)		( ntfs_is_magic (x, INDX) )
+#define ntfs_is_indx_recordp(p)		( ntfs_is_magicp(p, INDX) )
+#define ntfs_is_hole_record(x)		( ntfs_is_magic (x, HOLE) )
+#define ntfs_is_hole_recordp(p)		( ntfs_is_magicp(p, HOLE) )
+
+#define ntfs_is_rstr_record(x)		( ntfs_is_magic (x, RSTR) )
+#define ntfs_is_rstr_recordp(p)		( ntfs_is_magicp(p, RSTR) )
+#define ntfs_is_rcrd_record(x)		( ntfs_is_magic (x, RCRD) )
+#define ntfs_is_rcrd_recordp(p)		( ntfs_is_magicp(p, RCRD) )
+
+#define ntfs_is_chkd_record(x)		( ntfs_is_magic (x, CHKD) )
+#define ntfs_is_chkd_recordp(p)		( ntfs_is_magicp(p, CHKD) )
+
+#define ntfs_is_baad_record(x)		( ntfs_is_magic (x, BAAD) )
+#define ntfs_is_baad_recordp(p)		( ntfs_is_magicp(p, BAAD) )
+
+#define ntfs_is_empty_record(x)		( ntfs_is_magic (x, empty) )
+#define ntfs_is_empty_recordp(p)	( ntfs_is_magicp(p, empty) )
+
+/*
+ * The Update Sequence Array (usa) is an array of the le16 values which belong
+ * to the end of each sector protected by the update sequence record in which
+ * this array is contained. Note that the first entry is the Update Sequence
+ * Number (usn), a cyclic counter of how many times the protected record has
+ * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All
+ * last le16's of each sector have to be equal to the usn (during reading) or
+ * are set to it (during writing). If they are not, an incomplete multi sector
+ * transfer has occurred when the data was written.
+ * The maximum size for the update sequence array is fixed to:
+ *	maximum size = usa_ofs + (usa_count * 2) = 510 bytes
+ * The 510 bytes comes from the fact that the last le16 in the array has to
+ * (obviously) finish before the last le16 of the first 512-byte sector.
+ * This formula can be used as a consistency check in that usa_ofs +
+ * (usa_count * 2) has to be less than or equal to 510.
+ */
+typedef struct {
+	NTFS_RECORD_TYPE magic;	/* A four-byte magic identifying the record
+				   type and/or status. */
+	le16 usa_ofs;		/* Offset to the Update Sequence Array (usa)
+				   from the start of the ntfs record. */
+	le16 usa_count;		/* Number of le16 sized entries in the usa
+				   including the Update Sequence Number (usn),
+				   thus the number of fixups is the usa_count
+				   minus 1. */
+} __attribute__ ((__packed__)) NTFS_RECORD;
+
+/*
+ * System files mft record numbers. All these files are always marked as used
+ * in the bitmap attribute of the mft; presumably in order to avoid accidental
+ * allocation for random other mft records. Also, the sequence number for each
+ * of the system files is always equal to their mft record number and it is
+ * never modified.
+ */
+typedef enum {
+	FILE_MFT       = 0,	/* Master file table (mft). Data attribute
+				   contains the entries and bitmap attribute
+				   records which ones are in use (bit==1). */
+	FILE_MFTMirr   = 1,	/* Mft mirror: copy of first four mft records
+				   in data attribute. If cluster size > 4kiB,
+				   copy of first N mft records, with
+					N = cluster_size / mft_record_size. */
+	FILE_LogFile   = 2,	/* Journalling log in data attribute. */
+	FILE_Volume    = 3,	/* Volume name attribute and volume information
+				   attribute (flags and ntfs version). Windows
+				   refers to this file as volume DASD (Direct
+				   Access Storage Device). */
+	FILE_AttrDef   = 4,	/* Array of attribute definitions in data
+				   attribute. */
+	FILE_root      = 5,	/* Root directory. */
+	FILE_Bitmap    = 6,	/* Allocation bitmap of all clusters (lcns) in
+				   data attribute. */
+	FILE_Boot      = 7,	/* Boot sector (always at cluster 0) in data
+				   attribute. */
+	FILE_BadClus   = 8,	/* Contains all bad clusters in the non-resident
+				   data attribute. */
+	FILE_Secure    = 9,	/* Shared security descriptors in data attribute
+				   and two indexes into the descriptors.
+				   Appeared in Windows 2000. Before that, this
+				   file was named $Quota but was unused. */
+	FILE_UpCase    = 10,	/* Uppercase equivalents of all 65536 Unicode
+				   characters in data attribute. */
+	FILE_Extend    = 11,	/* Directory containing other system files (eg.
+				   $ObjId, $Quota, $Reparse and $UsnJrnl). This
+				   is new to NTFS3.0. */
+	FILE_reserved12 = 12,	/* Reserved for future use (records 12-15). */
+	FILE_reserved13 = 13,
+	FILE_reserved14 = 14,
+	FILE_reserved15 = 15,
+	FILE_first_user = 16,	/* First user file, used as test limit for
+				   whether to allow opening a file or not. */
+} NTFS_SYSTEM_FILES;
+
+/*
+ * These are the so far known MFT_RECORD_* flags (16-bit) which contain
+ * information about the mft record in which they are present.
+ */
+enum {
+	MFT_RECORD_IN_USE	= cpu_to_le16(0x0001),
+	MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
+} __attribute__ ((__packed__));
+
+typedef le16 MFT_RECORD_FLAGS;
+
+/*
+ * mft references (aka file references or file record segment references) are
+ * used whenever a structure needs to refer to a record in the mft.
+ *
+ * A reference consists of a 48-bit index into the mft and a 16-bit sequence
+ * number used to detect stale references.
+ *
+ * For error reporting purposes we treat the 48-bit index as a signed quantity.
+ *
+ * The sequence number is a circular counter (skipping 0) describing how many
+ * times the referenced mft record has been (re)used. This has to match the
+ * sequence number of the mft record being referenced, otherwise the reference
+ * is considered stale and removed (FIXME: only ntfsck or the driver itself?).
+ *
+ * If the sequence number is zero it is assumed that no sequence number
+ * consistency checking should be performed.
+ *
+ * FIXME: Since inodes are 32-bit as of now, the driver needs to always check
+ * for high_part being 0 and if not either BUG(), cause a panic() or handle
+ * the situation in some other way. This shouldn't be a problem as a volume has
+ * to become HUGE in order to need more than 32-bits worth of mft records.
+ * Assuming the standard mft record size of 1kb only the records (never mind
+ * the non-resident attributes, etc.) would require 4Tb of space on their own
+ * for the first 32 bits worth of records. This is only if some strange person
+ * doesn't decide to foul play and make the mft sparse which would be a really
+ * horrible thing to do as it would trash our current driver implementation. )-:
+ * Do I hear screams "we want 64-bit inodes!" ?!? (-;
+ *
+ * FIXME: The mft zone is defined as the first 12% of the volume. This space is
+ * reserved so that the mft can grow contiguously and hence doesn't become
+ * fragmented. Volume free space includes the empty part of the mft zone and
+ * when the volume's free 88% are used up, the mft zone is shrunk by a factor
+ * of 2, thus making more space available for more files/data. This process is
+ * repeated every time there is no more free space except for the mft zone until
+ * there really is no more free space.
+ */
+
+/*
+ * Typedef the MFT_REF as a 64-bit value for easier handling.
+ * Also define two unpacking macros to get to the reference (MREF) and
+ * sequence number (MSEQNO) respectively.
+ * The _LE versions are to be applied on little endian MFT_REFs.
+ * Note: The _LE versions will return a CPU endian formatted value!
+ */
+#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
+
+typedef u64 MFT_REF;
+typedef le64 leMFT_REF;
+
+#define MK_MREF(m, s)	((MFT_REF)(((MFT_REF)(s) << 48) |		\
+					((MFT_REF)(m) & MFT_REF_MASK_CPU)))
+#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s))
+
+#define MREF(x)		((unsigned long)((x) & MFT_REF_MASK_CPU))
+#define MSEQNO(x)	((u16)(((x) >> 48) & 0xffff))
+#define MREF_LE(x)	((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
+#define MSEQNO_LE(x)	((u16)((le64_to_cpu(x) >> 48) & 0xffff))
+
+#define IS_ERR_MREF(x)	(((x) & 0x0000800000000000ULL) ? true : false)
+#define ERR_MREF(x)	((u64)((s64)(x)))
+#define MREF_ERR(x)	((int)((s64)(x)))
+
+/*
+ * The mft record header present at the beginning of every record in the mft.
+ * This is followed by a sequence of variable length attribute records which
+ * is terminated by an attribute of type AT_END which is a truncated attribute
+ * in that it only consists of the attribute type code AT_END and none of the
+ * other members of the attribute structure are present.
+ */
+typedef struct {
+/*Ofs*/
+/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+	NTFS_RECORD_TYPE magic;	/* Usually the magic is "FILE". */
+	le16 usa_ofs;		/* See NTFS_RECORD definition above. */
+	le16 usa_count;		/* See NTFS_RECORD definition above. */
+
+/*  8*/	le64 lsn;		/* $LogFile sequence number for this record.
+				   Changed every time the record is modified. */
+/* 16*/	le16 sequence_number;	/* Number of times this mft record has been
+				   reused. (See description for MFT_REF
+				   above.) NOTE: The increment (skipping zero)
+				   is done when the file is deleted. NOTE: If
+				   this is zero it is left zero. */
+/* 18*/	le16 link_count;	/* Number of hard links, i.e. the number of
+				   directory entries referencing this record.
+				   NOTE: Only used in mft base records.
+				   NOTE: When deleting a directory entry we
+				   check the link_count and if it is 1 we
+				   delete the file. Otherwise we delete the
+				   FILE_NAME_ATTR being referenced by the
+				   directory entry from the mft record and
+				   decrement the link_count.
+				   FIXME: Careful with Win32 + DOS names! */
+/* 20*/	le16 attrs_offset;	/* Byte offset to the first attribute in this
+				   mft record from the start of the mft record.
+				   NOTE: Must be aligned to 8-byte boundary. */
+/* 22*/	MFT_RECORD_FLAGS flags;	/* Bit array of MFT_RECORD_FLAGS. When a file
+				   is deleted, the MFT_RECORD_IN_USE flag is
+				   set to zero. */
+/* 24*/	le32 bytes_in_use;	/* Number of bytes used in this mft record.
+				   NOTE: Must be aligned to 8-byte boundary. */
+/* 28*/	le32 bytes_allocated;	/* Number of bytes allocated for this mft
+				   record. This should be equal to the mft
+				   record size. */
+/* 32*/	leMFT_REF base_mft_record;/* This is zero for base mft records.
+				   When it is not zero it is a mft reference
+				   pointing to the base mft record to which
+				   this record belongs (this is then used to
+				   locate the attribute list attribute present
+				   in the base record which describes this
+				   extension record and hence might need
+				   modification when the extension record
+				   itself is modified, also locating the
+				   attribute list also means finding the other
+				   potential extents, belonging to the non-base
+				   mft record). */
+/* 40*/	le16 next_attr_instance;/* The instance number that will be assigned to
+				   the next attribute added to this mft record.
+				   NOTE: Incremented each time after it is used.
+				   NOTE: Every time the mft record is reused
+				   this number is set to zero.  NOTE: The first
+				   instance number is always 0. */
+/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
+/* 42*/ le16 reserved;		/* Reserved/alignment. */
+/* 44*/ le32 mft_record_number;	/* Number of this mft record. */
+/* sizeof() = 48 bytes */
+/*
+ * When (re)using the mft record, we place the update sequence array at this
+ * offset, i.e. before we start with the attributes.  This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work.  As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading we obviously use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) MFT_RECORD;
+
+/* This is the version without the NTFS 3.1+ specific fields. */
+typedef struct {
+/*Ofs*/
+/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+	NTFS_RECORD_TYPE magic;	/* Usually the magic is "FILE". */
+	le16 usa_ofs;		/* See NTFS_RECORD definition above. */
+	le16 usa_count;		/* See NTFS_RECORD definition above. */
+
+/*  8*/	le64 lsn;		/* $LogFile sequence number for this record.
+				   Changed every time the record is modified. */
+/* 16*/	le16 sequence_number;	/* Number of times this mft record has been
+				   reused. (See description for MFT_REF
+				   above.) NOTE: The increment (skipping zero)
+				   is done when the file is deleted. NOTE: If
+				   this is zero it is left zero. */
+/* 18*/	le16 link_count;	/* Number of hard links, i.e. the number of
+				   directory entries referencing this record.
+				   NOTE: Only used in mft base records.
+				   NOTE: When deleting a directory entry we
+				   check the link_count and if it is 1 we
+				   delete the file. Otherwise we delete the
+				   FILE_NAME_ATTR being referenced by the
+				   directory entry from the mft record and
+				   decrement the link_count.
+				   FIXME: Careful with Win32 + DOS names! */
+/* 20*/	le16 attrs_offset;	/* Byte offset to the first attribute in this
+				   mft record from the start of the mft record.
+				   NOTE: Must be aligned to 8-byte boundary. */
+/* 22*/	MFT_RECORD_FLAGS flags;	/* Bit array of MFT_RECORD_FLAGS. When a file
+				   is deleted, the MFT_RECORD_IN_USE flag is
+				   set to zero. */
+/* 24*/	le32 bytes_in_use;	/* Number of bytes used in this mft record.
+				   NOTE: Must be aligned to 8-byte boundary. */
+/* 28*/	le32 bytes_allocated;	/* Number of bytes allocated for this mft
+				   record. This should be equal to the mft
+				   record size. */
+/* 32*/	leMFT_REF base_mft_record;/* This is zero for base mft records.
+				   When it is not zero it is a mft reference
+				   pointing to the base mft record to which
+				   this record belongs (this is then used to
+				   locate the attribute list attribute present
+				   in the base record which describes this
+				   extension record and hence might need
+				   modification when the extension record
+				   itself is modified, also locating the
+				   attribute list also means finding the other
+				   potential extents, belonging to the non-base
+				   mft record). */
+/* 40*/	le16 next_attr_instance;/* The instance number that will be assigned to
+				   the next attribute added to this mft record.
+				   NOTE: Incremented each time after it is used.
+				   NOTE: Every time the mft record is reused
+				   this number is set to zero.  NOTE: The first
+				   instance number is always 0. */
+/* sizeof() = 42 bytes */
+/*
+ * When (re)using the mft record, we place the update sequence array at this
+ * offset, i.e. before we start with the attributes.  This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work.  As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading we obviously use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) MFT_RECORD_OLD;
+
+/*
+ * System defined attributes (32-bit).  Each attribute type has a corresponding
+ * attribute name (Unicode string of maximum 64 character length) as described
+ * by the attribute definitions present in the data attribute of the $AttrDef
+ * system file.  On NTFS 3.0 volumes the names are just as the types are named
+ * in the below defines exchanging AT_ for the dollar sign ($).  If that is not
+ * a revealing choice of symbol I do not know what is... (-;
+ */
+enum {
+	AT_UNUSED			= cpu_to_le32(         0),
+	AT_STANDARD_INFORMATION		= cpu_to_le32(      0x10),
+	AT_ATTRIBUTE_LIST		= cpu_to_le32(      0x20),
+	AT_FILE_NAME			= cpu_to_le32(      0x30),
+	AT_OBJECT_ID			= cpu_to_le32(      0x40),
+	AT_SECURITY_DESCRIPTOR		= cpu_to_le32(      0x50),
+	AT_VOLUME_NAME			= cpu_to_le32(      0x60),
+	AT_VOLUME_INFORMATION		= cpu_to_le32(      0x70),
+	AT_DATA				= cpu_to_le32(      0x80),
+	AT_INDEX_ROOT			= cpu_to_le32(      0x90),
+	AT_INDEX_ALLOCATION		= cpu_to_le32(      0xa0),
+	AT_BITMAP			= cpu_to_le32(      0xb0),
+	AT_REPARSE_POINT		= cpu_to_le32(      0xc0),
+	AT_EA_INFORMATION		= cpu_to_le32(      0xd0),
+	AT_EA				= cpu_to_le32(      0xe0),
+	AT_PROPERTY_SET			= cpu_to_le32(      0xf0),
+	AT_LOGGED_UTILITY_STREAM	= cpu_to_le32(     0x100),
+	AT_FIRST_USER_DEFINED_ATTRIBUTE	= cpu_to_le32(    0x1000),
+	AT_END				= cpu_to_le32(0xffffffff)
+};
+
+typedef le32 ATTR_TYPE;
+
+/*
+ * The collation rules for sorting views/indexes/etc (32-bit).
+ *
+ * COLLATION_BINARY - Collate by binary compare where the first byte is most
+ *	significant.
+ * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary
+ *	Unicode values, except that when a character can be uppercased, the
+ *	upper case value collates before the lower case one.
+ * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation
+ *	is done very much like COLLATION_UNICODE_STRING. In fact I have no idea
+ *	what the difference is. Perhaps the difference is that file names
+ *	would treat some special characters in an odd way (see
+ *	unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[]
+ *	for what I mean but COLLATION_UNICODE_STRING would not give any special
+ *	treatment to any characters at all, but this is speculation.
+ * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key
+ *	values. E.g. used for $SII index in FILE_Secure, which sorts by
+ *	security_id (le32).
+ * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values.
+ *	E.g. used for $O index in FILE_Extend/$Quota.
+ * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash
+ *	values and second by ascending security_id values. E.g. used for $SDH
+ *	index in FILE_Secure.
+ * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending
+ *	le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which
+ *	sorts by object_id (16-byte), by splitting up the object_id in four
+ *	le32 values and using them as individual keys. E.g. take the following
+ *	two security_ids, stored as follows on disk:
+ *		1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59
+ *		2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45
+ *	To compare them, they are split into four le32 values each, like so:
+ *		1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081
+ *		2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179
+ *	Now, it is apparent why the 2nd object_id collates after the 1st: the
+ *	first le32 value of the 1st object_id is less than the first le32 of
+ *	the 2nd object_id. If the first le32 values of both object_ids were
+ *	equal then the second le32 values would be compared, etc.
+ */
+enum {
+	COLLATION_BINARY		= cpu_to_le32(0x00),
+	COLLATION_FILE_NAME		= cpu_to_le32(0x01),
+	COLLATION_UNICODE_STRING	= cpu_to_le32(0x02),
+	COLLATION_NTOFS_ULONG		= cpu_to_le32(0x10),
+	COLLATION_NTOFS_SID		= cpu_to_le32(0x11),
+	COLLATION_NTOFS_SECURITY_HASH	= cpu_to_le32(0x12),
+	COLLATION_NTOFS_ULONGS		= cpu_to_le32(0x13),
+};
+
+typedef le32 COLLATION_RULE;
+
+/*
+ * The flags (32-bit) describing attribute properties in the attribute
+ * definition structure.  FIXME: This information is based on Regis's
+ * information and, according to him, it is not certain and probably
+ * incomplete.  The INDEXABLE flag is fairly certainly correct as only the file
+ * name attribute has this flag set and this is the only attribute indexed in
+ * NT4.
+ */
+enum {
+	ATTR_DEF_INDEXABLE	= cpu_to_le32(0x02), /* Attribute can be
+					indexed. */
+	ATTR_DEF_MULTIPLE	= cpu_to_le32(0x04), /* Attribute type
+					can be present multiple times in the
+					mft records of an inode. */
+	ATTR_DEF_NOT_ZERO	= cpu_to_le32(0x08), /* Attribute value
+					must contain at least one non-zero
+					byte. */
+	ATTR_DEF_INDEXED_UNIQUE	= cpu_to_le32(0x10), /* Attribute must be
+					indexed and the attribute value must be
+					unique for the attribute type in all of
+					the mft records of an inode. */
+	ATTR_DEF_NAMED_UNIQUE	= cpu_to_le32(0x20), /* Attribute must be
+					named and the name must be unique for
+					the attribute type in all of the mft
+					records of an inode. */
+	ATTR_DEF_RESIDENT	= cpu_to_le32(0x40), /* Attribute must be
+					resident. */
+	ATTR_DEF_ALWAYS_LOG	= cpu_to_le32(0x80), /* Always log
+					modifications to this attribute,
+					regardless of whether it is resident or
+					non-resident.  Without this, only log
+					modifications if the attribute is
+					resident. */
+};
+
+typedef le32 ATTR_DEF_FLAGS;
+
+/*
+ * The data attribute of FILE_AttrDef contains a sequence of attribute
+ * definitions for the NTFS volume. With this, it is supposed to be safe for an
+ * older NTFS driver to mount a volume containing a newer NTFS version without
+ * damaging it (that's the theory. In practice it's: not damaging it too much).
+ * Entries are sorted by attribute type. The flags describe whether the
+ * attribute can be resident/non-resident and possibly other things, but the
+ * actual bits are unknown.
+ */
+typedef struct {
+/*hex ofs*/
+/*  0*/	ntfschar name[0x40];		/* Unicode name of the attribute. Zero
+					   terminated. */
+/* 80*/	ATTR_TYPE type;			/* Type of the attribute. */
+/* 84*/	le32 display_rule;		/* Default display rule.
+					   FIXME: What does it mean? (AIA) */
+/* 88*/ COLLATION_RULE collation_rule;	/* Default collation rule. */
+/* 8c*/	ATTR_DEF_FLAGS flags;		/* Flags describing the attribute. */
+/* 90*/	sle64 min_size;			/* Optional minimum attribute size. */
+/* 98*/	sle64 max_size;			/* Maximum size of attribute. */
+/* sizeof() = 0xa0 or 160 bytes */
+} __attribute__ ((__packed__)) ATTR_DEF;
+
+/*
+ * Attribute flags (16-bit).
+ */
+enum {
+	ATTR_IS_COMPRESSED    = cpu_to_le16(0x0001),
+	ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
+							      mask.  Also, first
+							      illegal value. */
+	ATTR_IS_ENCRYPTED     = cpu_to_le16(0x4000),
+	ATTR_IS_SPARSE	      = cpu_to_le16(0x8000),
+} __attribute__ ((__packed__));
+
+typedef le16 ATTR_FLAGS;
+
+/*
+ * Attribute compression.
+ *
+ * Only the data attribute is ever compressed in the current ntfs driver in
+ * Windows. Further, compression is only applied when the data attribute is
+ * non-resident. Finally, to use compression, the maximum allowed cluster size
+ * on a volume is 4kib.
+ *
+ * The compression method is based on independently compressing blocks of X
+ * clusters, where X is determined from the compression_unit value found in the
+ * non-resident attribute record header (more precisely: X = 2^compression_unit
+ * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4).
+ *
+ * There are three different cases of how a compression block of X clusters
+ * can be stored:
+ *
+ *   1) The data in the block is all zero (a sparse block):
+ *	  This is stored as a sparse block in the runlist, i.e. the runlist
+ *	  entry has length = X and lcn = -1. The mapping pairs array actually
+ *	  uses a delta_lcn value length of 0, i.e. delta_lcn is not present at
+ *	  all, which is then interpreted by the driver as lcn = -1.
+ *	  NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then
+ *	  the same principles apply as above, except that the length is not
+ *	  restricted to being any particular value.
+ *
+ *   2) The data in the block is not compressed:
+ *	  This happens when compression doesn't reduce the size of the block
+ *	  in clusters. I.e. if compression has a small effect so that the
+ *	  compressed data still occupies X clusters, then the uncompressed data
+ *	  is stored in the block.
+ *	  This case is recognised by the fact that the runlist entry has
+ *	  length = X and lcn >= 0. The mapping pairs array stores this as
+ *	  normal with a run length of X and some specific delta_lcn, i.e.
+ *	  delta_lcn has to be present.
+ *
+ *   3) The data in the block is compressed:
+ *	  The common case. This case is recognised by the fact that the run
+ *	  list entry has length L < X and lcn >= 0. The mapping pairs array
+ *	  stores this as normal with a run length of X and some specific
+ *	  delta_lcn, i.e. delta_lcn has to be present. This runlist entry is
+ *	  immediately followed by a sparse entry with length = X - L and
+ *	  lcn = -1. The latter entry is to make up the vcn counting to the
+ *	  full compression block size X.
+ *
+ * In fact, life is more complicated because adjacent entries of the same type
+ * can be coalesced. This means that one has to keep track of the number of
+ * clusters handled and work on a basis of X clusters at a time being one
+ * block. An example: if length L > X this means that this particular runlist
+ * entry contains a block of length X and part of one or more blocks of length
+ * L - X. Another example: if length L < X, this does not necessarily mean that
+ * the block is compressed as it might be that the lcn changes inside the block
+ * and hence the following runlist entry describes the continuation of the
+ * potentially compressed block. The block would be compressed if the
+ * following runlist entry describes at least X - L sparse clusters, thus
+ * making up the compression block length as described in point 3 above. (Of
+ * course, there can be several runlist entries with small lengths so that the
+ * sparse entry does not follow the first data containing entry with
+ * length < X.)
+ *
+ * NOTE: At the end of the compressed attribute value, there most likely is not
+ * just the right amount of data to make up a compression block, thus this data
+ * is not even attempted to be compressed. It is just stored as is, unless
+ * the number of clusters it occupies is reduced when compressed in which case
+ * it is stored as a compressed compression block, complete with sparse
+ * clusters at the end.
+ */
+
+/*
+ * Flags of resident attributes (8-bit).
+ */
+enum {
+	RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index
+					    (has implications for deleting and
+					    modifying the attribute). */
+} __attribute__ ((__packed__));
+
+typedef u8 RESIDENT_ATTR_FLAGS;
+
+/*
+ * Attribute record header. Always aligned to 8-byte boundary.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/	ATTR_TYPE type;		/* The (32-bit) type of the attribute. */
+/*  4*/	le32 length;		/* Byte size of the resident part of the
+				   attribute (aligned to 8-byte boundary).
+				   Used to get to the next attribute. */
+/*  8*/	u8 non_resident;	/* If 0, attribute is resident.
+				   If 1, attribute is non-resident. */
+/*  9*/	u8 name_length;		/* Unicode character size of name of attribute.
+				   0 if unnamed. */
+/* 10*/	le16 name_offset;	/* If name_length != 0, the byte offset to the
+				   beginning of the name from the attribute
+				   record. Note that the name is stored as a
+				   Unicode string. When creating, place offset
+				   just at the end of the record header. Then,
+				   follow with attribute value or mapping pairs
+				   array, resident and non-resident attributes
+				   respectively, aligning to an 8-byte
+				   boundary. */
+/* 12*/	ATTR_FLAGS flags;	/* Flags describing the attribute. */
+/* 14*/	le16 instance;		/* The instance of this attribute record. This
+				   number is unique within this mft record (see
+				   MFT_RECORD/next_attribute_instance notes in
+				   in mft.h for more details). */
+/* 16*/	union {
+		/* Resident attributes. */
+		struct {
+/* 16 */		le32 value_length;/* Byte size of attribute value. */
+/* 20 */		le16 value_offset;/* Byte offset of the attribute
+					     value from the start of the
+					     attribute record. When creating,
+					     align to 8-byte boundary if we
+					     have a name present as this might
+					     not have a length of a multiple
+					     of 8-bytes. */
+/* 22 */		RESIDENT_ATTR_FLAGS flags; /* See above. */
+/* 23 */		s8 reserved;	  /* Reserved/alignment to 8-byte
+					     boundary. */
+		} __attribute__ ((__packed__)) resident;
+		/* Non-resident attributes. */
+		struct {
+/* 16*/			leVCN lowest_vcn;/* Lowest valid virtual cluster number
+				for this portion of the attribute value or
+				0 if this is the only extent (usually the
+				case). - Only when an attribute list is used
+				does lowest_vcn != 0 ever occur. */
+/* 24*/			leVCN highest_vcn;/* Highest valid vcn of this extent of
+				the attribute value. - Usually there is only one
+				portion, so this usually equals the attribute
+				value size in clusters minus 1. Can be -1 for
+				zero length files. Can be 0 for "single extent"
+				attributes. */
+/* 32*/			le16 mapping_pairs_offset; /* Byte offset from the
+				beginning of the structure to the mapping pairs
+				array which contains the mappings between the
+				vcns and the logical cluster numbers (lcns).
+				When creating, place this at the end of this
+				record header aligned to 8-byte boundary. */
+/* 34*/			u8 compression_unit; /* The compression unit expressed
+				as the log to the base 2 of the number of
+				clusters in a compression unit.  0 means not
+				compressed.  (This effectively limits the
+				compression unit size to be a power of two
+				clusters.)  WinNT4 only uses a value of 4.
+				Sparse files have this set to 0 on XPSP2. */
+/* 35*/			u8 reserved[5];		/* Align to 8-byte boundary. */
+/* The sizes below are only used when lowest_vcn is zero, as otherwise it would
+   be difficult to keep them up-to-date.*/
+/* 40*/			sle64 allocated_size;	/* Byte size of disk space
+				allocated to hold the attribute value. Always
+				is a multiple of the cluster size. When a file
+				is compressed, this field is a multiple of the
+				compression block size (2^compression_unit) and
+				it represents the logically allocated space
+				rather than the actual on disk usage. For this
+				use the compressed_size (see below). */
+/* 48*/			sle64 data_size;	/* Byte size of the attribute
+				value. Can be larger than allocated_size if
+				attribute value is compressed or sparse. */
+/* 56*/			sle64 initialized_size;	/* Byte size of initialized
+				portion of the attribute value. Usually equals
+				data_size. */
+/* sizeof(uncompressed attr) = 64*/
+/* 64*/			sle64 compressed_size;	/* Byte size of the attribute
+				value after compression.  Only present when
+				compressed or sparse.  Always is a multiple of
+				the cluster size.  Represents the actual amount
+				of disk space being used on the disk. */
+/* sizeof(compressed attr) = 72*/
+		} __attribute__ ((__packed__)) non_resident;
+	} __attribute__ ((__packed__)) data;
+} __attribute__ ((__packed__)) ATTR_RECORD;
+
+typedef ATTR_RECORD ATTR_REC;
+
+/*
+ * File attribute flags (32-bit) appearing in the file_attributes fields of the
+ * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR
+ * attributes of MFT_RECORDs and directory index entries.
+ *
+ * All of the below flags appear in the directory index entries but only some
+ * appear in the STANDARD_INFORMATION attribute whilst only some others appear
+ * in the FILENAME_ATTR attribute of MFT_RECORDs.  Unless otherwise stated the
+ * flags appear in all of the above.
+ */
+enum {
+	FILE_ATTR_READONLY		= cpu_to_le32(0x00000001),
+	FILE_ATTR_HIDDEN		= cpu_to_le32(0x00000002),
+	FILE_ATTR_SYSTEM		= cpu_to_le32(0x00000004),
+	/* Old DOS volid. Unused in NT.	= cpu_to_le32(0x00000008), */
+
+	FILE_ATTR_DIRECTORY		= cpu_to_le32(0x00000010),
+	/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT.  It is
+	   reserved for the DOS SUBDIRECTORY flag. */
+	FILE_ATTR_ARCHIVE		= cpu_to_le32(0x00000020),
+	FILE_ATTR_DEVICE		= cpu_to_le32(0x00000040),
+	FILE_ATTR_NORMAL		= cpu_to_le32(0x00000080),
+
+	FILE_ATTR_TEMPORARY		= cpu_to_le32(0x00000100),
+	FILE_ATTR_SPARSE_FILE		= cpu_to_le32(0x00000200),
+	FILE_ATTR_REPARSE_POINT		= cpu_to_le32(0x00000400),
+	FILE_ATTR_COMPRESSED		= cpu_to_le32(0x00000800),
+
+	FILE_ATTR_OFFLINE		= cpu_to_le32(0x00001000),
+	FILE_ATTR_NOT_CONTENT_INDEXED	= cpu_to_le32(0x00002000),
+	FILE_ATTR_ENCRYPTED		= cpu_to_le32(0x00004000),
+
+	FILE_ATTR_VALID_FLAGS		= cpu_to_le32(0x00007fb7),
+	/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
+	   FILE_ATTR_DEVICE and preserves everything else.  This mask is used
+	   to obtain all flags that are valid for reading. */
+	FILE_ATTR_VALID_SET_FLAGS	= cpu_to_le32(0x000031a7),
+	/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
+	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
+	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
+	   is used to obtain all flags that are valid for setting. */
+	/*
+	 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
+	 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
+	 * attribute of an mft record.
+	 */
+	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= cpu_to_le32(0x10000000),
+	/* Note, this is a copy of the corresponding bit from the mft record,
+	   telling us whether this is a directory or not, i.e. whether it has
+	   an index root attribute or not. */
+	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= cpu_to_le32(0x20000000),
+	/* Note, this is a copy of the corresponding bit from the mft record,
+	   telling us whether this file has a view index present (eg. object id
+	   index, quota index, one of the security indexes or the encrypting
+	   filesystem related indexes). */
+};
+
+typedef le32 FILE_ATTR_FLAGS;
+
+/*
+ * NOTE on times in NTFS: All times are in MS standard time format, i.e. they
+ * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00
+ * universal coordinated time (UTC). (In Linux time starts 1st January 1970,
+ * 00:00:00 UTC and is stored as the number of 1-second intervals since then.)
+ */
+
+/*
+ * Attribute: Standard information (0x10).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present in all base file records on a volume.
+ * NOTE: There is conflicting information about the meaning of each of the time
+ *	 fields but the meaning as defined below has been verified to be
+ *	 correct by practical experimentation on Windows NT4 SP6a and is hence
+ *	 assumed to be the one and only correct interpretation.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/	sle64 creation_time;		/* Time file was created. Updated when
+					   a filename is changed(?). */
+/*  8*/	sle64 last_data_change_time;	/* Time the data attribute was last
+					   modified. */
+/* 16*/	sle64 last_mft_change_time;	/* Time this mft record was last
+					   modified. */
+/* 24*/	sle64 last_access_time;		/* Approximate time when the file was
+					   last accessed (obviously this is not
+					   updated on read-only volumes). In
+					   Windows this is only updated when
+					   accessed if some time delta has
+					   passed since the last update. Also,
+					   last access time updates can be
+					   disabled altogether for speed. */
+/* 32*/	FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
+/* 36*/	union {
+	/* NTFS 1.2 */
+		struct {
+		/* 36*/	u8 reserved12[12];	/* Reserved/alignment to 8-byte
+						   boundary. */
+		} __attribute__ ((__packed__)) v1;
+	/* sizeof() = 48 bytes */
+	/* NTFS 3.x */
+		struct {
+/*
+ * If a volume has been upgraded from a previous NTFS version, then these
+ * fields are present only if the file has been accessed since the upgrade.
+ * Recognize the difference by comparing the length of the resident attribute
+ * value. If it is 48, then the following fields are missing. If it is 72 then
+ * the fields are present. Maybe just check like this:
+ *	if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) {
+ *		Assume NTFS 1.2- format.
+ *		If (volume version is 3.x)
+ *			Upgrade attribute to NTFS 3.x format.
+ *		else
+ *			Use NTFS 1.2- format for access.
+ *	} else
+ *		Use NTFS 3.x format for access.
+ * Only problem is that it might be legal to set the length of the value to
+ * arbitrarily large values thus spoiling this check. - But chkdsk probably
+ * views that as a corruption, assuming that it behaves like this for all
+ * attributes.
+ */
+		/* 36*/	le32 maximum_versions;	/* Maximum allowed versions for
+				file. Zero if version numbering is disabled. */
+		/* 40*/	le32 version_number;	/* This file's version (if any).
+				Set to zero if maximum_versions is zero. */
+		/* 44*/	le32 class_id;		/* Class id from bidirectional
+				class id index (?). */
+		/* 48*/	le32 owner_id;		/* Owner_id of the user owning
+				the file. Translate via $Q index in FILE_Extend
+				/$Quota to the quota control entry for the user
+				owning the file. Zero if quotas are disabled. */
+		/* 52*/	le32 security_id;	/* Security_id for the file.
+				Translate via $SII index and $SDS data stream
+				in FILE_Secure to the security descriptor. */
+		/* 56*/	le64 quota_charged;	/* Byte size of the charge to
+				the quota for all streams of the file. Note: Is
+				zero if quotas are disabled. */
+		/* 64*/	leUSN usn;		/* Last update sequence number
+				of the file.  This is a direct index into the
+				transaction log file ($UsnJrnl).  It is zero if
+				the usn journal is disabled or this file has
+				not been subject to logging yet.  See usnjrnl.h
+				for details. */
+		} __attribute__ ((__packed__)) v3;
+	/* sizeof() = 72 bytes (NTFS 3.x) */
+	} __attribute__ ((__packed__)) ver;
+} __attribute__ ((__packed__)) STANDARD_INFORMATION;
+
+/*
+ * Attribute: Attribute list (0x20).
+ *
+ * - Can be either resident or non-resident.
+ * - Value consists of a sequence of variable length, 8-byte aligned,
+ * ATTR_LIST_ENTRY records.
+ * - The list is not terminated by anything at all! The only way to know when
+ * the end is reached is to keep track of the current offset and compare it to
+ * the attribute value size.
+ * - The attribute list attribute contains one entry for each attribute of
+ * the file in which the list is located, except for the list attribute
+ * itself. The list is sorted: first by attribute type, second by attribute
+ * name (if present), third by instance number. The extents of one
+ * non-resident attribute (if present) immediately follow after the initial
+ * extent. They are ordered by lowest_vcn and have their instace set to zero.
+ * It is not allowed to have two attributes with all sorting keys equal.
+ * - Further restrictions:
+ *	- If not resident, the vcn to lcn mapping array has to fit inside the
+ *	  base mft record.
+ *	- The attribute list attribute value has a maximum size of 256kb. This
+ *	  is imposed by the Windows cache manager.
+ * - Attribute lists are only used when the attributes of mft record do not
+ * fit inside the mft record despite all attributes (that can be made
+ * non-resident) having been made non-resident. This can happen e.g. when:
+ *	- File has a large number of hard links (lots of file name
+ *	  attributes present).
+ *	- The mapping pairs array of some non-resident attribute becomes so
+ *	  large due to fragmentation that it overflows the mft record.
+ *	- The security descriptor is very complex (not applicable to
+ *	  NTFS 3.0 volumes).
+ *	- There are many named streams.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/	ATTR_TYPE type;		/* Type of referenced attribute. */
+/*  4*/	le16 length;		/* Byte size of this entry (8-byte aligned). */
+/*  6*/	u8 name_length;		/* Size in Unicode chars of the name of the
+				   attribute or 0 if unnamed. */
+/*  7*/	u8 name_offset;		/* Byte offset to beginning of attribute name
+				   (always set this to where the name would
+				   start even if unnamed). */
+/*  8*/	leVCN lowest_vcn;	/* Lowest virtual cluster number of this portion
+				   of the attribute value. This is usually 0. It
+				   is non-zero for the case where one attribute
+				   does not fit into one mft record and thus
+				   several mft records are allocated to hold
+				   this attribute. In the latter case, each mft
+				   record holds one extent of the attribute and
+				   there is one attribute list entry for each
+				   extent. NOTE: This is DEFINITELY a signed
+				   value! The windows driver uses cmp, followed
+				   by jg when comparing this, thus it treats it
+				   as signed. */
+/* 16*/	leMFT_REF mft_reference;/* The reference of the mft record holding
+				   the ATTR_RECORD for this portion of the
+				   attribute value. */
+/* 24*/	le16 instance;		/* If lowest_vcn = 0, the instance of the
+				   attribute being referenced; otherwise 0. */
+/* 26*/	ntfschar name[0];	/* Use when creating only. When reading use
+				   name_offset to determine the location of the
+				   name. */
+/* sizeof() = 26 + (attribute_name_length * 2) bytes */
+} __attribute__ ((__packed__)) ATTR_LIST_ENTRY;
+
+/*
+ * The maximum allowed length for a file name.
+ */
+#define MAXIMUM_FILE_NAME_LENGTH	255
+
+/*
+ * Possible namespaces for filenames in ntfs (8-bit).
+ */
+enum {
+	FILE_NAME_POSIX		= 0x00,
+	/* This is the largest namespace. It is case sensitive and allows all
+	   Unicode characters except for: '\0' and '/'.  Beware that in
+	   WinNT/2k/2003 by default files which eg have the same name except
+	   for their case will not be distinguished by the standard utilities
+	   and thus a "del filename" will delete both "filename" and "fileName"
+	   without warning.  However if for example Services For Unix (SFU) are
+	   installed and the case sensitive option was enabled at installation
+	   time, then you can create/access/delete such files.
+	   Note that even SFU places restrictions on the filenames beyond the
+	   '\0' and '/' and in particular the following set of characters is
+	   not allowed: '"', '/', '<', '>', '\'.  All other characters,
+	   including the ones no allowed in WIN32 namespace are allowed.
+	   Tested with SFU 3.5 (this is now free) running on Windows XP. */
+	FILE_NAME_WIN32		= 0x01,
+	/* The standard WinNT/2k NTFS long filenames. Case insensitive.  All
+	   Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
+	   and '|'.  Further, names cannot end with a '.' or a space. */
+	FILE_NAME_DOS		= 0x02,
+	/* The standard DOS filenames (8.3 format). Uppercase only.  All 8-bit
+	   characters greater space, except: '"', '*', '+', ',', '/', ':', ';',
+	   '<', '=', '>', '?', and '\'. */
+	FILE_NAME_WIN32_AND_DOS	= 0x03,
+	/* 3 means that both the Win32 and the DOS filenames are identical and
+	   hence have been saved in this single filename record. */
+} __attribute__ ((__packed__));
+
+typedef u8 FILE_NAME_TYPE_FLAGS;
+
+/*
+ * Attribute: Filename (0x30).
+ *
+ * NOTE: Always resident.
+ * NOTE: All fields, except the parent_directory, are only updated when the
+ *	 filename is changed. Until then, they just become out of sync with
+ *	 reality and the more up to date values are present in the standard
+ *	 information attribute.
+ * NOTE: There is conflicting information about the meaning of each of the time
+ *	 fields but the meaning as defined below has been verified to be
+ *	 correct by practical experimentation on Windows NT4 SP6a and is hence
+ *	 assumed to be the one and only correct interpretation.
+ */
+typedef struct {
+/*hex ofs*/
+/*  0*/	leMFT_REF parent_directory;	/* Directory this filename is
+					   referenced from. */
+/*  8*/	sle64 creation_time;		/* Time file was created. */
+/* 10*/	sle64 last_data_change_time;	/* Time the data attribute was last
+					   modified. */
+/* 18*/	sle64 last_mft_change_time;	/* Time this mft record was last
+					   modified. */
+/* 20*/	sle64 last_access_time;		/* Time this mft record was last
+					   accessed. */
+/* 28*/	sle64 allocated_size;		/* Byte size of on-disk allocated space
+					   for the unnamed data attribute.  So
+					   for normal $DATA, this is the
+					   allocated_size from the unnamed
+					   $DATA attribute and for compressed
+					   and/or sparse $DATA, this is the
+					   compressed_size from the unnamed
+					   $DATA attribute.  For a directory or
+					   other inode without an unnamed $DATA
+					   attribute, this is always 0.  NOTE:
+					   This is a multiple of the cluster
+					   size. */
+/* 30*/	sle64 data_size;		/* Byte size of actual data in unnamed
+					   data attribute.  For a directory or
+					   other inode without an unnamed $DATA
+					   attribute, this is always 0. */
+/* 38*/	FILE_ATTR_FLAGS file_attributes;	/* Flags describing the file. */
+/* 3c*/	union {
+	/* 3c*/	struct {
+		/* 3c*/	le16 packed_ea_size;	/* Size of the buffer needed to
+						   pack the extended attributes
+						   (EAs), if such are present.*/
+		/* 3e*/	le16 reserved;		/* Reserved for alignment. */
+		} __attribute__ ((__packed__)) ea;
+	/* 3c*/	struct {
+		/* 3c*/	le32 reparse_point_tag;	/* Type of reparse point,
+						   present only in reparse
+						   points and only if there are
+						   no EAs. */
+		} __attribute__ ((__packed__)) rp;
+	} __attribute__ ((__packed__)) type;
+/* 40*/	u8 file_name_length;			/* Length of file name in
+						   (Unicode) characters. */
+/* 41*/	FILE_NAME_TYPE_FLAGS file_name_type;	/* Namespace of the file name.*/
+/* 42*/	ntfschar file_name[0];			/* File name in Unicode. */
+} __attribute__ ((__packed__)) FILE_NAME_ATTR;
+
+/*
+ * GUID structures store globally unique identifiers (GUID). A GUID is a
+ * 128-bit value consisting of one group of eight hexadecimal digits, followed
+ * by three groups of four hexadecimal digits each, followed by one group of
+ * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the
+ * distributed computing environment (DCE) universally unique identifier (UUID).
+ * Example of a GUID:
+ *	1F010768-5A73-BC91-0010A52216A7
+ */
+typedef struct {
+	le32 data1;	/* The first eight hexadecimal digits of the GUID. */
+	le16 data2;	/* The first group of four hexadecimal digits. */
+	le16 data3;	/* The second group of four hexadecimal digits. */
+	u8 data4[8];	/* The first two bytes are the third group of four
+			   hexadecimal digits. The remaining six bytes are the
+			   final 12 hexadecimal digits. */
+} __attribute__ ((__packed__)) GUID;
+
+/*
+ * FILE_Extend/$ObjId contains an index named $O. This index contains all
+ * object_ids present on the volume as the index keys and the corresponding
+ * mft_record numbers as the index entry data parts. The data part (defined
+ * below) also contains three other object_ids:
+ *	birth_volume_id - object_id of FILE_Volume on which the file was first
+ *			  created. Optional (i.e. can be zero).
+ *	birth_object_id - object_id of file when it was first created. Usually
+ *			  equals the object_id. Optional (i.e. can be zero).
+ *	domain_id	- Reserved (always zero).
+ */
+typedef struct {
+	leMFT_REF mft_reference;/* Mft record containing the object_id in
+				   the index entry key. */
+	union {
+		struct {
+			GUID birth_volume_id;
+			GUID birth_object_id;
+			GUID domain_id;
+		} __attribute__ ((__packed__)) origin;
+		u8 extended_info[48];
+	} __attribute__ ((__packed__)) opt;
+} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA;
+
+/*
+ * Attribute: Object id (NTFS 3.0+) (0x40).
+ *
+ * NOTE: Always resident.
+ */
+typedef struct {
+	GUID object_id;				/* Unique id assigned to the
+						   file.*/
+	/* The following fields are optional. The attribute value size is 16
+	   bytes, i.e. sizeof(GUID), if these are not present at all. Note,
+	   the entries can be present but one or more (or all) can be zero
+	   meaning that that particular value(s) is(are) not defined. */
+	union {
+		struct {
+			GUID birth_volume_id;	/* Unique id of volume on which
+						   the file was first created.*/
+			GUID birth_object_id;	/* Unique id of file when it was
+						   first created. */
+			GUID domain_id;		/* Reserved, zero. */
+		} __attribute__ ((__packed__)) origin;
+		u8 extended_info[48];
+	} __attribute__ ((__packed__)) opt;
+} __attribute__ ((__packed__)) OBJECT_ID_ATTR;
+
+/*
+ * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in
+ * the SID structure (see below).
+ */
+//typedef enum {					/* SID string prefix. */
+//	SECURITY_NULL_SID_AUTHORITY	= {0, 0, 0, 0, 0, 0},	/* S-1-0 */
+//	SECURITY_WORLD_SID_AUTHORITY	= {0, 0, 0, 0, 0, 1},	/* S-1-1 */
+//	SECURITY_LOCAL_SID_AUTHORITY	= {0, 0, 0, 0, 0, 2},	/* S-1-2 */
+//	SECURITY_CREATOR_SID_AUTHORITY	= {0, 0, 0, 0, 0, 3},	/* S-1-3 */
+//	SECURITY_NON_UNIQUE_AUTHORITY	= {0, 0, 0, 0, 0, 4},	/* S-1-4 */
+//	SECURITY_NT_SID_AUTHORITY	= {0, 0, 0, 0, 0, 5},	/* S-1-5 */
+//} IDENTIFIER_AUTHORITIES;
+
+/*
+ * These relative identifiers (RIDs) are used with the above identifier
+ * authorities to make up universal well-known SIDs.
+ *
+ * Note: The relative identifier (RID) refers to the portion of a SID, which
+ * identifies a user or group in relation to the authority that issued the SID.
+ * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is
+ * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and
+ * the relative identifier SECURITY_CREATOR_OWNER_RID (0).
+ */
+typedef enum {					/* Identifier authority. */
+	SECURITY_NULL_RID		  = 0,	/* S-1-0 */
+	SECURITY_WORLD_RID		  = 0,	/* S-1-1 */
+	SECURITY_LOCAL_RID		  = 0,	/* S-1-2 */
+
+	SECURITY_CREATOR_OWNER_RID	  = 0,	/* S-1-3 */
+	SECURITY_CREATOR_GROUP_RID	  = 1,	/* S-1-3 */
+
+	SECURITY_CREATOR_OWNER_SERVER_RID = 2,	/* S-1-3 */
+	SECURITY_CREATOR_GROUP_SERVER_RID = 3,	/* S-1-3 */
+
+	SECURITY_DIALUP_RID		  = 1,
+	SECURITY_NETWORK_RID		  = 2,
+	SECURITY_BATCH_RID		  = 3,
+	SECURITY_INTERACTIVE_RID	  = 4,
+	SECURITY_SERVICE_RID		  = 6,
+	SECURITY_ANONYMOUS_LOGON_RID	  = 7,
+	SECURITY_PROXY_RID		  = 8,
+	SECURITY_ENTERPRISE_CONTROLLERS_RID=9,
+	SECURITY_SERVER_LOGON_RID	  = 9,
+	SECURITY_PRINCIPAL_SELF_RID	  = 0xa,
+	SECURITY_AUTHENTICATED_USER_RID	  = 0xb,
+	SECURITY_RESTRICTED_CODE_RID	  = 0xc,
+	SECURITY_TERMINAL_SERVER_RID	  = 0xd,
+
+	SECURITY_LOGON_IDS_RID		  = 5,
+	SECURITY_LOGON_IDS_RID_COUNT	  = 3,
+
+	SECURITY_LOCAL_SYSTEM_RID	  = 0x12,
+
+	SECURITY_NT_NON_UNIQUE		  = 0x15,
+
+	SECURITY_BUILTIN_DOMAIN_RID	  = 0x20,
+
+	/*
+	 * Well-known domain relative sub-authority values (RIDs).
+	 */
+
+	/* Users. */
+	DOMAIN_USER_RID_ADMIN		  = 0x1f4,
+	DOMAIN_USER_RID_GUEST		  = 0x1f5,
+	DOMAIN_USER_RID_KRBTGT		  = 0x1f6,
+
+	/* Groups. */
+	DOMAIN_GROUP_RID_ADMINS		  = 0x200,
+	DOMAIN_GROUP_RID_USERS		  = 0x201,
+	DOMAIN_GROUP_RID_GUESTS		  = 0x202,
+	DOMAIN_GROUP_RID_COMPUTERS	  = 0x203,
+	DOMAIN_GROUP_RID_CONTROLLERS	  = 0x204,
+	DOMAIN_GROUP_RID_CERT_ADMINS	  = 0x205,
+	DOMAIN_GROUP_RID_SCHEMA_ADMINS	  = 0x206,
+	DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207,
+	DOMAIN_GROUP_RID_POLICY_ADMINS	  = 0x208,
+
+	/* Aliases. */
+	DOMAIN_ALIAS_RID_ADMINS		  = 0x220,
+	DOMAIN_ALIAS_RID_USERS		  = 0x221,
+	DOMAIN_ALIAS_RID_GUESTS		  = 0x222,
+	DOMAIN_ALIAS_RID_POWER_USERS	  = 0x223,
+
+	DOMAIN_ALIAS_RID_ACCOUNT_OPS	  = 0x224,
+	DOMAIN_ALIAS_RID_SYSTEM_OPS	  = 0x225,
+	DOMAIN_ALIAS_RID_PRINT_OPS	  = 0x226,
+	DOMAIN_ALIAS_RID_BACKUP_OPS	  = 0x227,
+
+	DOMAIN_ALIAS_RID_REPLICATOR	  = 0x228,
+	DOMAIN_ALIAS_RID_RAS_SERVERS	  = 0x229,
+	DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a,
+} RELATIVE_IDENTIFIERS;
+
+/*
+ * The universal well-known SIDs:
+ *
+ *	NULL_SID			S-1-0-0
+ *	WORLD_SID			S-1-1-0
+ *	LOCAL_SID			S-1-2-0
+ *	CREATOR_OWNER_SID		S-1-3-0
+ *	CREATOR_GROUP_SID		S-1-3-1
+ *	CREATOR_OWNER_SERVER_SID	S-1-3-2
+ *	CREATOR_GROUP_SERVER_SID	S-1-3-3
+ *
+ *	(Non-unique IDs)		S-1-4
+ *
+ * NT well-known SIDs:
+ *
+ *	NT_AUTHORITY_SID	S-1-5
+ *	DIALUP_SID		S-1-5-1
+ *
+ *	NETWORD_SID		S-1-5-2
+ *	BATCH_SID		S-1-5-3
+ *	INTERACTIVE_SID		S-1-5-4
+ *	SERVICE_SID		S-1-5-6
+ *	ANONYMOUS_LOGON_SID	S-1-5-7		(aka null logon session)
+ *	PROXY_SID		S-1-5-8
+ *	SERVER_LOGON_SID	S-1-5-9		(aka domain controller account)
+ *	SELF_SID		S-1-5-10	(self RID)
+ *	AUTHENTICATED_USER_SID	S-1-5-11
+ *	RESTRICTED_CODE_SID	S-1-5-12	(running restricted code)
+ *	TERMINAL_SERVER_SID	S-1-5-13	(running on terminal server)
+ *
+ *	(Logon IDs)		S-1-5-5-X-Y
+ *
+ *	(NT non-unique IDs)	S-1-5-0x15-...
+ *
+ *	(Built-in domain)	S-1-5-0x20
+ */
+
+/*
+ * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure.
+ *
+ * NOTE: This is stored as a big endian number, hence the high_part comes
+ * before the low_part.
+ */
+typedef union {
+	struct {
+		u16 high_part;	/* High 16-bits. */
+		u32 low_part;	/* Low 32-bits. */
+	} __attribute__ ((__packed__)) parts;
+	u8 value[6];		/* Value as individual bytes. */
+} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY;
+
+/*
+ * The SID structure is a variable-length structure used to uniquely identify
+ * users or groups. SID stands for security identifier.
+ *
+ * The standard textual representation of the SID is of the form:
+ *	S-R-I-S-S...
+ * Where:
+ *    - The first "S" is the literal character 'S' identifying the following
+ *	digits as a SID.
+ *    - R is the revision level of the SID expressed as a sequence of digits
+ *	either in decimal or hexadecimal (if the later, prefixed by "0x").
+ *    - I is the 48-bit identifier_authority, expressed as digits as R above.
+ *    - S... is one or more sub_authority values, expressed as digits as above.
+ *
+ * Example SID; the domain-relative SID of the local Administrators group on
+ * Windows NT/2k:
+ *	S-1-5-32-544
+ * This translates to a SID with:
+ *	revision = 1,
+ *	sub_authority_count = 2,
+ *	identifier_authority = {0,0,0,0,0,5},	// SECURITY_NT_AUTHORITY
+ *	sub_authority[0] = 32,			// SECURITY_BUILTIN_DOMAIN_RID
+ *	sub_authority[1] = 544			// DOMAIN_ALIAS_RID_ADMINS
+ */
+typedef struct {
+	u8 revision;
+	u8 sub_authority_count;
+	SID_IDENTIFIER_AUTHORITY identifier_authority;
+	le32 sub_authority[1];		/* At least one sub_authority. */
+} __attribute__ ((__packed__)) SID;
+
+/*
+ * Current constants for SIDs.
+ */
+typedef enum {
+	SID_REVISION			=  1,	/* Current revision level. */
+	SID_MAX_SUB_AUTHORITIES		= 15,	/* Maximum number of those. */
+	SID_RECOMMENDED_SUB_AUTHORITIES	=  1,	/* Will change to around 6 in
+						   a future revision. */
+} SID_CONSTANTS;
+
+/*
+ * The predefined ACE types (8-bit, see below).
+ */
+enum {
+	ACCESS_MIN_MS_ACE_TYPE		= 0,
+	ACCESS_ALLOWED_ACE_TYPE		= 0,
+	ACCESS_DENIED_ACE_TYPE		= 1,
+	SYSTEM_AUDIT_ACE_TYPE		= 2,
+	SYSTEM_ALARM_ACE_TYPE		= 3, /* Not implemented as of Win2k. */
+	ACCESS_MAX_MS_V2_ACE_TYPE	= 3,
+
+	ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4,
+	ACCESS_MAX_MS_V3_ACE_TYPE	= 4,
+
+	/* The following are Win2k only. */
+	ACCESS_MIN_MS_OBJECT_ACE_TYPE	= 5,
+	ACCESS_ALLOWED_OBJECT_ACE_TYPE	= 5,
+	ACCESS_DENIED_OBJECT_ACE_TYPE	= 6,
+	SYSTEM_AUDIT_OBJECT_ACE_TYPE	= 7,
+	SYSTEM_ALARM_OBJECT_ACE_TYPE	= 8,
+	ACCESS_MAX_MS_OBJECT_ACE_TYPE	= 8,
+
+	ACCESS_MAX_MS_V4_ACE_TYPE	= 8,
+
+	/* This one is for WinNT/2k. */
+	ACCESS_MAX_MS_ACE_TYPE		= 8,
+} __attribute__ ((__packed__));
+
+typedef u8 ACE_TYPES;
+
+/*
+ * The ACE flags (8-bit) for audit and inheritance (see below).
+ *
+ * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE
+ * types to indicate that a message is generated (in Windows!) for successful
+ * accesses.
+ *
+ * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types
+ * to indicate that a message is generated (in Windows!) for failed accesses.
+ */
+enum {
+	/* The inheritance flags. */
+	OBJECT_INHERIT_ACE		= 0x01,
+	CONTAINER_INHERIT_ACE		= 0x02,
+	NO_PROPAGATE_INHERIT_ACE	= 0x04,
+	INHERIT_ONLY_ACE		= 0x08,
+	INHERITED_ACE			= 0x10,	/* Win2k only. */
+	VALID_INHERIT_FLAGS		= 0x1f,
+
+	/* The audit flags. */
+	SUCCESSFUL_ACCESS_ACE_FLAG	= 0x40,
+	FAILED_ACCESS_ACE_FLAG		= 0x80,
+} __attribute__ ((__packed__));
+
+typedef u8 ACE_FLAGS;
+
+/*
+ * An ACE is an access-control entry in an access-control list (ACL).
+ * An ACE defines access to an object for a specific user or group or defines
+ * the types of access that generate system-administration messages or alarms
+ * for a specific user or group. The user or group is identified by a security
+ * identifier (SID).
+ *
+ * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary),
+ * which specifies the type and size of the ACE. The format of the subsequent
+ * data depends on the ACE type.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/	ACE_TYPES type;		/* Type of the ACE. */
+/*  1*/	ACE_FLAGS flags;	/* Flags describing the ACE. */
+/*  2*/	le16 size;		/* Size in bytes of the ACE. */
+} __attribute__ ((__packed__)) ACE_HEADER;
+
+/*
+ * The access mask (32-bit). Defines the access rights.
+ *
+ * The specific rights (bits 0 to 15).  These depend on the type of the object
+ * being secured by the ACE.
+ */
+enum {
+	/* Specific rights for files and directories are as follows: */
+
+	/* Right to read data from the file. (FILE) */
+	FILE_READ_DATA			= cpu_to_le32(0x00000001),
+	/* Right to list contents of a directory. (DIRECTORY) */
+	FILE_LIST_DIRECTORY		= cpu_to_le32(0x00000001),
+
+	/* Right to write data to the file. (FILE) */
+	FILE_WRITE_DATA			= cpu_to_le32(0x00000002),
+	/* Right to create a file in the directory. (DIRECTORY) */
+	FILE_ADD_FILE			= cpu_to_le32(0x00000002),
+
+	/* Right to append data to the file. (FILE) */
+	FILE_APPEND_DATA		= cpu_to_le32(0x00000004),
+	/* Right to create a subdirectory. (DIRECTORY) */
+	FILE_ADD_SUBDIRECTORY		= cpu_to_le32(0x00000004),
+
+	/* Right to read extended attributes. (FILE/DIRECTORY) */
+	FILE_READ_EA			= cpu_to_le32(0x00000008),
+
+	/* Right to write extended attributes. (FILE/DIRECTORY) */
+	FILE_WRITE_EA			= cpu_to_le32(0x00000010),
+
+	/* Right to execute a file. (FILE) */
+	FILE_EXECUTE			= cpu_to_le32(0x00000020),
+	/* Right to traverse the directory. (DIRECTORY) */
+	FILE_TRAVERSE			= cpu_to_le32(0x00000020),
+
+	/*
+	 * Right to delete a directory and all the files it contains (its
+	 * children), even if the files are read-only. (DIRECTORY)
+	 */
+	FILE_DELETE_CHILD		= cpu_to_le32(0x00000040),
+
+	/* Right to read file attributes. (FILE/DIRECTORY) */
+	FILE_READ_ATTRIBUTES		= cpu_to_le32(0x00000080),
+
+	/* Right to change file attributes. (FILE/DIRECTORY) */
+	FILE_WRITE_ATTRIBUTES		= cpu_to_le32(0x00000100),
+
+	/*
+	 * The standard rights (bits 16 to 23).  These are independent of the
+	 * type of object being secured.
+	 */
+
+	/* Right to delete the object. */
+	DELETE				= cpu_to_le32(0x00010000),
+
+	/*
+	 * Right to read the information in the object's security descriptor,
+	 * not including the information in the SACL, i.e. right to read the
+	 * security descriptor and owner.
+	 */
+	READ_CONTROL			= cpu_to_le32(0x00020000),
+
+	/* Right to modify the DACL in the object's security descriptor. */
+	WRITE_DAC			= cpu_to_le32(0x00040000),
+
+	/* Right to change the owner in the object's security descriptor. */
+	WRITE_OWNER			= cpu_to_le32(0x00080000),
+
+	/*
+	 * Right to use the object for synchronization.  Enables a process to
+	 * wait until the object is in the signalled state.  Some object types
+	 * do not support this access right.
+	 */
+	SYNCHRONIZE			= cpu_to_le32(0x00100000),
+
+	/*
+	 * The following STANDARD_RIGHTS_* are combinations of the above for
+	 * convenience and are defined by the Win32 API.
+	 */
+
+	/* These are currently defined to READ_CONTROL. */
+	STANDARD_RIGHTS_READ		= cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_WRITE		= cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_EXECUTE		= cpu_to_le32(0x00020000),
+
+	/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
+	STANDARD_RIGHTS_REQUIRED	= cpu_to_le32(0x000f0000),
+
+	/*
+	 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
+	 * SYNCHRONIZE access.
+	 */
+	STANDARD_RIGHTS_ALL		= cpu_to_le32(0x001f0000),
+
+	/*
+	 * The access system ACL and maximum allowed access types (bits 24 to
+	 * 25, bits 26 to 27 are reserved).
+	 */
+	ACCESS_SYSTEM_SECURITY		= cpu_to_le32(0x01000000),
+	MAXIMUM_ALLOWED			= cpu_to_le32(0x02000000),
+
+	/*
+	 * The generic rights (bits 28 to 31).  These map onto the standard and
+	 * specific rights.
+	 */
+
+	/* Read, write, and execute access. */
+	GENERIC_ALL			= cpu_to_le32(0x10000000),
+
+	/* Execute access. */
+	GENERIC_EXECUTE			= cpu_to_le32(0x20000000),
+
+	/*
+	 * Write access.  For files, this maps onto:
+	 *	FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
+	 *	FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
+	 * For directories, the mapping has the same numerical value.  See
+	 * above for the descriptions of the rights granted.
+	 */
+	GENERIC_WRITE			= cpu_to_le32(0x40000000),
+
+	/*
+	 * Read access.  For files, this maps onto:
+	 *	FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
+	 *	STANDARD_RIGHTS_READ | SYNCHRONIZE
+	 * For directories, the mapping has the same numberical value.  See
+	 * above for the descriptions of the rights granted.
+	 */
+	GENERIC_READ			= cpu_to_le32(0x80000000),
+};
+
+typedef le32 ACCESS_MASK;
+
+/*
+ * The generic mapping array. Used to denote the mapping of each generic
+ * access right to a specific access mask.
+ *
+ * FIXME: What exactly is this and what is it for? (AIA)
+ */
+typedef struct {
+	ACCESS_MASK generic_read;
+	ACCESS_MASK generic_write;
+	ACCESS_MASK generic_execute;
+	ACCESS_MASK generic_all;
+} __attribute__ ((__packed__)) GENERIC_MAPPING;
+
+/*
+ * The predefined ACE type structures are as defined below.
+ */
+
+/*
+ * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
+ */
+typedef struct {
+/*  0	ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
+	ACE_TYPES type;		/* Type of the ACE. */
+	ACE_FLAGS flags;	/* Flags describing the ACE. */
+	le16 size;		/* Size in bytes of the ACE. */
+/*  4*/	ACCESS_MASK mask;	/* Access mask associated with the ACE. */
+
+/*  8*/	SID sid;		/* The SID associated with the ACE. */
+} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
+			       SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
+
+/*
+ * The object ACE flags (32-bit).
+ */
+enum {
+	ACE_OBJECT_TYPE_PRESENT			= cpu_to_le32(1),
+	ACE_INHERITED_OBJECT_TYPE_PRESENT	= cpu_to_le32(2),
+};
+
+typedef le32 OBJECT_ACE_FLAGS;
+
+typedef struct {
+/*  0	ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
+	ACE_TYPES type;		/* Type of the ACE. */
+	ACE_FLAGS flags;	/* Flags describing the ACE. */
+	le16 size;		/* Size in bytes of the ACE. */
+/*  4*/	ACCESS_MASK mask;	/* Access mask associated with the ACE. */
+
+/*  8*/	OBJECT_ACE_FLAGS object_flags;	/* Flags describing the object ACE. */
+/* 12*/	GUID object_type;
+/* 28*/	GUID inherited_object_type;
+
+/* 44*/	SID sid;		/* The SID associated with the ACE. */
+} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
+			       ACCESS_DENIED_OBJECT_ACE,
+			       SYSTEM_AUDIT_OBJECT_ACE,
+			       SYSTEM_ALARM_OBJECT_ACE;
+
+/*
+ * An ACL is an access-control list (ACL).
+ * An ACL starts with an ACL header structure, which specifies the size of
+ * the ACL and the number of ACEs it contains. The ACL header is followed by
+ * zero or more access control entries (ACEs). The ACL as well as each ACE
+ * are aligned on 4-byte boundaries.
+ */
+typedef struct {
+	u8 revision;	/* Revision of this ACL. */
+	u8 alignment1;
+	le16 size;	/* Allocated space in bytes for ACL. Includes this
+			   header, the ACEs and the remaining free space. */
+	le16 ace_count;	/* Number of ACEs in the ACL. */
+	le16 alignment2;
+/* sizeof() = 8 bytes */
+} __attribute__ ((__packed__)) ACL;
+
+/*
+ * Current constants for ACLs.
+ */
+typedef enum {
+	/* Current revision. */
+	ACL_REVISION		= 2,
+	ACL_REVISION_DS		= 4,
+
+	/* History of revisions. */
+	ACL_REVISION1		= 1,
+	MIN_ACL_REVISION	= 2,
+	ACL_REVISION2		= 2,
+	ACL_REVISION3		= 3,
+	ACL_REVISION4		= 4,
+	MAX_ACL_REVISION	= 4,
+} ACL_CONSTANTS;
+
+/*
+ * The security descriptor control flags (16-bit).
+ *
+ * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID
+ *	pointed to by the Owner field was provided by a defaulting mechanism
+ *	rather than explicitly provided by the original provider of the
+ *	security descriptor.  This may affect the treatment of the SID with
+ *	respect to inheritance of an owner.
+ *
+ * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
+ *	the Group field was provided by a defaulting mechanism rather than
+ *	explicitly provided by the original provider of the security
+ *	descriptor.  This may affect the treatment of the SID with respect to
+ *	inheritance of a primary group.
+ *
+ * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
+ *	descriptor contains a discretionary ACL.  If this flag is set and the
+ *	Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is
+ *	explicitly being specified.
+ *
+ * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
+ *	pointed to by the Dacl field was provided by a defaulting mechanism
+ *	rather than explicitly provided by the original provider of the
+ *	security descriptor.  This may affect the treatment of the ACL with
+ *	respect to inheritance of an ACL.  This flag is ignored if the
+ *	DaclPresent flag is not set.
+ *
+ * SE_SACL_PRESENT - This boolean flag, when set,  indicates that the security
+ *	descriptor contains a system ACL pointed to by the Sacl field.  If this
+ *	flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then
+ *	an empty (but present) ACL is being specified.
+ *
+ * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
+ *	pointed to by the Sacl field was provided by a defaulting mechanism
+ *	rather than explicitly provided by the original provider of the
+ *	security descriptor.  This may affect the treatment of the ACL with
+ *	respect to inheritance of an ACL.  This flag is ignored if the
+ *	SaclPresent flag is not set.
+ *
+ * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
+ *	descriptor is in self-relative form.  In this form, all fields of the
+ *	security descriptor are contiguous in memory and all pointer fields are
+ *	expressed as offsets from the beginning of the security descriptor.
+ */
+enum {
+	SE_OWNER_DEFAULTED		= cpu_to_le16(0x0001),
+	SE_GROUP_DEFAULTED		= cpu_to_le16(0x0002),
+	SE_DACL_PRESENT			= cpu_to_le16(0x0004),
+	SE_DACL_DEFAULTED		= cpu_to_le16(0x0008),
+
+	SE_SACL_PRESENT			= cpu_to_le16(0x0010),
+	SE_SACL_DEFAULTED		= cpu_to_le16(0x0020),
+
+	SE_DACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0100),
+	SE_SACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0200),
+	SE_DACL_AUTO_INHERITED		= cpu_to_le16(0x0400),
+	SE_SACL_AUTO_INHERITED		= cpu_to_le16(0x0800),
+
+	SE_DACL_PROTECTED		= cpu_to_le16(0x1000),
+	SE_SACL_PROTECTED		= cpu_to_le16(0x2000),
+	SE_RM_CONTROL_VALID		= cpu_to_le16(0x4000),
+	SE_SELF_RELATIVE		= cpu_to_le16(0x8000)
+} __attribute__ ((__packed__));
+
+typedef le16 SECURITY_DESCRIPTOR_CONTROL;
+
+/*
+ * Self-relative security descriptor. Contains the owner and group SIDs as well
+ * as the sacl and dacl ACLs inside the security descriptor itself.
+ */
+typedef struct {
+	u8 revision;	/* Revision level of the security descriptor. */
+	u8 alignment;
+	SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
+			   the descriptor as well as the following fields. */
+	le32 owner;	/* Byte offset to a SID representing an object's
+			   owner. If this is NULL, no owner SID is present in
+			   the descriptor. */
+	le32 group;	/* Byte offset to a SID representing an object's
+			   primary group. If this is NULL, no primary group
+			   SID is present in the descriptor. */
+	le32 sacl;	/* Byte offset to a system ACL. Only valid, if
+			   SE_SACL_PRESENT is set in the control field. If
+			   SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
+			   is specified. */
+	le32 dacl;	/* Byte offset to a discretionary ACL. Only valid, if
+			   SE_DACL_PRESENT is set in the control field. If
+			   SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
+			   (unconditionally granting access) is specified. */
+/* sizeof() = 0x14 bytes */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
+
+/*
+ * Absolute security descriptor. Does not contain the owner and group SIDs, nor
+ * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
+ * pointers to these structures in memory. Obviously, absolute security
+ * descriptors are only useful for in memory representations of security
+ * descriptors. On disk, a self-relative security descriptor is used.
+ */
+typedef struct {
+	u8 revision;	/* Revision level of the security descriptor. */
+	u8 alignment;
+	SECURITY_DESCRIPTOR_CONTROL control;	/* Flags qualifying the type of
+			   the descriptor as well as the following fields. */
+	SID *owner;	/* Points to a SID representing an object's owner. If
+			   this is NULL, no owner SID is present in the
+			   descriptor. */
+	SID *group;	/* Points to a SID representing an object's primary
+			   group. If this is NULL, no primary group SID is
+			   present in the descriptor. */
+	ACL *sacl;	/* Points to a system ACL. Only valid, if
+			   SE_SACL_PRESENT is set in the control field. If
+			   SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
+			   is specified. */
+	ACL *dacl;	/* Points to a discretionary ACL. Only valid, if
+			   SE_DACL_PRESENT is set in the control field. If
+			   SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
+			   (unconditionally granting access) is specified. */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR;
+
+/*
+ * Current constants for security descriptors.
+ */
+typedef enum {
+	/* Current revision. */
+	SECURITY_DESCRIPTOR_REVISION	= 1,
+	SECURITY_DESCRIPTOR_REVISION1	= 1,
+
+	/* The sizes of both the absolute and relative security descriptors is
+	   the same as pointers, at least on ia32 architecture are 32-bit. */
+	SECURITY_DESCRIPTOR_MIN_LENGTH	= sizeof(SECURITY_DESCRIPTOR),
+} SECURITY_DESCRIPTOR_CONSTANTS;
+
+/*
+ * Attribute: Security descriptor (0x50). A standard self-relative security
+ * descriptor.
+ *
+ * NOTE: Can be resident or non-resident.
+ * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
+ * in FILE_Secure and the correct descriptor is found using the security_id
+ * from the standard information attribute.
+ */
+typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR;
+
+/*
+ * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
+ * referenced instance of each unique security descriptor is stored.
+ *
+ * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
+ * does, however, contain two indexes ($SDH and $SII) as well as a named data
+ * stream ($SDS).
+ *
+ * Every unique security descriptor is assigned a unique security identifier
+ * (security_id, not to be confused with a SID). The security_id is unique for
+ * the NTFS volume and is used as an index into the $SII index, which maps
+ * security_ids to the security descriptor's storage location within the $SDS
+ * data attribute. The $SII index is sorted by ascending security_id.
+ *
+ * A simple hash is computed from each security descriptor. This hash is used
+ * as an index into the $SDH index, which maps security descriptor hashes to
+ * the security descriptor's storage location within the $SDS data attribute.
+ * The $SDH index is sorted by security descriptor hash and is stored in a B+
+ * tree. When searching $SDH (with the intent of determining whether or not a
+ * new security descriptor is already present in the $SDS data stream), if a
+ * matching hash is found, but the security descriptors do not match, the
+ * search in the $SDH index is continued, searching for a next matching hash.
+ *
+ * When a precise match is found, the security_id coresponding to the security
+ * descriptor in the $SDS attribute is read from the found $SDH index entry and
+ * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
+ * which the security descriptor is being applied. The $STANDARD_INFORMATION
+ * attribute is present in all base mft records (i.e. in all files and
+ * directories).
+ *
+ * If a match is not found, the security descriptor is assigned a new unique
+ * security_id and is added to the $SDS data attribute. Then, entries
+ * referencing the this security descriptor in the $SDS data attribute are
+ * added to the $SDH and $SII indexes.
+ *
+ * Note: Entries are never deleted from FILE_Secure, even if nothing
+ * references an entry any more.
+ */
+
+/*
+ * This header precedes each security descriptor in the $SDS data stream.
+ * This is also the index entry data part of both the $SII and $SDH indexes.
+ */
+typedef struct {
+	le32 hash;	  /* Hash of the security descriptor. */
+	le32 security_id; /* The security_id assigned to the descriptor. */
+	le64 offset;	  /* Byte offset of this entry in the $SDS stream. */
+	le32 length;	  /* Size in bytes of this entry in $SDS stream. */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER;
+
+/*
+ * The $SDS data stream contains the security descriptors, aligned on 16-byte
+ * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
+ * cross 256kib boundaries (this restriction is imposed by the Windows cache
+ * manager). Each security descriptor is contained in a SDS_ENTRY structure.
+ * Also, each security descriptor is stored twice in the $SDS stream with a
+ * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
+ * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
+ * the first copy of the security descriptor will be at offset 0x51d0 in the
+ * $SDS data stream and the second copy will be at offset 0x451d0.
+ */
+typedef struct {
+/*Ofs*/
+/*  0	SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
+				       unnamed structs. */
+	le32 hash;	  /* Hash of the security descriptor. */
+	le32 security_id; /* The security_id assigned to the descriptor. */
+	le64 offset;	  /* Byte offset of this entry in the $SDS stream. */
+	le32 length;	  /* Size in bytes of this entry in $SDS stream. */
+/* 20*/	SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
+					     descriptor. */
+} __attribute__ ((__packed__)) SDS_ENTRY;
+
+/*
+ * The index entry key used in the $SII index. The collation type is
+ * COLLATION_NTOFS_ULONG.
+ */
+typedef struct {
+	le32 security_id; /* The security_id assigned to the descriptor. */
+} __attribute__ ((__packed__)) SII_INDEX_KEY;
+
+/*
+ * The index entry key used in the $SDH index. The keys are sorted first by
+ * hash and then by security_id. The collation rule is
+ * COLLATION_NTOFS_SECURITY_HASH.
+ */
+typedef struct {
+	le32 hash;	  /* Hash of the security descriptor. */
+	le32 security_id; /* The security_id assigned to the descriptor. */
+} __attribute__ ((__packed__)) SDH_INDEX_KEY;
+
+/*
+ * Attribute: Volume name (0x60).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present only in FILE_Volume.
+ */
+typedef struct {
+	ntfschar name[0];	/* The name of the volume in Unicode. */
+} __attribute__ ((__packed__)) VOLUME_NAME;
+
+/*
+ * Possible flags for the volume (16-bit).
+ */
+enum {
+	VOLUME_IS_DIRTY			= cpu_to_le16(0x0001),
+	VOLUME_RESIZE_LOG_FILE		= cpu_to_le16(0x0002),
+	VOLUME_UPGRADE_ON_MOUNT		= cpu_to_le16(0x0004),
+	VOLUME_MOUNTED_ON_NT4		= cpu_to_le16(0x0008),
+
+	VOLUME_DELETE_USN_UNDERWAY	= cpu_to_le16(0x0010),
+	VOLUME_REPAIR_OBJECT_ID		= cpu_to_le16(0x0020),
+
+	VOLUME_CHKDSK_UNDERWAY		= cpu_to_le16(0x4000),
+	VOLUME_MODIFIED_BY_CHKDSK	= cpu_to_le16(0x8000),
+
+	VOLUME_FLAGS_MASK		= cpu_to_le16(0xc03f),
+
+	/* To make our life easier when checking if we must mount read-only. */
+	VOLUME_MUST_MOUNT_RO_MASK	= cpu_to_le16(0xc027),
+} __attribute__ ((__packed__));
+
+typedef le16 VOLUME_FLAGS;
+
+/*
+ * Attribute: Volume information (0x70).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present only in FILE_Volume.
+ * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses
+ *	 NTFS 1.2. I haven't personally seen other values yet.
+ */
+typedef struct {
+	le64 reserved;		/* Not used (yet?). */
+	u8 major_ver;		/* Major version of the ntfs format. */
+	u8 minor_ver;		/* Minor version of the ntfs format. */
+	VOLUME_FLAGS flags;	/* Bit array of VOLUME_* flags. */
+} __attribute__ ((__packed__)) VOLUME_INFORMATION;
+
+/*
+ * Attribute: Data attribute (0x80).
+ *
+ * NOTE: Can be resident or non-resident.
+ *
+ * Data contents of a file (i.e. the unnamed stream) or of a named stream.
+ */
+typedef struct {
+	u8 data[0];		/* The file's data contents. */
+} __attribute__ ((__packed__)) DATA_ATTR;
+
+/*
+ * Index header flags (8-bit).
+ */
+enum {
+	/*
+	 * When index header is in an index root attribute:
+	 */
+	SMALL_INDEX = 0, /* The index is small enough to fit inside the index
+			    root attribute and there is no index allocation
+			    attribute present. */
+	LARGE_INDEX = 1, /* The index is too large to fit in the index root
+			    attribute and/or an index allocation attribute is
+			    present. */
+	/*
+	 * When index header is in an index block, i.e. is part of index
+	 * allocation attribute:
+	 */
+	LEAF_NODE  = 0, /* This is a leaf node, i.e. there are no more nodes
+			   branching off it. */
+	INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf
+			   node. */
+	NODE_MASK  = 1, /* Mask for accessing the *_NODE bits. */
+} __attribute__ ((__packed__));
+
+typedef u8 INDEX_HEADER_FLAGS;
+
+/*
+ * This is the header for indexes, describing the INDEX_ENTRY records, which
+ * follow the INDEX_HEADER. Together the index header and the index entries
+ * make up a complete index.
+ *
+ * IMPORTANT NOTE: The offset, length and size structure members are counted
+ * relative to the start of the index header structure and not relative to the
+ * start of the index root or index allocation structures themselves.
+ */
+typedef struct {
+	le32 entries_offset;		/* Byte offset to first INDEX_ENTRY
+					   aligned to 8-byte boundary. */
+	le32 index_length;		/* Data size of the index in bytes,
+					   i.e. bytes used from allocated
+					   size, aligned to 8-byte boundary. */
+	le32 allocated_size;		/* Byte size of this index (block),
+					   multiple of 8 bytes. */
+	/* NOTE: For the index root attribute, the above two numbers are always
+	   equal, as the attribute is resident and it is resized as needed. In
+	   the case of the index allocation attribute the attribute is not
+	   resident and hence the allocated_size is a fixed value and must
+	   equal the index_block_size specified by the INDEX_ROOT attribute
+	   corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK
+	   belongs to. */
+	INDEX_HEADER_FLAGS flags;	/* Bit field of INDEX_HEADER_FLAGS. */
+	u8 reserved[3];			/* Reserved/align to 8-byte boundary. */
+} __attribute__ ((__packed__)) INDEX_HEADER;
+
+/*
+ * Attribute: Index root (0x90).
+ *
+ * NOTE: Always resident.
+ *
+ * This is followed by a sequence of index entries (INDEX_ENTRY structures)
+ * as described by the index header.
+ *
+ * When a directory is small enough to fit inside the index root then this
+ * is the only attribute describing the directory. When the directory is too
+ * large to fit in the index root, on the other hand, two additional attributes
+ * are present: an index allocation attribute, containing sub-nodes of the B+
+ * directory tree (see below), and a bitmap attribute, describing which virtual
+ * cluster numbers (vcns) in the index allocation attribute are in use by an
+ * index block.
+ *
+ * NOTE: The root directory (FILE_root) contains an entry for itself. Other
+ * directories do not contain entries for themselves, though.
+ */
+typedef struct {
+	ATTR_TYPE type;			/* Type of the indexed attribute. Is
+					   $FILE_NAME for directories, zero
+					   for view indexes. No other values
+					   allowed. */
+	COLLATION_RULE collation_rule;	/* Collation rule used to sort the
+					   index entries. If type is $FILE_NAME,
+					   this must be COLLATION_FILE_NAME. */
+	le32 index_block_size;		/* Size of each index block in bytes (in
+					   the index allocation attribute). */
+	u8 clusters_per_index_block;	/* Cluster size of each index block (in
+					   the index allocation attribute), when
+					   an index block is >= than a cluster,
+					   otherwise this will be the log of
+					   the size (like how the encoding of
+					   the mft record size and the index
+					   record size found in the boot sector
+					   work). Has to be a power of 2. */
+	u8 reserved[3];			/* Reserved/align to 8-byte boundary. */
+	INDEX_HEADER index;		/* Index header describing the
+					   following index entries. */
+} __attribute__ ((__packed__)) INDEX_ROOT;
+
+/*
+ * Attribute: Index allocation (0xa0).
+ *
+ * NOTE: Always non-resident (doesn't make sense to be resident anyway!).
+ *
+ * This is an array of index blocks. Each index block starts with an
+ * INDEX_BLOCK structure containing an index header, followed by a sequence of
+ * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER.
+ */
+typedef struct {
+/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+	NTFS_RECORD_TYPE magic;	/* Magic is "INDX". */
+	le16 usa_ofs;		/* See NTFS_RECORD definition. */
+	le16 usa_count;		/* See NTFS_RECORD definition. */
+
+/*  8*/	sle64 lsn;		/* $LogFile sequence number of the last
+				   modification of this index block. */
+/* 16*/	leVCN index_block_vcn;	/* Virtual cluster number of the index block.
+				   If the cluster_size on the volume is <= the
+				   index_block_size of the directory,
+				   index_block_vcn counts in units of clusters,
+				   and in units of sectors otherwise. */
+/* 24*/	INDEX_HEADER index;	/* Describes the following index entries. */
+/* sizeof()= 40 (0x28) bytes */
+/*
+ * When creating the index block, we place the update sequence array at this
+ * offset, i.e. before we start with the index entries. This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work. As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) INDEX_BLOCK;
+
+typedef INDEX_BLOCK INDEX_ALLOCATION;
+
+/*
+ * The system file FILE_Extend/$Reparse contains an index named $R listing
+ * all reparse points on the volume. The index entry keys are as defined
+ * below. Note, that there is no index data associated with the index entries.
+ *
+ * The index entries are sorted by the index key file_id. The collation rule is
+ * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the
+ * primary key / is not a key at all. (AIA)
+ */
+typedef struct {
+	le32 reparse_tag;	/* Reparse point type (inc. flags). */
+	leMFT_REF file_id;	/* Mft record of the file containing the
+				   reparse point attribute. */
+} __attribute__ ((__packed__)) REPARSE_INDEX_KEY;
+
+/*
+ * Quota flags (32-bit).
+ *
+ * The user quota flags.  Names explain meaning.
+ */
+enum {
+	QUOTA_FLAG_DEFAULT_LIMITS	= cpu_to_le32(0x00000001),
+	QUOTA_FLAG_LIMIT_REACHED	= cpu_to_le32(0x00000002),
+	QUOTA_FLAG_ID_DELETED		= cpu_to_le32(0x00000004),
+
+	QUOTA_FLAG_USER_MASK		= cpu_to_le32(0x00000007),
+	/* This is a bit mask for the user quota flags. */
+
+	/*
+	 * These flags are only present in the quota defaults index entry, i.e.
+	 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
+	 */
+	QUOTA_FLAG_TRACKING_ENABLED	= cpu_to_le32(0x00000010),
+	QUOTA_FLAG_ENFORCEMENT_ENABLED	= cpu_to_le32(0x00000020),
+	QUOTA_FLAG_TRACKING_REQUESTED	= cpu_to_le32(0x00000040),
+	QUOTA_FLAG_LOG_THRESHOLD	= cpu_to_le32(0x00000080),
+
+	QUOTA_FLAG_LOG_LIMIT		= cpu_to_le32(0x00000100),
+	QUOTA_FLAG_OUT_OF_DATE		= cpu_to_le32(0x00000200),
+	QUOTA_FLAG_CORRUPT		= cpu_to_le32(0x00000400),
+	QUOTA_FLAG_PENDING_DELETES	= cpu_to_le32(0x00000800),
+};
+
+typedef le32 QUOTA_FLAGS;
+
+/*
+ * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas
+ * are on a per volume and per user basis.
+ *
+ * The $Q index contains one entry for each existing user_id on the volume. The
+ * index key is the user_id of the user/group owning this quota control entry,
+ * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the
+ * owner_id, is found in the standard information attribute. The collation rule
+ * for $Q is COLLATION_NTOFS_ULONG.
+ *
+ * The $O index contains one entry for each user/group who has been assigned
+ * a quota on that volume. The index key holds the SID of the user_id the
+ * entry belongs to, i.e. the owner_id. The collation rule for $O is
+ * COLLATION_NTOFS_SID.
+ *
+ * The $O index entry data is the user_id of the user corresponding to the SID.
+ * This user_id is used as an index into $Q to find the quota control entry
+ * associated with the SID.
+ *
+ * The $Q index entry data is the quota control entry and is defined below.
+ */
+typedef struct {
+	le32 version;		/* Currently equals 2. */
+	QUOTA_FLAGS flags;	/* Flags describing this quota entry. */
+	le64 bytes_used;	/* How many bytes of the quota are in use. */
+	sle64 change_time;	/* Last time this quota entry was changed. */
+	sle64 threshold;	/* Soft quota (-1 if not limited). */
+	sle64 limit;		/* Hard quota (-1 if not limited). */
+	sle64 exceeded_time;	/* How long the soft quota has been exceeded. */
+	SID sid;		/* The SID of the user/object associated with
+				   this quota entry.  Equals zero for the quota
+				   defaults entry (and in fact on a WinXP
+				   volume, it is not present at all). */
+} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
+
+/*
+ * Predefined owner_id values (32-bit).
+ */
+enum {
+	QUOTA_INVALID_ID	= cpu_to_le32(0x00000000),
+	QUOTA_DEFAULTS_ID	= cpu_to_le32(0x00000001),
+	QUOTA_FIRST_USER_ID	= cpu_to_le32(0x00000100),
+};
+
+/*
+ * Current constants for quota control entries.
+ */
+typedef enum {
+	/* Current version. */
+	QUOTA_VERSION	= 2,
+} QUOTA_CONTROL_ENTRY_CONSTANTS;
+
+/*
+ * Index entry flags (16-bit).
+ */
+enum {
+	INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
+			sub-node, i.e. a reference to an index block in form of
+			a virtual cluster number (see below). */
+	INDEX_ENTRY_END  = cpu_to_le16(2), /* This signifies the last
+			entry in an index block.  The index entry does not
+			represent a file but it can point to a sub-node. */
+
+	INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
+			enum bit width to 16-bit. */
+} __attribute__ ((__packed__));
+
+typedef le16 INDEX_ENTRY_FLAGS;
+
+/*
+ * This the index entry header (see below).
+ */
+typedef struct {
+/*  0*/	union {
+		struct { /* Only valid when INDEX_ENTRY_END is not set. */
+			leMFT_REF indexed_file;	/* The mft reference of the file
+						   described by this index
+						   entry. Used for directory
+						   indexes. */
+		} __attribute__ ((__packed__)) dir;
+		struct { /* Used for views/indexes to find the entry's data. */
+			le16 data_offset;	/* Data byte offset from this
+						   INDEX_ENTRY. Follows the
+						   index key. */
+			le16 data_length;	/* Data length in bytes. */
+			le32 reservedV;		/* Reserved (zero). */
+		} __attribute__ ((__packed__)) vi;
+	} __attribute__ ((__packed__)) data;
+/*  8*/	le16 length;		 /* Byte size of this index entry, multiple of
+				    8-bytes. */
+/* 10*/	le16 key_length;	 /* Byte size of the key value, which is in the
+				    index entry. It follows field reserved. Not
+				    multiple of 8-bytes. */
+/* 12*/	INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
+/* 14*/	le16 reserved;		 /* Reserved/align to 8-byte boundary. */
+/* sizeof() = 16 bytes */
+} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
+
+/*
+ * This is an index entry. A sequence of such entries follows each INDEX_HEADER
+ * structure. Together they make up a complete index. The index follows either
+ * an index root attribute or an index allocation attribute.
+ *
+ * NOTE: Before NTFS 3.0 only filename attributes were indexed.
+ */
+typedef struct {
+/*Ofs*/
+/*  0	INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
+	union {
+		struct { /* Only valid when INDEX_ENTRY_END is not set. */
+			leMFT_REF indexed_file;	/* The mft reference of the file
+						   described by this index
+						   entry. Used for directory
+						   indexes. */
+		} __attribute__ ((__packed__)) dir;
+		struct { /* Used for views/indexes to find the entry's data. */
+			le16 data_offset;	/* Data byte offset from this
+						   INDEX_ENTRY. Follows the
+						   index key. */
+			le16 data_length;	/* Data length in bytes. */
+			le32 reservedV;		/* Reserved (zero). */
+		} __attribute__ ((__packed__)) vi;
+	} __attribute__ ((__packed__)) data;
+	le16 length;		 /* Byte size of this index entry, multiple of
+				    8-bytes. */
+	le16 key_length;	 /* Byte size of the key value, which is in the
+				    index entry. It follows field reserved. Not
+				    multiple of 8-bytes. */
+	INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
+	le16 reserved;		 /* Reserved/align to 8-byte boundary. */
+
+/* 16*/	union {		/* The key of the indexed attribute. NOTE: Only present
+			   if INDEX_ENTRY_END bit in flags is not set. NOTE: On
+			   NTFS versions before 3.0 the only valid key is the
+			   FILE_NAME_ATTR. On NTFS 3.0+ the following
+			   additional index keys are defined: */
+		FILE_NAME_ATTR file_name;/* $I30 index in directories. */
+		SII_INDEX_KEY sii;	/* $SII index in $Secure. */
+		SDH_INDEX_KEY sdh;	/* $SDH index in $Secure. */
+		GUID object_id;		/* $O index in FILE_Extend/$ObjId: The
+					   object_id of the mft record found in
+					   the data part of the index. */
+		REPARSE_INDEX_KEY reparse;	/* $R index in
+						   FILE_Extend/$Reparse. */
+		SID sid;		/* $O index in FILE_Extend/$Quota:
+					   SID of the owner of the user_id. */
+		le32 owner_id;		/* $Q index in FILE_Extend/$Quota:
+					   user_id of the owner of the quota
+					   control entry in the data part of
+					   the index. */
+	} __attribute__ ((__packed__)) key;
+	/* The (optional) index data is inserted here when creating. */
+	// leVCN vcn;	/* If INDEX_ENTRY_NODE bit in flags is set, the last
+	//		   eight bytes of this index entry contain the virtual
+	//		   cluster number of the index block that holds the
+	//		   entries immediately preceding the current entry (the
+	//		   vcn references the corresponding cluster in the data
+	//		   of the non-resident index allocation attribute). If
+	//		   the key_length is zero, then the vcn immediately
+	//		   follows the INDEX_ENTRY_HEADER. Regardless of
+	//		   key_length, the address of the 8-byte boundary
+	//		   aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
+	//		   (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
+	//		   where sizeof(VCN) can be hardcoded as 8 if wanted. */
+} __attribute__ ((__packed__)) INDEX_ENTRY;
+
+/*
+ * Attribute: Bitmap (0xb0).
+ *
+ * Contains an array of bits (aka a bitfield).
+ *
+ * When used in conjunction with the index allocation attribute, each bit
+ * corresponds to one index block within the index allocation attribute. Thus
+ * the number of bits in the bitmap * index block size / cluster size is the
+ * number of clusters in the index allocation attribute.
+ */
+typedef struct {
+	u8 bitmap[0];			/* Array of bits. */
+} __attribute__ ((__packed__)) BITMAP_ATTR;
+
+/*
+ * The reparse point tag defines the type of the reparse point. It also
+ * includes several flags, which further describe the reparse point.
+ *
+ * The reparse point tag is an unsigned 32-bit value divided in three parts:
+ *
+ * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of
+ *    the reparse point.
+ * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
+ * 3. The most significant three bits are flags describing the reparse point.
+ *    They are defined as follows:
+ *	bit 29: Name surrogate bit. If set, the filename is an alias for
+ *		another object in the system.
+ *	bit 30: High-latency bit. If set, accessing the first byte of data will
+ *		be slow. (E.g. the data is stored on a tape drive.)
+ *	bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
+ *		defined tags have to use zero here.
+ *
+ * These are the predefined reparse point tags:
+ */
+enum {
+	IO_REPARSE_TAG_IS_ALIAS		= cpu_to_le32(0x20000000),
+	IO_REPARSE_TAG_IS_HIGH_LATENCY	= cpu_to_le32(0x40000000),
+	IO_REPARSE_TAG_IS_MICROSOFT	= cpu_to_le32(0x80000000),
+
+	IO_REPARSE_TAG_RESERVED_ZERO	= cpu_to_le32(0x00000000),
+	IO_REPARSE_TAG_RESERVED_ONE	= cpu_to_le32(0x00000001),
+	IO_REPARSE_TAG_RESERVED_RANGE	= cpu_to_le32(0x00000001),
+
+	IO_REPARSE_TAG_NSS		= cpu_to_le32(0x68000005),
+	IO_REPARSE_TAG_NSS_RECOVER	= cpu_to_le32(0x68000006),
+	IO_REPARSE_TAG_SIS		= cpu_to_le32(0x68000007),
+	IO_REPARSE_TAG_DFS		= cpu_to_le32(0x68000008),
+
+	IO_REPARSE_TAG_MOUNT_POINT	= cpu_to_le32(0x88000003),
+
+	IO_REPARSE_TAG_HSM		= cpu_to_le32(0xa8000004),
+
+	IO_REPARSE_TAG_SYMBOLIC_LINK	= cpu_to_le32(0xe8000000),
+
+	IO_REPARSE_TAG_VALID_VALUES	= cpu_to_le32(0xe000ffff),
+};
+
+/*
+ * Attribute: Reparse point (0xc0).
+ *
+ * NOTE: Can be resident or non-resident.
+ */
+typedef struct {
+	le32 reparse_tag;		/* Reparse point type (inc. flags). */
+	le16 reparse_data_length;	/* Byte size of reparse data. */
+	le16 reserved;			/* Align to 8-byte boundary. */
+	u8 reparse_data[0];		/* Meaning depends on reparse_tag. */
+} __attribute__ ((__packed__)) REPARSE_POINT;
+
+/*
+ * Attribute: Extended attribute (EA) information (0xd0).
+ *
+ * NOTE: Always resident. (Is this true???)
+ */
+typedef struct {
+	le16 ea_length;		/* Byte size of the packed extended
+				   attributes. */
+	le16 need_ea_count;	/* The number of extended attributes which have
+				   the NEED_EA bit set. */
+	le32 ea_query_length;	/* Byte size of the buffer required to query
+				   the extended attributes when calling
+				   ZwQueryEaFile() in Windows NT/2k. I.e. the
+				   byte size of the unpacked extended
+				   attributes. */
+} __attribute__ ((__packed__)) EA_INFORMATION;
+
+/*
+ * Extended attribute flags (8-bit).
+ */
+enum {
+	NEED_EA	= 0x80		/* If set the file to which the EA belongs
+				   cannot be interpreted without understanding
+				   the associates extended attributes. */
+} __attribute__ ((__packed__));
+
+typedef u8 EA_FLAGS;
+
+/*
+ * Attribute: Extended attribute (EA) (0xe0).
+ *
+ * NOTE: Can be resident or non-resident.
+ *
+ * Like the attribute list and the index buffer list, the EA attribute value is
+ * a sequence of EA_ATTR variable length records.
+ */
+typedef struct {
+	le32 next_entry_offset;	/* Offset to the next EA_ATTR. */
+	EA_FLAGS flags;		/* Flags describing the EA. */
+	u8 ea_name_length;	/* Length of the name of the EA in bytes
+				   excluding the '\0' byte terminator. */
+	le16 ea_value_length;	/* Byte size of the EA's value. */
+	u8 ea_name[0];		/* Name of the EA.  Note this is ASCII, not
+				   Unicode and it is zero terminated. */
+	u8 ea_value[0];		/* The value of the EA.  Immediately follows
+				   the name. */
+} __attribute__ ((__packed__)) EA_ATTR;
+
+/*
+ * Attribute: Property set (0xf0).
+ *
+ * Intended to support Native Structure Storage (NSS) - a feature removed from
+ * NTFS 3.0 during beta testing.
+ */
+typedef struct {
+	/* Irrelevant as feature unused. */
+} __attribute__ ((__packed__)) PROPERTY_SET;
+
+/*
+ * Attribute: Logged utility stream (0x100).
+ *
+ * NOTE: Can be resident or non-resident.
+ *
+ * Operations on this attribute are logged to the journal ($LogFile) like
+ * normal metadata changes.
+ *
+ * Used by the Encrypting File System (EFS). All encrypted files have this
+ * attribute with the name $EFS.
+ */
+typedef struct {
+	/* Can be anything the creator chooses. */
+	/* EFS uses it as follows: */
+	// FIXME: Type this info, verifying it along the way. (AIA)
+} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR;
+
+#endif /* _LINUX_NTFS_LAYOUT_H */
diff --git a/kernel/fs/ntfs/lcnalloc.c b/kernel/fs/ntfs/lcnalloc.c
new file mode 100644
index 000000000..1711b710b
--- /dev/null
+++ b/kernel/fs/ntfs/lcnalloc.c
@@ -0,0 +1,1014 @@
+/*
+ * lcnalloc.c - Cluster (de)allocation code.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/pagemap.h>
+
+#include "lcnalloc.h"
+#include "debug.h"
+#include "bitmap.h"
+#include "inode.h"
+#include "volume.h"
+#include "attrib.h"
+#include "malloc.h"
+#include "aops.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
+ * @vol:	mounted ntfs volume on which to free the clusters
+ * @rl:		runlist describing the clusters to free
+ *
+ * Free all the clusters described by the runlist @rl on the volume @vol.  In
+ * the case of an error being returned, at least some of the clusters were not
+ * freed.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - The volume lcn bitmap must be locked for writing on entry and is
+ *	      left locked on return.
+ */
+int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
+		const runlist_element *rl)
+{
+	struct inode *lcnbmp_vi = vol->lcnbmp_ino;
+	int ret = 0;
+
+	ntfs_debug("Entering.");
+	if (!rl)
+		return 0;
+	for (; rl->length; rl++) {
+		int err;
+
+		if (rl->lcn < 0)
+			continue;
+		err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
+		if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
+			ret = err;
+	}
+	ntfs_debug("Done.");
+	return ret;
+}
+
+/**
+ * ntfs_cluster_alloc - allocate clusters on an ntfs volume
+ * @vol:	mounted ntfs volume on which to allocate the clusters
+ * @start_vcn:	vcn to use for the first allocated cluster
+ * @count:	number of clusters to allocate
+ * @start_lcn:	starting lcn at which to allocate the clusters (or -1 if none)
+ * @zone:	zone from which to allocate the clusters
+ * @is_extension:	if 'true', this is an attribute extension
+ *
+ * Allocate @count clusters preferably starting at cluster @start_lcn or at the
+ * current allocator position if @start_lcn is -1, on the mounted ntfs volume
+ * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
+ * MFT_ZONE for allocation of clusters for the master file table, i.e. the
+ * $MFT/$DATA attribute.
+ *
+ * @start_vcn specifies the vcn of the first allocated cluster.  This makes
+ * merging the resulting runlist with the old runlist easier.
+ *
+ * If @is_extension is 'true', the caller is allocating clusters to extend an
+ * attribute and if it is 'false', the caller is allocating clusters to fill a
+ * hole in an attribute.  Practically the difference is that if @is_extension
+ * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
+ * @is_extension is 'false' the runlist will be terminated with
+ * LCN_RL_NOT_MAPPED.
+ *
+ * You need to check the return value with IS_ERR().  If this is false, the
+ * function was successful and the return value is a runlist describing the
+ * allocated cluster(s).  If IS_ERR() is true, the function failed and
+ * PTR_ERR() gives you the error code.
+ *
+ * Notes on the allocation algorithm
+ * =================================
+ *
+ * There are two data zones.  First is the area between the end of the mft zone
+ * and the end of the volume, and second is the area between the start of the
+ * volume and the start of the mft zone.  On unmodified/standard NTFS 1.x
+ * volumes, the second data zone does not exist due to the mft zone being
+ * expanded to cover the start of the volume in order to reserve space for the
+ * mft bitmap attribute.
+ *
+ * This is not the prettiest function but the complexity stems from the need of
+ * implementing the mft vs data zoned approach and from the fact that we have
+ * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
+ * need to cope with crossing over boundaries of two buffers.  Further, the
+ * fact that the allocator allows for caller supplied hints as to the location
+ * of where allocation should begin and the fact that the allocator keeps track
+ * of where in the data zones the next natural allocation should occur,
+ * contribute to the complexity of the function.  But it should all be
+ * worthwhile, because this allocator should: 1) be a full implementation of
+ * the MFT zone approach used by Windows NT, 2) cause reduction in
+ * fragmentation, and 3) be speedy in allocations (the code is not optimized
+ * for speed, but the algorithm is, so further speed improvements are probably
+ * possible).
+ *
+ * FIXME: We should be monitoring cluster allocation and increment the MFT zone
+ * size dynamically but this is something for the future.  We will just cause
+ * heavier fragmentation by not doing it and I am not even sure Windows would
+ * grow the MFT zone dynamically, so it might even be correct not to do this.
+ * The overhead in doing dynamic MFT zone expansion would be very large and
+ * unlikely worth the effort. (AIA)
+ *
+ * TODO: I have added in double the required zone position pointer wrap around
+ * logic which can be optimized to having only one of the two logic sets.
+ * However, having the double logic will work fine, but if we have only one of
+ * the sets and we get it wrong somewhere, then we get into trouble, so
+ * removing the duplicate logic requires _very_ careful consideration of _all_
+ * possible code paths.  So at least for now, I am leaving the double logic -
+ * better safe than sorry... (AIA)
+ *
+ * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
+ *	      on return.
+ *	    - This function takes the volume lcn bitmap lock for writing and
+ *	      modifies the bitmap contents.
+ */
+runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
+		const s64 count, const LCN start_lcn,
+		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
+		const bool is_extension)
+{
+	LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
+	LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
+	s64 clusters;
+	loff_t i_size;
+	struct inode *lcnbmp_vi;
+	runlist_element *rl = NULL;
+	struct address_space *mapping;
+	struct page *page = NULL;
+	u8 *buf, *byte;
+	int err = 0, rlpos, rlsize, buf_size;
+	u8 pass, done_zones, search_zone, need_writeback = 0, bit;
+
+	ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
+			"0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
+			(unsigned long long)count,
+			(unsigned long long)start_lcn,
+			zone == MFT_ZONE ? "MFT" : "DATA");
+	BUG_ON(!vol);
+	lcnbmp_vi = vol->lcnbmp_ino;
+	BUG_ON(!lcnbmp_vi);
+	BUG_ON(start_vcn < 0);
+	BUG_ON(count < 0);
+	BUG_ON(start_lcn < -1);
+	BUG_ON(zone < FIRST_ZONE);
+	BUG_ON(zone > LAST_ZONE);
+
+	/* Return NULL if @count is zero. */
+	if (!count)
+		return NULL;
+	/* Take the lcnbmp lock for writing. */
+	down_write(&vol->lcnbmp_lock);
+	/*
+	 * If no specific @start_lcn was requested, use the current data zone
+	 * position, otherwise use the requested @start_lcn but make sure it
+	 * lies outside the mft zone.  Also set done_zones to 0 (no zones done)
+	 * and pass depending on whether we are starting inside a zone (1) or
+	 * at the beginning of a zone (2).  If requesting from the MFT_ZONE,
+	 * we either start at the current position within the mft zone or at
+	 * the specified position.  If the latter is out of bounds then we start
+	 * at the beginning of the MFT_ZONE.
+	 */
+	done_zones = 0;
+	pass = 1;
+	/*
+	 * zone_start and zone_end are the current search range.  search_zone
+	 * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
+	 * volume) and 4 for data zone 2 (start of volume till start of mft
+	 * zone).
+	 */
+	zone_start = start_lcn;
+	if (zone_start < 0) {
+		if (zone == DATA_ZONE)
+			zone_start = vol->data1_zone_pos;
+		else
+			zone_start = vol->mft_zone_pos;
+		if (!zone_start) {
+			/*
+			 * Zone starts at beginning of volume which means a
+			 * single pass is sufficient.
+			 */
+			pass = 2;
+		}
+	} else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
+			zone_start < vol->mft_zone_end) {
+		zone_start = vol->mft_zone_end;
+		/*
+		 * Starting at beginning of data1_zone which means a single
+		 * pass in this zone is sufficient.
+		 */
+		pass = 2;
+	} else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
+			zone_start >= vol->mft_zone_end)) {
+		zone_start = vol->mft_lcn;
+		if (!vol->mft_zone_end)
+			zone_start = 0;
+		/*
+		 * Starting at beginning of volume which means a single pass
+		 * is sufficient.
+		 */
+		pass = 2;
+	}
+	if (zone == MFT_ZONE) {
+		zone_end = vol->mft_zone_end;
+		search_zone = 1;
+	} else /* if (zone == DATA_ZONE) */ {
+		/* Skip searching the mft zone. */
+		done_zones |= 1;
+		if (zone_start >= vol->mft_zone_end) {
+			zone_end = vol->nr_clusters;
+			search_zone = 2;
+		} else {
+			zone_end = vol->mft_zone_start;
+			search_zone = 4;
+		}
+	}
+	/*
+	 * bmp_pos is the current bit position inside the bitmap.  We use
+	 * bmp_initial_pos to determine whether or not to do a zone switch.
+	 */
+	bmp_pos = bmp_initial_pos = zone_start;
+
+	/* Loop until all clusters are allocated, i.e. clusters == 0. */
+	clusters = count;
+	rlpos = rlsize = 0;
+	mapping = lcnbmp_vi->i_mapping;
+	i_size = i_size_read(lcnbmp_vi);
+	while (1) {
+		ntfs_debug("Start of outer while loop: done_zones 0x%x, "
+				"search_zone %i, pass %i, zone_start 0x%llx, "
+				"zone_end 0x%llx, bmp_initial_pos 0x%llx, "
+				"bmp_pos 0x%llx, rlpos %i, rlsize %i.",
+				done_zones, search_zone, pass,
+				(unsigned long long)zone_start,
+				(unsigned long long)zone_end,
+				(unsigned long long)bmp_initial_pos,
+				(unsigned long long)bmp_pos, rlpos, rlsize);
+		/* Loop until we run out of free clusters. */
+		last_read_pos = bmp_pos >> 3;
+		ntfs_debug("last_read_pos 0x%llx.",
+				(unsigned long long)last_read_pos);
+		if (last_read_pos > i_size) {
+			ntfs_debug("End of attribute reached.  "
+					"Skipping to zone_pass_done.");
+			goto zone_pass_done;
+		}
+		if (likely(page)) {
+			if (need_writeback) {
+				ntfs_debug("Marking page dirty.");
+				flush_dcache_page(page);
+				set_page_dirty(page);
+				need_writeback = 0;
+			}
+			ntfs_unmap_page(page);
+		}
+		page = ntfs_map_page(mapping, last_read_pos >>
+				PAGE_CACHE_SHIFT);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			ntfs_error(vol->sb, "Failed to map page.");
+			goto out;
+		}
+		buf_size = last_read_pos & ~PAGE_CACHE_MASK;
+		buf = page_address(page) + buf_size;
+		buf_size = PAGE_CACHE_SIZE - buf_size;
+		if (unlikely(last_read_pos + buf_size > i_size))
+			buf_size = i_size - last_read_pos;
+		buf_size <<= 3;
+		lcn = bmp_pos & 7;
+		bmp_pos &= ~(LCN)7;
+		ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
+				"bmp_pos 0x%llx, need_writeback %i.", buf_size,
+				(unsigned long long)lcn,
+				(unsigned long long)bmp_pos, need_writeback);
+		while (lcn < buf_size && lcn + bmp_pos < zone_end) {
+			byte = buf + (lcn >> 3);
+			ntfs_debug("In inner while loop: buf_size %i, "
+					"lcn 0x%llx, bmp_pos 0x%llx, "
+					"need_writeback %i, byte ofs 0x%x, "
+					"*byte 0x%x.", buf_size,
+					(unsigned long long)lcn,
+					(unsigned long long)bmp_pos,
+					need_writeback,
+					(unsigned int)(lcn >> 3),
+					(unsigned int)*byte);
+			/* Skip full bytes. */
+			if (*byte == 0xff) {
+				lcn = (lcn + 8) & ~(LCN)7;
+				ntfs_debug("Continuing while loop 1.");
+				continue;
+			}
+			bit = 1 << (lcn & 7);
+			ntfs_debug("bit 0x%x.", bit);
+			/* If the bit is already set, go onto the next one. */
+			if (*byte & bit) {
+				lcn++;
+				ntfs_debug("Continuing while loop 2.");
+				continue;
+			}
+			/*
+			 * Allocate more memory if needed, including space for
+			 * the terminator element.
+			 * ntfs_malloc_nofs() operates on whole pages only.
+			 */
+			if ((rlpos + 2) * sizeof(*rl) > rlsize) {
+				runlist_element *rl2;
+
+				ntfs_debug("Reallocating memory.");
+				if (!rl)
+					ntfs_debug("First free bit is at LCN "
+							"0x%llx.",
+							(unsigned long long)
+							(lcn + bmp_pos));
+				rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
+				if (unlikely(!rl2)) {
+					err = -ENOMEM;
+					ntfs_error(vol->sb, "Failed to "
+							"allocate memory.");
+					goto out;
+				}
+				memcpy(rl2, rl, rlsize);
+				ntfs_free(rl);
+				rl = rl2;
+				rlsize += PAGE_SIZE;
+				ntfs_debug("Reallocated memory, rlsize 0x%x.",
+						rlsize);
+			}
+			/* Allocate the bitmap bit. */
+			*byte |= bit;
+			/* We need to write this bitmap page to disk. */
+			need_writeback = 1;
+			ntfs_debug("*byte 0x%x, need_writeback is set.",
+					(unsigned int)*byte);
+			/*
+			 * Coalesce with previous run if adjacent LCNs.
+			 * Otherwise, append a new run.
+			 */
+			ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
+					"prev_lcn 0x%llx, lcn 0x%llx, "
+					"bmp_pos 0x%llx, prev_run_len 0x%llx, "
+					"rlpos %i.",
+					(unsigned long long)(lcn + bmp_pos),
+					1ULL, (unsigned long long)prev_lcn,
+					(unsigned long long)lcn,
+					(unsigned long long)bmp_pos,
+					(unsigned long long)prev_run_len,
+					rlpos);
+			if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
+				ntfs_debug("Coalescing to run (lcn 0x%llx, "
+						"len 0x%llx).",
+						(unsigned long long)
+						rl[rlpos - 1].lcn,
+						(unsigned long long)
+						rl[rlpos - 1].length);
+				rl[rlpos - 1].length = ++prev_run_len;
+				ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
+						"prev_run_len 0x%llx.",
+						(unsigned long long)
+						rl[rlpos - 1].lcn,
+						(unsigned long long)
+						rl[rlpos - 1].length,
+						(unsigned long long)
+						prev_run_len);
+			} else {
+				if (likely(rlpos)) {
+					ntfs_debug("Adding new run, (previous "
+							"run lcn 0x%llx, "
+							"len 0x%llx).",
+							(unsigned long long)
+							rl[rlpos - 1].lcn,
+							(unsigned long long)
+							rl[rlpos - 1].length);
+					rl[rlpos].vcn = rl[rlpos - 1].vcn +
+							prev_run_len;
+				} else {
+					ntfs_debug("Adding new run, is first "
+							"run.");
+					rl[rlpos].vcn = start_vcn;
+				}
+				rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
+				rl[rlpos].length = prev_run_len = 1;
+				rlpos++;
+			}
+			/* Done? */
+			if (!--clusters) {
+				LCN tc;
+				/*
+				 * Update the current zone position.  Positions
+				 * of already scanned zones have been updated
+				 * during the respective zone switches.
+				 */
+				tc = lcn + bmp_pos + 1;
+				ntfs_debug("Done. Updating current zone "
+						"position, tc 0x%llx, "
+						"search_zone %i.",
+						(unsigned long long)tc,
+						search_zone);
+				switch (search_zone) {
+				case 1:
+					ntfs_debug("Before checks, "
+							"vol->mft_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->mft_zone_pos);
+					if (tc >= vol->mft_zone_end) {
+						vol->mft_zone_pos =
+								vol->mft_lcn;
+						if (!vol->mft_zone_end)
+							vol->mft_zone_pos = 0;
+					} else if ((bmp_initial_pos >=
+							vol->mft_zone_pos ||
+							tc > vol->mft_zone_pos)
+							&& tc >= vol->mft_lcn)
+						vol->mft_zone_pos = tc;
+					ntfs_debug("After checks, "
+							"vol->mft_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->mft_zone_pos);
+					break;
+				case 2:
+					ntfs_debug("Before checks, "
+							"vol->data1_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->data1_zone_pos);
+					if (tc >= vol->nr_clusters)
+						vol->data1_zone_pos =
+							     vol->mft_zone_end;
+					else if ((bmp_initial_pos >=
+						    vol->data1_zone_pos ||
+						    tc > vol->data1_zone_pos)
+						    && tc >= vol->mft_zone_end)
+						vol->data1_zone_pos = tc;
+					ntfs_debug("After checks, "
+							"vol->data1_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->data1_zone_pos);
+					break;
+				case 4:
+					ntfs_debug("Before checks, "
+							"vol->data2_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->data2_zone_pos);
+					if (tc >= vol->mft_zone_start)
+						vol->data2_zone_pos = 0;
+					else if (bmp_initial_pos >=
+						      vol->data2_zone_pos ||
+						      tc > vol->data2_zone_pos)
+						vol->data2_zone_pos = tc;
+					ntfs_debug("After checks, "
+							"vol->data2_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->data2_zone_pos);
+					break;
+				default:
+					BUG();
+				}
+				ntfs_debug("Finished.  Going to out.");
+				goto out;
+			}
+			lcn++;
+		}
+		bmp_pos += buf_size;
+		ntfs_debug("After inner while loop: buf_size 0x%x, lcn "
+				"0x%llx, bmp_pos 0x%llx, need_writeback %i.",
+				buf_size, (unsigned long long)lcn,
+				(unsigned long long)bmp_pos, need_writeback);
+		if (bmp_pos < zone_end) {
+			ntfs_debug("Continuing outer while loop, "
+					"bmp_pos 0x%llx, zone_end 0x%llx.",
+					(unsigned long long)bmp_pos,
+					(unsigned long long)zone_end);
+			continue;
+		}
+zone_pass_done:	/* Finished with the current zone pass. */
+		ntfs_debug("At zone_pass_done, pass %i.", pass);
+		if (pass == 1) {
+			/*
+			 * Now do pass 2, scanning the first part of the zone
+			 * we omitted in pass 1.
+			 */
+			pass = 2;
+			zone_end = zone_start;
+			switch (search_zone) {
+			case 1: /* mft_zone */
+				zone_start = vol->mft_zone_start;
+				break;
+			case 2: /* data1_zone */
+				zone_start = vol->mft_zone_end;
+				break;
+			case 4: /* data2_zone */
+				zone_start = 0;
+				break;
+			default:
+				BUG();
+			}
+			/* Sanity check. */
+			if (zone_end < zone_start)
+				zone_end = zone_start;
+			bmp_pos = zone_start;
+			ntfs_debug("Continuing outer while loop, pass 2, "
+					"zone_start 0x%llx, zone_end 0x%llx, "
+					"bmp_pos 0x%llx.",
+					(unsigned long long)zone_start,
+					(unsigned long long)zone_end,
+					(unsigned long long)bmp_pos);
+			continue;
+		} /* pass == 2 */
+done_zones_check:
+		ntfs_debug("At done_zones_check, search_zone %i, done_zones "
+				"before 0x%x, done_zones after 0x%x.",
+				search_zone, done_zones,
+				done_zones | search_zone);
+		done_zones |= search_zone;
+		if (done_zones < 7) {
+			ntfs_debug("Switching zone.");
+			/* Now switch to the next zone we haven't done yet. */
+			pass = 1;
+			switch (search_zone) {
+			case 1:
+				ntfs_debug("Switching from mft zone to data1 "
+						"zone.");
+				/* Update mft zone position. */
+				if (rlpos) {
+					LCN tc;
+
+					ntfs_debug("Before checks, "
+							"vol->mft_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->mft_zone_pos);
+					tc = rl[rlpos - 1].lcn +
+							rl[rlpos - 1].length;
+					if (tc >= vol->mft_zone_end) {
+						vol->mft_zone_pos =
+								vol->mft_lcn;
+						if (!vol->mft_zone_end)
+							vol->mft_zone_pos = 0;
+					} else if ((bmp_initial_pos >=
+							vol->mft_zone_pos ||
+							tc > vol->mft_zone_pos)
+							&& tc >= vol->mft_lcn)
+						vol->mft_zone_pos = tc;
+					ntfs_debug("After checks, "
+							"vol->mft_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->mft_zone_pos);
+				}
+				/* Switch from mft zone to data1 zone. */
+switch_to_data1_zone:		search_zone = 2;
+				zone_start = bmp_initial_pos =
+						vol->data1_zone_pos;
+				zone_end = vol->nr_clusters;
+				if (zone_start == vol->mft_zone_end)
+					pass = 2;
+				if (zone_start >= zone_end) {
+					vol->data1_zone_pos = zone_start =
+							vol->mft_zone_end;
+					pass = 2;
+				}
+				break;
+			case 2:
+				ntfs_debug("Switching from data1 zone to "
+						"data2 zone.");
+				/* Update data1 zone position. */
+				if (rlpos) {
+					LCN tc;
+
+					ntfs_debug("Before checks, "
+							"vol->data1_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->data1_zone_pos);
+					tc = rl[rlpos - 1].lcn +
+							rl[rlpos - 1].length;
+					if (tc >= vol->nr_clusters)
+						vol->data1_zone_pos =
+							     vol->mft_zone_end;
+					else if ((bmp_initial_pos >=
+						    vol->data1_zone_pos ||
+						    tc > vol->data1_zone_pos)
+						    && tc >= vol->mft_zone_end)
+						vol->data1_zone_pos = tc;
+					ntfs_debug("After checks, "
+							"vol->data1_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->data1_zone_pos);
+				}
+				/* Switch from data1 zone to data2 zone. */
+				search_zone = 4;
+				zone_start = bmp_initial_pos =
+						vol->data2_zone_pos;
+				zone_end = vol->mft_zone_start;
+				if (!zone_start)
+					pass = 2;
+				if (zone_start >= zone_end) {
+					vol->data2_zone_pos = zone_start =
+							bmp_initial_pos = 0;
+					pass = 2;
+				}
+				break;
+			case 4:
+				ntfs_debug("Switching from data2 zone to "
+						"data1 zone.");
+				/* Update data2 zone position. */
+				if (rlpos) {
+					LCN tc;
+
+					ntfs_debug("Before checks, "
+							"vol->data2_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->data2_zone_pos);
+					tc = rl[rlpos - 1].lcn +
+							rl[rlpos - 1].length;
+					if (tc >= vol->mft_zone_start)
+						vol->data2_zone_pos = 0;
+					else if (bmp_initial_pos >=
+						      vol->data2_zone_pos ||
+						      tc > vol->data2_zone_pos)
+						vol->data2_zone_pos = tc;
+					ntfs_debug("After checks, "
+							"vol->data2_zone_pos "
+							"0x%llx.",
+							(unsigned long long)
+							vol->data2_zone_pos);
+				}
+				/* Switch from data2 zone to data1 zone. */
+				goto switch_to_data1_zone;
+			default:
+				BUG();
+			}
+			ntfs_debug("After zone switch, search_zone %i, "
+					"pass %i, bmp_initial_pos 0x%llx, "
+					"zone_start 0x%llx, zone_end 0x%llx.",
+					search_zone, pass,
+					(unsigned long long)bmp_initial_pos,
+					(unsigned long long)zone_start,
+					(unsigned long long)zone_end);
+			bmp_pos = zone_start;
+			if (zone_start == zone_end) {
+				ntfs_debug("Empty zone, going to "
+						"done_zones_check.");
+				/* Empty zone. Don't bother searching it. */
+				goto done_zones_check;
+			}
+			ntfs_debug("Continuing outer while loop.");
+			continue;
+		} /* done_zones == 7 */
+		ntfs_debug("All zones are finished.");
+		/*
+		 * All zones are finished!  If DATA_ZONE, shrink mft zone.  If
+		 * MFT_ZONE, we have really run out of space.
+		 */
+		mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
+		ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
+				"0x%llx, mft_zone_size 0x%llx.",
+				(unsigned long long)vol->mft_zone_start,
+				(unsigned long long)vol->mft_zone_end,
+				(unsigned long long)mft_zone_size);
+		if (zone == MFT_ZONE || mft_zone_size <= 0) {
+			ntfs_debug("No free clusters left, going to out.");
+			/* Really no more space left on device. */
+			err = -ENOSPC;
+			goto out;
+		} /* zone == DATA_ZONE && mft_zone_size > 0 */
+		ntfs_debug("Shrinking mft zone.");
+		zone_end = vol->mft_zone_end;
+		mft_zone_size >>= 1;
+		if (mft_zone_size > 0)
+			vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
+		else /* mft zone and data2 zone no longer exist. */
+			vol->data2_zone_pos = vol->mft_zone_start =
+					vol->mft_zone_end = 0;
+		if (vol->mft_zone_pos >= vol->mft_zone_end) {
+			vol->mft_zone_pos = vol->mft_lcn;
+			if (!vol->mft_zone_end)
+				vol->mft_zone_pos = 0;
+		}
+		bmp_pos = zone_start = bmp_initial_pos =
+				vol->data1_zone_pos = vol->mft_zone_end;
+		search_zone = 2;
+		pass = 2;
+		done_zones &= ~2;
+		ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
+				"vol->mft_zone_start 0x%llx, "
+				"vol->mft_zone_end 0x%llx, "
+				"vol->mft_zone_pos 0x%llx, search_zone 2, "
+				"pass 2, dones_zones 0x%x, zone_start 0x%llx, "
+				"zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
+				"continuing outer while loop.",
+				(unsigned long long)mft_zone_size,
+				(unsigned long long)vol->mft_zone_start,
+				(unsigned long long)vol->mft_zone_end,
+				(unsigned long long)vol->mft_zone_pos,
+				done_zones, (unsigned long long)zone_start,
+				(unsigned long long)zone_end,
+				(unsigned long long)vol->data1_zone_pos);
+	}
+	ntfs_debug("After outer while loop.");
+out:
+	ntfs_debug("At out.");
+	/* Add runlist terminator element. */
+	if (likely(rl)) {
+		rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
+		rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
+		rl[rlpos].length = 0;
+	}
+	if (likely(page && !IS_ERR(page))) {
+		if (need_writeback) {
+			ntfs_debug("Marking page dirty.");
+			flush_dcache_page(page);
+			set_page_dirty(page);
+			need_writeback = 0;
+		}
+		ntfs_unmap_page(page);
+	}
+	if (likely(!err)) {
+		up_write(&vol->lcnbmp_lock);
+		ntfs_debug("Done.");
+		return rl;
+	}
+	ntfs_error(vol->sb, "Failed to allocate clusters, aborting "
+			"(error %i).", err);
+	if (rl) {
+		int err2;
+
+		if (err == -ENOSPC)
+			ntfs_debug("Not enough space to complete allocation, "
+					"err -ENOSPC, first free lcn 0x%llx, "
+					"could allocate up to 0x%llx "
+					"clusters.",
+					(unsigned long long)rl[0].lcn,
+					(unsigned long long)(count - clusters));
+		/* Deallocate all allocated clusters. */
+		ntfs_debug("Attempting rollback...");
+		err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
+		if (err2) {
+			ntfs_error(vol->sb, "Failed to rollback (error %i).  "
+					"Leaving inconsistent metadata!  "
+					"Unmount and run chkdsk.", err2);
+			NVolSetErrors(vol);
+		}
+		/* Free the runlist. */
+		ntfs_free(rl);
+	} else if (err == -ENOSPC)
+		ntfs_debug("No space left at all, err = -ENOSPC, first free "
+				"lcn = 0x%llx.",
+				(long long)vol->data1_zone_pos);
+	up_write(&vol->lcnbmp_lock);
+	return ERR_PTR(err);
+}
+
+/**
+ * __ntfs_cluster_free - free clusters on an ntfs volume
+ * @ni:		ntfs inode whose runlist describes the clusters to free
+ * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
+ * @count:	number of clusters to free or -1 for all clusters
+ * @ctx:	active attribute search context if present or NULL if not
+ * @is_rollback:	true if this is a rollback operation
+ *
+ * Free @count clusters starting at the cluster @start_vcn in the runlist
+ * described by the vfs inode @ni.
+ *
+ * If @count is -1, all clusters from @start_vcn to the end of the runlist are
+ * deallocated.  Thus, to completely free all clusters in a runlist, use
+ * @start_vcn = 0 and @count = -1.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record.  This is needed when __ntfs_cluster_free() encounters unmapped
+ * runlist fragments and allows their mapping.  If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
+ * perform the necessary mapping and unmapping.
+ *
+ * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
+ * before returning.  Thus, @ctx will be left pointing to the same attribute on
+ * return as on entry.  However, the actual pointers in @ctx may point to
+ * different memory locations on return, so you must remember to reset any
+ * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
+ * you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * @is_rollback should always be 'false', it is for internal use to rollback
+ * errors.  You probably want to use ntfs_cluster_free() instead.
+ *
+ * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
+ * remove from the runlist or mark sparse the freed runs later.
+ *
+ * Return the number of deallocated clusters (not counting sparse ones) on
+ * success and -errno on error.
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist may be modified when
+ *	      needed runlist fragments need to be mapped.
+ *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
+ *	      on return.
+ *	    - This function takes the volume lcn bitmap lock for writing and
+ *	      modifies the bitmap contents.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
+ */
+s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
+		ntfs_attr_search_ctx *ctx, const bool is_rollback)
+{
+	s64 delta, to_free, total_freed, real_freed;
+	ntfs_volume *vol;
+	struct inode *lcnbmp_vi;
+	runlist_element *rl;
+	int err;
+
+	BUG_ON(!ni);
+	ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
+			"0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
+			(unsigned long long)count,
+			is_rollback ? " (rollback)" : "");
+	vol = ni->vol;
+	lcnbmp_vi = vol->lcnbmp_ino;
+	BUG_ON(!lcnbmp_vi);
+	BUG_ON(start_vcn < 0);
+	BUG_ON(count < -1);
+	/*
+	 * Lock the lcn bitmap for writing but only if not rolling back.  We
+	 * must hold the lock all the way including through rollback otherwise
+	 * rollback is not possible because once we have cleared a bit and
+	 * dropped the lock, anyone could have set the bit again, thus
+	 * allocating the cluster for another use.
+	 */
+	if (likely(!is_rollback))
+		down_write(&vol->lcnbmp_lock);
+
+	total_freed = real_freed = 0;
+
+	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
+	if (IS_ERR(rl)) {
+		if (!is_rollback)
+			ntfs_error(vol->sb, "Failed to find first runlist "
+					"element (error %li), aborting.",
+					PTR_ERR(rl));
+		err = PTR_ERR(rl);
+		goto err_out;
+	}
+	if (unlikely(rl->lcn < LCN_HOLE)) {
+		if (!is_rollback)
+			ntfs_error(vol->sb, "First runlist element has "
+					"invalid lcn, aborting.");
+		err = -EIO;
+		goto err_out;
+	}
+	/* Find the starting cluster inside the run that needs freeing. */
+	delta = start_vcn - rl->vcn;
+
+	/* The number of clusters in this run that need freeing. */
+	to_free = rl->length - delta;
+	if (count >= 0 && to_free > count)
+		to_free = count;
+
+	if (likely(rl->lcn >= 0)) {
+		/* Do the actual freeing of the clusters in this run. */
+		err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
+				to_free, likely(!is_rollback) ? 0 : 1);
+		if (unlikely(err)) {
+			if (!is_rollback)
+				ntfs_error(vol->sb, "Failed to clear first run "
+						"(error %i), aborting.", err);
+			goto err_out;
+		}
+		/* We have freed @to_free real clusters. */
+		real_freed = to_free;
+	};
+	/* Go to the next run and adjust the number of clusters left to free. */
+	++rl;
+	if (count >= 0)
+		count -= to_free;
+
+	/* Keep track of the total "freed" clusters, including sparse ones. */
+	total_freed = to_free;
+	/*
+	 * Loop over the remaining runs, using @count as a capping value, and
+	 * free them.
+	 */
+	for (; rl->length && count != 0; ++rl) {
+		if (unlikely(rl->lcn < LCN_HOLE)) {
+			VCN vcn;
+
+			/* Attempt to map runlist. */
+			vcn = rl->vcn;
+			rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
+			if (IS_ERR(rl)) {
+				err = PTR_ERR(rl);
+				if (!is_rollback)
+					ntfs_error(vol->sb, "Failed to map "
+							"runlist fragment or "
+							"failed to find "
+							"subsequent runlist "
+							"element.");
+				goto err_out;
+			}
+			if (unlikely(rl->lcn < LCN_HOLE)) {
+				if (!is_rollback)
+					ntfs_error(vol->sb, "Runlist element "
+							"has invalid lcn "
+							"(0x%llx).",
+							(unsigned long long)
+							rl->lcn);
+				err = -EIO;
+				goto err_out;
+			}
+		}
+		/* The number of clusters in this run that need freeing. */
+		to_free = rl->length;
+		if (count >= 0 && to_free > count)
+			to_free = count;
+
+		if (likely(rl->lcn >= 0)) {
+			/* Do the actual freeing of the clusters in the run. */
+			err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
+					to_free, likely(!is_rollback) ? 0 : 1);
+			if (unlikely(err)) {
+				if (!is_rollback)
+					ntfs_error(vol->sb, "Failed to clear "
+							"subsequent run.");
+				goto err_out;
+			}
+			/* We have freed @to_free real clusters. */
+			real_freed += to_free;
+		}
+		/* Adjust the number of clusters left to free. */
+		if (count >= 0)
+			count -= to_free;
+	
+		/* Update the total done clusters. */
+		total_freed += to_free;
+	}
+	if (likely(!is_rollback))
+		up_write(&vol->lcnbmp_lock);
+
+	BUG_ON(count > 0);
+
+	/* We are done.  Return the number of actually freed clusters. */
+	ntfs_debug("Done.");
+	return real_freed;
+err_out:
+	if (is_rollback)
+		return err;
+	/* If no real clusters were freed, no need to rollback. */
+	if (!real_freed) {
+		up_write(&vol->lcnbmp_lock);
+		return err;
+	}
+	/*
+	 * Attempt to rollback and if that succeeds just return the error code.
+	 * If rollback fails, set the volume errors flag, emit an error
+	 * message, and return the error code.
+	 */
+	delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
+	if (delta < 0) {
+		ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
+				"inconsistent metadata!  Unmount and run "
+				"chkdsk.", (int)delta);
+		NVolSetErrors(vol);
+	}
+	up_write(&vol->lcnbmp_lock);
+	ntfs_error(vol->sb, "Aborting (error %i).", err);
+	return err;
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/lcnalloc.h b/kernel/fs/ntfs/lcnalloc.h
new file mode 100644
index 000000000..2adb04316
--- /dev/null
+++ b/kernel/fs/ntfs/lcnalloc.h
@@ -0,0 +1,145 @@
+/*
+ * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation.  Part of the
+ *		Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_LCNALLOC_H
+#define _LINUX_NTFS_LCNALLOC_H
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+
+#include "attrib.h"
+#include "types.h"
+#include "inode.h"
+#include "runlist.h"
+#include "volume.h"
+
+typedef enum {
+	FIRST_ZONE	= 0,	/* For sanity checking. */
+	MFT_ZONE	= 0,	/* Allocate from $MFT zone. */
+	DATA_ZONE	= 1,	/* Allocate from $DATA zone. */
+	LAST_ZONE	= 1,	/* For sanity checking. */
+} NTFS_CLUSTER_ALLOCATION_ZONES;
+
+extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
+		const VCN start_vcn, const s64 count, const LCN start_lcn,
+		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
+		const bool is_extension);
+
+extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
+		s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback);
+
+/**
+ * ntfs_cluster_free - free clusters on an ntfs volume
+ * @ni:		ntfs inode whose runlist describes the clusters to free
+ * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
+ * @count:	number of clusters to free or -1 for all clusters
+ * @ctx:	active attribute search context if present or NULL if not
+ *
+ * Free @count clusters starting at the cluster @start_vcn in the runlist
+ * described by the ntfs inode @ni.
+ *
+ * If @count is -1, all clusters from @start_vcn to the end of the runlist are
+ * deallocated.  Thus, to completely free all clusters in a runlist, use
+ * @start_vcn = 0 and @count = -1.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record.  This is needed when ntfs_cluster_free() encounters unmapped runlist
+ * fragments and allows their mapping.  If you do not have the mft record
+ * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
+ * the necessary mapping and unmapping.
+ *
+ * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
+ * before returning.  Thus, @ctx will be left pointing to the same attribute on
+ * return as on entry.  However, the actual pointers in @ctx may point to
+ * different memory locations on return, so you must remember to reset any
+ * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
+ * you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
+ * from the runlist or mark sparse the freed runs later.
+ *
+ * Return the number of deallocated clusters (not counting sparse ones) on
+ * success and -errno on error.
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist may be modified when
+ *	      needed runlist fragments need to be mapped.
+ *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
+ *	      on return.
+ *	    - This function takes the volume lcn bitmap lock for writing and
+ *	      modifies the bitmap contents.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
+ */
+static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
+		s64 count, ntfs_attr_search_ctx *ctx)
+{
+	return __ntfs_cluster_free(ni, start_vcn, count, ctx, false);
+}
+
+extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
+		const runlist_element *rl);
+
+/**
+ * ntfs_cluster_free_from_rl - free clusters from runlist
+ * @vol:	mounted ntfs volume on which to free the clusters
+ * @rl:		runlist describing the clusters to free
+ *
+ * Free all the clusters described by the runlist @rl on the volume @vol.  In
+ * the case of an error being returned, at least some of the clusters were not
+ * freed.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - This function takes the volume lcn bitmap lock for writing and
+ *	      modifies the bitmap contents.
+ *	    - The caller must have locked the runlist @rl for reading or
+ *	      writing.
+ */
+static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
+		const runlist_element *rl)
+{
+	int ret;
+
+	down_write(&vol->lcnbmp_lock);
+	ret = ntfs_cluster_free_from_rl_nolock(vol, rl);
+	up_write(&vol->lcnbmp_lock);
+	return ret;
+}
+
+#endif /* NTFS_RW */
+
+#endif /* defined _LINUX_NTFS_LCNALLOC_H */
diff --git a/kernel/fs/ntfs/logfile.c b/kernel/fs/ntfs/logfile.c
new file mode 100644
index 000000000..c71de292c
--- /dev/null
+++ b/kernel/fs/ntfs/logfile.c
@@ -0,0 +1,863 @@
+/*
+ * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2002-2007 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "attrib.h"
+#include "aops.h"
+#include "debug.h"
+#include "logfile.h"
+#include "malloc.h"
+#include "volume.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_check_restart_page_header - check the page header for consistency
+ * @vi:		$LogFile inode to which the restart page header belongs
+ * @rp:		restart page header to check
+ * @pos:	position in @vi at which the restart page header resides
+ *
+ * Check the restart page header @rp for consistency and return 'true' if it is
+ * consistent and 'false' otherwise.
+ *
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ */
+static bool ntfs_check_restart_page_header(struct inode *vi,
+		RESTART_PAGE_HEADER *rp, s64 pos)
+{
+	u32 logfile_system_page_size, logfile_log_page_size;
+	u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
+	bool have_usa = true;
+
+	ntfs_debug("Entering.");
+	/*
+	 * If the system or log page sizes are smaller than the ntfs block size
+	 * or either is not a power of 2 we cannot handle this log file.
+	 */
+	logfile_system_page_size = le32_to_cpu(rp->system_page_size);
+	logfile_log_page_size = le32_to_cpu(rp->log_page_size);
+	if (logfile_system_page_size < NTFS_BLOCK_SIZE ||
+			logfile_log_page_size < NTFS_BLOCK_SIZE ||
+			logfile_system_page_size &
+			(logfile_system_page_size - 1) ||
+			!is_power_of_2(logfile_log_page_size)) {
+		ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
+		return false;
+	}
+	/*
+	 * We must be either at !pos (1st restart page) or at pos = system page
+	 * size (2nd restart page).
+	 */
+	if (pos && pos != logfile_system_page_size) {
+		ntfs_error(vi->i_sb, "Found restart area in incorrect "
+				"position in $LogFile.");
+		return false;
+	}
+	/* We only know how to handle version 1.1. */
+	if (sle16_to_cpu(rp->major_ver) != 1 ||
+			sle16_to_cpu(rp->minor_ver) != 1) {
+		ntfs_error(vi->i_sb, "$LogFile version %i.%i is not "
+				"supported.  (This driver supports version "
+				"1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
+				(int)sle16_to_cpu(rp->minor_ver));
+		return false;
+	}
+	/*
+	 * If chkdsk has been run the restart page may not be protected by an
+	 * update sequence array.
+	 */
+	if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
+		have_usa = false;
+		goto skip_usa_checks;
+	}
+	/* Verify the size of the update sequence array. */
+	usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
+	if (usa_count != le16_to_cpu(rp->usa_count)) {
+		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+				"inconsistent update sequence array count.");
+		return false;
+	}
+	/* Verify the position of the update sequence array. */
+	usa_ofs = le16_to_cpu(rp->usa_ofs);
+	usa_end = usa_ofs + usa_count * sizeof(u16);
+	if (usa_ofs < sizeof(RESTART_PAGE_HEADER) ||
+			usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
+		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+				"inconsistent update sequence array offset.");
+		return false;
+	}
+skip_usa_checks:
+	/*
+	 * Verify the position of the restart area.  It must be:
+	 *	- aligned to 8-byte boundary,
+	 *	- after the update sequence array, and
+	 *	- within the system page size.
+	 */
+	ra_ofs = le16_to_cpu(rp->restart_area_offset);
+	if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end :
+			ra_ofs < sizeof(RESTART_PAGE_HEADER)) ||
+			ra_ofs > logfile_system_page_size) {
+		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+				"inconsistent restart area offset.");
+		return false;
+	}
+	/*
+	 * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
+	 * set.
+	 */
+	if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
+		ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
+				"by chkdsk but a chkdsk LSN is specified.");
+		return false;
+	}
+	ntfs_debug("Done.");
+	return true;
+}
+
+/**
+ * ntfs_check_restart_area - check the restart area for consistency
+ * @vi:		$LogFile inode to which the restart page belongs
+ * @rp:		restart page whose restart area to check
+ *
+ * Check the restart area of the restart page @rp for consistency and return
+ * 'true' if it is consistent and 'false' otherwise.
+ *
+ * This function assumes that the restart page header has already been
+ * consistency checked.
+ *
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ */
+static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
+{
+	u64 file_size;
+	RESTART_AREA *ra;
+	u16 ra_ofs, ra_len, ca_ofs;
+	u8 fs_bits;
+
+	ntfs_debug("Entering.");
+	ra_ofs = le16_to_cpu(rp->restart_area_offset);
+	ra = (RESTART_AREA*)((u8*)rp + ra_ofs);
+	/*
+	 * Everything before ra->file_size must be before the first word
+	 * protected by an update sequence number.  This ensures that it is
+	 * safe to access ra->client_array_offset.
+	 */
+	if (ra_ofs + offsetof(RESTART_AREA, file_size) >
+			NTFS_BLOCK_SIZE - sizeof(u16)) {
+		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+				"inconsistent file offset.");
+		return false;
+	}
+	/*
+	 * Now that we can access ra->client_array_offset, make sure everything
+	 * up to the log client array is before the first word protected by an
+	 * update sequence number.  This ensures we can access all of the
+	 * restart area elements safely.  Also, the client array offset must be
+	 * aligned to an 8-byte boundary.
+	 */
+	ca_ofs = le16_to_cpu(ra->client_array_offset);
+	if (((ca_ofs + 7) & ~7) != ca_ofs ||
+			ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
+		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+				"inconsistent client array offset.");
+		return false;
+	}
+	/*
+	 * The restart area must end within the system page size both when
+	 * calculated manually and as specified by ra->restart_area_length.
+	 * Also, the calculated length must not exceed the specified length.
+	 */
+	ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
+			sizeof(LOG_CLIENT_RECORD);
+	if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
+			ra_ofs + le16_to_cpu(ra->restart_area_length) >
+			le32_to_cpu(rp->system_page_size) ||
+			ra_len > le16_to_cpu(ra->restart_area_length)) {
+		ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
+				"of the system page size specified by the "
+				"restart page header and/or the specified "
+				"restart area length is inconsistent.");
+		return false;
+	}
+	/*
+	 * The ra->client_free_list and ra->client_in_use_list must be either
+	 * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
+	 * overflowing the client array.
+	 */
+	if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
+			le16_to_cpu(ra->client_free_list) >=
+			le16_to_cpu(ra->log_clients)) ||
+			(ra->client_in_use_list != LOGFILE_NO_CLIENT &&
+			le16_to_cpu(ra->client_in_use_list) >=
+			le16_to_cpu(ra->log_clients))) {
+		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+				"overflowing client free and/or in use lists.");
+		return false;
+	}
+	/*
+	 * Check ra->seq_number_bits against ra->file_size for consistency.
+	 * We cannot just use ffs() because the file size is not a power of 2.
+	 */
+	file_size = (u64)sle64_to_cpu(ra->file_size);
+	fs_bits = 0;
+	while (file_size) {
+		file_size >>= 1;
+		fs_bits++;
+	}
+	if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
+		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+				"inconsistent sequence number bits.");
+		return false;
+	}
+	/* The log record header length must be a multiple of 8. */
+	if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
+			le16_to_cpu(ra->log_record_header_length)) {
+		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+				"inconsistent log record header length.");
+		return false;
+	}
+	/* Dito for the log page data offset. */
+	if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
+			le16_to_cpu(ra->log_page_data_offset)) {
+		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+				"inconsistent log page data offset.");
+		return false;
+	}
+	ntfs_debug("Done.");
+	return true;
+}
+
+/**
+ * ntfs_check_log_client_array - check the log client array for consistency
+ * @vi:		$LogFile inode to which the restart page belongs
+ * @rp:		restart page whose log client array to check
+ *
+ * Check the log client array of the restart page @rp for consistency and
+ * return 'true' if it is consistent and 'false' otherwise.
+ *
+ * This function assumes that the restart page header and the restart area have
+ * already been consistency checked.
+ *
+ * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this
+ * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
+ * restart page and the page must be multi sector transfer deprotected.
+ */
+static bool ntfs_check_log_client_array(struct inode *vi,
+		RESTART_PAGE_HEADER *rp)
+{
+	RESTART_AREA *ra;
+	LOG_CLIENT_RECORD *ca, *cr;
+	u16 nr_clients, idx;
+	bool in_free_list, idx_is_first;
+
+	ntfs_debug("Entering.");
+	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
+	ca = (LOG_CLIENT_RECORD*)((u8*)ra +
+			le16_to_cpu(ra->client_array_offset));
+	/*
+	 * Check the ra->client_free_list first and then check the
+	 * ra->client_in_use_list.  Check each of the log client records in
+	 * each of the lists and check that the array does not overflow the
+	 * ra->log_clients value.  Also keep track of the number of records
+	 * visited as there cannot be more than ra->log_clients records and
+	 * that way we detect eventual loops in within a list.
+	 */
+	nr_clients = le16_to_cpu(ra->log_clients);
+	idx = le16_to_cpu(ra->client_free_list);
+	in_free_list = true;
+check_list:
+	for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
+			idx = le16_to_cpu(cr->next_client)) {
+		if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
+			goto err_out;
+		/* Set @cr to the current log client record. */
+		cr = ca + idx;
+		/* The first log client record must not have a prev_client. */
+		if (idx_is_first) {
+			if (cr->prev_client != LOGFILE_NO_CLIENT)
+				goto err_out;
+			idx_is_first = false;
+		}
+	}
+	/* Switch to and check the in use list if we just did the free list. */
+	if (in_free_list) {
+		in_free_list = false;
+		idx = le16_to_cpu(ra->client_in_use_list);
+		goto check_list;
+	}
+	ntfs_debug("Done.");
+	return true;
+err_out:
+	ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
+	return false;
+}
+
+/**
+ * ntfs_check_and_load_restart_page - check the restart page for consistency
+ * @vi:		$LogFile inode to which the restart page belongs
+ * @rp:		restart page to check
+ * @pos:	position in @vi at which the restart page resides
+ * @wrp:	[OUT] copy of the multi sector transfer deprotected restart page
+ * @lsn:	[OUT] set to the current logfile lsn on success
+ *
+ * Check the restart page @rp for consistency and return 0 if it is consistent
+ * and -errno otherwise.  The restart page may have been modified by chkdsk in
+ * which case its magic is CHKD instead of RSTR.
+ *
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ *
+ * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
+ * copy of the complete multi sector transfer deprotected page.  On failure,
+ * *@wrp is undefined.
+ *
+ * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
+ * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
+ *
+ * The following error codes are defined:
+ *	-EINVAL	- The restart page is inconsistent.
+ *	-ENOMEM	- Not enough memory to load the restart page.
+ *	-EIO	- Failed to reading from $LogFile.
+ */
+static int ntfs_check_and_load_restart_page(struct inode *vi,
+		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
+		LSN *lsn)
+{
+	RESTART_AREA *ra;
+	RESTART_PAGE_HEADER *trp;
+	int size, err;
+
+	ntfs_debug("Entering.");
+	/* Check the restart page header for consistency. */
+	if (!ntfs_check_restart_page_header(vi, rp, pos)) {
+		/* Error output already done inside the function. */
+		return -EINVAL;
+	}
+	/* Check the restart area for consistency. */
+	if (!ntfs_check_restart_area(vi, rp)) {
+		/* Error output already done inside the function. */
+		return -EINVAL;
+	}
+	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
+	/*
+	 * Allocate a buffer to store the whole restart page so we can multi
+	 * sector transfer deprotect it.
+	 */
+	trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size));
+	if (!trp) {
+		ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
+				"restart page buffer.");
+		return -ENOMEM;
+	}
+	/*
+	 * Read the whole of the restart page into the buffer.  If it fits
+	 * completely inside @rp, just copy it from there.  Otherwise map all
+	 * the required pages and copy the data from them.
+	 */
+	size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK);
+	if (size >= le32_to_cpu(rp->system_page_size)) {
+		memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
+	} else {
+		pgoff_t idx;
+		struct page *page;
+		int have_read, to_read;
+
+		/* First copy what we already have in @rp. */
+		memcpy(trp, rp, size);
+		/* Copy the remaining data one page at a time. */
+		have_read = size;
+		to_read = le32_to_cpu(rp->system_page_size) - size;
+		idx = (pos + size) >> PAGE_CACHE_SHIFT;
+		BUG_ON((pos + size) & ~PAGE_CACHE_MASK);
+		do {
+			page = ntfs_map_page(vi->i_mapping, idx);
+			if (IS_ERR(page)) {
+				ntfs_error(vi->i_sb, "Error mapping $LogFile "
+						"page (index %lu).", idx);
+				err = PTR_ERR(page);
+				if (err != -EIO && err != -ENOMEM)
+					err = -EIO;
+				goto err_out;
+			}
+			size = min_t(int, to_read, PAGE_CACHE_SIZE);
+			memcpy((u8*)trp + have_read, page_address(page), size);
+			ntfs_unmap_page(page);
+			have_read += size;
+			to_read -= size;
+			idx++;
+		} while (to_read > 0);
+	}
+	/*
+	 * Perform the multi sector transfer deprotection on the buffer if the
+	 * restart page is protected.
+	 */
+	if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
+			&& post_read_mst_fixup((NTFS_RECORD*)trp,
+			le32_to_cpu(rp->system_page_size))) {
+		/*
+		 * A multi sector tranfer error was detected.  We only need to
+		 * abort if the restart page contents exceed the multi sector
+		 * transfer fixup of the first sector.
+		 */
+		if (le16_to_cpu(rp->restart_area_offset) +
+				le16_to_cpu(ra->restart_area_length) >
+				NTFS_BLOCK_SIZE - sizeof(u16)) {
+			ntfs_error(vi->i_sb, "Multi sector transfer error "
+					"detected in $LogFile restart page.");
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	/*
+	 * If the restart page is modified by chkdsk or there are no active
+	 * logfile clients, the logfile is consistent.  Otherwise, need to
+	 * check the log client records for consistency, too.
+	 */
+	err = 0;
+	if (ntfs_is_rstr_record(rp->magic) &&
+			ra->client_in_use_list != LOGFILE_NO_CLIENT) {
+		if (!ntfs_check_log_client_array(vi, trp)) {
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	if (lsn) {
+		if (ntfs_is_rstr_record(rp->magic))
+			*lsn = sle64_to_cpu(ra->current_lsn);
+		else /* if (ntfs_is_chkd_record(rp->magic)) */
+			*lsn = sle64_to_cpu(rp->chkdsk_lsn);
+	}
+	ntfs_debug("Done.");
+	if (wrp)
+		*wrp = trp;
+	else {
+err_out:
+		ntfs_free(trp);
+	}
+	return err;
+}
+
+/**
+ * ntfs_check_logfile - check the journal for consistency
+ * @log_vi:	struct inode of loaded journal $LogFile to check
+ * @rp:		[OUT] on success this is a copy of the current restart page
+ *
+ * Check the $LogFile journal for consistency and return 'true' if it is
+ * consistent and 'false' if not.  On success, the current restart page is
+ * returned in *@rp.  Caller must call ntfs_free(*@rp) when finished with it.
+ *
+ * At present we only check the two restart pages and ignore the log record
+ * pages.
+ *
+ * Note that the MstProtected flag is not set on the $LogFile inode and hence
+ * when reading pages they are not deprotected.  This is because we do not know
+ * if the $LogFile was created on a system with a different page size to ours
+ * yet and mst deprotection would fail if our page size is smaller.
+ */
+bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
+{
+	s64 size, pos;
+	LSN rstr1_lsn, rstr2_lsn;
+	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+	struct address_space *mapping = log_vi->i_mapping;
+	struct page *page = NULL;
+	u8 *kaddr = NULL;
+	RESTART_PAGE_HEADER *rstr1_ph = NULL;
+	RESTART_PAGE_HEADER *rstr2_ph = NULL;
+	int log_page_size, log_page_mask, err;
+	bool logfile_is_empty = true;
+	u8 log_page_bits;
+
+	ntfs_debug("Entering.");
+	/* An empty $LogFile must have been clean before it got emptied. */
+	if (NVolLogFileEmpty(vol))
+		goto is_empty;
+	size = i_size_read(log_vi);
+	/* Make sure the file doesn't exceed the maximum allowed size. */
+	if (size > MaxLogFileSize)
+		size = MaxLogFileSize;
+	/*
+	 * Truncate size to a multiple of the page cache size or the default
+	 * log page size if the page cache size is between the default log page
+	 * log page size if the page cache size is between the default log page
+	 * size and twice that.
+	 */
+	if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <=
+			DefaultLogPageSize * 2)
+		log_page_size = DefaultLogPageSize;
+	else
+		log_page_size = PAGE_CACHE_SIZE;
+	log_page_mask = log_page_size - 1;
+	/*
+	 * Use ntfs_ffs() instead of ffs() to enable the compiler to
+	 * optimize log_page_size and log_page_bits into constants.
+	 */
+	log_page_bits = ntfs_ffs(log_page_size) - 1;
+	size &= ~(s64)(log_page_size - 1);
+	/*
+	 * Ensure the log file is big enough to store at least the two restart
+	 * pages and the minimum number of log record pages.
+	 */
+	if (size < log_page_size * 2 || (size - log_page_size * 2) >>
+			log_page_bits < MinLogRecordPages) {
+		ntfs_error(vol->sb, "$LogFile is too small.");
+		return false;
+	}
+	/*
+	 * Read through the file looking for a restart page.  Since the restart
+	 * page header is at the beginning of a page we only need to search at
+	 * what could be the beginning of a page (for each page size) rather
+	 * than scanning the whole file byte by byte.  If all potential places
+	 * contain empty and uninitialzed records, the log file can be assumed
+	 * to be empty.
+	 */
+	for (pos = 0; pos < size; pos <<= 1) {
+		pgoff_t idx = pos >> PAGE_CACHE_SHIFT;
+		if (!page || page->index != idx) {
+			if (page)
+				ntfs_unmap_page(page);
+			page = ntfs_map_page(mapping, idx);
+			if (IS_ERR(page)) {
+				ntfs_error(vol->sb, "Error mapping $LogFile "
+						"page (index %lu).", idx);
+				goto err_out;
+			}
+		}
+		kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
+		/*
+		 * A non-empty block means the logfile is not empty while an
+		 * empty block after a non-empty block has been encountered
+		 * means we are done.
+		 */
+		if (!ntfs_is_empty_recordp((le32*)kaddr))
+			logfile_is_empty = false;
+		else if (!logfile_is_empty)
+			break;
+		/*
+		 * A log record page means there cannot be a restart page after
+		 * this so no need to continue searching.
+		 */
+		if (ntfs_is_rcrd_recordp((le32*)kaddr))
+			break;
+		/* If not a (modified by chkdsk) restart page, continue. */
+		if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
+				!ntfs_is_chkd_recordp((le32*)kaddr)) {
+			if (!pos)
+				pos = NTFS_BLOCK_SIZE >> 1;
+			continue;
+		}
+		/*
+		 * Check the (modified by chkdsk) restart page for consistency
+		 * and get a copy of the complete multi sector transfer
+		 * deprotected restart page.
+		 */
+		err = ntfs_check_and_load_restart_page(log_vi,
+				(RESTART_PAGE_HEADER*)kaddr, pos,
+				!rstr1_ph ? &rstr1_ph : &rstr2_ph,
+				!rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
+		if (!err) {
+			/*
+			 * If we have now found the first (modified by chkdsk)
+			 * restart page, continue looking for the second one.
+			 */
+			if (!pos) {
+				pos = NTFS_BLOCK_SIZE >> 1;
+				continue;
+			}
+			/*
+			 * We have now found the second (modified by chkdsk)
+			 * restart page, so we can stop looking.
+			 */
+			break;
+		}
+		/*
+		 * Error output already done inside the function.  Note, we do
+		 * not abort if the restart page was invalid as we might still
+		 * find a valid one further in the file.
+		 */
+		if (err != -EINVAL) {
+			ntfs_unmap_page(page);
+			goto err_out;
+		}
+		/* Continue looking. */
+		if (!pos)
+			pos = NTFS_BLOCK_SIZE >> 1;
+	}
+	if (page)
+		ntfs_unmap_page(page);
+	if (logfile_is_empty) {
+		NVolSetLogFileEmpty(vol);
+is_empty:
+		ntfs_debug("Done.  ($LogFile is empty.)");
+		return true;
+	}
+	if (!rstr1_ph) {
+		BUG_ON(rstr2_ph);
+		ntfs_error(vol->sb, "Did not find any restart pages in "
+				"$LogFile and it was not empty.");
+		return false;
+	}
+	/* If both restart pages were found, use the more recent one. */
+	if (rstr2_ph) {
+		/*
+		 * If the second restart area is more recent, switch to it.
+		 * Otherwise just throw it away.
+		 */
+		if (rstr2_lsn > rstr1_lsn) {
+			ntfs_debug("Using second restart page as it is more "
+					"recent.");
+			ntfs_free(rstr1_ph);
+			rstr1_ph = rstr2_ph;
+			/* rstr1_lsn = rstr2_lsn; */
+		} else {
+			ntfs_debug("Using first restart page as it is more "
+					"recent.");
+			ntfs_free(rstr2_ph);
+		}
+		rstr2_ph = NULL;
+	}
+	/* All consistency checks passed. */
+	if (rp)
+		*rp = rstr1_ph;
+	else
+		ntfs_free(rstr1_ph);
+	ntfs_debug("Done.");
+	return true;
+err_out:
+	if (rstr1_ph)
+		ntfs_free(rstr1_ph);
+	return false;
+}
+
+/**
+ * ntfs_is_logfile_clean - check in the journal if the volume is clean
+ * @log_vi:	struct inode of loaded journal $LogFile to check
+ * @rp:		copy of the current restart page
+ *
+ * Analyze the $LogFile journal and return 'true' if it indicates the volume was
+ * shutdown cleanly and 'false' if not.
+ *
+ * At present we only look at the two restart pages and ignore the log record
+ * pages.  This is a little bit crude in that there will be a very small number
+ * of cases where we think that a volume is dirty when in fact it is clean.
+ * This should only affect volumes that have not been shutdown cleanly but did
+ * not have any pending, non-check-pointed i/o, i.e. they were completely idle
+ * at least for the five seconds preceding the unclean shutdown.
+ *
+ * This function assumes that the $LogFile journal has already been consistency
+ * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
+ * is empty this function requires that NVolLogFileEmpty() is true otherwise an
+ * empty volume will be reported as dirty.
+ */
+bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
+{
+	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+	RESTART_AREA *ra;
+
+	ntfs_debug("Entering.");
+	/* An empty $LogFile must have been clean before it got emptied. */
+	if (NVolLogFileEmpty(vol)) {
+		ntfs_debug("Done.  ($LogFile is empty.)");
+		return true;
+	}
+	BUG_ON(!rp);
+	if (!ntfs_is_rstr_record(rp->magic) &&
+			!ntfs_is_chkd_record(rp->magic)) {
+		ntfs_error(vol->sb, "Restart page buffer is invalid.  This is "
+				"probably a bug in that the $LogFile should "
+				"have been consistency checked before calling "
+				"this function.");
+		return false;
+	}
+	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
+	/*
+	 * If the $LogFile has active clients, i.e. it is open, and we do not
+	 * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags,
+	 * we assume there was an unclean shutdown.
+	 */
+	if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
+			!(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
+		ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
+		return false;
+	}
+	/* $LogFile indicates a clean shutdown. */
+	ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
+	return true;
+}
+
+/**
+ * ntfs_empty_logfile - empty the contents of the $LogFile journal
+ * @log_vi:	struct inode of loaded journal $LogFile to empty
+ *
+ * Empty the contents of the $LogFile journal @log_vi and return 'true' on
+ * success and 'false' on error.
+ *
+ * This function assumes that the $LogFile journal has already been consistency
+ * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
+ * has been used to ensure that the $LogFile is clean.
+ */
+bool ntfs_empty_logfile(struct inode *log_vi)
+{
+	VCN vcn, end_vcn;
+	ntfs_inode *log_ni = NTFS_I(log_vi);
+	ntfs_volume *vol = log_ni->vol;
+	struct super_block *sb = vol->sb;
+	runlist_element *rl;
+	unsigned long flags;
+	unsigned block_size, block_size_bits;
+	int err;
+	bool should_wait = true;
+
+	ntfs_debug("Entering.");
+	if (NVolLogFileEmpty(vol)) {
+		ntfs_debug("Done.");
+		return true;
+	}
+	/*
+	 * We cannot use ntfs_attr_set() because we may be still in the middle
+	 * of a mount operation.  Thus we do the emptying by hand by first
+	 * zapping the page cache pages for the $LogFile/$DATA attribute and
+	 * then emptying each of the buffers in each of the clusters specified
+	 * by the runlist by hand.
+	 */
+	block_size = sb->s_blocksize;
+	block_size_bits = sb->s_blocksize_bits;
+	vcn = 0;
+	read_lock_irqsave(&log_ni->size_lock, flags);
+	end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
+			vol->cluster_size_bits;
+	read_unlock_irqrestore(&log_ni->size_lock, flags);
+	truncate_inode_pages(log_vi->i_mapping, 0);
+	down_write(&log_ni->runlist.lock);
+	rl = log_ni->runlist.rl;
+	if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
+map_vcn:
+		err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
+		if (err) {
+			ntfs_error(sb, "Failed to map runlist fragment (error "
+					"%d).", -err);
+			goto err;
+		}
+		rl = log_ni->runlist.rl;
+		BUG_ON(!rl || vcn < rl->vcn || !rl->length);
+	}
+	/* Seek to the runlist element containing @vcn. */
+	while (rl->length && vcn >= rl[1].vcn)
+		rl++;
+	do {
+		LCN lcn;
+		sector_t block, end_block;
+		s64 len;
+
+		/*
+		 * If this run is not mapped map it now and start again as the
+		 * runlist will have been updated.
+		 */
+		lcn = rl->lcn;
+		if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
+			vcn = rl->vcn;
+			goto map_vcn;
+		}
+		/* If this run is not valid abort with an error. */
+		if (unlikely(!rl->length || lcn < LCN_HOLE))
+			goto rl_err;
+		/* Skip holes. */
+		if (lcn == LCN_HOLE)
+			continue;
+		block = lcn << vol->cluster_size_bits >> block_size_bits;
+		len = rl->length;
+		if (rl[1].vcn > end_vcn)
+			len = end_vcn - rl->vcn;
+		end_block = (lcn + len) << vol->cluster_size_bits >>
+				block_size_bits;
+		/* Iterate over the blocks in the run and empty them. */
+		do {
+			struct buffer_head *bh;
+
+			/* Obtain the buffer, possibly not uptodate. */
+			bh = sb_getblk(sb, block);
+			BUG_ON(!bh);
+			/* Setup buffer i/o submission. */
+			lock_buffer(bh);
+			bh->b_end_io = end_buffer_write_sync;
+			get_bh(bh);
+			/* Set the entire contents of the buffer to 0xff. */
+			memset(bh->b_data, -1, block_size);
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+			if (buffer_dirty(bh))
+				clear_buffer_dirty(bh);
+			/*
+			 * Submit the buffer and wait for i/o to complete but
+			 * only for the first buffer so we do not miss really
+			 * serious i/o errors.  Once the first buffer has
+			 * completed ignore errors afterwards as we can assume
+			 * that if one buffer worked all of them will work.
+			 */
+			submit_bh(WRITE, bh);
+			if (should_wait) {
+				should_wait = false;
+				wait_on_buffer(bh);
+				if (unlikely(!buffer_uptodate(bh)))
+					goto io_err;
+			}
+			brelse(bh);
+		} while (++block < end_block);
+	} while ((++rl)->vcn < end_vcn);
+	up_write(&log_ni->runlist.lock);
+	/*
+	 * Zap the pages again just in case any got instantiated whilst we were
+	 * emptying the blocks by hand.  FIXME: We may not have completed
+	 * writing to all the buffer heads yet so this may happen too early.
+	 * We really should use a kernel thread to do the emptying
+	 * asynchronously and then we can also set the volume dirty and output
+	 * an error message if emptying should fail.
+	 */
+	truncate_inode_pages(log_vi->i_mapping, 0);
+	/* Set the flag so we do not have to do it again on remount. */
+	NVolSetLogFileEmpty(vol);
+	ntfs_debug("Done.");
+	return true;
+io_err:
+	ntfs_error(sb, "Failed to write buffer.  Unmount and run chkdsk.");
+	goto dirty_err;
+rl_err:
+	ntfs_error(sb, "Runlist is corrupt.  Unmount and run chkdsk.");
+dirty_err:
+	NVolSetErrors(vol);
+	err = -EIO;
+err:
+	up_write(&log_ni->runlist.lock);
+	ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
+			-err);
+	return false;
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/logfile.h b/kernel/fs/ntfs/logfile.h
new file mode 100644
index 000000000..aa2b6ac3f
--- /dev/null
+++ b/kernel/fs/ntfs/logfile.h
@@ -0,0 +1,309 @@
+/*
+ * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
+ *	       the Linux-NTFS project.
+ *
+ * Copyright (c) 2000-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_LOGFILE_H
+#define _LINUX_NTFS_LOGFILE_H
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+
+#include "types.h"
+#include "endian.h"
+#include "layout.h"
+
+/*
+ * Journal ($LogFile) organization:
+ *
+ * Two restart areas present in the first two pages (restart pages, one restart
+ * area in each page).  When the volume is dismounted they should be identical,
+ * except for the update sequence array which usually has a different update
+ * sequence number.
+ *
+ * These are followed by log records organized in pages headed by a log record
+ * header going up to log file size.  Not all pages contain log records when a
+ * volume is first formatted, but as the volume ages, all records will be used.
+ * When the log file fills up, the records at the beginning are purged (by
+ * modifying the oldest_lsn to a higher value presumably) and writing begins
+ * at the beginning of the file.  Effectively, the log file is viewed as a
+ * circular entity.
+ *
+ * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept
+ * versions <= 1.x, including 0.-1.  (Yes, that is a minus one in there!)  We
+ * probably only want to support 1.1 as this seems to be the current version
+ * and we don't know how that differs from the older versions.  The only
+ * exception is if the journal is clean as marked by the two restart pages
+ * then it doesn't matter whether we are on an earlier version.  We can just
+ * reinitialize the logfile and start again with version 1.1.
+ */
+
+/* Some $LogFile related constants. */
+#define MaxLogFileSize		0x100000000ULL
+#define DefaultLogPageSize	4096
+#define MinLogRecordPages	48
+
+/*
+ * Log file restart page header (begins the restart area).
+ */
+typedef struct {
+/*Ofs*/
+/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+/*  0*/	NTFS_RECORD_TYPE magic;	/* The magic is "RSTR". */
+/*  4*/	le16 usa_ofs;		/* See NTFS_RECORD definition in layout.h.
+				   When creating, set this to be immediately
+				   after this header structure (without any
+				   alignment). */
+/*  6*/	le16 usa_count;		/* See NTFS_RECORD definition in layout.h. */
+
+/*  8*/	leLSN chkdsk_lsn;	/* The last log file sequence number found by
+				   chkdsk.  Only used when the magic is changed
+				   to "CHKD".  Otherwise this is zero. */
+/* 16*/	le32 system_page_size;	/* Byte size of system pages when the log file
+				   was created, has to be >= 512 and a power of
+				   2.  Use this to calculate the required size
+				   of the usa (usa_count) and add it to usa_ofs.
+				   Then verify that the result is less than the
+				   value of the restart_area_offset. */
+/* 20*/	le32 log_page_size;	/* Byte size of log file pages, has to be >=
+				   512 and a power of 2.  The default is 4096
+				   and is used when the system page size is
+				   between 4096 and 8192.  Otherwise this is
+				   set to the system page size instead. */
+/* 24*/	le16 restart_area_offset;/* Byte offset from the start of this header to
+				   the RESTART_AREA.  Value has to be aligned
+				   to 8-byte boundary.  When creating, set this
+				   to be after the usa. */
+/* 26*/	sle16 minor_ver;	/* Log file minor version.  Only check if major
+				   version is 1. */
+/* 28*/	sle16 major_ver;	/* Log file major version.  We only support
+				   version 1.1. */
+/* sizeof() = 30 (0x1e) bytes */
+} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
+
+/*
+ * Constant for the log client indices meaning that there are no client records
+ * in this particular client array.  Also inside the client records themselves,
+ * this means that there are no client records preceding or following this one.
+ */
+#define LOGFILE_NO_CLIENT	cpu_to_le16(0xffff)
+#define LOGFILE_NO_CLIENT_CPU	0xffff
+
+/*
+ * These are the so far known RESTART_AREA_* flags (16-bit) which contain
+ * information about the log file in which they are present.
+ */
+enum {
+	RESTART_VOLUME_IS_CLEAN	= cpu_to_le16(0x0002),
+	RESTART_SPACE_FILLER	= cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
+} __attribute__ ((__packed__));
+
+typedef le16 RESTART_AREA_FLAGS;
+
+/*
+ * Log file restart area record.  The offset of this record is found by adding
+ * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
+ * in it.  See notes at restart_area_offset above.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/	leLSN current_lsn;	/* The current, i.e. last LSN inside the log
+				   when the restart area was last written.
+				   This happens often but what is the interval?
+				   Is it just fixed time or is it every time a
+				   check point is written or somethine else?
+				   On create set to 0. */
+/*  8*/	le16 log_clients;	/* Number of log client records in the array of
+				   log client records which follows this
+				   restart area.  Must be 1.  */
+/* 10*/	le16 client_free_list;	/* The index of the first free log client record
+				   in the array of log client records.
+				   LOGFILE_NO_CLIENT means that there are no
+				   free log client records in the array.
+				   If != LOGFILE_NO_CLIENT, check that
+				   log_clients > client_free_list.  On Win2k
+				   and presumably earlier, on a clean volume
+				   this is != LOGFILE_NO_CLIENT, and it should
+				   be 0, i.e. the first (and only) client
+				   record is free and thus the logfile is
+				   closed and hence clean.  A dirty volume
+				   would have left the logfile open and hence
+				   this would be LOGFILE_NO_CLIENT.  On WinXP
+				   and presumably later, the logfile is always
+				   open, even on clean shutdown so this should
+				   always be LOGFILE_NO_CLIENT. */
+/* 12*/	le16 client_in_use_list;/* The index of the first in-use log client
+				   record in the array of log client records.
+				   LOGFILE_NO_CLIENT means that there are no
+				   in-use log client records in the array.  If
+				   != LOGFILE_NO_CLIENT check that log_clients
+				   > client_in_use_list.  On Win2k and
+				   presumably earlier, on a clean volume this
+				   is LOGFILE_NO_CLIENT, i.e. there are no
+				   client records in use and thus the logfile
+				   is closed and hence clean.  A dirty volume
+				   would have left the logfile open and hence
+				   this would be != LOGFILE_NO_CLIENT, and it
+				   should be 0, i.e. the first (and only)
+				   client record is in use.  On WinXP and
+				   presumably later, the logfile is always
+				   open, even on clean shutdown so this should
+				   always be 0. */
+/* 14*/	RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour.  On Win2k
+				   and presumably earlier this is always 0.  On
+				   WinXP and presumably later, if the logfile
+				   was shutdown cleanly, the second bit,
+				   RESTART_VOLUME_IS_CLEAN, is set.  This bit
+				   is cleared when the volume is mounted by
+				   WinXP and set when the volume is dismounted,
+				   thus if the logfile is dirty, this bit is
+				   clear.  Thus we don't need to check the
+				   Windows version to determine if the logfile
+				   is clean.  Instead if the logfile is closed,
+				   we know it must be clean.  If it is open and
+				   this bit is set, we also know it must be
+				   clean.  If on the other hand the logfile is
+				   open and this bit is clear, we can be almost
+				   certain that the logfile is dirty. */
+/* 16*/	le32 seq_number_bits;	/* How many bits to use for the sequence
+				   number.  This is calculated as 67 - the
+				   number of bits required to store the logfile
+				   size in bytes and this can be used in with
+				   the specified file_size as a consistency
+				   check. */
+/* 20*/	le16 restart_area_length;/* Length of the restart area including the
+				   client array.  Following checks required if
+				   version matches.  Otherwise, skip them.
+				   restart_area_offset + restart_area_length
+				   has to be <= system_page_size.  Also,
+				   restart_area_length has to be >=
+				   client_array_offset + (log_clients *
+				   sizeof(log client record)). */
+/* 22*/	le16 client_array_offset;/* Offset from the start of this record to
+				   the first log client record if versions are
+				   matched.  When creating, set this to be
+				   after this restart area structure, aligned
+				   to 8-bytes boundary.  If the versions do not
+				   match, this is ignored and the offset is
+				   assumed to be (sizeof(RESTART_AREA) + 7) &
+				   ~7, i.e. rounded up to first 8-byte
+				   boundary.  Either way, client_array_offset
+				   has to be aligned to an 8-byte boundary.
+				   Also, restart_area_offset +
+				   client_array_offset has to be <= 510.
+				   Finally, client_array_offset + (log_clients
+				   * sizeof(log client record)) has to be <=
+				   system_page_size.  On Win2k and presumably
+				   earlier, this is 0x30, i.e. immediately
+				   following this record.  On WinXP and
+				   presumably later, this is 0x40, i.e. there
+				   are 16 extra bytes between this record and
+				   the client array.  This probably means that
+				   the RESTART_AREA record is actually bigger
+				   in WinXP and later. */
+/* 24*/	sle64 file_size;	/* Usable byte size of the log file.  If the
+				   restart_area_offset + the offset of the
+				   file_size are > 510 then corruption has
+				   occurred.  This is the very first check when
+				   starting with the restart_area as if it
+				   fails it means that some of the above values
+				   will be corrupted by the multi sector
+				   transfer protection.  The file_size has to
+				   be rounded down to be a multiple of the
+				   log_page_size in the RESTART_PAGE_HEADER and
+				   then it has to be at least big enough to
+				   store the two restart pages and 48 (0x30)
+				   log record pages. */
+/* 32*/	le32 last_lsn_data_length;/* Length of data of last LSN, not including
+				   the log record header.  On create set to
+				   0. */
+/* 36*/	le16 log_record_header_length;/* Byte size of the log record header.
+				   If the version matches then check that the
+				   value of log_record_header_length is a
+				   multiple of 8, i.e.
+				   (log_record_header_length + 7) & ~7 ==
+				   log_record_header_length.  When creating set
+				   it to sizeof(LOG_RECORD_HEADER), aligned to
+				   8 bytes. */
+/* 38*/	le16 log_page_data_offset;/* Offset to the start of data in a log record
+				   page.  Must be a multiple of 8.  On create
+				   set it to immediately after the update
+				   sequence array of the log record page. */
+/* 40*/	le32 restart_log_open_count;/* A counter that gets incremented every
+				   time the logfile is restarted which happens
+				   at mount time when the logfile is opened.
+				   When creating set to a random value.  Win2k
+				   sets it to the low 32 bits of the current
+				   system time in NTFS format (see time.h). */
+/* 44*/	le32 reserved;		/* Reserved/alignment to 8-byte boundary. */
+/* sizeof() = 48 (0x30) bytes */
+} __attribute__ ((__packed__)) RESTART_AREA;
+
+/*
+ * Log client record.  The offset of this record is found by adding the offset
+ * of the RESTART_AREA to the client_array_offset value found in it.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/	leLSN oldest_lsn;	/* Oldest LSN needed by this client.  On create
+				   set to 0. */
+/*  8*/	leLSN client_restart_lsn;/* LSN at which this client needs to restart
+				   the volume, i.e. the current position within
+				   the log file.  At present, if clean this
+				   should = current_lsn in restart area but it
+				   probably also = current_lsn when dirty most
+				   of the time.  At create set to 0. */
+/* 16*/	le16 prev_client;	/* The offset to the previous log client record
+				   in the array of log client records.
+				   LOGFILE_NO_CLIENT means there is no previous
+				   client record, i.e. this is the first one.
+				   This is always LOGFILE_NO_CLIENT. */
+/* 18*/	le16 next_client;	/* The offset to the next log client record in
+				   the array of log client records.
+				   LOGFILE_NO_CLIENT means there are no next
+				   client records, i.e. this is the last one.
+				   This is always LOGFILE_NO_CLIENT. */
+/* 20*/	le16 seq_number;	/* On Win2k and presumably earlier, this is set
+				   to zero every time the logfile is restarted
+				   and it is incremented when the logfile is
+				   closed at dismount time.  Thus it is 0 when
+				   dirty and 1 when clean.  On WinXP and
+				   presumably later, this is always 0. */
+/* 22*/	u8 reserved[6];		/* Reserved/alignment. */
+/* 28*/	le32 client_name_length;/* Length of client name in bytes.  Should
+				   always be 8. */
+/* 32*/	ntfschar client_name[64];/* Name of the client in Unicode.  Should
+				   always be "NTFS" with the remaining bytes
+				   set to 0. */
+/* sizeof() = 160 (0xa0) bytes */
+} __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
+
+extern bool ntfs_check_logfile(struct inode *log_vi,
+		RESTART_PAGE_HEADER **rp);
+
+extern bool ntfs_is_logfile_clean(struct inode *log_vi,
+		const RESTART_PAGE_HEADER *rp);
+
+extern bool ntfs_empty_logfile(struct inode *log_vi);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_LOGFILE_H */
diff --git a/kernel/fs/ntfs/malloc.h b/kernel/fs/ntfs/malloc.h
new file mode 100644
index 000000000..a44b14cbc
--- /dev/null
+++ b/kernel/fs/ntfs/malloc.h
@@ -0,0 +1,96 @@
+/*
+ * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_MALLOC_H
+#define _LINUX_NTFS_MALLOC_H
+
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+/**
+ * __ntfs_malloc - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ * @gfp_mask:	extra flags for the allocator
+ *
+ * Internal function.  You probably want ntfs_malloc_nofs()...
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ * Depending on @gfp_mask the allocation may be guaranteed to succeed.
+ */
+static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
+{
+	if (likely(size <= PAGE_SIZE)) {
+		BUG_ON(!size);
+		/* kmalloc() has per-CPU caches so is faster for now. */
+		return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
+		/* return (void *)__get_free_page(gfp_mask); */
+	}
+	if (likely((size >> PAGE_SHIFT) < totalram_pages))
+		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
+	return NULL;
+}
+
+/**
+ * ntfs_malloc_nofs - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs(unsigned long size)
+{
+	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
+}
+
+/**
+ * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
+{
+	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
+}
+
+static inline void ntfs_free(void *addr)
+{
+	if (!is_vmalloc_addr(addr)) {
+		kfree(addr);
+		/* free_page((unsigned long)addr); */
+		return;
+	}
+	vfree(addr);
+}
+
+#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/kernel/fs/ntfs/mft.c b/kernel/fs/ntfs/mft.c
new file mode 100644
index 000000000..3014a36a2
--- /dev/null
+++ b/kernel/fs/ntfs/mft.c
@@ -0,0 +1,2917 @@
+/**
+ * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+
+#include "attrib.h"
+#include "aops.h"
+#include "bitmap.h"
+#include "debug.h"
+#include "dir.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "ntfs.h"
+
+/**
+ * map_mft_record_page - map the page in which a specific mft record resides
+ * @ni:		ntfs inode whose mft record page to map
+ *
+ * This maps the page in which the mft record of the ntfs inode @ni is situated
+ * and returns a pointer to the mft record within the mapped page.
+ *
+ * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
+ * contains the negative error code returned.
+ */
+static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
+{
+	loff_t i_size;
+	ntfs_volume *vol = ni->vol;
+	struct inode *mft_vi = vol->mft_ino;
+	struct page *page;
+	unsigned long index, end_index;
+	unsigned ofs;
+
+	BUG_ON(ni->page);
+	/*
+	 * The index into the page cache and the offset within the page cache
+	 * page of the wanted mft record. FIXME: We need to check for
+	 * overflowing the unsigned long, but I don't think we would ever get
+	 * here if the volume was that big...
+	 */
+	index = (u64)ni->mft_no << vol->mft_record_size_bits >>
+			PAGE_CACHE_SHIFT;
+	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+
+	i_size = i_size_read(mft_vi);
+	/* The maximum valid index into the page cache for $MFT's data. */
+	end_index = i_size >> PAGE_CACHE_SHIFT;
+
+	/* If the wanted index is out of bounds the mft record doesn't exist. */
+	if (unlikely(index >= end_index)) {
+		if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
+				vol->mft_record_size) {
+			page = ERR_PTR(-ENOENT);
+			ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
+					"which is beyond the end of the mft.  "
+					"This is probably a bug in the ntfs "
+					"driver.", ni->mft_no);
+			goto err_out;
+		}
+	}
+	/* Read, map, and pin the page. */
+	page = ntfs_map_page(mft_vi->i_mapping, index);
+	if (likely(!IS_ERR(page))) {
+		/* Catch multi sector transfer fixup errors. */
+		if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
+				ofs)))) {
+			ni->page = page;
+			ni->page_ofs = ofs;
+			return page_address(page) + ofs;
+		}
+		ntfs_error(vol->sb, "Mft record 0x%lx is corrupt.  "
+				"Run chkdsk.", ni->mft_no);
+		ntfs_unmap_page(page);
+		page = ERR_PTR(-EIO);
+		NVolSetErrors(vol);
+	}
+err_out:
+	ni->page = NULL;
+	ni->page_ofs = 0;
+	return (void*)page;
+}
+
+/**
+ * map_mft_record - map, pin and lock an mft record
+ * @ni:		ntfs inode whose MFT record to map
+ *
+ * First, take the mrec_lock mutex.  We might now be sleeping, while waiting
+ * for the mutex if it was already locked by someone else.
+ *
+ * The page of the record is mapped using map_mft_record_page() before being
+ * returned to the caller.
+ *
+ * This in turn uses ntfs_map_page() to get the page containing the wanted mft
+ * record (it in turn calls read_cache_page() which reads it in from disk if
+ * necessary, increments the use count on the page so that it cannot disappear
+ * under us and returns a reference to the page cache page).
+ *
+ * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
+ * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
+ * and the post-read mst fixups on each mft record in the page have been
+ * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
+ * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
+ * ntfs_map_page() waits for PG_locked to become clear and checks if
+ * PG_uptodate is set and returns an error code if not. This provides
+ * sufficient protection against races when reading/using the page.
+ *
+ * However there is the write mapping to think about. Doing the above described
+ * checking here will be fine, because when initiating the write we will set
+ * PG_locked and clear PG_uptodate making sure nobody is touching the page
+ * contents. Doing the locking this way means that the commit to disk code in
+ * the page cache code paths is automatically sufficiently locked with us as
+ * we will not touch a page that has been locked or is not uptodate. The only
+ * locking problem then is them locking the page while we are accessing it.
+ *
+ * So that code will end up having to own the mrec_lock of all mft
+ * records/inodes present in the page before I/O can proceed. In that case we
+ * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
+ * accessing anything without owning the mrec_lock mutex.  But we do need to
+ * use them because of the read_cache_page() invocation and the code becomes so
+ * much simpler this way that it is well worth it.
+ *
+ * The mft record is now ours and we return a pointer to it. You need to check
+ * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
+ * the error code.
+ *
+ * NOTE: Caller is responsible for setting the mft record dirty before calling
+ * unmap_mft_record(). This is obviously only necessary if the caller really
+ * modified the mft record...
+ * Q: Do we want to recycle one of the VFS inode state bits instead?
+ * A: No, the inode ones mean we want to change the mft record, not we want to
+ * write it out.
+ */
+MFT_RECORD *map_mft_record(ntfs_inode *ni)
+{
+	MFT_RECORD *m;
+
+	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
+
+	/* Make sure the ntfs inode doesn't go away. */
+	atomic_inc(&ni->count);
+
+	/* Serialize access to this mft record. */
+	mutex_lock(&ni->mrec_lock);
+
+	m = map_mft_record_page(ni);
+	if (likely(!IS_ERR(m)))
+		return m;
+
+	mutex_unlock(&ni->mrec_lock);
+	atomic_dec(&ni->count);
+	ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
+	return m;
+}
+
+/**
+ * unmap_mft_record_page - unmap the page in which a specific mft record resides
+ * @ni:		ntfs inode whose mft record page to unmap
+ *
+ * This unmaps the page in which the mft record of the ntfs inode @ni is
+ * situated and returns. This is a NOOP if highmem is not configured.
+ *
+ * The unmap happens via ntfs_unmap_page() which in turn decrements the use
+ * count on the page thus releasing it from the pinned state.
+ *
+ * We do not actually unmap the page from memory of course, as that will be
+ * done by the page cache code itself when memory pressure increases or
+ * whatever.
+ */
+static inline void unmap_mft_record_page(ntfs_inode *ni)
+{
+	BUG_ON(!ni->page);
+
+	// TODO: If dirty, blah...
+	ntfs_unmap_page(ni->page);
+	ni->page = NULL;
+	ni->page_ofs = 0;
+	return;
+}
+
+/**
+ * unmap_mft_record - release a mapped mft record
+ * @ni:		ntfs inode whose MFT record to unmap
+ *
+ * We release the page mapping and the mrec_lock mutex which unmaps the mft
+ * record and releases it for others to get hold of. We also release the ntfs
+ * inode by decrementing the ntfs inode reference count.
+ *
+ * NOTE: If caller has modified the mft record, it is imperative to set the mft
+ * record dirty BEFORE calling unmap_mft_record().
+ */
+void unmap_mft_record(ntfs_inode *ni)
+{
+	struct page *page = ni->page;
+
+	BUG_ON(!page);
+
+	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
+
+	unmap_mft_record_page(ni);
+	mutex_unlock(&ni->mrec_lock);
+	atomic_dec(&ni->count);
+	/*
+	 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
+	 * ntfs_clear_extent_inode() in the extent inode case, and to the
+	 * caller in the non-extent, yet pure ntfs inode case, to do the actual
+	 * tear down of all structures and freeing of all allocated memory.
+	 */
+	return;
+}
+
+/**
+ * map_extent_mft_record - load an extent inode and attach it to its base
+ * @base_ni:	base ntfs inode
+ * @mref:	mft reference of the extent inode to load
+ * @ntfs_ino:	on successful return, pointer to the ntfs_inode structure
+ *
+ * Load the extent mft record @mref and attach it to its base inode @base_ni.
+ * Return the mapped extent mft record if IS_ERR(result) is false.  Otherwise
+ * PTR_ERR(result) gives the negative error code.
+ *
+ * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
+ * structure of the mapped extent inode.
+ */
+MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
+		ntfs_inode **ntfs_ino)
+{
+	MFT_RECORD *m;
+	ntfs_inode *ni = NULL;
+	ntfs_inode **extent_nis = NULL;
+	int i;
+	unsigned long mft_no = MREF(mref);
+	u16 seq_no = MSEQNO(mref);
+	bool destroy_ni = false;
+
+	ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
+			mft_no, base_ni->mft_no);
+	/* Make sure the base ntfs inode doesn't go away. */
+	atomic_inc(&base_ni->count);
+	/*
+	 * Check if this extent inode has already been added to the base inode,
+	 * in which case just return it. If not found, add it to the base
+	 * inode before returning it.
+	 */
+	mutex_lock(&base_ni->extent_lock);
+	if (base_ni->nr_extents > 0) {
+		extent_nis = base_ni->ext.extent_ntfs_inos;
+		for (i = 0; i < base_ni->nr_extents; i++) {
+			if (mft_no != extent_nis[i]->mft_no)
+				continue;
+			ni = extent_nis[i];
+			/* Make sure the ntfs inode doesn't go away. */
+			atomic_inc(&ni->count);
+			break;
+		}
+	}
+	if (likely(ni != NULL)) {
+		mutex_unlock(&base_ni->extent_lock);
+		atomic_dec(&base_ni->count);
+		/* We found the record; just have to map and return it. */
+		m = map_mft_record(ni);
+		/* map_mft_record() has incremented this on success. */
+		atomic_dec(&ni->count);
+		if (likely(!IS_ERR(m))) {
+			/* Verify the sequence number. */
+			if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
+				ntfs_debug("Done 1.");
+				*ntfs_ino = ni;
+				return m;
+			}
+			unmap_mft_record(ni);
+			ntfs_error(base_ni->vol->sb, "Found stale extent mft "
+					"reference! Corrupt filesystem. "
+					"Run chkdsk.");
+			return ERR_PTR(-EIO);
+		}
+map_err_out:
+		ntfs_error(base_ni->vol->sb, "Failed to map extent "
+				"mft record, error code %ld.", -PTR_ERR(m));
+		return m;
+	}
+	/* Record wasn't there. Get a new ntfs inode and initialize it. */
+	ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
+	if (unlikely(!ni)) {
+		mutex_unlock(&base_ni->extent_lock);
+		atomic_dec(&base_ni->count);
+		return ERR_PTR(-ENOMEM);
+	}
+	ni->vol = base_ni->vol;
+	ni->seq_no = seq_no;
+	ni->nr_extents = -1;
+	ni->ext.base_ntfs_ino = base_ni;
+	/* Now map the record. */
+	m = map_mft_record(ni);
+	if (IS_ERR(m)) {
+		mutex_unlock(&base_ni->extent_lock);
+		atomic_dec(&base_ni->count);
+		ntfs_clear_extent_inode(ni);
+		goto map_err_out;
+	}
+	/* Verify the sequence number if it is present. */
+	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
+		ntfs_error(base_ni->vol->sb, "Found stale extent mft "
+				"reference! Corrupt filesystem. Run chkdsk.");
+		destroy_ni = true;
+		m = ERR_PTR(-EIO);
+		goto unm_err_out;
+	}
+	/* Attach extent inode to base inode, reallocating memory if needed. */
+	if (!(base_ni->nr_extents & 3)) {
+		ntfs_inode **tmp;
+		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
+
+		tmp = kmalloc(new_size, GFP_NOFS);
+		if (unlikely(!tmp)) {
+			ntfs_error(base_ni->vol->sb, "Failed to allocate "
+					"internal buffer.");
+			destroy_ni = true;
+			m = ERR_PTR(-ENOMEM);
+			goto unm_err_out;
+		}
+		if (base_ni->nr_extents) {
+			BUG_ON(!base_ni->ext.extent_ntfs_inos);
+			memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
+					4 * sizeof(ntfs_inode *));
+			kfree(base_ni->ext.extent_ntfs_inos);
+		}
+		base_ni->ext.extent_ntfs_inos = tmp;
+	}
+	base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
+	mutex_unlock(&base_ni->extent_lock);
+	atomic_dec(&base_ni->count);
+	ntfs_debug("Done 2.");
+	*ntfs_ino = ni;
+	return m;
+unm_err_out:
+	unmap_mft_record(ni);
+	mutex_unlock(&base_ni->extent_lock);
+	atomic_dec(&base_ni->count);
+	/*
+	 * If the extent inode was not attached to the base inode we need to
+	 * release it or we will leak memory.
+	 */
+	if (destroy_ni)
+		ntfs_clear_extent_inode(ni);
+	return m;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * __mark_mft_record_dirty - set the mft record and the page containing it dirty
+ * @ni:		ntfs inode describing the mapped mft record
+ *
+ * Internal function.  Users should call mark_mft_record_dirty() instead.
+ *
+ * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
+ * as well as the page containing the mft record, dirty.  Also, mark the base
+ * vfs inode dirty.  This ensures that any changes to the mft record are
+ * written out to disk.
+ *
+ * NOTE:  We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
+ * on the base vfs inode, because even though file data may have been modified,
+ * it is dirty in the inode meta data rather than the data page cache of the
+ * inode, and thus there are no data pages that need writing out.  Therefore, a
+ * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
+ * other hand, is not sufficient, because ->write_inode needs to be called even
+ * in case of fdatasync. This needs to happen or the file data would not
+ * necessarily hit the device synchronously, even though the vfs inode has the
+ * O_SYNC flag set.  Also, I_DIRTY_DATASYNC simply "feels" better than just
+ * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
+ * which is not what I_DIRTY_SYNC on its own would suggest.
+ */
+void __mark_mft_record_dirty(ntfs_inode *ni)
+{
+	ntfs_inode *base_ni;
+
+	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+	BUG_ON(NInoAttr(ni));
+	mark_ntfs_record_dirty(ni->page, ni->page_ofs);
+	/* Determine the base vfs inode and mark it dirty, too. */
+	mutex_lock(&ni->extent_lock);
+	if (likely(ni->nr_extents >= 0))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	mutex_unlock(&ni->extent_lock);
+	__mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+}
+
+static const char *ntfs_please_email = "Please email "
+		"linux-ntfs-dev@lists.sourceforge.net and say that you saw "
+		"this message.  Thank you.";
+
+/**
+ * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
+ * @vol:	ntfs volume on which the mft record to synchronize resides
+ * @mft_no:	mft record number of mft record to synchronize
+ * @m:		mapped, mst protected (extent) mft record to synchronize
+ *
+ * Write the mapped, mst protected (extent) mft record @m with mft record
+ * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
+ * bypassing the page cache and the $MFTMirr inode itself.
+ *
+ * This function is only for use at umount time when the mft mirror inode has
+ * already been disposed off.  We BUG() if we are called while the mft mirror
+ * inode is still attached to the volume.
+ *
+ * On success return 0.  On error return -errno.
+ *
+ * NOTE:  This function is not implemented yet as I am not convinced it can
+ * actually be triggered considering the sequence of commits we do in super.c::
+ * ntfs_put_super().  But just in case we provide this place holder as the
+ * alternative would be either to BUG() or to get a NULL pointer dereference
+ * and Oops.
+ */
+static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
+		const unsigned long mft_no, MFT_RECORD *m)
+{
+	BUG_ON(vol->mftmirr_ino);
+	ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
+			"implemented yet.  %s", ntfs_please_email);
+	return -EOPNOTSUPP;
+}
+
+/**
+ * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
+ * @vol:	ntfs volume on which the mft record to synchronize resides
+ * @mft_no:	mft record number of mft record to synchronize
+ * @m:		mapped, mst protected (extent) mft record to synchronize
+ * @sync:	if true, wait for i/o completion
+ *
+ * Write the mapped, mst protected (extent) mft record @m with mft record
+ * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
+ *
+ * On success return 0.  On error return -errno and set the volume errors flag
+ * in the ntfs volume @vol.
+ *
+ * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
+ *
+ * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
+ * schedule i/o via ->writepage or do it via kntfsd or whatever.
+ */
+int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
+		MFT_RECORD *m, int sync)
+{
+	struct page *page;
+	unsigned int blocksize = vol->sb->s_blocksize;
+	int max_bhs = vol->mft_record_size / blocksize;
+	struct buffer_head *bhs[max_bhs];
+	struct buffer_head *bh, *head;
+	u8 *kmirr;
+	runlist_element *rl;
+	unsigned int block_start, block_end, m_start, m_end, page_ofs;
+	int i_bhs, nr_bhs, err = 0;
+	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
+
+	ntfs_debug("Entering for inode 0x%lx.", mft_no);
+	BUG_ON(!max_bhs);
+	if (unlikely(!vol->mftmirr_ino)) {
+		/* This could happen during umount... */
+		err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
+		if (likely(!err))
+			return err;
+		goto err_out;
+	}
+	/* Get the page containing the mirror copy of the mft record @m. */
+	page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
+			(PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
+	if (IS_ERR(page)) {
+		ntfs_error(vol->sb, "Failed to map mft mirror page.");
+		err = PTR_ERR(page);
+		goto err_out;
+	}
+	lock_page(page);
+	BUG_ON(!PageUptodate(page));
+	ClearPageUptodate(page);
+	/* Offset of the mft mirror record inside the page. */
+	page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+	/* The address in the page of the mirror copy of the mft record @m. */
+	kmirr = page_address(page) + page_ofs;
+	/* Copy the mst protected mft record to the mirror. */
+	memcpy(kmirr, m, vol->mft_record_size);
+	/* Create uptodate buffers if not present. */
+	if (unlikely(!page_has_buffers(page))) {
+		struct buffer_head *tail;
+
+		bh = head = alloc_page_buffers(page, blocksize, 1);
+		do {
+			set_buffer_uptodate(bh);
+			tail = bh;
+			bh = bh->b_this_page;
+		} while (bh);
+		tail->b_this_page = head;
+		attach_page_buffers(page, head);
+	}
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
+	rl = NULL;
+	nr_bhs = 0;
+	block_start = 0;
+	m_start = kmirr - (u8*)page_address(page);
+	m_end = m_start + vol->mft_record_size;
+	do {
+		block_end = block_start + blocksize;
+		/* If the buffer is outside the mft record, skip it. */
+		if (block_end <= m_start)
+			continue;
+		if (unlikely(block_start >= m_end))
+			break;
+		/* Need to map the buffer if it is not mapped already. */
+		if (unlikely(!buffer_mapped(bh))) {
+			VCN vcn;
+			LCN lcn;
+			unsigned int vcn_ofs;
+
+			bh->b_bdev = vol->sb->s_bdev;
+			/* Obtain the vcn and offset of the current block. */
+			vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
+					(block_start - m_start);
+			vcn_ofs = vcn & vol->cluster_size_mask;
+			vcn >>= vol->cluster_size_bits;
+			if (!rl) {
+				down_read(&NTFS_I(vol->mftmirr_ino)->
+						runlist.lock);
+				rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
+				/*
+				 * $MFTMirr always has the whole of its runlist
+				 * in memory.
+				 */
+				BUG_ON(!rl);
+			}
+			/* Seek to element containing target vcn. */
+			while (rl->length && rl[1].vcn <= vcn)
+				rl++;
+			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+			/* For $MFTMirr, only lcn >= 0 is a successful remap. */
+			if (likely(lcn >= 0)) {
+				/* Setup buffer head to correct block. */
+				bh->b_blocknr = ((lcn <<
+						vol->cluster_size_bits) +
+						vcn_ofs) >> blocksize_bits;
+				set_buffer_mapped(bh);
+			} else {
+				bh->b_blocknr = -1;
+				ntfs_error(vol->sb, "Cannot write mft mirror "
+						"record 0x%lx because its "
+						"location on disk could not "
+						"be determined (error code "
+						"%lli).", mft_no,
+						(long long)lcn);
+				err = -EIO;
+			}
+		}
+		BUG_ON(!buffer_uptodate(bh));
+		BUG_ON(!nr_bhs && (m_start != block_start));
+		BUG_ON(nr_bhs >= max_bhs);
+		bhs[nr_bhs++] = bh;
+		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
+	} while (block_start = block_end, (bh = bh->b_this_page) != head);
+	if (unlikely(rl))
+		up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
+	if (likely(!err)) {
+		/* Lock buffers and start synchronous write i/o on them. */
+		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+			struct buffer_head *tbh = bhs[i_bhs];
+
+			if (!trylock_buffer(tbh))
+				BUG();
+			BUG_ON(!buffer_uptodate(tbh));
+			clear_buffer_dirty(tbh);
+			get_bh(tbh);
+			tbh->b_end_io = end_buffer_write_sync;
+			submit_bh(WRITE, tbh);
+		}
+		/* Wait on i/o completion of buffers. */
+		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+			struct buffer_head *tbh = bhs[i_bhs];
+
+			wait_on_buffer(tbh);
+			if (unlikely(!buffer_uptodate(tbh))) {
+				err = -EIO;
+				/*
+				 * Set the buffer uptodate so the page and
+				 * buffer states do not become out of sync.
+				 */
+				set_buffer_uptodate(tbh);
+			}
+		}
+	} else /* if (unlikely(err)) */ {
+		/* Clean the buffers. */
+		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
+			clear_buffer_dirty(bhs[i_bhs]);
+	}
+	/* Current state: all buffers are clean, unlocked, and uptodate. */
+	/* Remove the mst protection fixups again. */
+	post_write_mst_fixup((NTFS_RECORD*)kmirr);
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	unlock_page(page);
+	ntfs_unmap_page(page);
+	if (likely(!err)) {
+		ntfs_debug("Done.");
+	} else {
+		ntfs_error(vol->sb, "I/O error while writing mft mirror "
+				"record 0x%lx!", mft_no);
+err_out:
+		ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
+				"code %i).  Volume will be left marked dirty "
+				"on umount.  Run ntfsfix on the partition "
+				"after umounting to correct this.", -err);
+		NVolSetErrors(vol);
+	}
+	return err;
+}
+
+/**
+ * write_mft_record_nolock - write out a mapped (extent) mft record
+ * @ni:		ntfs inode describing the mapped (extent) mft record
+ * @m:		mapped (extent) mft record to write
+ * @sync:	if true, wait for i/o completion
+ *
+ * Write the mapped (extent) mft record @m described by the (regular or extent)
+ * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
+ * the mft mirror, that is also updated.
+ *
+ * We only write the mft record if the ntfs inode @ni is dirty and the first
+ * buffer belonging to its mft record is dirty, too.  We ignore the dirty state
+ * of subsequent buffers because we could have raced with
+ * fs/ntfs/aops.c::mark_ntfs_record_dirty().
+ *
+ * On success, clean the mft record and return 0.  On error, leave the mft
+ * record dirty and return -errno.
+ *
+ * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
+ * However, if the mft record has a counterpart in the mft mirror and @sync is
+ * true, we write the mft record, wait for i/o completion, and only then write
+ * the mft mirror copy.  This ensures that if the system crashes either the mft
+ * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
+ * false on the other hand, we start i/o on both and then wait for completion
+ * on them.  This provides a speedup but no longer guarantees that you will end
+ * up with a self-consistent mft record in the case of a crash but if you asked
+ * for asynchronous writing you probably do not care about that anyway.
+ *
+ * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
+ * schedule i/o via ->writepage or do it via kntfsd or whatever.
+ */
+int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
+{
+	ntfs_volume *vol = ni->vol;
+	struct page *page = ni->page;
+	unsigned int blocksize = vol->sb->s_blocksize;
+	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
+	int max_bhs = vol->mft_record_size / blocksize;
+	struct buffer_head *bhs[max_bhs];
+	struct buffer_head *bh, *head;
+	runlist_element *rl;
+	unsigned int block_start, block_end, m_start, m_end;
+	int i_bhs, nr_bhs, err = 0;
+
+	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+	BUG_ON(NInoAttr(ni));
+	BUG_ON(!max_bhs);
+	BUG_ON(!PageLocked(page));
+	/*
+	 * If the ntfs_inode is clean no need to do anything.  If it is dirty,
+	 * mark it as clean now so that it can be redirtied later on if needed.
+	 * There is no danger of races since the caller is holding the locks
+	 * for the mft record @m and the page it is in.
+	 */
+	if (!NInoTestClearDirty(ni))
+		goto done;
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
+	rl = NULL;
+	nr_bhs = 0;
+	block_start = 0;
+	m_start = ni->page_ofs;
+	m_end = m_start + vol->mft_record_size;
+	do {
+		block_end = block_start + blocksize;
+		/* If the buffer is outside the mft record, skip it. */
+		if (block_end <= m_start)
+			continue;
+		if (unlikely(block_start >= m_end))
+			break;
+		/*
+		 * If this block is not the first one in the record, we ignore
+		 * the buffer's dirty state because we could have raced with a
+		 * parallel mark_ntfs_record_dirty().
+		 */
+		if (block_start == m_start) {
+			/* This block is the first one in the record. */
+			if (!buffer_dirty(bh)) {
+				BUG_ON(nr_bhs);
+				/* Clean records are not written out. */
+				break;
+			}
+		}
+		/* Need to map the buffer if it is not mapped already. */
+		if (unlikely(!buffer_mapped(bh))) {
+			VCN vcn;
+			LCN lcn;
+			unsigned int vcn_ofs;
+
+			bh->b_bdev = vol->sb->s_bdev;
+			/* Obtain the vcn and offset of the current block. */
+			vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
+					(block_start - m_start);
+			vcn_ofs = vcn & vol->cluster_size_mask;
+			vcn >>= vol->cluster_size_bits;
+			if (!rl) {
+				down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
+				rl = NTFS_I(vol->mft_ino)->runlist.rl;
+				BUG_ON(!rl);
+			}
+			/* Seek to element containing target vcn. */
+			while (rl->length && rl[1].vcn <= vcn)
+				rl++;
+			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+			/* For $MFT, only lcn >= 0 is a successful remap. */
+			if (likely(lcn >= 0)) {
+				/* Setup buffer head to correct block. */
+				bh->b_blocknr = ((lcn <<
+						vol->cluster_size_bits) +
+						vcn_ofs) >> blocksize_bits;
+				set_buffer_mapped(bh);
+			} else {
+				bh->b_blocknr = -1;
+				ntfs_error(vol->sb, "Cannot write mft record "
+						"0x%lx because its location "
+						"on disk could not be "
+						"determined (error code %lli).",
+						ni->mft_no, (long long)lcn);
+				err = -EIO;
+			}
+		}
+		BUG_ON(!buffer_uptodate(bh));
+		BUG_ON(!nr_bhs && (m_start != block_start));
+		BUG_ON(nr_bhs >= max_bhs);
+		bhs[nr_bhs++] = bh;
+		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
+	} while (block_start = block_end, (bh = bh->b_this_page) != head);
+	if (unlikely(rl))
+		up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
+	if (!nr_bhs)
+		goto done;
+	if (unlikely(err))
+		goto cleanup_out;
+	/* Apply the mst protection fixups. */
+	err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
+	if (err) {
+		ntfs_error(vol->sb, "Failed to apply mst fixups!");
+		goto cleanup_out;
+	}
+	flush_dcache_mft_record_page(ni);
+	/* Lock buffers and start synchronous write i/o on them. */
+	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+		struct buffer_head *tbh = bhs[i_bhs];
+
+		if (!trylock_buffer(tbh))
+			BUG();
+		BUG_ON(!buffer_uptodate(tbh));
+		clear_buffer_dirty(tbh);
+		get_bh(tbh);
+		tbh->b_end_io = end_buffer_write_sync;
+		submit_bh(WRITE, tbh);
+	}
+	/* Synchronize the mft mirror now if not @sync. */
+	if (!sync && ni->mft_no < vol->mftmirr_size)
+		ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
+	/* Wait on i/o completion of buffers. */
+	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+		struct buffer_head *tbh = bhs[i_bhs];
+
+		wait_on_buffer(tbh);
+		if (unlikely(!buffer_uptodate(tbh))) {
+			err = -EIO;
+			/*
+			 * Set the buffer uptodate so the page and buffer
+			 * states do not become out of sync.
+			 */
+			if (PageUptodate(page))
+				set_buffer_uptodate(tbh);
+		}
+	}
+	/* If @sync, now synchronize the mft mirror. */
+	if (sync && ni->mft_no < vol->mftmirr_size)
+		ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
+	/* Remove the mst protection fixups again. */
+	post_write_mst_fixup((NTFS_RECORD*)m);
+	flush_dcache_mft_record_page(ni);
+	if (unlikely(err)) {
+		/* I/O error during writing.  This is really bad! */
+		ntfs_error(vol->sb, "I/O error while writing mft record "
+				"0x%lx!  Marking base inode as bad.  You "
+				"should unmount the volume and run chkdsk.",
+				ni->mft_no);
+		goto err_out;
+	}
+done:
+	ntfs_debug("Done.");
+	return 0;
+cleanup_out:
+	/* Clean the buffers. */
+	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
+		clear_buffer_dirty(bhs[i_bhs]);
+err_out:
+	/*
+	 * Current state: all buffers are clean, unlocked, and uptodate.
+	 * The caller should mark the base inode as bad so that no more i/o
+	 * happens.  ->clear_inode() will still be invoked so all extent inodes
+	 * and other allocated memory will be freed.
+	 */
+	if (err == -ENOMEM) {
+		ntfs_error(vol->sb, "Not enough memory to write mft record.  "
+				"Redirtying so the write is retried later.");
+		mark_mft_record_dirty(ni);
+		err = 0;
+	} else
+		NVolSetErrors(vol);
+	return err;
+}
+
+/**
+ * ntfs_may_write_mft_record - check if an mft record may be written out
+ * @vol:	[IN]  ntfs volume on which the mft record to check resides
+ * @mft_no:	[IN]  mft record number of the mft record to check
+ * @m:		[IN]  mapped mft record to check
+ * @locked_ni:	[OUT] caller has to unlock this ntfs inode if one is returned
+ *
+ * Check if the mapped (base or extent) mft record @m with mft record number
+ * @mft_no belonging to the ntfs volume @vol may be written out.  If necessary
+ * and possible the ntfs inode of the mft record is locked and the base vfs
+ * inode is pinned.  The locked ntfs inode is then returned in @locked_ni.  The
+ * caller is responsible for unlocking the ntfs inode and unpinning the base
+ * vfs inode.
+ *
+ * Return 'true' if the mft record may be written out and 'false' if not.
+ *
+ * The caller has locked the page and cleared the uptodate flag on it which
+ * means that we can safely write out any dirty mft records that do not have
+ * their inodes in icache as determined by ilookup5() as anyone
+ * opening/creating such an inode would block when attempting to map the mft
+ * record in read_cache_page() until we are finished with the write out.
+ *
+ * Here is a description of the tests we perform:
+ *
+ * If the inode is found in icache we know the mft record must be a base mft
+ * record.  If it is dirty, we do not write it and return 'false' as the vfs
+ * inode write paths will result in the access times being updated which would
+ * cause the base mft record to be redirtied and written out again.  (We know
+ * the access time update will modify the base mft record because Windows
+ * chkdsk complains if the standard information attribute is not in the base
+ * mft record.)
+ *
+ * If the inode is in icache and not dirty, we attempt to lock the mft record
+ * and if we find the lock was already taken, it is not safe to write the mft
+ * record and we return 'false'.
+ *
+ * If we manage to obtain the lock we have exclusive access to the mft record,
+ * which also allows us safe writeout of the mft record.  We then set
+ * @locked_ni to the locked ntfs inode and return 'true'.
+ *
+ * Note we cannot just lock the mft record and sleep while waiting for the lock
+ * because this would deadlock due to lock reversal (normally the mft record is
+ * locked before the page is locked but we already have the page locked here
+ * when we try to lock the mft record).
+ *
+ * If the inode is not in icache we need to perform further checks.
+ *
+ * If the mft record is not a FILE record or it is a base mft record, we can
+ * safely write it and return 'true'.
+ *
+ * We now know the mft record is an extent mft record.  We check if the inode
+ * corresponding to its base mft record is in icache and obtain a reference to
+ * it if it is.  If it is not, we can safely write it and return 'true'.
+ *
+ * We now have the base inode for the extent mft record.  We check if it has an
+ * ntfs inode for the extent mft record attached and if not it is safe to write
+ * the extent mft record and we return 'true'.
+ *
+ * The ntfs inode for the extent mft record is attached to the base inode so we
+ * attempt to lock the extent mft record and if we find the lock was already
+ * taken, it is not safe to write the extent mft record and we return 'false'.
+ *
+ * If we manage to obtain the lock we have exclusive access to the extent mft
+ * record, which also allows us safe writeout of the extent mft record.  We
+ * set the ntfs inode of the extent mft record clean and then set @locked_ni to
+ * the now locked ntfs inode and return 'true'.
+ *
+ * Note, the reason for actually writing dirty mft records here and not just
+ * relying on the vfs inode dirty code paths is that we can have mft records
+ * modified without them ever having actual inodes in memory.  Also we can have
+ * dirty mft records with clean ntfs inodes in memory.  None of the described
+ * cases would result in the dirty mft records being written out if we only
+ * relied on the vfs inode dirty code paths.  And these cases can really occur
+ * during allocation of new mft records and in particular when the
+ * initialized_size of the $MFT/$DATA attribute is extended and the new space
+ * is initialized using ntfs_mft_record_format().  The clean inode can then
+ * appear if the mft record is reused for a new inode before it got written
+ * out.
+ */
+bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
+		const MFT_RECORD *m, ntfs_inode **locked_ni)
+{
+	struct super_block *sb = vol->sb;
+	struct inode *mft_vi = vol->mft_ino;
+	struct inode *vi;
+	ntfs_inode *ni, *eni, **extent_nis;
+	int i;
+	ntfs_attr na;
+
+	ntfs_debug("Entering for inode 0x%lx.", mft_no);
+	/*
+	 * Normally we do not return a locked inode so set @locked_ni to NULL.
+	 */
+	BUG_ON(!locked_ni);
+	*locked_ni = NULL;
+	/*
+	 * Check if the inode corresponding to this mft record is in the VFS
+	 * inode cache and obtain a reference to it if it is.
+	 */
+	ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
+	na.mft_no = mft_no;
+	na.name = NULL;
+	na.name_len = 0;
+	na.type = AT_UNUSED;
+	/*
+	 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
+	 * we get here for it rather often.
+	 */
+	if (!mft_no) {
+		/* Balance the below iput(). */
+		vi = igrab(mft_vi);
+		BUG_ON(vi != mft_vi);
+	} else {
+		/*
+		 * Have to use ilookup5_nowait() since ilookup5() waits for the
+		 * inode lock which causes ntfs to deadlock when a concurrent
+		 * inode write via the inode dirty code paths and the page
+		 * dirty code path of the inode dirty code path when writing
+		 * $MFT occurs.
+		 */
+		vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
+	}
+	if (vi) {
+		ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
+		/* The inode is in icache. */
+		ni = NTFS_I(vi);
+		/* Take a reference to the ntfs inode. */
+		atomic_inc(&ni->count);
+		/* If the inode is dirty, do not write this record. */
+		if (NInoDirty(ni)) {
+			ntfs_debug("Inode 0x%lx is dirty, do not write it.",
+					mft_no);
+			atomic_dec(&ni->count);
+			iput(vi);
+			return false;
+		}
+		ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
+		/* The inode is not dirty, try to take the mft record lock. */
+		if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
+			ntfs_debug("Mft record 0x%lx is already locked, do "
+					"not write it.", mft_no);
+			atomic_dec(&ni->count);
+			iput(vi);
+			return false;
+		}
+		ntfs_debug("Managed to lock mft record 0x%lx, write it.",
+				mft_no);
+		/*
+		 * The write has to occur while we hold the mft record lock so
+		 * return the locked ntfs inode.
+		 */
+		*locked_ni = ni;
+		return true;
+	}
+	ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
+	/* The inode is not in icache. */
+	/* Write the record if it is not a mft record (type "FILE"). */
+	if (!ntfs_is_mft_record(m->magic)) {
+		ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
+				mft_no);
+		return true;
+	}
+	/* Write the mft record if it is a base inode. */
+	if (!m->base_mft_record) {
+		ntfs_debug("Mft record 0x%lx is a base record, write it.",
+				mft_no);
+		return true;
+	}
+	/*
+	 * This is an extent mft record.  Check if the inode corresponding to
+	 * its base mft record is in icache and obtain a reference to it if it
+	 * is.
+	 */
+	na.mft_no = MREF_LE(m->base_mft_record);
+	ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
+			"inode 0x%lx in icache.", mft_no, na.mft_no);
+	if (!na.mft_no) {
+		/* Balance the below iput(). */
+		vi = igrab(mft_vi);
+		BUG_ON(vi != mft_vi);
+	} else
+		vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
+				&na);
+	if (!vi) {
+		/*
+		 * The base inode is not in icache, write this extent mft
+		 * record.
+		 */
+		ntfs_debug("Base inode 0x%lx is not in icache, write the "
+				"extent record.", na.mft_no);
+		return true;
+	}
+	ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
+	/*
+	 * The base inode is in icache.  Check if it has the extent inode
+	 * corresponding to this extent mft record attached.
+	 */
+	ni = NTFS_I(vi);
+	mutex_lock(&ni->extent_lock);
+	if (ni->nr_extents <= 0) {
+		/*
+		 * The base inode has no attached extent inodes, write this
+		 * extent mft record.
+		 */
+		mutex_unlock(&ni->extent_lock);
+		iput(vi);
+		ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
+				"write the extent record.", na.mft_no);
+		return true;
+	}
+	/* Iterate over the attached extent inodes. */
+	extent_nis = ni->ext.extent_ntfs_inos;
+	for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
+		if (mft_no == extent_nis[i]->mft_no) {
+			/*
+			 * Found the extent inode corresponding to this extent
+			 * mft record.
+			 */
+			eni = extent_nis[i];
+			break;
+		}
+	}
+	/*
+	 * If the extent inode was not attached to the base inode, write this
+	 * extent mft record.
+	 */
+	if (!eni) {
+		mutex_unlock(&ni->extent_lock);
+		iput(vi);
+		ntfs_debug("Extent inode 0x%lx is not attached to its base "
+				"inode 0x%lx, write the extent record.",
+				mft_no, na.mft_no);
+		return true;
+	}
+	ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
+			mft_no, na.mft_no);
+	/* Take a reference to the extent ntfs inode. */
+	atomic_inc(&eni->count);
+	mutex_unlock(&ni->extent_lock);
+	/*
+	 * Found the extent inode coresponding to this extent mft record.
+	 * Try to take the mft record lock.
+	 */
+	if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
+		atomic_dec(&eni->count);
+		iput(vi);
+		ntfs_debug("Extent mft record 0x%lx is already locked, do "
+				"not write it.", mft_no);
+		return false;
+	}
+	ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
+			mft_no);
+	if (NInoTestClearDirty(eni))
+		ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
+				mft_no);
+	/*
+	 * The write has to occur while we hold the mft record lock so return
+	 * the locked extent ntfs inode.
+	 */
+	*locked_ni = eni;
+	return true;
+}
+
+static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
+		"chkdsk.";
+
+/**
+ * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
+ * @vol:	volume on which to search for a free mft record
+ * @base_ni:	open base inode if allocating an extent mft record or NULL
+ *
+ * Search for a free mft record in the mft bitmap attribute on the ntfs volume
+ * @vol.
+ *
+ * If @base_ni is NULL start the search at the default allocator position.
+ *
+ * If @base_ni is not NULL start the search at the mft record after the base
+ * mft record @base_ni.
+ *
+ * Return the free mft record on success and -errno on error.  An error code of
+ * -ENOSPC means that there are no free mft records in the currently
+ * initialized mft bitmap.
+ *
+ * Locking: Caller must hold vol->mftbmp_lock for writing.
+ */
+static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
+		ntfs_inode *base_ni)
+{
+	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
+	unsigned long flags;
+	struct address_space *mftbmp_mapping;
+	u8 *buf, *byte;
+	struct page *page;
+	unsigned int page_ofs, size;
+	u8 pass, b;
+
+	ntfs_debug("Searching for free mft record in the currently "
+			"initialized mft bitmap.");
+	mftbmp_mapping = vol->mftbmp_ino->i_mapping;
+	/*
+	 * Set the end of the pass making sure we do not overflow the mft
+	 * bitmap.
+	 */
+	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
+	pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
+			vol->mft_record_size_bits;
+	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
+	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
+	ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
+	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
+	if (pass_end > ll)
+		pass_end = ll;
+	pass = 1;
+	if (!base_ni)
+		data_pos = vol->mft_data_pos;
+	else
+		data_pos = base_ni->mft_no + 1;
+	if (data_pos < 24)
+		data_pos = 24;
+	if (data_pos >= pass_end) {
+		data_pos = 24;
+		pass = 2;
+		/* This happens on a freshly formatted volume. */
+		if (data_pos >= pass_end)
+			return -ENOSPC;
+	}
+	pass_start = data_pos;
+	ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
+			"pass_end 0x%llx, data_pos 0x%llx.", pass,
+			(long long)pass_start, (long long)pass_end,
+			(long long)data_pos);
+	/* Loop until a free mft record is found. */
+	for (; pass <= 2;) {
+		/* Cap size to pass_end. */
+		ofs = data_pos >> 3;
+		page_ofs = ofs & ~PAGE_CACHE_MASK;
+		size = PAGE_CACHE_SIZE - page_ofs;
+		ll = ((pass_end + 7) >> 3) - ofs;
+		if (size > ll)
+			size = ll;
+		size <<= 3;
+		/*
+		 * If we are still within the active pass, search the next page
+		 * for a zero bit.
+		 */
+		if (size) {
+			page = ntfs_map_page(mftbmp_mapping,
+					ofs >> PAGE_CACHE_SHIFT);
+			if (IS_ERR(page)) {
+				ntfs_error(vol->sb, "Failed to read mft "
+						"bitmap, aborting.");
+				return PTR_ERR(page);
+			}
+			buf = (u8*)page_address(page) + page_ofs;
+			bit = data_pos & 7;
+			data_pos &= ~7ull;
+			ntfs_debug("Before inner for loop: size 0x%x, "
+					"data_pos 0x%llx, bit 0x%llx", size,
+					(long long)data_pos, (long long)bit);
+			for (; bit < size && data_pos + bit < pass_end;
+					bit &= ~7ull, bit += 8) {
+				byte = buf + (bit >> 3);
+				if (*byte == 0xff)
+					continue;
+				b = ffz((unsigned long)*byte);
+				if (b < 8 && b >= (bit & 7)) {
+					ll = data_pos + (bit & ~7ull) + b;
+					if (unlikely(ll > (1ll << 32))) {
+						ntfs_unmap_page(page);
+						return -ENOSPC;
+					}
+					*byte |= 1 << b;
+					flush_dcache_page(page);
+					set_page_dirty(page);
+					ntfs_unmap_page(page);
+					ntfs_debug("Done.  (Found and "
+							"allocated mft record "
+							"0x%llx.)",
+							(long long)ll);
+					return ll;
+				}
+			}
+			ntfs_debug("After inner for loop: size 0x%x, "
+					"data_pos 0x%llx, bit 0x%llx", size,
+					(long long)data_pos, (long long)bit);
+			data_pos += size;
+			ntfs_unmap_page(page);
+			/*
+			 * If the end of the pass has not been reached yet,
+			 * continue searching the mft bitmap for a zero bit.
+			 */
+			if (data_pos < pass_end)
+				continue;
+		}
+		/* Do the next pass. */
+		if (++pass == 2) {
+			/*
+			 * Starting the second pass, in which we scan the first
+			 * part of the zone which we omitted earlier.
+			 */
+			pass_end = pass_start;
+			data_pos = pass_start = 24;
+			ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
+					"0x%llx.", pass, (long long)pass_start,
+					(long long)pass_end);
+			if (data_pos >= pass_end)
+				break;
+		}
+	}
+	/* No free mft records in currently initialized mft bitmap. */
+	ntfs_debug("Done.  (No free mft records left in currently initialized "
+			"mft bitmap.)");
+	return -ENOSPC;
+}
+
+/**
+ * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
+ * @vol:	volume on which to extend the mft bitmap attribute
+ *
+ * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
+ *
+ * Note: Only changes allocated_size, i.e. does not touch initialized_size or
+ * data_size.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - Caller must hold vol->mftbmp_lock for writing.
+ *	    - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
+ *	      writing and releases it before returning.
+ *	    - This function takes vol->lcnbmp_lock for writing and releases it
+ *	      before returning.
+ */
+static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
+{
+	LCN lcn;
+	s64 ll;
+	unsigned long flags;
+	struct page *page;
+	ntfs_inode *mft_ni, *mftbmp_ni;
+	runlist_element *rl, *rl2 = NULL;
+	ntfs_attr_search_ctx *ctx = NULL;
+	MFT_RECORD *mrec;
+	ATTR_RECORD *a = NULL;
+	int ret, mp_size;
+	u32 old_alen = 0;
+	u8 *b, tb;
+	struct {
+		u8 added_cluster:1;
+		u8 added_run:1;
+		u8 mp_rebuilt:1;
+	} status = { 0, 0, 0 };
+
+	ntfs_debug("Extending mft bitmap allocation.");
+	mft_ni = NTFS_I(vol->mft_ino);
+	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
+	/*
+	 * Determine the last lcn of the mft bitmap.  The allocated size of the
+	 * mft bitmap cannot be zero so we are ok to do this.
+	 */
+	down_write(&mftbmp_ni->runlist.lock);
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	ll = mftbmp_ni->allocated_size;
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
+			(ll - 1) >> vol->cluster_size_bits, NULL);
+	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+		up_write(&mftbmp_ni->runlist.lock);
+		ntfs_error(vol->sb, "Failed to determine last allocated "
+				"cluster of mft bitmap attribute.");
+		if (!IS_ERR(rl))
+			ret = -EIO;
+		else
+			ret = PTR_ERR(rl);
+		return ret;
+	}
+	lcn = rl->lcn + rl->length;
+	ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
+			(long long)lcn);
+	/*
+	 * Attempt to get the cluster following the last allocated cluster by
+	 * hand as it may be in the MFT zone so the allocator would not give it
+	 * to us.
+	 */
+	ll = lcn >> 3;
+	page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
+			ll >> PAGE_CACHE_SHIFT);
+	if (IS_ERR(page)) {
+		up_write(&mftbmp_ni->runlist.lock);
+		ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
+		return PTR_ERR(page);
+	}
+	b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK);
+	tb = 1 << (lcn & 7ull);
+	down_write(&vol->lcnbmp_lock);
+	if (*b != 0xff && !(*b & tb)) {
+		/* Next cluster is free, allocate it. */
+		*b |= tb;
+		flush_dcache_page(page);
+		set_page_dirty(page);
+		up_write(&vol->lcnbmp_lock);
+		ntfs_unmap_page(page);
+		/* Update the mft bitmap runlist. */
+		rl->length++;
+		rl[1].vcn++;
+		status.added_cluster = 1;
+		ntfs_debug("Appending one cluster to mft bitmap.");
+	} else {
+		up_write(&vol->lcnbmp_lock);
+		ntfs_unmap_page(page);
+		/* Allocate a cluster from the DATA_ZONE. */
+		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
+				true);
+		if (IS_ERR(rl2)) {
+			up_write(&mftbmp_ni->runlist.lock);
+			ntfs_error(vol->sb, "Failed to allocate a cluster for "
+					"the mft bitmap.");
+			return PTR_ERR(rl2);
+		}
+		rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
+		if (IS_ERR(rl)) {
+			up_write(&mftbmp_ni->runlist.lock);
+			ntfs_error(vol->sb, "Failed to merge runlists for mft "
+					"bitmap.");
+			if (ntfs_cluster_free_from_rl(vol, rl2)) {
+				ntfs_error(vol->sb, "Failed to deallocate "
+						"allocated cluster.%s", es);
+				NVolSetErrors(vol);
+			}
+			ntfs_free(rl2);
+			return PTR_ERR(rl);
+		}
+		mftbmp_ni->runlist.rl = rl;
+		status.added_run = 1;
+		ntfs_debug("Adding one run to mft bitmap.");
+		/* Find the last run in the new runlist. */
+		for (; rl[1].length; rl++)
+			;
+	}
+	/*
+	 * Update the attribute record as well.  Note: @rl is the last
+	 * (non-terminator) runlist element of mft bitmap.
+	 */
+	mrec = map_mft_record(mft_ni);
+	if (IS_ERR(mrec)) {
+		ntfs_error(vol->sb, "Failed to map mft record.");
+		ret = PTR_ERR(mrec);
+		goto undo_alloc;
+	}
+	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+	if (unlikely(!ctx)) {
+		ntfs_error(vol->sb, "Failed to get search context.");
+		ret = -ENOMEM;
+		goto undo_alloc;
+	}
+	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
+			0, ctx);
+	if (unlikely(ret)) {
+		ntfs_error(vol->sb, "Failed to find last attribute extent of "
+				"mft bitmap attribute.");
+		if (ret == -ENOENT)
+			ret = -EIO;
+		goto undo_alloc;
+	}
+	a = ctx->attr;
+	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+	/* Search back for the previous last allocated cluster of mft bitmap. */
+	for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
+		if (ll >= rl2->vcn)
+			break;
+	}
+	BUG_ON(ll < rl2->vcn);
+	BUG_ON(ll >= rl2->vcn + rl2->length);
+	/* Get the size for the new mapping pairs array for this extent. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+	if (unlikely(mp_size <= 0)) {
+		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
+				"mft bitmap attribute extent.");
+		ret = mp_size;
+		if (!ret)
+			ret = -EIO;
+		goto undo_alloc;
+	}
+	/* Expand the attribute record if necessary. */
+	old_alen = le32_to_cpu(a->length);
+	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+	if (unlikely(ret)) {
+		if (ret != -ENOSPC) {
+			ntfs_error(vol->sb, "Failed to resize attribute "
+					"record for mft bitmap attribute.");
+			goto undo_alloc;
+		}
+		// TODO: Deal with this by moving this extent to a new mft
+		// record or by starting a new extent in a new mft record or by
+		// moving other attributes out of this mft record.
+		// Note: It will need to be a special mft record and if none of
+		// those are available it gets rather complicated...
+		ntfs_error(vol->sb, "Not enough space in this mft record to "
+				"accommodate extended mft bitmap attribute "
+				"extent.  Cannot handle this yet.");
+		ret = -EOPNOTSUPP;
+		goto undo_alloc;
+	}
+	status.mp_rebuilt = 1;
+	/* Generate the mapping pairs array directly into the attr record. */
+	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+			mp_size, rl2, ll, -1, NULL);
+	if (unlikely(ret)) {
+		ntfs_error(vol->sb, "Failed to build mapping pairs array for "
+				"mft bitmap attribute.");
+		goto undo_alloc;
+	}
+	/* Update the highest_vcn. */
+	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
+	/*
+	 * We now have extended the mft bitmap allocated_size by one cluster.
+	 * Reflect this in the ntfs_inode structure and the attribute record.
+	 */
+	if (a->data.non_resident.lowest_vcn) {
+		/*
+		 * We are not in the first attribute extent, switch to it, but
+		 * first ensure the changes will make it to disk later.
+		 */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		ntfs_attr_reinit_search_ctx(ctx);
+		ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+				mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
+				0, ctx);
+		if (unlikely(ret)) {
+			ntfs_error(vol->sb, "Failed to find first attribute "
+					"extent of mft bitmap attribute.");
+			goto restore_undo_alloc;
+		}
+		a = ctx->attr;
+	}
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	mftbmp_ni->allocated_size += vol->cluster_size;
+	a->data.non_resident.allocated_size =
+			cpu_to_sle64(mftbmp_ni->allocated_size);
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	/* Ensure the changes make it to disk. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(mft_ni);
+	up_write(&mftbmp_ni->runlist.lock);
+	ntfs_debug("Done.");
+	return 0;
+restore_undo_alloc:
+	ntfs_attr_reinit_search_ctx(ctx);
+	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
+			0, ctx)) {
+		ntfs_error(vol->sb, "Failed to find last attribute extent of "
+				"mft bitmap attribute.%s", es);
+		write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+		mftbmp_ni->allocated_size += vol->cluster_size;
+		write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(mft_ni);
+		up_write(&mftbmp_ni->runlist.lock);
+		/*
+		 * The only thing that is now wrong is ->allocated_size of the
+		 * base attribute extent which chkdsk should be able to fix.
+		 */
+		NVolSetErrors(vol);
+		return ret;
+	}
+	a = ctx->attr;
+	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
+undo_alloc:
+	if (status.added_cluster) {
+		/* Truncate the last run in the runlist by one cluster. */
+		rl->length--;
+		rl[1].vcn--;
+	} else if (status.added_run) {
+		lcn = rl->lcn;
+		/* Remove the last run from the runlist. */
+		rl->lcn = rl[1].lcn;
+		rl->length = 0;
+	}
+	/* Deallocate the cluster. */
+	down_write(&vol->lcnbmp_lock);
+	if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
+		ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
+		NVolSetErrors(vol);
+	}
+	up_write(&vol->lcnbmp_lock);
+	if (status.mp_rebuilt) {
+		if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset),
+				old_alen - le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset),
+				rl2, ll, -1, NULL)) {
+			ntfs_error(vol->sb, "Failed to restore mapping pairs "
+					"array.%s", es);
+			NVolSetErrors(vol);
+		}
+		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+			ntfs_error(vol->sb, "Failed to restore attribute "
+					"record.%s", es);
+			NVolSetErrors(vol);
+		}
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+	}
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (!IS_ERR(mrec))
+		unmap_mft_record(mft_ni);
+	up_write(&mftbmp_ni->runlist.lock);
+	return ret;
+}
+
+/**
+ * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
+ * @vol:	volume on which to extend the mft bitmap attribute
+ *
+ * Extend the initialized portion of the mft bitmap attribute on the ntfs
+ * volume @vol by 8 bytes.
+ *
+ * Note:  Only changes initialized_size and data_size, i.e. requires that
+ * allocated_size is big enough to fit the new initialized_size.
+ *
+ * Return 0 on success and -error on error.
+ *
+ * Locking: Caller must hold vol->mftbmp_lock for writing.
+ */
+static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
+{
+	s64 old_data_size, old_initialized_size;
+	unsigned long flags;
+	struct inode *mftbmp_vi;
+	ntfs_inode *mft_ni, *mftbmp_ni;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *mrec;
+	ATTR_RECORD *a;
+	int ret;
+
+	ntfs_debug("Extending mft bitmap initiailized (and data) size.");
+	mft_ni = NTFS_I(vol->mft_ino);
+	mftbmp_vi = vol->mftbmp_ino;
+	mftbmp_ni = NTFS_I(mftbmp_vi);
+	/* Get the attribute record. */
+	mrec = map_mft_record(mft_ni);
+	if (IS_ERR(mrec)) {
+		ntfs_error(vol->sb, "Failed to map mft record.");
+		return PTR_ERR(mrec);
+	}
+	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+	if (unlikely(!ctx)) {
+		ntfs_error(vol->sb, "Failed to get search context.");
+		ret = -ENOMEM;
+		goto unm_err_out;
+	}
+	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(ret)) {
+		ntfs_error(vol->sb, "Failed to find first attribute extent of "
+				"mft bitmap attribute.");
+		if (ret == -ENOENT)
+			ret = -EIO;
+		goto put_err_out;
+	}
+	a = ctx->attr;
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_size = i_size_read(mftbmp_vi);
+	old_initialized_size = mftbmp_ni->initialized_size;
+	/*
+	 * We can simply update the initialized_size before filling the space
+	 * with zeroes because the caller is holding the mft bitmap lock for
+	 * writing which ensures that no one else is trying to access the data.
+	 */
+	mftbmp_ni->initialized_size += 8;
+	a->data.non_resident.initialized_size =
+			cpu_to_sle64(mftbmp_ni->initialized_size);
+	if (mftbmp_ni->initialized_size > old_data_size) {
+		i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
+		a->data.non_resident.data_size =
+				cpu_to_sle64(mftbmp_ni->initialized_size);
+	}
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	/* Ensure the changes make it to disk. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(mft_ni);
+	/* Initialize the mft bitmap attribute value with zeroes. */
+	ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
+	if (likely(!ret)) {
+		ntfs_debug("Done.  (Wrote eight initialized bytes to mft "
+				"bitmap.");
+		return 0;
+	}
+	ntfs_error(vol->sb, "Failed to write to mft bitmap.");
+	/* Try to recover from the error. */
+	mrec = map_mft_record(mft_ni);
+	if (IS_ERR(mrec)) {
+		ntfs_error(vol->sb, "Failed to map mft record.%s", es);
+		NVolSetErrors(vol);
+		return ret;
+	}
+	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+	if (unlikely(!ctx)) {
+		ntfs_error(vol->sb, "Failed to get search context.%s", es);
+		NVolSetErrors(vol);
+		goto unm_err_out;
+	}
+	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
+		ntfs_error(vol->sb, "Failed to find first attribute extent of "
+				"mft bitmap attribute.%s", es);
+		NVolSetErrors(vol);
+put_err_out:
+		ntfs_attr_put_search_ctx(ctx);
+unm_err_out:
+		unmap_mft_record(mft_ni);
+		goto err_out;
+	}
+	a = ctx->attr;
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	mftbmp_ni->initialized_size = old_initialized_size;
+	a->data.non_resident.initialized_size =
+			cpu_to_sle64(old_initialized_size);
+	if (i_size_read(mftbmp_vi) != old_data_size) {
+		i_size_write(mftbmp_vi, old_data_size);
+		a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
+	}
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(mft_ni);
+#ifdef DEBUG
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
+			"data_size 0x%llx, initialized_size 0x%llx.",
+			(long long)mftbmp_ni->allocated_size,
+			(long long)i_size_read(mftbmp_vi),
+			(long long)mftbmp_ni->initialized_size);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
+err_out:
+	return ret;
+}
+
+/**
+ * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
+ * @vol:	volume on which to extend the mft data attribute
+ *
+ * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
+ * worth of clusters or if not enough space for this by one mft record worth
+ * of clusters.
+ *
+ * Note:  Only changes allocated_size, i.e. does not touch initialized_size or
+ * data_size.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - Caller must hold vol->mftbmp_lock for writing.
+ *	    - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
+ *	      writing and releases it before returning.
+ *	    - This function calls functions which take vol->lcnbmp_lock for
+ *	      writing and release it before returning.
+ */
+static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
+{
+	LCN lcn;
+	VCN old_last_vcn;
+	s64 min_nr, nr, ll;
+	unsigned long flags;
+	ntfs_inode *mft_ni;
+	runlist_element *rl, *rl2;
+	ntfs_attr_search_ctx *ctx = NULL;
+	MFT_RECORD *mrec;
+	ATTR_RECORD *a = NULL;
+	int ret, mp_size;
+	u32 old_alen = 0;
+	bool mp_rebuilt = false;
+
+	ntfs_debug("Extending mft data allocation.");
+	mft_ni = NTFS_I(vol->mft_ino);
+	/*
+	 * Determine the preferred allocation location, i.e. the last lcn of
+	 * the mft data attribute.  The allocated size of the mft data
+	 * attribute cannot be zero so we are ok to do this.
+	 */
+	down_write(&mft_ni->runlist.lock);
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ll = mft_ni->allocated_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	rl = ntfs_attr_find_vcn_nolock(mft_ni,
+			(ll - 1) >> vol->cluster_size_bits, NULL);
+	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+		up_write(&mft_ni->runlist.lock);
+		ntfs_error(vol->sb, "Failed to determine last allocated "
+				"cluster of mft data attribute.");
+		if (!IS_ERR(rl))
+			ret = -EIO;
+		else
+			ret = PTR_ERR(rl);
+		return ret;
+	}
+	lcn = rl->lcn + rl->length;
+	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
+	/* Minimum allocation is one mft record worth of clusters. */
+	min_nr = vol->mft_record_size >> vol->cluster_size_bits;
+	if (!min_nr)
+		min_nr = 1;
+	/* Want to allocate 16 mft records worth of clusters. */
+	nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
+	if (!nr)
+		nr = min_nr;
+	/* Ensure we do not go above 2^32-1 mft records. */
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ll = mft_ni->allocated_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
+			vol->mft_record_size_bits >= (1ll << 32))) {
+		nr = min_nr;
+		if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
+				vol->mft_record_size_bits >= (1ll << 32))) {
+			ntfs_warning(vol->sb, "Cannot allocate mft record "
+					"because the maximum number of inodes "
+					"(2^32) has already been reached.");
+			up_write(&mft_ni->runlist.lock);
+			return -ENOSPC;
+		}
+	}
+	ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
+			nr > min_nr ? "default" : "minimal", (long long)nr);
+	old_last_vcn = rl[1].vcn;
+	do {
+		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
+				true);
+		if (likely(!IS_ERR(rl2)))
+			break;
+		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
+			ntfs_error(vol->sb, "Failed to allocate the minimal "
+					"number of clusters (%lli) for the "
+					"mft data attribute.", (long long)nr);
+			up_write(&mft_ni->runlist.lock);
+			return PTR_ERR(rl2);
+		}
+		/*
+		 * There is not enough space to do the allocation, but there
+		 * might be enough space to do a minimal allocation so try that
+		 * before failing.
+		 */
+		nr = min_nr;
+		ntfs_debug("Retrying mft data allocation with minimal cluster "
+				"count %lli.", (long long)nr);
+	} while (1);
+	rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
+	if (IS_ERR(rl)) {
+		up_write(&mft_ni->runlist.lock);
+		ntfs_error(vol->sb, "Failed to merge runlists for mft data "
+				"attribute.");
+		if (ntfs_cluster_free_from_rl(vol, rl2)) {
+			ntfs_error(vol->sb, "Failed to deallocate clusters "
+					"from the mft data attribute.%s", es);
+			NVolSetErrors(vol);
+		}
+		ntfs_free(rl2);
+		return PTR_ERR(rl);
+	}
+	mft_ni->runlist.rl = rl;
+	ntfs_debug("Allocated %lli clusters.", (long long)nr);
+	/* Find the last run in the new runlist. */
+	for (; rl[1].length; rl++)
+		;
+	/* Update the attribute record as well. */
+	mrec = map_mft_record(mft_ni);
+	if (IS_ERR(mrec)) {
+		ntfs_error(vol->sb, "Failed to map mft record.");
+		ret = PTR_ERR(mrec);
+		goto undo_alloc;
+	}
+	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+	if (unlikely(!ctx)) {
+		ntfs_error(vol->sb, "Failed to get search context.");
+		ret = -ENOMEM;
+		goto undo_alloc;
+	}
+	ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
+	if (unlikely(ret)) {
+		ntfs_error(vol->sb, "Failed to find last attribute extent of "
+				"mft data attribute.");
+		if (ret == -ENOENT)
+			ret = -EIO;
+		goto undo_alloc;
+	}
+	a = ctx->attr;
+	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+	/* Search back for the previous last allocated cluster of mft bitmap. */
+	for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
+		if (ll >= rl2->vcn)
+			break;
+	}
+	BUG_ON(ll < rl2->vcn);
+	BUG_ON(ll >= rl2->vcn + rl2->length);
+	/* Get the size for the new mapping pairs array for this extent. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+	if (unlikely(mp_size <= 0)) {
+		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
+				"mft data attribute extent.");
+		ret = mp_size;
+		if (!ret)
+			ret = -EIO;
+		goto undo_alloc;
+	}
+	/* Expand the attribute record if necessary. */
+	old_alen = le32_to_cpu(a->length);
+	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+	if (unlikely(ret)) {
+		if (ret != -ENOSPC) {
+			ntfs_error(vol->sb, "Failed to resize attribute "
+					"record for mft data attribute.");
+			goto undo_alloc;
+		}
+		// TODO: Deal with this by moving this extent to a new mft
+		// record or by starting a new extent in a new mft record or by
+		// moving other attributes out of this mft record.
+		// Note: Use the special reserved mft records and ensure that
+		// this extent is not required to find the mft record in
+		// question.  If no free special records left we would need to
+		// move an existing record away, insert ours in its place, and
+		// then place the moved record into the newly allocated space
+		// and we would then need to update all references to this mft
+		// record appropriately.  This is rather complicated...
+		ntfs_error(vol->sb, "Not enough space in this mft record to "
+				"accommodate extended mft data attribute "
+				"extent.  Cannot handle this yet.");
+		ret = -EOPNOTSUPP;
+		goto undo_alloc;
+	}
+	mp_rebuilt = true;
+	/* Generate the mapping pairs array directly into the attr record. */
+	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+			mp_size, rl2, ll, -1, NULL);
+	if (unlikely(ret)) {
+		ntfs_error(vol->sb, "Failed to build mapping pairs array of "
+				"mft data attribute.");
+		goto undo_alloc;
+	}
+	/* Update the highest_vcn. */
+	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
+	/*
+	 * We now have extended the mft data allocated_size by nr clusters.
+	 * Reflect this in the ntfs_inode structure and the attribute record.
+	 * @rl is the last (non-terminator) runlist element of mft data
+	 * attribute.
+	 */
+	if (a->data.non_resident.lowest_vcn) {
+		/*
+		 * We are not in the first attribute extent, switch to it, but
+		 * first ensure the changes will make it to disk later.
+		 */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		ntfs_attr_reinit_search_ctx(ctx);
+		ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
+				mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
+				ctx);
+		if (unlikely(ret)) {
+			ntfs_error(vol->sb, "Failed to find first attribute "
+					"extent of mft data attribute.");
+			goto restore_undo_alloc;
+		}
+		a = ctx->attr;
+	}
+	write_lock_irqsave(&mft_ni->size_lock, flags);
+	mft_ni->allocated_size += nr << vol->cluster_size_bits;
+	a->data.non_resident.allocated_size =
+			cpu_to_sle64(mft_ni->allocated_size);
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
+	/* Ensure the changes make it to disk. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(mft_ni);
+	up_write(&mft_ni->runlist.lock);
+	ntfs_debug("Done.");
+	return 0;
+restore_undo_alloc:
+	ntfs_attr_reinit_search_ctx(ctx);
+	if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
+		ntfs_error(vol->sb, "Failed to find last attribute extent of "
+				"mft data attribute.%s", es);
+		write_lock_irqsave(&mft_ni->size_lock, flags);
+		mft_ni->allocated_size += nr << vol->cluster_size_bits;
+		write_unlock_irqrestore(&mft_ni->size_lock, flags);
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(mft_ni);
+		up_write(&mft_ni->runlist.lock);
+		/*
+		 * The only thing that is now wrong is ->allocated_size of the
+		 * base attribute extent which chkdsk should be able to fix.
+		 */
+		NVolSetErrors(vol);
+		return ret;
+	}
+	ctx->attr->data.non_resident.highest_vcn =
+			cpu_to_sle64(old_last_vcn - 1);
+undo_alloc:
+	if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
+		ntfs_error(vol->sb, "Failed to free clusters from mft data "
+				"attribute.%s", es);
+		NVolSetErrors(vol);
+	}
+	a = ctx->attr;
+	if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
+		ntfs_error(vol->sb, "Failed to truncate mft data attribute "
+				"runlist.%s", es);
+		NVolSetErrors(vol);
+	}
+	if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
+		if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset),
+				old_alen - le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset),
+				rl2, ll, -1, NULL)) {
+			ntfs_error(vol->sb, "Failed to restore mapping pairs "
+					"array.%s", es);
+			NVolSetErrors(vol);
+		}
+		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+			ntfs_error(vol->sb, "Failed to restore attribute "
+					"record.%s", es);
+			NVolSetErrors(vol);
+		}
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+	} else if (IS_ERR(ctx->mrec)) {
+		ntfs_error(vol->sb, "Failed to restore attribute search "
+				"context.%s", es);
+		NVolSetErrors(vol);
+	}
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (!IS_ERR(mrec))
+		unmap_mft_record(mft_ni);
+	up_write(&mft_ni->runlist.lock);
+	return ret;
+}
+
+/**
+ * ntfs_mft_record_layout - layout an mft record into a memory buffer
+ * @vol:	volume to which the mft record will belong
+ * @mft_no:	mft reference specifying the mft record number
+ * @m:		destination buffer of size >= @vol->mft_record_size bytes
+ *
+ * Layout an empty, unused mft record with the mft record number @mft_no into
+ * the buffer @m.  The volume @vol is needed because the mft record structure
+ * was modified in NTFS 3.1 so we need to know which volume version this mft
+ * record will be used on.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
+		MFT_RECORD *m)
+{
+	ATTR_RECORD *a;
+
+	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
+	if (mft_no >= (1ll << 32)) {
+		ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
+				"maximum of 2^32.", (long long)mft_no);
+		return -ERANGE;
+	}
+	/* Start by clearing the whole mft record to gives us a clean slate. */
+	memset(m, 0, vol->mft_record_size);
+	/* Aligned to 2-byte boundary. */
+	if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
+		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
+	else {
+		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
+		/*
+		 * Set the NTFS 3.1+ specific fields while we know that the
+		 * volume version is 3.1+.
+		 */
+		m->reserved = 0;
+		m->mft_record_number = cpu_to_le32((u32)mft_no);
+	}
+	m->magic = magic_FILE;
+	if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
+		m->usa_count = cpu_to_le16(vol->mft_record_size /
+				NTFS_BLOCK_SIZE + 1);
+	else {
+		m->usa_count = cpu_to_le16(1);
+		ntfs_warning(vol->sb, "Sector size is bigger than mft record "
+				"size.  Setting usa_count to 1.  If chkdsk "
+				"reports this as corruption, please email "
+				"linux-ntfs-dev@lists.sourceforge.net stating "
+				"that you saw this message and that the "
+				"modified filesystem created was corrupt.  "
+				"Thank you.");
+	}
+	/* Set the update sequence number to 1. */
+	*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
+	m->lsn = 0;
+	m->sequence_number = cpu_to_le16(1);
+	m->link_count = 0;
+	/*
+	 * Place the attributes straight after the update sequence array,
+	 * aligned to 8-byte boundary.
+	 */
+	m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
+			(le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
+	m->flags = 0;
+	/*
+	 * Using attrs_offset plus eight bytes (for the termination attribute).
+	 * attrs_offset is already aligned to 8-byte boundary, so no need to
+	 * align again.
+	 */
+	m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
+	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
+	m->base_mft_record = 0;
+	m->next_attr_instance = 0;
+	/* Add the termination attribute. */
+	a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
+	a->type = AT_END;
+	a->length = 0;
+	ntfs_debug("Done.");
+	return 0;
+}
+
+/**
+ * ntfs_mft_record_format - format an mft record on an ntfs volume
+ * @vol:	volume on which to format the mft record
+ * @mft_no:	mft record number to format
+ *
+ * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
+ * mft record into the appropriate place of the mft data attribute.  This is
+ * used when extending the mft data attribute.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
+{
+	loff_t i_size;
+	struct inode *mft_vi = vol->mft_ino;
+	struct page *page;
+	MFT_RECORD *m;
+	pgoff_t index, end_index;
+	unsigned int ofs;
+	int err;
+
+	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
+	/*
+	 * The index into the page cache and the offset within the page cache
+	 * page of the wanted mft record.
+	 */
+	index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
+	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+	/* The maximum valid index into the page cache for $MFT's data. */
+	i_size = i_size_read(mft_vi);
+	end_index = i_size >> PAGE_CACHE_SHIFT;
+	if (unlikely(index >= end_index)) {
+		if (unlikely(index > end_index || ofs + vol->mft_record_size >=
+				(i_size & ~PAGE_CACHE_MASK))) {
+			ntfs_error(vol->sb, "Tried to format non-existing mft "
+					"record 0x%llx.", (long long)mft_no);
+			return -ENOENT;
+		}
+	}
+	/* Read, map, and pin the page containing the mft record. */
+	page = ntfs_map_page(mft_vi->i_mapping, index);
+	if (IS_ERR(page)) {
+		ntfs_error(vol->sb, "Failed to map page containing mft record "
+				"to format 0x%llx.", (long long)mft_no);
+		return PTR_ERR(page);
+	}
+	lock_page(page);
+	BUG_ON(!PageUptodate(page));
+	ClearPageUptodate(page);
+	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+	err = ntfs_mft_record_layout(vol, mft_no, m);
+	if (unlikely(err)) {
+		ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
+				(long long)mft_no);
+		SetPageUptodate(page);
+		unlock_page(page);
+		ntfs_unmap_page(page);
+		return err;
+	}
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	unlock_page(page);
+	/*
+	 * Make sure the mft record is written out to disk.  We could use
+	 * ilookup5() to check if an inode is in icache and so on but this is
+	 * unnecessary as ntfs_writepage() will write the dirty record anyway.
+	 */
+	mark_ntfs_record_dirty(page, ofs);
+	ntfs_unmap_page(page);
+	ntfs_debug("Done.");
+	return 0;
+}
+
+/**
+ * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
+ * @vol:	[IN]  volume on which to allocate the mft record
+ * @mode:	[IN]  mode if want a file or directory, i.e. base inode or 0
+ * @base_ni:	[IN]  open base inode if allocating an extent mft record or NULL
+ * @mrec:	[OUT] on successful return this is the mapped mft record
+ *
+ * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
+ *
+ * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
+ * direvctory inode, and allocate it at the default allocator position.  In
+ * this case @mode is the file mode as given to us by the caller.  We in
+ * particular use @mode to distinguish whether a file or a directory is being
+ * created (S_IFDIR(mode) and S_IFREG(mode), respectively).
+ *
+ * If @base_ni is not NULL make the allocated mft record an extent record,
+ * allocate it starting at the mft record after the base mft record and attach
+ * the allocated and opened ntfs inode to the base inode @base_ni.  In this
+ * case @mode must be 0 as it is meaningless for extent inodes.
+ *
+ * You need to check the return value with IS_ERR().  If false, the function
+ * was successful and the return value is the now opened ntfs inode of the
+ * allocated mft record.  *@mrec is then set to the allocated, mapped, pinned,
+ * and locked mft record.  If IS_ERR() is true, the function failed and the
+ * error code is obtained from PTR_ERR(return value).  *@mrec is undefined in
+ * this case.
+ *
+ * Allocation strategy:
+ *
+ * To find a free mft record, we scan the mft bitmap for a zero bit.  To
+ * optimize this we start scanning at the place specified by @base_ni or if
+ * @base_ni is NULL we start where we last stopped and we perform wrap around
+ * when we reach the end.  Note, we do not try to allocate mft records below
+ * number 24 because numbers 0 to 15 are the defined system files anyway and 16
+ * to 24 are special in that they are used for storing extension mft records
+ * for the $DATA attribute of $MFT.  This is required to avoid the possibility
+ * of creating a runlist with a circular dependency which once written to disk
+ * can never be read in again.  Windows will only use records 16 to 24 for
+ * normal files if the volume is completely out of space.  We never use them
+ * which means that when the volume is really out of space we cannot create any
+ * more files while Windows can still create up to 8 small files.  We can start
+ * doing this at some later time, it does not matter much for now.
+ *
+ * When scanning the mft bitmap, we only search up to the last allocated mft
+ * record.  If there are no free records left in the range 24 to number of
+ * allocated mft records, then we extend the $MFT/$DATA attribute in order to
+ * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
+ * records at a time or one cluster, if cluster size is above 16kiB.  If there
+ * is not sufficient space to do this, we try to extend by a single mft record
+ * or one cluster, if cluster size is above the mft record size.
+ *
+ * No matter how many mft records we allocate, we initialize only the first
+ * allocated mft record, incrementing mft data size and initialized size
+ * accordingly, open an ntfs_inode for it and return it to the caller, unless
+ * there are less than 24 mft records, in which case we allocate and initialize
+ * mft records until we reach record 24 which we consider as the first free mft
+ * record for use by normal files.
+ *
+ * If during any stage we overflow the initialized data in the mft bitmap, we
+ * extend the initialized size (and data size) by 8 bytes, allocating another
+ * cluster if required.  The bitmap data size has to be at least equal to the
+ * number of mft records in the mft, but it can be bigger, in which case the
+ * superflous bits are padded with zeroes.
+ *
+ * Thus, when we return successfully (IS_ERR() is false), we will have:
+ *	- initialized / extended the mft bitmap if necessary,
+ *	- initialized / extended the mft data if necessary,
+ *	- set the bit corresponding to the mft record being allocated in the
+ *	  mft bitmap,
+ *	- opened an ntfs_inode for the allocated mft record, and we will have
+ *	- returned the ntfs_inode as well as the allocated mapped, pinned, and
+ *	  locked mft record.
+ *
+ * On error, the volume will be left in a consistent state and no record will
+ * be allocated.  If rolling back a partial operation fails, we may leave some
+ * inconsistent metadata in which case we set NVolErrors() so the volume is
+ * left dirty when unmounted.
+ *
+ * Note, this function cannot make use of most of the normal functions, like
+ * for example for attribute resizing, etc, because when the run list overflows
+ * the base mft record and an attribute list is used, it is very important that
+ * the extension mft records used to store the $DATA attribute of $MFT can be
+ * reached without having to read the information contained inside them, as
+ * this would make it impossible to find them in the first place after the
+ * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
+ * rule because the bitmap is not essential for finding the mft records, but on
+ * the other hand, handling the bitmap in this special way would make life
+ * easier because otherwise there might be circular invocations of functions
+ * when reading the bitmap.
+ */
+ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
+		ntfs_inode *base_ni, MFT_RECORD **mrec)
+{
+	s64 ll, bit, old_data_initialized, old_data_size;
+	unsigned long flags;
+	struct inode *vi;
+	struct page *page;
+	ntfs_inode *mft_ni, *mftbmp_ni, *ni;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	pgoff_t index;
+	unsigned int ofs;
+	int err;
+	le16 seq_no, usn;
+	bool record_formatted = false;
+
+	if (base_ni) {
+		ntfs_debug("Entering (allocating an extent mft record for "
+				"base mft record 0x%llx).",
+				(long long)base_ni->mft_no);
+		/* @mode and @base_ni are mutually exclusive. */
+		BUG_ON(mode);
+	} else
+		ntfs_debug("Entering (allocating a base mft record).");
+	if (mode) {
+		/* @mode and @base_ni are mutually exclusive. */
+		BUG_ON(base_ni);
+		/* We only support creation of normal files and directories. */
+		if (!S_ISREG(mode) && !S_ISDIR(mode))
+			return ERR_PTR(-EOPNOTSUPP);
+	}
+	BUG_ON(!mrec);
+	mft_ni = NTFS_I(vol->mft_ino);
+	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
+	down_write(&vol->mftbmp_lock);
+	bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
+	if (bit >= 0) {
+		ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
+				(long long)bit);
+		goto have_alloc_rec;
+	}
+	if (bit != -ENOSPC) {
+		up_write(&vol->mftbmp_lock);
+		return ERR_PTR(bit);
+	}
+	/*
+	 * No free mft records left.  If the mft bitmap already covers more
+	 * than the currently used mft records, the next records are all free,
+	 * so we can simply allocate the first unused mft record.
+	 * Note: We also have to make sure that the mft bitmap at least covers
+	 * the first 24 mft records as they are special and whilst they may not
+	 * be in use, we do not allocate from them.
+	 */
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_initialized = mftbmp_ni->initialized_size;
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
+		bit = ll;
+		if (bit < 24)
+			bit = 24;
+		if (unlikely(bit >= (1ll << 32)))
+			goto max_err_out;
+		ntfs_debug("Found free record (#2), bit 0x%llx.",
+				(long long)bit);
+		goto found_free_rec;
+	}
+	/*
+	 * The mft bitmap needs to be expanded until it covers the first unused
+	 * mft record that we can allocate.
+	 * Note: The smallest mft record we allocate is mft record 24.
+	 */
+	bit = old_data_initialized << 3;
+	if (unlikely(bit >= (1ll << 32)))
+		goto max_err_out;
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_size = mftbmp_ni->allocated_size;
+	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
+			"data_size 0x%llx, initialized_size 0x%llx.",
+			(long long)old_data_size,
+			(long long)i_size_read(vol->mftbmp_ino),
+			(long long)old_data_initialized);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	if (old_data_initialized + 8 > old_data_size) {
+		/* Need to extend bitmap by one more cluster. */
+		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
+		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
+		if (unlikely(err)) {
+			up_write(&vol->mftbmp_lock);
+			goto err_out;
+		}
+#ifdef DEBUG
+		read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+		ntfs_debug("Status of mftbmp after allocation extension: "
+				"allocated_size 0x%llx, data_size 0x%llx, "
+				"initialized_size 0x%llx.",
+				(long long)mftbmp_ni->allocated_size,
+				(long long)i_size_read(vol->mftbmp_ino),
+				(long long)mftbmp_ni->initialized_size);
+		read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
+	}
+	/*
+	 * We now have sufficient allocated space, extend the initialized_size
+	 * as well as the data_size if necessary and fill the new space with
+	 * zeroes.
+	 */
+	err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
+	if (unlikely(err)) {
+		up_write(&vol->mftbmp_lock);
+		goto err_out;
+	}
+#ifdef DEBUG
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	ntfs_debug("Status of mftbmp after initialized extension: "
+			"allocated_size 0x%llx, data_size 0x%llx, "
+			"initialized_size 0x%llx.",
+			(long long)mftbmp_ni->allocated_size,
+			(long long)i_size_read(vol->mftbmp_ino),
+			(long long)mftbmp_ni->initialized_size);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
+	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
+found_free_rec:
+	/* @bit is the found free mft record, allocate it in the mft bitmap. */
+	ntfs_debug("At found_free_rec.");
+	err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
+	if (unlikely(err)) {
+		ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
+		up_write(&vol->mftbmp_lock);
+		goto err_out;
+	}
+	ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
+have_alloc_rec:
+	/*
+	 * The mft bitmap is now uptodate.  Deal with mft data attribute now.
+	 * Note, we keep hold of the mft bitmap lock for writing until all
+	 * modifications to the mft data attribute are complete, too, as they
+	 * will impact decisions for mft bitmap and mft record allocation done
+	 * by a parallel allocation and if the lock is not maintained a
+	 * parallel allocation could allocate the same mft record as this one.
+	 */
+	ll = (bit + 1) << vol->mft_record_size_bits;
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	old_data_initialized = mft_ni->initialized_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	if (ll <= old_data_initialized) {
+		ntfs_debug("Allocated mft record already initialized.");
+		goto mft_rec_already_initialized;
+	}
+	ntfs_debug("Initializing allocated mft record.");
+	/*
+	 * The mft record is outside the initialized data.  Extend the mft data
+	 * attribute until it covers the allocated record.  The loop is only
+	 * actually traversed more than once when a freshly formatted volume is
+	 * first written to so it optimizes away nicely in the common case.
+	 */
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ntfs_debug("Status of mft data before extension: "
+			"allocated_size 0x%llx, data_size 0x%llx, "
+			"initialized_size 0x%llx.",
+			(long long)mft_ni->allocated_size,
+			(long long)i_size_read(vol->mft_ino),
+			(long long)mft_ni->initialized_size);
+	while (ll > mft_ni->allocated_size) {
+		read_unlock_irqrestore(&mft_ni->size_lock, flags);
+		err = ntfs_mft_data_extend_allocation_nolock(vol);
+		if (unlikely(err)) {
+			ntfs_error(vol->sb, "Failed to extend mft data "
+					"allocation.");
+			goto undo_mftbmp_alloc_nolock;
+		}
+		read_lock_irqsave(&mft_ni->size_lock, flags);
+		ntfs_debug("Status of mft data after allocation extension: "
+				"allocated_size 0x%llx, data_size 0x%llx, "
+				"initialized_size 0x%llx.",
+				(long long)mft_ni->allocated_size,
+				(long long)i_size_read(vol->mft_ino),
+				(long long)mft_ni->initialized_size);
+	}
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	/*
+	 * Extend mft data initialized size (and data size of course) to reach
+	 * the allocated mft record, formatting the mft records allong the way.
+	 * Note: We only modify the ntfs_inode structure as that is all that is
+	 * needed by ntfs_mft_record_format().  We will update the attribute
+	 * record itself in one fell swoop later on.
+	 */
+	write_lock_irqsave(&mft_ni->size_lock, flags);
+	old_data_initialized = mft_ni->initialized_size;
+	old_data_size = vol->mft_ino->i_size;
+	while (ll > mft_ni->initialized_size) {
+		s64 new_initialized_size, mft_no;
+		
+		new_initialized_size = mft_ni->initialized_size +
+				vol->mft_record_size;
+		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
+		if (new_initialized_size > i_size_read(vol->mft_ino))
+			i_size_write(vol->mft_ino, new_initialized_size);
+		write_unlock_irqrestore(&mft_ni->size_lock, flags);
+		ntfs_debug("Initializing mft record 0x%llx.",
+				(long long)mft_no);
+		err = ntfs_mft_record_format(vol, mft_no);
+		if (unlikely(err)) {
+			ntfs_error(vol->sb, "Failed to format mft record.");
+			goto undo_data_init;
+		}
+		write_lock_irqsave(&mft_ni->size_lock, flags);
+		mft_ni->initialized_size = new_initialized_size;
+	}
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
+	record_formatted = true;
+	/* Update the mft data attribute record to reflect the new sizes. */
+	m = map_mft_record(mft_ni);
+	if (IS_ERR(m)) {
+		ntfs_error(vol->sb, "Failed to map mft record.");
+		err = PTR_ERR(m);
+		goto undo_data_init;
+	}
+	ctx = ntfs_attr_get_search_ctx(mft_ni, m);
+	if (unlikely(!ctx)) {
+		ntfs_error(vol->sb, "Failed to get search context.");
+		err = -ENOMEM;
+		unmap_mft_record(mft_ni);
+		goto undo_data_init;
+	}
+	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		ntfs_error(vol->sb, "Failed to find first attribute extent of "
+				"mft data attribute.");
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(mft_ni);
+		goto undo_data_init;
+	}
+	a = ctx->attr;
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	a->data.non_resident.initialized_size =
+			cpu_to_sle64(mft_ni->initialized_size);
+	a->data.non_resident.data_size =
+			cpu_to_sle64(i_size_read(vol->mft_ino));
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	/* Ensure the changes make it to disk. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(mft_ni);
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ntfs_debug("Status of mft data after mft record initialization: "
+			"allocated_size 0x%llx, data_size 0x%llx, "
+			"initialized_size 0x%llx.",
+			(long long)mft_ni->allocated_size,
+			(long long)i_size_read(vol->mft_ino),
+			(long long)mft_ni->initialized_size);
+	BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
+	BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+mft_rec_already_initialized:
+	/*
+	 * We can finally drop the mft bitmap lock as the mft data attribute
+	 * has been fully updated.  The only disparity left is that the
+	 * allocated mft record still needs to be marked as in use to match the
+	 * set bit in the mft bitmap but this is actually not a problem since
+	 * this mft record is not referenced from anywhere yet and the fact
+	 * that it is allocated in the mft bitmap means that no-one will try to
+	 * allocate it either.
+	 */
+	up_write(&vol->mftbmp_lock);
+	/*
+	 * We now have allocated and initialized the mft record.  Calculate the
+	 * index of and the offset within the page cache page the record is in.
+	 */
+	index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
+	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+	/* Read, map, and pin the page containing the mft record. */
+	page = ntfs_map_page(vol->mft_ino->i_mapping, index);
+	if (IS_ERR(page)) {
+		ntfs_error(vol->sb, "Failed to map page containing allocated "
+				"mft record 0x%llx.", (long long)bit);
+		err = PTR_ERR(page);
+		goto undo_mftbmp_alloc;
+	}
+	lock_page(page);
+	BUG_ON(!PageUptodate(page));
+	ClearPageUptodate(page);
+	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+	/* If we just formatted the mft record no need to do it again. */
+	if (!record_formatted) {
+		/* Sanity check that the mft record is really not in use. */
+		if (ntfs_is_file_record(m->magic) &&
+				(m->flags & MFT_RECORD_IN_USE)) {
+			ntfs_error(vol->sb, "Mft record 0x%llx was marked "
+					"free in mft bitmap but is marked "
+					"used itself.  Corrupt filesystem.  "
+					"Unmount and run chkdsk.",
+					(long long)bit);
+			err = -EIO;
+			SetPageUptodate(page);
+			unlock_page(page);
+			ntfs_unmap_page(page);
+			NVolSetErrors(vol);
+			goto undo_mftbmp_alloc;
+		}
+		/*
+		 * We need to (re-)format the mft record, preserving the
+		 * sequence number if it is not zero as well as the update
+		 * sequence number if it is not zero or -1 (0xffff).  This
+		 * means we do not need to care whether or not something went
+		 * wrong with the previous mft record.
+		 */
+		seq_no = m->sequence_number;
+		usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
+		err = ntfs_mft_record_layout(vol, bit, m);
+		if (unlikely(err)) {
+			ntfs_error(vol->sb, "Failed to layout allocated mft "
+					"record 0x%llx.", (long long)bit);
+			SetPageUptodate(page);
+			unlock_page(page);
+			ntfs_unmap_page(page);
+			goto undo_mftbmp_alloc;
+		}
+		if (seq_no)
+			m->sequence_number = seq_no;
+		if (usn && le16_to_cpu(usn) != 0xffff)
+			*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
+	}
+	/* Set the mft record itself in use. */
+	m->flags |= MFT_RECORD_IN_USE;
+	if (S_ISDIR(mode))
+		m->flags |= MFT_RECORD_IS_DIRECTORY;
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	if (base_ni) {
+		MFT_RECORD *m_tmp;
+
+		/*
+		 * Setup the base mft record in the extent mft record.  This
+		 * completes initialization of the allocated extent mft record
+		 * and we can simply use it with map_extent_mft_record().
+		 */
+		m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
+				base_ni->seq_no);
+		/*
+		 * Allocate an extent inode structure for the new mft record,
+		 * attach it to the base inode @base_ni and map, pin, and lock
+		 * its, i.e. the allocated, mft record.
+		 */
+		m_tmp = map_extent_mft_record(base_ni, bit, &ni);
+		if (IS_ERR(m_tmp)) {
+			ntfs_error(vol->sb, "Failed to map allocated extent "
+					"mft record 0x%llx.", (long long)bit);
+			err = PTR_ERR(m_tmp);
+			/* Set the mft record itself not in use. */
+			m->flags &= cpu_to_le16(
+					~le16_to_cpu(MFT_RECORD_IN_USE));
+			flush_dcache_page(page);
+			/* Make sure the mft record is written out to disk. */
+			mark_ntfs_record_dirty(page, ofs);
+			unlock_page(page);
+			ntfs_unmap_page(page);
+			goto undo_mftbmp_alloc;
+		}
+		BUG_ON(m != m_tmp);
+		/*
+		 * Make sure the allocated mft record is written out to disk.
+		 * No need to set the inode dirty because the caller is going
+		 * to do that anyway after finishing with the new extent mft
+		 * record (e.g. at a minimum a new attribute will be added to
+		 * the mft record.
+		 */
+		mark_ntfs_record_dirty(page, ofs);
+		unlock_page(page);
+		/*
+		 * Need to unmap the page since map_extent_mft_record() mapped
+		 * it as well so we have it mapped twice at the moment.
+		 */
+		ntfs_unmap_page(page);
+	} else {
+		/*
+		 * Allocate a new VFS inode and set it up.  NOTE: @vi->i_nlink
+		 * is set to 1 but the mft record->link_count is 0.  The caller
+		 * needs to bear this in mind.
+		 */
+		vi = new_inode(vol->sb);
+		if (unlikely(!vi)) {
+			err = -ENOMEM;
+			/* Set the mft record itself not in use. */
+			m->flags &= cpu_to_le16(
+					~le16_to_cpu(MFT_RECORD_IN_USE));
+			flush_dcache_page(page);
+			/* Make sure the mft record is written out to disk. */
+			mark_ntfs_record_dirty(page, ofs);
+			unlock_page(page);
+			ntfs_unmap_page(page);
+			goto undo_mftbmp_alloc;
+		}
+		vi->i_ino = bit;
+		/*
+		 * This is for checking whether an inode has changed w.r.t. a
+		 * file so that the file can be updated if necessary (compare
+		 * with f_version).
+		 */
+		vi->i_version = 1;
+
+		/* The owner and group come from the ntfs volume. */
+		vi->i_uid = vol->uid;
+		vi->i_gid = vol->gid;
+
+		/* Initialize the ntfs specific part of @vi. */
+		ntfs_init_big_inode(vi);
+		ni = NTFS_I(vi);
+		/*
+		 * Set the appropriate mode, attribute type, and name.  For
+		 * directories, also setup the index values to the defaults.
+		 */
+		if (S_ISDIR(mode)) {
+			vi->i_mode = S_IFDIR | S_IRWXUGO;
+			vi->i_mode &= ~vol->dmask;
+
+			NInoSetMstProtected(ni);
+			ni->type = AT_INDEX_ALLOCATION;
+			ni->name = I30;
+			ni->name_len = 4;
+
+			ni->itype.index.block_size = 4096;
+			ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
+			ni->itype.index.collation_rule = COLLATION_FILE_NAME;
+			if (vol->cluster_size <= ni->itype.index.block_size) {
+				ni->itype.index.vcn_size = vol->cluster_size;
+				ni->itype.index.vcn_size_bits =
+						vol->cluster_size_bits;
+			} else {
+				ni->itype.index.vcn_size = vol->sector_size;
+				ni->itype.index.vcn_size_bits =
+						vol->sector_size_bits;
+			}
+		} else {
+			vi->i_mode = S_IFREG | S_IRWXUGO;
+			vi->i_mode &= ~vol->fmask;
+
+			ni->type = AT_DATA;
+			ni->name = NULL;
+			ni->name_len = 0;
+		}
+		if (IS_RDONLY(vi))
+			vi->i_mode &= ~S_IWUGO;
+
+		/* Set the inode times to the current time. */
+		vi->i_atime = vi->i_mtime = vi->i_ctime =
+			current_fs_time(vi->i_sb);
+		/*
+		 * Set the file size to 0, the ntfs inode sizes are set to 0 by
+		 * the call to ntfs_init_big_inode() below.
+		 */
+		vi->i_size = 0;
+		vi->i_blocks = 0;
+
+		/* Set the sequence number. */
+		vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+		/*
+		 * Manually map, pin, and lock the mft record as we already
+		 * have its page mapped and it is very easy to do.
+		 */
+		atomic_inc(&ni->count);
+		mutex_lock(&ni->mrec_lock);
+		ni->page = page;
+		ni->page_ofs = ofs;
+		/*
+		 * Make sure the allocated mft record is written out to disk.
+		 * NOTE: We do not set the ntfs inode dirty because this would
+		 * fail in ntfs_write_inode() because the inode does not have a
+		 * standard information attribute yet.  Also, there is no need
+		 * to set the inode dirty because the caller is going to do
+		 * that anyway after finishing with the new mft record (e.g. at
+		 * a minimum some new attributes will be added to the mft
+		 * record.
+		 */
+		mark_ntfs_record_dirty(page, ofs);
+		unlock_page(page);
+
+		/* Add the inode to the inode hash for the superblock. */
+		insert_inode_hash(vi);
+
+		/* Update the default mft allocation position. */
+		vol->mft_data_pos = bit + 1;
+	}
+	/*
+	 * Return the opened, allocated inode of the allocated mft record as
+	 * well as the mapped, pinned, and locked mft record.
+	 */
+	ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
+			base_ni ? "extent " : "", (long long)bit);
+	*mrec = m;
+	return ni;
+undo_data_init:
+	write_lock_irqsave(&mft_ni->size_lock, flags);
+	mft_ni->initialized_size = old_data_initialized;
+	i_size_write(vol->mft_ino, old_data_size);
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
+	goto undo_mftbmp_alloc_nolock;
+undo_mftbmp_alloc:
+	down_write(&vol->mftbmp_lock);
+undo_mftbmp_alloc_nolock:
+	if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
+		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
+		NVolSetErrors(vol);
+	}
+	up_write(&vol->mftbmp_lock);
+err_out:
+	return ERR_PTR(err);
+max_err_out:
+	ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
+			"number of inodes (2^32) has already been reached.");
+	up_write(&vol->mftbmp_lock);
+	return ERR_PTR(-ENOSPC);
+}
+
+/**
+ * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
+ * @ni:		ntfs inode of the mapped extent mft record to free
+ * @m:		mapped extent mft record of the ntfs inode @ni
+ *
+ * Free the mapped extent mft record @m of the extent ntfs inode @ni.
+ *
+ * Note that this function unmaps the mft record and closes and destroys @ni
+ * internally and hence you cannot use either @ni nor @m any more after this
+ * function returns success.
+ *
+ * On success return 0 and on error return -errno.  @ni and @m are still valid
+ * in this case and have not been freed.
+ *
+ * For some errors an error message is displayed and the success code 0 is
+ * returned and the volume is then left dirty on umount.  This makes sense in
+ * case we could not rollback the changes that were already done since the
+ * caller no longer wants to reference this mft record so it does not matter to
+ * the caller if something is wrong with it as long as it is properly detached
+ * from the base inode.
+ */
+int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
+{
+	unsigned long mft_no = ni->mft_no;
+	ntfs_volume *vol = ni->vol;
+	ntfs_inode *base_ni;
+	ntfs_inode **extent_nis;
+	int i, err;
+	le16 old_seq_no;
+	u16 seq_no;
+	
+	BUG_ON(NInoAttr(ni));
+	BUG_ON(ni->nr_extents != -1);
+
+	mutex_lock(&ni->extent_lock);
+	base_ni = ni->ext.base_ntfs_ino;
+	mutex_unlock(&ni->extent_lock);
+
+	BUG_ON(base_ni->nr_extents <= 0);
+
+	ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
+			mft_no, base_ni->mft_no);
+
+	mutex_lock(&base_ni->extent_lock);
+
+	/* Make sure we are holding the only reference to the extent inode. */
+	if (atomic_read(&ni->count) > 2) {
+		ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
+				"not freeing.", base_ni->mft_no);
+		mutex_unlock(&base_ni->extent_lock);
+		return -EBUSY;
+	}
+
+	/* Dissociate the ntfs inode from the base inode. */
+	extent_nis = base_ni->ext.extent_ntfs_inos;
+	err = -ENOENT;
+	for (i = 0; i < base_ni->nr_extents; i++) {
+		if (ni != extent_nis[i])
+			continue;
+		extent_nis += i;
+		base_ni->nr_extents--;
+		memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
+				sizeof(ntfs_inode*));
+		err = 0;
+		break;
+	}
+
+	mutex_unlock(&base_ni->extent_lock);
+
+	if (unlikely(err)) {
+		ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
+				"its base inode 0x%lx.", mft_no,
+				base_ni->mft_no);
+		BUG();
+	}
+
+	/*
+	 * The extent inode is no longer attached to the base inode so no one
+	 * can get a reference to it any more.
+	 */
+
+	/* Mark the mft record as not in use. */
+	m->flags &= ~MFT_RECORD_IN_USE;
+
+	/* Increment the sequence number, skipping zero, if it is not zero. */
+	old_seq_no = m->sequence_number;
+	seq_no = le16_to_cpu(old_seq_no);
+	if (seq_no == 0xffff)
+		seq_no = 1;
+	else if (seq_no)
+		seq_no++;
+	m->sequence_number = cpu_to_le16(seq_no);
+
+	/*
+	 * Set the ntfs inode dirty and write it out.  We do not need to worry
+	 * about the base inode here since whatever caused the extent mft
+	 * record to be freed is guaranteed to do it already.
+	 */
+	NInoSetDirty(ni);
+	err = write_mft_record(ni, m, 0);
+	if (unlikely(err)) {
+		ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
+				"freeing.", mft_no);
+		goto rollback;
+	}
+rollback_error:
+	/* Unmap and throw away the now freed extent inode. */
+	unmap_extent_mft_record(ni);
+	ntfs_clear_extent_inode(ni);
+
+	/* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
+	down_write(&vol->mftbmp_lock);
+	err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
+	up_write(&vol->mftbmp_lock);
+	if (unlikely(err)) {
+		/*
+		 * The extent inode is gone but we failed to deallocate it in
+		 * the mft bitmap.  Just emit a warning and leave the volume
+		 * dirty on umount.
+		 */
+		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
+		NVolSetErrors(vol);
+	}
+	return 0;
+rollback:
+	/* Rollback what we did... */
+	mutex_lock(&base_ni->extent_lock);
+	extent_nis = base_ni->ext.extent_ntfs_inos;
+	if (!(base_ni->nr_extents & 3)) {
+		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
+
+		extent_nis = kmalloc(new_size, GFP_NOFS);
+		if (unlikely(!extent_nis)) {
+			ntfs_error(vol->sb, "Failed to allocate internal "
+					"buffer during rollback.%s", es);
+			mutex_unlock(&base_ni->extent_lock);
+			NVolSetErrors(vol);
+			goto rollback_error;
+		}
+		if (base_ni->nr_extents) {
+			BUG_ON(!base_ni->ext.extent_ntfs_inos);
+			memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
+					new_size - 4 * sizeof(ntfs_inode*));
+			kfree(base_ni->ext.extent_ntfs_inos);
+		}
+		base_ni->ext.extent_ntfs_inos = extent_nis;
+	}
+	m->flags |= MFT_RECORD_IN_USE;
+	m->sequence_number = old_seq_no;
+	extent_nis[base_ni->nr_extents++] = ni;
+	mutex_unlock(&base_ni->extent_lock);
+	mark_mft_record_dirty(ni);
+	return err;
+}
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/mft.h b/kernel/fs/ntfs/mft.h
new file mode 100644
index 000000000..b52bf87b9
--- /dev/null
+++ b/kernel/fs/ntfs/mft.h
@@ -0,0 +1,124 @@
+/*
+ * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
+ *	   Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_MFT_H
+#define _LINUX_NTFS_MFT_H
+
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#include "inode.h"
+
+extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
+extern void unmap_mft_record(ntfs_inode *ni);
+
+extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
+		ntfs_inode **ntfs_ino);
+
+static inline void unmap_extent_mft_record(ntfs_inode *ni)
+{
+	unmap_mft_record(ni);
+	return;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * flush_dcache_mft_record_page - flush_dcache_page() for mft records
+ * @ni:		ntfs inode structure of mft record
+ *
+ * Call flush_dcache_page() for the page in which an mft record resides.
+ *
+ * This must be called every time an mft record is modified, just after the
+ * modification.
+ */
+static inline void flush_dcache_mft_record_page(ntfs_inode *ni)
+{
+	flush_dcache_page(ni->page);
+}
+
+extern void __mark_mft_record_dirty(ntfs_inode *ni);
+
+/**
+ * mark_mft_record_dirty - set the mft record and the page containing it dirty
+ * @ni:		ntfs inode describing the mapped mft record
+ *
+ * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
+ * as well as the page containing the mft record, dirty.  Also, mark the base
+ * vfs inode dirty.  This ensures that any changes to the mft record are
+ * written out to disk.
+ *
+ * NOTE:  Do not do anything if the mft record is already marked dirty.
+ */
+static inline void mark_mft_record_dirty(ntfs_inode *ni)
+{
+	if (!NInoTestSetDirty(ni))
+		__mark_mft_record_dirty(ni);
+}
+
+extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
+		MFT_RECORD *m, int sync);
+
+extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
+
+/**
+ * write_mft_record - write out a mapped (extent) mft record
+ * @ni:		ntfs inode describing the mapped (extent) mft record
+ * @m:		mapped (extent) mft record to write
+ * @sync:	if true, wait for i/o completion
+ *
+ * This is just a wrapper for write_mft_record_nolock() (see mft.c), which
+ * locks the page for the duration of the write.  This ensures that there are
+ * no race conditions between writing the mft record via the dirty inode code
+ * paths and via the page cache write back code paths or between writing
+ * neighbouring mft records residing in the same page.
+ *
+ * Locking the page also serializes us against ->readpage() if the page is not
+ * uptodate.
+ *
+ * On success, clean the mft record and return 0.  On error, leave the mft
+ * record dirty and return -errno.
+ */
+static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
+{
+	struct page *page = ni->page;
+	int err;
+
+	BUG_ON(!page);
+	lock_page(page);
+	err = write_mft_record_nolock(ni, m, sync);
+	unlock_page(page);
+	return err;
+}
+
+extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
+		const unsigned long mft_no, const MFT_RECORD *m,
+		ntfs_inode **locked_ni);
+
+extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
+		ntfs_inode *base_ni, MFT_RECORD **mrec);
+extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_MFT_H */
diff --git a/kernel/fs/ntfs/mst.c b/kernel/fs/ntfs/mst.c
new file mode 100644
index 000000000..5a858d839
--- /dev/null
+++ b/kernel/fs/ntfs/mst.c
@@ -0,0 +1,203 @@
+/*
+ * mst.c - NTFS multi sector transfer protection handling code. Part of the
+ *	   Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "ntfs.h"
+
+/**
+ * post_read_mst_fixup - deprotect multi sector transfer protected data
+ * @b:		pointer to the data to deprotect
+ * @size:	size in bytes of @b
+ *
+ * Perform the necessary post read multi sector transfer fixup and detect the
+ * presence of incomplete multi sector transfers. - In that case, overwrite the
+ * magic of the ntfs record header being processed with "BAAD" (in memory only!)
+ * and abort processing.
+ *
+ * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
+ *
+ * NOTE: We consider the absence / invalidity of an update sequence array to
+ * mean that the structure is not protected at all and hence doesn't need to
+ * be fixed up. Thus, we return success and not failure in this case. This is
+ * in contrast to pre_write_mst_fixup(), see below.
+ */
+int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
+{
+	u16 usa_ofs, usa_count, usn;
+	u16 *usa_pos, *data_pos;
+
+	/* Setup the variables. */
+	usa_ofs = le16_to_cpu(b->usa_ofs);
+	/* Decrement usa_count to get number of fixups. */
+	usa_count = le16_to_cpu(b->usa_count) - 1;
+	/* Size and alignment checks. */
+	if ( size & (NTFS_BLOCK_SIZE - 1)	||
+	     usa_ofs & 1			||
+	     usa_ofs + (usa_count * 2) > size	||
+	     (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
+		return 0;
+	/* Position of usn in update sequence array. */
+	usa_pos = (u16*)b + usa_ofs/sizeof(u16);
+	/*
+	 * The update sequence number which has to be equal to each of the
+	 * u16 values before they are fixed up. Note no need to care for
+	 * endianness since we are comparing and moving data for on disk
+	 * structures which means the data is consistent. - If it is
+	 * consistenty the wrong endianness it doesn't make any difference.
+	 */
+	usn = *usa_pos;
+	/*
+	 * Position in protected data of first u16 that needs fixing up.
+	 */
+	data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
+	/*
+	 * Check for incomplete multi sector transfer(s).
+	 */
+	while (usa_count--) {
+		if (*data_pos != usn) {
+			/*
+			 * Incomplete multi sector transfer detected! )-:
+			 * Set the magic to "BAAD" and return failure.
+			 * Note that magic_BAAD is already converted to le32.
+			 */
+			b->magic = magic_BAAD;
+			return -EINVAL;
+		}
+		data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
+	}
+	/* Re-setup the variables. */
+	usa_count = le16_to_cpu(b->usa_count) - 1;
+	data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
+	/* Fixup all sectors. */
+	while (usa_count--) {
+		/*
+		 * Increment position in usa and restore original data from
+		 * the usa into the data buffer.
+		 */
+		*data_pos = *(++usa_pos);
+		/* Increment position in data as well. */
+		data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
+	}
+	return 0;
+}
+
+/**
+ * pre_write_mst_fixup - apply multi sector transfer protection
+ * @b:		pointer to the data to protect
+ * @size:	size in bytes of @b
+ *
+ * Perform the necessary pre write multi sector transfer fixup on the data
+ * pointer to by @b of @size.
+ *
+ * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
+ * (assumed not needed). This is in contrast to post_read_mst_fixup() above.
+ *
+ * NOTE: We consider the absence / invalidity of an update sequence array to
+ * mean that the structure is not subject to protection and hence doesn't need
+ * to be fixed up. This means that you have to create a valid update sequence
+ * array header in the ntfs record before calling this function, otherwise it
+ * will fail (the header needs to contain the position of the update sequence
+ * array together with the number of elements in the array). You also need to
+ * initialise the update sequence number before calling this function
+ * otherwise a random word will be used (whatever was in the record at that
+ * position at that time).
+ */
+int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
+{
+	le16 *usa_pos, *data_pos;
+	u16 usa_ofs, usa_count, usn;
+	le16 le_usn;
+
+	/* Sanity check + only fixup if it makes sense. */
+	if (!b || ntfs_is_baad_record(b->magic) ||
+			ntfs_is_hole_record(b->magic))
+		return -EINVAL;
+	/* Setup the variables. */
+	usa_ofs = le16_to_cpu(b->usa_ofs);
+	/* Decrement usa_count to get number of fixups. */
+	usa_count = le16_to_cpu(b->usa_count) - 1;
+	/* Size and alignment checks. */
+	if ( size & (NTFS_BLOCK_SIZE - 1)	||
+	     usa_ofs & 1			||
+	     usa_ofs + (usa_count * 2) > size	||
+	     (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
+		return -EINVAL;
+	/* Position of usn in update sequence array. */
+	usa_pos = (le16*)((u8*)b + usa_ofs);
+	/*
+	 * Cyclically increment the update sequence number
+	 * (skipping 0 and -1, i.e. 0xffff).
+	 */
+	usn = le16_to_cpup(usa_pos) + 1;
+	if (usn == 0xffff || !usn)
+		usn = 1;
+	le_usn = cpu_to_le16(usn);
+	*usa_pos = le_usn;
+	/* Position in data of first u16 that needs fixing up. */
+	data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
+	/* Fixup all sectors. */
+	while (usa_count--) {
+		/*
+		 * Increment the position in the usa and save the
+		 * original data from the data buffer into the usa.
+		 */
+		*(++usa_pos) = *data_pos;
+		/* Apply fixup to data. */
+		*data_pos = le_usn;
+		/* Increment position in data as well. */
+		data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
+	}
+	return 0;
+}
+
+/**
+ * post_write_mst_fixup - fast deprotect multi sector transfer protected data
+ * @b:		pointer to the data to deprotect
+ *
+ * Perform the necessary post write multi sector transfer fixup, not checking
+ * for any errors, because we assume we have just used pre_write_mst_fixup(),
+ * thus the data will be fine or we would never have gotten here.
+ */
+void post_write_mst_fixup(NTFS_RECORD *b)
+{
+	le16 *usa_pos, *data_pos;
+
+	u16 usa_ofs = le16_to_cpu(b->usa_ofs);
+	u16 usa_count = le16_to_cpu(b->usa_count) - 1;
+
+	/* Position of usn in update sequence array. */
+	usa_pos = (le16*)b + usa_ofs/sizeof(le16);
+
+	/* Position in protected data of first u16 that needs fixing up. */
+	data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
+
+	/* Fixup all sectors. */
+	while (usa_count--) {
+		/*
+		 * Increment position in usa and restore original data from
+		 * the usa into the data buffer.
+		 */
+		*data_pos = *(++usa_pos);
+
+		/* Increment position in data as well. */
+		data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
+	}
+}
diff --git a/kernel/fs/ntfs/namei.c b/kernel/fs/ntfs/namei.c
new file mode 100644
index 000000000..0f35b80d1
--- /dev/null
+++ b/kernel/fs/ntfs/namei.c
@@ -0,0 +1,405 @@
+/*
+ * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
+ *	     project.
+ *
+ * Copyright (c) 2001-2006 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/dcache.h>
+#include <linux/exportfs.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+
+#include "attrib.h"
+#include "debug.h"
+#include "dir.h"
+#include "mft.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_lookup - find the inode represented by a dentry in a directory inode
+ * @dir_ino:	directory inode in which to look for the inode
+ * @dent:	dentry representing the inode to look for
+ * @nd:		lookup nameidata
+ *
+ * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
+ * in the directory inode @dir_ino and if found attaches the inode to the
+ * dentry @dent.
+ *
+ * In more detail, the dentry @dent specifies which inode to look for by
+ * supplying the name of the inode in @dent->d_name.name. ntfs_lookup()
+ * converts the name to Unicode and walks the contents of the directory inode
+ * @dir_ino looking for the converted Unicode name. If the name is found in the
+ * directory, the corresponding inode is loaded by calling ntfs_iget() on its
+ * inode number and the inode is associated with the dentry @dent via a call to
+ * d_splice_alias().
+ *
+ * If the name is not found in the directory, a NULL inode is inserted into the
+ * dentry @dent via a call to d_add(). The dentry is then termed a negative
+ * dentry.
+ *
+ * Only if an actual error occurs, do we return an error via ERR_PTR().
+ *
+ * In order to handle the case insensitivity issues of NTFS with regards to the
+ * dcache and the dcache requiring only one dentry per directory, we deal with
+ * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining
+ * a case sensitive dcache. This means that we get the full benefit of dcache
+ * speed when the file/directory is looked up with the same case as returned by
+ * ->ntfs_readdir() but that a lookup for any other case (or for the short file
+ * name) will not find anything in dcache and will enter ->ntfs_lookup()
+ * instead, where we search the directory for a fully matching file name
+ * (including case) and if that is not found, we search for a file name that
+ * matches with different case and if that has non-POSIX semantics we return
+ * that. We actually do only one search (case sensitive) and keep tabs on
+ * whether we have found a case insensitive match in the process.
+ *
+ * To simplify matters for us, we do not treat the short vs long filenames as
+ * two hard links but instead if the lookup matches a short filename, we
+ * return the dentry for the corresponding long filename instead.
+ *
+ * There are three cases we need to distinguish here:
+ *
+ * 1) @dent perfectly matches (i.e. including case) a directory entry with a
+ *    file name in the WIN32 or POSIX namespaces. In this case
+ *    ntfs_lookup_inode_by_name() will return with name set to NULL and we
+ *    just d_splice_alias() @dent.
+ * 2) @dent matches (not including case) a directory entry with a file name in
+ *    the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return
+ *    with name set to point to a kmalloc()ed ntfs_name structure containing
+ *    the properly cased little endian Unicode name. We convert the name to the
+ *    current NLS code page, search if a dentry with this name already exists
+ *    and if so return that instead of @dent.  At this point things are
+ *    complicated by the possibility of 'disconnected' dentries due to NFS
+ *    which we deal with appropriately (see the code comments).  The VFS will
+ *    then destroy the old @dent and use the one we returned.  If a dentry is
+ *    not found, we allocate a new one, d_splice_alias() it, and return it as
+ *    above.
+ * 3) @dent matches either perfectly or not (i.e. we don't care about case) a
+ *    directory entry with a file name in the DOS namespace. In this case
+ *    ntfs_lookup_inode_by_name() will return with name set to point to a
+ *    kmalloc()ed ntfs_name structure containing the mft reference (cpu endian)
+ *    of the inode. We use the mft reference to read the inode and to find the
+ *    file name in the WIN32 namespace corresponding to the matched short file
+ *    name. We then convert the name to the current NLS code page, and proceed
+ *    searching for a dentry with this name, etc, as in case 2), above.
+ *
+ * Locking: Caller must hold i_mutex on the directory.
+ */
+static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
+		unsigned int flags)
+{
+	ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
+	struct inode *dent_inode;
+	ntfschar *uname;
+	ntfs_name *name = NULL;
+	MFT_REF mref;
+	unsigned long dent_ino;
+	int uname_len;
+
+	ntfs_debug("Looking up %pd in directory inode 0x%lx.",
+			dent, dir_ino->i_ino);
+	/* Convert the name of the dentry to Unicode. */
+	uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
+			&uname);
+	if (uname_len < 0) {
+		if (uname_len != -ENAMETOOLONG)
+			ntfs_error(vol->sb, "Failed to convert name to "
+					"Unicode.");
+		return ERR_PTR(uname_len);
+	}
+	mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
+			&name);
+	kmem_cache_free(ntfs_name_cache, uname);
+	if (!IS_ERR_MREF(mref)) {
+		dent_ino = MREF(mref);
+		ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
+		dent_inode = ntfs_iget(vol->sb, dent_ino);
+		if (likely(!IS_ERR(dent_inode))) {
+			/* Consistency check. */
+			if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
+					NTFS_I(dent_inode)->seq_no ||
+					dent_ino == FILE_MFT) {
+				/* Perfect WIN32/POSIX match. -- Case 1. */
+				if (!name) {
+					ntfs_debug("Done.  (Case 1.)");
+					return d_splice_alias(dent_inode, dent);
+				}
+				/*
+				 * We are too indented.  Handle imperfect
+				 * matches and short file names further below.
+				 */
+				goto handle_name;
+			}
+			ntfs_error(vol->sb, "Found stale reference to inode "
+					"0x%lx (reference sequence number = "
+					"0x%x, inode sequence number = 0x%x), "
+					"returning -EIO. Run chkdsk.",
+					dent_ino, MSEQNO(mref),
+					NTFS_I(dent_inode)->seq_no);
+			iput(dent_inode);
+			dent_inode = ERR_PTR(-EIO);
+		} else
+			ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
+					"error code %li.", dent_ino,
+					PTR_ERR(dent_inode));
+		kfree(name);
+		/* Return the error code. */
+		return (struct dentry *)dent_inode;
+	}
+	/* It is guaranteed that @name is no longer allocated at this point. */
+	if (MREF_ERR(mref) == -ENOENT) {
+		ntfs_debug("Entry was not found, adding negative dentry.");
+		/* The dcache will handle negative entries. */
+		d_add(dent, NULL);
+		ntfs_debug("Done.");
+		return NULL;
+	}
+	ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error "
+			"code %i.", -MREF_ERR(mref));
+	return ERR_PTR(MREF_ERR(mref));
+	// TODO: Consider moving this lot to a separate function! (AIA)
+handle_name:
+   {
+	MFT_RECORD *m;
+	ntfs_attr_search_ctx *ctx;
+	ntfs_inode *ni = NTFS_I(dent_inode);
+	int err;
+	struct qstr nls_name;
+
+	nls_name.name = NULL;
+	if (name->type != FILE_NAME_DOS) {			/* Case 2. */
+		ntfs_debug("Case 2.");
+		nls_name.len = (unsigned)ntfs_ucstonls(vol,
+				(ntfschar*)&name->name, name->len,
+				(unsigned char**)&nls_name.name, 0);
+		kfree(name);
+	} else /* if (name->type == FILE_NAME_DOS) */ {		/* Case 3. */
+		FILE_NAME_ATTR *fn;
+
+		ntfs_debug("Case 3.");
+		kfree(name);
+
+		/* Find the WIN32 name corresponding to the matched DOS name. */
+		ni = NTFS_I(dent_inode);
+		m = map_mft_record(ni);
+		if (IS_ERR(m)) {
+			err = PTR_ERR(m);
+			m = NULL;
+			ctx = NULL;
+			goto err_out;
+		}
+		ctx = ntfs_attr_get_search_ctx(ni, m);
+		if (unlikely(!ctx)) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+		do {
+			ATTR_RECORD *a;
+			u32 val_len;
+
+			err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0,
+					NULL, 0, ctx);
+			if (unlikely(err)) {
+				ntfs_error(vol->sb, "Inode corrupt: No WIN32 "
+						"namespace counterpart to DOS "
+						"file name. Run chkdsk.");
+				if (err == -ENOENT)
+					err = -EIO;
+				goto err_out;
+			}
+			/* Consistency checks. */
+			a = ctx->attr;
+			if (a->non_resident || a->flags)
+				goto eio_err_out;
+			val_len = le32_to_cpu(a->data.resident.value_length);
+			if (le16_to_cpu(a->data.resident.value_offset) +
+					val_len > le32_to_cpu(a->length))
+				goto eio_err_out;
+			fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu(
+					ctx->attr->data.resident.value_offset));
+			if ((u32)(fn->file_name_length * sizeof(ntfschar) +
+					sizeof(FILE_NAME_ATTR)) > val_len)
+				goto eio_err_out;
+		} while (fn->file_name_type != FILE_NAME_WIN32);
+
+		/* Convert the found WIN32 name to current NLS code page. */
+		nls_name.len = (unsigned)ntfs_ucstonls(vol,
+				(ntfschar*)&fn->file_name, fn->file_name_length,
+				(unsigned char**)&nls_name.name, 0);
+
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(ni);
+	}
+	m = NULL;
+	ctx = NULL;
+
+	/* Check if a conversion error occurred. */
+	if ((signed)nls_name.len < 0) {
+		err = (signed)nls_name.len;
+		goto err_out;
+	}
+	nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
+
+	dent = d_add_ci(dent, dent_inode, &nls_name);
+	kfree(nls_name.name);
+	return dent;
+
+eio_err_out:
+	ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
+	err = -EIO;
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(ni);
+	iput(dent_inode);
+	ntfs_error(vol->sb, "Failed, returning error code %i.", err);
+	return ERR_PTR(err);
+   }
+}
+
+/**
+ * Inode operations for directories.
+ */
+const struct inode_operations ntfs_dir_inode_ops = {
+	.lookup	= ntfs_lookup,	/* VFS: Lookup directory. */
+};
+
+/**
+ * ntfs_get_parent - find the dentry of the parent of a given directory dentry
+ * @child_dent:		dentry of the directory whose parent directory to find
+ *
+ * Find the dentry for the parent directory of the directory specified by the
+ * dentry @child_dent.  This function is called from
+ * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the
+ * default ->decode_fh() which is export_decode_fh() in the same file.
+ *
+ * The code is based on the ext3 ->get_parent() implementation found in
+ * fs/ext3/namei.c::ext3_get_parent().
+ *
+ * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down.
+ *
+ * Return the dentry of the parent directory on success or the error code on
+ * error (IS_ERR() is true).
+ */
+static struct dentry *ntfs_get_parent(struct dentry *child_dent)
+{
+	struct inode *vi = d_inode(child_dent);
+	ntfs_inode *ni = NTFS_I(vi);
+	MFT_RECORD *mrec;
+	ntfs_attr_search_ctx *ctx;
+	ATTR_RECORD *attr;
+	FILE_NAME_ATTR *fn;
+	unsigned long parent_ino;
+	int err;
+
+	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+	/* Get the mft record of the inode belonging to the child dentry. */
+	mrec = map_mft_record(ni);
+	if (IS_ERR(mrec))
+		return (struct dentry *)mrec;
+	/* Find the first file name attribute in the mft record. */
+	ctx = ntfs_attr_get_search_ctx(ni, mrec);
+	if (unlikely(!ctx)) {
+		unmap_mft_record(ni);
+		return ERR_PTR(-ENOMEM);
+	}
+try_next:
+	err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL,
+			0, ctx);
+	if (unlikely(err)) {
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(ni);
+		if (err == -ENOENT)
+			ntfs_error(vi->i_sb, "Inode 0x%lx does not have a "
+					"file name attribute.  Run chkdsk.",
+					vi->i_ino);
+		return ERR_PTR(err);
+	}
+	attr = ctx->attr;
+	if (unlikely(attr->non_resident))
+		goto try_next;
+	fn = (FILE_NAME_ATTR *)((u8 *)attr +
+			le16_to_cpu(attr->data.resident.value_offset));
+	if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) >
+			(u8*)attr + le32_to_cpu(attr->length)))
+		goto try_next;
+	/* Get the inode number of the parent directory. */
+	parent_ino = MREF_LE(fn->parent_directory);
+	/* Release the search context and the mft record of the child. */
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(ni);
+
+	return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
+}
+
+static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	inode = ntfs_iget(sb, ino);
+	if (!IS_ERR(inode)) {
+		if (is_bad_inode(inode) || inode->i_generation != generation) {
+			iput(inode);
+			inode = ERR_PTR(-ESTALE);
+		}
+	}
+
+	return inode;
+}
+
+static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    ntfs_nfs_get_inode);
+}
+
+static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    ntfs_nfs_get_inode);
+}
+
+/**
+ * Export operations allowing NFS exporting of mounted NTFS partitions.
+ *
+ * We use the default ->encode_fh() for now.  Note that they
+ * use 32 bits to store the inode number which is an unsigned long so on 64-bit
+ * architectures is usually 64 bits so it would all fail horribly on huge
+ * volumes.  I guess we need to define our own encode and decode fh functions
+ * that store 64-bit inode numbers at some point but for now we will ignore the
+ * problem...
+ *
+ * We also use the default ->get_name() helper (used by ->decode_fh() via
+ * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
+ * independent.
+ *
+ * The default ->get_parent() just returns -EACCES so we have to provide our
+ * own and the default ->get_dentry() is incompatible with NTFS due to not
+ * allowing the inode number 0 which is used in NTFS for the system file $MFT
+ * and due to using iget() whereas NTFS needs ntfs_iget().
+ */
+const struct export_operations ntfs_export_ops = {
+	.get_parent	= ntfs_get_parent,	/* Find the parent of a given
+						   directory. */
+	.fh_to_dentry	= ntfs_fh_to_dentry,
+	.fh_to_parent	= ntfs_fh_to_parent,
+};
diff --git a/kernel/fs/ntfs/ntfs.h b/kernel/fs/ntfs/ntfs.h
new file mode 100644
index 000000000..c581e26a3
--- /dev/null
+++ b/kernel/fs/ntfs/ntfs.h
@@ -0,0 +1,164 @@
+/*
+ * ntfs.h - Defines for NTFS Linux kernel driver.
+ *
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (C) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_H
+#define _LINUX_NTFS_H
+
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/nls.h>
+#include <linux/smp.h>
+#include <linux/pagemap.h>
+
+#include "types.h"
+#include "volume.h"
+#include "layout.h"
+
+typedef enum {
+	NTFS_BLOCK_SIZE		= 512,
+	NTFS_BLOCK_SIZE_BITS	= 9,
+	NTFS_SB_MAGIC		= 0x5346544e,	/* 'NTFS' */
+	NTFS_MAX_NAME_LEN	= 255,
+	NTFS_MAX_ATTR_NAME_LEN	= 255,
+	NTFS_MAX_CLUSTER_SIZE	= 64 * 1024,	/* 64kiB */
+	NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE,
+} NTFS_CONSTANTS;
+
+/* Global variables. */
+
+/* Slab caches (from super.c). */
+extern struct kmem_cache *ntfs_name_cache;
+extern struct kmem_cache *ntfs_inode_cache;
+extern struct kmem_cache *ntfs_big_inode_cache;
+extern struct kmem_cache *ntfs_attr_ctx_cache;
+extern struct kmem_cache *ntfs_index_ctx_cache;
+
+/* The various operations structs defined throughout the driver files. */
+extern const struct address_space_operations ntfs_normal_aops;
+extern const struct address_space_operations ntfs_compressed_aops;
+extern const struct address_space_operations ntfs_mst_aops;
+
+extern const struct  file_operations ntfs_file_ops;
+extern const struct inode_operations ntfs_file_inode_ops;
+
+extern const struct  file_operations ntfs_dir_ops;
+extern const struct inode_operations ntfs_dir_inode_ops;
+
+extern const struct  file_operations ntfs_empty_file_ops;
+extern const struct inode_operations ntfs_empty_inode_ops;
+
+extern const struct export_operations ntfs_export_ops;
+
+/**
+ * NTFS_SB - return the ntfs volume given a vfs super block
+ * @sb:		VFS super block
+ *
+ * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb.
+ */
+static inline ntfs_volume *NTFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* Declarations of functions and global variables. */
+
+/* From fs/ntfs/compress.c */
+extern int ntfs_read_compressed_block(struct page *page);
+extern int allocate_compression_buffers(void);
+extern void free_compression_buffers(void);
+
+/* From fs/ntfs/super.c */
+#define default_upcase_len 0x10000
+extern struct mutex ntfs_lock;
+
+typedef struct {
+	int val;
+	char *str;
+} option_t;
+extern const option_t on_errors_arr[];
+
+/* From fs/ntfs/mst.c */
+extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
+extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
+extern void post_write_mst_fixup(NTFS_RECORD *b);
+
+/* From fs/ntfs/unistr.c */
+extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+		const ntfschar *s2, size_t s2_len,
+		const IGNORE_CASE_BOOL ic,
+		const ntfschar *upcase, const u32 upcase_size);
+extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
+		const ntfschar *name2, const u32 name2_len,
+		const int err_val, const IGNORE_CASE_BOOL ic,
+		const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n);
+extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
+		const ntfschar *upcase, const u32 upcase_size);
+extern void ntfs_upcase_name(ntfschar *name, u32 name_len,
+		const ntfschar *upcase, const u32 upcase_len);
+extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
+		const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
+		FILE_NAME_ATTR *file_name_attr2,
+		const int err_val, const IGNORE_CASE_BOOL ic,
+		const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
+		const int ins_len, ntfschar **outs);
+extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
+		const int ins_len, unsigned char **outs, int outs_len);
+
+/* From fs/ntfs/upcase.c */
+extern ntfschar *generate_default_upcase(void);
+
+static inline int ntfs_ffs(int x)
+{
+	int r = 1;
+
+	if (!x)
+		return 0;
+	if (!(x & 0xffff)) {
+		x >>= 16;
+		r += 16;
+	}
+	if (!(x & 0xff)) {
+		x >>= 8;
+		r += 8;
+	}
+	if (!(x & 0xf)) {
+		x >>= 4;
+		r += 4;
+	}
+	if (!(x & 3)) {
+		x >>= 2;
+		r += 2;
+	}
+	if (!(x & 1)) {
+		x >>= 1;
+		r += 1;
+	}
+	return r;
+}
+
+#endif /* _LINUX_NTFS_H */
diff --git a/kernel/fs/ntfs/quota.c b/kernel/fs/ntfs/quota.c
new file mode 100644
index 000000000..d80e3315c
--- /dev/null
+++ b/kernel/fs/ntfs/quota.c
@@ -0,0 +1,117 @@
+/*
+ * quota.c - NTFS kernel quota ($Quota) handling.  Part of the Linux-NTFS
+ *	     project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifdef NTFS_RW
+
+#include "index.h"
+#include "quota.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
+ * @vol:	ntfs volume on which to mark the quotas out of date
+ *
+ * Mark the quotas out of date on the ntfs volume @vol and return 'true' on
+ * success and 'false' on error.
+ */
+bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
+{
+	ntfs_index_context *ictx;
+	QUOTA_CONTROL_ENTRY *qce;
+	const le32 qid = QUOTA_DEFAULTS_ID;
+	int err;
+
+	ntfs_debug("Entering.");
+	if (NVolQuotaOutOfDate(vol))
+		goto done;
+	if (!vol->quota_ino || !vol->quota_q_ino) {
+		ntfs_error(vol->sb, "Quota inodes are not open.");
+		return false;
+	}
+	mutex_lock(&vol->quota_q_ino->i_mutex);
+	ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
+	if (!ictx) {
+		ntfs_error(vol->sb, "Failed to get index context.");
+		goto err_out;
+	}
+	err = ntfs_index_lookup(&qid, sizeof(qid), ictx);
+	if (err) {
+		if (err == -ENOENT)
+			ntfs_error(vol->sb, "Quota defaults entry is not "
+					"present.");
+		else
+			ntfs_error(vol->sb, "Lookup of quota defaults entry "
+					"failed.");
+		goto err_out;
+	}
+	if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) {
+		ntfs_error(vol->sb, "Quota defaults entry size is invalid.  "
+				"Run chkdsk.");
+		goto err_out;
+	}
+	qce = (QUOTA_CONTROL_ENTRY*)ictx->data;
+	if (le32_to_cpu(qce->version) != QUOTA_VERSION) {
+		ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not "
+				"supported.", le32_to_cpu(qce->version));
+		goto err_out;
+	}
+	ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags));
+	/* If quotas are already marked out of date, no need to do anything. */
+	if (qce->flags & QUOTA_FLAG_OUT_OF_DATE)
+		goto set_done;
+	/*
+	 * If quota tracking is neither requested, nor enabled and there are no
+	 * pending deletes, no need to mark the quotas out of date.
+	 */
+	if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED |
+			QUOTA_FLAG_TRACKING_REQUESTED |
+			QUOTA_FLAG_PENDING_DELETES)))
+		goto set_done;
+	/*
+	 * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date.
+	 * This is verified on WinXP to be sufficient to cause windows to
+	 * rescan the volume on boot and update all quota entries.
+	 */
+	qce->flags |= QUOTA_FLAG_OUT_OF_DATE;
+	/* Ensure the modified flags are written to disk. */
+	ntfs_index_entry_flush_dcache_page(ictx);
+	ntfs_index_entry_mark_dirty(ictx);
+set_done:
+	ntfs_index_ctx_put(ictx);
+	mutex_unlock(&vol->quota_q_ino->i_mutex);
+	/*
+	 * We set the flag so we do not try to mark the quotas out of date
+	 * again on remount.
+	 */
+	NVolSetQuotaOutOfDate(vol);
+done:
+	ntfs_debug("Done.");
+	return true;
+err_out:
+	if (ictx)
+		ntfs_index_ctx_put(ictx);
+	mutex_unlock(&vol->quota_q_ino->i_mutex);
+	return false;
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/quota.h b/kernel/fs/ntfs/quota.h
new file mode 100644
index 000000000..4cbe5594c
--- /dev/null
+++ b/kernel/fs/ntfs/quota.h
@@ -0,0 +1,35 @@
+/*
+ * quota.h - Defines for NTFS kernel quota ($Quota) handling.  Part of the
+ *	     Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_QUOTA_H
+#define _LINUX_NTFS_QUOTA_H
+
+#ifdef NTFS_RW
+
+#include "types.h"
+#include "volume.h"
+
+extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_QUOTA_H */
diff --git a/kernel/fs/ntfs/runlist.c b/kernel/fs/ntfs/runlist.c
new file mode 100644
index 000000000..eac7d6788
--- /dev/null
+++ b/kernel/fs/ntfs/runlist.c
@@ -0,0 +1,1907 @@
+/**
+ * runlist.c - NTFS runlist handling code.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2002-2005 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "debug.h"
+#include "dir.h"
+#include "endian.h"
+#include "malloc.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_rl_mm - runlist memmove
+ *
+ * It is up to the caller to serialize access to the runlist @base.
+ */
+static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
+		int size)
+{
+	if (likely((dst != src) && (size > 0)))
+		memmove(base + dst, base + src, size * sizeof(*base));
+}
+
+/**
+ * ntfs_rl_mc - runlist memory copy
+ *
+ * It is up to the caller to serialize access to the runlists @dstbase and
+ * @srcbase.
+ */
+static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
+		runlist_element *srcbase, int src, int size)
+{
+	if (likely(size > 0))
+		memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
+}
+
+/**
+ * ntfs_rl_realloc - Reallocate memory for runlists
+ * @rl:		original runlist
+ * @old_size:	number of runlist elements in the original runlist @rl
+ * @new_size:	number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required.  To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B.  If the new allocation doesn't require a different number of pages in
+ *       memory, the function will return the original pointer.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * On error, return -errno. The following error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
+		int old_size, int new_size)
+{
+	runlist_element *new_rl;
+
+	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+	if (old_size == new_size)
+		return rl;
+
+	new_rl = ntfs_malloc_nofs(new_size);
+	if (unlikely(!new_rl))
+		return ERR_PTR(-ENOMEM);
+
+	if (likely(rl != NULL)) {
+		if (unlikely(old_size > new_size))
+			old_size = new_size;
+		memcpy(new_rl, rl, old_size);
+		ntfs_free(rl);
+	}
+	return new_rl;
+}
+
+/**
+ * ntfs_rl_realloc_nofail - Reallocate memory for runlists
+ * @rl:		original runlist
+ * @old_size:	number of runlist elements in the original runlist @rl
+ * @new_size:	number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required.  To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B.  If the new allocation doesn't require a different number of pages in
+ *       memory, the function will return the original pointer.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * On error, return -errno. The following error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
+		int old_size, int new_size)
+{
+	runlist_element *new_rl;
+
+	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+	if (old_size == new_size)
+		return rl;
+
+	new_rl = ntfs_malloc_nofs_nofail(new_size);
+	BUG_ON(!new_rl);
+
+	if (likely(rl != NULL)) {
+		if (unlikely(old_size > new_size))
+			old_size = new_size;
+		memcpy(new_rl, rl, old_size);
+		ntfs_free(rl);
+	}
+	return new_rl;
+}
+
+/**
+ * ntfs_are_rl_mergeable - test if two runlists can be joined together
+ * @dst:	original runlist
+ * @src:	new runlist to test for mergeability with @dst
+ *
+ * Test if two runlists can be joined together. For this, their VCNs and LCNs
+ * must be adjacent.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * Return: true   Success, the runlists can be merged.
+ *	   false  Failure, the runlists cannot be merged.
+ */
+static inline bool ntfs_are_rl_mergeable(runlist_element *dst,
+		runlist_element *src)
+{
+	BUG_ON(!dst);
+	BUG_ON(!src);
+
+	/* We can merge unmapped regions even if they are misaligned. */
+	if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
+		return true;
+	/* If the runs are misaligned, we cannot merge them. */
+	if ((dst->vcn + dst->length) != src->vcn)
+		return false;
+	/* If both runs are non-sparse and contiguous, we can merge them. */
+	if ((dst->lcn >= 0) && (src->lcn >= 0) &&
+			((dst->lcn + dst->length) == src->lcn))
+		return true;
+	/* If we are merging two holes, we can merge them. */
+	if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
+		return true;
+	/* Cannot merge. */
+	return false;
+}
+
+/**
+ * __ntfs_rl_merge - merge two runlists without testing if they can be merged
+ * @dst:	original, destination runlist
+ * @src:	new runlist to merge with @dst
+ *
+ * Merge the two runlists, writing into the destination runlist @dst. The
+ * caller must make sure the runlists can be merged or this will corrupt the
+ * destination runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ */
+static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
+{
+	dst->length += src->length;
+}
+
+/**
+ * ntfs_rl_append - append a runlist after a given element
+ * @dst:	original runlist to be worked on
+ * @dsize:	number of elements in @dst (including end marker)
+ * @src:	runlist to be inserted into @dst
+ * @ssize:	number of elements in @src (excluding end marker)
+ * @loc:	append the new runlist @src after this element in @dst
+ *
+ * Append the runlist @src after element @loc in @dst.  Merge the right end of
+ * the new runlist, if necessary. Adjust the size of the hole before the
+ * appended runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_append(runlist_element *dst,
+		int dsize, runlist_element *src, int ssize, int loc)
+{
+	bool right = false;	/* Right end of @src needs merging. */
+	int marker;		/* End of the inserted runs. */
+
+	BUG_ON(!dst);
+	BUG_ON(!src);
+
+	/* First, check if the right hand end needs merging. */
+	if ((loc + 1) < dsize)
+		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+
+	/* Space required: @dst size + @src size, less one if we merged. */
+	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
+	if (IS_ERR(dst))
+		return dst;
+	/*
+	 * We are guaranteed to succeed from here so can start modifying the
+	 * original runlists.
+	 */
+
+	/* First, merge the right hand end, if necessary. */
+	if (right)
+		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
+
+	/* First run after the @src runs that have been inserted. */
+	marker = loc + ssize + 1;
+
+	/* Move the tail of @dst out of the way, then copy in @src. */
+	ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right));
+	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
+
+	/* Adjust the size of the preceding hole. */
+	dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
+
+	/* We may have changed the length of the file, so fix the end marker */
+	if (dst[marker].lcn == LCN_ENOENT)
+		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
+
+	return dst;
+}
+
+/**
+ * ntfs_rl_insert - insert a runlist into another
+ * @dst:	original runlist to be worked on
+ * @dsize:	number of elements in @dst (including end marker)
+ * @src:	new runlist to be inserted
+ * @ssize:	number of elements in @src (excluding end marker)
+ * @loc:	insert the new runlist @src before this element in @dst
+ *
+ * Insert the runlist @src before element @loc in the runlist @dst. Merge the
+ * left end of the new runlist, if necessary. Adjust the size of the hole
+ * after the inserted runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
+		int dsize, runlist_element *src, int ssize, int loc)
+{
+	bool left = false;	/* Left end of @src needs merging. */
+	bool disc = false;	/* Discontinuity between @dst and @src. */
+	int marker;		/* End of the inserted runs. */
+
+	BUG_ON(!dst);
+	BUG_ON(!src);
+
+	/*
+	 * disc => Discontinuity between the end of @dst and the start of @src.
+	 *	   This means we might need to insert a "not mapped" run.
+	 */
+	if (loc == 0)
+		disc = (src[0].vcn > 0);
+	else {
+		s64 merged_length;
+
+		left = ntfs_are_rl_mergeable(dst + loc - 1, src);
+
+		merged_length = dst[loc - 1].length;
+		if (left)
+			merged_length += src->length;
+
+		disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
+	}
+	/*
+	 * Space required: @dst size + @src size, less one if we merged, plus
+	 * one if there was a discontinuity.
+	 */
+	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc);
+	if (IS_ERR(dst))
+		return dst;
+	/*
+	 * We are guaranteed to succeed from here so can start modifying the
+	 * original runlist.
+	 */
+	if (left)
+		__ntfs_rl_merge(dst + loc - 1, src);
+	/*
+	 * First run after the @src runs that have been inserted.
+	 * Nominally,  @marker equals @loc + @ssize, i.e. location + number of
+	 * runs in @src.  However, if @left, then the first run in @src has
+	 * been merged with one in @dst.  And if @disc, then @dst and @src do
+	 * not meet and we need an extra run to fill the gap.
+	 */
+	marker = loc + ssize - left + disc;
+
+	/* Move the tail of @dst out of the way, then copy in @src. */
+	ntfs_rl_mm(dst, marker, loc, dsize - loc);
+	ntfs_rl_mc(dst, loc + disc, src, left, ssize - left);
+
+	/* Adjust the VCN of the first run after the insertion... */
+	dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
+	/* ... and the length. */
+	if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED)
+		dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn;
+
+	/* Writing beyond the end of the file and there is a discontinuity. */
+	if (disc) {
+		if (loc > 0) {
+			dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length;
+			dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
+		} else {
+			dst[loc].vcn = 0;
+			dst[loc].length = dst[loc + 1].vcn;
+		}
+		dst[loc].lcn = LCN_RL_NOT_MAPPED;
+	}
+	return dst;
+}
+
+/**
+ * ntfs_rl_replace - overwrite a runlist element with another runlist
+ * @dst:	original runlist to be worked on
+ * @dsize:	number of elements in @dst (including end marker)
+ * @src:	new runlist to be inserted
+ * @ssize:	number of elements in @src (excluding end marker)
+ * @loc:	index in runlist @dst to overwrite with @src
+ *
+ * Replace the runlist element @dst at @loc with @src. Merge the left and
+ * right ends of the inserted runlist, if necessary.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
+		int dsize, runlist_element *src, int ssize, int loc)
+{
+	signed delta;
+	bool left = false;	/* Left end of @src needs merging. */
+	bool right = false;	/* Right end of @src needs merging. */
+	int tail;		/* Start of tail of @dst. */
+	int marker;		/* End of the inserted runs. */
+
+	BUG_ON(!dst);
+	BUG_ON(!src);
+
+	/* First, see if the left and right ends need merging. */
+	if ((loc + 1) < dsize)
+		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+	if (loc > 0)
+		left = ntfs_are_rl_mergeable(dst + loc - 1, src);
+	/*
+	 * Allocate some space.  We will need less if the left, right, or both
+	 * ends get merged.  The -1 accounts for the run being replaced.
+	 */
+	delta = ssize - 1 - left - right;
+	if (delta > 0) {
+		dst = ntfs_rl_realloc(dst, dsize, dsize + delta);
+		if (IS_ERR(dst))
+			return dst;
+	}
+	/*
+	 * We are guaranteed to succeed from here so can start modifying the
+	 * original runlists.
+	 */
+
+	/* First, merge the left and right ends, if necessary. */
+	if (right)
+		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
+	if (left)
+		__ntfs_rl_merge(dst + loc - 1, src);
+	/*
+	 * Offset of the tail of @dst.  This needs to be moved out of the way
+	 * to make space for the runs to be copied from @src, i.e. the first
+	 * run of the tail of @dst.
+	 * Nominally, @tail equals @loc + 1, i.e. location, skipping the
+	 * replaced run.  However, if @right, then one of @dst's runs is
+	 * already merged into @src.
+	 */
+	tail = loc + right + 1;
+	/*
+	 * First run after the @src runs that have been inserted, i.e. where
+	 * the tail of @dst needs to be moved to.
+	 * Nominally, @marker equals @loc + @ssize, i.e. location + number of
+	 * runs in @src.  However, if @left, then the first run in @src has
+	 * been merged with one in @dst.
+	 */
+	marker = loc + ssize - left;
+
+	/* Move the tail of @dst out of the way, then copy in @src. */
+	ntfs_rl_mm(dst, marker, tail, dsize - tail);
+	ntfs_rl_mc(dst, loc, src, left, ssize - left);
+
+	/* We may have changed the length of the file, so fix the end marker. */
+	if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
+		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
+	return dst;
+}
+
+/**
+ * ntfs_rl_split - insert a runlist into the centre of a hole
+ * @dst:	original runlist to be worked on
+ * @dsize:	number of elements in @dst (including end marker)
+ * @src:	new runlist to be inserted
+ * @ssize:	number of elements in @src (excluding end marker)
+ * @loc:	index in runlist @dst at which to split and insert @src
+ *
+ * Split the runlist @dst at @loc into two and insert @new in between the two
+ * fragments. No merging of runlists is necessary. Adjust the size of the
+ * holes either side.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
+		runlist_element *src, int ssize, int loc)
+{
+	BUG_ON(!dst);
+	BUG_ON(!src);
+
+	/* Space required: @dst size + @src size + one new hole. */
+	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
+	if (IS_ERR(dst))
+		return dst;
+	/*
+	 * We are guaranteed to succeed from here so can start modifying the
+	 * original runlists.
+	 */
+
+	/* Move the tail of @dst out of the way, then copy in @src. */
+	ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
+	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
+
+	/* Adjust the size of the holes either size of @src. */
+	dst[loc].length		= dst[loc+1].vcn       - dst[loc].vcn;
+	dst[loc+ssize+1].vcn    = dst[loc+ssize].vcn   + dst[loc+ssize].length;
+	dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
+
+	return dst;
+}
+
+/**
+ * ntfs_runlists_merge - merge two runlists into one
+ * @drl:	original runlist to be worked on
+ * @srl:	new runlist to be merged into @drl
+ *
+ * First we sanity check the two runlists @srl and @drl to make sure that they
+ * are sensible and can be merged. The runlist @srl must be either after the
+ * runlist @drl or completely within a hole (or unmapped region) in @drl.
+ *
+ * It is up to the caller to serialize access to the runlists @drl and @srl.
+ *
+ * Merging of runlists is necessary in two cases:
+ *   1. When attribute lists are used and a further extent is being mapped.
+ *   2. When new clusters are allocated to fill a hole or extend a file.
+ *
+ * There are four possible ways @srl can be merged. It can:
+ *	- be inserted at the beginning of a hole,
+ *	- split the hole in two and be inserted between the two fragments,
+ *	- be appended at the end of a hole, or it can
+ *	- replace the whole hole.
+ * It can also be appended to the end of the runlist, which is just a variant
+ * of the insert case.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @drl and @srl are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ *	-ERANGE	- The runlists overlap and cannot be merged.
+ */
+runlist_element *ntfs_runlists_merge(runlist_element *drl,
+		runlist_element *srl)
+{
+	int di, si;		/* Current index into @[ds]rl. */
+	int sstart;		/* First index with lcn > LCN_RL_NOT_MAPPED. */
+	int dins;		/* Index into @drl at which to insert @srl. */
+	int dend, send;		/* Last index into @[ds]rl. */
+	int dfinal, sfinal;	/* The last index into @[ds]rl with
+				   lcn >= LCN_HOLE. */
+	int marker = 0;
+	VCN marker_vcn = 0;
+
+#ifdef DEBUG
+	ntfs_debug("dst:");
+	ntfs_debug_dump_runlist(drl);
+	ntfs_debug("src:");
+	ntfs_debug_dump_runlist(srl);
+#endif
+
+	/* Check for silly calling... */
+	if (unlikely(!srl))
+		return drl;
+	if (IS_ERR(srl) || IS_ERR(drl))
+		return ERR_PTR(-EINVAL);
+
+	/* Check for the case where the first mapping is being done now. */
+	if (unlikely(!drl)) {
+		drl = srl;
+		/* Complete the source runlist if necessary. */
+		if (unlikely(drl[0].vcn)) {
+			/* Scan to the end of the source runlist. */
+			for (dend = 0; likely(drl[dend].length); dend++)
+				;
+			dend++;
+			drl = ntfs_rl_realloc(drl, dend, dend + 1);
+			if (IS_ERR(drl))
+				return drl;
+			/* Insert start element at the front of the runlist. */
+			ntfs_rl_mm(drl, 1, 0, dend);
+			drl[0].vcn = 0;
+			drl[0].lcn = LCN_RL_NOT_MAPPED;
+			drl[0].length = drl[1].vcn;
+		}
+		goto finished;
+	}
+
+	si = di = 0;
+
+	/* Skip any unmapped start element(s) in the source runlist. */
+	while (srl[si].length && srl[si].lcn < LCN_HOLE)
+		si++;
+
+	/* Can't have an entirely unmapped source runlist. */
+	BUG_ON(!srl[si].length);
+
+	/* Record the starting points. */
+	sstart = si;
+
+	/*
+	 * Skip forward in @drl until we reach the position where @srl needs to
+	 * be inserted. If we reach the end of @drl, @srl just needs to be
+	 * appended to @drl.
+	 */
+	for (; drl[di].length; di++) {
+		if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
+			break;
+	}
+	dins = di;
+
+	/* Sanity check for illegal overlaps. */
+	if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
+			(srl[si].lcn >= 0)) {
+		ntfs_error(NULL, "Run lists overlap. Cannot merge!");
+		return ERR_PTR(-ERANGE);
+	}
+
+	/* Scan to the end of both runlists in order to know their sizes. */
+	for (send = si; srl[send].length; send++)
+		;
+	for (dend = di; drl[dend].length; dend++)
+		;
+
+	if (srl[send].lcn == LCN_ENOENT)
+		marker_vcn = srl[marker = send].vcn;
+
+	/* Scan to the last element with lcn >= LCN_HOLE. */
+	for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
+		;
+	for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
+		;
+
+	{
+	bool start;
+	bool finish;
+	int ds = dend + 1;		/* Number of elements in drl & srl */
+	int ss = sfinal - sstart + 1;
+
+	start  = ((drl[dins].lcn <  LCN_RL_NOT_MAPPED) ||    /* End of file   */
+		  (drl[dins].vcn == srl[sstart].vcn));	     /* Start of hole */
+	finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) &&    /* End of file   */
+		 ((drl[dins].vcn + drl[dins].length) <=      /* End of hole   */
+		  (srl[send - 1].vcn + srl[send - 1].length)));
+
+	/* Or we will lose an end marker. */
+	if (finish && !drl[dins].length)
+		ss++;
+	if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
+		finish = false;
+#if 0
+	ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
+	ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
+	ntfs_debug("start = %i, finish = %i", start, finish);
+	ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
+#endif
+	if (start) {
+		if (finish)
+			drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
+		else
+			drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
+	} else {
+		if (finish)
+			drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
+		else
+			drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
+	}
+	if (IS_ERR(drl)) {
+		ntfs_error(NULL, "Merge failed.");
+		return drl;
+	}
+	ntfs_free(srl);
+	if (marker) {
+		ntfs_debug("Triggering marker code.");
+		for (ds = dend; drl[ds].length; ds++)
+			;
+		/* We only need to care if @srl ended after @drl. */
+		if (drl[ds].vcn <= marker_vcn) {
+			int slots = 0;
+
+			if (drl[ds].vcn == marker_vcn) {
+				ntfs_debug("Old marker = 0x%llx, replacing "
+						"with LCN_ENOENT.",
+						(unsigned long long)
+						drl[ds].lcn);
+				drl[ds].lcn = LCN_ENOENT;
+				goto finished;
+			}
+			/*
+			 * We need to create an unmapped runlist element in
+			 * @drl or extend an existing one before adding the
+			 * ENOENT terminator.
+			 */
+			if (drl[ds].lcn == LCN_ENOENT) {
+				ds--;
+				slots = 1;
+			}
+			if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
+				/* Add an unmapped runlist element. */
+				if (!slots) {
+					drl = ntfs_rl_realloc_nofail(drl, ds,
+							ds + 2);
+					slots = 2;
+				}
+				ds++;
+				/* Need to set vcn if it isn't set already. */
+				if (slots != 1)
+					drl[ds].vcn = drl[ds - 1].vcn +
+							drl[ds - 1].length;
+				drl[ds].lcn = LCN_RL_NOT_MAPPED;
+				/* We now used up a slot. */
+				slots--;
+			}
+			drl[ds].length = marker_vcn - drl[ds].vcn;
+			/* Finally add the ENOENT terminator. */
+			ds++;
+			if (!slots)
+				drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
+			drl[ds].vcn = marker_vcn;
+			drl[ds].lcn = LCN_ENOENT;
+			drl[ds].length = (s64)0;
+		}
+	}
+	}
+
+finished:
+	/* The merge was completed successfully. */
+	ntfs_debug("Merged runlist:");
+	ntfs_debug_dump_runlist(drl);
+	return drl;
+}
+
+/**
+ * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
+ * @vol:	ntfs volume on which the attribute resides
+ * @attr:	attribute record whose mapping pairs array to decompress
+ * @old_rl:	optional runlist in which to insert @attr's runlist
+ *
+ * It is up to the caller to serialize access to the runlist @old_rl.
+ *
+ * Decompress the attribute @attr's mapping pairs array into a runlist. On
+ * success, return the decompressed runlist.
+ *
+ * If @old_rl is not NULL, decompressed runlist is inserted into the
+ * appropriate place in @old_rl and the resultant, combined runlist is
+ * returned. The original @old_rl is deallocated.
+ *
+ * On error, return -errno. @old_rl is left unmodified in that case.
+ *
+ * The following error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EIO	- Corrupt runlist.
+ *	-EINVAL	- Invalid parameters were passed in.
+ *	-ERANGE	- The two runlists overlap.
+ *
+ * FIXME: For now we take the conceptionally simplest approach of creating the
+ * new runlist disregarding the already existing one and then splicing the
+ * two into one, if that is possible (we check for overlap and discard the new
+ * runlist if overlap present before returning ERR_PTR(-ERANGE)).
+ */
+runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
+		const ATTR_RECORD *attr, runlist_element *old_rl)
+{
+	VCN vcn;		/* Current vcn. */
+	LCN lcn;		/* Current lcn. */
+	s64 deltaxcn;		/* Change in [vl]cn. */
+	runlist_element *rl;	/* The output runlist. */
+	u8 *buf;		/* Current position in mapping pairs array. */
+	u8 *attr_end;		/* End of attribute. */
+	int rlsize;		/* Size of runlist buffer. */
+	u16 rlpos;		/* Current runlist position in units of
+				   runlist_elements. */
+	u8 b;			/* Current byte offset in buf. */
+
+#ifdef DEBUG
+	/* Make sure attr exists and is non-resident. */
+	if (!attr || !attr->non_resident || sle64_to_cpu(
+			attr->data.non_resident.lowest_vcn) < (VCN)0) {
+		ntfs_error(vol->sb, "Invalid arguments.");
+		return ERR_PTR(-EINVAL);
+	}
+#endif
+	/* Start at vcn = lowest_vcn and lcn 0. */
+	vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
+	lcn = 0;
+	/* Get start of the mapping pairs array. */
+	buf = (u8*)attr + le16_to_cpu(
+			attr->data.non_resident.mapping_pairs_offset);
+	attr_end = (u8*)attr + le32_to_cpu(attr->length);
+	if (unlikely(buf < (u8*)attr || buf > attr_end)) {
+		ntfs_error(vol->sb, "Corrupt attribute.");
+		return ERR_PTR(-EIO);
+	}
+	/* If the mapping pairs array is valid but empty, nothing to do. */
+	if (!vcn && !*buf)
+		return old_rl;
+	/* Current position in runlist array. */
+	rlpos = 0;
+	/* Allocate first page and set current runlist size to one page. */
+	rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
+	if (unlikely(!rl))
+		return ERR_PTR(-ENOMEM);
+	/* Insert unmapped starting element if necessary. */
+	if (vcn) {
+		rl->vcn = 0;
+		rl->lcn = LCN_RL_NOT_MAPPED;
+		rl->length = vcn;
+		rlpos++;
+	}
+	while (buf < attr_end && *buf) {
+		/*
+		 * Allocate more memory if needed, including space for the
+		 * not-mapped and terminator elements. ntfs_malloc_nofs()
+		 * operates on whole pages only.
+		 */
+		if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
+			runlist_element *rl2;
+
+			rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
+			if (unlikely(!rl2)) {
+				ntfs_free(rl);
+				return ERR_PTR(-ENOMEM);
+			}
+			memcpy(rl2, rl, rlsize);
+			ntfs_free(rl);
+			rl = rl2;
+			rlsize += PAGE_SIZE;
+		}
+		/* Enter the current vcn into the current runlist element. */
+		rl[rlpos].vcn = vcn;
+		/*
+		 * Get the change in vcn, i.e. the run length in clusters.
+		 * Doing it this way ensures that we signextend negative values.
+		 * A negative run length doesn't make any sense, but hey, I
+		 * didn't make up the NTFS specs and Windows NT4 treats the run
+		 * length as a signed value so that's how it is...
+		 */
+		b = *buf & 0xf;
+		if (b) {
+			if (unlikely(buf + b > attr_end))
+				goto io_error;
+			for (deltaxcn = (s8)buf[b--]; b; b--)
+				deltaxcn = (deltaxcn << 8) + buf[b];
+		} else { /* The length entry is compulsory. */
+			ntfs_error(vol->sb, "Missing length entry in mapping "
+					"pairs array.");
+			deltaxcn = (s64)-1;
+		}
+		/*
+		 * Assume a negative length to indicate data corruption and
+		 * hence clean-up and return NULL.
+		 */
+		if (unlikely(deltaxcn < 0)) {
+			ntfs_error(vol->sb, "Invalid length in mapping pairs "
+					"array.");
+			goto err_out;
+		}
+		/*
+		 * Enter the current run length into the current runlist
+		 * element.
+		 */
+		rl[rlpos].length = deltaxcn;
+		/* Increment the current vcn by the current run length. */
+		vcn += deltaxcn;
+		/*
+		 * There might be no lcn change at all, as is the case for
+		 * sparse clusters on NTFS 3.0+, in which case we set the lcn
+		 * to LCN_HOLE.
+		 */
+		if (!(*buf & 0xf0))
+			rl[rlpos].lcn = LCN_HOLE;
+		else {
+			/* Get the lcn change which really can be negative. */
+			u8 b2 = *buf & 0xf;
+			b = b2 + ((*buf >> 4) & 0xf);
+			if (buf + b > attr_end)
+				goto io_error;
+			for (deltaxcn = (s8)buf[b--]; b > b2; b--)
+				deltaxcn = (deltaxcn << 8) + buf[b];
+			/* Change the current lcn to its new value. */
+			lcn += deltaxcn;
+#ifdef DEBUG
+			/*
+			 * On NTFS 1.2-, apparently can have lcn == -1 to
+			 * indicate a hole. But we haven't verified ourselves
+			 * whether it is really the lcn or the deltaxcn that is
+			 * -1. So if either is found give us a message so we
+			 * can investigate it further!
+			 */
+			if (vol->major_ver < 3) {
+				if (unlikely(deltaxcn == (LCN)-1))
+					ntfs_error(vol->sb, "lcn delta == -1");
+				if (unlikely(lcn == (LCN)-1))
+					ntfs_error(vol->sb, "lcn == -1");
+			}
+#endif
+			/* Check lcn is not below -1. */
+			if (unlikely(lcn < (LCN)-1)) {
+				ntfs_error(vol->sb, "Invalid LCN < -1 in "
+						"mapping pairs array.");
+				goto err_out;
+			}
+			/* Enter the current lcn into the runlist element. */
+			rl[rlpos].lcn = lcn;
+		}
+		/* Get to the next runlist element. */
+		rlpos++;
+		/* Increment the buffer position to the next mapping pair. */
+		buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
+	}
+	if (unlikely(buf >= attr_end))
+		goto io_error;
+	/*
+	 * If there is a highest_vcn specified, it must be equal to the final
+	 * vcn in the runlist - 1, or something has gone badly wrong.
+	 */
+	deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
+	if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
+mpa_err:
+		ntfs_error(vol->sb, "Corrupt mapping pairs array in "
+				"non-resident attribute.");
+		goto err_out;
+	}
+	/* Setup not mapped runlist element if this is the base extent. */
+	if (!attr->data.non_resident.lowest_vcn) {
+		VCN max_cluster;
+
+		max_cluster = ((sle64_to_cpu(
+				attr->data.non_resident.allocated_size) +
+				vol->cluster_size - 1) >>
+				vol->cluster_size_bits) - 1;
+		/*
+		 * A highest_vcn of zero means this is a single extent
+		 * attribute so simply terminate the runlist with LCN_ENOENT).
+		 */
+		if (deltaxcn) {
+			/*
+			 * If there is a difference between the highest_vcn and
+			 * the highest cluster, the runlist is either corrupt
+			 * or, more likely, there are more extents following
+			 * this one.
+			 */
+			if (deltaxcn < max_cluster) {
+				ntfs_debug("More extents to follow; deltaxcn "
+						"= 0x%llx, max_cluster = "
+						"0x%llx",
+						(unsigned long long)deltaxcn,
+						(unsigned long long)
+						max_cluster);
+				rl[rlpos].vcn = vcn;
+				vcn += rl[rlpos].length = max_cluster -
+						deltaxcn;
+				rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+				rlpos++;
+			} else if (unlikely(deltaxcn > max_cluster)) {
+				ntfs_error(vol->sb, "Corrupt attribute.  "
+						"deltaxcn = 0x%llx, "
+						"max_cluster = 0x%llx",
+						(unsigned long long)deltaxcn,
+						(unsigned long long)
+						max_cluster);
+				goto mpa_err;
+			}
+		}
+		rl[rlpos].lcn = LCN_ENOENT;
+	} else /* Not the base extent. There may be more extents to follow. */
+		rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+
+	/* Setup terminating runlist element. */
+	rl[rlpos].vcn = vcn;
+	rl[rlpos].length = (s64)0;
+	/* If no existing runlist was specified, we are done. */
+	if (!old_rl) {
+		ntfs_debug("Mapping pairs array successfully decompressed:");
+		ntfs_debug_dump_runlist(rl);
+		return rl;
+	}
+	/* Now combine the new and old runlists checking for overlaps. */
+	old_rl = ntfs_runlists_merge(old_rl, rl);
+	if (likely(!IS_ERR(old_rl)))
+		return old_rl;
+	ntfs_free(rl);
+	ntfs_error(vol->sb, "Failed to merge runlists.");
+	return old_rl;
+io_error:
+	ntfs_error(vol->sb, "Corrupt attribute.");
+err_out:
+	ntfs_free(rl);
+	return ERR_PTR(-EIO);
+}
+
+/**
+ * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
+ * @rl:		runlist to use for conversion
+ * @vcn:	vcn to convert
+ *
+ * Convert the virtual cluster number @vcn of an attribute into a logical
+ * cluster number (lcn) of a device using the runlist @rl to map vcns to their
+ * corresponding lcns.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * Since lcns must be >= 0, we use negative return codes with special meaning:
+ *
+ * Return code		Meaning / Description
+ * ==================================================
+ *  LCN_HOLE		Hole / not allocated on disk.
+ *  LCN_RL_NOT_MAPPED	This is part of the runlist which has not been
+ *			inserted into the runlist yet.
+ *  LCN_ENOENT		There is no such vcn in the attribute.
+ *
+ * Locking: - The caller must have locked the runlist (for reading or writing).
+ *	    - This function does not touch the lock, nor does it modify the
+ *	      runlist.
+ */
+LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
+{
+	int i;
+
+	BUG_ON(vcn < 0);
+	/*
+	 * If rl is NULL, assume that we have found an unmapped runlist. The
+	 * caller can then attempt to map it and fail appropriately if
+	 * necessary.
+	 */
+	if (unlikely(!rl))
+		return LCN_RL_NOT_MAPPED;
+
+	/* Catch out of lower bounds vcn. */
+	if (unlikely(vcn < rl[0].vcn))
+		return LCN_ENOENT;
+
+	for (i = 0; likely(rl[i].length); i++) {
+		if (unlikely(vcn < rl[i+1].vcn)) {
+			if (likely(rl[i].lcn >= (LCN)0))
+				return rl[i].lcn + (vcn - rl[i].vcn);
+			return rl[i].lcn;
+		}
+	}
+	/*
+	 * The terminator element is setup to the correct value, i.e. one of
+	 * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
+	 */
+	if (likely(rl[i].lcn < (LCN)0))
+		return rl[i].lcn;
+	/* Just in case... We could replace this with BUG() some day. */
+	return LCN_ENOENT;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_rl_find_vcn_nolock - find a vcn in a runlist
+ * @rl:		runlist to search
+ * @vcn:	vcn to find
+ *
+ * Find the virtual cluster number @vcn in the runlist @rl and return the
+ * address of the runlist element containing the @vcn on success.
+ *
+ * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of
+ * the runlist.
+ *
+ * Locking: The runlist must be locked on entry.
+ */
+runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn)
+{
+	BUG_ON(vcn < 0);
+	if (unlikely(!rl || vcn < rl[0].vcn))
+		return NULL;
+	while (likely(rl->length)) {
+		if (unlikely(vcn < rl[1].vcn)) {
+			if (likely(rl->lcn >= LCN_HOLE))
+				return rl;
+			return NULL;
+		}
+		rl++;
+	}
+	if (likely(rl->lcn == LCN_ENOENT))
+		return rl;
+	return NULL;
+}
+
+/**
+ * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
+ * @n:		number for which to get the number of bytes for
+ *
+ * Return the number of bytes required to store @n unambiguously as
+ * a signed number.
+ *
+ * This is used in the context of the mapping pairs array to determine how
+ * many bytes will be needed in the array to store a given logical cluster
+ * number (lcn) or a specific run length.
+ *
+ * Return the number of bytes written.  This function cannot fail.
+ */
+static inline int ntfs_get_nr_significant_bytes(const s64 n)
+{
+	s64 l = n;
+	int i;
+	s8 j;
+
+	i = 0;
+	do {
+		l >>= 8;
+		i++;
+	} while (l != 0 && l != -1);
+	j = (n >> 8 * (i - 1)) & 0xff;
+	/* If the sign bit is wrong, we need an extra byte. */
+	if ((n < 0 && j >= 0) || (n > 0 && j < 0))
+		i++;
+	return i;
+}
+
+/**
+ * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
+ * @vol:	ntfs volume (needed for the ntfs version)
+ * @rl:		locked runlist to determine the size of the mapping pairs of
+ * @first_vcn:	first vcn which to include in the mapping pairs array
+ * @last_vcn:	last vcn which to include in the mapping pairs array
+ *
+ * Walk the locked runlist @rl and calculate the size in bytes of the mapping
+ * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
+ * finishing with vcn @last_vcn.
+ *
+ * A @last_vcn of -1 means end of runlist and in that case the size of the
+ * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
+ * and finishing at the end of the runlist is determined.
+ *
+ * This for example allows us to allocate a buffer of the right size when
+ * building the mapping pairs array.
+ *
+ * If @rl is NULL, just return 1 (for the single terminator byte).
+ *
+ * Return the calculated size in bytes on success.  On error, return -errno.
+ * The following error codes are defined:
+ *	-EINVAL	- Run list contains unmapped elements.  Make sure to only pass
+ *		  fully mapped runlists to this function.
+ *	-EIO	- The runlist is corrupt.
+ *
+ * Locking: @rl must be locked on entry (either for reading or writing), it
+ *	    remains locked throughout, and is left locked upon return.
+ */
+int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
+		const runlist_element *rl, const VCN first_vcn,
+		const VCN last_vcn)
+{
+	LCN prev_lcn;
+	int rls;
+	bool the_end = false;
+
+	BUG_ON(first_vcn < 0);
+	BUG_ON(last_vcn < -1);
+	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
+	if (!rl) {
+		BUG_ON(first_vcn);
+		BUG_ON(last_vcn > 0);
+		return 1;
+	}
+	/* Skip to runlist element containing @first_vcn. */
+	while (rl->length && first_vcn >= rl[1].vcn)
+		rl++;
+	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
+			first_vcn < rl->vcn))
+		return -EINVAL;
+	prev_lcn = 0;
+	/* Always need the termining zero byte. */
+	rls = 1;
+	/* Do the first partial run if present. */
+	if (first_vcn > rl->vcn) {
+		s64 delta, length = rl->length;
+
+		/* We know rl->length != 0 already. */
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
+			goto err_out;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = true;
+		}
+		delta = first_vcn - rl->vcn;
+		/* Header byte + length. */
+		rls += 1 + ntfs_get_nr_significant_bytes(length - delta);
+		/*
+		 * If the logical cluster number (lcn) denotes a hole and we
+		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
+		 * zero space.  On earlier NTFS versions we just store the lcn.
+		 * Note: this assumes that on NTFS 1.2-, holes are stored with
+		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
+		 */
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
+			prev_lcn = rl->lcn;
+			if (likely(rl->lcn >= 0))
+				prev_lcn += delta;
+			/* Change in lcn. */
+			rls += ntfs_get_nr_significant_bytes(prev_lcn);
+		}
+		/* Go to next runlist element. */
+		rl++;
+	}
+	/* Do the full runs. */
+	for (; rl->length && !the_end; rl++) {
+		s64 length = rl->length;
+
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
+			goto err_out;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = true;
+		}
+		/* Header byte + length. */
+		rls += 1 + ntfs_get_nr_significant_bytes(length);
+		/*
+		 * If the logical cluster number (lcn) denotes a hole and we
+		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
+		 * zero space.  On earlier NTFS versions we just store the lcn.
+		 * Note: this assumes that on NTFS 1.2-, holes are stored with
+		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
+		 */
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
+			/* Change in lcn. */
+			rls += ntfs_get_nr_significant_bytes(rl->lcn -
+					prev_lcn);
+			prev_lcn = rl->lcn;
+		}
+	}
+	return rls;
+err_out:
+	if (rl->lcn == LCN_RL_NOT_MAPPED)
+		rls = -EINVAL;
+	else
+		rls = -EIO;
+	return rls;
+}
+
+/**
+ * ntfs_write_significant_bytes - write the significant bytes of a number
+ * @dst:	destination buffer to write to
+ * @dst_max:	pointer to last byte of destination buffer for bounds checking
+ * @n:		number whose significant bytes to write
+ *
+ * Store in @dst, the minimum bytes of the number @n which are required to
+ * identify @n unambiguously as a signed number, taking care not to exceed
+ * @dest_max, the maximum position within @dst to which we are allowed to
+ * write.
+ *
+ * This is used when building the mapping pairs array of a runlist to compress
+ * a given logical cluster number (lcn) or a specific run length to the minimum
+ * size possible.
+ *
+ * Return the number of bytes written on success.  On error, i.e. the
+ * destination buffer @dst is too small, return -ENOSPC.
+ */
+static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
+		const s64 n)
+{
+	s64 l = n;
+	int i;
+	s8 j;
+
+	i = 0;
+	do {
+		if (unlikely(dst > dst_max))
+			goto err_out;
+		*dst++ = l & 0xffll;
+		l >>= 8;
+		i++;
+	} while (l != 0 && l != -1);
+	j = (n >> 8 * (i - 1)) & 0xff;
+	/* If the sign bit is wrong, we need an extra byte. */
+	if (n < 0 && j >= 0) {
+		if (unlikely(dst > dst_max))
+			goto err_out;
+		i++;
+		*dst = (s8)-1;
+	} else if (n > 0 && j < 0) {
+		if (unlikely(dst > dst_max))
+			goto err_out;
+		i++;
+		*dst = (s8)0;
+	}
+	return i;
+err_out:
+	return -ENOSPC;
+}
+
+/**
+ * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
+ * @vol:	ntfs volume (needed for the ntfs version)
+ * @dst:	destination buffer to which to write the mapping pairs array
+ * @dst_len:	size of destination buffer @dst in bytes
+ * @rl:		locked runlist for which to build the mapping pairs array
+ * @first_vcn:	first vcn which to include in the mapping pairs array
+ * @last_vcn:	last vcn which to include in the mapping pairs array
+ * @stop_vcn:	first vcn outside destination buffer on success or -ENOSPC
+ *
+ * Create the mapping pairs array from the locked runlist @rl, starting at vcn
+ * @first_vcn and finishing with vcn @last_vcn and save the array in @dst.
+ * @dst_len is the size of @dst in bytes and it should be at least equal to the
+ * value obtained by calling ntfs_get_size_for_mapping_pairs().
+ *
+ * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
+ * array corresponding to the runlist starting at vcn @first_vcn and finishing
+ * at the end of the runlist is created.
+ *
+ * If @rl is NULL, just write a single terminator byte to @dst.
+ *
+ * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
+ * the first vcn outside the destination buffer.  Note that on error, @dst has
+ * been filled with all the mapping pairs that will fit, thus it can be treated
+ * as partial success, in that a new attribute extent needs to be created or
+ * the next extent has to be used and the mapping pairs build has to be
+ * continued with @first_vcn set to *@stop_vcn.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *	-EINVAL	- Run list contains unmapped elements.  Make sure to only pass
+ *		  fully mapped runlists to this function.
+ *	-EIO	- The runlist is corrupt.
+ *	-ENOSPC	- The destination buffer is too small.
+ *
+ * Locking: @rl must be locked on entry (either for reading or writing), it
+ *	    remains locked throughout, and is left locked upon return.
+ */
+int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
+		const int dst_len, const runlist_element *rl,
+		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn)
+{
+	LCN prev_lcn;
+	s8 *dst_max, *dst_next;
+	int err = -ENOSPC;
+	bool the_end = false;
+	s8 len_len, lcn_len;
+
+	BUG_ON(first_vcn < 0);
+	BUG_ON(last_vcn < -1);
+	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
+	BUG_ON(dst_len < 1);
+	if (!rl) {
+		BUG_ON(first_vcn);
+		BUG_ON(last_vcn > 0);
+		if (stop_vcn)
+			*stop_vcn = 0;
+		/* Terminator byte. */
+		*dst = 0;
+		return 0;
+	}
+	/* Skip to runlist element containing @first_vcn. */
+	while (rl->length && first_vcn >= rl[1].vcn)
+		rl++;
+	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
+			first_vcn < rl->vcn))
+		return -EINVAL;
+	/*
+	 * @dst_max is used for bounds checking in
+	 * ntfs_write_significant_bytes().
+	 */
+	dst_max = dst + dst_len - 1;
+	prev_lcn = 0;
+	/* Do the first partial run if present. */
+	if (first_vcn > rl->vcn) {
+		s64 delta, length = rl->length;
+
+		/* We know rl->length != 0 already. */
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
+			goto err_out;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = true;
+		}
+		delta = first_vcn - rl->vcn;
+		/* Write length. */
+		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
+				length - delta);
+		if (unlikely(len_len < 0))
+			goto size_err;
+		/*
+		 * If the logical cluster number (lcn) denotes a hole and we
+		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
+		 * zero space.  On earlier NTFS versions we just write the lcn
+		 * change.  FIXME: Do we need to write the lcn change or just
+		 * the lcn in that case?  Not sure as I have never seen this
+		 * case on NT4. - We assume that we just need to write the lcn
+		 * change until someone tells us otherwise... (AIA)
+		 */
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
+			prev_lcn = rl->lcn;
+			if (likely(rl->lcn >= 0))
+				prev_lcn += delta;
+			/* Write change in lcn. */
+			lcn_len = ntfs_write_significant_bytes(dst + 1 +
+					len_len, dst_max, prev_lcn);
+			if (unlikely(lcn_len < 0))
+				goto size_err;
+		} else
+			lcn_len = 0;
+		dst_next = dst + len_len + lcn_len + 1;
+		if (unlikely(dst_next > dst_max))
+			goto size_err;
+		/* Update header byte. */
+		*dst = lcn_len << 4 | len_len;
+		/* Position at next mapping pairs array element. */
+		dst = dst_next;
+		/* Go to next runlist element. */
+		rl++;
+	}
+	/* Do the full runs. */
+	for (; rl->length && !the_end; rl++) {
+		s64 length = rl->length;
+
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
+			goto err_out;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = true;
+		}
+		/* Write length. */
+		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
+				length);
+		if (unlikely(len_len < 0))
+			goto size_err;
+		/*
+		 * If the logical cluster number (lcn) denotes a hole and we
+		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
+		 * zero space.  On earlier NTFS versions we just write the lcn
+		 * change.  FIXME: Do we need to write the lcn change or just
+		 * the lcn in that case?  Not sure as I have never seen this
+		 * case on NT4. - We assume that we just need to write the lcn
+		 * change until someone tells us otherwise... (AIA)
+		 */
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
+			/* Write change in lcn. */
+			lcn_len = ntfs_write_significant_bytes(dst + 1 +
+					len_len, dst_max, rl->lcn - prev_lcn);
+			if (unlikely(lcn_len < 0))
+				goto size_err;
+			prev_lcn = rl->lcn;
+		} else
+			lcn_len = 0;
+		dst_next = dst + len_len + lcn_len + 1;
+		if (unlikely(dst_next > dst_max))
+			goto size_err;
+		/* Update header byte. */
+		*dst = lcn_len << 4 | len_len;
+		/* Position at next mapping pairs array element. */
+		dst = dst_next;
+	}
+	/* Success. */
+	err = 0;
+size_err:
+	/* Set stop vcn. */
+	if (stop_vcn)
+		*stop_vcn = rl->vcn;
+	/* Add terminator byte. */
+	*dst = 0;
+	return err;
+err_out:
+	if (rl->lcn == LCN_RL_NOT_MAPPED)
+		err = -EINVAL;
+	else
+		err = -EIO;
+	return err;
+}
+
+/**
+ * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
+ * @vol:	ntfs volume (needed for error output)
+ * @runlist:	runlist to truncate
+ * @new_length:	the new length of the runlist in VCNs
+ *
+ * Truncate the runlist described by @runlist as well as the memory buffer
+ * holding the runlist elements to a length of @new_length VCNs.
+ *
+ * If @new_length lies within the runlist, the runlist elements with VCNs of
+ * @new_length and above are discarded.  As a special case if @new_length is
+ * zero, the runlist is discarded and set to NULL.
+ *
+ * If @new_length lies beyond the runlist, a sparse runlist element is added to
+ * the end of the runlist @runlist or if the last runlist element is a sparse
+ * one already, this is extended.
+ *
+ * Note, no checking is done for unmapped runlist elements.  It is assumed that
+ * the caller has mapped any elements that need to be mapped already.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const s64 new_length)
+{
+	runlist_element *rl;
+	int old_size;
+
+	ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
+	BUG_ON(!runlist);
+	BUG_ON(new_length < 0);
+	rl = runlist->rl;
+	if (!new_length) {
+		ntfs_debug("Freeing runlist.");
+		runlist->rl = NULL;
+		if (rl)
+			ntfs_free(rl);
+		return 0;
+	}
+	if (unlikely(!rl)) {
+		/*
+		 * Create a runlist consisting of a sparse runlist element of
+		 * length @new_length followed by a terminator runlist element.
+		 */
+		rl = ntfs_malloc_nofs(PAGE_SIZE);
+		if (unlikely(!rl)) {
+			ntfs_error(vol->sb, "Not enough memory to allocate "
+					"runlist element buffer.");
+			return -ENOMEM;
+		}
+		runlist->rl = rl;
+		rl[1].length = rl->vcn = 0;
+		rl->lcn = LCN_HOLE;
+		rl[1].vcn = rl->length = new_length;
+		rl[1].lcn = LCN_ENOENT;
+		return 0;
+	}
+	BUG_ON(new_length < rl->vcn);
+	/* Find @new_length in the runlist. */
+	while (likely(rl->length && new_length >= rl[1].vcn))
+		rl++;
+	/*
+	 * If not at the end of the runlist we need to shrink it.
+	 * If at the end of the runlist we need to expand it.
+	 */
+	if (rl->length) {
+		runlist_element *trl;
+		bool is_end;
+
+		ntfs_debug("Shrinking runlist.");
+		/* Determine the runlist size. */
+		trl = rl + 1;
+		while (likely(trl->length))
+			trl++;
+		old_size = trl - runlist->rl + 1;
+		/* Truncate the run. */
+		rl->length = new_length - rl->vcn;
+		/*
+		 * If a run was partially truncated, make the following runlist
+		 * element a terminator.
+		 */
+		is_end = false;
+		if (rl->length) {
+			rl++;
+			if (!rl->length)
+				is_end = true;
+			rl->vcn = new_length;
+			rl->length = 0;
+		}
+		rl->lcn = LCN_ENOENT;
+		/* Reallocate memory if necessary. */
+		if (!is_end) {
+			int new_size = rl - runlist->rl + 1;
+			rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
+			if (IS_ERR(rl))
+				ntfs_warning(vol->sb, "Failed to shrink "
+						"runlist buffer.  This just "
+						"wastes a bit of memory "
+						"temporarily so we ignore it "
+						"and return success.");
+			else
+				runlist->rl = rl;
+		}
+	} else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
+		ntfs_debug("Expanding runlist.");
+		/*
+		 * If there is a previous runlist element and it is a sparse
+		 * one, extend it.  Otherwise need to add a new, sparse runlist
+		 * element.
+		 */
+		if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
+			(rl - 1)->length = new_length - (rl - 1)->vcn;
+		else {
+			/* Determine the runlist size. */
+			old_size = rl - runlist->rl + 1;
+			/* Reallocate memory if necessary. */
+			rl = ntfs_rl_realloc(runlist->rl, old_size,
+					old_size + 1);
+			if (IS_ERR(rl)) {
+				ntfs_error(vol->sb, "Failed to expand runlist "
+						"buffer, aborting.");
+				return PTR_ERR(rl);
+			}
+			runlist->rl = rl;
+			/*
+			 * Set @rl to the same runlist element in the new
+			 * runlist as before in the old runlist.
+			 */
+			rl += old_size - 1;
+			/* Add a new, sparse runlist element. */
+			rl->lcn = LCN_HOLE;
+			rl->length = new_length - rl->vcn;
+			/* Add a new terminator runlist element. */
+			rl++;
+			rl->length = 0;
+		}
+		rl->vcn = new_length;
+		rl->lcn = LCN_ENOENT;
+	} else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
+		/* Runlist already has same size as requested. */
+		rl->lcn = LCN_ENOENT;
+	}
+	ntfs_debug("Done.");
+	return 0;
+}
+
+/**
+ * ntfs_rl_punch_nolock - punch a hole into a runlist
+ * @vol:	ntfs volume (needed for error output)
+ * @runlist:	runlist to punch a hole into
+ * @start:	starting VCN of the hole to be created
+ * @length:	size of the hole to be created in units of clusters
+ *
+ * Punch a hole into the runlist @runlist starting at VCN @start and of size
+ * @length clusters.
+ *
+ * Return 0 on success and -errno on error, in which case @runlist has not been
+ * modified.
+ *
+ * If @start and/or @start + @length are outside the runlist return error code
+ * -ENOENT.
+ *
+ * If the runlist contains unmapped or error elements between @start and @start
+ * + @length return error code -EINVAL.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const VCN start, const s64 length)
+{
+	const VCN end = start + length;
+	s64 delta;
+	runlist_element *rl, *rl_end, *rl_real_end, *trl;
+	int old_size;
+	bool lcn_fixup = false;
+
+	ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
+			(long long)start, (long long)length);
+	BUG_ON(!runlist);
+	BUG_ON(start < 0);
+	BUG_ON(length < 0);
+	BUG_ON(end < 0);
+	rl = runlist->rl;
+	if (unlikely(!rl)) {
+		if (likely(!start && !length))
+			return 0;
+		return -ENOENT;
+	}
+	/* Find @start in the runlist. */
+	while (likely(rl->length && start >= rl[1].vcn))
+		rl++;
+	rl_end = rl;
+	/* Find @end in the runlist. */
+	while (likely(rl_end->length && end >= rl_end[1].vcn)) {
+		/* Verify there are no unmapped or error elements. */
+		if (unlikely(rl_end->lcn < LCN_HOLE))
+			return -EINVAL;
+		rl_end++;
+	}
+	/* Check the last element. */
+	if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
+		return -EINVAL;
+	/* This covers @start being out of bounds, too. */
+	if (!rl_end->length && end > rl_end->vcn)
+		return -ENOENT;
+	if (!length)
+		return 0;
+	if (!rl->length)
+		return -ENOENT;
+	rl_real_end = rl_end;
+	/* Determine the runlist size. */
+	while (likely(rl_real_end->length))
+		rl_real_end++;
+	old_size = rl_real_end - runlist->rl + 1;
+	/* If @start is in a hole simply extend the hole. */
+	if (rl->lcn == LCN_HOLE) {
+		/*
+		 * If both @start and @end are in the same sparse run, we are
+		 * done.
+		 */
+		if (end <= rl[1].vcn) {
+			ntfs_debug("Done (requested hole is already sparse).");
+			return 0;
+		}
+extend_hole:
+		/* Extend the hole. */
+		rl->length = end - rl->vcn;
+		/* If @end is in a hole, merge it with the current one. */
+		if (rl_end->lcn == LCN_HOLE) {
+			rl_end++;
+			rl->length = rl_end->vcn - rl->vcn;
+		}
+		/* We have done the hole.  Now deal with the remaining tail. */
+		rl++;
+		/* Cut out all runlist elements up to @end. */
+		if (rl < rl_end)
+			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+					sizeof(*rl));
+		/* Adjust the beginning of the tail if necessary. */
+		if (end > rl->vcn) {
+			delta = end - rl->vcn;
+			rl->vcn = end;
+			rl->length -= delta;
+			/* Only adjust the lcn if it is real. */
+			if (rl->lcn >= 0)
+				rl->lcn += delta;
+		}
+shrink_allocation:
+		/* Reallocate memory if the allocation changed. */
+		if (rl < rl_end) {
+			rl = ntfs_rl_realloc(runlist->rl, old_size,
+					old_size - (rl_end - rl));
+			if (IS_ERR(rl))
+				ntfs_warning(vol->sb, "Failed to shrink "
+						"runlist buffer.  This just "
+						"wastes a bit of memory "
+						"temporarily so we ignore it "
+						"and return success.");
+			else
+				runlist->rl = rl;
+		}
+		ntfs_debug("Done (extend hole).");
+		return 0;
+	}
+	/*
+	 * If @start is at the beginning of a run things are easier as there is
+	 * no need to split the first run.
+	 */
+	if (start == rl->vcn) {
+		/*
+		 * @start is at the beginning of a run.
+		 *
+		 * If the previous run is sparse, extend its hole.
+		 *
+		 * If @end is not in the same run, switch the run to be sparse
+		 * and extend the newly created hole.
+		 *
+		 * Thus both of these cases reduce the problem to the above
+		 * case of "@start is in a hole".
+		 */
+		if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
+			rl--;
+			goto extend_hole;
+		}
+		if (end >= rl[1].vcn) {
+			rl->lcn = LCN_HOLE;
+			goto extend_hole;
+		}
+		/*
+		 * The final case is when @end is in the same run as @start.
+		 * For this need to split the run into two.  One run for the
+		 * sparse region between the beginning of the old run, i.e.
+		 * @start, and @end and one for the remaining non-sparse
+		 * region, i.e. between @end and the end of the old run.
+		 */
+		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+		if (IS_ERR(trl))
+			goto enomem_out;
+		old_size++;
+		if (runlist->rl != trl) {
+			rl = trl + (rl - runlist->rl);
+			rl_end = trl + (rl_end - runlist->rl);
+			rl_real_end = trl + (rl_real_end - runlist->rl);
+			runlist->rl = trl;
+		}
+split_end:
+		/* Shift all the runs up by one. */
+		memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+		/* Finally, setup the two split runs. */
+		rl->lcn = LCN_HOLE;
+		rl->length = length;
+		rl++;
+		rl->vcn += length;
+		/* Only adjust the lcn if it is real. */
+		if (rl->lcn >= 0 || lcn_fixup)
+			rl->lcn += length;
+		rl->length -= length;
+		ntfs_debug("Done (split one).");
+		return 0;
+	}
+	/*
+	 * @start is neither in a hole nor at the beginning of a run.
+	 *
+	 * If @end is in a hole, things are easier as simply truncating the run
+	 * @start is in to end at @start - 1, deleting all runs after that up
+	 * to @end, and finally extending the beginning of the run @end is in
+	 * to be @start is all that is needed.
+	 */
+	if (rl_end->lcn == LCN_HOLE) {
+		/* Truncate the run containing @start. */
+		rl->length = start - rl->vcn;
+		rl++;
+		/* Cut out all runlist elements up to @end. */
+		if (rl < rl_end)
+			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+					sizeof(*rl));
+		/* Extend the beginning of the run @end is in to be @start. */
+		rl->vcn = start;
+		rl->length = rl[1].vcn - start;
+		goto shrink_allocation;
+	}
+	/* 
+	 * If @end is not in a hole there are still two cases to distinguish.
+	 * Either @end is or is not in the same run as @start.
+	 *
+	 * The second case is easier as it can be reduced to an already solved
+	 * problem by truncating the run @start is in to end at @start - 1.
+	 * Then, if @end is in the next run need to split the run into a sparse
+	 * run followed by a non-sparse run (already covered above) and if @end
+	 * is not in the next run switching it to be sparse, again reduces the
+	 * problem to the already covered case of "@start is in a hole".
+	 */
+	if (end >= rl[1].vcn) {
+		/*
+		 * If @end is not in the next run, reduce the problem to the
+		 * case of "@start is in a hole".
+		 */
+		if (rl[1].length && end >= rl[2].vcn) {
+			/* Truncate the run containing @start. */
+			rl->length = start - rl->vcn;
+			rl++;
+			rl->vcn = start;
+			rl->lcn = LCN_HOLE;
+			goto extend_hole;
+		}
+		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+		if (IS_ERR(trl))
+			goto enomem_out;
+		old_size++;
+		if (runlist->rl != trl) {
+			rl = trl + (rl - runlist->rl);
+			rl_end = trl + (rl_end - runlist->rl);
+			rl_real_end = trl + (rl_real_end - runlist->rl);
+			runlist->rl = trl;
+		}
+		/* Truncate the run containing @start. */
+		rl->length = start - rl->vcn;
+		rl++;
+		/*
+		 * @end is in the next run, reduce the problem to the case
+		 * where "@start is at the beginning of a run and @end is in
+		 * the same run as @start".
+		 */
+		delta = rl->vcn - start;
+		rl->vcn = start;
+		if (rl->lcn >= 0) {
+			rl->lcn -= delta;
+			/* Need this in case the lcn just became negative. */
+			lcn_fixup = true;
+		}
+		rl->length += delta;
+		goto split_end;
+	}
+	/*
+	 * The first case from above, i.e. @end is in the same run as @start.
+	 * We need to split the run into three.  One run for the non-sparse
+	 * region between the beginning of the old run and @start, one for the
+	 * sparse region between @start and @end, and one for the remaining
+	 * non-sparse region, i.e. between @end and the end of the old run.
+	 */
+	trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
+	if (IS_ERR(trl))
+		goto enomem_out;
+	old_size += 2;
+	if (runlist->rl != trl) {
+		rl = trl + (rl - runlist->rl);
+		rl_end = trl + (rl_end - runlist->rl);
+		rl_real_end = trl + (rl_real_end - runlist->rl);
+		runlist->rl = trl;
+	}
+	/* Shift all the runs up by two. */
+	memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+	/* Finally, setup the three split runs. */
+	rl->length = start - rl->vcn;
+	rl++;
+	rl->vcn = start;
+	rl->lcn = LCN_HOLE;
+	rl->length = length;
+	rl++;
+	delta = end - rl->vcn;
+	rl->vcn = end;
+	rl->lcn += delta;
+	rl->length -= delta;
+	ntfs_debug("Done (split both).");
+	return 0;
+enomem_out:
+	ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
+	return -ENOMEM;
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/runlist.h b/kernel/fs/ntfs/runlist.h
new file mode 100644
index 000000000..47728fbb6
--- /dev/null
+++ b/kernel/fs/ntfs/runlist.h
@@ -0,0 +1,102 @@
+/*
+ * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
+ *	       Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_RUNLIST_H
+#define _LINUX_NTFS_RUNLIST_H
+
+#include "types.h"
+#include "layout.h"
+#include "volume.h"
+
+/**
+ * runlist_element - in memory vcn to lcn mapping array element
+ * @vcn:	starting vcn of the current array element
+ * @lcn:	starting lcn of the current array element
+ * @length:	length in clusters of the current array element
+ *
+ * The last vcn (in fact the last vcn + 1) is reached when length == 0.
+ *
+ * When lcn == -1 this means that the count vcns starting at vcn are not
+ * physically allocated (i.e. this is a hole / data is sparse).
+ */
+typedef struct {	/* In memory vcn to lcn mapping structure element. */
+	VCN vcn;	/* vcn = Starting virtual cluster number. */
+	LCN lcn;	/* lcn = Starting logical cluster number. */
+	s64 length;	/* Run length in clusters. */
+} runlist_element;
+
+/**
+ * runlist - in memory vcn to lcn mapping array including a read/write lock
+ * @rl:		pointer to an array of runlist elements
+ * @lock:	read/write spinlock for serializing access to @rl
+ *
+ */
+typedef struct {
+	runlist_element *rl;
+	struct rw_semaphore lock;
+} runlist;
+
+static inline void ntfs_init_runlist(runlist *rl)
+{
+	rl->rl = NULL;
+	init_rwsem(&rl->lock);
+}
+
+typedef enum {
+	LCN_HOLE		= -1,	/* Keep this as highest value or die! */
+	LCN_RL_NOT_MAPPED	= -2,
+	LCN_ENOENT		= -3,
+	LCN_ENOMEM		= -4,
+	LCN_EIO			= -5,
+} LCN_SPECIAL_VALUES;
+
+extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
+		runlist_element *srl);
+
+extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
+		const ATTR_RECORD *attr, runlist_element *old_rl);
+
+extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
+
+#ifdef NTFS_RW
+
+extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl,
+		const VCN vcn);
+
+extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
+		const runlist_element *rl, const VCN first_vcn,
+		const VCN last_vcn);
+
+extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
+		const int dst_len, const runlist_element *rl,
+		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn);
+
+extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
+		runlist *const runlist, const s64 new_length);
+
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const VCN start, const s64 length);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/kernel/fs/ntfs/super.c b/kernel/fs/ntfs/super.c
new file mode 100644
index 000000000..9e1e11207
--- /dev/null
+++ b/kernel/fs/ntfs/super.c
@@ -0,0 +1,3220 @@
+/*
+ * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001,2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>	/* For bdev_logical_block_size(). */
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/moduleparam.h>
+#include <linux/bitmap.h>
+
+#include "sysctl.h"
+#include "logfile.h"
+#include "quota.h"
+#include "usnjrnl.h"
+#include "dir.h"
+#include "debug.h"
+#include "index.h"
+#include "inode.h"
+#include "aops.h"
+#include "layout.h"
+#include "malloc.h"
+#include "ntfs.h"
+
+/* Number of mounted filesystems which have compression enabled. */
+static unsigned long ntfs_nr_compression_users;
+
+/* A global default upcase table and a corresponding reference count. */
+static ntfschar *default_upcase;
+static unsigned long ntfs_nr_upcase_users;
+
+/* Error constants/strings used in inode.c::ntfs_show_options(). */
+typedef enum {
+	/* One of these must be present, default is ON_ERRORS_CONTINUE. */
+	ON_ERRORS_PANIC			= 0x01,
+	ON_ERRORS_REMOUNT_RO		= 0x02,
+	ON_ERRORS_CONTINUE		= 0x04,
+	/* Optional, can be combined with any of the above. */
+	ON_ERRORS_RECOVER		= 0x10,
+} ON_ERRORS_ACTIONS;
+
+const option_t on_errors_arr[] = {
+	{ ON_ERRORS_PANIC,	"panic" },
+	{ ON_ERRORS_REMOUNT_RO,	"remount-ro", },
+	{ ON_ERRORS_CONTINUE,	"continue", },
+	{ ON_ERRORS_RECOVER,	"recover" },
+	{ 0,			NULL }
+};
+
+/**
+ * simple_getbool -
+ *
+ * Copied from old ntfs driver (which copied from vfat driver).
+ */
+static int simple_getbool(char *s, bool *setval)
+{
+	if (s) {
+		if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
+			*setval = true;
+		else if (!strcmp(s, "0") || !strcmp(s, "no") ||
+							!strcmp(s, "false"))
+			*setval = false;
+		else
+			return 0;
+	} else
+		*setval = true;
+	return 1;
+}
+
+/**
+ * parse_options - parse the (re)mount options
+ * @vol:	ntfs volume
+ * @opt:	string containing the (re)mount options
+ *
+ * Parse the recognized options in @opt for the ntfs volume described by @vol.
+ */
+static bool parse_options(ntfs_volume *vol, char *opt)
+{
+	char *p, *v, *ov;
+	static char *utf8 = "utf8";
+	int errors = 0, sloppy = 0;
+	kuid_t uid = INVALID_UID;
+	kgid_t gid = INVALID_GID;
+	umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
+	int mft_zone_multiplier = -1, on_errors = -1;
+	int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
+	struct nls_table *nls_map = NULL, *old_nls;
+
+	/* I am lazy... (-8 */
+#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value)	\
+	if (!strcmp(p, option)) {					\
+		if (!v || !*v)						\
+			variable = default_value;			\
+		else {							\
+			variable = simple_strtoul(ov = v, &v, 0);	\
+			if (*v)						\
+				goto needs_val;				\
+		}							\
+	}
+#define NTFS_GETOPT(option, variable)					\
+	if (!strcmp(p, option)) {					\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		variable = simple_strtoul(ov = v, &v, 0);		\
+		if (*v)							\
+			goto needs_val;					\
+	}
+#define NTFS_GETOPT_UID(option, variable)				\
+	if (!strcmp(p, option)) {					\
+		uid_t uid_value;					\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		uid_value = simple_strtoul(ov = v, &v, 0);		\
+		if (*v)							\
+			goto needs_val;					\
+		variable = make_kuid(current_user_ns(), uid_value);	\
+		if (!uid_valid(variable))				\
+			goto needs_val;					\
+	}
+#define NTFS_GETOPT_GID(option, variable)				\
+	if (!strcmp(p, option)) {					\
+		gid_t gid_value;					\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		gid_value = simple_strtoul(ov = v, &v, 0);		\
+		if (*v)							\
+			goto needs_val;					\
+		variable = make_kgid(current_user_ns(), gid_value);	\
+		if (!gid_valid(variable))				\
+			goto needs_val;					\
+	}
+#define NTFS_GETOPT_OCTAL(option, variable)				\
+	if (!strcmp(p, option)) {					\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		variable = simple_strtoul(ov = v, &v, 8);		\
+		if (*v)							\
+			goto needs_val;					\
+	}
+#define NTFS_GETOPT_BOOL(option, variable)				\
+	if (!strcmp(p, option)) {					\
+		bool val;						\
+		if (!simple_getbool(v, &val))				\
+			goto needs_bool;				\
+		variable = val;						\
+	}
+#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array)		\
+	if (!strcmp(p, option)) {					\
+		int _i;							\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		ov = v;							\
+		if (variable == -1)					\
+			variable = 0;					\
+		for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \
+			if (!strcmp(opt_array[_i].str, v)) {		\
+				variable |= opt_array[_i].val;		\
+				break;					\
+			}						\
+		if (!opt_array[_i].str || !*opt_array[_i].str)		\
+			goto needs_val;					\
+	}
+	if (!opt || !*opt)
+		goto no_mount_options;
+	ntfs_debug("Entering with mount options string: %s", opt);
+	while ((p = strsep(&opt, ","))) {
+		if ((v = strchr(p, '=')))
+			*v++ = 0;
+		NTFS_GETOPT_UID("uid", uid)
+		else NTFS_GETOPT_GID("gid", gid)
+		else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
+		else NTFS_GETOPT_OCTAL("fmask", fmask)
+		else NTFS_GETOPT_OCTAL("dmask", dmask)
+		else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
+		else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true)
+		else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
+		else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
+		else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse)
+		else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
+				on_errors_arr)
+		else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
+			ntfs_warning(vol->sb, "Ignoring obsolete option %s.",
+					p);
+		else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) {
+			if (!strcmp(p, "iocharset"))
+				ntfs_warning(vol->sb, "Option iocharset is "
+						"deprecated. Please use "
+						"option nls=<charsetname> in "
+						"the future.");
+			if (!v || !*v)
+				goto needs_arg;
+use_utf8:
+			old_nls = nls_map;
+			nls_map = load_nls(v);
+			if (!nls_map) {
+				if (!old_nls) {
+					ntfs_error(vol->sb, "NLS character set "
+							"%s not found.", v);
+					return false;
+				}
+				ntfs_error(vol->sb, "NLS character set %s not "
+						"found. Using previous one %s.",
+						v, old_nls->charset);
+				nls_map = old_nls;
+			} else /* nls_map */ {
+				unload_nls(old_nls);
+			}
+		} else if (!strcmp(p, "utf8")) {
+			bool val = false;
+			ntfs_warning(vol->sb, "Option utf8 is no longer "
+				   "supported, using option nls=utf8. Please "
+				   "use option nls=utf8 in the future and "
+				   "make sure utf8 is compiled either as a "
+				   "module or into the kernel.");
+			if (!v || !*v)
+				val = true;
+			else if (!simple_getbool(v, &val))
+				goto needs_bool;
+			if (val) {
+				v = utf8;
+				goto use_utf8;
+			}
+		} else {
+			ntfs_error(vol->sb, "Unrecognized mount option %s.", p);
+			if (errors < INT_MAX)
+				errors++;
+		}
+#undef NTFS_GETOPT_OPTIONS_ARRAY
+#undef NTFS_GETOPT_BOOL
+#undef NTFS_GETOPT
+#undef NTFS_GETOPT_WITH_DEFAULT
+	}
+no_mount_options:
+	if (errors && !sloppy)
+		return false;
+	if (sloppy)
+		ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
+				"unrecognized mount option(s) and continuing.");
+	/* Keep this first! */
+	if (on_errors != -1) {
+		if (!on_errors) {
+			ntfs_error(vol->sb, "Invalid errors option argument "
+					"or bug in options parser.");
+			return false;
+		}
+	}
+	if (nls_map) {
+		if (vol->nls_map && vol->nls_map != nls_map) {
+			ntfs_error(vol->sb, "Cannot change NLS character set "
+					"on remount.");
+			return false;
+		} /* else (!vol->nls_map) */
+		ntfs_debug("Using NLS character set %s.", nls_map->charset);
+		vol->nls_map = nls_map;
+	} else /* (!nls_map) */ {
+		if (!vol->nls_map) {
+			vol->nls_map = load_nls_default();
+			if (!vol->nls_map) {
+				ntfs_error(vol->sb, "Failed to load default "
+						"NLS character set.");
+				return false;
+			}
+			ntfs_debug("Using default NLS character set (%s).",
+					vol->nls_map->charset);
+		}
+	}
+	if (mft_zone_multiplier != -1) {
+		if (vol->mft_zone_multiplier && vol->mft_zone_multiplier !=
+				mft_zone_multiplier) {
+			ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
+					"on remount.");
+			return false;
+		}
+		if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
+			ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
+					"Using default value, i.e. 1.");
+			mft_zone_multiplier = 1;
+		}
+		vol->mft_zone_multiplier = mft_zone_multiplier;
+	}
+	if (!vol->mft_zone_multiplier)
+		vol->mft_zone_multiplier = 1;
+	if (on_errors != -1)
+		vol->on_errors = on_errors;
+	if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
+		vol->on_errors |= ON_ERRORS_CONTINUE;
+	if (uid_valid(uid))
+		vol->uid = uid;
+	if (gid_valid(gid))
+		vol->gid = gid;
+	if (fmask != (umode_t)-1)
+		vol->fmask = fmask;
+	if (dmask != (umode_t)-1)
+		vol->dmask = dmask;
+	if (show_sys_files != -1) {
+		if (show_sys_files)
+			NVolSetShowSystemFiles(vol);
+		else
+			NVolClearShowSystemFiles(vol);
+	}
+	if (case_sensitive != -1) {
+		if (case_sensitive)
+			NVolSetCaseSensitive(vol);
+		else
+			NVolClearCaseSensitive(vol);
+	}
+	if (disable_sparse != -1) {
+		if (disable_sparse)
+			NVolClearSparseEnabled(vol);
+		else {
+			if (!NVolSparseEnabled(vol) &&
+					vol->major_ver && vol->major_ver < 3)
+				ntfs_warning(vol->sb, "Not enabling sparse "
+						"support due to NTFS volume "
+						"version %i.%i (need at least "
+						"version 3.0).", vol->major_ver,
+						vol->minor_ver);
+			else
+				NVolSetSparseEnabled(vol);
+		}
+	}
+	return true;
+needs_arg:
+	ntfs_error(vol->sb, "The %s option requires an argument.", p);
+	return false;
+needs_bool:
+	ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
+	return false;
+needs_val:
+	ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
+	return false;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_write_volume_flags - write new flags to the volume information flags
+ * @vol:	ntfs volume on which to modify the flags
+ * @flags:	new flags value for the volume information flags
+ *
+ * Internal function.  You probably want to use ntfs_{set,clear}_volume_flags()
+ * instead (see below).
+ *
+ * Replace the volume information flags on the volume @vol with the value
+ * supplied in @flags.  Note, this overwrites the volume information flags, so
+ * make sure to combine the flags you want to modify with the old flags and use
+ * the result when calling ntfs_write_volume_flags().
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags)
+{
+	ntfs_inode *ni = NTFS_I(vol->vol_ino);
+	MFT_RECORD *m;
+	VOLUME_INFORMATION *vi;
+	ntfs_attr_search_ctx *ctx;
+	int err;
+
+	ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.",
+			le16_to_cpu(vol->vol_flags), le16_to_cpu(flags));
+	if (vol->vol_flags == flags)
+		goto done;
+	BUG_ON(!ni);
+	m = map_mft_record(ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(ni, m);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto put_unm_err_out;
+	}
+	err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+			ctx);
+	if (err)
+		goto put_unm_err_out;
+	vi = (VOLUME_INFORMATION*)((u8*)ctx->attr +
+			le16_to_cpu(ctx->attr->data.resident.value_offset));
+	vol->vol_flags = vi->flags = flags;
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(ni);
+done:
+	ntfs_debug("Done.");
+	return 0;
+put_unm_err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(ni);
+err_out:
+	ntfs_error(vol->sb, "Failed with error code %i.", -err);
+	return err;
+}
+
+/**
+ * ntfs_set_volume_flags - set bits in the volume information flags
+ * @vol:	ntfs volume on which to modify the flags
+ * @flags:	flags to set on the volume
+ *
+ * Set the bits in @flags in the volume information flags on the volume @vol.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
+{
+	flags &= VOLUME_FLAGS_MASK;
+	return ntfs_write_volume_flags(vol, vol->vol_flags | flags);
+}
+
+/**
+ * ntfs_clear_volume_flags - clear bits in the volume information flags
+ * @vol:	ntfs volume on which to modify the flags
+ * @flags:	flags to clear on the volume
+ *
+ * Clear the bits in @flags in the volume information flags on the volume @vol.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
+{
+	flags &= VOLUME_FLAGS_MASK;
+	flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags));
+	return ntfs_write_volume_flags(vol, flags);
+}
+
+#endif /* NTFS_RW */
+
+/**
+ * ntfs_remount - change the mount options of a mounted ntfs filesystem
+ * @sb:		superblock of mounted ntfs filesystem
+ * @flags:	remount flags
+ * @opt:	remount options string
+ *
+ * Change the mount options of an already mounted ntfs filesystem.
+ *
+ * NOTE:  The VFS sets the @sb->s_flags remount flags to @flags after
+ * ntfs_remount() returns successfully (i.e. returns 0).  Otherwise,
+ * @sb->s_flags are not changed.
+ */
+static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
+{
+	ntfs_volume *vol = NTFS_SB(sb);
+
+	ntfs_debug("Entering with remount options string: %s", opt);
+
+	sync_filesystem(sb);
+
+#ifndef NTFS_RW
+	/* For read-only compiled driver, enforce read-only flag. */
+	*flags |= MS_RDONLY;
+#else /* NTFS_RW */
+	/*
+	 * For the read-write compiled driver, if we are remounting read-write,
+	 * make sure there are no volume errors and that no unsupported volume
+	 * flags are set.  Also, empty the logfile journal as it would become
+	 * stale as soon as something is written to the volume and mark the
+	 * volume dirty so that chkdsk is run if the volume is not umounted
+	 * cleanly.  Finally, mark the quotas out of date so Windows rescans
+	 * the volume on boot and updates them.
+	 *
+	 * When remounting read-only, mark the volume clean if no volume errors
+	 * have occurred.
+	 */
+	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+		static const char *es = ".  Cannot remount read-write.";
+
+		/* Remounting read-write. */
+		if (NVolErrors(vol)) {
+			ntfs_error(sb, "Volume has errors and is read-only%s",
+					es);
+			return -EROFS;
+		}
+		if (vol->vol_flags & VOLUME_IS_DIRTY) {
+			ntfs_error(sb, "Volume is dirty and read-only%s", es);
+			return -EROFS;
+		}
+		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
+			ntfs_error(sb, "Volume has been modified by chkdsk "
+					"and is read-only%s", es);
+			return -EROFS;
+		}
+		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
+			ntfs_error(sb, "Volume has unsupported flags set "
+					"(0x%x) and is read-only%s",
+					(unsigned)le16_to_cpu(vol->vol_flags),
+					es);
+			return -EROFS;
+		}
+		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
+			ntfs_error(sb, "Failed to set dirty bit in volume "
+					"information flags%s", es);
+			return -EROFS;
+		}
+#if 0
+		// TODO: Enable this code once we start modifying anything that
+		//	 is different between NTFS 1.2 and 3.x...
+		/* Set NT4 compatibility flag on newer NTFS version volumes. */
+		if ((vol->major_ver > 1)) {
+			if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
+				ntfs_error(sb, "Failed to set NT4 "
+						"compatibility flag%s", es);
+				NVolSetErrors(vol);
+				return -EROFS;
+			}
+		}
+#endif
+		if (!ntfs_empty_logfile(vol->logfile_ino)) {
+			ntfs_error(sb, "Failed to empty journal $LogFile%s",
+					es);
+			NVolSetErrors(vol);
+			return -EROFS;
+		}
+		if (!ntfs_mark_quotas_out_of_date(vol)) {
+			ntfs_error(sb, "Failed to mark quotas out of date%s",
+					es);
+			NVolSetErrors(vol);
+			return -EROFS;
+		}
+		if (!ntfs_stamp_usnjrnl(vol)) {
+			ntfs_error(sb, "Failed to stamp transation log "
+					"($UsnJrnl)%s", es);
+			NVolSetErrors(vol);
+			return -EROFS;
+		}
+	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
+		/* Remounting read-only. */
+		if (!NVolErrors(vol)) {
+			if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
+				ntfs_warning(sb, "Failed to clear dirty bit "
+						"in volume information "
+						"flags.  Run chkdsk.");
+		}
+	}
+#endif /* NTFS_RW */
+
+	// TODO: Deal with *flags.
+
+	if (!parse_options(vol, opt))
+		return -EINVAL;
+
+	ntfs_debug("Done.");
+	return 0;
+}
+
+/**
+ * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
+ * @sb:		Super block of the device to which @b belongs.
+ * @b:		Boot sector of device @sb to check.
+ * @silent:	If 'true', all output will be silenced.
+ *
+ * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
+ * sector. Returns 'true' if it is valid and 'false' if not.
+ *
+ * @sb is only needed for warning/error output, i.e. it can be NULL when silent
+ * is 'true'.
+ */
+static bool is_boot_sector_ntfs(const struct super_block *sb,
+		const NTFS_BOOT_SECTOR *b, const bool silent)
+{
+	/*
+	 * Check that checksum == sum of u32 values from b to the checksum
+	 * field.  If checksum is zero, no checking is done.  We will work when
+	 * the checksum test fails, since some utilities update the boot sector
+	 * ignoring the checksum which leaves the checksum out-of-date.  We
+	 * report a warning if this is the case.
+	 */
+	if ((void*)b < (void*)&b->checksum && b->checksum && !silent) {
+		le32 *u;
+		u32 i;
+
+		for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
+			i += le32_to_cpup(u);
+		if (le32_to_cpu(b->checksum) != i)
+			ntfs_warning(sb, "Invalid boot sector checksum.");
+	}
+	/* Check OEMidentifier is "NTFS    " */
+	if (b->oem_id != magicNTFS)
+		goto not_ntfs;
+	/* Check bytes per sector value is between 256 and 4096. */
+	if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 ||
+			le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000)
+		goto not_ntfs;
+	/* Check sectors per cluster value is valid. */
+	switch (b->bpb.sectors_per_cluster) {
+	case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128:
+		break;
+	default:
+		goto not_ntfs;
+	}
+	/* Check the cluster size is not above the maximum (64kiB). */
+	if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
+			b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE)
+		goto not_ntfs;
+	/* Check reserved/unused fields are really zero. */
+	if (le16_to_cpu(b->bpb.reserved_sectors) ||
+			le16_to_cpu(b->bpb.root_entries) ||
+			le16_to_cpu(b->bpb.sectors) ||
+			le16_to_cpu(b->bpb.sectors_per_fat) ||
+			le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats)
+		goto not_ntfs;
+	/* Check clusters per file mft record value is valid. */
+	if ((u8)b->clusters_per_mft_record < 0xe1 ||
+			(u8)b->clusters_per_mft_record > 0xf7)
+		switch (b->clusters_per_mft_record) {
+		case 1: case 2: case 4: case 8: case 16: case 32: case 64:
+			break;
+		default:
+			goto not_ntfs;
+		}
+	/* Check clusters per index block value is valid. */
+	if ((u8)b->clusters_per_index_record < 0xe1 ||
+			(u8)b->clusters_per_index_record > 0xf7)
+		switch (b->clusters_per_index_record) {
+		case 1: case 2: case 4: case 8: case 16: case 32: case 64:
+			break;
+		default:
+			goto not_ntfs;
+		}
+	/*
+	 * Check for valid end of sector marker. We will work without it, but
+	 * many BIOSes will refuse to boot from a bootsector if the magic is
+	 * incorrect, so we emit a warning.
+	 */
+	if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
+		ntfs_warning(sb, "Invalid end of sector marker.");
+	return true;
+not_ntfs:
+	return false;
+}
+
+/**
+ * read_ntfs_boot_sector - read the NTFS boot sector of a device
+ * @sb:		super block of device to read the boot sector from
+ * @silent:	if true, suppress all output
+ *
+ * Reads the boot sector from the device and validates it. If that fails, tries
+ * to read the backup boot sector, first from the end of the device a-la NT4 and
+ * later and then from the middle of the device a-la NT3.51 and before.
+ *
+ * If a valid boot sector is found but it is not the primary boot sector, we
+ * repair the primary boot sector silently (unless the device is read-only or
+ * the primary boot sector is not accessible).
+ *
+ * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super
+ * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized
+ * to their respective values.
+ *
+ * Return the unlocked buffer head containing the boot sector or NULL on error.
+ */
+static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
+		const int silent)
+{
+	const char *read_err_str = "Unable to read %s boot sector.";
+	struct buffer_head *bh_primary, *bh_backup;
+	sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
+
+	/* Try to read primary boot sector. */
+	if ((bh_primary = sb_bread(sb, 0))) {
+		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+				bh_primary->b_data, silent))
+			return bh_primary;
+		if (!silent)
+			ntfs_error(sb, "Primary boot sector is invalid.");
+	} else if (!silent)
+		ntfs_error(sb, read_err_str, "primary");
+	if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) {
+		if (bh_primary)
+			brelse(bh_primary);
+		if (!silent)
+			ntfs_error(sb, "Mount option errors=recover not used. "
+					"Aborting without trying to recover.");
+		return NULL;
+	}
+	/* Try to read NT4+ backup boot sector. */
+	if ((bh_backup = sb_bread(sb, nr_blocks - 1))) {
+		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+				bh_backup->b_data, silent))
+			goto hotfix_primary_boot_sector;
+		brelse(bh_backup);
+	} else if (!silent)
+		ntfs_error(sb, read_err_str, "backup");
+	/* Try to read NT3.51- backup boot sector. */
+	if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) {
+		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+				bh_backup->b_data, silent))
+			goto hotfix_primary_boot_sector;
+		if (!silent)
+			ntfs_error(sb, "Could not find a valid backup boot "
+					"sector.");
+		brelse(bh_backup);
+	} else if (!silent)
+		ntfs_error(sb, read_err_str, "backup");
+	/* We failed. Cleanup and return. */
+	if (bh_primary)
+		brelse(bh_primary);
+	return NULL;
+hotfix_primary_boot_sector:
+	if (bh_primary) {
+		/*
+		 * If we managed to read sector zero and the volume is not
+		 * read-only, copy the found, valid backup boot sector to the
+		 * primary boot sector.  Note we only copy the actual boot
+		 * sector structure, not the actual whole device sector as that
+		 * may be bigger and would potentially damage the $Boot system
+		 * file (FIXME: Would be nice to know if the backup boot sector
+		 * on a large sector device contains the whole boot loader or
+		 * just the first 512 bytes).
+		 */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
+					"boot sector from backup copy.");
+			memcpy(bh_primary->b_data, bh_backup->b_data,
+					NTFS_BLOCK_SIZE);
+			mark_buffer_dirty(bh_primary);
+			sync_dirty_buffer(bh_primary);
+			if (buffer_uptodate(bh_primary)) {
+				brelse(bh_backup);
+				return bh_primary;
+			}
+			ntfs_error(sb, "Hot-fix: Device write error while "
+					"recovering primary boot sector.");
+		} else {
+			ntfs_warning(sb, "Hot-fix: Recovery of primary boot "
+					"sector failed: Read-only mount.");
+		}
+		brelse(bh_primary);
+	}
+	ntfs_warning(sb, "Using backup boot sector.");
+	return bh_backup;
+}
+
+/**
+ * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol
+ * @vol:	volume structure to initialise with data from boot sector
+ * @b:		boot sector to parse
+ *
+ * Parse the ntfs boot sector @b and store all imporant information therein in
+ * the ntfs super block @vol.  Return 'true' on success and 'false' on error.
+ */
+static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
+{
+	unsigned int sectors_per_cluster_bits, nr_hidden_sects;
+	int clusters_per_mft_record, clusters_per_index_record;
+	s64 ll;
+
+	vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector);
+	vol->sector_size_bits = ffs(vol->sector_size) - 1;
+	ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size,
+			vol->sector_size);
+	ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
+			vol->sector_size_bits);
+	if (vol->sector_size < vol->sb->s_blocksize) {
+		ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
+				"device block size (%lu).  This is not "
+				"supported.  Sorry.", vol->sector_size,
+				vol->sb->s_blocksize);
+		return false;
+	}
+	ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
+	sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
+	ntfs_debug("sectors_per_cluster_bits = 0x%x",
+			sectors_per_cluster_bits);
+	nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors);
+	ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects);
+	vol->cluster_size = vol->sector_size << sectors_per_cluster_bits;
+	vol->cluster_size_mask = vol->cluster_size - 1;
+	vol->cluster_size_bits = ffs(vol->cluster_size) - 1;
+	ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
+			vol->cluster_size);
+	ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
+	ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
+	if (vol->cluster_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
+				"sector size (%i).  This is not supported.  "
+				"Sorry.", vol->cluster_size, vol->sector_size);
+		return false;
+	}
+	clusters_per_mft_record = b->clusters_per_mft_record;
+	ntfs_debug("clusters_per_mft_record = %i (0x%x)",
+			clusters_per_mft_record, clusters_per_mft_record);
+	if (clusters_per_mft_record > 0)
+		vol->mft_record_size = vol->cluster_size <<
+				(ffs(clusters_per_mft_record) - 1);
+	else
+		/*
+		 * When mft_record_size < cluster_size, clusters_per_mft_record
+		 * = -log2(mft_record_size) bytes. mft_record_size normaly is
+		 * 1024 bytes, which is encoded as 0xF6 (-10 in decimal).
+		 */
+		vol->mft_record_size = 1 << -clusters_per_mft_record;
+	vol->mft_record_size_mask = vol->mft_record_size - 1;
+	vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1;
+	ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size,
+			vol->mft_record_size);
+	ntfs_debug("vol->mft_record_size_mask = 0x%x",
+			vol->mft_record_size_mask);
+	ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
+			vol->mft_record_size_bits, vol->mft_record_size_bits);
+	/*
+	 * We cannot support mft record sizes above the PAGE_CACHE_SIZE since
+	 * we store $MFT/$DATA, the table of mft records in the page cache.
+	 */
+	if (vol->mft_record_size > PAGE_CACHE_SIZE) {
+		ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
+				"PAGE_CACHE_SIZE on your system (%lu).  "
+				"This is not supported.  Sorry.",
+				vol->mft_record_size, PAGE_CACHE_SIZE);
+		return false;
+	}
+	/* We cannot support mft record sizes below the sector size. */
+	if (vol->mft_record_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
+				"sector size (%i).  This is not supported.  "
+				"Sorry.", vol->mft_record_size,
+				vol->sector_size);
+		return false;
+	}
+	clusters_per_index_record = b->clusters_per_index_record;
+	ntfs_debug("clusters_per_index_record = %i (0x%x)",
+			clusters_per_index_record, clusters_per_index_record);
+	if (clusters_per_index_record > 0)
+		vol->index_record_size = vol->cluster_size <<
+				(ffs(clusters_per_index_record) - 1);
+	else
+		/*
+		 * When index_record_size < cluster_size,
+		 * clusters_per_index_record = -log2(index_record_size) bytes.
+		 * index_record_size normaly equals 4096 bytes, which is
+		 * encoded as 0xF4 (-12 in decimal).
+		 */
+		vol->index_record_size = 1 << -clusters_per_index_record;
+	vol->index_record_size_mask = vol->index_record_size - 1;
+	vol->index_record_size_bits = ffs(vol->index_record_size) - 1;
+	ntfs_debug("vol->index_record_size = %i (0x%x)",
+			vol->index_record_size, vol->index_record_size);
+	ntfs_debug("vol->index_record_size_mask = 0x%x",
+			vol->index_record_size_mask);
+	ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
+			vol->index_record_size_bits,
+			vol->index_record_size_bits);
+	/* We cannot support index record sizes below the sector size. */
+	if (vol->index_record_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Index record size (%i) is smaller than "
+				"the sector size (%i).  This is not "
+				"supported.  Sorry.", vol->index_record_size,
+				vol->sector_size);
+		return false;
+	}
+	/*
+	 * Get the size of the volume in clusters and check for 64-bit-ness.
+	 * Windows currently only uses 32 bits to save the clusters so we do
+	 * the same as it is much faster on 32-bit CPUs.
+	 */
+	ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
+	if ((u64)ll >= 1ULL << 32) {
+		ntfs_error(vol->sb, "Cannot handle 64-bit clusters.  Sorry.");
+		return false;
+	}
+	vol->nr_clusters = ll;
+	ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
+	/*
+	 * On an architecture where unsigned long is 32-bits, we restrict the
+	 * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler
+	 * will hopefully optimize the whole check away.
+	 */
+	if (sizeof(unsigned long) < 8) {
+		if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) {
+			ntfs_error(vol->sb, "Volume size (%lluTiB) is too "
+					"large for this architecture.  "
+					"Maximum supported is 2TiB.  Sorry.",
+					(unsigned long long)ll >> (40 -
+					vol->cluster_size_bits));
+			return false;
+		}
+	}
+	ll = sle64_to_cpu(b->mft_lcn);
+	if (ll >= vol->nr_clusters) {
+		ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
+				"volume.  Weird.", (unsigned long long)ll,
+				(unsigned long long)ll);
+		return false;
+	}
+	vol->mft_lcn = ll;
+	ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
+	ll = sle64_to_cpu(b->mftmirr_lcn);
+	if (ll >= vol->nr_clusters) {
+		ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
+				"of volume.  Weird.", (unsigned long long)ll,
+				(unsigned long long)ll);
+		return false;
+	}
+	vol->mftmirr_lcn = ll;
+	ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
+#ifdef NTFS_RW
+	/*
+	 * Work out the size of the mft mirror in number of mft records. If the
+	 * cluster size is less than or equal to the size taken by four mft
+	 * records, the mft mirror stores the first four mft records. If the
+	 * cluster size is bigger than the size taken by four mft records, the
+	 * mft mirror contains as many mft records as will fit into one
+	 * cluster.
+	 */
+	if (vol->cluster_size <= (4 << vol->mft_record_size_bits))
+		vol->mftmirr_size = 4;
+	else
+		vol->mftmirr_size = vol->cluster_size >>
+				vol->mft_record_size_bits;
+	ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size);
+#endif /* NTFS_RW */
+	vol->serial_no = le64_to_cpu(b->volume_serial_number);
+	ntfs_debug("vol->serial_no = 0x%llx",
+			(unsigned long long)vol->serial_no);
+	return true;
+}
+
+/**
+ * ntfs_setup_allocators - initialize the cluster and mft allocators
+ * @vol:	volume structure for which to setup the allocators
+ *
+ * Setup the cluster (lcn) and mft allocators to the starting values.
+ */
+static void ntfs_setup_allocators(ntfs_volume *vol)
+{
+#ifdef NTFS_RW
+	LCN mft_zone_size, mft_lcn;
+#endif /* NTFS_RW */
+
+	ntfs_debug("vol->mft_zone_multiplier = 0x%x",
+			vol->mft_zone_multiplier);
+#ifdef NTFS_RW
+	/* Determine the size of the MFT zone. */
+	mft_zone_size = vol->nr_clusters;
+	switch (vol->mft_zone_multiplier) {  /* % of volume size in clusters */
+	case 4:
+		mft_zone_size >>= 1;			/* 50%   */
+		break;
+	case 3:
+		mft_zone_size = (mft_zone_size +
+				(mft_zone_size >> 1)) >> 2;	/* 37.5% */
+		break;
+	case 2:
+		mft_zone_size >>= 2;			/* 25%   */
+		break;
+	/* case 1: */
+	default:
+		mft_zone_size >>= 3;			/* 12.5% */
+		break;
+	}
+	/* Setup the mft zone. */
+	vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn;
+	ntfs_debug("vol->mft_zone_pos = 0x%llx",
+			(unsigned long long)vol->mft_zone_pos);
+	/*
+	 * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs
+	 * source) and if the actual mft_lcn is in the expected place or even
+	 * further to the front of the volume, extend the mft_zone to cover the
+	 * beginning of the volume as well.  This is in order to protect the
+	 * area reserved for the mft bitmap as well within the mft_zone itself.
+	 * On non-standard volumes we do not protect it as the overhead would
+	 * be higher than the speed increase we would get by doing it.
+	 */
+	mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size;
+	if (mft_lcn * vol->cluster_size < 16 * 1024)
+		mft_lcn = (16 * 1024 + vol->cluster_size - 1) /
+				vol->cluster_size;
+	if (vol->mft_zone_start <= mft_lcn)
+		vol->mft_zone_start = 0;
+	ntfs_debug("vol->mft_zone_start = 0x%llx",
+			(unsigned long long)vol->mft_zone_start);
+	/*
+	 * Need to cap the mft zone on non-standard volumes so that it does
+	 * not point outside the boundaries of the volume.  We do this by
+	 * halving the zone size until we are inside the volume.
+	 */
+	vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
+	while (vol->mft_zone_end >= vol->nr_clusters) {
+		mft_zone_size >>= 1;
+		vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
+	}
+	ntfs_debug("vol->mft_zone_end = 0x%llx",
+			(unsigned long long)vol->mft_zone_end);
+	/*
+	 * Set the current position within each data zone to the start of the
+	 * respective zone.
+	 */
+	vol->data1_zone_pos = vol->mft_zone_end;
+	ntfs_debug("vol->data1_zone_pos = 0x%llx",
+			(unsigned long long)vol->data1_zone_pos);
+	vol->data2_zone_pos = 0;
+	ntfs_debug("vol->data2_zone_pos = 0x%llx",
+			(unsigned long long)vol->data2_zone_pos);
+
+	/* Set the mft data allocation position to mft record 24. */
+	vol->mft_data_pos = 24;
+	ntfs_debug("vol->mft_data_pos = 0x%llx",
+			(unsigned long long)vol->mft_data_pos);
+#endif /* NTFS_RW */
+}
+
+#ifdef NTFS_RW
+
+/**
+ * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
+ * @vol:	ntfs super block describing device whose mft mirror to load
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_and_init_mft_mirror(ntfs_volume *vol)
+{
+	struct inode *tmp_ino;
+	ntfs_inode *tmp_ni;
+
+	ntfs_debug("Entering.");
+	/* Get mft mirror inode. */
+	tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr);
+	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
+		if (!IS_ERR(tmp_ino))
+			iput(tmp_ino);
+		/* Caller will display error message. */
+		return false;
+	}
+	/*
+	 * Re-initialize some specifics about $MFTMirr's inode as
+	 * ntfs_read_inode() will have set up the default ones.
+	 */
+	/* Set uid and gid to root. */
+	tmp_ino->i_uid = GLOBAL_ROOT_UID;
+	tmp_ino->i_gid = GLOBAL_ROOT_GID;
+	/* Regular file.  No access for anyone. */
+	tmp_ino->i_mode = S_IFREG;
+	/* No VFS initiated operations allowed for $MFTMirr. */
+	tmp_ino->i_op = &ntfs_empty_inode_ops;
+	tmp_ino->i_fop = &ntfs_empty_file_ops;
+	/* Put in our special address space operations. */
+	tmp_ino->i_mapping->a_ops = &ntfs_mst_aops;
+	tmp_ni = NTFS_I(tmp_ino);
+	/* The $MFTMirr, like the $MFT is multi sector transfer protected. */
+	NInoSetMstProtected(tmp_ni);
+	NInoSetSparseDisabled(tmp_ni);
+	/*
+	 * Set up our little cheat allowing us to reuse the async read io
+	 * completion handler for directories.
+	 */
+	tmp_ni->itype.index.block_size = vol->mft_record_size;
+	tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
+	vol->mftmirr_ino = tmp_ino;
+	ntfs_debug("Done.");
+	return true;
+}
+
+/**
+ * check_mft_mirror - compare contents of the mft mirror with the mft
+ * @vol:	ntfs super block describing device whose mft mirror to check
+ *
+ * Return 'true' on success or 'false' on error.
+ *
+ * Note, this function also results in the mft mirror runlist being completely
+ * mapped into memory.  The mft mirror write code requires this and will BUG()
+ * should it find an unmapped runlist element.
+ */
+static bool check_mft_mirror(ntfs_volume *vol)
+{
+	struct super_block *sb = vol->sb;
+	ntfs_inode *mirr_ni;
+	struct page *mft_page, *mirr_page;
+	u8 *kmft, *kmirr;
+	runlist_element *rl, rl2[2];
+	pgoff_t index;
+	int mrecs_per_page, i;
+
+	ntfs_debug("Entering.");
+	/* Compare contents of $MFT and $MFTMirr. */
+	mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size;
+	BUG_ON(!mrecs_per_page);
+	BUG_ON(!vol->mftmirr_size);
+	mft_page = mirr_page = NULL;
+	kmft = kmirr = NULL;
+	index = i = 0;
+	do {
+		u32 bytes;
+
+		/* Switch pages if necessary. */
+		if (!(i % mrecs_per_page)) {
+			if (index) {
+				ntfs_unmap_page(mft_page);
+				ntfs_unmap_page(mirr_page);
+			}
+			/* Get the $MFT page. */
+			mft_page = ntfs_map_page(vol->mft_ino->i_mapping,
+					index);
+			if (IS_ERR(mft_page)) {
+				ntfs_error(sb, "Failed to read $MFT.");
+				return false;
+			}
+			kmft = page_address(mft_page);
+			/* Get the $MFTMirr page. */
+			mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping,
+					index);
+			if (IS_ERR(mirr_page)) {
+				ntfs_error(sb, "Failed to read $MFTMirr.");
+				goto mft_unmap_out;
+			}
+			kmirr = page_address(mirr_page);
+			++index;
+		}
+		/* Do not check the record if it is not in use. */
+		if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) {
+			/* Make sure the record is ok. */
+			if (ntfs_is_baad_recordp((le32*)kmft)) {
+				ntfs_error(sb, "Incomplete multi sector "
+						"transfer detected in mft "
+						"record %i.", i);
+mm_unmap_out:
+				ntfs_unmap_page(mirr_page);
+mft_unmap_out:
+				ntfs_unmap_page(mft_page);
+				return false;
+			}
+		}
+		/* Do not check the mirror record if it is not in use. */
+		if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) {
+			if (ntfs_is_baad_recordp((le32*)kmirr)) {
+				ntfs_error(sb, "Incomplete multi sector "
+						"transfer detected in mft "
+						"mirror record %i.", i);
+				goto mm_unmap_out;
+			}
+		}
+		/* Get the amount of data in the current record. */
+		bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
+		if (bytes < sizeof(MFT_RECORD_OLD) ||
+				bytes > vol->mft_record_size ||
+				ntfs_is_baad_recordp((le32*)kmft)) {
+			bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
+			if (bytes < sizeof(MFT_RECORD_OLD) ||
+					bytes > vol->mft_record_size ||
+					ntfs_is_baad_recordp((le32*)kmirr))
+				bytes = vol->mft_record_size;
+		}
+		/* Compare the two records. */
+		if (memcmp(kmft, kmirr, bytes)) {
+			ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not "
+					"match.  Run ntfsfix or chkdsk.", i);
+			goto mm_unmap_out;
+		}
+		kmft += vol->mft_record_size;
+		kmirr += vol->mft_record_size;
+	} while (++i < vol->mftmirr_size);
+	/* Release the last pages. */
+	ntfs_unmap_page(mft_page);
+	ntfs_unmap_page(mirr_page);
+
+	/* Construct the mft mirror runlist by hand. */
+	rl2[0].vcn = 0;
+	rl2[0].lcn = vol->mftmirr_lcn;
+	rl2[0].length = (vol->mftmirr_size * vol->mft_record_size +
+			vol->cluster_size - 1) / vol->cluster_size;
+	rl2[1].vcn = rl2[0].length;
+	rl2[1].lcn = LCN_ENOENT;
+	rl2[1].length = 0;
+	/*
+	 * Because we have just read all of the mft mirror, we know we have
+	 * mapped the full runlist for it.
+	 */
+	mirr_ni = NTFS_I(vol->mftmirr_ino);
+	down_read(&mirr_ni->runlist.lock);
+	rl = mirr_ni->runlist.rl;
+	/* Compare the two runlists.  They must be identical. */
+	i = 0;
+	do {
+		if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn ||
+				rl2[i].length != rl[i].length) {
+			ntfs_error(sb, "$MFTMirr location mismatch.  "
+					"Run chkdsk.");
+			up_read(&mirr_ni->runlist.lock);
+			return false;
+		}
+	} while (rl2[i++].length);
+	up_read(&mirr_ni->runlist.lock);
+	ntfs_debug("Done.");
+	return true;
+}
+
+/**
+ * load_and_check_logfile - load and check the logfile inode for a volume
+ * @vol:	ntfs super block describing device whose logfile to load
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_and_check_logfile(ntfs_volume *vol,
+		RESTART_PAGE_HEADER **rp)
+{
+	struct inode *tmp_ino;
+
+	ntfs_debug("Entering.");
+	tmp_ino = ntfs_iget(vol->sb, FILE_LogFile);
+	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
+		if (!IS_ERR(tmp_ino))
+			iput(tmp_ino);
+		/* Caller will display error message. */
+		return false;
+	}
+	if (!ntfs_check_logfile(tmp_ino, rp)) {
+		iput(tmp_ino);
+		/* ntfs_check_logfile() will have displayed error output. */
+		return false;
+	}
+	NInoSetSparseDisabled(NTFS_I(tmp_ino));
+	vol->logfile_ino = tmp_ino;
+	ntfs_debug("Done.");
+	return true;
+}
+
+#define NTFS_HIBERFIL_HEADER_SIZE	4096
+
+/**
+ * check_windows_hibernation_status - check if Windows is suspended on a volume
+ * @vol:	ntfs super block of device to check
+ *
+ * Check if Windows is hibernated on the ntfs volume @vol.  This is done by
+ * looking for the file hiberfil.sys in the root directory of the volume.  If
+ * the file is not present Windows is definitely not suspended.
+ *
+ * If hiberfil.sys exists and is less than 4kiB in size it means Windows is
+ * definitely suspended (this volume is not the system volume).  Caveat:  on a
+ * system with many volumes it is possible that the < 4kiB check is bogus but
+ * for now this should do fine.
+ *
+ * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the
+ * hiberfil header (which is the first 4kiB).  If this begins with "hibr",
+ * Windows is definitely suspended.  If it is completely full of zeroes,
+ * Windows is definitely not hibernated.  Any other case is treated as if
+ * Windows is suspended.  This caters for the above mentioned caveat of a
+ * system with many volumes where no "hibr" magic would be present and there is
+ * no zero header.
+ *
+ * Return 0 if Windows is not hibernated on the volume, >0 if Windows is
+ * hibernated on the volume, and -errno on error.
+ */
+static int check_windows_hibernation_status(ntfs_volume *vol)
+{
+	MFT_REF mref;
+	struct inode *vi;
+	struct page *page;
+	u32 *kaddr, *kend;
+	ntfs_name *name = NULL;
+	int ret = 1;
+	static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
+			cpu_to_le16('i'), cpu_to_le16('b'),
+			cpu_to_le16('e'), cpu_to_le16('r'),
+			cpu_to_le16('f'), cpu_to_le16('i'),
+			cpu_to_le16('l'), cpu_to_le16('.'),
+			cpu_to_le16('s'), cpu_to_le16('y'),
+			cpu_to_le16('s'), 0 };
+
+	ntfs_debug("Entering.");
+	/*
+	 * Find the inode number for the hibernation file by looking up the
+	 * filename hiberfil.sys in the root directory.
+	 */
+	mutex_lock(&vol->root_ino->i_mutex);
+	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
+			&name);
+	mutex_unlock(&vol->root_ino->i_mutex);
+	if (IS_ERR_MREF(mref)) {
+		ret = MREF_ERR(mref);
+		/* If the file does not exist, Windows is not hibernated. */
+		if (ret == -ENOENT) {
+			ntfs_debug("hiberfil.sys not present.  Windows is not "
+					"hibernated on the volume.");
+			return 0;
+		}
+		/* A real error occurred. */
+		ntfs_error(vol->sb, "Failed to find inode number for "
+				"hiberfil.sys.");
+		return ret;
+	}
+	/* We do not care for the type of match that was found. */
+	kfree(name);
+	/* Get the inode. */
+	vi = ntfs_iget(vol->sb, MREF(mref));
+	if (IS_ERR(vi) || is_bad_inode(vi)) {
+		if (!IS_ERR(vi))
+			iput(vi);
+		ntfs_error(vol->sb, "Failed to load hiberfil.sys.");
+		return IS_ERR(vi) ? PTR_ERR(vi) : -EIO;
+	}
+	if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) {
+		ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx).  "
+				"Windows is hibernated on the volume.  This "
+				"is not the system volume.", i_size_read(vi));
+		goto iput_out;
+	}
+	page = ntfs_map_page(vi->i_mapping, 0);
+	if (IS_ERR(page)) {
+		ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
+		ret = PTR_ERR(page);
+		goto iput_out;
+	}
+	kaddr = (u32*)page_address(page);
+	if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
+		ntfs_debug("Magic \"hibr\" found in hiberfil.sys.  Windows is "
+				"hibernated on the volume.  This is the "
+				"system volume.");
+		goto unm_iput_out;
+	}
+	kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr);
+	do {
+		if (unlikely(*kaddr)) {
+			ntfs_debug("hiberfil.sys is larger than 4kiB "
+					"(0x%llx), does not contain the "
+					"\"hibr\" magic, and does not have a "
+					"zero header.  Windows is hibernated "
+					"on the volume.  This is not the "
+					"system volume.", i_size_read(vi));
+			goto unm_iput_out;
+		}
+	} while (++kaddr < kend);
+	ntfs_debug("hiberfil.sys contains a zero header.  Windows is not "
+			"hibernated on the volume.  This is the system "
+			"volume.");
+	ret = 0;
+unm_iput_out:
+	ntfs_unmap_page(page);
+iput_out:
+	iput(vi);
+	return ret;
+}
+
+/**
+ * load_and_init_quota - load and setup the quota file for a volume if present
+ * @vol:	ntfs super block describing device whose quota file to load
+ *
+ * Return 'true' on success or 'false' on error.  If $Quota is not present, we
+ * leave vol->quota_ino as NULL and return success.
+ */
+static bool load_and_init_quota(ntfs_volume *vol)
+{
+	MFT_REF mref;
+	struct inode *tmp_ino;
+	ntfs_name *name = NULL;
+	static const ntfschar Quota[7] = { cpu_to_le16('$'),
+			cpu_to_le16('Q'), cpu_to_le16('u'),
+			cpu_to_le16('o'), cpu_to_le16('t'),
+			cpu_to_le16('a'), 0 };
+	static ntfschar Q[3] = { cpu_to_le16('$'),
+			cpu_to_le16('Q'), 0 };
+
+	ntfs_debug("Entering.");
+	/*
+	 * Find the inode number for the quota file by looking up the filename
+	 * $Quota in the extended system files directory $Extend.
+	 */
+	mutex_lock(&vol->extend_ino->i_mutex);
+	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
+			&name);
+	mutex_unlock(&vol->extend_ino->i_mutex);
+	if (IS_ERR_MREF(mref)) {
+		/*
+		 * If the file does not exist, quotas are disabled and have
+		 * never been enabled on this volume, just return success.
+		 */
+		if (MREF_ERR(mref) == -ENOENT) {
+			ntfs_debug("$Quota not present.  Volume does not have "
+					"quotas enabled.");
+			/*
+			 * No need to try to set quotas out of date if they are
+			 * not enabled.
+			 */
+			NVolSetQuotaOutOfDate(vol);
+			return true;
+		}
+		/* A real error occurred. */
+		ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
+		return false;
+	}
+	/* We do not care for the type of match that was found. */
+	kfree(name);
+	/* Get the inode. */
+	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
+	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
+		if (!IS_ERR(tmp_ino))
+			iput(tmp_ino);
+		ntfs_error(vol->sb, "Failed to load $Quota.");
+		return false;
+	}
+	vol->quota_ino = tmp_ino;
+	/* Get the $Q index allocation attribute. */
+	tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
+	if (IS_ERR(tmp_ino)) {
+		ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
+		return false;
+	}
+	vol->quota_q_ino = tmp_ino;
+	ntfs_debug("Done.");
+	return true;
+}
+
+/**
+ * load_and_init_usnjrnl - load and setup the transaction log if present
+ * @vol:	ntfs super block describing device whose usnjrnl file to load
+ *
+ * Return 'true' on success or 'false' on error.
+ *
+ * If $UsnJrnl is not present or in the process of being disabled, we set
+ * NVolUsnJrnlStamped() and return success.
+ *
+ * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn,
+ * i.e. transaction logging has only just been enabled or the journal has been
+ * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped()
+ * and return success.
+ */
+static bool load_and_init_usnjrnl(ntfs_volume *vol)
+{
+	MFT_REF mref;
+	struct inode *tmp_ino;
+	ntfs_inode *tmp_ni;
+	struct page *page;
+	ntfs_name *name = NULL;
+	USN_HEADER *uh;
+	static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
+			cpu_to_le16('U'), cpu_to_le16('s'),
+			cpu_to_le16('n'), cpu_to_le16('J'),
+			cpu_to_le16('r'), cpu_to_le16('n'),
+			cpu_to_le16('l'), 0 };
+	static ntfschar Max[5] = { cpu_to_le16('$'),
+			cpu_to_le16('M'), cpu_to_le16('a'),
+			cpu_to_le16('x'), 0 };
+	static ntfschar J[3] = { cpu_to_le16('$'),
+			cpu_to_le16('J'), 0 };
+
+	ntfs_debug("Entering.");
+	/*
+	 * Find the inode number for the transaction log file by looking up the
+	 * filename $UsnJrnl in the extended system files directory $Extend.
+	 */
+	mutex_lock(&vol->extend_ino->i_mutex);
+	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
+			&name);
+	mutex_unlock(&vol->extend_ino->i_mutex);
+	if (IS_ERR_MREF(mref)) {
+		/*
+		 * If the file does not exist, transaction logging is disabled,
+		 * just return success.
+		 */
+		if (MREF_ERR(mref) == -ENOENT) {
+			ntfs_debug("$UsnJrnl not present.  Volume does not "
+					"have transaction logging enabled.");
+not_enabled:
+			/*
+			 * No need to try to stamp the transaction log if
+			 * transaction logging is not enabled.
+			 */
+			NVolSetUsnJrnlStamped(vol);
+			return true;
+		}
+		/* A real error occurred. */
+		ntfs_error(vol->sb, "Failed to find inode number for "
+				"$UsnJrnl.");
+		return false;
+	}
+	/* We do not care for the type of match that was found. */
+	kfree(name);
+	/* Get the inode. */
+	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
+	if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) {
+		if (!IS_ERR(tmp_ino))
+			iput(tmp_ino);
+		ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
+		return false;
+	}
+	vol->usnjrnl_ino = tmp_ino;
+	/*
+	 * If the transaction log is in the process of being deleted, we can
+	 * ignore it.
+	 */
+	if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) {
+		ntfs_debug("$UsnJrnl in the process of being disabled.  "
+				"Volume does not have transaction logging "
+				"enabled.");
+		goto not_enabled;
+	}
+	/* Get the $DATA/$Max attribute. */
+	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4);
+	if (IS_ERR(tmp_ino)) {
+		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max "
+				"attribute.");
+		return false;
+	}
+	vol->usnjrnl_max_ino = tmp_ino;
+	if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
+		ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
+				"attribute (size is 0x%llx but should be at "
+				"least 0x%zx bytes).", i_size_read(tmp_ino),
+				sizeof(USN_HEADER));
+		return false;
+	}
+	/* Get the $DATA/$J attribute. */
+	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2);
+	if (IS_ERR(tmp_ino)) {
+		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J "
+				"attribute.");
+		return false;
+	}
+	vol->usnjrnl_j_ino = tmp_ino;
+	/* Verify $J is non-resident and sparse. */
+	tmp_ni = NTFS_I(vol->usnjrnl_j_ino);
+	if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) {
+		ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident "
+				"and/or not sparse.");
+		return false;
+	}
+	/* Read the USN_HEADER from $DATA/$Max. */
+	page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
+	if (IS_ERR(page)) {
+		ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max "
+				"attribute.");
+		return false;
+	}
+	uh = (USN_HEADER*)page_address(page);
+	/* Sanity check the $Max. */
+	if (unlikely(sle64_to_cpu(uh->allocation_delta) >
+			sle64_to_cpu(uh->maximum_size))) {
+		ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds "
+				"maximum size (0x%llx).  $UsnJrnl is corrupt.",
+				(long long)sle64_to_cpu(uh->allocation_delta),
+				(long long)sle64_to_cpu(uh->maximum_size));
+		ntfs_unmap_page(page);
+		return false;
+	}
+	/*
+	 * If the transaction log has been stamped and nothing has been written
+	 * to it since, we do not need to stamp it.
+	 */
+	if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >=
+			i_size_read(vol->usnjrnl_j_ino))) {
+		if (likely(sle64_to_cpu(uh->lowest_valid_usn) ==
+				i_size_read(vol->usnjrnl_j_ino))) {
+			ntfs_unmap_page(page);
+			ntfs_debug("$UsnJrnl is enabled but nothing has been "
+					"logged since it was last stamped.  "
+					"Treating this as if the volume does "
+					"not have transaction logging "
+					"enabled.");
+			goto not_enabled;
+		}
+		ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) "
+				"which is out of bounds (0x%llx).  $UsnJrnl "
+				"is corrupt.",
+				(long long)sle64_to_cpu(uh->lowest_valid_usn),
+				i_size_read(vol->usnjrnl_j_ino));
+		ntfs_unmap_page(page);
+		return false;
+	}
+	ntfs_unmap_page(page);
+	ntfs_debug("Done.");
+	return true;
+}
+
+/**
+ * load_and_init_attrdef - load the attribute definitions table for a volume
+ * @vol:	ntfs super block describing device whose attrdef to load
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_and_init_attrdef(ntfs_volume *vol)
+{
+	loff_t i_size;
+	struct super_block *sb = vol->sb;
+	struct inode *ino;
+	struct page *page;
+	pgoff_t index, max_index;
+	unsigned int size;
+
+	ntfs_debug("Entering.");
+	/* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */
+	ino = ntfs_iget(sb, FILE_AttrDef);
+	if (IS_ERR(ino) || is_bad_inode(ino)) {
+		if (!IS_ERR(ino))
+			iput(ino);
+		goto failed;
+	}
+	NInoSetSparseDisabled(NTFS_I(ino));
+	/* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
+	i_size = i_size_read(ino);
+	if (i_size <= 0 || i_size > 0x7fffffff)
+		goto iput_failed;
+	vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size);
+	if (!vol->attrdef)
+		goto iput_failed;
+	index = 0;
+	max_index = i_size >> PAGE_CACHE_SHIFT;
+	size = PAGE_CACHE_SIZE;
+	while (index < max_index) {
+		/* Read the attrdef table and copy it into the linear buffer. */
+read_partial_attrdef_page:
+		page = ntfs_map_page(ino->i_mapping, index);
+		if (IS_ERR(page))
+			goto free_iput_failed;
+		memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT),
+				page_address(page), size);
+		ntfs_unmap_page(page);
+	};
+	if (size == PAGE_CACHE_SIZE) {
+		size = i_size & ~PAGE_CACHE_MASK;
+		if (size)
+			goto read_partial_attrdef_page;
+	}
+	vol->attrdef_size = i_size;
+	ntfs_debug("Read %llu bytes from $AttrDef.", i_size);
+	iput(ino);
+	return true;
+free_iput_failed:
+	ntfs_free(vol->attrdef);
+	vol->attrdef = NULL;
+iput_failed:
+	iput(ino);
+failed:
+	ntfs_error(sb, "Failed to initialize attribute definition table.");
+	return false;
+}
+
+#endif /* NTFS_RW */
+
+/**
+ * load_and_init_upcase - load the upcase table for an ntfs volume
+ * @vol:	ntfs super block describing device whose upcase to load
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_and_init_upcase(ntfs_volume *vol)
+{
+	loff_t i_size;
+	struct super_block *sb = vol->sb;
+	struct inode *ino;
+	struct page *page;
+	pgoff_t index, max_index;
+	unsigned int size;
+	int i, max;
+
+	ntfs_debug("Entering.");
+	/* Read upcase table and setup vol->upcase and vol->upcase_len. */
+	ino = ntfs_iget(sb, FILE_UpCase);
+	if (IS_ERR(ino) || is_bad_inode(ino)) {
+		if (!IS_ERR(ino))
+			iput(ino);
+		goto upcase_failed;
+	}
+	/*
+	 * The upcase size must not be above 64k Unicode characters, must not
+	 * be zero and must be a multiple of sizeof(ntfschar).
+	 */
+	i_size = i_size_read(ino);
+	if (!i_size || i_size & (sizeof(ntfschar) - 1) ||
+			i_size > 64ULL * 1024 * sizeof(ntfschar))
+		goto iput_upcase_failed;
+	vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size);
+	if (!vol->upcase)
+		goto iput_upcase_failed;
+	index = 0;
+	max_index = i_size >> PAGE_CACHE_SHIFT;
+	size = PAGE_CACHE_SIZE;
+	while (index < max_index) {
+		/* Read the upcase table and copy it into the linear buffer. */
+read_partial_upcase_page:
+		page = ntfs_map_page(ino->i_mapping, index);
+		if (IS_ERR(page))
+			goto iput_upcase_failed;
+		memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT),
+				page_address(page), size);
+		ntfs_unmap_page(page);
+	};
+	if (size == PAGE_CACHE_SIZE) {
+		size = i_size & ~PAGE_CACHE_MASK;
+		if (size)
+			goto read_partial_upcase_page;
+	}
+	vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS;
+	ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
+			i_size, 64 * 1024 * sizeof(ntfschar));
+	iput(ino);
+	mutex_lock(&ntfs_lock);
+	if (!default_upcase) {
+		ntfs_debug("Using volume specified $UpCase since default is "
+				"not present.");
+		mutex_unlock(&ntfs_lock);
+		return true;
+	}
+	max = default_upcase_len;
+	if (max > vol->upcase_len)
+		max = vol->upcase_len;
+	for (i = 0; i < max; i++)
+		if (vol->upcase[i] != default_upcase[i])
+			break;
+	if (i == max) {
+		ntfs_free(vol->upcase);
+		vol->upcase = default_upcase;
+		vol->upcase_len = max;
+		ntfs_nr_upcase_users++;
+		mutex_unlock(&ntfs_lock);
+		ntfs_debug("Volume specified $UpCase matches default. Using "
+				"default.");
+		return true;
+	}
+	mutex_unlock(&ntfs_lock);
+	ntfs_debug("Using volume specified $UpCase since it does not match "
+			"the default.");
+	return true;
+iput_upcase_failed:
+	iput(ino);
+	ntfs_free(vol->upcase);
+	vol->upcase = NULL;
+upcase_failed:
+	mutex_lock(&ntfs_lock);
+	if (default_upcase) {
+		vol->upcase = default_upcase;
+		vol->upcase_len = default_upcase_len;
+		ntfs_nr_upcase_users++;
+		mutex_unlock(&ntfs_lock);
+		ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
+				"default.");
+		return true;
+	}
+	mutex_unlock(&ntfs_lock);
+	ntfs_error(sb, "Failed to initialize upcase table.");
+	return false;
+}
+
+/*
+ * The lcn and mft bitmap inodes are NTFS-internal inodes with
+ * their own special locking rules:
+ */
+static struct lock_class_key
+	lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key,
+	mftbmp_runlist_lock_key, mftbmp_mrec_lock_key;
+
+/**
+ * load_system_files - open the system files using normal functions
+ * @vol:	ntfs super block describing device whose system files to load
+ *
+ * Open the system files with normal access functions and complete setting up
+ * the ntfs super block @vol.
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_system_files(ntfs_volume *vol)
+{
+	struct super_block *sb = vol->sb;
+	MFT_RECORD *m;
+	VOLUME_INFORMATION *vi;
+	ntfs_attr_search_ctx *ctx;
+#ifdef NTFS_RW
+	RESTART_PAGE_HEADER *rp;
+	int err;
+#endif /* NTFS_RW */
+
+	ntfs_debug("Entering.");
+#ifdef NTFS_RW
+	/* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */
+	if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) {
+		static const char *es1 = "Failed to load $MFTMirr";
+		static const char *es2 = "$MFTMirr does not match $MFT";
+		static const char *es3 = ".  Run ntfsfix and/or chkdsk.";
+
+		/* If a read-write mount, convert it to a read-only mount. */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+					ON_ERRORS_CONTINUE))) {
+				ntfs_error(sb, "%s and neither on_errors="
+						"continue nor on_errors="
+						"remount-ro was specified%s",
+						!vol->mftmirr_ino ? es1 : es2,
+						es3);
+				goto iput_mirr_err_out;
+			}
+			sb->s_flags |= MS_RDONLY;
+			ntfs_error(sb, "%s.  Mounting read-only%s",
+					!vol->mftmirr_ino ? es1 : es2, es3);
+		} else
+			ntfs_warning(sb, "%s.  Will not be able to remount "
+					"read-write%s",
+					!vol->mftmirr_ino ? es1 : es2, es3);
+		/* This will prevent a read-write remount. */
+		NVolSetErrors(vol);
+	}
+#endif /* NTFS_RW */
+	/* Get mft bitmap attribute inode. */
+	vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0);
+	if (IS_ERR(vol->mftbmp_ino)) {
+		ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
+		goto iput_mirr_err_out;
+	}
+	lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock,
+			   &mftbmp_runlist_lock_key);
+	lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock,
+			   &mftbmp_mrec_lock_key);
+	/* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
+	if (!load_and_init_upcase(vol))
+		goto iput_mftbmp_err_out;
+#ifdef NTFS_RW
+	/*
+	 * Read attribute definitions table and setup @vol->attrdef and
+	 * @vol->attrdef_size.
+	 */
+	if (!load_and_init_attrdef(vol))
+		goto iput_upcase_err_out;
+#endif /* NTFS_RW */
+	/*
+	 * Get the cluster allocation bitmap inode and verify the size, no
+	 * need for any locking at this stage as we are already running
+	 * exclusively as we are mount in progress task.
+	 */
+	vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap);
+	if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) {
+		if (!IS_ERR(vol->lcnbmp_ino))
+			iput(vol->lcnbmp_ino);
+		goto bitmap_failed;
+	}
+	lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock,
+			   &lcnbmp_runlist_lock_key);
+	lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock,
+			   &lcnbmp_mrec_lock_key);
+
+	NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino));
+	if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) {
+		iput(vol->lcnbmp_ino);
+bitmap_failed:
+		ntfs_error(sb, "Failed to load $Bitmap.");
+		goto iput_attrdef_err_out;
+	}
+	/*
+	 * Get the volume inode and setup our cache of the volume flags and
+	 * version.
+	 */
+	vol->vol_ino = ntfs_iget(sb, FILE_Volume);
+	if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) {
+		if (!IS_ERR(vol->vol_ino))
+			iput(vol->vol_ino);
+volume_failed:
+		ntfs_error(sb, "Failed to load $Volume.");
+		goto iput_lcnbmp_err_out;
+	}
+	m = map_mft_record(NTFS_I(vol->vol_ino));
+	if (IS_ERR(m)) {
+iput_volume_failed:
+		iput(vol->vol_ino);
+		goto volume_failed;
+	}
+	if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) {
+		ntfs_error(sb, "Failed to get attribute search context.");
+		goto get_ctx_vol_failed;
+	}
+	if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+			ctx) || ctx->attr->non_resident || ctx->attr->flags) {
+err_put_vol:
+		ntfs_attr_put_search_ctx(ctx);
+get_ctx_vol_failed:
+		unmap_mft_record(NTFS_I(vol->vol_ino));
+		goto iput_volume_failed;
+	}
+	vi = (VOLUME_INFORMATION*)((char*)ctx->attr +
+			le16_to_cpu(ctx->attr->data.resident.value_offset));
+	/* Some bounds checks. */
+	if ((u8*)vi < (u8*)ctx->attr || (u8*)vi +
+			le32_to_cpu(ctx->attr->data.resident.value_length) >
+			(u8*)ctx->attr + le32_to_cpu(ctx->attr->length))
+		goto err_put_vol;
+	/* Copy the volume flags and version to the ntfs_volume structure. */
+	vol->vol_flags = vi->flags;
+	vol->major_ver = vi->major_ver;
+	vol->minor_ver = vi->minor_ver;
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(NTFS_I(vol->vol_ino));
+	pr_info("volume version %i.%i.\n", vol->major_ver,
+			vol->minor_ver);
+	if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
+		ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
+				"volume version %i.%i (need at least version "
+				"3.0).", vol->major_ver, vol->minor_ver);
+		NVolClearSparseEnabled(vol);
+	}
+#ifdef NTFS_RW
+	/* Make sure that no unsupported volume flags are set. */
+	if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
+		static const char *es1a = "Volume is dirty";
+		static const char *es1b = "Volume has been modified by chkdsk";
+		static const char *es1c = "Volume has unsupported flags set";
+		static const char *es2a = ".  Run chkdsk and mount in Windows.";
+		static const char *es2b = ".  Mount in Windows.";
+		const char *es1, *es2;
+
+		es2 = es2a;
+		if (vol->vol_flags & VOLUME_IS_DIRTY)
+			es1 = es1a;
+		else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
+			es1 = es1b;
+			es2 = es2b;
+		} else {
+			es1 = es1c;
+			ntfs_warning(sb, "Unsupported volume flags 0x%x "
+					"encountered.",
+					(unsigned)le16_to_cpu(vol->vol_flags));
+		}
+		/* If a read-write mount, convert it to a read-only mount. */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+					ON_ERRORS_CONTINUE))) {
+				ntfs_error(sb, "%s and neither on_errors="
+						"continue nor on_errors="
+						"remount-ro was specified%s",
+						es1, es2);
+				goto iput_vol_err_out;
+			}
+			sb->s_flags |= MS_RDONLY;
+			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		} else
+			ntfs_warning(sb, "%s.  Will not be able to remount "
+					"read-write%s", es1, es2);
+		/*
+		 * Do not set NVolErrors() because ntfs_remount() re-checks the
+		 * flags which we need to do in case any flags have changed.
+		 */
+	}
+	/*
+	 * Get the inode for the logfile, check it and determine if the volume
+	 * was shutdown cleanly.
+	 */
+	rp = NULL;
+	if (!load_and_check_logfile(vol, &rp) ||
+			!ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
+		static const char *es1a = "Failed to load $LogFile";
+		static const char *es1b = "$LogFile is not clean";
+		static const char *es2 = ".  Mount in Windows.";
+		const char *es1;
+
+		es1 = !vol->logfile_ino ? es1a : es1b;
+		/* If a read-write mount, convert it to a read-only mount. */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+					ON_ERRORS_CONTINUE))) {
+				ntfs_error(sb, "%s and neither on_errors="
+						"continue nor on_errors="
+						"remount-ro was specified%s",
+						es1, es2);
+				if (vol->logfile_ino) {
+					BUG_ON(!rp);
+					ntfs_free(rp);
+				}
+				goto iput_logfile_err_out;
+			}
+			sb->s_flags |= MS_RDONLY;
+			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		} else
+			ntfs_warning(sb, "%s.  Will not be able to remount "
+					"read-write%s", es1, es2);
+		/* This will prevent a read-write remount. */
+		NVolSetErrors(vol);
+	}
+	ntfs_free(rp);
+#endif /* NTFS_RW */
+	/* Get the root directory inode so we can do path lookups. */
+	vol->root_ino = ntfs_iget(sb, FILE_root);
+	if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
+		if (!IS_ERR(vol->root_ino))
+			iput(vol->root_ino);
+		ntfs_error(sb, "Failed to load root directory.");
+		goto iput_logfile_err_out;
+	}
+#ifdef NTFS_RW
+	/*
+	 * Check if Windows is suspended to disk on the target volume.  If it
+	 * is hibernated, we must not write *anything* to the disk so set
+	 * NVolErrors() without setting the dirty volume flag and mount
+	 * read-only.  This will prevent read-write remounting and it will also
+	 * prevent all writes.
+	 */
+	err = check_windows_hibernation_status(vol);
+	if (unlikely(err)) {
+		static const char *es1a = "Failed to determine if Windows is "
+				"hibernated";
+		static const char *es1b = "Windows is hibernated";
+		static const char *es2 = ".  Run chkdsk.";
+		const char *es1;
+
+		es1 = err < 0 ? es1a : es1b;
+		/* If a read-write mount, convert it to a read-only mount. */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+					ON_ERRORS_CONTINUE))) {
+				ntfs_error(sb, "%s and neither on_errors="
+						"continue nor on_errors="
+						"remount-ro was specified%s",
+						es1, es2);
+				goto iput_root_err_out;
+			}
+			sb->s_flags |= MS_RDONLY;
+			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		} else
+			ntfs_warning(sb, "%s.  Will not be able to remount "
+					"read-write%s", es1, es2);
+		/* This will prevent a read-write remount. */
+		NVolSetErrors(vol);
+	}
+	/* If (still) a read-write mount, mark the volume dirty. */
+	if (!(sb->s_flags & MS_RDONLY) &&
+			ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
+		static const char *es1 = "Failed to set dirty bit in volume "
+				"information flags";
+		static const char *es2 = ".  Run chkdsk.";
+
+		/* Convert to a read-only mount. */
+		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+				ON_ERRORS_CONTINUE))) {
+			ntfs_error(sb, "%s and neither on_errors=continue nor "
+					"on_errors=remount-ro was specified%s",
+					es1, es2);
+			goto iput_root_err_out;
+		}
+		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		sb->s_flags |= MS_RDONLY;
+		/*
+		 * Do not set NVolErrors() because ntfs_remount() might manage
+		 * to set the dirty flag in which case all would be well.
+		 */
+	}
+#if 0
+	// TODO: Enable this code once we start modifying anything that is
+	//	 different between NTFS 1.2 and 3.x...
+	/*
+	 * If (still) a read-write mount, set the NT4 compatibility flag on
+	 * newer NTFS version volumes.
+	 */
+	if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) &&
+			ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
+		static const char *es1 = "Failed to set NT4 compatibility flag";
+		static const char *es2 = ".  Run chkdsk.";
+
+		/* Convert to a read-only mount. */
+		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+				ON_ERRORS_CONTINUE))) {
+			ntfs_error(sb, "%s and neither on_errors=continue nor "
+					"on_errors=remount-ro was specified%s",
+					es1, es2);
+			goto iput_root_err_out;
+		}
+		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		sb->s_flags |= MS_RDONLY;
+		NVolSetErrors(vol);
+	}
+#endif
+	/* If (still) a read-write mount, empty the logfile. */
+	if (!(sb->s_flags & MS_RDONLY) &&
+			!ntfs_empty_logfile(vol->logfile_ino)) {
+		static const char *es1 = "Failed to empty $LogFile";
+		static const char *es2 = ".  Mount in Windows.";
+
+		/* Convert to a read-only mount. */
+		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+				ON_ERRORS_CONTINUE))) {
+			ntfs_error(sb, "%s and neither on_errors=continue nor "
+					"on_errors=remount-ro was specified%s",
+					es1, es2);
+			goto iput_root_err_out;
+		}
+		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		sb->s_flags |= MS_RDONLY;
+		NVolSetErrors(vol);
+	}
+#endif /* NTFS_RW */
+	/* If on NTFS versions before 3.0, we are done. */
+	if (unlikely(vol->major_ver < 3))
+		return true;
+	/* NTFS 3.0+ specific initialization. */
+	/* Get the security descriptors inode. */
+	vol->secure_ino = ntfs_iget(sb, FILE_Secure);
+	if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) {
+		if (!IS_ERR(vol->secure_ino))
+			iput(vol->secure_ino);
+		ntfs_error(sb, "Failed to load $Secure.");
+		goto iput_root_err_out;
+	}
+	// TODO: Initialize security.
+	/* Get the extended system files' directory inode. */
+	vol->extend_ino = ntfs_iget(sb, FILE_Extend);
+	if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+		if (!IS_ERR(vol->extend_ino))
+			iput(vol->extend_ino);
+		ntfs_error(sb, "Failed to load $Extend.");
+		goto iput_sec_err_out;
+	}
+#ifdef NTFS_RW
+	/* Find the quota file, load it if present, and set it up. */
+	if (!load_and_init_quota(vol)) {
+		static const char *es1 = "Failed to load $Quota";
+		static const char *es2 = ".  Run chkdsk.";
+
+		/* If a read-write mount, convert it to a read-only mount. */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+					ON_ERRORS_CONTINUE))) {
+				ntfs_error(sb, "%s and neither on_errors="
+						"continue nor on_errors="
+						"remount-ro was specified%s",
+						es1, es2);
+				goto iput_quota_err_out;
+			}
+			sb->s_flags |= MS_RDONLY;
+			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		} else
+			ntfs_warning(sb, "%s.  Will not be able to remount "
+					"read-write%s", es1, es2);
+		/* This will prevent a read-write remount. */
+		NVolSetErrors(vol);
+	}
+	/* If (still) a read-write mount, mark the quotas out of date. */
+	if (!(sb->s_flags & MS_RDONLY) &&
+			!ntfs_mark_quotas_out_of_date(vol)) {
+		static const char *es1 = "Failed to mark quotas out of date";
+		static const char *es2 = ".  Run chkdsk.";
+
+		/* Convert to a read-only mount. */
+		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+				ON_ERRORS_CONTINUE))) {
+			ntfs_error(sb, "%s and neither on_errors=continue nor "
+					"on_errors=remount-ro was specified%s",
+					es1, es2);
+			goto iput_quota_err_out;
+		}
+		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		sb->s_flags |= MS_RDONLY;
+		NVolSetErrors(vol);
+	}
+	/*
+	 * Find the transaction log file ($UsnJrnl), load it if present, check
+	 * it, and set it up.
+	 */
+	if (!load_and_init_usnjrnl(vol)) {
+		static const char *es1 = "Failed to load $UsnJrnl";
+		static const char *es2 = ".  Run chkdsk.";
+
+		/* If a read-write mount, convert it to a read-only mount. */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+					ON_ERRORS_CONTINUE))) {
+				ntfs_error(sb, "%s and neither on_errors="
+						"continue nor on_errors="
+						"remount-ro was specified%s",
+						es1, es2);
+				goto iput_usnjrnl_err_out;
+			}
+			sb->s_flags |= MS_RDONLY;
+			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		} else
+			ntfs_warning(sb, "%s.  Will not be able to remount "
+					"read-write%s", es1, es2);
+		/* This will prevent a read-write remount. */
+		NVolSetErrors(vol);
+	}
+	/* If (still) a read-write mount, stamp the transaction log. */
+	if (!(sb->s_flags & MS_RDONLY) && !ntfs_stamp_usnjrnl(vol)) {
+		static const char *es1 = "Failed to stamp transaction log "
+				"($UsnJrnl)";
+		static const char *es2 = ".  Run chkdsk.";
+
+		/* Convert to a read-only mount. */
+		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+				ON_ERRORS_CONTINUE))) {
+			ntfs_error(sb, "%s and neither on_errors=continue nor "
+					"on_errors=remount-ro was specified%s",
+					es1, es2);
+			goto iput_usnjrnl_err_out;
+		}
+		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		sb->s_flags |= MS_RDONLY;
+		NVolSetErrors(vol);
+	}
+#endif /* NTFS_RW */
+	return true;
+#ifdef NTFS_RW
+iput_usnjrnl_err_out:
+	if (vol->usnjrnl_j_ino)
+		iput(vol->usnjrnl_j_ino);
+	if (vol->usnjrnl_max_ino)
+		iput(vol->usnjrnl_max_ino);
+	if (vol->usnjrnl_ino)
+		iput(vol->usnjrnl_ino);
+iput_quota_err_out:
+	if (vol->quota_q_ino)
+		iput(vol->quota_q_ino);
+	if (vol->quota_ino)
+		iput(vol->quota_ino);
+	iput(vol->extend_ino);
+#endif /* NTFS_RW */
+iput_sec_err_out:
+	iput(vol->secure_ino);
+iput_root_err_out:
+	iput(vol->root_ino);
+iput_logfile_err_out:
+#ifdef NTFS_RW
+	if (vol->logfile_ino)
+		iput(vol->logfile_ino);
+iput_vol_err_out:
+#endif /* NTFS_RW */
+	iput(vol->vol_ino);
+iput_lcnbmp_err_out:
+	iput(vol->lcnbmp_ino);
+iput_attrdef_err_out:
+	vol->attrdef_size = 0;
+	if (vol->attrdef) {
+		ntfs_free(vol->attrdef);
+		vol->attrdef = NULL;
+	}
+#ifdef NTFS_RW
+iput_upcase_err_out:
+#endif /* NTFS_RW */
+	vol->upcase_len = 0;
+	mutex_lock(&ntfs_lock);
+	if (vol->upcase == default_upcase) {
+		ntfs_nr_upcase_users--;
+		vol->upcase = NULL;
+	}
+	mutex_unlock(&ntfs_lock);
+	if (vol->upcase) {
+		ntfs_free(vol->upcase);
+		vol->upcase = NULL;
+	}
+iput_mftbmp_err_out:
+	iput(vol->mftbmp_ino);
+iput_mirr_err_out:
+#ifdef NTFS_RW
+	if (vol->mftmirr_ino)
+		iput(vol->mftmirr_ino);
+#endif /* NTFS_RW */
+	return false;
+}
+
+/**
+ * ntfs_put_super - called by the vfs to unmount a volume
+ * @sb:		vfs superblock of volume to unmount
+ *
+ * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when
+ * the volume is being unmounted (umount system call has been invoked) and it
+ * releases all inodes and memory belonging to the NTFS specific part of the
+ * super block.
+ */
+static void ntfs_put_super(struct super_block *sb)
+{
+	ntfs_volume *vol = NTFS_SB(sb);
+
+	ntfs_debug("Entering.");
+
+#ifdef NTFS_RW
+	/*
+	 * Commit all inodes while they are still open in case some of them
+	 * cause others to be dirtied.
+	 */
+	ntfs_commit_inode(vol->vol_ino);
+
+	/* NTFS 3.0+ specific. */
+	if (vol->major_ver >= 3) {
+		if (vol->usnjrnl_j_ino)
+			ntfs_commit_inode(vol->usnjrnl_j_ino);
+		if (vol->usnjrnl_max_ino)
+			ntfs_commit_inode(vol->usnjrnl_max_ino);
+		if (vol->usnjrnl_ino)
+			ntfs_commit_inode(vol->usnjrnl_ino);
+		if (vol->quota_q_ino)
+			ntfs_commit_inode(vol->quota_q_ino);
+		if (vol->quota_ino)
+			ntfs_commit_inode(vol->quota_ino);
+		if (vol->extend_ino)
+			ntfs_commit_inode(vol->extend_ino);
+		if (vol->secure_ino)
+			ntfs_commit_inode(vol->secure_ino);
+	}
+
+	ntfs_commit_inode(vol->root_ino);
+
+	down_write(&vol->lcnbmp_lock);
+	ntfs_commit_inode(vol->lcnbmp_ino);
+	up_write(&vol->lcnbmp_lock);
+
+	down_write(&vol->mftbmp_lock);
+	ntfs_commit_inode(vol->mftbmp_ino);
+	up_write(&vol->mftbmp_lock);
+
+	if (vol->logfile_ino)
+		ntfs_commit_inode(vol->logfile_ino);
+
+	if (vol->mftmirr_ino)
+		ntfs_commit_inode(vol->mftmirr_ino);
+	ntfs_commit_inode(vol->mft_ino);
+
+	/*
+	 * If a read-write mount and no volume errors have occurred, mark the
+	 * volume clean.  Also, re-commit all affected inodes.
+	 */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		if (!NVolErrors(vol)) {
+			if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
+				ntfs_warning(sb, "Failed to clear dirty bit "
+						"in volume information "
+						"flags.  Run chkdsk.");
+			ntfs_commit_inode(vol->vol_ino);
+			ntfs_commit_inode(vol->root_ino);
+			if (vol->mftmirr_ino)
+				ntfs_commit_inode(vol->mftmirr_ino);
+			ntfs_commit_inode(vol->mft_ino);
+		} else {
+			ntfs_warning(sb, "Volume has errors.  Leaving volume "
+					"marked dirty.  Run chkdsk.");
+		}
+	}
+#endif /* NTFS_RW */
+
+	iput(vol->vol_ino);
+	vol->vol_ino = NULL;
+
+	/* NTFS 3.0+ specific clean up. */
+	if (vol->major_ver >= 3) {
+#ifdef NTFS_RW
+		if (vol->usnjrnl_j_ino) {
+			iput(vol->usnjrnl_j_ino);
+			vol->usnjrnl_j_ino = NULL;
+		}
+		if (vol->usnjrnl_max_ino) {
+			iput(vol->usnjrnl_max_ino);
+			vol->usnjrnl_max_ino = NULL;
+		}
+		if (vol->usnjrnl_ino) {
+			iput(vol->usnjrnl_ino);
+			vol->usnjrnl_ino = NULL;
+		}
+		if (vol->quota_q_ino) {
+			iput(vol->quota_q_ino);
+			vol->quota_q_ino = NULL;
+		}
+		if (vol->quota_ino) {
+			iput(vol->quota_ino);
+			vol->quota_ino = NULL;
+		}
+#endif /* NTFS_RW */
+		if (vol->extend_ino) {
+			iput(vol->extend_ino);
+			vol->extend_ino = NULL;
+		}
+		if (vol->secure_ino) {
+			iput(vol->secure_ino);
+			vol->secure_ino = NULL;
+		}
+	}
+
+	iput(vol->root_ino);
+	vol->root_ino = NULL;
+
+	down_write(&vol->lcnbmp_lock);
+	iput(vol->lcnbmp_ino);
+	vol->lcnbmp_ino = NULL;
+	up_write(&vol->lcnbmp_lock);
+
+	down_write(&vol->mftbmp_lock);
+	iput(vol->mftbmp_ino);
+	vol->mftbmp_ino = NULL;
+	up_write(&vol->mftbmp_lock);
+
+#ifdef NTFS_RW
+	if (vol->logfile_ino) {
+		iput(vol->logfile_ino);
+		vol->logfile_ino = NULL;
+	}
+	if (vol->mftmirr_ino) {
+		/* Re-commit the mft mirror and mft just in case. */
+		ntfs_commit_inode(vol->mftmirr_ino);
+		ntfs_commit_inode(vol->mft_ino);
+		iput(vol->mftmirr_ino);
+		vol->mftmirr_ino = NULL;
+	}
+	/*
+	 * We should have no dirty inodes left, due to
+	 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+	 * the underlying mft records are written out and cleaned.
+	 */
+	ntfs_commit_inode(vol->mft_ino);
+	write_inode_now(vol->mft_ino, 1);
+#endif /* NTFS_RW */
+
+	iput(vol->mft_ino);
+	vol->mft_ino = NULL;
+
+	/* Throw away the table of attribute definitions. */
+	vol->attrdef_size = 0;
+	if (vol->attrdef) {
+		ntfs_free(vol->attrdef);
+		vol->attrdef = NULL;
+	}
+	vol->upcase_len = 0;
+	/*
+	 * Destroy the global default upcase table if necessary.  Also decrease
+	 * the number of upcase users if we are a user.
+	 */
+	mutex_lock(&ntfs_lock);
+	if (vol->upcase == default_upcase) {
+		ntfs_nr_upcase_users--;
+		vol->upcase = NULL;
+	}
+	if (!ntfs_nr_upcase_users && default_upcase) {
+		ntfs_free(default_upcase);
+		default_upcase = NULL;
+	}
+	if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
+		free_compression_buffers();
+	mutex_unlock(&ntfs_lock);
+	if (vol->upcase) {
+		ntfs_free(vol->upcase);
+		vol->upcase = NULL;
+	}
+
+	unload_nls(vol->nls_map);
+
+	sb->s_fs_info = NULL;
+	kfree(vol);
+}
+
+/**
+ * get_nr_free_clusters - return the number of free clusters on a volume
+ * @vol:	ntfs volume for which to obtain free cluster count
+ *
+ * Calculate the number of free clusters on the mounted NTFS volume @vol. We
+ * actually calculate the number of clusters in use instead because this
+ * allows us to not care about partial pages as these will be just zero filled
+ * and hence not be counted as allocated clusters.
+ *
+ * The only particularity is that clusters beyond the end of the logical ntfs
+ * volume will be marked as allocated to prevent errors which means we have to
+ * discount those at the end. This is important as the cluster bitmap always
+ * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside
+ * the logical volume and marked in use when they are not as they do not exist.
+ *
+ * If any pages cannot be read we assume all clusters in the erroring pages are
+ * in use. This means we return an underestimate on errors which is better than
+ * an overestimate.
+ */
+static s64 get_nr_free_clusters(ntfs_volume *vol)
+{
+	s64 nr_free = vol->nr_clusters;
+	struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
+	struct page *page;
+	pgoff_t index, max_index;
+
+	ntfs_debug("Entering.");
+	/* Serialize accesses to the cluster bitmap. */
+	down_read(&vol->lcnbmp_lock);
+	/*
+	 * Convert the number of bits into bytes rounded up, then convert into
+	 * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one
+	 * full and one partial page max_index = 2.
+	 */
+	max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >>
+			PAGE_CACHE_SHIFT;
+	/* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
+	ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
+			max_index, PAGE_CACHE_SIZE / 4);
+	for (index = 0; index < max_index; index++) {
+		unsigned long *kaddr;
+
+		/*
+		 * Read the page from page cache, getting it from backing store
+		 * if necessary, and increment the use count.
+		 */
+		page = read_mapping_page(mapping, index, NULL);
+		/* Ignore pages which errored synchronously. */
+		if (IS_ERR(page)) {
+			ntfs_debug("read_mapping_page() error. Skipping "
+					"page (index 0x%lx).", index);
+			nr_free -= PAGE_CACHE_SIZE * 8;
+			continue;
+		}
+		kaddr = kmap_atomic(page);
+		/*
+		 * Subtract the number of set bits. If this
+		 * is the last page and it is partial we don't really care as
+		 * it just means we do a little extra work but it won't affect
+		 * the result as all out of range bytes are set to zero by
+		 * ntfs_readpage().
+		 */
+		nr_free -= bitmap_weight(kaddr,
+					PAGE_CACHE_SIZE * BITS_PER_BYTE);
+		kunmap_atomic(kaddr);
+		page_cache_release(page);
+	}
+	ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
+	/*
+	 * Fixup for eventual bits outside logical ntfs volume (see function
+	 * description above).
+	 */
+	if (vol->nr_clusters & 63)
+		nr_free += 64 - (vol->nr_clusters & 63);
+	up_read(&vol->lcnbmp_lock);
+	/* If errors occurred we may well have gone below zero, fix this. */
+	if (nr_free < 0)
+		nr_free = 0;
+	ntfs_debug("Exiting.");
+	return nr_free;
+}
+
+/**
+ * __get_nr_free_mft_records - return the number of free inodes on a volume
+ * @vol:	ntfs volume for which to obtain free inode count
+ * @nr_free:	number of mft records in filesystem
+ * @max_index:	maximum number of pages containing set bits
+ *
+ * Calculate the number of free mft records (inodes) on the mounted NTFS
+ * volume @vol. We actually calculate the number of mft records in use instead
+ * because this allows us to not care about partial pages as these will be just
+ * zero filled and hence not be counted as allocated mft record.
+ *
+ * If any pages cannot be read we assume all mft records in the erroring pages
+ * are in use. This means we return an underestimate on errors which is better
+ * than an overestimate.
+ *
+ * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
+ */
+static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
+		s64 nr_free, const pgoff_t max_index)
+{
+	struct address_space *mapping = vol->mftbmp_ino->i_mapping;
+	struct page *page;
+	pgoff_t index;
+
+	ntfs_debug("Entering.");
+	/* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
+	ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
+			"0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
+	for (index = 0; index < max_index; index++) {
+		unsigned long *kaddr;
+
+		/*
+		 * Read the page from page cache, getting it from backing store
+		 * if necessary, and increment the use count.
+		 */
+		page = read_mapping_page(mapping, index, NULL);
+		/* Ignore pages which errored synchronously. */
+		if (IS_ERR(page)) {
+			ntfs_debug("read_mapping_page() error. Skipping "
+					"page (index 0x%lx).", index);
+			nr_free -= PAGE_CACHE_SIZE * 8;
+			continue;
+		}
+		kaddr = kmap_atomic(page);
+		/*
+		 * Subtract the number of set bits. If this
+		 * is the last page and it is partial we don't really care as
+		 * it just means we do a little extra work but it won't affect
+		 * the result as all out of range bytes are set to zero by
+		 * ntfs_readpage().
+		 */
+		nr_free -= bitmap_weight(kaddr,
+					PAGE_CACHE_SIZE * BITS_PER_BYTE);
+		kunmap_atomic(kaddr);
+		page_cache_release(page);
+	}
+	ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
+			index - 1);
+	/* If errors occurred we may well have gone below zero, fix this. */
+	if (nr_free < 0)
+		nr_free = 0;
+	ntfs_debug("Exiting.");
+	return nr_free;
+}
+
+/**
+ * ntfs_statfs - return information about mounted NTFS volume
+ * @dentry:	dentry from mounted volume
+ * @sfs:	statfs structure in which to return the information
+ *
+ * Return information about the mounted NTFS volume @dentry in the statfs structure
+ * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
+ * called). We interpret the values to be correct of the moment in time at
+ * which we are called. Most values are variable otherwise and this isn't just
+ * the free values but the totals as well. For example we can increase the
+ * total number of file nodes if we run out and we can keep doing this until
+ * there is no more space on the volume left at all.
+ *
+ * Called from vfs_statfs which is used to handle the statfs, fstatfs, and
+ * ustat system calls.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
+{
+	struct super_block *sb = dentry->d_sb;
+	s64 size;
+	ntfs_volume *vol = NTFS_SB(sb);
+	ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
+	pgoff_t max_index;
+	unsigned long flags;
+
+	ntfs_debug("Entering.");
+	/* Type of filesystem. */
+	sfs->f_type   = NTFS_SB_MAGIC;
+	/* Optimal transfer block size. */
+	sfs->f_bsize  = PAGE_CACHE_SIZE;
+	/*
+	 * Total data blocks in filesystem in units of f_bsize and since
+	 * inodes are also stored in data blocs ($MFT is a file) this is just
+	 * the total clusters.
+	 */
+	sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
+				PAGE_CACHE_SHIFT;
+	/* Free data blocks in filesystem in units of f_bsize. */
+	size	      = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
+				PAGE_CACHE_SHIFT;
+	if (size < 0LL)
+		size = 0LL;
+	/* Free blocks avail to non-superuser, same as above on NTFS. */
+	sfs->f_bavail = sfs->f_bfree = size;
+	/* Serialize accesses to the inode bitmap. */
+	down_read(&vol->mftbmp_lock);
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
+	/*
+	 * Convert the maximum number of set bits into bytes rounded up, then
+	 * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we
+	 * have one full and one partial page max_index = 2.
+	 */
+	max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
+			+ 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	/* Number of inodes in filesystem (at this point in time). */
+	sfs->f_files = size;
+	/* Free inodes in fs (based on current total count). */
+	sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index);
+	up_read(&vol->mftbmp_lock);
+	/*
+	 * File system id. This is extremely *nix flavour dependent and even
+	 * within Linux itself all fs do their own thing. I interpret this to
+	 * mean a unique id associated with the mounted fs and not the id
+	 * associated with the filesystem driver, the latter is already given
+	 * by the filesystem type in sfs->f_type. Thus we use the 64-bit
+	 * volume serial number splitting it into two 32-bit parts. We enter
+	 * the least significant 32-bits in f_fsid[0] and the most significant
+	 * 32-bits in f_fsid[1].
+	 */
+	sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff;
+	sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff;
+	/* Maximum length of filenames. */
+	sfs->f_namelen	   = NTFS_MAX_NAME_LEN;
+	return 0;
+}
+
+#ifdef NTFS_RW
+static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
+{
+	return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
+}
+#endif
+
+/**
+ * The complete super operations.
+ */
+static const struct super_operations ntfs_sops = {
+	.alloc_inode	= ntfs_alloc_big_inode,	  /* VFS: Allocate new inode. */
+	.destroy_inode	= ntfs_destroy_big_inode, /* VFS: Deallocate inode. */
+#ifdef NTFS_RW
+	.write_inode	= ntfs_write_inode,	/* VFS: Write dirty inode to
+						   disk. */
+#endif /* NTFS_RW */
+	.put_super	= ntfs_put_super,	/* Syscall: umount. */
+	.statfs		= ntfs_statfs,		/* Syscall: statfs */
+	.remount_fs	= ntfs_remount,		/* Syscall: mount -o remount. */
+	.evict_inode	= ntfs_evict_big_inode,	/* VFS: Called when an inode is
+						   removed from memory. */
+	.show_options	= ntfs_show_options,	/* Show mount options in
+						   proc. */
+};
+
+/**
+ * ntfs_fill_super - mount an ntfs filesystem
+ * @sb:		super block of ntfs filesystem to mount
+ * @opt:	string containing the mount options
+ * @silent:	silence error output
+ *
+ * ntfs_fill_super() is called by the VFS to mount the device described by @sb
+ * with the mount otions in @data with the NTFS filesystem.
+ *
+ * If @silent is true, remain silent even if errors are detected. This is used
+ * during bootup, when the kernel tries to mount the root filesystem with all
+ * registered filesystems one after the other until one succeeds. This implies
+ * that all filesystems except the correct one will quite correctly and
+ * expectedly return an error, but nobody wants to see error messages when in
+ * fact this is what is supposed to happen.
+ *
+ * NOTE: @sb->s_flags contains the mount options flags.
+ */
+static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
+{
+	ntfs_volume *vol;
+	struct buffer_head *bh;
+	struct inode *tmp_ino;
+	int blocksize, result;
+
+	/*
+	 * We do a pretty difficult piece of bootstrap by reading the
+	 * MFT (and other metadata) from disk into memory. We'll only
+	 * release this metadata during umount, so the locking patterns
+	 * observed during bootstrap do not count. So turn off the
+	 * observation of locking patterns (strictly for this context
+	 * only) while mounting NTFS. [The validator is still active
+	 * otherwise, even for this context: it will for example record
+	 * lock class registrations.]
+	 */
+	lockdep_off();
+	ntfs_debug("Entering.");
+#ifndef NTFS_RW
+	sb->s_flags |= MS_RDONLY;
+#endif /* ! NTFS_RW */
+	/* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
+	sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
+	vol = NTFS_SB(sb);
+	if (!vol) {
+		if (!silent)
+			ntfs_error(sb, "Allocation of NTFS volume structure "
+					"failed. Aborting mount...");
+		lockdep_on();
+		return -ENOMEM;
+	}
+	/* Initialize ntfs_volume structure. */
+	*vol = (ntfs_volume) {
+		.sb = sb,
+		/*
+		 * Default is group and other don't have any access to files or
+		 * directories while owner has full access. Further, files by
+		 * default are not executable but directories are of course
+		 * browseable.
+		 */
+		.fmask = 0177,
+		.dmask = 0077,
+	};
+	init_rwsem(&vol->mftbmp_lock);
+	init_rwsem(&vol->lcnbmp_lock);
+
+	/* By default, enable sparse support. */
+	NVolSetSparseEnabled(vol);
+
+	/* Important to get the mount options dealt with now. */
+	if (!parse_options(vol, (char*)opt))
+		goto err_out_now;
+
+	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
+	if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+		if (!silent)
+			ntfs_error(sb, "Device has unsupported sector size "
+					"(%i).  The maximum supported sector "
+					"size on this architecture is %lu "
+					"bytes.",
+					bdev_logical_block_size(sb->s_bdev),
+					PAGE_CACHE_SIZE);
+		goto err_out_now;
+	}
+	/*
+	 * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
+	 * sector size, whichever is bigger.
+	 */
+	blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
+	if (blocksize < NTFS_BLOCK_SIZE) {
+		if (!silent)
+			ntfs_error(sb, "Unable to set device block size.");
+		goto err_out_now;
+	}
+	BUG_ON(blocksize != sb->s_blocksize);
+	ntfs_debug("Set device block size to %i bytes (block size bits %i).",
+			blocksize, sb->s_blocksize_bits);
+	/* Determine the size of the device in units of block_size bytes. */
+	if (!i_size_read(sb->s_bdev->bd_inode)) {
+		if (!silent)
+			ntfs_error(sb, "Unable to determine device size.");
+		goto err_out_now;
+	}
+	vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
+			sb->s_blocksize_bits;
+	/* Read the boot sector and return unlocked buffer head to it. */
+	if (!(bh = read_ntfs_boot_sector(sb, silent))) {
+		if (!silent)
+			ntfs_error(sb, "Not an NTFS volume.");
+		goto err_out_now;
+	}
+	/*
+	 * Extract the data from the boot sector and setup the ntfs volume
+	 * using it.
+	 */
+	result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
+	brelse(bh);
+	if (!result) {
+		if (!silent)
+			ntfs_error(sb, "Unsupported NTFS filesystem.");
+		goto err_out_now;
+	}
+	/*
+	 * If the boot sector indicates a sector size bigger than the current
+	 * device block size, switch the device block size to the sector size.
+	 * TODO: It may be possible to support this case even when the set
+	 * below fails, we would just be breaking up the i/o for each sector
+	 * into multiple blocks for i/o purposes but otherwise it should just
+	 * work.  However it is safer to leave disabled until someone hits this
+	 * error message and then we can get them to try it without the setting
+	 * so we know for sure that it works.
+	 */
+	if (vol->sector_size > blocksize) {
+		blocksize = sb_set_blocksize(sb, vol->sector_size);
+		if (blocksize != vol->sector_size) {
+			if (!silent)
+				ntfs_error(sb, "Unable to set device block "
+						"size to sector size (%i).",
+						vol->sector_size);
+			goto err_out_now;
+		}
+		BUG_ON(blocksize != sb->s_blocksize);
+		vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
+				sb->s_blocksize_bits;
+		ntfs_debug("Changed device block size to %i bytes (block size "
+				"bits %i) to match volume sector size.",
+				blocksize, sb->s_blocksize_bits);
+	}
+	/* Initialize the cluster and mft allocators. */
+	ntfs_setup_allocators(vol);
+	/* Setup remaining fields in the super block. */
+	sb->s_magic = NTFS_SB_MAGIC;
+	/*
+	 * Ntfs allows 63 bits for the file size, i.e. correct would be:
+	 *	sb->s_maxbytes = ~0ULL >> 1;
+	 * But the kernel uses a long as the page cache page index which on
+	 * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel
+	 * defined to the maximum the page cache page index can cope with
+	 * without overflowing the index or to 2^63 - 1, whichever is smaller.
+	 */
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	/* Ntfs measures time in 100ns intervals. */
+	sb->s_time_gran = 100;
+	/*
+	 * Now load the metadata required for the page cache and our address
+	 * space operations to function. We do this by setting up a specialised
+	 * read_inode method and then just calling the normal iget() to obtain
+	 * the inode for $MFT which is sufficient to allow our normal inode
+	 * operations and associated address space operations to function.
+	 */
+	sb->s_op = &ntfs_sops;
+	tmp_ino = new_inode(sb);
+	if (!tmp_ino) {
+		if (!silent)
+			ntfs_error(sb, "Failed to load essential metadata.");
+		goto err_out_now;
+	}
+	tmp_ino->i_ino = FILE_MFT;
+	insert_inode_hash(tmp_ino);
+	if (ntfs_read_inode_mount(tmp_ino) < 0) {
+		if (!silent)
+			ntfs_error(sb, "Failed to load essential metadata.");
+		goto iput_tmp_ino_err_out_now;
+	}
+	mutex_lock(&ntfs_lock);
+	/*
+	 * The current mount is a compression user if the cluster size is
+	 * less than or equal 4kiB.
+	 */
+	if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) {
+		result = allocate_compression_buffers();
+		if (result) {
+			ntfs_error(NULL, "Failed to allocate buffers "
+					"for compression engine.");
+			ntfs_nr_compression_users--;
+			mutex_unlock(&ntfs_lock);
+			goto iput_tmp_ino_err_out_now;
+		}
+	}
+	/*
+	 * Generate the global default upcase table if necessary.  Also
+	 * temporarily increment the number of upcase users to avoid race
+	 * conditions with concurrent (u)mounts.
+	 */
+	if (!default_upcase)
+		default_upcase = generate_default_upcase();
+	ntfs_nr_upcase_users++;
+	mutex_unlock(&ntfs_lock);
+	/*
+	 * From now on, ignore @silent parameter. If we fail below this line,
+	 * it will be due to a corrupt fs or a system error, so we report it.
+	 */
+	/*
+	 * Open the system files with normal access functions and complete
+	 * setting up the ntfs super block.
+	 */
+	if (!load_system_files(vol)) {
+		ntfs_error(sb, "Failed to load system files.");
+		goto unl_upcase_iput_tmp_ino_err_out_now;
+	}
+
+	/* We grab a reference, simulating an ntfs_iget(). */
+	ihold(vol->root_ino);
+	if ((sb->s_root = d_make_root(vol->root_ino))) {
+		ntfs_debug("Exiting, status successful.");
+		/* Release the default upcase if it has no users. */
+		mutex_lock(&ntfs_lock);
+		if (!--ntfs_nr_upcase_users && default_upcase) {
+			ntfs_free(default_upcase);
+			default_upcase = NULL;
+		}
+		mutex_unlock(&ntfs_lock);
+		sb->s_export_op = &ntfs_export_ops;
+		lockdep_on();
+		return 0;
+	}
+	ntfs_error(sb, "Failed to allocate root directory.");
+	/* Clean up after the successful load_system_files() call from above. */
+	// TODO: Use ntfs_put_super() instead of repeating all this code...
+	// FIXME: Should mark the volume clean as the error is most likely
+	// 	  -ENOMEM.
+	iput(vol->vol_ino);
+	vol->vol_ino = NULL;
+	/* NTFS 3.0+ specific clean up. */
+	if (vol->major_ver >= 3) {
+#ifdef NTFS_RW
+		if (vol->usnjrnl_j_ino) {
+			iput(vol->usnjrnl_j_ino);
+			vol->usnjrnl_j_ino = NULL;
+		}
+		if (vol->usnjrnl_max_ino) {
+			iput(vol->usnjrnl_max_ino);
+			vol->usnjrnl_max_ino = NULL;
+		}
+		if (vol->usnjrnl_ino) {
+			iput(vol->usnjrnl_ino);
+			vol->usnjrnl_ino = NULL;
+		}
+		if (vol->quota_q_ino) {
+			iput(vol->quota_q_ino);
+			vol->quota_q_ino = NULL;
+		}
+		if (vol->quota_ino) {
+			iput(vol->quota_ino);
+			vol->quota_ino = NULL;
+		}
+#endif /* NTFS_RW */
+		if (vol->extend_ino) {
+			iput(vol->extend_ino);
+			vol->extend_ino = NULL;
+		}
+		if (vol->secure_ino) {
+			iput(vol->secure_ino);
+			vol->secure_ino = NULL;
+		}
+	}
+	iput(vol->root_ino);
+	vol->root_ino = NULL;
+	iput(vol->lcnbmp_ino);
+	vol->lcnbmp_ino = NULL;
+	iput(vol->mftbmp_ino);
+	vol->mftbmp_ino = NULL;
+#ifdef NTFS_RW
+	if (vol->logfile_ino) {
+		iput(vol->logfile_ino);
+		vol->logfile_ino = NULL;
+	}
+	if (vol->mftmirr_ino) {
+		iput(vol->mftmirr_ino);
+		vol->mftmirr_ino = NULL;
+	}
+#endif /* NTFS_RW */
+	/* Throw away the table of attribute definitions. */
+	vol->attrdef_size = 0;
+	if (vol->attrdef) {
+		ntfs_free(vol->attrdef);
+		vol->attrdef = NULL;
+	}
+	vol->upcase_len = 0;
+	mutex_lock(&ntfs_lock);
+	if (vol->upcase == default_upcase) {
+		ntfs_nr_upcase_users--;
+		vol->upcase = NULL;
+	}
+	mutex_unlock(&ntfs_lock);
+	if (vol->upcase) {
+		ntfs_free(vol->upcase);
+		vol->upcase = NULL;
+	}
+	if (vol->nls_map) {
+		unload_nls(vol->nls_map);
+		vol->nls_map = NULL;
+	}
+	/* Error exit code path. */
+unl_upcase_iput_tmp_ino_err_out_now:
+	/*
+	 * Decrease the number of upcase users and destroy the global default
+	 * upcase table if necessary.
+	 */
+	mutex_lock(&ntfs_lock);
+	if (!--ntfs_nr_upcase_users && default_upcase) {
+		ntfs_free(default_upcase);
+		default_upcase = NULL;
+	}
+	if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
+		free_compression_buffers();
+	mutex_unlock(&ntfs_lock);
+iput_tmp_ino_err_out_now:
+	iput(tmp_ino);
+	if (vol->mft_ino && vol->mft_ino != tmp_ino)
+		iput(vol->mft_ino);
+	vol->mft_ino = NULL;
+	/* Errors at this stage are irrelevant. */
+err_out_now:
+	sb->s_fs_info = NULL;
+	kfree(vol);
+	ntfs_debug("Failed, returning -EINVAL.");
+	lockdep_on();
+	return -EINVAL;
+}
+
+/*
+ * This is a slab cache to optimize allocations and deallocations of Unicode
+ * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
+ * (255) Unicode characters + a terminating NULL Unicode character.
+ */
+struct kmem_cache *ntfs_name_cache;
+
+/* Slab caches for efficient allocation/deallocation of inodes. */
+struct kmem_cache *ntfs_inode_cache;
+struct kmem_cache *ntfs_big_inode_cache;
+
+/* Init once constructor for the inode slab cache. */
+static void ntfs_big_inode_init_once(void *foo)
+{
+	ntfs_inode *ni = (ntfs_inode *)foo;
+
+	inode_init_once(VFS_I(ni));
+}
+
+/*
+ * Slab caches to optimize allocations and deallocations of attribute search
+ * contexts and index contexts, respectively.
+ */
+struct kmem_cache *ntfs_attr_ctx_cache;
+struct kmem_cache *ntfs_index_ctx_cache;
+
+/* Driver wide mutex. */
+DEFINE_MUTEX(ntfs_lock);
+
+static struct dentry *ntfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+}
+
+static struct file_system_type ntfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ntfs",
+	.mount		= ntfs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ntfs");
+
+/* Stable names for the slab caches. */
+static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
+static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache";
+static const char ntfs_name_cache_name[] = "ntfs_name_cache";
+static const char ntfs_inode_cache_name[] = "ntfs_inode_cache";
+static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache";
+
+static int __init init_ntfs_fs(void)
+{
+	int err = 0;
+
+	/* This may be ugly but it results in pretty output so who cares. (-8 */
+	pr_info("driver " NTFS_VERSION " [Flags: R/"
+#ifdef NTFS_RW
+			"W"
+#else
+			"O"
+#endif
+#ifdef DEBUG
+			" DEBUG"
+#endif
+#ifdef MODULE
+			" MODULE"
+#endif
+			"].\n");
+
+	ntfs_debug("Debug messages are enabled.");
+
+	ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
+			sizeof(ntfs_index_context), 0 /* offset */,
+			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
+	if (!ntfs_index_ctx_cache) {
+		pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
+		goto ictx_err_out;
+	}
+	ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
+			sizeof(ntfs_attr_search_ctx), 0 /* offset */,
+			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
+	if (!ntfs_attr_ctx_cache) {
+		pr_crit("NTFS: Failed to create %s!\n",
+			ntfs_attr_ctx_cache_name);
+		goto actx_err_out;
+	}
+
+	ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
+			(NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
+			SLAB_HWCACHE_ALIGN, NULL);
+	if (!ntfs_name_cache) {
+		pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
+		goto name_err_out;
+	}
+
+	ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
+			sizeof(ntfs_inode), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+	if (!ntfs_inode_cache) {
+		pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
+		goto inode_err_out;
+	}
+
+	ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
+			sizeof(big_ntfs_inode), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			ntfs_big_inode_init_once);
+	if (!ntfs_big_inode_cache) {
+		pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
+		goto big_inode_err_out;
+	}
+
+	/* Register the ntfs sysctls. */
+	err = ntfs_sysctl(1);
+	if (err) {
+		pr_crit("Failed to register NTFS sysctls!\n");
+		goto sysctl_err_out;
+	}
+
+	err = register_filesystem(&ntfs_fs_type);
+	if (!err) {
+		ntfs_debug("NTFS driver registered successfully.");
+		return 0; /* Success! */
+	}
+	pr_crit("Failed to register NTFS filesystem driver!\n");
+
+	/* Unregister the ntfs sysctls. */
+	ntfs_sysctl(0);
+sysctl_err_out:
+	kmem_cache_destroy(ntfs_big_inode_cache);
+big_inode_err_out:
+	kmem_cache_destroy(ntfs_inode_cache);
+inode_err_out:
+	kmem_cache_destroy(ntfs_name_cache);
+name_err_out:
+	kmem_cache_destroy(ntfs_attr_ctx_cache);
+actx_err_out:
+	kmem_cache_destroy(ntfs_index_ctx_cache);
+ictx_err_out:
+	if (!err) {
+		pr_crit("Aborting NTFS filesystem driver registration...\n");
+		err = -ENOMEM;
+	}
+	return err;
+}
+
+static void __exit exit_ntfs_fs(void)
+{
+	ntfs_debug("Unregistering NTFS driver.");
+
+	unregister_filesystem(&ntfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ntfs_big_inode_cache);
+	kmem_cache_destroy(ntfs_inode_cache);
+	kmem_cache_destroy(ntfs_name_cache);
+	kmem_cache_destroy(ntfs_attr_ctx_cache);
+	kmem_cache_destroy(ntfs_index_ctx_cache);
+	/* Unregister the ntfs sysctls. */
+	ntfs_sysctl(0);
+}
+
+MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
+MODULE_VERSION(NTFS_VERSION);
+MODULE_LICENSE("GPL");
+#ifdef DEBUG
+module_param(debug_msgs, bint, 0);
+MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
+#endif
+
+module_init(init_ntfs_fs)
+module_exit(exit_ntfs_fs)
diff --git a/kernel/fs/ntfs/sysctl.c b/kernel/fs/ntfs/sysctl.c
new file mode 100644
index 000000000..a503156ec
--- /dev/null
+++ b/kernel/fs/ntfs/sysctl.c
@@ -0,0 +1,83 @@
+/*
+ * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
+ *	      the Linux-NTFS project. Adapted from the old NTFS driver,
+ *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
+ *
+ * Copyright (c) 2002-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifdef DEBUG
+
+#include <linux/module.h>
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+#include "sysctl.h"
+#include "debug.h"
+
+/* Definition of the ntfs sysctl. */
+static struct ctl_table ntfs_sysctls[] = {
+	{
+		.procname	= "ntfs-debug",
+		.data		= &debug_msgs,		/* Data pointer and size. */
+		.maxlen		= sizeof(debug_msgs),
+		.mode		= 0644,			/* Mode, proc handler. */
+		.proc_handler	= proc_dointvec
+	},
+	{}
+};
+
+/* Define the parent directory /proc/sys/fs. */
+static struct ctl_table sysctls_root[] = {
+	{
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= ntfs_sysctls
+	},
+	{}
+};
+
+/* Storage for the sysctls header. */
+static struct ctl_table_header *sysctls_root_table;
+
+/**
+ * ntfs_sysctl - add or remove the debug sysctl
+ * @add:	add (1) or remove (0) the sysctl
+ *
+ * Add or remove the debug sysctl. Return 0 on success or -errno on error.
+ */
+int ntfs_sysctl(int add)
+{
+	if (add) {
+		BUG_ON(sysctls_root_table);
+		sysctls_root_table = register_sysctl_table(sysctls_root);
+		if (!sysctls_root_table)
+			return -ENOMEM;
+	} else {
+		BUG_ON(!sysctls_root_table);
+		unregister_sysctl_table(sysctls_root_table);
+		sysctls_root_table = NULL;
+	}
+	return 0;
+}
+
+#endif /* CONFIG_SYSCTL */
+#endif /* DEBUG */
diff --git a/kernel/fs/ntfs/sysctl.h b/kernel/fs/ntfs/sysctl.h
new file mode 100644
index 000000000..d4f8ce920
--- /dev/null
+++ b/kernel/fs/ntfs/sysctl.h
@@ -0,0 +1,41 @@
+/*
+ * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
+ *	      the Linux-NTFS project. Adapted from the old NTFS driver,
+ *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
+ *
+ * Copyright (c) 2002-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_SYSCTL_H
+#define _LINUX_NTFS_SYSCTL_H
+
+
+#if defined(DEBUG) && defined(CONFIG_SYSCTL)
+
+extern int ntfs_sysctl(int add);
+
+#else
+
+/* Just return success. */
+static inline int ntfs_sysctl(int add)
+{
+	return 0;
+}
+
+#endif /* DEBUG && CONFIG_SYSCTL */
+#endif /* _LINUX_NTFS_SYSCTL_H */
diff --git a/kernel/fs/ntfs/time.h b/kernel/fs/ntfs/time.h
new file mode 100644
index 000000000..01233989d
--- /dev/null
+++ b/kernel/fs/ntfs/time.h
@@ -0,0 +1,100 @@
+/*
+ * time.h - NTFS time conversion functions.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_TIME_H
+#define _LINUX_NTFS_TIME_H
+
+#include <linux/time.h>		/* For current_kernel_time(). */
+#include <asm/div64.h>		/* For do_div(). */
+
+#include "endian.h"
+
+#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)
+
+/**
+ * utc2ntfs - convert Linux UTC time to NTFS time
+ * @ts:		Linux UTC time to convert to NTFS time
+ *
+ * Convert the Linux UTC time @ts to its corresponding NTFS time and return
+ * that in little endian format.
+ *
+ * Linux stores time in a struct timespec consisting of a time_t (long at
+ * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
+ * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
+ * 1-nano-second intervals since the value of tv_sec.
+ *
+ * NTFS uses Microsoft's standard time format which is stored in a s64 and is
+ * measured as the number of 100-nano-second intervals since 1st January 1601,
+ * 00:00:00 UTC.
+ */
+static inline sle64 utc2ntfs(const struct timespec ts)
+{
+	/*
+	 * Convert the seconds to 100ns intervals, add the nano-seconds
+	 * converted to 100ns intervals, and then add the NTFS time offset.
+	 */
+	return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 +
+			NTFS_TIME_OFFSET);
+}
+
+/**
+ * get_current_ntfs_time - get the current time in little endian NTFS format
+ *
+ * Get the current time from the Linux kernel, convert it to its corresponding
+ * NTFS time and return that in little endian format.
+ */
+static inline sle64 get_current_ntfs_time(void)
+{
+	return utc2ntfs(current_kernel_time());
+}
+
+/**
+ * ntfs2utc - convert NTFS time to Linux time
+ * @time:	NTFS time (little endian) to convert to Linux UTC
+ *
+ * Convert the little endian NTFS time @time to its corresponding Linux UTC
+ * time and return that in cpu format.
+ *
+ * Linux stores time in a struct timespec consisting of a time_t (long at
+ * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
+ * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
+ * 1-nano-second intervals since the value of tv_sec.
+ *
+ * NTFS uses Microsoft's standard time format which is stored in a s64 and is
+ * measured as the number of 100 nano-second intervals since 1st January 1601,
+ * 00:00:00 UTC.
+ */
+static inline struct timespec ntfs2utc(const sle64 time)
+{
+	struct timespec ts;
+
+	/* Subtract the NTFS time offset. */
+	u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
+	/*
+	 * Convert the time to 1-second intervals and the remainder to
+	 * 1-nano-second intervals.
+	 */
+	ts.tv_nsec = do_div(t, 10000000) * 100;
+	ts.tv_sec = t;
+	return ts;
+}
+
+#endif /* _LINUX_NTFS_TIME_H */
diff --git a/kernel/fs/ntfs/types.h b/kernel/fs/ntfs/types.h
new file mode 100644
index 000000000..8c8053b66
--- /dev/null
+++ b/kernel/fs/ntfs/types.h
@@ -0,0 +1,69 @@
+/*
+ * types.h - Defines for NTFS Linux kernel driver specific types.
+ *	     Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_TYPES_H
+#define _LINUX_NTFS_TYPES_H
+
+#include <linux/types.h>
+
+typedef __le16 le16;
+typedef __le32 le32;
+typedef __le64 le64;
+typedef __u16 __bitwise sle16;
+typedef __u32 __bitwise sle32;
+typedef __u64 __bitwise sle64;
+
+/* 2-byte Unicode character type. */
+typedef le16 ntfschar;
+#define UCHAR_T_SIZE_BITS 1
+
+/*
+ * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN
+ * and VCN, to allow for type checking and better code readability.
+ */
+typedef s64 VCN;
+typedef sle64 leVCN;
+typedef s64 LCN;
+typedef sle64 leLCN;
+
+/*
+ * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit
+ * values.  We define our own type LSN, to allow for type checking and better
+ * code readability.
+ */
+typedef s64 LSN;
+typedef sle64 leLSN;
+
+/*
+ * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values.
+ * We define our own type USN, to allow for type checking and better code
+ * readability.
+ */
+typedef s64 USN;
+typedef sle64 leUSN;
+
+typedef enum {
+	CASE_SENSITIVE = 0,
+	IGNORE_CASE = 1,
+} IGNORE_CASE_BOOL;
+
+#endif /* _LINUX_NTFS_TYPES_H */
diff --git a/kernel/fs/ntfs/unistr.c b/kernel/fs/ntfs/unistr.c
new file mode 100644
index 000000000..005ca4b0f
--- /dev/null
+++ b/kernel/fs/ntfs/unistr.c
@@ -0,0 +1,398 @@
+/*
+ * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2006 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/slab.h>
+
+#include "types.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/*
+ * IMPORTANT
+ * =========
+ *
+ * All these routines assume that the Unicode characters are in little endian
+ * encoding inside the strings!!!
+ */
+
+/*
+ * This is used by the name collation functions to quickly determine what
+ * characters are (in)valid.
+ */
+static const u8 legal_ansi_char_array[0x40] = {
+	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+
+	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
+	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
+
+	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
+	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
+};
+
+/**
+ * ntfs_are_names_equal - compare two Unicode names for equality
+ * @s1:			name to compare to @s2
+ * @s1_len:		length in Unicode characters of @s1
+ * @s2:			name to compare to @s1
+ * @s2_len:		length in Unicode characters of @s2
+ * @ic:			ignore case bool
+ * @upcase:		upcase table (only if @ic == IGNORE_CASE)
+ * @upcase_size:	length in Unicode characters of @upcase (if present)
+ *
+ * Compare the names @s1 and @s2 and return 'true' (1) if the names are
+ * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
+ * the @upcase table is used to performa a case insensitive comparison.
+ */
+bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+		const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
+		const ntfschar *upcase, const u32 upcase_size)
+{
+	if (s1_len != s2_len)
+		return false;
+	if (ic == CASE_SENSITIVE)
+		return !ntfs_ucsncmp(s1, s2, s1_len);
+	return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
+}
+
+/**
+ * ntfs_collate_names - collate two Unicode names
+ * @name1:	first Unicode name to compare
+ * @name2:	second Unicode name to compare
+ * @err_val:	if @name1 contains an invalid character return this value
+ * @ic:		either CASE_SENSITIVE or IGNORE_CASE
+ * @upcase:	upcase table (ignored if @ic is CASE_SENSITIVE)
+ * @upcase_len:	upcase table size (ignored if @ic is CASE_SENSITIVE)
+ *
+ * ntfs_collate_names collates two Unicode names and returns:
+ *
+ *  -1 if the first name collates before the second one,
+ *   0 if the names match,
+ *   1 if the second name collates before the first one, or
+ * @err_val if an invalid character is found in @name1 during the comparison.
+ *
+ * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
+ */
+int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
+		const ntfschar *name2, const u32 name2_len,
+		const int err_val, const IGNORE_CASE_BOOL ic,
+		const ntfschar *upcase, const u32 upcase_len)
+{
+	u32 cnt, min_len;
+	u16 c1, c2;
+
+	min_len = name1_len;
+	if (name1_len > name2_len)
+		min_len = name2_len;
+	for (cnt = 0; cnt < min_len; ++cnt) {
+		c1 = le16_to_cpu(*name1++);
+		c2 = le16_to_cpu(*name2++);
+		if (ic) {
+			if (c1 < upcase_len)
+				c1 = le16_to_cpu(upcase[c1]);
+			if (c2 < upcase_len)
+				c2 = le16_to_cpu(upcase[c2]);
+		}
+		if (c1 < 64 && legal_ansi_char_array[c1] & 8)
+			return err_val;
+		if (c1 < c2)
+			return -1;
+		if (c1 > c2)
+			return 1;
+	}
+	if (name1_len < name2_len)
+		return -1;
+	if (name1_len == name2_len)
+		return 0;
+	/* name1_len > name2_len */
+	c1 = le16_to_cpu(*name1);
+	if (c1 < 64 && legal_ansi_char_array[c1] & 8)
+		return err_val;
+	return 1;
+}
+
+/**
+ * ntfs_ucsncmp - compare two little endian Unicode strings
+ * @s1:		first string
+ * @s2:		second string
+ * @n:		maximum unicode characters to compare
+ *
+ * Compare the first @n characters of the Unicode strings @s1 and @s2,
+ * The strings in little endian format and appropriate le16_to_cpu()
+ * conversion is performed on non-little endian machines.
+ *
+ * The function returns an integer less than, equal to, or greater than zero
+ * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
+ * to be less than, to match, or be greater than @s2.
+ */
+int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
+{
+	u16 c1, c2;
+	size_t i;
+
+	for (i = 0; i < n; ++i) {
+		c1 = le16_to_cpu(s1[i]);
+		c2 = le16_to_cpu(s2[i]);
+		if (c1 < c2)
+			return -1;
+		if (c1 > c2)
+			return 1;
+		if (!c1)
+			break;
+	}
+	return 0;
+}
+
+/**
+ * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
+ * @s1:			first string
+ * @s2:			second string
+ * @n:			maximum unicode characters to compare
+ * @upcase:		upcase table
+ * @upcase_size:	upcase table size in Unicode characters
+ *
+ * Compare the first @n characters of the Unicode strings @s1 and @s2,
+ * ignoring case. The strings in little endian format and appropriate
+ * le16_to_cpu() conversion is performed on non-little endian machines.
+ *
+ * Each character is uppercased using the @upcase table before the comparison.
+ *
+ * The function returns an integer less than, equal to, or greater than zero
+ * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
+ * to be less than, to match, or be greater than @s2.
+ */
+int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
+		const ntfschar *upcase, const u32 upcase_size)
+{
+	size_t i;
+	u16 c1, c2;
+
+	for (i = 0; i < n; ++i) {
+		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
+			c1 = le16_to_cpu(upcase[c1]);
+		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
+			c2 = le16_to_cpu(upcase[c2]);
+		if (c1 < c2)
+			return -1;
+		if (c1 > c2)
+			return 1;
+		if (!c1)
+			break;
+	}
+	return 0;
+}
+
+void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
+		const u32 upcase_len)
+{
+	u32 i;
+	u16 u;
+
+	for (i = 0; i < name_len; i++)
+		if ((u = le16_to_cpu(name[i])) < upcase_len)
+			name[i] = upcase[u];
+}
+
+void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
+		const ntfschar *upcase, const u32 upcase_len)
+{
+	ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
+			file_name_attr->file_name_length, upcase, upcase_len);
+}
+
+int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
+		FILE_NAME_ATTR *file_name_attr2,
+		const int err_val, const IGNORE_CASE_BOOL ic,
+		const ntfschar *upcase, const u32 upcase_len)
+{
+	return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
+			file_name_attr1->file_name_length,
+			(ntfschar*)&file_name_attr2->file_name,
+			file_name_attr2->file_name_length,
+			err_val, ic, upcase, upcase_len);
+}
+
+/**
+ * ntfs_nlstoucs - convert NLS string to little endian Unicode string
+ * @vol:	ntfs volume which we are working with
+ * @ins:	input NLS string buffer
+ * @ins_len:	length of input string in bytes
+ * @outs:	on return contains the allocated output Unicode string buffer
+ *
+ * Convert the input string @ins, which is in whatever format the loaded NLS
+ * map dictates, into a little endian, 2-byte Unicode string.
+ *
+ * This function allocates the string and the caller is responsible for
+ * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
+ *
+ * On success the function returns the number of Unicode characters written to
+ * the output string *@outs (>= 0), not counting the terminating Unicode NULL
+ * character. *@outs is set to the allocated output string buffer.
+ *
+ * On error, a negative number corresponding to the error code is returned. In
+ * that case the output string is not allocated. Both *@outs and *@outs_len
+ * are then undefined.
+ *
+ * This might look a bit odd due to fast path optimization...
+ */
+int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
+		const int ins_len, ntfschar **outs)
+{
+	struct nls_table *nls = vol->nls_map;
+	ntfschar *ucs;
+	wchar_t wc;
+	int i, o, wc_len;
+
+	/* We do not trust outside sources. */
+	if (likely(ins)) {
+		ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
+		if (likely(ucs)) {
+			for (i = o = 0; i < ins_len; i += wc_len) {
+				wc_len = nls->char2uni(ins + i, ins_len - i,
+						&wc);
+				if (likely(wc_len >= 0 &&
+						o < NTFS_MAX_NAME_LEN)) {
+					if (likely(wc)) {
+						ucs[o++] = cpu_to_le16(wc);
+						continue;
+					} /* else if (!wc) */
+					break;
+				} /* else if (wc_len < 0 ||
+						o >= NTFS_MAX_NAME_LEN) */
+				goto name_err;
+			}
+			ucs[o] = 0;
+			*outs = ucs;
+			return o;
+		} /* else if (!ucs) */
+		ntfs_error(vol->sb, "Failed to allocate buffer for converted "
+				"name from ntfs_name_cache.");
+		return -ENOMEM;
+	} /* else if (!ins) */
+	ntfs_error(vol->sb, "Received NULL pointer.");
+	return -EINVAL;
+name_err:
+	kmem_cache_free(ntfs_name_cache, ucs);
+	if (wc_len < 0) {
+		ntfs_error(vol->sb, "Name using character set %s contains "
+				"characters that cannot be converted to "
+				"Unicode.", nls->charset);
+		i = -EILSEQ;
+	} else /* if (o >= NTFS_MAX_NAME_LEN) */ {
+		ntfs_error(vol->sb, "Name is too long (maximum length for a "
+				"name on NTFS is %d Unicode characters.",
+				NTFS_MAX_NAME_LEN);
+		i = -ENAMETOOLONG;
+	}
+	return i;
+}
+
+/**
+ * ntfs_ucstonls - convert little endian Unicode string to NLS string
+ * @vol:	ntfs volume which we are working with
+ * @ins:	input Unicode string buffer
+ * @ins_len:	length of input string in Unicode characters
+ * @outs:	on return contains the (allocated) output NLS string buffer
+ * @outs_len:	length of output string buffer in bytes
+ *
+ * Convert the input little endian, 2-byte Unicode string @ins, of length
+ * @ins_len into the string format dictated by the loaded NLS.
+ *
+ * If *@outs is NULL, this function allocates the string and the caller is
+ * responsible for calling kfree(*@outs); when finished with it. In this case
+ * @outs_len is ignored and can be 0.
+ *
+ * On success the function returns the number of bytes written to the output
+ * string *@outs (>= 0), not counting the terminating NULL byte. If the output
+ * string buffer was allocated, *@outs is set to it.
+ *
+ * On error, a negative number corresponding to the error code is returned. In
+ * that case the output string is not allocated. The contents of *@outs are
+ * then undefined.
+ *
+ * This might look a bit odd due to fast path optimization...
+ */
+int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
+		const int ins_len, unsigned char **outs, int outs_len)
+{
+	struct nls_table *nls = vol->nls_map;
+	unsigned char *ns;
+	int i, o, ns_len, wc;
+
+	/* We don't trust outside sources. */
+	if (ins) {
+		ns = *outs;
+		ns_len = outs_len;
+		if (ns && !ns_len) {
+			wc = -ENAMETOOLONG;
+			goto conversion_err;
+		}
+		if (!ns) {
+			ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
+			ns = kmalloc(ns_len + 1, GFP_NOFS);
+			if (!ns)
+				goto mem_err_out;
+		}
+		for (i = o = 0; i < ins_len; i++) {
+retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
+					ns_len - o);
+			if (wc > 0) {
+				o += wc;
+				continue;
+			} else if (!wc)
+				break;
+			else if (wc == -ENAMETOOLONG && ns != *outs) {
+				unsigned char *tc;
+				/* Grow in multiples of 64 bytes. */
+				tc = kmalloc((ns_len + 64) &
+						~63, GFP_NOFS);
+				if (tc) {
+					memcpy(tc, ns, ns_len);
+					ns_len = ((ns_len + 64) & ~63) - 1;
+					kfree(ns);
+					ns = tc;
+					goto retry;
+				} /* No memory so goto conversion_error; */
+			} /* wc < 0, real error. */
+			goto conversion_err;
+		}
+		ns[o] = 0;
+		*outs = ns;
+		return o;
+	} /* else (!ins) */
+	ntfs_error(vol->sb, "Received NULL pointer.");
+	return -EINVAL;
+conversion_err:
+	ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
+			"converted to character set %s.  You might want to "
+			"try to use the mount option nls=utf8.", nls->charset);
+	if (ns != *outs)
+		kfree(ns);
+	if (wc != -ENAMETOOLONG)
+		wc = -EILSEQ;
+	return wc;
+mem_err_out:
+	ntfs_error(vol->sb, "Failed to allocate name!");
+	return -ENOMEM;
+}
diff --git a/kernel/fs/ntfs/upcase.c b/kernel/fs/ntfs/upcase.c
new file mode 100644
index 000000000..e2f72ca98
--- /dev/null
+++ b/kernel/fs/ntfs/upcase.c
@@ -0,0 +1,87 @@
+/*
+ * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
+ *	      Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
+ * Copyright (c) 2001-2006 Anton Altaparmakov
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS source
+ * in the file COPYING); if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "malloc.h"
+#include "ntfs.h"
+
+ntfschar *generate_default_upcase(void)
+{
+	static const int uc_run_table[][3] = { /* Start, End, Add */
+	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
+	{0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
+	{0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
+	{0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
+	{0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
+	{0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
+	{0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
+	{0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
+	{0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
+	{0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
+	{0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
+	{0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
+	{0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
+	{0}
+	};
+
+	static const int uc_dup_table[][2] = { /* Start, End */
+	{0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
+	{0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
+	{0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
+	{0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
+	{0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
+	{0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
+	{0}
+	};
+
+	static const int uc_word_table[][2] = { /* Offset, Value */
+	{0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
+	{0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
+	{0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
+	{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
+	{0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
+	{0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
+	{0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
+	{0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
+	{0}
+	};
+
+	int i, r;
+	ntfschar *uc;
+
+	uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
+	if (!uc)
+		return uc;
+	memset(uc, 0, default_upcase_len * sizeof(ntfschar));
+	/* Generate the little endian Unicode upcase table used by ntfs. */
+	for (i = 0; i < default_upcase_len; i++)
+		uc[i] = cpu_to_le16(i);
+	for (r = 0; uc_run_table[r][0]; r++)
+		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
+			le16_add_cpu(&uc[i], uc_run_table[r][2]);
+	for (r = 0; uc_dup_table[r][0]; r++)
+		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
+			le16_add_cpu(&uc[i + 1], -1);
+	for (r = 0; uc_word_table[r][0]; r++)
+		uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
+	return uc;
+}
diff --git a/kernel/fs/ntfs/usnjrnl.c b/kernel/fs/ntfs/usnjrnl.c
new file mode 100644
index 000000000..b2bc0d55b
--- /dev/null
+++ b/kernel/fs/ntfs/usnjrnl.c
@@ -0,0 +1,84 @@
+/*
+ * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling.  Part of the
+ *	       Linux-NTFS project.
+ *
+ * Copyright (c) 2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+
+#include "aops.h"
+#include "debug.h"
+#include "endian.h"
+#include "time.h"
+#include "types.h"
+#include "usnjrnl.h"
+#include "volume.h"
+
+/**
+ * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume
+ * @vol:	ntfs volume on which to stamp the transaction log
+ *
+ * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return
+ * 'true' on success and 'false' on error.
+ *
+ * This function assumes that the transaction log has already been loaded and
+ * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl().
+ */
+bool ntfs_stamp_usnjrnl(ntfs_volume *vol)
+{
+	ntfs_debug("Entering.");
+	if (likely(!NVolUsnJrnlStamped(vol))) {
+		sle64 stamp;
+		struct page *page;
+		USN_HEADER *uh;
+
+		page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
+		if (IS_ERR(page)) {
+			ntfs_error(vol->sb, "Failed to read from "
+					"$UsnJrnl/$DATA/$Max attribute.");
+			return false;
+		}
+		uh = (USN_HEADER*)page_address(page);
+		stamp = get_current_ntfs_time();
+		ntfs_debug("Stamping transaction log ($UsnJrnl): old "
+				"journal_id 0x%llx, old lowest_valid_usn "
+				"0x%llx, new journal_id 0x%llx, new "
+				"lowest_valid_usn 0x%llx.",
+				(long long)sle64_to_cpu(uh->journal_id),
+				(long long)sle64_to_cpu(uh->lowest_valid_usn),
+				(long long)sle64_to_cpu(stamp),
+				i_size_read(vol->usnjrnl_j_ino));
+		uh->lowest_valid_usn =
+				cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino));
+		uh->journal_id = stamp;
+		flush_dcache_page(page);
+		set_page_dirty(page);
+		ntfs_unmap_page(page);
+		/* Set the flag so we do not have to do it again on remount. */
+		NVolSetUsnJrnlStamped(vol);
+	}
+	ntfs_debug("Done.");
+	return true;
+}
+
+#endif /* NTFS_RW */
diff --git a/kernel/fs/ntfs/usnjrnl.h b/kernel/fs/ntfs/usnjrnl.h
new file mode 100644
index 000000000..00d8e6bd7
--- /dev/null
+++ b/kernel/fs/ntfs/usnjrnl.h
@@ -0,0 +1,205 @@
+/*
+ * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling.
+ *	       Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_USNJRNL_H
+#define _LINUX_NTFS_USNJRNL_H
+
+#ifdef NTFS_RW
+
+#include "types.h"
+#include "endian.h"
+#include "layout.h"
+#include "volume.h"
+
+/*
+ * Transaction log ($UsnJrnl) organization:
+ *
+ * The transaction log records whenever a file is modified in any way.  So for
+ * example it will record that file "blah" was written to at a particular time
+ * but not what was written.  If will record that a file was deleted or
+ * created, that a file was truncated, etc.  See below for all the reason
+ * codes used.
+ *
+ * The transaction log is in the $Extend directory which is in the root
+ * directory of each volume.  If it is not present it means transaction
+ * logging is disabled.  If it is present it means transaction logging is
+ * either enabled or in the process of being disabled in which case we can
+ * ignore it as it will go away as soon as Windows gets its hands on it.
+ *
+ * To determine whether the transaction logging is enabled or in the process
+ * of being disabled, need to check the volume flags in the
+ * $VOLUME_INFORMATION attribute in the $Volume system file (which is present
+ * in the root directory and has a fixed mft record number, see layout.h).
+ * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log
+ * is in the process of being disabled and if this flag is clear it means the
+ * transaction log is enabled.
+ *
+ * The transaction log consists of two parts; the $DATA/$Max attribute as well
+ * as the $DATA/$J attribute.  $Max is a header describing the transaction
+ * log whilst $J is the transaction log data itself as a sequence of variable
+ * sized USN_RECORDs (see below for all the structures).
+ *
+ * We do not care about transaction logging at this point in time but we still
+ * need to let windows know that the transaction log is out of date.  To do
+ * this we need to stamp the transaction log.  This involves setting the
+ * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used
+ * for the next added USN_RECORD to the $DATA/$J attribute as well as
+ * generating a new journal_id in $DATA/$Max.
+ *
+ * The journal_id is as of the current version (2.0) of the transaction log
+ * simply the 64-bit timestamp of when the journal was either created or last
+ * stamped.
+ *
+ * To determine the next usn there are two ways.  The first is to parse
+ * $DATA/$J and to find the last USN_RECORD in it and to add its record_length
+ * to its usn (which is the byte offset in the $DATA/$J attribute).  The
+ * second is simply to take the data size of the attribute.  Since the usns
+ * are simply byte offsets into $DATA/$J, this is exactly the next usn.  For
+ * obvious reasons we use the second method as it is much simpler and faster.
+ *
+ * As an aside, note that to actually disable the transaction log, one would
+ * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go
+ * through all the mft records on the volume and set the usn field in their
+ * $STANDARD_INFORMATION attribute to zero.  Once that is done, one would need
+ * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally,
+ * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag.
+ *
+ * Note that if a volume is unmounted whilst the transaction log is being
+ * disabled, the process will continue the next time the volume is mounted.
+ * This is why we can safely mount read-write when we see a transaction log
+ * in the process of being deleted.
+ */
+
+/* Some $UsnJrnl related constants. */
+#define UsnJrnlMajorVer		2
+#define UsnJrnlMinorVer		0
+
+/*
+ * $DATA/$Max attribute.  This is (always?) resident and has a fixed size of
+ * 32 bytes.  It contains the header describing the transaction log.
+ */
+typedef struct {
+/*Ofs*/
+/*   0*/sle64 maximum_size;	/* The maximum on-disk size of the $DATA/$J
+				   attribute. */
+/*   8*/sle64 allocation_delta;	/* Number of bytes by which to increase the
+				   size of the $DATA/$J attribute. */
+/*0x10*/sle64 journal_id;	/* Current id of the transaction log. */
+/*0x18*/leUSN lowest_valid_usn;	/* Lowest valid usn in $DATA/$J for the
+				   current journal_id. */
+/* sizeof() = 32 (0x20) bytes */
+} __attribute__ ((__packed__)) USN_HEADER;
+
+/*
+ * Reason flags (32-bit).  Cumulative flags describing the change(s) to the
+ * file since it was last opened.  I think the names speak for themselves but
+ * if you disagree check out the descriptions in the Linux NTFS project NTFS
+ * documentation: http://www.linux-ntfs.org/
+ */
+enum {
+	USN_REASON_DATA_OVERWRITE	= cpu_to_le32(0x00000001),
+	USN_REASON_DATA_EXTEND		= cpu_to_le32(0x00000002),
+	USN_REASON_DATA_TRUNCATION	= cpu_to_le32(0x00000004),
+	USN_REASON_NAMED_DATA_OVERWRITE	= cpu_to_le32(0x00000010),
+	USN_REASON_NAMED_DATA_EXTEND	= cpu_to_le32(0x00000020),
+	USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
+	USN_REASON_FILE_CREATE		= cpu_to_le32(0x00000100),
+	USN_REASON_FILE_DELETE		= cpu_to_le32(0x00000200),
+	USN_REASON_EA_CHANGE		= cpu_to_le32(0x00000400),
+	USN_REASON_SECURITY_CHANGE	= cpu_to_le32(0x00000800),
+	USN_REASON_RENAME_OLD_NAME	= cpu_to_le32(0x00001000),
+	USN_REASON_RENAME_NEW_NAME	= cpu_to_le32(0x00002000),
+	USN_REASON_INDEXABLE_CHANGE	= cpu_to_le32(0x00004000),
+	USN_REASON_BASIC_INFO_CHANGE	= cpu_to_le32(0x00008000),
+	USN_REASON_HARD_LINK_CHANGE	= cpu_to_le32(0x00010000),
+	USN_REASON_COMPRESSION_CHANGE	= cpu_to_le32(0x00020000),
+	USN_REASON_ENCRYPTION_CHANGE	= cpu_to_le32(0x00040000),
+	USN_REASON_OBJECT_ID_CHANGE	= cpu_to_le32(0x00080000),
+	USN_REASON_REPARSE_POINT_CHANGE	= cpu_to_le32(0x00100000),
+	USN_REASON_STREAM_CHANGE	= cpu_to_le32(0x00200000),
+	USN_REASON_CLOSE		= cpu_to_le32(0x80000000),
+};
+
+typedef le32 USN_REASON_FLAGS;
+
+/*
+ * Source info flags (32-bit).  Information about the source of the change(s)
+ * to the file.  For detailed descriptions of what these mean, see the Linux
+ * NTFS project NTFS documentation:
+ *	http://www.linux-ntfs.org/
+ */
+enum {
+	USN_SOURCE_DATA_MANAGEMENT	  = cpu_to_le32(0x00000001),
+	USN_SOURCE_AUXILIARY_DATA	  = cpu_to_le32(0x00000002),
+	USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
+};
+
+typedef le32 USN_SOURCE_INFO_FLAGS;
+
+/*
+ * $DATA/$J attribute.  This is always non-resident, is marked as sparse, and
+ * is of variabled size.  It consists of a sequence of variable size
+ * USN_RECORDS.  The minimum allocated_size is allocation_delta as
+ * specified in $DATA/$Max.  When the maximum_size specified in $DATA/$Max is
+ * exceeded by more than allocation_delta bytes, allocation_delta bytes are
+ * allocated and appended to the $DATA/$J attribute and an equal number of
+ * bytes at the beginning of the attribute are freed and made sparse.  Note the
+ * making sparse only happens at volume checkpoints and hence the actual
+ * $DATA/$J size can exceed maximum_size + allocation_delta temporarily.
+ */
+typedef struct {
+/*Ofs*/
+/*   0*/le32 length;		/* Byte size of this record (8-byte
+				   aligned). */
+/*   4*/le16 major_ver;		/* Major version of the transaction log used
+				   for this record. */
+/*   6*/le16 minor_ver;		/* Minor version of the transaction log used
+				   for this record. */
+/*   8*/leMFT_REF mft_reference;/* The mft reference of the file (or
+				   directory) described by this record. */
+/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent
+				   directory of the file described by this
+				   record. */
+/*0x18*/leUSN usn;		/* The usn of this record.  Equals the offset
+				   within the $DATA/$J attribute. */
+/*0x20*/sle64 time;		/* Time when this record was created. */
+/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */
+/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */
+/*0x30*/le32 security_id;	/* File security_id copied from
+				   $STANDARD_INFORMATION. */
+/*0x34*/FILE_ATTR_FLAGS file_attributes;	/* File attributes copied from
+				   $STANDARD_INFORMATION or $FILE_NAME (not
+				   sure which). */
+/*0x38*/le16 file_name_size;	/* Size of the file name in bytes. */
+/*0x3a*/le16 file_name_offset;	/* Offset to the file name in bytes from the
+				   start of this record. */
+/*0x3c*/ntfschar file_name[0];	/* Use when creating only.  When reading use
+				   file_name_offset to determine the location
+				   of the name. */
+/* sizeof() = 60 (0x3c) bytes */
+} __attribute__ ((__packed__)) USN_RECORD;
+
+extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_USNJRNL_H */
diff --git a/kernel/fs/ntfs/volume.h b/kernel/fs/ntfs/volume.h
new file mode 100644
index 000000000..4f579b02b
--- /dev/null
+++ b/kernel/fs/ntfs/volume.h
@@ -0,0 +1,178 @@
+/*
+ * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
+ *	      of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2006 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_VOLUME_H
+#define _LINUX_NTFS_VOLUME_H
+
+#include <linux/rwsem.h>
+#include <linux/uidgid.h>
+
+#include "types.h"
+#include "layout.h"
+
+/*
+ * The NTFS in memory super block structure.
+ */
+typedef struct {
+	/*
+	 * FIXME: Reorder to have commonly used together element within the
+	 * same cache line, aiming at a cache line size of 32 bytes. Aim for
+	 * 64 bytes for less commonly used together elements. Put most commonly
+	 * used elements to front of structure. Obviously do this only when the
+	 * structure has stabilized... (AIA)
+	 */
+	/* Device specifics. */
+	struct super_block *sb;		/* Pointer back to the super_block. */
+	LCN nr_blocks;			/* Number of sb->s_blocksize bytes
+					   sized blocks on the device. */
+	/* Configuration provided by user at mount time. */
+	unsigned long flags;		/* Miscellaneous flags, see below. */
+	kuid_t uid;			/* uid that files will be mounted as. */
+	kgid_t gid;			/* gid that files will be mounted as. */
+	umode_t fmask;			/* The mask for file permissions. */
+	umode_t dmask;			/* The mask for directory
+					   permissions. */
+	u8 mft_zone_multiplier;		/* Initial mft zone multiplier. */
+	u8 on_errors;			/* What to do on filesystem errors. */
+	/* NTFS bootsector provided information. */
+	u16 sector_size;		/* in bytes */
+	u8 sector_size_bits;		/* log2(sector_size) */
+	u32 cluster_size;		/* in bytes */
+	u32 cluster_size_mask;		/* cluster_size - 1 */
+	u8 cluster_size_bits;		/* log2(cluster_size) */
+	u32 mft_record_size;		/* in bytes */
+	u32 mft_record_size_mask;	/* mft_record_size - 1 */
+	u8 mft_record_size_bits;	/* log2(mft_record_size) */
+	u32 index_record_size;		/* in bytes */
+	u32 index_record_size_mask;	/* index_record_size - 1 */
+	u8 index_record_size_bits;	/* log2(index_record_size) */
+	LCN nr_clusters;		/* Volume size in clusters == number of
+					   bits in lcn bitmap. */
+	LCN mft_lcn;			/* Cluster location of mft data. */
+	LCN mftmirr_lcn;		/* Cluster location of copy of mft. */
+	u64 serial_no;			/* The volume serial number. */
+	/* Mount specific NTFS information. */
+	u32 upcase_len;			/* Number of entries in upcase[]. */
+	ntfschar *upcase;		/* The upcase table. */
+
+	s32 attrdef_size;		/* Size of the attribute definition
+					   table in bytes. */
+	ATTR_DEF *attrdef;		/* Table of attribute definitions.
+					   Obtained from FILE_AttrDef. */
+
+#ifdef NTFS_RW
+	/* Variables used by the cluster and mft allocators. */
+	s64 mft_data_pos;		/* Mft record number at which to
+					   allocate the next mft record. */
+	LCN mft_zone_start;		/* First cluster of the mft zone. */
+	LCN mft_zone_end;		/* First cluster beyond the mft zone. */
+	LCN mft_zone_pos;		/* Current position in the mft zone. */
+	LCN data1_zone_pos;		/* Current position in the first data
+					   zone. */
+	LCN data2_zone_pos;		/* Current position in the second data
+					   zone. */
+#endif /* NTFS_RW */
+
+	struct inode *mft_ino;		/* The VFS inode of $MFT. */
+
+	struct inode *mftbmp_ino;	/* Attribute inode for $MFT/$BITMAP. */
+	struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the
+					    mft record bitmap ($MFT/$BITMAP). */
+#ifdef NTFS_RW
+	struct inode *mftmirr_ino;	/* The VFS inode of $MFTMirr. */
+	int mftmirr_size;		/* Size of mft mirror in mft records. */
+
+	struct inode *logfile_ino;	/* The VFS inode of $LogFile. */
+#endif /* NTFS_RW */
+
+	struct inode *lcnbmp_ino;	/* The VFS inode of $Bitmap. */
+	struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the
+					    cluster bitmap ($Bitmap/$DATA). */
+
+	struct inode *vol_ino;		/* The VFS inode of $Volume. */
+	VOLUME_FLAGS vol_flags;		/* Volume flags. */
+	u8 major_ver;			/* Ntfs major version of volume. */
+	u8 minor_ver;			/* Ntfs minor version of volume. */
+
+	struct inode *root_ino;		/* The VFS inode of the root
+					   directory. */
+	struct inode *secure_ino;	/* The VFS inode of $Secure (NTFS3.0+
+					   only, otherwise NULL). */
+	struct inode *extend_ino;	/* The VFS inode of $Extend (NTFS3.0+
+					   only, otherwise NULL). */
+#ifdef NTFS_RW
+	/* $Quota stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
+	struct inode *quota_ino;	/* The VFS inode of $Quota. */
+	struct inode *quota_q_ino;	/* Attribute inode for $Quota/$Q. */
+	/* $UsnJrnl stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
+	struct inode *usnjrnl_ino;	/* The VFS inode of $UsnJrnl. */
+	struct inode *usnjrnl_max_ino;	/* Attribute inode for $UsnJrnl/$Max. */
+	struct inode *usnjrnl_j_ino;	/* Attribute inode for $UsnJrnl/$J. */
+#endif /* NTFS_RW */
+	struct nls_table *nls_map;
+} ntfs_volume;
+
+/*
+ * Defined bits for the flags field in the ntfs_volume structure.
+ */
+typedef enum {
+	NV_Errors,		/* 1: Volume has errors, prevent remount rw. */
+	NV_ShowSystemFiles,	/* 1: Return system files in ntfs_readdir(). */
+	NV_CaseSensitive,	/* 1: Treat file names as case sensitive and
+				      create filenames in the POSIX namespace.
+				      Otherwise be case insensitive but still
+				      create file names in POSIX namespace. */
+	NV_LogFileEmpty,	/* 1: $LogFile journal is empty. */
+	NV_QuotaOutOfDate,	/* 1: $Quota is out of date. */
+	NV_UsnJrnlStamped,	/* 1: $UsnJrnl has been stamped. */
+	NV_SparseEnabled,	/* 1: May create sparse files. */
+} ntfs_volume_flags;
+
+/*
+ * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
+ * functions.
+ */
+#define DEFINE_NVOL_BIT_OPS(flag)					\
+static inline int NVol##flag(ntfs_volume *vol)		\
+{							\
+	return test_bit(NV_##flag, &(vol)->flags);	\
+}							\
+static inline void NVolSet##flag(ntfs_volume *vol)	\
+{							\
+	set_bit(NV_##flag, &(vol)->flags);		\
+}							\
+static inline void NVolClear##flag(ntfs_volume *vol)	\
+{							\
+	clear_bit(NV_##flag, &(vol)->flags);		\
+}
+
+/* Emit the ntfs volume bitops functions. */
+DEFINE_NVOL_BIT_OPS(Errors)
+DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
+DEFINE_NVOL_BIT_OPS(CaseSensitive)
+DEFINE_NVOL_BIT_OPS(LogFileEmpty)
+DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
+DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
+DEFINE_NVOL_BIT_OPS(SparseEnabled)
+
+#endif /* _LINUX_NTFS_VOLUME_H */
diff --git a/kernel/fs/ocfs2/Kconfig b/kernel/fs/ocfs2/Kconfig
new file mode 100644
index 000000000..77a8de5f7
--- /dev/null
+++ b/kernel/fs/ocfs2/Kconfig
@@ -0,0 +1,76 @@
+config OCFS2_FS
+	tristate "OCFS2 file system support"
+	depends on NET && SYSFS && CONFIGFS_FS
+	select JBD2
+	select CRC32
+	select QUOTA
+	select QUOTA_TREE
+	select FS_POSIX_ACL
+	help
+	  OCFS2 is a general purpose extent based shared disk cluster file
+	  system with many similarities to ext3. It supports 64 bit inode
+	  numbers, and has automatically extending metadata groups which may
+	  also make it attractive for non-clustered use.
+
+	  You'll want to install the ocfs2-tools package in order to at least
+	  get "mount.ocfs2".
+
+	  Project web page:    http://oss.oracle.com/projects/ocfs2
+	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+	  For more information on OCFS2, see the file
+	  <file:Documentation/filesystems/ocfs2.txt>.
+
+config OCFS2_FS_O2CB
+	tristate "O2CB Kernelspace Clustering"
+	depends on OCFS2_FS
+	default y
+	help
+	  OCFS2 includes a simple kernelspace clustering package, the OCFS2
+	  Cluster Base.  It only requires a very small userspace component
+	  to configure it. This comes with the standard ocfs2-tools package.
+	  O2CB is limited to maintaining a cluster for OCFS2 file systems.
+	  It cannot manage any other cluster applications.
+
+	  It is always safe to say Y here, as the clustering method is
+	  run-time selectable.
+
+config OCFS2_FS_USERSPACE_CLUSTER
+	tristate "OCFS2 Userspace Clustering"
+	depends on OCFS2_FS && DLM
+	default y
+	help
+	  This option will allow OCFS2 to use userspace clustering services
+	  in conjunction with the DLM in fs/dlm.  If you are using a
+	  userspace cluster manager, say Y here.
+
+	  It is safe to say Y, as the clustering method is run-time
+	  selectable.
+
+config OCFS2_FS_STATS
+	bool "OCFS2 statistics"
+	depends on OCFS2_FS && DEBUG_FS
+	default y
+	help
+	  This option allows some fs statistics to be captured. Enabling
+	  this option may increase the memory consumption.
+
+config OCFS2_DEBUG_MASKLOG
+	bool "OCFS2 logging support"
+	depends on OCFS2_FS
+	default y
+	help
+	  The ocfs2 filesystem has an extensive logging system.  The system
+	  allows selection of events to log via files in /sys/o2cb/logmask/.
+	  This option will enlarge your kernel, but it allows debugging of
+	  ocfs2 filesystem issues.
+
+config OCFS2_DEBUG_FS
+	bool "OCFS2 expensive checks"
+	depends on OCFS2_FS
+	default n
+	help
+	  This option will enable expensive consistency checks. Enable
+	  this option for debugging only as it is likely to decrease
+	  performance of the filesystem.
diff --git a/kernel/fs/ocfs2/Makefile b/kernel/fs/ocfs2/Makefile
new file mode 100644
index 000000000..ce210d495
--- /dev/null
+++ b/kernel/fs/ocfs2/Makefile
@@ -0,0 +1,53 @@
+ccflags-y := -Ifs/ocfs2
+
+ccflags-y += -DCATCH_BH_JBD_RACES
+
+obj-$(CONFIG_OCFS2_FS) += 	\
+	ocfs2.o			\
+	ocfs2_stackglue.o
+
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
+obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
+
+ocfs2-objs := \
+	alloc.o 		\
+	aops.o 			\
+	blockcheck.o		\
+	buffer_head_io.o	\
+	dcache.o 		\
+	dir.o 			\
+	dlmglue.o 		\
+	export.o 		\
+	extent_map.o 		\
+	file.o 			\
+	heartbeat.o 		\
+	inode.o 		\
+	ioctl.o 		\
+	journal.o 		\
+	localalloc.o 		\
+	locks.o			\
+	mmap.o 			\
+	namei.o 		\
+	refcounttree.o		\
+	reservations.o		\
+	move_extents.o		\
+	resize.o		\
+	slot_map.o 		\
+	suballoc.o 		\
+	super.o 		\
+	symlink.o 		\
+	sysfile.o 		\
+	uptodate.o		\
+	quota_local.o		\
+	quota_global.o		\
+	xattr.o			\
+	acl.o
+
+ocfs2_stackglue-objs := stackglue.o
+ocfs2_stack_o2cb-objs := stack_o2cb.o
+ocfs2_stack_user-objs := stack_user.o
+
+obj-$(CONFIG_OCFS2_FS) += dlmfs/
+# cluster/ is always needed when OCFS2_FS for masklog support
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/kernel/fs/ocfs2/acl.c b/kernel/fs/ocfs2/acl.c
new file mode 100644
index 000000000..c58a1bcfd
--- /dev/null
+++ b/kernel/fs/ocfs2/acl.c
@@ -0,0 +1,310 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct posix_acl_entry))
+		return ERR_PTR(-EINVAL);
+
+	count = size / sizeof(struct posix_acl_entry);
+
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		struct ocfs2_acl_entry *entry =
+			(struct ocfs2_acl_entry *)value;
+
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		switch(acl->a_entries[n].e_tag) {
+		case ACL_USER:
+			acl->a_entries[n].e_uid =
+				make_kuid(&init_user_ns,
+					  le32_to_cpu(entry->e_id));
+			break;
+		case ACL_GROUP:
+			acl->a_entries[n].e_gid =
+				make_kgid(&init_user_ns,
+					  le32_to_cpu(entry->e_id));
+			break;
+		default:
+			break;
+		}
+		value += sizeof(struct posix_acl_entry);
+
+	}
+	return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+	struct ocfs2_acl_entry *entry = NULL;
+	char *ocfs2_acl;
+	size_t n;
+
+	*size = acl->a_count * sizeof(struct posix_acl_entry);
+
+	ocfs2_acl = kmalloc(*size, GFP_NOFS);
+	if (!ocfs2_acl)
+		return ERR_PTR(-ENOMEM);
+
+	entry = (struct ocfs2_acl_entry *)ocfs2_acl;
+	for (n = 0; n < acl->a_count; n++, entry++) {
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		switch(acl->a_entries[n].e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+				from_kuid(&init_user_ns,
+					  acl->a_entries[n].e_uid));
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+				from_kgid(&init_user_ns,
+					  acl->a_entries[n].e_gid));
+			break;
+		default:
+			entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+			break;
+		}
+	}
+	return ocfs2_acl;
+}
+
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+					      int type,
+					      struct buffer_head *di_bh)
+{
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+						"", value, retval);
+	}
+
+	if (retval > 0)
+		acl = ocfs2_acl_from_xattr(value, retval);
+	else if (retval == -ENODATA || retval == 0)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+
+	kfree(value);
+
+	return acl;
+}
+
+/*
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create it's own.
+ */
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+			      handle_t *handle, umode_t new_mode)
+{
+	int ret, commit_handle = 0;
+	struct ocfs2_dinode *di;
+
+	if (di_bh == NULL) {
+		ret = ocfs2_read_inode_block(inode, &di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else
+		get_bh(di_bh);
+
+	if (handle == NULL) {
+		handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+					   OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out_brelse;
+		}
+
+		commit_handle = 1;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	inode->i_mode = new_mode;
+	inode->i_ctime = CURRENT_TIME;
+	di->i_mode = cpu_to_le16(inode->i_mode);
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	if (commit_handle)
+		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
+	brelse(di_bh);
+out:
+	return ret;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ */
+int ocfs2_set_acl(handle_t *handle,
+			 struct inode *inode,
+			 struct buffer_head *di_bh,
+			 int type,
+			 struct posix_acl *acl,
+			 struct ocfs2_alloc_context *meta_ac,
+			 struct ocfs2_alloc_context *data_ac)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			umode_t mode = inode->i_mode;
+			ret = posix_acl_equiv_mode(acl, &mode);
+			if (ret < 0)
+				return ret;
+
+			if (ret == 0)
+				acl = NULL;
+
+			ret = ocfs2_acl_set_mode(inode, di_bh,
+						 handle, mode);
+			if (ret)
+				return ret;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = ocfs2_acl_to_xattr(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	if (handle)
+		ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+					     "", value, size, 0,
+					     meta_ac, data_ac);
+	else
+		ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+	kfree(value);
+
+	return ret;
+}
+
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+}
+
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
+{
+	struct ocfs2_super *osb;
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int ret = -EAGAIN;
+
+	osb = OCFS2_SB(inode->i_sb);
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+	brelse(di_bh);
+
+	return acl;
+}
diff --git a/kernel/fs/ocfs2/acl.h b/kernel/fs/ocfs2/acl.h
new file mode 100644
index 000000000..3fce68d08
--- /dev/null
+++ b/kernel/fs/ocfs2/acl.h
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+struct ocfs2_acl_entry {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+};
+
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_set_acl(handle_t *handle,
+			 struct inode *inode,
+			 struct buffer_head *di_bh,
+			 int type,
+			 struct posix_acl *acl,
+			 struct ocfs2_alloc_context *meta_ac,
+			 struct ocfs2_alloc_context *data_ac);
+
+#endif /* OCFS2_ACL_H */
diff --git a/kernel/fs/ocfs2/alloc.c b/kernel/fs/ocfs2/alloc.c
new file mode 100644
index 000000000..2d7f76e52
--- /dev/null
+++ b/kernel/fs/ocfs2/alloc.c
@@ -0,0 +1,7405 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/quotaops.h>
+#include <linux/blkdev.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "blockcheck.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+#include "xattr.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+enum ocfs2_contig_type {
+	CONTIG_NONE = 0,
+	CONTIG_LEFT,
+	CONTIG_RIGHT,
+	CONTIG_LEFTRIGHT,
+};
+
+static enum ocfs2_contig_type
+	ocfs2_extent_rec_contig(struct super_block *sb,
+				struct ocfs2_extent_rec *ext,
+				struct ocfs2_extent_rec *insert_rec);
+/*
+ * Operations for a specific extent tree type.
+ *
+ * To implement an on-disk btree (extent tree) type in ocfs2, add
+ * an ocfs2_extent_tree_operations structure and the matching
+ * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
+ * for the allocation portion of the extent tree.
+ */
+struct ocfs2_extent_tree_operations {
+	/*
+	 * last_eb_blk is the block number of the right most leaf extent
+	 * block.  Most on-disk structures containing an extent tree store
+	 * this value for fast access.  The ->eo_set_last_eb_blk() and
+	 * ->eo_get_last_eb_blk() operations access this value.  They are
+	 *  both required.
+	 */
+	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
+				   u64 blkno);
+	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * The on-disk structure usually keeps track of how many total
+	 * clusters are stored in this extent tree.  This function updates
+	 * that value.  new_clusters is the delta, and must be
+	 * added to the total.  Required.
+	 */
+	void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
+				   u32 new_clusters);
+
+	/*
+	 * If this extent tree is supported by an extent map, insert
+	 * a record into the map.
+	 */
+	void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec);
+
+	/*
+	 * If this extent tree is supported by an extent map, truncate the
+	 * map to clusters,
+	 */
+	void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
+				       u32 clusters);
+
+	/*
+	 * If ->eo_insert_check() exists, it is called before rec is
+	 * inserted into the extent tree.  It is optional.
+	 */
+	int (*eo_insert_check)(struct ocfs2_extent_tree *et,
+			       struct ocfs2_extent_rec *rec);
+	int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * --------------------------------------------------------------
+	 * The remaining are internal to ocfs2_extent_tree and don't have
+	 * accessor functions
+	 */
+
+	/*
+	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
+	 * It is required.
+	 */
+	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
+	 * it exists.  If it does not, et->et_max_leaf_clusters is set
+	 * to 0 (unlimited).  Optional.
+	 */
+	void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
+	 * are contiguous or not. Optional. Don't need to set it if use
+	 * ocfs2_extent_rec as the tree leaf.
+	 */
+	enum ocfs2_contig_type
+		(*eo_extent_contig)(struct ocfs2_extent_tree *et,
+				    struct ocfs2_extent_rec *ext,
+				    struct ocfs2_extent_rec *insert_rec);
+};
+
+
+/*
+ * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
+ * in the methods.
+ */
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 blkno);
+static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
+					 u32 clusters);
+static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
+					   struct ocfs2_extent_rec *rec);
+static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
+					     u32 clusters);
+static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec);
+static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dinode_update_clusters,
+	.eo_extent_map_insert	= ocfs2_dinode_extent_map_insert,
+	.eo_extent_map_truncate	= ocfs2_dinode_extent_map_truncate,
+	.eo_insert_check	= ocfs2_dinode_insert_check,
+	.eo_sanity_check	= ocfs2_dinode_sanity_check,
+	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
+};
+
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 blkno)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	di->i_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	return le64_to_cpu(di->i_last_eb_blk);
+}
+
+static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
+					 u32 clusters)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
+	struct ocfs2_dinode *di = et->et_object;
+
+	le32_add_cpu(&di->i_clusters, clusters);
+	spin_lock(&oi->ip_lock);
+	oi->ip_clusters = le32_to_cpu(di->i_clusters);
+	spin_unlock(&oi->ip_lock);
+}
+
+static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
+					   struct ocfs2_extent_rec *rec)
+{
+	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
+
+	ocfs2_extent_map_insert_rec(inode, rec);
+}
+
+static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
+					     u32 clusters)
+{
+	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
+
+	ocfs2_extent_map_trunc(inode, clusters);
+}
+
+static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
+	struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
+
+	BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+			(oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
+			"Device %s, asking for sparse allocation: inode %llu, "
+			"cpos %u, clusters %u\n",
+			osb->dev_str,
+			(unsigned long long)oi->ip_blkno,
+			rec->e_cpos, oi->ip_clusters);
+
+	return 0;
+}
+
+static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	return 0;
+}
+
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	et->et_root_el = &di->id2.i_list;
+}
+
+
+static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
+
+	et->et_root_el = &vb->vb_xv->xr_list;
+}
+
+static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					      u64 blkno)
+{
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
+
+	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
+
+	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
+}
+
+static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
+					      u32 clusters)
+{
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
+
+	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
+	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
+};
+
+static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+
+	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
+}
+
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
+{
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	et->et_max_leaf_clusters =
+		ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+}
+
+static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					     u64 blkno)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	xt->xt_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	return le64_to_cpu(xt->xt_last_eb_blk);
+}
+
+static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
+					     u32 clusters)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
+	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
+	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
+};
+
+static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					  u64 blkno)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	return le64_to_cpu(dx_root->dr_last_eb_blk);
+}
+
+static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
+					  u32 clusters)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	le32_add_cpu(&dx_root->dr_clusters, clusters);
+}
+
+static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
+
+	return 0;
+}
+
+static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	et->et_root_el = &dx_root->dr_list;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
+	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
+	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
+};
+
+static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	et->et_root_el = &rb->rf_list;
+}
+
+static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+						u64 blkno)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	rb->rf_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	return le64_to_cpu(rb->rf_last_eb_blk);
+}
+
+static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
+						u32 clusters)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	le32_add_cpu(&rb->rf_clusters, clusters);
+}
+
+static enum ocfs2_contig_type
+ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
+				  struct ocfs2_extent_rec *ext,
+				  struct ocfs2_extent_rec *insert_rec)
+{
+	return CONTIG_NONE;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_refcount_tree_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_refcount_tree_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_refcount_tree_update_clusters,
+	.eo_fill_root_el	= ocfs2_refcount_tree_fill_root_el,
+	.eo_extent_contig	= ocfs2_refcount_tree_extent_contig,
+};
+
+static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *bh,
+				     ocfs2_journal_access_func access,
+				     void *obj,
+				     struct ocfs2_extent_tree_operations *ops)
+{
+	et->et_ops = ops;
+	et->et_root_bh = bh;
+	et->et_ci = ci;
+	et->et_root_journal_access = access;
+	if (!obj)
+		obj = (void *)bh->b_data;
+	et->et_object = obj;
+
+	et->et_ops->eo_fill_root_el(et);
+	if (!et->et_ops->eo_fill_max_leaf_clusters)
+		et->et_max_leaf_clusters = 0;
+	else
+		et->et_ops->eo_fill_max_leaf_clusters(et);
+}
+
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				   struct ocfs2_caching_info *ci,
+				   struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
+				 NULL, &ocfs2_dinode_et_ops);
+}
+
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+				       struct ocfs2_caching_info *ci,
+				       struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
+				 NULL, &ocfs2_xattr_tree_et_ops);
+}
+
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					struct ocfs2_caching_info *ci,
+					struct ocfs2_xattr_value_buf *vb)
+{
+	__ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
+				 &ocfs2_xattr_value_et_ops);
+}
+
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+				    struct ocfs2_caching_info *ci,
+				    struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
+				 NULL, &ocfs2_dx_root_et_ops);
+}
+
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
+				 NULL, &ocfs2_refcount_tree_et_ops);
+}
+
+static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					    u64 new_last_eb_blk)
+{
+	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
+}
+
+static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	return et->et_ops->eo_get_last_eb_blk(et);
+}
+
+static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
+					    u32 clusters)
+{
+	et->et_ops->eo_update_clusters(et, clusters);
+}
+
+static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
+					      struct ocfs2_extent_rec *rec)
+{
+	if (et->et_ops->eo_extent_map_insert)
+		et->et_ops->eo_extent_map_insert(et, rec);
+}
+
+static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
+						u32 clusters)
+{
+	if (et->et_ops->eo_extent_map_truncate)
+		et->et_ops->eo_extent_map_truncate(et, clusters);
+}
+
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+					       struct ocfs2_extent_tree *et,
+					       int type)
+{
+	return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
+					  type);
+}
+
+static inline enum ocfs2_contig_type
+	ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
+			       struct ocfs2_extent_rec *rec,
+			       struct ocfs2_extent_rec *insert_rec)
+{
+	if (et->et_ops->eo_extent_contig)
+		return et->et_ops->eo_extent_contig(et, rec, insert_rec);
+
+	return ocfs2_extent_rec_contig(
+				ocfs2_metadata_cache_get_super(et->et_ci),
+				rec, insert_rec);
+}
+
+static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
+					struct ocfs2_extent_rec *rec)
+{
+	int ret = 0;
+
+	if (et->et_ops->eo_insert_check)
+		ret = et->et_ops->eo_insert_check(et, rec);
+	return ret;
+}
+
+static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
+{
+	int ret = 0;
+
+	if (et->et_ops->eo_sanity_check)
+		ret = et->et_ops->eo_sanity_check(et);
+	return ret;
+}
+
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+					 struct ocfs2_extent_block *eb);
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+					   struct ocfs2_extent_tree *et,
+					   struct ocfs2_path *path,
+					   struct ocfs2_extent_rec *insert_rec);
+/*
+ * Reset the actual path elements so that we can re-use the structure
+ * to build another path. Generally, this involves freeing the buffer
+ * heads.
+ */
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+{
+	int i, start = 0, depth = 0;
+	struct ocfs2_path_item *node;
+
+	if (keep_root)
+		start = 1;
+
+	for(i = start; i < path_num_items(path); i++) {
+		node = &path->p_node[i];
+
+		brelse(node->bh);
+		node->bh = NULL;
+		node->el = NULL;
+	}
+
+	/*
+	 * Tree depth may change during truncate, or insert. If we're
+	 * keeping the root extent list, then make sure that our path
+	 * structure reflects the proper depth.
+	 */
+	if (keep_root)
+		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+	else
+		path_root_access(path) = NULL;
+
+	path->p_tree_depth = depth;
+}
+
+void ocfs2_free_path(struct ocfs2_path *path)
+{
+	if (path) {
+		ocfs2_reinit_path(path, 0);
+		kfree(path);
+	}
+}
+
+/*
+ * All the elements of src into dest. After this call, src could be freed
+ * without affecting dest.
+ *
+ * Both paths should have the same root. Any non-root elements of dest
+ * will be freed.
+ */
+static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+	int i;
+
+	BUG_ON(path_root_bh(dest) != path_root_bh(src));
+	BUG_ON(path_root_el(dest) != path_root_el(src));
+	BUG_ON(path_root_access(dest) != path_root_access(src));
+
+	ocfs2_reinit_path(dest, 1);
+
+	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+		dest->p_node[i].bh = src->p_node[i].bh;
+		dest->p_node[i].el = src->p_node[i].el;
+
+		if (dest->p_node[i].bh)
+			get_bh(dest->p_node[i].bh);
+	}
+}
+
+/*
+ * Make the *dest path the same as src and re-initialize src path to
+ * have a root only.
+ */
+static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+	int i;
+
+	BUG_ON(path_root_bh(dest) != path_root_bh(src));
+	BUG_ON(path_root_access(dest) != path_root_access(src));
+
+	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+		brelse(dest->p_node[i].bh);
+
+		dest->p_node[i].bh = src->p_node[i].bh;
+		dest->p_node[i].el = src->p_node[i].el;
+
+		src->p_node[i].bh = NULL;
+		src->p_node[i].el = NULL;
+	}
+}
+
+/*
+ * Insert an extent block at given index.
+ *
+ * This will not take an additional reference on eb_bh.
+ */
+static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
+					struct buffer_head *eb_bh)
+{
+	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+
+	/*
+	 * Right now, no root bh is an extent block, so this helps
+	 * catch code errors with dinode trees. The assertion can be
+	 * safely removed if we ever need to insert extent block
+	 * structures at the root.
+	 */
+	BUG_ON(index == 0);
+
+	path->p_node[index].bh = eb_bh;
+	path->p_node[index].el = &eb->h_list;
+}
+
+static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
+					 struct ocfs2_extent_list *root_el,
+					 ocfs2_journal_access_func access)
+{
+	struct ocfs2_path *path;
+
+	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
+
+	path = kzalloc(sizeof(*path), GFP_NOFS);
+	if (path) {
+		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
+		get_bh(root_bh);
+		path_root_bh(path) = root_bh;
+		path_root_el(path) = root_el;
+		path_root_access(path) = access;
+	}
+
+	return path;
+}
+
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{
+	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
+			      path_root_access(path));
+}
+
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{
+	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
+			      et->et_root_journal_access);
+}
+
+/*
+ * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
+ * otherwise it's the root_access function.
+ *
+ * I don't like the way this function's name looks next to
+ * ocfs2_journal_access_path(), but I don't have a better one.
+ */
+int ocfs2_path_bh_journal_access(handle_t *handle,
+				 struct ocfs2_caching_info *ci,
+				 struct ocfs2_path *path,
+				 int idx)
+{
+	ocfs2_journal_access_func access = path_root_access(path);
+
+	if (!access)
+		access = ocfs2_journal_access;
+
+	if (idx)
+		access = ocfs2_journal_access_eb;
+
+	return access(handle, ci, path->p_node[idx].bh,
+		      OCFS2_JOURNAL_ACCESS_WRITE);
+}
+
+/*
+ * Convenience function to journal all components in a path.
+ */
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+			      handle_t *handle,
+			      struct ocfs2_path *path)
+{
+	int i, ret = 0;
+
+	if (!path)
+		goto out;
+
+	for(i = 0; i < path_num_items(path); i++) {
+		ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
+{
+	int ret = -1;
+	int i;
+	struct ocfs2_extent_rec *rec;
+	u32 rec_end, rec_start, clusters;
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+
+		rec_start = le32_to_cpu(rec->e_cpos);
+		clusters = ocfs2_rec_clusters(el, rec);
+
+		rec_end = rec_start + clusters;
+
+		if (v_cluster >= rec_start && v_cluster < rec_end) {
+			ret = i;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_rec_contig only work properly against leaf nodes!
+ */
+static int ocfs2_block_extent_contig(struct super_block *sb,
+				     struct ocfs2_extent_rec *ext,
+				     u64 blkno)
+{
+	u64 blk_end = le64_to_cpu(ext->e_blkno);
+
+	blk_end += ocfs2_clusters_to_blocks(sb,
+				    le16_to_cpu(ext->e_leaf_clusters));
+
+	return blkno == blk_end;
+}
+
+static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
+				  struct ocfs2_extent_rec *right)
+{
+	u32 left_range;
+
+	left_range = le32_to_cpu(left->e_cpos) +
+		le16_to_cpu(left->e_leaf_clusters);
+
+	return (left_range == le32_to_cpu(right->e_cpos));
+}
+
+static enum ocfs2_contig_type
+	ocfs2_extent_rec_contig(struct super_block *sb,
+				struct ocfs2_extent_rec *ext,
+				struct ocfs2_extent_rec *insert_rec)
+{
+	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
+
+	/*
+	 * Refuse to coalesce extent records with different flag
+	 * fields - we don't want to mix unwritten extents with user
+	 * data.
+	 */
+	if (ext->e_flags != insert_rec->e_flags)
+		return CONTIG_NONE;
+
+	if (ocfs2_extents_adjacent(ext, insert_rec) &&
+	    ocfs2_block_extent_contig(sb, ext, blkno))
+			return CONTIG_RIGHT;
+
+	blkno = le64_to_cpu(ext->e_blkno);
+	if (ocfs2_extents_adjacent(insert_rec, ext) &&
+	    ocfs2_block_extent_contig(sb, insert_rec, blkno))
+		return CONTIG_LEFT;
+
+	return CONTIG_NONE;
+}
+
+/*
+ * NOTE: We can have pretty much any combination of contiguousness and
+ * appending.
+ *
+ * The usefulness of APPEND_TAIL is more in that it lets us know that
+ * we'll have to update the path to that leaf.
+ */
+enum ocfs2_append_type {
+	APPEND_NONE = 0,
+	APPEND_TAIL,
+};
+
+enum ocfs2_split_type {
+	SPLIT_NONE = 0,
+	SPLIT_LEFT,
+	SPLIT_RIGHT,
+};
+
+struct ocfs2_insert_type {
+	enum ocfs2_split_type	ins_split;
+	enum ocfs2_append_type	ins_appending;
+	enum ocfs2_contig_type	ins_contig;
+	int			ins_contig_index;
+	int			ins_tree_depth;
+};
+
+struct ocfs2_merge_ctxt {
+	enum ocfs2_contig_type	c_contig_type;
+	int			c_has_empty_extent;
+	int			c_split_covers_rec;
+};
+
+static int ocfs2_validate_extent_block(struct super_block *sb,
+				       struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_extent_block *eb =
+		(struct ocfs2_extent_block *)bh->b_data;
+
+	trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return rc;
+	}
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has bad signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    eb->h_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has an invalid h_blkno "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(eb->h_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has an invalid "
+			    "h_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(eb->h_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
+			    struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(ci, eb_blkno, &tmp,
+			      ocfs2_validate_extent_block);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+
+/*
+ * How many free extents have we got before we need more meta data?
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct ocfs2_extent_tree *et)
+{
+	int retval;
+	struct ocfs2_extent_list *el = NULL;
+	struct ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL;
+	u64 last_eb_blk = 0;
+
+	el = et->et_root_el;
+	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
+
+	if (last_eb_blk) {
+		retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
+						 &eb_bh);
+		if (retval < 0) {
+			mlog_errno(retval);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	}
+
+	BUG_ON(el->l_tree_depth != 0);
+
+	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+	brelse(eb_bh);
+
+	trace_ocfs2_num_free_extents(retval);
+	return retval;
+}
+
+/* expects array to already be allocated
+ *
+ * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
+ * l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(handle_t *handle,
+				     struct ocfs2_extent_tree *et,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[])
+{
+	int count, status, i;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 suballoc_loc, first_blkno;
+	struct ocfs2_super *osb =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+	struct ocfs2_extent_block *eb;
+
+	count = 0;
+	while (count < wanted) {
+		status = ocfs2_claim_metadata(handle,
+					      meta_ac,
+					      wanted - count,
+					      &suballoc_loc,
+					      &suballoc_bit_start,
+					      &num_got,
+					      &first_blkno);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = count;  i < (num_got + count); i++) {
+			bhs[i] = sb_getblk(osb->sb, first_blkno);
+			if (bhs[i] == NULL) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto bail;
+			}
+			ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
+
+			status = ocfs2_journal_access_eb(handle, et->et_ci,
+							 bhs[i],
+							 OCFS2_JOURNAL_ACCESS_CREATE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+			/* Ok, setup the minimal stuff here. */
+			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+			eb->h_blkno = cpu_to_le64(first_blkno);
+			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+			eb->h_suballoc_slot =
+				cpu_to_le16(meta_ac->ac_alloc_slot);
+			eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
+			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+			eb->h_list.l_count =
+				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+			suballoc_bit_start++;
+			first_blkno++;
+
+			/* We'll also be dirtied by the caller, so
+			 * this isn't absolutely necessary. */
+			ocfs2_journal_dirty(handle, bhs[i]);
+		}
+
+		count += num_got;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		for(i = 0; i < wanted; i++) {
+			brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+		mlog_errno(status);
+	}
+	return status;
+}
+
+/*
+ * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
+ *
+ * Returns the sum of the rightmost extent rec logical offset and
+ * cluster count.
+ *
+ * ocfs2_add_branch() uses this to determine what logical cluster
+ * value should be populated into the leftmost new branch records.
+ *
+ * ocfs2_shift_tree_depth() uses this to determine the # clusters
+ * value for the new topmost tree record.
+ */
+static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
+{
+	int i;
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+	return le32_to_cpu(el->l_recs[i].e_cpos) +
+		ocfs2_rec_clusters(el, &el->l_recs[i]);
+}
+
+/*
+ * Change range of the branches in the right most path according to the leaf
+ * extent block's rightmost record.
+ */
+static int ocfs2_adjust_rightmost_branch(handle_t *handle,
+					 struct ocfs2_extent_tree *et)
+{
+	int status;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		status = -ENOMEM;
+		return status;
+	}
+
+	status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_extend_trans(handle, path_num_items(path));
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_path(et->et_ci, handle, path);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
+
+	ocfs2_adjust_rightmost_records(handle, et, path, rec);
+
+out:
+	ocfs2_free_path(path);
+	return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the root
+ * structure.
+ *
+ * last_eb_bh is required as we have to update it's next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with cluster count == 0.
+ */
+static int ocfs2_add_branch(handle_t *handle,
+			    struct ocfs2_extent_tree *et,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head **last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac)
+{
+	int status, new_blocks, i;
+	u64 next_blkno, new_last_eb_blk;
+	struct buffer_head *bh;
+	struct buffer_head **new_eb_bhs = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *eb_el;
+	struct ocfs2_extent_list  *el;
+	u32 new_cpos, root_end;
+
+	BUG_ON(!last_eb_bh || !*last_eb_bh);
+
+	if (eb_bh) {
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = et->et_root_el;
+
+	/* we never add a branch to a leaf. */
+	BUG_ON(!el->l_tree_depth);
+
+	new_blocks = le16_to_cpu(el->l_tree_depth);
+
+	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
+
+	/*
+	 * If there is a gap before the root end and the real end
+	 * of the righmost leaf block, we need to remove the gap
+	 * between new_cpos and root_end first so that the tree
+	 * is consistent after we add a new branch(it will start
+	 * from new_cpos).
+	 */
+	if (root_end > new_cpos) {
+		trace_ocfs2_adjust_rightmost_branch(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			root_end, new_cpos);
+
+		status = ocfs2_adjust_rightmost_branch(handle, et);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* allocate the number of new eb blocks we need */
+	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+			     GFP_KERNEL);
+	if (!new_eb_bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
+					   meta_ac, new_eb_bhs);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+	 * linked with the rest of the tree.
+	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
+	 *
+	 * when we leave the loop, new_last_eb_blk will point to the
+	 * newest leaf, and next_blkno will point to the topmost extent
+	 * block. */
+	next_blkno = new_last_eb_blk = 0;
+	for(i = 0; i < new_blocks; i++) {
+		bh = new_eb_bhs[i];
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		/* ocfs2_create_new_meta_bhs() should create it right! */
+		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
+		eb_el = &eb->h_list;
+
+		status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
+						 OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb->h_next_leaf_blk = 0;
+		eb_el->l_tree_depth = cpu_to_le16(i);
+		eb_el->l_next_free_rec = cpu_to_le16(1);
+		/*
+		 * This actually counts as an empty extent as
+		 * c_clusters == 0
+		 */
+		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
+		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+		/*
+		 * eb_el isn't always an interior node, but even leaf
+		 * nodes want a zero'd flags and reserved field so
+		 * this gets the whole 32 bits regardless of use.
+		 */
+		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
+		if (!eb_el->l_tree_depth)
+			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+		ocfs2_journal_dirty(handle, bh);
+		next_blkno = le64_to_cpu(eb->h_blkno);
+	}
+
+	/* This is a bit hairy. We want to update up to three blocks
+	 * here without leaving any of them in an inconsistent state
+	 * in case of error. We don't have to worry about
+	 * journal_dirty erroring as it won't unless we've aborted the
+	 * handle (in which case we would never be here) so reserving
+	 * the write with journal_access is all we need to do. */
+	status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = ocfs2_et_root_journal_access(handle, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (eb_bh) {
+		status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Link the new branch into the rest of the tree (el will
+	 * either be on the root_bh, or the extent block passed in. */
+	i = le16_to_cpu(el->l_next_free_rec);
+	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+	el->l_recs[i].e_int_clusters = 0;
+	le16_add_cpu(&el->l_next_free_rec, 1);
+
+	/* fe needs a new last extent block pointer, as does the
+	 * next_leaf on the previously last-extent-block. */
+	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
+
+	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
+	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+	ocfs2_journal_dirty(handle, *last_eb_bh);
+	ocfs2_journal_dirty(handle, et->et_root_bh);
+	if (eb_bh)
+		ocfs2_journal_dirty(handle, eb_bh);
+
+	/*
+	 * Some callers want to track the rightmost leaf so pass it
+	 * back here.
+	 */
+	brelse(*last_eb_bh);
+	get_bh(new_eb_bhs[0]);
+	*last_eb_bh = new_eb_bhs[0];
+
+	status = 0;
+bail:
+	if (new_eb_bhs) {
+		for (i = 0; i < new_blocks; i++)
+			brelse(new_eb_bhs[i]);
+		kfree(new_eb_bhs);
+	}
+
+	return status;
+}
+
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
+static int ocfs2_shift_tree_depth(handle_t *handle,
+				  struct ocfs2_extent_tree *et,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh)
+{
+	int status, i;
+	u32 new_clusters;
+	struct buffer_head *new_eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *root_el;
+	struct ocfs2_extent_list  *eb_el;
+
+	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+					   &new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+	/* ocfs2_create_new_meta_bhs() should create it right! */
+	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
+
+	eb_el = &eb->h_list;
+	root_el = et->et_root_el;
+
+	status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* copy the root extent list data into the new extent block */
+	eb_el->l_tree_depth = root_el->l_tree_depth;
+	eb_el->l_next_free_rec = root_el->l_next_free_rec;
+	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = root_el->l_recs[i];
+
+	ocfs2_journal_dirty(handle, new_eb_bh);
+
+	status = ocfs2_et_root_journal_access(handle, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
+
+	/* update root_bh now */
+	le16_add_cpu(&root_el->l_tree_depth, 1);
+	root_el->l_recs[0].e_cpos = 0;
+	root_el->l_recs[0].e_blkno = eb->h_blkno;
+	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+	root_el->l_next_free_rec = cpu_to_le16(1);
+
+	/* If this is our 1st tree depth shift, then last_eb_blk
+	 * becomes the allocated extent block */
+	if (root_el->l_tree_depth == cpu_to_le16(1))
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
+
+	ocfs2_journal_dirty(handle, et->et_root_bh);
+
+	*ret_new_eb_bh = new_eb_bh;
+	new_eb_bh = NULL;
+	status = 0;
+bail:
+	brelse(new_eb_bh);
+
+	return status;
+}
+
+/*
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *lowest_eb_bh and return '0'
+ *
+ * 2) the search fails to find anything, but the root_el has room. We
+ *    pass NULL back in *lowest_eb_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the root_el is full, in
+ *    which case we return > 0
+ *
+ * return status < 0 indicates an error.
+ */
+static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
+				    struct buffer_head **target_bh)
+{
+	int status = 0, i;
+	u64 blkno;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lowest_bh = NULL;
+
+	*target_bh = NULL;
+
+	el = et->et_root_el;
+
+	while(le16_to_cpu(el->l_tree_depth) > 1) {
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu has empty "
+				    "extent list (next_free_rec == 0)",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+			status = -EIO;
+			goto bail;
+		}
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+		if (!blkno) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu has extent "
+				    "list where extent # %d has no physical "
+				    "block start",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
+			status = -EIO;
+			goto bail;
+		}
+
+		brelse(bh);
+		bh = NULL;
+
+		status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+
+		if (le16_to_cpu(el->l_next_free_rec) <
+		    le16_to_cpu(el->l_count)) {
+			brelse(lowest_bh);
+			lowest_bh = bh;
+			get_bh(lowest_bh);
+		}
+	}
+
+	/* If we didn't find one and the fe doesn't have any room,
+	 * then return '1' */
+	el = et->et_root_el;
+	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
+		status = 1;
+
+	*target_bh = lowest_bh;
+bail:
+	brelse(bh);
+
+	return status;
+}
+
+/*
+ * Grow a b-tree so that it has more records.
+ *
+ * We might shift the tree depth in which case existing paths should
+ * be considered invalid.
+ *
+ * Tree depth after the grow is returned via *final_depth.
+ *
+ * *last_eb_bh will be updated by ocfs2_add_branch().
+ */
+static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+			   int *final_depth, struct buffer_head **last_eb_bh,
+			   struct ocfs2_alloc_context *meta_ac)
+{
+	int ret, shift;
+	struct ocfs2_extent_list *el = et->et_root_el;
+	int depth = le16_to_cpu(el->l_tree_depth);
+	struct buffer_head *bh = NULL;
+
+	BUG_ON(meta_ac == NULL);
+
+	shift = ocfs2_find_branch_target(et, &bh);
+	if (shift < 0) {
+		ret = shift;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* We traveled all the way to the bottom of the allocation tree
+	 * and didn't find room for any more extents - we need to add
+	 * another tree level */
+	if (shift) {
+		BUG_ON(bh);
+		trace_ocfs2_grow_tree(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			depth);
+
+		/* ocfs2_shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		depth++;
+		if (depth == 1) {
+			/*
+			 * Special case: we have room now if we shifted from
+			 * tree_depth 0, so no more work needs to be done.
+			 *
+			 * We won't be calling add_branch, so pass
+			 * back *last_eb_bh as the new leaf. At depth
+			 * zero, it should always be null so there's
+			 * no reason to brelse.
+			 */
+			BUG_ON(*last_eb_bh);
+			get_bh(bh);
+			*last_eb_bh = bh;
+			goto out;
+		}
+	}
+
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
+			       meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (final_depth)
+		*final_depth = depth;
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * This function will discard the rightmost extent record.
+ */
+static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
+{
+	int next_free = le16_to_cpu(el->l_next_free_rec);
+	int count = le16_to_cpu(el->l_count);
+	unsigned int num_bytes;
+
+	BUG_ON(!next_free);
+	/* This will cause us to go off the end of our extent list. */
+	BUG_ON(next_free >= count);
+
+	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
+
+	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
+}
+
+static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
+			      struct ocfs2_extent_rec *insert_rec)
+{
+	int i, insert_index, next_free, has_empty, num_bytes;
+	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
+	struct ocfs2_extent_rec *rec;
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+	BUG_ON(!next_free);
+
+	/* The tree code before us didn't allow enough room in the leaf. */
+	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
+
+	/*
+	 * The easiest way to approach this is to just remove the
+	 * empty extent and temporarily decrement next_free.
+	 */
+	if (has_empty) {
+		/*
+		 * If next_free was 1 (only an empty extent), this
+		 * loop won't execute, which is fine. We still want
+		 * the decrement above to happen.
+		 */
+		for(i = 0; i < (next_free - 1); i++)
+			el->l_recs[i] = el->l_recs[i+1];
+
+		next_free--;
+	}
+
+	/*
+	 * Figure out what the new record index should be.
+	 */
+	for(i = 0; i < next_free; i++) {
+		rec = &el->l_recs[i];
+
+		if (insert_cpos < le32_to_cpu(rec->e_cpos))
+			break;
+	}
+	insert_index = i;
+
+	trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
+				has_empty, next_free,
+				le16_to_cpu(el->l_count));
+
+	BUG_ON(insert_index < 0);
+	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
+	BUG_ON(insert_index > next_free);
+
+	/*
+	 * No need to memmove if we're just adding to the tail.
+	 */
+	if (insert_index != next_free) {
+		BUG_ON(next_free >= le16_to_cpu(el->l_count));
+
+		num_bytes = next_free - insert_index;
+		num_bytes *= sizeof(struct ocfs2_extent_rec);
+		memmove(&el->l_recs[insert_index + 1],
+			&el->l_recs[insert_index],
+			num_bytes);
+	}
+
+	/*
+	 * Either we had an empty extent, and need to re-increment or
+	 * there was no empty extent on a non full rightmost leaf node,
+	 * in which case we still need to increment.
+	 */
+	next_free++;
+	el->l_next_free_rec = cpu_to_le16(next_free);
+	/*
+	 * Make sure none of the math above just messed up our tree.
+	 */
+	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
+
+	el->l_recs[insert_index] = *insert_rec;
+
+}
+
+static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
+{
+	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
+
+	BUG_ON(num_recs == 0);
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		num_recs--;
+		size = num_recs * sizeof(struct ocfs2_extent_rec);
+		memmove(&el->l_recs[0], &el->l_recs[1], size);
+		memset(&el->l_recs[num_recs], 0,
+		       sizeof(struct ocfs2_extent_rec));
+		el->l_next_free_rec = cpu_to_le16(num_recs);
+	}
+}
+
+/*
+ * Create an empty extent record .
+ *
+ * l_next_free_rec may be updated.
+ *
+ * If an empty extent already exists do nothing.
+ */
+static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
+{
+	int next_free = le16_to_cpu(el->l_next_free_rec);
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	if (next_free == 0)
+		goto set_and_inc;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0]))
+		return;
+
+	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
+			"Asked to create an empty extent in a full list:\n"
+			"count = %u, tree depth = %u",
+			le16_to_cpu(el->l_count),
+			le16_to_cpu(el->l_tree_depth));
+
+	ocfs2_shift_records_right(el);
+
+set_and_inc:
+	le16_add_cpu(&el->l_next_free_rec, 1);
+	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+}
+
+/*
+ * For a rotation which involves two leaf nodes, the "root node" is
+ * the lowest level tree node which contains a path to both leafs. This
+ * resulting set of information can be used to form a complete "subtree"
+ *
+ * This function is passed two full paths from the dinode down to a
+ * pair of adjacent leaves. It's task is to figure out which path
+ * index contains the subtree root - this can be the root index itself
+ * in a worst-case rotation.
+ *
+ * The array index of the subtree root is passed back.
+ */
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+			    struct ocfs2_path *left,
+			    struct ocfs2_path *right)
+{
+	int i = 0;
+
+	/*
+	 * Check that the caller passed in two paths from the same tree.
+	 */
+	BUG_ON(path_root_bh(left) != path_root_bh(right));
+
+	do {
+		i++;
+
+		/*
+		 * The caller didn't pass two adjacent paths.
+		 */
+		mlog_bug_on_msg(i > left->p_tree_depth,
+				"Owner %llu, left depth %u, right depth %u\n"
+				"left leaf blk %llu, right leaf blk %llu\n",
+				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				left->p_tree_depth, right->p_tree_depth,
+				(unsigned long long)path_leaf_bh(left)->b_blocknr,
+				(unsigned long long)path_leaf_bh(right)->b_blocknr);
+	} while (left->p_node[i].bh->b_blocknr ==
+		 right->p_node[i].bh->b_blocknr);
+
+	return i - 1;
+}
+
+typedef void (path_insert_t)(void *, struct buffer_head *);
+
+/*
+ * Traverse a btree path in search of cpos, starting at root_el.
+ *
+ * This code can be called with a cpos larger than the tree, in which
+ * case it will return the rightmost path.
+ */
+static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
+			     struct ocfs2_extent_list *root_el, u32 cpos,
+			     path_insert_t *func, void *data)
+{
+	int i, ret = 0;
+	u32 range;
+	u64 blkno;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	el = root_el;
+	while (el->l_tree_depth) {
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+				    "Owner %llu has empty extent list at "
+				    "depth %u\n",
+				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
+				    le16_to_cpu(el->l_tree_depth));
+			ret = -EROFS;
+			goto out;
+
+		}
+
+		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
+			rec = &el->l_recs[i];
+
+			/*
+			 * In the case that cpos is off the allocation
+			 * tree, this should just wind up returning the
+			 * rightmost record.
+			 */
+			range = le32_to_cpu(rec->e_cpos) +
+				ocfs2_rec_clusters(el, rec);
+			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+			    break;
+		}
+
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+		if (blkno == 0) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+				    "Owner %llu has bad blkno in extent list "
+				    "at depth %u (index %d)\n",
+				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
+				    le16_to_cpu(el->l_tree_depth), i);
+			ret = -EROFS;
+			goto out;
+		}
+
+		brelse(bh);
+		bh = NULL;
+		ret = ocfs2_read_extent_block(ci, blkno, &bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+
+		if (le16_to_cpu(el->l_next_free_rec) >
+		    le16_to_cpu(el->l_count)) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+				    "Owner %llu has bad count in extent list "
+				    "at block %llu (next free=%u, count=%u)\n",
+				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
+				    (unsigned long long)bh->b_blocknr,
+				    le16_to_cpu(el->l_next_free_rec),
+				    le16_to_cpu(el->l_count));
+			ret = -EROFS;
+			goto out;
+		}
+
+		if (func)
+			func(data, bh);
+	}
+
+out:
+	/*
+	 * Catch any trailing bh that the loop didn't handle.
+	 */
+	brelse(bh);
+
+	return ret;
+}
+
+/*
+ * Given an initialized path (that is, it has a valid root extent
+ * list), this function will traverse the btree in search of the path
+ * which would contain cpos.
+ *
+ * The path traveled is recorded in the path structure.
+ *
+ * Note that this will not do any comparisons on leaf node extent
+ * records, so it will work fine in the case that we just added a tree
+ * branch.
+ */
+struct find_path_data {
+	int index;
+	struct ocfs2_path *path;
+};
+static void find_path_ins(void *data, struct buffer_head *bh)
+{
+	struct find_path_data *fp = data;
+
+	get_bh(bh);
+	ocfs2_path_insert_eb(fp->path, fp->index, bh);
+	fp->index++;
+}
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+		    struct ocfs2_path *path, u32 cpos)
+{
+	struct find_path_data data;
+
+	data.index = 1;
+	data.path = path;
+	return __ocfs2_find_path(ci, path_root_el(path), cpos,
+				 find_path_ins, &data);
+}
+
+static void find_leaf_ins(void *data, struct buffer_head *bh)
+{
+	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
+	struct ocfs2_extent_list *el = &eb->h_list;
+	struct buffer_head **ret = data;
+
+	/* We want to retain only the leaf block. */
+	if (le16_to_cpu(el->l_tree_depth) == 0) {
+		get_bh(bh);
+		*ret = bh;
+	}
+}
+/*
+ * Find the leaf block in the tree which would contain cpos. No
+ * checking of the actual leaf is done.
+ *
+ * Some paths want to call this instead of allocating a path structure
+ * and calling ocfs2_find_path().
+ *
+ * This function doesn't handle non btree extent lists.
+ */
+int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
+		    struct ocfs2_extent_list *root_el, u32 cpos,
+		    struct buffer_head **leaf_bh)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+
+	ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*leaf_bh = bh;
+out:
+	return ret;
+}
+
+/*
+ * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
+ *
+ * Basically, we've moved stuff around at the bottom of the tree and
+ * we need to fix up the extent records above the changes to reflect
+ * the new changes.
+ *
+ * left_rec: the record on the left.
+ * left_child_el: is the child list pointed to by left_rec
+ * right_rec: the record to the right of left_rec
+ * right_child_el: is the child list pointed to by right_rec
+ *
+ * By definition, this only works on interior nodes.
+ */
+static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
+				  struct ocfs2_extent_list *left_child_el,
+				  struct ocfs2_extent_rec *right_rec,
+				  struct ocfs2_extent_list *right_child_el)
+{
+	u32 left_clusters, right_end;
+
+	/*
+	 * Interior nodes never have holes. Their cpos is the cpos of
+	 * the leftmost record in their child list. Their cluster
+	 * count covers the full theoretical range of their child list
+	 * - the range between their cpos and the cpos of the record
+	 * immediately to their right.
+	 */
+	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+		BUG_ON(right_child_el->l_tree_depth);
+		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
+		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
+	}
+	left_clusters -= le32_to_cpu(left_rec->e_cpos);
+	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
+
+	/*
+	 * Calculate the rightmost cluster count boundary before
+	 * moving cpos - we will need to adjust clusters after
+	 * updating e_cpos to keep the same highest cluster count.
+	 */
+	right_end = le32_to_cpu(right_rec->e_cpos);
+	right_end += le32_to_cpu(right_rec->e_int_clusters);
+
+	right_rec->e_cpos = left_rec->e_cpos;
+	le32_add_cpu(&right_rec->e_cpos, left_clusters);
+
+	right_end -= le32_to_cpu(right_rec->e_cpos);
+	right_rec->e_int_clusters = cpu_to_le32(right_end);
+}
+
+/*
+ * Adjust the adjacent root node records involved in a
+ * rotation. left_el_blkno is passed in as a key so that we can easily
+ * find it's index in the root list.
+ */
+static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
+				      struct ocfs2_extent_list *left_el,
+				      struct ocfs2_extent_list *right_el,
+				      u64 left_el_blkno)
+{
+	int i;
+
+	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
+	       le16_to_cpu(left_el->l_tree_depth));
+
+	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
+		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
+			break;
+	}
+
+	/*
+	 * The path walking code should have never returned a root and
+	 * two paths which are not adjacent.
+	 */
+	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
+
+	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+				      &root_el->l_recs[i + 1], right_el);
+}
+
+/*
+ * We've changed a leaf block (in right_path) and need to reflect that
+ * change back up the subtree.
+ *
+ * This happens in multiple places:
+ *   - When we've moved an extent record from the left path leaf to the right
+ *     path leaf to make room for an empty extent in the left path leaf.
+ *   - When our insert into the right path leaf is at the leftmost edge
+ *     and requires an update of the path immediately to it's left. This
+ *     can occur at the end of some types of rotation and appending inserts.
+ *   - When we've adjusted the last extent record in the left path leaf and the
+ *     1st extent record in the right path leaf during cross extent block merge.
+ */
+static void ocfs2_complete_edge_insert(handle_t *handle,
+				       struct ocfs2_path *left_path,
+				       struct ocfs2_path *right_path,
+				       int subtree_index)
+{
+	int i, idx;
+	struct ocfs2_extent_list *el, *left_el, *right_el;
+	struct ocfs2_extent_rec *left_rec, *right_rec;
+	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+
+	/*
+	 * Update the counts and position values within all the
+	 * interior nodes to reflect the leaf rotation we just did.
+	 *
+	 * The root node is handled below the loop.
+	 *
+	 * We begin the loop with right_el and left_el pointing to the
+	 * leaf lists and work our way up.
+	 *
+	 * NOTE: within this loop, left_el and right_el always refer
+	 * to the *child* lists.
+	 */
+	left_el = path_leaf_el(left_path);
+	right_el = path_leaf_el(right_path);
+	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
+		trace_ocfs2_complete_edge_insert(i);
+
+		/*
+		 * One nice property of knowing that all of these
+		 * nodes are below the root is that we only deal with
+		 * the leftmost right node record and the rightmost
+		 * left node record.
+		 */
+		el = left_path->p_node[i].el;
+		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
+		left_rec = &el->l_recs[idx];
+
+		el = right_path->p_node[i].el;
+		right_rec = &el->l_recs[0];
+
+		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
+					      right_el);
+
+		ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+		ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
+
+		/*
+		 * Setup our list pointers now so that the current
+		 * parents become children in the next iteration.
+		 */
+		left_el = left_path->p_node[i].el;
+		right_el = right_path->p_node[i].el;
+	}
+
+	/*
+	 * At the root node, adjust the two adjacent records which
+	 * begin our path to the leaves.
+	 */
+
+	el = left_path->p_node[subtree_index].el;
+	left_el = left_path->p_node[subtree_index + 1].el;
+	right_el = right_path->p_node[subtree_index + 1].el;
+
+	ocfs2_adjust_root_records(el, left_el, right_el,
+				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
+
+	root_bh = left_path->p_node[subtree_index].bh;
+
+	ocfs2_journal_dirty(handle, root_bh);
+}
+
+static int ocfs2_rotate_subtree_right(handle_t *handle,
+				      struct ocfs2_extent_tree *et,
+				      struct ocfs2_path *left_path,
+				      struct ocfs2_path *right_path,
+				      int subtree_index)
+{
+	int ret, i;
+	struct buffer_head *right_leaf_bh;
+	struct buffer_head *left_leaf_bh = NULL;
+	struct buffer_head *root_bh;
+	struct ocfs2_extent_list *right_el, *left_el;
+	struct ocfs2_extent_rec move_rec;
+
+	left_leaf_bh = path_leaf_bh(left_path);
+	left_el = path_leaf_el(left_path);
+
+	if (left_el->l_next_free_rec != left_el->l_count) {
+		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+			    "Inode %llu has non-full interior leaf node %llu"
+			    "(next free = %u)",
+			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+			    (unsigned long long)left_leaf_bh->b_blocknr,
+			    le16_to_cpu(left_el->l_next_free_rec));
+		return -EROFS;
+	}
+
+	/*
+	 * This extent block may already have an empty record, so we
+	 * return early if so.
+	 */
+	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
+		return 0;
+
+	root_bh = left_path->p_node[subtree_index].bh;
+	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+					   subtree_index);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+						   right_path, i);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+						   left_path, i);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	right_leaf_bh = path_leaf_bh(right_path);
+	right_el = path_leaf_el(right_path);
+
+	/* This is a code error, not a disk corruption. */
+	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
+			"because rightmost leaf block %llu is empty\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+			(unsigned long long)right_leaf_bh->b_blocknr);
+
+	ocfs2_create_empty_extent(right_el);
+
+	ocfs2_journal_dirty(handle, right_leaf_bh);
+
+	/* Do the copy now. */
+	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
+	move_rec = left_el->l_recs[i];
+	right_el->l_recs[0] = move_rec;
+
+	/*
+	 * Clear out the record we just copied and shift everything
+	 * over, leaving an empty extent in the left leaf.
+	 *
+	 * We temporarily subtract from next_free_rec so that the
+	 * shift will lose the tail record (which is now defunct).
+	 */
+	le16_add_cpu(&left_el->l_next_free_rec, -1);
+	ocfs2_shift_records_right(left_el);
+	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+	le16_add_cpu(&left_el->l_next_free_rec, 1);
+
+	ocfs2_journal_dirty(handle, left_leaf_bh);
+
+	ocfs2_complete_edge_insert(handle, left_path, right_path,
+				   subtree_index);
+
+out:
+	return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the left of the current one.
+ *
+ * Will return zero if the path passed in is already the leftmost path.
+ */
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+				  struct ocfs2_path *path, u32 *cpos)
+{
+	int i, j, ret = 0;
+	u64 blkno;
+	struct ocfs2_extent_list *el;
+
+	BUG_ON(path->p_tree_depth == 0);
+
+	*cpos = 0;
+
+	blkno = path_leaf_bh(path)->b_blocknr;
+
+	/* Start at the tree node just above the leaf and work our way up. */
+	i = path->p_tree_depth - 1;
+	while (i >= 0) {
+		el = path->p_node[i].el;
+
+		/*
+		 * Find the extent record just before the one in our
+		 * path.
+		 */
+		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+				if (j == 0) {
+					if (i == 0) {
+						/*
+						 * We've determined that the
+						 * path specified is already
+						 * the leftmost one - return a
+						 * cpos of zero.
+						 */
+						goto out;
+					}
+					/*
+					 * The leftmost record points to our
+					 * leaf - we need to travel up the
+					 * tree one level.
+					 */
+					goto next_node;
+				}
+
+				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
+				*cpos = *cpos + ocfs2_rec_clusters(el,
+							   &el->l_recs[j - 1]);
+				*cpos = *cpos - 1;
+				goto out;
+			}
+		}
+
+		/*
+		 * If we got here, we never found a valid node where
+		 * the tree indicated one should be.
+		 */
+		ocfs2_error(sb,
+			    "Invalid extent tree at extent block %llu\n",
+			    (unsigned long long)blkno);
+		ret = -EROFS;
+		goto out;
+
+next_node:
+		blkno = path->p_node[i].bh->b_blocknr;
+		i--;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Extend the transaction by enough credits to complete the rotation,
+ * and still leave at least the original number of credits allocated
+ * to this transaction.
+ */
+static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+					   int op_credits,
+					   struct ocfs2_path *path)
+{
+	int ret = 0;
+	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
+
+	if (handle->h_buffer_credits < credits)
+		ret = ocfs2_extend_trans(handle,
+					 credits - handle->h_buffer_credits);
+
+	return ret;
+}
+
+/*
+ * Trap the case where we're inserting into the theoretical range past
+ * the _actual_ left leaf range. Otherwise, we'll rotate a record
+ * whose cpos is less than ours into the right leaf.
+ *
+ * It's only necessary to look at the rightmost record of the left
+ * leaf because the logic that calls us should ensure that the
+ * theoretical ranges in the path components above the leaves are
+ * correct.
+ */
+static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
+						 u32 insert_cpos)
+{
+	struct ocfs2_extent_list *left_el;
+	struct ocfs2_extent_rec *rec;
+	int next_free;
+
+	left_el = path_leaf_el(left_path);
+	next_free = le16_to_cpu(left_el->l_next_free_rec);
+	rec = &left_el->l_recs[next_free - 1];
+
+	if (insert_cpos > le32_to_cpu(rec->e_cpos))
+		return 1;
+	return 0;
+}
+
+static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
+{
+	int next_free = le16_to_cpu(el->l_next_free_rec);
+	unsigned int range;
+	struct ocfs2_extent_rec *rec;
+
+	if (next_free == 0)
+		return 0;
+
+	rec = &el->l_recs[0];
+	if (ocfs2_is_empty_extent(rec)) {
+		/* Empty list. */
+		if (next_free == 1)
+			return 0;
+		rec = &el->l_recs[1];
+	}
+
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+		return 1;
+	return 0;
+}
+
+/*
+ * Rotate all the records in a btree right one record, starting at insert_cpos.
+ *
+ * The path to the rightmost leaf should be passed in.
+ *
+ * The array is assumed to be large enough to hold an entire path (tree depth).
+ *
+ * Upon successful return from this function:
+ *
+ * - The 'right_path' array will contain a path to the leaf block
+ *   whose range contains e_cpos.
+ * - That leaf block will have a single empty extent in list index 0.
+ * - In the case that the rotation requires a post-insert update,
+ *   *ret_left_path will contain a valid path which can be passed to
+ *   ocfs2_insert_path().
+ */
+static int ocfs2_rotate_tree_right(handle_t *handle,
+				   struct ocfs2_extent_tree *et,
+				   enum ocfs2_split_type split,
+				   u32 insert_cpos,
+				   struct ocfs2_path *right_path,
+				   struct ocfs2_path **ret_left_path)
+{
+	int ret, start, orig_credits = handle->h_buffer_credits;
+	u32 cpos;
+	struct ocfs2_path *left_path = NULL;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+
+	*ret_left_path = NULL;
+
+	left_path = ocfs2_new_path_from_path(right_path);
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_rotate_tree_right(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		insert_cpos, cpos);
+
+	/*
+	 * What we want to do here is:
+	 *
+	 * 1) Start with the rightmost path.
+	 *
+	 * 2) Determine a path to the leaf block directly to the left
+	 *    of that leaf.
+	 *
+	 * 3) Determine the 'subtree root' - the lowest level tree node
+	 *    which contains a path to both leaves.
+	 *
+	 * 4) Rotate the subtree.
+	 *
+	 * 5) Find the next subtree by considering the left path to be
+	 *    the new right path.
+	 *
+	 * The check at the top of this while loop also accepts
+	 * insert_cpos == cpos because cpos is only a _theoretical_
+	 * value to get us the left path - insert_cpos might very well
+	 * be filling that hole.
+	 *
+	 * Stop at a cpos of '0' because we either started at the
+	 * leftmost branch (i.e., a tree with one branch and a
+	 * rotation inside of it), or we've gone as far as we can in
+	 * rotating subtrees.
+	 */
+	while (cpos && insert_cpos <= cpos) {
+		trace_ocfs2_rotate_tree_right(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			insert_cpos, cpos);
+
+		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		mlog_bug_on_msg(path_leaf_bh(left_path) ==
+				path_leaf_bh(right_path),
+				"Owner %llu: error during insert of %u "
+				"(left path cpos %u) results in two identical "
+				"paths ending at %llu\n",
+				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				insert_cpos, cpos,
+				(unsigned long long)
+				path_leaf_bh(left_path)->b_blocknr);
+
+		if (split == SPLIT_NONE &&
+		    ocfs2_rotate_requires_path_adjustment(left_path,
+							  insert_cpos)) {
+
+			/*
+			 * We've rotated the tree as much as we
+			 * should. The rest is up to
+			 * ocfs2_insert_path() to complete, after the
+			 * record insertion. We indicate this
+			 * situation by returning the left path.
+			 *
+			 * The reason we don't adjust the records here
+			 * before the record insert is that an error
+			 * later might break the rule where a parent
+			 * record e_cpos will reflect the actual
+			 * e_cpos of the 1st nonempty record of the
+			 * child list.
+			 */
+			*ret_left_path = left_path;
+			goto out_ret_path;
+		}
+
+		start = ocfs2_find_subtree_root(et, left_path, right_path);
+
+		trace_ocfs2_rotate_subtree(start,
+			(unsigned long long)
+			right_path->p_node[start].bh->b_blocknr,
+			right_path->p_tree_depth);
+
+		ret = ocfs2_extend_rotate_transaction(handle, start,
+						      orig_credits, right_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rotate_subtree_right(handle, et, left_path,
+						 right_path, start);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (split != SPLIT_NONE &&
+		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
+						insert_cpos)) {
+			/*
+			 * A rotate moves the rightmost left leaf
+			 * record over to the leftmost right leaf
+			 * slot. If we're doing an extent split
+			 * instead of a real insert, then we have to
+			 * check that the extent to be split wasn't
+			 * just moved over. If it was, then we can
+			 * exit here, passing left_path back -
+			 * ocfs2_split_extent() is smart enough to
+			 * search both leaves.
+			 */
+			*ret_left_path = left_path;
+			goto out_ret_path;
+		}
+
+		/*
+		 * There is no need to re-read the next right path
+		 * as we know that it'll be our current left
+		 * path. Optimize by copying values instead.
+		 */
+		ocfs2_mv_path(right_path, left_path);
+
+		ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	ocfs2_free_path(left_path);
+
+out_ret_path:
+	return ret;
+}
+
+static int ocfs2_update_edge_lengths(handle_t *handle,
+				     struct ocfs2_extent_tree *et,
+				     int subtree_index, struct ocfs2_path *path)
+{
+	int i, idx, ret;
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_block *eb;
+	u32 range;
+
+	/*
+	 * In normal tree rotation process, we will never touch the
+	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+	 * doesn't reserve the credits for them either.
+	 *
+	 * But we do have a special case here which will update the rightmost
+	 * records for all the bh in the path.
+	 * So we have to allocate extra credits and access them.
+	 */
+	ret = ocfs2_extend_trans(handle, subtree_index);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Path should always be rightmost. */
+	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+	BUG_ON(eb->h_next_leaf_blk != 0ULL);
+
+	el = &eb->h_list;
+	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+	idx = le16_to_cpu(el->l_next_free_rec) - 1;
+	rec = &el->l_recs[idx];
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	for (i = 0; i < path->p_tree_depth; i++) {
+		el = path->p_node[i].el;
+		idx = le16_to_cpu(el->l_next_free_rec) - 1;
+		rec = &el->l_recs[idx];
+
+		rec->e_int_clusters = cpu_to_le32(range);
+		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
+
+		ocfs2_journal_dirty(handle, path->p_node[i].bh);
+	}
+out:
+	return ret;
+}
+
+static void ocfs2_unlink_path(handle_t *handle,
+			      struct ocfs2_extent_tree *et,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc,
+			      struct ocfs2_path *path, int unlink_start)
+{
+	int ret, i;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *bh;
+
+	for(i = unlink_start; i < path_num_items(path); i++) {
+		bh = path->p_node[i].bh;
+
+		eb = (struct ocfs2_extent_block *)bh->b_data;
+		/*
+		 * Not all nodes might have had their final count
+		 * decremented by the caller - handle this here.
+		 */
+		el = &eb->h_list;
+		if (le16_to_cpu(el->l_next_free_rec) > 1) {
+			mlog(ML_ERROR,
+			     "Inode %llu, attempted to remove extent block "
+			     "%llu with %u records\n",
+			     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+			     (unsigned long long)le64_to_cpu(eb->h_blkno),
+			     le16_to_cpu(el->l_next_free_rec));
+
+			ocfs2_journal_dirty(handle, bh);
+			ocfs2_remove_from_cache(et->et_ci, bh);
+			continue;
+		}
+
+		el->l_next_free_rec = 0;
+		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+		ocfs2_journal_dirty(handle, bh);
+
+		ret = ocfs2_cache_extent_block_free(dealloc, eb);
+		if (ret)
+			mlog_errno(ret);
+
+		ocfs2_remove_from_cache(et->et_ci, bh);
+	}
+}
+
+static void ocfs2_unlink_subtree(handle_t *handle,
+				 struct ocfs2_extent_tree *et,
+				 struct ocfs2_path *left_path,
+				 struct ocfs2_path *right_path,
+				 int subtree_index,
+				 struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int i;
+	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_block *eb;
+
+	el = path_leaf_el(left_path);
+
+	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
+
+	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
+			break;
+
+	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
+
+	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+	le16_add_cpu(&root_el->l_next_free_rec, -1);
+
+	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+	eb->h_next_leaf_blk = 0;
+
+	ocfs2_journal_dirty(handle, root_bh);
+	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+
+	ocfs2_unlink_path(handle, et, dealloc, right_path,
+			  subtree_index + 1);
+}
+
+static int ocfs2_rotate_subtree_left(handle_t *handle,
+				     struct ocfs2_extent_tree *et,
+				     struct ocfs2_path *left_path,
+				     struct ocfs2_path *right_path,
+				     int subtree_index,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc,
+				     int *deleted)
+{
+	int ret, i, del_right_subtree = 0, right_has_empty = 0;
+	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
+	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
+	struct ocfs2_extent_block *eb;
+
+	*deleted = 0;
+
+	right_leaf_el = path_leaf_el(right_path);
+	left_leaf_el = path_leaf_el(left_path);
+	root_bh = left_path->p_node[subtree_index].bh;
+	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
+		return 0;
+
+	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
+	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
+		/*
+		 * It's legal for us to proceed if the right leaf is
+		 * the rightmost one and it has an empty extent. There
+		 * are two cases to handle - whether the leaf will be
+		 * empty after removal or not. If the leaf isn't empty
+		 * then just remove the empty extent up front. The
+		 * next block will handle empty leaves by flagging
+		 * them for unlink.
+		 *
+		 * Non rightmost leaves will throw -EAGAIN and the
+		 * caller can manually move the subtree and retry.
+		 */
+
+		if (eb->h_next_leaf_blk != 0ULL)
+			return -EAGAIN;
+
+		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
+			ret = ocfs2_journal_access_eb(handle, et->et_ci,
+						      path_leaf_bh(right_path),
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ocfs2_remove_empty_extent(right_leaf_el);
+		} else
+			right_has_empty = 1;
+	}
+
+	if (eb->h_next_leaf_blk == 0ULL &&
+	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
+		/*
+		 * We have to update i_last_eb_blk during the meta
+		 * data delete.
+		 */
+		ret = ocfs2_et_root_journal_access(handle, et,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		del_right_subtree = 1;
+	}
+
+	/*
+	 * Getting here with an empty extent in the right path implies
+	 * that it's the rightmost path and will be deleted.
+	 */
+	BUG_ON(right_has_empty && !del_right_subtree);
+
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+					   subtree_index);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+						   right_path, i);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+						   left_path, i);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!right_has_empty) {
+		/*
+		 * Only do this if we're moving a real
+		 * record. Otherwise, the action is delayed until
+		 * after removal of the right path in which case we
+		 * can do a simple shift to remove the empty extent.
+		 */
+		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
+		memset(&right_leaf_el->l_recs[0], 0,
+		       sizeof(struct ocfs2_extent_rec));
+	}
+	if (eb->h_next_leaf_blk == 0ULL) {
+		/*
+		 * Move recs over to get rid of empty extent, decrease
+		 * next_free. This is allowed to remove the last
+		 * extent in our leaf (setting l_next_free_rec to
+		 * zero) - the delete code below won't care.
+		 */
+		ocfs2_remove_empty_extent(right_leaf_el);
+	}
+
+	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+	ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+
+	if (del_right_subtree) {
+		ocfs2_unlink_subtree(handle, et, left_path, right_path,
+				     subtree_index, dealloc);
+		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
+
+		/*
+		 * Removal of the extent in the left leaf was skipped
+		 * above so we could delete the right path
+		 * 1st.
+		 */
+		if (right_has_empty)
+			ocfs2_remove_empty_extent(left_leaf_el);
+
+		ocfs2_journal_dirty(handle, et_root_bh);
+
+		*deleted = 1;
+	} else
+		ocfs2_complete_edge_insert(handle, left_path, right_path,
+					   subtree_index);
+
+out:
+	return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the right of the current one.
+ *
+ * Will return zero if the path passed in is already the rightmost path.
+ *
+ * This looks similar, but is subtly different to
+ * ocfs2_find_cpos_for_left_leaf().
+ */
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+				   struct ocfs2_path *path, u32 *cpos)
+{
+	int i, j, ret = 0;
+	u64 blkno;
+	struct ocfs2_extent_list *el;
+
+	*cpos = 0;
+
+	if (path->p_tree_depth == 0)
+		return 0;
+
+	blkno = path_leaf_bh(path)->b_blocknr;
+
+	/* Start at the tree node just above the leaf and work our way up. */
+	i = path->p_tree_depth - 1;
+	while (i >= 0) {
+		int next_free;
+
+		el = path->p_node[i].el;
+
+		/*
+		 * Find the extent record just after the one in our
+		 * path.
+		 */
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+				if (j == (next_free - 1)) {
+					if (i == 0) {
+						/*
+						 * We've determined that the
+						 * path specified is already
+						 * the rightmost one - return a
+						 * cpos of zero.
+						 */
+						goto out;
+					}
+					/*
+					 * The rightmost record points to our
+					 * leaf - we need to travel up the
+					 * tree one level.
+					 */
+					goto next_node;
+				}
+
+				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
+				goto out;
+			}
+		}
+
+		/*
+		 * If we got here, we never found a valid node where
+		 * the tree indicated one should be.
+		 */
+		ocfs2_error(sb,
+			    "Invalid extent tree at extent block %llu\n",
+			    (unsigned long long)blkno);
+		ret = -EROFS;
+		goto out;
+
+next_node:
+		blkno = path->p_node[i].bh->b_blocknr;
+		i--;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
+					    struct ocfs2_extent_tree *et,
+					    struct ocfs2_path *path)
+{
+	int ret;
+	struct buffer_head *bh = path_leaf_bh(path);
+	struct ocfs2_extent_list *el = path_leaf_el(path);
+
+	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+		return 0;
+
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
+					   path_num_items(path) - 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_empty_extent(el);
+	ocfs2_journal_dirty(handle, bh);
+
+out:
+	return ret;
+}
+
+static int __ocfs2_rotate_tree_left(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    int orig_credits,
+				    struct ocfs2_path *path,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc,
+				    struct ocfs2_path **empty_extent_path)
+{
+	int ret, subtree_root, deleted;
+	u32 right_cpos;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_path *right_path = NULL;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+
+	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+
+	*empty_extent_path = NULL;
+
+	ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	left_path = ocfs2_new_path_from_path(path);
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_cp_path(left_path, path);
+
+	right_path = ocfs2_new_path_from_path(path);
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (right_cpos) {
+		ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		subtree_root = ocfs2_find_subtree_root(et, left_path,
+						       right_path);
+
+		trace_ocfs2_rotate_subtree(subtree_root,
+		     (unsigned long long)
+		     right_path->p_node[subtree_root].bh->b_blocknr,
+		     right_path->p_tree_depth);
+
+		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+						      orig_credits, left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Caller might still want to make changes to the
+		 * tree root, so re-add it to the journal here.
+		 */
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+						   left_path, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rotate_subtree_left(handle, et, left_path,
+						right_path, subtree_root,
+						dealloc, &deleted);
+		if (ret == -EAGAIN) {
+			/*
+			 * The rotation has to temporarily stop due to
+			 * the right subtree having an empty
+			 * extent. Pass it back to the caller for a
+			 * fixup.
+			 */
+			*empty_extent_path = right_path;
+			right_path = NULL;
+			goto out;
+		}
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * The subtree rotate might have removed records on
+		 * the rightmost edge. If so, then rotation is
+		 * complete.
+		 */
+		if (deleted)
+			break;
+
+		ocfs2_mv_path(left_path, right_path);
+
+		ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
+						     &right_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	ocfs2_free_path(right_path);
+	ocfs2_free_path(left_path);
+
+	return ret;
+}
+
+static int ocfs2_remove_rightmost_path(handle_t *handle,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_path *path,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, subtree_index;
+	u32 cpos;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+
+	ret = ocfs2_et_sanity_check(et);
+	if (ret)
+		goto out;
+	/*
+	 * There's two ways we handle this depending on
+	 * whether path is the only existing one.
+	 */
+	ret = ocfs2_extend_rotate_transaction(handle, 0,
+					      handle->h_buffer_credits,
+					      path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+					    path, &cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (cpos) {
+		/*
+		 * We have a path to the left of this one - it needs
+		 * an update too.
+		 */
+		left_path = ocfs2_new_path_from_path(path);
+		if (!left_path) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
+
+		ocfs2_unlink_subtree(handle, et, left_path, path,
+				     subtree_index, dealloc);
+		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
+	} else {
+		/*
+		 * 'path' is also the leftmost path which
+		 * means it must be the only one. This gets
+		 * handled differently because we want to
+		 * revert the root back to having extents
+		 * in-line.
+		 */
+		ocfs2_unlink_path(handle, et, dealloc, path, 1);
+
+		el = et->et_root_el;
+		el->l_tree_depth = 0;
+		el->l_next_free_rec = 0;
+		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+		ocfs2_et_set_last_eb_blk(et, 0);
+	}
+
+	ocfs2_journal_dirty(handle, path_root_bh(path));
+
+out:
+	ocfs2_free_path(left_path);
+	return ret;
+}
+
+/*
+ * Left rotation of btree records.
+ *
+ * In many ways, this is (unsurprisingly) the opposite of right
+ * rotation. We start at some non-rightmost path containing an empty
+ * extent in the leaf block. The code works its way to the rightmost
+ * path by rotating records to the left in every subtree.
+ *
+ * This is used by any code which reduces the number of extent records
+ * in a leaf. After removal, an empty record should be placed in the
+ * leftmost list position.
+ *
+ * This won't handle a length update of the rightmost path records if
+ * the rightmost tree leaf record is removed so the caller is
+ * responsible for detecting and correcting that.
+ */
+static int ocfs2_rotate_tree_left(handle_t *handle,
+				  struct ocfs2_extent_tree *et,
+				  struct ocfs2_path *path,
+				  struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, orig_credits = handle->h_buffer_credits;
+	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	el = path_leaf_el(path);
+	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+		return 0;
+
+	if (path->p_tree_depth == 0) {
+rightmost_no_delete:
+		/*
+		 * Inline extents. This is trivially handled, so do
+		 * it up front.
+		 */
+		ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Handle rightmost branch now. There's several cases:
+	 *  1) simple rotation leaving records in there. That's trivial.
+	 *  2) rotation requiring a branch delete - there's no more
+	 *     records left. Two cases of this:
+	 *     a) There are branches to the left.
+	 *     b) This is also the leftmost (the only) branch.
+	 *
+	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
+	 *  2a) we need the left branch so that we can update it with the unlink
+	 *  2b) we need to bring the root back to inline extents.
+	 */
+
+	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+	el = &eb->h_list;
+	if (eb->h_next_leaf_blk == 0) {
+		/*
+		 * This gets a bit tricky if we're going to delete the
+		 * rightmost path. Get the other cases out of the way
+		 * 1st.
+		 */
+		if (le16_to_cpu(el->l_next_free_rec) > 1)
+			goto rightmost_no_delete;
+
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ret = -EIO;
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu has empty extent block at %llu",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    (unsigned long long)le64_to_cpu(eb->h_blkno));
+			goto out;
+		}
+
+		/*
+		 * XXX: The caller can not trust "path" any more after
+		 * this as it will have been deleted. What do we do?
+		 *
+		 * In theory the rotate-for-merge code will never get
+		 * here because it'll always ask for a rotate in a
+		 * nonempty list.
+		 */
+
+		ret = ocfs2_remove_rightmost_path(handle, et, path,
+						  dealloc);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Now we can loop, remembering the path we get from -EAGAIN
+	 * and restarting from there.
+	 */
+try_rotate:
+	ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
+				       dealloc, &restart_path);
+	if (ret && ret != -EAGAIN) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (ret == -EAGAIN) {
+		tmp_path = restart_path;
+		restart_path = NULL;
+
+		ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
+					       tmp_path, dealloc,
+					       &restart_path);
+		if (ret && ret != -EAGAIN) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ocfs2_free_path(tmp_path);
+		tmp_path = NULL;
+
+		if (ret == 0)
+			goto try_rotate;
+	}
+
+out:
+	ocfs2_free_path(tmp_path);
+	ocfs2_free_path(restart_path);
+	return ret;
+}
+
+static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
+				int index)
+{
+	struct ocfs2_extent_rec *rec = &el->l_recs[index];
+	unsigned int size;
+
+	if (rec->e_leaf_clusters == 0) {
+		/*
+		 * We consumed all of the merged-from record. An empty
+		 * extent cannot exist anywhere but the 1st array
+		 * position, so move things over if the merged-from
+		 * record doesn't occupy that position.
+		 *
+		 * This creates a new empty extent so the caller
+		 * should be smart enough to have removed any existing
+		 * ones.
+		 */
+		if (index > 0) {
+			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+			size = index * sizeof(struct ocfs2_extent_rec);
+			memmove(&el->l_recs[1], &el->l_recs[0], size);
+		}
+
+		/*
+		 * Always memset - the caller doesn't check whether it
+		 * created an empty extent, so there could be junk in
+		 * the other fields.
+		 */
+		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+	}
+}
+
+static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
+				struct ocfs2_path *left_path,
+				struct ocfs2_path **ret_right_path)
+{
+	int ret;
+	u32 right_cpos;
+	struct ocfs2_path *right_path = NULL;
+	struct ocfs2_extent_list *left_el;
+
+	*ret_right_path = NULL;
+
+	/* This function shouldn't be called for non-trees. */
+	BUG_ON(left_path->p_tree_depth == 0);
+
+	left_el = path_leaf_el(left_path);
+	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
+
+	ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+					     left_path, &right_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* This function shouldn't be called for the rightmost leaf. */
+	BUG_ON(right_cpos == 0);
+
+	right_path = ocfs2_new_path_from_path(left_path);
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_right_path = right_path;
+out:
+	if (ret)
+		ocfs2_free_path(right_path);
+	return ret;
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the beginning of the record "next" to it.
+ * For index < l_count - 1, the next means the extent rec at index + 1.
+ * For index == l_count - 1, the "next" means the 1st extent rec of the
+ * next extent block.
+ */
+static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
+				 handle_t *handle,
+				 struct ocfs2_extent_tree *et,
+				 struct ocfs2_extent_rec *split_rec,
+				 int index)
+{
+	int ret, next_free, i;
+	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+	struct ocfs2_extent_rec *left_rec;
+	struct ocfs2_extent_rec *right_rec;
+	struct ocfs2_extent_list *right_el;
+	struct ocfs2_path *right_path = NULL;
+	int subtree_index = 0;
+	struct ocfs2_extent_list *el = path_leaf_el(left_path);
+	struct buffer_head *bh = path_leaf_bh(left_path);
+	struct buffer_head *root_bh = NULL;
+
+	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
+	left_rec = &el->l_recs[index];
+
+	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
+	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
+		/* we meet with a cross extent block merge. */
+		ret = ocfs2_get_right_path(et, left_path, &right_path);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		right_el = path_leaf_el(right_path);
+		next_free = le16_to_cpu(right_el->l_next_free_rec);
+		BUG_ON(next_free <= 0);
+		right_rec = &right_el->l_recs[0];
+		if (ocfs2_is_empty_extent(right_rec)) {
+			BUG_ON(next_free <= 1);
+			right_rec = &right_el->l_recs[1];
+		}
+
+		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+		       le16_to_cpu(left_rec->e_leaf_clusters) !=
+		       le32_to_cpu(right_rec->e_cpos));
+
+		subtree_index = ocfs2_find_subtree_root(et, left_path,
+							right_path);
+
+		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+						      handle->h_buffer_credits,
+						      right_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		root_bh = left_path->p_node[subtree_index].bh;
+		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+						   subtree_index);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		for (i = subtree_index + 1;
+		     i < path_num_items(right_path); i++) {
+			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+							   right_path, i);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+							   left_path, i);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+	} else {
+		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
+		right_rec = &el->l_recs[index + 1];
+	}
+
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
+					   path_num_items(left_path) - 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
+
+	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
+	le64_add_cpu(&right_rec->e_blkno,
+		     -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+					       split_clusters));
+	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
+
+	ocfs2_cleanup_merge(el, index);
+
+	ocfs2_journal_dirty(handle, bh);
+	if (right_path) {
+		ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+		ocfs2_complete_edge_insert(handle, left_path, right_path,
+					   subtree_index);
+	}
+out:
+	ocfs2_free_path(right_path);
+	return ret;
+}
+
+static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
+			       struct ocfs2_path *right_path,
+			       struct ocfs2_path **ret_left_path)
+{
+	int ret;
+	u32 left_cpos;
+	struct ocfs2_path *left_path = NULL;
+
+	*ret_left_path = NULL;
+
+	/* This function shouldn't be called for non-trees. */
+	BUG_ON(right_path->p_tree_depth == 0);
+
+	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+					    right_path, &left_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* This function shouldn't be called for the leftmost leaf. */
+	BUG_ON(left_cpos == 0);
+
+	left_path = ocfs2_new_path_from_path(right_path);
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_left_path = left_path;
+out:
+	if (ret)
+		ocfs2_free_path(left_path);
+	return ret;
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the tail of the record "before" it.
+ * For index > 0, the "before" means the extent rec at index - 1.
+ *
+ * For index == 0, the "before" means the last record of the previous
+ * extent block. And there is also a situation that we may need to
+ * remove the rightmost leaf extent block in the right_path and change
+ * the right path to indicate the new rightmost path.
+ */
+static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
+				handle_t *handle,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_extent_rec *split_rec,
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				int index)
+{
+	int ret, i, subtree_index = 0, has_empty_extent = 0;
+	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+	struct ocfs2_extent_rec *left_rec;
+	struct ocfs2_extent_rec *right_rec;
+	struct ocfs2_extent_list *el = path_leaf_el(right_path);
+	struct buffer_head *bh = path_leaf_bh(right_path);
+	struct buffer_head *root_bh = NULL;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_list *left_el;
+
+	BUG_ON(index < 0);
+
+	right_rec = &el->l_recs[index];
+	if (index == 0) {
+		/* we meet with a cross extent block merge. */
+		ret = ocfs2_get_left_path(et, right_path, &left_path);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		left_el = path_leaf_el(left_path);
+		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
+		       le16_to_cpu(left_el->l_count));
+
+		left_rec = &left_el->l_recs[
+				le16_to_cpu(left_el->l_next_free_rec) - 1];
+		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+		       le16_to_cpu(left_rec->e_leaf_clusters) !=
+		       le32_to_cpu(split_rec->e_cpos));
+
+		subtree_index = ocfs2_find_subtree_root(et, left_path,
+							right_path);
+
+		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+						      handle->h_buffer_credits,
+						      left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		root_bh = left_path->p_node[subtree_index].bh;
+		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+						   subtree_index);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		for (i = subtree_index + 1;
+		     i < path_num_items(right_path); i++) {
+			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+							   right_path, i);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+							   left_path, i);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+	} else {
+		left_rec = &el->l_recs[index - 1];
+		if (ocfs2_is_empty_extent(&el->l_recs[0]))
+			has_empty_extent = 1;
+	}
+
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+					   path_num_items(right_path) - 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (has_empty_extent && index == 1) {
+		/*
+		 * The easy case - we can just plop the record right in.
+		 */
+		*left_rec = *split_rec;
+
+		has_empty_extent = 0;
+	} else
+		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
+
+	le32_add_cpu(&right_rec->e_cpos, split_clusters);
+	le64_add_cpu(&right_rec->e_blkno,
+		     ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+					      split_clusters));
+	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
+
+	ocfs2_cleanup_merge(el, index);
+
+	ocfs2_journal_dirty(handle, bh);
+	if (left_path) {
+		ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+
+		/*
+		 * In the situation that the right_rec is empty and the extent
+		 * block is empty also,  ocfs2_complete_edge_insert can't handle
+		 * it and we need to delete the right extent block.
+		 */
+		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
+		    le16_to_cpu(el->l_next_free_rec) == 1) {
+
+			ret = ocfs2_remove_rightmost_path(handle, et,
+							  right_path,
+							  dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			/* Now the rightmost extent block has been deleted.
+			 * So we use the new rightmost path.
+			 */
+			ocfs2_mv_path(right_path, left_path);
+			left_path = NULL;
+		} else
+			ocfs2_complete_edge_insert(handle, left_path,
+						   right_path, subtree_index);
+	}
+out:
+	ocfs2_free_path(left_path);
+	return ret;
+}
+
+static int ocfs2_try_to_merge_extent(handle_t *handle,
+				     struct ocfs2_extent_tree *et,
+				     struct ocfs2_path *path,
+				     int split_index,
+				     struct ocfs2_extent_rec *split_rec,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc,
+				     struct ocfs2_merge_ctxt *ctxt)
+{
+	int ret = 0;
+	struct ocfs2_extent_list *el = path_leaf_el(path);
+	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+
+	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
+
+	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+		/*
+		 * The merge code will need to create an empty
+		 * extent to take the place of the newly
+		 * emptied slot. Remove any pre-existing empty
+		 * extents - having more than one in a leaf is
+		 * illegal.
+		 */
+		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		split_index--;
+		rec = &el->l_recs[split_index];
+	}
+
+	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
+		/*
+		 * Left-right contig implies this.
+		 */
+		BUG_ON(!ctxt->c_split_covers_rec);
+
+		/*
+		 * Since the leftright insert always covers the entire
+		 * extent, this call will delete the insert record
+		 * entirely, resulting in an empty extent record added to
+		 * the extent block.
+		 *
+		 * Since the adding of an empty extent shifts
+		 * everything back to the right, there's no need to
+		 * update split_index here.
+		 *
+		 * When the split_index is zero, we need to merge it to the
+		 * prevoius extent block. It is more efficient and easier
+		 * if we do merge_right first and merge_left later.
+		 */
+		ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
+					    split_index);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * We can only get this from logic error above.
+		 */
+		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+		/* The merge left us with an empty extent, remove it. */
+		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		rec = &el->l_recs[split_index];
+
+		/*
+		 * Note that we don't pass split_rec here on purpose -
+		 * we've merged it into the rec already.
+		 */
+		ret = ocfs2_merge_rec_left(path, handle, et, rec,
+					   dealloc, split_index);
+
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+		/*
+		 * Error from this last rotate is not critical, so
+		 * print but don't bubble it up.
+		 */
+		if (ret)
+			mlog_errno(ret);
+		ret = 0;
+	} else {
+		/*
+		 * Merge a record to the left or right.
+		 *
+		 * 'contig_type' is relative to the existing record,
+		 * so for example, if we're "right contig", it's to
+		 * the record on the left (hence the left merge).
+		 */
+		if (ctxt->c_contig_type == CONTIG_RIGHT) {
+			ret = ocfs2_merge_rec_left(path, handle, et,
+						   split_rec, dealloc,
+						   split_index);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else {
+			ret = ocfs2_merge_rec_right(path, handle,
+						    et, split_rec,
+						    split_index);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		if (ctxt->c_split_covers_rec) {
+			/*
+			 * The merge may have left an empty extent in
+			 * our leaf. Try to rotate it away.
+			 */
+			ret = ocfs2_rotate_tree_left(handle, et, path,
+						     dealloc);
+			if (ret)
+				mlog_errno(ret);
+			ret = 0;
+		}
+	}
+
+out:
+	return ret;
+}
+
+static void ocfs2_subtract_from_rec(struct super_block *sb,
+				    enum ocfs2_split_type split,
+				    struct ocfs2_extent_rec *rec,
+				    struct ocfs2_extent_rec *split_rec)
+{
+	u64 len_blocks;
+
+	len_blocks = ocfs2_clusters_to_blocks(sb,
+				le16_to_cpu(split_rec->e_leaf_clusters));
+
+	if (split == SPLIT_LEFT) {
+		/*
+		 * Region is on the left edge of the existing
+		 * record.
+		 */
+		le32_add_cpu(&rec->e_cpos,
+			     le16_to_cpu(split_rec->e_leaf_clusters));
+		le64_add_cpu(&rec->e_blkno, len_blocks);
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     -le16_to_cpu(split_rec->e_leaf_clusters));
+	} else {
+		/*
+		 * Region is on the right edge of the existing
+		 * record.
+		 */
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     -le16_to_cpu(split_rec->e_leaf_clusters));
+	}
+}
+
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
+				 struct ocfs2_extent_rec *insert_rec,
+				 struct ocfs2_extent_list *el,
+				 struct ocfs2_insert_type *insert)
+{
+	int i = insert->ins_contig_index;
+	unsigned int range;
+	struct ocfs2_extent_rec *rec;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	if (insert->ins_split != SPLIT_NONE) {
+		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
+		BUG_ON(i == -1);
+		rec = &el->l_recs[i];
+		ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+					insert->ins_split, rec,
+					insert_rec);
+		goto rotate;
+	}
+
+	/*
+	 * Contiguous insert - either left or right.
+	 */
+	if (insert->ins_contig != CONTIG_NONE) {
+		rec = &el->l_recs[i];
+		if (insert->ins_contig == CONTIG_LEFT) {
+			rec->e_blkno = insert_rec->e_blkno;
+			rec->e_cpos = insert_rec->e_cpos;
+		}
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		return;
+	}
+
+	/*
+	 * Handle insert into an empty leaf.
+	 */
+	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
+	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
+	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		el->l_recs[0] = *insert_rec;
+		el->l_next_free_rec = cpu_to_le16(1);
+		return;
+	}
+
+	/*
+	 * Appending insert.
+	 */
+	if (insert->ins_appending == APPEND_TAIL) {
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		rec = &el->l_recs[i];
+		range = le32_to_cpu(rec->e_cpos)
+			+ le16_to_cpu(rec->e_leaf_clusters);
+		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+
+		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
+				le16_to_cpu(el->l_count),
+				"owner %llu, depth %u, count %u, next free %u, "
+				"rec.cpos %u, rec.clusters %u, "
+				"insert.cpos %u, insert.clusters %u\n",
+				ocfs2_metadata_cache_owner(et->et_ci),
+				le16_to_cpu(el->l_tree_depth),
+				le16_to_cpu(el->l_count),
+				le16_to_cpu(el->l_next_free_rec),
+				le32_to_cpu(el->l_recs[i].e_cpos),
+				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
+				le32_to_cpu(insert_rec->e_cpos),
+				le16_to_cpu(insert_rec->e_leaf_clusters));
+		i++;
+		el->l_recs[i] = *insert_rec;
+		le16_add_cpu(&el->l_next_free_rec, 1);
+		return;
+	}
+
+rotate:
+	/*
+	 * Ok, we have to rotate.
+	 *
+	 * At this point, it is safe to assume that inserting into an
+	 * empty leaf and appending to a leaf have both been handled
+	 * above.
+	 *
+	 * This leaf needs to have space, either by the empty 1st
+	 * extent record, or by virtue of an l_next_rec < l_count.
+	 */
+	ocfs2_rotate_leaf(el, insert_rec);
+}
+
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+					   struct ocfs2_extent_tree *et,
+					   struct ocfs2_path *path,
+					   struct ocfs2_extent_rec *insert_rec)
+{
+	int ret, i, next_free;
+	struct buffer_head *bh;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	/*
+	 * Update everything except the leaf block.
+	 */
+	for (i = 0; i < path->p_tree_depth; i++) {
+		bh = path->p_node[i].bh;
+		el = path->p_node[i].el;
+
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (next_free == 0) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu has a bad extent list",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+			ret = -EIO;
+			return;
+		}
+
+		rec = &el->l_recs[next_free - 1];
+
+		rec->e_int_clusters = insert_rec->e_cpos;
+		le32_add_cpu(&rec->e_int_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		le32_add_cpu(&rec->e_int_clusters,
+			     -le32_to_cpu(rec->e_cpos));
+
+		ocfs2_journal_dirty(handle, bh);
+	}
+}
+
+static int ocfs2_append_rec_to_path(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    struct ocfs2_extent_rec *insert_rec,
+				    struct ocfs2_path *right_path,
+				    struct ocfs2_path **ret_left_path)
+{
+	int ret, next_free;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *left_path = NULL;
+
+	*ret_left_path = NULL;
+
+	/*
+	 * This shouldn't happen for non-trees. The extent rec cluster
+	 * count manipulation below only works for interior nodes.
+	 */
+	BUG_ON(right_path->p_tree_depth == 0);
+
+	/*
+	 * If our appending insert is at the leftmost edge of a leaf,
+	 * then we might need to update the rightmost records of the
+	 * neighboring path.
+	 */
+	el = path_leaf_el(right_path);
+	next_free = le16_to_cpu(el->l_next_free_rec);
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		u32 left_cpos;
+
+		ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+						    right_path, &left_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		trace_ocfs2_append_rec_to_path(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			le32_to_cpu(insert_rec->e_cpos),
+			left_cpos);
+
+		/*
+		 * No need to worry if the append is already in the
+		 * leftmost leaf.
+		 */
+		if (left_cpos) {
+			left_path = ocfs2_new_path_from_path(right_path);
+			if (!left_path) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_find_path(et->et_ci, left_path,
+					      left_cpos);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			/*
+			 * ocfs2_insert_path() will pass the left_path to the
+			 * journal for us.
+			 */
+		}
+	}
+
+	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
+
+	*ret_left_path = left_path;
+	ret = 0;
+out:
+	if (ret != 0)
+		ocfs2_free_path(left_path);
+
+	return ret;
+}
+
+static void ocfs2_split_record(struct ocfs2_extent_tree *et,
+			       struct ocfs2_path *left_path,
+			       struct ocfs2_path *right_path,
+			       struct ocfs2_extent_rec *split_rec,
+			       enum ocfs2_split_type split)
+{
+	int index;
+	u32 cpos = le32_to_cpu(split_rec->e_cpos);
+	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
+	struct ocfs2_extent_rec *rec, *tmprec;
+
+	right_el = path_leaf_el(right_path);
+	if (left_path)
+		left_el = path_leaf_el(left_path);
+
+	el = right_el;
+	insert_el = right_el;
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index != -1) {
+		if (index == 0 && left_path) {
+			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+
+			/*
+			 * This typically means that the record
+			 * started in the left path but moved to the
+			 * right as a result of rotation. We either
+			 * move the existing record to the left, or we
+			 * do the later insert there.
+			 *
+			 * In this case, the left path should always
+			 * exist as the rotate code will have passed
+			 * it back for a post-insert update.
+			 */
+
+			if (split == SPLIT_LEFT) {
+				/*
+				 * It's a left split. Since we know
+				 * that the rotate code gave us an
+				 * empty extent in the left path, we
+				 * can just do the insert there.
+				 */
+				insert_el = left_el;
+			} else {
+				/*
+				 * Right split - we have to move the
+				 * existing record over to the left
+				 * leaf. The insert will be into the
+				 * newly created empty extent in the
+				 * right leaf.
+				 */
+				tmprec = &right_el->l_recs[index];
+				ocfs2_rotate_leaf(left_el, tmprec);
+				el = left_el;
+
+				memset(tmprec, 0, sizeof(*tmprec));
+				index = ocfs2_search_extent_list(left_el, cpos);
+				BUG_ON(index == -1);
+			}
+		}
+	} else {
+		BUG_ON(!left_path);
+		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
+		/*
+		 * Left path is easy - we can just allow the insert to
+		 * happen.
+		 */
+		el = left_el;
+		insert_el = left_el;
+		index = ocfs2_search_extent_list(el, cpos);
+		BUG_ON(index == -1);
+	}
+
+	rec = &el->l_recs[index];
+	ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+				split, rec, split_rec);
+	ocfs2_rotate_leaf(insert_el, split_rec);
+}
+
+/*
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
+ *
+ * right_path is the path we want to do the actual insert
+ * in. left_path should only be passed in if we need to update that
+ * portion of the tree after an edge insert.
+ */
+static int ocfs2_insert_path(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
+			     struct ocfs2_path *left_path,
+			     struct ocfs2_path *right_path,
+			     struct ocfs2_extent_rec *insert_rec,
+			     struct ocfs2_insert_type *insert)
+{
+	int ret, subtree_index;
+	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
+
+	if (left_path) {
+		/*
+		 * There's a chance that left_path got passed back to
+		 * us without being accounted for in the
+		 * journal. Extend our transaction here to be sure we
+		 * can change those blocks.
+		 */
+		ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Pass both paths to the journal. The majority of inserts
+	 * will be touching all components anyway.
+	 */
+	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (insert->ins_split != SPLIT_NONE) {
+		/*
+		 * We could call ocfs2_insert_at_leaf() for some types
+		 * of splits, but it's easier to just let one separate
+		 * function sort it all out.
+		 */
+		ocfs2_split_record(et, left_path, right_path,
+				   insert_rec, insert->ins_split);
+
+		/*
+		 * Split might have modified either leaf and we don't
+		 * have a guarantee that the later edge insert will
+		 * dirty this for us.
+		 */
+		if (left_path)
+			ocfs2_journal_dirty(handle,
+					    path_leaf_bh(left_path));
+	} else
+		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
+				     insert);
+
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	if (left_path) {
+		/*
+		 * The rotate code has indicated that we need to fix
+		 * up portions of the tree after the insert.
+		 *
+		 * XXX: Should we extend the transaction here?
+		 */
+		subtree_index = ocfs2_find_subtree_root(et, left_path,
+							right_path);
+		ocfs2_complete_edge_insert(handle, left_path, right_path,
+					   subtree_index);
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_do_insert_extent(handle_t *handle,
+				  struct ocfs2_extent_tree *et,
+				  struct ocfs2_extent_rec *insert_rec,
+				  struct ocfs2_insert_type *type)
+{
+	int ret, rotate = 0;
+	u32 cpos;
+	struct ocfs2_path *right_path = NULL;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_list *el;
+
+	el = et->et_root_el;
+
+	ret = ocfs2_et_root_journal_access(handle, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (le16_to_cpu(el->l_tree_depth) == 0) {
+		ocfs2_insert_at_leaf(et, insert_rec, el, type);
+		goto out_update_clusters;
+	}
+
+	right_path = ocfs2_new_path_from_et(et);
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Determine the path to start with. Rotations need the
+	 * rightmost path, everything else can go directly to the
+	 * target leaf.
+	 */
+	cpos = le32_to_cpu(insert_rec->e_cpos);
+	if (type->ins_appending == APPEND_NONE &&
+	    type->ins_contig == CONTIG_NONE) {
+		rotate = 1;
+		cpos = UINT_MAX;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Rotations and appends need special treatment - they modify
+	 * parts of the tree's above them.
+	 *
+	 * Both might pass back a path immediate to the left of the
+	 * one being inserted to. This will be cause
+	 * ocfs2_insert_path() to modify the rightmost records of
+	 * left_path to account for an edge insert.
+	 *
+	 * XXX: When modifying this code, keep in mind that an insert
+	 * can wind up skipping both of these two special cases...
+	 */
+	if (rotate) {
+		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
+					      le32_to_cpu(insert_rec->e_cpos),
+					      right_path, &left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * ocfs2_rotate_tree_right() might have extended the
+		 * transaction without re-journaling our tree root.
+		 */
+		ret = ocfs2_et_root_journal_access(handle, et,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else if (type->ins_appending == APPEND_TAIL
+		   && type->ins_contig != CONTIG_LEFT) {
+		ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
+					       right_path, &left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_insert_path(handle, et, left_path, right_path,
+				insert_rec, type);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out_update_clusters:
+	if (type->ins_split == SPLIT_NONE)
+		ocfs2_et_update_clusters(et,
+					 le16_to_cpu(insert_rec->e_leaf_clusters));
+
+	ocfs2_journal_dirty(handle, et->et_root_bh);
+
+out:
+	ocfs2_free_path(left_path);
+	ocfs2_free_path(right_path);
+
+	return ret;
+}
+
+static enum ocfs2_contig_type
+ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+			       struct ocfs2_path *path,
+			       struct ocfs2_extent_list *el, int index,
+			       struct ocfs2_extent_rec *split_rec)
+{
+	int status;
+	enum ocfs2_contig_type ret = CONTIG_NONE;
+	u32 left_cpos, right_cpos;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_extent_list *new_el;
+	struct ocfs2_path *left_path = NULL, *right_path = NULL;
+	struct buffer_head *bh;
+	struct ocfs2_extent_block *eb;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+
+	if (index > 0) {
+		rec = &el->l_recs[index - 1];
+	} else if (path->p_tree_depth > 0) {
+		status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
+		if (status)
+			goto exit;
+
+		if (left_cpos != 0) {
+			left_path = ocfs2_new_path_from_path(path);
+			if (!left_path)
+				goto exit;
+
+			status = ocfs2_find_path(et->et_ci, left_path,
+						 left_cpos);
+			if (status)
+				goto free_left_path;
+
+			new_el = path_leaf_el(left_path);
+
+			if (le16_to_cpu(new_el->l_next_free_rec) !=
+			    le16_to_cpu(new_el->l_count)) {
+				bh = path_leaf_bh(left_path);
+				eb = (struct ocfs2_extent_block *)bh->b_data;
+				ocfs2_error(sb,
+					    "Extent block #%llu has an "
+					    "invalid l_next_free_rec of "
+					    "%d.  It should have "
+					    "matched the l_count of %d",
+					    (unsigned long long)le64_to_cpu(eb->h_blkno),
+					    le16_to_cpu(new_el->l_next_free_rec),
+					    le16_to_cpu(new_el->l_count));
+				status = -EINVAL;
+				goto free_left_path;
+			}
+			rec = &new_el->l_recs[
+				le16_to_cpu(new_el->l_next_free_rec) - 1];
+		}
+	}
+
+	/*
+	 * We're careful to check for an empty extent record here -
+	 * the merge code will know what to do if it sees one.
+	 */
+	if (rec) {
+		if (index == 1 && ocfs2_is_empty_extent(rec)) {
+			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
+				ret = CONTIG_RIGHT;
+		} else {
+			ret = ocfs2_et_extent_contig(et, rec, split_rec);
+		}
+	}
+
+	rec = NULL;
+	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+		rec = &el->l_recs[index + 1];
+	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+		 path->p_tree_depth > 0) {
+		status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
+		if (status)
+			goto free_left_path;
+
+		if (right_cpos == 0)
+			goto free_left_path;
+
+		right_path = ocfs2_new_path_from_path(path);
+		if (!right_path)
+			goto free_left_path;
+
+		status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+		if (status)
+			goto free_right_path;
+
+		new_el = path_leaf_el(right_path);
+		rec = &new_el->l_recs[0];
+		if (ocfs2_is_empty_extent(rec)) {
+			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+				bh = path_leaf_bh(right_path);
+				eb = (struct ocfs2_extent_block *)bh->b_data;
+				ocfs2_error(sb,
+					    "Extent block #%llu has an "
+					    "invalid l_next_free_rec of %d",
+					    (unsigned long long)le64_to_cpu(eb->h_blkno),
+					    le16_to_cpu(new_el->l_next_free_rec));
+				status = -EINVAL;
+				goto free_right_path;
+			}
+			rec = &new_el->l_recs[1];
+		}
+	}
+
+	if (rec) {
+		enum ocfs2_contig_type contig_type;
+
+		contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
+
+		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
+			ret = CONTIG_LEFTRIGHT;
+		else if (ret == CONTIG_NONE)
+			ret = contig_type;
+	}
+
+free_right_path:
+	ocfs2_free_path(right_path);
+free_left_path:
+	ocfs2_free_path(left_path);
+exit:
+	return ret;
+}
+
+static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
+				     struct ocfs2_insert_type *insert,
+				     struct ocfs2_extent_list *el,
+				     struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	enum ocfs2_contig_type contig_type = CONTIG_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
+						     insert_rec);
+		if (contig_type != CONTIG_NONE) {
+			insert->ins_contig_index = i;
+			break;
+		}
+	}
+	insert->ins_contig = contig_type;
+
+	if (insert->ins_contig != CONTIG_NONE) {
+		struct ocfs2_extent_rec *rec =
+				&el->l_recs[insert->ins_contig_index];
+		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+				   le16_to_cpu(insert_rec->e_leaf_clusters);
+
+		/*
+		 * Caller might want us to limit the size of extents, don't
+		 * calculate contiguousness if we might exceed that limit.
+		 */
+		if (et->et_max_leaf_clusters &&
+		    (len > et->et_max_leaf_clusters))
+			insert->ins_contig = CONTIG_NONE;
+	}
+}
+
+/*
+ * This should only be called against the righmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the root extent list for tree's with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
+ * then the logic here makes sense.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+					struct ocfs2_extent_list *el,
+					struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
+	struct ocfs2_extent_rec *rec;
+
+	insert->ins_appending = APPEND_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	if (!el->l_next_free_rec)
+		goto set_tail_append;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		/* Were all records empty? */
+		if (le16_to_cpu(el->l_next_free_rec) == 1)
+			goto set_tail_append;
+	}
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	rec = &el->l_recs[i];
+
+	if (cpos >=
+	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
+		goto set_tail_append;
+
+	return;
+
+set_tail_append:
+	insert->ins_appending = APPEND_TAIL;
+}
+
+/*
+ * Helper function called at the beginning of an insert.
+ *
+ * This computes a few things that are commonly used in the process of
+ * inserting into the btree:
+ *   - Whether the new extent is contiguous with an existing one.
+ *   - The current tree depth.
+ *   - Whether the insert is an appending one.
+ *   - The total # of free records in the tree.
+ *
+ * All of the information is stored on the ocfs2_insert_type
+ * structure.
+ */
+static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
+				    struct buffer_head **last_eb_bh,
+				    struct ocfs2_extent_rec *insert_rec,
+				    int *free_records,
+				    struct ocfs2_insert_type *insert)
+{
+	int ret;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *path = NULL;
+	struct buffer_head *bh = NULL;
+
+	insert->ins_split = SPLIT_NONE;
+
+	el = et->et_root_el;
+	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
+
+	if (el->l_tree_depth) {
+		/*
+		 * If we have tree depth, we read in the
+		 * rightmost extent block ahead of time as
+		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
+		 * may want it later.
+		 */
+		ret = ocfs2_read_extent_block(et->et_ci,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+	}
+
+	/*
+	 * Unless we have a contiguous insert, we'll need to know if
+	 * there is room left in our allocation tree for another
+	 * extent record.
+	 *
+	 * XXX: This test is simplistic, we can search for empty
+	 * extent records too.
+	 */
+	*free_records = le16_to_cpu(el->l_count) -
+		le16_to_cpu(el->l_next_free_rec);
+
+	if (!insert->ins_tree_depth) {
+		ocfs2_figure_contig_type(et, insert, el, insert_rec);
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+		return 0;
+	}
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * In the case that we're inserting past what the tree
+	 * currently accounts for, ocfs2_find_path() will return for
+	 * us the rightmost tree path. This is accounted for below in
+	 * the appending code.
+	 */
+	ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	/*
+	 * Now that we have the path, there's two things we want to determine:
+	 * 1) Contiguousness (also set contig_index if this is so)
+	 *
+	 * 2) Are we doing an append? We can trivially break this up
+         *     into two types of appends: simple record append, or a
+         *     rotate inside the tail leaf.
+	 */
+	ocfs2_figure_contig_type(et, insert, el, insert_rec);
+
+	/*
+	 * The insert code isn't quite ready to deal with all cases of
+	 * left contiguousness. Specifically, if it's an insert into
+	 * the 1st record in a leaf, it will require the adjustment of
+	 * cluster count on the last record of the path directly to it's
+	 * left. For now, just catch that case and fool the layers
+	 * above us. This works just fine for tree_depth == 0, which
+	 * is why we allow that above.
+	 */
+	if (insert->ins_contig == CONTIG_LEFT &&
+	    insert->ins_contig_index == 0)
+		insert->ins_contig = CONTIG_NONE;
+
+	/*
+	 * Ok, so we can simply compare against last_eb to figure out
+	 * whether the path doesn't exist. This will only happen in
+	 * the case that we're doing a tail append, so maybe we can
+	 * take advantage of that information somehow.
+	 */
+	if (ocfs2_et_get_last_eb_blk(et) ==
+	    path_leaf_bh(path)->b_blocknr) {
+		/*
+		 * Ok, ocfs2_find_path() returned us the rightmost
+		 * tree path. This might be an appending insert. There are
+		 * two cases:
+		 *    1) We're doing a true append at the tail:
+		 *	-This might even be off the end of the leaf
+		 *    2) We're "appending" by rotating in the tail
+		 */
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+	}
+
+out:
+	ocfs2_free_path(path);
+
+	if (ret == 0)
+		*last_eb_bh = bh;
+	else
+		brelse(bh);
+	return ret;
+}
+
+/*
+ * Insert an extent into a btree.
+ *
+ * The caller needs to update the owning btree's cluster count.
+ */
+int ocfs2_insert_extent(handle_t *handle,
+			struct ocfs2_extent_tree *et,
+			u32 cpos,
+			u64 start_blk,
+			u32 new_clusters,
+			u8 flags,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status;
+	int uninitialized_var(free_records);
+	struct buffer_head *last_eb_bh = NULL;
+	struct ocfs2_insert_type insert = {0, };
+	struct ocfs2_extent_rec rec;
+
+	trace_ocfs2_insert_extent_start(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, new_clusters);
+
+	memset(&rec, 0, sizeof(rec));
+	rec.e_cpos = cpu_to_le32(cpos);
+	rec.e_blkno = cpu_to_le64(start_blk);
+	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+	rec.e_flags = flags;
+	status = ocfs2_et_insert_check(et, &rec);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
+					  &free_records, &insert);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
+				  insert.ins_contig_index, free_records,
+				  insert.ins_tree_depth);
+
+	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
+		status = ocfs2_grow_tree(handle, et,
+					 &insert.ins_tree_depth, &last_eb_bh,
+					 meta_ac);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Finally, we can add clusters. This might rotate the tree for us. */
+	status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
+	if (status < 0)
+		mlog_errno(status);
+	else
+		ocfs2_et_extent_map_insert(et, &rec);
+
+bail:
+	brelse(last_eb_bh);
+
+	return status;
+}
+
+/*
+ * Allcate and add clusters into the extent b-tree.
+ * The new clusters(clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to the file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+				struct ocfs2_extent_tree *et,
+				u32 *logical_offset,
+				u32 clusters_to_add,
+				int mark_unwritten,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret)
+{
+	int status = 0, err = 0;
+	int need_free = 0;
+	int free_extents;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+	u8 flags = 0;
+	struct ocfs2_super *osb =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+	BUG_ON(!clusters_to_add);
+
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
+	free_extents = ocfs2_num_free_extents(osb, et);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		err = -1;
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(et->et_root_el))) {
+		err = -2;
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = __ocfs2_claim_clusters(handle, data_ac, 1,
+					clusters_to_add, &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	/* reserve our write early -- insert_extent may update the tree root */
+	status = ocfs2_et_root_journal_access(handle, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		need_free = 1;
+		goto bail;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	trace_ocfs2_add_clusters_in_btree(
+	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+	     bit_off, num_bits);
+	status = ocfs2_insert_extent(handle, et, *logical_offset, block,
+				     num_bits, flags, meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		need_free = 1;
+		goto bail;
+	}
+
+	ocfs2_journal_dirty(handle, et->et_root_bh);
+
+	clusters_to_add -= num_bits;
+	*logical_offset += num_bits;
+
+	if (clusters_to_add) {
+		err = clusters_to_add;
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+bail:
+	if (need_free) {
+		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					bit_off, num_bits);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, bit_off),
+					num_bits);
+	}
+
+leave:
+	if (reason_ret)
+		*reason_ret = reason;
+	trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
+	return status;
+}
+
+static void ocfs2_make_right_split_rec(struct super_block *sb,
+				       struct ocfs2_extent_rec *split_rec,
+				       u32 cpos,
+				       struct ocfs2_extent_rec *rec)
+{
+	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
+	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
+
+	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
+
+	split_rec->e_cpos = cpu_to_le32(cpos);
+	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
+
+	split_rec->e_blkno = rec->e_blkno;
+	le64_add_cpu(&split_rec->e_blkno,
+		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
+
+	split_rec->e_flags = rec->e_flags;
+}
+
+static int ocfs2_split_and_insert(handle_t *handle,
+				  struct ocfs2_extent_tree *et,
+				  struct ocfs2_path *path,
+				  struct buffer_head **last_eb_bh,
+				  int split_index,
+				  struct ocfs2_extent_rec *orig_split_rec,
+				  struct ocfs2_alloc_context *meta_ac)
+{
+	int ret = 0, depth;
+	unsigned int insert_range, rec_range, do_leftright = 0;
+	struct ocfs2_extent_rec tmprec;
+	struct ocfs2_extent_list *rightmost_el;
+	struct ocfs2_extent_rec rec;
+	struct ocfs2_extent_rec split_rec = *orig_split_rec;
+	struct ocfs2_insert_type insert;
+	struct ocfs2_extent_block *eb;
+
+leftright:
+	/*
+	 * Store a copy of the record on the stack - it might move
+	 * around as the tree is manipulated below.
+	 */
+	rec = path_leaf_el(path)->l_recs[split_index];
+
+	rightmost_el = et->et_root_el;
+
+	depth = le16_to_cpu(rightmost_el->l_tree_depth);
+	if (depth) {
+		BUG_ON(!(*last_eb_bh));
+		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
+		rightmost_el = &eb->h_list;
+	}
+
+	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+	    le16_to_cpu(rightmost_el->l_count)) {
+		ret = ocfs2_grow_tree(handle, et,
+				      &depth, last_eb_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+	insert.ins_appending = APPEND_NONE;
+	insert.ins_contig = CONTIG_NONE;
+	insert.ins_tree_depth = depth;
+
+	insert_range = le32_to_cpu(split_rec.e_cpos) +
+		le16_to_cpu(split_rec.e_leaf_clusters);
+	rec_range = le32_to_cpu(rec.e_cpos) +
+		le16_to_cpu(rec.e_leaf_clusters);
+
+	if (split_rec.e_cpos == rec.e_cpos) {
+		insert.ins_split = SPLIT_LEFT;
+	} else if (insert_range == rec_range) {
+		insert.ins_split = SPLIT_RIGHT;
+	} else {
+		/*
+		 * Left/right split. We fake this as a right split
+		 * first and then make a second pass as a left split.
+		 */
+		insert.ins_split = SPLIT_RIGHT;
+
+		ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+					   &tmprec, insert_range, &rec);
+
+		split_rec = tmprec;
+
+		BUG_ON(do_leftright);
+		do_leftright = 1;
+	}
+
+	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (do_leftright == 1) {
+		u32 cpos;
+		struct ocfs2_extent_list *el;
+
+		do_leftright++;
+		split_rec = *orig_split_rec;
+
+		ocfs2_reinit_path(path, 1);
+
+		cpos = le32_to_cpu(split_rec.e_cpos);
+		ret = ocfs2_find_path(et->et_ci, path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		el = path_leaf_el(path);
+		split_index = ocfs2_search_extent_list(el, cpos);
+		if (split_index == -1) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+					"Owner %llu has an extent at cpos %u "
+					"which can no longer be found.\n",
+					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+					cpos);
+			ret = -EROFS;
+			goto out;
+		}
+		goto leftright;
+	}
+out:
+
+	return ret;
+}
+
+static int ocfs2_replace_extent_rec(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    struct ocfs2_path *path,
+				    struct ocfs2_extent_list *el,
+				    int split_index,
+				    struct ocfs2_extent_rec *split_rec)
+{
+	int ret;
+
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
+					   path_num_items(path) - 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el->l_recs[split_index] = *split_rec;
+
+	ocfs2_journal_dirty(handle, path_leaf_bh(path));
+out:
+	return ret;
+}
+
+/*
+ * Split part or all of the extent record at split_index in the leaf
+ * pointed to by path. Merge with the contiguous extent record if needed.
+ *
+ * Care is taken to handle contiguousness so as to not grow the tree.
+ *
+ * meta_ac is not strictly necessary - we only truly need it if growth
+ * of the tree is required. All other cases will degrade into a less
+ * optimal tree layout.
+ *
+ * last_eb_bh should be the rightmost leaf block for any extent
+ * btree. Since a split may grow the tree or a merge might shrink it,
+ * the caller cannot trust the contents of that buffer after this call.
+ *
+ * This code is optimized for readability - several passes might be
+ * made over certain portions of the tree. All of those blocks will
+ * have been brought into cache (and pinned via the journal), so the
+ * extra overhead is not expressed in terms of disk reads.
+ */
+int ocfs2_split_extent(handle_t *handle,
+		       struct ocfs2_extent_tree *et,
+		       struct ocfs2_path *path,
+		       int split_index,
+		       struct ocfs2_extent_rec *split_rec,
+		       struct ocfs2_alloc_context *meta_ac,
+		       struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	struct ocfs2_extent_list *el = path_leaf_el(path);
+	struct buffer_head *last_eb_bh = NULL;
+	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+	struct ocfs2_merge_ctxt ctxt;
+	struct ocfs2_extent_list *rightmost_el;
+
+	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
+	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
+	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
+							    split_index,
+							    split_rec);
+
+	/*
+	 * The core merge / split code wants to know how much room is
+	 * left in this allocation tree, so we pass the
+	 * rightmost extent list.
+	 */
+	if (path->p_tree_depth) {
+		struct ocfs2_extent_block *eb;
+
+		ret = ocfs2_read_extent_block(et->et_ci,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &last_eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		rightmost_el = &eb->h_list;
+	} else
+		rightmost_el = path_root_el(path);
+
+	if (rec->e_cpos == split_rec->e_cpos &&
+	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
+		ctxt.c_split_covers_rec = 1;
+	else
+		ctxt.c_split_covers_rec = 0;
+
+	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+	trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
+				 ctxt.c_has_empty_extent,
+				 ctxt.c_split_covers_rec);
+
+	if (ctxt.c_contig_type == CONTIG_NONE) {
+		if (ctxt.c_split_covers_rec)
+			ret = ocfs2_replace_extent_rec(handle, et, path, el,
+						       split_index, split_rec);
+		else
+			ret = ocfs2_split_and_insert(handle, et, path,
+						     &last_eb_bh, split_index,
+						     split_rec, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+	} else {
+		ret = ocfs2_try_to_merge_extent(handle, et, path,
+						split_index, split_rec,
+						dealloc, &ctxt);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	brelse(last_eb_bh);
+	return ret;
+}
+
+/*
+ * Change the flags of the already-existing extent at cpos for len clusters.
+ *
+ * new_flags: the flags we want to set.
+ * clear_flags: the flags we want to clear.
+ * phys: the new physical offset we want this new extent starts from.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_change_extent_flag(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 len, u32 phys,
+			     struct ocfs2_alloc_context *meta_ac,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     int new_flags, int clear_flags)
+{
+	int ret, index;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
+	struct ocfs2_extent_rec split_rec;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	left_path = ocfs2_new_path_from_et(et);
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, left_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	el = path_leaf_el(left_path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1) {
+		ocfs2_error(sb,
+			    "Owner %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			     (unsigned long long)
+			     ocfs2_metadata_cache_owner(et->et_ci), cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = -EIO;
+	rec = &el->l_recs[index];
+	if (new_flags && (rec->e_flags & new_flags)) {
+		mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
+		     "extent that already had them",
+		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		     new_flags);
+		goto out;
+	}
+
+	if (clear_flags && !(rec->e_flags & clear_flags)) {
+		mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
+		     "extent that didn't have them",
+		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		     clear_flags);
+		goto out;
+	}
+
+	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
+	split_rec.e_cpos = cpu_to_le32(cpos);
+	split_rec.e_leaf_clusters = cpu_to_le16(len);
+	split_rec.e_blkno = cpu_to_le64(start_blkno);
+	split_rec.e_flags = rec->e_flags;
+	if (new_flags)
+		split_rec.e_flags |= new_flags;
+	if (clear_flags)
+		split_rec.e_flags &= ~clear_flags;
+
+	ret = ocfs2_split_extent(handle, et, left_path,
+				 index, &split_rec, meta_ac,
+				 dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(left_path);
+	return ret;
+
+}
+
+/*
+ * Mark the already-existing extent at cpos as written for len clusters.
+ * This removes the unwritten extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
+			      handle_t *handle, u32 cpos, u32 len, u32 phys,
+			      struct ocfs2_alloc_context *meta_ac,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+
+	trace_ocfs2_mark_extent_written(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		cpos, len, phys);
+
+	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+			    "that are being written to, but the feature bit "
+			    "is not set in the super block.",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * XXX: This should be fixed up so that we just re-insert the
+	 * next extent records.
+	 */
+	ocfs2_et_extent_map_truncate(et, 0);
+
+	ret = ocfs2_change_extent_flag(handle, et, cpos,
+				       len, phys, meta_ac, dealloc,
+				       0, OCFS2_EXT_UNWRITTEN);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+			    struct ocfs2_path *path,
+			    int index, u32 new_range,
+			    struct ocfs2_alloc_context *meta_ac)
+{
+	int ret, depth, credits;
+	struct buffer_head *last_eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *rightmost_el, *el;
+	struct ocfs2_extent_rec split_rec;
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_insert_type insert;
+
+	/*
+	 * Setup the record to split before we grow the tree.
+	 */
+	el = path_leaf_el(path);
+	rec = &el->l_recs[index];
+	ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+				   &split_rec, new_range, rec);
+
+	depth = path->p_tree_depth;
+	if (depth > 0) {
+		ret = ocfs2_read_extent_block(et->et_ci,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &last_eb_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		rightmost_el = &eb->h_list;
+	} else
+		rightmost_el = path_leaf_el(path);
+
+	credits = path->p_tree_depth +
+		  ocfs2_extend_meta_needed(et->et_root_el);
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+	    le16_to_cpu(rightmost_el->l_count)) {
+		ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
+				      meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+	insert.ins_appending = APPEND_NONE;
+	insert.ins_contig = CONTIG_NONE;
+	insert.ins_split = SPLIT_RIGHT;
+	insert.ins_tree_depth = depth;
+
+	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(last_eb_bh);
+	return ret;
+}
+
+static int ocfs2_truncate_rec(handle_t *handle,
+			      struct ocfs2_extent_tree *et,
+			      struct ocfs2_path *path, int index,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc,
+			      u32 cpos, u32 len)
+{
+	int ret;
+	u32 left_cpos, rec_range, trunc_range;
+	int wants_rotate = 0, is_rightmost_tree_rec = 0;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_list *el = path_leaf_el(path);
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_extent_block *eb;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		index--;
+	}
+
+	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
+	    path->p_tree_depth) {
+		/*
+		 * Check whether this is the rightmost tree record. If
+		 * we remove all of this record or part of its right
+		 * edge then an update of the record lengths above it
+		 * will be required.
+		 */
+		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+		if (eb->h_next_leaf_blk == 0)
+			is_rightmost_tree_rec = 1;
+	}
+
+	rec = &el->l_recs[index];
+	if (index == 0 && path->p_tree_depth &&
+	    le32_to_cpu(rec->e_cpos) == cpos) {
+		/*
+		 * Changing the leftmost offset (via partial or whole
+		 * record truncate) of an interior (or rightmost) path
+		 * means we have to update the subtree that is formed
+		 * by this leaf and the one to it's left.
+		 *
+		 * There are two cases we can skip:
+		 *   1) Path is the leftmost one in our btree.
+		 *   2) The leaf is rightmost and will be empty after
+		 *      we remove the extent record - the rotate code
+		 *      knows how to update the newly formed edge.
+		 */
+
+		ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
+			left_path = ocfs2_new_path_from_path(path);
+			if (!left_path) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_find_path(et->et_ci, left_path,
+					      left_cpos);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+	}
+
+	ret = ocfs2_extend_rotate_transaction(handle, 0,
+					      handle->h_buffer_credits,
+					      path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+	trunc_range = cpos + len;
+
+	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
+		int next_free;
+
+		memset(rec, 0, sizeof(*rec));
+		ocfs2_cleanup_merge(el, index);
+		wants_rotate = 1;
+
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (is_rightmost_tree_rec && next_free > 1) {
+			/*
+			 * We skip the edge update if this path will
+			 * be deleted by the rotate code.
+			 */
+			rec = &el->l_recs[next_free - 1];
+			ocfs2_adjust_rightmost_records(handle, et, path,
+						       rec);
+		}
+	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
+		/* Remove leftmost portion of the record. */
+		le32_add_cpu(&rec->e_cpos, len);
+		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
+		le16_add_cpu(&rec->e_leaf_clusters, -len);
+	} else if (rec_range == trunc_range) {
+		/* Remove rightmost portion of the record */
+		le16_add_cpu(&rec->e_leaf_clusters, -len);
+		if (is_rightmost_tree_rec)
+			ocfs2_adjust_rightmost_records(handle, et, path, rec);
+	} else {
+		/* Caller should have trapped this. */
+		mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
+		     "(%u, %u)\n",
+		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		     le32_to_cpu(rec->e_cpos),
+		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
+		BUG();
+	}
+
+	if (left_path) {
+		int subtree_index;
+
+		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
+		ocfs2_complete_edge_insert(handle, left_path, path,
+					   subtree_index);
+	}
+
+	ocfs2_journal_dirty(handle, path_leaf_bh(path));
+
+	ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	ocfs2_free_path(left_path);
+	return ret;
+}
+
+int ocfs2_remove_extent(handle_t *handle,
+			struct ocfs2_extent_tree *et,
+			u32 cpos, u32 len,
+			struct ocfs2_alloc_context *meta_ac,
+			struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, index;
+	u32 rec_range, trunc_range;
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *path = NULL;
+
+	/*
+	 * XXX: Why are we truncating to 0 instead of wherever this
+	 * affects us?
+	 */
+	ocfs2_et_extent_map_truncate(et, 0);
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1) {
+		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+			    "Owner %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+			    cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * We have 3 cases of extent removal:
+	 *   1) Range covers the entire extent rec
+	 *   2) Range begins or ends on one edge of the extent rec
+	 *   3) Range is in the middle of the extent rec (no shared edges)
+	 *
+	 * For case 1 we remove the extent rec and left rotate to
+	 * fill the hole.
+	 *
+	 * For case 2 we just shrink the existing extent rec, with a
+	 * tree update if the shrinking edge is also the edge of an
+	 * extent block.
+	 *
+	 * For case 3 we do a right split to turn the extent rec into
+	 * something case 2 can handle.
+	 */
+	rec = &el->l_recs[index];
+	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+	trunc_range = cpos + len;
+
+	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
+
+	trace_ocfs2_remove_extent(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, len, index, le32_to_cpu(rec->e_cpos),
+		ocfs2_rec_clusters(el, rec));
+
+	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
+		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+					 cpos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else {
+		ret = ocfs2_split_tree(handle, et, path, index,
+				       trunc_range, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * The split could have manipulated the tree enough to
+		 * move the record location, so we have to look for it again.
+		 */
+		ocfs2_reinit_path(path, 1);
+
+		ret = ocfs2_find_path(et->et_ci, path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		el = path_leaf_el(path);
+		index = ocfs2_search_extent_list(el, cpos);
+		if (index == -1) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu: split at cpos %u lost record.",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos);
+			ret = -EROFS;
+			goto out;
+		}
+
+		/*
+		 * Double check our values here. If anything is fishy,
+		 * it's easier to catch it at the top level.
+		 */
+		rec = &el->l_recs[index];
+		rec_range = le32_to_cpu(rec->e_cpos) +
+			ocfs2_rec_clusters(el, rec);
+		if (rec_range != trunc_range) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu: error after split at cpos %u"
+				    "trunc len %u, existing record is (%u,%u)",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos, len, le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
+
+		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+					 cpos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
+ * same as ocfs2_lock_alloctors(), except for it accepts a blocks
+ * number to reserve some extra blocks, and it only handles meta
+ * data allocations.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */
+static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
+					      struct ocfs2_extent_tree *et,
+					      u32 extents_to_split,
+					      struct ocfs2_alloc_context **ac,
+					      int extra_blocks)
+{
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = 2 * extents_to_split;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*ac = NULL;
+
+	num_free_extents = ocfs2_num_free_extents(osb, et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+	if (extra_blocks) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	if (ret) {
+		if (*ac) {
+			ocfs2_free_alloc_context(*ac);
+			*ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len, int flags,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     u64 refcount_loc, bool refcount_tree_locked)
+{
+	int ret, credits = 0, extra_blocks = 0;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+
+	if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		if (!refcount_tree_locked) {
+			ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+						       &ref_tree, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto bail;
+			}
+		}
+
+		ret = ocfs2_prepare_refcount_change_for_del(inode,
+							    refcount_loc,
+							    phys_blkno,
+							    len,
+							    &credits,
+							    &extra_blocks);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+	}
+
+	ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
+						 extra_blocks);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb,
+			ocfs2_remove_extent_credits(osb->sb) + credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_et_root_journal_access(handle, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	dquot_free_space_nodirty(inode,
+				  ocfs2_clusters_to_bytes(inode->i_sb, len));
+
+	ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_et_update_clusters(et, -len);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+
+	ocfs2_journal_dirty(handle, et->et_root_bh);
+
+	if (phys_blkno) {
+		if (flags & OCFS2_EXT_REFCOUNTED)
+			ret = ocfs2_decrease_refcount(inode, handle,
+					ocfs2_blocks_to_clusters(osb->sb,
+								 phys_blkno),
+					len, meta_ac,
+					dealloc, 1);
+		else
+			ret = ocfs2_truncate_log_append(osb, handle,
+							phys_blkno, len);
+		if (ret)
+			mlog_errno(ret);
+
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+bail:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+	return ret;
+}
+
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+
+	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
+			"slot %d, invalid truncate log parameters: used = "
+			"%u, count = %u\n", osb->slot_num,
+			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
+	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
+}
+
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+					   unsigned int new_start)
+{
+	unsigned int tail_index;
+	unsigned int current_tail;
+
+	/* No records, nothing to coalesce */
+	if (!le16_to_cpu(tl->tl_used))
+		return 0;
+
+	tail_index = le16_to_cpu(tl->tl_used) - 1;
+	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
+	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
+
+	return current_tail == new_start;
+}
+
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+			      handle_t *handle,
+			      u64 start_blk,
+			      unsigned int num_clusters)
+{
+	int status, index;
+	unsigned int start_cluster, tl_count;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+
+	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+
+	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+	 * by the underlying call to ocfs2_read_inode_block(), so any
+	 * corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
+	tl_count = le16_to_cpu(tl->tl_count);
+	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+			tl_count == 0,
+			"Truncate record count on #%llu invalid "
+			"wanted %u, actual %u\n",
+			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+			ocfs2_truncate_recs_per_inode(osb->sb),
+			le16_to_cpu(tl->tl_count));
+
+	/* Caller should have known to flush before calling us. */
+	index = le16_to_cpu(tl->tl_used);
+	if (index >= tl_count) {
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_truncate_log_append(
+		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
+		start_cluster, num_clusters);
+	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
+		/*
+		 * Move index back to the record we are coalescing with.
+		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
+		 */
+		index--;
+
+		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
+		trace_ocfs2_truncate_log_append(
+			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+			index, le32_to_cpu(tl->tl_recs[index].t_start),
+			num_clusters);
+	} else {
+		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
+		tl->tl_used = cpu_to_le16(index + 1);
+	}
+	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
+
+	ocfs2_journal_dirty(handle, tl_bh);
+
+	osb->truncated_clusters += num_clusters;
+bail:
+	return status;
+}
+
+static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
+					 handle_t *handle,
+					 struct inode *data_alloc_inode,
+					 struct buffer_head *data_alloc_bh)
+{
+	int status = 0;
+	int i;
+	unsigned int num_clusters;
+	u64 start_blk;
+	struct ocfs2_truncate_rec rec;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	i = le16_to_cpu(tl->tl_used) - 1;
+	while (i >= 0) {
+		/* Caller has given us at least enough credits to
+		 * update the truncate log dinode */
+		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		tl->tl_used = cpu_to_le16(i);
+
+		ocfs2_journal_dirty(handle, tl_bh);
+
+		/* TODO: Perhaps we can calculate the bulk of the
+		 * credits up front rather than extending like
+		 * this. */
+		status = ocfs2_extend_trans(handle,
+					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		rec = tl->tl_recs[i];
+		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
+						    le32_to_cpu(rec.t_start));
+		num_clusters = le32_to_cpu(rec.t_clusters);
+
+		/* if start_blk is not set, we ignore the record as
+		 * invalid. */
+		if (start_blk) {
+			trace_ocfs2_replay_truncate_records(
+				(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+				i, le32_to_cpu(rec.t_start), num_clusters);
+
+			status = ocfs2_free_clusters(handle, data_alloc_inode,
+						     data_alloc_bh, start_blk,
+						     num_clusters);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		i--;
+	}
+
+	osb->truncated_clusters = 0;
+
+bail:
+	return status;
+}
+
+/* Expects you to already be holding tl_inode->i_mutex */
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	unsigned int num_to_flush;
+	handle_t *handle;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct inode *data_alloc_inode = NULL;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct buffer_head *data_alloc_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+
+	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+	 * by the underlying call to ocfs2_read_inode_block(), so any
+	 * corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
+	num_to_flush = le16_to_cpu(tl->tl_used);
+	trace_ocfs2_flush_truncate_log(
+		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+		num_to_flush);
+	if (!num_to_flush) {
+		status = 0;
+		goto out;
+	}
+
+	data_alloc_inode = ocfs2_get_system_file_inode(osb,
+						       GLOBAL_BITMAP_SYSTEM_INODE,
+						       OCFS2_INVALID_SLOT);
+	if (!data_alloc_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto out;
+	}
+
+	mutex_lock(&data_alloc_inode->i_mutex);
+
+	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_unlock;
+	}
+
+	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+					       data_alloc_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	brelse(data_alloc_bh);
+	ocfs2_inode_unlock(data_alloc_inode, 1);
+
+out_mutex:
+	mutex_unlock(&data_alloc_inode->i_mutex);
+	iput(data_alloc_inode);
+
+out:
+	return status;
+}
+
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	mutex_lock(&tl_inode->i_mutex);
+	status = __ocfs2_flush_truncate_log(osb);
+	mutex_unlock(&tl_inode->i_mutex);
+
+	return status;
+}
+
+static void ocfs2_truncate_log_worker(struct work_struct *work)
+{
+	int status;
+	struct ocfs2_super *osb =
+		container_of(work, struct ocfs2_super,
+			     osb_truncate_log_wq.work);
+
+	status = ocfs2_flush_truncate_log(osb);
+	if (status < 0)
+		mlog_errno(status);
+	else
+		ocfs2_init_steal_slots(osb);
+}
+
+#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel)
+{
+	if (osb->osb_tl_inode &&
+			atomic_read(&osb->osb_tl_disable) == 0) {
+		/* We want to push off log flushes while truncates are
+		 * still running. */
+		if (cancel)
+			cancel_delayed_work(&osb->osb_truncate_log_wq);
+
+		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
+	}
+}
+
+static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
+				       int slot_num,
+				       struct inode **tl_inode,
+				       struct buffer_head **tl_bh)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct buffer_head *bh = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					   TRUNCATE_LOG_SYSTEM_INODE,
+					   slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
+		goto bail;
+	}
+
+	status = ocfs2_read_inode_block(inode, &bh);
+	if (status < 0) {
+		iput(inode);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*tl_inode = inode;
+	*tl_bh    = bh;
+bail:
+	return status;
+}
+
+/* called during the 1st stage of node recovery. we stamp a clean
+ * truncate log and pass back a copy for processing later. if the
+ * truncate log does not require processing, a *tl_copy is set to
+ * NULL. */
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	*tl_copy = NULL;
+
+	trace_ocfs2_begin_truncate_log_recovery(slot_num);
+
+	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+
+	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
+	 * validated by the underlying call to ocfs2_read_inode_block(),
+	 * so any corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
+	if (le16_to_cpu(tl->tl_used)) {
+		trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
+
+		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
+		if (!(*tl_copy)) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Assuming the write-out below goes well, this copy
+		 * will be passed back to recovery for processing. */
+		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
+
+		/* All we need to do to clear the truncate log is set
+		 * tl_used. */
+		tl->tl_used = 0;
+
+		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
+		status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+bail:
+	if (tl_inode)
+		iput(tl_inode);
+	brelse(tl_bh);
+
+	if (status < 0 && (*tl_copy)) {
+		kfree(*tl_copy);
+		*tl_copy = NULL;
+		mlog_errno(status);
+	}
+
+	return status;
+}
+
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy)
+{
+	int status = 0;
+	int i;
+	unsigned int clusters, num_recs, start_cluster;
+	u64 start_blk;
+	handle_t *handle;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_truncate_log *tl;
+
+	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
+		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
+		return -EINVAL;
+	}
+
+	tl = &tl_copy->id2.i_dealloc;
+	num_recs = le16_to_cpu(tl->tl_used);
+	trace_ocfs2_complete_truncate_log_recovery(
+		(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
+		num_recs);
+
+	mutex_lock(&tl_inode->i_mutex);
+	for(i = 0; i < num_recs; i++) {
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			status = __ocfs2_flush_truncate_log(osb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail_up;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_up;
+		}
+
+		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
+		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
+		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
+
+		status = ocfs2_truncate_log_append(osb, handle,
+						   start_blk, clusters);
+		ocfs2_commit_trans(osb, handle);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_up;
+		}
+	}
+
+bail_up:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	return status;
+}
+
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	atomic_set(&osb->osb_tl_disable, 1);
+
+	if (tl_inode) {
+		cancel_delayed_work(&osb->osb_truncate_log_wq);
+		flush_workqueue(ocfs2_wq);
+
+		status = ocfs2_flush_truncate_log(osb);
+		if (status < 0)
+			mlog_errno(status);
+
+		brelse(osb->osb_tl_bh);
+		iput(osb->osb_tl_inode);
+	}
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+
+	status = ocfs2_get_truncate_log_info(osb,
+					     osb->slot_num,
+					     &tl_inode,
+					     &tl_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* ocfs2_truncate_log_shutdown keys on the existence of
+	 * osb->osb_tl_inode so we don't set any of the osb variables
+	 * until we're sure all is well. */
+	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
+			  ocfs2_truncate_log_worker);
+	atomic_set(&osb->osb_tl_disable, 0);
+	osb->osb_tl_bh    = tl_bh;
+	osb->osb_tl_inode = tl_inode;
+
+	return status;
+}
+
+/*
+ * Delayed de-allocation of suballocator blocks.
+ *
+ * Some sets of block de-allocations might involve multiple suballocator inodes.
+ *
+ * The locking for this can get extremely complicated, especially when
+ * the suballocator inodes to delete from aren't known until deep
+ * within an unrelated codepath.
+ *
+ * ocfs2_extent_block structures are a good example of this - an inode
+ * btree could have been grown by any number of nodes each allocating
+ * out of their own suballoc inode.
+ *
+ * These structures allow the delay of block de-allocation until a
+ * later time, when locking of multiple cluster inodes won't cause
+ * deadlock.
+ */
+
+/*
+ * Describe a single bit freed from a suballocator.  For the block
+ * suballocators, it represents one block.  For the global cluster
+ * allocator, it represents some clusters and free_bit indicates
+ * clusters number.
+ */
+struct ocfs2_cached_block_free {
+	struct ocfs2_cached_block_free		*free_next;
+	u64					free_bg;
+	u64					free_blk;
+	unsigned int				free_bit;
+};
+
+struct ocfs2_per_slot_free_list {
+	struct ocfs2_per_slot_free_list		*f_next_suballocator;
+	int					f_inode_type;
+	int					f_slot;
+	struct ocfs2_cached_block_free		*f_first;
+};
+
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
+				    int sysfile_type,
+				    int slot,
+				    struct ocfs2_cached_block_free *head)
+{
+	int ret;
+	u64 bg_blkno;
+	handle_t *handle;
+	struct inode *inode;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_cached_block_free *tmp;
+
+	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
+	if (!inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	while (head) {
+		if (head->free_bg)
+			bg_blkno = head->free_bg;
+		else
+			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+							      head->free_bit);
+		trace_ocfs2_free_cached_blocks(
+		     (unsigned long long)head->free_blk, head->free_bit);
+
+		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
+					       head->free_bit, bg_blkno, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_journal;
+		}
+
+		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_journal;
+		}
+
+		tmp = head;
+		head = head->free_next;
+		kfree(tmp);
+	}
+
+out_journal:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	iput(inode);
+out:
+	while(head) {
+		/* Premature exit may have left some dangling items. */
+		tmp = head;
+		head = head->free_next;
+		kfree(tmp);
+	}
+
+	return ret;
+}
+
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+				u64 blkno, unsigned int bit)
+{
+	int ret = 0;
+	struct ocfs2_cached_block_free *item;
+
+	item = kzalloc(sizeof(*item), GFP_NOFS);
+	if (item == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
+
+	item->free_blk = blkno;
+	item->free_bit = bit;
+	item->free_next = ctxt->c_global_allocator;
+
+	ctxt->c_global_allocator = item;
+	return ret;
+}
+
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+				      struct ocfs2_cached_block_free *head)
+{
+	struct ocfs2_cached_block_free *tmp;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	int ret = 0;
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	while (head) {
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			ret = __ocfs2_flush_truncate_log(osb);
+			if (ret < 0) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
+						head->free_bit);
+
+		ocfs2_commit_trans(osb, handle);
+		tmp = head;
+		head = head->free_next;
+		kfree(tmp);
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	while (head) {
+		/* Premature exit may have left some dangling items. */
+		tmp = head;
+		head = head->free_next;
+		kfree(tmp);
+	}
+
+	return ret;
+}
+
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+		       struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+	int ret = 0, ret2;
+	struct ocfs2_per_slot_free_list *fl;
+
+	if (!ctxt)
+		return 0;
+
+	while (ctxt->c_first_suballocator) {
+		fl = ctxt->c_first_suballocator;
+
+		if (fl->f_first) {
+			trace_ocfs2_run_deallocs(fl->f_inode_type,
+						 fl->f_slot);
+			ret2 = ocfs2_free_cached_blocks(osb,
+							fl->f_inode_type,
+							fl->f_slot,
+							fl->f_first);
+			if (ret2)
+				mlog_errno(ret2);
+			if (!ret)
+				ret = ret2;
+		}
+
+		ctxt->c_first_suballocator = fl->f_next_suballocator;
+		kfree(fl);
+	}
+
+	if (ctxt->c_global_allocator) {
+		ret2 = ocfs2_free_cached_clusters(osb,
+						  ctxt->c_global_allocator);
+		if (ret2)
+			mlog_errno(ret2);
+		if (!ret)
+			ret = ret2;
+
+		ctxt->c_global_allocator = NULL;
+	}
+
+	return ret;
+}
+
+static struct ocfs2_per_slot_free_list *
+ocfs2_find_per_slot_free_list(int type,
+			      int slot,
+			      struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
+
+	while (fl) {
+		if (fl->f_inode_type == type && fl->f_slot == slot)
+			return fl;
+
+		fl = fl->f_next_suballocator;
+	}
+
+	fl = kmalloc(sizeof(*fl), GFP_NOFS);
+	if (fl) {
+		fl->f_inode_type = type;
+		fl->f_slot = slot;
+		fl->f_first = NULL;
+		fl->f_next_suballocator = ctxt->c_first_suballocator;
+
+		ctxt->c_first_suballocator = fl;
+	}
+	return fl;
+}
+
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+			      int type, int slot, u64 suballoc,
+			      u64 blkno, unsigned int bit)
+{
+	int ret;
+	struct ocfs2_per_slot_free_list *fl;
+	struct ocfs2_cached_block_free *item;
+
+	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
+	if (fl == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	item = kzalloc(sizeof(*item), GFP_NOFS);
+	if (item == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_cache_block_dealloc(type, slot,
+					(unsigned long long)suballoc,
+					(unsigned long long)blkno, bit);
+
+	item->free_bg = suballoc;
+	item->free_blk = blkno;
+	item->free_bit = bit;
+	item->free_next = fl->f_first;
+
+	fl->f_first = item;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+					 struct ocfs2_extent_block *eb)
+{
+	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
+					 le16_to_cpu(eb->h_suballoc_slot),
+					 le64_to_cpu(eb->h_suballoc_loc),
+					 le64_to_cpu(eb->h_blkno),
+					 le16_to_cpu(eb->h_suballoc_bit));
+}
+
+static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	return 0;
+}
+
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys)
+{
+	int ret, partial = 0;
+
+	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
+	if (ret)
+		mlog_errno(ret);
+
+	if (zero)
+		zero_user_segment(page, from, to);
+
+	/*
+	 * Need to set the buffers we zero'd into uptodate
+	 * here if they aren't - ocfs2_map_page_blocks()
+	 * might've skipped some
+	 */
+	ret = walk_page_buffers(handle, page_buffers(page),
+				from, to, &partial,
+				ocfs2_zero_func);
+	if (ret < 0)
+		mlog_errno(ret);
+	else if (ocfs2_should_order_data(inode)) {
+		ret = ocfs2_jbd2_file_inode(handle, inode);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	if (!partial)
+		SetPageUptodate(page);
+
+	flush_dcache_page(page);
+}
+
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
+				     loff_t end, struct page **pages,
+				     int numpages, u64 phys, handle_t *handle)
+{
+	int i;
+	struct page *page;
+	unsigned int from, to = PAGE_CACHE_SIZE;
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+	if (numpages == 0)
+		goto out;
+
+	to = PAGE_CACHE_SIZE;
+	for(i = 0; i < numpages; i++) {
+		page = pages[i];
+
+		from = start & (PAGE_CACHE_SIZE - 1);
+		if ((end >> PAGE_CACHE_SHIFT) == page->index)
+			to = end & (PAGE_CACHE_SIZE - 1);
+
+		BUG_ON(from > PAGE_CACHE_SIZE);
+		BUG_ON(to > PAGE_CACHE_SIZE);
+
+		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
+					 &phys);
+
+		start = (page->index + 1) << PAGE_CACHE_SHIFT;
+	}
+out:
+	if (pages)
+		ocfs2_unlock_and_free_pages(pages, numpages);
+}
+
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num)
+{
+	int numpages, ret = 0;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long index;
+	loff_t last_page_bytes;
+
+	BUG_ON(start > end);
+
+	numpages = 0;
+	last_page_bytes = PAGE_ALIGN(end);
+	index = start >> PAGE_CACHE_SHIFT;
+	do {
+		pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
+		if (!pages[numpages]) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		numpages++;
+		index++;
+	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
+
+out:
+	if (ret != 0) {
+		if (pages)
+			ocfs2_unlock_and_free_pages(pages, numpages);
+		numpages = 0;
+	}
+
+	*num = numpages;
+
+	return ret;
+}
+
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+				struct page **pages, int *num)
+{
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+	return ocfs2_grab_pages(inode, start, end, pages, num);
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+				  u64 range_start, u64 range_end)
+{
+	int ret = 0, numpages;
+	struct page **pages = NULL;
+	u64 phys;
+	unsigned int ext_flags;
+	struct super_block *sb = inode->i_sb;
+
+	/*
+	 * File systems which don't support sparse files zero on every
+	 * extend.
+	 */
+	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
+		return 0;
+
+	pages = kcalloc(ocfs2_pages_per_cluster(sb),
+			sizeof(struct page *), GFP_NOFS);
+	if (pages == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (range_start == range_end)
+		goto out;
+
+	ret = ocfs2_extent_map_get_blocks(inode,
+					  range_start >> sb->s_blocksize_bits,
+					  &phys, NULL, &ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Tail is a hole, or is marked unwritten. In either case, we
+	 * can count on read and write to return/push zero's.
+	 */
+	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
+		goto out;
+
+	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
+				   &numpages);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
+				 numpages, phys, handle);
+
+	/*
+	 * Initiate writeout of the pages we zero'd here. We don't
+	 * wait on them - the truncate_inode_pages() call later will
+	 * do that for us.
+	 */
+	ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
+				       range_end - 1);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	kfree(pages);
+
+	return ret;
+}
+
+static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
+					     struct ocfs2_dinode *di)
+{
+	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		memset(&di->id2, 0, blocksize -
+				    offsetof(struct ocfs2_dinode, id2) -
+				    xattrsize);
+	else
+		memset(&di->id2, 0, blocksize -
+				    offsetof(struct ocfs2_dinode, id2));
+}
+
+void ocfs2_dinode_new_extent_list(struct inode *inode,
+				  struct ocfs2_dinode *di)
+{
+	ocfs2_zero_dinode_id2_with_xattr(inode, di);
+	di->id2.i_list.l_tree_depth = 0;
+	di->id2.i_list.l_next_free_rec = 0;
+	di->id2.i_list.l_count = cpu_to_le16(
+		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
+}
+
+void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_inline_data *idata = &di->id2.i_data;
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	/*
+	 * We clear the entire i_data structure here so that all
+	 * fields can be properly initialized.
+	 */
+	ocfs2_zero_dinode_id2_with_xattr(inode, di);
+
+	idata->id_count = cpu_to_le16(
+			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
+}
+
+int ocfs2_convert_inline_data_to_extents(struct inode *inode,
+					 struct buffer_head *di_bh)
+{
+	int ret, i, has_data, num_pages = 0;
+	int need_free = 0;
+	u32 bit_off, num;
+	handle_t *handle;
+	u64 uninitialized_var(block);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct page **pages = NULL;
+	loff_t end = osb->s_clustersize;
+	struct ocfs2_extent_tree et;
+	int did_quota = 0;
+
+	has_data = i_size_read(inode) ? 1 : 0;
+
+	if (has_data) {
+		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
+				sizeof(struct page *), GFP_NOFS);
+		if (pages == NULL) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto free_pages;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb,
+				   ocfs2_inline_to_extents_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (has_data) {
+		unsigned int page_end;
+		u64 phys;
+
+		ret = dquot_alloc_space_nodirty(inode,
+				       ocfs2_clusters_to_bytes(osb->sb, 1));
+		if (ret)
+			goto out_commit;
+		did_quota = 1;
+
+		data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
+					   &num);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		/*
+		 * Save two copies, one for insert, and one that can
+		 * be changed by ocfs2_map_and_dirty_page() below.
+		 */
+		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+		/*
+		 * Non sparse file systems zero on extend, so no need
+		 * to do that now.
+		 */
+		if (!ocfs2_sparse_alloc(osb) &&
+		    PAGE_CACHE_SIZE < osb->s_clustersize)
+			end = PAGE_CACHE_SIZE;
+
+		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
+		if (ret) {
+			mlog_errno(ret);
+			need_free = 1;
+			goto out_commit;
+		}
+
+		/*
+		 * This should populate the 1st page for us and mark
+		 * it up to date.
+		 */
+		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			need_free = 1;
+			goto out_unlock;
+		}
+
+		page_end = PAGE_CACHE_SIZE;
+		if (PAGE_CACHE_SIZE > osb->s_clustersize)
+			page_end = osb->s_clustersize;
+
+		for (i = 0; i < num_pages; i++)
+			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
+						 pages[i], i > 0, &phys);
+	}
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+	ocfs2_dinode_new_extent_list(inode, di);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	if (has_data) {
+		/*
+		 * An error at this point should be extremely rare. If
+		 * this proves to be false, we could always re-build
+		 * the in-inode data from our pages.
+		 */
+		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			need_free = 1;
+			goto out_unlock;
+		}
+
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+	}
+
+out_unlock:
+	if (pages)
+		ocfs2_unlock_and_free_pages(pages, num_pages);
+
+out_commit:
+	if (ret < 0 && did_quota)
+		dquot_free_space_nodirty(inode,
+					  ocfs2_clusters_to_bytes(osb->sb, 1));
+
+	if (need_free) {
+		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					bit_off, num);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, bit_off),
+					num);
+	}
+
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+free_pages:
+	kfree(pages);
+	return ret;
+}
+
+/*
+ * It is expected, that by the time you call this function,
+ * inode->i_size and fe->i_size have been adjusted.
+ *
+ * WARNING: This will kfree the truncate context
+ */
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *di_bh)
+{
+	int status = 0, i, flags = 0;
+	u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
+	u64 blkno = 0;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+	u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+	struct ocfs2_extent_tree et;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
+						     i_size_read(inode));
+
+	path = ocfs2_new_path(di_bh, &di->id2.i_list,
+			      ocfs2_journal_access_di);
+	if (!path) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_extent_map_trunc(inode, new_highest_cpos);
+
+start:
+	/*
+	 * Check that we still have allocation to delete.
+	 */
+	if (OCFS2_I(inode)->ip_clusters == 0) {
+		status = 0;
+		goto bail;
+	}
+
+	/*
+	 * Truncate always works against the rightmost tree branch.
+	 */
+	status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_commit_truncate(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		new_highest_cpos,
+		OCFS2_I(inode)->ip_clusters,
+		path->p_tree_depth);
+
+	/*
+	 * By now, el will point to the extent list on the bottom most
+	 * portion of this tree. Only the tail record is considered in
+	 * each pass.
+	 *
+	 * We handle the following cases, in order:
+	 * - empty extent: delete the remaining branch
+	 * - remove the entire record
+	 * - remove a partial record
+	 * - no record needs to be removed (truncate has completed)
+	 */
+	el = path_leaf_el(path);
+	if (le16_to_cpu(el->l_next_free_rec) == 0) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has empty extent block at %llu\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
+		status = -EROFS;
+		goto bail;
+	}
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	rec = &el->l_recs[i];
+	flags = rec->e_flags;
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	if (i == 0 && ocfs2_is_empty_extent(rec)) {
+		/*
+		 * Lower levels depend on this never happening, but it's best
+		 * to check it up here before changing the tree.
+		*/
+		if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+			ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+				    "extent record, depth %u\n", inode->i_ino,
+				    le16_to_cpu(root_el->l_tree_depth));
+			status = -EROFS;
+			goto bail;
+		}
+		trunc_cpos = le32_to_cpu(rec->e_cpos);
+		trunc_len = 0;
+		blkno = 0;
+	} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+		/*
+		 * Truncate entire record.
+		 */
+		trunc_cpos = le32_to_cpu(rec->e_cpos);
+		trunc_len = ocfs2_rec_clusters(el, rec);
+		blkno = le64_to_cpu(rec->e_blkno);
+	} else if (range > new_highest_cpos) {
+		/*
+		 * Partial truncate. it also should be
+		 * the last truncate we're doing.
+		 */
+		trunc_cpos = new_highest_cpos;
+		trunc_len = range - new_highest_cpos;
+		coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+		blkno = le64_to_cpu(rec->e_blkno) +
+				ocfs2_clusters_to_blocks(inode->i_sb, coff);
+	} else {
+		/*
+		 * Truncate completed, leave happily.
+		 */
+		status = 0;
+		goto bail;
+	}
+
+	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+	if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
+		status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+				&ref_tree, NULL);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+					  phys_cpos, trunc_len, flags, &dealloc,
+					  refcount_loc, true);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_reinit_path(path, 1);
+
+	/*
+	 * The check above will catch the case where we've truncated
+	 * away all allocation.
+	 */
+	goto start;
+
+bail:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	ocfs2_free_path(path);
+
+	return status;
+}
+
+/*
+ * 'start' is inclusive, 'end' is not.
+ */
+int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
+			  unsigned int start, unsigned int end, int trunc)
+{
+	int ret;
+	unsigned int numbytes;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inline_data *idata = &di->id2.i_data;
+
+	if (end > i_size_read(inode))
+		end = i_size_read(inode);
+
+	BUG_ON(start > end);
+
+	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
+	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
+	    !ocfs2_supports_inline_data(osb)) {
+		ocfs2_error(inode->i_sb,
+			    "Inline data flags for inode %llu don't agree! "
+			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    le16_to_cpu(di->i_dyn_features),
+			    OCFS2_I(inode)->ip_dyn_features,
+			    osb->s_feature_incompat);
+		ret = -EROFS;
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	numbytes = end - start;
+	memset(idata->id_data + start, 0, numbytes);
+
+	/*
+	 * No need to worry about the data page here - it's been
+	 * truncated already and inline data doesn't need it for
+	 * pushing zero's to disk, so we'll let readpage pick it up
+	 * later.
+	 */
+	if (trunc) {
+		i_size_write(inode, start);
+		di->i_size = cpu_to_le64(start);
+	}
+
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	return ret;
+}
+
+static int ocfs2_trim_extent(struct super_block *sb,
+			     struct ocfs2_group_desc *gd,
+			     u32 start, u32 count)
+{
+	u64 discard, bcount;
+
+	bcount = ocfs2_clusters_to_blocks(sb, count);
+	discard = le64_to_cpu(gd->bg_blkno) +
+			ocfs2_clusters_to_blocks(sb, start);
+
+	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
+
+	return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+			    struct ocfs2_group_desc *gd,
+			    u32 start, u32 max, u32 minbits)
+{
+	int ret = 0, count = 0, next;
+	void *bitmap = gd->bg_bitmap;
+
+	if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
+		return 0;
+
+	trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+			       start, max, minbits);
+
+	while (start < max) {
+		start = ocfs2_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = ocfs2_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minbits) {
+			ret = ocfs2_trim_extent(sb, gd,
+						start, next - start);
+			if (ret < 0) {
+				mlog_errno(ret);
+				break;
+			}
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+			break;
+	}
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 start, len, trimmed, first_group, last_group, group;
+	int ret, cnt;
+	u32 first_bit, last_bit, minlen;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_dinode *main_bm;
+	struct ocfs2_group_desc *gd = NULL;
+
+	start = range->start >> osb->s_clustersize_bits;
+	len = range->len >> osb->s_clustersize_bits;
+	minlen = range->minlen >> osb->s_clustersize_bits;
+
+	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
+		return -EINVAL;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+	if (start >= le32_to_cpu(main_bm->i_clusters)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	len = range->len >> osb->s_clustersize_bits;
+	if (start + len > le32_to_cpu(main_bm->i_clusters))
+		len = le32_to_cpu(main_bm->i_clusters) - start;
+
+	trace_ocfs2_trim_fs(start, len, minlen);
+
+	/* Determine first and last group to examine based on start and len */
+	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+	if (first_group == osb->first_cluster_group_blkno)
+		first_bit = start;
+	else
+		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+	last_bit = osb->bitmap_cpg;
+
+	trimmed = 0;
+	for (group = first_group; group <= last_group;) {
+		if (first_bit + len >= osb->bitmap_cpg)
+			last_bit = osb->bitmap_cpg;
+		else
+			last_bit = first_bit + len;
+
+		ret = ocfs2_read_group_descriptor(main_bm_inode,
+						  main_bm, group,
+						  &gd_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+		brelse(gd_bh);
+		gd_bh = NULL;
+		if (cnt < 0) {
+			ret = cnt;
+			mlog_errno(ret);
+			break;
+		}
+
+		trimmed += cnt;
+		len -= osb->bitmap_cpg - first_bit;
+		first_bit = 0;
+		if (group == osb->first_cluster_group_blkno)
+			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+		else
+			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+	}
+	range->len = trimmed * sb->s_blocksize;
+out_unlock:
+	ocfs2_inode_unlock(main_bm_inode, 0);
+	brelse(main_bm_bh);
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+out:
+	return ret;
+}
diff --git a/kernel/fs/ocfs2/alloc.h b/kernel/fs/ocfs2/alloc.h
new file mode 100644
index 000000000..fb09b97db
--- /dev/null
+++ b/kernel/fs/ocfs2/alloc.h
@@ -0,0 +1,324 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ALLOC_H
+#define OCFS2_ALLOC_H
+
+
+/*
+ * For xattr tree leaf, we limit the leaf byte size to be 64K.
+ */
+#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
+
+/*
+ * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
+ * the b-tree operations in ocfs2. Now all the b-tree operations are not
+ * limited to ocfs2_dinode only. Any data which need to allocate clusters
+ * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
+ * and operation.
+ *
+ * ocfs2_extent_tree becomes the first-class object for extent tree
+ * manipulation.  Callers of the alloc.c code need to fill it via one of
+ * the ocfs2_init_*_extent_tree() operations below.
+ *
+ * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
+ * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
+ * functions.  It needs the ocfs2_caching_info structure associated with
+ * I/O on the tree.  With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
+ * ocfs2_extent_tree_operations abstract the normal operations we do for
+ * the root of extent b-tree.
+ */
+struct ocfs2_extent_tree_operations;
+struct ocfs2_extent_tree {
+	struct ocfs2_extent_tree_operations	*et_ops;
+	struct buffer_head			*et_root_bh;
+	struct ocfs2_extent_list		*et_root_el;
+	struct ocfs2_caching_info		*et_ci;
+	ocfs2_journal_access_func		et_root_journal_access;
+	void					*et_object;
+	unsigned int				et_max_leaf_clusters;
+};
+
+/*
+ * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
+ * specified object buffer.
+ */
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				   struct ocfs2_caching_info *ci,
+				   struct buffer_head *bh);
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+				       struct ocfs2_caching_info *ci,
+				       struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					struct ocfs2_caching_info *ci,
+					struct ocfs2_xattr_value_buf *vb);
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+				    struct ocfs2_caching_info *ci,
+				    struct buffer_head *bh);
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *bh);
+
+/*
+ * Read an extent block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
+			    struct buffer_head **bh);
+
+struct ocfs2_alloc_context;
+int ocfs2_insert_extent(handle_t *handle,
+			struct ocfs2_extent_tree *et,
+			u32 cpos,
+			u64 start_blk,
+			u32 new_clusters,
+			u8 flags,
+			struct ocfs2_alloc_context *meta_ac);
+
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+				struct ocfs2_extent_tree *et,
+				u32 *logical_offset,
+				u32 clusters_to_add,
+				int mark_unwritten,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret);
+struct ocfs2_cached_dealloc_ctxt;
+struct ocfs2_path;
+int ocfs2_split_extent(handle_t *handle,
+		       struct ocfs2_extent_tree *et,
+		       struct ocfs2_path *path,
+		       int split_index,
+		       struct ocfs2_extent_rec *split_rec,
+		       struct ocfs2_alloc_context *meta_ac,
+		       struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
+			      handle_t *handle, u32 cpos, u32 len, u32 phys,
+			      struct ocfs2_alloc_context *meta_ac,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_change_extent_flag(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 len, u32 phys,
+			     struct ocfs2_alloc_context *meta_ac,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     int new_flags, int clear_flags);
+int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
+			u32 cpos, u32 len,
+			struct ocfs2_alloc_context *meta_ac,
+			struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len, int flags,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     u64 refcount_loc, bool refcount_tree_locked);
+
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct ocfs2_extent_tree *et);
+
+/*
+ * how many new metadata chunks would an allocation need at maximum?
+ *
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
+{
+	/*
+	 * Rather than do all the work of determining how much we need
+	 * (involves a ton of reads and locks), just ask for the
+	 * maximal limit.  That's a tree depth shift.  So, one block for
+	 * level of the tree (current l_tree_depth), one block for the
+	 * new tree_depth==0 extent_block, and one block at the new
+	 * top-of-the tree.
+	 */
+	return le16_to_cpu(root_el->l_tree_depth) + 2;
+}
+
+void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
+void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di);
+int ocfs2_convert_inline_data_to_extents(struct inode *inode,
+					 struct buffer_head *di_bh);
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb);
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel);
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy);
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy);
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+			      handle_t *handle,
+			      u64 start_blk,
+			      unsigned int num_clusters);
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+
+/*
+ * Process local structure which describes the block unlinks done
+ * during an operation. This is populated via
+ * ocfs2_cache_block_dealloc().
+ *
+ * ocfs2_run_deallocs() should be called after the potentially
+ * de-allocating routines. No journal handles should be open, and most
+ * locks should have been dropped.
+ */
+struct ocfs2_cached_dealloc_ctxt {
+	struct ocfs2_per_slot_free_list		*c_first_suballocator;
+	struct ocfs2_cached_block_free 		*c_global_allocator;
+};
+static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
+{
+	c->c_first_suballocator = NULL;
+	c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+				u64 blkno, unsigned int bit);
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+			      int type, int slot, u64 suballoc, u64 blkno,
+			      unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+	return c->c_global_allocator != NULL;
+}
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+		       struct ocfs2_cached_dealloc_ctxt *ctxt);
+
+struct ocfs2_truncate_context {
+	struct ocfs2_cached_dealloc_ctxt tc_dealloc;
+	int tc_ext_alloc_locked; /* is it cluster locked? */
+	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
+	struct buffer_head *tc_last_eb_bh;
+};
+
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+				  u64 range_start, u64 range_end);
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *di_bh);
+int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
+			  unsigned int start, unsigned int end, int trunc);
+
+int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
+		    struct ocfs2_extent_list *root_el, u32 cpos,
+		    struct buffer_head **leaf_bh);
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
+/*
+ * Helper function to look at the # of clusters in an extent record.
+ */
+static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
+					      struct ocfs2_extent_rec *rec)
+{
+	/*
+	 * Cluster count in extent records is slightly different
+	 * between interior nodes and leaf nodes. This is to support
+	 * unwritten extents which need a flags field in leaf node
+	 * records, thus shrinking the available space for a clusters
+	 * field.
+	 */
+	if (el->l_tree_depth)
+		return le32_to_cpu(rec->e_int_clusters);
+	else
+		return le16_to_cpu(rec->e_leaf_clusters);
+}
+
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+	return !rec->e_leaf_clusters;
+}
+
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num);
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys);
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+	struct buffer_head		*bh;
+	struct ocfs2_extent_list	*el;
+};
+
+#define OCFS2_MAX_PATH_DEPTH	5
+
+struct ocfs2_path {
+	int				p_tree_depth;
+	ocfs2_journal_access_func	p_root_access;
+	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
+};
+
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
+
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
+void ocfs2_free_path(struct ocfs2_path *path);
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+		    struct ocfs2_path *path,
+		    u32 cpos);
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
+int ocfs2_path_bh_journal_access(handle_t *handle,
+				 struct ocfs2_caching_info *ci,
+				 struct ocfs2_path *path,
+				 int idx);
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+			      handle_t *handle,
+			      struct ocfs2_path *path);
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+				   struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+				  struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+			    struct ocfs2_path *left,
+			    struct ocfs2_path *right);
+#endif /* OCFS2_ALLOC_H */
diff --git a/kernel/fs/ocfs2/aops.c b/kernel/fs/ocfs2/aops.c
new file mode 100644
index 000000000..f906a250d
--- /dev/null
+++ b/kernel/fs/ocfs2/aops.c
@@ -0,0 +1,2448 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+#include <linux/swap.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mpage.h>
+#include <linux/quotaops.h>
+#include <linux/blkdev.h>
+#include <linux/uio.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "suballoc.h"
+#include "super.h"
+#include "symlink.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+#include "dir.h"
+#include "namei.h"
+#include "sysfile.h"
+
+static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int status;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *buffer_cache_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	void *kaddr;
+
+	trace_ocfs2_symlink_get_block(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)iblock, bh_result, create);
+
+	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
+
+	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
+		     (unsigned long long)iblock);
+		goto bail;
+	}
+
+	status = ocfs2_read_inode_block(inode, &bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+						    le32_to_cpu(fe->i_clusters))) {
+		err = -ENOMEM;
+		mlog(ML_ERROR, "block offset is outside the allocated size: "
+		     "%llu\n", (unsigned long long)iblock);
+		goto bail;
+	}
+
+	/* We don't use the page cache to create symlink data, so if
+	 * need be, copy it over from the buffer cache. */
+	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
+		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
+			    iblock;
+		buffer_cache_bh = sb_getblk(osb->sb, blkno);
+		if (!buffer_cache_bh) {
+			err = -ENOMEM;
+			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
+			goto bail;
+		}
+
+		/* we haven't locked out transactions, so a commit
+		 * could've happened. Since we've got a reference on
+		 * the bh, even if it commits while we're doing the
+		 * copy, the data is still good. */
+		if (buffer_jbd(buffer_cache_bh)
+		    && ocfs2_inode_is_new(inode)) {
+			kaddr = kmap_atomic(bh_result->b_page);
+			if (!kaddr) {
+				mlog(ML_ERROR, "couldn't kmap!\n");
+				goto bail;
+			}
+			memcpy(kaddr + (bh_result->b_size * iblock),
+			       buffer_cache_bh->b_data,
+			       bh_result->b_size);
+			kunmap_atomic(kaddr);
+			set_buffer_uptodate(bh_result);
+		}
+		brelse(buffer_cache_bh);
+	}
+
+	map_bh(bh_result, inode->i_sb,
+	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
+
+	err = 0;
+
+bail:
+	brelse(bh);
+
+	return err;
+}
+
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
+{
+	int err = 0;
+	unsigned int ext_flags;
+	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
+	u64 p_blkno, count, past_eof;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			      (unsigned long long)iblock, bh_result, create);
+
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
+		     inode, inode->i_ino);
+
+	if (S_ISLNK(inode->i_mode)) {
+		/* this always does I/O for some reason. */
+		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
+		goto bail;
+	}
+
+	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
+					  &ext_flags);
+	if (err) {
+		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
+		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
+		     (unsigned long long)p_blkno);
+		goto bail;
+	}
+
+	if (max_blocks < count)
+		count = max_blocks;
+
+	/*
+	 * ocfs2 never allocates in this function - the only time we
+	 * need to use BH_New is when we're extending i_size on a file
+	 * system which doesn't support holes, in which case BH_New
+	 * allows __block_write_begin() to zero.
+	 *
+	 * If we see this on a sparse file system, then a truncate has
+	 * raced us and removed the cluster. In this case, we clear
+	 * the buffers dirty and uptodate bits and let the buffer code
+	 * ignore it as a hole.
+	 */
+	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
+		clear_buffer_dirty(bh_result);
+		clear_buffer_uptodate(bh_result);
+		goto bail;
+	}
+
+	/* Treat the unwritten extent as a hole for zeroing purposes. */
+	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		map_bh(bh_result, inode->i_sb, p_blkno);
+
+	bh_result->b_size = count << inode->i_blkbits;
+
+	if (!ocfs2_sparse_alloc(osb)) {
+		if (p_blkno == 0) {
+			err = -EIO;
+			mlog(ML_ERROR,
+			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
+			     (unsigned long long)iblock,
+			     (unsigned long long)p_blkno,
+			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
+			dump_stack();
+			goto bail;
+		}
+	}
+
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+
+	trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				  (unsigned long long)past_eof);
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
+
+bail:
+	if (err < 0)
+		err = -EIO;
+
+	return err;
+}
+
+int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+			   struct buffer_head *di_bh)
+{
+	void *kaddr;
+	loff_t size;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
+		ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		return -EROFS;
+	}
+
+	size = i_size_read(inode);
+
+	if (size > PAGE_CACHE_SIZE ||
+	    size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has with inline data has bad size: %Lu",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)size);
+		return -EROFS;
+	}
+
+	kaddr = kmap_atomic(page);
+	if (size)
+		memcpy(kaddr, di->id2.i_data.id_data, size);
+	/* Clear the remaining part of the page */
+	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_inline_data(inode, page, di_bh);
+out:
+	unlock_page(page);
+
+	brelse(di_bh);
+	return ret;
+}
+
+static int ocfs2_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	int ret, unlock = 1;
+
+	trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
+			     (page ? page->index : 0));
+
+	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+		/*
+		 * Unlock the page and cycle ip_alloc_sem so that we don't
+		 * busyloop waiting for ip_alloc_sem to unlock
+		 */
+		ret = AOP_TRUNCATED_PAGE;
+		unlock_page(page);
+		unlock = 0;
+		down_read(&oi->ip_alloc_sem);
+		up_read(&oi->ip_alloc_sem);
+		goto out_inode_unlock;
+	}
+
+	/*
+	 * i_size might have just been updated as we grabed the meta lock.  We
+	 * might now be discovering a truncate that hit on another node.
+	 * block_read_full_page->get_block freaks out if it is asked to read
+	 * beyond the end of a file, so we check here.  Callers
+	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
+	 * and notice that the page they just read isn't needed.
+	 *
+	 * XXX sys_readahead() seems to get that wrong?
+	 */
+	if (start >= i_size_read(inode)) {
+		zero_user(page, 0, PAGE_SIZE);
+		SetPageUptodate(page);
+		ret = 0;
+		goto out_alloc;
+	}
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		ret = ocfs2_readpage_inline(inode, page);
+	else
+		ret = block_read_full_page(page, ocfs2_get_block);
+	unlock = 0;
+
+out_alloc:
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+out_inode_unlock:
+	ocfs2_inode_unlock(inode, 0);
+out:
+	if (unlock)
+		unlock_page(page);
+	return ret;
+}
+
+/*
+ * This is used only for read-ahead. Failures or difficult to handle
+ * situations are safe to ignore.
+ *
+ * Right now, we don't bother with BH_Boundary - in-inode extent lists
+ * are quite large (243 extents on 4k blocks), so most inodes don't
+ * grow out to a tree. If need be, detecting boundary extents could
+ * trivially be added in a future version of ocfs2_get_block().
+ */
+static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	int ret, err = -EIO;
+	struct inode *inode = mapping->host;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	loff_t start;
+	struct page *last;
+
+	/*
+	 * Use the nonblocking flag for the dlm code to avoid page
+	 * lock inversion, but don't bother with retrying.
+	 */
+	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
+	if (ret)
+		return err;
+
+	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+		ocfs2_inode_unlock(inode, 0);
+		return err;
+	}
+
+	/*
+	 * Don't bother with inline-data. There isn't anything
+	 * to read-ahead in that case anyway...
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		goto out_unlock;
+
+	/*
+	 * Check whether a remote node truncated this file - we just
+	 * drop out in that case as it's not worth handling here.
+	 */
+	last = list_entry(pages->prev, struct page, lru);
+	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+	if (start >= i_size_read(inode))
+		goto out_unlock;
+
+	err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
+
+out_unlock:
+	up_read(&oi->ip_alloc_sem);
+	ocfs2_inode_unlock(inode, 0);
+
+	return err;
+}
+
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing.  It can't block on any cluster locks
+ * to during block mapping.  It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
+ */
+static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	trace_ocfs2_writepage(
+		(unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
+		page->index);
+
+	return block_write_full_page(page, ocfs2_get_block, wbc);
+}
+
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+int walk_page_buffers(	handle_t *handle,
+			struct buffer_head *head,
+			unsigned from,
+			unsigned to,
+			int *partial,
+			int (*fn)(	handle_t *handle,
+					struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t status;
+	u64 p_blkno = 0;
+	int err = 0;
+	struct inode *inode = mapping->host;
+
+	trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			 (unsigned long long)block);
+
+	/* We don't need to lock journal system files, since they aren't
+	 * accessed concurrently from multiple nodes.
+	 */
+	if (!INODE_JOURNAL(inode)) {
+		err = ocfs2_inode_lock(inode, NULL, 0);
+		if (err) {
+			if (err != -ENOENT)
+				mlog_errno(err);
+			goto bail;
+		}
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	}
+
+	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+		err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+						  NULL);
+
+	if (!INODE_JOURNAL(inode)) {
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		ocfs2_inode_unlock(inode, 0);
+	}
+
+	if (err) {
+		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
+		     (unsigned long long)block);
+		mlog_errno(err);
+		goto bail;
+	}
+
+bail:
+	status = err ? 0 : p_blkno;
+
+	return status;
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * 					fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+				     struct buffer_head *bh_result, int create)
+{
+	int ret;
+	u32 cpos = 0;
+	int alloc_locked = 0;
+	u64 p_blkno, inode_blocks, contig_blocks;
+	unsigned int ext_flags;
+	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
+	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+	unsigned long len = bh_result->b_size;
+	unsigned int clusters_to_alloc = 0;
+
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
+
+	/* This function won't even be called if the request isn't all
+	 * nicely aligned and of the right size, so there's no need
+	 * for us to check any of that. */
+
+	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+
+	/* This figures out the size of the next contiguous block, and
+	 * our logical offset */
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+					  &contig_blocks, &ext_flags);
+	if (ret) {
+		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+		     (unsigned long long)iblock);
+		ret = -EIO;
+		goto bail;
+	}
+
+	/* We should already CoW the refcounted extent in case of create. */
+	BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
+
+	/* allocate blocks if no p_blkno is found, and create == 1 */
+	if (!p_blkno && create) {
+		ret = ocfs2_inode_lock(inode, NULL, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+
+		alloc_locked = 1;
+
+		/* fill hole, allocate blocks can't be larger than the size
+		 * of the hole */
+		clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
+		if (clusters_to_alloc > contig_blocks)
+			clusters_to_alloc = contig_blocks;
+
+		/* allocate extent and insert them into the extent tree */
+		ret = ocfs2_extend_allocation(inode, cpos,
+				clusters_to_alloc, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+
+		ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+				&contig_blocks, &ext_flags);
+		if (ret < 0) {
+			mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+					(unsigned long long)iblock);
+			ret = -EIO;
+			goto bail;
+		}
+	}
+
+	/*
+	 * get_more_blocks() expects us to describe a hole by clearing
+	 * the mapped bit on bh_result().
+	 *
+	 * Consider an unwritten extent as a hole.
+	 */
+	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		map_bh(bh_result, inode->i_sb, p_blkno);
+	else
+		clear_buffer_mapped(bh_result);
+
+	/* make sure we don't map more than max_blocks blocks here as
+	   that's all the kernel will handle at this point. */
+	if (max_blocks < contig_blocks)
+		contig_blocks = max_blocks;
+	bh_result->b_size = contig_blocks << blocksize_bits;
+bail:
+	if (alloc_locked)
+		ocfs2_inode_unlock(inode, 1);
+	return ret;
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	int level;
+
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+	if (ocfs2_iocb_is_sem_locked(iocb))
+		ocfs2_iocb_clear_sem_locked(iocb);
+
+	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+		ocfs2_iocb_clear_unaligned_aio(iocb);
+
+		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+	}
+
+	ocfs2_iocb_clear_rw_locked(iocb);
+
+	level = ocfs2_iocb_rw_locked_level(iocb);
+	ocfs2_rw_unlock(inode, level);
+}
+
+static int ocfs2_releasepage(struct page *page, gfp_t wait)
+{
+	if (!page_has_buffers(page))
+		return 0;
+	return try_to_free_buffers(page);
+}
+
+static int ocfs2_is_overwrite(struct ocfs2_super *osb,
+		struct inode *inode, loff_t offset)
+{
+	int ret = 0;
+	u32 v_cpos = 0;
+	u32 p_cpos = 0;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+	ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+			&num_clusters, &ext_flags);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		return 1;
+
+	return 0;
+}
+
+static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
+		struct inode *inode, loff_t offset,
+		u64 zero_len, int cluster_align)
+{
+	u32 p_cpos = 0;
+	u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+	int ret = 0;
+
+	if (offset <= i_size_read(inode) || cluster_align)
+		return 0;
+
+	ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
+			&ext_flags);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+		u64 s = i_size_read(inode);
+		sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) +
+			(do_div(s, osb->s_clustersize) >> 9);
+
+		ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
+				zero_len >> 9, GFP_NOFS, false);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	return ret;
+}
+
+static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
+		struct inode *inode, loff_t offset)
+{
+	u64 zero_start, zero_len, total_zero_len;
+	u32 p_cpos = 0, clusters_to_add;
+	u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+	u32 size_div, offset_div;
+	int ret = 0;
+
+	{
+		u64 o = offset;
+		u64 s = i_size_read(inode);
+
+		offset_div = do_div(o, osb->s_clustersize);
+		size_div = do_div(s, osb->s_clustersize);
+	}
+
+	if (offset <= i_size_read(inode))
+		return 0;
+
+	clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
+		ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
+	total_zero_len = offset - i_size_read(inode);
+	if (clusters_to_add)
+		total_zero_len -= offset_div;
+
+	/* Allocate clusters to fill out holes, and this is only needed
+	 * when we add more than one clusters. Otherwise the cluster will
+	 * be allocated during direct IO */
+	if (clusters_to_add > 1) {
+		ret = ocfs2_extend_allocation(inode,
+				OCFS2_I(inode)->ip_clusters,
+				clusters_to_add - 1, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	while (total_zero_len) {
+		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
+				&ext_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
+			size_div;
+		zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
+			size_div;
+		zero_len = min(total_zero_len, zero_len);
+
+		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+			ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+					zero_start >> 9, zero_len >> 9,
+					GFP_NOFS, false);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		total_zero_len -= zero_len;
+		v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
+
+		/* Only at first iteration can be cluster not aligned.
+		 * So set size_div to 0 for the rest */
+		size_div = 0;
+	}
+
+out:
+	return ret;
+}
+
+static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
+		struct iov_iter *iter,
+		loff_t offset)
+{
+	ssize_t ret = 0;
+	ssize_t written = 0;
+	bool orphaned = false;
+	int is_overwrite = 0;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	size_t count = iter->count;
+	journal_t *journal = osb->journal->j_journal;
+	u64 zero_len_head, zero_len_tail;
+	int cluster_align_head, cluster_align_tail;
+	loff_t final_size = offset + count;
+	int append_write = offset >= i_size_read(inode) ? 1 : 0;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	{
+		u64 o = offset;
+		u64 s = i_size_read(inode);
+
+		zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
+		cluster_align_head = !zero_len_head;
+
+		zero_len_tail = osb->s_clustersize -
+			do_div(s, osb->s_clustersize);
+		if ((offset - i_size_read(inode)) < zero_len_tail)
+			zero_len_tail = offset - i_size_read(inode);
+		cluster_align_tail = !zero_len_tail;
+	}
+
+	/*
+	 * when final_size > inode->i_size, inode->i_size will be
+	 * updated after direct write, so add the inode to orphan
+	 * dir first.
+	 */
+	if (final_size > i_size_read(inode)) {
+		ret = ocfs2_add_inode_to_orphan(osb, inode);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		orphaned = true;
+	}
+
+	if (append_write) {
+		ret = ocfs2_inode_lock(inode, NULL, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto clean_orphan;
+		}
+
+		/* zeroing out the previously allocated cluster tail
+		 * that but not zeroed */
+		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+			ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
+					zero_len_tail, cluster_align_tail);
+		else
+			ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
+					offset);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_inode_unlock(inode, 1);
+			goto clean_orphan;
+		}
+
+		is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
+		if (is_overwrite < 0) {
+			mlog_errno(is_overwrite);
+			ocfs2_inode_unlock(inode, 1);
+			goto clean_orphan;
+		}
+
+		ocfs2_inode_unlock(inode, 1);
+	}
+
+	written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
+				       offset, ocfs2_direct_IO_get_blocks,
+				       ocfs2_dio_end_io, NULL, 0);
+	if (unlikely(written < 0)) {
+		loff_t i_size = i_size_read(inode);
+
+		if (offset + count > i_size) {
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto clean_orphan;
+			}
+
+			if (i_size == i_size_read(inode)) {
+				ret = ocfs2_truncate_file(inode, di_bh,
+						i_size);
+				if (ret < 0) {
+					if (ret != -ENOSPC)
+						mlog_errno(ret);
+
+					ocfs2_inode_unlock(inode, 1);
+					brelse(di_bh);
+					goto clean_orphan;
+				}
+			}
+
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+
+			ret = jbd2_journal_force_commit(journal);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+	} else if (written > 0 && append_write && !is_overwrite &&
+			!cluster_align_head) {
+		/* zeroing out the allocated cluster head */
+		u32 p_cpos = 0;
+		u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto clean_orphan;
+		}
+
+		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+				&num_clusters, &ext_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_inode_unlock(inode, 0);
+			goto clean_orphan;
+		}
+
+		BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
+
+		ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+				p_cpos << (osb->s_clustersize_bits - 9),
+				zero_len_head >> 9, GFP_NOFS, false);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		ocfs2_inode_unlock(inode, 0);
+	}
+
+clean_orphan:
+	if (orphaned) {
+		int tmp_ret;
+		int update_isize = written > 0 ? 1 : 0;
+		loff_t end = update_isize ? offset + written : 0;
+
+		tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+				update_isize, end);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			goto out;
+		}
+
+		tmp_ret = jbd2_journal_force_commit(journal);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			mlog_errno(tmp_ret);
+		}
+	}
+
+out:
+	if (ret >= 0)
+		ret = written;
+	return ret;
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			       loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+			OCFS2_MOUNT_COHERENCY_BUFFERED);
+
+	/*
+	 * Fallback to buffered I/O if we see an inode without
+	 * extents.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	/* Fallback to buffered I/O if we are appending and
+	 * concurrent O_DIRECT writes are allowed.
+	 */
+	if (i_size_read(inode) <= offset && !full_coherency)
+		return 0;
+
+	if (iov_iter_rw(iter) == READ)
+		return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+					    iter, offset,
+					    ocfs2_direct_IO_get_blocks,
+					    ocfs2_dio_end_io, NULL, 0);
+	else
+		return ocfs2_direct_IO_write(iocb, iter, offset);
+}
+
+static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
+					    u32 cpos,
+					    unsigned int *start,
+					    unsigned int *end)
+{
+	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+		unsigned int cpp;
+
+		cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+
+		cluster_start = cpos % cpp;
+		cluster_start = cluster_start << osb->s_clustersize_bits;
+
+		cluster_end = cluster_start + osb->s_clustersize;
+	}
+
+	BUG_ON(cluster_start > PAGE_SIZE);
+	BUG_ON(cluster_end > PAGE_SIZE);
+
+	if (start)
+		*start = cluster_start;
+	if (end)
+		*end = cluster_end;
+}
+
+/*
+ * 'from' and 'to' are the region in the page to avoid zeroing.
+ *
+ * If pagesize > clustersize, this function will avoid zeroing outside
+ * of the cluster boundary.
+ *
+ * from == to == 0 is code for "zero the entire cluster region"
+ */
+static void ocfs2_clear_page_regions(struct page *page,
+				     struct ocfs2_super *osb, u32 cpos,
+				     unsigned from, unsigned to)
+{
+	void *kaddr;
+	unsigned int cluster_start, cluster_end;
+
+	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
+
+	kaddr = kmap_atomic(page);
+
+	if (from || to) {
+		if (from > cluster_start)
+			memset(kaddr + cluster_start, 0, from - cluster_start);
+		if (to < cluster_end)
+			memset(kaddr + to, 0, cluster_end - to);
+	} else {
+		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
+	}
+
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Nonsparse file systems fully allocate before we get to the write
+ * code. This prevents ocfs2_write() from tagging the write as an
+ * allocating one, which means ocfs2_map_page_blocks() might try to
+ * read-in the blocks at the tail of our file. Avoid reading them by
+ * testing i_size against each block offset.
+ */
+static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+				 unsigned int block_start)
+{
+	u64 offset = page_offset(page) + block_start;
+
+	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		return 1;
+
+	if (i_size_read(inode) > offset)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Some of this taken from __block_write_begin(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+ *
+ * This will also skip zeroing, which is handled externally.
+ */
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+			  struct inode *inode, unsigned int from,
+			  unsigned int to, int new)
+{
+	int ret = 0;
+	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
+	unsigned int block_end, block_start;
+	unsigned int bsize = 1 << inode->i_blkbits;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, bsize, 0);
+
+	head = page_buffers(page);
+	for (bh = head, block_start = 0; bh != head || !block_start;
+	     bh = bh->b_this_page, block_start += bsize) {
+		block_end = block_start + bsize;
+
+		clear_buffer_new(bh);
+
+		/*
+		 * Ignore blocks outside of our i/o range -
+		 * they may belong to unallocated clusters.
+		 */
+		if (block_start >= to || block_end <= from) {
+			if (PageUptodate(page))
+				set_buffer_uptodate(bh);
+			continue;
+		}
+
+		/*
+		 * For an allocating write with cluster size >= page
+		 * size, we always write the entire page.
+		 */
+		if (new)
+			set_buffer_new(bh);
+
+		if (!buffer_mapped(bh)) {
+			map_bh(bh, inode->i_sb, *p_blkno);
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		}
+
+		if (PageUptodate(page)) {
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+			   !buffer_new(bh) &&
+			   ocfs2_should_read_blk(inode, page, block_start) &&
+			   (block_start < from || block_end > to)) {
+			ll_rw_block(READ, 1, &bh);
+			*wait_bh++=bh;
+		}
+
+		*p_blkno = *p_blkno + 1;
+	}
+
+	/*
+	 * If we issued read requests - let them complete.
+	 */
+	while(wait_bh > wait) {
+		wait_on_buffer(*--wait_bh);
+		if (!buffer_uptodate(*wait_bh))
+			ret = -EIO;
+	}
+
+	if (ret == 0 || !new)
+		return ret;
+
+	/*
+	 * If we get -EIO above, zero out any newly allocated blocks
+	 * to avoid exposing stale data.
+	 */
+	bh = head;
+	block_start = 0;
+	do {
+		block_end = block_start + bsize;
+		if (block_end <= from)
+			goto next_bh;
+		if (block_start >= to)
+			break;
+
+		zero_user(page, block_start, bh->b_size);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+
+next_bh:
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	return ret;
+}
+
+#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#define OCFS2_MAX_CTXT_PAGES	1
+#else
+#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#endif
+
+#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+
+/*
+ * Describe the state of a single cluster to be written to.
+ */
+struct ocfs2_write_cluster_desc {
+	u32		c_cpos;
+	u32		c_phys;
+	/*
+	 * Give this a unique field because c_phys eventually gets
+	 * filled.
+	 */
+	unsigned	c_new;
+	unsigned	c_unwritten;
+	unsigned	c_needs_zero;
+};
+
+struct ocfs2_write_ctxt {
+	/* Logical cluster position / len of write */
+	u32				w_cpos;
+	u32				w_clen;
+
+	/* First cluster allocated in a nonsparse extend */
+	u32				w_first_new_cpos;
+
+	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
+
+	/*
+	 * This is true if page_size > cluster_size.
+	 *
+	 * It triggers a set of special cases during write which might
+	 * have to deal with allocating writes to partial pages.
+	 */
+	unsigned int			w_large_pages;
+
+	/*
+	 * Pages involved in this write.
+	 *
+	 * w_target_page is the page being written to by the user.
+	 *
+	 * w_pages is an array of pages which always contains
+	 * w_target_page, and in the case of an allocating write with
+	 * page_size < cluster size, it will contain zero'd and mapped
+	 * pages adjacent to w_target_page which need to be written
+	 * out in so that future reads from that region will get
+	 * zero's.
+	 */
+	unsigned int			w_num_pages;
+	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
+	struct page			*w_target_page;
+
+	/*
+	 * w_target_locked is used for page_mkwrite path indicating no unlocking
+	 * against w_target_page in ocfs2_write_end_nolock.
+	 */
+	unsigned int			w_target_locked:1;
+
+	/*
+	 * ocfs2_write_end() uses this to know what the real range to
+	 * write in the target should be.
+	 */
+	unsigned int			w_target_from;
+	unsigned int			w_target_to;
+
+	/*
+	 * We could use journal_current_handle() but this is cleaner,
+	 * IMHO -Mark
+	 */
+	handle_t			*w_handle;
+
+	struct buffer_head		*w_di_bh;
+
+	struct ocfs2_cached_dealloc_ctxt w_dealloc;
+};
+
+void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
+{
+	int i;
+
+	for(i = 0; i < num_pages; i++) {
+		if (pages[i]) {
+			unlock_page(pages[i]);
+			mark_page_accessed(pages[i]);
+			page_cache_release(pages[i]);
+		}
+	}
+}
+
+static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
+{
+	int i;
+
+	/*
+	 * w_target_locked is only set to true in the page_mkwrite() case.
+	 * The intent is to allow us to lock the target page from write_begin()
+	 * to write_end(). The caller must hold a ref on w_target_page.
+	 */
+	if (wc->w_target_locked) {
+		BUG_ON(!wc->w_target_page);
+		for (i = 0; i < wc->w_num_pages; i++) {
+			if (wc->w_target_page == wc->w_pages[i]) {
+				wc->w_pages[i] = NULL;
+				break;
+			}
+		}
+		mark_page_accessed(wc->w_target_page);
+		page_cache_release(wc->w_target_page);
+	}
+	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
+}
+
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+	ocfs2_unlock_pages(wc);
+	brelse(wc->w_di_bh);
+	kfree(wc);
+}
+
+static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
+				  struct ocfs2_super *osb, loff_t pos,
+				  unsigned len, struct buffer_head *di_bh)
+{
+	u32 cend;
+	struct ocfs2_write_ctxt *wc;
+
+	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
+	if (!wc)
+		return -ENOMEM;
+
+	wc->w_cpos = pos >> osb->s_clustersize_bits;
+	wc->w_first_new_cpos = UINT_MAX;
+	cend = (pos + len - 1) >> osb->s_clustersize_bits;
+	wc->w_clen = cend - wc->w_cpos + 1;
+	get_bh(di_bh);
+	wc->w_di_bh = di_bh;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+		wc->w_large_pages = 1;
+	else
+		wc->w_large_pages = 0;
+
+	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+
+	*wcp = wc;
+
+	return 0;
+}
+
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, end;
+
+					start = max(from, block_start);
+					end = min(to, block_end);
+
+					zero_user_segment(page, start, end);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
+/*
+ * Only called when we have a failure during allocating write to write
+ * zero's to the newly allocated region.
+ */
+static void ocfs2_write_failure(struct inode *inode,
+				struct ocfs2_write_ctxt *wc,
+				loff_t user_pos, unsigned user_len)
+{
+	int i;
+	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+		to = user_pos + user_len;
+	struct page *tmppage;
+
+	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+
+	for(i = 0; i < wc->w_num_pages; i++) {
+		tmppage = wc->w_pages[i];
+
+		if (page_has_buffers(tmppage)) {
+			if (ocfs2_should_order_data(inode))
+				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+
+			block_commit_write(tmppage, from, to);
+		}
+	}
+}
+
+static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
+					struct ocfs2_write_ctxt *wc,
+					struct page *page, u32 cpos,
+					loff_t user_pos, unsigned user_len,
+					int new)
+{
+	int ret;
+	unsigned int map_from = 0, map_to = 0;
+	unsigned int cluster_start, cluster_end;
+	unsigned int user_data_from = 0, user_data_to = 0;
+
+	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
+					&cluster_start, &cluster_end);
+
+	/* treat the write as new if the a hole/lseek spanned across
+	 * the page boundary.
+	 */
+	new = new | ((i_size_read(inode) <= page_offset(page)) &&
+			(page_offset(page) <= user_pos));
+
+	if (page == wc->w_target_page) {
+		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+		map_to = map_from + user_len;
+
+		if (new)
+			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+						    cluster_start, cluster_end,
+						    new);
+		else
+			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+						    map_from, map_to, new);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		user_data_from = map_from;
+		user_data_to = map_to;
+		if (new) {
+			map_from = cluster_start;
+			map_to = cluster_end;
+		}
+	} else {
+		/*
+		 * If we haven't allocated the new page yet, we
+		 * shouldn't be writing it out without copying user
+		 * data. This is likely a math error from the caller.
+		 */
+		BUG_ON(!new);
+
+		map_from = cluster_start;
+		map_to = cluster_end;
+
+		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+					    cluster_start, cluster_end, new);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Parts of newly allocated pages need to be zero'd.
+	 *
+	 * Above, we have also rewritten 'to' and 'from' - as far as
+	 * the rest of the function is concerned, the entire cluster
+	 * range inside of a page needs to be written.
+	 *
+	 * We can skip this if the page is up to date - it's already
+	 * been zero'd from being read in as a hole.
+	 */
+	if (new && !PageUptodate(page))
+		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+					 cpos, user_data_from, user_data_to);
+
+	flush_dcache_page(page);
+
+out:
+	return ret;
+}
+
+/*
+ * This function will only grab one clusters worth of pages.
+ */
+static int ocfs2_grab_pages_for_write(struct address_space *mapping,
+				      struct ocfs2_write_ctxt *wc,
+				      u32 cpos, loff_t user_pos,
+				      unsigned user_len, int new,
+				      struct page *mmap_page)
+{
+	int ret = 0, i;
+	unsigned long start, target_index, end_index, index;
+	struct inode *inode = mapping->host;
+	loff_t last_byte;
+
+	target_index = user_pos >> PAGE_CACHE_SHIFT;
+
+	/*
+	 * Figure out how many pages we'll be manipulating here. For
+	 * non allocating write, we just change the one
+	 * page. Otherwise, we'll need a whole clusters worth.  If we're
+	 * writing past i_size, we only need enough pages to cover the
+	 * last page of the write.
+	 */
+	if (new) {
+		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
+		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+		/*
+		 * We need the index *past* the last page we could possibly
+		 * touch.  This is the page past the end of the write or
+		 * i_size, whichever is greater.
+		 */
+		last_byte = max(user_pos + user_len, i_size_read(inode));
+		BUG_ON(last_byte < 1);
+		end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+		if ((start + wc->w_num_pages) > end_index)
+			wc->w_num_pages = end_index - start;
+	} else {
+		wc->w_num_pages = 1;
+		start = target_index;
+	}
+
+	for(i = 0; i < wc->w_num_pages; i++) {
+		index = start + i;
+
+		if (index == target_index && mmap_page) {
+			/*
+			 * ocfs2_pagemkwrite() is a little different
+			 * and wants us to directly use the page
+			 * passed in.
+			 */
+			lock_page(mmap_page);
+
+			/* Exit and let the caller retry */
+			if (mmap_page->mapping != mapping) {
+				WARN_ON(mmap_page->mapping);
+				unlock_page(mmap_page);
+				ret = -EAGAIN;
+				goto out;
+			}
+
+			page_cache_get(mmap_page);
+			wc->w_pages[i] = mmap_page;
+			wc->w_target_locked = true;
+		} else {
+			wc->w_pages[i] = find_or_create_page(mapping, index,
+							     GFP_NOFS);
+			if (!wc->w_pages[i]) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+		wait_for_stable_page(wc->w_pages[i]);
+
+		if (index == target_index)
+			wc->w_target_page = wc->w_pages[i];
+	}
+out:
+	if (ret)
+		wc->w_target_locked = false;
+	return ret;
+}
+
+/*
+ * Prepare a single cluster for write one cluster into the file.
+ */
+static int ocfs2_write_cluster(struct address_space *mapping,
+			       u32 phys, unsigned int unwritten,
+			       unsigned int should_zero,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       struct ocfs2_write_ctxt *wc, u32 cpos,
+			       loff_t user_pos, unsigned user_len)
+{
+	int ret, i, new;
+	u64 v_blkno, p_blkno;
+	struct inode *inode = mapping->host;
+	struct ocfs2_extent_tree et;
+
+	new = phys == 0 ? 1 : 0;
+	if (new) {
+		u32 tmp_pos;
+
+		/*
+		 * This is safe to call with the page locks - it won't take
+		 * any additional semaphores or cluster locks.
+		 */
+		tmp_pos = cpos;
+		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
+					   &tmp_pos, 1, 0, wc->w_di_bh,
+					   wc->w_handle, data_ac,
+					   meta_ac, NULL);
+		/*
+		 * This shouldn't happen because we must have already
+		 * calculated the correct meta data allocation required. The
+		 * internal tree allocation code should know how to increase
+		 * transaction credits itself.
+		 *
+		 * If need be, we could handle -EAGAIN for a
+		 * RESTART_TRANS here.
+		 */
+		mlog_bug_on_msg(ret == -EAGAIN,
+				"Inode %llu: EAGAIN return during allocation.\n",
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else if (unwritten) {
+		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
+					      wc->w_di_bh);
+		ret = ocfs2_mark_extent_written(inode, &et,
+						wc->w_handle, cpos, 1, phys,
+						meta_ac, &wc->w_dealloc);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (should_zero)
+		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
+	else
+		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
+
+	/*
+	 * The only reason this should fail is due to an inability to
+	 * find the extent added.
+	 */
+	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
+					  NULL);
+	if (ret < 0) {
+		mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
+			    "at logical block %llu",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)v_blkno);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0);
+
+	for(i = 0; i < wc->w_num_pages; i++) {
+		int tmpret;
+
+		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
+						      wc->w_pages[i], cpos,
+						      user_pos, user_len,
+						      should_zero);
+		if (tmpret) {
+			mlog_errno(tmpret);
+			if (ret == 0)
+				ret = tmpret;
+		}
+	}
+
+	/*
+	 * We only have cleanup to do in case of allocating write.
+	 */
+	if (ret && new)
+		ocfs2_write_failure(inode, wc, user_pos, user_len);
+
+out:
+
+	return ret;
+}
+
+static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
+				       struct ocfs2_alloc_context *data_ac,
+				       struct ocfs2_alloc_context *meta_ac,
+				       struct ocfs2_write_ctxt *wc,
+				       loff_t pos, unsigned len)
+{
+	int ret, i;
+	loff_t cluster_off;
+	unsigned int local_len = len;
+	struct ocfs2_write_cluster_desc *desc;
+	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
+
+	for (i = 0; i < wc->w_clen; i++) {
+		desc = &wc->w_desc[i];
+
+		/*
+		 * We have to make sure that the total write passed in
+		 * doesn't extend past a single cluster.
+		 */
+		local_len = len;
+		cluster_off = pos & (osb->s_clustersize - 1);
+		if ((cluster_off + local_len) > osb->s_clustersize)
+			local_len = osb->s_clustersize - cluster_off;
+
+		ret = ocfs2_write_cluster(mapping, desc->c_phys,
+					  desc->c_unwritten,
+					  desc->c_needs_zero,
+					  data_ac, meta_ac,
+					  wc, desc->c_cpos, pos, local_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		len -= local_len;
+		pos += local_len;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * ocfs2_write_end() wants to know which parts of the target page it
+ * should complete the write on. It's easiest to compute them ahead of
+ * time when a more complete view of the write is available.
+ */
+static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
+					struct ocfs2_write_ctxt *wc,
+					loff_t pos, unsigned len, int alloc)
+{
+	struct ocfs2_write_cluster_desc *desc;
+
+	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+	wc->w_target_to = wc->w_target_from + len;
+
+	if (alloc == 0)
+		return;
+
+	/*
+	 * Allocating write - we may have different boundaries based
+	 * on page size and cluster size.
+	 *
+	 * NOTE: We can no longer compute one value from the other as
+	 * the actual write length and user provided length may be
+	 * different.
+	 */
+
+	if (wc->w_large_pages) {
+		/*
+		 * We only care about the 1st and last cluster within
+		 * our range and whether they should be zero'd or not. Either
+		 * value may be extended out to the start/end of a
+		 * newly allocated cluster.
+		 */
+		desc = &wc->w_desc[0];
+		if (desc->c_needs_zero)
+			ocfs2_figure_cluster_boundaries(osb,
+							desc->c_cpos,
+							&wc->w_target_from,
+							NULL);
+
+		desc = &wc->w_desc[wc->w_clen - 1];
+		if (desc->c_needs_zero)
+			ocfs2_figure_cluster_boundaries(osb,
+							desc->c_cpos,
+							NULL,
+							&wc->w_target_to);
+	} else {
+		wc->w_target_from = 0;
+		wc->w_target_to = PAGE_CACHE_SIZE;
+	}
+}
+
+/*
+ * Populate each single-cluster write descriptor in the write context
+ * with information about the i/o to be done.
+ *
+ * Returns the number of clusters that will have to be allocated, as
+ * well as a worst case estimate of the number of extent records that
+ * would have to be created during a write to an unwritten region.
+ */
+static int ocfs2_populate_write_desc(struct inode *inode,
+				     struct ocfs2_write_ctxt *wc,
+				     unsigned int *clusters_to_alloc,
+				     unsigned int *extents_to_split)
+{
+	int ret;
+	struct ocfs2_write_cluster_desc *desc;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+	u32 phys = 0;
+	int i;
+
+	*clusters_to_alloc = 0;
+	*extents_to_split = 0;
+
+	for (i = 0; i < wc->w_clen; i++) {
+		desc = &wc->w_desc[i];
+		desc->c_cpos = wc->w_cpos + i;
+
+		if (num_clusters == 0) {
+			/*
+			 * Need to look up the next extent record.
+			 */
+			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
+						 &num_clusters, &ext_flags);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			/* We should already CoW the refcountd extent. */
+			BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
+			/*
+			 * Assume worst case - that we're writing in
+			 * the middle of the extent.
+			 *
+			 * We can assume that the write proceeds from
+			 * left to right, in which case the extent
+			 * insert code is smart enough to coalesce the
+			 * next splits into the previous records created.
+			 */
+			if (ext_flags & OCFS2_EXT_UNWRITTEN)
+				*extents_to_split = *extents_to_split + 2;
+		} else if (phys) {
+			/*
+			 * Only increment phys if it doesn't describe
+			 * a hole.
+			 */
+			phys++;
+		}
+
+		/*
+		 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+		 * file that got extended.  w_first_new_cpos tells us
+		 * where the newly allocated clusters are so we can
+		 * zero them.
+		 */
+		if (desc->c_cpos >= wc->w_first_new_cpos) {
+			BUG_ON(phys == 0);
+			desc->c_needs_zero = 1;
+		}
+
+		desc->c_phys = phys;
+		if (phys == 0) {
+			desc->c_new = 1;
+			desc->c_needs_zero = 1;
+			*clusters_to_alloc = *clusters_to_alloc + 1;
+		}
+
+		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
+			desc->c_unwritten = 1;
+			desc->c_needs_zero = 1;
+		}
+
+		num_clusters--;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_write_begin_inline(struct address_space *mapping,
+				    struct inode *inode,
+				    struct ocfs2_write_ctxt *wc)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct page *page;
+	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	page = find_or_create_page(mapping, 0, GFP_NOFS);
+	if (!page) {
+		ocfs2_commit_trans(osb, handle);
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	/*
+	 * If we don't set w_num_pages then this page won't get unlocked
+	 * and freed on cleanup of the write context.
+	 */
+	wc->w_pages[0] = wc->w_target_page = page;
+	wc->w_num_pages = 1;
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		ocfs2_commit_trans(osb, handle);
+
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+		ocfs2_set_inode_data_inline(inode, di);
+
+	if (!PageUptodate(page)) {
+		ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
+		if (ret) {
+			ocfs2_commit_trans(osb, handle);
+
+			goto out;
+		}
+	}
+
+	wc->w_handle = handle;
+out:
+	return ret;
+}
+
+int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
+		return 1;
+	return 0;
+}
+
+static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
+					  struct inode *inode, loff_t pos,
+					  unsigned len, struct page *mmap_page,
+					  struct ocfs2_write_ctxt *wc)
+{
+	int ret, written = 0;
+	loff_t end = pos + len;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = NULL;
+
+	trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
+					     len, (unsigned long long)pos,
+					     oi->ip_dyn_features);
+
+	/*
+	 * Handle inodes which already have inline data 1st.
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		if (mmap_page == NULL &&
+		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
+			goto do_inline_write;
+
+		/*
+		 * The write won't fit - we have to give this inode an
+		 * inline extent list now.
+		 */
+		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Check whether the inode can accept inline data.
+	 */
+	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
+		return 0;
+
+	/*
+	 * Check whether the write can fit.
+	 */
+	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+	if (mmap_page ||
+	    end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
+		return 0;
+
+do_inline_write:
+	ret = ocfs2_write_begin_inline(mapping, inode, wc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * This signals to the caller that the data can be written
+	 * inline.
+	 */
+	written = 1;
+out:
+	return written ? written : ret;
+}
+
+/*
+ * This function only does anything for file systems which can't
+ * handle sparse files.
+ *
+ * What we want to do here is fill in any hole between the current end
+ * of allocation and the end of our write. That way the rest of the
+ * write path can treat it as an non-allocating write, which has no
+ * special case code for sparse/nonsparse files.
+ */
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+					struct buffer_head *di_bh,
+					loff_t pos, unsigned len,
+					struct ocfs2_write_ctxt *wc)
+{
+	int ret;
+	loff_t newsize = pos + len;
+
+	BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+
+	if (newsize <= i_size_read(inode))
+		return 0;
+
+	ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
+	if (ret)
+		mlog_errno(ret);
+
+	wc->w_first_new_cpos =
+		ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
+	return ret;
+}
+
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+			   loff_t pos)
+{
+	int ret = 0;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+	if (pos > i_size_read(inode))
+		ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+	return ret;
+}
+
+/*
+ * Try to flush truncate logs if we can free enough clusters from it.
+ * As for return value, "< 0" means error, "0" no space and "1" means
+ * we have freed enough spaces and let the caller try to allocate again.
+ */
+static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+					  unsigned int needed)
+{
+	tid_t target;
+	int ret = 0;
+	unsigned int truncated_clusters;
+
+	mutex_lock(&osb->osb_tl_inode->i_mutex);
+	truncated_clusters = osb->truncated_clusters;
+	mutex_unlock(&osb->osb_tl_inode->i_mutex);
+
+	/*
+	 * Check whether we can succeed in allocating if we free
+	 * the truncate log.
+	 */
+	if (truncated_clusters < needed)
+		goto out;
+
+	ret = ocfs2_flush_truncate_log(osb);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+		jbd2_log_wait_commit(osb->journal->j_journal, target);
+		ret = 1;
+	}
+out:
+	return ret;
+}
+
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata,
+			     struct buffer_head *di_bh, struct page *mmap_page)
+{
+	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
+	unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
+	struct ocfs2_write_ctxt *wc;
+	struct inode *inode = mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle;
+	struct ocfs2_extent_tree et;
+	int try_free = 1, ret1;
+
+try_again:
+	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (ocfs2_supports_inline_data(osb)) {
+		ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
+						     mmap_page, wc);
+		if (ret == 1) {
+			ret = 0;
+			goto success;
+		}
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (ocfs2_sparse_alloc(osb))
+		ret = ocfs2_zero_tail(inode, di_bh, pos);
+	else
+		ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+						   wc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_check_range_for_refcount(inode, pos, len);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	} else if (ret == 1) {
+		clusters_need = wc->w_clen;
+		ret = ocfs2_refcount_cow(inode, di_bh,
+					 wc->w_cpos, wc->w_clen, UINT_MAX);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
+					&extents_to_split);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	clusters_need += clusters_to_alloc;
+
+	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+	trace_ocfs2_write_begin_nolock(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(long long)i_size_read(inode),
+			le32_to_cpu(di->i_clusters),
+			pos, len, flags, mmap_page,
+			clusters_to_alloc, extents_to_split);
+
+	/*
+	 * We set w_target_from, w_target_to here so that
+	 * ocfs2_write_end() knows which range in the target page to
+	 * write out. An allocation requires that we write the entire
+	 * cluster range.
+	 */
+	if (clusters_to_alloc || extents_to_split) {
+		/*
+		 * XXX: We are stretching the limits of
+		 * ocfs2_lock_allocators(). It greatly over-estimates
+		 * the work to be done.
+		 */
+		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
+					      wc->w_di_bh);
+		ret = ocfs2_lock_allocators(inode, &et,
+					    clusters_to_alloc, extents_to_split,
+					    &data_ac, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (data_ac)
+			data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+		credits = ocfs2_calc_extend_credits(inode->i_sb,
+						    &di->id2.i_list);
+
+	}
+
+	/*
+	 * We have to zero sparse allocated clusters, unwritten extent clusters,
+	 * and non-sparse clusters we just extended.  For non-sparse writes,
+	 * we know zeros will only be needed in the first and/or last cluster.
+	 */
+	if (clusters_to_alloc || extents_to_split ||
+	    (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+			    wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+		cluster_of_pages = 1;
+	else
+		cluster_of_pages = 0;
+
+	ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	wc->w_handle = handle;
+
+	if (clusters_to_alloc) {
+		ret = dquot_alloc_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
+		if (ret)
+			goto out_commit;
+	}
+	/*
+	 * We don't want this to fail in ocfs2_write_end(), so do it
+	 * here.
+	 */
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_quota;
+	}
+
+	/*
+	 * Fill our page array first. That way we've grabbed enough so
+	 * that we can zero and flush if we error after adding the
+	 * extent.
+	 */
+	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
+					 cluster_of_pages, mmap_page);
+	if (ret && ret != -EAGAIN) {
+		mlog_errno(ret);
+		goto out_quota;
+	}
+
+	/*
+	 * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+	 * the target page. In this case, we exit with no error and no target
+	 * page. This will trigger the caller, page_mkwrite(), to re-try
+	 * the operation.
+	 */
+	if (ret == -EAGAIN) {
+		BUG_ON(wc->w_target_page);
+		ret = 0;
+		goto out_quota;
+	}
+
+	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
+					  len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_quota;
+	}
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+success:
+	*pagep = wc->w_target_page;
+	*fsdata = wc;
+	return 0;
+out_quota:
+	if (clusters_to_alloc)
+		dquot_free_space(inode,
+			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	ocfs2_free_write_ctxt(wc);
+
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+
+	if (ret == -ENOSPC && try_free) {
+		/*
+		 * Try to free some truncate log so that we can have enough
+		 * clusters to allocate.
+		 */
+		try_free = 0;
+
+		ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
+		if (ret1 == 1)
+			goto try_again;
+
+		if (ret1 < 0)
+			mlog_errno(ret1);
+	}
+
+	return ret;
+}
+
+static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = mapping->host;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * Take alloc sem here to prevent concurrent lookups. That way
+	 * the mapping, zeroing and tree manipulation within
+	 * ocfs2_write() will be safe against ->readpage(). This
+	 * should also serve to lock out allocation from a shared
+	 * writeable region.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
+				       fsdata, di_bh, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_fail;
+	}
+
+	brelse(di_bh);
+
+	return 0;
+
+out_fail:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	brelse(di_bh);
+	ocfs2_inode_unlock(inode, 1);
+
+	return ret;
+}
+
+static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
+				   unsigned len, unsigned *copied,
+				   struct ocfs2_dinode *di,
+				   struct ocfs2_write_ctxt *wc)
+{
+	void *kaddr;
+
+	if (unlikely(*copied < len)) {
+		if (!PageUptodate(wc->w_target_page)) {
+			*copied = 0;
+			return;
+		}
+	}
+
+	kaddr = kmap_atomic(wc->w_target_page);
+	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
+	kunmap_atomic(kaddr);
+
+	trace_ocfs2_write_end_inline(
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (unsigned long long)pos, *copied,
+	     le16_to_cpu(di->id2.i_data.id_count),
+	     le16_to_cpu(di->i_dyn_features));
+}
+
+int ocfs2_write_end_nolock(struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata)
+{
+	int i;
+	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+	struct inode *inode = mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_write_ctxt *wc = fsdata;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+	handle_t *handle = wc->w_handle;
+	struct page *tmppage;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
+		goto out_write_size;
+	}
+
+	if (unlikely(copied < len)) {
+		if (!PageUptodate(wc->w_target_page))
+			copied = 0;
+
+		ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
+				       start+len);
+	}
+	flush_dcache_page(wc->w_target_page);
+
+	for(i = 0; i < wc->w_num_pages; i++) {
+		tmppage = wc->w_pages[i];
+
+		if (tmppage == wc->w_target_page) {
+			from = wc->w_target_from;
+			to = wc->w_target_to;
+
+			BUG_ON(from > PAGE_CACHE_SIZE ||
+			       to > PAGE_CACHE_SIZE ||
+			       to < from);
+		} else {
+			/*
+			 * Pages adjacent to the target (if any) imply
+			 * a hole-filling write in which case we want
+			 * to flush their entire range.
+			 */
+			from = 0;
+			to = PAGE_CACHE_SIZE;
+		}
+
+		if (page_has_buffers(tmppage)) {
+			if (ocfs2_should_order_data(inode))
+				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+			block_commit_write(tmppage, from, to);
+		}
+	}
+
+out_write_size:
+	pos += copied;
+	if (pos > i_size_read(inode)) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+	ocfs2_journal_dirty(handle, wc->w_di_bh);
+
+	/* unlock pages before dealloc since it needs acquiring j_trans_barrier
+	 * lock, or it will cause a deadlock since journal commit threads holds
+	 * this lock and will ask for the page lock when flushing the data.
+	 * put it here to preserve the unlock order.
+	 */
+	ocfs2_unlock_pages(wc);
+
+	ocfs2_commit_trans(osb, handle);
+
+	ocfs2_run_deallocs(osb, &wc->w_dealloc);
+
+	brelse(wc->w_di_bh);
+	kfree(wc);
+
+	return copied;
+}
+
+static int ocfs2_write_end(struct file *file, struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata)
+{
+	int ret;
+	struct inode *inode = mapping->host;
+
+	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_inode_unlock(inode, 1);
+
+	return ret;
+}
+
+const struct address_space_operations ocfs2_aops = {
+	.readpage		= ocfs2_readpage,
+	.readpages		= ocfs2_readpages,
+	.writepage		= ocfs2_writepage,
+	.write_begin		= ocfs2_write_begin,
+	.write_end		= ocfs2_write_end,
+	.bmap			= ocfs2_bmap,
+	.direct_IO		= ocfs2_direct_IO,
+	.invalidatepage		= block_invalidatepage,
+	.releasepage		= ocfs2_releasepage,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate	= block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
diff --git a/kernel/fs/ocfs2/aops.h b/kernel/fs/ocfs2/aops.h
new file mode 100644
index 000000000..dd59599b0
--- /dev/null
+++ b/kernel/fs/ocfs2/aops.h
@@ -0,0 +1,105 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+#include <linux/fs.h>
+
+handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to);
+
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+			  struct inode *inode, unsigned int from,
+			  unsigned int to, int new);
+
+void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages);
+
+int walk_page_buffers(	handle_t *handle,
+			struct buffer_head *head,
+			unsigned from,
+			unsigned to,
+			int *partial,
+			int (*fn)(	handle_t *handle,
+					struct buffer_head *bh));
+
+int ocfs2_write_end_nolock(struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata);
+
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata,
+			     struct buffer_head *di_bh, struct page *mmap_page);
+
+int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+			   struct buffer_head *di_bh);
+int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
+
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create);
+/* all ocfs2_dio_end_io()'s fault */
+#define ocfs2_iocb_is_rw_locked(iocb) \
+	test_bit(0, (unsigned long *)&iocb->private)
+static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
+{
+	set_bit(0, (unsigned long *)&iocb->private);
+	if (level)
+		set_bit(1, (unsigned long *)&iocb->private);
+	else
+		clear_bit(1, (unsigned long *)&iocb->private);
+}
+
+/*
+ * Using a named enum representing lock types in terms of #N bit stored in
+ * iocb->private, which is going to be used for communication between
+ * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ */
+enum ocfs2_iocb_lock_bits {
+	OCFS2_IOCB_RW_LOCK = 0,
+	OCFS2_IOCB_RW_LOCK_LEVEL,
+	OCFS2_IOCB_SEM,
+	OCFS2_IOCB_UNALIGNED_IO,
+	OCFS2_IOCB_NUM_LOCKS
+};
+
+#define ocfs2_iocb_clear_rw_locked(iocb) \
+	clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_rw_locked_level(iocb) \
+	test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_sem_locked(iocb) \
+	set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_sem_locked(iocb) \
+	clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_sem_locked(iocb) \
+	test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+	set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+	clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+	test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
+#endif /* OCFS2_FILE_H */
diff --git a/kernel/fs/ocfs2/blockcheck.c b/kernel/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000..0725e6054
--- /dev/null
+++ b/kernel/fs/ocfs2/blockcheck.c
@@ -0,0 +1,647 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <asm/byteorder.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "blockcheck.h"
+
+
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+
+
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer.  Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offset by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based.  This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example.  Take bit 1 of the data buffer.  1 is a power of two (2^0),
+ * so it's a parity bit.  2 is a power of two (2^1), so it's a parity bit.
+ * 3 is not a power of two.  So bit 1 of the data buffer ends up as bit 3
+ * in the code buffer.
+ *
+ * The caller can pass in *p if it wants to keep track of the most recent
+ * number of parity bits added.  This allows the function to start the
+ * calculation at the last place.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+	unsigned int b, p = 0;
+
+	/*
+	 * Data bits are 0-based, but we're talking code bits, which
+	 * are 1-based.
+	 */
+	b = i + 1;
+
+	/* Use the cache if it is there */
+	if (p_cache)
+		p = *p_cache;
+        b += p;
+
+	/*
+	 * For every power of two below our bit number, bump our bit.
+	 *
+	 * We compare with (b + 1) because we have to compare with what b
+	 * would be _if_ it were bumped up by the parity bit.  Capice?
+	 *
+	 * p is set above.
+	 */
+	for (; (1 << p) < (b + 1); p++)
+		b++;
+
+	if (p_cache)
+		*p_cache = p;
+
+	return b;
+}
+
+/*
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+	unsigned int i, b, p = 0;
+
+	BUG_ON(!d);
+
+	/*
+	 * b is the hamming code bit number.  Hamming code specifies a
+	 * 1-based array, but C uses 0-based.  So 'i' is for C, and 'b' is
+	 * for the algorithm.
+	 *
+	 * The i++ in the for loop is so that the start offset passed
+	 * to ocfs2_find_next_bit_set() is one greater than the previously
+	 * found bit.
+	 */
+	for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+	{
+		/*
+		 * i is the offset in this hunk, nr + i is the total bit
+		 * offset.
+		 */
+		b = calc_code_bit(nr + i, &p);
+
+		/*
+		 * Data bits in the resultant code are checked by
+		 * parity bits that are part of the bit number
+		 * representation.  Huh?
+		 *
+		 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+		 * In other words, the parity bit at position 2^k
+		 * checks bits in positions having bit k set in
+		 * their binary representation.  Conversely, for
+		 * instance, bit 13, i.e. 1101(2), is checked by
+		 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+		 * </wikipedia>
+		 *
+		 * Note that 'k' is the _code_ bit number.  'b' in
+		 * our loop.
+		 */
+		parity ^= b;
+	}
+
+	/* While the data buffer was treated as little endian, the
+	 * return value is in host endian. */
+	return parity;
+}
+
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+	return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+		       unsigned int fix)
+{
+	unsigned int i, b;
+
+	BUG_ON(!d);
+
+	/*
+	 * If the bit to fix has an hweight of 1, it's a parity bit.  One
+	 * busted parity bit is its own error.  Nothing to do here.
+	 */
+	if (hweight32(fix) == 1)
+		return;
+
+	/*
+	 * nr + d is the bit right past the data hunk we're looking at.
+	 * If fix after that, nothing to do
+	 */
+	if (fix >= calc_code_bit(nr + d, NULL))
+		return;
+
+	/*
+	 * nr is the offset in the data hunk we're starting at.  Let's
+	 * start b at the offset in the code buffer.  See hamming_encode()
+	 * for a more detailed description of 'b'.
+	 */
+	b = calc_code_bit(nr, NULL);
+	/* If the fix is before this hunk, nothing to do */
+	if (fix < b)
+		return;
+
+	for (i = 0; i < d; i++, b++)
+	{
+		/* Skip past parity bits */
+		while (hweight32(b) == 1)
+			b++;
+
+		/*
+		 * i is the offset in this data hunk.
+		 * nr + i is the offset in the total data buffer.
+		 * b is the offset in the total code buffer.
+		 *
+		 * Thus, when b == fix, bit i in the current hunk needs
+		 * fixing.
+		 */
+		if (b == fix)
+		{
+			if (ocfs2_test_bit(i, data))
+				ocfs2_clear_bit(i, data);
+			else
+				ocfs2_set_bit(i, data);
+			break;
+		}
+	}
+}
+
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+			     unsigned int fix)
+{
+	ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+
+
+/*
+ * Debugfs handling.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+static int blockcheck_u64_get(void *data, u64 *val)
+{
+	*val = *(u64 *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
+
+static struct dentry *blockcheck_debugfs_create(const char *name,
+						struct dentry *parent,
+						u64 *value)
+{
+	return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
+				   &blockcheck_fops);
+}
+
+static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+	if (stats) {
+		debugfs_remove(stats->b_debug_check);
+		stats->b_debug_check = NULL;
+		debugfs_remove(stats->b_debug_failure);
+		stats->b_debug_failure = NULL;
+		debugfs_remove(stats->b_debug_recover);
+		stats->b_debug_recover = NULL;
+		debugfs_remove(stats->b_debug_dir);
+		stats->b_debug_dir = NULL;
+	}
+}
+
+static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+					  struct dentry *parent)
+{
+	int rc = -EINVAL;
+
+	if (!stats)
+		goto out;
+
+	stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+	if (!stats->b_debug_dir)
+		goto out;
+
+	stats->b_debug_check =
+		blockcheck_debugfs_create("blocks_checked",
+					  stats->b_debug_dir,
+					  &stats->b_check_count);
+
+	stats->b_debug_failure =
+		blockcheck_debugfs_create("checksums_failed",
+					  stats->b_debug_dir,
+					  &stats->b_failure_count);
+
+	stats->b_debug_recover =
+		blockcheck_debugfs_create("ecc_recoveries",
+					  stats->b_debug_dir,
+					  &stats->b_recover_count);
+	if (stats->b_debug_check && stats->b_debug_failure &&
+	    stats->b_debug_recover)
+		rc = 0;
+
+out:
+	if (rc)
+		ocfs2_blockcheck_debug_remove(stats);
+	return rc;
+}
+#else
+static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+						 struct dentry *parent)
+{
+	return 0;
+}
+
+static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+/* Always-called wrappers for starting and stopping the debugfs files */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent)
+{
+	return ocfs2_blockcheck_debug_install(stats, parent);
+}
+
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
+{
+	ocfs2_blockcheck_debug_remove(stats);
+}
+
+static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_check_count++;
+	new_count = stats->b_check_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "Block check count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_failure_count++;
+	new_count = stats->b_failure_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "Checksum failure count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_recover_count++;
+	new_count = stats->b_recover_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "ECC recovery count has wrapped\n");
+}
+
+
+
+/*
+ * These are the low-level APIs for using the ocfs2_block_check structure.
+ */
+
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc)
+{
+	u32 crc;
+	u32 ecc;
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	crc = crc32_le(~0, data, blocksize);
+	ecc = ocfs2_hamming_encode_block(data, blocksize);
+
+	/*
+	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+	 * larger than 16 bits.
+	 */
+	BUG_ON(ecc > USHRT_MAX);
+
+	bc->bc_crc32e = cpu_to_le32(crc);
+	bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information.  Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc,
+			       struct ocfs2_blockcheck_stats *stats)
+{
+	int rc = 0;
+	u32 bc_crc32e;
+	u16 bc_ecc;
+	u32 crc, ecc;
+
+	ocfs2_blockcheck_inc_check(stats);
+
+	bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+	bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	/* Fast path - if the crc32 validates, we're good to go */
+	crc = crc32_le(~0, data, blocksize);
+	if (crc == bc_crc32e)
+		goto out;
+
+	ocfs2_blockcheck_inc_failure(stats);
+	mlog(ML_ERROR,
+	     "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
+	     (unsigned int)bc_crc32e, (unsigned int)crc);
+
+	/* Ok, try ECC fixups */
+	ecc = ocfs2_hamming_encode_block(data, blocksize);
+	ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
+
+	/* And check the crc32 again */
+	crc = crc32_le(~0, data, blocksize);
+	if (crc == bc_crc32e) {
+		ocfs2_blockcheck_inc_recover(stats);
+		goto out;
+	}
+
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
+	     (unsigned int)bc_crc32e, (unsigned int)crc);
+
+	rc = -EIO;
+
+out:
+	bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+	bc->bc_ecc = cpu_to_le16(bc_ecc);
+
+	return rc;
+}
+
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc)
+{
+	int i;
+	u32 crc, ecc;
+
+	BUG_ON(nr < 0);
+
+	if (!nr)
+		return;
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+		/*
+		 * The number of bits in a buffer is obviously b_size*8.
+		 * The offset of this buffer is b_size*i, so the bit offset
+		 * of this buffer is b_size*8*i.
+		 */
+		ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+						bhs[i]->b_size * 8,
+						bhs[i]->b_size * 8 * i);
+	}
+
+	/*
+	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+	 * larger than 16 bits.
+	 */
+	BUG_ON(ecc > USHRT_MAX);
+
+	bc->bc_crc32e = cpu_to_le32(crc);
+	bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads.  Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes.  If bc is not a pointer
+ * inside data, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc,
+				   struct ocfs2_blockcheck_stats *stats)
+{
+	int i, rc = 0;
+	u32 bc_crc32e;
+	u16 bc_ecc;
+	u32 crc, ecc, fix;
+
+	BUG_ON(nr < 0);
+
+	if (!nr)
+		return 0;
+
+	ocfs2_blockcheck_inc_check(stats);
+
+	bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+	bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	/* Fast path - if the crc32 validates, we're good to go */
+	for (i = 0, crc = ~0; i < nr; i++)
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+	if (crc == bc_crc32e)
+		goto out;
+
+	ocfs2_blockcheck_inc_failure(stats);
+	mlog(ML_ERROR,
+	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+	     (unsigned int)bc_crc32e, (unsigned int)crc);
+
+	/* Ok, try ECC fixups */
+	for (i = 0, ecc = 0; i < nr; i++) {
+		/*
+		 * The number of bits in a buffer is obviously b_size*8.
+		 * The offset of this buffer is b_size*i, so the bit offset
+		 * of this buffer is b_size*8*i.
+		 */
+		ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+						bhs[i]->b_size * 8,
+						bhs[i]->b_size * 8 * i);
+	}
+	fix = ecc ^ bc_ecc;
+	for (i = 0; i < nr; i++) {
+		/*
+		 * Try the fix against each buffer.  It will only affect
+		 * one of them.
+		 */
+		ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+				  bhs[i]->b_size * 8 * i, fix);
+	}
+
+	/* And check the crc32 again */
+	for (i = 0, crc = ~0; i < nr; i++)
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+	if (crc == bc_crc32e) {
+		ocfs2_blockcheck_inc_recover(stats);
+		goto out;
+	}
+
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	     (unsigned int)bc_crc32e, (unsigned int)crc);
+
+	rc = -EIO;
+
+out:
+	bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+	bc->bc_ecc = cpu_to_le16(bc_ecc);
+
+	return rc;
+}
+
+/*
+ * These are the main API.  They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc)
+{
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (ocfs2_meta_ecc(osb))
+		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc,
+						&osb->osb_ecc_stats);
+
+	return rc;
+}
+
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc)
+{
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (ocfs2_meta_ecc(osb))
+		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc,
+						    &osb->osb_ecc_stats);
+
+	return rc;
+}
+
diff --git a/kernel/fs/ocfs2/blockcheck.h b/kernel/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000..d4b69febf
--- /dev/null
+++ b/kernel/fs/ocfs2/blockcheck.h
@@ -0,0 +1,107 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+
+
+/* Count errors and error correction from blockcheck.c */
+struct ocfs2_blockcheck_stats {
+	spinlock_t b_lock;
+	u64 b_check_count;	/* Number of blocks we've checked */
+	u64 b_failure_count;	/* Number of failed checksums */
+	u64 b_recover_count;	/* Number of blocks fixed by ecc */
+
+	/*
+	 * debugfs entries, used if this is passed to
+	 * ocfs2_blockcheck_stats_debugfs_install()
+	 */
+	struct dentry *b_debug_dir;	/* Parent of the debugfs  files */
+	struct dentry *b_debug_check;	/* Exposes b_check_count */
+	struct dentry *b_debug_failure;	/* Exposes b_failure_count */
+	struct dentry *b_debug_recover;	/* Exposes b_recover_count */
+};
+
+
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc);
+
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc,
+			       struct ocfs2_blockcheck_stats *stats);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc,
+				   struct ocfs2_blockcheck_stats *stats);
+
+/* Debug Initialization */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
+
+/*
+ * Hamming code functions
+ */
+
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+			 unsigned int nr);
+/*
+ * Fix a buffer with a bit error.  The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+		       unsigned int fix);
+
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+				    unsigned int fix);
+#endif
diff --git a/kernel/fs/ocfs2/buffer_head_io.c b/kernel/fs/ocfs2/buffer_head_io.c
new file mode 100644
index 000000000..1edcb141f
--- /dev/null
+++ b/kernel/fs/ocfs2/buffer_head_io.c
@@ -0,0 +1,427 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * io.c
+ *
+ * Buffer cache handling
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "ocfs2_trace.h"
+
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+	BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
+int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
+		      struct ocfs2_caching_info *ci)
+{
+	int ret = 0;
+
+	trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
+
+	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
+	BUG_ON(buffer_jbd(bh));
+
+	/* No need to check for a soft readonly file system here. non
+	 * journalled writes are only ever done on system files which
+	 * can get modified during recovery even if read-only. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_metadata_cache_io_lock(ci);
+
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+
+	get_bh(bh); /* for end_buffer_write_sync() */
+	bh->b_end_io = end_buffer_write_sync;
+	submit_bh(WRITE, bh);
+
+	wait_on_buffer(bh);
+
+	if (buffer_uptodate(bh)) {
+		ocfs2_set_buffer_uptodate(ci, bh);
+	} else {
+		/* We don't need to remove the clustered uptodate
+		 * information for this bh as it's not marked locally
+		 * uptodate. */
+		ret = -EIO;
+		mlog_errno(ret);
+	}
+
+	ocfs2_metadata_cache_io_unlock(ci);
+out:
+	return ret;
+}
+
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+			   unsigned int nr, struct buffer_head *bhs[])
+{
+	int status = 0;
+	unsigned int i;
+	struct buffer_head *bh;
+
+	trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
+
+	if (!nr)
+		goto bail;
+
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(osb->sb, block++);
+			if (bhs[i] == NULL) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+
+		if (buffer_jbd(bh)) {
+			trace_ocfs2_read_blocks_sync_jbd(
+					(unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (buffer_dirty(bh)) {
+			/* This should probably be a BUG, or
+			 * at least return an error. */
+			mlog(ML_ERROR,
+			     "trying to sync read a dirty "
+			     "buffer! (blocknr = %llu), skipping\n",
+			     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		lock_buffer(bh);
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "block %llu had the JBD bit set "
+			     "while I was in lock_buffer!",
+			     (unsigned long long)bh->b_blocknr);
+			BUG();
+		}
+
+		clear_buffer_uptodate(bh);
+		get_bh(bh); /* for end_buffer_read_sync() */
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, bh);
+	}
+
+	for (i = nr; i > 0; i--) {
+		bh = bhs[i - 1];
+
+		/* No need to wait on the buffer if it's managed by JBD. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);
+
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. */
+			status = -EIO;
+			put_bh(bh);
+			bhs[i - 1] = NULL;
+		}
+	}
+
+bail:
+	return status;
+}
+
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      int (*validate)(struct super_block *sb,
+				      struct buffer_head *bh))
+{
+	int status = 0;
+	int i, ignore_cache = 0;
+	struct buffer_head *bh;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+
+	trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
+
+	BUG_ON(!ci);
+	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
+	       (flags & OCFS2_BH_IGNORE_CACHE));
+
+	if (bhs == NULL) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr < 0) {
+		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr == 0) {
+		status = 0;
+		goto bail;
+	}
+
+	ocfs2_metadata_cache_io_lock(ci);
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(sb, block++);
+			if (bhs[i] == NULL) {
+				ocfs2_metadata_cache_io_unlock(ci);
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+		ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
+
+		/* There are three read-ahead cases here which we need to
+		 * be concerned with. All three assume a buffer has
+		 * previously been submitted with OCFS2_BH_READAHEAD
+		 * and it hasn't yet completed I/O.
+		 *
+		 * 1) The current request is sync to disk. This rarely
+		 *    happens these days, and never when performance
+		 *    matters - the code can just wait on the buffer
+		 *    lock and re-submit.
+		 *
+		 * 2) The current request is cached, but not
+		 *    readahead. ocfs2_buffer_uptodate() will return
+		 *    false anyway, so we'll wind up waiting on the
+		 *    buffer lock to do I/O. We re-check the request
+		 *    with after getting the lock to avoid a re-submit.
+		 *
+		 * 3) The current request is readahead (and so must
+		 *    also be a caching one). We short circuit if the
+		 *    buffer is locked (under I/O) and if it's in the
+		 *    uptodate cache. The re-check from #2 catches the
+		 *    case that the previous read-ahead completes just
+		 *    before our is-it-in-flight check.
+		 */
+
+		if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
+			trace_ocfs2_read_blocks_from_disk(
+			     (unsigned long long)bh->b_blocknr,
+			     (unsigned long long)ocfs2_metadata_cache_owner(ci));
+			/* We're using ignore_cache here to say
+			 * "go to disk" */
+			ignore_cache = 1;
+		}
+
+		trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
+			ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
+
+		if (buffer_jbd(bh)) {
+			continue;
+		}
+
+		if (ignore_cache) {
+			if (buffer_dirty(bh)) {
+				/* This should probably be a BUG, or
+				 * at least return an error. */
+				continue;
+			}
+
+			/* A read-ahead request was made - if the
+			 * buffer is already under read-ahead from a
+			 * previously submitted request than we are
+			 * done here. */
+			if ((flags & OCFS2_BH_READAHEAD)
+			    && ocfs2_buffer_read_ahead(ci, bh))
+				continue;
+
+			lock_buffer(bh);
+			if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
+				mlog(ML_ERROR, "block %llu had the JBD bit set "
+					       "while I was in lock_buffer!",
+				     (unsigned long long)bh->b_blocknr);
+				BUG();
+#else
+				unlock_buffer(bh);
+				continue;
+#endif
+			}
+
+			/* Re-check ocfs2_buffer_uptodate() as a
+			 * previously read-ahead buffer may have
+			 * completed I/O while we were waiting for the
+			 * buffer lock. */
+			if (!(flags & OCFS2_BH_IGNORE_CACHE)
+			    && !(flags & OCFS2_BH_READAHEAD)
+			    && ocfs2_buffer_uptodate(ci, bh)) {
+				unlock_buffer(bh);
+				continue;
+			}
+
+			clear_buffer_uptodate(bh);
+			get_bh(bh); /* for end_buffer_read_sync() */
+			if (validate)
+				set_buffer_needs_validate(bh);
+			bh->b_end_io = end_buffer_read_sync;
+			submit_bh(READ, bh);
+			continue;
+		}
+	}
+
+	status = 0;
+
+	for (i = (nr - 1); i >= 0; i--) {
+		bh = bhs[i];
+
+		if (!(flags & OCFS2_BH_READAHEAD)) {
+			/* We know this can't have changed as we hold the
+			 * owner sem. Avoid doing any work on the bh if the
+			 * journal has it. */
+			if (!buffer_jbd(bh))
+				wait_on_buffer(bh);
+
+			if (!buffer_uptodate(bh)) {
+				/* Status won't be cleared from here on out,
+				 * so we can safely record this and loop back
+				 * to cleanup the other buffers. Don't need to
+				 * remove the clustered uptodate information
+				 * for this bh as it's not marked locally
+				 * uptodate. */
+				status = -EIO;
+				put_bh(bh);
+				bhs[i] = NULL;
+				continue;
+			}
+
+			if (buffer_needs_validate(bh)) {
+				/* We never set NeedsValidate if the
+				 * buffer was held by the journal, so
+				 * that better not have changed */
+				BUG_ON(buffer_jbd(bh));
+				clear_buffer_needs_validate(bh);
+				status = validate(sb, bh);
+				if (status) {
+					put_bh(bh);
+					bhs[i] = NULL;
+					continue;
+				}
+			}
+		}
+
+		/* Always set the buffer in the cache, even if it was
+		 * a forced read, or read-ahead which hasn't yet
+		 * completed. */
+		ocfs2_set_buffer_uptodate(ci, bh);
+	}
+	ocfs2_metadata_cache_io_unlock(ci);
+
+	trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
+				    flags, ignore_cache);
+
+bail:
+
+	return status;
+}
+
+/* Check whether the blkno is the super block or one of the backups. */
+static void ocfs2_check_super_or_backup(struct super_block *sb,
+					sector_t blkno)
+{
+	int i;
+	u64 backup_blkno;
+
+	if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
+		return;
+
+	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+		backup_blkno = ocfs2_backup_super_blkno(sb, i);
+		if (backup_blkno == blkno)
+			return;
+	}
+
+	BUG();
+}
+
+/*
+ * Write super block and backups doesn't need to collaborate with journal,
+ * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed
+ * into this function.
+ */
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+				struct buffer_head *bh)
+{
+	int ret = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	BUG_ON(buffer_jbd(bh));
+	ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
+		ret = -EROFS;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+
+	get_bh(bh); /* for end_buffer_write_sync() */
+	bh->b_end_io = end_buffer_write_sync;
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
+	submit_bh(WRITE, bh);
+
+	wait_on_buffer(bh);
+
+	if (!buffer_uptodate(bh)) {
+		ret = -EIO;
+		mlog_errno(ret);
+	}
+
+out:
+	return ret;
+}
diff --git a/kernel/fs/ocfs2/buffer_head_io.h b/kernel/fs/ocfs2/buffer_head_io.h
new file mode 100644
index 000000000..b97bcc6dd
--- /dev/null
+++ b/kernel/fs/ocfs2/buffer_head_io.h
@@ -0,0 +1,77 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_buffer_head.h
+ *
+ * Buffer cache handling functions defined
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_BUFFER_HEAD_IO_H
+#define OCFS2_BUFFER_HEAD_IO_H
+
+#include <linux/buffer_head.h>
+
+void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
+			     int uptodate);
+
+int ocfs2_write_block(struct ocfs2_super          *osb,
+		      struct buffer_head  *bh,
+		      struct ocfs2_caching_info   *ci);
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+			   unsigned int nr, struct buffer_head *bhs[]);
+
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      int (*validate)(struct super_block *sb,
+				      struct buffer_head *bh));
+
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+				struct buffer_head *bh);
+
+#define OCFS2_BH_IGNORE_CACHE      1
+#define OCFS2_BH_READAHEAD         8
+
+static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off,
+				   struct buffer_head **bh,
+				   int (*validate)(struct super_block *sb,
+						   struct buffer_head *bh))
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate);
+
+bail:
+	return status;
+}
+
+#endif /* OCFS2_BUFFER_HEAD_IO_H */
diff --git a/kernel/fs/ocfs2/cluster/Makefile b/kernel/fs/ocfs2/cluster/Makefile
new file mode 100644
index 000000000..1aefc0350
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
+
+ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
+	quorum.o tcp.o netdebug.o
diff --git a/kernel/fs/ocfs2/cluster/heartbeat.c b/kernel/fs/ocfs2/cluster/heartbeat.c
new file mode 100644
index 000000000..16eff4572
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/heartbeat.c
@@ -0,0 +1,2678 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/configfs.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "quorum.h"
+
+#include "masklog.h"
+
+
+/*
+ * The first heartbeat pass had one global thread that would serialize all hb
+ * callback calls.  This global serializing sem should only be removed once
+ * we've made sure that all callees can deal with being called concurrently
+ * from multiple hb region threads.
+ */
+static DECLARE_RWSEM(o2hb_callback_sem);
+
+/*
+ * multiple hb threads are watching multiple regions.  A node is live
+ * whenever any of the threads sees activity from the node in its region.
+ */
+static DEFINE_SPINLOCK(o2hb_live_lock);
+static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
+static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+static LIST_HEAD(o2hb_node_events);
+static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ * 	- o2hb_region_bitmap allows us to limit the region number to max region.
+ * 	- o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ * 	- o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ * 		heartbeat on it.
+ * 	- o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
+#define O2HB_DB_TYPE_LIVENODES		0
+#define O2HB_DB_TYPE_LIVEREGIONS	1
+#define O2HB_DB_TYPE_QUORUMREGIONS	2
+#define O2HB_DB_TYPE_FAILEDREGIONS	3
+#define O2HB_DB_TYPE_REGION_LIVENODES	4
+#define O2HB_DB_TYPE_REGION_NUMBER	5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME	6
+#define O2HB_DB_TYPE_REGION_PINNED	7
+struct o2hb_debug_buf {
+	int db_type;
+	int db_size;
+	int db_len;
+	void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
+
+#define O2HB_DEBUG_DIR			"o2hb"
+#define O2HB_DEBUG_LIVENODES		"livenodes"
+#define O2HB_DEBUG_LIVEREGIONS		"live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS	"quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS	"failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER	"num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME	"elapsed_time_in_ms"
+#define O2HB_DEBUG_REGION_PINNED	"pinned"
+
+static struct dentry *o2hb_debug_dir;
+static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
+
+static LIST_HEAD(o2hb_all_regions);
+
+static struct o2hb_callback {
+	struct list_head list;
+} o2hb_callbacks[O2HB_NUM_CB];
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
+
+#define O2HB_DEFAULT_BLOCK_BITS       9
+
+enum o2hb_heartbeat_modes {
+	O2HB_HEARTBEAT_LOCAL		= 0,
+	O2HB_HEARTBEAT_GLOBAL,
+	O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+		"local",	/* O2HB_HEARTBEAT_LOCAL */
+		"global",	/* O2HB_HEARTBEAT_GLOBAL */
+};
+
+unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+
+/*
+ * o2hb_dependent_users tracks the number of registered callbacks that depend
+ * on heartbeat. o2net and o2dlm are two entities that register this callback.
+ * However only o2dlm depends on the heartbeat. It does not want the heartbeat
+ * to stop while a dlm domain is still active.
+ */
+unsigned int o2hb_dependent_users;
+
+/*
+ * In global heartbeat mode, all regions are pinned if there are one or more
+ * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
+ * regions are unpinned if the region count exceeds the cut off or the number
+ * of dependent users falls to zero.
+ */
+#define O2HB_PIN_CUT_OFF		3
+
+/*
+ * In local heartbeat mode, we assume the dlm domain name to be the same as
+ * region uuid. This is true for domains created for the file system but not
+ * necessarily true for userdlm domains. This is a known limitation.
+ *
+ * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
+ * works for both file system and userdlm domains.
+ */
+static int o2hb_region_pin(const char *region_uuid);
+static void o2hb_region_unpin(const char *region_uuid);
+
+/* Only sets a new threshold if there are no active regions.
+ *
+ * No locking or otherwise interesting code is required for reading
+ * o2hb_dead_threshold as it can't change once regions are active and
+ * it's not interesting to anyone until then anyway. */
+static void o2hb_dead_threshold_set(unsigned int threshold)
+{
+	if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
+		spin_lock(&o2hb_live_lock);
+		if (list_empty(&o2hb_all_regions))
+			o2hb_dead_threshold = threshold;
+		spin_unlock(&o2hb_live_lock);
+	}
+}
+
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
+{
+	int ret = -1;
+
+	if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
+		spin_lock(&o2hb_live_lock);
+		if (list_empty(&o2hb_all_regions)) {
+			o2hb_heartbeat_mode = hb_mode;
+			ret = 0;
+		}
+		spin_unlock(&o2hb_live_lock);
+	}
+
+	return ret;
+}
+
+struct o2hb_node_event {
+	struct list_head        hn_item;
+	enum o2hb_callback_type hn_event_type;
+	struct o2nm_node        *hn_node;
+	int                     hn_node_num;
+};
+
+struct o2hb_disk_slot {
+	struct o2hb_disk_heartbeat_block *ds_raw_block;
+	u8			ds_node_num;
+	u64			ds_last_time;
+	u64			ds_last_generation;
+	u16			ds_equal_samples;
+	u16			ds_changed_samples;
+	struct list_head	ds_live_item;
+};
+
+/* each thread owns a region.. when we're asked to tear down the region
+ * we ask the thread to stop, who cleans up the region */
+struct o2hb_region {
+	struct config_item	hr_item;
+
+	struct list_head	hr_all_item;
+	unsigned		hr_unclean_stop:1,
+				hr_aborted_start:1,
+				hr_item_pinned:1,
+				hr_item_dropped:1;
+
+	/* protected by the hr_callback_sem */
+	struct task_struct 	*hr_task;
+
+	unsigned int		hr_blocks;
+	unsigned long long	hr_start_block;
+
+	unsigned int		hr_block_bits;
+	unsigned int		hr_block_bytes;
+
+	unsigned int		hr_slots_per_page;
+	unsigned int		hr_num_pages;
+
+	struct page             **hr_slot_data;
+	struct block_device	*hr_bdev;
+	struct o2hb_disk_slot	*hr_slots;
+
+	/* live node map of this region */
+	unsigned long		hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned int		hr_region_num;
+
+	struct dentry		*hr_debug_dir;
+	struct dentry		*hr_debug_livenodes;
+	struct dentry		*hr_debug_regnum;
+	struct dentry		*hr_debug_elapsed_time;
+	struct dentry		*hr_debug_pinned;
+	struct o2hb_debug_buf	*hr_db_livenodes;
+	struct o2hb_debug_buf	*hr_db_regnum;
+	struct o2hb_debug_buf	*hr_db_elapsed_time;
+	struct o2hb_debug_buf	*hr_db_pinned;
+
+	/* let the person setting up hb wait for it to return until it
+	 * has reached a 'steady' state.  This will be fixed when we have
+	 * a more complete api that doesn't lead to this sort of fragility. */
+	atomic_t		hr_steady_iterations;
+
+	/* terminate o2hb thread if it does not reach steady state
+	 * (hr_steady_iterations == 0) within hr_unsteady_iterations */
+	atomic_t		hr_unsteady_iterations;
+
+	char			hr_dev_name[BDEVNAME_SIZE];
+
+	unsigned int		hr_timeout_ms;
+
+	/* randomized as the region goes up and down so that a node
+	 * recognizes a node going up and down in one iteration */
+	u64			hr_generation;
+
+	struct delayed_work	hr_write_timeout_work;
+	unsigned long		hr_last_timeout_start;
+
+	/* Used during o2hb_check_slot to hold a copy of the block
+	 * being checked because we temporarily have to zero out the
+	 * crc field. */
+	struct o2hb_disk_heartbeat_block *hr_tmp_block;
+};
+
+struct o2hb_bio_wait_ctxt {
+	atomic_t          wc_num_reqs;
+	struct completion wc_io_complete;
+	int               wc_error;
+};
+
+static void o2hb_write_timeout(struct work_struct *work)
+{
+	int failed, quorum;
+	unsigned long flags;
+	struct o2hb_region *reg =
+		container_of(work, struct o2hb_region,
+			     hr_write_timeout_work.work);
+
+	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
+	     "milliseconds\n", reg->hr_dev_name,
+	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock_irqsave(&o2hb_live_lock, flags);
+		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+			set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+		failed = bitmap_weight(o2hb_failed_region_bitmap,
+					O2NM_MAX_REGIONS);
+		quorum = bitmap_weight(o2hb_quorum_region_bitmap,
+					O2NM_MAX_REGIONS);
+		spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+		mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+		     quorum, failed);
+
+		/*
+		 * Fence if the number of failed regions >= half the number
+		 * of  quorum regions
+		 */
+		if ((failed << 1) < quorum)
+			return;
+	}
+
+	o2quo_disk_timeout();
+}
+
+static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+{
+	/* Arm writeout only after thread reaches steady state */
+	if (atomic_read(&reg->hr_steady_iterations) != 0)
+		return;
+
+	mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
+	     O2HB_MAX_WRITE_TIMEOUT_MS);
+
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock(&o2hb_live_lock);
+		clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+		spin_unlock(&o2hb_live_lock);
+	}
+	cancel_delayed_work(&reg->hr_write_timeout_work);
+	reg->hr_last_timeout_start = jiffies;
+	schedule_delayed_work(&reg->hr_write_timeout_work,
+			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+}
+
+static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+{
+	cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+}
+
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
+{
+	atomic_set(&wc->wc_num_reqs, 1);
+	init_completion(&wc->wc_io_complete);
+	wc->wc_error = 0;
+}
+
+/* Used in error paths too */
+static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
+				     unsigned int num)
+{
+	/* sadly atomic_sub_and_test() isn't available on all platforms.  The
+	 * good news is that the fast path only completes one at a time */
+	while(num--) {
+		if (atomic_dec_and_test(&wc->wc_num_reqs)) {
+			BUG_ON(num > 0);
+			complete(&wc->wc_io_complete);
+		}
+	}
+}
+
+static void o2hb_wait_on_io(struct o2hb_region *reg,
+			    struct o2hb_bio_wait_ctxt *wc)
+{
+	o2hb_bio_wait_dec(wc, 1);
+	wait_for_completion(&wc->wc_io_complete);
+}
+
+static void o2hb_bio_end_io(struct bio *bio,
+			   int error)
+{
+	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
+
+	if (error) {
+		mlog(ML_ERROR, "IO Error %d\n", error);
+		wc->wc_error = error;
+	}
+
+	o2hb_bio_wait_dec(wc, 1);
+	bio_put(bio);
+}
+
+/* Setup a Bio to cover I/O against num_slots slots starting at
+ * start_slot. */
+static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
+				      struct o2hb_bio_wait_ctxt *wc,
+				      unsigned int *current_slot,
+				      unsigned int max_slots)
+{
+	int len, current_page;
+	unsigned int vec_len, vec_start;
+	unsigned int bits = reg->hr_block_bits;
+	unsigned int spp = reg->hr_slots_per_page;
+	unsigned int cs = *current_slot;
+	struct bio *bio;
+	struct page *page;
+
+	/* Testing has shown this allocation to take long enough under
+	 * GFP_KERNEL that the local node can get fenced. It would be
+	 * nicest if we could pre-allocate these bios and avoid this
+	 * all together. */
+	bio = bio_alloc(GFP_ATOMIC, 16);
+	if (!bio) {
+		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
+		bio = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	/* Must put everything in 512 byte sectors for the bio... */
+	bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
+	bio->bi_bdev = reg->hr_bdev;
+	bio->bi_private = wc;
+	bio->bi_end_io = o2hb_bio_end_io;
+
+	vec_start = (cs << bits) % PAGE_CACHE_SIZE;
+	while(cs < max_slots) {
+		current_page = cs / spp;
+		page = reg->hr_slot_data[current_page];
+
+		vec_len = min(PAGE_CACHE_SIZE - vec_start,
+			      (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
+
+		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
+		     current_page, vec_len, vec_start);
+
+		len = bio_add_page(bio, page, vec_len, vec_start);
+		if (len != vec_len) break;
+
+		cs += vec_len / (PAGE_CACHE_SIZE/spp);
+		vec_start = 0;
+	}
+
+bail:
+	*current_slot = cs;
+	return bio;
+}
+
+static int o2hb_read_slots(struct o2hb_region *reg,
+			   unsigned int max_slots)
+{
+	unsigned int current_slot=0;
+	int status;
+	struct o2hb_bio_wait_ctxt wc;
+	struct bio *bio;
+
+	o2hb_bio_wait_init(&wc);
+
+	while(current_slot < max_slots) {
+		bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
+		if (IS_ERR(bio)) {
+			status = PTR_ERR(bio);
+			mlog_errno(status);
+			goto bail_and_wait;
+		}
+
+		atomic_inc(&wc.wc_num_reqs);
+		submit_bio(READ, bio);
+	}
+
+	status = 0;
+
+bail_and_wait:
+	o2hb_wait_on_io(reg, &wc);
+	if (wc.wc_error && !status)
+		status = wc.wc_error;
+
+	return status;
+}
+
+static int o2hb_issue_node_write(struct o2hb_region *reg,
+				 struct o2hb_bio_wait_ctxt *write_wc)
+{
+	int status;
+	unsigned int slot;
+	struct bio *bio;
+
+	o2hb_bio_wait_init(write_wc);
+
+	slot = o2nm_this_node();
+
+	bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
+	if (IS_ERR(bio)) {
+		status = PTR_ERR(bio);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_inc(&write_wc->wc_num_reqs);
+	submit_bio(WRITE_SYNC, bio);
+
+	status = 0;
+bail:
+	return status;
+}
+
+static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
+				     struct o2hb_disk_heartbeat_block *hb_block)
+{
+	__le32 old_cksum;
+	u32 ret;
+
+	/* We want to compute the block crc with a 0 value in the
+	 * hb_cksum field. Save it off here and replace after the
+	 * crc. */
+	old_cksum = hb_block->hb_cksum;
+	hb_block->hb_cksum = 0;
+
+	ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
+
+	hb_block->hb_cksum = old_cksum;
+
+	return ret;
+}
+
+static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
+{
+	mlog(ML_ERROR, "Dump slot information: seq = 0x%llx, node = %u, "
+	     "cksum = 0x%x, generation 0x%llx\n",
+	     (long long)le64_to_cpu(hb_block->hb_seq),
+	     hb_block->hb_node, le32_to_cpu(hb_block->hb_cksum),
+	     (long long)le64_to_cpu(hb_block->hb_generation));
+}
+
+static int o2hb_verify_crc(struct o2hb_region *reg,
+			   struct o2hb_disk_heartbeat_block *hb_block)
+{
+	u32 read, computed;
+
+	read = le32_to_cpu(hb_block->hb_cksum);
+	computed = o2hb_compute_block_crc_le(reg, hb_block);
+
+	return read == computed;
+}
+
+/*
+ * Compare the slot data with what we wrote in the last iteration.
+ * If the match fails, print an appropriate error message. This is to
+ * detect errors like... another node hearting on the same slot,
+ * flaky device that is losing writes, etc.
+ * Returns 1 if check succeeds, 0 otherwise.
+ */
+static int o2hb_check_own_slot(struct o2hb_region *reg)
+{
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+	char *errstr;
+
+	slot = &reg->hr_slots[o2nm_this_node()];
+	/* Don't check on our 1st timestamp */
+	if (!slot->ds_last_time)
+		return 0;
+
+	hb_block = slot->ds_raw_block;
+	if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
+	    le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
+	    hb_block->hb_node == slot->ds_node_num)
+		return 1;
+
+#define ERRSTR1		"Another node is heartbeating on device"
+#define ERRSTR2		"Heartbeat generation mismatch on device"
+#define ERRSTR3		"Heartbeat sequence mismatch on device"
+
+	if (hb_block->hb_node != slot->ds_node_num)
+		errstr = ERRSTR1;
+	else if (le64_to_cpu(hb_block->hb_generation) !=
+		 slot->ds_last_generation)
+		errstr = ERRSTR2;
+	else
+		errstr = ERRSTR3;
+
+	mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), "
+	     "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name,
+	     slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
+	     (unsigned long long)slot->ds_last_time, hb_block->hb_node,
+	     (unsigned long long)le64_to_cpu(hb_block->hb_generation),
+	     (unsigned long long)le64_to_cpu(hb_block->hb_seq));
+
+	return 0;
+}
+
+static inline void o2hb_prepare_block(struct o2hb_region *reg,
+				      u64 generation)
+{
+	int node_num;
+	u64 cputime;
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+
+	node_num = o2nm_this_node();
+	slot = &reg->hr_slots[node_num];
+
+	hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
+	memset(hb_block, 0, reg->hr_block_bytes);
+	/* TODO: time stuff */
+	cputime = CURRENT_TIME.tv_sec;
+	if (!cputime)
+		cputime = 1;
+
+	hb_block->hb_seq = cpu_to_le64(cputime);
+	hb_block->hb_node = node_num;
+	hb_block->hb_generation = cpu_to_le64(generation);
+	hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS);
+
+	/* This step must always happen last! */
+	hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
+								   hb_block));
+
+	mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n",
+	     (long long)generation,
+	     le32_to_cpu(hb_block->hb_cksum));
+}
+
+static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
+				struct o2nm_node *node,
+				int idx)
+{
+	struct o2hb_callback_func *f;
+
+	list_for_each_entry(f, &hbcall->list, hc_item) {
+		mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
+		(f->hc_func)(node, idx, f->hc_data);
+	}
+}
+
+/* Will run the list in order until we process the passed event */
+static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
+{
+	struct o2hb_callback *hbcall;
+	struct o2hb_node_event *event;
+
+	/* Holding callback sem assures we don't alter the callback
+	 * lists when doing this, and serializes ourselves with other
+	 * processes wanting callbacks. */
+	down_write(&o2hb_callback_sem);
+
+	spin_lock(&o2hb_live_lock);
+	while (!list_empty(&o2hb_node_events)
+	       && !list_empty(&queued_event->hn_item)) {
+		event = list_entry(o2hb_node_events.next,
+				   struct o2hb_node_event,
+				   hn_item);
+		list_del_init(&event->hn_item);
+		spin_unlock(&o2hb_live_lock);
+
+		mlog(ML_HEARTBEAT, "Node %s event for %d\n",
+		     event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
+		     event->hn_node_num);
+
+		hbcall = hbcall_from_type(event->hn_event_type);
+
+		/* We should *never* have gotten on to the list with a
+		 * bad type... This isn't something that we should try
+		 * to recover from. */
+		BUG_ON(IS_ERR(hbcall));
+
+		o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);
+
+		spin_lock(&o2hb_live_lock);
+	}
+	spin_unlock(&o2hb_live_lock);
+
+	up_write(&o2hb_callback_sem);
+}
+
+static void o2hb_queue_node_event(struct o2hb_node_event *event,
+				  enum o2hb_callback_type type,
+				  struct o2nm_node *node,
+				  int node_num)
+{
+	assert_spin_locked(&o2hb_live_lock);
+
+	BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
+	event->hn_event_type = type;
+	event->hn_node = node;
+	event->hn_node_num = node_num;
+
+	mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
+	     type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);
+
+	list_add_tail(&event->hn_item, &o2hb_node_events);
+}
+
+static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
+{
+	struct o2hb_node_event event =
+		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
+	struct o2nm_node *node;
+	int queued = 0;
+
+	node = o2nm_get_node_by_num(slot->ds_node_num);
+	if (!node)
+		return;
+
+	spin_lock(&o2hb_live_lock);
+	if (!list_empty(&slot->ds_live_item)) {
+		mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
+		     slot->ds_node_num);
+
+		list_del_init(&slot->ds_live_item);
+
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+					      slot->ds_node_num);
+			queued = 1;
+		}
+	}
+	spin_unlock(&o2hb_live_lock);
+
+	if (queued)
+		o2hb_run_event_list(&event);
+
+	o2nm_node_put(node);
+}
+
+static void o2hb_set_quorum_device(struct o2hb_region *reg)
+{
+	if (!o2hb_global_heartbeat_active())
+		return;
+
+	/* Prevent race with o2hb_heartbeat_group_drop_item() */
+	if (kthread_should_stop())
+		return;
+
+	/* Tag region as quorum only after thread reaches steady state */
+	if (atomic_read(&reg->hr_steady_iterations) != 0)
+		return;
+
+	spin_lock(&o2hb_live_lock);
+
+	if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+		goto unlock;
+
+	/*
+	 * A region can be added to the quorum only when it sees all
+	 * live nodes heartbeat on it. In other words, the region has been
+	 * added to all nodes.
+	 */
+	if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+		   sizeof(o2hb_live_node_bitmap)))
+		goto unlock;
+
+	printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
+	       config_item_name(&reg->hr_item), reg->hr_dev_name);
+
+	set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+
+	/*
+	 * If global heartbeat active, unpin all regions if the
+	 * region count > CUT_OFF
+	 */
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
+			   O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
+		o2hb_region_unpin(NULL);
+unlock:
+	spin_unlock(&o2hb_live_lock);
+}
+
+static int o2hb_check_slot(struct o2hb_region *reg,
+			   struct o2hb_disk_slot *slot)
+{
+	int changed = 0, gen_changed = 0;
+	struct o2hb_node_event event =
+		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
+	struct o2nm_node *node;
+	struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
+	u64 cputime;
+	unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
+	unsigned int slot_dead_ms;
+	int tmp;
+	int queued = 0;
+
+	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
+
+	/*
+	 * If a node is no longer configured but is still in the livemap, we
+	 * may need to clear that bit from the livemap.
+	 */
+	node = o2nm_get_node_by_num(slot->ds_node_num);
+	if (!node) {
+		spin_lock(&o2hb_live_lock);
+		tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+		spin_unlock(&o2hb_live_lock);
+		if (!tmp)
+			return 0;
+	}
+
+	if (!o2hb_verify_crc(reg, hb_block)) {
+		/* all paths from here will drop o2hb_live_lock for
+		 * us. */
+		spin_lock(&o2hb_live_lock);
+
+		/* Don't print an error on the console in this case -
+		 * a freshly formatted heartbeat area will not have a
+		 * crc set on it. */
+		if (list_empty(&slot->ds_live_item))
+			goto out;
+
+		/* The node is live but pushed out a bad crc. We
+		 * consider it a transient miss but don't populate any
+		 * other values as they may be junk. */
+		mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
+		     slot->ds_node_num, reg->hr_dev_name);
+		o2hb_dump_slot(hb_block);
+
+		slot->ds_equal_samples++;
+		goto fire_callbacks;
+	}
+
+	/* we don't care if these wrap.. the state transitions below
+	 * clear at the right places */
+	cputime = le64_to_cpu(hb_block->hb_seq);
+	if (slot->ds_last_time != cputime)
+		slot->ds_changed_samples++;
+	else
+		slot->ds_equal_samples++;
+	slot->ds_last_time = cputime;
+
+	/* The node changed heartbeat generations. We assume this to
+	 * mean it dropped off but came back before we timed out. We
+	 * want to consider it down for the time being but don't want
+	 * to lose any changed_samples state we might build up to
+	 * considering it live again. */
+	if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
+		gen_changed = 1;
+		slot->ds_equal_samples = 0;
+		mlog(ML_HEARTBEAT, "Node %d changed generation (0x%llx "
+		     "to 0x%llx)\n", slot->ds_node_num,
+		     (long long)slot->ds_last_generation,
+		     (long long)le64_to_cpu(hb_block->hb_generation));
+	}
+
+	slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+
+	mlog(ML_HEARTBEAT, "Slot %d gen 0x%llx cksum 0x%x "
+	     "seq %llu last %llu changed %u equal %u\n",
+	     slot->ds_node_num, (long long)slot->ds_last_generation,
+	     le32_to_cpu(hb_block->hb_cksum),
+	     (unsigned long long)le64_to_cpu(hb_block->hb_seq),
+	     (unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
+	     slot->ds_equal_samples);
+
+	spin_lock(&o2hb_live_lock);
+
+fire_callbacks:
+	/* dead nodes only come to life after some number of
+	 * changes at any time during their dead time */
+	if (list_empty(&slot->ds_live_item) &&
+	    slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
+		mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
+		     slot->ds_node_num, (long long)slot->ds_last_generation);
+
+		set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
+		/* first on the list generates a callback */
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+			     "bitmap\n", slot->ds_node_num);
+			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
+					      slot->ds_node_num);
+
+			changed = 1;
+			queued = 1;
+		}
+
+		list_add_tail(&slot->ds_live_item,
+			      &o2hb_live_slots[slot->ds_node_num]);
+
+		slot->ds_equal_samples = 0;
+
+		/* We want to be sure that all nodes agree on the
+		 * number of milliseconds before a node will be
+		 * considered dead. The self-fencing timeout is
+		 * computed from this value, and a discrepancy might
+		 * result in heartbeat calling a node dead when it
+		 * hasn't self-fenced yet. */
+		slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms);
+		if (slot_dead_ms && slot_dead_ms != dead_ms) {
+			/* TODO: Perhaps we can fail the region here. */
+			mlog(ML_ERROR, "Node %d on device %s has a dead count "
+			     "of %u ms, but our count is %u ms.\n"
+			     "Please double check your configuration values "
+			     "for 'O2CB_HEARTBEAT_THRESHOLD'\n",
+			     slot->ds_node_num, reg->hr_dev_name, slot_dead_ms,
+			     dead_ms);
+		}
+		goto out;
+	}
+
+	/* if the list is dead, we're done.. */
+	if (list_empty(&slot->ds_live_item))
+		goto out;
+
+	/* live nodes only go dead after enough consequtive missed
+	 * samples..  reset the missed counter whenever we see
+	 * activity */
+	if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
+		mlog(ML_HEARTBEAT, "Node %d left my region\n",
+		     slot->ds_node_num);
+
+		clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
+		/* last off the live_slot generates a callback */
+		list_del_init(&slot->ds_live_item);
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+			     "nodes bitmap\n", slot->ds_node_num);
+			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			/* node can be null */
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+					      node, slot->ds_node_num);
+
+			changed = 1;
+			queued = 1;
+		}
+
+		/* We don't clear this because the node is still
+		 * actually writing new blocks. */
+		if (!gen_changed)
+			slot->ds_changed_samples = 0;
+		goto out;
+	}
+	if (slot->ds_changed_samples) {
+		slot->ds_changed_samples = 0;
+		slot->ds_equal_samples = 0;
+	}
+out:
+	spin_unlock(&o2hb_live_lock);
+
+	if (queued)
+		o2hb_run_event_list(&event);
+
+	if (node)
+		o2nm_node_put(node);
+	return changed;
+}
+
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
+{
+	return find_last_bit(nodes, numbits);
+}
+
+static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
+{
+	int i, ret, highest_node;
+	int membership_change = 0, own_slot_ok = 0;
+	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct o2hb_bio_wait_ctxt write_wc;
+
+	ret = o2nm_configured_node_map(configured_nodes,
+				       sizeof(configured_nodes));
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	/*
+	 * If a node is not configured but is in the livemap, we still need
+	 * to read the slot so as to be able to remove it from the livemap.
+	 */
+	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+	i = -1;
+	while ((i = find_next_bit(live_node_bitmap,
+				  O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+		set_bit(i, configured_nodes);
+	}
+
+	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
+	if (highest_node >= O2NM_MAX_NODES) {
+		mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* No sense in reading the slots of nodes that don't exist
+	 * yet. Of course, if the node definitions have holes in them
+	 * then we're reading an empty slot anyway... Consider this
+	 * best-effort. */
+	ret = o2hb_read_slots(reg, highest_node + 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	/* With an up to date view of the slots, we can check that no
+	 * other node has been improperly configured to heartbeat in
+	 * our slot. */
+	own_slot_ok = o2hb_check_own_slot(reg);
+
+	/* fill in the proper info for our next heartbeat */
+	o2hb_prepare_block(reg, reg->hr_generation);
+
+	ret = o2hb_issue_node_write(reg, &write_wc);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	i = -1;
+	while((i = find_next_bit(configured_nodes,
+				 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+		membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+	}
+
+	/*
+	 * We have to be sure we've advertised ourselves on disk
+	 * before we can go to steady state.  This ensures that
+	 * people we find in our steady state have seen us.
+	 */
+	o2hb_wait_on_io(reg, &write_wc);
+	if (write_wc.wc_error) {
+		/* Do not re-arm the write timeout on I/O error - we
+		 * can't be sure that the new block ever made it to
+		 * disk */
+		mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
+		     write_wc.wc_error, reg->hr_dev_name);
+		ret = write_wc.wc_error;
+		goto bail;
+	}
+
+	/* Skip disarming the timeout if own slot has stale/bad data */
+	if (own_slot_ok) {
+		o2hb_set_quorum_device(reg);
+		o2hb_arm_write_timeout(reg);
+	}
+
+bail:
+	/* let the person who launched us know when things are steady */
+	if (atomic_read(&reg->hr_steady_iterations) != 0) {
+		if (!ret && own_slot_ok && !membership_change) {
+			if (atomic_dec_and_test(&reg->hr_steady_iterations))
+				wake_up(&o2hb_steady_queue);
+		}
+	}
+
+	if (atomic_read(&reg->hr_steady_iterations) != 0) {
+		if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
+			printk(KERN_NOTICE "o2hb: Unable to stabilize "
+			       "heartbeart on region %s (%s)\n",
+			       config_item_name(&reg->hr_item),
+			       reg->hr_dev_name);
+			atomic_set(&reg->hr_steady_iterations, 0);
+			reg->hr_aborted_start = 1;
+			wake_up(&o2hb_steady_queue);
+			ret = -EIO;
+		}
+	}
+
+	return ret;
+}
+
+/* Subtract b from a, storing the result in a. a *must* have a larger
+ * value than b. */
+static void o2hb_tv_subtract(struct timeval *a,
+			     struct timeval *b)
+{
+	/* just return 0 when a is after b */
+	if (a->tv_sec < b->tv_sec ||
+	    (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
+		a->tv_sec = 0;
+		a->tv_usec = 0;
+		return;
+	}
+
+	a->tv_sec -= b->tv_sec;
+	a->tv_usec -= b->tv_usec;
+	while ( a->tv_usec < 0 ) {
+		a->tv_sec--;
+		a->tv_usec += 1000000;
+	}
+}
+
+static unsigned int o2hb_elapsed_msecs(struct timeval *start,
+				       struct timeval *end)
+{
+	struct timeval res = *end;
+
+	o2hb_tv_subtract(&res, start);
+
+	return res.tv_sec * 1000 + res.tv_usec / 1000;
+}
+
+/*
+ * we ride the region ref that the region dir holds.  before the region
+ * dir is removed and drops it ref it will wait to tear down this
+ * thread.
+ */
+static int o2hb_thread(void *data)
+{
+	int i, ret;
+	struct o2hb_region *reg = data;
+	struct o2hb_bio_wait_ctxt write_wc;
+	struct timeval before_hb, after_hb;
+	unsigned int elapsed_msec;
+
+	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
+
+	set_user_nice(current, MIN_NICE);
+
+	/* Pin node */
+	o2nm_depend_this_node();
+
+	while (!kthread_should_stop() &&
+	       !reg->hr_unclean_stop && !reg->hr_aborted_start) {
+		/* We track the time spent inside
+		 * o2hb_do_disk_heartbeat so that we avoid more than
+		 * hr_timeout_ms between disk writes. On busy systems
+		 * this should result in a heartbeat which is less
+		 * likely to time itself out. */
+		do_gettimeofday(&before_hb);
+
+		ret = o2hb_do_disk_heartbeat(reg);
+
+		do_gettimeofday(&after_hb);
+		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+
+		mlog(ML_HEARTBEAT,
+		     "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
+		     before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
+		     after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
+		     elapsed_msec, ret);
+
+		if (!kthread_should_stop() &&
+		    elapsed_msec < reg->hr_timeout_ms) {
+			/* the kthread api has blocked signals for us so no
+			 * need to record the return value. */
+			msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
+		}
+	}
+
+	o2hb_disarm_write_timeout(reg);
+
+	/* unclean stop is only used in very bad situation */
+	for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
+		o2hb_shutdown_slot(&reg->hr_slots[i]);
+
+	/* Explicit down notification - avoid forcing the other nodes
+	 * to timeout on this region when we could just as easily
+	 * write a clear generation - thus indicating to them that
+	 * this node has left this region.
+	 */
+	if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
+		o2hb_prepare_block(reg, 0);
+		ret = o2hb_issue_node_write(reg, &write_wc);
+		if (ret == 0)
+			o2hb_wait_on_io(reg, &write_wc);
+		else
+			mlog_errno(ret);
+	}
+
+	/* Unpin node */
+	o2nm_undepend_this_node();
+
+	mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
+
+	return 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+	struct o2hb_debug_buf *db = inode->i_private;
+	struct o2hb_region *reg;
+	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long lts;
+	char *buf = NULL;
+	int i = -1;
+	int out = 0;
+
+	/* max_nodes should be the largest bitmap we pass here */
+	BUG_ON(sizeof(map) < db->db_size);
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto bail;
+
+	switch (db->db_type) {
+	case O2HB_DB_TYPE_LIVENODES:
+	case O2HB_DB_TYPE_LIVEREGIONS:
+	case O2HB_DB_TYPE_QUORUMREGIONS:
+	case O2HB_DB_TYPE_FAILEDREGIONS:
+		spin_lock(&o2hb_live_lock);
+		memcpy(map, db->db_data, db->db_size);
+		spin_unlock(&o2hb_live_lock);
+		break;
+
+	case O2HB_DB_TYPE_REGION_LIVENODES:
+		spin_lock(&o2hb_live_lock);
+		reg = (struct o2hb_region *)db->db_data;
+		memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+		spin_unlock(&o2hb_live_lock);
+		break;
+
+	case O2HB_DB_TYPE_REGION_NUMBER:
+		reg = (struct o2hb_region *)db->db_data;
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+				reg->hr_region_num);
+		goto done;
+
+	case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+		reg = (struct o2hb_region *)db->db_data;
+		lts = reg->hr_last_timeout_start;
+		/* If 0, it has never been set before */
+		if (lts)
+			lts = jiffies_to_msecs(jiffies - lts);
+		out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
+		goto done;
+
+	case O2HB_DB_TYPE_REGION_PINNED:
+		reg = (struct o2hb_region *)db->db_data;
+		out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+				!!reg->hr_item_pinned);
+		goto done;
+
+	default:
+		goto done;
+	}
+
+	while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+	out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+done:
+	i_size_write(inode, out);
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+				 size_t nbytes, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+				       i_size_read(file->f_mapping->host));
+}
+#else
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+			       size_t nbytes, loff_t *ppos)
+{
+	return 0;
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+static const struct file_operations o2hb_debug_fops = {
+	.open =		o2hb_debug_open,
+	.release =	o2hb_debug_release,
+	.read =		o2hb_debug_read,
+	.llseek =	generic_file_llseek,
+};
+
+void o2hb_exit(void)
+{
+	kfree(o2hb_db_livenodes);
+	kfree(o2hb_db_liveregions);
+	kfree(o2hb_db_quorumregions);
+	kfree(o2hb_db_failedregions);
+	debugfs_remove(o2hb_debug_failedregions);
+	debugfs_remove(o2hb_debug_quorumregions);
+	debugfs_remove(o2hb_debug_liveregions);
+	debugfs_remove(o2hb_debug_livenodes);
+	debugfs_remove(o2hb_debug_dir);
+}
+
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+					struct o2hb_debug_buf **db, int db_len,
+					int type, int size, int len, void *data)
+{
+	*db = kmalloc(db_len, GFP_KERNEL);
+	if (!*db)
+		return NULL;
+
+	(*db)->db_type = type;
+	(*db)->db_size = size;
+	(*db)->db_len = len;
+	(*db)->db_data = data;
+
+	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
+				   &o2hb_debug_fops);
+}
+
+static int o2hb_debug_init(void)
+{
+	int ret = -ENOMEM;
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+						 o2hb_debug_dir,
+						 &o2hb_db_livenodes,
+						 sizeof(*o2hb_db_livenodes),
+						 O2HB_DB_TYPE_LIVENODES,
+						 sizeof(o2hb_live_node_bitmap),
+						 O2NM_MAX_NODES,
+						 o2hb_live_node_bitmap);
+	if (!o2hb_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
+						   o2hb_debug_dir,
+						   &o2hb_db_liveregions,
+						   sizeof(*o2hb_db_liveregions),
+						   O2HB_DB_TYPE_LIVEREGIONS,
+						   sizeof(o2hb_live_region_bitmap),
+						   O2NM_MAX_REGIONS,
+						   o2hb_live_region_bitmap);
+	if (!o2hb_debug_liveregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_quorumregions =
+			o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
+					  o2hb_debug_dir,
+					  &o2hb_db_quorumregions,
+					  sizeof(*o2hb_db_quorumregions),
+					  O2HB_DB_TYPE_QUORUMREGIONS,
+					  sizeof(o2hb_quorum_region_bitmap),
+					  O2NM_MAX_REGIONS,
+					  o2hb_quorum_region_bitmap);
+	if (!o2hb_debug_quorumregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_failedregions =
+			o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
+					  o2hb_debug_dir,
+					  &o2hb_db_failedregions,
+					  sizeof(*o2hb_db_failedregions),
+					  O2HB_DB_TYPE_FAILEDREGIONS,
+					  sizeof(o2hb_failed_region_bitmap),
+					  O2NM_MAX_REGIONS,
+					  o2hb_failed_region_bitmap);
+	if (!o2hb_debug_failedregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	if (ret)
+		o2hb_exit();
+
+	return ret;
+}
+
+int o2hb_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
+		INIT_LIST_HEAD(&o2hb_callbacks[i].list);
+
+	for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
+		INIT_LIST_HEAD(&o2hb_live_slots[i]);
+
+	INIT_LIST_HEAD(&o2hb_node_events);
+
+	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+	memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+	memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+	memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+
+	o2hb_dependent_users = 0;
+
+	return o2hb_debug_init();
+}
+
+/* if we're already in a callback then we're already serialized by the sem */
+static void o2hb_fill_node_map_from_callback(unsigned long *map,
+					     unsigned bytes)
+{
+	BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+	memcpy(map, &o2hb_live_node_bitmap, bytes);
+}
+
+/*
+ * get a map of all nodes that are heartbeating in any regions
+ */
+void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+{
+	/* callers want to serialize this map and callbacks so that they
+	 * can trust that they don't miss nodes coming to the party */
+	down_read(&o2hb_callback_sem);
+	spin_lock(&o2hb_live_lock);
+	o2hb_fill_node_map_from_callback(map, bytes);
+	spin_unlock(&o2hb_live_lock);
+	up_read(&o2hb_callback_sem);
+}
+EXPORT_SYMBOL_GPL(o2hb_fill_node_map);
+
+/*
+ * heartbeat configfs bits.  The heartbeat set is a default set under
+ * the cluster set in nodemanager.c.
+ */
+
+static struct o2hb_region *to_o2hb_region(struct config_item *item)
+{
+	return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
+}
+
+/* drop_item only drops its ref after killing the thread, nothing should
+ * be using the region anymore.  this has to clean up any state that
+ * attributes might have built up. */
+static void o2hb_region_release(struct config_item *item)
+{
+	int i;
+	struct page *page;
+	struct o2hb_region *reg = to_o2hb_region(item);
+
+	mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+
+	kfree(reg->hr_tmp_block);
+
+	if (reg->hr_slot_data) {
+		for (i = 0; i < reg->hr_num_pages; i++) {
+			page = reg->hr_slot_data[i];
+			if (page)
+				__free_page(page);
+		}
+		kfree(reg->hr_slot_data);
+	}
+
+	if (reg->hr_bdev)
+		blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
+
+	kfree(reg->hr_slots);
+
+	kfree(reg->hr_db_regnum);
+	kfree(reg->hr_db_livenodes);
+	debugfs_remove(reg->hr_debug_livenodes);
+	debugfs_remove(reg->hr_debug_regnum);
+	debugfs_remove(reg->hr_debug_elapsed_time);
+	debugfs_remove(reg->hr_debug_pinned);
+	debugfs_remove(reg->hr_debug_dir);
+
+	spin_lock(&o2hb_live_lock);
+	list_del(&reg->hr_all_item);
+	spin_unlock(&o2hb_live_lock);
+
+	kfree(reg);
+}
+
+static int o2hb_read_block_input(struct o2hb_region *reg,
+				 const char *page,
+				 size_t count,
+				 unsigned long *ret_bytes,
+				 unsigned int *ret_bits)
+{
+	unsigned long bytes;
+	char *p = (char *)page;
+
+	bytes = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	/* Heartbeat and fs min / max block sizes are the same. */
+	if (bytes > 4096 || bytes < 512)
+		return -ERANGE;
+	if (hweight16(bytes) != 1)
+		return -EINVAL;
+
+	if (ret_bytes)
+		*ret_bytes = bytes;
+	if (ret_bits)
+		*ret_bits = ffs(bytes) - 1;
+
+	return 0;
+}
+
+static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
+					    char *page)
+{
+	return sprintf(page, "%u\n", reg->hr_block_bytes);
+}
+
+static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+					     const char *page,
+					     size_t count)
+{
+	int status;
+	unsigned long block_bytes;
+	unsigned int block_bits;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	status = o2hb_read_block_input(reg, page, count,
+				       &block_bytes, &block_bits);
+	if (status)
+		return status;
+
+	reg->hr_block_bytes = (unsigned int)block_bytes;
+	reg->hr_block_bits = block_bits;
+
+	return count;
+}
+
+static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
+					    char *page)
+{
+	return sprintf(page, "%llu\n", reg->hr_start_block);
+}
+
+static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+					     const char *page,
+					     size_t count)
+{
+	unsigned long long tmp;
+	char *p = (char *)page;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	tmp = simple_strtoull(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	reg->hr_start_block = tmp;
+
+	return count;
+}
+
+static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
+				       char *page)
+{
+	return sprintf(page, "%d\n", reg->hr_blocks);
+}
+
+static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+					const char *page,
+					size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > O2NM_MAX_NODES || tmp == 0)
+		return -ERANGE;
+
+	reg->hr_blocks = (unsigned int)tmp;
+
+	return count;
+}
+
+static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
+				    char *page)
+{
+	unsigned int ret = 0;
+
+	if (reg->hr_bdev)
+		ret = sprintf(page, "%s\n", reg->hr_dev_name);
+
+	return ret;
+}
+
+static void o2hb_init_region_params(struct o2hb_region *reg)
+{
+	reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
+	reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
+
+	mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
+	     reg->hr_start_block, reg->hr_blocks);
+	mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n",
+	     reg->hr_block_bytes, reg->hr_block_bits);
+	mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms);
+	mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold);
+}
+
+static int o2hb_map_slot_data(struct o2hb_region *reg)
+{
+	int i, j;
+	unsigned int last_slot;
+	unsigned int spp = reg->hr_slots_per_page;
+	struct page *page;
+	char *raw;
+	struct o2hb_disk_slot *slot;
+
+	reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
+	if (reg->hr_tmp_block == NULL) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	reg->hr_slots = kcalloc(reg->hr_blocks,
+				sizeof(struct o2hb_disk_slot), GFP_KERNEL);
+	if (reg->hr_slots == NULL) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	for(i = 0; i < reg->hr_blocks; i++) {
+		slot = &reg->hr_slots[i];
+		slot->ds_node_num = i;
+		INIT_LIST_HEAD(&slot->ds_live_item);
+		slot->ds_raw_block = NULL;
+	}
+
+	reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp;
+	mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks "
+			   "at %u blocks per page\n",
+	     reg->hr_num_pages, reg->hr_blocks, spp);
+
+	reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
+				    GFP_KERNEL);
+	if (!reg->hr_slot_data) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	for(i = 0; i < reg->hr_num_pages; i++) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			mlog_errno(-ENOMEM);
+			return -ENOMEM;
+		}
+
+		reg->hr_slot_data[i] = page;
+
+		last_slot = i * spp;
+		raw = page_address(page);
+		for (j = 0;
+		     (j < spp) && ((j + last_slot) < reg->hr_blocks);
+		     j++) {
+			BUG_ON((j + last_slot) >= reg->hr_blocks);
+
+			slot = &reg->hr_slots[j + last_slot];
+			slot->ds_raw_block =
+				(struct o2hb_disk_heartbeat_block *) raw;
+
+			raw += reg->hr_block_bytes;
+		}
+	}
+
+	return 0;
+}
+
+/* Read in all the slots available and populate the tracking
+ * structures so that we can start with a baseline idea of what's
+ * there. */
+static int o2hb_populate_slot_data(struct o2hb_region *reg)
+{
+	int ret, i;
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+
+	ret = o2hb_read_slots(reg, reg->hr_blocks);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* We only want to get an idea of the values initially in each
+	 * slot, so we do no verification - o2hb_check_slot will
+	 * actually determine if each configured slot is valid and
+	 * whether any values have changed. */
+	for(i = 0; i < reg->hr_blocks; i++) {
+		slot = &reg->hr_slots[i];
+		hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;
+
+		/* Only fill the values that o2hb_check_slot uses to
+		 * determine changing slots */
+		slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
+		slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+	}
+
+out:
+	return ret;
+}
+
+/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
+static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+				     const char *page,
+				     size_t count)
+{
+	struct task_struct *hb_task;
+	long fd;
+	int sectsize;
+	char *p = (char *)page;
+	struct fd f;
+	struct inode *inode;
+	ssize_t ret = -EINVAL;
+	int live_threshold;
+
+	if (reg->hr_bdev)
+		goto out;
+
+	/* We can't heartbeat without having had our node number
+	 * configured yet. */
+	if (o2nm_this_node() == O2NM_MAX_NODES)
+		goto out;
+
+	fd = simple_strtol(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		goto out;
+
+	if (fd < 0 || fd >= INT_MAX)
+		goto out;
+
+	f = fdget(fd);
+	if (f.file == NULL)
+		goto out;
+
+	if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
+	    reg->hr_block_bytes == 0)
+		goto out2;
+
+	inode = igrab(f.file->f_mapping->host);
+	if (inode == NULL)
+		goto out2;
+
+	if (!S_ISBLK(inode->i_mode))
+		goto out3;
+
+	reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
+	if (ret) {
+		reg->hr_bdev = NULL;
+		goto out3;
+	}
+	inode = NULL;
+
+	bdevname(reg->hr_bdev, reg->hr_dev_name);
+
+	sectsize = bdev_logical_block_size(reg->hr_bdev);
+	if (sectsize != reg->hr_block_bytes) {
+		mlog(ML_ERROR,
+		     "blocksize %u incorrect for device, expected %d",
+		     reg->hr_block_bytes, sectsize);
+		ret = -EINVAL;
+		goto out3;
+	}
+
+	o2hb_init_region_params(reg);
+
+	/* Generation of zero is invalid */
+	do {
+		get_random_bytes(&reg->hr_generation,
+				 sizeof(reg->hr_generation));
+	} while (reg->hr_generation == 0);
+
+	ret = o2hb_map_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out3;
+	}
+
+	ret = o2hb_populate_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out3;
+	}
+
+	INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+
+	/*
+	 * A node is considered live after it has beat LIVE_THRESHOLD
+	 * times.  We're not steady until we've given them a chance
+	 * _after_ our first read.
+	 * The default threshold is bare minimum so as to limit the delay
+	 * during mounts. For global heartbeat, the threshold doubled for the
+	 * first region.
+	 */
+	live_threshold = O2HB_LIVE_THRESHOLD;
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock(&o2hb_live_lock);
+		if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+			live_threshold <<= 1;
+		spin_unlock(&o2hb_live_lock);
+	}
+	++live_threshold;
+	atomic_set(&reg->hr_steady_iterations, live_threshold);
+	/* unsteady_iterations is double the steady_iterations */
+	atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+
+	hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
+			      reg->hr_item.ci_name);
+	if (IS_ERR(hb_task)) {
+		ret = PTR_ERR(hb_task);
+		mlog_errno(ret);
+		goto out3;
+	}
+
+	spin_lock(&o2hb_live_lock);
+	reg->hr_task = hb_task;
+	spin_unlock(&o2hb_live_lock);
+
+	ret = wait_event_interruptible(o2hb_steady_queue,
+				atomic_read(&reg->hr_steady_iterations) == 0);
+	if (ret) {
+		atomic_set(&reg->hr_steady_iterations, 0);
+		reg->hr_aborted_start = 1;
+	}
+
+	if (reg->hr_aborted_start) {
+		ret = -EIO;
+		goto out3;
+	}
+
+	/* Ok, we were woken.  Make sure it wasn't by drop_item() */
+	spin_lock(&o2hb_live_lock);
+	hb_task = reg->hr_task;
+	if (o2hb_global_heartbeat_active())
+		set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+	spin_unlock(&o2hb_live_lock);
+
+	if (hb_task)
+		ret = count;
+	else
+		ret = -EIO;
+
+	if (hb_task && o2hb_global_heartbeat_active())
+		printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
+		       config_item_name(&reg->hr_item), reg->hr_dev_name);
+
+out3:
+	iput(inode);
+out2:
+	fdput(f);
+out:
+	if (ret < 0) {
+		if (reg->hr_bdev) {
+			blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
+			reg->hr_bdev = NULL;
+		}
+	}
+	return ret;
+}
+
+static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
+                                      char *page)
+{
+	pid_t pid = 0;
+
+	spin_lock(&o2hb_live_lock);
+	if (reg->hr_task)
+		pid = task_pid_nr(reg->hr_task);
+	spin_unlock(&o2hb_live_lock);
+
+	if (!pid)
+		return 0;
+
+	return sprintf(page, "%u\n", pid);
+}
+
+struct o2hb_region_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2hb_region *, char *);
+	ssize_t (*store)(struct o2hb_region *, const char *, size_t);
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "block_bytes",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_block_bytes_read,
+	.store	= o2hb_region_block_bytes_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_start_block = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "start_block",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_start_block_read,
+	.store	= o2hb_region_start_block_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_blocks = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "blocks",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_blocks_read,
+	.store	= o2hb_region_blocks_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_dev = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "dev",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_dev_read,
+	.store	= o2hb_region_dev_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_pid = {
+       .attr   = { .ca_owner = THIS_MODULE,
+                   .ca_name = "pid",
+                   .ca_mode = S_IRUGO | S_IRUSR },
+       .show   = o2hb_region_pid_read,
+};
+
+static struct configfs_attribute *o2hb_region_attrs[] = {
+	&o2hb_region_attr_block_bytes.attr,
+	&o2hb_region_attr_start_block.attr,
+	&o2hb_region_attr_blocks.attr,
+	&o2hb_region_attr_dev.attr,
+	&o2hb_region_attr_pid.attr,
+	NULL,
+};
+
+static ssize_t o2hb_region_show(struct config_item *item,
+				struct configfs_attribute *attr,
+				char *page)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+	struct o2hb_region_attribute *o2hb_region_attr =
+		container_of(attr, struct o2hb_region_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2hb_region_attr->show)
+		ret = o2hb_region_attr->show(reg, page);
+	return ret;
+}
+
+static ssize_t o2hb_region_store(struct config_item *item,
+				 struct configfs_attribute *attr,
+				 const char *page, size_t count)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+	struct o2hb_region_attribute *o2hb_region_attr =
+		container_of(attr, struct o2hb_region_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (o2hb_region_attr->store)
+		ret = o2hb_region_attr->store(reg, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations o2hb_region_item_ops = {
+	.release		= o2hb_region_release,
+	.show_attribute		= o2hb_region_show,
+	.store_attribute	= o2hb_region_store,
+};
+
+static struct config_item_type o2hb_region_type = {
+	.ct_item_ops	= &o2hb_region_item_ops,
+	.ct_attrs	= o2hb_region_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* heartbeat set */
+
+struct o2hb_heartbeat_group {
+	struct config_group hs_group;
+	/* some stuff? */
+};
+
+static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct o2hb_heartbeat_group, hs_group)
+		: NULL;
+}
+
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+	int ret = -ENOMEM;
+
+	reg->hr_debug_dir =
+		debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+	if (!reg->hr_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_livenodes =
+			o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_livenodes),
+					  sizeof(*(reg->hr_db_livenodes)),
+					  O2HB_DB_TYPE_REGION_LIVENODES,
+					  sizeof(reg->hr_live_node_bitmap),
+					  O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_regnum =
+			o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_regnum),
+					  sizeof(*(reg->hr_db_regnum)),
+					  O2HB_DB_TYPE_REGION_NUMBER,
+					  0, O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_regnum) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_elapsed_time =
+			o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_elapsed_time),
+					  sizeof(*(reg->hr_db_elapsed_time)),
+					  O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+					  0, 0, reg);
+	if (!reg->hr_debug_elapsed_time) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_pinned =
+			o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_pinned),
+					  sizeof(*(reg->hr_db_pinned)),
+					  O2HB_DB_TYPE_REGION_PINNED,
+					  0, 0, reg);
+	if (!reg->hr_debug_pinned) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	return ret;
+}
+
+static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
+							  const char *name)
+{
+	struct o2hb_region *reg = NULL;
+	int ret;
+
+	reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
+	if (reg == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
+		ret = -ENAMETOOLONG;
+		goto free;
+	}
+
+	spin_lock(&o2hb_live_lock);
+	reg->hr_region_num = 0;
+	if (o2hb_global_heartbeat_active()) {
+		reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+							 O2NM_MAX_REGIONS);
+		if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+			spin_unlock(&o2hb_live_lock);
+			ret = -EFBIG;
+			goto free;
+		}
+		set_bit(reg->hr_region_num, o2hb_region_bitmap);
+	}
+	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
+	spin_unlock(&o2hb_live_lock);
+
+	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+	if (ret) {
+		config_item_put(&reg->hr_item);
+		goto free;
+	}
+
+	return &reg->hr_item;
+free:
+	kfree(reg);
+	return ERR_PTR(ret);
+}
+
+static void o2hb_heartbeat_group_drop_item(struct config_group *group,
+					   struct config_item *item)
+{
+	struct task_struct *hb_task;
+	struct o2hb_region *reg = to_o2hb_region(item);
+	int quorum_region = 0;
+
+	/* stop the thread when the user removes the region dir */
+	spin_lock(&o2hb_live_lock);
+	hb_task = reg->hr_task;
+	reg->hr_task = NULL;
+	reg->hr_item_dropped = 1;
+	spin_unlock(&o2hb_live_lock);
+
+	if (hb_task)
+		kthread_stop(hb_task);
+
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock(&o2hb_live_lock);
+		clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+		clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+			quorum_region = 1;
+		clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+		spin_unlock(&o2hb_live_lock);
+		printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+		       ((atomic_read(&reg->hr_steady_iterations) == 0) ?
+			"stopped" : "start aborted"), config_item_name(item),
+		       reg->hr_dev_name);
+	}
+
+	/*
+	 * If we're racing a dev_write(), we need to wake them.  They will
+	 * check reg->hr_task
+	 */
+	if (atomic_read(&reg->hr_steady_iterations) != 0) {
+		reg->hr_aborted_start = 1;
+		atomic_set(&reg->hr_steady_iterations, 0);
+		wake_up(&o2hb_steady_queue);
+	}
+
+	config_item_put(item);
+
+	if (!o2hb_global_heartbeat_active() || !quorum_region)
+		return;
+
+	/*
+	 * If global heartbeat active and there are dependent users,
+	 * pin all regions if quorum region count <= CUT_OFF
+	 */
+	spin_lock(&o2hb_live_lock);
+
+	if (!o2hb_dependent_users)
+		goto unlock;
+
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
+			   O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+		o2hb_region_pin(NULL);
+
+unlock:
+	spin_unlock(&o2hb_live_lock);
+}
+
+struct o2hb_heartbeat_group_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
+	ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
+};
+
+static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
+					 struct configfs_attribute *attr,
+					 char *page)
+{
+	struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
+	struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
+		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2hb_heartbeat_group_attr->show)
+		ret = o2hb_heartbeat_group_attr->show(reg, page);
+	return ret;
+}
+
+static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
+					  struct configfs_attribute *attr,
+					  const char *page, size_t count)
+{
+	struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
+	struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
+		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (o2hb_heartbeat_group_attr->store)
+		ret = o2hb_heartbeat_group_attr->store(reg, page, count);
+	return ret;
+}
+
+static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
+						     char *page)
+{
+	return sprintf(page, "%u\n", o2hb_dead_threshold);
+}
+
+static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
+						    const char *page,
+						    size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+                return -EINVAL;
+
+	/* this will validate ranges for us. */
+	o2hb_dead_threshold_set((unsigned int) tmp);
+
+	return count;
+}
+
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+				       char *page)
+{
+	return sprintf(page, "%s\n",
+		       o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
+}
+
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+					const char *page, size_t count)
+{
+	unsigned int i;
+	int ret;
+	size_t len;
+
+	len = (page[count - 1] == '\n') ? count - 1 : count;
+	if (!len)
+		return -EINVAL;
+
+	for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
+		if (strncasecmp(page, o2hb_heartbeat_mode_desc[i], len))
+			continue;
+
+		ret = o2hb_global_heartbeat_mode_set(i);
+		if (!ret)
+			printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
+			       o2hb_heartbeat_mode_desc[i]);
+		return count;
+	}
+
+	return -EINVAL;
+
+}
+
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "dead_threshold",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_heartbeat_group_threshold_show,
+	.store	= o2hb_heartbeat_group_threshold_store,
+};
+
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+	.attr   = { .ca_owner = THIS_MODULE,
+		.ca_name = "mode",
+		.ca_mode = S_IRUGO | S_IWUSR },
+	.show   = o2hb_heartbeat_group_mode_show,
+	.store  = o2hb_heartbeat_group_mode_store,
+};
+
+static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
+	&o2hb_heartbeat_group_attr_threshold.attr,
+	&o2hb_heartbeat_group_attr_mode.attr,
+	NULL,
+};
+
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
+	.show_attribute		= o2hb_heartbeat_group_show,
+	.store_attribute	= o2hb_heartbeat_group_store,
+};
+
+static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
+	.make_item	= o2hb_heartbeat_group_make_item,
+	.drop_item	= o2hb_heartbeat_group_drop_item,
+};
+
+static struct config_item_type o2hb_heartbeat_group_type = {
+	.ct_group_ops	= &o2hb_heartbeat_group_group_ops,
+	.ct_item_ops	= &o2hb_heartbeat_group_item_ops,
+	.ct_attrs	= o2hb_heartbeat_group_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* this is just here to avoid touching group in heartbeat.h which the
+ * entire damn world #includes */
+struct config_group *o2hb_alloc_hb_set(void)
+{
+	struct o2hb_heartbeat_group *hs = NULL;
+	struct config_group *ret = NULL;
+
+	hs = kzalloc(sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
+	if (hs == NULL)
+		goto out;
+
+	config_group_init_type_name(&hs->hs_group, "heartbeat",
+				    &o2hb_heartbeat_group_type);
+
+	ret = &hs->hs_group;
+out:
+	if (ret == NULL)
+		kfree(hs);
+	return ret;
+}
+
+void o2hb_free_hb_set(struct config_group *group)
+{
+	struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group);
+	kfree(hs);
+}
+
+/* hb callback registration and issuing */
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
+{
+	if (type == O2HB_NUM_CB)
+		return ERR_PTR(-EINVAL);
+
+	return &o2hb_callbacks[type];
+}
+
+void o2hb_setup_callback(struct o2hb_callback_func *hc,
+			 enum o2hb_callback_type type,
+			 o2hb_cb_func *func,
+			 void *data,
+			 int priority)
+{
+	INIT_LIST_HEAD(&hc->hc_item);
+	hc->hc_func = func;
+	hc->hc_data = data;
+	hc->hc_priority = priority;
+	hc->hc_type = type;
+	hc->hc_magic = O2HB_CB_MAGIC;
+}
+EXPORT_SYMBOL_GPL(o2hb_setup_callback);
+
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only pin the matching region. In global we pin all the active
+ * regions.
+ */
+static int o2hb_region_pin(const char *region_uuid)
+{
+	int ret = 0, found = 0;
+	struct o2hb_region *reg;
+	char *uuid;
+
+	assert_spin_locked(&o2hb_live_lock);
+
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
+		uuid = config_item_name(&reg->hr_item);
+
+		/* local heartbeat */
+		if (region_uuid) {
+			if (strcmp(region_uuid, uuid))
+				continue;
+			found = 1;
+		}
+
+		if (reg->hr_item_pinned || reg->hr_item_dropped)
+			goto skip_pin;
+
+		/* Ignore ENOENT only for local hb (userdlm domain) */
+		ret = o2nm_depend_item(&reg->hr_item);
+		if (!ret) {
+			mlog(ML_CLUSTER, "Pin region %s\n", uuid);
+			reg->hr_item_pinned = 1;
+		} else {
+			if (ret == -ENOENT && found)
+				ret = 0;
+			else {
+				mlog(ML_ERROR, "Pin region %s fails with %d\n",
+				     uuid, ret);
+				break;
+			}
+		}
+skip_pin:
+		if (found)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only unpin the matching region. In global we unpin all the
+ * active regions.
+ */
+static void o2hb_region_unpin(const char *region_uuid)
+{
+	struct o2hb_region *reg;
+	char *uuid;
+	int found = 0;
+
+	assert_spin_locked(&o2hb_live_lock);
+
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
+		uuid = config_item_name(&reg->hr_item);
+		if (region_uuid) {
+			if (strcmp(region_uuid, uuid))
+				continue;
+			found = 1;
+		}
+
+		if (reg->hr_item_pinned) {
+			mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
+			o2nm_undepend_item(&reg->hr_item);
+			reg->hr_item_pinned = 0;
+		}
+		if (found)
+			break;
+	}
+}
+
+static int o2hb_region_inc_user(const char *region_uuid)
+{
+	int ret = 0;
+
+	spin_lock(&o2hb_live_lock);
+
+	/* local heartbeat */
+	if (!o2hb_global_heartbeat_active()) {
+	    ret = o2hb_region_pin(region_uuid);
+	    goto unlock;
+	}
+
+	/*
+	 * if global heartbeat active and this is the first dependent user,
+	 * pin all regions if quorum region count <= CUT_OFF
+	 */
+	o2hb_dependent_users++;
+	if (o2hb_dependent_users > 1)
+		goto unlock;
+
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
+			   O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+		ret = o2hb_region_pin(NULL);
+
+unlock:
+	spin_unlock(&o2hb_live_lock);
+	return ret;
+}
+
+void o2hb_region_dec_user(const char *region_uuid)
+{
+	spin_lock(&o2hb_live_lock);
+
+	/* local heartbeat */
+	if (!o2hb_global_heartbeat_active()) {
+	    o2hb_region_unpin(region_uuid);
+	    goto unlock;
+	}
+
+	/*
+	 * if global heartbeat active and there are no dependent users,
+	 * unpin all quorum regions
+	 */
+	o2hb_dependent_users--;
+	if (!o2hb_dependent_users)
+		o2hb_region_unpin(NULL);
+
+unlock:
+	spin_unlock(&o2hb_live_lock);
+}
+
+int o2hb_register_callback(const char *region_uuid,
+			   struct o2hb_callback_func *hc)
+{
+	struct o2hb_callback_func *f;
+	struct o2hb_callback *hbcall;
+	int ret;
+
+	BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
+	BUG_ON(!list_empty(&hc->hc_item));
+
+	hbcall = hbcall_from_type(hc->hc_type);
+	if (IS_ERR(hbcall)) {
+		ret = PTR_ERR(hbcall);
+		goto out;
+	}
+
+	if (region_uuid) {
+		ret = o2hb_region_inc_user(region_uuid);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	down_write(&o2hb_callback_sem);
+
+	list_for_each_entry(f, &hbcall->list, hc_item) {
+		if (hc->hc_priority < f->hc_priority) {
+			list_add_tail(&hc->hc_item, &f->hc_item);
+			break;
+		}
+	}
+	if (list_empty(&hc->hc_item))
+		list_add_tail(&hc->hc_item, &hbcall->list);
+
+	up_write(&o2hb_callback_sem);
+	ret = 0;
+out:
+	mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
+	     ret, __builtin_return_address(0), hc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2hb_register_callback);
+
+void o2hb_unregister_callback(const char *region_uuid,
+			      struct o2hb_callback_func *hc)
+{
+	BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
+
+	mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
+	     __builtin_return_address(0), hc);
+
+	/* XXX Can this happen _with_ a region reference? */
+	if (list_empty(&hc->hc_item))
+		return;
+
+	if (region_uuid)
+		o2hb_region_dec_user(region_uuid);
+
+	down_write(&o2hb_callback_sem);
+
+	list_del_init(&hc->hc_item);
+
+	up_write(&o2hb_callback_sem);
+}
+EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
+
+int o2hb_check_node_heartbeating(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	o2hb_fill_node_map(testing_map, sizeof(testing_map));
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
+
+int o2hb_check_node_heartbeating_no_sem(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long flags;
+
+	spin_lock_irqsave(&o2hb_live_lock, flags);
+	o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+	spin_unlock_irqrestore(&o2hb_live_lock, flags);
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
+
+int o2hb_check_node_heartbeating_from_callback(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
+
+/* Makes sure our local node is configured with a node number, and is
+ * heartbeating. */
+int o2hb_check_local_node_heartbeating(void)
+{
+	u8 node_num;
+
+	/* if this node was set then we have networking */
+	node_num = o2nm_this_node();
+	if (node_num == O2NM_MAX_NODES) {
+		mlog(ML_HEARTBEAT, "this node has not been configured.\n");
+		return 0;
+	}
+
+	return o2hb_check_node_heartbeating(node_num);
+}
+EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
+
+/*
+ * this is just a hack until we get the plumbing which flips file systems
+ * read only and drops the hb ref instead of killing the node dead.
+ */
+void o2hb_stop_all_regions(void)
+{
+	struct o2hb_region *reg;
+
+	mlog(ML_ERROR, "stopping heartbeat on all active regions.\n");
+
+	spin_lock(&o2hb_live_lock);
+
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item)
+		reg->hr_unclean_stop = 1;
+
+	spin_unlock(&o2hb_live_lock);
+}
+EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+	struct o2hb_region *reg;
+	int numregs = 0;
+	char *p;
+
+	spin_lock(&o2hb_live_lock);
+
+	p = region_uuids;
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
+		mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+		if (numregs < max_regions) {
+			memcpy(p, config_item_name(&reg->hr_item),
+			       O2HB_MAX_REGION_NAME_LEN);
+			p += O2HB_MAX_REGION_NAME_LEN;
+		}
+		numregs++;
+	}
+
+	spin_unlock(&o2hb_live_lock);
+
+	return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+int o2hb_global_heartbeat_active(void)
+{
+	return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/kernel/fs/ocfs2/cluster/heartbeat.h b/kernel/fs/ocfs2/cluster/heartbeat.h
new file mode 100644
index 000000000..3ef5137dc
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/heartbeat.h
@@ -0,0 +1,90 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_HEARTBEAT_H
+#define O2CLUSTER_HEARTBEAT_H
+
+#include "ocfs2_heartbeat.h"
+
+#define O2HB_REGION_TIMEOUT_MS		2000
+
+#define O2HB_MAX_REGION_NAME_LEN	32
+
+/* number of changes to be seen as live */
+#define O2HB_LIVE_THRESHOLD	   2
+/* number of equal samples to be seen as dead */
+extern unsigned int o2hb_dead_threshold;
+#define O2HB_DEFAULT_DEAD_THRESHOLD	   31
+/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
+#define O2HB_MIN_DEAD_THRESHOLD	  2
+#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
+
+#define O2HB_CB_MAGIC		0x51d1e4ec
+
+/* callback stuff */
+enum o2hb_callback_type {
+	O2HB_NODE_DOWN_CB = 0,
+	O2HB_NODE_UP_CB,
+	O2HB_NUM_CB
+};
+
+struct o2nm_node;
+typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
+
+struct o2hb_callback_func {
+	u32			hc_magic;
+	struct list_head	hc_item;
+	o2hb_cb_func		*hc_func;
+	void			*hc_data;
+	int			hc_priority;
+	enum o2hb_callback_type hc_type;
+};
+
+struct config_group *o2hb_alloc_hb_set(void);
+void o2hb_free_hb_set(struct config_group *group);
+
+void o2hb_setup_callback(struct o2hb_callback_func *hc,
+			 enum o2hb_callback_type type,
+			 o2hb_cb_func *func,
+			 void *data,
+			 int priority);
+int o2hb_register_callback(const char *region_uuid,
+			   struct o2hb_callback_func *hc);
+void o2hb_unregister_callback(const char *region_uuid,
+			      struct o2hb_callback_func *hc);
+void o2hb_fill_node_map(unsigned long *map,
+			unsigned bytes);
+void o2hb_exit(void);
+int o2hb_init(void);
+int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num);
+int o2hb_check_node_heartbeating_from_callback(u8 node_num);
+int o2hb_check_local_node_heartbeating(void);
+void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
+
+#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/kernel/fs/ocfs2/cluster/masklog.c b/kernel/fs/ocfs2/cluster/masklog.c
new file mode 100644
index 000000000..af7598bff
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/masklog.c
@@ -0,0 +1,155 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <asm/uaccess.h>
+
+#include "masklog.h"
+
+struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
+EXPORT_SYMBOL_GPL(mlog_and_bits);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
+EXPORT_SYMBOL_GPL(mlog_not_bits);
+
+static ssize_t mlog_mask_show(u64 mask, char *buf)
+{
+	char *state;
+
+	if (__mlog_test_u64(mask, mlog_and_bits))
+		state = "allow";
+	else if (__mlog_test_u64(mask, mlog_not_bits))
+		state = "deny";
+	else
+		state = "off";
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", state);
+}
+
+static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
+{
+	if (!strncasecmp(buf, "allow", 5)) {
+		__mlog_set_u64(mask, mlog_and_bits);
+		__mlog_clear_u64(mask, mlog_not_bits);
+	} else if (!strncasecmp(buf, "deny", 4)) {
+		__mlog_set_u64(mask, mlog_not_bits);
+		__mlog_clear_u64(mask, mlog_and_bits);
+	} else if (!strncasecmp(buf, "off", 3)) {
+		__mlog_clear_u64(mask, mlog_not_bits);
+		__mlog_clear_u64(mask, mlog_and_bits);
+	} else
+		return -EINVAL;
+
+	return count;
+}
+
+struct mlog_attribute {
+	struct attribute attr;
+	u64 mask;
+};
+
+#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
+
+#define define_mask(_name) {			\
+	.attr = {				\
+		.name = #_name,			\
+		.mode = S_IRUGO | S_IWUSR,	\
+	},					\
+	.mask = ML_##_name,			\
+}
+
+static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
+	define_mask(TCP),
+	define_mask(MSG),
+	define_mask(SOCKET),
+	define_mask(HEARTBEAT),
+	define_mask(HB_BIO),
+	define_mask(DLMFS),
+	define_mask(DLM),
+	define_mask(DLM_DOMAIN),
+	define_mask(DLM_THREAD),
+	define_mask(DLM_MASTER),
+	define_mask(DLM_RECOVERY),
+	define_mask(DLM_GLUE),
+	define_mask(VOTE),
+	define_mask(CONN),
+	define_mask(QUORUM),
+	define_mask(BASTS),
+	define_mask(CLUSTER),
+	define_mask(ERROR),
+	define_mask(NOTICE),
+	define_mask(KTHREAD),
+};
+
+static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
+
+static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
+			 char *buf)
+{
+	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
+
+	return mlog_mask_show(mlog_attr->mask, buf);
+}
+
+static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
+
+	return mlog_mask_store(mlog_attr->mask, buf, count);
+}
+
+static const struct sysfs_ops mlog_attr_ops = {
+	.show  = mlog_show,
+	.store = mlog_store,
+};
+
+static struct kobj_type mlog_ktype = {
+	.default_attrs = mlog_attr_ptrs,
+	.sysfs_ops     = &mlog_attr_ops,
+};
+
+static struct kset mlog_kset = {
+	.kobj   = {.ktype = &mlog_ktype},
+};
+
+int mlog_sys_init(struct kset *o2cb_kset)
+{
+	int i = 0;
+
+	while (mlog_attrs[i].attr.mode) {
+		mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
+		i++;
+	}
+	mlog_attr_ptrs[i] = NULL;
+
+	kobject_set_name(&mlog_kset.kobj, "logmask");
+	mlog_kset.kobj.kset = o2cb_kset;
+	return kset_register(&mlog_kset);
+}
+
+void mlog_sys_shutdown(void)
+{
+	kset_unregister(&mlog_kset);
+}
diff --git a/kernel/fs/ocfs2/cluster/masklog.h b/kernel/fs/ocfs2/cluster/masklog.h
new file mode 100644
index 000000000..7fdc25a4d
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/masklog.h
@@ -0,0 +1,221 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_MASKLOG_H
+#define O2CLUSTER_MASKLOG_H
+
+/*
+ * For now this is a trivial wrapper around printk() that gives the critical
+ * ability to enable sets of debugging output at run-time.  In the future this
+ * will almost certainly be redirected to relayfs so that it can pay a
+ * substantially lower heisenberg tax.
+ *
+ * Callers associate the message with a bitmask and a global bitmask is
+ * maintained with help from /proc.  If any of the bits match the message is
+ * output.
+ *
+ * We must have efficient bit tests on i386 and it seems gcc still emits crazy
+ * code for the 64bit compare.  It emits very good code for the dual unsigned
+ * long tests, though, completely avoiding tests that can never pass if the
+ * caller gives a constant bitmask that fills one of the longs with all 0s.  So
+ * the desire is to have almost all of the calls decided on by comparing just
+ * one of the longs.  This leads to having infrequently given bits that are
+ * frequently matched in the high bits.
+ *
+ * _ERROR and _NOTICE are used for messages that always go to the console and
+ * have appropriate KERN_ prefixes.  We wrap these in our function instead of
+ * just calling printk() so that this can eventually make its way through
+ * relayfs along with the debugging messages.  Everything else gets KERN_DEBUG.
+ * The inline tests and macro dance give GCC the opportunity to quite cleverly
+ * only emit the appropriage printk() when the caller passes in a constant
+ * mask, as is almost always the case.
+ *
+ * All this bitmask nonsense is managed from the files under
+ * /sys/fs/o2cb/logmask/.  Reading the files gives a straightforward
+ * indication of which bits are allowed (allow) or denied (off/deny).
+ * 	ENTRY deny
+ * 	EXIT deny
+ * 	TCP off
+ * 	MSG off
+ * 	SOCKET off
+ * 	ERROR allow
+ * 	NOTICE allow
+ *
+ * Writing changes the state of a given bit and requires a strictly formatted
+ * single write() call:
+ *
+ * 	write(fd, "allow", 5);
+ *
+ * Echoing allow/deny/off string into the logmask files can flip the bits
+ * on or off as expected; here is the bash script for example:
+ *
+ * log_mask="/sys/fs/o2cb/log_mask"
+ * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
+ *	echo allow >"$log_mask"/"$node"
+ * done
+ *
+ * The debugfs.ocfs2 tool can also flip the bits with the -l option:
+ *
+ * debugfs.ocfs2 -l TCP allow
+ */
+
+/* for task_struct */
+#include <linux/sched.h>
+
+/* bits that are frequently given and infrequently matched in the low word */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
+#define ML_TCP		0x0000000000000001ULL /* net cluster/tcp.c */
+#define ML_MSG		0x0000000000000002ULL /* net network messages */
+#define ML_SOCKET	0x0000000000000004ULL /* net socket lifetime */
+#define ML_HEARTBEAT	0x0000000000000008ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO	0x0000000000000010ULL /* hb io tracing */
+#define ML_DLMFS	0x0000000000000020ULL /* dlm user dlmfs */
+#define ML_DLM		0x0000000000000040ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN	0x0000000000000080ULL /* dlm domain debugging */
+#define ML_DLM_THREAD	0x0000000000000100ULL /* dlm domain thread */
+#define ML_DLM_MASTER	0x0000000000000200ULL /* dlm master functions */
+#define ML_DLM_RECOVERY	0x0000000000000400ULL /* dlm master functions */
+#define ML_DLM_GLUE	0x0000000000000800ULL /* ocfs2 dlm glue layer */
+#define ML_VOTE		0x0000000000001000ULL /* ocfs2 node messaging  */
+#define ML_CONN		0x0000000000002000ULL /* net connection management */
+#define ML_QUORUM	0x0000000000004000ULL /* net connection quorum */
+#define ML_BASTS	0x0000000000008000ULL /* dlmglue asts and basts */
+#define ML_CLUSTER	0x0000000000010000ULL /* cluster stack */
+
+/* bits that are infrequently given and frequently matched in the high word */
+#define ML_ERROR	0x1000000000000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE	0x2000000000000000ULL /* setn to KERN_NOTICE */
+#define ML_KTHREAD	0x4000000000000000ULL /* kernel thread activity */
+
+#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
+#ifndef MLOG_MASK_PREFIX
+#define MLOG_MASK_PREFIX 0
+#endif
+
+/*
+ * When logging is disabled, force the bit test to 0 for anything other
+ * than errors and notices, allowing gcc to remove the code completely.
+ * When enabled, allow all masks.
+ */
+#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
+#define ML_ALLOWED_BITS ~0
+#else
+#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE)
+#endif
+
+#define MLOG_MAX_BITS 64
+
+struct mlog_bits {
+	unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
+};
+
+extern struct mlog_bits mlog_and_bits, mlog_not_bits;
+
+#if BITS_PER_LONG == 32
+
+#define __mlog_test_u64(mask, bits)			\
+	( (u32)(mask & 0xffffffff) & bits.words[0] || 	\
+	  ((u64)(mask) >> 32) & bits.words[1] )
+#define __mlog_set_u64(mask, bits) do {			\
+	bits.words[0] |= (u32)(mask & 0xffffffff);	\
+       	bits.words[1] |= (u64)(mask) >> 32;		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {		\
+	bits.words[0] &= ~((u32)(mask & 0xffffffff));	\
+       	bits.words[1] &= ~((u64)(mask) >> 32);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) {				\
+	{						\
+		[0] = (u32)(mask & 0xffffffff),		\
+		[1] = (u64)(mask) >> 32,		\
+	}						\
+}
+
+#else /* 32bit long above, 64bit long below */
+
+#define __mlog_test_u64(mask, bits)	((mask) & bits.words[0])
+#define __mlog_set_u64(mask, bits) do {		\
+	bits.words[0] |= (mask);		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {	\
+	bits.words[0] &= ~(mask);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) { { (mask) } }
+
+#endif
+
+/*
+ * smp_processor_id() "helpfully" screams when called outside preemptible
+ * regions in current kernels.  sles doesn't have the variants that don't
+ * scream.  just do this instead of trying to guess which we're building
+ * against.. *sigh*.
+ */
+#define __mlog_cpu_guess ({		\
+	unsigned long _cpu = get_cpu();	\
+	put_cpu();			\
+	_cpu;				\
+})
+
+/* In the following two macros, the whitespace after the ',' just
+ * before ##args is intentional. Otherwise, gcc 2.95 will eat the
+ * previous token if args expands to nothing.
+ */
+#define __mlog_printk(level, fmt, args...)				\
+	printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm,		\
+	       task_pid_nr(current), __mlog_cpu_guess,			\
+	       __PRETTY_FUNCTION__, __LINE__ , ##args)
+
+#define mlog(mask, fmt, args...) do {					\
+	u64 __m = MLOG_MASK_PREFIX | (mask);				\
+	if ((__m & ML_ALLOWED_BITS) &&					\
+	    __mlog_test_u64(__m, mlog_and_bits) &&			\
+	    !__mlog_test_u64(__m, mlog_not_bits)) {			\
+		if (__m & ML_ERROR)					\
+			__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args);	\
+		else if (__m & ML_NOTICE)				\
+			__mlog_printk(KERN_NOTICE, fmt , ##args);	\
+		else __mlog_printk(KERN_INFO, fmt , ##args);		\
+	}								\
+} while (0)
+
+#define mlog_errno(st) ({						\
+	int _st = (st);							\
+	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
+	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC &&		\
+	    _st != -EDQUOT)						\
+		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
+	_st;								\
+})
+
+#define mlog_bug_on_msg(cond, fmt, args...) do {			\
+	if (cond) {							\
+		mlog(ML_ERROR, "bug expression: " #cond "\n");		\
+		mlog(ML_ERROR, fmt, ##args);				\
+		BUG();							\
+	}								\
+} while (0)
+
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+int mlog_sys_init(struct kset *o2cb_subsys);
+void mlog_sys_shutdown(void);
+
+#endif /* O2CLUSTER_MASKLOG_H */
diff --git a/kernel/fs/ocfs2/cluster/netdebug.c b/kernel/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 000000000..27d1242c8
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,539 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * netdebug.c
+ *
+ * debug functionality for o2net
+ *
+ * Copyright (C) 2005, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+
+#include <linux/uaccess.h>
+
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+
+#include "tcp_internal.h"
+
+#define O2NET_DEBUG_DIR		"o2net"
+#define SC_DEBUG_NAME		"sock_containers"
+#define NST_DEBUG_NAME		"send_tracking"
+#define STATS_DEBUG_NAME	"stats"
+#define NODES_DEBUG_NAME	"connected_nodes"
+
+#define SHOW_SOCK_CONTAINERS	0
+#define SHOW_SOCK_STATS		1
+
+static struct dentry *o2net_dentry;
+static struct dentry *sc_dentry;
+static struct dentry *nst_dentry;
+static struct dentry *stats_dentry;
+static struct dentry *nodes_dentry;
+
+static DEFINE_SPINLOCK(o2net_debug_lock);
+
+static LIST_HEAD(sock_containers);
+static LIST_HEAD(send_tracking);
+
+void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+	spin_lock(&o2net_debug_lock);
+	list_add(&nst->st_net_debug_item, &send_tracking);
+	spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+	spin_lock(&o2net_debug_lock);
+	if (!list_empty(&nst->st_net_debug_item))
+		list_del_init(&nst->st_net_debug_item);
+	spin_unlock(&o2net_debug_lock);
+}
+
+static struct o2net_send_tracking
+			*next_nst(struct o2net_send_tracking *nst_start)
+{
+	struct o2net_send_tracking *nst, *ret = NULL;
+
+	assert_spin_locked(&o2net_debug_lock);
+
+	list_for_each_entry(nst, &nst_start->st_net_debug_item,
+			    st_net_debug_item) {
+		/* discover the head of the list */
+		if (&nst->st_net_debug_item == &send_tracking)
+			break;
+
+		/* use st_task to detect real nsts in the list */
+		if (nst->st_task != NULL) {
+			ret = nst;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+	spin_lock(&o2net_debug_lock);
+	nst = next_nst(dummy_nst);
+	spin_unlock(&o2net_debug_lock);
+
+	return nst;
+}
+
+static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+	spin_lock(&o2net_debug_lock);
+	nst = next_nst(dummy_nst);
+	list_del_init(&dummy_nst->st_net_debug_item);
+	if (nst)
+		list_add(&dummy_nst->st_net_debug_item,
+			 &nst->st_net_debug_item);
+	spin_unlock(&o2net_debug_lock);
+
+	return nst; /* unused, just needs to be null when done */
+}
+
+static int nst_seq_show(struct seq_file *seq, void *v)
+{
+	struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+	ktime_t now;
+	s64 sock, send, status;
+
+	spin_lock(&o2net_debug_lock);
+	nst = next_nst(dummy_nst);
+	if (!nst)
+		goto out;
+
+	now = ktime_get();
+	sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
+	send = ktime_to_us(ktime_sub(now, nst->st_send_time));
+	status = ktime_to_us(ktime_sub(now, nst->st_status_time));
+
+	/* get_task_comm isn't exported.  oh well. */
+	seq_printf(seq, "%p:\n"
+		   "  pid:          %lu\n"
+		   "  tgid:         %lu\n"
+		   "  process name: %s\n"
+		   "  node:         %u\n"
+		   "  sc:           %p\n"
+		   "  message id:   %d\n"
+		   "  message type: %u\n"
+		   "  message key:  0x%08x\n"
+		   "  sock acquiry: %lld usecs ago\n"
+		   "  send start:   %lld usecs ago\n"
+		   "  wait start:   %lld usecs ago\n",
+		   nst, (unsigned long)task_pid_nr(nst->st_task),
+		   (unsigned long)nst->st_task->tgid,
+		   nst->st_task->comm, nst->st_node,
+		   nst->st_sc, nst->st_id, nst->st_msg_type,
+		   nst->st_msg_key,
+		   (long long)sock,
+		   (long long)send,
+		   (long long)status);
+
+out:
+	spin_unlock(&o2net_debug_lock);
+
+	return 0;
+}
+
+static void nst_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations nst_seq_ops = {
+	.start = nst_seq_start,
+	.next = nst_seq_next,
+	.stop = nst_seq_stop,
+	.show = nst_seq_show,
+};
+
+static int nst_fop_open(struct inode *inode, struct file *file)
+{
+	struct o2net_send_tracking *dummy_nst;
+
+	dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
+	if (!dummy_nst)
+		return -ENOMEM;
+	o2net_debug_add_nst(dummy_nst);
+
+	return 0;
+}
+
+static int nst_fop_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct o2net_send_tracking *dummy_nst = seq->private;
+
+	o2net_debug_del_nst(dummy_nst);
+	return seq_release_private(inode, file);
+}
+
+static const struct file_operations nst_seq_fops = {
+	.open = nst_fop_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = nst_fop_release,
+};
+
+void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+	spin_lock(&o2net_debug_lock);
+	list_add(&sc->sc_net_debug_item, &sock_containers);
+	spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+	spin_lock(&o2net_debug_lock);
+	list_del_init(&sc->sc_net_debug_item);
+	spin_unlock(&o2net_debug_lock);
+}
+
+struct o2net_sock_debug {
+	int dbg_ctxt;
+	struct o2net_sock_container *dbg_sock;
+};
+
+static struct o2net_sock_container
+			*next_sc(struct o2net_sock_container *sc_start)
+{
+	struct o2net_sock_container *sc, *ret = NULL;
+
+	assert_spin_locked(&o2net_debug_lock);
+
+	list_for_each_entry(sc, &sc_start->sc_net_debug_item,
+			    sc_net_debug_item) {
+		/* discover the head of the list miscast as a sc */
+		if (&sc->sc_net_debug_item == &sock_containers)
+			break;
+
+		/* use sc_page to detect real scs in the list */
+		if (sc->sc_page != NULL) {
+			ret = sc;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct o2net_sock_debug *sd = seq->private;
+	struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
+
+	spin_lock(&o2net_debug_lock);
+	sc = next_sc(dummy_sc);
+	spin_unlock(&o2net_debug_lock);
+
+	return sc;
+}
+
+static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct o2net_sock_debug *sd = seq->private;
+	struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
+
+	spin_lock(&o2net_debug_lock);
+	sc = next_sc(dummy_sc);
+	list_del_init(&dummy_sc->sc_net_debug_item);
+	if (sc)
+		list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
+	spin_unlock(&o2net_debug_lock);
+
+	return sc; /* unused, just needs to be null when done */
+}
+
+#ifdef CONFIG_OCFS2_FS_STATS
+# define sc_send_count(_s)		((_s)->sc_send_count)
+# define sc_recv_count(_s)		((_s)->sc_recv_count)
+# define sc_tv_acquiry_total_ns(_s)	(ktime_to_ns((_s)->sc_tv_acquiry_total))
+# define sc_tv_send_total_ns(_s)	(ktime_to_ns((_s)->sc_tv_send_total))
+# define sc_tv_status_total_ns(_s)	(ktime_to_ns((_s)->sc_tv_status_total))
+# define sc_tv_process_total_ns(_s)	(ktime_to_ns((_s)->sc_tv_process_total))
+#else
+# define sc_send_count(_s)		(0U)
+# define sc_recv_count(_s)		(0U)
+# define sc_tv_acquiry_total_ns(_s)	(0LL)
+# define sc_tv_send_total_ns(_s)	(0LL)
+# define sc_tv_status_total_ns(_s)	(0LL)
+# define sc_tv_process_total_ns(_s)	(0LL)
+#endif
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define O2NET_STATS_STR_VERSION		1
+static void sc_show_sock_stats(struct seq_file *seq,
+			       struct o2net_sock_container *sc)
+{
+	if (!sc)
+		return;
+
+	seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
+		   sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
+		   (long long)sc_tv_acquiry_total_ns(sc),
+		   (long long)sc_tv_send_total_ns(sc),
+		   (long long)sc_tv_status_total_ns(sc),
+		   (unsigned long)sc_recv_count(sc),
+		   (long long)sc_tv_process_total_ns(sc));
+}
+
+static void sc_show_sock_container(struct seq_file *seq,
+				   struct o2net_sock_container *sc)
+{
+	struct inet_sock *inet = NULL;
+	__be32 saddr = 0, daddr = 0;
+	__be16 sport = 0, dport = 0;
+
+	if (!sc)
+		return;
+
+	if (sc->sc_sock) {
+		inet = inet_sk(sc->sc_sock->sk);
+		/* the stack's structs aren't sparse endian clean */
+		saddr = (__force __be32)inet->inet_saddr;
+		daddr = (__force __be32)inet->inet_daddr;
+		sport = (__force __be16)inet->inet_sport;
+		dport = (__force __be16)inet->inet_dport;
+	}
+
+	/* XXX sigh, inet-> doesn't have sparse annotation so any
+	 * use of it here generates a warning with -Wbitwise */
+	seq_printf(seq, "%p:\n"
+		   "  krefs:           %d\n"
+		   "  sock:            %pI4:%u -> "
+				      "%pI4:%u\n"
+		   "  remote node:     %s\n"
+		   "  page off:        %zu\n"
+		   "  handshake ok:    %u\n"
+		   "  timer:           %lld usecs\n"
+		   "  data ready:      %lld usecs\n"
+		   "  advance start:   %lld usecs\n"
+		   "  advance stop:    %lld usecs\n"
+		   "  func start:      %lld usecs\n"
+		   "  func stop:       %lld usecs\n"
+		   "  func key:        0x%08x\n"
+		   "  func type:       %u\n",
+		   sc,
+		   atomic_read(&sc->sc_kref.refcount),
+		   &saddr, inet ? ntohs(sport) : 0,
+		   &daddr, inet ? ntohs(dport) : 0,
+		   sc->sc_node->nd_name,
+		   sc->sc_page_off,
+		   sc->sc_handshake_ok,
+		   (long long)ktime_to_us(sc->sc_tv_timer),
+		   (long long)ktime_to_us(sc->sc_tv_data_ready),
+		   (long long)ktime_to_us(sc->sc_tv_advance_start),
+		   (long long)ktime_to_us(sc->sc_tv_advance_stop),
+		   (long long)ktime_to_us(sc->sc_tv_func_start),
+		   (long long)ktime_to_us(sc->sc_tv_func_stop),
+		   sc->sc_msg_key,
+		   sc->sc_msg_type);
+}
+
+static int sc_seq_show(struct seq_file *seq, void *v)
+{
+	struct o2net_sock_debug *sd = seq->private;
+	struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
+
+	spin_lock(&o2net_debug_lock);
+	sc = next_sc(dummy_sc);
+
+	if (sc) {
+		if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
+			sc_show_sock_container(seq, sc);
+		else
+			sc_show_sock_stats(seq, sc);
+	}
+
+	spin_unlock(&o2net_debug_lock);
+
+	return 0;
+}
+
+static void sc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations sc_seq_ops = {
+	.start = sc_seq_start,
+	.next = sc_seq_next,
+	.stop = sc_seq_stop,
+	.show = sc_seq_show,
+};
+
+static int sc_common_open(struct file *file, int ctxt)
+{
+	struct o2net_sock_debug *sd;
+	struct o2net_sock_container *dummy_sc;
+
+	dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
+	if (!dummy_sc)
+		return -ENOMEM;
+
+	sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
+	if (!sd) {
+		kfree(dummy_sc);
+		return -ENOMEM;
+	}
+
+	sd->dbg_ctxt = ctxt;
+	sd->dbg_sock = dummy_sc;
+
+	o2net_debug_add_sc(dummy_sc);
+
+	return 0;
+}
+
+static int sc_fop_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct o2net_sock_debug *sd = seq->private;
+	struct o2net_sock_container *dummy_sc = sd->dbg_sock;
+
+	o2net_debug_del_sc(dummy_sc);
+	return seq_release_private(inode, file);
+}
+
+static int stats_fop_open(struct inode *inode, struct file *file)
+{
+	return sc_common_open(file, SHOW_SOCK_STATS);
+}
+
+static const struct file_operations stats_seq_fops = {
+	.open = stats_fop_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = sc_fop_release,
+};
+
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+	return sc_common_open(file, SHOW_SOCK_CONTAINERS);
+}
+
+static const struct file_operations sc_seq_fops = {
+	.open = sc_fop_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = sc_fop_release,
+};
+
+static int o2net_fill_bitmap(char *buf, int len)
+{
+	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int i = -1, out = 0;
+
+	o2net_fill_node_map(map, sizeof(map));
+
+	while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+	out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+	return out;
+}
+
+static int nodes_fop_open(struct inode *inode, struct file *file)
+{
+	char *buf;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
+
+	file->private_data = buf;
+
+	return 0;
+}
+
+static int o2net_debug_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t o2net_debug_read(struct file *file, char __user *buf,
+				size_t nbytes, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+				       i_size_read(file->f_mapping->host));
+}
+
+static const struct file_operations nodes_fops = {
+	.open		= nodes_fop_open,
+	.release	= o2net_debug_release,
+	.read		= o2net_debug_read,
+	.llseek		= generic_file_llseek,
+};
+
+void o2net_debugfs_exit(void)
+{
+	debugfs_remove(nodes_dentry);
+	debugfs_remove(stats_dentry);
+	debugfs_remove(sc_dentry);
+	debugfs_remove(nst_dentry);
+	debugfs_remove(o2net_dentry);
+}
+
+int o2net_debugfs_init(void)
+{
+	umode_t mode = S_IFREG|S_IRUSR;
+
+	o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+	if (o2net_dentry)
+		nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
+					o2net_dentry, NULL, &nst_seq_fops);
+	if (nst_dentry)
+		sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
+					o2net_dentry, NULL, &sc_seq_fops);
+	if (sc_dentry)
+		stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
+					o2net_dentry, NULL, &stats_seq_fops);
+	if (stats_dentry)
+		nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
+					o2net_dentry, NULL, &nodes_fops);
+	if (nodes_dentry)
+		return 0;
+
+	o2net_debugfs_exit();
+	mlog_errno(-ENOMEM);
+	return -ENOMEM;
+}
+
+#endif	/* CONFIG_DEBUG_FS */
diff --git a/kernel/fs/ocfs2/cluster/nodemanager.c b/kernel/fs/ocfs2/cluster/nodemanager.c
new file mode 100644
index 000000000..441c84e16
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/nodemanager.c
@@ -0,0 +1,987 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+
+#include "tcp.h"
+#include "nodemanager.h"
+#include "heartbeat.h"
+#include "masklog.h"
+#include "sys.h"
+
+/* for now we operate under the assertion that there can be only one
+ * cluster active at a time.  Changing this will require trickling
+ * cluster references throughout where nodes are looked up */
+struct o2nm_cluster *o2nm_single_cluster = NULL;
+
+char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
+		"reset",	/* O2NM_FENCE_RESET */
+		"panic",	/* O2NM_FENCE_PANIC */
+};
+
+struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
+{
+	struct o2nm_node *node = NULL;
+
+	if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
+		goto out;
+
+	read_lock(&o2nm_single_cluster->cl_nodes_lock);
+	node = o2nm_single_cluster->cl_nodes[node_num];
+	if (node)
+		config_item_get(&node->nd_item);
+	read_unlock(&o2nm_single_cluster->cl_nodes_lock);
+out:
+	return node;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
+
+int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
+{
+	struct o2nm_cluster *cluster = o2nm_single_cluster;
+
+	BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
+
+	if (cluster == NULL)
+		return -EINVAL;
+
+	read_lock(&cluster->cl_nodes_lock);
+	memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+	read_unlock(&cluster->cl_nodes_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
+
+static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
+						  __be32 ip_needle,
+						  struct rb_node ***ret_p,
+						  struct rb_node **ret_parent)
+{
+	struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct o2nm_node *node, *ret = NULL;
+
+	while (*p) {
+		int cmp;
+
+		parent = *p;
+		node = rb_entry(parent, struct o2nm_node, nd_ip_node);
+
+		cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
+				sizeof(ip_needle));
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else if (cmp > 0)
+			p = &(*p)->rb_right;
+		else {
+			ret = node;
+			break;
+		}
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+
+	return ret;
+}
+
+struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
+{
+	struct o2nm_node *node = NULL;
+	struct o2nm_cluster *cluster = o2nm_single_cluster;
+
+	if (cluster == NULL)
+		goto out;
+
+	read_lock(&cluster->cl_nodes_lock);
+	node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
+	if (node)
+		config_item_get(&node->nd_item);
+	read_unlock(&cluster->cl_nodes_lock);
+
+out:
+	return node;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
+
+void o2nm_node_put(struct o2nm_node *node)
+{
+	config_item_put(&node->nd_item);
+}
+EXPORT_SYMBOL_GPL(o2nm_node_put);
+
+void o2nm_node_get(struct o2nm_node *node)
+{
+	config_item_get(&node->nd_item);
+}
+EXPORT_SYMBOL_GPL(o2nm_node_get);
+
+u8 o2nm_this_node(void)
+{
+	u8 node_num = O2NM_MAX_NODES;
+
+	if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
+		node_num = o2nm_single_cluster->cl_local_node;
+
+	return node_num;
+}
+EXPORT_SYMBOL_GPL(o2nm_this_node);
+
+/* node configfs bits */
+
+static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
+{
+	return item ?
+		container_of(to_config_group(item), struct o2nm_cluster,
+			     cl_group)
+		: NULL;
+}
+
+static struct o2nm_node *to_o2nm_node(struct config_item *item)
+{
+	return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
+}
+
+static void o2nm_node_release(struct config_item *item)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	kfree(node);
+}
+
+static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%d\n", node->nd_num);
+}
+
+static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
+{
+	/* through the first node_set .parent
+	 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
+	return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
+}
+
+enum {
+	O2NM_NODE_ATTR_NUM = 0,
+	O2NM_NODE_ATTR_PORT,
+	O2NM_NODE_ATTR_ADDRESS,
+	O2NM_NODE_ATTR_LOCAL,
+};
+
+static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+				   size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp >= O2NM_MAX_NODES)
+		return -ERANGE;
+
+	/* once we're in the cl_nodes tree networking can look us up by
+	 * node number and try to use our address and port attributes
+	 * to connect to this node.. make sure that they've been set
+	 * before writing the node attribute? */
+	if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+		return -EINVAL; /* XXX */
+
+	write_lock(&cluster->cl_nodes_lock);
+	if (cluster->cl_nodes[tmp])
+		p = NULL;
+	else  {
+		cluster->cl_nodes[tmp] = node;
+		node->nd_num = tmp;
+		set_bit(tmp, cluster->cl_nodes_bitmap);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+	if (p == NULL)
+		return -EEXIST;
+
+	return count;
+}
+static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+}
+
+static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+					 const char *page, size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp == 0)
+		return -EINVAL;
+	if (tmp >= (u16)-1)
+		return -ERANGE;
+
+	node->nd_ipv4_port = htons(tmp);
+
+	return count;
+}
+
+static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
+}
+
+static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+					    const char *page,
+					    size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	int ret, i;
+	struct rb_node **p, *parent;
+	unsigned int octets[4];
+	__be32 ipv4_addr = 0;
+
+	ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
+		     &octets[1], &octets[0]);
+	if (ret != 4)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(octets); i++) {
+		if (octets[i] > 255)
+			return -ERANGE;
+		be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
+	}
+
+	ret = 0;
+	write_lock(&cluster->cl_nodes_lock);
+	if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
+		ret = -EEXIST;
+	else {
+		rb_link_node(&node->nd_ip_node, parent, p);
+		rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+	if (ret)
+		return ret;
+
+	memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
+
+	return count;
+}
+
+static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%d\n", node->nd_local);
+}
+
+static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+				     size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	unsigned long tmp;
+	char *p = (char *)page;
+	ssize_t ret;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	tmp = !!tmp; /* boolean of whether this node wants to be local */
+
+	/* setting local turns on networking rx for now so we require having
+	 * set everything else first */
+	if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+		return -EINVAL; /* XXX */
+
+	/* the only failure case is trying to set a new local node
+	 * when a different one is already set */
+	if (tmp && tmp == cluster->cl_has_local &&
+	    cluster->cl_local_node != node->nd_num)
+		return -EBUSY;
+
+	/* bring up the rx thread if we're setting the new local node. */
+	if (tmp && !cluster->cl_has_local) {
+		ret = o2net_start_listening(node);
+		if (ret)
+			return ret;
+	}
+
+	if (!tmp && cluster->cl_has_local &&
+	    cluster->cl_local_node == node->nd_num) {
+		o2net_stop_listening(node);
+		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+	}
+
+	node->nd_local = tmp;
+	if (node->nd_local) {
+		cluster->cl_has_local = tmp;
+		cluster->cl_local_node = node->nd_num;
+	}
+
+	return count;
+}
+
+struct o2nm_node_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2nm_node *, char *);
+	ssize_t (*store)(struct o2nm_node *, const char *, size_t);
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_num = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "num",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_num_read,
+	.store	= o2nm_node_num_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "ipv4_port",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_ipv4_port_read,
+	.store	= o2nm_node_ipv4_port_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "ipv4_address",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_ipv4_address_read,
+	.store	= o2nm_node_ipv4_address_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_local = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "local",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_local_read,
+	.store	= o2nm_node_local_write,
+};
+
+static struct configfs_attribute *o2nm_node_attrs[] = {
+	[O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
+	[O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
+	[O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
+	[O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+	NULL,
+};
+
+static int o2nm_attr_index(struct configfs_attribute *attr)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
+		if (attr == o2nm_node_attrs[i])
+			return i;
+	}
+	BUG();
+	return 0;
+}
+
+static ssize_t o2nm_node_show(struct config_item *item,
+			      struct configfs_attribute *attr,
+			      char *page)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_node_attribute *o2nm_node_attr =
+		container_of(attr, struct o2nm_node_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2nm_node_attr->show)
+		ret = o2nm_node_attr->show(node, page);
+	return ret;
+}
+
+static ssize_t o2nm_node_store(struct config_item *item,
+			       struct configfs_attribute *attr,
+			       const char *page, size_t count)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_node_attribute *o2nm_node_attr =
+		container_of(attr, struct o2nm_node_attribute, attr);
+	ssize_t ret;
+	int attr_index = o2nm_attr_index(attr);
+
+	if (o2nm_node_attr->store == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (test_bit(attr_index, &node->nd_set_attributes))
+		return -EBUSY;
+
+	ret = o2nm_node_attr->store(node, page, count);
+	if (ret < count)
+		goto out;
+
+	set_bit(attr_index, &node->nd_set_attributes);
+out:
+	return ret;
+}
+
+static struct configfs_item_operations o2nm_node_item_ops = {
+	.release		= o2nm_node_release,
+	.show_attribute		= o2nm_node_show,
+	.store_attribute	= o2nm_node_store,
+};
+
+static struct config_item_type o2nm_node_type = {
+	.ct_item_ops	= &o2nm_node_item_ops,
+	.ct_attrs	= o2nm_node_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* node set */
+
+struct o2nm_node_group {
+	struct config_group ns_group;
+	/* some stuff? */
+};
+
+#if 0
+static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct o2nm_node_group, ns_group)
+		: NULL;
+}
+#endif
+
+struct o2nm_cluster_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2nm_cluster *, char *);
+	ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
+};
+
+static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
+                                       unsigned int *val)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp == 0)
+		return -EINVAL;
+	if (tmp >= (u32)-1)
+		return -ERANGE;
+
+	*val = tmp;
+
+	return count;
+}
+
+static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
+	struct o2nm_cluster *cluster, char *page)
+{
+	return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
+}
+
+static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+	ssize_t ret;
+	unsigned int val;
+
+	ret =  o2nm_cluster_attr_write(page, count, &val);
+
+	if (ret > 0) {
+		if (cluster->cl_idle_timeout_ms != val
+			&& o2net_num_connected_peers()) {
+			mlog(ML_NOTICE,
+			     "o2net: cannot change idle timeout after "
+			     "the first peer has agreed to it."
+			     "  %d connected peers\n",
+			     o2net_num_connected_peers());
+			ret = -EINVAL;
+		} else if (val <= cluster->cl_keepalive_delay_ms) {
+			mlog(ML_NOTICE, "o2net: idle timeout must be larger "
+			     "than keepalive delay\n");
+			ret = -EINVAL;
+		} else {
+			cluster->cl_idle_timeout_ms = val;
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
+	struct o2nm_cluster *cluster, char *page)
+{
+	return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
+}
+
+static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+	ssize_t ret;
+	unsigned int val;
+
+	ret =  o2nm_cluster_attr_write(page, count, &val);
+
+	if (ret > 0) {
+		if (cluster->cl_keepalive_delay_ms != val
+		    && o2net_num_connected_peers()) {
+			mlog(ML_NOTICE,
+			     "o2net: cannot change keepalive delay after"
+			     " the first peer has agreed to it."
+			     "  %d connected peers\n",
+			     o2net_num_connected_peers());
+			ret = -EINVAL;
+		} else if (val >= cluster->cl_idle_timeout_ms) {
+			mlog(ML_NOTICE, "o2net: keepalive delay must be "
+			     "smaller than idle timeout\n");
+			ret = -EINVAL;
+		} else {
+			cluster->cl_keepalive_delay_ms = val;
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
+	struct o2nm_cluster *cluster, char *page)
+{
+	return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
+}
+
+static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+	return o2nm_cluster_attr_write(page, count,
+	                               &cluster->cl_reconnect_delay_ms);
+}
+
+static ssize_t o2nm_cluster_attr_fence_method_read(
+	struct o2nm_cluster *cluster, char *page)
+{
+	ssize_t ret = 0;
+
+	if (cluster)
+		ret = sprintf(page, "%s\n",
+			      o2nm_fence_method_desc[cluster->cl_fence_method]);
+	return ret;
+}
+
+static ssize_t o2nm_cluster_attr_fence_method_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+	unsigned int i;
+
+	if (page[count - 1] != '\n')
+		goto bail;
+
+	for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
+		if (count != strlen(o2nm_fence_method_desc[i]) + 1)
+			continue;
+		if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
+			continue;
+		if (cluster->cl_fence_method != i) {
+			printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
+			       o2nm_fence_method_desc[i]);
+			cluster->cl_fence_method = i;
+		}
+		return count;
+	}
+
+bail:
+	return -EINVAL;
+}
+
+static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "idle_timeout_ms",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_cluster_attr_idle_timeout_ms_read,
+	.store	= o2nm_cluster_attr_idle_timeout_ms_write,
+};
+
+static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "keepalive_delay_ms",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_cluster_attr_keepalive_delay_ms_read,
+	.store	= o2nm_cluster_attr_keepalive_delay_ms_write,
+};
+
+static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "reconnect_delay_ms",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_cluster_attr_reconnect_delay_ms_read,
+	.store	= o2nm_cluster_attr_reconnect_delay_ms_write,
+};
+
+static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "fence_method",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_cluster_attr_fence_method_read,
+	.store	= o2nm_cluster_attr_fence_method_write,
+};
+
+static struct configfs_attribute *o2nm_cluster_attrs[] = {
+	&o2nm_cluster_attr_idle_timeout_ms.attr,
+	&o2nm_cluster_attr_keepalive_delay_ms.attr,
+	&o2nm_cluster_attr_reconnect_delay_ms.attr,
+	&o2nm_cluster_attr_fence_method.attr,
+	NULL,
+};
+static ssize_t o2nm_cluster_show(struct config_item *item,
+                                 struct configfs_attribute *attr,
+                                 char *page)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+	struct o2nm_cluster_attribute *o2nm_cluster_attr =
+		container_of(attr, struct o2nm_cluster_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2nm_cluster_attr->show)
+		ret = o2nm_cluster_attr->show(cluster, page);
+	return ret;
+}
+
+static ssize_t o2nm_cluster_store(struct config_item *item,
+                                  struct configfs_attribute *attr,
+                                  const char *page, size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+	struct o2nm_cluster_attribute *o2nm_cluster_attr =
+		container_of(attr, struct o2nm_cluster_attribute, attr);
+	ssize_t ret;
+
+	if (o2nm_cluster_attr->store == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = o2nm_cluster_attr->store(cluster, page, count);
+	if (ret < count)
+		goto out;
+out:
+	return ret;
+}
+
+static struct config_item *o2nm_node_group_make_item(struct config_group *group,
+						     const char *name)
+{
+	struct o2nm_node *node = NULL;
+
+	if (strlen(name) > O2NM_MAX_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
+	if (node == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
+	config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
+	spin_lock_init(&node->nd_lock);
+
+	mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
+
+	return &node->nd_item;
+}
+
+static void o2nm_node_group_drop_item(struct config_group *group,
+				      struct config_item *item)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
+
+	o2net_disconnect_node(node);
+
+	if (cluster->cl_has_local &&
+	    (cluster->cl_local_node == node->nd_num)) {
+		cluster->cl_has_local = 0;
+		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+		o2net_stop_listening(node);
+	}
+
+	/* XXX call into net to stop this node from trading messages */
+
+	write_lock(&cluster->cl_nodes_lock);
+
+	/* XXX sloppy */
+	if (node->nd_ipv4_address)
+		rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
+
+	/* nd_num might be 0 if the node number hasn't been set.. */
+	if (cluster->cl_nodes[node->nd_num] == node) {
+		cluster->cl_nodes[node->nd_num] = NULL;
+		clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+
+	mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+	     config_item_name(&node->nd_item));
+
+	config_item_put(item);
+}
+
+static struct configfs_group_operations o2nm_node_group_group_ops = {
+	.make_item	= o2nm_node_group_make_item,
+	.drop_item	= o2nm_node_group_drop_item,
+};
+
+static struct config_item_type o2nm_node_group_type = {
+	.ct_group_ops	= &o2nm_node_group_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* cluster */
+
+static void o2nm_cluster_release(struct config_item *item)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+
+	kfree(cluster->cl_group.default_groups);
+	kfree(cluster);
+}
+
+static struct configfs_item_operations o2nm_cluster_item_ops = {
+	.release	= o2nm_cluster_release,
+	.show_attribute		= o2nm_cluster_show,
+	.store_attribute	= o2nm_cluster_store,
+};
+
+static struct config_item_type o2nm_cluster_type = {
+	.ct_item_ops	= &o2nm_cluster_item_ops,
+	.ct_attrs	= o2nm_cluster_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* cluster set */
+
+struct o2nm_cluster_group {
+	struct configfs_subsystem cs_subsys;
+	/* some stuff? */
+};
+
+#if 0
+static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
+{
+	return group ?
+		container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
+	       : NULL;
+}
+#endif
+
+static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
+							  const char *name)
+{
+	struct o2nm_cluster *cluster = NULL;
+	struct o2nm_node_group *ns = NULL;
+	struct config_group *o2hb_group = NULL, *ret = NULL;
+	void *defs = NULL;
+
+	/* this runs under the parent dir's i_mutex; there can be only
+	 * one caller in here at a time */
+	if (o2nm_single_cluster)
+		return ERR_PTR(-ENOSPC);
+
+	cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
+	ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
+	defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+	o2hb_group = o2hb_alloc_hb_set();
+	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+		goto out;
+
+	config_group_init_type_name(&cluster->cl_group, name,
+				    &o2nm_cluster_type);
+	config_group_init_type_name(&ns->ns_group, "node",
+				    &o2nm_node_group_type);
+
+	cluster->cl_group.default_groups = defs;
+	cluster->cl_group.default_groups[0] = &ns->ns_group;
+	cluster->cl_group.default_groups[1] = o2hb_group;
+	cluster->cl_group.default_groups[2] = NULL;
+	rwlock_init(&cluster->cl_nodes_lock);
+	cluster->cl_node_ip_tree = RB_ROOT;
+	cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
+	cluster->cl_idle_timeout_ms    = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
+	cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
+	cluster->cl_fence_method       = O2NM_FENCE_RESET;
+
+	ret = &cluster->cl_group;
+	o2nm_single_cluster = cluster;
+
+out:
+	if (ret == NULL) {
+		kfree(cluster);
+		kfree(ns);
+		o2hb_free_hb_set(o2hb_group);
+		kfree(defs);
+		ret = ERR_PTR(-ENOMEM);
+	}
+
+	return ret;
+}
+
+static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+	int i;
+	struct config_item *killme;
+
+	BUG_ON(o2nm_single_cluster != cluster);
+	o2nm_single_cluster = NULL;
+
+	for (i = 0; cluster->cl_group.default_groups[i]; i++) {
+		killme = &cluster->cl_group.default_groups[i]->cg_item;
+		cluster->cl_group.default_groups[i] = NULL;
+		config_item_put(killme);
+	}
+
+	config_item_put(item);
+}
+
+static struct configfs_group_operations o2nm_cluster_group_group_ops = {
+	.make_group	= o2nm_cluster_group_make_group,
+	.drop_item	= o2nm_cluster_group_drop_item,
+};
+
+static struct config_item_type o2nm_cluster_group_type = {
+	.ct_group_ops	= &o2nm_cluster_group_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct o2nm_cluster_group o2nm_cluster_group = {
+	.cs_subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "cluster",
+				.ci_type = &o2nm_cluster_group_type,
+			},
+		},
+	},
+};
+
+int o2nm_depend_item(struct config_item *item)
+{
+	return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+void o2nm_undepend_item(struct config_item *item)
+{
+	configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+int o2nm_depend_this_node(void)
+{
+	int ret = 0;
+	struct o2nm_node *local_node;
+
+	local_node = o2nm_get_node_by_num(o2nm_this_node());
+	if (!local_node) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = o2nm_depend_item(&local_node->nd_item);
+	o2nm_node_put(local_node);
+
+out:
+	return ret;
+}
+
+void o2nm_undepend_this_node(void)
+{
+	struct o2nm_node *local_node;
+
+	local_node = o2nm_get_node_by_num(o2nm_this_node());
+	BUG_ON(!local_node);
+
+	o2nm_undepend_item(&local_node->nd_item);
+	o2nm_node_put(local_node);
+}
+
+
+static void __exit exit_o2nm(void)
+{
+	/* XXX sync with hb callbacks and shut down hb? */
+	o2net_unregister_hb_callbacks();
+	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
+	o2cb_sys_shutdown();
+
+	o2net_exit();
+	o2hb_exit();
+}
+
+static int __init init_o2nm(void)
+{
+	int ret = -1;
+
+	ret = o2hb_init();
+	if (ret)
+		goto out;
+
+	ret = o2net_init();
+	if (ret)
+		goto out_o2hb;
+
+	ret = o2net_register_hb_callbacks();
+	if (ret)
+		goto out_o2net;
+
+	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
+	mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
+	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
+	if (ret) {
+		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
+		goto out_callbacks;
+	}
+
+	ret = o2cb_sys_init();
+	if (!ret)
+		goto out;
+
+	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
+out_callbacks:
+	o2net_unregister_hb_callbacks();
+out_o2net:
+	o2net_exit();
+out_o2hb:
+	o2hb_exit();
+out:
+	return ret;
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
+
+module_init(init_o2nm)
+module_exit(exit_o2nm)
diff --git a/kernel/fs/ocfs2/cluster/nodemanager.h b/kernel/fs/ocfs2/cluster/nodemanager.h
new file mode 100644
index 000000000..09ea2d388
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/nodemanager.h
@@ -0,0 +1,88 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_NODEMANAGER_H
+#define O2CLUSTER_NODEMANAGER_H
+
+#include "ocfs2_nodemanager.h"
+
+/* This totally doesn't belong here. */
+#include <linux/configfs.h>
+#include <linux/rbtree.h>
+
+enum o2nm_fence_method {
+	O2NM_FENCE_RESET	= 0,
+	O2NM_FENCE_PANIC,
+	O2NM_FENCE_METHODS,	/* Number of fence methods */
+};
+
+struct o2nm_node {
+	spinlock_t		nd_lock;
+	struct config_item	nd_item;
+	char			nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
+	__u8			nd_num;
+	/* only one address per node, as attributes, for now. */
+	__be32			nd_ipv4_address;
+	__be16			nd_ipv4_port;
+	struct rb_node		nd_ip_node;
+	/* there can be only one local node for now */
+	int			nd_local;
+
+	unsigned long		nd_set_attributes;
+};
+
+struct o2nm_cluster {
+	struct config_group	cl_group;
+	unsigned		cl_has_local:1;
+	u8			cl_local_node;
+	rwlock_t		cl_nodes_lock;
+	struct o2nm_node  	*cl_nodes[O2NM_MAX_NODES];
+	struct rb_root		cl_node_ip_tree;
+	unsigned int		cl_idle_timeout_ms;
+	unsigned int		cl_keepalive_delay_ms;
+	unsigned int		cl_reconnect_delay_ms;
+	enum o2nm_fence_method	cl_fence_method;
+
+	/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
+	unsigned long	cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+extern struct o2nm_cluster *o2nm_single_cluster;
+
+u8 o2nm_this_node(void);
+
+int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
+struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
+struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
+void o2nm_node_get(struct o2nm_node *node);
+void o2nm_node_put(struct o2nm_node *node);
+
+int o2nm_depend_item(struct config_item *item);
+void o2nm_undepend_item(struct config_item *item);
+int o2nm_depend_this_node(void);
+void o2nm_undepend_this_node(void);
+
+#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h b/kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h
new file mode 100644
index 000000000..3f4151da9
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_heartbeat.h
+ *
+ * On-disk structures for ocfs2_heartbeat
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS2_HEARTBEAT_H
+#define _OCFS2_HEARTBEAT_H
+
+struct o2hb_disk_heartbeat_block {
+	__le64 hb_seq;
+	__u8  hb_node;
+	__u8  hb_pad1[3];
+	__le32 hb_cksum;
+	__le64 hb_generation;
+	__le32 hb_dead_ms;
+};
+
+#endif /* _OCFS2_HEARTBEAT_H */
diff --git a/kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h b/kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h
new file mode 100644
index 000000000..49b594325
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -0,0 +1,45 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_nodemanager.h
+ *
+ * Header describing the interface between userspace and the kernel
+ * for the ocfs2_nodemanager module.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef _OCFS2_NODEMANAGER_H
+#define _OCFS2_NODEMANAGER_H
+
+#define O2NM_API_VERSION	5
+
+#define O2NM_MAX_NODES		255
+#define O2NM_INVALID_NODE_NUM	255
+
+/* host name, group name, cluster name all 64 bytes */
+#define O2NM_MAX_NAME_LEN        64    // __NEW_UTS_LEN
+
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION**  Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS	32
+
+#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/kernel/fs/ocfs2/cluster/quorum.c b/kernel/fs/ocfs2/cluster/quorum.c
new file mode 100644
index 000000000..62e8ec619
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/quorum.c
@@ -0,0 +1,340 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* This quorum hack is only here until we transition to some more rational
+ * approach that is driven from userspace.  Honest.  No foolin'.
+ *
+ * Imagine two nodes lose network connectivity to each other but they're still
+ * up and operating in every other way.  Presumably a network timeout indicates
+ * that a node is broken and should be recovered.  They can't both recover each
+ * other and both carry on without serialising their access to the file system.
+ * They need to decide who is authoritative.  Now extend that problem to
+ * arbitrary groups of nodes losing connectivity between each other.
+ *
+ * So we declare that a node which has given up on connecting to a majority
+ * of nodes who are still heartbeating will fence itself.
+ *
+ * There are huge opportunities for races here.  After we give up on a node's
+ * connection we need to wait long enough to give heartbeat an opportunity
+ * to declare the node as truly dead.  We also need to be careful with the
+ * race between when we see a node start heartbeating and when we connect
+ * to it.
+ *
+ * So nodes that are in this transtion put a hold on the quorum decision
+ * with a counter.  As they fall out of this transition they drop the count
+ * and if they're the last, they fire off the decision.
+ */
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/reboot.h>
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_QUORUM
+#include "masklog.h"
+#include "quorum.h"
+
+static struct o2quo_state {
+	spinlock_t		qs_lock;
+	struct work_struct	qs_work;
+	int			qs_pending;
+	int			qs_heartbeating;
+	unsigned long		qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int			qs_connected;
+	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int			qs_holds;
+	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+} o2quo_state;
+
+/* this is horribly heavy-handed.  It should instead flip the file
+ * system RO and call some userspace script. */
+static void o2quo_fence_self(void)
+{
+	/* panic spins with interrupts enabled.  with preempt
+	 * threads can still schedule, etc, etc */
+	o2hb_stop_all_regions();
+
+	switch (o2nm_single_cluster->cl_fence_method) {
+	case O2NM_FENCE_PANIC:
+		panic("*** ocfs2 is very sorry to be fencing this system by "
+		      "panicing ***\n");
+		break;
+	default:
+		WARN_ON(o2nm_single_cluster->cl_fence_method >=
+			O2NM_FENCE_METHODS);
+	case O2NM_FENCE_RESET:
+		printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
+		       "system by restarting ***\n");
+		emergency_restart();
+		break;
+	};
+}
+
+/* Indicate that a timeout occurred on a hearbeat region write. The
+ * other nodes in the cluster may consider us dead at that time so we
+ * want to "fence" ourselves so that we don't scribble on the disk
+ * after they think they've recovered us. This can't solve all
+ * problems related to writeout after recovery but this hack can at
+ * least close some of those gaps. When we have real fencing, this can
+ * go away as our node would be fenced externally before other nodes
+ * begin recovery. */
+void o2quo_disk_timeout(void)
+{
+	o2quo_fence_self();
+}
+
+static void o2quo_make_decision(struct work_struct *work)
+{
+	int quorum;
+	int lowest_hb, lowest_reachable = 0, fence = 0;
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
+	if (lowest_hb != O2NM_MAX_NODES)
+		lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
+
+	mlog(0, "heartbeating: %d, connected: %d, "
+	     "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
+	     qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
+
+	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
+	    qs->qs_heartbeating == 1)
+		goto out;
+
+	if (qs->qs_heartbeating & 1) {
+		/* the odd numbered cluster case is straight forward --
+		 * if we can't talk to the majority we're hosed */
+		quorum = (qs->qs_heartbeating + 1)/2;
+		if (qs->qs_connected < quorum) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "only connected to %u nodes and %u is needed "
+			     "to make a quorum out of %u heartbeating nodes\n",
+			     qs->qs_connected, quorum,
+			     qs->qs_heartbeating);
+			fence = 1;
+		}
+	} else {
+		/* the even numbered cluster adds the possibility of each half
+		 * of the cluster being able to talk amongst themselves.. in
+		 * that case we're hosed if we can't talk to the group that has
+		 * the lowest numbered node */
+		quorum = qs->qs_heartbeating / 2;
+		if (qs->qs_connected < quorum) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "only connected to %u nodes and %u is needed "
+			     "to make a quorum out of %u heartbeating nodes\n",
+			     qs->qs_connected, quorum,
+			     qs->qs_heartbeating);
+			fence = 1;
+		}
+		else if ((qs->qs_connected == quorum) &&
+			 !lowest_reachable) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "connected to a half-quorum of %u out of %u "
+			     "nodes which doesn't include the lowest active "
+			     "node %u\n", quorum, qs->qs_heartbeating,
+			     lowest_hb);
+			fence = 1;
+		}
+	}
+
+out:
+	if (fence) {
+		spin_unlock(&qs->qs_lock);
+		o2quo_fence_self();
+	} else {
+		mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
+			"connected: %d, lowest: %d (%sreachable)\n",
+			qs->qs_heartbeating, qs->qs_connected, lowest_hb,
+			lowest_reachable ? "" : "un");
+		spin_unlock(&qs->qs_lock);
+
+	}
+
+}
+
+static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
+{
+	assert_spin_locked(&qs->qs_lock);
+
+	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
+		qs->qs_holds++;
+		mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
+			        "node %u\n", node);
+		mlog(0, "node %u, %d total\n", node, qs->qs_holds);
+	}
+}
+
+static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
+{
+	assert_spin_locked(&qs->qs_lock);
+
+	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
+		mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
+		if (--qs->qs_holds == 0) {
+			if (qs->qs_pending) {
+				qs->qs_pending = 0;
+				schedule_work(&qs->qs_work);
+			}
+		}
+		mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
+				node, qs->qs_holds);
+	}
+}
+
+/* as a node comes up we delay the quorum decision until we know the fate of
+ * the connection.  the hold will be droped in conn_up or hb_down.  it might be
+ * perpetuated by con_err until hb_down.  if we already have a conn, we might
+ * be dropping a hold that conn_up got. */
+void o2quo_hb_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_heartbeating++;
+	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
+		        "node %u\n", node);
+	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+	set_bit(node, qs->qs_hb_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+	if (!test_bit(node, qs->qs_conn_bm))
+		o2quo_set_hold(qs, node);
+	else
+		o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* hb going down releases any holds we might have had due to this node from
+ * conn_up, conn_err, or hb_up */
+void o2quo_hb_down(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_heartbeating--;
+	mlog_bug_on_msg(qs->qs_heartbeating < 0,
+			"node %u, %d heartbeating\n",
+			node, qs->qs_heartbeating);
+	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+	clear_bit(node, qs->qs_hb_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+	o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* this tells us that we've decided that the node is still heartbeating
+ * even though we've lost it's conn.  it must only be called after conn_err
+ * and indicates that we must now make a quorum decision in the future,
+ * though we might be doing so after waiting for holds to drain.  Here
+ * we'll be dropping the hold from conn_err. */
+void o2quo_hb_still_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	mlog(0, "node %u\n", node);
+
+	qs->qs_pending = 1;
+	o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* This is analogous to hb_up.  as a node's connection comes up we delay the
+ * quorum decision until we see it heartbeating.  the hold will be droped in
+ * hb_up or hb_down.  it might be perpetuated by con_err until hb_down.  if
+ * it's already heartbeating we might be dropping a hold that conn_up got.
+ * */
+void o2quo_conn_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_connected++;
+	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
+		        "node %u\n", node);
+	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
+	set_bit(node, qs->qs_conn_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+	if (!test_bit(node, qs->qs_hb_bm))
+		o2quo_set_hold(qs, node);
+	else
+		o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* we've decided that we won't ever be connecting to the node again.  if it's
+ * still heartbeating we grab a hold that will delay decisions until either the
+ * node stops heartbeating from hb_down or the caller decides that the node is
+ * still up and calls still_up */
+void o2quo_conn_err(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	if (test_bit(node, qs->qs_conn_bm)) {
+		qs->qs_connected--;
+		mlog_bug_on_msg(qs->qs_connected < 0,
+				"node %u, connected %d\n",
+				node, qs->qs_connected);
+
+		clear_bit(node, qs->qs_conn_bm);
+	}
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+	if (test_bit(node, qs->qs_hb_bm))
+		o2quo_set_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+void o2quo_init(void)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock_init(&qs->qs_lock);
+	INIT_WORK(&qs->qs_work, o2quo_make_decision);
+}
+
+void o2quo_exit(void)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	flush_work(&qs->qs_work);
+}
diff --git a/kernel/fs/ocfs2/cluster/quorum.h b/kernel/fs/ocfs2/cluster/quorum.h
new file mode 100644
index 000000000..6649cc6f6
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/quorum.h
@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_QUORUM_H
+#define O2CLUSTER_QUORUM_H
+
+void o2quo_init(void);
+void o2quo_exit(void);
+
+void o2quo_hb_up(u8 node);
+void o2quo_hb_down(u8 node);
+void o2quo_hb_still_up(u8 node);
+void o2quo_conn_up(u8 node);
+void o2quo_conn_err(u8 node);
+void o2quo_disk_timeout(void);
+
+#endif /* O2CLUSTER_QUORUM_H */
diff --git a/kernel/fs/ocfs2/cluster/sys.c b/kernel/fs/ocfs2/cluster/sys.c
new file mode 100644
index 000000000..b7f57271d
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/sys.c
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.c
+ *
+ * OCFS2 cluster sysfs interface
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+
+#include "ocfs2_nodemanager.h"
+#include "masklog.h"
+#include "sys.h"
+
+
+static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
+			    char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
+}
+static struct kobj_attribute attr_version =
+	__ATTR(interface_revision, S_IRUGO, version_show, NULL);
+
+static struct attribute *o2cb_attrs[] = {
+	&attr_version.attr,
+	NULL,
+};
+
+static struct attribute_group o2cb_attr_group = {
+	.attrs = o2cb_attrs,
+};
+
+static struct kset *o2cb_kset;
+
+void o2cb_sys_shutdown(void)
+{
+	mlog_sys_shutdown();
+	kset_unregister(o2cb_kset);
+}
+
+int o2cb_sys_init(void)
+{
+	int ret;
+
+	o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj);
+	if (!o2cb_kset)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
+	if (ret)
+		goto error;
+
+	ret = mlog_sys_init(o2cb_kset);
+	if (ret)
+		goto error;
+	return 0;
+error:
+	kset_unregister(o2cb_kset);
+	return ret;
+}
diff --git a/kernel/fs/ocfs2/cluster/sys.h b/kernel/fs/ocfs2/cluster/sys.h
new file mode 100644
index 000000000..d66b8ab00
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/sys.h
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.h
+ *
+ * Function prototypes for o2cb sysfs interface
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_SYS_H
+#define O2CLUSTER_SYS_H
+
+void o2cb_sys_shutdown(void);
+int o2cb_sys_init(void);
+
+#endif /* O2CLUSTER_SYS_H */
diff --git a/kernel/fs/ocfs2/cluster/tcp.c b/kernel/fs/ocfs2/cluster/tcp.c
new file mode 100644
index 000000000..56c403a56
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/tcp.c
@@ -0,0 +1,2219 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * ----
+ *
+ * Callers for this were originally written against a very simple synchronus
+ * API.  This implementation reflects those simple callers.  Some day I'm sure
+ * we'll need to move to a more robust posting/callback mechanism.
+ *
+ * Transmit calls pass in kernel virtual addresses and block copying this into
+ * the socket's tx buffers via a usual blocking sendmsg.  They'll block waiting
+ * for a failed socket to timeout.  TX callers can also pass in a poniter to an
+ * 'int' which gets filled with an errno off the wire in response to the
+ * message they send.
+ *
+ * Handlers for unsolicited messages are registered.  Each socket has a page
+ * that incoming data is copied into.  First the header, then the data.
+ * Handlers are called from only one thread with a reference to this per-socket
+ * page.  This page is destroyed after the handler call, so it can't be
+ * referenced beyond the call.  Handlers may block but are discouraged from
+ * doing so.
+ *
+ * Any framing errors (bad magic, large payload lengths) close a connection.
+ *
+ * Our sock_container holds the state we associate with a socket.  It's current
+ * framing state is held there as well as the refcounting we do around when it
+ * is safe to tear down the socket.  The socket is only finally torn down from
+ * the container when the container loses all of its references -- so as long
+ * as you hold a ref on the container you can trust that the socket is valid
+ * for use with kernel socket APIs.
+ *
+ * Connections are initiated between a pair of nodes when the node with the
+ * higher node number gets a heartbeat callback which indicates that the lower
+ * numbered node has started heartbeating.  The lower numbered node is passive
+ * and only accepts the connection if the higher numbered node is heartbeating.
+ */
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/net.h>
+#include <linux/export.h>
+#include <net/tcp.h>
+
+#include <asm/uaccess.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+#include "quorum.h"
+
+#include "tcp_internal.h"
+
+#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
+#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,	\
+			  &sc->sc_node->nd_ipv4_address,		\
+			  ntohs(sc->sc_node->nd_ipv4_port)
+
+/*
+ * In the following two log macros, the whitespace after the ',' just
+ * before ##args is intentional. Otherwise, gcc 2.95 will eat the
+ * previous token if args expands to nothing.
+ */
+#define msglog(hdr, fmt, args...) do {					\
+	typeof(hdr) __hdr = (hdr);					\
+	mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d "	\
+	     "key %08x num %u] " fmt,					\
+	     be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), 	\
+	     be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status),	\
+	     be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key),	\
+	     be32_to_cpu(__hdr->msg_num) ,  ##args);			\
+} while (0)
+
+#define sclog(sc, fmt, args...) do {					\
+	typeof(sc) __sc = (sc);						\
+	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
+	     "pg_off %zu] " fmt, __sc,					\
+	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
+	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
+	    ##args);							\
+} while (0)
+
+static DEFINE_RWLOCK(o2net_handler_lock);
+static struct rb_root o2net_handler_tree = RB_ROOT;
+
+static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
+
+/* XXX someday we'll need better accounting */
+static struct socket *o2net_listen_sock;
+
+/*
+ * listen work is only queued by the listening socket callbacks on the
+ * o2net_wq.  teardown detaches the callbacks before destroying the workqueue.
+ * quorum work is queued as sock containers are shutdown.. stop_listening
+ * tears down all the node's sock containers, preventing future shutdowns
+ * and queued quroum work, before canceling delayed quorum work and
+ * destroying the work queue.
+ */
+static struct workqueue_struct *o2net_wq;
+static struct work_struct o2net_listen_work;
+
+static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
+#define O2NET_HB_PRI 0x1
+
+static struct o2net_handshake *o2net_hand;
+static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
+
+static int o2net_sys_err_translations[O2NET_ERR_MAX] =
+		{[O2NET_ERR_NONE]	= 0,
+		 [O2NET_ERR_NO_HNDLR]	= -ENOPROTOOPT,
+		 [O2NET_ERR_OVERFLOW]	= -EOVERFLOW,
+		 [O2NET_ERR_DIED]	= -EHOSTDOWN,};
+
+/* can't quite avoid *all* internal declarations :/ */
+static void o2net_sc_connect_completed(struct work_struct *work);
+static void o2net_rx_until_empty(struct work_struct *work);
+static void o2net_shutdown_sc(struct work_struct *work);
+static void o2net_listen_data_ready(struct sock *sk);
+static void o2net_sc_send_keep_req(struct work_struct *work);
+static void o2net_idle_timer(unsigned long data);
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
+static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
+
+#ifdef CONFIG_DEBUG_FS
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+			   u32 msgkey, struct task_struct *task, u8 node)
+{
+	INIT_LIST_HEAD(&nst->st_net_debug_item);
+	nst->st_task = task;
+	nst->st_msg_type = msgtype;
+	nst->st_msg_key = msgkey;
+	nst->st_node = node;
+}
+
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+	nst->st_sock_time = ktime_get();
+}
+
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+	nst->st_send_time = ktime_get();
+}
+
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+	nst->st_status_time = ktime_get();
+}
+
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+						struct o2net_sock_container *sc)
+{
+	nst->st_sc = sc;
+}
+
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+					u32 msg_id)
+{
+	nst->st_id = msg_id;
+}
+
+static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
+{
+	sc->sc_tv_timer = ktime_get();
+}
+
+static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
+{
+	sc->sc_tv_data_ready = ktime_get();
+}
+
+static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
+{
+	sc->sc_tv_advance_start = ktime_get();
+}
+
+static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
+{
+	sc->sc_tv_advance_stop = ktime_get();
+}
+
+static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
+{
+	sc->sc_tv_func_start = ktime_get();
+}
+
+static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
+{
+	sc->sc_tv_func_stop = ktime_get();
+}
+
+#else  /* CONFIG_DEBUG_FS */
+# define o2net_init_nst(a, b, c, d, e)
+# define o2net_set_nst_sock_time(a)
+# define o2net_set_nst_send_time(a)
+# define o2net_set_nst_status_time(a)
+# define o2net_set_nst_sock_container(a, b)
+# define o2net_set_nst_msg_id(a, b)
+# define o2net_set_sock_timer(a)
+# define o2net_set_data_ready_time(a)
+# define o2net_set_advance_start_time(a)
+# define o2net_set_advance_stop_time(a)
+# define o2net_set_func_start_time(a)
+# define o2net_set_func_stop_time(a)
+#endif /* CONFIG_DEBUG_FS */
+
+#ifdef CONFIG_OCFS2_FS_STATS
+static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
+{
+	return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
+}
+
+static void o2net_update_send_stats(struct o2net_send_tracking *nst,
+				    struct o2net_sock_container *sc)
+{
+	sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
+					   ktime_sub(ktime_get(),
+						     nst->st_status_time));
+	sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
+					 ktime_sub(nst->st_status_time,
+						   nst->st_send_time));
+	sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
+					    ktime_sub(nst->st_send_time,
+						      nst->st_sock_time));
+	sc->sc_send_count++;
+}
+
+static void o2net_update_recv_stats(struct o2net_sock_container *sc)
+{
+	sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
+					    o2net_get_func_run_time(sc));
+	sc->sc_recv_count++;
+}
+
+#else
+
+# define o2net_update_send_stats(a, b)
+
+# define o2net_update_recv_stats(sc)
+
+#endif /* CONFIG_OCFS2_FS_STATS */
+
+static inline unsigned int o2net_reconnect_delay(void)
+{
+	return o2nm_single_cluster->cl_reconnect_delay_ms;
+}
+
+static inline unsigned int o2net_keepalive_delay(void)
+{
+	return o2nm_single_cluster->cl_keepalive_delay_ms;
+}
+
+static inline unsigned int o2net_idle_timeout(void)
+{
+	return o2nm_single_cluster->cl_idle_timeout_ms;
+}
+
+static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
+{
+	int trans;
+	BUG_ON(err >= O2NET_ERR_MAX);
+	trans = o2net_sys_err_translations[err];
+
+	/* Just in case we mess up the translation table above */
+	BUG_ON(err != O2NET_ERR_NONE && trans == 0);
+	return trans;
+}
+
+static struct o2net_node * o2net_nn_from_num(u8 node_num)
+{
+	BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes));
+	return &o2net_nodes[node_num];
+}
+
+static u8 o2net_num_from_nn(struct o2net_node *nn)
+{
+	BUG_ON(nn == NULL);
+	return nn - o2net_nodes;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
+{
+	int ret;
+
+	spin_lock(&nn->nn_lock);
+	ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC);
+	if (ret >= 0) {
+		nsw->ns_id = ret;
+		list_add_tail(&nsw->ns_node_item, &nn->nn_status_list);
+	}
+	spin_unlock(&nn->nn_lock);
+	if (ret < 0)
+		return ret;
+
+	init_waitqueue_head(&nsw->ns_wq);
+	nsw->ns_sys_status = O2NET_ERR_NONE;
+	nsw->ns_status = 0;
+	return 0;
+}
+
+static void o2net_complete_nsw_locked(struct o2net_node *nn,
+				      struct o2net_status_wait *nsw,
+				      enum o2net_system_error sys_status,
+				      s32 status)
+{
+	assert_spin_locked(&nn->nn_lock);
+
+	if (!list_empty(&nsw->ns_node_item)) {
+		list_del_init(&nsw->ns_node_item);
+		nsw->ns_sys_status = sys_status;
+		nsw->ns_status = status;
+		idr_remove(&nn->nn_status_idr, nsw->ns_id);
+		wake_up(&nsw->ns_wq);
+	}
+}
+
+static void o2net_complete_nsw(struct o2net_node *nn,
+			       struct o2net_status_wait *nsw,
+			       u64 id, enum o2net_system_error sys_status,
+			       s32 status)
+{
+	spin_lock(&nn->nn_lock);
+	if (nsw == NULL) {
+		if (id > INT_MAX)
+			goto out;
+
+		nsw = idr_find(&nn->nn_status_idr, id);
+		if (nsw == NULL)
+			goto out;
+	}
+
+	o2net_complete_nsw_locked(nn, nsw, sys_status, status);
+
+out:
+	spin_unlock(&nn->nn_lock);
+	return;
+}
+
+static void o2net_complete_nodes_nsw(struct o2net_node *nn)
+{
+	struct o2net_status_wait *nsw, *tmp;
+	unsigned int num_kills = 0;
+
+	assert_spin_locked(&nn->nn_lock);
+
+	list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
+		o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
+		num_kills++;
+	}
+
+	mlog(0, "completed %d messages for node %u\n", num_kills,
+	     o2net_num_from_nn(nn));
+}
+
+static int o2net_nsw_completed(struct o2net_node *nn,
+			       struct o2net_status_wait *nsw)
+{
+	int completed;
+	spin_lock(&nn->nn_lock);
+	completed = list_empty(&nsw->ns_node_item);
+	spin_unlock(&nn->nn_lock);
+	return completed;
+}
+
+/* ------------------------------------------------------------ */
+
+static void sc_kref_release(struct kref *kref)
+{
+	struct o2net_sock_container *sc = container_of(kref,
+					struct o2net_sock_container, sc_kref);
+	BUG_ON(timer_pending(&sc->sc_idle_timeout));
+
+	sclog(sc, "releasing\n");
+
+	if (sc->sc_sock) {
+		sock_release(sc->sc_sock);
+		sc->sc_sock = NULL;
+	}
+
+	o2nm_undepend_item(&sc->sc_node->nd_item);
+	o2nm_node_put(sc->sc_node);
+	sc->sc_node = NULL;
+
+	o2net_debug_del_sc(sc);
+
+	if (sc->sc_page)
+		__free_page(sc->sc_page);
+	kfree(sc);
+}
+
+static void sc_put(struct o2net_sock_container *sc)
+{
+	sclog(sc, "put\n");
+	kref_put(&sc->sc_kref, sc_kref_release);
+}
+static void sc_get(struct o2net_sock_container *sc)
+{
+	sclog(sc, "get\n");
+	kref_get(&sc->sc_kref);
+}
+static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
+{
+	struct o2net_sock_container *sc, *ret = NULL;
+	struct page *page = NULL;
+	int status = 0;
+
+	page = alloc_page(GFP_NOFS);
+	sc = kzalloc(sizeof(*sc), GFP_NOFS);
+	if (sc == NULL || page == NULL)
+		goto out;
+
+	kref_init(&sc->sc_kref);
+	o2nm_node_get(node);
+	sc->sc_node = node;
+
+	/* pin the node item of the remote node */
+	status = o2nm_depend_item(&node->nd_item);
+	if (status) {
+		mlog_errno(status);
+		o2nm_node_put(node);
+		goto out;
+	}
+	INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
+	INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
+	INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
+	INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
+
+	init_timer(&sc->sc_idle_timeout);
+	sc->sc_idle_timeout.function = o2net_idle_timer;
+	sc->sc_idle_timeout.data = (unsigned long)sc;
+
+	sclog(sc, "alloced\n");
+
+	ret = sc;
+	sc->sc_page = page;
+	o2net_debug_add_sc(sc);
+	sc = NULL;
+	page = NULL;
+
+out:
+	if (page)
+		__free_page(page);
+	kfree(sc);
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
+static void o2net_sc_queue_work(struct o2net_sock_container *sc,
+				struct work_struct *work)
+{
+	sc_get(sc);
+	if (!queue_work(o2net_wq, work))
+		sc_put(sc);
+}
+static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
+					struct delayed_work *work,
+					int delay)
+{
+	sc_get(sc);
+	if (!queue_delayed_work(o2net_wq, work, delay))
+		sc_put(sc);
+}
+static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
+					 struct delayed_work *work)
+{
+	if (cancel_delayed_work(work))
+		sc_put(sc);
+}
+
+static atomic_t o2net_connected_peers = ATOMIC_INIT(0);
+
+int o2net_num_connected_peers(void)
+{
+	return atomic_read(&o2net_connected_peers);
+}
+
+static void o2net_set_nn_state(struct o2net_node *nn,
+			       struct o2net_sock_container *sc,
+			       unsigned valid, int err)
+{
+	int was_valid = nn->nn_sc_valid;
+	int was_err = nn->nn_persistent_error;
+	struct o2net_sock_container *old_sc = nn->nn_sc;
+
+	assert_spin_locked(&nn->nn_lock);
+
+	if (old_sc && !sc)
+		atomic_dec(&o2net_connected_peers);
+	else if (!old_sc && sc)
+		atomic_inc(&o2net_connected_peers);
+
+	/* the node num comparison and single connect/accept path should stop
+	 * an non-null sc from being overwritten with another */
+	BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
+	mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
+	mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
+
+	if (was_valid && !valid && err == 0)
+		err = -ENOTCONN;
+
+	mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
+	     o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
+	     nn->nn_persistent_error, err);
+
+	nn->nn_sc = sc;
+	nn->nn_sc_valid = valid ? 1 : 0;
+	nn->nn_persistent_error = err;
+
+	/* mirrors o2net_tx_can_proceed() */
+	if (nn->nn_persistent_error || nn->nn_sc_valid)
+		wake_up(&nn->nn_sc_wq);
+
+	if (was_valid && !was_err && nn->nn_persistent_error) {
+		o2quo_conn_err(o2net_num_from_nn(nn));
+		queue_delayed_work(o2net_wq, &nn->nn_still_up,
+				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+	}
+
+	if (was_valid && !valid) {
+		if (old_sc)
+			printk(KERN_NOTICE "o2net: No longer connected to "
+				SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
+		o2net_complete_nodes_nsw(nn);
+	}
+
+	if (!was_valid && valid) {
+		o2quo_conn_up(o2net_num_from_nn(nn));
+		cancel_delayed_work(&nn->nn_connect_expired);
+		printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
+		       o2nm_this_node() > sc->sc_node->nd_num ?
+		       "Connected to" : "Accepted connection from",
+		       SC_NODEF_ARGS(sc));
+	}
+
+	/* trigger the connecting worker func as long as we're not valid,
+	 * it will back off if it shouldn't connect.  This can be called
+	 * from node config teardown and so needs to be careful about
+	 * the work queue actually being up. */
+	if (!valid && o2net_wq) {
+		unsigned long delay;
+		/* delay if we're within a RECONNECT_DELAY of the
+		 * last attempt */
+		delay = (nn->nn_last_connect_attempt +
+			 msecs_to_jiffies(o2net_reconnect_delay()))
+			- jiffies;
+		if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
+			delay = 0;
+		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
+		queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+
+		/*
+		 * Delay the expired work after idle timeout.
+		 *
+		 * We might have lots of failed connection attempts that run
+		 * through here but we only cancel the connect_expired work when
+		 * a connection attempt succeeds.  So only the first enqueue of
+		 * the connect_expired work will do anything.  The rest will see
+		 * that it's already queued and do nothing.
+		 */
+		delay += msecs_to_jiffies(o2net_idle_timeout());
+		queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
+	}
+
+	/* keep track of the nn's sc ref for the caller */
+	if ((old_sc == NULL) && sc)
+		sc_get(sc);
+	if (old_sc && (old_sc != sc)) {
+		o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
+		sc_put(old_sc);
+	}
+}
+
+/* see o2net_register_callbacks() */
+static void o2net_data_ready(struct sock *sk)
+{
+	void (*ready)(struct sock *sk);
+
+	read_lock(&sk->sk_callback_lock);
+	if (sk->sk_user_data) {
+		struct o2net_sock_container *sc = sk->sk_user_data;
+		sclog(sc, "data_ready hit\n");
+		o2net_set_data_ready_time(sc);
+		o2net_sc_queue_work(sc, &sc->sc_rx_work);
+		ready = sc->sc_data_ready;
+	} else {
+		ready = sk->sk_data_ready;
+	}
+	read_unlock(&sk->sk_callback_lock);
+
+	ready(sk);
+}
+
+/* see o2net_register_callbacks() */
+static void o2net_state_change(struct sock *sk)
+{
+	void (*state_change)(struct sock *sk);
+	struct o2net_sock_container *sc;
+
+	read_lock(&sk->sk_callback_lock);
+	sc = sk->sk_user_data;
+	if (sc == NULL) {
+		state_change = sk->sk_state_change;
+		goto out;
+	}
+
+	sclog(sc, "state_change to %d\n", sk->sk_state);
+
+	state_change = sc->sc_state_change;
+
+	switch(sk->sk_state) {
+	/* ignore connecting sockets as they make progress */
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+		break;
+	case TCP_ESTABLISHED:
+		o2net_sc_queue_work(sc, &sc->sc_connect_work);
+		break;
+	default:
+		printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+			" shutdown, state %d\n",
+			SC_NODEF_ARGS(sc), sk->sk_state);
+		o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+		break;
+	}
+out:
+	read_unlock(&sk->sk_callback_lock);
+	state_change(sk);
+}
+
+/*
+ * we register callbacks so we can queue work on events before calling
+ * the original callbacks.  our callbacks our careful to test user_data
+ * to discover when they've reaced with o2net_unregister_callbacks().
+ */
+static void o2net_register_callbacks(struct sock *sk,
+				     struct o2net_sock_container *sc)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+
+	/* accepted sockets inherit the old listen socket data ready */
+	if (sk->sk_data_ready == o2net_listen_data_ready) {
+		sk->sk_data_ready = sk->sk_user_data;
+		sk->sk_user_data = NULL;
+	}
+
+	BUG_ON(sk->sk_user_data != NULL);
+	sk->sk_user_data = sc;
+	sc_get(sc);
+
+	sc->sc_data_ready = sk->sk_data_ready;
+	sc->sc_state_change = sk->sk_state_change;
+	sk->sk_data_ready = o2net_data_ready;
+	sk->sk_state_change = o2net_state_change;
+
+	mutex_init(&sc->sc_send_lock);
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int o2net_unregister_callbacks(struct sock *sk,
+			           struct o2net_sock_container *sc)
+{
+	int ret = 0;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_user_data == sc) {
+		ret = 1;
+		sk->sk_user_data = NULL;
+		sk->sk_data_ready = sc->sc_data_ready;
+		sk->sk_state_change = sc->sc_state_change;
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	return ret;
+}
+
+/*
+ * this is a little helper that is called by callers who have seen a problem
+ * with an sc and want to detach it from the nn if someone already hasn't beat
+ * them to it.  if an error is given then the shutdown will be persistent
+ * and pending transmits will be canceled.
+ */
+static void o2net_ensure_shutdown(struct o2net_node *nn,
+			           struct o2net_sock_container *sc,
+				   int err)
+{
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc == sc)
+		o2net_set_nn_state(nn, NULL, 0, err);
+	spin_unlock(&nn->nn_lock);
+}
+
+/*
+ * This work queue function performs the blocking parts of socket shutdown.  A
+ * few paths lead here.  set_nn_state will trigger this callback if it sees an
+ * sc detached from the nn.  state_change will also trigger this callback
+ * directly when it sees errors.  In that case we need to call set_nn_state
+ * ourselves as state_change couldn't get the nn_lock and call set_nn_state
+ * itself.
+ */
+static void o2net_shutdown_sc(struct work_struct *work)
+{
+	struct o2net_sock_container *sc =
+		container_of(work, struct o2net_sock_container,
+			     sc_shutdown_work);
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	sclog(sc, "shutting down\n");
+
+	/* drop the callbacks ref and call shutdown only once */
+	if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
+		/* we shouldn't flush as we're in the thread, the
+		 * races with pending sc work structs are harmless */
+		del_timer_sync(&sc->sc_idle_timeout);
+		o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+		sc_put(sc);
+		kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
+	}
+
+	/* not fatal so failed connects before the other guy has our
+	 * heartbeat can be retried */
+	o2net_ensure_shutdown(nn, sc, 0);
+	sc_put(sc);
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type,
+			     u32 key)
+{
+	int ret = memcmp(&nmh->nh_key, &key, sizeof(key));
+
+	if (ret == 0)
+		ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type));
+
+	return ret;
+}
+
+static struct o2net_msg_handler *
+o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
+			  struct rb_node **ret_parent)
+{
+	struct rb_node **p = &o2net_handler_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct o2net_msg_handler *nmh, *ret = NULL;
+	int cmp;
+
+	while (*p) {
+		parent = *p;
+		nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
+		cmp = o2net_handler_cmp(nmh, msg_type, key);
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else if (cmp > 0)
+			p = &(*p)->rb_right;
+		else {
+			ret = nmh;
+			break;
+		}
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+
+	return ret;
+}
+
+static void o2net_handler_kref_release(struct kref *kref)
+{
+	struct o2net_msg_handler *nmh;
+	nmh = container_of(kref, struct o2net_msg_handler, nh_kref);
+
+	kfree(nmh);
+}
+
+static void o2net_handler_put(struct o2net_msg_handler *nmh)
+{
+	kref_put(&nmh->nh_kref, o2net_handler_kref_release);
+}
+
+/* max_len is protection for the handler func.  incoming messages won't
+ * be given to the handler if their payload is longer than the max. */
+int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
+			   o2net_msg_handler_func *func, void *data,
+			   o2net_post_msg_handler_func *post_func,
+			   struct list_head *unreg_list)
+{
+	struct o2net_msg_handler *nmh = NULL;
+	struct rb_node **p, *parent;
+	int ret = 0;
+
+	if (max_len > O2NET_MAX_PAYLOAD_BYTES) {
+		mlog(0, "max_len for message handler out of range: %u\n",
+			max_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!msg_type) {
+		mlog(0, "no message type provided: %u, %p\n", msg_type, func);
+		ret = -EINVAL;
+		goto out;
+
+	}
+	if (!func) {
+		mlog(0, "no message handler provided: %u, %p\n",
+		       msg_type, func);
+		ret = -EINVAL;
+		goto out;
+	}
+
+       	nmh = kzalloc(sizeof(struct o2net_msg_handler), GFP_NOFS);
+	if (nmh == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	nmh->nh_func = func;
+	nmh->nh_func_data = data;
+	nmh->nh_post_func = post_func;
+	nmh->nh_msg_type = msg_type;
+	nmh->nh_max_len = max_len;
+	nmh->nh_key = key;
+	/* the tree and list get this ref.. they're both removed in
+	 * unregister when this ref is dropped */
+	kref_init(&nmh->nh_kref);
+	INIT_LIST_HEAD(&nmh->nh_unregister_item);
+
+	write_lock(&o2net_handler_lock);
+	if (o2net_handler_tree_lookup(msg_type, key, &p, &parent))
+		ret = -EEXIST;
+	else {
+	        rb_link_node(&nmh->nh_node, parent, p);
+		rb_insert_color(&nmh->nh_node, &o2net_handler_tree);
+		list_add_tail(&nmh->nh_unregister_item, unreg_list);
+
+		mlog(ML_TCP, "registered handler func %p type %u key %08x\n",
+		     func, msg_type, key);
+		/* we've had some trouble with handlers seemingly vanishing. */
+		mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
+							  &parent) == NULL,
+			        "couldn't find handler we *just* registered "
+				"for type %u key %08x\n", msg_type, key);
+	}
+	write_unlock(&o2net_handler_lock);
+	if (ret)
+		goto out;
+
+out:
+	if (ret)
+		kfree(nmh);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_register_handler);
+
+void o2net_unregister_handler_list(struct list_head *list)
+{
+	struct o2net_msg_handler *nmh, *n;
+
+	write_lock(&o2net_handler_lock);
+	list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
+		mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
+		     nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
+		rb_erase(&nmh->nh_node, &o2net_handler_tree);
+		list_del_init(&nmh->nh_unregister_item);
+		kref_put(&nmh->nh_kref, o2net_handler_kref_release);
+	}
+	write_unlock(&o2net_handler_lock);
+}
+EXPORT_SYMBOL_GPL(o2net_unregister_handler_list);
+
+static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
+{
+	struct o2net_msg_handler *nmh;
+
+	read_lock(&o2net_handler_lock);
+	nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL);
+	if (nmh)
+		kref_get(&nmh->nh_kref);
+	read_unlock(&o2net_handler_lock);
+
+	return nmh;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
+{
+	struct kvec vec = { .iov_len = len, .iov_base = data, };
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+	return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
+}
+
+static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
+			      size_t veclen, size_t total)
+{
+	int ret;
+	struct msghdr msg = {.msg_flags = 0,};
+
+	if (sock == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+	if (likely(ret == total))
+		return 0;
+	mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+	if (ret >= 0)
+		ret = -EPIPE; /* should be smarter, I bet */
+out:
+	mlog(0, "returning error: %d\n", ret);
+	return ret;
+}
+
+static void o2net_sendpage(struct o2net_sock_container *sc,
+			   void *kmalloced_virt,
+			   size_t size)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	ssize_t ret;
+
+	while (1) {
+		mutex_lock(&sc->sc_send_lock);
+		ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
+						 virt_to_page(kmalloced_virt),
+						 (long)kmalloced_virt & ~PAGE_MASK,
+						 size, MSG_DONTWAIT);
+		mutex_unlock(&sc->sc_send_lock);
+		if (ret == size)
+			break;
+		if (ret == (ssize_t)-EAGAIN) {
+			mlog(0, "sendpage of size %zu to " SC_NODEF_FMT
+			     " returned EAGAIN\n", size, SC_NODEF_ARGS(sc));
+			cond_resched();
+			continue;
+		}
+		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
+		     " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
+		o2net_ensure_shutdown(nn, sc, 0);
+		break;
+	}
+}
+
+static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key)
+{
+	memset(msg, 0, sizeof(struct o2net_msg));
+	msg->magic = cpu_to_be16(O2NET_MSG_MAGIC);
+	msg->data_len = cpu_to_be16(data_len);
+	msg->msg_type = cpu_to_be16(msg_type);
+	msg->sys_status = cpu_to_be32(O2NET_ERR_NONE);
+	msg->status = 0;
+	msg->key = cpu_to_be32(key);
+}
+
+static int o2net_tx_can_proceed(struct o2net_node *nn,
+			        struct o2net_sock_container **sc_ret,
+				int *error)
+{
+	int ret = 0;
+
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_persistent_error) {
+		ret = 1;
+		*sc_ret = NULL;
+		*error = nn->nn_persistent_error;
+	} else if (nn->nn_sc_valid) {
+		kref_get(&nn->nn_sc->sc_kref);
+
+		ret = 1;
+		*sc_ret = nn->nn_sc;
+		*error = 0;
+	}
+	spin_unlock(&nn->nn_lock);
+
+	return ret;
+}
+
+/* Get a map of all nodes to which this node is currently connected to */
+void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+{
+	struct o2net_sock_container *sc;
+	int node, ret;
+
+	BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+	memset(map, 0, bytes);
+	for (node = 0; node < O2NM_MAX_NODES; ++node) {
+		if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
+			continue;
+		if (!ret) {
+			set_bit(node, map);
+			sc_put(sc);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(o2net_fill_node_map);
+
+int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
+			   size_t caller_veclen, u8 target_node, int *status)
+{
+	int ret = 0;
+	struct o2net_msg *msg = NULL;
+	size_t veclen, caller_bytes = 0;
+	struct kvec *vec = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn = o2net_nn_from_num(target_node);
+	struct o2net_status_wait nsw = {
+		.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
+	};
+	struct o2net_send_tracking nst;
+
+	o2net_init_nst(&nst, msg_type, key, current, target_node);
+
+	if (o2net_wq == NULL) {
+		mlog(0, "attempt to tx without o2netd running\n");
+		ret = -ESRCH;
+		goto out;
+	}
+
+	if (caller_veclen == 0) {
+		mlog(0, "bad kvec array length\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen);
+	if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) {
+		mlog(0, "total payload len %zu too large\n", caller_bytes);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (target_node == o2nm_this_node()) {
+		ret = -ELOOP;
+		goto out;
+	}
+
+	o2net_debug_add_nst(&nst);
+
+	o2net_set_nst_sock_time(&nst);
+
+	wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret));
+	if (ret)
+		goto out;
+
+	o2net_set_nst_sock_container(&nst, sc);
+
+	veclen = caller_veclen + 1;
+	vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
+	if (vec == NULL) {
+		mlog(0, "failed to %zu element kvec!\n", veclen);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC);
+	if (!msg) {
+		mlog(0, "failed to allocate a o2net_msg!\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	o2net_init_msg(msg, caller_bytes, msg_type, key);
+
+	vec[0].iov_len = sizeof(struct o2net_msg);
+	vec[0].iov_base = msg;
+	memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec));
+
+	ret = o2net_prep_nsw(nn, &nsw);
+	if (ret)
+		goto out;
+
+	msg->msg_num = cpu_to_be32(nsw.ns_id);
+	o2net_set_nst_msg_id(&nst, nsw.ns_id);
+
+	o2net_set_nst_send_time(&nst);
+
+	/* finally, convert the message header to network byte-order
+	 * and send */
+	mutex_lock(&sc->sc_send_lock);
+	ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
+				 sizeof(struct o2net_msg) + caller_bytes);
+	mutex_unlock(&sc->sc_send_lock);
+	msglog(msg, "sending returned %d\n", ret);
+	if (ret < 0) {
+		mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
+		goto out;
+	}
+
+	/* wait on other node's handler */
+	o2net_set_nst_status_time(&nst);
+	wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+
+	o2net_update_send_stats(&nst, sc);
+
+	/* Note that we avoid overwriting the callers status return
+	 * variable if a system error was reported on the other
+	 * side. Callers beware. */
+	ret = o2net_sys_err_to_errno(nsw.ns_sys_status);
+	if (status && !ret)
+		*status = nsw.ns_status;
+
+	mlog(0, "woken, returning system status %d, user status %d\n",
+	     ret, nsw.ns_status);
+out:
+	o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
+	if (sc)
+		sc_put(sc);
+	kfree(vec);
+	kfree(msg);
+	o2net_complete_nsw(nn, &nsw, 0, 0, 0);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_send_message_vec);
+
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status)
+{
+	struct kvec vec = {
+		.iov_base = data,
+		.iov_len = len,
+	};
+	return o2net_send_message_vec(msg_type, key, &vec, 1,
+				      target_node, status);
+}
+EXPORT_SYMBOL_GPL(o2net_send_message);
+
+static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr,
+				   enum o2net_system_error syserr, int err)
+{
+	struct kvec vec = {
+		.iov_base = hdr,
+		.iov_len = sizeof(struct o2net_msg),
+	};
+
+	BUG_ON(syserr >= O2NET_ERR_MAX);
+
+	/* leave other fields intact from the incoming message, msg_num
+	 * in particular */
+	hdr->sys_status = cpu_to_be32(syserr);
+	hdr->status = cpu_to_be32(err);
+	hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC);  // twiddle the magic
+	hdr->data_len = 0;
+
+	msglog(hdr, "about to send status magic %d\n", err);
+	/* hdr has been in host byteorder this whole time */
+	return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg));
+}
+
+/* this returns -errno if the header was unknown or too large, etc.
+ * after this is called the buffer us reused for the next message */
+static int o2net_process_message(struct o2net_sock_container *sc,
+				 struct o2net_msg *hdr)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	int ret = 0, handler_status;
+	enum  o2net_system_error syserr;
+	struct o2net_msg_handler *nmh = NULL;
+	void *ret_data = NULL;
+
+	msglog(hdr, "processing message\n");
+
+	o2net_sc_postpone_idle(sc);
+
+	switch(be16_to_cpu(hdr->magic)) {
+		case O2NET_MSG_STATUS_MAGIC:
+			/* special type for returning message status */
+			o2net_complete_nsw(nn, NULL,
+					   be32_to_cpu(hdr->msg_num),
+					   be32_to_cpu(hdr->sys_status),
+					   be32_to_cpu(hdr->status));
+			goto out;
+		case O2NET_MSG_KEEP_REQ_MAGIC:
+			o2net_sendpage(sc, o2net_keep_resp,
+				       sizeof(*o2net_keep_resp));
+			goto out;
+		case O2NET_MSG_KEEP_RESP_MAGIC:
+			goto out;
+		case O2NET_MSG_MAGIC:
+			break;
+		default:
+			msglog(hdr, "bad magic\n");
+			ret = -EINVAL;
+			goto out;
+			break;
+	}
+
+	/* find a handler for it */
+	handler_status = 0;
+	nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type),
+				be32_to_cpu(hdr->key));
+	if (!nmh) {
+		mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
+		     be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
+		syserr = O2NET_ERR_NO_HNDLR;
+		goto out_respond;
+	}
+
+	syserr = O2NET_ERR_NONE;
+
+	if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
+		syserr = O2NET_ERR_OVERFLOW;
+
+	if (syserr != O2NET_ERR_NONE)
+		goto out_respond;
+
+	o2net_set_func_start_time(sc);
+	sc->sc_msg_key = be32_to_cpu(hdr->key);
+	sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
+	handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
+					     be16_to_cpu(hdr->data_len),
+					nmh->nh_func_data, &ret_data);
+	o2net_set_func_stop_time(sc);
+
+	o2net_update_recv_stats(sc);
+
+out_respond:
+	/* this destroys the hdr, so don't use it after this */
+	mutex_lock(&sc->sc_send_lock);
+	ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
+				      handler_status);
+	mutex_unlock(&sc->sc_send_lock);
+	hdr = NULL;
+	mlog(0, "sending handler status %d, syserr %d returned %d\n",
+	     handler_status, syserr, ret);
+
+	if (nmh) {
+		BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL);
+		if (nmh->nh_post_func)
+			(nmh->nh_post_func)(handler_status, nmh->nh_func_data,
+					    ret_data);
+	}
+
+out:
+	if (nmh)
+		o2net_handler_put(nmh);
+	return ret;
+}
+
+static int o2net_check_handshake(struct o2net_sock_container *sc)
+{
+	struct o2net_handshake *hand = page_address(sc->sc_page);
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
+		printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
+		       "protocol version %llu but %llu is required. "
+		       "Disconnecting.\n", SC_NODEF_ARGS(sc),
+		       (unsigned long long)be64_to_cpu(hand->protocol_version),
+		       O2NET_PROTOCOL_VERSION);
+
+		/* don't bother reconnecting if its the wrong version. */
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	/*
+	 * Ensure timeouts are consistent with other nodes, otherwise
+	 * we can end up with one node thinking that the other must be down,
+	 * but isn't. This can ultimately cause corruption.
+	 */
+	if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
+				o2net_idle_timeout()) {
+		printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
+		       "idle timeout of %u ms, but we use %u ms locally. "
+		       "Disconnecting.\n", SC_NODEF_ARGS(sc),
+		       be32_to_cpu(hand->o2net_idle_timeout_ms),
+		       o2net_idle_timeout());
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
+			o2net_keepalive_delay()) {
+		printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
+		       "delay of %u ms, but we use %u ms locally. "
+		       "Disconnecting.\n", SC_NODEF_ARGS(sc),
+		       be32_to_cpu(hand->o2net_keepalive_delay_ms),
+		       o2net_keepalive_delay());
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
+			O2HB_MAX_WRITE_TIMEOUT_MS) {
+		printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
+		       "timeout of %u ms, but we use %u ms locally. "
+		       "Disconnecting.\n", SC_NODEF_ARGS(sc),
+		       be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+		       O2HB_MAX_WRITE_TIMEOUT_MS);
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	sc->sc_handshake_ok = 1;
+
+	spin_lock(&nn->nn_lock);
+	/* set valid and queue the idle timers only if it hasn't been
+	 * shut down already */
+	if (nn->nn_sc == sc) {
+		o2net_sc_reset_idle_timer(sc);
+		atomic_set(&nn->nn_timeout, 0);
+		o2net_set_nn_state(nn, sc, 1, 0);
+	}
+	spin_unlock(&nn->nn_lock);
+
+	/* shift everything up as though it wasn't there */
+	sc->sc_page_off -= sizeof(struct o2net_handshake);
+	if (sc->sc_page_off)
+		memmove(hand, hand + 1, sc->sc_page_off);
+
+	return 0;
+}
+
+/* this demuxes the queued rx bytes into header or payload bits and calls
+ * handlers as each full message is read off the socket.  it returns -error,
+ * == 0 eof, or > 0 for progress made.*/
+static int o2net_advance_rx(struct o2net_sock_container *sc)
+{
+	struct o2net_msg *hdr;
+	int ret = 0;
+	void *data;
+	size_t datalen;
+
+	sclog(sc, "receiving\n");
+	o2net_set_advance_start_time(sc);
+
+	if (unlikely(sc->sc_handshake_ok == 0)) {
+		if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
+			data = page_address(sc->sc_page) + sc->sc_page_off;
+			datalen = sizeof(struct o2net_handshake) - sc->sc_page_off;
+			ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+			if (ret > 0)
+				sc->sc_page_off += ret;
+		}
+
+		if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
+			o2net_check_handshake(sc);
+			if (unlikely(sc->sc_handshake_ok == 0))
+				ret = -EPROTO;
+		}
+		goto out;
+	}
+
+	/* do we need more header? */
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = sizeof(struct o2net_msg) - sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0) {
+			sc->sc_page_off += ret;
+			/* only swab incoming here.. we can
+			 * only get here once as we cross from
+			 * being under to over */
+			if (sc->sc_page_off == sizeof(struct o2net_msg)) {
+				hdr = page_address(sc->sc_page);
+				if (be16_to_cpu(hdr->data_len) >
+				    O2NET_MAX_PAYLOAD_BYTES)
+					ret = -EOVERFLOW;
+			}
+		}
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		/* oof, still don't have a header */
+		goto out;
+	}
+
+	/* this was swabbed above when we first read it */
+	hdr = page_address(sc->sc_page);
+
+	msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
+
+	/* do we need more payload? */
+	if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) {
+		/* need more payload */
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) -
+			  sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0)
+			sc->sc_page_off += ret;
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) {
+		/* we can only get here once, the first time we read
+		 * the payload.. so set ret to progress if the handler
+		 * works out. after calling this the message is toast */
+		ret = o2net_process_message(sc, hdr);
+		if (ret == 0)
+			ret = 1;
+		sc->sc_page_off = 0;
+	}
+
+out:
+	sclog(sc, "ret = %d\n", ret);
+	o2net_set_advance_stop_time(sc);
+	return ret;
+}
+
+/* this work func is triggerd by data ready.  it reads until it can read no
+ * more.  it interprets 0, eof, as fatal.  if data_ready hits while we're doing
+ * our work the work struct will be marked and we'll be called again. */
+static void o2net_rx_until_empty(struct work_struct *work)
+{
+	struct o2net_sock_container *sc =
+		container_of(work, struct o2net_sock_container, sc_rx_work);
+	int ret;
+
+	do {
+		ret = o2net_advance_rx(sc);
+	} while (ret > 0);
+
+	if (ret <= 0 && ret != -EAGAIN) {
+		struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+		sclog(sc, "saw error %d, closing\n", ret);
+		/* not permanent so read failed handshake can retry */
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+
+	sc_put(sc);
+}
+
+static int o2net_set_nodelay(struct socket *sock)
+{
+	int ret, val = 1;
+	mm_segment_t oldfs;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	/*
+	 * Dear unsuspecting programmer,
+	 *
+	 * Don't use sock_setsockopt() for SOL_TCP.  It doesn't check its level
+	 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
+	 * silently turn into SO_DEBUG.
+	 *
+	 * Yours,
+	 * Keeper of hilariously fragile interfaces.
+	 */
+	ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
+				    (char __user *)&val, sizeof(val));
+
+	set_fs(oldfs);
+	return ret;
+}
+
+static int o2net_set_usertimeout(struct socket *sock)
+{
+	int user_timeout = O2NET_TCP_USER_TIMEOUT;
+
+	return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+				(char *)&user_timeout, sizeof(user_timeout));
+}
+
+static void o2net_initialize_handshake(void)
+{
+	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
+		O2HB_MAX_WRITE_TIMEOUT_MS);
+	o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
+	o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
+		o2net_keepalive_delay());
+	o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
+		o2net_reconnect_delay());
+}
+
+/* ------------------------------------------------------------ */
+
+/* called when a connect completes and after a sock is accepted.  the
+ * rx path will see the response and mark the sc valid */
+static void o2net_sc_connect_completed(struct work_struct *work)
+{
+	struct o2net_sock_container *sc =
+		container_of(work, struct o2net_sock_container,
+			     sc_connect_work);
+
+	mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
+              (unsigned long long)O2NET_PROTOCOL_VERSION,
+	      (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
+
+	o2net_initialize_handshake();
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+	sc_put(sc);
+}
+
+/* this is called as a work_struct func. */
+static void o2net_sc_send_keep_req(struct work_struct *work)
+{
+	struct o2net_sock_container *sc =
+		container_of(work, struct o2net_sock_container,
+			     sc_keepalive_work.work);
+
+	o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
+	sc_put(sc);
+}
+
+/* socket shutdown does a del_timer_sync against this as it tears down.
+ * we can't start this timer until we've got to the point in sc buildup
+ * where shutdown is going to be involved */
+static void o2net_idle_timer(unsigned long data)
+{
+	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+#ifdef CONFIG_DEBUG_FS
+	unsigned long msecs = ktime_to_ms(ktime_get()) -
+		ktime_to_ms(sc->sc_tv_timer);
+#else
+	unsigned long msecs = o2net_idle_timeout();
+#endif
+
+	printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
+	       "idle for %lu.%lu secs.\n",
+	       SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000);
+
+	/* idle timerout happen, don't shutdown the connection, but
+	 * make fence decision. Maybe the connection can recover before
+	 * the decision is made.
+	 */
+	atomic_set(&nn->nn_timeout, 1);
+	o2quo_conn_err(o2net_num_from_nn(nn));
+	queue_delayed_work(o2net_wq, &nn->nn_still_up,
+			msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+
+	o2net_sc_reset_idle_timer(sc);
+
+}
+
+static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
+{
+	o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+	o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
+		      msecs_to_jiffies(o2net_keepalive_delay()));
+	o2net_set_sock_timer(sc);
+	mod_timer(&sc->sc_idle_timeout,
+	       jiffies + msecs_to_jiffies(o2net_idle_timeout()));
+}
+
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	/* clear fence decision since the connection recover from timeout*/
+	if (atomic_read(&nn->nn_timeout)) {
+		o2quo_conn_up(o2net_num_from_nn(nn));
+		cancel_delayed_work(&nn->nn_still_up);
+		atomic_set(&nn->nn_timeout, 0);
+	}
+
+	/* Only push out an existing timer */
+	if (timer_pending(&sc->sc_idle_timeout))
+		o2net_sc_reset_idle_timer(sc);
+}
+
+/* this work func is kicked whenever a path sets the nn state which doesn't
+ * have valid set.  This includes seeing hb come up, losing a connection,
+ * having a connect attempt fail, etc. This centralizes the logic which decides
+ * if a connect attempt should be made or if we should give up and all future
+ * transmit attempts should fail */
+static void o2net_start_connect(struct work_struct *work)
+{
+	struct o2net_node *nn =
+		container_of(work, struct o2net_node, nn_connect_work.work);
+	struct o2net_sock_container *sc = NULL;
+	struct o2nm_node *node = NULL, *mynode = NULL;
+	struct socket *sock = NULL;
+	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
+	int ret = 0, stop;
+	unsigned int timeout;
+	unsigned int noio_flag;
+
+	/*
+	 * sock_create allocates the sock with GFP_KERNEL. We must set
+	 * per-process flag PF_MEMALLOC_NOIO so that all allocations done
+	 * by this process are done as if GFP_NOIO was specified. So we
+	 * are not reentering filesystem while doing memory reclaim.
+	 */
+	noio_flag = memalloc_noio_save();
+	/* if we're greater we initiate tx, otherwise we accept */
+	if (o2nm_this_node() <= o2net_num_from_nn(nn))
+		goto out;
+
+	/* watch for racing with tearing a node down */
+	node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
+	if (node == NULL) {
+		ret = 0;
+		goto out;
+	}
+
+	mynode = o2nm_get_node_by_num(o2nm_this_node());
+	if (mynode == NULL) {
+		ret = 0;
+		goto out;
+	}
+
+	spin_lock(&nn->nn_lock);
+	/*
+	 * see if we already have one pending or have given up.
+	 * For nn_timeout, it is set when we close the connection
+	 * because of the idle time out. So it means that we have
+	 * at least connected to that node successfully once,
+	 * now try to connect to it again.
+	 */
+	timeout = atomic_read(&nn->nn_timeout);
+	stop = (nn->nn_sc ||
+		(nn->nn_persistent_error &&
+		(nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
+	spin_unlock(&nn->nn_lock);
+	if (stop)
+		goto out;
+
+	nn->nn_last_connect_attempt = jiffies;
+
+	sc = sc_alloc(node);
+	if (sc == NULL) {
+		mlog(0, "couldn't allocate sc\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		mlog(0, "can't create socket: %d\n", ret);
+		goto out;
+	}
+	sc->sc_sock = sock; /* freed by sc_kref_release */
+
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	myaddr.sin_family = AF_INET;
+	myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
+	myaddr.sin_port = htons(0); /* any port */
+
+	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
+			      sizeof(myaddr));
+	if (ret) {
+		mlog(ML_ERROR, "bind failed with %d at address %pI4\n",
+		     ret, &mynode->nd_ipv4_address);
+		goto out;
+	}
+
+	ret = o2net_set_nodelay(sc->sc_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	ret = o2net_set_usertimeout(sock);
+	if (ret) {
+		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+		goto out;
+	}
+
+	o2net_register_callbacks(sc->sc_sock->sk, sc);
+
+	spin_lock(&nn->nn_lock);
+	/* handshake completion will set nn->nn_sc_valid */
+	o2net_set_nn_state(nn, sc, 0, 0);
+	spin_unlock(&nn->nn_lock);
+
+	remoteaddr.sin_family = AF_INET;
+	remoteaddr.sin_addr.s_addr = node->nd_ipv4_address;
+	remoteaddr.sin_port = node->nd_ipv4_port;
+
+	ret = sc->sc_sock->ops->connect(sc->sc_sock,
+					(struct sockaddr *)&remoteaddr,
+					sizeof(remoteaddr),
+					O_NONBLOCK);
+	if (ret == -EINPROGRESS)
+		ret = 0;
+
+out:
+	if (ret && sc) {
+		printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
+		       " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
+		/* 0 err so that another will be queued and attempted
+		 * from set_nn_state */
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+	if (sc)
+		sc_put(sc);
+	if (node)
+		o2nm_node_put(node);
+	if (mynode)
+		o2nm_node_put(mynode);
+
+	memalloc_noio_restore(noio_flag);
+	return;
+}
+
+static void o2net_connect_expired(struct work_struct *work)
+{
+	struct o2net_node *nn =
+		container_of(work, struct o2net_node, nn_connect_expired.work);
+
+	spin_lock(&nn->nn_lock);
+	if (!nn->nn_sc_valid) {
+		printk(KERN_NOTICE "o2net: No connection established with "
+		       "node %u after %u.%u seconds, check network and"
+		       " cluster configuration.\n",
+		     o2net_num_from_nn(nn),
+		     o2net_idle_timeout() / 1000,
+		     o2net_idle_timeout() % 1000);
+
+		o2net_set_nn_state(nn, NULL, 0, 0);
+	}
+	spin_unlock(&nn->nn_lock);
+}
+
+static void o2net_still_up(struct work_struct *work)
+{
+	struct o2net_node *nn =
+		container_of(work, struct o2net_node, nn_still_up.work);
+
+	o2quo_hb_still_up(o2net_num_from_nn(nn));
+}
+
+/* ------------------------------------------------------------ */
+
+void o2net_disconnect_node(struct o2nm_node *node)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
+
+	/* don't reconnect until it's heartbeating again */
+	spin_lock(&nn->nn_lock);
+	atomic_set(&nn->nn_timeout, 0);
+	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+	spin_unlock(&nn->nn_lock);
+
+	if (o2net_wq) {
+		cancel_delayed_work(&nn->nn_connect_expired);
+		cancel_delayed_work(&nn->nn_connect_work);
+		cancel_delayed_work(&nn->nn_still_up);
+		flush_workqueue(o2net_wq);
+	}
+}
+
+static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
+				  void *data)
+{
+	o2quo_hb_down(node_num);
+
+	if (!node)
+		return;
+
+	if (node_num != o2nm_this_node())
+		o2net_disconnect_node(node);
+
+	BUG_ON(atomic_read(&o2net_connected_peers) < 0);
+}
+
+static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
+				void *data)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node_num);
+
+	o2quo_hb_up(node_num);
+
+	BUG_ON(!node);
+
+	/* ensure an immediate connect attempt */
+	nn->nn_last_connect_attempt = jiffies -
+		(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
+
+	if (node_num != o2nm_this_node()) {
+		/* believe it or not, accept and node hearbeating testing
+		 * can succeed for this node before we got here.. so
+		 * only use set_nn_state to clear the persistent error
+		 * if that hasn't already happened */
+		spin_lock(&nn->nn_lock);
+		atomic_set(&nn->nn_timeout, 0);
+		if (nn->nn_persistent_error)
+			o2net_set_nn_state(nn, NULL, 0, 0);
+		spin_unlock(&nn->nn_lock);
+	}
+}
+
+void o2net_unregister_hb_callbacks(void)
+{
+	o2hb_unregister_callback(NULL, &o2net_hb_up);
+	o2hb_unregister_callback(NULL, &o2net_hb_down);
+}
+
+int o2net_register_hb_callbacks(void)
+{
+	int ret;
+
+	o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB,
+			    o2net_hb_node_down_cb, NULL, O2NET_HB_PRI);
+	o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
+			    o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
+
+	ret = o2hb_register_callback(NULL, &o2net_hb_up);
+	if (ret == 0)
+		ret = o2hb_register_callback(NULL, &o2net_hb_down);
+
+	if (ret)
+		o2net_unregister_hb_callbacks();
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_accept_one(struct socket *sock, int *more)
+{
+	int ret, slen;
+	struct sockaddr_in sin;
+	struct socket *new_sock = NULL;
+	struct o2nm_node *node = NULL;
+	struct o2nm_node *local_node = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn;
+	unsigned int noio_flag;
+
+	/*
+	 * sock_create_lite allocates the sock with GFP_KERNEL. We must set
+	 * per-process flag PF_MEMALLOC_NOIO so that all allocations done
+	 * by this process are done as if GFP_NOIO was specified. So we
+	 * are not reentering filesystem while doing memory reclaim.
+	 */
+	noio_flag = memalloc_noio_save();
+
+	BUG_ON(sock == NULL);
+	*more = 0;
+	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+			       sock->sk->sk_protocol, &new_sock);
+	if (ret)
+		goto out;
+
+	new_sock->type = sock->type;
+	new_sock->ops = sock->ops;
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	if (ret < 0)
+		goto out;
+
+	*more = 1;
+	new_sock->sk->sk_allocation = GFP_ATOMIC;
+
+	ret = o2net_set_nodelay(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	ret = o2net_set_usertimeout(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+		goto out;
+	}
+
+	slen = sizeof(sin);
+	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
+				       &slen, 1);
+	if (ret < 0)
+		goto out;
+
+	node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
+	if (node == NULL) {
+		printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
+		       "node at %pI4:%d\n", &sin.sin_addr.s_addr,
+		       ntohs(sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (o2nm_this_node() >= node->nd_num) {
+		local_node = o2nm_get_node_by_num(o2nm_this_node());
+		if (local_node)
+			printk(KERN_NOTICE "o2net: Unexpected connect attempt "
+					"seen at node '%s' (%u, %pI4:%d) from "
+					"node '%s' (%u, %pI4:%d)\n",
+					local_node->nd_name, local_node->nd_num,
+					&(local_node->nd_ipv4_address),
+					ntohs(local_node->nd_ipv4_port),
+					node->nd_name,
+					node->nd_num, &sin.sin_addr.s_addr,
+					ntohs(sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* this happens all the time when the other node sees our heartbeat
+	 * and tries to connect before we see their heartbeat */
+	if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
+		mlog(ML_CONN, "attempt to connect from node '%s' at "
+		     "%pI4:%d but it isn't heartbeating\n",
+		     node->nd_name, &sin.sin_addr.s_addr,
+		     ntohs(sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nn = o2net_nn_from_num(node->nd_num);
+
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc)
+		ret = -EBUSY;
+	else
+		ret = 0;
+	spin_unlock(&nn->nn_lock);
+	if (ret) {
+		printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
+		       "at %pI4:%d but it already has an open connection\n",
+		       node->nd_name, &sin.sin_addr.s_addr,
+		       ntohs(sin.sin_port));
+		goto out;
+	}
+
+	sc = sc_alloc(node);
+	if (sc == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	sc->sc_sock = new_sock;
+	new_sock = NULL;
+
+	spin_lock(&nn->nn_lock);
+	atomic_set(&nn->nn_timeout, 0);
+	o2net_set_nn_state(nn, sc, 0, 0);
+	spin_unlock(&nn->nn_lock);
+
+	o2net_register_callbacks(sc->sc_sock->sk, sc);
+	o2net_sc_queue_work(sc, &sc->sc_rx_work);
+
+	o2net_initialize_handshake();
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+
+out:
+	if (new_sock)
+		sock_release(new_sock);
+	if (node)
+		o2nm_node_put(node);
+	if (local_node)
+		o2nm_node_put(local_node);
+	if (sc)
+		sc_put(sc);
+
+	memalloc_noio_restore(noio_flag);
+	return ret;
+}
+
+/*
+ * This function is invoked in response to one or more
+ * pending accepts at softIRQ level. We must drain the
+ * entire que before returning.
+ */
+
+static void o2net_accept_many(struct work_struct *work)
+{
+	struct socket *sock = o2net_listen_sock;
+	int	more;
+	int	err;
+
+	/*
+	 * It is critical to note that due to interrupt moderation
+	 * at the network driver level, we can't assume to get a
+	 * softIRQ for every single conn since tcp SYN packets
+	 * can arrive back-to-back, and therefore many pending
+	 * accepts may result in just 1 softIRQ. If we terminate
+	 * the o2net_accept_one() loop upon seeing an err, what happens
+	 * to the rest of the conns in the queue? If no new SYN
+	 * arrives for hours, no softIRQ  will be delivered,
+	 * and the connections will just sit in the queue.
+	 */
+
+	for (;;) {
+		err = o2net_accept_one(sock, &more);
+		if (!more)
+			break;
+		cond_resched();
+	}
+}
+
+static void o2net_listen_data_ready(struct sock *sk)
+{
+	void (*ready)(struct sock *sk);
+
+	read_lock(&sk->sk_callback_lock);
+	ready = sk->sk_user_data;
+	if (ready == NULL) { /* check for teardown race */
+		ready = sk->sk_data_ready;
+		goto out;
+	}
+
+	/* This callback may called twice when a new connection
+	 * is  being established as a child socket inherits everything
+	 * from a parent LISTEN socket, including the data_ready cb of
+	 * the parent. This leads to a hazard. In o2net_accept_one()
+	 * we are still initializing the child socket but have not
+	 * changed the inherited data_ready callback yet when
+	 * data starts arriving.
+	 * We avoid this hazard by checking the state.
+	 * For the listening socket,  the state will be TCP_LISTEN; for the new
+	 * socket, will be  TCP_ESTABLISHED. Also, in this case,
+	 * sk->sk_user_data is not a valid function pointer.
+	 */
+
+	if (sk->sk_state == TCP_LISTEN) {
+		queue_work(o2net_wq, &o2net_listen_work);
+	} else {
+		ready = NULL;
+	}
+
+out:
+	read_unlock(&sk->sk_callback_lock);
+	if (ready != NULL)
+		ready(sk);
+}
+
+static int o2net_open_listening_sock(__be32 addr, __be16 port)
+{
+	struct socket *sock = NULL;
+	int ret;
+	struct sockaddr_in sin = {
+		.sin_family = PF_INET,
+		.sin_addr = { .s_addr = addr },
+		.sin_port = port,
+	};
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
+		goto out;
+	}
+
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data = sock->sk->sk_data_ready;
+	sock->sk->sk_data_ready = o2net_listen_data_ready;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	o2net_listen_sock = sock;
+	INIT_WORK(&o2net_listen_work, o2net_accept_many);
+
+	sock->sk->sk_reuse = SK_CAN_REUSE;
+	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	if (ret < 0) {
+		printk(KERN_ERR "o2net: Error %d while binding socket at "
+		       "%pI4:%u\n", ret, &addr, ntohs(port)); 
+		goto out;
+	}
+
+	ret = sock->ops->listen(sock, 64);
+	if (ret < 0)
+		printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
+		       ret, &addr, ntohs(port));
+
+out:
+	if (ret) {
+		o2net_listen_sock = NULL;
+		if (sock)
+			sock_release(sock);
+	}
+	return ret;
+}
+
+/*
+ * called from node manager when we should bring up our network listening
+ * socket.  node manager handles all the serialization to only call this
+ * once and to match it with o2net_stop_listening().  note,
+ * o2nm_this_node() doesn't work yet as we're being called while it
+ * is being set up.
+ */
+int o2net_start_listening(struct o2nm_node *node)
+{
+	int ret = 0;
+
+	BUG_ON(o2net_wq != NULL);
+	BUG_ON(o2net_listen_sock != NULL);
+
+	mlog(ML_KTHREAD, "starting o2net thread...\n");
+	o2net_wq = create_singlethread_workqueue("o2net");
+	if (o2net_wq == NULL) {
+		mlog(ML_ERROR, "unable to launch o2net thread\n");
+		return -ENOMEM; /* ? */
+	}
+
+	ret = o2net_open_listening_sock(node->nd_ipv4_address,
+					node->nd_ipv4_port);
+	if (ret) {
+		destroy_workqueue(o2net_wq);
+		o2net_wq = NULL;
+	} else
+		o2quo_conn_up(node->nd_num);
+
+	return ret;
+}
+
+/* again, o2nm_this_node() doesn't work here as we're involved in
+ * tearing it down */
+void o2net_stop_listening(struct o2nm_node *node)
+{
+	struct socket *sock = o2net_listen_sock;
+	size_t i;
+
+	BUG_ON(o2net_wq == NULL);
+	BUG_ON(o2net_listen_sock == NULL);
+
+	/* stop the listening socket from generating work */
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_data_ready = sock->sk->sk_user_data;
+	sock->sk->sk_user_data = NULL;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
+		struct o2nm_node *node = o2nm_get_node_by_num(i);
+		if (node) {
+			o2net_disconnect_node(node);
+			o2nm_node_put(node);
+		}
+	}
+
+	/* finish all work and tear down the work queue */
+	mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n");
+	destroy_workqueue(o2net_wq);
+	o2net_wq = NULL;
+
+	sock_release(o2net_listen_sock);
+	o2net_listen_sock = NULL;
+
+	o2quo_conn_err(node->nd_num);
+}
+
+/* ------------------------------------------------------------ */
+
+int o2net_init(void)
+{
+	unsigned long i;
+
+	o2quo_init();
+
+	if (o2net_debugfs_init())
+		goto out;
+
+	o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
+	o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
+	o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
+	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
+		goto out;
+
+	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
+	o2net_hand->connector_id = cpu_to_be64(1);
+
+	o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC);
+	o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC);
+
+	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
+		struct o2net_node *nn = o2net_nn_from_num(i);
+
+		atomic_set(&nn->nn_timeout, 0);
+		spin_lock_init(&nn->nn_lock);
+		INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
+		INIT_DELAYED_WORK(&nn->nn_connect_expired,
+				  o2net_connect_expired);
+		INIT_DELAYED_WORK(&nn->nn_still_up, o2net_still_up);
+		/* until we see hb from a node we'll return einval */
+		nn->nn_persistent_error = -ENOTCONN;
+		init_waitqueue_head(&nn->nn_sc_wq);
+		idr_init(&nn->nn_status_idr);
+		INIT_LIST_HEAD(&nn->nn_status_list);
+	}
+
+	return 0;
+
+out:
+	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
+
+	o2quo_exit();
+	return -ENOMEM;
+}
+
+void o2net_exit(void)
+{
+	o2quo_exit();
+	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
+	o2net_debugfs_exit();
+}
diff --git a/kernel/fs/ocfs2/cluster/tcp.h b/kernel/fs/ocfs2/cluster/tcp.h
new file mode 100644
index 000000000..c571e849f
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/tcp.h
@@ -0,0 +1,155 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_TCP_H
+#define O2CLUSTER_TCP_H
+
+#include <linux/socket.h>
+#ifdef __KERNEL__
+#include <net/sock.h>
+#include <linux/tcp.h>
+#else
+#include <sys/socket.h>
+#endif
+#include <linux/inet.h>
+#include <linux/in.h>
+
+struct o2net_msg
+{
+	__be16 magic;
+	__be16 data_len;
+	__be16 msg_type;
+	__be16 pad1;
+	__be32 sys_status;
+	__be32 status;
+	__be32 key;
+	__be32 msg_num;
+	__u8  buf[0];
+};
+
+typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
+				     void **ret_data);
+typedef void (o2net_post_msg_handler_func)(int status, void *data,
+					   void *ret_data);
+
+#define O2NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(struct o2net_msg))
+
+/* same as hb delay, we're waiting for another node to recognize our hb */
+#define O2NET_RECONNECT_DELAY_MS_DEFAULT	2000
+
+#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT	2000
+#define O2NET_IDLE_TIMEOUT_MS_DEFAULT		30000
+
+#define O2NET_TCP_USER_TIMEOUT			0x7fffffff
+
+/* TODO: figure this out.... */
+static inline int o2net_link_down(int err, struct socket *sock)
+{
+	if (sock) {
+		if (sock->sk->sk_state != TCP_ESTABLISHED &&
+	    	    sock->sk->sk_state != TCP_CLOSE_WAIT)
+			return 1;
+	}
+
+	if (err >= 0)
+		return 0;
+	switch (err) {
+		/* ????????????????????????? */
+		case -ERESTARTSYS:
+		case -EBADF:
+		/* When the server has died, an ICMP port unreachable
+		 * message prompts ECONNREFUSED. */
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+			return 1;
+	}
+	return 0;
+}
+
+enum {
+	O2NET_DRIVER_UNINITED,
+	O2NET_DRIVER_READY,
+};
+
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status);
+int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
+			   size_t veclen, u8 target_node, int *status);
+
+int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
+			   o2net_msg_handler_func *func, void *data,
+			   o2net_post_msg_handler_func *post_func,
+			   struct list_head *unreg_list);
+void o2net_unregister_handler_list(struct list_head *list);
+
+void o2net_fill_node_map(unsigned long *map, unsigned bytes);
+
+struct o2nm_node;
+int o2net_register_hb_callbacks(void);
+void o2net_unregister_hb_callbacks(void);
+int o2net_start_listening(struct o2nm_node *node);
+void o2net_stop_listening(struct o2nm_node *node);
+void o2net_disconnect_node(struct o2nm_node *node);
+int o2net_num_connected_peers(void);
+
+int o2net_init(void);
+void o2net_exit(void);
+
+struct o2net_send_tracking;
+struct o2net_sock_container;
+
+#ifdef CONFIG_DEBUG_FS
+int o2net_debugfs_init(void);
+void o2net_debugfs_exit(void);
+void o2net_debug_add_nst(struct o2net_send_tracking *nst);
+void o2net_debug_del_nst(struct o2net_send_tracking *nst);
+void o2net_debug_add_sc(struct o2net_sock_container *sc);
+void o2net_debug_del_sc(struct o2net_sock_container *sc);
+#else
+static inline int o2net_debugfs_init(void)
+{
+	return 0;
+}
+static inline void o2net_debugfs_exit(void)
+{
+}
+static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+}
+static inline void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+}
+#endif	/* CONFIG_DEBUG_FS */
+
+#endif /* O2CLUSTER_TCP_H */
diff --git a/kernel/fs/ocfs2/cluster/tcp_internal.h b/kernel/fs/ocfs2/cluster/tcp_internal.h
new file mode 100644
index 000000000..b95e7df5b
--- /dev/null
+++ b/kernel/fs/ocfs2/cluster/tcp_internal.h
@@ -0,0 +1,242 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_TCP_INTERNAL_H
+#define O2CLUSTER_TCP_INTERNAL_H
+
+#define O2NET_MSG_MAGIC           ((u16)0xfa55)
+#define O2NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
+#define O2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
+#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
+
+/* we're delaying our quorum decision so that heartbeat will have timed
+ * out truly dead nodes by the time we come around to making decisions
+ * on their number */
+#define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
+/*
+ * This version number represents quite a lot, unfortunately.  It not
+ * only represents the raw network message protocol on the wire but also
+ * locking semantics of the file system using the protocol.  It should
+ * be somewhere else, I'm sure, but right now it isn't.
+ *
+ * With version 11, we separate out the filesystem locking portion.  The
+ * filesystem now has a major.minor version it negotiates.  Version 11
+ * introduces this negotiation to the o2dlm protocol, and as such the
+ * version here in tcp_internal.h should not need to be bumped for
+ * filesystem locking changes.
+ *
+ * New in version 11
+ * 	- Negotiation of filesystem locking in the dlm join.
+ *
+ * New in version 10:
+ * 	- Meta/data locks combined
+ *
+ * New in version 9:
+ * 	- All votes removed
+ *
+ * New in version 8:
+ * 	- Replace delete inode votes with a cluster lock
+ *
+ * New in version 7:
+ * 	- DLM join domain includes the live nodemap
+ *
+ * New in version 6:
+ * 	- DLM lockres remote refcount fixes.
+ *
+ * New in version 5:
+ * 	- Network timeout checking protocol
+ *
+ * New in version 4:
+ * 	- Remove i_generation from lock names for better stat performance.
+ *
+ * New in version 3:
+ * 	- Replace dentry votes with a cluster lock
+ *
+ * New in version 2:
+ * 	- full 64 bit i_size in the metadata lock lvbs
+ * 	- introduction of "rw" lock and pushing meta/data locking down
+ */
+#define O2NET_PROTOCOL_VERSION 11ULL
+struct o2net_handshake {
+	__be64	protocol_version;
+	__be64	connector_id;
+	__be32  o2hb_heartbeat_timeout_ms;
+	__be32  o2net_idle_timeout_ms;
+	__be32  o2net_keepalive_delay_ms;
+	__be32  o2net_reconnect_delay_ms;
+};
+
+struct o2net_node {
+	/* this is never called from int/bh */
+	spinlock_t			nn_lock;
+
+	/* set the moment an sc is allocated and a connect is started */
+	struct o2net_sock_container	*nn_sc;
+	/* _valid is only set after the handshake passes and tx can happen */
+	unsigned			nn_sc_valid:1;
+	/* if this is set tx just returns it */
+	int				nn_persistent_error;
+	/* It is only set to 1 after the idle time out. */
+	atomic_t			nn_timeout;
+
+	/* threads waiting for an sc to arrive wait on the wq for generation
+	 * to increase.  it is increased when a connecting socket succeeds
+	 * or fails or when an accepted socket is attached. */
+	wait_queue_head_t		nn_sc_wq;
+
+	struct idr			nn_status_idr;
+	struct list_head		nn_status_list;
+
+	/* connects are attempted from when heartbeat comes up until either hb
+	 * goes down, the node is unconfigured, or a connect succeeds.
+	 * connect_work is queued from set_nn_state both from hb up and from
+	 * itself if a connect attempt fails and so can be self-arming.
+	 * shutdown is careful to first mark the nn such that no connects will
+	 * be attempted before canceling delayed connect work and flushing the
+	 * queue. */
+	struct delayed_work		nn_connect_work;
+	unsigned long			nn_last_connect_attempt;
+
+	/* this is queued as nodes come up and is canceled when a connection is
+	 * established.  this expiring gives up on the node and errors out
+	 * transmits */
+	struct delayed_work		nn_connect_expired;
+
+	/* after we give up on a socket we wait a while before deciding
+	 * that it is still heartbeating and that we should do some
+	 * quorum work */
+	struct delayed_work		nn_still_up;
+};
+
+struct o2net_sock_container {
+	struct kref		sc_kref;
+	/* the next two are valid for the life time of the sc */
+	struct socket		*sc_sock;
+	struct o2nm_node	*sc_node;
+
+	/* all of these sc work structs hold refs on the sc while they are
+	 * queued.  they should not be able to ref a freed sc.  the teardown
+	 * race is with o2net_wq destruction in o2net_stop_listening() */
+
+	/* rx and connect work are generated from socket callbacks.  sc
+	 * shutdown removes the callbacks and then flushes the work queue */
+	struct work_struct	sc_rx_work;
+	struct work_struct	sc_connect_work;
+	/* shutdown work is triggered in two ways.  the simple way is
+	 * for a code path calls ensure_shutdown which gets a lock, removes
+	 * the sc from the nn, and queues the work.  in this case the
+	 * work is single-shot.  the work is also queued from a sock
+	 * callback, though, and in this case the work will find the sc
+	 * still on the nn and will call ensure_shutdown itself.. this
+	 * ends up triggering the shutdown work again, though nothing
+	 * will be done in that second iteration.  so work queue teardown
+	 * has to be careful to remove the sc from the nn before waiting
+	 * on the work queue so that the shutdown work doesn't remove the
+	 * sc and rearm itself.
+	 */
+	struct work_struct	sc_shutdown_work;
+
+	struct timer_list	sc_idle_timeout;
+	struct delayed_work	sc_keepalive_work;
+
+	unsigned		sc_handshake_ok:1;
+
+	struct page 		*sc_page;
+	size_t			sc_page_off;
+
+	/* original handlers for the sockets */
+	void			(*sc_state_change)(struct sock *sk);
+	void			(*sc_data_ready)(struct sock *sk);
+
+	u32			sc_msg_key;
+	u16			sc_msg_type;
+
+#ifdef CONFIG_DEBUG_FS
+	struct list_head        sc_net_debug_item;
+	ktime_t			sc_tv_timer;
+	ktime_t			sc_tv_data_ready;
+	ktime_t			sc_tv_advance_start;
+	ktime_t			sc_tv_advance_stop;
+	ktime_t			sc_tv_func_start;
+	ktime_t			sc_tv_func_stop;
+#endif
+#ifdef CONFIG_OCFS2_FS_STATS
+	ktime_t			sc_tv_acquiry_total;
+	ktime_t			sc_tv_send_total;
+	ktime_t			sc_tv_status_total;
+	u32			sc_send_count;
+	u32			sc_recv_count;
+	ktime_t			sc_tv_process_total;
+#endif
+	struct mutex		sc_send_lock;
+};
+
+struct o2net_msg_handler {
+	struct rb_node		nh_node;
+	u32			nh_max_len;
+	u32			nh_msg_type;
+	u32			nh_key;
+	o2net_msg_handler_func	*nh_func;
+	o2net_msg_handler_func	*nh_func_data;
+	o2net_post_msg_handler_func
+				*nh_post_func;
+	struct kref		nh_kref;
+	struct list_head	nh_unregister_item;
+};
+
+enum o2net_system_error {
+	O2NET_ERR_NONE = 0,
+	O2NET_ERR_NO_HNDLR,
+	O2NET_ERR_OVERFLOW,
+	O2NET_ERR_DIED,
+	O2NET_ERR_MAX
+};
+
+struct o2net_status_wait {
+	enum o2net_system_error	ns_sys_status;
+	s32			ns_status;
+	int			ns_id;
+	wait_queue_head_t	ns_wq;
+	struct list_head	ns_node_item;
+};
+
+#ifdef CONFIG_DEBUG_FS
+/* just for state dumps */
+struct o2net_send_tracking {
+	struct list_head		st_net_debug_item;
+	struct task_struct		*st_task;
+	struct o2net_sock_container	*st_sc;
+	u32				st_id;
+	u32				st_msg_type;
+	u32				st_msg_key;
+	u8				st_node;
+	ktime_t				st_sock_time;
+	ktime_t				st_send_time;
+	ktime_t				st_status_time;
+};
+#else
+struct o2net_send_tracking {
+	u32	dummy;
+};
+#endif	/* CONFIG_DEBUG_FS */
+
+#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/kernel/fs/ocfs2/dcache.c b/kernel/fs/ocfs2/dcache.c
new file mode 100644
index 000000000..290373024
--- /dev/null
+++ b/kernel/fs/ocfs2/dcache.c
@@ -0,0 +1,474 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.c
+ *
+ * dentry cache handling code
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "ocfs2_trace.h"
+
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+	unsigned long gen =
+		OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen;
+	BUG_ON(d_inode(dentry));
+	dentry->d_fsdata = (void *)gen;
+}
+
+
+static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode;
+	int ret = 0;    /* if all else fails, just return false */
+	struct ocfs2_super *osb;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = d_inode(dentry);
+	osb = OCFS2_SB(dentry->d_sb);
+
+	trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
+				      dentry->d_name.name);
+
+	/* For a negative dentry -
+	 * check the generation number of the parent and compare with the
+	 * one stored in the inode.
+	 */
+	if (inode == NULL) {
+		unsigned long gen = (unsigned long) dentry->d_fsdata;
+		unsigned long pgen;
+		spin_lock(&dentry->d_lock);
+		pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen;
+		spin_unlock(&dentry->d_lock);
+		trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
+						       dentry->d_name.name,
+						       pgen, gen);
+		if (gen != pgen)
+			goto bail;
+		goto valid;
+	}
+
+	BUG_ON(!osb);
+
+	if (inode == osb->root_inode || is_bad_inode(inode))
+		goto bail;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	/* did we or someone else delete this inode? */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		trace_ocfs2_dentry_revalidate_delete(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/*
+	 * We don't need a cluster lock to test this because once an
+	 * inode nlink hits zero, it never goes back.
+	 */
+	if (inode->i_nlink == 0) {
+		trace_ocfs2_dentry_revalidate_orphaned(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			S_ISDIR(inode->i_mode));
+		goto bail;
+	}
+
+	/*
+	 * If the last lookup failed to create dentry lock, let us
+	 * redo it.
+	 */
+	if (!dentry->d_fsdata) {
+		trace_ocfs2_dentry_revalidate_nofsdata(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+		goto bail;
+	}
+
+valid:
+	ret = 1;
+
+bail:
+	trace_ocfs2_dentry_revalidate_ret(ret);
+	return ret;
+}
+
+static int ocfs2_match_dentry(struct dentry *dentry,
+			      u64 parent_blkno,
+			      int skip_unhashed)
+{
+	struct inode *parent;
+
+	/*
+	 * ocfs2_lookup() does a d_splice_alias() _before_ attaching
+	 * to the lock data, so we skip those here, otherwise
+	 * ocfs2_dentry_attach_lock() will get its original dentry
+	 * back.
+	 */
+	if (!dentry->d_fsdata)
+		return 0;
+
+	if (!dentry->d_parent)
+		return 0;
+
+	if (skip_unhashed && d_unhashed(dentry))
+		return 0;
+
+	parent = d_inode(dentry->d_parent);
+	/* Negative parent dentry? */
+	if (!parent)
+		return 0;
+
+	/* Name is in a different directory. */
+	if (OCFS2_I(parent)->ip_blkno != parent_blkno)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Walk the inode alias list, and find a dentry which has a given
+ * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
+ * is looking for a dentry_lock reference. The downconvert thread is
+ * looking to unhash aliases, so we allow it to skip any that already
+ * have that property.
+ */
+struct dentry *ocfs2_find_local_alias(struct inode *inode,
+				      u64 parent_blkno,
+				      int skip_unhashed)
+{
+	struct dentry *dentry;
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
+		spin_lock(&dentry->d_lock);
+		if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
+			trace_ocfs2_find_local_alias(dentry->d_name.len,
+						     dentry->d_name.name);
+
+			dget_dlock(dentry);
+			spin_unlock(&dentry->d_lock);
+			spin_unlock(&inode->i_lock);
+			return dentry;
+		}
+		spin_unlock(&dentry->d_lock);
+	}
+	spin_unlock(&inode->i_lock);
+	return NULL;
+}
+
+DEFINE_SPINLOCK(dentry_attach_lock);
+
+/*
+ * Attach this dentry to a cluster lock.
+ *
+ * Dentry locks cover all links in a given directory to a particular
+ * inode. We do this so that ocfs2 can build a lock name which all
+ * nodes in the cluster can agree on at all times. Shoving full names
+ * in the cluster lock won't work due to size restrictions. Covering
+ * links inside of a directory is a good compromise because it still
+ * allows us to use the parent directory lock to synchronize
+ * operations.
+ *
+ * Call this function with the parent dir semaphore and the parent dir
+ * cluster lock held.
+ *
+ * The dir semaphore will protect us from having to worry about
+ * concurrent processes on our node trying to attach a lock at the
+ * same time.
+ *
+ * The dir cluster lock (held at either PR or EX mode) protects us
+ * from unlink and rename on other nodes.
+ *
+ * A dput() can happen asynchronously due to pruning, so we cover
+ * attaching and detaching the dentry lock with a
+ * dentry_attach_lock.
+ *
+ * A node which has done lookup on a name retains a protected read
+ * lock until final dput. If the user requests and unlink or rename,
+ * the protected read is upgraded to an exclusive lock. Other nodes
+ * who have seen the dentry will then be informed that they need to
+ * downgrade their lock, which will involve d_delete on the
+ * dentry. This happens in ocfs2_dentry_convert_worker().
+ */
+int ocfs2_dentry_attach_lock(struct dentry *dentry,
+			     struct inode *inode,
+			     u64 parent_blkno)
+{
+	int ret;
+	struct dentry *alias;
+	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+	trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
+				       (unsigned long long)parent_blkno, dl);
+
+	/*
+	 * Negative dentry. We ignore these for now.
+	 *
+	 * XXX: Could we can improve ocfs2_dentry_revalidate() by
+	 * tracking these?
+	 */
+	if (!inode)
+		return 0;
+
+	if (d_really_is_negative(dentry) && dentry->d_fsdata) {
+		/* Converting a negative dentry to positive
+		   Clear dentry->d_fsdata */
+		dentry->d_fsdata = dl = NULL;
+	}
+
+	if (dl) {
+		mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
+				" \"%pd\": old parent: %llu, new: %llu\n",
+				dentry,
+				(unsigned long long)parent_blkno,
+				(unsigned long long)dl->dl_parent_blkno);
+		return 0;
+	}
+
+	alias = ocfs2_find_local_alias(inode, parent_blkno, 0);
+	if (alias) {
+		/*
+		 * Great, an alias exists, which means we must have a
+		 * dentry lock already. We can just grab the lock off
+		 * the alias and add it to the list.
+		 *
+		 * We're depending here on the fact that this dentry
+		 * was found and exists in the dcache and so must have
+		 * a reference to the dentry_lock because we can't
+		 * race creates. Final dput() cannot happen on it
+		 * since we have it pinned, so our reference is safe.
+		 */
+		dl = alias->d_fsdata;
+		mlog_bug_on_msg(!dl, "parent %llu, ino %llu\n",
+				(unsigned long long)parent_blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+		mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
+				" \"%pd\": old parent: %llu, new: %llu\n",
+				dentry,
+				(unsigned long long)parent_blkno,
+				(unsigned long long)dl->dl_parent_blkno);
+
+		trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
+				(unsigned long long)parent_blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+		goto out_attach;
+	}
+
+	/*
+	 * There are no other aliases
+	 */
+	dl = kmalloc(sizeof(*dl), GFP_NOFS);
+	if (!dl) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	dl->dl_count = 0;
+	/*
+	 * Does this have to happen below, for all attaches, in case
+	 * the struct inode gets blown away by the downconvert thread?
+	 */
+	dl->dl_inode = igrab(inode);
+	dl->dl_parent_blkno = parent_blkno;
+	ocfs2_dentry_lock_res_init(dl, parent_blkno, inode);
+
+out_attach:
+	spin_lock(&dentry_attach_lock);
+	dentry->d_fsdata = dl;
+	dl->dl_count++;
+	spin_unlock(&dentry_attach_lock);
+
+	/*
+	 * This actually gets us our PRMODE level lock. From now on,
+	 * we'll have a notification if one of these names is
+	 * destroyed on another node.
+	 */
+	ret = ocfs2_dentry_lock(dentry, 0);
+	if (!ret)
+		ocfs2_dentry_unlock(dentry, 0);
+	else
+		mlog_errno(ret);
+
+	/*
+	 * In case of error, manually free the allocation and do the iput().
+	 * We need to do this because error here means no d_instantiate(),
+	 * which means iput() will not be called during dput(dentry).
+	 */
+	if (ret < 0 && !alias) {
+		ocfs2_lock_res_free(&dl->dl_lockres);
+		BUG_ON(dl->dl_count != 1);
+		spin_lock(&dentry_attach_lock);
+		dentry->d_fsdata = NULL;
+		spin_unlock(&dentry_attach_lock);
+		kfree(dl);
+		iput(inode);
+	}
+
+	dput(alias);
+
+	return ret;
+}
+
+/*
+ * ocfs2_dentry_iput() and friends.
+ *
+ * At this point, our particular dentry is detached from the inodes
+ * alias list, so there's no way that the locking code can find it.
+ *
+ * The interesting stuff happens when we determine that our lock needs
+ * to go away because this is the last subdir alias in the
+ * system. This function needs to handle a couple things:
+ *
+ * 1) Synchronizing lock shutdown with the downconvert threads. This
+ *    is already handled for us via the lockres release drop function
+ *    called in ocfs2_release_dentry_lock()
+ *
+ * 2) A race may occur when we're doing our lock shutdown and
+ *    another process wants to create a new dentry lock. Right now we
+ *    let them race, which means that for a very short while, this
+ *    node might have two locks on a lock resource. This should be a
+ *    problem though because one of them is in the process of being
+ *    thrown out.
+ */
+static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
+				   struct ocfs2_dentry_lock *dl)
+{
+	iput(dl->dl_inode);
+	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+	ocfs2_lock_res_free(&dl->dl_lockres);
+	kfree(dl);
+}
+
+void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
+			   struct ocfs2_dentry_lock *dl)
+{
+	int unlock = 0;
+
+	BUG_ON(dl->dl_count == 0);
+
+	spin_lock(&dentry_attach_lock);
+	dl->dl_count--;
+	unlock = !dl->dl_count;
+	spin_unlock(&dentry_attach_lock);
+
+	if (unlock)
+		ocfs2_drop_dentry_lock(osb, dl);
+}
+
+static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
+{
+	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+	if (!dl) {
+		/*
+		 * No dentry lock is ok if we're disconnected or
+		 * unhashed.
+		 */
+		if (!(dentry->d_flags & DCACHE_DISCONNECTED) &&
+		    !d_unhashed(dentry)) {
+			unsigned long long ino = 0ULL;
+			if (inode)
+				ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
+			mlog(ML_ERROR, "Dentry is missing cluster lock. "
+			     "inode: %llu, d_flags: 0x%x, d_name: %pd\n",
+			     ino, dentry->d_flags, dentry);
+		}
+
+		goto out;
+	}
+
+	mlog_bug_on_msg(dl->dl_count == 0, "dentry: %pd, count: %u\n",
+			dentry, dl->dl_count);
+
+	ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl);
+
+out:
+	iput(inode);
+}
+
+/*
+ * d_move(), but keep the locks in sync.
+ *
+ * When we are done, "dentry" will have the parent dir and name of
+ * "target", which will be thrown away.
+ *
+ * We manually update the lock of "dentry" if need be.
+ *
+ * "target" doesn't have it's dentry lock touched - we allow the later
+ * dput() to handle this for us.
+ *
+ * This is called during ocfs2_rename(), while holding parent
+ * directory locks. The dentries have already been deleted on other
+ * nodes via ocfs2_remote_dentry_delete().
+ *
+ * Normally, the VFS handles the d_move() for the file system, after
+ * the ->rename() callback. OCFS2 wants to handle this internally, so
+ * the new lock can be created atomically with respect to the cluster.
+ */
+void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
+		       struct inode *old_dir, struct inode *new_dir)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb);
+	struct inode *inode = d_inode(dentry);
+
+	/*
+	 * Move within the same directory, so the actual lock info won't
+	 * change.
+	 *
+	 * XXX: Is there any advantage to dropping the lock here?
+	 */
+	if (old_dir == new_dir)
+		goto out_move;
+
+	ocfs2_dentry_lock_put(osb, dentry->d_fsdata);
+
+	dentry->d_fsdata = NULL;
+	ret = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(new_dir)->ip_blkno);
+	if (ret)
+		mlog_errno(ret);
+
+out_move:
+	d_move(dentry, target);
+}
+
+const struct dentry_operations ocfs2_dentry_ops = {
+	.d_revalidate		= ocfs2_dentry_revalidate,
+	.d_iput			= ocfs2_dentry_iput,
+};
diff --git a/kernel/fs/ocfs2/dcache.h b/kernel/fs/ocfs2/dcache.h
new file mode 100644
index 000000000..55f58892b
--- /dev/null
+++ b/kernel/fs/ocfs2/dcache.h
@@ -0,0 +1,59 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_DCACHE_H
+#define OCFS2_DCACHE_H
+
+extern const struct dentry_operations ocfs2_dentry_ops;
+
+struct ocfs2_dentry_lock {
+	unsigned int		dl_count;
+	u64			dl_parent_blkno;
+
+	/*
+	 * The ocfs2_dentry_lock keeps an inode reference until
+	 * dl_lockres has been destroyed. This is usually done in
+	 * ->d_iput() anyway, so there should be minimal impact.
+	 */
+	struct inode		*dl_inode;
+	struct ocfs2_lock_res	dl_lockres;
+};
+
+int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
+			     u64 parent_blkno);
+
+void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
+			   struct ocfs2_dentry_lock *dl);
+
+struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
+				      int skip_unhashed);
+
+void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
+		       struct inode *old_dir, struct inode *new_dir);
+
+extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
+
+#endif /* OCFS2_DCACHE_H */
diff --git a/kernel/fs/ocfs2/dir.c b/kernel/fs/ocfs2/dir.c
new file mode 100644
index 000000000..ccd4dcfc3
--- /dev/null
+++ b/kernel/fs/ocfs2/dir.c
@@ -0,0 +1,4501 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c
+ *
+ * Creates, reads, walks and deletes directory-nodes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/quotaops.h>
+#include <linux/sort.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "blockcheck.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+
+static unsigned char ocfs2_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ocfs2_do_extend_dir(struct super_block *sb,
+			       handle_t *handle,
+			       struct inode *dir,
+			       struct buffer_head *parent_fe_bh,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       struct buffer_head **new_bh);
+static int ocfs2_dir_indexed(struct inode *inode);
+
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_supports_dir_trailer(struct inode *dir)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
+}
+
+/*
+ * "new' here refers to the point at which we're creating a new
+ * directory via "mkdir()", but also when we're expanding an inline
+ * directory. In either case, we don't yet have the indexing bit set
+ * on the directory, so the standard checks will fail in when metaecc
+ * is turned off. Only directory-initialization type functions should
+ * use this then. Everything else wants ocfs2_supports_dir_trailer()
+ */
+static int ocfs2_new_dir_wants_trailer(struct inode *dir)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	return ocfs2_meta_ecc(osb) ||
+		ocfs2_supports_indexed_dirs(osb);
+}
+
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+	return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
+
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
+
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+							    void *data)
+{
+	char *p = data;
+
+	p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+	return (struct ocfs2_dir_block_trailer *)p;
+}
+
+/*
+ * XXX: This is executed once on every dirent. We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+				  struct ocfs2_dir_entry *de,
+				  unsigned long offset,
+				  unsigned long blklen)
+{
+	unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
+
+	if (!ocfs2_supports_dir_trailer(dir))
+		return 0;
+
+	if (offset != toff)
+		return 0;
+
+	return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+				   struct buffer_head *bh, u16 rec_len)
+{
+	struct ocfs2_dir_block_trailer *trailer;
+
+	trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+	strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+	trailer->db_compat_rec_len =
+			cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+	trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
+	trailer->db_free_rec_len = cpu_to_le16(rec_len);
+}
+/*
+ * Link an unindexed block with a dir trailer structure into the index free
+ * list. This function will modify dirdata_bh, but assumes you've already
+ * passed it to the journal.
+ */
+static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
+				     struct buffer_head *dx_root_bh,
+				     struct buffer_head *dirdata_bh)
+{
+	int ret;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	trailer->db_free_next = dx_root->dr_free_blk;
+	dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
+{
+	return res->dl_prev_leaf_bh == NULL;
+}
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
+{
+	brelse(res->dl_dx_root_bh);
+	brelse(res->dl_leaf_bh);
+	brelse(res->dl_dx_leaf_bh);
+	brelse(res->dl_prev_leaf_bh);
+}
+
+static int ocfs2_dir_indexed(struct inode *inode)
+{
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
+{
+	return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
+}
+
+/*
+ * Hashing code adapted from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+	__u32	sum = 0;
+	__u32	b0 = buf[0], b1 = buf[1];
+	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
+	int	n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while (--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = msg[i] + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
+				   struct ocfs2_dx_hinfo *hinfo)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	const char	*p;
+	__u32		in[8], buf[4];
+
+	/*
+	 * XXX: Is this really necessary, if the index is never looked
+	 * at by readdir? Is a hash value of '0' a bad idea?
+	 */
+	if ((len == 1 && !strncmp(".", name, 1)) ||
+	    (len == 2 && !strncmp("..", name, 2))) {
+		buf[0] = buf[1] = 0;
+		goto out;
+	}
+
+#ifdef OCFS2_DEBUG_DX_DIRS
+	/*
+	 * This makes it very easy to debug indexing problems. We
+	 * should never allow this to be selected without hand editing
+	 * this file though.
+	 */
+	buf[0] = buf[1] = len;
+	goto out;
+#endif
+
+	memcpy(buf, osb->osb_dx_seed, sizeof(buf));
+
+	p = name;
+	while (len > 0) {
+		str2hashbuf(p, len, in, 4);
+		TEA_transform(buf, in);
+		len -= 16;
+		p += 16;
+	}
+
+out:
+	hinfo->major_hash = buf[0];
+	hinfo->minor_hash = buf[1];
+}
+
+/*
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
+ */
+static int ocfs2_check_dir_entry(struct inode * dir,
+				 struct ocfs2_dir_entry * de,
+				 struct buffer_head * bh,
+				 unsigned long offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = le16_to_cpu(de->rec_len);
+
+	if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
+		error_msg = "rec_len is smaller than minimal";
+	else if (unlikely(rlen % 4 != 0))
+		error_msg = "rec_len % 4 != 0";
+	else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
+		error_msg = "rec_len is too small for name_len";
+	else if (unlikely(
+		 ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
+		error_msg = "directory entry across blocks";
+
+	if (unlikely(error_msg != NULL))
+		mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
+		     "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
+		     offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
+		     de->name_len);
+
+	return error_msg == NULL ? 1 : 0;
+}
+
+static inline int ocfs2_match(int len,
+			      const char * const name,
+			      struct ocfs2_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					char *first_de,
+					unsigned int bytes,
+					struct ocfs2_dir_entry **res_dir)
+{
+	struct ocfs2_dir_entry *de;
+	char *dlimit, *de_buf;
+	int de_len;
+	int ret = 0;
+
+	de_buf = first_de;
+	dlimit = de_buf + bytes;
+
+	while (de_buf < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		de = (struct ocfs2_dir_entry *) de_buf;
+
+		if (de_buf + namelen <= dlimit &&
+		    ocfs2_match(namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+				ret = -1;
+				goto bail;
+			}
+			*res_dir = de;
+			ret = 1;
+			goto bail;
+		}
+
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->rec_len);
+		if (de_len <= 0) {
+			ret = -1;
+			goto bail;
+		}
+
+		de_buf += de_len;
+		offset += de_len;
+	}
+
+bail:
+	trace_ocfs2_search_dirblock(ret);
+	return ret;
+}
+
+static struct buffer_head *ocfs2_find_entry_id(const char *name,
+					       int namelen,
+					       struct inode *dir,
+					       struct ocfs2_dir_entry **res_dir)
+{
+	int ret, found;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_inline_data *data;
+
+	ret = ocfs2_read_inode_block(dir, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	data = &di->id2.i_data;
+
+	found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0,
+				      data->id_data, i_size_read(dir), res_dir);
+	if (found == 1)
+		return di_bh;
+
+	brelse(di_bh);
+out:
+	return NULL;
+}
+
+static int ocfs2_validate_dir_block(struct super_block *sb,
+				    struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(bh, sb);
+
+
+	/*
+	 * We don't validate dirents here, that's handled
+	 * in-place when the code walks them.
+	 */
+	trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 *
+	 * Note that we are safe to call this even if the directory
+	 * doesn't have a trailer.  Filesystems without metaecc will do
+	 * nothing, and filesystems with it will have one.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+	if (rc)
+		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	return rc;
+}
+
+/*
+ * Validate a directory trailer.
+ *
+ * We check the trailer here rather than in ocfs2_validate_dir_block()
+ * because that function doesn't have the inode to test.
+ */
+static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
+{
+	int rc = 0;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
+	if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Invalid dirblock #%llu: "
+			    "signature = %.*s\n",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    trailer->db_signature);
+		goto out;
+	}
+	if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Directory block #%llu has an invalid "
+			    "db_blkno of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+		goto out;
+	}
+	if (le64_to_cpu(trailer->db_parent_dinode) !=
+	    OCFS2_I(dir)->ip_blkno) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Directory block #%llu on dinode "
+			    "#%llu has an invalid parent_dinode "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/*
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+				struct buffer_head **bh, int flags)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+				    ocfs2_validate_dir_block);
+	if (rc) {
+		mlog_errno(rc);
+		goto out;
+	}
+
+	if (!(flags & OCFS2_BH_READAHEAD) &&
+	    ocfs2_supports_dir_trailer(inode)) {
+		rc = ocfs2_check_dir_trailer(inode, tmp);
+		if (rc) {
+			if (!*bh)
+				brelse(tmp);
+			mlog_errno(rc);
+			goto out;
+		}
+	}
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc ? -EIO : 0;
+}
+
+/*
+ * Read the block at 'phys' which belongs to this directory
+ * inode. This function does no virtual->physical block translation -
+ * what's passed in is assumed to be a valid directory block.
+ */
+static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
+				       struct buffer_head **bh)
+{
+	int ret;
+	struct buffer_head *tmp = *bh;
+
+	ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
+			       ocfs2_validate_dir_block);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_supports_dir_trailer(dir)) {
+		ret = ocfs2_check_dir_trailer(dir, tmp);
+		if (ret) {
+			if (!*bh)
+				brelse(tmp);
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!ret && !*bh)
+		*bh = tmp;
+out:
+	return ret;
+}
+
+static int ocfs2_validate_dx_root(struct super_block *sb,
+				  struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_dx_root_block *dx_root;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
+
+	ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
+	if (ret) {
+		mlog(ML_ERROR,
+		     "Checksum failed for dir index root block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return ret;
+	}
+
+	if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
+		ocfs2_error(sb,
+			    "Dir Index Root # %llu has bad signature %.*s",
+			    (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+			    7, dx_root->dr_signature);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
+			      struct buffer_head **dx_root_bh)
+{
+	int ret;
+	u64 blkno = le64_to_cpu(di->i_dx_root);
+	struct buffer_head *tmp = *dx_root_bh;
+
+	ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
+			       ocfs2_validate_dx_root);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!ret && !*dx_root_bh)
+		*dx_root_bh = tmp;
+
+	return ret;
+}
+
+static int ocfs2_validate_dx_leaf(struct super_block *sb,
+				  struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
+	if (ret) {
+		mlog(ML_ERROR,
+		     "Checksum failed for dir index leaf block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return ret;
+	}
+
+	if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
+		ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
+			    7, dx_leaf->dl_signature);
+		return -EROFS;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
+			      struct buffer_head **dx_leaf_bh)
+{
+	int ret;
+	struct buffer_head *tmp = *dx_leaf_bh;
+
+	ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
+			       ocfs2_validate_dx_leaf);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!ret && !*dx_leaf_bh)
+		*dx_leaf_bh = tmp;
+
+	return ret;
+}
+
+/*
+ * Read a series of dx_leaf blocks. This expects all buffer_head
+ * pointers to be NULL on function entry.
+ */
+static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
+				struct buffer_head **dx_leaf_bhs)
+{
+	int ret;
+
+	ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
+				ocfs2_validate_dx_leaf);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
+					       struct inode *dir,
+					       struct ocfs2_dir_entry **res_dir)
+{
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
+	unsigned long start, block, b;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	int nblocks, i, err;
+
+	sb = dir->i_sb;
+
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	start = OCFS2_I(dir)->ip_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+
+				bh = NULL;
+				err = ocfs2_read_dir_block(dir, b++, &bh,
+							   OCFS2_BH_READAHEAD);
+				bh_use[ra_max] = bh;
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
+			/* read error, skip block & hope for the best.
+			 * ocfs2_read_dir_block() has released the bh. */
+			mlog(ML_ERROR, "reading directory %llu, "
+				    "offset %lu\n",
+				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+				    block);
+			goto next;
+		}
+		i = ocfs2_search_dirblock(bh, dir, name, namelen,
+					  block << sb->s_blocksize_bits,
+					  bh->b_data, sb->s_blocksize,
+					  res_dir);
+		if (i == 1) {
+			OCFS2_I(dir)->ip_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse(bh_use[ra_ptr]);
+
+	trace_ocfs2_find_entry_el(ret);
+	return ret;
+}
+
+static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
+				   struct ocfs2_extent_list *el,
+				   u32 major_hash,
+				   u32 *ret_cpos,
+				   u64 *ret_phys_blkno,
+				   unsigned int *ret_clen)
+{
+	int ret = 0, i, found;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
+				      &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "btree tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	found = 0;
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= major_hash) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in btree", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	if (ret_phys_blkno)
+		*ret_phys_blkno = le64_to_cpu(rec->e_blkno);
+	if (ret_cpos)
+		*ret_cpos = le32_to_cpu(rec->e_cpos);
+	if (ret_clen)
+		*ret_clen = le16_to_cpu(rec->e_leaf_clusters);
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Returns the block index, from the start of the cluster which this
+ * hash belongs too.
+ */
+static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+						   u32 minor_hash)
+{
+	return minor_hash & osb->osb_dx_mask;
+}
+
+static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+					  struct ocfs2_dx_hinfo *hinfo)
+{
+	return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
+}
+
+static int ocfs2_dx_dir_lookup(struct inode *inode,
+			       struct ocfs2_extent_list *el,
+			       struct ocfs2_dx_hinfo *hinfo,
+			       u32 *ret_cpos,
+			       u64 *ret_phys_blkno)
+{
+	int ret = 0;
+	unsigned int cend, uninitialized_var(clen);
+	u32 uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	u32 name_hash = hinfo->major_hash;
+
+	ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
+				      &clen);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	cend = cpos + clen;
+	if (name_hash >= cend) {
+		/* We want the last cluster */
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
+		cpos += clen - 1;
+	} else {
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb,
+						  name_hash - cpos);
+		cpos = name_hash;
+	}
+
+	/*
+	 * We now have the cluster which should hold our entry. To
+	 * find the exact block from the start of the cluster to
+	 * search, we take the lower bits of the hash.
+	 */
+	blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
+
+	if (ret_phys_blkno)
+		*ret_phys_blkno = blkno;
+	if (ret_cpos)
+		*ret_cpos = cpos;
+
+out:
+
+	return ret;
+}
+
+static int ocfs2_dx_dir_search(const char *name, int namelen,
+			       struct inode *dir,
+			       struct ocfs2_dx_root_block *dx_root,
+			       struct ocfs2_dir_lookup_result *res)
+{
+	int ret, i, found;
+	u64 uninitialized_var(phys);
+	struct buffer_head *dx_leaf_bh = NULL;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = NULL;
+	struct buffer_head *dir_ent_bh = NULL;
+	struct ocfs2_dir_entry *dir_ent = NULL;
+	struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
+	struct ocfs2_extent_list *dr_el;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
+
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+		goto search;
+	}
+
+	dr_el = &dx_root->dr_list;
+
+	ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				  namelen, name, hinfo->major_hash,
+				  hinfo->minor_hash, (unsigned long long)phys);
+
+	ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
+
+	trace_ocfs2_dx_dir_search_leaf_info(
+			le16_to_cpu(dx_leaf->dl_list.de_num_used),
+			le16_to_cpu(dx_leaf->dl_list.de_count));
+
+	entry_list = &dx_leaf->dl_list;
+
+search:
+	/*
+	 * Empty leaf is legal, so no need to check for that.
+	 */
+	found = 0;
+	for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+		dx_entry = &entry_list->de_entries[i];
+
+		if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
+		    || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
+			continue;
+
+		/*
+		 * Search unindexed leaf block now. We're not
+		 * guaranteed to find anything.
+		 */
+		ret = ocfs2_read_dir_block_direct(dir,
+					  le64_to_cpu(dx_entry->dx_dirent_blk),
+					  &dir_ent_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * XXX: We should check the unindexed block here,
+		 * before using it.
+		 */
+
+		found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
+					      0, dir_ent_bh->b_data,
+					      dir->i_sb->s_blocksize, &dir_ent);
+		if (found == 1)
+			break;
+
+		if (found == -1) {
+			/* This means we found a bad directory entry. */
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		brelse(dir_ent_bh);
+		dir_ent_bh = NULL;
+	}
+
+	if (found <= 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	res->dl_leaf_bh = dir_ent_bh;
+	res->dl_entry = dir_ent;
+	res->dl_dx_leaf_bh = dx_leaf_bh;
+	res->dl_dx_entry = dx_entry;
+
+	ret = 0;
+out:
+	if (ret) {
+		brelse(dx_leaf_bh);
+		brelse(dir_ent_bh);
+	}
+	return ret;
+}
+
+static int ocfs2_find_entry_dx(const char *name, int namelen,
+			       struct inode *dir,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+
+	ret = ocfs2_read_inode_block(dir, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
+	if (ret) {
+		if (ret != -ENOENT)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	lookup->dl_dx_root_bh = dx_root_bh;
+	dx_root_bh = NULL;
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
+/*
+ * Try to find an entry of the provided name within 'dir'.
+ *
+ * If nothing was found, -ENOENT is returned. Otherwise, zero is
+ * returned and the struct 'res' will contain information useful to
+ * other directory manipulation functions.
+ *
+ * Caller can NOT assume anything about the contents of the
+ * buffer_heads - they are passed back only so that it can be passed
+ * into any one of the manipulation functions (add entry, delete
+ * entry, etc). As an example, bh in the extent directory case is a
+ * data block, in the inline-data case it actually points to an inode,
+ * in the indexed directory case, multiple buffers are involved.
+ */
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
+{
+	struct buffer_head *bh;
+	struct ocfs2_dir_entry *res_dir = NULL;
+
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_find_entry_dx(name, namelen, dir, lookup);
+
+	/*
+	 * The unindexed dir code only uses part of the lookup
+	 * structure, so there's no reason to push it down further
+	 * than this.
+	 */
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
+	else
+		bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
+
+	if (bh == NULL)
+		return -ENOENT;
+
+	lookup->dl_leaf_bh = bh;
+	lookup->dl_entry = res_dir;
+	return 0;
+}
+
+/*
+ * Update inode number and type of a previously found directory entry.
+ */
+int ocfs2_update_entry(struct inode *dir, handle_t *handle,
+		       struct ocfs2_dir_lookup_result *res,
+		       struct inode *new_entry_inode)
+{
+	int ret;
+	ocfs2_journal_access_func access = ocfs2_journal_access_db;
+	struct ocfs2_dir_entry *de = res->dl_entry;
+	struct buffer_head *de_bh = res->dl_leaf_bh;
+
+	/*
+	 * The same code works fine for both inline-data and extent
+	 * based directories, so no need to split this up.  The only
+	 * difference is the journal_access function.
+	 */
+
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		access = ocfs2_journal_access_di;
+
+	ret = access(handle, INODE_CACHE(dir), de_bh,
+		     OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
+	ocfs2_set_de_type(de, new_entry_inode->i_mode);
+
+	ocfs2_journal_dirty(handle, de_bh);
+
+out:
+	return ret;
+}
+
+/*
+ * __ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
+				struct ocfs2_dir_entry *de_del,
+				struct buffer_head *bh, char *first_de,
+				unsigned int bytes)
+{
+	struct ocfs2_dir_entry *de, *pde;
+	int i, status = -ENOENT;
+	ocfs2_journal_access_func access = ocfs2_journal_access_db;
+
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		access = ocfs2_journal_access_di;
+
+	i = 0;
+	pde = NULL;
+	de = (struct ocfs2_dir_entry *) first_de;
+	while (i < bytes) {
+		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+			status = -EIO;
+			mlog_errno(status);
+			goto bail;
+		}
+		if (de == de_del)  {
+			status = access(handle, INODE_CACHE(dir), bh,
+					OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			if (pde)
+				le16_add_cpu(&pde->rec_len,
+						le16_to_cpu(de->rec_len));
+			de->inode = 0;
+			dir->i_version++;
+			ocfs2_journal_dirty(handle, bh);
+			goto bail;
+		}
+		i += le16_to_cpu(de->rec_len);
+		pde = de;
+		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
+	}
+bail:
+	return status;
+}
+
+static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
+{
+	unsigned int hole;
+
+	if (le64_to_cpu(de->inode) == 0)
+		hole = le16_to_cpu(de->rec_len);
+	else
+		hole = le16_to_cpu(de->rec_len) -
+			OCFS2_DIR_REC_LEN(de->name_len);
+
+	return hole;
+}
+
+static int ocfs2_find_max_rec_len(struct super_block *sb,
+				  struct buffer_head *dirblock_bh)
+{
+	int size, this_hole, largest_hole = 0;
+	char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
+	struct ocfs2_dir_entry *de;
+
+	trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
+	size = ocfs2_dir_trailer_blk_off(sb);
+	limit = start + size;
+	de_buf = start;
+	de = (struct ocfs2_dir_entry *)de_buf;
+	do {
+		if (de_buf != trailer) {
+			this_hole = ocfs2_figure_dirent_hole(de);
+			if (this_hole > largest_hole)
+				largest_hole = this_hole;
+		}
+
+		de_buf += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)de_buf;
+	} while (de_buf < limit);
+
+	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+		return largest_hole;
+	return 0;
+}
+
+static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
+				       int index)
+{
+	int num_used = le16_to_cpu(entry_list->de_num_used);
+
+	if (num_used == 1 || index == (num_used - 1))
+		goto clear;
+
+	memmove(&entry_list->de_entries[index],
+		&entry_list->de_entries[index + 1],
+		(num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
+clear:
+	num_used--;
+	memset(&entry_list->de_entries[num_used], 0,
+	       sizeof(struct ocfs2_dx_entry));
+	entry_list->de_num_used = cpu_to_le16(num_used);
+}
+
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+				 struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, index, max_rec_len, add_to_free_list = 0;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+	struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+	struct ocfs2_dir_block_trailer *trailer;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * This function gets a bit messy because we might have to
+	 * modify the root block, regardless of whether the indexed
+	 * entries are stored inline.
+	 */
+
+	/*
+	 * *Only* set 'entry_list' here, based on where we're looking
+	 * for the indexed entries. Later, we might still want to
+	 * journal both blocks, based on free list state.
+	 */
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+	} else {
+		dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+		entry_list = &dx_leaf->dl_list;
+	}
+
+	/* Neither of these are a disk corruption - that should have
+	 * been caught by lookup, before we got here. */
+	BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+	BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+
+	index = (char *)dx_entry - (char *)entry_list->de_entries;
+	index /= sizeof(*dx_entry);
+
+	if (index >= le16_to_cpu(entry_list->de_num_used)) {
+		mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+		     entry_list, dx_entry);
+		return -EIO;
+	}
+
+	/*
+	 * We know that removal of this dirent will leave enough room
+	 * for a new one, so add this block to the free list if it
+	 * isn't already there.
+	 */
+	trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+	if (trailer->db_free_rec_len == 0)
+		add_to_free_list = 1;
+
+	/*
+	 * Add the block holding our index into the journal before
+	 * removing the unindexed entry. If we get an error return
+	 * from __ocfs2_delete_entry(), then it hasn't removed the
+	 * entry yet. Likewise, successful return means we *must*
+	 * remove the indexed entry.
+	 *
+	 * We're also careful to journal the root tree block here as
+	 * the entry count needs to be updated. Also, we might be
+	 * adding to the start of the free list.
+	 */
+	ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!ocfs2_dx_root_inline(dx_root)) {
+		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+					      lookup->dl_dx_leaf_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				    index);
+
+	ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+				   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+	trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+	if (add_to_free_list) {
+		trailer->db_free_next = dx_root->dr_free_blk;
+		dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+		ocfs2_journal_dirty(handle, dx_root_bh);
+	}
+
+	/* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	le32_add_cpu(&dx_root->dr_num_entries, -1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+	ocfs2_dx_list_remove_entry(entry_list, index);
+
+	if (!ocfs2_dx_root_inline(dx_root))
+		ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+
+out:
+	return ret;
+}
+
+static inline int ocfs2_delete_entry_id(handle_t *handle,
+					struct inode *dir,
+					struct ocfs2_dir_entry *de_del,
+					struct buffer_head *bh)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_inline_data *data;
+
+	ret = ocfs2_read_inode_block(dir, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	data = &di->id2.i_data;
+
+	ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data,
+				   i_size_read(dir));
+
+	brelse(di_bh);
+out:
+	return ret;
+}
+
+static inline int ocfs2_delete_entry_el(handle_t *handle,
+					struct inode *dir,
+					struct ocfs2_dir_entry *de_del,
+					struct buffer_head *bh)
+{
+	return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data,
+				    bh->b_size);
+}
+
+/*
+ * Delete a directory entry. Hide the details of directory
+ * implementation from the caller.
+ */
+int ocfs2_delete_entry(handle_t *handle,
+		       struct inode *dir,
+		       struct ocfs2_dir_lookup_result *res)
+{
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_delete_entry_dx(handle, dir, res);
+
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+					     res->dl_leaf_bh);
+
+	return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+				     res->dl_leaf_bh);
+}
+
+/*
+ * Check whether 'de' has enough room to hold an entry of
+ * 'new_rec_len' bytes.
+ */
+static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
+					 unsigned int new_rec_len)
+{
+	unsigned int de_really_used;
+
+	/* Check whether this is an empty record with enough space */
+	if (le64_to_cpu(de->inode) == 0 &&
+	    le16_to_cpu(de->rec_len) >= new_rec_len)
+		return 1;
+
+	/*
+	 * Record might have free space at the end which we can
+	 * use.
+	 */
+	de_really_used = OCFS2_DIR_REC_LEN(de->name_len);
+	if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len))
+	    return 1;
+
+	return 0;
+}
+
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+					  struct ocfs2_dx_entry *dx_new_entry)
+{
+	int i;
+
+	i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+	dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
+
+	le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
+}
+
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+				       struct ocfs2_dx_hinfo *hinfo,
+				       u64 dirent_blk)
+{
+	int i;
+	struct ocfs2_dx_entry *dx_entry;
+
+	i = le16_to_cpu(entry_list->de_num_used);
+	dx_entry = &entry_list->de_entries[i];
+
+	memset(dx_entry, 0, sizeof(*dx_entry));
+	dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+	dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+	dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+
+	le16_add_cpu(&entry_list->de_num_used, 1);
+}
+
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+				      struct ocfs2_dx_hinfo *hinfo,
+				      u64 dirent_blk,
+				      struct buffer_head *dx_leaf_bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf;
+
+	ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+	ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
+	ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+					struct ocfs2_dx_hinfo *hinfo,
+					u64 dirent_blk,
+					struct ocfs2_dx_root_block *dx_root)
+{
+	ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
+}
+
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret = 0;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+	ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		ocfs2_dx_inline_root_insert(dir, handle,
+					    &lookup->dl_hinfo,
+					    lookup->dl_leaf_bh->b_blocknr,
+					    dx_root);
+	} else {
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
+						 lookup->dl_leaf_bh->b_blocknr,
+						 lookup->dl_dx_leaf_bh);
+		if (ret)
+			goto out;
+	}
+
+	le32_add_cpu(&dx_root->dr_num_entries, 1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+				       handle_t *handle,
+				       struct ocfs2_dir_lookup_result *lookup)
+{
+	struct ocfs2_dir_block_trailer *trailer, *prev;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *bh;
+
+	trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+
+	if (ocfs2_free_list_at_root(lookup)) {
+		bh = lookup->dl_dx_root_bh;
+		dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
+		dx_root->dr_free_blk = trailer->db_free_next;
+	} else {
+		bh = lookup->dl_prev_leaf_bh;
+		prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
+		prev->db_free_next = trailer->db_free_next;
+	}
+
+	trailer->db_free_rec_len = cpu_to_le16(0);
+	trailer->db_free_next = cpu_to_le64(0);
+
+	ocfs2_journal_dirty(handle, bh);
+	ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+}
+
+/*
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+				   struct ocfs2_dir_lookup_result *lookup)
+{
+	int max_rec_len;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	/* Walk dl_leaf_bh to figure out what the new free rec_len is. */
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
+	if (max_rec_len) {
+		/*
+		 * There's still room in this block, so no need to remove it
+		 * from the free list. In this case, we just want to update
+		 * the rec len accounting.
+		 */
+		trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+		trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+		ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+	} else {
+		ocfs2_remove_block_from_free_list(dir, handle, lookup);
+	}
+}
+
+/* we don't always have a dentry for what we want to add, so people
+ * like orphan dir can call this instead.
+ *
+ * The lookup context must have been filled from
+ * ocfs2_prepare_dir_for_insert.
+ */
+int __ocfs2_add_entry(handle_t *handle,
+		      struct inode *dir,
+		      const char *name, int namelen,
+		      struct inode *inode, u64 blkno,
+		      struct buffer_head *parent_fe_bh,
+		      struct ocfs2_dir_lookup_result *lookup)
+{
+	unsigned long offset;
+	unsigned short rec_len;
+	struct ocfs2_dir_entry *de, *de1;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+	struct super_block *sb = dir->i_sb;
+	int retval, status;
+	unsigned int size = sb->s_blocksize;
+	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
+	char *data_start = insert_bh->b_data;
+
+	if (!namelen)
+		return -EINVAL;
+
+	if (ocfs2_dir_indexed(dir)) {
+		struct buffer_head *bh;
+
+		/*
+		 * An indexed dir may require that we update the free space
+		 * list. Reserve a write to the previous node in the list so
+		 * that we don't fail later.
+		 *
+		 * XXX: This can be either a dx_root_block, or an unindexed
+		 * directory tree leaf block.
+		 */
+		if (ocfs2_free_list_at_root(lookup)) {
+			bh = lookup->dl_dx_root_bh;
+			retval = ocfs2_journal_access_dr(handle,
+						 INODE_CACHE(dir), bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		} else {
+			bh = lookup->dl_prev_leaf_bh;
+			retval = ocfs2_journal_access_db(handle,
+						 INODE_CACHE(dir), bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		}
+		if (retval) {
+			mlog_errno(retval);
+			return retval;
+		}
+	} else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		data_start = di->id2.i_data.id_data;
+		size = i_size_read(dir);
+
+		BUG_ON(insert_bh != parent_fe_bh);
+	}
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) data_start;
+	while (1) {
+		BUG_ON((char *)de >= (size + data_start));
+
+		/* These checks should've already been passed by the
+		 * prepare function, but I guess we can leave them
+		 * here anyway. */
+		if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
+			retval = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			retval = -EEXIST;
+			goto bail;
+		}
+
+		/* We're guaranteed that we should have space, so we
+		 * can't possibly have hit the trailer...right? */
+		mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+				"Hit dir trailer trying to insert %.*s "
+			        "(namelen %d) into directory %llu.  "
+				"offset is %lu, trailer offset is %d\n",
+				namelen, name, namelen,
+				(unsigned long long)parent_fe_bh->b_blocknr,
+				offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
+		if (ocfs2_dirent_would_fit(de, rec_len)) {
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+			if (retval < 0) {
+				mlog_errno(retval);
+				goto bail;
+			}
+
+			if (insert_bh == parent_fe_bh)
+				status = ocfs2_journal_access_di(handle,
+								 INODE_CACHE(dir),
+								 insert_bh,
+								 OCFS2_JOURNAL_ACCESS_WRITE);
+			else {
+				status = ocfs2_journal_access_db(handle,
+								 INODE_CACHE(dir),
+								 insert_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+
+				if (ocfs2_dir_indexed(dir)) {
+					status = ocfs2_dx_dir_insert(dir,
+								handle,
+								lookup);
+					if (status) {
+						mlog_errno(status);
+						goto bail;
+					}
+				}
+			}
+
+			/* By now the buffer is marked for journaling */
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				de1 = (struct ocfs2_dir_entry *)((char *) de +
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de1->rec_len =
+					cpu_to_le16(le16_to_cpu(de->rec_len) -
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+				de = de1;
+			}
+			de->file_type = OCFS2_FT_UNKNOWN;
+			if (blkno) {
+				de->inode = cpu_to_le64(blkno);
+				ocfs2_set_de_type(de, inode->i_mode);
+			} else
+				de->inode = 0;
+			de->name_len = namelen;
+			memcpy(de->name, name, namelen);
+
+			if (ocfs2_dir_indexed(dir))
+				ocfs2_recalc_free_list(dir, handle, lookup);
+
+			dir->i_version++;
+			ocfs2_journal_dirty(handle, insert_bh);
+			retval = 0;
+			goto bail;
+		}
+
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	/* when you think about it, the assert above should prevent us
+	 * from ever getting here. */
+	retval = -ENOSPC;
+bail:
+	if (retval)
+		mlog_errno(retval);
+
+	return retval;
+}
+
+static int ocfs2_dir_foreach_blk_id(struct inode *inode,
+				    u64 *f_version,
+				    struct dir_context *ctx)
+{
+	int ret, i;
+	unsigned long offset = ctx->pos;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_inline_data *data;
+	struct ocfs2_dir_entry *de;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	data = &di->id2.i_data;
+
+	while (ctx->pos < i_size_read(inode)) {
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (*f_version != inode->i_version) {
+			for (i = 0; i < i_size_read(inode) && i < offset; ) {
+				de = (struct ocfs2_dir_entry *)
+					(data->id_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+				    OCFS2_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			ctx->pos = offset = i;
+			*f_version = inode->i_version;
+		}
+
+		de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
+		if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
+			/* On error, skip the f_pos to the end. */
+			ctx->pos = i_size_read(inode);
+			break;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		if (le64_to_cpu(de->inode)) {
+			unsigned char d_type = DT_UNKNOWN;
+
+			if (de->file_type < OCFS2_FT_MAX)
+				d_type = ocfs2_filetype_table[de->file_type];
+
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le64_to_cpu(de->inode), d_type))
+				goto out;
+		}
+		ctx->pos += le16_to_cpu(de->rec_len);
+	}
+out:
+	brelse(di_bh);
+	return 0;
+}
+
+/*
+ * NOTE: This function can be called against unindexed directories,
+ * and indexed ones.
+ */
+static int ocfs2_dir_foreach_blk_el(struct inode *inode,
+				    u64 *f_version,
+				    struct dir_context *ctx,
+				    bool persist)
+{
+	unsigned long offset, blk, last_ra_blk = 0;
+	int i;
+	struct buffer_head * bh, * tmp;
+	struct ocfs2_dir_entry * de;
+	struct super_block * sb = inode->i_sb;
+	unsigned int ra_sectors = 16;
+	int stored = 0;
+
+	bh = NULL;
+
+	offset = ctx->pos & (sb->s_blocksize - 1);
+
+	while (ctx->pos < i_size_read(inode)) {
+		blk = ctx->pos >> sb->s_blocksize_bits;
+		if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+			/* Skip the corrupt dirblock and keep trying */
+			ctx->pos += sb->s_blocksize - offset;
+			continue;
+		}
+
+		/* The idea here is to begin with 8k read-ahead and to stay
+		 * 4k ahead of our current position.
+		 *
+		 * TODO: Use the pagecache for this. We just need to
+		 * make sure it's cluster-safe... */
+		if (!last_ra_blk
+		    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
+			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
+			     i > 0; i--) {
+				tmp = NULL;
+				if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+							  OCFS2_BH_READAHEAD))
+					brelse(tmp);
+			}
+			last_ra_blk = blk;
+			ra_sectors = 8;
+		}
+
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (*f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ocfs2_dir_entry *) (bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+				    OCFS2_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			offset = i;
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
+				| offset;
+			*f_version = inode->i_version;
+		}
+
+		while (ctx->pos < i_size_read(inode)
+		       && offset < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
+			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+				/* On error, skip the f_pos to the
+				   next block. */
+				ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
+				brelse(bh);
+				continue;
+			}
+			if (le64_to_cpu(de->inode)) {
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (de->file_type < OCFS2_FT_MAX)
+					d_type = ocfs2_filetype_table[de->file_type];
+				if (!dir_emit(ctx, de->name,
+						de->name_len,
+						le64_to_cpu(de->inode),
+						d_type)) {
+					brelse(bh);
+					return 0;
+				}
+				stored++;
+			}
+			offset += le16_to_cpu(de->rec_len);
+			ctx->pos += le16_to_cpu(de->rec_len);
+		}
+		offset = 0;
+		brelse(bh);
+		bh = NULL;
+		if (!persist && stored)
+			break;
+	}
+	return 0;
+}
+
+static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
+				 struct dir_context *ctx,
+				 bool persist)
+{
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
+	return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
+}
+
+/*
+ * This is intended to be called from inside other kernel functions,
+ * so we fake some arguments.
+ */
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
+{
+	u64 version = inode->i_version;
+	ocfs2_dir_foreach_blk(inode, &version, ctx, true);
+	return 0;
+}
+
+/*
+ * ocfs2_readdir()
+ *
+ */
+int ocfs2_readdir(struct file *file, struct dir_context *ctx)
+{
+	int error = 0;
+	struct inode *inode = file_inode(file);
+	int lock_level = 0;
+
+	trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
+	if (lock_level && error >= 0) {
+		/* We release EX lock which used to update atime
+		 * and get PR lock again to reduce contention
+		 * on commonly accessed directories. */
+		ocfs2_inode_unlock(inode, 1);
+		lock_level = 0;
+		error = ocfs2_inode_lock(inode, NULL, 0);
+	}
+	if (error < 0) {
+		if (error != -ENOENT)
+			mlog_errno(error);
+		/* we haven't got any yet, so propagate the error. */
+		goto bail_nolock;
+	}
+
+	error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
+
+	ocfs2_inode_unlock(inode, lock_level);
+	if (error)
+		mlog_errno(error);
+
+bail_nolock:
+
+	return error;
+}
+
+/*
+ * NOTE: this should always be called with parent dir i_mutex taken.
+ */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct ocfs2_dir_lookup_result *lookup)
+{
+	int status = -ENOENT;
+
+	trace_ocfs2_find_files_on_disk(namelen, name, blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_find_entry(name, namelen, inode, lookup);
+	if (status)
+		goto leave;
+
+	*blkno = le64_to_cpu(lookup->dl_entry->inode);
+
+	status = 0;
+leave:
+
+	return status;
+}
+
+/*
+ * Convenience function for callers which just want the block number
+ * mapped to a name and don't require the full dirent info, etc.
+ */
+int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
+			       int namelen, u64 *blkno)
+{
+	int ret;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
+	ocfs2_free_dir_lookup_result(&lookup);
+
+	return ret;
+}
+
+/* Check for a name within a directory.
+ *
+ * Return 0 if the name does not exist
+ * Return -EEXIST if the directory contains the name
+ *
+ * Callers should have i_mutex + a cluster lock on dir
+ */
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen)
+{
+	int ret = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+	trace_ocfs2_check_dir_for_entry(
+		(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
+
+	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) {
+		ret = -EEXIST;
+		mlog_errno(ret);
+	}
+
+	ocfs2_free_dir_lookup_result(&lookup);
+
+	return ret;
+}
+
+struct ocfs2_empty_dir_priv {
+	struct dir_context ctx;
+	unsigned seen_dot;
+	unsigned seen_dot_dot;
+	unsigned seen_other;
+	unsigned dx_dir;
+};
+static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
+				   int name_len, loff_t pos, u64 ino,
+				   unsigned type)
+{
+	struct ocfs2_empty_dir_priv *p =
+		container_of(ctx, struct ocfs2_empty_dir_priv, ctx);
+
+	/*
+	 * Check the positions of "." and ".." records to be sure
+	 * they're in the correct place.
+	 *
+	 * Indexed directories don't need to proceed past the first
+	 * two entries, so we end the scan after seeing '..'. Despite
+	 * that, we allow the scan to proceed In the event that we
+	 * have a corrupted indexed directory (no dot or dot dot
+	 * entries). This allows us to double check for existing
+	 * entries which might not have been found in the index.
+	 */
+	if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
+		p->seen_dot = 1;
+		return 0;
+	}
+
+	if (name_len == 2 && !strncmp("..", name, 2) &&
+	    pos == OCFS2_DIR_REC_LEN(1)) {
+		p->seen_dot_dot = 1;
+
+		if (p->dx_dir && p->seen_dot)
+			return 1;
+
+		return 0;
+	}
+
+	p->seen_other = 1;
+	return 1;
+}
+
+static int ocfs2_empty_dir_dx(struct inode *inode,
+			      struct ocfs2_empty_dir_priv *priv)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_dx_root_block *dx_root;
+
+	priv->dx_dir = 1;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+		priv->seen_other = 1;
+
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ *
+ * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
+ */
+int ocfs2_empty_dir(struct inode *inode)
+{
+	int ret;
+	struct ocfs2_empty_dir_priv priv = {
+		.ctx.actor = ocfs2_empty_dir_filldir,
+	};
+
+	if (ocfs2_dir_indexed(inode)) {
+		ret = ocfs2_empty_dir_dx(inode, &priv);
+		if (ret)
+			mlog_errno(ret);
+		/*
+		 * We still run ocfs2_dir_foreach to get the checks
+		 * for "." and "..".
+		 */
+	}
+
+	ret = ocfs2_dir_foreach(inode, &priv.ctx);
+	if (ret)
+		mlog_errno(ret);
+
+	if (!priv.seen_dot || !priv.seen_dot_dot) {
+		mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		/*
+		 * XXX: Is it really safe to allow an unlink to continue?
+		 */
+		return 1;
+	}
+
+	return !priv.seen_other;
+}
+
+/*
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+							  struct inode *parent,
+							  char *start,
+							  unsigned int size)
+{
+	struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
+
+	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	return de;
+}
+
+/*
+ * This works together with code in ocfs2_mknod_locked() which sets
+ * the inline-data flag and initializes the inline-data section.
+ */
+static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct inode *parent,
+				 struct inode *inode,
+				 struct buffer_head *di_bh)
+{
+	int ret;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inline_data *data = &di->id2.i_data;
+	unsigned int size = le16_to_cpu(data->id_count);
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
+	ocfs2_journal_dirty(handle, di_bh);
+
+	i_size_write(inode, size);
+	set_nlink(inode, 2);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+
+	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct inode *parent,
+				 struct inode *inode,
+				 struct buffer_head *fe_bh,
+				 struct ocfs2_alloc_context *data_ac,
+				 struct buffer_head **ret_new_bh)
+{
+	int status;
+	unsigned int size = osb->sb->s_blocksize;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de;
+
+	if (ocfs2_new_dir_wants_trailer(inode))
+		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
+				     data_ac, NULL, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+
+	status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+
+	de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+	if (ocfs2_new_dir_wants_trailer(inode)) {
+		int size = le16_to_cpu(de->rec_len);
+
+		/*
+		 * Figure out the size of the hole left over after
+		 * insertion of '.' and '..'. The trailer wants this
+		 * information.
+		 */
+		size -= OCFS2_DIR_REC_LEN(2);
+		size -= sizeof(struct ocfs2_dir_block_trailer);
+
+		ocfs2_init_dir_trailer(inode, new_bh, size);
+	}
+
+	ocfs2_journal_dirty(handle, new_bh);
+
+	i_size_write(inode, inode->i_sb->s_blocksize);
+	set_nlink(inode, 2);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+	if (ret_new_bh) {
+		*ret_new_bh = new_bh;
+		new_bh = NULL;
+	}
+bail:
+	brelse(new_bh);
+
+	return status;
+}
+
+static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
+				     handle_t *handle, struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dirdata_bh,
+				     struct ocfs2_alloc_context *meta_ac,
+				     int dx_inline, u32 num_entries,
+				     struct buffer_head **ret_dx_root_bh)
+{
+	int ret;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	u16 dr_suballoc_bit;
+	u64 suballoc_loc, dr_blkno;
+	unsigned int num_bits;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+				   &dr_suballoc_bit, &num_bits, &dr_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_dx_dir_attach_index(
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)dr_blkno);
+
+	dx_root_bh = sb_getblk(osb->sb, dr_blkno);
+	if (dx_root_bh == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
+
+	ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	memset(dx_root, 0, osb->sb->s_blocksize);
+	strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
+	dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
+	dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
+	dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
+	dx_root->dr_blkno = cpu_to_le64(dr_blkno);
+	dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
+	dx_root->dr_num_entries = cpu_to_le32(num_entries);
+	if (le16_to_cpu(trailer->db_free_rec_len))
+		dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+	else
+		dx_root->dr_free_blk = cpu_to_le64(0);
+
+	if (dx_inline) {
+		dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
+		dx_root->dr_entries.de_count =
+			cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
+	} else {
+		dx_root->dr_list.l_count =
+			cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+	}
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di->i_dx_root = cpu_to_le64(dr_blkno);
+
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(dir)->ip_lock);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	*ret_dx_root_bh = dx_root_bh;
+	dx_root_bh = NULL;
+
+out:
+	brelse(dx_root_bh);
+	return ret;
+}
+
+static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
+				       handle_t *handle, struct inode *dir,
+				       struct buffer_head **dx_leaves,
+				       int num_dx_leaves, u64 start_blk)
+{
+	int ret, i;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct buffer_head *bh;
+
+	for (i = 0; i < num_dx_leaves; i++) {
+		bh = sb_getblk(osb->sb, start_blk + i);
+		if (bh == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		dx_leaves[i] = bh;
+
+		ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
+
+		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
+
+		memset(dx_leaf, 0, osb->sb->s_blocksize);
+		strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
+		dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
+		dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
+		dx_leaf->dl_list.de_count =
+			cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
+
+		trace_ocfs2_dx_dir_format_cluster(
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)bh->b_blocknr,
+				le16_to_cpu(dx_leaf->dl_list.de_count));
+
+		ocfs2_journal_dirty(handle, bh);
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * Allocates and formats a new cluster for use in an indexed dir
+ * leaf. This version will not do the extent insert, so that it can be
+ * used by operations which need careful ordering.
+ */
+static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
+				      u32 cpos, handle_t *handle,
+				      struct ocfs2_alloc_context *data_ac,
+				      struct buffer_head **dx_leaves,
+				      int num_dx_leaves, u64 *ret_phys_blkno)
+{
+	int ret;
+	u32 phys, num;
+	u64 phys_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	/*
+	 * XXX: For create, this should claim cluster for the index
+	 * *before* the unindexed insert so that we have a better
+	 * chance of contiguousness as the directory grows in number
+	 * of entries.
+	 */
+	ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Format the new cluster first. That way, we're inserting
+	 * valid data.
+	 */
+	phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
+	ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
+					  num_dx_leaves, phys_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_phys_blkno = phys_blkno;
+out:
+	return ret;
+}
+
+static int ocfs2_dx_dir_new_cluster(struct inode *dir,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, handle_t *handle,
+				    struct ocfs2_alloc_context *data_ac,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct buffer_head **dx_leaves,
+				    int num_dx_leaves)
+{
+	int ret;
+	u64 phys_blkno;
+
+	ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
+					 num_dx_leaves, &phys_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
+				  meta_ac);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
+							int *ret_num_leaves)
+{
+	int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
+	struct buffer_head **dx_leaves;
+
+	dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
+			    GFP_NOFS);
+	if (dx_leaves && ret_num_leaves)
+		*ret_num_leaves = num_dx_leaves;
+
+	return dx_leaves;
+}
+
+static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct inode *parent,
+				 struct inode *inode,
+				 struct buffer_head *di_bh,
+				 struct ocfs2_alloc_context *data_ac,
+				 struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct buffer_head *leaf_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_hinfo hinfo;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * Our strategy is to create the directory as though it were
+	 * unindexed, then add the index block. This works with very
+	 * little complication since the state of a new directory is a
+	 * very well known quantity.
+	 *
+	 * Essentially, we have two dirents ("." and ".."), in the 1st
+	 * block which need indexing. These are easily inserted into
+	 * the index block.
+	 */
+
+	ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
+				    data_ac, &leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
+					meta_ac, 1, 2, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	entry_list = &dx_root->dr_entries;
+
+	/* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
+	ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
+	ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+	ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
+	ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+out:
+	brelse(dx_root_bh);
+	brelse(leaf_bh);
+	return ret;
+}
+
+int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+		       handle_t *handle,
+		       struct inode *parent,
+		       struct inode *inode,
+		       struct buffer_head *fe_bh,
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac)
+
+{
+	BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
+
+	if (ocfs2_supports_indexed_dirs(osb))
+		return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
+					     data_ac, meta_ac);
+
+	return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
+				     data_ac, NULL);
+}
+
+static int ocfs2_dx_dir_index_block(struct inode *dir,
+				    handle_t *handle,
+				    struct buffer_head **dx_leaves,
+				    int num_dx_leaves,
+				    u32 *num_dx_entries,
+				    struct buffer_head *dirent_bh)
+{
+	int ret = 0, namelen, i;
+	char *de_buf, *limit;
+	struct ocfs2_dir_entry *de;
+	struct buffer_head *dx_leaf_bh;
+	struct ocfs2_dx_hinfo hinfo;
+	u64 dirent_blk = dirent_bh->b_blocknr;
+
+	de_buf = dirent_bh->b_data;
+	limit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		namelen = de->name_len;
+		if (!namelen || !de->inode)
+			goto inc;
+
+		ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
+
+		i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
+		dx_leaf_bh = dx_leaves[i];
+
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
+						 dirent_blk, dx_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		*num_dx_entries = *num_dx_entries + 1;
+
+inc:
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * XXX: This expects dx_root_bh to already be part of the transaction.
+ */
+static void ocfs2_dx_dir_index_root_block(struct inode *dir,
+					 struct buffer_head *dx_root_bh,
+					 struct buffer_head *dirent_bh)
+{
+	char *de_buf, *limit;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dx_hinfo hinfo;
+	u64 dirent_blk = dirent_bh->b_blocknr;
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	de_buf = dirent_bh->b_data;
+	limit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (!de->name_len || !de->inode)
+			goto inc;
+
+		ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
+
+		trace_ocfs2_dx_dir_index_root_block(
+				(unsigned long long)dir->i_ino,
+				hinfo.major_hash, hinfo.minor_hash,
+				de->name_len, de->name,
+				le16_to_cpu(dx_root->dr_entries.de_num_used));
+
+		ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
+					   dirent_blk);
+
+		le32_add_cpu(&dx_root->dr_num_entries, 1);
+inc:
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+}
+
+/*
+ * Count the number of inline directory entries in di_bh and compare
+ * them against the number of entries we can hold in an inline dx root
+ * block.
+ */
+static int ocfs2_new_dx_should_be_inline(struct inode *dir,
+					 struct buffer_head *di_bh)
+{
+	int dirent_count = 0;
+	char *de_buf, *limit;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	de_buf = di->id2.i_data.id_data;
+	limit = de_buf + i_size_read(dir);
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (de->name_len && de->inode)
+			dirent_count++;
+
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+
+	/* We are careful to leave room for one extra record. */
+	return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
+}
+
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * This will also return the largest amount of contiguous space for a dirent
+ * in the block. That value is *not* necessarily the last dirent, even after
+ * expansion. The directory indexing code wants this value for free space
+ * accounting. We do this here since we're already walking the entire dir
+ * block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
+static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
+					     struct inode *dir)
+{
+	struct super_block *sb = dir->i_sb;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dir_entry *prev_de;
+	char *de_buf, *limit;
+	unsigned int new_size = sb->s_blocksize;
+	unsigned int bytes, this_hole;
+	unsigned int largest_hole = 0;
+
+	if (ocfs2_new_dir_wants_trailer(dir))
+		new_size = ocfs2_dir_trailer_blk_off(sb);
+
+	bytes = new_size - old_size;
+
+	limit = start + old_size;
+	de_buf = start;
+	de = (struct ocfs2_dir_entry *)de_buf;
+	do {
+		this_hole = ocfs2_figure_dirent_hole(de);
+		if (this_hole > largest_hole)
+			largest_hole = this_hole;
+
+		prev_de = de;
+		de_buf += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)de_buf;
+	} while (de_buf < limit);
+
+	le16_add_cpu(&prev_de->rec_len, bytes);
+
+	/* We need to double check this after modification of the final
+	 * dirent. */
+	this_hole = ocfs2_figure_dirent_hole(prev_de);
+	if (this_hole > largest_hole)
+		largest_hole = this_hole;
+
+	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+		return largest_hole;
+	return 0;
+}
+
+/*
+ * We allocate enough clusters to fulfill "blocks_wanted", but set
+ * i_size to exactly one block. Ocfs2_extend_dir() will handle the
+ * rest automatically for us.
+ *
+ * *first_block_bh is a pointer to the 1st data block allocated to the
+ *  directory.
+ */
+static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
+				   unsigned int blocks_wanted,
+				   struct ocfs2_dir_lookup_result *lookup,
+				   struct buffer_head **first_block_bh)
+{
+	u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
+	struct super_block *sb = dir->i_sb;
+	int ret, i, num_dx_leaves = 0, dx_inline = 0,
+		credits = ocfs2_inline_to_extents_credits(sb);
+	u64 dx_insert_blkno, blkno,
+		bytes = blocks_wanted << sb->s_blocksize_bits;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(dir);
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct buffer_head *dirdata_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct buffer_head **dx_leaves = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	handle_t *handle;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_extent_tree dx_et;
+	int did_quota = 0, bytes_allocated = 0;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
+
+	alloc = ocfs2_clusters_for_bytes(sb, bytes);
+	dx_alloc = 0;
+
+	down_write(&oi->ip_alloc_sem);
+
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		credits += ocfs2_add_dir_index_credits(sb);
+
+		dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
+		if (!dx_inline) {
+			/* Add one more cluster for an index leaf */
+			dx_alloc++;
+			dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
+								&num_dx_leaves);
+			if (!dx_leaves) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		/* This gets us the dx_root */
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * We should never need more than 2 clusters for the unindexed
+	 * tree - maximum dirent size is far less than one block. In
+	 * fact, the only time we'd need more than one cluster is if
+	 * blocksize == clustersize and the dirent won't fit in the
+	 * extra space that the expansion to a single block gives. As
+	 * of today, that only happens on 4k/4k file systems.
+	 */
+	BUG_ON(alloc > 2);
+
+	ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Prepare for worst case allocation scenario of two separate
+	 * extents in the unindexed tree.
+	 */
+	if (alloc == 2)
+		credits += OCFS2_SUBALLOC_ALLOC;
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = dquot_alloc_space_nodirty(dir,
+		ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
+	if (ret)
+		goto out_commit;
+	did_quota = 1;
+
+	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+		/*
+		 * Allocate our index cluster first, to maximize the
+		 * possibility that unindexed leaves grow
+		 * contiguously.
+		 */
+		ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
+						 dx_leaves, num_dx_leaves,
+						 &dx_insert_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+	}
+
+	/*
+	 * Try to claim as many clusters as the bitmap can give though
+	 * if we only get one now, that's enough to continue. The rest
+	 * will be claimed after the conversion to extents.
+	 */
+	if (ocfs2_dir_resv_allowed(osb))
+		data_ac->ac_resv = &oi->ip_la_data_resv;
+	ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+	bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+
+	/*
+	 * Operations are carefully ordered so that we set up the new
+	 * data block first. The conversion from inline data to
+	 * extents follows.
+	 */
+	blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+	dirdata_bh = sb_getblk(sb, blkno);
+	if (!dirdata_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
+
+	ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
+	memset(dirdata_bh->b_data + i_size_read(dir), 0,
+	       sb->s_blocksize - i_size_read(dir));
+	i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
+	if (ocfs2_new_dir_wants_trailer(dir)) {
+		/*
+		 * Prepare the dir trailer up front. It will otherwise look
+		 * like a valid dirent. Even if inserting the index fails
+		 * (unlikely), then all we'll have done is given first dir
+		 * block a small amount of fragmentation.
+		 */
+		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
+	}
+
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
+	ocfs2_journal_dirty(handle, dirdata_bh);
+
+	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+		/*
+		 * Dx dirs with an external cluster need to do this up
+		 * front. Inline dx root's get handled later, after
+		 * we've allocated our root block. We get passed back
+		 * a total number of items so that dr_num_entries can
+		 * be correctly set once the dx_root has been
+		 * allocated.
+		 */
+		ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
+					       num_dx_leaves, &num_dx_entries,
+					       dirdata_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/*
+	 * Set extent, i_size, etc on the directory. After this, the
+	 * inode should contain the same exact dirents as before and
+	 * be fully accessible from system calls.
+	 *
+	 * We let the later dirent insert modify c/mtime - to the user
+	 * the data hasn't changed.
+	 */
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ocfs2_dinode_new_extent_list(dir, di);
+
+	i_size_write(dir, sb->s_blocksize);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+
+	di->i_size = cpu_to_le64(sb->s_blocksize);
+	di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
+
+	/*
+	 * This should never fail as our extent list is empty and all
+	 * related blocks have been journaled already.
+	 */
+	ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
+				  0, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Set i_blocks after the extent insert for the most up to
+	 * date ip_clusters value.
+	 */
+	dir->i_blocks = ocfs2_inode_sector_count(dir);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
+						dirdata_bh, meta_ac, dx_inline,
+						num_dx_entries, &dx_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		if (dx_inline) {
+			ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
+						      dirdata_bh);
+		} else {
+			ocfs2_init_dx_root_extent_tree(&dx_et,
+						       INODE_CACHE(dir),
+						       dx_root_bh);
+			ret = ocfs2_insert_extent(handle, &dx_et, 0,
+						  dx_insert_blkno, 1, 0, NULL);
+			if (ret)
+				mlog_errno(ret);
+		}
+	}
+
+	/*
+	 * We asked for two clusters, but only got one in the 1st
+	 * pass. Claim the 2nd cluster as a separate extent.
+	 */
+	if (alloc > len) {
+		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
+					   &len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+
+		ret = ocfs2_insert_extent(handle, &et, 1,
+					  blkno, len, 0, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+	}
+
+	*first_block_bh = dirdata_bh;
+	dirdata_bh = NULL;
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		unsigned int off;
+
+		if (!dx_inline) {
+			/*
+			 * We need to return the correct block within the
+			 * cluster which should hold our entry.
+			 */
+			off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+						    &lookup->dl_hinfo);
+			get_bh(dx_leaves[off]);
+			lookup->dl_dx_leaf_bh = dx_leaves[off];
+		}
+		lookup->dl_dx_root_bh = dx_root_bh;
+		dx_root_bh = NULL;
+	}
+
+out_commit:
+	if (ret < 0 && did_quota)
+		dquot_free_space_nodirty(dir, bytes_allocated);
+
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	up_write(&oi->ip_alloc_sem);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (dx_leaves) {
+		for (i = 0; i < num_dx_leaves; i++)
+			brelse(dx_leaves[i]);
+		kfree(dx_leaves);
+	}
+
+	brelse(dirdata_bh);
+	brelse(dx_root_bh);
+
+	return ret;
+}
+
+/* returns a bh of the 1st new block in the allocation. */
+static int ocfs2_do_extend_dir(struct super_block *sb,
+			       handle_t *handle,
+			       struct inode *dir,
+			       struct buffer_head *parent_fe_bh,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       struct buffer_head **new_bh)
+{
+	int status;
+	int extend, did_quota = 0;
+	u64 p_blkno, v_blkno;
+
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
+	spin_unlock(&OCFS2_I(dir)->ip_lock);
+
+	if (extend) {
+		u32 offset = OCFS2_I(dir)->ip_clusters;
+
+		status = dquot_alloc_space_nodirty(dir,
+					ocfs2_clusters_to_bytes(sb, 1));
+		if (status)
+			goto bail;
+		did_quota = 1;
+
+		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
+					      1, 0, parent_fe_bh, handle,
+					      data_ac, meta_ac, NULL);
+		BUG_ON(status == -EAGAIN);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
+	status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_bh = sb_getblk(sb, p_blkno);
+	if (!*new_bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	if (did_quota && status < 0)
+		dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
+	return status;
+}
+
+/*
+ * Assumes you already have a cluster lock on the directory.
+ *
+ * 'blocks_wanted' is only used if we have an inline directory which
+ * is to be turned into an extent based one. The size of the dirent to
+ * insert might be larger than the space gained by growing to just one
+ * block, so we may have to grow the inode by two blocks in that case.
+ *
+ * If the directory is already indexed, dx_root_bh must be provided.
+ */
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    unsigned int blocks_wanted,
+			    struct ocfs2_dir_lookup_result *lookup,
+			    struct buffer_head **new_de_bh)
+{
+	int status = 0;
+	int credits, num_free_extents, drop_alloc_sem = 0;
+	loff_t dir_i_size;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_extent_list *el = &fe->id2.i_list;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = NULL;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry * de;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_extent_tree et;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		/*
+		 * This would be a code error as an inline directory should
+		 * never have an index root.
+		 */
+		BUG_ON(dx_root_bh);
+
+		status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
+						 blocks_wanted, lookup,
+						 &new_bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Expansion from inline to an indexed directory will
+		 * have given us this. */
+		dx_root_bh = lookup->dl_dx_root_bh;
+
+		if (blocks_wanted == 1) {
+			/*
+			 * If the new dirent will fit inside the space
+			 * created by pushing out to one block, then
+			 * we can complete the operation
+			 * here. Otherwise we have to expand i_size
+			 * and format the 2nd block below.
+			 */
+			BUG_ON(new_bh == NULL);
+			goto bail_bh;
+		}
+
+		/*
+		 * Get rid of 'new_bh' - we want to format the 2nd
+		 * data block and return that instead.
+		 */
+		brelse(new_bh);
+		new_bh = NULL;
+
+		down_write(&OCFS2_I(dir)->ip_alloc_sem);
+		drop_alloc_sem = 1;
+		dir_i_size = i_size_read(dir);
+		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+		goto do_extend;
+	}
+
+	down_write(&OCFS2_I(dir)->ip_alloc_sem);
+	drop_alloc_sem = 1;
+	dir_i_size = i_size_read(dir);
+	trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
+			       dir_i_size);
+
+	/* dir->i_size is always block aligned. */
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
+					      parent_fe_bh);
+		num_free_extents = ocfs2_num_free_extents(osb, &et);
+		if (num_free_extents < 0) {
+			status = num_free_extents;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!num_free_extents) {
+			status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
+			if (status < 0) {
+				if (status != -ENOSPC)
+					mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		if (ocfs2_dir_resv_allowed(osb))
+			data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
+		credits = ocfs2_calc_extend_credits(sb, el);
+	} else {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+	}
+
+do_extend:
+	if (ocfs2_dir_indexed(dir))
+		credits++; /* For attaching the new dirent block to the
+			    * dx_root */
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
+				     data_ac, meta_ac, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
+
+	status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, sb->s_blocksize);
+
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = 0;
+	if (ocfs2_supports_dir_trailer(dir)) {
+		de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+
+		ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
+
+		if (ocfs2_dir_indexed(dir)) {
+			status = ocfs2_dx_dir_link_trailer(dir, handle,
+							   dx_root_bh, new_bh);
+			if (status) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+	} else {
+		de->rec_len = cpu_to_le16(sb->s_blocksize);
+	}
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
+	ocfs2_journal_dirty(handle, new_bh);
+
+	dir_i_size += dir->i_sb->s_blocksize;
+	i_size_write(dir, dir_i_size);
+	dir->i_blocks = ocfs2_inode_sector_count(dir);
+	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail_bh:
+	*new_de_bh = new_bh;
+	get_bh(*new_de_bh);
+bail:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+	if (drop_alloc_sem)
+		up_write(&OCFS2_I(dir)->ip_alloc_sem);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	brelse(new_bh);
+
+	return status;
+}
+
+static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
+				   const char *name, int namelen,
+				   struct buffer_head **ret_de_bh,
+				   unsigned int *blocks_wanted)
+{
+	int ret;
+	struct super_block *sb = dir->i_sb;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dir_entry *de, *last_de = NULL;
+	char *de_buf, *limit;
+	unsigned long offset = 0;
+	unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+
+	/*
+	 * This calculates how many free bytes we'd have in block zero, should
+	 * this function force expansion to an extent tree.
+	 */
+	if (ocfs2_new_dir_wants_trailer(dir))
+		free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+	else
+		free_space = dir->i_sb->s_blocksize - i_size_read(dir);
+
+	de_buf = di->id2.i_data.id_data;
+	limit = de_buf + i_size_read(dir);
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
+			ret = -ENOENT;
+			goto out;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			ret = -EEXIST;
+			goto out;
+		}
+		/*
+		 * No need to check for a trailing dirent record here as
+		 * they're not used for inline dirs.
+		 */
+
+		if (ocfs2_dirent_would_fit(de, rec_len)) {
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = di_bh;
+			get_bh(*ret_de_bh);
+			ret = 0;
+			goto out;
+		}
+
+		last_de = de;
+		de_buf += le16_to_cpu(de->rec_len);
+		offset += le16_to_cpu(de->rec_len);
+	}
+
+	/*
+	 * We're going to require expansion of the directory - figure
+	 * out how many blocks we'll need so that a place for the
+	 * dirent can be found.
+	 */
+	*blocks_wanted = 1;
+	new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
+	if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
+		*blocks_wanted = 2;
+
+	ret = -ENOSPC;
+out:
+	return ret;
+}
+
+static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
+				   int namelen, struct buffer_head **ret_de_bh)
+{
+	unsigned long offset;
+	struct buffer_head *bh = NULL;
+	unsigned short rec_len;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb = dir->i_sb;
+	int status;
+	int blocksize = dir->i_sb->s_blocksize;
+
+	status = ocfs2_read_dir_block(dir, 0, &bh, 0);
+	if (status)
+		goto bail;
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (1) {
+		if ((char *)de >= sb->s_blocksize + bh->b_data) {
+			brelse(bh);
+			bh = NULL;
+
+			if (i_size_read(dir) <= offset) {
+				/*
+				 * Caller will have to expand this
+				 * directory.
+				 */
+				status = -ENOSPC;
+				goto bail;
+			}
+			status = ocfs2_read_dir_block(dir,
+					     offset >> sb->s_blocksize_bits,
+					     &bh, 0);
+			if (status)
+				goto bail;
+
+			/* move to next block */
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+			status = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			status = -EEXIST;
+			goto bail;
+		}
+
+		if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+					   blocksize))
+			goto next;
+
+		if (ocfs2_dirent_would_fit(de, rec_len)) {
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = bh;
+			get_bh(*ret_de_bh);
+			status = 0;
+			goto bail;
+		}
+next:
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+bail:
+	brelse(bh);
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+static int dx_leaf_sort_cmp(const void *a, const void *b)
+{
+	const struct ocfs2_dx_entry *entry1 = a;
+	const struct ocfs2_dx_entry *entry2 = b;
+	u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
+	u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
+	u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
+	u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
+
+	if (major_hash1 > major_hash2)
+		return 1;
+	if (major_hash1 < major_hash2)
+		return -1;
+
+	/*
+	 * It is not strictly necessary to sort by minor
+	 */
+	if (minor_hash1 > minor_hash2)
+		return 1;
+	if (minor_hash1 < minor_hash2)
+		return -1;
+	return 0;
+}
+
+static void dx_leaf_sort_swap(void *a, void *b, int size)
+{
+	struct ocfs2_dx_entry *entry1 = a;
+	struct ocfs2_dx_entry *entry2 = b;
+	struct ocfs2_dx_entry tmp;
+
+	BUG_ON(size != sizeof(*entry1));
+
+	tmp = *entry1;
+	*entry1 = *entry2;
+	*entry2 = tmp;
+}
+
+static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
+{
+	struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+	int i, num = le16_to_cpu(dl_list->de_num_used);
+
+	for (i = 0; i < (num - 1); i++) {
+		if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
+		    le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Find the optimal value to split this leaf on. This expects the leaf
+ * entries to be in sorted order.
+ *
+ * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
+ * the hash we want to insert.
+ *
+ * This function is only concerned with the major hash - that which
+ * determines which cluster an item belongs to.
+ */
+static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
+					u32 leaf_cpos, u32 insert_hash,
+					u32 *split_hash)
+{
+	struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+	int i, num_used = le16_to_cpu(dl_list->de_num_used);
+	int allsame;
+
+	/*
+	 * There's a couple rare, but nasty corner cases we have to
+	 * check for here. All of them involve a leaf where all value
+	 * have the same hash, which is what we look for first.
+	 *
+	 * Most of the time, all of the above is false, and we simply
+	 * pick the median value for a split.
+	 */
+	allsame = ocfs2_dx_leaf_same_major(dx_leaf);
+	if (allsame) {
+		u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
+
+		if (val == insert_hash) {
+			/*
+			 * No matter where we would choose to split,
+			 * the new entry would want to occupy the same
+			 * block as these. Since there's no space left
+			 * in their existing block, we know there
+			 * won't be space after the split.
+			 */
+			return -ENOSPC;
+		}
+
+		if (val == leaf_cpos) {
+			/*
+			 * Because val is the same as leaf_cpos (which
+			 * is the smallest value this leaf can have),
+			 * yet is not equal to insert_hash, then we
+			 * know that insert_hash *must* be larger than
+			 * val (and leaf_cpos). At least cpos+1 in value.
+			 *
+			 * We also know then, that there cannot be an
+			 * adjacent extent (otherwise we'd be looking
+			 * at it). Choosing this value gives us a
+			 * chance to get some contiguousness.
+			 */
+			*split_hash = leaf_cpos + 1;
+			return 0;
+		}
+
+		if (val > insert_hash) {
+			/*
+			 * val can not be the same as insert hash, and
+			 * also must be larger than leaf_cpos. Also,
+			 * we know that there can't be a leaf between
+			 * cpos and val, otherwise the entries with
+			 * hash 'val' would be there.
+			 */
+			*split_hash = val;
+			return 0;
+		}
+
+		*split_hash = insert_hash;
+		return 0;
+	}
+
+	/*
+	 * Since the records are sorted and the checks above
+	 * guaranteed that not all records in this block are the same,
+	 * we simple travel forward, from the median, and pick the 1st
+	 * record whose value is larger than leaf_cpos.
+	 */
+	for (i = (num_used / 2); i < num_used; i++)
+		if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
+		    leaf_cpos)
+			break;
+
+	BUG_ON(i == num_used); /* Should be impossible */
+	*split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
+	return 0;
+}
+
+/*
+ * Transfer all entries in orig_dx_leaves whose major hash is equal to or
+ * larger than split_hash into new_dx_leaves. We use a temporary
+ * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
+ *
+ * Since the block offset inside a leaf (cluster) is a constant mask
+ * of minor_hash, we can optimize - an item at block offset X within
+ * the original cluster, will be at offset X within the new cluster.
+ */
+static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
+				       handle_t *handle,
+				       struct ocfs2_dx_leaf *tmp_dx_leaf,
+				       struct buffer_head **orig_dx_leaves,
+				       struct buffer_head **new_dx_leaves,
+				       int num_dx_leaves)
+{
+	int i, j, num_used;
+	u32 major_hash;
+	struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
+	struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+	struct ocfs2_dx_entry *dx_entry;
+
+	tmp_list = &tmp_dx_leaf->dl_list;
+
+	for (i = 0; i < num_dx_leaves; i++) {
+		orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
+		orig_list = &orig_dx_leaf->dl_list;
+		new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
+		new_list = &new_dx_leaf->dl_list;
+
+		num_used = le16_to_cpu(orig_list->de_num_used);
+
+		memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
+		tmp_list->de_num_used = cpu_to_le16(0);
+		memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
+
+		for (j = 0; j < num_used; j++) {
+			dx_entry = &orig_list->de_entries[j];
+			major_hash = le32_to_cpu(dx_entry->dx_major_hash);
+			if (major_hash >= split_hash)
+				ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
+							      dx_entry);
+			else
+				ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
+							      dx_entry);
+		}
+		memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
+
+		ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
+		ocfs2_journal_dirty(handle, new_dx_leaves[i]);
+	}
+}
+
+static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
+					  struct ocfs2_dx_root_block *dx_root)
+{
+	int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
+
+	credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
+	credits += ocfs2_quota_trans_credits(osb->sb);
+	return credits;
+}
+
+/*
+ * Find the median value in dx_leaf_bh and allocate a new leaf to move
+ * half our entries into.
+ */
+static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
+				  struct buffer_head *dx_root_bh,
+				  struct buffer_head *dx_leaf_bh,
+				  struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
+				  u64 leaf_blkno)
+{
+	struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+	int credits, ret, i, num_used, did_quota = 0;
+	u32 cpos, split_hash, insert_hash = hinfo->major_hash;
+	u64 orig_leaves_start;
+	int num_dx_leaves;
+	struct buffer_head **orig_dx_leaves = NULL;
+	struct buffer_head **new_dx_leaves = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
+	struct ocfs2_extent_tree et;
+	handle_t *handle = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
+
+	trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				     (unsigned long long)leaf_blkno,
+				     insert_hash);
+
+	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	/*
+	 * XXX: This is a rather large limit. We should use a more
+	 * realistic value.
+	 */
+	if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
+		return -ENOSPC;
+
+	num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+	if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
+		mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
+		     "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
+		     (unsigned long long)leaf_blkno, num_used);
+		ret = -EIO;
+		goto out;
+	}
+
+	orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+	if (!orig_dx_leaves) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
+	if (!new_dx_leaves) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = dquot_alloc_space_nodirty(dir,
+				       ocfs2_clusters_to_bytes(dir->i_sb, 1));
+	if (ret)
+		goto out_commit;
+	did_quota = 1;
+
+	ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * This block is changing anyway, so we can sort it in place.
+	 */
+	sort(dx_leaf->dl_list.de_entries, num_used,
+	     sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
+	     dx_leaf_sort_swap);
+
+	ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+	ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
+					   &split_hash);
+	if (ret) {
+		mlog_errno(ret);
+		goto  out_commit;
+	}
+
+	trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
+
+	/*
+	 * We have to carefully order operations here. There are items
+	 * which want to be in the new cluster before insert, but in
+	 * order to put those items in the new cluster, we alter the
+	 * old cluster. A failure to insert gets nasty.
+	 *
+	 * So, start by reserving writes to the old
+	 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
+	 * the new cluster for us, before inserting it. The insert
+	 * won't happen if there's an error before that. Once the
+	 * insert is done then, we can transfer from one leaf into the
+	 * other without fear of hitting any error.
+	 */
+
+	/*
+	 * The leaf transfer wants some scratch space so that we don't
+	 * wind up doing a bunch of expensive memmove().
+	 */
+	tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
+	if (!tmp_dx_leaf) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
+	ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
+				   orig_dx_leaves);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	cpos = split_hash;
+	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+				       data_ac, meta_ac, new_dx_leaves,
+				       num_dx_leaves);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	for (i = 0; i < num_dx_leaves; i++) {
+		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+					      orig_dx_leaves[i],
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+					      new_dx_leaves[i],
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
+				   orig_dx_leaves, new_dx_leaves, num_dx_leaves);
+
+out_commit:
+	if (ret < 0 && did_quota)
+		dquot_free_space_nodirty(dir,
+				ocfs2_clusters_to_bytes(dir->i_sb, 1));
+
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (orig_dx_leaves || new_dx_leaves) {
+		for (i = 0; i < num_dx_leaves; i++) {
+			if (orig_dx_leaves)
+				brelse(orig_dx_leaves[i]);
+			if (new_dx_leaves)
+				brelse(new_dx_leaves[i]);
+		}
+		kfree(orig_dx_leaves);
+		kfree(new_dx_leaves);
+	}
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	kfree(tmp_dx_leaf);
+	return ret;
+}
+
+static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
+				   struct buffer_head *di_bh,
+				   struct buffer_head *dx_root_bh,
+				   const char *name, int namelen,
+				   struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, rebalanced = 0;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *dx_leaf_bh = NULL;
+	struct ocfs2_dx_leaf *dx_leaf;
+	u64 blkno;
+	u32 leaf_cpos;
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+restart_search:
+	ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
+				  &leaf_cpos, &blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+
+	if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
+	    le16_to_cpu(dx_leaf->dl_list.de_count)) {
+		if (rebalanced) {
+			/*
+			 * Rebalancing should have provided us with
+			 * space in an appropriate leaf.
+			 *
+			 * XXX: Is this an abnormal condition then?
+			 * Should we print a message here?
+			 */
+			ret = -ENOSPC;
+			goto out;
+		}
+
+		ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
+					     &lookup->dl_hinfo, leaf_cpos,
+					     blkno);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Restart the lookup. The rebalance might have
+		 * changed which block our item fits into. Mark our
+		 * progress, so we only execute this once.
+		 */
+		brelse(dx_leaf_bh);
+		dx_leaf_bh = NULL;
+		rebalanced = 1;
+		goto restart_search;
+	}
+
+	lookup->dl_dx_leaf_bh = dx_leaf_bh;
+	dx_leaf_bh = NULL;
+
+out:
+	brelse(dx_leaf_bh);
+	return ret;
+}
+
+static int ocfs2_search_dx_free_list(struct inode *dir,
+				     struct buffer_head *dx_root_bh,
+				     int namelen,
+				     struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret = -ENOSPC;
+	struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
+	struct ocfs2_dir_block_trailer *db;
+	u64 next_block;
+	int rec_len = OCFS2_DIR_REC_LEN(namelen);
+	struct ocfs2_dx_root_block *dx_root;
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	next_block = le64_to_cpu(dx_root->dr_free_blk);
+
+	while (next_block) {
+		brelse(prev_leaf_bh);
+		prev_leaf_bh = leaf_bh;
+		leaf_bh = NULL;
+
+		ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+		if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
+			lookup->dl_leaf_bh = leaf_bh;
+			lookup->dl_prev_leaf_bh = prev_leaf_bh;
+			leaf_bh = NULL;
+			prev_leaf_bh = NULL;
+			break;
+		}
+
+		next_block = le64_to_cpu(db->db_free_next);
+	}
+
+	if (!next_block)
+		ret = -ENOSPC;
+
+out:
+
+	brelse(leaf_bh);
+	brelse(prev_leaf_bh);
+	return ret;
+}
+
+static int ocfs2_expand_inline_dx_root(struct inode *dir,
+				       struct buffer_head *dx_root_bh)
+{
+	int ret, num_dx_leaves, i, j, did_quota = 0;
+	struct buffer_head **dx_leaves = NULL;
+	struct ocfs2_extent_tree et;
+	u64 insert_blkno;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	handle_t *handle = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+	struct ocfs2_dx_entry *dx_entry;
+	struct ocfs2_dx_leaf *target_leaf;
+
+	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+	if (!dx_leaves) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = dquot_alloc_space_nodirty(dir,
+				       ocfs2_clusters_to_bytes(osb->sb, 1));
+	if (ret)
+		goto out_commit;
+	did_quota = 1;
+
+	/*
+	 * We do this up front, before the allocation, so that a
+	 * failure to add the dx_root_bh to the journal won't result
+	 * us losing clusters.
+	 */
+	ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
+					 num_dx_leaves, &insert_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Transfer the entries from our dx_root into the appropriate
+	 * block
+	 */
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+	entry_list = &dx_root->dr_entries;
+
+	for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+		dx_entry = &entry_list->de_entries[i];
+
+		j = __ocfs2_dx_dir_hash_idx(osb,
+					    le32_to_cpu(dx_entry->dx_minor_hash));
+		target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
+
+		ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
+
+		/* Each leaf has been passed to the journal already
+		 * via __ocfs2_dx_dir_new_cluster() */
+	}
+
+	dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
+	memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
+	       offsetof(struct ocfs2_dx_root_block, dr_list));
+	dx_root->dr_list.l_count =
+		cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+
+	/* This should never fail considering we start with an empty
+	 * dx_root. */
+	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
+	ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
+	if (ret)
+		mlog_errno(ret);
+	did_quota = 0;
+
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out_commit:
+	if (ret < 0 && did_quota)
+		dquot_free_space_nodirty(dir,
+					  ocfs2_clusters_to_bytes(dir->i_sb, 1));
+
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	if (dx_leaves) {
+		for (i = 0; i < num_dx_leaves; i++)
+			brelse(dx_leaves[i]);
+		kfree(dx_leaves);
+	}
+	return ret;
+}
+
+static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
+{
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+	entry_list = &dx_root->dr_entries;
+
+	if (le16_to_cpu(entry_list->de_num_used) >=
+	    le16_to_cpu(entry_list->de_count))
+		return -ENOSPC;
+
+	return 0;
+}
+
+static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
+					   struct buffer_head *di_bh,
+					   const char *name,
+					   int namelen,
+					   struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, free_dx_root = 1;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct buffer_head *dx_root_bh = NULL;
+	struct buffer_head *leaf_bh = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dx_root_block *dx_root;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
+		ret = -ENOSPC;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_dx_root_inline(dx_root)) {
+		ret = ocfs2_inline_dx_has_space(dx_root_bh);
+
+		if (ret == 0)
+			goto search_el;
+
+		/*
+		 * We ran out of room in the root block. Expand it to
+		 * an extent, then allow ocfs2_find_dir_space_dx to do
+		 * the rest.
+		 */
+		ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Insert preparation for an indexed directory is split into two
+	 * steps. The call to find_dir_space_dx reserves room in the index for
+	 * an additional item. If we run out of space there, it's a real error
+	 * we can't continue on.
+	 */
+	ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
+				      namelen, lookup);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+search_el:
+	/*
+	 * Next, we need to find space in the unindexed tree. This call
+	 * searches using the free space linked list. If the unindexed tree
+	 * lacks sufficient space, we'll expand it below. The expansion code
+	 * is smart enough to add any new blocks to the free space list.
+	 */
+	ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
+	if (ret && ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Do this up here - ocfs2_extend_dir might need the dx_root */
+	lookup->dl_dx_root_bh = dx_root_bh;
+	free_dx_root = 0;
+
+	if (ret == -ENOSPC) {
+		ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
+
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * We make the assumption here that new leaf blocks are added
+		 * to the front of our free list.
+		 */
+		lookup->dl_prev_leaf_bh = NULL;
+		lookup->dl_leaf_bh = leaf_bh;
+	}
+
+out:
+	if (free_dx_root)
+		brelse(dx_root_bh);
+	return ret;
+}
+
+/*
+ * Get a directory ready for insert. Any directory allocation required
+ * happens here. Success returns zero, and enough context in the dir
+ * lookup result that ocfs2_add_entry() will be able complete the task
+ * with minimal performance impact.
+ */
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret;
+	unsigned int blocks_wanted = 1;
+	struct buffer_head *bh = NULL;
+
+	trace_ocfs2_prepare_dir_for_insert(
+		(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
+
+	if (!namelen) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Do this up front to reduce confusion.
+	 *
+	 * The directory might start inline, then be turned into an
+	 * indexed one, in which case we'd need to hash deep inside
+	 * ocfs2_find_dir_space_id(). Since
+	 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+	 * done, there seems no point in spreading out the calls. We
+	 * can optimize away the case where the file system doesn't
+	 * support indexing.
+	 */
+	if (ocfs2_supports_indexed_dirs(osb))
+		ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+	if (ocfs2_dir_indexed(dir)) {
+		ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+						      name, namelen, lookup);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
+					      namelen, &bh, &blocks_wanted);
+	} else
+		ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);
+
+	if (ret && ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ret == -ENOSPC) {
+		/*
+		 * We have to expand the directory to add this name.
+		 */
+		BUG_ON(bh);
+
+		ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
+				       lookup, &bh);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		BUG_ON(!bh);
+	}
+
+	lookup->dl_leaf_bh = bh;
+	bh = NULL;
+out:
+	brelse(bh);
+	return ret;
+}
+
+static int ocfs2_dx_dir_remove_index(struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dx_root_bh)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dx_root_block *dx_root;
+	struct inode *dx_alloc_inode = NULL;
+	struct buffer_head *dx_alloc_bh = NULL;
+	handle_t *handle;
+	u64 blk;
+	u16 bit;
+	u64 bg_blkno;
+
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	dx_alloc_inode = ocfs2_get_system_file_inode(osb,
+					EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(dx_root->dr_suballoc_slot));
+	if (!dx_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&dx_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(dir)->ip_lock);
+	di->i_dx_root = cpu_to_le64(0ULL);
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	blk = le64_to_cpu(dx_root->dr_blkno);
+	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
+	if (dx_root->dr_suballoc_loc)
+		bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	ocfs2_inode_unlock(dx_alloc_inode, 1);
+
+out_mutex:
+	mutex_unlock(&dx_alloc_inode->i_mutex);
+	brelse(dx_alloc_bh);
+out:
+	iput(dx_alloc_inode);
+	return ret;
+}
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
+{
+	int ret;
+	unsigned int uninitialized_var(clen);
+	u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!ocfs2_dir_indexed(dir))
+		return 0;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (ocfs2_dx_root_inline(dx_root))
+		goto remove_index;
+
+	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
+
+	/* XXX: What if dr_clusters is too large? */
+	while (le32_to_cpu(dx_root->dr_clusters)) {
+		ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
+					      major_hash, &cpos, &blkno, &clen);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+					       &dealloc, 0, false);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (cpos == 0)
+			break;
+
+		major_hash = cpos - 1;
+	}
+
+remove_index:
+	ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	brelse(dx_root_bh);
+	return ret;
+}
diff --git a/kernel/fs/ocfs2/dir.h b/kernel/fs/ocfs2/dir.h
new file mode 100644
index 000000000..3d8639f38
--- /dev/null
+++ b/kernel/fs/ocfs2/dir.h
@@ -0,0 +1,116 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_DIR_H
+#define OCFS2_DIR_H
+
+struct ocfs2_dx_hinfo {
+	u32	major_hash;
+	u32	minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+	struct buffer_head		*dl_leaf_bh;	/* Unindexed leaf
+							 * block */
+	struct ocfs2_dir_entry		*dl_entry;	/* Target dirent in
+							 * unindexed leaf */
+
+	struct buffer_head		*dl_dx_root_bh;	/* Root of indexed
+							 * tree */
+
+	struct buffer_head		*dl_dx_leaf_bh;	/* Indexed leaf block */
+	struct ocfs2_dx_entry		*dl_dx_entry;	/* Target dx_entry in
+							 * indexed leaf */
+	struct ocfs2_dx_hinfo		dl_hinfo;	/* Name hash results */
+
+	struct buffer_head		*dl_prev_leaf_bh;/* Previous entry in
+							  * dir free space
+							  * list. NULL if
+							  * previous entry is
+							  * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir,
+		     struct ocfs2_dir_lookup_result *lookup);
+int ocfs2_delete_entry(handle_t *handle,
+		       struct inode *dir,
+		       struct ocfs2_dir_lookup_result *res);
+int __ocfs2_add_entry(handle_t *handle,
+		      struct inode *dir,
+		      const char *name, int namelen,
+		      struct inode *inode, u64 blkno,
+		      struct buffer_head *parent_fe_bh,
+		      struct ocfs2_dir_lookup_result *lookup);
+static inline int ocfs2_add_entry(handle_t *handle,
+				  struct dentry *dentry,
+				  struct inode *inode, u64 blkno,
+				  struct buffer_head *parent_fe_bh,
+				  struct ocfs2_dir_lookup_result *lookup)
+{
+	return __ocfs2_add_entry(handle, d_inode(dentry->d_parent),
+				 dentry->d_name.name, dentry->d_name.len,
+				 inode, blkno, parent_fe_bh, lookup);
+}
+int ocfs2_update_entry(struct inode *dir, handle_t *handle,
+		       struct ocfs2_dir_lookup_result *res,
+		       struct inode *new_entry_inode);
+
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen);
+int ocfs2_empty_dir(struct inode *inode);
+
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct ocfs2_dir_lookup_result *res);
+int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
+			       int namelen, u64 *blkno);
+int ocfs2_readdir(struct file *file, struct dir_context *ctx);
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct ocfs2_dir_lookup_result *lookup);
+struct ocfs2_alloc_context;
+int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+		       handle_t *handle,
+		       struct inode *parent,
+		       struct inode *inode,
+		       struct buffer_head *fe_bh,
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
+
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+							    void *data);
+#endif /* OCFS2_DIR_H */
diff --git a/kernel/fs/ocfs2/dlm/Makefile b/kernel/fs/ocfs2/dlm/Makefile
new file mode 100644
index 000000000..bd1aab1f4
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/Makefile
@@ -0,0 +1,7 @@
+ccflags-y := -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
+
+ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
+	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
+
diff --git a/kernel/fs/ocfs2/dlm/dlmapi.h b/kernel/fs/ocfs2/dlm/dlmapi.h
new file mode 100644
index 000000000..3cfa114aa
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmapi.h
@@ -0,0 +1,220 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmapi.h
+ *
+ * externally exported dlm interfaces
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMAPI_H
+#define DLMAPI_H
+
+struct dlm_lock;
+struct dlm_ctxt;
+
+/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
+enum dlm_status {
+	DLM_NORMAL = 0,           /*  0: request in progress */
+	DLM_GRANTED,              /*  1: request granted */
+	DLM_DENIED,               /*  2: request denied */
+	DLM_DENIED_NOLOCKS,       /*  3: request denied, out of system resources */
+	DLM_WORKING,              /*  4: async request in progress */
+	DLM_BLOCKED,              /*  5: lock request blocked */
+	DLM_BLOCKED_ORPHAN,       /*  6: lock request blocked by a orphan lock*/
+	DLM_DENIED_GRACE_PERIOD,  /*  7: topological change in progress */
+	DLM_SYSERR,               /*  8: system error */
+	DLM_NOSUPPORT,            /*  9: unsupported */
+	DLM_CANCELGRANT,          /* 10: can't cancel convert: already granted */
+	DLM_IVLOCKID,             /* 11: bad lockid */
+	DLM_SYNC,                 /* 12: synchronous request granted */
+	DLM_BADTYPE,              /* 13: bad resource type */
+	DLM_BADRESOURCE,          /* 14: bad resource handle */
+	DLM_MAXHANDLES,           /* 15: no more resource handles */
+	DLM_NOCLINFO,             /* 16: can't contact cluster manager */
+	DLM_NOLOCKMGR,            /* 17: can't contact lock manager */
+	DLM_NOPURGED,             /* 18: can't contact purge daemon */
+	DLM_BADARGS,              /* 19: bad api args */
+	DLM_VOID,                 /* 20: no status */
+	DLM_NOTQUEUED,            /* 21: NOQUEUE was specified and request failed */
+	DLM_IVBUFLEN,             /* 22: invalid resource name length */
+	DLM_CVTUNGRANT,           /* 23: attempted to convert ungranted lock */
+	DLM_BADPARAM,             /* 24: invalid lock mode specified */
+	DLM_VALNOTVALID,          /* 25: value block has been invalidated */
+	DLM_REJECTED,             /* 26: request rejected, unrecognized client */
+	DLM_ABORT,                /* 27: blocked lock request cancelled */
+	DLM_CANCEL,               /* 28: conversion request cancelled */
+	DLM_IVRESHANDLE,          /* 29: invalid resource handle */
+	DLM_DEADLOCK,             /* 30: deadlock recovery refused this request */
+	DLM_DENIED_NOASTS,        /* 31: failed to allocate AST */
+	DLM_FORWARD,              /* 32: request must wait for primary's response */
+	DLM_TIMEOUT,              /* 33: timeout value for lock has expired */
+	DLM_IVGROUPID,            /* 34: invalid group specification */
+	DLM_VERS_CONFLICT,        /* 35: version conflicts prevent request handling */
+	DLM_BAD_DEVICE_PATH,      /* 36: Locks device does not exist or path wrong */
+	DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient pers for device */
+	DLM_NO_CONTROL_DEVICE,    /* 38: Cannot set options on opened device */
+
+	DLM_RECOVERING,           /* 39: extension, allows caller to fail a lock
+				     request if it is being recovered */
+	DLM_MIGRATING,            /* 40: extension, allows caller to fail a lock
+				     request if it is being migrated */
+	DLM_MAXSTATS,             /* 41: upper limit for return code validation */
+};
+
+/* for pretty-printing dlm_status error messages */
+const char *dlm_errmsg(enum dlm_status err);
+/* for pretty-printing dlm_status error names */
+const char *dlm_errname(enum dlm_status err);
+
+/* Eventually the DLM will use standard errno values, but in the
+ * meantime this lets us track dlm errors as they bubble up. When we
+ * bring its error reporting into line with the rest of the stack,
+ * these can just be replaced with calls to mlog_errno. */
+#define dlm_error(st) do {						\
+	if ((st) != DLM_RECOVERING &&					\
+	    (st) != DLM_MIGRATING &&					\
+	    (st) != DLM_FORWARD)					\
+		mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st)));	\
+} while (0)
+
+#define DLM_LKSB_UNUSED1           0x01
+#define DLM_LKSB_PUT_LVB           0x02
+#define DLM_LKSB_GET_LVB           0x04
+#define DLM_LKSB_UNUSED2           0x08
+#define DLM_LKSB_UNUSED3           0x10
+#define DLM_LKSB_UNUSED4           0x20
+#define DLM_LKSB_UNUSED5           0x40
+#define DLM_LKSB_UNUSED6           0x80
+
+#define DLM_LVB_LEN  64
+
+/* Callers are only allowed access to the lvb and status members of
+ * this struct. */
+struct dlm_lockstatus {
+	enum dlm_status status;
+	u32 flags;
+	struct dlm_lock *lockid;
+	char lvb[DLM_LVB_LEN];
+};
+
+/* Valid lock modes. */
+#define LKM_IVMODE      (-1)            /* invalid mode */
+#define LKM_NLMODE      0               /* null lock */
+#define LKM_CRMODE      1               /* concurrent read    unsupported */
+#define LKM_CWMODE      2               /* concurrent write   unsupported */
+#define LKM_PRMODE      3               /* protected read */
+#define LKM_PWMODE      4               /* protected write    unsupported */
+#define LKM_EXMODE      5               /* exclusive */
+#define LKM_MAXMODE     5
+#define LKM_MODEMASK    0xff
+
+/* Flags passed to dlmlock and dlmunlock:
+ * reserved: flags used by the "real" dlm
+ * only a few are supported by this dlm
+ * (U) = unsupported by ocfs2 dlm */
+#define LKM_ORPHAN       0x00000010  /* this lock is orphanable (U) */
+#define LKM_PARENTABLE   0x00000020  /* this lock was orphaned (U) */
+#define LKM_BLOCK        0x00000040  /* blocking lock request (U) */
+#define LKM_LOCAL        0x00000080  /* local lock request */
+#define LKM_VALBLK       0x00000100  /* lock value block request */
+#define LKM_NOQUEUE      0x00000200  /* non blocking request */
+#define LKM_CONVERT      0x00000400  /* conversion request */
+#define LKM_NODLCKWT     0x00000800  /* this lock wont deadlock (U) */
+#define LKM_UNLOCK       0x00001000  /* deallocate this lock */
+#define LKM_CANCEL       0x00002000  /* cancel conversion request */
+#define LKM_DEQALL       0x00004000  /* remove all locks held by proc (U) */
+#define LKM_INVVALBLK    0x00008000  /* invalidate lock value block */
+#define LKM_SYNCSTS      0x00010000  /* return synchronous status if poss (U) */
+#define LKM_TIMEOUT      0x00020000  /* lock request contains timeout (U) */
+#define LKM_SNGLDLCK     0x00040000  /* request can self-deadlock (U) */
+#define LKM_FINDLOCAL    0x00080000  /* find local lock request (U) */
+#define LKM_PROC_OWNED   0x00100000  /* owned by process, not group (U) */
+#define LKM_XID          0x00200000  /* use transaction id for deadlock (U) */
+#define LKM_XID_CONFLICT 0x00400000  /* do not allow lock inheritance (U) */
+#define LKM_FORCE        0x00800000  /* force unlock flag */
+#define LKM_REVVALBLK    0x01000000  /* temporary solution: re-validate
+					lock value block (U) */
+/* unused */
+#define LKM_UNUSED1      0x00000001  /* unused */
+#define LKM_UNUSED2      0x00000002  /* unused */
+#define LKM_UNUSED3      0x00000004  /* unused */
+#define LKM_UNUSED4      0x00000008  /* unused */
+#define LKM_UNUSED5      0x02000000  /* unused */
+#define LKM_UNUSED6      0x04000000  /* unused */
+#define LKM_UNUSED7      0x08000000  /* unused */
+
+/* ocfs2 extensions: internal only
+ * should never be used by caller */
+#define LKM_MIGRATION    0x10000000  /* extension: lockres is to be migrated
+					to another node */
+#define LKM_PUT_LVB      0x20000000  /* extension: lvb is being passed
+					should be applied to lockres */
+#define LKM_GET_LVB      0x40000000  /* extension: lvb should be copied
+					from lockres when lock is granted */
+#define LKM_RECOVERY     0x80000000  /* extension: flag for recovery lock
+					used to avoid recovery rwsem */
+
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void (dlm_bastlockfunc_t)(void *, int);
+typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
+
+enum dlm_status dlmlock(struct dlm_ctxt *dlm,
+			int mode,
+			struct dlm_lockstatus *lksb,
+			int flags,
+			const char *name,
+			int namelen,
+			dlm_astlockfunc_t *ast,
+			void *data,
+			dlm_bastlockfunc_t *bast);
+
+enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
+			  struct dlm_lockstatus *lksb,
+			  int flags,
+			  dlm_astunlockfunc_t *unlockast,
+			  void *data);
+
+struct dlm_protocol_version {
+	u8 pv_major;
+	u8 pv_minor;
+};
+struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key,
+				      struct dlm_protocol_version *fs_proto);
+
+void dlm_unregister_domain(struct dlm_ctxt *dlm);
+
+void dlm_print_one_lock(struct dlm_lock *lockid);
+
+typedef void (dlm_eviction_func)(int, void *);
+struct dlm_eviction_cb {
+	struct list_head        ec_item;
+	dlm_eviction_func       *ec_func;
+	void                    *ec_data;
+};
+void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
+			   dlm_eviction_func *f,
+			   void *data);
+void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
+			      struct dlm_eviction_cb *cb);
+void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
+
+#endif /* DLMAPI_H */
diff --git a/kernel/fs/ocfs2/dlm/dlmast.c b/kernel/fs/ocfs2/dlm/dlmast.c
new file mode 100644
index 000000000..fd6bbbbd7
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmast.c
@@ -0,0 +1,504 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmast.c
+ *
+ * AST and BAST functionality for local and remote nodes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock);
+static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+
+/* Should be called as an ast gets queued to see if the new
+ * lock level will obsolete a pending bast.
+ * For example, if dlm_thread queued a bast for an EX lock that
+ * was blocking another EX, but before sending the bast the
+ * lock owner downconverted to NL, the bast is now obsolete.
+ * Only the ast should be sent.
+ * This is needed because the lock and convert paths can queue
+ * asts out-of-band (not waiting for dlm_thread) in order to
+ * allow for LKM_NOQUEUE to get immediate responses. */
+static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	assert_spin_locked(&dlm->ast_lock);
+	assert_spin_locked(&lock->spinlock);
+
+	if (lock->ml.highest_blocked == LKM_IVMODE)
+		return 0;
+	BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
+
+	if (lock->bast_pending &&
+	    list_empty(&lock->bast_list))
+		/* old bast already sent, ok */
+		return 0;
+
+	if (lock->ml.type == LKM_EXMODE)
+		/* EX blocks anything left, any bast still valid */
+		return 0;
+	else if (lock->ml.type == LKM_NLMODE)
+		/* NL blocks nothing, no reason to send any bast, cancel it */
+		return 1;
+	else if (lock->ml.highest_blocked != LKM_EXMODE)
+		/* PR only blocks EX */
+		return 1;
+
+	return 0;
+}
+
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	struct dlm_lock_resource *res;
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	res = lock->lockres;
+
+	assert_spin_locked(&dlm->ast_lock);
+
+	if (!list_empty(&lock->ast_list)) {
+		mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
+		     "AST list not empty, pending %d, newlevel %d\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+		     lock->ast_pending, lock->ml.type);
+		BUG();
+	}
+	if (lock->ast_pending)
+		mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
+
+	/* putting lock on list, add a ref */
+	dlm_lock_get(lock);
+	spin_lock(&lock->spinlock);
+
+	/* check to see if this ast obsoletes the bast */
+	if (dlm_should_cancel_bast(dlm, lock)) {
+		mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
+		lock->bast_pending = 0;
+		list_del_init(&lock->bast_list);
+		lock->ml.highest_blocked = LKM_IVMODE;
+		/* removing lock from list, remove a ref.  guaranteed
+		 * this won't be the last ref because of the get above,
+		 * so res->spinlock will not be taken here */
+		dlm_lock_put(lock);
+		/* free up the reserved bast that we are cancelling.
+		 * guaranteed that this will not be the last reserved
+		 * ast because *both* an ast and a bast were reserved
+		 * to get to this point.  the res->spinlock will not be
+		 * taken here */
+		dlm_lockres_release_ast(dlm, res);
+	}
+	list_add_tail(&lock->ast_list, &dlm->pending_asts);
+	lock->ast_pending = 1;
+	spin_unlock(&lock->spinlock);
+}
+
+void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	spin_lock(&dlm->ast_lock);
+	__dlm_queue_ast(dlm, lock);
+	spin_unlock(&dlm->ast_lock);
+}
+
+
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	struct dlm_lock_resource *res;
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	assert_spin_locked(&dlm->ast_lock);
+
+	res = lock->lockres;
+
+	BUG_ON(!list_empty(&lock->bast_list));
+	if (lock->bast_pending)
+		mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
+
+	/* putting lock on list, add a ref */
+	dlm_lock_get(lock);
+	spin_lock(&lock->spinlock);
+	list_add_tail(&lock->bast_list, &dlm->pending_basts);
+	lock->bast_pending = 1;
+	spin_unlock(&lock->spinlock);
+}
+
+void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	spin_lock(&dlm->ast_lock);
+	__dlm_queue_bast(dlm, lock);
+	spin_unlock(&dlm->ast_lock);
+}
+
+static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock)
+{
+	struct dlm_lockstatus *lksb = lock->lksb;
+	BUG_ON(!lksb);
+
+	/* only updates if this node masters the lockres */
+	spin_lock(&res->spinlock);
+	if (res->owner == dlm->node_num) {
+		/* check the lksb flags for the direction */
+		if (lksb->flags & DLM_LKSB_GET_LVB) {
+			mlog(0, "getting lvb from lockres for %s node\n",
+				  lock->ml.node == dlm->node_num ? "master" :
+				  "remote");
+			memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
+		}
+		/* Do nothing for lvb put requests - they should be done in
+ 		 * place when the lock is downconverted - otherwise we risk
+ 		 * racing gets and puts which could result in old lvb data
+ 		 * being propagated. We leave the put flag set and clear it
+ 		 * here. In the future we might want to clear it at the time
+ 		 * the put is actually done.
+		 */
+	}
+	spin_unlock(&res->spinlock);
+
+	/* reset any lvb flags on the lksb */
+	lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
+}
+
+void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		      struct dlm_lock *lock)
+{
+	dlm_astlockfunc_t *fn;
+	struct dlm_lockstatus *lksb;
+
+	mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
+	     res->lockname.len, res->lockname.name,
+	     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
+
+	lksb = lock->lksb;
+	fn = lock->ast;
+	BUG_ON(lock->ml.node != dlm->node_num);
+
+	dlm_update_lvb(dlm, res, lock);
+	(*fn)(lock->astdata);
+}
+
+
+int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		      struct dlm_lock *lock)
+{
+	int ret;
+	struct dlm_lockstatus *lksb;
+	int lksbflags;
+
+	mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
+	     res->lockname.len, res->lockname.name,
+	     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
+
+	lksb = lock->lksb;
+	BUG_ON(lock->ml.node == dlm->node_num);
+
+	lksbflags = lksb->flags;
+	dlm_update_lvb(dlm, res, lock);
+
+	/* lock request came from another node
+	 * go do the ast over there */
+	ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
+	return ret;
+}
+
+void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		       struct dlm_lock *lock, int blocked_type)
+{
+	dlm_bastlockfunc_t *fn = lock->bast;
+
+	BUG_ON(lock->ml.node != dlm->node_num);
+
+	mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
+	     dlm->name, res->lockname.len, res->lockname.name,
+	     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+	     blocked_type);
+
+	(*fn)(lock->astdata, blocked_type);
+}
+
+
+
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
+			  void **ret_data)
+{
+	int ret;
+	unsigned int locklen;
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
+	char *name;
+	struct list_head *head = NULL;
+	__be64 cookie;
+	u32 flags;
+	u8 node;
+
+	if (!dlm_grab(dlm)) {
+		dlm_error(DLM_REJECTED);
+		return DLM_REJECTED;
+	}
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	name = past->name;
+	locklen = past->namelen;
+	cookie = past->cookie;
+	flags = be32_to_cpu(past->flags);
+	node = past->node_idx;
+
+	if (locklen > DLM_LOCKID_NAME_MAX) {
+		ret = DLM_IVBUFLEN;
+		mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+		     "handler!\n", locklen);
+		goto leave;
+	}
+
+	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
+	     (LKM_PUT_LVB|LKM_GET_LVB)) {
+		mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+		     flags);
+		ret = DLM_BADARGS;
+		goto leave;
+	}
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
+		  (flags & LKM_GET_LVB ? "get lvb" : "none"));
+
+	mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
+
+	if (past->type != DLM_AST &&
+	    past->type != DLM_BAST) {
+		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
+		     "name=%.*s, node=%u\n", past->type,
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     locklen, name, node);
+		ret = DLM_IVLOCKID;
+		goto leave;
+	}
+
+	res = dlm_lookup_lockres(dlm, name, locklen);
+	if (!res) {
+		mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
+		     "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     locklen, name, node);
+		ret = DLM_IVLOCKID;
+		goto leave;
+	}
+
+	/* cannot get a proxy ast message if this node owns it */
+	BUG_ON(res->owner == dlm->node_num);
+
+	mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+	     res->lockname.name);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		mlog(0, "Responding with DLM_RECOVERING!\n");
+		ret = DLM_RECOVERING;
+		goto unlock_out;
+	}
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "Responding with DLM_MIGRATING!\n");
+		ret = DLM_MIGRATING;
+		goto unlock_out;
+	}
+	/* try convert queue for both ast/bast */
+	head = &res->converting;
+	lock = NULL;
+	list_for_each_entry(lock, head, list) {
+		if (lock->ml.cookie == cookie)
+			goto do_ast;
+	}
+
+	/* if not on convert, try blocked for ast, granted for bast */
+	if (past->type == DLM_AST)
+		head = &res->blocked;
+	else
+		head = &res->granted;
+
+	list_for_each_entry(lock, head, list) {
+		/* if lock is found but unlock is pending ignore the bast */
+		if (lock->ml.cookie == cookie) {
+			if (lock->unlock_pending)
+				break;
+			goto do_ast;
+		}
+	}
+
+	mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
+	     "node=%u\n", past->type == DLM_AST ? "" : "b",
+	     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+	     locklen, name, node);
+
+	ret = DLM_NORMAL;
+unlock_out:
+	spin_unlock(&res->spinlock);
+	goto leave;
+
+do_ast:
+	ret = DLM_NORMAL;
+	if (past->type == DLM_AST) {
+		/* do not alter lock refcount.  switching lists. */
+		list_move_tail(&lock->list, &res->granted);
+		mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     lock->ml.type, lock->ml.convert_type);
+
+		if (lock->ml.convert_type != LKM_IVMODE) {
+			lock->ml.type = lock->ml.convert_type;
+			lock->ml.convert_type = LKM_IVMODE;
+		} else {
+			// should already be there....
+		}
+
+		lock->lksb->status = DLM_NORMAL;
+
+		/* if we requested the lvb, fetch it into our lksb now */
+		if (flags & LKM_GET_LVB) {
+			BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
+			memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
+		}
+	}
+	spin_unlock(&res->spinlock);
+
+	if (past->type == DLM_AST)
+		dlm_do_local_ast(dlm, res, lock);
+	else
+		dlm_do_local_bast(dlm, res, lock, past->blocked_type);
+
+leave:
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+	return ret;
+}
+
+
+
+int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock, int msg_type,
+			   int blocked_type, int flags)
+{
+	int ret = 0;
+	struct dlm_proxy_ast past;
+	struct kvec vec[2];
+	size_t veclen = 1;
+	int status;
+
+	mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
+	     res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
+	     blocked_type);
+
+	memset(&past, 0, sizeof(struct dlm_proxy_ast));
+	past.node_idx = dlm->node_num;
+	past.type = msg_type;
+	past.blocked_type = blocked_type;
+	past.namelen = res->lockname.len;
+	memcpy(past.name, res->lockname.name, past.namelen);
+	past.cookie = lock->ml.cookie;
+
+	vec[0].iov_len = sizeof(struct dlm_proxy_ast);
+	vec[0].iov_base = &past;
+	if (flags & DLM_LKSB_GET_LVB) {
+		be32_add_cpu(&past.flags, LKM_GET_LVB);
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
+				     lock->ml.node, &status);
+	if (ret < 0)
+		mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
+		     dlm->name, res->lockname.len, res->lockname.name, ret,
+		     lock->ml.node);
+	else {
+		if (status == DLM_RECOVERING) {
+			mlog(ML_ERROR, "sent AST to node %u, it thinks this "
+			     "node is dead!\n", lock->ml.node);
+			BUG();
+		} else if (status == DLM_MIGRATING) {
+			mlog(ML_ERROR, "sent AST to node %u, it returned "
+			     "DLM_MIGRATING!\n", lock->ml.node);
+			BUG();
+		} else if (status != DLM_NORMAL && status != DLM_IVLOCKID) {
+			mlog(ML_ERROR, "AST to node %u returned %d!\n",
+			     lock->ml.node, status);
+			/* ignore it */
+		}
+		ret = 0;
+	}
+	return ret;
+}
diff --git a/kernel/fs/ocfs2/dlm/dlmcommon.h b/kernel/fs/ocfs2/dlm/dlmcommon.h
new file mode 100644
index 000000000..fae17c640
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmcommon.h
@@ -0,0 +1,1150 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmcommon.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMCOMMON_H
+#define DLMCOMMON_H
+
+#include <linux/kref.h>
+
+#define DLM_HB_NODE_DOWN_PRI     (0xf000000)
+#define DLM_HB_NODE_UP_PRI       (0x8000000)
+
+#define DLM_LOCKID_NAME_MAX    32
+
+#define DLM_DOMAIN_NAME_MAX_LEN    255
+#define DLM_LOCK_RES_OWNER_UNKNOWN     O2NM_MAX_NODES
+#define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
+#define DLM_THREAD_MS                  200   // flush at least every 200 ms
+
+#define DLM_HASH_SIZE_DEFAULT	(1 << 17)
+#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
+# define DLM_HASH_PAGES		1
+#else
+# define DLM_HASH_PAGES		(DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
+#endif
+#define DLM_BUCKETS_PER_PAGE	(PAGE_SIZE / sizeof(struct hlist_head))
+#define DLM_HASH_BUCKETS	(DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
+
+/* Intended to make it easier for us to switch out hash functions */
+#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
+
+enum dlm_mle_type {
+	DLM_MLE_BLOCK = 0,
+	DLM_MLE_MASTER = 1,
+	DLM_MLE_MIGRATION = 2,
+	DLM_MLE_NUM_TYPES = 3,
+};
+
+struct dlm_master_list_entry {
+	struct hlist_node master_hash_node;
+	struct list_head hb_events;
+	struct dlm_ctxt *dlm;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	atomic_t woken;
+	struct kref mle_refs;
+	int inuse;
+	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	u8 master;
+	u8 new_master;
+	enum dlm_mle_type type;
+	struct o2hb_callback_func mle_hb_up;
+	struct o2hb_callback_func mle_hb_down;
+	struct dlm_lock_resource *mleres;
+	unsigned char mname[DLM_LOCKID_NAME_MAX];
+	unsigned int mnamelen;
+	unsigned int mnamehash;
+};
+
+enum dlm_ast_type {
+	DLM_AST = 0,
+	DLM_BAST = 1,
+	DLM_ASTUNLOCK = 2,
+};
+
+
+#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
+			 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
+			 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
+
+#define DLM_RECOVERY_LOCK_NAME       "$RECOVERY"
+#define DLM_RECOVERY_LOCK_NAME_LEN   9
+
+static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
+{
+	if (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
+	    memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0)
+		return 1;
+	return 0;
+}
+
+#define DLM_RECO_STATE_ACTIVE    0x0001
+#define DLM_RECO_STATE_FINALIZE  0x0002
+
+struct dlm_recovery_ctxt
+{
+	struct list_head resources;
+	struct list_head node_data;
+	u8  new_master;
+	u8  dead_node;
+	u16 state;
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	wait_queue_head_t event;
+};
+
+enum dlm_ctxt_state {
+	DLM_CTXT_NEW = 0,
+	DLM_CTXT_JOINED = 1,
+	DLM_CTXT_IN_SHUTDOWN = 2,
+	DLM_CTXT_LEAVING = 3,
+};
+
+struct dlm_ctxt
+{
+	struct list_head list;
+	struct hlist_head **lockres_hash;
+	struct list_head dirty_list;
+	struct list_head purge_list;
+	struct list_head pending_asts;
+	struct list_head pending_basts;
+	struct list_head tracking_list;
+	unsigned int purge_count;
+	spinlock_t spinlock;
+	spinlock_t ast_lock;
+	spinlock_t track_lock;
+	char *name;
+	u8 node_num;
+	u32 key;
+	u8  joining_node;
+	wait_queue_head_t dlm_join_events;
+	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct dlm_recovery_ctxt reco;
+	spinlock_t master_lock;
+	struct hlist_head **master_hash;
+	struct list_head mle_hb_events;
+
+	/* these give a really vague idea of the system load */
+	atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+	atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+	atomic_t res_tot_count;
+	atomic_t res_cur_count;
+
+	struct dlm_debug_ctxt *dlm_debug_ctxt;
+	struct dentry *dlm_debugfs_subroot;
+
+	/* NOTE: Next three are protected by dlm_domain_lock */
+	struct kref dlm_refs;
+	enum dlm_ctxt_state dlm_state;
+	unsigned int num_joins;
+
+	struct o2hb_callback_func dlm_hb_up;
+	struct o2hb_callback_func dlm_hb_down;
+	struct task_struct *dlm_thread_task;
+	struct task_struct *dlm_reco_thread_task;
+	struct workqueue_struct *dlm_worker;
+	wait_queue_head_t dlm_thread_wq;
+	wait_queue_head_t dlm_reco_thread_wq;
+	wait_queue_head_t ast_wq;
+	wait_queue_head_t migration_wq;
+
+	struct work_struct dispatched_work;
+	struct list_head work_list;
+	spinlock_t work_lock;
+	struct list_head dlm_domain_handlers;
+	struct list_head	dlm_eviction_callbacks;
+
+	/* The filesystem specifies this at domain registration.  We
+	 * cache it here to know what to tell other nodes. */
+	struct dlm_protocol_version fs_locking_proto;
+	/* This is the inter-dlm communication version */
+	struct dlm_protocol_version dlm_locking_proto;
+};
+
+static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i)
+{
+	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
+}
+
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+						 unsigned i)
+{
+	return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+			(i % DLM_BUCKETS_PER_PAGE);
+}
+
+/* these keventd work queue items are for less-frequently
+ * called functions that cannot be directly called from the
+ * net message handlers for some reason, usually because
+ * they need to send net messages of their own. */
+void dlm_dispatch_work(struct work_struct *work);
+
+struct dlm_lock_resource;
+struct dlm_work_item;
+
+typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *);
+
+struct dlm_request_all_locks_priv
+{
+	u8 reco_master;
+	u8 dead_node;
+};
+
+struct dlm_mig_lockres_priv
+{
+	struct dlm_lock_resource *lockres;
+	u8 real_master;
+	u8 extra_ref;
+};
+
+struct dlm_assert_master_priv
+{
+	struct dlm_lock_resource *lockres;
+	u8 request_from;
+	u32 flags;
+	unsigned ignore_higher:1;
+};
+
+struct dlm_deref_lockres_priv
+{
+	struct dlm_lock_resource *deref_res;
+	u8 deref_node;
+};
+
+struct dlm_work_item
+{
+	struct list_head list;
+	dlm_workfunc_t *func;
+	struct dlm_ctxt *dlm;
+	void *data;
+	union {
+		struct dlm_request_all_locks_priv ral;
+		struct dlm_mig_lockres_priv ml;
+		struct dlm_assert_master_priv am;
+		struct dlm_deref_lockres_priv dl;
+	} u;
+};
+
+static inline void dlm_init_work_item(struct dlm_ctxt *dlm,
+				      struct dlm_work_item *i,
+				      dlm_workfunc_t *f, void *data)
+{
+	memset(i, 0, sizeof(*i));
+	i->func = f;
+	INIT_LIST_HEAD(&i->list);
+	i->data = data;
+	i->dlm = dlm;  /* must have already done a dlm_grab on this! */
+}
+
+
+
+static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
+					  u8 node)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	dlm->joining_node = node;
+	wake_up(&dlm->dlm_join_events);
+}
+
+#define DLM_LOCK_RES_UNINITED             0x00000001
+#define DLM_LOCK_RES_RECOVERING           0x00000002
+#define DLM_LOCK_RES_READY                0x00000004
+#define DLM_LOCK_RES_DIRTY                0x00000008
+#define DLM_LOCK_RES_IN_PROGRESS          0x00000010
+#define DLM_LOCK_RES_MIGRATING            0x00000020
+#define DLM_LOCK_RES_DROPPING_REF         0x00000040
+#define DLM_LOCK_RES_BLOCK_DIRTY          0x00001000
+#define DLM_LOCK_RES_SETREF_INPROG        0x00002000
+
+/* max milliseconds to wait to sync up a network failure with a node death */
+#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
+
+#define DLM_PURGE_INTERVAL_MS   (8 * 1000)
+
+struct dlm_lock_resource
+{
+	/* WARNING: Please see the comment in dlm_init_lockres before
+	 * adding fields here. */
+	struct hlist_node hash_node;
+	struct qstr lockname;
+	struct kref      refs;
+
+	/*
+	 * Please keep granted, converting, and blocked in this order,
+	 * as some funcs want to iterate over all lists.
+	 *
+	 * All four lists are protected by the hash's reference.
+	 */
+	struct list_head granted;
+	struct list_head converting;
+	struct list_head blocked;
+	struct list_head purge;
+
+	/*
+	 * These two lists require you to hold an additional reference
+	 * while they are on the list.
+	 */
+	struct list_head dirty;
+	struct list_head recovering; // dlm_recovery_ctxt.resources list
+
+	/* Added during init and removed during release */
+	struct list_head tracking;	/* dlm->tracking_list */
+
+	/* unused lock resources have their last_used stamped and are
+	 * put on a list for the dlm thread to run. */
+	unsigned long    last_used;
+
+	struct dlm_ctxt *dlm;
+
+	unsigned migration_pending:1;
+	atomic_t asts_reserved;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	u8  owner;              //node which owns the lock resource, or unknown
+	u16 state;
+	char lvb[DLM_LVB_LEN];
+	unsigned int inflight_locks;
+	unsigned int inflight_assert_workers;
+	unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+struct dlm_migratable_lock
+{
+	__be64 cookie;
+
+	/* these 3 are just padding for the in-memory structure, but
+	 * list and flags are actually used when sent over the wire */
+	__be16 pad1;
+	u8 list;  // 0=granted, 1=converting, 2=blocked
+	u8 flags;
+
+	s8 type;
+	s8 convert_type;
+	s8 highest_blocked;
+	u8 node;
+};  // 16 bytes
+
+struct dlm_lock
+{
+	struct dlm_migratable_lock ml;
+
+	struct list_head list;
+	struct list_head ast_list;
+	struct list_head bast_list;
+	struct dlm_lock_resource *lockres;
+	spinlock_t spinlock;
+	struct kref lock_refs;
+
+	// ast and bast must be callable while holding a spinlock!
+	dlm_astlockfunc_t *ast;
+	dlm_bastlockfunc_t *bast;
+	void *astdata;
+	struct dlm_lockstatus *lksb;
+	unsigned ast_pending:1,
+		 bast_pending:1,
+		 convert_pending:1,
+		 lock_pending:1,
+		 cancel_pending:1,
+		 unlock_pending:1,
+		 lksb_kernel_allocated:1;
+};
+
+
+#define DLM_LKSB_UNUSED1           0x01
+#define DLM_LKSB_PUT_LVB           0x02
+#define DLM_LKSB_GET_LVB           0x04
+#define DLM_LKSB_UNUSED2           0x08
+#define DLM_LKSB_UNUSED3           0x10
+#define DLM_LKSB_UNUSED4           0x20
+#define DLM_LKSB_UNUSED5           0x40
+#define DLM_LKSB_UNUSED6           0x80
+
+
+enum dlm_lockres_list {
+	DLM_GRANTED_LIST = 0,
+	DLM_CONVERTING_LIST = 1,
+	DLM_BLOCKED_LIST = 2,
+};
+
+static inline int dlm_lvb_is_empty(char *lvb)
+{
+	int i;
+	for (i=0; i<DLM_LVB_LEN; i++)
+		if (lvb[i])
+			return 0;
+	return 1;
+}
+
+static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
+{
+	if (idx == DLM_GRANTED_LIST)
+		return "granted";
+	else if (idx == DLM_CONVERTING_LIST)
+		return "converting";
+	else if (idx == DLM_BLOCKED_LIST)
+		return "blocked";
+	else
+		return "unknown";
+}
+
+static inline struct list_head *
+dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
+{
+	struct list_head *ret = NULL;
+	if (idx == DLM_GRANTED_LIST)
+		ret = &res->granted;
+	else if (idx == DLM_CONVERTING_LIST)
+		ret = &res->converting;
+	else if (idx == DLM_BLOCKED_LIST)
+		ret = &res->blocked;
+	else
+		BUG();
+	return ret;
+}
+
+
+
+
+struct dlm_node_iter
+{
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int curnode;
+};
+
+
+enum {
+	DLM_MASTER_REQUEST_MSG		= 500,
+	DLM_UNUSED_MSG1			= 501,
+	DLM_ASSERT_MASTER_MSG		= 502,
+	DLM_CREATE_LOCK_MSG		= 503,
+	DLM_CONVERT_LOCK_MSG		= 504,
+	DLM_PROXY_AST_MSG		= 505,
+	DLM_UNLOCK_LOCK_MSG		= 506,
+	DLM_DEREF_LOCKRES_MSG		= 507,
+	DLM_MIGRATE_REQUEST_MSG		= 508,
+	DLM_MIG_LOCKRES_MSG		= 509,
+	DLM_QUERY_JOIN_MSG		= 510,
+	DLM_ASSERT_JOINED_MSG		= 511,
+	DLM_CANCEL_JOIN_MSG		= 512,
+	DLM_EXIT_DOMAIN_MSG		= 513,
+	DLM_MASTER_REQUERY_MSG		= 514,
+	DLM_LOCK_REQUEST_MSG		= 515,
+	DLM_RECO_DATA_DONE_MSG		= 516,
+	DLM_BEGIN_RECO_MSG		= 517,
+	DLM_FINALIZE_RECO_MSG		= 518,
+	DLM_QUERY_REGION		= 519,
+	DLM_QUERY_NODEINFO		= 520,
+	DLM_BEGIN_EXIT_DOMAIN_MSG	= 521,
+};
+
+struct dlm_reco_node_data
+{
+	int state;
+	u8 node_num;
+	struct list_head list;
+};
+
+enum {
+	DLM_RECO_NODE_DATA_DEAD = -1,
+	DLM_RECO_NODE_DATA_INIT = 0,
+	DLM_RECO_NODE_DATA_REQUESTING = 1,
+	DLM_RECO_NODE_DATA_REQUESTED = 2,
+	DLM_RECO_NODE_DATA_RECEIVING = 3,
+	DLM_RECO_NODE_DATA_DONE = 4,
+	DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
+};
+
+
+enum {
+	DLM_MASTER_RESP_NO = 0,
+	DLM_MASTER_RESP_YES = 1,
+	DLM_MASTER_RESP_MAYBE = 2,
+	DLM_MASTER_RESP_ERROR = 3,
+};
+
+
+struct dlm_master_request
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_ASSERT_RESPONSE_REASSERT       0x00000001
+#define DLM_ASSERT_RESPONSE_MASTERY_REF    0x00000002
+
+#define DLM_ASSERT_MASTER_MLE_CLEANUP      0x00000001
+#define DLM_ASSERT_MASTER_REQUERY          0x00000002
+#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
+struct dlm_assert_master
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MIGRATE_RESPONSE_MASTERY_REF   0x00000001
+
+struct dlm_migrate_request
+{
+	u8 master;
+	u8 new_master;
+	u8 namelen;
+	u8 pad1;
+	__be32 pad2;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_master_requery
+{
+	u8 pad1;
+	u8 pad2;
+	u8 node_idx;
+	u8 namelen;
+	__be32 pad3;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MRES_RECOVERY   0x01
+#define DLM_MRES_MIGRATION  0x02
+#define DLM_MRES_ALL_DONE   0x04
+
+/*
+ * We would like to get one whole lockres into a single network
+ * message whenever possible.  Generally speaking, there will be
+ * at most one dlm_lock on a lockres for each node in the cluster,
+ * plus (infrequently) any additional locks coming in from userdlm.
+ *
+ * struct _dlm_lockres_page
+ * {
+ * 	dlm_migratable_lockres mres;
+ * 	dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS];
+ * 	u8 pad[DLM_MIG_LOCKRES_RESERVED];
+ * };
+ *
+ * from ../cluster/tcp.h
+ *    NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(net_msg))
+ *    (roughly 4080 bytes)
+ * and sizeof(dlm_migratable_lockres) = 112 bytes
+ * and sizeof(dlm_migratable_lock) = 16 bytes
+ *
+ * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and
+ * DLM_MIG_LOCKRES_RESERVED=128 means we have this:
+ *
+ *  (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) +
+ *     sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED =
+ *        NET_MAX_PAYLOAD_BYTES
+ *  (240 * 16) + 112 + 128 = 4080
+ *
+ * So a lockres would need more than 240 locks before it would
+ * use more than one network packet to recover.  Not too bad.
+ */
+#define DLM_MAX_MIGRATABLE_LOCKS   240
+
+struct dlm_migratable_lockres
+{
+	u8 master;
+	u8 lockname_len;
+	u8 num_locks;    // locks sent in this structure
+	u8 flags;
+	__be32 total_locks; // locks to be sent for this migration cookie
+	__be64 mig_cookie;  // cookie for this lockres migration
+			 // or zero if not needed
+	// 16 bytes
+	u8 lockname[DLM_LOCKID_NAME_MAX];
+	// 48 bytes
+	u8 lvb[DLM_LVB_LEN];
+	// 112 bytes
+	struct dlm_migratable_lock ml[0];  // 16 bytes each, begins at byte 112
+};
+#define DLM_MIG_LOCKRES_MAX_LEN  \
+	(sizeof(struct dlm_migratable_lockres) + \
+	 (sizeof(struct dlm_migratable_lock) * \
+	  DLM_MAX_MIGRATABLE_LOCKS) )
+
+/* from above, 128 bytes
+ * for some undetermined future use */
+#define DLM_MIG_LOCKRES_RESERVED   (NET_MAX_PAYLOAD_BYTES - \
+				    DLM_MIG_LOCKRES_MAX_LEN)
+
+struct dlm_create_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 pad1;
+	u8 node_idx;
+	s8 requested_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_convert_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 pad1;
+	u8 node_idx;
+	s8 requested_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_CONVERT_LOCK_MAX_LEN  (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
+
+struct dlm_unlock_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	__be16 pad1;
+	u8 node_idx;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_UNLOCK_LOCK_MAX_LEN  (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
+
+struct dlm_proxy_ast
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 node_idx;
+	u8 type;
+	u8 blocked_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_PROXY_AST_MAX_LEN  (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
+
+#define DLM_MOD_KEY (0x666c6172)
+enum dlm_query_join_response_code {
+	JOIN_DISALLOW = 0,
+	JOIN_OK = 1,
+	JOIN_OK_NO_MAP = 2,
+	JOIN_PROTOCOL_MISMATCH = 3,
+};
+
+struct dlm_query_join_packet {
+	u8 code;	/* Response code.  dlm_minor and fs_minor
+			   are only valid if this is JOIN_OK */
+	u8 dlm_minor;	/* The minor version of the protocol the
+			   dlm is speaking. */
+	u8 fs_minor;	/* The minor version of the protocol the
+			   filesystem is speaking. */
+	u8 reserved;
+};
+
+union dlm_query_join_response {
+	__be32 intval;
+	struct dlm_query_join_packet packet;
+};
+
+struct dlm_lock_request
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+struct dlm_reco_data_done
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+
+	/* unused for now */
+	/* eventually we can use this to attempt
+	 * lvb recovery based on each node's info */
+	u8 reco_lvb[DLM_LVB_LEN];
+};
+
+struct dlm_begin_reco
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+
+#define BITS_PER_BYTE 8
+#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE)
+
+struct dlm_query_join_request
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	struct dlm_protocol_version dlm_proto;
+	struct dlm_protocol_version fs_proto;
+	u8 domain[O2NM_MAX_NAME_LEN];
+	u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)];
+};
+
+struct dlm_assert_joined
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_cancel_join
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_query_region {
+	u8 qr_node;
+	u8 qr_numregions;
+	u8 qr_namelen;
+	u8 pad1;
+	u8 qr_domain[O2NM_MAX_NAME_LEN];
+	u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
+struct dlm_node_info {
+	u8 ni_nodenum;
+	u8 pad1;
+	__be16 ni_ipv4_port;
+	__be32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+	u8 qn_nodenum;
+	u8 qn_numnodes;
+	u8 qn_namelen;
+	u8 pad1;
+	u8 qn_domain[O2NM_MAX_NAME_LEN];
+	struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
+struct dlm_exit_domain
+{
+	u8 node_idx;
+	u8 pad1[3];
+};
+
+struct dlm_finalize_reco
+{
+	u8 node_idx;
+	u8 dead_node;
+	u8 flags;
+	u8 pad1;
+	__be32 pad2;
+};
+
+struct dlm_deref_lockres
+{
+	u32 pad1;
+	u16 pad2;
+	u8 node_idx;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+static inline enum dlm_status
+__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
+{
+	enum dlm_status status = DLM_NORMAL;
+
+	assert_spin_locked(&res->spinlock);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING)
+		status = DLM_RECOVERING;
+	else if (res->state & DLM_LOCK_RES_MIGRATING)
+		status = DLM_MIGRATING;
+	else if (res->state & DLM_LOCK_RES_IN_PROGRESS)
+		status = DLM_FORWARD;
+
+	return status;
+}
+
+static inline u8 dlm_get_lock_cookie_node(u64 cookie)
+{
+	u8 ret;
+	cookie >>= 56;
+	ret = (u8)(cookie & 0xffULL);
+	return ret;
+}
+
+static inline unsigned long long dlm_get_lock_cookie_seq(u64 cookie)
+{
+	unsigned long long ret;
+	ret = ((unsigned long long)cookie) & 0x00ffffffffffffffULL;
+	return ret;
+}
+
+struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
+			       struct dlm_lockstatus *lksb);
+void dlm_lock_get(struct dlm_lock *lock);
+void dlm_lock_put(struct dlm_lock *lock);
+
+void dlm_lock_attach_lockres(struct dlm_lock *lock,
+			     struct dlm_lock_resource *res);
+
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data);
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			     void **ret_data);
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
+			  void **ret_data);
+
+void dlm_revert_pending_convert(struct dlm_lock_resource *res,
+				struct dlm_lock *lock);
+void dlm_revert_pending_lock(struct dlm_lock_resource *res,
+			     struct dlm_lock *lock);
+
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data);
+void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock);
+void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock);
+
+int dlm_launch_thread(struct dlm_ctxt *dlm);
+void dlm_complete_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
+void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
+int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
+
+void dlm_put(struct dlm_ctxt *dlm);
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm);
+
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res);
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res);
+static inline void dlm_lockres_get(struct dlm_lock_resource *res)
+{
+	/* This is called on every lookup, so it might be worth
+	 * inlining. */
+	kref_get(&res->refs);
+}
+void dlm_lockres_put(struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
+						     const char *name,
+						     unsigned int len,
+						     unsigned int hash);
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+						const char *name,
+						unsigned int len,
+						unsigned int hash);
+struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
+					      const char *name,
+					      unsigned int len);
+
+int dlm_is_host_down(int errno);
+
+struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
+						 const char *lockid,
+						 int namelen,
+						 int flags);
+struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
+					  const char *name,
+					  unsigned int namelen);
+
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res, int bit);
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res, int bit);
+
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+				   struct dlm_lock_resource *res);
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+				   struct dlm_lock_resource *res);
+
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+		struct dlm_lock_resource *res);
+
+void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void dlm_do_local_ast(struct dlm_ctxt *dlm,
+		      struct dlm_lock_resource *res,
+		      struct dlm_lock *lock);
+int dlm_do_remote_ast(struct dlm_ctxt *dlm,
+		      struct dlm_lock_resource *res,
+		      struct dlm_lock *lock);
+void dlm_do_local_bast(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *res,
+		       struct dlm_lock *lock,
+		       int blocked_type);
+int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm,
+			   struct dlm_lock_resource *res,
+			   struct dlm_lock *lock,
+			   int msg_type,
+			   int blocked_type, int flags);
+static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock,
+				      int blocked_type)
+{
+	return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST,
+				      blocked_type, 0);
+}
+
+static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_lock *lock,
+				     int flags)
+{
+	return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST,
+				      0, flags);
+}
+
+void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
+
+u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
+void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+
+
+int dlm_nm_init(struct dlm_ctxt *dlm);
+int dlm_heartbeat_init(struct dlm_ctxt *dlm);
+void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
+
+int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+int dlm_finish_migration(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res,
+			 u8 old_master);
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res);
+void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
+
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data);
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data);
+void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
+int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data);
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data);
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data);
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data);
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
+				  void **ret_data);
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data);
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+			   void **ret_data);
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data);
+int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			  u8 nodenum, u8 *real_master);
+
+
+int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res,
+			       int ignore_higher,
+			       u8 request_from,
+			       u32 flags);
+
+
+int dlm_send_one_lockres(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to,
+			 u8 flags);
+void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res);
+
+/* will exit holding res->spinlock, but may drop in function */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
+void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
+
+/* will exit holding res->spinlock, but may drop in function */
+static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
+{
+	__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
+				    	  DLM_LOCK_RES_RECOVERING|
+					  DLM_LOCK_RES_MIGRATING));
+}
+
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
+/* create/destroy slab caches */
+int dlm_init_master_caches(void);
+void dlm_destroy_master_caches(void);
+
+int dlm_init_lock_cache(void);
+void dlm_destroy_lock_cache(void);
+
+int dlm_init_mle_cache(void);
+void dlm_destroy_mle_cache(void);
+
+void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
+int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res);
+void dlm_clean_master_list(struct dlm_ctxt *dlm,
+			   u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
+int __dlm_lockres_unused(struct dlm_lock_resource *res);
+
+static inline const char * dlm_lock_mode_name(int mode)
+{
+	switch (mode) {
+		case LKM_EXMODE:
+			return "EX";
+		case LKM_PRMODE:
+			return "PR";
+		case LKM_NLMODE:
+			return "NL";
+	}
+	return "UNKNOWN";
+}
+
+
+static inline int dlm_lock_compatible(int existing, int request)
+{
+	/* NO_LOCK compatible with all */
+	if (request == LKM_NLMODE ||
+	    existing == LKM_NLMODE)
+		return 1;
+
+	/* EX incompatible with all non-NO_LOCK */
+	if (request == LKM_EXMODE)
+		return 0;
+
+	/* request must be PR, which is compatible with PR */
+	if (existing == LKM_PRMODE)
+		return 1;
+
+	return 0;
+}
+
+static inline int dlm_lock_on_list(struct list_head *head,
+				   struct dlm_lock *lock)
+{
+	struct dlm_lock *tmplock;
+
+	list_for_each_entry(tmplock, head, list) {
+		if (tmplock == lock)
+			return 1;
+	}
+	return 0;
+}
+
+
+static inline enum dlm_status dlm_err_to_dlm_status(int err)
+{
+	enum dlm_status ret;
+	if (err == -ENOMEM)
+		ret = DLM_SYSERR;
+	else if (err == -ETIMEDOUT || o2net_link_down(err, NULL))
+		ret = DLM_NOLOCKMGR;
+	else if (err == -EINVAL)
+		ret = DLM_BADPARAM;
+	else if (err == -ENAMETOOLONG)
+		ret = DLM_IVBUFLEN;
+	else
+		ret = DLM_BADARGS;
+	return ret;
+}
+
+
+static inline void dlm_node_iter_init(unsigned long *map,
+				      struct dlm_node_iter *iter)
+{
+	memcpy(iter->node_map, map, sizeof(iter->node_map));
+	iter->curnode = -1;
+}
+
+static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
+{
+	int bit;
+	bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1);
+	if (bit >= O2NM_MAX_NODES) {
+		iter->curnode = O2NM_MAX_NODES;
+		return -ENOENT;
+	}
+	iter->curnode = bit;
+	return bit;
+}
+
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+					 struct dlm_lock_resource *res,
+					 u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	res->owner = owner;
+}
+
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+					    struct dlm_lock_resource *res,
+					    u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner != res->owner)
+		dlm_set_lockres_owner(dlm, res, owner);
+}
+
+#endif /* DLMCOMMON_H */
diff --git a/kernel/fs/ocfs2/dlm/dlmconvert.c b/kernel/fs/ocfs2/dlm/dlmconvert.c
new file mode 100644
index 000000000..e36d63ff1
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmconvert.c
@@ -0,0 +1,544 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmconvert.c
+ *
+ * underlying calls for lock conversion
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmconvert.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+/* NOTE: __dlmconvert_master is the only function in here that
+ * needs a spinlock held on entry (res->spinlock) and it is the
+ * only one that holds a lock on exit (res->spinlock).
+ * All other functions in here need no locks and drop all of
+ * the locks that they acquire. */
+static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags,
+					   int type, int *call_ast,
+					   int *kick_thread);
+static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags, int type);
+
+/*
+ * this is only called directly by dlmlock(), and only when the
+ * local node is the owner of the lockres
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: see __dlmconvert_master
+ */
+enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type)
+{
+	int call_ast = 0, kick_thread = 0;
+	enum dlm_status status;
+
+	spin_lock(&res->spinlock);
+	/* we are not in a network handler, this is fine */
+	__dlm_wait_on_lockres(res);
+	__dlm_lockres_reserve_ast(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	status = __dlmconvert_master(dlm, res, lock, flags, type,
+				     &call_ast, &kick_thread);
+
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+	if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
+		dlm_error(status);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	return status;
+}
+
+/* performs lock conversion at the lockres master site
+ * locking:
+ *   caller needs:  res->spinlock
+ *   taken:         takes and drops lock->spinlock
+ *   held on exit:  res->spinlock
+ * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
+ *   call_ast: whether ast should be called for this lock
+ *   kick_thread: whether dlm_kick_thread should be called
+ */
+static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags,
+					   int type, int *call_ast,
+					   int *kick_thread)
+{
+	enum dlm_status status = DLM_NORMAL;
+	struct dlm_lock *tmplock=NULL;
+
+	assert_spin_locked(&res->spinlock);
+
+	mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
+	     lock->ml.type, lock->ml.convert_type, type);
+
+	spin_lock(&lock->spinlock);
+
+	/* already converting? */
+	if (lock->ml.convert_type != LKM_IVMODE) {
+		mlog(ML_ERROR, "attempted to convert a lock with a lock "
+		     "conversion pending\n");
+		status = DLM_DENIED;
+		goto unlock_exit;
+	}
+
+	/* must be on grant queue to convert */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		mlog(ML_ERROR, "attempted to convert a lock not on grant "
+		     "queue\n");
+		status = DLM_DENIED;
+		goto unlock_exit;
+	}
+
+	if (flags & LKM_VALBLK) {
+		switch (lock->ml.type) {
+			case LKM_EXMODE:
+				/* EX + LKM_VALBLK + convert == set lvb */
+				mlog(0, "will set lvb: converting %s->%s\n",
+				     dlm_lock_mode_name(lock->ml.type),
+				     dlm_lock_mode_name(type));
+				lock->lksb->flags |= DLM_LKSB_PUT_LVB;
+				break;
+			case LKM_PRMODE:
+			case LKM_NLMODE:
+				/* refetch if new level is not NL */
+				if (type > LKM_NLMODE) {
+					mlog(0, "will fetch new value into "
+					     "lvb: converting %s->%s\n",
+					     dlm_lock_mode_name(lock->ml.type),
+					     dlm_lock_mode_name(type));
+					lock->lksb->flags |= DLM_LKSB_GET_LVB;
+				} else {
+					mlog(0, "will NOT fetch new value "
+					     "into lvb: converting %s->%s\n",
+					     dlm_lock_mode_name(lock->ml.type),
+					     dlm_lock_mode_name(type));
+					flags &= ~(LKM_VALBLK);
+				}
+				break;
+		}
+	}
+
+
+	/* in-place downconvert? */
+	if (type <= lock->ml.type)
+		goto grant;
+
+	/* upconvert from here on */
+	status = DLM_NORMAL;
+	list_for_each_entry(tmplock, &res->granted, list) {
+		if (tmplock == lock)
+			continue;
+		if (!dlm_lock_compatible(tmplock->ml.type, type))
+			goto switch_queues;
+	}
+
+	list_for_each_entry(tmplock, &res->converting, list) {
+		if (!dlm_lock_compatible(tmplock->ml.type, type))
+			goto switch_queues;
+		/* existing conversion requests take precedence */
+		if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
+			goto switch_queues;
+	}
+
+	/* fall thru to grant */
+
+grant:
+	mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
+	     res->lockname.name, dlm_lock_mode_name(type));
+	/* immediately grant the new lock type */
+	lock->lksb->status = DLM_NORMAL;
+	if (lock->ml.node == dlm->node_num)
+		mlog(0, "doing in-place convert for nonlocal lock\n");
+	lock->ml.type = type;
+	if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
+		memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+
+	status = DLM_NORMAL;
+	*call_ast = 1;
+	goto unlock_exit;
+
+switch_queues:
+	if (flags & LKM_NOQUEUE) {
+		mlog(0, "failed to convert NOQUEUE lock %.*s from "
+		     "%d to %d...\n", res->lockname.len, res->lockname.name,
+		     lock->ml.type, type);
+		status = DLM_NOTQUEUED;
+		goto unlock_exit;
+	}
+	mlog(0, "res %.*s, queueing...\n", res->lockname.len,
+	     res->lockname.name);
+
+	lock->ml.convert_type = type;
+	/* do not alter lock refcount.  switching lists. */
+	list_move_tail(&lock->list, &res->converting);
+
+unlock_exit:
+	spin_unlock(&lock->spinlock);
+	if (status == DLM_DENIED) {
+		__dlm_print_one_lock_resource(res);
+	}
+	if (status == DLM_NORMAL)
+		*kick_thread = 1;
+	return status;
+}
+
+void dlm_revert_pending_convert(struct dlm_lock_resource *res,
+				struct dlm_lock *lock)
+{
+	/* do not alter lock refcount.  switching lists. */
+	list_move_tail(&lock->list, &res->granted);
+	lock->ml.convert_type = LKM_IVMODE;
+	lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
+}
+
+/* messages the master site to do lock conversion
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node
+ */
+enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type)
+{
+	enum dlm_status status;
+
+	mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
+	     lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		mlog(0, "bailing out early since res is RECOVERING "
+		     "on secondary queue\n");
+		/* __dlm_print_one_lock_resource(res); */
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+
+	if (lock->ml.convert_type != LKM_IVMODE) {
+		__dlm_print_one_lock_resource(res);
+		mlog(ML_ERROR, "converting a remote lock that is already "
+		     "converting! (cookie=%u:%llu, conv=%d)\n",
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+		     lock->ml.convert_type);
+		status = DLM_DENIED;
+		goto bail;
+	}
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	/* move lock to local convert queue */
+	/* do not alter lock refcount.  switching lists. */
+	list_move_tail(&lock->list, &res->converting);
+	lock->convert_pending = 1;
+	lock->ml.convert_type = type;
+
+	if (flags & LKM_VALBLK) {
+		if (lock->ml.type == LKM_EXMODE) {
+			flags |= LKM_PUT_LVB;
+			lock->lksb->flags |= DLM_LKSB_PUT_LVB;
+		} else {
+			if (lock->ml.convert_type == LKM_NLMODE)
+				flags &= ~LKM_VALBLK;
+			else {
+				flags |= LKM_GET_LVB;
+				lock->lksb->flags |= DLM_LKSB_GET_LVB;
+			}
+		}
+	}
+	spin_unlock(&res->spinlock);
+
+	/* no locks held here.
+	 * need to wait for a reply as to whether it got queued or not. */
+	status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	lock->convert_pending = 0;
+	/* if it failed, move it back to granted queue */
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		dlm_revert_pending_convert(res, lock);
+	}
+bail:
+	spin_unlock(&res->spinlock);
+
+	/* TODO: should this be a wake_one? */
+	/* wake up any IN_PROGRESS waiters */
+	wake_up(&res->wq);
+
+	return status;
+}
+
+/* sends DLM_CONVERT_LOCK_MSG to master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NOLOCKMGR, status from remote node
+ */
+static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags, int type)
+{
+	struct dlm_convert_lock convert;
+	int tmpret;
+	enum dlm_status ret;
+	int status = 0;
+	struct kvec vec[2];
+	size_t veclen = 1;
+
+	mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
+
+	memset(&convert, 0, sizeof(struct dlm_convert_lock));
+	convert.node_idx = dlm->node_num;
+	convert.requested_type = type;
+	convert.cookie = lock->ml.cookie;
+	convert.namelen = res->lockname.len;
+	convert.flags = cpu_to_be32(flags);
+	memcpy(convert.name, res->lockname.name, convert.namelen);
+
+	vec[0].iov_len = sizeof(struct dlm_convert_lock);
+	vec[0].iov_base = &convert;
+
+	if (flags & LKM_PUT_LVB) {
+		/* extra data to send if we are updating lvb */
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
+					vec, veclen, res->owner, &status);
+	if (tmpret >= 0) {
+		// successfully sent and received
+		ret = status;  // this is already a dlm_status
+		if (ret == DLM_RECOVERING) {
+			mlog(0, "node %u returned DLM_RECOVERING from convert "
+			     "message!\n", res->owner);
+		} else if (ret == DLM_MIGRATING) {
+			mlog(0, "node %u returned DLM_MIGRATING from convert "
+			     "message!\n", res->owner);
+		} else if (ret == DLM_FORWARD) {
+			mlog(0, "node %u returned DLM_FORWARD from convert "
+			     "message!\n", res->owner);
+		} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
+			dlm_error(ret);
+	} else {
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+		     res->owner);
+		if (dlm_is_host_down(tmpret)) {
+			/* instead of logging the same network error over
+			 * and over, sleep here and wait for the heartbeat
+			 * to notice the node is dead.  times out after 5s. */
+			dlm_wait_for_node_death(dlm, res->owner,
+						DLM_NODE_DEATH_WAIT_MAX);
+			ret = DLM_RECOVERING;
+			mlog(0, "node %u died so returning DLM_RECOVERING "
+			     "from convert message!\n", res->owner);
+		} else {
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+	}
+
+	return ret;
+}
+
+/* handler for DLM_CONVERT_LOCK_MSG on master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drop res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
+ *          status from __dlmconvert_master
+ */
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			     void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	struct dlm_lock *tmp_lock;
+	struct dlm_lockstatus *lksb;
+	enum dlm_status status = DLM_NORMAL;
+	u32 flags;
+	int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0;
+
+	if (!dlm_grab(dlm)) {
+		dlm_error(DLM_REJECTED);
+		return DLM_REJECTED;
+	}
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
+		status = DLM_IVBUFLEN;
+		dlm_error(status);
+		goto leave;
+	}
+
+	flags = be32_to_cpu(cnv->flags);
+
+	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
+	     (LKM_PUT_LVB|LKM_GET_LVB)) {
+		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		status = DLM_BADARGS;
+		goto leave;
+	}
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
+	     (flags & LKM_GET_LVB ? "get lvb" : "none"));
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
+	if (!res) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	if (status != DLM_NORMAL) {
+		spin_unlock(&res->spinlock);
+		dlm_error(status);
+		goto leave;
+	}
+	list_for_each_entry(tmp_lock, &res->granted, list) {
+		if (tmp_lock->ml.cookie == cnv->cookie &&
+		    tmp_lock->ml.node == cnv->node_idx) {
+			lock = tmp_lock;
+			dlm_lock_get(lock);
+			break;
+		}
+	}
+	spin_unlock(&res->spinlock);
+	if (!lock) {
+		status = DLM_IVLOCKID;
+		mlog(ML_ERROR, "did not find lock to convert on grant queue! "
+			       "cookie=%u:%llu\n",
+		     dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie)));
+		dlm_print_one_lock_resource(res);
+		goto leave;
+	}
+
+	/* found the lock */
+	lksb = lock->lksb;
+
+	/* see if caller needed to get/put lvb */
+	if (flags & LKM_PUT_LVB) {
+		BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+		lksb->flags |= DLM_LKSB_PUT_LVB;
+		memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
+	} else if (flags & LKM_GET_LVB) {
+		BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+		lksb->flags |= DLM_LKSB_GET_LVB;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	if (status == DLM_NORMAL) {
+		__dlm_lockres_reserve_ast(res);
+		ast_reserved = 1;
+		res->state |= DLM_LOCK_RES_IN_PROGRESS;
+		status = __dlmconvert_master(dlm, res, lock, flags,
+					     cnv->requested_type,
+					     &call_ast, &kick_thread);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		wake = 1;
+	}
+	spin_unlock(&res->spinlock);
+	if (wake)
+		wake_up(&res->wq);
+
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
+	}
+
+leave:
+	if (lock)
+		dlm_lock_put(lock);
+
+	/* either queue the ast or release it, if reserved */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else if (ast_reserved)
+		dlm_lockres_release_ast(dlm, res);
+
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
diff --git a/kernel/fs/ocfs2/dlm/dlmconvert.h b/kernel/fs/ocfs2/dlm/dlmconvert.h
new file mode 100644
index 000000000..b2e3677df
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmconvert.h
@@ -0,0 +1,35 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmconvert.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMCONVERT_H
+#define DLMCONVERT_H
+
+enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type);
+enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type);
+
+#endif
diff --git a/kernel/fs/ocfs2/dlm/dlmdebug.c b/kernel/fs/ocfs2/dlm/dlmdebug.c
new file mode 100644
index 000000000..825136070
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmdebug.c
@@ -0,0 +1,1000 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.c
+ *
+ * debug functionality for the dlm
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+#include "dlmdebug.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static int stringify_lockname(const char *lockname, int locklen, char *buf,
+			      int len);
+
+void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+{
+	spin_lock(&res->spinlock);
+	__dlm_print_one_lock_resource(res);
+	spin_unlock(&res->spinlock);
+}
+
+static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
+{
+	int bit;
+	assert_spin_locked(&res->spinlock);
+
+	printk("  refmap nodes: [ ");
+	bit = 0;
+	while (1) {
+		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
+		if (bit >= O2NM_MAX_NODES)
+			break;
+		printk("%u ", bit);
+		bit++;
+	}
+	printk("], inflight=%u\n", res->inflight_locks);
+}
+
+static void __dlm_print_lock(struct dlm_lock *lock)
+{
+	spin_lock(&lock->spinlock);
+
+	printk("    type=%d, conv=%d, node=%u, cookie=%u:%llu, "
+	       "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
+	       "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
+	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
+	       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+	       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+	       atomic_read(&lock->lock_refs.refcount),
+	       (list_empty(&lock->ast_list) ? 'y' : 'n'),
+	       (lock->ast_pending ? 'y' : 'n'),
+	       (list_empty(&lock->bast_list) ? 'y' : 'n'),
+	       (lock->bast_pending ? 'y' : 'n'),
+	       (lock->convert_pending ? 'y' : 'n'),
+	       (lock->lock_pending ? 'y' : 'n'),
+	       (lock->cancel_pending ? 'y' : 'n'),
+	       (lock->unlock_pending ? 'y' : 'n'));
+
+	spin_unlock(&lock->spinlock);
+}
+
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+{
+	struct dlm_lock *lock;
+	char buf[DLM_LOCKID_NAME_MAX];
+
+	assert_spin_locked(&res->spinlock);
+
+	stringify_lockname(res->lockname.name, res->lockname.len,
+			   buf, sizeof(buf));
+	printk("lockres: %s, owner=%u, state=%u\n",
+	       buf, res->owner, res->state);
+	printk("  last used: %lu, refcnt: %u, on purge list: %s\n",
+	       res->last_used, atomic_read(&res->refs.refcount),
+	       list_empty(&res->purge) ? "no" : "yes");
+	printk("  on dirty list: %s, on reco list: %s, "
+	       "migrating pending: %s\n",
+	       list_empty(&res->dirty) ? "no" : "yes",
+	       list_empty(&res->recovering) ? "no" : "yes",
+	       res->migration_pending ? "yes" : "no");
+	printk("  inflight locks: %d, asts reserved: %d\n",
+	       res->inflight_locks, atomic_read(&res->asts_reserved));
+	dlm_print_lockres_refmap(res);
+	printk("  granted queue:\n");
+	list_for_each_entry(lock, &res->granted, list) {
+		__dlm_print_lock(lock);
+	}
+	printk("  converting queue:\n");
+	list_for_each_entry(lock, &res->converting, list) {
+		__dlm_print_lock(lock);
+	}
+	printk("  blocked queue:\n");
+	list_for_each_entry(lock, &res->blocked, list) {
+		__dlm_print_lock(lock);
+	}
+}
+
+void dlm_print_one_lock(struct dlm_lock *lockid)
+{
+	dlm_print_one_lock_resource(lockid->lockres);
+}
+EXPORT_SYMBOL_GPL(dlm_print_one_lock);
+
+static const char *dlm_errnames[] = {
+	[DLM_NORMAL] =			"DLM_NORMAL",
+	[DLM_GRANTED] =			"DLM_GRANTED",
+	[DLM_DENIED] =			"DLM_DENIED",
+	[DLM_DENIED_NOLOCKS] =		"DLM_DENIED_NOLOCKS",
+	[DLM_WORKING] =			"DLM_WORKING",
+	[DLM_BLOCKED] =			"DLM_BLOCKED",
+	[DLM_BLOCKED_ORPHAN] =		"DLM_BLOCKED_ORPHAN",
+	[DLM_DENIED_GRACE_PERIOD] =	"DLM_DENIED_GRACE_PERIOD",
+	[DLM_SYSERR] =			"DLM_SYSERR",
+	[DLM_NOSUPPORT] =		"DLM_NOSUPPORT",
+	[DLM_CANCELGRANT] =		"DLM_CANCELGRANT",
+	[DLM_IVLOCKID] =		"DLM_IVLOCKID",
+	[DLM_SYNC] =			"DLM_SYNC",
+	[DLM_BADTYPE] =			"DLM_BADTYPE",
+	[DLM_BADRESOURCE] =		"DLM_BADRESOURCE",
+	[DLM_MAXHANDLES] =		"DLM_MAXHANDLES",
+	[DLM_NOCLINFO] =		"DLM_NOCLINFO",
+	[DLM_NOLOCKMGR] =		"DLM_NOLOCKMGR",
+	[DLM_NOPURGED] =		"DLM_NOPURGED",
+	[DLM_BADARGS] =			"DLM_BADARGS",
+	[DLM_VOID] =			"DLM_VOID",
+	[DLM_NOTQUEUED] =		"DLM_NOTQUEUED",
+	[DLM_IVBUFLEN] =		"DLM_IVBUFLEN",
+	[DLM_CVTUNGRANT] =		"DLM_CVTUNGRANT",
+	[DLM_BADPARAM] =		"DLM_BADPARAM",
+	[DLM_VALNOTVALID] =		"DLM_VALNOTVALID",
+	[DLM_REJECTED] =		"DLM_REJECTED",
+	[DLM_ABORT] =			"DLM_ABORT",
+	[DLM_CANCEL] =			"DLM_CANCEL",
+	[DLM_IVRESHANDLE] =		"DLM_IVRESHANDLE",
+	[DLM_DEADLOCK] =		"DLM_DEADLOCK",
+	[DLM_DENIED_NOASTS] =		"DLM_DENIED_NOASTS",
+	[DLM_FORWARD] =			"DLM_FORWARD",
+	[DLM_TIMEOUT] =			"DLM_TIMEOUT",
+	[DLM_IVGROUPID] =		"DLM_IVGROUPID",
+	[DLM_VERS_CONFLICT] =		"DLM_VERS_CONFLICT",
+	[DLM_BAD_DEVICE_PATH] =		"DLM_BAD_DEVICE_PATH",
+	[DLM_NO_DEVICE_PERMISSION] =	"DLM_NO_DEVICE_PERMISSION",
+	[DLM_NO_CONTROL_DEVICE ] =	"DLM_NO_CONTROL_DEVICE ",
+	[DLM_RECOVERING] =		"DLM_RECOVERING",
+	[DLM_MIGRATING] =		"DLM_MIGRATING",
+	[DLM_MAXSTATS] =		"DLM_MAXSTATS",
+};
+
+static const char *dlm_errmsgs[] = {
+	[DLM_NORMAL] = 			"request in progress",
+	[DLM_GRANTED] = 		"request granted",
+	[DLM_DENIED] = 			"request denied",
+	[DLM_DENIED_NOLOCKS] = 		"request denied, out of system resources",
+	[DLM_WORKING] = 		"async request in progress",
+	[DLM_BLOCKED] = 		"lock request blocked",
+	[DLM_BLOCKED_ORPHAN] = 		"lock request blocked by a orphan lock",
+	[DLM_DENIED_GRACE_PERIOD] = 	"topological change in progress",
+	[DLM_SYSERR] = 			"system error",
+	[DLM_NOSUPPORT] = 		"unsupported",
+	[DLM_CANCELGRANT] = 		"can't cancel convert: already granted",
+	[DLM_IVLOCKID] = 		"bad lockid",
+	[DLM_SYNC] = 			"synchronous request granted",
+	[DLM_BADTYPE] = 		"bad resource type",
+	[DLM_BADRESOURCE] = 		"bad resource handle",
+	[DLM_MAXHANDLES] = 		"no more resource handles",
+	[DLM_NOCLINFO] = 		"can't contact cluster manager",
+	[DLM_NOLOCKMGR] = 		"can't contact lock manager",
+	[DLM_NOPURGED] = 		"can't contact purge daemon",
+	[DLM_BADARGS] = 		"bad api args",
+	[DLM_VOID] = 			"no status",
+	[DLM_NOTQUEUED] = 		"NOQUEUE was specified and request failed",
+	[DLM_IVBUFLEN] = 		"invalid resource name length",
+	[DLM_CVTUNGRANT] = 		"attempted to convert ungranted lock",
+	[DLM_BADPARAM] = 		"invalid lock mode specified",
+	[DLM_VALNOTVALID] = 		"value block has been invalidated",
+	[DLM_REJECTED] = 		"request rejected, unrecognized client",
+	[DLM_ABORT] = 			"blocked lock request cancelled",
+	[DLM_CANCEL] = 			"conversion request cancelled",
+	[DLM_IVRESHANDLE] = 		"invalid resource handle",
+	[DLM_DEADLOCK] = 		"deadlock recovery refused this request",
+	[DLM_DENIED_NOASTS] = 		"failed to allocate AST",
+	[DLM_FORWARD] = 		"request must wait for primary's response",
+	[DLM_TIMEOUT] = 		"timeout value for lock has expired",
+	[DLM_IVGROUPID] = 		"invalid group specification",
+	[DLM_VERS_CONFLICT] = 		"version conflicts prevent request handling",
+	[DLM_BAD_DEVICE_PATH] = 	"Locks device does not exist or path wrong",
+	[DLM_NO_DEVICE_PERMISSION] = 	"Client has insufficient perms for device",
+	[DLM_NO_CONTROL_DEVICE] = 	"Cannot set options on opened device ",
+	[DLM_RECOVERING] = 		"lock resource being recovered",
+	[DLM_MIGRATING] = 		"lock resource being migrated",
+	[DLM_MAXSTATS] = 		"invalid error number",
+};
+
+const char *dlm_errmsg(enum dlm_status err)
+{
+	if (err >= DLM_MAXSTATS || err < 0)
+		return dlm_errmsgs[DLM_MAXSTATS];
+	return dlm_errmsgs[err];
+}
+EXPORT_SYMBOL_GPL(dlm_errmsg);
+
+const char *dlm_errname(enum dlm_status err)
+{
+	if (err >= DLM_MAXSTATS || err < 0)
+		return dlm_errnames[DLM_MAXSTATS];
+	return dlm_errnames[err];
+}
+EXPORT_SYMBOL_GPL(dlm_errname);
+
+/* NOTE: This function converts a lockname into a string. It uses knowledge
+ * of the format of the lockname that should be outside the purview of the dlm.
+ * We are adding only to make dlm debugging slightly easier.
+ *
+ * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
+ */
+static int stringify_lockname(const char *lockname, int locklen, char *buf,
+			      int len)
+{
+	int out = 0;
+	__be64 inode_blkno_be;
+
+#define OCFS2_DENTRY_LOCK_INO_START	18
+	if (*lockname == 'N') {
+		memcpy((__be64 *)&inode_blkno_be,
+		       (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START],
+		       sizeof(__be64));
+		out += snprintf(buf + out, len - out, "%.*s%08x",
+				OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
+				(unsigned int)be64_to_cpu(inode_blkno_be));
+	} else
+		out += snprintf(buf + out, len - out, "%.*s",
+				locklen, lockname);
+	return out;
+}
+
+static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
+			     char *buf, int len)
+{
+	int out = 0;
+	int i = -1;
+
+	while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
+		out += snprintf(buf + out, len - out, "%d ", i);
+
+	return out;
+}
+
+static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
+{
+	int out = 0;
+	char *mle_type;
+
+	if (mle->type == DLM_MLE_BLOCK)
+		mle_type = "BLK";
+	else if (mle->type == DLM_MLE_MASTER)
+		mle_type = "MAS";
+	else
+		mle_type = "MIG";
+
+	out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
+	out += snprintf(buf + out, len - out,
+			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
+			mle_type, mle->master, mle->new_master,
+			!list_empty(&mle->hb_events),
+			!!mle->inuse,
+			atomic_read(&mle->mle_refs.refcount));
+
+	out += snprintf(buf + out, len - out, "Maybe=");
+	out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Vote=");
+	out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Response=");
+	out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Node=");
+	out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "\n");
+
+	return out;
+}
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+	char *buf;
+
+	buf = (char *) get_zeroed_page(GFP_NOFS);
+	if (buf) {
+		dump_mle(mle, buf, PAGE_SIZE - 1);
+		free_page((unsigned long)buf);
+	}
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static struct dentry *dlm_debugfs_root;
+
+#define DLM_DEBUGFS_DIR				"o2dlm"
+#define DLM_DEBUGFS_DLM_STATE			"dlm_state"
+#define DLM_DEBUGFS_LOCKING_STATE		"locking_state"
+#define DLM_DEBUGFS_MLE_STATE			"mle_state"
+#define DLM_DEBUGFS_PURGE_LIST			"purge_list"
+
+/* begin - utils funcs */
+static void dlm_debug_free(struct kref *kref)
+{
+	struct dlm_debug_ctxt *dc;
+
+	dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
+
+	kfree(dc);
+}
+
+static void dlm_debug_put(struct dlm_debug_ctxt *dc)
+{
+	if (dc)
+		kref_put(&dc->debug_refcnt, dlm_debug_free);
+}
+
+static void dlm_debug_get(struct dlm_debug_ctxt *dc)
+{
+	kref_get(&dc->debug_refcnt);
+}
+
+static int debug_release(struct inode *inode, struct file *file)
+{
+	free_page((unsigned long)file->private_data);
+	return 0;
+}
+
+static ssize_t debug_read(struct file *file, char __user *buf,
+			  size_t nbytes, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+				       i_size_read(file->f_mapping->host));
+}
+/* end - util funcs */
+
+/* begin - purge list funcs */
+static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
+{
+	struct dlm_lock_resource *res;
+	int out = 0;
+	unsigned long total = 0;
+
+	out += snprintf(buf + out, len - out,
+			"Dumping Purgelist for Domain: %s\n", dlm->name);
+
+	spin_lock(&dlm->spinlock);
+	list_for_each_entry(res, &dlm->purge_list, purge) {
+		++total;
+		if (len - out < 100)
+			continue;
+		spin_lock(&res->spinlock);
+		out += stringify_lockname(res->lockname.name,
+					  res->lockname.len,
+					  buf + out, len - out);
+		out += snprintf(buf + out, len - out, "\t%ld\n",
+				(jiffies - res->last_used)/HZ);
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	out += snprintf(buf + out, len - out, "Total on list: %lu\n", total);
+
+	return out;
+}
+
+static int debug_purgelist_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	char *buf = NULL;
+
+	buf = (char *) get_zeroed_page(GFP_NOFS);
+	if (!buf)
+		goto bail;
+
+	i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static const struct file_operations debug_purgelist_fops = {
+	.open =		debug_purgelist_open,
+	.release =	debug_release,
+	.read =		debug_read,
+	.llseek =	generic_file_llseek,
+};
+/* end - purge list funcs */
+
+/* begin - debug mle funcs */
+static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
+{
+	struct dlm_master_list_entry *mle;
+	struct hlist_head *bucket;
+	int i, out = 0;
+	unsigned long total = 0, longest = 0, bucket_count = 0;
+
+	out += snprintf(buf + out, len - out,
+			"Dumping MLEs for Domain: %s\n", dlm->name);
+
+	spin_lock(&dlm->master_lock);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each_entry(mle, bucket, master_hash_node) {
+			++total;
+			++bucket_count;
+			if (len - out < 200)
+				continue;
+			out += dump_mle(mle, buf + out, len - out);
+		}
+		longest = max(longest, bucket_count);
+		bucket_count = 0;
+	}
+	spin_unlock(&dlm->master_lock);
+
+	out += snprintf(buf + out, len - out,
+			"Total: %lu, Longest: %lu\n", total, longest);
+	return out;
+}
+
+static int debug_mle_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	char *buf = NULL;
+
+	buf = (char *) get_zeroed_page(GFP_NOFS);
+	if (!buf)
+		goto bail;
+
+	i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static const struct file_operations debug_mle_fops = {
+	.open =		debug_mle_open,
+	.release =	debug_release,
+	.read =		debug_read,
+	.llseek =	generic_file_llseek,
+};
+
+/* end - debug mle funcs */
+
+/* begin - debug lockres funcs */
+static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
+{
+	int out;
+
+#define DEBUG_LOCK_VERSION	1
+	spin_lock(&lock->spinlock);
+	out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+		       "%d,%d,%d,%d\n",
+		       DEBUG_LOCK_VERSION,
+		       list_type, lock->ml.type, lock->ml.convert_type,
+		       lock->ml.node,
+		       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+		       !list_empty(&lock->ast_list),
+		       !list_empty(&lock->bast_list),
+		       lock->ast_pending, lock->bast_pending,
+		       lock->convert_pending, lock->lock_pending,
+		       lock->cancel_pending, lock->unlock_pending,
+		       atomic_read(&lock->lock_refs.refcount));
+	spin_unlock(&lock->spinlock);
+
+	return out;
+}
+
+static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
+{
+	struct dlm_lock *lock;
+	int i;
+	int out = 0;
+
+	out += snprintf(buf + out, len - out, "NAME:");
+	out += stringify_lockname(res->lockname.name, res->lockname.len,
+				  buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+#define DEBUG_LRES_VERSION	1
+	out += snprintf(buf + out, len - out,
+			"LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
+			DEBUG_LRES_VERSION,
+			res->owner, res->state, res->last_used,
+			!list_empty(&res->purge),
+			!list_empty(&res->dirty),
+			!list_empty(&res->recovering),
+			res->inflight_locks, res->migration_pending,
+			atomic_read(&res->asts_reserved),
+			atomic_read(&res->refs.refcount));
+
+	/* refmap */
+	out += snprintf(buf + out, len - out, "RMAP:");
+	out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	/* lvb */
+	out += snprintf(buf + out, len - out, "LVBX:");
+	for (i = 0; i < DLM_LVB_LEN; i++)
+		out += snprintf(buf + out, len - out,
+					"%02x", (unsigned char)res->lvb[i]);
+	out += snprintf(buf + out, len - out, "\n");
+
+	/* granted */
+	list_for_each_entry(lock, &res->granted, list)
+		out += dump_lock(lock, 0, buf + out, len - out);
+
+	/* converting */
+	list_for_each_entry(lock, &res->converting, list)
+		out += dump_lock(lock, 1, buf + out, len - out);
+
+	/* blocked */
+	list_for_each_entry(lock, &res->blocked, list)
+		out += dump_lock(lock, 2, buf + out, len - out);
+
+	out += snprintf(buf + out, len - out, "\n");
+
+	return out;
+}
+
+static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct debug_lockres *dl = m->private;
+	struct dlm_ctxt *dlm = dl->dl_ctxt;
+	struct dlm_lock_resource *oldres = dl->dl_res;
+	struct dlm_lock_resource *res = NULL;
+	struct list_head *track_list;
+
+	spin_lock(&dlm->track_lock);
+	if (oldres)
+		track_list = &oldres->tracking;
+	else {
+		track_list = &dlm->tracking_list;
+		if (list_empty(track_list)) {
+			dl = NULL;
+			spin_unlock(&dlm->track_lock);
+			goto bail;
+		}
+	}
+
+	list_for_each_entry(res, track_list, tracking) {
+		if (&res->tracking == &dlm->tracking_list)
+			res = NULL;
+		else
+			dlm_lockres_get(res);
+		break;
+	}
+	spin_unlock(&dlm->track_lock);
+
+	if (oldres)
+		dlm_lockres_put(oldres);
+
+	dl->dl_res = res;
+
+	if (res) {
+		spin_lock(&res->spinlock);
+		dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+		spin_unlock(&res->spinlock);
+	} else
+		dl = NULL;
+
+bail:
+	/* passed to seq_show */
+	return dl;
+}
+
+static void lockres_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return NULL;
+}
+
+static int lockres_seq_show(struct seq_file *s, void *v)
+{
+	struct debug_lockres *dl = (struct debug_lockres *)v;
+
+	seq_printf(s, "%s", dl->dl_buf);
+
+	return 0;
+}
+
+static const struct seq_operations debug_lockres_ops = {
+	.start =	lockres_seq_start,
+	.stop =		lockres_seq_stop,
+	.next =		lockres_seq_next,
+	.show =		lockres_seq_show,
+};
+
+static int debug_lockres_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	struct debug_lockres *dl;
+	void *buf;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto bail;
+
+	dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl));
+	if (!dl)
+		goto bailfree;
+
+	dl->dl_len = PAGE_SIZE;
+	dl->dl_buf = buf;
+
+	dlm_grab(dlm);
+	dl->dl_ctxt = dlm;
+
+	return 0;
+
+bailfree:
+	kfree(buf);
+bail:
+	mlog_errno(-ENOMEM);
+	return -ENOMEM;
+}
+
+static int debug_lockres_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct debug_lockres *dl = (struct debug_lockres *)seq->private;
+
+	if (dl->dl_res)
+		dlm_lockres_put(dl->dl_res);
+	dlm_put(dl->dl_ctxt);
+	kfree(dl->dl_buf);
+	return seq_release_private(inode, file);
+}
+
+static const struct file_operations debug_lockres_fops = {
+	.open =		debug_lockres_open,
+	.release =	debug_lockres_release,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+};
+/* end - debug lockres funcs */
+
+/* begin - debug state funcs */
+static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
+{
+	int out = 0;
+	struct dlm_reco_node_data *node;
+	char *state;
+	int cur_mles = 0, tot_mles = 0;
+	int i;
+
+	spin_lock(&dlm->spinlock);
+
+	switch (dlm->dlm_state) {
+	case DLM_CTXT_NEW:
+		state = "NEW"; break;
+	case DLM_CTXT_JOINED:
+		state = "JOINED"; break;
+	case DLM_CTXT_IN_SHUTDOWN:
+		state = "SHUTDOWN"; break;
+	case DLM_CTXT_LEAVING:
+		state = "LEAVING"; break;
+	default:
+		state = "UNKNOWN"; break;
+	}
+
+	/* Domain: xxxxxxxxxx  Key: 0xdfbac769 */
+	out += snprintf(buf + out, len - out,
+			"Domain: %s  Key: 0x%08x  Protocol: %d.%d\n",
+			dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
+			dlm->dlm_locking_proto.pv_minor);
+
+	/* Thread Pid: xxx  Node: xxx  State: xxxxx */
+	out += snprintf(buf + out, len - out,
+			"Thread Pid: %d  Node: %d  State: %s\n",
+			task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
+
+	/* Number of Joins: xxx  Joining Node: xxx */
+	out += snprintf(buf + out, len - out,
+			"Number of Joins: %d  Joining Node: %d\n",
+			dlm->num_joins, dlm->joining_node);
+
+	/* Domain Map: xx xx xx */
+	out += snprintf(buf + out, len - out, "Domain Map: ");
+	out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	/* Exit Domain Map: xx xx xx */
+	out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+	out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	/* Live Map: xx xx xx */
+	out += snprintf(buf + out, len - out, "Live Map: ");
+	out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	/* Lock Resources: xxx (xxx) */
+	out += snprintf(buf + out, len - out,
+			"Lock Resources: %d (%d)\n",
+			atomic_read(&dlm->res_cur_count),
+			atomic_read(&dlm->res_tot_count));
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+	/* MLEs: xxx (xxx) */
+	out += snprintf(buf + out, len - out,
+			"MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+	/*  Blocking: xxx (xxx) */
+	out += snprintf(buf + out, len - out,
+			"  Blocking: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+	/*  Mastery: xxx (xxx) */
+	out += snprintf(buf + out, len - out,
+			"  Mastery: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+	/*  Migration: xxx (xxx) */
+	out += snprintf(buf + out, len - out,
+			"  Migration: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
+
+	/* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
+	out += snprintf(buf + out, len - out,
+			"Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
+			"PendingBASTs=%s\n",
+			(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
+			(list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
+			(list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
+			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
+
+	/* Purge Count: xxx  Refs: xxx */
+	out += snprintf(buf + out, len - out,
+			"Purge Count: %d  Refs: %d\n", dlm->purge_count,
+			atomic_read(&dlm->dlm_refs.refcount));
+
+	/* Dead Node: xxx */
+	out += snprintf(buf + out, len - out,
+			"Dead Node: %d\n", dlm->reco.dead_node);
+
+	/* What about DLM_RECO_STATE_FINALIZE? */
+	if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
+		state = "ACTIVE";
+	else
+		state = "INACTIVE";
+
+	/* Recovery Pid: xxxx  Master: xxx  State: xxxx */
+	out += snprintf(buf + out, len - out,
+			"Recovery Pid: %d  Master: %d  State: %s\n",
+			task_pid_nr(dlm->dlm_reco_thread_task),
+			dlm->reco.new_master, state);
+
+	/* Recovery Map: xx xx */
+	out += snprintf(buf + out, len - out, "Recovery Map: ");
+	out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	/* Recovery Node State: */
+	out += snprintf(buf + out, len - out, "Recovery Node State:\n");
+	list_for_each_entry(node, &dlm->reco.node_data, list) {
+		switch (node->state) {
+		case DLM_RECO_NODE_DATA_INIT:
+			state = "INIT";
+			break;
+		case DLM_RECO_NODE_DATA_REQUESTING:
+			state = "REQUESTING";
+			break;
+		case DLM_RECO_NODE_DATA_DEAD:
+			state = "DEAD";
+			break;
+		case DLM_RECO_NODE_DATA_RECEIVING:
+			state = "RECEIVING";
+			break;
+		case DLM_RECO_NODE_DATA_REQUESTED:
+			state = "REQUESTED";
+			break;
+		case DLM_RECO_NODE_DATA_DONE:
+			state = "DONE";
+			break;
+		case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+			state = "FINALIZE-SENT";
+			break;
+		default:
+			state = "BAD";
+			break;
+		}
+		out += snprintf(buf + out, len - out, "\t%u - %s\n",
+				node->node_num, state);
+	}
+
+	spin_unlock(&dlm->spinlock);
+
+	return out;
+}
+
+static int debug_state_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	char *buf = NULL;
+
+	buf = (char *) get_zeroed_page(GFP_NOFS);
+	if (!buf)
+		goto bail;
+
+	i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static const struct file_operations debug_state_fops = {
+	.open =		debug_state_open,
+	.release =	debug_release,
+	.read =		debug_read,
+	.llseek =	generic_file_llseek,
+};
+/* end  - debug state funcs */
+
+/* files in subroot */
+int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+	/* for dumping dlm_ctxt */
+	dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
+						     S_IFREG|S_IRUSR,
+						     dlm->dlm_debugfs_subroot,
+						     dlm, &debug_state_fops);
+	if (!dc->debug_state_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	/* for dumping lockres */
+	dc->debug_lockres_dentry =
+			debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
+					    S_IFREG|S_IRUSR,
+					    dlm->dlm_debugfs_subroot,
+					    dlm, &debug_lockres_fops);
+	if (!dc->debug_lockres_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	/* for dumping mles */
+	dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
+						   S_IFREG|S_IRUSR,
+						   dlm->dlm_debugfs_subroot,
+						   dlm, &debug_mle_fops);
+	if (!dc->debug_mle_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	/* for dumping lockres on the purge list */
+	dc->debug_purgelist_dentry =
+			debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
+					    S_IFREG|S_IRUSR,
+					    dlm->dlm_debugfs_subroot,
+					    dlm, &debug_purgelist_fops);
+	if (!dc->debug_purgelist_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	dlm_debug_get(dc);
+	return 0;
+
+bail:
+	dlm_debug_shutdown(dlm);
+	return -ENOMEM;
+}
+
+void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+	if (dc) {
+		debugfs_remove(dc->debug_purgelist_dentry);
+		debugfs_remove(dc->debug_mle_dentry);
+		debugfs_remove(dc->debug_lockres_dentry);
+		debugfs_remove(dc->debug_state_dentry);
+		dlm_debug_put(dc);
+	}
+}
+
+/* subroot - domain dir */
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
+						      dlm_debugfs_root);
+	if (!dlm->dlm_debugfs_subroot) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
+				      GFP_KERNEL);
+	if (!dlm->dlm_debug_ctxt) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+	kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
+
+	return 0;
+bail:
+	dlm_destroy_debugfs_subroot(dlm);
+	return -ENOMEM;
+}
+
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	debugfs_remove(dlm->dlm_debugfs_subroot);
+}
+
+/* debugfs root */
+int dlm_create_debugfs_root(void)
+{
+	dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
+	if (!dlm_debugfs_root) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void dlm_destroy_debugfs_root(void)
+{
+	debugfs_remove(dlm_debugfs_root);
+}
+#endif	/* CONFIG_DEBUG_FS */
diff --git a/kernel/fs/ocfs2/dlm/dlmdebug.h b/kernel/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 000000000..1f27c4812
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,81 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle);
+
+#ifdef CONFIG_DEBUG_FS
+
+struct dlm_debug_ctxt {
+	struct kref debug_refcnt;
+	struct dentry *debug_state_dentry;
+	struct dentry *debug_lockres_dentry;
+	struct dentry *debug_mle_dentry;
+	struct dentry *debug_purgelist_dentry;
+};
+
+struct debug_lockres {
+	int dl_len;
+	char *dl_buf;
+	struct dlm_ctxt *dl_ctxt;
+	struct dlm_lock_resource *dl_res;
+};
+
+int dlm_debug_init(struct dlm_ctxt *dlm);
+void dlm_debug_shutdown(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_root(void);
+void dlm_destroy_debugfs_root(void);
+
+#else
+
+static inline int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+	return 0;
+}
+static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	return 0;
+}
+static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_root(void)
+{
+	return 0;
+}
+static inline void dlm_destroy_debugfs_root(void)
+{
+}
+
+#endif	/* CONFIG_DEBUG_FS */
+#endif	/* DLMDEBUG_H */
diff --git a/kernel/fs/ocfs2/dlm/dlmdomain.c b/kernel/fs/ocfs2/dlm/dlmdomain.c
new file mode 100644
index 000000000..7df88a6dd
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmdomain.c
@@ -0,0 +1,2375 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdomain.c
+ *
+ * defines domain join / leave apis
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/debugfs.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+#include "dlmdebug.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
+#include "cluster/masklog.h"
+
+/*
+ * ocfs2 node maps are array of long int, which limits to send them freely
+ * across the wire due to endianness issues. To workaround this, we convert
+ * long ints to byte arrays. Following 3 routines are helper functions to
+ * set/test/copy bits within those array of bytes
+ */
+static inline void byte_set_bit(u8 nr, u8 map[])
+{
+	map[nr >> 3] |= (1UL << (nr & 7));
+}
+
+static inline int byte_test_bit(u8 nr, u8 map[])
+{
+	return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0;
+}
+
+static inline void byte_copymap(u8 dmap[], unsigned long smap[],
+			unsigned int sz)
+{
+	unsigned int nn;
+
+	if (!sz)
+		return;
+
+	memset(dmap, 0, ((sz + 7) >> 3));
+	for (nn = 0 ; nn < sz; nn++)
+		if (test_bit(nn, smap))
+			byte_set_bit(nn, dmap);
+}
+
+static void dlm_free_pagevec(void **vec, int pages)
+{
+	while (pages--)
+		free_page((unsigned long)vec[pages]);
+	kfree(vec);
+}
+
+static void **dlm_alloc_pagevec(int pages)
+{
+	void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
+	int i;
+
+	if (!vec)
+		return NULL;
+
+	for (i = 0; i < pages; i++)
+		if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
+			goto out_free;
+
+	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
+	     pages, (unsigned long)DLM_HASH_PAGES,
+	     (unsigned long)DLM_BUCKETS_PER_PAGE);
+	return vec;
+out_free:
+	dlm_free_pagevec(vec, i);
+	return NULL;
+}
+
+/*
+ *
+ * spinlock lock ordering: if multiple locks are needed, obey this ordering:
+ *    dlm_domain_lock
+ *    struct dlm_ctxt->spinlock
+ *    struct dlm_lock_resource->spinlock
+ *    struct dlm_ctxt->master_lock
+ *    struct dlm_ctxt->ast_lock
+ *    dlm_master_list_entry->spinlock
+ *    dlm_lock->spinlock
+ *
+ */
+
+DEFINE_SPINLOCK(dlm_domain_lock);
+LIST_HEAD(dlm_domains);
+static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
+
+/*
+ * The supported protocol version for DLM communication.  Running domains
+ * will have a negotiated version with the same major number and a minor
+ * number equal or smaller.  The dlm_ctxt->dlm_locking_proto field should
+ * be used to determine what a running domain is actually using.
+ *
+ * New in version 1.1:
+ *	- Message DLM_QUERY_REGION added to support global heartbeat
+ *	- Message DLM_QUERY_NODEINFO added to allow online node removes
+ * New in version 1.2:
+ * 	- Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
+ */
+static const struct dlm_protocol_version dlm_protocol = {
+	.pv_major = 1,
+	.pv_minor = 2,
+};
+
+#define DLM_DOMAIN_BACKOFF_MS 200
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
+				  void **ret_data);
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
+				     void **ret_data);
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
+				   void **ret_data);
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+				    void *data, void **ret_data);
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
+				   void **ret_data);
+static int dlm_protocol_compare(struct dlm_protocol_version *existing,
+				struct dlm_protocol_version *request);
+
+static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
+
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	if (hlist_unhashed(&res->hash_node))
+		return;
+
+	mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
+	     res->lockname.name);
+	hlist_del_init(&res->hash_node);
+	dlm_lockres_put(res);
+}
+
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	struct hlist_head *bucket;
+	struct qstr *q;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	q = &res->lockname;
+	bucket = dlm_lockres_hash(dlm, q->hash);
+
+	/* get a reference for our hashtable */
+	dlm_lockres_get(res);
+
+	hlist_add_head(&res->hash_node, bucket);
+
+	mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
+	     res->lockname.name);
+}
+
+struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
+						     const char *name,
+						     unsigned int len,
+						     unsigned int hash)
+{
+	struct hlist_head *bucket;
+	struct dlm_lock_resource *res;
+
+	mlog(0, "%.*s\n", len, name);
+
+	assert_spin_locked(&dlm->spinlock);
+
+	bucket = dlm_lockres_hash(dlm, hash);
+
+	hlist_for_each_entry(res, bucket, hash_node) {
+		if (res->lockname.name[0] != name[0])
+			continue;
+		if (unlikely(res->lockname.len != len))
+			continue;
+		if (memcmp(res->lockname.name + 1, name + 1, len - 1))
+			continue;
+		dlm_lockres_get(res);
+		return res;
+	}
+	return NULL;
+}
+
+/* intended to be called by functions which do not care about lock
+ * resources which are being purged (most net _handler functions).
+ * this will return NULL for any lock resource which is found but
+ * currently in the process of dropping its mastery reference.
+ * use __dlm_lookup_lockres_full when you need the lock resource
+ * regardless (e.g. dlm_get_lock_resource) */
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+						const char *name,
+						unsigned int len,
+						unsigned int hash)
+{
+	struct dlm_lock_resource *res = NULL;
+
+	mlog(0, "%.*s\n", len, name);
+
+	assert_spin_locked(&dlm->spinlock);
+
+	res = __dlm_lookup_lockres_full(dlm, name, len, hash);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+			spin_unlock(&res->spinlock);
+			dlm_lockres_put(res);
+			return NULL;
+		}
+		spin_unlock(&res->spinlock);
+	}
+
+	return res;
+}
+
+struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
+				    const char *name,
+				    unsigned int len)
+{
+	struct dlm_lock_resource *res;
+	unsigned int hash = dlm_lockid_hash(name, len);
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, len, hash);
+	spin_unlock(&dlm->spinlock);
+	return res;
+}
+
+static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
+{
+	struct dlm_ctxt *tmp;
+
+	assert_spin_locked(&dlm_domain_lock);
+
+	/* tmp->name here is always NULL terminated,
+	 * but domain may not be! */
+	list_for_each_entry(tmp, &dlm_domains, list) {
+		if (strlen(tmp->name) == len &&
+		    memcmp(tmp->name, domain, len)==0)
+			return tmp;
+	}
+
+	return NULL;
+}
+
+/* For null terminated domain strings ONLY */
+static struct dlm_ctxt * __dlm_lookup_domain(const char *domain)
+{
+	assert_spin_locked(&dlm_domain_lock);
+
+	return __dlm_lookup_domain_full(domain, strlen(domain));
+}
+
+
+/* returns true on one of two conditions:
+ * 1) the domain does not exist
+ * 2) the domain exists and it's state is "joined" */
+static int dlm_wait_on_domain_helper(const char *domain)
+{
+	int ret = 0;
+	struct dlm_ctxt *tmp = NULL;
+
+	spin_lock(&dlm_domain_lock);
+
+	tmp = __dlm_lookup_domain(domain);
+	if (!tmp)
+		ret = 1;
+	else if (tmp->dlm_state == DLM_CTXT_JOINED)
+		ret = 1;
+
+	spin_unlock(&dlm_domain_lock);
+	return ret;
+}
+
+static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
+{
+	dlm_destroy_debugfs_subroot(dlm);
+
+	if (dlm->lockres_hash)
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+
+	if (dlm->master_hash)
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
+	kfree(dlm->name);
+	kfree(dlm);
+}
+
+/* A little strange - this function will be called while holding
+ * dlm_domain_lock and is expected to be holding it on the way out. We
+ * will however drop and reacquire it multiple times */
+static void dlm_ctxt_release(struct kref *kref)
+{
+	struct dlm_ctxt *dlm;
+
+	dlm = container_of(kref, struct dlm_ctxt, dlm_refs);
+
+	BUG_ON(dlm->num_joins);
+	BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED);
+
+	/* we may still be in the list if we hit an error during join. */
+	list_del_init(&dlm->list);
+
+	spin_unlock(&dlm_domain_lock);
+
+	mlog(0, "freeing memory from domain %s\n", dlm->name);
+
+	wake_up(&dlm_domain_events);
+
+	dlm_free_ctxt_mem(dlm);
+
+	spin_lock(&dlm_domain_lock);
+}
+
+void dlm_put(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm_domain_lock);
+	kref_put(&dlm->dlm_refs, dlm_ctxt_release);
+	spin_unlock(&dlm_domain_lock);
+}
+
+static void __dlm_get(struct dlm_ctxt *dlm)
+{
+	kref_get(&dlm->dlm_refs);
+}
+
+/* given a questionable reference to a dlm object, gets a reference if
+ * it can find it in the list, otherwise returns NULL in which case
+ * you shouldn't trust your pointer. */
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
+{
+	struct dlm_ctxt *target;
+	struct dlm_ctxt *ret = NULL;
+
+	spin_lock(&dlm_domain_lock);
+
+	list_for_each_entry(target, &dlm_domains, list) {
+		if (target == dlm) {
+			__dlm_get(target);
+			ret = target;
+			break;
+		}
+	}
+
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
+{
+	int ret;
+
+	spin_lock(&dlm_domain_lock);
+	ret = (dlm->dlm_state == DLM_CTXT_JOINED) ||
+		(dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_worker) {
+		flush_workqueue(dlm->dlm_worker);
+		destroy_workqueue(dlm->dlm_worker);
+		dlm->dlm_worker = NULL;
+	}
+}
+
+static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain_handlers(dlm);
+	dlm_debug_shutdown(dlm);
+	dlm_complete_thread(dlm);
+	dlm_complete_recovery_thread(dlm);
+	dlm_destroy_dlm_worker(dlm);
+
+	/* We've left the domain. Now we can take ourselves out of the
+	 * list and allow the kref stuff to help us free the
+	 * memory. */
+	spin_lock(&dlm_domain_lock);
+	list_del_init(&dlm->list);
+	spin_unlock(&dlm_domain_lock);
+
+	/* Wake up anyone waiting for us to remove this domain */
+	wake_up(&dlm_domain_events);
+}
+
+static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
+{
+	int i, num, n, ret = 0;
+	struct dlm_lock_resource *res;
+	struct hlist_node *iter;
+	struct hlist_head *bucket;
+	int dropped;
+
+	mlog(0, "Migrating locks from domain %s\n", dlm->name);
+
+	num = 0;
+	spin_lock(&dlm->spinlock);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+redo_bucket:
+		n = 0;
+		bucket = dlm_lockres_hash(dlm, i);
+		iter = bucket->first;
+		while (iter) {
+			n++;
+			res = hlist_entry(iter, struct dlm_lock_resource,
+					  hash_node);
+			dlm_lockres_get(res);
+			/* migrate, if necessary.  this will drop the dlm
+			 * spinlock and retake it if it does migration. */
+			dropped = dlm_empty_lockres(dlm, res);
+
+			spin_lock(&res->spinlock);
+			if (dropped)
+				__dlm_lockres_calc_usage(dlm, res);
+			else
+				iter = res->hash_node.next;
+			spin_unlock(&res->spinlock);
+
+			dlm_lockres_put(res);
+
+			if (dropped) {
+				cond_resched_lock(&dlm->spinlock);
+				goto redo_bucket;
+			}
+		}
+		cond_resched_lock(&dlm->spinlock);
+		num += n;
+	}
+	spin_unlock(&dlm->spinlock);
+	wake_up(&dlm->dlm_thread_wq);
+
+	/* let the dlm thread take care of purging, keep scanning until
+	 * nothing remains in the hash */
+	if (num) {
+		mlog(0, "%s: %d lock resources in hash last pass\n",
+		     dlm->name, num);
+		ret = -EAGAIN;
+	}
+	mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
+	return ret;
+}
+
+static int dlm_no_joining_node(struct dlm_ctxt *dlm)
+{
+	int ret;
+
+	spin_lock(&dlm->spinlock);
+	ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN;
+	spin_unlock(&dlm->spinlock);
+
+	return ret;
+}
+
+static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
+					 void *data, void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	unsigned int node;
+	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	node = exit_msg->node_idx;
+	mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
+
+	spin_lock(&dlm->spinlock);
+	set_bit(node, dlm->exit_domain_map);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+
+	return 0;
+}
+
+static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
+{
+	/* Yikes, a double spinlock! I need domain_lock for the dlm
+	 * state and the dlm spinlock for join state... Sorry! */
+again:
+	spin_lock(&dlm_domain_lock);
+	spin_lock(&dlm->spinlock);
+
+	if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "Node %d is joining, we wait on it.\n",
+			  dlm->joining_node);
+		spin_unlock(&dlm->spinlock);
+		spin_unlock(&dlm_domain_lock);
+
+		wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm));
+		goto again;
+	}
+
+	dlm->dlm_state = DLM_CTXT_LEAVING;
+	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+}
+
+static void __dlm_print_nodes(struct dlm_ctxt *dlm)
+{
+	int node = -1, num = 0;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	printk("( ");
+	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		printk("%d ", node);
+		++num;
+	}
+	printk(") %u nodes\n", num);
+}
+
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
+				   void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	unsigned int node;
+	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+	mlog(0, "%p %u %p", msg, len, data);
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	node = exit_msg->node_idx;
+
+	spin_lock(&dlm->spinlock);
+	clear_bit(node, dlm->domain_map);
+	clear_bit(node, dlm->exit_domain_map);
+	printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
+	__dlm_print_nodes(dlm);
+
+	/* notify anything attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, node, 0);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+
+	return 0;
+}
+
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_exit_domain leave_msg;
+
+	mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
+	     msg_type, node);
+
+	memset(&leave_msg, 0, sizeof(leave_msg));
+	leave_msg.node_idx = dlm->node_num;
+
+	status = o2net_send_message(msg_type, dlm->key, &leave_msg,
+				    sizeof(leave_msg), node, NULL);
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d sending domain exit message %u "
+		     "to node %u on domain %s\n", status, msg_type, node,
+		     dlm->name);
+
+	return status;
+}
+
+static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
+{
+	int node = -1;
+
+	/* Support for begin exit domain was added in 1.2 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor < 2)
+		return;
+
+	/*
+	 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
+	 * informational. Meaning if a node does not receive the message,
+	 * so be it.
+	 */
+	spin_lock(&dlm->spinlock);
+	while (1) {
+		node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
+		if (node >= O2NM_MAX_NODES)
+			break;
+		if (node == dlm->node_num)
+			continue;
+
+		spin_unlock(&dlm->spinlock);
+		dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
+		spin_lock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_leave_domain(struct dlm_ctxt *dlm)
+{
+	int node, clear_node, status;
+
+	/* At this point we've migrated away all our locks and won't
+	 * accept mastership of new ones. The dlm is responsible for
+	 * almost nothing now. We make sure not to confuse any joining
+	 * nodes and then commence shutdown procedure. */
+
+	spin_lock(&dlm->spinlock);
+	/* Clear ourselves from the domain map */
+	clear_bit(dlm->node_num, dlm->domain_map);
+	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
+				     0)) < O2NM_MAX_NODES) {
+		/* Drop the dlm spinlock. This is safe wrt the domain_map.
+		 * -nodes cannot be added now as the
+		 *   query_join_handlers knows to respond with OK_NO_MAP
+		 * -we catch the right network errors if a node is
+		 *   removed from the map while we're sending him the
+		 *   exit message. */
+		spin_unlock(&dlm->spinlock);
+
+		clear_node = 1;
+
+		status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
+						  node);
+		if (status < 0 &&
+		    status != -ENOPROTOOPT &&
+		    status != -ENOTCONN) {
+			mlog(ML_NOTICE, "Error %d sending domain exit message "
+			     "to node %d\n", status, node);
+
+			/* Not sure what to do here but lets sleep for
+			 * a bit in case this was a transient
+			 * error... */
+			msleep(DLM_DOMAIN_BACKOFF_MS);
+			clear_node = 0;
+		}
+
+		spin_lock(&dlm->spinlock);
+		/* If we're not clearing the node bit then we intend
+		 * to loop back around to try again. */
+		if (clear_node)
+			clear_bit(node, dlm->domain_map);
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+int dlm_shutting_down(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+
+	spin_lock(&dlm_domain_lock);
+
+	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
+		ret = 1;
+
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+void dlm_unregister_domain(struct dlm_ctxt *dlm)
+{
+	int leave = 0;
+	struct dlm_lock_resource *res;
+
+	spin_lock(&dlm_domain_lock);
+	BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
+	BUG_ON(!dlm->num_joins);
+
+	dlm->num_joins--;
+	if (!dlm->num_joins) {
+		/* We mark it "in shutdown" now so new register
+		 * requests wait until we've completely left the
+		 * domain. Don't use DLM_CTXT_LEAVING yet as we still
+		 * want new domain joins to communicate with us at
+		 * least until we've completed migration of our
+		 * resources. */
+		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
+		leave = 1;
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	if (leave) {
+		mlog(0, "shutting down domain %s\n", dlm->name);
+		dlm_begin_exit_domain(dlm);
+
+		/* We changed dlm state, notify the thread */
+		dlm_kick_thread(dlm, NULL);
+
+		while (dlm_migrate_all_locks(dlm)) {
+			/* Give dlm_thread time to purge the lockres' */
+			msleep(500);
+			mlog(0, "%s: more migration to do\n", dlm->name);
+		}
+
+		/* This list should be empty. If not, print remaining lockres */
+		if (!list_empty(&dlm->tracking_list)) {
+			mlog(ML_ERROR, "Following lockres' are still on the "
+			     "tracking list:\n");
+			list_for_each_entry(res, &dlm->tracking_list, tracking)
+				dlm_print_one_lock_resource(res);
+		}
+
+		dlm_mark_domain_leaving(dlm);
+		dlm_leave_domain(dlm);
+		printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
+		dlm_force_free_mles(dlm);
+		dlm_complete_dlm_shutdown(dlm);
+	}
+	dlm_put(dlm);
+}
+EXPORT_SYMBOL_GPL(dlm_unregister_domain);
+
+static int dlm_query_join_proto_check(char *proto_type, int node,
+				      struct dlm_protocol_version *ours,
+				      struct dlm_protocol_version *request)
+{
+	int rc;
+	struct dlm_protocol_version proto = *request;
+
+	if (!dlm_protocol_compare(ours, &proto)) {
+		mlog(0,
+		     "node %u wanted to join with %s locking protocol "
+		     "%u.%u, we respond with %u.%u\n",
+		     node, proto_type,
+		     request->pv_major,
+		     request->pv_minor,
+		     proto.pv_major, proto.pv_minor);
+		request->pv_minor = proto.pv_minor;
+		rc = 0;
+	} else {
+		mlog(ML_NOTICE,
+		     "Node %u wanted to join with %s locking "
+		     "protocol %u.%u, but we have %u.%u, disallowing\n",
+		     node, proto_type,
+		     request->pv_major,
+		     request->pv_minor,
+		     ours->pv_major,
+		     ours->pv_minor);
+		rc = 1;
+	}
+
+	return rc;
+}
+
+/*
+ * struct dlm_query_join_packet is made up of four one-byte fields.  They
+ * are effectively in big-endian order already.  However, little-endian
+ * machines swap them before putting the packet on the wire (because
+ * query_join's response is a status, and that status is treated as a u32
+ * on the wire).  Thus, a big-endian and little-endian machines will treat
+ * this structure differently.
+ *
+ * The solution is to have little-endian machines swap the structure when
+ * converting from the structure to the u32 representation.  This will
+ * result in the structure having the correct format on the wire no matter
+ * the host endian format.
+ */
+static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
+					  u32 *wire)
+{
+	union dlm_query_join_response response;
+
+	response.packet = *packet;
+	*wire = be32_to_cpu(response.intval);
+}
+
+static void dlm_query_join_wire_to_packet(u32 wire,
+					  struct dlm_query_join_packet *packet)
+{
+	union dlm_query_join_response response;
+
+	response.intval = cpu_to_be32(wire);
+	*packet = response.packet;
+}
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
+				  void **ret_data)
+{
+	struct dlm_query_join_request *query;
+	struct dlm_query_join_packet packet = {
+		.code = JOIN_DISALLOW,
+	};
+	struct dlm_ctxt *dlm = NULL;
+	u32 response;
+	u8 nodenum;
+
+	query = (struct dlm_query_join_request *) msg->buf;
+
+	mlog(0, "node %u wants to join domain %s\n", query->node_idx,
+		  query->domain);
+
+	/*
+	 * If heartbeat doesn't consider the node live, tell it
+	 * to back off and try again.  This gives heartbeat a chance
+	 * to catch up.
+	 */
+	if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
+		mlog(0, "node %u is not in our live map yet\n",
+		     query->node_idx);
+
+		packet.code = JOIN_DISALLOW;
+		goto respond;
+	}
+
+	packet.code = JOIN_OK_NO_MAP;
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+	if (!dlm)
+		goto unlock_respond;
+
+	/*
+	 * There is a small window where the joining node may not see the
+	 * node(s) that just left but still part of the cluster. DISALLOW
+	 * join request if joining node has different node map.
+	 */
+	nodenum=0;
+	while (nodenum < O2NM_MAX_NODES) {
+		if (test_bit(nodenum, dlm->domain_map)) {
+			if (!byte_test_bit(nodenum, query->node_map)) {
+				mlog(0, "disallow join as node %u does not "
+				     "have node %u in its nodemap\n",
+				     query->node_idx, nodenum);
+				packet.code = JOIN_DISALLOW;
+				goto unlock_respond;
+			}
+		}
+		nodenum++;
+	}
+
+	/* Once the dlm ctxt is marked as leaving then we don't want
+	 * to be put in someone's domain map.
+	 * Also, explicitly disallow joining at certain troublesome
+	 * times (ie. during recovery). */
+	if (dlm->dlm_state != DLM_CTXT_LEAVING) {
+		int bit = query->node_idx;
+		spin_lock(&dlm->spinlock);
+
+		if (dlm->dlm_state == DLM_CTXT_NEW &&
+		    dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/*If this is a brand new context and we
+			 * haven't started our join process yet, then
+			 * the other node won the race. */
+			packet.code = JOIN_OK_NO_MAP;
+		} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/* Disallow parallel joins. */
+			packet.code = JOIN_DISALLOW;
+		} else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+			mlog(0, "node %u trying to join, but recovery "
+			     "is ongoing.\n", bit);
+			packet.code = JOIN_DISALLOW;
+		} else if (test_bit(bit, dlm->recovery_map)) {
+			mlog(0, "node %u trying to join, but it "
+			     "still needs recovery.\n", bit);
+			packet.code = JOIN_DISALLOW;
+		} else if (test_bit(bit, dlm->domain_map)) {
+			mlog(0, "node %u trying to join, but it "
+			     "is still in the domain! needs recovery?\n",
+			     bit);
+			packet.code = JOIN_DISALLOW;
+		} else {
+			/* Alright we're fully a part of this domain
+			 * so we keep some state as to who's joining
+			 * and indicate to him that needs to be fixed
+			 * up. */
+
+			/* Make sure we speak compatible locking protocols.  */
+			if (dlm_query_join_proto_check("DLM", bit,
+						       &dlm->dlm_locking_proto,
+						       &query->dlm_proto)) {
+				packet.code = JOIN_PROTOCOL_MISMATCH;
+			} else if (dlm_query_join_proto_check("fs", bit,
+							      &dlm->fs_locking_proto,
+							      &query->fs_proto)) {
+				packet.code = JOIN_PROTOCOL_MISMATCH;
+			} else {
+				packet.dlm_minor = query->dlm_proto.pv_minor;
+				packet.fs_minor = query->fs_proto.pv_minor;
+				packet.code = JOIN_OK;
+				__dlm_set_joining_node(dlm, query->node_idx);
+			}
+		}
+
+		spin_unlock(&dlm->spinlock);
+	}
+unlock_respond:
+	spin_unlock(&dlm_domain_lock);
+
+respond:
+	mlog(0, "We respond with %u\n", packet.code);
+
+	dlm_query_join_packet_to_wire(&packet, &response);
+	return response;
+}
+
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
+				     void **ret_data)
+{
+	struct dlm_assert_joined *assert;
+	struct dlm_ctxt *dlm = NULL;
+
+	assert = (struct dlm_assert_joined *) msg->buf;
+
+	mlog(0, "node %u asserts join on domain %s\n", assert->node_idx,
+		  assert->domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len);
+	/* XXX should we consider no dlm ctxt an error? */
+	if (dlm) {
+		spin_lock(&dlm->spinlock);
+
+		/* Alright, this node has officially joined our
+		 * domain. Set him in the map and clean up our
+		 * leftover join state. */
+		BUG_ON(dlm->joining_node != assert->node_idx);
+
+		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+			mlog(0, "dlm recovery is ongoing, disallow join\n");
+			spin_unlock(&dlm->spinlock);
+			spin_unlock(&dlm_domain_lock);
+			return -EAGAIN;
+		}
+
+		set_bit(assert->node_idx, dlm->domain_map);
+		clear_bit(assert->node_idx, dlm->exit_domain_map);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+
+		printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
+		       assert->node_idx, dlm->name);
+		__dlm_print_nodes(dlm);
+
+		/* notify anything attached to the heartbeat events */
+		dlm_hb_event_notify_attached(dlm, assert->node_idx, 1);
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	return 0;
+}
+
+static int dlm_match_regions(struct dlm_ctxt *dlm,
+			     struct dlm_query_region *qr,
+			     char *local, int locallen)
+{
+	char *remote = qr->qr_regions;
+	char *l, *r;
+	int localnr, i, j, foundit;
+	int status = 0;
+
+	if (!o2hb_global_heartbeat_active()) {
+		if (qr->qr_numregions) {
+			mlog(ML_ERROR, "Domain %s: Joining node %d has global "
+			     "heartbeat enabled but local node %d does not\n",
+			     qr->qr_domain, qr->qr_node, dlm->node_num);
+			status = -EINVAL;
+		}
+		goto bail;
+	}
+
+	if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
+		mlog(ML_ERROR, "Domain %s: Local node %d has global "
+		     "heartbeat enabled but joining node %d does not\n",
+		     qr->qr_domain, dlm->node_num, qr->qr_node);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN);
+	localnr = o2hb_get_all_regions(local, (u8)localnr);
+
+	/* compare local regions with remote */
+	l = local;
+	for (i = 0; i < localnr; ++i) {
+		foundit = 0;
+		r = remote;
+		for (j = 0; j <= qr->qr_numregions; ++j) {
+			if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			r += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in local node %d but not in joining node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
+			     dlm->node_num, qr->qr_node);
+			goto bail;
+		}
+		l += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	/* compare remote with local regions */
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		foundit = 0;
+		l = local;
+		for (j = 0; j < localnr; ++j) {
+			if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			l += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in joining node %d but not in local node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
+			     qr->qr_node, dlm->node_num);
+			goto bail;
+		}
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+bail:
+	return status;
+}
+
+static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_region *qr = NULL;
+	int status, ret = 0, i;
+	char *p;
+
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		goto bail;
+
+	qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
+	if (!qr) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	qr->qr_node = dlm->node_num;
+	qr->qr_namelen = strlen(dlm->name);
+	memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
+	/* if local hb, the numregions will be zero */
+	if (o2hb_global_heartbeat_active())
+		qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
+							 O2NM_MAX_REGIONS);
+
+	p = qr->qr_regions;
+	for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
+
+	i = -1;
+	while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES) {
+		if (i == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending regions to node %d\n", i);
+
+		ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
+					 sizeof(struct dlm_query_region),
+					 i, &status);
+		if (ret >= 0)
+			ret = status;
+		if (ret) {
+			mlog(ML_ERROR, "Region mismatch %d, node %d\n",
+			     ret, i);
+			break;
+		}
+	}
+
+bail:
+	kfree(qr);
+	return ret;
+}
+
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+				    void *data, void **ret_data)
+{
+	struct dlm_query_region *qr;
+	struct dlm_ctxt *dlm = NULL;
+	char *local = NULL;
+	int status = 0;
+
+	qr = (struct dlm_query_region *) msg->buf;
+
+	mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
+	     qr->qr_domain);
+
+	/* buffer used in dlm_mast_regions() */
+	local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+	if (!local)
+		return -ENOMEM;
+
+	status = -EINVAL;
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
+	if (!dlm) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "before join domain\n", qr->qr_node, qr->qr_domain);
+		goto out_domain_lock;
+	}
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->joining_node != qr->qr_node) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "but joining node is %d\n", qr->qr_node, qr->qr_domain,
+		     dlm->joining_node);
+		goto out_dlm_lock;
+	}
+
+	/* Support for global heartbeat was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "but active dlm protocol is %d.%d\n", qr->qr_node,
+		     qr->qr_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto out_dlm_lock;
+	}
+
+	status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
+
+out_dlm_lock:
+	spin_unlock(&dlm->spinlock);
+
+out_domain_lock:
+	spin_unlock(&dlm_domain_lock);
+
+	kfree(local);
+
+	return status;
+}
+
+static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
+{
+	struct o2nm_node *local;
+	struct dlm_node_info *remote;
+	int i, j;
+	int status = 0;
+
+	for (j = 0; j < qn->qn_numnodes; ++j)
+		mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
+		     &(qn->qn_nodes[j].ni_ipv4_address),
+		     ntohs(qn->qn_nodes[j].ni_ipv4_port));
+
+	for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
+		local = o2nm_get_node_by_num(i);
+		remote = NULL;
+		for (j = 0; j < qn->qn_numnodes; ++j) {
+			if (qn->qn_nodes[j].ni_nodenum == i) {
+				remote = &(qn->qn_nodes[j]);
+				break;
+			}
+		}
+
+		if (!local && !remote)
+			continue;
+
+		if ((local && !remote) || (!local && remote))
+			status = -EINVAL;
+
+		if (!status &&
+		    ((remote->ni_nodenum != local->nd_num) ||
+		     (remote->ni_ipv4_port != local->nd_ipv4_port) ||
+		     (remote->ni_ipv4_address != local->nd_ipv4_address)))
+			status = -EINVAL;
+
+		if (status) {
+			if (remote && !local)
+				mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+				     "registered in joining node %d but not in "
+				     "local node %d\n", qn->qn_domain,
+				     remote->ni_nodenum,
+				     &(remote->ni_ipv4_address),
+				     ntohs(remote->ni_ipv4_port),
+				     qn->qn_nodenum, dlm->node_num);
+			if (local && !remote)
+				mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+				     "registered in local node %d but not in "
+				     "joining node %d\n", qn->qn_domain,
+				     local->nd_num, &(local->nd_ipv4_address),
+				     ntohs(local->nd_ipv4_port),
+				     dlm->node_num, qn->qn_nodenum);
+			BUG_ON((!local && !remote));
+		}
+
+		if (local)
+			o2nm_node_put(local);
+	}
+
+	return status;
+}
+
+static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_nodeinfo *qn = NULL;
+	struct o2nm_node *node;
+	int ret = 0, status, count, i;
+
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		goto bail;
+
+	qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
+	if (!qn) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
+		node = o2nm_get_node_by_num(i);
+		if (!node)
+			continue;
+		qn->qn_nodes[count].ni_nodenum = node->nd_num;
+		qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
+		qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
+		mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
+		     &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
+		++count;
+		o2nm_node_put(node);
+	}
+
+	qn->qn_nodenum = dlm->node_num;
+	qn->qn_numnodes = count;
+	qn->qn_namelen = strlen(dlm->name);
+	memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
+
+	i = -1;
+	while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES) {
+		if (i == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending nodeinfo to node %d\n", i);
+
+		ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+					 qn, sizeof(struct dlm_query_nodeinfo),
+					 i, &status);
+		if (ret >= 0)
+			ret = status;
+		if (ret) {
+			mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
+			break;
+		}
+	}
+
+bail:
+	kfree(qn);
+	return ret;
+}
+
+static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
+				      void *data, void **ret_data)
+{
+	struct dlm_query_nodeinfo *qn;
+	struct dlm_ctxt *dlm = NULL;
+	int locked = 0, status = -EINVAL;
+
+	qn = (struct dlm_query_nodeinfo *) msg->buf;
+
+	mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
+	     qn->qn_domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
+	if (!dlm) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
+		     "join domain\n", qn->qn_nodenum, qn->qn_domain);
+		goto bail;
+	}
+
+	spin_lock(&dlm->spinlock);
+	locked = 1;
+	if (dlm->joining_node != qn->qn_nodenum) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
+		     "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
+		     dlm->joining_node);
+		goto bail;
+	}
+
+	/* Support for node query was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s "
+		     "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
+		     qn->qn_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto bail;
+	}
+
+	status = dlm_match_nodes(dlm, qn);
+
+bail:
+	if (locked)
+		spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+
+	return status;
+}
+
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
+				   void **ret_data)
+{
+	struct dlm_cancel_join *cancel;
+	struct dlm_ctxt *dlm = NULL;
+
+	cancel = (struct dlm_cancel_join *) msg->buf;
+
+	mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx,
+		  cancel->domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len);
+
+	if (dlm) {
+		spin_lock(&dlm->spinlock);
+
+		/* Yikes, this guy wants to cancel his join. No
+		 * problem, we simply cleanup our join state. */
+		BUG_ON(dlm->joining_node != cancel->node_idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	return 0;
+}
+
+static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_cancel_join cancel_msg;
+
+	memset(&cancel_msg, 0, sizeof(cancel_msg));
+	cancel_msg.node_idx = dlm->node_num;
+	cancel_msg.name_len = strlen(dlm->name);
+	memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);
+
+	status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+				    &cancel_msg, sizeof(cancel_msg), node,
+				    NULL);
+	if (status < 0) {
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+		     node);
+		goto bail;
+	}
+
+bail:
+	return status;
+}
+
+/* map_size should be in bytes. */
+static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
+				 unsigned long *node_map,
+				 unsigned int map_size)
+{
+	int status, tmpstat;
+	unsigned int node;
+
+	if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
+			 sizeof(unsigned long))) {
+		mlog(ML_ERROR,
+		     "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
+		     map_size, (unsigned)BITS_TO_LONGS(O2NM_MAX_NODES));
+		return -EINVAL;
+	}
+
+	status = 0;
+	node = -1;
+	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		tmpstat = dlm_send_one_join_cancel(dlm, node);
+		if (tmpstat) {
+			mlog(ML_ERROR, "Error return %d cancelling join on "
+			     "node %d\n", tmpstat, node);
+			if (!status)
+				status = tmpstat;
+		}
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int dlm_request_join(struct dlm_ctxt *dlm,
+			    int node,
+			    enum dlm_query_join_response_code *response)
+{
+	int status;
+	struct dlm_query_join_request join_msg;
+	struct dlm_query_join_packet packet;
+	u32 join_resp;
+
+	mlog(0, "querying node %d\n", node);
+
+	memset(&join_msg, 0, sizeof(join_msg));
+	join_msg.node_idx = dlm->node_num;
+	join_msg.name_len = strlen(dlm->name);
+	memcpy(join_msg.domain, dlm->name, join_msg.name_len);
+	join_msg.dlm_proto = dlm->dlm_locking_proto;
+	join_msg.fs_proto = dlm->fs_locking_proto;
+
+	/* copy live node map to join message */
+	byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
+
+	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
+				    sizeof(join_msg), node, &join_resp);
+	if (status < 0 && status != -ENOPROTOOPT) {
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+		     node);
+		goto bail;
+	}
+	dlm_query_join_wire_to_packet(join_resp, &packet);
+
+	/* -ENOPROTOOPT from the net code means the other side isn't
+	    listening for our message type -- that's fine, it means
+	    his dlm isn't up, so we can consider him a 'yes' but not
+	    joined into the domain.  */
+	if (status == -ENOPROTOOPT) {
+		status = 0;
+		*response = JOIN_OK_NO_MAP;
+	} else if (packet.code == JOIN_DISALLOW ||
+		   packet.code == JOIN_OK_NO_MAP) {
+		*response = packet.code;
+	} else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
+		mlog(ML_NOTICE,
+		     "This node requested DLM locking protocol %u.%u and "
+		     "filesystem locking protocol %u.%u.  At least one of "
+		     "the protocol versions on node %d is not compatible, "
+		     "disconnecting\n",
+		     dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor,
+		     dlm->fs_locking_proto.pv_major,
+		     dlm->fs_locking_proto.pv_minor,
+		     node);
+		status = -EPROTO;
+		*response = packet.code;
+	} else if (packet.code == JOIN_OK) {
+		*response = packet.code;
+		/* Use the same locking protocol as the remote node */
+		dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
+		dlm->fs_locking_proto.pv_minor = packet.fs_minor;
+		mlog(0,
+		     "Node %d responds JOIN_OK with DLM locking protocol "
+		     "%u.%u and fs locking protocol %u.%u\n",
+		     node,
+		     dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor,
+		     dlm->fs_locking_proto.pv_major,
+		     dlm->fs_locking_proto.pv_minor);
+	} else {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid response %d from node %u\n",
+		     packet.code, node);
+	}
+
+	mlog(0, "status %d, node %d response is %d\n", status, node,
+	     *response);
+
+bail:
+	return status;
+}
+
+static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	int ret;
+	struct dlm_assert_joined assert_msg;
+
+	mlog(0, "Sending join assert to node %u\n", node);
+
+	memset(&assert_msg, 0, sizeof(assert_msg));
+	assert_msg.node_idx = dlm->node_num;
+	assert_msg.name_len = strlen(dlm->name);
+	memcpy(assert_msg.domain, dlm->name, assert_msg.name_len);
+
+	status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+				    &assert_msg, sizeof(assert_msg), node,
+				    &ret);
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+		     node);
+	else
+		status = ret;
+
+	return status;
+}
+
+static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
+				  unsigned long *node_map)
+{
+	int status, node, live;
+
+	status = 0;
+	node = -1;
+	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		do {
+			/* It is very important that this message be
+			 * received so we spin until either the node
+			 * has died or it gets the message. */
+			status = dlm_send_one_join_assert(dlm, node);
+
+			spin_lock(&dlm->spinlock);
+			live = test_bit(node, dlm->live_nodes_map);
+			spin_unlock(&dlm->spinlock);
+
+			if (status) {
+				mlog(ML_ERROR, "Error return %d asserting "
+				     "join on node %d\n", status, node);
+
+				/* give us some time between errors... */
+				if (live)
+					msleep(DLM_DOMAIN_BACKOFF_MS);
+			}
+		} while (status && live);
+	}
+}
+
+struct domain_join_ctxt {
+	unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+static int dlm_should_restart_join(struct dlm_ctxt *dlm,
+				   struct domain_join_ctxt *ctxt,
+				   enum dlm_query_join_response_code response)
+{
+	int ret;
+
+	if (response == JOIN_DISALLOW) {
+		mlog(0, "Latest response of disallow -- should restart\n");
+		return 1;
+	}
+
+	spin_lock(&dlm->spinlock);
+	/* For now, we restart the process if the node maps have
+	 * changed at all */
+	ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
+		     sizeof(dlm->live_nodes_map));
+	spin_unlock(&dlm->spinlock);
+
+	if (ret)
+		mlog(0, "Node maps changed -- should restart\n");
+
+	return ret;
+}
+
+static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
+{
+	int status = 0, tmpstat, node;
+	struct domain_join_ctxt *ctxt;
+	enum dlm_query_join_response_code response = JOIN_DISALLOW;
+
+	mlog(0, "%p", dlm);
+
+	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+	if (!ctxt) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* group sem locking should work for us here -- we're already
+	 * registered for heartbeat events so filling this should be
+	 * atomic wrt getting those handlers called. */
+	o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+
+	spin_lock(&dlm->spinlock);
+	memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
+
+	__dlm_set_joining_node(dlm, dlm->node_num);
+
+	spin_unlock(&dlm->spinlock);
+
+	node = -1;
+	while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		status = dlm_request_join(dlm, node, &response);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Ok, either we got a response or the node doesn't have a
+		 * dlm up. */
+		if (response == JOIN_OK)
+			set_bit(node, ctxt->yes_resp_map);
+
+		if (dlm_should_restart_join(dlm, ctxt, response)) {
+			status = -EAGAIN;
+			goto bail;
+		}
+	}
+
+	mlog(0, "Yay, done querying nodes!\n");
+
+	/* Yay, everyone agree's we can join the domain. My domain is
+	 * comprised of all nodes who were put in the
+	 * yes_resp_map. Copy that into our domain map and send a join
+	 * assert message to clean up everyone elses state. */
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->domain_map, ctxt->yes_resp_map,
+	       sizeof(ctxt->yes_resp_map));
+	set_bit(dlm->node_num, dlm->domain_map);
+	spin_unlock(&dlm->spinlock);
+
+	/* Support for global heartbeat and node info was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major > 1 ||
+	    dlm->dlm_locking_proto.pv_minor > 0) {
+		status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+		status = dlm_send_regions(dlm, ctxt->yes_resp_map);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
+
+	/* Joined state *must* be set before the joining node
+	 * information, otherwise the query_join handler may read no
+	 * current joiner but a state of NEW and tell joining nodes
+	 * we're not in the domain. */
+	spin_lock(&dlm_domain_lock);
+	dlm->dlm_state = DLM_CTXT_JOINED;
+	dlm->num_joins++;
+	spin_unlock(&dlm_domain_lock);
+
+bail:
+	spin_lock(&dlm->spinlock);
+	__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	if (!status) {
+		printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
+		__dlm_print_nodes(dlm);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	if (ctxt) {
+		/* Do we need to send a cancel message to any nodes? */
+		if (status < 0) {
+			tmpstat = dlm_send_join_cancels(dlm,
+							ctxt->yes_resp_map,
+							sizeof(ctxt->yes_resp_map));
+			if (tmpstat < 0)
+				mlog_errno(tmpstat);
+		}
+		kfree(ctxt);
+	}
+
+	mlog(0, "returning %d\n", status);
+	return status;
+}
+
+static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
+{
+	o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
+	o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
+	o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
+}
+
+static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
+{
+	int status;
+
+	mlog(0, "registering handlers.\n");
+
+	o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
+			    dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+	status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
+	if (status)
+		goto bail;
+
+	o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
+			    dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+	status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_master_request),
+					dlm_master_request_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
+					sizeof(struct dlm_assert_master),
+					dlm_assert_master_handler,
+					dlm, dlm_assert_master_post_handler,
+					&dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
+					sizeof(struct dlm_create_lock),
+					dlm_create_lock_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
+					DLM_CONVERT_LOCK_MAX_LEN,
+					dlm_convert_lock_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
+					DLM_UNLOCK_LOCK_MAX_LEN,
+					dlm_unlock_lock_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
+					DLM_PROXY_AST_MAX_LEN,
+					dlm_proxy_ast_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
+					sizeof(struct dlm_exit_domain),
+					dlm_exit_domain_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key,
+					sizeof(struct dlm_deref_lockres),
+					dlm_deref_lockres_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_migrate_request),
+					dlm_migrate_request_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
+					DLM_MIG_LOCKRES_MAX_LEN,
+					dlm_mig_lockres_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
+					sizeof(struct dlm_master_requery),
+					dlm_master_requery_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_lock_request),
+					dlm_request_all_locks_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
+					sizeof(struct dlm_reco_data_done),
+					dlm_reco_data_done_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
+					sizeof(struct dlm_begin_reco),
+					dlm_begin_reco_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
+					sizeof(struct dlm_finalize_reco),
+					dlm_finalize_reco_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
+					sizeof(struct dlm_exit_domain),
+					dlm_begin_exit_domain_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+bail:
+	if (status)
+		dlm_unregister_domain_handlers(dlm);
+
+	return status;
+}
+
+static int dlm_join_domain(struct dlm_ctxt *dlm)
+{
+	int status;
+	unsigned int backoff;
+	unsigned int total_backoff = 0;
+
+	BUG_ON(!dlm);
+
+	mlog(0, "Join domain %s\n", dlm->name);
+
+	status = dlm_register_domain_handlers(dlm);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_launch_thread(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_launch_recovery_thread(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_debug_init(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+	if (!dlm->dlm_worker) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	do {
+		status = dlm_try_to_join_domain(dlm);
+
+		/* If we're racing another node to the join, then we
+		 * need to back off temporarily and let them
+		 * complete. */
+#define	DLM_JOIN_TIMEOUT_MSECS	90000
+		if (status == -EAGAIN) {
+			if (signal_pending(current)) {
+				status = -ERESTARTSYS;
+				goto bail;
+			}
+
+			if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) {
+				status = -ERESTARTSYS;
+				mlog(ML_NOTICE, "Timed out joining dlm domain "
+				     "%s after %u msecs\n", dlm->name,
+				     total_backoff);
+				goto bail;
+			}
+
+			/*
+			 * <chip> After you!
+			 * <dale> No, after you!
+			 * <chip> I insist!
+			 * <dale> But you first!
+			 * ...
+			 */
+			backoff = (unsigned int)(jiffies & 0x3);
+			backoff *= DLM_DOMAIN_BACKOFF_MS;
+			total_backoff += backoff;
+			mlog(0, "backoff %d\n", backoff);
+			msleep(backoff);
+		}
+	} while (status == -EAGAIN);
+
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	wake_up(&dlm_domain_events);
+
+	if (status) {
+		dlm_unregister_domain_handlers(dlm);
+		dlm_debug_shutdown(dlm);
+		dlm_complete_thread(dlm);
+		dlm_complete_recovery_thread(dlm);
+		dlm_destroy_dlm_worker(dlm);
+	}
+
+	return status;
+}
+
+static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
+				u32 key)
+{
+	int i;
+	int ret;
+	struct dlm_ctxt *dlm = NULL;
+
+	dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
+	if (!dlm) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	dlm->name = kstrdup(domain, GFP_KERNEL);
+	if (dlm->name == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
+	if (!dlm->lockres_hash) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
+
+	dlm->master_hash = (struct hlist_head **)
+				dlm_alloc_pagevec(DLM_HASH_PAGES);
+	if (!dlm->master_hash) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
+	dlm->key = key;
+	dlm->node_num = o2nm_this_node();
+
+	ret = dlm_create_debugfs_subroot(dlm);
+	if (ret < 0)
+		goto leave;
+
+	spin_lock_init(&dlm->spinlock);
+	spin_lock_init(&dlm->master_lock);
+	spin_lock_init(&dlm->ast_lock);
+	spin_lock_init(&dlm->track_lock);
+	INIT_LIST_HEAD(&dlm->list);
+	INIT_LIST_HEAD(&dlm->dirty_list);
+	INIT_LIST_HEAD(&dlm->reco.resources);
+	INIT_LIST_HEAD(&dlm->reco.node_data);
+	INIT_LIST_HEAD(&dlm->purge_list);
+	INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+	INIT_LIST_HEAD(&dlm->tracking_list);
+	dlm->reco.state = 0;
+
+	INIT_LIST_HEAD(&dlm->pending_asts);
+	INIT_LIST_HEAD(&dlm->pending_basts);
+
+	mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
+		  dlm->recovery_map, &(dlm->recovery_map[0]));
+
+	memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
+	memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
+	memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+
+	dlm->dlm_thread_task = NULL;
+	dlm->dlm_reco_thread_task = NULL;
+	dlm->dlm_worker = NULL;
+	init_waitqueue_head(&dlm->dlm_thread_wq);
+	init_waitqueue_head(&dlm->dlm_reco_thread_wq);
+	init_waitqueue_head(&dlm->reco.event);
+	init_waitqueue_head(&dlm->ast_wq);
+	init_waitqueue_head(&dlm->migration_wq);
+	INIT_LIST_HEAD(&dlm->mle_hb_events);
+
+	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
+	init_waitqueue_head(&dlm->dlm_join_events);
+
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+
+	atomic_set(&dlm->res_tot_count, 0);
+	atomic_set(&dlm->res_cur_count, 0);
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+		atomic_set(&dlm->mle_tot_count[i], 0);
+		atomic_set(&dlm->mle_cur_count[i], 0);
+	}
+
+	spin_lock_init(&dlm->work_lock);
+	INIT_LIST_HEAD(&dlm->work_list);
+	INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work);
+
+	kref_init(&dlm->dlm_refs);
+	dlm->dlm_state = DLM_CTXT_NEW;
+
+	INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
+
+	mlog(0, "context init: refcount %u\n",
+		  atomic_read(&dlm->dlm_refs.refcount));
+
+leave:
+	if (ret < 0 && dlm) {
+		if (dlm->master_hash)
+			dlm_free_pagevec((void **)dlm->master_hash,
+					DLM_HASH_PAGES);
+
+		if (dlm->lockres_hash)
+			dlm_free_pagevec((void **)dlm->lockres_hash,
+					DLM_HASH_PAGES);
+
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+	}
+	return dlm;
+}
+
+/*
+ * Compare a requested locking protocol version against the current one.
+ *
+ * If the major numbers are different, they are incompatible.
+ * If the current minor is greater than the request, they are incompatible.
+ * If the current minor is less than or equal to the request, they are
+ * compatible, and the requester should run at the current minor version.
+ */
+static int dlm_protocol_compare(struct dlm_protocol_version *existing,
+				struct dlm_protocol_version *request)
+{
+	if (existing->pv_major != request->pv_major)
+		return 1;
+
+	if (existing->pv_minor > request->pv_minor)
+		return 1;
+
+	if (existing->pv_minor < request->pv_minor)
+		request->pv_minor = existing->pv_minor;
+
+	return 0;
+}
+
+/*
+ * dlm_register_domain: one-time setup per "domain".
+ *
+ * The filesystem passes in the requested locking version via proto.
+ * If registration was successful, proto will contain the negotiated
+ * locking protocol.
+ */
+struct dlm_ctxt * dlm_register_domain(const char *domain,
+			       u32 key,
+			       struct dlm_protocol_version *fs_proto)
+{
+	int ret;
+	struct dlm_ctxt *dlm = NULL;
+	struct dlm_ctxt *new_ctxt = NULL;
+
+	if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
+		ret = -ENAMETOOLONG;
+		mlog(ML_ERROR, "domain name length too long\n");
+		goto leave;
+	}
+
+	mlog(0, "register called for domain \"%s\"\n", domain);
+
+retry:
+	dlm = NULL;
+	if (signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	spin_lock(&dlm_domain_lock);
+
+	dlm = __dlm_lookup_domain(domain);
+	if (dlm) {
+		if (dlm->dlm_state != DLM_CTXT_JOINED) {
+			spin_unlock(&dlm_domain_lock);
+
+			mlog(0, "This ctxt is not joined yet!\n");
+			wait_event_interruptible(dlm_domain_events,
+						 dlm_wait_on_domain_helper(
+							 domain));
+			goto retry;
+		}
+
+		if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+			spin_unlock(&dlm_domain_lock);
+			mlog(ML_ERROR,
+			     "Requested locking protocol version is not "
+			     "compatible with already registered domain "
+			     "\"%s\"\n", domain);
+			ret = -EPROTO;
+			goto leave;
+		}
+
+		__dlm_get(dlm);
+		dlm->num_joins++;
+
+		spin_unlock(&dlm_domain_lock);
+
+		ret = 0;
+		goto leave;
+	}
+
+	/* doesn't exist */
+	if (!new_ctxt) {
+		spin_unlock(&dlm_domain_lock);
+
+		new_ctxt = dlm_alloc_ctxt(domain, key);
+		if (new_ctxt)
+			goto retry;
+
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	/* a little variable switch-a-roo here... */
+	dlm = new_ctxt;
+	new_ctxt = NULL;
+
+	/* add the new domain */
+	list_add_tail(&dlm->list, &dlm_domains);
+	spin_unlock(&dlm_domain_lock);
+
+	/*
+	 * Pass the locking protocol version into the join.  If the join
+	 * succeeds, it will have the negotiated protocol set.
+	 */
+	dlm->dlm_locking_proto = dlm_protocol;
+	dlm->fs_locking_proto = *fs_proto;
+
+	ret = dlm_join_domain(dlm);
+	if (ret) {
+		mlog_errno(ret);
+		dlm_put(dlm);
+		goto leave;
+	}
+
+	/* Tell the caller what locking protocol we negotiated */
+	*fs_proto = dlm->fs_locking_proto;
+
+	ret = 0;
+leave:
+	if (new_ctxt)
+		dlm_free_ctxt_mem(new_ctxt);
+
+	if (ret < 0)
+		dlm = ERR_PTR(ret);
+
+	return dlm;
+}
+EXPORT_SYMBOL_GPL(dlm_register_domain);
+
+static LIST_HEAD(dlm_join_handlers);
+
+static void dlm_unregister_net_handlers(void)
+{
+	o2net_unregister_handler_list(&dlm_join_handlers);
+}
+
+static int dlm_register_net_handlers(void)
+{
+	int status = 0;
+
+	status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_query_join_request),
+					dlm_query_join_handler,
+					NULL, NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_assert_joined),
+					dlm_assert_joined_handler,
+					NULL, NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_cancel_join),
+					dlm_cancel_join_handler,
+					NULL, NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
+					sizeof(struct dlm_query_region),
+					dlm_query_region_handler,
+					NULL, NULL, &dlm_join_handlers);
+
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+					sizeof(struct dlm_query_nodeinfo),
+					dlm_query_nodeinfo_handler,
+					NULL, NULL, &dlm_join_handlers);
+bail:
+	if (status < 0)
+		dlm_unregister_net_handlers();
+
+	return status;
+}
+
+/* Domain eviction callback handling.
+ *
+ * The file system requires notification of node death *before* the
+ * dlm completes it's recovery work, otherwise it may be able to
+ * acquire locks on resources requiring recovery. Since the dlm can
+ * evict a node from it's domain *before* heartbeat fires, a similar
+ * mechanism is required. */
+
+/* Eviction is not expected to happen often, so a per-domain lock is
+ * not necessary. Eviction callbacks are allowed to sleep for short
+ * periods of time. */
+static DECLARE_RWSEM(dlm_callback_sem);
+
+void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
+					int node_num)
+{
+	struct dlm_eviction_cb *cb;
+
+	down_read(&dlm_callback_sem);
+	list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) {
+		cb->ec_func(node_num, cb->ec_data);
+	}
+	up_read(&dlm_callback_sem);
+}
+
+void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
+			   dlm_eviction_func *f,
+			   void *data)
+{
+	INIT_LIST_HEAD(&cb->ec_item);
+	cb->ec_func = f;
+	cb->ec_data = data;
+}
+EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb);
+
+void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
+			      struct dlm_eviction_cb *cb)
+{
+	down_write(&dlm_callback_sem);
+	list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks);
+	up_write(&dlm_callback_sem);
+}
+EXPORT_SYMBOL_GPL(dlm_register_eviction_cb);
+
+void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb)
+{
+	down_write(&dlm_callback_sem);
+	list_del_init(&cb->ec_item);
+	up_write(&dlm_callback_sem);
+}
+EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb);
+
+static int __init dlm_init(void)
+{
+	int status;
+
+	status = dlm_init_mle_cache();
+	if (status) {
+		mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
+		goto error;
+	}
+
+	status = dlm_init_master_caches();
+	if (status) {
+		mlog(ML_ERROR, "Could not create o2dlm_lockres and "
+		     "o2dlm_lockname slabcaches\n");
+		goto error;
+	}
+
+	status = dlm_init_lock_cache();
+	if (status) {
+		mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
+		goto error;
+	}
+
+	status = dlm_register_net_handlers();
+	if (status) {
+		mlog(ML_ERROR, "Unable to register network handlers\n");
+		goto error;
+	}
+
+	status = dlm_create_debugfs_root();
+	if (status)
+		goto error;
+
+	return 0;
+error:
+	dlm_unregister_net_handlers();
+	dlm_destroy_lock_cache();
+	dlm_destroy_master_caches();
+	dlm_destroy_mle_cache();
+	return -1;
+}
+
+static void __exit dlm_exit (void)
+{
+	dlm_destroy_debugfs_root();
+	dlm_unregister_net_handlers();
+	dlm_destroy_lock_cache();
+	dlm_destroy_master_caches();
+	dlm_destroy_mle_cache();
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
+
+module_init(dlm_init);
+module_exit(dlm_exit);
diff --git a/kernel/fs/ocfs2/dlm/dlmdomain.h b/kernel/fs/ocfs2/dlm/dlmdomain.h
new file mode 100644
index 000000000..fd6122a38
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmdomain.h
@@ -0,0 +1,35 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdomain.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDOMAIN_H
+#define DLMDOMAIN_H
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+
+int dlm_shutting_down(struct dlm_ctxt *dlm);
+void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
+					int node_num);
+
+#endif
diff --git a/kernel/fs/ocfs2/dlm/dlmlock.c b/kernel/fs/ocfs2/dlm/dlmlock.c
new file mode 100644
index 000000000..66c2a491f
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmlock.c
@@ -0,0 +1,761 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmlock.c
+ *
+ * underlying calls for lock creation
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmconvert.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static struct kmem_cache *dlm_lock_cache;
+
+static DEFINE_SPINLOCK(dlm_cookie_lock);
+static u64 dlm_next_cookie = 1;
+
+static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
+					       struct dlm_lock_resource *res,
+					       struct dlm_lock *lock, int flags);
+static void dlm_init_lock(struct dlm_lock *newlock, int type,
+			  u8 node, u64 cookie);
+static void dlm_lock_release(struct kref *kref);
+static void dlm_lock_detach_lockres(struct dlm_lock *lock);
+
+int dlm_init_lock_cache(void)
+{
+	dlm_lock_cache = kmem_cache_create("o2dlm_lock",
+					   sizeof(struct dlm_lock),
+					   0, SLAB_HWCACHE_ALIGN, NULL);
+	if (dlm_lock_cache == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void dlm_destroy_lock_cache(void)
+{
+	if (dlm_lock_cache)
+		kmem_cache_destroy(dlm_lock_cache);
+}
+
+/* Tell us whether we can grant a new lock request.
+ * locking:
+ *   caller needs:  res->spinlock
+ *   taken:         none
+ *   held on exit:  none
+ * returns: 1 if the lock can be granted, 0 otherwise.
+ */
+static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
+				  struct dlm_lock *lock)
+{
+	struct dlm_lock *tmplock;
+
+	list_for_each_entry(tmplock, &res->granted, list) {
+		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
+			return 0;
+	}
+
+	list_for_each_entry(tmplock, &res->converting, list) {
+		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
+			return 0;
+		if (!dlm_lock_compatible(tmplock->ml.convert_type,
+					 lock->ml.type))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* performs lock creation at the lockres master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOTQUEUED
+ */
+static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock, int flags)
+{
+	int call_ast = 0, kick_thread = 0;
+	enum dlm_status status = DLM_NORMAL;
+
+	mlog(0, "type=%d\n", lock->ml.type);
+
+	spin_lock(&res->spinlock);
+	/* if called from dlm_create_lock_handler, need to
+	 * ensure it will not sleep in dlm_wait_on_lockres */
+	status = __dlm_lockres_state_to_status(res);
+	if (status != DLM_NORMAL &&
+	    lock->ml.node != dlm->node_num) {
+		/* erf.  state changed after lock was dropped. */
+		spin_unlock(&res->spinlock);
+		dlm_error(status);
+		return status;
+	}
+	__dlm_wait_on_lockres(res);
+	__dlm_lockres_reserve_ast(res);
+
+	if (dlm_can_grant_new_lock(res, lock)) {
+		mlog(0, "I can grant this lock right away\n");
+		/* got it right away */
+		lock->lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		dlm_lock_get(lock);
+		list_add_tail(&lock->list, &res->granted);
+
+		/* for the recovery lock, we can't allow the ast
+		 * to be queued since the dlmthread is already
+		 * frozen.  but the recovery lock is always locked
+		 * with LKM_NOQUEUE so we do not need the ast in
+		 * this special case */
+		if (!dlm_is_recovery_lock(res->lockname.name,
+					  res->lockname.len)) {
+			kick_thread = 1;
+			call_ast = 1;
+		} else {
+			mlog(0, "%s: returning DLM_NORMAL to "
+			     "node %u for reco lock\n", dlm->name,
+			     lock->ml.node);
+		}
+	} else {
+		/* for NOQUEUE request, unless we get the
+		 * lock right away, return DLM_NOTQUEUED */
+		if (flags & LKM_NOQUEUE) {
+			status = DLM_NOTQUEUED;
+			if (dlm_is_recovery_lock(res->lockname.name,
+						 res->lockname.len)) {
+				mlog(0, "%s: returning NOTQUEUED to "
+				     "node %u for reco lock\n", dlm->name,
+				     lock->ml.node);
+			}
+		} else {
+			status = DLM_NORMAL;
+			dlm_lock_get(lock);
+			list_add_tail(&lock->list, &res->blocked);
+			kick_thread = 1;
+		}
+	}
+
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	dlm_lockres_calc_usage(dlm, res);
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	return status;
+}
+
+void dlm_revert_pending_lock(struct dlm_lock_resource *res,
+			     struct dlm_lock *lock)
+{
+	/* remove from local queue if it failed */
+	list_del_init(&lock->list);
+	lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
+}
+
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_DENIED, DLM_RECOVERING, or net status
+ */
+static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock, int flags)
+{
+	enum dlm_status status = DLM_DENIED;
+	int lockres_changed = 1;
+
+	mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
+	     lock->ml.type, res->lockname.len,
+	     res->lockname.name, flags);
+
+	/*
+	 * Wait if resource is getting recovered, remastered, etc.
+	 * If the resource was remastered and new owner is self, then exit.
+	 */
+	spin_lock(&res->spinlock);
+	__dlm_wait_on_lockres(res);
+	if (res->owner == dlm->node_num) {
+		spin_unlock(&res->spinlock);
+		return DLM_RECOVERING;
+	}
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	/* add lock to local (secondary) queue */
+	dlm_lock_get(lock);
+	list_add_tail(&lock->list, &res->blocked);
+	lock->lock_pending = 1;
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_lock_request(dlm, res, lock, flags);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	lock->lock_pending = 0;
+	if (status != DLM_NORMAL) {
+		if (status == DLM_RECOVERING &&
+		    dlm_is_recovery_lock(res->lockname.name,
+					 res->lockname.len)) {
+			/* recovery lock was mastered by dead node.
+			 * we need to have calc_usage shoot down this
+			 * lockres and completely remaster it. */
+			mlog(0, "%s: recovery lock was owned by "
+			     "dead node %u, remaster it now.\n",
+			     dlm->name, res->owner);
+		} else if (status != DLM_NOTQUEUED) {
+			/*
+			 * DO NOT call calc_usage, as this would unhash
+			 * the remote lockres before we ever get to use
+			 * it.  treat as if we never made any change to
+			 * the lockres.
+			 */
+			lockres_changed = 0;
+			dlm_error(status);
+		}
+		dlm_revert_pending_lock(res, lock);
+		dlm_lock_put(lock);
+	} else if (dlm_is_recovery_lock(res->lockname.name,
+					res->lockname.len)) {
+		/* special case for the $RECOVERY lock.
+		 * there will never be an AST delivered to put
+		 * this lock on the proper secondary queue
+		 * (granted), so do it manually. */
+		mlog(0, "%s: $RECOVERY lock for this node (%u) is "
+		     "mastered by %u; got lock, manually granting (no ast)\n",
+		     dlm->name, dlm->node_num, res->owner);
+		list_move_tail(&lock->list, &res->granted);
+	}
+	spin_unlock(&res->spinlock);
+
+	if (lockres_changed)
+		dlm_lockres_calc_usage(dlm, res);
+
+	wake_up(&res->wq);
+	return status;
+}
+
+
+/* for remote lock creation.
+ * locking:
+ *   caller needs:  none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NOLOCKMGR, or net status
+ */
+static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
+					       struct dlm_lock_resource *res,
+					       struct dlm_lock *lock, int flags)
+{
+	struct dlm_create_lock create;
+	int tmpret, status = 0;
+	enum dlm_status ret;
+
+	memset(&create, 0, sizeof(create));
+	create.node_idx = dlm->node_num;
+	create.requested_type = lock->ml.type;
+	create.cookie = lock->ml.cookie;
+	create.namelen = res->lockname.len;
+	create.flags = cpu_to_be32(flags);
+	memcpy(create.name, res->lockname.name, create.namelen);
+
+	tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
+				    sizeof(create), res->owner, &status);
+	if (tmpret >= 0) {
+		ret = status;
+		if (ret == DLM_REJECTED) {
+			mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
+			     "owned by node %u. That node is coming back up "
+			     "currently.\n", dlm->name, create.namelen,
+			     create.name, res->owner);
+			dlm_print_one_lock_resource(res);
+			BUG();
+		}
+	} else {
+		mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
+		     "node %u\n", dlm->name, create.namelen, create.name,
+		     tmpret, res->owner);
+		if (dlm_is_host_down(tmpret))
+			ret = DLM_RECOVERING;
+		else
+			ret = dlm_err_to_dlm_status(tmpret);
+	}
+
+	return ret;
+}
+
+void dlm_lock_get(struct dlm_lock *lock)
+{
+	kref_get(&lock->lock_refs);
+}
+
+void dlm_lock_put(struct dlm_lock *lock)
+{
+	kref_put(&lock->lock_refs, dlm_lock_release);
+}
+
+static void dlm_lock_release(struct kref *kref)
+{
+	struct dlm_lock *lock;
+
+	lock = container_of(kref, struct dlm_lock, lock_refs);
+
+	BUG_ON(!list_empty(&lock->list));
+	BUG_ON(!list_empty(&lock->ast_list));
+	BUG_ON(!list_empty(&lock->bast_list));
+	BUG_ON(lock->ast_pending);
+	BUG_ON(lock->bast_pending);
+
+	dlm_lock_detach_lockres(lock);
+
+	if (lock->lksb_kernel_allocated) {
+		mlog(0, "freeing kernel-allocated lksb\n");
+		kfree(lock->lksb);
+	}
+	kmem_cache_free(dlm_lock_cache, lock);
+}
+
+/* associate a lock with it's lockres, getting a ref on the lockres */
+void dlm_lock_attach_lockres(struct dlm_lock *lock,
+			     struct dlm_lock_resource *res)
+{
+	dlm_lockres_get(res);
+	lock->lockres = res;
+}
+
+/* drop ref on lockres, if there is still one associated with lock */
+static void dlm_lock_detach_lockres(struct dlm_lock *lock)
+{
+	struct dlm_lock_resource *res;
+
+	res = lock->lockres;
+	if (res) {
+		lock->lockres = NULL;
+		mlog(0, "removing lock's lockres reference\n");
+		dlm_lockres_put(res);
+	}
+}
+
+static void dlm_init_lock(struct dlm_lock *newlock, int type,
+			  u8 node, u64 cookie)
+{
+	INIT_LIST_HEAD(&newlock->list);
+	INIT_LIST_HEAD(&newlock->ast_list);
+	INIT_LIST_HEAD(&newlock->bast_list);
+	spin_lock_init(&newlock->spinlock);
+	newlock->ml.type = type;
+	newlock->ml.convert_type = LKM_IVMODE;
+	newlock->ml.highest_blocked = LKM_IVMODE;
+	newlock->ml.node = node;
+	newlock->ml.pad1 = 0;
+	newlock->ml.list = 0;
+	newlock->ml.flags = 0;
+	newlock->ast = NULL;
+	newlock->bast = NULL;
+	newlock->astdata = NULL;
+	newlock->ml.cookie = cpu_to_be64(cookie);
+	newlock->ast_pending = 0;
+	newlock->bast_pending = 0;
+	newlock->convert_pending = 0;
+	newlock->lock_pending = 0;
+	newlock->unlock_pending = 0;
+	newlock->cancel_pending = 0;
+	newlock->lksb_kernel_allocated = 0;
+
+	kref_init(&newlock->lock_refs);
+}
+
+struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
+			       struct dlm_lockstatus *lksb)
+{
+	struct dlm_lock *lock;
+	int kernel_allocated = 0;
+
+	lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
+	if (!lock)
+		return NULL;
+
+	if (!lksb) {
+		/* zero memory only if kernel-allocated */
+		lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
+		if (!lksb) {
+			kmem_cache_free(dlm_lock_cache, lock);
+			return NULL;
+		}
+		kernel_allocated = 1;
+	}
+
+	dlm_init_lock(lock, type, node, cookie);
+	if (kernel_allocated)
+		lock->lksb_kernel_allocated = 1;
+	lock->lksb = lksb;
+	lksb->lockid = lock;
+	return lock;
+}
+
+/* handler for lock creation net message
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
+ */
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *newlock = NULL;
+	struct dlm_lockstatus *lksb = NULL;
+	enum dlm_status status = DLM_NORMAL;
+	char *name;
+	unsigned int namelen;
+
+	BUG_ON(!dlm);
+
+	if (!dlm_grab(dlm))
+		return DLM_REJECTED;
+
+	name = create->name;
+	namelen = create->namelen;
+	status = DLM_REJECTED;
+	if (!dlm_domain_fully_joined(dlm)) {
+		mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
+		     "sending a create_lock message for lock %.*s!\n",
+		     dlm->name, create->node_idx, namelen, name);
+		dlm_error(status);
+		goto leave;
+	}
+
+	status = DLM_IVBUFLEN;
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	status = DLM_SYSERR;
+	newlock = dlm_new_lock(create->requested_type,
+			       create->node_idx,
+			       be64_to_cpu(create->cookie), NULL);
+	if (!newlock) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	lksb = newlock->lksb;
+
+	if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
+		lksb->flags |= DLM_LKSB_GET_LVB;
+		mlog(0, "set DLM_LKSB_GET_LVB flag\n");
+	}
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lockres(dlm, name, namelen);
+	if (!res) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	spin_unlock(&res->spinlock);
+
+	if (status != DLM_NORMAL) {
+		mlog(0, "lockres recovering/migrating/in-progress\n");
+		goto leave;
+	}
+
+	dlm_lock_attach_lockres(newlock, res);
+
+	status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
+leave:
+	if (status != DLM_NORMAL)
+		if (newlock)
+			dlm_lock_put(newlock);
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
+
+
+/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
+static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
+{
+	u64 tmpnode = node_num;
+
+	/* shift single byte of node num into top 8 bits */
+	tmpnode <<= 56;
+
+	spin_lock(&dlm_cookie_lock);
+	*cookie = (dlm_next_cookie | tmpnode);
+	if (++dlm_next_cookie & 0xff00000000000000ull) {
+		mlog(0, "This node's cookie will now wrap!\n");
+		dlm_next_cookie = 1;
+	}
+	spin_unlock(&dlm_cookie_lock);
+}
+
+enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
+			struct dlm_lockstatus *lksb, int flags,
+			const char *name, int namelen, dlm_astlockfunc_t *ast,
+			void *data, dlm_bastlockfunc_t *bast)
+{
+	enum dlm_status status;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	int convert = 0, recovery = 0;
+
+	/* yes this function is a mess.
+	 * TODO: clean this up.  lots of common code in the
+	 *       lock and convert paths, especially in the retry blocks */
+	if (!lksb) {
+		dlm_error(DLM_BADARGS);
+		return DLM_BADARGS;
+	}
+
+	status = DLM_BADPARAM;
+	if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
+		dlm_error(status);
+		goto error;
+	}
+
+	if (flags & ~LKM_VALID_FLAGS) {
+		dlm_error(status);
+		goto error;
+	}
+
+	convert = (flags & LKM_CONVERT);
+	recovery = (flags & LKM_RECOVERY);
+
+	if (recovery &&
+	    (!dlm_is_recovery_lock(name, namelen) || convert) ) {
+		dlm_error(status);
+		goto error;
+	}
+	if (convert && (flags & LKM_LOCAL)) {
+		mlog(ML_ERROR, "strange LOCAL convert request!\n");
+		goto error;
+	}
+
+	if (convert) {
+		/* CONVERT request */
+
+		/* if converting, must pass in a valid dlm_lock */
+		lock = lksb->lockid;
+		if (!lock) {
+			mlog(ML_ERROR, "NULL lock pointer in convert "
+			     "request\n");
+			goto error;
+		}
+
+		res = lock->lockres;
+		if (!res) {
+			mlog(ML_ERROR, "NULL lockres pointer in convert "
+			     "request\n");
+			goto error;
+		}
+		dlm_lockres_get(res);
+
+		/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
+	 	 * static after the original lock call.  convert requests will
+		 * ensure that everything is the same, or return DLM_BADARGS.
+	 	 * this means that DLM_DENIED_NOASTS will never be returned.
+	 	 */
+		if (lock->lksb != lksb || lock->ast != ast ||
+		    lock->bast != bast || lock->astdata != data) {
+			status = DLM_BADARGS;
+			mlog(ML_ERROR, "new args:  lksb=%p, ast=%p, bast=%p, "
+			     "astdata=%p\n", lksb, ast, bast, data);
+			mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
+			     "astdata=%p\n", lock->lksb, lock->ast,
+			     lock->bast, lock->astdata);
+			goto error;
+		}
+retry_convert:
+		dlm_wait_for_recovery(dlm);
+
+		if (res->owner == dlm->node_num)
+			status = dlmconvert_master(dlm, res, lock, flags, mode);
+		else
+			status = dlmconvert_remote(dlm, res, lock, flags, mode);
+		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
+		    status == DLM_FORWARD) {
+			/* for now, see how this works without sleeping
+			 * and just retry right away.  I suspect the reco
+			 * or migration will complete fast enough that
+			 * no waiting will be necessary */
+			mlog(0, "retrying convert with migration/recovery/"
+			     "in-progress\n");
+			msleep(100);
+			goto retry_convert;
+		}
+	} else {
+		u64 tmpcookie;
+
+		/* LOCK request */
+		status = DLM_BADARGS;
+		if (!name) {
+			dlm_error(status);
+			goto error;
+		}
+
+		status = DLM_IVBUFLEN;
+		if (namelen > DLM_LOCKID_NAME_MAX || namelen < 1) {
+			dlm_error(status);
+			goto error;
+		}
+
+		dlm_get_next_cookie(dlm->node_num, &tmpcookie);
+		lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
+		if (!lock) {
+			dlm_error(status);
+			goto error;
+		}
+
+		if (!recovery)
+			dlm_wait_for_recovery(dlm);
+
+		/* find or create the lock resource */
+		res = dlm_get_lock_resource(dlm, name, namelen, flags);
+		if (!res) {
+			status = DLM_IVLOCKID;
+			dlm_error(status);
+			goto error;
+		}
+
+		mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
+		mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
+
+		dlm_lock_attach_lockres(lock, res);
+		lock->ast = ast;
+		lock->bast = bast;
+		lock->astdata = data;
+
+retry_lock:
+		if (flags & LKM_VALBLK) {
+			mlog(0, "LKM_VALBLK passed by caller\n");
+
+			/* LVB requests for non PR, PW or EX locks are
+			 * ignored. */
+			if (mode < LKM_PRMODE)
+				flags &= ~LKM_VALBLK;
+			else {
+				flags |= LKM_GET_LVB;
+				lock->lksb->flags |= DLM_LKSB_GET_LVB;
+			}
+		}
+
+		if (res->owner == dlm->node_num)
+			status = dlmlock_master(dlm, res, lock, flags);
+		else
+			status = dlmlock_remote(dlm, res, lock, flags);
+
+		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
+		    status == DLM_FORWARD) {
+			msleep(100);
+			if (recovery) {
+				if (status != DLM_RECOVERING)
+					goto retry_lock;
+				/* wait to see the node go down, then
+				 * drop down and allow the lockres to
+				 * get cleaned up.  need to remaster. */
+				dlm_wait_for_node_death(dlm, res->owner,
+						DLM_NODE_DEATH_WAIT_MAX);
+			} else {
+				dlm_wait_for_recovery(dlm);
+				goto retry_lock;
+			}
+		}
+
+		/* Inflight taken in dlm_get_lock_resource() is dropped here */
+		spin_lock(&res->spinlock);
+		dlm_lockres_drop_inflight_ref(dlm, res);
+		spin_unlock(&res->spinlock);
+
+		dlm_lockres_calc_usage(dlm, res);
+		dlm_kick_thread(dlm, res);
+
+		if (status != DLM_NORMAL) {
+			lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
+			if (status != DLM_NOTQUEUED)
+				dlm_error(status);
+			goto error;
+		}
+	}
+
+error:
+	if (status != DLM_NORMAL) {
+		if (lock && !convert)
+			dlm_lock_put(lock);
+		// this is kind of unnecessary
+		lksb->status = status;
+	}
+
+	/* put lockres ref from the convert path
+	 * or from dlm_get_lock_resource */
+	if (res)
+		dlm_lockres_put(res);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(dlmlock);
diff --git a/kernel/fs/ocfs2/dlm/dlmmaster.c b/kernel/fs/ocfs2/dlm/dlmmaster.c
new file mode 100644
index 000000000..fdf4b41d0
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmmaster.c
@@ -0,0 +1,3487 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+#include "dlmdebug.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
+#include "cluster/masklog.h"
+
+static void dlm_mle_node_down(struct dlm_ctxt *dlm,
+			      struct dlm_master_list_entry *mle,
+			      struct o2nm_node *node,
+			      int idx);
+static void dlm_mle_node_up(struct dlm_ctxt *dlm,
+			    struct dlm_master_list_entry *mle,
+			    struct o2nm_node *node,
+			    int idx);
+
+static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
+static int dlm_do_assert_master(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res,
+				void *nodemap, u32 flags);
+static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
+
+static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle,
+				const char *name,
+				unsigned int namelen)
+{
+	if (dlm != mle->dlm)
+		return 0;
+
+	if (namelen != mle->mnamelen ||
+	    memcmp(name, mle->mname, namelen) != 0)
+		return 0;
+
+	return 1;
+}
+
+static struct kmem_cache *dlm_lockres_cache;
+static struct kmem_cache *dlm_lockname_cache;
+static struct kmem_cache *dlm_mle_cache;
+
+static void dlm_mle_release(struct kref *kref);
+static void dlm_init_mle(struct dlm_master_list_entry *mle,
+			enum dlm_mle_type type,
+			struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			const char *name,
+			unsigned int namelen);
+static void dlm_put_mle(struct dlm_master_list_entry *mle);
+static void __dlm_put_mle(struct dlm_master_list_entry *mle);
+static int dlm_find_mle(struct dlm_ctxt *dlm,
+			struct dlm_master_list_entry **mle,
+			char *name, unsigned int namelen);
+
+static int dlm_do_master_request(struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle, int to);
+
+
+static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_master_list_entry *mle,
+				     int *blocked);
+static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res,
+				    struct dlm_master_list_entry *mle,
+				    int blocked);
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master);
+
+static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res);
+static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res);
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res,
+				       u8 target);
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res);
+
+
+int dlm_is_host_down(int errno)
+{
+	switch (errno) {
+		case -EBADF:
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+		case -EHOSTDOWN:
+		case -EHOSTUNREACH:
+		case -ETIMEDOUT:
+		case -ECONNABORTED:
+		case -ENETDOWN:
+		case -ENETUNREACH:
+		case -ENETRESET:
+		case -ESHUTDOWN:
+		case -ENOPROTOOPT:
+		case -EINVAL:   /* if returned from our tcp code,
+				   this means there is no socket */
+			return 1;
+	}
+	return 0;
+}
+
+
+/*
+ * MASTER LIST FUNCTIONS
+ */
+
+
+/*
+ * regarding master list entries and heartbeat callbacks:
+ *
+ * in order to avoid sleeping and allocation that occurs in
+ * heartbeat, master list entries are simply attached to the
+ * dlm's established heartbeat callbacks.  the mle is attached
+ * when it is created, and since the dlm->spinlock is held at
+ * that time, any heartbeat event will be properly discovered
+ * by the mle.  the mle needs to be detached from the
+ * dlm->mle_hb_events list as soon as heartbeat events are no
+ * longer useful to the mle, and before the mle is freed.
+ *
+ * as a general rule, heartbeat events are no longer needed by
+ * the mle once an "answer" regarding the lock master has been
+ * received.
+ */
+static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
+					      struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
+}
+
+
+static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
+					      struct dlm_master_list_entry *mle)
+{
+	if (!list_empty(&mle->hb_events))
+		list_del_init(&mle->hb_events);
+}
+
+
+static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
+					    struct dlm_master_list_entry *mle)
+{
+	spin_lock(&dlm->spinlock);
+	__dlm_mle_detach_hb_events(dlm, mle);
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+	mle->inuse++;
+	kref_get(&mle->mle_refs);
+}
+
+static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	mle->inuse--;
+	__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+}
+
+/* remove from list and free */
+static void __dlm_put_mle(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+	if (!atomic_read(&mle->mle_refs.refcount)) {
+		/* this may or may not crash, but who cares.
+		 * it's a BUG. */
+		mlog(ML_ERROR, "bad mle: %p\n", mle);
+		dlm_print_one_mle(mle);
+		BUG();
+	} else
+		kref_put(&mle->mle_refs, dlm_mle_release);
+}
+
+
+/* must not have any spinlocks coming in */
+static void dlm_put_mle(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+}
+
+static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
+{
+	kref_get(&mle->mle_refs);
+}
+
+static void dlm_init_mle(struct dlm_master_list_entry *mle,
+			enum dlm_mle_type type,
+			struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			const char *name,
+			unsigned int namelen)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	mle->dlm = dlm;
+	mle->type = type;
+	INIT_HLIST_NODE(&mle->master_hash_node);
+	INIT_LIST_HEAD(&mle->hb_events);
+	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+	spin_lock_init(&mle->spinlock);
+	init_waitqueue_head(&mle->wq);
+	atomic_set(&mle->woken, 0);
+	kref_init(&mle->mle_refs);
+	memset(mle->response_map, 0, sizeof(mle->response_map));
+	mle->master = O2NM_MAX_NODES;
+	mle->new_master = O2NM_MAX_NODES;
+	mle->inuse = 0;
+
+	BUG_ON(mle->type != DLM_MLE_BLOCK &&
+	       mle->type != DLM_MLE_MASTER &&
+	       mle->type != DLM_MLE_MIGRATION);
+
+	if (mle->type == DLM_MLE_MASTER) {
+		BUG_ON(!res);
+		mle->mleres = res;
+		memcpy(mle->mname, res->lockname.name, res->lockname.len);
+		mle->mnamelen = res->lockname.len;
+		mle->mnamehash = res->lockname.hash;
+	} else {
+		BUG_ON(!name);
+		mle->mleres = NULL;
+		memcpy(mle->mname, name, namelen);
+		mle->mnamelen = namelen;
+		mle->mnamehash = dlm_lockid_hash(name, namelen);
+	}
+
+	atomic_inc(&dlm->mle_tot_count[mle->type]);
+	atomic_inc(&dlm->mle_cur_count[mle->type]);
+
+	/* copy off the node_map and register hb callbacks on our copy */
+	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
+	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+	clear_bit(dlm->node_num, mle->vote_map);
+	clear_bit(dlm->node_num, mle->node_map);
+
+	/* attach the mle to the domain node up/down events */
+	__dlm_mle_attach_hb_events(dlm, mle);
+}
+
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	if (!hlist_unhashed(&mle->master_hash_node))
+		hlist_del_init(&mle->master_hash_node);
+}
+
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	struct hlist_head *bucket;
+
+	assert_spin_locked(&dlm->master_lock);
+
+	bucket = dlm_master_hash(dlm, mle->mnamehash);
+	hlist_add_head(&mle->master_hash_node, bucket);
+}
+
+/* returns 1 if found, 0 if not */
+static int dlm_find_mle(struct dlm_ctxt *dlm,
+			struct dlm_master_list_entry **mle,
+			char *name, unsigned int namelen)
+{
+	struct dlm_master_list_entry *tmpmle;
+	struct hlist_head *bucket;
+	unsigned int hash;
+
+	assert_spin_locked(&dlm->master_lock);
+
+	hash = dlm_lockid_hash(name, namelen);
+	bucket = dlm_master_hash(dlm, hash);
+	hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
+		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
+			continue;
+		dlm_get_mle(tmpmle);
+		*mle = tmpmle;
+		return 1;
+	}
+	return 0;
+}
+
+void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
+{
+	struct dlm_master_list_entry *mle;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
+		if (node_up)
+			dlm_mle_node_up(dlm, mle, NULL, idx);
+		else
+			dlm_mle_node_down(dlm, mle, NULL, idx);
+	}
+}
+
+static void dlm_mle_node_down(struct dlm_ctxt *dlm,
+			      struct dlm_master_list_entry *mle,
+			      struct o2nm_node *node, int idx)
+{
+	spin_lock(&mle->spinlock);
+
+	if (!test_bit(idx, mle->node_map))
+		mlog(0, "node %u already removed from nodemap!\n", idx);
+	else
+		clear_bit(idx, mle->node_map);
+
+	spin_unlock(&mle->spinlock);
+}
+
+static void dlm_mle_node_up(struct dlm_ctxt *dlm,
+			    struct dlm_master_list_entry *mle,
+			    struct o2nm_node *node, int idx)
+{
+	spin_lock(&mle->spinlock);
+
+	if (test_bit(idx, mle->node_map))
+		mlog(0, "node %u already in node map!\n", idx);
+	else
+		set_bit(idx, mle->node_map);
+
+	spin_unlock(&mle->spinlock);
+}
+
+
+int dlm_init_mle_cache(void)
+{
+	dlm_mle_cache = kmem_cache_create("o2dlm_mle",
+					  sizeof(struct dlm_master_list_entry),
+					  0, SLAB_HWCACHE_ALIGN,
+					  NULL);
+	if (dlm_mle_cache == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void dlm_destroy_mle_cache(void)
+{
+	if (dlm_mle_cache)
+		kmem_cache_destroy(dlm_mle_cache);
+}
+
+static void dlm_mle_release(struct kref *kref)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_ctxt *dlm;
+
+	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
+	dlm = mle->dlm;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
+	     mle->type);
+
+	/* remove from list if not already */
+	__dlm_unlink_mle(dlm, mle);
+
+	/* detach the mle from the domain node up/down events */
+	__dlm_mle_detach_hb_events(dlm, mle);
+
+	atomic_dec(&dlm->mle_cur_count[mle->type]);
+
+	/* NOTE: kfree under spinlock here.
+	 * if this is bad, we can move this to a freelist. */
+	kmem_cache_free(dlm_mle_cache, mle);
+}
+
+
+/*
+ * LOCK RESOURCE FUNCTIONS
+ */
+
+int dlm_init_master_caches(void)
+{
+	dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
+					      sizeof(struct dlm_lock_resource),
+					      0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!dlm_lockres_cache)
+		goto bail;
+
+	dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
+					       DLM_LOCKID_NAME_MAX, 0,
+					       SLAB_HWCACHE_ALIGN, NULL);
+	if (!dlm_lockname_cache)
+		goto bail;
+
+	return 0;
+bail:
+	dlm_destroy_master_caches();
+	return -ENOMEM;
+}
+
+void dlm_destroy_master_caches(void)
+{
+	if (dlm_lockname_cache) {
+		kmem_cache_destroy(dlm_lockname_cache);
+		dlm_lockname_cache = NULL;
+	}
+
+	if (dlm_lockres_cache) {
+		kmem_cache_destroy(dlm_lockres_cache);
+		dlm_lockres_cache = NULL;
+	}
+}
+
+static void dlm_lockres_release(struct kref *kref)
+{
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm;
+
+	res = container_of(kref, struct dlm_lock_resource, refs);
+	dlm = res->dlm;
+
+	/* This should not happen -- all lockres' have a name
+	 * associated with them at init time. */
+	BUG_ON(!res->lockname.name);
+
+	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
+	     res->lockname.name);
+
+	spin_lock(&dlm->track_lock);
+	if (!list_empty(&res->tracking))
+		list_del_init(&res->tracking);
+	else {
+		mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+		     res->lockname.len, res->lockname.name);
+		dlm_print_one_lock_resource(res);
+	}
+	spin_unlock(&dlm->track_lock);
+
+	atomic_dec(&dlm->res_cur_count);
+
+	if (!hlist_unhashed(&res->hash_node) ||
+	    !list_empty(&res->granted) ||
+	    !list_empty(&res->converting) ||
+	    !list_empty(&res->blocked) ||
+	    !list_empty(&res->dirty) ||
+	    !list_empty(&res->recovering) ||
+	    !list_empty(&res->purge)) {
+		mlog(ML_ERROR,
+		     "Going to BUG for resource %.*s."
+		     "  We're on a list! [%c%c%c%c%c%c%c]\n",
+		     res->lockname.len, res->lockname.name,
+		     !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
+		     !list_empty(&res->granted) ? 'G' : ' ',
+		     !list_empty(&res->converting) ? 'C' : ' ',
+		     !list_empty(&res->blocked) ? 'B' : ' ',
+		     !list_empty(&res->dirty) ? 'D' : ' ',
+		     !list_empty(&res->recovering) ? 'R' : ' ',
+		     !list_empty(&res->purge) ? 'P' : ' ');
+
+		dlm_print_one_lock_resource(res);
+	}
+
+	/* By the time we're ready to blow this guy away, we shouldn't
+	 * be on any lists. */
+	BUG_ON(!hlist_unhashed(&res->hash_node));
+	BUG_ON(!list_empty(&res->granted));
+	BUG_ON(!list_empty(&res->converting));
+	BUG_ON(!list_empty(&res->blocked));
+	BUG_ON(!list_empty(&res->dirty));
+	BUG_ON(!list_empty(&res->recovering));
+	BUG_ON(!list_empty(&res->purge));
+
+	kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
+
+	kmem_cache_free(dlm_lockres_cache, res);
+}
+
+void dlm_lockres_put(struct dlm_lock_resource *res)
+{
+	kref_put(&res->refs, dlm_lockres_release);
+}
+
+static void dlm_init_lockres(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res,
+			     const char *name, unsigned int namelen)
+{
+	char *qname;
+
+	/* If we memset here, we lose our reference to the kmalloc'd
+	 * res->lockname.name, so be sure to init every field
+	 * correctly! */
+
+	qname = (char *) res->lockname.name;
+	memcpy(qname, name, namelen);
+
+	res->lockname.len = namelen;
+	res->lockname.hash = dlm_lockid_hash(name, namelen);
+
+	init_waitqueue_head(&res->wq);
+	spin_lock_init(&res->spinlock);
+	INIT_HLIST_NODE(&res->hash_node);
+	INIT_LIST_HEAD(&res->granted);
+	INIT_LIST_HEAD(&res->converting);
+	INIT_LIST_HEAD(&res->blocked);
+	INIT_LIST_HEAD(&res->dirty);
+	INIT_LIST_HEAD(&res->recovering);
+	INIT_LIST_HEAD(&res->purge);
+	INIT_LIST_HEAD(&res->tracking);
+	atomic_set(&res->asts_reserved, 0);
+	res->migration_pending = 0;
+	res->inflight_locks = 0;
+	res->inflight_assert_workers = 0;
+
+	res->dlm = dlm;
+
+	kref_init(&res->refs);
+
+	atomic_inc(&dlm->res_tot_count);
+	atomic_inc(&dlm->res_cur_count);
+
+	/* just for consistency */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+	spin_unlock(&res->spinlock);
+
+	res->state = DLM_LOCK_RES_IN_PROGRESS;
+
+	res->last_used = 0;
+
+	spin_lock(&dlm->spinlock);
+	list_add_tail(&res->tracking, &dlm->tracking_list);
+	spin_unlock(&dlm->spinlock);
+
+	memset(res->lvb, 0, DLM_LVB_LEN);
+	memset(res->refmap, 0, sizeof(res->refmap));
+}
+
+struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
+				   const char *name,
+				   unsigned int namelen)
+{
+	struct dlm_lock_resource *res = NULL;
+
+	res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
+	if (!res)
+		goto error;
+
+	res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+	if (!res->lockname.name)
+		goto error;
+
+	dlm_init_lockres(dlm, res, name, namelen);
+	return res;
+
+error:
+	if (res)
+		kmem_cache_free(dlm_lockres_cache, res);
+	return NULL;
+}
+
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res, int bit)
+{
+	assert_spin_locked(&res->spinlock);
+
+	mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
+	     res->lockname.name, bit, __builtin_return_address(0));
+
+	set_bit(bit, res->refmap);
+}
+
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res, int bit)
+{
+	assert_spin_locked(&res->spinlock);
+
+	mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
+	     res->lockname.name, bit, __builtin_return_address(0));
+
+	clear_bit(bit, res->refmap);
+}
+
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+				   struct dlm_lock_resource *res)
+{
+	res->inflight_locks++;
+
+	mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
+	     res->lockname.len, res->lockname.name, res->inflight_locks,
+	     __builtin_return_address(0));
+}
+
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+				   struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	__dlm_lockres_grab_inflight_ref(dlm, res);
+}
+
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+				   struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+
+	BUG_ON(res->inflight_locks == 0);
+
+	res->inflight_locks--;
+
+	mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
+	     res->lockname.len, res->lockname.name, res->inflight_locks,
+	     __builtin_return_address(0));
+
+	wake_up(&res->wq);
+}
+
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+		struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	res->inflight_assert_workers++;
+	mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+			dlm->name, res->lockname.len, res->lockname.name,
+			res->inflight_assert_workers);
+}
+
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+		struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	BUG_ON(res->inflight_assert_workers == 0);
+	res->inflight_assert_workers--;
+	mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+			dlm->name, res->lockname.len, res->lockname.name,
+			res->inflight_assert_workers);
+}
+
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+		struct dlm_lock_resource *res)
+{
+	spin_lock(&res->spinlock);
+	__dlm_lockres_drop_inflight_worker(dlm, res);
+	spin_unlock(&res->spinlock);
+}
+
+/*
+ * lookup a lock resource by name.
+ * may already exist in the hashtable.
+ * lockid is null terminated
+ *
+ * if not, allocate enough for the lockres and for
+ * the temporary structure used in doing the mastering.
+ *
+ * also, do a lookup in the dlm->master_list to see
+ * if another node has begun mastering the same lock.
+ * if so, there should be a block entry in there
+ * for this name, and we should *not* attempt to master
+ * the lock here.   need to wait around for that node
+ * to assert_master (or die).
+ *
+ */
+struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
+					  const char *lockid,
+					  int namelen,
+					  int flags)
+{
+	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_master_list_entry *alloc_mle = NULL;
+	int blocked = 0;
+	int ret, nodenum;
+	struct dlm_node_iter iter;
+	unsigned int hash;
+	int tries = 0;
+	int bit, wait_on_recovery = 0;
+
+	BUG_ON(!lockid);
+
+	hash = dlm_lockid_hash(lockid, namelen);
+
+	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
+
+lookup:
+	spin_lock(&dlm->spinlock);
+	tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
+	if (tmpres) {
+		spin_unlock(&dlm->spinlock);
+		spin_lock(&tmpres->spinlock);
+
+		/*
+		 * Right after dlm spinlock was released, dlm_thread could have
+		 * purged the lockres. Check if lockres got unhashed. If so
+		 * start over.
+		 */
+		if (hlist_unhashed(&tmpres->hash_node)) {
+			spin_unlock(&tmpres->spinlock);
+			dlm_lockres_put(tmpres);
+			tmpres = NULL;
+			goto lookup;
+		}
+
+		/* Wait on the thread that is mastering the resource */
+		if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			__dlm_wait_on_lockres(tmpres);
+			BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+			spin_unlock(&tmpres->spinlock);
+			dlm_lockres_put(tmpres);
+			tmpres = NULL;
+			goto lookup;
+		}
+
+		/* Wait on the resource purge to complete before continuing */
+		if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
+			BUG_ON(tmpres->owner == dlm->node_num);
+			__dlm_wait_on_lockres_flags(tmpres,
+						    DLM_LOCK_RES_DROPPING_REF);
+			spin_unlock(&tmpres->spinlock);
+			dlm_lockres_put(tmpres);
+			tmpres = NULL;
+			goto lookup;
+		}
+
+		/* Grab inflight ref to pin the resource */
+		dlm_lockres_grab_inflight_ref(dlm, tmpres);
+
+		spin_unlock(&tmpres->spinlock);
+		if (res)
+			dlm_lockres_put(res);
+		res = tmpres;
+		goto leave;
+	}
+
+	if (!res) {
+		spin_unlock(&dlm->spinlock);
+		mlog(0, "allocating a new resource\n");
+		/* nothing found and we need to allocate one. */
+		alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+		if (!alloc_mle)
+			goto leave;
+		res = dlm_new_lockres(dlm, lockid, namelen);
+		if (!res)
+			goto leave;
+		goto lookup;
+	}
+
+	mlog(0, "no lockres found, allocated our own: %p\n", res);
+
+	if (flags & LKM_LOCAL) {
+		/* caller knows it's safe to assume it's not mastered elsewhere
+		 * DONE!  return right away */
+		spin_lock(&res->spinlock);
+		dlm_change_lockres_owner(dlm, res, dlm->node_num);
+		__dlm_insert_lockres(dlm, res);
+		dlm_lockres_grab_inflight_ref(dlm, res);
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+		/* lockres still marked IN_PROGRESS */
+		goto wake_waiters;
+	}
+
+	/* check master list to see if another node has started mastering it */
+	spin_lock(&dlm->master_lock);
+
+	/* if we found a block, wait for lock to be mastered by another node */
+	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
+	if (blocked) {
+		int mig;
+		if (mle->type == DLM_MLE_MASTER) {
+			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
+			BUG();
+		}
+		mig = (mle->type == DLM_MLE_MIGRATION);
+		/* if there is a migration in progress, let the migration
+		 * finish before continuing.  we can wait for the absence
+		 * of the MIGRATION mle: either the migrate finished or
+		 * one of the nodes died and the mle was cleaned up.
+		 * if there is a BLOCK here, but it already has a master
+		 * set, we are too late.  the master does not have a ref
+		 * for us in the refmap.  detach the mle and drop it.
+		 * either way, go back to the top and start over. */
+		if (mig || mle->master != O2NM_MAX_NODES) {
+			BUG_ON(mig && mle->master == dlm->node_num);
+			/* we arrived too late.  the master does not
+			 * have a ref for us. retry. */
+			mlog(0, "%s:%.*s: late on %s\n",
+			     dlm->name, namelen, lockid,
+			     mig ?  "MIGRATION" : "BLOCK");
+			spin_unlock(&dlm->master_lock);
+			spin_unlock(&dlm->spinlock);
+
+			/* master is known, detach */
+			if (!mig)
+				dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+			mle = NULL;
+			/* this is lame, but we can't wait on either
+			 * the mle or lockres waitqueue here */
+			if (mig)
+				msleep(100);
+			goto lookup;
+		}
+	} else {
+		/* go ahead and try to master lock on this node */
+		mle = alloc_mle;
+		/* make sure this does not get freed below */
+		alloc_mle = NULL;
+		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
+		set_bit(dlm->node_num, mle->maybe_map);
+		__dlm_insert_mle(dlm, mle);
+
+		/* still holding the dlm spinlock, check the recovery map
+		 * to see if there are any nodes that still need to be
+		 * considered.  these will not appear in the mle nodemap
+		 * but they might own this lockres.  wait on them. */
+		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+		if (bit < O2NM_MAX_NODES) {
+			mlog(0, "%s: res %.*s, At least one node (%d) "
+			     "to recover before lock mastery can begin\n",
+			     dlm->name, namelen, (char *)lockid, bit);
+			wait_on_recovery = 1;
+		}
+	}
+
+	/* at this point there is either a DLM_MLE_BLOCK or a
+	 * DLM_MLE_MASTER on the master list, so it's safe to add the
+	 * lockres to the hashtable.  anyone who finds the lock will
+	 * still have to wait on the IN_PROGRESS. */
+
+	/* finally add the lockres to its hash bucket */
+	__dlm_insert_lockres(dlm, res);
+
+	/* since this lockres is new it doesn't not require the spinlock */
+	__dlm_lockres_grab_inflight_ref(dlm, res);
+
+	/* get an extra ref on the mle in case this is a BLOCK
+	 * if so, the creator of the BLOCK may try to put the last
+	 * ref at this time in the assert master handler, so we
+	 * need an extra one to keep from a bad ptr deref. */
+	dlm_get_mle_inuse(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+redo_request:
+	while (wait_on_recovery) {
+		/* any cluster changes that occurred after dropping the
+		 * dlm spinlock would be detectable be a change on the mle,
+		 * so we only need to clear out the recovery map once. */
+		if (dlm_is_recovery_lock(lockid, namelen)) {
+			mlog(0, "%s: Recovery map is not empty, but must "
+			     "master $RECOVERY lock now\n", dlm->name);
+			if (!dlm_pre_master_reco_lockres(dlm, res))
+				wait_on_recovery = 0;
+			else {
+				mlog(0, "%s: waiting 500ms for heartbeat state "
+				    "change\n", dlm->name);
+				msleep(500);
+			}
+			continue;
+		}
+
+		dlm_kick_recovery_thread(dlm);
+		msleep(1000);
+		dlm_wait_for_recovery(dlm);
+
+		spin_lock(&dlm->spinlock);
+		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+		if (bit < O2NM_MAX_NODES) {
+			mlog(0, "%s: res %.*s, At least one node (%d) "
+			     "to recover before lock mastery can begin\n",
+			     dlm->name, namelen, (char *)lockid, bit);
+			wait_on_recovery = 1;
+		} else
+			wait_on_recovery = 0;
+		spin_unlock(&dlm->spinlock);
+
+		if (wait_on_recovery)
+			dlm_wait_for_node_recovery(dlm, bit, 10000);
+	}
+
+	/* must wait for lock to be mastered elsewhere */
+	if (blocked)
+		goto wait;
+
+	ret = -EINVAL;
+	dlm_node_iter_init(mle->vote_map, &iter);
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		ret = dlm_do_master_request(res, mle, nodenum);
+		if (ret < 0)
+			mlog_errno(ret);
+		if (mle->master != O2NM_MAX_NODES) {
+			/* found a master ! */
+			if (mle->master <= nodenum)
+				break;
+			/* if our master request has not reached the master
+			 * yet, keep going until it does.  this is how the
+			 * master will know that asserts are needed back to
+			 * the lower nodes. */
+			mlog(0, "%s: res %.*s, Requests only up to %u but "
+			     "master is %u, keep going\n", dlm->name, namelen,
+			     lockid, nodenum, mle->master);
+		}
+	}
+
+wait:
+	/* keep going until the response map includes all nodes */
+	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
+	if (ret < 0) {
+		wait_on_recovery = 1;
+		mlog(0, "%s: res %.*s, Node map changed, redo the master "
+		     "request now, blocked=%d\n", dlm->name, res->lockname.len,
+		     res->lockname.name, blocked);
+		if (++tries > 20) {
+			mlog(ML_ERROR, "%s: res %.*s, Spinning on "
+			     "dlm_wait_for_lock_mastery, blocked = %d\n",
+			     dlm->name, res->lockname.len,
+			     res->lockname.name, blocked);
+			dlm_print_one_lock_resource(res);
+			dlm_print_one_mle(mle);
+			tries = 0;
+		}
+		goto redo_request;
+	}
+
+	mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
+	     res->lockname.name, res->owner);
+	/* make sure we never continue without this */
+	BUG_ON(res->owner == O2NM_MAX_NODES);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle(mle);
+	/* put the extra ref */
+	dlm_put_mle_inuse(mle);
+
+wake_waiters:
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+leave:
+	/* need to free the unused mle */
+	if (alloc_mle)
+		kmem_cache_free(dlm_mle_cache, alloc_mle);
+
+	return res;
+}
+
+
+#define DLM_MASTERY_TIMEOUT_MS   5000
+
+static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_master_list_entry *mle,
+				     int *blocked)
+{
+	u8 m;
+	int ret, bit;
+	int map_changed, voting_done;
+	int assert, sleep;
+
+recheck:
+	ret = 0;
+	assert = 0;
+
+	/* check if another node has already become the owner */
+	spin_lock(&res->spinlock);
+	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
+		     res->lockname.len, res->lockname.name, res->owner);
+		spin_unlock(&res->spinlock);
+		/* this will cause the master to re-assert across
+		 * the whole cluster, freeing up mles */
+		if (res->owner != dlm->node_num) {
+			ret = dlm_do_master_request(res, mle, res->owner);
+			if (ret < 0) {
+				/* give recovery a chance to run */
+				mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
+				msleep(500);
+				goto recheck;
+			}
+		}
+		ret = 0;
+		goto leave;
+	}
+	spin_unlock(&res->spinlock);
+
+	spin_lock(&mle->spinlock);
+	m = mle->master;
+	map_changed = (memcmp(mle->vote_map, mle->node_map,
+			      sizeof(mle->vote_map)) != 0);
+	voting_done = (memcmp(mle->vote_map, mle->response_map,
+			     sizeof(mle->vote_map)) == 0);
+
+	/* restart if we hit any errors */
+	if (map_changed) {
+		int b;
+		mlog(0, "%s: %.*s: node map changed, restarting\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
+		b = (mle->type == DLM_MLE_BLOCK);
+		if ((*blocked && !b) || (!*blocked && b)) {
+			mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
+			     dlm->name, res->lockname.len, res->lockname.name,
+			     *blocked, b);
+			*blocked = b;
+		}
+		spin_unlock(&mle->spinlock);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto leave;
+		}
+		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
+		     "rechecking now\n", dlm->name, res->lockname.len,
+		     res->lockname.name);
+		goto recheck;
+	} else {
+		if (!voting_done) {
+			mlog(0, "map not changed and voting not done "
+			     "for %s:%.*s\n", dlm->name, res->lockname.len,
+			     res->lockname.name);
+		}
+	}
+
+	if (m != O2NM_MAX_NODES) {
+		/* another node has done an assert!
+		 * all done! */
+		sleep = 0;
+	} else {
+		sleep = 1;
+		/* have all nodes responded? */
+		if (voting_done && !*blocked) {
+			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+			if (dlm->node_num <= bit) {
+				/* my node number is lowest.
+			 	 * now tell other nodes that I am
+				 * mastering this. */
+				mle->master = dlm->node_num;
+				/* ref was grabbed in get_lock_resource
+				 * will be dropped in dlmlock_master */
+				assert = 1;
+				sleep = 0;
+			}
+			/* if voting is done, but we have not received
+			 * an assert master yet, we must sleep */
+		}
+	}
+
+	spin_unlock(&mle->spinlock);
+
+	/* sleep if we haven't finished voting yet */
+	if (sleep) {
+		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
+
+		/*
+		if (atomic_read(&mle->mle_refs.refcount) < 2)
+			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
+			atomic_read(&mle->mle_refs.refcount),
+			res->lockname.len, res->lockname.name);
+		*/
+		atomic_set(&mle->woken, 0);
+		(void)wait_event_timeout(mle->wq,
+					 (atomic_read(&mle->woken) == 1),
+					 timeo);
+		if (res->owner == O2NM_MAX_NODES) {
+			mlog(0, "%s:%.*s: waiting again\n", dlm->name,
+			     res->lockname.len, res->lockname.name);
+			goto recheck;
+		}
+		mlog(0, "done waiting, master is %u\n", res->owner);
+		ret = 0;
+		goto leave;
+	}
+
+	ret = 0;   /* done */
+	if (assert) {
+		m = dlm->node_num;
+		mlog(0, "about to master %.*s here, this=%u\n",
+		     res->lockname.len, res->lockname.name, m);
+		ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
+		if (ret) {
+			/* This is a failure in the network path,
+			 * not in the response to the assert_master
+			 * (any nonzero response is a BUG on this node).
+			 * Most likely a socket just got disconnected
+			 * due to node death. */
+			mlog_errno(ret);
+		}
+		/* no longer need to restart lock mastery.
+		 * all living nodes have been contacted. */
+		ret = 0;
+	}
+
+	/* set the lockres owner */
+	spin_lock(&res->spinlock);
+	/* mastery reference obtained either during
+	 * assert_master_handler or in get_lock_resource */
+	dlm_change_lockres_owner(dlm, res, m);
+	spin_unlock(&res->spinlock);
+
+leave:
+	return ret;
+}
+
+struct dlm_bitmap_diff_iter
+{
+	int curnode;
+	unsigned long *orig_bm;
+	unsigned long *cur_bm;
+	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+enum dlm_node_state_change
+{
+	NODE_DOWN = -1,
+	NODE_NO_CHANGE = 0,
+	NODE_UP
+};
+
+static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
+				      unsigned long *orig_bm,
+				      unsigned long *cur_bm)
+{
+	unsigned long p1, p2;
+	int i;
+
+	iter->curnode = -1;
+	iter->orig_bm = orig_bm;
+	iter->cur_bm = cur_bm;
+
+	for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
+       		p1 = *(iter->orig_bm + i);
+	       	p2 = *(iter->cur_bm + i);
+		iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
+	}
+}
+
+static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
+				     enum dlm_node_state_change *state)
+{
+	int bit;
+
+	if (iter->curnode >= O2NM_MAX_NODES)
+		return -ENOENT;
+
+	bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
+			    iter->curnode+1);
+	if (bit >= O2NM_MAX_NODES) {
+		iter->curnode = O2NM_MAX_NODES;
+		return -ENOENT;
+	}
+
+	/* if it was there in the original then this node died */
+	if (test_bit(bit, iter->orig_bm))
+		*state = NODE_DOWN;
+	else
+		*state = NODE_UP;
+
+	iter->curnode = bit;
+	return bit;
+}
+
+
+static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res,
+				    struct dlm_master_list_entry *mle,
+				    int blocked)
+{
+	struct dlm_bitmap_diff_iter bdi;
+	enum dlm_node_state_change sc;
+	int node;
+	int ret = 0;
+
+	mlog(0, "something happened such that the "
+	     "master process may need to be restarted!\n");
+
+	assert_spin_locked(&mle->spinlock);
+
+	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
+	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
+	while (node >= 0) {
+		if (sc == NODE_UP) {
+			/* a node came up.  clear any old vote from
+			 * the response map and set it in the vote map
+			 * then restart the mastery. */
+			mlog(ML_NOTICE, "node %d up while restarting\n", node);
+
+			/* redo the master request, but only for the new node */
+			mlog(0, "sending request to new node\n");
+			clear_bit(node, mle->response_map);
+			set_bit(node, mle->vote_map);
+		} else {
+			mlog(ML_ERROR, "node down! %d\n", node);
+			if (blocked) {
+				int lowest = find_next_bit(mle->maybe_map,
+						       O2NM_MAX_NODES, 0);
+
+				/* act like it was never there */
+				clear_bit(node, mle->maybe_map);
+
+			       	if (node == lowest) {
+					mlog(0, "expected master %u died"
+					    " while this node was blocked "
+					    "waiting on it!\n", node);
+					lowest = find_next_bit(mle->maybe_map,
+						       	O2NM_MAX_NODES,
+						       	lowest+1);
+					if (lowest < O2NM_MAX_NODES) {
+						mlog(0, "%s:%.*s:still "
+						     "blocked. waiting on %u "
+						     "now\n", dlm->name,
+						     res->lockname.len,
+						     res->lockname.name,
+						     lowest);
+					} else {
+						/* mle is an MLE_BLOCK, but
+						 * there is now nothing left to
+						 * block on.  we need to return
+						 * all the way back out and try
+						 * again with an MLE_MASTER.
+						 * dlm_do_local_recovery_cleanup
+						 * has already run, so the mle
+						 * refcount is ok */
+						mlog(0, "%s:%.*s: no "
+						     "longer blocking. try to "
+						     "master this here\n",
+						     dlm->name,
+						     res->lockname.len,
+						     res->lockname.name);
+						mle->type = DLM_MLE_MASTER;
+						mle->mleres = res;
+					}
+				}
+			}
+
+			/* now blank out everything, as if we had never
+			 * contacted anyone */
+			memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+			memset(mle->response_map, 0, sizeof(mle->response_map));
+			/* reset the vote_map to the current node_map */
+			memcpy(mle->vote_map, mle->node_map,
+			       sizeof(mle->node_map));
+			/* put myself into the maybe map */
+			if (mle->type != DLM_MLE_BLOCK)
+				set_bit(dlm->node_num, mle->maybe_map);
+		}
+		ret = -EAGAIN;
+		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
+	}
+	return ret;
+}
+
+
+/*
+ * DLM_MASTER_REQUEST_MSG
+ *
+ * returns: 0 on success,
+ *          -errno on a network error
+ *
+ * on error, the caller should assume the target node is "dead"
+ *
+ */
+
+static int dlm_do_master_request(struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle, int to)
+{
+	struct dlm_ctxt *dlm = mle->dlm;
+	struct dlm_master_request request;
+	int ret, response=0, resend;
+
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->node_num;
+
+	BUG_ON(mle->type == DLM_MLE_MIGRATION);
+
+	request.namelen = (u8)mle->mnamelen;
+	memcpy(request.name, mle->mname, request.namelen);
+
+again:
+	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
+				 sizeof(request), to, &response);
+	if (ret < 0)  {
+		if (ret == -ESRCH) {
+			/* should never happen */
+			mlog(ML_ERROR, "TCP stack not ready!\n");
+			BUG();
+		} else if (ret == -EINVAL) {
+			mlog(ML_ERROR, "bad args passed to o2net!\n");
+			BUG();
+		} else if (ret == -ENOMEM) {
+			mlog(ML_ERROR, "out of memory while trying to send "
+			     "network message!  retrying\n");
+			/* this is totally crude */
+			msleep(50);
+			goto again;
+		} else if (!dlm_is_host_down(ret)) {
+			/* not a network error. bad. */
+			mlog_errno(ret);
+			mlog(ML_ERROR, "unhandled error!");
+			BUG();
+		}
+		/* all other errors should be network errors,
+		 * and likely indicate node death */
+		mlog(ML_ERROR, "link to %d went down!\n", to);
+		goto out;
+	}
+
+	ret = 0;
+	resend = 0;
+	spin_lock(&mle->spinlock);
+	switch (response) {
+		case DLM_MASTER_RESP_YES:
+			set_bit(to, mle->response_map);
+			mlog(0, "node %u is the master, response=YES\n", to);
+			mlog(0, "%s:%.*s: master node %u now knows I have a "
+			     "reference\n", dlm->name, res->lockname.len,
+			     res->lockname.name, to);
+			mle->master = to;
+			break;
+		case DLM_MASTER_RESP_NO:
+			mlog(0, "node %u not master, response=NO\n", to);
+			set_bit(to, mle->response_map);
+			break;
+		case DLM_MASTER_RESP_MAYBE:
+			mlog(0, "node %u not master, response=MAYBE\n", to);
+			set_bit(to, mle->response_map);
+			set_bit(to, mle->maybe_map);
+			break;
+		case DLM_MASTER_RESP_ERROR:
+			mlog(0, "node %u hit an error, resending\n", to);
+			resend = 1;
+			response = 0;
+			break;
+		default:
+			mlog(ML_ERROR, "bad response! %u\n", response);
+			BUG();
+	}
+	spin_unlock(&mle->spinlock);
+	if (resend) {
+		/* this is also totally crude */
+		msleep(50);
+		goto again;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm->master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data)
+{
+	u8 response = DLM_MASTER_RESP_MAYBE;
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
+	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	char *name;
+	unsigned int namelen, hash;
+	int found, ret;
+	int set_maybe;
+	int dispatch_assert = 0;
+
+	if (!dlm_grab(dlm))
+		return DLM_MASTER_RESP_NO;
+
+	if (!dlm_domain_fully_joined(dlm)) {
+		response = DLM_MASTER_RESP_NO;
+		goto send_response;
+	}
+
+	name = request->name;
+	namelen = request->namelen;
+	hash = dlm_lockid_hash(name, namelen);
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		response = DLM_IVBUFLEN;
+		goto send_response;
+	}
+
+way_up_top:
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
+	if (res) {
+		spin_unlock(&dlm->spinlock);
+
+		/* take care of the easy cases up front */
+		spin_lock(&res->spinlock);
+
+		/*
+		 * Right after dlm spinlock was released, dlm_thread could have
+		 * purged the lockres. Check if lockres got unhashed. If so
+		 * start over.
+		 */
+		if (hlist_unhashed(&res->hash_node)) {
+			spin_unlock(&res->spinlock);
+			dlm_lockres_put(res);
+			goto way_up_top;
+		}
+
+		if (res->state & (DLM_LOCK_RES_RECOVERING|
+				  DLM_LOCK_RES_MIGRATING)) {
+			spin_unlock(&res->spinlock);
+			mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
+			     "being recovered/migrated\n");
+			response = DLM_MASTER_RESP_ERROR;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+			goto send_response;
+		}
+
+		if (res->owner == dlm->node_num) {
+			dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
+			spin_unlock(&res->spinlock);
+			response = DLM_MASTER_RESP_YES;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+
+			/* this node is the owner.
+			 * there is some extra work that needs to
+			 * happen now.  the requesting node has
+			 * caused all nodes up to this one to
+			 * create mles.  this node now needs to
+			 * go back and clean those up. */
+			dispatch_assert = 1;
+			goto send_response;
+		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			spin_unlock(&res->spinlock);
+			// mlog(0, "node %u is the master\n", res->owner);
+			response = DLM_MASTER_RESP_NO;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+			goto send_response;
+		}
+
+		/* ok, there is no owner.  either this node is
+		 * being blocked, or it is actively trying to
+		 * master this lock. */
+		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+			mlog(ML_ERROR, "lock with no owner should be "
+			     "in-progress!\n");
+			BUG();
+		}
+
+		// mlog(0, "lockres is in progress...\n");
+		spin_lock(&dlm->master_lock);
+		found = dlm_find_mle(dlm, &tmpmle, name, namelen);
+		if (!found) {
+			mlog(ML_ERROR, "no mle found for this lock!\n");
+			BUG();
+		}
+		set_maybe = 1;
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK) {
+			// mlog(0, "this node is waiting for "
+			// "lockres to be mastered\n");
+			response = DLM_MASTER_RESP_NO;
+		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
+			mlog(0, "node %u is master, but trying to migrate to "
+			     "node %u.\n", tmpmle->master, tmpmle->new_master);
+			if (tmpmle->master == dlm->node_num) {
+				mlog(ML_ERROR, "no owner on lockres, but this "
+				     "node is trying to migrate it to %u?!\n",
+				     tmpmle->new_master);
+				BUG();
+			} else {
+				/* the real master can respond on its own */
+				response = DLM_MASTER_RESP_NO;
+			}
+		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			set_maybe = 0;
+			if (tmpmle->master == dlm->node_num) {
+				response = DLM_MASTER_RESP_YES;
+				/* this node will be the owner.
+				 * go back and clean the mles on any
+				 * other nodes */
+				dispatch_assert = 1;
+				dlm_lockres_set_refmap_bit(dlm, res,
+							   request->node_idx);
+			} else
+				response = DLM_MASTER_RESP_NO;
+		} else {
+			// mlog(0, "this node is attempting to "
+			// "master lockres\n");
+			response = DLM_MASTER_RESP_MAYBE;
+		}
+		if (set_maybe)
+			set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+
+		spin_unlock(&dlm->master_lock);
+		spin_unlock(&res->spinlock);
+
+		/* keep the mle attached to heartbeat events */
+		dlm_put_mle(tmpmle);
+		if (mle)
+			kmem_cache_free(dlm_mle_cache, mle);
+		goto send_response;
+	}
+
+	/*
+	 * lockres doesn't exist on this node
+	 * if there is an MLE_BLOCK, return NO
+	 * if there is an MLE_MASTER, return MAYBE
+	 * otherwise, add an MLE_BLOCK, return NO
+	 */
+	spin_lock(&dlm->master_lock);
+	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
+	if (!found) {
+		/* this lockid has never been seen on this node yet */
+		// mlog(0, "no mle found\n");
+		if (!mle) {
+			spin_unlock(&dlm->master_lock);
+			spin_unlock(&dlm->spinlock);
+
+			mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+			if (!mle) {
+				response = DLM_MASTER_RESP_ERROR;
+				mlog_errno(-ENOMEM);
+				goto send_response;
+			}
+			goto way_up_top;
+		}
+
+		// mlog(0, "this is second time thru, already allocated, "
+		// "add the block.\n");
+		dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
+		set_bit(request->node_idx, mle->maybe_map);
+		__dlm_insert_mle(dlm, mle);
+		response = DLM_MASTER_RESP_NO;
+	} else {
+		// mlog(0, "mle was found\n");
+		set_maybe = 1;
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->master == dlm->node_num) {
+			mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
+			BUG();
+		}
+		if (tmpmle->type == DLM_MLE_BLOCK)
+			response = DLM_MASTER_RESP_NO;
+		else if (tmpmle->type == DLM_MLE_MIGRATION) {
+			mlog(0, "migration mle was found (%u->%u)\n",
+			     tmpmle->master, tmpmle->new_master);
+			/* real master can respond on its own */
+			response = DLM_MASTER_RESP_NO;
+		} else
+			response = DLM_MASTER_RESP_MAYBE;
+		if (set_maybe)
+			set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+	}
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (found) {
+		/* keep the mle attached to heartbeat events */
+		dlm_put_mle(tmpmle);
+	}
+send_response:
+	/*
+	 * __dlm_lookup_lockres() grabbed a reference to this lockres.
+	 * The reference is released by dlm_assert_master_worker() under
+	 * the call to dlm_dispatch_assert_master().  If
+	 * dlm_assert_master_worker() isn't called, we drop it here.
+	 */
+	if (dispatch_assert) {
+		if (response != DLM_MASTER_RESP_YES)
+			mlog(ML_ERROR, "invalid response %d\n", response);
+		if (!res) {
+			mlog(ML_ERROR, "bad lockres while trying to assert!\n");
+			BUG();
+		}
+		mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
+			     dlm->node_num, res->lockname.len, res->lockname.name);
+		spin_lock(&res->spinlock);
+		ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
+						 DLM_ASSERT_MASTER_MLE_CLEANUP);
+		if (ret < 0) {
+			mlog(ML_ERROR, "failed to dispatch assert master work\n");
+			response = DLM_MASTER_RESP_ERROR;
+			dlm_lockres_put(res);
+		} else
+			__dlm_lockres_grab_inflight_worker(dlm, res);
+		spin_unlock(&res->spinlock);
+	} else {
+		if (res)
+			dlm_lockres_put(res);
+	}
+
+	dlm_put(dlm);
+	return response;
+}
+
+/*
+ * DLM_ASSERT_MASTER_MSG
+ */
+
+
+/*
+ * NOTE: this can be used for debugging
+ * can periodically run all locks owned by this node
+ * and re-assert across the cluster...
+ */
+static int dlm_do_assert_master(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res,
+				void *nodemap, u32 flags)
+{
+	struct dlm_assert_master assert;
+	int to, tmpret;
+	struct dlm_node_iter iter;
+	int ret = 0;
+	int reassert;
+	const char *lockname = res->lockname.name;
+	unsigned int namelen = res->lockname.len;
+
+	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+	spin_lock(&res->spinlock);
+	res->state |= DLM_LOCK_RES_SETREF_INPROG;
+	spin_unlock(&res->spinlock);
+
+again:
+	reassert = 0;
+
+	/* note that if this nodemap is empty, it returns 0 */
+	dlm_node_iter_init(nodemap, &iter);
+	while ((to = dlm_node_iter_next(&iter)) >= 0) {
+		int r = 0;
+		struct dlm_master_list_entry *mle = NULL;
+
+		mlog(0, "sending assert master to %d (%.*s)\n", to,
+		     namelen, lockname);
+		memset(&assert, 0, sizeof(assert));
+		assert.node_idx = dlm->node_num;
+		assert.namelen = namelen;
+		memcpy(assert.name, lockname, namelen);
+		assert.flags = cpu_to_be32(flags);
+
+		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
+					    &assert, sizeof(assert), to, &r);
+		if (tmpret < 0) {
+			mlog(ML_ERROR, "Error %d when sending message %u (key "
+			     "0x%x) to node %u\n", tmpret,
+			     DLM_ASSERT_MASTER_MSG, dlm->key, to);
+			if (!dlm_is_host_down(tmpret)) {
+				mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
+				BUG();
+			}
+			/* a node died.  finish out the rest of the nodes. */
+			mlog(0, "link to %d went down!\n", to);
+			/* any nonzero status return will do */
+			ret = tmpret;
+			r = 0;
+		} else if (r < 0) {
+			/* ok, something horribly messed.  kill thyself. */
+			mlog(ML_ERROR,"during assert master of %.*s to %u, "
+			     "got %d.\n", namelen, lockname, to, r);
+			spin_lock(&dlm->spinlock);
+			spin_lock(&dlm->master_lock);
+			if (dlm_find_mle(dlm, &mle, (char *)lockname,
+					 namelen)) {
+				dlm_print_one_mle(mle);
+				__dlm_put_mle(mle);
+			}
+			spin_unlock(&dlm->master_lock);
+			spin_unlock(&dlm->spinlock);
+			BUG();
+		}
+
+		if (r & DLM_ASSERT_RESPONSE_REASSERT &&
+		    !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
+				mlog(ML_ERROR, "%.*s: very strange, "
+				     "master MLE but no lockres on %u\n",
+				     namelen, lockname, to);
+		}
+
+		if (r & DLM_ASSERT_RESPONSE_REASSERT) {
+			mlog(0, "%.*s: node %u create mles on other "
+			     "nodes and requests a re-assert\n",
+			     namelen, lockname, to);
+			reassert = 1;
+		}
+		if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
+			mlog(0, "%.*s: node %u has a reference to this "
+			     "lockres, set the bit in the refmap\n",
+			     namelen, lockname, to);
+			spin_lock(&res->spinlock);
+			dlm_lockres_set_refmap_bit(dlm, res, to);
+			spin_unlock(&res->spinlock);
+		}
+	}
+
+	if (reassert)
+		goto again;
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	return ret;
+}
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm->master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	char *name;
+	unsigned int namelen, hash;
+	u32 flags;
+	int master_request = 0, have_lockres_ref = 0;
+	int ret = 0;
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	name = assert->name;
+	namelen = assert->namelen;
+	hash = dlm_lockid_hash(name, namelen);
+	flags = be32_to_cpu(assert->flags);
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length!");
+		goto done;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+	if (flags)
+		mlog(0, "assert_master with flags: %u\n", flags);
+
+	/* find the MLE */
+	spin_lock(&dlm->master_lock);
+	if (!dlm_find_mle(dlm, &mle, name, namelen)) {
+		/* not an error, could be master just re-asserting */
+		mlog(0, "just got an assert_master from %u, but no "
+		     "MLE for it! (%.*s)\n", assert->node_idx,
+		     namelen, name);
+	} else {
+		int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
+		if (bit >= O2NM_MAX_NODES) {
+			/* not necessarily an error, though less likely.
+			 * could be master just re-asserting. */
+			mlog(0, "no bits set in the maybe_map, but %u "
+			     "is asserting! (%.*s)\n", assert->node_idx,
+			     namelen, name);
+		} else if (bit != assert->node_idx) {
+			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
+				mlog(0, "master %u was found, %u should "
+				     "back off\n", assert->node_idx, bit);
+			} else {
+				/* with the fix for bug 569, a higher node
+				 * number winning the mastery will respond
+				 * YES to mastery requests, but this node
+				 * had no way of knowing.  let it pass. */
+				mlog(0, "%u is the lowest node, "
+				     "%u is asserting. (%.*s)  %u must "
+				     "have begun after %u won.\n", bit,
+				     assert->node_idx, namelen, name, bit,
+				     assert->node_idx);
+			}
+		}
+		if (mle->type == DLM_MLE_MIGRATION) {
+			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
+				mlog(0, "%s:%.*s: got cleanup assert"
+				     " from %u for migration\n",
+				     dlm->name, namelen, name,
+				     assert->node_idx);
+			} else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
+				mlog(0, "%s:%.*s: got unrelated assert"
+				     " from %u for migration, ignoring\n",
+				     dlm->name, namelen, name,
+				     assert->node_idx);
+				__dlm_put_mle(mle);
+				spin_unlock(&dlm->master_lock);
+				spin_unlock(&dlm->spinlock);
+				goto done;
+			}
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+
+	/* ok everything checks out with the MLE
+	 * now check to see if there is a lockres */
+	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING)  {
+			mlog(ML_ERROR, "%u asserting but %.*s is "
+			     "RECOVERING!\n", assert->node_idx, namelen, name);
+			goto kill;
+		}
+		if (!mle) {
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
+			    res->owner != assert->node_idx) {
+				mlog(ML_ERROR, "DIE! Mastery assert from %u, "
+				     "but current owner is %u! (%.*s)\n",
+				     assert->node_idx, res->owner, namelen,
+				     name);
+				__dlm_print_one_lock_resource(res);
+				BUG();
+			}
+		} else if (mle->type != DLM_MLE_MIGRATION) {
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				/* owner is just re-asserting */
+				if (res->owner == assert->node_idx) {
+					mlog(0, "owner %u re-asserting on "
+					     "lock %.*s\n", assert->node_idx,
+					     namelen, name);
+					goto ok;
+				}
+				mlog(ML_ERROR, "got assert_master from "
+				     "node %u, but %u is the owner! "
+				     "(%.*s)\n", assert->node_idx,
+				     res->owner, namelen, name);
+				goto kill;
+			}
+			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+				mlog(ML_ERROR, "got assert from %u, but lock "
+				     "with no owner should be "
+				     "in-progress! (%.*s)\n",
+				     assert->node_idx,
+				     namelen, name);
+				goto kill;
+			}
+		} else /* mle->type == DLM_MLE_MIGRATION */ {
+			/* should only be getting an assert from new master */
+			if (assert->node_idx != mle->new_master) {
+				mlog(ML_ERROR, "got assert from %u, but "
+				     "new master is %u, and old master "
+				     "was %u (%.*s)\n",
+				     assert->node_idx, mle->new_master,
+				     mle->master, namelen, name);
+				goto kill;
+			}
+
+		}
+ok:
+		spin_unlock(&res->spinlock);
+	}
+
+	// mlog(0, "woo!  got an assert_master from node %u!\n",
+	// 	     assert->node_idx);
+	if (mle) {
+		int extra_ref = 0;
+		int nn = -1;
+		int rr, err = 0;
+
+		spin_lock(&mle->spinlock);
+		if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
+			extra_ref = 1;
+		else {
+			/* MASTER mle: if any bits set in the response map
+			 * then the calling node needs to re-assert to clear
+			 * up nodes that this node contacted */
+			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
+						    nn+1)) < O2NM_MAX_NODES) {
+				if (nn != dlm->node_num && nn != assert->node_idx) {
+					master_request = 1;
+					break;
+				}
+			}
+		}
+		mle->master = assert->node_idx;
+		atomic_set(&mle->woken, 1);
+		wake_up(&mle->wq);
+		spin_unlock(&mle->spinlock);
+
+		if (res) {
+			int wake = 0;
+			spin_lock(&res->spinlock);
+			if (mle->type == DLM_MLE_MIGRATION) {
+				mlog(0, "finishing off migration of lockres %.*s, "
+			     		"from %u to %u\n",
+			       		res->lockname.len, res->lockname.name,
+			       		dlm->node_num, mle->new_master);
+				res->state &= ~DLM_LOCK_RES_MIGRATING;
+				wake = 1;
+				dlm_change_lockres_owner(dlm, res, mle->new_master);
+				BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+			} else {
+				dlm_change_lockres_owner(dlm, res, mle->master);
+			}
+			spin_unlock(&res->spinlock);
+			have_lockres_ref = 1;
+			if (wake)
+				wake_up(&res->wq);
+		}
+
+		/* master is known, detach if not already detached.
+		 * ensures that only one assert_master call will happen
+		 * on this mle. */
+		spin_lock(&dlm->master_lock);
+
+		rr = atomic_read(&mle->mle_refs.refcount);
+		if (mle->inuse > 0) {
+			if (extra_ref && rr < 3)
+				err = 1;
+			else if (!extra_ref && rr < 2)
+				err = 1;
+		} else {
+			if (extra_ref && rr < 2)
+				err = 1;
+			else if (!extra_ref && rr < 1)
+				err = 1;
+		}
+		if (err) {
+			mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
+			     "that will mess up this node, refs=%d, extra=%d, "
+			     "inuse=%d\n", dlm->name, namelen, name,
+			     assert->node_idx, rr, extra_ref, mle->inuse);
+			dlm_print_one_mle(mle);
+		}
+		__dlm_unlink_mle(dlm, mle);
+		__dlm_mle_detach_hb_events(dlm, mle);
+		__dlm_put_mle(mle);
+		if (extra_ref) {
+			/* the assert master message now balances the extra
+		 	 * ref given by the master / migration request message.
+		 	 * if this is the last put, it will be removed
+		 	 * from the list. */
+			__dlm_put_mle(mle);
+		}
+		spin_unlock(&dlm->master_lock);
+	} else if (res) {
+		if (res->owner != assert->node_idx) {
+			mlog(0, "assert_master from %u, but current "
+			     "owner is %u (%.*s), no mle\n", assert->node_idx,
+			     res->owner, namelen, name);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+
+done:
+	ret = 0;
+	if (res) {
+		spin_lock(&res->spinlock);
+		res->state |= DLM_LOCK_RES_SETREF_INPROG;
+		spin_unlock(&res->spinlock);
+		*ret_data = (void *)res;
+	}
+	dlm_put(dlm);
+	if (master_request) {
+		mlog(0, "need to tell master to reassert\n");
+		/* positive. negative would shoot down the node. */
+		ret |= DLM_ASSERT_RESPONSE_REASSERT;
+		if (!have_lockres_ref) {
+			mlog(ML_ERROR, "strange, got assert from %u, MASTER "
+			     "mle present here for %s:%.*s, but no lockres!\n",
+			     assert->node_idx, dlm->name, namelen, name);
+		}
+	}
+	if (have_lockres_ref) {
+		/* let the master know we have a reference to the lockres */
+		ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
+		mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
+		     dlm->name, namelen, name, assert->node_idx);
+	}
+	return ret;
+
+kill:
+	/* kill the caller! */
+	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
+	     "and killing the other node now!  This node is OK and can continue.\n");
+	__dlm_print_one_lock_resource(res);
+	spin_unlock(&res->spinlock);
+	spin_lock(&dlm->master_lock);
+	if (mle)
+		__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+	*ret_data = (void *)res;
+	dlm_put(dlm);
+	return -EINVAL;
+}
+
+void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
+{
+	struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
+
+	if (ret_data) {
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
+		spin_unlock(&res->spinlock);
+		wake_up(&res->wq);
+		dlm_lockres_put(res);
+	}
+	return;
+}
+
+int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res,
+			       int ignore_higher, u8 request_from, u32 flags)
+{
+	struct dlm_work_item *item;
+	item = kzalloc(sizeof(*item), GFP_ATOMIC);
+	if (!item)
+		return -ENOMEM;
+
+
+	/* queue up work for dlm_assert_master_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
+	item->u.am.lockres = res; /* already have a ref */
+	/* can optionally ignore node numbers higher than this node */
+	item->u.am.ignore_higher = ignore_higher;
+	item->u.am.request_from = request_from;
+	item->u.am.flags = flags;
+
+	if (ignore_higher)
+		mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
+		     res->lockname.name);
+
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
+	return 0;
+}
+
+static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	int ret = 0;
+	struct dlm_lock_resource *res;
+	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int ignore_higher;
+	int bit;
+	u8 request_from;
+	u32 flags;
+
+	dlm = item->dlm;
+	res = item->u.am.lockres;
+	ignore_higher = item->u.am.ignore_higher;
+	request_from = item->u.am.request_from;
+	flags = item->u.am.flags;
+
+	spin_lock(&dlm->spinlock);
+	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+	spin_unlock(&dlm->spinlock);
+
+	clear_bit(dlm->node_num, nodemap);
+	if (ignore_higher) {
+		/* if is this just to clear up mles for nodes below
+		 * this node, do not send the message to the original
+		 * caller or any node number higher than this */
+		clear_bit(request_from, nodemap);
+		bit = dlm->node_num;
+		while (1) {
+			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
+					    bit+1);
+		       	if (bit >= O2NM_MAX_NODES)
+				break;
+			clear_bit(bit, nodemap);
+		}
+	}
+
+	/*
+	 * If we're migrating this lock to someone else, we are no
+	 * longer allowed to assert out own mastery.  OTOH, we need to
+	 * prevent migration from starting while we're still asserting
+	 * our dominance.  The reserved ast delays migration.
+	 */
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "Someone asked us to assert mastery, but we're "
+		     "in the middle of migration.  Skipping assert, "
+		     "the new master will handle that.\n");
+		spin_unlock(&res->spinlock);
+		goto put;
+	} else
+		__dlm_lockres_reserve_ast(res);
+	spin_unlock(&res->spinlock);
+
+	/* this call now finishes out the nodemap
+	 * even if one or more nodes die */
+	mlog(0, "worker about to master %.*s here, this=%u\n",
+		     res->lockname.len, res->lockname.name, dlm->node_num);
+	ret = dlm_do_assert_master(dlm, res, nodemap, flags);
+	if (ret < 0) {
+		/* no need to restart, we are done */
+		if (!dlm_is_host_down(ret))
+			mlog_errno(ret);
+	}
+
+	/* Ok, we've asserted ourselves.  Let's let migration start. */
+	dlm_lockres_release_ast(dlm, res);
+
+put:
+	dlm_lockres_drop_inflight_worker(dlm, res);
+
+	dlm_lockres_put(res);
+
+	mlog(0, "finished with dlm_assert_master_worker\n");
+}
+
+/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
+ * We cannot wait for node recovery to complete to begin mastering this
+ * lockres because this lockres is used to kick off recovery! ;-)
+ * So, do a pre-check on all living nodes to see if any of those nodes
+ * think that $RECOVERY is currently mastered by a dead node.  If so,
+ * we wait a short time to allow that node to get notified by its own
+ * heartbeat stack, then check again.  All $RECOVERY lock resources
+ * mastered by dead nodes are purged when the hearbeat callback is
+ * fired, so we can know for sure that it is safe to continue once
+ * the node returns a live node or no node.  */
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res)
+{
+	struct dlm_node_iter iter;
+	int nodenum;
+	int ret = 0;
+	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		/* do not send to self */
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = dlm_do_master_requery(dlm, res, nodenum, &master);
+		if (ret < 0) {
+			mlog_errno(ret);
+			if (!dlm_is_host_down(ret))
+				BUG();
+			/* host is down, so answer for that node would be
+			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
+			ret = 0;
+		}
+
+		if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/* check to see if this master is in the recovery map */
+			spin_lock(&dlm->spinlock);
+			if (test_bit(master, dlm->recovery_map)) {
+				mlog(ML_NOTICE, "%s: node %u has not seen "
+				     "node %u go down yet, and thinks the "
+				     "dead node is mastering the recovery "
+				     "lock.  must wait.\n", dlm->name,
+				     nodenum, master);
+				ret = -EAGAIN;
+			}
+			spin_unlock(&dlm->spinlock);
+			mlog(0, "%s: reco lock master is %u\n", dlm->name,
+			     master);
+			break;
+		}
+	}
+	return ret;
+}
+
+/*
+ * DLM_DEREF_LOCKRES_MSG
+ */
+
+int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	struct dlm_deref_lockres deref;
+	int ret = 0, r;
+	const char *lockname;
+	unsigned int namelen;
+
+	lockname = res->lockname.name;
+	namelen = res->lockname.len;
+	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+	memset(&deref, 0, sizeof(deref));
+	deref.node_idx = dlm->node_num;
+	deref.namelen = namelen;
+	memcpy(deref.name, lockname, namelen);
+
+	ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
+				 &deref, sizeof(deref), res->owner, &r);
+	if (ret < 0)
+		mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
+		     dlm->name, namelen, lockname, ret, res->owner);
+	else if (r < 0) {
+		/* BAD.  other node says I did not have a ref. */
+		mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+		     dlm->name, namelen, lockname, res->owner, r);
+		dlm_print_one_lock_resource(res);
+		BUG();
+	}
+	return ret;
+}
+
+int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	char *name;
+	unsigned int namelen;
+	int ret = -EINVAL;
+	u8 node;
+	unsigned int hash;
+	struct dlm_work_item *item;
+	int cleared = 0;
+	int dispatch = 0;
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	name = deref->name;
+	namelen = deref->namelen;
+	node = deref->node_idx;
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length!");
+		goto done;
+	}
+	if (deref->node_idx >= O2NM_MAX_NODES) {
+		mlog(ML_ERROR, "Invalid node number: %u\n", node);
+		goto done;
+	}
+
+	hash = dlm_lockid_hash(name, namelen);
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+	if (!res) {
+		spin_unlock(&dlm->spinlock);
+		mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+		     dlm->name, namelen, name);
+		goto done;
+	}
+	spin_unlock(&dlm->spinlock);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_SETREF_INPROG)
+		dispatch = 1;
+	else {
+		BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+		if (test_bit(node, res->refmap)) {
+			dlm_lockres_clear_refmap_bit(dlm, res, node);
+			cleared = 1;
+		}
+	}
+	spin_unlock(&res->spinlock);
+
+	if (!dispatch) {
+		if (cleared)
+			dlm_lockres_calc_usage(dlm, res);
+		else {
+			mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
+		     	"but it is already dropped!\n", dlm->name,
+		     	res->lockname.len, res->lockname.name, node);
+			dlm_print_one_lock_resource(res);
+		}
+		ret = 0;
+		goto done;
+	}
+
+	item = kzalloc(sizeof(*item), GFP_NOFS);
+	if (!item) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto done;
+	}
+
+	dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
+	item->u.dl.deref_res = res;
+	item->u.dl.deref_node = node;
+
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
+	return 0;
+
+done:
+	if (res)
+		dlm_lockres_put(res);
+	dlm_put(dlm);
+
+	return ret;
+}
+
+static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm;
+	struct dlm_lock_resource *res;
+	u8 node;
+	u8 cleared = 0;
+
+	dlm = item->dlm;
+	res = item->u.dl.deref_res;
+	node = item->u.dl.deref_node;
+
+	spin_lock(&res->spinlock);
+	BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+	if (test_bit(node, res->refmap)) {
+		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+		dlm_lockres_clear_refmap_bit(dlm, res, node);
+		cleared = 1;
+	}
+	spin_unlock(&res->spinlock);
+
+	if (cleared) {
+		mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
+		     dlm->name, res->lockname.len, res->lockname.name, node);
+		dlm_lockres_calc_usage(dlm, res);
+	} else {
+		mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
+		     "but it is already dropped!\n", dlm->name,
+		     res->lockname.len, res->lockname.name, node);
+		dlm_print_one_lock_resource(res);
+	}
+
+	dlm_lockres_put(res);
+}
+
+/*
+ * A migrateable resource is one that is :
+ * 1. locally mastered, and,
+ * 2. zero local locks, and,
+ * 3. one or more non-local locks, or, one or more references
+ * Returns 1 if yes, 0 if not.
+ */
+static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res)
+{
+	enum dlm_lockres_list idx;
+	int nonlocal = 0, node_ref;
+	struct list_head *queue;
+	struct dlm_lock *lock;
+	u64 cookie;
+
+	assert_spin_locked(&res->spinlock);
+
+	/* delay migration when the lockres is in MIGRATING state */
+	if (res->state & DLM_LOCK_RES_MIGRATING)
+		return 0;
+
+	/* delay migration when the lockres is in RECOCERING state */
+	if (res->state & DLM_LOCK_RES_RECOVERING)
+		return 0;
+
+	if (res->owner != dlm->node_num)
+		return 0;
+
+        for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+		queue = dlm_list_idx_to_ptr(res, idx);
+		list_for_each_entry(lock, queue, list) {
+			if (lock->ml.node != dlm->node_num) {
+				nonlocal++;
+				continue;
+			}
+			cookie = be64_to_cpu(lock->ml.cookie);
+			mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+			     "%s list\n", dlm->name, res->lockname.len,
+			     res->lockname.name,
+			     dlm_get_lock_cookie_node(cookie),
+			     dlm_get_lock_cookie_seq(cookie),
+			     dlm_list_in_text(idx));
+			return 0;
+		}
+	}
+
+	if (!nonlocal) {
+		node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+		if (node_ref >= O2NM_MAX_NODES)
+			return 0;
+	}
+
+	mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+	     res->lockname.name);
+
+	return 1;
+}
+
+/*
+ * DLM_MIGRATE_LOCKRES
+ */
+
+
+static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res, u8 target)
+{
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_master_list_entry *oldmle = NULL;
+ 	struct dlm_migratable_lockres *mres = NULL;
+	int ret = 0;
+	const char *name;
+	unsigned int namelen;
+	int mle_added = 0;
+	int wake = 0;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(target == O2NM_MAX_NODES);
+
+	name = res->lockname.name;
+	namelen = res->lockname.len;
+
+	mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
+	     target);
+
+	/* preallocate up front. if this fails, abort */
+	ret = -ENOMEM;
+	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
+	if (!mres) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+	if (!mle) {
+		mlog_errno(ret);
+		goto leave;
+	}
+	ret = 0;
+
+	/*
+	 * clear any existing master requests and
+	 * add the migration mle to the list
+	 */
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+				    namelen, target, dlm->node_num);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (ret == -EEXIST) {
+		mlog(0, "another process is already migrating it\n");
+		goto fail;
+	}
+	mle_added = 1;
+
+	/*
+	 * set the MIGRATING flag and flush asts
+	 * if we fail after this we need to re-dirty the lockres
+	 */
+	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
+		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
+		     "the target went down.\n", res->lockname.len,
+		     res->lockname.name, target);
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_MIGRATING;
+		wake = 1;
+		spin_unlock(&res->spinlock);
+		ret = -EINVAL;
+	}
+
+fail:
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (ret < 0) {
+		if (mle_added) {
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+		} else if (mle) {
+			kmem_cache_free(dlm_mle_cache, mle);
+			mle = NULL;
+		}
+		goto leave;
+	}
+
+	/*
+	 * at this point, we have a migration target, an mle
+	 * in the master list, and the MIGRATING flag set on
+	 * the lockres
+	 */
+
+	/* now that remote nodes are spinning on the MIGRATING flag,
+	 * ensure that all assert_master work is flushed. */
+	flush_workqueue(dlm->dlm_worker);
+
+	/* get an extra reference on the mle.
+	 * otherwise the assert_master from the new
+	 * master will destroy this.
+	 * also, make sure that all callers of dlm_get_mle
+	 * take both dlm->spinlock and dlm->master_lock */
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	dlm_get_mle_inuse(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	/* notify new node and send all lock state */
+	/* call send_one_lockres with migration flag.
+	 * this serves as notice to the target node that a
+	 * migration is starting. */
+	ret = dlm_send_one_lockres(dlm, res, mres, target,
+				   DLM_MRES_MIGRATION);
+
+	if (ret < 0) {
+		mlog(0, "migration to node %u failed with %d\n",
+		     target, ret);
+		/* migration failed, detach and clean up mle */
+		dlm_mle_detach_hb_events(dlm, mle);
+		dlm_put_mle(mle);
+		dlm_put_mle_inuse(mle);
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_MIGRATING;
+		wake = 1;
+		spin_unlock(&res->spinlock);
+		if (dlm_is_host_down(ret))
+			dlm_wait_for_node_death(dlm, target,
+						DLM_NODE_DEATH_WAIT_MAX);
+		goto leave;
+	}
+
+	/* at this point, the target sends a message to all nodes,
+	 * (using dlm_do_migrate_request).  this node is skipped since
+	 * we had to put an mle in the list to begin the process.  this
+	 * node now waits for target to do an assert master.  this node
+	 * will be the last one notified, ensuring that the migration
+	 * is complete everywhere.  if the target dies while this is
+	 * going on, some nodes could potentially see the target as the
+	 * master, so it is important that my recovery finds the migration
+	 * mle and sets the master to UNKNOWN. */
+
+
+	/* wait for new node to assert master */
+	while (1) {
+		ret = wait_event_interruptible_timeout(mle->wq,
+					(atomic_read(&mle->woken) == 1),
+					msecs_to_jiffies(5000));
+
+		if (ret >= 0) {
+		       	if (atomic_read(&mle->woken) == 1 ||
+			    res->owner == target)
+				break;
+
+			mlog(0, "%s:%.*s: timed out during migration\n",
+			     dlm->name, res->lockname.len, res->lockname.name);
+			/* avoid hang during shutdown when migrating lockres
+			 * to a node which also goes down */
+			if (dlm_is_node_dead(dlm, target)) {
+				mlog(0, "%s:%.*s: expected migration "
+				     "target %u is no longer up, restarting\n",
+				     dlm->name, res->lockname.len,
+				     res->lockname.name, target);
+				ret = -EINVAL;
+				/* migration failed, detach and clean up mle */
+				dlm_mle_detach_hb_events(dlm, mle);
+				dlm_put_mle(mle);
+				dlm_put_mle_inuse(mle);
+				spin_lock(&res->spinlock);
+				res->state &= ~DLM_LOCK_RES_MIGRATING;
+				wake = 1;
+				spin_unlock(&res->spinlock);
+				goto leave;
+			}
+		} else
+			mlog(0, "%s:%.*s: caught signal during migration\n",
+			     dlm->name, res->lockname.len, res->lockname.name);
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, target);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	dlm_remove_nonlocal_locks(dlm, res);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle_inuse(mle);
+	ret = 0;
+
+	dlm_lockres_calc_usage(dlm, res);
+
+leave:
+	/* re-dirty the lockres if we failed */
+	if (ret < 0)
+		dlm_kick_thread(dlm, res);
+
+	/* wake up waiters if the MIGRATING flag got set
+	 * but migration failed */
+	if (wake)
+		wake_up(&res->wq);
+
+	if (mres)
+		free_page((unsigned long)mres);
+
+	dlm_put(dlm);
+
+	mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
+	     name, target, ret);
+	return ret;
+}
+
+#define DLM_MIGRATION_RETRY_MS  100
+
+/*
+ * Should be called only after beginning the domain leave process.
+ * There should not be any remaining locks on nonlocal lock resources,
+ * and there should be no local locks left on locally mastered resources.
+ *
+ * Called with the dlm spinlock held, may drop it to do migration, but
+ * will re-acquire before exit.
+ *
+ * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
+ */
+int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	int ret;
+	int lock_dropped = 0;
+	u8 target = O2NM_MAX_NODES;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	spin_lock(&res->spinlock);
+	if (dlm_is_lockres_migrateable(dlm, res))
+		target = dlm_pick_migration_target(dlm, res);
+	spin_unlock(&res->spinlock);
+
+	if (target == O2NM_MAX_NODES)
+		goto leave;
+
+	/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
+	spin_unlock(&dlm->spinlock);
+	lock_dropped = 1;
+	ret = dlm_migrate_lockres(dlm, res, target);
+	if (ret)
+		mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     target, ret);
+	spin_lock(&dlm->spinlock);
+leave:
+	return lock_dropped;
+}
+
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	int ret;
+	spin_lock(&dlm->ast_lock);
+	spin_lock(&lock->spinlock);
+	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&dlm->ast_lock);
+	return ret;
+}
+
+static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     u8 mig_target)
+{
+	int can_proceed;
+	spin_lock(&res->spinlock);
+	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
+	spin_unlock(&res->spinlock);
+
+	/* target has died, so make the caller break out of the
+	 * wait_event, but caller must recheck the domain_map */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(mig_target, dlm->domain_map))
+		can_proceed = 1;
+	spin_unlock(&dlm->spinlock);
+	return can_proceed;
+}
+
+static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res)
+{
+	int ret;
+	spin_lock(&res->spinlock);
+	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
+	spin_unlock(&res->spinlock);
+	return ret;
+}
+
+
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res,
+				       u8 target)
+{
+	int ret = 0;
+
+	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
+	       res->lockname.len, res->lockname.name, dlm->node_num,
+	       target);
+	/* need to set MIGRATING flag on lockres.  this is done by
+	 * ensuring that all asts have been flushed for this lockres. */
+	spin_lock(&res->spinlock);
+	BUG_ON(res->migration_pending);
+	res->migration_pending = 1;
+	/* strategy is to reserve an extra ast then release
+	 * it below, letting the release do all of the work */
+	__dlm_lockres_reserve_ast(res);
+	spin_unlock(&res->spinlock);
+
+	/* now flush all the pending asts */
+	dlm_kick_thread(dlm, res);
+	/* before waiting on DIRTY, block processes which may
+	 * try to dirty the lockres before MIGRATING is set */
+	spin_lock(&res->spinlock);
+	BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
+	res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
+	spin_unlock(&res->spinlock);
+	/* now wait on any pending asts and the DIRTY state */
+	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+	dlm_lockres_release_ast(dlm, res);
+
+	mlog(0, "about to wait on migration_wq, dirty=%s\n",
+	       res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+	/* if the extra ref we just put was the final one, this
+	 * will pass thru immediately.  otherwise, we need to wait
+	 * for the last ast to finish. */
+again:
+	ret = wait_event_interruptible_timeout(dlm->migration_wq,
+		   dlm_migration_can_proceed(dlm, res, target),
+		   msecs_to_jiffies(1000));
+	if (ret < 0) {
+		mlog(0, "woken again: migrating? %s, dead? %s\n",
+		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
+		       test_bit(target, dlm->domain_map) ? "no":"yes");
+	} else {
+		mlog(0, "all is well: migrating? %s, dead? %s\n",
+		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
+		       test_bit(target, dlm->domain_map) ? "no":"yes");
+	}
+	if (!dlm_migration_can_proceed(dlm, res, target)) {
+		mlog(0, "trying again...\n");
+		goto again;
+	}
+
+	ret = 0;
+	/* did the target go down or die? */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(target, dlm->domain_map)) {
+		mlog(ML_ERROR, "aha. migration target %u just went down\n",
+		     target);
+		ret = -EHOSTDOWN;
+	}
+	spin_unlock(&dlm->spinlock);
+
+	/*
+	 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+	 * another try; otherwise, we are sure the MIGRATING state is there,
+	 * drop the unneded state which blocked threads trying to DIRTY
+	 */
+	spin_lock(&res->spinlock);
+	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+	if (!ret)
+		BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+	spin_unlock(&res->spinlock);
+
+	/*
+	 * at this point:
+	 *
+	 *   o the DLM_LOCK_RES_MIGRATING flag is set if target not down
+	 *   o there are no pending asts on this lockres
+	 *   o all processes trying to reserve an ast on this
+	 *     lockres must wait for the MIGRATING flag to clear
+	 */
+	return ret;
+}
+
+/* last step in the migration process.
+ * original master calls this to free all of the dlm_lock
+ * structures that used to be for other nodes. */
+static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res)
+{
+	struct list_head *queue = &res->granted;
+	int i, bit;
+	struct dlm_lock *lock, *next;
+
+	assert_spin_locked(&res->spinlock);
+
+	BUG_ON(res->owner == dlm->node_num);
+
+	for (i=0; i<3; i++) {
+		list_for_each_entry_safe(lock, next, queue, list) {
+			if (lock->ml.node != dlm->node_num) {
+				mlog(0, "putting lock for node %u\n",
+				     lock->ml.node);
+				/* be extra careful */
+				BUG_ON(!list_empty(&lock->ast_list));
+				BUG_ON(!list_empty(&lock->bast_list));
+				BUG_ON(lock->ast_pending);
+				BUG_ON(lock->bast_pending);
+				dlm_lockres_clear_refmap_bit(dlm, res,
+							     lock->ml.node);
+				list_del_init(&lock->list);
+				dlm_lock_put(lock);
+				/* In a normal unlock, we would have added a
+				 * DLM_UNLOCK_FREE_LOCK action. Force it. */
+				dlm_lock_put(lock);
+			}
+		}
+		queue++;
+	}
+	bit = 0;
+	while (1) {
+		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
+		if (bit >= O2NM_MAX_NODES)
+			break;
+		/* do not clear the local node reference, if there is a
+		 * process holding this, let it drop the ref itself */
+		if (bit != dlm->node_num) {
+			mlog(0, "%s:%.*s: node %u had a ref to this "
+			     "migrating lockres, clearing\n", dlm->name,
+			     res->lockname.len, res->lockname.name, bit);
+			dlm_lockres_clear_refmap_bit(dlm, res, bit);
+		}
+		bit++;
+	}
+}
+
+/*
+ * Pick a node to migrate the lock resource to. This function selects a
+ * potential target based first on the locks and then on refmap. It skips
+ * nodes that are in the process of exiting the domain.
+ */
+static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res)
+{
+	enum dlm_lockres_list idx;
+	struct list_head *queue = &res->granted;
+	struct dlm_lock *lock;
+	int noderef;
+	u8 nodenum = O2NM_MAX_NODES;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* Go through all the locks */
+	for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+		queue = dlm_list_idx_to_ptr(res, idx);
+		list_for_each_entry(lock, queue, list) {
+			if (lock->ml.node == dlm->node_num)
+				continue;
+			if (test_bit(lock->ml.node, dlm->exit_domain_map))
+				continue;
+			nodenum = lock->ml.node;
+			goto bail;
+		}
+	}
+
+	/* Go thru the refmap */
+	noderef = -1;
+	while (1) {
+		noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
+					noderef + 1);
+		if (noderef >= O2NM_MAX_NODES)
+			break;
+		if (noderef == dlm->node_num)
+			continue;
+		if (test_bit(noderef, dlm->exit_domain_map))
+			continue;
+		nodenum = noderef;
+		goto bail;
+	}
+
+bail:
+	return nodenum;
+}
+
+/* this is called by the new master once all lockres
+ * data has been received */
+static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  u8 master, u8 new_master,
+				  struct dlm_node_iter *iter)
+{
+	struct dlm_migrate_request migrate;
+	int ret, skip, status = 0;
+	int nodenum;
+
+	memset(&migrate, 0, sizeof(migrate));
+	migrate.namelen = res->lockname.len;
+	memcpy(migrate.name, res->lockname.name, migrate.namelen);
+	migrate.new_master = new_master;
+	migrate.master = master;
+
+	ret = 0;
+
+	/* send message to all nodes, except the master and myself */
+	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
+		if (nodenum == master ||
+		    nodenum == new_master)
+			continue;
+
+		/* We could race exit domain. If exited, skip. */
+		spin_lock(&dlm->spinlock);
+		skip = (!test_bit(nodenum, dlm->domain_map));
+		spin_unlock(&dlm->spinlock);
+		if (skip) {
+			clear_bit(nodenum, iter->node_map);
+			continue;
+		}
+
+		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
+					 &migrate, sizeof(migrate), nodenum,
+					 &status);
+		if (ret < 0) {
+			mlog(ML_ERROR, "%s: res %.*s, Error %d send "
+			     "MIGRATE_REQUEST to node %u\n", dlm->name,
+			     migrate.namelen, migrate.name, ret, nodenum);
+			if (!dlm_is_host_down(ret)) {
+				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+				BUG();
+			}
+			clear_bit(nodenum, iter->node_map);
+			ret = 0;
+		} else if (status < 0) {
+			mlog(0, "migrate request (node %u) returned %d!\n",
+			     nodenum, status);
+			ret = status;
+		} else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
+			/* during the migration request we short-circuited
+			 * the mastery of the lockres.  make sure we have
+			 * a mastery ref for nodenum */
+			mlog(0, "%s:%.*s: need ref for node %u\n",
+			     dlm->name, res->lockname.len, res->lockname.name,
+			     nodenum);
+			spin_lock(&res->spinlock);
+			dlm_lockres_set_refmap_bit(dlm, res, nodenum);
+			spin_unlock(&res->spinlock);
+		}
+	}
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	mlog(0, "returning ret=%d\n", ret);
+	return ret;
+}
+
+
+/* if there is an existing mle for this lockres, we now know who the master is.
+ * (the one who sent us *this* message) we can clear it up right away.
+ * since the process that put the mle on the list still has a reference to it,
+ * we can unhash it now, set the master and wake the process.  as a result,
+ * we will have no mle in the list to start with.  now we can add an mle for
+ * the migration and this should be the only one found for those scanning the
+ * list.  */
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
+	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
+	const char *name;
+	unsigned int namelen, hash;
+	int ret = 0;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	name = migrate->name;
+	namelen = migrate->namelen;
+	hash = dlm_lockid_hash(name, namelen);
+
+	/* preallocate.. if this fails, abort */
+	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+
+	if (!mle) {
+		ret = -ENOMEM;
+		goto leave;
+	}
+
+	/* check for pre-existing lock */
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			/* if all is working ok, this can only mean that we got
+		 	* a migrate request from a node that we now see as
+		 	* dead.  what can we do here?  drop it to the floor? */
+			spin_unlock(&res->spinlock);
+			mlog(ML_ERROR, "Got a migrate request, but the "
+			     "lockres is marked as recovering!");
+			kmem_cache_free(dlm_mle_cache, mle);
+			ret = -EINVAL; /* need a better solution */
+			goto unlock;
+		}
+		res->state |= DLM_LOCK_RES_MIGRATING;
+		spin_unlock(&res->spinlock);
+	}
+
+	spin_lock(&dlm->master_lock);
+	/* ignore status.  only nonzero status would BUG. */
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
+				    name, namelen,
+				    migrate->new_master,
+				    migrate->master);
+
+	spin_unlock(&dlm->master_lock);
+unlock:
+	spin_unlock(&dlm->spinlock);
+
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (res)
+		dlm_lockres_put(res);
+leave:
+	dlm_put(dlm);
+	return ret;
+}
+
+/* must be holding dlm->spinlock and dlm->master_lock
+ * when adding a migration mle, we can clear any other mles
+ * in the master list because we know with certainty that
+ * the master is "master".  so we remove any old mle from
+ * the list after setting it's master field, and then add
+ * the new migration mle.  this way we can hold with the rule
+ * of having only one mle for a given lock name at all times. */
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master)
+{
+	int found;
+	int ret = 0;
+
+	*oldmle = NULL;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	/* caller is responsible for any ref taken here on oldmle */
+	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
+	if (found) {
+		struct dlm_master_list_entry *tmp = *oldmle;
+		spin_lock(&tmp->spinlock);
+		if (tmp->type == DLM_MLE_MIGRATION) {
+			if (master == dlm->node_num) {
+				/* ah another process raced me to it */
+				mlog(0, "tried to migrate %.*s, but some "
+				     "process beat me to it\n",
+				     namelen, name);
+				ret = -EEXIST;
+			} else {
+				/* bad.  2 NODES are trying to migrate! */
+				mlog(ML_ERROR, "migration error  mle: "
+				     "master=%u new_master=%u // request: "
+				     "master=%u new_master=%u // "
+				     "lockres=%.*s\n",
+				     tmp->master, tmp->new_master,
+				     master, new_master,
+				     namelen, name);
+				BUG();
+			}
+		} else {
+			/* this is essentially what assert_master does */
+			tmp->master = master;
+			atomic_set(&tmp->woken, 1);
+			wake_up(&tmp->wq);
+			/* remove it so that only one mle will be found */
+			__dlm_unlink_mle(dlm, tmp);
+			__dlm_mle_detach_hb_events(dlm, tmp);
+			if (tmp->type == DLM_MLE_MASTER) {
+				ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+				mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+						"telling master to get ref "
+						"for cleared out mle during "
+						"migration\n", dlm->name,
+						namelen, name, master,
+						new_master);
+			}
+		}
+		spin_unlock(&tmp->spinlock);
+	}
+
+	/* now add a migration mle to the tail of the list */
+	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
+	mle->new_master = new_master;
+	/* the new master will be sending an assert master for this.
+	 * at that point we will get the refmap reference */
+	mle->master = master;
+	/* do this for consistency with other mle types */
+	set_bit(new_master, mle->maybe_map);
+	__dlm_insert_mle(dlm, mle);
+
+	return ret;
+}
+
+/*
+ * Sets the owner of the lockres, associated to the mle, to UNKNOWN
+ */
+static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
+					struct dlm_master_list_entry *mle)
+{
+	struct dlm_lock_resource *res;
+
+	/* Find the lockres associated to the mle and set its owner to UNK */
+	res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
+				   mle->mnamehash);
+	if (res) {
+		spin_unlock(&dlm->master_lock);
+
+		/* move lockres onto recovery list */
+		spin_lock(&res->spinlock);
+		dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+		dlm_move_lockres_to_recovery_list(dlm, res);
+		spin_unlock(&res->spinlock);
+		dlm_lockres_put(res);
+
+		/* about to get rid of mle, detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
+
+		/* dump the mle */
+		spin_lock(&dlm->master_lock);
+		__dlm_put_mle(mle);
+		spin_unlock(&dlm->master_lock);
+	}
+
+	return res;
+}
+
+static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
+				    struct dlm_master_list_entry *mle)
+{
+	__dlm_mle_detach_hb_events(dlm, mle);
+
+	spin_lock(&mle->spinlock);
+	__dlm_unlink_mle(dlm, mle);
+	atomic_set(&mle->woken, 1);
+	spin_unlock(&mle->spinlock);
+
+	wake_up(&mle->wq);
+}
+
+static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle, u8 dead_node)
+{
+	int bit;
+
+	BUG_ON(mle->type != DLM_MLE_BLOCK);
+
+	spin_lock(&mle->spinlock);
+	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+	if (bit != dead_node) {
+		mlog(0, "mle found, but dead node %u would not have been "
+		     "master\n", dead_node);
+		spin_unlock(&mle->spinlock);
+	} else {
+		/* Must drop the refcount by one since the assert_master will
+		 * never arrive. This may result in the mle being unlinked and
+		 * freed, but there may still be a process waiting in the
+		 * dlmlock path which is fine. */
+		mlog(0, "node %u was expected master\n", dead_node);
+		atomic_set(&mle->woken, 1);
+		spin_unlock(&mle->spinlock);
+		wake_up(&mle->wq);
+
+		/* Do not need events any longer, so detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
+		__dlm_put_mle(mle);
+	}
+}
+
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+	struct hlist_head *bucket;
+	struct hlist_node *tmp;
+	unsigned int i;
+
+	mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);
+
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
+			BUG_ON(mle->type != DLM_MLE_BLOCK &&
+			       mle->type != DLM_MLE_MASTER &&
+			       mle->type != DLM_MLE_MIGRATION);
+
+			/* MASTER mles are initiated locally. The waiting
+			 * process will notice the node map change shortly.
+			 * Let that happen as normal. */
+			if (mle->type == DLM_MLE_MASTER)
+				continue;
+
+			/* BLOCK mles are initiated by other nodes. Need to
+			 * clean up if the dead node would have been the
+			 * master. */
+			if (mle->type == DLM_MLE_BLOCK) {
+				dlm_clean_block_mle(dlm, mle, dead_node);
+				continue;
+			}
+
+			/* Everything else is a MIGRATION mle */
+
+			/* The rule for MIGRATION mles is that the master
+			 * becomes UNKNOWN if *either* the original or the new
+			 * master dies. All UNKNOWN lockres' are sent to
+			 * whichever node becomes the recovery master. The new
+			 * master is responsible for determining if there is
+			 * still a master for this lockres, or if he needs to
+			 * take over mastery. Either way, this node should
+			 * expect another message to resolve this. */
+
+			if (mle->master != dead_node &&
+			    mle->new_master != dead_node)
+				continue;
+
+			/* If we have reached this point, this mle needs to be
+			 * removed from the list and freed. */
+			dlm_clean_migration_mle(dlm, mle);
+
+			mlog(0, "%s: node %u died during migration from "
+			     "%u to %u!\n", dlm->name, dead_node, mle->master,
+			     mle->new_master);
+
+			/* If we find a lockres associated with the mle, we've
+			 * hit this rare case that messes up our lock ordering.
+			 * If so, we need to drop the master lock so that we can
+			 * take the lockres lock, meaning that we will have to
+			 * restart from the head of list. */
+			res = dlm_reset_mleres_owner(dlm, mle);
+			if (res)
+				/* restart */
+				goto top;
+
+			/* This may be the last reference */
+			__dlm_put_mle(mle);
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+}
+
+int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 u8 old_master)
+{
+	struct dlm_node_iter iter;
+	int ret = 0;
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	clear_bit(old_master, iter.node_map);
+	clear_bit(dlm->node_num, iter.node_map);
+	spin_unlock(&dlm->spinlock);
+
+	/* ownership of the lockres is changing.  account for the
+	 * mastery reference here since old_master will briefly have
+	 * a reference after the migration completes */
+	spin_lock(&res->spinlock);
+	dlm_lockres_set_refmap_bit(dlm, res, old_master);
+	spin_unlock(&res->spinlock);
+
+	mlog(0, "now time to do a migrate request to other nodes\n");
+	ret = dlm_do_migrate_request(dlm, res, old_master,
+				     dlm->node_num, &iter);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	mlog(0, "doing assert master of %.*s to all except the original node\n",
+	     res->lockname.len, res->lockname.name);
+	/* this call now finishes out the nodemap
+	 * even if one or more nodes die */
+	ret = dlm_do_assert_master(dlm, res, iter.node_map,
+				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
+	if (ret < 0) {
+		/* no longer need to retry.  all living nodes contacted. */
+		mlog_errno(ret);
+		ret = 0;
+	}
+
+	memset(iter.node_map, 0, sizeof(iter.node_map));
+	set_bit(old_master, iter.node_map);
+	mlog(0, "doing assert master of %.*s back to %u\n",
+	     res->lockname.len, res->lockname.name, old_master);
+	ret = dlm_do_assert_master(dlm, res, iter.node_map,
+				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
+	if (ret < 0) {
+		mlog(0, "assert master to original master failed "
+		     "with %d.\n", ret);
+		/* the only nonzero status here would be because of
+		 * a dead original node.  we're done. */
+		ret = 0;
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, dlm->node_num);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	spin_unlock(&res->spinlock);
+	/* re-dirty it on the new master */
+	dlm_kick_thread(dlm, res);
+	wake_up(&res->wq);
+leave:
+	return ret;
+}
+
+/*
+ * LOCKRES AST REFCOUNT
+ * this is integral to migration
+ */
+
+/* for future intent to call an ast, reserve one ahead of time.
+ * this should be called only after waiting on the lockres
+ * with dlm_wait_on_lockres, and while still holding the
+ * spinlock after the call. */
+void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		__dlm_print_one_lock_resource(res);
+	}
+	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+
+	atomic_inc(&res->asts_reserved);
+}
+
+/*
+ * used to drop the reserved ast, either because it went unused,
+ * or because the ast/bast was actually called.
+ *
+ * also, if there is a pending migration on this lockres,
+ * and this was the last pending ast on the lockres,
+ * atomically set the MIGRATING flag before we drop the lock.
+ * this is how we ensure that migration can proceed with no
+ * asts in progress.  note that it is ok if the state of the
+ * queues is such that a lock should be granted in the future
+ * or that a bast should be fired, because the new master will
+ * shuffle the lists on this lockres as soon as it is migrated.
+ */
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res)
+{
+	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
+		return;
+
+	if (!res->migration_pending) {
+		spin_unlock(&res->spinlock);
+		return;
+	}
+
+	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+	res->migration_pending = 0;
+	res->state |= DLM_LOCK_RES_MIGRATING;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+	wake_up(&dlm->migration_wq);
+}
+
+void dlm_force_free_mles(struct dlm_ctxt *dlm)
+{
+	int i;
+	struct hlist_head *bucket;
+	struct dlm_master_list_entry *mle;
+	struct hlist_node *tmp;
+
+	/*
+	 * We notified all other nodes that we are exiting the domain and
+	 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
+	 * around we force free them and wake any processes that are waiting
+	 * on the mles
+	 */
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+
+	BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
+	BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
+			if (mle->type != DLM_MLE_BLOCK) {
+				mlog(ML_ERROR, "bad mle: %p\n", mle);
+				dlm_print_one_mle(mle);
+			}
+			atomic_set(&mle->woken, 1);
+			wake_up(&mle->wq);
+
+			__dlm_unlink_mle(dlm, mle);
+			__dlm_mle_detach_hb_events(dlm, mle);
+			__dlm_put_mle(mle);
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+}
diff --git a/kernel/fs/ocfs2/dlm/dlmrecovery.c b/kernel/fs/ocfs2/dlm/dlmrecovery.c
new file mode 100644
index 000000000..ce12e0b1a
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmrecovery.c
@@ -0,0 +1,2935 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
+#include "cluster/masklog.h"
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
+
+static int dlm_recovery_thread(void *data);
+static int dlm_do_recovery(struct dlm_ctxt *dlm);
+
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_request_all_locks(struct dlm_ctxt *dlm,
+				 u8 request_from, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master);
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				    struct dlm_migratable_lockres *mres,
+				    u8 send_to,
+				    struct dlm_lock_resource *res,
+				    int total_locks);
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres);
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
+				 u8 dead_node, u8 send_to);
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list, u8 dead_node);
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master);
+static void dlm_reco_ast(void *astdata);
+static void dlm_reco_bast(void *astdata, int blocked_type);
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
+static void dlm_request_all_locks_worker(struct dlm_work_item *item,
+					 void *data);
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master);
+
+static u64 dlm_get_next_mig_cookie(void);
+
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
+static u64 dlm_mig_cookie = 1;
+
+static u64 dlm_get_next_mig_cookie(void)
+{
+	u64 c;
+	spin_lock(&dlm_mig_cookie_lock);
+	c = dlm_mig_cookie;
+	if (dlm_mig_cookie == (~0ULL))
+		dlm_mig_cookie = 1;
+	else
+		dlm_mig_cookie++;
+	spin_unlock(&dlm_mig_cookie_lock);
+	return c;
+}
+
+static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
+					  u8 dead_node)
+{
+	assert_spin_locked(&dlm->spinlock);
+	if (dlm->reco.dead_node != dead_node)
+		mlog(0, "%s: changing dead_node from %u to %u\n",
+		     dlm->name, dlm->reco.dead_node, dead_node);
+	dlm->reco.dead_node = dead_node;
+}
+
+static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
+				       u8 master)
+{
+	assert_spin_locked(&dlm->spinlock);
+	mlog(0, "%s: changing new_master from %u to %u\n",
+	     dlm->name, dlm->reco.new_master, master);
+	dlm->reco.new_master = master;
+}
+
+static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+	assert_spin_locked(&dlm->spinlock);
+	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
+	dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	__dlm_reset_recovery(dlm);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Worker function used during recovery. */
+void dlm_dispatch_work(struct work_struct *work)
+{
+	struct dlm_ctxt *dlm =
+		container_of(work, struct dlm_ctxt, dispatched_work);
+	LIST_HEAD(tmp_list);
+	struct dlm_work_item *item, *next;
+	dlm_workfunc_t *workfunc;
+	int tot=0;
+
+	spin_lock(&dlm->work_lock);
+	list_splice_init(&dlm->work_list, &tmp_list);
+	spin_unlock(&dlm->work_lock);
+
+	list_for_each_entry(item, &tmp_list, list) {
+		tot++;
+	}
+	mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
+
+	list_for_each_entry_safe(item, next, &tmp_list, list) {
+		workfunc = item->func;
+		list_del_init(&item->list);
+
+		/* already have ref on dlm to avoid having
+		 * it disappear.  just double-check. */
+		BUG_ON(item->dlm != dlm);
+
+		/* this is allowed to sleep and
+		 * call network stuff */
+		workfunc(item, item->data);
+
+		dlm_put(dlm);
+		kfree(item);
+	}
+}
+
+/*
+ * RECOVERY THREAD
+ */
+
+void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
+{
+	/* wake the recovery thread
+	 * this will wake the reco thread in one of three places
+	 * 1) sleeping with no recovery happening
+	 * 2) sleeping with recovery mastered elsewhere
+	 * 3) recovery mastered here, waiting on reco data */
+
+	wake_up(&dlm->dlm_reco_thread_wq);
+}
+
+/* Launch the recovery thread */
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "starting dlm recovery thread...\n");
+
+	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
+						"dlm_reco_thread");
+	if (IS_ERR(dlm->dlm_reco_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
+		dlm->dlm_reco_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_reco_thread_task) {
+		mlog(0, "waiting for dlm recovery thread to exit\n");
+		kthread_stop(dlm->dlm_reco_thread_task);
+		dlm->dlm_reco_thread_task = NULL;
+	}
+}
+
+
+
+/*
+ * this is lame, but here's how recovery works...
+ * 1) all recovery threads cluster wide will work on recovering
+ *    ONE node at a time
+ * 2) negotiate who will take over all the locks for the dead node.
+ *    thats right... ALL the locks.
+ * 3) once a new master is chosen, everyone scans all locks
+ *    and moves aside those mastered by the dead guy
+ * 4) each of these locks should be locked until recovery is done
+ * 5) the new master collects up all of secondary lock queue info
+ *    one lock at a time, forcing each node to communicate back
+ *    before continuing
+ * 6) each secondary lock queue responds with the full known lock info
+ * 7) once the new master has run all its locks, it sends a ALLDONE!
+ *    message to everyone
+ * 8) upon receiving this message, the secondary queue node unlocks
+ *    and responds to the ALLDONE
+ * 9) once the new master gets responses from everyone, he unlocks
+ *    everything and recovery for this dead node is done
+ *10) go back to 2) while there are still dead nodes
+ *
+ */
+
+static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
+{
+	struct dlm_reco_node_data *ndata;
+	struct dlm_lock_resource *res;
+
+	mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
+	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
+	     dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
+	     dlm->reco.dead_node, dlm->reco.new_master);
+
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+		char *st = "unknown";
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+				st = "init";
+				break;
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				st = "requesting";
+				break;
+			case DLM_RECO_NODE_DATA_DEAD:
+				st = "dead";
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+				st = "receiving";
+				break;
+			case DLM_RECO_NODE_DATA_REQUESTED:
+				st = "requested";
+				break;
+			case DLM_RECO_NODE_DATA_DONE:
+				st = "done";
+				break;
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				st = "finalize-sent";
+				break;
+			default:
+				st = "bad";
+				break;
+		}
+		mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
+		     dlm->name, ndata->node_num, st);
+	}
+	list_for_each_entry(res, &dlm->reco.resources, recovering) {
+		mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+	}
+}
+
+#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
+
+static int dlm_recovery_thread(void *data)
+{
+	int status;
+	struct dlm_ctxt *dlm = data;
+	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		if (dlm_domain_fully_joined(dlm)) {
+			status = dlm_do_recovery(dlm);
+			if (status == -EAGAIN) {
+				/* do not sleep, recheck immediately. */
+				continue;
+			}
+			if (status < 0)
+				mlog_errno(status);
+		}
+
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+						 kthread_should_stop(),
+						 timeout);
+	}
+
+	mlog(0, "quitting DLM recovery thread\n");
+	return 0;
+}
+
+/* returns true when the recovery master has contacted us */
+static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
+{
+	int ready;
+	spin_lock(&dlm->spinlock);
+	ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
+	spin_unlock(&dlm->spinlock);
+	return ready;
+}
+
+/* returns true if node is no longer in the domain
+ * could be dead or just not joined */
+int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
+{
+	int dead;
+	spin_lock(&dlm->spinlock);
+	dead = !test_bit(node, dlm->domain_map);
+	spin_unlock(&dlm->spinlock);
+	return dead;
+}
+
+/* returns true if node is no longer in the domain
+ * could be dead or just not joined */
+static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
+{
+	int recovered;
+	spin_lock(&dlm->spinlock);
+	recovered = !test_bit(node, dlm->recovery_map);
+	spin_unlock(&dlm->spinlock);
+	return recovered;
+}
+
+
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+	if (dlm_is_node_dead(dlm, node))
+		return;
+
+	printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
+	       "domain %s\n", node, dlm->name);
+
+	if (timeout)
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+				   dlm_is_node_dead(dlm, node),
+				   msecs_to_jiffies(timeout));
+	else
+		wait_event(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_dead(dlm, node));
+}
+
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+	if (dlm_is_node_recovered(dlm, node))
+		return;
+
+	printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
+	       "domain %s\n", node, dlm->name);
+
+	if (timeout)
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+				   dlm_is_node_recovered(dlm, node),
+				   msecs_to_jiffies(timeout));
+	else
+		wait_event(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_recovered(dlm, node));
+}
+
+/* callers of the top-level api calls (dlmlock/dlmunlock) should
+ * block on the dlm->reco.event when recovery is in progress.
+ * the dlm recovery thread will set this state when it begins
+ * recovering a dead node (as the new master or not) and clear
+ * the state and wake as soon as all affected lock resources have
+ * been marked with the RECOVERY flag */
+static int dlm_in_recovery(struct dlm_ctxt *dlm)
+{
+	int in_recovery;
+	spin_lock(&dlm->spinlock);
+	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+	spin_unlock(&dlm->spinlock);
+	return in_recovery;
+}
+
+
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
+{
+	if (dlm_in_recovery(dlm)) {
+		mlog(0, "%s: reco thread %d in recovery: "
+		     "state=%d, master=%u, dead=%u\n",
+		     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
+		     dlm->reco.state, dlm->reco.new_master,
+		     dlm->reco.dead_node);
+	}
+	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
+}
+
+static void dlm_begin_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+	printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
+	       dlm->name, dlm->reco.dead_node);
+	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_end_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
+	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+	printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
+	wake_up(&dlm->reco.event);
+}
+
+static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
+{
+	printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
+	       "dead node %u in domain %s\n", dlm->reco.new_master,
+	       (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
+	       dlm->reco.dead_node, dlm->name);
+}
+
+static int dlm_do_recovery(struct dlm_ctxt *dlm)
+{
+	int status = 0;
+	int ret;
+
+	spin_lock(&dlm->spinlock);
+
+	/* check to see if the new master has died */
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
+	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+		mlog(0, "new master %u died while recovering %u!\n",
+		     dlm->reco.new_master, dlm->reco.dead_node);
+		/* unset the new_master, leave dead_node */
+		dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
+	}
+
+	/* select a target to recover */
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		int bit;
+
+		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
+		if (bit >= O2NM_MAX_NODES || bit < 0)
+			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
+		else
+			dlm_set_reco_dead_node(dlm, bit);
+	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+		/* BUG? */
+		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
+		     dlm->reco.dead_node);
+		dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
+	}
+
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		// mlog(0, "nothing to recover!  sleeping now!\n");
+		spin_unlock(&dlm->spinlock);
+		/* return to main thread loop and sleep. */
+		return 0;
+	}
+	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
+	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
+	     dlm->reco.dead_node);
+	spin_unlock(&dlm->spinlock);
+
+	/* take write barrier */
+	/* (stops the list reshuffling thread, proxy ast handling) */
+	dlm_begin_recovery(dlm);
+
+	if (dlm->reco.new_master == dlm->node_num)
+		goto master_here;
+
+	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+		/* choose a new master, returns 0 if this node
+		 * is the master, -EEXIST if it's another node.
+		 * this does not return until a new master is chosen
+		 * or recovery completes entirely. */
+		ret = dlm_pick_recovery_master(dlm);
+		if (!ret) {
+			/* already notified everyone.  go. */
+			goto master_here;
+		}
+		mlog(0, "another node will master this recovery session.\n");
+	}
+
+	dlm_print_recovery_master(dlm);
+
+	/* it is safe to start everything back up here
+	 * because all of the dead node's lock resources
+	 * have been marked as in-recovery */
+	dlm_end_recovery(dlm);
+
+	/* sleep out in main dlm_recovery_thread loop. */
+	return 0;
+
+master_here:
+	dlm_print_recovery_master(dlm);
+
+	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
+	if (status < 0) {
+		/* we should never hit this anymore */
+		mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
+		     "retrying.\n", dlm->name, status, dlm->reco.dead_node);
+		/* yield a bit to allow any final network messages
+		 * to get handled on remaining nodes */
+		msleep(100);
+	} else {
+		/* success!  see if any other nodes need recovery */
+		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
+		     dlm->name, dlm->reco.dead_node, dlm->node_num);
+		spin_lock(&dlm->spinlock);
+		__dlm_reset_recovery(dlm);
+		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+		spin_unlock(&dlm->spinlock);
+	}
+	dlm_end_recovery(dlm);
+
+	/* continue and look for another dead node */
+	return -EAGAIN;
+}
+
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int status = 0;
+	struct dlm_reco_node_data *ndata;
+	int all_nodes_done;
+	int destroy = 0;
+	int pass = 0;
+
+	do {
+		/* we have become recovery master.  there is no escaping
+		 * this, so just keep trying until we get it. */
+		status = dlm_init_recovery_area(dlm, dead_node);
+		if (status < 0) {
+			mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+			     "retrying\n", dlm->name);
+			msleep(1000);
+		}
+	} while (status != 0);
+
+	/* safe to access the node data list without a lock, since this
+	 * process is the only one to change the list */
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
+		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
+
+		mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
+		     ndata->node_num);
+
+		if (ndata->node_num == dlm->node_num) {
+			ndata->state = DLM_RECO_NODE_DATA_DONE;
+			continue;
+		}
+
+		do {
+			status = dlm_request_all_locks(dlm, ndata->node_num,
+						       dead_node);
+			if (status < 0) {
+				mlog_errno(status);
+				if (dlm_is_host_down(status)) {
+					/* node died, ignore it for recovery */
+					status = 0;
+					ndata->state = DLM_RECO_NODE_DATA_DEAD;
+					/* wait for the domain map to catch up
+					 * with the network state. */
+					wait_event_timeout(dlm->dlm_reco_thread_wq,
+							   dlm_is_node_dead(dlm,
+								ndata->node_num),
+							   msecs_to_jiffies(1000));
+					mlog(0, "waited 1 sec for %u, "
+					     "dead? %s\n", ndata->node_num,
+					     dlm_is_node_dead(dlm, ndata->node_num) ?
+					     "yes" : "no");
+				} else {
+					/* -ENOMEM on the other node */
+					mlog(0, "%s: node %u returned "
+					     "%d during recovery, retrying "
+					     "after a short wait\n",
+					     dlm->name, ndata->node_num,
+					     status);
+					msleep(100);
+				}
+			}
+		} while (status != 0);
+
+		spin_lock(&dlm_reco_state_lock);
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_DEAD:
+				mlog(0, "node %u died after requesting "
+				     "recovery info for node %u\n",
+				     ndata->node_num, dead_node);
+				/* fine.  don't need this node's info.
+				 * continue without it. */
+				break;
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
+				mlog(0, "now receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+				mlog(0, "already receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_DONE:
+				mlog(0, "already DONE receiving recovery data "
+				     "from node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+		}
+		spin_unlock(&dlm_reco_state_lock);
+	}
+
+	mlog(0, "%s: Done requesting all lock info\n", dlm->name);
+
+	/* nodes should be sending reco data now
+	 * just need to wait */
+
+	while (1) {
+		/* check all the nodes now to see if we are
+		 * done, or if anyone died */
+		all_nodes_done = 1;
+		spin_lock(&dlm_reco_state_lock);
+		list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+			mlog(0, "checking recovery state of node %u\n",
+			     ndata->node_num);
+			switch (ndata->state) {
+				case DLM_RECO_NODE_DATA_INIT:
+				case DLM_RECO_NODE_DATA_REQUESTING:
+					mlog(ML_ERROR, "bad ndata state for "
+					     "node %u: state=%d\n",
+					     ndata->node_num, ndata->state);
+					BUG();
+					break;
+				case DLM_RECO_NODE_DATA_DEAD:
+					mlog(0, "node %u died after "
+					     "requesting recovery info for "
+					     "node %u\n", ndata->node_num,
+					     dead_node);
+					break;
+				case DLM_RECO_NODE_DATA_RECEIVING:
+				case DLM_RECO_NODE_DATA_REQUESTED:
+					mlog(0, "%s: node %u still in state %s\n",
+					     dlm->name, ndata->node_num,
+					     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
+					     "receiving" : "requested");
+					all_nodes_done = 0;
+					break;
+				case DLM_RECO_NODE_DATA_DONE:
+					mlog(0, "%s: node %u state is done\n",
+					     dlm->name, ndata->node_num);
+					break;
+				case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+					mlog(0, "%s: node %u state is finalize\n",
+					     dlm->name, ndata->node_num);
+					break;
+			}
+		}
+		spin_unlock(&dlm_reco_state_lock);
+
+		mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
+		     all_nodes_done?"yes":"no");
+		if (all_nodes_done) {
+			int ret;
+
+			/* Set this flag on recovery master to avoid
+			 * a new recovery for another dead node start
+			 * before the recovery is not done. That may
+			 * cause recovery hung.*/
+			spin_lock(&dlm->spinlock);
+			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+			spin_unlock(&dlm->spinlock);
+
+			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
+	 		 * just send a finalize message to everyone and
+	 		 * clean up */
+			mlog(0, "all nodes are done! send finalize\n");
+			ret = dlm_send_finalize_reco_message(dlm);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			spin_lock(&dlm->spinlock);
+			dlm_finish_local_lockres_recovery(dlm, dead_node,
+							  dlm->node_num);
+			spin_unlock(&dlm->spinlock);
+			mlog(0, "should be done with recovery!\n");
+
+			mlog(0, "finishing recovery of %s at %lu, "
+			     "dead=%u, this=%u, new=%u\n", dlm->name,
+			     jiffies, dlm->reco.dead_node,
+			     dlm->node_num, dlm->reco.new_master);
+			destroy = 1;
+			status = 0;
+			/* rescan everything marked dirty along the way */
+			dlm_kick_thread(dlm, NULL);
+			break;
+		}
+		/* wait to be signalled, with periodic timeout
+		 * to check for node death */
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+					 kthread_should_stop(),
+					 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
+
+	}
+
+	if (destroy)
+		dlm_destroy_recovery_area(dlm, dead_node);
+
+	return status;
+}
+
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int num=0;
+	struct dlm_reco_node_data *ndata;
+
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+	/* nodes can only be removed (by dying) after dropping
+	 * this lock, and death will be trapped later, so this should do */
+	spin_unlock(&dlm->spinlock);
+
+	while (1) {
+		num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
+		if (num >= O2NM_MAX_NODES) {
+			break;
+		}
+		BUG_ON(num == dead_node);
+
+		ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
+		if (!ndata) {
+			dlm_destroy_recovery_area(dlm, dead_node);
+			return -ENOMEM;
+		}
+		ndata->node_num = num;
+		ndata->state = DLM_RECO_NODE_DATA_INIT;
+		spin_lock(&dlm_reco_state_lock);
+		list_add_tail(&ndata->list, &dlm->reco.node_data);
+		spin_unlock(&dlm_reco_state_lock);
+		num++;
+	}
+
+	return 0;
+}
+
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_reco_node_data *ndata, *next;
+	LIST_HEAD(tmplist);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_splice_init(&dlm->reco.node_data, &tmplist);
+	spin_unlock(&dlm_reco_state_lock);
+
+	list_for_each_entry_safe(ndata, next, &tmplist, list) {
+		list_del_init(&ndata->list);
+		kfree(ndata);
+	}
+}
+
+static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
+				 u8 dead_node)
+{
+	struct dlm_lock_request lr;
+	int ret;
+	int status;
+
+	mlog(0, "\n");
+
+
+	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
+		  "to %u\n", dead_node, request_from);
+
+	memset(&lr, 0, sizeof(lr));
+	lr.node_idx = dlm->node_num;
+	lr.dead_node = dead_node;
+
+	// send message
+	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
+				 &lr, sizeof(lr), request_from, &status);
+
+	/* negative status is handled by caller */
+	if (ret < 0)
+		mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
+		     "to recover dead node %u\n", dlm->name, ret,
+		     request_from, dead_node);
+	else
+		ret = status;
+	// return from here, then
+	// sleep until all received or error
+	return ret;
+
+}
+
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
+				  void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	if (lr->dead_node != dlm->reco.dead_node) {
+		mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
+		     "dead_node is %u\n", dlm->name, lr->node_idx,
+		     lr->dead_node, dlm->reco.dead_node);
+		dlm_print_reco_node_status(dlm);
+		/* this is a hack */
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+	BUG_ON(lr->dead_node != dlm->reco.dead_node);
+
+	item = kzalloc(sizeof(*item), GFP_NOFS);
+	if (!item) {
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+
+	/* this will get freed by dlm_request_all_locks_worker */
+	buf = (char *) __get_free_page(GFP_NOFS);
+	if (!buf) {
+		kfree(item);
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+
+	/* queue up work for dlm_request_all_locks_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
+	item->u.ral.reco_master = lr->node_idx;
+	item->u.ral.dead_node = lr->dead_node;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
+
+	dlm_put(dlm);
+	return 0;
+}
+
+static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_migratable_lockres *mres;
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm;
+	LIST_HEAD(resources);
+	int ret;
+	u8 dead_node, reco_master;
+	int skip_all_done = 0;
+
+	dlm = item->dlm;
+	dead_node = item->u.ral.dead_node;
+	reco_master = item->u.ral.reco_master;
+	mres = (struct dlm_migratable_lockres *)data;
+
+	mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
+	     dlm->name, dead_node, reco_master);
+
+	if (dead_node != dlm->reco.dead_node ||
+	    reco_master != dlm->reco.new_master) {
+		/* worker could have been created before the recovery master
+		 * died.  if so, do not continue, but do not error. */
+		if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+			mlog(ML_NOTICE, "%s: will not send recovery state, "
+			     "recovery master %u died, thread=(dead=%u,mas=%u)"
+			     " current=(dead=%u,mas=%u)\n", dlm->name,
+			     reco_master, dead_node, reco_master,
+			     dlm->reco.dead_node, dlm->reco.new_master);
+		} else {
+			mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
+			     "master=%u), request(dead=%u, master=%u)\n",
+			     dlm->name, dlm->reco.dead_node,
+			     dlm->reco.new_master, dead_node, reco_master);
+		}
+		goto leave;
+	}
+
+	/* lock resources should have already been moved to the
+ 	 * dlm->reco.resources list.  now move items from that list
+ 	 * to a temp list if the dead owner matches.  note that the
+	 * whole cluster recovers only one node at a time, so we
+	 * can safely move UNKNOWN lock resources for each recovery
+	 * session. */
+	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
+
+	/* now we can begin blasting lockreses without the dlm lock */
+
+	/* any errors returned will be due to the new_master dying,
+	 * the dlm_reco_thread should detect this */
+	list_for_each_entry(res, &resources, recovering) {
+		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
+				   	DLM_MRES_RECOVERY);
+		if (ret < 0) {
+			mlog(ML_ERROR, "%s: node %u went down while sending "
+			     "recovery state for dead node %u, ret=%d\n", dlm->name,
+			     reco_master, dead_node, ret);
+			skip_all_done = 1;
+			break;
+		}
+	}
+
+	/* move the resources back to the list */
+	spin_lock(&dlm->spinlock);
+	list_splice_init(&resources, &dlm->reco.resources);
+	spin_unlock(&dlm->spinlock);
+
+	if (!skip_all_done) {
+		ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+		if (ret < 0) {
+			mlog(ML_ERROR, "%s: node %u went down while sending "
+			     "recovery all-done for dead node %u, ret=%d\n",
+			     dlm->name, reco_master, dead_node, ret);
+		}
+	}
+leave:
+	free_page((unsigned long)data);
+}
+
+
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
+{
+	int ret, tmpret;
+	struct dlm_reco_data_done done_msg;
+
+	memset(&done_msg, 0, sizeof(done_msg));
+	done_msg.node_idx = dlm->node_num;
+	done_msg.dead_node = dead_node;
+	mlog(0, "sending DATA DONE message to %u, "
+	     "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
+	     done_msg.dead_node);
+
+	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
+				 sizeof(done_msg), send_to, &tmpret);
+	if (ret < 0) {
+		mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
+		     "to recover dead node %u\n", dlm->name, ret, send_to,
+		     dead_node);
+		if (!dlm_is_host_down(ret)) {
+			BUG();
+		}
+	} else
+		ret = tmpret;
+	return ret;
+}
+
+
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
+	struct dlm_reco_node_data *ndata = NULL;
+	int ret = -EINVAL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+	     "node_idx=%u, this node=%u\n", done->dead_node,
+	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
+
+	mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
+			"Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+			"node_idx=%u, this node=%u\n", done->dead_node,
+			dlm->reco.dead_node, done->node_idx, dlm->node_num);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+		if (ndata->node_num != done->node_idx)
+			continue;
+
+		switch (ndata->state) {
+			/* should have moved beyond INIT but not to FINALIZE yet */
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_DEAD:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				mlog(ML_ERROR, "bad ndata state for node %u:"
+				     " state=%d\n", ndata->node_num,
+				     ndata->state);
+				BUG();
+				break;
+			/* these states are possible at this point, anywhere along
+			 * the line of recovery */
+			case DLM_RECO_NODE_DATA_DONE:
+			case DLM_RECO_NODE_DATA_RECEIVING:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				mlog(0, "node %u is DONE sending "
+					  "recovery data!\n",
+					  ndata->node_num);
+
+				ndata->state = DLM_RECO_NODE_DATA_DONE;
+				ret = 0;
+				break;
+		}
+	}
+	spin_unlock(&dlm_reco_state_lock);
+
+	/* wake the recovery thread, some node is done */
+	if (!ret)
+		dlm_kick_recovery_thread(dlm);
+
+	if (ret < 0)
+		mlog(ML_ERROR, "failed to find recovery node data for node "
+		     "%u\n", done->node_idx);
+	dlm_put(dlm);
+
+	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
+	return ret;
+}
+
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list,
+				       	u8 dead_node)
+{
+	struct dlm_lock_resource *res, *next;
+	struct dlm_lock *lock;
+
+	spin_lock(&dlm->spinlock);
+	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
+		/* always prune any $RECOVERY entries for dead nodes,
+		 * otherwise hangs can occur during later recovery */
+		if (dlm_is_recovery_lock(res->lockname.name,
+					 res->lockname.len)) {
+			spin_lock(&res->spinlock);
+			list_for_each_entry(lock, &res->granted, list) {
+				if (lock->ml.node == dead_node) {
+					mlog(0, "AHA! there was "
+					     "a $RECOVERY lock for dead "
+					     "node %u (%s)!\n",
+					     dead_node, dlm->name);
+					list_del_init(&lock->list);
+					dlm_lock_put(lock);
+					/* Can't schedule DLM_UNLOCK_FREE_LOCK
+					 * - do manually */
+					dlm_lock_put(lock);
+					break;
+				}
+			}
+			spin_unlock(&res->spinlock);
+			continue;
+		}
+
+		if (res->owner == dead_node) {
+			mlog(0, "found lockres owned by dead node while "
+				  "doing recovery for node %u. sending it.\n",
+				  dead_node);
+			list_move_tail(&res->recovering, list);
+		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "found UNKNOWN owner while doing recovery "
+				  "for node %u. sending it.\n", dead_node);
+			list_move_tail(&res->recovering, list);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
+{
+	int total_locks = 0;
+	struct list_head *iter, *queue = &res->granted;
+	int i;
+
+	for (i=0; i<3; i++) {
+		list_for_each(iter, queue)
+			total_locks++;
+		queue++;
+	}
+	return total_locks;
+}
+
+
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				      struct dlm_migratable_lockres *mres,
+				      u8 send_to,
+				      struct dlm_lock_resource *res,
+				      int total_locks)
+{
+	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
+	int mres_total_locks = be32_to_cpu(mres->total_locks);
+	int sz, ret = 0, status = 0;
+	u8 orig_flags = mres->flags,
+	   orig_master = mres->master;
+
+	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
+	if (!mres->num_locks)
+		return 0;
+
+	sz = sizeof(struct dlm_migratable_lockres) +
+		(mres->num_locks * sizeof(struct dlm_migratable_lock));
+
+	/* add an all-done flag if we reached the last lock */
+	orig_flags = mres->flags;
+	BUG_ON(total_locks > mres_total_locks);
+	if (total_locks == mres_total_locks)
+		mres->flags |= DLM_MRES_ALL_DONE;
+
+	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
+	     dlm->name, res->lockname.len, res->lockname.name,
+	     orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
+	     send_to);
+
+	/* send it */
+	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
+				 sz, send_to, &status);
+	if (ret < 0) {
+		/* XXX: negative status is not handled.
+		 * this will end up killing this node. */
+		mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
+		     "node %u (%s)\n", dlm->name, mres->lockname_len,
+		     mres->lockname, ret, send_to,
+		     (orig_flags & DLM_MRES_MIGRATION ?
+		      "migration" : "recovery"));
+	} else {
+		/* might get an -ENOMEM back here */
+		ret = status;
+		if (ret < 0) {
+			mlog_errno(ret);
+
+			if (ret == -EFAULT) {
+				mlog(ML_ERROR, "node %u told me to kill "
+				     "myself!\n", send_to);
+				BUG();
+			}
+		}
+	}
+
+	/* zero and reinit the message buffer */
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, mres_total_locks,
+				    mig_cookie, orig_flags, orig_master);
+	return ret;
+}
+
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master)
+{
+	/* mres here is one full page */
+	clear_page(mres);
+	mres->lockname_len = namelen;
+	memcpy(mres->lockname, lockname, namelen);
+	mres->num_locks = 0;
+	mres->total_locks = cpu_to_be32(total_locks);
+	mres->mig_cookie = cpu_to_be64(cookie);
+	mres->flags = flags;
+	mres->master = master;
+}
+
+static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
+					  struct dlm_migratable_lockres *mres,
+					  int queue)
+{
+	if (!lock->lksb)
+	       return;
+
+	/* Ignore lvb in all locks in the blocked list */
+	if (queue == DLM_BLOCKED_LIST)
+		return;
+
+	/* Only consider lvbs in locks with granted EX or PR lock levels */
+	if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
+		return;
+
+	if (dlm_lvb_is_empty(mres->lvb)) {
+		memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+		return;
+	}
+
+	/* Ensure the lvb copied for migration matches in other valid locks */
+	if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
+		return;
+
+	mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
+	     "node=%u\n",
+	     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+	     lock->lockres->lockname.len, lock->lockres->lockname.name,
+	     lock->ml.node);
+	dlm_print_one_lock_resource(lock->lockres);
+	BUG();
+}
+
+/* returns 1 if this lock fills the network structure,
+ * 0 otherwise */
+static int dlm_add_lock_to_array(struct dlm_lock *lock,
+				 struct dlm_migratable_lockres *mres, int queue)
+{
+	struct dlm_migratable_lock *ml;
+	int lock_num = mres->num_locks;
+
+	ml = &(mres->ml[lock_num]);
+	ml->cookie = lock->ml.cookie;
+	ml->type = lock->ml.type;
+	ml->convert_type = lock->ml.convert_type;
+	ml->highest_blocked = lock->ml.highest_blocked;
+	ml->list = queue;
+	if (lock->lksb) {
+		ml->flags = lock->lksb->flags;
+		dlm_prepare_lvb_for_migration(lock, mres, queue);
+	}
+	ml->node = lock->ml.node;
+	mres->num_locks++;
+	/* we reached the max, send this network message */
+	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
+		return 1;
+	return 0;
+}
+
+static void dlm_add_dummy_lock(struct dlm_ctxt *dlm,
+			       struct dlm_migratable_lockres *mres)
+{
+	struct dlm_lock dummy;
+	memset(&dummy, 0, sizeof(dummy));
+	dummy.ml.cookie = 0;
+	dummy.ml.type = LKM_IVMODE;
+	dummy.ml.convert_type = LKM_IVMODE;
+	dummy.ml.highest_blocked = LKM_IVMODE;
+	dummy.lksb = NULL;
+	dummy.ml.node = dlm->node_num;
+	dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST);
+}
+
+static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm,
+				    struct dlm_migratable_lock *ml,
+				    u8 *nodenum)
+{
+	if (unlikely(ml->cookie == 0 &&
+	    ml->type == LKM_IVMODE &&
+	    ml->convert_type == LKM_IVMODE &&
+	    ml->highest_blocked == LKM_IVMODE &&
+	    ml->list == DLM_BLOCKED_LIST)) {
+		*nodenum = ml->node;
+		return 1;
+	}
+	return 0;
+}
+
+int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to, u8 flags)
+{
+	struct list_head *queue;
+	int total_locks, i;
+	u64 mig_cookie = 0;
+	struct dlm_lock *lock;
+	int ret = 0;
+
+	BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	mlog(0, "sending to %u\n", send_to);
+
+	total_locks = dlm_num_locks_in_lockres(res);
+	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
+		/* rare, but possible */
+		mlog(0, "argh.  lockres has %d locks.  this will "
+			  "require more than one network packet to "
+			  "migrate\n", total_locks);
+		mig_cookie = dlm_get_next_mig_cookie();
+	}
+
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, total_locks,
+				    mig_cookie, flags, res->owner);
+
+	total_locks = 0;
+	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each_entry(lock, queue, list) {
+			/* add another lock. */
+			total_locks++;
+			if (!dlm_add_lock_to_array(lock, mres, i))
+				continue;
+
+			/* this filled the lock message,
+			 * we must send it immediately. */
+			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
+						       res, total_locks);
+			if (ret < 0)
+				goto error;
+		}
+	}
+	if (total_locks == 0) {
+		/* send a dummy lock to indicate a mastery reference only */
+		mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     send_to, flags & DLM_MRES_RECOVERY ? "recovery" :
+		     "migration");
+		dlm_add_dummy_lock(dlm, mres);
+	}
+	/* flush any remaining locks */
+	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
+	if (ret < 0)
+		goto error;
+	return ret;
+
+error:
+	mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
+	     dlm->name, ret);
+	if (!dlm_is_host_down(ret))
+		BUG();
+	mlog(0, "%s: node %u went down while sending %s "
+	     "lockres %.*s\n", dlm->name, send_to,
+	     flags & DLM_MRES_RECOVERY ?  "recovery" : "migration",
+	     res->lockname.len, res->lockname.name);
+	return ret;
+}
+
+
+
+/*
+ * this message will contain no more than one page worth of
+ * recovery data, and it will work on only one lockres.
+ * there may be many locks in this page, and we may need to wait
+ * for additional packets to complete all the locks (rare, but
+ * possible).
+ */
+/*
+ * NOTE: the allocation error cases here are scary
+ * we really cannot afford to fail an alloc in recovery
+ * do we spin?  returning an error only delays the problem really
+ */
+
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_migratable_lockres *mres =
+		(struct dlm_migratable_lockres *)msg->buf;
+	int ret = 0;
+	u8 real_master;
+	u8 extra_refs = 0;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+	struct dlm_lock_resource *res = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	real_master = mres->master;
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* cannot migrate a lockres with no master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+	}
+
+	mlog(0, "%s message received from node %u\n",
+		  (mres->flags & DLM_MRES_RECOVERY) ?
+		  "recovery" : "migration", mres->master);
+	if (mres->flags & DLM_MRES_ALL_DONE)
+		mlog(0, "all done flag.  all lockres data received!\n");
+
+	ret = -ENOMEM;
+	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
+	item = kzalloc(sizeof(*item), GFP_NOFS);
+	if (!buf || !item)
+		goto leave;
+
+	/* lookup the lock to see if we have a secondary queue for this
+	 * already...  just add the locks in and this will have its owner
+	 * and RECOVERY flag changed when it completes. */
+	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+	if (res) {
+	 	/* this will get a ref on res */
+		/* mark it as recovering/migrating and hash it */
+		spin_lock(&res->spinlock);
+		if (mres->flags & DLM_MRES_RECOVERY) {
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		} else {
+			if (res->state & DLM_LOCK_RES_MIGRATING) {
+				/* this is at least the second
+				 * lockres message */
+				mlog(0, "lock %.*s is already migrating\n",
+					  mres->lockname_len,
+					  mres->lockname);
+			} else if (res->state & DLM_LOCK_RES_RECOVERING) {
+				/* caller should BUG */
+				mlog(ML_ERROR, "node is attempting to migrate "
+				     "lock %.*s, but marked as recovering!\n",
+				     mres->lockname_len, mres->lockname);
+				ret = -EFAULT;
+				spin_unlock(&res->spinlock);
+				dlm_lockres_put(res);
+				goto leave;
+			}
+			res->state |= DLM_LOCK_RES_MIGRATING;
+		}
+		spin_unlock(&res->spinlock);
+	} else {
+		/* need to allocate, just like if it was
+		 * mastered here normally  */
+		res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
+		if (!res)
+			goto leave;
+
+		/* to match the ref that we would have gotten if
+		 * dlm_lookup_lockres had succeeded */
+		dlm_lockres_get(res);
+
+		/* mark it as recovering/migrating and hash it */
+		if (mres->flags & DLM_MRES_RECOVERY)
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		else
+			res->state |= DLM_LOCK_RES_MIGRATING;
+
+		spin_lock(&dlm->spinlock);
+		__dlm_insert_lockres(dlm, res);
+		spin_unlock(&dlm->spinlock);
+
+		/* Add an extra ref for this lock-less lockres lest the
+		 * dlm_thread purges it before we get the chance to add
+		 * locks to it */
+		dlm_lockres_get(res);
+
+		/* There are three refs that need to be put.
+		 * 1. Taken above.
+		 * 2. kref_init in dlm_new_lockres()->dlm_init_lockres().
+		 * 3. dlm_lookup_lockres()
+		 * The first one is handled at the end of this function. The
+		 * other two are handled in the worker thread after locks have
+		 * been attached. Yes, we don't wait for purge time to match
+		 * kref_init. The lockres will still have atleast one ref
+		 * added because it is in the hash __dlm_insert_lockres() */
+		extra_refs++;
+
+		/* now that the new lockres is inserted,
+		 * make it usable by other processes */
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		spin_unlock(&res->spinlock);
+		wake_up(&res->wq);
+	}
+
+	/* at this point we have allocated everything we need,
+	 * and we have a hashed lockres with an extra ref and
+	 * the proper res->state flags. */
+	ret = 0;
+	spin_lock(&res->spinlock);
+	/* drop this either when master requery finds a different master
+	 * or when a lock is added by the recovery worker */
+	dlm_lockres_grab_inflight_ref(dlm, res);
+	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* migration cannot have an unknown master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+		mlog(0, "recovery has passed me a lockres with an "
+			  "unknown owner.. will need to requery: "
+			  "%.*s\n", mres->lockname_len, mres->lockname);
+	} else {
+		/* take a reference now to pin the lockres, drop it
+		 * when locks are added in the worker */
+		dlm_change_lockres_owner(dlm, res, dlm->node_num);
+	}
+	spin_unlock(&res->spinlock);
+
+	/* queue up work for dlm_mig_lockres_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	memcpy(buf, msg->buf, be16_to_cpu(msg->data_len));  /* copy the whole message */
+	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
+	item->u.ml.lockres = res; /* already have a ref */
+	item->u.ml.real_master = real_master;
+	item->u.ml.extra_ref = extra_refs;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
+
+leave:
+	/* One extra ref taken needs to be put here */
+	if (extra_refs)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+	if (ret < 0) {
+		kfree(buf);
+		kfree(item);
+		mlog_errno(ret);
+	}
+
+	return ret;
+}
+
+
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm;
+	struct dlm_migratable_lockres *mres;
+	int ret = 0;
+	struct dlm_lock_resource *res;
+	u8 real_master;
+	u8 extra_ref;
+
+	dlm = item->dlm;
+	mres = (struct dlm_migratable_lockres *)data;
+
+	res = item->u.ml.lockres;
+	real_master = item->u.ml.real_master;
+	extra_ref = item->u.ml.extra_ref;
+
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* this case is super-rare. only occurs if
+		 * node death happens during migration. */
+again:
+		ret = dlm_lockres_master_requery(dlm, res, &real_master);
+		if (ret < 0) {
+			mlog(0, "dlm_lockres_master_requery ret=%d\n",
+				  ret);
+			goto again;
+		}
+		if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "lockres %.*s not claimed.  "
+				   "this node will take it.\n",
+				   res->lockname.len, res->lockname.name);
+		} else {
+			spin_lock(&res->spinlock);
+			dlm_lockres_drop_inflight_ref(dlm, res);
+			spin_unlock(&res->spinlock);
+			mlog(0, "master needs to respond to sender "
+				  "that node %u still owns %.*s\n",
+				  real_master, res->lockname.len,
+				  res->lockname.name);
+			/* cannot touch this lockres */
+			goto leave;
+		}
+	}
+
+	ret = dlm_process_recovery_data(dlm, res, mres);
+	if (ret < 0)
+		mlog(0, "dlm_process_recovery_data returned  %d\n", ret);
+	else
+		mlog(0, "dlm_process_recovery_data succeeded\n");
+
+	if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
+	                   (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
+		ret = dlm_finish_migration(dlm, res, mres->master);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+leave:
+	/* See comment in dlm_mig_lockres_handler() */
+	if (res) {
+		if (extra_ref)
+			dlm_lockres_put(res);
+		dlm_lockres_put(res);
+	}
+	kfree(data);
+}
+
+
+
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master)
+{
+	struct dlm_node_iter iter;
+	int nodenum;
+	int ret = 0;
+
+	*real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	/* we only reach here if one of the two nodes in a
+	 * migration died while the migration was in progress.
+	 * at this point we need to requery the master.  we
+	 * know that the new_master got as far as creating
+	 * an mle on at least one node, but we do not know
+	 * if any nodes had actually cleared the mle and set
+	 * the master to the new_master.  the old master
+	 * is supposed to set the owner to UNKNOWN in the
+	 * event of a new_master death, so the only possible
+	 * responses that we can get from nodes here are
+	 * that the master is new_master, or that the master
+	 * is UNKNOWN.
+	 * if all nodes come back with UNKNOWN then we know
+	 * the lock needs remastering here.
+	 * if any node comes back with a valid master, check
+	 * to see if that master is the one that we are
+	 * recovering.  if so, then the new_master died and
+	 * we need to remaster this lock.  if not, then the
+	 * new_master survived and that node will respond to
+	 * other nodes about the owner.
+	 * if there is an owner, this node needs to dump this
+	 * lockres and alert the sender that this lockres
+	 * was rejected. */
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		/* do not send to self */
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
+		if (ret < 0) {
+			mlog_errno(ret);
+			if (!dlm_is_host_down(ret))
+				BUG();
+			/* host is down, so answer for that node would be
+			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
+		}
+		if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "lock master is %u\n", *real_master);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			  u8 nodenum, u8 *real_master)
+{
+	int ret = -EINVAL;
+	struct dlm_master_requery req;
+	int status = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	memset(&req, 0, sizeof(req));
+	req.node_idx = dlm->node_num;
+	req.namelen = res->lockname.len;
+	memcpy(req.name, res->lockname.name, res->lockname.len);
+
+resend:
+	ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
+				 &req, sizeof(req), nodenum, &status);
+	if (ret < 0)
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+		     dlm->key, nodenum);
+	else if (status == -ENOMEM) {
+		mlog_errno(status);
+		msleep(50);
+		goto resend;
+	} else {
+		BUG_ON(status < 0);
+		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
+		*real_master = (u8) (status & 0xff);
+		mlog(0, "node %u responded to master requery with %u\n",
+			  nodenum, *real_master);
+		ret = 0;
+	}
+	return ret;
+}
+
+
+/* this function cannot error, so unless the sending
+ * or receiving of the message failed, the owner can
+ * be trusted */
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	unsigned int hash;
+	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
+	u32 flags = DLM_ASSERT_MASTER_REQUERY;
+
+	if (!dlm_grab(dlm)) {
+		/* since the domain has gone away on this
+		 * node, the proper response is UNKNOWN */
+		return master;
+	}
+
+	hash = dlm_lockid_hash(req->name, req->namelen);
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
+	if (res) {
+		spin_lock(&res->spinlock);
+		master = res->owner;
+		if (master == dlm->node_num) {
+			int ret = dlm_dispatch_assert_master(dlm, res,
+							     0, 0, flags);
+			if (ret < 0) {
+				mlog_errno(ret);
+				spin_unlock(&res->spinlock);
+				dlm_lockres_put(res);
+				spin_unlock(&dlm->spinlock);
+				dlm_put(dlm);
+				/* sender will take care of this and retry */
+				return ret;
+			} else
+				__dlm_lockres_grab_inflight_worker(dlm, res);
+			spin_unlock(&res->spinlock);
+		} else {
+			/* put.. incase we are not the master */
+			spin_unlock(&res->spinlock);
+			dlm_lockres_put(res);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+	return master;
+}
+
+static inline struct list_head *
+dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
+{
+	struct list_head *ret;
+	BUG_ON(list_num < 0);
+	BUG_ON(list_num > 2);
+	ret = &(res->granted);
+	ret += list_num;
+	return ret;
+}
+/* TODO: do ast flush business
+ * TODO: do MIGRATING and RECOVERING spinning
+ */
+
+/*
+* NOTE about in-flight requests during migration:
+*
+* Before attempting the migrate, the master has marked the lockres as
+* MIGRATING and then flushed all of its pending ASTS.  So any in-flight
+* requests either got queued before the MIGRATING flag got set, in which
+* case the lock data will reflect the change and a return message is on
+* the way, or the request failed to get in before MIGRATING got set.  In
+* this case, the caller will be told to spin and wait for the MIGRATING
+* flag to be dropped, then recheck the master.
+* This holds true for the convert, cancel and unlock cases, and since lvb
+* updates are tied to these same messages, it applies to lvb updates as
+* well.  For the lock case, there is no way a lock can be on the master
+* queue and not be on the secondary queue since the lock is always added
+* locally first.  This means that the new target node will never be sent
+* a lock that he doesn't already have on the list.
+* In total, this means that the local lock is correct and should not be
+* updated to match the one sent by the master.  Any messages sent back
+* from the master before the MIGRATING flag will bring the lock properly
+* up-to-date, and the change will be ordered properly for the waiter.
+* We will *not* attempt to modify the lock underneath the waiter.
+*/
+
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres)
+{
+	struct dlm_migratable_lock *ml;
+	struct list_head *queue, *iter;
+	struct list_head *tmpq = NULL;
+	struct dlm_lock *newlock = NULL;
+	struct dlm_lockstatus *lksb = NULL;
+	int ret = 0;
+	int i, j, bad;
+	struct dlm_lock *lock;
+	u8 from = O2NM_MAX_NODES;
+	unsigned int added = 0;
+	__be64 c;
+
+	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
+	for (i=0; i<mres->num_locks; i++) {
+		ml = &(mres->ml[i]);
+
+		if (dlm_is_dummy_lock(dlm, ml, &from)) {
+			/* placeholder, just need to set the refmap bit */
+			BUG_ON(mres->num_locks != 1);
+			mlog(0, "%s:%.*s: dummy lock for %u\n",
+			     dlm->name, mres->lockname_len, mres->lockname,
+			     from);
+			spin_lock(&res->spinlock);
+			dlm_lockres_set_refmap_bit(dlm, res, from);
+			spin_unlock(&res->spinlock);
+			added++;
+			break;
+		}
+		BUG_ON(ml->highest_blocked != LKM_IVMODE);
+		newlock = NULL;
+		lksb = NULL;
+
+		queue = dlm_list_num_to_pointer(res, ml->list);
+		tmpq = NULL;
+
+		/* if the lock is for the local node it needs to
+		 * be moved to the proper location within the queue.
+		 * do not allocate a new lock structure. */
+		if (ml->node == dlm->node_num) {
+			/* MIGRATION ONLY! */
+			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+
+			lock = NULL;
+			spin_lock(&res->spinlock);
+			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
+				tmpq = dlm_list_idx_to_ptr(res, j);
+				list_for_each(iter, tmpq) {
+					lock = list_entry(iter,
+						  struct dlm_lock, list);
+					if (lock->ml.cookie == ml->cookie)
+						break;
+					lock = NULL;
+				}
+				if (lock)
+					break;
+			}
+
+			/* lock is always created locally first, and
+			 * destroyed locally last.  it must be on the list */
+			if (!lock) {
+				c = ml->cookie;
+				mlog(ML_ERROR, "Could not find local lock "
+					       "with cookie %u:%llu, node %u, "
+					       "list %u, flags 0x%x, type %d, "
+					       "conv %d, highest blocked %d\n",
+				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+				     ml->node, ml->list, ml->flags, ml->type,
+				     ml->convert_type, ml->highest_blocked);
+				__dlm_print_one_lock_resource(res);
+				BUG();
+			}
+
+			if (lock->ml.node != ml->node) {
+				c = lock->ml.cookie;
+				mlog(ML_ERROR, "Mismatched node# in lock "
+				     "cookie %u:%llu, name %.*s, node %u\n",
+				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+				     res->lockname.len, res->lockname.name,
+				     lock->ml.node);
+				c = ml->cookie;
+				mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
+				     "node %u, list %u, flags 0x%x, type %d, "
+				     "conv %d, highest blocked %d\n",
+				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+				     ml->node, ml->list, ml->flags, ml->type,
+				     ml->convert_type, ml->highest_blocked);
+				__dlm_print_one_lock_resource(res);
+				BUG();
+			}
+
+			if (tmpq != queue) {
+				c = ml->cookie;
+				mlog(0, "Lock cookie %u:%llu was on list %u "
+				     "instead of list %u for %.*s\n",
+				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+				     j, ml->list, res->lockname.len,
+				     res->lockname.name);
+				__dlm_print_one_lock_resource(res);
+				spin_unlock(&res->spinlock);
+				continue;
+			}
+
+			/* see NOTE above about why we do not update
+			 * to match the master here */
+
+			/* move the lock to its proper place */
+			/* do not alter lock refcount.  switching lists. */
+			list_move_tail(&lock->list, queue);
+			spin_unlock(&res->spinlock);
+			added++;
+
+			mlog(0, "just reordered a local lock!\n");
+			continue;
+		}
+
+		/* lock is for another node. */
+		newlock = dlm_new_lock(ml->type, ml->node,
+				       be64_to_cpu(ml->cookie), NULL);
+		if (!newlock) {
+			ret = -ENOMEM;
+			goto leave;
+		}
+		lksb = newlock->lksb;
+		dlm_lock_attach_lockres(newlock, res);
+
+		if (ml->convert_type != LKM_IVMODE) {
+			BUG_ON(queue != &res->converting);
+			newlock->ml.convert_type = ml->convert_type;
+		}
+		lksb->flags |= (ml->flags &
+				(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+
+		if (ml->type == LKM_NLMODE)
+			goto skip_lvb;
+
+		/*
+		 * If the lock is in the blocked list it can't have a valid lvb,
+		 * so skip it
+		 */
+		if (ml->list == DLM_BLOCKED_LIST)
+			goto skip_lvb;
+
+		if (!dlm_lvb_is_empty(mres->lvb)) {
+			if (lksb->flags & DLM_LKSB_PUT_LVB) {
+				/* other node was trying to update
+				 * lvb when node died.  recreate the
+				 * lksb with the updated lvb. */
+				memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+				/* the lock resource lvb update must happen
+				 * NOW, before the spinlock is dropped.
+				 * we no longer wait for the AST to update
+				 * the lvb. */
+				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
+			} else {
+				/* otherwise, the node is sending its
+				 * most recent valid lvb info */
+				BUG_ON(ml->type != LKM_EXMODE &&
+				       ml->type != LKM_PRMODE);
+				if (!dlm_lvb_is_empty(res->lvb) &&
+ 				    (ml->type == LKM_EXMODE ||
+ 				     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+ 					int i;
+ 					mlog(ML_ERROR, "%s:%.*s: received bad "
+ 					     "lvb! type=%d\n", dlm->name,
+ 					     res->lockname.len,
+ 					     res->lockname.name, ml->type);
+ 					printk("lockres lvb=[");
+ 					for (i=0; i<DLM_LVB_LEN; i++)
+ 						printk("%02x", res->lvb[i]);
+ 					printk("]\nmigrated lvb=[");
+ 					for (i=0; i<DLM_LVB_LEN; i++)
+ 						printk("%02x", mres->lvb[i]);
+ 					printk("]\n");
+ 					dlm_print_one_lock_resource(res);
+ 					BUG();
+				}
+				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
+			}
+		}
+skip_lvb:
+
+		/* NOTE:
+		 * wrt lock queue ordering and recovery:
+		 *    1. order of locks on granted queue is
+		 *       meaningless.
+		 *    2. order of locks on converting queue is
+		 *       LOST with the node death.  sorry charlie.
+		 *    3. order of locks on the blocked queue is
+		 *       also LOST.
+		 * order of locks does not affect integrity, it
+		 * just means that a lock request may get pushed
+		 * back in line as a result of the node death.
+		 * also note that for a given node the lock order
+		 * for its secondary queue locks is preserved
+		 * relative to each other, but clearly *not*
+		 * preserved relative to locks from other nodes.
+		 */
+		bad = 0;
+		spin_lock(&res->spinlock);
+		list_for_each_entry(lock, queue, list) {
+			if (lock->ml.cookie == ml->cookie) {
+				c = lock->ml.cookie;
+				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
+				     "exists on this lockres!\n", dlm->name,
+				     res->lockname.len, res->lockname.name,
+				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)));
+
+				mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
+				     "node=%u, cookie=%u:%llu, queue=%d\n",
+	      			     ml->type, ml->convert_type, ml->node,
+				     dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)),
+				     ml->list);
+
+				__dlm_print_one_lock_resource(res);
+				bad = 1;
+				break;
+			}
+		}
+		if (!bad) {
+			dlm_lock_get(newlock);
+			if (mres->flags & DLM_MRES_RECOVERY &&
+					ml->list == DLM_CONVERTING_LIST &&
+					newlock->ml.type >
+					newlock->ml.convert_type) {
+				/* newlock is doing downconvert, add it to the
+				 * head of converting list */
+				list_add(&newlock->list, queue);
+			} else
+				list_add_tail(&newlock->list, queue);
+			mlog(0, "%s:%.*s: added lock for node %u, "
+			     "setting refmap bit\n", dlm->name,
+			     res->lockname.len, res->lockname.name, ml->node);
+			dlm_lockres_set_refmap_bit(dlm, res, ml->node);
+			added++;
+		}
+		spin_unlock(&res->spinlock);
+	}
+	mlog(0, "done running all the locks\n");
+
+leave:
+	/* balance the ref taken when the work was queued */
+	spin_lock(&res->spinlock);
+	dlm_lockres_drop_inflight_ref(dlm, res);
+	spin_unlock(&res->spinlock);
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res)
+{
+	int i;
+	struct list_head *queue;
+	struct dlm_lock *lock, *next;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+	res->state |= DLM_LOCK_RES_RECOVERING;
+	if (!list_empty(&res->recovering)) {
+		mlog(0,
+		     "Recovering res %s:%.*s, is already on recovery list!\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		list_del_init(&res->recovering);
+		dlm_lockres_put(res);
+	}
+	/* We need to hold a reference while on the recovery list */
+	dlm_lockres_get(res);
+	list_add_tail(&res->recovering, &dlm->reco.resources);
+
+	/* find any pending locks and put them back on proper list */
+	for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each_entry_safe(lock, next, queue, list) {
+			dlm_lock_get(lock);
+			if (lock->convert_pending) {
+				/* move converting lock back to granted */
+				BUG_ON(i != DLM_CONVERTING_LIST);
+				mlog(0, "node died with convert pending "
+				     "on %.*s. move back to granted list.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_revert_pending_convert(res, lock);
+				lock->convert_pending = 0;
+			} else if (lock->lock_pending) {
+				/* remove pending lock requests completely */
+				BUG_ON(i != DLM_BLOCKED_LIST);
+				mlog(0, "node died with lock pending "
+				     "on %.*s. remove from blocked list and skip.\n",
+				     res->lockname.len, res->lockname.name);
+				/* lock will be floating until ref in
+				 * dlmlock_remote is freed after the network
+				 * call returns.  ok for it to not be on any
+				 * list since no ast can be called
+				 * (the master is dead). */
+				dlm_revert_pending_lock(res, lock);
+				lock->lock_pending = 0;
+			} else if (lock->unlock_pending) {
+				/* if an unlock was in progress, treat as
+				 * if this had completed successfully
+				 * before sending this lock state to the
+				 * new master.  note that the dlm_unlock
+				 * call is still responsible for calling
+				 * the unlockast.  that will happen after
+				 * the network call times out.  for now,
+				 * just move lists to prepare the new
+				 * recovery master.  */
+				BUG_ON(i != DLM_GRANTED_LIST);
+				mlog(0, "node died with unlock pending "
+				     "on %.*s. remove from blocked list and skip.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_commit_pending_unlock(res, lock);
+				lock->unlock_pending = 0;
+			} else if (lock->cancel_pending) {
+				/* if a cancel was in progress, treat as
+				 * if this had completed successfully
+				 * before sending this lock state to the
+				 * new master */
+				BUG_ON(i != DLM_CONVERTING_LIST);
+				mlog(0, "node died with cancel pending "
+				     "on %.*s. move back to granted list.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_commit_pending_cancel(res, lock);
+				lock->cancel_pending = 0;
+			}
+			dlm_lock_put(lock);
+		}
+	}
+}
+
+
+
+/* removes all recovered locks from the recovery list.
+ * sets the res->owner to the new master.
+ * unsets the RECOVERY flag and wakes waiters. */
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master)
+{
+	int i;
+	struct hlist_head *bucket;
+	struct dlm_lock_resource *res, *next;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
+		if (res->owner == dead_node) {
+			mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+			     dlm->name, res->lockname.len, res->lockname.name,
+			     res->owner, new_master);
+			list_del_init(&res->recovering);
+			spin_lock(&res->spinlock);
+			/* new_master has our reference from
+			 * the lock state sent during recovery */
+			dlm_change_lockres_owner(dlm, res, new_master);
+			res->state &= ~DLM_LOCK_RES_RECOVERING;
+			if (__dlm_lockres_has_locks(res))
+				__dlm_dirty_lockres(dlm, res);
+			spin_unlock(&res->spinlock);
+			wake_up(&res->wq);
+			dlm_lockres_put(res);
+		}
+	}
+
+	/* this will become unnecessary eventually, but
+	 * for now we need to run the whole hash, clear
+	 * the RECOVERING state and set the owner
+	 * if necessary */
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_lockres_hash(dlm, i);
+		hlist_for_each_entry(res, bucket, hash_node) {
+			if (!(res->state & DLM_LOCK_RES_RECOVERING))
+				continue;
+
+			if (res->owner != dead_node &&
+			    res->owner != dlm->node_num)
+				continue;
+
+			if (!list_empty(&res->recovering)) {
+				list_del_init(&res->recovering);
+				dlm_lockres_put(res);
+			}
+
+			/* new_master has our reference from
+			 * the lock state sent during recovery */
+			mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+			     dlm->name, res->lockname.len, res->lockname.name,
+			     res->owner, new_master);
+			spin_lock(&res->spinlock);
+			dlm_change_lockres_owner(dlm, res, new_master);
+			res->state &= ~DLM_LOCK_RES_RECOVERING;
+			if (__dlm_lockres_has_locks(res))
+				__dlm_dirty_lockres(dlm, res);
+			spin_unlock(&res->spinlock);
+			wake_up(&res->wq);
+		}
+	}
+}
+
+static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
+{
+	if (local) {
+		if (lock->ml.type != LKM_EXMODE &&
+		    lock->ml.type != LKM_PRMODE)
+			return 1;
+	} else if (lock->ml.type == LKM_EXMODE)
+		return 1;
+	return 0;
+}
+
+static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res, u8 dead_node)
+{
+	struct list_head *queue;
+	struct dlm_lock *lock;
+	int blank_lvb = 0, local = 0;
+	int i;
+	u8 search_node;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (res->owner == dlm->node_num)
+		/* if this node owned the lockres, and if the dead node
+		 * had an EX when he died, blank out the lvb */
+		search_node = dead_node;
+	else {
+		/* if this is a secondary lockres, and we had no EX or PR
+		 * locks granted, we can no longer trust the lvb */
+		search_node = dlm->node_num;
+		local = 1;  /* check local state for valid lvb */
+	}
+
+	for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each_entry(lock, queue, list) {
+			if (lock->ml.node == search_node) {
+				if (dlm_lvb_needs_invalidation(lock, local)) {
+					/* zero the lksb lvb and lockres lvb */
+					blank_lvb = 1;
+					memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
+				}
+			}
+		}
+	}
+
+	if (blank_lvb) {
+		mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
+		     res->lockname.len, res->lockname.name, dead_node);
+		memset(res->lvb, 0, DLM_LVB_LEN);
+	}
+}
+
+static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res, u8 dead_node)
+{
+	struct dlm_lock *lock, *next;
+	unsigned int freed = 0;
+
+	/* this node is the lockres master:
+	 * 1) remove any stale locks for the dead node
+	 * 2) if the dead node had an EX when he died, blank out the lvb
+	 */
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* We do two dlm_lock_put(). One for removing from list and the other is
+	 * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */
+
+	/* TODO: check pending_asts, pending_basts here */
+	list_for_each_entry_safe(lock, next, &res->granted, list) {
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+			dlm_lock_put(lock);
+			freed++;
+		}
+	}
+	list_for_each_entry_safe(lock, next, &res->converting, list) {
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+			dlm_lock_put(lock);
+			freed++;
+		}
+	}
+	list_for_each_entry_safe(lock, next, &res->blocked, list) {
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+			dlm_lock_put(lock);
+			freed++;
+		}
+	}
+
+	if (freed) {
+		mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
+		     "dropping ref from lockres\n", dlm->name,
+		     res->lockname.len, res->lockname.name, freed, dead_node);
+		if(!test_bit(dead_node, res->refmap)) {
+			mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
+			     "but ref was not set\n", dlm->name,
+			     res->lockname.len, res->lockname.name, freed, dead_node);
+			__dlm_print_one_lock_resource(res);
+		}
+		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+	} else if (test_bit(dead_node, res->refmap)) {
+		mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+		     "no locks and had not purged before dying\n", dlm->name,
+		     res->lockname.len, res->lockname.name, dead_node);
+		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+	}
+
+	/* do not kick thread yet */
+	__dlm_dirty_lockres(dlm, res);
+}
+
+/* if this node is the recovery master, and there are no
+ * locks for a given lockres owned by this node that are in
+ * either PR or EX mode, zero out the lvb before requesting.
+ *
+ */
+
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_lock_resource *res;
+	int i;
+	struct hlist_head *bucket;
+	struct dlm_lock *lock;
+
+
+	/* purge any stale mles */
+	dlm_clean_master_list(dlm, dead_node);
+
+	/*
+	 * now clean up all lock resources.  there are two rules:
+	 *
+	 * 1) if the dead node was the master, move the lockres
+	 *    to the recovering list.  set the RECOVERING flag.
+	 *    this lockres needs to be cleaned up before it can
+	 *    be used further.
+	 *
+	 * 2) if this node was the master, remove all locks from
+	 *    each of the lockres queues that were owned by the
+	 *    dead node.  once recovery finishes, the dlm thread
+	 *    can be kicked again to see if any ASTs or BASTs
+	 *    need to be fired as a result.
+	 */
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_lockres_hash(dlm, i);
+		hlist_for_each_entry(res, bucket, hash_node) {
+ 			/* always prune any $RECOVERY entries for dead nodes,
+ 			 * otherwise hangs can occur during later recovery */
+			if (dlm_is_recovery_lock(res->lockname.name,
+						 res->lockname.len)) {
+				spin_lock(&res->spinlock);
+				list_for_each_entry(lock, &res->granted, list) {
+					if (lock->ml.node == dead_node) {
+						mlog(0, "AHA! there was "
+						     "a $RECOVERY lock for dead "
+						     "node %u (%s)!\n",
+						     dead_node, dlm->name);
+						list_del_init(&lock->list);
+						dlm_lock_put(lock);
+						/* Can't schedule
+						 * DLM_UNLOCK_FREE_LOCK
+						 * - do manually */
+						dlm_lock_put(lock);
+						break;
+					}
+				}
+				spin_unlock(&res->spinlock);
+				continue;
+			}
+			spin_lock(&res->spinlock);
+			/* zero the lvb if necessary */
+			dlm_revalidate_lvb(dlm, res, dead_node);
+			if (res->owner == dead_node) {
+				if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+					mlog(ML_NOTICE, "%s: res %.*s, Skip "
+					     "recovery as it is being freed\n",
+					     dlm->name, res->lockname.len,
+					     res->lockname.name);
+				} else
+					dlm_move_lockres_to_recovery_list(dlm,
+									  res);
+
+			} else if (res->owner == dlm->node_num) {
+				dlm_free_dead_locks(dlm, res, dead_node);
+				__dlm_lockres_calc_usage(dlm, res);
+			} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+				if (test_bit(dead_node, res->refmap)) {
+					mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+						"no locks and had not purged before dying\n",
+						dlm->name, res->lockname.len,
+						res->lockname.name, dead_node);
+					dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+				}
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+
+}
+
+static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	if (dlm->reco.new_master == idx) {
+		mlog(0, "%s: recovery master %d just died\n",
+		     dlm->name, idx);
+		if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+			/* finalize1 was reached, so it is safe to clear
+			 * the new_master and dead_node.  that recovery
+			 * is complete. */
+			mlog(0, "%s: dead master %d had reached "
+			     "finalize1 state, clearing\n", dlm->name, idx);
+			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+			__dlm_reset_recovery(dlm);
+		}
+	}
+
+	/* Clean up join state on node death. */
+	if (dlm->joining_node == idx) {
+		mlog(0, "Clearing join state for node %u\n", idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	}
+
+	/* check to see if the node is already considered dead */
+	if (!test_bit(idx, dlm->live_nodes_map)) {
+		mlog(0, "for domain %s, node %d is already dead. "
+		     "another node likely did recovery already.\n",
+		     dlm->name, idx);
+		return;
+	}
+
+	/* check to see if we do not care about this node */
+	if (!test_bit(idx, dlm->domain_map)) {
+		/* This also catches the case that we get a node down
+		 * but haven't joined the domain yet. */
+		mlog(0, "node %u already removed from domain!\n", idx);
+		return;
+	}
+
+	clear_bit(idx, dlm->live_nodes_map);
+
+	/* make sure local cleanup occurs before the heartbeat events */
+	if (!test_bit(idx, dlm->recovery_map))
+		dlm_do_local_recovery_cleanup(dlm, idx);
+
+	/* notify anything attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, idx, 0);
+
+	mlog(0, "node %u being removed from domain map!\n", idx);
+	clear_bit(idx, dlm->domain_map);
+	clear_bit(idx, dlm->exit_domain_map);
+	/* wake up migration waiters if a node goes down.
+	 * perhaps later we can genericize this for other waiters. */
+	wake_up(&dlm->migration_wq);
+
+	if (test_bit(idx, dlm->recovery_map))
+		mlog(0, "domain %s, node %u already added "
+		     "to recovery map!\n", dlm->name, idx);
+	else
+		set_bit(idx, dlm->recovery_map);
+}
+
+void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+
+	if (!dlm_grab(dlm))
+		return;
+
+	/*
+	 * This will notify any dlm users that a node in our domain
+	 * went away without notifying us first.
+	 */
+	if (test_bit(idx, dlm->domain_map))
+		dlm_fire_domain_eviction_callbacks(dlm, idx);
+
+	spin_lock(&dlm->spinlock);
+	__dlm_hb_node_down(dlm, idx);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+}
+
+void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+
+	if (!dlm_grab(dlm))
+		return;
+
+	spin_lock(&dlm->spinlock);
+	set_bit(idx, dlm->live_nodes_map);
+	/* do NOT notify mle attached to the heartbeat events.
+	 * new nodes are not interesting in mastery until joined. */
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+}
+
+static void dlm_reco_ast(void *astdata)
+{
+	struct dlm_ctxt *dlm = astdata;
+	mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n",
+	     dlm->node_num, dlm->name);
+}
+static void dlm_reco_bast(void *astdata, int blocked_type)
+{
+	struct dlm_ctxt *dlm = astdata;
+	mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n",
+	     dlm->node_num, dlm->name);
+}
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
+{
+	mlog(0, "unlockast for recovery lock fired!\n");
+}
+
+/*
+ * dlm_pick_recovery_master will continually attempt to use
+ * dlmlock() on the special "$RECOVERY" lockres with the
+ * LKM_NOQUEUE flag to get an EX.  every thread that enters
+ * this function on each node racing to become the recovery
+ * master will not stop attempting this until either:
+ * a) this node gets the EX (and becomes the recovery master),
+ * or b) dlm->reco.new_master gets set to some nodenum
+ * != O2NM_INVALID_NODE_NUM (another node will do the reco).
+ * so each time a recovery master is needed, the entire cluster
+ * will sync at this point.  if the new master dies, that will
+ * be detected in dlm_do_recovery */
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
+{
+	enum dlm_status ret;
+	struct dlm_lockstatus lksb;
+	int status = -EINVAL;
+
+	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
+	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
+again:
+	memset(&lksb, 0, sizeof(lksb));
+
+	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
+		      DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN,
+		      dlm_reco_ast, dlm, dlm_reco_bast);
+
+	mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
+	     dlm->name, ret, lksb.status);
+
+	if (ret == DLM_NORMAL) {
+		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
+		     dlm->name, dlm->node_num);
+
+		/* got the EX lock.  check to see if another node
+		 * just became the reco master */
+		if (dlm_reco_master_ready(dlm)) {
+			mlog(0, "%s: got reco EX lock, but %u will "
+			     "do the recovery\n", dlm->name,
+			     dlm->reco.new_master);
+			status = -EEXIST;
+		} else {
+			status = 0;
+
+			/* see if recovery was already finished elsewhere */
+			spin_lock(&dlm->spinlock);
+			if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+				status = -EINVAL;
+				mlog(0, "%s: got reco EX lock, but "
+				     "node got recovered already\n", dlm->name);
+				if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+					mlog(ML_ERROR, "%s: new master is %u "
+					     "but no dead node!\n",
+					     dlm->name, dlm->reco.new_master);
+					BUG();
+				}
+			}
+			spin_unlock(&dlm->spinlock);
+		}
+
+		/* if this node has actually become the recovery master,
+		 * set the master and send the messages to begin recovery */
+		if (!status) {
+			mlog(0, "%s: dead=%u, this=%u, sending "
+			     "begin_reco now\n", dlm->name,
+			     dlm->reco.dead_node, dlm->node_num);
+			status = dlm_send_begin_reco_message(dlm,
+				      dlm->reco.dead_node);
+			/* this always succeeds */
+			BUG_ON(status);
+
+			/* set the new_master to this node */
+			spin_lock(&dlm->spinlock);
+			dlm_set_reco_master(dlm, dlm->node_num);
+			spin_unlock(&dlm->spinlock);
+		}
+
+		/* recovery lock is a special case.  ast will not get fired,
+		 * so just go ahead and unlock it. */
+		ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
+		if (ret == DLM_DENIED) {
+			mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
+			ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
+		}
+		if (ret != DLM_NORMAL) {
+			/* this would really suck. this could only happen
+			 * if there was a network error during the unlock
+			 * because of node death.  this means the unlock
+			 * is actually "done" and the lock structure is
+			 * even freed.  we can continue, but only
+			 * because this specific lock name is special. */
+			mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
+		}
+	} else if (ret == DLM_NOTQUEUED) {
+		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		/* another node is master. wait on
+		 * reco.new_master != O2NM_INVALID_NODE_NUM
+		 * for at most one second */
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+					 dlm_reco_master_ready(dlm),
+					 msecs_to_jiffies(1000));
+		if (!dlm_reco_master_ready(dlm)) {
+			mlog(0, "%s: reco master taking awhile\n",
+			     dlm->name);
+			goto again;
+		}
+		/* another node has informed this one that it is reco master */
+		mlog(0, "%s: reco master %u is ready to recover %u\n",
+		     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
+		status = -EEXIST;
+	} else if (ret == DLM_RECOVERING) {
+		mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		goto again;
+	} else {
+		struct dlm_lock_resource *res;
+
+		/* dlmlock returned something other than NOTQUEUED or NORMAL */
+		mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
+		     "lksb.status=%s\n", dlm->name, dlm_errname(ret),
+		     dlm_errname(lksb.status));
+		res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
+					 DLM_RECOVERY_LOCK_NAME_LEN);
+		if (res) {
+			dlm_print_one_lock_resource(res);
+			dlm_lockres_put(res);
+		} else {
+			mlog(ML_ERROR, "recovery lock not found\n");
+		}
+		BUG();
+	}
+
+	return status;
+}
+
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_begin_reco br;
+	int ret = 0;
+	struct dlm_node_iter iter;
+	int nodenum;
+	int status;
+
+	mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	clear_bit(dead_node, iter.node_map);
+
+	memset(&br, 0, sizeof(br));
+	br.node_idx = dlm->node_num;
+	br.dead_node = dead_node;
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		ret = 0;
+		if (nodenum == dead_node) {
+			mlog(0, "not sending begin reco to dead node "
+				  "%u\n", dead_node);
+			continue;
+		}
+		if (nodenum == dlm->node_num) {
+			mlog(0, "not sending begin reco to self\n");
+			continue;
+		}
+retry:
+		ret = -EINVAL;
+		mlog(0, "attempting to send begin reco msg to %d\n",
+			  nodenum);
+		ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
+					 &br, sizeof(br), nodenum, &status);
+		/* negative status is handled ok by caller here */
+		if (ret >= 0)
+			ret = status;
+		if (dlm_is_host_down(ret)) {
+			/* node is down.  not involved in recovery
+			 * so just keep going */
+			mlog(ML_NOTICE, "%s: node %u was down when sending "
+			     "begin reco msg (%d)\n", dlm->name, nodenum, ret);
+			ret = 0;
+		}
+
+		/*
+		 * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
+		 * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
+		 * We are handling both for compatibility reasons.
+		 */
+		if (ret == -EAGAIN || ret == EAGAIN) {
+			mlog(0, "%s: trying to start recovery of node "
+			     "%u, but node %u is waiting for last recovery "
+			     "to complete, backoff for a bit\n", dlm->name,
+			     dead_node, nodenum);
+			msleep(100);
+			goto retry;
+		}
+		if (ret < 0) {
+			struct dlm_lock_resource *res;
+
+			/* this is now a serious problem, possibly ENOMEM
+			 * in the network stack.  must retry */
+			mlog_errno(ret);
+			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
+			     "returned %d\n", dlm->name, nodenum, ret);
+			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
+						 DLM_RECOVERY_LOCK_NAME_LEN);
+			if (res) {
+				dlm_print_one_lock_resource(res);
+				dlm_lockres_put(res);
+			} else {
+				mlog(ML_ERROR, "recovery lock not found\n");
+			}
+			/* sleep for a bit in hopes that we can avoid
+			 * another ENOMEM */
+			msleep(100);
+			goto retry;
+		}
+	}
+
+	return ret;
+}
+
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+			   void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
+
+	/* ok to return 0, domain has gone away */
+	if (!dlm_grab(dlm))
+		return 0;
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+		mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
+		     "but this node is in finalize state, waiting on finalize2\n",
+		     dlm->name, br->node_idx, br->dead_node,
+		     dlm->reco.dead_node, dlm->reco.new_master);
+		spin_unlock(&dlm->spinlock);
+		dlm_put(dlm);
+		return -EAGAIN;
+	}
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
+	     dlm->name, br->node_idx, br->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
+
+	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+		if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+			mlog(0, "%s: new_master %u died, changing "
+			     "to %u\n", dlm->name, dlm->reco.new_master,
+			     br->node_idx);
+		} else {
+			mlog(0, "%s: new_master %u NOT DEAD, changing "
+			     "to %u\n", dlm->name, dlm->reco.new_master,
+			     br->node_idx);
+			/* may not have seen the new master as dead yet */
+		}
+	}
+	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
+		mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
+		     "node %u changing it to %u\n", dlm->name,
+		     dlm->reco.dead_node, br->node_idx, br->dead_node);
+	}
+	dlm_set_reco_master(dlm, br->node_idx);
+	dlm_set_reco_dead_node(dlm, br->dead_node);
+	if (!test_bit(br->dead_node, dlm->recovery_map)) {
+		mlog(0, "recovery master %u sees %u as dead, but this "
+		     "node has not yet.  marking %u as dead\n",
+		     br->node_idx, br->dead_node, br->dead_node);
+		if (!test_bit(br->dead_node, dlm->domain_map) ||
+		    !test_bit(br->dead_node, dlm->live_nodes_map))
+			mlog(0, "%u not in domain/live_nodes map "
+			     "so setting it in reco map manually\n",
+			     br->dead_node);
+		/* force the recovery cleanup in __dlm_hb_node_down
+		 * both of these will be cleared in a moment */
+		set_bit(br->dead_node, dlm->domain_map);
+		set_bit(br->dead_node, dlm->live_nodes_map);
+		__dlm_hb_node_down(dlm, br->dead_node);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_kick_recovery_thread(dlm);
+
+	mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
+	     dlm->name, br->node_idx, br->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
+
+	dlm_put(dlm);
+	return 0;
+}
+
+#define DLM_FINALIZE_STAGE2  0x01
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+	struct dlm_finalize_reco fr;
+	struct dlm_node_iter iter;
+	int nodenum;
+	int status;
+	int stage = 1;
+
+	mlog(0, "finishing recovery for node %s:%u, "
+	     "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+stage2:
+	memset(&fr, 0, sizeof(fr));
+	fr.node_idx = dlm->node_num;
+	fr.dead_node = dlm->reco.dead_node;
+	if (stage == 2)
+		fr.flags |= DLM_FINALIZE_STAGE2;
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
+					 &fr, sizeof(fr), nodenum, &status);
+		if (ret >= 0)
+			ret = status;
+		if (ret < 0) {
+			mlog(ML_ERROR, "Error %d when sending message %u (key "
+			     "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+			     dlm->key, nodenum);
+			if (dlm_is_host_down(ret)) {
+				/* this has no effect on this recovery
+				 * session, so set the status to zero to
+				 * finish out the last recovery */
+				mlog(ML_ERROR, "node %u went down after this "
+				     "node finished recovery.\n", nodenum);
+				ret = 0;
+				continue;
+			}
+			break;
+		}
+	}
+	if (stage == 1) {
+		/* reset the node_iter back to the top and send finalize2 */
+		iter.curnode = -1;
+		stage = 2;
+		goto stage2;
+	}
+
+	return ret;
+}
+
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+	int stage = 1;
+
+	/* ok to return 0, domain has gone away */
+	if (!dlm_grab(dlm))
+		return 0;
+
+	if (fr->flags & DLM_FINALIZE_STAGE2)
+		stage = 2;
+
+	mlog(0, "%s: node %u finalizing recovery stage%d of "
+	     "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
+	     fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
+
+	spin_lock(&dlm->spinlock);
+
+	if (dlm->reco.new_master != fr->node_idx) {
+		mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
+		     "%u is supposed to be the new master, dead=%u\n",
+		     fr->node_idx, dlm->reco.new_master, fr->dead_node);
+		BUG();
+	}
+	if (dlm->reco.dead_node != fr->dead_node) {
+		mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
+		     "node %u, but node %u is supposed to be dead\n",
+		     fr->node_idx, fr->dead_node, dlm->reco.dead_node);
+		BUG();
+	}
+
+	switch (stage) {
+		case 1:
+			dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+			if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+				mlog(ML_ERROR, "%s: received finalize1 from "
+				     "new master %u for dead node %u, but "
+				     "this node has already received it!\n",
+				     dlm->name, fr->node_idx, fr->dead_node);
+				dlm_print_reco_node_status(dlm);
+				BUG();
+			}
+			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+			spin_unlock(&dlm->spinlock);
+			break;
+		case 2:
+			if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
+				mlog(ML_ERROR, "%s: received finalize2 from "
+				     "new master %u for dead node %u, but "
+				     "this node did not have finalize1!\n",
+				     dlm->name, fr->node_idx, fr->dead_node);
+				dlm_print_reco_node_status(dlm);
+				BUG();
+			}
+			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+			__dlm_reset_recovery(dlm);
+			spin_unlock(&dlm->spinlock);
+			dlm_kick_recovery_thread(dlm);
+			break;
+		default:
+			BUG();
+	}
+
+	mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
+	     dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
+
+	dlm_put(dlm);
+	return 0;
+}
diff --git a/kernel/fs/ocfs2/dlm/dlmthread.c b/kernel/fs/ocfs2/dlm/dlmthread.c
new file mode 100644
index 000000000..69aac6f08
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmthread.c
@@ -0,0 +1,756 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmthread.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
+#include "cluster/masklog.h"
+
+static int dlm_thread(void *data);
+static void dlm_flush_asts(struct dlm_ctxt *dlm);
+
+#define dlm_lock_is_remote(dlm, lock)     ((lock)->ml.node != (dlm)->node_num)
+
+/* will exit holding res->spinlock, but may drop in function */
+/* waits until flags are cleared on res->state */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	assert_spin_locked(&res->spinlock);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (res->state & flags) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		spin_lock(&res->spinlock);
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	__set_current_state(TASK_RUNNING);
+}
+
+int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
+{
+	if (list_empty(&res->granted) &&
+	    list_empty(&res->converting) &&
+	    list_empty(&res->blocked))
+		return 0;
+	return 1;
+}
+
+/* "unused": the lockres has no locks, is not on the dirty list,
+ * has no inflight locks (in the gap between mastery and acquiring
+ * the first lock), and has no bits in its refmap.
+ * truly ready to be freed. */
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
+{
+	int bit;
+
+	assert_spin_locked(&res->spinlock);
+
+	if (__dlm_lockres_has_locks(res))
+		return 0;
+
+	/* Locks are in the process of being created */
+	if (res->inflight_locks)
+		return 0;
+
+	if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+		return 0;
+
+	if (res->state & DLM_LOCK_RES_RECOVERING)
+		return 0;
+
+	/* Another node has this resource with this node as the master */
+	bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+	if (bit < O2NM_MAX_NODES)
+		return 0;
+
+	return 1;
+}
+
+
+/* Call whenever you may have added or deleted something from one of
+ * the lockres queue's. This will figure out whether it belongs on the
+ * unused list or not and does the appropriate thing. */
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (__dlm_lockres_unused(res)){
+		if (list_empty(&res->purge)) {
+			mlog(0, "%s: Adding res %.*s to purge list\n",
+			     dlm->name, res->lockname.len, res->lockname.name);
+
+			res->last_used = jiffies;
+			dlm_lockres_get(res);
+			list_add_tail(&res->purge, &dlm->purge_list);
+			dlm->purge_count++;
+		}
+	} else if (!list_empty(&res->purge)) {
+		mlog(0, "%s: Removing res %.*s from purge list\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+
+		list_del_init(&res->purge);
+		dlm_lockres_put(res);
+		dlm->purge_count--;
+	}
+}
+
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res)
+{
+	spin_lock(&dlm->spinlock);
+	spin_lock(&res->spinlock);
+
+	__dlm_lockres_calc_usage(dlm, res);
+
+	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res)
+{
+	int master;
+	int ret = 0;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	master = (res->owner == dlm->node_num);
+
+	mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
+	     res->lockname.len, res->lockname.name, master);
+
+	if (!master) {
+		res->state |= DLM_LOCK_RES_DROPPING_REF;
+		/* drop spinlock...  retake below */
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+
+		spin_lock(&res->spinlock);
+		/* This ensures that clear refmap is sent after the set */
+		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+		spin_unlock(&res->spinlock);
+
+		/* clear our bit from the master's refmap, ignore errors */
+		ret = dlm_drop_lockres_ref(dlm, res);
+		if (ret < 0) {
+			if (!dlm_is_host_down(ret))
+				BUG();
+		}
+		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
+	}
+
+	if (!list_empty(&res->purge)) {
+		mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
+		     dlm->name, res->lockname.len, res->lockname.name, master);
+		list_del_init(&res->purge);
+		dlm_lockres_put(res);
+		dlm->purge_count--;
+	}
+
+	if (!__dlm_lockres_unused(res)) {
+		mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		__dlm_print_one_lock_resource(res);
+		BUG();
+	}
+
+	__dlm_unhash_lockres(dlm, res);
+
+	/* lockres is not in the hash now.  drop the flag and wake up
+	 * any processes waiting in dlm_get_lock_resource. */
+	if (!master) {
+		res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+		spin_unlock(&res->spinlock);
+		wake_up(&res->wq);
+	} else
+		spin_unlock(&res->spinlock);
+}
+
+static void dlm_run_purge_list(struct dlm_ctxt *dlm,
+			       int purge_now)
+{
+	unsigned int run_max, unused;
+	unsigned long purge_jiffies;
+	struct dlm_lock_resource *lockres;
+
+	spin_lock(&dlm->spinlock);
+	run_max = dlm->purge_count;
+
+	while(run_max && !list_empty(&dlm->purge_list)) {
+		run_max--;
+
+		lockres = list_entry(dlm->purge_list.next,
+				     struct dlm_lock_resource, purge);
+
+		spin_lock(&lockres->spinlock);
+
+		purge_jiffies = lockres->last_used +
+			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
+
+		/* Make sure that we want to be processing this guy at
+		 * this time. */
+		if (!purge_now && time_after(purge_jiffies, jiffies)) {
+			/* Since resources are added to the purge list
+			 * in tail order, we can stop at the first
+			 * unpurgable resource -- anyone added after
+			 * him will have a greater last_used value */
+			spin_unlock(&lockres->spinlock);
+			break;
+		}
+
+		/* Status of the lockres *might* change so double
+		 * check. If the lockres is unused, holding the dlm
+		 * spinlock will prevent people from getting and more
+		 * refs on it. */
+		unused = __dlm_lockres_unused(lockres);
+		if (!unused ||
+		    (lockres->state & DLM_LOCK_RES_MIGRATING) ||
+		    (lockres->inflight_assert_workers != 0)) {
+			mlog(0, "%s: res %.*s is in use or being remastered, "
+			     "used %d, state %d, assert master workers %u\n",
+			     dlm->name, lockres->lockname.len,
+			     lockres->lockname.name,
+			     !unused, lockres->state,
+			     lockres->inflight_assert_workers);
+			list_move_tail(&lockres->purge, &dlm->purge_list);
+			spin_unlock(&lockres->spinlock);
+			continue;
+		}
+
+		dlm_lockres_get(lockres);
+
+		dlm_purge_lockres(dlm, lockres);
+
+		dlm_lockres_put(lockres);
+
+		/* Avoid adding any scheduling latencies */
+		cond_resched_lock(&dlm->spinlock);
+	}
+
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	struct dlm_lock *lock, *target;
+	int can_grant = 1;
+
+	/*
+	 * Because this function is called with the lockres
+	 * spinlock, and because we know that it is not migrating/
+	 * recovering/in-progress, it is fine to reserve asts and
+	 * basts right before queueing them all throughout
+	 */
+	assert_spin_locked(&dlm->ast_lock);
+	assert_spin_locked(&res->spinlock);
+	BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
+			      DLM_LOCK_RES_RECOVERING|
+			      DLM_LOCK_RES_IN_PROGRESS)));
+
+converting:
+	if (list_empty(&res->converting))
+		goto blocked;
+	mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
+	     res->lockname.len, res->lockname.name);
+
+	target = list_entry(res->converting.next, struct dlm_lock, list);
+	if (target->ml.convert_type == LKM_IVMODE) {
+		mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		BUG();
+	}
+	list_for_each_entry(lock, &res->granted, list) {
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type,
+					 target->ml.convert_type)) {
+			can_grant = 0;
+			/* queue the BAST if not already */
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				__dlm_queue_bast(dlm, lock);
+			}
+			/* update the highest_blocked if needed */
+			if (lock->ml.highest_blocked < target->ml.convert_type)
+				lock->ml.highest_blocked =
+					target->ml.convert_type;
+		}
+	}
+
+	list_for_each_entry(lock, &res->converting, list) {
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type,
+					 target->ml.convert_type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				__dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.convert_type)
+				lock->ml.highest_blocked =
+					target->ml.convert_type;
+		}
+	}
+
+	/* we can convert the lock */
+	if (can_grant) {
+		spin_lock(&target->spinlock);
+		BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
+
+		mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
+		     "%d => %d, node %u\n", dlm->name, res->lockname.len,
+		     res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
+		     target->ml.type,
+		     target->ml.convert_type, target->ml.node);
+
+		target->ml.type = target->ml.convert_type;
+		target->ml.convert_type = LKM_IVMODE;
+		list_move_tail(&target->list, &res->granted);
+
+		BUG_ON(!target->lksb);
+		target->lksb->status = DLM_NORMAL;
+
+		spin_unlock(&target->spinlock);
+
+		__dlm_lockres_reserve_ast(res);
+		__dlm_queue_ast(dlm, target);
+		/* go back and check for more */
+		goto converting;
+	}
+
+blocked:
+	if (list_empty(&res->blocked))
+		goto leave;
+	target = list_entry(res->blocked.next, struct dlm_lock, list);
+
+	list_for_each_entry(lock, &res->granted, list) {
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				__dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.type)
+				lock->ml.highest_blocked = target->ml.type;
+		}
+	}
+
+	list_for_each_entry(lock, &res->converting, list) {
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				__dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.type)
+				lock->ml.highest_blocked = target->ml.type;
+		}
+	}
+
+	/* we can grant the blocked lock (only
+	 * possible if converting list empty) */
+	if (can_grant) {
+		spin_lock(&target->spinlock);
+		BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
+
+		mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
+		     "node %u\n", dlm->name, res->lockname.len,
+		     res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
+		     target->ml.type, target->ml.node);
+
+		/* target->ml.type is already correct */
+		list_move_tail(&target->list, &res->granted);
+
+		BUG_ON(!target->lksb);
+		target->lksb->status = DLM_NORMAL;
+
+		spin_unlock(&target->spinlock);
+
+		__dlm_lockres_reserve_ast(res);
+		__dlm_queue_ast(dlm, target);
+		/* go back and check for more */
+		goto converting;
+	}
+
+leave:
+	return;
+}
+
+/* must have NO locks when calling this with res !=NULL * */
+void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	if (res) {
+		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
+		__dlm_dirty_lockres(dlm, res);
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+	}
+	wake_up(&dlm->dlm_thread_wq);
+}
+
+void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* don't shuffle secondary queues */
+	if ((res->owner == dlm->node_num)) {
+		if (res->state & (DLM_LOCK_RES_MIGRATING |
+				  DLM_LOCK_RES_BLOCK_DIRTY))
+		    return;
+
+		if (list_empty(&res->dirty)) {
+			/* ref for dirty_list */
+			dlm_lockres_get(res);
+			list_add_tail(&res->dirty, &dlm->dirty_list);
+			res->state |= DLM_LOCK_RES_DIRTY;
+		}
+	}
+
+	mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+	     res->lockname.name);
+}
+
+
+/* Launch the NM thread for the mounted volume */
+int dlm_launch_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "Starting dlm_thread...\n");
+
+	dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+	if (IS_ERR(dlm->dlm_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_thread_task));
+		dlm->dlm_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_thread_task) {
+		mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
+		kthread_stop(dlm->dlm_thread_task);
+		dlm->dlm_thread_task = NULL;
+	}
+}
+
+static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
+{
+	int empty;
+
+	spin_lock(&dlm->spinlock);
+	empty = list_empty(&dlm->dirty_list);
+	spin_unlock(&dlm->spinlock);
+
+	return empty;
+}
+
+static void dlm_flush_asts(struct dlm_ctxt *dlm)
+{
+	int ret;
+	struct dlm_lock *lock;
+	struct dlm_lock_resource *res;
+	u8 hi;
+
+	spin_lock(&dlm->ast_lock);
+	while (!list_empty(&dlm->pending_asts)) {
+		lock = list_entry(dlm->pending_asts.next,
+				  struct dlm_lock, ast_list);
+		/* get an extra ref on lock */
+		dlm_lock_get(lock);
+		res = lock->lockres;
+		mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
+		     "node %u\n", dlm->name, res->lockname.len,
+		     res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+		     lock->ml.type, lock->ml.node);
+
+		BUG_ON(!lock->ast_pending);
+
+		/* remove from list (including ref) */
+		list_del_init(&lock->ast_list);
+		dlm_lock_put(lock);
+		spin_unlock(&dlm->ast_lock);
+
+		if (lock->ml.node != dlm->node_num) {
+			ret = dlm_do_remote_ast(dlm, res, lock);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else
+			dlm_do_local_ast(dlm, res, lock);
+
+		spin_lock(&dlm->ast_lock);
+
+		/* possible that another ast was queued while
+		 * we were delivering the last one */
+		if (!list_empty(&lock->ast_list)) {
+			mlog(0, "%s: res %.*s, AST queued while flushing last "
+			     "one\n", dlm->name, res->lockname.len,
+			     res->lockname.name);
+		} else
+			lock->ast_pending = 0;
+
+		/* drop the extra ref.
+		 * this may drop it completely. */
+		dlm_lock_put(lock);
+		dlm_lockres_release_ast(dlm, res);
+	}
+
+	while (!list_empty(&dlm->pending_basts)) {
+		lock = list_entry(dlm->pending_basts.next,
+				  struct dlm_lock, bast_list);
+		/* get an extra ref on lock */
+		dlm_lock_get(lock);
+		res = lock->lockres;
+
+		BUG_ON(!lock->bast_pending);
+
+		/* get the highest blocked lock, and reset */
+		spin_lock(&lock->spinlock);
+		BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
+		hi = lock->ml.highest_blocked;
+		lock->ml.highest_blocked = LKM_IVMODE;
+		spin_unlock(&lock->spinlock);
+
+		/* remove from list (including ref) */
+		list_del_init(&lock->bast_list);
+		dlm_lock_put(lock);
+		spin_unlock(&dlm->ast_lock);
+
+		mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
+		     "blocked %d, node %u\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+		     hi, lock->ml.node);
+
+		if (lock->ml.node != dlm->node_num) {
+			ret = dlm_send_proxy_bast(dlm, res, lock, hi);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else
+			dlm_do_local_bast(dlm, res, lock, hi);
+
+		spin_lock(&dlm->ast_lock);
+
+		/* possible that another bast was queued while
+		 * we were delivering the last one */
+		if (!list_empty(&lock->bast_list)) {
+			mlog(0, "%s: res %.*s, BAST queued while flushing last "
+			     "one\n", dlm->name, res->lockname.len,
+			     res->lockname.name);
+		} else
+			lock->bast_pending = 0;
+
+		/* drop the extra ref.
+		 * this may drop it completely. */
+		dlm_lock_put(lock);
+		dlm_lockres_release_ast(dlm, res);
+	}
+	wake_up(&dlm->ast_wq);
+	spin_unlock(&dlm->ast_lock);
+}
+
+
+#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
+#define DLM_THREAD_MAX_DIRTY  100
+#define DLM_THREAD_MAX_ASTS   10
+
+static int dlm_thread(void *data)
+{
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm = data;
+	unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		int n = DLM_THREAD_MAX_DIRTY;
+
+		/* dlm_shutting_down is very point-in-time, but that
+		 * doesn't matter as we'll just loop back around if we
+		 * get false on the leading edge of a state
+		 * transition. */
+		dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
+
+		/* We really don't want to hold dlm->spinlock while
+		 * calling dlm_shuffle_lists on each lockres that
+		 * needs to have its queues adjusted and AST/BASTs
+		 * run.  So let's pull each entry off the dirty_list
+		 * and drop dlm->spinlock ASAP.  Once off the list,
+		 * res->spinlock needs to be taken again to protect
+		 * the queues while calling dlm_shuffle_lists.  */
+		spin_lock(&dlm->spinlock);
+		while (!list_empty(&dlm->dirty_list)) {
+			int delay = 0;
+			res = list_entry(dlm->dirty_list.next,
+					 struct dlm_lock_resource, dirty);
+
+			/* peel a lockres off, remove it from the list,
+			 * unset the dirty flag and drop the dlm lock */
+			BUG_ON(!res);
+			dlm_lockres_get(res);
+
+			spin_lock(&res->spinlock);
+			/* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */
+			list_del_init(&res->dirty);
+			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->spinlock);
+			/* Drop dirty_list ref */
+			dlm_lockres_put(res);
+
+		 	/* lockres can be re-dirtied/re-added to the
+			 * dirty_list in this gap, but that is ok */
+
+			spin_lock(&dlm->ast_lock);
+			spin_lock(&res->spinlock);
+			if (res->owner != dlm->node_num) {
+				__dlm_print_one_lock_resource(res);
+				mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
+				     " dirty %d\n", dlm->name,
+				     !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
+				     !!(res->state & DLM_LOCK_RES_MIGRATING),
+				     !!(res->state & DLM_LOCK_RES_RECOVERING),
+				     !!(res->state & DLM_LOCK_RES_DIRTY));
+			}
+			BUG_ON(res->owner != dlm->node_num);
+
+			/* it is now ok to move lockreses in these states
+			 * to the dirty list, assuming that they will only be
+			 * dirty for a short while. */
+			BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+			if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
+					  DLM_LOCK_RES_RECOVERING)) {
+				/* move it to the tail and keep going */
+				res->state &= ~DLM_LOCK_RES_DIRTY;
+				spin_unlock(&res->spinlock);
+				spin_unlock(&dlm->ast_lock);
+				mlog(0, "%s: res %.*s, inprogress, delay list "
+				     "shuffle, state %d\n", dlm->name,
+				     res->lockname.len, res->lockname.name,
+				     res->state);
+				delay = 1;
+				goto in_progress;
+			}
+
+			/* at this point the lockres is not migrating/
+			 * recovering/in-progress.  we have the lockres
+			 * spinlock and do NOT have the dlm lock.
+			 * safe to reserve/queue asts and run the lists. */
+
+			/* called while holding lockres lock */
+			dlm_shuffle_lists(dlm, res);
+			res->state &= ~DLM_LOCK_RES_DIRTY;
+			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->ast_lock);
+
+			dlm_lockres_calc_usage(dlm, res);
+
+in_progress:
+
+			spin_lock(&dlm->spinlock);
+			/* if the lock was in-progress, stick
+			 * it on the back of the list */
+			if (delay) {
+				spin_lock(&res->spinlock);
+				__dlm_dirty_lockres(dlm, res);
+				spin_unlock(&res->spinlock);
+			}
+			dlm_lockres_put(res);
+
+			/* unlikely, but we may need to give time to
+			 * other tasks */
+			if (!--n) {
+				mlog(0, "%s: Throttling dlm thread\n",
+				     dlm->name);
+				break;
+			}
+		}
+
+		spin_unlock(&dlm->spinlock);
+		dlm_flush_asts(dlm);
+
+		/* yield and continue right away if there is more work to do */
+		if (!n) {
+			cond_resched();
+			continue;
+		}
+
+		wait_event_interruptible_timeout(dlm->dlm_thread_wq,
+						 !dlm_dirty_list_empty(dlm) ||
+						 kthread_should_stop(),
+						 timeout);
+	}
+
+	mlog(0, "quitting DLM thread\n");
+	return 0;
+}
diff --git a/kernel/fs/ocfs2/dlm/dlmunlock.c b/kernel/fs/ocfs2/dlm/dlmunlock.c
new file mode 100644
index 000000000..2e3c9dbab
--- /dev/null
+++ b/kernel/fs/ocfs2/dlm/dlmunlock.c
@@ -0,0 +1,698 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmunlock.c
+ *
+ * underlying calls for unlocking locks
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+#define DLM_UNLOCK_FREE_LOCK           0x00000001
+#define DLM_UNLOCK_CALL_AST            0x00000002
+#define DLM_UNLOCK_REMOVE_LOCK         0x00000004
+#define DLM_UNLOCK_REGRANT_LOCK        0x00000008
+#define DLM_UNLOCK_CLEAR_CONVERT_TYPE  0x00000010
+
+
+static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions);
+static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions);
+
+static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
+						 struct dlm_lock_resource *res,
+						 struct dlm_lock *lock,
+						 struct dlm_lockstatus *lksb,
+						 int flags,
+						 u8 owner);
+
+
+/*
+ * according to the spec:
+ * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+ *
+ *  flags & LKM_CANCEL != 0: must be converting or blocked
+ *  flags & LKM_CANCEL == 0: must be granted
+ *
+ * So to unlock a converting lock, you must first cancel the
+ * convert (passing LKM_CANCEL in flags), then call the unlock
+ * again (with no LKM_CANCEL in flags).
+ */
+
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         res->spinlock and lock->spinlock taken and dropped
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
+ * all callers should have taken an extra ref on lock coming in
+ */
+static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
+					struct dlm_lock_resource *res,
+					struct dlm_lock *lock,
+					struct dlm_lockstatus *lksb,
+					int flags, int *call_ast,
+					int master_node)
+{
+	enum dlm_status status;
+	int actions = 0;
+	int in_use;
+        u8 owner;
+
+	mlog(0, "master_node = %d, valblk = %d\n", master_node,
+	     flags & LKM_VALBLK);
+
+	if (master_node)
+		BUG_ON(res->owner != dlm->node_num);
+	else
+		BUG_ON(res->owner == dlm->node_num);
+
+	spin_lock(&dlm->ast_lock);
+	/* We want to be sure that we're not freeing a lock
+	 * that still has AST's pending... */
+	in_use = !list_empty(&lock->ast_list);
+	spin_unlock(&dlm->ast_lock);
+	if (in_use && !(flags & LKM_CANCEL)) {
+	       mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
+		    "while waiting for an ast!", res->lockname.len,
+		    res->lockname.name);
+		return DLM_BADPARAM;
+	}
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		if (master_node && !(flags & LKM_CANCEL)) {
+			mlog(ML_ERROR, "lockres in progress!\n");
+			spin_unlock(&res->spinlock);
+			return DLM_FORWARD;
+		}
+		/* ok for this to sleep if not in a network handler */
+		__dlm_wait_on_lockres(res);
+		res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	}
+	spin_lock(&lock->spinlock);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto leave;
+	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		status = DLM_MIGRATING;
+		goto leave;
+	}
+
+	/* see above for what the spec says about
+	 * LKM_CANCEL and the lock queue state */
+	if (flags & LKM_CANCEL)
+		status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
+	else
+		status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
+
+	if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node))
+		goto leave;
+
+	/* By now this has been masked out of cancel requests. */
+	if (flags & LKM_VALBLK) {
+		/* make the final update to the lvb */
+		if (master_node)
+			memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
+		else
+			flags |= LKM_PUT_LVB; /* let the send function
+					       * handle it. */
+	}
+
+	if (!master_node) {
+		owner = res->owner;
+		/* drop locks and send message */
+		if (flags & LKM_CANCEL)
+			lock->cancel_pending = 1;
+		else
+			lock->unlock_pending = 1;
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
+							flags, owner);
+		spin_lock(&res->spinlock);
+		spin_lock(&lock->spinlock);
+		/* if the master told us the lock was already granted,
+		 * let the ast handle all of these actions */
+		if (status == DLM_CANCELGRANT) {
+			actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
+				     DLM_UNLOCK_REGRANT_LOCK|
+				     DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+		} else if (status == DLM_RECOVERING ||
+			   status == DLM_MIGRATING ||
+			   status == DLM_FORWARD ||
+			   status == DLM_NOLOCKMGR
+			   ) {
+			/* must clear the actions because this unlock
+			 * is about to be retried.  cannot free or do
+			 * any list manipulation. */
+			mlog(0, "%s:%.*s: clearing actions, %s\n",
+			     dlm->name, res->lockname.len,
+			     res->lockname.name,
+			     status==DLM_RECOVERING?"recovering":
+			     (status==DLM_MIGRATING?"migrating":
+				(status == DLM_FORWARD ? "forward" :
+						"nolockmanager")));
+			actions = 0;
+		}
+		if (flags & LKM_CANCEL)
+			lock->cancel_pending = 0;
+		else
+			lock->unlock_pending = 0;
+
+	}
+
+	/* get an extra ref on lock.  if we are just switching
+	 * lists here, we dont want the lock to go away. */
+	dlm_lock_get(lock);
+
+	if (actions & DLM_UNLOCK_REMOVE_LOCK) {
+		list_del_init(&lock->list);
+		dlm_lock_put(lock);
+	}
+	if (actions & DLM_UNLOCK_REGRANT_LOCK) {
+		dlm_lock_get(lock);
+		list_add_tail(&lock->list, &res->granted);
+	}
+	if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
+		mlog(0, "clearing convert_type at %smaster node\n",
+		     master_node ? "" : "non-");
+		lock->ml.convert_type = LKM_IVMODE;
+	}
+
+	/* remove the extra ref on lock */
+	dlm_lock_put(lock);
+
+leave:
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	if (!dlm_lock_on_list(&res->converting, lock))
+		BUG_ON(lock->ml.convert_type != LKM_IVMODE);
+	else
+		BUG_ON(lock->ml.convert_type == LKM_IVMODE);
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* let the caller's final dlm_lock_put handle the actual kfree */
+	if (actions & DLM_UNLOCK_FREE_LOCK) {
+		/* this should always be coupled with list removal */
+		BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
+		mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+		     atomic_read(&lock->lock_refs.refcount)-1);
+		dlm_lock_put(lock);
+	}
+	if (actions & DLM_UNLOCK_CALL_AST)
+		*call_ast = 1;
+
+	/* if cancel or unlock succeeded, lvb work is done */
+	if (status == DLM_NORMAL)
+		lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
+
+	return status;
+}
+
+void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock)
+{
+	/* leave DLM_LKSB_PUT_LVB on the lksb so any final
+	 * update of the lvb will be sent to the new master */
+	list_del_init(&lock->list);
+}
+
+void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock)
+{
+	list_move_tail(&lock->list, &res->granted);
+	lock->ml.convert_type = LKM_IVMODE;
+}
+
+
+static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
+					  struct dlm_lock_resource *res,
+					  struct dlm_lock *lock,
+					  struct dlm_lockstatus *lksb,
+					  int flags,
+					  int *call_ast)
+{
+	return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
+}
+
+static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
+					  struct dlm_lock_resource *res,
+					  struct dlm_lock *lock,
+					  struct dlm_lockstatus *lksb,
+					  int flags, int *call_ast)
+{
+	return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
+}
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
+ */
+static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
+						 struct dlm_lock_resource *res,
+						 struct dlm_lock *lock,
+						 struct dlm_lockstatus *lksb,
+						 int flags,
+						 u8 owner)
+{
+	struct dlm_unlock_lock unlock;
+	int tmpret;
+	enum dlm_status ret;
+	int status = 0;
+	struct kvec vec[2];
+	size_t veclen = 1;
+
+	mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
+
+	if (owner == dlm->node_num) {
+		/* ended up trying to contact ourself.  this means
+		 * that the lockres had been remote but became local
+		 * via a migration.  just retry it, now as local */
+		mlog(0, "%s:%.*s: this node became the master due to a "
+		     "migration, re-evaluate now\n", dlm->name,
+		     res->lockname.len, res->lockname.name);
+		return DLM_FORWARD;
+	}
+
+	memset(&unlock, 0, sizeof(unlock));
+	unlock.node_idx = dlm->node_num;
+	unlock.flags = cpu_to_be32(flags);
+	unlock.cookie = lock->ml.cookie;
+	unlock.namelen = res->lockname.len;
+	memcpy(unlock.name, res->lockname.name, unlock.namelen);
+
+	vec[0].iov_len = sizeof(struct dlm_unlock_lock);
+	vec[0].iov_base = &unlock;
+
+	if (flags & LKM_PUT_LVB) {
+		/* extra data to send if we are updating lvb */
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
+					vec, veclen, owner, &status);
+	if (tmpret >= 0) {
+		// successfully sent and received
+		if (status == DLM_FORWARD)
+			mlog(0, "master was in-progress.  retry\n");
+		ret = status;
+	} else {
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
+		if (dlm_is_host_down(tmpret)) {
+			/* NOTE: this seems strange, but it is what we want.
+			 * when the master goes down during a cancel or
+			 * unlock, the recovery code completes the operation
+			 * as if the master had not died, then passes the
+			 * updated state to the recovery master.  this thread
+			 * just needs to finish out the operation and call
+			 * the unlockast. */
+			if (dlm_is_node_dead(dlm, owner))
+				ret = DLM_NORMAL;
+			else
+				ret = DLM_NOLOCKMGR;
+		} else {
+			/* something bad.  this will BUG in ocfs2 */
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
+ *          return value from dlmunlock_master
+ */
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	enum dlm_status status = DLM_NORMAL;
+	int found = 0, i;
+	struct dlm_lockstatus *lksb = NULL;
+	int ignore;
+	u32 flags;
+	struct list_head *queue;
+
+	flags = be32_to_cpu(unlock->flags);
+
+	if (flags & LKM_GET_LVB) {
+		mlog(ML_ERROR, "bad args!  GET_LVB specified on unlock!\n");
+		return DLM_BADARGS;
+	}
+
+	if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
+		mlog(ML_ERROR, "bad args!  cannot modify lvb on a CANCEL "
+		     "request!\n");
+		return DLM_BADARGS;
+	}
+
+	if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
+		return DLM_IVBUFLEN;
+	}
+
+	if (!dlm_grab(dlm))
+		return DLM_REJECTED;
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
+
+	res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
+	if (!res) {
+		/* We assume here that a no lock resource simply means
+		 * it was migrated away and destroyed before the other
+		 * node could detect it. */
+		mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
+		status = DLM_FORWARD;
+		goto not_found;
+	}
+
+	queue=&res->granted;
+	found = 0;
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_RECOVERING\n");
+		status = DLM_RECOVERING;
+		goto leave;
+	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_MIGRATING\n");
+		status = DLM_MIGRATING;
+		goto leave;
+	}
+
+	if (res->owner != dlm->node_num) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_FORWARD -- not master\n");
+		status = DLM_FORWARD;
+		goto leave;
+	}
+
+	for (i=0; i<3; i++) {
+		list_for_each_entry(lock, queue, list) {
+			if (lock->ml.cookie == unlock->cookie &&
+		    	    lock->ml.node == unlock->node_idx) {
+				dlm_lock_get(lock);
+				found = 1;
+				break;
+			}
+		}
+		if (found)
+			break;
+		/* scan granted -> converting -> blocked queues */
+		queue++;
+	}
+	spin_unlock(&res->spinlock);
+	if (!found) {
+		status = DLM_IVLOCKID;
+		goto not_found;
+	}
+
+	/* lock was found on queue */
+	lksb = lock->lksb;
+	if (flags & (LKM_VALBLK|LKM_PUT_LVB) &&
+	    lock->ml.type != LKM_EXMODE)
+		flags &= ~(LKM_VALBLK|LKM_PUT_LVB);
+
+	/* unlockast only called on originating node */
+	if (flags & LKM_PUT_LVB) {
+		lksb->flags |= DLM_LKSB_PUT_LVB;
+		memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
+	}
+
+	/* if this is in-progress, propagate the DLM_FORWARD
+	 * all the way back out */
+	status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
+	if (status == DLM_FORWARD)
+		mlog(0, "lockres is in progress\n");
+
+	if (flags & LKM_PUT_LVB)
+		lksb->flags &= ~DLM_LKSB_PUT_LVB;
+
+	dlm_lockres_calc_usage(dlm, res);
+	dlm_kick_thread(dlm, res);
+
+not_found:
+	if (!found)
+		mlog(ML_ERROR, "failed to find lock to unlock! "
+			       "cookie=%u:%llu\n",
+		     dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie)));
+	else
+		dlm_lock_put(lock);
+
+leave:
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
+
+
+static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions)
+{
+	enum dlm_status status;
+
+	if (dlm_lock_on_list(&res->blocked, lock)) {
+		/* cancel this outright */
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK);
+	} else if (dlm_lock_on_list(&res->converting, lock)) {
+		/* cancel the request, put back on granted */
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK |
+			    DLM_UNLOCK_REGRANT_LOCK |
+			    DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+	} else if (dlm_lock_on_list(&res->granted, lock)) {
+		/* too late, already granted. */
+		status = DLM_CANCELGRANT;
+		*actions = DLM_UNLOCK_CALL_AST;
+	} else {
+		mlog(ML_ERROR, "lock to cancel is not on any list!\n");
+		status = DLM_IVLOCKID;
+		*actions = 0;
+	}
+	return status;
+}
+
+static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions)
+{
+	enum dlm_status status;
+
+	/* unlock request */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		status = DLM_DENIED;
+		dlm_error(status);
+		*actions = 0;
+	} else {
+		/* unlock granted lock */
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_FREE_LOCK |
+			    DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK);
+	}
+	return status;
+}
+
+/* there seems to be no point in doing this async
+ * since (even for the remote case) there is really
+ * no work to queue up... so just do it and fire the
+ * unlockast by hand when done... */
+enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
+			  int flags, dlm_astunlockfunc_t *unlockast, void *data)
+{
+	enum dlm_status status;
+	struct dlm_lock_resource *res;
+	struct dlm_lock *lock = NULL;
+	int call_ast, is_master;
+
+	if (!lksb) {
+		dlm_error(DLM_BADARGS);
+		return DLM_BADARGS;
+	}
+
+	if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
+		dlm_error(DLM_BADPARAM);
+		return DLM_BADPARAM;
+	}
+
+	if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
+		mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
+		flags &= ~LKM_VALBLK;
+	}
+
+	if (!lksb->lockid || !lksb->lockid->lockres) {
+		dlm_error(DLM_BADPARAM);
+		return DLM_BADPARAM;
+	}
+
+	lock = lksb->lockid;
+	BUG_ON(!lock);
+	dlm_lock_get(lock);
+
+	res = lock->lockres;
+	BUG_ON(!res);
+	dlm_lockres_get(res);
+retry:
+	call_ast = 0;
+	/* need to retry up here because owner may have changed */
+	mlog(0, "lock=%p res=%p\n", lock, res);
+
+	spin_lock(&res->spinlock);
+	is_master = (res->owner == dlm->node_num);
+	if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE)
+		flags &= ~LKM_VALBLK;
+	spin_unlock(&res->spinlock);
+
+	if (is_master) {
+		status = dlmunlock_master(dlm, res, lock, lksb, flags,
+					  &call_ast);
+		mlog(0, "done calling dlmunlock_master: returned %d, "
+		     "call_ast is %d\n", status, call_ast);
+	} else {
+		status = dlmunlock_remote(dlm, res, lock, lksb, flags,
+					  &call_ast);
+		mlog(0, "done calling dlmunlock_remote: returned %d, "
+		     "call_ast is %d\n", status, call_ast);
+	}
+
+	if (status == DLM_RECOVERING ||
+	    status == DLM_MIGRATING ||
+	    status == DLM_FORWARD ||
+	    status == DLM_NOLOCKMGR) {
+
+		/* We want to go away for a tiny bit to allow recovery
+		 * / migration to complete on this resource. I don't
+		 * know of any wait queue we could sleep on as this
+		 * may be happening on another node. Perhaps the
+		 * proper solution is to queue up requests on the
+		 * other end? */
+
+		/* do we want to yield(); ?? */
+		msleep(50);
+
+		mlog(0, "retrying unlock due to pending recovery/"
+		     "migration/in-progress/reconnect\n");
+		goto retry;
+	}
+
+	if (call_ast) {
+		mlog(0, "calling unlockast(%p, %d)\n", data, status);
+		if (is_master) {
+			/* it is possible that there is one last bast
+			 * pending.  make sure it is flushed, then
+			 * call the unlockast.
+			 * not an issue if this is a mastered remotely,
+			 * since this lock has been removed from the
+			 * lockres queues and cannot be found. */
+			dlm_kick_thread(dlm, NULL);
+			wait_event(dlm->ast_wq,
+				   dlm_lock_basts_flushed(dlm, lock));
+		}
+		(*unlockast)(data, status);
+	}
+
+	if (status == DLM_CANCELGRANT)
+		status = DLM_NORMAL;
+
+	if (status == DLM_NORMAL) {
+		mlog(0, "kicking the thread\n");
+		dlm_kick_thread(dlm, res);
+	} else
+		dlm_error(status);
+
+	dlm_lockres_calc_usage(dlm, res);
+	dlm_lockres_put(res);
+	dlm_lock_put(lock);
+
+	mlog(0, "returning status=%d!\n", status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(dlmunlock);
+
diff --git a/kernel/fs/ocfs2/dlmfs/Makefile b/kernel/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000..eed3db8c5
--- /dev/null
+++ b/kernel/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
+ccflags-y := -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/kernel/fs/ocfs2/dlmfs/dlmfs.c b/kernel/fs/ocfs2/dlmfs/dlmfs.c
new file mode 100644
index 000000000..b5cf27dcb
--- /dev/null
+++ b/kernel/fs/ocfs2/dlmfs/dlmfs.c
@@ -0,0 +1,690 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfs.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM. This file handles the virtual file system
+ * used for communication with userspace. Credit should go to ramfs,
+ * which was a template for the fs side of this module.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* Simple VFS hooks based on: */
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/poll.h>
+
+#include <asm/uaccess.h>
+
+#include "stackglue.h"
+#include "userdlm.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+
+static const struct super_operations dlmfs_ops;
+static const struct file_operations dlmfs_file_operations;
+static const struct inode_operations dlmfs_dir_inode_operations;
+static const struct inode_operations dlmfs_root_inode_operations;
+static const struct inode_operations dlmfs_file_inode_operations;
+static struct kmem_cache *dlmfs_inode_cache;
+
+struct workqueue_struct *user_dlm_worker;
+
+
+
+/*
+ * These are the ABI capabilities of dlmfs.
+ *
+ * Over time, dlmfs has added some features that were not part of the
+ * initial ABI.  Unfortunately, some of these features are not detectable
+ * via standard usage.  For example, Linux's default poll always returns
+ * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
+ * added poll support.  Instead, we provide this list of new capabilities.
+ *
+ * Capabilities is a read-only attribute.  We do it as a module parameter
+ * so we can discover it whether dlmfs is built in, loaded, or even not
+ * loaded.
+ *
+ * The ABI features are local to this machine's dlmfs mount.  This is
+ * distinct from the locking protocol, which is concerned with inter-node
+ * interaction.
+ *
+ * Capabilities:
+ * - bast	: POLLIN against the file descriptor of a held lock
+ *		  signifies a bast fired on the lock.
+ */
+#define DLMFS_CAPABILITIES "bast stackglue"
+static int param_set_dlmfs_capabilities(const char *val,
+					struct kernel_param *kp)
+{
+	printk(KERN_ERR "%s: readonly parameter\n", kp->name);
+	return -EINVAL;
+}
+static int param_get_dlmfs_capabilities(char *buffer,
+					struct kernel_param *kp)
+{
+	return strlcpy(buffer, DLMFS_CAPABILITIES,
+		       strlen(DLMFS_CAPABILITIES) + 1);
+}
+module_param_call(capabilities, param_set_dlmfs_capabilities,
+		  param_get_dlmfs_capabilities, NULL, 0444);
+MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
+
+
+/*
+ * decodes a set of open flags into a valid lock level and a set of flags.
+ * returns < 0 if we have invalid flags
+ * flags which mean something to us:
+ * O_RDONLY -> PRMODE level
+ * O_WRONLY -> EXMODE level
+ *
+ * O_NONBLOCK -> NOQUEUE
+ */
+static int dlmfs_decode_open_flags(int open_flags,
+				   int *level,
+				   int *flags)
+{
+	if (open_flags & (O_WRONLY|O_RDWR))
+		*level = DLM_LOCK_EX;
+	else
+		*level = DLM_LOCK_PR;
+
+	*flags = 0;
+	if (open_flags & O_NONBLOCK)
+		*flags |= DLM_LKF_NOQUEUE;
+
+	return 0;
+}
+
+static int dlmfs_file_open(struct inode *inode,
+			   struct file *file)
+{
+	int status, level, flags;
+	struct dlmfs_filp_private *fp = NULL;
+	struct dlmfs_inode_private *ip;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
+		file->f_flags);
+
+	status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
+	if (status < 0)
+		goto bail;
+
+	/* We don't want to honor O_APPEND at read/write time as it
+	 * doesn't make sense for LVB writes. */
+	file->f_flags &= ~O_APPEND;
+
+	fp = kmalloc(sizeof(*fp), GFP_NOFS);
+	if (!fp) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	fp->fp_lock_level = level;
+
+	ip = DLMFS_I(inode);
+
+	status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
+	if (status < 0) {
+		/* this is a strange error to return here but I want
+		 * to be able userspace to be able to distinguish a
+		 * valid lock request from one that simply couldn't be
+		 * granted. */
+		if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
+			status = -ETXTBSY;
+		kfree(fp);
+		goto bail;
+	}
+
+	file->private_data = fp;
+bail:
+	return status;
+}
+
+static int dlmfs_file_release(struct inode *inode,
+			      struct file *file)
+{
+	int level, status;
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+	struct dlmfs_filp_private *fp = file->private_data;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "close called on inode %lu\n", inode->i_ino);
+
+	status = 0;
+	if (fp) {
+		level = fp->fp_lock_level;
+		if (level != DLM_LOCK_IV)
+			user_dlm_cluster_unlock(&ip->ip_lockres, level);
+
+		kfree(fp);
+		file->private_data = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * We do ->setattr() just to override size changes.  Our size is the size
+ * of the LVB and nothing else.
+ */
+static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int error;
+	struct inode *inode = d_inode(dentry);
+
+	attr->ia_valid &= ~ATTR_SIZE;
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
+{
+	int event = 0;
+	struct inode *inode = file_inode(file);
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+
+	poll_wait(file, &ip->ip_lockres.l_event, wait);
+
+	spin_lock(&ip->ip_lockres.l_lock);
+	if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
+		event = POLLIN | POLLRDNORM;
+	spin_unlock(&ip->ip_lockres.l_lock);
+
+	return event;
+}
+
+static ssize_t dlmfs_file_read(struct file *filp,
+			       char __user *buf,
+			       size_t count,
+			       loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t readlen, got;
+	char *lvb_buf;
+	struct inode *inode = file_inode(filp);
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return 0;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	/* don't read past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		readlen = i_size_read(inode) - *ppos;
+	else
+		readlen = count;
+
+	lvb_buf = kmalloc(readlen, GFP_NOFS);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	got = user_dlm_read_lvb(inode, lvb_buf, readlen);
+	if (got) {
+		BUG_ON(got != readlen);
+		bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+		readlen -= bytes_left;
+	} else
+		readlen = 0;
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + readlen;
+
+	mlog(0, "read %zd bytes\n", readlen);
+	return readlen;
+}
+
+static ssize_t dlmfs_file_write(struct file *filp,
+				const char __user *buf,
+				size_t count,
+				loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t writelen;
+	char *lvb_buf;
+	struct inode *inode = file_inode(filp);
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return -ENOSPC;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	/* don't write past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		writelen = i_size_read(inode) - *ppos;
+	else
+		writelen = count - *ppos;
+
+	lvb_buf = kmalloc(writelen, GFP_NOFS);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	bytes_left = copy_from_user(lvb_buf, buf, writelen);
+	writelen -= bytes_left;
+	if (writelen)
+		user_dlm_write_lvb(inode, lvb_buf, writelen);
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + writelen;
+	mlog(0, "wrote %zd bytes\n", writelen);
+	return writelen;
+}
+
+static void dlmfs_init_once(void *foo)
+{
+	struct dlmfs_inode_private *ip =
+		(struct dlmfs_inode_private *) foo;
+
+	ip->ip_conn = NULL;
+	ip->ip_parent = NULL;
+
+	inode_init_once(&ip->ip_vfs_inode);
+}
+
+static struct inode *dlmfs_alloc_inode(struct super_block *sb)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
+	if (!ip)
+		return NULL;
+
+	return &ip->ip_vfs_inode;
+}
+
+static void dlmfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
+}
+
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, dlmfs_i_callback);
+}
+
+static void dlmfs_evict_inode(struct inode *inode)
+{
+	int status;
+	struct dlmfs_inode_private *ip;
+
+	clear_inode(inode);
+
+	mlog(0, "inode %lu\n", inode->i_ino);
+
+	ip = DLMFS_I(inode);
+
+	if (S_ISREG(inode->i_mode)) {
+		status = user_dlm_destroy_lock(&ip->ip_lockres);
+		if (status < 0)
+			mlog_errno(status);
+		iput(ip->ip_parent);
+		goto clear_fields;
+	}
+
+	mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
+	/* we must be a directory. If required, lets unregister the
+	 * dlm context now. */
+	if (ip->ip_conn)
+		user_dlm_unregister(ip->ip_conn);
+clear_fields:
+	ip->ip_parent = NULL;
+	ip->ip_conn = NULL;
+}
+
+static struct inode *dlmfs_get_root_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	umode_t mode = S_IFDIR | 0755;
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode_init_owner(inode, NULL, mode);
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inc_nlink(inode);
+
+		inode->i_fop = &simple_dir_operations;
+		inode->i_op = &dlmfs_root_inode_operations;
+	}
+
+	return inode;
+}
+
+static struct inode *dlmfs_get_inode(struct inode *parent,
+				     struct dentry *dentry,
+				     umode_t mode)
+{
+	struct super_block *sb = parent->i_sb;
+	struct inode * inode = new_inode(sb);
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return NULL;
+
+	inode->i_ino = get_next_ino();
+	inode_init_owner(inode, parent, mode);
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	ip = DLMFS_I(inode);
+	ip->ip_conn = DLMFS_I(parent)->ip_conn;
+
+	switch (mode & S_IFMT) {
+	default:
+		/* for now we don't support anything other than
+		 * directories and regular files. */
+		BUG();
+		break;
+	case S_IFREG:
+		inode->i_op = &dlmfs_file_inode_operations;
+		inode->i_fop = &dlmfs_file_operations;
+
+		i_size_write(inode,  DLM_LVB_LEN);
+
+		user_dlm_lock_res_init(&ip->ip_lockres, dentry);
+
+		/* released at clear_inode time, this insures that we
+		 * get to drop the dlm reference on each lock *before*
+		 * we call the unregister code for releasing parent
+		 * directories. */
+		ip->ip_parent = igrab(parent);
+		BUG_ON(!ip->ip_parent);
+		break;
+	case S_IFDIR:
+		inode->i_op = &dlmfs_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* directory inodes start off with i_nlink ==
+		 * 2 (for "." entry) */
+		inc_nlink(inode);
+		break;
+	}
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+/* SMP-safe */
+static int dlmfs_mkdir(struct inode * dir,
+		       struct dentry * dentry,
+		       umode_t mode)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct qstr *domain = &dentry->d_name;
+	struct dlmfs_inode_private *ip;
+	struct ocfs2_cluster_connection *conn;
+
+	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
+
+	/* verify that we have a proper domain */
+	if (domain->len >= GROUP_NAME_MAX) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid domain name for directory.\n");
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ip = DLMFS_I(inode);
+
+	conn = user_dlm_register(domain);
+	if (IS_ERR(conn)) {
+		status = PTR_ERR(conn);
+		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
+		     status, domain->len, domain->name);
+		goto bail;
+	}
+	ip->ip_conn = conn;
+
+	inc_nlink(dir);
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+
+	status = 0;
+bail:
+	if (status < 0)
+		iput(inode);
+	return status;
+}
+
+static int dlmfs_create(struct inode *dir,
+			struct dentry *dentry,
+			umode_t mode,
+			bool excl)
+{
+	int status = 0;
+	struct inode *inode;
+	struct qstr *name = &dentry->d_name;
+
+	mlog(0, "create %.*s\n", name->len, name->name);
+
+	/* verify name is valid and doesn't contain any dlm reserved
+	 * characters */
+	if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
+	    name->name[0] == '$') {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
+		     name->name);
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+bail:
+	return status;
+}
+
+static int dlmfs_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	struct inode *inode = d_inode(dentry);
+
+	mlog(0, "unlink inode %lu\n", inode->i_ino);
+
+	/* if there are no current holders, or none that are waiting
+	 * to acquire a lock, this basically destroys our lockres. */
+	status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
+	if (status < 0) {
+		mlog(ML_ERROR, "unlink %pd, error %d from destroy\n",
+		     dentry, status);
+		goto bail;
+	}
+	status = simple_unlink(dir, dentry);
+bail:
+	return status;
+}
+
+static int dlmfs_fill_super(struct super_block * sb,
+			    void * data,
+			    int silent)
+{
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = DLMFS_MAGIC;
+	sb->s_op = &dlmfs_ops;
+	sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
+	if (!sb->s_root)
+		return -ENOMEM;
+	return 0;
+}
+
+static const struct file_operations dlmfs_file_operations = {
+	.open		= dlmfs_file_open,
+	.release	= dlmfs_file_release,
+	.poll		= dlmfs_file_poll,
+	.read		= dlmfs_file_read,
+	.write		= dlmfs_file_write,
+	.llseek		= default_llseek,
+};
+
+static const struct inode_operations dlmfs_dir_inode_operations = {
+	.create		= dlmfs_create,
+	.lookup		= simple_lookup,
+	.unlink		= dlmfs_unlink,
+};
+
+/* this way we can restrict mkdir to only the toplevel of the fs. */
+static const struct inode_operations dlmfs_root_inode_operations = {
+	.lookup		= simple_lookup,
+	.mkdir		= dlmfs_mkdir,
+	.rmdir		= simple_rmdir,
+};
+
+static const struct super_operations dlmfs_ops = {
+	.statfs		= simple_statfs,
+	.alloc_inode	= dlmfs_alloc_inode,
+	.destroy_inode	= dlmfs_destroy_inode,
+	.evict_inode	= dlmfs_evict_inode,
+	.drop_inode	= generic_delete_inode,
+};
+
+static const struct inode_operations dlmfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+	.setattr	= dlmfs_file_setattr,
+};
+
+static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
+}
+
+static struct file_system_type dlmfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ocfs2_dlmfs",
+	.mount		= dlmfs_mount,
+	.kill_sb	= kill_litter_super,
+};
+MODULE_ALIAS_FS("ocfs2_dlmfs");
+
+static int __init init_dlmfs_fs(void)
+{
+	int status;
+	int cleanup_inode = 0, cleanup_worker = 0;
+
+	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
+				sizeof(struct dlmfs_inode_private),
+				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+					SLAB_MEM_SPREAD),
+				dlmfs_init_once);
+	if (!dlmfs_inode_cache) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	cleanup_inode = 1;
+
+	user_dlm_worker = create_singlethread_workqueue("user_dlm");
+	if (!user_dlm_worker) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	cleanup_worker = 1;
+
+	user_dlm_set_locking_protocol();
+	status = register_filesystem(&dlmfs_fs_type);
+bail:
+	if (status) {
+		if (cleanup_inode)
+			kmem_cache_destroy(dlmfs_inode_cache);
+		if (cleanup_worker)
+			destroy_workqueue(user_dlm_worker);
+	} else
+		printk("OCFS2 User DLM kernel interface loaded\n");
+	return status;
+}
+
+static void __exit exit_dlmfs_fs(void)
+{
+	unregister_filesystem(&dlmfs_fs_type);
+
+	flush_workqueue(user_dlm_worker);
+	destroy_workqueue(user_dlm_worker);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(dlmfs_inode_cache);
+
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
+
+module_init(init_dlmfs_fs)
+module_exit(exit_dlmfs_fs)
diff --git a/kernel/fs/ocfs2/dlmfs/userdlm.c b/kernel/fs/ocfs2/dlmfs/userdlm.c
new file mode 100644
index 000000000..0499e3fb7
--- /dev/null
+++ b/kernel/fs/ocfs2/dlmfs/userdlm.c
@@ -0,0 +1,688 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM.
+ *
+ * Many of the functions here are pared down versions of dlmglue.c
+ * functions.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/signal.h>
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+
+#include "ocfs2_lockingver.h"
+#include "stackglue.h"
+#include "userdlm.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+
+static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+	return container_of(lksb, struct user_lock_res, l_lksb);
+}
+
+static inline int user_check_wait_flag(struct user_lock_res *lockres,
+				       int flag)
+{
+	int ret;
+
+	spin_lock(&lockres->l_lock);
+	ret = lockres->l_flags & flag;
+	spin_unlock(&lockres->l_lock);
+
+	return ret;
+}
+
+static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
+}
+
+static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
+}
+
+/* I heart container_of... */
+static inline struct ocfs2_cluster_connection *
+cluster_connection_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return ip->ip_conn;
+}
+
+static struct inode *
+user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return &ip->ip_vfs_inode;
+}
+
+static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+}
+
+#define user_log_dlm_error(_func, _stat, _lockres) do {			\
+	mlog(ML_ERROR, "Dlm error %d while calling %s on "		\
+		"resource %.*s\n", _stat, _func,			\
+		_lockres->l_namelen, _lockres->l_name); 		\
+} while (0)
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int user_highest_compat_lock_level(int level)
+{
+	int new_level = DLM_LOCK_EX;
+
+	if (level == DLM_LOCK_EX)
+		new_level = DLM_LOCK_NL;
+	else if (level == DLM_LOCK_PR)
+		new_level = DLM_LOCK_PR;
+	return new_level;
+}
+
+static void user_ast(struct ocfs2_dlm_lksb *lksb)
+{
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
+	int status;
+
+	mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_level,
+	     lockres->l_requested);
+
+	spin_lock(&lockres->l_lock);
+
+	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+	if (status) {
+		mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
+		     status, lockres->l_namelen, lockres->l_name);
+		spin_unlock(&lockres->l_lock);
+		return;
+	}
+
+	mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
+			"Lockres %.*s, requested ivmode. flags 0x%x\n",
+			lockres->l_namelen, lockres->l_name, lockres->l_flags);
+
+	/* we're downconverting. */
+	if (lockres->l_requested < lockres->l_level) {
+		if (lockres->l_requested <=
+		    user_highest_compat_lock_level(lockres->l_blocking)) {
+			lockres->l_blocking = DLM_LOCK_NL;
+			lockres->l_flags &= ~USER_LOCK_BLOCKED;
+		}
+	}
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_requested = DLM_LOCK_IV;
+	lockres->l_flags |= USER_LOCK_ATTACHED;
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	if (!igrab(inode))
+		BUG();
+}
+
+static void user_dlm_unblock_lock(struct work_struct *work);
+
+static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
+{
+	if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
+		user_dlm_grab_inode_ref(lockres);
+
+		INIT_WORK(&lockres->l_work, user_dlm_unblock_lock);
+
+		queue_work(user_dlm_worker, &lockres->l_work);
+		lockres->l_flags |= USER_LOCK_QUEUED;
+	}
+}
+
+static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
+{
+	int queue = 0;
+
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
+		return;
+
+	switch (lockres->l_blocking) {
+	case DLM_LOCK_EX:
+		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+			queue = 1;
+		break;
+	case DLM_LOCK_PR:
+		if (!lockres->l_ex_holders)
+			queue = 1;
+		break;
+	default:
+		BUG();
+	}
+
+	if (queue)
+		__user_dlm_queue_lockres(lockres);
+}
+
+static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
+{
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
+
+	mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
+	     lockres->l_namelen, lockres->l_name, level, lockres->l_level);
+
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags |= USER_LOCK_BLOCKED;
+	if (level > lockres->l_blocking)
+		lockres->l_blocking = level;
+
+	__user_dlm_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
+{
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
+
+	mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_flags);
+
+	if (status)
+		mlog(ML_ERROR, "dlm returns status %d\n", status);
+
+	spin_lock(&lockres->l_lock);
+	/* The teardown flag gets set early during the unlock process,
+	 * so test the cancel flag to make sure that this ast isn't
+	 * for a concurrent cancel. */
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
+	    && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
+		lockres->l_level = DLM_LOCK_IV;
+	} else if (status == DLM_CANCELGRANT) {
+		/* We tried to cancel a convert request, but it was
+		 * already granted. Don't clear the busy flag - the
+		 * ast should've done this already. */
+		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		goto out_noclear;
+	} else {
+		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
+		/* Cancel succeeded, we want to re-queue */
+		lockres->l_requested = DLM_LOCK_IV; /* cancel an
+						    * upconvert
+						    * request. */
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		/* we want the unblock thread to look at it again
+		 * now. */
+		if (lockres->l_flags & USER_LOCK_BLOCKED)
+			__user_dlm_queue_lockres(lockres);
+	}
+
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+out_noclear:
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+/*
+ * This is the userdlmfs locking protocol version.
+ *
+ * See fs/ocfs2/dlmglue.c for more details on locking versions.
+ */
+static struct ocfs2_locking_protocol user_dlm_lproto = {
+	.lp_max_version = {
+		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+	},
+	.lp_lock_ast		= user_ast,
+	.lp_blocking_ast	= user_bast,
+	.lp_unlock_ast		= user_unlock_ast,
+};
+
+static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	iput(inode);
+}
+
+static void user_dlm_unblock_lock(struct work_struct *work)
+{
+	int new_level, status;
+	struct user_lock_res *lockres =
+		container_of(work, struct user_lock_res, l_work);
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
+
+	mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
+			"Lockres %.*s, flags 0x%x\n",
+			lockres->l_namelen, lockres->l_name, lockres->l_flags);
+
+	/* notice that we don't clear USER_LOCK_BLOCKED here. If it's
+	 * set, we want user_ast clear it. */
+	lockres->l_flags &= ~USER_LOCK_QUEUED;
+
+	/* It's valid to get here and no longer be blocked - if we get
+	 * several basts in a row, we might be queued by the first
+	 * one, the unblock thread might run and clear the queued
+	 * flag, and finally we might get another bast which re-queues
+	 * us before our ast for the downconvert is called. */
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
+		mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
+		     lockres->l_namelen, lockres->l_name);
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
+		     lockres->l_namelen, lockres->l_name);
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
+	if (lockres->l_flags & USER_LOCK_BUSY) {
+		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+			mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
+			     lockres->l_namelen, lockres->l_name);
+			spin_unlock(&lockres->l_lock);
+			goto drop_ref;
+		}
+
+		lockres->l_flags |= USER_LOCK_IN_CANCEL;
+		spin_unlock(&lockres->l_lock);
+
+		status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
+					  DLM_LKF_CANCEL);
+		if (status)
+			user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
+		goto drop_ref;
+	}
+
+	/* If there are still incompat holders, we can exit safely
+	 * without worrying about re-queueing this lock as that will
+	 * happen on the last call to user_cluster_unlock. */
+	if ((lockres->l_blocking == DLM_LOCK_EX)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
+		     lockres->l_namelen, lockres->l_name,
+		     lockres->l_ex_holders, lockres->l_ro_holders);
+		goto drop_ref;
+	}
+
+	if ((lockres->l_blocking == DLM_LOCK_PR)
+	    && lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
+		     lockres->l_namelen, lockres->l_name,
+		     lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	/* yay, we can downconvert now. */
+	new_level = user_highest_compat_lock_level(lockres->l_blocking);
+	lockres->l_requested = new_level;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
+	spin_unlock(&lockres->l_lock);
+
+	/* need lock downconvert request now... */
+	status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
+				DLM_LKF_CONVERT|DLM_LKF_VALBLK,
+				lockres->l_name,
+				lockres->l_namelen);
+	if (status) {
+		user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
+		user_recover_from_dlm_error(lockres);
+	}
+
+drop_ref:
+	user_dlm_drop_inode_ref(lockres);
+}
+
+static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch(level) {
+	case DLM_LOCK_EX:
+		lockres->l_ex_holders++;
+		break;
+	case DLM_LOCK_PR:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int
+user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
+				  int wanted)
+{
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+
+	return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
+}
+
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags)
+{
+	int status, local_flags;
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
+
+	if (level != DLM_LOCK_EX &&
+	    level != DLM_LOCK_PR) {
+		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
+		     lockres->l_namelen, lockres->l_name);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
+	     lockres->l_namelen, lockres->l_name, level, lkm_flags);
+
+again:
+	if (signal_pending(current)) {
+		status = -ERESTARTSYS;
+		goto bail;
+	}
+
+	spin_lock(&lockres->l_lock);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if ((lockres->l_flags & USER_LOCK_BUSY) &&
+	    (level > lockres->l_level)) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
+	    (!user_may_continue_on_blocked_lock(lockres, level))) {
+		/* is the lock is currently blocked on behalf of
+		 * another node */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_blocked_lock(lockres);
+		goto again;
+	}
+
+	if (level > lockres->l_level) {
+		local_flags = lkm_flags | DLM_LKF_VALBLK;
+		if (lockres->l_level != DLM_LOCK_IV)
+			local_flags |= DLM_LKF_CONVERT;
+
+		lockres->l_requested = level;
+		lockres->l_flags |= USER_LOCK_BUSY;
+		spin_unlock(&lockres->l_lock);
+
+		BUG_ON(level == DLM_LOCK_IV);
+		BUG_ON(level == DLM_LOCK_NL);
+
+		/* call dlm_lock to upgrade lock now */
+		status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
+					local_flags, lockres->l_name,
+					lockres->l_namelen);
+		if (status) {
+			if ((lkm_flags & DLM_LKF_NOQUEUE) &&
+			    (status != -EAGAIN))
+				user_log_dlm_error("ocfs2_dlm_lock",
+						   status, lockres);
+			user_recover_from_dlm_error(lockres);
+			goto bail;
+		}
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	user_dlm_inc_holders(lockres, level);
+	spin_unlock(&lockres->l_lock);
+
+	status = 0;
+bail:
+	return status;
+}
+
+static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch(level) {
+	case DLM_LOCK_EX:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case DLM_LOCK_PR:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+}
+
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level)
+{
+	if (level != DLM_LOCK_EX &&
+	    level != DLM_LOCK_PR) {
+		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
+		     lockres->l_namelen, lockres->l_name);
+		return;
+	}
+
+	spin_lock(&lockres->l_lock);
+	user_dlm_dec_holders(lockres, level);
+	__user_dlm_cond_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < DLM_LOCK_EX);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	memcpy(lvb, val, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+ssize_t user_dlm_read_lvb(struct inode *inode,
+			  char *val,
+			  unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb;
+	ssize_t ret = len;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < DLM_LOCK_PR);
+	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
+		lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+		memcpy(val, lvb, len);
+	} else
+		ret = 0;
+
+	spin_unlock(&lockres->l_lock);
+	return ret;
+}
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry)
+{
+	memset(lockres, 0, sizeof(*lockres));
+
+	spin_lock_init(&lockres->l_lock);
+	init_waitqueue_head(&lockres->l_event);
+	lockres->l_level = DLM_LOCK_IV;
+	lockres->l_requested = DLM_LOCK_IV;
+	lockres->l_blocking = DLM_LOCK_IV;
+
+	/* should have been checked before getting here. */
+	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
+
+	memcpy(lockres->l_name,
+	       dentry->d_name.name,
+	       dentry->d_name.len);
+	lockres->l_namelen = dentry->d_name.len;
+}
+
+int user_dlm_destroy_lock(struct user_lock_res *lockres)
+{
+	int status = -EBUSY;
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
+
+	mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		spin_unlock(&lockres->l_lock);
+		return 0;
+	}
+
+	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
+
+	while (lockres->l_flags & USER_LOCK_BUSY) {
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_busy_lock(lockres);
+
+		spin_lock(&lockres->l_lock);
+	}
+
+	if (lockres->l_ro_holders || lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		goto bail;
+	}
+
+	status = 0;
+	if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+		spin_unlock(&lockres->l_lock);
+		goto bail;
+	}
+
+	lockres->l_flags &= ~USER_LOCK_ATTACHED;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
+	if (status) {
+		user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
+		goto bail;
+	}
+
+	user_wait_on_busy_lock(lockres);
+
+	status = 0;
+bail:
+	return status;
+}
+
+static void user_dlm_recovery_handler_noop(int node_num,
+					   void *recovery_data)
+{
+	/* We ignore recovery events */
+	return;
+}
+
+void user_dlm_set_locking_protocol(void)
+{
+	ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
+}
+
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
+{
+	int rc;
+	struct ocfs2_cluster_connection *conn;
+
+	rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
+					    &user_dlm_lproto,
+					    user_dlm_recovery_handler_noop,
+					    NULL, &conn);
+	if (rc)
+		mlog_errno(rc);
+
+	return rc ? ERR_PTR(rc) : conn;
+}
+
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
+{
+	ocfs2_cluster_disconnect(conn, 0);
+}
diff --git a/kernel/fs/ocfs2/dlmfs/userdlm.h b/kernel/fs/ocfs2/dlmfs/userdlm.h
new file mode 100644
index 000000000..3b42d7953
--- /dev/null
+++ b/kernel/fs/ocfs2/dlmfs/userdlm.h
@@ -0,0 +1,113 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.h
+ *
+ * Userspace dlm defines
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef USERDLM_H
+#define USERDLM_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/* user_lock_res->l_flags flags. */
+#define USER_LOCK_ATTACHED      (0x00000001) /* we have initialized
+					       * the lvb */
+#define USER_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define USER_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					      * downconvert*/
+#define USER_LOCK_IN_TEARDOWN   (0x00000008) /* we're currently
+					      * destroying this
+					      * lock. */
+#define USER_LOCK_QUEUED        (0x00000010) /* lock is on the
+					      * workqueue */
+#define USER_LOCK_IN_CANCEL     (0x00000020)
+
+struct user_lock_res {
+	spinlock_t               l_lock;
+
+	int                      l_flags;
+
+#define USER_DLM_LOCK_ID_MAX_LEN  32
+	char                     l_name[USER_DLM_LOCK_ID_MAX_LEN];
+	int                      l_namelen;
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct ocfs2_dlm_lksb    l_lksb;
+
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct work_struct       l_work;
+};
+
+extern struct workqueue_struct *user_dlm_worker;
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry);
+int user_dlm_destroy_lock(struct user_lock_res *lockres);
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags);
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level);
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len);
+ssize_t user_dlm_read_lvb(struct inode *inode,
+			  char *val,
+			  unsigned int len);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
+void user_dlm_set_locking_protocol(void);
+
+struct dlmfs_inode_private {
+	struct ocfs2_cluster_connection	*ip_conn;
+
+	struct user_lock_res ip_lockres; /* unused for directories. */
+	struct inode         *ip_parent;
+
+	struct inode         ip_vfs_inode;
+};
+
+static inline struct dlmfs_inode_private *
+DLMFS_I(struct inode *inode)
+{
+        return container_of(inode,
+			    struct dlmfs_inode_private,
+			    ip_vfs_inode);
+}
+
+struct dlmfs_filp_private {
+	int                  fp_lock_level;
+};
+
+#define DLMFS_MAGIC	0x76a9f425
+
+#endif /* USERDLM_H */
diff --git a/kernel/fs/ocfs2/dlmglue.c b/kernel/fs/ocfs2/dlmglue.c
new file mode 100644
index 000000000..8b23aa2f5
--- /dev/null
+++ b/kernel/fs/ocfs2/dlmglue.c
@@ -0,0 +1,4106 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.c
+ *
+ * Code which implements an OCFS2 specific interface to our DLM.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/time.h>
+#include <linux/quotaops.h>
+
+#define MLOG_MASK_PREFIX ML_DLM_GLUE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_lockingver.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "stackglue.h"
+#include "slot_map.h"
+#include "super.h"
+#include "uptodate.h"
+#include "quota.h"
+#include "refcounttree.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_mask_waiter {
+	struct list_head	mw_item;
+	int			mw_status;
+	struct completion	mw_complete;
+	unsigned long		mw_mask;
+	unsigned long		mw_goal;
+#ifdef CONFIG_OCFS2_FS_STATS
+	ktime_t			mw_lock_start;
+#endif
+};
+
+static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
+
+/*
+ * Return value from ->downconvert_worker functions.
+ *
+ * These control the precise actions of ocfs2_unblock_lock()
+ * and ocfs2_process_blocked_lock()
+ *
+ */
+enum ocfs2_unblock_action {
+	UNBLOCK_CONTINUE	= 0, /* Continue downconvert */
+	UNBLOCK_CONTINUE_POST	= 1, /* Continue downconvert, fire
+				      * ->post_unlock callback */
+	UNBLOCK_STOP_POST	= 2, /* Do not downconvert, fire
+				      * ->post_unlock() callback. */
+};
+
+struct ocfs2_unblock_ctl {
+	int requeue;
+	enum ocfs2_unblock_action unblock_action;
+};
+
+/* Lockdep class keys */
+struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+
+static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
+					int new_level);
+static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
+
+static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				     int blocking);
+
+static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
+				       int blocking);
+
+static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
+				     struct ocfs2_lock_res *lockres);
+
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
+
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+					    int new_level);
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+					 int blocking);
+
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+/* This aids in debugging situations where a bad LVB might be involved. */
+static void ocfs2_dump_meta_lvb_info(u64 level,
+				     const char *function,
+				     unsigned int line,
+				     struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+
+	mlog(level, "LVB information for %s (called from %s:%u):\n",
+	     lockres->l_name, function, line);
+	mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
+	     lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
+	     be32_to_cpu(lvb->lvb_igeneration));
+	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
+	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
+	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
+	     be16_to_cpu(lvb->lvb_imode));
+	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
+	     "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
+	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
+	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
+	     (long long)be64_to_cpu(lvb->lvb_imtime_packed),
+	     be32_to_cpu(lvb->lvb_iattr));
+}
+
+
+/*
+ * OCFS2 Lock Resource Operations
+ *
+ * These fine tune the behavior of the generic dlmglue locking infrastructure.
+ *
+ * The most basic of lock types can point ->l_priv to their respective
+ * struct ocfs2_super and allow the default actions to manage things.
+ *
+ * Right now, each lock type also needs to implement an init function,
+ * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres()
+ * should be called when the lock is no longer needed (i.e., object
+ * destruction time).
+ */
+struct ocfs2_lock_res_ops {
+	/*
+	 * Translate an ocfs2_lock_res * into an ocfs2_super *. Define
+	 * this callback if ->l_priv is not an ocfs2_super pointer
+	 */
+	struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
+
+	/*
+	 * Optionally called in the downconvert thread after a
+	 * successful downconvert. The lockres will not be referenced
+	 * after this callback is called, so it is safe to free
+	 * memory, etc.
+	 *
+	 * The exact semantics of when this is called are controlled
+	 * by ->downconvert_worker()
+	 */
+	void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *);
+
+	/*
+	 * Allow a lock type to add checks to determine whether it is
+	 * safe to downconvert a lock. Return 0 to re-queue the
+	 * downconvert at a later time, nonzero to continue.
+	 *
+	 * For most locks, the default checks that there are no
+	 * incompatible holders are sufficient.
+	 *
+	 * Called with the lockres spinlock held.
+	 */
+	int (*check_downconvert)(struct ocfs2_lock_res *, int);
+
+	/*
+	 * Allows a lock type to populate the lock value block. This
+	 * is called on downconvert, and when we drop a lock.
+	 *
+	 * Locks that want to use this should set LOCK_TYPE_USES_LVB
+	 * in the flags field.
+	 *
+	 * Called with the lockres spinlock held.
+	 */
+	void (*set_lvb)(struct ocfs2_lock_res *);
+
+	/*
+	 * Called from the downconvert thread when it is determined
+	 * that a lock will be downconverted. This is called without
+	 * any locks held so the function can do work that might
+	 * schedule (syncing out data, etc).
+	 *
+	 * This should return any one of the ocfs2_unblock_action
+	 * values, depending on what it wants the thread to do.
+	 */
+	int (*downconvert_worker)(struct ocfs2_lock_res *, int);
+
+	/*
+	 * LOCK_TYPE_* flags which describe the specific requirements
+	 * of a lock type. Descriptions of each individual flag follow.
+	 */
+	int flags;
+};
+
+/*
+ * Some locks want to "refresh" potentially stale data when a
+ * meaningful (PRMODE or EXMODE) lock level is first obtained. If this
+ * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the
+ * individual lockres l_flags member from the ast function. It is
+ * expected that the locking wrapper will clear the
+ * OCFS2_LOCK_NEEDS_REFRESH flag when done.
+ */
+#define LOCK_TYPE_REQUIRES_REFRESH 0x1
+
+/*
+ * Indicate that a lock type makes use of the lock value block. The
+ * ->set_lvb lock type callback must be defined.
+ */
+#define LOCK_TYPE_USES_LVB		0x2
+
+static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
+	.get_osb	= ocfs2_get_inode_osb,
+	.flags		= 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
+	.get_osb	= ocfs2_get_inode_osb,
+	.check_downconvert = ocfs2_check_meta_downconvert,
+	.set_lvb	= ocfs2_set_meta_lvb,
+	.downconvert_worker = ocfs2_data_convert_worker,
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_super_lops = {
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
+	.flags		= 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+	.flags		= 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
+	.get_osb	= ocfs2_get_dentry_osb,
+	.post_unlock	= ocfs2_dentry_post_unlock,
+	.downconvert_worker = ocfs2_dentry_convert_worker,
+	.flags		= 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+	.get_osb	= ocfs2_get_inode_osb,
+	.flags		= 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
+	.get_osb	= ocfs2_get_file_osb,
+	.flags		= 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+	.set_lvb	= ocfs2_set_qinfo_lvb,
+	.get_osb	= ocfs2_get_qinfo_osb,
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
+	.check_downconvert = ocfs2_check_refcount_downconvert,
+	.downconvert_worker = ocfs2_refcount_convert_worker,
+	.flags		= 0,
+};
+
+static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
+		lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+		lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
+}
+
+static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+	return container_of(lksb, struct ocfs2_lock_res, l_lksb);
+}
+
+static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	return (struct inode *) lockres->l_priv;
+}
+
+static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY);
+
+	return (struct ocfs2_dentry_lock *)lockres->l_priv;
+}
+
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
+
+	return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
+
+static inline struct ocfs2_refcount_tree *
+ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
+{
+	return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
+}
+
+static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
+{
+	if (lockres->l_ops->get_osb)
+		return lockres->l_ops->get_osb(lockres);
+
+	return (struct ocfs2_super *)lockres->l_priv;
+}
+
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     u32 dlm_flags);
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted);
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				   struct ocfs2_lock_res *lockres,
+				   int level, unsigned long caller_ip);
+static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres,
+					int level)
+{
+	__ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
+}
+
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres);
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert);
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do {					\
+	if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY)				\
+		mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n",	\
+		     _err, _func, _lockres->l_name);					\
+	else										\
+		mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n",	\
+		     _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name,	\
+		     (unsigned int)ocfs2_get_dentry_lock_ino(_lockres));		\
+} while (0)
+static int ocfs2_downconvert_thread(void *arg);
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres);
+static int ocfs2_inode_lock_update(struct inode *inode,
+				  struct buffer_head **bh);
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
+static inline int ocfs2_highest_compat_lock_level(int level);
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+					      int new_level);
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb,
+				  unsigned int generation);
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres);
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+
+
+static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
+				  u64 blkno,
+				  u32 generation,
+				  char *name)
+{
+	int len;
+
+	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
+
+	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
+		       ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
+		       (long long)blkno, generation);
+
+	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
+
+	mlog(0, "built lock resource with name: %s\n", name);
+}
+
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
+
+static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
+				       struct ocfs2_dlm_debug *dlm_debug)
+{
+	mlog(0, "Add tracking for lockres %s\n", res->l_name);
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
+{
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	if (!list_empty(&res->l_debug_list))
+		list_del_init(&res->l_debug_list);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+#ifdef CONFIG_OCFS2_FS_STATS
+static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+	res->l_lock_refresh = 0;
+	memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
+	memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
+}
+
+static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
+				    struct ocfs2_mask_waiter *mw, int ret)
+{
+	u32 usec;
+	ktime_t kt;
+	struct ocfs2_lock_stats *stats;
+
+	if (level == LKM_PRMODE)
+		stats = &res->l_lock_prmode;
+	else if (level == LKM_EXMODE)
+		stats = &res->l_lock_exmode;
+	else
+		return;
+
+	kt = ktime_sub(ktime_get(), mw->mw_lock_start);
+	usec = ktime_to_us(kt);
+
+	stats->ls_gets++;
+	stats->ls_total += ktime_to_ns(kt);
+	/* overflow */
+	if (unlikely(stats->ls_gets == 0)) {
+		stats->ls_gets++;
+		stats->ls_total = ktime_to_ns(kt);
+	}
+
+	if (stats->ls_max < usec)
+		stats->ls_max = usec;
+
+	if (ret)
+		stats->ls_fail++;
+}
+
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+	lockres->l_lock_refresh++;
+}
+
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+	mw->mw_lock_start = ktime_get();
+}
+#else
+static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+}
+static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
+			   int level, struct ocfs2_mask_waiter *mw, int ret)
+{
+}
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+}
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+}
+#endif
+
+static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *res,
+				       enum ocfs2_lock_type type,
+				       struct ocfs2_lock_res_ops *ops,
+				       void *priv)
+{
+	res->l_type          = type;
+	res->l_ops           = ops;
+	res->l_priv          = priv;
+
+	res->l_level         = DLM_LOCK_IV;
+	res->l_requested     = DLM_LOCK_IV;
+	res->l_blocking      = DLM_LOCK_IV;
+	res->l_action        = OCFS2_AST_INVALID;
+	res->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+	res->l_flags         = OCFS2_LOCK_INITIALIZED;
+
+	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+
+	ocfs2_init_lock_stats(res);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (type != OCFS2_LOCK_TYPE_OPEN)
+		lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
+				 &lockdep_keys[type], 0);
+	else
+		res->l_lockdep_map.key = NULL;
+#endif
+}
+
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
+{
+	/* This also clears out the lock status block */
+	memset(res, 0, sizeof(struct ocfs2_lock_res));
+	spin_lock_init(&res->l_lock);
+	init_waitqueue_head(&res->l_event);
+	INIT_LIST_HEAD(&res->l_blocked_list);
+	INIT_LIST_HEAD(&res->l_mask_waiters);
+}
+
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       unsigned int generation,
+			       struct inode *inode)
+{
+	struct ocfs2_lock_res_ops *ops;
+
+	switch(type) {
+		case OCFS2_LOCK_TYPE_RW:
+			ops = &ocfs2_inode_rw_lops;
+			break;
+		case OCFS2_LOCK_TYPE_META:
+			ops = &ocfs2_inode_inode_lops;
+			break;
+		case OCFS2_LOCK_TYPE_OPEN:
+			ops = &ocfs2_inode_open_lops;
+			break;
+		default:
+			mlog_bug_on_msg(1, "type: %d\n", type);
+			ops = NULL; /* thanks, gcc */
+			break;
+	};
+
+	ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
+			      generation, res->l_name);
+	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode);
+}
+
+static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
+{
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	return OCFS2_SB(inode->i_sb);
+}
+
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_mem_dqinfo *info = lockres->l_priv;
+
+	return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
+
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_file_private *fp = lockres->l_priv;
+
+	return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
+}
+
+static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
+{
+	__be64 inode_blkno_be;
+
+	memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START],
+	       sizeof(__be64));
+
+	return be64_to_cpu(inode_blkno_be);
+}
+
+static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_dentry_lock *dl = lockres->l_priv;
+
+	return OCFS2_SB(dl->dl_inode->i_sb);
+}
+
+void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
+				u64 parent, struct inode *inode)
+{
+	int len;
+	u64 inode_blkno = OCFS2_I(inode)->ip_blkno;
+	__be64 inode_blkno_be = cpu_to_be64(inode_blkno);
+	struct ocfs2_lock_res *lockres = &dl->dl_lockres;
+
+	ocfs2_lock_res_init_once(lockres);
+
+	/*
+	 * Unfortunately, the standard lock naming scheme won't work
+	 * here because we have two 16 byte values to use. Instead,
+	 * we'll stuff the inode number as a binary value. We still
+	 * want error prints to show something without garbling the
+	 * display, so drop a null byte in there before the inode
+	 * number. A future version of OCFS2 will likely use all
+	 * binary lock names. The stringified names have been a
+	 * tremendous aid in debugging, but now that the debugfs
+	 * interface exists, we can mangle things there if need be.
+	 *
+	 * NOTE: We also drop the standard "pad" value (the total lock
+	 * name size stays the same though - the last part is all
+	 * zeros due to the memset in ocfs2_lock_res_init_once()
+	 */
+	len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START,
+		       "%c%016llx",
+		       ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY),
+		       (long long)parent);
+
+	BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1));
+
+	memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be,
+	       sizeof(__be64));
+
+	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
+				   OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops,
+				   dl);
+}
+
+static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
+				      struct ocfs2_super *osb)
+{
+	/* Superblock lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO,
+			      0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
+				   &ocfs2_super_lops, osb);
+}
+
+static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
+				       struct ocfs2_super *osb)
+{
+	/* Rename lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME,
+				   &ocfs2_rename_lops, osb);
+}
+
+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+					 struct ocfs2_super *osb)
+{
+	/* nfs_sync lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+				   &ocfs2_nfs_sync_lops, osb);
+}
+
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+					    struct ocfs2_super *osb)
+{
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+				   &ocfs2_orphan_scan_lops, osb);
+}
+
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+			      struct ocfs2_file_private *fp)
+{
+	struct inode *inode = fp->fp_file->f_mapping->host;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
+			      inode->i_generation, lockres->l_name);
+	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
+				   OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
+				   fp);
+	lockres->l_flags |= OCFS2_LOCK_NOCACHE;
+}
+
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+			       struct ocfs2_mem_dqinfo *info)
+{
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
+			      0, lockres->l_name);
+	ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+				   OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+				   info);
+}
+
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+				  struct ocfs2_super *osb, u64 ref_blkno,
+				  unsigned int generation)
+{
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
+			      generation, lockres->l_name);
+	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
+				   &ocfs2_refcount_block_lops, osb);
+}
+
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+{
+	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
+		return;
+
+	ocfs2_remove_lockres_tracking(res);
+
+	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
+			"Lockres %s is on the blocked list\n",
+			res->l_name);
+	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
+			"Lockres %s has mask waiters pending\n",
+			res->l_name);
+	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
+			"Lockres %s is locked\n",
+			res->l_name);
+	mlog_bug_on_msg(res->l_ro_holders,
+			"Lockres %s has %u ro holders\n",
+			res->l_name, res->l_ro_holders);
+	mlog_bug_on_msg(res->l_ex_holders,
+			"Lockres %s has %u ex holders\n",
+			res->l_name, res->l_ex_holders);
+
+	/* Need to clear out the lock status block for the dlm */
+	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
+
+	res->l_flags = 0UL;
+}
+
+static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	BUG_ON(!lockres);
+
+	switch(level) {
+	case DLM_LOCK_EX:
+		lockres->l_ex_holders++;
+		break;
+	case DLM_LOCK_PR:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	BUG_ON(!lockres);
+
+	switch(level) {
+	case DLM_LOCK_EX:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case DLM_LOCK_PR:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int ocfs2_highest_compat_lock_level(int level)
+{
+	int new_level = DLM_LOCK_EX;
+
+	if (level == DLM_LOCK_EX)
+		new_level = DLM_LOCK_NL;
+	else if (level == DLM_LOCK_PR)
+		new_level = DLM_LOCK_PR;
+	return new_level;
+}
+
+static void lockres_set_flags(struct ocfs2_lock_res *lockres,
+			      unsigned long newflags)
+{
+	struct ocfs2_mask_waiter *mw, *tmp;
+
+ 	assert_spin_locked(&lockres->l_lock);
+
+	lockres->l_flags = newflags;
+
+	list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			continue;
+
+		list_del_init(&mw->mw_item);
+		mw->mw_status = 0;
+		complete(&mw->mw_complete);
+	}
+}
+static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
+{
+	lockres_set_flags(lockres, lockres->l_flags | or);
+}
+static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
+				unsigned long clear)
+{
+	lockres_set_flags(lockres, lockres->l_flags & ~clear);
+}
+
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
+
+	lockres->l_level = lockres->l_requested;
+	if (lockres->l_level <=
+	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
+		lockres->l_blocking = DLM_LOCK_NL;
+		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+	}
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+}
+
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+
+	/* Convert from RO to EX doesn't really need anything as our
+	 * information is already up to data. Convert from NL to
+	 * *anything* however should mark ourselves as needing an
+	 * update */
+	if (lockres->l_level == DLM_LOCK_NL &&
+	    lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+
+	/*
+	 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
+	 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
+	 * downconverting the lock before the upconvert has fully completed.
+	 * Do not prevent the dc thread from downconverting if NONBLOCK lock
+	 * had already returned.
+	 */
+	if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED))
+		lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+	else
+		lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED);
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+}
+
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+
+	if (lockres->l_requested > DLM_LOCK_NL &&
+	    !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
+	    lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+}
+
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	int needs_downconvert = 0;
+
+	assert_spin_locked(&lockres->l_lock);
+
+	if (level > lockres->l_blocking) {
+		/* only schedule a downconvert if we haven't already scheduled
+		 * one that goes low enough to satisfy the level we're
+		 * blocking.  this also catches the case where we get
+		 * duplicate BASTs */
+		if (ocfs2_highest_compat_lock_level(level) <
+		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
+			needs_downconvert = 1;
+
+		lockres->l_blocking = level;
+	}
+
+	mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
+	     lockres->l_name, level, lockres->l_level, lockres->l_blocking,
+	     needs_downconvert);
+
+	if (needs_downconvert)
+		lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+	mlog(0, "needs_downconvert = %d\n", needs_downconvert);
+	return needs_downconvert;
+}
+
+/*
+ * OCFS2_LOCK_PENDING and l_pending_gen.
+ *
+ * Why does OCFS2_LOCK_PENDING exist?  To close a race between setting
+ * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock().  See ocfs2_unblock_lock()
+ * for more details on the race.
+ *
+ * OCFS2_LOCK_PENDING closes the race quite nicely.  However, it introduces
+ * a race on itself.  In o2dlm, we can get the ast before ocfs2_dlm_lock()
+ * returns.  The ast clears OCFS2_LOCK_BUSY, and must therefore clear
+ * OCFS2_LOCK_PENDING at the same time.  When ocfs2_dlm_lock() returns,
+ * the caller is going to try to clear PENDING again.  If nothing else is
+ * happening, __lockres_clear_pending() sees PENDING is unset and does
+ * nothing.
+ *
+ * But what if another path (eg downconvert thread) has just started a
+ * new locking action?  The other path has re-set PENDING.  Our path
+ * cannot clear PENDING, because that will re-open the original race
+ * window.
+ *
+ * [Example]
+ *
+ * ocfs2_meta_lock()
+ *  ocfs2_cluster_lock()
+ *   set BUSY
+ *   set PENDING
+ *   drop l_lock
+ *   ocfs2_dlm_lock()
+ *    ocfs2_locking_ast()		ocfs2_downconvert_thread()
+ *     clear PENDING			 ocfs2_unblock_lock()
+ *					  take_l_lock
+ *					  !BUSY
+ *					  ocfs2_prepare_downconvert()
+ *					   set BUSY
+ *					   set PENDING
+ *					  drop l_lock
+ *   take l_lock
+ *   clear PENDING
+ *   drop l_lock
+ *			<window>
+ *					  ocfs2_dlm_lock()
+ *
+ * So as you can see, we now have a window where l_lock is not held,
+ * PENDING is not set, and ocfs2_dlm_lock() has not been called.
+ *
+ * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
+ * set by ocfs2_prepare_downconvert().  That wasn't nice.
+ *
+ * To solve this we introduce l_pending_gen.  A call to
+ * lockres_clear_pending() will only do so when it is passed a generation
+ * number that matches the lockres.  lockres_set_pending() will return the
+ * current generation number.  When ocfs2_cluster_lock() goes to clear
+ * PENDING, it passes the generation it got from set_pending().  In our
+ * example above, the generation numbers will *not* match.  Thus,
+ * ocfs2_cluster_lock() will not clear the PENDING set by
+ * ocfs2_prepare_downconvert().
+ */
+
+/* Unlocked version for ocfs2_locking_ast() */
+static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
+				    unsigned int generation,
+				    struct ocfs2_super *osb)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	/*
+	 * The ast and locking functions can race us here.  The winner
+	 * will clear pending, the loser will not.
+	 */
+	if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
+	    (lockres->l_pending_gen != generation))
+		return;
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
+	lockres->l_pending_gen++;
+
+	/*
+	 * The downconvert thread may have skipped us because we
+	 * were PENDING.  Wake it up.
+	 */
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		ocfs2_wake_downconvert_thread(osb);
+}
+
+/* Locked version for callers of ocfs2_dlm_lock() */
+static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
+				  unsigned int generation,
+				  struct ocfs2_super *osb)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	__lockres_clear_pending(lockres, generation, osb);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
+{
+	assert_spin_locked(&lockres->l_lock);
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+
+	lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
+
+	return lockres->l_pending_gen;
+}
+
+static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
+{
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
+	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
+	int needs_downconvert;
+	unsigned long flags;
+
+	BUG_ON(level <= DLM_LOCK_NL);
+
+	mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
+	     "type %s\n", lockres->l_name, level, lockres->l_level,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	/*
+	 * We can skip the bast for locks which don't enable caching -
+	 * they'll be dropped at the earliest possible time anyway.
+	 */
+	if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
+		return;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
+	if (needs_downconvert)
+		ocfs2_schedule_blocked_lock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	ocfs2_wake_downconvert_thread(osb);
+}
+
+static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
+{
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
+	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
+	unsigned long flags;
+	int status;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+
+	if (status == -EAGAIN) {
+		lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+		goto out;
+	}
+
+	if (status) {
+		mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
+		     lockres->l_name, status);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
+	     "level %d => %d\n", lockres->l_name, lockres->l_action,
+	     lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
+		     "flags 0x%lx, unlock: %u\n",
+		     lockres->l_name, lockres->l_action, lockres->l_flags,
+		     lockres->l_unlock_action);
+		BUG();
+	}
+out:
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+
+	/* Did we try to cancel this lock?  Clear that state */
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+	/*
+	 * We may have beaten the locking functions here.  We certainly
+	 * know that dlm_lock() has been called :-)
+	 * Because we can't have two lock calls in flight at once, we
+	 * can use lockres->l_pending_gen.
+	 */
+	__lockres_clear_pending(lockres, lockres->l_pending_gen,  osb);
+
+	wake_up(&lockres->l_event);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
+{
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
+	unsigned long flags;
+
+	mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
+	     lockres->l_name, lockres->l_unlock_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (error) {
+		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
+		     "unlock_action %d\n", error, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch(lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		/* Downconvert thread may have requeued this lock, we
+		 * need to wake it. */
+		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = DLM_LOCK_IV;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	wake_up(&lockres->l_event);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+/*
+ * This is the filesystem locking protocol.  It provides the lock handling
+ * hooks for the underlying DLM.  It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed.  The protocol is negotiated when joining
+ * the dlm domain.  A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes.  When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero.  If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased.  If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
+static struct ocfs2_locking_protocol lproto = {
+	.lp_max_version = {
+		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+	},
+	.lp_lock_ast		= ocfs2_locking_ast,
+	.lp_blocking_ast	= ocfs2_blocking_ast,
+	.lp_unlock_ast		= ocfs2_unlock_ast,
+};
+
+void ocfs2_set_locking_protocol(void)
+{
+	ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
+}
+
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+	if (convert)
+		lockres->l_action = OCFS2_AST_INVALID;
+	else
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+}
+
+/* Note: If we detect another process working on the lock (i.e.,
+ * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
+ * to do the right thing in that case.
+ */
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     u32 dlm_flags)
+{
+	int ret = 0;
+	unsigned long flags;
+	unsigned int gen;
+
+	mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
+	     dlm_flags);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
+	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	lockres->l_action = OCFS2_AST_ATTACH;
+	lockres->l_requested = level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	gen = lockres_set_pending(lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ret = ocfs2_dlm_lock(osb->cconn,
+			     level,
+			     &lockres->l_lksb,
+			     dlm_flags,
+			     lockres->l_name,
+			     OCFS2_LOCK_ID_MAX_LEN - 1);
+	lockres_clear_pending(lockres, gen, osb);
+	if (ret) {
+		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
+		ocfs2_recover_from_dlm_error(lockres, 1);
+	}
+
+	mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
+
+bail:
+	return ret;
+}
+
+static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
+					int flag)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ret = lockres->l_flags & flag;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+}
+
+static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
+}
+
+static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted)
+{
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
+}
+
+static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
+{
+	INIT_LIST_HEAD(&mw->mw_item);
+	init_completion(&mw->mw_complete);
+	ocfs2_init_start_time(mw);
+}
+
+static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
+{
+	wait_for_completion(&mw->mw_complete);
+	/* Re-arm the completion in case we want to wait on it again */
+	reinit_completion(&mw->mw_complete);
+	return mw->mw_status;
+}
+
+static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
+				    struct ocfs2_mask_waiter *mw,
+				    unsigned long mask,
+				    unsigned long goal)
+{
+	BUG_ON(!list_empty(&mw->mw_item));
+
+	assert_spin_locked(&lockres->l_lock);
+
+	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
+	mw->mw_mask = mask;
+	mw->mw_goal = goal;
+}
+
+/* returns 0 if the mw that was removed was already satisfied, -EBUSY
+ * if the mask still hadn't reached its goal */
+static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+				      struct ocfs2_mask_waiter *mw)
+{
+	int ret = 0;
+
+	assert_spin_locked(&lockres->l_lock);
+	if (!list_empty(&mw->mw_item)) {
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			ret = -EBUSY;
+
+		list_del_init(&mw->mw_item);
+		init_completion(&mw->mw_complete);
+	}
+
+	return ret;
+}
+
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+				      struct ocfs2_mask_waiter *mw)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ret = __lockres_remove_mask_waiter(lockres, mw);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+
+}
+
+static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
+					     struct ocfs2_lock_res *lockres)
+{
+	int ret;
+
+	ret = wait_for_completion_interruptible(&mw->mw_complete);
+	if (ret)
+		lockres_remove_mask_waiter(lockres, mw);
+	else
+		ret = mw->mw_status;
+	/* Re-arm the completion in case we want to wait on it again */
+	reinit_completion(&mw->mw_complete);
+	return ret;
+}
+
+static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres,
+				int level,
+				u32 lkm_flags,
+				int arg_flags,
+				int l_subclass,
+				unsigned long caller_ip)
+{
+	struct ocfs2_mask_waiter mw;
+	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
+	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
+	unsigned long flags;
+	unsigned int gen;
+	int noqueue_attempted = 0;
+	int dlm_locked = 0;
+
+	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
+		mlog_errno(-EINVAL);
+		return -EINVAL;
+	}
+
+	ocfs2_init_mask_waiter(&mw);
+
+	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
+		lkm_flags |= DLM_LKF_VALBLK;
+
+again:
+	wait = 0;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	if (catch_signals && signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		goto unlock;
+	}
+
+	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
+			"Cluster lock called on freeing lockres %s! flags "
+			"0x%lx\n", lockres->l_name, lockres->l_flags);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
+	    level > lockres->l_level) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
+		/*
+		 * We've upconverted. If the lock now has a level we can
+		 * work with, we take it. If, however, the lock is not at the
+		 * required level, we go thru the full cycle. One way this could
+		 * happen is if a process requesting an upconvert to PR is
+		 * closely followed by another requesting upconvert to an EX.
+		 * If the process requesting EX lands here, we want it to
+		 * continue attempting to upconvert and let the process
+		 * requesting PR take the lock.
+		 * If multiple processes request upconvert to PR, the first one
+		 * here will take the lock. The others will have to go thru the
+		 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
+		 * downconvert request.
+		 */
+		if (level <= lockres->l_level)
+			goto update_holders;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
+	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
+		/* is the lock is currently blocked on behalf of
+		 * another node */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (level > lockres->l_level) {
+		if (noqueue_attempted > 0) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
+		if (lkm_flags & DLM_LKF_NOQUEUE)
+			noqueue_attempted = 1;
+
+		if (lockres->l_action != OCFS2_AST_INVALID)
+			mlog(ML_ERROR, "lockres %s has action %u pending\n",
+			     lockres->l_name, lockres->l_action);
+
+		if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+			lockres->l_action = OCFS2_AST_ATTACH;
+			lkm_flags &= ~DLM_LKF_CONVERT;
+		} else {
+			lockres->l_action = OCFS2_AST_CONVERT;
+			lkm_flags |= DLM_LKF_CONVERT;
+		}
+
+		lockres->l_requested = level;
+		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+		gen = lockres_set_pending(lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		BUG_ON(level == DLM_LOCK_IV);
+		BUG_ON(level == DLM_LOCK_NL);
+
+		mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
+		     lockres->l_name, lockres->l_level, level);
+
+		/* call dlm_lock to upgrade lock now */
+		ret = ocfs2_dlm_lock(osb->cconn,
+				     level,
+				     &lockres->l_lksb,
+				     lkm_flags,
+				     lockres->l_name,
+				     OCFS2_LOCK_ID_MAX_LEN - 1);
+		lockres_clear_pending(lockres, gen, osb);
+		if (ret) {
+			if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
+			    (ret != -EAGAIN)) {
+				ocfs2_log_dlm_error("ocfs2_dlm_lock",
+						    ret, lockres);
+			}
+			ocfs2_recover_from_dlm_error(lockres, 1);
+			goto out;
+		}
+		dlm_locked = 1;
+
+		mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
+		     lockres->l_name);
+
+		/* At this point we've gone inside the dlm and need to
+		 * complete our work regardless. */
+		catch_signals = 0;
+
+		/* wait for busy to clear and carry on */
+		goto again;
+	}
+
+update_holders:
+	/* Ok, if we get here then we're good to go. */
+	ocfs2_inc_holders(lockres, level);
+
+	ret = 0;
+unlock:
+	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+out:
+	/*
+	 * This is helping work around a lock inversion between the page lock
+	 * and dlm locks.  One path holds the page lock while calling aops
+	 * which block acquiring dlm locks.  The voting thread holds dlm
+	 * locks while acquiring page locks while down converting data locks.
+	 * This block is helping an aop path notice the inversion and back
+	 * off to unlock its page lock before trying the dlm lock again.
+	 */
+	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
+	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
+		wait = 0;
+		spin_lock_irqsave(&lockres->l_lock, flags);
+		if (__lockres_remove_mask_waiter(lockres, &mw)) {
+			if (dlm_locked)
+				lockres_or_flags(lockres,
+					OCFS2_LOCK_NONBLOCK_FINISHED);
+			spin_unlock_irqrestore(&lockres->l_lock, flags);
+			ret = -EAGAIN;
+		} else {
+			spin_unlock_irqrestore(&lockres->l_lock, flags);
+			goto again;
+		}
+	}
+	if (wait) {
+		ret = ocfs2_wait_for_mask(&mw);
+		if (ret == 0)
+			goto again;
+		mlog_errno(ret);
+	}
+	ocfs2_update_lock_stats(lockres, level, &mw, ret);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (!ret && lockres->l_lockdep_map.key != NULL) {
+		if (level == DLM_LOCK_PR)
+			rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
+				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+				caller_ip);
+		else
+			rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
+				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+				caller_ip);
+	}
+#endif
+	return ret;
+}
+
+static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
+				     struct ocfs2_lock_res *lockres,
+				     int level,
+				     u32 lkm_flags,
+				     int arg_flags)
+{
+	return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
+				    0, _RET_IP_);
+}
+
+
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				   struct ocfs2_lock_res *lockres,
+				   int level,
+				   unsigned long caller_ip)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ocfs2_dec_holders(lockres, level);
+	ocfs2_downconvert_on_unlock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (lockres->l_lockdep_map.key != NULL)
+		rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+#endif
+}
+
+static int ocfs2_create_new_lock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int ex,
+				 int local)
+{
+	int level =  ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	unsigned long flags;
+	u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ocfs2_lock_create(osb, lockres, level, lkm_flags);
+}
+
+/* Grants us an EX lock on the data and metadata resources, skipping
+ * the normal cluster directory lookup. Use this ONLY on newly created
+ * inodes which other nodes can't possibly see, and which haven't been
+ * hashed in the inode hash yet. This can give us a good performance
+ * increase as it'll skip the network broadcast normally associated
+ * with creating a new lock resource. */
+int ocfs2_create_new_inode_locks(struct inode *inode)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!inode);
+	BUG_ON(!ocfs2_inode_is_new(inode));
+
+	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	/* NOTE: That we don't increment any of the holder counts, nor
+	 * do we add anything to a journal handle. Since this is
+	 * supposed to be a new inode which the cluster doesn't know
+	 * about yet, there is no need to.  As far as the LVB handling
+	 * is concerned, this is basically like acquiring an EX lock
+	 * on a resource which has an invalid one -- we'll set it
+	 * valid when we release the EX. */
+
+	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	/*
+	 * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
+	 * don't use a generation in their lock names.
+	 */
+	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+int ocfs2_rw_lock(struct inode *inode, int write)
+{
+	int status, level;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!inode);
+
+	mlog(0, "inode %llu take %s RW lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
+				    0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_rw_unlock(struct inode *inode, int write)
+{
+	int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "inode %llu drop %s RW lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
+
+/*
+ * ocfs2_open_lock always get PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+	int status = 0;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!inode);
+
+	mlog(0, "inode %llu take PRMODE open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
+		goto out;
+
+	lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+				    DLM_LOCK_PR, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+out:
+	return status;
+}
+
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+	int status = 0, level;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!inode);
+
+	mlog(0, "inode %llu try to take %s open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (write)
+			status = -EROFS;
+		goto out;
+	}
+
+	if (ocfs2_mount_local(osb))
+		goto out;
+
+	lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+	/*
+	 * The file system may already holding a PRMODE/EXMODE open lock.
+	 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
+	 * other nodes and the -EAGAIN will indicate to the caller that
+	 * this inode is still in use.
+	 */
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+				    level, DLM_LKF_NOQUEUE, 0);
+
+out:
+	return status;
+}
+
+/*
+ * ocfs2_open_unlock unlock PR and EX mode open locks.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "inode %llu drop open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (ocfs2_mount_local(osb))
+		goto out;
+
+	if(lockres->l_ro_holders)
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+				     DLM_LOCK_PR);
+	if(lockres->l_ex_holders)
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+				     DLM_LOCK_EX);
+
+out:
+	return;
+}
+
+static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	int ret;
+	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
+	unsigned long flags;
+	struct ocfs2_mask_waiter mw;
+
+	ocfs2_init_mask_waiter(&mw);
+
+retry_cancel:
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		if (ret) {
+			spin_unlock_irqrestore(&lockres->l_lock, flags);
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+			goto retry_cancel;
+		}
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ocfs2_wait_for_mask(&mw);
+		goto retry_cancel;
+	}
+
+	ret = -ERESTARTSYS;
+	/*
+	 * We may still have gotten the lock, in which case there's no
+	 * point to restarting the syscall.
+	 */
+	if (lockres->l_level == level)
+		ret = 0;
+
+	mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
+	     lockres->l_flags, lockres->l_level, lockres->l_action);
+
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+out:
+	return ret;
+}
+
+/*
+ * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
+ * flock() calls. The locking approach this requires is sufficiently
+ * different from all other cluster lock types that we implement a
+ * separate path to the "low-level" dlm calls. In particular:
+ *
+ * - No optimization of lock levels is done - we take at exactly
+ *   what's been requested.
+ *
+ * - No lock caching is employed. We immediately downconvert to
+ *   no-lock at unlock time. This also means flock locks never go on
+ *   the blocking list).
+ *
+ * - Since userspace can trivially deadlock itself with flock, we make
+ *   sure to allow cancellation of a misbehaving applications flock()
+ *   request.
+ *
+ * - Access to any flock lockres doesn't require concurrency, so we
+ *   can simplify the code by requiring the caller to guarantee
+ *   serialization of dlmglue flock calls.
+ */
+int ocfs2_file_lock(struct file *file, int ex, int trylock)
+{
+	int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
+	unsigned long flags;
+	struct ocfs2_file_private *fp = file->private_data;
+	struct ocfs2_lock_res *lockres = &fp->fp_flock;
+	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+	struct ocfs2_mask_waiter mw;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
+	    (lockres->l_level > DLM_LOCK_NL)) {
+		mlog(ML_ERROR,
+		     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
+		     "level: %u\n", lockres->l_name, lockres->l_flags,
+		     lockres->l_level);
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/*
+		 * Get the lock at NLMODE to start - that way we
+		 * can cancel the upconvert request if need be.
+		 */
+		ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_wait_for_mask(&mw);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	lockres->l_action = OCFS2_AST_CONVERT;
+	lkm_flags |= DLM_LKF_CONVERT;
+	lockres->l_requested = level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+
+	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
+			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
+	if (ret) {
+		if (!trylock || (ret != -EAGAIN)) {
+			ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
+			ret = -EINVAL;
+		}
+
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		lockres_remove_mask_waiter(lockres, &mw);
+		goto out;
+	}
+
+	ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
+	if (ret == -ERESTARTSYS) {
+		/*
+		 * Userspace can cause deadlock itself with
+		 * flock(). Current behavior locally is to allow the
+		 * deadlock, but abort the system call if a signal is
+		 * received. We follow this example, otherwise a
+		 * poorly written program could sit in kernel until
+		 * reboot.
+		 *
+		 * Handling this is a bit more complicated for Ocfs2
+		 * though. We can't exit this function with an
+		 * outstanding lock request, so a cancel convert is
+		 * required. We intentionally overwrite 'ret' - if the
+		 * cancel fails and the lock was granted, it's easier
+		 * to just bubble success back up to the user.
+		 */
+		ret = ocfs2_flock_handle_signal(lockres, level);
+	} else if (!ret && (level > lockres->l_level)) {
+		/* Trylock failed asynchronously */
+		BUG_ON(!trylock);
+		ret = -EAGAIN;
+	}
+
+out:
+
+	mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
+	     lockres->l_name, ex, trylock, ret);
+	return ret;
+}
+
+void ocfs2_file_unlock(struct file *file)
+{
+	int ret;
+	unsigned int gen;
+	unsigned long flags;
+	struct ocfs2_file_private *fp = file->private_data;
+	struct ocfs2_lock_res *lockres = &fp->fp_flock;
+	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+	struct ocfs2_mask_waiter mw;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
+		return;
+
+	if (lockres->l_level == DLM_LOCK_NL)
+		return;
+
+	mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
+	     lockres->l_name, lockres->l_flags, lockres->l_level,
+	     lockres->l_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	/*
+	 * Fake a blocking ast for the downconvert code.
+	 */
+	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+	lockres->l_blocking = DLM_LOCK_EX;
+
+	gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
+	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
+	if (ret) {
+		mlog_errno(ret);
+		return;
+	}
+
+	ret = ocfs2_wait_for_mask(&mw);
+	if (ret)
+		mlog_errno(ret);
+}
+
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres)
+{
+	int kick = 0;
+
+	/* If we know that another node is waiting on our lock, kick
+	 * the downconvert thread * pre-emptively when we reach a release
+	 * condition. */
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
+		switch(lockres->l_blocking) {
+		case DLM_LOCK_EX:
+			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+				kick = 1;
+			break;
+		case DLM_LOCK_PR:
+			if (!lockres->l_ex_holders)
+				kick = 1;
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	if (kick)
+		ocfs2_wake_downconvert_thread(osb);
+}
+
+#define OCFS2_SEC_BITS   34
+#define OCFS2_SEC_SHIFT  (64 - 34)
+#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
+
+/* LVB only has room for 64 bits of time here so we pack it for
+ * now. */
+static u64 ocfs2_pack_timespec(struct timespec *spec)
+{
+	u64 res;
+	u64 sec = spec->tv_sec;
+	u32 nsec = spec->tv_nsec;
+
+	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
+
+	return res;
+}
+
+/* Call this with the lockres locked. I am reasonably sure we don't
+ * need ip_lock in this function as anyone who would be changing those
+ * values is supposed to be blocked in ocfs2_inode_lock right now. */
+static void __ocfs2_stuff_meta_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+
+	/*
+	 * Invalidate the LVB of a deleted inode - this way other
+	 * nodes are forced to go to disk and discover the new inode
+	 * status.
+	 */
+	if (oi->ip_flags & OCFS2_INODE_DELETED) {
+		lvb->lvb_version = 0;
+		goto out;
+	}
+
+	lvb->lvb_version   = OCFS2_LVB_VERSION;
+	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
+	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
+	lvb->lvb_iuid      = cpu_to_be32(i_uid_read(inode));
+	lvb->lvb_igid      = cpu_to_be32(i_gid_read(inode));
+	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
+	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
+	lvb->lvb_iatime_packed  =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
+	lvb->lvb_ictime_packed =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
+	lvb->lvb_imtime_packed =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+	lvb->lvb_iattr    = cpu_to_be32(oi->ip_attr);
+	lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
+	lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
+
+out:
+	mlog_meta_lvb(0, lockres);
+}
+
+static void ocfs2_unpack_timespec(struct timespec *spec,
+				  u64 packed_time)
+{
+	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
+	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
+}
+
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	mlog_meta_lvb(0, lockres);
+
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+
+	/* We're safe here without the lockres lock... */
+	spin_lock(&oi->ip_lock);
+	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
+	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
+
+	oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
+	oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
+	ocfs2_set_inode_flags(inode);
+
+	/* fast-symlinks are a special case */
+	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+
+	i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid));
+	i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
+	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
+	set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
+	ocfs2_unpack_timespec(&inode->i_atime,
+			      be64_to_cpu(lvb->lvb_iatime_packed));
+	ocfs2_unpack_timespec(&inode->i_mtime,
+			      be64_to_cpu(lvb->lvb_imtime_packed));
+	ocfs2_unpack_timespec(&inode->i_ctime,
+			      be64_to_cpu(lvb->lvb_ictime_packed));
+	spin_unlock(&oi->ip_lock);
+}
+
+static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
+					      struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+
+	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
+	    && lvb->lvb_version == OCFS2_LVB_VERSION
+	    && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
+		return 1;
+	return 0;
+}
+
+/* Determine whether a lock resource needs to be refreshed, and
+ * arbitrate who gets to refresh it.
+ *
+ *   0 means no refresh needed.
+ *
+ *   > 0 means you need to refresh this and you MUST call
+ *   ocfs2_complete_lock_res_refresh afterwards. */
+static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
+{
+	unsigned long flags;
+	int status = 0;
+
+refresh_check:
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ocfs2_wait_on_refreshing_lock(lockres);
+		goto refresh_check;
+	}
+
+	/* Ok, I'll be the one to refresh this lock. */
+	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = 1;
+bail:
+	mlog(0, "status %d\n", status);
+	return status;
+}
+
+/* If status is non zero, I'll mark it as not being in refresh
+ * anymroe, but i won't clear the needs refresh flag. */
+static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
+						   int status)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
+	if (!status)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+}
+
+/* may or may not return a bh if it went to disk. */
+static int ocfs2_inode_lock_update(struct inode *inode,
+				  struct buffer_head **bh)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_flags & OCFS2_INODE_DELETED) {
+		mlog(0, "Orphaned inode %llu was deleted while we "
+		     "were waiting on a lock. ip_flags = 0x%x\n",
+		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
+		spin_unlock(&oi->ip_lock);
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&oi->ip_lock);
+
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+
+	/* This will discard any caching information we might have had
+	 * for the inode metadata. */
+	ocfs2_metadata_cache_purge(INODE_CACHE(inode));
+
+	ocfs2_extent_map_trunc(inode, 0);
+
+	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
+		mlog(0, "Trusting LVB on inode %llu\n",
+		     (unsigned long long)oi->ip_blkno);
+		ocfs2_refresh_inode_from_lvb(inode);
+	} else {
+		/* Boo, we have to go to disk. */
+		/* read bh, cast, ocfs2_refresh_inode */
+		status = ocfs2_read_inode_block(inode, bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_refresh;
+		}
+		fe = (struct ocfs2_dinode *) (*bh)->b_data;
+
+		/* This is a good chance to make sure we're not
+		 * locking an invalid object.  ocfs2_read_inode_block()
+		 * already checked that the inode block is sane.
+		 *
+		 * We bug on a stale inode here because we checked
+		 * above whether it was wiped from disk. The wiping
+		 * node provides a guarantee that we receive that
+		 * message and can mark the inode before dropping any
+		 * locks associated with it. */
+		mlog_bug_on_msg(inode->i_generation !=
+				le32_to_cpu(fe->i_generation),
+				"Invalid dinode %llu disk generation: %u "
+				"inode->i_generation: %u\n",
+				(unsigned long long)oi->ip_blkno,
+				le32_to_cpu(fe->i_generation),
+				inode->i_generation);
+		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
+				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
+				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
+				(unsigned long long)oi->ip_blkno,
+				(unsigned long long)le64_to_cpu(fe->i_dtime),
+				le32_to_cpu(fe->i_flags));
+
+		ocfs2_refresh_inode(inode, fe);
+		ocfs2_track_lock_refresh(lockres);
+	}
+
+	status = 0;
+bail_refresh:
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	return status;
+}
+
+static int ocfs2_assign_bh(struct inode *inode,
+			   struct buffer_head **ret_bh,
+			   struct buffer_head *passed_bh)
+{
+	int status;
+
+	if (passed_bh) {
+		/* Ok, the update went to disk for us, use the
+		 * returned bh. */
+		*ret_bh = passed_bh;
+		get_bh(*ret_bh);
+
+		return 0;
+	}
+
+	status = ocfs2_read_inode_block(inode, ret_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * returns < 0 error if the callback will never be called, otherwise
+ * the result of the lock will be communicated via the callback.
+ */
+int ocfs2_inode_lock_full_nested(struct inode *inode,
+				 struct buffer_head **ret_bh,
+				 int ex,
+				 int arg_flags,
+				 int subclass)
+{
+	int status, level, acquired;
+	u32 dlm_flags;
+	struct ocfs2_lock_res *lockres = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *local_bh = NULL;
+
+	BUG_ON(!inode);
+
+	mlog(0, "inode %llu, take %s META lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	status = 0;
+	acquired = 0;
+	/* We'll allow faking a readonly metadata lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto getbh;
+	}
+
+	if (ocfs2_mount_local(osb))
+		goto local;
+
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		ocfs2_wait_for_recovery(osb);
+
+	lockres = &OCFS2_I(inode)->ip_inode_lockres;
+	level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	dlm_flags = 0;
+	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
+		dlm_flags |= DLM_LKF_NOQUEUE;
+
+	status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
+				      arg_flags, subclass, _RET_IP_);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* Notify the error cleanup path to drop the cluster lock. */
+	acquired = 1;
+
+	/* We wait twice because a node may have died while we were in
+	 * the lower dlm layers. The second time though, we've
+	 * committed to owning this lock so we don't allow signals to
+	 * abort the operation. */
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		ocfs2_wait_for_recovery(osb);
+
+local:
+	/*
+	 * We only see this flag if we're being called from
+	 * ocfs2_read_locked_inode(). It means we're locking an inode
+	 * which hasn't been populated yet, so clear the refresh flag
+	 * and let the caller handle it.
+	 */
+	if (inode->i_state & I_NEW) {
+		status = 0;
+		if (lockres)
+			ocfs2_complete_lock_res_refresh(lockres, 0);
+		goto bail;
+	}
+
+	/* This is fun. The caller may want a bh back, or it may
+	 * not. ocfs2_inode_lock_update definitely wants one in, but
+	 * may or may not read one, depending on what's in the
+	 * LVB. The result of all of this is that we've *only* gone to
+	 * disk if we have to, so the complexity is worthwhile. */
+	status = ocfs2_inode_lock_update(inode, &local_bh);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+getbh:
+	if (ret_bh) {
+		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+bail:
+	if (status < 0) {
+		if (ret_bh && (*ret_bh)) {
+			brelse(*ret_bh);
+			*ret_bh = NULL;
+		}
+		if (acquired)
+			ocfs2_inode_unlock(inode, ex);
+	}
+
+	if (local_bh)
+		brelse(local_bh);
+
+	return status;
+}
+
+/*
+ * This is working around a lock inversion between tasks acquiring DLM
+ * locks while holding a page lock and the downconvert thread which
+ * blocks dlm lock acquiry while acquiring page locks.
+ *
+ * ** These _with_page variantes are only intended to be called from aop
+ * methods that hold page locks and return a very specific *positive* error
+ * code that aop methods pass up to the VFS -- test for errors with != 0. **
+ *
+ * The DLM is called such that it returns -EAGAIN if it would have
+ * blocked waiting for the downconvert thread.  In that case we unlock
+ * our page so the downconvert thread can make progress.  Once we've
+ * done this we have to return AOP_TRUNCATED_PAGE so the aop method
+ * that called us can bubble that back up into the VFS who will then
+ * immediately retry the aop call.
+ *
+ * We do a blocking lock and immediate unlock before returning, though, so that
+ * the lock has a great chance of being cached on this node by the time the VFS
+ * calls back to retry the aop.    This has a potential to livelock as nodes
+ * ping locks back and forth, but that's a risk we're willing to take to avoid
+ * the lock inversion simply.
+ */
+int ocfs2_inode_lock_with_page(struct inode *inode,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
+			ocfs2_inode_unlock(inode, ex);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
+int ocfs2_inode_lock_atime(struct inode *inode,
+			  struct vfsmount *vfsmnt,
+			  int *level)
+{
+	int ret;
+
+	ret = ocfs2_inode_lock(inode, NULL, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * If we should update atime, we will get EX lock,
+	 * otherwise we just get PR lock.
+	 */
+	if (ocfs2_should_update_atime(inode, vfsmnt)) {
+		struct buffer_head *bh = NULL;
+
+		ocfs2_inode_unlock(inode, 0);
+		ret = ocfs2_inode_lock(inode, &bh, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			return ret;
+		}
+		*level = 1;
+		if (ocfs2_should_update_atime(inode, vfsmnt))
+			ocfs2_update_inode_atime(inode, bh);
+		if (bh)
+			brelse(bh);
+	} else
+		*level = 0;
+
+	return ret;
+}
+
+void ocfs2_inode_unlock(struct inode *inode,
+		       int ex)
+{
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "inode %llu drop %s META lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
+	    !ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
+
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
+{
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_orphan_scan_lvb *lvb;
+	int status = 0;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	lockres = &osb->osb_orphan_scan.os_lockres;
+	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
+	if (status < 0)
+		return status;
+
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+	    lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
+		*seqno = be32_to_cpu(lvb->lvb_os_seqno);
+	else
+		*seqno = osb->osb_orphan_scan.os_seqno + 1;
+
+	return status;
+}
+
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
+{
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_orphan_scan_lvb *lvb;
+
+	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
+		lockres = &osb->osb_orphan_scan.os_lockres;
+		lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+		lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+		lvb->lvb_os_seqno = cpu_to_be32(seqno);
+		ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+	}
+}
+
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex)
+{
+	int status = 0;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* The super block lock path is really in the best position to
+	 * know when resources covered by the lock need to be
+	 * refreshed, so we do it here. Of course, making sense of
+	 * everything is up to the caller :) */
+	status = ocfs2_should_refresh_lock_res(lockres);
+	if (status) {
+		status = ocfs2_refresh_slot_info(osb);
+
+		ocfs2_complete_lock_res_refresh(lockres, status);
+
+		if (status < 0) {
+			ocfs2_cluster_unlock(osb, lockres, level);
+			mlog_errno(status);
+		}
+		ocfs2_track_lock_refresh(lockres);
+	}
+bail:
+	return status;
+}
+
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex)
+{
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+int ocfs2_rename_lock(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_rename_unlock(struct ocfs2_super *osb)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+}
+
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
+				    0, 0);
+	if (status < 0)
+		mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+
+	return status;
+}
+
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres,
+				     ex ? LKM_EXMODE : LKM_PRMODE);
+}
+
+int ocfs2_dentry_lock(struct dentry *dentry, int ex)
+{
+	int ret;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+
+	BUG_ON(!dl);
+
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			return -EROFS;
+		return 0;
+	}
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
+{
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+
+	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
+}
+
+/* Reference counting of the dlm debug structure. We want this because
+ * open references on the debug inodes can live on after a mount, so
+ * we can't rely on the ocfs2_super to always exist. */
+static void ocfs2_dlm_debug_free(struct kref *kref)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
+
+	kfree(dlm_debug);
+}
+
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
+{
+	if (dlm_debug)
+		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
+}
+
+static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
+{
+	kref_get(&debug->d_refcnt);
+}
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
+	if (!dlm_debug) {
+		mlog_errno(-ENOMEM);
+		goto out;
+	}
+
+	kref_init(&dlm_debug->d_refcnt);
+	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
+	dlm_debug->d_locking_state = NULL;
+out:
+	return dlm_debug;
+}
+
+/* Access to this is arbitrated for us via seq_file->sem. */
+struct ocfs2_dlm_seq_priv {
+	struct ocfs2_dlm_debug *p_dlm_debug;
+	struct ocfs2_lock_res p_iter_res;
+	struct ocfs2_lock_res p_tmp_res;
+};
+
+static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
+						 struct ocfs2_dlm_seq_priv *priv)
+{
+	struct ocfs2_lock_res *iter, *ret = NULL;
+	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
+
+	assert_spin_locked(&ocfs2_dlm_tracking_lock);
+
+	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
+		/* discover the head of the list */
+		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
+			mlog(0, "End of list found, %p\n", ret);
+			break;
+		}
+
+		/* We track our "dummy" iteration lockres' by a NULL
+		 * l_ops field. */
+		if (iter->l_ops != NULL) {
+			ret = iter;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
+	if (iter) {
+		/* Since lockres' have the lifetime of their container
+		 * (which can be inodes, ocfs2_supers, etc) we want to
+		 * copy this out to a temporary lockres while still
+		 * under the spinlock. Obviously after this we can't
+		 * trust any pointers on the copy returned, but that's
+		 * ok as the information we want isn't typically held
+		 * in them. */
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter = v;
+	struct ocfs2_lock_res *dummy = &priv->p_iter_res;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(iter, priv);
+	list_del_init(&dummy->l_debug_list);
+	if (iter) {
+		list_add(&dummy->l_debug_list, &iter->l_debug_list);
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+/*
+ * Version is used by debugfs.ocfs2 to determine the format being used
+ *
+ * New in version 2
+ *	- Lock stats printed
+ * New in version 3
+ *	- Max time in lock stats is in usecs (instead of nsecs)
+ */
+#define OCFS2_DLM_DEBUG_STR_VERSION 3
+static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
+{
+	int i;
+	char *lvb;
+	struct ocfs2_lock_res *lockres = v;
+
+	if (!lockres)
+		return -EINVAL;
+
+	seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);
+
+	if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
+		seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1,
+			   lockres->l_name,
+			   (unsigned int)ocfs2_get_dentry_lock_ino(lockres));
+	else
+		seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name);
+
+	seq_printf(m, "%d\t"
+		   "0x%lx\t"
+		   "0x%x\t"
+		   "0x%x\t"
+		   "%u\t"
+		   "%u\t"
+		   "%d\t"
+		   "%d\t",
+		   lockres->l_level,
+		   lockres->l_flags,
+		   lockres->l_action,
+		   lockres->l_unlock_action,
+		   lockres->l_ro_holders,
+		   lockres->l_ex_holders,
+		   lockres->l_requested,
+		   lockres->l_blocking);
+
+	/* Dump the raw LVB */
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	for(i = 0; i < DLM_LVB_LEN; i++)
+		seq_printf(m, "0x%x\t", lvb[i]);
+
+#ifdef CONFIG_OCFS2_FS_STATS
+# define lock_num_prmode(_l)		((_l)->l_lock_prmode.ls_gets)
+# define lock_num_exmode(_l)		((_l)->l_lock_exmode.ls_gets)
+# define lock_num_prmode_failed(_l)	((_l)->l_lock_prmode.ls_fail)
+# define lock_num_exmode_failed(_l)	((_l)->l_lock_exmode.ls_fail)
+# define lock_total_prmode(_l)		((_l)->l_lock_prmode.ls_total)
+# define lock_total_exmode(_l)		((_l)->l_lock_exmode.ls_total)
+# define lock_max_prmode(_l)		((_l)->l_lock_prmode.ls_max)
+# define lock_max_exmode(_l)		((_l)->l_lock_exmode.ls_max)
+# define lock_refresh(_l)		((_l)->l_lock_refresh)
+#else
+# define lock_num_prmode(_l)		(0)
+# define lock_num_exmode(_l)		(0)
+# define lock_num_prmode_failed(_l)	(0)
+# define lock_num_exmode_failed(_l)	(0)
+# define lock_total_prmode(_l)		(0ULL)
+# define lock_total_exmode(_l)		(0ULL)
+# define lock_max_prmode(_l)		(0)
+# define lock_max_exmode(_l)		(0)
+# define lock_refresh(_l)		(0)
+#endif
+	/* The following seq_print was added in version 2 of this output */
+	seq_printf(m, "%u\t"
+		   "%u\t"
+		   "%u\t"
+		   "%u\t"
+		   "%llu\t"
+		   "%llu\t"
+		   "%u\t"
+		   "%u\t"
+		   "%u\t",
+		   lock_num_prmode(lockres),
+		   lock_num_exmode(lockres),
+		   lock_num_prmode_failed(lockres),
+		   lock_num_exmode_failed(lockres),
+		   lock_total_prmode(lockres),
+		   lock_total_exmode(lockres),
+		   lock_max_prmode(lockres),
+		   lock_max_exmode(lockres),
+		   lock_refresh(lockres));
+
+	/* End the line */
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static const struct seq_operations ocfs2_dlm_seq_ops = {
+	.start =	ocfs2_dlm_seq_start,
+	.stop =		ocfs2_dlm_seq_stop,
+	.next =		ocfs2_dlm_seq_next,
+	.show =		ocfs2_dlm_seq_show,
+};
+
+static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct ocfs2_dlm_seq_priv *priv = seq->private;
+	struct ocfs2_lock_res *res = &priv->p_iter_res;
+
+	ocfs2_remove_lockres_tracking(res);
+	ocfs2_put_dlm_debug(priv->p_dlm_debug);
+	return seq_release_private(inode, file);
+}
+
+static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
+{
+	struct ocfs2_dlm_seq_priv *priv;
+	struct ocfs2_super *osb;
+
+	priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
+	if (!priv) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	osb = inode->i_private;
+	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
+	priv->p_dlm_debug = osb->osb_dlm_debug;
+	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
+
+	ocfs2_add_lockres_tracking(&priv->p_iter_res,
+				   priv->p_dlm_debug);
+
+	return 0;
+}
+
+static const struct file_operations ocfs2_dlm_debug_fops = {
+	.open =		ocfs2_dlm_debug_open,
+	.release =	ocfs2_dlm_debug_release,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+};
+
+static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
+							 S_IFREG|S_IRUSR,
+							 osb->osb_debug_root,
+							 osb,
+							 &ocfs2_dlm_debug_fops);
+	if (!dlm_debug->d_locking_state) {
+		ret = -EINVAL;
+		mlog(ML_ERROR,
+		     "Unable to create locking state debugfs file.\n");
+		goto out;
+	}
+
+	ocfs2_get_dlm_debug(dlm_debug);
+out:
+	return ret;
+}
+
+static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
+{
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	if (dlm_debug) {
+		debugfs_remove(dlm_debug->d_locking_state);
+		ocfs2_put_dlm_debug(dlm_debug);
+	}
+}
+
+int ocfs2_dlm_init(struct ocfs2_super *osb)
+{
+	int status = 0;
+	struct ocfs2_cluster_connection *conn = NULL;
+
+	if (ocfs2_mount_local(osb)) {
+		osb->node_num = 0;
+		goto local;
+	}
+
+	status = ocfs2_dlm_init_debug(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* launch downconvert thread */
+	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+	if (IS_ERR(osb->dc_task)) {
+		status = PTR_ERR(osb->dc_task);
+		osb->dc_task = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* for now, uuid == domain */
+	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+				       osb->osb_cluster_name,
+				       strlen(osb->osb_cluster_name),
+				       osb->uuid_str,
+				       strlen(osb->uuid_str),
+				       &lproto, ocfs2_do_node_down, osb,
+				       &conn);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_cluster_this_node(conn, &osb->node_num);
+	if (status < 0) {
+		mlog_errno(status);
+		mlog(ML_ERROR,
+		     "could not find this host's node number\n");
+		ocfs2_cluster_disconnect(conn, 0);
+		goto bail;
+	}
+
+local:
+	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
+	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+	ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
+
+	osb->cconn = conn;
+
+	status = 0;
+bail:
+	if (status < 0) {
+		ocfs2_dlm_shutdown_debug(osb);
+		if (osb->dc_task)
+			kthread_stop(osb->dc_task);
+	}
+
+	return status;
+}
+
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
+			int hangup_pending)
+{
+	ocfs2_drop_osb_locks(osb);
+
+	/*
+	 * Now that we have dropped all locks and ocfs2_dismount_volume()
+	 * has disabled recovery, the DLM won't be talking to us.  It's
+	 * safe to tear things down before disconnecting the cluster.
+	 */
+
+	if (osb->dc_task) {
+		kthread_stop(osb->dc_task);
+		osb->dc_task = NULL;
+	}
+
+	ocfs2_lock_res_free(&osb->osb_super_lockres);
+	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
+	ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
+
+	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+	osb->cconn = NULL;
+
+	ocfs2_dlm_shutdown_debug(osb);
+}
+
+static int ocfs2_drop_lock(struct ocfs2_super *osb,
+			   struct ocfs2_lock_res *lockres)
+{
+	int ret;
+	unsigned long flags;
+	u32 lkm_flags = 0;
+
+	/* We didn't get anywhere near actually using this lockres. */
+	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
+		goto out;
+
+	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
+		lkm_flags |= DLM_LKF_VALBLK;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
+			"lockres %s, flags 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
+		     "%u, unlock_action = %u\n",
+		     lockres->l_name, lockres->l_flags, lockres->l_action,
+		     lockres->l_unlock_action);
+
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/* XXX: Today we just wait on any busy
+		 * locks... Perhaps we need to cancel converts in the
+		 * future? */
+		ocfs2_wait_on_busy_lock(lockres);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
+		if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+		    lockres->l_level == DLM_LOCK_EX &&
+		    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
+			lockres->l_ops->set_lvb(lockres);
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
+		     lockres->l_name);
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto out;
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
+	if (ret) {
+		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
+		ocfs2_dlm_dump_lksb(&lockres->l_lksb);
+		BUG();
+	}
+	mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
+	     lockres->l_name);
+
+	ocfs2_wait_on_busy_lock(lockres);
+out:
+	return 0;
+}
+
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *lockres);
+
+/* Mark the lockres as being dropped. It will no longer be
+ * queued if blocking, but we still may have to wait on it
+ * being dequeued from the downconvert thread before we can consider
+ * it safe to drop.
+ *
+ * You can *not* attempt to call cluster_lock on this lockres anymore. */
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int status;
+	struct ocfs2_mask_waiter mw;
+	unsigned long flags, flags2;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+		/*
+		 * We know the downconvert is queued but not in progress
+		 * because we are the downconvert thread and processing
+		 * different lock. So we can just remove the lock from the
+		 * queue. This is not only an optimization but also a way
+		 * to avoid the following deadlock:
+		 *   ocfs2_dentry_post_unlock()
+		 *     ocfs2_dentry_lock_put()
+		 *       ocfs2_drop_dentry_lock()
+		 *         iput()
+		 *           ocfs2_evict_inode()
+		 *             ocfs2_clear_inode()
+		 *               ocfs2_mark_lockres_freeing()
+		 *                 ... blocks waiting for OCFS2_LOCK_QUEUED
+		 *                 since we are the downconvert thread which
+		 *                 should clear the flag.
+		 */
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		spin_lock_irqsave(&osb->dc_task_lock, flags2);
+		list_del_init(&lockres->l_blocked_list);
+		osb->blocked_lock_count--;
+		spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+		/*
+		 * Warn if we recurse into another post_unlock call.  Strictly
+		 * speaking it isn't a problem but we need to be careful if
+		 * that happens (stack overflow, deadlocks, ...) so warn if
+		 * ocfs2 grows a path for which this can happen.
+		 */
+		WARN_ON_ONCE(lockres->l_ops->post_unlock);
+		/* Since the lock is freeing we don't do much in the fn below */
+		ocfs2_process_blocked_lock(osb, lockres);
+		return;
+	}
+	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		mlog(0, "Waiting on lockres %s\n", lockres->l_name);
+
+		status = ocfs2_wait_for_mask(&mw);
+		if (status)
+			mlog_errno(status);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
+			       struct ocfs2_lock_res *lockres)
+{
+	int ret;
+
+	ocfs2_mark_lockres_freeing(osb, lockres);
+	ret = ocfs2_drop_lock(osb, lockres);
+	if (ret)
+		mlog_errno(ret);
+}
+
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
+{
+	ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
+	ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+	ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
+	ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
+}
+
+int ocfs2_drop_inode_locks(struct inode *inode)
+{
+	int status, err;
+
+	/* No need to call ocfs2_mark_lockres_freeing here -
+	 * ocfs2_clear_inode has done it for us. */
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_open_lockres);
+	if (err < 0)
+		mlog_errno(err);
+
+	status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_inode_lockres);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_rw_lockres);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	return status;
+}
+
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+					      int new_level)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
+
+	if (lockres->l_level <= new_level) {
+		mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
+		     "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
+		     "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
+		     new_level, list_empty(&lockres->l_blocked_list),
+		     list_empty(&lockres->l_mask_waiters), lockres->l_type,
+		     lockres->l_flags, lockres->l_ro_holders,
+		     lockres->l_ex_holders, lockres->l_action,
+		     lockres->l_unlock_action, lockres->l_requested,
+		     lockres->l_blocking, lockres->l_pending_gen);
+		BUG();
+	}
+
+	mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
+	     lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
+
+	lockres->l_action = OCFS2_AST_DOWNCONVERT;
+	lockres->l_requested = new_level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	return lockres_set_pending(lockres);
+}
+
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb,
+				  unsigned int generation)
+{
+	int ret;
+	u32 dlm_flags = DLM_LKF_CONVERT;
+
+	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
+	     lockres->l_level, new_level);
+
+	if (lvb)
+		dlm_flags |= DLM_LKF_VALBLK;
+
+	ret = ocfs2_dlm_lock(osb->cconn,
+			     new_level,
+			     &lockres->l_lksb,
+			     dlm_flags,
+			     lockres->l_name,
+			     OCFS2_LOCK_ID_MAX_LEN - 1);
+	lockres_clear_pending(lockres, generation, osb);
+	if (ret) {
+		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	return ret;
+}
+
+/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		/* If we're already trying to cancel a lock conversion
+		 * then just drop the spinlock and allow the caller to
+		 * requeue this lock. */
+		mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
+		return 0;
+	}
+
+	/* were we in a convert when we got the bast fire? */
+	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
+	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
+	/* set things up for the unlockast to know to just
+	 * clear out the ast_action and unset busy, etc. */
+	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
+			"lock %s, invalid flags: 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
+
+	return 1;
+}
+
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int ret;
+
+	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
+			       DLM_LKF_CANCEL);
+	if (ret) {
+		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
+		ocfs2_recover_from_dlm_error(lockres, 0);
+	}
+
+	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
+
+	return ret;
+}
+
+static int ocfs2_unblock_lock(struct ocfs2_super *osb,
+			      struct ocfs2_lock_res *lockres,
+			      struct ocfs2_unblock_ctl *ctl)
+{
+	unsigned long flags;
+	int blocking;
+	int new_level;
+	int level;
+	int ret = 0;
+	int set_lvb = 0;
+	unsigned int gen;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+recheck:
+	/*
+	 * Is it still blocking? If not, we have no more work to do.
+	 */
+	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
+		BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = 0;
+		goto leave;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		/* XXX
+		 * This is a *big* race.  The OCFS2_LOCK_PENDING flag
+		 * exists entirely for one reason - another thread has set
+		 * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
+		 *
+		 * If we do ocfs2_cancel_convert() before the other thread
+		 * calls dlm_lock(), our cancel will do nothing.  We will
+		 * get no ast, and we will have no way of knowing the
+		 * cancel failed.  Meanwhile, the other thread will call
+		 * into dlm_lock() and wait...forever.
+		 *
+		 * Why forever?  Because another node has asked for the
+		 * lock first; that's why we're here in unblock_lock().
+		 *
+		 * The solution is OCFS2_LOCK_PENDING.  When PENDING is
+		 * set, we just requeue the unblock.  Only when the other
+		 * thread has called dlm_lock() and cleared PENDING will
+		 * we then cancel their request.
+		 *
+		 * All callers of dlm_lock() must set OCFS2_DLM_PENDING
+		 * at the same time they set OCFS2_DLM_BUSY.  They must
+		 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
+		 */
+		if (lockres->l_flags & OCFS2_LOCK_PENDING) {
+			mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
+			     lockres->l_name);
+			goto leave_requeue;
+		}
+
+		ctl->requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	/*
+	 * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
+	 * set when the ast is received for an upconvert just before the
+	 * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
+	 * on the heels of the ast, we want to delay the downconvert just
+	 * enough to allow the up requestor to do its task. Because this
+	 * lock is in the blocked queue, the lock will be downconverted
+	 * as soon as the requestor is done with the lock.
+	 */
+	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
+		goto leave_requeue;
+
+	/*
+	 * How can we block and yet be at NL?  We were trying to upconvert
+	 * from NL and got canceled.  The code comes back here, and now
+	 * we notice and clear BLOCKING.
+	 */
+	if (lockres->l_level == DLM_LOCK_NL) {
+		BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
+		mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
+		lockres->l_blocking = DLM_LOCK_NL;
+		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto leave;
+	}
+
+	/* if we're blocking an exclusive and we have *any* holders,
+	 * then requeue. */
+	if ((lockres->l_blocking == DLM_LOCK_EX)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
+		     lockres->l_name, lockres->l_ex_holders,
+		     lockres->l_ro_holders);
+		goto leave_requeue;
+	}
+
+	/* If it's a PR we're blocking, then only
+	 * requeue if we've got any EX holders */
+	if (lockres->l_blocking == DLM_LOCK_PR &&
+	    lockres->l_ex_holders) {
+		mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
+		     lockres->l_name, lockres->l_ex_holders);
+		goto leave_requeue;
+	}
+
+	/*
+	 * Can we get a lock in this state if the holder counts are
+	 * zero? The meta data unblock code used to check this.
+	 */
+	if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
+	    && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
+		     lockres->l_name);
+		goto leave_requeue;
+	}
+
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	if (lockres->l_ops->check_downconvert
+	    && !lockres->l_ops->check_downconvert(lockres, new_level)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
+		     lockres->l_name);
+		goto leave_requeue;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!lockres->l_ops->downconvert_worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	level = lockres->l_level;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
+
+	if (ctl->unblock_action == UNBLOCK_STOP_POST) {
+		mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
+		     lockres->l_name);
+		goto leave;
+	}
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
+		/* If this changed underneath us, then we can't drop
+		 * it just yet. */
+		mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
+		     "Recheck\n", lockres->l_name, blocking,
+		     lockres->l_blocking, level, lockres->l_level);
+		goto recheck;
+	}
+
+downconvert:
+	ctl->requeue = 0;
+
+	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
+		if (lockres->l_level == DLM_LOCK_EX)
+			set_lvb = 1;
+
+		/*
+		 * We only set the lvb if the lock has been fully
+		 * refreshed - otherwise we risk setting stale
+		 * data. Otherwise, there's no need to actually clear
+		 * out the lvb here as it's value is still valid.
+		 */
+		if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
+			lockres->l_ops->set_lvb(lockres);
+	}
+
+	gen = ocfs2_prepare_downconvert(lockres, new_level);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
+				     gen);
+
+leave:
+	if (ret)
+		mlog_errno(ret);
+	return ret;
+
+leave_requeue:
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ctl->requeue = 1;
+
+	return 0;
+}
+
+static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				     int blocking)
+{
+	struct inode *inode;
+	struct address_space *mapping;
+	struct ocfs2_inode_info *oi;
+
+       	inode = ocfs2_lock_res_inode(lockres);
+	mapping = inode->i_mapping;
+
+	if (S_ISDIR(inode->i_mode)) {
+		oi = OCFS2_I(inode);
+		oi->ip_dir_lock_gen++;
+		mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+		goto out;
+	}
+
+	if (!S_ISREG(inode->i_mode))
+		goto out;
+
+	/*
+	 * We need this before the filemap_fdatawrite() so that it can
+	 * transfer the dirty bit from the PTE to the
+	 * page. Unfortunately this means that even for EX->PR
+	 * downconverts, we'll lose our mappings and have to build
+	 * them up again.
+	 */
+	unmap_mapping_range(mapping, 0, 0, 0);
+
+	if (filemap_fdatawrite(mapping)) {
+		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	}
+	sync_mapping_buffers(mapping);
+	if (blocking == DLM_LOCK_EX) {
+		truncate_inode_pages(mapping, 0);
+	} else {
+		/* We only need to wait on the I/O if we're not also
+		 * truncating pages because truncate_inode_pages waits
+		 * for us above. We don't truncate pages if we're
+		 * blocking anything < EXMODE because we want to keep
+		 * them around in that case. */
+		filemap_fdatawait(mapping);
+	}
+
+out:
+	return UNBLOCK_CONTINUE;
+}
+
+static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
+				 struct ocfs2_lock_res *lockres,
+				 int new_level)
+{
+	int checkpointed = ocfs2_ci_fully_checkpointed(ci);
+
+	BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
+	BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
+
+	if (checkpointed)
+		return 1;
+
+	ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
+	return 0;
+}
+
+static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
+					int new_level)
+{
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
+}
+
+static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
+{
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	__ocfs2_stuff_meta_lvb(inode);
+}
+
+/*
+ * Does the final reference drop on our dentry lock. Right now this
+ * happens in the downconvert thread, but we could choose to simplify the
+ * dlmglue API and push these off to the ocfs2_wq in the future.
+ */
+static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
+				     struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
+	ocfs2_dentry_lock_put(osb, dl);
+}
+
+/*
+ * d_delete() matching dentries before the lock downconvert.
+ *
+ * At this point, any process waiting to destroy the
+ * dentry_lock due to last ref count is stopped by the
+ * OCFS2_LOCK_QUEUED flag.
+ *
+ * We have two potential problems
+ *
+ * 1) If we do the last reference drop on our dentry_lock (via dput)
+ *    we'll wind up in ocfs2_release_dentry_lock(), waiting on
+ *    the downconvert to finish. Instead we take an elevated
+ *    reference and push the drop until after we've completed our
+ *    unblock processing.
+ *
+ * 2) There might be another process with a final reference,
+ *    waiting on us to finish processing. If this is the case, we
+ *    detect it and exit out - there's no more dentries anyway.
+ */
+static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
+				       int blocking)
+{
+	struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
+	struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode);
+	struct dentry *dentry;
+	unsigned long flags;
+	int extra_ref = 0;
+
+	/*
+	 * This node is blocking another node from getting a read
+	 * lock. This happens when we've renamed within a
+	 * directory. We've forced the other nodes to d_delete(), but
+	 * we never actually dropped our lock because it's still
+	 * valid. The downconvert code will retain a PR for this node,
+	 * so there's no further work to do.
+	 */
+	if (blocking == DLM_LOCK_PR)
+		return UNBLOCK_CONTINUE;
+
+	/*
+	 * Mark this inode as potentially orphaned. The code in
+	 * ocfs2_delete_inode() will figure out whether it actually
+	 * needs to be freed or not.
+	 */
+	spin_lock(&oi->ip_lock);
+	oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+	spin_unlock(&oi->ip_lock);
+
+	/*
+	 * Yuck. We need to make sure however that the check of
+	 * OCFS2_LOCK_FREEING and the extra reference are atomic with
+	 * respect to a reference decrement or the setting of that
+	 * flag.
+	 */
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	spin_lock(&dentry_attach_lock);
+	if (!(lockres->l_flags & OCFS2_LOCK_FREEING)
+	    && dl->dl_count) {
+		dl->dl_count++;
+		extra_ref = 1;
+	}
+	spin_unlock(&dentry_attach_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "extra_ref = %d\n", extra_ref);
+
+	/*
+	 * We have a process waiting on us in ocfs2_dentry_iput(),
+	 * which means we can't have any more outstanding
+	 * aliases. There's no need to do any more work.
+	 */
+	if (!extra_ref)
+		return UNBLOCK_CONTINUE;
+
+	spin_lock(&dentry_attach_lock);
+	while (1) {
+		dentry = ocfs2_find_local_alias(dl->dl_inode,
+						dl->dl_parent_blkno, 1);
+		if (!dentry)
+			break;
+		spin_unlock(&dentry_attach_lock);
+
+		if (S_ISDIR(dl->dl_inode->i_mode))
+			shrink_dcache_parent(dentry);
+
+		mlog(0, "d_delete(%pd);\n", dentry);
+
+		/*
+		 * The following dcache calls may do an
+		 * iput(). Normally we don't want that from the
+		 * downconverting thread, but in this case it's ok
+		 * because the requesting node already has an
+		 * exclusive lock on the inode, so it can't be queued
+		 * for a downconvert.
+		 */
+		d_delete(dentry);
+		dput(dentry);
+
+		spin_lock(&dentry_attach_lock);
+	}
+	spin_unlock(&dentry_attach_lock);
+
+	/*
+	 * If we are the last holder of this dentry lock, there is no
+	 * reason to downconvert so skip straight to the unlock.
+	 */
+	if (dl->dl_count == 1)
+		return UNBLOCK_STOP_POST;
+
+	return UNBLOCK_CONTINUE_POST;
+}
+
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+					    int new_level)
+{
+	struct ocfs2_refcount_tree *tree =
+				ocfs2_lock_res_refcount_tree(lockres);
+
+	return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
+}
+
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+					 int blocking)
+{
+	struct ocfs2_refcount_tree *tree =
+				ocfs2_lock_res_refcount_tree(lockres);
+
+	ocfs2_metadata_cache_purge(&tree->rf_ci);
+
+	return UNBLOCK_CONTINUE;
+}
+
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_qinfo_lvb *lvb;
+	struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+					    oinfo->dqi_gi.dqi_type);
+
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
+	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
+	lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+	lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+	lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+	lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+	lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
+}
+
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{
+	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+					    oinfo->dqi_gi.dqi_type);
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_global_disk_dqinfo *gdinfo;
+	int status = 0;
+
+	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+	    lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+		info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+		info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+		oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+		oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+		oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+		oinfo->dqi_gi.dqi_free_entry =
+					be32_to_cpu(lvb->lvb_free_entry);
+	} else {
+		status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
+						     oinfo->dqi_giblk, &bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+		gdinfo = (struct ocfs2_global_disk_dqinfo *)
+					(bh->b_data + OCFS2_GLOBAL_INFO_OFF);
+		info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+		info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+		oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+		oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+		oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+		oinfo->dqi_gi.dqi_free_entry =
+					le32_to_cpu(gdinfo->dqi_free_entry);
+		brelse(bh);
+		ocfs2_track_lock_refresh(lockres);
+	}
+
+bail:
+	return status;
+}
+
+/* Lock quota info, this function expects at least shared lock on the quota file
+ * so that we can safely refresh quota info from disk. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	int status = 0;
+
+	/* On RO devices, locking really isn't needed... */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+	/* OK, we have the lock but we need to refresh the quota info */
+	status = ocfs2_refresh_qinfo(oinfo);
+	if (status)
+		ocfs2_qinfo_unlock(oinfo, ex);
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	return status;
+}
+
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+	int status;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+	struct ocfs2_super *osb = lockres->l_priv;
+
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+	struct ocfs2_super *osb = lockres->l_priv;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *lockres)
+{
+	int status;
+	struct ocfs2_unblock_ctl ctl = {0, 0,};
+	unsigned long flags;
+
+	/* Our reference to the lockres in this function can be
+	 * considered valid until we remove the OCFS2_LOCK_QUEUED
+	 * flag. */
+
+	BUG_ON(!lockres);
+	BUG_ON(!lockres->l_ops);
+
+	mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
+
+	/* Detect whether a lock has been marked as going away while
+	 * the downconvert thread was processing other things. A lock can
+	 * still be marked with OCFS2_LOCK_FREEING after this check,
+	 * but short circuiting here will still save us some
+	 * performance. */
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (lockres->l_flags & OCFS2_LOCK_FREEING)
+		goto unqueue;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = ocfs2_unblock_lock(osb, lockres, &ctl);
+	if (status < 0)
+		mlog_errno(status);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+unqueue:
+	if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) {
+		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
+	} else
+		ocfs2_schedule_blocked_lock(osb, lockres);
+
+	mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
+	     ctl.requeue ? "yes" : "no");
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	if (ctl.unblock_action != UNBLOCK_CONTINUE
+	    && lockres->l_ops->post_unlock)
+		lockres->l_ops->post_unlock(osb, lockres);
+}
+
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres)
+{
+	unsigned long flags;
+
+	assert_spin_locked(&lockres->l_lock);
+
+	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
+		/* Do not schedule a lock for downconvert when it's on
+		 * the way to destruction - any nodes wanting access
+		 * to the resource will get it soon. */
+		mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
+		     lockres->l_name, lockres->l_flags);
+		return;
+	}
+
+	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
+
+	spin_lock_irqsave(&osb->dc_task_lock, flags);
+	if (list_empty(&lockres->l_blocked_list)) {
+		list_add_tail(&lockres->l_blocked_list,
+			      &osb->blocked_lock_list);
+		osb->blocked_lock_count++;
+	}
+	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+}
+
+static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
+{
+	unsigned long processed;
+	unsigned long flags;
+	struct ocfs2_lock_res *lockres;
+
+	spin_lock_irqsave(&osb->dc_task_lock, flags);
+	/* grab this early so we know to try again if a state change and
+	 * wake happens part-way through our work  */
+	osb->dc_work_sequence = osb->dc_wake_sequence;
+
+	processed = osb->blocked_lock_count;
+	while (processed) {
+		BUG_ON(list_empty(&osb->blocked_lock_list));
+
+		lockres = list_entry(osb->blocked_lock_list.next,
+				     struct ocfs2_lock_res, l_blocked_list);
+		list_del_init(&lockres->l_blocked_list);
+		osb->blocked_lock_count--;
+		spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+
+		BUG_ON(!processed);
+		processed--;
+
+		ocfs2_process_blocked_lock(osb, lockres);
+
+		spin_lock_irqsave(&osb->dc_task_lock, flags);
+	}
+	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+}
+
+static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
+{
+	int empty = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&osb->dc_task_lock, flags);
+	if (list_empty(&osb->blocked_lock_list))
+		empty = 1;
+
+	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+	return empty;
+}
+
+static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
+{
+	int should_wake = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&osb->dc_task_lock, flags);
+	if (osb->dc_work_sequence != osb->dc_wake_sequence)
+		should_wake = 1;
+	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+
+	return should_wake;
+}
+
+static int ocfs2_downconvert_thread(void *arg)
+{
+	int status = 0;
+	struct ocfs2_super *osb = arg;
+
+	/* only quit once we've been asked to stop and there is no more
+	 * work available */
+	while (!(kthread_should_stop() &&
+		ocfs2_downconvert_thread_lists_empty(osb))) {
+
+		wait_event_interruptible(osb->dc_event,
+					 ocfs2_downconvert_thread_should_wake(osb) ||
+					 kthread_should_stop());
+
+		mlog(0, "downconvert_thread: awoken\n");
+
+		ocfs2_downconvert_thread_do_work(osb);
+	}
+
+	osb->dc_task = NULL;
+	return status;
+}
+
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&osb->dc_task_lock, flags);
+	/* make sure the voting thread gets a swipe at whatever changes
+	 * the caller may have made to the voting state */
+	osb->dc_wake_sequence++;
+	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+	wake_up(&osb->dc_event);
+}
diff --git a/kernel/fs/ocfs2/dlmglue.h b/kernel/fs/ocfs2/dlmglue.h
new file mode 100644
index 000000000..d293a22c3
--- /dev/null
+++ b/kernel/fs/ocfs2/dlmglue.h
@@ -0,0 +1,173 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * description here
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef DLMGLUE_H
+#define DLMGLUE_H
+
+#include "dcache.h"
+
+#define OCFS2_LVB_VERSION 5
+
+struct ocfs2_meta_lvb {
+	__u8         lvb_version;
+	__u8         lvb_reserved0;
+	__be16       lvb_idynfeatures;
+	__be32       lvb_iclusters;
+	__be32       lvb_iuid;
+	__be32       lvb_igid;
+	__be64       lvb_iatime_packed;
+	__be64       lvb_ictime_packed;
+	__be64       lvb_imtime_packed;
+	__be64       lvb_isize;
+	__be16       lvb_imode;
+	__be16       lvb_inlink;
+	__be32       lvb_iattr;
+	__be32       lvb_igeneration;
+	__be32       lvb_reserved2;
+};
+
+#define OCFS2_QINFO_LVB_VERSION 1
+
+struct ocfs2_qinfo_lvb {
+	__u8	lvb_version;
+	__u8	lvb_reserved[3];
+	__be32	lvb_bgrace;
+	__be32	lvb_igrace;
+	__be32	lvb_syncms;
+	__be32	lvb_blocks;
+	__be32	lvb_free_blk;
+	__be32	lvb_free_entry;
+};
+
+#define OCFS2_ORPHAN_LVB_VERSION 1
+
+struct ocfs2_orphan_scan_lvb {
+	__u8	lvb_version;
+	__u8	lvb_reserved[3];
+	__be32	lvb_os_seqno;
+};
+
+/* ocfs2_inode_lock_full() 'arg_flags' flags */
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* don't block waiting for the downconvert thread, instead return -EAGAIN */
+#define OCFS2_LOCK_NONBLOCK		(0x04)
+
+/* Locking subclasses of inode cluster lock */
+enum {
+	OI_LS_NORMAL = 0,
+	OI_LS_PARENT,
+	OI_LS_RENAME1,
+	OI_LS_RENAME2,
+	OI_LS_REFLINK_TARGET,
+};
+
+int ocfs2_dlm_init(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       unsigned int generation,
+			       struct inode *inode);
+void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
+				u64 parent, struct inode *inode);
+struct ocfs2_file_private;
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+			      struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+                               struct ocfs2_mem_dqinfo *info);
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+				  struct ocfs2_super *osb, u64 ref_blkno,
+				  unsigned int generation);
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_rw_lock(struct inode *inode, int write);
+void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
+int ocfs2_inode_lock_atime(struct inode *inode,
+			  struct vfsmount *vfsmnt,
+			  int *level);
+int ocfs2_inode_lock_full_nested(struct inode *inode,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags,
+			 int subclass);
+int ocfs2_inode_lock_with_page(struct inode *inode,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page);
+/* Variants without special locking class or flags */
+#define ocfs2_inode_lock_full(i, r, e, f)\
+		ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
+#define ocfs2_inode_lock_nested(i, b, e, s)\
+		ocfs2_inode_lock_full_nested(i, b, e, 0, s)
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
+void ocfs2_inode_unlock(struct inode *inode,
+		       int ex);
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex);
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);
+
+int ocfs2_rename_lock(struct ocfs2_super *osb);
+void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
+int ocfs2_dentry_lock(struct dentry *dentry, int ex);
+void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
+int ocfs2_file_lock(struct file *file, int ex, int trylock);
+void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+struct ocfs2_refcount_tree;
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
+
+
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
+			       struct ocfs2_lock_res *lockres);
+
+/* for the downconvert thread */
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
+
+/* To set the locking protocol on module initialization */
+void ocfs2_set_locking_protocol(void);
+#endif	/* DLMGLUE_H */
diff --git a/kernel/fs/ocfs2/export.c b/kernel/fs/ocfs2/export.c
new file mode 100644
index 000000000..827fc9809
--- /dev/null
+++ b/kernel/fs/ocfs2/export.c
@@ -0,0 +1,271 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.c
+ *
+ * Functions to facilitate NFS exporting
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "dcache.h"
+#include "export.h"
+#include "inode.h"
+
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "ocfs2_trace.h"
+
+struct ocfs2_inode_handle
+{
+	u64 ih_blkno;
+	u32 ih_generation;
+};
+
+static struct dentry *ocfs2_get_dentry(struct super_block *sb,
+		struct ocfs2_inode_handle *handle)
+{
+	struct inode *inode;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 blkno = handle->ih_blkno;
+	int status, set;
+	struct dentry *result;
+
+	trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
+
+	if (blkno == 0) {
+		result = ERR_PTR(-ESTALE);
+		goto bail;
+	}
+
+	inode = ocfs2_ilookup(sb, blkno);
+	/*
+	 * If the inode exists in memory, we only need to check it's
+	 * generation number
+	 */
+	if (inode)
+		goto check_gen;
+
+	/*
+	 * This will synchronize us against ocfs2_delete_inode() on
+	 * all nodes
+	 */
+	status = ocfs2_nfs_sync_lock(osb, 1);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+		goto check_err;
+	}
+
+	status = ocfs2_test_inode_bit(osb, blkno, &set);
+	if (status < 0) {
+		if (status == -EINVAL) {
+			/*
+			 * The blkno NFS gave us doesn't even show up
+			 * as an inode, we return -ESTALE to be
+			 * nice
+			 */
+			status = -ESTALE;
+		} else
+			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		goto unlock_nfs_sync;
+	}
+
+	trace_ocfs2_get_dentry_test_bit(status, set);
+	/* If the inode allocator bit is clear, this inode must be stale */
+	if (!set) {
+		status = -ESTALE;
+		goto unlock_nfs_sync;
+	}
+
+	inode = ocfs2_iget(osb, blkno, 0, 0);
+
+unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(osb, 1);
+
+check_err:
+	if (status < 0) {
+		if (status == -ESTALE) {
+			trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
+						     handle->ih_generation);
+		}
+		result = ERR_PTR(status);
+		goto bail;
+	}
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		result = (void *)inode;
+		goto bail;
+	}
+
+check_gen:
+	if (handle->ih_generation != inode->i_generation) {
+		iput(inode);
+		trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
+						  handle->ih_generation,
+						  inode->i_generation);
+		result = ERR_PTR(-ESTALE);
+		goto bail;
+	}
+
+	result = d_obtain_alias(inode);
+	if (IS_ERR(result))
+		mlog_errno(PTR_ERR(result));
+
+bail:
+	trace_ocfs2_get_dentry_end(result);
+	return result;
+}
+
+static struct dentry *ocfs2_get_parent(struct dentry *child)
+{
+	int status;
+	u64 blkno;
+	struct dentry *parent;
+	struct inode *dir = d_inode(child);
+
+	trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
+			       (unsigned long long)OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_inode_lock(dir, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+
+	parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
+
+bail_unlock:
+	ocfs2_inode_unlock(dir, 0);
+
+bail:
+	trace_ocfs2_get_parent_end(parent);
+
+	return parent;
+}
+
+static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
+			   struct inode *parent)
+{
+	int len = *max_len;
+	int type = 1;
+	u64 blkno;
+	u32 generation;
+	__le32 *fh = (__force __le32 *) fh_in;
+
+#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
+#error "You go ahead and fix that mess, then.  Somehow"
+	trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
+				    dentry->d_name.name,
+				    fh, len, connectable);
+#endif
+
+	if (parent && (len < 6)) {
+		*max_len = 6;
+		type = FILEID_INVALID;
+		goto bail;
+	} else if (len < 3) {
+		*max_len = 3;
+		type = FILEID_INVALID;
+		goto bail;
+	}
+
+	blkno = OCFS2_I(inode)->ip_blkno;
+	generation = inode->i_generation;
+
+	trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
+
+	len = 3;
+	fh[0] = cpu_to_le32((u32)(blkno >> 32));
+	fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
+	fh[2] = cpu_to_le32(generation);
+
+	if (parent) {
+		blkno = OCFS2_I(parent)->ip_blkno;
+		generation = parent->i_generation;
+
+		fh[3] = cpu_to_le32((u32)(blkno >> 32));
+		fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
+		fh[5] = cpu_to_le32(generation);
+
+		len = 6;
+		type = 2;
+
+		trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
+					     generation);
+	}
+
+	*max_len = len;
+
+bail:
+	trace_ocfs2_encode_fh_type(type);
+	return type;
+}
+
+static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct ocfs2_inode_handle handle;
+
+	if (fh_len < 3 || fh_type > 2)
+		return NULL;
+
+	handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32;
+	handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]);
+	handle.ih_generation = le32_to_cpu(fid->raw[2]);
+	return ocfs2_get_dentry(sb, &handle);
+}
+
+static struct dentry *ocfs2_fh_to_parent(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct ocfs2_inode_handle parent;
+
+	if (fh_type != 2 || fh_len < 6)
+		return NULL;
+
+	parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32;
+	parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]);
+	parent.ih_generation = le32_to_cpu(fid->raw[5]);
+	return ocfs2_get_dentry(sb, &parent);
+}
+
+const struct export_operations ocfs2_export_ops = {
+	.encode_fh	= ocfs2_encode_fh,
+	.fh_to_dentry	= ocfs2_fh_to_dentry,
+	.fh_to_parent	= ocfs2_fh_to_parent,
+	.get_parent	= ocfs2_get_parent,
+};
diff --git a/kernel/fs/ocfs2/export.h b/kernel/fs/ocfs2/export.h
new file mode 100644
index 000000000..41a738678
--- /dev/null
+++ b/kernel/fs/ocfs2/export.h
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_EXPORT_H
+#define OCFS2_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations ocfs2_export_ops;
+
+#endif /* OCFS2_EXPORT_H */
diff --git a/kernel/fs/ocfs2/extent_map.c b/kernel/fs/ocfs2/extent_map.c
new file mode 100644
index 000000000..767370b65
--- /dev/null
+++ b/kernel/fs/ocfs2/extent_map.c
@@ -0,0 +1,994 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.c
+ *
+ * Block/Cluster mapping functions
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/fiemap.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "super.h"
+#include "symlink.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+/*
+ * The extent caching implementation is intentionally trivial.
+ *
+ * We only cache a small number of extents stored directly on the
+ * inode, so linear order operations are acceptable. If we ever want
+ * to increase the size of the extent map, then these algorithms must
+ * get smarter.
+ */
+
+void ocfs2_extent_map_init(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	oi->ip_extent_map.em_num_items = 0;
+	INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
+}
+
+static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+				      unsigned int cpos,
+				      struct ocfs2_extent_map_item **ret_emi)
+{
+	unsigned int range;
+	struct ocfs2_extent_map_item *emi;
+
+	*ret_emi = NULL;
+
+	list_for_each_entry(emi, &em->em_list, ei_list) {
+		range = emi->ei_cpos + emi->ei_clusters;
+
+		if (cpos >= emi->ei_cpos && cpos < range) {
+			list_move(&emi->ei_list, &em->em_list);
+
+			*ret_emi = emi;
+			break;
+		}
+	}
+}
+
+static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
+				   unsigned int *phys, unsigned int *len,
+				   unsigned int *flags)
+{
+	unsigned int coff;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map_item *emi;
+
+	spin_lock(&oi->ip_lock);
+
+	__ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
+	if (emi) {
+		coff = cpos - emi->ei_cpos;
+		*phys = emi->ei_phys + coff;
+		if (len)
+			*len = emi->ei_clusters - coff;
+		if (flags)
+			*flags = emi->ei_flags;
+	}
+
+	spin_unlock(&oi->ip_lock);
+
+	if (emi == NULL)
+		return -ENOENT;
+
+	return 0;
+}
+
+/*
+ * Forget about all clusters equal to or greater than cpos.
+ */
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
+{
+	struct ocfs2_extent_map_item *emi, *n;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map *em = &oi->ip_extent_map;
+	LIST_HEAD(tmp_list);
+	unsigned int range;
+
+	spin_lock(&oi->ip_lock);
+	list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
+		if (emi->ei_cpos >= cpos) {
+			/* Full truncate of this record. */
+			list_move(&emi->ei_list, &tmp_list);
+			BUG_ON(em->em_num_items == 0);
+			em->em_num_items--;
+			continue;
+		}
+
+		range = emi->ei_cpos + emi->ei_clusters;
+		if (range > cpos) {
+			/* Partial truncate */
+			emi->ei_clusters = cpos - emi->ei_cpos;
+		}
+	}
+	spin_unlock(&oi->ip_lock);
+
+	list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
+		list_del(&emi->ei_list);
+		kfree(emi);
+	}
+}
+
+/*
+ * Is any part of emi2 contained within emi1
+ */
+static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
+				 struct ocfs2_extent_map_item *emi2)
+{
+	unsigned int range1, range2;
+
+	/*
+	 * Check if logical start of emi2 is inside emi1
+	 */
+	range1 = emi1->ei_cpos + emi1->ei_clusters;
+	if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
+		return 1;
+
+	/*
+	 * Check if logical end of emi2 is inside emi1
+	 */
+	range2 = emi2->ei_cpos + emi2->ei_clusters;
+	if (range2 > emi1->ei_cpos && range2 <= range1)
+		return 1;
+
+	return 0;
+}
+
+static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
+				  struct ocfs2_extent_map_item *src)
+{
+	dest->ei_cpos = src->ei_cpos;
+	dest->ei_phys = src->ei_phys;
+	dest->ei_clusters = src->ei_clusters;
+	dest->ei_flags = src->ei_flags;
+}
+
+/*
+ * Try to merge emi with ins. Returns 1 if merge succeeds, zero
+ * otherwise.
+ */
+static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
+					 struct ocfs2_extent_map_item *ins)
+{
+	/*
+	 * Handle contiguousness
+	 */
+	if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
+	    ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
+	    ins->ei_flags == emi->ei_flags) {
+		emi->ei_clusters += ins->ei_clusters;
+		return 1;
+	} else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
+		   (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
+		   ins->ei_flags == emi->ei_flags) {
+		emi->ei_phys = ins->ei_phys;
+		emi->ei_cpos = ins->ei_cpos;
+		emi->ei_clusters += ins->ei_clusters;
+		return 1;
+	}
+
+	/*
+	 * Overlapping extents - this shouldn't happen unless we've
+	 * split an extent to change it's flags. That is exceedingly
+	 * rare, so there's no sense in trying to optimize it yet.
+	 */
+	if (ocfs2_ei_is_contained(emi, ins) ||
+	    ocfs2_ei_is_contained(ins, emi)) {
+		ocfs2_copy_emi_fields(emi, ins);
+		return 1;
+	}
+
+	/* No merge was possible. */
+	return 0;
+}
+
+/*
+ * In order to reduce complexity on the caller, this insert function
+ * is intentionally liberal in what it will accept.
+ *
+ * The only rule is that the truncate call *must* be used whenever
+ * records have been deleted. This avoids inserting overlapping
+ * records with different physical mappings.
+ */
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+				 struct ocfs2_extent_rec *rec)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map *em = &oi->ip_extent_map;
+	struct ocfs2_extent_map_item *emi, *new_emi = NULL;
+	struct ocfs2_extent_map_item ins;
+
+	ins.ei_cpos = le32_to_cpu(rec->e_cpos);
+	ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
+					       le64_to_cpu(rec->e_blkno));
+	ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
+	ins.ei_flags = rec->e_flags;
+
+search:
+	spin_lock(&oi->ip_lock);
+
+	list_for_each_entry(emi, &em->em_list, ei_list) {
+		if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
+			list_move(&emi->ei_list, &em->em_list);
+			spin_unlock(&oi->ip_lock);
+			goto out;
+		}
+	}
+
+	/*
+	 * No item could be merged.
+	 *
+	 * Either allocate and add a new item, or overwrite the last recently
+	 * inserted.
+	 */
+
+	if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
+		if (new_emi == NULL) {
+			spin_unlock(&oi->ip_lock);
+
+			new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
+			if (new_emi == NULL)
+				goto out;
+
+			goto search;
+		}
+
+		ocfs2_copy_emi_fields(new_emi, &ins);
+		list_add(&new_emi->ei_list, &em->em_list);
+		em->em_num_items++;
+		new_emi = NULL;
+	} else {
+		BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
+		emi = list_entry(em->em_list.prev,
+				 struct ocfs2_extent_map_item, ei_list);
+		list_move(&emi->ei_list, &em->em_list);
+		ocfs2_copy_emi_fields(emi, &ins);
+	}
+
+	spin_unlock(&oi->ip_lock);
+
+out:
+	kfree(new_emi);
+}
+
+static int ocfs2_last_eb_is_empty(struct inode *inode,
+				  struct ocfs2_dinode *di)
+{
+	int ret, next_free;
+	u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+	el = &eb->h_list;
+
+	if (el->l_tree_depth) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %lu has non zero tree depth in "
+			    "leaf block %llu\n", inode->i_ino,
+			    (unsigned long long)eb_bh->b_blocknr);
+		ret = -EROFS;
+		goto out;
+	}
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
+		ret = 1;
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Return the 1st index within el which contains an extent start
+ * larger than v_cluster.
+ */
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
+				       u32 v_cluster)
+{
+	int i;
+	struct ocfs2_extent_rec *rec;
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+
+		if (v_cluster < le32_to_cpu(rec->e_cpos))
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Figure out the size of a hole which starts at v_cluster within the given
+ * extent list.
+ *
+ * If there is no more allocation past v_cluster, we return the maximum
+ * cluster size minus v_cluster.
+ *
+ * If we have in-inode extents, then el points to the dinode list and
+ * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
+ * containing el.
+ */
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+			       struct ocfs2_extent_list *el,
+			       struct buffer_head *eb_bh,
+			       u32 v_cluster,
+			       u32 *num_clusters)
+{
+	int ret, i;
+	struct buffer_head *next_eb_bh = NULL;
+	struct ocfs2_extent_block *eb, *next_eb;
+
+	i = ocfs2_search_for_hole_index(el, v_cluster);
+
+	if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+
+		/*
+		 * Check the next leaf for any extents.
+		 */
+
+		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
+			goto no_more_extents;
+
+		ret = ocfs2_read_extent_block(ci,
+					      le64_to_cpu(eb->h_next_leaf_blk),
+					      &next_eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
+		el = &next_eb->h_list;
+		i = ocfs2_search_for_hole_index(el, v_cluster);
+	}
+
+no_more_extents:
+	if (i == le16_to_cpu(el->l_next_free_rec)) {
+		/*
+		 * We're at the end of our existing allocation. Just
+		 * return the maximum number of clusters we could
+		 * possibly allocate.
+		 */
+		*num_clusters = UINT_MAX - v_cluster;
+	} else {
+		*num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
+	}
+
+	ret = 0;
+out:
+	brelse(next_eb_bh);
+	return ret;
+}
+
+static int ocfs2_get_clusters_nocache(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 v_cluster, unsigned int *hole_len,
+				      struct ocfs2_extent_rec *ret_rec,
+				      unsigned int *is_last)
+{
+	int i, ret, tree_height, len;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_block *uninitialized_var(eb);
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+	struct buffer_head *eb_bh = NULL;
+
+	memset(ret_rec, 0, sizeof(*ret_rec));
+	if (is_last)
+		*is_last = 0;
+
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	el = &di->id2.i_list;
+	tree_height = le16_to_cpu(el->l_tree_depth);
+
+	if (tree_height > 0) {
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+				      &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
+		/*
+		 * Holes can be larger than the maximum size of an
+		 * extent, so we return their lengths in a separate
+		 * field.
+		 */
+		if (hole_len) {
+			ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
+							 el, eb_bh,
+							 v_cluster, &len);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			*hole_len = len;
+		}
+		goto out_hole;
+	}
+
+	rec = &el->l_recs[i];
+
+	BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+	if (!rec->e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0)", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*ret_rec = *rec;
+
+	/*
+	 * Checking for last extent is potentially expensive - we
+	 * might have to look at the next leaf over to see if it's
+	 * empty.
+	 *
+	 * The first two checks are to see whether the caller even
+	 * cares for this information, and if the extent is at least
+	 * the last in it's list.
+	 *
+	 * If those hold true, then the extent is last if any of the
+	 * additional conditions hold true:
+	 *  - Extent list is in-inode
+	 *  - Extent list is right-most
+	 *  - Extent list is 2nd to rightmost, with empty right-most
+	 */
+	if (is_last) {
+		if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
+			if (tree_height == 0)
+				*is_last = 1;
+			else if (eb->h_blkno == di->i_last_eb_blk)
+				*is_last = 1;
+			else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
+				ret = ocfs2_last_eb_is_empty(inode, di);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+				if (ret == 1)
+					*is_last = 1;
+			}
+		}
+	}
+
+out_hole:
+	ret = 0;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+static void ocfs2_relative_extent_offsets(struct super_block *sb,
+					  u32 v_cluster,
+					  struct ocfs2_extent_rec *rec,
+					  u32 *p_cluster, u32 *num_clusters)
+
+{
+	u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
+
+	*p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
+	*p_cluster = *p_cluster + coff;
+
+	if (num_clusters)
+		*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
+}
+
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el,
+			     unsigned int *extent_flags)
+{
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+	u32 coff;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+				      &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
+		ret = -EROFS;
+		mlog_errno(ret);
+		goto out;
+	} else {
+		rec = &el->l_recs[i];
+		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+		if (!rec->e_blkno) {
+			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+				    "record (%u, %u, 0) in xattr", inode->i_ino,
+				    le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
+		coff = v_cluster - le32_to_cpu(rec->e_cpos);
+		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+						    le64_to_cpu(rec->e_blkno));
+		*p_cluster = *p_cluster + coff;
+		if (num_clusters)
+			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+
+		if (extent_flags)
+			*extent_flags = rec->e_flags;
+	}
+out:
+	if (eb_bh)
+		brelse(eb_bh);
+	return ret;
+}
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+		       u32 *p_cluster, u32 *num_clusters,
+		       unsigned int *extent_flags)
+{
+	int ret;
+	unsigned int uninitialized_var(hole_len), flags = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = -ERANGE;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+				      num_clusters, extent_flags);
+	if (ret == 0)
+		goto out;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
+					 &rec, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (rec.e_blkno == 0ULL) {
+		/*
+		 * A hole was found. Return some canned values that
+		 * callers can key on. If asked for, num_clusters will
+		 * be populated with the size of the hole.
+		 */
+		*p_cluster = 0;
+		if (num_clusters) {
+			*num_clusters = hole_len;
+		}
+	} else {
+		ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
+					      p_cluster, num_clusters);
+		flags = rec.e_flags;
+
+		ocfs2_extent_map_insert_rec(inode, &rec);
+	}
+
+	if (extent_flags)
+		*extent_flags = flags;
+
+out:
+	brelse(di_bh);
+	return ret;
+}
+
+/*
+ * This expects alloc_sem to be held. The allocation cannot change at
+ * all while the map is in the process of being updated.
+ */
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+				u64 *ret_count, unsigned int *extent_flags)
+{
+	int ret;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 cpos, num_clusters, p_cluster;
+	u64 boff = 0;
+
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
+
+	ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
+				 extent_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * p_cluster == 0 indicates a hole.
+	 */
+	if (p_cluster) {
+		boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		boff += (v_blkno & (u64)(bpc - 1));
+	}
+
+	*p_blkno = boff;
+
+	if (ret_count) {
+		*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		*ret_count -= v_blkno & (u64)(bpc - 1);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * The ocfs2_fiemap_inline() may be a little bit misleading, since
+ * it not only handles the fiemap for inlined files, but also deals
+ * with the fast symlink, cause they have no difference for extent
+ * mapping per se.
+ */
+static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
+			       struct fiemap_extent_info *fieinfo,
+			       u64 map_start)
+{
+	int ret;
+	unsigned int id_count;
+	struct ocfs2_dinode *di;
+	u64 phys;
+	u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	if (ocfs2_inode_is_fast_symlink(inode))
+		id_count = ocfs2_fast_symlink_chars(inode->i_sb);
+	else
+		id_count = le16_to_cpu(di->id2.i_data.id_count);
+
+	if (map_start < id_count) {
+		phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
+		if (ocfs2_inode_is_fast_symlink(inode))
+			phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
+		else
+			phys += offsetof(struct ocfs2_dinode,
+					 id2.i_data.id_data);
+
+		ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
+					      flags);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+#define OCFS2_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len)
+{
+	int ret, is_last;
+	u32 mapping_end, cpos;
+	unsigned int hole_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u64 len_bytes, phys_bytes, virt_bytes;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * Handle inline-data and fast symlink separately.
+	 */
+	if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
+	    ocfs2_inode_is_fast_symlink(inode)) {
+		ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+		goto out_unlock;
+	}
+
+	cpos = map_start >> osb->s_clustersize_bits;
+	mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+					       map_start + map_len);
+	is_last = 0;
+	while (cpos < mapping_end && !is_last) {
+		u32 fe_flags;
+
+		ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+						 &hole_size, &rec, &is_last);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+
+		if (rec.e_blkno == 0ULL) {
+			cpos += hole_size;
+			continue;
+		}
+
+		fe_flags = 0;
+		if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
+			fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+		if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
+			fe_flags |= FIEMAP_EXTENT_SHARED;
+		if (is_last)
+			fe_flags |= FIEMAP_EXTENT_LAST;
+		len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
+		phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
+		virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
+
+		ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
+					      len_bytes, fe_flags);
+		if (ret)
+			break;
+
+		cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
+	}
+
+	if (ret > 0)
+		ret = 0;
+
+out_unlock:
+	brelse(di_bh);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_inode_unlock(inode, 0);
+out:
+
+	return ret;
+}
+
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+	unsigned int is_last = 0, is_data = 0;
+	u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	u32 cpos, cend, clen, hole_size;
+	u64 extoff, extlen;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	if (*offset >= i_size_read(inode)) {
+		ret = -ENXIO;
+		goto out_unlock;
+	}
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		if (whence == SEEK_HOLE)
+			*offset = i_size_read(inode);
+		goto out_unlock;
+	}
+
+	clen = 0;
+	cpos = *offset >> cs_bits;
+	cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
+	while (cpos < cend && !is_last) {
+		ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
+						 &rec, &is_last);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+
+		extoff = cpos;
+		extoff <<= cs_bits;
+
+		if (rec.e_blkno == 0ULL) {
+			clen = hole_size;
+			is_data = 0;
+		} else {
+			clen = le16_to_cpu(rec.e_leaf_clusters) -
+				(cpos - le32_to_cpu(rec.e_cpos));
+			is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ?  0 : 1;
+		}
+
+		if ((!is_data && whence == SEEK_HOLE) ||
+		    (is_data && whence == SEEK_DATA)) {
+			if (extoff > *offset)
+				*offset = extoff;
+			goto out_unlock;
+		}
+
+		if (!is_last)
+			cpos += clen;
+	}
+
+	if (whence == SEEK_HOLE) {
+		extoff = cpos;
+		extoff <<= cs_bits;
+		extlen = clen;
+		extlen <<=  cs_bits;
+
+		if ((extoff + extlen) > i_size_read(inode))
+			extlen = i_size_read(inode) - extoff;
+		extoff += extlen;
+		if (extoff > *offset)
+			*offset = extoff;
+		goto out_unlock;
+	}
+
+	ret = -ENXIO;
+
+out_unlock:
+
+	brelse(di_bh);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_inode_unlock(inode, 0);
+out:
+	return ret;
+}
+
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+			   struct buffer_head *bhs[], int flags,
+			   int (*validate)(struct super_block *sb,
+					   struct buffer_head *bh))
+{
+	int rc = 0;
+	u64 p_block, p_count;
+	int i, count, done = 0;
+
+	trace_ocfs2_read_virt_blocks(
+	     inode, (unsigned long long)v_block, nr, bhs, flags,
+	     validate);
+
+	if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+		goto out;
+	}
+
+	while (done < nr) {
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+		rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+						 &p_block, &p_count, NULL);
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+
+		if (!p_block) {
+			rc = -EIO;
+			mlog(ML_ERROR,
+			     "Inode #%llu contains a hole at offset %llu\n",
+			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			     (unsigned long long)(v_block + done) <<
+			     inode->i_sb->s_blocksize_bits);
+			break;
+		}
+
+		count = nr - done;
+		if (p_count < count)
+			count = p_count;
+
+		/*
+		 * If the caller passed us bhs, they should have come
+		 * from a previous readahead call to this function.  Thus,
+		 * they should have the right b_blocknr.
+		 */
+		for (i = 0; i < count; i++) {
+			if (!bhs[done + i])
+				continue;
+			BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+		}
+
+		rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
+				       bhs + done, flags, validate);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+		done += count;
+	}
+
+out:
+	return rc;
+}
+
+
diff --git a/kernel/fs/ocfs2/extent_map.h b/kernel/fs/ocfs2/extent_map.h
new file mode 100644
index 000000000..67ea57d2f
--- /dev/null
+++ b/kernel/fs/ocfs2/extent_map.h
@@ -0,0 +1,92 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.h
+ *
+ * In-memory file extent mappings for OCFS2.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _EXTENT_MAP_H
+#define _EXTENT_MAP_H
+
+struct ocfs2_extent_map_item {
+	unsigned int			ei_cpos;
+	unsigned int			ei_phys;
+	unsigned int			ei_clusters;
+	unsigned int			ei_flags;
+
+	struct list_head		ei_list;
+};
+
+#define OCFS2_MAX_EXTENT_MAP_ITEMS			3
+struct ocfs2_extent_map {
+	unsigned int			em_num_items;
+	struct list_head		em_list;
+};
+
+void ocfs2_extent_map_init(struct inode *inode);
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+				 struct ocfs2_extent_rec *rec);
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
+		       u32 *num_clusters, unsigned int *extent_flags);
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+				u64 *ret_count, unsigned int *extent_flags);
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len);
+
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
+
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el,
+			     unsigned int *extent_flags);
+
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+			   struct buffer_head *bhs[], int flags,
+			   int (*validate)(struct super_block *sb,
+					   struct buffer_head *bh));
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+			       struct ocfs2_extent_list *el,
+			       struct buffer_head *eb_bh,
+			       u32 v_cluster,
+			       u32 *num_clusters);
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+					struct buffer_head **bh,
+					int (*validate)(struct super_block *sb,
+							struct buffer_head *bh))
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
+
+bail:
+	return status;
+}
+
+
+#endif  /* _EXTENT_MAP_H */
diff --git a/kernel/fs/ocfs2/file.c b/kernel/fs/ocfs2/file.c
new file mode 100644
index 000000000..d8b670cbd
--- /dev/null
+++ b/kernel/fs/ocfs2/file.c
@@ -0,0 +1,2702 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c
+ *
+ * File open, close, extend, truncate
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/quotaops.h>
+#include <linux/blkdev.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "ioctl.h"
+#include "journal.h"
+#include "locks.h"
+#include "mmap.h"
+#include "suballoc.h"
+#include "super.h"
+#include "xattr.h"
+#include "acl.h"
+#include "quota.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_init_file_private(struct inode *inode, struct file *file)
+{
+	struct ocfs2_file_private *fp;
+
+	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
+	if (!fp)
+		return -ENOMEM;
+
+	fp->fp_file = file;
+	mutex_init(&fp->fp_mutex);
+	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
+	file->private_data = fp;
+
+	return 0;
+}
+
+static void ocfs2_free_file_private(struct inode *inode, struct file *file)
+{
+	struct ocfs2_file_private *fp = file->private_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (fp) {
+		ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
+		ocfs2_lock_res_free(&fp->fp_flock);
+		kfree(fp);
+		file->private_data = NULL;
+	}
+}
+
+static int ocfs2_file_open(struct inode *inode, struct file *file)
+{
+	int status;
+	int mode = file->f_flags;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
+			      (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			      file->f_path.dentry->d_name.len,
+			      file->f_path.dentry->d_name.name, mode);
+
+	if (file->f_mode & FMODE_WRITE)
+		dquot_initialize(inode);
+
+	spin_lock(&oi->ip_lock);
+
+	/* Check that the inode hasn't been wiped from disk by another
+	 * node. If it hasn't then we're safe as long as we hold the
+	 * spin lock until our increment of open count. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&oi->ip_lock);
+
+		status = -ENOENT;
+		goto leave;
+	}
+
+	if (mode & O_DIRECT)
+		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
+
+	oi->ip_open_count++;
+	spin_unlock(&oi->ip_lock);
+
+	status = ocfs2_init_file_private(inode, file);
+	if (status) {
+		/*
+		 * We want to set open count back if we're failing the
+		 * open.
+		 */
+		spin_lock(&oi->ip_lock);
+		oi->ip_open_count--;
+		spin_unlock(&oi->ip_lock);
+	}
+
+leave:
+	return status;
+}
+
+static int ocfs2_file_release(struct inode *inode, struct file *file)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	spin_lock(&oi->ip_lock);
+	if (!--oi->ip_open_count)
+		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+
+	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
+				 oi->ip_blkno,
+				 file->f_path.dentry->d_name.len,
+				 file->f_path.dentry->d_name.name,
+				 oi->ip_open_count);
+	spin_unlock(&oi->ip_lock);
+
+	ocfs2_free_file_private(inode, file);
+
+	return 0;
+}
+
+static int ocfs2_dir_open(struct inode *inode, struct file *file)
+{
+	return ocfs2_init_file_private(inode, file);
+}
+
+static int ocfs2_dir_release(struct inode *inode, struct file *file)
+{
+	ocfs2_free_file_private(inode, file);
+	return 0;
+}
+
+static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
+			   int datasync)
+{
+	int err = 0;
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	journal_t *journal = osb->journal->j_journal;
+	int ret;
+	tid_t commit_tid;
+	bool needs_barrier = false;
+
+	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
+			      OCFS2_I(inode)->ip_blkno,
+			      file->f_path.dentry->d_name.len,
+			      file->f_path.dentry->d_name.name,
+			      (unsigned long long)datasync);
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = true;
+	err = jbd2_complete_transaction(journal, commit_tid);
+	if (needs_barrier) {
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		if (!err)
+			err = ret;
+	}
+
+	if (err)
+		mlog_errno(err);
+
+	return (err < 0) ? -EIO : 0;
+}
+
+int ocfs2_should_update_atime(struct inode *inode,
+			      struct vfsmount *vfsmnt)
+{
+	struct timespec now;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return 0;
+
+	if ((inode->i_flags & S_NOATIME) ||
+	    ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
+		return 0;
+
+	/*
+	 * We can be called with no vfsmnt structure - NFSD will
+	 * sometimes do this.
+	 *
+	 * Note that our action here is different than touch_atime() -
+	 * if we can't tell whether this is a noatime mount, then we
+	 * don't know whether to trust the value of s_atime_quantum.
+	 */
+	if (vfsmnt == NULL)
+		return 0;
+
+	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
+	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
+		return 0;
+
+	if (vfsmnt->mnt_flags & MNT_RELATIME) {
+		if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
+		    (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
+			return 1;
+
+		return 0;
+	}
+
+	now = CURRENT_TIME;
+	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
+		return 0;
+	else
+		return 1;
+}
+
+int ocfs2_update_inode_atime(struct inode *inode,
+			     struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Don't use ocfs2_mark_inode_dirty() here as we don't always
+	 * have i_mutex to guard against concurrent changes to other
+	 * inode fields.
+	 */
+	inode->i_atime = CURRENT_TIME;
+	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+	ocfs2_journal_dirty(handle, bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+int ocfs2_set_inode_size(handle_t *handle,
+				struct inode *inode,
+				struct buffer_head *fe_bh,
+				u64 new_i_size)
+{
+	int status;
+
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	return status;
+}
+
+int ocfs2_simple_size_update(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_set_inode_size(handle, inode, di_bh,
+				   new_i_size);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+static int ocfs2_cow_file_pos(struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      u64 offset)
+{
+	int status;
+	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	/*
+	 * If the new offset is aligned to the range of the cluster, there is
+	 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
+	 * CoW either.
+	 */
+	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
+		return 0;
+
+	status = ocfs2_get_clusters(inode, cpos, &phys,
+				    &num_clusters, &ext_flags);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+
+out:
+	return status;
+}
+
+static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     u64 new_i_size)
+{
+	int status;
+	handle_t *handle;
+	struct ocfs2_dinode *di;
+	u64 cluster_bytes;
+
+	/*
+	 * We need to CoW the cluster contains the offset if it is reflinked
+	 * since we will call ocfs2_zero_range_for_truncate later which will
+	 * write "0" from offset to the end of the cluster.
+	 */
+	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
+	if (status) {
+		mlog_errno(status);
+		return status;
+	}
+
+	/* TODO: This needs to actually orphan the inode in this
+	 * transaction. */
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	/*
+	 * Do this before setting i_size.
+	 */
+	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
+					       cluster_bytes);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	i_size_write(inode, new_i_size);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	di = (struct ocfs2_dinode *) fe_bh->b_data;
+	di->i_size = cpu_to_le64(new_i_size);
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+	ocfs2_journal_dirty(handle, fe_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return status;
+}
+
+int ocfs2_truncate_file(struct inode *inode,
+			       struct buffer_head *di_bh,
+			       u64 new_i_size)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
+	 * already validated it */
+	fe = (struct ocfs2_dinode *) di_bh->b_data;
+
+	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				  (unsigned long long)le64_to_cpu(fe->i_size),
+				  (unsigned long long)new_i_size);
+
+	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
+			"Inode %llu, inode i_size = %lld != di "
+			"i_size = %llu, i_flags = 0x%x\n",
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			i_size_read(inode),
+			(unsigned long long)le64_to_cpu(fe->i_size),
+			le32_to_cpu(fe->i_flags));
+
+	if (new_i_size > le64_to_cpu(fe->i_size)) {
+		trace_ocfs2_truncate_file_error(
+			(unsigned long long)le64_to_cpu(fe->i_size),
+			(unsigned long long)new_i_size);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_resv_discard(&osb->osb_la_resmap,
+			   &OCFS2_I(inode)->ip_la_data_resv);
+
+	/*
+	 * The inode lock forced other nodes to sync and drop their
+	 * pages, which (correctly) happens even if we have a truncate
+	 * without allocation change - ocfs2 cluster sizes can be much
+	 * greater than page size, so we have to truncate them
+	 * anyway.
+	 */
+	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
+					       i_size_read(inode), 1);
+		if (status)
+			mlog_errno(status);
+
+		goto bail_unlock_sem;
+	}
+
+	/* alright, we're going to need to do a full blown alloc size
+	 * change. Orphan the inode so that recovery can complete the
+	 * truncate if necessary. This does the task of marking
+	 * i_size. */
+	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_sem;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_sem;
+	}
+
+	/* TODO: orphan dir cleanup here. */
+bail_unlock_sem:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+bail:
+	if (!status && OCFS2_I(inode)->ip_clusters == 0)
+		status = ocfs2_try_remove_refcount_tree(inode, di_bh);
+
+	return status;
+}
+
+/*
+ * extend file allocation only here.
+ * we'll update all the disk stuff, and oip->alloc_size
+ *
+ * expect stuff to be locked, a transaction started and enough data /
+ * metadata reservations in the contexts.
+ *
+ * Will return -EAGAIN, and a reason if a restart is needed.
+ * If passed in, *reason will always be set, even in error.
+ */
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+			 struct inode *inode,
+			 u32 *logical_offset,
+			 u32 clusters_to_add,
+			 int mark_unwritten,
+			 struct buffer_head *fe_bh,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *data_ac,
+			 struct ocfs2_alloc_context *meta_ac,
+			 enum ocfs2_alloc_restarted *reason_ret)
+{
+	int ret;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
+	ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+					  clusters_to_add, mark_unwritten,
+					  data_ac, meta_ac, reason_ret);
+
+	return ret;
+}
+
+static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				     u32 clusters_to_add, int mark_unwritten)
+{
+	int status = 0;
+	int restart_func = 0;
+	int credits;
+	u32 prev_clusters;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why = RESTART_NONE;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_tree et;
+	int did_quota = 0;
+
+	/*
+	 * Unwritten extent only exists for file systems which
+	 * support holes.
+	 */
+	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
+
+	status = ocfs2_read_inode_block(inode, &bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+restart_all:
+	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
+	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				       &data_ac, &meta_ac);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	trace_ocfs2_extend_allocation(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)i_size_read(inode),
+		le32_to_cpu(fe->i_clusters), clusters_to_add,
+		why, restart_func);
+
+	status = dquot_alloc_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+	if (status)
+		goto leave;
+	did_quota = 1;
+
+	/* reserve a write to the file entry early on - that we if we
+	 * run out of credits in the allocation path, we can still
+	 * update i_size. */
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	prev_clusters = OCFS2_I(inode)->ip_clusters;
+
+	status = ocfs2_add_inode_data(osb,
+				      inode,
+				      &logical_start,
+				      clusters_to_add,
+				      mark_unwritten,
+				      bh,
+				      handle,
+				      data_ac,
+				      meta_ac,
+				      &why);
+	if ((status < 0) && (status != -EAGAIN)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+	ocfs2_journal_dirty(handle, bh);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	/* Release unused quota reservation */
+	dquot_free_space(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+	did_quota = 0;
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			restart_func = 1;
+			status = 0;
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			status = ocfs2_allocate_extend_trans(handle, 1);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
+	     le32_to_cpu(fe->i_clusters),
+	     (unsigned long long)le64_to_cpu(fe->i_size),
+	     OCFS2_I(inode)->ip_clusters,
+	     (unsigned long long)i_size_read(inode));
+
+leave:
+	if (status < 0 && did_quota)
+		dquot_free_space(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+	if (handle) {
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
+	brelse(bh);
+	bh = NULL;
+
+	return status;
+}
+
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+		u32 clusters_to_add, int mark_unwritten)
+{
+	return __ocfs2_extend_allocation(inode, logical_start,
+			clusters_to_add, mark_unwritten);
+}
+
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+						struct buffer_head *di_bh)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+	int ret = 0;
+
+	if (!ocfs2_should_order_data(inode))
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_jbd2_file_inode(handle, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret)
+		mlog_errno(ret);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+
+out:
+	if (ret) {
+		if (!IS_ERR(handle))
+			ocfs2_commit_trans(osb, handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
+/* Some parts of this taken from generic_cont_expand, which turned out
+ * to be too fragile to do exactly what we need without us having to
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+				 u64 abs_to, struct buffer_head *di_bh)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
+	handle_t *handle;
+	int ret = 0;
+	unsigned zero_from, zero_to, block_start, block_end;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	BUG_ON(abs_from >= abs_to);
+	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+	BUG_ON(abs_from & (inode->i_blkbits - 1));
+
+	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	page = find_or_create_page(mapping, index, GFP_NOFS);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit_trans;
+	}
+
+	/* Get the offsets within the page that we want to zero */
+	zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+	zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+	if (!zero_to)
+		zero_to = PAGE_CACHE_SIZE;
+
+	trace_ocfs2_write_zero_page(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)abs_from,
+			(unsigned long long)abs_to,
+			index, zero_from, zero_to);
+
+	/* We know that zero_from is block aligned */
+	for (block_start = zero_from; block_start < zero_to;
+	     block_start = block_end) {
+		block_end = block_start + (1 << inode->i_blkbits);
+
+		/*
+		 * block_start is block-aligned.  Bump it by one to force
+		 * __block_write_begin and block_commit_write to zero the
+		 * whole block.
+		 */
+		ret = __block_write_begin(page, block_start + 1, 0,
+					  ocfs2_get_block);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+
+
+		/* must not update i_size! */
+		ret = block_commit_write(page, block_start + 1,
+					 block_start + 1);
+		if (ret < 0)
+			mlog_errno(ret);
+		else
+			ret = 0;
+	}
+
+	/*
+	 * fs-writeback will release the dirty pages without page lock
+	 * whose offset are over inode size, the release happens at
+	 * block_write_full_page().
+	 */
+	i_size_write(inode, abs_to);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	di->i_mtime_nsec = di->i_ctime_nsec;
+	if (handle) {
+		ocfs2_journal_dirty(handle, di_bh);
+		ocfs2_update_inode_fsync_trans(handle, inode, 1);
+	}
+
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out_commit_trans:
+	if (handle)
+		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+/*
+ * Find the next range to zero.  We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache.  We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed.  range_start and range_end return the next zeroing
+ * range.  A subsequent call should pass the previous range_end as its
+ * zero_start.  If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over.  Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+				       struct buffer_head *di_bh,
+				       u64 zero_start, u64 zero_end,
+				       u64 *range_start, u64 *range_end)
+{
+	int rc = 0, needs_cow = 0;
+	u32 p_cpos, zero_clusters = 0;
+	u32 zero_cpos =
+		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	while (zero_cpos < last_cpos) {
+		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+					&num_clusters, &ext_flags);
+		if (rc) {
+			mlog_errno(rc);
+			goto out;
+		}
+
+		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+			zero_clusters = num_clusters;
+			if (ext_flags & OCFS2_EXT_REFCOUNTED)
+				needs_cow = 1;
+			break;
+		}
+
+		zero_cpos += num_clusters;
+	}
+	if (!zero_clusters) {
+		*range_end = 0;
+		goto out;
+	}
+
+	while ((zero_cpos + zero_clusters) < last_cpos) {
+		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+					&p_cpos, &num_clusters,
+					&ext_flags);
+		if (rc) {
+			mlog_errno(rc);
+			goto out;
+		}
+
+		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+			break;
+		if (ext_flags & OCFS2_EXT_REFCOUNTED)
+			needs_cow = 1;
+		zero_clusters += num_clusters;
+	}
+	if ((zero_cpos + zero_clusters) > last_cpos)
+		zero_clusters = last_cpos - zero_cpos;
+
+	if (needs_cow) {
+		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
+					zero_clusters, UINT_MAX);
+		if (rc) {
+			mlog_errno(rc);
+			goto out;
+		}
+	}
+
+	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+					     zero_cpos + zero_clusters);
+
+out:
+	return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+				   u64 range_end, struct buffer_head *di_bh)
+{
+	int rc = 0;
+	u64 next_pos;
+	u64 zero_pos = range_start;
+
+	trace_ocfs2_zero_extend_range(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)range_start,
+			(unsigned long long)range_end);
+	BUG_ON(range_start >= range_end);
+
+	while (zero_pos < range_end) {
+		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+		if (next_pos > range_end)
+			next_pos = range_end;
+		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
+		if (rc < 0) {
+			mlog_errno(rc);
+			break;
+		}
+		zero_pos = next_pos;
+
+		/*
+		 * Very large extends have the potential to lock up
+		 * the cpu for extended periods of time.
+		 */
+		cond_resched();
+	}
+
+	return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+		      loff_t zero_to_size)
+{
+	int ret = 0;
+	u64 zero_start, range_start = 0, range_end = 0;
+	struct super_block *sb = inode->i_sb;
+
+	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				(unsigned long long)zero_start,
+				(unsigned long long)i_size_read(inode));
+	while (zero_start < zero_to_size) {
+		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+						  zero_to_size,
+						  &range_start,
+						  &range_end);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		if (!range_end)
+			break;
+		/* Trim the ends */
+		if (range_start < zero_start)
+			range_start = zero_start;
+		if (range_end > zero_to_size)
+			range_end = zero_to_size;
+
+		ret = ocfs2_zero_extend_range(inode, range_start,
+					      range_end, di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		zero_start = range_end;
+	}
+
+	return ret;
+}
+
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+			  u64 new_i_size, u64 zero_to)
+{
+	int ret;
+	u32 clusters_to_add;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	/*
+	 * Only quota files call this without a bh, and they can't be
+	 * refcounted.
+	 */
+	BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
+	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
+	if (clusters_to_add < oi->ip_clusters)
+		clusters_to_add = 0;
+	else
+		clusters_to_add -= oi->ip_clusters;
+
+	if (clusters_to_add) {
+		ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
+						clusters_to_add, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Call this even if we don't add any clusters to the tree. We
+	 * still need to zero the area between the old i_size and the
+	 * new i_size.
+	 */
+	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int ocfs2_extend_file(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	BUG_ON(!di_bh);
+
+	/* setattr sometimes calls us like this. */
+	if (new_i_size == 0)
+		goto out;
+
+	if (i_size_read(inode) == new_i_size)
+		goto out;
+	BUG_ON(new_i_size < i_size_read(inode));
+
+	/*
+	 * The alloc sem blocks people in read/write from reading our
+	 * allocation until we're done changing it. We depend on
+	 * i_mutex to block other extend/truncate calls while we're
+	 * here.  We even have to hold it for sparse files because there
+	 * might be some tail zeroing.
+	 */
+	down_write(&oi->ip_alloc_sem);
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		/*
+		 * We can optimize small extends by keeping the inodes
+		 * inline data.
+		 */
+		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
+			up_write(&oi->ip_alloc_sem);
+			goto out_update_size;
+		}
+
+		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
+		if (ret) {
+			up_write(&oi->ip_alloc_sem);
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+	else
+		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+					    new_i_size);
+
+	up_write(&oi->ip_alloc_sem);
+
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out_update_size:
+	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int status = 0, size_change;
+	struct inode *inode = d_inode(dentry);
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *bh = NULL;
+	handle_t *handle = NULL;
+	struct dquot *transfer_to[MAXQUOTAS] = { };
+	int qtype;
+
+	trace_ocfs2_setattr(inode, dentry,
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    dentry->d_name.len, dentry->d_name.name,
+			    attr->ia_valid, attr->ia_mode,
+			    from_kuid(&init_user_ns, attr->ia_uid),
+			    from_kgid(&init_user_ns, attr->ia_gid));
+
+	/* ensuring we don't even attempt to truncate a symlink */
+	if (S_ISLNK(inode->i_mode))
+		attr->ia_valid &= ~ATTR_SIZE;
+
+#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
+			   | ATTR_GID | ATTR_UID | ATTR_MODE)
+	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
+		return 0;
+
+	status = inode_change_ok(inode, attr);
+	if (status)
+		return status;
+
+	if (is_quota_modification(inode, attr))
+		dquot_initialize(inode);
+	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
+	if (size_change) {
+		status = ocfs2_rw_lock(inode, 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_inode_lock(inode, &bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail_unlock_rw;
+	}
+
+	if (size_change) {
+		status = inode_newsize_ok(inode, attr->ia_size);
+		if (status)
+			goto bail_unlock;
+
+		inode_dio_wait(inode);
+
+		if (i_size_read(inode) >= attr->ia_size) {
+			if (ocfs2_should_order_data(inode)) {
+				status = ocfs2_begin_ordered_truncate(inode,
+								      attr->ia_size);
+				if (status)
+					goto bail_unlock;
+			}
+			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
+		} else
+			status = ocfs2_extend_file(inode, bh, attr->ia_size);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			status = -ENOSPC;
+			goto bail_unlock;
+		}
+	}
+
+	if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+	    (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+		/*
+		 * Gather pointers to quota structures so that allocation /
+		 * freeing of quota structures happens here and not inside
+		 * dquot_transfer() where we have problems with lock ordering
+		 */
+		if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
+		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
+			if (!transfer_to[USRQUOTA]) {
+				status = -ESRCH;
+				goto bail_unlock;
+			}
+		}
+		if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
+		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
+			if (!transfer_to[GRPQUOTA]) {
+				status = -ESRCH;
+				goto bail_unlock;
+			}
+		}
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
+					   2 * ocfs2_quota_trans_credits(sb));
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_unlock;
+		}
+		status = __dquot_transfer(inode, transfer_to);
+		if (status < 0)
+			goto bail_commit;
+	} else {
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_unlock;
+		}
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	status = ocfs2_mark_inode_dirty(handle, inode, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(osb, handle);
+bail_unlock:
+	ocfs2_inode_unlock(inode, 1);
+bail_unlock_rw:
+	if (size_change)
+		ocfs2_rw_unlock(inode, 1);
+bail:
+	brelse(bh);
+
+	/* Release quota pointers in case we acquired them */
+	for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
+		dqput(transfer_to[qtype]);
+
+	if (!status && attr->ia_valid & ATTR_MODE) {
+		status = posix_acl_chmod(inode, inode->i_mode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	return status;
+}
+
+int ocfs2_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	struct super_block *sb = d_inode(dentry)->i_sb;
+	struct ocfs2_super *osb = sb->s_fs_info;
+	int err;
+
+	err = ocfs2_inode_revalidate(dentry);
+	if (err) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	generic_fillattr(inode, stat);
+
+	/* We set the blksize from the cluster size for performance */
+	stat->blksize = osb->s_clustersize;
+
+bail:
+	return err;
+}
+
+int ocfs2_permission(struct inode *inode, int mask)
+{
+	int ret;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	ret = ocfs2_inode_lock(inode, NULL, 0);
+	if (ret) {
+		if (ret != -ENOENT)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = generic_permission(inode, mask);
+
+	ocfs2_inode_unlock(inode, 0);
+out:
+	return ret;
+}
+
+static int __ocfs2_write_remove_suid(struct inode *inode,
+				     struct buffer_head *bh)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;
+
+	trace_ocfs2_write_remove_suid(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			inode->i_mode);
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_trans;
+	}
+
+	inode->i_mode &= ~S_ISUID;
+	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
+		inode->i_mode &= ~S_ISGID;
+
+	di = (struct ocfs2_dinode *) bh->b_data;
+	di->i_mode = cpu_to_le16(inode->i_mode);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+	ocfs2_journal_dirty(handle, bh);
+
+out_trans:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Will look for holes and unwritten extents in the range starting at
+ * pos for count bytes (inclusive).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+				       size_t count)
+{
+	int ret = 0;
+	unsigned int extent_flags;
+	u32 cpos, clusters, extent_len, phys_cpos;
+	struct super_block *sb = inode->i_sb;
+
+	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+			ret = 1;
+			break;
+		}
+
+		if (extent_len > clusters)
+			extent_len = clusters;
+
+		clusters -= extent_len;
+		cpos += extent_len;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+
+	ret = ocfs2_read_inode_block(inode, &bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret =  __ocfs2_write_remove_suid(inode, bh);
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Allocate enough extents to cover the region starting at byte offset
+ * start for len bytes. Existing extents are skipped, any extents
+ * added are marked as "unwritten".
+ */
+static int ocfs2_allocate_unwritten_extents(struct inode *inode,
+					    u64 start, u64 len)
+{
+	int ret;
+	u32 cpos, phys_cpos, clusters, alloc_size;
+	u64 end = start + len;
+	struct buffer_head *di_bh = NULL;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_read_inode_block(inode, &di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Nothing to do if the requested reservation range
+		 * fits within the inode.
+		 */
+		if (ocfs2_size_fits_inline_data(di_bh, end))
+			goto out;
+
+		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * We consider both start and len to be inclusive.
+	 */
+	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
+	clusters -= cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					 &alloc_size, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Hole or existing extent len can be arbitrary, so
+		 * cap it to our own allocation request.
+		 */
+		if (alloc_size > clusters)
+			alloc_size = clusters;
+
+		if (phys_cpos) {
+			/*
+			 * We already have an allocation at this
+			 * region so we can safely skip it.
+			 */
+			goto next;
+		}
+
+		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+next:
+		cpos += alloc_size;
+		clusters -= alloc_size;
+	}
+
+	ret = 0;
+out:
+
+	brelse(di_bh);
+	return ret;
+}
+
+/*
+ * Truncate a byte range, avoiding pages within partial clusters. This
+ * preserves those pages for the zeroing code to write to.
+ */
+static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
+					 u64 byte_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t start, end;
+	struct address_space *mapping = inode->i_mapping;
+
+	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
+	end = byte_start + byte_len;
+	end = end & ~(osb->s_clustersize - 1);
+
+	if (start < end) {
+		unmap_mapping_range(mapping, start, end - start, 0);
+		truncate_inode_pages_range(mapping, start, end - 1);
+	}
+}
+
+static int ocfs2_zero_partial_clusters(struct inode *inode,
+				       u64 start, u64 len)
+{
+	int ret = 0;
+	u64 tmpend, end = start + len;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int csize = osb->s_clustersize;
+	handle_t *handle;
+
+	/*
+	 * The "start" and "end" values are NOT necessarily part of
+	 * the range whose allocation is being deleted. Rather, this
+	 * is what the user passed in with the request. We must zero
+	 * partial clusters here. There's no need to worry about
+	 * physical allocation - the zeroing code knows to skip holes.
+	 */
+	trace_ocfs2_zero_partial_clusters(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)start, (unsigned long long)end);
+
+	/*
+	 * If both edges are on a cluster boundary then there's no
+	 * zeroing required as the region is part of the allocation to
+	 * be truncated.
+	 */
+	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We want to get the byte offset of the end of the 1st cluster.
+	 */
+	tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
+	if (tmpend > end)
+		tmpend = end;
+
+	trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+						 (unsigned long long)tmpend);
+
+	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
+	if (ret)
+		mlog_errno(ret);
+
+	if (tmpend < end) {
+		/*
+		 * This may make start and end equal, but the zeroing
+		 * code will skip any work in that case so there's no
+		 * need to catch it up here.
+		 */
+		start = end & ~(osb->s_clustersize - 1);
+
+		trace_ocfs2_zero_partial_clusters_range2(
+			(unsigned long long)start, (unsigned long long)end);
+
+		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
+		if (ret)
+			mlog_errno(ret);
+	}
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+	int i;
+	struct ocfs2_extent_rec *rec = NULL;
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) < pos)
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+*/
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+				 struct ocfs2_extent_list *el,
+				 struct ocfs2_extent_rec *rec,
+				 u32 trunc_start, u32 *trunc_cpos,
+				 u32 *trunc_len, u32 *trunc_end,
+				 u64 *blkno, int *done)
+{
+	int ret = 0;
+	u32 coff, range;
+
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+		/*
+		 * remove an entire extent record.
+		 */
+		*trunc_cpos = le32_to_cpu(rec->e_cpos);
+		/*
+		 * Skip holes if any.
+		 */
+		if (range < *trunc_end)
+			*trunc_end = range;
+		*trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+		*blkno = le64_to_cpu(rec->e_blkno);
+		*trunc_end = le32_to_cpu(rec->e_cpos);
+	} else if (range > trunc_start) {
+		/*
+		 * remove a partial extent record, which means we're
+		 * removing the last extent record.
+		 */
+		*trunc_cpos = trunc_start;
+		/*
+		 * skip hole if any.
+		 */
+		if (range < *trunc_end)
+			*trunc_end = range;
+		*trunc_len = *trunc_end - trunc_start;
+		coff = trunc_start - le32_to_cpu(rec->e_cpos);
+		*blkno = le64_to_cpu(rec->e_blkno) +
+				ocfs2_clusters_to_blocks(inode->i_sb, coff);
+		*trunc_end = trunc_start;
+	} else {
+		/*
+		 * It may have two following possibilities:
+		 *
+		 * - last record has been removed
+		 * - trunc_start was within a hole
+		 *
+		 * both two cases mean the completion of hole punching.
+		 */
+		ret = 1;
+	}
+
+	*done = ret;
+}
+
+static int ocfs2_remove_inode_range(struct inode *inode,
+				    struct buffer_head *di_bh, u64 byte_start,
+				    u64 byte_len)
+{
+	int ret = 0, flags = 0, done = 0, i;
+	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+	u32 cluster_in_el;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct address_space *mapping = inode->i_mapping;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el = NULL;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	trace_ocfs2_remove_inode_range(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)byte_start,
+			(unsigned long long)byte_len);
+
+	if (byte_len == 0)
+		return 0;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
+					    byte_start + byte_len, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		/*
+		 * There's no need to get fancy with the page cache
+		 * truncate of an inline-data inode. We're talking
+		 * about less than a page here, which will be cached
+		 * in the dinode buffer anyway.
+		 */
+		unmap_mapping_range(mapping, 0, 0, 0);
+		truncate_inode_pages(mapping, 0);
+		goto out;
+	}
+
+	/*
+	 * For reflinks, we may need to CoW 2 clusters which might be
+	 * partially zero'd later, if hole's start and end offset were
+	 * within one cluster(means is not exactly aligned to clustersize).
+	 */
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
+	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+	cluster_in_el = trunc_end;
+
+	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	path = ocfs2_new_path_from_et(&et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (trunc_end > trunc_start) {
+
+		ret = ocfs2_find_path(INODE_CACHE(inode), path,
+				      cluster_in_el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		el = path_leaf_el(path);
+
+		i = ocfs2_find_rec(el, trunc_end);
+		/*
+		 * Need to go to previous extent block.
+		 */
+		if (i < 0) {
+			if (path->p_tree_depth == 0)
+				break;
+
+			ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+							    path,
+							    &cluster_in_el);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			/*
+			 * We've reached the leftmost extent block,
+			 * it's safe to leave.
+			 */
+			if (cluster_in_el == 0)
+				break;
+
+			/*
+			 * The 'pos' searched for previous extent block is
+			 * always one cluster less than actual trunc_end.
+			 */
+			trunc_end = cluster_in_el + 1;
+
+			ocfs2_reinit_path(path, 1);
+
+			continue;
+
+		} else
+			rec = &el->l_recs[i];
+
+		ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+				     &trunc_len, &trunc_end, &blkno, &done);
+		if (done)
+			break;
+
+		flags = rec->e_flags;
+		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+					       phys_cpos, trunc_len, flags,
+					       &dealloc, refcount_loc, false);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		cluster_in_el = trunc_end;
+
+		ocfs2_reinit_path(path, 1);
+	}
+
+	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
+
+out:
+	ocfs2_free_path(path);
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+/*
+ * Parts of this function taken from xfs_change_file_space()
+ */
+static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
+				     loff_t f_pos, unsigned int cmd,
+				     struct ocfs2_space_resv *sr,
+				     int change_size)
+{
+	int ret;
+	s64 llen;
+	loff_t size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	handle_t *handle;
+	unsigned long long max_off = inode->i_sb->s_maxbytes;
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * This prevents concurrent writes on other nodes
+	 */
+	ret = ocfs2_rw_lock(inode, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_rw_unlock;
+	}
+
+	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+		ret = -EPERM;
+		goto out_inode_unlock;
+	}
+
+	switch (sr->l_whence) {
+	case 0: /*SEEK_SET*/
+		break;
+	case 1: /*SEEK_CUR*/
+		sr->l_start += f_pos;
+		break;
+	case 2: /*SEEK_END*/
+		sr->l_start += i_size_read(inode);
+		break;
+	default:
+		ret = -EINVAL;
+		goto out_inode_unlock;
+	}
+	sr->l_whence = 0;
+
+	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
+
+	if (sr->l_start < 0
+	    || sr->l_start > max_off
+	    || (sr->l_start + llen) < 0
+	    || (sr->l_start + llen) > max_off) {
+		ret = -EINVAL;
+		goto out_inode_unlock;
+	}
+	size = sr->l_start + sr->l_len;
+
+	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+	    cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
+		if (sr->l_len <= 0) {
+			ret = -EINVAL;
+			goto out_inode_unlock;
+		}
+	}
+
+	if (file && should_remove_suid(file->f_path.dentry)) {
+		ret = __ocfs2_write_remove_suid(inode, di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_inode_unlock;
+		}
+	}
+
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	switch (cmd) {
+	case OCFS2_IOC_RESVSP:
+	case OCFS2_IOC_RESVSP64:
+		/*
+		 * This takes unsigned offsets, but the signed ones we
+		 * pass have been checked against overflow above.
+		 */
+		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
+						       sr->l_len);
+		break;
+	case OCFS2_IOC_UNRESVSP:
+	case OCFS2_IOC_UNRESVSP64:
+		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
+					       sr->l_len);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_inode_unlock;
+	}
+
+	/*
+	 * We update c/mtime for these changes
+	 */
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_inode_unlock;
+	}
+
+	if (change_size && i_size_read(inode) < size)
+		i_size_write(inode, size);
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	if (file && (file->f_flags & O_SYNC))
+		handle->h_sync = 1;
+
+	ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+	brelse(di_bh);
+	ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+	ocfs2_rw_unlock(inode, 1);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+int ocfs2_change_file_space(struct file *file, unsigned int cmd,
+			    struct ocfs2_space_resv *sr)
+{
+	struct inode *inode = file_inode(file);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int ret;
+
+	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
+	    !ocfs2_writes_unwritten_extents(osb))
+		return -ENOTTY;
+	else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
+		 !ocfs2_sparse_alloc(osb))
+		return -ENOTTY;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
+			    loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_space_resv sr;
+	int change_size = 1;
+	int cmd = OCFS2_IOC_RESVSP64;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+	if (!ocfs2_writes_unwritten_extents(osb))
+		return -EOPNOTSUPP;
+
+	if (mode & FALLOC_FL_KEEP_SIZE)
+		change_size = 0;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		cmd = OCFS2_IOC_UNRESVSP64;
+
+	sr.l_whence = 0;
+	sr.l_start = (s64)offset;
+	sr.l_len = (s64)len;
+
+	return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
+					 change_size);
+}
+
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+				   size_t count)
+{
+	int ret = 0;
+	unsigned int extent_flags;
+	u32 cpos, clusters, extent_len, phys_cpos;
+	struct super_block *sb = inode->i_sb;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
+	    !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
+	    OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
+			ret = 1;
+			break;
+		}
+
+		if (extent_len > clusters)
+			extent_len = clusters;
+
+		clusters -= extent_len;
+		cpos += extent_len;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+	int blockmask = inode->i_sb->s_blocksize - 1;
+	loff_t final_size = pos + count;
+
+	if ((pos & blockmask) || (final_size & blockmask))
+		return 1;
+	return 0;
+}
+
+static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+					    struct file *file,
+					    loff_t pos, size_t count,
+					    int *meta_level)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	u32 clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*meta_level = 1;
+
+	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(di_bh);
+	return ret;
+}
+
+static int ocfs2_prepare_inode_for_write(struct file *file,
+					 loff_t pos,
+					 size_t count,
+					 int appending,
+					 int *direct_io,
+					 int *has_refcount)
+{
+	int ret = 0, meta_level = 0;
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = d_inode(dentry);
+	loff_t end;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+		OCFS2_MOUNT_COHERENCY_BUFFERED);
+
+	/*
+	 * We start with a read level meta lock and only jump to an ex
+	 * if we need to make modifications here.
+	 */
+	for(;;) {
+		ret = ocfs2_inode_lock(inode, NULL, meta_level);
+		if (ret < 0) {
+			meta_level = -1;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Clear suid / sgid if necessary. We do this here
+		 * instead of later in the write path because
+		 * remove_suid() calls ->setattr without any hint that
+		 * we may have already done our cluster locking. Since
+		 * ocfs2_setattr() *must* take cluster locks to
+		 * proceed, this will lead us to recursively lock the
+		 * inode. There's also the dinode i_size state which
+		 * can be lost via setattr during extending writes (we
+		 * set inode->i_size at the end of a write. */
+		if (should_remove_suid(dentry)) {
+			if (meta_level == 0) {
+				ocfs2_inode_unlock(inode, meta_level);
+				meta_level = 1;
+				continue;
+			}
+
+			ret = ocfs2_write_remove_suid(inode);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out_unlock;
+			}
+		}
+
+		end = pos + count;
+
+		ret = ocfs2_check_range_for_refcount(inode, pos, count);
+		if (ret == 1) {
+			ocfs2_inode_unlock(inode, meta_level);
+			meta_level = -1;
+
+			ret = ocfs2_prepare_inode_for_refcount(inode,
+							       file,
+							       pos,
+							       count,
+							       &meta_level);
+			if (has_refcount)
+				*has_refcount = 1;
+			if (direct_io)
+				*direct_io = 0;
+		}
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+
+		/*
+		 * Skip the O_DIRECT checks if we don't need
+		 * them.
+		 */
+		if (!direct_io || !(*direct_io))
+			break;
+
+		/*
+		 * There's no sane way to do direct writes to an inode
+		 * with inline data.
+		 */
+		if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+			*direct_io = 0;
+			break;
+		}
+
+		/*
+		 * Allowing concurrent direct writes means
+		 * i_size changes wouldn't be synchronized, so
+		 * one node could wind up truncating another
+		 * nodes writes.
+		 */
+		if (end > i_size_read(inode) && !full_coherency) {
+			*direct_io = 0;
+			break;
+		}
+
+		/*
+		 * Fallback to old way if the feature bit is not set.
+		 */
+		if (end > i_size_read(inode) &&
+				!ocfs2_supports_append_dio(osb)) {
+			*direct_io = 0;
+			break;
+		}
+
+		/*
+		 * We don't fill holes during direct io, so
+		 * check for them here. If any are found, the
+		 * caller will have to retake some cluster
+		 * locks and initiate the io as buffered.
+		 */
+		ret = ocfs2_check_range_for_holes(inode, pos, count);
+		if (ret == 1) {
+			/*
+			 * Fallback to old way if the feature bit is not set.
+			 * Otherwise try dio first and then complete the rest
+			 * request through buffer io.
+			 */
+			if (!ocfs2_supports_append_dio(osb))
+				*direct_io = 0;
+			ret = 0;
+		} else if (ret < 0)
+			mlog_errno(ret);
+		break;
+	}
+
+out_unlock:
+	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
+					    pos, appending, count,
+					    direct_io, has_refcount);
+
+	if (meta_level >= 0)
+		ocfs2_inode_unlock(inode, meta_level);
+
+out:
+	return ret;
+}
+
+static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+				    struct iov_iter *from)
+{
+	int direct_io, appending, rw_level, have_alloc_sem  = 0;
+	int can_do_direct, has_refcount = 0;
+	ssize_t written = 0;
+	ssize_t ret;
+	size_t count = iov_iter_count(from), orig_count;
+	loff_t old_size;
+	u32 old_clusters;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+			       OCFS2_MOUNT_COHERENCY_BUFFERED);
+	int unaligned_dio = 0;
+	int dropped_dio = 0;
+
+	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		file->f_path.dentry->d_name.len,
+		file->f_path.dentry->d_name.name,
+		(unsigned int)from->nr_segs);	/* GRRRRR */
+
+	if (count == 0)
+		return 0;
+
+	appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
+	direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+
+	mutex_lock(&inode->i_mutex);
+
+	ocfs2_iocb_clear_sem_locked(iocb);
+
+relock:
+	/* to match setattr's i_mutex -> rw_lock ordering */
+	if (direct_io) {
+		have_alloc_sem = 1;
+		/* communicate with ocfs2_dio_end_io */
+		ocfs2_iocb_set_sem_locked(iocb);
+	}
+
+	/*
+	 * Concurrent O_DIRECT writes are allowed with
+	 * mount_option "coherency=buffered".
+	 */
+	rw_level = (!direct_io || full_coherency);
+
+	ret = ocfs2_rw_lock(inode, rw_level);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_sems;
+	}
+
+	/*
+	 * O_DIRECT writes with "coherency=full" need to take EX cluster
+	 * inode_lock to guarantee coherency.
+	 */
+	if (direct_io && full_coherency) {
+		/*
+		 * We need to take and drop the inode lock to force
+		 * other nodes to drop their caches.  Buffered I/O
+		 * already does this in write_begin().
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ocfs2_inode_unlock(inode, 1);
+	}
+
+	orig_count = iov_iter_count(from);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0) {
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+	count = ret;
+
+	can_do_direct = direct_io;
+	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
+					    &can_do_direct, &has_refcount);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (direct_io && !is_sync_kiocb(iocb))
+		unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
+
+	/*
+	 * We can't complete the direct I/O as requested, fall back to
+	 * buffered I/O.
+	 */
+	if (direct_io && !can_do_direct) {
+		ocfs2_rw_unlock(inode, rw_level);
+
+		have_alloc_sem = 0;
+		rw_level = -1;
+
+		direct_io = 0;
+		iocb->ki_flags &= ~IOCB_DIRECT;
+		iov_iter_reexpand(from, orig_count);
+		dropped_dio = 1;
+		goto relock;
+	}
+
+	if (unaligned_dio) {
+		/*
+		 * Wait on previous unaligned aio to complete before
+		 * proceeding.
+		 */
+		mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
+		/* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
+		ocfs2_iocb_set_unaligned_aio(iocb);
+	}
+
+	/*
+	 * To later detect whether a journal commit for sync writes is
+	 * necessary, we sample i_size, and cluster count here.
+	 */
+	old_size = i_size_read(inode);
+	old_clusters = OCFS2_I(inode)->ip_clusters;
+
+	/* communicate with ocfs2_dio_end_io */
+	ocfs2_iocb_set_rw_locked(iocb, rw_level);
+
+	written = __generic_file_write_iter(iocb, from);
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+
+	if (unlikely(written <= 0))
+		goto no_sync;
+
+	if (((file->f_flags & O_DSYNC) && !direct_io) ||
+	    IS_SYNC(inode) || dropped_dio) {
+		ret = filemap_fdatawrite_range(file->f_mapping,
+					       iocb->ki_pos - written,
+					       iocb->ki_pos - 1);
+		if (ret < 0)
+			written = ret;
+
+		if (!ret) {
+			ret = jbd2_journal_force_commit(osb->journal->j_journal);
+			if (ret < 0)
+				written = ret;
+		}
+
+		if (!ret)
+			ret = filemap_fdatawait_range(file->f_mapping,
+						      iocb->ki_pos - written,
+						      iocb->ki_pos - 1);
+	}
+
+no_sync:
+	/*
+	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
+	 * function pointer which is called when o_direct io completes so that
+	 * it can unlock our rw lock.
+	 * Unfortunately there are error cases which call end_io and others
+	 * that don't.  so we don't have to unlock the rw_lock if either an
+	 * async dio is going to do it in the future or an end_io after an
+	 * error has already done it.
+	 */
+	if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+		unaligned_dio = 0;
+	}
+
+	if (unaligned_dio) {
+		ocfs2_iocb_clear_unaligned_aio(iocb);
+		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+	}
+
+out:
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+
+out_sems:
+	if (have_alloc_sem)
+		ocfs2_iocb_clear_sem_locked(iocb);
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (written)
+		ret = written;
+	return ret;
+}
+
+static ssize_t ocfs2_file_splice_read(struct file *in,
+				      loff_t *ppos,
+				      struct pipe_inode_info *pipe,
+				      size_t len,
+				      unsigned int flags)
+{
+	int ret = 0, lock_level = 0;
+	struct inode *inode = file_inode(in);
+
+	trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			in->f_path.dentry->d_name.len,
+			in->f_path.dentry->d_name.name, len);
+
+	/*
+	 * See the comment in ocfs2_file_read_iter()
+	 */
+	ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail;
+	}
+	ocfs2_inode_unlock(inode, lock_level);
+
+	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
+
+bail:
+	return ret;
+}
+
+static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+				   struct iov_iter *to)
+{
+	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = file_inode(filp);
+
+	trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			filp->f_path.dentry->d_name.len,
+			filp->f_path.dentry->d_name.name,
+			to->nr_segs);	/* GRRRRR */
+
+
+	if (!inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ocfs2_iocb_clear_sem_locked(iocb);
+
+	/*
+	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
+	 * need locks to protect pending reads from racing with truncate.
+	 */
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		have_alloc_sem = 1;
+		ocfs2_iocb_set_sem_locked(iocb);
+
+		ret = ocfs2_rw_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+		rw_level = 0;
+		/* communicate with ocfs2_dio_end_io */
+		ocfs2_iocb_set_rw_locked(iocb, rw_level);
+	}
+
+	/*
+	 * We're fine letting folks race truncates and extending
+	 * writes with read across the cluster, just like they can
+	 * locally. Hence no rw_lock during read.
+	 *
+	 * Take and drop the meta data lock to update inode fields
+	 * like i_size. This allows the checks down below
+	 * generic_file_aio_read() a chance of actually working.
+	 */
+	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail;
+	}
+	ocfs2_inode_unlock(inode, lock_level);
+
+	ret = generic_file_read_iter(iocb, to);
+	trace_generic_file_aio_read_ret(ret);
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+
+	/* see ocfs2_file_write_iter */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+bail:
+	if (have_alloc_sem)
+		ocfs2_iocb_clear_sem_locked(iocb);
+
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+
+	return ret;
+}
+
+/* Refer generic_file_llseek_unlocked() */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret = 0;
+
+	mutex_lock(&inode->i_mutex);
+
+	switch (whence) {
+	case SEEK_SET:
+		break;
+	case SEEK_END:
+		/* SEEK_END requires the OCFS2 inode lock for the file
+		 * because it references the file's size.
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		offset += i_size_read(inode);
+		ocfs2_inode_unlock(inode, 0);
+		break;
+	case SEEK_CUR:
+		if (offset == 0) {
+			offset = file->f_pos;
+			goto out;
+		}
+		offset += file->f_pos;
+		break;
+	case SEEK_DATA:
+	case SEEK_HOLE:
+		ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
+		if (ret)
+			goto out;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	if (ret)
+		return ret;
+	return offset;
+}
+
+const struct inode_operations ocfs2_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
+	.fiemap		= ocfs2_fiemap,
+	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
+};
+
+const struct inode_operations ocfs2_special_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+	.permission	= ocfs2_permission,
+	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
+};
+
+/*
+ * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
+ * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
+ */
+const struct file_operations ocfs2_fops = {
+	.llseek		= ocfs2_file_llseek,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.read_iter	= ocfs2_file_read_iter,
+	.write_iter	= ocfs2_file_write_iter,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
+	.lock		= ocfs2_lock,
+	.flock		= ocfs2_flock,
+	.splice_read	= ocfs2_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fallocate	= ocfs2_fallocate,
+};
+
+const struct file_operations ocfs2_dops = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_dir_release,
+	.open		= ocfs2_dir_open,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
+	.lock		= ocfs2_lock,
+	.flock		= ocfs2_flock,
+};
+
+/*
+ * POSIX-lockless variants of our file_operations.
+ *
+ * These will be used if the underlying cluster stack does not support
+ * posix file locking, if the user passes the "localflocks" mount
+ * option, or if we have a local-only fs.
+ *
+ * ocfs2_flock is in here because all stacks handle UNIX file locks,
+ * so we still want it in the case of no stack support for
+ * plocks. Internally, it will do the right thing when asked to ignore
+ * the cluster.
+ */
+const struct file_operations ocfs2_fops_no_plocks = {
+	.llseek		= ocfs2_file_llseek,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.read_iter	= ocfs2_file_read_iter,
+	.write_iter	= ocfs2_file_write_iter,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
+	.flock		= ocfs2_flock,
+	.splice_read	= ocfs2_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fallocate	= ocfs2_fallocate,
+};
+
+const struct file_operations ocfs2_dops_no_plocks = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_dir_release,
+	.open		= ocfs2_dir_open,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
+	.flock		= ocfs2_flock,
+};
diff --git a/kernel/fs/ocfs2/file.h b/kernel/fs/ocfs2/file.h
new file mode 100644
index 000000000..e8c62f222
--- /dev/null
+++ b/kernel/fs/ocfs2/file.h
@@ -0,0 +1,85 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_FILE_H
+#define OCFS2_FILE_H
+
+extern const struct file_operations ocfs2_fops;
+extern const struct file_operations ocfs2_dops;
+extern const struct file_operations ocfs2_fops_no_plocks;
+extern const struct file_operations ocfs2_dops_no_plocks;
+extern const struct inode_operations ocfs2_file_iops;
+extern const struct inode_operations ocfs2_special_file_iops;
+struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted;
+
+struct ocfs2_file_private {
+	struct file		*fp_file;
+	struct mutex		fp_mutex;
+	struct ocfs2_lock_res	fp_flock;
+};
+
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+			 struct inode *inode,
+			 u32 *logical_offset,
+			 u32 clusters_to_add,
+			 int mark_unwritten,
+			 struct buffer_head *fe_bh,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *data_ac,
+			 struct ocfs2_alloc_context *meta_ac,
+			 enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_set_inode_size(handle_t *handle,
+		struct inode *inode,
+		struct buffer_head *fe_bh,
+		u64 new_i_size);
+int ocfs2_simple_size_update(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size);
+int ocfs2_truncate_file(struct inode *inode,
+		struct buffer_head *di_bh,
+		u64 new_i_size);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+			  u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+		      loff_t zero_to);
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+		u32 clusters_to_add, int mark_unwritten);
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
+int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat);
+int ocfs2_permission(struct inode *inode, int mask);
+
+int ocfs2_should_update_atime(struct inode *inode,
+			      struct vfsmount *vfsmnt);
+int ocfs2_update_inode_atime(struct inode *inode,
+			     struct buffer_head *bh);
+
+int ocfs2_change_file_space(struct file *file, unsigned int cmd,
+			    struct ocfs2_space_resv *sr);
+
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+				   size_t count);
+#endif /* OCFS2_FILE_H */
diff --git a/kernel/fs/ocfs2/heartbeat.c b/kernel/fs/ocfs2/heartbeat.c
new file mode 100644
index 000000000..d8208b20d
--- /dev/null
+++ b/kernel/fs/ocfs2/heartbeat.c
@@ -0,0 +1,134 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Register ourselves with the heartbaet service, keep our node maps
+ * up to date, and fire off recovery when needed.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit);
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit);
+
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!!  */
+static void ocfs2_node_map_init(struct ocfs2_node_map *map)
+{
+	map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
+	memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
+	       sizeof(unsigned long));
+}
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb)
+{
+	spin_lock_init(&osb->node_map_lock);
+	ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
+}
+
+void ocfs2_do_node_down(int node_num, void *data)
+{
+	struct ocfs2_super *osb = data;
+
+	BUG_ON(osb->node_num == node_num);
+
+	trace_ocfs2_do_node_down(node_num);
+
+	if (!osb->cconn) {
+		/*
+		 * No cluster connection means we're not even ready to
+		 * participate yet.  We check the slots after the cluster
+		 * comes up, so we will notice the node death then.  We
+		 * can safely ignore it here.
+		 */
+		return;
+	}
+
+	ocfs2_recovery_thread(osb, node_num);
+}
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit)
+{
+	set_bit(bit, map->map);
+}
+
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	if (bit==-1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_set_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit)
+{
+	clear_bit(bit, map->map);
+}
+
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit)
+{
+	if (bit==-1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_clear_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	int ret;
+	if (bit >= map->num_nodes) {
+		mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
+		BUG();
+	}
+	spin_lock(&osb->node_map_lock);
+	ret = test_bit(bit, map->map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
diff --git a/kernel/fs/ocfs2/heartbeat.h b/kernel/fs/ocfs2/heartbeat.h
new file mode 100644
index 000000000..74b9c5dda
--- /dev/null
+++ b/kernel/fs/ocfs2/heartbeat.h
@@ -0,0 +1,45 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_HEARTBEAT_H
+#define OCFS2_HEARTBEAT_H
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb);
+
+void ocfs2_do_node_down(int node_num, void *data);
+
+/* node map functions - used to keep track of mounted and in-recovery
+ * nodes. */
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit);
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+
+#endif /* OCFS2_HEARTBEAT_H */
diff --git a/kernel/fs/ocfs2/inode.c b/kernel/fs/ocfs2/inode.c
new file mode 100644
index 000000000..b254416dc
--- /dev/null
+++ b/kernel/fs/ocfs2/inode.c
@@ -0,0 +1,1460 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c
+ *
+ * vfs' aops, fops, dops and iops
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+
+#include <asm/byteorder.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dir.h"
+#include "blockcheck.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "super.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "xattr.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_find_inode_args
+{
+	u64		fi_blkno;
+	unsigned long	fi_ino;
+	unsigned int	fi_flags;
+	unsigned int	fi_sysfile_type;
+};
+
+static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args);
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
+static int ocfs2_find_actor(struct inode *inode, void *opaque);
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh);
+
+void ocfs2_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = OCFS2_I(inode)->ip_attr;
+
+	inode->i_flags &= ~(S_IMMUTABLE |
+		S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC);
+
+	if (flags & OCFS2_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+
+	if (flags & OCFS2_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+	if (flags & OCFS2_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (flags & OCFS2_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+	if (flags & OCFS2_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+}
+
+/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */
+void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
+{
+	unsigned int flags = oi->vfs_inode.i_flags;
+
+	oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL|
+			OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		oi->ip_attr |= OCFS2_SYNC_FL;
+	if (flags & S_APPEND)
+		oi->ip_attr |= OCFS2_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		oi->ip_attr |= OCFS2_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		oi->ip_attr |= OCFS2_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		oi->ip_attr |= OCFS2_DIRSYNC_FL;
+}
+
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
+{
+	struct ocfs2_find_inode_args args;
+
+	args.fi_blkno = blkno;
+	args.fi_flags = 0;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+	args.fi_sysfile_type = 0;
+
+	return ilookup5(sb, blkno, ocfs2_find_actor, &args);
+}
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
+			 int sysfile_type)
+{
+	struct inode *inode = NULL;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_find_inode_args args;
+	journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
+
+	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
+			       sysfile_type);
+
+	/* Ok. By now we've either got the offsets passed to us by the
+	 * caller, or we just pulled them off the bh. Lets do some
+	 * sanity checks to make sure they're OK. */
+	if (blkno == 0) {
+		inode = ERR_PTR(-EINVAL);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+	args.fi_blkno = blkno;
+	args.fi_flags = flags;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+	args.fi_sysfile_type = sysfile_type;
+
+	inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
+			     ocfs2_init_locked_inode, &args);
+	/* inode was *not* in the inode cache. 2.6.x requires
+	 * us to do our own read_inode call and unlock it
+	 * afterwards. */
+	if (inode == NULL) {
+		inode = ERR_PTR(-ENOMEM);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+	trace_ocfs2_iget5_locked(inode->i_state);
+	if (inode->i_state & I_NEW) {
+		ocfs2_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = ERR_PTR(-ESTALE);
+		goto bail;
+	}
+
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		transaction_t *transaction;
+		tid_t tid;
+		struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+		read_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		read_unlock(&journal->j_state_lock);
+		oi->i_sync_tid = tid;
+		oi->i_datasync_tid = tid;
+	}
+
+bail:
+	if (!IS_ERR(inode)) {
+		trace_ocfs2_iget_end(inode, 
+			(unsigned long long)OCFS2_I(inode)->ip_blkno);
+	}
+
+	return inode;
+}
+
+
+/*
+ * here's how inodes get read from disk:
+ * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR
+ * found? : return the in-memory inode
+ * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE
+ */
+
+static int ocfs2_find_actor(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int ret = 0;
+
+	args = opaque;
+
+	mlog_bug_on_msg(!inode, "No inode in find actor!\n");
+
+	trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);
+
+	if (oi->ip_blkno != args->fi_blkno)
+		goto bail;
+
+	ret = 1;
+bail:
+	return ret;
+}
+
+/*
+ * initialize the new inode, but don't do anything that would cause
+ * us to sleep.
+ * return 0 on success, 1 on failure
+ */
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = opaque;
+	static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
+				     ocfs2_file_ip_alloc_sem_key;
+
+	inode->i_ino = args->fi_ino;
+	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+	if (args->fi_sysfile_type != 0)
+		lockdep_set_class(&inode->i_mutex,
+			&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
+	if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
+	    args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
+	    args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
+	    args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE)
+		lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
+				  &ocfs2_quota_ip_alloc_sem_key);
+	else
+		lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
+				  &ocfs2_file_ip_alloc_sem_key);
+
+	return 0;
+}
+
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			  int create_ino)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	int use_plocks = 1;
+
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+	    ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
+		use_plocks = 0;
+
+	/*
+	 * These have all been checked by ocfs2_read_inode_block() or set
+	 * by ocfs2_mknod_locked(), so a failure is a code bug.
+	 */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));  /* This means that read_inode
+						cannot create a superblock
+						inode today.  change if
+						that is needed. */
+	BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
+	BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
+
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
+
+	inode->i_version = 1;
+	inode->i_generation = le32_to_cpu(fe->i_generation);
+	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	i_uid_write(inode, le32_to_cpu(fe->i_uid));
+	i_gid_write(inode, le32_to_cpu(fe->i_gid));
+
+	/* Fast symlinks will have i_size but no allocated clusters. */
+	if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
+		inode->i_blocks = 0;
+		inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
+	} else {
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+		inode->i_mapping->a_ops = &ocfs2_aops;
+	}
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
+		mlog(ML_ERROR,
+		     "ip_blkno %llu != i_blkno %llu!\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+		     (unsigned long long)le64_to_cpu(fe->i_blkno));
+
+	set_nlink(inode, ocfs2_read_links_count(fe));
+
+	trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
+				   le32_to_cpu(fe->i_flags));
+	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+		inode->i_flags |= S_NOQUOTA;
+	}
+  
+	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+		inode->i_flags |= S_NOQUOTA;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
+		/* we can't actually hit this as read_inode can't
+		 * handle superblocks today ;-) */
+		BUG();
+	}
+
+	switch (inode->i_mode & S_IFMT) {
+	    case S_IFREG:
+		    if (use_plocks)
+			    inode->i_fop = &ocfs2_fops;
+		    else
+			    inode->i_fop = &ocfs2_fops_no_plocks;
+		    inode->i_op = &ocfs2_file_iops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFDIR:
+		    inode->i_op = &ocfs2_dir_iops;
+		    if (use_plocks)
+			    inode->i_fop = &ocfs2_dops;
+		    else
+			    inode->i_fop = &ocfs2_dops_no_plocks;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    OCFS2_I(inode)->ip_dir_lock_gen = 1;
+		    break;
+	    case S_IFLNK:
+		    inode->i_op = &ocfs2_symlink_inode_operations;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    default:
+		    inode->i_op = &ocfs2_special_file_iops;
+		    init_special_inode(inode, inode->i_mode,
+				       inode->i_rdev);
+		    break;
+	}
+
+	if (create_ino) {
+		inode->i_ino = ino_from_blkno(inode->i_sb,
+			       le64_to_cpu(fe->i_blkno));
+
+		/*
+		 * If we ever want to create system files from kernel,
+		 * the generation argument to
+		 * ocfs2_inode_lock_res_init() will have to change.
+		 */
+		BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
+
+		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
+					  OCFS2_LOCK_TYPE_META, 0, inode);
+
+		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+					  OCFS2_LOCK_TYPE_OPEN, 0, inode);
+	}
+
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
+				  OCFS2_LOCK_TYPE_RW, inode->i_generation,
+				  inode);
+
+	ocfs2_set_inode_flags(inode);
+
+	OCFS2_I(inode)->ip_last_used_slot = 0;
+	OCFS2_I(inode)->ip_last_used_group = 0;
+
+	if (S_ISDIR(inode->i_mode))
+		ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+				    OCFS2_RESV_FLAG_DIR);
+}
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *fe;
+	struct buffer_head *bh = NULL;
+	int status, can_lock;
+	u32 generation = 0;
+
+	status = -EINVAL;
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	/*
+	 * To improve performance of cold-cache inode stats, we take
+	 * the cluster lock here if possible.
+	 *
+	 * Generally, OCFS2 never trusts the contents of an inode
+	 * unless it's holding a cluster lock, so taking it here isn't
+	 * a correctness issue as much as it is a performance
+	 * improvement.
+	 *
+	 * There are three times when taking the lock is not a good idea:
+	 *
+	 * 1) During startup, before we have initialized the DLM.
+	 *
+	 * 2) If we are reading certain system files which never get
+	 *    cluster locks (local alloc, truncate log).
+	 *
+	 * 3) If the process doing the iget() is responsible for
+	 *    orphan dir recovery. We're holding the orphan dir lock and
+	 *    can get into a deadlock with another process on another
+	 *    node in ->delete_inode().
+	 *
+	 * #1 and #2 can be simply solved by never taking the lock
+	 * here for system files (which are the only type we read
+	 * during mount). It's a heavier approach, but our main
+	 * concern is user-accessible files anyway.
+	 *
+	 * #3 works itself out because we'll eventually take the
+	 * cluster lock before trusting anything anyway.
+	 */
+	can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
+		&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
+		&& !ocfs2_mount_local(osb);
+
+	trace_ocfs2_read_locked_inode(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);
+
+	/*
+	 * To maintain backwards compatibility with older versions of
+	 * ocfs2-tools, we still store the generation value for system
+	 * files. The only ones that actually matter to userspace are
+	 * the journals, but it's easier and inexpensive to just flag
+	 * all system files similarly.
+	 */
+	if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
+		generation = osb->fs_generation;
+
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
+				  OCFS2_LOCK_TYPE_META,
+				  generation, inode);
+
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+				  OCFS2_LOCK_TYPE_OPEN,
+				  0, inode);
+
+	if (can_lock) {
+		status = ocfs2_open_lock(inode);
+		if (status) {
+			make_bad_inode(inode);
+			mlog_errno(status);
+			return status;
+		}
+		status = ocfs2_inode_lock(inode, NULL, 0);
+		if (status) {
+			make_bad_inode(inode);
+			mlog_errno(status);
+			return status;
+		}
+	}
+
+	if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+		status = ocfs2_try_open_lock(inode, 0);
+		if (status) {
+			make_bad_inode(inode);
+			return status;
+		}
+	}
+
+	if (can_lock) {
+		status = ocfs2_read_inode_block_full(inode, &bh,
+						     OCFS2_BH_IGNORE_CACHE);
+	} else {
+		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+		/*
+		 * If buffer is in jbd, then its checksum may not have been
+		 * computed as yet.
+		 */
+		if (!status && !buffer_jbd(bh))
+			status = ocfs2_validate_inode_block(osb->sb, bh);
+	}
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = -EINVAL;
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	/*
+	 * This is a code bug. Right now the caller needs to
+	 * understand whether it is asking for a system file inode or
+	 * not so the proper lock names can be built.
+	 */
+	mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
+			!!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
+			"Inode %llu: system file state is ambigous\n",
+			(unsigned long long)args->fi_blkno);
+
+	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
+	    S_ISBLK(le16_to_cpu(fe->i_mode)))
+		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+
+	ocfs2_populate_inode(inode, fe, 0);
+
+	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+
+	status = 0;
+
+bail:
+	if (can_lock)
+		ocfs2_inode_unlock(inode, 0);
+
+	if (status < 0)
+		make_bad_inode(inode);
+
+	brelse(bh);
+
+	return status;
+}
+
+void ocfs2_sync_blockdev(struct super_block *sb)
+{
+	sync_blockdev(sb->s_bdev);
+}
+
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe;
+	handle_t *handle = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	/*
+	 * This check will also skip truncate of inodes with inline
+	 * data and fast symlinks.
+	 */
+	if (fe->i_clusters) {
+		if (ocfs2_should_order_data(inode))
+			ocfs2_begin_ordered_truncate(inode, 0);
+
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			handle = NULL;
+			mlog_errno(status);
+			goto out;
+		}
+
+		status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+						 fe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		i_size_write(inode, 0);
+
+		status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
+
+		status = ocfs2_commit_truncate(osb, inode, fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+	}
+
+out:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+	return status;
+}
+
+static int ocfs2_remove_inode(struct inode *inode,
+			      struct buffer_head *di_bh,
+			      struct inode *orphan_dir_inode,
+			      struct buffer_head *orphan_dir_bh)
+{
+	int status;
+	struct inode *inode_alloc_inode = NULL;
+	struct buffer_head *inode_alloc_bh = NULL;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+
+	inode_alloc_inode =
+		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+					    le16_to_cpu(di->i_suballoc_slot));
+	if (!inode_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mutex_lock(&inode_alloc_inode->i_mutex);
+	status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
+	if (status < 0) {
+		mutex_unlock(&inode_alloc_inode->i_mutex);
+
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+				   ocfs2_quota_trans_credits(inode->i_sb));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+					  orphan_dir_bh, false);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_commit;
+		}
+	}
+
+	/* set the inodes dtime */
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+	ocfs2_journal_dirty(handle, di_bh);
+
+	ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
+	dquot_free_inode(inode);
+
+	status = ocfs2_free_dinode(handle, inode_alloc_inode,
+				   inode_alloc_bh, di);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(osb, handle);
+bail_unlock:
+	ocfs2_inode_unlock(inode_alloc_inode, 1);
+	mutex_unlock(&inode_alloc_inode->i_mutex);
+	brelse(inode_alloc_bh);
+bail:
+	iput(inode_alloc_inode);
+
+	return status;
+}
+
+/*
+ * Serialize with orphan dir recovery. If the process doing
+ * recovery on this orphan dir does an iget() with the dir
+ * i_mutex held, we'll deadlock here. Instead we detect this
+ * and exit early - recovery will wipe this inode for us.
+ */
+static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
+					     int slot)
+{
+	int ret = 0;
+
+	spin_lock(&osb->osb_lock);
+	if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
+		ret = -EDEADLK;
+		goto out;
+	}
+	/* This signals to the orphan recovery process that it should
+	 * wait for us to handle the wipe. */
+	osb->osb_orphan_wipes[slot]++;
+out:
+	spin_unlock(&osb->osb_lock);
+	trace_ocfs2_check_orphan_recovery_state(slot, ret);
+	return ret;
+}
+
+static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
+					 int slot)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_orphan_wipes[slot]--;
+	spin_unlock(&osb->osb_lock);
+
+	wake_up(&osb->osb_wipe_event);
+}
+
+static int ocfs2_wipe_inode(struct inode *inode,
+			    struct buffer_head *di_bh)
+{
+	int status, orphaned_slot = -1;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
+
+		status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+		if (status)
+			return status;
+
+		orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+							       ORPHAN_DIR_SYSTEM_INODE,
+							       orphaned_slot);
+		if (!orphan_dir_inode) {
+			status = -ENOENT;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Lock the orphan dir. The lock will be held for the entire
+		 * delete_inode operation. We do this now to avoid races with
+		 * recovery completion on other nodes. */
+		mutex_lock(&orphan_dir_inode->i_mutex);
+		status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+		if (status < 0) {
+			mutex_unlock(&orphan_dir_inode->i_mutex);
+
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* we do this while holding the orphan dir lock because we
+	 * don't want recovery being run from another node to try an
+	 * inode delete underneath us -- this will result in two nodes
+	 * truncating the same file! */
+	status = ocfs2_truncate_for_delete(osb, inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
+	/* Remove any dir index tree */
+	if (S_ISDIR(inode->i_mode)) {
+		status = ocfs2_dx_dir_truncate(inode, di_bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail_unlock_dir;
+		}
+	}
+
+	/*Free extended attribute resources associated with this inode.*/
+	status = ocfs2_xattr_remove(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
+	status = ocfs2_remove_refcount_tree(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
+	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
+				    orphan_dir_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_unlock_dir:
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
+		return status;
+
+	ocfs2_inode_unlock(orphan_dir_inode, 1);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
+	brelse(orphan_dir_bh);
+bail:
+	iput(orphan_dir_inode);
+	ocfs2_signal_wipe_completion(osb, orphaned_slot);
+
+	return status;
+}
+
+/* There is a series of simple checks that should be done before a
+ * trylock is even considered. Encapsulate those in this function. */
+static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
+					     (unsigned long long)oi->ip_blkno,
+					     oi->ip_flags);
+
+	/* We shouldn't be getting here for the root directory
+	 * inode.. */
+	if (inode == osb->root_inode) {
+		mlog(ML_ERROR, "Skipping delete of root inode.\n");
+		goto bail;
+	}
+
+	/*
+	 * If we're coming from downconvert_thread we can't go into our own
+	 * voting [hello, deadlock city!] so we cannot delete the inode. But
+	 * since we dropped last inode ref when downconverting dentry lock,
+	 * we cannot have the file open and thus the node doing unlink will
+	 * take care of deleting the inode.
+	 */
+	if (current == osb->dc_task)
+		goto bail;
+
+	spin_lock(&oi->ip_lock);
+	/* OCFS2 *never* deletes system files. This should technically
+	 * never get here as system file inodes should always have a
+	 * positive link count. */
+	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		mlog(ML_ERROR, "Skipping delete of system file %llu\n",
+		     (unsigned long long)oi->ip_blkno);
+		goto bail_unlock;
+	}
+
+	ret = 1;
+bail_unlock:
+	spin_unlock(&oi->ip_lock);
+bail:
+	return ret;
+}
+
+/* Query the cluster to determine whether we should wipe an inode from
+ * disk or not.
+ *
+ * Requires the inode to have the cluster lock. */
+static int ocfs2_query_inode_wipe(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  int *wipe)
+{
+	int status = 0, reason = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di;
+
+	*wipe = 0;
+
+	trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
+					   inode->i_nlink);
+
+	/* While we were waiting for the cluster lock in
+	 * ocfs2_delete_inode, another node might have asked to delete
+	 * the inode. Recheck our flags to catch this. */
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		reason = 1;
+		goto bail;
+	}
+
+	/* Now that we have an up to date inode, we can double check
+	 * the link count. */
+	if (inode->i_nlink)
+		goto bail;
+
+	/* Do some basic inode verification... */
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
+	    !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		/*
+		 * Inodes in the orphan dir must have ORPHANED_FL.  The only
+		 * inodes that come back out of the orphan dir are reflink
+		 * targets. A reflink target may be moved out of the orphan
+		 * dir between the time we scan the directory and the time we
+		 * process it. This would lead to HAS_REFCOUNT_FL being set but
+		 * ORPHANED_FL not.
+		 */
+		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+			reason = 2;
+			goto bail;
+		}
+
+		/* for lack of a better error? */
+		status = -EEXIST;
+		mlog(ML_ERROR,
+		     "Inode %llu (on-disk %llu) not orphaned! "
+		     "Disk flags  0x%x, inode flags 0x%x\n",
+		     (unsigned long long)oi->ip_blkno,
+		     (unsigned long long)le64_to_cpu(di->i_blkno),
+		     le32_to_cpu(di->i_flags), oi->ip_flags);
+		goto bail;
+	}
+
+	/* has someone already deleted us?! baaad... */
+	if (di->i_dtime) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/*
+	 * This is how ocfs2 determines whether an inode is still live
+	 * within the cluster. Every node takes a shared read lock on
+	 * the inode open lock in ocfs2_read_locked_inode(). When we
+	 * get to ->delete_inode(), each node tries to convert it's
+	 * lock to an exclusive. Trylocks are serialized by the inode
+	 * meta data lock. If the upconvert succeeds, we know the inode
+	 * is no longer live and can be deleted.
+	 *
+	 * Though we call this with the meta data lock held, the
+	 * trylock keeps us from ABBA deadlock.
+	 */
+	status = ocfs2_try_open_lock(inode, 1);
+	if (status == -EAGAIN) {
+		status = 0;
+		reason = 3;
+		goto bail;
+	}
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*wipe = 1;
+	trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));
+
+bail:
+	trace_ocfs2_query_inode_wipe_end(status, reason);
+	return status;
+}
+
+/* Support function for ocfs2_delete_inode. Will help us keep the
+ * inode data in a consistent state for clear_inode. Always truncates
+ * pages, optionally sync's them first. */
+static void ocfs2_cleanup_delete_inode(struct inode *inode,
+				       int sync_data)
+{
+	trace_ocfs2_cleanup_delete_inode(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
+	if (sync_data)
+		filemap_write_and_wait(inode->i_mapping);
+	truncate_inode_pages_final(&inode->i_data);
+}
+
+static void ocfs2_delete_inode(struct inode *inode)
+{
+	int wipe, status;
+	sigset_t oldset;
+	struct buffer_head *di_bh = NULL;
+
+	trace_ocfs2_delete_inode(inode->i_ino,
+				 (unsigned long long)OCFS2_I(inode)->ip_blkno,
+				 is_bad_inode(inode));
+
+	/* When we fail in read_inode() we mark inode as bad. The second test
+	 * catches the case when inode allocation fails before allocating
+	 * a block for inode. */
+	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
+		goto bail;
+
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		/* It's probably not necessary to truncate_inode_pages
+		 * here but we do it for safety anyway (it will most
+		 * likely be a no-op anyway) */
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail;
+	}
+
+	dquot_initialize(inode);
+
+	/* We want to block signals in delete_inode as the lock and
+	 * messaging paths may return us -ERESTARTSYS. Which would
+	 * cause us to exit early, resulting in inodes being orphaned
+	 * forever. */
+	ocfs2_block_signals(&oldset);
+
+	/*
+	 * Synchronize us against ocfs2_get_dentry. We take this in
+	 * shared mode so that all nodes can still concurrently
+	 * process deletes.
+	 */
+	status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
+	/* Lock down the inode. This gives us an up to date view of
+	 * it's metadata (for verification), and allows us to
+	 * serialize delete_inode on multiple nodes.
+	 *
+	 * Even though we might be doing a truncate, we don't take the
+	 * allocation lock here as it won't be needed - nobody will
+	 * have the file open.
+	 */
+	status = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unlock_nfs_sync;
+	}
+
+	/* Query the cluster. This will be the final decision made
+	 * before we go ahead and wipe the inode. */
+	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
+	if (!wipe || status < 0) {
+		/* Error and remote inode busy both mean we won't be
+		 * removing the inode, so they take almost the same
+		 * path. */
+		if (status < 0)
+			mlog_errno(status);
+
+		/* Someone in the cluster has disallowed a wipe of
+		 * this inode, or it was never completely
+		 * orphaned. Write out the pages and exit now. */
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail_unlock_inode;
+	}
+
+	ocfs2_cleanup_delete_inode(inode, 0);
+
+	status = ocfs2_wipe_inode(inode, di_bh);
+	if (status < 0) {
+		if (status != -EDEADLK)
+			mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	/*
+	 * Mark the inode as successfully deleted.
+	 *
+	 * This is important for ocfs2_clear_inode() as it will check
+	 * this flag and skip any checkpointing work
+	 *
+	 * ocfs2_stuff_meta_lvb() also uses this flag to invalidate
+	 * the LVB for other nodes.
+	 */
+	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
+
+bail_unlock_inode:
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+
+bail_unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
+
+bail_unblock:
+	ocfs2_unblock_signals(&oldset);
+bail:
+	return;
+}
+
+static void ocfs2_clear_inode(struct inode *inode)
+{
+	int status;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	clear_inode(inode);
+	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
+				inode->i_nlink);
+
+	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
+			"Inode=%lu\n", inode->i_ino);
+
+	dquot_drop(inode);
+
+	/* To preven remote deletes we hold open lock before, now it
+	 * is time to unlock PR and EX open locks. */
+	ocfs2_open_unlock(inode);
+
+	/* Do these before all the other work so that we don't bounce
+	 * the downconvert thread while waiting to destroy the locks. */
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
+
+	ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+			   &oi->ip_la_data_resv);
+	ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
+	/* We very well may get a clear_inode before all an inodes
+	 * metadata has hit disk. Of course, we can't drop any cluster
+	 * locks until the journal has finished with it. The only
+	 * exception here are successfully wiped inodes - their
+	 * metadata can now be considered to be part of the system
+	 * inodes from which it came. */
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
+		ocfs2_checkpoint_inode(inode);
+
+	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
+			"Clear inode of %llu, inode has io markers\n",
+			(unsigned long long)oi->ip_blkno);
+
+	ocfs2_extent_map_trunc(inode, 0);
+
+	status = ocfs2_drop_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_lock_res_free(&oi->ip_rw_lockres);
+	ocfs2_lock_res_free(&oi->ip_inode_lockres);
+	ocfs2_lock_res_free(&oi->ip_open_lockres);
+
+	ocfs2_metadata_cache_exit(INODE_CACHE(inode));
+
+	mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
+			"Clear inode of %llu, inode has %u cache items\n",
+			(unsigned long long)oi->ip_blkno,
+			INODE_CACHE(inode)->ci_num_cached);
+
+	mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
+			"Clear inode of %llu, inode has a bad flag\n",
+			(unsigned long long)oi->ip_blkno);
+
+	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
+			"Clear inode of %llu, inode is locked\n",
+			(unsigned long long)oi->ip_blkno);
+
+	mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex),
+			"Clear inode of %llu, io_mutex is locked\n",
+			(unsigned long long)oi->ip_blkno);
+	mutex_unlock(&oi->ip_io_mutex);
+
+	/*
+	 * down_trylock() returns 0, down_write_trylock() returns 1
+	 * kernel 1, world 0
+	 */
+	mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
+			"Clear inode of %llu, alloc_sem is locked\n",
+			(unsigned long long)oi->ip_blkno);
+	up_write(&oi->ip_alloc_sem);
+
+	mlog_bug_on_msg(oi->ip_open_count,
+			"Clear inode of %llu has open count %d\n",
+			(unsigned long long)oi->ip_blkno, oi->ip_open_count);
+
+	/* Clear all other flags. */
+	oi->ip_flags = 0;
+	oi->ip_dir_start_lookup = 0;
+	oi->ip_blkno = 0ULL;
+
+	/*
+	 * ip_jinode is used to track txns against this inode. We ensure that
+	 * the journal is flushed before journal shutdown. Thus it is safe to
+	 * have inodes get cleaned up after journal shutdown.
+	 */
+	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+				       &oi->ip_jinode);
+}
+
+void ocfs2_evict_inode(struct inode *inode)
+{
+	if (!inode->i_nlink ||
+	    (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
+		ocfs2_delete_inode(inode);
+	} else {
+		truncate_inode_pages_final(&inode->i_data);
+	}
+	ocfs2_clear_inode(inode);
+}
+
+/* Called under inode_lock, with no more references on the
+ * struct inode, so it's safe here to check the flags field
+ * and to manipulate i_nlink without any other locks. */
+int ocfs2_drop_inode(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int res;
+
+	trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
+				inode->i_nlink, oi->ip_flags);
+
+	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
+		res = 1;
+	else
+		res = generic_drop_inode(inode);
+
+	return res;
+}
+
+/*
+ * This is called from our getattr.
+ */
+int ocfs2_inode_revalidate(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	int status = 0;
+
+	trace_ocfs2_inode_revalidate(inode,
+		inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
+		inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);
+
+	if (!inode) {
+		status = -ENOENT;
+		goto bail;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* Let ocfs2_inode_lock do the work of updating our struct
+	 * inode for us. */
+	status = ocfs2_inode_lock(inode, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_inode_unlock(inode, 0);
+bail:
+	return status;
+}
+
+/*
+ * Updates a disk inode from a
+ * struct inode.
+ * Only takes ip_lock.
+ */
+int ocfs2_mark_inode_dirty(handle_t *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh)
+{
+	int status;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+
+	trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+	ocfs2_get_inode_flags(OCFS2_I(inode));
+	fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
+	fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	fe->i_size = cpu_to_le64(i_size_read(inode));
+	ocfs2_set_links_count(fe, inode->i_nlink);
+	fe->i_uid = cpu_to_le32(i_uid_read(inode));
+	fe->i_gid = cpu_to_le32(i_gid_read(inode));
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	ocfs2_journal_dirty(handle, bh);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+leave:
+	return status;
+}
+
+/*
+ *
+ * Updates a struct inode from a disk inode.
+ * does no i/o, only takes ip_lock.
+ */
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe)
+{
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
+	ocfs2_set_inode_flags(inode);
+	i_size_write(inode, le64_to_cpu(fe->i_size));
+	set_nlink(inode, ocfs2_read_links_count(fe));
+	i_uid_write(inode, le32_to_cpu(fe->i_uid));
+	i_gid_write(inode, le32_to_cpu(fe->i_gid));
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+int ocfs2_validate_inode_block(struct super_block *sb,
+			       struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		goto bail;
+	}
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	rc = -EINVAL;
+
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    di->i_signature);
+		goto bail;
+	}
+
+	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(di->i_blkno));
+		goto bail;
+	}
+
+	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		ocfs2_error(sb,
+			    "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+			    (unsigned long long)bh->b_blocknr);
+		goto bail;
+	}
+
+	if (le32_to_cpu(di->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Invalid dinode #%llu: fs_generation is %u\n",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(di->i_fs_generation));
+		goto bail;
+	}
+
+	rc = 0;
+
+bail:
+	return rc;
+}
+
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+				int flags)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
+			       1, &tmp, flags, ocfs2_validate_inode_block);
+
+	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{
+	return ocfs2_read_inode_block_full(inode, bh, 0);
+}
+
+
+static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	return oi->ip_blkno;
+}
+
+static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	return oi->vfs_inode.i_sb;
+}
+
+static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	spin_lock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	spin_unlock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	mutex_lock(&oi->ip_io_mutex);
+}
+
+static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	mutex_unlock(&oi->ip_io_mutex);
+}
+
+const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
+	.co_owner		= ocfs2_inode_cache_owner,
+	.co_get_super		= ocfs2_inode_cache_get_super,
+	.co_cache_lock		= ocfs2_inode_cache_lock,
+	.co_cache_unlock	= ocfs2_inode_cache_unlock,
+	.co_io_lock		= ocfs2_inode_cache_io_lock,
+	.co_io_unlock		= ocfs2_inode_cache_io_unlock,
+};
+
diff --git a/kernel/fs/ocfs2/inode.h b/kernel/fs/ocfs2/inode.h
new file mode 100644
index 000000000..5e86b247c
--- /dev/null
+++ b/kernel/fs/ocfs2/inode.h
@@ -0,0 +1,190 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_INODE_H
+#define OCFS2_INODE_H
+
+#include "extent_map.h"
+
+/* OCFS2 Inode Private Data */
+struct ocfs2_inode_info
+{
+	u64			ip_blkno;
+
+	struct ocfs2_lock_res		ip_rw_lockres;
+	struct ocfs2_lock_res		ip_inode_lockres;
+	struct ocfs2_lock_res		ip_open_lockres;
+
+	/* protects allocation changes on this inode. */
+	struct rw_semaphore		ip_alloc_sem;
+
+	/* protects extended attribute changes on this inode */
+	struct rw_semaphore		ip_xattr_sem;
+
+	/* Number of outstanding AIO's which are not page aligned */
+	struct mutex			ip_unaligned_aio;
+
+	/* These fields are protected by ip_lock */
+	spinlock_t			ip_lock;
+	u32				ip_open_count;
+	struct list_head		ip_io_markers;
+	u32				ip_clusters;
+
+	u16				ip_dyn_features;
+	struct mutex			ip_io_mutex;
+	u32				ip_flags; /* see below */
+	u32				ip_attr; /* inode attributes */
+
+	/* protected by recovery_lock. */
+	struct inode			*ip_next_orphan;
+
+	struct ocfs2_caching_info	ip_metadata_cache;
+	struct ocfs2_extent_map		ip_extent_map;
+	struct inode			vfs_inode;
+	struct jbd2_inode		ip_jinode;
+
+	u32				ip_dir_start_lookup;
+
+	/* Only valid if the inode is the dir. */
+	u32				ip_last_used_slot;
+	u64				ip_last_used_group;
+	u32				ip_dir_lock_gen;
+
+	struct ocfs2_alloc_reservation	ip_la_data_resv;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
+
+	wait_queue_head_t append_dio_wq;
+
+	struct dquot *i_dquot[MAXQUOTAS];
+};
+
+/*
+ * Flags for the ip_flags field
+ */
+/* System file inodes  */
+#define OCFS2_INODE_SYSTEM_FILE		0x00000001
+#define OCFS2_INODE_JOURNAL		0x00000002
+#define OCFS2_INODE_BITMAP		0x00000004
+/* This inode has been wiped from disk */
+#define OCFS2_INODE_DELETED		0x00000008
+/* Has the inode been orphaned on another node?
+ *
+ * This hints to ocfs2_drop_inode that it should clear i_nlink before
+ * continuing.
+ *
+ * We *only* set this on unlink vote from another node. If the inode
+ * was locally orphaned, then we're sure of the state and don't need
+ * to twiddle i_nlink later - it's either zero or not depending on
+ * whether our unlink succeeded. Otherwise we got this from a node
+ * whose intention was to orphan the inode, however he may have
+ * crashed, failed etc, so we let ocfs2_drop_inode zero the value and
+ * rely on ocfs2_delete_inode to sort things out under the proper
+ * cluster locks.
+ */
+#define OCFS2_INODE_MAYBE_ORPHANED	0x00000010
+/* Does someone have the file open O_DIRECT */
+#define OCFS2_INODE_OPEN_DIRECT		0x00000020
+/* Tell the inode wipe code it's not in orphan dir */
+#define OCFS2_INODE_SKIP_ORPHAN_DIR     0x00000040
+
+static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
+{
+	return container_of(inode, struct ocfs2_inode_info, vfs_inode);
+}
+
+#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
+#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
+
+extern struct kmem_cache *ocfs2_inode_cache;
+
+extern const struct address_space_operations ocfs2_aops;
+extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
+
+static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
+{
+	return &OCFS2_I(inode)->ip_metadata_cache;
+}
+
+void ocfs2_evict_inode(struct inode *inode);
+int ocfs2_drop_inode(struct inode *inode);
+
+/* Flags for ocfs2_iget() */
+#define OCFS2_FI_FLAG_SYSFILE		0x1
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
+			 int sysfile_type);
+int ocfs2_inode_init_private(struct inode *inode);
+int ocfs2_inode_revalidate(struct dentry *dentry);
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			  int create_ino);
+void ocfs2_read_inode(struct inode *inode);
+void ocfs2_read_inode2(struct inode *inode, void *opaque);
+ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
+			size_t size, loff_t *offp);
+void ocfs2_sync_blockdev(struct super_block *sb);
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe);
+int ocfs2_mark_inode_dirty(handle_t *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada);
+
+void ocfs2_set_inode_flags(struct inode *inode);
+void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
+
+static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
+{
+	int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
+
+	return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
+}
+
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+			       struct buffer_head *bh);
+/*
+ * Read an inode block into *bh.  If *bh is NULL, a bh will be allocated.
+ * This is a cached read.  The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+				int flags);
+
+static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
+{
+	return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
+}
+
+#endif /* OCFS2_INODE_H */
diff --git a/kernel/fs/ocfs2/ioctl.c b/kernel/fs/ocfs2/ioctl.c
new file mode 100644
index 000000000..53e6c40ed
--- /dev/null
+++ b/kernel/fs/ocfs2/ioctl.c
@@ -0,0 +1,1005 @@
+/*
+ * linux/fs/ocfs2/ioctl.c
+ *
+ * Copyright (C) 2006 Herbert Poetzl
+ * adapted from Remy Card's ext2/ioctl.c
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+
+#include "ocfs2_fs.h"
+#include "ioctl.h"
+#include "resize.h"
+#include "refcounttree.h"
+#include "sysfile.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "move_extents.h"
+
+#define o2info_from_user(a, b)	\
+		copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b)	\
+		copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
+
+/*
+ * This is just a best-effort to tell userspace that this request
+ * caused the error.
+ */
+static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
+					struct ocfs2_info_request __user *req)
+{
+	kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+	(void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
+{
+	req->ir_flags |= OCFS2_INFO_FL_FILLED;
+}
+
+static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
+{
+	req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
+}
+
+static inline int o2info_coherent(struct ocfs2_info_request *req)
+{
+	return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
+}
+
+static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
+{
+	int status;
+
+	status = ocfs2_inode_lock(inode, NULL, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+	ocfs2_get_inode_flags(OCFS2_I(inode));
+	*flags = OCFS2_I(inode)->ip_attr;
+	ocfs2_inode_unlock(inode, 0);
+
+	return status;
+}
+
+static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
+				unsigned mask)
+{
+	struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+	struct buffer_head *bh = NULL;
+	unsigned oldflags;
+	int status;
+
+	mutex_lock(&inode->i_mutex);
+
+	status = ocfs2_inode_lock(inode, &bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = -EACCES;
+	if (!inode_owner_or_capable(inode))
+		goto bail_unlock;
+
+	if (!S_ISDIR(inode->i_mode))
+		flags &= ~OCFS2_DIRSYNC_FL;
+
+	oldflags = ocfs2_inode->ip_attr;
+	flags = flags & mask;
+	flags |= oldflags & ~mask;
+
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+	 * the relevant capability.
+	 */
+	status = -EPERM;
+	if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
+		(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
+		if (!capable(CAP_LINUX_IMMUTABLE))
+			goto bail_unlock;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	ocfs2_inode->ip_attr = flags;
+	ocfs2_set_inode_flags(inode);
+
+	status = ocfs2_mark_inode_dirty(handle, inode, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_commit_trans(osb, handle);
+
+bail_unlock:
+	ocfs2_inode_unlock(inode, 1);
+bail:
+	mutex_unlock(&inode->i_mutex);
+
+	brelse(bh);
+
+	return status;
+}
+
+static int ocfs2_info_handle_blocksize(struct inode *inode,
+				       struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_blocksize oib;
+
+	if (o2info_from_user(oib, req))
+		return -EFAULT;
+
+	oib.ib_blocksize = inode->i_sb->s_blocksize;
+
+	o2info_set_request_filled(&oib.ib_req);
+
+	if (o2info_to_user(oib, req))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ocfs2_info_handle_clustersize(struct inode *inode,
+					 struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_clustersize oic;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oic, req))
+		return -EFAULT;
+
+	oic.ic_clustersize = osb->s_clustersize;
+
+	o2info_set_request_filled(&oic.ic_req);
+
+	if (o2info_to_user(oic, req))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ocfs2_info_handle_maxslots(struct inode *inode,
+				      struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_maxslots oim;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oim, req))
+		return -EFAULT;
+
+	oim.im_max_slots = osb->max_slots;
+
+	o2info_set_request_filled(&oim.im_req);
+
+	if (o2info_to_user(oim, req))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ocfs2_info_handle_label(struct inode *inode,
+				   struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_label oil;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oil, req))
+		return -EFAULT;
+
+	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+
+	o2info_set_request_filled(&oil.il_req);
+
+	if (o2info_to_user(oil, req))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ocfs2_info_handle_uuid(struct inode *inode,
+				  struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_uuid oiu;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oiu, req))
+		return -EFAULT;
+
+	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+
+	o2info_set_request_filled(&oiu.iu_req);
+
+	if (o2info_to_user(oiu, req))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ocfs2_info_handle_fs_features(struct inode *inode,
+					 struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_fs_features oif;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oif, req))
+		return -EFAULT;
+
+	oif.if_compat_features = osb->s_feature_compat;
+	oif.if_incompat_features = osb->s_feature_incompat;
+	oif.if_ro_compat_features = osb->s_feature_ro_compat;
+
+	o2info_set_request_filled(&oif.if_req);
+
+	if (o2info_to_user(oif, req))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ocfs2_info_handle_journal_size(struct inode *inode,
+					  struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_journal_size oij;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oij, req))
+		return -EFAULT;
+
+	oij.ij_journal_size = i_size_read(osb->journal->j_inode);
+
+	o2info_set_request_filled(&oij.ij_req);
+
+	if (o2info_to_user(oij, req))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+				       struct inode *inode_alloc, u64 blkno,
+				       struct ocfs2_info_freeinode *fi,
+				       u32 slot)
+{
+	int status = 0, unlock = 0;
+
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *dinode_alloc = NULL;
+
+	if (inode_alloc)
+		mutex_lock(&inode_alloc->i_mutex);
+
+	if (o2info_coherent(&fi->ifi_req)) {
+		status = ocfs2_inode_lock(inode_alloc, &bh, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		unlock = 1;
+	} else {
+		status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
+
+	fi->ifi_stat[slot].lfi_total =
+		le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
+	fi->ifi_stat[slot].lfi_free =
+		le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
+		le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
+
+bail:
+	if (unlock)
+		ocfs2_inode_unlock(inode_alloc, 0);
+
+	if (inode_alloc)
+		mutex_unlock(&inode_alloc->i_mutex);
+
+	brelse(bh);
+
+	return status;
+}
+
+static int ocfs2_info_handle_freeinode(struct inode *inode,
+				       struct ocfs2_info_request __user *req)
+{
+	u32 i;
+	u64 blkno = -1;
+	char namebuf[40];
+	int status, type = INODE_ALLOC_SYSTEM_INODE;
+	struct ocfs2_info_freeinode *oifi = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *inode_alloc = NULL;
+
+	oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
+	if (!oifi) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	if (o2info_from_user(*oifi, req)) {
+		status = -EFAULT;
+		goto out_free;
+	}
+
+	oifi->ifi_slotnum = osb->max_slots;
+
+	for (i = 0; i < oifi->ifi_slotnum; i++) {
+		if (o2info_coherent(&oifi->ifi_req)) {
+			inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
+			if (!inode_alloc) {
+				mlog(ML_ERROR, "unable to get alloc inode in "
+				    "slot %u\n", i);
+				status = -EIO;
+				goto bail;
+			}
+		} else {
+			ocfs2_sprintf_system_inode_name(namebuf,
+							sizeof(namebuf),
+							type, i);
+			status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+							    namebuf,
+							    strlen(namebuf),
+							    &blkno);
+			if (status < 0) {
+				status = -ENOENT;
+				goto bail;
+			}
+		}
+
+		status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
+
+		iput(inode_alloc);
+		inode_alloc = NULL;
+
+		if (status < 0)
+			goto bail;
+	}
+
+	o2info_set_request_filled(&oifi->ifi_req);
+
+	if (o2info_to_user(*oifi, req)) {
+		status = -EFAULT;
+		goto out_free;
+	}
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oifi->ifi_req, req);
+out_free:
+	kfree(oifi);
+out_err:
+	return status;
+}
+
+static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
+				   unsigned int chunksize)
+{
+	int index;
+
+	index = __ilog2_u32(chunksize);
+	if (index >= OCFS2_INFO_MAX_HIST)
+		index = OCFS2_INFO_MAX_HIST - 1;
+
+	hist->fc_chunks[index]++;
+	hist->fc_clusters[index] += chunksize;
+}
+
+static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
+			       unsigned int chunksize)
+{
+	if (chunksize > stats->ffs_max)
+		stats->ffs_max = chunksize;
+
+	if (chunksize < stats->ffs_min)
+		stats->ffs_min = chunksize;
+
+	stats->ffs_avg += chunksize;
+	stats->ffs_free_chunks_real++;
+}
+
+static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+				  unsigned int chunksize)
+{
+	o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
+	o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
+}
+
+static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+					  struct inode *gb_inode,
+					  struct ocfs2_dinode *gb_dinode,
+					  struct ocfs2_chain_rec *rec,
+					  struct ocfs2_info_freefrag *ffg,
+					  u32 chunks_in_group)
+{
+	int status = 0, used;
+	u64 blkno;
+
+	struct buffer_head *bh = NULL;
+	struct ocfs2_group_desc *bg = NULL;
+
+	unsigned int max_bits, num_clusters;
+	unsigned int offset = 0, cluster, chunk;
+	unsigned int chunk_free, last_chunksize = 0;
+
+	if (!le32_to_cpu(rec->c_free))
+		goto bail;
+
+	do {
+		if (!bg)
+			blkno = le64_to_cpu(rec->c_blkno);
+		else
+			blkno = le64_to_cpu(bg->bg_next_group);
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		if (o2info_coherent(&ffg->iff_req))
+			status = ocfs2_read_group_descriptor(gb_inode,
+							     gb_dinode,
+							     blkno, &bh);
+		else
+			status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+
+		if (status < 0) {
+			mlog(ML_ERROR, "Can't read the group descriptor # "
+			     "%llu from device.", (unsigned long long)blkno);
+			status = -EIO;
+			goto bail;
+		}
+
+		bg = (struct ocfs2_group_desc *)bh->b_data;
+
+		if (!le16_to_cpu(bg->bg_free_bits_count))
+			continue;
+
+		max_bits = le16_to_cpu(bg->bg_bits);
+		offset = 0;
+
+		for (chunk = 0; chunk < chunks_in_group; chunk++) {
+			/*
+			 * last chunk may be not an entire one.
+			 */
+			if ((offset + ffg->iff_chunksize) > max_bits)
+				num_clusters = max_bits - offset;
+			else
+				num_clusters = ffg->iff_chunksize;
+
+			chunk_free = 0;
+			for (cluster = 0; cluster < num_clusters; cluster++) {
+				used = ocfs2_test_bit(offset,
+						(unsigned long *)bg->bg_bitmap);
+				/*
+				 * - chunk_free counts free clusters in #N chunk.
+				 * - last_chunksize records the size(in) clusters
+				 *   for the last real free chunk being counted.
+				 */
+				if (!used) {
+					last_chunksize++;
+					chunk_free++;
+				}
+
+				if (used && last_chunksize) {
+					ocfs2_info_update_ffg(ffg,
+							      last_chunksize);
+					last_chunksize = 0;
+				}
+
+				offset++;
+			}
+
+			if (chunk_free == ffg->iff_chunksize)
+				ffg->iff_ffs.ffs_free_chunks++;
+		}
+
+		/*
+		 * need to update the info for last free chunk.
+		 */
+		if (last_chunksize)
+			ocfs2_info_update_ffg(ffg, last_chunksize);
+
+	} while (le64_to_cpu(bg->bg_next_group));
+
+bail:
+	brelse(bh);
+
+	return status;
+}
+
+static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+					   struct inode *gb_inode, u64 blkno,
+					   struct ocfs2_info_freefrag *ffg)
+{
+	u32 chunks_in_group;
+	int status = 0, unlock = 0, i;
+
+	struct buffer_head *bh = NULL;
+	struct ocfs2_chain_list *cl = NULL;
+	struct ocfs2_chain_rec *rec = NULL;
+	struct ocfs2_dinode *gb_dinode = NULL;
+
+	if (gb_inode)
+		mutex_lock(&gb_inode->i_mutex);
+
+	if (o2info_coherent(&ffg->iff_req)) {
+		status = ocfs2_inode_lock(gb_inode, &bh, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		unlock = 1;
+	} else {
+		status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	gb_dinode = (struct ocfs2_dinode *)bh->b_data;
+	cl = &(gb_dinode->id2.i_chain);
+
+	/*
+	 * Chunksize(in) clusters from userspace should be
+	 * less than clusters in a group.
+	 */
+	if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
+		status = -EINVAL;
+		goto bail;
+	}
+
+	memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
+
+	ffg->iff_ffs.ffs_min = ~0U;
+	ffg->iff_ffs.ffs_clusters =
+			le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
+	ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
+			le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
+
+	chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
+
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+		rec = &(cl->cl_recs[i]);
+		status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
+							gb_dinode,
+							rec, ffg,
+							chunks_in_group);
+		if (status)
+			goto bail;
+	}
+
+	if (ffg->iff_ffs.ffs_free_chunks_real)
+		ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
+					ffg->iff_ffs.ffs_free_chunks_real);
+bail:
+	if (unlock)
+		ocfs2_inode_unlock(gb_inode, 0);
+
+	if (gb_inode)
+		mutex_unlock(&gb_inode->i_mutex);
+
+	if (gb_inode)
+		iput(gb_inode);
+
+	brelse(bh);
+
+	return status;
+}
+
+static int ocfs2_info_handle_freefrag(struct inode *inode,
+				      struct ocfs2_info_request __user *req)
+{
+	u64 blkno = -1;
+	char namebuf[40];
+	int status, type = GLOBAL_BITMAP_SYSTEM_INODE;
+
+	struct ocfs2_info_freefrag *oiff;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *gb_inode = NULL;
+
+	oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
+	if (!oiff) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	if (o2info_from_user(*oiff, req)) {
+		status = -EFAULT;
+		goto out_free;
+	}
+	/*
+	 * chunksize from userspace should be power of 2.
+	 */
+	if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
+	    (!oiff->iff_chunksize)) {
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (o2info_coherent(&oiff->iff_req)) {
+		gb_inode = ocfs2_get_system_file_inode(osb, type,
+						       OCFS2_INVALID_SLOT);
+		if (!gb_inode) {
+			mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+			status = -EIO;
+			goto bail;
+		}
+	} else {
+		ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
+						OCFS2_INVALID_SLOT);
+		status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+						    namebuf,
+						    strlen(namebuf),
+						    &blkno);
+		if (status < 0) {
+			status = -ENOENT;
+			goto bail;
+		}
+	}
+
+	status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
+	if (status < 0)
+		goto bail;
+
+	o2info_set_request_filled(&oiff->iff_req);
+
+	if (o2info_to_user(*oiff, req)) {
+		status = -EFAULT;
+		goto out_free;
+	}
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oiff->iff_req, req);
+out_free:
+	kfree(oiff);
+out_err:
+	return status;
+}
+
+static int ocfs2_info_handle_unknown(struct inode *inode,
+				     struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_request oir;
+
+	if (o2info_from_user(oir, req))
+		return -EFAULT;
+
+	o2info_clear_request_filled(&oir);
+
+	if (o2info_to_user(oir, req))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+static int ocfs2_info_handle_request(struct inode *inode,
+				     struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_request oir;
+
+	if (o2info_from_user(oir, req))
+		goto bail;
+
+	status = -EINVAL;
+	if (oir.ir_magic != OCFS2_INFO_MAGIC)
+		goto bail;
+
+	switch (oir.ir_code) {
+	case OCFS2_INFO_BLOCKSIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+			status = ocfs2_info_handle_blocksize(inode, req);
+		break;
+	case OCFS2_INFO_CLUSTERSIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+			status = ocfs2_info_handle_clustersize(inode, req);
+		break;
+	case OCFS2_INFO_MAXSLOTS:
+		if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+			status = ocfs2_info_handle_maxslots(inode, req);
+		break;
+	case OCFS2_INFO_LABEL:
+		if (oir.ir_size == sizeof(struct ocfs2_info_label))
+			status = ocfs2_info_handle_label(inode, req);
+		break;
+	case OCFS2_INFO_UUID:
+		if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+			status = ocfs2_info_handle_uuid(inode, req);
+		break;
+	case OCFS2_INFO_FS_FEATURES:
+		if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+			status = ocfs2_info_handle_fs_features(inode, req);
+		break;
+	case OCFS2_INFO_JOURNAL_SIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+			status = ocfs2_info_handle_journal_size(inode, req);
+		break;
+	case OCFS2_INFO_FREEINODE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
+			status = ocfs2_info_handle_freeinode(inode, req);
+		break;
+	case OCFS2_INFO_FREEFRAG:
+		if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
+			status = ocfs2_info_handle_freefrag(inode, req);
+		break;
+	default:
+		status = ocfs2_info_handle_unknown(inode, req);
+		break;
+	}
+
+bail:
+	return status;
+}
+
+static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+				 u64 *req_addr, int compat_flag)
+{
+	int status = -EFAULT;
+	u64 __user *bp = NULL;
+
+	if (compat_flag) {
+#ifdef CONFIG_COMPAT
+		/*
+		 * pointer bp stores the base address of a pointers array,
+		 * which collects all addresses of separate request.
+		 */
+		bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+		BUG();
+#endif
+	} else
+		bp = (u64 __user *)(unsigned long)(info->oi_requests);
+
+	if (o2info_from_user(*req_addr, bp + idx))
+		goto bail;
+
+	status = 0;
+bail:
+	return status;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * validate the request count from header, then break it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+			     int compat_flag)
+{
+	int i, status = 0;
+	u64 req_addr;
+	struct ocfs2_info_request __user *reqp;
+
+	if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+	    (!info->oi_requests)) {
+		status = -EINVAL;
+		goto bail;
+	}
+
+	for (i = 0; i < info->oi_count; i++) {
+
+		status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+		if (status)
+			break;
+
+		reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
+		if (!reqp) {
+			status = -EINVAL;
+			goto bail;
+		}
+
+		status = ocfs2_info_handle_request(inode, reqp);
+		if (status)
+			break;
+	}
+
+bail:
+	return status;
+}
+
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	unsigned int flags;
+	int new_clusters;
+	int status;
+	struct ocfs2_space_resv sr;
+	struct ocfs2_new_group_input input;
+	struct reflink_arguments args;
+	const char __user *old_path;
+	const char __user *new_path;
+	bool preserve;
+	struct ocfs2_info info;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case OCFS2_IOC_GETFLAGS:
+		status = ocfs2_get_inode_attr(inode, &flags);
+		if (status < 0)
+			return status;
+
+		flags &= OCFS2_FL_VISIBLE;
+		return put_user(flags, (int __user *) arg);
+	case OCFS2_IOC_SETFLAGS:
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		status = mnt_want_write_file(filp);
+		if (status)
+			return status;
+		status = ocfs2_set_inode_attr(inode, flags,
+			OCFS2_FL_MODIFIABLE);
+		mnt_drop_write_file(filp);
+		return status;
+	case OCFS2_IOC_RESVSP:
+	case OCFS2_IOC_RESVSP64:
+	case OCFS2_IOC_UNRESVSP:
+	case OCFS2_IOC_UNRESVSP64:
+		if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
+			return -EFAULT;
+
+		return ocfs2_change_file_space(filp, cmd, &sr);
+	case OCFS2_IOC_GROUP_EXTEND:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		if (get_user(new_clusters, (int __user *)arg))
+			return -EFAULT;
+
+		status = mnt_want_write_file(filp);
+		if (status)
+			return status;
+		status = ocfs2_group_extend(inode, new_clusters);
+		mnt_drop_write_file(filp);
+		return status;
+	case OCFS2_IOC_GROUP_ADD:
+	case OCFS2_IOC_GROUP_ADD64:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
+			return -EFAULT;
+
+		status = mnt_want_write_file(filp);
+		if (status)
+			return status;
+		status = ocfs2_group_add(inode, &input);
+		mnt_drop_write_file(filp);
+		return status;
+	case OCFS2_IOC_REFLINK:
+		if (copy_from_user(&args, argp, sizeof(args)))
+			return -EFAULT;
+		old_path = (const char __user *)(unsigned long)args.old_path;
+		new_path = (const char __user *)(unsigned long)args.new_path;
+		preserve = (args.preserve != 0);
+
+		return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+	case OCFS2_IOC_INFO:
+		if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
+			return -EFAULT;
+
+		return ocfs2_info_handle(inode, &info, 0);
+	case FITRIM:
+	{
+		struct super_block *sb = inode->i_sb;
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		struct fstrim_range range;
+		int ret = 0;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (!blk_queue_discard(q))
+			return -EOPNOTSUPP;
+
+		if (copy_from_user(&range, argp, sizeof(range)))
+			return -EFAULT;
+
+		range.minlen = max_t(u64, q->limits.discard_granularity,
+				     range.minlen);
+		ret = ocfs2_trim_fs(sb, &range);
+		if (ret < 0)
+			return ret;
+
+		if (copy_to_user(argp, &range, sizeof(range)))
+			return -EFAULT;
+
+		return 0;
+	}
+	case OCFS2_IOC_MOVE_EXT:
+		return ocfs2_ioctl_move_extents(filp, argp);
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	bool preserve;
+	struct reflink_arguments args;
+	struct inode *inode = file_inode(file);
+	struct ocfs2_info info;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case OCFS2_IOC32_GETFLAGS:
+		cmd = OCFS2_IOC_GETFLAGS;
+		break;
+	case OCFS2_IOC32_SETFLAGS:
+		cmd = OCFS2_IOC_SETFLAGS;
+		break;
+	case OCFS2_IOC_RESVSP:
+	case OCFS2_IOC_RESVSP64:
+	case OCFS2_IOC_UNRESVSP:
+	case OCFS2_IOC_UNRESVSP64:
+	case OCFS2_IOC_GROUP_EXTEND:
+	case OCFS2_IOC_GROUP_ADD:
+	case OCFS2_IOC_GROUP_ADD64:
+	case FITRIM:
+		break;
+	case OCFS2_IOC_REFLINK:
+		if (copy_from_user(&args, argp, sizeof(args)))
+			return -EFAULT;
+		preserve = (args.preserve != 0);
+
+		return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
+					   compat_ptr(args.new_path), preserve);
+	case OCFS2_IOC_INFO:
+		if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
+			return -EFAULT;
+
+		return ocfs2_info_handle(inode, &info, 1);
+	case OCFS2_IOC_MOVE_EXT:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return ocfs2_ioctl(file, cmd, arg);
+}
+#endif
diff --git a/kernel/fs/ocfs2/ioctl.h b/kernel/fs/ocfs2/ioctl.h
new file mode 100644
index 000000000..0cd5323bd
--- /dev/null
+++ b/kernel/fs/ocfs2/ioctl.h
@@ -0,0 +1,16 @@
+/*
+ * ioctl.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2006 Herbert Poetzl
+ *
+ */
+
+#ifndef OCFS2_IOCTL_PROTO_H
+#define OCFS2_IOCTL_PROTO_H
+
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+
+#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/kernel/fs/ocfs2/journal.c b/kernel/fs/ocfs2/journal.c
new file mode 100644
index 000000000..ff5319282
--- /dev/null
+++ b/kernel/fs/ocfs2/journal.c
@@ -0,0 +1,2323 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.c
+ *
+ * Defines functions of journalling api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "blockcheck.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "quota.h"
+#include "file.h"
+#include "namei.h"
+
+#include "buffer_head_io.h"
+#include "ocfs2_trace.h"
+
+DEFINE_SPINLOCK(trans_inc_lock);
+
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
+
+static int ocfs2_force_read_journal(struct inode *inode);
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num, int slot_num);
+static int __ocfs2_recovery_thread(void *arg);
+static int ocfs2_commit_cache(struct ocfs2_super *osb);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty, int replayed);
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num);
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot,
+				 enum ocfs2_orphan_reco_type orphan_reco_type);
+static int ocfs2_commit_thread(void *arg);
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec,
+					    enum ocfs2_orphan_reco_type orphan_reco_type);
+
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+	return __ocfs2_wait_on_mount(osb, 0);
+}
+
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{
+	return __ocfs2_wait_on_mount(osb, 1);
+}
+
+/*
+ * This replay_map is to track online/offline slots, so we could recover
+ * offline slots during recovery and mount
+ */
+
+enum ocfs2_replay_state {
+	REPLAY_UNNEEDED = 0,	/* Replay is not needed, so ignore this map */
+	REPLAY_NEEDED, 		/* Replay slots marked in rm_replay_slots */
+	REPLAY_DONE 		/* Replay was already queued */
+};
+
+struct ocfs2_replay_map {
+	unsigned int rm_slots;
+	enum ocfs2_replay_state rm_state;
+	unsigned char rm_replay_slots[0];
+};
+
+void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+{
+	if (!osb->replay_map)
+		return;
+
+	/* If we've already queued the replay, we don't have any more to do */
+	if (osb->replay_map->rm_state == REPLAY_DONE)
+		return;
+
+	osb->replay_map->rm_state = state;
+}
+
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map;
+	int i, node_num;
+
+	/* If replay map is already set, we don't do it again */
+	if (osb->replay_map)
+		return 0;
+
+	replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
+			     (osb->max_slots * sizeof(char)), GFP_KERNEL);
+
+	if (!replay_map) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	spin_lock(&osb->osb_lock);
+
+	replay_map->rm_slots = osb->max_slots;
+	replay_map->rm_state = REPLAY_UNNEEDED;
+
+	/* set rm_replay_slots for offline slot(s) */
+	for (i = 0; i < replay_map->rm_slots; i++) {
+		if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
+			replay_map->rm_replay_slots[i] = 1;
+	}
+
+	osb->replay_map = replay_map;
+	spin_unlock(&osb->osb_lock);
+	return 0;
+}
+
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+		enum ocfs2_orphan_reco_type orphan_reco_type)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+	int i;
+
+	if (!replay_map)
+		return;
+
+	if (replay_map->rm_state != REPLAY_NEEDED)
+		return;
+
+	for (i = 0; i < replay_map->rm_slots; i++)
+		if (replay_map->rm_replay_slots[i])
+			ocfs2_queue_recovery_completion(osb->journal, i, NULL,
+							NULL, NULL,
+							orphan_reco_type);
+	replay_map->rm_state = REPLAY_DONE;
+}
+
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+
+	if (!osb->replay_map)
+		return;
+
+	kfree(replay_map);
+	osb->replay_map = NULL;
+}
+
+int ocfs2_recovery_init(struct ocfs2_super *osb)
+{
+	struct ocfs2_recovery_map *rm;
+
+	mutex_init(&osb->recovery_lock);
+	osb->disable_recovery = 0;
+	osb->recovery_thread_task = NULL;
+	init_waitqueue_head(&osb->recovery_event);
+
+	rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
+		     osb->max_slots * sizeof(unsigned int),
+		     GFP_KERNEL);
+	if (!rm) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	rm->rm_entries = (unsigned int *)((char *)rm +
+					  sizeof(struct ocfs2_recovery_map));
+	osb->recovery_map = rm;
+
+	return 0;
+}
+
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+	mb();
+	return osb->recovery_thread_task != NULL;
+}
+
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+	struct ocfs2_recovery_map *rm;
+
+	/* disable any new recovery threads and wait for any currently
+	 * running ones to exit. Do this before setting the vol_state. */
+	mutex_lock(&osb->recovery_lock);
+	osb->disable_recovery = 1;
+	mutex_unlock(&osb->recovery_lock);
+	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+
+	/* At this point, we know that no more recovery threads can be
+	 * launched, so wait for any recovery completion work to
+	 * complete. */
+	flush_workqueue(ocfs2_wq);
+
+	/*
+	 * Now that recovery is shut down, and the osb is about to be
+	 * freed,  the osb_lock is not taken here.
+	 */
+	rm = osb->recovery_map;
+	/* XXX: Should we bug if there are dirty entries? */
+
+	kfree(rm);
+}
+
+static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
+				     unsigned int node_num)
+{
+	int i;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	assert_spin_locked(&osb->osb_lock);
+
+	for (i = 0; i < rm->rm_used; i++) {
+		if (rm->rm_entries[i] == node_num)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Behaves like test-and-set.  Returns the previous value */
+static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+				  unsigned int node_num)
+{
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+	if (__ocfs2_recovery_map_test(osb, node_num)) {
+		spin_unlock(&osb->osb_lock);
+		return 1;
+	}
+
+	/* XXX: Can this be exploited? Not from o2dlm... */
+	BUG_ON(rm->rm_used >= osb->max_slots);
+
+	rm->rm_entries[rm->rm_used] = node_num;
+	rm->rm_used++;
+	spin_unlock(&osb->osb_lock);
+
+	return 0;
+}
+
+static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+				     unsigned int node_num)
+{
+	int i;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+
+	for (i = 0; i < rm->rm_used; i++) {
+		if (rm->rm_entries[i] == node_num)
+			break;
+	}
+
+	if (i < rm->rm_used) {
+		/* XXX: be careful with the pointer math */
+		memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
+			(rm->rm_used - i - 1) * sizeof(unsigned int));
+		rm->rm_used--;
+	}
+
+	spin_unlock(&osb->osb_lock);
+}
+
+static int ocfs2_commit_cache(struct ocfs2_super *osb)
+{
+	int status = 0;
+	unsigned int flushed;
+	struct ocfs2_journal *journal = NULL;
+
+	journal = osb->journal;
+
+	/* Flush all pending commits and checkpoint the journal. */
+	down_write(&journal->j_trans_barrier);
+
+	flushed = atomic_read(&journal->j_num_trans);
+	trace_ocfs2_commit_cache_begin(flushed);
+	if (flushed == 0) {
+		up_write(&journal->j_trans_barrier);
+		goto finally;
+	}
+
+	jbd2_journal_lock_updates(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal);
+	jbd2_journal_unlock_updates(journal->j_journal);
+	if (status < 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog_errno(status);
+		goto finally;
+	}
+
+	ocfs2_inc_trans_id(journal);
+
+	flushed = atomic_read(&journal->j_num_trans);
+	atomic_set(&journal->j_num_trans, 0);
+	up_write(&journal->j_trans_barrier);
+
+	trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed);
+
+	ocfs2_wake_downconvert_thread(osb);
+	wake_up(&journal->j_checkpointed);
+finally:
+	return status;
+}
+
+handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
+{
+	journal_t *journal = osb->journal->j_journal;
+	handle_t *handle;
+
+	BUG_ON(!osb || !osb->journal->j_journal);
+
+	if (ocfs2_is_hard_readonly(osb))
+		return ERR_PTR(-EROFS);
+
+	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
+	BUG_ON(max_buffs <= 0);
+
+	/* Nested transaction? Just return the handle... */
+	if (journal_current_handle())
+		return jbd2_journal_start(journal, max_buffs);
+
+	sb_start_intwrite(osb->sb);
+
+	down_read(&osb->journal->j_trans_barrier);
+
+	handle = jbd2_journal_start(journal, max_buffs);
+	if (IS_ERR(handle)) {
+		up_read(&osb->journal->j_trans_barrier);
+		sb_end_intwrite(osb->sb);
+
+		mlog_errno(PTR_ERR(handle));
+
+		if (is_journal_aborted(journal)) {
+			ocfs2_abort(osb->sb, "Detected aborted journal");
+			handle = ERR_PTR(-EROFS);
+		}
+	} else {
+		if (!ocfs2_mount_local(osb))
+			atomic_inc(&(osb->journal->j_num_trans));
+	}
+
+	return handle;
+}
+
+int ocfs2_commit_trans(struct ocfs2_super *osb,
+		       handle_t *handle)
+{
+	int ret, nested;
+	struct ocfs2_journal *journal = osb->journal;
+
+	BUG_ON(!handle);
+
+	nested = handle->h_ref > 1;
+	ret = jbd2_journal_stop(handle);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	if (!nested) {
+		up_read(&journal->j_trans_barrier);
+		sb_end_intwrite(osb->sb);
+	}
+
+	return ret;
+}
+
+/*
+ * 'nblocks' is what you want to add to the current transaction.
+ *
+ * This might call jbd2_journal_restart() which will commit dirty buffers
+ * and then restart the transaction. Before calling
+ * ocfs2_extend_trans(), any changed blocks should have been
+ * dirtied. After calling it, all blocks which need to be changed must
+ * go through another set of journal_access/journal_dirty calls.
+ *
+ * WARNING: This will not release any semaphores or disk locks taken
+ * during the transaction, so make sure they were taken *before*
+ * start_trans or we'll have ordering deadlocks.
+ *
+ * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
+ * good because transaction ids haven't yet been recorded on the
+ * cluster locks associated with this handle.
+ */
+int ocfs2_extend_trans(handle_t *handle, int nblocks)
+{
+	int status, old_nblocks;
+
+	BUG_ON(!handle);
+	BUG_ON(nblocks < 0);
+
+	if (!nblocks)
+		return 0;
+
+	old_nblocks = handle->h_buffer_credits;
+
+	trace_ocfs2_extend_trans(old_nblocks, nblocks);
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+	status = 1;
+#else
+	status = jbd2_journal_extend(handle, nblocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+#endif
+
+	if (status > 0) {
+		trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
+		status = jbd2_journal_restart(handle,
+					      old_nblocks + nblocks);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = 0;
+bail:
+	return status;
+}
+
+/*
+ * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for metadata modifications.
+ * Taken from Ext4: extend_or_restart_transaction()
+ */
+int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
+{
+	int status, old_nblks;
+
+	BUG_ON(!handle);
+
+	old_nblks = handle->h_buffer_credits;
+	trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
+
+	if (old_nblks < thresh)
+		return 0;
+
+	status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (status > 0) {
+		status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+bail:
+	return status;
+}
+
+
+struct ocfs2_triggers {
+	struct jbd2_buffer_trigger_type	ot_triggers;
+	int				ot_offset;
+};
+
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{
+	return container_of(triggers, struct ocfs2_triggers, ot_triggers);
+}
+
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, data + ot->ot_offset);
+}
+
+/*
+ * Quota blocks have their own trigger because the struct ocfs2_block_check
+ * offset depends on the blocksize.
+ */
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_disk_dqtrailer *dqt =
+		ocfs2_block_dqtrailer(size, data);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, &dqt->dq_check);
+}
+
+/*
+ * Directory blocks also have their own trigger because the
+ * struct ocfs2_block_check offset depends on the blocksize.
+ */
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_dir_trailer_from_size(size, data);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, &trailer->db_check);
+}
+
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+				struct buffer_head *bh)
+{
+	mlog(ML_ERROR,
+	     "ocfs2_abort_trigger called by JBD2.  bh = 0x%lx, "
+	     "bh->b_blocknr = %llu\n",
+	     (unsigned long)bh,
+	     (unsigned long long)bh->b_blocknr);
+
+	/* We aren't guaranteed to have the superblock here - but if we
+	 * don't, it'll just crash. */
+	ocfs2_error(bh->b_assoc_map->host->i_sb,
+		    "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+
+static struct ocfs2_triggers di_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dinode, i_check),
+};
+
+static struct ocfs2_triggers eb_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
+};
+
+static struct ocfs2_triggers rb_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_refcount_block, rf_check),
+};
+
+static struct ocfs2_triggers gd_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_group_desc, bg_check),
+};
+
+static struct ocfs2_triggers db_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_db_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+};
+
+static struct ocfs2_triggers xb_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_xattr_block, xb_check),
+};
+
+static struct ocfs2_triggers dq_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_dq_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+};
+
+static struct ocfs2_triggers dr_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_root_block, dr_check),
+};
+
+static struct ocfs2_triggers dl_triggers = {
+	.ot_triggers = {
+		.t_frozen = ocfs2_frozen_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_leaf, dl_check),
+};
+
+static int __ocfs2_journal_access(handle_t *handle,
+				  struct ocfs2_caching_info *ci,
+				  struct buffer_head *bh,
+				  struct ocfs2_triggers *triggers,
+				  int type)
+{
+	int status;
+	struct ocfs2_super *osb =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+	BUG_ON(!ci || !ci->ci_ops);
+	BUG_ON(!handle);
+	BUG_ON(!bh);
+
+	trace_ocfs2_journal_access(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)bh->b_blocknr, type, bh->b_size);
+
+	/* we can safely remove this assertion after testing. */
+	if (!buffer_uptodate(bh)) {
+		mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
+		mlog(ML_ERROR, "b_blocknr=%llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		BUG();
+	}
+
+	/* Set the current transaction information on the ci so
+	 * that the locking code knows whether it can drop it's locks
+	 * on this ci or not. We're protected from the commit
+	 * thread updating the current transaction id until
+	 * ocfs2_commit_trans() because ocfs2_start_trans() took
+	 * j_trans_barrier for us. */
+	ocfs2_set_ci_lock_trans(osb->journal, ci);
+
+	ocfs2_metadata_cache_io_lock(ci);
+	switch (type) {
+	case OCFS2_JOURNAL_ACCESS_CREATE:
+	case OCFS2_JOURNAL_ACCESS_WRITE:
+		status = jbd2_journal_get_write_access(handle, bh);
+		break;
+
+	case OCFS2_JOURNAL_ACCESS_UNDO:
+		status = jbd2_journal_get_undo_access(handle, bh);
+		break;
+
+	default:
+		status = -EINVAL;
+		mlog(ML_ERROR, "Unknown access type!\n");
+	}
+	if (!status && ocfs2_meta_ecc(osb) && triggers)
+		jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
+	ocfs2_metadata_cache_io_unlock(ci);
+
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
+		     status, type);
+
+	return status;
+}
+
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
+}
+
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
+}
+
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
+}
+
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
+}
+
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
+}
+
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
+}
+
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
+}
+
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
+			 struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, NULL, type);
+}
+
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
+{
+	int status;
+
+	trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
+
+	status = jbd2_journal_dirty_metadata(handle, bh);
+	BUG_ON(status);
+}
+
+#define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
+
+void ocfs2_set_journal_params(struct ocfs2_super *osb)
+{
+	journal_t *journal = osb->journal->j_journal;
+	unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+
+	if (osb->osb_commit_interval)
+		commit_interval = osb->osb_commit_interval;
+
+	write_lock(&journal->j_state_lock);
+	journal->j_commit_interval = commit_interval;
+	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+		journal->j_flags |= JBD2_BARRIER;
+	else
+		journal->j_flags &= ~JBD2_BARRIER;
+	write_unlock(&journal->j_state_lock);
+}
+
+int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+{
+	int status = -1;
+	struct inode *inode = NULL; /* the journal inode */
+	journal_t *j_journal = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_super *osb;
+	int inode_lock = 0;
+
+	BUG_ON(!journal);
+
+	osb = journal->j_osb;
+
+	/* already have the inode for our journal */
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    osb->slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto done;
+	}
+
+	SET_INODE_JOURNAL(inode);
+	OCFS2_I(inode)->ip_open_count++;
+
+	/* Skip recovery waits here - journal inode metadata never
+	 * changes in a live cluster so it can be considered an
+	 * exception to the rule. */
+	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+	if (status < 0) {
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not get lock on journal!\n");
+		goto done;
+	}
+
+	inode_lock = 1;
+	di = (struct ocfs2_dinode *)bh->b_data;
+
+	if (i_size_read(inode) <  OCFS2_MIN_JOURNAL_SIZE) {
+		mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
+		     i_size_read(inode));
+		status = -EINVAL;
+		goto done;
+	}
+
+	trace_ocfs2_journal_init(i_size_read(inode),
+				 (unsigned long long)inode->i_blocks,
+				 OCFS2_I(inode)->ip_clusters);
+
+	/* call the kernels journal init function now */
+	j_journal = jbd2_journal_init_inode(inode);
+	if (j_journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EINVAL;
+		goto done;
+	}
+
+	trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
+
+	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
+		  OCFS2_JOURNAL_DIRTY_FL);
+
+	journal->j_journal = j_journal;
+	journal->j_inode = inode;
+	journal->j_bh = bh;
+
+	ocfs2_set_journal_params(osb);
+
+	journal->j_state = OCFS2_JOURNAL_LOADED;
+
+	status = 0;
+done:
+	if (status < 0) {
+		if (inode_lock)
+			ocfs2_inode_unlock(inode, 1);
+		brelse(bh);
+		if (inode) {
+			OCFS2_I(inode)->ip_open_count--;
+			iput(inode);
+		}
+	}
+
+	return status;
+}
+
+static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
+{
+	le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
+}
+
+static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
+{
+	return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
+}
+
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty, int replayed)
+{
+	int status;
+	unsigned int flags;
+	struct ocfs2_journal *journal = osb->journal;
+	struct buffer_head *bh = journal->j_bh;
+	struct ocfs2_dinode *fe;
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+
+	/* The journal bh on the osb always comes from ocfs2_journal_init()
+	 * and was validated there inside ocfs2_inode_lock_full().  It's a
+	 * code bug if we mess it up. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	if (dirty)
+		flags |= OCFS2_JOURNAL_DIRTY_FL;
+	else
+		flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	if (replayed)
+		ocfs2_bump_recovery_generation(fe);
+
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
+	status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode));
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * If the journal has been kmalloc'd it needs to be freed after this
+ * call.
+ */
+void ocfs2_journal_shutdown(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = NULL;
+	int status = 0;
+	struct inode *inode = NULL;
+	int num_running_trans = 0;
+
+	BUG_ON(!osb);
+
+	journal = osb->journal;
+	if (!journal)
+		goto done;
+
+	inode = journal->j_inode;
+
+	if (journal->j_state != OCFS2_JOURNAL_LOADED)
+		goto done;
+
+	/* need to inc inode use count - jbd2_journal_destroy will iput. */
+	if (!igrab(inode))
+		BUG();
+
+	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
+	trace_ocfs2_journal_shutdown(num_running_trans);
+
+	/* Do a commit_cache here. It will flush our journal, *and*
+	 * release any locks that are still held.
+	 * set the SHUTDOWN flag and release the trans lock.
+	 * the commit thread will take the trans lock for us below. */
+	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
+
+	/* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
+	 * drop the trans_lock (which we want to hold until we
+	 * completely destroy the journal. */
+	if (osb->commit_task) {
+		/* Wait for the commit thread */
+		trace_ocfs2_journal_shutdown_wait(osb->commit_task);
+		kthread_stop(osb->commit_task);
+		osb->commit_task = NULL;
+	}
+
+	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
+
+	if (ocfs2_mount_local(osb)) {
+		jbd2_journal_lock_updates(journal->j_journal);
+		status = jbd2_journal_flush(journal->j_journal);
+		jbd2_journal_unlock_updates(journal->j_journal);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	if (status == 0) {
+		/*
+		 * Do not toggle if flush was unsuccessful otherwise
+		 * will leave dirty metadata in a "clean" journal
+		 */
+		status = ocfs2_journal_toggle_dirty(osb, 0, 0);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	/* Shutdown the kernel journal system */
+	jbd2_journal_destroy(journal->j_journal);
+	journal->j_journal = NULL;
+
+	OCFS2_I(inode)->ip_open_count--;
+
+	/* unlock our journal */
+	ocfs2_inode_unlock(inode, 1);
+
+	brelse(journal->j_bh);
+	journal->j_bh = NULL;
+
+	journal->j_state = OCFS2_JOURNAL_FREE;
+
+//	up_write(&journal->j_trans_barrier);
+done:
+	if (inode)
+		iput(inode);
+}
+
+static void ocfs2_clear_journal_error(struct super_block *sb,
+				      journal_t *journal,
+				      int slot)
+{
+	int olderr;
+
+	olderr = jbd2_journal_errno(journal);
+	if (olderr) {
+		mlog(ML_ERROR, "File system error %d recorded in "
+		     "journal %u.\n", olderr, slot);
+		mlog(ML_ERROR, "File system on device %s needs checking.\n",
+		     sb->s_id);
+
+		jbd2_journal_ack_err(journal);
+		jbd2_journal_clear_err(journal);
+	}
+}
+
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
+{
+	int status = 0;
+	struct ocfs2_super *osb;
+
+	BUG_ON(!journal);
+
+	osb = journal->j_osb;
+
+	status = jbd2_journal_load(journal->j_journal);
+	if (status < 0) {
+		mlog(ML_ERROR, "Failed to load journal!\n");
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+
+	status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Launch the commit thread */
+	if (!local) {
+		osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
+					       "ocfs2cmt");
+		if (IS_ERR(osb->commit_task)) {
+			status = PTR_ERR(osb->commit_task);
+			osb->commit_task = NULL;
+			mlog(ML_ERROR, "unable to launch ocfs2commit thread, "
+			     "error=%d", status);
+			goto done;
+		}
+	} else
+		osb->commit_task = NULL;
+
+done:
+	return status;
+}
+
+
+/* 'full' flag tells us whether we clear out all blocks or if we just
+ * mark the journal clean */
+int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
+{
+	int status;
+
+	BUG_ON(!journal);
+
+	status = jbd2_journal_wipe(journal->j_journal, full);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	return status;
+}
+
+static int ocfs2_recovery_completed(struct ocfs2_super *osb)
+{
+	int empty;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+	empty = (rm->rm_used == 0);
+	spin_unlock(&osb->osb_lock);
+
+	return empty;
+}
+
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
+{
+	wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
+}
+
+/*
+ * JBD Might read a cached version of another nodes journal file. We
+ * don't want this as this file changes often and we get no
+ * notification on those changes. The only way to be sure that we've
+ * got the most up to date version of those blocks then is to force
+ * read them off disk. Just searching through the buffer cache won't
+ * work as there may be pages backing this file which are still marked
+ * up to date. We know things can't change on this file underneath us
+ * as we have the lock by now :)
+ */
+static int ocfs2_force_read_journal(struct inode *inode)
+{
+	int status = 0;
+	int i;
+	u64 v_blkno, p_blkno, p_blocks, num_blocks;
+#define CONCURRENT_JOURNAL_FILL 32ULL
+	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
+
+	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
+
+	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	v_blkno = 0;
+	while (v_blkno < num_blocks) {
+		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
+						     &p_blkno, &p_blocks, NULL);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (p_blocks > CONCURRENT_JOURNAL_FILL)
+			p_blocks = CONCURRENT_JOURNAL_FILL;
+
+		/* We are reading journal data which should not
+		 * be put in the uptodate cache */
+		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
+						p_blkno, p_blocks, bhs);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = 0; i < p_blocks; i++) {
+			brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+
+		v_blkno += p_blocks;
+	}
+
+bail:
+	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
+		brelse(bhs[i]);
+	return status;
+}
+
+struct ocfs2_la_recovery_item {
+	struct list_head	lri_list;
+	int			lri_slot;
+	struct ocfs2_dinode	*lri_la_dinode;
+	struct ocfs2_dinode	*lri_tl_dinode;
+	struct ocfs2_quota_recovery *lri_qrec;
+	enum ocfs2_orphan_reco_type  lri_orphan_reco_type;
+};
+
+/* Does the second half of the recovery process. By this point, the
+ * node is marked clean and can actually be considered recovered,
+ * hence it's no longer in the recovery map, but there's still some
+ * cleanup we can do which shouldn't happen within the recovery thread
+ * as locking in that context becomes very difficult if we are to take
+ * recovering nodes into account.
+ *
+ * NOTE: This function can and will sleep on recovery of other nodes
+ * during cluster locking, just like any other ocfs2 process.
+ */
+void ocfs2_complete_recovery(struct work_struct *work)
+{
+	int ret = 0;
+	struct ocfs2_journal *journal =
+		container_of(work, struct ocfs2_journal, j_recovery_work);
+	struct ocfs2_super *osb = journal->j_osb;
+	struct ocfs2_dinode *la_dinode, *tl_dinode;
+	struct ocfs2_la_recovery_item *item, *n;
+	struct ocfs2_quota_recovery *qrec;
+	enum ocfs2_orphan_reco_type orphan_reco_type;
+	LIST_HEAD(tmp_la_list);
+
+	trace_ocfs2_complete_recovery(
+		(unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno);
+
+	spin_lock(&journal->j_lock);
+	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
+	spin_unlock(&journal->j_lock);
+
+	list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
+		list_del_init(&item->lri_list);
+
+		ocfs2_wait_on_quotas(osb);
+
+		la_dinode = item->lri_la_dinode;
+		tl_dinode = item->lri_tl_dinode;
+		qrec = item->lri_qrec;
+		orphan_reco_type = item->lri_orphan_reco_type;
+
+		trace_ocfs2_complete_recovery_slot(item->lri_slot,
+			la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
+			tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0,
+			qrec);
+
+		if (la_dinode) {
+			ret = ocfs2_complete_local_alloc_recovery(osb,
+								  la_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(la_dinode);
+		}
+
+		if (tl_dinode) {
+			ret = ocfs2_complete_truncate_log_recovery(osb,
+								   tl_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(tl_dinode);
+		}
+
+		ret = ocfs2_recover_orphans(osb, item->lri_slot,
+				orphan_reco_type);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		if (qrec) {
+			ret = ocfs2_finish_quota_recovery(osb, qrec,
+							  item->lri_slot);
+			if (ret < 0)
+				mlog_errno(ret);
+			/* Recovery info is already freed now */
+		}
+
+		kfree(item);
+	}
+
+	trace_ocfs2_complete_recovery_end(ret);
+}
+
+/* NOTE: This function always eats your references to la_dinode and
+ * tl_dinode, either manually on error, or by passing them to
+ * ocfs2_complete_recovery */
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec,
+					    enum ocfs2_orphan_reco_type orphan_reco_type)
+{
+	struct ocfs2_la_recovery_item *item;
+
+	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS);
+	if (!item) {
+		/* Though we wish to avoid it, we are in fact safe in
+		 * skipping local alloc cleanup as fsck.ocfs2 is more
+		 * than capable of reclaiming unused space. */
+		kfree(la_dinode);
+		kfree(tl_dinode);
+
+		if (qrec)
+			ocfs2_free_quota_recovery(qrec);
+
+		mlog_errno(-ENOMEM);
+		return;
+	}
+
+	INIT_LIST_HEAD(&item->lri_list);
+	item->lri_la_dinode = la_dinode;
+	item->lri_slot = slot_num;
+	item->lri_tl_dinode = tl_dinode;
+	item->lri_qrec = qrec;
+	item->lri_orphan_reco_type = orphan_reco_type;
+
+	spin_lock(&journal->j_lock);
+	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
+	queue_work(ocfs2_wq, &journal->j_recovery_work);
+	spin_unlock(&journal->j_lock);
+}
+
+/* Called by the mount code to queue recovery the last part of
+ * recovery for it's own and offline slot(s). */
+void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = osb->journal;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return;
+
+	/* No need to queue up our truncate_log as regular cleanup will catch
+	 * that */
+	ocfs2_queue_recovery_completion(journal, osb->slot_num,
+					osb->local_alloc_copy, NULL, NULL,
+					ORPHAN_NEED_TRUNCATE);
+	ocfs2_schedule_truncate_log_flush(osb, 0);
+
+	osb->local_alloc_copy = NULL;
+	osb->dirty = 0;
+
+	/* queue to recover orphan slots for all offline slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+	ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
+	ocfs2_free_replay_slots(osb);
+}
+
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+	if (osb->quota_rec) {
+		ocfs2_queue_recovery_completion(osb->journal,
+						osb->slot_num,
+						NULL,
+						NULL,
+						osb->quota_rec,
+						ORPHAN_NEED_TRUNCATE);
+		osb->quota_rec = NULL;
+	}
+}
+
+static int __ocfs2_recovery_thread(void *arg)
+{
+	int status, node_num, slot_num;
+	struct ocfs2_super *osb = arg;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+	int *rm_quota = NULL;
+	int rm_quota_used = 0, i;
+	struct ocfs2_quota_recovery *qrec;
+
+	status = ocfs2_wait_on_mount(osb);
+	if (status < 0) {
+		goto bail;
+	}
+
+	rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+	if (!rm_quota) {
+		status = -ENOMEM;
+		goto bail;
+	}
+restart:
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_compute_replay_slots(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* queue recovery for our own slot */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
+
+	spin_lock(&osb->osb_lock);
+	while (rm->rm_used) {
+		/* It's always safe to remove entry zero, as we won't
+		 * clear it until ocfs2_recover_node() has succeeded. */
+		node_num = rm->rm_entries[0];
+		spin_unlock(&osb->osb_lock);
+		slot_num = ocfs2_node_num_to_slot(osb, node_num);
+		trace_ocfs2_recovery_thread_node(node_num, slot_num);
+		if (slot_num == -ENOENT) {
+			status = 0;
+			goto skip_recovery;
+		}
+
+		/* It is a bit subtle with quota recovery. We cannot do it
+		 * immediately because we have to obtain cluster locks from
+		 * quota files and we also don't want to just skip it because
+		 * then quota usage would be out of sync until some node takes
+		 * the slot. So we remember which nodes need quota recovery
+		 * and when everything else is done, we recover quotas. */
+		for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
+		if (i == rm_quota_used)
+			rm_quota[rm_quota_used++] = slot_num;
+
+		status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
+		if (!status) {
+			ocfs2_recovery_map_clear(osb, node_num);
+		} else {
+			mlog(ML_ERROR,
+			     "Error %d recovering node %d on device (%u,%u)!\n",
+			     status, node_num,
+			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+			mlog(ML_ERROR, "Volume requires unmount.\n");
+		}
+
+		spin_lock(&osb->osb_lock);
+	}
+	spin_unlock(&osb->osb_lock);
+	trace_ocfs2_recovery_thread_end(status);
+
+	/* Refresh all journal recovery generations from disk */
+	status = ocfs2_check_journals_nolocks(osb);
+	status = (status == -EROFS) ? 0 : status;
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Now it is right time to recover quotas... We have to do this under
+	 * superblock lock so that no one can start using the slot (and crash)
+	 * before we recover it */
+	for (i = 0; i < rm_quota_used; i++) {
+		qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+		if (IS_ERR(qrec)) {
+			status = PTR_ERR(qrec);
+			mlog_errno(status);
+			continue;
+		}
+		ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+						NULL, NULL, qrec,
+						ORPHAN_NEED_TRUNCATE);
+	}
+
+	ocfs2_super_unlock(osb, 1);
+
+	/* queue recovery for offline slots */
+	ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
+
+bail:
+	mutex_lock(&osb->recovery_lock);
+	if (!status && !ocfs2_recovery_completed(osb)) {
+		mutex_unlock(&osb->recovery_lock);
+		goto restart;
+	}
+
+	ocfs2_free_replay_slots(osb);
+	osb->recovery_thread_task = NULL;
+	mb(); /* sync with ocfs2_recovery_thread_running */
+	wake_up(&osb->recovery_event);
+
+	mutex_unlock(&osb->recovery_lock);
+
+	kfree(rm_quota);
+
+	/* no one is callint kthread_stop() for us so the kthread() api
+	 * requires that we call do_exit().  And it isn't exported, but
+	 * complete_and_exit() seems to be a minimal wrapper around it. */
+	complete_and_exit(NULL, status);
+}
+
+void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
+{
+	mutex_lock(&osb->recovery_lock);
+
+	trace_ocfs2_recovery_thread(node_num, osb->node_num,
+		osb->disable_recovery, osb->recovery_thread_task,
+		osb->disable_recovery ?
+		-1 : ocfs2_recovery_map_set(osb, node_num));
+
+	if (osb->disable_recovery)
+		goto out;
+
+	if (osb->recovery_thread_task)
+		goto out;
+
+	osb->recovery_thread_task =  kthread_run(__ocfs2_recovery_thread, osb,
+						 "ocfs2rec");
+	if (IS_ERR(osb->recovery_thread_task)) {
+		mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
+		osb->recovery_thread_task = NULL;
+	}
+
+out:
+	mutex_unlock(&osb->recovery_lock);
+	wake_up(&osb->recovery_event);
+}
+
+static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
+				    int slot_num,
+				    struct buffer_head **bh,
+				    struct inode **ret_inode)
+{
+	int status = -EACCES;
+	struct inode *inode = NULL;
+
+	BUG_ON(slot_num >= osb->max_slots);
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (!inode || is_bad_inode(inode)) {
+		mlog_errno(status);
+		goto bail;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+
+bail:
+	if (inode) {
+		if (status || !ret_inode)
+			iput(inode);
+		else
+			*ret_inode = inode;
+	}
+	return status;
+}
+
+/* Does the actual journal replay and marks the journal inode as
+ * clean. Will only replay if the journal inode is marked dirty. */
+static int ocfs2_replay_journal(struct ocfs2_super *osb,
+				int node_num,
+				int slot_num)
+{
+	int status;
+	int got_lock = 0;
+	unsigned int flags;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *fe;
+	journal_t *journal = NULL;
+	struct buffer_head *bh = NULL;
+	u32 slot_reco_gen;
+
+	status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
+	if (status) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+	slot_reco_gen = ocfs2_get_recovery_generation(fe);
+	brelse(bh);
+	bh = NULL;
+
+	/*
+	 * As the fs recovery is asynchronous, there is a small chance that
+	 * another node mounted (and recovered) the slot before the recovery
+	 * thread could get the lock. To handle that, we dirty read the journal
+	 * inode for that slot to get the recovery generation. If it is
+	 * different than what we expected, the slot has been recovered.
+	 * If not, it needs recovery.
+	 */
+	if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
+		trace_ocfs2_replay_journal_recovered(slot_num,
+		     osb->slot_recovery_generations[slot_num], slot_reco_gen);
+		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+		status = -EBUSY;
+		goto done;
+	}
+
+	/* Continue with recovery as the journal has not yet been recovered */
+
+	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+	if (status < 0) {
+		trace_ocfs2_replay_journal_lock_err(status);
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not lock journal!\n");
+		goto done;
+	}
+	got_lock = 1;
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	slot_reco_gen = ocfs2_get_recovery_generation(fe);
+
+	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
+		trace_ocfs2_replay_journal_skip(node_num);
+		/* Refresh recovery generation for the slot */
+		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+		goto done;
+	}
+
+	/* we need to run complete recovery for offline orphan slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+
+	printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
+	       "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+	       MINOR(osb->sb->s_dev));
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+
+	status = ocfs2_force_read_journal(inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	journal = jbd2_journal_init_inode(inode);
+	if (journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EIO;
+		goto done;
+	}
+
+	status = jbd2_journal_load(journal);
+	if (status < 0) {
+		mlog_errno(status);
+		if (!igrab(inode))
+			BUG();
+		jbd2_journal_destroy(journal);
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
+
+	/* wipe the journal */
+	jbd2_journal_lock_updates(journal);
+	status = jbd2_journal_flush(journal);
+	jbd2_journal_unlock_updates(journal);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* This will mark the node clean */
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	/* Increment recovery generation to indicate successful recovery */
+	ocfs2_bump_recovery_generation(fe);
+	osb->slot_recovery_generations[slot_num] =
+					ocfs2_get_recovery_generation(fe);
+
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
+	status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
+	if (status < 0)
+		mlog_errno(status);
+
+	if (!igrab(inode))
+		BUG();
+
+	jbd2_journal_destroy(journal);
+
+	printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
+	       "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+	       MINOR(osb->sb->s_dev));
+done:
+	/* drop the lock on this nodes journal */
+	if (got_lock)
+		ocfs2_inode_unlock(inode, 1);
+
+	if (inode)
+		iput(inode);
+
+	brelse(bh);
+
+	return status;
+}
+
+/*
+ * Do the most important parts of node recovery:
+ *  - Replay it's journal
+ *  - Stamp a clean local allocator file
+ *  - Stamp a clean truncate log
+ *  - Mark the node clean
+ *
+ * If this function completes without error, a node in OCFS2 can be
+ * said to have been safely recovered. As a result, failure during the
+ * second part of a nodes recovery process (local alloc recovery) is
+ * far less concerning.
+ */
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num, int slot_num)
+{
+	int status = 0;
+	struct ocfs2_dinode *la_copy = NULL;
+	struct ocfs2_dinode *tl_copy = NULL;
+
+	trace_ocfs2_recover_node(node_num, slot_num, osb->node_num);
+
+	/* Should not ever be called to recover ourselves -- in that
+	 * case we should've called ocfs2_journal_load instead. */
+	BUG_ON(osb->node_num == node_num);
+
+	status = ocfs2_replay_journal(osb, node_num, slot_num);
+	if (status < 0) {
+		if (status == -EBUSY) {
+			trace_ocfs2_recover_node_skip(slot_num, node_num);
+			status = 0;
+			goto done;
+		}
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Stamp a clean local alloc file AFTER recovering the journal... */
+	status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* An error from begin_truncate_log_recovery is not
+	 * serious enough to warrant halting the rest of
+	 * recovery. */
+	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Likewise, this would be a strange but ultimately not so
+	 * harmful place to get an error... */
+	status = ocfs2_clear_slot(osb, slot_num);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* This will kfree the memory pointed to by la_copy and tl_copy */
+	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
+					tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
+
+	status = 0;
+done:
+
+	return status;
+}
+
+/* Test node liveness by trylocking his journal. If we get the lock,
+ * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
+ * still alive (we couldn't get the lock) and < 0 on error. */
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num)
+{
+	int status, flags;
+	struct inode *inode = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (inode == NULL) {
+		mlog(ML_ERROR, "access error\n");
+		status = -EACCES;
+		goto bail;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto bail;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
+	status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_inode_unlock(inode, 1);
+bail:
+	if (inode)
+		iput(inode);
+
+	return status;
+}
+
+/* Call this underneath ocfs2_super_lock. It also assumes that the
+ * slot info struct has been updated from disk. */
+int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
+{
+	unsigned int node_num;
+	int status, i;
+	u32 gen;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *di;
+
+	/* This is called with the super block cluster lock, so we
+	 * know that the slot map can't change underneath us. */
+
+	for (i = 0; i < osb->max_slots; i++) {
+		/* Read journal inode to get the recovery generation */
+		status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+		di = (struct ocfs2_dinode *)bh->b_data;
+		gen = ocfs2_get_recovery_generation(di);
+		brelse(bh);
+		bh = NULL;
+
+		spin_lock(&osb->osb_lock);
+		osb->slot_recovery_generations[i] = gen;
+
+		trace_ocfs2_mark_dead_nodes(i,
+					    osb->slot_recovery_generations[i]);
+
+		if (i == osb->slot_num) {
+			spin_unlock(&osb->osb_lock);
+			continue;
+		}
+
+		status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
+		if (status == -ENOENT) {
+			spin_unlock(&osb->osb_lock);
+			continue;
+		}
+
+		if (__ocfs2_recovery_map_test(osb, node_num)) {
+			spin_unlock(&osb->osb_lock);
+			continue;
+		}
+		spin_unlock(&osb->osb_lock);
+
+		/* Ok, we have a slot occupied by another node which
+		 * is not in the recovery map. We trylock his journal
+		 * file here to test if he's alive. */
+		status = ocfs2_trylock_journal(osb, i);
+		if (!status) {
+			/* Since we're called from mount, we know that
+			 * the recovery thread can't race us on
+			 * setting / checking the recovery bits. */
+			ocfs2_recovery_thread(osb, node_num);
+		} else if ((status < 0) && (status != -EAGAIN)) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = 0;
+bail:
+	return status;
+}
+
+/*
+ * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
+ * randomness to the timeout to minimize multple nodes firing the timer at the
+ * same time.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+	unsigned long time;
+
+	get_random_bytes(&time, sizeof(time));
+	time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
+	return msecs_to_jiffies(time);
+}
+
+/*
+ * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
+ * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
+ * is done to catch any orphans that are left over in orphan directories.
+ *
+ * It scans all slots, even ones that are in use. It does so to handle the
+ * case described below:
+ *
+ *   Node 1 has an inode it was using. The dentry went away due to memory
+ *   pressure.  Node 1 closes the inode, but it's on the free list. The node
+ *   has the open lock.
+ *   Node 2 unlinks the inode. It grabs the dentry lock to notify others,
+ *   but node 1 has no dentry and doesn't get the message. It trylocks the
+ *   open lock, sees that another node has a PR, and does nothing.
+ *   Later node 2 runs its orphan dir. It igets the inode, trylocks the
+ *   open lock, sees the PR still, and does nothing.
+ *   Basically, we have to trigger an orphan iput on node 1. The only way
+ *   for this to happen is if node 1 runs node 2's orphan dir.
+ *
+ * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
+ * seconds.  It gets an EX lock on os_lockres and checks sequence number
+ * stored in LVB. If the sequence number has changed, it means some other
+ * node has done the scan.  This node skips the scan and tracks the
+ * sequence number.  If the sequence number didn't change, it means a scan
+ * hasn't happened.  The node queues a scan and increments the
+ * sequence number in the LVB.
+ */
+void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+	int status, i;
+	u32 seqno = 0;
+
+	os = &osb->osb_orphan_scan;
+
+	if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+		goto out;
+
+	trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno,
+					    atomic_read(&os->os_state));
+
+	status = ocfs2_orphan_scan_lock(osb, &seqno);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		goto out;
+	}
+
+	/* Do no queue the tasks if the volume is being umounted */
+	if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+		goto unlock;
+
+	if (os->os_seqno != seqno) {
+		os->os_seqno = seqno;
+		goto unlock;
+	}
+
+	for (i = 0; i < osb->max_slots; i++)
+		ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
+						NULL, ORPHAN_NO_NEED_TRUNCATE);
+	/*
+	 * We queued a recovery on orphan slots, increment the sequence
+	 * number and update LVB so other node will skip the scan for a while
+	 */
+	seqno++;
+	os->os_count++;
+	os->os_scantime = CURRENT_TIME;
+unlock:
+	ocfs2_orphan_scan_unlock(osb, seqno);
+out:
+	trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno,
+					  atomic_read(&os->os_state));
+	return;
+}
+
+/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
+void ocfs2_orphan_scan_work(struct work_struct *work)
+{
+	struct ocfs2_orphan_scan *os;
+	struct ocfs2_super *osb;
+
+	os = container_of(work, struct ocfs2_orphan_scan,
+			  os_orphan_scan_work.work);
+	osb = os->os_osb;
+
+	mutex_lock(&os->os_lock);
+	ocfs2_queue_orphan_scan(osb);
+	if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
+		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+				      ocfs2_orphan_scan_timeout());
+	mutex_unlock(&os->os_lock);
+}
+
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+
+	os = &osb->osb_orphan_scan;
+	if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) {
+		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
+		mutex_lock(&os->os_lock);
+		cancel_delayed_work(&os->os_orphan_scan_work);
+		mutex_unlock(&os->os_lock);
+	}
+}
+
+void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+
+	os = &osb->osb_orphan_scan;
+	os->os_osb = osb;
+	os->os_count = 0;
+	os->os_seqno = 0;
+	mutex_init(&os->os_lock);
+	INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+}
+
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+
+	os = &osb->osb_orphan_scan;
+	os->os_scantime = CURRENT_TIME;
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
+		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
+	else {
+		atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
+		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+				   ocfs2_orphan_scan_timeout());
+	}
+}
+
+struct ocfs2_orphan_filldir_priv {
+	struct dir_context	ctx;
+	struct inode		*head;
+	struct ocfs2_super	*osb;
+};
+
+static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
+				int name_len, loff_t pos, u64 ino,
+				unsigned type)
+{
+	struct ocfs2_orphan_filldir_priv *p =
+		container_of(ctx, struct ocfs2_orphan_filldir_priv, ctx);
+	struct inode *iter;
+
+	if (name_len == 1 && !strncmp(".", name, 1))
+		return 0;
+	if (name_len == 2 && !strncmp("..", name, 2))
+		return 0;
+
+	/* Skip bad inodes so that recovery can continue */
+	iter = ocfs2_iget(p->osb, ino,
+			  OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
+	if (IS_ERR(iter))
+		return 0;
+
+	/* Skip inodes which are already added to recover list, since dio may
+	 * happen concurrently with unlink/rename */
+	if (OCFS2_I(iter)->ip_next_orphan) {
+		iput(iter);
+		return 0;
+	}
+
+	trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
+	/* No locking is required for the next_orphan queue as there
+	 * is only ever a single process doing orphan recovery. */
+	OCFS2_I(iter)->ip_next_orphan = p->head;
+	p->head = iter;
+
+	return 0;
+}
+
+static int ocfs2_queue_orphans(struct ocfs2_super *osb,
+			       int slot,
+			       struct inode **head)
+{
+	int status;
+	struct inode *orphan_dir_inode = NULL;
+	struct ocfs2_orphan_filldir_priv priv = {
+		.ctx.actor = ocfs2_orphan_filldir,
+		.osb = osb,
+		.head = *head
+	};
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       slot);
+	if  (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		return status;
+	}
+
+	mutex_lock(&orphan_dir_inode->i_mutex);
+	status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
+	if (status) {
+		mlog_errno(status);
+		goto out_cluster;
+	}
+
+	*head = priv.head;
+
+out_cluster:
+	ocfs2_inode_unlock(orphan_dir_inode, 0);
+out:
+	mutex_unlock(&orphan_dir_inode->i_mutex);
+	iput(orphan_dir_inode);
+	return status;
+}
+
+static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
+					      int slot)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = !osb->osb_orphan_wipes[slot];
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
+static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
+					     int slot)
+{
+	spin_lock(&osb->osb_lock);
+	/* Mark ourselves such that new processes in delete_inode()
+	 * know to quit early. */
+	ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+	while (osb->osb_orphan_wipes[slot]) {
+		/* If any processes are already in the middle of an
+		 * orphan wipe on this dir, then we need to wait for
+		 * them. */
+		spin_unlock(&osb->osb_lock);
+		wait_event_interruptible(osb->osb_wipe_event,
+					 ocfs2_orphan_recovery_can_continue(osb, slot));
+		spin_lock(&osb->osb_lock);
+	}
+	spin_unlock(&osb->osb_lock);
+}
+
+static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
+					      int slot)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+}
+
+/*
+ * Orphan recovery. Each mounted node has it's own orphan dir which we
+ * must run during recovery. Our strategy here is to build a list of
+ * the inodes in the orphan dir and iget/iput them. The VFS does
+ * (most) of the rest of the work.
+ *
+ * Orphan recovery can happen at any time, not just mount so we have a
+ * couple of extra considerations.
+ *
+ * - We grab as many inodes as we can under the orphan dir lock -
+ *   doing iget() outside the orphan dir risks getting a reference on
+ *   an invalid inode.
+ * - We must be sure not to deadlock with other processes on the
+ *   system wanting to run delete_inode(). This can happen when they go
+ *   to lock the orphan dir and the orphan recovery process attempts to
+ *   iget() inside the orphan dir lock. This can be avoided by
+ *   advertising our state to ocfs2_delete_inode().
+ */
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot,
+				 enum ocfs2_orphan_reco_type orphan_reco_type)
+{
+	int ret = 0;
+	struct inode *inode = NULL;
+	struct inode *iter;
+	struct ocfs2_inode_info *oi;
+
+	trace_ocfs2_recover_orphans(slot);
+
+	ocfs2_mark_recovering_orphan_dir(osb, slot);
+	ret = ocfs2_queue_orphans(osb, slot, &inode);
+	ocfs2_clear_recovering_orphan_dir(osb, slot);
+
+	/* Error here should be noted, but we want to continue with as
+	 * many queued inodes as we've got. */
+	if (ret)
+		mlog_errno(ret);
+
+	while (inode) {
+		oi = OCFS2_I(inode);
+		trace_ocfs2_recover_orphans_iput(
+					(unsigned long long)oi->ip_blkno);
+
+		iter = oi->ip_next_orphan;
+		oi->ip_next_orphan = NULL;
+
+		/*
+		 * We need to take and drop the inode lock to
+		 * force read inode from disk.
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto next;
+		}
+		ocfs2_inode_unlock(inode, 0);
+
+		if (inode->i_nlink == 0) {
+			spin_lock(&oi->ip_lock);
+			/* Set the proper information to get us going into
+			 * ocfs2_delete_inode. */
+			oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+			spin_unlock(&oi->ip_lock);
+		} else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
+			struct buffer_head *di_bh = NULL;
+
+			ret = ocfs2_rw_lock(inode, 1);
+			if (ret) {
+				mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret < 0) {
+				ocfs2_rw_unlock(inode, 1);
+				mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_truncate_file(inode, di_bh,
+					i_size_read(inode));
+			ocfs2_inode_unlock(inode, 1);
+			ocfs2_rw_unlock(inode, 1);
+			brelse(di_bh);
+			if (ret < 0) {
+				if (ret != -ENOSPC)
+					mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+			if (ret)
+				mlog_errno(ret);
+
+			wake_up(&OCFS2_I(inode)->append_dio_wq);
+		} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
+
+next:
+		iput(inode);
+
+		inode = iter;
+	}
+
+	return ret;
+}
+
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
+{
+	/* This check is good because ocfs2 will wait on our recovery
+	 * thread before changing it to something other than MOUNTED
+	 * or DISABLED. */
+	wait_event(osb->osb_mount_event,
+		  (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
+		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
+
+	/* If there's an error on mount, then we may never get to the
+	 * MOUNTED flag, but this is set right before
+	 * dismount_volume() so we can trust it. */
+	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+		trace_ocfs2_wait_on_mount(VOLUME_DISABLED);
+		mlog(0, "mount error, exiting!\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int ocfs2_commit_thread(void *arg)
+{
+	int status;
+	struct ocfs2_super *osb = arg;
+	struct ocfs2_journal *journal = osb->journal;
+
+	/* we can trust j_num_trans here because _should_stop() is only set in
+	 * shutdown and nobody other than ourselves should be able to start
+	 * transactions.  committing on shutdown might take a few iterations
+	 * as final transactions put deleted inodes on the list */
+	while (!(kthread_should_stop() &&
+		 atomic_read(&journal->j_num_trans) == 0)) {
+
+		wait_event_interruptible(osb->checkpoint_event,
+					 atomic_read(&journal->j_num_trans)
+					 || kthread_should_stop());
+
+		status = ocfs2_commit_cache(osb);
+		if (status < 0) {
+			static unsigned long abort_warn_time;
+
+			/* Warn about this once per minute */
+			if (printk_timed_ratelimit(&abort_warn_time, 60*HZ))
+				mlog(ML_ERROR, "status = %d, journal is "
+						"already aborted.\n", status);
+			/*
+			 * After ocfs2_commit_cache() fails, j_num_trans has a
+			 * non-zero value.  Sleep here to avoid a busy-wait
+			 * loop.
+			 */
+			msleep_interruptible(1000);
+		}
+
+		if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
+			mlog(ML_KTHREAD,
+			     "commit_thread: %u transactions pending on "
+			     "shutdown\n",
+			     atomic_read(&journal->j_num_trans));
+		}
+	}
+
+	return 0;
+}
+
+/* Reads all the journal inodes without taking any cluster locks. Used
+ * for hard readonly access to determine whether any journal requires
+ * recovery. Also used to refresh the recovery generation numbers after
+ * a journal has been recovered by another node.
+ */
+int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	unsigned int slot;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	int journal_dirty = 0;
+
+	for(slot = 0; slot < osb->max_slots; slot++) {
+		ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di = (struct ocfs2_dinode *) di_bh->b_data;
+
+		osb->slot_recovery_generations[slot] =
+					ocfs2_get_recovery_generation(di);
+
+		if (le32_to_cpu(di->id1.journal1.ij_flags) &
+		    OCFS2_JOURNAL_DIRTY_FL)
+			journal_dirty = 1;
+
+		brelse(di_bh);
+		di_bh = NULL;
+	}
+
+out:
+	if (journal_dirty)
+		ret = -EROFS;
+	return ret;
+}
diff --git a/kernel/fs/ocfs2/journal.h b/kernel/fs/ocfs2/journal.h
new file mode 100644
index 000000000..f4cd3c3e9
--- /dev/null
+++ b/kernel/fs/ocfs2/journal.h
@@ -0,0 +1,645 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.h
+ *
+ * Defines journalling api and structures.
+ *
+ * Copyright (C) 2003, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_JOURNAL_H
+#define OCFS2_JOURNAL_H
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+
+enum ocfs2_journal_state {
+	OCFS2_JOURNAL_FREE = 0,
+	OCFS2_JOURNAL_LOADED,
+	OCFS2_JOURNAL_IN_SHUTDOWN,
+};
+
+struct ocfs2_super;
+struct ocfs2_dinode;
+
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+	unsigned int rm_used;
+	unsigned int *rm_entries;
+};
+
+
+struct ocfs2_journal {
+	enum ocfs2_journal_state   j_state;    /* Journals current state   */
+
+	journal_t                 *j_journal; /* The kernels journal type */
+	struct inode              *j_inode;   /* Kernel inode pointing to
+					       * this journal             */
+	struct ocfs2_super        *j_osb;     /* pointer to the super
+					       * block for the node
+					       * we're currently
+					       * running on -- not
+					       * necessarily the super
+					       * block from the node
+					       * which we usually run
+					       * from (recovery,
+					       * etc)                     */
+	struct buffer_head        *j_bh;      /* Journal disk inode block */
+	atomic_t                  j_num_trans; /* Number of transactions
+					        * currently in the system. */
+	spinlock_t                j_lock;
+	unsigned long             j_trans_id;
+	struct rw_semaphore       j_trans_barrier;
+	wait_queue_head_t         j_checkpointed;
+
+	/* both fields protected by j_lock*/
+	struct list_head          j_la_cleanups;
+	struct work_struct        j_recovery_work;
+};
+
+extern spinlock_t trans_inc_lock;
+
+/* wrap j_trans_id so we never have it equal to zero. */
+static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
+{
+	unsigned long old_id;
+	spin_lock(&trans_inc_lock);
+	old_id = j->j_trans_id++;
+	if (unlikely(!j->j_trans_id))
+		j->j_trans_id = 1;
+	spin_unlock(&trans_inc_lock);
+	return old_id;
+}
+
+static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal,
+					   struct ocfs2_caching_info *ci)
+{
+	spin_lock(&trans_inc_lock);
+	ci->ci_last_trans = journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+/* Used to figure out whether it's safe to drop a metadata lock on an
+ * cached object. Returns true if all the object's changes have been
+ * checkpointed to disk. You should be holding the spinlock on the
+ * metadata lock while calling this to be sure that nobody can take
+ * the lock and put it on another transaction. */
+static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci)
+{
+	int ret;
+	struct ocfs2_journal *journal =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
+
+	spin_lock(&trans_inc_lock);
+	ret = time_after(journal->j_trans_id, ci->ci_last_trans);
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+/* convenience function to check if an object backed by struct
+ * ocfs2_caching_info  is still new (has never hit disk) Will do you a
+ * favor and set created_trans = 0 when you've
+ * been checkpointed.  returns '1' if the ci is still new. */
+static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci)
+{
+	int ret;
+	struct ocfs2_journal *journal =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
+
+	spin_lock(&trans_inc_lock);
+	ret = !(time_after(journal->j_trans_id, ci->ci_created_trans));
+	if (!ret)
+		ci->ci_created_trans = 0;
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+/* Wrapper for inodes so we can check system files */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
+	/* System files are never "new" as they're written out by
+	 * mkfs. This helps us early during mount, before we have the
+	 * journal open and j_trans_id could be junk. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		return 0;
+
+	return ocfs2_ci_is_new(INODE_CACHE(inode));
+}
+
+static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
+				    struct ocfs2_caching_info *ci)
+{
+	spin_lock(&trans_inc_lock);
+	ci->ci_created_trans = osb->journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+/* Exported only for the journal struct init code in super.c. Do not call. */
+void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
+
+void ocfs2_complete_recovery(struct work_struct *work);
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
+
+int ocfs2_recovery_init(struct ocfs2_super *osb);
+void ocfs2_recovery_exit(struct ocfs2_super *osb);
+
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
+/*
+ *  Journal Control:
+ *  Initialize, Load, Shutdown, Wipe a journal.
+ *
+ *  ocfs2_journal_init     - Initialize journal structures in the OSB.
+ *  ocfs2_journal_load     - Load the given journal off disk. Replay it if
+ *                          there's transactions still in there.
+ *  ocfs2_journal_shutdown - Shutdown a journal, this will flush all
+ *                          uncommitted, uncheckpointed transactions.
+ *  ocfs2_journal_wipe     - Wipe transactions from a journal. Optionally
+ *                          zero out each block.
+ *  ocfs2_recovery_thread  - Perform recovery on a node. osb is our own osb.
+ *  ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
+ *                          event on.
+ *  ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
+ */
+void   ocfs2_set_journal_params(struct ocfs2_super *osb);
+int    ocfs2_journal_init(struct ocfs2_journal *journal,
+			  int *dirty);
+void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
+int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
+			  int full);
+int    ocfs2_journal_load(struct ocfs2_journal *journal, int local,
+			  int replayed);
+int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
+void   ocfs2_recovery_thread(struct ocfs2_super *osb,
+			     int node_num);
+int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
+void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
+
+static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
+{
+	wake_up(&osb->checkpoint_event);
+}
+
+static inline void ocfs2_checkpoint_inode(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (ocfs2_mount_local(osb))
+		return;
+
+	if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) {
+		/* WARNING: This only kicks off a single
+		 * checkpoint. If someone races you and adds more
+		 * metadata to the journal, you won't know, and will
+		 * wind up waiting *a lot* longer than necessary. Right
+		 * now we only use this in clear_inode so that's
+		 * OK. */
+		ocfs2_start_checkpoint(osb);
+
+		wait_event(osb->journal->j_checkpointed,
+			   ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)));
+	}
+}
+
+/*
+ *  Transaction Handling:
+ *  Manage the lifetime of a transaction handle.
+ *
+ *  ocfs2_start_trans      - Begin a transaction. Give it an upper estimate of
+ *                          the number of blocks that will be changed during
+ *                          this handle.
+ *  ocfs2_commit_trans - Complete a handle. It might return -EIO if
+ *                       the journal was aborted. The majority of paths don't
+ *                       check the return value as an error there comes too
+ *                       late to do anything (and will be picked up in a
+ *                       later transaction).
+ *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
+ *                          commit the handle to disk in the process, but will
+ *                          not release any locks taken during the transaction.
+ *  ocfs2_journal_access* - Notify the handle that we want to journal this
+ *                          buffer. Will have to call ocfs2_journal_dirty once
+ *                          we've actually dirtied it. Type is one of . or .
+ *                          Always call the specific flavor of
+ *                          ocfs2_journal_access_*() unless you intend to
+ *                          manage the checksum by hand.
+ *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
+ *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
+ *                           the current handle commits.
+ */
+
+/* You must always start_trans with a number of buffs > 0, but it's
+ * perfectly legal to go through an entire transaction without having
+ * dirtied any buffers. */
+handle_t		    *ocfs2_start_trans(struct ocfs2_super *osb,
+					       int max_buffs);
+int			     ocfs2_commit_trans(struct ocfs2_super *osb,
+						handle_t *handle);
+int			     ocfs2_extend_trans(handle_t *handle, int nblocks);
+int			     ocfs2_allocate_extend_trans(handle_t *handle,
+						int thresh);
+
+/*
+ * Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction.  For unbounded transactions such as
+ * fallocate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go.
+ */
+#define OCFS2_MAX_TRANS_DATA	64U
+
+/*
+ * Create access is for when we get a newly created buffer and we're
+ * not gonna read it off disk, but rather fill it ourselves.  Right
+ * now, we don't do anything special with this (it turns into a write
+ * request), but this is a good placeholder in case we do...
+ *
+ * Write access is for when we read a block off disk and are going to
+ * modify it. This way the journalling layer knows it may need to make
+ * a copy of that block (if it's part of another, uncommitted
+ * transaction) before we do so.
+ */
+#define OCFS2_JOURNAL_ACCESS_CREATE 0
+#define OCFS2_JOURNAL_ACCESS_WRITE  1
+#define OCFS2_JOURNAL_ACCESS_UNDO   2
+
+
+/* ocfs2_inode */
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* ocfs2_refcount_block */
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* ocfs2_dx_root_block */
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* ocfs2_dx_leaf */
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
+			 struct buffer_head *bh, int type);
+
+/*
+ * A word about the journal_access/journal_dirty "dance". It is
+ * entirely legal to journal_access a buffer more than once (as long
+ * as the access type is the same -- I'm not sure what will happen if
+ * access type is different but this should never happen anyway) It is
+ * also legal to journal_dirty a buffer more than once. In fact, you
+ * can even journal_access a buffer after you've done a
+ * journal_access/journal_dirty pair. The only thing you cannot do
+ * however, is journal_dirty a buffer which you haven't yet passed to
+ * journal_access at least once.
+ *
+ * That said, 99% of the time this doesn't matter and this is what the
+ * path looks like:
+ *
+ *	<read a bh>
+ *	ocfs2_journal_access(handle, bh,	OCFS2_JOURNAL_ACCESS_WRITE);
+ *	<modify the bh>
+ * 	ocfs2_journal_dirty(handle, bh);
+ */
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
+
+/*
+ *  Credit Macros:
+ *  Convenience macros to calculate number of credits needed.
+ *
+ *  For convenience sake, I have a set of macros here which calculate
+ *  the *maximum* number of sectors which will be changed for various
+ *  metadata updates.
+ */
+
+/* simple file updates like chmod, etc. */
+#define OCFS2_INODE_UPDATE_CREDITS 1
+
+/* extended attribute block update */
+#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+
+/* Update of a single quota block */
+#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
+
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+				   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+
+#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to set_info() quotactl so make them prepared for the writes.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+			      OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+			     2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+	int credits = 0;
+
+	if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+		credits += OCFS2_QWRITE_CREDITS;
+	if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+		credits += OCFS2_QWRITE_CREDITS;
+	return credits;
+}
+
+/* group extend. inode update and last group update. */
+#define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* group add. inode update and the new group update. */
+#define OCFS2_GROUP_ADD_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* get one bit out of a suballocator: dinode + group descriptor +
+ * prev. group desc. if we relink. */
+#define OCFS2_SUBALLOC_ALLOC (3)
+
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
+{
+	return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
+
+/* dinode + group descriptor update. We don't relink on free yet. */
+#define OCFS2_SUBALLOC_FREE  (2)
+
+#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
+#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
+					 + OCFS2_TRUNCATE_LOG_UPDATE)
+
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+	return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
+
+/* data block for new dir/symlink, allocation of directory block, dx_root
+ * update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
+
+static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
+{
+	/* 1 block for index, 2 allocs (data, metadata), 1 clusters
+	 * worth of blocks for initial extent. */
+	return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
+		ocfs2_clusters_to_blocks(sb, 1);
+}
+
+/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
+ * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
+ * blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
+				      int xattr_credits)
+{
+	int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
+
+	if (is_dir)
+		dir_credits += ocfs2_add_dir_index_credits(sb);
+
+	return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
+	       ocfs2_quota_trans_credits(sb);
+}
+
+/* local alloc metadata change + main bitmap updates */
+#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
+				  + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
+
+/* used when we don't need an allocation change for a dir extend. One
+ * for the dinode, one for the new block. */
+#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
+
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
+ * update on dir + index leaf + dx root update for free list +
+ * previous dirblock update in the free list */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+	return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
+	       ocfs2_quota_trans_credits(sb);
+}
+
+/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
+ * dir inode link + dir inode index leaf + dir index root */
+static inline int ocfs2_unlink_credits(struct super_block *sb)
+{
+	/* The quota update from ocfs2_link_credits is unused here... */
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
+}
+
+/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
+ * inode alloc group descriptor + orphan dir index root +
+ * orphan dir index leaf */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
+
+/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
+ * orphan dir index root + orphan dir index leaf */
+#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
+#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS  OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
+
+/* dinode update, old dir dinode update, new dir dinode update, old
+ * dir dir entry, new dir dir entry, dir entry update for renaming
+ * directory + target unlink + 3 x dir index leaves */
+static inline int ocfs2_rename_credits(struct super_block *sb)
+{
+	return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
+}
+
+/* global bitmap dinode, group desc., relinked group,
+ * suballocator dinode, group desc., relinked group,
+ * dinode, xattr block */
+#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
+					  + OCFS2_INODE_UPDATE_CREDITS \
+					  + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
+
+/* inode update, removal of dx root block from allocator */
+#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS +	\
+				      OCFS2_SUBALLOC_FREE)
+
+static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
+{
+	int credits = 1 + OCFS2_SUBALLOC_ALLOC;
+
+	credits += ocfs2_clusters_to_blocks(sb, 1);
+	credits += ocfs2_quota_trans_credits(sb);
+
+	return credits;
+}
+
+/* inode update, new refcount block and its allocation credits. */
+#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
+					    + OCFS2_SUBALLOC_ALLOC)
+
+/* inode and the refcount block update. */
+#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * inode and the refcount block update.
+ * It doesn't include the credits for sub alloc change.
+ * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
+ */
+#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* 2 metadata alloc, 2 new blocks and root refcount block */
+#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
+
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_calc_extend_credits(struct super_block *sb,
+					    struct ocfs2_extent_list *root_el)
+{
+	int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
+
+	/* bitmap dinode, group desc. + relinked group. */
+	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
+
+	/* we might need to shift tree depth so lets assume an
+	 * absolute worst case of complete fragmentation.  Even with
+	 * that, we only need one update for the dinode, and then
+	 * however many metadata chunks needed * a remaining suballoc
+	 * alloc. */
+	sysfile_bitmap_blocks = 1 +
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
+
+	/* this does not include *new* metadata blocks, which are
+	 * accounted for in sysfile_bitmap_blocks. root_el +
+	 * prev. last_eb_blk + blocks along edge of tree.
+	 * calc_symlink_credits passes because we just need 1
+	 * credit for the dinode there. */
+	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
+
+	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+	       ocfs2_quota_trans_credits(sb);
+}
+
+static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
+{
+	int blocks = ocfs2_mknod_credits(sb, 0, 0);
+
+	/* links can be longer than one block so we may update many
+	 * within our single allocated extent. */
+	blocks += ocfs2_clusters_to_blocks(sb, 1);
+
+	return blocks + ocfs2_quota_trans_credits(sb);
+}
+
+static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
+						 unsigned int cpg)
+{
+	int blocks;
+	int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
+	/* parent inode update + new block group header + bitmap inode update
+	   + bitmap blocks affected */
+	blocks = 1 + 1 + 1 + bitmap_blocks;
+	return blocks;
+}
+
+/*
+ * Allocating a discontiguous block group requires the credits from
+ * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
+ * the group descriptor's extent list.  The caller already has started
+ * the transaction with ocfs2_calc_group_alloc_credits().  They extend
+ * it with these credits.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+	return ocfs2_extent_recs_per_gd(sb);
+}
+
+static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
+						unsigned int clusters_to_del,
+						struct ocfs2_dinode *fe,
+						struct ocfs2_extent_list *last_el)
+{
+ 	/* for dinode + all headers in this pass + update to next leaf */
+	u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
+	u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	int credits = 1 + tree_depth + 1;
+	int i;
+
+	i = next_free - 1;
+	BUG_ON(i < 0);
+
+	/* We may be deleting metadata blocks, so metadata alloc dinode +
+	   one desc. block for each possible delete. */
+	if (tree_depth && next_free == 1 &&
+	    ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
+		credits += 1 + tree_depth;
+
+	/* update to the truncate log. */
+	credits += OCFS2_TRUNCATE_LOG_UPDATE;
+
+	credits += ocfs2_quota_trans_credits(sb);
+
+	return credits;
+}
+
+static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+}
+
+static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
+					       loff_t new_size)
+{
+	return jbd2_journal_begin_ordered_truncate(
+				OCFS2_SB(inode->i_sb)->journal->j_journal,
+				&OCFS2_I(inode)->ip_jinode,
+				new_size);
+}
+
+static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
+						  struct inode *inode,
+						  int datasync)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	oi->i_sync_tid = handle->h_transaction->t_tid;
+	if (datasync)
+		oi->i_datasync_tid = handle->h_transaction->t_tid;
+}
+
+#endif /* OCFS2_JOURNAL_H */
diff --git a/kernel/fs/ocfs2/localalloc.c b/kernel/fs/ocfs2/localalloc.c
new file mode 100644
index 000000000..857bbbcd3
--- /dev/null
+++ b/kernel/fs/ocfs2/localalloc.c
@@ -0,0 +1,1343 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.c
+ *
+ * Node local data allocation
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "blockcheck.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
+
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
+
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 *numbits,
+					     struct ocfs2_alloc_reservation *resv);
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
+
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    handle_t *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh);
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh);
+
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					handle_t *handle,
+					struct ocfs2_alloc_context *ac);
+
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode);
+
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ *   group descriptors for other allocations (such as block groups,
+ *   etc). Picking default sizes which are a multiple of 4 could help
+ *   - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ *   file systems. This can easily be taken care of by limiting our
+ *   default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ *   particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K	group: 126M	la: 121M
+ * csize: 8K	group: 252M	la: 243M
+ * csize: 16K	group: 504M	la: 486M
+ * csize: 32K	group: 1008M	la: 972M
+ * csize: 64K	group: 2016M	la: 1944M
+ * csize: 128K	group: 4032M	la: 3888M
+ * csize: 256K	group: 8064M	la: 7776M
+ * csize: 512K	group: 16128M	la: 15552M
+ * csize: 1024K	group: 32256M	la: 31104M
+ */
+#define	OCFS2_LA_MAX_DEFAULT_MB	256
+#define	OCFS2_LA_OLD_DEFAULT	8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+	unsigned int la_mb;
+	unsigned int gd_mb;
+	unsigned int la_max_mb;
+	unsigned int megs_per_slot;
+	struct super_block *sb = osb->sb;
+
+	gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+		8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+	/*
+	 * This takes care of files systems with very small group
+	 * descriptors - 512 byte blocksize at cluster sizes lower
+	 * than 16K and also 1k blocksize with 4k cluster size.
+	 */
+	if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+	    || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+		return OCFS2_LA_OLD_DEFAULT;
+
+	/*
+	 * Leave enough room for some block groups and make the final
+	 * value we work from a multiple of 4.
+	 */
+	gd_mb -= 16;
+	gd_mb &= 0xFFFFFFFB;
+
+	la_mb = gd_mb;
+
+	/*
+	 * Keep window sizes down to a reasonable default
+	 */
+	if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+		/*
+		 * Some clustersize / blocksize combinations will have
+		 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+		 * default size, but get poor distribution when
+		 * limited to exactly 256 megabytes.
+		 *
+		 * As an example, 16K clustersize at 4K blocksize
+		 * gives us a cluster group size of 504M. Paring the
+		 * local alloc size down to 256 however, would give us
+		 * only one window and around 200MB left in the
+		 * cluster group. Instead, find the first size below
+		 * 256 which would give us an even distribution.
+		 *
+		 * Larger cluster group sizes actually work out pretty
+		 * well when pared to 256, so we don't have to do this
+		 * for any group that fits more than two
+		 * OCFS2_LA_MAX_DEFAULT_MB windows.
+		 */
+		if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+			la_mb = 256;
+		else {
+			unsigned int gd_mult = gd_mb;
+
+			while (gd_mult > 256)
+				gd_mult = gd_mult >> 1;
+
+			la_mb = gd_mult;
+		}
+	}
+
+	megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+	megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+	/* Too many nodes, too few disk clusters. */
+	if (megs_per_slot < la_mb)
+		la_mb = megs_per_slot;
+
+	/* We can't store more bits than we can in a block. */
+	la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+						ocfs2_local_alloc_size(sb) * 8);
+	if (la_mb > la_max_mb)
+		la_mb = la_max_mb;
+
+	return la_mb;
+}
+
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+	struct super_block *sb = osb->sb;
+	unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+	unsigned int la_max_mb;
+
+	la_max_mb = ocfs2_clusters_to_megabytes(sb,
+						ocfs2_local_alloc_size(sb) * 8);
+
+	trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb);
+
+	if (requested_mb == -1) {
+		/* No user request - use defaults */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_default_mb);
+	} else if (requested_mb > la_max_mb) {
+		/* Request is too big, we give the maximum available */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_max_mb);
+	} else {
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, requested_mb);
+	}
+
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
+static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
+{
+	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
+		osb->local_alloc_state == OCFS2_LA_ENABLED);
+}
+
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+				      unsigned int num_clusters)
+{
+	spin_lock(&osb->osb_lock);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+	    osb->local_alloc_state == OCFS2_LA_THROTTLED)
+		if (num_clusters >= osb->local_alloc_default_bits) {
+			cancel_delayed_work(&osb->la_enable_wq);
+			osb->local_alloc_state = OCFS2_LA_ENABLED;
+		}
+	spin_unlock(&osb->osb_lock);
+}
+
+void ocfs2_la_enable_worker(struct work_struct *work)
+{
+	struct ocfs2_super *osb =
+		container_of(work, struct ocfs2_super,
+			     la_enable_wq.work);
+	spin_lock(&osb->osb_lock);
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+	spin_unlock(&osb->osb_lock);
+}
+
+/*
+ * Tell us whether a given allocation should use the local alloc
+ * file. Otherwise, it has to go to the main bitmap.
+ *
+ * This function does semi-dirty reads of local alloc size and state!
+ * This is ok however, as the values are re-checked once under mutex.
+ */
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
+{
+	int ret = 0;
+	int la_bits;
+
+	spin_lock(&osb->osb_lock);
+	la_bits = osb->local_alloc_bits;
+
+	if (!ocfs2_la_state_enabled(osb))
+		goto bail;
+
+	/* la_bits should be at least twice the size (in clusters) of
+	 * a new block group. We want to be sure block group
+	 * allocations go through the local alloc, so allow an
+	 * allocation to take up to half the bitmap. */
+	if (bits > (la_bits / 2))
+		goto bail;
+
+	ret = 1;
+bail:
+	trace_ocfs2_alloc_should_use_local(
+	     (unsigned long long)bits, osb->local_alloc_state, la_bits, ret);
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb)
+{
+	int status = 0;
+	struct ocfs2_dinode *alloc = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	u32 num_used;
+	struct inode *inode = NULL;
+	struct ocfs2_local_alloc *la;
+
+	if (osb->local_alloc_bits == 0)
+		goto bail;
+
+	if (osb->local_alloc_bits >= osb->bitmap_cpg) {
+		mlog(ML_NOTICE, "Requested local alloc window %d is larger "
+		     "than max possible %u. Using defaults.\n",
+		     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
+		osb->local_alloc_bits =
+			ocfs2_megabytes_to_clusters(osb->sb,
+						    ocfs2_la_default_mb(osb));
+	}
+
+	/* read the alloc off disk */
+	inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+					     OCFS2_BH_IGNORE_CACHE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	if (!(le32_to_cpu(alloc->i_flags) &
+	    (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
+		mlog(ML_ERROR, "Invalid local alloc inode, %llu\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if ((la->la_size == 0) ||
+	    (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
+		mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
+		     le16_to_cpu(la->la_size));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* do a little verification. */
+	num_used = ocfs2_local_alloc_count_bits(alloc);
+
+	/* hopefully the local alloc has always been recovered before
+	 * we load it. */
+	if (num_used
+	    || alloc->id1.bitmap1.i_used
+	    || alloc->id1.bitmap1.i_total
+	    || la->la_bm_off)
+		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+		     "found = %u, set = %u, taken = %u, off = %u\n",
+		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
+		     le32_to_cpu(alloc->id1.bitmap1.i_total),
+		     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+
+	osb->local_alloc_bh = alloc_bh;
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+
+bail:
+	if (status < 0)
+		brelse(alloc_bh);
+	if (inode)
+		iput(inode);
+
+	trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * return any unused bits to the bitmap and write out a clean
+ * local_alloc.
+ *
+ * local_alloc_bh is optional. If not passed, we will simply use the
+ * one off osb. If you do pass it however, be warned that it *will* be
+ * returned brelse'd and NULL'd out.*/
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
+{
+	int status;
+	handle_t *handle;
+	struct inode *local_alloc_inode = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_dinode *alloc = NULL;
+
+	cancel_delayed_work(&osb->la_enable_wq);
+	flush_workqueue(ocfs2_wq);
+
+	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
+		goto out;
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto out;
+	}
+
+	osb->local_alloc_state = OCFS2_LA_DISABLED;
+
+	ocfs2_resmap_uninit(&osb->osb_la_resmap);
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_mutex;
+	}
+
+	/* WINDOW_MOVE_CREDITS is a bit heavy... */
+	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		handle = NULL;
+		goto out_unlock;
+	}
+
+	bh = osb->local_alloc_bh;
+	alloc = (struct ocfs2_dinode *) bh->b_data;
+
+	alloc_copy = kmalloc(bh->b_size, GFP_NOFS);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		goto out_commit;
+	}
+	memcpy(alloc_copy, alloc, bh->b_size);
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
+					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+	ocfs2_journal_dirty(handle, bh);
+
+	brelse(bh);
+	osb->local_alloc_bh = NULL;
+	osb->local_alloc_state = OCFS2_LA_UNUSED;
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	brelse(main_bm_bh);
+
+	ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+
+out:
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	kfree(alloc_copy);
+}
+
+/*
+ * We want to free the bitmap bits outside of any recovery context as
+ * we'll need a cluster lock to do so, but we must clear the local
+ * alloc before giving up the recovered nodes journal. To solve this,
+ * we kmalloc a copy of the local alloc before it's change for the
+ * caller to process with ocfs2_complete_local_alloc_recovery
+ */
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int slot_num,
+				     struct ocfs2_dinode **alloc_copy)
+{
+	int status = 0;
+	struct buffer_head *alloc_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *alloc;
+
+	trace_ocfs2_begin_local_alloc_recovery(slot_num);
+
+	*alloc_copy = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+					     OCFS2_BH_IGNORE_CACHE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
+	if (!(*alloc_copy)) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	ocfs2_clear_local_alloc(alloc);
+
+	ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
+	status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode));
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (status < 0) {
+		kfree(*alloc_copy);
+		*alloc_copy = NULL;
+	}
+
+	brelse(alloc_bh);
+
+	if (inode) {
+		mutex_unlock(&inode->i_mutex);
+		iput(inode);
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * Step 2: By now, we've completed the journal recovery, we've stamped
+ * a clean local alloc on disk and dropped the node out of the
+ * recovery map. Dlm locks will no longer stall, so lets clear out the
+ * main bitmap.
+ */
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc)
+{
+	int status;
+	handle_t *handle;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto out_unlock;
+	}
+
+	/* we want the bitmap change to be recorded on disk asap */
+	handle->h_sync = 1;
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+
+	brelse(main_bm_bh);
+
+	iput(main_bm_inode);
+
+out:
+	if (!status)
+		ocfs2_init_steal_slots(osb);
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * make sure we've got at least bits_wanted contiguous bits in the
+ * local alloc. You lose them when you drop i_mutex.
+ *
+ * We will add ourselves to the transaction passed in, but may start
+ * our own in order to shift windows.
+ */
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac)
+{
+	int status;
+	struct ocfs2_dinode *alloc;
+	struct inode *local_alloc_inode;
+	unsigned int free_bits;
+
+	BUG_ON(!ac);
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mutex_lock(&local_alloc_inode->i_mutex);
+
+	/*
+	 * We must double check state and allocator bits because
+	 * another process may have changed them while holding i_mutex.
+	 */
+	spin_lock(&osb->osb_lock);
+	if (!ocfs2_la_state_enabled(osb) ||
+	    (bits_wanted > osb->local_alloc_bits)) {
+		spin_unlock(&osb->osb_lock);
+		status = -ENOSPC;
+		goto bail;
+	}
+	spin_unlock(&osb->osb_lock);
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
+	    ocfs2_local_alloc_count_bits(alloc)) {
+		ocfs2_error(osb->sb, "local alloc inode %llu says it has "
+			    "%u used bits, but a count shows %u",
+			    (unsigned long long)le64_to_cpu(alloc->i_blkno),
+			    le32_to_cpu(alloc->id1.bitmap1.i_used),
+			    ocfs2_local_alloc_count_bits(alloc));
+		status = -EIO;
+		goto bail;
+	}
+#endif
+
+	free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+		le32_to_cpu(alloc->id1.bitmap1.i_used);
+	if (bits_wanted > free_bits) {
+		/* uhoh, window change time. */
+		status =
+			ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		/*
+		 * Under certain conditions, the window slide code
+		 * might have reduced the number of bits available or
+		 * disabled the the local alloc entirely. Re-check
+		 * here and return -ENOSPC if necessary.
+		 */
+		status = -ENOSPC;
+		if (!ocfs2_la_state_enabled(osb))
+			goto bail;
+
+		free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+			le32_to_cpu(alloc->id1.bitmap1.i_used);
+		if (bits_wanted > free_bits)
+			goto bail;
+	}
+
+	ac->ac_inode = local_alloc_inode;
+	/* We should never use localalloc from another slot */
+	ac->ac_alloc_slot = osb->slot_num;
+	ac->ac_which = OCFS2_AC_USE_LOCAL;
+	get_bh(osb->local_alloc_bh);
+	ac->ac_bh = osb->local_alloc_bh;
+	status = 0;
+bail:
+	if (status < 0 && local_alloc_inode) {
+		mutex_unlock(&local_alloc_inode->i_mutex);
+		iput(local_alloc_inode);
+	}
+
+	trace_ocfs2_reserve_local_alloc_bits(
+		(unsigned long long)ac->ac_max_block,
+		bits_wanted, osb->slot_num, status);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 bits_wanted,
+				 u32 *bit_off,
+				 u32 *num_bits)
+{
+	int status, start;
+	struct inode *local_alloc_inode;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+						  ac->ac_resv);
+	if (start == -1) {
+		/* TODO: Shouldn't we just BUG here? */
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bitmap = la->la_bitmap;
+	*bit_off = le32_to_cpu(la->la_bm_off) + start;
+	*num_bits = bits_wanted;
+
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(local_alloc_inode),
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+				  bits_wanted);
+
+	while(bits_wanted--)
+		ocfs2_set_bit(start++, bitmap);
+
+	le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+				handle_t *handle,
+				struct ocfs2_alloc_context *ac,
+				u32 bit_off,
+				u32 num_bits)
+{
+	int status, start;
+	u32 clear_bits;
+	struct inode *local_alloc_inode;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	bitmap = la->la_bitmap;
+	start = bit_off - le32_to_cpu(la->la_bm_off);
+	clear_bits = num_bits;
+
+	status = ocfs2_journal_access_di(handle,
+			INODE_CACHE(local_alloc_inode),
+			osb->local_alloc_bh,
+			OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while (clear_bits--)
+		ocfs2_clear_bit(start++, bitmap);
+
+	le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+	return status;
+}
+
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
+{
+	u32 count;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	count = memweight(la->la_bitmap, le16_to_cpu(la->la_size));
+
+	trace_ocfs2_local_alloc_count_bits(count);
+	return count;
+}
+
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+				     struct ocfs2_dinode *alloc,
+				     u32 *numbits,
+				     struct ocfs2_alloc_reservation *resv)
+{
+	int numfound = 0, bitoff, left, startoff, lastzero;
+	int local_resv = 0;
+	struct ocfs2_alloc_reservation r;
+	void *bitmap = NULL;
+	struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
+
+	if (!alloc->id1.bitmap1.i_total) {
+		bitoff = -1;
+		goto bail;
+	}
+
+	if (!resv) {
+		local_resv = 1;
+		ocfs2_resv_init_once(&r);
+		ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+		resv = &r;
+	}
+
+	numfound = *numbits;
+	if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+		if (numfound < *numbits)
+			*numbits = numfound;
+		goto bail;
+	}
+
+	/*
+	 * Code error. While reservations are enabled, local
+	 * allocation should _always_ go through them.
+	 */
+	BUG_ON(osb->osb_resv_level != 0);
+
+	/*
+	 * Reservations are disabled. Handle this the old way.
+	 */
+
+	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
+
+	numfound = bitoff = startoff = 0;
+	lastzero = -1;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+	while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
+		if (bitoff == left) {
+			/* mlog(0, "bitoff (%d) == left", bitoff); */
+			break;
+		}
+		/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
+		   "numfound = %d\n", bitoff, startoff, numfound);*/
+
+		/* Ok, we found a zero bit... is it contig. or do we
+		 * start over?*/
+		if (bitoff == startoff) {
+			/* we found a zero */
+			numfound++;
+			startoff++;
+		} else {
+			/* got a zero after some ones */
+			numfound = 1;
+			startoff = bitoff+1;
+		}
+		/* we got everything we needed */
+		if (numfound == *numbits) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
+
+	if (numfound == *numbits)
+		bitoff = startoff - numfound;
+	else
+		bitoff = -1;
+
+bail:
+	if (local_resv)
+		ocfs2_resv_discard(resmap, resv);
+
+	trace_ocfs2_local_alloc_find_clear_bits(*numbits,
+		le32_to_cpu(alloc->id1.bitmap1.i_total),
+		bitoff, numfound);
+
+	return bitoff;
+}
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
+{
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+	int i;
+
+	alloc->id1.bitmap1.i_total = 0;
+	alloc->id1.bitmap1.i_used = 0;
+	la->la_bm_off = 0;
+	for(i = 0; i < le16_to_cpu(la->la_size); i++)
+		la->la_bitmap[i] = 0;
+}
+
+#if 0
+/* turn this on and uncomment below to aid debugging window shifts. */
+static void ocfs2_verify_zero_bits(unsigned long *bitmap,
+				   unsigned int start,
+				   unsigned int count)
+{
+	unsigned int tmp = count;
+	while(tmp--) {
+		if (ocfs2_test_bit(start + tmp, bitmap)) {
+			printk("ocfs2_verify_zero_bits: start = %u, count = "
+			       "%u\n", start, count);
+			printk("ocfs2_verify_zero_bits: bit %u is set!",
+			       start + tmp);
+			BUG();
+		}
+	}
+}
+#endif
+
+/*
+ * sync the local alloc to main bitmap.
+ *
+ * assumes you've already locked the main bitmap -- the bitmap inode
+ * passed is used for caching.
+ */
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    handle_t *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh)
+{
+	int status = 0;
+	int bit_off, left, count, start;
+	u64 la_start_blk;
+	u64 blkno;
+	void *bitmap;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	trace_ocfs2_sync_local_to_main(
+	     le32_to_cpu(alloc->id1.bitmap1.i_total),
+	     le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	if (!alloc->id1.bitmap1.i_total) {
+		goto bail;
+	}
+
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
+	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
+		goto bail;
+	}
+
+	la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
+						le32_to_cpu(la->la_bm_off));
+	bitmap = la->la_bitmap;
+	start = count = bit_off = 0;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+
+	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
+	       != -1) {
+		if ((bit_off < left) && (bit_off == start)) {
+			count++;
+			start++;
+			continue;
+		}
+		if (count) {
+			blkno = la_start_blk +
+				ocfs2_clusters_to_blocks(osb->sb,
+							 start - count);
+
+			trace_ocfs2_sync_local_to_main_free(
+			     count, start - count,
+			     (unsigned long long)la_start_blk,
+			     (unsigned long long)blkno);
+
+			status = ocfs2_release_clusters(handle,
+							main_bm_inode,
+							main_bm_bh, blkno,
+							count);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		if (bit_off >= left)
+			break;
+		count = 1;
+		start = bit_off + 1;
+	}
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+enum ocfs2_la_event {
+	OCFS2_LA_EVENT_SLIDE,		/* Normal window slide. */
+	OCFS2_LA_EVENT_FRAGMENTED,	/* The global bitmap has
+					 * enough bits theoretically
+					 * free, but a contiguous
+					 * allocation could not be
+					 * found. */
+	OCFS2_LA_EVENT_ENOSPC,		/* Global bitmap doesn't have
+					 * enough bits free to satisfy
+					 * our request. */
+};
+#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
+/*
+ * Given an event, calculate the size of our next local alloc window.
+ *
+ * This should always be called under i_mutex of the local alloc inode
+ * so that local alloc disabling doesn't race with processes trying to
+ * use the allocator.
+ *
+ * Returns the state which the local alloc was left in. This value can
+ * be ignored by some paths.
+ */
+static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
+				  enum ocfs2_la_event event)
+{
+	unsigned int bits;
+	int state;
+
+	spin_lock(&osb->osb_lock);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
+		WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
+		goto out_unlock;
+	}
+
+	/*
+	 * ENOSPC and fragmentation are treated similarly for now.
+	 */
+	if (event == OCFS2_LA_EVENT_ENOSPC ||
+	    event == OCFS2_LA_EVENT_FRAGMENTED) {
+		/*
+		 * We ran out of contiguous space in the primary
+		 * bitmap. Drastically reduce the number of bits used
+		 * by local alloc until we have to disable it.
+		 */
+		bits = osb->local_alloc_bits >> 1;
+		if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
+			/*
+			 * By setting state to THROTTLED, we'll keep
+			 * the number of local alloc bits used down
+			 * until an event occurs which would give us
+			 * reason to assume the bitmap situation might
+			 * have changed.
+			 */
+			osb->local_alloc_state = OCFS2_LA_THROTTLED;
+			osb->local_alloc_bits = bits;
+		} else {
+			osb->local_alloc_state = OCFS2_LA_DISABLED;
+		}
+		queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+				   OCFS2_LA_ENABLE_INTERVAL);
+		goto out_unlock;
+	}
+
+	/*
+	 * Don't increase the size of the local alloc window until we
+	 * know we might be able to fulfill the request. Otherwise, we
+	 * risk bouncing around the global bitmap during periods of
+	 * low space.
+	 */
+	if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
+		osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+out_unlock:
+	state = osb->local_alloc_state;
+	spin_unlock(&osb->osb_lock);
+
+	return state;
+}
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh)
+{
+	int status;
+
+	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+retry_enospc:
+	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
+	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+	if (status == -ENOSPC) {
+		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
+		    OCFS2_LA_DISABLED)
+			goto bail;
+
+		ocfs2_free_ac_resource(*ac);
+		memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
+		goto retry_enospc;
+	}
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*bitmap_inode = (*ac)->ac_inode;
+	igrab(*bitmap_inode);
+	*bitmap_bh = (*ac)->ac_bh;
+	get_bh(*bitmap_bh);
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * pass it the bitmap lock in lock_bh if you have it.
+ */
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					handle_t *handle,
+					struct ocfs2_alloc_context *ac)
+{
+	int status = 0;
+	u32 cluster_off, cluster_count;
+	struct ocfs2_dinode *alloc = NULL;
+	struct ocfs2_local_alloc *la;
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	trace_ocfs2_local_alloc_new_window(
+		le32_to_cpu(alloc->id1.bitmap1.i_total),
+		osb->local_alloc_bits);
+
+	/* Instruct the allocation code to try the most recently used
+	 * cluster group. We'll re-record the group used this pass
+	 * below. */
+	ac->ac_last_group = osb->la_last_gd;
+
+	/* we used the generic suballoc reserve function, but we set
+	 * everything up nicely, so there's no reason why we can't use
+	 * the more specific cluster api to claim bits. */
+	status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
+				      &cluster_off, &cluster_count);
+	if (status == -ENOSPC) {
+retry_enospc:
+		/*
+		 * Note: We could also try syncing the journal here to
+		 * allow use of any free bits which the current
+		 * transaction can't give us access to. --Mark
+		 */
+		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
+		    OCFS2_LA_DISABLED)
+			goto bail;
+
+		ac->ac_bits_wanted = osb->local_alloc_bits;
+		status = ocfs2_claim_clusters(handle, ac,
+					      osb->local_alloc_bits,
+					      &cluster_off,
+					      &cluster_count);
+		if (status == -ENOSPC)
+			goto retry_enospc;
+		/*
+		 * We only shrunk the *minimum* number of in our
+		 * request - it's entirely possible that the allocator
+		 * might give us more than we asked for.
+		 */
+		if (status == 0) {
+			spin_lock(&osb->osb_lock);
+			osb->local_alloc_bits = cluster_count;
+			spin_unlock(&osb->osb_lock);
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	osb->la_last_gd = ac->ac_last_group;
+
+	la->la_bm_off = cpu_to_le32(cluster_off);
+	alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
+	/* just in case... In the future when we find space ourselves,
+	 * we don't have to get all contiguous -- but we'll have to
+	 * set all previously used bits in bitmap and update
+	 * la_bits_set before setting the bits in the main bitmap. */
+	alloc->id1.bitmap1.i_used = 0;
+	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
+	       le16_to_cpu(la->la_size));
+
+	ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+			     OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
+	trace_ocfs2_local_alloc_new_window_result(
+		OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+		le32_to_cpu(alloc->id1.bitmap1.i_total));
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/* Note that we do *NOT* lock the local alloc inode here as
+ * it's been locked already for us. */
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode)
+{
+	int status = 0;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_alloc_context *ac = NULL;
+
+	ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
+
+	/* This will lock the main bitmap for us. */
+	status = ocfs2_local_alloc_reserve_for_window(osb,
+						      &ac,
+						      &main_bm_inode,
+						      &main_bm_bh);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	/* We want to clear the local alloc before doing anything
+	 * else, so that if we error later during this operation,
+	 * local alloc shutdown won't try to double free main bitmap
+	 * bits. Make a copy so the sync function knows which bits to
+	 * free. */
+	alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
+
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(local_alloc_inode),
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_local_alloc_new_window(osb, handle, ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_inc(&osb->alloc_stats.moves);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	kfree(alloc_copy);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
diff --git a/kernel/fs/ocfs2/localalloc.h b/kernel/fs/ocfs2/localalloc.h
new file mode 100644
index 000000000..44a7d1fb2
--- /dev/null
+++ b/kernel/fs/ocfs2/localalloc.h
@@ -0,0 +1,68 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_LOCALALLOC_H
+#define OCFS2_LOCALALLOC_H
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb);
+
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int node_num,
+				     struct ocfs2_dinode **alloc_copy);
+
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc);
+
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
+				 u64 bits);
+
+struct ocfs2_alloc_context;
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac);
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 bits_wanted,
+				 u32 *bit_off,
+				 u32 *num_bits);
+
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+				handle_t *handle,
+				struct ocfs2_alloc_context *ac,
+				u32 bit_off,
+				u32 num_bits);
+
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+				      unsigned int num_clusters);
+void ocfs2_la_enable_worker(struct work_struct *work);
+
+#endif /* OCFS2_LOCALALLOC_H */
diff --git a/kernel/fs/ocfs2/locks.c b/kernel/fs/ocfs2/locks.c
new file mode 100644
index 000000000..6b6d092b0
--- /dev/null
+++ b/kernel/fs/ocfs2/locks.c
@@ -0,0 +1,141 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * locks.c
+ *
+ * Userspace file locking support
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/fcntl.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "locks.h"
+
+static int ocfs2_do_flock(struct file *file, struct inode *inode,
+			  int cmd, struct file_lock *fl)
+{
+	int ret = 0, level = 0, trylock = 0;
+	struct ocfs2_file_private *fp = file->private_data;
+	struct ocfs2_lock_res *lockres = &fp->fp_flock;
+
+	if (fl->fl_type == F_WRLCK)
+		level = 1;
+	if (!IS_SETLKW(cmd))
+		trylock = 1;
+
+	mutex_lock(&fp->fp_mutex);
+
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+	    lockres->l_level > LKM_NLMODE) {
+		int old_level = 0;
+
+		if (lockres->l_level == LKM_EXMODE)
+			old_level = 1;
+
+		if (level == old_level)
+			goto out;
+
+		/*
+		 * Converting an existing lock is not guaranteed to be
+		 * atomic, so we can get away with simply unlocking
+		 * here and allowing the lock code to try at the new
+		 * level.
+		 */
+
+		flock_lock_file_wait(file,
+				     &(struct file_lock){.fl_type = F_UNLCK});
+
+		ocfs2_file_unlock(file);
+	}
+
+	ret = ocfs2_file_lock(file, level, trylock);
+	if (ret) {
+		if (ret == -EAGAIN && trylock)
+			ret = -EWOULDBLOCK;
+		else
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = flock_lock_file_wait(file, fl);
+	if (ret)
+		ocfs2_file_unlock(file);
+
+out:
+	mutex_unlock(&fp->fp_mutex);
+
+	return ret;
+}
+
+static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
+{
+	int ret;
+	struct ocfs2_file_private *fp = file->private_data;
+
+	mutex_lock(&fp->fp_mutex);
+	ocfs2_file_unlock(file);
+	ret = flock_lock_file_wait(file, fl);
+	mutex_unlock(&fp->fp_mutex);
+
+	return ret;
+}
+
+/*
+ * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
+ */
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+	if (__mandatory_lock(inode))
+		return -ENOLCK;
+
+	if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+	    ocfs2_mount_local(osb))
+		return flock_lock_file_wait(file, fl);
+
+	if (fl->fl_type == F_UNLCK)
+		return ocfs2_do_funlock(file, cmd, fl);
+	else
+		return ocfs2_do_flock(file, inode, cmd, fl);
+}
+
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!(fl->fl_flags & FL_POSIX))
+		return -ENOLCK;
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
+}
diff --git a/kernel/fs/ocfs2/locks.h b/kernel/fs/ocfs2/locks.h
new file mode 100644
index 000000000..496d488b2
--- /dev/null
+++ b/kernel/fs/ocfs2/locks.h
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * locks.h
+ *
+ * Function prototypes for Userspace file locking support
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_LOCKS_H
+#define OCFS2_LOCKS_H
+
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
+
+#endif /* OCFS2_LOCKS_H */
diff --git a/kernel/fs/ocfs2/mmap.c b/kernel/fs/ocfs2/mmap.c
new file mode 100644
index 000000000..9581d190f
--- /dev/null
+++ b/kernel/fs/ocfs2/mmap.c
@@ -0,0 +1,193 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mmap.c
+ *
+ * Code to deal with the mess that is clustered mmap.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/signal.h>
+#include <linux/rbtree.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "aops.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "mmap.h"
+#include "super.h"
+#include "ocfs2_trace.h"
+
+
+static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
+{
+	sigset_t oldset;
+	int ret;
+
+	ocfs2_block_signals(&oldset);
+	ret = filemap_fault(area, vmf);
+	ocfs2_unblock_signals(&oldset);
+
+	trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
+			  area, vmf->page, vmf->pgoff);
+	return ret;
+}
+
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
+				struct page *page)
+{
+	int ret = VM_FAULT_NOPAGE;
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	loff_t pos = page_offset(page);
+	unsigned int len = PAGE_CACHE_SIZE;
+	pgoff_t last_index;
+	struct page *locked_page = NULL;
+	void *fsdata;
+	loff_t size = i_size_read(inode);
+
+	last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+
+	/*
+	 * There are cases that lead to the page no longer bebongs to the
+	 * mapping.
+	 * 1) pagecache truncates locally due to memory pressure.
+	 * 2) pagecache truncates when another is taking EX lock against 
+	 * inode lock. see ocfs2_data_convert_worker.
+	 * 
+	 * The i_size check doesn't catch the case where nodes truncated and
+	 * then re-extended the file. We'll re-check the page mapping after
+	 * taking the page lock inside of ocfs2_write_begin_nolock().
+	 *
+	 * Let VM retry with these cases.
+	 */
+	if ((page->mapping != inode->i_mapping) ||
+	    (!PageUptodate(page)) ||
+	    (page_offset(page) >= size))
+		goto out;
+
+	/*
+	 * Call ocfs2_write_begin() and ocfs2_write_end() to take
+	 * advantage of the allocation code there. We pass a write
+	 * length of the whole page (chopped to i_size) to make sure
+	 * the whole thing is allocated.
+	 *
+	 * Since we know the page is up to date, we don't have to
+	 * worry about ocfs2_write_begin() skipping some buffer reads
+	 * because the "write" would invalidate their data.
+	 */
+	if (page->index == last_index)
+		len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
+
+	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
+				       &fsdata, di_bh, page);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else
+			ret = VM_FAULT_SIGBUS;
+		goto out;
+	}
+
+	if (!locked_page) {
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+	ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+				     fsdata);
+	BUG_ON(ret != len);
+	ret = VM_FAULT_LOCKED;
+out:
+	return ret;
+}
+
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct buffer_head *di_bh = NULL;
+	sigset_t oldset;
+	int ret;
+
+	sb_start_pagefault(inode->i_sb);
+	ocfs2_block_signals(&oldset);
+
+	/*
+	 * The cluster locks taken will block a truncate from another
+	 * node. Taking the data lock will also ensure that we don't
+	 * attempt page truncation as part of a downconvert.
+	 */
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * The alloc sem should be enough to serialize with
+	 * ocfs2_truncate_file() changing i_size as well as any thread
+	 * modifying the inode btree.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
+
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	brelse(di_bh);
+	ocfs2_inode_unlock(inode, 1);
+
+out:
+	ocfs2_unblock_signals(&oldset);
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+}
+
+static const struct vm_operations_struct ocfs2_file_vm_ops = {
+	.fault		= ocfs2_fault,
+	.page_mkwrite	= ocfs2_page_mkwrite,
+};
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int ret = 0, lock_level = 0;
+
+	ret = ocfs2_inode_lock_atime(file_inode(file),
+				    file->f_path.mnt, &lock_level);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_inode_unlock(file_inode(file), lock_level);
+out:
+	vma->vm_ops = &ocfs2_file_vm_ops;
+	return 0;
+}
+
diff --git a/kernel/fs/ocfs2/mmap.h b/kernel/fs/ocfs2/mmap.h
new file mode 100644
index 000000000..1274ee0f1
--- /dev/null
+++ b/kernel/fs/ocfs2/mmap.h
@@ -0,0 +1,6 @@
+#ifndef OCFS2_MMAP_H
+#define OCFS2_MMAP_H
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+
+#endif  /* OCFS2_MMAP_H */
diff --git a/kernel/fs/ocfs2/move_extents.c b/kernel/fs/ocfs2/move_extents.c
new file mode 100644
index 000000000..56a768d06
--- /dev/null
+++ b/kernel/fs/ocfs2/move_extents.c
@@ -0,0 +1,1075 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.c
+ *
+ * Copyright (C) 2011 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_ioctl.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "super.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "sysfile.h"
+#include "refcounttree.h"
+#include "move_extents.h"
+
+struct ocfs2_move_extents_context {
+	struct inode *inode;
+	struct file *file;
+	int auto_defrag;
+	int partial;
+	int credits;
+	u32 new_phys_cpos;
+	u32 clusters_moved;
+	u64 refcount_loc;
+	struct ocfs2_move_extents *range;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+};
+
+static int __ocfs2_move_extent(handle_t *handle,
+			       struct ocfs2_move_extents_context *context,
+			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
+			       int ext_flags)
+{
+	int ret = 0, index;
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_rec *rec, replace_rec;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
+	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
+
+	ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
+					       p_cpos, new_p_cpos, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	memset(&replace_rec, 0, sizeof(replace_rec));
+	replace_rec.e_cpos = cpu_to_le32(cpos);
+	replace_rec.e_leaf_clusters = cpu_to_le16(len);
+	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+								   new_p_cpos));
+
+	path = ocfs2_new_path_from_et(&context->et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ino, cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	rec = &el->l_recs[index];
+
+	BUG_ON(ext_flags != rec->e_flags);
+	/*
+	 * after moving/defraging to new location, the extent is not going
+	 * to be refcounted anymore.
+	 */
+	replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+				      context->et.et_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_split_extent(handle, &context->et, path, index,
+				 &replace_rec, context->meta_ac,
+				 &context->dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_journal_dirty(handle, context->et.et_root_bh);
+
+	context->new_phys_cpos = new_p_cpos;
+
+	/*
+	 * need I to append truncate log for old clusters?
+	 */
+	if (old_blkno) {
+		if (ext_flags & OCFS2_EXT_REFCOUNTED)
+			ret = ocfs2_decrease_refcount(inode, handle,
+					ocfs2_blocks_to_clusters(osb->sb,
+								 old_blkno),
+					len, context->meta_ac,
+					&context->dealloc, 1);
+		else
+			ret = ocfs2_truncate_log_append(osb, handle,
+							old_blkno, len);
+	}
+
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+/*
+ * lock allocators, and reserving appropriate number of bits for
+ * meta blocks and data clusters.
+ *
+ * in some cases, we don't need to reserve clusters, just let data_ac
+ * be NULL.
+ */
+static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+					struct ocfs2_extent_tree *et,
+					u32 clusters_to_move,
+					u32 extents_to_split,
+					struct ocfs2_alloc_context **meta_ac,
+					struct ocfs2_alloc_context **data_ac,
+					int extra_blocks,
+					int *credits)
+{
+	int ret, num_free_extents;
+	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	num_free_extents = ocfs2_num_free_extents(osb, et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (data_ac) {
+		ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
+
+	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
+	     extra_blocks, clusters_to_move, *credits);
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Using one journal handle to guarantee the data consistency in case
+ * crash happens anywhere.
+ *
+ *  XXX: defrag can end up with finishing partial extent as requested,
+ * due to not enough contiguous clusters can be found in allocator.
+ */
+static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
+			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
+{
+	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	u32 new_phys_cpos, new_len;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
+
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		BUG_ON(!context->refcount_loc);
+
+		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+					       &ref_tree, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_prepare_refcount_change_for_del(inode,
+							context->refcount_loc,
+							phys_blkno,
+							*len,
+							&credits,
+							&extra_blocks);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
+						 &context->meta_ac,
+						 &context->data_ac,
+						 extra_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * should be using allocation reservation strategy there?
+	 *
+	 * if (context->data_ac)
+	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+	 */
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock_mutex;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock_mutex;
+	}
+
+	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
+				     &new_phys_cpos, &new_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * allowing partial extent moving is kind of 'pros and cons', it makes
+	 * whole defragmentation less likely to fail, on the contrary, the bad
+	 * thing is it may make the fs even more fragmented after moving, let
+	 * userspace make a good decision here.
+	 */
+	if (new_len != *len) {
+		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
+		if (!partial) {
+			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
+			ret = -ENOSPC;
+			goto out_commit;
+		}
+	}
+
+	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
+	     phys_cpos, new_phys_cpos);
+
+	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
+				  new_phys_cpos, ext_flags);
+	if (ret)
+		mlog_errno(ret);
+
+	if (partial && (new_len != *len))
+		*len = new_len;
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 */
+	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock_mutex:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (context->data_ac) {
+		ocfs2_free_alloc_context(context->data_ac);
+		context->data_ac = NULL;
+	}
+
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+out:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+	return ret;
+}
+
+/*
+ * find the victim alloc group, where #blkno fits.
+ */
+static int ocfs2_find_victim_alloc_group(struct inode *inode,
+					 u64 vict_blkno,
+					 int type, int slot,
+					 int *vict_bit,
+					 struct buffer_head **ret_bh)
+{
+	int ret, i, bits_per_unit = 0;
+	u64 blkno;
+	char namebuf[40];
+
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_chain_rec *rec;
+	struct ocfs2_dinode *ac_dinode;
+	struct ocfs2_group_desc *bg;
+
+	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+					 strlen(namebuf), &blkno);
+	if (ret) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
+	cl = &(ac_dinode->id2.i_chain);
+	rec = &(cl->cl_recs[0]);
+
+	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
+		bits_per_unit = osb->s_clustersize_bits -
+					inode->i_sb->s_blocksize_bits;
+	/*
+	 * 'vict_blkno' was out of the valid range.
+	 */
+	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
+	    (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
+				bits_per_unit))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+
+		rec = &(cl->cl_recs[i]);
+		if (!rec)
+			continue;
+
+		bg = NULL;
+
+		do {
+			if (!bg)
+				blkno = le64_to_cpu(rec->c_blkno);
+			else
+				blkno = le64_to_cpu(bg->bg_next_group);
+
+			if (gd_bh) {
+				brelse(gd_bh);
+				gd_bh = NULL;
+			}
+
+			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+			if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
+						le16_to_cpu(bg->bg_bits))) {
+
+				*ret_bh = gd_bh;
+				*vict_bit = (vict_blkno - blkno) >>
+							bits_per_unit;
+				mlog(0, "find the victim group: #%llu, "
+				     "total_bits: %u, vict_bit: %u\n",
+				     blkno, le16_to_cpu(bg->bg_bits),
+				     *vict_bit);
+				goto out;
+			}
+
+		} while (le64_to_cpu(bg->bg_next_group));
+	}
+
+	ret = -EINVAL;
+out:
+	brelse(ac_bh);
+
+	/*
+	 * caller has to release the gd_bh properly.
+	 */
+	return ret;
+}
+
+/*
+ * XXX: helper to validate and adjust moving goal.
+ */
+static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
+					       struct ocfs2_move_extents *range)
+{
+	int ret, goal_bit = 0;
+
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_group_desc *bg;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int c_to_b = 1 << (osb->s_clustersize_bits -
+					inode->i_sb->s_blocksize_bits);
+
+	/*
+	 * make goal become cluster aligned.
+	 */
+	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
+						      range->me_goal);
+	/*
+	 * validate goal sits within global_bitmap, and return the victim
+	 * group desc
+	 */
+	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT,
+					    &goal_bit, &gd_bh);
+	if (ret)
+		goto out;
+
+	bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+	/*
+	 * moving goal is not allowd to start with a group desc blok(#0 blk)
+	 * let's compromise to the latter cluster.
+	 */
+	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+		range->me_goal += c_to_b;
+
+	/*
+	 * movement is not gonna cross two groups.
+	 */
+	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
+								range->me_len) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/*
+	 * more exact validations/adjustments will be performed later during
+	 * moving operation for each extent range.
+	 */
+	mlog(0, "extents get ready to be moved to #%llu block\n",
+	     range->me_goal);
+
+out:
+	brelse(gd_bh);
+
+	return ret;
+}
+
+static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
+				    int *goal_bit, u32 move_len, u32 max_hop,
+				    u32 *phys_cpos)
+{
+	int i, used, last_free_bits = 0, base_bit = *goal_bit;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+						 le64_to_cpu(gd->bg_blkno));
+
+	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
+
+		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
+		if (used) {
+			/*
+			 * we even tried searching the free chunk by jumping
+			 * a 'max_hop' distance, but still failed.
+			 */
+			if ((i - base_bit) > max_hop) {
+				*phys_cpos = 0;
+				break;
+			}
+
+			if (last_free_bits)
+				last_free_bits = 0;
+
+			continue;
+		} else
+			last_free_bits++;
+
+		if (last_free_bits == move_len) {
+			*goal_bit = i;
+			*phys_cpos = base_cpos + i;
+			break;
+		}
+	}
+
+	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
+}
+
+static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
+			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
+			     u32 len, int ext_flags)
+{
+	int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct inode *gb_inode = NULL;
+	struct buffer_head *gb_bh = NULL;
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_group_desc *gd;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
+						    context->range->me_threshold);
+	u64 phys_blkno, new_phys_blkno;
+
+	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
+
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		BUG_ON(!context->refcount_loc);
+
+		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+					       &ref_tree, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_prepare_refcount_change_for_del(inode,
+							context->refcount_loc,
+							phys_blkno,
+							len,
+							&credits,
+							&extra_blocks);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
+						 &context->meta_ac,
+						 NULL, extra_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * need to count 2 extra credits for global_bitmap inode and
+	 * group descriptor.
+	 */
+	credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+
+	/*
+	 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
+	 * logic, while we still need to lock the global_bitmap.
+	 */
+	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+					       OCFS2_INVALID_SLOT);
+	if (!gb_inode) {
+		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	mutex_lock(&gb_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock_gb_mutex;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock_tl_inode;
+	}
+
+	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
+	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT,
+					    &goal_bit, &gd_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * probe the victim cluster group to find a proper
+	 * region to fit wanted movement, it even will perfrom
+	 * a best-effort attempt by compromising to a threshold
+	 * around the goal.
+	 */
+	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
+				new_phys_cpos);
+	if (!*new_phys_cpos) {
+		ret = -ENOSPC;
+		goto out_commit;
+	}
+
+	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
+				  *new_phys_cpos, ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
+					       le16_to_cpu(gd->bg_chain));
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
+					 goal_bit, len);
+	if (ret) {
+		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
+					       le16_to_cpu(gd->bg_chain));
+		mlog_errno(ret);
+	}
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 */
+	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	brelse(gd_bh);
+
+out_unlock_tl_inode:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	ocfs2_inode_unlock(gb_inode, 1);
+out_unlock_gb_mutex:
+	mutex_unlock(&gb_inode->i_mutex);
+	brelse(gb_bh);
+	iput(gb_inode);
+
+out:
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+	return ret;
+}
+
+/*
+ * Helper to calculate the defraging length in one run according to threshold.
+ */
+static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
+					 u32 threshold, int *skip)
+{
+	if ((*alloc_size + *len_defraged) < threshold) {
+		/*
+		 * proceed defragmentation until we meet the thresh
+		 */
+		*len_defraged += *alloc_size;
+	} else if (*len_defraged == 0) {
+		/*
+		 * XXX: skip a large extent.
+		 */
+		*skip = 1;
+	} else {
+		/*
+		 * split this extent to coalesce with former pieces as
+		 * to reach the threshold.
+		 *
+		 * we're done here with one cycle of defragmentation
+		 * in a size of 'thresh', resetting 'len_defraged'
+		 * forces a new defragmentation.
+		 */
+		*alloc_size = threshold - *len_defraged;
+		*len_defraged = 0;
+	}
+}
+
+static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
+				struct ocfs2_move_extents_context *context)
+{
+	int ret = 0, flags, do_defrag, skip = 0;
+	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
+	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
+
+	struct inode *inode = context->inode;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_move_extents *range = context->range;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if ((i_size_read(inode) == 0) || (range->me_len == 0))
+		return 0;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+	/*
+	 * TO-DO XXX:
+	 *
+	 * - xattr extents.
+	 */
+
+	do_defrag = context->auto_defrag;
+
+	/*
+	 * extents moving happens in unit of clusters, for the sake
+	 * of simplicity, we may ignore two clusters where 'byte_start'
+	 * and 'byte_start + len' were within.
+	 */
+	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
+	len_to_move = (range->me_start + range->me_len) >>
+						osb->s_clustersize_bits;
+	if (len_to_move >= move_start)
+		len_to_move -= move_start;
+	else
+		len_to_move = 0;
+
+	if (do_defrag) {
+		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
+		if (defrag_thresh <= 1)
+			goto done;
+	} else
+		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+							 range->me_goal);
+
+	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
+	     "thresh: %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (unsigned long long)range->me_start,
+	     (unsigned long long)range->me_len,
+	     move_start, len_to_move, defrag_thresh);
+
+	cpos = move_start;
+	while (len_to_move) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
+					 &flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > len_to_move)
+			alloc_size = len_to_move;
+
+		/*
+		 * XXX: how to deal with a hole:
+		 *
+		 * - skip the hole of course
+		 * - force a new defragmentation
+		 */
+		if (!phys_cpos) {
+			if (do_defrag)
+				len_defraged = 0;
+
+			goto next;
+		}
+
+		if (do_defrag) {
+			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
+						     defrag_thresh, &skip);
+			/*
+			 * skip large extents
+			 */
+			if (skip) {
+				skip = 0;
+				goto next;
+			}
+
+			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
+			     "alloc_size: %u, len_defraged: %u\n",
+			     cpos, phys_cpos, alloc_size, len_defraged);
+
+			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
+						  &alloc_size, flags);
+		} else {
+			ret = ocfs2_move_extent(context, cpos, phys_cpos,
+						&new_phys_cpos, alloc_size,
+						flags);
+
+			new_phys_cpos += alloc_size;
+		}
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		context->clusters_moved += alloc_size;
+next:
+		cpos += alloc_size;
+		len_to_move -= alloc_size;
+	}
+
+done:
+	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
+
+out:
+	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
+						      context->clusters_moved);
+	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
+						       context->new_phys_cpos);
+
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &context->dealloc);
+
+	return ret;
+}
+
+static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
+{
+	int status;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_dinode *di;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * This prevents concurrent writes from other nodes
+	 */
+	status = ocfs2_rw_lock(inode, 1);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (status) {
+		mlog_errno(status);
+		goto out_rw_unlock;
+	}
+
+	/*
+	 * rememer ip_xattr_sem also needs to be held if necessary
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	status = __ocfs2_move_extents_range(di_bh, context);
+
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	if (status) {
+		mlog_errno(status);
+		goto out_inode_unlock;
+	}
+
+	/*
+	 * We update ctime for these changes
+	 */
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_inode_unlock;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+	brelse(di_bh);
+	ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+	ocfs2_rw_unlock(inode, 1);
+out:
+	mutex_unlock(&inode->i_mutex);
+
+	return status;
+}
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
+{
+	int status;
+
+	struct inode *inode = file_inode(filp);
+	struct ocfs2_move_extents range;
+	struct ocfs2_move_extents_context *context;
+
+	if (!argp)
+		return -EINVAL;
+
+	status = mnt_want_write_file(filp);
+	if (status)
+		return status;
+
+	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
+		status = -EPERM;
+		goto out_drop;
+	}
+
+	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+		status = -EPERM;
+		goto out_drop;
+	}
+
+	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
+	if (!context) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_drop;
+	}
+
+	context->inode = inode;
+	context->file = filp;
+
+	if (copy_from_user(&range, argp, sizeof(range))) {
+		status = -EFAULT;
+		goto out_free;
+	}
+
+	if (range.me_start > i_size_read(inode)) {
+		status = -EINVAL;
+		goto out_free;
+	}
+
+	if (range.me_start + range.me_len > i_size_read(inode))
+			range.me_len = i_size_read(inode) - range.me_start;
+
+	context->range = &range;
+
+	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
+		context->auto_defrag = 1;
+		/*
+		 * ok, the default theshold for the defragmentation
+		 * is 1M, since our maximum clustersize was 1M also.
+		 * any thought?
+		 */
+		if (!range.me_threshold)
+			range.me_threshold = 1024 * 1024;
+
+		if (range.me_threshold > i_size_read(inode))
+			range.me_threshold = i_size_read(inode);
+
+		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
+			context->partial = 1;
+	} else {
+		/*
+		 * first best-effort attempt to validate and adjust the goal
+		 * (physical address in block), while it can't guarantee later
+		 * operation can succeed all the time since global_bitmap may
+		 * change a bit over time.
+		 */
+
+		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
+		if (status)
+			goto out_copy;
+	}
+
+	status = ocfs2_move_extents(context);
+	if (status)
+		mlog_errno(status);
+out_copy:
+	/*
+	 * movement/defragmentation may end up being partially completed,
+	 * that's the reason why we need to return userspace the finished
+	 * length and new_offset even if failure happens somewhere.
+	 */
+	if (copy_to_user(argp, &range, sizeof(range)))
+		status = -EFAULT;
+
+out_free:
+	kfree(context);
+out_drop:
+	mnt_drop_write_file(filp);
+
+	return status;
+}
diff --git a/kernel/fs/ocfs2/move_extents.h b/kernel/fs/ocfs2/move_extents.h
new file mode 100644
index 000000000..4e143e811
--- /dev/null
+++ b/kernel/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.h
+ *
+ * Copyright (C) 2011 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_MOVE_EXTENTS_H
+#define OCFS2_MOVE_EXTENTS_H
+
+int ocfs2_ioctl_move_extents(struct file *filp,  void __user *argp);
+
+#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/kernel/fs/ocfs2/namei.c b/kernel/fs/ocfs2/namei.c
new file mode 100644
index 000000000..176fe6afd
--- /dev/null
+++ b/kernel/fs/ocfs2/namei.c
@@ -0,0 +1,2921 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.c
+ *
+ * Create and rename file, directory, symlinks
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linux Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/quotaops.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "super.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct inode *inode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      handle_t *handle,
+			      struct ocfs2_alloc_context *inode_ac);
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct inode **ret_orphan_dir,
+				    u64 blkno,
+				    char *name,
+				    struct ocfs2_dir_lookup_result *lookup,
+				    bool dio);
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    handle_t *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    char *name,
+			    struct ocfs2_dir_lookup_result *lookup,
+			    struct inode *orphan_dir_inode,
+			    bool dio);
+
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     handle_t *handle,
+				     struct inode *inode,
+				     const char *symname);
+
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2,
+			     int rename);
+
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
+/* An orphan dir name is an 8 byte value, printed as a hex string */
+#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
+
+static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	int status;
+	u64 blkno;
+	struct inode *inode = NULL;
+	struct dentry *ret;
+	struct ocfs2_inode_info *oi;
+
+	trace_ocfs2_lookup(dir, dentry, dentry->d_name.len,
+			   dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno, 0);
+
+	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
+		ret = ERR_PTR(-ENAMETOOLONG);
+		goto bail;
+	}
+
+	status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ret = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_lookup_ino_from_name(dir, dentry->d_name.name,
+					    dentry->d_name.len, &blkno);
+	if (status < 0)
+		goto bail_add;
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
+	if (IS_ERR(inode)) {
+		ret = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	oi = OCFS2_I(inode);
+	/* Clear any orphaned state... If we were able to look up the
+	 * inode from a directory, it certainly can't be orphaned. We
+	 * might have the bad state from a node which intended to
+	 * orphan this inode but crashed before it could commit the
+	 * unlink. */
+	spin_lock(&oi->ip_lock);
+	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
+	spin_unlock(&oi->ip_lock);
+
+bail_add:
+	ret = d_splice_alias(inode, dentry);
+
+	if (inode) {
+		/*
+		 * If d_splice_alias() finds a DCACHE_DISCONNECTED
+		 * dentry, it will d_move() it on top of ourse. The
+		 * return value will indicate this however, so in
+		 * those cases, we switch them around for the locking
+		 * code.
+		 *
+		 * NOTE: This dentry already has ->d_op set from
+		 * ocfs2_get_parent() and ocfs2_get_dentry()
+		 */
+		if (!IS_ERR_OR_NULL(ret))
+			dentry = ret;
+
+		status = ocfs2_dentry_attach_lock(dentry, inode,
+						  OCFS2_I(dir)->ip_blkno);
+		if (status) {
+			mlog_errno(status);
+			ret = ERR_PTR(status);
+			goto bail_unlock;
+		}
+	} else
+		ocfs2_dentry_attach_gen(dentry);
+
+bail_unlock:
+	/* Don't drop the cluster lock until *after* the d_add --
+	 * unlink on another node will message us to remove that
+	 * dentry under this lock so otherwise we can race this with
+	 * the downconvert thread and have a stale dentry. */
+	ocfs2_inode_unlock(dir, 0);
+
+bail:
+
+	trace_ocfs2_lookup_ret(ret);
+
+	return ret;
+}
+
+static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
+{
+	struct inode *inode;
+
+	inode = new_inode(dir->i_sb);
+	if (!inode) {
+		mlog(ML_ERROR, "new_inode failed!\n");
+		return NULL;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	if (S_ISDIR(mode))
+		set_nlink(inode, 2);
+	inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
+	return inode;
+}
+
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+		struct dentry *dentry, struct inode *inode)
+{
+	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+	ocfs2_lock_res_free(&dl->dl_lockres);
+	BUG_ON(dl->dl_count != 1);
+	spin_lock(&dentry_attach_lock);
+	dentry->d_fsdata = NULL;
+	spin_unlock(&dentry_attach_lock);
+	kfree(dl);
+	iput(inode);
+}
+
+static int ocfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       umode_t mode,
+		       dev_t dev)
+{
+	int status = 0;
+	struct buffer_head *parent_fe_bh = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *dirfe;
+	struct buffer_head *new_fe_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	int want_clusters = 0;
+	int want_meta = 0;
+	int xattr_credits = 0;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
+	int did_quota_inode = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	sigset_t oldset;
+	int did_block_signals = 0;
+	struct posix_acl *default_acl = NULL, *acl = NULL;
+	struct ocfs2_dentry_lock *dl = NULL;
+
+	trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			  (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			  (unsigned long)dev, mode);
+
+	dquot_initialize(dir);
+
+	/* get our super block */
+	osb = OCFS2_SB(dir->i_sb);
+
+	status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
+		status = -EMLINK;
+		goto leave;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!ocfs2_read_links_count(dirfe)) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs2_reserve_new_inode(osb, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	inode = ocfs2_get_init_inode(dir, mode);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* get security xattr */
+	status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			si.enable = 0;
+		else {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	/* calculate meta data/clusters for setting security and acl xattr */
+	status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+				       &si, &want_clusters,
+				       &xattr_credits, &want_meta);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* Reserve a cluster if creating an extent based directory. */
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
+		want_clusters += 1;
+
+		/* Dir indexing requires extra space as well */
+		if (ocfs2_supports_indexed_dirs(osb))
+			want_meta++;
+	}
+
+	status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+							    S_ISDIR(mode),
+							    xattr_credits));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* Starting to change things, restart is no longer possible. */
+	ocfs2_block_signals(&oldset);
+	did_block_signals = 1;
+
+	status = dquot_alloc_inode(inode);
+	if (status)
+		goto leave;
+	did_quota_inode = 1;
+
+	/* do the real work now. */
+	status = ocfs2_mknod_locked(osb, dir, inode, dev,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(mode)) {
+		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
+					    new_fe_bh, data_ac, meta_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+
+		status = ocfs2_journal_access_di(handle, INODE_CACHE(dir),
+						 parent_fe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		ocfs2_add_links_count(dirfe, 1);
+		ocfs2_journal_dirty(handle, parent_fe_bh);
+		inc_nlink(dir);
+	}
+
+	if (default_acl) {
+		status = ocfs2_set_acl(handle, inode, new_fe_bh,
+				       ACL_TYPE_DEFAULT, default_acl,
+				       meta_ac, data_ac);
+	}
+	if (!status && acl) {
+		status = ocfs2_set_acl(handle, inode, new_fe_bh,
+				       ACL_TYPE_ACCESS, acl,
+				       meta_ac, data_ac);
+	}
+
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (si.enable) {
+		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+						 meta_ac, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	/*
+	 * Do this before adding the entry to the directory. We add
+	 * also set d_op after success so that ->d_iput() will cleanup
+	 * the dentry lock even if ocfs2_add_entry() fails below.
+	 */
+	status = ocfs2_dentry_attach_lock(dentry, inode,
+					  OCFS2_I(dir)->ip_blkno);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	dl = dentry->d_fsdata;
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	status = 0;
+leave:
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
+	if (status < 0 && did_quota_inode)
+		dquot_free_inode(inode);
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	ocfs2_inode_unlock(dir, 1);
+	if (did_block_signals)
+		ocfs2_unblock_signals(&oldset);
+
+	brelse(new_fe_bh);
+	brelse(parent_fe_bh);
+	kfree(si.value);
+
+	ocfs2_free_dir_lookup_result(&lookup);
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	/*
+	 * We should call iput after the i_mutex of the bitmap been
+	 * unlocked in ocfs2_free_alloc_context, or the
+	 * ocfs2_delete_inode will mutex_lock again.
+	 */
+	if ((status < 0) && inode) {
+		if (dl)
+			ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
+		clear_nlink(inode);
+		iput(inode);
+	}
+
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+static int __ocfs2_mknod_locked(struct inode *dir,
+				struct inode *inode,
+				dev_t dev,
+				struct buffer_head **new_fe_bh,
+				struct buffer_head *parent_fe_bh,
+				handle_t *handle,
+				struct ocfs2_alloc_context *inode_ac,
+				u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
+{
+	int status = 0;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_extent_list *fel;
+	u16 feat;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	*new_fe_bh = NULL;
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
+	OCFS2_I(inode)->ip_blkno = fe_blkno;
+	spin_lock(&osb->osb_lock);
+	inode->i_generation = osb->s_next_generation++;
+	spin_unlock(&osb->osb_lock);
+
+	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
+	if (!*new_fe_bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh);
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+					 *new_fe_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
+	memset(fe, 0, osb->sb->s_blocksize);
+
+	fe->i_generation = cpu_to_le32(inode->i_generation);
+	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
+	fe->i_blkno = cpu_to_le64(fe_blkno);
+	fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
+	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
+	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
+	fe->i_uid = cpu_to_le32(i_uid_read(inode));
+	fe->i_gid = cpu_to_le32(i_gid_read(inode));
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
+
+	ocfs2_set_links_count(fe, inode->i_nlink);
+
+	fe->i_last_eb_blk = 0;
+	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
+	fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
+	fe->i_atime = fe->i_ctime = fe->i_mtime =
+		cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
+		cpu_to_le32(CURRENT_TIME.tv_nsec);
+	fe->i_dtime = 0;
+
+	/*
+	 * If supported, directories start with inline data. If inline
+	 * isn't supported, but indexing is, we start them as indexed.
+	 */
+	feat = le16_to_cpu(fe->i_dyn_features);
+	if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
+		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
+
+		fe->id2.i_data.id_count = cpu_to_le16(
+				ocfs2_max_inline_data_with_xattr(osb->sb, fe));
+	} else {
+		fel = &fe->id2.i_list;
+		fel->l_tree_depth = 0;
+		fel->l_next_free_rec = 0;
+		fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+	}
+
+	ocfs2_journal_dirty(handle, *new_fe_bh);
+
+	ocfs2_populate_inode(inode, fe, 1);
+	ocfs2_ci_set_new(osb, INODE_CACHE(inode));
+	if (!ocfs2_mount_local(osb)) {
+		status = ocfs2_create_new_inode_locks(inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	oi->i_sync_tid = handle->h_transaction->t_tid;
+	oi->i_datasync_tid = handle->h_transaction->t_tid;
+
+leave:
+	if (status < 0) {
+		if (*new_fe_bh) {
+			brelse(*new_fe_bh);
+			*new_fe_bh = NULL;
+		}
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct inode *inode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      handle_t *handle,
+			      struct ocfs2_alloc_context *inode_ac)
+{
+	int status = 0;
+	u64 suballoc_loc, fe_blkno = 0;
+	u16 suballoc_bit;
+
+	*new_fe_bh = NULL;
+
+	status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+				       inode_ac, &suballoc_loc,
+				       &suballoc_bit, &fe_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+				    parent_fe_bh, handle, inode_ac,
+				    fe_blkno, suballoc_loc, suballoc_bit);
+}
+
+static int ocfs2_mkdir(struct inode *dir,
+		       struct dentry *dentry,
+		       umode_t mode)
+{
+	int ret;
+
+	trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			  OCFS2_I(dir)->ip_blkno, mode);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+static int ocfs2_create(struct inode *dir,
+			struct dentry *dentry,
+			umode_t mode,
+			bool excl)
+{
+	int ret;
+
+	trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+static int ocfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	handle_t *handle;
+	struct inode *inode = d_inode(old_dentry);
+	struct inode *old_dir = d_inode(old_dentry->d_parent);
+	int err;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *old_dir_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	sigset_t oldset;
+	u64 old_de_ino;
+
+	trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			 old_dentry->d_name.len, old_dentry->d_name.name,
+			 dentry->d_name.len, dentry->d_name.name);
+
+	if (S_ISDIR(inode->i_mode))
+		return -EPERM;
+
+	dquot_initialize(dir);
+
+	err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+			&parent_fe_bh, dir, 0);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		return err;
+	}
+
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!parent_fe_bh) {
+		if (old_dir_bh) {
+			parent_fe_bh = old_dir_bh;
+			get_bh(parent_fe_bh);
+		} else {
+			mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	if (!dir->i_nlink) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
+			old_dentry->d_name.len, &old_de_ino);
+	if (err) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	/*
+	 * Check whether another node removed the source inode while we
+	 * were in the vfs.
+	 */
+	if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					dentry->d_name.len);
+	if (err)
+		goto out;
+
+	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					   dentry->d_name.name,
+					   dentry->d_name.len, &lookup);
+	if (err < 0) {
+		mlog_errno(err);
+		goto out;
+	}
+
+	err = ocfs2_inode_lock(inode, &fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto out;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
+		err = -EMLINK;
+		goto out_unlock_inode;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(err);
+		goto out_unlock_inode;
+	}
+
+	/* Starting to change things, restart is no longer possible. */
+	ocfs2_block_signals(&oldset);
+
+	err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (err < 0) {
+		mlog_errno(err);
+		goto out_commit;
+	}
+
+	inc_nlink(inode);
+	inode->i_ctime = CURRENT_TIME;
+	ocfs2_set_links_count(fe, inode->i_nlink);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_journal_dirty(handle, fe_bh);
+
+	err = ocfs2_add_entry(handle, dentry, inode,
+			      OCFS2_I(inode)->ip_blkno,
+			      parent_fe_bh, &lookup);
+	if (err) {
+		ocfs2_add_links_count(fe, -1);
+		drop_nlink(inode);
+		mlog_errno(err);
+		goto out_commit;
+	}
+
+	err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
+	if (err) {
+		mlog_errno(err);
+		goto out_commit;
+	}
+
+	ihold(inode);
+	d_instantiate(dentry, inode);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	ocfs2_unblock_signals(&oldset);
+out_unlock_inode:
+	ocfs2_inode_unlock(inode, 1);
+
+out:
+	ocfs2_double_unlock(old_dir, dir);
+
+	brelse(fe_bh);
+	brelse(parent_fe_bh);
+	brelse(old_dir_bh);
+
+	ocfs2_free_dir_lookup_result(&lookup);
+
+	if (err)
+		mlog_errno(err);
+
+	return err;
+}
+
+/*
+ * Takes and drops an exclusive lock on the given dentry. This will
+ * force other nodes to drop it.
+ */
+static int ocfs2_remote_dentry_delete(struct dentry *dentry)
+{
+	int ret;
+
+	ret = ocfs2_dentry_lock(dentry, 1);
+	if (ret)
+		mlog_errno(ret);
+	else
+		ocfs2_dentry_unlock(dentry, 1);
+
+	return ret;
+}
+
+static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode)) {
+		if (inode->i_nlink == 2)
+			return 1;
+		return 0;
+	}
+
+	if (inode->i_nlink == 1)
+		return 1;
+	return 0;
+}
+
+static int ocfs2_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	int child_locked = 0;
+	bool is_unlinkable = false;
+	struct inode *inode = d_inode(dentry);
+	struct inode *orphan_dir = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	u64 blkno;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_node_bh = NULL;
+	handle_t *handle = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+
+	trace_ocfs2_unlink(dir, dentry, dentry->d_name.len,
+			   dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			   (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	dquot_initialize(dir);
+
+	BUG_ON(d_inode(dentry->d_parent) != dir);
+
+	if (inode == osb->root_inode)
+		return -EPERM;
+
+	status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
+					 OI_LS_PARENT);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno, dir,
+					  &lookup);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	if (OCFS2_I(inode)->ip_blkno != blkno) {
+		status = -ENOENT;
+
+		trace_ocfs2_unlink_noent(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno,
+				(unsigned long long)blkno,
+				OCFS2_I(inode)->ip_flags);
+		goto leave;
+	}
+
+	status = ocfs2_inode_lock(inode, &fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+	child_locked = 1;
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
+			status = -ENOTEMPTY;
+			goto leave;
+		}
+	}
+
+	status = ocfs2_remote_dentry_delete(dentry);
+	if (status < 0) {
+		/* This remote delete should succeed under all normal
+		 * circumstances. */
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (ocfs2_inode_is_unlinkable(inode)) {
+		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+						  OCFS2_I(inode)->ip_blkno,
+						  orphan_name, &orphan_insert,
+						  false);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		is_unlinkable = true;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	/* delete the name from the parent dir */
+	status = ocfs2_delete_entry(handle, dir, &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(inode->i_mode))
+		drop_nlink(inode);
+	drop_nlink(inode);
+	ocfs2_set_links_count(fe, inode->i_nlink);
+	ocfs2_journal_dirty(handle, fe_bh);
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	if (S_ISDIR(inode->i_mode))
+		drop_nlink(dir);
+
+	status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		if (S_ISDIR(inode->i_mode))
+			inc_nlink(dir);
+		goto leave;
+	}
+
+	if (is_unlinkable) {
+		status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
+				orphan_name, &orphan_insert, orphan_dir, false);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+leave:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	if (child_locked)
+		ocfs2_inode_unlock(inode, 1);
+
+	ocfs2_inode_unlock(dir, 1);
+
+	if (orphan_dir) {
+		/* This was locked for us in ocfs2_prepare_orphan_dir() */
+		ocfs2_inode_unlock(orphan_dir, 1);
+		mutex_unlock(&orphan_dir->i_mutex);
+		iput(orphan_dir);
+	}
+
+	brelse(fe_bh);
+	brelse(parent_node_bh);
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+	ocfs2_free_dir_lookup_result(&lookup);
+
+	if (status && (status != -ENOTEMPTY) && (status != -ENOENT))
+		mlog_errno(status);
+
+	return status;
+}
+
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+		u64 src_inode_no, u64 dest_inode_no)
+{
+	int ret = 0, i = 0;
+	u64 parent_inode_no = 0;
+	u64 child_inode_no = src_inode_no;
+	struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+	while (1) {
+		child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+		if (IS_ERR(child_inode)) {
+			ret = PTR_ERR(child_inode);
+			break;
+		}
+
+		ret = ocfs2_inode_lock(child_inode, NULL, 0);
+		if (ret < 0) {
+			iput(child_inode);
+			if (ret != -ENOENT)
+				mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+				&parent_inode_no);
+		ocfs2_inode_unlock(child_inode, 0);
+		iput(child_inode);
+		if (ret < 0) {
+			ret = -ENOENT;
+			break;
+		}
+
+		if (parent_inode_no == dest_inode_no) {
+			ret = 1;
+			break;
+		}
+
+		if (parent_inode_no == osb->root_inode->i_ino) {
+			ret = 0;
+			break;
+		}
+
+		child_inode_no = parent_inode_no;
+
+		if (++i >= MAX_LOOKUP_TIMES) {
+			mlog(ML_NOTICE, "max lookup times reached, filesystem "
+					"may have nested directories, "
+					"src inode: %llu, dest inode: %llu.\n",
+					(unsigned long long)src_inode_no,
+					(unsigned long long)dest_inode_no);
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * The only place this should be used is rename and link!
+ * if they have the same id, then the 1st one is the only one locked.
+ */
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2,
+			     int rename)
+{
+	int status;
+	int inode1_is_ancestor, inode2_is_ancestor;
+	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
+	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
+	struct buffer_head **tmpbh;
+	struct inode *tmpinode;
+
+	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+				(unsigned long long)oi2->ip_blkno);
+
+	if (*bh1)
+		*bh1 = NULL;
+	if (*bh2)
+		*bh2 = NULL;
+
+	/* we always want to lock the one with the lower lockid first.
+	 * and if they are nested, we lock ancestor first */
+	if (oi1->ip_blkno != oi2->ip_blkno) {
+		inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+				oi1->ip_blkno);
+		if (inode1_is_ancestor < 0) {
+			status = inode1_is_ancestor;
+			goto bail;
+		}
+
+		inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+				oi2->ip_blkno);
+		if (inode2_is_ancestor < 0) {
+			status = inode2_is_ancestor;
+			goto bail;
+		}
+
+		if ((inode1_is_ancestor == 1) ||
+				(oi1->ip_blkno < oi2->ip_blkno &&
+				inode2_is_ancestor == 0)) {
+			/* switch id1 and id2 around */
+			tmpbh = bh2;
+			bh2 = bh1;
+			bh1 = tmpbh;
+
+			tmpinode = inode2;
+			inode2 = inode1;
+			inode1 = tmpinode;
+		}
+		/* lock id2 */
+		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+				rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* lock id1 */
+	status = ocfs2_inode_lock_nested(inode1, bh1, 1,
+			rename == 1 ?  OI_LS_RENAME2 : OI_LS_PARENT);
+	if (status < 0) {
+		/*
+		 * An error return must mean that no cluster locks
+		 * were held on function exit.
+		 */
+		if (oi1->ip_blkno != oi2->ip_blkno) {
+			ocfs2_inode_unlock(inode2, 1);
+			brelse(*bh2);
+			*bh2 = NULL;
+		}
+
+		if (status != -ENOENT)
+			mlog_errno(status);
+	}
+
+	trace_ocfs2_double_lock_end(
+			(unsigned long long)OCFS2_I(inode1)->ip_blkno,
+			(unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+	ocfs2_inode_unlock(inode1, 1);
+
+	if (inode1 != inode2)
+		ocfs2_inode_unlock(inode2, 1);
+}
+
+static int ocfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
+	int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct inode *orphan_dir = NULL;
+	struct ocfs2_dinode *newfe = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *old_inode_bh = NULL;
+	struct ocfs2_super *osb = NULL;
+	u64 newfe_blkno, old_de_ino;
+	handle_t *handle = NULL;
+	struct buffer_head *old_dir_bh = NULL;
+	struct buffer_head *new_dir_bh = NULL;
+	u32 old_dir_nlink = old_dir->i_nlink;
+	struct ocfs2_dinode *old_di;
+	struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
+	struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
+	struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	struct ocfs2_dir_lookup_result target_insert = { NULL, };
+	bool should_add_orphan = false;
+
+	/* At some point it might be nice to break this function up a
+	 * bit. */
+
+	trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry,
+			   old_dentry->d_name.len, old_dentry->d_name.name,
+			   new_dentry->d_name.len, new_dentry->d_name.name);
+
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
+	osb = OCFS2_SB(old_dir->i_sb);
+
+	if (new_inode) {
+		if (!igrab(new_inode))
+			BUG();
+	}
+
+	/* Assume a directory hierarchy thusly:
+	 * a/b/c
+	 * a/d
+	 * a,b,c, and d are all directories.
+	 *
+	 * from cwd of 'a' on both nodes:
+	 * node1: mv b/c d
+	 * node2: mv d   b/c
+	 *
+	 * And that's why, just like the VFS, we need a file system
+	 * rename lock. */
+	if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
+		status = ocfs2_rename_lock(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		rename_lock = 1;
+
+		/* here we cannot guarantee the inodes haven't just been
+		 * changed, so check if they are nested again */
+		status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+				old_inode->i_ino);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		} else if (status == 1) {
+			status = -EPERM;
+			trace_ocfs2_rename_not_permitted(
+					(unsigned long long)old_inode->i_ino,
+					(unsigned long long)new_dir->i_ino);
+			goto bail;
+		}
+	}
+
+	/* if old and new are the same, this'll just do one lock. */
+	status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+				   &new_dir_bh, new_dir, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	parents_locked = 1;
+
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!new_dir_bh) {
+		if (old_dir_bh) {
+			new_dir_bh = old_dir_bh;
+			get_bh(new_dir_bh);
+		} else {
+			mlog(ML_ERROR, "no old_dir_bh!\n");
+			status = -EIO;
+			goto bail;
+		}
+	}
+
+	/*
+	 * Aside from allowing a meta data update, the locking here
+	 * also ensures that the downconvert thread on other nodes
+	 * won't have to concurrently downconvert the inode and the
+	 * dentry locks.
+	 */
+	status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1,
+					 OI_LS_PARENT);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+	old_child_locked = 1;
+
+	status = ocfs2_remote_dentry_delete(old_dentry);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		u64 old_inode_parent;
+
+		update_dot_dot = 1;
+		status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
+						  old_inode,
+						  &old_inode_dot_dot_res);
+		if (status) {
+			status = -EIO;
+			goto bail;
+		}
+
+		if (old_inode_parent != OCFS2_I(old_dir)->ip_blkno) {
+			status = -EIO;
+			goto bail;
+		}
+
+		if (!new_inode && new_dir != old_dir &&
+		    new_dir->i_nlink >= ocfs2_link_max(osb)) {
+			status = -EMLINK;
+			goto bail;
+		}
+	}
+
+	status = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
+					    old_dentry->d_name.len,
+					    &old_de_ino);
+	if (status) {
+		status = -ENOENT;
+		goto bail;
+	}
+
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	if (old_de_ino != OCFS2_I(old_inode)->ip_blkno) {
+		status = -ENOENT;
+		goto bail;
+	}
+
+	/* check if the target already exists (in which case we need
+	 * to delete it */
+	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
+					  new_dentry->d_name.len,
+					  &newfe_blkno, new_dir,
+					  &target_lookup_res);
+	/* The only error we allow here is -ENOENT because the new
+	 * file not existing is perfectly valid. */
+	if ((status < 0) && (status != -ENOENT)) {
+		/* If we cannot find the file specified we should just */
+		/* return the error... */
+		mlog_errno(status);
+		goto bail;
+	}
+	if (status == 0)
+		target_exists = 1;
+
+	if (!target_exists && new_inode) {
+		/*
+		 * Target was unlinked by another node while we were
+		 * waiting to get to ocfs2_rename(). There isn't
+		 * anything we can do here to help the situation, so
+		 * bubble up the appropriate error.
+		 */
+		status = -ENOENT;
+		goto bail;
+	}
+
+	/* In case we need to overwrite an existing file, we blow it
+	 * away first */
+	if (target_exists) {
+		/* VFS didn't think there existed an inode here, but
+		 * someone else in the cluster must have raced our
+		 * rename to create one. Today we error cleanly, in
+		 * the future we should consider calling iget to build
+		 * a new struct inode for this entry. */
+		if (!new_inode) {
+			status = -EACCES;
+
+			trace_ocfs2_rename_target_exists(new_dentry->d_name.len,
+						new_dentry->d_name.name);
+			goto bail;
+		}
+
+		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
+			status = -EACCES;
+
+			trace_ocfs2_rename_disagree(
+			     (unsigned long long)OCFS2_I(new_inode)->ip_blkno,
+			     (unsigned long long)newfe_blkno,
+			     OCFS2_I(new_inode)->ip_flags);
+			goto bail;
+		}
+
+		status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+		new_child_locked = 1;
+
+		status = ocfs2_remote_dentry_delete(new_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
+
+		trace_ocfs2_rename_over_existing(
+		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
+		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+
+		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
+			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+						OCFS2_I(new_inode)->ip_blkno,
+						orphan_name, &orphan_insert,
+						false);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			should_add_orphan = true;
+		}
+	} else {
+		BUG_ON(d_inode(new_dentry->d_parent) != new_dir);
+
+		status = ocfs2_check_dir_for_entry(new_dir,
+						   new_dentry->d_name.name,
+						   new_dentry->d_name.len);
+		if (status)
+			goto bail;
+
+		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
+						      new_dentry->d_name.name,
+						      new_dentry->d_name.len,
+						      &target_insert);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (target_exists) {
+		if (S_ISDIR(new_inode->i_mode)) {
+			if (new_inode->i_nlink != 2 ||
+			    !ocfs2_empty_dir(new_inode)) {
+				status = -ENOTEMPTY;
+				goto bail;
+			}
+		}
+		status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode),
+						 newfe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* change the dirent to point to the correct inode */
+		status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
+					    old_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		new_dir->i_version++;
+
+		if (S_ISDIR(new_inode->i_mode))
+			ocfs2_set_links_count(newfe, 0);
+		else
+			ocfs2_add_links_count(newfe, -1);
+		ocfs2_journal_dirty(handle, newfe_bh);
+		if (should_add_orphan) {
+			status = ocfs2_orphan_add(osb, handle, new_inode,
+					newfe_bh, orphan_name,
+					&orphan_insert, orphan_dir, false);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+	} else {
+		/* if the name was not found in new_dir, add it now */
+		status = ocfs2_add_entry(handle, new_dentry, old_inode,
+					 OCFS2_I(old_inode)->ip_blkno,
+					 new_dir_bh, &target_insert);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(old_inode);
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
+					 old_inode_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status >= 0) {
+		old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
+
+		old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
+		old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
+		ocfs2_journal_dirty(handle, old_inode_bh);
+	} else
+		mlog_errno(status);
+
+	/*
+	 * Now that the name has been added to new_dir, remove the old name.
+	 *
+	 * We don't keep any directory entry context around until now
+	 * because the insert might have changed the type of directory
+	 * we're dealing with.
+	 */
+	status = ocfs2_find_entry(old_dentry->d_name.name,
+				  old_dentry->d_name.len, old_dir,
+				  &old_entry_lookup);
+	if (status)
+		goto bail;
+
+	status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		new_inode->i_ctime = CURRENT_TIME;
+	}
+	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+
+	if (update_dot_dot) {
+		status = ocfs2_update_entry(old_inode, handle,
+					    &old_inode_dot_dot_res, new_dir);
+		drop_nlink(old_dir);
+		if (new_inode) {
+			drop_nlink(new_inode);
+		} else {
+			inc_nlink(new_dir);
+			mark_inode_dirty(new_dir);
+		}
+	}
+	mark_inode_dirty(old_dir);
+	ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh);
+	if (new_inode) {
+		mark_inode_dirty(new_inode);
+		ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh);
+	}
+
+	if (old_dir != new_dir) {
+		/* Keep the same times on both directories.*/
+		new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime;
+
+		/*
+		 * This will also pick up the i_nlink change from the
+		 * block above.
+		 */
+		ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh);
+	}
+
+	if (old_dir_nlink != old_dir->i_nlink) {
+		if (!old_dir_bh) {
+			mlog(ML_ERROR, "need to change nlink for old dir "
+			     "%llu from %d to %d but bh is NULL!\n",
+			     (unsigned long long)OCFS2_I(old_dir)->ip_blkno,
+			     (int)old_dir_nlink, old_dir->i_nlink);
+		} else {
+			struct ocfs2_dinode *fe;
+			status = ocfs2_journal_access_di(handle,
+							 INODE_CACHE(old_dir),
+							 old_dir_bh,
+							 OCFS2_JOURNAL_ACCESS_WRITE);
+			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
+			ocfs2_set_links_count(fe, old_dir->i_nlink);
+			ocfs2_journal_dirty(handle, old_dir_bh);
+		}
+	}
+	ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
+	status = 0;
+bail:
+	if (rename_lock)
+		ocfs2_rename_unlock(osb);
+
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	if (parents_locked)
+		ocfs2_double_unlock(old_dir, new_dir);
+
+	if (old_child_locked)
+		ocfs2_inode_unlock(old_inode, 1);
+
+	if (new_child_locked)
+		ocfs2_inode_unlock(new_inode, 1);
+
+	if (orphan_dir) {
+		/* This was locked for us in ocfs2_prepare_orphan_dir() */
+		ocfs2_inode_unlock(orphan_dir, 1);
+		mutex_unlock(&orphan_dir->i_mutex);
+		iput(orphan_dir);
+	}
+
+	if (new_inode)
+		sync_mapping_buffers(old_inode->i_mapping);
+
+	if (new_inode)
+		iput(new_inode);
+
+	ocfs2_free_dir_lookup_result(&target_lookup_res);
+	ocfs2_free_dir_lookup_result(&old_entry_lookup);
+	ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+	ocfs2_free_dir_lookup_result(&target_insert);
+
+	brelse(newfe_bh);
+	brelse(old_inode_bh);
+	brelse(old_dir_bh);
+	brelse(new_dir_bh);
+
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * we expect i_size = strlen(symname). Copy symname into the file
+ * data, including the null terminator.
+ */
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     handle_t *handle,
+				     struct inode *inode,
+				     const char *symname)
+{
+	struct buffer_head **bhs = NULL;
+	const char *c;
+	struct super_block *sb = osb->sb;
+	u64 p_blkno, p_blocks;
+	int virtual, blocks, status, i, bytes_left;
+
+	bytes_left = i_size_read(inode) + 1;
+	/* we can't trust i_blocks because we're actually going to
+	 * write i_size + 1 bytes. */
+	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks,
+					i_size_read(inode), blocks);
+
+	/* Sanity check -- make sure we're going to fit. */
+	if (bytes_left >
+	    ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
+					     NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* links can never be larger than one cluster so we know this
+	 * is all going to be contiguous, but do a sanity check
+	 * anyway. */
+	if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	virtual = 0;
+	while(bytes_left > 0) {
+		c = &symname[virtual * sb->s_blocksize];
+
+		bhs[virtual] = sb_getblk(sb, p_blkno);
+		if (!bhs[virtual]) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+		ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode),
+					      bhs[virtual]);
+
+		status = ocfs2_journal_access(handle, INODE_CACHE(inode),
+					      bhs[virtual],
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
+
+		memcpy(bhs[virtual]->b_data, c,
+		       (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
+		       bytes_left);
+
+		ocfs2_journal_dirty(handle, bhs[virtual]);
+
+		virtual++;
+		p_blkno++;
+		bytes_left -= sb->s_blocksize;
+	}
+
+	status = 0;
+bail:
+
+	if (bhs) {
+		for(i = 0; i < blocks; i++)
+			brelse(bhs[i]);
+		kfree(bhs);
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int ocfs2_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	int status, l, credits;
+	u64 newsize;
+	struct ocfs2_super *osb = NULL;
+	struct inode *inode = NULL;
+	struct super_block *sb;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_dinode *dirfe;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *xattr_ac = NULL;
+	int want_clusters = 0;
+	int xattr_credits = 0;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
+	int did_quota = 0, did_quota_inode = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	sigset_t oldset;
+	int did_block_signals = 0;
+	struct ocfs2_dentry_lock *dl = NULL;
+
+	trace_ocfs2_symlink_begin(dir, dentry, symname,
+				  dentry->d_name.len, dentry->d_name.name);
+
+	dquot_initialize(dir);
+
+	sb = dir->i_sb;
+	osb = OCFS2_SB(sb);
+
+	l = strlen(symname) + 1;
+
+	credits = ocfs2_calc_symlink_credits(sb);
+
+	/* lock the parent directory */
+	status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!ocfs2_read_links_count(dirfe)) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto bail;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto bail;
+
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_reserve_new_inode(osb, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* get security xattr */
+	status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			si.enable = 0;
+		else {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* calculate meta data/clusters for setting security xattr */
+	if (si.enable) {
+		status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+						  &xattr_credits, &xattr_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* don't reserve bitmap space for fast symlinks. */
+	if (l > ocfs2_fast_symlink_chars(sb))
+		want_clusters += 1;
+
+	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, credits + xattr_credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Starting to change things, restart is no longer possible. */
+	ocfs2_block_signals(&oldset);
+	did_block_signals = 1;
+
+	status = dquot_alloc_inode(inode);
+	if (status)
+		goto bail;
+	did_quota_inode = 1;
+
+	trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len,
+				   dentry->d_name.name,
+				   (unsigned long long)OCFS2_I(dir)->ip_blkno,
+				   inode->i_mode);
+
+	status = ocfs2_mknod_locked(osb, dir, inode,
+				    0, &new_fe_bh, parent_fe_bh, handle,
+				    inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
+	inode->i_rdev = 0;
+	newsize = l - 1;
+	inode->i_op = &ocfs2_symlink_inode_operations;
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		u32 offset = 0;
+
+		status = dquot_alloc_space_nodirty(inode,
+		    ocfs2_clusters_to_bytes(osb->sb, 1));
+		if (status)
+			goto bail;
+		did_quota = 1;
+		inode->i_mapping->a_ops = &ocfs2_aops;
+		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
+					      new_fe_bh,
+					      handle, data_ac, NULL,
+					      NULL);
+		if (status < 0) {
+			if (status != -ENOSPC && status != -EINTR) {
+				mlog(ML_ERROR,
+				     "Failed to extend file to %llu\n",
+				     (unsigned long long)newsize);
+				mlog_errno(status);
+				status = -ENOSPC;
+			}
+			goto bail;
+		}
+		i_size_write(inode, newsize);
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+	} else {
+		inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
+		memcpy((char *) fe->id2.i_symlink, symname, l);
+		i_size_write(inode, newsize);
+		inode->i_blocks = 0;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_fast_symlink(inode)) {
+		status = ocfs2_create_symlink_data(osb, handle, inode,
+						   symname);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	if (si.enable) {
+		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+						 xattr_ac, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/*
+	 * Do this before adding the entry to the directory. We add
+	 * also set d_op after success so that ->d_iput() will cleanup
+	 * the dentry lock even if ocfs2_add_entry() fails below.
+	 */
+	status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	dl = dentry->d_fsdata;
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
+				 &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+bail:
+	if (status < 0 && did_quota)
+		dquot_free_space_nodirty(inode,
+					ocfs2_clusters_to_bytes(osb->sb, 1));
+	if (status < 0 && did_quota_inode)
+		dquot_free_inode(inode);
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	ocfs2_inode_unlock(dir, 1);
+	if (did_block_signals)
+		ocfs2_unblock_signals(&oldset);
+
+	brelse(new_fe_bh);
+	brelse(parent_fe_bh);
+	kfree(si.value);
+	ocfs2_free_dir_lookup_result(&lookup);
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (xattr_ac)
+		ocfs2_free_alloc_context(xattr_ac);
+	if ((status < 0) && inode) {
+		if (dl)
+			ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
+		clear_nlink(inode);
+		iput(inode);
+	}
+
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+static int ocfs2_blkno_stringify(u64 blkno, char *name)
+{
+	int status, namelen;
+
+	namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
+			   (long long)blkno);
+	if (namelen <= 0) {
+		if (namelen)
+			status = namelen;
+		else
+			status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	if (namelen != OCFS2_ORPHAN_NAMELEN) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_blkno_stringify(blkno, name, namelen);
+
+	status = 0;
+bail:
+	if (status < 0)
+		mlog_errno(status);
+	return status;
+}
+
+static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
+					struct inode **ret_orphan_dir,
+					struct buffer_head **ret_orphan_dir_bh)
+{
+	struct inode *orphan_dir_inode;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int ret = 0;
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		ret = -ENOENT;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&orphan_dir_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+	if (ret < 0) {
+		mutex_unlock(&orphan_dir_inode->i_mutex);
+		iput(orphan_dir_inode);
+
+		mlog_errno(ret);
+		return ret;
+	}
+
+	*ret_orphan_dir = orphan_dir_inode;
+	*ret_orphan_dir_bh = orphan_dir_bh;
+
+	return 0;
+}
+
+static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
+				      struct buffer_head *orphan_dir_bh,
+				      u64 blkno,
+				      char *name,
+				      struct ocfs2_dir_lookup_result *lookup,
+				      bool dio)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+	int namelen = dio ?
+			(OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+			OCFS2_ORPHAN_NAMELEN;
+
+	if (dio) {
+		ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+				OCFS2_DIO_ORPHAN_PREFIX);
+		if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+			ret = -EINVAL;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_blkno_stringify(blkno,
+				name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+	} else
+		ret = ocfs2_blkno_stringify(blkno, name);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+					   orphan_dir_bh, name,
+					   namelen, lookup);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
+ * insertion of an orphan.
+ * @osb: ocfs2 file system
+ * @ret_orphan_dir: Orphan dir inode - returned locked!
+ * @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @lookup: dir lookup result, to be passed back into functions like
+ *          ocfs2_orphan_add
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure. 
+ */
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct inode **ret_orphan_dir,
+				    u64 blkno,
+				    char *name,
+				    struct ocfs2_dir_lookup_result *lookup,
+				    bool dio)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int ret = 0;
+
+	ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
+					   &orphan_dir_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
+					 blkno, name, lookup, dio);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_orphan_dir = orphan_dir_inode;
+
+out:
+	brelse(orphan_dir_bh);
+
+	if (ret) {
+		ocfs2_inode_unlock(orphan_dir_inode, 1);
+		mutex_unlock(&orphan_dir_inode->i_mutex);
+		iput(orphan_dir_inode);
+	}
+
+	if (ret)
+		mlog_errno(ret);
+	return ret;
+}
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    handle_t *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    char *name,
+			    struct ocfs2_dir_lookup_result *lookup,
+			    struct inode *orphan_dir_inode,
+			    bool dio)
+{
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+	struct ocfs2_dinode *orphan_fe;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	int namelen = dio ?
+			(OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+			OCFS2_ORPHAN_NAMELEN;
+
+	trace_ocfs2_orphan_add_begin(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(orphan_dir_inode),
+					 orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/*
+	 * We're going to journal the change of i_flags and i_orphaned_slot.
+	 * It's safe anyway, though some callers may duplicate the journaling.
+	 * Journaling within the func just make the logic look more
+	 * straightforward.
+	 */
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(inode),
+					 fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* we're a cluster, and nlink can change on disk from
+	 * underneath us... */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		ocfs2_add_links_count(orphan_fe, 1);
+	set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+	ocfs2_journal_dirty(handle, orphan_dir_bh);
+
+	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
+				   namelen, inode,
+				   OCFS2_I(inode)->ip_blkno,
+				   orphan_dir_bh, lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto rollback;
+	}
+
+	if (dio) {
+		/* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
+		 * slot.
+		 */
+		fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+		fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
+	} else {
+		fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+		OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
+
+		/* Record which orphan dir our inode now resides
+		 * in. delete_inode will use this to determine which orphan
+		 * dir to lock. */
+		fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+	}
+
+	ocfs2_journal_dirty(handle, fe_bh);
+
+	trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				   osb->slot_num);
+
+rollback:
+	if (status < 0) {
+		if (S_ISDIR(inode->i_mode))
+			ocfs2_add_links_count(orphan_fe, -1);
+		set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+	}
+
+leave:
+	brelse(orphan_dir_bh);
+
+	return status;
+}
+
+/* unlike orphan_add, we expect the orphan dir to already be locked here. */
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     handle_t *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh,
+		     bool dio)
+{
+	const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
+	char name[namelen + 1];
+	struct ocfs2_dinode *orphan_fe;
+	int status = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+	if (dio) {
+		status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+				OCFS2_DIO_ORPHAN_PREFIX);
+		if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+			status = -EINVAL;
+			mlog_errno(status);
+			return status;
+		}
+
+		status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
+				name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+	} else
+		status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	trace_ocfs2_orphan_del(
+	     (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
+	     name, strlen(name));
+
+	/* find it's spot in the orphan directory */
+	status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
+				  &lookup);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* remove it from the orphan directory */
+	status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(orphan_dir_inode),
+					 orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the i_nlink dance! :) */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		ocfs2_add_links_count(orphan_fe, -1);
+	set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+	ocfs2_journal_dirty(handle, orphan_dir_bh);
+
+leave:
+	ocfs2_free_dir_lookup_result(&lookup);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/**
+ * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
+ * allocated file. This is different from the typical 'add to orphan dir'
+ * operation in that the inode does not yet exist. This is a problem because
+ * the orphan dir stringifies the inode block number to come up with it's
+ * dirent. Obviously if the inode does not yet exist we have a chicken and egg
+ * problem. This function works around it by calling deeper into the orphan
+ * and suballoc code than other callers. Use this only by necessity.
+ * @dir: The directory which this inode will ultimately wind up under - not the
+ * orphan dir!
+ * @dir_bh: buffer_head the @dir inode block
+ * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled
+ * with the string to be used for orphan dirent. Pass back to the orphan dir
+ * code.
+ * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
+ * dir code.
+ * @ret_di_blkno: block number where the new inode will be allocated.
+ * @orphan_insert: Dir insert context to be passed back into orphan dir code.
+ * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure. 
+ */
+static int ocfs2_prep_new_orphaned_file(struct inode *dir,
+					struct buffer_head *dir_bh,
+					char *orphan_name,
+					struct inode **ret_orphan_dir,
+					u64 *ret_di_blkno,
+					struct ocfs2_dir_lookup_result *orphan_insert,
+					struct ocfs2_alloc_context **ret_inode_ac)
+{
+	int ret;
+	u64 di_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct inode *orphan_dir = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+
+	ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* reserve an inode spot */
+	ret = ocfs2_reserve_new_inode(osb, &inode_ac);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
+				       &di_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
+					 di_blkno, orphan_name, orphan_insert,
+					 false);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ret == 0) {
+		*ret_orphan_dir = orphan_dir;
+		*ret_di_blkno = di_blkno;
+		*ret_inode_ac = inode_ac;
+		/*
+		 * orphan_name and orphan_insert are already up to
+		 * date via prepare_orphan_dir
+		 */
+	} else {
+		/* Unroll reserve_new_inode* */
+		if (inode_ac)
+			ocfs2_free_alloc_context(inode_ac);
+
+		/* Unroll orphan dir locking */
+		mutex_unlock(&orphan_dir->i_mutex);
+		ocfs2_inode_unlock(orphan_dir, 1);
+		iput(orphan_dir);
+	}
+
+	brelse(orphan_dir_bh);
+
+	return ret;
+}
+
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 int mode,
+				 struct inode **new_inode)
+{
+	int status, did_quota_inode = 0;
+	struct inode *inode = NULL;
+	struct inode *orphan_dir = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *di = NULL;
+	handle_t *handle = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *parent_di_bh = NULL;
+	struct buffer_head *new_di_bh = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	u64 uninitialized_var(di_blkno), suballoc_loc;
+	u16 suballoc_bit;
+
+	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
+					      orphan_name, &orphan_dir,
+					      &di_blkno, &orphan_insert, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	inode = ocfs2_get_init_inode(dir, mode);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = dquot_alloc_inode(inode);
+	if (status)
+		goto leave;
+	did_quota_inode = 1;
+
+	status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
+					      &suballoc_loc,
+					      &suballoc_bit, di_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clear_nlink(inode);
+	/* do the real work now. */
+	status = __ocfs2_mknod_locked(dir, inode,
+				      0, &new_di_bh, parent_di_bh, handle,
+				      inode_ac, di_blkno, suballoc_loc,
+				      suballoc_bit);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	di = (struct ocfs2_dinode *)new_di_bh->b_data;
+	status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
+				  &orphan_insert, orphan_dir, false);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* get open lock so that only nodes can't remove it from orphan dir. */
+	status = ocfs2_open_lock(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	insert_inode_hash(inode);
+leave:
+	if (status < 0 && did_quota_inode)
+		dquot_free_inode(inode);
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	if (orphan_dir) {
+		/* This was locked for us in ocfs2_prepare_orphan_dir() */
+		ocfs2_inode_unlock(orphan_dir, 1);
+		mutex_unlock(&orphan_dir->i_mutex);
+		iput(orphan_dir);
+	}
+
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
+		iput(inode);
+	}
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	brelse(new_di_bh);
+
+	if (!status)
+		*new_inode = inode;
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+
+	ocfs2_inode_unlock(dir, 1);
+	brelse(parent_di_bh);
+	return status;
+}
+
+static int ocfs2_dio_orphan_recovered(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di = NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return 0;
+	}
+
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+
+	return ret;
+}
+
+#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+	struct inode *inode)
+{
+	char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
+	struct inode *orphan_dir_inode = NULL;
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	struct buffer_head *di_bh = NULL;
+	int status = 0;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *di = NULL;
+
+restart:
+	status = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	/*
+	 * Another append dio crashed?
+	 * If so, wait for recovery first.
+	 */
+	if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+		ocfs2_inode_unlock(inode, 1);
+		brelse(di_bh);
+		wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
+				ocfs2_dio_orphan_recovered(inode),
+				msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
+		goto restart;
+	}
+
+	status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
+			OCFS2_I(inode)->ip_blkno,
+			orphan_name,
+			&orphan_insert,
+			true);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	handle = ocfs2_start_trans(osb,
+			OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		goto bail_unlock_orphan;
+	}
+
+	status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
+			&orphan_insert, orphan_dir_inode, true);
+	if (status)
+		mlog_errno(status);
+
+	ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+	ocfs2_inode_unlock(orphan_dir_inode, 1);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
+	iput(orphan_dir_inode);
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+
+bail_unlock_inode:
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+
+bail:
+	return status;
+}
+
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+		struct inode *inode, int update_isize,
+		loff_t end)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di = NULL;
+	handle_t *handle = NULL;
+	int status = 0;
+
+	status = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+			ORPHAN_DIR_SYSTEM_INODE,
+			le16_to_cpu(di->i_dio_orphaned_slot));
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	mutex_lock(&orphan_dir_inode->i_mutex);
+	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+	if (status < 0) {
+		mutex_unlock(&orphan_dir_inode->i_mutex);
+		iput(orphan_dir_inode);
+		mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	handle = ocfs2_start_trans(osb,
+			OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		goto bail_unlock_orphan;
+	}
+
+	BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode,
+				inode, orphan_dir_bh, true);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	status = ocfs2_journal_access_di(handle,
+			INODE_CACHE(inode),
+			di_bh,
+			OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+	di->i_dio_orphaned_slot = 0;
+
+	if (update_isize) {
+		status = ocfs2_set_inode_size(handle, inode, di_bh, end);
+		if (status)
+			mlog_errno(status);
+	} else
+		ocfs2_journal_dirty(handle, di_bh);
+
+bail_commit:
+	ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+	ocfs2_inode_unlock(orphan_dir_inode, 1);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
+	brelse(orphan_dir_bh);
+	iput(orphan_dir_inode);
+
+bail_unlock_inode:
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+
+bail:
+	return status;
+}
+
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct inode *inode,
+				   struct dentry *dentry)
+{
+	int status = 0;
+	struct buffer_head *parent_di_bh = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *dir_di, *di;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+	trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry,
+				dentry->d_name.len, dentry->d_name.name,
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+	if (!dir_di->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	mutex_lock(&orphan_dir_inode->i_mutex);
+
+	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		mutex_unlock(&orphan_dir_inode->i_mutex);
+		iput(orphan_dir_inode);
+		goto leave;
+	}
+
+	status = ocfs2_read_inode_block(inode, &di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto orphan_unlock;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto orphan_unlock;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+					 di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+				  orphan_dir_bh, false);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
+	di->i_orphaned_slot = 0;
+	set_nlink(inode, 1);
+	ocfs2_set_links_count(di, inode->i_nlink);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+	ocfs2_journal_dirty(handle, di_bh);
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
+				 &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	status = ocfs2_dentry_attach_lock(dentry, inode,
+					  OCFS2_I(dir)->ip_blkno);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	d_instantiate(dentry, inode);
+	status = 0;
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+orphan_unlock:
+	ocfs2_inode_unlock(orphan_dir_inode, 1);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
+	iput(orphan_dir_inode);
+leave:
+
+	ocfs2_inode_unlock(dir, 1);
+
+	brelse(di_bh);
+	brelse(parent_di_bh);
+	brelse(orphan_dir_bh);
+
+	ocfs2_free_dir_lookup_result(&lookup);
+
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+const struct inode_operations ocfs2_dir_iops = {
+	.create		= ocfs2_create,
+	.lookup		= ocfs2_lookup,
+	.link		= ocfs2_link,
+	.unlink		= ocfs2_unlink,
+	.rmdir		= ocfs2_unlink,
+	.symlink	= ocfs2_symlink,
+	.mkdir		= ocfs2_mkdir,
+	.mknod		= ocfs2_mknod,
+	.rename		= ocfs2_rename,
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
+	.fiemap         = ocfs2_fiemap,
+	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
+};
diff --git a/kernel/fs/ocfs2/namei.h b/kernel/fs/ocfs2/namei.h
new file mode 100644
index 000000000..5ddecce17
--- /dev/null
+++ b/kernel/fs/ocfs2/namei.h
@@ -0,0 +1,51 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_NAMEI_H
+#define OCFS2_NAMEI_H
+
+extern const struct inode_operations ocfs2_dir_iops;
+
+struct dentry *ocfs2_get_parent(struct dentry *child);
+
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     handle_t *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh,
+		     bool dio);
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 int mode,
+				 struct inode **new_inode);
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+		struct inode *inode);
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+		struct inode *inode, int update_isize,
+		loff_t end);
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct inode *new_inode,
+				   struct dentry *new_dentry);
+
+#endif /* OCFS2_NAMEI_H */
diff --git a/kernel/fs/ocfs2/ocfs1_fs_compat.h b/kernel/fs/ocfs2/ocfs1_fs_compat.h
new file mode 100644
index 000000000..dfb313bda
--- /dev/null
+++ b/kernel/fs/ocfs2/ocfs1_fs_compat.h
@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs1_fs_compat.h
+ *
+ * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
+ * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
+ * This allows an OCFS1 volume to see the partition and cleanly fail to
+ * mount it.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS1_FS_COMPAT_H
+#define _OCFS1_FS_COMPAT_H
+
+#define OCFS1_MAX_VOL_SIGNATURE_LEN          128
+#define OCFS1_MAX_MOUNT_POINT_LEN            128
+#define OCFS1_MAX_VOL_ID_LENGTH               16
+#define OCFS1_MAX_VOL_LABEL_LEN               64
+#define OCFS1_MAX_CLUSTER_NAME_LEN            64
+
+#define OCFS1_MAJOR_VERSION              (2)
+#define OCFS1_MINOR_VERSION              (0)
+#define OCFS1_VOLUME_SIGNATURE		 "OracleCFS"
+
+/*
+ * OCFS1 superblock.  Lives at sector 0.
+ */
+struct ocfs1_vol_disk_hdr
+{
+/*00*/	__u32 minor_version;
+	__u32 major_version;
+/*08*/	__u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
+/*88*/	__u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
+/*108*/	__u64 serial_num;
+/*110*/	__u64 device_size;
+	__u64 start_off;
+/*120*/	__u64 bitmap_off;
+	__u64 publ_off;
+/*130*/	__u64 vote_off;
+	__u64 root_bitmap_off;
+/*140*/	__u64 data_start_off;
+	__u64 root_bitmap_size;
+/*150*/	__u64 root_off;
+	__u64 root_size;
+/*160*/	__u64 cluster_size;
+	__u64 num_nodes;
+/*170*/	__u64 num_clusters;
+	__u64 dir_node_size;
+/*180*/	__u64 file_node_size;
+	__u64 internal_off;
+/*190*/	__u64 node_cfg_off;
+	__u64 node_cfg_size;
+/*1A0*/	__u64 new_cfg_off;
+	__u32 prot_bits;
+	__s32 excl_mount;
+/*1B0*/
+};
+
+
+struct ocfs1_disk_lock
+{
+/*00*/	__u32 curr_master;
+	__u8 file_lock;
+	__u8 compat_pad[3];  /* Not in original definition.  Used to
+				make the already existing alignment
+				explicit */
+	__u64 last_write_time;
+/*10*/	__u64 last_read_time;
+	__u32 writer_node_num;
+	__u32 reader_node_num;
+/*20*/	__u64 oin_node_map;
+	__u64 dlock_seq_num;
+/*30*/
+};
+
+/*
+ * OCFS1 volume label.  Lives at sector 1.
+ */
+struct ocfs1_vol_label
+{
+/*00*/	struct ocfs1_disk_lock disk_lock;
+/*30*/	__u8 label[OCFS1_MAX_VOL_LABEL_LEN];
+/*70*/	__u16 label_len;
+/*72*/	__u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
+/*82*/	__u16 vol_id_len;
+/*84*/	__u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
+/*A4*/	__u16 cluster_name_len;
+/*A6*/
+};
+
+
+#endif /* _OCFS1_FS_COMPAT_H */
+
diff --git a/kernel/fs/ocfs2/ocfs2.h b/kernel/fs/ocfs2/ocfs2.h
new file mode 100644
index 000000000..460c6c37e
--- /dev/null
+++ b/kernel/fs/ocfs2/ocfs2.h
@@ -0,0 +1,919 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2.h
+ *
+ * Defines macros and structures used in OCFS2
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_H
+#define OCFS2_H
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/llist.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/lockdep.h>
+#include <linux/jbd2.h>
+
+/* For union ocfs2_dlm_lksb */
+#include "stackglue.h"
+
+#include "ocfs2_fs.h"
+#include "ocfs2_lockid.h"
+#include "ocfs2_ioctl.h"
+
+/* For struct ocfs2_blockcheck_stats */
+#include "blockcheck.h"
+
+#include "reservations.h"
+
+/* Caching of metadata buffers */
+
+/* Most user visible OCFS2 inodes will have very few pieces of
+ * metadata, but larger files (including bitmaps, etc) must be taken
+ * into account when designing an access scheme. We allow a small
+ * amount of inlined blocks to be stored on an array and grow the
+ * structure into a rb tree when necessary. */
+#define OCFS2_CACHE_INFO_MAX_ARRAY 2
+
+/* Flags for ocfs2_caching_info */
+
+enum ocfs2_caching_info_flags {
+	/* Indicates that the metadata cache is using the inline array */
+	OCFS2_CACHE_FL_INLINE	= 1<<1,
+};
+
+struct ocfs2_caching_operations;
+struct ocfs2_caching_info {
+	/*
+	 * The parent structure provides the locks, but because the
+	 * parent structure can differ, it provides locking operations
+	 * to struct ocfs2_caching_info.
+	 */
+	const struct ocfs2_caching_operations *ci_ops;
+
+	/* next two are protected by trans_inc_lock */
+	/* which transaction were we created on? Zero if none. */
+	unsigned long		ci_created_trans;
+	/* last transaction we were a part of. */
+	unsigned long		ci_last_trans;
+
+	/* Cache structures */
+	unsigned int		ci_flags;
+	unsigned int		ci_num_cached;
+	union {
+	sector_t	ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
+		struct rb_root	ci_tree;
+	} ci_cache;
+};
+/*
+ * Need this prototype here instead of in uptodate.h because journal.h
+ * uses it.
+ */
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci);
+
+/* this limits us to 256 nodes
+ * if we need more, we can do a kmalloc for the map */
+#define OCFS2_NODE_MAP_MAX_NODES    256
+struct ocfs2_node_map {
+	u16 num_nodes;
+	unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
+};
+
+enum ocfs2_ast_action {
+	OCFS2_AST_INVALID = 0,
+	OCFS2_AST_ATTACH,
+	OCFS2_AST_CONVERT,
+	OCFS2_AST_DOWNCONVERT,
+};
+
+/* actions for an unlockast function to take. */
+enum ocfs2_unlock_action {
+	OCFS2_UNLOCK_INVALID = 0,
+	OCFS2_UNLOCK_CANCEL_CONVERT,
+	OCFS2_UNLOCK_DROP_LOCK,
+};
+
+/* ocfs2_lock_res->l_flags flags. */
+#define OCFS2_LOCK_ATTACHED      (0x00000001) /* we have initialized
+					       * the lvb */
+#define OCFS2_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define OCFS2_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					       * downconvert*/
+#define OCFS2_LOCK_LOCAL         (0x00000008) /* newly created inode */
+#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
+#define OCFS2_LOCK_REFRESHING    (0x00000020)
+#define OCFS2_LOCK_INITIALIZED   (0x00000040) /* track initialization
+					       * for shutdown paths */
+#define OCFS2_LOCK_FREEING       (0x00000080) /* help dlmglue track
+					       * when to skip queueing
+					       * a lock because it's
+					       * about to be
+					       * dropped. */
+#define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+#define OCFS2_LOCK_NOCACHE       (0x00000200) /* don't use a holder count */
+#define OCFS2_LOCK_PENDING       (0x00000400) /* This lockres is pending a
+						 call to dlm_lock.  Only
+						 exists with BUSY set. */
+#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
+						     * from downconverting
+						     * before the upconvert
+						     * has completed */
+
+#define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster
+						   * lock has already
+						   * returned, do not block
+						   * dc thread from
+						   * downconverting */
+
+struct ocfs2_lock_res_ops;
+
+typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+
+#ifdef CONFIG_OCFS2_FS_STATS
+struct ocfs2_lock_stats {
+	u64		ls_total;	/* Total wait in NSEC */
+	u32		ls_gets;	/* Num acquires */
+	u32		ls_fail;	/* Num failed acquires */
+
+	/* Storing max wait in usecs saves 24 bytes per inode */
+	u32		ls_max;		/* Max wait in USEC */
+};
+#endif
+
+struct ocfs2_lock_res {
+	void                    *l_priv;
+	struct ocfs2_lock_res_ops *l_ops;
+
+
+	struct list_head         l_blocked_list;
+	struct list_head         l_mask_waiters;
+
+	unsigned long		 l_flags;
+	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	signed char		 l_level;
+	signed char		 l_requested;
+	signed char		 l_blocking;
+
+	/* Data packed - type enum ocfs2_lock_type */
+	unsigned char            l_type;
+
+	/* used from AST/BAST funcs. */
+	/* Data packed - enum type ocfs2_ast_action */
+	unsigned char            l_action;
+	/* Data packed - enum type ocfs2_unlock_action */
+	unsigned char            l_unlock_action;
+	unsigned int             l_pending_gen;
+
+	spinlock_t               l_lock;
+
+	struct ocfs2_dlm_lksb    l_lksb;
+
+	wait_queue_head_t        l_event;
+
+	struct list_head         l_debug_list;
+
+#ifdef CONFIG_OCFS2_FS_STATS
+	struct ocfs2_lock_stats  l_lock_prmode;		/* PR mode stats */
+	u32                      l_lock_refresh;	/* Disk refreshes */
+	struct ocfs2_lock_stats  l_lock_exmode;		/* EX mode stats */
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	 l_lockdep_map;
+#endif
+};
+
+enum ocfs2_orphan_reco_type {
+	ORPHAN_NO_NEED_TRUNCATE = 0,
+	ORPHAN_NEED_TRUNCATE,
+};
+
+enum ocfs2_orphan_scan_state {
+	ORPHAN_SCAN_ACTIVE,
+	ORPHAN_SCAN_INACTIVE
+};
+
+struct ocfs2_orphan_scan {
+	struct mutex 		os_lock;
+	struct ocfs2_super 	*os_osb;
+	struct ocfs2_lock_res 	os_lockres;     /* lock to synchronize scans */
+	struct delayed_work 	os_orphan_scan_work;
+	struct timespec		os_scantime;  /* time this node ran the scan */
+	u32			os_count;      /* tracks node specific scans */
+	u32  			os_seqno;       /* tracks cluster wide scans */
+	atomic_t		os_state;              /* ACTIVE or INACTIVE */
+};
+
+struct ocfs2_dlm_debug {
+	struct kref d_refcnt;
+	struct dentry *d_locking_state;
+	struct list_head d_lockres_tracking;
+};
+
+enum ocfs2_vol_state
+{
+	VOLUME_INIT = 0,
+	VOLUME_MOUNTED,
+	VOLUME_MOUNTED_QUOTAS,
+	VOLUME_DISMOUNTED,
+	VOLUME_DISABLED
+};
+
+struct ocfs2_alloc_stats
+{
+	atomic_t moves;
+	atomic_t local_data;
+	atomic_t bitmap_data;
+	atomic_t bg_allocs;
+	atomic_t bg_extends;
+};
+
+enum ocfs2_local_alloc_state
+{
+	OCFS2_LA_UNUSED = 0,	/* Local alloc will never be used for
+				 * this mountpoint. */
+	OCFS2_LA_ENABLED,	/* Local alloc is in use. */
+	OCFS2_LA_THROTTLED,	/* Local alloc is in use, but number
+				 * of bits has been reduced. */
+	OCFS2_LA_DISABLED	/* Local alloc has temporarily been
+				 * disabled. */
+};
+
+enum ocfs2_mount_options
+{
+	OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
+	OCFS2_MOUNT_BARRIER = 1 << 1,	/* Use block barriers */
+	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
+	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
+	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
+	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
+	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
+	OCFS2_MOUNT_POSIX_ACL = 1 << 8,	/* Force POSIX access control lists */
+	OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9,	/* Disable POSIX access
+						   control lists */
+	OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
+	OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+	OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+						     writes */
+	OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+	OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
+
+	OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15,  /* Journal Async Commit */
+};
+
+#define OCFS2_OSB_SOFT_RO	0x0001
+#define OCFS2_OSB_HARD_RO	0x0002
+#define OCFS2_OSB_ERROR_FS	0x0004
+#define OCFS2_DEFAULT_ATIME_QUANTUM	60
+
+struct ocfs2_journal;
+struct ocfs2_slot_info;
+struct ocfs2_recovery_map;
+struct ocfs2_replay_map;
+struct ocfs2_quota_recovery;
+struct ocfs2_super
+{
+	struct task_struct *commit_task;
+	struct super_block *sb;
+	struct inode *root_inode;
+	struct inode *sys_root_inode;
+	struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+	struct inode **local_system_inodes;
+
+	struct ocfs2_slot_info *slot_info;
+
+	u32 *slot_recovery_generations;
+
+	spinlock_t node_map_lock;
+
+	u64 root_blkno;
+	u64 system_dir_blkno;
+	u64 bitmap_blkno;
+	u32 bitmap_cpg;
+	u8 *uuid;
+	char *uuid_str;
+	u32 uuid_hash;
+	u8 *vol_label;
+	u64 first_cluster_group_blkno;
+	u32 fs_generation;
+
+	u32 s_feature_compat;
+	u32 s_feature_incompat;
+	u32 s_feature_ro_compat;
+
+	/* Protects s_next_generation, osb_flags and s_inode_steal_slot.
+	 * Could protect more on osb as it's very short lived.
+	 */
+	spinlock_t osb_lock;
+	u32 s_next_generation;
+	unsigned long osb_flags;
+	s16 s_inode_steal_slot;
+	s16 s_meta_steal_slot;
+	atomic_t s_num_inodes_stolen;
+	atomic_t s_num_meta_stolen;
+
+	unsigned long s_mount_opt;
+	unsigned int s_atime_quantum;
+
+	unsigned int max_slots;
+	unsigned int node_num;
+	int slot_num;
+	int preferred_slot;
+	int s_sectsize_bits;
+	int s_clustersize;
+	int s_clustersize_bits;
+	unsigned int s_xattr_inline_size;
+
+	atomic_t vol_state;
+	struct mutex recovery_lock;
+	struct ocfs2_recovery_map *recovery_map;
+	struct ocfs2_replay_map *replay_map;
+	struct task_struct *recovery_thread_task;
+	int disable_recovery;
+	wait_queue_head_t checkpoint_event;
+	struct ocfs2_journal *journal;
+	unsigned long osb_commit_interval;
+
+	struct delayed_work		la_enable_wq;
+
+	/*
+	 * Must hold local alloc i_mutex and osb->osb_lock to change
+	 * local_alloc_bits. Reads can be done under either lock.
+	 */
+	unsigned int local_alloc_bits;
+	unsigned int local_alloc_default_bits;
+	/* osb_clusters_at_boot can become stale! Do not trust it to
+	 * be up to date. */
+	unsigned int osb_clusters_at_boot;
+
+	enum ocfs2_local_alloc_state local_alloc_state; /* protected
+							 * by osb_lock */
+
+	struct buffer_head *local_alloc_bh;
+
+	u64 la_last_gd;
+
+	struct ocfs2_reservation_map	osb_la_resmap;
+
+	unsigned int	osb_resv_level;
+	unsigned int	osb_dir_resv_level;
+
+	/* Next three fields are for local node slot recovery during
+	 * mount. */
+	int dirty;
+	struct ocfs2_dinode *local_alloc_copy;
+	struct ocfs2_quota_recovery *quota_rec;
+
+	struct ocfs2_blockcheck_stats osb_ecc_stats;
+	struct ocfs2_alloc_stats alloc_stats;
+	char dev_str[20];		/* "major,minor" of the device */
+
+	u8 osb_stackflags;
+
+	char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+	char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
+	struct ocfs2_cluster_connection *cconn;
+	struct ocfs2_lock_res osb_super_lockres;
+	struct ocfs2_lock_res osb_rename_lockres;
+	struct ocfs2_lock_res osb_nfs_sync_lockres;
+	struct ocfs2_dlm_debug *osb_dlm_debug;
+
+	struct dentry *osb_debug_root;
+	struct dentry *osb_ctxt;
+
+	wait_queue_head_t recovery_event;
+
+	spinlock_t dc_task_lock;
+	struct task_struct *dc_task;
+	wait_queue_head_t dc_event;
+	unsigned long dc_wake_sequence;
+	unsigned long dc_work_sequence;
+
+	/*
+	 * Any thread can add locks to the list, but the downconvert
+	 * thread is the only one allowed to remove locks. Any change
+	 * to this rule requires updating
+	 * ocfs2_downconvert_thread_do_work().
+	 */
+	struct list_head blocked_lock_list;
+	unsigned long blocked_lock_count;
+
+	/* List of dquot structures to drop last reference to */
+	struct llist_head dquot_drop_list;
+	struct work_struct dquot_drop_work;
+
+	wait_queue_head_t		osb_mount_event;
+
+	/* Truncate log info */
+	struct inode			*osb_tl_inode;
+	struct buffer_head		*osb_tl_bh;
+	struct delayed_work		osb_truncate_log_wq;
+	atomic_t			osb_tl_disable;
+	/*
+	 * How many clusters in our truncate log.
+	 * It must be protected by osb_tl_inode->i_mutex.
+	 */
+	unsigned int truncated_clusters;
+
+	struct ocfs2_node_map		osb_recovering_orphan_dirs;
+	unsigned int			*osb_orphan_wipes;
+	wait_queue_head_t		osb_wipe_event;
+
+	struct ocfs2_orphan_scan	osb_orphan_scan;
+
+	/* used to protect metaecc calculation check of xattr. */
+	spinlock_t osb_xattr_lock;
+
+	unsigned int			osb_dx_mask;
+	u32				osb_dx_seed[4];
+
+	/* the group we used to allocate inodes. */
+	u64				osb_inode_alloc_group;
+
+	/* rb tree root for refcount lock. */
+	struct rb_root	osb_rf_lock_tree;
+	struct ocfs2_refcount_tree *osb_ref_tree_lru;
+
+	struct mutex system_file_mutex;
+};
+
+#define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
+
+/* Useful typedef for passing around journal access functions */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle,
+					 struct ocfs2_caching_info *ci,
+					 struct buffer_head *bh, int type);
+
+static inline int ocfs2_should_order_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
+		return 0;
+	return 1;
+}
+
+static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
+{
+	/*
+	 * Support for sparse files is a pre-requisite
+	 */
+	if (!ocfs2_sparse_alloc(osb))
+		return 0;
+
+	if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
+		return 1;
+	return 0;
+}
+
+
+static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+		return 1;
+	return 0;
+}
+
+static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
+{
+	if (ocfs2_supports_indexed_dirs(osb))
+		return OCFS2_DX_LINK_MAX;
+	return OCFS2_LINK_MAX;
+}
+
+static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
+{
+	u32 nlink = le16_to_cpu(di->i_links_count);
+	u32 hi = le16_to_cpu(di->i_links_count_hi);
+
+	if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
+		nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+
+	return nlink;
+}
+
+static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
+{
+	u16 lo, hi;
+
+	lo = nlink;
+	hi = nlink >> OCFS2_LINKS_HI_SHIFT;
+
+	di->i_links_count = cpu_to_le16(lo);
+	di->i_links_count_hi = cpu_to_le16(hi);
+}
+
+static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
+{
+	u32 links = ocfs2_read_links_count(di);
+
+	links += n;
+
+	ocfs2_set_links_count(di, links);
+}
+
+static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+		return 1;
+	return 0;
+}
+
+/* set / clear functions because cluster events can make these happen
+ * in parallel so we want the transitions to be atomic. this also
+ * means that any future flags osb_flags must be protected by spinlock
+ * too! */
+static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
+				      unsigned long flag)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags |= flag;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
+				     int hard)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
+	if (hard)
+		osb->osb_flags |= OCFS2_OSB_HARD_RO;
+	else
+		osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
+{
+	return (osb->s_feature_incompat &
+		(OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
+		 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
+}
+
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+	if (ocfs2_clusterinfo_valid(osb) &&
+	    memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+		   OCFS2_STACK_LABEL_LEN))
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
+{
+	if (ocfs2_clusterinfo_valid(osb) &&
+	    !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+		   OCFS2_STACK_LABEL_LEN))
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
+{
+	return ocfs2_o2cb_stack(osb) &&
+		(osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
+}
+
+static inline int ocfs2_mount_local(struct ocfs2_super *osb)
+{
+	return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
+}
+
+static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
+{
+	return (osb->s_feature_incompat &
+		OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
+}
+
+
+#define OCFS2_IS_VALID_DINODE(ptr)					\
+	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
+
+#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
+	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
+
+#define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
+	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
+
+
+#define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
+	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr)					\
+	(!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
+
+#define OCFS2_IS_VALID_DX_ROOT(ptr)					\
+	(!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
+
+#define OCFS2_IS_VALID_DX_LEAF(ptr)					\
+	(!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
+
+#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr)				\
+	(!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
+
+static inline unsigned long ino_from_blkno(struct super_block *sb,
+					   u64 blkno)
+{
+	return (unsigned long)(blkno & (u64)ULONG_MAX);
+}
+
+static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
+					   u32 clusters)
+{
+	int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u64)clusters << c_to_b_bits;
+}
+
+static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
+					   u64 blocks)
+{
+	int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u32)(blocks >> b_to_c_bits);
+}
+
+static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
+						    u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	bytes += OCFS2_SB(sb)->s_clustersize - 1;
+	/* OCFS2 just cannot have enough clusters to overflow this */
+	clusters = (unsigned int)(bytes >> cl_bits);
+
+	return clusters;
+}
+
+static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
+		u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	clusters = (unsigned int)(bytes >> cl_bits);
+	return clusters;
+}
+
+static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
+					 u64 bytes)
+{
+	bytes += sb->s_blocksize - 1;
+	return bytes >> sb->s_blocksize_bits;
+}
+
+static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
+					  u32 clusters)
+{
+	return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
+}
+
+static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
+					       u64 blocks)
+{
+	int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
+	unsigned int clusters;
+
+	clusters = ocfs2_blocks_to_clusters(sb, blocks);
+	return (u64)clusters << bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
+						u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	clusters = ocfs2_clusters_for_bytes(sb, bytes);
+	return (u64)clusters << cl_bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
+					      u64 bytes)
+{
+	u64 blocks;
+
+        blocks = ocfs2_blocks_for_bytes(sb, bytes);
+	return blocks << sb->s_blocksize_bits;
+}
+
+static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
+{
+	return (unsigned long)((bytes + 511) >> 9);
+}
+
+static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
+							unsigned long pg_index)
+{
+	u32 clusters = pg_index;
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+	if (unlikely(PAGE_CACHE_SHIFT > cbits))
+		clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
+	else if (PAGE_CACHE_SHIFT < cbits)
+		clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+
+	return clusters;
+}
+
+/*
+ * Find the 1st page index which covers the given clusters.
+ */
+static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb,
+							u32 clusters)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+        pgoff_t index = clusters;
+
+	if (PAGE_CACHE_SHIFT > cbits) {
+		index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits);
+	} else if (PAGE_CACHE_SHIFT < cbits) {
+		index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT);
+	}
+
+	return index;
+}
+
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int pages_per_cluster = 1;
+
+	if (PAGE_CACHE_SHIFT < cbits)
+		pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+
+	return pages_per_cluster;
+}
+
+static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
+						       unsigned int megs)
+{
+	BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
+
+	return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
+static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
+						       unsigned int clusters)
+{
+	return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
+static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
+{
+	__set_bit_le(bit, bitmap);
+}
+#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
+
+static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
+{
+	__clear_bit_le(bit, bitmap);
+}
+#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
+
+#define ocfs2_test_bit test_bit_le
+#define ocfs2_find_next_zero_bit find_next_zero_bit_le
+#define ocfs2_find_next_bit find_next_bit_le
+
+static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+	*bit += ((unsigned long) addr & 7UL) << 3;
+	addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+	*bit += ((unsigned long) addr & 3UL) << 3;
+	addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+	return addr;
+}
+
+static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
+{
+	bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+	ocfs2_set_bit(bit, bitmap);
+}
+
+static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
+{
+	bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+	ocfs2_clear_bit(bit, bitmap);
+}
+
+static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
+{
+	bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+	return ocfs2_test_bit(bit, bitmap);
+}
+
+static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
+							int start)
+{
+	int fix = 0, ret, tmpmax;
+	bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
+	tmpmax = max + fix;
+	start += fix;
+
+	ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
+	if (ret > max)
+		return max;
+	return ret;
+}
+
+#endif  /* OCFS2_H */
+
diff --git a/kernel/fs/ocfs2/ocfs2_fs.h b/kernel/fs/ocfs2/ocfs2_fs.h
new file mode 100644
index 000000000..db64ce2d4
--- /dev/null
+++ b/kernel/fs/ocfs2/ocfs2_fs.h
@@ -0,0 +1,1651 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_fs.h
+ *
+ * On-disk structures for OCFS2.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS2_FS_H
+#define _OCFS2_FS_H
+
+/* Version */
+#define OCFS2_MAJOR_REV_LEVEL		0
+#define OCFS2_MINOR_REV_LEVEL          	90
+
+/*
+ * An OCFS2 volume starts this way:
+ * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
+ * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
+ * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
+ *
+ * All other structures are found from the superblock information.
+ *
+ * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors.  eg, for a
+ * blocksize of 2K, it is 4096 bytes into disk.
+ */
+#define OCFS2_SUPER_BLOCK_BLKNO		2
+
+/*
+ * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
+ * grow if needed.
+ */
+#define OCFS2_MIN_CLUSTERSIZE		4096
+#define OCFS2_MAX_CLUSTERSIZE		1048576
+
+/*
+ * Blocks cannot be bigger than clusters, so the maximum blocksize is the
+ * minimum cluster size.
+ */
+#define OCFS2_MIN_BLOCKSIZE		512
+#define OCFS2_MAX_BLOCKSIZE		OCFS2_MIN_CLUSTERSIZE
+
+/* Filesystem magic number */
+#define OCFS2_SUPER_MAGIC		0x7461636f
+
+/* Object signatures */
+#define OCFS2_SUPER_BLOCK_SIGNATURE	"OCFSV2"
+#define OCFS2_INODE_SIGNATURE		"INODE01"
+#define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
+#define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE	"DIRTRL1"
+#define OCFS2_DX_ROOT_SIGNATURE		"DXDIR01"
+#define OCFS2_DX_LEAF_SIGNATURE		"DXLEAF1"
+#define OCFS2_REFCOUNT_BLOCK_SIGNATURE	"REFCNT1"
+
+/* Compatibility flags */
+#define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_compat & (mask) )
+#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
+#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_incompat & (mask) )
+#define OCFS2_SET_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat |= (mask)
+#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
+#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat |= (mask)
+#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat &= ~(mask)
+#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
+#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
+
+#define OCFS2_FEATURE_COMPAT_SUPP	(OCFS2_FEATURE_COMPAT_BACKUP_SB	\
+					 | OCFS2_FEATURE_COMPAT_JBD2_SB)
+#define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
+					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
+					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
+					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
+					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
+					 | OCFS2_FEATURE_INCOMPAT_XATTR \
+					 | OCFS2_FEATURE_INCOMPAT_META_ECC \
+					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
+					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
+					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	\
+					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
+					 | OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+
+/*
+ * Heartbeat-only devices are missing journals and other files.  The
+ * filesystem driver can't load them, but the library can.  Never put
+ * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
+ */
+#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV	0x0002
+
+/*
+ * tunefs sets this incompat flag before starting the resize and clears it
+ * at the end. This flag protects users from inadvertently mounting the fs
+ * after an aborted run without fsck-ing.
+ */
+#define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG    0x0004
+
+/* Used to denote a non-clustered volume */
+#define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT	0x0008
+
+/* Support for sparse allocation in b-trees */
+#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC	0x0010
+
+/*
+ * Tunefs sets this incompat flag before starting an operation which
+ * would require cleanup on abort. This is done to protect users from
+ * inadvertently mounting the fs after an aborted run without
+ * fsck-ing.
+ *
+ * s_tunefs_flags on the super block describes precisely which
+ * operations were in progress.
+ */
+#define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG	0x0020
+
+/* Support for data packed into inode blocks */
+#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA	0x0040
+
+/*
+ * Support for alternate, userspace cluster stacks.  If set, the superblock
+ * field s_cluster_info contains a tag for the alternate stack in use as
+ * well as the name of the cluster being joined.
+ * mount.ocfs2 must pass in a matching stack name.
+ *
+ * If not set, the classic stack will be used.  This is compatbile with
+ * all older versions.
+ */
+#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK	0x0080
+
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+/* Support for extended attributes */
+#define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
+
+/* Support for indexed directores */
+#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS	0x0400
+
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC		0x0800
+
+/* Refcount tree support */
+#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE	0x1000
+
+/* Discontigous block groups */
+#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	0x2000
+
+/*
+ * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * cluster stacks (userspace adnd o2cb). If this bit is set,
+ * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
+ */
+#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO	0x4000
+
+/*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO	0x8000
+
+/*
+ * backup superblock flag is used to indicate that this volume
+ * has backup superblocks.
+ */
+#define OCFS2_FEATURE_COMPAT_BACKUP_SB		0x0001
+
+/*
+ * The filesystem will correctly handle journal feature bits.
+ */
+#define OCFS2_FEATURE_COMPAT_JBD2_SB		0x0002
+
+/*
+ * Unwritten extents support.
+ */
+#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN	0x0001
+
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA	0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA	0x0004
+
+
+/* The byte offset of the first backup block will be 1G.
+ * The following will be 4G, 16G, 64G, 256G and 1T.
+ */
+#define OCFS2_BACKUP_SB_START			1 << 30
+
+/* the max backup superblock nums */
+#define OCFS2_MAX_BACKUP_SUPERBLOCKS	6
+
+/*
+ * Flags on ocfs2_super_block.s_tunefs_flags
+ */
+#define OCFS2_TUNEFS_INPROG_REMOVE_SLOT		0x0001	/* Removing slots */
+
+/*
+ * Flags on ocfs2_dinode.i_flags
+ */
+#define OCFS2_VALID_FL		(0x00000001)	/* Inode is valid */
+#define OCFS2_UNUSED2_FL	(0x00000002)
+#define OCFS2_ORPHANED_FL	(0x00000004)	/* On the orphan list */
+#define OCFS2_UNUSED3_FL	(0x00000008)
+/* System inode flags */
+#define OCFS2_SYSTEM_FL		(0x00000010)	/* System inode */
+#define OCFS2_SUPER_BLOCK_FL	(0x00000020)	/* Super block */
+#define OCFS2_LOCAL_ALLOC_FL	(0x00000040)	/* Slot local alloc bitmap */
+#define OCFS2_BITMAP_FL		(0x00000080)	/* Allocation bitmap */
+#define OCFS2_JOURNAL_FL	(0x00000100)	/* Slot local journal */
+#define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
+#define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
+#define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
+#define OCFS2_QUOTA_FL		(0x00001000)	/* Quota file */
+#define OCFS2_DIO_ORPHANED_FL	(0X00002000)	/* On the orphan list especially
+						 * for dio */
+
+/*
+ * Flags on ocfs2_dinode.i_dyn_features
+ *
+ * These can change much more often than i_flags. When adding flags,
+ * keep in mind that i_dyn_features is only 16 bits wide.
+ */
+#define OCFS2_INLINE_DATA_FL	(0x0001)	/* Data stored in inode block */
+#define OCFS2_HAS_XATTR_FL	(0x0002)
+#define OCFS2_INLINE_XATTR_FL	(0x0004)
+#define OCFS2_INDEXED_DIR_FL	(0x0008)
+#define OCFS2_HAS_REFCOUNT_FL   (0x0010)
+
+/* Inode attributes, keep in sync with EXT2 */
+#define OCFS2_SECRM_FL			FS_SECRM_FL	/* Secure deletion */
+#define OCFS2_UNRM_FL			FS_UNRM_FL	/* Undelete */
+#define OCFS2_COMPR_FL			FS_COMPR_FL	/* Compress file */
+#define OCFS2_SYNC_FL			FS_SYNC_FL	/* Synchronous updates */
+#define OCFS2_IMMUTABLE_FL		FS_IMMUTABLE_FL	/* Immutable file */
+#define OCFS2_APPEND_FL			FS_APPEND_FL	/* writes to file may only append */
+#define OCFS2_NODUMP_FL			FS_NODUMP_FL	/* do not dump file */
+#define OCFS2_NOATIME_FL		FS_NOATIME_FL	/* do not update atime */
+/* Reserved for compression usage... */
+#define OCFS2_DIRTY_FL			FS_DIRTY_FL
+#define OCFS2_COMPRBLK_FL		FS_COMPRBLK_FL	/* One or more compressed clusters */
+#define OCFS2_NOCOMP_FL			FS_NOCOMP_FL	/* Don't compress */
+#define OCFS2_ECOMPR_FL			FS_ECOMPR_FL	/* Compression error */
+/* End compression flags --- maybe not all used */
+#define OCFS2_BTREE_FL			FS_BTREE_FL	/* btree format dir */
+#define OCFS2_INDEX_FL			FS_INDEX_FL	/* hash-indexed directory */
+#define OCFS2_IMAGIC_FL			FS_IMAGIC_FL	/* AFS directory */
+#define OCFS2_JOURNAL_DATA_FL		FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define OCFS2_NOTAIL_FL			FS_NOTAIL_FL	/* file tail should not be merged */
+#define OCFS2_DIRSYNC_FL		FS_DIRSYNC_FL	/* dirsync behaviour (directories only) */
+#define OCFS2_TOPDIR_FL			FS_TOPDIR_FL	/* Top of directory hierarchies*/
+#define OCFS2_RESERVED_FL		FS_RESERVED_FL	/* reserved for ext2 lib */
+
+#define OCFS2_FL_VISIBLE		FS_FL_USER_VISIBLE	/* User visible flags */
+#define OCFS2_FL_MODIFIABLE		FS_FL_USER_MODIFIABLE	/* User modifiable flags */
+
+/*
+ * Extent record flags (e_node.leaf.flags)
+ */
+#define OCFS2_EXT_UNWRITTEN		(0x01)	/* Extent is allocated but
+						 * unwritten */
+#define OCFS2_EXT_REFCOUNTED		(0x02)  /* Extent is reference
+						 * counted in an associated
+						 * refcount tree */
+
+/*
+ * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
+ */
+#define OCFS2_JOURNAL_DIRTY_FL	(0x00000001)	/* Journal needs recovery */
+
+/*
+ * superblock s_state flags
+ */
+#define OCFS2_ERROR_FS		(0x00000001)	/* FS saw errors */
+
+/* Limit of space in ocfs2_dir_entry */
+#define OCFS2_MAX_FILENAME_LEN		255
+
+/* Maximum slots on an ocfs2 file system */
+#define OCFS2_MAX_SLOTS			255
+
+/* Slot map indicator for an empty slot */
+#define OCFS2_INVALID_SLOT		-1
+
+#define OCFS2_VOL_UUID_LEN		16
+#define OCFS2_MAX_VOL_LABEL_LEN		64
+
+/* The cluster stack fields */
+#define OCFS2_STACK_LABEL_LEN		4
+#define OCFS2_CLUSTER_NAME_LEN		16
+
+/* Classic (historically speaking) cluster stack */
+#define OCFS2_CLASSIC_CLUSTER_STACK	"o2cb"
+
+/* Journal limits (in bytes) */
+#define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
+
+/*
+ * Inline extended attribute size (in bytes)
+ * The value chosen should be aligned to 16 byte boundaries.
+ */
+#define OCFS2_MIN_XATTR_INLINE_SIZE     256
+
+/*
+ * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
+ */
+#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT	(0x01)
+
+struct ocfs2_system_inode_info {
+	char	*si_name;
+	int	si_iflags;
+	int	si_mode;
+};
+
+/* System file index */
+enum {
+	BAD_BLOCK_SYSTEM_INODE = 0,
+	GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+	SLOT_MAP_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
+	HEARTBEAT_SYSTEM_INODE,
+	GLOBAL_BITMAP_SYSTEM_INODE,
+	USER_QUOTA_SYSTEM_INODE,
+	GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
+	ORPHAN_DIR_SYSTEM_INODE,
+	EXTENT_ALLOC_SYSTEM_INODE,
+	INODE_ALLOC_SYSTEM_INODE,
+	JOURNAL_SYSTEM_INODE,
+	LOCAL_ALLOC_SYSTEM_INODE,
+	TRUNCATE_LOG_SYSTEM_INODE,
+	LOCAL_USER_QUOTA_SYSTEM_INODE,
+	LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
+	NUM_SYSTEM_INODES
+};
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES	\
+		(NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
+
+static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
+	/* Global system inodes (single copy) */
+	/* The first two are only used from userspace mfks/tunefs */
+	[BAD_BLOCK_SYSTEM_INODE]		= { "bad_blocks", 0, S_IFREG | 0644 },
+	[GLOBAL_INODE_ALLOC_SYSTEM_INODE] 	= { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+
+	/* These are used by the running filesystem */
+	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
+	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
+	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
+	[USER_QUOTA_SYSTEM_INODE]		= { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+	[GROUP_QUOTA_SYSTEM_INODE]		= { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+
+	/* Slot-specific system inodes (one copy per slot) */
+	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
+	[EXTENT_ALLOC_SYSTEM_INODE]		= { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
+	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
+	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+	[LOCAL_USER_QUOTA_SYSTEM_INODE]		= { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+	[LOCAL_GROUP_QUOTA_SYSTEM_INODE]	= { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+};
+
+/* Parameter passed from mount.ocfs2 to module */
+#define OCFS2_HB_NONE			"heartbeat=none"
+#define OCFS2_HB_LOCAL			"heartbeat=local"
+#define OCFS2_HB_GLOBAL			"heartbeat=global"
+
+/*
+ * OCFS2 directory file types.  Only the low 3 bits are used.  The
+ * other bits are reserved for now.
+ */
+#define OCFS2_FT_UNKNOWN	0
+#define OCFS2_FT_REG_FILE	1
+#define OCFS2_FT_DIR		2
+#define OCFS2_FT_CHRDEV		3
+#define OCFS2_FT_BLKDEV		4
+#define OCFS2_FT_FIFO		5
+#define OCFS2_FT_SOCK		6
+#define OCFS2_FT_SYMLINK	7
+
+#define OCFS2_FT_MAX		8
+
+/*
+ * OCFS2_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define OCFS2_DIR_PAD			4
+#define OCFS2_DIR_ROUND			(OCFS2_DIR_PAD - 1)
+#define OCFS2_DIR_MEMBER_LEN 		offsetof(struct ocfs2_dir_entry, name)
+#define OCFS2_DIR_REC_LEN(name_len)	(((name_len) + OCFS2_DIR_MEMBER_LEN + \
+                                          OCFS2_DIR_ROUND) & \
+					 ~OCFS2_DIR_ROUND)
+#define OCFS2_DIR_MIN_REC_LEN	OCFS2_DIR_REC_LEN(1)
+
+#define OCFS2_LINK_MAX		32000
+#define	OCFS2_DX_LINK_MAX	((1U << 31) - 1U)
+#define	OCFS2_LINKS_HI_SHIFT	16
+#define	OCFS2_DX_ENTRIES_MAX	(0xffffffffU)
+
+#define S_SHIFT			12
+static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]  = OCFS2_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]  = OCFS2_FT_DIR,
+	[S_IFCHR >> S_SHIFT]  = OCFS2_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]  = OCFS2_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]  = OCFS2_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]  = OCFS2_FT_SYMLINK,
+};
+
+
+/*
+ * Convenience casts
+ */
+#define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
+
+/*
+ * Block checking structure.  This is used in metadata to validate the
+ * contents.  If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ */
+struct ocfs2_block_check {
+/*00*/	__le32 bc_crc32e;	/* 802.3 Ethernet II CRC32 */
+	__le16 bc_ecc;		/* Single-error-correction parity vector.
+				   This is a simple Hamming code dependent
+				   on the blocksize.  OCFS2's maximum
+				   blocksize, 4K, requires 16 parity bits,
+				   so we fit in __le16. */
+	__le16 bc_reserved1;
+/*08*/
+};
+
+/*
+ * On disk extent record for OCFS2
+ * It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
+ */
+struct ocfs2_extent_rec {
+/*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
+	union {
+		__le32 e_int_clusters; /* Clusters covered by all children */
+		struct {
+			__le16 e_leaf_clusters; /* Clusters covered by this
+						   extent */
+			__u8 e_reserved1;
+			__u8 e_flags; /* Extent flags */
+		};
+	};
+	__le64 e_blkno;		/* Physical disk offset, in blocks */
+/*10*/
+};
+
+struct ocfs2_chain_rec {
+	__le32 c_free;	/* Number of free bits in this chain. */
+	__le32 c_total;	/* Number of total bits in this chain */
+	__le64 c_blkno;	/* Physical disk offset (blocks) of 1st group */
+};
+
+struct ocfs2_truncate_rec {
+	__le32 t_start;		/* 1st cluster in this log */
+	__le32 t_clusters;	/* Number of total clusters covered */
+};
+
+/*
+ * On disk extent list for OCFS2 (node in the tree).  Note that this
+ * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
+ * offsets are relative to ocfs2_dinode.id2.i_list or
+ * ocfs2_extent_block.h_list, respectively.
+ */
+struct ocfs2_extent_list {
+/*00*/	__le16 l_tree_depth;		/* Extent tree depth from this
+					   point.  0 means data extents
+					   hang directly off this
+					   header (a leaf)
+					   NOTE: The high 8 bits cannot be
+					   used - tree_depth is never that big.
+					*/
+	__le16 l_count;			/* Number of extent records */
+	__le16 l_next_free_rec;		/* Next unused extent slot */
+	__le16 l_reserved1;
+	__le64 l_reserved2;		/* Pad to
+					   sizeof(ocfs2_extent_rec) */
+/*10*/	struct ocfs2_extent_rec l_recs[0];	/* Extent records */
+};
+
+/*
+ * On disk allocation chain list for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_chain.
+ */
+struct ocfs2_chain_list {
+/*00*/	__le16 cl_cpg;			/* Clusters per Block Group */
+	__le16 cl_bpc;			/* Bits per cluster */
+	__le16 cl_count;		/* Total chains in this list */
+	__le16 cl_next_free_rec;	/* Next unused chain slot */
+	__le64 cl_reserved1;
+/*10*/	struct ocfs2_chain_rec cl_recs[0];	/* Chain records */
+};
+
+/*
+ * On disk deallocation log for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_dealloc.
+ */
+struct ocfs2_truncate_log {
+/*00*/	__le16 tl_count;		/* Total records in this log */
+	__le16 tl_used;			/* Number of records in use */
+	__le32 tl_reserved1;
+/*08*/	struct ocfs2_truncate_rec tl_recs[0];	/* Truncate records */
+};
+
+/*
+ * On disk extent block (indirect block) for OCFS2
+ */
+struct ocfs2_extent_block
+{
+/*00*/	__u8 h_signature[8];		/* Signature for verification */
+	struct ocfs2_block_check h_check;	/* Error checking */
+/*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
+					   extent_header belongs to */
+	__le16 h_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+	__le32 h_fs_generation;		/* Must match super block */
+	__le64 h_blkno;			/* Offset on disk, in blocks */
+/*20*/	__le64 h_suballoc_loc;		/* Suballocator block group this
+					   eb belongs to.  Only valid
+					   if allocated from a
+					   discontiguous block group */
+	__le64 h_next_leaf_blk;		/* Offset on disk, in blocks,
+					   of next leaf header pointing
+					   to data */
+/*30*/	struct ocfs2_extent_list h_list;	/* Extent record list */
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On disk slot map for OCFS2.  This defines the contents of the "slot_map"
+ * system file.  A slot is valid if it contains a node number >= 0.  The
+ * value -1 (0xFFFF) is OCFS2_INVALID_SLOT.  This marks a slot empty.
+ */
+struct ocfs2_slot_map {
+/*00*/	__le16 sm_slots[0];
+/*
+ * Actual on-disk size is one block.  OCFS2_MAX_SLOTS is 255,
+ * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize.
+ */
+};
+
+struct ocfs2_extended_slot {
+/*00*/	__u8	es_valid;
+	__u8	es_reserved1[3];
+	__le32	es_node_num;
+/*10*/
+};
+
+/*
+ * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
+ * is set.  It separates out the valid marker from the node number, and
+ * has room to grow.  Unlike the old slot map, this format is defined by
+ * i_size.
+ */
+struct ocfs2_slot_map_extended {
+/*00*/	struct ocfs2_extended_slot se_slots[0];
+/*
+ * Actual size is i_size of the slot_map system file.  It should
+ * match s_max_slots * sizeof(struct ocfs2_extended_slot)
+ */
+};
+
+/*
+ * ci_stackflags is only valid if the incompat bit
+ * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
+ */
+struct ocfs2_cluster_info {
+/*00*/	__u8   ci_stack[OCFS2_STACK_LABEL_LEN];
+	union {
+		__le32 ci_reserved;
+		struct {
+			__u8 ci_stackflags;
+			__u8 ci_reserved1;
+			__u8 ci_reserved2;
+			__u8 ci_reserved3;
+		};
+	};
+/*08*/	__u8   ci_cluster[OCFS2_CLUSTER_NAME_LEN];
+/*18*/
+};
+
+/*
+ * On disk superblock for OCFS2
+ * Note that it is contained inside an ocfs2_dinode, so all offsets
+ * are relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_super_block {
+/*00*/	__le16 s_major_rev_level;
+	__le16 s_minor_rev_level;
+	__le16 s_mnt_count;
+	__le16 s_max_mnt_count;
+	__le16 s_state;			/* File system state */
+	__le16 s_errors;			/* Behaviour when detecting errors */
+	__le32 s_checkinterval;		/* Max time between checks */
+/*10*/	__le64 s_lastcheck;		/* Time of last check */
+	__le32 s_creator_os;		/* OS */
+	__le32 s_feature_compat;		/* Compatible feature set */
+/*20*/	__le32 s_feature_incompat;	/* Incompatible feature set */
+	__le32 s_feature_ro_compat;	/* Readonly-compatible feature set */
+	__le64 s_root_blkno;		/* Offset, in blocks, of root directory
+					   dinode */
+/*30*/	__le64 s_system_dir_blkno;	/* Offset, in blocks, of system
+					   directory dinode */
+	__le32 s_blocksize_bits;		/* Blocksize for this fs */
+	__le32 s_clustersize_bits;	/* Clustersize for this fs */
+/*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
+					   before tunefs required */
+	__le16 s_tunefs_flag;
+	__le32 s_uuid_hash;		/* hash value of uuid */
+	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
+					 * group header */
+/*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
+/*90*/	__u8  s_uuid[OCFS2_VOL_UUID_LEN];	/* 128-bit uuid */
+/*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
+						     userspace or clusterinfo
+						     INCOMPAT flag set. */
+/*B8*/	__le16 s_xattr_inline_size;	/* extended attribute inline size
+					   for this fs*/
+	__le16 s_reserved0;
+	__le32 s_dx_seed[3];		/* seed[0-2] for dx dir hash.
+					 * s_uuid_hash serves as seed[3]. */
+/*C0*/  __le64 s_reserved2[15];		/* Fill out superblock */
+/*140*/
+
+	/*
+	 * NOTE: As stated above, all offsets are relative to
+	 * ocfs2_dinode.id2, which is at 0xC0 in the inode.
+	 * 0xC0 + 0x140 = 0x200 or 512 bytes.  A superblock must fit within
+	 * our smallest blocksize, which is 512 bytes.  To ensure this,
+	 * we reserve the space in s_reserved2.  Anything past s_reserved2
+	 * will not be available on the smallest blocksize.
+	 */
+};
+
+/*
+ * Local allocation bitmap for OCFS2 slots
+ * Note that it exists inside an ocfs2_dinode, so all offsets are
+ * relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_local_alloc
+{
+/*00*/	__le32 la_bm_off;	/* Starting bit offset in main bitmap */
+	__le16 la_size;		/* Size of included bitmap, in bytes */
+	__le16 la_reserved1;
+	__le64 la_reserved2;
+/*10*/	__u8   la_bitmap[0];
+};
+
+/*
+ * Data-in-inode header. This is only used if i_dyn_features has
+ * OCFS2_INLINE_DATA_FL set.
+ */
+struct ocfs2_inline_data
+{
+/*00*/	__le16	id_count;	/* Number of bytes that can be used
+				 * for data, starting at id_data */
+	__le16	id_reserved0;
+	__le32	id_reserved1;
+	__u8	id_data[0];	/* Start of user data */
+};
+
+/*
+ * On disk inode for OCFS2
+ */
+struct ocfs2_dinode {
+/*00*/	__u8 i_signature[8];		/* Signature for validation */
+	__le32 i_generation;		/* Generation number */
+	__le16 i_suballoc_slot;		/* Slot suballocator this inode
+					   belongs to */
+	__le16 i_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+/*10*/	__le16 i_links_count_hi;	/* High 16 bits of links count */
+	__le16 i_xattr_inline_size;
+	__le32 i_clusters;		/* Cluster count */
+	__le32 i_uid;			/* Owner UID */
+	__le32 i_gid;			/* Owning GID */
+/*20*/	__le64 i_size;			/* Size in bytes */
+	__le16 i_mode;			/* File mode */
+	__le16 i_links_count;		/* Links count */
+	__le32 i_flags;			/* File flags */
+/*30*/	__le64 i_atime;			/* Access time */
+	__le64 i_ctime;			/* Creation time */
+/*40*/	__le64 i_mtime;			/* Modification time */
+	__le64 i_dtime;			/* Deletion time */
+/*50*/	__le64 i_blkno;			/* Offset on disk, in blocks */
+	__le64 i_last_eb_blk;		/* Pointer to last extent
+					   block */
+/*60*/	__le32 i_fs_generation;		/* Generation per fs-instance */
+	__le32 i_atime_nsec;
+	__le32 i_ctime_nsec;
+	__le32 i_mtime_nsec;
+/*70*/	__le32 i_attr;
+	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
+					   was set in i_flags */
+	__le16 i_dyn_features;
+	__le64 i_xattr_loc;
+/*80*/	struct ocfs2_block_check i_check;	/* Error checking */
+/*88*/	__le64 i_dx_root;		/* Pointer to dir index root block */
+/*90*/	__le64 i_refcount_loc;
+	__le64 i_suballoc_loc;		/* Suballocator block group this
+					   inode belongs to.  Only valid
+					   if allocated from a
+					   discontiguous block group */
+/*A0*/	__le16 i_dio_orphaned_slot;	/* only used for append dio write */
+	__le16 i_reserved1[3];
+	__le64 i_reserved2[2];
+/*B8*/	union {
+		__le64 i_pad1;		/* Generic way to refer to this
+					   64bit union */
+		struct {
+			__le64 i_rdev;	/* Device number */
+		} dev1;
+		struct {		/* Info for bitmap system
+					   inodes */
+			__le32 i_used;	/* Bits (ie, clusters) used  */
+			__le32 i_total;	/* Total bits (clusters)
+					   available */
+		} bitmap1;
+		struct {		/* Info for journal system
+					   inodes */
+			__le32 ij_flags;	/* Mounted, version, etc. */
+			__le32 ij_recovery_generation; /* Incremented when the
+							  journal is recovered
+							  after an unclean
+							  shutdown */
+		} journal1;
+	} id1;				/* Inode type dependent 1 */
+/*C0*/	union {
+		struct ocfs2_super_block	i_super;
+		struct ocfs2_local_alloc	i_lab;
+		struct ocfs2_chain_list		i_chain;
+		struct ocfs2_extent_list	i_list;
+		struct ocfs2_truncate_log	i_dealloc;
+		struct ocfs2_inline_data	i_data;
+		__u8               		i_symlink[0];
+	} id2;
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On-disk directory entry structure for OCFS2
+ *
+ * Packed as this structure could be accessed unaligned on 64-bit platforms
+ */
+struct ocfs2_dir_entry {
+/*00*/	__le64   inode;                  /* Inode number */
+	__le16   rec_len;                /* Directory entry length */
+	__u8    name_len;               /* Name length */
+	__u8    file_type;
+/*0C*/	char    name[OCFS2_MAX_FILENAME_LEN];   /* File name */
+/* Actual on-disk length specified by rec_len */
+} __attribute__ ((packed));
+
+/*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/	__le64		db_compat_inode;	/* Always zero. Was inode */
+
+	__le16		db_compat_rec_len;	/* Backwards compatible with
+						 * ocfs2_dir_entry. */
+	__u8		db_compat_name_len;	/* Always zero. Was name_len */
+	__u8		db_reserved0;
+	__le16		db_reserved1;
+	__le16		db_free_rec_len;	/* Size of largest empty hole
+						 * in this block. (unused) */
+/*10*/	__u8		db_signature[8];	/* Signature for verification */
+	__le64		db_reserved2;
+	__le64		db_free_next;		/* Next block in list (unused) */
+/*20*/	__le64		db_blkno;		/* Offset on disk, in blocks */
+	__le64		db_parent_dinode;	/* dinode which owns me, in
+						   blocks */
+/*30*/	struct ocfs2_block_check db_check;	/* Error checking */
+/*40*/
+};
+
+ /*
+ * A directory entry in the indexed tree. We don't store the full name here,
+ * but instead provide a pointer to the full dirent in the unindexed tree.
+ *
+ * We also store name_len here so as to reduce the number of leaf blocks we
+ * need to search in case of collisions.
+ */
+struct ocfs2_dx_entry {
+	__le32		dx_major_hash;	/* Used to find logical
+					 * cluster in index */
+	__le32		dx_minor_hash;	/* Lower bits used to find
+					 * block in cluster */
+	__le64		dx_dirent_blk;	/* Physical block in unindexed
+					 * tree holding this dirent. */
+};
+
+struct ocfs2_dx_entry_list {
+	__le32		de_reserved;
+	__le16		de_count;	/* Maximum number of entries
+					 * possible in de_entries */
+	__le16		de_num_used;	/* Current number of
+					 * de_entries entries */
+	struct	ocfs2_dx_entry		de_entries[0];	/* Indexed dir entries
+							 * in a packed array of
+							 * length de_num_used */
+};
+
+#define OCFS2_DX_FLAG_INLINE	0x01
+
+/*
+ * A directory indexing block. Each indexed directory has one of these,
+ * pointed to by ocfs2_dinode.
+ *
+ * This block stores an indexed btree root, and a set of free space
+ * start-of-list pointers.
+ */
+struct ocfs2_dx_root_block {
+	__u8		dr_signature[8];	/* Signature for verification */
+	struct ocfs2_block_check dr_check;	/* Error checking */
+	__le16		dr_suballoc_slot;	/* Slot suballocator this
+						 * block belongs to. */
+	__le16		dr_suballoc_bit;	/* Bit offset in suballocator
+						 * block group */
+	__le32		dr_fs_generation;	/* Must match super block */
+	__le64		dr_blkno;		/* Offset on disk, in blocks */
+	__le64		dr_last_eb_blk;		/* Pointer to last
+						 * extent block */
+	__le32		dr_clusters;		/* Clusters allocated
+						 * to the indexed tree. */
+	__u8		dr_flags;		/* OCFS2_DX_FLAG_* flags */
+	__u8		dr_reserved0;
+	__le16		dr_reserved1;
+	__le64		dr_dir_blkno;		/* Pointer to parent inode */
+	__le32		dr_num_entries;		/* Total number of
+						 * names stored in
+						 * this directory.*/
+	__le32		dr_reserved2;
+	__le64		dr_free_blk;		/* Pointer to head of free
+						 * unindexed block list. */
+	__le64		dr_suballoc_loc;	/* Suballocator block group
+						   this root belongs to.
+						   Only valid if allocated
+						   from a discontiguous
+						   block group */
+	__le64		dr_reserved3[14];
+	union {
+		struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
+						   * bits for maximum space
+						   * efficiency. */
+		struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
+							* entries. We grow out
+							* to extents if this
+							* gets too big. */
+	};
+};
+
+/*
+ * The header of a leaf block in the indexed tree.
+ */
+struct ocfs2_dx_leaf {
+	__u8		dl_signature[8];/* Signature for verification */
+	struct ocfs2_block_check dl_check;	/* Error checking */
+	__le64		dl_blkno;	/* Offset on disk, in blocks */
+	__le32		dl_fs_generation;/* Must match super block */
+	__le32		dl_reserved0;
+	__le64		dl_reserved1;
+	struct ocfs2_dx_entry_list	dl_list;
+};
+
+/*
+ * Largest bitmap for a block (suballocator) group in bytes.  This limit
+ * does not affect cluster groups (global allocator).  Cluster group
+ * bitmaps run to the end of the block.
+ */
+#define OCFS2_MAX_BG_BITMAP_SIZE	256
+
+/*
+ * On disk allocator group structure for OCFS2
+ */
+struct ocfs2_group_desc
+{
+/*00*/	__u8    bg_signature[8];        /* Signature for validation */
+	__le16   bg_size;                /* Size of included bitmap in
+					   bytes. */
+	__le16   bg_bits;                /* Bits represented by this
+					   group. */
+	__le16	bg_free_bits_count;     /* Free bits count */
+	__le16   bg_chain;               /* What chain I am in. */
+/*10*/	__le32   bg_generation;
+	__le32	bg_reserved1;
+	__le64   bg_next_group;          /* Next group in my list, in
+					   blocks */
+/*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
+					   blocks */
+	__le64   bg_blkno;               /* Offset on disk, in blocks */
+/*30*/	struct ocfs2_block_check bg_check;	/* Error checking */
+	__le64   bg_reserved2;
+/*40*/	union {
+		__u8    bg_bitmap[0];
+		struct {
+			/*
+			 * Block groups may be discontiguous when
+			 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
+			 * The extents of a discontigous block group are
+			 * stored in bg_list.  It is a flat list.
+			 * l_tree_depth must always be zero.  A
+			 * discontiguous group is signified by a non-zero
+			 * bg_list->l_next_free_rec.  Only block groups
+			 * can be discontiguous; Cluster groups cannot.
+			 * We've never made a block group with more than
+			 * 2048 blocks (256 bytes of bg_bitmap).  This
+			 * codifies that limit so that we can fit bg_list.
+			 * bg_size of a discontiguous block group will
+			 * be 256 to match bg_bitmap_filler.
+			 */
+			__u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
+/*140*/			struct ocfs2_extent_list bg_list;
+		};
+	};
+/* Actual on-disk size is one block */
+};
+
+struct ocfs2_refcount_rec {
+/*00*/	__le64 r_cpos;		/* Physical offset, in clusters */
+	__le32 r_clusters;	/* Clusters covered by this extent */
+	__le32 r_refcount;	/* Reference count of this extent */
+/*10*/
+};
+#define OCFS2_32BIT_POS_MASK		(0xffffffffULL)
+
+#define OCFS2_REFCOUNT_LEAF_FL          (0x00000001)
+#define OCFS2_REFCOUNT_TREE_FL          (0x00000002)
+
+struct ocfs2_refcount_list {
+/*00*/	__le16 rl_count;	/* Maximum number of entries possible
+				   in rl_records */
+	__le16 rl_used;		/* Current number of used records */
+	__le32 rl_reserved2;
+	__le64 rl_reserved1;	/* Pad to sizeof(ocfs2_refcount_record) */
+/*10*/	struct ocfs2_refcount_rec rl_recs[0];	/* Refcount records */
+};
+
+
+struct ocfs2_refcount_block {
+/*00*/	__u8 rf_signature[8];		/* Signature for verification */
+	__le16 rf_suballoc_slot;	/* Slot suballocator this block
+					   belongs to */
+	__le16 rf_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+	__le32 rf_fs_generation;	/* Must match superblock */
+/*10*/	__le64 rf_blkno;		/* Offset on disk, in blocks */
+	__le64 rf_parent;		/* Parent block, only valid if
+					   OCFS2_REFCOUNT_LEAF_FL is set in
+					   rf_flags */
+/*20*/	struct ocfs2_block_check rf_check;	/* Error checking */
+	__le64 rf_last_eb_blk;		/* Pointer to last extent block */
+/*30*/	__le32 rf_count;		/* Number of inodes sharing this
+					   refcount tree */
+	__le32 rf_flags;		/* See the flags above */
+	__le32 rf_clusters;		/* clusters covered by refcount tree. */
+	__le32 rf_cpos;			/* cluster offset in refcount tree.*/
+/*40*/	__le32 rf_generation;		/* generation number. all be the same
+					 * for the same refcount tree. */
+	__le32 rf_reserved0;
+	__le64 rf_suballoc_loc;		/* Suballocator block group this
+					   refcount block belongs to. Only
+					   valid if allocated from a
+					   discontiguous block group */
+/*50*/	__le64 rf_reserved1[6];
+/*80*/	union {
+		struct ocfs2_refcount_list rf_records;  /* List of refcount
+							  records */
+		struct ocfs2_extent_list rf_list;	/* Extent record list,
+							only valid if
+							OCFS2_REFCOUNT_TREE_FL
+							is set in rf_flags */
+	};
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On disk extended attribute structure for OCFS2.
+ */
+
+/*
+ * ocfs2_xattr_entry indicates one extend attribute.
+ *
+ * Note that it can be stored in inode, one block or one xattr bucket.
+ */
+struct ocfs2_xattr_entry {
+	__le32	xe_name_hash;    /* hash value of xattr prefix+suffix. */
+	__le16	xe_name_offset;  /* byte offset from the 1st entry in the
+				    local xattr storage(inode, xattr block or
+				    xattr bucket). */
+	__u8	xe_name_len;	 /* xattr name len, doesn't include prefix. */
+	__u8	xe_type;         /* the low 7 bits indicate the name prefix
+				  * type and the highest bit indicates whether
+				  * the EA is stored in the local storage. */
+	__le64	xe_value_size;	 /* real xattr value length. */
+};
+
+/*
+ * On disk structure for xattr header.
+ *
+ * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in
+ * the local xattr storage.
+ */
+struct ocfs2_xattr_header {
+	__le16	xh_count;                       /* contains the count of how
+						   many records are in the
+						   local xattr storage. */
+	__le16	xh_free_start;                  /* current offset for storing
+						   xattr. */
+	__le16	xh_name_value_len;              /* total length of name/value
+						   length in this bucket. */
+	__le16	xh_num_buckets;                 /* Number of xattr buckets
+						   in this extent record,
+						   only valid in the first
+						   bucket. */
+	struct ocfs2_block_check xh_check;	/* Error checking
+						   (Note, this is only
+						    used for xattr
+						    buckets.  A block uses
+						    xb_check and sets
+						    this field to zero.) */
+	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+};
+
+/*
+ * On disk structure for xattr value root.
+ *
+ * When an xattr's value is large enough, it is stored in an external
+ * b-tree like file data.  The xattr value root points to this structure.
+ */
+struct ocfs2_xattr_value_root {
+/*00*/	__le32	xr_clusters;              /* clusters covered by xattr value. */
+	__le32	xr_reserved0;
+	__le64	xr_last_eb_blk;           /* Pointer to last extent block */
+/*10*/	struct ocfs2_extent_list xr_list; /* Extent record list */
+};
+
+/*
+ * On disk structure for xattr tree root.
+ *
+ * It is used when there are too many extended attributes for one file. These
+ * attributes will be organized and stored in an indexed-btree.
+ */
+struct ocfs2_xattr_tree_root {
+/*00*/	__le32	xt_clusters;              /* clusters covered by xattr. */
+	__le32	xt_reserved0;
+	__le64	xt_last_eb_blk;           /* Pointer to last extent block */
+/*10*/	struct ocfs2_extent_list xt_list; /* Extent record list */
+};
+
+#define OCFS2_XATTR_INDEXED	0x1
+#define OCFS2_HASH_SHIFT	5
+#define OCFS2_XATTR_ROUND	3
+#define OCFS2_XATTR_SIZE(size)	(((size) + OCFS2_XATTR_ROUND) & \
+				~(OCFS2_XATTR_ROUND))
+
+#define OCFS2_XATTR_BUCKET_SIZE			4096
+#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET 	(OCFS2_XATTR_BUCKET_SIZE \
+						 / OCFS2_MIN_BLOCKSIZE)
+
+/*
+ * On disk structure for xattr block.
+ */
+struct ocfs2_xattr_block {
+/*00*/	__u8	xb_signature[8];     /* Signature for verification */
+	__le16	xb_suballoc_slot;    /* Slot suballocator this
+					block belongs to. */
+	__le16	xb_suballoc_bit;     /* Bit offset in suballocator
+					block group */
+	__le32	xb_fs_generation;    /* Must match super block */
+/*10*/	__le64	xb_blkno;            /* Offset on disk, in blocks */
+	struct ocfs2_block_check xb_check;	/* Error checking */
+/*20*/	__le16	xb_flags;            /* Indicates whether this block contains
+					real xattr or a xattr tree. */
+	__le16	xb_reserved0;
+	__le32  xb_reserved1;
+	__le64	xb_suballoc_loc;	/* Suballocator block group this
+					   xattr block belongs to. Only
+					   valid if allocated from a
+					   discontiguous block group */
+/*30*/	union {
+		struct ocfs2_xattr_header xb_header; /* xattr header if this
+							block contains xattr */
+		struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
+							block cotains xattr
+							tree. */
+	} xb_attrs;
+};
+
+#define OCFS2_XATTR_ENTRY_LOCAL		0x80
+#define OCFS2_XATTR_TYPE_MASK		0x7F
+static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
+					 int local)
+{
+	if (local)
+		xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
+	else
+		xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
+{
+	return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
+{
+	xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
+}
+
+static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
+{
+	return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
+}
+
+/*
+ *  On disk structures for global quota file
+ */
+
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+	0x0cf52470, /* USRQUOTA */ \
+	0x0cf52471  /* GRPQUOTA */ \
+}
+
+#define OCFS2_GLOBAL_QVERSIONS {\
+	0, \
+	0, \
+}
+
+
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+
+/* Generic header of all quota files */
+struct ocfs2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* Quota format version */
+};
+
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of global quota file (immediately follows the generic
+ * header) */
+struct ocfs2_global_disk_dqinfo {
+/*00*/	__le32 dqi_bgrace;	/* Grace time for space softlimit excess */
+	__le32 dqi_igrace;	/* Grace time for inode softlimit excess */
+	__le32 dqi_syncms;	/* Time after which we sync local changes to
+				 * global quota file */
+	__le32 dqi_blocks;	/* Number of blocks in quota file */
+/*10*/	__le32 dqi_free_blk;	/* First free block in quota file */
+	__le32 dqi_free_entry;	/* First block with free dquot entry in quota
+				 * file */
+};
+
+/* Structure with global user / group information. We reserve some space
+ * for future use. */
+struct ocfs2_global_disk_dqblk {
+/*00*/	__le32 dqb_id;          /* ID the structure belongs to */
+	__le32 dqb_use_count;   /* Number of nodes having reference to this structure */
+	__le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+/*10*/	__le64 dqb_isoftlimit;  /* preferred inode limit */
+	__le64 dqb_curinodes;   /* current # allocated inodes */
+/*20*/	__le64 dqb_bhardlimit;  /* absolute limit on disk space */
+	__le64 dqb_bsoftlimit;  /* preferred limit on disk space */
+/*30*/	__le64 dqb_curspace;    /* current space occupied */
+	__le64 dqb_btime;       /* time limit for excessive disk use */
+/*40*/	__le64 dqb_itime;       /* time limit for excessive inode use */
+	__le64 dqb_pad1;
+/*50*/	__le64 dqb_pad2;
+};
+
+/*
+ *  On-disk structures for local quota file
+ */
+
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+	0x0cf524c0, /* USRQUOTA */ \
+	0x0cf524c1  /* GRPQUOTA */ \
+}
+
+#define OCFS2_LOCAL_QVERSIONS {\
+	0, \
+	0, \
+}
+
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN	0x0001	/* Quota file is empty (this should be after\
+				 * quota has been cleanly turned off) */
+
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of local quota file (immediately follows the generic
+ * header) */
+struct ocfs2_local_disk_dqinfo {
+	__le32 dqi_flags;	/* Flags for quota file */
+	__le32 dqi_chunks;	/* Number of chunks of quota structures
+				 * with a bitmap */
+	__le32 dqi_blocks;	/* Number of blocks allocated for quota file */
+};
+
+/* Header of one chunk of a quota file */
+struct ocfs2_local_disk_chunk {
+	__le32 dqc_free;	/* Number of free entries in the bitmap */
+	__u8 dqc_bitmap[0];	/* Bitmap of entries in the corresponding
+				 * chunk of quota file */
+};
+
+/* One entry in local quota file */
+struct ocfs2_local_disk_dqblk {
+/*00*/	__le64 dqb_id;		/* id this quota applies to */
+	__le64 dqb_spacemod;	/* Change in the amount of used space */
+/*10*/	__le64 dqb_inodemod;	/* Change in the amount of used inodes */
+};
+
+
+/*
+ * The quota trailer lives at the end of each quota block.
+ */
+
+struct ocfs2_disk_dqtrailer {
+/*00*/	struct ocfs2_block_check dq_check;	/* Error checking */
+/*08*/	/* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+								 void *buf)
+{
+	char *ptr = buf;
+	ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
+
+	return (struct ocfs2_disk_dqtrailer *)ptr;
+}
+
+#ifdef __KERNEL__
+static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
+{
+	return  sb->s_blocksize -
+		 offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
+						   struct ocfs2_dinode *di)
+{
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+			xattrsize;
+	else
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
+static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_extent_recs_per_inode_with_xattr(
+						struct super_block *sb,
+						struct ocfs2_dinode *di)
+{
+	int size;
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+			xattrsize;
+	else
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_group_desc, bg_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
+
+	return size / sizeof(struct ocfs2_dx_entry);
+}
+
+static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
+
+	return size / sizeof(struct ocfs2_dx_entry);
+}
+
+static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
+{
+	u16 size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(struct super_block *sb,
+					  int suballocator,
+					  u32 feature_incompat)
+{
+	int size = sb->s_blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	/*
+	 * The cluster allocator uses the entire block.  Suballocators have
+	 * never used more than OCFS2_MAX_BG_BITMAP_SIZE.  Unfortunately, older
+	 * code expects bg_size set to the maximum.  Thus we must keep
+	 * bg_size as-is unless discontig_bg is enabled.
+	 */
+	if (suballocator &&
+	    (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+		size = OCFS2_MAX_BG_BITMAP_SIZE;
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+
+static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
+{
+	u64 offset = OCFS2_BACKUP_SB_START;
+
+	if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) {
+		offset <<= (2 * index);
+		offset >>= sb->s_blocksize_bits;
+		return offset;
+	}
+
+	return 0;
+
+}
+
+static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
+
+	return size / sizeof(struct ocfs2_refcount_rec);
+}
+
+static inline u32
+ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
+{
+	return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+}
+#else
+static inline int ocfs2_fast_symlink_chars(int blocksize)
+{
+	return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_max_inline_data_with_xattr(int blocksize,
+						   struct ocfs2_dinode *di)
+{
+	if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+			di->i_xattr_inline_size;
+	else
+		return blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
+static inline int ocfs2_extent_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline int ocfs2_extent_recs_per_eb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_extent_recs_per_gd(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_group_desc, bg_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_local_alloc_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize,
+					  int suballocator,
+					  uint32_t feature_incompat)
+{
+	int size = sb->s_blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	/*
+	 * The cluster allocator uses the entire block.  Suballocators have
+	 * never used more than OCFS2_MAX_BG_BITMAP_SIZE.  Unfortunately, older
+	 * code expects bg_size set to the maximum.  Thus we must keep
+	 * bg_size as-is unless discontig_bg is enabled.
+	 */
+	if (suballocator &&
+	    (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+		size = OCFS2_MAX_BG_BITMAP_SIZE;
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+
+static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
+{
+	uint64_t offset = OCFS2_BACKUP_SB_START;
+
+	if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) {
+		offset <<= (2 * index);
+		offset /= blocksize;
+		return offset;
+	}
+
+	return 0;
+}
+
+static inline int ocfs2_xattr_recs_per_xb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+#endif  /* __KERNEL__ */
+
+
+static inline int ocfs2_system_inode_is_global(int type)
+{
+	return ((type >= 0) &&
+		(type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
+}
+
+static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
+						  int type, int slot)
+{
+	int chars;
+
+        /*
+         * Global system inodes can only have one copy.  Everything
+         * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
+         * list has a copy per slot.
+         */
+	if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
+		chars = snprintf(buf, len, "%s",
+				 ocfs2_system_inodes[type].si_name);
+	else
+		chars = snprintf(buf, len,
+				 ocfs2_system_inodes[type].si_name,
+				 slot);
+
+	return chars;
+}
+
+static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
+				    umode_t mode)
+{
+	de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
+{
+	if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
+	     le16_to_cpu(gd->bg_size)) !=
+	    offsetof(struct ocfs2_group_desc, bg_list))
+		return 0;
+	/*
+	 * Only valid to check l_next_free_rec if
+	 * bg_bitmap + bg_size == bg_list.
+	 */
+	if (!gd->bg_list.l_next_free_rec)
+		return 0;
+	return 1;
+}
+#endif  /* _OCFS2_FS_H */
+
diff --git a/kernel/fs/ocfs2/ocfs2_ioctl.h b/kernel/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000..5b27ff1fa
--- /dev/null
+++ b/kernel/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,242 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_ioctl.h
+ *
+ * Defines OCFS2 ioctls.
+ *
+ * Copyright (C) 2010 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_IOCTL_H
+#define OCFS2_IOCTL_H
+
+/*
+ * ioctl commands
+ */
+#define OCFS2_IOC_GETFLAGS	FS_IOC_GETFLAGS
+#define OCFS2_IOC_SETFLAGS	FS_IOC_SETFLAGS
+#define OCFS2_IOC32_GETFLAGS	FS_IOC32_GETFLAGS
+#define OCFS2_IOC32_SETFLAGS	FS_IOC32_SETFLAGS
+
+/*
+ * Space reservation / allocation / free ioctls and argument structure
+ * are designed to be compatible with XFS.
+ *
+ * ALLOCSP* and FREESP* are not and will never be supported, but are
+ * included here for completeness.
+ */
+struct ocfs2_space_resv {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start;
+	__s64		l_len;		/* len == 0 means until end of file */
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area			    */
+};
+
+#define OCFS2_IOC_ALLOCSP		_IOW ('X', 10, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP		_IOW ('X', 11, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP		_IOW ('X', 40, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP	_IOW ('X', 41, struct ocfs2_space_resv)
+#define OCFS2_IOC_ALLOCSP64	_IOW ('X', 36, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP64	_IOW ('X', 37, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
+
+/* Used to pass group descriptor data when online resize is done */
+struct ocfs2_new_group_input {
+	__u64 group;		/* Group descriptor's blkno. */
+	__u32 clusters;		/* Total number of clusters in this group */
+	__u32 frees;		/* Total free clusters in this group */
+	__u16 chain;		/* Chain for this group */
+	__u16 reserved1;
+	__u32 reserved2;
+};
+
+#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64	_IOW('o', 3,struct ocfs2_new_group_input)
+
+/* Used to pass 2 file names to reflink. */
+struct reflink_arguments {
+	__u64 old_path;
+	__u64 new_path;
+	__u64 preserve;
+};
+#define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
+
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST		(50)
+#define OCFS2_TEXT_UUID_LEN		(OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC		(0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+struct ocfs2_info {
+	__u64 oi_requests;	/* Array of __u64 pointers to requests */
+	__u32 oi_count;		/* Number of requests in info_requests */
+	__u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/	__u32 ir_magic;	/* Magic number */
+	__u32 ir_code;	/* Info request code */
+	__u32 ir_size;	/* Size of request */
+	__u32 ir_flags;	/* Request flags */
+/*10*/			/* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+	struct ocfs2_info_request ic_req;
+	__u32 ic_clustersize;
+	__u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+	struct ocfs2_info_request ib_req;
+	__u32 ib_blocksize;
+	__u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+	struct ocfs2_info_request im_req;
+	__u32 im_max_slots;
+	__u32 im_pad;
+};
+
+struct ocfs2_info_label {
+	struct ocfs2_info_request il_req;
+	__u8	il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+	struct ocfs2_info_request iu_req;
+	__u8	iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+	struct ocfs2_info_request if_req;
+	__u32 if_compat_features;
+	__u32 if_incompat_features;
+	__u32 if_ro_compat_features;
+	__u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+	struct ocfs2_info_request ij_req;
+	__u64 ij_journal_size;
+};
+
+struct ocfs2_info_freeinode {
+	struct ocfs2_info_request ifi_req;
+	struct ocfs2_info_local_freeinode {
+		__u64 lfi_total;
+		__u64 lfi_free;
+	} ifi_stat[OCFS2_MAX_SLOTS];
+	__u32 ifi_slotnum; /* out */
+	__u32 ifi_pad;
+};
+
+#define OCFS2_INFO_MAX_HIST     (32)
+
+struct ocfs2_info_freefrag {
+	struct ocfs2_info_request iff_req;
+	struct ocfs2_info_freefrag_stats { /* (out) */
+		struct ocfs2_info_free_chunk_list {
+			__u32 fc_chunks[OCFS2_INFO_MAX_HIST];
+			__u32 fc_clusters[OCFS2_INFO_MAX_HIST];
+		} ffs_fc_hist;
+		__u32 ffs_clusters;
+		__u32 ffs_free_clusters;
+		__u32 ffs_free_chunks;
+		__u32 ffs_free_chunks_real;
+		__u32 ffs_min; /* Minimum free chunksize in clusters */
+		__u32 ffs_max;
+		__u32 ffs_avg;
+		__u32 ffs_pad;
+	} iff_ffs;
+	__u32 iff_chunksize; /* chunksize in clusters(in) */
+	__u32 iff_pad;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+	OCFS2_INFO_CLUSTERSIZE = 1,
+	OCFS2_INFO_BLOCKSIZE,
+	OCFS2_INFO_MAXSLOTS,
+	OCFS2_INFO_LABEL,
+	OCFS2_INFO_UUID,
+	OCFS2_INFO_FS_FEATURES,
+	OCFS2_INFO_JOURNAL_SIZE,
+	OCFS2_INFO_FREEINODE,
+	OCFS2_INFO_FREEFRAG,
+	OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT	(0x00000001)	/* Cluster coherency not
+							   required. This is a hint.
+							   It is up to ocfs2 whether
+							   the request can be fulfilled
+							   without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED		(0x40000000)	/* Filesystem understood
+							   this request and
+							   filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR		(0x80000000)	/* Error happened during
+							   request handling. */
+
+#define OCFS2_IOC_INFO		_IOR('o', 5, struct ocfs2_info)
+
+struct ocfs2_move_extents {
+/* All values are in bytes */
+	/* in */
+	__u64 me_start;		/* Virtual start in the file to move */
+	__u64 me_len;		/* Length of the extents to be moved */
+	__u64 me_goal;		/* Physical offset of the goal,
+				   it's in block unit */
+	__u64 me_threshold;	/* Maximum distance from goal or threshold
+				   for auto defragmentation */
+	__u64 me_flags;		/* Flags for the operation:
+				 * - auto defragmentation.
+				 * - refcount,xattr cases.
+				 */
+	/* out */
+	__u64 me_moved_len;	/* Moved/defraged length */
+	__u64 me_new_offset;	/* Resulting physical location */
+	__u32 me_reserved[2];	/* Reserved for futhure */
+};
+
+#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG	(0x00000001)	/* Kernel manages to
+							   claim new clusters
+							   as the goal place
+							   for extents moving */
+#define OCFS2_MOVE_EXT_FL_PART_DEFRAG	(0x00000002)	/* Allow partial extent
+							   moving, is to make
+							   movement less likely
+							   to fail, may make fs
+							   even more fragmented */
+#define OCFS2_MOVE_EXT_FL_COMPLETE	(0x00000004)	/* Move or defragmenation
+							   completely gets done.
+							 */
+
+#define OCFS2_IOC_MOVE_EXT	_IOW('o', 6, struct ocfs2_move_extents)
+
+#endif /* OCFS2_IOCTL_H */
diff --git a/kernel/fs/ocfs2/ocfs2_lockid.h b/kernel/fs/ocfs2/ocfs2_lockid.h
new file mode 100644
index 000000000..d277aabf5
--- /dev/null
+++ b/kernel/fs/ocfs2/ocfs2_lockid.h
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_lockid.h
+ *
+ * Defines OCFS2 lockid bits.
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_LOCKID_H
+#define OCFS2_LOCKID_H
+
+/* lock ids are made up in the following manner:
+ * name[0]     --> type
+ * name[1-6]   --> 6 pad characters, reserved for now
+ * name[7-22]  --> block number, expressed in hex as 16 chars
+ * name[23-30] --> i_generation, expressed in hex 8 chars
+ * name[31]    --> '\0' */
+#define OCFS2_LOCK_ID_MAX_LEN  32
+#define OCFS2_LOCK_ID_PAD "000000"
+
+#define OCFS2_DENTRY_LOCK_INO_START 18
+
+enum ocfs2_lock_type {
+	OCFS2_LOCK_TYPE_META = 0,
+	OCFS2_LOCK_TYPE_DATA,
+	OCFS2_LOCK_TYPE_SUPER,
+	OCFS2_LOCK_TYPE_RENAME,
+	OCFS2_LOCK_TYPE_RW,
+	OCFS2_LOCK_TYPE_DENTRY,
+	OCFS2_LOCK_TYPE_OPEN,
+	OCFS2_LOCK_TYPE_FLOCK,
+	OCFS2_LOCK_TYPE_QINFO,
+	OCFS2_LOCK_TYPE_NFS_SYNC,
+	OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+	OCFS2_LOCK_TYPE_REFCOUNT,
+	OCFS2_NUM_LOCK_TYPES
+};
+
+static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
+{
+	char c;
+	switch (type) {
+		case OCFS2_LOCK_TYPE_META:
+			c = 'M';
+			break;
+		case OCFS2_LOCK_TYPE_DATA:
+			c = 'D';
+			break;
+		case OCFS2_LOCK_TYPE_SUPER:
+			c = 'S';
+			break;
+		case OCFS2_LOCK_TYPE_RENAME:
+			c = 'R';
+			break;
+		case OCFS2_LOCK_TYPE_RW:
+			c = 'W';
+			break;
+		case OCFS2_LOCK_TYPE_DENTRY:
+			c = 'N';
+			break;
+		case OCFS2_LOCK_TYPE_OPEN:
+			c = 'O';
+			break;
+		case OCFS2_LOCK_TYPE_FLOCK:
+			c = 'F';
+			break;
+		case OCFS2_LOCK_TYPE_QINFO:
+			c = 'Q';
+			break;
+		case OCFS2_LOCK_TYPE_NFS_SYNC:
+			c = 'Y';
+			break;
+		case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+			c = 'P';
+			break;
+		case OCFS2_LOCK_TYPE_REFCOUNT:
+			c = 'T';
+			break;
+		default:
+			c = '\0';
+	}
+
+	return c;
+}
+
+static char *ocfs2_lock_type_strings[] = {
+	[OCFS2_LOCK_TYPE_META] = "Meta",
+	[OCFS2_LOCK_TYPE_DATA] = "Data",
+	[OCFS2_LOCK_TYPE_SUPER] = "Super",
+	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
+	/* Need to differntiate from [R]ename.. serializing writes is the
+	 * important job it does, anyway. */
+	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
+	[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+	[OCFS2_LOCK_TYPE_OPEN] = "Open",
+	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+	[OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
+	[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
+	[OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
+};
+
+static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
+{
+#ifdef __KERNEL__
+	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
+#endif
+	return ocfs2_lock_type_strings[type];
+}
+
+#endif  /* OCFS2_LOCKID_H */
diff --git a/kernel/fs/ocfs2/ocfs2_lockingver.h b/kernel/fs/ocfs2/ocfs2_lockingver.h
new file mode 100644
index 000000000..2e45c8d2e
--- /dev/null
+++ b/kernel/fs/ocfs2/ocfs2_lockingver.h
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_lockingver.h
+ *
+ * Defines OCFS2 Locking version values.
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_LOCKINGVER_H
+#define OCFS2_LOCKINGVER_H
+
+/*
+ * The protocol version for ocfs2 cluster locking.  See dlmglue.c for
+ * more details.
+ *
+ * 1.0 - Initial locking version from ocfs2 1.4.
+ */
+#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
+#define OCFS2_LOCKING_PROTOCOL_MINOR 0
+
+#endif  /* OCFS2_LOCKINGVER_H */
diff --git a/kernel/fs/ocfs2/ocfs2_trace.h b/kernel/fs/ocfs2/ocfs2_trace.h
new file mode 100644
index 000000000..6cb019b7c
--- /dev/null
+++ b/kernel/fs/ocfs2/ocfs2_trace.h
@@ -0,0 +1,2768 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ocfs2
+
+#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OCFS2_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(ocfs2__int,
+	TP_PROTO(int num),
+	TP_ARGS(num),
+	TP_STRUCT__entry(
+		__field(int, num)
+	),
+	TP_fast_assign(
+		__entry->num = num;
+	),
+	TP_printk("%d", __entry->num)
+);
+
+#define DEFINE_OCFS2_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__int, name,	\
+	TP_PROTO(int num),	\
+	TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__uint,
+	TP_PROTO(unsigned int num),
+	TP_ARGS(num),
+	TP_STRUCT__entry(
+		__field(	unsigned int,	num		)
+	),
+	TP_fast_assign(
+		__entry->num	= 	num;
+	),
+	TP_printk("%u", __entry->num)
+);
+
+#define DEFINE_OCFS2_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint, name,	\
+	TP_PROTO(unsigned int num),	\
+	TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__ull,
+	TP_PROTO(unsigned long long blkno),
+	TP_ARGS(blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu", __entry->blkno)
+);
+
+#define DEFINE_OCFS2_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull, name,	\
+	TP_PROTO(unsigned long long num),	\
+	TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__pointer,
+	TP_PROTO(void *pointer),
+	TP_ARGS(pointer),
+	TP_STRUCT__entry(
+		__field(void *, pointer)
+	),
+	TP_fast_assign(
+		__entry->pointer = pointer;
+	),
+	TP_printk("%p", __entry->pointer)
+);
+
+#define DEFINE_OCFS2_POINTER_EVENT(name)	\
+DEFINE_EVENT(ocfs2__pointer, name,	\
+	TP_PROTO(void *pointer),	\
+	TP_ARGS(pointer))
+
+DECLARE_EVENT_CLASS(ocfs2__string,
+	TP_PROTO(const char *name),
+	TP_ARGS(name),
+	TP_STRUCT__entry(
+		__string(name,name)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+	),
+	TP_printk("%s", __get_str(name))
+);
+
+#define DEFINE_OCFS2_STRING_EVENT(name)	\
+DEFINE_EVENT(ocfs2__string, name,	\
+	TP_PROTO(const char *name),	\
+	TP_ARGS(name))
+
+DECLARE_EVENT_CLASS(ocfs2__int_int,
+	TP_PROTO(int value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(int, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%d %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_INT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__int_int, name,	\
+	TP_PROTO(int val1, int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_int,
+	TP_PROTO(unsigned int value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned int, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%u %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_int, name,	\
+	TP_PROTO(unsigned int val1, int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_uint,
+	TP_PROTO(unsigned int value1, unsigned int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%u %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_uint, name,	\
+	TP_PROTO(unsigned int val1, unsigned int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint,
+	TP_PROTO(unsigned long long value1, unsigned int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%llu %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint, name,	\
+	TP_PROTO(unsigned long long val1, unsigned int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int,
+	TP_PROTO(unsigned long long value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%llu %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_int, name,	\
+	TP_PROTO(unsigned long long val1, int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull,
+	TP_PROTO(unsigned long long value1, unsigned long long value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%llu %llu", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull, name,	\
+	TP_PROTO(unsigned long long val1, unsigned long long val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint,
+	TP_PROTO(unsigned long long value1,
+		 unsigned long long value2, unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+	),
+	TP_printk("%llu %llu %u",
+		  __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_uint, name,	\
+	TP_PROTO(unsigned long long val1,	\
+		 unsigned long long val2, unsigned int val3),	\
+	TP_ARGS(val1, val2, val3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
+	TP_PROTO(unsigned long long value1,
+		 unsigned int value2, unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned int, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3	= value3;
+	),
+	TP_printk("%llu %u %u", __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint_uint, name,	\
+	TP_PROTO(unsigned long long val1,	\
+		 unsigned int val2, unsigned int val3),	\
+	TP_ARGS(val1, val2, val3))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
+	TP_PROTO(unsigned int value1, unsigned int value2,
+		 unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(	unsigned int,	value1		)
+		__field(	unsigned int,	value2		)
+		__field(	unsigned int,	value3		)
+	),
+	TP_fast_assign(
+		__entry->value1	= 	value1;
+		__entry->value2	= 	value2;
+		__entry->value3	= 	value3;
+	),
+	TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_uint_uint, name,	\
+	TP_PROTO(unsigned int value1, unsigned int value2,	\
+		 unsigned int value3),	\
+	TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull,
+	TP_PROTO(unsigned long long value1,
+		 unsigned long long value2, unsigned long long value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned long long, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+	),
+	TP_printk("%llu %llu %llu",
+		  __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_ull, name,	\
+	TP_PROTO(unsigned long long value1, unsigned long long value2,	\
+		 unsigned long long value3),	\
+	TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
+	TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
+	TP_ARGS(ull, value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(	unsigned long long,	ull	)
+		__field(	int,	value1			)
+		__field(	int,	value2			)
+		__field(	int,	value3			)
+	),
+	TP_fast_assign(
+		__entry->ull		= ull;
+		__entry->value1		= value1;
+		__entry->value2		= value2;
+		__entry->value3		= value3;
+	),
+	TP_printk("%llu %d %d %d",
+		  __entry->ull, __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_int_int_int, name,	\
+	TP_PROTO(unsigned long long ull, int value1,	\
+		 int value2, int value3),	\
+	TP_ARGS(ull, value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint,
+	TP_PROTO(unsigned long long ull, unsigned int value1,
+		 unsigned int value2, unsigned int value3),
+	TP_ARGS(ull, value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ull)
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->ull = ull;
+		__entry->value1 = value1;
+		__entry->value2	= value2;
+		__entry->value3	= value3;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->ull, __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name,	\
+	TP_PROTO(unsigned long long ull, unsigned int value1,	\
+		 unsigned int value2, unsigned int value3),	\
+	TP_ARGS(ull, value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint,
+	TP_PROTO(unsigned long long value1, unsigned long long value2,
+		 unsigned int value3, unsigned int value4),
+	TP_ARGS(value1, value2, value3, value4),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned int, value3)
+		__field(unsigned int, value4)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+		__entry->value4 = value4;
+	),
+	TP_printk("%llu %llu %u %u",
+		  __entry->value1, __entry->value2,
+		  __entry->value3, __entry->value4)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name,	\
+	TP_PROTO(unsigned long long ull, unsigned long long ull1,	\
+		 unsigned int value2, unsigned int value3),	\
+	TP_ARGS(ull, ull1, value2, value3))
+
+/* Trace events for fs/ocfs2/alloc.c. */
+DECLARE_EVENT_CLASS(ocfs2__btree_ops,
+	TP_PROTO(unsigned long long owner,\
+		 unsigned int value1, unsigned int value2),
+	TP_ARGS(owner, value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->value1 = value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%llu %u %u",
+		  __entry->owner, __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_BTREE_EVENT(name)	\
+DEFINE_EVENT(ocfs2__btree_ops, name,	\
+	TP_PROTO(unsigned long long owner,	\
+		 unsigned int value1, unsigned int value2),	\
+	TP_ARGS(owner, value1, value2))
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert);
+
+TRACE_EVENT(ocfs2_grow_tree,
+	TP_PROTO(unsigned long long owner, int depth),
+	TP_ARGS(owner, depth),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(int, depth)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->depth = depth;
+	),
+	TP_printk("%llu %d", __entry->owner, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_rotate_subtree,
+	TP_PROTO(int subtree_root, unsigned long long blkno,
+		 int depth),
+	TP_ARGS(subtree_root, blkno, depth),
+	TP_STRUCT__entry(
+		__field(int, subtree_root)
+		__field(unsigned long long, blkno)
+		__field(int, depth)
+	),
+	TP_fast_assign(
+		__entry->subtree_root = subtree_root;
+		__entry->blkno = blkno;
+		__entry->depth = depth;
+	),
+	TP_printk("%d %llu %d", __entry->subtree_root,
+		  __entry->blkno, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_insert_extent,
+	TP_PROTO(unsigned int ins_appending, unsigned int ins_contig,
+		 int ins_contig_index, int free_records, int ins_tree_depth),
+	TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records,
+		ins_tree_depth),
+	TP_STRUCT__entry(
+		__field(unsigned int, ins_appending)
+		__field(unsigned int, ins_contig)
+		__field(int, ins_contig_index)
+		__field(int, free_records)
+		__field(int, ins_tree_depth)
+	),
+	TP_fast_assign(
+		__entry->ins_appending = ins_appending;
+		__entry->ins_contig = ins_contig;
+		__entry->ins_contig_index = ins_contig_index;
+		__entry->free_records = free_records;
+		__entry->ins_tree_depth = ins_tree_depth;
+	),
+	TP_printk("%u %u %d %d %d",
+		  __entry->ins_appending, __entry->ins_contig,
+		  __entry->ins_contig_index, __entry->free_records,
+		  __entry->ins_tree_depth)
+);
+
+TRACE_EVENT(ocfs2_split_extent,
+	TP_PROTO(int split_index, unsigned int c_contig_type,
+		 unsigned int c_has_empty_extent,
+		 unsigned int c_split_covers_rec),
+	TP_ARGS(split_index, c_contig_type,
+		c_has_empty_extent, c_split_covers_rec),
+	TP_STRUCT__entry(
+		__field(int, split_index)
+		__field(unsigned int, c_contig_type)
+		__field(unsigned int, c_has_empty_extent)
+		__field(unsigned int, c_split_covers_rec)
+	),
+	TP_fast_assign(
+		__entry->split_index = split_index;
+		__entry->c_contig_type = c_contig_type;
+		__entry->c_has_empty_extent = c_has_empty_extent;
+		__entry->c_split_covers_rec = c_split_covers_rec;
+	),
+	TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type,
+		  __entry->c_has_empty_extent, __entry->c_split_covers_rec)
+);
+
+TRACE_EVENT(ocfs2_remove_extent,
+	TP_PROTO(unsigned long long owner, unsigned int cpos,
+		 unsigned int len, int index,
+		 unsigned int e_cpos, unsigned int clusters),
+	TP_ARGS(owner, cpos, len, index, e_cpos, clusters),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(int, index)
+		__field(unsigned int, e_cpos)
+		__field(unsigned int, clusters)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->index = index;
+		__entry->e_cpos = e_cpos;
+		__entry->clusters = clusters;
+	),
+	TP_printk("%llu %u %u %d %u %u",
+		  __entry->owner, __entry->cpos, __entry->len, __entry->index,
+		  __entry->e_cpos, __entry->clusters)
+);
+
+TRACE_EVENT(ocfs2_commit_truncate,
+	TP_PROTO(unsigned long long ino, unsigned int new_cpos,
+		 unsigned int clusters, unsigned int depth),
+	TP_ARGS(ino, new_cpos, clusters, depth),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, new_cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, depth)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->new_cpos = new_cpos;
+		__entry->clusters = clusters;
+		__entry->depth = depth;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->ino, __entry->new_cpos,
+		  __entry->clusters, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_validate_extent_block,
+	TP_PROTO(unsigned long long blkno),
+	TP_ARGS(blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu ", __entry->blkno)
+);
+
+TRACE_EVENT(ocfs2_rotate_leaf,
+	TP_PROTO(unsigned int insert_cpos, int insert_index,
+		 int has_empty, int next_free,
+		 unsigned int l_count),
+	TP_ARGS(insert_cpos, insert_index, has_empty,
+		next_free, l_count),
+	TP_STRUCT__entry(
+		__field(unsigned int, insert_cpos)
+		__field(int, insert_index)
+		__field(int, has_empty)
+		__field(int, next_free)
+		__field(unsigned int, l_count)
+	),
+	TP_fast_assign(
+		__entry->insert_cpos = insert_cpos;
+		__entry->insert_index = insert_index;
+		__entry->has_empty = has_empty;
+		__entry->next_free = next_free;
+		__entry->l_count = l_count;
+	),
+	TP_printk("%u %d %d %d %u", __entry->insert_cpos,
+		  __entry->insert_index, __entry->has_empty,
+		  __entry->next_free, __entry->l_count)
+);
+
+TRACE_EVENT(ocfs2_add_clusters_in_btree_ret,
+	TP_PROTO(int status, int reason, int err),
+	TP_ARGS(status, reason, err),
+	TP_STRUCT__entry(
+		__field(int, status)
+		__field(int, reason)
+		__field(int, err)
+	),
+	TP_fast_assign(
+		__entry->status = status;
+		__entry->reason = reason;
+		__entry->err = err;
+	),
+	TP_printk("%d %d %d", __entry->status,
+		  __entry->reason, __entry->err)
+);
+
+TRACE_EVENT(ocfs2_mark_extent_written,
+	TP_PROTO(unsigned long long owner, unsigned int cpos,
+		 unsigned int len, unsigned int phys),
+	TP_ARGS(owner, cpos, len, phys),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(unsigned int, phys)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->phys = phys;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->owner, __entry->cpos,
+		  __entry->len, __entry->phys)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops,
+	TP_PROTO(unsigned long long blkno, int index,
+		 unsigned int start, unsigned int num),
+	TP_ARGS(blkno, index, start, num),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__field(int, index)
+		__field(unsigned int, start)
+		__field(unsigned int, num)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__entry->index = index;
+		__entry->start = start;
+		__entry->num = num;
+	),
+	TP_printk("%llu %d %u %u",
+		  __entry->blkno, __entry->index,
+		  __entry->start, __entry->num)
+);
+
+#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name)	\
+DEFINE_EVENT(ocfs2__truncate_log_ops, name,	\
+	TP_PROTO(unsigned long long blkno, int index,	\
+		 unsigned int start, unsigned int num),	\
+	TP_ARGS(blkno, index, start, num))
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append);
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs);
+
+TRACE_EVENT(ocfs2_cache_block_dealloc,
+	TP_PROTO(int type, int slot, unsigned long long suballoc,
+		 unsigned long long blkno, unsigned int bit),
+	TP_ARGS(type, slot, suballoc, blkno, bit),
+	TP_STRUCT__entry(
+		__field(int, type)
+		__field(int, slot)
+		__field(unsigned long long, suballoc)
+		__field(unsigned long long, blkno)
+		__field(unsigned int, bit)
+	),
+	TP_fast_assign(
+		__entry->type = type;
+		__entry->slot = slot;
+		__entry->suballoc = suballoc;
+		__entry->blkno = blkno;
+		__entry->bit = bit;
+	),
+	TP_printk("%d %d %llu %llu %u",
+		  __entry->type, __entry->slot, __entry->suballoc,
+		  __entry->blkno, __entry->bit)
+);
+
+TRACE_EVENT(ocfs2_trim_extent,
+	TP_PROTO(struct super_block *sb, unsigned long long blk,
+		 unsigned long long count),
+	TP_ARGS(sb, blk, count),
+	TP_STRUCT__entry(
+		__field(int, dev_major)
+		__field(int, dev_minor)
+		__field(unsigned long long, blk)
+		__field(__u64,	count)
+	),
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(sb->s_dev);
+		__entry->dev_minor = MINOR(sb->s_dev);
+		__entry->blk = blk;
+		__entry->count = count;
+	),
+	TP_printk("%d %d %llu %llu",
+		  __entry->dev_major, __entry->dev_minor,
+		  __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
+/* End of trace events for fs/ocfs2/alloc.c. */
+
+/* Trace events for fs/ocfs2/localalloc.c. */
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main);
+
+TRACE_EVENT(ocfs2_sync_local_to_main_free,
+	TP_PROTO(int count, int bit, unsigned long long start_blk,
+		 unsigned long long blkno),
+	TP_ARGS(count, bit, start_blk, blkno),
+	TP_STRUCT__entry(
+		__field(int, count)
+		__field(int, bit)
+		__field(unsigned long long, start_blk)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->count = count;
+		__entry->bit = bit;
+		__entry->start_blk = start_blk;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%d %d %llu %llu",
+		  __entry->count, __entry->bit, __entry->start_blk,
+		  __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
+
+/* End of trace events for fs/ocfs2/localalloc.c. */
+
+/* Trace events for fs/ocfs2/resize.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add);
+
+/* End of trace events for fs/ocfs2/resize.c. */
+
+/* Trace events for fs/ocfs2/suballoc.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits);
+
+TRACE_EVENT(ocfs2_relink_block_group,
+	TP_PROTO(unsigned long long i_blkno, unsigned int chain,
+		 unsigned long long bg_blkno,
+		 unsigned long long prev_blkno),
+	TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, i_blkno)
+		__field(unsigned int, chain)
+		__field(unsigned long long, bg_blkno)
+		__field(unsigned long long, prev_blkno)
+	),
+	TP_fast_assign(
+		__entry->i_blkno = i_blkno;
+		__entry->chain = chain;
+		__entry->bg_blkno = bg_blkno;
+		__entry->prev_blkno = prev_blkno;
+	),
+	TP_printk("%llu %u %llu %llu",
+		  __entry->i_blkno, __entry->chain, __entry->bg_blkno,
+		  __entry->prev_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits);
+
+TRACE_EVENT(ocfs2_free_suballoc_bits,
+	TP_PROTO(unsigned long long inode, unsigned long long group,
+		 unsigned int start_bit, unsigned int count),
+	TP_ARGS(inode, group, start_bit, count),
+	TP_STRUCT__entry(
+		__field(unsigned long long, inode)
+		__field(unsigned long long, group)
+		__field(unsigned int, start_bit)
+		__field(unsigned int, count)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->group = group;
+		__entry->start_bit = start_bit;
+		__entry->count = count;
+	),
+	TP_printk("%llu %llu %u %u", __entry->inode, __entry->group,
+		  __entry->start_bit, __entry->count)
+);
+
+TRACE_EVENT(ocfs2_free_clusters,
+	TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk,
+		 unsigned int start_bit, unsigned int count),
+	TP_ARGS(bg_blkno, start_blk, start_bit, count),
+	TP_STRUCT__entry(
+		__field(unsigned long long, bg_blkno)
+		__field(unsigned long long, start_blk)
+		__field(unsigned int, start_bit)
+		__field(unsigned int, count)
+	),
+	TP_fast_assign(
+		__entry->bg_blkno = bg_blkno;
+		__entry->start_blk = start_blk;
+		__entry->start_bit = start_bit;
+		__entry->count = count;
+	),
+	TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk,
+		  __entry->start_bit, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
+
+/* End of trace events for fs/ocfs2/suballoc.c. */
+
+/* Trace events for fs/ocfs2/refcounttree.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block);
+
+DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops,
+	TP_PROTO(unsigned long long blkno, int index,
+		 unsigned long long cpos,
+		 unsigned int clusters, unsigned int refcount),
+	TP_ARGS(blkno, index, cpos, clusters, refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__field(int, index)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, refcount)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__entry->index = index;
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->refcount = refcount;
+	),
+	TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index,
+		  __entry->cpos, __entry->clusters, __entry->refcount)
+);
+
+#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name)	\
+DEFINE_EVENT(ocfs2__refcount_tree_ops, name,		\
+	TP_PROTO(unsigned long long blkno, int index,	\
+		 unsigned long long cpos,		\
+		 unsigned int count, unsigned int refcount),	\
+	TP_ARGS(blkno, index, cpos, count, refcount))
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec);
+
+TRACE_EVENT(ocfs2_split_refcount_rec,
+	TP_PROTO(unsigned long long cpos,
+		 unsigned int clusters, unsigned int refcount,
+		 unsigned long long split_cpos,
+		 unsigned int split_clusters, unsigned int split_refcount),
+	TP_ARGS(cpos, clusters, refcount,
+		split_cpos, split_clusters, split_refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, refcount)
+		__field(unsigned long long, split_cpos)
+		__field(unsigned int, split_clusters)
+		__field(unsigned int, split_refcount)
+	),
+	TP_fast_assign(
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->refcount = refcount;
+		__entry->split_cpos = split_cpos;
+		__entry->split_clusters = split_clusters;
+		__entry->split_refcount	= split_refcount;
+	),
+	TP_printk("%llu %u %u %llu %u %u",
+		  __entry->cpos, __entry->clusters, __entry->refcount,
+		  __entry->split_cpos, __entry->split_clusters,
+		  __entry->split_refcount)
+);
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec);
+
+TRACE_EVENT(ocfs2_decrease_refcount,
+	TP_PROTO(unsigned long long owner,
+		 unsigned long long cpos,
+		 unsigned int len, int delete),
+	TP_ARGS(owner, cpos, len, delete),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, len)
+		__field(int, delete)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->delete = delete;
+	),
+	TP_printk("%llu %llu %u %d",
+		  __entry->owner, __entry->cpos, __entry->len, __entry->delete)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits);
+
+TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate,
+	TP_PROTO(int recs_add, unsigned long long cpos,
+		 unsigned int clusters, unsigned long long r_cpos,
+		 unsigned int r_clusters, unsigned int refcount, int index),
+	TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index),
+	TP_STRUCT__entry(
+		__field(int, recs_add)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned long long, r_cpos)
+		__field(unsigned int, r_clusters)
+		__field(unsigned int, refcount)
+		__field(int, index)
+	),
+	TP_fast_assign(
+		__entry->recs_add = recs_add;
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->r_cpos = r_cpos;
+		__entry->r_clusters = r_clusters;
+		__entry->refcount = refcount;
+		__entry->index = index;
+	),
+	TP_printk("%d %llu %u %llu %u %u %d",
+		  __entry->recs_add, __entry->cpos, __entry->clusters,
+		  __entry->r_cpos, __entry->r_clusters,
+		  __entry->refcount, __entry->index)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd);
+
+TRACE_EVENT(ocfs2_clear_ext_refcount,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int len, unsigned int p_cluster,
+		 unsigned int ext_flags),
+	TP_ARGS(ino, cpos, len, p_cluster, ext_flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(unsigned int, p_cluster)
+		__field(unsigned int, ext_flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->p_cluster = p_cluster;
+		__entry->ext_flags = ext_flags;
+	),
+	TP_printk("%llu %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->len,
+		  __entry->p_cluster, __entry->ext_flags)
+);
+
+TRACE_EVENT(ocfs2_replace_clusters,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int old, unsigned int new, unsigned int len,
+		 unsigned int ext_flags),
+	TP_ARGS(ino, cpos, old, new, len, ext_flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, old)
+		__field(unsigned int, new)
+		__field(unsigned int, len)
+		__field(unsigned int, ext_flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->old = old;
+		__entry->new = new;
+		__entry->len = len;
+		__entry->ext_flags = ext_flags;
+	),
+	TP_printk("%llu %u %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->old, __entry->new,
+		  __entry->len, __entry->ext_flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable);
+
+TRACE_EVENT(ocfs2_refcount_cow_hunk,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int write_len, unsigned int max_cpos,
+		 unsigned int cow_start, unsigned int cow_len),
+	TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, write_len)
+		__field(unsigned int, max_cpos)
+		__field(unsigned int, cow_start)
+		__field(unsigned int, cow_len)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->write_len = write_len;
+		__entry->max_cpos = max_cpos;
+		__entry->cow_start = cow_start;
+		__entry->cow_len = cow_len;
+	),
+	TP_printk("%llu %u %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->write_len,
+		  __entry->max_cpos, __entry->cow_start, __entry->cow_len)
+);
+
+/* End of trace events for fs/ocfs2/refcounttree.c. */
+
+/* Trace events for fs/ocfs2/aops.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__get_block,
+	TP_PROTO(unsigned long long ino, unsigned long long iblock,
+		 void *bh_result, int create),
+	TP_ARGS(ino, iblock, bh_result, create),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, iblock)
+		__field(void *, bh_result)
+		__field(int, create)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->iblock = iblock;
+		__entry->bh_result = bh_result;
+		__entry->create = create;
+	),
+	TP_printk("%llu %llu %p %d",
+		  __entry->ino, __entry->iblock,
+		  __entry->bh_result, __entry->create)
+);
+
+#define DEFINE_OCFS2_GET_BLOCK_EVENT(name)	\
+DEFINE_EVENT(ocfs2__get_block, name,	\
+	TP_PROTO(unsigned long long ino, unsigned long long iblock,	\
+		 void *bh_result, int create),	\
+	TP_ARGS(ino, iblock, bh_result, create))
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block);
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
+
+TRACE_EVENT(ocfs2_try_to_write_inline_data,
+	TP_PROTO(unsigned long long ino, unsigned int len,
+		 unsigned long long pos, unsigned int flags),
+	TP_ARGS(ino, len, pos, flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, len)
+		__field(unsigned long long, pos)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->len = len;
+		__entry->pos = pos;
+		__entry->flags = flags;
+	),
+	TP_printk("%llu %u %llu 0x%x",
+		  __entry->ino, __entry->len, __entry->pos, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_write_begin_nolock,
+	TP_PROTO(unsigned long long ino,
+		 long long i_size, unsigned int i_clusters,
+		 unsigned long long pos, unsigned int len,
+		 unsigned int flags, void *page,
+		 unsigned int clusters, unsigned int extents_to_split),
+	TP_ARGS(ino, i_size, i_clusters, pos, len, flags,
+		page, clusters, extents_to_split),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(long long, i_size)
+		__field(unsigned int, i_clusters)
+		__field(unsigned long long, pos)
+		__field(unsigned int, len)
+		__field(unsigned int, flags)
+		__field(void *, page)
+		__field(unsigned int, clusters)
+		__field(unsigned int, extents_to_split)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->i_size = i_size;
+		__entry->i_clusters = i_clusters;
+		__entry->pos = pos;
+		__entry->len = len;
+		__entry->flags = flags;
+		__entry->page = page;
+		__entry->clusters = clusters;
+		__entry->extents_to_split = extents_to_split;
+	),
+	TP_printk("%llu %lld %u %llu %u %u %p %u %u",
+		  __entry->ino, __entry->i_size, __entry->i_clusters,
+		  __entry->pos, __entry->len,
+		  __entry->flags, __entry->page, __entry->clusters,
+		  __entry->extents_to_split)
+);
+
+TRACE_EVENT(ocfs2_write_end_inline,
+	TP_PROTO(unsigned long long ino,
+		 unsigned long long pos, unsigned int copied,
+		 unsigned int id_count, unsigned int features),
+	TP_ARGS(ino, pos, copied, id_count, features),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, pos)
+		__field(unsigned int, copied)
+		__field(unsigned int, id_count)
+		__field(unsigned int, features)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->pos = pos;
+		__entry->copied = copied;
+		__entry->id_count = id_count;
+		__entry->features = features;
+	),
+	TP_printk("%llu %llu %u %u %u",
+		  __entry->ino, __entry->pos, __entry->copied,
+		  __entry->id_count, __entry->features)
+);
+
+/* End of trace events for fs/ocfs2/aops.c. */
+
+/* Trace events for fs/ocfs2/mmap.c. */
+
+TRACE_EVENT(ocfs2_fault,
+	TP_PROTO(unsigned long long ino,
+		 void *area, void *page, unsigned long pgoff),
+	TP_ARGS(ino, area, page, pgoff),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(void *, area)
+		__field(void *, page)
+		__field(unsigned long, pgoff)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->area = area;
+		__entry->page = page;
+		__entry->pgoff = pgoff;
+	),
+	TP_printk("%llu %p %p %lu",
+		  __entry->ino, __entry->area, __entry->page, __entry->pgoff)
+);
+
+/* End of trace events for fs/ocfs2/mmap.c. */
+
+/* Trace events for fs/ocfs2/file.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__file_ops,
+	TP_PROTO(void *inode, void *file, void *dentry,
+		 unsigned long long ino,
+		 unsigned int d_len, const unsigned char *d_name,
+		 unsigned long long para),
+	TP_ARGS(inode, file, dentry, ino, d_len, d_name, para),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(void *, file)
+		__field(void *, dentry)
+		__field(unsigned long long, ino)
+		__field(unsigned int, d_len)
+		__string(d_name, d_name)
+		__field(unsigned long long, para)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->file = file;
+		__entry->dentry = dentry;
+		__entry->ino = ino;
+		__entry->d_len = d_len;
+		__assign_str(d_name, d_name);
+		__entry->para = para;
+	),
+	TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
+		  __entry->dentry, __entry->ino, __entry->para,
+		  __entry->d_len, __get_str(d_name))
+);
+
+#define DEFINE_OCFS2_FILE_OPS(name)				\
+DEFINE_EVENT(ocfs2__file_ops, name,				\
+TP_PROTO(void *inode, void *file, void *dentry,			\
+	 unsigned long long ino,				\
+	 unsigned int d_len, const unsigned char *d_name,	\
+	 unsigned long long mode),				\
+	TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode))
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_open);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
+
+TRACE_EVENT(ocfs2_extend_allocation,
+	TP_PROTO(unsigned long long ip_blkno, unsigned long long size,
+		 unsigned int clusters, unsigned int clusters_to_add,
+		 int why, int restart_func),
+	TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ip_blkno)
+		__field(unsigned long long, size)
+		__field(unsigned int, clusters)
+		__field(unsigned int, clusters_to_add)
+		__field(int, why)
+		__field(int, restart_func)
+	),
+	TP_fast_assign(
+		__entry->ip_blkno = ip_blkno;
+		__entry->size = size;
+		__entry->clusters = clusters;
+		__entry->clusters_to_add = clusters_to_add;
+		__entry->why = why;
+		__entry->restart_func = restart_func;
+	),
+	TP_printk("%llu %llu %u %u %d %d",
+		  __entry->ip_blkno, __entry->size, __entry->clusters,
+		  __entry->clusters_to_add, __entry->why, __entry->restart_func)
+);
+
+TRACE_EVENT(ocfs2_extend_allocation_end,
+	TP_PROTO(unsigned long long ino,
+		 unsigned int di_clusters, unsigned long long di_size,
+		 unsigned int ip_clusters, unsigned long long i_size),
+	TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, di_clusters)
+		__field(unsigned long long, di_size)
+		__field(unsigned int, ip_clusters)
+		__field(unsigned long long, i_size)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->di_clusters = di_clusters;
+		__entry->di_size = di_size;
+		__entry->ip_clusters = ip_clusters;
+		__entry->i_size = i_size;
+	),
+	TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters,
+		  __entry->di_size, __entry->ip_clusters, __entry->i_size)
+);
+
+TRACE_EVENT(ocfs2_write_zero_page,
+	TP_PROTO(unsigned long long ino,
+		 unsigned long long abs_from, unsigned long long abs_to,
+		 unsigned long index, unsigned int zero_from,
+		 unsigned int zero_to),
+	TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, abs_from)
+		__field(unsigned long long, abs_to)
+		__field(unsigned long, index)
+		__field(unsigned int, zero_from)
+		__field(unsigned int, zero_to)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->abs_from = abs_from;
+		__entry->abs_to = abs_to;
+		__entry->index = index;
+		__entry->zero_from = zero_from;
+		__entry->zero_to = zero_to;
+	),
+	TP_printk("%llu %llu %llu %lu %u %u", __entry->ino,
+		  __entry->abs_from, __entry->abs_to,
+		  __entry->index, __entry->zero_from, __entry->zero_to)
+);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend);
+
+TRACE_EVENT(ocfs2_setattr,
+	TP_PROTO(void *inode, void *dentry,
+		 unsigned long long ino,
+		 unsigned int d_len, const unsigned char *d_name,
+		 unsigned int ia_valid, unsigned int ia_mode,
+		 unsigned int ia_uid, unsigned int ia_gid),
+	TP_ARGS(inode, dentry, ino, d_len, d_name,
+		ia_valid, ia_mode, ia_uid, ia_gid),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(void *, dentry)
+		__field(unsigned long long, ino)
+		__field(unsigned int, d_len)
+		__string(d_name, d_name)
+		__field(unsigned int, ia_valid)
+		__field(unsigned int, ia_mode)
+		__field(unsigned int, ia_uid)
+		__field(unsigned int, ia_gid)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->dentry = dentry;
+		__entry->ino = ino;
+		__entry->d_len = d_len;
+		__assign_str(d_name, d_name);
+		__entry->ia_valid = ia_valid;
+		__entry->ia_mode = ia_mode;
+		__entry->ia_uid = ia_uid;
+		__entry->ia_gid = ia_gid;
+	),
+	TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode,
+		  __entry->dentry, __entry->ino, __entry->d_len,
+		  __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
+		  __entry->ia_uid, __entry->ia_gid)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
+
+TRACE_EVENT(ocfs2_prepare_inode_for_write,
+	TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
+		 int appending, unsigned long count,
+		 int *direct_io, int *has_refcount),
+	TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, saved_pos)
+		__field(int, appending)
+		__field(unsigned long, count)
+		__field(int, direct_io)
+		__field(int, has_refcount)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->saved_pos = saved_pos;
+		__entry->appending = appending;
+		__entry->count = count;
+		__entry->direct_io = direct_io ? *direct_io : -1;
+		__entry->has_refcount = has_refcount ? *has_refcount : -1;
+	),
+	TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
+		  __entry->saved_pos, __entry->appending, __entry->count,
+		  __entry->direct_io, __entry->has_refcount)
+);
+
+DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+
+/* End of trace events for fs/ocfs2/file.c. */
+
+/* Trace events for fs/ocfs2/inode.c. */
+
+TRACE_EVENT(ocfs2_iget_begin,
+	TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type),
+	TP_ARGS(ino, flags, sysfile_type),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+		__field(int, sysfile_type)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->flags = flags;
+		__entry->sysfile_type = sysfile_type;
+	),
+	TP_printk("%llu %u %d", __entry->ino,
+		  __entry->flags, __entry->sysfile_type)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked);
+
+TRACE_EVENT(ocfs2_iget_end,
+	TP_PROTO(void *inode, unsigned long long ino),
+	TP_ARGS(inode, ino),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+	),
+	TP_printk("%p %llu", __entry->inode, __entry->ino)
+);
+
+TRACE_EVENT(ocfs2_find_actor,
+	TP_PROTO(void *inode, unsigned long long ino,
+		 void *args,  unsigned long long fi_blkno),
+	TP_ARGS(inode, ino, args, fi_blkno),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+		__field(void *, args)
+		__field(unsigned long long, fi_blkno)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+		__entry->args = args;
+		__entry->fi_blkno = fi_blkno;
+	),
+	TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino,
+		  __entry->args, __entry->fi_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+
+TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
+	TP_PROTO(void *task, void *dc_task, unsigned long long ino,
+		 unsigned int flags),
+	TP_ARGS(task, dc_task, ino, flags),
+	TP_STRUCT__entry(
+		__field(void *, task)
+		__field(void *, dc_task)
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->task = task;
+		__entry->dc_task = dc_task;
+		__entry->ino = ino;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task,
+		  __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
+
+TRACE_EVENT(ocfs2_inode_revalidate,
+	TP_PROTO(void *inode, unsigned long long ino,
+		 unsigned int flags),
+	TP_ARGS(inode, ino, flags),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
+
+/* End of trace events for fs/ocfs2/inode.c. */
+
+/* Trace events for fs/ocfs2/extent_map.c. */
+
+TRACE_EVENT(ocfs2_read_virt_blocks,
+	TP_PROTO(void *inode, unsigned long long vblock, int nr,
+		 void *bhs, unsigned int flags, void *validate),
+	TP_ARGS(inode, vblock, nr, bhs, flags, validate),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, vblock)
+		__field(int, nr)
+		__field(void *, bhs)
+		__field(unsigned int, flags)
+		__field(void *, validate)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->vblock = vblock;
+		__entry->nr = nr;
+		__entry->bhs = bhs;
+		__entry->flags = flags;
+		__entry->validate = validate;
+	),
+	TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock,
+		  __entry->nr, __entry->bhs, __entry->flags, __entry->validate)
+);
+
+/* End of trace events for fs/ocfs2/extent_map.c. */
+
+/* Trace events for fs/ocfs2/slot_map.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);
+
+/* End of trace events for fs/ocfs2/slot_map.c. */
+
+/* Trace events for fs/ocfs2/heartbeat.c. */
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);
+
+/* End of trace events for fs/ocfs2/heartbeat.c. */
+
+/* Trace events for fs/ocfs2/super.c. */
+
+TRACE_EVENT(ocfs2_remount,
+	TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags),
+	TP_ARGS(s_flags, osb_flags, flags),
+	TP_STRUCT__entry(
+		__field(unsigned long, s_flags)
+		__field(unsigned long, osb_flags)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->s_flags = s_flags;
+		__entry->osb_flags = osb_flags;
+		__entry->flags = flags;
+	),
+	TP_printk("%lu %lu %d", __entry->s_flags,
+		  __entry->osb_flags, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_fill_super,
+	TP_PROTO(void *sb, void *data, int silent),
+	TP_ARGS(sb, data, silent),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, data)
+		__field(int, silent)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->data = data;
+		__entry->silent = silent;
+	),
+	TP_printk("%p %p %d", __entry->sb,
+		  __entry->data, __entry->silent)
+);
+
+TRACE_EVENT(ocfs2_parse_options,
+	TP_PROTO(int is_remount, char *options),
+	TP_ARGS(is_remount, options),
+	TP_STRUCT__entry(
+		__field(int, is_remount)
+		__string(options, options)
+	),
+	TP_fast_assign(
+		__entry->is_remount = is_remount;
+		__assign_str(options, options);
+	),
+	TP_printk("%d %s", __entry->is_remount, __get_str(options))
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
+
+TRACE_EVENT(ocfs2_statfs,
+	TP_PROTO(void *sb, void *buf),
+	TP_ARGS(sb, buf),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, buf)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->buf = buf;
+	),
+	TP_printk("%p %p", __entry->sb, __entry->buf)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume);
+
+TRACE_EVENT(ocfs2_initialize_super,
+	TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir,
+		 unsigned long long system_dir, int cluster_bits),
+	TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits),
+	TP_STRUCT__entry(
+		__string(label, label)
+		__string(uuid_str, uuid_str)
+		__field(unsigned long long, root_dir)
+		__field(unsigned long long, system_dir)
+		__field(int, cluster_bits)
+	),
+	TP_fast_assign(
+		__assign_str(label, label);
+		__assign_str(uuid_str, uuid_str);
+		__entry->root_dir = root_dir;
+		__entry->system_dir = system_dir;
+		__entry->cluster_bits = cluster_bits;
+	),
+	TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str),
+		  __entry->root_dir, __entry->system_dir, __entry->cluster_bits)
+);
+
+/* End of trace events for fs/ocfs2/super.c. */
+
+/* Trace events for fs/ocfs2/xattr.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation);
+
+TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
+	TP_PROTO(const char *name, int meta, int clusters, int credits),
+	TP_ARGS(name, meta, clusters, credits),
+	TP_STRUCT__entry(
+		__string(name, name)
+		__field(int, meta)
+		__field(int, clusters)
+		__field(int, credits)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->meta = meta;
+		__entry->clusters = clusters;
+		__entry->credits = credits;
+	),
+	TP_printk("%s %d %d %d", __get_str(name), __entry->meta,
+		  __entry->clusters, __entry->credits)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__xattr_find,
+	TP_PROTO(unsigned long long ino, const char *name, int name_index,
+		 unsigned int hash, unsigned long long location,
+		 int xe_index),
+	TP_ARGS(ino, name, name_index, hash, location, xe_index),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__string(name, name)
+		__field(int, name_index)
+		__field(unsigned int, hash)
+		__field(unsigned long long, location)
+		__field(int, xe_index)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__assign_str(name, name);
+		__entry->name_index = name_index;
+		__entry->hash = hash;
+		__entry->location = location;
+		__entry->xe_index = xe_index;
+	),
+	TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name),
+		  __entry->name_index, __entry->hash, __entry->location,
+		  __entry->xe_index)
+);
+
+#define DEFINE_OCFS2_XATTR_FIND_EVENT(name)					\
+DEFINE_EVENT(ocfs2__xattr_find, name,					\
+TP_PROTO(unsigned long long ino, const char *name, int name_index,	\
+	 unsigned int hash, unsigned long long bucket,			\
+	 int xe_index),							\
+	TP_ARGS(ino, name, name_index, hash, bucket, xe_index))
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
+
+/* End of trace events for fs/ocfs2/xattr.c. */
+
+/* Trace events for fs/ocfs2/reservations.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end);
+
+TRACE_EVENT(ocfs2_resv_find_window_begin,
+	TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal,
+		 unsigned int wanted, int empty_root),
+	TP_ARGS(r_start, r_end, goal, wanted, empty_root),
+	TP_STRUCT__entry(
+		__field(unsigned int, r_start)
+		__field(unsigned int, r_end)
+		__field(unsigned int, goal)
+		__field(unsigned int, wanted)
+		__field(int, empty_root)
+	),
+	TP_fast_assign(
+		__entry->r_start = r_start;
+		__entry->r_end = r_end;
+		__entry->goal = goal;
+		__entry->wanted = wanted;
+		__entry->empty_root = empty_root;
+	),
+	TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end,
+		  __entry->goal, __entry->wanted, __entry->empty_root)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin);
+
+TRACE_EVENT(ocfs2_cannibalize_resv_end,
+	TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(start, end, len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, start)
+		__field(unsigned int, end)
+		__field(unsigned int, len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->end = end;
+		__entry->len = len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+		  __entry->len, __entry->last_start, __entry->last_len)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_begin,
+	TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen,
+		 unsigned int r_start, unsigned int r_end, unsigned int r_len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(cstart, cend, clen, r_start, r_end,
+		r_len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, cstart)
+		__field(unsigned int, cend)
+		__field(unsigned int, clen)
+		__field(unsigned int, r_start)
+		__field(unsigned int, r_end)
+		__field(unsigned int, r_len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->cstart = cstart;
+		__entry->cend = cend;
+		__entry->clen = clen;
+		__entry->r_start = r_start;
+		__entry->r_end = r_end;
+		__entry->r_len = r_len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u %u %u %u",
+		  __entry->cstart, __entry->cend, __entry->clen,
+		  __entry->r_start, __entry->r_end, __entry->r_len,
+		  __entry->last_start, __entry->last_len)
+);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
+	TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(start, end, len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, start)
+		__field(unsigned int, end)
+		__field(unsigned int, len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->end = end;
+		__entry->len = len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+		  __entry->len, __entry->last_start, __entry->last_len)
+);
+
+/* End of trace events for fs/ocfs2/reservations.c. */
+
+/* Trace events for fs/ocfs2/quota_local.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
+
+/* End of trace events for fs/ocfs2/quota_local.c. */
+
+/* Trace events for fs/ocfs2/quota_global.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block);
+
+TRACE_EVENT(ocfs2_sync_dquot,
+	TP_PROTO(unsigned int dq_id, long long dqb_curspace,
+		 long long spacechange, long long curinodes,
+		 long long inodechange),
+	TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange),
+	TP_STRUCT__entry(
+		__field(unsigned int, dq_id)
+		__field(long long, dqb_curspace)
+		__field(long long, spacechange)
+		__field(long long, curinodes)
+		__field(long long, inodechange)
+	),
+	TP_fast_assign(
+		__entry->dq_id = dq_id;
+		__entry->dqb_curspace = dqb_curspace;
+		__entry->spacechange = spacechange;
+		__entry->curinodes = curinodes;
+		__entry->inodechange = inodechange;
+	),
+	TP_printk("%u %lld %lld %lld %lld", __entry->dq_id,
+		  __entry->dqb_curspace, __entry->spacechange,
+		  __entry->curinodes, __entry->inodechange)
+);
+
+TRACE_EVENT(ocfs2_sync_dquot_helper,
+	TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type,
+		 const char *s_id),
+	TP_ARGS(dq_id, dq_type, type, s_id),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, dq_id)
+		__field(unsigned int, dq_type)
+		__field(unsigned long, type)
+		__string(s_id, s_id)
+	),
+	TP_fast_assign(
+		__entry->dq_id = dq_id;
+		__entry->dq_type = dq_type;
+		__entry->type = type;
+		__assign_str(s_id, s_id);
+	),
+	TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
+		  __entry->type, __get_str(s_id))
+);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
+
+/* End of trace events for fs/ocfs2/quota_global.c. */
+
+/* Trace events for fs/ocfs2/dir.c. */
+DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el);
+
+TRACE_EVENT(ocfs2_dx_dir_search,
+	TP_PROTO(unsigned long long ino, int namelen, const char *name,
+		 unsigned int major_hash, unsigned int minor_hash,
+		 unsigned long long blkno),
+	TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(int, namelen)
+		__string(name, name)
+		__field(unsigned int, major_hash)
+		__field(unsigned int,minor_hash)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->major_hash = major_hash;
+		__entry->minor_hash = minor_hash;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu %.*s %u %u %llu", __entry->ino,
+		   __entry->namelen, __get_str(name),
+		  __entry->major_hash, __entry->minor_hash, __entry->blkno)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir);
+
+TRACE_EVENT(ocfs2_find_files_on_disk,
+	TP_PROTO(int namelen, const char *name, void *blkno,
+		 unsigned long long dir),
+	TP_ARGS(namelen, name, blkno, dir),
+	TP_STRUCT__entry(
+		__field(int, namelen)
+		__string(name, name)
+		__field(void *, blkno)
+		__field(unsigned long long, dir)
+	),
+	TP_fast_assign(
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->blkno = blkno;
+		__entry->dir = dir;
+	),
+	TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name),
+		  __entry->blkno, __entry->dir)
+);
+
+TRACE_EVENT(ocfs2_check_dir_for_entry,
+	TP_PROTO(unsigned long long dir, int namelen, const char *name),
+	TP_ARGS(dir, namelen, name),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__field(int, namelen)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+	),
+	TP_printk("%llu %.*s", __entry->dir,
+		  __entry->namelen, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster);
+
+TRACE_EVENT(ocfs2_dx_dir_index_root_block,
+	TP_PROTO(unsigned long long dir,
+		 unsigned int major_hash, unsigned int minor_hash,
+		 int namelen, const char *name, unsigned int num_used),
+	TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__field(unsigned int, major_hash)
+		__field(unsigned int, minor_hash)
+		__field(int, namelen)
+		__string(name, name)
+		__field(unsigned int, num_used)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->major_hash = major_hash;
+		__entry->minor_hash = minor_hash;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->num_used = num_used;
+	),
+	TP_printk("%llu %x %x %.*s %u", __entry->dir,
+		  __entry->major_hash, __entry->minor_hash,
+		   __entry->namelen, __get_str(name), __entry->num_used)
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
+
+/* End of trace events for fs/ocfs2/dir.c. */
+
+/* Trace events for fs/ocfs2/namei.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
+	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+		 unsigned long long dir_blkno, unsigned long long extra),
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(unsigned long long, dir_blkno)
+		__field(unsigned long long, extra)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->dir_blkno = dir_blkno;
+		__entry->extra = extra;
+	),
+	TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry,
+		  __entry->name_len, __get_str(name),
+		  __entry->dir_blkno, __entry->extra)
+);
+
+#define DEFINE_OCFS2_DENTRY_OPS(name)					\
+DEFINE_EVENT(ocfs2__dentry_ops, name,					\
+TP_PROTO(void *dir, void *dentry, int name_len, const char *name,	\
+	 unsigned long long dir_blkno, unsigned long long extra),	\
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra))
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret);
+
+TRACE_EVENT(ocfs2_mknod,
+	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+		 unsigned long long dir_blkno, unsigned long dev, int mode),
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(unsigned long long, dir_blkno)
+		__field(unsigned long, dev)
+		__field(int, mode)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->dir_blkno = dir_blkno;
+		__entry->dev = dev;
+		__entry->mode = mode;
+	),
+	TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry,
+		  __entry->name_len, __get_str(name),
+		  __entry->dir_blkno, __entry->dev, __entry->mode)
+);
+
+TRACE_EVENT(ocfs2_link,
+	TP_PROTO(unsigned long long ino, int old_len, const char *old_name,
+		 int name_len, const char *name),
+	TP_ARGS(ino, old_len, old_name, name_len, name),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(int, old_len)
+		__string(old_name, old_name)
+		__field(int, name_len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->old_len = old_len;
+		__assign_str(old_name, old_name);
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+	),
+	TP_printk("%llu %.*s %.*s", __entry->ino,
+		  __entry->old_len, __get_str(old_name),
+		  __entry->name_len, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end);
+
+TRACE_EVENT(ocfs2_rename,
+	TP_PROTO(void *old_dir, void *old_dentry,
+		 void *new_dir, void *new_dentry,
+		 int old_len, const char *old_name,
+		 int new_len, const char *new_name),
+	TP_ARGS(old_dir, old_dentry, new_dir, new_dentry,
+		old_len, old_name, new_len, new_name),
+	TP_STRUCT__entry(
+		__field(void *, old_dir)
+		__field(void *, old_dentry)
+		__field(void *, new_dir)
+		__field(void *, new_dentry)
+		__field(int, old_len)
+		__string(old_name, old_name)
+		__field(int, new_len)
+		__string(new_name, new_name)
+	),
+	TP_fast_assign(
+		__entry->old_dir = old_dir;
+		__entry->old_dentry = old_dentry;
+		__entry->new_dir = new_dir;
+		__entry->new_dentry = new_dentry;
+		__entry->old_len = old_len;
+		__assign_str(old_name, old_name);
+		__entry->new_len = new_len;
+		__assign_str(new_name, new_name);
+	),
+	TP_printk("%p %p %p %p %.*s %.*s",
+		  __entry->old_dir, __entry->old_dentry,
+		  __entry->new_dir, __entry->new_dentry,
+		  __entry->old_len, __get_str(old_name),
+		  __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
+
+TRACE_EVENT(ocfs2_rename_target_exists,
+	TP_PROTO(int new_len, const char *new_name),
+	TP_ARGS(new_len, new_name),
+	TP_STRUCT__entry(
+		__field(int, new_len)
+		__string(new_name, new_name)
+	),
+	TP_fast_assign(
+		__entry->new_len = new_len;
+		__assign_str(new_name, new_name);
+	),
+	TP_printk("%.*s", __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree);
+
+TRACE_EVENT(ocfs2_rename_over_existing,
+	TP_PROTO(unsigned long long new_blkno, void *new_bh,
+		 unsigned long long newdi_blkno),
+	TP_ARGS(new_blkno, new_bh, newdi_blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, new_blkno)
+		__field(void *, new_bh)
+		__field(unsigned long long, newdi_blkno)
+	),
+	TP_fast_assign(
+		__entry->new_blkno = new_blkno;
+		__entry->new_bh = new_bh;
+		__entry->newdi_blkno = newdi_blkno;
+	),
+	TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh,
+		  __entry->newdi_blkno)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data);
+
+TRACE_EVENT(ocfs2_symlink_begin,
+	TP_PROTO(void *dir, void *dentry, const char *symname,
+		 int len, const char *name),
+	TP_ARGS(dir, dentry, symname, len, name),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(const char *, symname)
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->symname = symname;
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
+		  __entry->symname, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_blkno_stringify,
+	TP_PROTO(unsigned long long blkno, const char *name, int namelen),
+	TP_ARGS(blkno, name, namelen),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__string(name, name)
+		__field(int, namelen)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__assign_str(name, name);
+		__entry->namelen = namelen;
+	),
+	TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
+		  __entry->namelen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end);
+
+TRACE_EVENT(ocfs2_orphan_del,
+	TP_PROTO(unsigned long long dir, const char *name, int namelen),
+	TP_ARGS(dir, name, namelen),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__string(name, name)
+		__field(int, namelen)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__assign_str(name, name);
+		__entry->namelen = namelen;
+	),
+	TP_printk("%llu %s %d", __entry->dir, __get_str(name),
+		  __entry->namelen)
+);
+
+/* End of trace events for fs/ocfs2/namei.c. */
+
+/* Trace events for fs/ocfs2/dcache.c. */
+
+TRACE_EVENT(ocfs2_dentry_revalidate,
+	TP_PROTO(void *dentry, int len, const char *name),
+	TP_ARGS(dentry, len, name),
+	TP_STRUCT__entry(
+		__field(void *, dentry)
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dentry = dentry;
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_revalidate_negative,
+	TP_PROTO(int len, const char *name, unsigned long pgen,
+		 unsigned long gen),
+	TP_ARGS(len, name, pgen, gen),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long, pgen)
+		__field(unsigned long, gen)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->pgen = pgen;
+		__entry->gen = gen;
+	),
+	TP_printk("%.*s %lu %lu", __entry->len, __get_str(name),
+		  __entry->pgen, __entry->gen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret);
+
+TRACE_EVENT(ocfs2_find_local_alias,
+	TP_PROTO(int len, const char *name),
+	TP_ARGS(len, name),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%.*s", __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock,
+	TP_PROTO(int len, const char *name,
+		 unsigned long long parent, void *fsdata),
+	TP_ARGS(len, name, parent, fsdata),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long long, parent)
+		__field(void *, fsdata)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->parent = parent;
+		__entry->fsdata = fsdata;
+	),
+	TP_printk("%.*s %llu %p", __entry->len, __get_str(name),
+		  __entry->parent, __entry->fsdata)
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock_found,
+	TP_PROTO(const char *name, unsigned long long parent,
+		 unsigned long long ino),
+	TP_ARGS(name, parent, ino),
+	TP_STRUCT__entry(
+		__string(name, name)
+		__field(unsigned long long, parent)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->parent = parent;
+		__entry->ino = ino;
+	),
+	TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
+);
+/* End of trace events for fs/ocfs2/dcache.c. */
+
+/* Trace events for fs/ocfs2/export.c. */
+
+TRACE_EVENT(ocfs2_get_dentry_begin,
+	TP_PROTO(void *sb, void *handle, unsigned long long blkno),
+	TP_ARGS(sb, handle, blkno),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, handle)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->handle = handle;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end);
+
+TRACE_EVENT(ocfs2_get_parent,
+	TP_PROTO(void *child, int len, const char *name,
+		 unsigned long long ino),
+	TP_ARGS(child, len, name, ino),
+	TP_STRUCT__entry(
+		__field(void *,	child)
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__entry->child = child;
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->ino = ino;
+	),
+	TP_printk("%p %.*s %llu", __entry->child, __entry->len,
+		  __get_str(name), __entry->ino)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end);
+
+TRACE_EVENT(ocfs2_encode_fh_begin,
+	TP_PROTO(void *dentry, int name_len, const char *name,
+		 void *fh, int len, int connectable),
+	TP_ARGS(dentry, name_len, name, fh, len, connectable),
+	TP_STRUCT__entry(
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(void *, fh)
+		__field(int, len)
+		__field(int, connectable)
+	),
+	TP_fast_assign(
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->fh = fh;
+		__entry->len = len;
+		__entry->connectable = connectable;
+	),
+	TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len,
+		  __get_str(name), __entry->fh, __entry->len,
+		  __entry->connectable)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
+
+/* End of trace events for fs/ocfs2/export.c. */
+
+/* Trace events for fs/ocfs2/journal.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end);
+
+TRACE_EVENT(ocfs2_complete_recovery_slot,
+	TP_PROTO(int slot, unsigned long long la_ino,
+		 unsigned long long tl_ino, void *qrec),
+	TP_ARGS(slot, la_ino, tl_ino, qrec),
+	TP_STRUCT__entry(
+		__field(int, slot)
+		__field(unsigned long long, la_ino)
+		__field(unsigned long long, tl_ino)
+		__field(void *, qrec)
+	),
+	TP_fast_assign(
+		__entry->slot = slot;
+		__entry->la_ino = la_ino;
+		__entry->tl_ino = tl_ino;
+		__entry->qrec = qrec;
+	),
+	TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino,
+		  __entry->tl_ino, __entry->qrec)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end);
+
+TRACE_EVENT(ocfs2_recovery_thread,
+	TP_PROTO(int node_num, int osb_node_num, int disable,
+		 void *recovery_thread, int map_set),
+	TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set),
+	TP_STRUCT__entry(
+		__field(int, node_num)
+		__field(int, osb_node_num)
+		__field(int,disable)
+		__field(void *, recovery_thread)
+		__field(int,map_set)
+	),
+	TP_fast_assign(
+		__entry->node_num = node_num;
+		__entry->osb_node_num = osb_node_num;
+		__entry->disable = disable;
+		__entry->recovery_thread = recovery_thread;
+		__entry->map_set = map_set;
+	),
+	TP_printk("%d %d %d %p %d", __entry->node_num,
+		   __entry->osb_node_num, __entry->disable,
+		   __entry->recovery_thread, __entry->map_set)
+);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
+
+/* End of trace events for fs/ocfs2/journal.c. */
+
+/* Trace events for fs/ocfs2/buffer_head_io.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
+
+TRACE_EVENT(ocfs2_write_block,
+	TP_PROTO(unsigned long long block, void *ci),
+	TP_ARGS(block, ci),
+	TP_STRUCT__entry(
+		__field(unsigned long long, block)
+		__field(void *, ci)
+	),
+	TP_fast_assign(
+		__entry->block = block;
+		__entry->ci = ci;
+	),
+	TP_printk("%llu %p", __entry->block, __entry->ci)
+);
+
+TRACE_EVENT(ocfs2_read_blocks_begin,
+	TP_PROTO(void *ci, unsigned long long block,
+		 unsigned int nr, int flags),
+	TP_ARGS(ci, block, nr, flags),
+	TP_STRUCT__entry(
+		__field(void *, ci)
+		__field(unsigned long long, block)
+		__field(unsigned int, nr)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->ci = ci;
+		__entry->block = block;
+		__entry->nr = nr;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %llu %u %d", __entry->ci, __entry->block,
+		  __entry->nr, __entry->flags)
+);
+
+/* End of trace events for fs/ocfs2/buffer_head_io.c. */
+
+/* Trace events for fs/ocfs2/uptodate.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin);
+
+TRACE_EVENT(ocfs2_buffer_cached_end,
+	TP_PROTO(int index, void *item),
+	TP_ARGS(index, item),
+	TP_STRUCT__entry(
+		__field(int, index)
+		__field(void *, item)
+	),
+	TP_fast_assign(
+		__entry->index = index;
+		__entry->item = item;
+	),
+	TP_printk("%d %p", __entry->index, __entry->item)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache);
+
+/* End of trace events for fs/ocfs2/uptodate.c. */
+#endif /* _TRACE_OCFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE ocfs2_trace
+#include <trace/define_trace.h>
diff --git a/kernel/fs/ocfs2/quota.h b/kernel/fs/ocfs2/quota.h
new file mode 100644
index 000000000..b6d51333a
--- /dev/null
+++ b/kernel/fs/ocfs2/quota.h
@@ -0,0 +1,123 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+
+#include "ocfs2.h"
+
+/* Number of quota types we support */
+#define OCFS2_MAXQUOTAS 2
+
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot {
+	struct dquot dq_dquot;	/* Generic VFS dquot */
+	loff_t dq_local_off;	/* Offset in the local quota file */
+	u64 dq_local_phys_blk;	/* Physical block carrying quota structure */
+	struct ocfs2_quota_chunk *dq_chunk;	/* Chunk dquot is in */
+	unsigned int dq_use_count;	/* Number of nodes having reference to this entry in global quota file */
+	s64 dq_origspace;	/* Last globally synced space usage */
+	s64 dq_originodes;	/* Last globally synced inode usage */
+	struct llist_node list;	/* Member of list of dquots to drop */
+};
+
+/* Description of one chunk to recover in memory */
+struct ocfs2_recovery_chunk {
+	struct list_head rc_list;	/* List of chunks */
+	int rc_chunk;			/* Chunk number */
+	unsigned long *rc_bitmap;	/* Bitmap of entries to recover */
+};
+
+struct ocfs2_quota_recovery {
+	struct list_head r_list[OCFS2_MAXQUOTAS];	/* List of chunks to recover */
+};
+
+/* In-memory structure with quota header information */
+struct ocfs2_mem_dqinfo {
+	unsigned int dqi_type;		/* Quota type this structure describes */
+	unsigned int dqi_flags;		/* Flags OLQF_* */
+	unsigned int dqi_chunks;	/* Number of chunks in local quota file */
+	unsigned int dqi_blocks;	/* Number of blocks allocated for local quota file */
+	unsigned int dqi_syncms;	/* How often should we sync with other nodes */
+	struct list_head dqi_chunk;	/* List of chunks */
+	struct inode *dqi_gqinode;	/* Global quota file inode */
+	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
+	struct buffer_head *dqi_gqi_bh;	/* Buffer head with global quota file inode - set only if inode lock is obtained */
+	int dqi_gqi_count;		/* Number of holders of dqi_gqi_bh */
+	u64 dqi_giblk;			/* Number of block with global information header */
+	struct buffer_head *dqi_lqi_bh;	/* Buffer head with local quota file inode */
+	struct buffer_head *dqi_libh;	/* Buffer with local information header */
+	struct qtree_mem_dqinfo dqi_gi;	/* Info about global file */
+	struct delayed_work dqi_sync_work;	/* Work for syncing dquots */
+	struct ocfs2_quota_recovery *dqi_rec;	/* Pointer to recovery
+						 * information, in case we
+						 * enable quotas on file
+						 * needing it */
+};
+
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+	return container_of(dquot, struct ocfs2_dquot, dq_dquot);
+}
+
+struct ocfs2_quota_chunk {
+	struct list_head qc_chunk;	/* List of quotafile chunks */
+	int qc_num;			/* Number of quota chunk */
+	struct buffer_head *qc_headerbh;	/* Buffer head with chunk header */
+};
+
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+
+extern struct qtree_fmt_operations ocfs2_global_ops;
+
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+				struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+				struct ocfs2_quota_recovery *rec,
+				int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+	return __ocfs2_sync_dquot(dquot, 0);
+}
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+	return __ocfs2_sync_dquot(dquot, 1);
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+				struct buffer_head **bh);
+int ocfs2_create_local_dquot(struct dquot *dquot);
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
+int ocfs2_local_write_dquot(struct dquot *dquot);
+void ocfs2_drop_dquot_refs(struct work_struct *work);
+
+extern const struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+
+#endif /* _OCFS2_QUOTA_H */
diff --git a/kernel/fs/ocfs2/quota_global.c b/kernel/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000..c93d67220
--- /dev/null
+++ b/kernel/fs/ocfs2/quota_global.c
@@ -0,0 +1,971 @@
+/*
+ *  Implementation of operations over global quota file
+ */
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+#include <linux/llist.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
+#include "quota.h"
+#include "ocfs2_trace.h"
+
+/*
+ * Locking of quotas with OCFS2 is rather complex. Here are rules that
+ * should be obeyed by all the functions:
+ * - any write of quota structure (either to local or global file) is protected
+ *   by dqio_mutex or dquot->dq_lock.
+ * - any modification of global quota file holds inode cluster lock, i_mutex,
+ *   and ip_alloc_sem of the global quota file (achieved by
+ *   ocfs2_lock_global_qf). It also has to hold qinfo_lock.
+ * - an allocation of new blocks for local quota file is protected by
+ *   its ip_alloc_sem
+ *
+ * A rough sketch of locking dependencies (lf = local file, gf = global file):
+ * Normal filesystem operation:
+ *   start_trans -> dqio_mutex -> write to lf
+ * Syncing of local and global file:
+ *   ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ *     write to gf
+ *						       -> write to lf
+ * Acquire dquot for the first time:
+ *   dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
+ *				     -> alloc space for gf
+ *				     -> start_trans -> qinfo_lock -> write to gf
+ *	     -> ip_alloc_sem of lf -> alloc space for lf
+ *	     -> write to lf
+ * Release last reference to dquot:
+ *   dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
+ *	     -> write to lf
+ * Note that all the above operations also hold the inode cluster lock of lf.
+ * Recovery:
+ *   inode cluster lock of recovered lf
+ *     -> read bitmaps -> ip_alloc_sem of lf
+ *     -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ *        write to gf
+ */
+
+static void qsync_work_fn(struct work_struct *work);
+
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	/* Update from disk only entries not set by the admin */
+	if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+		m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
+		m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+		m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
+	if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+		m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
+		m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+		m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+	if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
+		m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
+		m->dqb_itime = le64_to_cpu(d->dqb_itime);
+	OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
+}
+
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
+	d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+	d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
+	d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
+	d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
+	d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
+	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+	d->dqb_btime = cpu_to_le64(m->dqb_btime);
+	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+	d->dqb_pad1 = d->dqb_pad2 = 0;
+}
+
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
+
+	if (qtree_entry_unused(&oinfo->dqi_gi, dp))
+		return 0;
+
+	return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
+				le32_to_cpu(d->dqb_id)),
+		      dquot->dq_id);
+}
+
+struct qtree_fmt_operations ocfs2_global_ops = {
+	.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
+	.disk2mem_dqblk = ocfs2_global_disk2memdqb,
+	.is_id = ocfs2_global_is_id,
+};
+
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
+{
+	struct ocfs2_disk_dqtrailer *dqt =
+		ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
+
+	trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
+}
+
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+				struct buffer_head **bhp)
+{
+	int rc;
+
+	*bhp = NULL;
+	rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
+			       ocfs2_validate_quota_block);
+	if (rc)
+		mlog_errno(rc);
+	return rc;
+}
+
+/* Read data from global quotafile - avoid pagecache and such because we cannot
+ * afford acquiring the locks... We use quota cluster lock to serialize
+ * operations. Caller is responsible for acquiring it. */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	loff_t i_size = i_size_read(gqinode);
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0;
+	struct buffer_head *bh;
+	size_t toread, tocopy;
+	u64 pblock = 0, pcount = 0;
+
+	if (off > i_size)
+		return 0;
+	if (off + len > i_size)
+		len = i_size - off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+		if (!pcount) {
+			err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
+							  &pcount, NULL);
+			if (err) {
+				mlog_errno(err);
+				return err;
+			}
+		} else {
+			pcount--;
+			pblock++;
+		}
+		bh = NULL;
+		err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
+		if (err) {
+			mlog_errno(err);
+			return err;
+		}
+		memcpy(data, bh->b_data + offset, tocopy);
+		brelse(bh);
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0, new = 0, ja_type;
+	struct buffer_head *bh = NULL;
+	handle_t *handle = journal_current_handle();
+	u64 pblock, pcount;
+
+	if (!handle) {
+		mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+		     "because transaction was not started.\n",
+		     (unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+		WARN_ON(1);
+		len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+	}
+
+	if (i_size_read(gqinode) < off + len) {
+		loff_t rounded_end =
+				ocfs2_align_bytes_to_blocks(sb, off + len);
+
+		/* Space is already allocated in ocfs2_acquire_dquot() */
+		err = ocfs2_simple_size_update(gqinode,
+					       oinfo->dqi_gqi_bh,
+					       rounded_end);
+		if (err < 0)
+			goto out;
+		new = 1;
+	}
+	err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
+	if (err) {
+		mlog_errno(err);
+		goto out;
+	}
+	/* Not rewriting whole block? */
+	if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+	    !new) {
+		err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
+		ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	} else {
+		bh = sb_getblk(sb, pblock);
+		if (!bh)
+			err = -ENOMEM;
+		ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
+	}
+	if (err) {
+		mlog_errno(err);
+		goto out;
+	}
+	lock_buffer(bh);
+	if (new)
+		memset(bh->b_data, 0, sb->s_blocksize);
+	memcpy(bh->b_data + offset, data, len);
+	flush_dcache_page(bh->b_page);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
+	err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh,
+				      ja_type);
+	if (err < 0) {
+		brelse(bh);
+		goto out;
+	}
+	ocfs2_journal_dirty(handle, bh);
+	brelse(bh);
+out:
+	if (err) {
+		mlog_errno(err);
+		return err;
+	}
+	gqinode->i_version++;
+	ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+	return len;
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	int status;
+	struct buffer_head *bh = NULL;
+
+	status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
+	if (status < 0)
+		return status;
+	spin_lock(&dq_data_lock);
+	if (!oinfo->dqi_gqi_count++)
+		oinfo->dqi_gqi_bh = bh;
+	else
+		WARN_ON(bh != oinfo->dqi_gqi_bh);
+	spin_unlock(&dq_data_lock);
+	if (ex) {
+		mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+		down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+	} else {
+		down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+	}
+	return 0;
+}
+
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	if (ex) {
+		up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+		mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+	} else {
+		up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+	}
+	ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+	brelse(oinfo->dqi_gqi_bh);
+	spin_lock(&dq_data_lock);
+	if (!--oinfo->dqi_gqi_count)
+		oinfo->dqi_gqi_bh = NULL;
+	spin_unlock(&dq_data_lock);
+}
+
+/* Read information header from global quota file */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+	struct inode *gqinode = NULL;
+	unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					      GROUP_QUOTA_SYSTEM_INODE };
+	struct ocfs2_global_disk_dqinfo dinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	u64 pcount;
+	int status;
+
+	/* Read global header */
+	gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+			OCFS2_INVALID_SLOT);
+	if (!gqinode) {
+		mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+			type);
+		status = -EINVAL;
+		goto out_err;
+	}
+	oinfo->dqi_gi.dqi_sb = sb;
+	oinfo->dqi_gi.dqi_type = type;
+	ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+	oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+	oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+	oinfo->dqi_gqi_bh = NULL;
+	oinfo->dqi_gqi_count = 0;
+	oinfo->dqi_gqinode = gqinode;
+	status = ocfs2_lock_global_qf(oinfo, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+					     &pcount, NULL);
+	if (status < 0)
+		goto out_unlock;
+
+	status = ocfs2_qinfo_lock(oinfo, 0);
+	if (status < 0)
+		goto out_unlock;
+	status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+				      sizeof(struct ocfs2_global_disk_dqinfo),
+				      OCFS2_GLOBAL_INFO_OFF);
+	ocfs2_qinfo_unlock(oinfo, 0);
+	ocfs2_unlock_global_qf(oinfo, 0);
+	if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+		     status);
+		if (status >= 0)
+			status = -EIO;
+		mlog_errno(status);
+		goto out_err;
+	}
+	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+	oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+						OCFS2_QBLK_RESERVED_SPACE;
+	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+	schedule_delayed_work(&oinfo->dqi_sync_work,
+			      msecs_to_jiffies(oinfo->dqi_syncms));
+
+out_err:
+	return status;
+out_unlock:
+	ocfs2_unlock_global_qf(oinfo, 0);
+	mlog_errno(status);
+	goto out_err;
+}
+
+/* Write information to global quota file. Expects exlusive lock on quota
+ * file inode and quota info */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_global_disk_dqinfo dinfo;
+	ssize_t size;
+
+	spin_lock(&dq_data_lock);
+	info->dqi_flags &= ~DQF_INFO_DIRTY;
+	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+	spin_unlock(&dq_data_lock);
+	dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+	dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+				     sizeof(struct ocfs2_global_disk_dqinfo),
+				     OCFS2_GLOBAL_INFO_OFF);
+	if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot write global quota info structure\n");
+		if (size >= 0)
+			size = -EIO;
+		return size;
+	}
+	return 0;
+}
+
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	int err;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+
+	err = ocfs2_qinfo_lock(info, 1);
+	if (err < 0)
+		return err;
+	err = __ocfs2_global_write_info(sb, type);
+	ocfs2_qinfo_unlock(info, 1);
+	return err;
+}
+
+static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	/*
+	 * We may need to allocate tree blocks and a leaf block but not the
+	 * root block
+	 */
+	return oinfo->dqi_gi.dqi_qtree_depth;
+}
+
+static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
+{
+	/* We modify all the allocated blocks, tree root, info block and
+	 * the inode */
+	return (ocfs2_global_qinit_alloc(sb, type) + 2) *
+			OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
+}
+
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+	int err, err2;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_id.type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_global_disk_dqblk dqblk;
+	s64 spacechange, inodechange;
+	time_t olditime, oldbtime;
+
+	err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				   sizeof(struct ocfs2_global_disk_dqblk),
+				   dquot->dq_off);
+	if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+		if (err >= 0) {
+			mlog(ML_ERROR, "Short read from global quota file "
+				       "(%u read)\n", err);
+			err = -EIO;
+		}
+		goto out;
+	}
+
+	/* Update space and inode usage. Get also other information from
+	 * global quota file so that we don't overwrite any changes there.
+	 * We are */
+	spin_lock(&dq_data_lock);
+	spacechange = dquot->dq_dqb.dqb_curspace -
+					OCFS2_DQUOT(dquot)->dq_origspace;
+	inodechange = dquot->dq_dqb.dqb_curinodes -
+					OCFS2_DQUOT(dquot)->dq_originodes;
+	olditime = dquot->dq_dqb.dqb_itime;
+	oldbtime = dquot->dq_dqb.dqb_btime;
+	ocfs2_global_disk2memdqb(dquot, &dqblk);
+	trace_ocfs2_sync_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+			       dquot->dq_dqb.dqb_curspace,
+			       (long long)spacechange,
+			       dquot->dq_dqb.dqb_curinodes,
+			       (long long)inodechange);
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+		dquot->dq_dqb.dqb_curspace += spacechange;
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+		dquot->dq_dqb.dqb_curinodes += inodechange;
+	/* Set properly space grace time... */
+	if (dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+		if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+		    oldbtime > 0) {
+			if (dquot->dq_dqb.dqb_btime > 0)
+				dquot->dq_dqb.dqb_btime =
+					min(dquot->dq_dqb.dqb_btime, oldbtime);
+			else
+				dquot->dq_dqb.dqb_btime = oldbtime;
+		}
+	} else {
+		dquot->dq_dqb.dqb_btime = 0;
+		clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+	}
+	/* Set properly inode grace time... */
+	if (dquot->dq_dqb.dqb_isoftlimit &&
+	    dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+		if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+		    olditime > 0) {
+			if (dquot->dq_dqb.dqb_itime > 0)
+				dquot->dq_dqb.dqb_itime =
+					min(dquot->dq_dqb.dqb_itime, olditime);
+			else
+				dquot->dq_dqb.dqb_itime = olditime;
+		}
+	} else {
+		dquot->dq_dqb.dqb_itime = 0;
+		clear_bit(DQ_INODES_B, &dquot->dq_flags);
+	}
+	/* All information is properly updated, clear the flags */
+	__clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	spin_unlock(&dq_data_lock);
+	err = ocfs2_qinfo_lock(info, freeing);
+	if (err < 0) {
+		mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
+			       " (type=%d, id=%u)\n", dquot->dq_id.type,
+			       (unsigned)from_kqid(&init_user_ns, dquot->dq_id));
+		goto out;
+	}
+	if (freeing)
+		OCFS2_DQUOT(dquot)->dq_use_count--;
+	err = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (err < 0)
+		goto out_qlock;
+	if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
+		err = qtree_release_dquot(&info->dqi_gi, dquot);
+		if (info_dirty(sb_dqinfo(sb, type))) {
+			err2 = __ocfs2_global_write_info(sb, type);
+			if (!err)
+				err = err2;
+		}
+	}
+out_qlock:
+	ocfs2_qinfo_unlock(info, freeing);
+out:
+	if (err < 0)
+		mlog_errno(err);
+	return err;
+}
+
+/*
+ *  Functions for periodic syncing of dquots with global file
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+	handle_t *handle;
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int status = 0;
+
+	trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
+				      dquot->dq_id.type,
+				      type, sb->s_id);
+	if (type != dquot->dq_id.type)
+		goto out;
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+	status = ocfs2_sync_dquot(dquot);
+	if (status < 0)
+		mlog_errno(status);
+	/* We have to write local structure as well... */
+	status = ocfs2_local_write_dquot(dquot);
+	if (status < 0)
+		mlog_errno(status);
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	return status;
+}
+
+static void qsync_work_fn(struct work_struct *work)
+{
+	struct ocfs2_mem_dqinfo *oinfo = container_of(work,
+						      struct ocfs2_mem_dqinfo,
+						      dqi_sync_work.work);
+	struct super_block *sb = oinfo->dqi_gqinode->i_sb;
+
+	dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+	schedule_delayed_work(&oinfo->dqi_sync_work,
+			      msecs_to_jiffies(oinfo->dqi_syncms));
+}
+
+/*
+ *  Wrappers for generic quota functions
+ */
+
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+				dquot->dq_id.type);
+
+	handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+	status = ocfs2_local_write_dquot(dquot);
+	mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+	ocfs2_commit_trans(osb, handle);
+out:
+	return status;
+}
+
+static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	/*
+	 * We modify tree, leaf block, global info, local chunk header,
+	 * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
+	 * accounts for inode update
+	 */
+	return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
+	       OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+	       OCFS2_QINFO_WRITE_CREDITS +
+	       OCFS2_INODE_UPDATE_CREDITS;
+}
+
+void ocfs2_drop_dquot_refs(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dquot_drop_work);
+	struct llist_node *list;
+	struct ocfs2_dquot *odquot, *next_odquot;
+
+	list = llist_del_all(&osb->dquot_drop_list);
+	llist_for_each_entry_safe(odquot, next_odquot, list, list) {
+		/* Drop the reference we acquired in ocfs2_dquot_release() */
+		dqput(&odquot->dq_dquot);
+	}
+}
+
+/*
+ * Called when the last reference to dquot is dropped. If we are called from
+ * downconvert thread, we cannot do all the handling here because grabbing
+ * quota lock could deadlock (the node holding the quota lock could need some
+ * other cluster lock to proceed but with blocked downconvert thread we cannot
+ * release any lock).
+ */
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	trace_ocfs2_release_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+				  dquot->dq_id.type);
+
+	mutex_lock(&dquot->dq_lock);
+	/* Check whether we are not racing with some other dqget() */
+	if (atomic_read(&dquot->dq_count) > 1)
+		goto out;
+	/* Running from downconvert thread? Postpone quota processing to wq */
+	if (current == osb->dc_task) {
+		/*
+		 * Grab our own reference to dquot and queue it for delayed
+		 * dropping.  Quota code rechecks after calling
+		 * ->release_dquot() and won't free dquot structure.
+		 */
+		dqgrab(dquot);
+		/* First entry on list -> queue work */
+		if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
+			queue_work(ocfs2_wq, &osb->dquot_drop_work);
+		goto out;
+	}
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb,
+		ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+
+	status = ocfs2_global_release_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	status = ocfs2_local_release_dquot(handle, dquot);
+	/*
+	 * If we fail here, we cannot do much as global structure is
+	 * already released. So just complain...
+	 */
+	if (status < 0)
+		mlog_errno(status);
+	/*
+	 * Clear dq_off so that we search for the structure in quota file next
+	 * time we acquire it. The structure might be deleted and reallocated
+	 * elsewhere by another node while our dquot structure is on freelist.
+	 */
+	dquot->dq_off = 0;
+	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_trans:
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mutex_unlock(&dquot->dq_lock);
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * Read global dquot structure from disk or create it if it does
+ * not exist. Also update use count of the global structure and
+ * create structure in node-local quota file.
+ */
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+	int status = 0, err;
+	int ex = 0;
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int type = dquot->dq_id.type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct inode *gqinode = info->dqi_gqinode;
+	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+	handle_t *handle;
+
+	trace_ocfs2_acquire_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+				  type);
+	mutex_lock(&dquot->dq_lock);
+	/*
+	 * We need an exclusive lock, because we're going to update use count
+	 * and instantiate possibly new dquot structure
+	 */
+	status = ocfs2_lock_global_qf(info, 1);
+	if (status < 0)
+		goto out;
+	status = ocfs2_qinfo_lock(info, 0);
+	if (status < 0)
+		goto out_dq;
+	/*
+	 * We always want to read dquot structure from disk because we don't
+	 * know what happened with it while it was on freelist.
+	 */
+	status = qtree_read_dquot(&info->dqi_gi, dquot);
+	ocfs2_qinfo_unlock(info, 0);
+	if (status < 0)
+		goto out_dq;
+
+	OCFS2_DQUOT(dquot)->dq_use_count++;
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	if (!dquot->dq_off) {	/* No real quota entry? */
+		ex = 1;
+		/*
+		 * Add blocks to quota file before we start a transaction since
+		 * locking allocators ranks above a transaction start
+		 */
+		WARN_ON(journal_current_handle());
+		status = ocfs2_extend_no_holes(gqinode, NULL,
+			i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits),
+			i_size_read(gqinode));
+		if (status < 0)
+			goto out_dq;
+	}
+
+	handle = ocfs2_start_trans(osb,
+				   ocfs2_calc_global_qinit_credits(sb, type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		goto out_dq;
+	}
+	status = ocfs2_qinfo_lock(info, ex);
+	if (status < 0)
+		goto out_trans;
+	status = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (ex && info_dirty(sb_dqinfo(sb, type))) {
+		err = __ocfs2_global_write_info(sb, type);
+		if (!status)
+			status = err;
+	}
+	ocfs2_qinfo_unlock(info, ex);
+out_trans:
+	ocfs2_commit_trans(osb, handle);
+out_dq:
+	ocfs2_unlock_global_qf(info, 1);
+	if (status < 0)
+		goto out;
+
+	status = ocfs2_create_local_dquot(dquot);
+	if (status < 0)
+		goto out;
+	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out:
+	mutex_unlock(&dquot->dq_lock);
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+	unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+	int sync = 0;
+	int status;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_id.type;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
+				     type);
+
+	/* In case user set some limits, sync dquot immediately to global
+	 * quota file so that information propagates quicker */
+	spin_lock(&dq_data_lock);
+	if (dquot->dq_flags & mask)
+		sync = 1;
+	spin_unlock(&dq_data_lock);
+	/* This is a slight hack but we can't afford getting global quota
+	 * lock if we already have a transaction started. */
+	if (!sync || journal_current_handle()) {
+		status = ocfs2_write_dquot(dquot);
+		goto out;
+	}
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+	status = ocfs2_sync_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_dlock;
+	}
+	/* Now write updated local dquot structure */
+	status = ocfs2_local_write_dquot(dquot);
+out_dlock:
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/* This should happen only after set_dqinfo(). */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+	handle_t *handle;
+	int status = 0;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_commit_info(sb, type);
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+	struct ocfs2_dquot *dquot =
+				kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+
+	if (!dquot)
+		return NULL;
+	return &dquot->dq_dquot;
+}
+
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{
+	kmem_cache_free(ocfs2_dquot_cachep, dquot);
+}
+
+const struct dquot_operations ocfs2_quota_operations = {
+	/* We never make dquot dirty so .write_dquot is never called */
+	.acquire_dquot	= ocfs2_acquire_dquot,
+	.release_dquot	= ocfs2_release_dquot,
+	.mark_dirty	= ocfs2_mark_dquot_dirty,
+	.write_info	= ocfs2_write_info,
+	.alloc_dquot	= ocfs2_alloc_dquot,
+	.destroy_dquot	= ocfs2_destroy_dquot,
+};
diff --git a/kernel/fs/ocfs2/quota_local.c b/kernel/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000..3d0b63d34
--- /dev/null
+++ b/kernel/fs/ocfs2/quota_local.c
@@ -0,0 +1,1315 @@
+/*
+ *  Implementation of operations over local quota file
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+#include "uptodate.h"
+#include "super.h"
+#include "ocfs2_trace.h"
+
+/* Number of local quota structures per block */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+	return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
+		sizeof(struct ocfs2_local_disk_dqblk));
+}
+
+/* Number of blocks with entries in one chunk */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+	return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+		 OCFS2_QBLK_RESERVED_SPACE) << 3) /
+	       ol_quota_entries_per_block(sb);
+}
+
+/* Number of entries in a chunk bitmap */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+	return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb);
+}
+
+/* Offset of the chunk in quota file */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+	/* 1 block for local quota file info, 1 block per chunk for chunk info */
+	return 1 + (ol_chunk_blocks(sb) + 1) * c;
+}
+
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return ol_quota_chunk_block(sb, c) + 1 + off / epb;
+}
+
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Offset of the dquot structure in the quota file */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+	return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+	       ol_dqblk_block_off(sb, c, off);
+}
+
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+	return off & ((1 << sb->s_blocksize_bits) - 1);
+}
+
+/* Compute offset in the chunk of a structure with the given offset */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return ((off >> sb->s_blocksize_bits) -
+			ol_quota_chunk_block(sb, c) - 1) * epb
+	       + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
+		 sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Write bufferhead into the fs */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+		void (*modify)(struct buffer_head *, void *), void *private)
+{
+	struct super_block *sb = inode->i_sb;
+	handle_t *handle;
+	int status;
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+				   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		return status;
+	}
+	status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_commit_trans(OCFS2_SB(sb), handle);
+		return status;
+	}
+	lock_buffer(bh);
+	modify(bh, private);
+	unlock_buffer(bh);
+	ocfs2_journal_dirty(handle, bh);
+
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+	return 0;
+}
+
+/*
+ * Read quota block from a given logical offset.
+ *
+ * This function acquires ip_alloc_sem and thus it must not be called with a
+ * transaction started.
+ */
+static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+				  struct buffer_head **bh)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+
+	if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
+		ocfs2_error(inode->i_sb,
+			    "Quota file %llu is probably corrupted! Requested "
+			    "to read block %Lu but file has size only %Lu\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)v_block,
+			    (unsigned long long)i_size_read(inode));
+		return -EIO;
+	}
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+				    ocfs2_validate_quota_block);
+	if (rc)
+		mlog_errno(rc);
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+/* Check whether we understand format of quota files */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+	unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+	unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+	unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+	unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+	unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					      GROUP_QUOTA_SYSTEM_INODE };
+	struct buffer_head *bh = NULL;
+	struct inode *linode = sb_dqopt(sb)->files[type];
+	struct inode *ginode = NULL;
+	struct ocfs2_disk_dqheader *dqhead;
+	int status, ret = 0;
+
+	/* First check whether we understand local quota file */
+	status = ocfs2_read_quota_block(linode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+			type);
+		goto out_err;
+	}
+	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+	if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
+		mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+			" type=%d\n", le32_to_cpu(dqhead->dqh_magic),
+			lmagics[type], type);
+		goto out_err;
+	}
+	if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
+		mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+			" type=%d\n", le32_to_cpu(dqhead->dqh_version),
+			lversions[type], type);
+		goto out_err;
+	}
+	brelse(bh);
+	bh = NULL;
+
+	/* Next check whether we understand global quota file */
+	ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+						OCFS2_INVALID_SLOT);
+	if (!ginode) {
+		mlog(ML_ERROR, "cannot get global quota file inode "
+				"(type=%d)\n", type);
+		goto out_err;
+	}
+	/* Since the header is read only, we don't care about locking */
+	status = ocfs2_read_quota_block(ginode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read global quota file header "
+				"(type=%d)\n", type);
+		goto out_err;
+	}
+	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+	if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
+		mlog(ML_ERROR, "global quota file magic does not match "
+			"(%u != %u), type=%d\n",
+			le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
+		goto out_err;
+	}
+	if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
+		mlog(ML_ERROR, "global quota file version does not match "
+			"(%u != %u), type=%d\n",
+			le32_to_cpu(dqhead->dqh_version), gversions[type],
+			type);
+		goto out_err;
+	}
+
+	ret = 1;
+out_err:
+	brelse(bh);
+	iput(ginode);
+	return ret;
+}
+
+/* Release given list of quota file chunks */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+	struct ocfs2_quota_chunk *pos, *next;
+
+	list_for_each_entry_safe(pos, next, head, qc_chunk) {
+		list_del(&pos->qc_chunk);
+		brelse(pos->qc_headerbh);
+		kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
+	}
+}
+
+/* Load quota bitmaps into memory */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+			struct ocfs2_local_disk_dqinfo *ldinfo,
+			struct list_head *head)
+{
+	struct ocfs2_quota_chunk *newchunk;
+	int i, status;
+
+	INIT_LIST_HEAD(head);
+	for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
+		newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+		if (!newchunk) {
+			ocfs2_release_local_quota_bitmaps(head);
+			return -ENOMEM;
+		}
+		newchunk->qc_num = i;
+		newchunk->qc_headerbh = NULL;
+		status = ocfs2_read_quota_block(inode,
+				ol_quota_chunk_block(inode->i_sb, i),
+				&newchunk->qc_headerbh);
+		if (status) {
+			mlog_errno(status);
+			kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
+			ocfs2_release_local_quota_bitmaps(head);
+			return status;
+		}
+		list_add_tail(&newchunk->qc_chunk, head);
+	}
+	return 0;
+}
+
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+	struct mem_dqinfo *info = private;
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+
+	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+						OCFS2_LOCAL_INFO_OFF);
+	spin_lock(&dq_data_lock);
+	ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags);
+	ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
+	ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
+	spin_unlock(&dq_data_lock);
+}
+
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+				    struct ocfs2_local_disk_chunk *dchunk,
+				    int chunk,
+				    struct list_head *head)
+{
+	struct ocfs2_recovery_chunk *rc;
+
+	rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
+	if (!rc)
+		return -ENOMEM;
+	rc->rc_chunk = chunk;
+	rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+	if (!rc->rc_bitmap) {
+		kfree(rc);
+		return -ENOMEM;
+	}
+	memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
+	       (ol_chunk_entries(sb) + 7) >> 3);
+	list_add_tail(&rc->rc_list, head);
+	return 0;
+}
+
+static void free_recovery_list(struct list_head *head)
+{
+	struct ocfs2_recovery_chunk *next;
+	struct ocfs2_recovery_chunk *rchunk;
+
+	list_for_each_entry_safe(rchunk, next, head, rc_list) {
+		list_del(&rchunk->rc_list);
+		kfree(rchunk->rc_bitmap);
+		kfree(rchunk);
+	}
+}
+
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+	int type;
+
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++)
+		free_recovery_list(&(rec->r_list[type]));
+	kfree(rec);
+}
+
+/* Load entries in our quota file we have to recover*/
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+				     struct ocfs2_local_disk_dqinfo *ldinfo,
+				     int type,
+				     struct list_head *head)
+{
+	struct super_block *sb = lqinode->i_sb;
+	struct buffer_head *hbh;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
+	int status = 0;
+
+	for (i = 0; i < chunks; i++) {
+		hbh = NULL;
+		status = ocfs2_read_quota_block(lqinode,
+						ol_quota_chunk_block(sb, i),
+						&hbh);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+		if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
+			status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
+		brelse(hbh);
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		free_recovery_list(head);
+	return status;
+}
+
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+	int type;
+	struct ocfs2_quota_recovery *rec;
+
+	rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
+	if (!rec)
+		return NULL;
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++)
+		INIT_LIST_HEAD(&(rec->r_list[type]));
+	return rec;
+}
+
+/* Load information we need for quota recovery into memory */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+						struct ocfs2_super *osb,
+						int slot_num)
+{
+	unsigned int feature[OCFS2_MAXQUOTAS] = {
+					OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					      LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	struct super_block *sb = osb->sb;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct inode *lqinode;
+	struct buffer_head *bh;
+	int type;
+	int status = 0;
+	struct ocfs2_quota_recovery *rec;
+
+	printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
+	       "slot %u\n", osb->dev_str, slot_num);
+
+	rec = ocfs2_alloc_quota_recovery();
+	if (!rec)
+		return ERR_PTR(-ENOMEM);
+	/* First init... */
+
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		/* At this point, journal of the slot is already replayed so
+		 * we can trust metadata and data of the quota file */
+		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+		if (!lqinode) {
+			status = -ENOENT;
+			goto out;
+		}
+		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+					       OCFS2_META_LOCK_RECOVERY);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_put;
+		}
+		/* Now read local header */
+		bh = NULL;
+		status = ocfs2_read_quota_block(lqinode, 0, &bh);
+		if (status) {
+			mlog_errno(status);
+			mlog(ML_ERROR, "failed to read quota file info header "
+				"(slot=%d type=%d)\n", slot_num, type);
+			goto out_lock;
+		}
+		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+							OCFS2_LOCAL_INFO_OFF);
+		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+						   &rec->r_list[type]);
+		brelse(bh);
+out_lock:
+		ocfs2_inode_unlock(lqinode, 1);
+out_put:
+		iput(lqinode);
+		if (status < 0)
+			break;
+	}
+out:
+	if (status < 0) {
+		ocfs2_free_quota_recovery(rec);
+		rec = ERR_PTR(status);
+	}
+	return rec;
+}
+
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+					  int type,
+					  struct ocfs2_quota_recovery *rec)
+{
+	struct super_block *sb = lqinode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_local_disk_chunk *dchunk;
+	struct ocfs2_local_disk_dqblk *dqblk;
+	struct dquot *dquot;
+	handle_t *handle;
+	struct buffer_head *hbh = NULL, *qbh = NULL;
+	int status = 0;
+	int bit, chunk;
+	struct ocfs2_recovery_chunk *rchunk, *next;
+	qsize_t spacechange, inodechange;
+
+	trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
+
+	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+		chunk = rchunk->rc_chunk;
+		hbh = NULL;
+		status = ocfs2_read_quota_block(lqinode,
+						ol_quota_chunk_block(sb, chunk),
+						&hbh);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+		for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+			qbh = NULL;
+			status = ocfs2_read_quota_block(lqinode,
+						ol_dqblk_block(sb, chunk, bit),
+						&qbh);
+			if (status) {
+				mlog_errno(status);
+				break;
+			}
+			dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+				ol_dqblk_block_off(sb, chunk, bit));
+			dquot = dqget(sb,
+				      make_kqid(&init_user_ns, type,
+						le64_to_cpu(dqblk->dqb_id)));
+			if (!dquot) {
+				status = -EIO;
+				mlog(ML_ERROR, "Failed to get quota structure "
+				     "for id %u, type %d. Cannot finish quota "
+				     "file recovery.\n",
+				     (unsigned)le64_to_cpu(dqblk->dqb_id),
+				     type);
+				goto out_put_bh;
+			}
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_put_dquot;
+			}
+
+			handle = ocfs2_start_trans(OCFS2_SB(sb),
+						   OCFS2_QSYNC_CREDITS);
+			if (IS_ERR(handle)) {
+				status = PTR_ERR(handle);
+				mlog_errno(status);
+				goto out_drop_lock;
+			}
+			mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+			spin_lock(&dq_data_lock);
+			/* Add usage from quota entry into quota changes
+			 * of our node. Auxiliary variables are important
+			 * due to signedness */
+			spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+			inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+			dquot->dq_dqb.dqb_curspace += spacechange;
+			dquot->dq_dqb.dqb_curinodes += inodechange;
+			spin_unlock(&dq_data_lock);
+			/* We want to drop reference held by the crashed
+			 * node. Since we have our own reference we know
+			 * global structure actually won't be freed. */
+			status = ocfs2_global_release_dquot(dquot);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_commit;
+			}
+			/* Release local quota file entry */
+			status = ocfs2_journal_access_dq(handle,
+					INODE_CACHE(lqinode),
+					qbh, OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_commit;
+			}
+			lock_buffer(qbh);
+			WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
+			ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
+			le32_add_cpu(&dchunk->dqc_free, 1);
+			unlock_buffer(qbh);
+			ocfs2_journal_dirty(handle, qbh);
+out_commit:
+			mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+			ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_drop_lock:
+			ocfs2_unlock_global_qf(oinfo, 1);
+out_put_dquot:
+			dqput(dquot);
+out_put_bh:
+			brelse(qbh);
+			if (status < 0)
+				break;
+		}
+		brelse(hbh);
+		list_del(&rchunk->rc_list);
+		kfree(rchunk->rc_bitmap);
+		kfree(rchunk);
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		free_recovery_list(&(rec->r_list[type]));
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/* Recover local quota files for given node different from us */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+				struct ocfs2_quota_recovery *rec,
+				int slot_num)
+{
+	unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					      LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	struct super_block *sb = osb->sb;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct buffer_head *bh;
+	handle_t *handle;
+	int type;
+	int status = 0;
+	struct inode *lqinode;
+	unsigned int flags;
+
+	printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
+	       "slot %u\n", osb->dev_str, slot_num);
+
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
+		if (list_empty(&(rec->r_list[type])))
+			continue;
+		trace_ocfs2_finish_quota_recovery(slot_num);
+		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+		if (!lqinode) {
+			status = -ENOENT;
+			goto out;
+		}
+		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+						       OCFS2_META_LOCK_NOQUEUE);
+		/* Someone else is holding the lock? Then he must be
+		 * doing the recovery. Just skip the file... */
+		if (status == -EAGAIN) {
+			printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
+			       "device (%s) for slot %d because quota file is "
+			       "locked.\n", osb->dev_str, slot_num);
+			status = 0;
+			goto out_put;
+		} else if (status < 0) {
+			mlog_errno(status);
+			goto out_put;
+		}
+		/* Now read local header */
+		bh = NULL;
+		status = ocfs2_read_quota_block(lqinode, 0, &bh);
+		if (status) {
+			mlog_errno(status);
+			mlog(ML_ERROR, "failed to read quota file info header "
+				"(slot=%d type=%d)\n", slot_num, type);
+			goto out_lock;
+		}
+		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+							OCFS2_LOCAL_INFO_OFF);
+		/* Is recovery still needed? */
+		flags = le32_to_cpu(ldinfo->dqi_flags);
+		if (!(flags & OLQF_CLEAN))
+			status = ocfs2_recover_local_quota_file(lqinode,
+								type,
+								rec);
+		/* We don't want to mark file as clean when it is actually
+		 * active */
+		if (slot_num == osb->slot_num)
+			goto out_bh;
+		/* Mark quota file as clean if we are recovering quota file of
+		 * some other node. */
+		handle = ocfs2_start_trans(osb,
+					   OCFS2_LOCAL_QINFO_WRITE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto out_bh;
+		}
+		status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
+						 bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_trans;
+		}
+		lock_buffer(bh);
+		ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+		unlock_buffer(bh);
+		ocfs2_journal_dirty(handle, bh);
+out_trans:
+		ocfs2_commit_trans(osb, handle);
+out_bh:
+		brelse(bh);
+out_lock:
+		ocfs2_inode_unlock(lqinode, 1);
+out_put:
+		iput(lqinode);
+		if (status < 0)
+			break;
+	}
+out:
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	kfree(rec);
+	return status;
+}
+
+/* Read information header from quota file */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	int status;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_quota_recovery *rec;
+	int locked = 0;
+
+	/* We don't need the lock and we have to acquire quota file locks
+	 * which will later depend on this lock */
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+	info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
+	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
+	oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+	if (!oinfo) {
+		mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+			       " info.");
+		goto out_err;
+	}
+	info->dqi_priv = oinfo;
+	oinfo->dqi_type = type;
+	INIT_LIST_HEAD(&oinfo->dqi_chunk);
+	oinfo->dqi_rec = NULL;
+	oinfo->dqi_lqi_bh = NULL;
+	oinfo->dqi_libh = NULL;
+
+	status = ocfs2_global_read_info(sb, type);
+	if (status < 0)
+		goto out_err;
+
+	status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	locked = 1;
+
+	/* Now read local header */
+	status = ocfs2_read_quota_block(lqinode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read quota file info header "
+			"(type=%d)\n", type);
+		goto out_err;
+	}
+	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+						OCFS2_LOCAL_INFO_OFF);
+	oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+	oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+	oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+	oinfo->dqi_libh = bh;
+
+	/* We crashed when using local quota file? */
+	if (!(oinfo->dqi_flags & OLQF_CLEAN)) {
+		rec = OCFS2_SB(sb)->quota_rec;
+		if (!rec) {
+			rec = ocfs2_alloc_quota_recovery();
+			if (!rec) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto out_err;
+			}
+			OCFS2_SB(sb)->quota_rec = rec;
+		}
+
+		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+                                                   &rec->r_list[type]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_err;
+		}
+	}
+
+	status = ocfs2_load_local_quota_bitmaps(lqinode,
+						ldinfo,
+						&oinfo->dqi_chunk);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	/* Now mark quota file as used */
+	oinfo->dqi_flags &= ~OLQF_CLEAN;
+	status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+	return 0;
+out_err:
+	if (oinfo) {
+		iput(oinfo->dqi_gqinode);
+		ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+		ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+		brelse(oinfo->dqi_lqi_bh);
+		if (locked)
+			ocfs2_inode_unlock(lqinode, 1);
+		ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+		kfree(oinfo);
+	}
+	brelse(bh);
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+	return -1;
+}
+
+/* Write local info to quota file */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
+						->dqi_libh;
+	int status;
+
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
+				 info);
+	if (status < 0) {
+		mlog_errno(status);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Release info from memory */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int mark_clean = 1, len;
+	int status;
+
+	iput(oinfo->dqi_gqinode);
+	ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+	ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+	list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+		dchunk = (struct ocfs2_local_disk_chunk *)
+					(chunk->qc_headerbh->b_data);
+		if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+			len = ol_chunk_entries(sb);
+		} else {
+			len = (oinfo->dqi_blocks -
+			       ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+			      * ol_quota_entries_per_block(sb);
+		}
+		/* Not all entries free? Bug! */
+		if (le32_to_cpu(dchunk->dqc_free) != len) {
+			mlog(ML_ERROR, "releasing quota file with used "
+					"entries (type=%d)\n", type);
+			mark_clean = 0;
+		}
+	}
+	ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+
+	/* dqonoff_mutex protects us against racing with recovery thread... */
+	if (oinfo->dqi_rec) {
+		ocfs2_free_quota_recovery(oinfo->dqi_rec);
+		mark_clean = 0;
+	}
+
+	if (!mark_clean)
+		goto out;
+
+	/* Mark local file as clean */
+	oinfo->dqi_flags |= OLQF_CLEAN;
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+				 oinfo->dqi_libh,
+				 olq_update_info,
+				 info);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+out:
+	ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+	brelse(oinfo->dqi_libh);
+	brelse(oinfo->dqi_lqi_bh);
+	kfree(oinfo);
+	return 0;
+}
+
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+	struct ocfs2_dquot *od = private;
+	struct ocfs2_local_disk_dqblk *dqblk;
+	struct super_block *sb = od->dq_dquot.dq_sb;
+
+	dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
+		+ ol_dqblk_block_offset(sb, od->dq_local_off));
+
+	dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns,
+					      od->dq_dquot.dq_id));
+	spin_lock(&dq_data_lock);
+	dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
+					  od->dq_origspace);
+	dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
+					  od->dq_originodes);
+	spin_unlock(&dq_data_lock);
+	trace_olq_set_dquot(
+		(unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
+		(unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
+		from_kqid(&init_user_ns, od->dq_dquot.dq_id));
+}
+
+/* Write dquot to local quota file */
+int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct buffer_head *bh;
+	struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_id.type];
+	int status;
+
+	status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
+					     &bh);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+out:
+	brelse(bh);
+	return status;
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int found = 0, len;
+
+	list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+		dchunk = (struct ocfs2_local_disk_chunk *)
+						chunk->qc_headerbh->b_data;
+		if (le32_to_cpu(dchunk->dqc_free) > 0) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		return NULL;
+
+	if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+		len = ol_chunk_entries(sb);
+	} else {
+		len = (oinfo->dqi_blocks -
+		       ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+		      * ol_quota_entries_per_block(sb);
+	}
+
+	found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
+	/* We failed? */
+	if (found == len) {
+		mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+		     " entries free (type=%d)\n", chunk->qc_num,
+		     le32_to_cpu(dchunk->dqc_free), type);
+		return ERR_PTR(-EIO);
+	}
+	*offset = found;
+	return chunk;
+}
+
+/* Add new chunk to the local quota file */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+							struct super_block *sb,
+							int type,
+							int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_quota_chunk *chunk = NULL;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int status;
+	handle_t *handle;
+	struct buffer_head *bh = NULL, *dbh = NULL;
+	u64 p_blkno;
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode, NULL,
+				       i_size_read(lqinode) + 2 * sb->s_blocksize,
+				       i_size_read(lqinode));
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  i_size_read(lqinode) + 2 * sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+	if (!chunk) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	/* Local quota info and two new blocks we initialize */
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+			2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* Initialize chunk header */
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_trans;
+	}
+	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
+	status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(bh);
+	dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+	memset(dchunk->dqc_bitmap, 0,
+	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+	       OCFS2_QBLK_RESERVED_SPACE);
+	unlock_buffer(bh);
+	ocfs2_journal_dirty(handle, bh);
+
+	/* Initialize new block with structures */
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
+					     &p_blkno, NULL, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	dbh = sb_getblk(sb, p_blkno);
+	if (!dbh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_trans;
+	}
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh);
+	status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(dbh);
+	memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
+	unlock_buffer(dbh);
+	ocfs2_journal_dirty(handle, dbh);
+
+	/* Update local quotafile info */
+	oinfo->dqi_blocks += 2;
+	oinfo->dqi_chunks++;
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+	chunk->qc_num = list_entry(chunk->qc_chunk.prev,
+				   struct ocfs2_quota_chunk,
+				   qc_chunk)->qc_num + 1;
+	chunk->qc_headerbh = bh;
+	*offset = 0;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	brelse(bh);
+	brelse(dbh);
+	kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+	return ERR_PTR(status);
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+						       struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_local_disk_chunk *dchunk;
+	int epb = ol_quota_entries_per_block(sb);
+	unsigned int chunk_blocks;
+	struct buffer_head *bh;
+	u64 p_blkno;
+	int status;
+	handle_t *handle;
+
+	if (list_empty(&oinfo->dqi_chunk))
+		return ocfs2_local_quota_add_chunk(sb, type, offset);
+	/* Is the last chunk full? */
+	chunk = list_entry(oinfo->dqi_chunk.prev,
+			struct ocfs2_quota_chunk, qc_chunk);
+	chunk_blocks = oinfo->dqi_blocks -
+			ol_quota_chunk_block(sb, chunk->qc_num) - 1;
+	if (ol_chunk_blocks(sb) == chunk_blocks)
+		return ocfs2_local_quota_add_chunk(sb, type, offset);
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode, NULL,
+				       i_size_read(lqinode) + sb->s_blocksize,
+				       i_size_read(lqinode));
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  i_size_read(lqinode) + sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* Get buffer from the just added block */
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
+
+	/* Local quota info, chunk header and the new block we initialize */
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+			2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	/* Zero created block */
+	status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
+				 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(bh);
+	memset(bh->b_data, 0, sb->s_blocksize);
+	unlock_buffer(bh);
+	ocfs2_journal_dirty(handle, bh);
+
+	/* Update chunk header */
+	status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
+					 chunk->qc_headerbh,
+				 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
+	lock_buffer(chunk->qc_headerbh);
+	le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
+	unlock_buffer(chunk->qc_headerbh);
+	ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+
+	/* Update file header */
+	oinfo->dqi_blocks++;
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	*offset = chunk_blocks * epb;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	return ERR_PTR(status);
+}
+
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+	int *offset = private;
+	struct ocfs2_local_disk_chunk *dchunk;
+
+	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+	ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
+	le32_add_cpu(&dchunk->dqc_free, -1);
+}
+
+/* Create dquot in the local file for given id */
+int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_id.type;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	int offset;
+	int status;
+	u64 pcount;
+
+	down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
+	chunk = ocfs2_find_free_entry(sb, type, &offset);
+	if (!chunk) {
+		chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+		if (IS_ERR(chunk)) {
+			status = PTR_ERR(chunk);
+			goto out;
+		}
+	} else if (IS_ERR(chunk)) {
+		status = PTR_ERR(chunk);
+		goto out;
+	}
+	od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+	od->dq_chunk = chunk;
+	status = ocfs2_extent_map_get_blocks(lqinode,
+				     ol_dqblk_block(sb, chunk->qc_num, offset),
+				     &od->dq_local_phys_blk,
+				     &pcount,
+				     NULL);
+
+	/* Initialize dquot structure on disk */
+	status = ocfs2_local_write_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* Mark structure as allocated */
+	status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+				 &offset);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+out:
+	up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
+	return status;
+}
+
+/*
+ * Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and written all changes to global quota file
+ */
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
+{
+	int status;
+	int type = dquot->dq_id.type;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int offset;
+
+	status = ocfs2_journal_access_dq(handle,
+			INODE_CACHE(sb_dqopt(sb)->files[type]),
+			od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
+					     od->dq_local_off);
+	dchunk = (struct ocfs2_local_disk_chunk *)
+			(od->dq_chunk->qc_headerbh->b_data);
+	/* Mark structure as freed */
+	lock_buffer(od->dq_chunk->qc_headerbh);
+	ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
+	le32_add_cpu(&dchunk->dqc_free, 1);
+	unlock_buffer(od->dq_chunk->qc_headerbh);
+	ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+
+out:
+	return status;
+}
+
+static const struct quota_format_ops ocfs2_format_ops = {
+	.check_quota_file	= ocfs2_local_check_quota_file,
+	.read_file_info		= ocfs2_local_read_info,
+	.write_file_info	= ocfs2_global_write_info,
+	.free_file_info		= ocfs2_local_free_info,
+};
+
+struct quota_format_type ocfs2_quota_format = {
+	.qf_fmt_id = QFMT_OCFS2,
+	.qf_ops = &ocfs2_format_ops,
+	.qf_owner = THIS_MODULE
+};
diff --git a/kernel/fs/ocfs2/refcounttree.c b/kernel/fs/ocfs2/refcounttree.c
new file mode 100644
index 000000000..d8c6af101
--- /dev/null
+++ b/kernel/fs/ocfs2/refcounttree.c
@@ -0,0 +1,4474 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.c
+ *
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/sort.h>
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "suballoc.h"
+#include "journal.h"
+#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
+#include "blockcheck.h"
+#include "refcounttree.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "aops.h"
+#include "xattr.h"
+#include "namei.h"
+#include "ocfs2_trace.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/posix_acl.h>
+
+struct ocfs2_cow_context {
+	struct inode *inode;
+	u32 cow_start;
+	u32 cow_len;
+	struct ocfs2_extent_tree data_et;
+	struct ocfs2_refcount_tree *ref_tree;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	void *cow_object;
+	struct ocfs2_post_refcount *post_refcount;
+	int extra_credits;
+	int (*get_clusters)(struct ocfs2_cow_context *context,
+			    u32 v_cluster, u32 *p_cluster,
+			    u32 *num_clusters,
+			    unsigned int *extent_flags);
+	int (*cow_duplicate_clusters)(handle_t *handle,
+				      struct inode *inode,
+				      u32 cpos, u32 old_cluster,
+				      u32 new_cluster, u32 new_len);
+};
+
+static inline struct ocfs2_refcount_tree *
+cache_info_to_refcount(struct ocfs2_caching_info *ci)
+{
+	return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
+}
+
+static int ocfs2_validate_refcount_block(struct super_block *sb,
+					 struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_refcount_block *rb =
+		(struct ocfs2_refcount_block *)bh->b_data;
+
+	trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return rc;
+	}
+
+
+	if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has bad signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    rb->rf_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has an invalid rf_blkno "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(rb->rf_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has an invalid "
+			    "rf_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(rb->rf_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
+				     u64 rb_blkno,
+				     struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(ci, rb_blkno, &tmp,
+			      ocfs2_validate_refcount_block);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	return rf->rf_blkno;
+}
+
+static struct super_block *
+ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	return rf->rf_sb;
+}
+
+static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	spin_lock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	spin_unlock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	mutex_lock(&rf->rf_io_mutex);
+}
+
+static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	mutex_unlock(&rf->rf_io_mutex);
+}
+
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
+	.co_owner		= ocfs2_refcount_cache_owner,
+	.co_get_super		= ocfs2_refcount_cache_get_super,
+	.co_cache_lock		= ocfs2_refcount_cache_lock,
+	.co_cache_unlock	= ocfs2_refcount_cache_unlock,
+	.co_io_lock		= ocfs2_refcount_cache_io_lock,
+	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
+};
+
+static struct ocfs2_refcount_tree *
+ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
+{
+	struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
+	struct ocfs2_refcount_tree *tree = NULL;
+
+	while (n) {
+		tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
+
+		if (blkno < tree->rf_blkno)
+			n = n->rb_left;
+		else if (blkno > tree->rf_blkno)
+			n = n->rb_right;
+		else
+			return tree;
+	}
+
+	return NULL;
+}
+
+/* osb_lock is already locked. */
+static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
+				       struct ocfs2_refcount_tree *new)
+{
+	u64 rf_blkno = new->rf_blkno;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
+	struct ocfs2_refcount_tree *tmp;
+
+	while (*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_refcount_tree,
+			       rf_node);
+
+		if (rf_blkno < tmp->rf_blkno)
+			p = &(*p)->rb_left;
+		else if (rf_blkno > tmp->rf_blkno)
+			p = &(*p)->rb_right;
+		else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
+			     (unsigned long long)rf_blkno);
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->rf_node, parent, p);
+	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
+}
+
+static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
+{
+	ocfs2_metadata_cache_exit(&tree->rf_ci);
+	ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
+	ocfs2_lock_res_free(&tree->rf_lockres);
+	kfree(tree);
+}
+
+static inline void
+ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
+					struct ocfs2_refcount_tree *tree)
+{
+	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
+	if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
+		osb->osb_ref_tree_lru = NULL;
+}
+
+static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
+					struct ocfs2_refcount_tree *tree)
+{
+	spin_lock(&osb->osb_lock);
+	ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+	spin_unlock(&osb->osb_lock);
+}
+
+static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
+{
+	struct ocfs2_refcount_tree *tree =
+		container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
+
+	ocfs2_free_refcount_tree(tree);
+}
+
+static inline void
+ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
+{
+	kref_get(&tree->rf_getcnt);
+}
+
+static inline void
+ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
+{
+	kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
+}
+
+static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
+					       struct super_block *sb)
+{
+	ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
+	mutex_init(&new->rf_io_mutex);
+	new->rf_sb = sb;
+	spin_lock_init(&new->rf_lock);
+}
+
+static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
+					struct ocfs2_refcount_tree *new,
+					u64 rf_blkno, u32 generation)
+{
+	init_rwsem(&new->rf_sem);
+	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
+				     rf_blkno, generation);
+}
+
+static struct ocfs2_refcount_tree*
+ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
+{
+	struct ocfs2_refcount_tree *new;
+
+	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
+	if (!new)
+		return NULL;
+
+	new->rf_blkno = rf_blkno;
+	kref_init(&new->rf_getcnt);
+	ocfs2_init_refcount_tree_ci(new, osb->sb);
+
+	return new;
+}
+
+static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
+				   struct ocfs2_refcount_tree **ret_tree)
+{
+	int ret = 0;
+	struct ocfs2_refcount_tree *tree, *new = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_block *ref_rb;
+
+	spin_lock(&osb->osb_lock);
+	if (osb->osb_ref_tree_lru &&
+	    osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
+		tree = osb->osb_ref_tree_lru;
+	else
+		tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+	if (tree)
+		goto out;
+
+	spin_unlock(&osb->osb_lock);
+
+	new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
+	if (!new) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+	/*
+	 * We need the generation to create the refcount tree lock and since
+	 * it isn't changed during the tree modification, we are safe here to
+	 * read without protection.
+	 * We also have to purge the cache after we create the lock since the
+	 * refcount block may have the stale data. It can only be trusted when
+	 * we hold the refcount lock.
+	 */
+	ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		ocfs2_metadata_cache_exit(&new->rf_ci);
+		kfree(new);
+		return ret;
+	}
+
+	ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
+	ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
+				      new->rf_generation);
+	ocfs2_metadata_cache_purge(&new->rf_ci);
+
+	spin_lock(&osb->osb_lock);
+	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+	if (tree)
+		goto out;
+
+	ocfs2_insert_refcount_tree(osb, new);
+
+	tree = new;
+	new = NULL;
+
+out:
+	*ret_tree = tree;
+
+	osb->osb_ref_tree_lru = tree;
+
+	spin_unlock(&osb->osb_lock);
+
+	if (new)
+		ocfs2_free_refcount_tree(new);
+
+	brelse(ref_root_bh);
+	return ret;
+}
+
+static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	*ref_blkno = le64_to_cpu(di->i_refcount_loc);
+	brelse(di_bh);
+out:
+	return ret;
+}
+
+static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+				      struct ocfs2_refcount_tree *tree, int rw)
+{
+	int ret;
+
+	ret = ocfs2_refcount_lock(tree, rw);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (rw)
+		down_write(&tree->rf_sem);
+	else
+		down_read(&tree->rf_sem);
+
+out:
+	return ret;
+}
+
+/*
+ * Lock the refcount tree pointed by ref_blkno and return the tree.
+ * In most case, we lock the tree and read the refcount block.
+ * So read it here if the caller really needs it.
+ *
+ * If the tree has been re-created by other node, it will free the
+ * old one and re-create it.
+ */
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+			     u64 ref_blkno, int rw,
+			     struct ocfs2_refcount_tree **ret_tree,
+			     struct buffer_head **ref_bh)
+{
+	int ret, delete_tree = 0;
+	struct ocfs2_refcount_tree *tree = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+
+again:
+	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ocfs2_refcount_tree_get(tree);
+
+	ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
+	if (ret) {
+		mlog_errno(ret);
+		ocfs2_refcount_tree_put(tree);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		ocfs2_unlock_refcount_tree(osb, tree, rw);
+		ocfs2_refcount_tree_put(tree);
+		goto out;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	/*
+	 * If the refcount block has been freed and re-created, we may need
+	 * to recreate the refcount tree also.
+	 *
+	 * Here we just remove the tree from the rb-tree, and the last
+	 * kref holder will unlock and delete this refcount_tree.
+	 * Then we goto "again" and ocfs2_get_refcount_tree will create
+	 * the new refcount tree for us.
+	 */
+	if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
+		if (!tree->rf_removed) {
+			ocfs2_erase_refcount_tree_from_list(osb, tree);
+			tree->rf_removed = 1;
+			delete_tree = 1;
+		}
+
+		ocfs2_unlock_refcount_tree(osb, tree, rw);
+		/*
+		 * We get an extra reference when we create the refcount
+		 * tree, so another put will destroy it.
+		 */
+		if (delete_tree)
+			ocfs2_refcount_tree_put(tree);
+		brelse(ref_root_bh);
+		ref_root_bh = NULL;
+		goto again;
+	}
+
+	*ret_tree = tree;
+	if (ref_bh) {
+		*ref_bh = ref_root_bh;
+		ref_root_bh = NULL;
+	}
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
+
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+				struct ocfs2_refcount_tree *tree, int rw)
+{
+	if (rw)
+		up_write(&tree->rf_sem);
+	else
+		up_read(&tree->rf_sem);
+
+	ocfs2_refcount_unlock(tree, rw);
+	ocfs2_refcount_tree_put(tree);
+}
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
+{
+	struct rb_node *node;
+	struct ocfs2_refcount_tree *tree;
+	struct rb_root *root = &osb->osb_rf_lock_tree;
+
+	while ((node = rb_last(root)) != NULL) {
+		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
+
+		trace_ocfs2_purge_refcount_trees(
+				(unsigned long long) tree->rf_blkno);
+
+		rb_erase(&tree->rf_node, root);
+		ocfs2_free_refcount_tree(tree);
+	}
+}
+
+/*
+ * Create a refcount tree for an inode.
+ * We take for granted that the inode is already locked.
+ */
+static int ocfs2_create_refcount_tree(struct inode *inode,
+				      struct buffer_head *di_bh)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 suballoc_loc, first_blkno;
+
+	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+	trace_ocfs2_create_refcount_tree(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+				   &suballoc_bit_start, &num_got,
+				   &first_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
+	if (!new_tree) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	if (!new_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+	ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Initialize ocfs2_refcount_block. */
+	rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	memset(rb, 0, inode->i_sb->s_blocksize);
+	strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+	rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+	rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
+	rb->rf_blkno = cpu_to_le64(first_blkno);
+	rb->rf_count = cpu_to_le32(1);
+	rb->rf_records.rl_count =
+			cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
+	spin_lock(&osb->osb_lock);
+	rb->rf_generation = osb->s_next_generation++;
+	spin_unlock(&osb->osb_lock);
+
+	ocfs2_journal_dirty(handle, new_bh);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = cpu_to_le64(first_blkno);
+	spin_unlock(&oi->ip_lock);
+
+	trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	/*
+	 * We have to init the tree lock here since it will use
+	 * the generation number to create it.
+	 */
+	new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
+	ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
+				      new_tree->rf_generation);
+
+	spin_lock(&osb->osb_lock);
+	tree = ocfs2_find_refcount_tree(osb, first_blkno);
+
+	/*
+	 * We've just created a new refcount tree in this block.  If
+	 * we found a refcount tree on the ocfs2_super, it must be
+	 * one we just deleted.  We free the old tree before
+	 * inserting the new tree.
+	 */
+	BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
+	if (tree)
+		ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+	ocfs2_insert_refcount_tree(osb, new_tree);
+	spin_unlock(&osb->osb_lock);
+	new_tree = NULL;
+	if (tree)
+		ocfs2_refcount_tree_put(tree);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (new_tree) {
+		ocfs2_metadata_cache_exit(&new_tree->rf_ci);
+		kfree(new_tree);
+	}
+
+	brelse(new_bh);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+static int ocfs2_set_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh,
+				   u64 refcount_loc)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_tree *ref_tree;
+
+	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+	ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+				       &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	le32_add_cpu(&rb->rf_count, 1);
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = cpu_to_le64(refcount_loc);
+	spin_unlock(&oi->ip_lock);
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+
+	return ret;
+}
+
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
+{
+	int ret, delete_tree = 0;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+	struct inode *alloc_inode = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_refcount_tree *ref_tree;
+	int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
+	u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
+	u16 bit = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+		return 0;
+
+	BUG_ON(!ref_blkno);
+	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
+
+	/*
+	 * If we are the last user, we need to free the block.
+	 * So lock the allocator ahead.
+	 */
+	if (le32_to_cpu(rb->rf_count) == 1) {
+		blk = le64_to_cpu(rb->rf_blkno);
+		bit = le16_to_cpu(rb->rf_suballoc_bit);
+		if (rb->rf_suballoc_loc)
+			bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
+		else
+			bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+		alloc_inode = ocfs2_get_system_file_inode(osb,
+					EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(rb->rf_suballoc_slot));
+		if (!alloc_inode) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+		mutex_lock(&alloc_inode->i_mutex);
+
+		ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_mutex;
+		}
+
+		credits += OCFS2_SUBALLOC_FREE;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = 0;
+	spin_unlock(&oi->ip_lock);
+	ocfs2_journal_dirty(handle, di_bh);
+
+	le32_add_cpu(&rb->rf_count , -1);
+	ocfs2_journal_dirty(handle, blk_bh);
+
+	if (!rb->rf_count) {
+		delete_tree = 1;
+		ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
+		ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
+					       alloc_bh, bit, bg_blkno, 1);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	if (alloc_inode) {
+		ocfs2_inode_unlock(alloc_inode, 1);
+		brelse(alloc_bh);
+	}
+out_mutex:
+	if (alloc_inode) {
+		mutex_unlock(&alloc_inode->i_mutex);
+		iput(alloc_inode);
+	}
+out:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	if (delete_tree)
+		ocfs2_refcount_tree_put(ref_tree);
+	brelse(blk_bh);
+
+	return ret;
+}
+
+static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
+					  struct buffer_head *ref_leaf_bh,
+					  u64 cpos, unsigned int len,
+					  struct ocfs2_refcount_rec *ret_rec,
+					  int *index)
+{
+	int i = 0;
+	struct ocfs2_refcount_block *rb =
+		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_rec *rec = NULL;
+
+	for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
+		rec = &rb->rf_records.rl_recs[i];
+
+		if (le64_to_cpu(rec->r_cpos) +
+		    le32_to_cpu(rec->r_clusters) <= cpos)
+			continue;
+		else if (le64_to_cpu(rec->r_cpos) > cpos)
+			break;
+
+		/* ok, cpos fail in this rec. Just return. */
+		if (ret_rec)
+			*ret_rec = *rec;
+		goto out;
+	}
+
+	if (ret_rec) {
+		/* We meet with a hole here, so fake the rec. */
+		ret_rec->r_cpos = cpu_to_le64(cpos);
+		ret_rec->r_refcount = 0;
+		if (i < le16_to_cpu(rb->rf_records.rl_used) &&
+		    le64_to_cpu(rec->r_cpos) < cpos + len)
+			ret_rec->r_clusters =
+				cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
+		else
+			ret_rec->r_clusters = cpu_to_le32(len);
+	}
+
+out:
+	*index = i;
+}
+
+/*
+ * Try to remove refcount tree. The mechanism is:
+ * 1) Check whether i_clusters == 0, if no, exit.
+ * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
+ * 3) Check whether we have inline xattr stored outside, if yes, exit.
+ * 4) Remove the tree.
+ */
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&oi->ip_xattr_sem);
+	down_write(&oi->ip_alloc_sem);
+
+	if (oi->ip_clusters)
+		goto out;
+
+	if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
+		goto out;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
+	    ocfs2_has_inline_xattr_value_outside(inode, di))
+		goto out;
+
+	ret = ocfs2_remove_refcount_tree(inode, di_bh);
+	if (ret)
+		mlog_errno(ret);
+out:
+	up_write(&oi->ip_alloc_sem);
+	up_write(&oi->ip_xattr_sem);
+	return 0;
+}
+
+/*
+ * Find the end range for a leaf refcount block indicated by
+ * el->l_recs[index].e_blkno.
+ */
+static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_extent_block *eb,
+				       struct ocfs2_extent_list *el,
+				       int index,  u32 *cpos_end)
+{
+	int ret, i, subtree_root;
+	u32 cpos;
+	u64 blkno;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_path *left_path = NULL, *right_path = NULL;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_extent_list *tmp_el;
+
+	if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
+		/*
+		 * We have a extent rec after index, so just use the e_cpos
+		 * of the next extent rec.
+		 */
+		*cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
+		return 0;
+	}
+
+	if (!eb || (eb && !eb->h_next_leaf_blk)) {
+		/*
+		 * We are the last extent rec, so any high cpos should
+		 * be stored in this leaf refcount block.
+		 */
+		*cpos_end = UINT_MAX;
+		return 0;
+	}
+
+	/*
+	 * If the extent block isn't the last one, we have to find
+	 * the subtree root between this extent block and the next
+	 * leaf extent block and get the corresponding e_cpos from
+	 * the subroot. Otherwise we may corrupt the b-tree.
+	 */
+	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+	left_path = ocfs2_new_path_from_et(&et);
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
+	ret = ocfs2_find_path(ci, left_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	right_path = ocfs2_new_path_from_path(left_path);
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(ci, right_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	subtree_root = ocfs2_find_subtree_root(&et, left_path,
+					       right_path);
+
+	tmp_el = left_path->p_node[subtree_root].el;
+	blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
+	for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
+		if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
+			*cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
+			break;
+		}
+	}
+
+	BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
+
+out:
+	ocfs2_free_path(left_path);
+	ocfs2_free_path(right_path);
+	return ret;
+}
+
+/*
+ * Given a cpos and len, try to find the refcount record which contains cpos.
+ * 1. If cpos can be found in one refcount record, return the record.
+ * 2. If cpos can't be found, return a fake record which start from cpos
+ *    and end at a small value between cpos+len and start of the next record.
+ *    This fake record has r_refcount = 0.
+ */
+static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
+				  struct buffer_head *ref_root_bh,
+				  u64 cpos, unsigned int len,
+				  struct ocfs2_refcount_rec *ret_rec,
+				  int *index,
+				  struct buffer_head **ret_bh)
+{
+	int ret = 0, i, found;
+	u32 low_cpos, uninitialized_var(cpos_end);
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_extent_block *eb = NULL;
+	struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
+		ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
+					      ret_rec, index);
+		*ret_bh = ref_root_bh;
+		get_bh(ref_root_bh);
+		return 0;
+	}
+
+	el = &rb->rf_list;
+	low_cpos = cpos & OCFS2_32BIT_POS_MASK;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(sb,
+			"refcount tree %llu has non zero tree "
+			"depth in leaf btree tree block %llu\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci),
+			(unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	found = 0;
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
+						  eb, el, i, &cpos_end);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (cpos_end < low_cpos + len)
+			len = cpos_end - low_cpos;
+	}
+
+	ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
+					&ref_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
+				      ret_rec, index);
+	*ret_bh = ref_leaf_bh;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+enum ocfs2_ref_rec_contig {
+	REF_CONTIG_NONE = 0,
+	REF_CONTIG_LEFT,
+	REF_CONTIG_RIGHT,
+	REF_CONTIG_LEFTRIGHT,
+};
+
+static enum ocfs2_ref_rec_contig
+	ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
+				    int index)
+{
+	if ((rb->rf_records.rl_recs[index].r_refcount ==
+	    rb->rf_records.rl_recs[index + 1].r_refcount) &&
+	    (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
+	    le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
+	    le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
+		return REF_CONTIG_RIGHT;
+
+	return REF_CONTIG_NONE;
+}
+
+static enum ocfs2_ref_rec_contig
+	ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
+				  int index)
+{
+	enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
+
+	if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
+		ret = ocfs2_refcount_rec_adjacent(rb, index);
+
+	if (index > 0) {
+		enum ocfs2_ref_rec_contig tmp;
+
+		tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
+
+		if (tmp == REF_CONTIG_RIGHT) {
+			if (ret == REF_CONTIG_RIGHT)
+				ret = REF_CONTIG_LEFTRIGHT;
+			else
+				ret = REF_CONTIG_LEFT;
+		}
+	}
+
+	return ret;
+}
+
+static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
+					   int index)
+{
+	BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
+	       rb->rf_records.rl_recs[index+1].r_refcount);
+
+	le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
+		     le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
+
+	if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
+		memmove(&rb->rf_records.rl_recs[index + 1],
+			&rb->rf_records.rl_recs[index + 2],
+			sizeof(struct ocfs2_refcount_rec) *
+			(le16_to_cpu(rb->rf_records.rl_used) - index - 2));
+
+	memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
+	       0, sizeof(struct ocfs2_refcount_rec));
+	le16_add_cpu(&rb->rf_records.rl_used, -1);
+}
+
+/*
+ * Merge the refcount rec if we are contiguous with the adjacent recs.
+ */
+static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
+				     int index)
+{
+	enum ocfs2_ref_rec_contig contig =
+				ocfs2_refcount_rec_contig(rb, index);
+
+	if (contig == REF_CONTIG_NONE)
+		return;
+
+	if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
+		BUG_ON(index == 0);
+		index--;
+	}
+
+	ocfs2_rotate_refcount_rec_left(rb, index);
+
+	if (contig == REF_CONTIG_LEFTRIGHT)
+		ocfs2_rotate_refcount_rec_left(rb, index);
+}
+
+/*
+ * Change the refcount indexed by "index" in ref_bh.
+ * If refcount reaches 0, remove it.
+ */
+static int ocfs2_change_refcount_rec(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_leaf_bh,
+				     int index, int merge, int change)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_change_refcount_rec(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		index, le32_to_cpu(rec->r_refcount), change);
+	le32_add_cpu(&rec->r_refcount, change);
+
+	if (!rec->r_refcount) {
+		if (index != le16_to_cpu(rl->rl_used) - 1) {
+			memmove(rec, rec + 1,
+				(le16_to_cpu(rl->rl_used) - index - 1) *
+				sizeof(struct ocfs2_refcount_rec));
+			memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
+			       0, sizeof(struct ocfs2_refcount_rec));
+		}
+
+		le16_add_cpu(&rl->rl_used, -1);
+	} else if (merge)
+		ocfs2_refcount_rec_merge(rb, index);
+
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
+out:
+	return ret;
+}
+
+static int ocfs2_expand_inline_ref_root(handle_t *handle,
+					struct ocfs2_caching_info *ci,
+					struct buffer_head *ref_root_bh,
+					struct buffer_head **ref_leaf_bh,
+					struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 suballoc_loc, blkno;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *new_rb;
+	struct ocfs2_refcount_block *root_rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+				   &suballoc_bit_start, &num_got,
+				   &blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_bh = sb_getblk(sb, blkno);
+	if (new_bh == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Initialize ocfs2_refcount_block.
+	 * It should contain the same information as the old root.
+	 * so just memcpy it and change the corresponding field.
+	 */
+	memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
+
+	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	new_rb->rf_blkno = cpu_to_le64(blkno);
+	new_rb->rf_cpos = cpu_to_le32(0);
+	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+	ocfs2_journal_dirty(handle, new_bh);
+
+	/* Now change the root. */
+	memset(&root_rb->rf_list, 0, sb->s_blocksize -
+	       offsetof(struct ocfs2_refcount_block, rf_list));
+	root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
+	root_rb->rf_clusters = cpu_to_le32(1);
+	root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
+	root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+	root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+	root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+	trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
+		le16_to_cpu(new_rb->rf_records.rl_used));
+
+	*ref_leaf_bh = new_bh;
+	new_bh = NULL;
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
+					   struct ocfs2_refcount_rec *next)
+{
+	if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
+		ocfs2_get_ref_rec_low_cpos(next))
+		return 1;
+
+	return 0;
+}
+
+static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
+{
+	const struct ocfs2_refcount_rec *l = a, *r = b;
+	u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
+	u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
+
+	if (l_cpos > r_cpos)
+		return 1;
+	if (l_cpos < r_cpos)
+		return -1;
+	return 0;
+}
+
+static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
+{
+	const struct ocfs2_refcount_rec *l = a, *r = b;
+	u64 l_cpos = le64_to_cpu(l->r_cpos);
+	u64 r_cpos = le64_to_cpu(r->r_cpos);
+
+	if (l_cpos > r_cpos)
+		return 1;
+	if (l_cpos < r_cpos)
+		return -1;
+	return 0;
+}
+
+static void swap_refcount_rec(void *a, void *b, int size)
+{
+	struct ocfs2_refcount_rec *l = a, *r = b, tmp;
+
+	tmp = *l;
+	*l = *r;
+	*r = tmp;
+}
+
+/*
+ * The refcount cpos are ordered by their 64bit cpos,
+ * But we will use the low 32 bit to be the e_cpos in the b-tree.
+ * So we need to make sure that this pos isn't intersected with others.
+ *
+ * Note: The refcount block is already sorted by their low 32 bit cpos,
+ *       So just try the middle pos first, and we will exit when we find
+ *       the good position.
+ */
+static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
+					 u32 *split_pos, int *split_index)
+{
+	int num_used = le16_to_cpu(rl->rl_used);
+	int delta, middle = num_used / 2;
+
+	for (delta = 0; delta < middle; delta++) {
+		/* Let's check delta earlier than middle */
+		if (ocfs2_refcount_rec_no_intersect(
+					&rl->rl_recs[middle - delta - 1],
+					&rl->rl_recs[middle - delta])) {
+			*split_index = middle - delta;
+			break;
+		}
+
+		/* For even counts, don't walk off the end */
+		if ((middle + delta + 1) == num_used)
+			continue;
+
+		/* Now try delta past middle */
+		if (ocfs2_refcount_rec_no_intersect(
+					&rl->rl_recs[middle + delta],
+					&rl->rl_recs[middle + delta + 1])) {
+			*split_index = middle + delta + 1;
+			break;
+		}
+	}
+
+	if (delta >= middle)
+		return -ENOSPC;
+
+	*split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
+	return 0;
+}
+
+static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
+					    struct buffer_head *new_bh,
+					    u32 *split_cpos)
+{
+	int split_index = 0, num_moved, ret;
+	u32 cpos = 0;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_block *new_rb =
+			(struct ocfs2_refcount_block *)new_bh->b_data;
+	struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
+
+	trace_ocfs2_divide_leaf_refcount_block(
+		(unsigned long long)ref_leaf_bh->b_blocknr,
+		le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
+
+	/*
+	 * XXX: Improvement later.
+	 * If we know all the high 32 bit cpos is the same, no need to sort.
+	 *
+	 * In order to make the whole process safe, we do:
+	 * 1. sort the entries by their low 32 bit cpos first so that we can
+	 *    find the split cpos easily.
+	 * 2. call ocfs2_insert_extent to insert the new refcount block.
+	 * 3. move the refcount rec to the new block.
+	 * 4. sort the entries by their 64 bit cpos.
+	 * 5. dirty the new_rb and rb.
+	 */
+	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
+
+	ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	new_rb->rf_cpos = cpu_to_le32(cpos);
+
+	/* move refcount records starting from split_index to the new block. */
+	num_moved = le16_to_cpu(rl->rl_used) - split_index;
+	memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
+	       num_moved * sizeof(struct ocfs2_refcount_rec));
+
+	/*ok, remove the entries we just moved over to the other block. */
+	memset(&rl->rl_recs[split_index], 0,
+	       num_moved * sizeof(struct ocfs2_refcount_rec));
+
+	/* change old and new rl_used accordingly. */
+	le16_add_cpu(&rl->rl_used, -num_moved);
+	new_rl->rl_used = cpu_to_le16(num_moved);
+
+	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+	sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+	*split_cpos = cpos;
+	return 0;
+}
+
+static int ocfs2_new_leaf_refcount_block(handle_t *handle,
+					 struct ocfs2_caching_info *ci,
+					 struct buffer_head *ref_root_bh,
+					 struct buffer_head *ref_leaf_bh,
+					 struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got, new_cpos;
+	u64 suballoc_loc, blkno;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *root_rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *new_rb;
+	struct ocfs2_extent_tree ref_et;
+
+	BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+				   &suballoc_bit_start, &num_got,
+				   &blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_bh = sb_getblk(sb, blkno);
+	if (new_bh == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Initialize ocfs2_refcount_block. */
+	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	memset(new_rb, 0, sb->s_blocksize);
+	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+	new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+	new_rb->rf_blkno = cpu_to_le64(blkno);
+	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+	new_rb->rf_records.rl_count =
+				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+	new_rb->rf_generation = root_rb->rf_generation;
+
+	ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
+	ocfs2_journal_dirty(handle, new_bh);
+
+	ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
+
+	trace_ocfs2_new_leaf_refcount_block(
+			(unsigned long long)new_bh->b_blocknr, new_cpos);
+
+	/* Insert the new leaf block with the specific offset cpos. */
+	ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
+				  1, 0, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int ocfs2_expand_refcount_tree(handle_t *handle,
+				      struct ocfs2_caching_info *ci,
+				      struct buffer_head *ref_root_bh,
+				      struct buffer_head *ref_leaf_bh,
+				      struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct buffer_head *expand_bh = NULL;
+
+	if (ref_root_bh == ref_leaf_bh) {
+		/*
+		 * the old root bh hasn't been expanded to a b-tree,
+		 * so expand it first.
+		 */
+		ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
+						   &expand_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else {
+		expand_bh = ref_leaf_bh;
+		get_bh(expand_bh);
+	}
+
+
+	/* Now add a new refcount block into the tree.*/
+	ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
+					    expand_bh, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(expand_bh);
+	return ret;
+}
+
+/*
+ * Adjust the extent rec in b-tree representing ref_leaf_bh.
+ *
+ * Only called when we have inserted a new refcount rec at index 0
+ * which means ocfs2_extent_rec.e_cpos may need some change.
+ */
+static int ocfs2_adjust_refcount_rec(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     struct buffer_head *ref_leaf_bh,
+				     struct ocfs2_refcount_rec *rec)
+{
+	int ret = 0, i;
+	u32 new_cpos, old_cpos;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_refcount_block *rb =
+		(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	struct ocfs2_extent_list *el;
+
+	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
+		goto out;
+
+	rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	old_cpos = le32_to_cpu(rb->rf_cpos);
+	new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+	if (old_cpos <= new_cpos)
+		goto out;
+
+	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+	path = ocfs2_new_path_from_et(&et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(ci, path, old_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * 2 more credits, one for the leaf refcount block, one for
+	 * the extent block contains the extent rec.
+	 */
+	ret = ocfs2_extend_trans(handle, 2);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* change the leaf extent block first. */
+	el = path_leaf_el(path);
+
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
+		if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
+			break;
+
+	BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
+
+	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+
+	/* change the r_cpos in the leaf block. */
+	rb->rf_cpos = cpu_to_le32(new_cpos);
+
+	ocfs2_journal_dirty(handle, path_leaf_bh(path));
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+static int ocfs2_insert_refcount_rec(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     struct buffer_head *ref_leaf_bh,
+				     struct ocfs2_refcount_rec *rec,
+				     int index, int merge,
+				     struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+	struct buffer_head *new_bh = NULL;
+
+	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+	if (rf_list->rl_used == rf_list->rl_count) {
+		u64 cpos = le64_to_cpu(rec->r_cpos);
+		u32 len = le32_to_cpu(rec->r_clusters);
+
+		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+						 ref_leaf_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, NULL, &index,
+					     &new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ref_leaf_bh = new_bh;
+		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+		rf_list = &rb->rf_records;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (index < le16_to_cpu(rf_list->rl_used))
+		memmove(&rf_list->rl_recs[index + 1],
+			&rf_list->rl_recs[index],
+			(le16_to_cpu(rf_list->rl_used) - index) *
+			 sizeof(struct ocfs2_refcount_rec));
+
+	trace_ocfs2_insert_refcount_rec(
+		(unsigned long long)ref_leaf_bh->b_blocknr, index,
+		(unsigned long long)le64_to_cpu(rec->r_cpos),
+		le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
+
+	rf_list->rl_recs[index] = *rec;
+
+	le16_add_cpu(&rf_list->rl_used, 1);
+
+	if (merge)
+		ocfs2_refcount_rec_merge(rb, index);
+
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+	if (index == 0) {
+		ret = ocfs2_adjust_refcount_rec(handle, ci,
+						ref_root_bh,
+						ref_leaf_bh, rec);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+/*
+ * Split the refcount_rec indexed by "index" in ref_leaf_bh.
+ * This is much simple than our b-tree code.
+ * split_rec is the new refcount rec we want to insert.
+ * If split_rec->r_refcount > 0, we are changing the refcount(in case we
+ * increase refcount or decrease a refcount to non-zero).
+ * If split_rec->r_refcount == 0, we are punching a hole in current refcount
+ * rec( in case we decrease a refcount to zero).
+ */
+static int ocfs2_split_refcount_rec(handle_t *handle,
+				    struct ocfs2_caching_info *ci,
+				    struct buffer_head *ref_root_bh,
+				    struct buffer_head *ref_leaf_bh,
+				    struct ocfs2_refcount_rec *split_rec,
+				    int index, int merge,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, recs_need;
+	u32 len;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+	struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
+	struct ocfs2_refcount_rec *tail_rec = NULL;
+	struct buffer_head *new_bh = NULL;
+
+	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+	trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
+		le32_to_cpu(orig_rec->r_clusters),
+		le32_to_cpu(orig_rec->r_refcount),
+		le64_to_cpu(split_rec->r_cpos),
+		le32_to_cpu(split_rec->r_clusters),
+		le32_to_cpu(split_rec->r_refcount));
+
+	/*
+	 * If we just need to split the header or tail clusters,
+	 * no more recs are needed, just split is OK.
+	 * Otherwise we at least need one new recs.
+	 */
+	if (!split_rec->r_refcount &&
+	    (split_rec->r_cpos == orig_rec->r_cpos ||
+	     le64_to_cpu(split_rec->r_cpos) +
+	     le32_to_cpu(split_rec->r_clusters) ==
+	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+		recs_need = 0;
+	else
+		recs_need = 1;
+
+	/*
+	 * We need one more rec if we split in the middle and the new rec have
+	 * some refcount in it.
+	 */
+	if (split_rec->r_refcount &&
+	    (split_rec->r_cpos != orig_rec->r_cpos &&
+	     le64_to_cpu(split_rec->r_cpos) +
+	     le32_to_cpu(split_rec->r_clusters) !=
+	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+		recs_need++;
+
+	/* If the leaf block don't have enough record, expand it. */
+	if (le16_to_cpu(rf_list->rl_used) + recs_need >
+					 le16_to_cpu(rf_list->rl_count)) {
+		struct ocfs2_refcount_rec tmp_rec;
+		u64 cpos = le64_to_cpu(orig_rec->r_cpos);
+		len = le32_to_cpu(orig_rec->r_clusters);
+		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+						 ref_leaf_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * We have to re-get it since now cpos may be moved to
+		 * another leaf block.
+		 */
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &tmp_rec, &index,
+					     &new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ref_leaf_bh = new_bh;
+		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+		rf_list = &rb->rf_records;
+		orig_rec = &rf_list->rl_recs[index];
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We have calculated out how many new records we need and store
+	 * in recs_need, so spare enough space first by moving the records
+	 * after "index" to the end.
+	 */
+	if (index != le16_to_cpu(rf_list->rl_used) - 1)
+		memmove(&rf_list->rl_recs[index + 1 + recs_need],
+			&rf_list->rl_recs[index + 1],
+			(le16_to_cpu(rf_list->rl_used) - index - 1) *
+			 sizeof(struct ocfs2_refcount_rec));
+
+	len = (le64_to_cpu(orig_rec->r_cpos) +
+	      le32_to_cpu(orig_rec->r_clusters)) -
+	      (le64_to_cpu(split_rec->r_cpos) +
+	      le32_to_cpu(split_rec->r_clusters));
+
+	/*
+	 * If we have "len", the we will split in the tail and move it
+	 * to the end of the space we have just spared.
+	 */
+	if (len) {
+		tail_rec = &rf_list->rl_recs[index + recs_need];
+
+		memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
+		le64_add_cpu(&tail_rec->r_cpos,
+			     le32_to_cpu(tail_rec->r_clusters) - len);
+		tail_rec->r_clusters = cpu_to_le32(len);
+	}
+
+	/*
+	 * If the split pos isn't the same as the original one, we need to
+	 * split in the head.
+	 *
+	 * Note: We have the chance that split_rec.r_refcount = 0,
+	 * recs_need = 0 and len > 0, which means we just cut the head from
+	 * the orig_rec and in that case we have done some modification in
+	 * orig_rec above, so the check for r_cpos is faked.
+	 */
+	if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
+		len = le64_to_cpu(split_rec->r_cpos) -
+		      le64_to_cpu(orig_rec->r_cpos);
+		orig_rec->r_clusters = cpu_to_le32(len);
+		index++;
+	}
+
+	le16_add_cpu(&rf_list->rl_used, recs_need);
+
+	if (split_rec->r_refcount) {
+		rf_list->rl_recs[index] = *split_rec;
+		trace_ocfs2_split_refcount_rec_insert(
+			(unsigned long long)ref_leaf_bh->b_blocknr, index,
+			(unsigned long long)le64_to_cpu(split_rec->r_cpos),
+			le32_to_cpu(split_rec->r_clusters),
+			le32_to_cpu(split_rec->r_refcount));
+
+		if (merge)
+			ocfs2_refcount_rec_merge(rb, index);
+	}
+
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int __ocfs2_increase_refcount(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     u64 cpos, u32 len, int merge,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0, index;
+	struct buffer_head *ref_leaf_bh = NULL;
+	struct ocfs2_refcount_rec rec;
+	unsigned int set_len = 0;
+
+	trace_ocfs2_increase_refcount_begin(
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
+	     (unsigned long long)cpos, len);
+
+	while (len) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &rec, &index,
+					     &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		set_len = le32_to_cpu(rec.r_clusters);
+
+		/*
+		 * Here we may meet with 3 situations:
+		 *
+		 * 1. If we find an already existing record, and the length
+		 *    is the same, cool, we just need to increase the r_refcount
+		 *    and it is OK.
+		 * 2. If we find a hole, just insert it with r_refcount = 1.
+		 * 3. If we are in the middle of one extent record, split
+		 *    it.
+		 */
+		if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
+		    set_len <= len) {
+			trace_ocfs2_increase_refcount_change(
+				(unsigned long long)cpos, set_len,
+				le32_to_cpu(rec.r_refcount));
+			ret = ocfs2_change_refcount_rec(handle, ci,
+							ref_leaf_bh, index,
+							merge, 1);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else if (!rec.r_refcount) {
+			rec.r_refcount = cpu_to_le32(1);
+
+			trace_ocfs2_increase_refcount_insert(
+			     (unsigned long long)le64_to_cpu(rec.r_cpos),
+			     set_len);
+			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
+							ref_leaf_bh,
+							&rec, index,
+							merge, meta_ac);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else  {
+			set_len = min((u64)(cpos + len),
+				      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
+			rec.r_cpos = cpu_to_le64(cpos);
+			rec.r_clusters = cpu_to_le32(set_len);
+			le32_add_cpu(&rec.r_refcount, 1);
+
+			trace_ocfs2_increase_refcount_split(
+			     (unsigned long long)le64_to_cpu(rec.r_cpos),
+			     set_len, le32_to_cpu(rec.r_refcount));
+			ret = ocfs2_split_refcount_rec(handle, ci,
+						       ref_root_bh, ref_leaf_bh,
+						       &rec, index, merge,
+						       meta_ac, dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += set_len;
+		len -= set_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
+
+static int ocfs2_remove_refcount_extent(handle_t *handle,
+				struct ocfs2_caching_info *ci,
+				struct buffer_head *ref_root_bh,
+				struct buffer_head *ref_leaf_bh,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_extent_tree et;
+
+	BUG_ON(rb->rf_records.rl_used);
+
+	trace_ocfs2_remove_refcount_extent(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)ref_leaf_bh->b_blocknr,
+		le32_to_cpu(rb->rf_cpos));
+
+	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+	ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
+				  1, meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_from_cache(ci, ref_leaf_bh);
+
+	/*
+	 * add the freed block to the dealloc so that it will be freed
+	 * when we run dealloc.
+	 */
+	ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(rb->rf_suballoc_slot),
+					le64_to_cpu(rb->rf_suballoc_loc),
+					le64_to_cpu(rb->rf_blkno),
+					le16_to_cpu(rb->rf_suballoc_bit));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	le32_add_cpu(&rb->rf_clusters, -1);
+
+	/*
+	 * check whether we need to restore the root refcount block if
+	 * there is no leaf extent block at atll.
+	 */
+	if (!rb->rf_list.l_next_free_rec) {
+		BUG_ON(rb->rf_clusters);
+
+		trace_ocfs2_restore_refcount_block(
+		     (unsigned long long)ref_root_bh->b_blocknr);
+
+		rb->rf_flags = 0;
+		rb->rf_parent = 0;
+		rb->rf_cpos = 0;
+		memset(&rb->rf_records, 0, sb->s_blocksize -
+		       offsetof(struct ocfs2_refcount_block, rf_records));
+		rb->rf_records.rl_count =
+				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+	}
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+out:
+	return ret;
+}
+
+int ocfs2_increase_refcount(handle_t *handle,
+			    struct ocfs2_caching_info *ci,
+			    struct buffer_head *ref_root_bh,
+			    u64 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
+					 cpos, len, 1,
+					 meta_ac, dealloc);
+}
+
+static int ocfs2_decrease_refcount_rec(handle_t *handle,
+				struct ocfs2_caching_info *ci,
+				struct buffer_head *ref_root_bh,
+				struct buffer_head *ref_leaf_bh,
+				int index, u64 cpos, unsigned int len,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+
+	BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
+	BUG_ON(cpos + len >
+	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
+
+	trace_ocfs2_decrease_refcount_rec(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)cpos, len);
+
+	if (cpos == le64_to_cpu(rec->r_cpos) &&
+	    len == le32_to_cpu(rec->r_clusters))
+		ret = ocfs2_change_refcount_rec(handle, ci,
+						ref_leaf_bh, index, 1, -1);
+	else {
+		struct ocfs2_refcount_rec split = *rec;
+		split.r_cpos = cpu_to_le64(cpos);
+		split.r_clusters = cpu_to_le32(len);
+
+		le32_add_cpu(&split.r_refcount, -1);
+
+		ret = ocfs2_split_refcount_rec(handle, ci,
+					       ref_root_bh, ref_leaf_bh,
+					       &split, index, 1,
+					       meta_ac, dealloc);
+	}
+
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Remove the leaf refcount block if it contains no refcount record. */
+	if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
+		ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
+						   ref_leaf_bh, meta_ac,
+						   dealloc);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	return ret;
+}
+
+static int __ocfs2_decrease_refcount(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     u64 cpos, u32 len,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc,
+				     int delete)
+{
+	int ret = 0, index = 0;
+	struct ocfs2_refcount_rec rec;
+	unsigned int r_count = 0, r_len;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct buffer_head *ref_leaf_bh = NULL;
+
+	trace_ocfs2_decrease_refcount(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)cpos, len, delete);
+
+	while (len) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &rec, &index,
+					     &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		r_count = le32_to_cpu(rec.r_refcount);
+		BUG_ON(r_count == 0);
+		if (!delete)
+			BUG_ON(r_count > 1);
+
+		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
+			      le32_to_cpu(rec.r_clusters)) - cpos;
+
+		ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
+						  ref_leaf_bh, index,
+						  cpos, r_len,
+						  meta_ac, dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
+			ret = ocfs2_cache_cluster_dealloc(dealloc,
+					  ocfs2_clusters_to_blocks(sb, cpos),
+							  r_len);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += r_len;
+		len -= r_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
+
+/* Caller must hold refcount tree lock. */
+int ocfs2_decrease_refcount(struct inode *inode,
+			    handle_t *handle, u32 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    int delete)
+{
+	int ret;
+	u64 ref_blkno;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *tree;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_get_refcount_block(inode, &ref_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
+					cpos, len, meta_ac, dealloc, delete);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
+
+/*
+ * Mark the already-existing extent at cpos as refcounted for len clusters.
+ * This adds the refcount extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+static int ocfs2_mark_extent_refcounted(struct inode *inode,
+				struct ocfs2_extent_tree *et,
+				handle_t *handle, u32 cpos,
+				u32 len, u32 phys,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+
+	trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
+					   cpos, len, phys);
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = ocfs2_change_extent_flag(handle, et, cpos,
+				       len, phys, meta_ac, dealloc,
+				       OCFS2_EXT_REFCOUNTED, 0);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+/*
+ * Given some contiguous physical clusters, calculate what we need
+ * for modifying their refcount.
+ */
+static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
+					    struct ocfs2_caching_info *ci,
+					    struct buffer_head *ref_root_bh,
+					    u64 start_cpos,
+					    u32 clusters,
+					    int *meta_add,
+					    int *credits)
+{
+	int ret = 0, index, ref_blocks = 0, recs_add = 0;
+	u64 cpos = start_cpos;
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_rec rec;
+	struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
+	u32 len;
+
+	while (clusters) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, clusters, &rec,
+					     &index, &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (ref_leaf_bh != prev_bh) {
+			/*
+			 * Now we encounter a new leaf block, so calculate
+			 * whether we need to extend the old leaf.
+			 */
+			if (prev_bh) {
+				rb = (struct ocfs2_refcount_block *)
+							prev_bh->b_data;
+
+				if (le16_to_cpu(rb->rf_records.rl_used) +
+				    recs_add >
+				    le16_to_cpu(rb->rf_records.rl_count))
+					ref_blocks++;
+			}
+
+			recs_add = 0;
+			*credits += 1;
+			brelse(prev_bh);
+			prev_bh = ref_leaf_bh;
+			get_bh(prev_bh);
+		}
+
+		trace_ocfs2_calc_refcount_meta_credits_iterate(
+				recs_add, (unsigned long long)cpos, clusters,
+				(unsigned long long)le64_to_cpu(rec.r_cpos),
+				le32_to_cpu(rec.r_clusters),
+				le32_to_cpu(rec.r_refcount), index);
+
+		len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
+			  le32_to_cpu(rec.r_clusters)) - cpos;
+		/*
+		 * We record all the records which will be inserted to the
+		 * same refcount block, so that we can tell exactly whether
+		 * we need a new refcount block or not.
+		 *
+		 * If we will insert a new one, this is easy and only happens
+		 * during adding refcounted flag to the extent, so we don't
+		 * have a chance of spliting. We just need one record.
+		 *
+		 * If the refcount rec already exists, that would be a little
+		 * complicated. we may have to:
+		 * 1) split at the beginning if the start pos isn't aligned.
+		 *    we need 1 more record in this case.
+		 * 2) split int the end if the end pos isn't aligned.
+		 *    we need 1 more record in this case.
+		 * 3) split in the middle because of file system fragmentation.
+		 *    we need 2 more records in this case(we can't detect this
+		 *    beforehand, so always think of the worst case).
+		 */
+		if (rec.r_refcount) {
+			recs_add += 2;
+			/* Check whether we need a split at the beginning. */
+			if (cpos == start_cpos &&
+			    cpos != le64_to_cpu(rec.r_cpos))
+				recs_add++;
+
+			/* Check whether we need a split in the end. */
+			if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
+			    le32_to_cpu(rec.r_clusters))
+				recs_add++;
+		} else
+			recs_add++;
+
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+		clusters -= len;
+		cpos += len;
+	}
+
+	if (prev_bh) {
+		rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
+
+		if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
+		    le16_to_cpu(rb->rf_records.rl_count))
+			ref_blocks++;
+
+		*credits += 1;
+	}
+
+	if (!ref_blocks)
+		goto out;
+
+	*meta_add += ref_blocks;
+	*credits += ref_blocks;
+
+	/*
+	 * So we may need ref_blocks to insert into the tree.
+	 * That also means we need to change the b-tree and add that number
+	 * of records since we never merge them.
+	 * We need one more block for expansion since the new created leaf
+	 * block is also full and needs split.
+	 */
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
+		struct ocfs2_extent_tree et;
+
+		ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+		*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
+		*credits += ocfs2_calc_extend_credits(sb,
+						      et.et_root_el);
+	} else {
+		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+		*meta_add += 1;
+	}
+
+out:
+
+	trace_ocfs2_calc_refcount_meta_credits(
+		(unsigned long long)start_cpos, clusters,
+		*meta_add, *credits);
+	brelse(ref_leaf_bh);
+	brelse(prev_bh);
+	return ret;
+}
+
+/*
+ * For refcount tree, we will decrease some contiguous clusters
+ * refcount count, so just go through it to see how many blocks
+ * we gonna touch and whether we need to create new blocks.
+ *
+ * Normally the refcount blocks store these refcount should be
+ * contiguous also, so that we can get the number easily.
+ * We will at most add split 2 refcount records and 2 more
+ * refcount blocks, so just check it in a rough way.
+ *
+ * Caller must hold refcount tree lock.
+ */
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+					  u64 refcount_loc,
+					  u64 phys_blkno,
+					  u32 clusters,
+					  int *credits,
+					  int *ref_blocks)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *tree;
+	u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		ret = -EROFS;
+		goto out;
+	}
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
+				      refcount_loc, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+					       &tree->rf_ci,
+					       ref_root_bh,
+					       start_cpos, clusters,
+					       ref_blocks, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
+
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
+
+#define	MAX_CONTIG_BYTES	1048576
+
+static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
+{
+	return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
+}
+
+static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
+{
+	return ~(ocfs2_cow_contig_clusters(sb) - 1);
+}
+
+/*
+ * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
+ * find an offset (start + (n * contig_clusters)) that is closest to cpos
+ * while still being less than or equal to it.
+ *
+ * The goal is to break the extent at a multiple of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
+						 unsigned int start,
+						 unsigned int cpos)
+{
+	BUG_ON(start > cpos);
+
+	return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
+}
+
+/*
+ * Given a cluster count of len, pad it out so that it is a multiple
+ * of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
+						  unsigned int len)
+{
+	unsigned int padded =
+		(len + (ocfs2_cow_contig_clusters(sb) - 1)) &
+		ocfs2_cow_contig_mask(sb);
+
+	/* Did we wrap? */
+	if (padded < len)
+		padded = UINT_MAX;
+
+	return padded;
+}
+
+/*
+ * Calculate out the start and number of virtual clusters we need to to CoW.
+ *
+ * cpos is vitual start cluster position we want to do CoW in a
+ * file and write_len is the cluster length.
+ * max_cpos is the place where we want to stop CoW intentionally.
+ *
+ * Normal we will start CoW from the beginning of extent record cotaining cpos.
+ * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
+ * get good I/O from the resulting extent tree.
+ */
+static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
+					   struct ocfs2_extent_list *el,
+					   u32 cpos,
+					   u32 write_len,
+					   u32 max_cpos,
+					   u32 *cow_start,
+					   u32 *cow_len)
+{
+	int ret = 0;
+	int tree_height = le16_to_cpu(el->l_tree_depth), i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb = NULL;
+	struct ocfs2_extent_rec *rec;
+	unsigned int want_clusters, rec_end = 0;
+	int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
+	int leaf_clusters;
+
+	BUG_ON(cpos + write_len > max_cpos);
+
+	if (tree_height > 0) {
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	*cow_len = 0;
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+
+		if (ocfs2_is_empty_extent(rec)) {
+			mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
+					"index %d\n", inode->i_ino, i);
+			continue;
+		}
+
+		if (le32_to_cpu(rec->e_cpos) +
+		    le16_to_cpu(rec->e_leaf_clusters) <= cpos)
+			continue;
+
+		if (*cow_len == 0) {
+			/*
+			 * We should find a refcounted record in the
+			 * first pass.
+			 */
+			BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
+			*cow_start = le32_to_cpu(rec->e_cpos);
+		}
+
+		/*
+		 * If we encounter a hole, a non-refcounted record or
+		 * pass the max_cpos, stop the search.
+		 */
+		if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
+		    (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
+		    (max_cpos <= le32_to_cpu(rec->e_cpos)))
+			break;
+
+		leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
+		rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
+		if (rec_end > max_cpos) {
+			rec_end = max_cpos;
+			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
+		}
+
+		/*
+		 * How many clusters do we actually need from
+		 * this extent?  First we see how many we actually
+		 * need to complete the write.  If that's smaller
+		 * than contig_clusters, we try for contig_clusters.
+		 */
+		if (!*cow_len)
+			want_clusters = write_len;
+		else
+			want_clusters = (cpos + write_len) -
+				(*cow_start + *cow_len);
+		if (want_clusters < contig_clusters)
+			want_clusters = contig_clusters;
+
+		/*
+		 * If the write does not cover the whole extent, we
+		 * need to calculate how we're going to split the extent.
+		 * We try to do it on contig_clusters boundaries.
+		 *
+		 * Any extent smaller than contig_clusters will be
+		 * CoWed in its entirety.
+		 */
+		if (leaf_clusters <= contig_clusters)
+			*cow_len += leaf_clusters;
+		else if (*cow_len || (*cow_start == cpos)) {
+			/*
+			 * This extent needs to be CoW'd from its
+			 * beginning, so all we have to do is compute
+			 * how many clusters to grab.  We align
+			 * want_clusters to the edge of contig_clusters
+			 * to get better I/O.
+			 */
+			want_clusters = ocfs2_cow_align_length(inode->i_sb,
+							       want_clusters);
+
+			if (leaf_clusters < want_clusters)
+				*cow_len += leaf_clusters;
+			else
+				*cow_len += want_clusters;
+		} else if ((*cow_start + contig_clusters) >=
+			   (cpos + write_len)) {
+			/*
+			 * Breaking off contig_clusters at the front
+			 * of the extent will cover our write.  That's
+			 * easy.
+			 */
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= contig_clusters) {
+			/*
+			 * Breaking off contig_clusters at the tail of
+			 * this extent will cover cpos.
+			 */
+			*cow_start = rec_end - contig_clusters;
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= want_clusters) {
+			/*
+			 * While we can't fit the entire write in this
+			 * extent, we know that the write goes from cpos
+			 * to the end of the extent.  Break that off.
+			 * We try to break it at some multiple of
+			 * contig_clusters from the front of the extent.
+			 * Failing that (ie, cpos is within
+			 * contig_clusters of the front), we'll CoW the
+			 * entire extent.
+			 */
+			*cow_start = ocfs2_cow_align_start(inode->i_sb,
+							   *cow_start, cpos);
+			*cow_len = rec_end - *cow_start;
+		} else {
+			/*
+			 * Ok, the entire write lives in the middle of
+			 * this extent.  Let's try to slice the extent up
+			 * nicely.  Optimally, our CoW region starts at
+			 * m*contig_clusters from the beginning of the
+			 * extent and goes for n*contig_clusters,
+			 * covering the entire write.
+			 */
+			*cow_start = ocfs2_cow_align_start(inode->i_sb,
+							   *cow_start, cpos);
+
+			want_clusters = (cpos + write_len) - *cow_start;
+			want_clusters = ocfs2_cow_align_length(inode->i_sb,
+							       want_clusters);
+			if (*cow_start + want_clusters <= rec_end)
+				*cow_len = want_clusters;
+			else
+				*cow_len = rec_end - *cow_start;
+		}
+
+		/* Have we covered our entire write yet? */
+		if ((*cow_start + *cow_len) >= (cpos + write_len))
+			break;
+
+		/*
+		 * If we reach the end of the extent block and don't get enough
+		 * clusters, continue with the next extent block if possible.
+		 */
+		if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
+		    eb && eb->h_next_leaf_blk) {
+			brelse(eb_bh);
+			eb_bh = NULL;
+
+			ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+					       le64_to_cpu(eb->h_next_leaf_blk),
+					       &eb_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+			el = &eb->h_list;
+			i = -1;
+		}
+	}
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Prepare meta_ac, data_ac and calculate credits when we want to add some
+ * num_clusters in data_tree "et" and change the refcount for the old
+ * clusters(starting form p_cluster) in the refcount tree.
+ *
+ * Note:
+ * 1. since we may split the old tree, so we at most will need num_clusters + 2
+ *    more new leaf records.
+ * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
+ *    just give data_ac = NULL.
+ */
+static int ocfs2_lock_refcount_allocators(struct super_block *sb,
+					u32 p_cluster, u32 num_clusters,
+					struct ocfs2_extent_tree *et,
+					struct ocfs2_caching_info *ref_ci,
+					struct buffer_head *ref_root_bh,
+					struct ocfs2_alloc_context **meta_ac,
+					struct ocfs2_alloc_context **data_ac,
+					int *credits)
+{
+	int ret = 0, meta_add = 0;
+	int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (num_free_extents < num_clusters + 2)
+		meta_add =
+			ocfs2_extend_meta_needed(et->et_root_el);
+
+	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
+
+	ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
+					       p_cluster, num_clusters,
+					       &meta_add, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
+	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
+						meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (data_ac) {
+		ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
+					     data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
+{
+	BUG_ON(buffer_dirty(bh));
+
+	clear_buffer_mapped(bh);
+
+	return 0;
+}
+
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+				     struct inode *inode,
+				     u32 cpos, u32 old_cluster,
+				     u32 new_cluster, u32 new_len)
+{
+	int ret = 0, partial;
+	struct super_block *sb = inode->i_sb;
+	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+	struct page *page;
+	pgoff_t page_index;
+	unsigned int from, to, readahead_pages;
+	loff_t offset, end, map_end;
+	struct address_space *mapping = inode->i_mapping;
+
+	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+					       new_cluster, new_len);
+
+	readahead_pages =
+		(ocfs2_cow_contig_clusters(sb) <<
+		 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
+	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+	/*
+	 * We only duplicate pages until we reach the page contains i_size - 1.
+	 * So trim 'end' to i_size.
+	 */
+	if (end > i_size_read(inode))
+		end = i_size_read(inode);
+
+	while (offset < end) {
+		page_index = offset >> PAGE_CACHE_SHIFT;
+		map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+		if (map_end > end)
+			map_end = end;
+
+		/* from, to is the offset within the page. */
+		from = offset & (PAGE_CACHE_SIZE - 1);
+		to = PAGE_CACHE_SIZE;
+		if (map_end & (PAGE_CACHE_SIZE - 1))
+			to = map_end & (PAGE_CACHE_SIZE - 1);
+
+		page = find_or_create_page(mapping, page_index, GFP_NOFS);
+		if (!page) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
+		 * can't be dirtied before we CoW it out.
+		 */
+		if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
+			BUG_ON(PageDirty(page));
+
+		if (!PageUptodate(page)) {
+			ret = block_read_full_page(page, ocfs2_get_block);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+			lock_page(page);
+		}
+
+		if (page_has_buffers(page)) {
+			ret = walk_page_buffers(handle, page_buffers(page),
+						from, to, &partial,
+						ocfs2_clear_cow_buffer);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+		}
+
+		ocfs2_map_and_dirty_page(inode,
+					 handle, from, to,
+					 page, 0, &new_block);
+		mark_page_accessed(page);
+unlock:
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+				    struct inode *inode,
+				    u32 cpos, u32 old_cluster,
+				    u32 new_cluster, u32 new_len)
+{
+	int ret = 0;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_caching_info *ci = INODE_CACHE(inode);
+	int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
+	u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
+	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *old_bh = NULL;
+	struct buffer_head *new_bh = NULL;
+
+	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+					       new_cluster, new_len);
+
+	for (i = 0; i < blocks; i++, old_block++, new_block++) {
+		new_bh = sb_getblk(osb->sb, new_block);
+		if (new_bh == NULL) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+		ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_journal_access(handle, ci, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
+		ocfs2_journal_dirty(handle, new_bh);
+
+		brelse(new_bh);
+		brelse(old_bh);
+		new_bh = NULL;
+		old_bh = NULL;
+	}
+
+	brelse(new_bh);
+	brelse(old_bh);
+	return ret;
+}
+
+static int ocfs2_clear_ext_refcount(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, u32 p_cluster, u32 len,
+				    unsigned int ext_flags,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, index;
+	struct ocfs2_extent_rec replace_rec;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
+
+	trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
+				       cpos, len, p_cluster, ext_flags);
+
+	memset(&replace_rec, 0, sizeof(replace_rec));
+	replace_rec.e_cpos = cpu_to_le32(cpos);
+	replace_rec.e_leaf_clusters = cpu_to_le16(len);
+	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
+								   p_cluster));
+	replace_rec.e_flags = ext_flags;
+	replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1) {
+		ocfs2_error(sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ino, cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = ocfs2_split_extent(handle, et, path, index,
+				 &replace_rec, meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+static int ocfs2_replace_clusters(handle_t *handle,
+				  struct ocfs2_cow_context *context,
+				  u32 cpos, u32 old,
+				  u32 new, u32 len,
+				  unsigned int ext_flags)
+{
+	int ret;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
+	u64 ino = ocfs2_metadata_cache_owner(ci);
+
+	trace_ocfs2_replace_clusters((unsigned long long)ino,
+				     cpos, old, new, len, ext_flags);
+
+	/*If the old clusters is unwritten, no need to duplicate. */
+	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+		ret = context->cow_duplicate_clusters(handle, context->inode,
+						      cpos, old, new, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
+				       cpos, new, len, ext_flags,
+				       context->meta_ac, &context->dealloc);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+			     struct inode *inode,
+			     u32 cpos, u32 num_clusters)
+{
+	int ret = 0;
+	loff_t offset, end, map_end;
+	pgoff_t page_index;
+	struct page *page;
+
+	if (ocfs2_should_order_data(inode))
+		return 0;
+
+	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+	end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
+
+	ret = filemap_fdatawrite_range(inode->i_mapping,
+				       offset, end - 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	while (offset < end) {
+		page_index = offset >> PAGE_CACHE_SHIFT;
+		map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+		if (map_end > end)
+			map_end = end;
+
+		page = find_or_create_page(inode->i_mapping,
+					   page_index, GFP_NOFS);
+		BUG_ON(!page);
+
+		wait_on_page_writeback(page);
+		if (PageError(page)) {
+			ret = -EIO;
+			mlog_errno(ret);
+		} else
+			mark_page_accessed(page);
+
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
+				 u32 v_cluster, u32 *p_cluster,
+				 u32 *num_clusters,
+				 unsigned int *extent_flags)
+{
+	return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
+				  num_clusters, extent_flags);
+}
+
+static int ocfs2_make_clusters_writable(struct super_block *sb,
+					struct ocfs2_cow_context *context,
+					u32 cpos, u32 p_cluster,
+					u32 num_clusters, unsigned int e_flags)
+{
+	int ret, delete, index, credits =  0;
+	u32 new_bit, new_len, orig_num_clusters;
+	unsigned int set_len;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	handle_t *handle;
+	struct buffer_head *ref_leaf_bh = NULL;
+	struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
+	struct ocfs2_refcount_rec rec;
+
+	trace_ocfs2_make_clusters_writable(cpos, p_cluster,
+					   num_clusters, e_flags);
+
+	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
+					     &context->data_et,
+					     ref_ci,
+					     context->ref_root_bh,
+					     &context->meta_ac,
+					     &context->data_ac, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (context->post_refcount)
+		credits += context->post_refcount->credits;
+
+	credits += context->extra_credits;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	orig_num_clusters = num_clusters;
+
+	while (num_clusters) {
+		ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
+					     p_cluster, num_clusters,
+					     &rec, &index, &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		BUG_ON(!rec.r_refcount);
+		set_len = min((u64)p_cluster + num_clusters,
+			      le64_to_cpu(rec.r_cpos) +
+			      le32_to_cpu(rec.r_clusters)) - p_cluster;
+
+		/*
+		 * There are many different situation here.
+		 * 1. If refcount == 1, remove the flag and don't COW.
+		 * 2. If refcount > 1, allocate clusters.
+		 *    Here we may not allocate r_len once at a time, so continue
+		 *    until we reach num_clusters.
+		 */
+		if (le32_to_cpu(rec.r_refcount) == 1) {
+			delete = 0;
+			ret = ocfs2_clear_ext_refcount(handle,
+						       &context->data_et,
+						       cpos, p_cluster,
+						       set_len, e_flags,
+						       context->meta_ac,
+						       &context->dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+		} else {
+			delete = 1;
+
+			ret = __ocfs2_claim_clusters(handle,
+						     context->data_ac,
+						     1, set_len,
+						     &new_bit, &new_len);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			ret = ocfs2_replace_clusters(handle, context,
+						     cpos, p_cluster, new_bit,
+						     new_len, e_flags);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+			set_len = new_len;
+		}
+
+		ret = __ocfs2_decrease_refcount(handle, ref_ci,
+						context->ref_root_bh,
+						p_cluster, set_len,
+						context->meta_ac,
+						&context->dealloc, delete);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		cpos += set_len;
+		p_cluster += set_len;
+		num_clusters -= set_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+	}
+
+	/* handle any post_cow action. */
+	if (context->post_refcount && context->post_refcount->func) {
+		ret = context->post_refcount->func(context->inode, handle,
+						context->post_refcount->para);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 */
+	if (context->get_clusters == ocfs2_di_get_clusters) {
+		ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
+					       orig_num_clusters);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (context->data_ac) {
+		ocfs2_free_alloc_context(context->data_ac);
+		context->data_ac = NULL;
+	}
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+	brelse(ref_leaf_bh);
+
+	return ret;
+}
+
+static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
+{
+	int ret = 0;
+	struct inode *inode = context->inode;
+	u32 cow_start = context->cow_start, cow_len = context->cow_len;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		return -EROFS;
+	}
+
+	ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+	while (cow_len) {
+		ret = context->get_clusters(context, cow_start, &p_cluster,
+					    &num_clusters, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
+
+		if (cow_len < num_clusters)
+			num_clusters = cow_len;
+
+		ret = ocfs2_make_clusters_writable(inode->i_sb, context,
+						   cow_start, p_cluster,
+						   num_clusters, ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		cow_len -= num_clusters;
+		cow_start += num_clusters;
+	}
+
+	if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &context->dealloc);
+	}
+
+	return ret;
+}
+
+/*
+ * Starting at cpos, try to CoW write_len clusters.  Don't CoW
+ * past max_cpos.  This will stop when it runs into a hole or an
+ * unrefcounted extent.
+ */
+static int ocfs2_refcount_cow_hunk(struct inode *inode,
+				   struct buffer_head *di_bh,
+				   u32 cpos, u32 write_len, u32 max_cpos)
+{
+	int ret;
+	u32 cow_start = 0, cow_len = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *ref_tree;
+	struct ocfs2_cow_context *context = NULL;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
+					      cpos, write_len, max_cpos,
+					      &cow_start, &cow_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
+				      cpos, write_len, max_cpos,
+				      cow_start, cow_len);
+
+	BUG_ON(cow_len == 0);
+
+	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+	if (!context) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	context->inode = inode;
+	context->cow_start = cow_start;
+	context->cow_len = cow_len;
+	context->ref_tree = ref_tree;
+	context->ref_root_bh = ref_root_bh;
+	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
+	context->get_clusters = ocfs2_di_get_clusters;
+
+	ocfs2_init_dinode_extent_tree(&context->data_et,
+				      INODE_CACHE(inode), di_bh);
+
+	ret = ocfs2_replace_cow(context);
+	if (ret)
+		mlog_errno(ret);
+
+	/*
+	 * truncate the extent map here since no matter whether we meet with
+	 * any error during the action, we shouldn't trust cached extent map
+	 * any more.
+	 */
+	ocfs2_extent_map_trunc(inode, cow_start);
+
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+out:
+	kfree(context);
+	return ret;
+}
+
+/*
+ * CoW any and all clusters between cpos and cpos+write_len.
+ * Don't CoW past max_cpos.  If this returns successfully, all
+ * clusters between cpos and cpos+write_len are safe to modify.
+ */
+int ocfs2_refcount_cow(struct inode *inode,
+		       struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len, u32 max_cpos)
+{
+	int ret = 0;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+
+	while (write_len) {
+		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		if (write_len < num_clusters)
+			num_clusters = write_len;
+
+		if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+			ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+						      num_clusters, max_cpos);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		write_len -= num_clusters;
+		cpos += num_clusters;
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
+					  u32 v_cluster, u32 *p_cluster,
+					  u32 *num_clusters,
+					  unsigned int *extent_flags)
+{
+	struct inode *inode = context->inode;
+	struct ocfs2_xattr_value_root *xv = context->cow_object;
+
+	return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
+					num_clusters, &xv->xr_list,
+					extent_flags);
+}
+
+/*
+ * Given a xattr value root, calculate the most meta/credits we need for
+ * refcount tree change if we truncate it to 0.
+ */
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+				       struct ocfs2_caching_info *ref_ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_xattr_value_root *xv,
+				       int *meta_add, int *credits)
+{
+	int ret = 0, index, ref_blocks = 0;
+	u32 p_cluster, num_clusters;
+	u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_rec rec;
+	struct buffer_head *ref_leaf_bh = NULL;
+
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list,
+					       NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		cpos += num_clusters;
+
+		while (num_clusters) {
+			ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
+						     p_cluster, num_clusters,
+						     &rec, &index,
+						     &ref_leaf_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			BUG_ON(!rec.r_refcount);
+
+			rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+			/*
+			 * We really don't know whether the other clusters is in
+			 * this refcount block or not, so just take the worst
+			 * case that all the clusters are in this block and each
+			 * one will split a refcount rec, so totally we need
+			 * clusters * 2 new refcount rec.
+			 */
+			if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
+			    le16_to_cpu(rb->rf_records.rl_count))
+				ref_blocks++;
+
+			*credits += 1;
+			brelse(ref_leaf_bh);
+			ref_leaf_bh = NULL;
+
+			if (num_clusters <= le32_to_cpu(rec.r_clusters))
+				break;
+			else
+				num_clusters -= le32_to_cpu(rec.r_clusters);
+			p_cluster += num_clusters;
+		}
+	}
+
+	*meta_add += ref_blocks;
+	if (!ref_blocks)
+		goto out;
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+	else {
+		struct ocfs2_extent_tree et;
+
+		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
+		*credits += ocfs2_calc_extend_credits(inode->i_sb,
+						      et.et_root_el);
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
+
+/*
+ * Do CoW for xattr.
+ */
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+			     struct ocfs2_dinode *di,
+			     struct ocfs2_xattr_value_buf *vb,
+			     struct ocfs2_refcount_tree *ref_tree,
+			     struct buffer_head *ref_root_bh,
+			     u32 cpos, u32 write_len,
+			     struct ocfs2_post_refcount *post)
+{
+	int ret;
+	struct ocfs2_xattr_value_root *xv = vb->vb_xv;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_cow_context *context = NULL;
+	u32 cow_start, cow_len;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
+					      cpos, write_len, UINT_MAX,
+					      &cow_start, &cow_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(cow_len == 0);
+
+	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+	if (!context) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	context->inode = inode;
+	context->cow_start = cow_start;
+	context->cow_len = cow_len;
+	context->ref_tree = ref_tree;
+	context->ref_root_bh = ref_root_bh;
+	context->cow_object = xv;
+
+	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
+	/* We need the extra credits for duplicate_clusters by jbd. */
+	context->extra_credits =
+		ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
+	context->get_clusters = ocfs2_xattr_value_get_clusters;
+	context->post_refcount = post;
+
+	ocfs2_init_xattr_value_extent_tree(&context->data_et,
+					   INODE_CACHE(inode), vb);
+
+	ret = ocfs2_replace_cow(context);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	kfree(context);
+	return ret;
+}
+
+/*
+ * Insert a new extent into refcount tree and mark a extent rec
+ * as refcounted in the dinode tree.
+ */
+int ocfs2_add_refcount_flag(struct inode *inode,
+			    struct ocfs2_extent_tree *data_et,
+			    struct ocfs2_caching_info *ref_ci,
+			    struct buffer_head *ref_root_bh,
+			    u32 cpos, u32 p_cluster, u32 num_clusters,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    struct ocfs2_post_refcount *post)
+{
+	int ret;
+	handle_t *handle;
+	int credits = 1, ref_blocks = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+					       ref_ci, ref_root_bh,
+					       p_cluster, num_clusters,
+					       &ref_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_add_refcount_flag(ref_blocks, credits);
+
+	if (ref_blocks) {
+		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+							ref_blocks, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (post)
+		credits += post->credits;
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
+					   cpos, num_clusters, p_cluster,
+					   meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+					p_cluster, num_clusters, 0,
+					meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (post && post->func) {
+		ret = post->func(inode, handle, post->para);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_change_ctime(struct inode *inode,
+			      struct buffer_head *di_bh)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+static int ocfs2_attach_refcount_tree(struct inode *inode,
+				      struct buffer_head *di_bh)
+{
+	int ret, data_changed = 0;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_refcount_tree *ref_tree;
+	unsigned int ext_flags;
+	loff_t size;
+	u32 cpos, num_clusters, clusters, p_cluster;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree di_et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+		ret = ocfs2_create_refcount_tree(inode, di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	BUG_ON(!di->i_refcount_loc);
+	ret = ocfs2_lock_refcount_tree(osb,
+				       le64_to_cpu(di->i_refcount_loc), 1,
+				       &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		goto attach_xattr;
+
+	ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
+
+	size = i_size_read(inode);
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto unlock;
+		}
+		if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+			ret = ocfs2_add_refcount_flag(inode, &di_et,
+						      &ref_tree->rf_ci,
+						      ref_root_bh, cpos,
+						      p_cluster, num_clusters,
+						      &dealloc, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+
+			data_changed = 1;
+		}
+		cpos += num_clusters;
+	}
+
+attach_xattr:
+	if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+		ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
+						       &ref_tree->rf_ci,
+						       ref_root_bh,
+						       &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto unlock;
+		}
+	}
+
+	if (data_changed) {
+		ret = ocfs2_change_ctime(inode, di_bh);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+unlock:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+
+	if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
+	}
+out:
+	/*
+	 * Empty the extent map so that we may get the right extent
+	 * record from the disk.
+	 */
+	ocfs2_extent_map_trunc(inode, 0);
+
+	return ret;
+}
+
+static int ocfs2_add_refcounted_extent(struct inode *inode,
+				   struct ocfs2_extent_tree *et,
+				   struct ocfs2_caching_info *ref_ci,
+				   struct buffer_head *ref_root_bh,
+				   u32 cpos, u32 p_cluster, u32 num_clusters,
+				   unsigned int ext_flags,
+				   struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	handle_t *handle;
+	int credits = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_refcount_allocators(inode->i_sb,
+					     p_cluster, num_clusters,
+					     et, ref_ci,
+					     ref_root_bh, &meta_ac,
+					     NULL, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_insert_extent(handle, et, cpos,
+			ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
+			num_clusters, ext_flags, meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+				      p_cluster, num_clusters,
+				      meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_duplicate_inline_data(struct inode *s_inode,
+				       struct buffer_head *s_bh,
+				       struct inode *t_inode,
+				       struct buffer_head *t_bh)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
+
+	BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
+	memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
+	       le16_to_cpu(s_di->id2.i_data.id_count));
+	spin_lock(&OCFS2_I(t_inode)->ip_lock);
+	OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+	t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+
+	ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+static int ocfs2_duplicate_extent_list(struct inode *s_inode,
+				struct inode *t_inode,
+				struct buffer_head *t_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	u32 p_cluster, num_clusters, clusters, cpos;
+	loff_t size;
+	unsigned int ext_flags;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
+
+	size = i_size_read(s_inode);
+	clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		if (p_cluster) {
+			ret = ocfs2_add_refcounted_extent(t_inode, &et,
+							  ref_ci, ref_root_bh,
+							  cpos, p_cluster,
+							  num_clusters,
+							  ext_flags,
+							  dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += num_clusters;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * change the new file's attributes to the src.
+ *
+ * reflink creates a snapshot of a file, that means the attributes
+ * must be identical except for three exceptions - nlink, ino, and ctime.
+ */
+static int ocfs2_complete_reflink(struct inode *s_inode,
+				  struct buffer_head *s_bh,
+				  struct inode *t_inode,
+				  struct buffer_head *t_bh,
+				  bool preserve)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
+	loff_t size = i_size_read(s_inode);
+
+	handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&OCFS2_I(t_inode)->ip_lock);
+	OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
+	OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
+	OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
+	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+	i_size_write(t_inode, size);
+	t_inode->i_blocks = s_inode->i_blocks;
+
+	di->i_xattr_inline_size = s_di->i_xattr_inline_size;
+	di->i_clusters = s_di->i_clusters;
+	di->i_size = s_di->i_size;
+	di->i_dyn_features = s_di->i_dyn_features;
+	di->i_attr = s_di->i_attr;
+
+	if (preserve) {
+		t_inode->i_uid = s_inode->i_uid;
+		t_inode->i_gid = s_inode->i_gid;
+		t_inode->i_mode = s_inode->i_mode;
+		di->i_uid = s_di->i_uid;
+		di->i_gid = s_di->i_gid;
+		di->i_mode = s_di->i_mode;
+
+		/*
+		 * update time.
+		 * we want mtime to appear identical to the source and
+		 * update ctime.
+		 */
+		t_inode->i_ctime = CURRENT_TIME;
+
+		di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
+		di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+
+		t_inode->i_mtime = s_inode->i_mtime;
+		di->i_mtime = s_di->i_mtime;
+		di->i_mtime_nsec = s_di->i_mtime_nsec;
+	}
+
+	ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
+	return ret;
+}
+
+static int ocfs2_create_reflink_node(struct inode *s_inode,
+				     struct buffer_head *s_bh,
+				     struct inode *t_inode,
+				     struct buffer_head *t_bh,
+				     bool preserve)
+{
+	int ret;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_refcount_tree *ref_tree;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+				      le64_to_cpu(di->i_refcount_loc));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
+						  t_inode, t_bh);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
+					  &ref_tree->rf_ci, ref_root_bh,
+					  &dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock_refcount;
+	}
+
+out_unlock_refcount:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+out:
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
+	}
+
+	return ret;
+}
+
+static int __ocfs2_reflink(struct dentry *old_dentry,
+			   struct buffer_head *old_bh,
+			   struct inode *new_inode,
+			   bool preserve)
+{
+	int ret;
+	struct inode *inode = d_inode(old_dentry);
+	struct buffer_head *new_bh = NULL;
+
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = filemap_fdatawrite(inode->i_mapping);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_attach_refcount_tree(inode, old_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+	ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
+				      OI_LS_REFLINK_TARGET);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_create_reflink_node(inode, old_bh,
+					new_inode, new_bh, preserve);
+	if (ret) {
+		mlog_errno(ret);
+		goto inode_unlock;
+	}
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+		ret = ocfs2_reflink_xattrs(inode, old_bh,
+					   new_inode, new_bh,
+					   preserve);
+		if (ret) {
+			mlog_errno(ret);
+			goto inode_unlock;
+		}
+	}
+
+	ret = ocfs2_complete_reflink(inode, old_bh,
+				     new_inode, new_bh, preserve);
+	if (ret)
+		mlog_errno(ret);
+
+inode_unlock:
+	ocfs2_inode_unlock(new_inode, 1);
+	brelse(new_bh);
+out_unlock:
+	mutex_unlock(&new_inode->i_mutex);
+out:
+	if (!ret) {
+		ret = filemap_fdatawait(inode->i_mapping);
+		if (ret)
+			mlog_errno(ret);
+	}
+	return ret;
+}
+
+static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
+			 struct dentry *new_dentry, bool preserve)
+{
+	int error;
+	struct inode *inode = d_inode(old_dentry);
+	struct buffer_head *old_bh = NULL;
+	struct inode *new_orphan_inode = NULL;
+	struct posix_acl *default_acl, *acl;
+	umode_t mode;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	mode = inode->i_mode;
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error) {
+		mlog_errno(error);
+		return error;
+	}
+
+	error = ocfs2_create_inode_in_orphan(dir, mode,
+					     &new_orphan_inode);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = ocfs2_rw_lock(inode, 1);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = ocfs2_inode_lock(inode, &old_bh, 1);
+	if (error) {
+		mlog_errno(error);
+		ocfs2_rw_unlock(inode, 1);
+		goto out;
+	}
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	error = __ocfs2_reflink(old_dentry, old_bh,
+				new_orphan_inode, preserve);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+	ocfs2_inode_unlock(inode, 1);
+	ocfs2_rw_unlock(inode, 1);
+	brelse(old_bh);
+
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	/* If the security isn't preserved, we need to re-initialize them. */
+	if (!preserve) {
+		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+						    &new_dentry->d_name,
+						    default_acl, acl);
+		if (error)
+			mlog_errno(error);
+	}
+out:
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
+	if (!error) {
+		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+						       new_dentry);
+		if (error)
+			mlog_errno(error);
+	}
+
+	if (new_orphan_inode) {
+		/*
+		 * We need to open_unlock the inode no matter whether we
+		 * succeed or not, so that other nodes can delete it later.
+		 */
+		ocfs2_open_unlock(new_orphan_inode);
+		if (error)
+			iput(new_orphan_inode);
+	}
+
+	return error;
+}
+
+/*
+ * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
+ * sys_reflink().  This will go away when vfs_reflink() exists in
+ * fs/namei.c.
+ */
+
+/* copied from may_create in VFS. */
+static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
+{
+	if (d_really_is_positive(child))
+		return -EEXIST;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/**
+ * ocfs2_vfs_reflink - Create a reference-counted link
+ *
+ * @old_dentry:        source dentry + inode
+ * @dir:       directory to create the target
+ * @new_dentry:        target dentry
+ * @preserve:  if true, preserve all file attributes
+ */
+static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
+			     struct dentry *new_dentry, bool preserve)
+{
+	struct inode *inode = d_inode(old_dentry);
+	int error;
+
+	if (!inode)
+		return -ENOENT;
+
+	error = ocfs2_may_create(dir, new_dentry);
+	if (error)
+		return error;
+
+	if (dir->i_sb != inode->i_sb)
+		return -EXDEV;
+
+	/*
+	 * A reflink to an append-only or immutable file cannot be created.
+	 */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	/* Only regular files can be reflinked. */
+	if (!S_ISREG(inode->i_mode))
+		return -EPERM;
+
+	/*
+	 * If the caller wants to preserve ownership, they require the
+	 * rights to do so.
+	 */
+	if (preserve) {
+		if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
+			return -EPERM;
+		if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
+			return -EPERM;
+	}
+
+	/*
+	 * If the caller is modifying any aspect of the attributes, they
+	 * are not creating a snapshot.  They need read permission on the
+	 * file.
+	 */
+	if (!preserve) {
+		error = inode_permission(inode, MAY_READ);
+		if (error)
+			return error;
+	}
+
+	mutex_lock(&inode->i_mutex);
+	dquot_initialize(dir);
+	error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
+	mutex_unlock(&inode->i_mutex);
+	if (!error)
+		fsnotify_create(dir, new_dentry);
+	return error;
+}
+/*
+ * Most codes are copied from sys_linkat.
+ */
+int ocfs2_reflink_ioctl(struct inode *inode,
+			const char __user *oldname,
+			const char __user *newname,
+			bool preserve)
+{
+	struct dentry *new_dentry;
+	struct path old_path, new_path;
+	int error;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
+	if (error) {
+		mlog_errno(error);
+		return error;
+	}
+
+	new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
+	error = PTR_ERR(new_dentry);
+	if (IS_ERR(new_dentry)) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = -EXDEV;
+	if (old_path.mnt != new_path.mnt) {
+		mlog_errno(error);
+		goto out_dput;
+	}
+
+	error = ocfs2_vfs_reflink(old_path.dentry,
+				  d_inode(new_path.dentry),
+				  new_dentry, preserve);
+out_dput:
+	done_path_create(&new_path, new_dentry);
+out:
+	path_put(&old_path);
+
+	return error;
+}
diff --git a/kernel/fs/ocfs2/refcounttree.h b/kernel/fs/ocfs2/refcounttree.h
new file mode 100644
index 000000000..6422bbcdb
--- /dev/null
+++ b/kernel/fs/ocfs2/refcounttree.h
@@ -0,0 +1,118 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.h
+ *
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_REFCOUNTTREE_H
+#define OCFS2_REFCOUNTTREE_H
+
+struct ocfs2_refcount_tree {
+	struct rb_node rf_node;
+	u64 rf_blkno;
+	u32 rf_generation;
+	struct kref rf_getcnt;
+	struct rw_semaphore rf_sem;
+	struct ocfs2_lock_res rf_lockres;
+	int rf_removed;
+
+	/* the following 4 fields are used by caching_info. */
+	spinlock_t rf_lock;
+	struct ocfs2_caching_info rf_ci;
+	struct mutex rf_io_mutex;
+	struct super_block *rf_sb;
+};
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+			     struct ocfs2_refcount_tree **tree,
+			     struct buffer_head **ref_bh);
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+				struct ocfs2_refcount_tree *tree,
+				int rw);
+
+int ocfs2_decrease_refcount(struct inode *inode,
+			    handle_t *handle, u32 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    int delete);
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+					  u64 refcount_loc,
+					  u64 phys_blkno,
+					  u32 clusters,
+					  int *credits,
+					  int *ref_blocks);
+int ocfs2_refcount_cow(struct inode *inode,
+		       struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len, u32 max_cpos);
+
+typedef int (ocfs2_post_refcount_func)(struct inode *inode,
+				       handle_t *handle,
+				       void *para);
+/*
+ * Some refcount caller need to do more work after we modify the data b-tree
+ * during refcount operation(including CoW and add refcount flag), and make the
+ * transaction complete. So it must give us this structure so that we can do it
+ * within our transaction.
+ *
+ */
+struct ocfs2_post_refcount {
+	int credits;			/* credits it need for journal. */
+	ocfs2_post_refcount_func *func;	/* real function. */
+	void *para;
+};
+
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+				       struct ocfs2_caching_info *ref_ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_xattr_value_root *xv,
+				       int *meta_add, int *credits);
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+			     struct ocfs2_dinode *di,
+			     struct ocfs2_xattr_value_buf *vb,
+			     struct ocfs2_refcount_tree *ref_tree,
+			     struct buffer_head *ref_root_bh,
+			     u32 cpos, u32 write_len,
+			     struct ocfs2_post_refcount *post);
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+				     struct inode *inode,
+				     u32 cpos, u32 old_cluster,
+				     u32 new_cluster, u32 new_len);
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+				    struct inode *inode,
+				    u32 cpos, u32 old_cluster,
+				    u32 new_cluster, u32 new_len);
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+			     struct inode *inode,
+			     u32 cpos, u32 num_clusters);
+int ocfs2_add_refcount_flag(struct inode *inode,
+			    struct ocfs2_extent_tree *data_et,
+			    struct ocfs2_caching_info *ref_ci,
+			    struct buffer_head *ref_root_bh,
+			    u32 cpos, u32 p_cluster, u32 num_clusters,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    struct ocfs2_post_refcount *post);
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh);
+int ocfs2_increase_refcount(handle_t *handle,
+			    struct ocfs2_caching_info *ci,
+			    struct buffer_head *ref_root_bh,
+			    u64 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_ioctl(struct inode *inode,
+			const char __user *oldname,
+			const char __user *newname,
+			bool preserve);
+#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/kernel/fs/ocfs2/reservations.c b/kernel/fs/ocfs2/reservations.c
new file mode 100644
index 000000000..6a348b029
--- /dev/null
+++ b/kernel/fs/ocfs2/reservations.c
@@ -0,0 +1,839 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.c
+ *
+ * Allocation reservations implementation
+ *
+ * Some code borrowed from fs/ext3/balloc.c and is:
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * The rest is copyright (C) 2010 Novell.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_trace.h"
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+#define OCFS2_CHECK_RESERVATIONS
+#endif
+
+static DEFINE_SPINLOCK(resv_lock);
+
+#define	OCFS2_MIN_RESV_WINDOW_BITS	8
+#define	OCFS2_MAX_RESV_WINDOW_BITS	1024
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
+{
+	return (osb->osb_resv_level && osb->osb_dir_resv_level);
+}
+
+static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
+					   struct ocfs2_alloc_reservation *resv)
+{
+	struct ocfs2_super *osb = resmap->m_osb;
+	unsigned int bits;
+
+	if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
+		/* 8, 16, 32, 64, 128, 256, 512, 1024 */
+		bits = 4 << osb->osb_resv_level;
+	} else {
+		bits = 4 << osb->osb_dir_resv_level;
+	}
+	return bits;
+}
+
+static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
+{
+	if (resv->r_len)
+		return resv->r_start + resv->r_len - 1;
+	return resv->r_start;
+}
+
+static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
+{
+	return !!(resv->r_len == 0);
+}
+
+static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
+{
+	if (resmap->m_osb->osb_resv_level == 0)
+		return 1;
+	return 0;
+}
+
+static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
+{
+	struct ocfs2_super *osb = resmap->m_osb;
+	struct rb_node *node;
+	struct ocfs2_alloc_reservation *resv;
+	int i = 0;
+
+	mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
+	     osb->dev_str, resmap->m_bitmap_len);
+
+	node = rb_first(&resmap->m_reservations);
+	while (node) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
+		     "\tlast_len: %u\n", resv->r_start,
+		     ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+		     resv->r_last_len);
+
+		node = rb_next(node);
+		i++;
+	}
+
+	mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
+
+	i = 0;
+	list_for_each_entry(resv, &resmap->m_lru, r_lru) {
+		mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
+		     "last_start: %u\tlast_len: %u\n", i, resv->r_start,
+		     ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+		     resv->r_last_len);
+
+		i++;
+	}
+}
+
+#ifdef OCFS2_CHECK_RESERVATIONS
+static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
+				      int i,
+				      struct ocfs2_alloc_reservation *resv)
+{
+	char *disk_bitmap = resmap->m_disk_bitmap;
+	unsigned int start = resv->r_start;
+	unsigned int end = ocfs2_resv_end(resv);
+
+	while (start <= end) {
+		if (ocfs2_test_bit(start, disk_bitmap)) {
+			mlog(ML_ERROR,
+			     "reservation %d covers an allocated area "
+			     "starting at bit %u!\n", i, start);
+			return 1;
+		}
+
+		start++;
+	}
+	return 0;
+}
+
+static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+	unsigned int off = 0;
+	int i = 0;
+	struct rb_node *node;
+	struct ocfs2_alloc_reservation *resv;
+
+	node = rb_first(&resmap->m_reservations);
+	while (node) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		if (i > 0 && resv->r_start <= off) {
+			mlog(ML_ERROR, "reservation %d has bad start off!\n",
+			     i);
+			goto bad;
+		}
+
+		if (resv->r_len == 0) {
+			mlog(ML_ERROR, "reservation %d has no length!\n",
+			     i);
+			goto bad;
+		}
+
+		if (resv->r_start > ocfs2_resv_end(resv)) {
+			mlog(ML_ERROR, "reservation %d has invalid range!\n",
+			     i);
+			goto bad;
+		}
+
+		if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
+			mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
+			     i);
+			goto bad;
+		}
+
+		if (ocfs2_validate_resmap_bits(resmap, i, resv))
+			goto bad;
+
+		off = ocfs2_resv_end(resv);
+		node = rb_next(node);
+
+		i++;
+	}
+	return;
+
+bad:
+	ocfs2_dump_resv(resmap);
+	BUG();
+}
+#else
+static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+
+}
+#endif
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
+{
+	memset(resv, 0, sizeof(*resv));
+	INIT_LIST_HEAD(&resv->r_lru);
+}
+
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+			 unsigned int flags)
+{
+	BUG_ON(flags & ~OCFS2_RESV_TYPES);
+
+	resv->r_flags |= flags;
+}
+
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+		      struct ocfs2_reservation_map *resmap)
+{
+	memset(resmap, 0, sizeof(*resmap));
+
+	resmap->m_osb = osb;
+	resmap->m_reservations = RB_ROOT;
+	/* m_bitmap_len is initialized to zero by the above memset. */
+	INIT_LIST_HEAD(&resmap->m_lru);
+
+	return 0;
+}
+
+static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
+				struct ocfs2_alloc_reservation *resv)
+{
+	assert_spin_locked(&resv_lock);
+
+	if (!list_empty(&resv->r_lru))
+		list_del_init(&resv->r_lru);
+
+	list_add_tail(&resv->r_lru, &resmap->m_lru);
+}
+
+static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
+{
+	resv->r_len = 0;
+	resv->r_start = 0;
+}
+
+static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
+			      struct ocfs2_alloc_reservation *resv)
+{
+	if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
+		list_del_init(&resv->r_lru);
+		rb_erase(&resv->r_node, &resmap->m_reservations);
+		resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
+	}
+}
+
+static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+				 struct ocfs2_alloc_reservation *resv)
+{
+	assert_spin_locked(&resv_lock);
+
+	__ocfs2_resv_trunc(resv);
+	/*
+	 * last_len and last_start no longer make sense if
+	 * we're changing the range of our allocations.
+	 */
+	resv->r_last_len = resv->r_last_start = 0;
+
+	ocfs2_resv_remove(resmap, resv);
+}
+
+/* does nothing if 'resv' is null */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+			struct ocfs2_alloc_reservation *resv)
+{
+	if (resv) {
+		spin_lock(&resv_lock);
+		__ocfs2_resv_discard(resmap, resv);
+		spin_unlock(&resv_lock);
+	}
+}
+
+static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
+{
+	struct rb_node *node;
+	struct ocfs2_alloc_reservation *resv;
+
+	assert_spin_locked(&resv_lock);
+
+	while ((node = rb_last(&resmap->m_reservations)) != NULL) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		__ocfs2_resv_discard(resmap, resv);
+	}
+}
+
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+			  unsigned int clen, char *disk_bitmap)
+{
+	if (ocfs2_resmap_disabled(resmap))
+		return;
+
+	spin_lock(&resv_lock);
+
+	ocfs2_resmap_clear_all_resv(resmap);
+	resmap->m_bitmap_len = clen;
+	resmap->m_disk_bitmap = disk_bitmap;
+
+	spin_unlock(&resv_lock);
+}
+
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
+{
+	/* Does nothing for now. Keep this around for API symmetry */
+}
+
+static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
+			      struct ocfs2_alloc_reservation *new)
+{
+	struct rb_root *root = &resmap->m_reservations;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct ocfs2_alloc_reservation *tmp;
+
+	assert_spin_locked(&resv_lock);
+
+	trace_ocfs2_resv_insert(new->r_start, new->r_len);
+
+	while (*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
+
+		if (new->r_start < tmp->r_start) {
+			p = &(*p)->rb_left;
+
+			/*
+			 * This is a good place to check for
+			 * overlapping reservations.
+			 */
+			BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
+		} else if (new->r_start > ocfs2_resv_end(tmp)) {
+			p = &(*p)->rb_right;
+		} else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate reservation window!\n");
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, root);
+	new->r_flags |= OCFS2_RESV_FLAG_INUSE;
+
+	ocfs2_resv_mark_lru(resmap, new);
+
+	ocfs2_check_resmap(resmap);
+}
+
+/**
+ * ocfs2_find_resv_lhs() - find the window which contains goal
+ * @resmap: reservation map to search
+ * @goal: which bit to search for
+ *
+ * If a window containing that goal is not found, we return the window
+ * which comes before goal. Returns NULL on empty rbtree or no window
+ * before goal.
+ */
+static struct ocfs2_alloc_reservation *
+ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
+{
+	struct ocfs2_alloc_reservation *resv = NULL;
+	struct ocfs2_alloc_reservation *prev_resv = NULL;
+	struct rb_node *node = resmap->m_reservations.rb_node;
+
+	assert_spin_locked(&resv_lock);
+
+	if (!node)
+		return NULL;
+
+	node = rb_first(&resmap->m_reservations);
+	while (node) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
+			break;
+
+		/* Check if we overshot the reservation just before goal? */
+		if (resv->r_start > goal) {
+			resv = prev_resv;
+			break;
+		}
+
+		prev_resv = resv;
+		node = rb_next(node);
+	}
+
+	return resv;
+}
+
+/*
+ * We are given a range within the bitmap, which corresponds to a gap
+ * inside the reservations tree (search_start, search_len). The range
+ * can be anything from the whole bitmap, to a gap between
+ * reservations.
+ *
+ * The start value of *rstart is insignificant.
+ *
+ * This function searches the bitmap range starting at search_start
+ * with length search_len for a set of contiguous free bits. We try
+ * to find up to 'wanted' bits, but can sometimes return less.
+ *
+ * Returns the length of allocation, 0 if no free bits are found.
+ *
+ * *cstart and *clen will also be populated with the result.
+ */
+static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
+				       unsigned int wanted,
+				       unsigned int search_start,
+				       unsigned int search_len,
+				       unsigned int *rstart,
+				       unsigned int *rlen)
+{
+	void *bitmap = resmap->m_disk_bitmap;
+	unsigned int best_start, best_len = 0;
+	int offset, start, found;
+
+	trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
+						wanted, resmap->m_bitmap_len);
+
+	found = best_start = best_len = 0;
+
+	start = search_start;
+	while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
+						 start)) != -1) {
+		/* Search reached end of the region */
+		if (offset >= (search_start + search_len))
+			break;
+
+		if (offset == start) {
+			/* we found a zero */
+			found++;
+			/* move start to the next bit to test */
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_len) {
+			best_len = found;
+			best_start = start - found;
+		}
+
+		if (found >= wanted)
+			break;
+	}
+
+	if (best_len == 0)
+		return 0;
+
+	if (best_len >= wanted)
+		best_len = wanted;
+
+	*rlen = best_len;
+	*rstart = best_start;
+
+	trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
+
+	return *rlen;
+}
+
+static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+				     struct ocfs2_alloc_reservation *resv,
+				     unsigned int goal, unsigned int wanted)
+{
+	struct rb_root *root = &resmap->m_reservations;
+	unsigned int gap_start, gap_end, gap_len;
+	struct ocfs2_alloc_reservation *prev_resv, *next_resv;
+	struct rb_node *prev, *next;
+	unsigned int cstart, clen;
+	unsigned int best_start = 0, best_len = 0;
+
+	/*
+	 * Nasty cases to consider:
+	 *
+	 * - rbtree is empty
+	 * - our window should be first in all reservations
+	 * - our window should be last in all reservations
+	 * - need to make sure we don't go past end of bitmap
+	 */
+	trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
+					   goal, wanted, RB_EMPTY_ROOT(root));
+
+	assert_spin_locked(&resv_lock);
+
+	if (RB_EMPTY_ROOT(root)) {
+		/*
+		 * Easiest case - empty tree. We can just take
+		 * whatever window of free bits we want.
+		 */
+		clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+						   resmap->m_bitmap_len - goal,
+						   &cstart, &clen);
+
+		/*
+		 * This should never happen - the local alloc window
+		 * will always have free bits when we're called.
+		 */
+		BUG_ON(goal == 0 && clen == 0);
+
+		if (clen == 0)
+			return;
+
+		resv->r_start = cstart;
+		resv->r_len = clen;
+
+		ocfs2_resv_insert(resmap, resv);
+		return;
+	}
+
+	prev_resv = ocfs2_find_resv_lhs(resmap, goal);
+
+	if (prev_resv == NULL) {
+		/*
+		 * A NULL here means that the search code couldn't
+		 * find a window that starts before goal.
+		 *
+		 * However, we can take the first window after goal,
+		 * which is also by definition, the leftmost window in
+		 * the entire tree. If we can find free bits in the
+		 * gap between goal and the LHS window, then the
+		 * reservation can safely be placed there.
+		 *
+		 * Otherwise we fall back to a linear search, checking
+		 * the gaps in between windows for a place to
+		 * allocate.
+		 */
+
+		next = rb_first(root);
+		next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
+				     r_node);
+
+		/*
+		 * The search should never return such a window. (see
+		 * comment above
+		 */
+		if (next_resv->r_start <= goal) {
+			mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
+			     goal, next_resv->r_start, next_resv->r_len);
+			ocfs2_dump_resv(resmap);
+			BUG();
+		}
+
+		clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+						   next_resv->r_start - goal,
+						   &cstart, &clen);
+		if (clen) {
+			best_len = clen;
+			best_start = cstart;
+			if (best_len == wanted)
+				goto out_insert;
+		}
+
+		prev_resv = next_resv;
+		next_resv = NULL;
+	}
+
+	trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
+					  ocfs2_resv_end(prev_resv));
+
+	prev = &prev_resv->r_node;
+
+	/* Now we do a linear search for a window, starting at 'prev_rsv' */
+	while (1) {
+		next = rb_next(prev);
+		if (next) {
+			next_resv = rb_entry(next,
+					     struct ocfs2_alloc_reservation,
+					     r_node);
+
+			gap_start = ocfs2_resv_end(prev_resv) + 1;
+			gap_end = next_resv->r_start - 1;
+			gap_len = gap_end - gap_start + 1;
+		} else {
+			/*
+			 * We're at the rightmost edge of the
+			 * tree. See if a reservation between this
+			 * window and the end of the bitmap will work.
+			 */
+			gap_start = ocfs2_resv_end(prev_resv) + 1;
+			gap_len = resmap->m_bitmap_len - gap_start;
+			gap_end = resmap->m_bitmap_len - 1;
+		}
+
+		trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
+					next ? ocfs2_resv_end(next_resv) : -1);
+		/*
+		 * No need to check this gap if we have already found
+		 * a larger region of free bits.
+		 */
+		if (gap_len <= best_len)
+			goto next_resv;
+
+		clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
+						   gap_len, &cstart, &clen);
+		if (clen == wanted) {
+			best_len = clen;
+			best_start = cstart;
+			goto out_insert;
+		} else if (clen > best_len) {
+			best_len = clen;
+			best_start = cstart;
+		}
+
+next_resv:
+		if (!next)
+			break;
+
+		prev = next;
+		prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
+				     r_node);
+	}
+
+out_insert:
+	if (best_len) {
+		resv->r_start = best_start;
+		resv->r_len = best_len;
+		ocfs2_resv_insert(resmap, resv);
+	}
+}
+
+static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
+				   struct ocfs2_alloc_reservation *resv,
+				   unsigned int wanted)
+{
+	struct ocfs2_alloc_reservation *lru_resv;
+	int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
+	unsigned int min_bits;
+
+	if (!tmpwindow)
+		min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
+	else
+		min_bits = wanted; /* We at know the temp window will use all
+				    * of these bits */
+
+	/*
+	 * Take the first reservation off the LRU as our 'target'. We
+	 * don't try to be smart about it. There might be a case for
+	 * searching based on size but I don't have enough data to be
+	 * sure. --Mark (3/16/2010)
+	 */
+	lru_resv = list_first_entry(&resmap->m_lru,
+				    struct ocfs2_alloc_reservation, r_lru);
+
+	trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
+					   lru_resv->r_len,
+					   ocfs2_resv_end(lru_resv));
+
+	/*
+	 * Cannibalize (some or all) of the target reservation and
+	 * feed it to the current window.
+	 */
+	if (lru_resv->r_len <= min_bits) {
+		/*
+		 * Discard completely if size is less than or equal to a
+		 * reasonable threshold - 50% of window bits for non temporary
+		 * windows.
+		 */
+		resv->r_start = lru_resv->r_start;
+		resv->r_len = lru_resv->r_len;
+
+		__ocfs2_resv_discard(resmap, lru_resv);
+	} else {
+		unsigned int shrink;
+		if (tmpwindow)
+			shrink = min_bits;
+		else
+			shrink = lru_resv->r_len / 2;
+
+		lru_resv->r_len -= shrink;
+
+		resv->r_start = ocfs2_resv_end(lru_resv) + 1;
+		resv->r_len = shrink;
+	}
+
+	trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
+					 resv->r_len, resv->r_last_start,
+					 resv->r_last_len);
+
+	ocfs2_resv_insert(resmap, resv);
+}
+
+static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+				   struct ocfs2_alloc_reservation *resv,
+				   unsigned int wanted)
+{
+	unsigned int goal = 0;
+
+	BUG_ON(!ocfs2_resv_empty(resv));
+
+	/*
+	 * Begin by trying to get a window as close to the previous
+	 * one as possible. Using the most recent allocation as a
+	 * start goal makes sense.
+	 */
+	if (resv->r_last_len) {
+		goal = resv->r_last_start + resv->r_last_len;
+		if (goal >= resmap->m_bitmap_len)
+			goal = 0;
+	}
+
+	__ocfs2_resv_find_window(resmap, resv, goal, wanted);
+
+	/* Search from last alloc didn't work, try once more from beginning. */
+	if (ocfs2_resv_empty(resv) && goal != 0)
+		__ocfs2_resv_find_window(resmap, resv, 0, wanted);
+
+	if (ocfs2_resv_empty(resv)) {
+		/*
+		 * Still empty? Pull oldest one off the LRU, remove it from
+		 * tree, put this one in it's place.
+		 */
+		ocfs2_cannibalize_resv(resmap, resv, wanted);
+	}
+
+	BUG_ON(ocfs2_resv_empty(resv));
+}
+
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+			   struct ocfs2_alloc_reservation *resv,
+			   int *cstart, int *clen)
+{
+	if (resv == NULL || ocfs2_resmap_disabled(resmap))
+		return -ENOSPC;
+
+	spin_lock(&resv_lock);
+
+	if (ocfs2_resv_empty(resv)) {
+		/*
+		 * We don't want to over-allocate for temporary
+		 * windows. Otherwise, we run the risk of fragmenting the
+		 * allocation space.
+		 */
+		unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
+
+		if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+			wanted = *clen;
+
+		/*
+		 * Try to get a window here. If it works, we must fall
+		 * through and test the bitmap . This avoids some
+		 * ping-ponging of windows due to non-reserved space
+		 * being allocation before we initialize a window for
+		 * that inode.
+		 */
+		ocfs2_resv_find_window(resmap, resv, wanted);
+		trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
+	}
+
+	BUG_ON(ocfs2_resv_empty(resv));
+
+	*cstart = resv->r_start;
+	*clen = resv->r_len;
+
+	spin_unlock(&resv_lock);
+	return 0;
+}
+
+static void
+	ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
+				     struct ocfs2_alloc_reservation *resv,
+				     unsigned int start, unsigned int end)
+{
+	unsigned int rhs = 0;
+	unsigned int old_end = ocfs2_resv_end(resv);
+
+	BUG_ON(start != resv->r_start || old_end < end);
+
+	/*
+	 * Completely used? We can remove it then.
+	 */
+	if (old_end == end) {
+		__ocfs2_resv_discard(resmap, resv);
+		return;
+	}
+
+	rhs = old_end - end;
+
+	/*
+	 * This should have been trapped above.
+	 */
+	BUG_ON(rhs == 0);
+
+	resv->r_start = end + 1;
+	resv->r_len = old_end - resv->r_start + 1;
+}
+
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+			       struct ocfs2_alloc_reservation *resv,
+			       u32 cstart, u32 clen)
+{
+	unsigned int cend = cstart + clen - 1;
+
+	if (resmap == NULL || ocfs2_resmap_disabled(resmap))
+		return;
+
+	if (resv == NULL)
+		return;
+
+	BUG_ON(cstart != resv->r_start);
+
+	spin_lock(&resv_lock);
+
+	trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
+					      ocfs2_resv_end(resv), resv->r_len,
+					      resv->r_last_start,
+					      resv->r_last_len);
+
+	BUG_ON(cstart < resv->r_start);
+	BUG_ON(cstart > ocfs2_resv_end(resv));
+	BUG_ON(cend > ocfs2_resv_end(resv));
+
+	ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
+	resv->r_last_start = cstart;
+	resv->r_last_len = clen;
+
+	/*
+	 * May have been discarded above from
+	 * ocfs2_adjust_resv_from_alloc().
+	 */
+	if (!ocfs2_resv_empty(resv))
+		ocfs2_resv_mark_lru(resmap, resv);
+
+	trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
+					    resv->r_len, resv->r_last_start,
+					    resv->r_last_len);
+
+	ocfs2_check_resmap(resmap);
+
+	spin_unlock(&resv_lock);
+}
diff --git a/kernel/fs/ocfs2/reservations.h b/kernel/fs/ocfs2/reservations.h
new file mode 100644
index 000000000..42c2b804f
--- /dev/null
+++ b/kernel/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.h
+ *
+ * Allocation reservations function prototypes and structures.
+ *
+ * Copyright (C) 2010 Novell.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef	OCFS2_RESERVATIONS_H
+#define	OCFS2_RESERVATIONS_H
+
+#include <linux/rbtree.h>
+
+#define OCFS2_DEFAULT_RESV_LEVEL	2
+#define OCFS2_MAX_RESV_LEVEL	9
+#define OCFS2_MIN_RESV_LEVEL	0
+
+struct ocfs2_alloc_reservation {
+	struct rb_node	r_node;
+
+	unsigned int	r_start;	/* Beginning of current window */
+	unsigned int	r_len;		/* Length of the window */
+
+	unsigned int	r_last_len;	/* Length of most recent alloc */
+	unsigned int	r_last_start;	/* Start of most recent alloc */
+	struct list_head	r_lru;	/* LRU list head */
+
+	unsigned int	r_flags;
+};
+
+#define	OCFS2_RESV_FLAG_INUSE	0x01	/* Set when r_node is part of a btree */
+#define	OCFS2_RESV_FLAG_TMP	0x02	/* Temporary reservation, will be
+					 * destroyed immedately after use */
+#define	OCFS2_RESV_FLAG_DIR	0x04	/* Reservation is for an unindexed
+					 * directory btree */
+
+struct ocfs2_reservation_map {
+	struct rb_root		m_reservations;
+	char			*m_disk_bitmap;
+
+	struct ocfs2_super	*m_osb;
+
+	/* The following are not initialized to meaningful values until a disk
+	 * bitmap is provided. */
+	u32			m_bitmap_len;	/* Number of valid
+						 * bits available */
+
+	struct list_head	m_lru;		/* LRU of reservations
+						 * structures. */
+
+};
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
+
+#define OCFS2_RESV_TYPES	(OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+			 unsigned int flags);
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
+
+/**
+ * ocfs2_resv_discard() - truncate a reservation
+ * @resmap:
+ * @resv: the reservation to truncate.
+ *
+ * After this function is called, the reservation will be empty, and
+ * unlinked from the rbtree.
+ */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+			struct ocfs2_alloc_reservation *resv);
+
+
+/**
+ * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @resmap: struct ocfs2_reservation_map to initialize
+ * @obj: unused for now
+ * @ops: unused for now
+ * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
+ *
+ * Only possible return value other than '0' is -ENOMEM for failure to
+ * allocation mirror bitmap.
+ */
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+		      struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_restart() - "restart" a reservation bitmap
+ * @resmap: reservations bitmap
+ * @clen: Number of valid bits in the bitmap
+ * @disk_bitmap: the disk bitmap this resmap should refer to.
+ *
+ * Re-initialize the parameters of a reservation bitmap. This is
+ * useful for local alloc window slides.
+ *
+ * This function will call ocfs2_trunc_resv against all existing
+ * reservations. A future version will recalculate existing
+ * reservations based on the new bitmap.
+ */
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+			  unsigned int clen, char *disk_bitmap);
+
+/**
+ * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
+ * @resmap: the struct ocfs2_reservation_map to uninitialize
+ */
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
+ * @resmap: reservations bitmap
+ * @resv: reservation to base search from
+ * @cstart: start of proposed allocation
+ * @clen: length (in clusters) of proposed allocation
+ *
+ * Using the reservation data from resv, this function will compare
+ * resmap and resmap->m_disk_bitmap to determine what part (if any) of
+ * the reservation window is still clear to use. If resv is empty,
+ * this function will try to allocate a window for it.
+ *
+ * On success, zero is returned and the valid allocation area is set in cstart
+ * and clen.
+ *
+ * Returns -ENOSPC if reservations are disabled.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+			   struct ocfs2_alloc_reservation *resv,
+			   int *cstart, int *clen);
+
+/**
+ * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
+ * @resmap: reservations bitmap
+ * @resv: optional reservation to recalulate based on new bitmap
+ * @cstart: start of allocation in clusters
+ * @clen: end of allocation in clusters.
+ *
+ * Tell the reservation code that bits were used to fulfill allocation in
+ * resmap. The bits don't have to have been part of any existing
+ * reservation. But we must always call this function when bits are claimed.
+ * Internally, the reservations code will use this information to mark the
+ * reservations bitmap. If resv is passed, it's next allocation window will be
+ * calculated. It also expects that 'cstart' is the same as we passed back
+ * from ocfs2_resmap_resv_bits().
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+			       struct ocfs2_alloc_reservation *resv,
+			       u32 cstart, u32 clen);
+
+#endif	/* OCFS2_RESERVATIONS_H */
diff --git a/kernel/fs/ocfs2/resize.c b/kernel/fs/ocfs2/resize.c
new file mode 100644
index 000000000..d5da6f624
--- /dev/null
+++ b/kernel/fs/ocfs2/resize.c
@@ -0,0 +1,589 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.c
+ *
+ * volume resize.
+ * Inspired by ext3/resize.c.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "resize.h"
+
+/*
+ * Check whether there are new backup superblocks exist
+ * in the last group. If there are some, mark them or clear
+ * them in the bitmap.
+ *
+ * Return how many backups we find in the last group.
+ */
+static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+				       struct ocfs2_group_desc *gd,
+				       u16 cl_cpg,
+				       int set)
+{
+	int i;
+	u16 backups = 0;
+	u32 cluster;
+	u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+
+	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+		blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+		cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+		gd_blkno = ocfs2_which_cluster_group(inode, cluster);
+		if (gd_blkno < lgd_blkno)
+			continue;
+		else if (gd_blkno > lgd_blkno)
+			break;
+
+		if (set)
+			ocfs2_set_bit(cluster % cl_cpg,
+				      (unsigned long *)gd->bg_bitmap);
+		else
+			ocfs2_clear_bit(cluster % cl_cpg,
+					(unsigned long *)gd->bg_bitmap);
+		backups++;
+	}
+
+	return backups;
+}
+
+static int ocfs2_update_last_group_and_inode(handle_t *handle,
+					     struct inode *bm_inode,
+					     struct buffer_head *bm_bh,
+					     struct buffer_head *group_bh,
+					     u32 first_new_cluster,
+					     int new_clusters)
+{
+	int ret = 0;
+	struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
+	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+	struct ocfs2_chain_rec *cr;
+	struct ocfs2_group_desc *group;
+	u16 chain, num_bits, backups = 0;
+	u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+	u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+
+	trace_ocfs2_update_last_group_and_inode(new_clusters,
+						first_new_cluster);
+
+	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
+				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	group = (struct ocfs2_group_desc *)group_bh->b_data;
+
+	/* update the group first. */
+	num_bits = new_clusters * cl_bpc;
+	le16_add_cpu(&group->bg_bits, num_bits);
+	le16_add_cpu(&group->bg_free_bits_count, num_bits);
+
+	/*
+	 * check whether there are some new backup superblocks exist in
+	 * this group and update the group bitmap accordingly.
+	 */
+	if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+				     OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+		backups = ocfs2_calc_new_backup_super(bm_inode,
+						     group,
+						     cl_cpg, 1);
+		le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+	}
+
+	ocfs2_journal_dirty(handle, group_bh);
+
+	/* update the inode accordingly. */
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_rollback;
+	}
+
+	chain = le16_to_cpu(group->bg_chain);
+	cr = (&cl->cl_recs[chain]);
+	le32_add_cpu(&cr->c_total, num_bits);
+	le32_add_cpu(&cr->c_free, num_bits);
+	le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
+	le32_add_cpu(&fe->i_clusters, new_clusters);
+
+	if (backups) {
+		le32_add_cpu(&cr->c_free, -1 * backups);
+		le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
+	}
+
+	spin_lock(&OCFS2_I(bm_inode)->ip_lock);
+	OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);
+	spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
+	i_size_write(bm_inode, le64_to_cpu(fe->i_size));
+
+	ocfs2_journal_dirty(handle, bm_bh);
+
+out_rollback:
+	if (ret < 0) {
+		ocfs2_calc_new_backup_super(bm_inode,
+					    group,
+					    cl_cpg, 0);
+		le16_add_cpu(&group->bg_free_bits_count, backups);
+		le16_add_cpu(&group->bg_bits, -1 * num_bits);
+		le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
+	}
+out:
+	if (ret)
+		mlog_errno(ret);
+	return ret;
+}
+
+static int update_backups(struct inode * inode, u32 clusters, char *data)
+{
+	int i, ret = 0;
+	u32 cluster;
+	u64 blkno;
+	struct buffer_head *backup = NULL;
+	struct ocfs2_dinode *backup_di = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* calculate the real backups we need to update. */
+	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+		blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+		cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+		if (cluster > clusters)
+			break;
+
+		ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
+
+		backup_di = (struct ocfs2_dinode *)backup->b_data;
+		backup_di->i_blkno = cpu_to_le64(blkno);
+
+		ret = ocfs2_write_super_or_backup(osb, backup);
+		brelse(backup);
+		backup = NULL;
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void ocfs2_update_super_and_backups(struct inode *inode,
+					   int new_clusters)
+{
+	int ret;
+	u32 clusters = 0;
+	struct buffer_head *super_bh = NULL;
+	struct ocfs2_dinode *super_di = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/*
+	 * update the superblock last.
+	 * It doesn't matter if the write failed.
+	 */
+	ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
+				     &super_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	super_di = (struct ocfs2_dinode *)super_bh->b_data;
+	le32_add_cpu(&super_di->i_clusters, new_clusters);
+	clusters = le32_to_cpu(super_di->i_clusters);
+
+	ret = ocfs2_write_super_or_backup(osb, super_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
+		ret = update_backups(inode, clusters, super_bh->b_data);
+
+out:
+	brelse(super_bh);
+	if (ret)
+		printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
+			" during fs resize. This condition is not fatal,"
+			" but fsck.ocfs2 should be run to fix it\n",
+			osb->dev_str);
+	return;
+}
+
+/*
+ * Extend the filesystem to the new number of clusters specified.  This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group.
+ */
+int ocfs2_group_extend(struct inode * inode, int new_clusters)
+{
+	int ret;
+	handle_t *handle;
+	struct buffer_head *main_bm_bh = NULL;
+	struct buffer_head *group_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_group_desc *group = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u16 cl_bpc;
+	u32 first_new_cluster;
+	u64 lgd_blkno;
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	if (new_clusters < 0)
+		return -EINVAL;
+	else if (new_clusters == 0)
+		return 0;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+	/* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+	 * so any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
+	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+		ocfs2_group_bitmap_size(osb->sb, 0,
+					osb->s_feature_incompat) * 8) {
+		mlog(ML_ERROR, "The disk is too old and small. "
+		     "Force to do offline resize.");
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	first_new_cluster = le32_to_cpu(fe->i_clusters);
+	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
+					      first_new_cluster - 1);
+
+	ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+					  &group_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+	group = (struct ocfs2_group_desc *)group_bh->b_data;
+
+	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+	if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
+		le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+
+	trace_ocfs2_group_extend(
+	     (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
+
+	handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* update the last group descriptor and inode. */
+	ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
+						main_bm_bh, group_bh,
+						first_new_cluster,
+						new_clusters);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	brelse(group_bh);
+	brelse(main_bm_bh);
+
+	ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+
+out:
+	return ret;
+}
+
+static int ocfs2_check_new_group(struct inode *inode,
+				 struct ocfs2_dinode *di,
+				 struct ocfs2_new_group_input *input,
+				 struct buffer_head *group_bh)
+{
+	int ret;
+	struct ocfs2_group_desc *gd =
+		(struct ocfs2_group_desc *)group_bh->b_data;
+	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
+
+	ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	if (le16_to_cpu(gd->bg_chain) != input->chain)
+		mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
+		     "while input has %u set.\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_chain), input->chain);
+	else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
+		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+		     "input has %u clusters set\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_bits), input->clusters);
+	else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
+		mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
+		     "but it should have %u set\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_bits),
+		     input->frees * cl_bpc);
+	else
+		ret = 0;
+
+out:
+	return ret;
+}
+
+static int ocfs2_verify_group_and_input(struct inode *inode,
+					struct ocfs2_dinode *di,
+					struct ocfs2_new_group_input *input,
+					struct buffer_head *group_bh)
+{
+	u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
+	u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
+	u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
+	u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
+	u32 total_clusters = le32_to_cpu(di->i_clusters);
+	int ret = -EINVAL;
+
+	if (cluster < total_clusters)
+		mlog(ML_ERROR, "add a group which is in the current volume.\n");
+	else if (input->chain >= cl_count)
+		mlog(ML_ERROR, "input chain exceeds the limit.\n");
+	else if (next_free != cl_count && next_free != input->chain)
+		mlog(ML_ERROR,
+		     "the add group should be in chain %u\n", next_free);
+	else if (total_clusters + input->clusters < total_clusters)
+		mlog(ML_ERROR, "add group's clusters overflow.\n");
+	else if (input->clusters > cl_cpg)
+		mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
+	else if (input->frees > input->clusters)
+		mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
+	else if (total_clusters % cl_cpg != 0)
+		mlog(ML_ERROR,
+		     "the last group isn't full. Use group extend first.\n");
+	else if (input->group != ocfs2_which_cluster_group(inode, cluster))
+		mlog(ML_ERROR, "group blkno is invalid\n");
+	else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
+		mlog(ML_ERROR, "group descriptor check failed.\n");
+	else
+		ret = 0;
+
+	return ret;
+}
+
+/* Add a new group descriptor to global_bitmap. */
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
+{
+	int ret;
+	handle_t *handle;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *group_bh = NULL;
+	struct ocfs2_group_desc *group = NULL;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_chain_rec *cr;
+	u16 cl_bpc;
+	u64 bg_ptr;
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+		ocfs2_group_bitmap_size(osb->sb, 0,
+					osb->s_feature_incompat) * 8) {
+		mlog(ML_ERROR, "The disk is too old and small."
+		     " Force to do offline resize.");
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
+	if (ret < 0) {
+		mlog(ML_ERROR, "Can't read the group descriptor # %llu "
+		     "from the device.", (unsigned long long)input->group);
+		goto out_unlock;
+	}
+
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh);
+
+	ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_free_group_bh;
+	}
+
+	trace_ocfs2_group_add((unsigned long long)input->group,
+			       input->chain, input->clusters, input->frees);
+
+	handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		ret = -EINVAL;
+		goto out_free_group_bh;
+	}
+
+	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+	cl = &fe->id2.i_chain;
+	cr = &cl->cl_recs[input->chain];
+
+	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode),
+				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	group = (struct ocfs2_group_desc *)group_bh->b_data;
+	bg_ptr = le64_to_cpu(group->bg_next_group);
+	group->bg_next_group = cr->c_blkno;
+	ocfs2_journal_dirty(handle, group_bh);
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
+				      main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		group->bg_next_group = cpu_to_le64(bg_ptr);
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
+		le16_add_cpu(&cl->cl_next_free_rec, 1);
+		memset(cr, 0, sizeof(struct ocfs2_chain_rec));
+	}
+
+	cr->c_blkno = cpu_to_le64(input->group);
+	le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
+	le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
+
+	le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
+	le32_add_cpu(&fe->id1.bitmap1.i_used,
+		     (input->clusters - input->frees) * cl_bpc);
+	le32_add_cpu(&fe->i_clusters, input->clusters);
+
+	ocfs2_journal_dirty(handle, main_bm_bh);
+
+	spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
+	OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);
+	spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
+	i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
+
+	ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_free_group_bh:
+	brelse(group_bh);
+
+out_unlock:
+	brelse(main_bm_bh);
+
+	ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+
+out:
+	return ret;
+}
diff --git a/kernel/fs/ocfs2/resize.h b/kernel/fs/ocfs2/resize.h
new file mode 100644
index 000000000..f38841abf
--- /dev/null
+++ b/kernel/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_RESIZE_H
+#define OCFS2_RESIZE_H
+
+int ocfs2_group_extend(struct inode * inode, int new_clusters);
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
+
+#endif /* OCFS2_RESIZE_H */
diff --git a/kernel/fs/ocfs2/slot_map.c b/kernel/fs/ocfs2/slot_map.c
new file mode 100644
index 000000000..e78a203d4
--- /dev/null
+++ b/kernel/fs/ocfs2/slot_map.c
@@ -0,0 +1,538 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.c
+ *
+ *
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+
+struct ocfs2_slot {
+	int sl_valid;
+	unsigned int sl_node_num;
+};
+
+struct ocfs2_slot_info {
+	int si_extended;
+	int si_slots_per_block;
+	struct inode *si_inode;
+	unsigned int si_blocks;
+	struct buffer_head **si_bh;
+	unsigned int si_num_slots;
+	struct ocfs2_slot *si_slots;
+};
+
+
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    unsigned int node_num);
+
+static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
+				  int slot_num)
+{
+	BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+	si->si_slots[slot_num].sl_valid = 0;
+}
+
+static void ocfs2_set_slot(struct ocfs2_slot_info *si,
+			   int slot_num, unsigned int node_num)
+{
+	BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+
+	si->si_slots[slot_num].sl_valid = 1;
+	si->si_slots[slot_num].sl_node_num = node_num;
+}
+
+/* This version is for the extended slot map */
+static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
+{
+	int b, i, slotno;
+	struct ocfs2_slot_map_extended *se;
+
+	slotno = 0;
+	for (b = 0; b < si->si_blocks; b++) {
+		se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
+		for (i = 0;
+		     (i < si->si_slots_per_block) &&
+		     (slotno < si->si_num_slots);
+		     i++, slotno++) {
+			if (se->se_slots[i].es_valid)
+				ocfs2_set_slot(si, slotno,
+					       le32_to_cpu(se->se_slots[i].es_node_num));
+			else
+				ocfs2_invalidate_slot(si, slotno);
+		}
+	}
+}
+
+/*
+ * Post the slot information on disk into our slot_info struct.
+ * Must be protected by osb_lock.
+ */
+static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
+{
+	int i;
+	struct ocfs2_slot_map *sm;
+
+	sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
+
+	for (i = 0; i < si->si_num_slots; i++) {
+		if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
+			ocfs2_invalidate_slot(si, i);
+		else
+			ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
+	}
+}
+
+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+	/*
+	 * The slot data will have been refreshed when ocfs2_super_lock
+	 * was taken.
+	 */
+	if (si->si_extended)
+		ocfs2_update_slot_info_extended(si);
+	else
+		ocfs2_update_slot_info_old(si);
+}
+
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
+{
+	int ret;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	if (si == NULL)
+		return 0;
+
+	BUG_ON(si->si_blocks == 0);
+	BUG_ON(si->si_bh == NULL);
+
+	trace_ocfs2_refresh_slot_info(si->si_blocks);
+
+	/*
+	 * We pass -1 as blocknr because we expect all of si->si_bh to
+	 * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
+	 * this is not true, the read of -1 (UINT64_MAX) will fail.
+	 */
+	ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
+				si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
+	if (ret == 0) {
+		spin_lock(&osb->osb_lock);
+		ocfs2_update_slot_info(si);
+		spin_unlock(&osb->osb_lock);
+	}
+
+	return ret;
+}
+
+/* post the our slot info stuff into it's destination bh and write it
+ * out. */
+static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
+					    int slot_num,
+					    struct buffer_head **bh)
+{
+	int blkind = slot_num / si->si_slots_per_block;
+	int slotno = slot_num % si->si_slots_per_block;
+	struct ocfs2_slot_map_extended *se;
+
+	BUG_ON(blkind >= si->si_blocks);
+
+	se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
+	se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
+	if (si->si_slots[slot_num].sl_valid)
+		se->se_slots[slotno].es_node_num =
+			cpu_to_le32(si->si_slots[slot_num].sl_node_num);
+	*bh = si->si_bh[blkind];
+}
+
+static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
+				       int slot_num,
+				       struct buffer_head **bh)
+{
+	int i;
+	struct ocfs2_slot_map *sm;
+
+	sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
+	for (i = 0; i < si->si_num_slots; i++) {
+		if (si->si_slots[i].sl_valid)
+			sm->sm_slots[i] =
+				cpu_to_le16(si->si_slots[i].sl_node_num);
+		else
+			sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+	}
+	*bh = si->si_bh[0];
+}
+
+static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
+				  struct ocfs2_slot_info *si,
+				  int slot_num)
+{
+	int status;
+	struct buffer_head *bh;
+
+	spin_lock(&osb->osb_lock);
+	if (si->si_extended)
+		ocfs2_update_disk_slot_extended(si, slot_num, &bh);
+	else
+		ocfs2_update_disk_slot_old(si, slot_num, &bh);
+	spin_unlock(&osb->osb_lock);
+
+	status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * Calculate how many bytes are needed by the slot map.  Returns
+ * an error if the slot map file is too small.
+ */
+static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
+					struct inode *inode,
+					unsigned long long *bytes)
+{
+	unsigned long long bytes_needed;
+
+	if (ocfs2_uses_extended_slot_map(osb)) {
+		bytes_needed = osb->max_slots *
+			sizeof(struct ocfs2_extended_slot);
+	} else {
+		bytes_needed = osb->max_slots * sizeof(__le16);
+	}
+	if (bytes_needed > i_size_read(inode)) {
+		mlog(ML_ERROR,
+		     "Slot map file is too small!  (size %llu, needed %llu)\n",
+		     i_size_read(inode), bytes_needed);
+		return -ENOSPC;
+	}
+
+	*bytes = bytes_needed;
+	return 0;
+}
+
+/* try to find global node in the slot info. Returns -ENOENT
+ * if nothing is found. */
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    unsigned int node_num)
+{
+	int i, ret = -ENOENT;
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (si->si_slots[i].sl_valid &&
+		    (node_num == si->si_slots[i].sl_node_num)) {
+			ret = i;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+				   int preferred)
+{
+	int i, ret = -ENOSPC;
+
+	if ((preferred >= 0) && (preferred < si->si_num_slots)) {
+		if (!si->si_slots[preferred].sl_valid) {
+			ret = preferred;
+			goto out;
+		}
+	}
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (!si->si_slots[i].sl_valid) {
+			ret = i;
+			break;
+		}
+	}
+out:
+	return ret;
+}
+
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
+{
+	int slot;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	spin_lock(&osb->osb_lock);
+	slot = __ocfs2_node_num_to_slot(si, node_num);
+	spin_unlock(&osb->osb_lock);
+
+	return slot;
+}
+
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+				  unsigned int *node_num)
+{
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	assert_spin_locked(&osb->osb_lock);
+
+	BUG_ON(slot_num < 0);
+	BUG_ON(slot_num >= osb->max_slots);
+
+	if (!si->si_slots[slot_num].sl_valid)
+		return -ENOENT;
+
+	*node_num = si->si_slots[slot_num].sl_node_num;
+	return 0;
+}
+
+static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+{
+	unsigned int i;
+
+	if (si == NULL)
+		return;
+
+	if (si->si_inode)
+		iput(si->si_inode);
+	if (si->si_bh) {
+		for (i = 0; i < si->si_blocks; i++) {
+			if (si->si_bh[i]) {
+				brelse(si->si_bh[i]);
+				si->si_bh[i] = NULL;
+			}
+		}
+		kfree(si->si_bh);
+	}
+
+	kfree(si);
+}
+
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
+{
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	if (si == NULL)
+		return 0;
+
+	spin_lock(&osb->osb_lock);
+	ocfs2_invalidate_slot(si, slot_num);
+	spin_unlock(&osb->osb_lock);
+
+	return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
+}
+
+static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
+				  struct ocfs2_slot_info *si)
+{
+	int status = 0;
+	u64 blkno;
+	unsigned long long blocks, bytes = 0;
+	unsigned int i;
+	struct buffer_head *bh;
+
+	status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
+	if (status)
+		goto bail;
+
+	blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
+	BUG_ON(blocks > UINT_MAX);
+	si->si_blocks = blocks;
+	if (!si->si_blocks)
+		goto bail;
+
+	if (si->si_extended)
+		si->si_slots_per_block =
+			(osb->sb->s_blocksize /
+			 sizeof(struct ocfs2_extended_slot));
+	else
+		si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
+
+	/* The size checks above should ensure this */
+	BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
+
+	trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
+
+	si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *),
+			    GFP_KERNEL);
+	if (!si->si_bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	for (i = 0; i < si->si_blocks; i++) {
+		status = ocfs2_extent_map_get_blocks(si->si_inode, i,
+						     &blkno, NULL, NULL);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
+
+		bh = NULL;  /* Acquire a fresh bh */
+		status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
+					   1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		si->si_bh[i] = bh;
+	}
+
+bail:
+	return status;
+}
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct ocfs2_slot_info *si;
+
+	si = kzalloc(sizeof(struct ocfs2_slot_info) +
+		     (sizeof(struct ocfs2_slot) * osb->max_slots),
+		     GFP_KERNEL);
+	if (!si) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		return status;
+	}
+
+	si->si_extended = ocfs2_uses_extended_slot_map(osb);
+	si->si_num_slots = osb->max_slots;
+	si->si_slots = (struct ocfs2_slot *)((char *)si +
+					     sizeof(struct ocfs2_slot_info));
+
+	inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	si->si_inode = inode;
+	status = ocfs2_map_slot_buffers(osb, si);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->slot_info = (struct ocfs2_slot_info *)si;
+bail:
+	if (status < 0)
+		__ocfs2_free_slot_info(si);
+
+	return status;
+}
+
+void ocfs2_free_slot_info(struct ocfs2_super *osb)
+{
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	osb->slot_info = NULL;
+	__ocfs2_free_slot_info(si);
+}
+
+int ocfs2_find_slot(struct ocfs2_super *osb)
+{
+	int status;
+	int slot;
+	struct ocfs2_slot_info *si;
+
+	si = osb->slot_info;
+
+	spin_lock(&osb->osb_lock);
+	ocfs2_update_slot_info(si);
+
+	/* search for ourselves first and take the slot if it already
+	 * exists. Perhaps we need to mark this in a variable for our
+	 * own journal recovery? Possibly not, though we certainly
+	 * need to warn to the user */
+	slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+	if (slot < 0) {
+		/* if no slot yet, then just take 1st available
+		 * one. */
+		slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
+		if (slot < 0) {
+			spin_unlock(&osb->osb_lock);
+			mlog(ML_ERROR, "no free slots available!\n");
+			status = -EINVAL;
+			goto bail;
+		}
+	} else
+		printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+		       "allocated to this node!\n", slot, osb->dev_str);
+
+	ocfs2_set_slot(si, slot, osb->node_num);
+	osb->slot_num = slot;
+	spin_unlock(&osb->osb_lock);
+
+	trace_ocfs2_find_slot(osb->slot_num);
+
+	status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	return status;
+}
+
+void ocfs2_put_slot(struct ocfs2_super *osb)
+{
+	int status, slot_num;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	if (!si)
+		return;
+
+	spin_lock(&osb->osb_lock);
+	ocfs2_update_slot_info(si);
+
+	slot_num = osb->slot_num;
+	ocfs2_invalidate_slot(si, osb->slot_num);
+	osb->slot_num = OCFS2_INVALID_SLOT;
+	spin_unlock(&osb->osb_lock);
+
+	status = ocfs2_update_disk_slot(osb, si, slot_num);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	ocfs2_free_slot_info(osb);
+}
+
diff --git a/kernel/fs/ocfs2/slot_map.h b/kernel/fs/ocfs2/slot_map.h
new file mode 100644
index 000000000..601c95fd7
--- /dev/null
+++ b/kernel/fs/ocfs2/slot_map.h
@@ -0,0 +1,44 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slotmap.h
+ *
+ * description here
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef SLOTMAP_H
+#define SLOTMAP_H
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb);
+void ocfs2_free_slot_info(struct ocfs2_super *osb);
+
+int ocfs2_find_slot(struct ocfs2_super *osb);
+void ocfs2_put_slot(struct ocfs2_super *osb);
+
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
+
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+				  unsigned int *node_num);
+
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
+
+#endif
diff --git a/kernel/fs/ocfs2/stack_o2cb.c b/kernel/fs/ocfs2/stack_o2cb.c
new file mode 100644
index 000000000..220cae7bb
--- /dev/null
+++ b/kernel/fs/ocfs2/stack_o2cb.c
@@ -0,0 +1,449 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_o2cb.c
+ *
+ * Code which interfaces ocfs2 with the o2cb stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
+#include <linux/fs.h>
+
+#include "cluster/masklog.h"
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "stackglue.h"
+
+struct o2dlm_private {
+	struct dlm_eviction_cb op_eviction_cb;
+};
+
+static struct ocfs2_stack_plugin o2cb_stack;
+
+/* These should be identical */
+#if (DLM_LOCK_IV != LKM_IVMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_NL != LKM_NLMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CR != LKM_CRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CW != LKM_CWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PR != LKM_PRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PW != LKM_PWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_EX != LKM_EXMODE)
+# error Lock modes do not match
+#endif
+static inline int mode_to_o2dlm(int mode)
+{
+	BUG_ON(mode > LKM_MAXMODE);
+
+	return mode;
+}
+
+#define map_flag(_generic, _o2dlm)		\
+	if (flags & (_generic)) {		\
+		flags &= ~(_generic);		\
+		o2dlm_flags |= (_o2dlm);	\
+	}
+static int flags_to_o2dlm(u32 flags)
+{
+	int o2dlm_flags = 0;
+
+	map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
+	map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
+	map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
+	map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
+	map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
+	map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
+	map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
+	map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
+	map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
+
+	/* map_flag() should have cleared every flag passed in */
+	BUG_ON(flags != 0);
+
+	return o2dlm_flags;
+}
+#undef map_flag
+
+/*
+ * Map an o2dlm status to standard errno values.
+ *
+ * o2dlm only uses a handful of these, and returns even fewer to the
+ * caller. Still, we try to assign sane values to each error.
+ *
+ * The following value pairs have special meanings to dlmglue, thus
+ * the right hand side needs to stay unique - never duplicate the
+ * mapping elsewhere in the table!
+ *
+ * DLM_NORMAL:		0
+ * DLM_NOTQUEUED:	-EAGAIN
+ * DLM_CANCELGRANT:	-EBUSY
+ * DLM_CANCEL:		-DLM_ECANCEL
+ */
+/* Keep in sync with dlmapi.h */
+static int status_map[] = {
+	[DLM_NORMAL]			= 0,		/* Success */
+	[DLM_GRANTED]			= -EINVAL,
+	[DLM_DENIED]			= -EACCES,
+	[DLM_DENIED_NOLOCKS]		= -EACCES,
+	[DLM_WORKING]			= -EACCES,
+	[DLM_BLOCKED]			= -EINVAL,
+	[DLM_BLOCKED_ORPHAN]		= -EINVAL,
+	[DLM_DENIED_GRACE_PERIOD]	= -EACCES,
+	[DLM_SYSERR]			= -ENOMEM,	/* It is what it is */
+	[DLM_NOSUPPORT]			= -EPROTO,
+	[DLM_CANCELGRANT]		= -EBUSY,	/* Cancel after grant */
+	[DLM_IVLOCKID]			= -EINVAL,
+	[DLM_SYNC]			= -EINVAL,
+	[DLM_BADTYPE]			= -EINVAL,
+	[DLM_BADRESOURCE]		= -EINVAL,
+	[DLM_MAXHANDLES]		= -ENOMEM,
+	[DLM_NOCLINFO]			= -EINVAL,
+	[DLM_NOLOCKMGR]			= -EINVAL,
+	[DLM_NOPURGED]			= -EINVAL,
+	[DLM_BADARGS]			= -EINVAL,
+	[DLM_VOID]			= -EINVAL,
+	[DLM_NOTQUEUED]			= -EAGAIN,	/* Trylock failed */
+	[DLM_IVBUFLEN]			= -EINVAL,
+	[DLM_CVTUNGRANT]		= -EPERM,
+	[DLM_BADPARAM]			= -EINVAL,
+	[DLM_VALNOTVALID]		= -EINVAL,
+	[DLM_REJECTED]			= -EPERM,
+	[DLM_ABORT]			= -EINVAL,
+	[DLM_CANCEL]			= -DLM_ECANCEL,	/* Successful cancel */
+	[DLM_IVRESHANDLE]		= -EINVAL,
+	[DLM_DEADLOCK]			= -EDEADLK,
+	[DLM_DENIED_NOASTS]		= -EINVAL,
+	[DLM_FORWARD]			= -EINVAL,
+	[DLM_TIMEOUT]			= -ETIMEDOUT,
+	[DLM_IVGROUPID]			= -EINVAL,
+	[DLM_VERS_CONFLICT]		= -EOPNOTSUPP,
+	[DLM_BAD_DEVICE_PATH]		= -ENOENT,
+	[DLM_NO_DEVICE_PERMISSION]	= -EPERM,
+	[DLM_NO_CONTROL_DEVICE]		= -ENOENT,
+	[DLM_RECOVERING]		= -ENOTCONN,
+	[DLM_MIGRATING]			= -ERESTART,
+	[DLM_MAXSTATS]			= -EINVAL,
+};
+
+static int dlm_status_to_errno(enum dlm_status status)
+{
+	BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
+
+	return status_map[status];
+}
+
+static void o2dlm_lock_ast_wrapper(void *astarg)
+{
+	struct ocfs2_dlm_lksb *lksb = astarg;
+
+	lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
+}
+
+static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
+{
+	struct ocfs2_dlm_lksb *lksb = astarg;
+
+	lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
+}
+
+static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
+{
+	struct ocfs2_dlm_lksb *lksb = astarg;
+	int error = dlm_status_to_errno(status);
+
+	/*
+	 * In o2dlm, you can get both the lock_ast() for the lock being
+	 * granted and the unlock_ast() for the CANCEL failing.  A
+	 * successful cancel sends DLM_NORMAL here.  If the
+	 * lock grant happened before the cancel arrived, you get
+	 * DLM_CANCELGRANT.
+	 *
+	 * There's no need for the double-ast.  If we see DLM_CANCELGRANT,
+	 * we just ignore it.  We expect the lock_ast() to handle the
+	 * granted lock.
+	 */
+	if (status == DLM_CANCELGRANT)
+		return;
+
+	lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
+}
+
+static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
+			 int mode,
+			 struct ocfs2_dlm_lksb *lksb,
+			 u32 flags,
+			 void *name,
+			 unsigned int namelen)
+{
+	enum dlm_status status;
+	int o2dlm_mode = mode_to_o2dlm(mode);
+	int o2dlm_flags = flags_to_o2dlm(flags);
+	int ret;
+
+	status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
+			 o2dlm_flags, name, namelen,
+			 o2dlm_lock_ast_wrapper, lksb,
+			 o2dlm_blocking_ast_wrapper);
+	ret = dlm_status_to_errno(status);
+	return ret;
+}
+
+static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
+			   struct ocfs2_dlm_lksb *lksb,
+			   u32 flags)
+{
+	enum dlm_status status;
+	int o2dlm_flags = flags_to_o2dlm(flags);
+	int ret;
+
+	status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
+			   o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
+	ret = dlm_status_to_errno(status);
+	return ret;
+}
+
+static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
+{
+	return dlm_status_to_errno(lksb->lksb_o2dlm.status);
+}
+
+/*
+ * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB
+ * contents, it will zero out the LVB.  Thus the caller can always trust
+ * the contents.
+ */
+static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+	return 1;
+}
+
+static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
+{
+	return (void *)(lksb->lksb_o2dlm.lvb);
+}
+
+static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
+{
+	dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
+}
+
+/*
+ * Check if this node is heartbeating and is connected to all other
+ * heartbeating nodes.
+ */
+static int o2cb_cluster_check(void)
+{
+	u8 node_num;
+	int i;
+	unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	node_num = o2nm_this_node();
+	if (node_num == O2NM_MAX_NODES) {
+		printk(KERN_ERR "o2cb: This node has not been configured.\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * o2dlm expects o2net sockets to be created. If not, then
+	 * dlm_join_domain() fails with a stack of errors which are both cryptic
+	 * and incomplete. The idea here is to detect upfront whether we have
+	 * managed to connect to all nodes or not. If not, then list the nodes
+	 * to allow the user to check the configuration (incorrect IP, firewall,
+	 * etc.) Yes, this is racy. But its not the end of the world.
+	 */
+#define	O2CB_MAP_STABILIZE_COUNT	60
+	for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
+		o2hb_fill_node_map(hbmap, sizeof(hbmap));
+		if (!test_bit(node_num, hbmap)) {
+			printk(KERN_ERR "o2cb: %s heartbeat has not been "
+			       "started.\n", (o2hb_global_heartbeat_active() ?
+					      "Global" : "Local"));
+			return -EINVAL;
+		}
+		o2net_fill_node_map(netmap, sizeof(netmap));
+		/* Force set the current node to allow easy compare */
+		set_bit(node_num, netmap);
+		if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+			return 0;
+		if (i < O2CB_MAP_STABILIZE_COUNT - 1)
+			msleep(1000);
+	}
+
+	printk(KERN_ERR "o2cb: This node could not connect to nodes:");
+	i = -1;
+	while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES) {
+		if (!test_bit(i, netmap))
+			printk(" %u", i);
+	}
+	printk(".\n");
+
+	return -ENOTCONN;
+}
+
+/*
+ * Called from the dlm when it's about to evict a node. This is how the
+ * classic stack signals node death.
+ */
+static void o2dlm_eviction_cb(int node_num, void *data)
+{
+	struct ocfs2_cluster_connection *conn = data;
+
+	printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
+	       node_num, conn->cc_namelen, conn->cc_name);
+
+	conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
+}
+
+static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+	int rc = 0;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+	struct o2dlm_private *priv;
+	struct dlm_protocol_version fs_version;
+
+	BUG_ON(conn == NULL);
+	BUG_ON(conn->cc_proto == NULL);
+
+	/* Ensure cluster stack is up and all nodes are connected */
+	rc = o2cb_cluster_check();
+	if (rc) {
+		printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
+		       "before retrying.\n");
+		goto out;
+	}
+
+	priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
+	if (!priv) {
+		rc = -ENOMEM;
+		goto out_free;
+	}
+
+	/* This just fills the structure in.  It is safe to pass conn. */
+	dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
+			      conn);
+
+	conn->cc_private = priv;
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
+	fs_version.pv_major = conn->cc_version.pv_major;
+	fs_version.pv_minor = conn->cc_version.pv_minor;
+
+	dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
+	if (IS_ERR(dlm)) {
+		rc = PTR_ERR(dlm);
+		mlog_errno(rc);
+		goto out_free;
+	}
+
+	conn->cc_version.pv_major = fs_version.pv_major;
+	conn->cc_version.pv_minor = fs_version.pv_minor;
+	conn->cc_lockspace = dlm;
+
+	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
+
+out_free:
+	if (rc)
+		kfree(conn->cc_private);
+
+out:
+	return rc;
+}
+
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+	struct dlm_ctxt *dlm = conn->cc_lockspace;
+	struct o2dlm_private *priv = conn->cc_private;
+
+	dlm_unregister_eviction_cb(&priv->op_eviction_cb);
+	conn->cc_private = NULL;
+	kfree(priv);
+
+	dlm_unregister_domain(dlm);
+	conn->cc_lockspace = NULL;
+
+	return 0;
+}
+
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+				  unsigned int *node)
+{
+	int node_num;
+
+	node_num = o2nm_this_node();
+	if (node_num == O2NM_INVALID_NODE_NUM)
+		return -ENOENT;
+
+	if (node_num >= O2NM_MAX_NODES)
+		return -EOVERFLOW;
+
+	*node = node_num;
+	return 0;
+}
+
+static struct ocfs2_stack_operations o2cb_stack_ops = {
+	.connect	= o2cb_cluster_connect,
+	.disconnect	= o2cb_cluster_disconnect,
+	.this_node	= o2cb_cluster_this_node,
+	.dlm_lock	= o2cb_dlm_lock,
+	.dlm_unlock	= o2cb_dlm_unlock,
+	.lock_status	= o2cb_dlm_lock_status,
+	.lvb_valid	= o2cb_dlm_lvb_valid,
+	.lock_lvb	= o2cb_dlm_lvb,
+	.dump_lksb	= o2cb_dump_lksb,
+};
+
+static struct ocfs2_stack_plugin o2cb_stack = {
+	.sp_name	= "o2cb",
+	.sp_ops		= &o2cb_stack_ops,
+	.sp_owner	= THIS_MODULE,
+};
+
+static int __init o2cb_stack_init(void)
+{
+	return ocfs2_stack_glue_register(&o2cb_stack);
+}
+
+static void __exit o2cb_stack_exit(void)
+{
+	ocfs2_stack_glue_unregister(&o2cb_stack);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
+MODULE_LICENSE("GPL");
+module_init(o2cb_stack_init);
+module_exit(o2cb_stack_exit);
diff --git a/kernel/fs/ocfs2/stack_user.c b/kernel/fs/ocfs2/stack_user.c
new file mode 100644
index 000000000..2768eb1da
--- /dev/null
+++ b/kernel/fs/ocfs2/stack_user.c
@@ -0,0 +1,1134 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_user.c
+ *
+ * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+#include "stackglue.h"
+
+#include <linux/dlm_plock.h>
+
+/*
+ * The control protocol starts with a handshake.  Until the handshake
+ * is complete, the control device will fail all write(2)s.
+ *
+ * The handshake is simple.  First, the client reads until EOF.  Each line
+ * of output is a supported protocol tag.  All protocol tags are a single
+ * character followed by a two hex digit version number.  Currently the
+ * only things supported is T01, for "Text-base version 0x01".  Next, the
+ * client writes the version they would like to use, including the newline.
+ * Thus, the protocol tag is 'T01\n'.  If the version tag written is
+ * unknown, -EINVAL is returned.  Once the negotiation is complete, the
+ * client can start sending messages.
+ *
+ * The T01 protocol has three messages.  First is the "SETN" message.
+ * It has the following syntax:
+ *
+ *  SETN<space><8-char-hex-nodenum><newline>
+ *
+ * This is 14 characters.
+ *
+ * The "SETN" message must be the first message following the protocol.
+ * It tells ocfs2_control the local node number.
+ *
+ * Next comes the "SETV" message.  It has the following syntax:
+ *
+ *  SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
+ *
+ * This is 11 characters.
+ *
+ * The "SETV" message sets the filesystem locking protocol version as
+ * negotiated by the client.  The client negotiates based on the maximum
+ * version advertised in /sys/fs/ocfs2/max_locking_protocol.  The major
+ * number from the "SETV" message must match
+ * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
+ * must be less than or equal to ...sp_max_version.pv_minor.
+ *
+ * Once this information has been set, mounts will be allowed.  From this
+ * point on, the "DOWN" message can be sent for node down notification.
+ * It has the following syntax:
+ *
+ *  DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
+ *
+ * eg:
+ *
+ *  DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
+ *
+ * This is 47 characters.
+ */
+
+/*
+ * Whether or not the client has done the handshake.
+ * For now, we have just one protocol version.
+ */
+#define OCFS2_CONTROL_PROTO			"T01\n"
+#define OCFS2_CONTROL_PROTO_LEN			4
+
+/* Handshake states */
+#define OCFS2_CONTROL_HANDSHAKE_INVALID		(0)
+#define OCFS2_CONTROL_HANDSHAKE_READ		(1)
+#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL	(2)
+#define OCFS2_CONTROL_HANDSHAKE_VALID		(3)
+
+/* Messages */
+#define OCFS2_CONTROL_MESSAGE_OP_LEN		4
+#define OCFS2_CONTROL_MESSAGE_SETNODE_OP	"SETN"
+#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN	14
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP	"SETV"
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN	11
+#define OCFS2_CONTROL_MESSAGE_DOWN_OP		"DOWN"
+#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN	47
+#define OCFS2_TEXT_UUID_LEN			32
+#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN	2
+#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN	8
+#define VERSION_LOCK				"version_lock"
+
+enum ocfs2_connection_type {
+	WITH_CONTROLD,
+	NO_CONTROLD
+};
+
+/*
+ * ocfs2_live_connection is refcounted because the filesystem and
+ * miscdevice sides can detach in different order.  Let's just be safe.
+ */
+struct ocfs2_live_connection {
+	struct list_head		oc_list;
+	struct ocfs2_cluster_connection	*oc_conn;
+	enum ocfs2_connection_type	oc_type;
+	atomic_t                        oc_this_node;
+	int                             oc_our_slot;
+	struct dlm_lksb                 oc_version_lksb;
+	char                            oc_lvb[DLM_LVB_LEN];
+	struct completion               oc_sync_wait;
+	wait_queue_head_t		oc_wait;
+};
+
+struct ocfs2_control_private {
+	struct list_head op_list;
+	int op_state;
+	int op_this_node;
+	struct ocfs2_protocol_version op_proto;
+};
+
+/* SETN<space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_setn {
+	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+	char	space;
+	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+	char	newline;
+};
+
+/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
+struct ocfs2_control_message_setv {
+	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+	char	space1;
+	char	major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+	char	space2;
+	char	minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+	char	newline;
+};
+
+/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_down {
+	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+	char	space1;
+	char	uuid[OCFS2_TEXT_UUID_LEN];
+	char	space2;
+	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+	char	newline;
+};
+
+union ocfs2_control_message {
+	char					tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+	struct ocfs2_control_message_setn	u_setn;
+	struct ocfs2_control_message_setv	u_setv;
+	struct ocfs2_control_message_down	u_down;
+};
+
+static struct ocfs2_stack_plugin ocfs2_user_plugin;
+
+static atomic_t ocfs2_control_opened;
+static int ocfs2_control_this_node = -1;
+static struct ocfs2_protocol_version running_proto;
+
+static LIST_HEAD(ocfs2_live_connection_list);
+static LIST_HEAD(ocfs2_control_private_list);
+static DEFINE_MUTEX(ocfs2_control_lock);
+
+static inline void ocfs2_control_set_handshake_state(struct file *file,
+						     int state)
+{
+	struct ocfs2_control_private *p = file->private_data;
+	p->op_state = state;
+}
+
+static inline int ocfs2_control_get_handshake_state(struct file *file)
+{
+	struct ocfs2_control_private *p = file->private_data;
+	return p->op_state;
+}
+
+static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
+{
+	size_t len = strlen(name);
+	struct ocfs2_live_connection *c;
+
+	BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
+
+	list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
+		if ((c->oc_conn->cc_namelen == len) &&
+		    !strncmp(c->oc_conn->cc_name, name, len))
+			return c;
+	}
+
+	return NULL;
+}
+
+/*
+ * ocfs2_live_connection structures are created underneath the ocfs2
+ * mount path.  Since the VFS prevents multiple calls to
+ * fill_super(), we can't get dupes here.
+ */
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+				     struct ocfs2_live_connection *c)
+{
+	int rc = 0;
+
+	mutex_lock(&ocfs2_control_lock);
+	c->oc_conn = conn;
+
+	if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
+		list_add(&c->oc_list, &ocfs2_live_connection_list);
+	else {
+		printk(KERN_ERR
+		       "ocfs2: Userspace control daemon is not present\n");
+		rc = -ESRCH;
+	}
+
+	mutex_unlock(&ocfs2_control_lock);
+	return rc;
+}
+
+/*
+ * This function disconnects the cluster connection from ocfs2_control.
+ * Afterwards, userspace can't affect the cluster connection.
+ */
+static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
+{
+	mutex_lock(&ocfs2_control_lock);
+	list_del_init(&c->oc_list);
+	c->oc_conn = NULL;
+	mutex_unlock(&ocfs2_control_lock);
+
+	kfree(c);
+}
+
+static int ocfs2_control_cfu(void *target, size_t target_len,
+			     const char __user *buf, size_t count)
+{
+	/* The T01 expects write(2) calls to have exactly one command */
+	if ((count != target_len) ||
+	    (count > sizeof(union ocfs2_control_message)))
+		return -EINVAL;
+
+	if (copy_from_user(target, buf, target_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+static ssize_t ocfs2_control_validate_protocol(struct file *file,
+					       const char __user *buf,
+					       size_t count)
+{
+	ssize_t ret;
+	char kbuf[OCFS2_CONTROL_PROTO_LEN];
+
+	ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
+				buf, count);
+	if (ret)
+		return ret;
+
+	if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
+		return -EINVAL;
+
+	ocfs2_control_set_handshake_state(file,
+					  OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+
+	return count;
+}
+
+static void ocfs2_control_send_down(const char *uuid,
+				    int nodenum)
+{
+	struct ocfs2_live_connection *c;
+
+	mutex_lock(&ocfs2_control_lock);
+
+	c = ocfs2_connection_find(uuid);
+	if (c) {
+		BUG_ON(c->oc_conn == NULL);
+		c->oc_conn->cc_recovery_handler(nodenum,
+						c->oc_conn->cc_recovery_data);
+	}
+
+	mutex_unlock(&ocfs2_control_lock);
+}
+
+/*
+ * Called whenever configuration elements are sent to /dev/ocfs2_control.
+ * If all configuration elements are present, try to set the global
+ * values.  If there is a problem, return an error.  Skip any missing
+ * elements, and only bump ocfs2_control_opened when we have all elements
+ * and are successful.
+ */
+static int ocfs2_control_install_private(struct file *file)
+{
+	int rc = 0;
+	int set_p = 1;
+	struct ocfs2_control_private *p = file->private_data;
+
+	BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+
+	mutex_lock(&ocfs2_control_lock);
+
+	if (p->op_this_node < 0) {
+		set_p = 0;
+	} else if ((ocfs2_control_this_node >= 0) &&
+		   (ocfs2_control_this_node != p->op_this_node)) {
+		rc = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (!p->op_proto.pv_major) {
+		set_p = 0;
+	} else if (!list_empty(&ocfs2_live_connection_list) &&
+		   ((running_proto.pv_major != p->op_proto.pv_major) ||
+		    (running_proto.pv_minor != p->op_proto.pv_minor))) {
+		rc = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (set_p) {
+		ocfs2_control_this_node = p->op_this_node;
+		running_proto.pv_major = p->op_proto.pv_major;
+		running_proto.pv_minor = p->op_proto.pv_minor;
+	}
+
+out_unlock:
+	mutex_unlock(&ocfs2_control_lock);
+
+	if (!rc && set_p) {
+		/* We set the global values successfully */
+		atomic_inc(&ocfs2_control_opened);
+		ocfs2_control_set_handshake_state(file,
+					OCFS2_CONTROL_HANDSHAKE_VALID);
+	}
+
+	return rc;
+}
+
+static int ocfs2_control_get_this_node(void)
+{
+	int rc;
+
+	mutex_lock(&ocfs2_control_lock);
+	if (ocfs2_control_this_node < 0)
+		rc = -EINVAL;
+	else
+		rc = ocfs2_control_this_node;
+	mutex_unlock(&ocfs2_control_lock);
+
+	return rc;
+}
+
+static int ocfs2_control_do_setnode_msg(struct file *file,
+					struct ocfs2_control_message_setn *msg)
+{
+	long nodenum;
+	char *ptr = NULL;
+	struct ocfs2_control_private *p = file->private_data;
+
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+		return -EINVAL;
+
+	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+		    OCFS2_CONTROL_MESSAGE_OP_LEN))
+		return -EINVAL;
+
+	if ((msg->space != ' ') || (msg->newline != '\n'))
+		return -EINVAL;
+	msg->space = msg->newline = '\0';
+
+	nodenum = simple_strtol(msg->nodestr, &ptr, 16);
+	if (!ptr || *ptr)
+		return -EINVAL;
+
+	if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+	    (nodenum > INT_MAX) || (nodenum < 0))
+		return -ERANGE;
+	p->op_this_node = nodenum;
+
+	return ocfs2_control_install_private(file);
+}
+
+static int ocfs2_control_do_setversion_msg(struct file *file,
+					   struct ocfs2_control_message_setv *msg)
+ {
+	long major, minor;
+	char *ptr = NULL;
+	struct ocfs2_control_private *p = file->private_data;
+	struct ocfs2_protocol_version *max =
+		&ocfs2_user_plugin.sp_max_proto;
+
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+		return -EINVAL;
+
+	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+		    OCFS2_CONTROL_MESSAGE_OP_LEN))
+		return -EINVAL;
+
+	if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+	    (msg->newline != '\n'))
+		return -EINVAL;
+	msg->space1 = msg->space2 = msg->newline = '\0';
+
+	major = simple_strtol(msg->major, &ptr, 16);
+	if (!ptr || *ptr)
+		return -EINVAL;
+	minor = simple_strtol(msg->minor, &ptr, 16);
+	if (!ptr || *ptr)
+		return -EINVAL;
+
+	/*
+	 * The major must be between 1 and 255, inclusive.  The minor
+	 * must be between 0 and 255, inclusive.  The version passed in
+	 * must be within the maximum version supported by the filesystem.
+	 */
+	if ((major == LONG_MIN) || (major == LONG_MAX) ||
+	    (major > (u8)-1) || (major < 1))
+		return -ERANGE;
+	if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
+	    (minor > (u8)-1) || (minor < 0))
+		return -ERANGE;
+	if ((major != max->pv_major) ||
+	    (minor > max->pv_minor))
+		return -EINVAL;
+
+	p->op_proto.pv_major = major;
+	p->op_proto.pv_minor = minor;
+
+	return ocfs2_control_install_private(file);
+}
+
+static int ocfs2_control_do_down_msg(struct file *file,
+				     struct ocfs2_control_message_down *msg)
+{
+	long nodenum;
+	char *p = NULL;
+
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_VALID)
+		return -EINVAL;
+
+	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+		    OCFS2_CONTROL_MESSAGE_OP_LEN))
+		return -EINVAL;
+
+	if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+	    (msg->newline != '\n'))
+		return -EINVAL;
+	msg->space1 = msg->space2 = msg->newline = '\0';
+
+	nodenum = simple_strtol(msg->nodestr, &p, 16);
+	if (!p || *p)
+		return -EINVAL;
+
+	if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+	    (nodenum > INT_MAX) || (nodenum < 0))
+		return -ERANGE;
+
+	ocfs2_control_send_down(msg->uuid, nodenum);
+
+	return 0;
+}
+
+static ssize_t ocfs2_control_message(struct file *file,
+				     const char __user *buf,
+				     size_t count)
+{
+	ssize_t ret;
+	union ocfs2_control_message msg;
+
+	/* Try to catch padding issues */
+	WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
+		(sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
+
+	memset(&msg, 0, sizeof(union ocfs2_control_message));
+	ret = ocfs2_control_cfu(&msg, count, buf, count);
+	if (ret)
+		goto out;
+
+	if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
+	    !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+		     OCFS2_CONTROL_MESSAGE_OP_LEN))
+		ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
+	else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
+		 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+			  OCFS2_CONTROL_MESSAGE_OP_LEN))
+		ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
+	else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
+		 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+			  OCFS2_CONTROL_MESSAGE_OP_LEN))
+		ret = ocfs2_control_do_down_msg(file, &msg.u_down);
+	else
+		ret = -EINVAL;
+
+out:
+	return ret ? ret : count;
+}
+
+static ssize_t ocfs2_control_write(struct file *file,
+				   const char __user *buf,
+				   size_t count,
+				   loff_t *ppos)
+{
+	ssize_t ret;
+
+	switch (ocfs2_control_get_handshake_state(file)) {
+		case OCFS2_CONTROL_HANDSHAKE_INVALID:
+			ret = -EINVAL;
+			break;
+
+		case OCFS2_CONTROL_HANDSHAKE_READ:
+			ret = ocfs2_control_validate_protocol(file, buf,
+							      count);
+			break;
+
+		case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
+		case OCFS2_CONTROL_HANDSHAKE_VALID:
+			ret = ocfs2_control_message(file, buf, count);
+			break;
+
+		default:
+			BUG();
+			ret = -EIO;
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * This is a naive version.  If we ever have a new protocol, we'll expand
+ * it.  Probably using seq_file.
+ */
+static ssize_t ocfs2_control_read(struct file *file,
+				  char __user *buf,
+				  size_t count,
+				  loff_t *ppos)
+{
+	ssize_t ret;
+
+	ret = simple_read_from_buffer(buf, count, ppos,
+			OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN);
+
+	/* Have we read the whole protocol list? */
+	if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN)
+		ocfs2_control_set_handshake_state(file,
+						  OCFS2_CONTROL_HANDSHAKE_READ);
+
+	return ret;
+}
+
+static int ocfs2_control_release(struct inode *inode, struct file *file)
+{
+	struct ocfs2_control_private *p = file->private_data;
+
+	mutex_lock(&ocfs2_control_lock);
+
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_VALID)
+		goto out;
+
+	if (atomic_dec_and_test(&ocfs2_control_opened)) {
+		if (!list_empty(&ocfs2_live_connection_list)) {
+			/* XXX: Do bad things! */
+			printk(KERN_ERR
+			       "ocfs2: Unexpected release of ocfs2_control!\n"
+			       "       Loss of cluster connection requires "
+			       "an emergency restart!\n");
+			emergency_restart();
+		}
+		/*
+		 * Last valid close clears the node number and resets
+		 * the locking protocol version
+		 */
+		ocfs2_control_this_node = -1;
+		running_proto.pv_major = 0;
+		running_proto.pv_minor = 0;
+	}
+
+out:
+	list_del_init(&p->op_list);
+	file->private_data = NULL;
+
+	mutex_unlock(&ocfs2_control_lock);
+
+	kfree(p);
+
+	return 0;
+}
+
+static int ocfs2_control_open(struct inode *inode, struct file *file)
+{
+	struct ocfs2_control_private *p;
+
+	p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	p->op_this_node = -1;
+
+	mutex_lock(&ocfs2_control_lock);
+	file->private_data = p;
+	list_add(&p->op_list, &ocfs2_control_private_list);
+	mutex_unlock(&ocfs2_control_lock);
+
+	return 0;
+}
+
+static const struct file_operations ocfs2_control_fops = {
+	.open    = ocfs2_control_open,
+	.release = ocfs2_control_release,
+	.read    = ocfs2_control_read,
+	.write   = ocfs2_control_write,
+	.owner   = THIS_MODULE,
+	.llseek  = default_llseek,
+};
+
+static struct miscdevice ocfs2_control_device = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "ocfs2_control",
+	.fops		= &ocfs2_control_fops,
+};
+
+static int ocfs2_control_init(void)
+{
+	int rc;
+
+	atomic_set(&ocfs2_control_opened, 0);
+
+	rc = misc_register(&ocfs2_control_device);
+	if (rc)
+		printk(KERN_ERR
+		       "ocfs2: Unable to register ocfs2_control device "
+		       "(errno %d)\n",
+		       -rc);
+
+	return rc;
+}
+
+static void ocfs2_control_exit(void)
+{
+	int rc;
+
+	rc = misc_deregister(&ocfs2_control_device);
+	if (rc)
+		printk(KERN_ERR
+		       "ocfs2: Unable to deregister ocfs2_control device "
+		       "(errno %d)\n",
+		       -rc);
+}
+
+static void fsdlm_lock_ast_wrapper(void *astarg)
+{
+	struct ocfs2_dlm_lksb *lksb = astarg;
+	int status = lksb->lksb_fsdlm.sb_status;
+
+	/*
+	 * For now we're punting on the issue of other non-standard errors
+	 * where we can't tell if the unlock_ast or lock_ast should be called.
+	 * The main "other error" that's possible is EINVAL which means the
+	 * function was called with invalid args, which shouldn't be possible
+	 * since the caller here is under our control.  Other non-standard
+	 * errors probably fall into the same category, or otherwise are fatal
+	 * which means we can't carry on anyway.
+	 */
+
+	if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
+		lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
+	else
+		lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
+}
+
+static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
+{
+	struct ocfs2_dlm_lksb *lksb = astarg;
+
+	lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
+}
+
+static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
+			 int mode,
+			 struct ocfs2_dlm_lksb *lksb,
+			 u32 flags,
+			 void *name,
+			 unsigned int namelen)
+{
+	int ret;
+
+	if (!lksb->lksb_fsdlm.sb_lvbptr)
+		lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+					     sizeof(struct dlm_lksb);
+
+	ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+		       flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+		       fsdlm_lock_ast_wrapper, lksb,
+		       fsdlm_blocking_ast_wrapper);
+	return ret;
+}
+
+static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
+			   struct ocfs2_dlm_lksb *lksb,
+			   u32 flags)
+{
+	int ret;
+
+	ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+			 flags, &lksb->lksb_fsdlm, lksb);
+	return ret;
+}
+
+static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
+{
+	return lksb->lksb_fsdlm.sb_status;
+}
+
+static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+	int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
+
+	return !invalid;
+}
+
+static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
+{
+	if (!lksb->lksb_fsdlm.sb_lvbptr)
+		lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+					     sizeof(struct dlm_lksb);
+	return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
+}
+
+static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
+{
+}
+
+static int user_plock(struct ocfs2_cluster_connection *conn,
+		      u64 ino,
+		      struct file *file,
+		      int cmd,
+		      struct file_lock *fl)
+{
+	/*
+	 * This more or less just demuxes the plock request into any
+	 * one of three dlm calls.
+	 *
+	 * Internally, fs/dlm will pass these to a misc device, which
+	 * a userspace daemon will read and write to.
+	 *
+	 * For now, cancel requests (which happen internally only),
+	 * are turned into unlocks. Most of this function taken from
+	 * gfs2_lock.
+	 */
+
+	if (cmd == F_CANCELLK) {
+		cmd = F_SETLK;
+		fl->fl_type = F_UNLCK;
+	}
+
+	if (IS_GETLK(cmd))
+		return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
+	else if (fl->fl_type == F_UNLCK)
+		return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
+	else
+		return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
+}
+
+/*
+ * Compare a requested locking protocol version against the current one.
+ *
+ * If the major numbers are different, they are incompatible.
+ * If the current minor is greater than the request, they are incompatible.
+ * If the current minor is less than or equal to the request, they are
+ * compatible, and the requester should run at the current minor version.
+ */
+static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
+			       struct ocfs2_protocol_version *request)
+{
+	if (existing->pv_major != request->pv_major)
+		return 1;
+
+	if (existing->pv_minor > request->pv_minor)
+		return 1;
+
+	if (existing->pv_minor < request->pv_minor)
+		request->pv_minor = existing->pv_minor;
+
+	return 0;
+}
+
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+	struct ocfs2_protocol_version *pv =
+		(struct ocfs2_protocol_version *)lvb;
+	/*
+	 * ocfs2_protocol_version has two u8 variables, so we don't
+	 * need any endian conversion.
+	 */
+	ver->pv_major = pv->pv_major;
+	ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+	struct ocfs2_protocol_version *pv =
+		(struct ocfs2_protocol_version *)lvb;
+	/*
+	 * ocfs2_protocol_version has two u8 variables, so we don't
+	 * need any endian conversion.
+	 */
+	pv->pv_major = ver->pv_major;
+	pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+		struct dlm_lksb *lksb, char *name)
+{
+	int error;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+
+	error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+	if (error) {
+		printk(KERN_ERR "%s lkid %x error %d\n",
+				name, lksb->sb_lkid, error);
+		return error;
+	}
+
+	wait_for_completion(&lc->oc_sync_wait);
+
+	if (lksb->sb_status != -DLM_EUNLOCK) {
+		printk(KERN_ERR "%s lkid %x status %d\n",
+				name, lksb->sb_lkid, lksb->sb_status);
+		return -1;
+	}
+	return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+		int mode, uint32_t flags,
+		struct dlm_lksb *lksb, char *name)
+{
+	int error, status;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+
+	error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+			name, strlen(name),
+			0, sync_wait_cb, conn, NULL);
+	if (error) {
+		printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+				name, lksb->sb_lkid, flags, mode, error);
+		return error;
+	}
+
+	wait_for_completion(&lc->oc_sync_wait);
+
+	status = lksb->sb_status;
+
+	if (status && status != -EAGAIN) {
+		printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+				name, lksb->sb_lkid, flags, mode, status);
+	}
+
+	return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+		int flags)
+{
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	return sync_lock(conn, mode, flags,
+			&lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ *    version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ *    taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+	int ret;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	struct ocfs2_protocol_version pv;
+
+	running_proto.pv_major =
+		ocfs2_user_plugin.sp_max_proto.pv_major;
+	running_proto.pv_minor =
+		ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+	lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+	ret = version_lock(conn, DLM_LOCK_EX,
+			DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+	if (!ret) {
+		conn->cc_version.pv_major = running_proto.pv_major;
+		conn->cc_version.pv_minor = running_proto.pv_minor;
+		version_to_lvb(&running_proto, lc->oc_lvb);
+		version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+	} else if (ret == -EAGAIN) {
+		ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+		if (ret)
+			goto out;
+		lvb_to_version(lc->oc_lvb, &pv);
+
+		if ((pv.pv_major != running_proto.pv_major) ||
+				(pv.pv_minor > running_proto.pv_minor)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		conn->cc_version.pv_major = pv.pv_major;
+		conn->cc_version.pv_minor = pv.pv_minor;
+	}
+out:
+	return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+			slot->nodeid, slot->slot);
+	conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+		int num_slots, int our_slot,
+		uint32_t generation)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	int i;
+
+	for (i = 0; i < num_slots; i++)
+		if (slots[i].slot == our_slot) {
+			atomic_set(&lc->oc_this_node, slots[i].nodeid);
+			break;
+		}
+
+	lc->oc_our_slot = our_slot;
+	wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+	.recover_prep = user_recover_prep,
+	.recover_slot = user_recover_slot,
+	.recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+	version_unlock(conn);
+	dlm_release_lockspace(conn->cc_lockspace, 2);
+	conn->cc_lockspace = NULL;
+	ocfs2_live_connection_drop(conn->cc_private);
+	conn->cc_private = NULL;
+	return 0;
+}
+
+static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+	dlm_lockspace_t *fsdlm;
+	struct ocfs2_live_connection *lc;
+	int rc, ops_rv;
+
+	BUG_ON(conn == NULL);
+
+	lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+	if (!lc)
+		return -ENOMEM;
+
+	init_waitqueue_head(&lc->oc_wait);
+	init_completion(&lc->oc_sync_wait);
+	atomic_set(&lc->oc_this_node, 0);
+	conn->cc_private = lc;
+	lc->oc_type = NO_CONTROLD;
+
+	rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+			       DLM_LSFL_FS, DLM_LVB_LEN,
+			       &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+	if (rc)
+		goto out;
+
+	if (ops_rv == -EOPNOTSUPP) {
+		lc->oc_type = WITH_CONTROLD;
+		printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+				"version of dlm_controld and/or ocfs2-tools."
+				" Please consider upgrading.\n");
+	} else if (ops_rv) {
+		rc = ops_rv;
+		goto out;
+	}
+	conn->cc_lockspace = fsdlm;
+
+	rc = ocfs2_live_connection_attach(conn, lc);
+	if (rc)
+		goto out;
+
+	if (lc->oc_type == NO_CONTROLD) {
+		rc = get_protocol_version(conn);
+		if (rc) {
+			printk(KERN_ERR "ocfs2: Could not determine"
+					" locking version\n");
+			user_cluster_disconnect(conn);
+			goto out;
+		}
+		wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+	}
+
+	/*
+	 * running_proto must have been set before we allowed any mounts
+	 * to proceed.
+	 */
+	if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
+		printk(KERN_ERR
+		       "Unable to mount with fs locking protocol version "
+		       "%u.%u because negotiated protocol is %u.%u\n",
+		       conn->cc_version.pv_major, conn->cc_version.pv_minor,
+		       running_proto.pv_major, running_proto.pv_minor);
+		rc = -EPROTO;
+		ocfs2_live_connection_drop(lc);
+		lc = NULL;
+	}
+
+out:
+	if (rc)
+		kfree(lc);
+	return rc;
+}
+
+
+static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
+				  unsigned int *this_node)
+{
+	int rc;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+
+	if (lc->oc_type == WITH_CONTROLD)
+		rc = ocfs2_control_get_this_node();
+	else if (lc->oc_type == NO_CONTROLD)
+		rc = atomic_read(&lc->oc_this_node);
+	else
+		rc = -EINVAL;
+
+	if (rc < 0)
+		return rc;
+
+	*this_node = rc;
+	return 0;
+}
+
+static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
+	.connect	= user_cluster_connect,
+	.disconnect	= user_cluster_disconnect,
+	.this_node	= user_cluster_this_node,
+	.dlm_lock	= user_dlm_lock,
+	.dlm_unlock	= user_dlm_unlock,
+	.lock_status	= user_dlm_lock_status,
+	.lvb_valid	= user_dlm_lvb_valid,
+	.lock_lvb	= user_dlm_lvb,
+	.plock		= user_plock,
+	.dump_lksb	= user_dlm_dump_lksb,
+};
+
+static struct ocfs2_stack_plugin ocfs2_user_plugin = {
+	.sp_name	= "user",
+	.sp_ops		= &ocfs2_user_plugin_ops,
+	.sp_owner	= THIS_MODULE,
+};
+
+
+static int __init ocfs2_user_plugin_init(void)
+{
+	int rc;
+
+	rc = ocfs2_control_init();
+	if (!rc) {
+		rc = ocfs2_stack_glue_register(&ocfs2_user_plugin);
+		if (rc)
+			ocfs2_control_exit();
+	}
+
+	return rc;
+}
+
+static void __exit ocfs2_user_plugin_exit(void)
+{
+	ocfs2_stack_glue_unregister(&ocfs2_user_plugin);
+	ocfs2_control_exit();
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
+MODULE_LICENSE("GPL");
+module_init(ocfs2_user_plugin_init);
+module_exit(ocfs2_user_plugin_exit);
diff --git a/kernel/fs/ocfs2/stackglue.c b/kernel/fs/ocfs2/stackglue.c
new file mode 100644
index 000000000..5d965e83b
--- /dev/null
+++ b/kernel/fs/ocfs2/stackglue.c
@@ -0,0 +1,748 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.c
+ *
+ * Code which implements an OCFS2 specific interface to underlying
+ * cluster stacks.
+ *
+ * Copyright (C) 2007, 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+
+#include "ocfs2_fs.h"
+
+#include "stackglue.h"
+
+#define OCFS2_STACK_PLUGIN_O2CB		"o2cb"
+#define OCFS2_STACK_PLUGIN_USER		"user"
+#define OCFS2_MAX_HB_CTL_PATH		256
+
+static struct ocfs2_protocol_version locking_max_version;
+static DEFINE_SPINLOCK(ocfs2_stack_lock);
+static LIST_HEAD(ocfs2_stack_list);
+static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
+static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
+
+/*
+ * The stack currently in use.  If not null, active_stack->sp_count > 0,
+ * the module is pinned, and the locking protocol cannot be changed.
+ */
+static struct ocfs2_stack_plugin *active_stack;
+
+static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
+{
+	struct ocfs2_stack_plugin *p;
+
+	assert_spin_locked(&ocfs2_stack_lock);
+
+	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+		if (!strcmp(p->sp_name, name))
+			return p;
+	}
+
+	return NULL;
+}
+
+static int ocfs2_stack_driver_request(const char *stack_name,
+				      const char *plugin_name)
+{
+	int rc;
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+
+	/*
+	 * If the stack passed by the filesystem isn't the selected one,
+	 * we can't continue.
+	 */
+	if (strcmp(stack_name, cluster_stack_name)) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	if (active_stack) {
+		/*
+		 * If the active stack isn't the one we want, it cannot
+		 * be selected right now.
+		 */
+		if (!strcmp(active_stack->sp_name, plugin_name))
+			rc = 0;
+		else
+			rc = -EBUSY;
+		goto out;
+	}
+
+	p = ocfs2_stack_lookup(plugin_name);
+	if (!p || !try_module_get(p->sp_owner)) {
+		rc = -ENOENT;
+		goto out;
+	}
+
+	active_stack = p;
+	rc = 0;
+
+out:
+	/* If we found it, pin it */
+	if (!rc)
+		active_stack->sp_count++;
+
+	spin_unlock(&ocfs2_stack_lock);
+	return rc;
+}
+
+/*
+ * This function looks up the appropriate stack and makes it active.  If
+ * there is no stack, it tries to load it.  It will fail if the stack still
+ * cannot be found.  It will also fail if a different stack is in use.
+ */
+static int ocfs2_stack_driver_get(const char *stack_name)
+{
+	int rc;
+	char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
+
+	/*
+	 * Classic stack does not pass in a stack name.  This is
+	 * compatible with older tools as well.
+	 */
+	if (!stack_name || !*stack_name)
+		stack_name = OCFS2_STACK_PLUGIN_O2CB;
+
+	if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
+		printk(KERN_ERR
+		       "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
+		       stack_name);
+		return -EINVAL;
+	}
+
+	/* Anything that isn't the classic stack is a user stack */
+	if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
+		plugin_name = OCFS2_STACK_PLUGIN_USER;
+
+	rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+	if (rc == -ENOENT) {
+		request_module("ocfs2_stack_%s", plugin_name);
+		rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+	}
+
+	if (rc == -ENOENT) {
+		printk(KERN_ERR
+		       "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
+		       plugin_name);
+	} else if (rc == -EBUSY) {
+		printk(KERN_ERR
+		       "ocfs2: A different cluster stack is in use\n");
+	}
+
+	return rc;
+}
+
+static void ocfs2_stack_driver_put(void)
+{
+	spin_lock(&ocfs2_stack_lock);
+	BUG_ON(active_stack == NULL);
+	BUG_ON(active_stack->sp_count == 0);
+
+	active_stack->sp_count--;
+	if (!active_stack->sp_count) {
+		module_put(active_stack->sp_owner);
+		active_stack = NULL;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+}
+
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
+{
+	int rc;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (!ocfs2_stack_lookup(plugin->sp_name)) {
+		plugin->sp_count = 0;
+		plugin->sp_max_proto = locking_max_version;
+		list_add(&plugin->sp_list, &ocfs2_stack_list);
+		printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
+		       plugin->sp_name);
+		rc = 0;
+	} else {
+		printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
+		       plugin->sp_name);
+		rc = -EEXIST;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
+
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
+{
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+	p = ocfs2_stack_lookup(plugin->sp_name);
+	if (p) {
+		BUG_ON(p != plugin);
+		BUG_ON(plugin == active_stack);
+		BUG_ON(plugin->sp_count != 0);
+		list_del_init(&plugin->sp_list);
+		printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
+		       plugin->sp_name);
+	} else {
+		printk(KERN_ERR "Stack \"%s\" is not registered\n",
+		       plugin->sp_name);
+	}
+	spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
+
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
+{
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (memcmp(max_proto, &locking_max_version,
+		   sizeof(struct ocfs2_protocol_version))) {
+		BUG_ON(locking_max_version.pv_major != 0);
+
+		locking_max_version = *max_proto;
+		list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+			p->sp_max_proto = locking_max_version;
+		}
+	}
+	spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
+
+
+/*
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
+ * for the ast and bast functions.  They will pass the lksb to the ast
+ * and bast.  The caller can wrap the lksb with their own structure to
+ * get more information.
+ */
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+		   int mode,
+		   struct ocfs2_dlm_lksb *lksb,
+		   u32 flags,
+		   void *name,
+		   unsigned int namelen)
+{
+	if (!lksb->lksb_conn)
+		lksb->lksb_conn = conn;
+	else
+		BUG_ON(lksb->lksb_conn != conn);
+	return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
+					      name, namelen);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
+
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+		     struct ocfs2_dlm_lksb *lksb,
+		     u32 flags)
+{
+	BUG_ON(lksb->lksb_conn == NULL);
+
+	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
+
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
+{
+	return active_stack->sp_ops->lock_status(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
+
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+	return active_stack->sp_ops->lvb_valid(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
+
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
+{
+	return active_stack->sp_ops->lock_lvb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
+
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
+{
+	active_stack->sp_ops->dump_lksb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
+
+int ocfs2_stack_supports_plocks(void)
+{
+	return active_stack && active_stack->sp_ops->plock;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
+
+/*
+ * ocfs2_plock() can only be safely called if
+ * ocfs2_stack_supports_plocks() returned true
+ */
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+		struct file *file, int cmd, struct file_lock *fl)
+{
+	WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
+	if (active_stack->sp_ops->plock)
+		return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(ocfs2_plock);
+
+int ocfs2_cluster_connect(const char *stack_name,
+			  const char *cluster_name,
+			  int cluster_name_len,
+			  const char *group,
+			  int grouplen,
+			  struct ocfs2_locking_protocol *lproto,
+			  void (*recovery_handler)(int node_num,
+						   void *recovery_data),
+			  void *recovery_data,
+			  struct ocfs2_cluster_connection **conn)
+{
+	int rc = 0;
+	struct ocfs2_cluster_connection *new_conn;
+
+	BUG_ON(group == NULL);
+	BUG_ON(conn == NULL);
+	BUG_ON(recovery_handler == NULL);
+
+	if (grouplen > GROUP_NAME_MAX) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (memcmp(&lproto->lp_max_version, &locking_max_version,
+		   sizeof(struct ocfs2_protocol_version))) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
+			   GFP_KERNEL);
+	if (!new_conn) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
+	new_conn->cc_namelen = grouplen;
+	if (cluster_name_len)
+		strlcpy(new_conn->cc_cluster_name, cluster_name,
+			CLUSTER_NAME_MAX + 1);
+	new_conn->cc_cluster_name_len = cluster_name_len;
+	new_conn->cc_recovery_handler = recovery_handler;
+	new_conn->cc_recovery_data = recovery_data;
+
+	new_conn->cc_proto = lproto;
+	/* Start the new connection at our maximum compatibility level */
+	new_conn->cc_version = lproto->lp_max_version;
+
+	/* This will pin the stack driver if successful */
+	rc = ocfs2_stack_driver_get(stack_name);
+	if (rc)
+		goto out_free;
+
+	rc = active_stack->sp_ops->connect(new_conn);
+	if (rc) {
+		ocfs2_stack_driver_put();
+		goto out_free;
+	}
+
+	*conn = new_conn;
+
+out_free:
+	if (rc)
+		kfree(new_conn);
+
+out:
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
+
+/* The caller will ensure all nodes have the same cluster stack */
+int ocfs2_cluster_connect_agnostic(const char *group,
+				   int grouplen,
+				   struct ocfs2_locking_protocol *lproto,
+				   void (*recovery_handler)(int node_num,
+							    void *recovery_data),
+				   void *recovery_data,
+				   struct ocfs2_cluster_connection **conn)
+{
+	char *stack_name = NULL;
+
+	if (cluster_stack_name[0])
+		stack_name = cluster_stack_name;
+	return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
+				     lproto, recovery_handler, recovery_data,
+				     conn);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
+
+/* If hangup_pending is 0, the stack driver will be dropped */
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+			     int hangup_pending)
+{
+	int ret;
+
+	BUG_ON(conn == NULL);
+
+	ret = active_stack->sp_ops->disconnect(conn);
+
+	/* XXX Should we free it anyway? */
+	if (!ret) {
+		kfree(conn);
+		if (!hangup_pending)
+			ocfs2_stack_driver_put();
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
+
+/*
+ * Leave the group for this filesystem.  This is executed by a userspace
+ * program (stored in ocfs2_hb_ctl_path).
+ */
+static void ocfs2_leave_group(const char *group)
+{
+	int ret;
+	char *argv[5], *envp[3];
+
+	argv[0] = ocfs2_hb_ctl_path;
+	argv[1] = "-K";
+	argv[2] = "-u";
+	argv[3] = (char *)group;
+	argv[4] = NULL;
+
+	/* minimal command environment taken from cpu_run_sbin_hotplug */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (ret < 0) {
+		printk(KERN_ERR
+		       "ocfs2: Error %d running user helper "
+		       "\"%s %s %s %s\"\n",
+		       ret, argv[0], argv[1], argv[2], argv[3]);
+	}
+}
+
+/*
+ * Hangup is a required post-umount.  ocfs2-tools software expects the
+ * filesystem to call "ocfs2_hb_ctl" during unmount.  This happens
+ * regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect().  The ocfs2_leave_group() function does
+ * the actual work.
+ */
+void ocfs2_cluster_hangup(const char *group, int grouplen)
+{
+	BUG_ON(group == NULL);
+	BUG_ON(group[grouplen] != '\0');
+
+	ocfs2_leave_group(group);
+
+	/* cluster_disconnect() was called with hangup_pending==1 */
+	ocfs2_stack_driver_put();
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
+
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+			    unsigned int *node)
+{
+	return active_stack->sp_ops->this_node(conn, node);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
+
+
+/*
+ * Sysfs bits
+ */
+
+static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
+					       struct kobj_attribute *attr,
+					       char *buf)
+{
+	ssize_t ret = 0;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (locking_max_version.pv_major)
+		ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
+			       locking_max_version.pv_major,
+			       locking_max_version.pv_minor);
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+static struct kobj_attribute ocfs2_attr_max_locking_protocol =
+	__ATTR(max_locking_protocol, S_IRUGO,
+	       ocfs2_max_locking_protocol_show, NULL);
+
+static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
+						 struct kobj_attribute *attr,
+						 char *buf)
+{
+	ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+		ret = snprintf(buf, remain, "%s\n",
+			       p->sp_name);
+		if (ret < 0) {
+			total = ret;
+			break;
+		}
+		if (ret == remain) {
+			/* snprintf() didn't fit */
+			total = -E2BIG;
+			break;
+		}
+		total += ret;
+		remain -= ret;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return total;
+}
+
+static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
+	__ATTR(loaded_cluster_plugins, S_IRUGO,
+	       ocfs2_loaded_cluster_plugins_show, NULL);
+
+static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
+						struct kobj_attribute *attr,
+						char *buf)
+{
+	ssize_t ret = 0;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (active_stack) {
+		ret = snprintf(buf, PAGE_SIZE, "%s\n",
+			       active_stack->sp_name);
+		if (ret == PAGE_SIZE)
+			ret = -E2BIG;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
+	__ATTR(active_cluster_plugin, S_IRUGO,
+	       ocfs2_active_cluster_plugin_show, NULL);
+
+static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	ssize_t ret;
+	spin_lock(&ocfs2_stack_lock);
+	ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	size_t len = count;
+	ssize_t ret;
+
+	if (len == 0)
+		return len;
+
+	if (buf[len - 1] == '\n')
+		len--;
+
+	if ((len != OCFS2_STACK_LABEL_LEN) ||
+	    (strnlen(buf, len) != len))
+		return -EINVAL;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (active_stack) {
+		if (!strncmp(buf, cluster_stack_name, len))
+			ret = count;
+		else
+			ret = -EBUSY;
+	} else {
+		memcpy(cluster_stack_name, buf, len);
+		ret = count;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+
+static struct kobj_attribute ocfs2_attr_cluster_stack =
+	__ATTR(cluster_stack, S_IRUGO | S_IWUSR,
+	       ocfs2_cluster_stack_show,
+	       ocfs2_cluster_stack_store);
+
+
+
+static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "1\n");
+}
+
+static struct kobj_attribute ocfs2_attr_dlm_recover_support =
+	__ATTR(dlm_recover_callback_support, S_IRUGO,
+	       ocfs2_dlm_recover_show, NULL);
+
+static struct attribute *ocfs2_attrs[] = {
+	&ocfs2_attr_max_locking_protocol.attr,
+	&ocfs2_attr_loaded_cluster_plugins.attr,
+	&ocfs2_attr_active_cluster_plugin.attr,
+	&ocfs2_attr_cluster_stack.attr,
+	&ocfs2_attr_dlm_recover_support.attr,
+	NULL,
+};
+
+static struct attribute_group ocfs2_attr_group = {
+	.attrs = ocfs2_attrs,
+};
+
+static struct kset *ocfs2_kset;
+
+static void ocfs2_sysfs_exit(void)
+{
+	kset_unregister(ocfs2_kset);
+}
+
+static int ocfs2_sysfs_init(void)
+{
+	int ret;
+
+	ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
+	if (!ocfs2_kset)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
+	if (ret)
+		goto error;
+
+	return 0;
+
+error:
+	kset_unregister(ocfs2_kset);
+	return ret;
+}
+
+/*
+ * Sysctl bits
+ *
+ * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path.  The 'nm' doesn't
+ * make as much sense in a multiple cluster stack world, but it's safer
+ * and easier to preserve the name.
+ */
+
+#define FS_OCFS2_NM		1
+
+static struct ctl_table ocfs2_nm_table[] = {
+	{
+		.procname	= "hb_ctl_path",
+		.data		= ocfs2_hb_ctl_path,
+		.maxlen		= OCFS2_MAX_HB_CTL_PATH,
+		.mode		= 0644,
+		.proc_handler	= proc_dostring,
+	},
+	{ }
+};
+
+static struct ctl_table ocfs2_mod_table[] = {
+	{
+		.procname	= "nm",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_nm_table
+	},
+	{ }
+};
+
+static struct ctl_table ocfs2_kern_table[] = {
+	{
+		.procname	= "ocfs2",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_mod_table
+	},
+	{ }
+};
+
+static struct ctl_table ocfs2_root_table[] = {
+	{
+		.procname	= "fs",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_kern_table
+	},
+	{ }
+};
+
+static struct ctl_table_header *ocfs2_table_header;
+
+
+/*
+ * Initialization
+ */
+
+static int __init ocfs2_stack_glue_init(void)
+{
+	strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
+
+	ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
+	if (!ocfs2_table_header) {
+		printk(KERN_ERR
+		       "ocfs2 stack glue: unable to register sysctl\n");
+		return -ENOMEM; /* or something. */
+	}
+
+	return ocfs2_sysfs_init();
+}
+
+static void __exit ocfs2_stack_glue_exit(void)
+{
+	memset(&locking_max_version, 0,
+	       sizeof(struct ocfs2_protocol_version));
+	locking_max_version.pv_major = 0;
+	locking_max_version.pv_minor = 0;
+	ocfs2_sysfs_exit();
+	if (ocfs2_table_header)
+		unregister_sysctl_table(ocfs2_table_header);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 cluter stack glue layer");
+MODULE_LICENSE("GPL");
+module_init(ocfs2_stack_glue_init);
+module_exit(ocfs2_stack_glue_exit);
diff --git a/kernel/fs/ocfs2/stackglue.h b/kernel/fs/ocfs2/stackglue.h
new file mode 100644
index 000000000..66334a30c
--- /dev/null
+++ b/kernel/fs/ocfs2/stackglue.h
@@ -0,0 +1,301 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.h
+ *
+ * Glue to the underlying cluster stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef STACKGLUE_H
+#define STACKGLUE_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/dlmconstants.h>
+
+#include "dlm/dlmapi.h"
+#include <linux/dlm.h>
+
+/* Needed for plock-related prototypes */
+struct file;
+struct file_lock;
+
+/*
+ * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
+ * some day, but right now we need it.  Let's fake it.  This value is larger
+ * than any flag in dlmconstants.h.
+ */
+#define DLM_LKF_LOCAL		0x00100000
+
+/*
+ * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h.  That probably
+ * wants to be in a public header.
+ */
+#define GROUP_NAME_MAX		64
+
+/* This shadows  OCFS2_CLUSTER_NAME_LEN */
+#define CLUSTER_NAME_MAX	16
+
+
+/*
+ * ocfs2_protocol_version changes when ocfs2 does something different in
+ * its inter-node behavior.  See dlmglue.c for more information.
+ */
+struct ocfs2_protocol_version {
+	u8 pv_major;
+	u8 pv_minor;
+};
+
+/*
+ * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
+ * has a pointer to separately allocated lvb space.  This struct exists only to
+ * include in the lksb union to make space for a combined dlm_lksb and lvb.
+ */
+struct fsdlm_lksb_plus_lvb {
+	struct dlm_lksb lksb;
+	char lvb[DLM_LVB_LEN];
+};
+
+/*
+ * A union of all lock status structures.  We define it here so that the
+ * size of the union is known.  Lock status structures are embedded in
+ * ocfs2 inodes.
+ */
+struct ocfs2_cluster_connection;
+struct ocfs2_dlm_lksb {
+	 union {
+		 struct dlm_lockstatus lksb_o2dlm;
+		 struct dlm_lksb lksb_fsdlm;
+		 struct fsdlm_lksb_plus_lvb padding;
+	 };
+	 struct ocfs2_cluster_connection *lksb_conn;
+};
+
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ */
+struct ocfs2_locking_protocol {
+	struct ocfs2_protocol_version lp_max_version;
+	void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
+	void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
+	void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
+};
+
+
+/*
+ * A cluster connection.  Mostly opaque to ocfs2, the connection holds
+ * state for the underlying stack.  ocfs2 does use cc_version to determine
+ * locking compatibility.
+ */
+struct ocfs2_cluster_connection {
+	char cc_name[GROUP_NAME_MAX + 1];
+	int cc_namelen;
+	char cc_cluster_name[CLUSTER_NAME_MAX + 1];
+	int cc_cluster_name_len;
+	struct ocfs2_protocol_version cc_version;
+	struct ocfs2_locking_protocol *cc_proto;
+	void (*cc_recovery_handler)(int node_num, void *recovery_data);
+	void *cc_recovery_data;
+	void *cc_lockspace;
+	void *cc_private;
+};
+
+/*
+ * Each cluster stack implements the stack operations structure.  Not used
+ * in the ocfs2 code, the stackglue code translates generic cluster calls
+ * into stack operations.
+ */
+struct ocfs2_stack_operations {
+	/*
+	 * The fs code calls ocfs2_cluster_connect() to attach a new
+	 * filesystem to the cluster stack.  The ->connect() op is passed
+	 * an ocfs2_cluster_connection with the name and recovery field
+	 * filled in.
+	 *
+	 * The stack must set up any notification mechanisms and create
+	 * the filesystem lockspace in the DLM.  The lockspace should be
+	 * stored on cc_lockspace.  Any other information can be stored on
+	 * cc_private.
+	 *
+	 * ->connect() must not return until it is guaranteed that
+	 *
+	 *  - Node down notifications for the filesystem will be received
+	 *    and passed to conn->cc_recovery_handler().
+	 *  - Locking requests for the filesystem will be processed.
+	 */
+	int (*connect)(struct ocfs2_cluster_connection *conn);
+
+	/*
+	 * The fs code calls ocfs2_cluster_disconnect() when a filesystem
+	 * no longer needs cluster services.  All DLM locks have been
+	 * dropped, and recovery notification is being ignored by the
+	 * fs code.  The stack must disengage from the DLM and discontinue
+	 * recovery notification.
+	 *
+	 * Once ->disconnect() has returned, the connection structure will
+	 * be freed.  Thus, a stack must not return from ->disconnect()
+	 * until it will no longer reference the conn pointer.
+	 *
+	 * Once this call returns, the stack glue will be dropping this
+	 * connection's reference on the module.
+	 */
+	int (*disconnect)(struct ocfs2_cluster_connection *conn);
+
+	/*
+	 * ->this_node() returns the cluster's unique identifier for the
+	 * local node.
+	 */
+	int (*this_node)(struct ocfs2_cluster_connection *conn,
+			 unsigned int *node);
+
+	/*
+	 * Call the underlying dlm lock function.  The ->dlm_lock()
+	 * callback should convert the flags and mode as appropriate.
+	 *
+	 * ast and bast functions are not part of the call because the
+	 * stack will likely want to wrap ast and bast calls before passing
+	 * them to stack->sp_proto.  There is no astarg.  The lksb will
+	 * be passed back to the ast and bast functions.  The caller can
+	 * use this to find their object.
+	 */
+	int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
+			int mode,
+			struct ocfs2_dlm_lksb *lksb,
+			u32 flags,
+			void *name,
+			unsigned int namelen);
+
+	/*
+	 * Call the underlying dlm unlock function.  The ->dlm_unlock()
+	 * function should convert the flags as appropriate.
+	 *
+	 * The unlock ast is not passed, as the stack will want to wrap
+	 * it before calling stack->sp_proto->lp_unlock_ast().  There is
+	 * no astarg.  The lksb will be passed back to the unlock ast
+	 * function.  The caller can use this to find their object.
+	 */
+	int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
+			  struct ocfs2_dlm_lksb *lksb,
+			  u32 flags);
+
+	/*
+	 * Return the status of the current lock status block.  The fs
+	 * code should never dereference the union.  The ->lock_status()
+	 * callback pulls out the stack-specific lksb, converts the status
+	 * to a proper errno, and returns it.
+	 */
+	int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
+
+	/*
+	 * Return non-zero if the LVB is valid.
+	 */
+	int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
+
+	/*
+	 * Pull the lvb pointer off of the stack-specific lksb.
+	 */
+	void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
+
+	/*
+	 * Cluster-aware posix locks
+	 *
+	 * This is NULL for stacks which do not support posix locks.
+	 */
+	int (*plock)(struct ocfs2_cluster_connection *conn,
+		     u64 ino,
+		     struct file *file,
+		     int cmd,
+		     struct file_lock *fl);
+
+	/*
+	 * This is an optoinal debugging hook.  If provided, the
+	 * stack can dump debugging information about this lock.
+	 */
+	void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
+};
+
+/*
+ * Each stack plugin must describe itself by registering a
+ * ocfs2_stack_plugin structure.  This is only seen by stackglue and the
+ * stack driver.
+ */
+struct ocfs2_stack_plugin {
+	char *sp_name;
+	struct ocfs2_stack_operations *sp_ops;
+	struct module *sp_owner;
+
+	/* These are managed by the stackglue code. */
+	struct list_head sp_list;
+	unsigned int sp_count;
+	struct ocfs2_protocol_version sp_max_proto;
+};
+
+
+/* Used by the filesystem */
+int ocfs2_cluster_connect(const char *stack_name,
+			  const char *cluster_name,
+			  int cluster_name_len,
+			  const char *group,
+			  int grouplen,
+			  struct ocfs2_locking_protocol *lproto,
+			  void (*recovery_handler)(int node_num,
+						   void *recovery_data),
+			  void *recovery_data,
+			  struct ocfs2_cluster_connection **conn);
+/*
+ * Used by callers that don't store their stack name.  They must ensure
+ * all nodes have the same stack.
+ */
+int ocfs2_cluster_connect_agnostic(const char *group,
+				   int grouplen,
+				   struct ocfs2_locking_protocol *lproto,
+				   void (*recovery_handler)(int node_num,
+							    void *recovery_data),
+				   void *recovery_data,
+				   struct ocfs2_cluster_connection **conn);
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+			     int hangup_pending);
+void ocfs2_cluster_hangup(const char *group, int grouplen);
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+			    unsigned int *node);
+
+struct ocfs2_lock_res;
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+		   int mode,
+		   struct ocfs2_dlm_lksb *lksb,
+		   u32 flags,
+		   void *name,
+		   unsigned int namelen);
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+		     struct ocfs2_dlm_lksb *lksb,
+		     u32 flags);
+
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
+
+int ocfs2_stack_supports_plocks(void);
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+		struct file *file, int cmd, struct file_lock *fl);
+
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
+
+
+/* Used by stack plugins */
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+
+#endif  /* STACKGLUE_H */
diff --git a/kernel/fs/ocfs2/suballoc.c b/kernel/fs/ocfs2/suballoc.c
new file mode 100644
index 000000000..447902963
--- /dev/null
+++ b/kernel/fs/ocfs2/suballoc.c
@@ -0,0 +1,2919 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.c
+ *
+ * metadata alloc and free
+ * Inspired by ext3 block groups.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "blockcheck.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+
+#define NOT_ALLOC_NEW_GROUP		0
+#define ALLOC_NEW_GROUP			0x1
+#define ALLOC_GROUPS_FROM_GLOBAL	0x2
+
+#define OCFS2_MAX_TO_STEAL		1024
+
+struct ocfs2_suballoc_result {
+	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
+					   to 0 when a block group is
+					   contiguous. */
+	u64		sr_bg_stable_blkno; /*
+					     * Doesn't change, always
+					     * set to target block
+					     * group descriptor
+					     * block.
+					     */
+	u64		sr_blkno;	/* The first allocated block */
+	unsigned int	sr_bit_offset;	/* The bit in the bg */
+	unsigned int	sr_bits;	/* How many bits we claimed */
+};
+
+static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
+{
+	if (res->sr_blkno == 0)
+		return 0;
+
+	if (res->sr_bg_blkno)
+		return res->sr_bg_blkno;
+
+	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
+}
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_fill(handle_t *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  unsigned int group_clusters,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh,
+				   u64 max_block,
+				   u64 *last_alloc_group,
+				   int flags);
+
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u64 max_block,
+				      struct ocfs2_suballoc_result *res);
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u64 max_block,
+				    struct ocfs2_suballoc_result *res);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
+				     handle_t *handle,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     struct ocfs2_suballoc_result *res);
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr);
+static int ocfs2_relink_block_group(handle_t *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain);
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted);
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off);
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off);
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+					     u32 bits_wanted, u64 max_block,
+					     int flags,
+					     struct ocfs2_alloc_context **ac);
+
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
+{
+	struct inode *inode = ac->ac_inode;
+
+	if (inode) {
+		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
+			ocfs2_inode_unlock(inode, 1);
+
+		mutex_unlock(&inode->i_mutex);
+
+		iput(inode);
+		ac->ac_inode = NULL;
+	}
+	brelse(ac->ac_bh);
+	ac->ac_bh = NULL;
+	ac->ac_resv = NULL;
+	if (ac->ac_find_loc_priv) {
+		kfree(ac->ac_find_loc_priv);
+		ac->ac_find_loc_priv = NULL;
+	}
+}
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+	ocfs2_free_ac_resource(ac);
+	kfree(ac);
+}
+
+static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
+{
+	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
+}
+
+#define do_error(fmt, ...)						\
+	do{								\
+		if (resize)					\
+			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
+		else							\
+			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
+	} while (0)
+
+static int ocfs2_validate_gd_self(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int resize)
+{
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
+		do_error("Group descriptor #%llu has bad signature %.*s",
+			 (unsigned long long)bh->b_blocknr, 7,
+			 gd->bg_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+		do_error("Group descriptor #%llu has an invalid bg_blkno "
+			 "of %llu",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+		do_error("Group descriptor #%llu has an invalid "
+			 "fs_generation of #%u",
+			 (unsigned long long)bh->b_blocknr,
+			 le32_to_cpu(gd->bg_generation));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "claims that %u are free",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 le16_to_cpu(gd->bg_free_bits_count));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "max bitmap bits of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 8 * le16_to_cpu(gd->bg_size));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+				    struct ocfs2_dinode *di,
+				    struct buffer_head *bh,
+				    int resize)
+{
+	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	if (di->i_blkno != gd->bg_parent_dinode) {
+		do_error("Group descriptor #%llu has bad parent "
+			 "pointer (%llu, expected %llu)",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+			 (unsigned long long)le64_to_cpu(di->i_blkno));
+		return -EINVAL;
+	}
+
+	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
+	if (le16_to_cpu(gd->bg_bits) > max_bits) {
+		do_error("Group descriptor #%llu has bit count of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits));
+		return -EINVAL;
+	}
+
+	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
+	if ((le16_to_cpu(gd->bg_chain) >
+	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+	    ((le16_to_cpu(gd->bg_chain) ==
+	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
+		do_error("Group descriptor #%llu has bad chain %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_chain));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#undef do_error
+
+/*
+ * This version only prints errors.  It does not fail the filesystem, and
+ * exists only for resize.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc) {
+		mlog(ML_ERROR,
+		     "Checksum failed for group descriptor %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+	} else
+		rc = ocfs2_validate_gd_self(sb, bh, 1);
+	if (!rc)
+		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+
+	return rc;
+}
+
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+					   struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	trace_ocfs2_validate_group_descriptor(
+					(unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	return ocfs2_validate_gd_self(sb, bh, 0);
+}
+
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
+			      ocfs2_validate_group_descriptor);
+	if (rc)
+		goto out;
+
+	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+	if (rc) {
+		brelse(tmp);
+		goto out;
+	}
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc;
+}
+
+static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
+					  struct ocfs2_group_desc *bg,
+					  struct ocfs2_chain_list *cl,
+					  u64 p_blkno, unsigned int clusters)
+{
+	struct ocfs2_extent_list *el = &bg->bg_list;
+	struct ocfs2_extent_rec *rec;
+
+	BUG_ON(!ocfs2_supports_discontig_bg(osb));
+	if (!el->l_next_free_rec)
+		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
+	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
+	rec->e_blkno = cpu_to_le64(p_blkno);
+	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
+				  le16_to_cpu(cl->cl_bpc));
+	rec->e_leaf_clusters = cpu_to_le16(clusters);
+	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
+	le16_add_cpu(&bg->bg_free_bits_count,
+		     clusters * le16_to_cpu(cl->cl_bpc));
+	le16_add_cpu(&el->l_next_free_rec, 1);
+}
+
+static int ocfs2_block_group_fill(handle_t *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  unsigned int group_clusters,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl)
+{
+	int status = 0;
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct super_block * sb = alloc_inode->i_sb;
+
+	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
+		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
+			    "b_blocknr (%llu)",
+			    (unsigned long long)group_blkno,
+			    (unsigned long long) bg_bh->b_blocknr);
+		status = -EIO;
+		goto bail;
+	}
+
+	status = ocfs2_journal_access_gd(handle,
+					 INODE_CACHE(alloc_inode),
+					 bg_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	memset(bg, 0, sb->s_blocksize);
+	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
+	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
+						osb->s_feature_incompat));
+	bg->bg_chain = cpu_to_le16(my_chain);
+	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
+	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
+	bg->bg_blkno = cpu_to_le64(group_blkno);
+	if (group_clusters == le16_to_cpu(cl->cl_cpg))
+		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+	else
+		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
+					      group_clusters);
+
+	/* set the 1st bit in the bitmap to account for the descriptor block */
+	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
+	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
+
+	ocfs2_journal_dirty(handle, bg_bh);
+
+	/* There is no need to zero out or otherwise initialize the
+	 * other blocks in a group - All valid FS metadata in a block
+	 * group stores the superblock fs_generation value at
+	 * allocation time. */
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	best = curr = 0;
+	while (curr < le16_to_cpu(cl->cl_count)) {
+		if (le32_to_cpu(cl->cl_recs[best].c_total) >
+		    le32_to_cpu(cl->cl_recs[curr].c_total))
+			best = curr;
+		curr++;
+	}
+	return best;
+}
+
+static struct buffer_head *
+ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
+			       struct inode *alloc_inode,
+			       struct ocfs2_alloc_context *ac,
+			       struct ocfs2_chain_list *cl)
+{
+	int status;
+	u32 bit_off, num_bits;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh;
+	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	status = ocfs2_claim_clusters(handle, ac,
+				      le16_to_cpu(cl->cl_cpg), &bit_off,
+				      &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* setup the group */
+	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	trace_ocfs2_block_group_alloc_contig(
+	     (unsigned long long)bg_blkno, alloc_rec);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+					bg_blkno, num_bits, alloc_rec, cl);
+	if (status < 0) {
+		brelse(bg_bh);
+		mlog_errno(status);
+	}
+
+bail:
+	return status ? ERR_PTR(status) : bg_bh;
+}
+
+static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
+					handle_t *handle,
+					struct ocfs2_alloc_context *ac,
+					unsigned int min_bits,
+					u32 *bit_off, u32 *num_bits)
+{
+	int status = 0;
+
+	while (min_bits) {
+		status = ocfs2_claim_clusters(handle, ac, min_bits,
+					      bit_off, num_bits);
+		if (status != -ENOSPC)
+			break;
+
+		min_bits >>= 1;
+	}
+
+	return status;
+}
+
+static int ocfs2_block_group_grow_discontig(handle_t *handle,
+					    struct inode *alloc_inode,
+					    struct buffer_head *bg_bh,
+					    struct ocfs2_alloc_context *ac,
+					    struct ocfs2_chain_list *cl,
+					    unsigned int min_bits)
+{
+	int status;
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+	struct ocfs2_group_desc *bg =
+		(struct ocfs2_group_desc *)bg_bh->b_data;
+	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
+			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+	u32 p_cpos, clusters;
+	u64 p_blkno;
+	struct ocfs2_extent_list *el = &bg->bg_list;
+
+	status = ocfs2_journal_access_gd(handle,
+					 INODE_CACHE(alloc_inode),
+					 bg_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
+				le16_to_cpu(el->l_count))) {
+		if (min_bits > needed)
+			min_bits = needed;
+		status = ocfs2_block_group_claim_bits(osb, handle, ac,
+						      min_bits, &p_cpos,
+						      &clusters);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
+		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
+					      clusters);
+
+		min_bits = clusters;
+		needed = le16_to_cpu(cl->cl_cpg) -
+			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+	}
+
+	if (needed > 0) {
+		/*
+		 * We have used up all the extent rec but can't fill up
+		 * the cpg. So bail out.
+		 */
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	ocfs2_journal_dirty(handle, bg_bh);
+
+bail:
+	return status;
+}
+
+static void ocfs2_bg_alloc_cleanup(handle_t *handle,
+				   struct ocfs2_alloc_context *cluster_ac,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bg_bh)
+{
+	int i, ret;
+	struct ocfs2_group_desc *bg;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	if (!bg_bh)
+		return;
+
+	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
+	el = &bg->bg_list;
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
+					  cluster_ac->ac_bh,
+					  le64_to_cpu(rec->e_blkno),
+					  le16_to_cpu(rec->e_leaf_clusters));
+		if (ret)
+			mlog_errno(ret);
+		/* Try all the clusters to free */
+	}
+
+	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
+	brelse(bg_bh);
+}
+
+static struct buffer_head *
+ocfs2_block_group_alloc_discontig(handle_t *handle,
+				  struct inode *alloc_inode,
+				  struct ocfs2_alloc_context *ac,
+				  struct ocfs2_chain_list *cl)
+{
+	int status;
+	u32 bit_off, num_bits;
+	u64 bg_blkno;
+	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
+	struct buffer_head *bg_bh = NULL;
+	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+
+	if (!ocfs2_supports_discontig_bg(osb)) {
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	status = ocfs2_extend_trans(handle,
+				    ocfs2_calc_bg_discontig_credits(osb->sb));
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/*
+	 * We're going to be grabbing from multiple cluster groups.
+	 * We don't have enough credits to relink them all, and the
+	 * cluster groups will be staying in cache for the duration of
+	 * this operation.
+	 */
+	ac->ac_disable_chain_relink = 1;
+
+	/* Claim the first region */
+	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
+					      &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+	min_bits = num_bits;
+
+	/* setup the group */
+	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	trace_ocfs2_block_group_alloc_discontig(
+				(unsigned long long)bg_blkno, alloc_rec);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+					bg_blkno, num_bits, alloc_rec, cl);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
+						  bg_bh, ac, cl, min_bits);
+	if (status)
+		mlog_errno(status);
+
+bail:
+	if (status)
+		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
+	return status ? ERR_PTR(status) : bg_bh;
+}
+
+/*
+ * We expect the block group allocator to already be locked.
+ */
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh,
+				   u64 max_block,
+				   u64 *last_alloc_group,
+				   int flags)
+{
+	int status, credits;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_alloc_context *ac = NULL;
+	handle_t *handle = NULL;
+	u16 alloc_rec;
+	struct buffer_head *bg_bh = NULL;
+	struct ocfs2_group_desc *bg;
+
+	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
+
+	cl = &fe->id2.i_chain;
+	status = ocfs2_reserve_clusters_with_limit(osb,
+						   le16_to_cpu(cl->cl_cpg),
+						   max_block, flags, &ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	credits = ocfs2_calc_group_alloc_credits(osb->sb,
+						 le16_to_cpu(cl->cl_cpg));
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (last_alloc_group && *last_alloc_group != 0) {
+		trace_ocfs2_block_group_alloc(
+				(unsigned long long)*last_alloc_group);
+		ac->ac_last_group = *last_alloc_group;
+	}
+
+	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
+					       ac, cl);
+	if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+		bg_bh = ocfs2_block_group_alloc_discontig(handle,
+							  alloc_inode,
+							  ac, cl);
+	if (IS_ERR(bg_bh)) {
+		status = PTR_ERR(bg_bh);
+		bg_bh = NULL;
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc_rec = le16_to_cpu(bg->bg_chain);
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
+		     le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
+		     le16_to_cpu(bg->bg_bits));
+	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
+	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
+		le16_add_cpu(&cl->cl_next_free_rec, 1);
+
+	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
+					le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
+	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
+
+	ocfs2_journal_dirty(handle, bh);
+
+	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
+	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
+					     le32_to_cpu(fe->i_clusters)));
+	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
+	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
+	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
+	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
+
+	status = 0;
+
+	/* save the new last alloc group so that the caller can cache it. */
+	if (last_alloc_group)
+		*last_alloc_group = ac->ac_last_group;
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	brelse(bg_bh);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac,
+				       int type,
+				       u32 slot,
+				       u64 *last_alloc_group,
+				       int flags)
+{
+	int status;
+	u32 bits_wanted = ac->ac_bits_wanted;
+	struct inode *alloc_inode;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe;
+	u32 free_bits;
+
+	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
+	if (!alloc_inode) {
+		mlog_errno(-EINVAL);
+		return -EINVAL;
+	}
+
+	mutex_lock(&alloc_inode->i_mutex);
+
+	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
+	if (status < 0) {
+		mutex_unlock(&alloc_inode->i_mutex);
+		iput(alloc_inode);
+
+		mlog_errno(status);
+		return status;
+	}
+
+	ac->ac_inode = alloc_inode;
+	ac->ac_alloc_slot = slot;
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	/* The bh was validated by the inode read inside
+	 * ocfs2_inode_lock().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
+	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
+		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
+			    (unsigned long long)le64_to_cpu(fe->i_blkno));
+		status = -EIO;
+		goto bail;
+	}
+
+	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
+		le32_to_cpu(fe->id1.bitmap1.i_used);
+
+	if (bits_wanted > free_bits) {
+		/* cluster bitmap never grows */
+		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
+								free_bits);
+			status = -ENOSPC;
+			goto bail;
+		}
+
+		if (!(flags & ALLOC_NEW_GROUP)) {
+			trace_ocfs2_reserve_suballoc_bits_no_new_group(
+						slot, bits_wanted, free_bits);
+			status = -ENOSPC;
+			goto bail;
+		}
+
+		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
+						 ac->ac_max_block,
+						 last_alloc_group, flags);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+		atomic_inc(&osb->alloc_stats.bg_extends);
+
+		/* You should never ask for this much metadata */
+		BUG_ON(bits_wanted >
+		       (le32_to_cpu(fe->id1.bitmap1.i_total)
+			- le32_to_cpu(fe->id1.bitmap1.i_used)));
+	}
+
+	get_bh(bh);
+	ac->ac_bh = bh;
+bail:
+	brelse(bh);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->osb_lock);
+	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&osb->osb_lock);
+	atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+
+static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->osb_lock);
+	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&osb->osb_lock);
+	atomic_set(&osb->s_num_meta_stolen, 0);
+}
+
+void ocfs2_init_steal_slots(struct ocfs2_super *osb)
+{
+	ocfs2_init_inode_steal_slot(osb);
+	ocfs2_init_meta_steal_slot(osb);
+}
+
+static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
+{
+	spin_lock(&osb->osb_lock);
+	if (type == INODE_ALLOC_SYSTEM_INODE)
+		osb->s_inode_steal_slot = slot;
+	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+		osb->s_meta_steal_slot = slot;
+	spin_unlock(&osb->osb_lock);
+}
+
+static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
+{
+	int slot = OCFS2_INVALID_SLOT;
+
+	spin_lock(&osb->osb_lock);
+	if (type == INODE_ALLOC_SYSTEM_INODE)
+		slot = osb->s_inode_steal_slot;
+	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+		slot = osb->s_meta_steal_slot;
+	spin_unlock(&osb->osb_lock);
+
+	return slot;
+}
+
+static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
+{
+	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_resource(struct ocfs2_super *osb,
+				struct ocfs2_alloc_context *ac,
+				int type)
+{
+	int i, status = -ENOSPC;
+	int slot = __ocfs2_get_steal_slot(osb, type);
+
+	/* Start to steal resource from the first slot after ours. */
+	if (slot == OCFS2_INVALID_SLOT)
+		slot = osb->slot_num + 1;
+
+	for (i = 0; i < osb->max_slots; i++, slot++) {
+		if (slot == osb->max_slots)
+			slot = 0;
+
+		if (slot == osb->slot_num)
+			continue;
+
+		status = ocfs2_reserve_suballoc_bits(osb, ac,
+						     type,
+						     (u32)slot, NULL,
+						     NOT_ALLOC_NEW_GROUP);
+		if (status >= 0) {
+			__ocfs2_set_steal_slot(osb, slot, type);
+			break;
+		}
+
+		ocfs2_free_ac_resource(ac);
+	}
+
+	return status;
+}
+
+static int ocfs2_steal_inode(struct ocfs2_super *osb,
+			     struct ocfs2_alloc_context *ac)
+{
+	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_meta(struct ocfs2_super *osb,
+			    struct ocfs2_alloc_context *ac)
+{
+	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+				      int blocks,
+				      struct ocfs2_alloc_context **ac)
+{
+	int status;
+	int slot = ocfs2_get_meta_steal_slot(osb);
+
+	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = blocks;
+	(*ac)->ac_which = OCFS2_AC_USE_META;
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	if (slot != OCFS2_INVALID_SLOT &&
+		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
+		goto extent_steal;
+
+	atomic_set(&osb->s_num_meta_stolen, 0);
+	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
+					     EXTENT_ALLOC_SYSTEM_INODE,
+					     (u32)osb->slot_num, NULL,
+					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
+
+
+	if (status >= 0) {
+		status = 0;
+		if (slot != OCFS2_INVALID_SLOT)
+			ocfs2_init_meta_steal_slot(osb);
+		goto bail;
+	} else if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_free_ac_resource(*ac);
+
+extent_steal:
+	status = ocfs2_steal_meta(osb, *ac);
+	atomic_inc(&osb->s_num_meta_stolen);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_extent_list *root_el,
+			       struct ocfs2_alloc_context **ac)
+{
+	return ocfs2_reserve_new_metadata_blocks(osb,
+					ocfs2_extend_meta_needed(root_el),
+					ac);
+}
+
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_alloc_context **ac)
+{
+	int status;
+	int slot = ocfs2_get_inode_steal_slot(osb);
+	u64 alloc_group;
+
+	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = 1;
+	(*ac)->ac_which = OCFS2_AC_USE_INODE;
+
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	/*
+	 * stat(2) can't handle i_ino > 32bits, so we tell the
+	 * lower levels not to allocate us a block group past that
+	 * limit.  The 'inode64' mount option avoids this behavior.
+	 */
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
+		(*ac)->ac_max_block = (u32)~0U;
+
+	/*
+	 * slot is set when we successfully steal inode from other nodes.
+	 * It is reset in 3 places:
+	 * 1. when we flush the truncate log
+	 * 2. when we complete local alloc recovery.
+	 * 3. when we successfully allocate from our own slot.
+	 * After it is set, we will go on stealing inodes until we find the
+	 * need to check our slots to see whether there is some space for us.
+	 */
+	if (slot != OCFS2_INVALID_SLOT &&
+	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
+		goto inode_steal;
+
+	atomic_set(&osb->s_num_inodes_stolen, 0);
+	alloc_group = osb->osb_inode_alloc_group;
+	status = ocfs2_reserve_suballoc_bits(osb, *ac,
+					     INODE_ALLOC_SYSTEM_INODE,
+					     (u32)osb->slot_num,
+					     &alloc_group,
+					     ALLOC_NEW_GROUP |
+					     ALLOC_GROUPS_FROM_GLOBAL);
+	if (status >= 0) {
+		status = 0;
+
+		spin_lock(&osb->osb_lock);
+		osb->osb_inode_alloc_group = alloc_group;
+		spin_unlock(&osb->osb_lock);
+		trace_ocfs2_reserve_new_inode_new_group(
+			(unsigned long long)alloc_group);
+
+		/*
+		 * Some inodes must be freed by us, so try to allocate
+		 * from our own next time.
+		 */
+		if (slot != OCFS2_INVALID_SLOT)
+			ocfs2_init_inode_steal_slot(osb);
+		goto bail;
+	} else if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_free_ac_resource(*ac);
+
+inode_steal:
+	status = ocfs2_steal_inode(osb, *ac);
+	atomic_inc(&osb->s_num_inodes_stolen);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/* local alloc code has to do the same thing, so rather than do this
+ * twice.. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac)
+{
+	int status;
+
+	ac->ac_which = OCFS2_AC_USE_MAIN;
+	ac->ac_group_search = ocfs2_cluster_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, ac,
+					     GLOBAL_BITMAP_SYSTEM_INODE,
+					     OCFS2_INVALID_SLOT, NULL,
+					     ALLOC_NEW_GROUP);
+	if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	return status;
+}
+
+/* Callers don't need to care which bitmap (local alloc or main) to
+ * use so we figure it out for them, but unfortunately this clutters
+ * things a bit. */
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+					     u32 bits_wanted, u64 max_block,
+					     int flags,
+					     struct ocfs2_alloc_context **ac)
+{
+	int status;
+
+	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_max_block = max_block;
+
+	status = -ENOSPC;
+	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
+	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+		status = ocfs2_reserve_local_alloc_bits(osb,
+							bits_wanted,
+							*ac);
+		if ((status < 0) && (status != -ENOSPC)) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	if (status == -ENOSPC) {
+		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac)
+{
+	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
+						 ALLOC_NEW_GROUP, ac);
+}
+
+/*
+ * More or less lifted from ext3. I'll leave their description below:
+ *
+ * "For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy.  This
+ * prevents deletes from freeing up the page for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes."
+ *
+ * Note: OCFS2 already does this differently for metadata vs data
+ * allocations, as those bitmaps are separate and undo access is never
+ * called on a metadata group descriptor.
+ */
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr)
+{
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	int ret;
+
+	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
+		return 0;
+
+	if (!buffer_jbd(bg_bh))
+		return 1;
+
+	jbd_lock_bh_state(bg_bh);
+	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+	if (bg)
+		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+	else
+		ret = 1;
+	jbd_unlock_bh_state(bg_bh);
+
+	return ret;
+}
+
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     unsigned int total_bits,
+					     struct ocfs2_suballoc_result *res)
+{
+	void *bitmap;
+	u16 best_offset, best_size;
+	int offset, start, found, status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	/* Callers got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+
+	found = start = best_offset = best_size = 0;
+	bitmap = bg->bg_bitmap;
+
+	while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
+		if (offset == total_bits)
+			break;
+
+		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
+			/* We found a zero, but we can't use it as it
+			 * hasn't been put to disk yet! */
+			found = 0;
+			start = offset + 1;
+		} else if (offset == start) {
+			/* we found a zero */
+			found++;
+			/* move start to the next bit to test */
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_size) {
+			best_size = found;
+			best_offset = start - found;
+		}
+		/* we got everything we needed */
+		if (found == bits_wanted) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	if (best_size) {
+		res->sr_bit_offset = best_offset;
+		res->sr_bits = best_size;
+	} else {
+		status = -ENOSPC;
+		/* No error log here -- see the comment above
+		 * ocfs2_test_bg_bit_allocatable */
+	}
+
+	return status;
+}
+
+int ocfs2_block_group_set_bits(handle_t *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits)
+{
+	int status;
+	void *bitmap = bg->bg_bitmap;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+
+	/* All callers get the descriptor via
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+	trace_ocfs2_block_group_set_bits(bit_off, num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access_gd(handle,
+					 INODE_CACHE(alloc_inode),
+					 group_bh,
+					 journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+			    " count %u but claims %u are freed. num_bits %d",
+			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
+			    le16_to_cpu(bg->bg_bits),
+			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
+		return -EROFS;
+	}
+	while(num_bits--)
+		ocfs2_set_bit(bit_off++, bitmap);
+
+	ocfs2_journal_dirty(handle, group_bh);
+
+bail:
+	return status;
+}
+
+/* find the one with the most empty bits */
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	BUG_ON(!cl->cl_next_free_rec);
+
+	best = curr = 0;
+	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
+		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
+		    le32_to_cpu(cl->cl_recs[best].c_free))
+			best = curr;
+		curr++;
+	}
+
+	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
+	return best;
+}
+
+static int ocfs2_relink_block_group(handle_t *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain)
+{
+	int status;
+	/* there is a really tiny chance the journal calls could fail,
+	 * but we wouldn't want inconsistent blocks in *any* case. */
+	u64 bg_ptr, prev_bg_ptr;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
+
+	/* The caller got these descriptors from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
+
+	trace_ocfs2_relink_block_group(
+		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
+		(unsigned long long)le64_to_cpu(bg->bg_blkno),
+		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
+
+	bg_ptr = le64_to_cpu(bg->bg_next_group);
+	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
+
+	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+					 prev_bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0)
+		goto out;
+
+	prev_bg->bg_next_group = bg->bg_next_group;
+	ocfs2_journal_dirty(handle, prev_bg_bh);
+
+	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0)
+		goto out_rollback_prev_bg;
+
+	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+	ocfs2_journal_dirty(handle, bg_bh);
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0)
+		goto out_rollback_bg;
+
+	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+	ocfs2_journal_dirty(handle, fe_bh);
+
+out:
+	if (status < 0)
+		mlog_errno(status);
+	return status;
+
+out_rollback_bg:
+	bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+	goto out;
+}
+
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted)
+{
+	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
+}
+
+/* return 0 on success, -ENOSPC to keep searching and any other < 0
+ * value on error. */
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u64 max_block,
+				      struct ocfs2_suballoc_result *res)
+{
+	int search = -ENOSPC;
+	int ret;
+	u64 blkoff;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int max_bits, gd_cluster_off;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	if (gd->bg_free_bits_count) {
+		max_bits = le16_to_cpu(gd->bg_bits);
+
+		/* Tail groups in cluster bitmaps which aren't cpg
+		 * aligned are prone to partial extension by a failed
+		 * fs resize. If the file system resize never got to
+		 * update the dinode cluster count, then we don't want
+		 * to trust any clusters past it, regardless of what
+		 * the group descriptor says. */
+		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
+							  le64_to_cpu(gd->bg_blkno));
+		if ((gd_cluster_off + max_bits) >
+		    OCFS2_I(inode)->ip_clusters) {
+			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
+			trace_ocfs2_cluster_group_search_wrong_max_bits(
+				(unsigned long long)le64_to_cpu(gd->bg_blkno),
+				le16_to_cpu(gd->bg_bits),
+				OCFS2_I(inode)->ip_clusters, max_bits);
+		}
+
+		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+							group_bh, bits_wanted,
+							max_bits, res);
+		if (ret)
+			return ret;
+
+		if (max_block) {
+			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
+							  gd_cluster_off +
+							  res->sr_bit_offset +
+							  res->sr_bits);
+			trace_ocfs2_cluster_group_search_max_block(
+				(unsigned long long)blkoff,
+				(unsigned long long)max_block);
+			if (blkoff > max_block)
+				return -ENOSPC;
+		}
+
+		/* ocfs2_block_group_find_clear_bits() might
+		 * return success, but we still want to return
+		 * -ENOSPC unless it found the minimum number
+		 * of bits. */
+		if (min_bits <= res->sr_bits)
+			search = 0; /* success */
+		else if (res->sr_bits) {
+			/*
+			 * Don't show bits which we'll be returning
+			 * for allocation to the local alloc bitmap.
+			 */
+			ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
+		}
+	}
+
+	return search;
+}
+
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u64 max_block,
+				    struct ocfs2_suballoc_result *res)
+{
+	int ret = -ENOSPC;
+	u64 blkoff;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
+
+	BUG_ON(min_bits != 1);
+	BUG_ON(ocfs2_is_cluster_bitmap(inode));
+
+	if (bg->bg_free_bits_count) {
+		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+							group_bh, bits_wanted,
+							le16_to_cpu(bg->bg_bits),
+							res);
+		if (!ret && max_block) {
+			blkoff = le64_to_cpu(bg->bg_blkno) +
+				res->sr_bit_offset + res->sr_bits;
+			trace_ocfs2_block_group_search_max_block(
+				(unsigned long long)blkoff,
+				(unsigned long long)max_block);
+			if (blkoff > max_block)
+				ret = -ENOSPC;
+		}
+	}
+
+	return ret;
+}
+
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+				       handle_t *handle,
+				       struct buffer_head *di_bh,
+				       u32 num_bits,
+				       u16 chain)
+{
+	int ret;
+	u32 tmp_used;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
+	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
+	ocfs2_journal_dirty(handle, di_bh);
+
+out:
+	return ret;
+}
+
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+				       struct buffer_head *di_bh,
+				       u32 num_bits,
+				       u16 chain)
+{
+	u32 tmp_used;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	struct ocfs2_chain_list *cl;
+
+	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
+	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
+	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
+}
+
+static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
+					 struct ocfs2_extent_rec *rec,
+					 struct ocfs2_chain_list *cl)
+{
+	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
+	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
+	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
+
+	if (res->sr_bit_offset < bitoff)
+		return 0;
+	if (res->sr_bit_offset >= (bitoff + bitcount))
+		return 0;
+	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
+		(res->sr_bit_offset - bitoff);
+	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
+		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
+	return 1;
+}
+
+static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
+					  struct ocfs2_group_desc *bg,
+					  struct ocfs2_suballoc_result *res)
+{
+	int i;
+	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
+	struct ocfs2_chain_list *cl = &di->id2.i_chain;
+
+	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
+		res->sr_blkno = 0;
+		return;
+	}
+
+	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
+	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
+	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
+	    !bg->bg_list.l_next_free_rec)
+		return;
+
+	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
+		rec = &bg->bg_list.l_recs[i];
+		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
+			res->sr_bg_blkno = bg_blkno;  /* Restore */
+			break;
+		}
+	}
+}
+
+static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
+				  handle_t *handle,
+				  u32 bits_wanted,
+				  u32 min_bits,
+				  struct ocfs2_suballoc_result *res,
+				  u16 *bits_left)
+{
+	int ret;
+	struct buffer_head *group_bh = NULL;
+	struct ocfs2_group_desc *gd;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
+	struct inode *alloc_inode = ac->ac_inode;
+
+	ret = ocfs2_read_group_descriptor(alloc_inode, di,
+					  res->sr_bg_blkno, &group_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	gd = (struct ocfs2_group_desc *) group_bh->b_data;
+	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
+				  ac->ac_max_block, res);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	if (!ret)
+		ocfs2_bg_discontig_fix_result(ac, gd, res);
+
+	/*
+	 * sr_bg_blkno might have been changed by
+	 * ocfs2_bg_discontig_fix_result
+	 */
+	res->sr_bg_stable_blkno = group_bh->b_blocknr;
+
+	if (ac->ac_find_loc_only)
+		goto out_loc_only;
+
+	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
+					       res->sr_bits,
+					       le16_to_cpu(gd->bg_chain));
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
+					 res->sr_bit_offset, res->sr_bits);
+	if (ret < 0) {
+		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
+					       res->sr_bits,
+					       le16_to_cpu(gd->bg_chain));
+		mlog_errno(ret);
+	}
+
+out_loc_only:
+	*bits_left = le16_to_cpu(gd->bg_free_bits_count);
+
+out:
+	brelse(group_bh);
+
+	return ret;
+}
+
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      handle_t *handle,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      struct ocfs2_suballoc_result *res,
+			      u16 *bits_left)
+{
+	int status;
+	u16 chain;
+	u64 next_group;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *group_bh = NULL;
+	struct buffer_head *prev_group_bh = NULL;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+	struct ocfs2_group_desc *bg;
+
+	chain = ac->ac_chain;
+	trace_ocfs2_search_chain_begin(
+		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+		bits_wanted, chain);
+
+	status = ocfs2_read_group_descriptor(alloc_inode, fe,
+					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
+					     &group_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	bg = (struct ocfs2_group_desc *) group_bh->b_data;
+
+	status = -ENOSPC;
+	/* for now, the chain search is a bit simplistic. We just use
+	 * the 1st group with any empty bits. */
+	while ((status = ac->ac_group_search(alloc_inode, group_bh,
+					     bits_wanted, min_bits,
+					     ac->ac_max_block,
+					     res)) == -ENOSPC) {
+		if (!bg->bg_next_group)
+			break;
+
+		brelse(prev_group_bh);
+		prev_group_bh = NULL;
+
+		next_group = le64_to_cpu(bg->bg_next_group);
+		prev_group_bh = group_bh;
+		group_bh = NULL;
+		status = ocfs2_read_group_descriptor(alloc_inode, fe,
+						     next_group, &group_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		bg = (struct ocfs2_group_desc *) group_bh->b_data;
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_search_chain_succ(
+		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
+
+	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
+
+	BUG_ON(res->sr_bits == 0);
+	if (!status)
+		ocfs2_bg_discontig_fix_result(ac, bg, res);
+
+	/*
+	 * sr_bg_blkno might have been changed by
+	 * ocfs2_bg_discontig_fix_result
+	 */
+	res->sr_bg_stable_blkno = group_bh->b_blocknr;
+
+	/*
+	 * Keep track of previous block descriptor read. When
+	 * we find a target, if we have read more than X
+	 * number of descriptors, and the target is reasonably
+	 * empty, relink him to top of his chain.
+	 *
+	 * We've read 0 extra blocks and only send one more to
+	 * the transaction, yet the next guy to search has a
+	 * much easier time.
+	 *
+	 * Do this *after* figuring out how many bits we're taking out
+	 * of our target group.
+	 */
+	if (!ac->ac_disable_chain_relink &&
+	    (prev_group_bh) &&
+	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
+		status = ocfs2_relink_block_group(handle, alloc_inode,
+						  ac->ac_bh, group_bh,
+						  prev_group_bh, chain);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	if (ac->ac_find_loc_only)
+		goto out_loc_only;
+
+	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
+						  ac->ac_bh, res->sr_bits,
+						  chain);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_set_bits(handle,
+					    alloc_inode,
+					    bg,
+					    group_bh,
+					    res->sr_bit_offset,
+					    res->sr_bits);
+	if (status < 0) {
+		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
+					ac->ac_bh, res->sr_bits, chain);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_search_chain_end(
+			(unsigned long long)le64_to_cpu(fe->i_blkno),
+			res->sr_bits);
+
+out_loc_only:
+	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
+bail:
+	brelse(group_bh);
+	brelse(prev_group_bh);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/* will give out up to bits_wanted contiguous bits. */
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
+				     handle_t *handle,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     struct ocfs2_suballoc_result *res)
+{
+	int status;
+	u16 victim, i;
+	u16 bits_left = 0;
+	u64 hint = ac->ac_last_group;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_dinode *fe;
+
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
+	BUG_ON(!ac->ac_bh);
+
+	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+
+	/* The bh was validated by the inode read during
+	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
+	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
+	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
+		ocfs2_error(ac->ac_inode->i_sb,
+			    "Chain allocator dinode %llu has %u used "
+			    "bits but only %u total.",
+			    (unsigned long long)le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->id1.bitmap1.i_used),
+			    le32_to_cpu(fe->id1.bitmap1.i_total));
+		status = -EIO;
+		goto bail;
+	}
+
+	res->sr_bg_blkno = hint;
+	if (res->sr_bg_blkno) {
+		/* Attempt to short-circuit the usual search mechanism
+		 * by jumping straight to the most recently used
+		 * allocation group. This helps us maintain some
+		 * contiguousness across allocations. */
+		status = ocfs2_search_one_group(ac, handle, bits_wanted,
+						min_bits, res, &bits_left);
+		if (!status)
+			goto set_hint;
+		if (status < 0 && status != -ENOSPC) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+
+	victim = ocfs2_find_victim_chain(cl);
+	ac->ac_chain = victim;
+
+	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+				    res, &bits_left);
+	if (!status) {
+		hint = ocfs2_group_from_res(res);
+		goto set_hint;
+	}
+	if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_claim_suballoc_bits(victim);
+
+	/* If we didn't pick a good victim, then just default to
+	 * searching each chain in order. Don't allow chain relinking
+	 * because we only calculate enough journal credits for one
+	 * relink per alloc. */
+	ac->ac_disable_chain_relink = 1;
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
+		if (i == victim)
+			continue;
+		if (!cl->cl_recs[i].c_free)
+			continue;
+
+		ac->ac_chain = i;
+		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+					    res, &bits_left);
+		if (!status) {
+			hint = ocfs2_group_from_res(res);
+			break;
+		}
+		if (status < 0 && status != -ENOSPC) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+set_hint:
+	if (status != -ENOSPC) {
+		/* If the next search of this group is not likely to
+		 * yield a suitable extent, then we reset the last
+		 * group hint so as to not waste a disk read */
+		if (bits_left < min_bits)
+			ac->ac_last_group = 0;
+		else
+			ac->ac_last_group = hint;
+	}
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+int ocfs2_claim_metadata(handle_t *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u64 *suballoc_loc,
+			 u16 *suballoc_bit_start,
+			 unsigned int *num_bits,
+			 u64 *blkno_start)
+{
+	int status;
+	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
+
+	status = ocfs2_claim_suballoc_bits(ac,
+					   handle,
+					   bits_wanted,
+					   1,
+					   &res);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+	*suballoc_loc = res.sr_bg_blkno;
+	*suballoc_bit_start = res.sr_bit_offset;
+	*blkno_start = res.sr_blkno;
+	ac->ac_bits_given += res.sr_bits;
+	*num_bits = res.sr_bits;
+	status = 0;
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static void ocfs2_init_inode_ac_group(struct inode *dir,
+				      struct buffer_head *parent_di_bh,
+				      struct ocfs2_alloc_context *ac)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
+	/*
+	 * Try to allocate inodes from some specific group.
+	 *
+	 * If the parent dir has recorded the last group used in allocation,
+	 * cool, use it. Otherwise if we try to allocate new inode from the
+	 * same slot the parent dir belongs to, use the same chunk.
+	 *
+	 * We are very careful here to avoid the mistake of setting
+	 * ac_last_group to a group descriptor from a different (unlocked) slot.
+	 */
+	if (OCFS2_I(dir)->ip_last_used_group &&
+	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
+		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
+	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
+		if (di->i_suballoc_loc)
+			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
+		else
+			ac->ac_last_group = ocfs2_which_suballoc_group(
+					le64_to_cpu(di->i_blkno),
+					le16_to_cpu(di->i_suballoc_bit));
+	}
+}
+
+static inline void ocfs2_save_inode_ac_group(struct inode *dir,
+					     struct ocfs2_alloc_context *ac)
+{
+	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
+	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
+}
+
+int ocfs2_find_new_inode_loc(struct inode *dir,
+			     struct buffer_head *parent_fe_bh,
+			     struct ocfs2_alloc_context *ac,
+			     u64 *fe_blkno)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_suballoc_result *res;
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given != 0);
+	BUG_ON(ac->ac_bits_wanted != 1);
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+
+	res = kzalloc(sizeof(*res), GFP_NOFS);
+	if (res == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+	/*
+	 * The handle started here is for chain relink. Alternatively,
+	 * we could just disable relink for these calls.
+	 */
+	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * This will instruct ocfs2_claim_suballoc_bits and
+	 * ocfs2_search_one_group to search but save actual allocation
+	 * for later.
+	 */
+	ac->ac_find_loc_only = 1;
+
+	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ac->ac_find_loc_priv = res;
+	*fe_blkno = res->sr_blkno;
+	ocfs2_update_inode_fsync_trans(handle, dir, 0);
+out:
+	if (handle)
+		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
+
+	if (ret)
+		kfree(res);
+
+	return ret;
+}
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+				 struct inode *dir,
+				 struct ocfs2_alloc_context *ac,
+				 u64 *suballoc_loc,
+				 u16 *suballoc_bit,
+				 u64 di_blkno)
+{
+	int ret;
+	u16 chain;
+	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
+	struct buffer_head *bg_bh = NULL;
+	struct ocfs2_group_desc *bg;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+
+	/*
+	 * Since di_blkno is being passed back in, we check for any
+	 * inconsistencies which may have happened between
+	 * calls. These are code bugs as di_blkno is not expected to
+	 * change once returned from ocfs2_find_new_inode_loc()
+	 */
+	BUG_ON(res->sr_blkno != di_blkno);
+
+	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
+					  res->sr_bg_stable_blkno, &bg_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	chain = le16_to_cpu(bg->bg_chain);
+
+	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
+					       ac->ac_bh, res->sr_bits,
+					       chain);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_block_group_set_bits(handle,
+					 ac->ac_inode,
+					 bg,
+					 bg_bh,
+					 res->sr_bit_offset,
+					 res->sr_bits);
+	if (ret < 0) {
+		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
+					       ac->ac_bh, res->sr_bits, chain);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
+					   res->sr_bits);
+
+	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+	BUG_ON(res->sr_bits != 1);
+
+	*suballoc_loc = res->sr_bg_blkno;
+	*suballoc_bit = res->sr_bit_offset;
+	ac->ac_bits_given++;
+	ocfs2_save_inode_ac_group(dir, ac);
+
+out:
+	brelse(bg_bh);
+
+	return ret;
+}
+
+int ocfs2_claim_new_inode(handle_t *handle,
+			  struct inode *dir,
+			  struct buffer_head *parent_fe_bh,
+			  struct ocfs2_alloc_context *ac,
+			  u64 *suballoc_loc,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno)
+{
+	int status;
+	struct ocfs2_suballoc_result res;
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given != 0);
+	BUG_ON(ac->ac_bits_wanted != 1);
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+
+	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+	status = ocfs2_claim_suballoc_bits(ac,
+					   handle,
+					   1,
+					   1,
+					   &res);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+	BUG_ON(res.sr_bits != 1);
+
+	*suballoc_loc = res.sr_bg_blkno;
+	*suballoc_bit = res.sr_bit_offset;
+	*fe_blkno = res.sr_blkno;
+	ac->ac_bits_given++;
+	ocfs2_save_inode_ac_group(dir, ac);
+	status = 0;
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/* translate a group desc. blkno and it's bitmap offset into
+ * disk cluster offset. */
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 cluster = 0;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	if (bg_blkno != osb->first_cluster_group_blkno)
+		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
+	cluster += (u32) bg_bit_off;
+	return cluster;
+}
+
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 group_no;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	group_no = cluster / osb->bitmap_cpg;
+	if (!group_no)
+		return osb->first_cluster_group_blkno;
+	return ocfs2_clusters_to_blocks(inode->i_sb,
+					group_no * osb->bitmap_cpg);
+}
+
+/* given the block number of a cluster start, calculate which cluster
+ * group and descriptor bitmap offset that corresponds to. */
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	*bg_blkno = ocfs2_which_cluster_group(inode,
+					      data_cluster);
+
+	if (*bg_blkno == osb->first_cluster_group_blkno)
+		*bg_bit_off = (u16) data_cluster;
+	else
+		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
+							     data_blkno - *bg_blkno);
+}
+
+/*
+ * min_bits - minimum contiguous chunk from this total allocation we
+ * can handle. set to what we asked for originally for a full
+ * contig. allocation, set to '1' to indicate we can deal with extents
+ * of any size.
+ */
+int __ocfs2_claim_clusters(handle_t *handle,
+			   struct ocfs2_alloc_context *ac,
+			   u32 min_clusters,
+			   u32 max_clusters,
+			   u32 *cluster_start,
+			   u32 *num_clusters)
+{
+	int status;
+	unsigned int bits_wanted = max_clusters;
+	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
+	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
+
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
+	       && ac->ac_which != OCFS2_AC_USE_MAIN);
+
+	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+		WARN_ON(min_clusters > 1);
+
+		status = ocfs2_claim_local_alloc_bits(osb,
+						      handle,
+						      ac,
+						      bits_wanted,
+						      cluster_start,
+						      num_clusters);
+		if (!status)
+			atomic_inc(&osb->alloc_stats.local_data);
+	} else {
+		if (min_clusters > (osb->bitmap_cpg - 1)) {
+			/* The only paths asking for contiguousness
+			 * should know about this already. */
+			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
+			     "group bitmap size %u!\n", min_clusters,
+			     osb->bitmap_cpg);
+			status = -ENOSPC;
+			goto bail;
+		}
+		/* clamp the current request down to a realistic size. */
+		if (bits_wanted > (osb->bitmap_cpg - 1))
+			bits_wanted = osb->bitmap_cpg - 1;
+
+		status = ocfs2_claim_suballoc_bits(ac,
+						   handle,
+						   bits_wanted,
+						   min_clusters,
+						   &res);
+		if (!status) {
+			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
+			*cluster_start =
+				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
+								 res.sr_bg_blkno,
+								 res.sr_bit_offset);
+			atomic_inc(&osb->alloc_stats.bitmap_data);
+			*num_clusters = res.sr_bits;
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ac->ac_bits_given += *num_clusters;
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+int ocfs2_claim_clusters(handle_t *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters)
+{
+	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+
+	return __ocfs2_claim_clusters(handle, ac, min_clusters,
+				      bits_wanted, cluster_start, num_clusters);
+}
+
+static int ocfs2_block_group_clear_bits(handle_t *handle,
+					struct inode *alloc_inode,
+					struct ocfs2_group_desc *bg,
+					struct buffer_head *group_bh,
+					unsigned int bit_off,
+					unsigned int num_bits,
+					void (*undo_fn)(unsigned int bit,
+							unsigned long *bmap))
+{
+	int status;
+	unsigned int tmp;
+	struct ocfs2_group_desc *undo_bg = NULL;
+
+	/* The caller got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+
+	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
+
+	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
+	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+					 group_bh,
+					 undo_fn ?
+					 OCFS2_JOURNAL_ACCESS_UNDO :
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (undo_fn) {
+		jbd_lock_bh_state(group_bh);
+		undo_bg = (struct ocfs2_group_desc *)
+					bh2jh(group_bh)->b_committed_data;
+		BUG_ON(!undo_bg);
+	}
+
+	tmp = num_bits;
+	while(tmp--) {
+		ocfs2_clear_bit((bit_off + tmp),
+				(unsigned long *) bg->bg_bitmap);
+		if (undo_fn)
+			undo_fn(bit_off + tmp,
+				(unsigned long *) undo_bg->bg_bitmap);
+	}
+	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+			    " count %u but claims %u are freed. num_bits %d",
+			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
+			    le16_to_cpu(bg->bg_bits),
+			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
+		return -EROFS;
+	}
+
+	if (undo_fn)
+		jbd_unlock_bh_state(group_bh);
+
+	ocfs2_journal_dirty(handle, group_bh);
+bail:
+	return status;
+}
+
+/*
+ * expects the suballoc inode to already be locked.
+ */
+static int _ocfs2_free_suballoc_bits(handle_t *handle,
+				     struct inode *alloc_inode,
+				     struct buffer_head *alloc_bh,
+				     unsigned int start_bit,
+				     u64 bg_blkno,
+				     unsigned int count,
+				     void (*undo_fn)(unsigned int bit,
+						     unsigned long *bitmap))
+{
+	int status = 0;
+	u32 tmp_used;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
+	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+	struct buffer_head *group_bh = NULL;
+	struct ocfs2_group_desc *group;
+
+	/* The alloc_bh comes from ocfs2_free_dinode() or
+	 * ocfs2_free_clusters().  The callers have all locked the
+	 * allocator and gotten alloc_bh from the lock call.  This
+	 * validates the dinode buffer.  Any corruption that has happened
+	 * is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
+
+	trace_ocfs2_free_suballoc_bits(
+		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+		(unsigned long long)bg_blkno,
+		start_bit, count);
+
+	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+					     &group_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	group = (struct ocfs2_group_desc *) group_bh->b_data;
+
+	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
+
+	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
+					      group, group_bh,
+					      start_bit, count, undo_fn);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
+				start_bit, count);
+		goto bail;
+	}
+
+	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
+		     count);
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
+	ocfs2_journal_dirty(handle, alloc_bh);
+
+bail:
+	brelse(group_bh);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+int ocfs2_free_suballoc_bits(handle_t *handle,
+			     struct inode *alloc_inode,
+			     struct buffer_head *alloc_bh,
+			     unsigned int start_bit,
+			     u64 bg_blkno,
+			     unsigned int count)
+{
+	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+					 start_bit, bg_blkno, count, NULL);
+}
+
+int ocfs2_free_dinode(handle_t *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di)
+{
+	u64 blk = le64_to_cpu(di->i_blkno);
+	u16 bit = le16_to_cpu(di->i_suballoc_bit);
+	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	if (di->i_suballoc_loc)
+		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
+	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
+					inode_alloc_bh, bit, bg_blkno, 1);
+}
+
+static int _ocfs2_free_clusters(handle_t *handle,
+				struct inode *bitmap_inode,
+				struct buffer_head *bitmap_bh,
+				u64 start_blk,
+				unsigned int num_clusters,
+				void (*undo_fn)(unsigned int bit,
+						unsigned long *bitmap))
+{
+	int status;
+	u16 bg_start_bit;
+	u64 bg_blkno;
+	struct ocfs2_dinode *fe;
+
+	/* You can't ever have a contiguous set of clusters
+	 * bigger than a block group bitmap so we never have to worry
+	 * about looping on them.
+	 * This is expensive. We can safely remove once this stuff has
+	 * gotten tested really well. */
+	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
+
+	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
+
+	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
+				     &bg_start_bit);
+
+	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
+			(unsigned long long)start_blk,
+			bg_start_bit, num_clusters);
+
+	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+					   bg_start_bit, bg_blkno,
+					   num_clusters, undo_fn);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
+					 num_clusters);
+
+out:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+int ocfs2_free_clusters(handle_t *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters)
+{
+	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+				    start_blk, num_clusters,
+				    _ocfs2_set_bit);
+}
+
+/*
+ * Give never-used clusters back to the global bitmap.  We don't need
+ * to protect these bits in the undo buffer.
+ */
+int ocfs2_release_clusters(handle_t *handle,
+			   struct inode *bitmap_inode,
+			   struct buffer_head *bitmap_bh,
+			   u64 start_blk,
+			   unsigned int num_clusters)
+{
+	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+				    start_blk, num_clusters,
+				    _ocfs2_clear_bit);
+}
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
+{
+	printk("Block Group:\n");
+	printk("bg_signature:       %s\n", bg->bg_signature);
+	printk("bg_size:            %u\n", bg->bg_size);
+	printk("bg_bits:            %u\n", bg->bg_bits);
+	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
+	printk("bg_chain:           %u\n", bg->bg_chain);
+	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
+	printk("bg_next_group:      %llu\n",
+	       (unsigned long long)bg->bg_next_group);
+	printk("bg_parent_dinode:   %llu\n",
+	       (unsigned long long)bg->bg_parent_dinode);
+	printk("bg_blkno:           %llu\n",
+	       (unsigned long long)bg->bg_blkno);
+}
+
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
+{
+	int i;
+
+	printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
+	printk("i_signature:                  %s\n", fe->i_signature);
+	printk("i_size:                       %llu\n",
+	       (unsigned long long)fe->i_size);
+	printk("i_clusters:                   %u\n", fe->i_clusters);
+	printk("i_generation:                 %u\n",
+	       le32_to_cpu(fe->i_generation));
+	printk("id1.bitmap1.i_used:           %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_used));
+	printk("id1.bitmap1.i_total:          %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_total));
+	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
+	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
+	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
+	printk("id2.i_chain.cl_next_free_rec: %u\n",
+	       fe->id2.i_chain.cl_next_free_rec);
+	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
+		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_free);
+		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_total);
+		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
+		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
+	}
+}
+
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
+ */
+int ocfs2_lock_allocators(struct inode *inode,
+			  struct ocfs2_extent_tree *et,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*meta_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
+
+	num_free_extents = ocfs2_num_free_extents(osb, et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Sparse allocation file systems need to be more conservative
+	 * with reserving room for expansion - the actual allocation
+	 * happens while we've got a journal handle open so re-taking
+	 * a cluster lock (because we ran out of room for another
+	 * extent) will violate ordering rules.
+	 *
+	 * Most of the time we'll only be seeing this 1 cluster at a time
+	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
+	 */
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
+		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (clusters_to_add == 0)
+		goto out;
+
+	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null *data_ac.
+		 */
+	}
+
+	return ret;
+}
+
+/*
+ * Read the inode specified by blkno to get suballoc_slot and
+ * suballoc_bit.
+ */
+static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
+				       u16 *suballoc_slot, u64 *group_blkno,
+				       u16 *suballoc_bit)
+{
+	int status;
+	struct buffer_head *inode_bh = NULL;
+	struct ocfs2_dinode *inode_fe;
+
+	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
+
+	/* dirty read disk */
+	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
+	if (status < 0) {
+		mlog(ML_ERROR, "read block %llu failed %d\n",
+		     (unsigned long long)blkno, status);
+		goto bail;
+	}
+
+	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
+		mlog(ML_ERROR, "invalid inode %llu requested\n",
+		     (unsigned long long)blkno);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
+	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
+		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
+		     (unsigned long long)blkno,
+		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (suballoc_slot)
+		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
+	if (suballoc_bit)
+		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+	if (group_blkno)
+		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
+
+bail:
+	brelse(inode_bh);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * test whether bit is SET in allocator bitmap or not.  on success, 0
+ * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
+ * is returned and *res is meaningless.  Call this after you have
+ * cluster locked against suballoc, or you may get a result based on
+ * non-up2date contents
+ */
+static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
+				   struct inode *suballoc,
+				   struct buffer_head *alloc_bh,
+				   u64 group_blkno, u64 blkno,
+				   u16 bit, int *res)
+{
+	struct ocfs2_dinode *alloc_di;
+	struct ocfs2_group_desc *group;
+	struct buffer_head *group_bh = NULL;
+	u64 bg_blkno;
+	int status;
+
+	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
+				      (unsigned int)bit);
+
+	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
+	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
+		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
+		     (unsigned int)bit,
+		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	bg_blkno = group_blkno ? group_blkno :
+		   ocfs2_which_suballoc_group(blkno, bit);
+	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
+					     &group_bh);
+	if (status < 0) {
+		mlog(ML_ERROR, "read group %llu failed %d\n",
+		     (unsigned long long)bg_blkno, status);
+		goto bail;
+	}
+
+	group = (struct ocfs2_group_desc *) group_bh->b_data;
+	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
+
+bail:
+	brelse(group_bh);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * Test if the bit representing this inode (blkno) is set in the
+ * suballocator.
+ *
+ * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
+ *
+ * In the event of failure, a negative value is returned and *res is
+ * meaningless.
+ *
+ * Callers must make sure to hold nfs_sync_lock to prevent
+ * ocfs2_delete_inode() on another node from accessing the same
+ * suballocator concurrently.
+ */
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
+{
+	int status;
+	u64 group_blkno = 0;
+	u16 suballoc_bit = 0, suballoc_slot = 0;
+	struct inode *inode_alloc_inode;
+	struct buffer_head *alloc_bh = NULL;
+
+	trace_ocfs2_test_inode_bit((unsigned long long)blkno);
+
+	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
+					     &group_blkno, &suballoc_bit);
+	if (status < 0) {
+		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
+		goto bail;
+	}
+
+	inode_alloc_inode =
+		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+					    suballoc_slot);
+	if (!inode_alloc_inode) {
+		/* the error code could be inaccurate, but we are not able to
+		 * get the correct one. */
+		status = -EINVAL;
+		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
+		     (u32)suballoc_slot);
+		goto bail;
+	}
+
+	mutex_lock(&inode_alloc_inode->i_mutex);
+	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
+	if (status < 0) {
+		mutex_unlock(&inode_alloc_inode->i_mutex);
+		iput(inode_alloc_inode);
+		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
+		     (u32)suballoc_slot, status);
+		goto bail;
+	}
+
+	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
+					 group_blkno, blkno, suballoc_bit, res);
+	if (status < 0)
+		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
+
+	ocfs2_inode_unlock(inode_alloc_inode, 0);
+	mutex_unlock(&inode_alloc_inode->i_mutex);
+
+	iput(inode_alloc_inode);
+	brelse(alloc_bh);
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
diff --git a/kernel/fs/ocfs2/suballoc.h b/kernel/fs/ocfs2/suballoc.h
new file mode 100644
index 000000000..2d2501767
--- /dev/null
+++ b/kernel/fs/ocfs2/suballoc.h
@@ -0,0 +1,237 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.h
+ *
+ * Defines sub allocator api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _CHAINALLOC_H_
+#define _CHAINALLOC_H_
+
+struct ocfs2_suballoc_result;
+typedef int (group_search_t)(struct inode *,
+			     struct buffer_head *,
+			     u32,			/* bits_wanted */
+			     u32,			/* min_bits */
+			     u64,			/* max_block */
+			     struct ocfs2_suballoc_result *);
+							/* found bits */
+
+struct ocfs2_alloc_context {
+	struct inode *ac_inode;    /* which bitmap are we allocating from? */
+	struct buffer_head *ac_bh; /* file entry bh */
+	u32    ac_alloc_slot;   /* which slot are we allocating from? */
+	u32    ac_bits_wanted;
+	u32    ac_bits_given;
+#define OCFS2_AC_USE_LOCAL 1
+#define OCFS2_AC_USE_MAIN  2
+#define OCFS2_AC_USE_INODE 3
+#define OCFS2_AC_USE_META  4
+	u32    ac_which;
+
+	/* these are used by the chain search */
+	u16    ac_chain;
+	int    ac_disable_chain_relink;
+	group_search_t *ac_group_search;
+
+	u64    ac_last_group;
+	u64    ac_max_block;  /* Highest block number to allocate. 0 is
+				 is the same as ~0 - unlimited */
+
+	int    ac_find_loc_only;  /* hack for reflink operation ordering */
+	struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
+
+	struct ocfs2_alloc_reservation	*ac_resv;
+};
+
+void ocfs2_init_steal_slots(struct ocfs2_super *osb);
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
+static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
+{
+	return ac->ac_bits_wanted - ac->ac_bits_given;
+}
+
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_extent_list *root_el,
+			       struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+				      int blocks,
+				      struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac);
+
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+			 handle_t *handle,
+			 struct buffer_head *di_bh,
+			 u32 num_bits,
+			 u16 chain);
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+			 struct buffer_head *di_bh,
+			 u32 num_bits,
+			 u16 chain);
+int ocfs2_block_group_set_bits(handle_t *handle,
+			 struct inode *alloc_inode,
+			 struct ocfs2_group_desc *bg,
+			 struct buffer_head *group_bh,
+			 unsigned int bit_off,
+			 unsigned int num_bits);
+
+int ocfs2_claim_metadata(handle_t *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u64 *suballoc_loc,
+			 u16 *suballoc_bit_start,
+			 u32 *num_bits,
+			 u64 *blkno_start);
+int ocfs2_claim_new_inode(handle_t *handle,
+			  struct inode *dir,
+			  struct buffer_head *parent_fe_bh,
+			  struct ocfs2_alloc_context *ac,
+			  u64 *suballoc_loc,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno);
+int ocfs2_claim_clusters(handle_t *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters);
+/*
+ * Use this variant of ocfs2_claim_clusters to specify a maxiumum
+ * number of clusters smaller than the allocation reserved.
+ */
+int __ocfs2_claim_clusters(handle_t *handle,
+			   struct ocfs2_alloc_context *ac,
+			   u32 min_clusters,
+			   u32 max_clusters,
+			   u32 *cluster_start,
+			   u32 *num_clusters);
+
+int ocfs2_free_suballoc_bits(handle_t *handle,
+			     struct inode *alloc_inode,
+			     struct buffer_head *alloc_bh,
+			     unsigned int start_bit,
+			     u64 bg_blkno,
+			     unsigned int count);
+int ocfs2_free_dinode(handle_t *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di);
+int ocfs2_free_clusters(handle_t *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters);
+int ocfs2_release_clusters(handle_t *handle,
+			   struct inode *bitmap_inode,
+			   struct buffer_head *bitmap_bh,
+			   u64 start_blk,
+			   unsigned int num_clusters);
+
+static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
+{
+	u64 group = block - (u64) bit;
+
+	return group;
+}
+
+static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
+					  u64 bg_blkno)
+{
+	/* This should work for all block group descriptors as only
+	 * the 1st group descriptor of the cluster bitmap is
+	 * different. */
+
+	if (bg_blkno == osb->first_cluster_group_blkno)
+		return 0;
+
+	/* the rest of the block groups are located at the beginning
+	 * of their 1st cluster, so a direct translation just
+	 * works. */
+	return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
+}
+
+static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
+}
+
+/* This is for local alloc ONLY. Others should use the task-specific
+ * apis above. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac);
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
+
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
+
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem.  A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then checking it with this function.  This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh);
+
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac);
+
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
+
+
+
+/*
+ * The following two interfaces are for ocfs2_create_inode_in_orphan().
+ */
+int ocfs2_find_new_inode_loc(struct inode *dir,
+			     struct buffer_head *parent_fe_bh,
+			     struct ocfs2_alloc_context *ac,
+			     u64 *fe_blkno);
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+				 struct inode *dir,
+				 struct ocfs2_alloc_context *ac,
+				 u64 *suballoc_loc,
+				 u16 *suballoc_bit,
+				 u64 di_blkno);
+
+#endif /* _CHAINALLOC_H_ */
diff --git a/kernel/fs/ocfs2/super.c b/kernel/fs/ocfs2/super.c
new file mode 100644
index 000000000..403c5660b
--- /dev/null
+++ b/kernel/fs/ocfs2/super.c
@@ -0,0 +1,2644 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * super.c
+ *
+ * load/unload driver, mount/dismount volumes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/parser.h>
+#include <linux/crc32.h>
+#include <linux/debugfs.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/quotaops.h>
+#include <linux/cleancache.h>
+
+#define CREATE_TRACE_POINTS
+#include "ocfs2_trace.h"
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+/* this should be the only file to include a version 1 header */
+#include "ocfs1_fs_compat.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "blockcheck.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "namei.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "xattr.h"
+#include "quota.h"
+#include "refcounttree.h"
+#include "suballoc.h"
+
+#include "buffer_head_io.h"
+
+static struct kmem_cache *ocfs2_inode_cachep;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
+
+/* OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own. */
+struct workqueue_struct *ocfs2_wq = NULL;
+
+static struct dentry *ocfs2_debugfs_root;
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster file system");
+
+struct mount_options
+{
+	unsigned long	commit_interval;
+	unsigned long	mount_opt;
+	unsigned int	atime_quantum;
+	signed short	slot;
+	int		localalloc_opt;
+	unsigned int	resv_level;
+	int		dir_resv_level;
+	char		cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+};
+
+static int ocfs2_parse_options(struct super_block *sb, char *options,
+			       struct mount_options *mopt,
+			       int is_remount);
+static int ocfs2_check_set_options(struct super_block *sb,
+				   struct mount_options *options);
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
+static void ocfs2_put_super(struct super_block *sb);
+static int ocfs2_mount_volume(struct super_block *sb);
+static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
+static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
+static int ocfs2_initialize_mem_caches(void);
+static void ocfs2_free_mem_caches(void);
+static void ocfs2_delete_osb(struct ocfs2_super *osb);
+
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);
+
+static int ocfs2_sync_fs(struct super_block *sb, int wait);
+
+static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
+static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
+static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
+static int ocfs2_check_volume(struct ocfs2_super *osb);
+static int ocfs2_verify_volume(struct ocfs2_dinode *di,
+			       struct buffer_head *bh,
+			       u32 sectsize,
+			       struct ocfs2_blockcheck_stats *stats);
+static int ocfs2_initialize_super(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int sector_size,
+				  struct ocfs2_blockcheck_stats *stats);
+static int ocfs2_get_sector(struct super_block *sb,
+			    struct buffer_head **bh,
+			    int block,
+			    int sect_size);
+static struct inode *ocfs2_alloc_inode(struct super_block *sb);
+static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
+
+static struct dquot **ocfs2_get_dquots(struct inode *inode)
+{
+	return OCFS2_I(inode)->i_dquot;
+}
+
+static const struct super_operations ocfs2_sops = {
+	.statfs		= ocfs2_statfs,
+	.alloc_inode	= ocfs2_alloc_inode,
+	.destroy_inode	= ocfs2_destroy_inode,
+	.drop_inode	= ocfs2_drop_inode,
+	.evict_inode	= ocfs2_evict_inode,
+	.sync_fs	= ocfs2_sync_fs,
+	.put_super	= ocfs2_put_super,
+	.remount_fs	= ocfs2_remount,
+	.show_options   = ocfs2_show_options,
+	.quota_read	= ocfs2_quota_read,
+	.quota_write	= ocfs2_quota_write,
+	.get_dquots	= ocfs2_get_dquots,
+};
+
+enum {
+	Opt_barrier,
+	Opt_err_panic,
+	Opt_err_ro,
+	Opt_intr,
+	Opt_nointr,
+	Opt_hb_none,
+	Opt_hb_local,
+	Opt_hb_global,
+	Opt_data_ordered,
+	Opt_data_writeback,
+	Opt_atime_quantum,
+	Opt_slot,
+	Opt_commit,
+	Opt_localalloc,
+	Opt_localflocks,
+	Opt_stack,
+	Opt_user_xattr,
+	Opt_nouser_xattr,
+	Opt_inode64,
+	Opt_acl,
+	Opt_noacl,
+	Opt_usrquota,
+	Opt_grpquota,
+	Opt_coherency_buffered,
+	Opt_coherency_full,
+	Opt_resv_level,
+	Opt_dir_resv_level,
+	Opt_journal_async_commit,
+	Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_barrier, "barrier=%u"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_intr, "intr"},
+	{Opt_nointr, "nointr"},
+	{Opt_hb_none, OCFS2_HB_NONE},
+	{Opt_hb_local, OCFS2_HB_LOCAL},
+	{Opt_hb_global, OCFS2_HB_GLOBAL},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_atime_quantum, "atime_quantum=%u"},
+	{Opt_slot, "preferred_slot=%u"},
+	{Opt_commit, "commit=%u"},
+	{Opt_localalloc, "localalloc=%d"},
+	{Opt_localflocks, "localflocks"},
+	{Opt_stack, "cluster_stack=%s"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_inode64, "inode64"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
+	{Opt_coherency_buffered, "coherency=buffered"},
+	{Opt_coherency_full, "coherency=full"},
+	{Opt_resv_level, "resv_level=%u"},
+	{Opt_dir_resv_level, "dir_resv_level=%u"},
+	{Opt_journal_async_commit, "journal_async_commit"},
+	{Opt_err, NULL}
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
+{
+	struct ocfs2_cluster_connection *cconn = osb->cconn;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+	struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
+	int i, out = 0;
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Id: %-s  Uuid: %-s  Gen: 0x%X  Label: %-s\n",
+			"Device", osb->dev_str, osb->uuid_str,
+			osb->fs_generation, osb->vol_label);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => State: %d  Flags: 0x%lX\n", "Volume",
+			atomic_read(&osb->vol_state), osb->osb_flags);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Block: %lu  Cluster: %d\n", "Sizes",
+			osb->sb->s_blocksize, osb->s_clustersize);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Compat: 0x%X  Incompat: 0x%X  "
+			"ROcompat: 0x%X\n",
+			"Features", osb->s_feature_compat,
+			osb->s_feature_incompat, osb->s_feature_ro_compat);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Opts: 0x%lX  AtimeQuanta: %u\n", "Mount",
+			osb->s_mount_opt, osb->s_atime_quantum);
+
+	if (cconn) {
+		out += snprintf(buf + out, len - out,
+				"%10s => Stack: %s  Name: %*s  "
+				"Version: %d.%d\n", "Cluster",
+				(*osb->osb_cluster_stack == '\0' ?
+				 "o2cb" : osb->osb_cluster_stack),
+				cconn->cc_namelen, cconn->cc_name,
+				cconn->cc_version.pv_major,
+				cconn->cc_version.pv_minor);
+	}
+
+	spin_lock(&osb->dc_task_lock);
+	out += snprintf(buf + out, len - out,
+			"%10s => Pid: %d  Count: %lu  WakeSeq: %lu  "
+			"WorkSeq: %lu\n", "DownCnvt",
+			(osb->dc_task ?  task_pid_nr(osb->dc_task) : -1),
+			osb->blocked_lock_count, osb->dc_wake_sequence,
+			osb->dc_work_sequence);
+	spin_unlock(&osb->dc_task_lock);
+
+	spin_lock(&osb->osb_lock);
+	out += snprintf(buf + out, len - out, "%10s => Pid: %d  Nodes:",
+			"Recovery",
+			(osb->recovery_thread_task ?
+			 task_pid_nr(osb->recovery_thread_task) : -1));
+	if (rm->rm_used == 0)
+		out += snprintf(buf + out, len - out, " None\n");
+	else {
+		for (i = 0; i < rm->rm_used; i++)
+			out += snprintf(buf + out, len - out, " %d",
+					rm->rm_entries[i]);
+		out += snprintf(buf + out, len - out, "\n");
+	}
+	spin_unlock(&osb->osb_lock);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Pid: %d  Interval: %lu\n", "Commit",
+			(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
+			osb->osb_commit_interval);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => State: %d  TxnId: %lu  NumTxns: %d\n",
+			"Journal", osb->journal->j_state,
+			osb->journal->j_trans_id,
+			atomic_read(&osb->journal->j_num_trans));
+
+	out += snprintf(buf + out, len - out,
+			"%10s => GlobalAllocs: %d  LocalAllocs: %d  "
+			"SubAllocs: %d  LAWinMoves: %d  SAExtends: %d\n",
+			"Stats",
+			atomic_read(&osb->alloc_stats.bitmap_data),
+			atomic_read(&osb->alloc_stats.local_data),
+			atomic_read(&osb->alloc_stats.bg_allocs),
+			atomic_read(&osb->alloc_stats.moves),
+			atomic_read(&osb->alloc_stats.bg_extends));
+
+	out += snprintf(buf + out, len - out,
+			"%10s => State: %u  Descriptor: %llu  Size: %u bits  "
+			"Default: %u bits\n",
+			"LocalAlloc", osb->local_alloc_state,
+			(unsigned long long)osb->la_last_gd,
+			osb->local_alloc_bits, osb->local_alloc_default_bits);
+
+	spin_lock(&osb->osb_lock);
+	out += snprintf(buf + out, len - out,
+			"%10s => InodeSlot: %d  StolenInodes: %d, "
+			"MetaSlot: %d  StolenMeta: %d\n", "Steal",
+			osb->s_inode_steal_slot,
+			atomic_read(&osb->s_num_inodes_stolen),
+			osb->s_meta_steal_slot,
+			atomic_read(&osb->s_num_meta_stolen));
+	spin_unlock(&osb->osb_lock);
+
+	out += snprintf(buf + out, len - out, "OrphanScan => ");
+	out += snprintf(buf + out, len - out, "Local: %u  Global: %u ",
+			os->os_count, os->os_seqno);
+	out += snprintf(buf + out, len - out, " Last Scan: ");
+	if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+		out += snprintf(buf + out, len - out, "Disabled\n");
+	else
+		out += snprintf(buf + out, len - out, "%lu seconds ago\n",
+				(get_seconds() - os->os_scantime.tv_sec));
+
+	out += snprintf(buf + out, len - out, "%10s => %3s  %10s\n",
+			"Slots", "Num", "RecoGen");
+	for (i = 0; i < osb->max_slots; ++i) {
+		out += snprintf(buf + out, len - out,
+				"%10s  %c %3d  %10d\n",
+				" ",
+				(i == osb->slot_num ? '*' : ' '),
+				i, osb->slot_recovery_generations[i]);
+	}
+
+	return out;
+}
+
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+	struct ocfs2_super *osb = inode->i_private;
+	char *buf = NULL;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto bail;
+
+	i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+				size_t nbytes, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+				       i_size_read(file->f_mapping->host));
+}
+#else
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+				size_t nbytes, loff_t *ppos)
+{
+	return 0;
+}
+#endif	/* CONFIG_DEBUG_FS */
+
+static const struct file_operations ocfs2_osb_debug_fops = {
+	.open =		ocfs2_osb_debug_open,
+	.release =	ocfs2_debug_release,
+	.read =		ocfs2_debug_read,
+	.llseek =	generic_file_llseek,
+};
+
+static int ocfs2_sync_fs(struct super_block *sb, int wait)
+{
+	int status;
+	tid_t target;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (wait) {
+		status = ocfs2_flush_truncate_log(osb);
+		if (status < 0)
+			mlog_errno(status);
+	} else {
+		ocfs2_schedule_truncate_log_flush(osb, 0);
+	}
+
+	if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+				      &target)) {
+		if (wait)
+			jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+					     target);
+	}
+	return 0;
+}
+
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+	    && (ino == USER_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+		return 0;
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+	    && (ino == GROUP_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+		return 0;
+	return 1;
+}
+
+static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
+{
+	struct inode *new = NULL;
+	int status = 0;
+	int i;
+
+	new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
+	if (IS_ERR(new)) {
+		status = PTR_ERR(new);
+		mlog_errno(status);
+		goto bail;
+	}
+	osb->root_inode = new;
+
+	new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
+	if (IS_ERR(new)) {
+		status = PTR_ERR(new);
+		mlog_errno(status);
+		goto bail;
+	}
+	osb->sys_root_inode = new;
+
+	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
+	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
+		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
+		if (!new) {
+			ocfs2_release_system_inodes(osb);
+			status = -EINVAL;
+			mlog_errno(status);
+			/* FIXME: Should ERROR_RO_FS */
+			mlog(ML_ERROR, "Unable to load system inode %d, "
+			     "possibly corrupt fs?", i);
+			goto bail;
+		}
+		// the array now has one ref, so drop this one
+		iput(new);
+	}
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
+{
+	struct inode *new = NULL;
+	int status = 0;
+	int i;
+
+	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
+	     i < NUM_SYSTEM_INODES;
+	     i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
+		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
+		if (!new) {
+			ocfs2_release_system_inodes(osb);
+			status = -EINVAL;
+			mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
+			     status, i, osb->slot_num);
+			goto bail;
+		}
+		/* the array now has one ref, so drop this one */
+		iput(new);
+	}
+
+bail:
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
+{
+	int i;
+	struct inode *inode;
+
+	for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+		inode = osb->global_system_inodes[i];
+		if (inode) {
+			iput(inode);
+			osb->global_system_inodes[i] = NULL;
+		}
+	}
+
+	inode = osb->sys_root_inode;
+	if (inode) {
+		iput(inode);
+		osb->sys_root_inode = NULL;
+	}
+
+	inode = osb->root_inode;
+	if (inode) {
+		iput(inode);
+		osb->root_inode = NULL;
+	}
+
+	if (!osb->local_system_inodes)
+		return;
+
+	for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+		if (osb->local_system_inodes[i]) {
+			iput(osb->local_system_inodes[i]);
+			osb->local_system_inodes[i] = NULL;
+		}
+	}
+
+	kfree(osb->local_system_inodes);
+	osb->local_system_inodes = NULL;
+}
+
+/* We're allocating fs objects, use GFP_NOFS */
+static struct inode *ocfs2_alloc_inode(struct super_block *sb)
+{
+	struct ocfs2_inode_info *oi;
+
+	oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
+	if (!oi)
+		return NULL;
+
+	oi->i_sync_tid = 0;
+	oi->i_datasync_tid = 0;
+	memset(&oi->i_dquot, 0, sizeof(oi->i_dquot));
+
+	jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
+	return &oi->vfs_inode;
+}
+
+static void ocfs2_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
+}
+
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ocfs2_i_callback);
+}
+
+static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
+						unsigned int cbits)
+{
+	unsigned int bytes = 1 << cbits;
+	unsigned int trim = bytes;
+	unsigned int bitshift = 32;
+
+	/*
+	 * i_size and all block offsets in ocfs2 are always 64 bits
+	 * wide. i_clusters is 32 bits, in cluster-sized units. So on
+	 * 64 bit platforms, cluster size will be the limiting factor.
+	 */
+
+#if BITS_PER_LONG == 32
+# if defined(CONFIG_LBDAF)
+	BUILD_BUG_ON(sizeof(sector_t) != 8);
+	/*
+	 * We might be limited by page cache size.
+	 */
+	if (bytes > PAGE_CACHE_SIZE) {
+		bytes = PAGE_CACHE_SIZE;
+		trim = 1;
+		/*
+		 * Shift by 31 here so that we don't get larger than
+		 * MAX_LFS_FILESIZE
+		 */
+		bitshift = 31;
+	}
+# else
+	/*
+	 * We are limited by the size of sector_t. Use block size, as
+	 * that's what we expose to the VFS.
+	 */
+	bytes = 1 << bbits;
+	trim = 1;
+	bitshift = 31;
+# endif
+#endif
+
+	/*
+	 * Trim by a whole cluster when we can actually approach the
+	 * on-disk limits. Otherwise we can overflow i_clusters when
+	 * an extent start is at the max offset.
+	 */
+	return (((unsigned long long)bytes) << bitshift) - trim;
+}
+
+static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
+{
+	int incompat_features;
+	int ret = 0;
+	struct mount_options parsed_options;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u32 tmp;
+
+	sync_filesystem(sb);
+
+	if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
+	    !ocfs2_check_set_options(sb, &parsed_options)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+		OCFS2_MOUNT_HB_NONE;
+	if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
+		goto out;
+	}
+
+	if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
+	    (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot change data mode on remount\n");
+		goto out;
+	}
+
+	/* Probably don't want this on remount; it might
+	 * mess with other nodes */
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
+	    (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
+		goto out;
+	}
+
+	/* We're going to/from readonly mode. */
+	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		/* Disable quota accounting before remounting RO */
+		if (*flags & MS_RDONLY) {
+			ret = ocfs2_susp_quotas(osb, 0);
+			if (ret < 0)
+				goto out;
+		}
+		/* Lock here so the check of HARD_RO and the potential
+		 * setting of SOFT_RO is atomic. */
+		spin_lock(&osb->osb_lock);
+		if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
+			mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
+			ret = -EROFS;
+			goto unlock_osb;
+		}
+
+		if (*flags & MS_RDONLY) {
+			sb->s_flags |= MS_RDONLY;
+			osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+		} else {
+			if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
+				mlog(ML_ERROR, "Cannot remount RDWR "
+				     "filesystem due to previous errors.\n");
+				ret = -EROFS;
+				goto unlock_osb;
+			}
+			incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
+			if (incompat_features) {
+				mlog(ML_ERROR, "Cannot remount RDWR because "
+				     "of unsupported optional features "
+				     "(%x).\n", incompat_features);
+				ret = -EINVAL;
+				goto unlock_osb;
+			}
+			sb->s_flags &= ~MS_RDONLY;
+			osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
+		}
+		trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
+unlock_osb:
+		spin_unlock(&osb->osb_lock);
+		/* Enable quota accounting after remounting RW */
+		if (!ret && !(*flags & MS_RDONLY)) {
+			if (sb_any_quota_suspended(sb))
+				ret = ocfs2_susp_quotas(osb, 1);
+			else
+				ret = ocfs2_enable_quotas(osb);
+			if (ret < 0) {
+				/* Return back changes... */
+				spin_lock(&osb->osb_lock);
+				sb->s_flags |= MS_RDONLY;
+				osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+				spin_unlock(&osb->osb_lock);
+				goto out;
+			}
+		}
+	}
+
+	if (!ret) {
+		/* Only save off the new mount options in case of a successful
+		 * remount. */
+		osb->s_mount_opt = parsed_options.mount_opt;
+		osb->s_atime_quantum = parsed_options.atime_quantum;
+		osb->preferred_slot = parsed_options.slot;
+		if (parsed_options.commit_interval)
+			osb->osb_commit_interval = parsed_options.commit_interval;
+
+		if (!ocfs2_is_hard_readonly(osb))
+			ocfs2_set_journal_params(osb);
+
+		sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+			((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
+							MS_POSIXACL : 0);
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_sb_probe(struct super_block *sb,
+			  struct buffer_head **bh,
+			  int *sector_size,
+			  struct ocfs2_blockcheck_stats *stats)
+{
+	int status, tmpstat;
+	struct ocfs1_vol_disk_hdr *hdr;
+	struct ocfs2_dinode *di;
+	int blksize;
+
+	*bh = NULL;
+
+	/* may be > 512 */
+	*sector_size = bdev_logical_block_size(sb->s_bdev);
+	if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
+		mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
+		     *sector_size, OCFS2_MAX_BLOCKSIZE);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* Can this really happen? */
+	if (*sector_size < OCFS2_MIN_BLOCKSIZE)
+		*sector_size = OCFS2_MIN_BLOCKSIZE;
+
+	/* check block zero for old format */
+	status = ocfs2_get_sector(sb, bh, 0, *sector_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
+	if (hdr->major_version == OCFS1_MAJOR_VERSION) {
+		mlog(ML_ERROR, "incompatible version: %u.%u\n",
+		     hdr->major_version, hdr->minor_version);
+		status = -EINVAL;
+	}
+	if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
+		   strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
+		mlog(ML_ERROR, "incompatible volume signature: %8s\n",
+		     hdr->signature);
+		status = -EINVAL;
+	}
+	brelse(*bh);
+	*bh = NULL;
+	if (status < 0) {
+		mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
+		     "upgraded before mounting with ocfs v2\n");
+		goto bail;
+	}
+
+	/*
+	 * Now check at magic offset for 512, 1024, 2048, 4096
+	 * blocksizes.  4096 is the maximum blocksize because it is
+	 * the minimum clustersize.
+	 */
+	status = -EINVAL;
+	for (blksize = *sector_size;
+	     blksize <= OCFS2_MAX_BLOCKSIZE;
+	     blksize <<= 1) {
+		tmpstat = ocfs2_get_sector(sb, bh,
+					   OCFS2_SUPER_BLOCK_BLKNO,
+					   blksize);
+		if (tmpstat < 0) {
+			status = tmpstat;
+			mlog_errno(status);
+			break;
+		}
+		di = (struct ocfs2_dinode *) (*bh)->b_data;
+		memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+		spin_lock_init(&stats->b_lock);
+		tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
+		if (tmpstat < 0) {
+			brelse(*bh);
+			*bh = NULL;
+		}
+		if (tmpstat != -EAGAIN) {
+			status = tmpstat;
+			break;
+		}
+	}
+
+bail:
+	return status;
+}
+
+static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
+{
+	u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
+
+	if (osb->s_mount_opt & hb_enabled) {
+		if (ocfs2_mount_local(osb)) {
+			mlog(ML_ERROR, "Cannot heartbeat on a locally "
+			     "mounted device.\n");
+			return -EINVAL;
+		}
+		if (ocfs2_userspace_stack(osb)) {
+			mlog(ML_ERROR, "Userspace stack expected, but "
+			     "o2cb heartbeat arguments passed to mount\n");
+			return -EINVAL;
+		}
+		if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
+		     !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
+		    ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
+		     ocfs2_cluster_o2cb_global_heartbeat(osb))) {
+			mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!(osb->s_mount_opt & hb_enabled)) {
+		if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
+		    !ocfs2_userspace_stack(osb)) {
+			mlog(ML_ERROR, "Heartbeat has to be started to mount "
+			     "a read-write clustered device.\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * If we're using a userspace stack, mount should have passed
+ * a name that matches the disk.  If not, mount should not
+ * have passed a stack.
+ */
+static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
+					struct mount_options *mopt)
+{
+	if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
+		mlog(ML_ERROR,
+		     "cluster stack passed to mount, but this filesystem "
+		     "does not support it\n");
+		return -EINVAL;
+	}
+
+	if (ocfs2_userspace_stack(osb) &&
+	    strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
+		    OCFS2_STACK_LABEL_LEN)) {
+		mlog(ML_ERROR,
+		     "cluster stack passed to mount (\"%s\") does not "
+		     "match the filesystem (\"%s\")\n",
+		     mopt->cluster_stack,
+		     osb->osb_cluster_stack);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+	int type;
+	struct super_block *sb = osb->sb;
+	unsigned int feature[OCFS2_MAXQUOTAS] = {
+					OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	int status = 0;
+
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		if (unsuspend)
+			status = dquot_resume(sb, type);
+		else {
+			struct ocfs2_mem_dqinfo *oinfo;
+
+			/* Cancel periodic syncing before suspending */
+			oinfo = sb_dqinfo(sb, type)->dqi_priv;
+			cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+			status = dquot_suspend(sb, type);
+		}
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+		     "remount (error = %d).\n", status);
+	return status;
+}
+
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+	struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL };
+	struct super_block *sb = osb->sb;
+	unsigned int feature[OCFS2_MAXQUOTAS] = {
+					OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	unsigned int ino[OCFS2_MAXQUOTAS] = {
+					LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	int status;
+	int type;
+
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+							osb->slot_num);
+		if (!inode[type]) {
+			status = -ENOENT;
+			goto out_quota_off;
+		}
+		status = dquot_enable(inode[type], type, QFMT_OCFS2,
+				      DQUOT_USAGE_ENABLED);
+		if (status < 0)
+			goto out_quota_off;
+	}
+
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++)
+		iput(inode[type]);
+	return 0;
+out_quota_off:
+	ocfs2_disable_quotas(osb);
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++)
+		iput(inode[type]);
+	mlog_errno(status);
+	return status;
+}
+
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+	int type;
+	struct inode *inode;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+
+	/* We mostly ignore errors in this function because there's not much
+	 * we can do when we see them */
+	for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
+		if (!sb_has_quota_loaded(sb, type))
+			continue;
+		/* Cancel periodic syncing before we grab dqonoff_mutex */
+		oinfo = sb_dqinfo(sb, type)->dqi_priv;
+		cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+		inode = igrab(sb->s_dquot.files[type]);
+		/* Turn off quotas. This will remove all dquot structures from
+		 * memory and so they will be automatically synced to global
+		 * quota files */
+		dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
+					DQUOT_LIMITS_ENABLED);
+		if (!inode)
+			continue;
+		iput(inode);
+	}
+}
+
+static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct dentry *root;
+	int status, sector_size;
+	struct mount_options parsed_options;
+	struct inode *inode = NULL;
+	struct ocfs2_super *osb = NULL;
+	struct buffer_head *bh = NULL;
+	char nodestr[12];
+	struct ocfs2_blockcheck_stats stats;
+
+	trace_ocfs2_fill_super(sb, data, silent);
+
+	if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
+		status = -EINVAL;
+		goto read_super_error;
+	}
+
+	/* probe for superblock */
+	status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
+	if (status < 0) {
+		mlog(ML_ERROR, "superblock probe failed!\n");
+		goto read_super_error;
+	}
+
+	status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
+	osb = OCFS2_SB(sb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto read_super_error;
+	}
+	brelse(bh);
+	bh = NULL;
+
+	if (!ocfs2_check_set_options(sb, &parsed_options)) {
+		status = -EINVAL;
+		goto read_super_error;
+	}
+	osb->s_mount_opt = parsed_options.mount_opt;
+	osb->s_atime_quantum = parsed_options.atime_quantum;
+	osb->preferred_slot = parsed_options.slot;
+	osb->osb_commit_interval = parsed_options.commit_interval;
+
+	ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
+	osb->osb_resv_level = parsed_options.resv_level;
+	osb->osb_dir_resv_level = parsed_options.resv_level;
+	if (parsed_options.dir_resv_level == -1)
+		osb->osb_dir_resv_level = parsed_options.resv_level;
+	else
+		osb->osb_dir_resv_level = parsed_options.dir_resv_level;
+
+	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
+	if (status)
+		goto read_super_error;
+
+	sb->s_magic = OCFS2_SUPER_MAGIC;
+
+	sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
+		((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
+	 * heartbeat=none */
+	if (bdev_read_only(sb->s_bdev)) {
+		if (!(sb->s_flags & MS_RDONLY)) {
+			status = -EACCES;
+			mlog(ML_ERROR, "Readonly device detected but readonly "
+			     "mount was not specified.\n");
+			goto read_super_error;
+		}
+
+		/* You should not be able to start a local heartbeat
+		 * on a readonly device. */
+		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+			status = -EROFS;
+			mlog(ML_ERROR, "Local heartbeat specified on readonly "
+			     "device.\n");
+			goto read_super_error;
+		}
+
+		status = ocfs2_check_journals_nolocks(osb);
+		if (status < 0) {
+			if (status == -EROFS)
+				mlog(ML_ERROR, "Recovery required on readonly "
+				     "file system, but write access is "
+				     "unavailable.\n");
+			else
+				mlog_errno(status);
+			goto read_super_error;
+		}
+
+		ocfs2_set_ro_flag(osb, 1);
+
+		printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+		       "Cluster services will not be used for this mount. "
+		       "Recovery will be skipped.\n", osb->dev_str);
+	}
+
+	if (!ocfs2_is_hard_readonly(osb)) {
+		if (sb->s_flags & MS_RDONLY)
+			ocfs2_set_ro_flag(osb, 0);
+	}
+
+	status = ocfs2_verify_heartbeat(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
+	osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
+						 ocfs2_debugfs_root);
+	if (!osb->osb_debug_root) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
+		goto read_super_error;
+	}
+
+	osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
+					    osb->osb_debug_root,
+					    osb,
+					    &ocfs2_osb_debug_fops);
+	if (!osb->osb_ctxt) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
+	if (ocfs2_meta_ecc(osb)) {
+		status = ocfs2_blockcheck_stats_debugfs_install(
+						&osb->osb_ecc_stats,
+						osb->osb_debug_root);
+		if (status) {
+			mlog(ML_ERROR,
+			     "Unable to create blockcheck statistics "
+			     "files\n");
+			goto read_super_error;
+		}
+	}
+
+	status = ocfs2_mount_volume(sb);
+	if (status < 0)
+		goto read_super_error;
+
+	if (osb->root_inode)
+		inode = igrab(osb->root_inode);
+
+	if (!inode) {
+		status = -EIO;
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
+	root = d_make_root(inode);
+	if (!root) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
+	sb->s_root = root;
+
+	ocfs2_complete_mount_recovery(osb);
+
+	if (ocfs2_mount_local(osb))
+		snprintf(nodestr, sizeof(nodestr), "local");
+	else
+		snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
+
+	printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
+	       "with %s data mode.\n",
+	       osb->dev_str, nodestr, osb->slot_num,
+	       osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
+	       "ordered");
+
+	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
+	wake_up(&osb->osb_mount_event);
+
+	/* Now we can initialize quotas because we can afford to wait
+	 * for cluster locks recovery now. That also means that truncation
+	 * log recovery can happen but that waits for proper quota setup */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		status = ocfs2_enable_quotas(osb);
+		if (status < 0) {
+			/* We have to err-out specially here because
+			 * s_root is already set */
+			mlog_errno(status);
+			atomic_set(&osb->vol_state, VOLUME_DISABLED);
+			wake_up(&osb->osb_mount_event);
+			return status;
+		}
+	}
+
+	ocfs2_complete_quota_recovery(osb);
+
+	/* Now we wake up again for processes waiting for quotas */
+	atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+	wake_up(&osb->osb_mount_event);
+
+	/* Start this when the mount is almost sure of being successful */
+	ocfs2_orphan_scan_start(osb);
+
+	return status;
+
+read_super_error:
+	brelse(bh);
+
+	if (osb) {
+		atomic_set(&osb->vol_state, VOLUME_DISABLED);
+		wake_up(&osb->osb_mount_event);
+		ocfs2_dismount_volume(sb, 1);
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
+			int flags,
+			const char *dev_name,
+			void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+}
+
+static struct file_system_type ocfs2_fs_type = {
+	.owner          = THIS_MODULE,
+	.name           = "ocfs2",
+	.mount          = ocfs2_mount,
+	.kill_sb        = kill_block_super,
+	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
+	.next           = NULL
+};
+MODULE_ALIAS_FS("ocfs2");
+
+static int ocfs2_check_set_options(struct super_block *sb,
+				   struct mount_options *options)
+{
+	if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
+	    !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+					 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+		mlog(ML_ERROR, "User quotas were requested, but this "
+		     "filesystem does not have the feature enabled.\n");
+		return 0;
+	}
+	if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+	    !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+					 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+		mlog(ML_ERROR, "Group quotas were requested, but this "
+		     "filesystem does not have the feature enabled.\n");
+		return 0;
+	}
+	if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+	    !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
+		mlog(ML_ERROR, "ACL support requested but extended attributes "
+		     "feature is not enabled\n");
+		return 0;
+	}
+	/* No ACL setting specified? Use XATTR feature... */
+	if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
+				    OCFS2_MOUNT_NO_POSIX_ACL))) {
+		if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
+			options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+		else
+			options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
+	}
+	return 1;
+}
+
+static int ocfs2_parse_options(struct super_block *sb,
+			       char *options,
+			       struct mount_options *mopt,
+			       int is_remount)
+{
+	int status, user_stack = 0;
+	char *p;
+	u32 tmp;
+
+	trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
+
+	mopt->commit_interval = 0;
+	mopt->mount_opt = OCFS2_MOUNT_NOINTR;
+	mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
+	mopt->slot = OCFS2_INVALID_SLOT;
+	mopt->localalloc_opt = -1;
+	mopt->cluster_stack[0] = '\0';
+	mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+	mopt->dir_resv_level = -1;
+
+	if (!options) {
+		status = 1;
+		goto bail;
+	}
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token, option;
+		substring_t args[MAX_OPT_ARGS];
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_hb_local:
+			mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
+			break;
+		case Opt_hb_none:
+			mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
+			break;
+		case Opt_hb_global:
+			mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
+			break;
+		case Opt_barrier:
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option)
+				mopt->mount_opt |= OCFS2_MOUNT_BARRIER;
+			else
+				mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER;
+			break;
+		case Opt_intr:
+			mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR;
+			break;
+		case Opt_nointr:
+			mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
+			break;
+		case Opt_err_panic:
+			mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+			break;
+		case Opt_err_ro:
+			mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+			break;
+		case Opt_data_ordered:
+			mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
+			break;
+		case Opt_data_writeback:
+			mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
+			break;
+		case Opt_user_xattr:
+			mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+			break;
+		case Opt_nouser_xattr:
+			mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
+			break;
+		case Opt_atime_quantum:
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= 0)
+				mopt->atime_quantum = option;
+			break;
+		case Opt_slot:
+			option = 0;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option)
+				mopt->slot = (s16)option;
+			break;
+		case Opt_commit:
+			option = 0;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
+			mopt->commit_interval = HZ * option;
+			break;
+		case Opt_localalloc:
+			option = 0;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= 0)
+				mopt->localalloc_opt = option;
+			break;
+		case Opt_localflocks:
+			/*
+			 * Changing this during remount could race
+			 * flock() requests, or "unbalance" existing
+			 * ones (e.g., a lock is taken in one mode but
+			 * dropped in the other). If users care enough
+			 * to flip locking modes during remount, we
+			 * could add a "local" flag to individual
+			 * flock structures for proper tracking of
+			 * state.
+			 */
+			if (!is_remount)
+				mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
+			break;
+		case Opt_stack:
+			/* Check both that the option we were passed
+			 * is of the right length and that it is a proper
+			 * string of the right length.
+			 */
+			if (((args[0].to - args[0].from) !=
+			     OCFS2_STACK_LABEL_LEN) ||
+			    (strnlen(args[0].from,
+				     OCFS2_STACK_LABEL_LEN) !=
+			     OCFS2_STACK_LABEL_LEN)) {
+				mlog(ML_ERROR,
+				     "Invalid cluster_stack option\n");
+				status = 0;
+				goto bail;
+			}
+			memcpy(mopt->cluster_stack, args[0].from,
+			       OCFS2_STACK_LABEL_LEN);
+			mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+			/*
+			 * Open code the memcmp here as we don't have
+			 * an osb to pass to
+			 * ocfs2_userspace_stack().
+			 */
+			if (memcmp(mopt->cluster_stack,
+				   OCFS2_CLASSIC_CLUSTER_STACK,
+				   OCFS2_STACK_LABEL_LEN))
+				user_stack = 1;
+			break;
+		case Opt_inode64:
+			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
+			break;
+		case Opt_usrquota:
+			mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+			break;
+		case Opt_grpquota:
+			mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+			break;
+		case Opt_coherency_buffered:
+			mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+			break;
+		case Opt_coherency_full:
+			mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+			break;
+		case Opt_acl:
+			mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+			mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
+			break;
+		case Opt_noacl:
+			mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
+			mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+			break;
+		case Opt_resv_level:
+			if (is_remount)
+				break;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= OCFS2_MIN_RESV_LEVEL &&
+			    option < OCFS2_MAX_RESV_LEVEL)
+				mopt->resv_level = option;
+			break;
+		case Opt_dir_resv_level:
+			if (is_remount)
+				break;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= OCFS2_MIN_RESV_LEVEL &&
+			    option < OCFS2_MAX_RESV_LEVEL)
+				mopt->dir_resv_level = option;
+			break;
+		case Opt_journal_async_commit:
+			mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
+			break;
+		default:
+			mlog(ML_ERROR,
+			     "Unrecognized mount option \"%s\" "
+			     "or missing value\n", p);
+			status = 0;
+			goto bail;
+		}
+	}
+
+	if (user_stack == 0) {
+		/* Ensure only one heartbeat mode */
+		tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
+					 OCFS2_MOUNT_HB_GLOBAL |
+					 OCFS2_MOUNT_HB_NONE);
+		if (hweight32(tmp) != 1) {
+			mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+			status = 0;
+			goto bail;
+		}
+	}
+
+	status = 1;
+
+bail:
+	return status;
+}
+
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
+{
+	struct ocfs2_super *osb = OCFS2_SB(root->d_sb);
+	unsigned long opts = osb->s_mount_opt;
+	unsigned int local_alloc_megs;
+
+	if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
+		seq_printf(s, ",_netdev");
+		if (opts & OCFS2_MOUNT_HB_LOCAL)
+			seq_printf(s, ",%s", OCFS2_HB_LOCAL);
+		else
+			seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
+	} else
+		seq_printf(s, ",%s", OCFS2_HB_NONE);
+
+	if (opts & OCFS2_MOUNT_NOINTR)
+		seq_printf(s, ",nointr");
+
+	if (opts & OCFS2_MOUNT_DATA_WRITEBACK)
+		seq_printf(s, ",data=writeback");
+	else
+		seq_printf(s, ",data=ordered");
+
+	if (opts & OCFS2_MOUNT_BARRIER)
+		seq_printf(s, ",barrier=1");
+
+	if (opts & OCFS2_MOUNT_ERRORS_PANIC)
+		seq_printf(s, ",errors=panic");
+	else
+		seq_printf(s, ",errors=remount-ro");
+
+	if (osb->preferred_slot != OCFS2_INVALID_SLOT)
+		seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
+
+	seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+
+	if (osb->osb_commit_interval)
+		seq_printf(s, ",commit=%u",
+			   (unsigned) (osb->osb_commit_interval / HZ));
+
+	local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
+	if (local_alloc_megs != ocfs2_la_default_mb(osb))
+		seq_printf(s, ",localalloc=%d", local_alloc_megs);
+
+	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
+		seq_printf(s, ",localflocks,");
+
+	if (osb->osb_cluster_stack[0])
+		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
+			   osb->osb_cluster_stack);
+	if (opts & OCFS2_MOUNT_USRQUOTA)
+		seq_printf(s, ",usrquota");
+	if (opts & OCFS2_MOUNT_GRPQUOTA)
+		seq_printf(s, ",grpquota");
+
+	if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+		seq_printf(s, ",coherency=buffered");
+	else
+		seq_printf(s, ",coherency=full");
+
+	if (opts & OCFS2_MOUNT_NOUSERXATTR)
+		seq_printf(s, ",nouser_xattr");
+	else
+		seq_printf(s, ",user_xattr");
+
+	if (opts & OCFS2_MOUNT_INODE64)
+		seq_printf(s, ",inode64");
+
+	if (opts & OCFS2_MOUNT_POSIX_ACL)
+		seq_printf(s, ",acl");
+	else
+		seq_printf(s, ",noacl");
+
+	if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+		seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+	if (osb->osb_dir_resv_level != osb->osb_resv_level)
+		seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+
+	if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
+		seq_printf(s, ",journal_async_commit");
+
+	return 0;
+}
+
+static int __init ocfs2_init(void)
+{
+	int status;
+
+	status = init_ocfs2_uptodate_cache();
+	if (status < 0)
+		goto out1;
+
+	status = ocfs2_initialize_mem_caches();
+	if (status < 0)
+		goto out2;
+
+	ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+	if (!ocfs2_wq) {
+		status = -ENOMEM;
+		goto out3;
+	}
+
+	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
+	if (!ocfs2_debugfs_root) {
+		status = -ENOMEM;
+		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
+		goto out4;
+	}
+
+	ocfs2_set_locking_protocol();
+
+	status = register_quota_format(&ocfs2_quota_format);
+	if (status < 0)
+		goto out4;
+	status = register_filesystem(&ocfs2_fs_type);
+	if (!status)
+		return 0;
+
+	unregister_quota_format(&ocfs2_quota_format);
+out4:
+	destroy_workqueue(ocfs2_wq);
+	debugfs_remove(ocfs2_debugfs_root);
+out3:
+	ocfs2_free_mem_caches();
+out2:
+	exit_ocfs2_uptodate_cache();
+out1:
+	mlog_errno(status);
+	return status;
+}
+
+static void __exit ocfs2_exit(void)
+{
+	if (ocfs2_wq) {
+		flush_workqueue(ocfs2_wq);
+		destroy_workqueue(ocfs2_wq);
+	}
+
+	unregister_quota_format(&ocfs2_quota_format);
+
+	debugfs_remove(ocfs2_debugfs_root);
+
+	ocfs2_free_mem_caches();
+
+	unregister_filesystem(&ocfs2_fs_type);
+
+	exit_ocfs2_uptodate_cache();
+}
+
+static void ocfs2_put_super(struct super_block *sb)
+{
+	trace_ocfs2_put_super(sb);
+
+	ocfs2_sync_blockdev(sb);
+	ocfs2_dismount_volume(sb, 0);
+}
+
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct ocfs2_super *osb;
+	u32 numbits, freebits;
+	int status;
+	struct ocfs2_dinode *bm_lock;
+	struct buffer_head *bh = NULL;
+	struct inode *inode = NULL;
+
+	trace_ocfs2_statfs(dentry->d_sb, buf);
+
+	osb = OCFS2_SB(dentry->d_sb);
+
+	inode = ocfs2_get_system_file_inode(osb,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		mlog(ML_ERROR, "failed to get bitmap inode\n");
+		status = -EIO;
+		goto bail;
+	}
+
+	status = ocfs2_inode_lock(inode, &bh, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bm_lock = (struct ocfs2_dinode *) bh->b_data;
+
+	numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
+	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
+
+	buf->f_type = OCFS2_SUPER_MAGIC;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
+	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
+	buf->f_blocks = ((sector_t) numbits) *
+			(osb->s_clustersize >> osb->sb->s_blocksize_bits);
+	buf->f_bfree = ((sector_t) freebits) *
+		       (osb->s_clustersize >> osb->sb->s_blocksize_bits);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = numbits;
+	buf->f_ffree = freebits;
+	buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
+				& 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
+				OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
+
+	brelse(bh);
+
+	ocfs2_inode_unlock(inode, 0);
+	status = 0;
+bail:
+	if (inode)
+		iput(inode);
+
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+static void ocfs2_inode_init_once(void *data)
+{
+	struct ocfs2_inode_info *oi = data;
+
+	oi->ip_flags = 0;
+	oi->ip_open_count = 0;
+	spin_lock_init(&oi->ip_lock);
+	ocfs2_extent_map_init(&oi->vfs_inode);
+	INIT_LIST_HEAD(&oi->ip_io_markers);
+	oi->ip_dir_start_lookup = 0;
+	mutex_init(&oi->ip_unaligned_aio);
+	init_rwsem(&oi->ip_alloc_sem);
+	init_rwsem(&oi->ip_xattr_sem);
+	mutex_init(&oi->ip_io_mutex);
+
+	oi->ip_blkno = 0ULL;
+	oi->ip_clusters = 0;
+
+	ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
+	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
+	ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
+	ocfs2_lock_res_init_once(&oi->ip_open_lockres);
+
+	init_waitqueue_head(&oi->append_dio_wq);
+
+	ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
+				  &ocfs2_inode_caching_ops);
+
+	inode_init_once(&oi->vfs_inode);
+}
+
+static int ocfs2_initialize_mem_caches(void)
+{
+	ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
+				       sizeof(struct ocfs2_inode_info),
+				       0,
+				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+				       ocfs2_inode_init_once);
+	ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+					sizeof(struct ocfs2_dquot),
+					0,
+					(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					NULL);
+	ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+					sizeof(struct ocfs2_quota_chunk),
+					0,
+					(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+					NULL);
+	if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+	    !ocfs2_qf_chunk_cachep) {
+		if (ocfs2_inode_cachep)
+			kmem_cache_destroy(ocfs2_inode_cachep);
+		if (ocfs2_dquot_cachep)
+			kmem_cache_destroy(ocfs2_dquot_cachep);
+		if (ocfs2_qf_chunk_cachep)
+			kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void ocfs2_free_mem_caches(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	if (ocfs2_inode_cachep)
+		kmem_cache_destroy(ocfs2_inode_cachep);
+	ocfs2_inode_cachep = NULL;
+
+	if (ocfs2_dquot_cachep)
+		kmem_cache_destroy(ocfs2_dquot_cachep);
+	ocfs2_dquot_cachep = NULL;
+
+	if (ocfs2_qf_chunk_cachep)
+		kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+	ocfs2_qf_chunk_cachep = NULL;
+}
+
+static int ocfs2_get_sector(struct super_block *sb,
+			    struct buffer_head **bh,
+			    int block,
+			    int sect_size)
+{
+	if (!sb_set_blocksize(sb, sect_size)) {
+		mlog(ML_ERROR, "unable to set blocksize\n");
+		return -EIO;
+	}
+
+	*bh = sb_getblk(sb, block);
+	if (!*bh) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+	lock_buffer(*bh);
+	if (!buffer_dirty(*bh))
+		clear_buffer_uptodate(*bh);
+	unlock_buffer(*bh);
+	ll_rw_block(READ, 1, bh);
+	wait_on_buffer(*bh);
+	if (!buffer_uptodate(*bh)) {
+		mlog_errno(-EIO);
+		brelse(*bh);
+		*bh = NULL;
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int ocfs2_mount_volume(struct super_block *sb)
+{
+	int status = 0;
+	int unlock_super = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (ocfs2_is_hard_readonly(osb))
+		goto leave;
+
+	status = ocfs2_dlm_init(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+	unlock_super = 1;
+
+	/* This will load up the node map and add ourselves to it. */
+	status = ocfs2_find_slot(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* load all node-local system inodes */
+	status = ocfs2_init_local_system_inodes(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_check_volume(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_truncate_log_init(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+leave:
+	if (unlock_super)
+		ocfs2_super_unlock(osb, 1);
+
+	return status;
+}
+
+static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
+{
+	int tmp, hangup_needed = 0;
+	struct ocfs2_super *osb = NULL;
+	char nodestr[12];
+
+	trace_ocfs2_dismount_volume(sb);
+
+	BUG_ON(!sb);
+	osb = OCFS2_SB(sb);
+	BUG_ON(!osb);
+
+	debugfs_remove(osb->osb_ctxt);
+
+	/* Orphan scan should be stopped as early as possible */
+	ocfs2_orphan_scan_stop(osb);
+
+	ocfs2_disable_quotas(osb);
+
+	/* All dquots should be freed by now */
+	WARN_ON(!llist_empty(&osb->dquot_drop_list));
+	/* Wait for worker to be done with the work structure in osb */
+	cancel_work_sync(&osb->dquot_drop_work);
+
+	ocfs2_shutdown_local_alloc(osb);
+
+	ocfs2_truncate_log_shutdown(osb);
+
+	/* This will disable recovery and flush any recovery work. */
+	ocfs2_recovery_exit(osb);
+
+	ocfs2_journal_shutdown(osb);
+
+	ocfs2_sync_blockdev(sb);
+
+	ocfs2_purge_refcount_trees(osb);
+
+	/* No cluster connection means we've failed during mount, so skip
+	 * all the steps which depended on that to complete. */
+	if (osb->cconn) {
+		tmp = ocfs2_super_lock(osb, 1);
+		if (tmp < 0) {
+			mlog_errno(tmp);
+			return;
+		}
+	}
+
+	if (osb->slot_num != OCFS2_INVALID_SLOT)
+		ocfs2_put_slot(osb);
+
+	if (osb->cconn)
+		ocfs2_super_unlock(osb, 1);
+
+	ocfs2_release_system_inodes(osb);
+
+	/*
+	 * If we're dismounting due to mount error, mount.ocfs2 will clean
+	 * up heartbeat.  If we're a local mount, there is no heartbeat.
+	 * If we failed before we got a uuid_str yet, we can't stop
+	 * heartbeat.  Otherwise, do it.
+	 */
+	if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+	    !ocfs2_is_hard_readonly(osb))
+		hangup_needed = 1;
+
+	if (osb->cconn)
+		ocfs2_dlm_shutdown(osb, hangup_needed);
+
+	ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
+	debugfs_remove(osb->osb_debug_root);
+
+	if (hangup_needed)
+		ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
+
+	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
+
+	if (ocfs2_mount_local(osb))
+		snprintf(nodestr, sizeof(nodestr), "local");
+	else
+		snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
+
+	printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
+	       osb->dev_str, nodestr);
+
+	ocfs2_delete_osb(osb);
+	kfree(osb);
+	sb->s_dev = 0;
+	sb->s_fs_info = NULL;
+}
+
+static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
+				unsigned uuid_bytes)
+{
+	int i, ret;
+	char *ptr;
+
+	BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
+
+	osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
+	if (osb->uuid_str == NULL)
+		return -ENOMEM;
+
+	for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
+		/* print with null */
+		ret = snprintf(ptr, 3, "%02X", uuid[i]);
+		if (ret != 2) /* drop super cleans up */
+			return -EINVAL;
+		/* then only advance past the last char */
+		ptr += 2;
+	}
+
+	return 0;
+}
+
+/* Make sure entire volume is addressable by our journal.  Requires
+   osb_clusters_at_boot to be valid and for the journal to have been
+   initialized by ocfs2_journal_init(). */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+	int status = 0;
+	u64 max_block =
+		ocfs2_clusters_to_blocks(osb->sb,
+					 osb->osb_clusters_at_boot) - 1;
+
+	/* 32-bit block number is always OK. */
+	if (max_block <= (u32)~0ULL)
+		goto out;
+
+	/* Volume is "huge", so see if our journal is new enough to
+	   support it. */
+	if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+				       OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+	      jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+					       JBD2_FEATURE_INCOMPAT_64BIT))) {
+		mlog(ML_ERROR, "The journal cannot address the entire volume. "
+		     "Enable the 'block64' journal option with tunefs.ocfs2");
+		status = -EFBIG;
+		goto out;
+	}
+
+ out:
+	return status;
+}
+
+static int ocfs2_initialize_super(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int sector_size,
+				  struct ocfs2_blockcheck_stats *stats)
+{
+	int status;
+	int i, cbits, bbits;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+	struct inode *inode = NULL;
+	struct ocfs2_journal *journal;
+	struct ocfs2_super *osb;
+	u64 total_blocks;
+
+	osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
+	if (!osb) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	sb->s_fs_info = osb;
+	sb->s_op = &ocfs2_sops;
+	sb->s_d_op = &ocfs2_dentry_ops;
+	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_qcop = &dquot_quotactl_sysfile_ops;
+	sb->dq_op = &ocfs2_quota_operations;
+	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+	sb->s_xattr = ocfs2_xattr_handlers;
+	sb->s_time_gran = 1;
+	sb->s_flags |= MS_NOATIME;
+	/* this is needed to support O_LARGEFILE */
+	cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
+	bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
+	sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
+	memcpy(sb->s_uuid, di->id2.i_super.s_uuid,
+	       sizeof(di->id2.i_super.s_uuid));
+
+	osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
+
+	for (i = 0; i < 3; i++)
+		osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
+	osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
+
+	osb->sb = sb;
+	/* Save off for ocfs2_rw_direct */
+	osb->s_sectsize_bits = blksize_bits(sector_size);
+	BUG_ON(!osb->s_sectsize_bits);
+
+	spin_lock_init(&osb->dc_task_lock);
+	init_waitqueue_head(&osb->dc_event);
+	osb->dc_work_sequence = 0;
+	osb->dc_wake_sequence = 0;
+	INIT_LIST_HEAD(&osb->blocked_lock_list);
+	osb->blocked_lock_count = 0;
+	spin_lock_init(&osb->osb_lock);
+	spin_lock_init(&osb->osb_xattr_lock);
+	ocfs2_init_steal_slots(osb);
+
+	mutex_init(&osb->system_file_mutex);
+
+	atomic_set(&osb->alloc_stats.moves, 0);
+	atomic_set(&osb->alloc_stats.local_data, 0);
+	atomic_set(&osb->alloc_stats.bitmap_data, 0);
+	atomic_set(&osb->alloc_stats.bg_allocs, 0);
+	atomic_set(&osb->alloc_stats.bg_extends, 0);
+
+	/* Copy the blockcheck stats from the superblock probe */
+	osb->osb_ecc_stats = *stats;
+
+	ocfs2_init_node_maps(osb);
+
+	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
+		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+		     osb->max_slots);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	ocfs2_orphan_scan_init(osb);
+
+	status = ocfs2_recovery_init(osb);
+	if (status) {
+		mlog(ML_ERROR, "Unable to initialize recovery state\n");
+		mlog_errno(status);
+		goto bail;
+	}
+
+	init_waitqueue_head(&osb->checkpoint_event);
+
+	osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
+
+	osb->slot_num = OCFS2_INVALID_SLOT;
+
+	osb->s_xattr_inline_size = le16_to_cpu(
+					di->id2.i_super.s_xattr_inline_size);
+
+	osb->local_alloc_state = OCFS2_LA_UNUSED;
+	osb->local_alloc_bh = NULL;
+	INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
+
+	init_waitqueue_head(&osb->osb_mount_event);
+
+	status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
+	if (!osb->vol_label) {
+		mlog(ML_ERROR, "unable to alloc vol label\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	osb->slot_recovery_generations =
+		kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
+			GFP_KERNEL);
+	if (!osb->slot_recovery_generations) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	init_waitqueue_head(&osb->osb_wipe_event);
+	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
+					sizeof(*osb->osb_orphan_wipes),
+					GFP_KERNEL);
+	if (!osb->osb_orphan_wipes) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->osb_rf_lock_tree = RB_ROOT;
+
+	osb->s_feature_compat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
+	osb->s_feature_ro_compat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
+	osb->s_feature_incompat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
+
+	if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
+		mlog(ML_ERROR, "couldn't mount because of unsupported "
+		     "optional features (%x).\n", i);
+		status = -EINVAL;
+		goto bail;
+	}
+	if (!(osb->sb->s_flags & MS_RDONLY) &&
+	    (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
+		mlog(ML_ERROR, "couldn't mount RDWR because of "
+		     "unsupported optional features (%x).\n", i);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (ocfs2_clusterinfo_valid(osb)) {
+		osb->osb_stackflags =
+			OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
+		strlcpy(osb->osb_cluster_stack,
+		       OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
+		       OCFS2_STACK_LABEL_LEN + 1);
+		if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
+			mlog(ML_ERROR,
+			     "couldn't mount because of an invalid "
+			     "cluster stack label (%s) \n",
+			     osb->osb_cluster_stack);
+			status = -EINVAL;
+			goto bail;
+		}
+		strlcpy(osb->osb_cluster_name,
+			OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
+			OCFS2_CLUSTER_NAME_LEN + 1);
+	} else {
+		/* The empty string is identical with classic tools that
+		 * don't know about s_cluster_info. */
+		osb->osb_cluster_stack[0] = '\0';
+	}
+
+	get_random_bytes(&osb->s_next_generation, sizeof(u32));
+
+	/* FIXME
+	 * This should be done in ocfs2_journal_init(), but unknown
+	 * ordering issues will cause the filesystem to crash.
+	 * If anyone wants to figure out what part of the code
+	 * refers to osb->journal before ocfs2_journal_init() is run,
+	 * be my guest.
+	 */
+	/* initialize our journal structure */
+
+	journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
+	if (!journal) {
+		mlog(ML_ERROR, "unable to alloc journal\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+	osb->journal = journal;
+	journal->j_osb = osb;
+
+	atomic_set(&journal->j_num_trans, 0);
+	init_rwsem(&journal->j_trans_barrier);
+	init_waitqueue_head(&journal->j_checkpointed);
+	spin_lock_init(&journal->j_lock);
+	journal->j_trans_id = (unsigned long) 1;
+	INIT_LIST_HEAD(&journal->j_la_cleanups);
+	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
+	journal->j_state = OCFS2_JOURNAL_FREE;
+
+	INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
+	init_llist_head(&osb->dquot_drop_list);
+
+	/* get some pseudo constants for clustersize bits */
+	osb->s_clustersize_bits =
+		le32_to_cpu(di->id2.i_super.s_clustersize_bits);
+	osb->s_clustersize = 1 << osb->s_clustersize_bits;
+
+	if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
+	    osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
+		mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
+		     osb->s_clustersize);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+						le32_to_cpu(di->i_clusters));
+
+	status = generic_check_addressable(osb->sb->s_blocksize_bits,
+					   total_blocks);
+	if (status) {
+		mlog(ML_ERROR, "Volume too large "
+		     "to mount safely on this system");
+		status = -EFBIG;
+		goto bail;
+	}
+
+	if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
+				 sizeof(di->id2.i_super.s_uuid))) {
+		mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	strlcpy(osb->vol_label, di->id2.i_super.s_label,
+		OCFS2_MAX_VOL_LABEL_LEN);
+	osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
+	osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
+	osb->first_cluster_group_blkno =
+		le64_to_cpu(di->id2.i_super.s_first_cluster_group);
+	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+	osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
+	trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
+				     (unsigned long long)osb->root_blkno,
+				     (unsigned long long)osb->system_dir_blkno,
+				     osb->s_clustersize_bits);
+
+	osb->osb_dlm_debug = ocfs2_new_dlm_debug();
+	if (!osb->osb_dlm_debug) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_set(&osb->vol_state, VOLUME_INIT);
+
+	/* load root, system_dir, and all global system inodes */
+	status = ocfs2_init_global_system_inodes(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/*
+	 * global bitmap
+	 */
+	inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+	osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
+	iput(inode);
+
+	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+				 osb->s_feature_incompat) * 8;
+
+	status = ocfs2_init_slot_info(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	cleancache_init_shared_fs(sb);
+
+bail:
+	return status;
+}
+
+/*
+ * will return: -EAGAIN if it is ok to keep searching for superblocks
+ *              -EINVAL if there is a bad superblock
+ *              0 on success
+ */
+static int ocfs2_verify_volume(struct ocfs2_dinode *di,
+			       struct buffer_head *bh,
+			       u32 blksz,
+			       struct ocfs2_blockcheck_stats *stats)
+{
+	int status = -EAGAIN;
+
+	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
+		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+		/* We have to do a raw check of the feature here */
+		if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+		    OCFS2_FEATURE_INCOMPAT_META_ECC) {
+			status = ocfs2_block_check_validate(bh->b_data,
+							    bh->b_size,
+							    &di->i_check,
+							    stats);
+			if (status)
+				goto out;
+		}
+		status = -EINVAL;
+		if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
+			mlog(ML_ERROR, "found superblock with incorrect block "
+			     "size: found %u, should be %u\n",
+			     1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
+			       blksz);
+		} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
+			   OCFS2_MAJOR_REV_LEVEL ||
+			   le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
+			   OCFS2_MINOR_REV_LEVEL) {
+			mlog(ML_ERROR, "found superblock with bad version: "
+			     "found %u.%u, should be %u.%u\n",
+			     le16_to_cpu(di->id2.i_super.s_major_rev_level),
+			     le16_to_cpu(di->id2.i_super.s_minor_rev_level),
+			     OCFS2_MAJOR_REV_LEVEL,
+			     OCFS2_MINOR_REV_LEVEL);
+		} else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
+			mlog(ML_ERROR, "bad block number on superblock: "
+			     "found %llu, should be %llu\n",
+			     (unsigned long long)le64_to_cpu(di->i_blkno),
+			     (unsigned long long)bh->b_blocknr);
+		} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
+			    le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
+			mlog(ML_ERROR, "bad cluster size found: %u\n",
+			     1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
+		} else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
+			mlog(ML_ERROR, "bad root_blkno: 0\n");
+		} else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
+			mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
+		} else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
+			mlog(ML_ERROR,
+			     "Superblock slots found greater than file system "
+			     "maximum: found %u, max %u\n",
+			     le16_to_cpu(di->id2.i_super.s_max_slots),
+			     OCFS2_MAX_SLOTS);
+		} else {
+			/* found it! */
+			status = 0;
+		}
+	}
+
+out:
+	if (status && status != -EAGAIN)
+		mlog_errno(status);
+	return status;
+}
+
+static int ocfs2_check_volume(struct ocfs2_super *osb)
+{
+	int status;
+	int dirty;
+	int local;
+	struct ocfs2_dinode *local_alloc = NULL; /* only used if we
+						  * recover
+						  * ourselves. */
+
+	/* Init our journal object. */
+	status = ocfs2_journal_init(osb->journal, &dirty);
+	if (status < 0) {
+		mlog(ML_ERROR, "Could not initialize journal!\n");
+		goto finally;
+	}
+
+	/* Now that journal has been initialized, check to make sure
+	   entire volume is addressable. */
+	status = ocfs2_journal_addressable(osb);
+	if (status)
+		goto finally;
+
+	/* If the journal was unmounted cleanly then we don't want to
+	 * recover anything. Otherwise, journal_load will do that
+	 * dirty work for us :) */
+	if (!dirty) {
+		status = ocfs2_journal_wipe(osb->journal, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			goto finally;
+		}
+	} else {
+		printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+		       "unmounted cleanly, recovering it.\n", osb->dev_str);
+	}
+
+	local = ocfs2_mount_local(osb);
+
+	/* will play back anything left in the journal. */
+	status = ocfs2_journal_load(osb->journal, local, dirty);
+	if (status < 0) {
+		mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
+		goto finally;
+	}
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
+		jbd2_journal_set_features(osb->journal->j_journal,
+				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+	else
+		jbd2_journal_clear_features(osb->journal->j_journal,
+				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+
+	if (dirty) {
+		/* recover my local alloc if we didn't unmount cleanly. */
+		status = ocfs2_begin_local_alloc_recovery(osb,
+							  osb->slot_num,
+							  &local_alloc);
+		if (status < 0) {
+			mlog_errno(status);
+			goto finally;
+		}
+		/* we complete the recovery process after we've marked
+		 * ourselves as mounted. */
+	}
+
+	status = ocfs2_load_local_alloc(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto finally;
+	}
+
+	if (dirty) {
+		/* Recovery will be completed after we've mounted the
+		 * rest of the volume. */
+		osb->dirty = 1;
+		osb->local_alloc_copy = local_alloc;
+		local_alloc = NULL;
+	}
+
+	/* go through each journal, trylock it and if you get the
+	 * lock, and it's marked as dirty, set the bit in the recover
+	 * map and launch a recovery thread for it. */
+	status = ocfs2_mark_dead_nodes(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto finally;
+	}
+
+	status = ocfs2_compute_replay_slots(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+finally:
+	kfree(local_alloc);
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * The routine gets called from dismount or close whenever a dismount on
+ * volume is requested and the osb open count becomes 1.
+ * It will remove the osb from the global list and also free up all the
+ * initialized resources and fileobject.
+ */
+static void ocfs2_delete_osb(struct ocfs2_super *osb)
+{
+	/* This function assumes that the caller has the main osb resource */
+
+	ocfs2_free_slot_info(osb);
+
+	kfree(osb->osb_orphan_wipes);
+	kfree(osb->slot_recovery_generations);
+	/* FIXME
+	 * This belongs in journal shutdown, but because we have to
+	 * allocate osb->journal at the start of ocfs2_initialize_osb(),
+	 * we free it here.
+	 */
+	kfree(osb->journal);
+	kfree(osb->local_alloc_copy);
+	kfree(osb->uuid_str);
+	kfree(osb->vol_label);
+	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
+	memset(osb, 0, sizeof(struct ocfs2_super));
+}
+
+/* Put OCFS2 into a readonly state, or (if the user specifies it),
+ * panic(). We do not support continue-on-error operation. */
+static void ocfs2_handle_error(struct super_block *sb)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
+		panic("OCFS2: (device %s): panic forced after error\n",
+		      sb->s_id);
+
+	ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+
+	if (sb->s_flags & MS_RDONLY &&
+	    (ocfs2_is_soft_readonly(osb) ||
+	     ocfs2_is_hard_readonly(osb)))
+		return;
+
+	printk(KERN_CRIT "File system is now read-only due to the potential "
+	       "of on-disk corruption. Please run fsck.ocfs2 once the file "
+	       "system is unmounted.\n");
+	sb->s_flags |= MS_RDONLY;
+	ocfs2_set_ro_flag(osb, 0);
+}
+
+void __ocfs2_error(struct super_block *sb, const char *function,
+		  const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	/* Not using mlog here because we want to show the actual
+	 * function the error came from. */
+	printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
+	va_end(args);
+
+	ocfs2_handle_error(sb);
+}
+
+/* Handle critical errors. This is intentionally more drastic than
+ * ocfs2_handle_error, so we only use for things like journal errors,
+ * etc. */
+void __ocfs2_abort(struct super_block *sb, const char *function,
+		   const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
+	va_end(args);
+
+	/* We don't have the cluster support yet to go straight to
+	 * hard readonly in here. Until then, we want to keep
+	 * ocfs2_abort() so that we can at least mark critical
+	 * errors.
+	 *
+	 * TODO: This should abort the journal and alert other nodes
+	 * that our slot needs recovery. */
+
+	/* Force a panic(). This stinks, but it's better than letting
+	 * things continue without having a proper hard readonly
+	 * here. */
+	if (!ocfs2_mount_local(OCFS2_SB(sb)))
+		OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+	ocfs2_handle_error(sb);
+}
+
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+	int rc;
+	sigset_t blocked;
+
+	sigfillset(&blocked);
+	rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
+	BUG_ON(rc);
+}
+
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+	int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
+	BUG_ON(rc);
+}
+
+module_init(ocfs2_init);
+module_exit(ocfs2_exit);
diff --git a/kernel/fs/ocfs2/super.h b/kernel/fs/ocfs2/super.h
new file mode 100644
index 000000000..74ff74cf7
--- /dev/null
+++ b/kernel/fs/ocfs2/super.h
@@ -0,0 +1,53 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * super.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_SUPER_H
+#define OCFS2_SUPER_H
+
+extern struct workqueue_struct *ocfs2_wq;
+
+int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
+				  int node_num);
+
+__printf(3, 4)
+void __ocfs2_error(struct super_block *sb, const char *function,
+		   const char *fmt, ...);
+
+#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+
+__printf(3, 4)
+void __ocfs2_abort(struct super_block *sb, const char *function,
+		   const char *fmt, ...);
+
+#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
+#endif /* OCFS2_SUPER_H */
diff --git a/kernel/fs/ocfs2/symlink.c b/kernel/fs/ocfs2/symlink.c
new file mode 100644
index 000000000..66edce7ec
--- /dev/null
+++ b/kernel/fs/ocfs2/symlink.c
@@ -0,0 +1,100 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ *  linux/cluster/ssi/cfs/symlink.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation; either version 2 of
+ *	the License, or (at your option) any later version.
+ *
+ *	This program is distributed in the hope that it will be useful,
+ *	but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *	MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE
+ *	or NON INFRINGEMENT.  See the GNU General Public License for more
+ *	details.
+ *
+ * 	You should have received a copy of the GNU General Public License
+ * 	along with this program; if not, write to the Free Software
+ * 	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *	Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  Optimization changes Copyright (C) 1994 Florian La Roche
+ *
+ *  Jun 7 1999, cache symlink lookups in the page cache.  -DaveM
+ *
+ *  Portions Copyright (C) 2001 Compaq Computer Corporation
+ *
+ *  ocfs2 symlink handling code.
+ *
+ *  Copyright (C) 2004, 2005 Oracle.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "symlink.h"
+#include "xattr.h"
+
+#include "buffer_head_io.h"
+
+
+static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *bh = NULL;
+	int status = ocfs2_read_inode_block(inode, &bh);
+	struct ocfs2_dinode *fe;
+	const char *link;
+	void *kaddr;
+	size_t len;
+
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	link = (char *) fe->id2.i_symlink;
+	/* will be less than a page size */
+	len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
+	kaddr = kmap_atomic(page);
+	memcpy(kaddr, link, len + 1);
+	kunmap_atomic(kaddr);
+	SetPageUptodate(page);
+	unlock_page(page);
+	brelse(bh);
+	return 0;
+}
+
+const struct address_space_operations ocfs2_fast_symlink_aops = {
+	.readpage		= ocfs2_fast_symlink_readpage,
+};
+
+const struct inode_operations ocfs2_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.getattr	= ocfs2_getattr,
+	.setattr	= ocfs2_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
+	.fiemap		= ocfs2_fiemap,
+};
diff --git a/kernel/fs/ocfs2/symlink.h b/kernel/fs/ocfs2/symlink.h
new file mode 100644
index 000000000..71ee4245e
--- /dev/null
+++ b/kernel/fs/ocfs2/symlink.h
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_SYMLINK_H
+#define OCFS2_SYMLINK_H
+
+extern const struct inode_operations ocfs2_symlink_inode_operations;
+extern const struct address_space_operations ocfs2_fast_symlink_aops;
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int ocfs2_inode_is_fast_symlink(struct inode *inode)
+{
+	return (S_ISLNK(inode->i_mode) &&
+		inode->i_blocks == 0);
+}
+
+
+#endif /* OCFS2_SYMLINK_H */
diff --git a/kernel/fs/ocfs2/sysfile.c b/kernel/fs/ocfs2/sysfile.c
new file mode 100644
index 000000000..af155c183
--- /dev/null
+++ b/kernel/fs/ocfs2/sysfile.c
@@ -0,0 +1,182 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sysfile.c
+ *
+ * Initialize, read, write, etc. system files.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dir.h"
+#include "inode.h"
+#include "journal.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+						   int type,
+						   u32 slot);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
+#endif
+
+static inline int is_global_system_inode(int type)
+{
+	return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
+		type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
+}
+
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+					     int type,
+					     u32 slot)
+{
+	int index;
+	struct inode **local_system_inodes, **free = NULL;
+
+	BUG_ON(slot == OCFS2_INVALID_SLOT);
+	BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+	       type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+	spin_lock(&osb->osb_lock);
+	local_system_inodes = osb->local_system_inodes;
+	spin_unlock(&osb->osb_lock);
+
+	if (unlikely(!local_system_inodes)) {
+		local_system_inodes = kzalloc(sizeof(struct inode *) *
+					      NUM_LOCAL_SYSTEM_INODES *
+					      osb->max_slots,
+					      GFP_NOFS);
+		if (!local_system_inodes) {
+			mlog_errno(-ENOMEM);
+			/*
+			 * return NULL here so that ocfs2_get_sytem_file_inodes
+			 * will try to create an inode and use it. We will try
+			 * to initialize local_system_inodes next time.
+			 */
+			return NULL;
+		}
+
+		spin_lock(&osb->osb_lock);
+		if (osb->local_system_inodes) {
+			/* Someone has initialized it for us. */
+			free = local_system_inodes;
+			local_system_inodes = osb->local_system_inodes;
+		} else
+			osb->local_system_inodes = local_system_inodes;
+		spin_unlock(&osb->osb_lock);
+		kfree(free);
+	}
+
+	index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+		(type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+	return &local_system_inodes[index];
+}
+
+struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+					  int type,
+					  u32 slot)
+{
+	struct inode *inode = NULL;
+	struct inode **arr = NULL;
+
+	/* avoid the lookup if cached in local system file array */
+	if (is_global_system_inode(type)) {
+		arr = &(osb->global_system_inodes[type]);
+	} else
+		arr = get_local_system_inode(osb, type, slot);
+
+	mutex_lock(&osb->system_file_mutex);
+	if (arr && ((inode = *arr) != NULL)) {
+		/* get a ref in addition to the array ref */
+		inode = igrab(inode);
+		mutex_unlock(&osb->system_file_mutex);
+		BUG_ON(!inode);
+
+		return inode;
+	}
+
+	/* this gets one ref thru iget */
+	inode = _ocfs2_get_system_file_inode(osb, type, slot);
+
+	/* add one more if putting into array for first time */
+	if (arr && inode) {
+		*arr = igrab(inode);
+		BUG_ON(!*arr);
+	}
+	mutex_unlock(&osb->system_file_mutex);
+	return inode;
+}
+
+static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+						   int type,
+						   u32 slot)
+{
+	char namebuf[40];
+	struct inode *inode = NULL;
+	u64 blkno;
+	int status = 0;
+
+	ocfs2_sprintf_system_inode_name(namebuf,
+					sizeof(namebuf),
+					type, slot);
+
+	status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+					    strlen(namebuf), &blkno);
+	if (status < 0) {
+		goto bail;
+	}
+
+	inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		inode = NULL;
+		goto bail;
+	}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
+	    type == LOCAL_GROUP_QUOTA_SYSTEM_INODE ||
+	    type == JOURNAL_SYSTEM_INODE) {
+		/* Ignore inode lock on these inodes as the lock does not
+		 * really belong to any process and lockdep cannot handle
+		 * that */
+		OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL;
+	} else {
+		lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres.
+								l_lockdep_map,
+				 ocfs2_system_inodes[type].si_name,
+				 &ocfs2_sysfile_cluster_lock_key[type], 0);
+	}
+#endif
+bail:
+
+	return inode;
+}
+
diff --git a/kernel/fs/ocfs2/sysfile.h b/kernel/fs/ocfs2/sysfile.h
new file mode 100644
index 000000000..cc9ea661f
--- /dev/null
+++ b/kernel/fs/ocfs2/sysfile.h
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sysfile.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_SYSFILE_H
+#define OCFS2_SYSFILE_H
+
+struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+					   int type,
+					   u32 slot);
+
+#endif /* OCFS2_SYSFILE_H */
diff --git a/kernel/fs/ocfs2/uptodate.c b/kernel/fs/ocfs2/uptodate.c
new file mode 100644
index 000000000..82e17b076
--- /dev/null
+++ b/kernel/fs/ocfs2/uptodate.c
@@ -0,0 +1,638 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.c
+ *
+ * Tracking the up-to-date-ness of a local buffer_head with respect to
+ * the cluster.
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Standard buffer head caching flags (uptodate, etc) are insufficient
+ * in a clustered environment - a buffer may be marked up to date on
+ * our local node but could have been modified by another cluster
+ * member. As a result an additional (and performant) caching scheme
+ * is required. A further requirement is that we consume as little
+ * memory as possible - we never pin buffer_head structures in order
+ * to cache them.
+ *
+ * We track the existence of up to date buffers on the inodes which
+ * are associated with them. Because we don't want to pin
+ * buffer_heads, this is only a (strong) hint and several other checks
+ * are made in the I/O path to ensure that we don't use a stale or
+ * invalid buffer without going to disk:
+ *	- buffer_jbd is used liberally - if a bh is in the journal on
+ *	  this node then it *must* be up to date.
+ *	- the standard buffer_uptodate() macro is used to detect buffers
+ *	  which may be invalid (even if we have an up to date tracking
+ * 	  item for them)
+ *
+ * For a full understanding of how this code works together, one
+ * should read the callers in dlmglue.c, the I/O functions in
+ * buffer_head_io.c and ocfs2_journal_access in journal.c
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/buffer_head.h>
+#include <linux/rbtree.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "inode.h"
+#include "uptodate.h"
+#include "ocfs2_trace.h"
+
+struct ocfs2_meta_cache_item {
+	struct rb_node	c_node;
+	sector_t	c_block;
+};
+
+static struct kmem_cache *ocfs2_uptodate_cachep;
+
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	return ci->ci_ops->co_owner(ci);
+}
+
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	return ci->ci_ops->co_get_super(ci);
+}
+
+static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ci->ci_ops->co_cache_lock(ci);
+}
+
+static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ci->ci_ops->co_cache_unlock(ci);
+}
+
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ci->ci_ops->co_io_lock(ci);
+}
+
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ci->ci_ops->co_io_unlock(ci);
+}
+
+
+static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
+				       int clear)
+{
+	ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
+	ci->ci_num_cached = 0;
+
+	if (clear) {
+		ci->ci_created_trans = 0;
+		ci->ci_last_trans = 0;
+	}
+}
+
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+			       const struct ocfs2_caching_operations *ops)
+{
+	BUG_ON(!ops);
+
+	ci->ci_ops = ops;
+	ocfs2_metadata_cache_reset(ci, 1);
+}
+
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
+{
+	ocfs2_metadata_cache_purge(ci);
+	ocfs2_metadata_cache_reset(ci, 1);
+}
+
+
+/* No lock taken here as 'root' is not expected to be visible to other
+ * processes. */
+static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
+{
+	unsigned int purged = 0;
+	struct rb_node *node;
+	struct ocfs2_meta_cache_item *item;
+
+	while ((node = rb_last(root)) != NULL) {
+		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
+
+		trace_ocfs2_purge_copied_metadata_tree(
+					(unsigned long long) item->c_block);
+
+		rb_erase(&item->c_node, root);
+		kmem_cache_free(ocfs2_uptodate_cachep, item);
+
+		purged++;
+	}
+	return purged;
+}
+
+/* Called from locking and called from ocfs2_clear_inode. Dump the
+ * cache for a given inode.
+ *
+ * This function is a few more lines longer than necessary due to some
+ * accounting done here, but I think it's worth tracking down those
+ * bugs sooner -- Mark */
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
+{
+	unsigned int tree, to_purge, purged;
+	struct rb_root root = RB_ROOT;
+
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ocfs2_metadata_cache_lock(ci);
+	tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
+	to_purge = ci->ci_num_cached;
+
+	trace_ocfs2_metadata_cache_purge(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		to_purge, tree);
+
+	/* If we're a tree, save off the root so that we can safely
+	 * initialize the cache. We do the work to free tree members
+	 * without the spinlock. */
+	if (tree)
+		root = ci->ci_cache.ci_tree;
+
+	ocfs2_metadata_cache_reset(ci, 0);
+	ocfs2_metadata_cache_unlock(ci);
+
+	purged = ocfs2_purge_copied_metadata_tree(&root);
+	/* If possible, track the number wiped so that we can more
+	 * easily detect counting errors. Unfortunately, this is only
+	 * meaningful for trees. */
+	if (tree && purged != to_purge)
+		mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
+		     (unsigned long long)ocfs2_metadata_cache_owner(ci),
+		     to_purge, purged);
+}
+
+/* Returns the index in the cache array, -1 if not found.
+ * Requires ip_lock. */
+static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
+				    sector_t item)
+{
+	int i;
+
+	for (i = 0; i < ci->ci_num_cached; i++) {
+		if (item == ci->ci_cache.ci_array[i])
+			return i;
+	}
+
+	return -1;
+}
+
+/* Returns the cache item if found, otherwise NULL.
+ * Requires ip_lock. */
+static struct ocfs2_meta_cache_item *
+ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
+			sector_t block)
+{
+	struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	while (n) {
+		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
+
+		if (block < item->c_block)
+			n = n->rb_left;
+		else if (block > item->c_block)
+			n = n->rb_right;
+		else
+			return item;
+	}
+
+	return NULL;
+}
+
+static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
+			       struct buffer_head *bh)
+{
+	int index = -1;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	ocfs2_metadata_cache_lock(ci);
+
+	trace_ocfs2_buffer_cached_begin(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long) bh->b_blocknr,
+		!!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
+
+	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
+		index = ocfs2_search_cache_array(ci, bh->b_blocknr);
+	else
+		item = ocfs2_search_cache_tree(ci, bh->b_blocknr);
+
+	ocfs2_metadata_cache_unlock(ci);
+
+	trace_ocfs2_buffer_cached_end(index, item);
+
+	return (index != -1) || (item != NULL);
+}
+
+/* Warning: even if it returns true, this does *not* guarantee that
+ * the block is stored in our inode metadata cache.
+ *
+ * This can be called under lock_buffer()
+ */
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
+			  struct buffer_head *bh)
+{
+	/* Doesn't matter if the bh is in our cache or not -- if it's
+	 * not marked uptodate then we know it can't have correct
+	 * data. */
+	if (!buffer_uptodate(bh))
+		return 0;
+
+	/* OCFS2 does not allow multiple nodes to be changing the same
+	 * block at the same time. */
+	if (buffer_jbd(bh))
+		return 1;
+
+	/* Ok, locally the buffer is marked as up to date, now search
+	 * our cache to see if we can trust that. */
+	return ocfs2_buffer_cached(ci, bh);
+}
+
+/*
+ * Determine whether a buffer is currently out on a read-ahead request.
+ * ci_io_sem should be held to serialize submitters with the logic here.
+ */
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh)
+{
+	return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
+}
+
+/* Requires ip_lock */
+static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
+				     sector_t block)
+{
+	BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
+
+	trace_ocfs2_append_cache_array(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, ci->ci_num_cached);
+
+	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
+	ci->ci_num_cached++;
+}
+
+/* By now the caller should have checked that the item does *not*
+ * exist in the tree.
+ * Requires ip_lock. */
+static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
+				      struct ocfs2_meta_cache_item *new)
+{
+	sector_t block = new->c_block;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
+	struct ocfs2_meta_cache_item *tmp;
+
+	trace_ocfs2_insert_cache_tree(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, ci->ci_num_cached);
+
+	while(*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
+
+		if (block < tmp->c_block)
+			p = &(*p)->rb_left;
+		else if (block > tmp->c_block)
+			p = &(*p)->rb_right;
+		else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
+			     (unsigned long long) block);
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->c_node, parent, p);
+	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
+	ci->ci_num_cached++;
+}
+
+/* co_cache_lock() must be held */
+static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
+{
+	return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
+		(ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
+}
+
+/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
+ * pointers in tree after we use them - this allows caller to detect
+ * when to free in case of error.
+ *
+ * The co_cache_lock() must be held. */
+static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
+			       struct ocfs2_meta_cache_item **tree)
+{
+	int i;
+
+	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
+			"Owner %llu, num cached = %u, should be %u\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci),
+			ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
+	mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
+			"Owner %llu not marked as inline anymore!\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci));
+
+	/* Be careful to initialize the tree members *first* because
+	 * once the ci_tree is used, the array is junk... */
+	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
+		tree[i]->c_block = ci->ci_cache.ci_array[i];
+
+	ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
+	ci->ci_cache.ci_tree = RB_ROOT;
+	/* this will be set again by __ocfs2_insert_cache_tree */
+	ci->ci_num_cached = 0;
+
+	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
+		__ocfs2_insert_cache_tree(ci, tree[i]);
+		tree[i] = NULL;
+	}
+
+	trace_ocfs2_expand_cache(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		ci->ci_flags, ci->ci_num_cached);
+}
+
+/* Slow path function - memory allocation is necessary. See the
+ * comment above ocfs2_set_buffer_uptodate for more information. */
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
+					sector_t block,
+					int expand_tree)
+{
+	int i;
+	struct ocfs2_meta_cache_item *new = NULL;
+	struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
+		{ NULL, };
+
+	trace_ocfs2_set_buffer_uptodate(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, expand_tree);
+
+	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
+	if (!new) {
+		mlog_errno(-ENOMEM);
+		return;
+	}
+	new->c_block = block;
+
+	if (expand_tree) {
+		/* Do *not* allocate an array here - the removal code
+		 * has no way of tracking that. */
+		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
+			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
+						   GFP_NOFS);
+			if (!tree[i]) {
+				mlog_errno(-ENOMEM);
+				goto out_free;
+			}
+
+			/* These are initialized in ocfs2_expand_cache! */
+		}
+	}
+
+	ocfs2_metadata_cache_lock(ci);
+	if (ocfs2_insert_can_use_array(ci)) {
+		/* Ok, items were removed from the cache in between
+		 * locks. Detect this and revert back to the fast path */
+		ocfs2_append_cache_array(ci, block);
+		ocfs2_metadata_cache_unlock(ci);
+		goto out_free;
+	}
+
+	if (expand_tree)
+		ocfs2_expand_cache(ci, tree);
+
+	__ocfs2_insert_cache_tree(ci, new);
+	ocfs2_metadata_cache_unlock(ci);
+
+	new = NULL;
+out_free:
+	if (new)
+		kmem_cache_free(ocfs2_uptodate_cachep, new);
+
+	/* If these were used, then ocfs2_expand_cache re-set them to
+	 * NULL for us. */
+	if (tree[0]) {
+		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
+			if (tree[i])
+				kmem_cache_free(ocfs2_uptodate_cachep,
+						tree[i]);
+	}
+}
+
+/* Item insertion is guarded by co_io_lock(), so the insertion path takes
+ * advantage of this by not rechecking for a duplicate insert during
+ * the slow case. Additionally, if the cache needs to be bumped up to
+ * a tree, the code will not recheck after acquiring the lock --
+ * multiple paths cannot be expanding to a tree at the same time.
+ *
+ * The slow path takes into account that items can be removed
+ * (including the whole tree wiped and reset) when this process it out
+ * allocating memory. In those cases, it reverts back to the fast
+ * path.
+ *
+ * Note that this function may actually fail to insert the block if
+ * memory cannot be allocated. This is not fatal however (but may
+ * result in a performance penalty)
+ *
+ * Readahead buffers can be passed in here before the I/O request is
+ * completed.
+ */
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
+			       struct buffer_head *bh)
+{
+	int expand;
+
+	/* The block may very well exist in our cache already, so avoid
+	 * doing any more work in that case. */
+	if (ocfs2_buffer_cached(ci, bh))
+		return;
+
+	trace_ocfs2_set_buffer_uptodate_begin(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)bh->b_blocknr);
+
+	/* No need to recheck under spinlock - insertion is guarded by
+	 * co_io_lock() */
+	ocfs2_metadata_cache_lock(ci);
+	if (ocfs2_insert_can_use_array(ci)) {
+		/* Fast case - it's an array and there's a free
+		 * spot. */
+		ocfs2_append_cache_array(ci, bh->b_blocknr);
+		ocfs2_metadata_cache_unlock(ci);
+		return;
+	}
+
+	expand = 0;
+	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
+		/* We need to bump things up to a tree. */
+		expand = 1;
+	}
+	ocfs2_metadata_cache_unlock(ci);
+
+	__ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
+}
+
+/* Called against a newly allocated buffer. Most likely nobody should
+ * be able to read this sort of metadata while it's still being
+ * allocated, but this is careful to take co_io_lock() anyway. */
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
+				   struct buffer_head *bh)
+{
+	/* This should definitely *not* exist in our cache */
+	BUG_ON(ocfs2_buffer_cached(ci, bh));
+
+	set_buffer_uptodate(bh);
+
+	ocfs2_metadata_cache_io_lock(ci);
+	ocfs2_set_buffer_uptodate(ci, bh);
+	ocfs2_metadata_cache_io_unlock(ci);
+}
+
+/* Requires ip_lock. */
+static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
+					int index)
+{
+	sector_t *array = ci->ci_cache.ci_array;
+	int bytes;
+
+	BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
+	BUG_ON(index >= ci->ci_num_cached);
+	BUG_ON(!ci->ci_num_cached);
+
+	trace_ocfs2_remove_metadata_array(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		index, ci->ci_num_cached);
+
+	ci->ci_num_cached--;
+
+	/* don't need to copy if the array is now empty, or if we
+	 * removed at the tail */
+	if (ci->ci_num_cached && index < ci->ci_num_cached) {
+		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
+		memmove(&array[index], &array[index + 1], bytes);
+	}
+}
+
+/* Requires ip_lock. */
+static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
+				       struct ocfs2_meta_cache_item *item)
+{
+	trace_ocfs2_remove_metadata_tree(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)item->c_block);
+
+	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
+	ci->ci_num_cached--;
+}
+
+static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
+					  sector_t block)
+{
+	int index;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	ocfs2_metadata_cache_lock(ci);
+	trace_ocfs2_remove_block_from_cache(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long) block, ci->ci_num_cached,
+		ci->ci_flags);
+
+	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
+		index = ocfs2_search_cache_array(ci, block);
+		if (index != -1)
+			ocfs2_remove_metadata_array(ci, index);
+	} else {
+		item = ocfs2_search_cache_tree(ci, block);
+		if (item)
+			ocfs2_remove_metadata_tree(ci, item);
+	}
+	ocfs2_metadata_cache_unlock(ci);
+
+	if (item)
+		kmem_cache_free(ocfs2_uptodate_cachep, item);
+}
+
+/*
+ * Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit.
+ */
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
+			     struct buffer_head *bh)
+{
+	sector_t block = bh->b_blocknr;
+
+	ocfs2_remove_block_from_cache(ci, block);
+}
+
+/* Called when we remove xattr clusters from an inode. */
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
+					    sector_t block,
+					    u32 c_len)
+{
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;
+
+	for (i = 0; i < b_len; i++, block++)
+		ocfs2_remove_block_from_cache(ci, block);
+}
+
+int __init init_ocfs2_uptodate_cache(void)
+{
+	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
+				  sizeof(struct ocfs2_meta_cache_item),
+				  0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ocfs2_uptodate_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void exit_ocfs2_uptodate_cache(void)
+{
+	if (ocfs2_uptodate_cachep)
+		kmem_cache_destroy(ocfs2_uptodate_cachep);
+}
diff --git a/kernel/fs/ocfs2/uptodate.h b/kernel/fs/ocfs2/uptodate.h
new file mode 100644
index 000000000..0d826fe2d
--- /dev/null
+++ b/kernel/fs/ocfs2/uptodate.h
@@ -0,0 +1,84 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.h
+ *
+ * Cluster uptodate tracking
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_UPTODATE_H
+#define OCFS2_UPTODATE_H
+
+/*
+ * The caching code relies on locking provided by the user of
+ * struct ocfs2_caching_info.  These operations connect that up.
+ */
+struct ocfs2_caching_operations {
+	/*
+	 * A u64 representing the owning structure.  Usually this
+	 * is the block number (i_blkno or whatnot).  This is used so
+	 * that caching log messages can identify the owning structure.
+	 */
+	u64	(*co_owner)(struct ocfs2_caching_info *ci);
+
+	/* The superblock is needed during I/O. */
+	struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci);
+	/*
+	 * Lock and unlock the caching data.  These will not sleep, and
+	 * should probably be spinlocks.
+	 */
+	void	(*co_cache_lock)(struct ocfs2_caching_info *ci);
+	void	(*co_cache_unlock)(struct ocfs2_caching_info *ci);
+
+	/*
+	 * Lock and unlock for disk I/O.  These will sleep, and should
+	 * be mutexes.
+	 */
+	void	(*co_io_lock)(struct ocfs2_caching_info *ci);
+	void	(*co_io_unlock)(struct ocfs2_caching_info *ci);
+};
+
+int __init init_ocfs2_uptodate_cache(void);
+void exit_ocfs2_uptodate_cache(void);
+
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+			       const struct ocfs2_caching_operations *ops);
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci);
+
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci);
+
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
+			  struct buffer_head *bh);
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
+			       struct buffer_head *bh);
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
+				   struct buffer_head *bh);
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
+			     struct buffer_head *bh);
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
+					    sector_t block,
+					    u32 c_len);
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh);
+
+#endif /* OCFS2_UPTODATE_H */
diff --git a/kernel/fs/ocfs2/xattr.c b/kernel/fs/ocfs2/xattr.c
new file mode 100644
index 000000000..d03bfbf3d
--- /dev/null
+++ b/kernel/fs/ocfs2/xattr.c
@@ -0,0 +1,7425 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/xattr.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/sort.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/security.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "super.h"
+#include "xattr.h"
+#include "refcounttree.h"
+#include "acl.h"
+#include "ocfs2_trace.h"
+
+struct ocfs2_xattr_def_value_root {
+	struct ocfs2_xattr_value_root	xv;
+	struct ocfs2_extent_rec		er;
+};
+
+struct ocfs2_xattr_bucket {
+	/* The inode these xattrs are associated with */
+	struct inode *bu_inode;
+
+	/* The actual buffers that make up the bucket */
+	struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+
+	/* How many blocks make up one bucket for this filesystem */
+	int bu_blocks;
+};
+
+struct ocfs2_xattr_set_ctxt {
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	int set_abort;
+};
+
+#define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
+#define OCFS2_XATTR_INLINE_SIZE	80
+#define OCFS2_XATTR_HEADER_GAP	4
+#define OCFS2_XATTR_FREE_IN_IBODY	(OCFS2_MIN_XATTR_INLINE_SIZE \
+					 - sizeof(struct ocfs2_xattr_header) \
+					 - OCFS2_XATTR_HEADER_GAP)
+#define OCFS2_XATTR_FREE_IN_BLOCK(ptr)	((ptr)->i_sb->s_blocksize \
+					 - sizeof(struct ocfs2_xattr_block) \
+					 - sizeof(struct ocfs2_xattr_header) \
+					 - OCFS2_XATTR_HEADER_GAP)
+
+static struct ocfs2_xattr_def_value_root def_xv = {
+	.xv.xr_list.l_count = cpu_to_le16(1),
+};
+
+const struct xattr_handler *ocfs2_xattr_handlers[] = {
+	&ocfs2_xattr_user_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+	&ocfs2_xattr_trusted_handler,
+	&ocfs2_xattr_security_handler,
+	NULL
+};
+
+static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
+	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+					= &posix_acl_access_xattr_handler,
+	[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+					= &posix_acl_default_xattr_handler,
+	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
+	[OCFS2_XATTR_INDEX_SECURITY]	= &ocfs2_xattr_security_handler,
+};
+
+struct ocfs2_xattr_info {
+	int		xi_name_index;
+	const char	*xi_name;
+	int		xi_name_len;
+	const void	*xi_value;
+	size_t		xi_value_len;
+};
+
+struct ocfs2_xattr_search {
+	struct buffer_head *inode_bh;
+	/*
+	 * xattr_bh point to the block buffer head which has extended attribute
+	 * when extended attribute in inode, xattr_bh is equal to inode_bh.
+	 */
+	struct buffer_head *xattr_bh;
+	struct ocfs2_xattr_header *header;
+	struct ocfs2_xattr_bucket *bucket;
+	void *base;
+	void *end;
+	struct ocfs2_xattr_entry *here;
+	int not_found;
+};
+
+/* Operations on struct ocfs2_xa_entry */
+struct ocfs2_xa_loc;
+struct ocfs2_xa_loc_operations {
+	/*
+	 * Journal functions
+	 */
+	int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
+				  int type);
+	void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
+
+	/*
+	 * Return a pointer to the appropriate buffer in loc->xl_storage
+	 * at the given offset from loc->xl_header.
+	 */
+	void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
+
+	/* Can we reuse the existing entry for the new value? */
+	int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
+			     struct ocfs2_xattr_info *xi);
+
+	/* How much space is needed for the new value? */
+	int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
+			       struct ocfs2_xattr_info *xi);
+
+	/*
+	 * Return the offset of the first name+value pair.  This is
+	 * the start of our downward-filling free space.
+	 */
+	int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
+
+	/*
+	 * Remove the name+value at this location.  Do whatever is
+	 * appropriate with the remaining name+value pairs.
+	 */
+	void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
+
+	/* Fill xl_entry with a new entry */
+	void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
+
+	/* Add name+value storage to an entry */
+	void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
+
+	/*
+	 * Initialize the value buf's access and bh fields for this entry.
+	 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
+	 */
+	void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
+				   struct ocfs2_xattr_value_buf *vb);
+};
+
+/*
+ * Describes an xattr entry location.  This is a memory structure
+ * tracking the on-disk structure.
+ */
+struct ocfs2_xa_loc {
+	/* This xattr belongs to this inode */
+	struct inode *xl_inode;
+
+	/* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
+	struct ocfs2_xattr_header *xl_header;
+
+	/* Bytes from xl_header to the end of the storage */
+	int xl_size;
+
+	/*
+	 * The ocfs2_xattr_entry this location describes.  If this is
+	 * NULL, this location describes the on-disk structure where it
+	 * would have been.
+	 */
+	struct ocfs2_xattr_entry *xl_entry;
+
+	/*
+	 * Internal housekeeping
+	 */
+
+	/* Buffer(s) containing this entry */
+	void *xl_storage;
+
+	/* Operations on the storage backing this location */
+	const struct ocfs2_xa_loc_operations *xl_ops;
+};
+
+/*
+ * Convenience functions to calculate how much space is needed for a
+ * given name+value pair
+ */
+static int namevalue_size(int name_len, uint64_t value_len)
+{
+	if (value_len > OCFS2_XATTR_INLINE_SIZE)
+		return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	else
+		return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
+}
+
+static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
+{
+	return namevalue_size(xi->xi_name_len, xi->xi_value_len);
+}
+
+static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
+{
+	u64 value_len = le64_to_cpu(xe->xe_value_size);
+
+	BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
+	       ocfs2_xattr_is_local(xe));
+	return namevalue_size(xe->xe_name_len, value_len);
+}
+
+
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
+					     struct ocfs2_xattr_header *xh,
+					     int index,
+					     int *block_off,
+					     int *new_offset);
+
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs);
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					struct buffer_head *blk_bh,
+					char *buffer,
+					size_t buffer_size);
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs,
+					  struct ocfs2_xattr_set_ctxt *ctxt);
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs,
+					     struct ocfs2_xattr_set_ctxt *ctxt);
+
+typedef int (xattr_tree_rec_func)(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno, u32 cpos, u32 len, void *para);
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+					   struct buffer_head *root_bh,
+					   xattr_tree_rec_func *rec_func,
+					   void *para);
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					void *para);
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno,
+				  u32 cpos,
+				  u32 len,
+				  void *para);
+
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+				  u64 src_blk, u64 last_blk, u64 to_blk,
+				  unsigned int start_bucket,
+				  u32 *first_hash);
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+					struct ocfs2_dinode *di,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xis,
+					struct ocfs2_xattr_search *xbs,
+					struct ocfs2_refcount_tree **ref_tree,
+					int *meta_need,
+					int *credits);
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+					   struct ocfs2_xattr_bucket *bucket,
+					   int offset,
+					   struct ocfs2_xattr_value_root **xv,
+					   struct buffer_head **bh);
+
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
+#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
+
+static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
+{
+	struct ocfs2_xattr_bucket *bucket;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
+
+	bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
+	if (bucket) {
+		bucket->bu_inode = inode;
+		bucket->bu_blocks = blks;
+	}
+
+	return bucket;
+}
+
+static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
+{
+	int i;
+
+	for (i = 0; i < bucket->bu_blocks; i++) {
+		brelse(bucket->bu_bhs[i]);
+		bucket->bu_bhs[i] = NULL;
+	}
+}
+
+static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
+{
+	if (bucket) {
+		ocfs2_xattr_bucket_relse(bucket);
+		bucket->bu_inode = NULL;
+		kfree(bucket);
+	}
+}
+
+/*
+ * A bucket that has never been written to disk doesn't need to be
+ * read.  We just need the buffer_heads.  Don't call this for
+ * buckets that are already on disk.  ocfs2_read_xattr_bucket() initializes
+ * them fully.
+ */
+static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+				   u64 xb_blkno, int new)
+{
+	int i, rc = 0;
+
+	for (i = 0; i < bucket->bu_blocks; i++) {
+		bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
+					      xb_blkno + i);
+		if (!bucket->bu_bhs[i]) {
+			rc = -ENOMEM;
+			mlog_errno(rc);
+			break;
+		}
+
+		if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+					   bucket->bu_bhs[i])) {
+			if (new)
+				ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+							      bucket->bu_bhs[i]);
+			else {
+				set_buffer_uptodate(bucket->bu_bhs[i]);
+				ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+							  bucket->bu_bhs[i]);
+			}
+		}
+	}
+
+	if (rc)
+		ocfs2_xattr_bucket_relse(bucket);
+	return rc;
+}
+
+/* Read the xattr bucket at xb_blkno */
+static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+				   u64 xb_blkno)
+{
+	int rc;
+
+	rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno,
+			       bucket->bu_blocks, bucket->bu_bhs, 0,
+			       NULL);
+	if (!rc) {
+		spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
+		rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
+						 bucket->bu_bhs,
+						 bucket->bu_blocks,
+						 &bucket_xh(bucket)->xh_check);
+		spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
+		if (rc)
+			mlog_errno(rc);
+	}
+
+	if (rc)
+		ocfs2_xattr_bucket_relse(bucket);
+	return rc;
+}
+
+static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
+					     struct ocfs2_xattr_bucket *bucket,
+					     int type)
+{
+	int i, rc = 0;
+
+	for (i = 0; i < bucket->bu_blocks; i++) {
+		rc = ocfs2_journal_access(handle,
+					  INODE_CACHE(bucket->bu_inode),
+					  bucket->bu_bhs[i], type);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
+					     struct ocfs2_xattr_bucket *bucket)
+{
+	int i;
+
+	spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
+	ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
+				   bucket->bu_bhs, bucket->bu_blocks,
+				   &bucket_xh(bucket)->xh_check);
+	spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
+
+	for (i = 0; i < bucket->bu_blocks; i++)
+		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
+}
+
+static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
+					 struct ocfs2_xattr_bucket *src)
+{
+	int i;
+	int blocksize = src->bu_inode->i_sb->s_blocksize;
+
+	BUG_ON(dest->bu_blocks != src->bu_blocks);
+	BUG_ON(dest->bu_inode != src->bu_inode);
+
+	for (i = 0; i < src->bu_blocks; i++) {
+		memcpy(bucket_block(dest, i), bucket_block(src, i),
+		       blocksize);
+	}
+}
+
+static int ocfs2_validate_xattr_block(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)bh->b_data;
+
+	trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal
+	 */
+
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has bad "
+			    "signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    xb->xb_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has an "
+			    "invalid xb_blkno of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(xb->xb_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has an invalid "
+			    "xb_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(xb->xb_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
+				  struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp,
+			      ocfs2_validate_xattr_block);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+static inline const char *ocfs2_xattr_prefix(int name_index)
+{
+	const struct xattr_handler *handler = NULL;
+
+	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
+		handler = ocfs2_xattr_handler_map[name_index];
+
+	return handler ? handler->prefix : NULL;
+}
+
+static u32 ocfs2_xattr_name_hash(struct inode *inode,
+				 const char *name,
+				 int name_len)
+{
+	/* Get hash value of uuid from super block */
+	u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
+	int i;
+
+	/* hash extended attribute name */
+	for (i = 0; i < name_len; i++) {
+		hash = (hash << OCFS2_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
+		       *name++;
+	}
+
+	return hash;
+}
+
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+{
+	return namevalue_size(name_len, value_len) +
+		sizeof(struct ocfs2_xattr_entry);
+}
+
+static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
+{
+	return namevalue_size_xi(xi) +
+		sizeof(struct ocfs2_xattr_entry);
+}
+
+static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
+{
+	return namevalue_size_xe(xe) +
+		sizeof(struct ocfs2_xattr_entry);
+}
+
+int ocfs2_calc_security_init(struct inode *dir,
+			     struct ocfs2_security_xattr_info *si,
+			     int *want_clusters,
+			     int *xattr_credits,
+			     struct ocfs2_alloc_context **xattr_ac)
+{
+	int ret = 0;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+						 si->value_len);
+
+	/*
+	 * The max space of security xattr taken inline is
+	 * 256(name) + 80(value) + 16(entry) = 352 bytes,
+	 * So reserve one metadata block for it is ok.
+	 */
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    s_size > OCFS2_XATTR_FREE_IN_IBODY) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+
+	/* reserve clusters for xattr value which will be set in B tree*/
+	if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+							    si->value_len);
+
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
+	return ret;
+}
+
+int ocfs2_calc_xattr_init(struct inode *dir,
+			  struct buffer_head *dir_bh,
+			  umode_t mode,
+			  struct ocfs2_security_xattr_info *si,
+			  int *want_clusters,
+			  int *xattr_credits,
+			  int *want_meta)
+{
+	int ret = 0;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
+
+	if (si->enable)
+		s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+						     si->value_len);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+		acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
+					OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+					"", NULL, 0);
+		if (acl_len > 0) {
+			a_size = ocfs2_xattr_entry_real_size(0, acl_len);
+			if (S_ISDIR(mode))
+				a_size <<= 1;
+		} else if (acl_len != 0 && acl_len != -ENODATA) {
+			mlog_errno(ret);
+			return ret;
+		}
+	}
+
+	if (!(s_size + a_size))
+		return ret;
+
+	/*
+	 * The max space of security xattr taken inline is
+	 * 256(name) + 80(value) + 16(entry) = 352 bytes,
+	 * The max space of acl xattr taken inline is
+	 * 80(value) + 16(entry) * 2(if directory) = 192 bytes,
+	 * when blocksize = 512, may reserve one more cluser for
+	 * xattr bucket, otherwise reserve one metadata block
+	 * for them is ok.
+	 * If this is a new directory with inline data,
+	 * we choose to reserve the entire inline area for
+	 * directory contents and force an external xattr block.
+	 */
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
+	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
+		*want_meta = *want_meta + 1;
+		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
+	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
+		*want_clusters += 1;
+		*xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
+	}
+
+	/*
+	 * reserve credits and clusters for xattrs which has large value
+	 * and have to be set outside
+	 */
+	if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+							si->value_len);
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
+	if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+	    acl_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* for directory, it has DEFAULT and ACCESS two types of acls */
+		new_clusters = (S_ISDIR(mode) ? 2 : 1) *
+				ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_extend_allocation(struct inode *inode,
+					 u32 clusters_to_add,
+					 struct ocfs2_xattr_value_buf *vb,
+					 struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int status = 0, credits;
+	handle_t *handle = ctxt->handle;
+	enum ocfs2_alloc_restarted why;
+	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+	while (clusters_to_add) {
+		trace_ocfs2_xattr_extend_allocation(clusters_to_add);
+
+		status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+				       OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			break;
+		}
+
+		prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+		status = ocfs2_add_clusters_in_btree(handle,
+						     &et,
+						     &logical_start,
+						     clusters_to_add,
+						     0,
+						     ctxt->data_ac,
+						     ctxt->meta_ac,
+						     &why);
+		if ((status < 0) && (status != -EAGAIN)) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			break;
+		}
+
+		ocfs2_journal_dirty(handle, vb->vb_bh);
+
+		clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+					 prev_clusters;
+
+		if (why != RESTART_NONE && clusters_to_add) {
+			/*
+			 * We can only fail in case the alloc file doesn't give
+			 * up enough clusters.
+			 */
+			BUG_ON(why == RESTART_META);
+
+			credits = ocfs2_calc_extend_credits(inode->i_sb,
+							    &vb->vb_xv->xr_list);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				break;
+			}
+		}
+	}
+
+	return status;
+}
+
+static int __ocfs2_remove_xattr_range(struct inode *inode,
+				      struct ocfs2_xattr_value_buf *vb,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      unsigned int ext_flags,
+				      struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	handle_t *handle = ctxt->handle;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+	ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+			    OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac,
+				  &ctxt->dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
+	ocfs2_journal_dirty(handle, vb->vb_bh);
+
+	if (ext_flags & OCFS2_EXT_REFCOUNTED)
+		ret = ocfs2_decrease_refcount(inode, handle,
+					ocfs2_blocks_to_clusters(inode->i_sb,
+								 phys_blkno),
+					len, ctxt->meta_ac, &ctxt->dealloc, 1);
+	else
+		ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
+						  phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_shrink_size(struct inode *inode,
+				   u32 old_clusters,
+				   u32 new_clusters,
+				   struct ocfs2_xattr_value_buf *vb,
+				   struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret = 0;
+	unsigned int ext_flags;
+	u32 trunc_len, cpos, phys_cpos, alloc_size;
+	u64 block;
+
+	if (old_clusters <= new_clusters)
+		return 0;
+
+	cpos = new_clusters;
+	trunc_len = old_clusters - new_clusters;
+	while (trunc_len) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
+					       &alloc_size,
+					       &vb->vb_xv->xr_list, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > trunc_len)
+			alloc_size = trunc_len;
+
+		ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
+						 phys_cpos, alloc_size,
+						 ext_flags, ctxt);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+		ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode),
+						       block, alloc_size);
+		cpos += alloc_size;
+		trunc_len -= alloc_size;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_value_truncate(struct inode *inode,
+				      struct ocfs2_xattr_value_buf *vb,
+				      int len,
+				      struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
+	u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+
+	if (new_clusters == old_clusters)
+		return 0;
+
+	if (new_clusters > old_clusters)
+		ret = ocfs2_xattr_extend_allocation(inode,
+						    new_clusters - old_clusters,
+						    vb, ctxt);
+	else
+		ret = ocfs2_xattr_shrink_size(inode,
+					      old_clusters, new_clusters,
+					      vb, ctxt);
+
+	return ret;
+}
+
+static int ocfs2_xattr_list_entry(char *buffer, size_t size,
+				  size_t *result, const char *prefix,
+				  const char *name, int name_len)
+{
+	char *p = buffer + *result;
+	int prefix_len = strlen(prefix);
+	int total_len = prefix_len + name_len + 1;
+
+	*result += total_len;
+
+	/* we are just looking for how big our buffer needs to be */
+	if (!size)
+		return 0;
+
+	if (*result > size)
+		return -ERANGE;
+
+	memcpy(p, prefix, prefix_len);
+	memcpy(p + prefix_len, name, name_len);
+	p[prefix_len + name_len] = '\0';
+
+	return 0;
+}
+
+static int ocfs2_xattr_list_entries(struct inode *inode,
+				    struct ocfs2_xattr_header *header,
+				    char *buffer, size_t buffer_size)
+{
+	size_t result = 0;
+	int i, type, ret;
+	const char *prefix, *name;
+
+	for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+		type = ocfs2_xattr_get_type(entry);
+		prefix = ocfs2_xattr_prefix(type);
+
+		if (prefix) {
+			name = (const char *)header +
+				le16_to_cpu(entry->xe_name_offset);
+
+			ret = ocfs2_xattr_list_entry(buffer, buffer_size,
+						     &result, prefix, name,
+						     entry->xe_name_len);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return result;
+}
+
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+					 struct ocfs2_dinode *di)
+{
+	struct ocfs2_xattr_header *xh;
+	int i;
+
+	xh = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
+		if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
+			return 1;
+
+	return 0;
+}
+
+static int ocfs2_xattr_ibody_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct ocfs2_xattr_header *header = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int ret = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return ret;
+
+	header = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+
+	return ret;
+}
+
+static int ocfs2_xattr_block_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+		ret = ocfs2_xattr_list_entries(inode, header,
+					       buffer, buffer_size);
+	} else
+		ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
+						   buffer, buffer_size);
+
+	brelse(blk_bh);
+
+	return ret;
+}
+
+ssize_t ocfs2_listxattr(struct dentry *dentry,
+			char *buffer,
+			size_t size)
+{
+	int ret = 0, i_ret = 0, b_ret = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(d_inode(dentry));
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
+		return -EOPNOTSUPP;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return ret;
+
+	ret = ocfs2_inode_lock(d_inode(dentry), &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_read(&oi->ip_xattr_sem);
+	i_ret = ocfs2_xattr_ibody_list(d_inode(dentry), di, buffer, size);
+	if (i_ret < 0)
+		b_ret = 0;
+	else {
+		if (buffer) {
+			buffer += i_ret;
+			size -= i_ret;
+		}
+		b_ret = ocfs2_xattr_block_list(d_inode(dentry), di,
+					       buffer, size);
+		if (b_ret < 0)
+			i_ret = 0;
+	}
+	up_read(&oi->ip_xattr_sem);
+	ocfs2_inode_unlock(d_inode(dentry), 0);
+
+	brelse(di_bh);
+
+	return i_ret + b_ret;
+}
+
+static int ocfs2_xattr_find_entry(int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_entry *entry;
+	size_t name_len;
+	int i, cmp = 1;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	entry = xs->here;
+	for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+		cmp = name_index - ocfs2_xattr_get_type(entry);
+		if (!cmp)
+			cmp = name_len - entry->xe_name_len;
+		if (!cmp)
+			cmp = memcmp(name, (xs->base +
+				     le16_to_cpu(entry->xe_name_offset)),
+				     name_len);
+		if (cmp == 0)
+			break;
+		entry += 1;
+	}
+	xs->here = entry;
+
+	return cmp ? -ENODATA : 0;
+}
+
+static int ocfs2_xattr_get_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_value_root *xv,
+					 void *buffer,
+					 size_t len)
+{
+	u32 cpos, p_cluster, num_clusters, bpc, clusters;
+	u64 blkno;
+	int i, ret = 0;
+	size_t cplen, blocksize;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_extent_list *el;
+
+	el = &xv->xr_list;
+	clusters = le32_to_cpu(xv->xr_clusters);
+	bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	blocksize = inode->i_sb->s_blocksize;
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, el, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		/* Copy ocfs2_xattr_value */
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
+					       &bh, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			cplen = len >= blocksize ? blocksize : len;
+			memcpy(buffer, bh->b_data, cplen);
+			len -= cplen;
+			buffer += cplen;
+
+			brelse(bh);
+			bh = NULL;
+			if (len == 0)
+				break;
+		}
+		cpos += num_clusters;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_ibody_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct ocfs2_xattr_value_root *xv;
+	size_t size;
+	int ret = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return -ENODATA;
+
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - le16_to_cpu(di->i_xattr_inline_size));
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (ret)
+		return ret;
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		if (size > buffer_size)
+			return -ERANGE;
+		if (ocfs2_xattr_is_local(xs->here)) {
+			memcpy(buffer, (void *)xs->base +
+			       le16_to_cpu(xs->here->xe_name_offset) +
+			       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+		} else {
+			xv = (struct ocfs2_xattr_value_root *)
+				(xs->base + le16_to_cpu(
+				 xs->here->xe_name_offset) +
+				OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+			ret = ocfs2_xattr_get_value_outside(inode, xv,
+							    buffer, size);
+			if (ret < 0) {
+				mlog_errno(ret);
+				return ret;
+			}
+		}
+	}
+
+	return size;
+}
+
+static int ocfs2_xattr_block_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_xattr_value_root *xv;
+	size_t size;
+	int ret = -ENODATA, name_offset, name_len, i;
+	int uninitialized_var(block_off);
+
+	xs->bucket = ocfs2_xattr_bucket_new(inode);
+	if (!xs->bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto cleanup;
+	}
+
+	ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
+	if (ret) {
+		mlog_errno(ret);
+		goto cleanup;
+	}
+
+	if (xs->not_found) {
+		ret = -ENODATA;
+		goto cleanup;
+	}
+
+	xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		ret = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+
+		name_offset = le16_to_cpu(xs->here->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
+		i = xs->here - xs->header->xh_entries;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+								bucket_xh(xs->bucket),
+								i,
+								&block_off,
+								&name_offset);
+			if (ret) {
+				mlog_errno(ret);
+				goto cleanup;
+			}
+			xs->base = bucket_block(xs->bucket, block_off);
+		}
+		if (ocfs2_xattr_is_local(xs->here)) {
+			memcpy(buffer, (void *)xs->base +
+			       name_offset + name_len, size);
+		} else {
+			xv = (struct ocfs2_xattr_value_root *)
+				(xs->base + name_offset + name_len);
+			ret = ocfs2_xattr_get_value_outside(inode, xv,
+							    buffer, size);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto cleanup;
+			}
+		}
+	}
+	ret = size;
+cleanup:
+	ocfs2_xattr_bucket_free(xs->bucket);
+
+	brelse(xs->xattr_bh);
+	xs->xattr_bh = NULL;
+	return ret;
+}
+
+int ocfs2_xattr_get_nolock(struct inode *inode,
+			   struct buffer_head *di_bh,
+			   int name_index,
+			   const char *name,
+			   void *buffer,
+			   size_t buffer_size)
+{
+	int ret;
+	struct ocfs2_dinode *di = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return -ENODATA;
+
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
+				    buffer_size, &xis);
+	if (ret == -ENODATA && di->i_xattr_loc)
+		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
+					    buffer_size, &xbs);
+
+	return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+static int ocfs2_xattr_get(struct inode *inode,
+			   int name_index,
+			   const char *name,
+			   void *buffer,
+			   size_t buffer_size)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	down_read(&OCFS2_I(inode)->ip_xattr_sem);
+	ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+				     name, buffer, buffer_size);
+	up_read(&OCFS2_I(inode)->ip_xattr_sem);
+
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return ret;
+}
+
+static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+					   handle_t *handle,
+					   struct ocfs2_xattr_value_buf *vb,
+					   const void *value,
+					   int value_len)
+{
+	int ret = 0, i, cp_len;
+	u16 blocksize = inode->i_sb->s_blocksize;
+	u32 p_cluster, num_clusters;
+	u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
+	u64 blkno;
+	struct buffer_head *bh = NULL;
+	unsigned int ext_flags;
+	struct ocfs2_xattr_value_root *xv = vb->vb_xv;
+
+	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
+
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list,
+					       &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
+					       &bh, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_journal_access(handle,
+						   INODE_CACHE(inode),
+						   bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			cp_len = value_len > blocksize ? blocksize : value_len;
+			memcpy(bh->b_data, value, cp_len);
+			value_len -= cp_len;
+			value += cp_len;
+			if (cp_len < blocksize)
+				memset(bh->b_data + cp_len, 0,
+				       blocksize - cp_len);
+
+			ocfs2_journal_dirty(handle, bh);
+			brelse(bh);
+			bh = NULL;
+
+			/*
+			 * XXX: do we need to empty all the following
+			 * blocks in this cluster?
+			 */
+			if (!value_len)
+				break;
+		}
+		cpos += num_clusters;
+	}
+out:
+	brelse(bh);
+
+	return ret;
+}
+
+static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
+				       int num_entries)
+{
+	int free_space;
+
+	if (!needed_space)
+		return 0;
+
+	free_space = free_start -
+		sizeof(struct ocfs2_xattr_header) -
+		(num_entries * sizeof(struct ocfs2_xattr_entry)) -
+		OCFS2_XATTR_HEADER_GAP;
+	if (free_space < 0)
+		return -EIO;
+	if (free_space < needed_space)
+		return -ENOSPC;
+
+	return 0;
+}
+
+static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
+				   int type)
+{
+	return loc->xl_ops->xlo_journal_access(handle, loc, type);
+}
+
+static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
+{
+	loc->xl_ops->xlo_journal_dirty(handle, loc);
+}
+
+/* Give a pointer into the storage for the given offset */
+static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
+{
+	BUG_ON(offset >= loc->xl_size);
+	return loc->xl_ops->xlo_offset_pointer(loc, offset);
+}
+
+/*
+ * Wipe the name+value pair and allow the storage to reclaim it.  This
+ * must be followed by either removal of the entry or a call to
+ * ocfs2_xa_add_namevalue().
+ */
+static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	loc->xl_ops->xlo_wipe_namevalue(loc);
+}
+
+/*
+ * Find lowest offset to a name+value pair.  This is the start of our
+ * downward-growing free space.
+ */
+static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
+{
+	return loc->xl_ops->xlo_get_free_start(loc);
+}
+
+/* Can we reuse loc->xl_entry for xi? */
+static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_info *xi)
+{
+	return loc->xl_ops->xlo_can_reuse(loc, xi);
+}
+
+/* How much free space is needed to set the new value */
+static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi)
+{
+	return loc->xl_ops->xlo_check_space(loc, xi);
+}
+
+static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	loc->xl_ops->xlo_add_entry(loc, name_hash);
+	loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
+	/*
+	 * We can't leave the new entry's xe_name_offset at zero or
+	 * add_namevalue() will go nuts.  We set it to the size of our
+	 * storage so that it can never be less than any other entry.
+	 */
+	loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
+}
+
+static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
+				   struct ocfs2_xattr_info *xi)
+{
+	int size = namevalue_size_xi(xi);
+	int nameval_offset;
+	char *nameval_buf;
+
+	loc->xl_ops->xlo_add_namevalue(loc, size);
+	loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+	loc->xl_entry->xe_name_len = xi->xi_name_len;
+	ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
+	ocfs2_xattr_set_local(loc->xl_entry,
+			      xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
+
+	nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+	memset(nameval_buf, 0, size);
+	memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
+}
+
+static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_value_buf *vb)
+{
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+
+	/* Value bufs are for value trees */
+	BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
+	BUG_ON(namevalue_size_xe(loc->xl_entry) !=
+	       (name_size + OCFS2_XATTR_ROOT_SIZE));
+
+	loc->xl_ops->xlo_fill_value_buf(loc, vb);
+	vb->vb_xv =
+		(struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
+							nameval_offset +
+							name_size);
+}
+
+static int ocfs2_xa_block_journal_access(handle_t *handle,
+					 struct ocfs2_xa_loc *loc, int type)
+{
+	struct buffer_head *bh = loc->xl_storage;
+	ocfs2_journal_access_func access;
+
+	if (loc->xl_size == (bh->b_size -
+			     offsetof(struct ocfs2_xattr_block,
+				      xb_attrs.xb_header)))
+		access = ocfs2_journal_access_xb;
+	else
+		access = ocfs2_journal_access_di;
+	return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
+}
+
+static void ocfs2_xa_block_journal_dirty(handle_t *handle,
+					 struct ocfs2_xa_loc *loc)
+{
+	struct buffer_head *bh = loc->xl_storage;
+
+	ocfs2_journal_dirty(handle, bh);
+}
+
+static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
+					   int offset)
+{
+	return (char *)loc->xl_header + offset;
+}
+
+static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_info *xi)
+{
+	/*
+	 * Block storage is strict.  If the sizes aren't exact, we will
+	 * remove the old one and reinsert the new.
+	 */
+	return namevalue_size_xe(loc->xl_entry) ==
+		namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	int i, count = le16_to_cpu(xh->xh_count);
+	int offset, free_start = loc->xl_size;
+
+	for (i = 0; i < count; i++) {
+		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+		if (offset < free_start)
+			free_start = offset;
+	}
+
+	return free_start;
+}
+
+static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
+				      struct ocfs2_xattr_info *xi)
+{
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	int free_start = ocfs2_xa_get_free_start(loc);
+	int needed_space = ocfs2_xi_entry_usage(xi);
+
+	/*
+	 * Block storage will reclaim the original entry before inserting
+	 * the new value, so we only need the difference.  If the new
+	 * entry is smaller than the old one, we don't need anything.
+	 */
+	if (loc->xl_entry) {
+		/* Don't need space if we're reusing! */
+		if (ocfs2_xa_can_reuse_entry(loc, xi))
+			needed_space = 0;
+		else
+			needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
+	}
+	if (needed_space < 0)
+		needed_space = 0;
+	return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
+
+/*
+ * Block storage for xattrs keeps the name+value pairs compacted.  When
+ * we remove one, we have to shift any that preceded it towards the end.
+ */
+static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	int i, offset;
+	int namevalue_offset, first_namevalue_offset, namevalue_size;
+	struct ocfs2_xattr_entry *entry = loc->xl_entry;
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	int count = le16_to_cpu(xh->xh_count);
+
+	namevalue_offset = le16_to_cpu(entry->xe_name_offset);
+	namevalue_size = namevalue_size_xe(entry);
+	first_namevalue_offset = ocfs2_xa_get_free_start(loc);
+
+	/* Shift the name+value pairs */
+	memmove((char *)xh + first_namevalue_offset + namevalue_size,
+		(char *)xh + first_namevalue_offset,
+		namevalue_offset - first_namevalue_offset);
+	memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
+
+	/* Now tell xh->xh_entries about it */
+	for (i = 0; i < count; i++) {
+		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+		if (offset <= namevalue_offset)
+			le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
+				     namevalue_size);
+	}
+
+	/*
+	 * Note that we don't update xh_free_start or xh_name_value_len
+	 * because they're not used in block-stored xattrs.
+	 */
+}
+
+static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	loc->xl_entry = &(loc->xl_header->xh_entries[count]);
+	le16_add_cpu(&loc->xl_header->xh_count, 1);
+	memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+	int free_start = ocfs2_xa_get_free_start(loc);
+
+	loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
+}
+
+static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
+					  struct ocfs2_xattr_value_buf *vb)
+{
+	struct buffer_head *bh = loc->xl_storage;
+
+	if (loc->xl_size == (bh->b_size -
+			     offsetof(struct ocfs2_xattr_block,
+				      xb_attrs.xb_header)))
+		vb->vb_access = ocfs2_journal_access_xb;
+	else
+		vb->vb_access = ocfs2_journal_access_di;
+	vb->vb_bh = bh;
+}
+
+/*
+ * Operations for xattrs stored in blocks.  This includes inline inode
+ * storage and unindexed ocfs2_xattr_blocks.
+ */
+static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
+	.xlo_journal_access	= ocfs2_xa_block_journal_access,
+	.xlo_journal_dirty	= ocfs2_xa_block_journal_dirty,
+	.xlo_offset_pointer	= ocfs2_xa_block_offset_pointer,
+	.xlo_check_space	= ocfs2_xa_block_check_space,
+	.xlo_can_reuse		= ocfs2_xa_block_can_reuse,
+	.xlo_get_free_start	= ocfs2_xa_block_get_free_start,
+	.xlo_wipe_namevalue	= ocfs2_xa_block_wipe_namevalue,
+	.xlo_add_entry		= ocfs2_xa_block_add_entry,
+	.xlo_add_namevalue	= ocfs2_xa_block_add_namevalue,
+	.xlo_fill_value_buf	= ocfs2_xa_block_fill_value_buf,
+};
+
+static int ocfs2_xa_bucket_journal_access(handle_t *handle,
+					  struct ocfs2_xa_loc *loc, int type)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+
+	return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
+}
+
+static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
+					  struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+}
+
+static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
+					    int offset)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	int block, block_offset;
+
+	/* The header is at the front of the bucket */
+	block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
+	block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
+
+	return bucket_block(bucket, block) + block_offset;
+}
+
+static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
+				     struct ocfs2_xattr_info *xi)
+{
+	return namevalue_size_xe(loc->xl_entry) >=
+		namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
+}
+
+static int ocfs2_bucket_align_free_start(struct super_block *sb,
+					 int free_start, int size)
+{
+	/*
+	 * We need to make sure that the name+value pair fits within
+	 * one block.
+	 */
+	if (((free_start - size) >> sb->s_blocksize_bits) !=
+	    ((free_start - 1) >> sb->s_blocksize_bits))
+		free_start -= free_start % sb->s_blocksize;
+
+	return free_start;
+}
+
+static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
+				       struct ocfs2_xattr_info *xi)
+{
+	int rc;
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	int free_start = ocfs2_xa_get_free_start(loc);
+	int needed_space = ocfs2_xi_entry_usage(xi);
+	int size = namevalue_size_xi(xi);
+	struct super_block *sb = loc->xl_inode->i_sb;
+
+	/*
+	 * Bucket storage does not reclaim name+value pairs it cannot
+	 * reuse.  They live as holes until the bucket fills, and then
+	 * the bucket is defragmented.  However, the bucket can reclaim
+	 * the ocfs2_xattr_entry.
+	 */
+	if (loc->xl_entry) {
+		/* Don't need space if we're reusing! */
+		if (ocfs2_xa_can_reuse_entry(loc, xi))
+			needed_space = 0;
+		else
+			needed_space -= sizeof(struct ocfs2_xattr_entry);
+	}
+	BUG_ON(needed_space < 0);
+
+	if (free_start < size) {
+		if (needed_space)
+			return -ENOSPC;
+	} else {
+		/*
+		 * First we check if it would fit in the first place.
+		 * Below, we align the free start to a block.  This may
+		 * slide us below the minimum gap.  By checking unaligned
+		 * first, we avoid that error.
+		 */
+		rc = ocfs2_xa_check_space_helper(needed_space, free_start,
+						 count);
+		if (rc)
+			return rc;
+		free_start = ocfs2_bucket_align_free_start(sb, free_start,
+							   size);
+	}
+	return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
+
+static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	le16_add_cpu(&loc->xl_header->xh_name_value_len,
+		     -namevalue_size_xe(loc->xl_entry));
+}
+
+static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	int count = le16_to_cpu(xh->xh_count);
+	int low = 0, high = count - 1, tmp;
+	struct ocfs2_xattr_entry *tmp_xe;
+
+	/*
+	 * We keep buckets sorted by name_hash, so we need to find
+	 * our insert place.
+	 */
+	while (low <= high && count) {
+		tmp = (low + high) / 2;
+		tmp_xe = &xh->xh_entries[tmp];
+
+		if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+			low = tmp + 1;
+		else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
+			high = tmp - 1;
+		else {
+			low = tmp;
+			break;
+		}
+	}
+
+	if (low != count)
+		memmove(&xh->xh_entries[low + 1],
+			&xh->xh_entries[low],
+			((count - low) * sizeof(struct ocfs2_xattr_entry)));
+
+	le16_add_cpu(&xh->xh_count, 1);
+	loc->xl_entry = &xh->xh_entries[low];
+	memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+	int free_start = ocfs2_xa_get_free_start(loc);
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	struct super_block *sb = loc->xl_inode->i_sb;
+	int nameval_offset;
+
+	free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
+	nameval_offset = free_start - size;
+	loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
+	xh->xh_free_start = cpu_to_le16(nameval_offset);
+	le16_add_cpu(&xh->xh_name_value_len, size);
+
+}
+
+static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
+					   struct ocfs2_xattr_value_buf *vb)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	struct super_block *sb = loc->xl_inode->i_sb;
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int size = namevalue_size_xe(loc->xl_entry);
+	int block_offset = nameval_offset >> sb->s_blocksize_bits;
+
+	/* Values are not allowed to straddle block boundaries */
+	BUG_ON(block_offset !=
+	       ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
+	/* We expect the bucket to be filled in */
+	BUG_ON(!bucket->bu_bhs[block_offset]);
+
+	vb->vb_access = ocfs2_journal_access;
+	vb->vb_bh = bucket->bu_bhs[block_offset];
+}
+
+/* Operations for xattrs stored in buckets. */
+static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
+	.xlo_journal_access	= ocfs2_xa_bucket_journal_access,
+	.xlo_journal_dirty	= ocfs2_xa_bucket_journal_dirty,
+	.xlo_offset_pointer	= ocfs2_xa_bucket_offset_pointer,
+	.xlo_check_space	= ocfs2_xa_bucket_check_space,
+	.xlo_can_reuse		= ocfs2_xa_bucket_can_reuse,
+	.xlo_get_free_start	= ocfs2_xa_bucket_get_free_start,
+	.xlo_wipe_namevalue	= ocfs2_xa_bucket_wipe_namevalue,
+	.xlo_add_entry		= ocfs2_xa_bucket_add_entry,
+	.xlo_add_namevalue	= ocfs2_xa_bucket_add_namevalue,
+	.xlo_fill_value_buf	= ocfs2_xa_bucket_fill_value_buf,
+};
+
+static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_value_buf vb;
+
+	if (ocfs2_xattr_is_local(loc->xl_entry))
+		return 0;
+
+	ocfs2_xa_fill_value_buf(loc, &vb);
+	return le32_to_cpu(vb.vb_xv->xr_clusters);
+}
+
+static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
+				   struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int trunc_rc, access_rc;
+	struct ocfs2_xattr_value_buf vb;
+
+	ocfs2_xa_fill_value_buf(loc, &vb);
+	trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
+					      ctxt);
+
+	/*
+	 * The caller of ocfs2_xa_value_truncate() has already called
+	 * ocfs2_xa_journal_access on the loc.  However, The truncate code
+	 * calls ocfs2_extend_trans().  This may commit the previous
+	 * transaction and open a new one.  If this is a bucket, truncate
+	 * could leave only vb->vb_bh set up for journaling.  Meanwhile,
+	 * the caller is expecting to dirty the entire bucket.  So we must
+	 * reset the journal work.  We do this even if truncate has failed,
+	 * as it could have failed after committing the extend.
+	 */
+	access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
+					    OCFS2_JOURNAL_ACCESS_WRITE);
+
+	/* Errors in truncate take precedence */
+	return trunc_rc ? trunc_rc : access_rc;
+}
+
+static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
+{
+	int index, count;
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	struct ocfs2_xattr_entry *entry = loc->xl_entry;
+
+	ocfs2_xa_wipe_namevalue(loc);
+	loc->xl_entry = NULL;
+
+	le16_add_cpu(&xh->xh_count, -1);
+	count = le16_to_cpu(xh->xh_count);
+
+	/*
+	 * Only zero out the entry if there are more remaining.  This is
+	 * important for an empty bucket, as it keeps track of the
+	 * bucket's hash value.  It doesn't hurt empty block storage.
+	 */
+	if (count) {
+		index = ((char *)entry - (char *)&xh->xh_entries) /
+			sizeof(struct ocfs2_xattr_entry);
+		memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
+			(count - index) * sizeof(struct ocfs2_xattr_entry));
+		memset(&xh->xh_entries[count], 0,
+		       sizeof(struct ocfs2_xattr_entry));
+	}
+}
+
+/*
+ * If we have a problem adjusting the size of an external value during
+ * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
+ * in an intermediate state.  For example, the value may be partially
+ * truncated.
+ *
+ * If the value tree hasn't changed, the extend/truncate went nowhere.
+ * We have nothing to do.  The caller can treat it as a straight error.
+ *
+ * If the value tree got partially truncated, we now have a corrupted
+ * extended attribute.  We're going to wipe its entry and leak the
+ * clusters.  Better to leak some storage than leave a corrupt entry.
+ *
+ * If the value tree grew, it obviously didn't grow enough for the
+ * new entry.  We're not going to try and reclaim those clusters either.
+ * If there was already an external value there (orig_clusters != 0),
+ * the new clusters are attached safely and we can just leave the old
+ * value in place.  If there was no external value there, we remove
+ * the entry.
+ *
+ * This way, the xattr block we store in the journal will be consistent.
+ * If the size change broke because of the journal, no changes will hit
+ * disk anyway.
+ */
+static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
+					    const char *what,
+					    unsigned int orig_clusters)
+{
+	unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
+	char *nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+
+	if (new_clusters < orig_clusters) {
+		mlog(ML_ERROR,
+		     "Partial truncate while %s xattr %.*s.  Leaking "
+		     "%u clusters and removing the entry\n",
+		     what, loc->xl_entry->xe_name_len, nameval_buf,
+		     orig_clusters - new_clusters);
+		ocfs2_xa_remove_entry(loc);
+	} else if (!orig_clusters) {
+		mlog(ML_ERROR,
+		     "Unable to allocate an external value for xattr "
+		     "%.*s safely.  Leaking %u clusters and removing the "
+		     "entry\n",
+		     loc->xl_entry->xe_name_len, nameval_buf,
+		     new_clusters - orig_clusters);
+		ocfs2_xa_remove_entry(loc);
+	} else if (new_clusters > orig_clusters)
+		mlog(ML_ERROR,
+		     "Unable to grow xattr %.*s safely.  %u new clusters "
+		     "have been added, but the value will not be "
+		     "modified\n",
+		     loc->xl_entry->xe_name_len, nameval_buf,
+		     new_clusters - orig_clusters);
+}
+
+static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
+			   struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	unsigned int orig_clusters;
+
+	if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
+		rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+		if (rc) {
+			mlog_errno(rc);
+			/*
+			 * Since this is remove, we can return 0 if
+			 * ocfs2_xa_cleanup_value_truncate() is going to
+			 * wipe the entry anyway.  So we check the
+			 * cluster count as well.
+			 */
+			if (orig_clusters != ocfs2_xa_value_clusters(loc))
+				rc = 0;
+			ocfs2_xa_cleanup_value_truncate(loc, "removing",
+							orig_clusters);
+			if (rc)
+				goto out;
+		}
+	}
+
+	ocfs2_xa_remove_entry(loc);
+
+out:
+	return rc;
+}
+
+static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
+{
+	int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+	char *nameval_buf;
+
+	nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+	memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
+}
+
+/*
+ * Take an existing entry and make it ready for the new value.  This
+ * won't allocate space, but it may free space.  It should be ready for
+ * ocfs2_xa_prepare_entry() to finish the work.
+ */
+static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi,
+				struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	unsigned int orig_clusters;
+	char *nameval_buf;
+	int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
+	int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
+
+	BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
+	       name_size);
+
+	nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+	if (xe_local) {
+		memset(nameval_buf + name_size, 0,
+		       namevalue_size_xe(loc->xl_entry) - name_size);
+		if (!xi_local)
+			ocfs2_xa_install_value_root(loc);
+	} else {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
+		if (xi_local) {
+			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+			if (rc < 0)
+				mlog_errno(rc);
+			else
+				memset(nameval_buf + name_size, 0,
+				       namevalue_size_xe(loc->xl_entry) -
+				       name_size);
+		} else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
+			   xi->xi_value_len) {
+			rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
+						     ctxt);
+			if (rc < 0)
+				mlog_errno(rc);
+		}
+
+		if (rc) {
+			ocfs2_xa_cleanup_value_truncate(loc, "reusing",
+							orig_clusters);
+			goto out;
+		}
+	}
+
+	loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+	ocfs2_xattr_set_local(loc->xl_entry, xi_local);
+
+out:
+	return rc;
+}
+
+/*
+ * Prepares loc->xl_entry to receive the new xattr.  This includes
+ * properly setting up the name+value pair region.  If loc->xl_entry
+ * already exists, it will take care of modifying it appropriately.
+ *
+ * Note that this modifies the data.  You did journal_access already,
+ * right?
+ */
+static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
+				  struct ocfs2_xattr_info *xi,
+				  u32 name_hash,
+				  struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	unsigned int orig_clusters;
+	__le64 orig_value_size = 0;
+
+	rc = ocfs2_xa_check_space(loc, xi);
+	if (rc)
+		goto out;
+
+	if (loc->xl_entry) {
+		if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+			orig_value_size = loc->xl_entry->xe_value_size;
+			rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+			if (rc)
+				goto out;
+			goto alloc_value;
+		}
+
+		if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+			orig_clusters = ocfs2_xa_value_clusters(loc);
+			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+			if (rc) {
+				mlog_errno(rc);
+				ocfs2_xa_cleanup_value_truncate(loc,
+								"overwriting",
+								orig_clusters);
+				goto out;
+			}
+		}
+		ocfs2_xa_wipe_namevalue(loc);
+	} else
+		ocfs2_xa_add_entry(loc, name_hash);
+
+	/*
+	 * If we get here, we have a blank entry.  Fill it.  We grow our
+	 * name+value pair back from the end.
+	 */
+	ocfs2_xa_add_namevalue(loc, xi);
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
+		ocfs2_xa_install_value_root(loc);
+
+alloc_value:
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
+		rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
+		if (rc < 0) {
+			ctxt->set_abort = 1;
+			ocfs2_xa_cleanup_value_truncate(loc, "growing",
+							orig_clusters);
+			/*
+			 * If we were growing an existing value,
+			 * ocfs2_xa_cleanup_value_truncate() won't remove
+			 * the entry. We need to restore the original value
+			 * size.
+			 */
+			if (loc->xl_entry) {
+				BUG_ON(!orig_value_size);
+				loc->xl_entry->xe_value_size = orig_value_size;
+			}
+			mlog_errno(rc);
+		}
+	}
+
+out:
+	return rc;
+}
+
+/*
+ * Store the value portion of the name+value pair.  This will skip
+ * values that are stored externally.  Their tree roots were set up
+ * by ocfs2_xa_prepare_entry().
+ */
+static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi,
+				struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	char *nameval_buf;
+	struct ocfs2_xattr_value_buf vb;
+
+	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		ocfs2_xa_fill_value_buf(loc, &vb);
+		rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
+						     ctxt->handle, &vb,
+						     xi->xi_value,
+						     xi->xi_value_len);
+	} else
+		memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
+
+	return rc;
+}
+
+static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
+			struct ocfs2_xattr_info *xi,
+			struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
+					      xi->xi_name_len);
+
+	ret = ocfs2_xa_journal_access(ctxt->handle, loc,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * From here on out, everything is going to modify the buffer a
+	 * little.  Errors are going to leave the xattr header in a
+	 * sane state.  Thus, even with errors we dirty the sucker.
+	 */
+
+	/* Don't worry, we are never called with !xi_value and !xl_entry */
+	if (!xi->xi_value) {
+		ret = ocfs2_xa_remove(loc, ctxt);
+		goto out_dirty;
+	}
+
+	ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out_dirty;
+	}
+
+	ret = ocfs2_xa_store_value(loc, xi, ctxt);
+	if (ret)
+		mlog_errno(ret);
+
+out_dirty:
+	ocfs2_xa_journal_dirty(ctxt->handle, loc);
+
+out:
+	return ret;
+}
+
+static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
+				     struct inode *inode,
+				     struct buffer_head *bh,
+				     struct ocfs2_xattr_entry *entry)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
+
+	loc->xl_inode = inode;
+	loc->xl_ops = &ocfs2_xa_block_loc_ops;
+	loc->xl_storage = bh;
+	loc->xl_entry = entry;
+	loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
+	loc->xl_header =
+		(struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
+					      loc->xl_size);
+}
+
+static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
+					  struct inode *inode,
+					  struct buffer_head *bh,
+					  struct ocfs2_xattr_entry *entry)
+{
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)bh->b_data;
+
+	BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
+
+	loc->xl_inode = inode;
+	loc->xl_ops = &ocfs2_xa_block_loc_ops;
+	loc->xl_storage = bh;
+	loc->xl_header = &(xb->xb_attrs.xb_header);
+	loc->xl_entry = entry;
+	loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
+					     xb_attrs.xb_header);
+}
+
+static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
+					   struct ocfs2_xattr_bucket *bucket,
+					   struct ocfs2_xattr_entry *entry)
+{
+	loc->xl_inode = bucket->bu_inode;
+	loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
+	loc->xl_storage = bucket;
+	loc->xl_header = bucket_xh(bucket);
+	loc->xl_entry = entry;
+	loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
+}
+
+/*
+ * In xattr remove, if it is stored outside and refcounted, we may have
+ * the chance to split the refcount tree. So need the allocators.
+ */
+static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
+					struct ocfs2_xattr_value_root *xv,
+					struct ocfs2_caching_info *ref_ci,
+					struct buffer_head *ref_root_bh,
+					struct ocfs2_alloc_context **meta_ac,
+					int *ref_credits)
+{
+	int ret, meta_add = 0;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+
+	*ref_credits = 0;
+	ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+				       &num_clusters,
+				       &xv->xr_list,
+				       &ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
+						 ref_root_bh, xv,
+						 &meta_add, ref_credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+						meta_add, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int ocfs2_remove_value_outside(struct inode*inode,
+				      struct ocfs2_xattr_value_buf *vb,
+				      struct ocfs2_xattr_header *header,
+				      struct ocfs2_caching_info *ref_ci,
+				      struct buffer_head *ref_root_bh)
+{
+	int ret = 0, i, ref_credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+	void *val;
+
+	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+
+	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(entry))
+			continue;
+
+		val = (void *)header +
+			le16_to_cpu(entry->xe_name_offset);
+		vb->vb_xv = (struct ocfs2_xattr_value_root *)
+			(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+
+		ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
+							 ref_ci, ref_root_bh,
+							 &ctxt.meta_ac,
+							 &ref_credits);
+
+		ctxt.handle = ocfs2_start_trans(osb, ref_credits +
+					ocfs2_remove_extent_credits(osb->sb));
+		if (IS_ERR(ctxt.handle)) {
+			ret = PTR_ERR(ctxt.handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
+
+		ocfs2_commit_trans(osb, ctxt.handle);
+		if (ctxt.meta_ac) {
+			ocfs2_free_alloc_context(ctxt.meta_ac);
+			ctxt.meta_ac = NULL;
+		}
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+	}
+
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
+	return ret;
+}
+
+static int ocfs2_xattr_ibody_remove(struct inode *inode,
+				    struct buffer_head *di_bh,
+				    struct ocfs2_caching_info *ref_ci,
+				    struct buffer_head *ref_root_bh)
+{
+
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_xattr_header *header;
+	int ret;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = di_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	header = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	ret = ocfs2_remove_value_outside(inode, &vb, header,
+					 ref_ci, ref_root_bh);
+
+	return ret;
+}
+
+struct ocfs2_rm_xattr_bucket_para {
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+};
+
+static int ocfs2_xattr_block_remove(struct inode *inode,
+				    struct buffer_head *blk_bh,
+				    struct ocfs2_caching_info *ref_ci,
+				    struct buffer_head *ref_root_bh)
+{
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = blk_bh,
+		.vb_access = ocfs2_journal_access_xb,
+	};
+	struct ocfs2_rm_xattr_bucket_para args = {
+		.ref_ci = ref_ci,
+		.ref_root_bh = ref_root_bh,
+	};
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
+		ret = ocfs2_remove_value_outside(inode, &vb, header,
+						 ref_ci, ref_root_bh);
+	} else
+		ret = ocfs2_iterate_xattr_index_block(inode,
+						blk_bh,
+						ocfs2_rm_xattr_cluster,
+						&args);
+
+	return ret;
+}
+
+static int ocfs2_xattr_free_block(struct inode *inode,
+				  u64 block,
+				  struct ocfs2_caching_info *ref_ci,
+				  struct buffer_head *ref_root_bh)
+{
+	struct inode *xb_alloc_inode;
+	struct buffer_head *xb_alloc_bh = NULL;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	int ret = 0;
+	u64 blk, bg_blkno;
+	u16 bit;
+
+	ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	blk = le64_to_cpu(xb->xb_blkno);
+	bit = le16_to_cpu(xb->xb_suballoc_bit);
+	if (xb->xb_suballoc_loc)
+		bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	xb_alloc_inode = ocfs2_get_system_file_inode(osb,
+				EXTENT_ALLOC_SYSTEM_INODE,
+				le16_to_cpu(xb->xb_suballoc_slot));
+	if (!xb_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&xb_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	ocfs2_inode_unlock(xb_alloc_inode, 1);
+	brelse(xb_alloc_bh);
+out_mutex:
+	mutex_unlock(&xb_alloc_inode->i_mutex);
+	iput(xb_alloc_inode);
+out:
+	brelse(blk_bh);
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_remove()
+ *
+ * Free extended attribute resources associated with this inode.
+ */
+int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_caching_info *ref_ci = NULL;
+	handle_t *handle;
+	int ret;
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return 0;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+		ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
+					       le64_to_cpu(di->i_refcount_loc),
+					       1, &ref_tree, &ref_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		ref_ci = &ref_tree->rf_ci;
+
+	}
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_ibody_remove(inode, di_bh,
+					       ref_ci, ref_root_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (di->i_xattr_loc) {
+		ret = ocfs2_xattr_free_block(inode,
+					     le64_to_cpu(di->i_xattr_loc),
+					     ref_ci, ref_root_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	di->i_xattr_loc = 0;
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+	ocfs2_journal_dirty(handle, di_bh);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
+	brelse(ref_root_bh);
+	return ret;
+}
+
+static int ocfs2_xattr_has_space_inline(struct inode *inode,
+					struct ocfs2_dinode *di)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+	int free;
+
+	if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
+		return 0;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		struct ocfs2_inline_data *idata = &di->id2.i_data;
+		free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
+	} else if (ocfs2_inode_is_fast_symlink(inode)) {
+		free = ocfs2_fast_symlink_chars(inode->i_sb) -
+			le64_to_cpu(di->i_size);
+	} else {
+		struct ocfs2_extent_list *el = &di->id2.i_list;
+		free = (le16_to_cpu(el->l_count) -
+			le16_to_cpu(el->l_next_free_rec)) *
+			sizeof(struct ocfs2_extent_rec);
+	}
+	if (free >= xattrsize)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_find()
+ *
+ * Find extended attribute in inode block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_ibody_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	int ret;
+	int has_space = 0;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		down_read(&oi->ip_alloc_sem);
+		has_space = ocfs2_xattr_has_space_inline(inode, di);
+		up_read(&oi->ip_alloc_sem);
+		if (!has_space)
+			return 0;
+	}
+
+	xs->xattr_bh = xs->inode_bh;
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
+		xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - le16_to_cpu(di->i_xattr_inline_size));
+	else
+		xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	/* Find the named attribute. */
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+		if (ret && ret != -ENODATA)
+			return ret;
+		xs->not_found = ret;
+	}
+
+	return 0;
+}
+
+static int ocfs2_xattr_ibody_init(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int xattrsize = osb->s_xattr_inline_size;
+
+	if (!ocfs2_xattr_has_space_inline(inode, di)) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Adjust extent record count or inline data size
+	 * to reserve space for extended attribute.
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		struct ocfs2_inline_data *idata = &di->id2.i_data;
+		le16_add_cpu(&idata->id_count, -xattrsize);
+	} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+		struct ocfs2_extent_list *el = &di->id2.i_list;
+		le16_add_cpu(&el->l_count, -(xattrsize /
+					     sizeof(struct ocfs2_extent_rec)));
+	}
+	di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ocfs2_journal_dirty(ctxt->handle, di_bh);
+
+out:
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_ibody_set()
+ *
+ * Set, replace or remove an extended attribute into inode block.
+ *
+ */
+static int ocfs2_xattr_ibody_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs,
+				 struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_xa_loc loc;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return -ENOSPC;
+
+	down_write(&oi->ip_alloc_sem);
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
+				 xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+	xs->here = loc.xl_entry;
+
+out:
+	up_write(&oi->ip_alloc_sem);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_block_find()
+ *
+ * Find extended attribute in external block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		xs->header = &xb->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+		xs->here = xs->header->xh_entries;
+
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	} else
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
+
+	if (ret && ret != -ENODATA) {
+		xs->xattr_bh = NULL;
+		goto cleanup;
+	}
+	xs->not_found = ret;
+	return 0;
+cleanup:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+static int ocfs2_create_xattr_block(struct inode *inode,
+				    struct buffer_head *inode_bh,
+				    struct ocfs2_xattr_set_ctxt *ctxt,
+				    int indexed,
+				    struct buffer_head **ret_bh)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 suballoc_loc, first_blkno;
+	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)inode_bh->b_data;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_xattr_block *xblk;
+
+	ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
+				      inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+				   &suballoc_loc, &suballoc_bit_start,
+				   &num_got, &first_blkno);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	if (!new_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto end;
+	}
+
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+
+	ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
+				      new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	/* Initialize ocfs2_xattr_block */
+	xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+	memset(xblk, 0, inode->i_sb->s_blocksize);
+	strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+	xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+	xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
+	xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	xblk->xb_fs_generation =
+		cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
+	xblk->xb_blkno = cpu_to_le64(first_blkno);
+	if (indexed) {
+		struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
+		xr->xt_clusters = cpu_to_le32(1);
+		xr->xt_last_eb_blk = 0;
+		xr->xt_list.l_tree_depth = 0;
+		xr->xt_list.l_count = cpu_to_le16(
+					ocfs2_xattr_recs_per_xb(inode->i_sb));
+		xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+		xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
+	}
+	ocfs2_journal_dirty(ctxt->handle, new_bh);
+
+	/* Add it to the inode */
+	di->i_xattr_loc = cpu_to_le64(first_blkno);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	ocfs2_journal_dirty(ctxt->handle, inode_bh);
+
+	*ret_bh = new_bh;
+	new_bh = NULL;
+
+end:
+	brelse(new_bh);
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_block_set()
+ *
+ * Set, replace or remove an extended attribute into external block.
+ *
+ */
+static int ocfs2_xattr_block_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs,
+				 struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_xattr_block *xblk = NULL;
+	int ret;
+	struct ocfs2_xa_loc loc;
+
+	if (!xs->xattr_bh) {
+		ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
+					       0, &new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto end;
+		}
+
+		xs->xattr_bh = new_bh;
+		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+		xs->header = &xblk->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
+		xs->here = xs->header->xh_entries;
+	} else
+		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
+					      xs->not_found ? NULL : xs->here);
+
+		ret = ocfs2_xa_set(&loc, xi, ctxt);
+		if (!ret)
+			xs->here = loc.xl_entry;
+		else if ((ret != -ENOSPC) || ctxt->set_abort)
+			goto end;
+		else {
+			ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
+			if (ret)
+				goto end;
+		}
+	}
+
+	if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
+		ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
+
+end:
+	return ret;
+}
+
+/* Check whether the new xattr can be inserted into the inode. */
+static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
+				       struct ocfs2_xattr_info *xi,
+				       struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_entry *last;
+	int free, i;
+	size_t min_offs = xs->end - xs->base;
+
+	if (!xs->header)
+		return 0;
+
+	last = xs->header->xh_entries;
+
+	for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+		size_t offs = le16_to_cpu(last->xe_name_offset);
+		if (offs < min_offs)
+			min_offs = offs;
+		last += 1;
+	}
+
+	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
+	if (free < 0)
+		return 0;
+
+	BUG_ON(!xs->not_found);
+
+	if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
+		return 1;
+
+	return 0;
+}
+
+static int ocfs2_calc_xattr_set_need(struct inode *inode,
+				     struct ocfs2_dinode *di,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xis,
+				     struct ocfs2_xattr_search *xbs,
+				     int *clusters_need,
+				     int *meta_need,
+				     int *credits_need)
+{
+	int ret = 0, old_in_xb = 0;
+	int clusters_add = 0, meta_add = 0, credits = 0;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_block *xb = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	struct ocfs2_xattr_value_root *xv = NULL;
+	char *base = NULL;
+	int name_offset, name_len = 0;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+						    xi->xi_value_len);
+	u64 value_size;
+
+	/*
+	 * Calculate the clusters we need to write.
+	 * No matter whether we replace an old one or add a new one,
+	 * we need this for writing.
+	 */
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
+		credits += new_clusters *
+			   ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+	if (xis->not_found && xbs->not_found) {
+		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+			clusters_add += new_clusters;
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							&def_xv.xv.xr_list);
+		}
+
+		goto meta_guess;
+	}
+
+	if (!xis->not_found) {
+		xe = xis->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		base = xis->base;
+		credits += OCFS2_INODE_UPDATE_CREDITS;
+	} else {
+		int i, block_off = 0;
+		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+		xe = xbs->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		i = xbs->here - xbs->header->xh_entries;
+		old_in_xb = 1;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+							bucket_xh(xbs->bucket),
+							i, &block_off,
+							&name_offset);
+			base = bucket_block(xbs->bucket, block_off);
+			credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		} else {
+			base = xbs->base;
+			credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
+		}
+	}
+
+	/*
+	 * delete a xattr doesn't need metadata and cluster allocation.
+	 * so just calculate the credits and return.
+	 *
+	 * The credits for removing the value tree will be extended
+	 * by ocfs2_remove_extent itself.
+	 */
+	if (!xi->xi_value) {
+		if (!ocfs2_xattr_is_local(xe))
+			credits += ocfs2_remove_extent_credits(inode->i_sb);
+
+		goto out;
+	}
+
+	/* do cluster allocation guess first. */
+	value_size = le64_to_cpu(xe->xe_value_size);
+
+	if (old_in_xb) {
+		/*
+		 * In xattr set, we always try to set the xe in inode first,
+		 * so if it can be inserted into inode successfully, the old
+		 * one will be removed from the xattr block, and this xattr
+		 * will be inserted into inode as a new xattr in inode.
+		 */
+		if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
+			clusters_add += new_clusters;
+			credits += ocfs2_remove_extent_credits(inode->i_sb) +
+				    OCFS2_INODE_UPDATE_CREDITS;
+			if (!ocfs2_xattr_is_local(xe))
+				credits += ocfs2_calc_extend_credits(
+							inode->i_sb,
+							&def_xv.xv.xr_list);
+			goto out;
+		}
+	}
+
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* the new values will be stored outside. */
+		u32 old_clusters = 0;
+
+		if (!ocfs2_xattr_is_local(xe)) {
+			old_clusters =	ocfs2_clusters_for_bytes(inode->i_sb,
+								 value_size);
+			xv = (struct ocfs2_xattr_value_root *)
+			     (base + name_offset + name_len);
+			value_size = OCFS2_XATTR_ROOT_SIZE;
+		} else
+			xv = &def_xv.xv;
+
+		if (old_clusters >= new_clusters) {
+			credits += ocfs2_remove_extent_credits(inode->i_sb);
+			goto out;
+		} else {
+			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
+			clusters_add += new_clusters - old_clusters;
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     &xv->xr_list);
+			if (value_size >= OCFS2_XATTR_ROOT_SIZE)
+				goto out;
+		}
+	} else {
+		/*
+		 * Now the new value will be stored inside. So if the new
+		 * value is smaller than the size of value root or the old
+		 * value, we don't need any allocation, otherwise we have
+		 * to guess metadata allocation.
+		 */
+		if ((ocfs2_xattr_is_local(xe) &&
+		     (value_size >= xi->xi_value_len)) ||
+		    (!ocfs2_xattr_is_local(xe) &&
+		     OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
+			goto out;
+	}
+
+meta_guess:
+	/* calculate metadata allocation. */
+	if (di->i_xattr_loc) {
+		if (!xbs->xattr_bh) {
+			ret = ocfs2_read_xattr_block(inode,
+						     le64_to_cpu(di->i_xattr_loc),
+						     &bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			xb = (struct ocfs2_xattr_block *)bh->b_data;
+		} else
+			xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+
+		/*
+		 * If there is already an xattr tree, good, we can calculate
+		 * like other b-trees. Otherwise we may have the chance of
+		 * create a tree, the credit calculation is borrowed from
+		 * ocfs2_calc_extend_credits with root_el = NULL. And the
+		 * new tree will be cluster based, so no meta is needed.
+		 */
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			struct ocfs2_extent_list *el =
+				 &xb->xb_attrs.xb_root.xt_list;
+			meta_add += ocfs2_extend_meta_needed(el);
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     el);
+		} else
+			credits += OCFS2_SUBALLOC_ALLOC + 1;
+
+		/*
+		 * This cluster will be used either for new bucket or for
+		 * new xattr block.
+		 * If the cluster size is the same as the bucket size, one
+		 * more is needed since we may need to extend the bucket
+		 * also.
+		 */
+		clusters_add += 1;
+		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		if (OCFS2_XATTR_BUCKET_SIZE ==
+			OCFS2_SB(inode->i_sb)->s_clustersize) {
+			credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+			clusters_add += 1;
+		}
+	} else {
+		credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+			struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
+			meta_add += ocfs2_extend_meta_needed(el);
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     el);
+		} else {
+			meta_add += 1;
+		}
+	}
+out:
+	if (clusters_need)
+		*clusters_need = clusters_add;
+	if (meta_need)
+		*meta_need = meta_add;
+	if (credits_need)
+		*credits_need = credits;
+	brelse(bh);
+	return ret;
+}
+
+static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
+				     struct ocfs2_dinode *di,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xis,
+				     struct ocfs2_xattr_search *xbs,
+				     struct ocfs2_xattr_set_ctxt *ctxt,
+				     int extra_meta,
+				     int *credits)
+{
+	int clusters_add, meta_add, ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
+
+	ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
+
+	ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
+					&clusters_add, &meta_add, credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	meta_add += extra_meta;
+	trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add,
+					clusters_add, *credits);
+
+	if (meta_add) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
+							&ctxt->meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (clusters_add) {
+		ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (ctxt->meta_ac) {
+			ocfs2_free_alloc_context(ctxt->meta_ac);
+			ctxt->meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null ctxt->data_ac.
+		 */
+	}
+
+	return ret;
+}
+
+static int __ocfs2_xattr_set_handle(struct inode *inode,
+				    struct ocfs2_dinode *di,
+				    struct ocfs2_xattr_info *xi,
+				    struct ocfs2_xattr_search *xis,
+				    struct ocfs2_xattr_search *xbs,
+				    struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret = 0, credits, old_found;
+
+	if (!xi->xi_value) {
+		/* Remove existing extended attribute */
+		if (!xis->not_found)
+			ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+		else if (!xbs->not_found)
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+	} else {
+		/* We always try to set extended attribute into inode first*/
+		ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+		if (!ret && !xbs->not_found) {
+			/*
+			 * If succeed and that extended attribute existing in
+			 * external block, then we will remove it.
+			 */
+			xi->xi_value = NULL;
+			xi->xi_value_len = 0;
+
+			old_found = xis->not_found;
+			xis->not_found = -ENODATA;
+			ret = ocfs2_calc_xattr_set_need(inode,
+							di,
+							xi,
+							xis,
+							xbs,
+							NULL,
+							NULL,
+							&credits);
+			xis->not_found = old_found;
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_extend_trans(ctxt->handle, credits);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+		} else if ((ret == -ENOSPC) && !ctxt->set_abort) {
+			if (di->i_xattr_loc && !xbs->xattr_bh) {
+				ret = ocfs2_xattr_block_find(inode,
+							     xi->xi_name_index,
+							     xi->xi_name, xbs);
+				if (ret)
+					goto out;
+
+				old_found = xis->not_found;
+				xis->not_found = -ENODATA;
+				ret = ocfs2_calc_xattr_set_need(inode,
+								di,
+								xi,
+								xis,
+								xbs,
+								NULL,
+								NULL,
+								&credits);
+				xis->not_found = old_found;
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_extend_trans(ctxt->handle, credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+			}
+			/*
+			 * If no space in inode, we will set extended attribute
+			 * into external block.
+			 */
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+			if (ret)
+				goto out;
+			if (!xis->not_found) {
+				/*
+				 * If succeed and that extended attribute
+				 * existing in inode, we will remove it.
+				 */
+				xi->xi_value = NULL;
+				xi->xi_value_len = 0;
+				xbs->not_found = -ENODATA;
+				ret = ocfs2_calc_xattr_set_need(inode,
+								di,
+								xi,
+								xis,
+								xbs,
+								NULL,
+								NULL,
+								&credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_extend_trans(ctxt->handle, credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+				ret = ocfs2_xattr_ibody_set(inode, xi,
+							    xis, ctxt);
+			}
+		}
+	}
+
+	if (!ret) {
+		/* Update inode ctime. */
+		ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
+					      xis->inode_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		inode->i_ctime = CURRENT_TIME;
+		di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+		di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+		ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
+	}
+out:
+	return ret;
+}
+
+/*
+ * This function only called duing creating inode
+ * for init security/acl xattrs of the new inode.
+ * All transanction credits have been reserved in mknod.
+ */
+int ocfs2_xattr_set_handle(handle_t *handle,
+			   struct inode *inode,
+			   struct buffer_head *di_bh,
+			   int name_index,
+			   const char *name,
+			   const void *value,
+			   size_t value_len,
+			   int flags,
+			   struct ocfs2_alloc_context *meta_ac,
+			   struct ocfs2_alloc_context *data_ac)
+{
+	struct ocfs2_dinode *di;
+	int ret;
+
+	struct ocfs2_xattr_info xi = {
+		.xi_name_index = name_index,
+		.xi_name = name,
+		.xi_name_len = strlen(name),
+		.xi_value = value,
+		.xi_value_len = value_len,
+	};
+
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_set_ctxt ctxt = {
+		.handle = handle,
+		.meta_ac = meta_ac,
+		.data_ac = data_ac,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	/*
+	 * In extreme situation, may need xattr bucket when
+	 * block size is too small. And we have already reserved
+	 * the credits for bucket in mknod.
+	 */
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
+		xbs.bucket = ocfs2_xattr_bucket_new(inode);
+		if (!xbs.bucket) {
+			mlog_errno(-ENOMEM);
+			return -ENOMEM;
+		}
+	}
+
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+	ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+	if (ret)
+		goto cleanup;
+	if (xis.not_found) {
+		ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+		if (ret)
+			goto cleanup;
+	}
+
+	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+cleanup:
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	brelse(xbs.xattr_bh);
+	ocfs2_xattr_bucket_free(xbs.bucket);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set()
+ *
+ * Set, replace or remove an extended attribute for this inode.
+ * value is NULL to remove an existing extended attribute, else either
+ * create or replace an extended attribute.
+ */
+int ocfs2_xattr_set(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    const void *value,
+		    size_t value_len,
+		    int flags)
+{
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	int ret, credits, ref_meta = 0, ref_credits = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+
+	struct ocfs2_xattr_info xi = {
+		.xi_name_index = name_index,
+		.xi_name = name,
+		.xi_name_len = strlen(name),
+		.xi_value = value,
+		.xi_value_len = value_len,
+	};
+
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Only xbs will be used on indexed trees.  xis doesn't need a
+	 * bucket.
+	 */
+	xbs.bucket = ocfs2_xattr_bucket_new(inode);
+	if (!xbs.bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto cleanup_nolock;
+	}
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+	/*
+	 * Scan inode and external block to find the same name
+	 * extended attribute and collect search information.
+	 */
+	ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+	if (ret)
+		goto cleanup;
+	if (xis.not_found) {
+		ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+		if (ret)
+			goto cleanup;
+	}
+
+	if (xis.not_found && xbs.not_found) {
+		ret = -ENODATA;
+		if (flags & XATTR_REPLACE)
+			goto cleanup;
+		ret = 0;
+		if (!value)
+			goto cleanup;
+	} else {
+		ret = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto cleanup;
+	}
+
+	/* Check whether the value is refcounted and do some preparation. */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
+	    (!xis.not_found || !xbs.not_found)) {
+		ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
+						   &xis, &xbs, &ref_tree,
+						   &ref_meta, &ref_credits);
+		if (ret) {
+			mlog_errno(ret);
+			goto cleanup;
+		}
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mutex_unlock(&tl_inode->i_mutex);
+			mlog_errno(ret);
+			goto cleanup;
+		}
+	}
+	mutex_unlock(&tl_inode->i_mutex);
+
+	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
+					&xbs, &ctxt, ref_meta, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto cleanup;
+	}
+
+	/* we need to update inode's ctime field, so add credit for it. */
+	credits += OCFS2_INODE_UPDATE_CREDITS;
+	ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
+		mlog_errno(ret);
+		goto out_free_ac;
+	}
+
+	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+	ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
+
+	ocfs2_commit_trans(osb, ctxt.handle);
+
+out_free_ac:
+	if (ctxt.data_ac)
+		ocfs2_free_alloc_context(ctxt.data_ac);
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
+	if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
+
+cleanup:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	if (!value && !ret) {
+		ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
+		if (ret)
+			mlog_errno(ret);
+	}
+	ocfs2_inode_unlock(inode, 1);
+cleanup_nolock:
+	brelse(di_bh);
+	brelse(xbs.xattr_bh);
+	ocfs2_xattr_bucket_free(xbs.bucket);
+
+	return ret;
+}
+
+/*
+ * Find the xattr extent rec which may contains name_hash.
+ * e_cpos will be the first name hash of the xattr rec.
+ * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list.
+ */
+static int ocfs2_xattr_get_rec(struct inode *inode,
+			       u32 name_hash,
+			       u64 *p_blkno,
+			       u32 *e_cpos,
+			       u32 *num_clusters,
+			       struct ocfs2_extent_list *el)
+{
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+	u64 e_blkno = 0;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash,
+				      &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= name_hash) {
+			e_blkno = le64_to_cpu(rec->e_blkno);
+			break;
+		}
+	}
+
+	if (!e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in xattr", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*p_blkno = le64_to_cpu(rec->e_blkno);
+	*num_clusters = le16_to_cpu(rec->e_leaf_clusters);
+	if (e_cpos)
+		*e_cpos = le32_to_cpu(rec->e_cpos);
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+typedef int (xattr_bucket_func)(struct inode *inode,
+				struct ocfs2_xattr_bucket *bucket,
+				void *para);
+
+static int ocfs2_find_xe_in_bucket(struct inode *inode,
+				   struct ocfs2_xattr_bucket *bucket,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u16 *xe_index,
+				   int *found)
+{
+	int i, ret = 0, cmp = 1, block_off, new_offset;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+	size_t name_len = strlen(name);
+	struct ocfs2_xattr_entry *xe = NULL;
+	char *xe_name;
+
+	/*
+	 * We don't use binary search in the bucket because there
+	 * may be multiple entries with the same name hash.
+	 */
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash))
+			continue;
+		else if (name_hash < le32_to_cpu(xe->xe_name_hash))
+			break;
+
+		cmp = name_index - ocfs2_xattr_get_type(xe);
+		if (!cmp)
+			cmp = name_len - xe->xe_name_len;
+		if (cmp)
+			continue;
+
+		ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+							xh,
+							i,
+							&block_off,
+							&new_offset);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+
+		xe_name = bucket_block(bucket, block_off) + new_offset;
+		if (!memcmp(name, xe_name, name_len)) {
+			*xe_index = i;
+			*found = 1;
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Find the specified xattr entry in a series of buckets.
+ * This series start from p_blkno and last for num_clusters.
+ * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
+ * the num of the valid buckets.
+ *
+ * Return the buffer_head this xattr should reside in. And if the xattr's
+ * hash is in the gap of 2 buckets, return the lower bucket.
+ */
+static int ocfs2_xattr_bucket_find(struct inode *inode,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u64 p_blkno,
+				   u32 first_hash,
+				   u32 num_clusters,
+				   struct ocfs2_xattr_search *xs)
+{
+	int ret, found = 0;
+	struct ocfs2_xattr_header *xh = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	u16 index = 0;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int low_bucket = 0, bucket, high_bucket;
+	struct ocfs2_xattr_bucket *search;
+	u32 last_hash;
+	u64 blkno, lower_blkno = 0;
+
+	search = ocfs2_xattr_bucket_new(inode);
+	if (!search) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(search, p_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = bucket_xh(search);
+	high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
+	while (low_bucket <= high_bucket) {
+		ocfs2_xattr_bucket_relse(search);
+
+		bucket = (low_bucket + high_bucket) / 2;
+		blkno = p_blkno + bucket * blk_per_bucket;
+		ret = ocfs2_read_xattr_bucket(search, blkno);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xh = bucket_xh(search);
+		xe = &xh->xh_entries[0];
+		if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
+			high_bucket = bucket - 1;
+			continue;
+		}
+
+		/*
+		 * Check whether the hash of the last entry in our
+		 * bucket is larger than the search one. for an empty
+		 * bucket, the last one is also the first one.
+		 */
+		if (xh->xh_count)
+			xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+
+		last_hash = le32_to_cpu(xe->xe_name_hash);
+
+		/* record lower_blkno which may be the insert place. */
+		lower_blkno = blkno;
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
+			low_bucket = bucket + 1;
+			continue;
+		}
+
+		/* the searched xattr should reside in this bucket if exists. */
+		ret = ocfs2_find_xe_in_bucket(inode, search,
+					      name_index, name, name_hash,
+					      &index, &found);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/*
+	 * Record the bucket we have found.
+	 * When the xattr's hash value is in the gap of 2 buckets, we will
+	 * always set it to the previous bucket.
+	 */
+	if (!lower_blkno)
+		lower_blkno = p_blkno;
+
+	/* This should be in cache - we just read it during the search */
+	ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xs->header = bucket_xh(xs->bucket);
+	xs->base = bucket_block(xs->bucket, 0);
+	xs->end = xs->base + inode->i_sb->s_blocksize;
+
+	if (found) {
+		xs->here = &xs->header->xh_entries[index];
+		trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno,
+			name, name_index, name_hash,
+			(unsigned long long)bucket_blkno(xs->bucket),
+			index);
+	} else
+		ret = -ENODATA;
+
+out:
+	ocfs2_xattr_bucket_free(search);
+	return ret;
+}
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	u64 p_blkno = 0;
+	u32 first_hash, num_clusters = 0;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return -ENODATA;
+
+	trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno,
+					name, name_index, name_hash,
+					(unsigned long long)root_bh->b_blocknr,
+					-1);
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
+				  &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
+
+	trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno,
+					name, name_index, first_hash,
+					(unsigned long long)p_blkno,
+					num_clusters);
+
+	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
+				      p_blkno, first_hash, num_clusters, xs);
+
+out:
+	return ret;
+}
+
+static int ocfs2_iterate_xattr_buckets(struct inode *inode,
+				       u64 blkno,
+				       u32 clusters,
+				       xattr_bucket_func *func,
+				       void *para)
+{
+	int i, ret = 0;
+	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+	u32 num_buckets = clusters * bpc;
+	struct ocfs2_xattr_bucket *bucket;
+
+	bucket = ocfs2_xattr_bucket_new(inode);
+	if (!bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	trace_ocfs2_iterate_xattr_buckets(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)blkno, clusters);
+
+	for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
+		ret = ocfs2_read_xattr_bucket(bucket, blkno);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * The real bucket num in this series of blocks is stored
+		 * in the 1st bucket.
+		 */
+		if (i == 0)
+			num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
+
+		trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno,
+		     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
+		if (func) {
+			ret = func(inode, bucket, para);
+			if (ret && ret != -ERANGE)
+				mlog_errno(ret);
+			/* Fall through to bucket_relse() */
+		}
+
+		ocfs2_xattr_bucket_relse(bucket);
+		if (ret)
+			break;
+	}
+
+	ocfs2_xattr_bucket_free(bucket);
+	return ret;
+}
+
+struct ocfs2_xattr_tree_list {
+	char *buffer;
+	size_t buffer_size;
+	size_t result;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
+					     struct ocfs2_xattr_header *xh,
+					     int index,
+					     int *block_off,
+					     int *new_offset)
+{
+	u16 name_offset;
+
+	if (index < 0 || index >= le16_to_cpu(xh->xh_count))
+		return -EINVAL;
+
+	name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
+
+	*block_off = name_offset >> sb->s_blocksize_bits;
+	*new_offset = name_offset % sb->s_blocksize;
+
+	return 0;
+}
+
+static int ocfs2_list_xattr_bucket(struct inode *inode,
+				   struct ocfs2_xattr_bucket *bucket,
+				   void *para)
+{
+	int ret = 0, type;
+	struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
+	int i, block_off, new_offset;
+	const char *prefix, *name;
+
+	for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
+		type = ocfs2_xattr_get_type(entry);
+		prefix = ocfs2_xattr_prefix(type);
+
+		if (prefix) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+								bucket_xh(bucket),
+								i,
+								&block_off,
+								&new_offset);
+			if (ret)
+				break;
+
+			name = (const char *)bucket_block(bucket, block_off) +
+				new_offset;
+			ret = ocfs2_xattr_list_entry(xl->buffer,
+						     xl->buffer_size,
+						     &xl->result,
+						     prefix, name,
+						     entry->xe_name_len);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+					   struct buffer_head *blk_bh,
+					   xattr_tree_rec_func *rec_func,
+					   void *para)
+{
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)blk_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	int ret = 0;
+	u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
+	u64 p_blkno = 0;
+
+	if (!el->l_next_free_rec || !rec_func)
+		return 0;
+
+	while (name_hash > 0) {
+		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+					  &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
+			       num_clusters, para);
+		if (ret) {
+			if (ret != -ERANGE)
+				mlog_errno(ret);
+			break;
+		}
+
+		if (e_cpos == 0)
+			break;
+
+		name_hash = e_cpos - 1;
+	}
+
+	return ret;
+
+}
+
+static int ocfs2_list_xattr_tree_rec(struct inode *inode,
+				     struct buffer_head *root_bh,
+				     u64 blkno, u32 cpos, u32 len, void *para)
+{
+	return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					   ocfs2_list_xattr_bucket, para);
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					     struct buffer_head *blk_bh,
+					     char *buffer,
+					     size_t buffer_size)
+{
+	int ret;
+	struct ocfs2_xattr_tree_list xl = {
+		.buffer = buffer,
+		.buffer_size = buffer_size,
+		.result = 0,
+	};
+
+	ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+					      ocfs2_list_xattr_tree_rec, &xl);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = xl.result;
+out:
+	return ret;
+}
+
+static int cmp_xe(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *l = a, *r = b;
+	u32 l_hash = le32_to_cpu(l->xe_name_hash);
+	u32 r_hash = le32_to_cpu(r->xe_name_hash);
+
+	if (l_hash > r_hash)
+		return 1;
+	if (l_hash < r_hash)
+		return -1;
+	return 0;
+}
+
+static void swap_xe(void *a, void *b, int size)
+{
+	struct ocfs2_xattr_entry *l = a, *r = b, tmp;
+
+	tmp = *l;
+	memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
+	memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
+}
+
+/*
+ * When the ocfs2_xattr_block is filled up, new bucket will be created
+ * and all the xattr entries will be moved to the new bucket.
+ * The header goes at the start of the bucket, and the names+values are
+ * filled from the end.  This is why *target starts as the last buffer.
+ * Note: we need to sort the entries since they are not saved in order
+ * in the ocfs2_xattr_block.
+ */
+static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
+					   struct buffer_head *xb_bh,
+					   struct ocfs2_xattr_bucket *bucket)
+{
+	int i, blocksize = inode->i_sb->s_blocksize;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u16 offset, size, off_change;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_block *xb =
+				(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+	u16 count = le16_to_cpu(xb_xh->xh_count);
+	char *src = xb_bh->b_data;
+	char *target = bucket_block(bucket, blks - 1);
+
+	trace_ocfs2_cp_xattr_block_to_bucket_begin(
+				(unsigned long long)xb_bh->b_blocknr,
+				(unsigned long long)bucket_blkno(bucket));
+
+	for (i = 0; i < blks; i++)
+		memset(bucket_block(bucket, i), 0, blocksize);
+
+	/*
+	 * Since the xe_name_offset is based on ocfs2_xattr_header,
+	 * there is a offset change corresponding to the change of
+	 * ocfs2_xattr_header's position.
+	 */
+	off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	xe = &xb_xh->xh_entries[count - 1];
+	offset = le16_to_cpu(xe->xe_name_offset) + off_change;
+	size = blocksize - offset;
+
+	/* copy all the names and values. */
+	memcpy(target + offset, src + offset, size);
+
+	/* Init new header now. */
+	xh->xh_count = xb_xh->xh_count;
+	xh->xh_num_buckets = cpu_to_le16(1);
+	xh->xh_name_value_len = cpu_to_le16(size);
+	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
+
+	/* copy all the entries. */
+	target = bucket_block(bucket, 0);
+	offset = offsetof(struct ocfs2_xattr_header, xh_entries);
+	size = count * sizeof(struct ocfs2_xattr_entry);
+	memcpy(target + offset, (char *)xb_xh + offset, size);
+
+	/* Change the xe offset for all the xe because of the move. */
+	off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
+		 offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	for (i = 0; i < count; i++)
+		le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
+
+	trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
+
+	sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+}
+
+/*
+ * After we move xattr from block to index btree, we have to
+ * update ocfs2_xattr_search to the new xe and base.
+ *
+ * When the entry is in xattr block, xattr_bh indicates the storage place.
+ * While if the entry is in index b-tree, "bucket" indicates the
+ * real place of the xattr.
+ */
+static void ocfs2_xattr_update_xattr_search(struct inode *inode,
+					    struct ocfs2_xattr_search *xs,
+					    struct buffer_head *old_bh)
+{
+	char *buf = old_bh->b_data;
+	struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
+	struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
+	int i;
+
+	xs->header = bucket_xh(xs->bucket);
+	xs->base = bucket_block(xs->bucket, 0);
+	xs->end = xs->base + inode->i_sb->s_blocksize;
+
+	if (xs->not_found)
+		return;
+
+	i = xs->here - old_xh->xh_entries;
+	xs->here = &xs->header->xh_entries[i];
+}
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs,
+					  struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	u32 bit_off, len;
+	u64 blkno;
+	handle_t *handle = ctxt->handle;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct buffer_head *xb_bh = xs->xattr_bh;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xr;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+
+	trace_ocfs2_xattr_create_index_block_begin(
+				(unsigned long long)xb_bh->b_blocknr);
+
+	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+	BUG_ON(!xs->bucket);
+
+	/*
+	 * XXX:
+	 * We can use this lock for now, and maybe move to a dedicated mutex
+	 * if performance becomes a problem later.
+	 */
+	down_write(&oi->ip_alloc_sem);
+
+	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
+				     1, 1, &bit_off, &len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * The bucket may spread in many blocks, and
+	 * we will only touch the 1st block and the last block
+	 * in the whole bucket(one for entry and one for data).
+	 */
+	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+	trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
+
+	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+						OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
+
+	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
+
+	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
+	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+	       offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+	xr = &xb->xb_attrs.xb_root;
+	xr->xt_clusters = cpu_to_le32(1);
+	xr->xt_last_eb_blk = 0;
+	xr->xt_list.l_tree_depth = 0;
+	xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
+	xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+
+	xr->xt_list.l_recs[0].e_cpos = 0;
+	xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+	xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+
+	xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
+
+	ocfs2_journal_dirty(handle, xb_bh);
+
+out:
+	up_write(&oi->ip_alloc_sem);
+
+	return ret;
+}
+
+static int cmp_xe_offset(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *l = a, *r = b;
+	u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
+	u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
+
+	if (l_name_offset < r_name_offset)
+		return 1;
+	if (l_name_offset > r_name_offset)
+		return -1;
+	return 0;
+}
+
+/*
+ * defrag a xattr bucket if we find that the bucket has some
+ * holes beteen name/value pairs.
+ * We will move all the name/value pairs to the end of the bucket
+ * so that we can spare some space for insertion.
+ */
+static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+				     handle_t *handle,
+				     struct ocfs2_xattr_bucket *bucket)
+{
+	int ret, i;
+	size_t end, offset, len;
+	struct ocfs2_xattr_header *xh;
+	char *entries, *buf, *bucket_buf = NULL;
+	u64 blkno = bucket_blkno(bucket);
+	u16 xh_free_start;
+	size_t blocksize = inode->i_sb->s_blocksize;
+	struct ocfs2_xattr_entry *xe;
+
+	/*
+	 * In order to make the operation more efficient and generic,
+	 * we copy all the blocks into a contiguous memory and do the
+	 * defragment there, so if anything is error, we will not touch
+	 * the real block.
+	 */
+	bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+	if (!bucket_buf) {
+		ret = -EIO;
+		goto out;
+	}
+
+	buf = bucket_buf;
+	for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+		memcpy(buf, bucket_block(bucket, i), blocksize);
+
+	ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = (struct ocfs2_xattr_header *)bucket_buf;
+	entries = (char *)xh->xh_entries;
+	xh_free_start = le16_to_cpu(xh->xh_free_start);
+
+	trace_ocfs2_defrag_xattr_bucket(
+	     (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
+	     xh_free_start, le16_to_cpu(xh->xh_name_value_len));
+
+	/*
+	 * sort all the entries by their offset.
+	 * the largest will be the first, so that we can
+	 * move them to the end one by one.
+	 */
+	sort(entries, le16_to_cpu(xh->xh_count),
+	     sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe_offset, swap_xe);
+
+	/* Move all name/values to the end of the bucket. */
+	xe = xh->xh_entries;
+	end = OCFS2_XATTR_BUCKET_SIZE;
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
+		offset = le16_to_cpu(xe->xe_name_offset);
+		len = namevalue_size_xe(xe);
+
+		/*
+		 * We must make sure that the name/value pair
+		 * exist in the same block. So adjust end to
+		 * the previous block end if needed.
+		 */
+		if (((end - len) / blocksize !=
+			(end - 1) / blocksize))
+			end = end - end % blocksize;
+
+		if (end > offset + len) {
+			memmove(bucket_buf + end - len,
+				bucket_buf + offset, len);
+			xe->xe_name_offset = cpu_to_le16(end - len);
+		}
+
+		mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
+				"bucket %llu\n", (unsigned long long)blkno);
+
+		end -= len;
+	}
+
+	mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
+			"bucket %llu\n", (unsigned long long)blkno);
+
+	if (xh_free_start == end)
+		goto out;
+
+	memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
+	xh->xh_free_start = cpu_to_le16(end);
+
+	/* sort the entries by their name_hash. */
+	sort(entries, le16_to_cpu(xh->xh_count),
+	     sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+
+	buf = bucket_buf;
+	for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+		memcpy(bucket_block(bucket, i), buf, blocksize);
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+
+out:
+	kfree(bucket_buf);
+	return ret;
+}
+
+/*
+ * prev_blkno points to the start of an existing extent.  new_blkno
+ * points to a newly allocated extent.  Because we know each of our
+ * clusters contains more than bucket, we can easily split one cluster
+ * at a bucket boundary.  So we take the last cluster of the existing
+ * extent and split it down the middle.  We move the last half of the
+ * buckets in the last cluster of the existing extent over to the new
+ * extent.
+ *
+ * first_bh is the buffer at prev_blkno so we can update the existing
+ * extent's bucket count.  header_bh is the bucket were we were hoping
+ * to insert our xattr.  If the bucket move places the target in the new
+ * extent, we'll update first_bh and header_bh after modifying the old
+ * extent.
+ *
+ * first_hash will be set as the 1st xe's name_hash in the new extent.
+ */
+static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
+					       handle_t *handle,
+					       struct ocfs2_xattr_bucket *first,
+					       struct ocfs2_xattr_bucket *target,
+					       u64 new_blkno,
+					       u32 num_clusters,
+					       u32 *first_hash)
+{
+	int ret;
+	struct super_block *sb = inode->i_sb;
+	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
+	int to_move = num_buckets / 2;
+	u64 src_blkno;
+	u64 last_cluster_blkno = bucket_blkno(first) +
+		((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
+
+	BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
+
+	trace_ocfs2_mv_xattr_bucket_cross_cluster(
+				(unsigned long long)last_cluster_blkno,
+				(unsigned long long)new_blkno);
+
+	ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
+				     last_cluster_blkno, new_blkno,
+				     to_move, first_hash);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* This is the first bucket that got moved */
+	src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
+
+	/*
+	 * If the target bucket was part of the moved buckets, we need to
+	 * update first and target.
+	 */
+	if (bucket_blkno(target) >= src_blkno) {
+		/* Find the block for the new target bucket */
+		src_blkno = new_blkno +
+			(bucket_blkno(target) - src_blkno);
+
+		ocfs2_xattr_bucket_relse(first);
+		ocfs2_xattr_bucket_relse(target);
+
+		/*
+		 * These shouldn't fail - the buffers are in the
+		 * journal from ocfs2_cp_xattr_bucket().
+		 */
+		ret = ocfs2_read_xattr_bucket(first, new_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		ret = ocfs2_read_xattr_bucket(target, src_blkno);
+		if (ret)
+			mlog_errno(ret);
+
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Find the suitable pos when we divide a bucket into 2.
+ * We have to make sure the xattrs with the same hash value exist
+ * in the same bucket.
+ *
+ * If this ocfs2_xattr_header covers more than one hash value, find a
+ * place where the hash value changes.  Try to find the most even split.
+ * The most common case is that all entries have different hash values,
+ * and the first check we make will find a place to split.
+ */
+static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh)
+{
+	struct ocfs2_xattr_entry *entries = xh->xh_entries;
+	int count = le16_to_cpu(xh->xh_count);
+	int delta, middle = count / 2;
+
+	/*
+	 * We start at the middle.  Each step gets farther away in both
+	 * directions.  We therefore hit the change in hash value
+	 * nearest to the middle.  Note that this loop does not execute for
+	 * count < 2.
+	 */
+	for (delta = 0; delta < middle; delta++) {
+		/* Let's check delta earlier than middle */
+		if (cmp_xe(&entries[middle - delta - 1],
+			   &entries[middle - delta]))
+			return middle - delta;
+
+		/* For even counts, don't walk off the end */
+		if ((middle + delta + 1) == count)
+			continue;
+
+		/* Now try delta past middle */
+		if (cmp_xe(&entries[middle + delta],
+			   &entries[middle + delta + 1]))
+			return middle + delta + 1;
+	}
+
+	/* Every entry had the same hash */
+	return count;
+}
+
+/*
+ * Move some xattrs in old bucket(blk) to new bucket(new_blk).
+ * first_hash will record the 1st hash of the new bucket.
+ *
+ * Normally half of the xattrs will be moved.  But we have to make
+ * sure that the xattrs with the same hash value are stored in the
+ * same bucket. If all the xattrs in this bucket have the same hash
+ * value, the new bucket will be initialized as an empty one and the
+ * first_hash will be initialized as (hash_value+1).
+ */
+static int ocfs2_divide_xattr_bucket(struct inode *inode,
+				    handle_t *handle,
+				    u64 blk,
+				    u64 new_blk,
+				    u32 *first_hash,
+				    int new_bucket_head)
+{
+	int ret, i;
+	int count, start, len, name_value_len = 0, name_offset = 0;
+	struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	int blocksize = inode->i_sb->s_blocksize;
+
+	trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk,
+					      (unsigned long long)new_blk);
+
+	s_bucket = ocfs2_xattr_bucket_new(inode);
+	t_bucket = ocfs2_xattr_bucket_new(inode);
+	if (!s_bucket || !t_bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(s_bucket, blk);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Even if !new_bucket_head, we're overwriting t_bucket.  Thus,
+	 * there's no need to read it.
+	 */
+	ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Hey, if we're overwriting t_bucket, what difference does
+	 * ACCESS_CREATE vs ACCESS_WRITE make?  See the comment in the
+	 * same part of ocfs2_cp_xattr_bucket().
+	 */
+	ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+						new_bucket_head ?
+						OCFS2_JOURNAL_ACCESS_CREATE :
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = bucket_xh(s_bucket);
+	count = le16_to_cpu(xh->xh_count);
+	start = ocfs2_xattr_find_divide_pos(xh);
+
+	if (start == count) {
+		xe = &xh->xh_entries[start-1];
+
+		/*
+		 * initialized a new empty bucket here.
+		 * The hash value is set as one larger than
+		 * that of the last entry in the previous bucket.
+		 */
+		for (i = 0; i < t_bucket->bu_blocks; i++)
+			memset(bucket_block(t_bucket, i), 0, blocksize);
+
+		xh = bucket_xh(t_bucket);
+		xh->xh_free_start = cpu_to_le16(blocksize);
+		xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
+		le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
+
+		goto set_num_buckets;
+	}
+
+	/* copy the whole bucket to the new first. */
+	ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
+
+	/* update the new bucket. */
+	xh = bucket_xh(t_bucket);
+
+	/*
+	 * Calculate the total name/value len and xh_free_start for
+	 * the old bucket first.
+	 */
+	name_offset = OCFS2_XATTR_BUCKET_SIZE;
+	name_value_len = 0;
+	for (i = 0; i < start; i++) {
+		xe = &xh->xh_entries[i];
+		name_value_len += namevalue_size_xe(xe);
+		if (le16_to_cpu(xe->xe_name_offset) < name_offset)
+			name_offset = le16_to_cpu(xe->xe_name_offset);
+	}
+
+	/*
+	 * Now begin the modification to the new bucket.
+	 *
+	 * In the new bucket, We just move the xattr entry to the beginning
+	 * and don't touch the name/value. So there will be some holes in the
+	 * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
+	 * called.
+	 */
+	xe = &xh->xh_entries[start];
+	len = sizeof(struct ocfs2_xattr_entry) * (count - start);
+	trace_ocfs2_divide_xattr_bucket_move(len,
+			(int)((char *)xe - (char *)xh),
+			(int)((char *)xh->xh_entries - (char *)xh));
+	memmove((char *)xh->xh_entries, (char *)xe, len);
+	xe = &xh->xh_entries[count - start];
+	len = sizeof(struct ocfs2_xattr_entry) * start;
+	memset((char *)xe, 0, len);
+
+	le16_add_cpu(&xh->xh_count, -start);
+	le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
+
+	/* Calculate xh_free_start for the new bucket. */
+	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (le16_to_cpu(xe->xe_name_offset) <
+		    le16_to_cpu(xh->xh_free_start))
+			xh->xh_free_start = xe->xe_name_offset;
+	}
+
+set_num_buckets:
+	/* set xh->xh_num_buckets for the new xh. */
+	if (new_bucket_head)
+		xh->xh_num_buckets = cpu_to_le16(1);
+	else
+		xh->xh_num_buckets = 0;
+
+	ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
+
+	/* store the first_hash of the new bucket. */
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+
+	/*
+	 * Now only update the 1st block of the old bucket.  If we
+	 * just added a new empty bucket, there is no need to modify
+	 * it.
+	 */
+	if (start == count)
+		goto out;
+
+	xh = bucket_xh(s_bucket);
+	memset(&xh->xh_entries[start], 0,
+	       sizeof(struct ocfs2_xattr_entry) * (count - start));
+	xh->xh_count = cpu_to_le16(start);
+	xh->xh_free_start = cpu_to_le16(name_offset);
+	xh->xh_name_value_len = cpu_to_le16(name_value_len);
+
+	ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
+
+out:
+	ocfs2_xattr_bucket_free(s_bucket);
+	ocfs2_xattr_bucket_free(t_bucket);
+
+	return ret;
+}
+
+/*
+ * Copy xattr from one bucket to another bucket.
+ *
+ * The caller must make sure that the journal transaction
+ * has enough space for journaling.
+ */
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+				 handle_t *handle,
+				 u64 s_blkno,
+				 u64 t_blkno,
+				 int t_is_new)
+{
+	int ret;
+	struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
+
+	BUG_ON(s_blkno == t_blkno);
+
+	trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno,
+				    (unsigned long long)t_blkno,
+				    t_is_new);
+
+	s_bucket = ocfs2_xattr_bucket_new(inode);
+	t_bucket = ocfs2_xattr_bucket_new(inode);
+	if (!s_bucket || !t_bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
+	if (ret)
+		goto out;
+
+	/*
+	 * Even if !t_is_new, we're overwriting t_bucket.  Thus,
+	 * there's no need to read it.
+	 */
+	ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
+	if (ret)
+		goto out;
+
+	/*
+	 * Hey, if we're overwriting t_bucket, what difference does
+	 * ACCESS_CREATE vs ACCESS_WRITE make?  Well, if we allocated a new
+	 * cluster to fill, we came here from
+	 * ocfs2_mv_xattr_buckets(), and it is really new -
+	 * ACCESS_CREATE is required.  But we also might have moved data
+	 * out of t_bucket before extending back into it.
+	 * ocfs2_add_new_xattr_bucket() can do this - its call to
+	 * ocfs2_add_new_xattr_cluster() may have created a new extent
+	 * and copied out the end of the old extent.  Then it re-extends
+	 * the old extent back to create space for new xattrs.  That's
+	 * how we get here, and the bucket isn't really new.
+	 */
+	ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+						t_is_new ?
+						OCFS2_JOURNAL_ACCESS_CREATE :
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret)
+		goto out;
+
+	ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
+
+out:
+	ocfs2_xattr_bucket_free(t_bucket);
+	ocfs2_xattr_bucket_free(s_bucket);
+
+	return ret;
+}
+
+/*
+ * src_blk points to the start of an existing extent.  last_blk points to
+ * last cluster in that extent.  to_blk points to a newly allocated
+ * extent.  We copy the buckets from the cluster at last_blk to the new
+ * extent.  If start_bucket is non-zero, we skip that many buckets before
+ * we start copying.  The new extent's xh_num_buckets gets set to the
+ * number of buckets we copied.  The old extent's xh_num_buckets shrinks
+ * by the same amount.
+ */
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+				  u64 src_blk, u64 last_blk, u64 to_blk,
+				  unsigned int start_bucket,
+				  u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+	struct ocfs2_xattr_bucket *old_first, *new_first;
+
+	trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk,
+				     (unsigned long long)to_blk);
+
+	BUG_ON(start_bucket >= num_buckets);
+	if (start_bucket) {
+		num_buckets -= start_bucket;
+		last_blk += (start_bucket * blks_per_bucket);
+	}
+
+	/* The first bucket of the original extent */
+	old_first = ocfs2_xattr_bucket_new(inode);
+	/* The first bucket of the new extent */
+	new_first = ocfs2_xattr_bucket_new(inode);
+	if (!old_first || !new_first) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(old_first, src_blk);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We need to update the first bucket of the old extent and all
+	 * the buckets going to the new extent.
+	 */
+	credits = ((num_buckets + 1) * blks_per_bucket);
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < num_buckets; i++) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle,
+					    last_blk + (i * blks_per_bucket),
+					    to_blk + (i * blks_per_bucket),
+					    1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Get the new bucket ready before we dirty anything
+	 * (This actually shouldn't fail, because we already dirtied
+	 * it once in ocfs2_cp_xattr_bucket()).
+	 */
+	ret = ocfs2_read_xattr_bucket(new_first, to_blk);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Now update the headers */
+	le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
+	ocfs2_xattr_bucket_journal_dirty(handle, old_first);
+
+	bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
+	ocfs2_xattr_bucket_journal_dirty(handle, new_first);
+
+	if (first_hash)
+		*first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
+
+out:
+	ocfs2_xattr_bucket_free(new_first);
+	ocfs2_xattr_bucket_free(old_first);
+	return ret;
+}
+
+/*
+ * Move some xattrs in this cluster to the new cluster.
+ * This function should only be called when bucket size == cluster size.
+ * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
+ */
+static int ocfs2_divide_xattr_cluster(struct inode *inode,
+				      handle_t *handle,
+				      u64 prev_blk,
+				      u64 new_blk,
+				      u32 *first_hash)
+{
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int ret, credits = 2 * blk_per_bucket;
+
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
+
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* Move half of the xattr in start_blk to the next bucket. */
+	return  ocfs2_divide_xattr_bucket(inode, handle, prev_blk,
+					  new_blk, first_hash, 1);
+}
+
+/*
+ * Move some xattrs from the old cluster to the new one since they are not
+ * contiguous in ocfs2 xattr tree.
+ *
+ * new_blk starts a new separate cluster, and we will move some xattrs from
+ * prev_blk to it. v_start will be set as the first name hash value in this
+ * new cluster so that it can be used as e_cpos during tree insertion and
+ * don't collide with our original b-tree operations. first_bh and header_bh
+ * will also be updated since they will be used in ocfs2_extend_xattr_bucket
+ * to extend the insert bucket.
+ *
+ * The problem is how much xattr should we move to the new one and when should
+ * we update first_bh and header_bh?
+ * 1. If cluster size > bucket size, that means the previous cluster has more
+ *    than 1 bucket, so just move half nums of bucket into the new cluster and
+ *    update the first_bh and header_bh if the insert bucket has been moved
+ *    to the new cluster.
+ * 2. If cluster_size == bucket_size:
+ *    a) If the previous extent rec has more than one cluster and the insert
+ *       place isn't in the last cluster, copy the entire last cluster to the
+ *       new one. This time, we don't need to upate the first_bh and header_bh
+ *       since they will not be moved into the new cluster.
+ *    b) Otherwise, move the bottom half of the xattrs in the last cluster into
+ *       the new one. And we set the extend flag to zero if the insert place is
+ *       moved into the new allocated cluster since no extend is needed.
+ */
+static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
+					    handle_t *handle,
+					    struct ocfs2_xattr_bucket *first,
+					    struct ocfs2_xattr_bucket *target,
+					    u64 new_blk,
+					    u32 prev_clusters,
+					    u32 *v_start,
+					    int *extend)
+{
+	int ret;
+
+	trace_ocfs2_adjust_xattr_cross_cluster(
+			(unsigned long long)bucket_blkno(first),
+			(unsigned long long)new_blk, prev_clusters);
+
+	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
+		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
+							  handle,
+							  first, target,
+							  new_blk,
+							  prev_clusters,
+							  v_start);
+		if (ret)
+			mlog_errno(ret);
+	} else {
+		/* The start of the last cluster in the first extent */
+		u64 last_blk = bucket_blkno(first) +
+			((prev_clusters - 1) *
+			 ocfs2_clusters_to_blocks(inode->i_sb, 1));
+
+		if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
+			ret = ocfs2_mv_xattr_buckets(inode, handle,
+						     bucket_blkno(first),
+						     last_blk, new_blk, 0,
+						     v_start);
+			if (ret)
+				mlog_errno(ret);
+		} else {
+			ret = ocfs2_divide_xattr_cluster(inode, handle,
+							 last_blk, new_blk,
+							 v_start);
+			if (ret)
+				mlog_errno(ret);
+
+			if ((bucket_blkno(target) == last_blk) && extend)
+				*extend = 0;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Add a new cluster for xattr storage.
+ *
+ * If the new cluster is contiguous with the previous one, it will be
+ * appended to the same extent record, and num_clusters will be updated.
+ * If not, we will insert a new extent for it and move some xattrs in
+ * the last cluster into the new allocated one.
+ * We also need to limit the maximum size of a btree leaf, otherwise we'll
+ * lose the benefits of hashing because we'll have to search large leaves.
+ * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize,
+ * if it's bigger).
+ *
+ * first_bh is the first block of the previous extent rec and header_bh
+ * indicates the bucket we will insert the new xattrs. They will be updated
+ * when the header_bh is moved into the new cluster.
+ */
+static int ocfs2_add_new_xattr_cluster(struct inode *inode,
+				       struct buffer_head *root_bh,
+				       struct ocfs2_xattr_bucket *first,
+				       struct ocfs2_xattr_bucket *target,
+				       u32 *num_clusters,
+				       u32 prev_cpos,
+				       int *extend,
+				       struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 prev_clusters = *num_clusters;
+	u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
+	u64 block;
+	handle_t *handle = ctxt->handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_tree et;
+
+	trace_ocfs2_add_new_xattr_cluster_begin(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)bucket_blkno(first),
+		prev_cpos, prev_clusters);
+
+	ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
+
+	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
+				     clusters_to_add, &bit_off, &num_bits);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits);
+
+	if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
+	    (prev_clusters + num_bits) << osb->s_clustersize_bits <=
+	     OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
+		/*
+		 * If this cluster is contiguous with the old one and
+		 * adding this new cluster, we don't surpass the limit of
+		 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be
+		 * initialized and used like other buckets in the previous
+		 * cluster.
+		 * So add it as a contiguous one. The caller will handle
+		 * its init process.
+		 */
+		v_start = prev_cpos + prev_clusters;
+		*num_clusters = prev_clusters + num_bits;
+	} else {
+		ret = ocfs2_adjust_xattr_cross_cluster(inode,
+						       handle,
+						       first,
+						       target,
+						       block,
+						       prev_clusters,
+						       &v_start,
+						       extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	}
+
+	trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block,
+						 v_start, num_bits);
+	ret = ocfs2_insert_extent(handle, &et, v_start, block,
+				  num_bits, 0, ctxt->meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ocfs2_journal_dirty(handle, root_bh);
+
+leave:
+	return ret;
+}
+
+/*
+ * We are given an extent.  'first' is the bucket at the very front of
+ * the extent.  The extent has space for an additional bucket past
+ * bucket_xh(first)->xh_num_buckets.  'target_blkno' is the block number
+ * of the target bucket.  We wish to shift every bucket past the target
+ * down one, filling in that additional space.  When we get back to the
+ * target, we split the target between itself and the now-empty bucket
+ * at target+1 (aka, target_blkno + blks_per_bucket).
+ */
+static int ocfs2_extend_xattr_bucket(struct inode *inode,
+				     handle_t *handle,
+				     struct ocfs2_xattr_bucket *first,
+				     u64 target_blk,
+				     u32 num_clusters)
+{
+	int ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u64 end_blk;
+	u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
+
+	trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk,
+					(unsigned long long)bucket_blkno(first),
+					num_clusters, new_bucket);
+
+	/* The extent must have room for an additional bucket */
+	BUG_ON(new_bucket >=
+	       (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
+
+	/* end_blk points to the last existing bucket */
+	end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
+
+	/*
+	 * end_blk is the start of the last existing bucket.
+	 * Thus, (end_blk - target_blk) covers the target bucket and
+	 * every bucket after it up to, but not including, the last
+	 * existing bucket.  Then we add the last existing bucket, the
+	 * new bucket, and the first bucket (3 * blk_per_bucket).
+	 */
+	credits = (end_blk - target_blk) + (3 * blk_per_bucket);
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_bucket_journal_access(handle, first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (end_blk != target_blk) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
+					    end_blk + blk_per_bucket, 0);
+		if (ret)
+			goto out;
+		end_blk -= blk_per_bucket;
+	}
+
+	/* Move half of the xattr in target_blkno to the next bucket. */
+	ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
+					target_blk + blk_per_bucket, NULL, 0);
+
+	le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
+	ocfs2_xattr_bucket_journal_dirty(handle, first);
+
+out:
+	return ret;
+}
+
+/*
+ * Add new xattr bucket in an extent record and adjust the buckets
+ * accordingly.  xb_bh is the ocfs2_xattr_block, and target is the
+ * bucket we want to insert into.
+ *
+ * In the easy case, we will move all the buckets after target down by
+ * one. Half of target's xattrs will be moved to the next bucket.
+ *
+ * If current cluster is full, we'll allocate a new one.  This may not
+ * be contiguous.  The underlying calls will make sure that there is
+ * space for the insert, shifting buckets around if necessary.
+ * 'target' may be moved by those calls.
+ */
+static int ocfs2_add_new_xattr_bucket(struct inode *inode,
+				      struct buffer_head *xb_bh,
+				      struct ocfs2_xattr_bucket *target,
+				      struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	u32 name_hash =
+		le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int ret, num_buckets, extend = 1;
+	u64 p_blkno;
+	u32 e_cpos, num_clusters;
+	/* The bucket at the front of the extent */
+	struct ocfs2_xattr_bucket *first;
+
+	trace_ocfs2_add_new_xattr_bucket(
+				(unsigned long long)bucket_blkno(target));
+
+	/* The first bucket of the original extent */
+	first = ocfs2_xattr_bucket_new(inode);
+	if (!first) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
+				  &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(first, p_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+	if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
+		/*
+		 * This can move first+target if the target bucket moves
+		 * to the new extent.
+		 */
+		ret = ocfs2_add_new_xattr_cluster(inode,
+						  xb_bh,
+						  first,
+						  target,
+						  &num_clusters,
+						  e_cpos,
+						  &extend,
+						  ctxt);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (extend) {
+		ret = ocfs2_extend_xattr_bucket(inode,
+						ctxt->handle,
+						first,
+						bucket_blkno(target),
+						num_clusters);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	ocfs2_xattr_bucket_free(first);
+
+	return ret;
+}
+
+/*
+ * Truncate the specified xe_off entry in xattr bucket.
+ * bucket is indicated by header_bh and len is the new length.
+ * Both the ocfs2_xattr_value_root and the entry will be updated here.
+ *
+ * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
+ */
+static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
+					     struct ocfs2_xattr_bucket *bucket,
+					     int xe_off,
+					     int len,
+					     struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret, offset;
+	u64 value_blk;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
+
+	xe = &xh->xh_entries[xe_off];
+
+	BUG_ON(!xe || ocfs2_xattr_is_local(xe));
+
+	offset = le16_to_cpu(xe->xe_name_offset) +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+	value_blk = offset / blocksize;
+
+	/* We don't allow ocfs2_xattr_value to be stored in different block. */
+	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
+
+	vb.vb_bh = bucket->bu_bhs[value_blk];
+	BUG_ON(!vb.vb_bh);
+
+	vb.vb_xv = (struct ocfs2_xattr_value_root *)
+		(vb.vb_bh->b_data + offset % blocksize);
+
+	/*
+	 * From here on out we have to dirty the bucket.  The generic
+	 * value calls only modify one of the bucket's bhs, but we need
+	 * to send the bucket at once.  So if they error, they *could* have
+	 * modified something.  We have to assume they did, and dirty
+	 * the whole bucket.  This leaves us in a consistent state.
+	 */
+	trace_ocfs2_xattr_bucket_value_truncate(
+			(unsigned long long)bucket_blkno(bucket), xe_off, len);
+	ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xe->xe_value_size = cpu_to_le64(len);
+
+	ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
+
+out:
+	return ret;
+}
+
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno,
+				  u32 cpos,
+				  u32 len,
+				  void *para)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					  ocfs2_delete_xattr_in_bucket, para);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	trace_ocfs2_rm_xattr_cluster(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)blkno, cpos, len);
+
+	ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
+					       len);
+
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac,
+				  &dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
+	ocfs2_journal_dirty(handle, root_bh);
+
+	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
+	if (ret)
+		mlog_errno(ret);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+/*
+ * check whether the xattr bucket is filled up with the same hash value.
+ * If we want to insert the xattr with the same hash, return -ENOSPC.
+ * If we want to insert a xattr with different hash value, go ahead
+ * and ocfs2_divide_xattr_bucket will handle this.
+ */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+					      struct ocfs2_xattr_bucket *bucket,
+					      const char *name)
+{
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+	if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
+		return 0;
+
+	if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
+	    xh->xh_entries[0].xe_name_hash) {
+		mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+		     "hash = %u\n",
+		     (unsigned long long)bucket_blkno(bucket),
+		     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+
+/*
+ * Try to set the entry in the current bucket.  If we fail, the caller
+ * will handle getting us another bucket.
+ */
+static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xs,
+					struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	struct ocfs2_xa_loc loc;
+
+	trace_ocfs2_xattr_set_entry_bucket(xi->xi_name);
+
+	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+				       xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (!ret) {
+		xs->here = loc.xl_entry;
+		goto out;
+	}
+	if (ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Ok, we need space.  Let's try defragmenting the bucket. */
+	ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+					xs->bucket);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (!ret) {
+		xs->here = loc.xl_entry;
+		goto out;
+	}
+	if (ret != -ENOSPC)
+		mlog_errno(ret);
+
+
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs,
+					     struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+
+	trace_ocfs2_xattr_set_entry_index_block(xi->xi_name);
+
+	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+	if (!ret)
+		goto out;
+	if (ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Ack, need more space.  Let's try to get another bucket! */
+
+	/*
+	 * We do not allow for overlapping ranges between buckets. And
+	 * the maximum number of collisions we will allow for then is
+	 * one bucket's worth, so check it here whether we need to
+	 * add a new bucket for the insert.
+	 */
+	ret = ocfs2_check_xattr_bucket_collision(inode,
+						 xs->bucket,
+						 xi->xi_name);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_add_new_xattr_bucket(inode,
+					 xs->xattr_bh,
+					 xs->bucket,
+					 ctxt);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * ocfs2_add_new_xattr_bucket() will have updated
+	 * xs->bucket if it moved, but it will not have updated
+	 * any of the other search fields.  Thus, we drop it and
+	 * re-search.  Everything should be cached, so it'll be
+	 * quick.
+	 */
+	ocfs2_xattr_bucket_relse(xs->bucket);
+	ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+					   xi->xi_name_index,
+					   xi->xi_name, xs);
+	if (ret && ret != -ENODATA)
+		goto out;
+	xs->not_found = ret;
+
+	/* Ok, we have a new bucket, let's try again */
+	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+	if (ret && (ret != -ENOSPC))
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					void *para)
+{
+	int ret = 0, ref_credits;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+	u16 i;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+	int credits = ocfs2_remove_extent_credits(osb->sb) +
+		ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_rm_xattr_bucket_para *args =
+			(struct ocfs2_rm_xattr_bucket_para *)para;
+
+	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
+						      i, &xv, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
+							 args->ref_ci,
+							 args->ref_root_bh,
+							 &ctxt.meta_ac,
+							 &ref_credits);
+
+		ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
+		if (IS_ERR(ctxt.handle)) {
+			ret = PTR_ERR(ctxt.handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
+							i, 0, &ctxt);
+
+		ocfs2_commit_trans(osb, ctxt.handle);
+		if (ctxt.meta_ac) {
+			ocfs2_free_alloc_context(ctxt.meta_ac);
+			ctxt.meta_ac = NULL;
+		}
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
+	return ret;
+}
+
+/*
+ * Whenever we modify a xattr value root in the bucket(e.g, CoW
+ * or change the extent record flag), we need to recalculate
+ * the metaecc for the whole bucket. So it is done here.
+ *
+ * Note:
+ * We have to give the extra credits for the caller.
+ */
+static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
+					    handle_t *handle,
+					    void *para)
+{
+	int ret;
+	struct ocfs2_xattr_bucket *bucket =
+			(struct ocfs2_xattr_bucket *)para;
+
+	ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+
+	return 0;
+}
+
+/*
+ * Special action we need if the xattr value is refcounted.
+ *
+ * 1. If the xattr is refcounted, lock the tree.
+ * 2. CoW the xattr if we are setting the new value and the value
+ *    will be stored outside.
+ * 3. In other case, decrease_refcount will work for us, so just
+ *    lock the refcount tree, calculate the meta and credits is OK.
+ *
+ * We have to do CoW before ocfs2_init_xattr_set_ctxt since
+ * currently CoW is a completed transaction, while this function
+ * will also lock the allocators and let us deadlock. So we will
+ * CoW the whole xattr value.
+ */
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+					struct ocfs2_dinode *di,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xis,
+					struct ocfs2_xattr_search *xbs,
+					struct ocfs2_refcount_tree **ref_tree,
+					int *meta_add,
+					int *credits)
+{
+	int ret = 0;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_xattr_entry *xe;
+	char *base;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+	int name_offset, name_len;
+	struct ocfs2_xattr_value_buf vb;
+	struct ocfs2_xattr_bucket *bucket = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_post_refcount refcount;
+	struct ocfs2_post_refcount *p = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+
+	if (!xis->not_found) {
+		xe = xis->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		base = xis->base;
+		vb.vb_bh = xis->inode_bh;
+		vb.vb_access = ocfs2_journal_access_di;
+	} else {
+		int i, block_off = 0;
+		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+		xe = xbs->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		i = xbs->here - xbs->header->xh_entries;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+							bucket_xh(xbs->bucket),
+							i, &block_off,
+							&name_offset);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+			base = bucket_block(xbs->bucket, block_off);
+			vb.vb_bh = xbs->bucket->bu_bhs[block_off];
+			vb.vb_access = ocfs2_journal_access;
+
+			if (ocfs2_meta_ecc(osb)) {
+				/*create parameters for ocfs2_post_refcount. */
+				bucket = xbs->bucket;
+				refcount.credits = bucket->bu_blocks;
+				refcount.para = bucket;
+				refcount.func =
+					ocfs2_xattr_bucket_post_refcount;
+				p = &refcount;
+			}
+		} else {
+			base = xbs->base;
+			vb.vb_bh = xbs->xattr_bh;
+			vb.vb_access = ocfs2_journal_access_xb;
+		}
+	}
+
+	if (ocfs2_xattr_is_local(xe))
+		goto out;
+
+	vb.vb_xv = (struct ocfs2_xattr_value_root *)
+				(base + name_offset + name_len);
+
+	ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+				       &num_clusters, &vb.vb_xv->xr_list,
+				       &ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We just need to check the 1st extent record, since we always
+	 * CoW the whole xattr. So there shouldn't be a xattr with
+	 * some REFCOUNT extent recs after the 1st one.
+	 */
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * If we are deleting the xattr or the new size will be stored inside,
+	 * cool, leave it there, the xattr truncate process will remove them
+	 * for us(it still needs the refcount tree lock and the meta, credits).
+	 * And the worse case is that every cluster truncate will split the
+	 * refcount tree, and make the original extent become 3. So we will need
+	 * 2 * cluster more extent recs at most.
+	 */
+	if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
+
+		ret = ocfs2_refcounted_xattr_delete_need(inode,
+							 &(*ref_tree)->rf_ci,
+							 ref_root_bh, vb.vb_xv,
+							 meta_add, credits);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
+				       *ref_tree, ref_root_bh, 0,
+				       le32_to_cpu(vb.vb_xv->xr_clusters), p);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
+
+/*
+ * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
+ * The physical clusters will be added to refcount tree.
+ */
+static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
+				struct ocfs2_xattr_value_root *xv,
+				struct ocfs2_extent_tree *value_et,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_post_refcount *refcount)
+{
+	int ret = 0;
+	u32 clusters = le32_to_cpu(xv->xr_clusters);
+	u32 cpos, p_cluster, num_clusters;
+	struct ocfs2_extent_list *el = &xv->xr_list;
+	unsigned int ext_flags;
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, el, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		cpos += num_clusters;
+		if ((ext_flags & OCFS2_EXT_REFCOUNTED))
+			continue;
+
+		BUG_ON(!p_cluster);
+
+		ret = ocfs2_add_refcount_flag(inode, value_et,
+					      ref_ci, ref_root_bh,
+					      cpos - num_clusters,
+					      p_cluster, num_clusters,
+					      dealloc, refcount);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Given a normal ocfs2_xattr_header, refcount all the entries which
+ * have value stored outside.
+ * Used for xattrs stored in inode and ocfs2_xattr_block.
+ */
+static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
+				struct ocfs2_xattr_value_buf *vb,
+				struct ocfs2_xattr_header *header,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_extent_tree et;
+	int i, ret = 0;
+
+	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+		xe = &header->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		xv = (struct ocfs2_xattr_value_root *)((void *)header +
+			le16_to_cpu(xe->xe_name_offset) +
+			OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+		vb->vb_xv = xv;
+		ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+		ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
+							ref_ci, ref_root_bh,
+							dealloc, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
+				struct buffer_head *fe_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
+				(fe_bh->b_data + inode->i_sb->s_blocksize -
+				le16_to_cpu(di->i_xattr_inline_size));
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = fe_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+						  ref_ci, ref_root_bh, dealloc);
+}
+
+struct ocfs2_xattr_tree_value_refcount_para {
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_cached_dealloc_ctxt *dealloc;
+};
+
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+					   struct ocfs2_xattr_bucket *bucket,
+					   int offset,
+					   struct ocfs2_xattr_value_root **xv,
+					   struct buffer_head **bh)
+{
+	int ret, block_off, name_offset;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+	struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+	void *base;
+
+	ret = ocfs2_xattr_bucket_get_name_value(sb,
+						bucket_xh(bucket),
+						offset,
+						&block_off,
+						&name_offset);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	base = bucket_block(bucket, block_off);
+
+	*xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
+			 OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+	if (bh)
+		*bh = bucket->bu_bhs[block_off];
+out:
+	return ret;
+}
+
+/*
+ * For a given xattr bucket, refcount all the entries which
+ * have value stored outside.
+ */
+static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
+					     struct ocfs2_xattr_bucket *bucket,
+					     void *para)
+{
+	int i, ret = 0;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_xattr_tree_value_refcount_para *ref =
+			(struct ocfs2_xattr_tree_value_refcount_para *)para;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
+	struct ocfs2_post_refcount refcount = {
+		.credits = bucket->bu_blocks,
+		.para = bucket,
+		.func = ocfs2_xattr_bucket_post_refcount,
+	};
+	struct ocfs2_post_refcount *p = NULL;
+
+	/* We only need post_refcount if we support metaecc. */
+	if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
+		p = &refcount;
+
+	trace_ocfs2_xattr_bucket_value_refcount(
+				(unsigned long long)bucket_blkno(bucket),
+				le16_to_cpu(xh->xh_count));
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
+						      &vb.vb_xv, &vb.vb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_init_xattr_value_extent_tree(&et,
+						   INODE_CACHE(inode), &vb);
+
+		ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
+							&et, ref->ref_ci,
+							ref->ref_root_bh,
+							ref->dealloc, p);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+
+}
+
+static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
+				     struct buffer_head *root_bh,
+				     u64 blkno, u32 cpos, u32 len, void *para)
+{
+	return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					   ocfs2_xattr_bucket_value_refcount,
+					   para);
+}
+
+static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
+				struct buffer_head *blk_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	struct ocfs2_xattr_block *xb =
+				(struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+		struct ocfs2_xattr_value_buf vb = {
+			.vb_bh = blk_bh,
+			.vb_access = ocfs2_journal_access_xb,
+		};
+
+		ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+							 ref_ci, ref_root_bh,
+							 dealloc);
+	} else {
+		struct ocfs2_xattr_tree_value_refcount_para para = {
+			.ref_ci = ref_ci,
+			.ref_root_bh = ref_root_bh,
+			.dealloc = dealloc,
+		};
+
+		ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+						ocfs2_refcount_xattr_tree_rec,
+						&para);
+	}
+
+	return ret;
+}
+
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     struct ocfs2_caching_info *ref_ci,
+				     struct buffer_head *ref_root_bh,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
+							 ref_ci, ref_root_bh,
+							 dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!di->i_xattr_loc)
+		goto out;
+
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
+						ref_root_bh, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+	brelse(blk_bh);
+out:
+
+	return ret;
+}
+
+typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe);
+/*
+ * Store the information we need in xattr reflink.
+ * old_bh and new_bh are inode bh for the old and new inode.
+ */
+struct ocfs2_xattr_reflink {
+	struct inode *old_inode;
+	struct inode *new_inode;
+	struct buffer_head *old_bh;
+	struct buffer_head *new_bh;
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_cached_dealloc_ctxt *dealloc;
+	should_xattr_reflinked *xattr_reflinked;
+};
+
+/*
+ * Given a xattr header and xe offset,
+ * return the proper xv and the corresponding bh.
+ * xattr in inode, block and xattr tree have different implementaions.
+ */
+typedef int (get_xattr_value_root)(struct super_block *sb,
+				   struct buffer_head *bh,
+				   struct ocfs2_xattr_header *xh,
+				   int offset,
+				   struct ocfs2_xattr_value_root **xv,
+				   struct buffer_head **ret_bh,
+				   void *para);
+
+/*
+ * Calculate all the xattr value root metadata stored in this xattr header and
+ * credits we need if we create them from the scratch.
+ * We use get_xattr_value_root so that all types of xattr container can use it.
+ */
+static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
+					     struct buffer_head *bh,
+					     struct ocfs2_xattr_header *xh,
+					     int *metas, int *credits,
+					     int *num_recs,
+					     get_xattr_value_root *func,
+					     void *para)
+{
+	int i, ret = 0;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe;
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = func(sb, bh, xh, i, &xv, NULL, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		*metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
+			  le16_to_cpu(xv->xr_list.l_next_free_rec);
+
+		*credits += ocfs2_calc_extend_credits(sb,
+						&def_xv.xv.xr_list);
+
+		/*
+		 * If the value is a tree with depth > 1, We don't go deep
+		 * to the extent block, so just calculate a maximum record num.
+		 */
+		if (!xv->xr_list.l_tree_depth)
+			*num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec);
+		else
+			*num_recs += ocfs2_clusters_for_bytes(sb,
+							      XATTR_SIZE_MAX);
+	}
+
+	return ret;
+}
+
+/* Used by xattr inode and block to return the right xv and buffer_head. */
+static int ocfs2_get_xattr_value_root(struct super_block *sb,
+				      struct buffer_head *bh,
+				      struct ocfs2_xattr_header *xh,
+				      int offset,
+				      struct ocfs2_xattr_value_root **xv,
+				      struct buffer_head **ret_bh,
+				      void *para)
+{
+	struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+
+	*xv = (struct ocfs2_xattr_value_root *)((void *)xh +
+		le16_to_cpu(xe->xe_name_offset) +
+		OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+	if (ret_bh)
+		*ret_bh = bh;
+
+	return 0;
+}
+
+/*
+ * Lock the meta_ac and caculate how much credits we need for reflink xattrs.
+ * It is only used for inline xattr and xattr block.
+ */
+static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
+					struct ocfs2_xattr_header *xh,
+					struct buffer_head *ref_root_bh,
+					int *credits,
+					struct ocfs2_alloc_context **meta_ac)
+{
+	int ret, meta_add = 0, num_recs = 0;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	*credits = 0;
+
+	ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
+						&meta_add, credits, &num_recs,
+						ocfs2_get_xattr_value_root,
+						NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We need to add/modify num_recs in refcount tree, so just calculate
+	 * an approximate number we need for refcount tree change.
+	 * Sometimes we need to split the tree, and after split,  half recs
+	 * will be moved to the new block, and a new block can only provide
+	 * half number of recs. So we multiple new blocks by 2.
+	 */
+	num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+	meta_add += num_recs;
+	*credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+			    le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+	else
+		*credits += 1;
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+/*
+ * Given a xattr header, reflink all the xattrs in this container.
+ * It can be used for inode, block and bucket.
+ *
+ * NOTE:
+ * Before we call this function, the caller has memcpy the xattr in
+ * old_xh to the new_xh.
+ *
+ * If args.xattr_reflinked is set, call it to decide whether the xe should
+ * be reflinked or not. If not, remove it from the new xattr header.
+ */
+static int ocfs2_reflink_xattr_header(handle_t *handle,
+				      struct ocfs2_xattr_reflink *args,
+				      struct buffer_head *old_bh,
+				      struct ocfs2_xattr_header *xh,
+				      struct buffer_head *new_bh,
+				      struct ocfs2_xattr_header *new_xh,
+				      struct ocfs2_xattr_value_buf *vb,
+				      struct ocfs2_alloc_context *meta_ac,
+				      get_xattr_value_root *func,
+				      void *para)
+{
+	int ret = 0, i, j;
+	struct super_block *sb = args->old_inode->i_sb;
+	struct buffer_head *value_bh;
+	struct ocfs2_xattr_entry *xe, *last;
+	struct ocfs2_xattr_value_root *xv, *new_xv;
+	struct ocfs2_extent_tree data_et;
+	u32 clusters, cpos, p_cluster, num_clusters;
+	unsigned int ext_flags = 0;
+
+	trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
+					 le16_to_cpu(xh->xh_count));
+
+	last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
+	for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
+		xe = &xh->xh_entries[i];
+
+		if (args->xattr_reflinked && !args->xattr_reflinked(xe)) {
+			xe = &new_xh->xh_entries[j];
+
+			le16_add_cpu(&new_xh->xh_count, -1);
+			if (new_xh->xh_count) {
+				memmove(xe, xe + 1,
+					(void *)last - (void *)xe);
+				memset(last, 0,
+				       sizeof(struct ocfs2_xattr_entry));
+			}
+
+			/*
+			 * We don't want j to increase in the next round since
+			 * it is already moved ahead.
+			 */
+			j--;
+			continue;
+		}
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = func(sb, old_bh, xh, i, &xv, NULL, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * For the xattr which has l_tree_depth = 0, all the extent
+		 * recs have already be copied to the new xh with the
+		 * propriate OCFS2_EXT_REFCOUNTED flag we just need to
+		 * increase the refount count int the refcount tree.
+		 *
+		 * For the xattr which has l_tree_depth > 0, we need
+		 * to initialize it to the empty default value root,
+		 * and then insert the extents one by one.
+		 */
+		if (xv->xr_list.l_tree_depth) {
+			memcpy(new_xv, &def_xv, sizeof(def_xv));
+			vb->vb_xv = new_xv;
+			vb->vb_bh = value_bh;
+			ocfs2_init_xattr_value_extent_tree(&data_et,
+					INODE_CACHE(args->new_inode), vb);
+		}
+
+		clusters = le32_to_cpu(xv->xr_clusters);
+		cpos = 0;
+		while (cpos < clusters) {
+			ret = ocfs2_xattr_get_clusters(args->old_inode,
+						       cpos,
+						       &p_cluster,
+						       &num_clusters,
+						       &xv->xr_list,
+						       &ext_flags);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			BUG_ON(!p_cluster);
+
+			if (xv->xr_list.l_tree_depth) {
+				ret = ocfs2_insert_extent(handle,
+						&data_et, cpos,
+						ocfs2_clusters_to_blocks(
+							args->old_inode->i_sb,
+							p_cluster),
+						num_clusters, ext_flags,
+						meta_ac);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+			}
+
+			ret = ocfs2_increase_refcount(handle, args->ref_ci,
+						      args->ref_root_bh,
+						      p_cluster, num_clusters,
+						      meta_ac, args->dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			cpos += num_clusters;
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
+{
+	int ret = 0, credits = 0;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
+	int inline_size = le16_to_cpu(di->i_xattr_inline_size);
+	int header_off = osb->sb->s_blocksize - inline_size;
+	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
+					(args->old_bh->b_data + header_off);
+	struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
+					(args->new_bh->b_data + header_off);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_inode_info *new_oi;
+	struct ocfs2_dinode *new_di;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = args->new_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+						  &credits, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
+				      args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memcpy(args->new_bh->b_data + header_off,
+	       args->old_bh->b_data + header_off, inline_size);
+
+	new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+	new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
+
+	ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
+					 args->new_bh, new_xh, &vb, meta_ac,
+					 ocfs2_get_xattr_value_root, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_oi = OCFS2_I(args->new_inode);
+	/*
+	 * Adjust extent record count to reserve space for extended attribute.
+	 * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+	 */
+	if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+	    !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+		struct ocfs2_extent_list *el = &new_di->id2.i_list;
+		le16_add_cpu(&el->l_count, -(inline_size /
+					sizeof(struct ocfs2_extent_rec)));
+	}
+	spin_lock(&new_oi->ip_lock);
+	new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
+	new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+	spin_unlock(&new_oi->ip_lock);
+
+	ocfs2_journal_dirty(handle, args->new_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_create_empty_xattr_block(struct inode *inode,
+					  struct buffer_head *fe_bh,
+					  struct buffer_head **ret_bh,
+					  int indexed)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	trace_ocfs2_create_empty_xattr_block(
+				(unsigned long long)fe_bh->b_blocknr, indexed);
+	ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
+				       ret_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, ctxt.handle);
+out:
+	ocfs2_free_alloc_context(ctxt.meta_ac);
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
+				     struct buffer_head *blk_bh,
+				     struct buffer_head *new_blk_bh)
+{
+	int ret = 0, credits = 0;
+	handle_t *handle;
+	struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
+	struct ocfs2_dinode *new_di;
+	struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
+	int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)blk_bh->b_data;
+	struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
+	struct ocfs2_xattr_block *new_xb =
+			(struct ocfs2_xattr_block *)new_blk_bh->b_data;
+	struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = new_blk_bh,
+		.vb_access = ocfs2_journal_access_xb,
+	};
+
+	ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+						  &credits, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* One more credits in case we need to add xattr flags in new inode. */
+	handle = ocfs2_start_trans(osb, credits + 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+		ret = ocfs2_journal_access_di(handle,
+					      INODE_CACHE(args->new_inode),
+					      args->new_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
+				      new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
+	       osb->sb->s_blocksize - header_off);
+
+	ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
+					 new_blk_bh, new_xh, &vb, meta_ac,
+					 ocfs2_get_xattr_value_root, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_journal_dirty(handle, new_blk_bh);
+
+	if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+		new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+		spin_lock(&new_oi->ip_lock);
+		new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+		new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+		spin_unlock(&new_oi->ip_lock);
+
+		ocfs2_journal_dirty(handle, args->new_bh);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+struct ocfs2_reflink_xattr_tree_args {
+	struct ocfs2_xattr_reflink *reflink;
+	struct buffer_head *old_blk_bh;
+	struct buffer_head *new_blk_bh;
+	struct ocfs2_xattr_bucket *old_bucket;
+	struct ocfs2_xattr_bucket *new_bucket;
+};
+
+/*
+ * NOTE:
+ * We have to handle the case that both old bucket and new bucket
+ * will call this function to get the right ret_bh.
+ * So The caller must give us the right bh.
+ */
+static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_header *xh,
+					int offset,
+					struct ocfs2_xattr_value_root **xv,
+					struct buffer_head **ret_bh,
+					void *para)
+{
+	struct ocfs2_reflink_xattr_tree_args *args =
+			(struct ocfs2_reflink_xattr_tree_args *)para;
+	struct ocfs2_xattr_bucket *bucket;
+
+	if (bh == args->old_bucket->bu_bhs[0])
+		bucket = args->old_bucket;
+	else
+		bucket = args->new_bucket;
+
+	return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+					       xv, ret_bh);
+}
+
+struct ocfs2_value_tree_metas {
+	int num_metas;
+	int credits;
+	int num_recs;
+};
+
+static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_header *xh,
+					int offset,
+					struct ocfs2_xattr_value_root **xv,
+					struct buffer_head **ret_bh,
+					void *para)
+{
+	struct ocfs2_xattr_bucket *bucket =
+				(struct ocfs2_xattr_bucket *)para;
+
+	return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+					       xv, ret_bh);
+}
+
+static int ocfs2_calc_value_tree_metas(struct inode *inode,
+				      struct ocfs2_xattr_bucket *bucket,
+				      void *para)
+{
+	struct ocfs2_value_tree_metas *metas =
+			(struct ocfs2_value_tree_metas *)para;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+
+	/* Add the credits for this bucket first. */
+	metas->credits += bucket->bu_blocks;
+	return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0],
+					xh, &metas->num_metas,
+					&metas->credits, &metas->num_recs,
+					ocfs2_value_tree_metas_in_bucket,
+					bucket);
+}
+
+/*
+ * Given a xattr extent rec starting from blkno and having len clusters,
+ * iterate all the buckets calculate how much metadata we need for reflinking
+ * all the ocfs2_xattr_value_root and lock the allocators accordingly.
+ */
+static int ocfs2_lock_reflink_xattr_rec_allocators(
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *xt_et,
+				u64 blkno, u32 len, int *credits,
+				struct ocfs2_alloc_context **meta_ac,
+				struct ocfs2_alloc_context **data_ac)
+{
+	int ret, num_free_extents;
+	struct ocfs2_value_tree_metas metas;
+	struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+
+	memset(&metas, 0, sizeof(metas));
+
+	ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
+					  ocfs2_calc_value_tree_metas, &metas);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*credits = metas.credits;
+
+	/*
+	 * Calculate we need for refcount tree change.
+	 *
+	 * We need to add/modify num_recs in refcount tree, so just calculate
+	 * an approximate number we need for refcount tree change.
+	 * Sometimes we need to split the tree, and after split,  half recs
+	 * will be moved to the new block, and a new block can only provide
+	 * half number of recs. So we multiple new blocks by 2.
+	 * In the end, we have to add credits for modifying the already
+	 * existed refcount block.
+	 */
+	rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data;
+	metas.num_recs =
+		(metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) /
+		 ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+	metas.num_metas += metas.num_recs;
+	*credits += metas.num_recs +
+		    metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+			    le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+	else
+		*credits += 1;
+
+	/* count in the xattr tree change. */
+	num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (num_free_extents < len)
+		metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
+
+	*credits += ocfs2_calc_extend_credits(osb->sb,
+					      xt_et->et_root_el);
+
+	if (metas.num_metas) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
+							meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (len) {
+		ret = ocfs2_reserve_clusters(osb, len, data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
+				u64 blkno, u64 new_blkno, u32 clusters,
+				u32 *cpos, int num_buckets,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_reflink_xattr_tree_args *args)
+{
+	int i, j, ret = 0;
+	struct super_block *sb = args->reflink->old_inode->i_sb;
+	int bpb = args->old_bucket->bu_blocks;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
+
+	for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
+		ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_xattr_bucket_journal_access(handle,
+						args->new_bucket,
+						OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		for (j = 0; j < bpb; j++)
+			memcpy(bucket_block(args->new_bucket, j),
+			       bucket_block(args->old_bucket, j),
+			       sb->s_blocksize);
+
+		/*
+		 * Record the start cpos so that we can use it to initialize
+		 * our xattr tree we also set the xh_num_bucket for the new
+		 * bucket.
+		 */
+		if (i == 0) {
+			*cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+					    xh_entries[0].xe_name_hash);
+			bucket_xh(args->new_bucket)->xh_num_buckets =
+				cpu_to_le16(num_buckets);
+		}
+
+		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+		ret = ocfs2_reflink_xattr_header(handle, args->reflink,
+					args->old_bucket->bu_bhs[0],
+					bucket_xh(args->old_bucket),
+					args->new_bucket->bu_bhs[0],
+					bucket_xh(args->new_bucket),
+					&vb, meta_ac,
+					ocfs2_get_reflink_xattr_value_root,
+					args);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * Re-access and dirty the bucket to calculate metaecc.
+		 * Because we may extend the transaction in reflink_xattr_header
+		 * which will let the already accessed block gone.
+		 */
+		ret = ocfs2_xattr_bucket_journal_access(handle,
+						args->new_bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+		ocfs2_xattr_bucket_relse(args->old_bucket);
+		ocfs2_xattr_bucket_relse(args->new_bucket);
+	}
+
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+	ocfs2_xattr_bucket_relse(args->new_bucket);
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+				struct inode *inode,
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				u64 blkno, u32 cpos, u32 len)
+{
+	int ret, first_inserted = 0;
+	u32 p_cluster, num_clusters, reflink_cpos = 0;
+	u64 new_blkno;
+	unsigned int num_buckets, reflink_buckets;
+	unsigned int bpc =
+		ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+	ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+
+	while (len && num_buckets) {
+		ret = ocfs2_claim_clusters(handle, data_ac,
+					   1, &p_cluster, &num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+		ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+						 new_blkno, num_clusters,
+						 &reflink_cpos, reflink_buckets,
+						 meta_ac, data_ac, args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * For the 1st allocated cluster, we make it use the same cpos
+		 * so that the xattr tree looks the same as the original one
+		 * in the most case.
+		 */
+		if (!first_inserted) {
+			reflink_cpos = cpos;
+			first_inserted = 1;
+		}
+		ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+					  num_clusters, 0, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+
+		trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno,
+						  num_clusters, reflink_cpos);
+
+		len -= num_clusters;
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		num_buckets -= reflink_buckets;
+	}
+out:
+	return ret;
+}
+
+/*
+ * Create the same xattr extent record in the new inode's xattr tree.
+ */
+static int ocfs2_reflink_xattr_rec(struct inode *inode,
+				   struct buffer_head *root_bh,
+				   u64 blkno,
+				   u32 cpos,
+				   u32 len,
+				   void *para)
+{
+	int ret, credits = 0;
+	handle_t *handle;
+	struct ocfs2_reflink_xattr_tree_args *args =
+			(struct ocfs2_reflink_xattr_tree_args *)para;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_extent_tree et;
+
+	trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len);
+
+	ocfs2_init_xattr_tree_extent_tree(&et,
+					  INODE_CACHE(args->reflink->new_inode),
+					  args->new_blk_bh);
+
+	ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
+						      len, &credits,
+						      &meta_ac, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+					  meta_ac, data_ac,
+					  blkno, cpos, len);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	return ret;
+}
+
+/*
+ * Create reflinked xattr buckets.
+ * We will add bucket one by one, and refcount all the xattrs in the bucket
+ * if they are stored outside.
+ */
+static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
+				    struct buffer_head *blk_bh,
+				    struct buffer_head *new_blk_bh)
+{
+	int ret;
+	struct ocfs2_reflink_xattr_tree_args para;
+
+	memset(&para, 0, sizeof(para));
+	para.reflink = args;
+	para.old_blk_bh = blk_bh;
+	para.new_blk_bh = new_blk_bh;
+
+	para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
+	if (!para.old_bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
+	if (!para.new_bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh,
+					      ocfs2_reflink_xattr_rec,
+					      &para);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_xattr_bucket_free(para.old_bucket);
+	ocfs2_xattr_bucket_free(para.new_bucket);
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
+					struct buffer_head *blk_bh)
+{
+	int ret, indexed = 0;
+	struct buffer_head *new_blk_bh = NULL;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)blk_bh->b_data;
+
+
+	if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)
+		indexed = 1;
+
+	ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
+					     &new_blk_bh, indexed);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!indexed)
+		ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
+	else
+		ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(new_blk_bh);
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe)
+{
+	int type = ocfs2_xattr_get_type(xe);
+
+	return type != OCFS2_XATTR_INDEX_SECURITY &&
+	       type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS &&
+	       type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+}
+
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+			 struct buffer_head *old_bh,
+			 struct inode *new_inode,
+			 struct buffer_head *new_bh,
+			 bool preserve_security)
+{
+	int ret;
+	struct ocfs2_xattr_reflink args;
+	struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_refcount_tree *ref_tree;
+	struct buffer_head *ref_root_bh = NULL;
+
+	ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+				       le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	args.old_inode = old_inode;
+	args.new_inode = new_inode;
+	args.old_bh = old_bh;
+	args.new_bh = new_bh;
+	args.ref_ci = &ref_tree->rf_ci;
+	args.ref_root_bh = ref_root_bh;
+	args.dealloc = &dealloc;
+	if (preserve_security)
+		args.xattr_reflinked = NULL;
+	else
+		args.xattr_reflinked = ocfs2_reflink_xattr_no_security;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_reflink_xattr_inline(&args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+	}
+
+	if (!di->i_xattr_loc)
+		goto out_unlock;
+
+	ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	brelse(blk_bh);
+
+out_unlock:
+	ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+				   ref_tree, 1);
+	brelse(ref_root_bh);
+
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
+		ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Initialize security and acl for a already created inode.
+ * Used for reflink a non-preserve-security file.
+ *
+ * It uses common api like ocfs2_xattr_set, so the caller
+ * must not hold any lock expect i_mutex.
+ */
+int ocfs2_init_security_and_acl(struct inode *dir,
+				struct inode *inode,
+				const struct qstr *qstr,
+				struct posix_acl *default_acl,
+				struct posix_acl *acl)
+{
+	struct buffer_head *dir_bh = NULL;
+	int ret = 0;
+
+	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	if (!ret && default_acl)
+		ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+	if (!ret && acl)
+		ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
+
+	ocfs2_inode_unlock(dir, 0);
+	brelse(dir_bh);
+leave:
+	return ret;
+}
+/*
+ * 'security' attributes support
+ */
+static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
+					size_t list_size, const char *name,
+					size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
+				    void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+			       name, buffer, size);
+}
+
+static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+			       name, value, size, flags);
+}
+
+int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		     void *fs_info)
+{
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
+				      xattr->name, xattr->value,
+				      xattr->value_len, XATTR_CREATE);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+int ocfs2_init_security_get(struct inode *inode,
+			    struct inode *dir,
+			    const struct qstr *qstr,
+			    struct ocfs2_security_xattr_info *si)
+{
+	/* check whether ocfs2 support feature xattr */
+	if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
+		return -EOPNOTSUPP;
+	if (si)
+		return security_old_inode_init_security(inode, dir, qstr,
+							&si->name, &si->value,
+							&si->value_len);
+
+	return security_inode_init_security(inode, dir, qstr,
+					    &ocfs2_initxattrs, NULL);
+}
+
+int ocfs2_init_security_set(handle_t *handle,
+			    struct inode *inode,
+			    struct buffer_head *di_bh,
+			    struct ocfs2_security_xattr_info *si,
+			    struct ocfs2_alloc_context *xattr_ac,
+			    struct ocfs2_alloc_context *data_ac)
+{
+	return ocfs2_xattr_set_handle(handle, inode, di_bh,
+				     OCFS2_XATTR_INDEX_SECURITY,
+				     si->name, si->value, si->value_len, 0,
+				     xattr_ac, data_ac);
+}
+
+const struct xattr_handler ocfs2_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= ocfs2_xattr_security_list,
+	.get	= ocfs2_xattr_security_get,
+	.set	= ocfs2_xattr_security_set,
+};
+
+/*
+ * 'trusted' attributes support
+ */
+static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
+				       size_t list_size, const char *name,
+				       size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+			       name, buffer, size);
+}
+
+static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+			       name, value, size, flags);
+}
+
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ocfs2_xattr_trusted_list,
+	.get	= ocfs2_xattr_trusted_get,
+	.set	= ocfs2_xattr_trusted_set,
+};
+
+/*
+ * 'user' attributes support
+ */
+static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
+				    size_t list_size, const char *name,
+				    size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+
+	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER,
+			       name, value, size, flags);
+}
+
+const struct xattr_handler ocfs2_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ocfs2_xattr_user_list,
+	.get	= ocfs2_xattr_user_get,
+	.set	= ocfs2_xattr_user_set,
+};
diff --git a/kernel/fs/ocfs2/xattr.h b/kernel/fs/ocfs2/xattr.h
new file mode 100644
index 000000000..f10d5b93c
--- /dev/null
+++ b/kernel/fs/ocfs2/xattr.h
@@ -0,0 +1,100 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_XATTR_H
+#define OCFS2_XATTR_H
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+enum ocfs2_xattr_type {
+	OCFS2_XATTR_INDEX_USER = 1,
+	OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
+	OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+	OCFS2_XATTR_INDEX_TRUSTED,
+	OCFS2_XATTR_INDEX_SECURITY,
+	OCFS2_XATTR_MAX
+};
+
+struct ocfs2_security_xattr_info {
+	int enable;
+	const char *name;
+	void *value;
+	size_t value_len;
+};
+
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
+
+ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+			   const char *, void *, size_t);
+int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+		    size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+			   int, const char *, const void *, size_t, int,
+			   struct ocfs2_alloc_context *,
+			   struct ocfs2_alloc_context *);
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+					 struct ocfs2_dinode *di);
+int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+			    const struct qstr *,
+			    struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+			    struct buffer_head *,
+			    struct ocfs2_security_xattr_info *,
+			    struct ocfs2_alloc_context *,
+			    struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+			     struct ocfs2_security_xattr_info *,
+			     int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+			  umode_t, struct ocfs2_security_xattr_info *,
+			  int *, int *, int *);
+
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block.  Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal.  Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf {
+	struct buffer_head		*vb_bh;
+	ocfs2_journal_access_func	vb_access;
+	struct ocfs2_xattr_value_root	*vb_xv;
+};
+
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     struct ocfs2_caching_info *ref_ci,
+				     struct buffer_head *ref_root_bh,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+			 struct buffer_head *old_bh,
+			 struct inode *new_inode,
+			 struct buffer_head *new_bh,
+			 bool preserve_security);
+int ocfs2_init_security_and_acl(struct inode *dir,
+				struct inode *inode,
+				const struct qstr *qstr,
+				struct posix_acl *default_acl,
+				struct posix_acl *acl);
+#endif /* OCFS2_XATTR_H */
diff --git a/kernel/fs/omfs/Kconfig b/kernel/fs/omfs/Kconfig
new file mode 100644
index 000000000..b1b9a0aba
--- /dev/null
+++ b/kernel/fs/omfs/Kconfig
@@ -0,0 +1,13 @@
+config OMFS_FS
+	tristate "SonicBlue Optimized MPEG File System support"
+	depends on BLOCK
+	select CRC_ITU_T
+	help
+	  This is the proprietary file system used by the Rio Karma music
+	  player and ReplayTV DVR.  Despite the name, this filesystem is not
+	  more efficient than a standard FS for MPEG files, in fact likely
+	  the opposite is true.  Say Y if you have either of these devices
+	  and wish to mount its disk.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called omfs.  If unsure, say N.
diff --git a/kernel/fs/omfs/Makefile b/kernel/fs/omfs/Makefile
new file mode 100644
index 000000000..8b82b63f1
--- /dev/null
+++ b/kernel/fs/omfs/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_OMFS_FS) += omfs.o
+
+omfs-y := bitmap.o dir.o file.o inode.o
diff --git a/kernel/fs/omfs/bitmap.c b/kernel/fs/omfs/bitmap.c
new file mode 100644
index 000000000..83f4e7651
--- /dev/null
+++ b/kernel/fs/omfs/bitmap.c
@@ -0,0 +1,193 @@
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <asm/div64.h>
+#include "omfs.h"
+
+unsigned long omfs_count_free(struct super_block *sb)
+{
+	unsigned int i;
+	unsigned long sum = 0;
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int nbits = sb->s_blocksize * 8;
+
+	for (i = 0; i < sbi->s_imap_size; i++)
+		sum += nbits - bitmap_weight(sbi->s_imap[i], nbits);
+
+	return sum;
+}
+
+/*
+ *  Counts the run of zero bits starting at bit up to max.
+ *  It handles the case where a run might spill over a buffer.
+ *  Called with bitmap lock.
+ */
+static int count_run(unsigned long **addr, int nbits,
+		int addrlen, int bit, int max)
+{
+	int count = 0;
+	int x;
+
+	for (; addrlen > 0; addrlen--, addr++) {
+		x = find_next_bit(*addr, nbits, bit);
+		count += x - bit;
+
+		if (x < nbits || count > max)
+			return min(count, max);
+
+		bit = 0;
+	}
+	return min(count, max);
+}
+
+/*
+ * Sets or clears the run of count bits starting with bit.
+ * Called with bitmap lock.
+ */
+static int set_run(struct super_block *sb, int map,
+		int nbits, int bit, int count, int set)
+{
+	int i;
+	int err;
+	struct buffer_head *bh;
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+
+ 	err = -ENOMEM;
+	bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+	if (!bh)
+		goto out;
+
+	for (i = 0; i < count; i++, bit++) {
+		if (bit >= nbits) {
+			bit = 0;
+			map++;
+
+			mark_buffer_dirty(bh);
+			brelse(bh);
+			bh = sb_bread(sb,
+				clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+			if (!bh)
+				goto out;
+		}
+		if (set) {
+			set_bit(bit, sbi->s_imap[map]);
+			set_bit(bit, (unsigned long *)bh->b_data);
+		} else {
+			clear_bit(bit, sbi->s_imap[map]);
+			clear_bit(bit, (unsigned long *)bh->b_data);
+		}
+	}
+	mark_buffer_dirty(bh);
+	brelse(bh);
+	err = 0;
+out:
+	return err;
+}
+
+/*
+ * Tries to allocate exactly one block.  Returns true if successful.
+ */
+int omfs_allocate_block(struct super_block *sb, u64 block)
+{
+	struct buffer_head *bh;
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int bits_per_entry = 8 * sb->s_blocksize;
+	unsigned int map, bit;
+	int ret = 0;
+	u64 tmp;
+
+	tmp = block;
+	bit = do_div(tmp, bits_per_entry);
+	map = tmp;
+
+	mutex_lock(&sbi->s_bitmap_lock);
+	if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map]))
+		goto out;
+
+	if (sbi->s_bitmap_ino > 0) {
+		bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+		if (!bh)
+			goto out;
+
+		set_bit(bit, (unsigned long *)bh->b_data);
+		mark_buffer_dirty(bh);
+		brelse(bh);
+	}
+	ret = 1;
+out:
+	mutex_unlock(&sbi->s_bitmap_lock);
+	return ret;
+}
+
+
+/*
+ *  Tries to allocate a set of blocks.	The request size depends on the
+ *  type: for inodes, we must allocate sbi->s_mirrors blocks, and for file
+ *  blocks, we try to allocate sbi->s_clustersize, but can always get away
+ *  with just one block.
+ */
+int omfs_allocate_range(struct super_block *sb,
+			int min_request,
+			int max_request,
+			u64 *return_block,
+			int *return_size)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int bits_per_entry = 8 * sb->s_blocksize;
+	int ret = 0;
+	int i, run, bit;
+
+	mutex_lock(&sbi->s_bitmap_lock);
+	for (i = 0; i < sbi->s_imap_size; i++) {
+		bit = 0;
+		while (bit < bits_per_entry) {
+			bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry,
+				bit);
+
+			if (bit == bits_per_entry)
+				break;
+
+			run = count_run(&sbi->s_imap[i], bits_per_entry,
+				sbi->s_imap_size-i, bit, max_request);
+
+			if (run >= min_request)
+				goto found;
+			bit += run;
+		}
+	}
+	ret = -ENOSPC;
+	goto out;
+
+found:
+	*return_block = (u64) i * bits_per_entry + bit;
+	*return_size = run;
+	ret = set_run(sb, i, bits_per_entry, bit, run, 1);
+
+out:
+	mutex_unlock(&sbi->s_bitmap_lock);
+	return ret;
+}
+
+/*
+ * Clears count bits starting at a given block.
+ */
+int omfs_clear_range(struct super_block *sb, u64 block, int count)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int bits_per_entry = 8 * sb->s_blocksize;
+	u64 tmp;
+	unsigned int map, bit;
+	int ret;
+
+	tmp = block;
+	bit = do_div(tmp, bits_per_entry);
+	map = tmp;
+
+	if (map >= sbi->s_imap_size)
+		return 0;
+
+	mutex_lock(&sbi->s_bitmap_lock);
+	ret = set_run(sb, map, bits_per_entry, bit, count, 0);
+	mutex_unlock(&sbi->s_bitmap_lock);
+	return ret;
+}
diff --git a/kernel/fs/omfs/dir.c b/kernel/fs/omfs/dir.c
new file mode 100644
index 000000000..f833bf8d5
--- /dev/null
+++ b/kernel/fs/omfs/dir.c
@@ -0,0 +1,457 @@
+/*
+ * OMFS (as used by RIO Karma) directory operations.
+ * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/buffer_head.h>
+#include "omfs.h"
+
+static int omfs_hash(const char *name, int namelen, int mod)
+{
+	int i, hash = 0;
+	for (i = 0; i < namelen; i++)
+		hash ^= tolower(name[i]) << (i % 24);
+	return hash % mod;
+}
+
+/*
+ * Finds the bucket for a given name and reads the containing block;
+ * *ofs is set to the offset of the first list entry.
+ */
+static struct buffer_head *omfs_get_bucket(struct inode *dir,
+		const char *name, int namelen, int *ofs)
+{
+	int nbuckets = (dir->i_size - OMFS_DIR_START)/8;
+	int bucket = omfs_hash(name, namelen, nbuckets);
+
+	*ofs = OMFS_DIR_START + bucket * 8;
+	return omfs_bread(dir->i_sb, dir->i_ino);
+}
+
+static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
+				const char *name, int namelen,
+				u64 *prev_block)
+{
+	struct buffer_head *bh;
+	struct omfs_inode *oi;
+	int err = -ENOENT;
+	*prev_block = ~0;
+
+	while (block != ~0) {
+		bh = omfs_bread(dir->i_sb, block);
+		if (!bh) {
+			err = -EIO;
+			goto err;
+		}
+
+		oi = (struct omfs_inode *) bh->b_data;
+		if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) {
+			brelse(bh);
+			goto err;
+		}
+
+		if (strncmp(oi->i_name, name, namelen) == 0)
+			return bh;
+
+		*prev_block = block;
+		block = be64_to_cpu(oi->i_sibling);
+		brelse(bh);
+	}
+err:
+	return ERR_PTR(err);
+}
+
+static struct buffer_head *omfs_find_entry(struct inode *dir,
+					   const char *name, int namelen)
+{
+	struct buffer_head *bh;
+	int ofs;
+	u64 block, dummy;
+
+	bh = omfs_get_bucket(dir, name, namelen, &ofs);
+	if (!bh)
+		return ERR_PTR(-EIO);
+
+	block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs]));
+	brelse(bh);
+
+	return omfs_scan_list(dir, block, name, namelen, &dummy);
+}
+
+int omfs_make_empty(struct inode *inode, struct super_block *sb)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	struct buffer_head *bh;
+	struct omfs_inode *oi;
+
+	bh = omfs_bread(sb, inode->i_ino);
+	if (!bh)
+		return -ENOMEM;
+
+	memset(bh->b_data, 0, sizeof(struct omfs_inode));
+
+	if (S_ISDIR(inode->i_mode)) {
+		memset(&bh->b_data[OMFS_DIR_START], 0xff,
+			sbi->s_sys_blocksize - OMFS_DIR_START);
+	} else
+		omfs_make_empty_table(bh, OMFS_EXTENT_START);
+
+	oi = (struct omfs_inode *) bh->b_data;
+	oi->i_head.h_self = cpu_to_be64(inode->i_ino);
+	oi->i_sibling = ~cpu_to_be64(0ULL);
+
+	mark_buffer_dirty(bh);
+	brelse(bh);
+	return 0;
+}
+
+static int omfs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct omfs_inode *oi;
+	struct buffer_head *bh;
+	u64 block;
+	__be64 *entry;
+	int ofs;
+
+	/* just prepend to head of queue in proper bucket */
+	bh = omfs_get_bucket(dir, name, namelen, &ofs);
+	if (!bh)
+		goto out;
+
+	entry = (__be64 *) &bh->b_data[ofs];
+	block = be64_to_cpu(*entry);
+	*entry = cpu_to_be64(inode->i_ino);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+
+	/* now set the sibling and parent pointers on the new inode */
+	bh = omfs_bread(dir->i_sb, inode->i_ino);
+	if (!bh)
+		goto out;
+
+	oi = (struct omfs_inode *) bh->b_data;
+	memcpy(oi->i_name, name, namelen);
+	memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen);
+	oi->i_sibling = cpu_to_be64(block);
+	oi->i_parent = cpu_to_be64(dir->i_ino);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+
+	dir->i_ctime = CURRENT_TIME_SEC;
+
+	/* mark affected inodes dirty to rebuild checksums */
+	mark_inode_dirty(dir);
+	mark_inode_dirty(inode);
+	return 0;
+out:
+	return -ENOMEM;
+}
+
+static int omfs_delete_entry(struct dentry *dentry)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct inode *dirty;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct omfs_inode *oi;
+	struct buffer_head *bh, *bh2;
+	__be64 *entry, next;
+	u64 block, prev;
+	int ofs;
+	int err = -ENOMEM;
+
+	/* delete the proper node in the bucket's linked list */
+	bh = omfs_get_bucket(dir, name, namelen, &ofs);
+	if (!bh)
+		goto out;
+
+	entry = (__be64 *) &bh->b_data[ofs];
+	block = be64_to_cpu(*entry);
+
+	bh2 = omfs_scan_list(dir, block, name, namelen, &prev);
+	if (IS_ERR(bh2)) {
+		err = PTR_ERR(bh2);
+		goto out_free_bh;
+	}
+
+	oi = (struct omfs_inode *) bh2->b_data;
+	next = oi->i_sibling;
+	brelse(bh2);
+
+	if (prev != ~0) {
+		/* found in middle of list, get list ptr */
+		brelse(bh);
+		bh = omfs_bread(dir->i_sb, prev);
+		if (!bh)
+			goto out;
+
+		oi = (struct omfs_inode *) bh->b_data;
+		entry = &oi->i_sibling;
+	}
+
+	*entry = next;
+	mark_buffer_dirty(bh);
+
+	if (prev != ~0) {
+		dirty = omfs_iget(dir->i_sb, prev);
+		if (!IS_ERR(dirty)) {
+			mark_inode_dirty(dirty);
+			iput(dirty);
+		}
+	}
+
+	err = 0;
+out_free_bh:
+	brelse(bh);
+out:
+	return err;
+}
+
+static int omfs_dir_is_empty(struct inode *inode)
+{
+	int nbuckets = (inode->i_size - OMFS_DIR_START) / 8;
+	struct buffer_head *bh;
+	u64 *ptr;
+	int i;
+
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
+
+	if (!bh)
+		return 0;
+
+	ptr = (u64 *) &bh->b_data[OMFS_DIR_START];
+
+	for (i = 0; i < nbuckets; i++, ptr++)
+		if (*ptr != ~0)
+			break;
+
+	brelse(bh);
+	return *ptr != ~0;
+}
+
+static int omfs_remove(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	int ret;
+
+
+	if (S_ISDIR(inode->i_mode) &&
+	    !omfs_dir_is_empty(inode))
+		return -ENOTEMPTY;
+
+	ret = omfs_delete_entry(dentry);
+	if (ret)
+		return ret;
+	
+	clear_nlink(inode);
+	mark_inode_dirty(inode);
+	mark_inode_dirty(dir);
+	return 0;
+}
+
+static int omfs_add_node(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int err;
+	struct inode *inode = omfs_new_inode(dir, mode);
+
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	err = omfs_make_empty(inode, dir->i_sb);
+	if (err)
+		goto out_free_inode;
+
+	err = omfs_add_link(dentry, inode);
+	if (err)
+		goto out_free_inode;
+
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_free_inode:
+	iput(inode);
+	return err;
+}
+
+static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	return omfs_add_node(dir, dentry, mode | S_IFDIR);
+}
+
+static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool excl)
+{
+	return omfs_add_node(dir, dentry, mode | S_IFREG);
+}
+
+static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned int flags)
+{
+	struct buffer_head *bh;
+	struct inode *inode = NULL;
+
+	if (dentry->d_name.len > OMFS_NAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len);
+	if (!IS_ERR(bh)) {
+		struct omfs_inode *oi = (struct omfs_inode *)bh->b_data;
+		ino_t ino = be64_to_cpu(oi->i_head.h_self);
+		brelse(bh);
+		inode = omfs_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	d_add(dentry, inode);
+	return NULL;
+}
+
+/* sanity check block's self pointer */
+int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
+	u64 fsblock)
+{
+	int is_bad;
+	u64 ino = be64_to_cpu(header->h_self);
+	is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) ||
+		(ino > sbi->s_num_blocks));
+
+	if (is_bad)
+		printk(KERN_WARNING "omfs: bad hash chain detected\n");
+
+	return is_bad;
+}
+
+static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
+		u64 fsblock, int hindex)
+{
+	/* follow chain in this bucket */
+	while (fsblock != ~0) {
+		struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock);
+		struct omfs_inode *oi;
+		u64 self;
+		unsigned char d_type;
+
+		if (!bh)
+			return true;
+
+		oi = (struct omfs_inode *) bh->b_data;
+		if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
+			brelse(bh);
+			return true;
+		}
+
+		self = fsblock;
+		fsblock = be64_to_cpu(oi->i_sibling);
+
+		/* skip visited nodes */
+		if (hindex) {
+			hindex--;
+			brelse(bh);
+			continue;
+		}
+
+		d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
+
+		if (!dir_emit(ctx, oi->i_name,
+			      strnlen(oi->i_name, OMFS_NAMELEN),
+			      self, d_type)) {
+			brelse(bh);
+			return false;
+		}
+		brelse(bh);
+		ctx->pos++;
+	}
+	return true;
+}
+
+static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *new_inode = d_inode(new_dentry);
+	struct inode *old_inode = d_inode(old_dentry);
+	int err;
+
+	if (new_inode) {
+		/* overwriting existing file/dir */
+		err = omfs_remove(new_dir, new_dentry);
+		if (err)
+			goto out;
+	}
+
+	/* since omfs locates files by name, we need to unlink _before_
+	 * adding the new link or we won't find the old one */
+	err = omfs_delete_entry(old_dentry);
+	if (err)
+		goto out;
+
+	mark_inode_dirty(old_dir);
+	err = omfs_add_link(new_dentry, old_inode);
+	if (err)
+		goto out;
+
+	old_inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(old_inode);
+out:
+	return err;
+}
+
+static int omfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *dir = file_inode(file);
+	struct buffer_head *bh;
+	__be64 *p;
+	unsigned int hchain, hindex;
+	int nbuckets;
+
+	if (ctx->pos >> 32)
+		return -EINVAL;
+
+	if (ctx->pos < 1 << 20) {
+		if (!dir_emit_dots(file, ctx))
+			return 0;
+		ctx->pos = 1 << 20;
+	}
+
+	nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
+
+	/* high 12 bits store bucket + 1 and low 20 bits store hash index */
+	hchain = (ctx->pos >> 20) - 1;
+	hindex = ctx->pos & 0xfffff;
+
+	bh = omfs_bread(dir->i_sb, dir->i_ino);
+	if (!bh)
+		return -EINVAL;
+
+	p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain;
+
+	for (; hchain < nbuckets; hchain++) {
+		__u64 fsblock = be64_to_cpu(*p++);
+		if (!omfs_fill_chain(dir, ctx, fsblock, hindex))
+			break;
+		hindex = 0;
+		ctx->pos = (hchain+2) << 20;
+	}
+	brelse(bh);
+	return 0;
+}
+
+const struct inode_operations omfs_dir_inops = {
+	.lookup = omfs_lookup,
+	.mkdir = omfs_mkdir,
+	.rename = omfs_rename,
+	.create = omfs_create,
+	.unlink = omfs_remove,
+	.rmdir = omfs_remove,
+};
+
+const struct file_operations omfs_dir_operations = {
+	.read = generic_read_dir,
+	.iterate = omfs_readdir,
+	.llseek = generic_file_llseek,
+};
diff --git a/kernel/fs/omfs/file.c b/kernel/fs/omfs/file.c
new file mode 100644
index 000000000..d9e26cfbb
--- /dev/null
+++ b/kernel/fs/omfs/file.c
@@ -0,0 +1,383 @@
+/*
+ * OMFS (as used by RIO Karma) file operations.
+ * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include "omfs.h"
+
+static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
+{
+	return (sbi->s_sys_blocksize - offset -
+		sizeof(struct omfs_extent)) /
+		sizeof(struct omfs_extent_entry) + 1;
+}
+
+void omfs_make_empty_table(struct buffer_head *bh, int offset)
+{
+	struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset];
+
+	oe->e_next = ~cpu_to_be64(0ULL);
+	oe->e_extent_count = cpu_to_be32(1),
+	oe->e_fill = cpu_to_be32(0x22),
+	oe->e_entry.e_cluster = ~cpu_to_be64(0ULL);
+	oe->e_entry.e_blocks = ~cpu_to_be64(0ULL);
+}
+
+int omfs_shrink_inode(struct inode *inode)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+	struct omfs_extent *oe;
+	struct omfs_extent_entry *entry;
+	struct buffer_head *bh;
+	u64 next, last;
+	u32 extent_count;
+	u32 max_extents;
+	int ret;
+
+	/* traverse extent table, freeing each entry that is greater
+	 * than inode->i_size;
+	 */
+	next = inode->i_ino;
+
+	/* only support truncate -> 0 for now */
+	ret = -EIO;
+	if (inode->i_size != 0)
+		goto out;
+
+	bh = omfs_bread(inode->i_sb, next);
+	if (!bh)
+		goto out;
+
+	oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+	max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
+
+	for (;;) {
+
+		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
+			goto out_brelse;
+
+		extent_count = be32_to_cpu(oe->e_extent_count);
+
+		if (extent_count > max_extents)
+			goto out_brelse;
+
+		last = next;
+		next = be64_to_cpu(oe->e_next);
+		entry = &oe->e_entry;
+
+		/* ignore last entry as it is the terminator */
+		for (; extent_count > 1; extent_count--) {
+			u64 start, count;
+			start = be64_to_cpu(entry->e_cluster);
+			count = be64_to_cpu(entry->e_blocks);
+
+			omfs_clear_range(inode->i_sb, start, (int) count);
+			entry++;
+		}
+		omfs_make_empty_table(bh, (char *) oe - bh->b_data);
+		mark_buffer_dirty(bh);
+		brelse(bh);
+
+		if (last != inode->i_ino)
+			omfs_clear_range(inode->i_sb, last, sbi->s_mirrors);
+
+		if (next == ~0)
+			break;
+
+		bh = omfs_bread(inode->i_sb, next);
+		if (!bh)
+			goto out;
+		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+		max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
+	}
+	ret = 0;
+out:
+	return ret;
+out_brelse:
+	brelse(bh);
+	return ret;
+}
+
+static void omfs_truncate(struct inode *inode)
+{
+	omfs_shrink_inode(inode);
+	mark_inode_dirty(inode);
+}
+
+/*
+ * Add new blocks to the current extent, or create new entries/continuations
+ * as necessary.
+ */
+static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
+			u64 *ret_block)
+{
+	struct omfs_extent_entry *terminator;
+	struct omfs_extent_entry *entry = &oe->e_entry;
+	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+	u32 extent_count = be32_to_cpu(oe->e_extent_count);
+	u64 new_block = 0;
+	u32 max_count;
+	int new_count;
+	int ret = 0;
+
+	/* reached the end of the extent table with no blocks mapped.
+	 * there are three possibilities for adding: grow last extent,
+	 * add a new extent to the current extent table, and add a
+	 * continuation inode.  in last two cases need an allocator for
+	 * sbi->s_cluster_size
+	 */
+
+	/* TODO: handle holes */
+
+	/* should always have a terminator */
+	if (extent_count < 1)
+		return -EIO;
+
+	/* trivially grow current extent, if next block is not taken */
+	terminator = entry + extent_count - 1;
+	if (extent_count > 1) {
+		entry = terminator-1;
+		new_block = be64_to_cpu(entry->e_cluster) +
+			be64_to_cpu(entry->e_blocks);
+
+		if (omfs_allocate_block(inode->i_sb, new_block)) {
+			be64_add_cpu(&entry->e_blocks, 1);
+			terminator->e_blocks = ~(cpu_to_be64(
+				be64_to_cpu(~terminator->e_blocks) + 1));
+			goto out;
+		}
+	}
+	max_count = omfs_max_extents(sbi, OMFS_EXTENT_START);
+
+	/* TODO: add a continuation block here */
+	if (be32_to_cpu(oe->e_extent_count) > max_count-1)
+		return -EIO;
+
+	/* try to allocate a new cluster */
+	ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize,
+		&new_block, &new_count);
+	if (ret)
+		goto out_fail;
+
+	/* copy terminator down an entry */
+	entry = terminator;
+	terminator++;
+	memcpy(terminator, entry, sizeof(struct omfs_extent_entry));
+
+	entry->e_cluster = cpu_to_be64(new_block);
+	entry->e_blocks = cpu_to_be64((u64) new_count);
+
+	terminator->e_blocks = ~(cpu_to_be64(
+		be64_to_cpu(~terminator->e_blocks) + (u64) new_count));
+
+	/* write in new entry */
+	be32_add_cpu(&oe->e_extent_count, 1);
+
+out:
+	*ret_block = new_block;
+out_fail:
+	return ret;
+}
+
+/*
+ * Scans across the directory table for a given file block number.
+ * If block not found, return 0.
+ */
+static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent,
+			sector_t block, int count, int *left)
+{
+	/* count > 1 because of terminator */
+	sector_t searched = 0;
+	for (; count > 1; count--) {
+		int numblocks = clus_to_blk(OMFS_SB(inode->i_sb),
+			be64_to_cpu(ent->e_blocks));
+
+		if (block >= searched  &&
+		    block < searched + numblocks) {
+			/*
+			 * found it at cluster + (block - searched)
+			 * numblocks - (block - searched) is remainder
+			 */
+			*left = numblocks - (block - searched);
+			return clus_to_blk(OMFS_SB(inode->i_sb),
+				be64_to_cpu(ent->e_cluster)) +
+				block - searched;
+		}
+		searched += numblocks;
+		ent++;
+	}
+	return 0;
+}
+
+static int omfs_get_block(struct inode *inode, sector_t block,
+			  struct buffer_head *bh_result, int create)
+{
+	struct buffer_head *bh;
+	sector_t next, offset;
+	int ret;
+	u64 uninitialized_var(new_block);
+	u32 max_extents;
+	int extent_count;
+	struct omfs_extent *oe;
+	struct omfs_extent_entry *entry;
+	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+	int max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int remain;
+
+	ret = -EIO;
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
+	if (!bh)
+		goto out;
+
+	oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+	max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
+	next = inode->i_ino;
+
+	for (;;) {
+
+		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
+			goto out_brelse;
+
+		extent_count = be32_to_cpu(oe->e_extent_count);
+		next = be64_to_cpu(oe->e_next);
+		entry = &oe->e_entry;
+
+		if (extent_count > max_extents)
+			goto out_brelse;
+
+		offset = find_block(inode, entry, block, extent_count, &remain);
+		if (offset > 0) {
+			ret = 0;
+			map_bh(bh_result, inode->i_sb, offset);
+			if (remain > max_blocks)
+				remain = max_blocks;
+			bh_result->b_size = (remain << inode->i_blkbits);
+			goto out_brelse;
+		}
+		if (next == ~0)
+			break;
+
+		brelse(bh);
+		bh = omfs_bread(inode->i_sb, next);
+		if (!bh)
+			goto out;
+		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+		max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
+	}
+	if (create) {
+		ret = omfs_grow_extent(inode, oe, &new_block);
+		if (ret == 0) {
+			mark_buffer_dirty(bh);
+			mark_inode_dirty(inode);
+			map_bh(bh_result, inode->i_sb,
+					clus_to_blk(sbi, new_block));
+		}
+	}
+out_brelse:
+	brelse(bh);
+out:
+	return ret;
+}
+
+static int omfs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, omfs_get_block);
+}
+
+static int omfs_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, omfs_get_block);
+}
+
+static int omfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, omfs_get_block, wbc);
+}
+
+static int
+omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, omfs_get_block);
+}
+
+static void omfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		omfs_truncate(inode);
+	}
+}
+
+static int omfs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				omfs_get_block);
+	if (unlikely(ret))
+		omfs_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, omfs_get_block);
+}
+
+const struct file_operations omfs_file_operations = {
+	.llseek = generic_file_llseek,
+	.read_iter = generic_file_read_iter,
+	.write_iter = generic_file_write_iter,
+	.mmap = generic_file_mmap,
+	.fsync = generic_file_fsync,
+	.splice_read = generic_file_splice_read,
+};
+
+static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+		truncate_setsize(inode, attr->ia_size);
+		omfs_truncate(inode);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+const struct inode_operations omfs_file_inops = {
+	.setattr = omfs_setattr,
+};
+
+const struct address_space_operations omfs_aops = {
+	.readpage = omfs_readpage,
+	.readpages = omfs_readpages,
+	.writepage = omfs_writepage,
+	.writepages = omfs_writepages,
+	.write_begin = omfs_write_begin,
+	.write_end = generic_write_end,
+	.bmap = omfs_bmap,
+};
+
diff --git a/kernel/fs/omfs/inode.c b/kernel/fs/omfs/inode.c
new file mode 100644
index 000000000..3d935c817
--- /dev/null
+++ b/kernel/fs/omfs/inode.c
@@ -0,0 +1,596 @@
+/*
+ * Optimized MPEG FS - inode and super operations.
+ * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/parser.h>
+#include <linux/buffer_head.h>
+#include <linux/vmalloc.h>
+#include <linux/writeback.h>
+#include <linux/crc-itu-t.h>
+#include "omfs.h"
+
+MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
+MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux");
+MODULE_LICENSE("GPL");
+
+struct buffer_head *omfs_bread(struct super_block *sb, sector_t block)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	if (block >= sbi->s_num_blocks)
+		return NULL;
+
+	return sb_bread(sb, clus_to_blk(sbi, block));
+}
+
+struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
+{
+	struct inode *inode;
+	u64 new_block;
+	int err;
+	int len;
+	struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb);
+
+	inode = new_inode(dir->i_sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors,
+			&new_block, &len);
+	if (err)
+		goto fail;
+
+	inode->i_ino = new_block;
+	inode_init_owner(inode, NULL, mode);
+	inode->i_mapping->a_ops = &omfs_aops;
+
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	switch (mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_op = &omfs_dir_inops;
+		inode->i_fop = &omfs_dir_operations;
+		inode->i_size = sbi->s_sys_blocksize;
+		inc_nlink(inode);
+		break;
+	case S_IFREG:
+		inode->i_op = &omfs_file_inops;
+		inode->i_fop = &omfs_file_operations;
+		inode->i_size = 0;
+		break;
+	}
+
+	insert_inode_hash(inode);
+	mark_inode_dirty(inode);
+	return inode;
+fail:
+	make_bad_inode(inode);
+	iput(inode);
+	return ERR_PTR(err);
+}
+
+/*
+ * Update the header checksums for a dirty inode based on its contents.
+ * Caller is expected to hold the buffer head underlying oi and mark it
+ * dirty.
+ */
+static void omfs_update_checksums(struct omfs_inode *oi)
+{
+	int xor, i, ofs = 0, count;
+	u16 crc = 0;
+	unsigned char *ptr = (unsigned char *) oi;
+
+	count = be32_to_cpu(oi->i_head.h_body_size);
+	ofs = sizeof(struct omfs_header);
+
+	crc = crc_itu_t(crc, ptr + ofs, count);
+	oi->i_head.h_crc = cpu_to_be16(crc);
+
+	xor = ptr[0];
+	for (i = 1; i < OMFS_XOR_COUNT; i++)
+		xor ^= ptr[i];
+
+	oi->i_head.h_check_xor = xor;
+}
+
+static int __omfs_write_inode(struct inode *inode, int wait)
+{
+	struct omfs_inode *oi;
+	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+	struct buffer_head *bh, *bh2;
+	u64 ctime;
+	int i;
+	int ret = -EIO;
+	int sync_failed = 0;
+
+	/* get current inode since we may have written sibling ptrs etc. */
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
+	if (!bh)
+		goto out;
+
+	oi = (struct omfs_inode *) bh->b_data;
+
+	oi->i_head.h_self = cpu_to_be64(inode->i_ino);
+	if (S_ISDIR(inode->i_mode))
+		oi->i_type = OMFS_DIR;
+	else if (S_ISREG(inode->i_mode))
+		oi->i_type = OMFS_FILE;
+	else {
+		printk(KERN_WARNING "omfs: unknown file type: %d\n",
+			inode->i_mode);
+		goto out_brelse;
+	}
+
+	oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize -
+		sizeof(struct omfs_header));
+	oi->i_head.h_version = 1;
+	oi->i_head.h_type = OMFS_INODE_NORMAL;
+	oi->i_head.h_magic = OMFS_IMAGIC;
+	oi->i_size = cpu_to_be64(inode->i_size);
+
+	ctime = inode->i_ctime.tv_sec * 1000LL +
+		((inode->i_ctime.tv_nsec + 999)/1000);
+	oi->i_ctime = cpu_to_be64(ctime);
+
+	omfs_update_checksums(oi);
+
+	mark_buffer_dirty(bh);
+	if (wait) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			sync_failed = 1;
+	}
+
+	/* if mirroring writes, copy to next fsblock */
+	for (i = 1; i < sbi->s_mirrors; i++) {
+		bh2 = omfs_bread(inode->i_sb, inode->i_ino + i);
+		if (!bh2)
+			goto out_brelse;
+
+		memcpy(bh2->b_data, bh->b_data, bh->b_size);
+		mark_buffer_dirty(bh2);
+		if (wait) {
+			sync_dirty_buffer(bh2);
+			if (buffer_req(bh2) && !buffer_uptodate(bh2))
+				sync_failed = 1;
+		}
+		brelse(bh2);
+	}
+	ret = (sync_failed) ? -EIO : 0;
+out_brelse:
+	brelse(bh);
+out:
+	return ret;
+}
+
+static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
+int omfs_sync_inode(struct inode *inode)
+{
+	return __omfs_write_inode(inode, 1);
+}
+
+/*
+ * called when an entry is deleted, need to clear the bits in the
+ * bitmaps.
+ */
+static void omfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+
+	if (inode->i_nlink)
+		return;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_size = 0;
+		omfs_shrink_inode(inode);
+	}
+
+	omfs_clear_range(inode->i_sb, inode->i_ino, 2);
+}
+
+struct inode *omfs_iget(struct super_block *sb, ino_t ino)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	struct omfs_inode *oi;
+	struct buffer_head *bh;
+	u64 ctime;
+	unsigned long nsecs;
+	struct inode *inode;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	bh = omfs_bread(inode->i_sb, ino);
+	if (!bh)
+		goto iget_failed;
+
+	oi = (struct omfs_inode *)bh->b_data;
+
+	/* check self */
+	if (ino != be64_to_cpu(oi->i_head.h_self))
+		goto fail_bh;
+
+	inode->i_uid = sbi->s_uid;
+	inode->i_gid = sbi->s_gid;
+
+	ctime = be64_to_cpu(oi->i_ctime);
+	nsecs = do_div(ctime, 1000) * 1000L;
+
+	inode->i_atime.tv_sec = ctime;
+	inode->i_mtime.tv_sec = ctime;
+	inode->i_ctime.tv_sec = ctime;
+	inode->i_atime.tv_nsec = nsecs;
+	inode->i_mtime.tv_nsec = nsecs;
+	inode->i_ctime.tv_nsec = nsecs;
+
+	inode->i_mapping->a_ops = &omfs_aops;
+
+	switch (oi->i_type) {
+	case OMFS_DIR:
+		inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask);
+		inode->i_op = &omfs_dir_inops;
+		inode->i_fop = &omfs_dir_operations;
+		inode->i_size = sbi->s_sys_blocksize;
+		inc_nlink(inode);
+		break;
+	case OMFS_FILE:
+		inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask);
+		inode->i_fop = &omfs_file_operations;
+		inode->i_size = be64_to_cpu(oi->i_size);
+		break;
+	}
+	brelse(bh);
+	unlock_new_inode(inode);
+	return inode;
+fail_bh:
+	brelse(bh);
+iget_failed:
+	iget_failed(inode);
+	return ERR_PTR(-EIO);
+}
+
+static void omfs_put_super(struct super_block *sb)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	kfree(sbi->s_imap);
+	kfree(sbi);
+	sb->s_fs_info = NULL;
+}
+
+static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *s = dentry->d_sb;
+	struct omfs_sb_info *sbi = OMFS_SB(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
+
+	buf->f_type = OMFS_MAGIC;
+	buf->f_bsize = sbi->s_blocksize;
+	buf->f_blocks = sbi->s_num_blocks;
+	buf->f_files = sbi->s_num_blocks;
+	buf->f_namelen = OMFS_NAMELEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	buf->f_bfree = buf->f_bavail = buf->f_ffree =
+		omfs_count_free(s);
+
+	return 0;
+}
+
+static const struct super_operations omfs_sops = {
+	.write_inode	= omfs_write_inode,
+	.evict_inode	= omfs_evict_inode,
+	.put_super	= omfs_put_super,
+	.statfs		= omfs_statfs,
+	.show_options	= generic_show_options,
+};
+
+/*
+ * For Rio Karma, there is an on-disk free bitmap whose location is
+ * stored in the root block.  For ReplayTV, there is no such free bitmap
+ * so we have to walk the tree.  Both inodes and file data are allocated
+ * from the same map.  This array can be big (300k) so we allocate
+ * in units of the blocksize.
+ */
+static int omfs_get_imap(struct super_block *sb)
+{
+	unsigned int bitmap_size, array_size;
+	int count;
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	struct buffer_head *bh;
+	unsigned long **ptr;
+	sector_t block;
+
+	bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8);
+	array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize);
+
+	if (sbi->s_bitmap_ino == ~0ULL)
+		goto out;
+
+	sbi->s_imap_size = array_size;
+	sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL);
+	if (!sbi->s_imap)
+		goto nomem;
+
+	block = clus_to_blk(sbi, sbi->s_bitmap_ino);
+	if (block >= sbi->s_num_blocks)
+		goto nomem;
+
+	ptr = sbi->s_imap;
+	for (count = bitmap_size; count > 0; count -= sb->s_blocksize) {
+		bh = sb_bread(sb, block++);
+		if (!bh)
+			goto nomem_free;
+		*ptr = kmalloc(sb->s_blocksize, GFP_KERNEL);
+		if (!*ptr) {
+			brelse(bh);
+			goto nomem_free;
+		}
+		memcpy(*ptr, bh->b_data, sb->s_blocksize);
+		if (count < sb->s_blocksize)
+			memset((void *)*ptr + count, 0xff,
+				sb->s_blocksize - count);
+		brelse(bh);
+		ptr++;
+	}
+out:
+	return 0;
+
+nomem_free:
+	for (count = 0; count < array_size; count++)
+		kfree(sbi->s_imap[count]);
+
+	kfree(sbi->s_imap);
+nomem:
+	sbi->s_imap = NULL;
+	sbi->s_imap_size = 0;
+	return -ENOMEM;
+}
+
+enum {
+	Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_umask, "umask=%o"},
+	{Opt_dmask, "dmask=%o"},
+	{Opt_fmask, "fmask=%o"},
+	{Opt_err, NULL},
+};
+
+static int parse_options(char *options, struct omfs_sb_info *sbi)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return 0;
+			sbi->s_uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(sbi->s_uid))
+				return 0;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+			sbi->s_gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(sbi->s_gid))
+				return 0;
+			break;
+		case Opt_umask:
+			if (match_octal(&args[0], &option))
+				return 0;
+			sbi->s_fmask = sbi->s_dmask = option;
+			break;
+		case Opt_dmask:
+			if (match_octal(&args[0], &option))
+				return 0;
+			sbi->s_dmask = option;
+			break;
+		case Opt_fmask:
+			if (match_octal(&args[0], &option))
+				return 0;
+			sbi->s_fmask = option;
+			break;
+		default:
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int omfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct buffer_head *bh, *bh2;
+	struct omfs_super_block *omfs_sb;
+	struct omfs_root_block *omfs_rb;
+	struct omfs_sb_info *sbi;
+	struct inode *root;
+	int ret = -EINVAL;
+
+	save_mount_options(sb, (char *) data);
+
+	sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sb->s_fs_info = sbi;
+
+	sbi->s_uid = current_uid();
+	sbi->s_gid = current_gid();
+	sbi->s_dmask = sbi->s_fmask = current_umask();
+
+	if (!parse_options((char *) data, sbi))
+		goto end;
+
+	sb->s_maxbytes = 0xffffffff;
+
+	sb_set_blocksize(sb, 0x200);
+
+	bh = sb_bread(sb, 0);
+	if (!bh)
+		goto end;
+
+	omfs_sb = (struct omfs_super_block *)bh->b_data;
+
+	if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) {
+		if (!silent)
+			printk(KERN_ERR "omfs: Invalid superblock (%x)\n",
+				   omfs_sb->s_magic);
+		goto out_brelse_bh;
+	}
+	sb->s_magic = OMFS_MAGIC;
+
+	sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks);
+	sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize);
+	sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors);
+	sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block);
+	sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize);
+	mutex_init(&sbi->s_bitmap_lock);
+
+	if (sbi->s_num_blocks > OMFS_MAX_BLOCKS) {
+		printk(KERN_ERR "omfs: sysblock number (%llx) is out of range\n",
+		       (unsigned long long)sbi->s_num_blocks);
+		goto out_brelse_bh;
+	}
+
+	if (sbi->s_sys_blocksize > PAGE_SIZE) {
+		printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n",
+			sbi->s_sys_blocksize);
+		goto out_brelse_bh;
+	}
+
+	if (sbi->s_blocksize < sbi->s_sys_blocksize ||
+	    sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) {
+		printk(KERN_ERR "omfs: block size (%d) is out of range\n",
+			sbi->s_blocksize);
+		goto out_brelse_bh;
+	}
+
+	/*
+	 * Use sys_blocksize as the fs block since it is smaller than a
+	 * page while the fs blocksize can be larger.
+	 */
+	sb_set_blocksize(sb, sbi->s_sys_blocksize);
+
+	/*
+	 * ...and the difference goes into a shift.  sys_blocksize is always
+	 * a power of two factor of blocksize.
+	 */
+	sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) -
+		get_bitmask_order(sbi->s_sys_blocksize);
+
+	bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block));
+	if (!bh2)
+		goto out_brelse_bh;
+
+	omfs_rb = (struct omfs_root_block *)bh2->b_data;
+
+	sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap);
+	sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize);
+
+	if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) {
+		printk(KERN_ERR "omfs: block count discrepancy between "
+			"super and root blocks (%llx, %llx)\n",
+			(unsigned long long)sbi->s_num_blocks,
+			(unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks));
+		goto out_brelse_bh2;
+	}
+
+	if (sbi->s_bitmap_ino != ~0ULL &&
+	    sbi->s_bitmap_ino > sbi->s_num_blocks) {
+		printk(KERN_ERR "omfs: free space bitmap location is corrupt "
+			"(%llx, total blocks %llx)\n",
+			(unsigned long long) sbi->s_bitmap_ino,
+			(unsigned long long) sbi->s_num_blocks);
+		goto out_brelse_bh2;
+	}
+	if (sbi->s_clustersize < 1 ||
+	    sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) {
+		printk(KERN_ERR "omfs: cluster size out of range (%d)",
+			sbi->s_clustersize);
+		goto out_brelse_bh2;
+	}
+
+	ret = omfs_get_imap(sb);
+	if (ret)
+		goto out_brelse_bh2;
+
+	sb->s_op = &omfs_sops;
+
+	root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir));
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out_brelse_bh2;
+	}
+
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		ret = -ENOMEM;
+		goto out_brelse_bh2;
+	}
+	printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name);
+
+	ret = 0;
+out_brelse_bh2:
+	brelse(bh2);
+out_brelse_bh:
+	brelse(bh);
+end:
+	if (ret)
+		kfree(sbi);
+	return ret;
+}
+
+static struct dentry *omfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
+}
+
+static struct file_system_type omfs_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "omfs",
+	.mount = omfs_mount,
+	.kill_sb = kill_block_super,
+	.fs_flags = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("omfs");
+
+static int __init init_omfs_fs(void)
+{
+	return register_filesystem(&omfs_fs_type);
+}
+
+static void __exit exit_omfs_fs(void)
+{
+	unregister_filesystem(&omfs_fs_type);
+}
+
+module_init(init_omfs_fs);
+module_exit(exit_omfs_fs);
diff --git a/kernel/fs/omfs/omfs.h b/kernel/fs/omfs/omfs.h
new file mode 100644
index 000000000..f0f8bc75e
--- /dev/null
+++ b/kernel/fs/omfs/omfs.h
@@ -0,0 +1,68 @@
+#ifndef _OMFS_H
+#define _OMFS_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "omfs_fs.h"
+
+/* In-memory structures */
+struct omfs_sb_info {
+	u64 s_num_blocks;
+	u64 s_bitmap_ino;
+	u64 s_root_ino;
+	u32 s_blocksize;
+	u32 s_mirrors;
+	u32 s_sys_blocksize;
+	u32 s_clustersize;
+	int s_block_shift;
+	unsigned long **s_imap;
+	int s_imap_size;
+	struct mutex s_bitmap_lock;
+	kuid_t s_uid;
+	kgid_t s_gid;
+	int s_dmask;
+	int s_fmask;
+};
+
+/* convert a cluster number to a scaled block number */
+static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block)
+{
+	return block << sbi->s_block_shift;
+}
+
+static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* bitmap.c */
+extern unsigned long omfs_count_free(struct super_block *sb);
+extern int omfs_allocate_block(struct super_block *sb, u64 block);
+extern int omfs_allocate_range(struct super_block *sb, int min_request,
+			int max_request, u64 *return_block, int *return_size);
+extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
+
+/* dir.c */
+extern const struct file_operations omfs_dir_operations;
+extern const struct inode_operations omfs_dir_inops;
+extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
+extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
+			u64 fsblock);
+
+/* file.c */
+extern const struct file_operations omfs_file_operations;
+extern const struct inode_operations omfs_file_inops;
+extern const struct address_space_operations omfs_aops;
+extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
+extern int omfs_shrink_inode(struct inode *inode);
+
+/* inode.c */
+extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block);
+extern struct inode *omfs_iget(struct super_block *sb, ino_t inode);
+extern struct inode *omfs_new_inode(struct inode *dir, umode_t mode);
+extern int omfs_reserve_block(struct super_block *sb, sector_t block);
+extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino);
+extern int omfs_sync_inode(struct inode *inode);
+
+#endif
diff --git a/kernel/fs/omfs/omfs_fs.h b/kernel/fs/omfs/omfs_fs.h
new file mode 100644
index 000000000..83a98330e
--- /dev/null
+++ b/kernel/fs/omfs/omfs_fs.h
@@ -0,0 +1,82 @@
+#ifndef _OMFS_FS_H
+#define _OMFS_FS_H
+
+/* OMFS On-disk structures */
+
+#define OMFS_MAGIC 0xC2993D87
+#define OMFS_IMAGIC 0xD2
+
+#define OMFS_DIR 'D'
+#define OMFS_FILE 'F'
+#define OMFS_INODE_NORMAL 'e'
+#define OMFS_INODE_CONTINUATION 'c'
+#define OMFS_INODE_SYSTEM 's'
+#define OMFS_NAMELEN 256
+#define OMFS_DIR_START 0x1b8
+#define OMFS_EXTENT_START 0x1d0
+#define OMFS_EXTENT_CONT 0x40
+#define OMFS_XOR_COUNT 19
+#define OMFS_MAX_BLOCK_SIZE 8192
+#define OMFS_MAX_CLUSTER_SIZE 8
+#define OMFS_MAX_BLOCKS (1ul << 31)
+
+struct omfs_super_block {
+	char s_fill1[256];
+	__be64 s_root_block;		/* block number of omfs_root_block */
+	__be64 s_num_blocks;		/* total number of FS blocks */
+	__be32 s_magic;			/* OMFS_MAGIC */
+	__be32 s_blocksize;		/* size of a block */
+	__be32 s_mirrors;		/* # of mirrors of system blocks */
+	__be32 s_sys_blocksize;		/* size of non-data blocks */
+};
+
+struct omfs_header {
+	__be64 h_self;			/* FS block where this is located */
+	__be32 h_body_size;		/* size of useful data after header */
+	__be16 h_crc;			/* crc-ccitt of body_size bytes */
+	char h_fill1[2];
+	u8 h_version;			/* version, always 1 */
+	char h_type;			/* OMFS_INODE_X */
+	u8 h_magic;			/* OMFS_IMAGIC */
+	u8 h_check_xor;			/* XOR of header bytes before this */
+	__be32 h_fill2;
+};
+
+struct omfs_root_block {
+	struct omfs_header r_head;	/* header */
+	__be64 r_fill1;
+	__be64 r_num_blocks;		/* total number of FS blocks */
+	__be64 r_root_dir;		/* block # of root directory */
+	__be64 r_bitmap;		/* block # of free space bitmap */
+	__be32 r_blocksize;		/* size of a block */
+	__be32 r_clustersize;		/* size allocated for data blocks */
+	__be64 r_mirrors;		/* # of mirrors of system blocks */
+	char r_name[OMFS_NAMELEN];	/* partition label */
+};
+
+struct omfs_inode {
+	struct omfs_header i_head;	/* header */
+	__be64 i_parent;		/* parent containing this inode */
+	__be64 i_sibling;		/* next inode in hash bucket */
+	__be64 i_ctime;			/* ctime, in milliseconds */
+	char i_fill1[35];
+	char i_type;			/* OMFS_[DIR,FILE] */
+	__be32 i_fill2;
+	char i_fill3[64];
+	char i_name[OMFS_NAMELEN];	/* filename */
+	__be64 i_size;			/* size of file, in bytes */
+};
+
+struct omfs_extent_entry {
+	__be64 e_cluster;		/* start location of a set of blocks */
+	__be64 e_blocks;		/* number of blocks after e_cluster */
+};
+
+struct omfs_extent {
+	__be64 e_next;			/* next extent table location */
+	__be32 e_extent_count;		/* total # extents in this table */
+	__be32 e_fill;
+	struct omfs_extent_entry e_entry;	/* start of extent entries */
+};
+
+#endif
diff --git a/kernel/fs/open.c b/kernel/fs/open.c
new file mode 100644
index 000000000..98e5a52dc
--- /dev/null
+++ b/kernel/fs/open.c
@@ -0,0 +1,1143 @@
+/*
+ *  linux/fs/open.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/fsnotify.h>
+#include <linux/module.h>
+#include <linux/tty.h>
+#include <linux/namei.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/securebits.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/fs.h>
+#include <linux/personality.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/rcupdate.h>
+#include <linux/audit.h>
+#include <linux/falloc.h>
+#include <linux/fs_struct.h>
+#include <linux/ima.h>
+#include <linux/dnotify.h>
+#include <linux/compat.h>
+
+#include "internal.h"
+
+int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+	struct file *filp)
+{
+	int ret;
+	struct iattr newattrs;
+
+	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
+	if (length < 0)
+		return -EINVAL;
+
+	newattrs.ia_size = length;
+	newattrs.ia_valid = ATTR_SIZE | time_attrs;
+	if (filp) {
+		newattrs.ia_file = filp;
+		newattrs.ia_valid |= ATTR_FILE;
+	}
+
+	/* Remove suid/sgid on truncate too */
+	ret = should_remove_suid(dentry);
+	if (ret)
+		newattrs.ia_valid |= ret | ATTR_FORCE;
+
+	mutex_lock(&dentry->d_inode->i_mutex);
+	/* Note any delegations or leases have already been broken: */
+	ret = notify_change(dentry, &newattrs, NULL);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+	return ret;
+}
+
+long vfs_truncate(struct path *path, loff_t length)
+{
+	struct inode *inode;
+	long error;
+
+	inode = path->dentry->d_inode;
+
+	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
+	if (S_ISDIR(inode->i_mode))
+		return -EISDIR;
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	error = mnt_want_write(path->mnt);
+	if (error)
+		goto out;
+
+	error = inode_permission(inode, MAY_WRITE);
+	if (error)
+		goto mnt_drop_write_and_out;
+
+	error = -EPERM;
+	if (IS_APPEND(inode))
+		goto mnt_drop_write_and_out;
+
+	error = get_write_access(inode);
+	if (error)
+		goto mnt_drop_write_and_out;
+
+	/*
+	 * Make sure that there are no leases.  get_write_access() protects
+	 * against the truncate racing with a lease-granting setlease().
+	 */
+	error = break_lease(inode, O_WRONLY);
+	if (error)
+		goto put_write_and_out;
+
+	error = locks_verify_truncate(inode, NULL, length);
+	if (!error)
+		error = security_path_truncate(path);
+	if (!error)
+		error = do_truncate(path->dentry, length, 0, NULL);
+
+put_write_and_out:
+	put_write_access(inode);
+mnt_drop_write_and_out:
+	mnt_drop_write(path->mnt);
+out:
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_truncate);
+
+static long do_sys_truncate(const char __user *pathname, loff_t length)
+{
+	unsigned int lookup_flags = LOOKUP_FOLLOW;
+	struct path path;
+	int error;
+
+	if (length < 0)	/* sorry, but loff_t says... */
+		return -EINVAL;
+
+retry:
+	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+	if (!error) {
+		error = vfs_truncate(&path, length);
+		path_put(&path);
+	}
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
+{
+	return do_sys_truncate(path, length);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
+{
+	return do_sys_truncate(path, length);
+}
+#endif
+
+static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+{
+	struct inode *inode;
+	struct dentry *dentry;
+	struct fd f;
+	int error;
+
+	error = -EINVAL;
+	if (length < 0)
+		goto out;
+	error = -EBADF;
+	f = fdget(fd);
+	if (!f.file)
+		goto out;
+
+	/* explicitly opened as large or we are on 64-bit box */
+	if (f.file->f_flags & O_LARGEFILE)
+		small = 0;
+
+	dentry = f.file->f_path.dentry;
+	inode = dentry->d_inode;
+	error = -EINVAL;
+	if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
+		goto out_putf;
+
+	error = -EINVAL;
+	/* Cannot ftruncate over 2^31 bytes without large file support */
+	if (small && length > MAX_NON_LFS)
+		goto out_putf;
+
+	error = -EPERM;
+	if (IS_APPEND(inode))
+		goto out_putf;
+
+	sb_start_write(inode->i_sb);
+	error = locks_verify_truncate(inode, f.file, length);
+	if (!error)
+		error = security_path_truncate(&f.file->f_path);
+	if (!error)
+		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+	sb_end_write(inode->i_sb);
+out_putf:
+	fdput(f);
+out:
+	return error;
+}
+
+SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
+{
+	return do_sys_ftruncate(fd, length, 1);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
+{
+	return do_sys_ftruncate(fd, length, 1);
+}
+#endif
+
+/* LFS versions of truncate are only needed on 32 bit machines */
+#if BITS_PER_LONG == 32
+SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
+{
+	return do_sys_truncate(path, length);
+}
+
+SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
+{
+	return do_sys_ftruncate(fd, length, 0);
+}
+#endif /* BITS_PER_LONG == 32 */
+
+
+int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	long ret;
+
+	if (offset < 0 || len <= 0)
+		return -EINVAL;
+
+	/* Return error if mode is not supported */
+	if (mode & ~FALLOC_FL_SUPPORTED_MASK)
+		return -EOPNOTSUPP;
+
+	/* Punch hole and zero range are mutually exclusive */
+	if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
+	    (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
+		return -EOPNOTSUPP;
+
+	/* Punch hole must have keep size set */
+	if ((mode & FALLOC_FL_PUNCH_HOLE) &&
+	    !(mode & FALLOC_FL_KEEP_SIZE))
+		return -EOPNOTSUPP;
+
+	/* Collapse range should only be used exclusively. */
+	if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
+	    (mode & ~FALLOC_FL_COLLAPSE_RANGE))
+		return -EINVAL;
+
+	/* Insert range should only be used exclusively. */
+	if ((mode & FALLOC_FL_INSERT_RANGE) &&
+	    (mode & ~FALLOC_FL_INSERT_RANGE))
+		return -EINVAL;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	/*
+	 * We can only allow pure fallocate on append only files
+	 */
+	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
+		return -EPERM;
+
+	if (IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	/*
+	 * We cannot allow any fallocate operation on an active swapfile
+	 */
+	if (IS_SWAPFILE(inode))
+		return -ETXTBSY;
+
+	/*
+	 * Revalidate the write permissions, in case security policy has
+	 * changed since the files were opened.
+	 */
+	ret = security_file_permission(file, MAY_WRITE);
+	if (ret)
+		return ret;
+
+	if (S_ISFIFO(inode->i_mode))
+		return -ESPIPE;
+
+	/*
+	 * Let individual file system decide if it supports preallocation
+	 * for directories or not.
+	 */
+	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		return -ENODEV;
+
+	/* Check for wrap through zero too */
+	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
+		return -EFBIG;
+
+	if (!file->f_op->fallocate)
+		return -EOPNOTSUPP;
+
+	sb_start_write(inode->i_sb);
+	ret = file->f_op->fallocate(file, mode, offset, len);
+
+	/*
+	 * Create inotify and fanotify events.
+	 *
+	 * To keep the logic simple always create events if fallocate succeeds.
+	 * This implies that events are even created if the file size remains
+	 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
+	 */
+	if (ret == 0)
+		fsnotify_modify(file);
+
+	sb_end_write(inode->i_sb);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfs_fallocate);
+
+SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
+{
+	struct fd f = fdget(fd);
+	int error = -EBADF;
+
+	if (f.file) {
+		error = vfs_fallocate(f.file, mode, offset, len);
+		fdput(f);
+	}
+	return error;
+}
+
+/*
+ * access() needs to use the real uid/gid, not the effective uid/gid.
+ * We do this by temporarily clearing all FS-related capabilities and
+ * switching the fsuid/fsgid around to the real ones.
+ */
+SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
+{
+	const struct cred *old_cred;
+	struct cred *override_cred;
+	struct path path;
+	struct inode *inode;
+	int res;
+	unsigned int lookup_flags = LOOKUP_FOLLOW;
+
+	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
+		return -EINVAL;
+
+	override_cred = prepare_creds();
+	if (!override_cred)
+		return -ENOMEM;
+
+	override_cred->fsuid = override_cred->uid;
+	override_cred->fsgid = override_cred->gid;
+
+	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+		/* Clear the capabilities if we switch to a non-root user */
+		kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
+		if (!uid_eq(override_cred->uid, root_uid))
+			cap_clear(override_cred->cap_effective);
+		else
+			override_cred->cap_effective =
+				override_cred->cap_permitted;
+	}
+
+	old_cred = override_creds(override_cred);
+retry:
+	res = user_path_at(dfd, filename, lookup_flags, &path);
+	if (res)
+		goto out;
+
+	inode = path.dentry->d_inode;
+
+	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
+		/*
+		 * MAY_EXEC on regular files is denied if the fs is mounted
+		 * with the "noexec" flag.
+		 */
+		res = -EACCES;
+		if (path.mnt->mnt_flags & MNT_NOEXEC)
+			goto out_path_release;
+	}
+
+	res = inode_permission(inode, mode | MAY_ACCESS);
+	/* SuS v2 requires we report a read only fs too */
+	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
+		goto out_path_release;
+	/*
+	 * This is a rare case where using __mnt_is_readonly()
+	 * is OK without a mnt_want/drop_write() pair.  Since
+	 * no actual write to the fs is performed here, we do
+	 * not need to telegraph to that to anyone.
+	 *
+	 * By doing this, we accept that this access is
+	 * inherently racy and know that the fs may change
+	 * state before we even see this result.
+	 */
+	if (__mnt_is_readonly(path.mnt))
+		res = -EROFS;
+
+out_path_release:
+	path_put(&path);
+	if (retry_estale(res, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+out:
+	revert_creds(old_cred);
+	put_cred(override_cred);
+	return res;
+}
+
+SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
+{
+	return sys_faccessat(AT_FDCWD, filename, mode);
+}
+
+SYSCALL_DEFINE1(chdir, const char __user *, filename)
+{
+	struct path path;
+	int error;
+	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+retry:
+	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
+	if (error)
+		goto out;
+
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	if (error)
+		goto dput_and_out;
+
+	set_fs_pwd(current->fs, &path);
+
+dput_and_out:
+	path_put(&path);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+out:
+	return error;
+}
+
+SYSCALL_DEFINE1(fchdir, unsigned int, fd)
+{
+	struct fd f = fdget_raw(fd);
+	struct inode *inode;
+	int error = -EBADF;
+
+	error = -EBADF;
+	if (!f.file)
+		goto out;
+
+	inode = file_inode(f.file);
+
+	error = -ENOTDIR;
+	if (!S_ISDIR(inode->i_mode))
+		goto out_putf;
+
+	error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
+	if (!error)
+		set_fs_pwd(current->fs, &f.file->f_path);
+out_putf:
+	fdput(f);
+out:
+	return error;
+}
+
+SYSCALL_DEFINE1(chroot, const char __user *, filename)
+{
+	struct path path;
+	int error;
+	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+retry:
+	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
+	if (error)
+		goto out;
+
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	if (error)
+		goto dput_and_out;
+
+	error = -EPERM;
+	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
+		goto dput_and_out;
+	error = security_path_chroot(&path);
+	if (error)
+		goto dput_and_out;
+
+	set_fs_root(current->fs, &path);
+	error = 0;
+dput_and_out:
+	path_put(&path);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+out:
+	return error;
+}
+
+static int chmod_common(struct path *path, umode_t mode)
+{
+	struct inode *inode = path->dentry->d_inode;
+	struct inode *delegated_inode = NULL;
+	struct iattr newattrs;
+	int error;
+
+	error = mnt_want_write(path->mnt);
+	if (error)
+		return error;
+retry_deleg:
+	mutex_lock(&inode->i_mutex);
+	error = security_path_chmod(path, mode);
+	if (error)
+		goto out_unlock;
+	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+	error = notify_change(path->dentry, &newattrs, &delegated_inode);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+	mnt_drop_write(path->mnt);
+	return error;
+}
+
+SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
+{
+	struct fd f = fdget(fd);
+	int err = -EBADF;
+
+	if (f.file) {
+		audit_file(f.file);
+		err = chmod_common(&f.file->f_path, mode);
+		fdput(f);
+	}
+	return err;
+}
+
+SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode)
+{
+	struct path path;
+	int error;
+	unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+	error = user_path_at(dfd, filename, lookup_flags, &path);
+	if (!error) {
+		error = chmod_common(&path, mode);
+		path_put(&path);
+		if (retry_estale(error, lookup_flags)) {
+			lookup_flags |= LOOKUP_REVAL;
+			goto retry;
+		}
+	}
+	return error;
+}
+
+SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
+{
+	return sys_fchmodat(AT_FDCWD, filename, mode);
+}
+
+static int chown_common(struct path *path, uid_t user, gid_t group)
+{
+	struct inode *inode = path->dentry->d_inode;
+	struct inode *delegated_inode = NULL;
+	int error;
+	struct iattr newattrs;
+	kuid_t uid;
+	kgid_t gid;
+
+	uid = make_kuid(current_user_ns(), user);
+	gid = make_kgid(current_user_ns(), group);
+
+retry_deleg:
+	newattrs.ia_valid =  ATTR_CTIME;
+	if (user != (uid_t) -1) {
+		if (!uid_valid(uid))
+			return -EINVAL;
+		newattrs.ia_valid |= ATTR_UID;
+		newattrs.ia_uid = uid;
+	}
+	if (group != (gid_t) -1) {
+		if (!gid_valid(gid))
+			return -EINVAL;
+		newattrs.ia_valid |= ATTR_GID;
+		newattrs.ia_gid = gid;
+	}
+	if (!S_ISDIR(inode->i_mode))
+		newattrs.ia_valid |=
+			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+	mutex_lock(&inode->i_mutex);
+	error = security_path_chown(path, uid, gid);
+	if (!error)
+		error = notify_change(path->dentry, &newattrs, &delegated_inode);
+	mutex_unlock(&inode->i_mutex);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
+		gid_t, group, int, flag)
+{
+	struct path path;
+	int error = -EINVAL;
+	int lookup_flags;
+
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		goto out;
+
+	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+retry:
+	error = user_path_at(dfd, filename, lookup_flags, &path);
+	if (error)
+		goto out;
+	error = mnt_want_write(path.mnt);
+	if (error)
+		goto out_release;
+	error = chown_common(&path, user, group);
+	mnt_drop_write(path.mnt);
+out_release:
+	path_put(&path);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+out:
+	return error;
+}
+
+SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
+{
+	return sys_fchownat(AT_FDCWD, filename, user, group, 0);
+}
+
+SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
+{
+	return sys_fchownat(AT_FDCWD, filename, user, group,
+			    AT_SYMLINK_NOFOLLOW);
+}
+
+SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
+{
+	struct fd f = fdget(fd);
+	int error = -EBADF;
+
+	if (!f.file)
+		goto out;
+
+	error = mnt_want_write_file(f.file);
+	if (error)
+		goto out_fput;
+	audit_file(f.file);
+	error = chown_common(&f.file->f_path, user, group);
+	mnt_drop_write_file(f.file);
+out_fput:
+	fdput(f);
+out:
+	return error;
+}
+
+int open_check_o_direct(struct file *f)
+{
+	/* NB: we're sure to have correct a_ops only after f_op->open */
+	if (f->f_flags & O_DIRECT) {
+		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static int do_dentry_open(struct file *f,
+			  int (*open)(struct inode *, struct file *),
+			  const struct cred *cred)
+{
+	static const struct file_operations empty_fops = {};
+	struct inode *inode;
+	int error;
+
+	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
+				FMODE_PREAD | FMODE_PWRITE;
+
+	path_get(&f->f_path);
+	inode = f->f_inode = f->f_path.dentry->d_inode;
+	f->f_mapping = inode->i_mapping;
+
+	if (unlikely(f->f_flags & O_PATH)) {
+		f->f_mode = FMODE_PATH;
+		f->f_op = &empty_fops;
+		return 0;
+	}
+
+	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+		error = get_write_access(inode);
+		if (unlikely(error))
+			goto cleanup_file;
+		error = __mnt_want_write(f->f_path.mnt);
+		if (unlikely(error)) {
+			put_write_access(inode);
+			goto cleanup_file;
+		}
+		f->f_mode |= FMODE_WRITER;
+	}
+
+	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
+	if (S_ISREG(inode->i_mode))
+		f->f_mode |= FMODE_ATOMIC_POS;
+
+	f->f_op = fops_get(inode->i_fop);
+	if (unlikely(WARN_ON(!f->f_op))) {
+		error = -ENODEV;
+		goto cleanup_all;
+	}
+
+	error = security_file_open(f, cred);
+	if (error)
+		goto cleanup_all;
+
+	error = break_lease(inode, f->f_flags);
+	if (error)
+		goto cleanup_all;
+
+	if (!open)
+		open = f->f_op->open;
+	if (open) {
+		error = open(inode, f);
+		if (error)
+			goto cleanup_all;
+	}
+	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		i_readcount_inc(inode);
+	if ((f->f_mode & FMODE_READ) &&
+	     likely(f->f_op->read || f->f_op->read_iter))
+		f->f_mode |= FMODE_CAN_READ;
+	if ((f->f_mode & FMODE_WRITE) &&
+	     likely(f->f_op->write || f->f_op->write_iter))
+		f->f_mode |= FMODE_CAN_WRITE;
+
+	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
+
+	return 0;
+
+cleanup_all:
+	fops_put(f->f_op);
+	if (f->f_mode & FMODE_WRITER) {
+		put_write_access(inode);
+		__mnt_drop_write(f->f_path.mnt);
+	}
+cleanup_file:
+	path_put(&f->f_path);
+	f->f_path.mnt = NULL;
+	f->f_path.dentry = NULL;
+	f->f_inode = NULL;
+	return error;
+}
+
+/**
+ * finish_open - finish opening a file
+ * @file: file pointer
+ * @dentry: pointer to dentry
+ * @open: open callback
+ * @opened: state of open
+ *
+ * This can be used to finish opening a file passed to i_op->atomic_open().
+ *
+ * If the open callback is set to NULL, then the standard f_op->open()
+ * filesystem callback is substituted.
+ *
+ * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
+ * the return value of d_splice_alias(), then the caller needs to perform dput()
+ * on it after finish_open().
+ *
+ * On successful return @file is a fully instantiated open file.  After this, if
+ * an error occurs in ->atomic_open(), it needs to clean up with fput().
+ *
+ * Returns zero on success or -errno if the open failed.
+ */
+int finish_open(struct file *file, struct dentry *dentry,
+		int (*open)(struct inode *, struct file *),
+		int *opened)
+{
+	int error;
+	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
+
+	file->f_path.dentry = dentry;
+	error = do_dentry_open(file, open, current_cred());
+	if (!error)
+		*opened |= FILE_OPENED;
+
+	return error;
+}
+EXPORT_SYMBOL(finish_open);
+
+/**
+ * finish_no_open - finish ->atomic_open() without opening the file
+ *
+ * @file: file pointer
+ * @dentry: dentry or NULL (as returned from ->lookup())
+ *
+ * This can be used to set the result of a successful lookup in ->atomic_open().
+ *
+ * NB: unlike finish_open() this function does consume the dentry reference and
+ * the caller need not dput() it.
+ *
+ * Returns "1" which must be the return value of ->atomic_open() after having
+ * called this function.
+ */
+int finish_no_open(struct file *file, struct dentry *dentry)
+{
+	file->f_path.dentry = dentry;
+	return 1;
+}
+EXPORT_SYMBOL(finish_no_open);
+
+struct file *dentry_open(const struct path *path, int flags,
+			 const struct cred *cred)
+{
+	int error;
+	struct file *f;
+
+	validate_creds(cred);
+
+	/* We must always pass in a valid mount pointer. */
+	BUG_ON(!path->mnt);
+
+	f = get_empty_filp();
+	if (!IS_ERR(f)) {
+		f->f_flags = flags;
+		error = vfs_open(path, f, cred);
+		if (!error) {
+			/* from now on we need fput() to dispose of f */
+			error = open_check_o_direct(f);
+			if (error) {
+				fput(f);
+				f = ERR_PTR(error);
+			}
+		} else { 
+			put_filp(f);
+			f = ERR_PTR(error);
+		}
+	}
+	return f;
+}
+EXPORT_SYMBOL(dentry_open);
+
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @filp: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *filp,
+	     const struct cred *cred)
+{
+	struct inode *inode = path->dentry->d_inode;
+
+	if (inode->i_op->dentry_open)
+		return inode->i_op->dentry_open(path->dentry, filp, cred);
+	else {
+		filp->f_path = *path;
+		return do_dentry_open(filp, NULL, cred);
+	}
+}
+EXPORT_SYMBOL(vfs_open);
+
+static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
+{
+	int lookup_flags = 0;
+	int acc_mode;
+
+	if (flags & (O_CREAT | __O_TMPFILE))
+		op->mode = (mode & S_IALLUGO) | S_IFREG;
+	else
+		op->mode = 0;
+
+	/* Must never be set by userspace */
+	flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
+
+	/*
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
+	 */
+	if (flags & __O_SYNC)
+		flags |= O_DSYNC;
+
+	if (flags & __O_TMPFILE) {
+		if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
+			return -EINVAL;
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+		if (!(acc_mode & MAY_WRITE))
+			return -EINVAL;
+	} else if (flags & O_PATH) {
+		/*
+		 * If we have O_PATH in the open flag. Then we
+		 * cannot have anything other than the below set of flags
+		 */
+		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+		acc_mode = 0;
+	} else {
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+	}
+
+	op->open_flag = flags;
+
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (flags & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
+	/* Allow the LSM permission hook to distinguish append
+	   access from general write access. */
+	if (flags & O_APPEND)
+		acc_mode |= MAY_APPEND;
+
+	op->acc_mode = acc_mode;
+
+	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
+
+	if (flags & O_CREAT) {
+		op->intent |= LOOKUP_CREATE;
+		if (flags & O_EXCL)
+			op->intent |= LOOKUP_EXCL;
+	}
+
+	if (flags & O_DIRECTORY)
+		lookup_flags |= LOOKUP_DIRECTORY;
+	if (!(flags & O_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+	op->lookup_flags = lookup_flags;
+	return 0;
+}
+
+/**
+ * file_open_name - open file and return file pointer
+ *
+ * @name:	struct filename containing path to open
+ * @flags:	open flags as per the open(2) second argument
+ * @mode:	mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to.  But in generally you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *file_open_name(struct filename *name, int flags, umode_t mode)
+{
+	struct open_flags op;
+	int err = build_open_flags(flags, mode, &op);
+	return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
+}
+
+/**
+ * filp_open - open file and return file pointer
+ *
+ * @filename:	path to open
+ * @flags:	open flags as per the open(2) second argument
+ * @mode:	mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to.  But in generally you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, umode_t mode)
+{
+	struct filename *name = getname_kernel(filename);
+	struct file *file = ERR_CAST(name);
+	
+	if (!IS_ERR(name)) {
+		file = file_open_name(name, flags, mode);
+		putname(name);
+	}
+	return file;
+}
+EXPORT_SYMBOL(filp_open);
+
+struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+			    const char *filename, int flags)
+{
+	struct open_flags op;
+	int err = build_open_flags(flags, 0, &op);
+	if (err)
+		return ERR_PTR(err);
+	if (flags & O_CREAT)
+		return ERR_PTR(-EINVAL);
+	return do_file_open_root(dentry, mnt, filename, &op);
+}
+EXPORT_SYMBOL(file_open_root);
+
+long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
+{
+	struct open_flags op;
+	int fd = build_open_flags(flags, mode, &op);
+	struct filename *tmp;
+
+	if (fd)
+		return fd;
+
+	tmp = getname(filename);
+	if (IS_ERR(tmp))
+		return PTR_ERR(tmp);
+
+	fd = get_unused_fd_flags(flags);
+	if (fd >= 0) {
+		struct file *f = do_filp_open(dfd, tmp, &op);
+		if (IS_ERR(f)) {
+			put_unused_fd(fd);
+			fd = PTR_ERR(f);
+		} else {
+			fsnotify_open(f);
+			fd_install(fd, f);
+		}
+	}
+	putname(tmp);
+	return fd;
+}
+
+SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+{
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
+}
+
+SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
+		umode_t, mode)
+{
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	return do_sys_open(dfd, filename, flags, mode);
+}
+
+#ifndef __alpha__
+
+/*
+ * For backward compatibility?  Maybe this should be moved
+ * into arch/i386 instead?
+ */
+SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
+{
+	return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
+}
+
+#endif
+
+/*
+ * "id" is the POSIX thread ID. We use the
+ * files pointer for this..
+ */
+int filp_close(struct file *filp, fl_owner_t id)
+{
+	int retval = 0;
+
+	if (!file_count(filp)) {
+		printk(KERN_ERR "VFS: Close: file count is 0\n");
+		return 0;
+	}
+
+	if (filp->f_op->flush)
+		retval = filp->f_op->flush(filp, id);
+
+	if (likely(!(filp->f_mode & FMODE_PATH))) {
+		dnotify_flush(filp, id);
+		locks_remove_posix(filp, id);
+	}
+	fput(filp);
+	return retval;
+}
+
+EXPORT_SYMBOL(filp_close);
+
+/*
+ * Careful here! We test whether the file pointer is NULL before
+ * releasing the fd. This ensures that one clone task can't release
+ * an fd while another clone is opening it.
+ */
+SYSCALL_DEFINE1(close, unsigned int, fd)
+{
+	int retval = __close_fd(current->files, fd);
+
+	/* can't restart close syscall because file table entry was cleared */
+	if (unlikely(retval == -ERESTARTSYS ||
+		     retval == -ERESTARTNOINTR ||
+		     retval == -ERESTARTNOHAND ||
+		     retval == -ERESTART_RESTARTBLOCK))
+		retval = -EINTR;
+
+	return retval;
+}
+EXPORT_SYMBOL(sys_close);
+
+/*
+ * This routine simulates a hangup on the tty, to arrange that users
+ * are given clean terminals at login time.
+ */
+SYSCALL_DEFINE0(vhangup)
+{
+	if (capable(CAP_SYS_TTY_CONFIG)) {
+		tty_vhangup_self();
+		return 0;
+	}
+	return -EPERM;
+}
+
+/*
+ * Called when an inode is about to be open.
+ * We use this to disallow opening large files on 32bit systems if
+ * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
+ * on this flag in sys_open.
+ */
+int generic_file_open(struct inode * inode, struct file * filp)
+{
+	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+		return -EOVERFLOW;
+	return 0;
+}
+
+EXPORT_SYMBOL(generic_file_open);
+
+/*
+ * This is used by subsystems that don't want seekable
+ * file descriptors. The function is not supposed to ever fail, the only
+ * reason it returns an 'int' and not 'void' is so that it can be plugged
+ * directly into file_operations structure.
+ */
+int nonseekable_open(struct inode *inode, struct file *filp)
+{
+	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+	return 0;
+}
+
+EXPORT_SYMBOL(nonseekable_open);
diff --git a/kernel/fs/openpromfs/Makefile b/kernel/fs/openpromfs/Makefile
new file mode 100644
index 000000000..4563199b8
--- /dev/null
+++ b/kernel/fs/openpromfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux Sun Openprom filesystem routines.
+#
+
+obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs.o
+
+openpromfs-objs := inode.o
diff --git a/kernel/fs/openpromfs/inode.c b/kernel/fs/openpromfs/inode.c
new file mode 100644
index 000000000..15e4500cd
--- /dev/null
+++ b/kernel/fs/openpromfs/inode.c
@@ -0,0 +1,471 @@
+/* inode.c: /proc/openprom handling routines
+ *
+ * Copyright (C) 1996-1999 Jakub Jelinek  (jakub@redhat.com)
+ * Copyright (C) 1998      Eddie C. Dost  (ecd@skynet.be)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/magic.h>
+
+#include <asm/openprom.h>
+#include <asm/oplib.h>
+#include <asm/prom.h>
+#include <asm/uaccess.h>
+
+static DEFINE_MUTEX(op_mutex);
+
+#define OPENPROM_ROOT_INO	0
+
+enum op_inode_type {
+	op_inode_node,
+	op_inode_prop,
+};
+
+union op_inode_data {
+	struct device_node	*node;
+	struct property		*prop;
+};
+
+struct op_inode_info {
+	struct inode		vfs_inode;
+	enum op_inode_type	type;
+	union op_inode_data	u;
+};
+
+static struct inode *openprom_iget(struct super_block *sb, ino_t ino);
+
+static inline struct op_inode_info *OP_I(struct inode *inode)
+{
+	return container_of(inode, struct op_inode_info, vfs_inode);
+}
+
+static int is_string(unsigned char *p, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		unsigned char val = p[i];
+
+		if ((i && !val) ||
+		    (val >= ' ' && val <= '~'))
+			continue;
+
+		return 0;
+	}
+
+	return 1;
+}
+
+static int property_show(struct seq_file *f, void *v)
+{
+	struct property *prop = f->private;
+	void *pval;
+	int len;
+
+	len = prop->length;
+	pval = prop->value;
+
+	if (is_string(pval, len)) {
+		while (len > 0) {
+			int n = strlen(pval);
+
+			seq_printf(f, "%s", (char *) pval);
+
+			/* Skip over the NULL byte too.  */
+			pval += n + 1;
+			len -= n + 1;
+
+			if (len > 0)
+				seq_printf(f, " + ");
+		}
+	} else {
+		if (len & 3) {
+			while (len) {
+				len--;
+				if (len)
+					seq_printf(f, "%02x.",
+						   *(unsigned char *) pval);
+				else
+					seq_printf(f, "%02x",
+						   *(unsigned char *) pval);
+				pval++;
+			}
+		} else {
+			while (len >= 4) {
+				len -= 4;
+
+				if (len)
+					seq_printf(f, "%08x.",
+						   *(unsigned int *) pval);
+				else
+					seq_printf(f, "%08x",
+						   *(unsigned int *) pval);
+				pval += 4;
+			}
+		}
+	}
+	seq_printf(f, "\n");
+
+	return 0;
+}
+
+static void *property_start(struct seq_file *f, loff_t *pos)
+{
+	if (*pos == 0)
+		return pos;
+	return NULL;
+}
+
+static void *property_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return NULL;
+}
+
+static void property_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+
+static const struct seq_operations property_op = {
+	.start		= property_start,
+	.next		= property_next,
+	.stop		= property_stop,
+	.show		= property_show
+};
+
+static int property_open(struct inode *inode, struct file *file)
+{
+	struct op_inode_info *oi = OP_I(inode);
+	int ret;
+
+	BUG_ON(oi->type != op_inode_prop);
+
+	ret = seq_open(file, &property_op);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = oi->u.prop;
+	}
+	return ret;
+}
+
+static const struct file_operations openpromfs_prop_ops = {
+	.open		= property_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int openpromfs_readdir(struct file *, struct dir_context *);
+
+static const struct file_operations openprom_operations = {
+	.read		= generic_read_dir,
+	.iterate	= openpromfs_readdir,
+	.llseek		= generic_file_llseek,
+};
+
+static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, unsigned int);
+
+static const struct inode_operations openprom_inode_operations = {
+	.lookup		= openpromfs_lookup,
+};
+
+static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	struct op_inode_info *ent_oi, *oi = OP_I(dir);
+	struct device_node *dp, *child;
+	struct property *prop;
+	enum op_inode_type ent_type;
+	union op_inode_data ent_data;
+	const char *name;
+	struct inode *inode;
+	unsigned int ino;
+	int len;
+	
+	BUG_ON(oi->type != op_inode_node);
+
+	dp = oi->u.node;
+
+	name = dentry->d_name.name;
+	len = dentry->d_name.len;
+
+	mutex_lock(&op_mutex);
+
+	child = dp->child;
+	while (child) {
+		int n = strlen(child->path_component_name);
+
+		if (len == n &&
+		    !strncmp(child->path_component_name, name, len)) {
+			ent_type = op_inode_node;
+			ent_data.node = child;
+			ino = child->unique_id;
+			goto found;
+		}
+		child = child->sibling;
+	}
+
+	prop = dp->properties;
+	while (prop) {
+		int n = strlen(prop->name);
+
+		if (len == n && !strncmp(prop->name, name, len)) {
+			ent_type = op_inode_prop;
+			ent_data.prop = prop;
+			ino = prop->unique_id;
+			goto found;
+		}
+
+		prop = prop->next;
+	}
+
+	mutex_unlock(&op_mutex);
+	return ERR_PTR(-ENOENT);
+
+found:
+	inode = openprom_iget(dir->i_sb, ino);
+	mutex_unlock(&op_mutex);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	ent_oi = OP_I(inode);
+	ent_oi->type = ent_type;
+	ent_oi->u = ent_data;
+
+	switch (ent_type) {
+	case op_inode_node:
+		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+		inode->i_op = &openprom_inode_operations;
+		inode->i_fop = &openprom_operations;
+		set_nlink(inode, 2);
+		break;
+	case op_inode_prop:
+		if (!strcmp(dp->name, "options") && (len == 17) &&
+		    !strncmp (name, "security-password", 17))
+			inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
+		else
+			inode->i_mode = S_IFREG | S_IRUGO;
+		inode->i_fop = &openpromfs_prop_ops;
+		set_nlink(inode, 1);
+		inode->i_size = ent_oi->u.prop->length;
+		break;
+	}
+
+	d_add(dentry, inode);
+	return NULL;
+}
+
+static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct op_inode_info *oi = OP_I(inode);
+	struct device_node *dp = oi->u.node;
+	struct device_node *child;
+	struct property *prop;
+	int i;
+
+	mutex_lock(&op_mutex);
+	
+	if (ctx->pos == 0) {
+		if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+			goto out;
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2,
+			    (dp->parent == NULL ?
+			     OPENPROM_ROOT_INO :
+			     dp->parent->unique_id), DT_DIR))
+			goto out;
+		ctx->pos = 2;
+	}
+	i = ctx->pos - 2;
+
+	/* First, the children nodes as directories.  */
+	child = dp->child;
+	while (i && child) {
+		child = child->sibling;
+		i--;
+	}
+	while (child) {
+		if (!dir_emit(ctx,
+			    child->path_component_name,
+			    strlen(child->path_component_name),
+			    child->unique_id, DT_DIR))
+			goto out;
+
+		ctx->pos++;
+		child = child->sibling;
+	}
+
+	/* Next, the properties as files.  */
+	prop = dp->properties;
+	while (i && prop) {
+		prop = prop->next;
+		i--;
+	}
+	while (prop) {
+		if (!dir_emit(ctx, prop->name, strlen(prop->name),
+			    prop->unique_id, DT_REG))
+			goto out;
+
+		ctx->pos++;
+		prop = prop->next;
+	}
+
+out:
+	mutex_unlock(&op_mutex);
+	return 0;
+}
+
+static struct kmem_cache *op_inode_cachep;
+
+static struct inode *openprom_alloc_inode(struct super_block *sb)
+{
+	struct op_inode_info *oi;
+
+	oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL);
+	if (!oi)
+		return NULL;
+
+	return &oi->vfs_inode;
+}
+
+static void openprom_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(op_inode_cachep, OP_I(inode));
+}
+
+static void openprom_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, openprom_i_callback);
+}
+
+static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
+{
+	struct inode *inode;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (inode->i_state & I_NEW) {
+		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		if (inode->i_ino == OPENPROM_ROOT_INO) {
+			inode->i_op = &openprom_inode_operations;
+			inode->i_fop = &openprom_operations;
+			inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+		}
+		unlock_new_inode(inode);
+	}
+	return inode;
+}
+
+static int openprom_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_NOATIME;
+	return 0;
+}
+
+static const struct super_operations openprom_sops = {
+	.alloc_inode	= openprom_alloc_inode,
+	.destroy_inode	= openprom_destroy_inode,
+	.statfs		= simple_statfs,
+	.remount_fs	= openprom_remount,
+};
+
+static int openprom_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct inode *root_inode;
+	struct op_inode_info *oi;
+	int ret;
+
+	s->s_flags |= MS_NOATIME;
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = OPENPROM_SUPER_MAGIC;
+	s->s_op = &openprom_sops;
+	s->s_time_gran = 1;
+	root_inode = openprom_iget(s, OPENPROM_ROOT_INO);
+	if (IS_ERR(root_inode)) {
+		ret = PTR_ERR(root_inode);
+		goto out_no_root;
+	}
+
+	oi = OP_I(root_inode);
+	oi->type = op_inode_node;
+	oi->u.node = of_find_node_by_path("/");
+
+	s->s_root = d_make_root(root_inode);
+	if (!s->s_root)
+		goto out_no_root_dentry;
+	return 0;
+
+out_no_root_dentry:
+	ret = -ENOMEM;
+out_no_root:
+	printk("openprom_fill_super: get root inode failed\n");
+	return ret;
+}
+
+static struct dentry *openprom_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_single(fs_type, flags, data, openprom_fill_super);
+}
+
+static struct file_system_type openprom_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "openpromfs",
+	.mount		= openprom_mount,
+	.kill_sb	= kill_anon_super,
+};
+MODULE_ALIAS_FS("openpromfs");
+
+static void op_inode_init_once(void *data)
+{
+	struct op_inode_info *oi = (struct op_inode_info *) data;
+
+	inode_init_once(&oi->vfs_inode);
+}
+
+static int __init init_openprom_fs(void)
+{
+	int err;
+
+	op_inode_cachep = kmem_cache_create("op_inode_cache",
+					    sizeof(struct op_inode_info),
+					    0,
+					    (SLAB_RECLAIM_ACCOUNT |
+					     SLAB_MEM_SPREAD),
+					    op_inode_init_once);
+	if (!op_inode_cachep)
+		return -ENOMEM;
+
+	err = register_filesystem(&openprom_fs_type);
+	if (err)
+		kmem_cache_destroy(op_inode_cachep);
+
+	return err;
+}
+
+static void __exit exit_openprom_fs(void)
+{
+	unregister_filesystem(&openprom_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(op_inode_cachep);
+}
+
+module_init(init_openprom_fs)
+module_exit(exit_openprom_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/overlayfs/Kconfig b/kernel/fs/overlayfs/Kconfig
new file mode 100644
index 000000000..34355818a
--- /dev/null
+++ b/kernel/fs/overlayfs/Kconfig
@@ -0,0 +1,10 @@
+config OVERLAY_FS
+	tristate "Overlay filesystem support"
+	help
+	  An overlay filesystem combines two filesystems - an 'upper' filesystem
+	  and a 'lower' filesystem.  When a name exists in both filesystems, the
+	  object in the 'upper' filesystem is visible while the object in the
+	  'lower' filesystem is either hidden or, in the case of directories,
+	  merged with the 'upper' object.
+
+	  For more information see Documentation/filesystems/overlayfs.txt
diff --git a/kernel/fs/overlayfs/Makefile b/kernel/fs/overlayfs/Makefile
new file mode 100644
index 000000000..900daed3e
--- /dev/null
+++ b/kernel/fs/overlayfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the overlay filesystem.
+#
+
+obj-$(CONFIG_OVERLAY_FS) += overlay.o
+
+overlay-objs := super.o inode.o dir.o readdir.o copy_up.o
diff --git a/kernel/fs/overlayfs/copy_up.c b/kernel/fs/overlayfs/copy_up.c
new file mode 100644
index 000000000..84d693d37
--- /dev/null
+++ b/kernel/fs/overlayfs/copy_up.c
@@ -0,0 +1,416 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/splice.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include "overlayfs.h"
+
+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
+
+int ovl_copy_xattr(struct dentry *old, struct dentry *new)
+{
+	ssize_t list_size, size;
+	char *buf, *name, *value;
+	int error;
+
+	if (!old->d_inode->i_op->getxattr ||
+	    !new->d_inode->i_op->getxattr)
+		return 0;
+
+	list_size = vfs_listxattr(old, NULL, 0);
+	if (list_size <= 0) {
+		if (list_size == -EOPNOTSUPP)
+			return 0;
+		return list_size;
+	}
+
+	buf = kzalloc(list_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	error = -ENOMEM;
+	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
+	if (!value)
+		goto out;
+
+	list_size = vfs_listxattr(old, buf, list_size);
+	if (list_size <= 0) {
+		error = list_size;
+		goto out_free_value;
+	}
+
+	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
+		if (size <= 0) {
+			error = size;
+			goto out_free_value;
+		}
+		error = vfs_setxattr(new, name, value, size, 0);
+		if (error)
+			goto out_free_value;
+	}
+
+out_free_value:
+	kfree(value);
+out:
+	kfree(buf);
+	return error;
+}
+
+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
+{
+	struct file *old_file;
+	struct file *new_file;
+	loff_t old_pos = 0;
+	loff_t new_pos = 0;
+	int error = 0;
+
+	if (len == 0)
+		return 0;
+
+	old_file = ovl_path_open(old, O_RDONLY);
+	if (IS_ERR(old_file))
+		return PTR_ERR(old_file);
+
+	new_file = ovl_path_open(new, O_WRONLY);
+	if (IS_ERR(new_file)) {
+		error = PTR_ERR(new_file);
+		goto out_fput;
+	}
+
+	/* FIXME: copy up sparse files efficiently */
+	while (len) {
+		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
+		long bytes;
+
+		if (len < this_len)
+			this_len = len;
+
+		if (signal_pending_state(TASK_KILLABLE, current)) {
+			error = -EINTR;
+			break;
+		}
+
+		bytes = do_splice_direct(old_file, &old_pos,
+					 new_file, &new_pos,
+					 this_len, SPLICE_F_MOVE);
+		if (bytes <= 0) {
+			error = bytes;
+			break;
+		}
+		WARN_ON(old_pos != new_pos);
+
+		len -= bytes;
+	}
+
+	fput(new_file);
+out_fput:
+	fput(old_file);
+	return error;
+}
+
+static char *ovl_read_symlink(struct dentry *realdentry)
+{
+	int res;
+	char *buf;
+	struct inode *inode = realdentry->d_inode;
+	mm_segment_t old_fs;
+
+	res = -EINVAL;
+	if (!inode->i_op->readlink)
+		goto err;
+
+	res = -ENOMEM;
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = inode->i_op->readlink(realdentry,
+				    (char __user *)buf, PAGE_SIZE - 1);
+	set_fs(old_fs);
+	if (res < 0) {
+		free_page((unsigned long) buf);
+		goto err;
+	}
+	buf[res] = '\0';
+
+	return buf;
+
+err:
+	return ERR_PTR(res);
+}
+
+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
+{
+	struct iattr attr = {
+		.ia_valid =
+		     ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+		.ia_atime = stat->atime,
+		.ia_mtime = stat->mtime,
+	};
+
+	return notify_change(upperdentry, &attr, NULL);
+}
+
+int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
+{
+	int err = 0;
+
+	if (!S_ISLNK(stat->mode)) {
+		struct iattr attr = {
+			.ia_valid = ATTR_MODE,
+			.ia_mode = stat->mode,
+		};
+		err = notify_change(upperdentry, &attr, NULL);
+	}
+	if (!err) {
+		struct iattr attr = {
+			.ia_valid = ATTR_UID | ATTR_GID,
+			.ia_uid = stat->uid,
+			.ia_gid = stat->gid,
+		};
+		err = notify_change(upperdentry, &attr, NULL);
+	}
+	if (!err)
+		ovl_set_timestamps(upperdentry, stat);
+
+	return err;
+}
+
+static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
+			      struct dentry *dentry, struct path *lowerpath,
+			      struct kstat *stat, struct iattr *attr,
+			      const char *link)
+{
+	struct inode *wdir = workdir->d_inode;
+	struct inode *udir = upperdir->d_inode;
+	struct dentry *newdentry = NULL;
+	struct dentry *upper = NULL;
+	umode_t mode = stat->mode;
+	int err;
+
+	newdentry = ovl_lookup_temp(workdir, dentry);
+	err = PTR_ERR(newdentry);
+	if (IS_ERR(newdentry))
+		goto out;
+
+	upper = lookup_one_len(dentry->d_name.name, upperdir,
+			       dentry->d_name.len);
+	err = PTR_ERR(upper);
+	if (IS_ERR(upper))
+		goto out1;
+
+	/* Can't properly set mode on creation because of the umask */
+	stat->mode &= S_IFMT;
+	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
+	stat->mode = mode;
+	if (err)
+		goto out2;
+
+	if (S_ISREG(stat->mode)) {
+		struct path upperpath;
+		ovl_path_upper(dentry, &upperpath);
+		BUG_ON(upperpath.dentry != NULL);
+		upperpath.dentry = newdentry;
+
+		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
+		if (err)
+			goto out_cleanup;
+	}
+
+	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
+	if (err)
+		goto out_cleanup;
+
+	mutex_lock(&newdentry->d_inode->i_mutex);
+	err = ovl_set_attr(newdentry, stat);
+	if (!err && attr)
+		err = notify_change(newdentry, attr, NULL);
+	mutex_unlock(&newdentry->d_inode->i_mutex);
+	if (err)
+		goto out_cleanup;
+
+	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+	if (err)
+		goto out_cleanup;
+
+	ovl_dentry_update(dentry, newdentry);
+	newdentry = NULL;
+
+	/*
+	 * Non-directores become opaque when copied up.
+	 */
+	if (!S_ISDIR(stat->mode))
+		ovl_dentry_set_opaque(dentry, true);
+out2:
+	dput(upper);
+out1:
+	dput(newdentry);
+out:
+	return err;
+
+out_cleanup:
+	ovl_cleanup(wdir, newdentry);
+	goto out;
+}
+
+/*
+ * Copy up a single dentry
+ *
+ * Directory renames only allowed on "pure upper" (already created on
+ * upper filesystem, never copied up).  Directories which are on lower or
+ * are merged may not be renamed.  For these -EXDEV is returned and
+ * userspace has to deal with it.  This means, when copying up a
+ * directory we can rely on it and ancestors being stable.
+ *
+ * Non-directory renames start with copy up of source if necessary.  The
+ * actual rename will only proceed once the copy up was successful.  Copy
+ * up uses upper parent i_mutex for exclusion.  Since rename can change
+ * d_parent it is possible that the copy up will lock the old parent.  At
+ * that point the file will have already been copied up anyway.
+ */
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+		    struct path *lowerpath, struct kstat *stat,
+		    struct iattr *attr)
+{
+	struct dentry *workdir = ovl_workdir(dentry);
+	int err;
+	struct kstat pstat;
+	struct path parentpath;
+	struct dentry *upperdir;
+	struct dentry *upperdentry;
+	const struct cred *old_cred;
+	struct cred *override_cred;
+	char *link = NULL;
+
+	if (WARN_ON(!workdir))
+		return -EROFS;
+
+	ovl_path_upper(parent, &parentpath);
+	upperdir = parentpath.dentry;
+
+	err = vfs_getattr(&parentpath, &pstat);
+	if (err)
+		return err;
+
+	if (S_ISLNK(stat->mode)) {
+		link = ovl_read_symlink(lowerpath->dentry);
+		if (IS_ERR(link))
+			return PTR_ERR(link);
+	}
+
+	err = -ENOMEM;
+	override_cred = prepare_creds();
+	if (!override_cred)
+		goto out_free_link;
+
+	override_cred->fsuid = stat->uid;
+	override_cred->fsgid = stat->gid;
+	/*
+	 * CAP_SYS_ADMIN for copying up extended attributes
+	 * CAP_DAC_OVERRIDE for create
+	 * CAP_FOWNER for chmod, timestamp update
+	 * CAP_FSETID for chmod
+	 * CAP_CHOWN for chown
+	 * CAP_MKNOD for mknod
+	 */
+	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+	cap_raise(override_cred->cap_effective, CAP_FOWNER);
+	cap_raise(override_cred->cap_effective, CAP_FSETID);
+	cap_raise(override_cred->cap_effective, CAP_CHOWN);
+	cap_raise(override_cred->cap_effective, CAP_MKNOD);
+	old_cred = override_creds(override_cred);
+
+	err = -EIO;
+	if (lock_rename(workdir, upperdir) != NULL) {
+		pr_err("overlayfs: failed to lock workdir+upperdir\n");
+		goto out_unlock;
+	}
+	upperdentry = ovl_dentry_upper(dentry);
+	if (upperdentry) {
+		unlock_rename(workdir, upperdir);
+		err = 0;
+		/* Raced with another copy-up?  Do the setattr here */
+		if (attr) {
+			mutex_lock(&upperdentry->d_inode->i_mutex);
+			err = notify_change(upperdentry, attr, NULL);
+			mutex_unlock(&upperdentry->d_inode->i_mutex);
+		}
+		goto out_put_cred;
+	}
+
+	err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
+				 stat, attr, link);
+	if (!err) {
+		/* Restore timestamps on parent (best effort) */
+		ovl_set_timestamps(upperdir, &pstat);
+	}
+out_unlock:
+	unlock_rename(workdir, upperdir);
+out_put_cred:
+	revert_creds(old_cred);
+	put_cred(override_cred);
+
+out_free_link:
+	if (link)
+		free_page((unsigned long) link);
+
+	return err;
+}
+
+int ovl_copy_up(struct dentry *dentry)
+{
+	int err;
+
+	err = 0;
+	while (!err) {
+		struct dentry *next;
+		struct dentry *parent;
+		struct path lowerpath;
+		struct kstat stat;
+		enum ovl_path_type type = ovl_path_type(dentry);
+
+		if (OVL_TYPE_UPPER(type))
+			break;
+
+		next = dget(dentry);
+		/* find the topmost dentry not yet copied up */
+		for (;;) {
+			parent = dget_parent(next);
+
+			type = ovl_path_type(parent);
+			if (OVL_TYPE_UPPER(type))
+				break;
+
+			dput(next);
+			next = parent;
+		}
+
+		ovl_path_lower(next, &lowerpath);
+		err = vfs_getattr(&lowerpath, &stat);
+		if (!err)
+			err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
+
+		dput(parent);
+		dput(next);
+	}
+
+	return err;
+}
diff --git a/kernel/fs/overlayfs/dir.c b/kernel/fs/overlayfs/dir.c
new file mode 100644
index 000000000..692ceda3b
--- /dev/null
+++ b/kernel/fs/overlayfs/dir.c
@@ -0,0 +1,951 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
+{
+	int err;
+
+	dget(wdentry);
+	if (d_is_dir(wdentry))
+		err = ovl_do_rmdir(wdir, wdentry);
+	else
+		err = ovl_do_unlink(wdir, wdentry);
+	dput(wdentry);
+
+	if (err) {
+		pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
+		       wdentry, err);
+	}
+}
+
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
+{
+	struct dentry *temp;
+	char name[20];
+
+	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
+
+	temp = lookup_one_len(name, workdir, strlen(name));
+	if (!IS_ERR(temp) && temp->d_inode) {
+		pr_err("overlayfs: workdir/%s already exists\n", name);
+		dput(temp);
+		temp = ERR_PTR(-EIO);
+	}
+
+	return temp;
+}
+
+/* caller holds i_mutex on workdir */
+static struct dentry *ovl_whiteout(struct dentry *workdir,
+				   struct dentry *dentry)
+{
+	int err;
+	struct dentry *whiteout;
+	struct inode *wdir = workdir->d_inode;
+
+	whiteout = ovl_lookup_temp(workdir, dentry);
+	if (IS_ERR(whiteout))
+		return whiteout;
+
+	err = ovl_do_whiteout(wdir, whiteout);
+	if (err) {
+		dput(whiteout);
+		whiteout = ERR_PTR(err);
+	}
+
+	return whiteout;
+}
+
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+		    struct kstat *stat, const char *link,
+		    struct dentry *hardlink, bool debug)
+{
+	int err;
+
+	if (newdentry->d_inode)
+		return -ESTALE;
+
+	if (hardlink) {
+		err = ovl_do_link(hardlink, dir, newdentry, debug);
+	} else {
+		switch (stat->mode & S_IFMT) {
+		case S_IFREG:
+			err = ovl_do_create(dir, newdentry, stat->mode, debug);
+			break;
+
+		case S_IFDIR:
+			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
+			break;
+
+		case S_IFCHR:
+		case S_IFBLK:
+		case S_IFIFO:
+		case S_IFSOCK:
+			err = ovl_do_mknod(dir, newdentry,
+					   stat->mode, stat->rdev, debug);
+			break;
+
+		case S_IFLNK:
+			err = ovl_do_symlink(dir, newdentry, link, debug);
+			break;
+
+		default:
+			err = -EPERM;
+		}
+	}
+	if (!err && WARN_ON(!newdentry->d_inode)) {
+		/*
+		 * Not quite sure if non-instantiated dentry is legal or not.
+		 * VFS doesn't seem to care so check and warn here.
+		 */
+		err = -ENOENT;
+	}
+	return err;
+}
+
+static int ovl_set_opaque(struct dentry *upperdentry)
+{
+	return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
+}
+
+static void ovl_remove_opaque(struct dentry *upperdentry)
+{
+	int err;
+
+	err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
+	if (err) {
+		pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
+			upperdentry->d_name.name, err);
+	}
+}
+
+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			 struct kstat *stat)
+{
+	int err;
+	enum ovl_path_type type;
+	struct path realpath;
+
+	type = ovl_path_real(dentry, &realpath);
+	err = vfs_getattr(&realpath, stat);
+	if (err)
+		return err;
+
+	stat->dev = dentry->d_sb->s_dev;
+	stat->ino = dentry->d_inode->i_ino;
+
+	/*
+	 * It's probably not worth it to count subdirs to get the
+	 * correct link count.  nlink=1 seems to pacify 'find' and
+	 * other utilities.
+	 */
+	if (OVL_TYPE_MERGE(type))
+		stat->nlink = 1;
+
+	return 0;
+}
+
+static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
+			    struct kstat *stat, const char *link,
+			    struct dentry *hardlink)
+{
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *udir = upperdir->d_inode;
+	struct dentry *newdentry;
+	int err;
+
+	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
+				   dentry->d_name.len);
+	err = PTR_ERR(newdentry);
+	if (IS_ERR(newdentry))
+		goto out_unlock;
+	err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
+	if (err)
+		goto out_dput;
+
+	ovl_dentry_version_inc(dentry->d_parent);
+	ovl_dentry_update(dentry, newdentry);
+	ovl_copyattr(newdentry->d_inode, inode);
+	d_instantiate(dentry, inode);
+	newdentry = NULL;
+out_dput:
+	dput(newdentry);
+out_unlock:
+	mutex_unlock(&udir->i_mutex);
+	return err;
+}
+
+static int ovl_lock_rename_workdir(struct dentry *workdir,
+				   struct dentry *upperdir)
+{
+	/* Workdir should not be the same as upperdir */
+	if (workdir == upperdir)
+		goto err;
+
+	/* Workdir should not be subdir of upperdir and vice versa */
+	if (lock_rename(workdir, upperdir) != NULL)
+		goto err_unlock;
+
+	return 0;
+
+err_unlock:
+	unlock_rename(workdir, upperdir);
+err:
+	pr_err("overlayfs: failed to lock workdir+upperdir\n");
+	return -EIO;
+}
+
+static struct dentry *ovl_clear_empty(struct dentry *dentry,
+				      struct list_head *list)
+{
+	struct dentry *workdir = ovl_workdir(dentry);
+	struct inode *wdir = workdir->d_inode;
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *udir = upperdir->d_inode;
+	struct path upperpath;
+	struct dentry *upper;
+	struct dentry *opaquedir;
+	struct kstat stat;
+	int err;
+
+	if (WARN_ON(!workdir))
+		return ERR_PTR(-EROFS);
+
+	err = ovl_lock_rename_workdir(workdir, upperdir);
+	if (err)
+		goto out;
+
+	ovl_path_upper(dentry, &upperpath);
+	err = vfs_getattr(&upperpath, &stat);
+	if (err)
+		goto out_unlock;
+
+	err = -ESTALE;
+	if (!S_ISDIR(stat.mode))
+		goto out_unlock;
+	upper = upperpath.dentry;
+	if (upper->d_parent->d_inode != udir)
+		goto out_unlock;
+
+	opaquedir = ovl_lookup_temp(workdir, dentry);
+	err = PTR_ERR(opaquedir);
+	if (IS_ERR(opaquedir))
+		goto out_unlock;
+
+	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
+	if (err)
+		goto out_dput;
+
+	err = ovl_copy_xattr(upper, opaquedir);
+	if (err)
+		goto out_cleanup;
+
+	err = ovl_set_opaque(opaquedir);
+	if (err)
+		goto out_cleanup;
+
+	mutex_lock(&opaquedir->d_inode->i_mutex);
+	err = ovl_set_attr(opaquedir, &stat);
+	mutex_unlock(&opaquedir->d_inode->i_mutex);
+	if (err)
+		goto out_cleanup;
+
+	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+	if (err)
+		goto out_cleanup;
+
+	ovl_cleanup_whiteouts(upper, list);
+	ovl_cleanup(wdir, upper);
+	unlock_rename(workdir, upperdir);
+
+	/* dentry's upper doesn't match now, get rid of it */
+	d_drop(dentry);
+
+	return opaquedir;
+
+out_cleanup:
+	ovl_cleanup(wdir, opaquedir);
+out_dput:
+	dput(opaquedir);
+out_unlock:
+	unlock_rename(workdir, upperdir);
+out:
+	return ERR_PTR(err);
+}
+
+static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
+{
+	int err;
+	struct dentry *ret = NULL;
+	LIST_HEAD(list);
+
+	err = ovl_check_empty_dir(dentry, &list);
+	if (err)
+		ret = ERR_PTR(err);
+	else {
+		/*
+		 * If no upperdentry then skip clearing whiteouts.
+		 *
+		 * Can race with copy-up, since we don't hold the upperdir
+		 * mutex.  Doesn't matter, since copy-up can't create a
+		 * non-empty directory from an empty one.
+		 */
+		if (ovl_dentry_upper(dentry))
+			ret = ovl_clear_empty(dentry, &list);
+	}
+
+	ovl_cache_free(&list);
+
+	return ret;
+}
+
+static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
+				    struct kstat *stat, const char *link,
+				    struct dentry *hardlink)
+{
+	struct dentry *workdir = ovl_workdir(dentry);
+	struct inode *wdir = workdir->d_inode;
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *udir = upperdir->d_inode;
+	struct dentry *upper;
+	struct dentry *newdentry;
+	int err;
+
+	if (WARN_ON(!workdir))
+		return -EROFS;
+
+	err = ovl_lock_rename_workdir(workdir, upperdir);
+	if (err)
+		goto out;
+
+	newdentry = ovl_lookup_temp(workdir, dentry);
+	err = PTR_ERR(newdentry);
+	if (IS_ERR(newdentry))
+		goto out_unlock;
+
+	upper = lookup_one_len(dentry->d_name.name, upperdir,
+			       dentry->d_name.len);
+	err = PTR_ERR(upper);
+	if (IS_ERR(upper))
+		goto out_dput;
+
+	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
+	if (err)
+		goto out_dput2;
+
+	if (S_ISDIR(stat->mode)) {
+		err = ovl_set_opaque(newdentry);
+		if (err)
+			goto out_cleanup;
+
+		err = ovl_do_rename(wdir, newdentry, udir, upper,
+				    RENAME_EXCHANGE);
+		if (err)
+			goto out_cleanup;
+
+		ovl_cleanup(wdir, upper);
+	} else {
+		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+		if (err)
+			goto out_cleanup;
+	}
+	ovl_dentry_version_inc(dentry->d_parent);
+	ovl_dentry_update(dentry, newdentry);
+	ovl_copyattr(newdentry->d_inode, inode);
+	d_instantiate(dentry, inode);
+	newdentry = NULL;
+out_dput2:
+	dput(upper);
+out_dput:
+	dput(newdentry);
+out_unlock:
+	unlock_rename(workdir, upperdir);
+out:
+	return err;
+
+out_cleanup:
+	ovl_cleanup(wdir, newdentry);
+	goto out_dput2;
+}
+
+static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
+			      const char *link, struct dentry *hardlink)
+{
+	int err;
+	struct inode *inode;
+	struct kstat stat = {
+		.mode = mode,
+		.rdev = rdev,
+	};
+
+	err = -ENOMEM;
+	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
+	if (!inode)
+		goto out;
+
+	err = ovl_copy_up(dentry->d_parent);
+	if (err)
+		goto out_iput;
+
+	if (!ovl_dentry_is_opaque(dentry)) {
+		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
+	} else {
+		const struct cred *old_cred;
+		struct cred *override_cred;
+
+		err = -ENOMEM;
+		override_cred = prepare_creds();
+		if (!override_cred)
+			goto out_iput;
+
+		/*
+		 * CAP_SYS_ADMIN for setting opaque xattr
+		 * CAP_DAC_OVERRIDE for create in workdir, rename
+		 * CAP_FOWNER for removing whiteout from sticky dir
+		 */
+		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+		cap_raise(override_cred->cap_effective, CAP_FOWNER);
+		old_cred = override_creds(override_cred);
+
+		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
+					       hardlink);
+
+		revert_creds(old_cred);
+		put_cred(override_cred);
+	}
+
+	if (!err)
+		inode = NULL;
+out_iput:
+	iput(inode);
+out:
+	return err;
+}
+
+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
+			     const char *link)
+{
+	int err;
+
+	err = ovl_want_write(dentry);
+	if (!err) {
+		err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
+		ovl_drop_write(dentry);
+	}
+
+	return err;
+}
+
+static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl)
+{
+	return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
+}
+
+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+}
+
+static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		     dev_t rdev)
+{
+	/* Don't allow creation of "whiteout" on overlay */
+	if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
+		return -EPERM;
+
+	return ovl_create_object(dentry, mode, rdev, NULL);
+}
+
+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
+		       const char *link)
+{
+	return ovl_create_object(dentry, S_IFLNK, 0, link);
+}
+
+static int ovl_link(struct dentry *old, struct inode *newdir,
+		    struct dentry *new)
+{
+	int err;
+	struct dentry *upper;
+
+	err = ovl_want_write(old);
+	if (err)
+		goto out;
+
+	err = ovl_copy_up(old);
+	if (err)
+		goto out_drop_write;
+
+	upper = ovl_dentry_upper(old);
+	err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
+
+out_drop_write:
+	ovl_drop_write(old);
+out:
+	return err;
+}
+
+static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
+{
+	struct dentry *workdir = ovl_workdir(dentry);
+	struct inode *wdir = workdir->d_inode;
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *udir = upperdir->d_inode;
+	struct dentry *whiteout;
+	struct dentry *upper;
+	struct dentry *opaquedir = NULL;
+	int err;
+
+	if (WARN_ON(!workdir))
+		return -EROFS;
+
+	if (is_dir) {
+		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
+			opaquedir = ovl_check_empty_and_clear(dentry);
+			err = PTR_ERR(opaquedir);
+			if (IS_ERR(opaquedir))
+				goto out;
+		} else {
+			LIST_HEAD(list);
+
+			/*
+			 * When removing an empty opaque directory, then it
+			 * makes no sense to replace it with an exact replica of
+			 * itself.  But emptiness still needs to be checked.
+			 */
+			err = ovl_check_empty_dir(dentry, &list);
+			ovl_cache_free(&list);
+			if (err)
+				goto out;
+		}
+	}
+
+	err = ovl_lock_rename_workdir(workdir, upperdir);
+	if (err)
+		goto out_dput;
+
+	whiteout = ovl_whiteout(workdir, dentry);
+	err = PTR_ERR(whiteout);
+	if (IS_ERR(whiteout))
+		goto out_unlock;
+
+	upper = ovl_dentry_upper(dentry);
+	if (!upper) {
+		upper = lookup_one_len(dentry->d_name.name, upperdir,
+				       dentry->d_name.len);
+		err = PTR_ERR(upper);
+		if (IS_ERR(upper))
+			goto kill_whiteout;
+
+		err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
+		dput(upper);
+		if (err)
+			goto kill_whiteout;
+	} else {
+		int flags = 0;
+
+		if (opaquedir)
+			upper = opaquedir;
+		err = -ESTALE;
+		if (upper->d_parent != upperdir)
+			goto kill_whiteout;
+
+		if (is_dir)
+			flags |= RENAME_EXCHANGE;
+
+		err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
+		if (err)
+			goto kill_whiteout;
+
+		if (is_dir)
+			ovl_cleanup(wdir, upper);
+	}
+	ovl_dentry_version_inc(dentry->d_parent);
+out_d_drop:
+	d_drop(dentry);
+	dput(whiteout);
+out_unlock:
+	unlock_rename(workdir, upperdir);
+out_dput:
+	dput(opaquedir);
+out:
+	return err;
+
+kill_whiteout:
+	ovl_cleanup(wdir, whiteout);
+	goto out_d_drop;
+}
+
+static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
+{
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *dir = upperdir->d_inode;
+	struct dentry *upper = ovl_dentry_upper(dentry);
+	int err;
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	err = -ESTALE;
+	if (upper->d_parent == upperdir) {
+		/* Don't let d_delete() think it can reset d_inode */
+		dget(upper);
+		if (is_dir)
+			err = vfs_rmdir(dir, upper);
+		else
+			err = vfs_unlink(dir, upper, NULL);
+		dput(upper);
+		ovl_dentry_version_inc(dentry->d_parent);
+	}
+
+	/*
+	 * Keeping this dentry hashed would mean having to release
+	 * upperpath/lowerpath, which could only be done if we are the
+	 * sole user of this dentry.  Too tricky...  Just unhash for
+	 * now.
+	 */
+	d_drop(dentry);
+	mutex_unlock(&dir->i_mutex);
+
+	return err;
+}
+
+static inline int ovl_check_sticky(struct dentry *dentry)
+{
+	struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
+	struct inode *inode = ovl_dentry_real(dentry)->d_inode;
+
+	if (check_sticky(dir, inode))
+		return -EPERM;
+
+	return 0;
+}
+
+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
+{
+	enum ovl_path_type type;
+	int err;
+
+	err = ovl_check_sticky(dentry);
+	if (err)
+		goto out;
+
+	err = ovl_want_write(dentry);
+	if (err)
+		goto out;
+
+	err = ovl_copy_up(dentry->d_parent);
+	if (err)
+		goto out_drop_write;
+
+	type = ovl_path_type(dentry);
+	if (OVL_TYPE_PURE_UPPER(type)) {
+		err = ovl_remove_upper(dentry, is_dir);
+	} else {
+		const struct cred *old_cred;
+		struct cred *override_cred;
+
+		err = -ENOMEM;
+		override_cred = prepare_creds();
+		if (!override_cred)
+			goto out_drop_write;
+
+		/*
+		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+		 * CAP_DAC_OVERRIDE for create in workdir, rename
+		 * CAP_FOWNER for removing whiteout from sticky dir
+		 * CAP_FSETID for chmod of opaque dir
+		 * CAP_CHOWN for chown of opaque dir
+		 */
+		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+		cap_raise(override_cred->cap_effective, CAP_FOWNER);
+		cap_raise(override_cred->cap_effective, CAP_FSETID);
+		cap_raise(override_cred->cap_effective, CAP_CHOWN);
+		old_cred = override_creds(override_cred);
+
+		err = ovl_remove_and_whiteout(dentry, is_dir);
+
+		revert_creds(old_cred);
+		put_cred(override_cred);
+	}
+out_drop_write:
+	ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
+{
+	return ovl_do_remove(dentry, false);
+}
+
+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	return ovl_do_remove(dentry, true);
+}
+
+static int ovl_rename2(struct inode *olddir, struct dentry *old,
+		       struct inode *newdir, struct dentry *new,
+		       unsigned int flags)
+{
+	int err;
+	enum ovl_path_type old_type;
+	enum ovl_path_type new_type;
+	struct dentry *old_upperdir;
+	struct dentry *new_upperdir;
+	struct dentry *olddentry;
+	struct dentry *newdentry;
+	struct dentry *trap;
+	bool old_opaque;
+	bool new_opaque;
+	bool new_create = false;
+	bool cleanup_whiteout = false;
+	bool overwrite = !(flags & RENAME_EXCHANGE);
+	bool is_dir = d_is_dir(old);
+	bool new_is_dir = false;
+	struct dentry *opaquedir = NULL;
+	const struct cred *old_cred = NULL;
+	struct cred *override_cred = NULL;
+
+	err = -EINVAL;
+	if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
+		goto out;
+
+	flags &= ~RENAME_NOREPLACE;
+
+	err = ovl_check_sticky(old);
+	if (err)
+		goto out;
+
+	/* Don't copy up directory trees */
+	old_type = ovl_path_type(old);
+	err = -EXDEV;
+	if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
+		goto out;
+
+	if (new->d_inode) {
+		err = ovl_check_sticky(new);
+		if (err)
+			goto out;
+
+		if (d_is_dir(new))
+			new_is_dir = true;
+
+		new_type = ovl_path_type(new);
+		err = -EXDEV;
+		if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
+			goto out;
+
+		err = 0;
+		if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
+			if (ovl_dentry_lower(old)->d_inode ==
+			    ovl_dentry_lower(new)->d_inode)
+				goto out;
+		}
+		if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
+			if (ovl_dentry_upper(old)->d_inode ==
+			    ovl_dentry_upper(new)->d_inode)
+				goto out;
+		}
+	} else {
+		if (ovl_dentry_is_opaque(new))
+			new_type = __OVL_PATH_UPPER;
+		else
+			new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
+	}
+
+	err = ovl_want_write(old);
+	if (err)
+		goto out;
+
+	err = ovl_copy_up(old);
+	if (err)
+		goto out_drop_write;
+
+	err = ovl_copy_up(new->d_parent);
+	if (err)
+		goto out_drop_write;
+	if (!overwrite) {
+		err = ovl_copy_up(new);
+		if (err)
+			goto out_drop_write;
+	}
+
+	old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
+	new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
+
+	if (old_opaque || new_opaque) {
+		err = -ENOMEM;
+		override_cred = prepare_creds();
+		if (!override_cred)
+			goto out_drop_write;
+
+		/*
+		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+		 * CAP_DAC_OVERRIDE for create in workdir
+		 * CAP_FOWNER for removing whiteout from sticky dir
+		 * CAP_FSETID for chmod of opaque dir
+		 * CAP_CHOWN for chown of opaque dir
+		 */
+		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+		cap_raise(override_cred->cap_effective, CAP_FOWNER);
+		cap_raise(override_cred->cap_effective, CAP_FSETID);
+		cap_raise(override_cred->cap_effective, CAP_CHOWN);
+		old_cred = override_creds(override_cred);
+	}
+
+	if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
+		opaquedir = ovl_check_empty_and_clear(new);
+		err = PTR_ERR(opaquedir);
+		if (IS_ERR(opaquedir)) {
+			opaquedir = NULL;
+			goto out_revert_creds;
+		}
+	}
+
+	if (overwrite) {
+		if (old_opaque) {
+			if (new->d_inode || !new_opaque) {
+				/* Whiteout source */
+				flags |= RENAME_WHITEOUT;
+			} else {
+				/* Switch whiteouts */
+				flags |= RENAME_EXCHANGE;
+			}
+		} else if (is_dir && !new->d_inode && new_opaque) {
+			flags |= RENAME_EXCHANGE;
+			cleanup_whiteout = true;
+		}
+	}
+
+	old_upperdir = ovl_dentry_upper(old->d_parent);
+	new_upperdir = ovl_dentry_upper(new->d_parent);
+
+	trap = lock_rename(new_upperdir, old_upperdir);
+
+	olddentry = ovl_dentry_upper(old);
+	newdentry = ovl_dentry_upper(new);
+	if (newdentry) {
+		if (opaquedir) {
+			newdentry = opaquedir;
+			opaquedir = NULL;
+		} else {
+			dget(newdentry);
+		}
+	} else {
+		new_create = true;
+		newdentry = lookup_one_len(new->d_name.name, new_upperdir,
+					   new->d_name.len);
+		err = PTR_ERR(newdentry);
+		if (IS_ERR(newdentry))
+			goto out_unlock;
+	}
+
+	err = -ESTALE;
+	if (olddentry->d_parent != old_upperdir)
+		goto out_dput;
+	if (newdentry->d_parent != new_upperdir)
+		goto out_dput;
+	if (olddentry == trap)
+		goto out_dput;
+	if (newdentry == trap)
+		goto out_dput;
+
+	if (is_dir && !old_opaque && new_opaque) {
+		err = ovl_set_opaque(olddentry);
+		if (err)
+			goto out_dput;
+	}
+	if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
+		err = ovl_set_opaque(newdentry);
+		if (err)
+			goto out_dput;
+	}
+
+	if (old_opaque || new_opaque) {
+		err = ovl_do_rename(old_upperdir->d_inode, olddentry,
+				    new_upperdir->d_inode, newdentry,
+				    flags);
+	} else {
+		/* No debug for the plain case */
+		BUG_ON(flags & ~RENAME_EXCHANGE);
+		err = vfs_rename(old_upperdir->d_inode, olddentry,
+				 new_upperdir->d_inode, newdentry,
+				 NULL, flags);
+	}
+
+	if (err) {
+		if (is_dir && !old_opaque && new_opaque)
+			ovl_remove_opaque(olddentry);
+		if (!overwrite && new_is_dir && old_opaque && !new_opaque)
+			ovl_remove_opaque(newdentry);
+		goto out_dput;
+	}
+
+	if (is_dir && old_opaque && !new_opaque)
+		ovl_remove_opaque(olddentry);
+	if (!overwrite && new_is_dir && !old_opaque && new_opaque)
+		ovl_remove_opaque(newdentry);
+
+	if (old_opaque != new_opaque) {
+		ovl_dentry_set_opaque(old, new_opaque);
+		if (!overwrite)
+			ovl_dentry_set_opaque(new, old_opaque);
+	}
+
+	if (cleanup_whiteout)
+		ovl_cleanup(old_upperdir->d_inode, newdentry);
+
+	ovl_dentry_version_inc(old->d_parent);
+	ovl_dentry_version_inc(new->d_parent);
+
+out_dput:
+	dput(newdentry);
+out_unlock:
+	unlock_rename(new_upperdir, old_upperdir);
+out_revert_creds:
+	if (old_opaque || new_opaque) {
+		revert_creds(old_cred);
+		put_cred(override_cred);
+	}
+out_drop_write:
+	ovl_drop_write(old);
+out:
+	dput(opaquedir);
+	return err;
+}
+
+const struct inode_operations ovl_dir_inode_operations = {
+	.lookup		= ovl_lookup,
+	.mkdir		= ovl_mkdir,
+	.symlink	= ovl_symlink,
+	.unlink		= ovl_unlink,
+	.rmdir		= ovl_rmdir,
+	.rename2	= ovl_rename2,
+	.link		= ovl_link,
+	.setattr	= ovl_setattr,
+	.create		= ovl_create,
+	.mknod		= ovl_mknod,
+	.permission	= ovl_permission,
+	.getattr	= ovl_dir_getattr,
+	.setxattr	= ovl_setxattr,
+	.getxattr	= ovl_getxattr,
+	.listxattr	= ovl_listxattr,
+	.removexattr	= ovl_removexattr,
+};
diff --git a/kernel/fs/overlayfs/inode.c b/kernel/fs/overlayfs/inode.c
new file mode 100644
index 000000000..04f124884
--- /dev/null
+++ b/kernel/fs/overlayfs/inode.c
@@ -0,0 +1,436 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+
+static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
+			    bool no_data)
+{
+	int err;
+	struct dentry *parent;
+	struct kstat stat;
+	struct path lowerpath;
+
+	parent = dget_parent(dentry);
+	err = ovl_copy_up(parent);
+	if (err)
+		goto out_dput_parent;
+
+	ovl_path_lower(dentry, &lowerpath);
+	err = vfs_getattr(&lowerpath, &stat);
+	if (err)
+		goto out_dput_parent;
+
+	if (no_data)
+		stat.size = 0;
+
+	err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
+
+out_dput_parent:
+	dput(parent);
+	return err;
+}
+
+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int err;
+	struct dentry *upperdentry;
+
+	err = ovl_want_write(dentry);
+	if (err)
+		goto out;
+
+	upperdentry = ovl_dentry_upper(dentry);
+	if (upperdentry) {
+		mutex_lock(&upperdentry->d_inode->i_mutex);
+		err = notify_change(upperdentry, attr, NULL);
+		mutex_unlock(&upperdentry->d_inode->i_mutex);
+	} else {
+		err = ovl_copy_up_last(dentry, attr, false);
+	}
+	ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			 struct kstat *stat)
+{
+	struct path realpath;
+
+	ovl_path_real(dentry, &realpath);
+	return vfs_getattr(&realpath, stat);
+}
+
+int ovl_permission(struct inode *inode, int mask)
+{
+	struct ovl_entry *oe;
+	struct dentry *alias = NULL;
+	struct inode *realinode;
+	struct dentry *realdentry;
+	bool is_upper;
+	int err;
+
+	if (S_ISDIR(inode->i_mode)) {
+		oe = inode->i_private;
+	} else if (mask & MAY_NOT_BLOCK) {
+		return -ECHILD;
+	} else {
+		/*
+		 * For non-directories find an alias and get the info
+		 * from there.
+		 */
+		alias = d_find_any_alias(inode);
+		if (WARN_ON(!alias))
+			return -ENOENT;
+
+		oe = alias->d_fsdata;
+	}
+
+	realdentry = ovl_entry_real(oe, &is_upper);
+
+	/* Careful in RCU walk mode */
+	realinode = ACCESS_ONCE(realdentry->d_inode);
+	if (!realinode) {
+		WARN_ON(!(mask & MAY_NOT_BLOCK));
+		err = -ENOENT;
+		goto out_dput;
+	}
+
+	if (mask & MAY_WRITE) {
+		umode_t mode = realinode->i_mode;
+
+		/*
+		 * Writes will always be redirected to upper layer, so
+		 * ignore lower layer being read-only.
+		 *
+		 * If the overlay itself is read-only then proceed
+		 * with the permission check, don't return EROFS.
+		 * This will only happen if this is the lower layer of
+		 * another overlayfs.
+		 *
+		 * If upper fs becomes read-only after the overlay was
+		 * constructed return EROFS to prevent modification of
+		 * upper layer.
+		 */
+		err = -EROFS;
+		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
+		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+			goto out_dput;
+	}
+
+	err = __inode_permission(realinode, mask);
+out_dput:
+	dput(alias);
+	return err;
+}
+
+
+struct ovl_link_data {
+	struct dentry *realdentry;
+	void *cookie;
+};
+
+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	void *ret;
+	struct dentry *realdentry;
+	struct inode *realinode;
+
+	realdentry = ovl_dentry_real(dentry);
+	realinode = realdentry->d_inode;
+
+	if (WARN_ON(!realinode->i_op->follow_link))
+		return ERR_PTR(-EPERM);
+
+	ret = realinode->i_op->follow_link(realdentry, nd);
+	if (IS_ERR(ret))
+		return ret;
+
+	if (realinode->i_op->put_link) {
+		struct ovl_link_data *data;
+
+		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
+		if (!data) {
+			realinode->i_op->put_link(realdentry, nd, ret);
+			return ERR_PTR(-ENOMEM);
+		}
+		data->realdentry = realdentry;
+		data->cookie = ret;
+
+		return data;
+	} else {
+		return NULL;
+	}
+}
+
+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+	struct inode *realinode;
+	struct ovl_link_data *data = c;
+
+	if (!data)
+		return;
+
+	realinode = data->realdentry->d_inode;
+	realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+	kfree(data);
+}
+
+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+	struct path realpath;
+	struct inode *realinode;
+
+	ovl_path_real(dentry, &realpath);
+	realinode = realpath.dentry->d_inode;
+
+	if (!realinode->i_op->readlink)
+		return -EINVAL;
+
+	touch_atime(&realpath);
+
+	return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+}
+
+
+static bool ovl_is_private_xattr(const char *name)
+{
+	return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
+}
+
+int ovl_setxattr(struct dentry *dentry, const char *name,
+		 const void *value, size_t size, int flags)
+{
+	int err;
+	struct dentry *upperdentry;
+
+	err = ovl_want_write(dentry);
+	if (err)
+		goto out;
+
+	err = -EPERM;
+	if (ovl_is_private_xattr(name))
+		goto out_drop_write;
+
+	err = ovl_copy_up(dentry);
+	if (err)
+		goto out_drop_write;
+
+	upperdentry = ovl_dentry_upper(dentry);
+	err = vfs_setxattr(upperdentry, name, value, size, flags);
+
+out_drop_write:
+	ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static bool ovl_need_xattr_filter(struct dentry *dentry,
+				  enum ovl_path_type type)
+{
+	if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
+		return S_ISDIR(dentry->d_inode->i_mode);
+	else
+		return false;
+}
+
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+		     void *value, size_t size)
+{
+	struct path realpath;
+	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+
+	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+		return -ENODATA;
+
+	return vfs_getxattr(realpath.dentry, name, value, size);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	struct path realpath;
+	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+	ssize_t res;
+	int off;
+
+	res = vfs_listxattr(realpath.dentry, list, size);
+	if (res <= 0 || size == 0)
+		return res;
+
+	if (!ovl_need_xattr_filter(dentry, type))
+		return res;
+
+	/* filter out private xattrs */
+	for (off = 0; off < res;) {
+		char *s = list + off;
+		size_t slen = strlen(s) + 1;
+
+		BUG_ON(off + slen > res);
+
+		if (ovl_is_private_xattr(s)) {
+			res -= slen;
+			memmove(s, s + slen, res - off);
+		} else {
+			off += slen;
+		}
+	}
+
+	return res;
+}
+
+int ovl_removexattr(struct dentry *dentry, const char *name)
+{
+	int err;
+	struct path realpath;
+	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+
+	err = ovl_want_write(dentry);
+	if (err)
+		goto out;
+
+	err = -ENODATA;
+	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+		goto out_drop_write;
+
+	if (!OVL_TYPE_UPPER(type)) {
+		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
+		if (err < 0)
+			goto out_drop_write;
+
+		err = ovl_copy_up(dentry);
+		if (err)
+			goto out_drop_write;
+
+		ovl_path_upper(dentry, &realpath);
+	}
+
+	err = vfs_removexattr(realpath.dentry, name);
+out_drop_write:
+	ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
+				  struct dentry *realdentry)
+{
+	if (OVL_TYPE_UPPER(type))
+		return false;
+
+	if (special_file(realdentry->d_inode->i_mode))
+		return false;
+
+	if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
+		return false;
+
+	return true;
+}
+
+static int ovl_dentry_open(struct dentry *dentry, struct file *file,
+		    const struct cred *cred)
+{
+	int err;
+	struct path realpath;
+	enum ovl_path_type type;
+	bool want_write = false;
+
+	type = ovl_path_real(dentry, &realpath);
+	if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
+		want_write = true;
+		err = ovl_want_write(dentry);
+		if (err)
+			goto out;
+
+		if (file->f_flags & O_TRUNC)
+			err = ovl_copy_up_last(dentry, NULL, true);
+		else
+			err = ovl_copy_up(dentry);
+		if (err)
+			goto out_drop_write;
+
+		ovl_path_upper(dentry, &realpath);
+	}
+
+	err = vfs_open(&realpath, file, cred);
+out_drop_write:
+	if (want_write)
+		ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static const struct inode_operations ovl_file_inode_operations = {
+	.setattr	= ovl_setattr,
+	.permission	= ovl_permission,
+	.getattr	= ovl_getattr,
+	.setxattr	= ovl_setxattr,
+	.getxattr	= ovl_getxattr,
+	.listxattr	= ovl_listxattr,
+	.removexattr	= ovl_removexattr,
+	.dentry_open	= ovl_dentry_open,
+};
+
+static const struct inode_operations ovl_symlink_inode_operations = {
+	.setattr	= ovl_setattr,
+	.follow_link	= ovl_follow_link,
+	.put_link	= ovl_put_link,
+	.readlink	= ovl_readlink,
+	.getattr	= ovl_getattr,
+	.setxattr	= ovl_setxattr,
+	.getxattr	= ovl_getxattr,
+	.listxattr	= ovl_listxattr,
+	.removexattr	= ovl_removexattr,
+};
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+			    struct ovl_entry *oe)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+
+	mode &= S_IFMT;
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+	switch (mode) {
+	case S_IFDIR:
+		inode->i_private = oe;
+		inode->i_op = &ovl_dir_inode_operations;
+		inode->i_fop = &ovl_dir_operations;
+		break;
+
+	case S_IFLNK:
+		inode->i_op = &ovl_symlink_inode_operations;
+		break;
+
+	case S_IFREG:
+	case S_IFSOCK:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFIFO:
+		inode->i_op = &ovl_file_inode_operations;
+		break;
+
+	default:
+		WARN(1, "illegal file type: %i\n", mode);
+		iput(inode);
+		inode = NULL;
+	}
+
+	return inode;
+}
diff --git a/kernel/fs/overlayfs/overlayfs.h b/kernel/fs/overlayfs/overlayfs.h
new file mode 100644
index 000000000..17ac5afc9
--- /dev/null
+++ b/kernel/fs/overlayfs/overlayfs.h
@@ -0,0 +1,199 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+
+struct ovl_entry;
+
+enum ovl_path_type {
+	__OVL_PATH_PURE		= (1 << 0),
+	__OVL_PATH_UPPER	= (1 << 1),
+	__OVL_PATH_MERGE	= (1 << 2),
+};
+
+#define OVL_TYPE_UPPER(type)	((type) & __OVL_PATH_UPPER)
+#define OVL_TYPE_MERGE(type)	((type) & __OVL_PATH_MERGE)
+#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
+#define OVL_TYPE_MERGE_OR_LOWER(type) \
+	(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
+
+#define OVL_XATTR_PRE_NAME "trusted.overlay."
+#define OVL_XATTR_PRE_LEN  16
+#define OVL_XATTR_OPAQUE   OVL_XATTR_PRE_NAME"opaque"
+
+static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int err = vfs_rmdir(dir, dentry);
+	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
+	return err;
+}
+
+static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int err = vfs_unlink(dir, dentry, NULL);
+	pr_debug("unlink(%pd2) = %i\n", dentry, err);
+	return err;
+}
+
+static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
+			      struct dentry *new_dentry, bool debug)
+{
+	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+	if (debug) {
+		pr_debug("link(%pd2, %pd2) = %i\n",
+			 old_dentry, new_dentry, err);
+	}
+	return err;
+}
+
+static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
+			     umode_t mode, bool debug)
+{
+	int err = vfs_create(dir, dentry, mode, true);
+	if (debug)
+		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
+	return err;
+}
+
+static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
+			       umode_t mode, bool debug)
+{
+	int err = vfs_mkdir(dir, dentry, mode);
+	if (debug)
+		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
+	return err;
+}
+
+static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
+			       umode_t mode, dev_t dev, bool debug)
+{
+	int err = vfs_mknod(dir, dentry, mode, dev);
+	if (debug) {
+		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
+			 dentry, mode, dev, err);
+	}
+	return err;
+}
+
+static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
+				 const char *oldname, bool debug)
+{
+	int err = vfs_symlink(dir, dentry, oldname);
+	if (debug)
+		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
+	return err;
+}
+
+static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
+				  const void *value, size_t size, int flags)
+{
+	int err = vfs_setxattr(dentry, name, value, size, flags);
+	pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
+		 dentry, name, (int) size, (char *) value, flags, err);
+	return err;
+}
+
+static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
+{
+	int err = vfs_removexattr(dentry, name);
+	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
+	return err;
+}
+
+static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
+				struct inode *newdir, struct dentry *newdentry,
+				unsigned int flags)
+{
+	int err;
+
+	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
+		 olddentry, newdentry, flags);
+
+	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+
+	if (err) {
+		pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
+			 olddentry, newdentry, err);
+	}
+	return err;
+}
+
+static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
+{
+	int err = vfs_whiteout(dir, dentry);
+	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
+	return err;
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry);
+u64 ovl_dentry_version_get(struct dentry *dentry);
+void ovl_dentry_version_inc(struct dentry *dentry);
+void ovl_path_upper(struct dentry *dentry, struct path *path);
+void ovl_path_lower(struct dentry *dentry, struct path *path);
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+struct dentry *ovl_dentry_upper(struct dentry *dentry);
+struct dentry *ovl_dentry_lower(struct dentry *dentry);
+struct dentry *ovl_dentry_real(struct dentry *dentry);
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
+struct dentry *ovl_workdir(struct dentry *dentry);
+int ovl_want_write(struct dentry *dentry);
+void ovl_drop_write(struct dentry *dentry);
+bool ovl_dentry_is_opaque(struct dentry *dentry);
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
+bool ovl_is_whiteout(struct dentry *dentry);
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+			  unsigned int flags);
+struct file *ovl_path_open(struct path *path, int flags);
+
+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
+				struct kstat *stat, const char *link);
+
+/* readdir.c */
+extern const struct file_operations ovl_dir_operations;
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
+void ovl_cache_free(struct list_head *list);
+
+/* inode.c */
+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
+int ovl_permission(struct inode *inode, int mask);
+int ovl_setxattr(struct dentry *dentry, const char *name,
+		 const void *value, size_t size, int flags);
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+		     void *value, size_t size);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
+int ovl_removexattr(struct dentry *dentry, const char *name);
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+			    struct ovl_entry *oe);
+static inline void ovl_copyattr(struct inode *from, struct inode *to)
+{
+	to->i_uid = from->i_uid;
+	to->i_gid = from->i_gid;
+}
+
+/* dir.c */
+extern const struct inode_operations ovl_dir_inode_operations;
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+		    struct kstat *stat, const char *link,
+		    struct dentry *hardlink, bool debug);
+void ovl_cleanup(struct inode *dir, struct dentry *dentry);
+
+/* copy_up.c */
+int ovl_copy_up(struct dentry *dentry);
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+		    struct path *lowerpath, struct kstat *stat,
+		    struct iattr *attr);
+int ovl_copy_xattr(struct dentry *old, struct dentry *new);
+int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/kernel/fs/overlayfs/readdir.c b/kernel/fs/overlayfs/readdir.c
new file mode 100644
index 000000000..907870e81
--- /dev/null
+++ b/kernel/fs/overlayfs/readdir.c
@@ -0,0 +1,557 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/xattr.h>
+#include <linux/rbtree.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+struct ovl_cache_entry {
+	unsigned int len;
+	unsigned int type;
+	u64 ino;
+	struct list_head l_node;
+	struct rb_node node;
+	bool is_whiteout;
+	char name[];
+};
+
+struct ovl_dir_cache {
+	long refcount;
+	u64 version;
+	struct list_head entries;
+};
+
+struct ovl_readdir_data {
+	struct dir_context ctx;
+	bool is_merge;
+	struct rb_root root;
+	struct list_head *list;
+	struct list_head middle;
+	struct dentry *dir;
+	int count;
+	int err;
+};
+
+struct ovl_dir_file {
+	bool is_real;
+	bool is_upper;
+	struct ovl_dir_cache *cache;
+	struct list_head *cursor;
+	struct file *realfile;
+	struct file *upperfile;
+};
+
+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
+{
+	return container_of(n, struct ovl_cache_entry, node);
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
+						    const char *name, int len)
+{
+	struct rb_node *node = root->rb_node;
+	int cmp;
+
+	while (node) {
+		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
+
+		cmp = strncmp(name, p->name, len);
+		if (cmp > 0)
+			node = p->node.rb_right;
+		else if (cmp < 0 || len < p->len)
+			node = p->node.rb_left;
+		else
+			return p;
+	}
+
+	return NULL;
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
+						   const char *name, int len,
+						   u64 ino, unsigned int d_type)
+{
+	struct ovl_cache_entry *p;
+	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
+
+	p = kmalloc(size, GFP_KERNEL);
+	if (!p)
+		return NULL;
+
+	memcpy(p->name, name, len);
+	p->name[len] = '\0';
+	p->len = len;
+	p->type = d_type;
+	p->ino = ino;
+	p->is_whiteout = false;
+
+	if (d_type == DT_CHR) {
+		struct dentry *dentry;
+		const struct cred *old_cred;
+		struct cred *override_cred;
+
+		override_cred = prepare_creds();
+		if (!override_cred) {
+			kfree(p);
+			return NULL;
+		}
+
+		/*
+		 * CAP_DAC_OVERRIDE for lookup
+		 */
+		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+		old_cred = override_creds(override_cred);
+
+		dentry = lookup_one_len(name, dir, len);
+		if (!IS_ERR(dentry)) {
+			p->is_whiteout = ovl_is_whiteout(dentry);
+			dput(dentry);
+		}
+		revert_creds(old_cred);
+		put_cred(override_cred);
+	}
+	return p;
+}
+
+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+				  const char *name, int len, u64 ino,
+				  unsigned int d_type)
+{
+	struct rb_node **newp = &rdd->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct ovl_cache_entry *p;
+
+	while (*newp) {
+		int cmp;
+		struct ovl_cache_entry *tmp;
+
+		parent = *newp;
+		tmp = ovl_cache_entry_from_node(*newp);
+		cmp = strncmp(name, tmp->name, len);
+		if (cmp > 0)
+			newp = &tmp->node.rb_right;
+		else if (cmp < 0 || len < tmp->len)
+			newp = &tmp->node.rb_left;
+		else
+			return 0;
+	}
+
+	p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
+	if (p == NULL)
+		return -ENOMEM;
+
+	list_add_tail(&p->l_node, rdd->list);
+	rb_link_node(&p->node, parent, newp);
+	rb_insert_color(&p->node, &rdd->root);
+
+	return 0;
+}
+
+static int ovl_fill_lower(struct ovl_readdir_data *rdd,
+			  const char *name, int namelen,
+			  loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct ovl_cache_entry *p;
+
+	p = ovl_cache_entry_find(&rdd->root, name, namelen);
+	if (p) {
+		list_move_tail(&p->l_node, &rdd->middle);
+	} else {
+		p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
+		if (p == NULL)
+			rdd->err = -ENOMEM;
+		else
+			list_add_tail(&p->l_node, &rdd->middle);
+	}
+
+	return rdd->err;
+}
+
+void ovl_cache_free(struct list_head *list)
+{
+	struct ovl_cache_entry *p;
+	struct ovl_cache_entry *n;
+
+	list_for_each_entry_safe(p, n, list, l_node)
+		kfree(p);
+
+	INIT_LIST_HEAD(list);
+}
+
+static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+{
+	struct ovl_dir_cache *cache = od->cache;
+
+	WARN_ON(cache->refcount <= 0);
+	cache->refcount--;
+	if (!cache->refcount) {
+		if (ovl_dir_cache(dentry) == cache)
+			ovl_set_dir_cache(dentry, NULL);
+
+		ovl_cache_free(&cache->entries);
+		kfree(cache);
+	}
+}
+
+static int ovl_fill_merge(struct dir_context *ctx, const char *name,
+			  int namelen, loff_t offset, u64 ino,
+			  unsigned int d_type)
+{
+	struct ovl_readdir_data *rdd =
+		container_of(ctx, struct ovl_readdir_data, ctx);
+
+	rdd->count++;
+	if (!rdd->is_merge)
+		return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
+	else
+		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
+}
+
+static inline int ovl_dir_read(struct path *realpath,
+			       struct ovl_readdir_data *rdd)
+{
+	struct file *realfile;
+	int err;
+
+	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
+	if (IS_ERR(realfile))
+		return PTR_ERR(realfile);
+
+	rdd->dir = realpath->dentry;
+	rdd->ctx.pos = 0;
+	do {
+		rdd->count = 0;
+		rdd->err = 0;
+		err = iterate_dir(realfile, &rdd->ctx);
+		if (err >= 0)
+			err = rdd->err;
+	} while (!err && rdd->count);
+	fput(realfile);
+
+	return err;
+}
+
+static void ovl_dir_reset(struct file *file)
+{
+	struct ovl_dir_file *od = file->private_data;
+	struct ovl_dir_cache *cache = od->cache;
+	struct dentry *dentry = file->f_path.dentry;
+	enum ovl_path_type type = ovl_path_type(dentry);
+
+	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
+		ovl_cache_put(od, dentry);
+		od->cache = NULL;
+		od->cursor = NULL;
+	}
+	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
+	if (od->is_real && OVL_TYPE_MERGE(type))
+		od->is_real = false;
+}
+
+static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
+{
+	int err;
+	struct path realpath;
+	struct ovl_readdir_data rdd = {
+		.ctx.actor = ovl_fill_merge,
+		.list = list,
+		.root = RB_ROOT,
+		.is_merge = false,
+	};
+	int idx, next;
+
+	for (idx = 0; idx != -1; idx = next) {
+		next = ovl_path_next(idx, dentry, &realpath);
+
+		if (next != -1) {
+			err = ovl_dir_read(&realpath, &rdd);
+			if (err)
+				break;
+		} else {
+			/*
+			 * Insert lowest layer entries before upper ones, this
+			 * allows offsets to be reasonably constant
+			 */
+			list_add(&rdd.middle, rdd.list);
+			rdd.is_merge = true;
+			err = ovl_dir_read(&realpath, &rdd);
+			list_del(&rdd.middle);
+		}
+	}
+	return err;
+}
+
+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
+{
+	struct list_head *p;
+	loff_t off = 0;
+
+	list_for_each(p, &od->cache->entries) {
+		if (off >= pos)
+			break;
+		off++;
+	}
+	/* Cursor is safe since the cache is stable */
+	od->cursor = p;
+}
+
+static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
+{
+	int res;
+	struct ovl_dir_cache *cache;
+
+	cache = ovl_dir_cache(dentry);
+	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+		cache->refcount++;
+		return cache;
+	}
+	ovl_set_dir_cache(dentry, NULL);
+
+	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
+	if (!cache)
+		return ERR_PTR(-ENOMEM);
+
+	cache->refcount = 1;
+	INIT_LIST_HEAD(&cache->entries);
+
+	res = ovl_dir_read_merged(dentry, &cache->entries);
+	if (res) {
+		ovl_cache_free(&cache->entries);
+		kfree(cache);
+		return ERR_PTR(res);
+	}
+
+	cache->version = ovl_dentry_version_get(dentry);
+	ovl_set_dir_cache(dentry, cache);
+
+	return cache;
+}
+
+static int ovl_iterate(struct file *file, struct dir_context *ctx)
+{
+	struct ovl_dir_file *od = file->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct ovl_cache_entry *p;
+
+	if (!ctx->pos)
+		ovl_dir_reset(file);
+
+	if (od->is_real)
+		return iterate_dir(od->realfile, ctx);
+
+	if (!od->cache) {
+		struct ovl_dir_cache *cache;
+
+		cache = ovl_cache_get(dentry);
+		if (IS_ERR(cache))
+			return PTR_ERR(cache);
+
+		od->cache = cache;
+		ovl_seek_cursor(od, ctx->pos);
+	}
+
+	while (od->cursor != &od->cache->entries) {
+		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
+		if (!p->is_whiteout)
+			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
+				break;
+		od->cursor = p->l_node.next;
+		ctx->pos++;
+	}
+	return 0;
+}
+
+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t res;
+	struct ovl_dir_file *od = file->private_data;
+
+	mutex_lock(&file_inode(file)->i_mutex);
+	if (!file->f_pos)
+		ovl_dir_reset(file);
+
+	if (od->is_real) {
+		res = vfs_llseek(od->realfile, offset, origin);
+		file->f_pos = od->realfile->f_pos;
+	} else {
+		res = -EINVAL;
+
+		switch (origin) {
+		case SEEK_CUR:
+			offset += file->f_pos;
+			break;
+		case SEEK_SET:
+			break;
+		default:
+			goto out_unlock;
+		}
+		if (offset < 0)
+			goto out_unlock;
+
+		if (offset != file->f_pos) {
+			file->f_pos = offset;
+			if (od->cache)
+				ovl_seek_cursor(od, offset);
+		}
+		res = offset;
+	}
+out_unlock:
+	mutex_unlock(&file_inode(file)->i_mutex);
+
+	return res;
+}
+
+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
+			 int datasync)
+{
+	struct ovl_dir_file *od = file->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct file *realfile = od->realfile;
+
+	/*
+	 * Need to check if we started out being a lower dir, but got copied up
+	 */
+	if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
+		struct inode *inode = file_inode(file);
+
+		realfile = lockless_dereference(od->upperfile);
+		if (!realfile) {
+			struct path upperpath;
+
+			ovl_path_upper(dentry, &upperpath);
+			realfile = ovl_path_open(&upperpath, O_RDONLY);
+			smp_mb__before_spinlock();
+			mutex_lock(&inode->i_mutex);
+			if (!od->upperfile) {
+				if (IS_ERR(realfile)) {
+					mutex_unlock(&inode->i_mutex);
+					return PTR_ERR(realfile);
+				}
+				od->upperfile = realfile;
+			} else {
+				/* somebody has beaten us to it */
+				if (!IS_ERR(realfile))
+					fput(realfile);
+				realfile = od->upperfile;
+			}
+			mutex_unlock(&inode->i_mutex);
+		}
+	}
+
+	return vfs_fsync_range(realfile, start, end, datasync);
+}
+
+static int ovl_dir_release(struct inode *inode, struct file *file)
+{
+	struct ovl_dir_file *od = file->private_data;
+
+	if (od->cache) {
+		mutex_lock(&inode->i_mutex);
+		ovl_cache_put(od, file->f_path.dentry);
+		mutex_unlock(&inode->i_mutex);
+	}
+	fput(od->realfile);
+	if (od->upperfile)
+		fput(od->upperfile);
+	kfree(od);
+
+	return 0;
+}
+
+static int ovl_dir_open(struct inode *inode, struct file *file)
+{
+	struct path realpath;
+	struct file *realfile;
+	struct ovl_dir_file *od;
+	enum ovl_path_type type;
+
+	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
+	if (!od)
+		return -ENOMEM;
+
+	type = ovl_path_real(file->f_path.dentry, &realpath);
+	realfile = ovl_path_open(&realpath, file->f_flags);
+	if (IS_ERR(realfile)) {
+		kfree(od);
+		return PTR_ERR(realfile);
+	}
+	od->realfile = realfile;
+	od->is_real = !OVL_TYPE_MERGE(type);
+	od->is_upper = OVL_TYPE_UPPER(type);
+	file->private_data = od;
+
+	return 0;
+}
+
+const struct file_operations ovl_dir_operations = {
+	.read		= generic_read_dir,
+	.open		= ovl_dir_open,
+	.iterate	= ovl_iterate,
+	.llseek		= ovl_dir_llseek,
+	.fsync		= ovl_dir_fsync,
+	.release	= ovl_dir_release,
+};
+
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
+{
+	int err;
+	struct ovl_cache_entry *p;
+
+	err = ovl_dir_read_merged(dentry, list);
+	if (err)
+		return err;
+
+	err = 0;
+
+	list_for_each_entry(p, list, l_node) {
+		if (p->is_whiteout)
+			continue;
+
+		if (p->name[0] == '.') {
+			if (p->len == 1)
+				continue;
+			if (p->len == 2 && p->name[1] == '.')
+				continue;
+		}
+		err = -ENOTEMPTY;
+		break;
+	}
+
+	return err;
+}
+
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
+{
+	struct ovl_cache_entry *p;
+
+	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
+	list_for_each_entry(p, list, l_node) {
+		struct dentry *dentry;
+
+		if (!p->is_whiteout)
+			continue;
+
+		dentry = lookup_one_len(p->name, upper, p->len);
+		if (IS_ERR(dentry)) {
+			pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
+			       upper->d_name.name, p->len, p->name,
+			       (int) PTR_ERR(dentry));
+			continue;
+		}
+		ovl_cleanup(upper->d_inode, dentry);
+		dput(dentry);
+	}
+	mutex_unlock(&upper->d_inode->i_mutex);
+}
diff --git a/kernel/fs/overlayfs/super.c b/kernel/fs/overlayfs/super.c
new file mode 100644
index 000000000..bf8537c7f
--- /dev/null
+++ b/kernel/fs/overlayfs/super.c
@@ -0,0 +1,1046 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include "overlayfs.h"
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Overlay filesystem");
+MODULE_LICENSE("GPL");
+
+#define OVERLAYFS_SUPER_MAGIC 0x794c7630
+
+struct ovl_config {
+	char *lowerdir;
+	char *upperdir;
+	char *workdir;
+};
+
+/* private information held for overlayfs's superblock */
+struct ovl_fs {
+	struct vfsmount *upper_mnt;
+	unsigned numlower;
+	struct vfsmount **lower_mnt;
+	struct dentry *workdir;
+	long lower_namelen;
+	/* pathnames of lower and upper dirs, for show_options */
+	struct ovl_config config;
+};
+
+struct ovl_dir_cache;
+
+/* private information held for every overlayfs dentry */
+struct ovl_entry {
+	struct dentry *__upperdentry;
+	struct ovl_dir_cache *cache;
+	union {
+		struct {
+			u64 version;
+			bool opaque;
+		};
+		struct rcu_head rcu;
+	};
+	unsigned numlower;
+	struct path lowerstack[];
+};
+
+#define OVL_MAX_STACK 500
+
+static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe)
+{
+	return oe->numlower ? oe->lowerstack[0].dentry : NULL;
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	enum ovl_path_type type = 0;
+
+	if (oe->__upperdentry) {
+		type = __OVL_PATH_UPPER;
+
+		if (oe->numlower) {
+			if (S_ISDIR(dentry->d_inode->i_mode))
+				type |= __OVL_PATH_MERGE;
+		} else if (!oe->opaque) {
+			type |= __OVL_PATH_PURE;
+		}
+	} else {
+		if (oe->numlower > 1)
+			type |= __OVL_PATH_MERGE;
+	}
+	return type;
+}
+
+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
+{
+	return lockless_dereference(oe->__upperdentry);
+}
+
+void ovl_path_upper(struct dentry *dentry, struct path *path)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	path->mnt = ofs->upper_mnt;
+	path->dentry = ovl_upperdentry_dereference(oe);
+}
+
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
+{
+	enum ovl_path_type type = ovl_path_type(dentry);
+
+	if (!OVL_TYPE_UPPER(type))
+		ovl_path_lower(dentry, path);
+	else
+		ovl_path_upper(dentry, path);
+
+	return type;
+}
+
+struct dentry *ovl_dentry_upper(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	return ovl_upperdentry_dereference(oe);
+}
+
+struct dentry *ovl_dentry_lower(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	return __ovl_dentry_lower(oe);
+}
+
+struct dentry *ovl_dentry_real(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	struct dentry *realdentry;
+
+	realdentry = ovl_upperdentry_dereference(oe);
+	if (!realdentry)
+		realdentry = __ovl_dentry_lower(oe);
+
+	return realdentry;
+}
+
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
+{
+	struct dentry *realdentry;
+
+	realdentry = ovl_upperdentry_dereference(oe);
+	if (realdentry) {
+		*is_upper = true;
+	} else {
+		realdentry = __ovl_dentry_lower(oe);
+		*is_upper = false;
+	}
+	return realdentry;
+}
+
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	return oe->cache;
+}
+
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	oe->cache = cache;
+}
+
+void ovl_path_lower(struct dentry *dentry, struct path *path)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	*path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
+}
+
+int ovl_want_write(struct dentry *dentry)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	return mnt_want_write(ofs->upper_mnt);
+}
+
+void ovl_drop_write(struct dentry *dentry)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	mnt_drop_write(ofs->upper_mnt);
+}
+
+struct dentry *ovl_workdir(struct dentry *dentry)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	return ofs->workdir;
+}
+
+bool ovl_dentry_is_opaque(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	return oe->opaque;
+}
+
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	oe->opaque = opaque;
+}
+
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+	WARN_ON(oe->__upperdentry);
+	BUG_ON(!upperdentry->d_inode);
+	/*
+	 * Make sure upperdentry is consistent before making it visible to
+	 * ovl_upperdentry_dereference().
+	 */
+	smp_wmb();
+	oe->__upperdentry = upperdentry;
+}
+
+void ovl_dentry_version_inc(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+	oe->version++;
+}
+
+u64 ovl_dentry_version_get(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+	return oe->version;
+}
+
+bool ovl_is_whiteout(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	return inode && IS_WHITEOUT(inode);
+}
+
+static bool ovl_is_opaquedir(struct dentry *dentry)
+{
+	int res;
+	char val;
+	struct inode *inode = dentry->d_inode;
+
+	if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
+		return false;
+
+	res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
+	if (res == 1 && val == 'y')
+		return true;
+
+	return false;
+}
+
+static void ovl_dentry_release(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	if (oe) {
+		unsigned int i;
+
+		dput(oe->__upperdentry);
+		for (i = 0; i < oe->numlower; i++)
+			dput(oe->lowerstack[i].dentry);
+		kfree_rcu(oe, rcu);
+	}
+}
+
+static const struct dentry_operations ovl_dentry_operations = {
+	.d_release = ovl_dentry_release,
+};
+
+static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
+{
+	size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
+	struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
+
+	if (oe)
+		oe->numlower = numlower;
+
+	return oe;
+}
+
+static inline struct dentry *ovl_lookup_real(struct dentry *dir,
+					     struct qstr *name)
+{
+	struct dentry *dentry;
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	dentry = lookup_one_len(name->name, dir, name->len);
+	mutex_unlock(&dir->d_inode->i_mutex);
+
+	if (IS_ERR(dentry)) {
+		if (PTR_ERR(dentry) == -ENOENT)
+			dentry = NULL;
+	} else if (!dentry->d_inode) {
+		dput(dentry);
+		dentry = NULL;
+	}
+	return dentry;
+}
+
+/*
+ * Returns next layer in stack starting from top.
+ * Returns -1 if this is the last layer.
+ */
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	BUG_ON(idx < 0);
+	if (idx == 0) {
+		ovl_path_upper(dentry, path);
+		if (path->dentry)
+			return oe->numlower ? 1 : -1;
+		idx++;
+	}
+	BUG_ON(idx > oe->numlower);
+	*path = oe->lowerstack[idx - 1];
+
+	return (idx < oe->numlower) ? idx + 1 : -1;
+}
+
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+			  unsigned int flags)
+{
+	struct ovl_entry *oe;
+	struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+	struct path *stack = NULL;
+	struct dentry *upperdir, *upperdentry = NULL;
+	unsigned int ctr = 0;
+	struct inode *inode = NULL;
+	bool upperopaque = false;
+	struct dentry *this, *prev = NULL;
+	unsigned int i;
+	int err;
+
+	upperdir = ovl_upperdentry_dereference(poe);
+	if (upperdir) {
+		this = ovl_lookup_real(upperdir, &dentry->d_name);
+		err = PTR_ERR(this);
+		if (IS_ERR(this))
+			goto out;
+
+		if (this) {
+			if (ovl_is_whiteout(this)) {
+				dput(this);
+				this = NULL;
+				upperopaque = true;
+			} else if (poe->numlower && ovl_is_opaquedir(this)) {
+				upperopaque = true;
+			}
+		}
+		upperdentry = prev = this;
+	}
+
+	if (!upperopaque && poe->numlower) {
+		err = -ENOMEM;
+		stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL);
+		if (!stack)
+			goto out_put_upper;
+	}
+
+	for (i = 0; !upperopaque && i < poe->numlower; i++) {
+		bool opaque = false;
+		struct path lowerpath = poe->lowerstack[i];
+
+		this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
+		err = PTR_ERR(this);
+		if (IS_ERR(this)) {
+			/*
+			 * If it's positive, then treat ENAMETOOLONG as ENOENT.
+			 */
+			if (err == -ENAMETOOLONG && (upperdentry || ctr))
+				continue;
+			goto out_put;
+		}
+		if (!this)
+			continue;
+		if (ovl_is_whiteout(this)) {
+			dput(this);
+			break;
+		}
+		/*
+		 * Only makes sense to check opaque dir if this is not the
+		 * lowermost layer.
+		 */
+		if (i < poe->numlower - 1 && ovl_is_opaquedir(this))
+			opaque = true;
+
+		if (prev && (!S_ISDIR(prev->d_inode->i_mode) ||
+			     !S_ISDIR(this->d_inode->i_mode))) {
+			/*
+			 * FIXME: check for upper-opaqueness maybe better done
+			 * in remove code.
+			 */
+			if (prev == upperdentry)
+				upperopaque = true;
+			dput(this);
+			break;
+		}
+		/*
+		 * If this is a non-directory then stop here.
+		 */
+		if (!S_ISDIR(this->d_inode->i_mode))
+			opaque = true;
+
+		stack[ctr].dentry = this;
+		stack[ctr].mnt = lowerpath.mnt;
+		ctr++;
+		prev = this;
+		if (opaque)
+			break;
+	}
+
+	oe = ovl_alloc_entry(ctr);
+	err = -ENOMEM;
+	if (!oe)
+		goto out_put;
+
+	if (upperdentry || ctr) {
+		struct dentry *realdentry;
+
+		realdentry = upperdentry ? upperdentry : stack[0].dentry;
+
+		err = -ENOMEM;
+		inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
+				      oe);
+		if (!inode)
+			goto out_free_oe;
+		ovl_copyattr(realdentry->d_inode, inode);
+	}
+
+	oe->opaque = upperopaque;
+	oe->__upperdentry = upperdentry;
+	memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
+	kfree(stack);
+	dentry->d_fsdata = oe;
+	d_add(dentry, inode);
+
+	return NULL;
+
+out_free_oe:
+	kfree(oe);
+out_put:
+	for (i = 0; i < ctr; i++)
+		dput(stack[i].dentry);
+	kfree(stack);
+out_put_upper:
+	dput(upperdentry);
+out:
+	return ERR_PTR(err);
+}
+
+struct file *ovl_path_open(struct path *path, int flags)
+{
+	return dentry_open(path, flags, current_cred());
+}
+
+static void ovl_put_super(struct super_block *sb)
+{
+	struct ovl_fs *ufs = sb->s_fs_info;
+	unsigned i;
+
+	dput(ufs->workdir);
+	mntput(ufs->upper_mnt);
+	for (i = 0; i < ufs->numlower; i++)
+		mntput(ufs->lower_mnt[i]);
+
+	kfree(ufs->config.lowerdir);
+	kfree(ufs->config.upperdir);
+	kfree(ufs->config.workdir);
+	kfree(ufs);
+}
+
+/**
+ * ovl_statfs
+ * @sb: The overlayfs super block
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics.  As writes always target the upper layer
+ * filesystem pass the statfs to the upper filesystem (if it exists)
+ */
+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	struct dentry *root_dentry = dentry->d_sb->s_root;
+	struct path path;
+	int err;
+
+	ovl_path_real(root_dentry, &path);
+
+	err = vfs_statfs(&path, buf);
+	if (!err) {
+		buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
+		buf->f_type = OVERLAYFS_SUPER_MAGIC;
+	}
+
+	return err;
+}
+
+/**
+ * ovl_show_options
+ *
+ * Prints the mount options for a given superblock.
+ * Returns zero; does not fail.
+ */
+static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ovl_fs *ufs = sb->s_fs_info;
+
+	seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+	if (ufs->config.upperdir) {
+		seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
+		seq_printf(m, ",workdir=%s", ufs->config.workdir);
+	}
+	return 0;
+}
+
+static int ovl_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct ovl_fs *ufs = sb->s_fs_info;
+
+	if (!(*flags & MS_RDONLY) && (!ufs->upper_mnt || !ufs->workdir))
+		return -EROFS;
+
+	return 0;
+}
+
+static const struct super_operations ovl_super_operations = {
+	.put_super	= ovl_put_super,
+	.statfs		= ovl_statfs,
+	.show_options	= ovl_show_options,
+	.remount_fs	= ovl_remount,
+};
+
+enum {
+	OPT_LOWERDIR,
+	OPT_UPPERDIR,
+	OPT_WORKDIR,
+	OPT_ERR,
+};
+
+static const match_table_t ovl_tokens = {
+	{OPT_LOWERDIR,			"lowerdir=%s"},
+	{OPT_UPPERDIR,			"upperdir=%s"},
+	{OPT_WORKDIR,			"workdir=%s"},
+	{OPT_ERR,			NULL}
+};
+
+static char *ovl_next_opt(char **s)
+{
+	char *sbegin = *s;
+	char *p;
+
+	if (sbegin == NULL)
+		return NULL;
+
+	for (p = sbegin; *p; p++) {
+		if (*p == '\\') {
+			p++;
+			if (!*p)
+				break;
+		} else if (*p == ',') {
+			*p = '\0';
+			*s = p + 1;
+			return sbegin;
+		}
+	}
+	*s = NULL;
+	return sbegin;
+}
+
+static int ovl_parse_opt(char *opt, struct ovl_config *config)
+{
+	char *p;
+
+	while ((p = ovl_next_opt(&opt)) != NULL) {
+		int token;
+		substring_t args[MAX_OPT_ARGS];
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, ovl_tokens, args);
+		switch (token) {
+		case OPT_UPPERDIR:
+			kfree(config->upperdir);
+			config->upperdir = match_strdup(&args[0]);
+			if (!config->upperdir)
+				return -ENOMEM;
+			break;
+
+		case OPT_LOWERDIR:
+			kfree(config->lowerdir);
+			config->lowerdir = match_strdup(&args[0]);
+			if (!config->lowerdir)
+				return -ENOMEM;
+			break;
+
+		case OPT_WORKDIR:
+			kfree(config->workdir);
+			config->workdir = match_strdup(&args[0]);
+			if (!config->workdir)
+				return -ENOMEM;
+			break;
+
+		default:
+			pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
+			return -EINVAL;
+		}
+	}
+
+	/* Workdir is useless in non-upper mount */
+	if (!config->upperdir && config->workdir) {
+		pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+			config->workdir);
+		kfree(config->workdir);
+		config->workdir = NULL;
+	}
+
+	return 0;
+}
+
+#define OVL_WORKDIR_NAME "work"
+
+static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
+					 struct dentry *dentry)
+{
+	struct inode *dir = dentry->d_inode;
+	struct dentry *work;
+	int err;
+	bool retried = false;
+
+	err = mnt_want_write(mnt);
+	if (err)
+		return ERR_PTR(err);
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+retry:
+	work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
+			      strlen(OVL_WORKDIR_NAME));
+
+	if (!IS_ERR(work)) {
+		struct kstat stat = {
+			.mode = S_IFDIR | 0,
+		};
+
+		if (work->d_inode) {
+			err = -EEXIST;
+			if (retried)
+				goto out_dput;
+
+			retried = true;
+			ovl_cleanup(dir, work);
+			dput(work);
+			goto retry;
+		}
+
+		err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
+		if (err)
+			goto out_dput;
+	}
+out_unlock:
+	mutex_unlock(&dir->i_mutex);
+	mnt_drop_write(mnt);
+
+	return work;
+
+out_dput:
+	dput(work);
+	work = ERR_PTR(err);
+	goto out_unlock;
+}
+
+static void ovl_unescape(char *s)
+{
+	char *d = s;
+
+	for (;; s++, d++) {
+		if (*s == '\\')
+			s++;
+		*d = *s;
+		if (!*s)
+			break;
+	}
+}
+
+static bool ovl_is_allowed_fs_type(struct dentry *root)
+{
+	const struct dentry_operations *dop = root->d_op;
+
+	/*
+	 * We don't support:
+	 *  - automount filesystems
+	 *  - filesystems with revalidate (FIXME for lower layer)
+	 *  - filesystems with case insensitive names
+	 */
+	if (dop &&
+	    (dop->d_manage || dop->d_automount ||
+	     dop->d_revalidate || dop->d_weak_revalidate ||
+	     dop->d_compare || dop->d_hash)) {
+		return false;
+	}
+	return true;
+}
+
+static int ovl_mount_dir_noesc(const char *name, struct path *path)
+{
+	int err = -EINVAL;
+
+	if (!*name) {
+		pr_err("overlayfs: empty lowerdir\n");
+		goto out;
+	}
+	err = kern_path(name, LOOKUP_FOLLOW, path);
+	if (err) {
+		pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
+		goto out;
+	}
+	err = -EINVAL;
+	if (!ovl_is_allowed_fs_type(path->dentry)) {
+		pr_err("overlayfs: filesystem on '%s' not supported\n", name);
+		goto out_put;
+	}
+	if (!S_ISDIR(path->dentry->d_inode->i_mode)) {
+		pr_err("overlayfs: '%s' not a directory\n", name);
+		goto out_put;
+	}
+	return 0;
+
+out_put:
+	path_put(path);
+out:
+	return err;
+}
+
+static int ovl_mount_dir(const char *name, struct path *path)
+{
+	int err = -ENOMEM;
+	char *tmp = kstrdup(name, GFP_KERNEL);
+
+	if (tmp) {
+		ovl_unescape(tmp);
+		err = ovl_mount_dir_noesc(tmp, path);
+		kfree(tmp);
+	}
+	return err;
+}
+
+static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
+			 int *stack_depth)
+{
+	int err;
+	struct kstatfs statfs;
+
+	err = ovl_mount_dir_noesc(name, path);
+	if (err)
+		goto out;
+
+	err = vfs_statfs(path, &statfs);
+	if (err) {
+		pr_err("overlayfs: statfs failed on '%s'\n", name);
+		goto out_put;
+	}
+	*namelen = max(*namelen, statfs.f_namelen);
+	*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
+
+	return 0;
+
+out_put:
+	path_put(path);
+out:
+	return err;
+}
+
+/* Workdir should not be subdir of upperdir and vice versa */
+static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
+{
+	bool ok = false;
+
+	if (workdir != upperdir) {
+		ok = (lock_rename(workdir, upperdir) == NULL);
+		unlock_rename(workdir, upperdir);
+	}
+	return ok;
+}
+
+static unsigned int ovl_split_lowerdirs(char *str)
+{
+	unsigned int ctr = 1;
+	char *s, *d;
+
+	for (s = d = str;; s++, d++) {
+		if (*s == '\\') {
+			s++;
+		} else if (*s == ':') {
+			*d = '\0';
+			ctr++;
+			continue;
+		}
+		*d = *s;
+		if (!*s)
+			break;
+	}
+	return ctr;
+}
+
+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct path upperpath = { NULL, NULL };
+	struct path workpath = { NULL, NULL };
+	struct dentry *root_dentry;
+	struct ovl_entry *oe;
+	struct ovl_fs *ufs;
+	struct path *stack = NULL;
+	char *lowertmp;
+	char *lower;
+	unsigned int numlower;
+	unsigned int stacklen = 0;
+	unsigned int i;
+	int err;
+
+	err = -ENOMEM;
+	ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+	if (!ufs)
+		goto out;
+
+	err = ovl_parse_opt((char *) data, &ufs->config);
+	if (err)
+		goto out_free_config;
+
+	err = -EINVAL;
+	if (!ufs->config.lowerdir) {
+		pr_err("overlayfs: missing 'lowerdir'\n");
+		goto out_free_config;
+	}
+
+	sb->s_stack_depth = 0;
+	if (ufs->config.upperdir) {
+		if (!ufs->config.workdir) {
+			pr_err("overlayfs: missing 'workdir'\n");
+			goto out_free_config;
+		}
+
+		err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
+		if (err)
+			goto out_free_config;
+
+		/* Upper fs should not be r/o */
+		if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) {
+			pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+			err = -EINVAL;
+			goto out_put_upperpath;
+		}
+
+		err = ovl_mount_dir(ufs->config.workdir, &workpath);
+		if (err)
+			goto out_put_upperpath;
+
+		err = -EINVAL;
+		if (upperpath.mnt != workpath.mnt) {
+			pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+			goto out_put_workpath;
+		}
+		if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
+			pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+			goto out_put_workpath;
+		}
+		sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth;
+	}
+	err = -ENOMEM;
+	lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL);
+	if (!lowertmp)
+		goto out_put_workpath;
+
+	err = -EINVAL;
+	stacklen = ovl_split_lowerdirs(lowertmp);
+	if (stacklen > OVL_MAX_STACK) {
+		pr_err("overlayfs: too many lower directries, limit is %d\n",
+		       OVL_MAX_STACK);
+		goto out_free_lowertmp;
+	} else if (!ufs->config.upperdir && stacklen == 1) {
+		pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
+		goto out_free_lowertmp;
+	}
+
+	stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
+	if (!stack)
+		goto out_free_lowertmp;
+
+	lower = lowertmp;
+	for (numlower = 0; numlower < stacklen; numlower++) {
+		err = ovl_lower_dir(lower, &stack[numlower],
+				    &ufs->lower_namelen, &sb->s_stack_depth);
+		if (err)
+			goto out_put_lowerpath;
+
+		lower = strchr(lower, '\0') + 1;
+	}
+
+	err = -EINVAL;
+	sb->s_stack_depth++;
+	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+		pr_err("overlayfs: maximum fs stacking depth exceeded\n");
+		goto out_put_lowerpath;
+	}
+
+	if (ufs->config.upperdir) {
+		ufs->upper_mnt = clone_private_mount(&upperpath);
+		err = PTR_ERR(ufs->upper_mnt);
+		if (IS_ERR(ufs->upper_mnt)) {
+			pr_err("overlayfs: failed to clone upperpath\n");
+			goto out_put_lowerpath;
+		}
+
+		ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
+		err = PTR_ERR(ufs->workdir);
+		if (IS_ERR(ufs->workdir)) {
+			pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n",
+				ufs->config.workdir, OVL_WORKDIR_NAME, -err);
+			sb->s_flags |= MS_RDONLY;
+			ufs->workdir = NULL;
+		}
+	}
+
+	err = -ENOMEM;
+	ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL);
+	if (ufs->lower_mnt == NULL)
+		goto out_put_workdir;
+	for (i = 0; i < numlower; i++) {
+		struct vfsmount *mnt = clone_private_mount(&stack[i]);
+
+		err = PTR_ERR(mnt);
+		if (IS_ERR(mnt)) {
+			pr_err("overlayfs: failed to clone lowerpath\n");
+			goto out_put_lower_mnt;
+		}
+		/*
+		 * Make lower_mnt R/O.  That way fchmod/fchown on lower file
+		 * will fail instead of modifying lower fs.
+		 */
+		mnt->mnt_flags |= MNT_READONLY;
+
+		ufs->lower_mnt[ufs->numlower] = mnt;
+		ufs->numlower++;
+	}
+
+	/* If the upper fs is nonexistent, we mark overlayfs r/o too */
+	if (!ufs->upper_mnt)
+		sb->s_flags |= MS_RDONLY;
+
+	sb->s_d_op = &ovl_dentry_operations;
+
+	err = -ENOMEM;
+	oe = ovl_alloc_entry(numlower);
+	if (!oe)
+		goto out_put_lower_mnt;
+
+	root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe));
+	if (!root_dentry)
+		goto out_free_oe;
+
+	mntput(upperpath.mnt);
+	for (i = 0; i < numlower; i++)
+		mntput(stack[i].mnt);
+	path_put(&workpath);
+	kfree(lowertmp);
+
+	oe->__upperdentry = upperpath.dentry;
+	for (i = 0; i < numlower; i++) {
+		oe->lowerstack[i].dentry = stack[i].dentry;
+		oe->lowerstack[i].mnt = ufs->lower_mnt[i];
+	}
+
+	root_dentry->d_fsdata = oe;
+
+	sb->s_magic = OVERLAYFS_SUPER_MAGIC;
+	sb->s_op = &ovl_super_operations;
+	sb->s_root = root_dentry;
+	sb->s_fs_info = ufs;
+
+	return 0;
+
+out_free_oe:
+	kfree(oe);
+out_put_lower_mnt:
+	for (i = 0; i < ufs->numlower; i++)
+		mntput(ufs->lower_mnt[i]);
+	kfree(ufs->lower_mnt);
+out_put_workdir:
+	dput(ufs->workdir);
+	mntput(ufs->upper_mnt);
+out_put_lowerpath:
+	for (i = 0; i < numlower; i++)
+		path_put(&stack[i]);
+	kfree(stack);
+out_free_lowertmp:
+	kfree(lowertmp);
+out_put_workpath:
+	path_put(&workpath);
+out_put_upperpath:
+	path_put(&upperpath);
+out_free_config:
+	kfree(ufs->config.lowerdir);
+	kfree(ufs->config.upperdir);
+	kfree(ufs->config.workdir);
+	kfree(ufs);
+out:
+	return err;
+}
+
+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
+				const char *dev_name, void *raw_data)
+{
+	return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
+}
+
+static struct file_system_type ovl_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "overlay",
+	.mount		= ovl_mount,
+	.kill_sb	= kill_anon_super,
+};
+MODULE_ALIAS_FS("overlay");
+
+static int __init ovl_init(void)
+{
+	return register_filesystem(&ovl_fs_type);
+}
+
+static void __exit ovl_exit(void)
+{
+	unregister_filesystem(&ovl_fs_type);
+}
+
+module_init(ovl_init);
+module_exit(ovl_exit);
diff --git a/kernel/fs/pipe.c b/kernel/fs/pipe.c
new file mode 100644
index 000000000..8865f7963
--- /dev/null
+++ b/kernel/fs/pipe.c
@@ -0,0 +1,1125 @@
+/*
+ *  linux/fs/pipe.c
+ *
+ *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/log2.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/uio.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/audit.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+#include "internal.h"
+
+/*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-size
+ */
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
+
+/*
+ * We use a start+len construction, which provides full use of the 
+ * allocated memory.
+ * -- Florian Coosmann (FGC)
+ * 
+ * Reads with count = 0 should always return 0.
+ * -- Julian Bradfield 1999-06-07.
+ *
+ * FIFOs and Pipes now generate SIGIO for both readers and writers.
+ * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
+ *
+ * pipe_read & write cleanup
+ * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
+ */
+
+static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
+{
+	if (pipe->files)
+		mutex_lock_nested(&pipe->mutex, subclass);
+}
+
+void pipe_lock(struct pipe_inode_info *pipe)
+{
+	/*
+	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
+	 */
+	pipe_lock_nested(pipe, I_MUTEX_PARENT);
+}
+EXPORT_SYMBOL(pipe_lock);
+
+void pipe_unlock(struct pipe_inode_info *pipe)
+{
+	if (pipe->files)
+		mutex_unlock(&pipe->mutex);
+}
+EXPORT_SYMBOL(pipe_unlock);
+
+static inline void __pipe_lock(struct pipe_inode_info *pipe)
+{
+	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
+}
+
+static inline void __pipe_unlock(struct pipe_inode_info *pipe)
+{
+	mutex_unlock(&pipe->mutex);
+}
+
+void pipe_double_lock(struct pipe_inode_info *pipe1,
+		      struct pipe_inode_info *pipe2)
+{
+	BUG_ON(pipe1 == pipe2);
+
+	if (pipe1 < pipe2) {
+		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
+		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
+	} else {
+		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
+		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
+	}
+}
+
+/* Drop the inode semaphore and wait for a pipe event, atomically */
+void pipe_wait(struct pipe_inode_info *pipe)
+{
+	DEFINE_WAIT(wait);
+
+	/*
+	 * Pipes are system-local resources, so sleeping on them
+	 * is considered a noninteractive wait:
+	 */
+	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
+	pipe_unlock(pipe);
+	schedule();
+	finish_wait(&pipe->wait, &wait);
+	pipe_lock(pipe);
+}
+
+static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
+				  struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	/*
+	 * If nobody else uses this page, and we don't already have a
+	 * temporary page, let's keep track of it as a one-deep
+	 * allocation cache. (Otherwise just release our reference to it)
+	 */
+	if (page_count(page) == 1 && !pipe->tmp_page)
+		pipe->tmp_page = page;
+	else
+		page_cache_release(page);
+}
+
+/**
+ * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
+ * @pipe:	the pipe that the buffer belongs to
+ * @buf:	the buffer to attempt to steal
+ *
+ * Description:
+ *	This function attempts to steal the &struct page attached to
+ *	@buf. If successful, this function returns 0 and returns with
+ *	the page locked. The caller may then reuse the page for whatever
+ *	he wishes; the typical use is insertion into a different file
+ *	page cache.
+ */
+int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
+			   struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	/*
+	 * A reference of one is golden, that means that the owner of this
+	 * page is the only one holding a reference to it. lock the page
+	 * and return OK.
+	 */
+	if (page_count(page) == 1) {
+		lock_page(page);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(generic_pipe_buf_steal);
+
+/**
+ * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
+ * @pipe:	the pipe that the buffer belongs to
+ * @buf:	the buffer to get a reference to
+ *
+ * Description:
+ *	This function grabs an extra reference to @buf. It's used in
+ *	in the tee() system call, when we duplicate the buffers in one
+ *	pipe into another.
+ */
+void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+{
+	page_cache_get(buf->page);
+}
+EXPORT_SYMBOL(generic_pipe_buf_get);
+
+/**
+ * generic_pipe_buf_confirm - verify contents of the pipe buffer
+ * @info:	the pipe that the buffer belongs to
+ * @buf:	the buffer to confirm
+ *
+ * Description:
+ *	This function does nothing, because the generic pipe code uses
+ *	pages that are always good when inserted into the pipe.
+ */
+int generic_pipe_buf_confirm(struct pipe_inode_info *info,
+			     struct pipe_buffer *buf)
+{
+	return 0;
+}
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
+
+/**
+ * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
+ * @pipe:	the pipe that the buffer belongs to
+ * @buf:	the buffer to put a reference to
+ *
+ * Description:
+ *	This function releases a reference to @buf.
+ */
+void generic_pipe_buf_release(struct pipe_inode_info *pipe,
+			      struct pipe_buffer *buf)
+{
+	page_cache_release(buf->page);
+}
+EXPORT_SYMBOL(generic_pipe_buf_release);
+
+static const struct pipe_buf_operations anon_pipe_buf_ops = {
+	.can_merge = 1,
+	.confirm = generic_pipe_buf_confirm,
+	.release = anon_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static const struct pipe_buf_operations packet_pipe_buf_ops = {
+	.can_merge = 0,
+	.confirm = generic_pipe_buf_confirm,
+	.release = anon_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static ssize_t
+pipe_read(struct kiocb *iocb, struct iov_iter *to)
+{
+	size_t total_len = iov_iter_count(to);
+	struct file *filp = iocb->ki_filp;
+	struct pipe_inode_info *pipe = filp->private_data;
+	int do_wakeup;
+	ssize_t ret;
+
+	/* Null read succeeds. */
+	if (unlikely(total_len == 0))
+		return 0;
+
+	do_wakeup = 0;
+	ret = 0;
+	__pipe_lock(pipe);
+	for (;;) {
+		int bufs = pipe->nrbufs;
+		if (bufs) {
+			int curbuf = pipe->curbuf;
+			struct pipe_buffer *buf = pipe->bufs + curbuf;
+			const struct pipe_buf_operations *ops = buf->ops;
+			size_t chars = buf->len;
+			size_t written;
+			int error;
+
+			if (chars > total_len)
+				chars = total_len;
+
+			error = ops->confirm(pipe, buf);
+			if (error) {
+				if (!ret)
+					ret = error;
+				break;
+			}
+
+			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
+			if (unlikely(written < chars)) {
+				if (!ret)
+					ret = -EFAULT;
+				break;
+			}
+			ret += chars;
+			buf->offset += chars;
+			buf->len -= chars;
+
+			/* Was it a packet buffer? Clean up and exit */
+			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
+				total_len = chars;
+				buf->len = 0;
+			}
+
+			if (!buf->len) {
+				buf->ops = NULL;
+				ops->release(pipe, buf);
+				curbuf = (curbuf + 1) & (pipe->buffers - 1);
+				pipe->curbuf = curbuf;
+				pipe->nrbufs = --bufs;
+				do_wakeup = 1;
+			}
+			total_len -= chars;
+			if (!total_len)
+				break;	/* common path: read succeeded */
+		}
+		if (bufs)	/* More to do? */
+			continue;
+		if (!pipe->writers)
+			break;
+		if (!pipe->waiting_writers) {
+			/* syscall merging: Usually we must not sleep
+			 * if O_NONBLOCK is set, or if we got some data.
+			 * But if a writer sleeps in kernel space, then
+			 * we can wait for that data without violating POSIX.
+			 */
+			if (ret)
+				break;
+			if (filp->f_flags & O_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+		}
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+		if (do_wakeup) {
+			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
+ 			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+		}
+		pipe_wait(pipe);
+	}
+	__pipe_unlock(pipe);
+
+	/* Signal writers asynchronously that there is more room. */
+	if (do_wakeup) {
+		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
+		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+	}
+	if (ret > 0)
+		file_accessed(filp);
+	return ret;
+}
+
+static inline int is_packetized(struct file *file)
+{
+	return (file->f_flags & O_DIRECT) != 0;
+}
+
+static ssize_t
+pipe_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *filp = iocb->ki_filp;
+	struct pipe_inode_info *pipe = filp->private_data;
+	ssize_t ret = 0;
+	int do_wakeup = 0;
+	size_t total_len = iov_iter_count(from);
+	ssize_t chars;
+
+	/* Null write succeeds. */
+	if (unlikely(total_len == 0))
+		return 0;
+
+	__pipe_lock(pipe);
+
+	if (!pipe->readers) {
+		send_sig(SIGPIPE, current, 0);
+		ret = -EPIPE;
+		goto out;
+	}
+
+	/* We try to merge small writes */
+	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
+	if (pipe->nrbufs && chars != 0) {
+		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
+							(pipe->buffers - 1);
+		struct pipe_buffer *buf = pipe->bufs + lastbuf;
+		const struct pipe_buf_operations *ops = buf->ops;
+		int offset = buf->offset + buf->len;
+
+		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
+			int error = ops->confirm(pipe, buf);
+			if (error)
+				goto out;
+
+			ret = copy_page_from_iter(buf->page, offset, chars, from);
+			if (unlikely(ret < chars)) {
+				error = -EFAULT;
+				goto out;
+			}
+			do_wakeup = 1;
+			buf->len += chars;
+			ret = chars;
+			if (!iov_iter_count(from))
+				goto out;
+		}
+	}
+
+	for (;;) {
+		int bufs;
+
+		if (!pipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+		bufs = pipe->nrbufs;
+		if (bufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
+			struct pipe_buffer *buf = pipe->bufs + newbuf;
+			struct page *page = pipe->tmp_page;
+			int copied;
+
+			if (!page) {
+				page = alloc_page(GFP_HIGHUSER);
+				if (unlikely(!page)) {
+					ret = ret ? : -ENOMEM;
+					break;
+				}
+				pipe->tmp_page = page;
+			}
+			/* Always wake up, even if the copy fails. Otherwise
+			 * we lock up (O_NONBLOCK-)readers that sleep due to
+			 * syscall merging.
+			 * FIXME! Is this really true?
+			 */
+			do_wakeup = 1;
+			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
+			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
+				if (!ret)
+					ret = -EFAULT;
+				break;
+			}
+			ret += copied;
+
+			/* Insert it into the buffer array */
+			buf->page = page;
+			buf->ops = &anon_pipe_buf_ops;
+			buf->offset = 0;
+			buf->len = copied;
+			buf->flags = 0;
+			if (is_packetized(filp)) {
+				buf->ops = &packet_pipe_buf_ops;
+				buf->flags = PIPE_BUF_FLAG_PACKET;
+			}
+			pipe->nrbufs = ++bufs;
+			pipe->tmp_page = NULL;
+
+			if (!iov_iter_count(from))
+				break;
+		}
+		if (bufs < pipe->buffers)
+			continue;
+		if (filp->f_flags & O_NONBLOCK) {
+			if (!ret)
+				ret = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+		if (do_wakeup) {
+			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
+			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+			do_wakeup = 0;
+		}
+		pipe->waiting_writers++;
+		pipe_wait(pipe);
+		pipe->waiting_writers--;
+	}
+out:
+	__pipe_unlock(pipe);
+	if (do_wakeup) {
+		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
+		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+	}
+	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
+		int err = file_update_time(filp);
+		if (err)
+			ret = err;
+		sb_end_write(file_inode(filp)->i_sb);
+	}
+	return ret;
+}
+
+static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct pipe_inode_info *pipe = filp->private_data;
+	int count, buf, nrbufs;
+
+	switch (cmd) {
+		case FIONREAD:
+			__pipe_lock(pipe);
+			count = 0;
+			buf = pipe->curbuf;
+			nrbufs = pipe->nrbufs;
+			while (--nrbufs >= 0) {
+				count += pipe->bufs[buf].len;
+				buf = (buf+1) & (pipe->buffers - 1);
+			}
+			__pipe_unlock(pipe);
+
+			return put_user(count, (int __user *)arg);
+		default:
+			return -ENOIOCTLCMD;
+	}
+}
+
+/* No kernel lock held - fine */
+static unsigned int
+pipe_poll(struct file *filp, poll_table *wait)
+{
+	unsigned int mask;
+	struct pipe_inode_info *pipe = filp->private_data;
+	int nrbufs;
+
+	poll_wait(filp, &pipe->wait, wait);
+
+	/* Reading only -- no need for acquiring the semaphore.  */
+	nrbufs = pipe->nrbufs;
+	mask = 0;
+	if (filp->f_mode & FMODE_READ) {
+		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
+		if (!pipe->writers && filp->f_version != pipe->w_counter)
+			mask |= POLLHUP;
+	}
+
+	if (filp->f_mode & FMODE_WRITE) {
+		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
+		/*
+		 * Most Unices do not set POLLERR for FIFOs but on Linux they
+		 * behave exactly like pipes for poll().
+		 */
+		if (!pipe->readers)
+			mask |= POLLERR;
+	}
+
+	return mask;
+}
+
+static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
+{
+	int kill = 0;
+
+	spin_lock(&inode->i_lock);
+	if (!--pipe->files) {
+		inode->i_pipe = NULL;
+		kill = 1;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (kill)
+		free_pipe_info(pipe);
+}
+
+static int
+pipe_release(struct inode *inode, struct file *file)
+{
+	struct pipe_inode_info *pipe = file->private_data;
+
+	__pipe_lock(pipe);
+	if (file->f_mode & FMODE_READ)
+		pipe->readers--;
+	if (file->f_mode & FMODE_WRITE)
+		pipe->writers--;
+
+	if (pipe->readers || pipe->writers) {
+		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
+		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+	}
+	__pipe_unlock(pipe);
+
+	put_pipe_info(inode, pipe);
+	return 0;
+}
+
+static int
+pipe_fasync(int fd, struct file *filp, int on)
+{
+	struct pipe_inode_info *pipe = filp->private_data;
+	int retval = 0;
+
+	__pipe_lock(pipe);
+	if (filp->f_mode & FMODE_READ)
+		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
+	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
+		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
+		if (retval < 0 && (filp->f_mode & FMODE_READ))
+			/* this can happen only if on == T */
+			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
+	}
+	__pipe_unlock(pipe);
+	return retval;
+}
+
+struct pipe_inode_info *alloc_pipe_info(void)
+{
+	struct pipe_inode_info *pipe;
+
+	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+	if (pipe) {
+		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+		if (pipe->bufs) {
+			init_waitqueue_head(&pipe->wait);
+			pipe->r_counter = pipe->w_counter = 1;
+			pipe->buffers = PIPE_DEF_BUFFERS;
+			mutex_init(&pipe->mutex);
+			return pipe;
+		}
+		kfree(pipe);
+	}
+
+	return NULL;
+}
+
+void free_pipe_info(struct pipe_inode_info *pipe)
+{
+	int i;
+
+	for (i = 0; i < pipe->buffers; i++) {
+		struct pipe_buffer *buf = pipe->bufs + i;
+		if (buf->ops)
+			buf->ops->release(pipe, buf);
+	}
+	if (pipe->tmp_page)
+		__free_page(pipe->tmp_page);
+	kfree(pipe->bufs);
+	kfree(pipe);
+}
+
+static struct vfsmount *pipe_mnt __read_mostly;
+
+/*
+ * pipefs_dname() is called from d_path().
+ */
+static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+				d_inode(dentry)->i_ino);
+}
+
+static const struct dentry_operations pipefs_dentry_operations = {
+	.d_dname	= pipefs_dname,
+};
+
+static struct inode * get_pipe_inode(void)
+{
+	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
+	struct pipe_inode_info *pipe;
+
+	if (!inode)
+		goto fail_inode;
+
+	inode->i_ino = get_next_ino();
+
+	pipe = alloc_pipe_info();
+	if (!pipe)
+		goto fail_iput;
+
+	inode->i_pipe = pipe;
+	pipe->files = 2;
+	pipe->readers = pipe->writers = 1;
+	inode->i_fop = &pipefifo_fops;
+
+	/*
+	 * Mark the inode dirty from the very beginning,
+	 * that way it will never be moved to the dirty
+	 * list because "mark_inode_dirty()" will think
+	 * that it already _is_ on the dirty list.
+	 */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	return inode;
+
+fail_iput:
+	iput(inode);
+
+fail_inode:
+	return NULL;
+}
+
+int create_pipe_files(struct file **res, int flags)
+{
+	int err;
+	struct inode *inode = get_pipe_inode();
+	struct file *f;
+	struct path path;
+	static struct qstr name = { .name = "" };
+
+	if (!inode)
+		return -ENFILE;
+
+	err = -ENOMEM;
+	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
+	if (!path.dentry)
+		goto err_inode;
+	path.mnt = mntget(pipe_mnt);
+
+	d_instantiate(path.dentry, inode);
+
+	err = -ENFILE;
+	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
+	if (IS_ERR(f))
+		goto err_dentry;
+
+	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
+	f->private_data = inode->i_pipe;
+
+	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
+	if (IS_ERR(res[0]))
+		goto err_file;
+
+	path_get(&path);
+	res[0]->private_data = inode->i_pipe;
+	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
+	res[1] = f;
+	return 0;
+
+err_file:
+	put_filp(f);
+err_dentry:
+	free_pipe_info(inode->i_pipe);
+	path_put(&path);
+	return err;
+
+err_inode:
+	free_pipe_info(inode->i_pipe);
+	iput(inode);
+	return err;
+}
+
+static int __do_pipe_flags(int *fd, struct file **files, int flags)
+{
+	int error;
+	int fdw, fdr;
+
+	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
+		return -EINVAL;
+
+	error = create_pipe_files(files, flags);
+	if (error)
+		return error;
+
+	error = get_unused_fd_flags(flags);
+	if (error < 0)
+		goto err_read_pipe;
+	fdr = error;
+
+	error = get_unused_fd_flags(flags);
+	if (error < 0)
+		goto err_fdr;
+	fdw = error;
+
+	audit_fd_pair(fdr, fdw);
+	fd[0] = fdr;
+	fd[1] = fdw;
+	return 0;
+
+ err_fdr:
+	put_unused_fd(fdr);
+ err_read_pipe:
+	fput(files[0]);
+	fput(files[1]);
+	return error;
+}
+
+int do_pipe_flags(int *fd, int flags)
+{
+	struct file *files[2];
+	int error = __do_pipe_flags(fd, files, flags);
+	if (!error) {
+		fd_install(fd[0], files[0]);
+		fd_install(fd[1], files[1]);
+	}
+	return error;
+}
+
+/*
+ * sys_pipe() is the normal C calling standard for creating
+ * a pipe. It's not the way Unix traditionally does this, though.
+ */
+SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
+{
+	struct file *files[2];
+	int fd[2];
+	int error;
+
+	error = __do_pipe_flags(fd, files, flags);
+	if (!error) {
+		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
+			fput(files[0]);
+			fput(files[1]);
+			put_unused_fd(fd[0]);
+			put_unused_fd(fd[1]);
+			error = -EFAULT;
+		} else {
+			fd_install(fd[0], files[0]);
+			fd_install(fd[1], files[1]);
+		}
+	}
+	return error;
+}
+
+SYSCALL_DEFINE1(pipe, int __user *, fildes)
+{
+	return sys_pipe2(fildes, 0);
+}
+
+static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
+{
+	int cur = *cnt;	
+
+	while (cur == *cnt) {
+		pipe_wait(pipe);
+		if (signal_pending(current))
+			break;
+	}
+	return cur == *cnt ? -ERESTARTSYS : 0;
+}
+
+static void wake_up_partner(struct pipe_inode_info *pipe)
+{
+	wake_up_interruptible(&pipe->wait);
+}
+
+static int fifo_open(struct inode *inode, struct file *filp)
+{
+	struct pipe_inode_info *pipe;
+	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
+	int ret;
+
+	filp->f_version = 0;
+
+	spin_lock(&inode->i_lock);
+	if (inode->i_pipe) {
+		pipe = inode->i_pipe;
+		pipe->files++;
+		spin_unlock(&inode->i_lock);
+	} else {
+		spin_unlock(&inode->i_lock);
+		pipe = alloc_pipe_info();
+		if (!pipe)
+			return -ENOMEM;
+		pipe->files = 1;
+		spin_lock(&inode->i_lock);
+		if (unlikely(inode->i_pipe)) {
+			inode->i_pipe->files++;
+			spin_unlock(&inode->i_lock);
+			free_pipe_info(pipe);
+			pipe = inode->i_pipe;
+		} else {
+			inode->i_pipe = pipe;
+			spin_unlock(&inode->i_lock);
+		}
+	}
+	filp->private_data = pipe;
+	/* OK, we have a pipe and it's pinned down */
+
+	__pipe_lock(pipe);
+
+	/* We can only do regular read/write on fifos */
+	filp->f_mode &= (FMODE_READ | FMODE_WRITE);
+
+	switch (filp->f_mode) {
+	case FMODE_READ:
+	/*
+	 *  O_RDONLY
+	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
+	 *  opened, even when there is no process writing the FIFO.
+	 */
+		pipe->r_counter++;
+		if (pipe->readers++ == 0)
+			wake_up_partner(pipe);
+
+		if (!is_pipe && !pipe->writers) {
+			if ((filp->f_flags & O_NONBLOCK)) {
+				/* suppress POLLHUP until we have
+				 * seen a writer */
+				filp->f_version = pipe->w_counter;
+			} else {
+				if (wait_for_partner(pipe, &pipe->w_counter))
+					goto err_rd;
+			}
+		}
+		break;
+	
+	case FMODE_WRITE:
+	/*
+	 *  O_WRONLY
+	 *  POSIX.1 says that O_NONBLOCK means return -1 with
+	 *  errno=ENXIO when there is no process reading the FIFO.
+	 */
+		ret = -ENXIO;
+		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
+			goto err;
+
+		pipe->w_counter++;
+		if (!pipe->writers++)
+			wake_up_partner(pipe);
+
+		if (!is_pipe && !pipe->readers) {
+			if (wait_for_partner(pipe, &pipe->r_counter))
+				goto err_wr;
+		}
+		break;
+	
+	case FMODE_READ | FMODE_WRITE:
+	/*
+	 *  O_RDWR
+	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
+	 *  This implementation will NEVER block on a O_RDWR open, since
+	 *  the process can at least talk to itself.
+	 */
+
+		pipe->readers++;
+		pipe->writers++;
+		pipe->r_counter++;
+		pipe->w_counter++;
+		if (pipe->readers == 1 || pipe->writers == 1)
+			wake_up_partner(pipe);
+		break;
+
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* Ok! */
+	__pipe_unlock(pipe);
+	return 0;
+
+err_rd:
+	if (!--pipe->readers)
+		wake_up_interruptible(&pipe->wait);
+	ret = -ERESTARTSYS;
+	goto err;
+
+err_wr:
+	if (!--pipe->writers)
+		wake_up_interruptible(&pipe->wait);
+	ret = -ERESTARTSYS;
+	goto err;
+
+err:
+	__pipe_unlock(pipe);
+
+	put_pipe_info(inode, pipe);
+	return ret;
+}
+
+const struct file_operations pipefifo_fops = {
+	.open		= fifo_open,
+	.llseek		= no_llseek,
+	.read_iter	= pipe_read,
+	.write_iter	= pipe_write,
+	.poll		= pipe_poll,
+	.unlocked_ioctl	= pipe_ioctl,
+	.release	= pipe_release,
+	.fasync		= pipe_fasync,
+};
+
+/*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
+{
+	struct pipe_buffer *bufs;
+
+	/*
+	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
+	 * expect a lot of shrink+grow operations, just free and allocate
+	 * again like we would do for growing. If the pipe currently
+	 * contains more buffers than arg, then return busy.
+	 */
+	if (nr_pages < pipe->nrbufs)
+		return -EBUSY;
+
+	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
+	if (unlikely(!bufs))
+		return -ENOMEM;
+
+	/*
+	 * The pipe array wraps around, so just start the new one at zero
+	 * and adjust the indexes.
+	 */
+	if (pipe->nrbufs) {
+		unsigned int tail;
+		unsigned int head;
+
+		tail = pipe->curbuf + pipe->nrbufs;
+		if (tail < pipe->buffers)
+			tail = 0;
+		else
+			tail &= (pipe->buffers - 1);
+
+		head = pipe->nrbufs - tail;
+		if (head)
+			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
+		if (tail)
+			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
+	}
+
+	pipe->curbuf = 0;
+	kfree(pipe->bufs);
+	pipe->bufs = bufs;
+	pipe->buffers = nr_pages;
+	return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+	unsigned long nr_pages;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+		 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	pipe_max_size = round_pipe_size(pipe_max_size);
+	return ret;
+}
+
+/*
+ * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
+ * location, so checking ->i_pipe is not enough to verify that this is a
+ * pipe.
+ */
+struct pipe_inode_info *get_pipe_info(struct file *file)
+{
+	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
+}
+
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pipe_inode_info *pipe;
+	long ret;
+
+	pipe = get_pipe_info(file);
+	if (!pipe)
+		return -EBADF;
+
+	__pipe_lock(pipe);
+
+	switch (cmd) {
+	case F_SETPIPE_SZ: {
+		unsigned int size, nr_pages;
+
+		size = round_pipe_size(arg);
+		nr_pages = size >> PAGE_SHIFT;
+
+		ret = -EINVAL;
+		if (!nr_pages)
+			goto out;
+
+		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+			ret = -EPERM;
+			goto out;
+		}
+		ret = pipe_set_size(pipe, nr_pages);
+		break;
+		}
+	case F_GETPIPE_SZ:
+		ret = pipe->buffers * PAGE_SIZE;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+out:
+	__pipe_unlock(pipe);
+	return ret;
+}
+
+static const struct super_operations pipefs_ops = {
+	.destroy_inode = free_inode_nonrcu,
+	.statfs = simple_statfs,
+};
+
+/*
+ * pipefs should _never_ be mounted by userland - too much of security hassle,
+ * no real gain from having the whole whorehouse mounted. So we don't need
+ * any operations on the root directory. However, we need a non-trivial
+ * d_name - pipe: will go nicely and kill the special-casing in procfs.
+ */
+static struct dentry *pipefs_mount(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data)
+{
+	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
+			&pipefs_dentry_operations, PIPEFS_MAGIC);
+}
+
+static struct file_system_type pipe_fs_type = {
+	.name		= "pipefs",
+	.mount		= pipefs_mount,
+	.kill_sb	= kill_anon_super,
+};
+
+static int __init init_pipe_fs(void)
+{
+	int err = register_filesystem(&pipe_fs_type);
+
+	if (!err) {
+		pipe_mnt = kern_mount(&pipe_fs_type);
+		if (IS_ERR(pipe_mnt)) {
+			err = PTR_ERR(pipe_mnt);
+			unregister_filesystem(&pipe_fs_type);
+		}
+	}
+	return err;
+}
+
+fs_initcall(init_pipe_fs);
diff --git a/kernel/fs/pnode.c b/kernel/fs/pnode.c
new file mode 100644
index 000000000..6367e1e43
--- /dev/null
+++ b/kernel/fs/pnode.c
@@ -0,0 +1,452 @@
+/*
+ *  linux/fs/pnode.c
+ *
+ * (C) Copyright IBM Corporation 2005.
+ *	Released under GPL v2.
+ *	Author : Ram Pai (linuxram@us.ibm.com)
+ *
+ */
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/nsproxy.h>
+#include "internal.h"
+#include "pnode.h"
+
+/* return the next shared peer mount of @p */
+static inline struct mount *next_peer(struct mount *p)
+{
+	return list_entry(p->mnt_share.next, struct mount, mnt_share);
+}
+
+static inline struct mount *first_slave(struct mount *p)
+{
+	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
+}
+
+static inline struct mount *next_slave(struct mount *p)
+{
+	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
+}
+
+static struct mount *get_peer_under_root(struct mount *mnt,
+					 struct mnt_namespace *ns,
+					 const struct path *root)
+{
+	struct mount *m = mnt;
+
+	do {
+		/* Check the namespace first for optimization */
+		if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root))
+			return m;
+
+		m = next_peer(m);
+	} while (m != mnt);
+
+	return NULL;
+}
+
+/*
+ * Get ID of closest dominating peer group having a representative
+ * under the given root.
+ *
+ * Caller must hold namespace_sem
+ */
+int get_dominating_id(struct mount *mnt, const struct path *root)
+{
+	struct mount *m;
+
+	for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
+		struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root);
+		if (d)
+			return d->mnt_group_id;
+	}
+
+	return 0;
+}
+
+static int do_make_slave(struct mount *mnt)
+{
+	struct mount *peer_mnt = mnt, *master = mnt->mnt_master;
+	struct mount *slave_mnt;
+
+	/*
+	 * slave 'mnt' to a peer mount that has the
+	 * same root dentry. If none is available then
+	 * slave it to anything that is available.
+	 */
+	while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
+	       peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ;
+
+	if (peer_mnt == mnt) {
+		peer_mnt = next_peer(mnt);
+		if (peer_mnt == mnt)
+			peer_mnt = NULL;
+	}
+	if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) &&
+	    list_empty(&mnt->mnt_share))
+		mnt_release_group_id(mnt);
+
+	list_del_init(&mnt->mnt_share);
+	mnt->mnt_group_id = 0;
+
+	if (peer_mnt)
+		master = peer_mnt;
+
+	if (master) {
+		list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
+			slave_mnt->mnt_master = master;
+		list_move(&mnt->mnt_slave, &master->mnt_slave_list);
+		list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
+		INIT_LIST_HEAD(&mnt->mnt_slave_list);
+	} else {
+		struct list_head *p = &mnt->mnt_slave_list;
+		while (!list_empty(p)) {
+                        slave_mnt = list_first_entry(p,
+					struct mount, mnt_slave);
+			list_del_init(&slave_mnt->mnt_slave);
+			slave_mnt->mnt_master = NULL;
+		}
+	}
+	mnt->mnt_master = master;
+	CLEAR_MNT_SHARED(mnt);
+	return 0;
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+void change_mnt_propagation(struct mount *mnt, int type)
+{
+	if (type == MS_SHARED) {
+		set_mnt_shared(mnt);
+		return;
+	}
+	do_make_slave(mnt);
+	if (type != MS_SLAVE) {
+		list_del_init(&mnt->mnt_slave);
+		mnt->mnt_master = NULL;
+		if (type == MS_UNBINDABLE)
+			mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
+		else
+			mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
+	}
+}
+
+/*
+ * get the next mount in the propagation tree.
+ * @m: the mount seen last
+ * @origin: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
+ */
+static struct mount *propagation_next(struct mount *m,
+					 struct mount *origin)
+{
+	/* are there any slaves of this mount? */
+	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+		return first_slave(m);
+
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			return (next == origin) ? NULL : next;
+		} else if (m->mnt_slave.next != &master->mnt_slave_list)
+			return next_slave(m);
+
+		/* back at master */
+		m = master;
+	}
+}
+
+static struct mount *next_group(struct mount *m, struct mount *origin)
+{
+	while (1) {
+		while (1) {
+			struct mount *next;
+			if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+				return first_slave(m);
+			next = next_peer(m);
+			if (m->mnt_group_id == origin->mnt_group_id) {
+				if (next == origin)
+					return NULL;
+			} else if (m->mnt_slave.next != &next->mnt_slave)
+				break;
+			m = next;
+		}
+		/* m is the last peer */
+		while (1) {
+			struct mount *master = m->mnt_master;
+			if (m->mnt_slave.next != &master->mnt_slave_list)
+				return next_slave(m);
+			m = next_peer(master);
+			if (master->mnt_group_id == origin->mnt_group_id)
+				break;
+			if (master->mnt_slave.next == &m->mnt_slave)
+				break;
+			m = master;
+		}
+		if (m == origin)
+			return NULL;
+	}
+}
+
+/* all accesses are serialized by namespace_sem */
+static struct user_namespace *user_ns;
+static struct mount *last_dest, *last_source, *dest_master;
+static struct mountpoint *mp;
+static struct hlist_head *list;
+
+static int propagate_one(struct mount *m)
+{
+	struct mount *child;
+	int type;
+	/* skip ones added by this propagate_mnt() */
+	if (IS_MNT_NEW(m))
+		return 0;
+	/* skip if mountpoint isn't covered by it */
+	if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
+		return 0;
+	if (m->mnt_group_id == last_dest->mnt_group_id) {
+		type = CL_MAKE_SHARED;
+	} else {
+		struct mount *n, *p;
+		for (n = m; ; n = p) {
+			p = n->mnt_master;
+			if (p == dest_master || IS_MNT_MARKED(p)) {
+				while (last_dest->mnt_master != p) {
+					last_source = last_source->mnt_master;
+					last_dest = last_source->mnt_parent;
+				}
+				if (n->mnt_group_id != last_dest->mnt_group_id) {
+					last_source = last_source->mnt_master;
+					last_dest = last_source->mnt_parent;
+				}
+				break;
+			}
+		}
+		type = CL_SLAVE;
+		/* beginning of peer group among the slaves? */
+		if (IS_MNT_SHARED(m))
+			type |= CL_MAKE_SHARED;
+	}
+		
+	/* Notice when we are propagating across user namespaces */
+	if (m->mnt_ns->user_ns != user_ns)
+		type |= CL_UNPRIVILEGED;
+	child = copy_tree(last_source, last_source->mnt.mnt_root, type);
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+	child->mnt.mnt_flags &= ~MNT_LOCKED;
+	mnt_set_mountpoint(m, mp, child);
+	last_dest = m;
+	last_source = child;
+	if (m->mnt_master != dest_master) {
+		read_seqlock_excl(&mount_lock);
+		SET_MNT_MARK(m->mnt_master);
+		read_sequnlock_excl(&mount_lock);
+	}
+	hlist_add_head(&child->mnt_hash, list);
+	return 0;
+}
+
+/*
+ * mount 'source_mnt' under the destination 'dest_mnt' at
+ * dentry 'dest_dentry'. And propagate that mount to
+ * all the peer and slave mounts of 'dest_mnt'.
+ * Link all the new mounts into a propagation tree headed at
+ * source_mnt. Also link all the new mounts using ->mnt_list
+ * headed at source_mnt's ->mnt_list
+ *
+ * @dest_mnt: destination mount.
+ * @dest_dentry: destination dentry.
+ * @source_mnt: source mount.
+ * @tree_list : list of heads of trees to be attached.
+ */
+int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
+		    struct mount *source_mnt, struct hlist_head *tree_list)
+{
+	struct mount *m, *n;
+	int ret = 0;
+
+	/*
+	 * we don't want to bother passing tons of arguments to
+	 * propagate_one(); everything is serialized by namespace_sem,
+	 * so globals will do just fine.
+	 */
+	user_ns = current->nsproxy->mnt_ns->user_ns;
+	last_dest = dest_mnt;
+	last_source = source_mnt;
+	mp = dest_mp;
+	list = tree_list;
+	dest_master = dest_mnt->mnt_master;
+
+	/* all peers of dest_mnt, except dest_mnt itself */
+	for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
+		ret = propagate_one(n);
+		if (ret)
+			goto out;
+	}
+
+	/* all slave groups */
+	for (m = next_group(dest_mnt, dest_mnt); m;
+			m = next_group(m, dest_mnt)) {
+		/* everything in that slave group */
+		n = m;
+		do {
+			ret = propagate_one(n);
+			if (ret)
+				goto out;
+			n = next_peer(n);
+		} while (n != m);
+	}
+out:
+	read_seqlock_excl(&mount_lock);
+	hlist_for_each_entry(n, tree_list, mnt_hash) {
+		m = n->mnt_parent;
+		if (m->mnt_master != dest_mnt->mnt_master)
+			CLEAR_MNT_MARK(m->mnt_master);
+	}
+	read_sequnlock_excl(&mount_lock);
+	return ret;
+}
+
+/*
+ * return true if the refcount is greater than count
+ */
+static inline int do_refcount_check(struct mount *mnt, int count)
+{
+	return mnt_get_count(mnt) > count;
+}
+
+/*
+ * check if the mount 'mnt' can be unmounted successfully.
+ * @mnt: the mount to be checked for unmount
+ * NOTE: unmounting 'mnt' would naturally propagate to all
+ * other mounts its parent propagates to.
+ * Check if any of these mounts that **do not have submounts**
+ * have more references than 'refcnt'. If so return busy.
+ *
+ * vfsmount lock must be held for write
+ */
+int propagate_mount_busy(struct mount *mnt, int refcnt)
+{
+	struct mount *m, *child;
+	struct mount *parent = mnt->mnt_parent;
+	int ret = 0;
+
+	if (mnt == parent)
+		return do_refcount_check(mnt, refcnt);
+
+	/*
+	 * quickly check if the current mount can be unmounted.
+	 * If not, we don't have to go checking for all other
+	 * mounts
+	 */
+	if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
+		return 1;
+
+	for (m = propagation_next(parent, parent); m;
+	     		m = propagation_next(m, parent)) {
+		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		if (child && list_empty(&child->mnt_mounts) &&
+		    (ret = do_refcount_check(child, 1)))
+			break;
+	}
+	return ret;
+}
+
+/*
+ * Clear MNT_LOCKED when it can be shown to be safe.
+ *
+ * mount_lock lock must be held for write
+ */
+void propagate_mount_unlock(struct mount *mnt)
+{
+	struct mount *parent = mnt->mnt_parent;
+	struct mount *m, *child;
+
+	BUG_ON(parent == mnt);
+
+	for (m = propagation_next(parent, parent); m;
+			m = propagation_next(m, parent)) {
+		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		if (child)
+			child->mnt.mnt_flags &= ~MNT_LOCKED;
+	}
+}
+
+/*
+ * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
+ */
+static void mark_umount_candidates(struct mount *mnt)
+{
+	struct mount *parent = mnt->mnt_parent;
+	struct mount *m;
+
+	BUG_ON(parent == mnt);
+
+	for (m = propagation_next(parent, parent); m;
+			m = propagation_next(m, parent)) {
+		struct mount *child = __lookup_mnt_last(&m->mnt,
+						mnt->mnt_mountpoint);
+		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+			SET_MNT_MARK(child);
+		}
+	}
+}
+
+/*
+ * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
+ * parent propagates to.
+ */
+static void __propagate_umount(struct mount *mnt)
+{
+	struct mount *parent = mnt->mnt_parent;
+	struct mount *m;
+
+	BUG_ON(parent == mnt);
+
+	for (m = propagation_next(parent, parent); m;
+			m = propagation_next(m, parent)) {
+
+		struct mount *child = __lookup_mnt_last(&m->mnt,
+						mnt->mnt_mountpoint);
+		/*
+		 * umount the child only if the child has no children
+		 * and the child is marked safe to unmount.
+		 */
+		if (!child || !IS_MNT_MARKED(child))
+			continue;
+		CLEAR_MNT_MARK(child);
+		if (list_empty(&child->mnt_mounts)) {
+			list_del_init(&child->mnt_child);
+			child->mnt.mnt_flags |= MNT_UMOUNT;
+			list_move_tail(&child->mnt_list, &mnt->mnt_list);
+		}
+	}
+}
+
+/*
+ * collect all mounts that receive propagation from the mount in @list,
+ * and return these additional mounts in the same list.
+ * @list: the list of mounts to be unmounted.
+ *
+ * vfsmount lock must be held for write
+ */
+int propagate_umount(struct list_head *list)
+{
+	struct mount *mnt;
+
+	list_for_each_entry_reverse(mnt, list, mnt_list)
+		mark_umount_candidates(mnt);
+
+	list_for_each_entry(mnt, list, mnt_list)
+		__propagate_umount(mnt);
+	return 0;
+}
diff --git a/kernel/fs/pnode.h b/kernel/fs/pnode.h
new file mode 100644
index 000000000..7114ce6e6
--- /dev/null
+++ b/kernel/fs/pnode.h
@@ -0,0 +1,57 @@
+/*
+ *  linux/fs/pnode.h
+ *
+ * (C) Copyright IBM Corporation 2005.
+ *	Released under GPL v2.
+ *
+ */
+#ifndef _LINUX_PNODE_H
+#define _LINUX_PNODE_H
+
+#include <linux/list.h>
+#include "mount.h"
+
+#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
+#define IS_MNT_SLAVE(m) ((m)->mnt_master)
+#define IS_MNT_NEW(m)  (!(m)->mnt_ns)
+#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
+#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
+#define IS_MNT_LOCKED_AND_LAZY(m) \
+	(((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED)
+
+#define CL_EXPIRE    		0x01
+#define CL_SLAVE     		0x02
+#define CL_COPY_UNBINDABLE	0x04
+#define CL_MAKE_SHARED 		0x08
+#define CL_PRIVATE 		0x10
+#define CL_SHARED_TO_SLAVE	0x20
+#define CL_UNPRIVILEGED		0x40
+#define CL_COPY_MNT_NS_FILE	0x80
+
+#define CL_COPY_ALL		(CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
+
+static inline void set_mnt_shared(struct mount *mnt)
+{
+	mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK;
+	mnt->mnt.mnt_flags |= MNT_SHARED;
+}
+
+void change_mnt_propagation(struct mount *, int);
+int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
+		struct hlist_head *);
+int propagate_umount(struct list_head *);
+int propagate_mount_busy(struct mount *, int);
+void propagate_mount_unlock(struct mount *);
+void mnt_release_group_id(struct mount *);
+int get_dominating_id(struct mount *mnt, const struct path *root);
+unsigned int mnt_get_count(struct mount *mnt);
+void mnt_set_mountpoint(struct mount *, struct mountpoint *,
+			struct mount *);
+struct mount *copy_tree(struct mount *, struct dentry *, int);
+bool is_path_reachable(struct mount *, struct dentry *,
+			 const struct path *root);
+#endif /* _LINUX_PNODE_H */
diff --git a/kernel/fs/posix_acl.c b/kernel/fs/posix_acl.c
new file mode 100644
index 000000000..84bb65b83
--- /dev/null
+++ b/kernel/fs/posix_acl.c
@@ -0,0 +1,905 @@
+/*
+ * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
+ *
+ * Fixes from William Schumacher incorporated on 15 March 2001.
+ *    (Reported by Charles Bertsch, <CBertsch@microtest.com>).
+ */
+
+/*
+ *  This file contains generic functions for manipulating
+ *  POSIX 1003.1e draft standard 17 ACLs.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+#include <linux/export.h>
+#include <linux/user_namespace.h>
+
+struct posix_acl **acl_by_type(struct inode *inode, int type)
+{
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		return &inode->i_acl;
+	case ACL_TYPE_DEFAULT:
+		return &inode->i_default_acl;
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL(acl_by_type);
+
+struct posix_acl *get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl **p = acl_by_type(inode, type);
+	struct posix_acl *acl = ACCESS_ONCE(*p);
+	if (acl) {
+		spin_lock(&inode->i_lock);
+		acl = *p;
+		if (acl != ACL_NOT_CACHED)
+			acl = posix_acl_dup(acl);
+		spin_unlock(&inode->i_lock);
+	}
+	return acl;
+}
+EXPORT_SYMBOL(get_cached_acl);
+
+struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
+{
+	return rcu_dereference(*acl_by_type(inode, type));
+}
+EXPORT_SYMBOL(get_cached_acl_rcu);
+
+void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct posix_acl **p = acl_by_type(inode, type);
+	struct posix_acl *old;
+	spin_lock(&inode->i_lock);
+	old = *p;
+	rcu_assign_pointer(*p, posix_acl_dup(acl));
+	spin_unlock(&inode->i_lock);
+	if (old != ACL_NOT_CACHED)
+		posix_acl_release(old);
+}
+EXPORT_SYMBOL(set_cached_acl);
+
+void forget_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl **p = acl_by_type(inode, type);
+	struct posix_acl *old;
+	spin_lock(&inode->i_lock);
+	old = *p;
+	*p = ACL_NOT_CACHED;
+	spin_unlock(&inode->i_lock);
+	if (old != ACL_NOT_CACHED)
+		posix_acl_release(old);
+}
+EXPORT_SYMBOL(forget_cached_acl);
+
+void forget_all_cached_acls(struct inode *inode)
+{
+	struct posix_acl *old_access, *old_default;
+	spin_lock(&inode->i_lock);
+	old_access = inode->i_acl;
+	old_default = inode->i_default_acl;
+	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+	spin_unlock(&inode->i_lock);
+	if (old_access != ACL_NOT_CACHED)
+		posix_acl_release(old_access);
+	if (old_default != ACL_NOT_CACHED)
+		posix_acl_release(old_default);
+}
+EXPORT_SYMBOL(forget_all_cached_acls);
+
+struct posix_acl *get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	if (!IS_POSIXACL(inode))
+		return NULL;
+
+	/*
+	 * A filesystem can force a ACL callback by just never filling the
+	 * ACL cache. But normally you'd fill the cache either at inode
+	 * instantiation time, or on the first ->get_acl call.
+	 *
+	 * If the filesystem doesn't have a get_acl() function at all, we'll
+	 * just create the negative cache entry.
+	 */
+	if (!inode->i_op->get_acl) {
+		set_cached_acl(inode, type, NULL);
+		return NULL;
+	}
+	return inode->i_op->get_acl(inode, type);
+}
+EXPORT_SYMBOL(get_acl);
+
+/*
+ * Init a fresh posix_acl
+ */
+void
+posix_acl_init(struct posix_acl *acl, int count)
+{
+	atomic_set(&acl->a_refcount, 1);
+	acl->a_count = count;
+}
+EXPORT_SYMBOL(posix_acl_init);
+
+/*
+ * Allocate a new ACL with the specified number of entries.
+ */
+struct posix_acl *
+posix_acl_alloc(int count, gfp_t flags)
+{
+	const size_t size = sizeof(struct posix_acl) +
+	                    count * sizeof(struct posix_acl_entry);
+	struct posix_acl *acl = kmalloc(size, flags);
+	if (acl)
+		posix_acl_init(acl, count);
+	return acl;
+}
+EXPORT_SYMBOL(posix_acl_alloc);
+
+/*
+ * Clone an ACL.
+ */
+static struct posix_acl *
+posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
+{
+	struct posix_acl *clone = NULL;
+
+	if (acl) {
+		int size = sizeof(struct posix_acl) + acl->a_count *
+		           sizeof(struct posix_acl_entry);
+		clone = kmemdup(acl, size, flags);
+		if (clone)
+			atomic_set(&clone->a_refcount, 1);
+	}
+	return clone;
+}
+
+/*
+ * Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
+ */
+int
+posix_acl_valid(const struct posix_acl *acl)
+{
+	const struct posix_acl_entry *pa, *pe;
+	int state = ACL_USER_OBJ;
+	int needs_mask = 0;
+
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+		if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
+			return -EINVAL;
+		switch (pa->e_tag) {
+			case ACL_USER_OBJ:
+				if (state == ACL_USER_OBJ) {
+					state = ACL_USER;
+					break;
+				}
+				return -EINVAL;
+
+			case ACL_USER:
+				if (state != ACL_USER)
+					return -EINVAL;
+				if (!uid_valid(pa->e_uid))
+					return -EINVAL;
+				needs_mask = 1;
+				break;
+
+			case ACL_GROUP_OBJ:
+				if (state == ACL_USER) {
+					state = ACL_GROUP;
+					break;
+				}
+				return -EINVAL;
+
+			case ACL_GROUP:
+				if (state != ACL_GROUP)
+					return -EINVAL;
+				if (!gid_valid(pa->e_gid))
+					return -EINVAL;
+				needs_mask = 1;
+				break;
+
+			case ACL_MASK:
+				if (state != ACL_GROUP)
+					return -EINVAL;
+				state = ACL_OTHER;
+				break;
+
+			case ACL_OTHER:
+				if (state == ACL_OTHER ||
+				    (state == ACL_GROUP && !needs_mask)) {
+					state = 0;
+					break;
+				}
+				return -EINVAL;
+
+			default:
+				return -EINVAL;
+		}
+	}
+	if (state == 0)
+		return 0;
+	return -EINVAL;
+}
+EXPORT_SYMBOL(posix_acl_valid);
+
+/*
+ * Returns 0 if the acl can be exactly represented in the traditional
+ * file mode permission bits, or else 1. Returns -E... on error.
+ */
+int
+posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
+{
+	const struct posix_acl_entry *pa, *pe;
+	umode_t mode = 0;
+	int not_equiv = 0;
+
+	/*
+	 * A null ACL can always be presented as mode bits.
+	 */
+	if (!acl)
+		return 0;
+
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+		switch (pa->e_tag) {
+			case ACL_USER_OBJ:
+				mode |= (pa->e_perm & S_IRWXO) << 6;
+				break;
+			case ACL_GROUP_OBJ:
+				mode |= (pa->e_perm & S_IRWXO) << 3;
+				break;
+			case ACL_OTHER:
+				mode |= pa->e_perm & S_IRWXO;
+				break;
+			case ACL_MASK:
+				mode = (mode & ~S_IRWXG) |
+				       ((pa->e_perm & S_IRWXO) << 3);
+				not_equiv = 1;
+				break;
+			case ACL_USER:
+			case ACL_GROUP:
+				not_equiv = 1;
+				break;
+			default:
+				return -EINVAL;
+		}
+	}
+        if (mode_p)
+                *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
+        return not_equiv;
+}
+EXPORT_SYMBOL(posix_acl_equiv_mode);
+
+/*
+ * Create an ACL representing the file mode permission bits of an inode.
+ */
+struct posix_acl *
+posix_acl_from_mode(umode_t mode, gfp_t flags)
+{
+	struct posix_acl *acl = posix_acl_alloc(3, flags);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	acl->a_entries[0].e_tag  = ACL_USER_OBJ;
+	acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6;
+
+	acl->a_entries[1].e_tag  = ACL_GROUP_OBJ;
+	acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3;
+
+	acl->a_entries[2].e_tag  = ACL_OTHER;
+	acl->a_entries[2].e_perm = (mode & S_IRWXO);
+	return acl;
+}
+EXPORT_SYMBOL(posix_acl_from_mode);
+
+/*
+ * Return 0 if current is granted want access to the inode
+ * by the acl. Returns -E... otherwise.
+ */
+int
+posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
+{
+	const struct posix_acl_entry *pa, *pe, *mask_obj;
+	int found = 0;
+
+	want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
+
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+                switch(pa->e_tag) {
+                        case ACL_USER_OBJ:
+				/* (May have been checked already) */
+				if (uid_eq(inode->i_uid, current_fsuid()))
+                                        goto check_perm;
+                                break;
+                        case ACL_USER:
+				if (uid_eq(pa->e_uid, current_fsuid()))
+                                        goto mask;
+				break;
+                        case ACL_GROUP_OBJ:
+                                if (in_group_p(inode->i_gid)) {
+					found = 1;
+					if ((pa->e_perm & want) == want)
+						goto mask;
+                                }
+				break;
+                        case ACL_GROUP:
+				if (in_group_p(pa->e_gid)) {
+					found = 1;
+					if ((pa->e_perm & want) == want)
+						goto mask;
+                                }
+                                break;
+                        case ACL_MASK:
+                                break;
+                        case ACL_OTHER:
+				if (found)
+					return -EACCES;
+				else
+					goto check_perm;
+			default:
+				return -EIO;
+                }
+        }
+	return -EIO;
+
+mask:
+	for (mask_obj = pa+1; mask_obj != pe; mask_obj++) {
+		if (mask_obj->e_tag == ACL_MASK) {
+			if ((pa->e_perm & mask_obj->e_perm & want) == want)
+				return 0;
+			return -EACCES;
+		}
+	}
+
+check_perm:
+	if ((pa->e_perm & want) == want)
+		return 0;
+	return -EACCES;
+}
+
+/*
+ * Modify acl when creating a new inode. The caller must ensure the acl is
+ * only referenced once.
+ *
+ * mode_p initially must contain the mode parameter to the open() / creat()
+ * system calls. All permissions that are not granted by the acl are removed.
+ * The permissions in the acl are changed to reflect the mode_p parameter.
+ */
+static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
+{
+	struct posix_acl_entry *pa, *pe;
+	struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
+	umode_t mode = *mode_p;
+	int not_equiv = 0;
+
+	/* assert(atomic_read(acl->a_refcount) == 1); */
+
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+                switch(pa->e_tag) {
+                        case ACL_USER_OBJ:
+				pa->e_perm &= (mode >> 6) | ~S_IRWXO;
+				mode &= (pa->e_perm << 6) | ~S_IRWXU;
+				break;
+
+			case ACL_USER:
+			case ACL_GROUP:
+				not_equiv = 1;
+				break;
+
+                        case ACL_GROUP_OBJ:
+				group_obj = pa;
+                                break;
+
+                        case ACL_OTHER:
+				pa->e_perm &= mode | ~S_IRWXO;
+				mode &= pa->e_perm | ~S_IRWXO;
+                                break;
+
+                        case ACL_MASK:
+				mask_obj = pa;
+				not_equiv = 1;
+                                break;
+
+			default:
+				return -EIO;
+                }
+        }
+
+	if (mask_obj) {
+		mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
+		mode &= (mask_obj->e_perm << 3) | ~S_IRWXG;
+	} else {
+		if (!group_obj)
+			return -EIO;
+		group_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
+		mode &= (group_obj->e_perm << 3) | ~S_IRWXG;
+	}
+
+	*mode_p = (*mode_p & ~S_IRWXUGO) | mode;
+        return not_equiv;
+}
+
+/*
+ * Modify the ACL for the chmod syscall.
+ */
+static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
+{
+	struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
+	struct posix_acl_entry *pa, *pe;
+
+	/* assert(atomic_read(acl->a_refcount) == 1); */
+
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+		switch(pa->e_tag) {
+			case ACL_USER_OBJ:
+				pa->e_perm = (mode & S_IRWXU) >> 6;
+				break;
+
+			case ACL_USER:
+			case ACL_GROUP:
+				break;
+
+			case ACL_GROUP_OBJ:
+				group_obj = pa;
+				break;
+
+			case ACL_MASK:
+				mask_obj = pa;
+				break;
+
+			case ACL_OTHER:
+				pa->e_perm = (mode & S_IRWXO);
+				break;
+
+			default:
+				return -EIO;
+		}
+	}
+
+	if (mask_obj) {
+		mask_obj->e_perm = (mode & S_IRWXG) >> 3;
+	} else {
+		if (!group_obj)
+			return -EIO;
+		group_obj->e_perm = (mode & S_IRWXG) >> 3;
+	}
+
+	return 0;
+}
+
+int
+__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
+{
+	struct posix_acl *clone = posix_acl_clone(*acl, gfp);
+	int err = -ENOMEM;
+	if (clone) {
+		err = posix_acl_create_masq(clone, mode_p);
+		if (err < 0) {
+			posix_acl_release(clone);
+			clone = NULL;
+		}
+	}
+	posix_acl_release(*acl);
+	*acl = clone;
+	return err;
+}
+EXPORT_SYMBOL(__posix_acl_create);
+
+int
+__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
+{
+	struct posix_acl *clone = posix_acl_clone(*acl, gfp);
+	int err = -ENOMEM;
+	if (clone) {
+		err = __posix_acl_chmod_masq(clone, mode);
+		if (err) {
+			posix_acl_release(clone);
+			clone = NULL;
+		}
+	}
+	posix_acl_release(*acl);
+	*acl = clone;
+	return err;
+}
+EXPORT_SYMBOL(__posix_acl_chmod);
+
+int
+posix_acl_chmod(struct inode *inode, umode_t mode)
+{
+	struct posix_acl *acl;
+	int ret = 0;
+
+	if (!IS_POSIXACL(inode))
+		return 0;
+	if (!inode->i_op->set_acl)
+		return -EOPNOTSUPP;
+
+	acl = get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR_OR_NULL(acl)) {
+		if (acl == ERR_PTR(-EOPNOTSUPP))
+			return 0;
+		return PTR_ERR(acl);
+	}
+
+	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+	if (ret)
+		return ret;
+	ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
+	posix_acl_release(acl);
+	return ret;
+}
+EXPORT_SYMBOL(posix_acl_chmod);
+
+int
+posix_acl_create(struct inode *dir, umode_t *mode,
+		struct posix_acl **default_acl, struct posix_acl **acl)
+{
+	struct posix_acl *p;
+	int ret;
+
+	if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
+		goto no_acl;
+
+	p = get_acl(dir, ACL_TYPE_DEFAULT);
+	if (IS_ERR(p)) {
+		if (p == ERR_PTR(-EOPNOTSUPP))
+			goto apply_umask;
+		return PTR_ERR(p);
+	}
+
+	if (!p)
+		goto apply_umask;
+
+	*acl = posix_acl_clone(p, GFP_NOFS);
+	if (!*acl)
+		goto no_mem;
+
+	ret = posix_acl_create_masq(*acl, mode);
+	if (ret < 0)
+		goto no_mem_clone;
+
+	if (ret == 0) {
+		posix_acl_release(*acl);
+		*acl = NULL;
+	}
+
+	if (!S_ISDIR(*mode)) {
+		posix_acl_release(p);
+		*default_acl = NULL;
+	} else {
+		*default_acl = p;
+	}
+	return 0;
+
+apply_umask:
+	*mode &= ~current_umask();
+no_acl:
+	*default_acl = NULL;
+	*acl = NULL;
+	return 0;
+
+no_mem_clone:
+	posix_acl_release(*acl);
+no_mem:
+	posix_acl_release(p);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(posix_acl_create);
+
+/*
+ * Fix up the uids and gids in posix acl extended attributes in place.
+ */
+static void posix_acl_fix_xattr_userns(
+	struct user_namespace *to, struct user_namespace *from,
+	void *value, size_t size)
+{
+	posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
+	posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
+	int count;
+	kuid_t uid;
+	kgid_t gid;
+
+	if (!value)
+		return;
+	if (size < sizeof(posix_acl_xattr_header))
+		return;
+	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+		return;
+
+	count = posix_acl_xattr_count(size);
+	if (count < 0)
+		return;
+	if (count == 0)
+		return;
+
+	for (end = entry + count; entry != end; entry++) {
+		switch(le16_to_cpu(entry->e_tag)) {
+		case ACL_USER:
+			uid = make_kuid(from, le32_to_cpu(entry->e_id));
+			entry->e_id = cpu_to_le32(from_kuid(to, uid));
+			break;
+		case ACL_GROUP:
+			gid = make_kgid(from, le32_to_cpu(entry->e_id));
+			entry->e_id = cpu_to_le32(from_kgid(to, gid));
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+void posix_acl_fix_xattr_from_user(void *value, size_t size)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	if (user_ns == &init_user_ns)
+		return;
+	posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
+}
+
+void posix_acl_fix_xattr_to_user(void *value, size_t size)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	if (user_ns == &init_user_ns)
+		return;
+	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
+}
+
+/*
+ * Convert from extended attribute to in-memory representation.
+ */
+struct posix_acl *
+posix_acl_from_xattr(struct user_namespace *user_ns,
+		     const void *value, size_t size)
+{
+	posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
+	posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
+	int count;
+	struct posix_acl *acl;
+	struct posix_acl_entry *acl_e;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(posix_acl_xattr_header))
+		 return ERR_PTR(-EINVAL);
+	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	count = posix_acl_xattr_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+	
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	acl_e = acl->a_entries;
+	
+	for (end = entry + count; entry != end; acl_e++, entry++) {
+		acl_e->e_tag  = le16_to_cpu(entry->e_tag);
+		acl_e->e_perm = le16_to_cpu(entry->e_perm);
+
+		switch(acl_e->e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				break;
+
+			case ACL_USER:
+				acl_e->e_uid =
+					make_kuid(user_ns,
+						  le32_to_cpu(entry->e_id));
+				if (!uid_valid(acl_e->e_uid))
+					goto fail;
+				break;
+			case ACL_GROUP:
+				acl_e->e_gid =
+					make_kgid(user_ns,
+						  le32_to_cpu(entry->e_id));
+				if (!gid_valid(acl_e->e_gid))
+					goto fail;
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL (posix_acl_from_xattr);
+
+/*
+ * Convert from in-memory to extended attribute representation.
+ */
+int
+posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
+		   void *buffer, size_t size)
+{
+	posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
+	posix_acl_xattr_entry *ext_entry;
+	int real_size, n;
+
+	real_size = posix_acl_xattr_size(acl->a_count);
+	if (!buffer)
+		return real_size;
+	if (real_size > size)
+		return -ERANGE;
+
+	ext_entry = ext_acl->a_entries;
+	ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+
+	for (n=0; n < acl->a_count; n++, ext_entry++) {
+		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+		ext_entry->e_tag  = cpu_to_le16(acl_e->e_tag);
+		ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch(acl_e->e_tag) {
+		case ACL_USER:
+			ext_entry->e_id =
+				cpu_to_le32(from_kuid(user_ns, acl_e->e_uid));
+			break;
+		case ACL_GROUP:
+			ext_entry->e_id =
+				cpu_to_le32(from_kgid(user_ns, acl_e->e_gid));
+			break;
+		default:
+			ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+			break;
+		}
+	}
+	return real_size;
+}
+EXPORT_SYMBOL (posix_acl_to_xattr);
+
+static int
+posix_acl_xattr_get(struct dentry *dentry, const char *name,
+		void *value, size_t size, int type)
+{
+	struct posix_acl *acl;
+	int error;
+
+	if (!IS_POSIXACL(d_backing_inode(dentry)))
+		return -EOPNOTSUPP;
+	if (d_is_symlink(dentry))
+		return -EOPNOTSUPP;
+
+	acl = get_acl(d_backing_inode(dentry), type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+
+	error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int
+posix_acl_xattr_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	struct inode *inode = d_backing_inode(dentry);
+	struct posix_acl *acl = NULL;
+	int ret;
+
+	if (!IS_POSIXACL(inode))
+		return -EOPNOTSUPP;
+	if (!inode->i_op->set_acl)
+		return -EOPNOTSUPP;
+
+	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+		return value ? -EACCES : 0;
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+
+		if (acl) {
+			ret = posix_acl_valid(acl);
+			if (ret)
+				goto out;
+		}
+	}
+
+	ret = inode->i_op->set_acl(inode, acl, type);
+out:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static size_t
+posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t name_len, int type)
+{
+	const char *xname;
+	size_t size;
+
+	if (!IS_POSIXACL(d_backing_inode(dentry)))
+		return -EOPNOTSUPP;
+	if (d_is_symlink(dentry))
+		return -EOPNOTSUPP;
+
+	if (type == ACL_TYPE_ACCESS)
+		xname = POSIX_ACL_XATTR_ACCESS;
+	else
+		xname = POSIX_ACL_XATTR_DEFAULT;
+
+	size = strlen(xname) + 1;
+	if (list && size <= list_size)
+		memcpy(list, xname, size);
+	return size;
+}
+
+const struct xattr_handler posix_acl_access_xattr_handler = {
+	.prefix = POSIX_ACL_XATTR_ACCESS,
+	.flags = ACL_TYPE_ACCESS,
+	.list = posix_acl_xattr_list,
+	.get = posix_acl_xattr_get,
+	.set = posix_acl_xattr_set,
+};
+EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
+
+const struct xattr_handler posix_acl_default_xattr_handler = {
+	.prefix = POSIX_ACL_XATTR_DEFAULT,
+	.flags = ACL_TYPE_DEFAULT,
+	.list = posix_acl_xattr_list,
+	.get = posix_acl_xattr_get,
+	.set = posix_acl_xattr_set,
+};
+EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
+
+int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int error;
+
+	if (type == ACL_TYPE_ACCESS) {
+		error = posix_acl_equiv_mode(acl, &inode->i_mode);
+		if (error < 0)
+			return 0;
+		if (error == 0)
+			acl = NULL;
+	}
+
+	inode->i_ctime = CURRENT_TIME;
+	set_cached_acl(inode, type, acl);
+	return 0;
+}
+
+int simple_acl_create(struct inode *dir, struct inode *inode)
+{
+	struct posix_acl *default_acl, *acl;
+	int error;
+
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (error)
+		return error;
+
+	set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
+	set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
+	return 0;
+}
diff --git a/kernel/fs/proc/Kconfig b/kernel/fs/proc/Kconfig
new file mode 100644
index 000000000..2183fcf41
--- /dev/null
+++ b/kernel/fs/proc/Kconfig
@@ -0,0 +1,73 @@
+config PROC_FS
+	bool "/proc file system support" if EXPERT
+	default y
+	help
+	  This is a virtual file system providing information about the status
+	  of the system. "Virtual" means that it doesn't take up any space on
+	  your hard disk: the files are created on the fly by the kernel when
+	  you try to access them. Also, you cannot read the files with older
+	  version of the program less: you need to use more or cat.
+
+	  It's totally cool; for example, "cat /proc/interrupts" gives
+	  information about what the different IRQs are used for at the moment
+	  (there is a small number of Interrupt ReQuest lines in your computer
+	  that are used by the attached devices to gain the CPU's attention --
+	  often a source of trouble if two devices are mistakenly configured
+	  to use the same IRQ). The program procinfo to display some
+	  information about your system gathered from the /proc file system.
+
+	  Before you can use the /proc file system, it has to be mounted,
+	  meaning it has to be given a location in the directory hierarchy.
+	  That location should be /proc. A command such as "mount -t proc proc
+	  /proc" or the equivalent line in /etc/fstab does the job.
+
+	  The /proc file system is explained in the file
+	  <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
+	  ("man 5 proc").
+
+	  This option will enlarge your kernel by about 67 KB. Several
+	  programs depend on this, so everyone should say Y here.
+
+config PROC_KCORE
+	bool "/proc/kcore support" if !ARM
+	depends on PROC_FS && MMU
+	help
+	  Provides a virtual ELF core file of the live kernel.  This can
+	  be read with gdb and other ELF tools.  No modifications can be
+	  made using this mechanism.
+
+config PROC_VMCORE
+	bool "/proc/vmcore support"
+	depends on PROC_FS && CRASH_DUMP
+	default y
+        help
+        Exports the dump image of crashed kernel in ELF format.
+
+config PROC_SYSCTL
+	bool "Sysctl support (/proc/sys)" if EXPERT
+	depends on PROC_FS
+	select SYSCTL
+	default y
+	---help---
+	  The sysctl interface provides a means of dynamically changing
+	  certain kernel parameters and variables on the fly without requiring
+	  a recompile of the kernel or reboot of the system.  The primary
+	  interface is through /proc/sys.  If you say Y here a tree of
+	  modifiable sysctl entries will be generated beneath the
+          /proc/sys directory. They are explained in the files
+	  in <file:Documentation/sysctl/>.  Note that enabling this
+	  option will enlarge the kernel by at least 8 KB.
+
+	  As it is generally a good thing, you should say Y here unless
+	  building a kernel for install/rescue disks or your system is very
+	  limited in memory.
+
+config PROC_PAGE_MONITOR
+ 	default y
+	depends on PROC_FS && MMU
+	bool "Enable /proc page monitoring" if EXPERT
+ 	help
+	  Various /proc files exist to monitor process memory utilization:
+	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
+	  /proc/kpagecount, and /proc/kpageflags. Disabling these
+          interfaces will reduce the size of the kernel by approximately 4kb.
diff --git a/kernel/fs/proc/Makefile b/kernel/fs/proc/Makefile
new file mode 100644
index 000000000..7151ea428
--- /dev/null
+++ b/kernel/fs/proc/Makefile
@@ -0,0 +1,32 @@
+#
+# Makefile for the Linux proc filesystem routines.
+#
+
+obj-y   += proc.o
+
+proc-y			:= nommu.o task_nommu.o
+proc-$(CONFIG_MMU)	:= task_mmu.o
+
+proc-y       += inode.o root.o base.o generic.o array.o \
+		fd.o
+proc-$(CONFIG_TTY)      += proc_tty.o
+proc-y	+= cmdline.o
+proc-y	+= consoles.o
+proc-y	+= cpuinfo.o
+proc-y	+= devices.o
+proc-y	+= interrupts.o
+proc-y	+= loadavg.o
+proc-y	+= meminfo.o
+proc-y	+= stat.o
+proc-y	+= uptime.o
+proc-y	+= version.o
+proc-y	+= softirqs.o
+proc-y	+= namespaces.o
+proc-y	+= self.o
+proc-y	+= thread_self.o
+proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
+proc-$(CONFIG_NET)		+= proc_net.o
+proc-$(CONFIG_PROC_KCORE)	+= kcore.o
+proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
+proc-$(CONFIG_PRINTK)	+= kmsg.o
+proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
diff --git a/kernel/fs/proc/array.c b/kernel/fs/proc/array.c
new file mode 100644
index 000000000..fd02a9ebf
--- /dev/null
+++ b/kernel/fs/proc/array.c
@@ -0,0 +1,695 @@
+/*
+ *  linux/fs/proc/array.c
+ *
+ *  Copyright (C) 1992  by Linus Torvalds
+ *  based on ideas by Darren Senn
+ *
+ * Fixes:
+ * Michael. K. Johnson: stat,statm extensions.
+ *                      <johnsonm@stolaf.edu>
+ *
+ * Pauline Middelink :  Made cmdline,envline only break at '\0's, to
+ *                      make sure SET_PROCTITLE works. Also removed
+ *                      bad '!' which forced address recalculation for
+ *                      EVERY character on the current page.
+ *                      <middelin@polyware.iaf.nl>
+ *
+ * Danny ter Haar    :	added cpuinfo
+ *			<dth@cistron.nl>
+ *
+ * Alessandro Rubini :  profile extension.
+ *                      <rubini@ipvvis.unipv.it>
+ *
+ * Jeff Tranter      :  added BogoMips field to cpuinfo
+ *                      <Jeff_Tranter@Mitel.COM>
+ *
+ * Bruno Haible      :  remove 4K limit for the maps file
+ *			<haible@ma2s2.mathematik.uni-karlsruhe.de>
+ *
+ * Yves Arrouye      :  remove removal of trailing spaces in get_array.
+ *			<Yves.Arrouye@marin.fdn.fr>
+ *
+ * Jerome Forissier  :  added per-CPU time information to /proc/stat
+ *                      and /proc/<pid>/cpu extension
+ *                      <forissier@isia.cma.fr>
+ *			- Incorporation and non-SMP safe operation
+ *			of forissier patch in 2.1.78 by
+ *			Hans Marcus <crowbar@concepts.nl>
+ *
+ * aeb@cwi.nl        :  /proc/partitions
+ *
+ *
+ * Alan Cox	     :  security fixes.
+ *			<alan@lxorguk.ukuu.org.uk>
+ *
+ * Al Viro           :  safe handling of mm_struct
+ *
+ * Gerhard Wichert   :  added BIGMEM support
+ * Siemens AG           <Gerhard.Wichert@pdb.siemens.de>
+ *
+ * Al Viro & Jeff Garzik :  moved most of the thing into base.c and
+ *			 :  proc_misc.c. The rest may eventually go into
+ *			 :  base.c too.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/tty.h>
+#include <linux/string.h>
+#include <linux/mman.h>
+#include <linux/proc_fs.h>
+#include <linux/ioport.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/signal.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/times.h>
+#include <linux/cpuset.h>
+#include <linux/rcupdate.h>
+#include <linux/delayacct.h>
+#include <linux/seq_file.h>
+#include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/string_helpers.h>
+#include <linux/user_namespace.h>
+
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include "internal.h"
+
+static inline void task_name(struct seq_file *m, struct task_struct *p)
+{
+	char *buf;
+	char tcomm[sizeof(p->comm)];
+
+	get_task_comm(tcomm, p);
+
+	seq_puts(m, "Name:\t");
+	buf = m->buf + m->count;
+
+	/* Ignore error for now */
+	buf += string_escape_str(tcomm, buf, m->size - m->count,
+				 ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+
+	m->count = buf - m->buf;
+	seq_putc(m, '\n');
+}
+
+/*
+ * The task state array is a strange "bitmap" of
+ * reasons to sleep. Thus "running" is zero, and
+ * you can test for combinations of others with
+ * simple bit tests.
+ */
+static const char * const task_state_array[] = {
+	"R (running)",		/*   0 */
+	"S (sleeping)",		/*   1 */
+	"D (disk sleep)",	/*   2 */
+	"T (stopped)",		/*   4 */
+	"t (tracing stop)",	/*   8 */
+	"X (dead)",		/*  16 */
+	"Z (zombie)",		/*  32 */
+};
+
+static inline const char *get_task_state(struct task_struct *tsk)
+{
+	unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
+
+	BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
+
+	return task_state_array[fls(state)];
+}
+
+static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
+				struct pid *pid, struct task_struct *p)
+{
+	struct user_namespace *user_ns = seq_user_ns(m);
+	struct group_info *group_info;
+	int g;
+	struct task_struct *tracer;
+	const struct cred *cred;
+	pid_t ppid, tpid = 0, tgid, ngid;
+	unsigned int max_fds = 0;
+
+	rcu_read_lock();
+	ppid = pid_alive(p) ?
+		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+
+	tracer = ptrace_parent(p);
+	if (tracer)
+		tpid = task_pid_nr_ns(tracer, ns);
+
+	tgid = task_tgid_nr_ns(p, ns);
+	ngid = task_numa_group_id(p);
+	cred = get_task_cred(p);
+
+	task_lock(p);
+	if (p->files)
+		max_fds = files_fdtable(p->files)->max_fds;
+	task_unlock(p);
+	rcu_read_unlock();
+
+	seq_printf(m,
+		"State:\t%s\n"
+		"Tgid:\t%d\n"
+		"Ngid:\t%d\n"
+		"Pid:\t%d\n"
+		"PPid:\t%d\n"
+		"TracerPid:\t%d\n"
+		"Uid:\t%d\t%d\t%d\t%d\n"
+		"Gid:\t%d\t%d\t%d\t%d\n"
+		"FDSize:\t%d\nGroups:\t",
+		get_task_state(p),
+		tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
+		from_kuid_munged(user_ns, cred->uid),
+		from_kuid_munged(user_ns, cred->euid),
+		from_kuid_munged(user_ns, cred->suid),
+		from_kuid_munged(user_ns, cred->fsuid),
+		from_kgid_munged(user_ns, cred->gid),
+		from_kgid_munged(user_ns, cred->egid),
+		from_kgid_munged(user_ns, cred->sgid),
+		from_kgid_munged(user_ns, cred->fsgid),
+		max_fds);
+
+	group_info = cred->group_info;
+	for (g = 0; g < group_info->ngroups; g++)
+		seq_printf(m, "%d ",
+			   from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
+	put_cred(cred);
+
+#ifdef CONFIG_PID_NS
+	seq_puts(m, "\nNStgid:");
+	for (g = ns->level; g <= pid->level; g++)
+		seq_printf(m, "\t%d",
+			task_tgid_nr_ns(p, pid->numbers[g].ns));
+	seq_puts(m, "\nNSpid:");
+	for (g = ns->level; g <= pid->level; g++)
+		seq_printf(m, "\t%d",
+			task_pid_nr_ns(p, pid->numbers[g].ns));
+	seq_puts(m, "\nNSpgid:");
+	for (g = ns->level; g <= pid->level; g++)
+		seq_printf(m, "\t%d",
+			task_pgrp_nr_ns(p, pid->numbers[g].ns));
+	seq_puts(m, "\nNSsid:");
+	for (g = ns->level; g <= pid->level; g++)
+		seq_printf(m, "\t%d",
+			task_session_nr_ns(p, pid->numbers[g].ns));
+#endif
+	seq_putc(m, '\n');
+}
+
+void render_sigset_t(struct seq_file *m, const char *header,
+				sigset_t *set)
+{
+	int i;
+
+	seq_puts(m, header);
+
+	i = _NSIG;
+	do {
+		int x = 0;
+
+		i -= 4;
+		if (sigismember(set, i+1)) x |= 1;
+		if (sigismember(set, i+2)) x |= 2;
+		if (sigismember(set, i+3)) x |= 4;
+		if (sigismember(set, i+4)) x |= 8;
+		seq_printf(m, "%x", x);
+	} while (i >= 4);
+
+	seq_putc(m, '\n');
+}
+
+static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
+				    sigset_t *catch)
+{
+	struct k_sigaction *k;
+	int i;
+
+	k = p->sighand->action;
+	for (i = 1; i <= _NSIG; ++i, ++k) {
+		if (k->sa.sa_handler == SIG_IGN)
+			sigaddset(ign, i);
+		else if (k->sa.sa_handler != SIG_DFL)
+			sigaddset(catch, i);
+	}
+}
+
+static inline void task_sig(struct seq_file *m, struct task_struct *p)
+{
+	unsigned long flags;
+	sigset_t pending, shpending, blocked, ignored, caught;
+	int num_threads = 0;
+	unsigned long qsize = 0;
+	unsigned long qlim = 0;
+
+	sigemptyset(&pending);
+	sigemptyset(&shpending);
+	sigemptyset(&blocked);
+	sigemptyset(&ignored);
+	sigemptyset(&caught);
+
+	if (lock_task_sighand(p, &flags)) {
+		pending = p->pending.signal;
+		shpending = p->signal->shared_pending.signal;
+		blocked = p->blocked;
+		collect_sigign_sigcatch(p, &ignored, &caught);
+		num_threads = get_nr_threads(p);
+		rcu_read_lock();  /* FIXME: is this correct? */
+		qsize = atomic_read(&__task_cred(p)->user->sigpending);
+		rcu_read_unlock();
+		qlim = task_rlimit(p, RLIMIT_SIGPENDING);
+		unlock_task_sighand(p, &flags);
+	}
+
+	seq_printf(m, "Threads:\t%d\n", num_threads);
+	seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
+
+	/* render them all */
+	render_sigset_t(m, "SigPnd:\t", &pending);
+	render_sigset_t(m, "ShdPnd:\t", &shpending);
+	render_sigset_t(m, "SigBlk:\t", &blocked);
+	render_sigset_t(m, "SigIgn:\t", &ignored);
+	render_sigset_t(m, "SigCgt:\t", &caught);
+}
+
+static void render_cap_t(struct seq_file *m, const char *header,
+			kernel_cap_t *a)
+{
+	unsigned __capi;
+
+	seq_puts(m, header);
+	CAP_FOR_EACH_U32(__capi) {
+		seq_printf(m, "%08x",
+			   a->cap[CAP_LAST_U32 - __capi]);
+	}
+	seq_putc(m, '\n');
+}
+
+static inline void task_cap(struct seq_file *m, struct task_struct *p)
+{
+	const struct cred *cred;
+	kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
+
+	rcu_read_lock();
+	cred = __task_cred(p);
+	cap_inheritable	= cred->cap_inheritable;
+	cap_permitted	= cred->cap_permitted;
+	cap_effective	= cred->cap_effective;
+	cap_bset	= cred->cap_bset;
+	rcu_read_unlock();
+
+	render_cap_t(m, "CapInh:\t", &cap_inheritable);
+	render_cap_t(m, "CapPrm:\t", &cap_permitted);
+	render_cap_t(m, "CapEff:\t", &cap_effective);
+	render_cap_t(m, "CapBnd:\t", &cap_bset);
+}
+
+static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+	seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
+#endif
+}
+
+static inline void task_context_switch_counts(struct seq_file *m,
+						struct task_struct *p)
+{
+	seq_printf(m,	"voluntary_ctxt_switches:\t%lu\n"
+			"nonvoluntary_ctxt_switches:\t%lu\n",
+			p->nvcsw,
+			p->nivcsw);
+}
+
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+	seq_printf(m, "Cpus_allowed:\t%*pb\n",
+		   cpumask_pr_args(&task->cpus_allowed));
+	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
+		   cpumask_pr_args(&task->cpus_allowed));
+}
+
+int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task)
+{
+	struct mm_struct *mm = get_task_mm(task);
+
+	task_name(m, task);
+	task_state(m, ns, pid, task);
+
+	if (mm) {
+		task_mem(m, mm);
+		mmput(mm);
+	}
+	task_sig(m, task);
+	task_cap(m, task);
+	task_seccomp(m, task);
+	task_cpus_allowed(m, task);
+	cpuset_task_status_allowed(m, task);
+	task_context_switch_counts(m, task);
+	return 0;
+}
+
+static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task, int whole)
+{
+	unsigned long vsize, eip, esp, wchan = ~0UL;
+	int priority, nice;
+	int tty_pgrp = -1, tty_nr = 0;
+	sigset_t sigign, sigcatch;
+	char state;
+	pid_t ppid = 0, pgid = -1, sid = -1;
+	int num_threads = 0;
+	int permitted;
+	struct mm_struct *mm;
+	unsigned long long start_time;
+	unsigned long cmin_flt = 0, cmaj_flt = 0;
+	unsigned long  min_flt = 0,  maj_flt = 0;
+	cputime_t cutime, cstime, utime, stime;
+	cputime_t cgtime, gtime;
+	unsigned long rsslim = 0;
+	char tcomm[sizeof(task->comm)];
+	unsigned long flags;
+
+	state = *get_task_state(task);
+	vsize = eip = esp = 0;
+	permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
+	mm = get_task_mm(task);
+	if (mm) {
+		vsize = task_vsize(mm);
+		if (permitted) {
+			eip = KSTK_EIP(task);
+			esp = KSTK_ESP(task);
+		}
+	}
+
+	get_task_comm(tcomm, task);
+
+	sigemptyset(&sigign);
+	sigemptyset(&sigcatch);
+	cutime = cstime = utime = stime = 0;
+	cgtime = gtime = 0;
+
+	if (lock_task_sighand(task, &flags)) {
+		struct signal_struct *sig = task->signal;
+
+		if (sig->tty) {
+			struct pid *pgrp = tty_get_pgrp(sig->tty);
+			tty_pgrp = pid_nr_ns(pgrp, ns);
+			put_pid(pgrp);
+			tty_nr = new_encode_dev(tty_devnum(sig->tty));
+		}
+
+		num_threads = get_nr_threads(task);
+		collect_sigign_sigcatch(task, &sigign, &sigcatch);
+
+		cmin_flt = sig->cmin_flt;
+		cmaj_flt = sig->cmaj_flt;
+		cutime = sig->cutime;
+		cstime = sig->cstime;
+		cgtime = sig->cgtime;
+		rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
+
+		/* add up live thread stats at the group level */
+		if (whole) {
+			struct task_struct *t = task;
+			do {
+				min_flt += t->min_flt;
+				maj_flt += t->maj_flt;
+				gtime += task_gtime(t);
+			} while_each_thread(task, t);
+
+			min_flt += sig->min_flt;
+			maj_flt += sig->maj_flt;
+			thread_group_cputime_adjusted(task, &utime, &stime);
+			gtime += sig->gtime;
+		}
+
+		sid = task_session_nr_ns(task, ns);
+		ppid = task_tgid_nr_ns(task->real_parent, ns);
+		pgid = task_pgrp_nr_ns(task, ns);
+
+		unlock_task_sighand(task, &flags);
+	}
+
+	if (permitted && (!whole || num_threads < 2))
+		wchan = get_wchan(task);
+	if (!whole) {
+		min_flt = task->min_flt;
+		maj_flt = task->maj_flt;
+		task_cputime_adjusted(task, &utime, &stime);
+		gtime = task_gtime(task);
+	}
+
+	/* scale priority and nice values from timeslices to -20..20 */
+	/* to make it look like a "normal" Unix priority/nice value  */
+	priority = task_prio(task);
+	nice = task_nice(task);
+
+	/* convert nsec -> ticks */
+	start_time = nsec_to_clock_t(task->real_start_time);
+
+	seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
+	seq_put_decimal_ll(m, ' ', ppid);
+	seq_put_decimal_ll(m, ' ', pgid);
+	seq_put_decimal_ll(m, ' ', sid);
+	seq_put_decimal_ll(m, ' ', tty_nr);
+	seq_put_decimal_ll(m, ' ', tty_pgrp);
+	seq_put_decimal_ull(m, ' ', task->flags);
+	seq_put_decimal_ull(m, ' ', min_flt);
+	seq_put_decimal_ull(m, ' ', cmin_flt);
+	seq_put_decimal_ull(m, ' ', maj_flt);
+	seq_put_decimal_ull(m, ' ', cmaj_flt);
+	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
+	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
+	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
+	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
+	seq_put_decimal_ll(m, ' ', priority);
+	seq_put_decimal_ll(m, ' ', nice);
+	seq_put_decimal_ll(m, ' ', num_threads);
+	seq_put_decimal_ull(m, ' ', 0);
+	seq_put_decimal_ull(m, ' ', start_time);
+	seq_put_decimal_ull(m, ' ', vsize);
+	seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
+	seq_put_decimal_ull(m, ' ', rsslim);
+	seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
+	seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
+	seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
+	seq_put_decimal_ull(m, ' ', esp);
+	seq_put_decimal_ull(m, ' ', eip);
+	/* The signal information here is obsolete.
+	 * It must be decimal for Linux 2.0 compatibility.
+	 * Use /proc/#/status for real-time signals.
+	 */
+	seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
+	seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
+	seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
+	seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
+	seq_put_decimal_ull(m, ' ', wchan);
+	seq_put_decimal_ull(m, ' ', 0);
+	seq_put_decimal_ull(m, ' ', 0);
+	seq_put_decimal_ll(m, ' ', task->exit_signal);
+	seq_put_decimal_ll(m, ' ', task_cpu(task));
+	seq_put_decimal_ull(m, ' ', task->rt_priority);
+	seq_put_decimal_ull(m, ' ', task->policy);
+	seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
+	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
+	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
+
+	if (mm && permitted) {
+		seq_put_decimal_ull(m, ' ', mm->start_data);
+		seq_put_decimal_ull(m, ' ', mm->end_data);
+		seq_put_decimal_ull(m, ' ', mm->start_brk);
+		seq_put_decimal_ull(m, ' ', mm->arg_start);
+		seq_put_decimal_ull(m, ' ', mm->arg_end);
+		seq_put_decimal_ull(m, ' ', mm->env_start);
+		seq_put_decimal_ull(m, ' ', mm->env_end);
+	} else
+		seq_printf(m, " 0 0 0 0 0 0 0");
+
+	if (permitted)
+		seq_put_decimal_ll(m, ' ', task->exit_code);
+	else
+		seq_put_decimal_ll(m, ' ', 0);
+
+	seq_putc(m, '\n');
+	if (mm)
+		mmput(mm);
+	return 0;
+}
+
+int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task)
+{
+	return do_task_stat(m, ns, pid, task, 0);
+}
+
+int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task)
+{
+	return do_task_stat(m, ns, pid, task, 1);
+}
+
+int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task)
+{
+	unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
+	struct mm_struct *mm = get_task_mm(task);
+
+	if (mm) {
+		size = task_statm(mm, &shared, &text, &data, &resident);
+		mmput(mm);
+	}
+	/*
+	 * For quick read, open code by putting numbers directly
+	 * expected format is
+	 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+	 *               size, resident, shared, text, data);
+	 */
+	seq_put_decimal_ull(m, 0, size);
+	seq_put_decimal_ull(m, ' ', resident);
+	seq_put_decimal_ull(m, ' ', shared);
+	seq_put_decimal_ull(m, ' ', text);
+	seq_put_decimal_ull(m, ' ', 0);
+	seq_put_decimal_ull(m, ' ', data);
+	seq_put_decimal_ull(m, ' ', 0);
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static struct pid *
+get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
+{
+	struct task_struct *start, *task;
+	struct pid *pid = NULL;
+
+	read_lock(&tasklist_lock);
+
+	start = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (!start)
+		goto out;
+
+	/*
+	 * Lets try to continue searching first, this gives
+	 * us significant speedup on children-rich processes.
+	 */
+	if (pid_prev) {
+		task = pid_task(pid_prev, PIDTYPE_PID);
+		if (task && task->real_parent == start &&
+		    !(list_empty(&task->sibling))) {
+			if (list_is_last(&task->sibling, &start->children))
+				goto out;
+			task = list_first_entry(&task->sibling,
+						struct task_struct, sibling);
+			pid = get_pid(task_pid(task));
+			goto out;
+		}
+	}
+
+	/*
+	 * Slow search case.
+	 *
+	 * We might miss some children here if children
+	 * are exited while we were not holding the lock,
+	 * but it was never promised to be accurate that
+	 * much.
+	 *
+	 * "Just suppose that the parent sleeps, but N children
+	 *  exit after we printed their tids. Now the slow paths
+	 *  skips N extra children, we miss N tasks." (c)
+	 *
+	 * So one need to stop or freeze the leader and all
+	 * its children to get a precise result.
+	 */
+	list_for_each_entry(task, &start->children, sibling) {
+		if (pos-- == 0) {
+			pid = get_pid(task_pid(task));
+			break;
+		}
+	}
+
+out:
+	read_unlock(&tasklist_lock);
+	return pid;
+}
+
+static int children_seq_show(struct seq_file *seq, void *v)
+{
+	struct inode *inode = seq->private;
+	pid_t pid;
+
+	pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
+	seq_printf(seq, "%d ", pid);
+
+	return 0;
+}
+
+static void *children_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	return get_children_pid(seq->private, NULL, *pos);
+}
+
+static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct pid *pid;
+
+	pid = get_children_pid(seq->private, v, *pos + 1);
+	put_pid(v);
+
+	++*pos;
+	return pid;
+}
+
+static void children_seq_stop(struct seq_file *seq, void *v)
+{
+	put_pid(v);
+}
+
+static const struct seq_operations children_seq_ops = {
+	.start	= children_seq_start,
+	.next	= children_seq_next,
+	.stop	= children_seq_stop,
+	.show	= children_seq_show,
+};
+
+static int children_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open(file, &children_seq_ops);
+	if (ret)
+		return ret;
+
+	m = file->private_data;
+	m->private = inode;
+
+	return ret;
+}
+
+int children_seq_release(struct inode *inode, struct file *file)
+{
+	seq_release(inode, file);
+	return 0;
+}
+
+const struct file_operations proc_tid_children_operations = {
+	.open    = children_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = children_seq_release,
+};
+#endif /* CONFIG_CHECKPOINT_RESTORE */
diff --git a/kernel/fs/proc/base.c b/kernel/fs/proc/base.c
new file mode 100644
index 000000000..093ca14f5
--- /dev/null
+++ b/kernel/fs/proc/base.c
@@ -0,0 +1,3214 @@
+/*
+ *  linux/fs/proc/base.c
+ *
+ *  Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ *  proc base directory handling functions
+ *
+ *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
+ *  Instead of using magical inumbers to determine the kind of object
+ *  we allocate and fill in-core inodes upon lookup. They don't even
+ *  go into icache. We cache the reference to task_struct upon lookup too.
+ *  Eventually it should become a filesystem in its own. We don't use the
+ *  rest of procfs anymore.
+ *
+ *
+ *  Changelog:
+ *  17-Jan-2005
+ *  Allan Bezerra
+ *  Bruna Moreira <bruna.moreira@indt.org.br>
+ *  Edjard Mota <edjard.mota@indt.org.br>
+ *  Ilias Biris <ilias.biris@indt.org.br>
+ *  Mauricio Lin <mauricio.lin@indt.org.br>
+ *
+ *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *
+ *  A new process specific entry (smaps) included in /proc. It shows the
+ *  size of rss for each memory area. The maps entry lacks information
+ *  about physical memory size (rss) for each mapped file, i.e.,
+ *  rss information for executables and library files.
+ *  This additional information is useful for any tools that need to know
+ *  about physical memory consumption for a process specific library.
+ *
+ *  Changelog:
+ *  21-Feb-2005
+ *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *  Pud inclusion in the page table walking.
+ *
+ *  ChangeLog:
+ *  10-Mar-2005
+ *  10LE Instituto Nokia de Tecnologia - INdT:
+ *  A better way to walks through the page table as suggested by Hugh Dickins.
+ *
+ *  Simo Piiroinen <simo.piiroinen@nokia.com>:
+ *  Smaps information related to shared, private, clean and dirty pages.
+ *
+ *  Paul Mundt <paul.mundt@nokia.com>:
+ *  Overall revision about smaps.
+ */
+
+#include <asm/uaccess.h>
+
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/init.h>
+#include <linux/capability.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
+#include <linux/resource.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/printk.h>
+#include <linux/cgroup.h>
+#include <linux/cpuset.h>
+#include <linux/audit.h>
+#include <linux/poll.h>
+#include <linux/nsproxy.h>
+#include <linux/oom.h>
+#include <linux/elf.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/fs_struct.h>
+#include <linux/slab.h>
+#include <linux/flex_array.h>
+#include <linux/posix-timers.h>
+#ifdef CONFIG_HARDWALL
+#include <asm/hardwall.h>
+#endif
+#include <trace/events/oom.h>
+#include "internal.h"
+#include "fd.h"
+
+/* NOTE:
+ *	Implementing inode permission operations in /proc is almost
+ *	certainly an error.  Permission checks need to happen during
+ *	each system call not at open time.  The reason is that most of
+ *	what we wish to check for permissions in /proc varies at runtime.
+ *
+ *	The classic example of a problem is opening file descriptors
+ *	in /proc for a task before it execs a suid executable.
+ */
+
+struct pid_entry {
+	const char *name;
+	int len;
+	umode_t mode;
+	const struct inode_operations *iop;
+	const struct file_operations *fop;
+	union proc_op op;
+};
+
+#define NOD(NAME, MODE, IOP, FOP, OP) {			\
+	.name = (NAME),					\
+	.len  = sizeof(NAME) - 1,			\
+	.mode = MODE,					\
+	.iop  = IOP,					\
+	.fop  = FOP,					\
+	.op   = OP,					\
+}
+
+#define DIR(NAME, MODE, iops, fops)	\
+	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
+#define LNK(NAME, get_link)					\
+	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
+		&proc_pid_link_inode_operations, NULL,		\
+		{ .proc_get_link = get_link } )
+#define REG(NAME, MODE, fops)				\
+	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
+#define ONE(NAME, MODE, show)				\
+	NOD(NAME, (S_IFREG|(MODE)), 			\
+		NULL, &proc_single_file_operations,	\
+		{ .proc_show = show } )
+
+/*
+ * Count the number of hardlinks for the pid_entry table, excluding the .
+ * and .. links.
+ */
+static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
+	unsigned int n)
+{
+	unsigned int i;
+	unsigned int count;
+
+	count = 0;
+	for (i = 0; i < n; ++i) {
+		if (S_ISDIR(entries[i].mode))
+			++count;
+	}
+
+	return count;
+}
+
+static int get_task_root(struct task_struct *task, struct path *root)
+{
+	int result = -ENOENT;
+
+	task_lock(task);
+	if (task->fs) {
+		get_fs_root(task->fs, root);
+		result = 0;
+	}
+	task_unlock(task);
+	return result;
+}
+
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
+{
+	struct task_struct *task = get_proc_task(d_inode(dentry));
+	int result = -ENOENT;
+
+	if (task) {
+		task_lock(task);
+		if (task->fs) {
+			get_fs_pwd(task->fs, path);
+			result = 0;
+		}
+		task_unlock(task);
+		put_task_struct(task);
+	}
+	return result;
+}
+
+static int proc_root_link(struct dentry *dentry, struct path *path)
+{
+	struct task_struct *task = get_proc_task(d_inode(dentry));
+	int result = -ENOENT;
+
+	if (task) {
+		result = get_task_root(task, path);
+		put_task_struct(task);
+	}
+	return result;
+}
+
+static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
+			    struct pid *pid, struct task_struct *task)
+{
+	/*
+	 * Rely on struct seq_operations::show() being called once
+	 * per internal buffer allocation. See single_open(), traverse().
+	 */
+	BUG_ON(m->size < PAGE_SIZE);
+	m->count += get_cmdline(task, m->buf, PAGE_SIZE);
+	return 0;
+}
+
+static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
+			 struct pid *pid, struct task_struct *task)
+{
+	struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
+	if (mm && !IS_ERR(mm)) {
+		unsigned int nwords = 0;
+		do {
+			nwords += 2;
+		} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
+		seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
+		mmput(mm);
+		return 0;
+	} else
+		return PTR_ERR(mm);
+}
+
+
+#ifdef CONFIG_KALLSYMS
+/*
+ * Provides a wchan file via kallsyms in a proper one-value-per-file format.
+ * Returns the resolved symbol.  If that fails, simply return the address.
+ */
+static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
+			  struct pid *pid, struct task_struct *task)
+{
+	unsigned long wchan;
+	char symname[KSYM_NAME_LEN];
+
+	wchan = get_wchan(task);
+
+	if (lookup_symbol_name(wchan, symname) < 0) {
+		if (!ptrace_may_access(task, PTRACE_MODE_READ))
+			return 0;
+		seq_printf(m, "%lu", wchan);
+	} else {
+		seq_printf(m, "%s", symname);
+	}
+
+	return 0;
+}
+#endif /* CONFIG_KALLSYMS */
+
+static int lock_trace(struct task_struct *task)
+{
+	int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (err)
+		return err;
+	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+		mutex_unlock(&task->signal->cred_guard_mutex);
+		return -EPERM;
+	}
+	return 0;
+}
+
+static void unlock_trace(struct task_struct *task)
+{
+	mutex_unlock(&task->signal->cred_guard_mutex);
+}
+
+#ifdef CONFIG_STACKTRACE
+
+#define MAX_STACK_TRACE_DEPTH	64
+
+static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+			  struct pid *pid, struct task_struct *task)
+{
+	struct stack_trace trace;
+	unsigned long *entries;
+	int err;
+	int i;
+
+	entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
+	trace.entries		= entries;
+	trace.skip		= 0;
+
+	err = lock_trace(task);
+	if (!err) {
+		save_stack_trace_tsk(task, &trace);
+
+		for (i = 0; i < trace.nr_entries; i++) {
+			seq_printf(m, "[<%pK>] %pS\n",
+				   (void *)entries[i], (void *)entries[i]);
+		}
+		unlock_trace(task);
+	}
+	kfree(entries);
+
+	return err;
+}
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * Provides /proc/PID/schedstat
+ */
+static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
+			      struct pid *pid, struct task_struct *task)
+{
+	seq_printf(m, "%llu %llu %lu\n",
+		   (unsigned long long)task->se.sum_exec_runtime,
+		   (unsigned long long)task->sched_info.run_delay,
+		   task->sched_info.pcount);
+
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+	int i;
+	struct inode *inode = m->private;
+	struct task_struct *task = get_proc_task(inode);
+
+	if (!task)
+		return -ESRCH;
+	seq_puts(m, "Latency Top version : v0.1\n");
+	for (i = 0; i < 32; i++) {
+		struct latency_record *lr = &task->latency_record[i];
+		if (lr->backtrace[0]) {
+			int q;
+			seq_printf(m, "%i %li %li",
+				   lr->count, lr->time, lr->max);
+			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+				unsigned long bt = lr->backtrace[q];
+				if (!bt)
+					break;
+				if (bt == ULONG_MAX)
+					break;
+				seq_printf(m, " %ps", (void *)bt);
+			}
+			seq_putc(m, '\n');
+		}
+
+	}
+	put_task_struct(task);
+	return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lstats_show_proc, inode);
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+
+	if (!task)
+		return -ESRCH;
+	clear_all_latency_tracing(task);
+	put_task_struct(task);
+
+	return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+	.open		= lstats_open,
+	.read		= seq_read,
+	.write		= lstats_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif
+
+static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
+			  struct pid *pid, struct task_struct *task)
+{
+	unsigned long totalpages = totalram_pages + total_swap_pages;
+	unsigned long points = 0;
+
+	read_lock(&tasklist_lock);
+	if (pid_alive(task))
+		points = oom_badness(task, NULL, NULL, totalpages) *
+						1000 / totalpages;
+	read_unlock(&tasklist_lock);
+	seq_printf(m, "%lu\n", points);
+
+	return 0;
+}
+
+struct limit_names {
+	const char *name;
+	const char *unit;
+};
+
+static const struct limit_names lnames[RLIM_NLIMITS] = {
+	[RLIMIT_CPU] = {"Max cpu time", "seconds"},
+	[RLIMIT_FSIZE] = {"Max file size", "bytes"},
+	[RLIMIT_DATA] = {"Max data size", "bytes"},
+	[RLIMIT_STACK] = {"Max stack size", "bytes"},
+	[RLIMIT_CORE] = {"Max core file size", "bytes"},
+	[RLIMIT_RSS] = {"Max resident set", "bytes"},
+	[RLIMIT_NPROC] = {"Max processes", "processes"},
+	[RLIMIT_NOFILE] = {"Max open files", "files"},
+	[RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
+	[RLIMIT_AS] = {"Max address space", "bytes"},
+	[RLIMIT_LOCKS] = {"Max file locks", "locks"},
+	[RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
+	[RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
+	[RLIMIT_NICE] = {"Max nice priority", NULL},
+	[RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
+	[RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
+};
+
+/* Display limits for a process */
+static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task)
+{
+	unsigned int i;
+	unsigned long flags;
+
+	struct rlimit rlim[RLIM_NLIMITS];
+
+	if (!lock_task_sighand(task, &flags))
+		return 0;
+	memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
+	unlock_task_sighand(task, &flags);
+
+	/*
+	 * print the file header
+	 */
+       seq_printf(m, "%-25s %-20s %-20s %-10s\n",
+		  "Limit", "Soft Limit", "Hard Limit", "Units");
+
+	for (i = 0; i < RLIM_NLIMITS; i++) {
+		if (rlim[i].rlim_cur == RLIM_INFINITY)
+			seq_printf(m, "%-25s %-20s ",
+				   lnames[i].name, "unlimited");
+		else
+			seq_printf(m, "%-25s %-20lu ",
+				   lnames[i].name, rlim[i].rlim_cur);
+
+		if (rlim[i].rlim_max == RLIM_INFINITY)
+			seq_printf(m, "%-20s ", "unlimited");
+		else
+			seq_printf(m, "%-20lu ", rlim[i].rlim_max);
+
+		if (lnames[i].unit)
+			seq_printf(m, "%-10s\n", lnames[i].unit);
+		else
+			seq_putc(m, '\n');
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
+			    struct pid *pid, struct task_struct *task)
+{
+	long nr;
+	unsigned long args[6], sp, pc;
+	int res;
+
+	res = lock_trace(task);
+	if (res)
+		return res;
+
+	if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
+		seq_puts(m, "running\n");
+	else if (nr < 0)
+		seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+	else
+		seq_printf(m,
+		       "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
+		       nr,
+		       args[0], args[1], args[2], args[3], args[4], args[5],
+		       sp, pc);
+	unlock_trace(task);
+
+	return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
+
+/************************************************************************/
+/*                       Here the fs part begins                        */
+/************************************************************************/
+
+/* permission checks */
+static int proc_fd_access_allowed(struct inode *inode)
+{
+	struct task_struct *task;
+	int allowed = 0;
+	/* Allow access to a task's file descriptors if it is us or we
+	 * may use ptrace attach to the process and find out that
+	 * information.
+	 */
+	task = get_proc_task(inode);
+	if (task) {
+		allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+		put_task_struct(task);
+	}
+	return allowed;
+}
+
+int proc_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int error;
+	struct inode *inode = d_inode(dentry);
+
+	if (attr->ia_valid & ATTR_MODE)
+		return -EPERM;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+				 struct task_struct *task,
+				 int hide_pid_min)
+{
+	if (pid->hide_pid < hide_pid_min)
+		return true;
+	if (in_group_p(pid->pid_gid))
+		return true;
+	return ptrace_may_access(task, PTRACE_MODE_READ);
+}
+
+
+static int proc_pid_permission(struct inode *inode, int mask)
+{
+	struct pid_namespace *pid = inode->i_sb->s_fs_info;
+	struct task_struct *task;
+	bool has_perms;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+	has_perms = has_pid_permissions(pid, task, 1);
+	put_task_struct(task);
+
+	if (!has_perms) {
+		if (pid->hide_pid == 2) {
+			/*
+			 * Let's make getdents(), stat(), and open()
+			 * consistent with each other.  If a process
+			 * may not stat() a file, it shouldn't be seen
+			 * in procfs at all.
+			 */
+			return -ENOENT;
+		}
+
+		return -EPERM;
+	}
+	return generic_permission(inode, mask);
+}
+
+
+
+static const struct inode_operations proc_def_inode_operations = {
+	.setattr	= proc_setattr,
+};
+
+static int proc_single_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct pid_namespace *ns;
+	struct pid *pid;
+	struct task_struct *task;
+	int ret;
+
+	ns = inode->i_sb->s_fs_info;
+	pid = proc_pid(inode);
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		return -ESRCH;
+
+	ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_single_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, proc_single_show, inode);
+}
+
+static const struct file_operations proc_single_file_operations = {
+	.open		= proc_single_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
+{
+	struct task_struct *task = get_proc_task(inode);
+	struct mm_struct *mm = ERR_PTR(-ESRCH);
+
+	if (task) {
+		mm = mm_access(task, mode);
+		put_task_struct(task);
+
+		if (!IS_ERR_OR_NULL(mm)) {
+			/* ensure this mm_struct can't be freed */
+			atomic_inc(&mm->mm_count);
+			/* but do not pin its memory */
+			mmput(mm);
+		}
+	}
+
+	return mm;
+}
+
+static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+{
+	struct mm_struct *mm = proc_mem_open(inode, mode);
+
+	if (IS_ERR(mm))
+		return PTR_ERR(mm);
+
+	file->private_data = mm;
+	return 0;
+}
+
+static int mem_open(struct inode *inode, struct file *file)
+{
+	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
+
+	/* OK to pass negative loff_t, we can catch out-of-range */
+	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+
+	return ret;
+}
+
+static ssize_t mem_rw(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos, int write)
+{
+	struct mm_struct *mm = file->private_data;
+	unsigned long addr = *ppos;
+	ssize_t copied;
+	char *page;
+
+	if (!mm)
+		return 0;
+
+	page = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!page)
+		return -ENOMEM;
+
+	copied = 0;
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		goto free;
+
+	while (count > 0) {
+		int this_len = min_t(int, count, PAGE_SIZE);
+
+		if (write && copy_from_user(page, buf, this_len)) {
+			copied = -EFAULT;
+			break;
+		}
+
+		this_len = access_remote_vm(mm, addr, page, this_len, write);
+		if (!this_len) {
+			if (!copied)
+				copied = -EIO;
+			break;
+		}
+
+		if (!write && copy_to_user(buf, page, this_len)) {
+			copied = -EFAULT;
+			break;
+		}
+
+		buf += this_len;
+		addr += this_len;
+		copied += this_len;
+		count -= this_len;
+	}
+	*ppos = addr;
+
+	mmput(mm);
+free:
+	free_page((unsigned long) page);
+	return copied;
+}
+
+static ssize_t mem_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
+loff_t mem_lseek(struct file *file, loff_t offset, int orig)
+{
+	switch (orig) {
+	case 0:
+		file->f_pos = offset;
+		break;
+	case 1:
+		file->f_pos += offset;
+		break;
+	default:
+		return -EINVAL;
+	}
+	force_successful_syscall_return();
+	return file->f_pos;
+}
+
+static int mem_release(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = file->private_data;
+	if (mm)
+		mmdrop(mm);
+	return 0;
+}
+
+static const struct file_operations proc_mem_operations = {
+	.llseek		= mem_lseek,
+	.read		= mem_read,
+	.write		= mem_write,
+	.open		= mem_open,
+	.release	= mem_release,
+};
+
+static int environ_open(struct inode *inode, struct file *file)
+{
+	return __mem_open(inode, file, PTRACE_MODE_READ);
+}
+
+static ssize_t environ_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	char *page;
+	unsigned long src = *ppos;
+	int ret = 0;
+	struct mm_struct *mm = file->private_data;
+
+	if (!mm)
+		return 0;
+
+	page = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!page)
+		return -ENOMEM;
+
+	ret = 0;
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		goto free;
+	while (count > 0) {
+		size_t this_len, max_len;
+		int retval;
+
+		if (src >= (mm->env_end - mm->env_start))
+			break;
+
+		this_len = mm->env_end - (mm->env_start + src);
+
+		max_len = min_t(size_t, PAGE_SIZE, count);
+		this_len = min(max_len, this_len);
+
+		retval = access_remote_vm(mm, (mm->env_start + src),
+			page, this_len, 0);
+
+		if (retval <= 0) {
+			ret = retval;
+			break;
+		}
+
+		if (copy_to_user(buf, page, retval)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret += retval;
+		src += retval;
+		buf += retval;
+		count -= retval;
+	}
+	*ppos = src;
+	mmput(mm);
+
+free:
+	free_page((unsigned long) page);
+	return ret;
+}
+
+static const struct file_operations proc_environ_operations = {
+	.open		= environ_open,
+	.read		= environ_read,
+	.llseek		= generic_file_llseek,
+	.release	= mem_release,
+};
+
+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	char buffer[PROC_NUMBUF];
+	int oom_adj = OOM_ADJUST_MIN;
+	size_t len;
+	unsigned long flags;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+			oom_adj = OOM_ADJUST_MAX;
+		else
+			oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+				  OOM_SCORE_ADJ_MAX;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	int oom_adj;
+	unsigned long flags;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+	if (err)
+		goto out;
+	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+	     oom_adj != OOM_DISABLE) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	task = get_proc_task(file_inode(file));
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
+	if (!lock_task_sighand(task, &flags)) {
+		err = -ESRCH;
+		goto err_task_lock;
+	}
+
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (oom_adj == OOM_ADJUST_MAX)
+		oom_adj = OOM_SCORE_ADJ_MAX;
+	else
+		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+	if (oom_adj < task->signal->oom_score_adj &&
+	    !capable(CAP_SYS_RESOURCE)) {
+		err = -EACCES;
+		goto err_sighand;
+	}
+
+	/*
+	 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+	 * /proc/pid/oom_score_adj instead.
+	 */
+	pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+		  current->comm, task_pid_nr(current), task_pid_nr(task),
+		  task_pid_nr(task));
+
+	task->signal->oom_score_adj = oom_adj;
+	trace_oom_score_adj_update(task);
+err_sighand:
+	unlock_task_sighand(task, &flags);
+err_task_lock:
+	task_unlock(task);
+	put_task_struct(task);
+out:
+	return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_adj_operations = {
+	.read		= oom_adj_read,
+	.write		= oom_adj_write,
+	.llseek		= generic_file_llseek,
+};
+
+static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	char buffer[PROC_NUMBUF];
+	short oom_score_adj = OOM_SCORE_ADJ_MIN;
+	unsigned long flags;
+	size_t len;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		oom_score_adj = task->signal->oom_score_adj;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	unsigned long flags;
+	int oom_score_adj;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
+	if (err)
+		goto out;
+	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			oom_score_adj > OOM_SCORE_ADJ_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	task = get_proc_task(file_inode(file));
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
+	if (!lock_task_sighand(task, &flags)) {
+		err = -ESRCH;
+		goto err_task_lock;
+	}
+
+	if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
+			!capable(CAP_SYS_RESOURCE)) {
+		err = -EACCES;
+		goto err_sighand;
+	}
+
+	task->signal->oom_score_adj = (short)oom_score_adj;
+	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+		task->signal->oom_score_adj_min = (short)oom_score_adj;
+	trace_oom_score_adj_update(task);
+
+err_sighand:
+	unlock_task_sighand(task, &flags);
+err_task_lock:
+	task_unlock(task);
+	put_task_struct(task);
+out:
+	return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_score_adj_operations = {
+	.read		= oom_score_adj_read,
+	.write		= oom_score_adj_write,
+	.llseek		= default_llseek,
+};
+
+#ifdef CONFIG_AUDITSYSCALL
+#define TMPBUFLEN 21
+static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
+				  size_t count, loff_t *ppos)
+{
+	struct inode * inode = file_inode(file);
+	struct task_struct *task = get_proc_task(inode);
+	ssize_t length;
+	char tmpbuf[TMPBUFLEN];
+
+	if (!task)
+		return -ESRCH;
+	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
+			   from_kuid(file->f_cred->user_ns,
+				     audit_get_loginuid(task)));
+	put_task_struct(task);
+	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+
+static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
+				   size_t count, loff_t *ppos)
+{
+	struct inode * inode = file_inode(file);
+	char *page, *tmp;
+	ssize_t length;
+	uid_t loginuid;
+	kuid_t kloginuid;
+
+	rcu_read_lock();
+	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
+		rcu_read_unlock();
+		return -EPERM;
+	}
+	rcu_read_unlock();
+
+	if (count >= PAGE_SIZE)
+		count = PAGE_SIZE - 1;
+
+	if (*ppos != 0) {
+		/* No partial writes. */
+		return -EINVAL;
+	}
+	page = (char*)__get_free_page(GFP_TEMPORARY);
+	if (!page)
+		return -ENOMEM;
+	length = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto out_free_page;
+
+	page[count] = '\0';
+	loginuid = simple_strtoul(page, &tmp, 10);
+	if (tmp == page) {
+		length = -EINVAL;
+		goto out_free_page;
+
+	}
+
+	/* is userspace tring to explicitly UNSET the loginuid? */
+	if (loginuid == AUDIT_UID_UNSET) {
+		kloginuid = INVALID_UID;
+	} else {
+		kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
+		if (!uid_valid(kloginuid)) {
+			length = -EINVAL;
+			goto out_free_page;
+		}
+	}
+
+	length = audit_set_loginuid(kloginuid);
+	if (likely(length == 0))
+		length = count;
+
+out_free_page:
+	free_page((unsigned long) page);
+	return length;
+}
+
+static const struct file_operations proc_loginuid_operations = {
+	.read		= proc_loginuid_read,
+	.write		= proc_loginuid_write,
+	.llseek		= generic_file_llseek,
+};
+
+static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
+				  size_t count, loff_t *ppos)
+{
+	struct inode * inode = file_inode(file);
+	struct task_struct *task = get_proc_task(inode);
+	ssize_t length;
+	char tmpbuf[TMPBUFLEN];
+
+	if (!task)
+		return -ESRCH;
+	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
+				audit_get_sessionid(task));
+	put_task_struct(task);
+	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+
+static const struct file_operations proc_sessionid_operations = {
+	.read		= proc_sessionid_read,
+	.llseek		= generic_file_llseek,
+};
+#endif
+
+#ifdef CONFIG_FAULT_INJECTION
+static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
+				      size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	char buffer[PROC_NUMBUF];
+	size_t len;
+	int make_it_fail;
+
+	if (!task)
+		return -ESRCH;
+	make_it_fail = task->make_it_fail;
+	put_task_struct(task);
+
+	len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
+
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t proc_fault_inject_write(struct file * file,
+			const char __user * buf, size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF], *end;
+	int make_it_fail;
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
+	if (*end)
+		return -EINVAL;
+	if (make_it_fail < 0 || make_it_fail > 1)
+		return -EINVAL;
+
+	task = get_proc_task(file_inode(file));
+	if (!task)
+		return -ESRCH;
+	task->make_it_fail = make_it_fail;
+	put_task_struct(task);
+
+	return count;
+}
+
+static const struct file_operations proc_fault_inject_operations = {
+	.read		= proc_fault_inject_read,
+	.write		= proc_fault_inject_write,
+	.llseek		= generic_file_llseek,
+};
+#endif
+
+
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * Print out various scheduling related per-task fields:
+ */
+static int sched_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+	proc_sched_show_task(p, m);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static ssize_t
+sched_write(struct file *file, const char __user *buf,
+	    size_t count, loff_t *offset)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+	proc_sched_set_task(p);
+
+	put_task_struct(p);
+
+	return count;
+}
+
+static int sched_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_show, inode);
+}
+
+static const struct file_operations proc_pid_sched_operations = {
+	.open		= sched_open,
+	.read		= seq_read,
+	.write		= sched_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+	proc_sched_autogroup_show_task(p, m);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+	    size_t count, loff_t *offset)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+	char buffer[PROC_NUMBUF];
+	int nice;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	err = kstrtoint(strstrip(buffer), 0, &nice);
+	if (err < 0)
+		return err;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	err = proc_sched_autogroup_set_nice(p, nice);
+	if (err)
+		count = err;
+
+	put_task_struct(p);
+
+	return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	ret = single_open(filp, sched_autogroup_show, NULL);
+	if (!ret) {
+		struct seq_file *m = filp->private_data;
+
+		m->private = inode;
+	}
+	return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+	.open		= sched_autogroup_open,
+	.read		= seq_read,
+	.write		= sched_autogroup_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
+static ssize_t comm_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *offset)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+	char buffer[TASK_COMM_LEN];
+	const size_t maxlen = sizeof(buffer) - 1;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
+		return -EFAULT;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (same_thread_group(current, p))
+		set_task_comm(p, buffer);
+	else
+		count = -EINVAL;
+
+	put_task_struct(p);
+
+	return count;
+}
+
+static int comm_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	task_lock(p);
+	seq_printf(m, "%s\n", p->comm);
+	task_unlock(p);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static int comm_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, comm_show, inode);
+}
+
+static const struct file_operations proc_pid_set_comm_operations = {
+	.open		= comm_open,
+	.read		= seq_read,
+	.write		= comm_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	struct file *exe_file;
+
+	task = get_proc_task(d_inode(dentry));
+	if (!task)
+		return -ENOENT;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		return -ENOENT;
+	exe_file = get_mm_exe_file(mm);
+	mmput(mm);
+	if (exe_file) {
+		*exe_path = exe_file->f_path;
+		path_get(&exe_file->f_path);
+		fput(exe_file);
+		return 0;
+	} else
+		return -ENOENT;
+}
+
+static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = d_inode(dentry);
+	struct path path;
+	int error = -EACCES;
+
+	/* Are we allowed to snoop on the tasks file descriptors? */
+	if (!proc_fd_access_allowed(inode))
+		goto out;
+
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+	if (error)
+		goto out;
+
+	nd_jump_link(nd, &path);
+	return NULL;
+out:
+	return ERR_PTR(error);
+}
+
+static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+{
+	char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
+	char *pathname;
+	int len;
+
+	if (!tmp)
+		return -ENOMEM;
+
+	pathname = d_path(path, tmp, PAGE_SIZE);
+	len = PTR_ERR(pathname);
+	if (IS_ERR(pathname))
+		goto out;
+	len = tmp + PAGE_SIZE - 1 - pathname;
+
+	if (len > buflen)
+		len = buflen;
+	if (copy_to_user(buffer, pathname, len))
+		len = -EFAULT;
+ out:
+	free_page((unsigned long)tmp);
+	return len;
+}
+
+static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
+{
+	int error = -EACCES;
+	struct inode *inode = d_inode(dentry);
+	struct path path;
+
+	/* Are we allowed to snoop on the tasks file descriptors? */
+	if (!proc_fd_access_allowed(inode))
+		goto out;
+
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+	if (error)
+		goto out;
+
+	error = do_proc_readlink(&path, buffer, buflen);
+	path_put(&path);
+out:
+	return error;
+}
+
+const struct inode_operations proc_pid_link_inode_operations = {
+	.readlink	= proc_pid_readlink,
+	.follow_link	= proc_pid_follow_link,
+	.setattr	= proc_setattr,
+};
+
+
+/* building an inode */
+
+struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
+{
+	struct inode * inode;
+	struct proc_inode *ei;
+	const struct cred *cred;
+
+	/* We need a new inode */
+
+	inode = new_inode(sb);
+	if (!inode)
+		goto out;
+
+	/* Common stuff */
+	ei = PROC_I(inode);
+	inode->i_ino = get_next_ino();
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_op = &proc_def_inode_operations;
+
+	/*
+	 * grab the reference to task.
+	 */
+	ei->pid = get_task_pid(task, PIDTYPE_PID);
+	if (!ei->pid)
+		goto out_unlock;
+
+	if (task_dumpable(task)) {
+		rcu_read_lock();
+		cred = __task_cred(task);
+		inode->i_uid = cred->euid;
+		inode->i_gid = cred->egid;
+		rcu_read_unlock();
+	}
+	security_task_to_inode(task, inode);
+
+out:
+	return inode;
+
+out_unlock:
+	iput(inode);
+	return NULL;
+}
+
+int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	struct task_struct *task;
+	const struct cred *cred;
+	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
+
+	generic_fillattr(inode, stat);
+
+	rcu_read_lock();
+	stat->uid = GLOBAL_ROOT_UID;
+	stat->gid = GLOBAL_ROOT_GID;
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		if (!has_pid_permissions(pid, task, 2)) {
+			rcu_read_unlock();
+			/*
+			 * This doesn't prevent learning whether PID exists,
+			 * it only makes getattr() consistent with readdir().
+			 */
+			return -ENOENT;
+		}
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
+			cred = __task_cred(task);
+			stat->uid = cred->euid;
+			stat->gid = cred->egid;
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+/* dentry stuff */
+
+/*
+ *	Exceptional case: normally we are not allowed to unhash a busy
+ * directory. In this case, however, we can do it - no aliasing problems
+ * due to the way we treat inodes.
+ *
+ * Rewrite the inode's ownerships here because the owning task may have
+ * performed a setuid(), etc.
+ *
+ * Before the /proc/pid/status file was created the only way to read
+ * the effective uid of a /process was to stat /proc/pid.  Reading
+ * /proc/pid/status is slow enough that procps and other packages
+ * kept stating /proc/pid.  To keep the rules in /proc simple I have
+ * made this apply to all per process world readable and executable
+ * directories.
+ */
+int pid_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode;
+	struct task_struct *task;
+	const struct cred *cred;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = d_inode(dentry);
+	task = get_proc_task(inode);
+
+	if (task) {
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = GLOBAL_ROOT_UID;
+			inode->i_gid = GLOBAL_ROOT_GID;
+		}
+		inode->i_mode &= ~(S_ISUID | S_ISGID);
+		security_task_to_inode(task, inode);
+		put_task_struct(task);
+		return 1;
+	}
+	return 0;
+}
+
+static inline bool proc_inode_is_dead(struct inode *inode)
+{
+	return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
+}
+
+int pid_delete_dentry(const struct dentry *dentry)
+{
+	/* Is the task we represent dead?
+	 * If so, then don't put the dentry on the lru list,
+	 * kill it immediately.
+	 */
+	return proc_inode_is_dead(d_inode(dentry));
+}
+
+const struct dentry_operations pid_dentry_operations =
+{
+	.d_revalidate	= pid_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+/* Lookups */
+
+/*
+ * Fill a directory entry.
+ *
+ * If possible create the dcache entry and derive our inode number and
+ * file type from dcache entry.
+ *
+ * Since all of the proc inode numbers are dynamically generated, the inode
+ * numbers do not exist until the inode is cache.  This means creating the
+ * the dcache entry in readdir is necessary to keep the inode numbers
+ * reported by readdir in sync with the inode numbers reported
+ * by stat.
+ */
+bool proc_fill_cache(struct file *file, struct dir_context *ctx,
+	const char *name, int len,
+	instantiate_t instantiate, struct task_struct *task, const void *ptr)
+{
+	struct dentry *child, *dir = file->f_path.dentry;
+	struct qstr qname = QSTR_INIT(name, len);
+	struct inode *inode;
+	unsigned type;
+	ino_t ino;
+
+	child = d_hash_and_lookup(dir, &qname);
+	if (!child) {
+		child = d_alloc(dir, &qname);
+		if (!child)
+			goto end_instantiate;
+		if (instantiate(d_inode(dir), child, task, ptr) < 0) {
+			dput(child);
+			goto end_instantiate;
+		}
+	}
+	inode = d_inode(child);
+	ino = inode->i_ino;
+	type = inode->i_mode >> 12;
+	dput(child);
+	return dir_emit(ctx, name, len, ino, type);
+
+end_instantiate:
+	return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+
+/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct mm_struct *mm = NULL;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct inode *inode;
+	int status = 0;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		status = -EPERM;
+		goto out_notask;
+	}
+
+	inode = d_inode(dentry);
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_notask;
+
+	mm = mm_access(task, PTRACE_MODE_READ);
+	if (IS_ERR_OR_NULL(mm))
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = GLOBAL_ROOT_UID;
+			inode->i_gid = GLOBAL_ROOT_GID;
+		}
+		security_task_to_inode(task, inode);
+		status = 1;
+	}
+
+out:
+	put_task_struct(task);
+
+out_notask:
+	return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc;
+
+	rc = -ENOENT;
+	task = get_proc_task(d_inode(dentry));
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	rc = -ENOENT;
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	fmode_t		mode;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static int
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	fmode_t mode = (fmode_t)(unsigned long)ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return -ENOENT;
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR;
+	if (mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return 0;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, unsigned int flags)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	int result;
+	struct mm_struct *mm;
+
+	result = -EPERM;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	result = -ENOENT;
+	task = get_proc_task(dir);
+	if (!task)
+		goto out;
+
+	result = -EACCES;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	result = -ENOENT;
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_put_task;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_put_task;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	if (vma->vm_file)
+		result = proc_map_files_instantiate(dir, dentry, task,
+				(void *)(unsigned long)vma->vm_file->f_mode);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_put_task:
+	put_task_struct(task);
+out:
+	return ERR_PTR(result);
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.permission	= proc_fd_permission,
+	.setattr	= proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned long nr_files, pos, i;
+	struct flex_array *fa = NULL;
+	struct map_files_info info;
+	struct map_files_info *p;
+	int ret;
+
+	ret = -EPERM;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = -ENOENT;
+	task = get_proc_task(file_inode(file));
+	if (!task)
+		goto out;
+
+	ret = -EACCES;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	ret = 0;
+	if (!dir_emit_dots(file, ctx))
+		goto out_put_task;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_put_task;
+	down_read(&mm->mmap_sem);
+
+	nr_files = 0;
+
+	/*
+	 * We need two passes here:
+	 *
+	 *  1) Collect vmas of mapped files with mmap_sem taken
+	 *  2) Release mmap_sem and instantiate entries
+	 *
+	 * otherwise we get lockdep complained, since filldir()
+	 * routine might require mmap_sem taken in might_fault().
+	 */
+
+	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+		if (vma->vm_file && ++pos > ctx->pos)
+			nr_files++;
+	}
+
+	if (nr_files) {
+		fa = flex_array_alloc(sizeof(info), nr_files,
+					GFP_KERNEL);
+		if (!fa || flex_array_prealloc(fa, 0, nr_files,
+						GFP_KERNEL)) {
+			ret = -ENOMEM;
+			if (fa)
+				flex_array_free(fa);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			goto out_put_task;
+		}
+		for (i = 0, vma = mm->mmap, pos = 2; vma;
+				vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			if (++pos <= ctx->pos)
+				continue;
+
+			info.mode = vma->vm_file->f_mode;
+			info.len = snprintf(info.name,
+					sizeof(info.name), "%lx-%lx",
+					vma->vm_start, vma->vm_end);
+			if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+				BUG();
+		}
+	}
+	up_read(&mm->mmap_sem);
+
+	for (i = 0; i < nr_files; i++) {
+		p = flex_array_get(fa, i);
+		if (!proc_fill_cache(file, ctx,
+				      p->name, p->len,
+				      proc_map_files_instantiate,
+				      task,
+				      (void *)(unsigned long)p->mode))
+			break;
+		ctx->pos++;
+	}
+	if (fa)
+		flex_array_free(fa);
+	mmput(mm);
+
+out_put_task:
+	put_task_struct(task);
+out:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+struct timers_private {
+	struct pid *pid;
+	struct task_struct *task;
+	struct sighand_struct *sighand;
+	struct pid_namespace *ns;
+	unsigned long flags;
+};
+
+static void *timers_start(struct seq_file *m, loff_t *pos)
+{
+	struct timers_private *tp = m->private;
+
+	tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
+	if (!tp->task)
+		return ERR_PTR(-ESRCH);
+
+	tp->sighand = lock_task_sighand(tp->task, &tp->flags);
+	if (!tp->sighand)
+		return ERR_PTR(-ESRCH);
+
+	return seq_list_start(&tp->task->signal->posix_timers, *pos);
+}
+
+static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct timers_private *tp = m->private;
+	return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+}
+
+static void timers_stop(struct seq_file *m, void *v)
+{
+	struct timers_private *tp = m->private;
+
+	if (tp->sighand) {
+		unlock_task_sighand(tp->task, &tp->flags);
+		tp->sighand = NULL;
+	}
+
+	if (tp->task) {
+		put_task_struct(tp->task);
+		tp->task = NULL;
+	}
+}
+
+static int show_timer(struct seq_file *m, void *v)
+{
+	struct k_itimer *timer;
+	struct timers_private *tp = m->private;
+	int notify;
+	static const char * const nstr[] = {
+		[SIGEV_SIGNAL] = "signal",
+		[SIGEV_NONE] = "none",
+		[SIGEV_THREAD] = "thread",
+	};
+
+	timer = list_entry((struct list_head *)v, struct k_itimer, list);
+	notify = timer->it_sigev_notify;
+
+	seq_printf(m, "ID: %d\n", timer->it_id);
+	seq_printf(m, "signal: %d/%p\n",
+		   timer->sigq->info.si_signo,
+		   timer->sigq->info.si_value.sival_ptr);
+	seq_printf(m, "notify: %s/%s.%d\n",
+		   nstr[notify & ~SIGEV_THREAD_ID],
+		   (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
+		   pid_nr_ns(timer->it_pid, tp->ns));
+	seq_printf(m, "ClockID: %d\n", timer->it_clock);
+
+	return 0;
+}
+
+static const struct seq_operations proc_timers_seq_ops = {
+	.start	= timers_start,
+	.next	= timers_next,
+	.stop	= timers_stop,
+	.show	= show_timer,
+};
+
+static int proc_timers_open(struct inode *inode, struct file *file)
+{
+	struct timers_private *tp;
+
+	tp = __seq_open_private(file, &proc_timers_seq_ops,
+			sizeof(struct timers_private));
+	if (!tp)
+		return -ENOMEM;
+
+	tp->pid = proc_pid(inode);
+	tp->ns = inode->i_sb->s_fs_info;
+	return 0;
+}
+
+static const struct file_operations proc_timers_operations = {
+	.open		= proc_timers_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+static int proc_pident_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
+{
+	const struct pid_entry *p = ptr;
+	struct inode *inode;
+	struct proc_inode *ei;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	inode->i_mode = p->mode;
+	if (S_ISDIR(inode->i_mode))
+		set_nlink(inode, 2);	/* Use getattr to fix if necessary */
+	if (p->iop)
+		inode->i_op = p->iop;
+	if (p->fop)
+		inode->i_fop = p->fop;
+	ei->op = p->op;
+	d_set_d_op(dentry, &pid_dentry_operations);
+	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, 0))
+		return 0;
+out:
+	return -ENOENT;
+}
+
+static struct dentry *proc_pident_lookup(struct inode *dir, 
+					 struct dentry *dentry,
+					 const struct pid_entry *ents,
+					 unsigned int nents)
+{
+	int error;
+	struct task_struct *task = get_proc_task(dir);
+	const struct pid_entry *p, *last;
+
+	error = -ENOENT;
+
+	if (!task)
+		goto out_no_task;
+
+	/*
+	 * Yes, it does not scale. And it should not. Don't add
+	 * new entries into /proc/<tgid>/ without very good reasons.
+	 */
+	last = &ents[nents - 1];
+	for (p = ents; p <= last; p++) {
+		if (p->len != dentry->d_name.len)
+			continue;
+		if (!memcmp(dentry->d_name.name, p->name, p->len))
+			break;
+	}
+	if (p > last)
+		goto out;
+
+	error = proc_pident_instantiate(dir, dentry, task, p);
+out:
+	put_task_struct(task);
+out_no_task:
+	return ERR_PTR(error);
+}
+
+static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
+		const struct pid_entry *ents, unsigned int nents)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	const struct pid_entry *p;
+
+	if (!task)
+		return -ENOENT;
+
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+
+	if (ctx->pos >= nents + 2)
+		goto out;
+
+	for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+		if (!proc_fill_cache(file, ctx, p->name, p->len,
+				proc_pident_instantiate, task, p))
+			break;
+		ctx->pos++;
+	}
+out:
+	put_task_struct(task);
+	return 0;
+}
+
+#ifdef CONFIG_SECURITY
+static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
+				  size_t count, loff_t *ppos)
+{
+	struct inode * inode = file_inode(file);
+	char *p = NULL;
+	ssize_t length;
+	struct task_struct *task = get_proc_task(inode);
+
+	if (!task)
+		return -ESRCH;
+
+	length = security_getprocattr(task,
+				      (char*)file->f_path.dentry->d_name.name,
+				      &p);
+	put_task_struct(task);
+	if (length > 0)
+		length = simple_read_from_buffer(buf, count, ppos, p, length);
+	kfree(p);
+	return length;
+}
+
+static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
+				   size_t count, loff_t *ppos)
+{
+	struct inode * inode = file_inode(file);
+	char *page;
+	ssize_t length;
+	struct task_struct *task = get_proc_task(inode);
+
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	/* No partial writes. */
+	length = -EINVAL;
+	if (*ppos != 0)
+		goto out;
+
+	length = -ENOMEM;
+	page = (char*)__get_free_page(GFP_TEMPORARY);
+	if (!page)
+		goto out;
+
+	length = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto out_free;
+
+	/* Guard against adverse ptrace interaction */
+	length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+	if (length < 0)
+		goto out_free;
+
+	length = security_setprocattr(task,
+				      (char*)file->f_path.dentry->d_name.name,
+				      (void*)page, count);
+	mutex_unlock(&task->signal->cred_guard_mutex);
+out_free:
+	free_page((unsigned long) page);
+out:
+	put_task_struct(task);
+out_no_task:
+	return length;
+}
+
+static const struct file_operations proc_pid_attr_operations = {
+	.read		= proc_pid_attr_read,
+	.write		= proc_pid_attr_write,
+	.llseek		= generic_file_llseek,
+};
+
+static const struct pid_entry attr_dir_stuff[] = {
+	REG("current",    S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("prev",       S_IRUGO,	   proc_pid_attr_operations),
+	REG("exec",       S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("fscreate",   S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("keycreate",  S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+};
+
+static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+	return proc_pident_readdir(file, ctx, 
+				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+}
+
+static const struct file_operations proc_attr_dir_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_attr_dir_readdir,
+	.llseek		= default_llseek,
+};
+
+static struct dentry *proc_attr_dir_lookup(struct inode *dir,
+				struct dentry *dentry, unsigned int flags)
+{
+	return proc_pident_lookup(dir, dentry,
+				  attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+}
+
+static const struct inode_operations proc_attr_dir_inode_operations = {
+	.lookup		= proc_attr_dir_lookup,
+	.getattr	= pid_getattr,
+	.setattr	= proc_setattr,
+};
+
+#endif
+
+#ifdef CONFIG_ELF_CORE
+static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
+					 size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	struct mm_struct *mm;
+	char buffer[PROC_NUMBUF];
+	size_t len;
+	int ret;
+
+	if (!task)
+		return -ESRCH;
+
+	ret = 0;
+	mm = get_task_mm(task);
+	if (mm) {
+		len = snprintf(buffer, sizeof(buffer), "%08lx\n",
+			       ((mm->flags & MMF_DUMP_FILTER_MASK) >>
+				MMF_DUMP_FILTER_SHIFT));
+		mmput(mm);
+		ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
+	}
+
+	put_task_struct(task);
+
+	return ret;
+}
+
+static ssize_t proc_coredump_filter_write(struct file *file,
+					  const char __user *buf,
+					  size_t count,
+					  loff_t *ppos)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	char buffer[PROC_NUMBUF], *end;
+	unsigned int val;
+	int ret;
+	int i;
+	unsigned long mask;
+
+	ret = -EFAULT;
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		goto out_no_task;
+
+	ret = -EINVAL;
+	val = (unsigned int)simple_strtoul(buffer, &end, 0);
+	if (*end == '\n')
+		end++;
+	if (end - buffer == 0)
+		goto out_no_task;
+
+	ret = -ESRCH;
+	task = get_proc_task(file_inode(file));
+	if (!task)
+		goto out_no_task;
+
+	ret = end - buffer;
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
+		if (val & mask)
+			set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+		else
+			clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+	}
+
+	mmput(mm);
+ out_no_mm:
+	put_task_struct(task);
+ out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_coredump_filter_operations = {
+	.read		= proc_coredump_filter_read,
+	.write		= proc_coredump_filter_write,
+	.llseek		= generic_file_llseek,
+};
+#endif
+
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
+{
+	struct task_io_accounting acct = task->ioac;
+	unsigned long flags;
+	int result;
+
+	result = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (result)
+		return result;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+		result = -EACCES;
+		goto out_unlock;
+	}
+
+	if (whole && lock_task_sighand(task, &flags)) {
+		struct task_struct *t = task;
+
+		task_io_accounting_add(&acct, &task->signal->ioac);
+		while_each_thread(task, t)
+			task_io_accounting_add(&acct, &t->ioac);
+
+		unlock_task_sighand(task, &flags);
+	}
+	seq_printf(m,
+		   "rchar: %llu\n"
+		   "wchar: %llu\n"
+		   "syscr: %llu\n"
+		   "syscw: %llu\n"
+		   "read_bytes: %llu\n"
+		   "write_bytes: %llu\n"
+		   "cancelled_write_bytes: %llu\n",
+		   (unsigned long long)acct.rchar,
+		   (unsigned long long)acct.wchar,
+		   (unsigned long long)acct.syscr,
+		   (unsigned long long)acct.syscw,
+		   (unsigned long long)acct.read_bytes,
+		   (unsigned long long)acct.write_bytes,
+		   (unsigned long long)acct.cancelled_write_bytes);
+	result = 0;
+
+out_unlock:
+	mutex_unlock(&task->signal->cred_guard_mutex);
+	return result;
+}
+
+static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+				  struct pid *pid, struct task_struct *task)
+{
+	return do_io_accounting(task, m, 0);
+}
+
+static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+				   struct pid *pid, struct task_struct *task)
+{
+	return do_io_accounting(task, m, 1);
+}
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
+
+#ifdef CONFIG_USER_NS
+static int proc_id_map_open(struct inode *inode, struct file *file,
+	const struct seq_operations *seq_ops)
+{
+	struct user_namespace *ns = NULL;
+	struct task_struct *task;
+	struct seq_file *seq;
+	int ret = -EINVAL;
+
+	task = get_proc_task(inode);
+	if (task) {
+		rcu_read_lock();
+		ns = get_user_ns(task_cred_xxx(task, user_ns));
+		rcu_read_unlock();
+		put_task_struct(task);
+	}
+	if (!ns)
+		goto err;
+
+	ret = seq_open(file, seq_ops);
+	if (ret)
+		goto err_put_ns;
+
+	seq = file->private_data;
+	seq->private = ns;
+
+	return 0;
+err_put_ns:
+	put_user_ns(ns);
+err:
+	return ret;
+}
+
+static int proc_id_map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct user_namespace *ns = seq->private;
+	put_user_ns(ns);
+	return seq_release(inode, file);
+}
+
+static int proc_uid_map_open(struct inode *inode, struct file *file)
+{
+	return proc_id_map_open(inode, file, &proc_uid_seq_operations);
+}
+
+static int proc_gid_map_open(struct inode *inode, struct file *file)
+{
+	return proc_id_map_open(inode, file, &proc_gid_seq_operations);
+}
+
+static int proc_projid_map_open(struct inode *inode, struct file *file)
+{
+	return proc_id_map_open(inode, file, &proc_projid_seq_operations);
+}
+
+static const struct file_operations proc_uid_map_operations = {
+	.open		= proc_uid_map_open,
+	.write		= proc_uid_map_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_id_map_release,
+};
+
+static const struct file_operations proc_gid_map_operations = {
+	.open		= proc_gid_map_open,
+	.write		= proc_gid_map_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_id_map_release,
+};
+
+static const struct file_operations proc_projid_map_operations = {
+	.open		= proc_projid_map_open,
+	.write		= proc_projid_map_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_id_map_release,
+};
+
+static int proc_setgroups_open(struct inode *inode, struct file *file)
+{
+	struct user_namespace *ns = NULL;
+	struct task_struct *task;
+	int ret;
+
+	ret = -ESRCH;
+	task = get_proc_task(inode);
+	if (task) {
+		rcu_read_lock();
+		ns = get_user_ns(task_cred_xxx(task, user_ns));
+		rcu_read_unlock();
+		put_task_struct(task);
+	}
+	if (!ns)
+		goto err;
+
+	if (file->f_mode & FMODE_WRITE) {
+		ret = -EACCES;
+		if (!ns_capable(ns, CAP_SYS_ADMIN))
+			goto err_put_ns;
+	}
+
+	ret = single_open(file, &proc_setgroups_show, ns);
+	if (ret)
+		goto err_put_ns;
+
+	return 0;
+err_put_ns:
+	put_user_ns(ns);
+err:
+	return ret;
+}
+
+static int proc_setgroups_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct user_namespace *ns = seq->private;
+	int ret = single_release(inode, file);
+	put_user_ns(ns);
+	return ret;
+}
+
+static const struct file_operations proc_setgroups_operations = {
+	.open		= proc_setgroups_open,
+	.write		= proc_setgroups_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_setgroups_release,
+};
+#endif /* CONFIG_USER_NS */
+
+static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
+				struct pid *pid, struct task_struct *task)
+{
+	int err = lock_trace(task);
+	if (!err) {
+		seq_printf(m, "%08x\n", task->personality);
+		unlock_trace(task);
+	}
+	return err;
+}
+
+/*
+ * Thread groups
+ */
+static const struct file_operations proc_task_operations;
+static const struct inode_operations proc_task_inode_operations;
+
+static const struct pid_entry tgid_base_stuff[] = {
+	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
+	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
+	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+#ifdef CONFIG_NET
+	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
+#endif
+	REG("environ",    S_IRUSR, proc_environ_operations),
+	ONE("auxv",       S_IRUSR, proc_pid_auxv),
+	ONE("status",     S_IRUGO, proc_pid_status),
+	ONE("personality", S_IRUSR, proc_pid_personality),
+	ONE("limits",	  S_IRUGO, proc_pid_limits),
+#ifdef CONFIG_SCHED_DEBUG
+	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+	REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
+#endif
+	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+	ONE("syscall",    S_IRUSR, proc_pid_syscall),
+#endif
+	ONE("cmdline",    S_IRUGO, proc_pid_cmdline),
+	ONE("stat",       S_IRUGO, proc_tgid_stat),
+	ONE("statm",      S_IRUGO, proc_pid_statm),
+	REG("maps",       S_IRUGO, proc_pid_maps_operations),
+#ifdef CONFIG_NUMA
+	REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
+#endif
+	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
+	LNK("cwd",        proc_cwd_link),
+	LNK("root",       proc_root_link),
+	LNK("exe",        proc_exe_link),
+	REG("mounts",     S_IRUGO, proc_mounts_operations),
+	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
+	REG("mountstats", S_IRUSR, proc_mountstats_operations),
+#ifdef CONFIG_PROC_PAGE_MONITOR
+	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
+	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+#endif
+#ifdef CONFIG_SECURITY
+	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
+#endif
+#ifdef CONFIG_KALLSYMS
+	ONE("wchan",      S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+	ONE("stack",      S_IRUSR, proc_pid_stack),
+#endif
+#ifdef CONFIG_SCHEDSTATS
+	ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
+#endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency",  S_IRUGO, proc_lstats_operations),
+#endif
+#ifdef CONFIG_PROC_PID_CPUSET
+	ONE("cpuset",     S_IRUGO, proc_cpuset_show),
+#endif
+#ifdef CONFIG_CGROUPS
+	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
+#endif
+	ONE("oom_score",  S_IRUGO, proc_oom_score),
+	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+#ifdef CONFIG_AUDITSYSCALL
+	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
+	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
+#endif
+#ifdef CONFIG_FAULT_INJECTION
+	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
+#endif
+#ifdef CONFIG_ELF_CORE
+	REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	ONE("io",	S_IRUSR, proc_tgid_io_accounting),
+#endif
+#ifdef CONFIG_HARDWALL
+	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
+#endif
+#ifdef CONFIG_USER_NS
+	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
+	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
+	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
+#endif
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	REG("timers",	  S_IRUGO, proc_timers_operations),
+#endif
+};
+
+static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
+{
+	return proc_pident_readdir(file, ctx,
+				   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+}
+
+static const struct file_operations proc_tgid_base_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_tgid_base_readdir,
+	.llseek		= default_llseek,
+};
+
+static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	return proc_pident_lookup(dir, dentry,
+				  tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+}
+
+static const struct inode_operations proc_tgid_base_inode_operations = {
+	.lookup		= proc_tgid_base_lookup,
+	.getattr	= pid_getattr,
+	.setattr	= proc_setattr,
+	.permission	= proc_pid_permission,
+};
+
+static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
+{
+	struct dentry *dentry, *leader, *dir;
+	char buf[PROC_NUMBUF];
+	struct qstr name;
+
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", pid);
+	/* no ->d_hash() rejects on procfs */
+	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
+	if (dentry) {
+		d_invalidate(dentry);
+		dput(dentry);
+	}
+
+	if (pid == tgid)
+		return;
+
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", tgid);
+	leader = d_hash_and_lookup(mnt->mnt_root, &name);
+	if (!leader)
+		goto out;
+
+	name.name = "task";
+	name.len = strlen(name.name);
+	dir = d_hash_and_lookup(leader, &name);
+	if (!dir)
+		goto out_put_leader;
+
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", pid);
+	dentry = d_hash_and_lookup(dir, &name);
+	if (dentry) {
+		d_invalidate(dentry);
+		dput(dentry);
+	}
+
+	dput(dir);
+out_put_leader:
+	dput(leader);
+out:
+	return;
+}
+
+/**
+ * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
+ * @task: task that should be flushed.
+ *
+ * When flushing dentries from proc, one needs to flush them from global
+ * proc (proc_mnt) and from all the namespaces' procs this task was seen
+ * in. This call is supposed to do all of this job.
+ *
+ * Looks in the dcache for
+ * /proc/@pid
+ * /proc/@tgid/task/@pid
+ * if either directory is present flushes it and all of it'ts children
+ * from the dcache.
+ *
+ * It is safe and reasonable to cache /proc entries for a task until
+ * that task exits.  After that they just clog up the dcache with
+ * useless entries, possibly causing useful dcache entries to be
+ * flushed instead.  This routine is proved to flush those useless
+ * dcache entries at process exit time.
+ *
+ * NOTE: This routine is just an optimization so it does not guarantee
+ *       that no dcache entries will exist at process exit time it
+ *       just makes it very unlikely that any will persist.
+ */
+
+void proc_flush_task(struct task_struct *task)
+{
+	int i;
+	struct pid *pid, *tgid;
+	struct upid *upid;
+
+	pid = task_pid(task);
+	tgid = task_tgid(task);
+
+	for (i = 0; i <= pid->level; i++) {
+		upid = &pid->numbers[i];
+		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
+					tgid->numbers[i].nr);
+	}
+}
+
+static int proc_pid_instantiate(struct inode *dir,
+				   struct dentry * dentry,
+				   struct task_struct *task, const void *ptr)
+{
+	struct inode *inode;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
+	inode->i_op = &proc_tgid_base_inode_operations;
+	inode->i_fop = &proc_tgid_base_operations;
+	inode->i_flags|=S_IMMUTABLE;
+
+	set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
+						  ARRAY_SIZE(tgid_base_stuff)));
+
+	d_set_d_op(dentry, &pid_dentry_operations);
+
+	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, 0))
+		return 0;
+out:
+	return -ENOENT;
+}
+
+struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+{
+	int result = -ENOENT;
+	struct task_struct *task;
+	unsigned tgid;
+	struct pid_namespace *ns;
+
+	tgid = name_to_int(&dentry->d_name);
+	if (tgid == ~0U)
+		goto out;
+
+	ns = dentry->d_sb->s_fs_info;
+	rcu_read_lock();
+	task = find_task_by_pid_ns(tgid, ns);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+	if (!task)
+		goto out;
+
+	result = proc_pid_instantiate(dir, dentry, task, NULL);
+	put_task_struct(task);
+out:
+	return ERR_PTR(result);
+}
+
+/*
+ * Find the first task with tgid >= tgid
+ *
+ */
+struct tgid_iter {
+	unsigned int tgid;
+	struct task_struct *task;
+};
+static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
+{
+	struct pid *pid;
+
+	if (iter.task)
+		put_task_struct(iter.task);
+	rcu_read_lock();
+retry:
+	iter.task = NULL;
+	pid = find_ge_pid(iter.tgid, ns);
+	if (pid) {
+		iter.tgid = pid_nr_ns(pid, ns);
+		iter.task = pid_task(pid, PIDTYPE_PID);
+		/* What we to know is if the pid we have find is the
+		 * pid of a thread_group_leader.  Testing for task
+		 * being a thread_group_leader is the obvious thing
+		 * todo but there is a window when it fails, due to
+		 * the pid transfer logic in de_thread.
+		 *
+		 * So we perform the straight forward test of seeing
+		 * if the pid we have found is the pid of a thread
+		 * group leader, and don't worry if the task we have
+		 * found doesn't happen to be a thread group leader.
+		 * As we don't care in the case of readdir.
+		 */
+		if (!iter.task || !has_group_leader_pid(iter.task)) {
+			iter.tgid += 1;
+			goto retry;
+		}
+		get_task_struct(iter.task);
+	}
+	rcu_read_unlock();
+	return iter;
+}
+
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
+
+/* for the /proc/ directory itself, after non-process stuff has been done */
+int proc_pid_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct tgid_iter iter;
+	struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
+	loff_t pos = ctx->pos;
+
+	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
+		return 0;
+
+	if (pos == TGID_OFFSET - 2) {
+		struct inode *inode = d_inode(ns->proc_self);
+		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+			return 0;
+		ctx->pos = pos = pos + 1;
+	}
+	if (pos == TGID_OFFSET - 1) {
+		struct inode *inode = d_inode(ns->proc_thread_self);
+		if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+			return 0;
+		ctx->pos = pos = pos + 1;
+	}
+	iter.tgid = pos - TGID_OFFSET;
+	iter.task = NULL;
+	for (iter = next_tgid(ns, iter);
+	     iter.task;
+	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
+		char name[PROC_NUMBUF];
+		int len;
+		if (!has_pid_permissions(ns, iter.task, 2))
+			continue;
+
+		len = snprintf(name, sizeof(name), "%d", iter.tgid);
+		ctx->pos = iter.tgid + TGID_OFFSET;
+		if (!proc_fill_cache(file, ctx, name, len,
+				     proc_pid_instantiate, iter.task, NULL)) {
+			put_task_struct(iter.task);
+			return 0;
+		}
+	}
+	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
+	return 0;
+}
+
+/*
+ * Tasks
+ */
+static const struct pid_entry tid_base_stuff[] = {
+	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+	DIR("ns",	 S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+#ifdef CONFIG_NET
+	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
+#endif
+	REG("environ",   S_IRUSR, proc_environ_operations),
+	ONE("auxv",      S_IRUSR, proc_pid_auxv),
+	ONE("status",    S_IRUGO, proc_pid_status),
+	ONE("personality", S_IRUSR, proc_pid_personality),
+	ONE("limits",	 S_IRUGO, proc_pid_limits),
+#ifdef CONFIG_SCHED_DEBUG
+	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+	ONE("syscall",   S_IRUSR, proc_pid_syscall),
+#endif
+	ONE("cmdline",   S_IRUGO, proc_pid_cmdline),
+	ONE("stat",      S_IRUGO, proc_tid_stat),
+	ONE("statm",     S_IRUGO, proc_pid_statm),
+	REG("maps",      S_IRUGO, proc_tid_maps_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	REG("children",  S_IRUGO, proc_tid_children_operations),
+#endif
+#ifdef CONFIG_NUMA
+	REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
+#endif
+	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
+	LNK("cwd",       proc_cwd_link),
+	LNK("root",      proc_root_link),
+	LNK("exe",       proc_exe_link),
+	REG("mounts",    S_IRUGO, proc_mounts_operations),
+	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
+#ifdef CONFIG_PROC_PAGE_MONITOR
+	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+	REG("smaps",     S_IRUGO, proc_tid_smaps_operations),
+	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+#endif
+#ifdef CONFIG_SECURITY
+	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
+#endif
+#ifdef CONFIG_KALLSYMS
+	ONE("wchan",     S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+	ONE("stack",      S_IRUSR, proc_pid_stack),
+#endif
+#ifdef CONFIG_SCHEDSTATS
+	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
+#endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency",  S_IRUGO, proc_lstats_operations),
+#endif
+#ifdef CONFIG_PROC_PID_CPUSET
+	ONE("cpuset",    S_IRUGO, proc_cpuset_show),
+#endif
+#ifdef CONFIG_CGROUPS
+	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
+#endif
+	ONE("oom_score", S_IRUGO, proc_oom_score),
+	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+#ifdef CONFIG_AUDITSYSCALL
+	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
+	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
+#endif
+#ifdef CONFIG_FAULT_INJECTION
+	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	ONE("io",	S_IRUSR, proc_tid_io_accounting),
+#endif
+#ifdef CONFIG_HARDWALL
+	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
+#endif
+#ifdef CONFIG_USER_NS
+	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
+	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
+	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
+#endif
+};
+
+static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
+{
+	return proc_pident_readdir(file, ctx,
+				   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+}
+
+static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	return proc_pident_lookup(dir, dentry,
+				  tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+}
+
+static const struct file_operations proc_tid_base_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_tid_base_readdir,
+	.llseek		= default_llseek,
+};
+
+static const struct inode_operations proc_tid_base_inode_operations = {
+	.lookup		= proc_tid_base_lookup,
+	.getattr	= pid_getattr,
+	.setattr	= proc_setattr,
+};
+
+static int proc_task_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
+{
+	struct inode *inode;
+	inode = proc_pid_make_inode(dir->i_sb, task);
+
+	if (!inode)
+		goto out;
+	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
+	inode->i_op = &proc_tid_base_inode_operations;
+	inode->i_fop = &proc_tid_base_operations;
+	inode->i_flags|=S_IMMUTABLE;
+
+	set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
+						  ARRAY_SIZE(tid_base_stuff)));
+
+	d_set_d_op(dentry, &pid_dentry_operations);
+
+	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, 0))
+		return 0;
+out:
+	return -ENOENT;
+}
+
+static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+{
+	int result = -ENOENT;
+	struct task_struct *task;
+	struct task_struct *leader = get_proc_task(dir);
+	unsigned tid;
+	struct pid_namespace *ns;
+
+	if (!leader)
+		goto out_no_task;
+
+	tid = name_to_int(&dentry->d_name);
+	if (tid == ~0U)
+		goto out;
+
+	ns = dentry->d_sb->s_fs_info;
+	rcu_read_lock();
+	task = find_task_by_pid_ns(tid, ns);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+	if (!task)
+		goto out;
+	if (!same_thread_group(leader, task))
+		goto out_drop_task;
+
+	result = proc_task_instantiate(dir, dentry, task, NULL);
+out_drop_task:
+	put_task_struct(task);
+out:
+	put_task_struct(leader);
+out_no_task:
+	return ERR_PTR(result);
+}
+
+/*
+ * Find the first tid of a thread group to return to user space.
+ *
+ * Usually this is just the thread group leader, but if the users
+ * buffer was too small or there was a seek into the middle of the
+ * directory we have more work todo.
+ *
+ * In the case of a short read we start with find_task_by_pid.
+ *
+ * In the case of a seek we start with the leader and walk nr
+ * threads past it.
+ */
+static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
+					struct pid_namespace *ns)
+{
+	struct task_struct *pos, *task;
+	unsigned long nr = f_pos;
+
+	if (nr != f_pos)	/* 32bit overflow? */
+		return NULL;
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		goto fail;
+
+	/* Attempt to start with the tid of a thread */
+	if (tid && nr) {
+		pos = find_task_by_pid_ns(tid, ns);
+		if (pos && same_thread_group(pos, task))
+			goto found;
+	}
+
+	/* If nr exceeds the number of threads there is nothing todo */
+	if (nr >= get_nr_threads(task))
+		goto fail;
+
+	/* If we haven't found our starting place yet start
+	 * with the leader and walk nr threads forward.
+	 */
+	pos = task = task->group_leader;
+	do {
+		if (!nr--)
+			goto found;
+	} while_each_thread(task, pos);
+fail:
+	pos = NULL;
+	goto out;
+found:
+	get_task_struct(pos);
+out:
+	rcu_read_unlock();
+	return pos;
+}
+
+/*
+ * Find the next thread in the thread list.
+ * Return NULL if there is an error or no next thread.
+ *
+ * The reference to the input task_struct is released.
+ */
+static struct task_struct *next_tid(struct task_struct *start)
+{
+	struct task_struct *pos = NULL;
+	rcu_read_lock();
+	if (pid_alive(start)) {
+		pos = next_thread(start);
+		if (thread_group_leader(pos))
+			pos = NULL;
+		else
+			get_task_struct(pos);
+	}
+	rcu_read_unlock();
+	put_task_struct(start);
+	return pos;
+}
+
+/* for the /proc/TGID/task/ directories */
+static int proc_task_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *task;
+	struct pid_namespace *ns;
+	int tid;
+
+	if (proc_inode_is_dead(inode))
+		return -ENOENT;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	/* f_version caches the tgid value that the last readdir call couldn't
+	 * return. lseek aka telldir automagically resets f_version to 0.
+	 */
+	ns = inode->i_sb->s_fs_info;
+	tid = (int)file->f_version;
+	file->f_version = 0;
+	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
+	     task;
+	     task = next_tid(task), ctx->pos++) {
+		char name[PROC_NUMBUF];
+		int len;
+		tid = task_pid_nr_ns(task, ns);
+		len = snprintf(name, sizeof(name), "%d", tid);
+		if (!proc_fill_cache(file, ctx, name, len,
+				proc_task_instantiate, task, NULL)) {
+			/* returning this tgid failed, save it as the first
+			 * pid for the next readir call */
+			file->f_version = (u64)tid;
+			put_task_struct(task);
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	struct task_struct *p = get_proc_task(inode);
+	generic_fillattr(inode, stat);
+
+	if (p) {
+		stat->nlink += get_nr_threads(p);
+		put_task_struct(p);
+	}
+
+	return 0;
+}
+
+static const struct inode_operations proc_task_inode_operations = {
+	.lookup		= proc_task_lookup,
+	.getattr	= proc_task_getattr,
+	.setattr	= proc_setattr,
+	.permission	= proc_pid_permission,
+};
+
+static const struct file_operations proc_task_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_task_readdir,
+	.llseek		= default_llseek,
+};
diff --git a/kernel/fs/proc/cmdline.c b/kernel/fs/proc/cmdline.c
new file mode 100644
index 000000000..cbd82dff7
--- /dev/null
+++ b/kernel/fs/proc/cmdline.c
@@ -0,0 +1,29 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static int cmdline_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%s\n", saved_command_line);
+	return 0;
+}
+
+static int cmdline_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cmdline_proc_show, NULL);
+}
+
+static const struct file_operations cmdline_proc_fops = {
+	.open		= cmdline_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_cmdline_init(void)
+{
+	proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+	return 0;
+}
+fs_initcall(proc_cmdline_init);
diff --git a/kernel/fs/proc/consoles.c b/kernel/fs/proc/consoles.c
new file mode 100644
index 000000000..290ba85cb
--- /dev/null
+++ b/kernel/fs/proc/consoles.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2010 Werner Fink, Jiri Slaby
+ *
+ * Licensed under GPLv2
+ */
+
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/tty_driver.h>
+
+/*
+ * This is handler for /proc/consoles
+ */
+static int show_console_dev(struct seq_file *m, void *v)
+{
+	static const struct {
+		short flag;
+		char name;
+	} con_flags[] = {
+		{ CON_ENABLED,		'E' },
+		{ CON_CONSDEV,		'C' },
+		{ CON_BOOT,		'B' },
+		{ CON_PRINTBUFFER,	'p' },
+		{ CON_BRL,		'b' },
+		{ CON_ANYTIME,		'a' },
+	};
+	char flags[ARRAY_SIZE(con_flags) + 1];
+	struct console *con = v;
+	unsigned int a;
+	dev_t dev = 0;
+
+	if (con->device) {
+		const struct tty_driver *driver;
+		int index;
+		driver = con->device(con, &index);
+		if (driver) {
+			dev = MKDEV(driver->major, driver->minor_start);
+			dev += index;
+		}
+	}
+
+	for (a = 0; a < ARRAY_SIZE(con_flags); a++)
+		flags[a] = (con->flags & con_flags[a].flag) ?
+			con_flags[a].name : ' ';
+	flags[a] = 0;
+
+	seq_setwidth(m, 21 - 1);
+	seq_printf(m, "%s%d", con->name, con->index);
+	seq_pad(m, ' ');
+	seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
+			con->write ? 'W' : '-', con->unblank ? 'U' : '-',
+			flags);
+	if (dev)
+		seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
+
+	seq_printf(m, "\n");
+
+	return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	struct console *con;
+	loff_t off = 0;
+
+	console_lock();
+	for_each_console(con)
+		if (off++ == *pos)
+			break;
+
+	return con;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct console *con = v;
+	++*pos;
+	return con->next;
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+	console_unlock();
+}
+
+static const struct seq_operations consoles_op = {
+	.start	= c_start,
+	.next	= c_next,
+	.stop	= c_stop,
+	.show	= show_console_dev
+};
+
+static int consoles_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &consoles_op);
+}
+
+static const struct file_operations proc_consoles_operations = {
+	.open		= consoles_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_consoles_init(void)
+{
+	proc_create("consoles", 0, NULL, &proc_consoles_operations);
+	return 0;
+}
+fs_initcall(proc_consoles_init);
diff --git a/kernel/fs/proc/cpuinfo.c b/kernel/fs/proc/cpuinfo.c
new file mode 100644
index 000000000..06f4d31e0
--- /dev/null
+++ b/kernel/fs/proc/cpuinfo.c
@@ -0,0 +1,24 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+extern const struct seq_operations cpuinfo_op;
+static int cpuinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpuinfo_op);
+}
+
+static const struct file_operations proc_cpuinfo_operations = {
+	.open		= cpuinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_cpuinfo_init(void)
+{
+	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+	return 0;
+}
+fs_initcall(proc_cpuinfo_init);
diff --git a/kernel/fs/proc/devices.c b/kernel/fs/proc/devices.c
new file mode 100644
index 000000000..50493edc3
--- /dev/null
+++ b/kernel/fs/proc/devices.c
@@ -0,0 +1,70 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static int devinfo_show(struct seq_file *f, void *v)
+{
+	int i = *(loff_t *) v;
+
+	if (i < CHRDEV_MAJOR_HASH_SIZE) {
+		if (i == 0)
+			seq_puts(f, "Character devices:\n");
+		chrdev_show(f, i);
+	}
+#ifdef CONFIG_BLOCK
+	else {
+		i -= CHRDEV_MAJOR_HASH_SIZE;
+		if (i == 0)
+			seq_puts(f, "\nBlock devices:\n");
+		blkdev_show(f, i);
+	}
+#endif
+	return 0;
+}
+
+static void *devinfo_start(struct seq_file *f, loff_t *pos)
+{
+	if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+		return pos;
+	return NULL;
+}
+
+static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+		return NULL;
+	return pos;
+}
+
+static void devinfo_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+
+static const struct seq_operations devinfo_ops = {
+	.start = devinfo_start,
+	.next  = devinfo_next,
+	.stop  = devinfo_stop,
+	.show  = devinfo_show
+};
+
+static int devinfo_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &devinfo_ops);
+}
+
+static const struct file_operations proc_devinfo_operations = {
+	.open		= devinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_devices_init(void)
+{
+	proc_create("devices", 0, NULL, &proc_devinfo_operations);
+	return 0;
+}
+fs_initcall(proc_devices_init);
diff --git a/kernel/fs/proc/fd.c b/kernel/fs/proc/fd.c
new file mode 100644
index 000000000..6e5fcd007
--- /dev/null
+++ b/kernel/fs/proc/fd.c
@@ -0,0 +1,357 @@
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/dcache.h>
+#include <linux/path.h>
+#include <linux/fdtable.h>
+#include <linux/namei.h>
+#include <linux/pid.h>
+#include <linux/security.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+
+#include <linux/proc_fs.h>
+
+#include "../mount.h"
+#include "internal.h"
+#include "fd.h"
+
+static int seq_show(struct seq_file *m, void *v)
+{
+	struct files_struct *files = NULL;
+	int f_flags = 0, ret = -ENOENT;
+	struct file *file = NULL;
+	struct task_struct *task;
+
+	task = get_proc_task(m->private);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
+
+	if (files) {
+		int fd = proc_fd(m->private);
+
+		spin_lock(&files->file_lock);
+		file = fcheck_files(files, fd);
+		if (file) {
+			struct fdtable *fdt = files_fdtable(files);
+
+			f_flags = file->f_flags;
+			if (close_on_exec(fd, fdt))
+				f_flags |= O_CLOEXEC;
+
+			get_file(file);
+			ret = 0;
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+
+	if (ret)
+		return ret;
+
+	seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+		   (long long)file->f_pos, f_flags,
+		   real_mount(file->f_path.mnt)->mnt_id);
+
+	show_fd_locks(m, file, files);
+	if (seq_has_overflowed(m))
+		goto out;
+
+	if (file->f_op->show_fdinfo)
+		file->f_op->show_fdinfo(m, file);
+
+out:
+	fput(file);
+	return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, seq_show, inode);
+}
+
+static const struct file_operations proc_fdinfo_file_operations = {
+	.open		= seq_fdinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct files_struct *files;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct inode *inode;
+	int fd;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = d_inode(dentry);
+	task = get_proc_task(inode);
+	fd = proc_fd(inode);
+
+	if (task) {
+		files = get_files_struct(task);
+		if (files) {
+			struct file *file;
+
+			rcu_read_lock();
+			file = fcheck_files(files, fd);
+			if (file) {
+				unsigned f_mode = file->f_mode;
+
+				rcu_read_unlock();
+				put_files_struct(files);
+
+				if (task_dumpable(task)) {
+					rcu_read_lock();
+					cred = __task_cred(task);
+					inode->i_uid = cred->euid;
+					inode->i_gid = cred->egid;
+					rcu_read_unlock();
+				} else {
+					inode->i_uid = GLOBAL_ROOT_UID;
+					inode->i_gid = GLOBAL_ROOT_GID;
+				}
+
+				if (S_ISLNK(inode->i_mode)) {
+					unsigned i_mode = S_IFLNK;
+					if (f_mode & FMODE_READ)
+						i_mode |= S_IRUSR | S_IXUSR;
+					if (f_mode & FMODE_WRITE)
+						i_mode |= S_IWUSR | S_IXUSR;
+					inode->i_mode = i_mode;
+				}
+
+				security_task_to_inode(task, inode);
+				put_task_struct(task);
+				return 1;
+			}
+			rcu_read_unlock();
+			put_files_struct(files);
+		}
+		put_task_struct(task);
+	}
+	return 0;
+}
+
+static const struct dentry_operations tid_fd_dentry_operations = {
+	.d_revalidate	= tid_fd_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_fd_link(struct dentry *dentry, struct path *path)
+{
+	struct files_struct *files = NULL;
+	struct task_struct *task;
+	int ret = -ENOENT;
+
+	task = get_proc_task(d_inode(dentry));
+	if (task) {
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
+
+	if (files) {
+		int fd = proc_fd(d_inode(dentry));
+		struct file *fd_file;
+
+		spin_lock(&files->file_lock);
+		fd_file = fcheck_files(files, fd);
+		if (fd_file) {
+			*path = fd_file->f_path;
+			path_get(&fd_file->f_path);
+			ret = 0;
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+
+	return ret;
+}
+
+static int
+proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
+		    struct task_struct *task, const void *ptr)
+{
+	unsigned fd = (unsigned long)ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	ei->fd = fd;
+
+	inode->i_mode = S_IFLNK;
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+
+	ei->op.proc_get_link = proc_fd_link;
+
+	d_set_d_op(dentry, &tid_fd_dentry_operations);
+	d_add(dentry, inode);
+
+	/* Close the race of the process dying before we return the dentry */
+	if (tid_fd_revalidate(dentry, 0))
+		return 0;
+ out:
+	return -ENOENT;
+}
+
+static struct dentry *proc_lookupfd_common(struct inode *dir,
+					   struct dentry *dentry,
+					   instantiate_t instantiate)
+{
+	struct task_struct *task = get_proc_task(dir);
+	int result = -ENOENT;
+	unsigned fd = name_to_int(&dentry->d_name);
+
+	if (!task)
+		goto out_no_task;
+	if (fd == ~0U)
+		goto out;
+
+	result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
+out:
+	put_task_struct(task);
+out_no_task:
+	return ERR_PTR(result);
+}
+
+static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+			      instantiate_t instantiate)
+{
+	struct task_struct *p = get_proc_task(file_inode(file));
+	struct files_struct *files;
+	unsigned int fd;
+
+	if (!p)
+		return -ENOENT;
+
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+	files = get_files_struct(p);
+	if (!files)
+		goto out;
+
+	rcu_read_lock();
+	for (fd = ctx->pos - 2;
+	     fd < files_fdtable(files)->max_fds;
+	     fd++, ctx->pos++) {
+		char name[PROC_NUMBUF];
+		int len;
+
+		if (!fcheck_files(files, fd))
+			continue;
+		rcu_read_unlock();
+
+		len = snprintf(name, sizeof(name), "%d", fd);
+		if (!proc_fill_cache(file, ctx,
+				     name, len, instantiate, p,
+				     (void *)(unsigned long)fd))
+			goto out_fd_loop;
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+out_fd_loop:
+	put_files_struct(files);
+out:
+	put_task_struct(p);
+	return 0;
+}
+
+static int proc_readfd(struct file *file, struct dir_context *ctx)
+{
+	return proc_readfd_common(file, ctx, proc_fd_instantiate);
+}
+
+const struct file_operations proc_fd_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_readfd,
+	.llseek		= default_llseek,
+};
+
+static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
+				    unsigned int flags)
+{
+	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
+}
+
+/*
+ * /proc/pid/fd needs a special permission handler so that a process can still
+ * access /proc/self/fd after it has executed a setuid().
+ */
+int proc_fd_permission(struct inode *inode, int mask)
+{
+	int rv = generic_permission(inode, mask);
+	if (rv == 0)
+		return 0;
+	if (task_tgid(current) == proc_pid(inode))
+		rv = 0;
+	return rv;
+}
+
+const struct inode_operations proc_fd_inode_operations = {
+	.lookup		= proc_lookupfd,
+	.permission	= proc_fd_permission,
+	.setattr	= proc_setattr,
+};
+
+static int
+proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
+			struct task_struct *task, const void *ptr)
+{
+	unsigned fd = (unsigned long)ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	ei->fd = fd;
+
+	inode->i_mode = S_IFREG | S_IRUSR;
+	inode->i_fop = &proc_fdinfo_file_operations;
+
+	d_set_d_op(dentry, &tid_fd_dentry_operations);
+	d_add(dentry, inode);
+
+	/* Close the race of the process dying before we return the dentry */
+	if (tid_fd_revalidate(dentry, 0))
+		return 0;
+ out:
+	return -ENOENT;
+}
+
+static struct dentry *
+proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
+}
+
+static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
+{
+	return proc_readfd_common(file, ctx,
+				  proc_fdinfo_instantiate);
+}
+
+const struct inode_operations proc_fdinfo_inode_operations = {
+	.lookup		= proc_lookupfdinfo,
+	.setattr	= proc_setattr,
+};
+
+const struct file_operations proc_fdinfo_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_readfdinfo,
+	.llseek		= default_llseek,
+};
diff --git a/kernel/fs/proc/fd.h b/kernel/fs/proc/fd.h
new file mode 100644
index 000000000..7c047f256
--- /dev/null
+++ b/kernel/fs/proc/fd.h
@@ -0,0 +1,19 @@
+#ifndef __PROCFS_FD_H__
+#define __PROCFS_FD_H__
+
+#include <linux/fs.h>
+
+extern const struct file_operations proc_fd_operations;
+extern const struct inode_operations proc_fd_inode_operations;
+
+extern const struct file_operations proc_fdinfo_operations;
+extern const struct inode_operations proc_fdinfo_inode_operations;
+
+extern int proc_fd_permission(struct inode *inode, int mask);
+
+static inline int proc_fd(struct inode *inode)
+{
+	return PROC_I(inode)->fd;
+}
+
+#endif /* __PROCFS_FD_H__ */
diff --git a/kernel/fs/proc/generic.c b/kernel/fs/proc/generic.c
new file mode 100644
index 000000000..e5dee5c31
--- /dev/null
+++ b/kernel/fs/proc/generic.c
@@ -0,0 +1,645 @@
+/*
+ * proc/fs/generic.c --- generic routines for the proc-fs
+ *
+ * This file contains generic proc-fs routines for handling
+ * directories and files.
+ * 
+ * Copyright (C) 1991, 1992 Linus Torvalds.
+ * Copyright (C) 1997 Theodore Ts'o
+ */
+
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/printk.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/idr.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <asm/uaccess.h>
+
+#include "internal.h"
+
+static DEFINE_SPINLOCK(proc_subdir_lock);
+
+static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
+{
+	if (len < de->namelen)
+		return -1;
+	if (len > de->namelen)
+		return 1;
+
+	return memcmp(name, de->name, len);
+}
+
+static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
+{
+	return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
+			     subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
+{
+	return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry,
+			     subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+					      const char *name,
+					      unsigned int len)
+{
+	struct rb_node *node = dir->subdir.rb_node;
+
+	while (node) {
+		struct proc_dir_entry *de = container_of(node,
+							 struct proc_dir_entry,
+							 subdir_node);
+		int result = proc_match(len, name, de);
+
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return de;
+	}
+	return NULL;
+}
+
+static bool pde_subdir_insert(struct proc_dir_entry *dir,
+			      struct proc_dir_entry *de)
+{
+	struct rb_root *root = &dir->subdir;
+	struct rb_node **new = &root->rb_node, *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct proc_dir_entry *this =
+			container_of(*new, struct proc_dir_entry, subdir_node);
+		int result = proc_match(de->namelen, de->name, this);
+
+		parent = *new;
+		if (result < 0)
+			new = &(*new)->rb_left;
+		else if (result > 0)
+			new = &(*new)->rb_right;
+		else
+			return false;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&de->subdir_node, parent, new);
+	rb_insert_color(&de->subdir_node, root);
+	return true;
+}
+
+static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct proc_dir_entry *de = PDE(inode);
+	int error;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	proc_set_user(de, inode->i_uid, inode->i_gid);
+	de->mode = inode->i_mode;
+	return 0;
+}
+
+static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	struct proc_dir_entry *de = PDE(inode);
+	if (de && de->nlink)
+		set_nlink(inode, de->nlink);
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+static const struct inode_operations proc_file_inode_operations = {
+	.setattr	= proc_notify_change,
+};
+
+/*
+ * This function parses a name such as "tty/driver/serial", and
+ * returns the struct proc_dir_entry for "/proc/tty/driver", and
+ * returns "serial" in residual.
+ */
+static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+			     const char **residual)
+{
+	const char     		*cp = name, *next;
+	struct proc_dir_entry	*de;
+	unsigned int		len;
+
+	de = *ret;
+	if (!de)
+		de = &proc_root;
+
+	while (1) {
+		next = strchr(cp, '/');
+		if (!next)
+			break;
+
+		len = next - cp;
+		de = pde_subdir_find(de, cp, len);
+		if (!de) {
+			WARN(1, "name '%s'\n", name);
+			return -ENOENT;
+		}
+		cp += len + 1;
+	}
+	*residual = cp;
+	*ret = de;
+	return 0;
+}
+
+static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+			   const char **residual)
+{
+	int rv;
+
+	spin_lock(&proc_subdir_lock);
+	rv = __xlate_proc_name(name, ret, residual);
+	spin_unlock(&proc_subdir_lock);
+	return rv;
+}
+
+static DEFINE_IDA(proc_inum_ida);
+static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
+
+#define PROC_DYNAMIC_FIRST 0xF0000000U
+
+/*
+ * Return an inode number between PROC_DYNAMIC_FIRST and
+ * 0xffffffff, or zero on failure.
+ */
+int proc_alloc_inum(unsigned int *inum)
+{
+	unsigned int i;
+	int error;
+
+retry:
+	if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock_irq(&proc_inum_lock);
+	error = ida_get_new(&proc_inum_ida, &i);
+	spin_unlock_irq(&proc_inum_lock);
+	if (error == -EAGAIN)
+		goto retry;
+	else if (error)
+		return error;
+
+	if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
+		spin_lock_irq(&proc_inum_lock);
+		ida_remove(&proc_inum_ida, i);
+		spin_unlock_irq(&proc_inum_lock);
+		return -ENOSPC;
+	}
+	*inum = PROC_DYNAMIC_FIRST + i;
+	return 0;
+}
+
+void proc_free_inum(unsigned int inum)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&proc_inum_lock, flags);
+	ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
+	spin_unlock_irqrestore(&proc_inum_lock, flags);
+}
+
+/*
+ * Don't create negative dentries here, return -ENOENT by hand
+ * instead.
+ */
+struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode;
+
+	spin_lock(&proc_subdir_lock);
+	de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
+	if (de) {
+		pde_get(de);
+		spin_unlock(&proc_subdir_lock);
+		inode = proc_get_inode(dir->i_sb, de);
+		if (!inode)
+			return ERR_PTR(-ENOMEM);
+		d_set_d_op(dentry, &simple_dentry_operations);
+		d_add(dentry, inode);
+		return NULL;
+	}
+	spin_unlock(&proc_subdir_lock);
+	return ERR_PTR(-ENOENT);
+}
+
+struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
+		unsigned int flags)
+{
+	return proc_lookup_de(PDE(dir), dir, dentry);
+}
+
+/*
+ * This returns non-zero if at EOF, so that the /proc
+ * root directory can use this and check if it should
+ * continue with the <pid> entries..
+ *
+ * Note that the VFS-layer doesn't care about the return
+ * value of the readdir() call, as long as it's non-negative
+ * for success..
+ */
+int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
+		    struct dir_context *ctx)
+{
+	int i;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	spin_lock(&proc_subdir_lock);
+	de = pde_subdir_first(de);
+	i = ctx->pos - 2;
+	for (;;) {
+		if (!de) {
+			spin_unlock(&proc_subdir_lock);
+			return 0;
+		}
+		if (!i)
+			break;
+		de = pde_subdir_next(de);
+		i--;
+	}
+
+	do {
+		struct proc_dir_entry *next;
+		pde_get(de);
+		spin_unlock(&proc_subdir_lock);
+		if (!dir_emit(ctx, de->name, de->namelen,
+			    de->low_ino, de->mode >> 12)) {
+			pde_put(de);
+			return 0;
+		}
+		spin_lock(&proc_subdir_lock);
+		ctx->pos++;
+		next = pde_subdir_next(de);
+		pde_put(de);
+		de = next;
+	} while (de);
+	spin_unlock(&proc_subdir_lock);
+	return 1;
+}
+
+int proc_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+
+	return proc_readdir_de(PDE(inode), file, ctx);
+}
+
+/*
+ * These are the generic /proc directory operations. They
+ * use the in-memory "struct proc_dir_entry" tree to parse
+ * the /proc directory.
+ */
+static const struct file_operations proc_dir_operations = {
+	.llseek			= generic_file_llseek,
+	.read			= generic_read_dir,
+	.iterate		= proc_readdir,
+};
+
+/*
+ * proc directories can do almost nothing..
+ */
+static const struct inode_operations proc_dir_inode_operations = {
+	.lookup		= proc_lookup,
+	.getattr	= proc_getattr,
+	.setattr	= proc_notify_change,
+};
+
+static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
+{
+	int ret;
+
+	ret = proc_alloc_inum(&dp->low_ino);
+	if (ret)
+		return ret;
+
+	spin_lock(&proc_subdir_lock);
+	dp->parent = dir;
+	if (pde_subdir_insert(dir, dp) == false) {
+		WARN(1, "proc_dir_entry '%s/%s' already registered\n",
+		     dir->name, dp->name);
+		spin_unlock(&proc_subdir_lock);
+		proc_free_inum(dp->low_ino);
+		return -EEXIST;
+	}
+	spin_unlock(&proc_subdir_lock);
+
+	return 0;
+}
+
+static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
+					  const char *name,
+					  umode_t mode,
+					  nlink_t nlink)
+{
+	struct proc_dir_entry *ent = NULL;
+	const char *fn;
+	struct qstr qstr;
+
+	if (xlate_proc_name(name, parent, &fn) != 0)
+		goto out;
+	qstr.name = fn;
+	qstr.len = strlen(fn);
+	if (qstr.len == 0 || qstr.len >= 256) {
+		WARN(1, "name len %u\n", qstr.len);
+		return NULL;
+	}
+	if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
+		WARN(1, "create '/proc/%s' by hand\n", qstr.name);
+		return NULL;
+	}
+	if (is_empty_pde(*parent)) {
+		WARN(1, "attempt to add to permanently empty directory");
+		return NULL;
+	}
+
+	ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
+	if (!ent)
+		goto out;
+
+	memcpy(ent->name, fn, qstr.len + 1);
+	ent->namelen = qstr.len;
+	ent->mode = mode;
+	ent->nlink = nlink;
+	ent->subdir = RB_ROOT;
+	atomic_set(&ent->count, 1);
+	spin_lock_init(&ent->pde_unload_lock);
+	INIT_LIST_HEAD(&ent->pde_openers);
+out:
+	return ent;
+}
+
+struct proc_dir_entry *proc_symlink(const char *name,
+		struct proc_dir_entry *parent, const char *dest)
+{
+	struct proc_dir_entry *ent;
+
+	ent = __proc_create(&parent, name,
+			  (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
+
+	if (ent) {
+		ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
+		if (ent->data) {
+			strcpy((char*)ent->data,dest);
+			ent->proc_iops = &proc_link_inode_operations;
+			if (proc_register(parent, ent) < 0) {
+				kfree(ent->data);
+				kfree(ent);
+				ent = NULL;
+			}
+		} else {
+			kfree(ent);
+			ent = NULL;
+		}
+	}
+	return ent;
+}
+EXPORT_SYMBOL(proc_symlink);
+
+struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
+		struct proc_dir_entry *parent, void *data)
+{
+	struct proc_dir_entry *ent;
+
+	if (mode == 0)
+		mode = S_IRUGO | S_IXUGO;
+
+	ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
+	if (ent) {
+		ent->data = data;
+		ent->proc_fops = &proc_dir_operations;
+		ent->proc_iops = &proc_dir_inode_operations;
+		parent->nlink++;
+		if (proc_register(parent, ent) < 0) {
+			kfree(ent);
+			parent->nlink--;
+			ent = NULL;
+		}
+	}
+	return ent;
+}
+EXPORT_SYMBOL_GPL(proc_mkdir_data);
+
+struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
+				       struct proc_dir_entry *parent)
+{
+	return proc_mkdir_data(name, mode, parent, NULL);
+}
+EXPORT_SYMBOL(proc_mkdir_mode);
+
+struct proc_dir_entry *proc_mkdir(const char *name,
+		struct proc_dir_entry *parent)
+{
+	return proc_mkdir_data(name, 0, parent, NULL);
+}
+EXPORT_SYMBOL(proc_mkdir);
+
+struct proc_dir_entry *proc_create_mount_point(const char *name)
+{
+	umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	struct proc_dir_entry *ent, *parent = NULL;
+
+	ent = __proc_create(&parent, name, mode, 2);
+	if (ent) {
+		ent->data = NULL;
+		ent->proc_fops = NULL;
+		ent->proc_iops = NULL;
+		if (proc_register(parent, ent) < 0) {
+			kfree(ent);
+			parent->nlink--;
+			ent = NULL;
+		}
+	}
+	return ent;
+}
+
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
+					struct proc_dir_entry *parent,
+					const struct file_operations *proc_fops,
+					void *data)
+{
+	struct proc_dir_entry *pde;
+	if ((mode & S_IFMT) == 0)
+		mode |= S_IFREG;
+
+	if (!S_ISREG(mode)) {
+		WARN_ON(1);	/* use proc_mkdir() */
+		return NULL;
+	}
+
+	BUG_ON(proc_fops == NULL);
+
+	if ((mode & S_IALLUGO) == 0)
+		mode |= S_IRUGO;
+	pde = __proc_create(&parent, name, mode, 1);
+	if (!pde)
+		goto out;
+	pde->proc_fops = proc_fops;
+	pde->data = data;
+	pde->proc_iops = &proc_file_inode_operations;
+	if (proc_register(parent, pde) < 0)
+		goto out_free;
+	return pde;
+out_free:
+	kfree(pde);
+out:
+	return NULL;
+}
+EXPORT_SYMBOL(proc_create_data);
+ 
+void proc_set_size(struct proc_dir_entry *de, loff_t size)
+{
+	de->size = size;
+}
+EXPORT_SYMBOL(proc_set_size);
+
+void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
+{
+	de->uid = uid;
+	de->gid = gid;
+}
+EXPORT_SYMBOL(proc_set_user);
+
+static void free_proc_entry(struct proc_dir_entry *de)
+{
+	proc_free_inum(de->low_ino);
+
+	if (S_ISLNK(de->mode))
+		kfree(de->data);
+	kfree(de);
+}
+
+void pde_put(struct proc_dir_entry *pde)
+{
+	if (atomic_dec_and_test(&pde->count))
+		free_proc_entry(pde);
+}
+
+/*
+ * Remove a /proc entry and free it if it's not currently in use.
+ */
+void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{
+	struct proc_dir_entry *de = NULL;
+	const char *fn = name;
+	unsigned int len;
+
+	spin_lock(&proc_subdir_lock);
+	if (__xlate_proc_name(name, &parent, &fn) != 0) {
+		spin_unlock(&proc_subdir_lock);
+		return;
+	}
+	len = strlen(fn);
+
+	de = pde_subdir_find(parent, fn, len);
+	if (de)
+		rb_erase(&de->subdir_node, &parent->subdir);
+	spin_unlock(&proc_subdir_lock);
+	if (!de) {
+		WARN(1, "name '%s'\n", name);
+		return;
+	}
+
+	proc_entry_rundown(de);
+
+	if (S_ISDIR(de->mode))
+		parent->nlink--;
+	de->nlink = 0;
+	WARN(pde_subdir_first(de),
+	     "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
+	     __func__, de->parent->name, de->name, pde_subdir_first(de)->name);
+	pde_put(de);
+}
+EXPORT_SYMBOL(remove_proc_entry);
+
+int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
+{
+	struct proc_dir_entry *root = NULL, *de, *next;
+	const char *fn = name;
+	unsigned int len;
+
+	spin_lock(&proc_subdir_lock);
+	if (__xlate_proc_name(name, &parent, &fn) != 0) {
+		spin_unlock(&proc_subdir_lock);
+		return -ENOENT;
+	}
+	len = strlen(fn);
+
+	root = pde_subdir_find(parent, fn, len);
+	if (!root) {
+		spin_unlock(&proc_subdir_lock);
+		return -ENOENT;
+	}
+	rb_erase(&root->subdir_node, &parent->subdir);
+
+	de = root;
+	while (1) {
+		next = pde_subdir_first(de);
+		if (next) {
+			rb_erase(&next->subdir_node, &de->subdir);
+			de = next;
+			continue;
+		}
+		spin_unlock(&proc_subdir_lock);
+
+		proc_entry_rundown(de);
+		next = de->parent;
+		if (S_ISDIR(de->mode))
+			next->nlink--;
+		de->nlink = 0;
+		if (de == root)
+			break;
+		pde_put(de);
+
+		spin_lock(&proc_subdir_lock);
+		de = next;
+	}
+	pde_put(root);
+	return 0;
+}
+EXPORT_SYMBOL(remove_proc_subtree);
+
+void *proc_get_parent_data(const struct inode *inode)
+{
+	struct proc_dir_entry *de = PDE(inode);
+	return de->parent->data;
+}
+EXPORT_SYMBOL_GPL(proc_get_parent_data);
+
+void proc_remove(struct proc_dir_entry *de)
+{
+	if (de)
+		remove_proc_subtree(de->name, de->parent);
+}
+EXPORT_SYMBOL(proc_remove);
+
+void *PDE_DATA(const struct inode *inode)
+{
+	return __PDE_DATA(inode);
+}
+EXPORT_SYMBOL(PDE_DATA);
diff --git a/kernel/fs/proc/inode.c b/kernel/fs/proc/inode.c
new file mode 100644
index 000000000..e3eb55246
--- /dev/null
+++ b/kernel/fs/proc/inode.c
@@ -0,0 +1,489 @@
+/*
+ *  linux/fs/proc/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+#include <linux/pid_namespace.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/completion.h>
+#include <linux/poll.h>
+#include <linux/printk.h>
+#include <linux/file.h>
+#include <linux/limits.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include <linux/namei.h>
+
+#include <asm/uaccess.h>
+
+#include "internal.h"
+
+static void proc_evict_inode(struct inode *inode)
+{
+	struct proc_dir_entry *de;
+	struct ctl_table_header *head;
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+
+	/* Stop tracking associated processes */
+	put_pid(PROC_I(inode)->pid);
+
+	/* Let go of any associated proc directory entry */
+	de = PDE(inode);
+	if (de)
+		pde_put(de);
+	head = PROC_I(inode)->sysctl;
+	if (head) {
+		RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
+		sysctl_head_put(head);
+	}
+}
+
+static struct kmem_cache * proc_inode_cachep;
+
+static struct inode *proc_alloc_inode(struct super_block *sb)
+{
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	ei->pid = NULL;
+	ei->fd = 0;
+	ei->op.proc_get_link = NULL;
+	ei->pde = NULL;
+	ei->sysctl = NULL;
+	ei->sysctl_entry = NULL;
+	ei->ns_ops = NULL;
+	inode = &ei->vfs_inode;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	return inode;
+}
+
+static void proc_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(proc_inode_cachep, PROC_I(inode));
+}
+
+static void proc_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, proc_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct proc_inode *ei = (struct proc_inode *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+void __init proc_init_inodecache(void)
+{
+	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
+					     sizeof(struct proc_inode),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD|SLAB_PANIC),
+					     init_once);
+}
+
+static int proc_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct pid_namespace *pid = sb->s_fs_info;
+
+	if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
+		seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
+	if (pid->hide_pid != 0)
+		seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+
+	return 0;
+}
+
+static const struct super_operations proc_sops = {
+	.alloc_inode	= proc_alloc_inode,
+	.destroy_inode	= proc_destroy_inode,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= proc_evict_inode,
+	.statfs		= simple_statfs,
+	.remount_fs	= proc_remount,
+	.show_options	= proc_show_options,
+};
+
+enum {BIAS = -1U<<31};
+
+static inline int use_pde(struct proc_dir_entry *pde)
+{
+	return atomic_inc_unless_negative(&pde->in_use);
+}
+
+static void unuse_pde(struct proc_dir_entry *pde)
+{
+	if (atomic_dec_return(&pde->in_use) == BIAS)
+		complete(pde->pde_unload_completion);
+}
+
+/* pde is locked */
+static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
+{
+	if (pdeo->closing) {
+		/* somebody else is doing that, just wait */
+		DECLARE_COMPLETION_ONSTACK(c);
+		pdeo->c = &c;
+		spin_unlock(&pde->pde_unload_lock);
+		wait_for_completion(&c);
+		spin_lock(&pde->pde_unload_lock);
+	} else {
+		struct file *file;
+		pdeo->closing = 1;
+		spin_unlock(&pde->pde_unload_lock);
+		file = pdeo->file;
+		pde->proc_fops->release(file_inode(file), file);
+		spin_lock(&pde->pde_unload_lock);
+		list_del_init(&pdeo->lh);
+		if (pdeo->c)
+			complete(pdeo->c);
+		kfree(pdeo);
+	}
+}
+
+void proc_entry_rundown(struct proc_dir_entry *de)
+{
+	DECLARE_COMPLETION_ONSTACK(c);
+	/* Wait until all existing callers into module are done. */
+	de->pde_unload_completion = &c;
+	if (atomic_add_return(BIAS, &de->in_use) != BIAS)
+		wait_for_completion(&c);
+
+	spin_lock(&de->pde_unload_lock);
+	while (!list_empty(&de->pde_openers)) {
+		struct pde_opener *pdeo;
+		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
+		close_pdeo(de, pdeo);
+	}
+	spin_unlock(&de->pde_unload_lock);
+}
+
+static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	loff_t rv = -EINVAL;
+	if (use_pde(pde)) {
+		loff_t (*llseek)(struct file *, loff_t, int);
+		llseek = pde->proc_fops->llseek;
+		if (!llseek)
+			llseek = default_llseek;
+		rv = llseek(file, offset, whence);
+		unuse_pde(pde);
+	}
+	return rv;
+}
+
+static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	ssize_t rv = -EIO;
+	if (use_pde(pde)) {
+		read = pde->proc_fops->read;
+		if (read)
+			rv = read(file, buf, count, ppos);
+		unuse_pde(pde);
+	}
+	return rv;
+}
+
+static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	ssize_t rv = -EIO;
+	if (use_pde(pde)) {
+		write = pde->proc_fops->write;
+		if (write)
+			rv = write(file, buf, count, ppos);
+		unuse_pde(pde);
+	}
+	return rv;
+}
+
+static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
+{
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	unsigned int rv = DEFAULT_POLLMASK;
+	unsigned int (*poll)(struct file *, struct poll_table_struct *);
+	if (use_pde(pde)) {
+		poll = pde->proc_fops->poll;
+		if (poll)
+			rv = poll(file, pts);
+		unuse_pde(pde);
+	}
+	return rv;
+}
+
+static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	long rv = -ENOTTY;
+	long (*ioctl)(struct file *, unsigned int, unsigned long);
+	if (use_pde(pde)) {
+		ioctl = pde->proc_fops->unlocked_ioctl;
+		if (ioctl)
+			rv = ioctl(file, cmd, arg);
+		unuse_pde(pde);
+	}
+	return rv;
+}
+
+#ifdef CONFIG_COMPAT
+static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	long rv = -ENOTTY;
+	long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+	if (use_pde(pde)) {
+		compat_ioctl = pde->proc_fops->compat_ioctl;
+		if (compat_ioctl)
+			rv = compat_ioctl(file, cmd, arg);
+		unuse_pde(pde);
+	}
+	return rv;
+}
+#endif
+
+static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	int rv = -EIO;
+	int (*mmap)(struct file *, struct vm_area_struct *);
+	if (use_pde(pde)) {
+		mmap = pde->proc_fops->mmap;
+		if (mmap)
+			rv = mmap(file, vma);
+		unuse_pde(pde);
+	}
+	return rv;
+}
+
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+			   unsigned long len, unsigned long pgoff,
+			   unsigned long flags)
+{
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	unsigned long rv = -EIO;
+
+	if (use_pde(pde)) {
+		typeof(proc_reg_get_unmapped_area) *get_area;
+
+		get_area = pde->proc_fops->get_unmapped_area;
+#ifdef CONFIG_MMU
+		if (!get_area)
+			get_area = current->mm->get_unmapped_area;
+#endif
+
+		if (get_area)
+			rv = get_area(file, orig_addr, len, pgoff, flags);
+		else
+			rv = orig_addr;
+		unuse_pde(pde);
+	}
+	return rv;
+}
+
+static int proc_reg_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	int rv = 0;
+	int (*open)(struct inode *, struct file *);
+	int (*release)(struct inode *, struct file *);
+	struct pde_opener *pdeo;
+
+	/*
+	 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
+	 * sequence. ->release won't be called because ->proc_fops will be
+	 * cleared. Depending on complexity of ->release, consequences vary.
+	 *
+	 * We can't wait for mercy when close will be done for real, it's
+	 * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
+	 * by hand in remove_proc_entry(). For this, save opener's credentials
+	 * for later.
+	 */
+	pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
+	if (!pdeo)
+		return -ENOMEM;
+
+	if (!use_pde(pde)) {
+		kfree(pdeo);
+		return -ENOENT;
+	}
+	open = pde->proc_fops->open;
+	release = pde->proc_fops->release;
+
+	if (open)
+		rv = open(inode, file);
+
+	if (rv == 0 && release) {
+		/* To know what to release. */
+		pdeo->file = file;
+		/* Strictly for "too late" ->release in proc_reg_release(). */
+		spin_lock(&pde->pde_unload_lock);
+		list_add(&pdeo->lh, &pde->pde_openers);
+		spin_unlock(&pde->pde_unload_lock);
+	} else
+		kfree(pdeo);
+
+	unuse_pde(pde);
+	return rv;
+}
+
+static int proc_reg_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	struct pde_opener *pdeo;
+	spin_lock(&pde->pde_unload_lock);
+	list_for_each_entry(pdeo, &pde->pde_openers, lh) {
+		if (pdeo->file == file) {
+			close_pdeo(pde, pdeo);
+			break;
+		}
+	}
+	spin_unlock(&pde->pde_unload_lock);
+	return 0;
+}
+
+static const struct file_operations proc_reg_file_ops = {
+	.llseek		= proc_reg_llseek,
+	.read		= proc_reg_read,
+	.write		= proc_reg_write,
+	.poll		= proc_reg_poll,
+	.unlocked_ioctl	= proc_reg_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= proc_reg_compat_ioctl,
+#endif
+	.mmap		= proc_reg_mmap,
+	.get_unmapped_area = proc_reg_get_unmapped_area,
+	.open		= proc_reg_open,
+	.release	= proc_reg_release,
+};
+
+#ifdef CONFIG_COMPAT
+static const struct file_operations proc_reg_file_ops_no_compat = {
+	.llseek		= proc_reg_llseek,
+	.read		= proc_reg_read,
+	.write		= proc_reg_write,
+	.poll		= proc_reg_poll,
+	.unlocked_ioctl	= proc_reg_unlocked_ioctl,
+	.mmap		= proc_reg_mmap,
+	.get_unmapped_area = proc_reg_get_unmapped_area,
+	.open		= proc_reg_open,
+	.release	= proc_reg_release,
+};
+#endif
+
+static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct proc_dir_entry *pde = PDE(d_inode(dentry));
+	if (unlikely(!use_pde(pde)))
+		return ERR_PTR(-EINVAL);
+	nd_set_link(nd, pde->data);
+	return pde;
+}
+
+static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+	unuse_pde(p);
+}
+
+const struct inode_operations proc_link_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= proc_follow_link,
+	.put_link	= proc_put_link,
+};
+
+struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
+{
+	struct inode *inode = new_inode_pseudo(sb);
+
+	if (inode) {
+		inode->i_ino = de->low_ino;
+		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		PROC_I(inode)->pde = de;
+
+		if (is_empty_pde(de)) {
+			make_empty_dir_inode(inode);
+			return inode;
+		}
+		if (de->mode) {
+			inode->i_mode = de->mode;
+			inode->i_uid = de->uid;
+			inode->i_gid = de->gid;
+		}
+		if (de->size)
+			inode->i_size = de->size;
+		if (de->nlink)
+			set_nlink(inode, de->nlink);
+		WARN_ON(!de->proc_iops);
+		inode->i_op = de->proc_iops;
+		if (de->proc_fops) {
+			if (S_ISREG(inode->i_mode)) {
+#ifdef CONFIG_COMPAT
+				if (!de->proc_fops->compat_ioctl)
+					inode->i_fop =
+						&proc_reg_file_ops_no_compat;
+				else
+#endif
+					inode->i_fop = &proc_reg_file_ops;
+			} else {
+				inode->i_fop = de->proc_fops;
+			}
+		}
+	} else
+	       pde_put(de);
+	return inode;
+}
+
+int proc_fill_super(struct super_block *s)
+{
+	struct inode *root_inode;
+	int ret;
+
+	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = PROC_SUPER_MAGIC;
+	s->s_op = &proc_sops;
+	s->s_time_gran = 1;
+	
+	pde_get(&proc_root);
+	root_inode = proc_get_inode(s, &proc_root);
+	if (!root_inode) {
+		pr_err("proc_fill_super: get root inode failed\n");
+		return -ENOMEM;
+	}
+
+	s->s_root = d_make_root(root_inode);
+	if (!s->s_root) {
+		pr_err("proc_fill_super: allocate dentry failed\n");
+		return -ENOMEM;
+	}
+
+	ret = proc_setup_self(s);
+	if (ret) {
+		return ret;
+	}
+	return proc_setup_thread_self(s);
+}
diff --git a/kernel/fs/proc/internal.h b/kernel/fs/proc/internal.h
new file mode 100644
index 000000000..aa2781095
--- /dev/null
+++ b/kernel/fs/proc/internal.h
@@ -0,0 +1,305 @@
+/* Internal procfs definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/binfmts.h>
+
+struct ctl_table_header;
+struct mempolicy;
+
+/*
+ * This is not completely implemented yet. The idea is to
+ * create an in-memory tree (like the actual /proc filesystem
+ * tree) of these proc_dir_entries, so that we can dynamically
+ * add new files to /proc.
+ *
+ * parent/subdir are used for the directory structure (every /proc file has a
+ * parent, but "subdir" is empty for all non-directory entries).
+ * subdir_node is used to build the rb tree "subdir" of the parent.
+ */
+struct proc_dir_entry {
+	unsigned int low_ino;
+	umode_t mode;
+	nlink_t nlink;
+	kuid_t uid;
+	kgid_t gid;
+	loff_t size;
+	const struct inode_operations *proc_iops;
+	const struct file_operations *proc_fops;
+	struct proc_dir_entry *parent;
+	struct rb_root subdir;
+	struct rb_node subdir_node;
+	void *data;
+	atomic_t count;		/* use count */
+	atomic_t in_use;	/* number of callers into module in progress; */
+			/* negative -> it's going away RSN */
+	struct completion *pde_unload_completion;
+	struct list_head pde_openers;	/* who did ->open, but not ->release */
+	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	u8 namelen;
+	char name[];
+};
+
+union proc_op {
+	int (*proc_get_link)(struct dentry *, struct path *);
+	int (*proc_show)(struct seq_file *m,
+		struct pid_namespace *ns, struct pid *pid,
+		struct task_struct *task);
+};
+
+struct proc_inode {
+	struct pid *pid;
+	int fd;
+	union proc_op op;
+	struct proc_dir_entry *pde;
+	struct ctl_table_header *sysctl;
+	struct ctl_table *sysctl_entry;
+	const struct proc_ns_operations *ns_ops;
+	struct inode vfs_inode;
+};
+
+/*
+ * General functions
+ */
+static inline struct proc_inode *PROC_I(const struct inode *inode)
+{
+	return container_of(inode, struct proc_inode, vfs_inode);
+}
+
+static inline struct proc_dir_entry *PDE(const struct inode *inode)
+{
+	return PROC_I(inode)->pde;
+}
+
+static inline void *__PDE_DATA(const struct inode *inode)
+{
+	return PDE(inode)->data;
+}
+
+static inline struct pid *proc_pid(struct inode *inode)
+{
+	return PROC_I(inode)->pid;
+}
+
+static inline struct task_struct *get_proc_task(struct inode *inode)
+{
+	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
+}
+
+static inline int task_dumpable(struct task_struct *task)
+{
+	int dumpable = 0;
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm)
+		dumpable = get_dumpable(mm);
+	task_unlock(task);
+	if (dumpable == SUID_DUMP_USER)
+		return 1;
+	return 0;
+}
+
+static inline unsigned name_to_int(const struct qstr *qstr)
+{
+	const char *name = qstr->name;
+	int len = qstr->len;
+	unsigned n = 0;
+
+	if (len > 1 && *name == '0')
+		goto out;
+	while (len-- > 0) {
+		unsigned c = *name++ - '0';
+		if (c > 9)
+			goto out;
+		if (n >= (~0U-9)/10)
+			goto out;
+		n *= 10;
+		n += c;
+	}
+	return n;
+out:
+	return ~0U;
+}
+
+/*
+ * Offset of the first process in the /proc root directory..
+ */
+#define FIRST_PROCESS_ENTRY 256
+
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 13
+
+/*
+ * array.c
+ */
+extern const struct file_operations proc_tid_children_operations;
+
+extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
+			 struct pid *, struct task_struct *);
+extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
+			  struct pid *, struct task_struct *);
+extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
+			   struct pid *, struct task_struct *);
+extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
+			  struct pid *, struct task_struct *);
+
+/*
+ * base.c
+ */
+extern const struct dentry_operations pid_dentry_operations;
+extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int proc_setattr(struct dentry *, struct iattr *);
+extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
+extern int pid_revalidate(struct dentry *, unsigned int);
+extern int pid_delete_dentry(const struct dentry *);
+extern int proc_pid_readdir(struct file *, struct dir_context *);
+extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+extern loff_t mem_lseek(struct file *, loff_t, int);
+
+/* Lookups */
+typedef int instantiate_t(struct inode *, struct dentry *,
+				     struct task_struct *, const void *);
+extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
+			   instantiate_t, struct task_struct *, const void *);
+
+/*
+ * generic.c
+ */
+extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
+extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
+				     struct dentry *);
+extern int proc_readdir(struct file *, struct dir_context *);
+extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
+
+static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
+{
+	atomic_inc(&pde->count);
+	return pde;
+}
+extern void pde_put(struct proc_dir_entry *);
+
+static inline bool is_empty_pde(const struct proc_dir_entry *pde)
+{
+	return S_ISDIR(pde->mode) && !pde->proc_iops;
+}
+struct proc_dir_entry *proc_create_mount_point(const char *name);
+
+/*
+ * inode.c
+ */
+struct pde_opener {
+	struct file *file;
+	struct list_head lh;
+	int closing;
+	struct completion *c;
+};
+extern const struct inode_operations proc_link_inode_operations;
+
+extern const struct inode_operations proc_pid_link_inode_operations;
+
+extern void proc_init_inodecache(void);
+extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+extern int proc_fill_super(struct super_block *);
+extern void proc_entry_rundown(struct proc_dir_entry *);
+
+/*
+ * proc_namespaces.c
+ */
+extern const struct inode_operations proc_ns_dir_inode_operations;
+extern const struct file_operations proc_ns_dir_operations;
+
+/*
+ * proc_net.c
+ */
+extern const struct file_operations proc_net_operations;
+extern const struct inode_operations proc_net_inode_operations;
+
+#ifdef CONFIG_NET
+extern int proc_net_init(void);
+#else
+static inline int proc_net_init(void) { return 0; }
+#endif
+
+/*
+ * proc_self.c
+ */
+extern int proc_setup_self(struct super_block *);
+
+/*
+ * proc_thread_self.c
+ */
+extern int proc_setup_thread_self(struct super_block *);
+extern void proc_thread_self_init(void);
+
+/*
+ * proc_sysctl.c
+ */
+#ifdef CONFIG_PROC_SYSCTL
+extern int proc_sys_init(void);
+extern void sysctl_head_put(struct ctl_table_header *);
+#else
+static inline void proc_sys_init(void) { }
+static inline void sysctl_head_put(struct ctl_table_header *head) { }
+#endif
+
+/*
+ * proc_tty.c
+ */
+#ifdef CONFIG_TTY
+extern void proc_tty_init(void);
+#else
+static inline void proc_tty_init(void) {}
+#endif
+
+/*
+ * root.c
+ */
+extern struct proc_dir_entry proc_root;
+
+extern void proc_self_init(void);
+extern int proc_remount(struct super_block *, int *, char *);
+
+/*
+ * task_[no]mmu.c
+ */
+struct proc_maps_private {
+	struct inode *inode;
+	struct task_struct *task;
+	struct mm_struct *mm;
+#ifdef CONFIG_MMU
+	struct vm_area_struct *tail_vma;
+#endif
+#ifdef CONFIG_NUMA
+	struct mempolicy *task_mempolicy;
+#endif
+};
+
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
+
+extern const struct file_operations proc_pid_maps_operations;
+extern const struct file_operations proc_tid_maps_operations;
+extern const struct file_operations proc_pid_numa_maps_operations;
+extern const struct file_operations proc_tid_numa_maps_operations;
+extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_tid_smaps_operations;
+extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
+
+extern unsigned long task_vsize(struct mm_struct *);
+extern unsigned long task_statm(struct mm_struct *,
+				unsigned long *, unsigned long *,
+				unsigned long *, unsigned long *);
+extern void task_mem(struct seq_file *, struct mm_struct *);
diff --git a/kernel/fs/proc/interrupts.c b/kernel/fs/proc/interrupts.c
new file mode 100644
index 000000000..a352d5703
--- /dev/null
+++ b/kernel/fs/proc/interrupts.c
@@ -0,0 +1,53 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irqnr.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/interrupts
+ */
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+	return (*pos <= nr_irqs) ? pos : NULL;
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos > nr_irqs)
+		return NULL;
+	return pos;
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+
+static const struct seq_operations int_seq_ops = {
+	.start = int_seq_start,
+	.next  = int_seq_next,
+	.stop  = int_seq_stop,
+	.show  = show_interrupts
+};
+
+static int interrupts_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &int_seq_ops);
+}
+
+static const struct file_operations proc_interrupts_operations = {
+	.open		= interrupts_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_interrupts_init(void)
+{
+	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
+	return 0;
+}
+fs_initcall(proc_interrupts_init);
diff --git a/kernel/fs/proc/kcore.c b/kernel/fs/proc/kcore.c
new file mode 100644
index 000000000..91a4e6426
--- /dev/null
+++ b/kernel/fs/proc/kcore.c
@@ -0,0 +1,644 @@
+/*
+ *	fs/proc/kcore.c kernel ELF core dumper
+ *
+ *	Modelled on fs/exec.c:aout_core_dump()
+ *	Jeremy Fitzhardinge <jeremy@sw.oz.au>
+ *	ELF version written by David Howells <David.Howells@nexor.co.uk>
+ *	Modified and incorporated into 2.3.x by Tigran Aivazian <tigran@veritas.com>
+ *	Support to dump vmalloc'd areas (ELF only), Tigran Aivazian <tigran@veritas.com>
+ *	Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/kcore.h>
+#include <linux/user.h>
+#include <linux/capability.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/notifier.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/printk.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <linux/list.h>
+#include <linux/ioport.h>
+#include <linux/memory.h>
+#include <asm/sections.h>
+#include "internal.h"
+
+#define CORE_STR "CORE"
+
+#ifndef ELF_CORE_EFLAGS
+#define ELF_CORE_EFLAGS	0
+#endif
+
+static struct proc_dir_entry *proc_root_kcore;
+
+
+#ifndef kc_vaddr_to_offset
+#define	kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
+#endif
+#ifndef	kc_offset_to_vaddr
+#define	kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
+#endif
+
+/* An ELF note in memory */
+struct memelfnote
+{
+	const char *name;
+	int type;
+	unsigned int datasz;
+	void *data;
+};
+
+static LIST_HEAD(kclist_head);
+static DEFINE_RWLOCK(kclist_lock);
+static int kcore_need_update = 1;
+
+void
+kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
+{
+	new->addr = (unsigned long)addr;
+	new->size = size;
+	new->type = type;
+
+	write_lock(&kclist_lock);
+	list_add_tail(&new->list, &kclist_head);
+	write_unlock(&kclist_lock);
+}
+
+static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
+{
+	size_t try, size;
+	struct kcore_list *m;
+
+	*nphdr = 1; /* PT_NOTE */
+	size = 0;
+
+	list_for_each_entry(m, &kclist_head, list) {
+		try = kc_vaddr_to_offset((size_t)m->addr + m->size);
+		if (try > size)
+			size = try;
+		*nphdr = *nphdr + 1;
+	}
+	*elf_buflen =	sizeof(struct elfhdr) + 
+			(*nphdr + 2)*sizeof(struct elf_phdr) + 
+			3 * ((sizeof(struct elf_note)) +
+			     roundup(sizeof(CORE_STR), 4)) +
+			roundup(sizeof(struct elf_prstatus), 4) +
+			roundup(sizeof(struct elf_prpsinfo), 4) +
+			roundup(sizeof(struct task_struct), 4);
+	*elf_buflen = PAGE_ALIGN(*elf_buflen);
+	return size + *elf_buflen;
+}
+
+static void free_kclist_ents(struct list_head *head)
+{
+	struct kcore_list *tmp, *pos;
+
+	list_for_each_entry_safe(pos, tmp, head, list) {
+		list_del(&pos->list);
+		kfree(pos);
+	}
+}
+/*
+ * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
+ */
+static void __kcore_update_ram(struct list_head *list)
+{
+	int nphdr;
+	size_t size;
+	struct kcore_list *tmp, *pos;
+	LIST_HEAD(garbage);
+
+	write_lock(&kclist_lock);
+	if (kcore_need_update) {
+		list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+			if (pos->type == KCORE_RAM
+				|| pos->type == KCORE_VMEMMAP)
+				list_move(&pos->list, &garbage);
+		}
+		list_splice_tail(list, &kclist_head);
+	} else
+		list_splice(list, &garbage);
+	kcore_need_update = 0;
+	proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+	write_unlock(&kclist_lock);
+
+	free_kclist_ents(&garbage);
+}
+
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
+ * because memory hole is not as big as !HIGHMEM case.
+ * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
+ */
+static int kcore_update_ram(void)
+{
+	LIST_HEAD(head);
+	struct kcore_list *ent;
+	int ret = 0;
+
+	ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return -ENOMEM;
+	ent->addr = (unsigned long)__va(0);
+	ent->size = max_low_pfn << PAGE_SHIFT;
+	ent->type = KCORE_RAM;
+	list_add(&ent->list, &head);
+	__kcore_update_ram(&head);
+	return ret;
+}
+
+#else /* !CONFIG_HIGHMEM */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/* calculate vmemmap's address from given system ram pfn and register it */
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+{
+	unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
+	unsigned long nr_pages = ent->size >> PAGE_SHIFT;
+	unsigned long start, end;
+	struct kcore_list *vmm, *tmp;
+
+
+	start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
+	end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
+	end = PAGE_ALIGN(end);
+	/* overlap check (because we have to align page */
+	list_for_each_entry(tmp, head, list) {
+		if (tmp->type != KCORE_VMEMMAP)
+			continue;
+		if (start < tmp->addr + tmp->size)
+			if (end > tmp->addr)
+				end = tmp->addr;
+	}
+	if (start < end) {
+		vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
+		if (!vmm)
+			return 0;
+		vmm->addr = start;
+		vmm->size = end - start;
+		vmm->type = KCORE_VMEMMAP;
+		list_add_tail(&vmm->list, head);
+	}
+	return 1;
+
+}
+#else
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+{
+	return 1;
+}
+
+#endif
+
+static int
+kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+	struct list_head *head = (struct list_head *)arg;
+	struct kcore_list *ent;
+
+	ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return -ENOMEM;
+	ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT));
+	ent->size = nr_pages << PAGE_SHIFT;
+
+	/* Sanity check: Can happen in 32bit arch...maybe */
+	if (ent->addr < (unsigned long) __va(0))
+		goto free_out;
+
+	/* cut not-mapped area. ....from ppc-32 code. */
+	if (ULONG_MAX - ent->addr < ent->size)
+		ent->size = ULONG_MAX - ent->addr;
+
+	/* cut when vmalloc() area is higher than direct-map area */
+	if (VMALLOC_START > (unsigned long)__va(0)) {
+		if (ent->addr > VMALLOC_START)
+			goto free_out;
+		if (VMALLOC_START - ent->addr < ent->size)
+			ent->size = VMALLOC_START - ent->addr;
+	}
+
+	ent->type = KCORE_RAM;
+	list_add_tail(&ent->list, head);
+
+	if (!get_sparsemem_vmemmap_info(ent, head)) {
+		list_del(&ent->list);
+		goto free_out;
+	}
+
+	return 0;
+free_out:
+	kfree(ent);
+	return 1;
+}
+
+static int kcore_update_ram(void)
+{
+	int nid, ret;
+	unsigned long end_pfn;
+	LIST_HEAD(head);
+
+	/* Not inialized....update now */
+	/* find out "max pfn" */
+	end_pfn = 0;
+	for_each_node_state(nid, N_MEMORY) {
+		unsigned long node_end;
+		node_end = node_end_pfn(nid);
+		if (end_pfn < node_end)
+			end_pfn = node_end;
+	}
+	/* scan 0 to max_pfn */
+	ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
+	if (ret) {
+		free_kclist_ents(&head);
+		return -ENOMEM;
+	}
+	__kcore_update_ram(&head);
+	return ret;
+}
+#endif /* CONFIG_HIGHMEM */
+
+/*****************************************************************************/
+/*
+ * determine size of ELF note
+ */
+static int notesize(struct memelfnote *en)
+{
+	int sz;
+
+	sz = sizeof(struct elf_note);
+	sz += roundup((strlen(en->name) + 1), 4);
+	sz += roundup(en->datasz, 4);
+
+	return sz;
+} /* end notesize() */
+
+/*****************************************************************************/
+/*
+ * store a note in the header buffer
+ */
+static char *storenote(struct memelfnote *men, char *bufp)
+{
+	struct elf_note en;
+
+#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
+
+	en.n_namesz = strlen(men->name) + 1;
+	en.n_descsz = men->datasz;
+	en.n_type = men->type;
+
+	DUMP_WRITE(&en, sizeof(en));
+	DUMP_WRITE(men->name, en.n_namesz);
+
+	/* XXX - cast from long long to long to avoid need for libgcc.a */
+	bufp = (char*) roundup((unsigned long)bufp,4);
+	DUMP_WRITE(men->data, men->datasz);
+	bufp = (char*) roundup((unsigned long)bufp,4);
+
+#undef DUMP_WRITE
+
+	return bufp;
+} /* end storenote() */
+
+/*
+ * store an ELF coredump header in the supplied buffer
+ * nphdr is the number of elf_phdr to insert
+ */
+static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
+{
+	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
+	struct elf_prpsinfo prpsinfo;	/* NT_PRPSINFO */
+	struct elf_phdr *nhdr, *phdr;
+	struct elfhdr *elf;
+	struct memelfnote notes[3];
+	off_t offset = 0;
+	struct kcore_list *m;
+
+	/* setup ELF header */
+	elf = (struct elfhdr *) bufp;
+	bufp += sizeof(struct elfhdr);
+	offset += sizeof(struct elfhdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS]	= ELF_CLASS;
+	elf->e_ident[EI_DATA]	= ELF_DATA;
+	elf->e_ident[EI_VERSION]= EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type	= ET_CORE;
+	elf->e_machine	= ELF_ARCH;
+	elf->e_version	= EV_CURRENT;
+	elf->e_entry	= 0;
+	elf->e_phoff	= sizeof(struct elfhdr);
+	elf->e_shoff	= 0;
+	elf->e_flags	= ELF_CORE_EFLAGS;
+	elf->e_ehsize	= sizeof(struct elfhdr);
+	elf->e_phentsize= sizeof(struct elf_phdr);
+	elf->e_phnum	= nphdr;
+	elf->e_shentsize= 0;
+	elf->e_shnum	= 0;
+	elf->e_shstrndx	= 0;
+
+	/* setup ELF PT_NOTE program header */
+	nhdr = (struct elf_phdr *) bufp;
+	bufp += sizeof(struct elf_phdr);
+	offset += sizeof(struct elf_phdr);
+	nhdr->p_type	= PT_NOTE;
+	nhdr->p_offset	= 0;
+	nhdr->p_vaddr	= 0;
+	nhdr->p_paddr	= 0;
+	nhdr->p_filesz	= 0;
+	nhdr->p_memsz	= 0;
+	nhdr->p_flags	= 0;
+	nhdr->p_align	= 0;
+
+	/* setup ELF PT_LOAD program header for every area */
+	list_for_each_entry(m, &kclist_head, list) {
+		phdr = (struct elf_phdr *) bufp;
+		bufp += sizeof(struct elf_phdr);
+		offset += sizeof(struct elf_phdr);
+
+		phdr->p_type	= PT_LOAD;
+		phdr->p_flags	= PF_R|PF_W|PF_X;
+		phdr->p_offset	= kc_vaddr_to_offset(m->addr) + dataoff;
+		phdr->p_vaddr	= (size_t)m->addr;
+		phdr->p_paddr	= 0;
+		phdr->p_filesz	= phdr->p_memsz	= m->size;
+		phdr->p_align	= PAGE_SIZE;
+	}
+
+	/*
+	 * Set up the notes in similar form to SVR4 core dumps made
+	 * with info from their /proc.
+	 */
+	nhdr->p_offset	= offset;
+
+	/* set up the process status */
+	notes[0].name = CORE_STR;
+	notes[0].type = NT_PRSTATUS;
+	notes[0].datasz = sizeof(struct elf_prstatus);
+	notes[0].data = &prstatus;
+
+	memset(&prstatus, 0, sizeof(struct elf_prstatus));
+
+	nhdr->p_filesz	= notesize(&notes[0]);
+	bufp = storenote(&notes[0], bufp);
+
+	/* set up the process info */
+	notes[1].name	= CORE_STR;
+	notes[1].type	= NT_PRPSINFO;
+	notes[1].datasz	= sizeof(struct elf_prpsinfo);
+	notes[1].data	= &prpsinfo;
+
+	memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo));
+	prpsinfo.pr_state	= 0;
+	prpsinfo.pr_sname	= 'R';
+	prpsinfo.pr_zomb	= 0;
+
+	strcpy(prpsinfo.pr_fname, "vmlinux");
+	strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
+
+	nhdr->p_filesz	+= notesize(&notes[1]);
+	bufp = storenote(&notes[1], bufp);
+
+	/* set up the task structure */
+	notes[2].name	= CORE_STR;
+	notes[2].type	= NT_TASKSTRUCT;
+	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].data	= current;
+
+	nhdr->p_filesz	+= notesize(&notes[2]);
+	bufp = storenote(&notes[2], bufp);
+
+} /* end elf_kcore_store_hdr() */
+
+/*****************************************************************************/
+/*
+ * read from the ELF header and then kernel memory
+ */
+static ssize_t
+read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+{
+	ssize_t acc = 0;
+	size_t size, tsz;
+	size_t elf_buflen;
+	int nphdr;
+	unsigned long start;
+
+	read_lock(&kclist_lock);
+	size = get_kcore_size(&nphdr, &elf_buflen);
+
+	if (buflen == 0 || *fpos >= size) {
+		read_unlock(&kclist_lock);
+		return 0;
+	}
+
+	/* trim buflen to not go beyond EOF */
+	if (buflen > size - *fpos)
+		buflen = size - *fpos;
+
+	/* construct an ELF core header if we'll need some of it */
+	if (*fpos < elf_buflen) {
+		char * elf_buf;
+
+		tsz = elf_buflen - *fpos;
+		if (buflen < tsz)
+			tsz = buflen;
+		elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
+		if (!elf_buf) {
+			read_unlock(&kclist_lock);
+			return -ENOMEM;
+		}
+		elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
+		read_unlock(&kclist_lock);
+		if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
+			kfree(elf_buf);
+			return -EFAULT;
+		}
+		kfree(elf_buf);
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	} else
+		read_unlock(&kclist_lock);
+
+	/*
+	 * Check to see if our file offset matches with any of
+	 * the addresses in the elf_phdr on our list.
+	 */
+	start = kc_offset_to_vaddr(*fpos - elf_buflen);
+	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+		tsz = buflen;
+		
+	while (buflen) {
+		struct kcore_list *m;
+
+		read_lock(&kclist_lock);
+		list_for_each_entry(m, &kclist_head, list) {
+			if (start >= m->addr && start < (m->addr+m->size))
+				break;
+		}
+		read_unlock(&kclist_lock);
+
+		if (&m->list == &kclist_head) {
+			if (clear_user(buffer, tsz))
+				return -EFAULT;
+		} else if (is_vmalloc_or_module_addr((void *)start)) {
+			char * elf_buf;
+
+			elf_buf = kzalloc(tsz, GFP_KERNEL);
+			if (!elf_buf)
+				return -ENOMEM;
+			vread(elf_buf, (char *)start, tsz);
+			/* we have to zero-fill user buffer even if no read */
+			if (copy_to_user(buffer, elf_buf, tsz)) {
+				kfree(elf_buf);
+				return -EFAULT;
+			}
+			kfree(elf_buf);
+		} else {
+			if (kern_addr_valid(start)) {
+				unsigned long n;
+
+				n = copy_to_user(buffer, (char *)start, tsz);
+				/*
+				 * We cannot distinguish between fault on source
+				 * and fault on destination. When this happens
+				 * we clear too and hope it will trigger the
+				 * EFAULT again.
+				 */
+				if (n) { 
+					if (clear_user(buffer + tsz - n,
+								n))
+						return -EFAULT;
+				}
+			} else {
+				if (clear_user(buffer, tsz))
+					return -EFAULT;
+			}
+		}
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+		start += tsz;
+		tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
+	}
+
+	return acc;
+}
+
+
+static int open_kcore(struct inode *inode, struct file *filp)
+{
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+	if (kcore_need_update)
+		kcore_update_ram();
+	if (i_size_read(inode) != proc_root_kcore->size) {
+		mutex_lock(&inode->i_mutex);
+		i_size_write(inode, proc_root_kcore->size);
+		mutex_unlock(&inode->i_mutex);
+	}
+	return 0;
+}
+
+
+static const struct file_operations proc_kcore_operations = {
+	.read		= read_kcore,
+	.open		= open_kcore,
+	.llseek		= default_llseek,
+};
+
+/* just remember that we have to update kcore */
+static int __meminit kcore_callback(struct notifier_block *self,
+				    unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+		write_lock(&kclist_lock);
+		kcore_need_update = 1;
+		write_unlock(&kclist_lock);
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kcore_callback_nb __meminitdata = {
+	.notifier_call = kcore_callback,
+	.priority = 0,
+};
+
+static struct kcore_list kcore_vmalloc;
+
+#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
+static struct kcore_list kcore_text;
+/*
+ * If defined, special segment is used for mapping kernel text instead of
+ * direct-map area. We need to create special TEXT section.
+ */
+static void __init proc_kcore_text_init(void)
+{
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
+}
+#else
+static void __init proc_kcore_text_init(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+/*
+ * MODULES_VADDR has no intersection with VMALLOC_ADDR.
+ */
+struct kcore_list kcore_modules;
+static void __init add_modules_range(void)
+{
+	if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
+		kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+			MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+	}
+}
+#else
+static void __init add_modules_range(void)
+{
+}
+#endif
+
+static int __init proc_kcore_init(void)
+{
+	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
+				      &proc_kcore_operations);
+	if (!proc_root_kcore) {
+		pr_err("couldn't create /proc/kcore\n");
+		return 0; /* Always returns 0. */
+	}
+	/* Store text area if it's special */
+	proc_kcore_text_init();
+	/* Store vmalloc area */
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+		VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
+	add_modules_range();
+	/* Store direct-map area from physical memory map */
+	kcore_update_ram();
+	register_hotmemory_notifier(&kcore_callback_nb);
+
+	return 0;
+}
+fs_initcall(proc_kcore_init);
diff --git a/kernel/fs/proc/kmsg.c b/kernel/fs/proc/kmsg.c
new file mode 100644
index 000000000..05f8dcdb0
--- /dev/null
+++ b/kernel/fs/proc/kmsg.c
@@ -0,0 +1,64 @@
+/*
+ *  linux/fs/proc/kmsg.c
+ *
+ *  Copyright (C) 1992  by Linus Torvalds
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <linux/syslog.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+extern wait_queue_head_t log_wait;
+
+static int kmsg_open(struct inode * inode, struct file * file)
+{
+	return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
+}
+
+static int kmsg_release(struct inode * inode, struct file * file)
+{
+	(void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC);
+	return 0;
+}
+
+static ssize_t kmsg_read(struct file *file, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	if ((file->f_flags & O_NONBLOCK) &&
+	    !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
+		return -EAGAIN;
+	return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
+}
+
+static unsigned int kmsg_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &log_wait, wait);
+	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+
+static const struct file_operations proc_kmsg_operations = {
+	.read		= kmsg_read,
+	.poll		= kmsg_poll,
+	.open		= kmsg_open,
+	.release	= kmsg_release,
+	.llseek		= generic_file_llseek,
+};
+
+static int __init proc_kmsg_init(void)
+{
+	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+	return 0;
+}
+fs_initcall(proc_kmsg_init);
diff --git a/kernel/fs/proc/loadavg.c b/kernel/fs/proc/loadavg.c
new file mode 100644
index 000000000..aec66e6c2
--- /dev/null
+++ b/kernel/fs/proc/loadavg.c
@@ -0,0 +1,45 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/time.h>
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static int loadavg_proc_show(struct seq_file *m, void *v)
+{
+	unsigned long avnrun[3];
+
+	get_avenrun(avnrun, FIXED_1/200, 0);
+
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+		nr_running(), nr_threads,
+		task_active_pid_ns(current)->last_pid);
+	return 0;
+}
+
+static int loadavg_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, loadavg_proc_show, NULL);
+}
+
+static const struct file_operations loadavg_proc_fops = {
+	.open		= loadavg_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_loadavg_init(void)
+{
+	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	return 0;
+}
+fs_initcall(proc_loadavg_init);
diff --git a/kernel/fs/proc/meminfo.c b/kernel/fs/proc/meminfo.c
new file mode 100644
index 000000000..d3ebf2e61
--- /dev/null
+++ b/kernel/fs/proc/meminfo.c
@@ -0,0 +1,234 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/mmzone.h>
+#include <linux/proc_fs.h>
+#include <linux/quicklist.h>
+#include <linux/seq_file.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+#include <linux/atomic.h>
+#include <linux/vmalloc.h>
+#ifdef CONFIG_CMA
+#include <linux/cma.h>
+#endif
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include "internal.h"
+
+void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
+{
+}
+
+static int meminfo_proc_show(struct seq_file *m, void *v)
+{
+	struct sysinfo i;
+	unsigned long committed;
+	struct vmalloc_info vmi;
+	long cached;
+	long available;
+	unsigned long pagecache;
+	unsigned long wmark_low = 0;
+	unsigned long pages[NR_LRU_LISTS];
+	struct zone *zone;
+	int lru;
+
+/*
+ * display in kilobytes.
+ */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+	si_meminfo(&i);
+	si_swapinfo(&i);
+	committed = percpu_counter_read_positive(&vm_committed_as);
+
+	cached = global_page_state(NR_FILE_PAGES) -
+			total_swapcache_pages() - i.bufferram;
+	if (cached < 0)
+		cached = 0;
+
+	get_vmalloc_info(&vmi);
+
+	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+	for_each_zone(zone)
+		wmark_low += zone->watermark[WMARK_LOW];
+
+	/*
+	 * Estimate the amount of memory available for userspace allocations,
+	 * without causing swapping.
+	 *
+	 * Free memory cannot be taken below the low watermark, before the
+	 * system starts swapping.
+	 */
+	available = i.freeram - wmark_low;
+
+	/*
+	 * Not all the page cache can be freed, otherwise the system will
+	 * start swapping. Assume at least half of the page cache, or the
+	 * low watermark worth of cache, needs to stay.
+	 */
+	pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+	pagecache -= min(pagecache / 2, wmark_low);
+	available += pagecache;
+
+	/*
+	 * Part of the reclaimable slab consists of items that are in use,
+	 * and cannot be freed. Cap this estimate at the low watermark.
+	 */
+	available += global_page_state(NR_SLAB_RECLAIMABLE) -
+		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+	if (available < 0)
+		available = 0;
+
+	/*
+	 * Tagged format, for easy grepping and expansion.
+	 */
+	seq_printf(m,
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"MemAvailable:   %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"SwapCached:     %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
+		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
+#ifdef CONFIG_HIGHMEM
+		"HighTotal:      %8lu kB\n"
+		"HighFree:       %8lu kB\n"
+		"LowTotal:       %8lu kB\n"
+		"LowFree:        %8lu kB\n"
+#endif
+#ifndef CONFIG_MMU
+		"MmapCopy:       %8lu kB\n"
+#endif
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Mapped:         %8lu kB\n"
+		"Shmem:          %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		"KernelStack:    %8lu kB\n"
+		"PageTables:     %8lu kB\n"
+#ifdef CONFIG_QUICKLIST
+		"Quicklists:     %8lu kB\n"
+#endif
+		"NFS_Unstable:   %8lu kB\n"
+		"Bounce:         %8lu kB\n"
+		"WritebackTmp:   %8lu kB\n"
+		"CommitLimit:    %8lu kB\n"
+		"Committed_AS:   %8lu kB\n"
+		"VmallocTotal:   %8lu kB\n"
+		"VmallocUsed:    %8lu kB\n"
+		"VmallocChunk:   %8lu kB\n"
+#ifdef CONFIG_MEMORY_FAILURE
+		"HardwareCorrupted: %5lu kB\n"
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		"AnonHugePages:  %8lu kB\n"
+#endif
+#ifdef CONFIG_CMA
+		"CmaTotal:       %8lu kB\n"
+		"CmaFree:        %8lu kB\n"
+#endif
+		,
+		K(i.totalram),
+		K(i.freeram),
+		K(available),
+		K(i.bufferram),
+		K(cached),
+		K(total_swapcache_pages()),
+		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
+		K(pages[LRU_ACTIVE_ANON]),
+		K(pages[LRU_INACTIVE_ANON]),
+		K(pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_FILE]),
+		K(pages[LRU_UNEVICTABLE]),
+		K(global_page_state(NR_MLOCK)),
+#ifdef CONFIG_HIGHMEM
+		K(i.totalhigh),
+		K(i.freehigh),
+		K(i.totalram-i.totalhigh),
+		K(i.freeram-i.freehigh),
+#endif
+#ifndef CONFIG_MMU
+		K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
+#endif
+		K(i.totalswap),
+		K(i.freeswap),
+		K(global_page_state(NR_FILE_DIRTY)),
+		K(global_page_state(NR_WRITEBACK)),
+		K(global_page_state(NR_ANON_PAGES)),
+		K(global_page_state(NR_FILE_MAPPED)),
+		K(i.sharedram),
+		K(global_page_state(NR_SLAB_RECLAIMABLE) +
+				global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE)),
+		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
+		K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_QUICKLIST
+		K(quicklist_total_size()),
+#endif
+		K(global_page_state(NR_UNSTABLE_NFS)),
+		K(global_page_state(NR_BOUNCE)),
+		K(global_page_state(NR_WRITEBACK_TEMP)),
+		K(vm_commit_limit()),
+		K(committed),
+		(unsigned long)VMALLOC_TOTAL >> 10,
+		vmi.used >> 10,
+		vmi.largest_chunk >> 10
+#ifdef CONFIG_MEMORY_FAILURE
+		, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		, K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+		   HPAGE_PMD_NR)
+#endif
+#ifdef CONFIG_CMA
+		, K(totalcma_pages)
+		, K(global_page_state(NR_FREE_CMA_PAGES))
+#endif
+		);
+
+	hugetlb_report_meminfo(m);
+
+	arch_report_meminfo(m);
+
+	return 0;
+#undef K
+}
+
+static int meminfo_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, meminfo_proc_show, NULL);
+}
+
+static const struct file_operations meminfo_proc_fops = {
+	.open		= meminfo_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_meminfo_init(void)
+{
+	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+	return 0;
+}
+fs_initcall(proc_meminfo_init);
diff --git a/kernel/fs/proc/namespaces.c b/kernel/fs/proc/namespaces.c
new file mode 100644
index 000000000..e512642db
--- /dev/null
+++ b/kernel/fs/proc/namespaces.c
@@ -0,0 +1,172 @@
+#include <linux/proc_fs.h>
+#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <net/net_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include "internal.h"
+
+
+static const struct proc_ns_operations *ns_entries[] = {
+#ifdef CONFIG_NET_NS
+	&netns_operations,
+#endif
+#ifdef CONFIG_UTS_NS
+	&utsns_operations,
+#endif
+#ifdef CONFIG_IPC_NS
+	&ipcns_operations,
+#endif
+#ifdef CONFIG_PID_NS
+	&pidns_operations,
+#endif
+#ifdef CONFIG_USER_NS
+	&userns_operations,
+#endif
+	&mntns_operations,
+};
+
+static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = d_inode(dentry);
+	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
+	struct task_struct *task;
+	struct path ns_path;
+	void *error = ERR_PTR(-EACCES);
+
+	task = get_proc_task(inode);
+	if (!task)
+		return error;
+
+	if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+		error = ns_get_path(&ns_path, task, ns_ops);
+		if (!error)
+			nd_jump_link(nd, &ns_path);
+	}
+	put_task_struct(task);
+	return error;
+}
+
+static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+	struct inode *inode = d_inode(dentry);
+	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
+	struct task_struct *task;
+	char name[50];
+	int res = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return res;
+
+	if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+		res = ns_get_name(name, sizeof(name), task, ns_ops);
+		if (res >= 0)
+			res = readlink_copy(buffer, buflen, name);
+	}
+	put_task_struct(task);
+	return res;
+}
+
+static const struct inode_operations proc_ns_link_inode_operations = {
+	.readlink	= proc_ns_readlink,
+	.follow_link	= proc_ns_follow_link,
+	.setattr	= proc_setattr,
+};
+
+static int proc_ns_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
+{
+	const struct proc_ns_operations *ns_ops = ptr;
+	struct inode *inode;
+	struct proc_inode *ei;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	inode->i_mode = S_IFLNK|S_IRWXUGO;
+	inode->i_op = &proc_ns_link_inode_operations;
+	ei->ns_ops = ns_ops;
+
+	d_set_d_op(dentry, &pid_dentry_operations);
+	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, 0))
+		return 0;
+out:
+	return -ENOENT;
+}
+
+static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	const struct proc_ns_operations **entry, **last;
+
+	if (!task)
+		return -ENOENT;
+
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+	if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
+		goto out;
+	entry = ns_entries + (ctx->pos - 2);
+	last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+	while (entry <= last) {
+		const struct proc_ns_operations *ops = *entry;
+		if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
+				     proc_ns_instantiate, task, ops))
+			break;
+		ctx->pos++;
+		entry++;
+	}
+out:
+	put_task_struct(task);
+	return 0;
+}
+
+const struct file_operations proc_ns_dir_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_ns_dir_readdir,
+};
+
+static struct dentry *proc_ns_dir_lookup(struct inode *dir,
+				struct dentry *dentry, unsigned int flags)
+{
+	int error;
+	struct task_struct *task = get_proc_task(dir);
+	const struct proc_ns_operations **entry, **last;
+	unsigned int len = dentry->d_name.len;
+
+	error = -ENOENT;
+
+	if (!task)
+		goto out_no_task;
+
+	last = &ns_entries[ARRAY_SIZE(ns_entries)];
+	for (entry = ns_entries; entry < last; entry++) {
+		if (strlen((*entry)->name) != len)
+			continue;
+		if (!memcmp(dentry->d_name.name, (*entry)->name, len))
+			break;
+	}
+	if (entry == last)
+		goto out;
+
+	error = proc_ns_instantiate(dir, dentry, task, *entry);
+out:
+	put_task_struct(task);
+out_no_task:
+	return ERR_PTR(error);
+}
+
+const struct inode_operations proc_ns_dir_inode_operations = {
+	.lookup		= proc_ns_dir_lookup,
+	.getattr	= pid_getattr,
+	.setattr	= proc_setattr,
+};
diff --git a/kernel/fs/proc/nommu.c b/kernel/fs/proc/nommu.c
new file mode 100644
index 000000000..d4a35746c
--- /dev/null
+++ b/kernel/fs/proc/nommu.c
@@ -0,0 +1,134 @@
+/* nommu.c: mmu-less memory info files
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mman.h>
+#include <linux/proc_fs.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/seq_file.h>
+#include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/div64.h>
+#include "internal.h"
+
+/*
+ * display a single region to a sequenced file
+ */
+static int nommu_region_show(struct seq_file *m, struct vm_region *region)
+{
+	unsigned long ino = 0;
+	struct file *file;
+	dev_t dev = 0;
+	int flags;
+
+	flags = region->vm_flags;
+	file = region->vm_file;
+
+	if (file) {
+		struct inode *inode = file_inode(region->vm_file);
+		dev = inode->i_sb->s_dev;
+		ino = inode->i_ino;
+	}
+
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+	seq_printf(m,
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+		   region->vm_start,
+		   region->vm_end,
+		   flags & VM_READ ? 'r' : '-',
+		   flags & VM_WRITE ? 'w' : '-',
+		   flags & VM_EXEC ? 'x' : '-',
+		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+		   ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
+		   MAJOR(dev), MINOR(dev), ino);
+
+	if (file) {
+		seq_pad(m, ' ');
+		seq_path(m, &file->f_path, "");
+	}
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+/*
+ * display a list of all the REGIONs the kernel knows about
+ * - nommu kernels have a single flat list
+ */
+static int nommu_region_list_show(struct seq_file *m, void *_p)
+{
+	struct rb_node *p = _p;
+
+	return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
+}
+
+static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
+{
+	struct rb_node *p;
+	loff_t pos = *_pos;
+
+	down_read(&nommu_region_sem);
+
+	for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
+		if (pos-- == 0)
+			return p;
+	return NULL;
+}
+
+static void nommu_region_list_stop(struct seq_file *m, void *v)
+{
+	up_read(&nommu_region_sem);
+}
+
+static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return rb_next((struct rb_node *) v);
+}
+
+static const struct seq_operations proc_nommu_region_list_seqop = {
+	.start	= nommu_region_list_start,
+	.next	= nommu_region_list_next,
+	.stop	= nommu_region_list_stop,
+	.show	= nommu_region_list_show
+};
+
+static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &proc_nommu_region_list_seqop);
+}
+
+static const struct file_operations proc_nommu_region_list_operations = {
+	.open    = proc_nommu_region_list_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+static int __init proc_nommu_init(void)
+{
+	proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
+	return 0;
+}
+
+fs_initcall(proc_nommu_init);
diff --git a/kernel/fs/proc/page.c b/kernel/fs/proc/page.c
new file mode 100644
index 000000000..7eee2d8b9
--- /dev/null
+++ b/kernel/fs/proc/page.c
@@ -0,0 +1,234 @@
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/ksm.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/huge_mm.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/hugetlb.h>
+#include <linux/kernel-page-flags.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+#define KPMSIZE sizeof(u64)
+#define KPMMASK (KPMSIZE - 1)
+
+/* /proc/kpagecount - an array exposing page counts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page count.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 pcount;
+
+	pfn = src / KPMSIZE;
+	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EINVAL;
+
+	while (count > 0) {
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		else
+			ppage = NULL;
+		if (!ppage || PageSlab(ppage))
+			pcount = 0;
+		else
+			pcount = page_mapcount(ppage);
+
+		if (put_user(pcount, out)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		pfn++;
+		out++;
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static const struct file_operations proc_kpagecount_operations = {
+	.llseek = mem_lseek,
+	.read = kpagecount_read,
+};
+
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
+
+static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
+{
+	return ((kflags >> kbit) & 1) << ubit;
+}
+
+u64 stable_page_flags(struct page *page)
+{
+	u64 k;
+	u64 u;
+
+	/*
+	 * pseudo flag: KPF_NOPAGE
+	 * it differentiates a memory hole from a page with no flags
+	 */
+	if (!page)
+		return 1 << KPF_NOPAGE;
+
+	k = page->flags;
+	u = 0;
+
+	/*
+	 * pseudo flags for the well known (anonymous) memory mapped pages
+	 *
+	 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
+	 * simple test in page_mapped() is not enough.
+	 */
+	if (!PageSlab(page) && page_mapped(page))
+		u |= 1 << KPF_MMAP;
+	if (PageAnon(page))
+		u |= 1 << KPF_ANON;
+	if (PageKsm(page))
+		u |= 1 << KPF_KSM;
+
+	/*
+	 * compound pages: export both head/tail info
+	 * they together define a compound page's start/end pos and order
+	 */
+	if (PageHead(page))
+		u |= 1 << KPF_COMPOUND_HEAD;
+	if (PageTail(page))
+		u |= 1 << KPF_COMPOUND_TAIL;
+	if (PageHuge(page))
+		u |= 1 << KPF_HUGE;
+	/*
+	 * PageTransCompound can be true for non-huge compound pages (slab
+	 * pages or pages allocated by drivers with __GFP_COMP) because it
+	 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+	 * to make sure a given page is a thp, not a non-huge compound page.
+	 */
+	else if (PageTransCompound(page)) {
+		struct page *head = compound_head(page);
+
+		if (PageLRU(head) || PageAnon(head))
+			u |= 1 << KPF_THP;
+		else if (is_huge_zero_page(head)) {
+			u |= 1 << KPF_ZERO_PAGE;
+			u |= 1 << KPF_THP;
+		}
+	} else if (is_zero_pfn(page_to_pfn(page)))
+		u |= 1 << KPF_ZERO_PAGE;
+
+
+	/*
+	 * Caveats on high order pages: page->_count will only be set
+	 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+	 * SLOB won't set PG_slab at all on compound pages.
+	 */
+	if (PageBuddy(page))
+		u |= 1 << KPF_BUDDY;
+
+	if (PageBalloon(page))
+		u |= 1 << KPF_BALLOON;
+
+	u |= kpf_copy_bit(k, KPF_LOCKED,	PG_locked);
+
+	u |= kpf_copy_bit(k, KPF_SLAB,		PG_slab);
+
+	u |= kpf_copy_bit(k, KPF_ERROR,		PG_error);
+	u |= kpf_copy_bit(k, KPF_DIRTY,		PG_dirty);
+	u |= kpf_copy_bit(k, KPF_UPTODATE,	PG_uptodate);
+	u |= kpf_copy_bit(k, KPF_WRITEBACK,	PG_writeback);
+
+	u |= kpf_copy_bit(k, KPF_LRU,		PG_lru);
+	u |= kpf_copy_bit(k, KPF_REFERENCED,	PG_referenced);
+	u |= kpf_copy_bit(k, KPF_ACTIVE,	PG_active);
+	u |= kpf_copy_bit(k, KPF_RECLAIM,	PG_reclaim);
+
+	u |= kpf_copy_bit(k, KPF_SWAPCACHE,	PG_swapcache);
+	u |= kpf_copy_bit(k, KPF_SWAPBACKED,	PG_swapbacked);
+
+	u |= kpf_copy_bit(k, KPF_UNEVICTABLE,	PG_unevictable);
+	u |= kpf_copy_bit(k, KPF_MLOCKED,	PG_mlocked);
+
+#ifdef CONFIG_MEMORY_FAILURE
+	u |= kpf_copy_bit(k, KPF_HWPOISON,	PG_hwpoison);
+#endif
+
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
+	u |= kpf_copy_bit(k, KPF_UNCACHED,	PG_uncached);
+#endif
+
+	u |= kpf_copy_bit(k, KPF_RESERVED,	PG_reserved);
+	u |= kpf_copy_bit(k, KPF_MAPPEDTODISK,	PG_mappedtodisk);
+	u |= kpf_copy_bit(k, KPF_PRIVATE,	PG_private);
+	u |= kpf_copy_bit(k, KPF_PRIVATE_2,	PG_private_2);
+	u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE,	PG_owner_priv_1);
+	u |= kpf_copy_bit(k, KPF_ARCH,		PG_arch_1);
+
+	return u;
+};
+
+static ssize_t kpageflags_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+
+	pfn = src / KPMSIZE;
+	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EINVAL;
+
+	while (count > 0) {
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		else
+			ppage = NULL;
+
+		if (put_user(stable_page_flags(ppage), out)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		pfn++;
+		out++;
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static const struct file_operations proc_kpageflags_operations = {
+	.llseek = mem_lseek,
+	.read = kpageflags_read,
+};
+
+static int __init proc_page_init(void)
+{
+	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
+	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+	return 0;
+}
+fs_initcall(proc_page_init);
diff --git a/kernel/fs/proc/proc_net.c b/kernel/fs/proc/proc_net.c
new file mode 100644
index 000000000..350984a19
--- /dev/null
+++ b/kernel/fs/proc/proc_net.c
@@ -0,0 +1,233 @@
+/*
+ *  linux/fs/proc/net.c
+ *
+ *  Copyright (C) 2007
+ *
+ *  Author: Eric Biederman <ebiederm@xmission.com>
+ *
+ *  proc net directory handling functions
+ */
+
+#include <asm/uaccess.h>
+
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+#include <linux/seq_file.h>
+
+#include "internal.h"
+
+static inline struct net *PDE_NET(struct proc_dir_entry *pde)
+{
+	return pde->parent->data;
+}
+
+static struct net *get_proc_net(const struct inode *inode)
+{
+	return maybe_get_net(PDE_NET(PDE(inode)));
+}
+
+int seq_open_net(struct inode *ino, struct file *f,
+		 const struct seq_operations *ops, int size)
+{
+	struct net *net;
+	struct seq_net_private *p;
+
+	BUG_ON(size < sizeof(*p));
+
+	net = get_proc_net(ino);
+	if (net == NULL)
+		return -ENXIO;
+
+	p = __seq_open_private(f, ops, size);
+	if (p == NULL) {
+		put_net(net);
+		return -ENOMEM;
+	}
+#ifdef CONFIG_NET_NS
+	p->net = net;
+#endif
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seq_open_net);
+
+int single_open_net(struct inode *inode, struct file *file,
+		int (*show)(struct seq_file *, void *))
+{
+	int err;
+	struct net *net;
+
+	err = -ENXIO;
+	net = get_proc_net(inode);
+	if (net == NULL)
+		goto err_net;
+
+	err = single_open(file, show, net);
+	if (err < 0)
+		goto err_open;
+
+	return 0;
+
+err_open:
+	put_net(net);
+err_net:
+	return err;
+}
+EXPORT_SYMBOL_GPL(single_open_net);
+
+int seq_release_net(struct inode *ino, struct file *f)
+{
+	struct seq_file *seq;
+
+	seq = f->private_data;
+
+	put_net(seq_file_net(seq));
+	seq_release_private(ino, f);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seq_release_net);
+
+int single_release_net(struct inode *ino, struct file *f)
+{
+	struct seq_file *seq = f->private_data;
+	put_net(seq->private);
+	return single_release(ino, f);
+}
+EXPORT_SYMBOL_GPL(single_release_net);
+
+static struct net *get_proc_task_net(struct inode *dir)
+{
+	struct task_struct *task;
+	struct nsproxy *ns;
+	struct net *net = NULL;
+
+	rcu_read_lock();
+	task = pid_task(proc_pid(dir), PIDTYPE_PID);
+	if (task != NULL) {
+		task_lock(task);
+		ns = task->nsproxy;
+		if (ns != NULL)
+			net = get_net(ns->net_ns);
+		task_unlock(task);
+	}
+	rcu_read_unlock();
+
+	return net;
+}
+
+static struct dentry *proc_tgid_net_lookup(struct inode *dir,
+		struct dentry *dentry, unsigned int flags)
+{
+	struct dentry *de;
+	struct net *net;
+
+	de = ERR_PTR(-ENOENT);
+	net = get_proc_task_net(dir);
+	if (net != NULL) {
+		de = proc_lookup_de(net->proc_net, dir, dentry);
+		put_net(net);
+	}
+	return de;
+}
+
+static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	struct net *net;
+
+	net = get_proc_task_net(inode);
+
+	generic_fillattr(inode, stat);
+
+	if (net != NULL) {
+		stat->nlink = net->proc_net->nlink;
+		put_net(net);
+	}
+
+	return 0;
+}
+
+const struct inode_operations proc_net_inode_operations = {
+	.lookup		= proc_tgid_net_lookup,
+	.getattr	= proc_tgid_net_getattr,
+};
+
+static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
+{
+	int ret;
+	struct net *net;
+
+	ret = -EINVAL;
+	net = get_proc_task_net(file_inode(file));
+	if (net != NULL) {
+		ret = proc_readdir_de(net->proc_net, file, ctx);
+		put_net(net);
+	}
+	return ret;
+}
+
+const struct file_operations proc_net_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= proc_tgid_net_readdir,
+};
+
+static __net_init int proc_net_ns_init(struct net *net)
+{
+	struct proc_dir_entry *netd, *net_statd;
+	int err;
+
+	err = -ENOMEM;
+	netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL);
+	if (!netd)
+		goto out;
+
+	netd->subdir = RB_ROOT;
+	netd->data = net;
+	netd->nlink = 2;
+	netd->namelen = 3;
+	netd->parent = &proc_root;
+	memcpy(netd->name, "net", 4);
+
+	err = -EEXIST;
+	net_statd = proc_net_mkdir(net, "stat", netd);
+	if (!net_statd)
+		goto free_net;
+
+	net->proc_net = netd;
+	net->proc_net_stat = net_statd;
+	return 0;
+
+free_net:
+	kfree(netd);
+out:
+	return err;
+}
+
+static __net_exit void proc_net_ns_exit(struct net *net)
+{
+	remove_proc_entry("stat", net->proc_net);
+	kfree(net->proc_net);
+}
+
+static struct pernet_operations __net_initdata proc_net_ns_ops = {
+	.init = proc_net_ns_init,
+	.exit = proc_net_ns_exit,
+};
+
+int __init proc_net_init(void)
+{
+	proc_symlink("net", NULL, "self/net");
+
+	return register_pernet_subsys(&proc_net_ns_ops);
+}
diff --git a/kernel/fs/proc/proc_sysctl.c b/kernel/fs/proc/proc_sysctl.c
new file mode 100644
index 000000000..fdda62e61
--- /dev/null
+++ b/kernel/fs/proc/proc_sysctl.c
@@ -0,0 +1,1619 @@
+/*
+ * /proc/sys support
+ */
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/printk.h>
+#include <linux/security.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include "internal.h"
+
+static const struct dentry_operations proc_sys_dentry_operations;
+static const struct file_operations proc_sys_file_operations;
+static const struct inode_operations proc_sys_inode_operations;
+static const struct file_operations proc_sys_dir_file_operations;
+static const struct inode_operations proc_sys_dir_operations;
+
+/* Support for permanently empty directories */
+
+struct ctl_table sysctl_mount_point[] = {
+	{ }
+};
+
+static bool is_empty_dir(struct ctl_table_header *head)
+{
+	return head->ctl_table[0].child == sysctl_mount_point;
+}
+
+static void set_empty_dir(struct ctl_dir *dir)
+{
+	dir->header.ctl_table[0].child = sysctl_mount_point;
+}
+
+static void clear_empty_dir(struct ctl_dir *dir)
+
+{
+	dir->header.ctl_table[0].child = NULL;
+}
+
+void proc_sys_poll_notify(struct ctl_table_poll *poll)
+{
+	if (!poll)
+		return;
+
+	atomic_inc(&poll->event);
+	wake_up_interruptible(&poll->wait);
+}
+
+static struct ctl_table root_table[] = {
+	{
+		.procname = "",
+		.mode = S_IFDIR|S_IRUGO|S_IXUGO,
+	},
+	{ }
+};
+static struct ctl_table_root sysctl_table_root = {
+	.default_set.dir.header = {
+		{{.count = 1,
+		  .nreg = 1,
+		  .ctl_table = root_table }},
+		.ctl_table_arg = root_table,
+		.root = &sysctl_table_root,
+		.set = &sysctl_table_root.default_set,
+	},
+};
+
+static DEFINE_SPINLOCK(sysctl_lock);
+
+static void drop_sysctl_table(struct ctl_table_header *header);
+static int sysctl_follow_link(struct ctl_table_header **phead,
+	struct ctl_table **pentry, struct nsproxy *namespaces);
+static int insert_links(struct ctl_table_header *head);
+static void put_links(struct ctl_table_header *header);
+
+static void sysctl_print_dir(struct ctl_dir *dir)
+{
+	if (dir->header.parent)
+		sysctl_print_dir(dir->header.parent);
+	pr_cont("%s/", dir->header.ctl_table[0].procname);
+}
+
+static int namecmp(const char *name1, int len1, const char *name2, int len2)
+{
+	int minlen;
+	int cmp;
+
+	minlen = len1;
+	if (minlen > len2)
+		minlen = len2;
+
+	cmp = memcmp(name1, name2, minlen);
+	if (cmp == 0)
+		cmp = len1 - len2;
+	return cmp;
+}
+
+/* Called under sysctl_lock */
+static struct ctl_table *find_entry(struct ctl_table_header **phead,
+	struct ctl_dir *dir, const char *name, int namelen)
+{
+	struct ctl_table_header *head;
+	struct ctl_table *entry;
+	struct rb_node *node = dir->root.rb_node;
+
+	while (node)
+	{
+		struct ctl_node *ctl_node;
+		const char *procname;
+		int cmp;
+
+		ctl_node = rb_entry(node, struct ctl_node, node);
+		head = ctl_node->header;
+		entry = &head->ctl_table[ctl_node - head->node];
+		procname = entry->procname;
+
+		cmp = namecmp(name, namelen, procname, strlen(procname));
+		if (cmp < 0)
+			node = node->rb_left;
+		else if (cmp > 0)
+			node = node->rb_right;
+		else {
+			*phead = head;
+			return entry;
+		}
+	}
+	return NULL;
+}
+
+static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+	struct rb_node *node = &head->node[entry - head->ctl_table].node;
+	struct rb_node **p = &head->parent->root.rb_node;
+	struct rb_node *parent = NULL;
+	const char *name = entry->procname;
+	int namelen = strlen(name);
+
+	while (*p) {
+		struct ctl_table_header *parent_head;
+		struct ctl_table *parent_entry;
+		struct ctl_node *parent_node;
+		const char *parent_name;
+		int cmp;
+
+		parent = *p;
+		parent_node = rb_entry(parent, struct ctl_node, node);
+		parent_head = parent_node->header;
+		parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
+		parent_name = parent_entry->procname;
+
+		cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else if (cmp > 0)
+			p = &(*p)->rb_right;
+		else {
+			pr_err("sysctl duplicate entry: ");
+			sysctl_print_dir(head->parent);
+			pr_cont("/%s\n", entry->procname);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, &head->parent->root);
+	return 0;
+}
+
+static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+	struct rb_node *node = &head->node[entry - head->ctl_table].node;
+
+	rb_erase(node, &head->parent->root);
+}
+
+static void init_header(struct ctl_table_header *head,
+	struct ctl_table_root *root, struct ctl_table_set *set,
+	struct ctl_node *node, struct ctl_table *table)
+{
+	head->ctl_table = table;
+	head->ctl_table_arg = table;
+	head->used = 0;
+	head->count = 1;
+	head->nreg = 1;
+	head->unregistering = NULL;
+	head->root = root;
+	head->set = set;
+	head->parent = NULL;
+	head->node = node;
+	if (node) {
+		struct ctl_table *entry;
+		for (entry = table; entry->procname; entry++, node++)
+			node->header = head;
+	}
+}
+
+static void erase_header(struct ctl_table_header *head)
+{
+	struct ctl_table *entry;
+	for (entry = head->ctl_table; entry->procname; entry++)
+		erase_entry(head, entry);
+}
+
+static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
+{
+	struct ctl_table *entry;
+	int err;
+
+	/* Is this a permanently empty directory? */
+	if (is_empty_dir(&dir->header))
+		return -EROFS;
+
+	/* Am I creating a permanently empty directory? */
+	if (header->ctl_table == sysctl_mount_point) {
+		if (!RB_EMPTY_ROOT(&dir->root))
+			return -EINVAL;
+		set_empty_dir(dir);
+	}
+
+	dir->header.nreg++;
+	header->parent = dir;
+	err = insert_links(header);
+	if (err)
+		goto fail_links;
+	for (entry = header->ctl_table; entry->procname; entry++) {
+		err = insert_entry(header, entry);
+		if (err)
+			goto fail;
+	}
+	return 0;
+fail:
+	erase_header(header);
+	put_links(header);
+fail_links:
+	if (header->ctl_table == sysctl_mount_point)
+		clear_empty_dir(dir);
+	header->parent = NULL;
+	drop_sysctl_table(&dir->header);
+	return err;
+}
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+	if (unlikely(p->unregistering))
+		return 0;
+	p->used++;
+	return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+	if (!--p->used)
+		if (unlikely(p->unregistering))
+			complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+	/*
+	 * if p->used is 0, nobody will ever touch that entry again;
+	 * we'll eliminate all paths to it before dropping sysctl_lock
+	 */
+	if (unlikely(p->used)) {
+		struct completion wait;
+		init_completion(&wait);
+		p->unregistering = &wait;
+		spin_unlock(&sysctl_lock);
+		wait_for_completion(&wait);
+		spin_lock(&sysctl_lock);
+	} else {
+		/* anything non-NULL; we'll never dereference it */
+		p->unregistering = ERR_PTR(-EINVAL);
+	}
+	/*
+	 * do not remove from the list until nobody holds it; walking the
+	 * list in do_sysctl() relies on that.
+	 */
+	erase_header(p);
+}
+
+static void sysctl_head_get(struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	head->count++;
+	spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	if (!--head->count)
+		kfree_rcu(head, rcu);
+	spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+	BUG_ON(!head);
+	spin_lock(&sysctl_lock);
+	if (!use_table(head))
+		head = ERR_PTR(-ENOENT);
+	spin_unlock(&sysctl_lock);
+	return head;
+}
+
+static void sysctl_head_finish(struct ctl_table_header *head)
+{
+	if (!head)
+		return;
+	spin_lock(&sysctl_lock);
+	unuse_table(head);
+	spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+	struct ctl_table_set *set = &root->default_set;
+	if (root->lookup)
+		set = root->lookup(root, namespaces);
+	return set;
+}
+
+static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+				      struct ctl_dir *dir,
+				      const char *name, int namelen)
+{
+	struct ctl_table_header *head;
+	struct ctl_table *entry;
+
+	spin_lock(&sysctl_lock);
+	entry = find_entry(&head, dir, name, namelen);
+	if (entry && use_table(head))
+		*phead = head;
+	else
+		entry = NULL;
+	spin_unlock(&sysctl_lock);
+	return entry;
+}
+
+static struct ctl_node *first_usable_entry(struct rb_node *node)
+{
+	struct ctl_node *ctl_node;
+
+	for (;node; node = rb_next(node)) {
+		ctl_node = rb_entry(node, struct ctl_node, node);
+		if (use_table(ctl_node->header))
+			return ctl_node;
+	}
+	return NULL;
+}
+
+static void first_entry(struct ctl_dir *dir,
+	struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+	struct ctl_table_header *head = NULL;
+	struct ctl_table *entry = NULL;
+	struct ctl_node *ctl_node;
+
+	spin_lock(&sysctl_lock);
+	ctl_node = first_usable_entry(rb_first(&dir->root));
+	spin_unlock(&sysctl_lock);
+	if (ctl_node) {
+		head = ctl_node->header;
+		entry = &head->ctl_table[ctl_node - head->node];
+	}
+	*phead = head;
+	*pentry = entry;
+}
+
+static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+	struct ctl_table_header *head = *phead;
+	struct ctl_table *entry = *pentry;
+	struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
+
+	spin_lock(&sysctl_lock);
+	unuse_table(head);
+
+	ctl_node = first_usable_entry(rb_next(&ctl_node->node));
+	spin_unlock(&sysctl_lock);
+	head = NULL;
+	if (ctl_node) {
+		head = ctl_node->header;
+		entry = &head->ctl_table[ctl_node - head->node];
+	}
+	*phead = head;
+	*pentry = entry;
+}
+
+void register_sysctl_root(struct ctl_table_root *root)
+{
+}
+
+/*
+ * sysctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+	if (uid_eq(current_euid(), GLOBAL_ROOT_UID))
+		mode >>= 6;
+	else if (in_egroup_p(GLOBAL_ROOT_GID))
+		mode >>= 3;
+	if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
+		return 0;
+	return -EACCES;
+}
+
+static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
+{
+	struct ctl_table_root *root = head->root;
+	int mode;
+
+	if (root->permissions)
+		mode = root->permissions(head, table);
+	else
+		mode = table->mode;
+
+	return test_perm(mode, op);
+}
+
+static struct inode *proc_sys_make_inode(struct super_block *sb,
+		struct ctl_table_header *head, struct ctl_table *table)
+{
+	struct inode *inode;
+	struct proc_inode *ei;
+
+	inode = new_inode(sb);
+	if (!inode)
+		goto out;
+
+	inode->i_ino = get_next_ino();
+
+	sysctl_head_get(head);
+	ei = PROC_I(inode);
+	ei->sysctl = head;
+	ei->sysctl_entry = table;
+
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mode = table->mode;
+	if (!S_ISDIR(table->mode)) {
+		inode->i_mode |= S_IFREG;
+		inode->i_op = &proc_sys_inode_operations;
+		inode->i_fop = &proc_sys_file_operations;
+	} else {
+		inode->i_mode |= S_IFDIR;
+		inode->i_op = &proc_sys_dir_operations;
+		inode->i_fop = &proc_sys_dir_file_operations;
+		if (is_empty_dir(head))
+			make_empty_dir_inode(inode);
+	}
+out:
+	return inode;
+}
+
+static struct ctl_table_header *grab_header(struct inode *inode)
+{
+	struct ctl_table_header *head = PROC_I(inode)->sysctl;
+	if (!head)
+		head = &sysctl_table_root.default_set.dir.header;
+	return sysctl_head_grab(head);
+}
+
+static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
+					unsigned int flags)
+{
+	struct ctl_table_header *head = grab_header(dir);
+	struct ctl_table_header *h = NULL;
+	struct qstr *name = &dentry->d_name;
+	struct ctl_table *p;
+	struct inode *inode;
+	struct dentry *err = ERR_PTR(-ENOENT);
+	struct ctl_dir *ctl_dir;
+	int ret;
+
+	if (IS_ERR(head))
+		return ERR_CAST(head);
+
+	ctl_dir = container_of(head, struct ctl_dir, header);
+
+	p = lookup_entry(&h, ctl_dir, name->name, name->len);
+	if (!p)
+		goto out;
+
+	if (S_ISLNK(p->mode)) {
+		ret = sysctl_follow_link(&h, &p, current->nsproxy);
+		err = ERR_PTR(ret);
+		if (ret)
+			goto out;
+	}
+
+	err = ERR_PTR(-ENOMEM);
+	inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
+	if (!inode)
+		goto out;
+
+	err = NULL;
+	d_set_d_op(dentry, &proc_sys_dentry_operations);
+	d_add(dentry, inode);
+
+out:
+	if (h)
+		sysctl_head_finish(h);
+	sysctl_head_finish(head);
+	return err;
+}
+
+static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
+		size_t count, loff_t *ppos, int write)
+{
+	struct inode *inode = file_inode(filp);
+	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	ssize_t error;
+	size_t res;
+
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	/*
+	 * At this point we know that the sysctl was not unregistered
+	 * and won't be until we finish.
+	 */
+	error = -EPERM;
+	if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
+		goto out;
+
+	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
+	error = -EINVAL;
+	if (!table->proc_handler)
+		goto out;
+
+	/* careful: calling conventions are nasty here */
+	res = count;
+	error = table->proc_handler(table, write, buf, &res, ppos);
+	if (!error)
+		error = res;
+out:
+	sysctl_head_finish(head);
+
+	return error;
+}
+
+static ssize_t proc_sys_read(struct file *filp, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
+}
+
+static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
+}
+
+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+	/* sysctl was unregistered */
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	if (table->poll)
+		filp->private_data = proc_sys_poll_event(table->poll);
+
+	sysctl_head_finish(head);
+
+	return 0;
+}
+
+static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+{
+	struct inode *inode = file_inode(filp);
+	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	unsigned int ret = DEFAULT_POLLMASK;
+	unsigned long event;
+
+	/* sysctl was unregistered */
+	if (IS_ERR(head))
+		return POLLERR | POLLHUP;
+
+	if (!table->proc_handler)
+		goto out;
+
+	if (!table->poll)
+		goto out;
+
+	event = (unsigned long)filp->private_data;
+	poll_wait(filp, &table->poll->wait, wait);
+
+	if (event != atomic_read(&table->poll->event)) {
+		filp->private_data = proc_sys_poll_event(table->poll);
+		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+	}
+
+out:
+	sysctl_head_finish(head);
+
+	return ret;
+}
+
+static bool proc_sys_fill_cache(struct file *file,
+				struct dir_context *ctx,
+				struct ctl_table_header *head,
+				struct ctl_table *table)
+{
+	struct dentry *child, *dir = file->f_path.dentry;
+	struct inode *inode;
+	struct qstr qname;
+	ino_t ino = 0;
+	unsigned type = DT_UNKNOWN;
+
+	qname.name = table->procname;
+	qname.len  = strlen(table->procname);
+	qname.hash = full_name_hash(qname.name, qname.len);
+
+	child = d_lookup(dir, &qname);
+	if (!child) {
+		child = d_alloc(dir, &qname);
+		if (child) {
+			inode = proc_sys_make_inode(dir->d_sb, head, table);
+			if (!inode) {
+				dput(child);
+				return false;
+			} else {
+				d_set_d_op(child, &proc_sys_dentry_operations);
+				d_add(child, inode);
+			}
+		} else {
+			return false;
+		}
+	}
+	inode = d_inode(child);
+	ino  = inode->i_ino;
+	type = inode->i_mode >> 12;
+	dput(child);
+	return dir_emit(ctx, qname.name, qname.len, ino, type);
+}
+
+static bool proc_sys_link_fill_cache(struct file *file,
+				    struct dir_context *ctx,
+				    struct ctl_table_header *head,
+				    struct ctl_table *table)
+{
+	bool ret = true;
+	head = sysctl_head_grab(head);
+
+	if (S_ISLNK(table->mode)) {
+		/* It is not an error if we can not follow the link ignore it */
+		int err = sysctl_follow_link(&head, &table, current->nsproxy);
+		if (err)
+			goto out;
+	}
+
+	ret = proc_sys_fill_cache(file, ctx, head, table);
+out:
+	sysctl_head_finish(head);
+	return ret;
+}
+
+static int scan(struct ctl_table_header *head, struct ctl_table *table,
+		unsigned long *pos, struct file *file,
+		struct dir_context *ctx)
+{
+	bool res;
+
+	if ((*pos)++ < ctx->pos)
+		return true;
+
+	if (unlikely(S_ISLNK(table->mode)))
+		res = proc_sys_link_fill_cache(file, ctx, head, table);
+	else
+		res = proc_sys_fill_cache(file, ctx, head, table);
+
+	if (res)
+		ctx->pos = *pos;
+
+	return res;
+}
+
+static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct ctl_table_header *head = grab_header(file_inode(file));
+	struct ctl_table_header *h = NULL;
+	struct ctl_table *entry;
+	struct ctl_dir *ctl_dir;
+	unsigned long pos;
+
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	ctl_dir = container_of(head, struct ctl_dir, header);
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	pos = 2;
+
+	for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
+		if (!scan(h, entry, &pos, file, ctx)) {
+			sysctl_head_finish(h);
+			break;
+		}
+	}
+	sysctl_head_finish(head);
+	return 0;
+}
+
+static int proc_sys_permission(struct inode *inode, int mask)
+{
+	/*
+	 * sysctl entries that are not writeable,
+	 * are _NOT_ writeable, capabilities or not.
+	 */
+	struct ctl_table_header *head;
+	struct ctl_table *table;
+	int error;
+
+	/* Executable files are not allowed under /proc/sys/ */
+	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
+		return -EACCES;
+
+	head = grab_header(inode);
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	table = PROC_I(inode)->sysctl_entry;
+	if (!table) /* global root - r-xr-xr-x */
+		error = mask & MAY_WRITE ? -EACCES : 0;
+	else /* Use the permissions on the sysctl table entry */
+		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
+
+	sysctl_head_finish(head);
+	return error;
+}
+
+static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
+		return -EPERM;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	generic_fillattr(inode, stat);
+	if (table)
+		stat->mode = (stat->mode & S_IFMT) | table->mode;
+
+	sysctl_head_finish(head);
+	return 0;
+}
+
+static const struct file_operations proc_sys_file_operations = {
+	.open		= proc_sys_open,
+	.poll		= proc_sys_poll,
+	.read		= proc_sys_read,
+	.write		= proc_sys_write,
+	.llseek		= default_llseek,
+};
+
+static const struct file_operations proc_sys_dir_file_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_sys_readdir,
+	.llseek		= generic_file_llseek,
+};
+
+static const struct inode_operations proc_sys_inode_operations = {
+	.permission	= proc_sys_permission,
+	.setattr	= proc_sys_setattr,
+	.getattr	= proc_sys_getattr,
+};
+
+static const struct inode_operations proc_sys_dir_operations = {
+	.lookup		= proc_sys_lookup,
+	.permission	= proc_sys_permission,
+	.setattr	= proc_sys_setattr,
+	.getattr	= proc_sys_getattr,
+};
+
+static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+	return !PROC_I(d_inode(dentry))->sysctl->unregistering;
+}
+
+static int proc_sys_delete(const struct dentry *dentry)
+{
+	return !!PROC_I(d_inode(dentry))->sysctl->unregistering;
+}
+
+static int sysctl_is_seen(struct ctl_table_header *p)
+{
+	struct ctl_table_set *set = p->set;
+	int res;
+	spin_lock(&sysctl_lock);
+	if (p->unregistering)
+		res = 0;
+	else if (!set->is_seen)
+		res = 1;
+	else
+		res = set->is_seen(set);
+	spin_unlock(&sysctl_lock);
+	return res;
+}
+
+static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	struct ctl_table_header *head;
+	struct inode *inode;
+
+	/* Although proc doesn't have negative dentries, rcu-walk means
+	 * that inode here can be NULL */
+	/* AV: can it, indeed? */
+	inode = d_inode_rcu(dentry);
+	if (!inode)
+		return 1;
+	if (name->len != len)
+		return 1;
+	if (memcmp(name->name, str, len))
+		return 1;
+	head = rcu_dereference(PROC_I(inode)->sysctl);
+	return !head || !sysctl_is_seen(head);
+}
+
+static const struct dentry_operations proc_sys_dentry_operations = {
+	.d_revalidate	= proc_sys_revalidate,
+	.d_delete	= proc_sys_delete,
+	.d_compare	= proc_sys_compare,
+};
+
+static struct ctl_dir *find_subdir(struct ctl_dir *dir,
+				   const char *name, int namelen)
+{
+	struct ctl_table_header *head;
+	struct ctl_table *entry;
+
+	entry = find_entry(&head, dir, name, namelen);
+	if (!entry)
+		return ERR_PTR(-ENOENT);
+	if (!S_ISDIR(entry->mode))
+		return ERR_PTR(-ENOTDIR);
+	return container_of(head, struct ctl_dir, header);
+}
+
+static struct ctl_dir *new_dir(struct ctl_table_set *set,
+			       const char *name, int namelen)
+{
+	struct ctl_table *table;
+	struct ctl_dir *new;
+	struct ctl_node *node;
+	char *new_name;
+
+	new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
+		      sizeof(struct ctl_table)*2 +  namelen + 1,
+		      GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	node = (struct ctl_node *)(new + 1);
+	table = (struct ctl_table *)(node + 1);
+	new_name = (char *)(table + 2);
+	memcpy(new_name, name, namelen);
+	new_name[namelen] = '\0';
+	table[0].procname = new_name;
+	table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
+	init_header(&new->header, set->dir.header.root, set, node, table);
+
+	return new;
+}
+
+/**
+ * get_subdir - find or create a subdir with the specified name.
+ * @dir:  Directory to create the subdirectory in
+ * @name: The name of the subdirectory to find or create
+ * @namelen: The length of name
+ *
+ * Takes a directory with an elevated reference count so we know that
+ * if we drop the lock the directory will not go away.  Upon success
+ * the reference is moved from @dir to the returned subdirectory.
+ * Upon error an error code is returned and the reference on @dir is
+ * simply dropped.
+ */
+static struct ctl_dir *get_subdir(struct ctl_dir *dir,
+				  const char *name, int namelen)
+{
+	struct ctl_table_set *set = dir->header.set;
+	struct ctl_dir *subdir, *new = NULL;
+	int err;
+
+	spin_lock(&sysctl_lock);
+	subdir = find_subdir(dir, name, namelen);
+	if (!IS_ERR(subdir))
+		goto found;
+	if (PTR_ERR(subdir) != -ENOENT)
+		goto failed;
+
+	spin_unlock(&sysctl_lock);
+	new = new_dir(set, name, namelen);
+	spin_lock(&sysctl_lock);
+	subdir = ERR_PTR(-ENOMEM);
+	if (!new)
+		goto failed;
+
+	/* Was the subdir added while we dropped the lock? */
+	subdir = find_subdir(dir, name, namelen);
+	if (!IS_ERR(subdir))
+		goto found;
+	if (PTR_ERR(subdir) != -ENOENT)
+		goto failed;
+
+	/* Nope.  Use the our freshly made directory entry. */
+	err = insert_header(dir, &new->header);
+	subdir = ERR_PTR(err);
+	if (err)
+		goto failed;
+	subdir = new;
+found:
+	subdir->header.nreg++;
+failed:
+	if (unlikely(IS_ERR(subdir))) {
+		pr_err("sysctl could not get directory: ");
+		sysctl_print_dir(dir);
+		pr_cont("/%*.*s %ld\n",
+			namelen, namelen, name, PTR_ERR(subdir));
+	}
+	drop_sysctl_table(&dir->header);
+	if (new)
+		drop_sysctl_table(&new->header);
+	spin_unlock(&sysctl_lock);
+	return subdir;
+}
+
+static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
+{
+	struct ctl_dir *parent;
+	const char *procname;
+	if (!dir->header.parent)
+		return &set->dir;
+	parent = xlate_dir(set, dir->header.parent);
+	if (IS_ERR(parent))
+		return parent;
+	procname = dir->header.ctl_table[0].procname;
+	return find_subdir(parent, procname, strlen(procname));
+}
+
+static int sysctl_follow_link(struct ctl_table_header **phead,
+	struct ctl_table **pentry, struct nsproxy *namespaces)
+{
+	struct ctl_table_header *head;
+	struct ctl_table_root *root;
+	struct ctl_table_set *set;
+	struct ctl_table *entry;
+	struct ctl_dir *dir;
+	int ret;
+
+	ret = 0;
+	spin_lock(&sysctl_lock);
+	root = (*pentry)->data;
+	set = lookup_header_set(root, namespaces);
+	dir = xlate_dir(set, (*phead)->parent);
+	if (IS_ERR(dir))
+		ret = PTR_ERR(dir);
+	else {
+		const char *procname = (*pentry)->procname;
+		head = NULL;
+		entry = find_entry(&head, dir, procname, strlen(procname));
+		ret = -ENOENT;
+		if (entry && use_table(head)) {
+			unuse_table(*phead);
+			*phead = head;
+			*pentry = entry;
+			ret = 0;
+		}
+	}
+
+	spin_unlock(&sysctl_lock);
+	return ret;
+}
+
+static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_err("sysctl table check failed: %s/%s %pV\n",
+	       path, table->procname, &vaf);
+
+	va_end(args);
+	return -EINVAL;
+}
+
+static int sysctl_check_table(const char *path, struct ctl_table *table)
+{
+	int err = 0;
+	for (; table->procname; table++) {
+		if (table->child)
+			err = sysctl_err(path, table, "Not a file");
+
+		if ((table->proc_handler == proc_dostring) ||
+		    (table->proc_handler == proc_dointvec) ||
+		    (table->proc_handler == proc_dointvec_minmax) ||
+		    (table->proc_handler == proc_dointvec_jiffies) ||
+		    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
+		    (table->proc_handler == proc_dointvec_ms_jiffies) ||
+		    (table->proc_handler == proc_doulongvec_minmax) ||
+		    (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+			if (!table->data)
+				err = sysctl_err(path, table, "No data");
+			if (!table->maxlen)
+				err = sysctl_err(path, table, "No maxlen");
+		}
+		if (!table->proc_handler)
+			err = sysctl_err(path, table, "No proc_handler");
+
+		if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
+			err = sysctl_err(path, table, "bogus .mode 0%o",
+				table->mode);
+	}
+	return err;
+}
+
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
+	struct ctl_table_root *link_root)
+{
+	struct ctl_table *link_table, *entry, *link;
+	struct ctl_table_header *links;
+	struct ctl_node *node;
+	char *link_name;
+	int nr_entries, name_bytes;
+
+	name_bytes = 0;
+	nr_entries = 0;
+	for (entry = table; entry->procname; entry++) {
+		nr_entries++;
+		name_bytes += strlen(entry->procname) + 1;
+	}
+
+	links = kzalloc(sizeof(struct ctl_table_header) +
+			sizeof(struct ctl_node)*nr_entries +
+			sizeof(struct ctl_table)*(nr_entries + 1) +
+			name_bytes,
+			GFP_KERNEL);
+
+	if (!links)
+		return NULL;
+
+	node = (struct ctl_node *)(links + 1);
+	link_table = (struct ctl_table *)(node + nr_entries);
+	link_name = (char *)&link_table[nr_entries + 1];
+
+	for (link = link_table, entry = table; entry->procname; link++, entry++) {
+		int len = strlen(entry->procname) + 1;
+		memcpy(link_name, entry->procname, len);
+		link->procname = link_name;
+		link->mode = S_IFLNK|S_IRWXUGO;
+		link->data = link_root;
+		link_name += len;
+	}
+	init_header(links, dir->header.root, dir->header.set, node, link_table);
+	links->nreg = nr_entries;
+
+	return links;
+}
+
+static bool get_links(struct ctl_dir *dir,
+	struct ctl_table *table, struct ctl_table_root *link_root)
+{
+	struct ctl_table_header *head;
+	struct ctl_table *entry, *link;
+
+	/* Are there links available for every entry in table? */
+	for (entry = table; entry->procname; entry++) {
+		const char *procname = entry->procname;
+		link = find_entry(&head, dir, procname, strlen(procname));
+		if (!link)
+			return false;
+		if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
+			continue;
+		if (S_ISLNK(link->mode) && (link->data == link_root))
+			continue;
+		return false;
+	}
+
+	/* The checks passed.  Increase the registration count on the links */
+	for (entry = table; entry->procname; entry++) {
+		const char *procname = entry->procname;
+		link = find_entry(&head, dir, procname, strlen(procname));
+		head->nreg++;
+	}
+	return true;
+}
+
+static int insert_links(struct ctl_table_header *head)
+{
+	struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+	struct ctl_dir *core_parent = NULL;
+	struct ctl_table_header *links;
+	int err;
+
+	if (head->set == root_set)
+		return 0;
+
+	core_parent = xlate_dir(root_set, head->parent);
+	if (IS_ERR(core_parent))
+		return 0;
+
+	if (get_links(core_parent, head->ctl_table, head->root))
+		return 0;
+
+	core_parent->header.nreg++;
+	spin_unlock(&sysctl_lock);
+
+	links = new_links(core_parent, head->ctl_table, head->root);
+
+	spin_lock(&sysctl_lock);
+	err = -ENOMEM;
+	if (!links)
+		goto out;
+
+	err = 0;
+	if (get_links(core_parent, head->ctl_table, head->root)) {
+		kfree(links);
+		goto out;
+	}
+
+	err = insert_header(core_parent, links);
+	if (err)
+		kfree(links);
+out:
+	drop_sysctl_table(&core_parent->header);
+	return err;
+}
+
+/**
+ * __register_sysctl_table - register a leaf sysctl table
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * The members of the &struct ctl_table structure are used as follows:
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ *            enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file
+ *
+ * child - must be %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes will be represented by directories.
+ *
+ * There must be a proc_handler routine for any terminal nodes.
+ * Several default handlers are available to cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
+ * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *__register_sysctl_table(
+	struct ctl_table_set *set,
+	const char *path, struct ctl_table *table)
+{
+	struct ctl_table_root *root = set->dir.header.root;
+	struct ctl_table_header *header;
+	const char *name, *nextname;
+	struct ctl_dir *dir;
+	struct ctl_table *entry;
+	struct ctl_node *node;
+	int nr_entries = 0;
+
+	for (entry = table; entry->procname; entry++)
+		nr_entries++;
+
+	header = kzalloc(sizeof(struct ctl_table_header) +
+			 sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+	if (!header)
+		return NULL;
+
+	node = (struct ctl_node *)(header + 1);
+	init_header(header, root, set, node, table);
+	if (sysctl_check_table(path, table))
+		goto fail;
+
+	spin_lock(&sysctl_lock);
+	dir = &set->dir;
+	/* Reference moved down the diretory tree get_subdir */
+	dir->header.nreg++;
+	spin_unlock(&sysctl_lock);
+
+	/* Find the directory for the ctl_table */
+	for (name = path; name; name = nextname) {
+		int namelen;
+		nextname = strchr(name, '/');
+		if (nextname) {
+			namelen = nextname - name;
+			nextname++;
+		} else {
+			namelen = strlen(name);
+		}
+		if (namelen == 0)
+			continue;
+
+		dir = get_subdir(dir, name, namelen);
+		if (IS_ERR(dir))
+			goto fail;
+	}
+
+	spin_lock(&sysctl_lock);
+	if (insert_header(dir, header))
+		goto fail_put_dir_locked;
+
+	drop_sysctl_table(&dir->header);
+	spin_unlock(&sysctl_lock);
+
+	return header;
+
+fail_put_dir_locked:
+	drop_sysctl_table(&dir->header);
+	spin_unlock(&sysctl_lock);
+fail:
+	kfree(header);
+	dump_stack();
+	return NULL;
+}
+
+/**
+ * register_sysctl - register a sysctl table
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the table structure
+ *
+ * Register a sysctl table. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+{
+	return __register_sysctl_table(&sysctl_table_root.default_set,
+					path, table);
+}
+EXPORT_SYMBOL(register_sysctl);
+
+static char *append_path(const char *path, char *pos, const char *name)
+{
+	int namelen;
+	namelen = strlen(name);
+	if (((pos - path) + namelen + 2) >= PATH_MAX)
+		return NULL;
+	memcpy(pos, name, namelen);
+	pos[namelen] = '/';
+	pos[namelen + 1] = '\0';
+	pos += namelen + 1;
+	return pos;
+}
+
+static int count_subheaders(struct ctl_table *table)
+{
+	int has_files = 0;
+	int nr_subheaders = 0;
+	struct ctl_table *entry;
+
+	/* special case: no directory and empty directory */
+	if (!table || !table->procname)
+		return 1;
+
+	for (entry = table; entry->procname; entry++) {
+		if (entry->child)
+			nr_subheaders += count_subheaders(entry->child);
+		else
+			has_files = 1;
+	}
+	return nr_subheaders + has_files;
+}
+
+static int register_leaf_sysctl_tables(const char *path, char *pos,
+	struct ctl_table_header ***subheader, struct ctl_table_set *set,
+	struct ctl_table *table)
+{
+	struct ctl_table *ctl_table_arg = NULL;
+	struct ctl_table *entry, *files;
+	int nr_files = 0;
+	int nr_dirs = 0;
+	int err = -ENOMEM;
+
+	for (entry = table; entry->procname; entry++) {
+		if (entry->child)
+			nr_dirs++;
+		else
+			nr_files++;
+	}
+
+	files = table;
+	/* If there are mixed files and directories we need a new table */
+	if (nr_dirs && nr_files) {
+		struct ctl_table *new;
+		files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
+				GFP_KERNEL);
+		if (!files)
+			goto out;
+
+		ctl_table_arg = files;
+		for (new = files, entry = table; entry->procname; entry++) {
+			if (entry->child)
+				continue;
+			*new = *entry;
+			new++;
+		}
+	}
+
+	/* Register everything except a directory full of subdirectories */
+	if (nr_files || !nr_dirs) {
+		struct ctl_table_header *header;
+		header = __register_sysctl_table(set, path, files);
+		if (!header) {
+			kfree(ctl_table_arg);
+			goto out;
+		}
+
+		/* Remember if we need to free the file table */
+		header->ctl_table_arg = ctl_table_arg;
+		**subheader = header;
+		(*subheader)++;
+	}
+
+	/* Recurse into the subdirectories. */
+	for (entry = table; entry->procname; entry++) {
+		char *child_pos;
+
+		if (!entry->child)
+			continue;
+
+		err = -ENAMETOOLONG;
+		child_pos = append_path(path, pos, entry->procname);
+		if (!child_pos)
+			goto out;
+
+		err = register_leaf_sysctl_tables(path, child_pos, subheader,
+						  set, entry->child);
+		pos[0] = '\0';
+		if (err)
+			goto out;
+	}
+	err = 0;
+out:
+	/* On failure our caller will unregister all registered subheaders */
+	return err;
+}
+
+/**
+ * __register_sysctl_paths - register a sysctl table hierarchy
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *__register_sysctl_paths(
+	struct ctl_table_set *set,
+	const struct ctl_path *path, struct ctl_table *table)
+{
+	struct ctl_table *ctl_table_arg = table;
+	int nr_subheaders = count_subheaders(table);
+	struct ctl_table_header *header = NULL, **subheaders, **subheader;
+	const struct ctl_path *component;
+	char *new_path, *pos;
+
+	pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!new_path)
+		return NULL;
+
+	pos[0] = '\0';
+	for (component = path; component->procname; component++) {
+		pos = append_path(new_path, pos, component->procname);
+		if (!pos)
+			goto out;
+	}
+	while (table->procname && table->child && !table[1].procname) {
+		pos = append_path(new_path, pos, table->procname);
+		if (!pos)
+			goto out;
+		table = table->child;
+	}
+	if (nr_subheaders == 1) {
+		header = __register_sysctl_table(set, new_path, table);
+		if (header)
+			header->ctl_table_arg = ctl_table_arg;
+	} else {
+		header = kzalloc(sizeof(*header) +
+				 sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
+		if (!header)
+			goto out;
+
+		subheaders = (struct ctl_table_header **) (header + 1);
+		subheader = subheaders;
+		header->ctl_table_arg = ctl_table_arg;
+
+		if (register_leaf_sysctl_tables(new_path, pos, &subheader,
+						set, table))
+			goto err_register_leaves;
+	}
+
+out:
+	kfree(new_path);
+	return header;
+
+err_register_leaves:
+	while (subheader > subheaders) {
+		struct ctl_table_header *subh = *(--subheader);
+		struct ctl_table *table = subh->ctl_table_arg;
+		unregister_sysctl_table(subh);
+		kfree(table);
+	}
+	kfree(header);
+	header = NULL;
+	goto out;
+}
+
+/**
+ * register_sysctl_table_path - register a sysctl table hierarchy
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+						struct ctl_table *table)
+{
+	return __register_sysctl_paths(&sysctl_table_root.default_set,
+					path, table);
+}
+EXPORT_SYMBOL(register_sysctl_paths);
+
+/**
+ * register_sysctl_table - register a sysctl table hierarchy
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+{
+	static const struct ctl_path null_path[] = { {} };
+
+	return register_sysctl_paths(null_path, table);
+}
+EXPORT_SYMBOL(register_sysctl_table);
+
+static void put_links(struct ctl_table_header *header)
+{
+	struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+	struct ctl_table_root *root = header->root;
+	struct ctl_dir *parent = header->parent;
+	struct ctl_dir *core_parent;
+	struct ctl_table *entry;
+
+	if (header->set == root_set)
+		return;
+
+	core_parent = xlate_dir(root_set, parent);
+	if (IS_ERR(core_parent))
+		return;
+
+	for (entry = header->ctl_table; entry->procname; entry++) {
+		struct ctl_table_header *link_head;
+		struct ctl_table *link;
+		const char *name = entry->procname;
+
+		link = find_entry(&link_head, core_parent, name, strlen(name));
+		if (link &&
+		    ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
+		     (S_ISLNK(link->mode) && (link->data == root)))) {
+			drop_sysctl_table(link_head);
+		}
+		else {
+			pr_err("sysctl link missing during unregister: ");
+			sysctl_print_dir(parent);
+			pr_cont("/%s\n", name);
+		}
+	}
+}
+
+static void drop_sysctl_table(struct ctl_table_header *header)
+{
+	struct ctl_dir *parent = header->parent;
+
+	if (--header->nreg)
+		return;
+
+	put_links(header);
+	start_unregistering(header);
+	if (!--header->count)
+		kfree_rcu(header, rcu);
+
+	if (parent)
+		drop_sysctl_table(&parent->header);
+}
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+	int nr_subheaders;
+	might_sleep();
+
+	if (header == NULL)
+		return;
+
+	nr_subheaders = count_subheaders(header->ctl_table_arg);
+	if (unlikely(nr_subheaders > 1)) {
+		struct ctl_table_header **subheaders;
+		int i;
+
+		subheaders = (struct ctl_table_header **)(header + 1);
+		for (i = nr_subheaders -1; i >= 0; i--) {
+			struct ctl_table_header *subh = subheaders[i];
+			struct ctl_table *table = subh->ctl_table_arg;
+			unregister_sysctl_table(subh);
+			kfree(table);
+		}
+		kfree(header);
+		return;
+	}
+
+	spin_lock(&sysctl_lock);
+	drop_sysctl_table(header);
+	spin_unlock(&sysctl_lock);
+}
+EXPORT_SYMBOL(unregister_sysctl_table);
+
+void setup_sysctl_set(struct ctl_table_set *set,
+	struct ctl_table_root *root,
+	int (*is_seen)(struct ctl_table_set *))
+{
+	memset(set, 0, sizeof(*set));
+	set->is_seen = is_seen;
+	init_header(&set->dir.header, root, set, NULL, root_table);
+}
+
+void retire_sysctl_set(struct ctl_table_set *set)
+{
+	WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
+}
+
+int __init proc_sys_init(void)
+{
+	struct proc_dir_entry *proc_sys_root;
+
+	proc_sys_root = proc_mkdir("sys", NULL);
+	proc_sys_root->proc_iops = &proc_sys_dir_operations;
+	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
+	proc_sys_root->nlink = 0;
+
+	return sysctl_init();
+}
diff --git a/kernel/fs/proc/proc_tty.c b/kernel/fs/proc/proc_tty.c
new file mode 100644
index 000000000..15f327bed
--- /dev/null
+++ b/kernel/fs/proc/proc_tty.c
@@ -0,0 +1,189 @@
+/*
+ * proc_tty.c -- handles /proc/tty
+ *
+ * Copyright 1997, Theodore Ts'o
+ */
+
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/tty.h>
+#include <linux/seq_file.h>
+#include <linux/bitops.h>
+
+/*
+ * The /proc/tty directory inodes...
+ */
+static struct proc_dir_entry *proc_tty_driver;
+
+/*
+ * This is the handler for /proc/tty/drivers
+ */
+static void show_tty_range(struct seq_file *m, struct tty_driver *p,
+	dev_t from, int num)
+{
+	seq_printf(m, "%-20s ", p->driver_name ? p->driver_name : "unknown");
+	seq_printf(m, "/dev/%-8s ", p->name);
+	if (p->num > 1) {
+		seq_printf(m, "%3d %d-%d ", MAJOR(from), MINOR(from),
+			MINOR(from) + num - 1);
+	} else {
+		seq_printf(m, "%3d %7d ", MAJOR(from), MINOR(from));
+	}
+	switch (p->type) {
+	case TTY_DRIVER_TYPE_SYSTEM:
+		seq_puts(m, "system");
+		if (p->subtype == SYSTEM_TYPE_TTY)
+			seq_puts(m, ":/dev/tty");
+		else if (p->subtype == SYSTEM_TYPE_SYSCONS)
+			seq_puts(m, ":console");
+		else if (p->subtype == SYSTEM_TYPE_CONSOLE)
+			seq_puts(m, ":vtmaster");
+		break;
+	case TTY_DRIVER_TYPE_CONSOLE:
+		seq_puts(m, "console");
+		break;
+	case TTY_DRIVER_TYPE_SERIAL:
+		seq_puts(m, "serial");
+		break;
+	case TTY_DRIVER_TYPE_PTY:
+		if (p->subtype == PTY_TYPE_MASTER)
+			seq_puts(m, "pty:master");
+		else if (p->subtype == PTY_TYPE_SLAVE)
+			seq_puts(m, "pty:slave");
+		else
+			seq_puts(m, "pty");
+		break;
+	default:
+		seq_printf(m, "type:%d.%d", p->type, p->subtype);
+	}
+	seq_putc(m, '\n');
+}
+
+static int show_tty_driver(struct seq_file *m, void *v)
+{
+	struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers);
+	dev_t from = MKDEV(p->major, p->minor_start);
+	dev_t to = from + p->num;
+
+	if (&p->tty_drivers == tty_drivers.next) {
+		/* pseudo-drivers first */
+		seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
+		seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
+		seq_puts(m, "system:/dev/tty\n");
+		seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
+		seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
+		seq_puts(m, "system:console\n");
+#ifdef CONFIG_UNIX98_PTYS
+		seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
+		seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
+		seq_puts(m, "system\n");
+#endif
+#ifdef CONFIG_VT
+		seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
+		seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
+		seq_puts(m, "system:vtmaster\n");
+#endif
+	}
+
+	while (MAJOR(from) < MAJOR(to)) {
+		dev_t next = MKDEV(MAJOR(from)+1, 0);
+		show_tty_range(m, p, from, next - from);
+		from = next;
+	}
+	if (from != to)
+		show_tty_range(m, p, from, to - from);
+	return 0;
+}
+
+/* iterator */
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&tty_mutex);
+	return seq_list_start(&tty_drivers, *pos);
+}
+
+static void *t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &tty_drivers, pos);
+}
+
+static void t_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&tty_mutex);
+}
+
+static const struct seq_operations tty_drivers_op = {
+	.start	= t_start,
+	.next	= t_next,
+	.stop	= t_stop,
+	.show	= show_tty_driver
+};
+
+static int tty_drivers_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &tty_drivers_op);
+}
+
+static const struct file_operations proc_tty_drivers_operations = {
+	.open		= tty_drivers_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * This function is called by tty_register_driver() to handle
+ * registering the driver's /proc handler into /proc/tty/driver/<foo>
+ */
+void proc_tty_register_driver(struct tty_driver *driver)
+{
+	struct proc_dir_entry *ent;
+		
+	if (!driver->driver_name || driver->proc_entry ||
+	    !driver->ops->proc_fops)
+		return;
+
+	ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
+			       driver->ops->proc_fops, driver);
+	driver->proc_entry = ent;
+}
+
+/*
+ * This function is called by tty_unregister_driver()
+ */
+void proc_tty_unregister_driver(struct tty_driver *driver)
+{
+	struct proc_dir_entry *ent;
+
+	ent = driver->proc_entry;
+	if (!ent)
+		return;
+		
+	remove_proc_entry(driver->driver_name, proc_tty_driver);
+	
+	driver->proc_entry = NULL;
+}
+
+/*
+ * Called by proc_root_init() to initialize the /proc/tty subtree
+ */
+void __init proc_tty_init(void)
+{
+	if (!proc_mkdir("tty", NULL))
+		return;
+	proc_mkdir("tty/ldisc", NULL);	/* Preserved: it's userspace visible */
+	/*
+	 * /proc/tty/driver/serial reveals the exact character counts for
+	 * serial links which is just too easy to abuse for inferring
+	 * password lengths and inter-keystroke timings during password
+	 * entry.
+	 */
+	proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL);
+	proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops);
+	proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations);
+}
diff --git a/kernel/fs/proc/root.c b/kernel/fs/proc/root.c
new file mode 100644
index 000000000..68feb0f70
--- /dev/null
+++ b/kernel/fs/proc/root.c
@@ -0,0 +1,270 @@
+/*
+ *  linux/fs/proc/root.c
+ *
+ *  Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ *  proc root directory handling functions
+ */
+
+#include <asm/uaccess.h>
+
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/user_namespace.h>
+#include <linux/mount.h>
+#include <linux/pid_namespace.h>
+#include <linux/parser.h>
+
+#include "internal.h"
+
+static int proc_test_super(struct super_block *sb, void *data)
+{
+	return sb->s_fs_info == data;
+}
+
+static int proc_set_super(struct super_block *sb, void *data)
+{
+	int err = set_anon_super(sb, NULL);
+	if (!err) {
+		struct pid_namespace *ns = (struct pid_namespace *)data;
+		sb->s_fs_info = get_pid_ns(ns);
+	}
+	return err;
+}
+
+enum {
+	Opt_gid, Opt_hidepid, Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_hidepid, "hidepid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_err, NULL},
+};
+
+static int proc_parse_options(char *options, struct pid_namespace *pid)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		args[0].to = args[0].from = NULL;
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+			pid->pid_gid = make_kgid(current_user_ns(), option);
+			break;
+		case Opt_hidepid:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > 2) {
+				pr_err("proc: hidepid value must be between 0 and 2.\n");
+				return 0;
+			}
+			pid->hide_pid = option;
+			break;
+		default:
+			pr_err("proc: unrecognized mount option \"%s\" "
+			       "or missing value\n", p);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+int proc_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct pid_namespace *pid = sb->s_fs_info;
+
+	sync_filesystem(sb);
+	return !proc_parse_options(data, pid);
+}
+
+static struct dentry *proc_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	int err;
+	struct super_block *sb;
+	struct pid_namespace *ns;
+	char *options;
+
+	if (flags & MS_KERNMOUNT) {
+		ns = (struct pid_namespace *)data;
+		options = NULL;
+	} else {
+		ns = task_active_pid_ns(current);
+		options = data;
+
+		/* Does the mounter have privilege over the pid namespace? */
+		if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+			return ERR_PTR(-EPERM);
+	}
+
+	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
+
+	if (!proc_parse_options(options, ns)) {
+		deactivate_locked_super(sb);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!sb->s_root) {
+		err = proc_fill_super(sb);
+		if (err) {
+			deactivate_locked_super(sb);
+			return ERR_PTR(err);
+		}
+
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	return dget(sb->s_root);
+}
+
+static void proc_kill_sb(struct super_block *sb)
+{
+	struct pid_namespace *ns;
+
+	ns = (struct pid_namespace *)sb->s_fs_info;
+	if (ns->proc_self)
+		dput(ns->proc_self);
+	if (ns->proc_thread_self)
+		dput(ns->proc_thread_self);
+	kill_anon_super(sb);
+	put_pid_ns(ns);
+}
+
+static struct file_system_type proc_fs_type = {
+	.name		= "proc",
+	.mount		= proc_mount,
+	.kill_sb	= proc_kill_sb,
+	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+};
+
+void __init proc_root_init(void)
+{
+	int err;
+
+	proc_init_inodecache();
+	err = register_filesystem(&proc_fs_type);
+	if (err)
+		return;
+
+	proc_self_init();
+	proc_thread_self_init();
+	proc_symlink("mounts", NULL, "self/mounts");
+
+	proc_net_init();
+
+#ifdef CONFIG_SYSVIPC
+	proc_mkdir("sysvipc", NULL);
+#endif
+	proc_mkdir("fs", NULL);
+	proc_mkdir("driver", NULL);
+	proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
+#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
+	/* just give it a mountpoint */
+	proc_create_mount_point("openprom");
+#endif
+	proc_tty_init();
+	proc_mkdir("bus", NULL);
+	proc_sys_init();
+}
+
+static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
+)
+{
+	generic_fillattr(d_inode(dentry), stat);
+	stat->nlink = proc_root.nlink + nr_processes();
+	return 0;
+}
+
+static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
+{
+	if (!proc_pid_lookup(dir, dentry, flags))
+		return NULL;
+	
+	return proc_lookup(dir, dentry, flags);
+}
+
+static int proc_root_readdir(struct file *file, struct dir_context *ctx)
+{
+	if (ctx->pos < FIRST_PROCESS_ENTRY) {
+		int error = proc_readdir(file, ctx);
+		if (unlikely(error <= 0))
+			return error;
+		ctx->pos = FIRST_PROCESS_ENTRY;
+	}
+
+	return proc_pid_readdir(file, ctx);
+}
+
+/*
+ * The root /proc directory is special, as it has the
+ * <pid> directories. Thus we don't use the generic
+ * directory handling functions for that..
+ */
+static const struct file_operations proc_root_operations = {
+	.read		 = generic_read_dir,
+	.iterate	 = proc_root_readdir,
+	.llseek		= default_llseek,
+};
+
+/*
+ * proc root can do almost nothing..
+ */
+static const struct inode_operations proc_root_inode_operations = {
+	.lookup		= proc_root_lookup,
+	.getattr	= proc_root_getattr,
+};
+
+/*
+ * This is the root "inode" in the /proc tree..
+ */
+struct proc_dir_entry proc_root = {
+	.low_ino	= PROC_ROOT_INO, 
+	.namelen	= 5, 
+	.mode		= S_IFDIR | S_IRUGO | S_IXUGO, 
+	.nlink		= 2, 
+	.count		= ATOMIC_INIT(1),
+	.proc_iops	= &proc_root_inode_operations, 
+	.proc_fops	= &proc_root_operations,
+	.parent		= &proc_root,
+	.subdir		= RB_ROOT,
+	.name		= "/proc",
+};
+
+int pid_ns_prepare_proc(struct pid_namespace *ns)
+{
+	struct vfsmount *mnt;
+
+	mnt = kern_mount_data(&proc_fs_type, ns);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	ns->proc_mnt = mnt;
+	return 0;
+}
+
+void pid_ns_release_proc(struct pid_namespace *ns)
+{
+	kern_unmount(ns->proc_mnt);
+}
diff --git a/kernel/fs/proc/self.c b/kernel/fs/proc/self.c
new file mode 100644
index 000000000..6195b4a7c
--- /dev/null
+++ b/kernel/fs/proc/self.c
@@ -0,0 +1,84 @@
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+/*
+ * /proc/self:
+ */
+static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
+			      int buflen)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char tmp[PROC_NUMBUF];
+	if (!tgid)
+		return -ENOENT;
+	sprintf(tmp, "%d", tgid);
+	return readlink_copy(buffer, buflen, tmp);
+}
+
+static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char *name = ERR_PTR(-ENOENT);
+	if (tgid) {
+		/* 11 for max length of signed int in decimal + NULL term */
+		name = kmalloc(12, GFP_KERNEL);
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d", tgid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static const struct inode_operations proc_self_inode_operations = {
+	.readlink	= proc_self_readlink,
+	.follow_link	= proc_self_follow_link,
+	.put_link	= kfree_put_link,
+};
+
+static unsigned self_inum;
+
+int proc_setup_self(struct super_block *s)
+{
+	struct inode *root_inode = d_inode(s->s_root);
+	struct pid_namespace *ns = s->s_fs_info;
+	struct dentry *self;
+	
+	mutex_lock(&root_inode->i_mutex);
+	self = d_alloc_name(s->s_root, "self");
+	if (self) {
+		struct inode *inode = new_inode_pseudo(s);
+		if (inode) {
+			inode->i_ino = self_inum;
+			inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+			inode->i_mode = S_IFLNK | S_IRWXUGO;
+			inode->i_uid = GLOBAL_ROOT_UID;
+			inode->i_gid = GLOBAL_ROOT_GID;
+			inode->i_op = &proc_self_inode_operations;
+			d_add(self, inode);
+		} else {
+			dput(self);
+			self = ERR_PTR(-ENOMEM);
+		}
+	} else {
+		self = ERR_PTR(-ENOMEM);
+	}
+	mutex_unlock(&root_inode->i_mutex);
+	if (IS_ERR(self)) {
+		pr_err("proc_fill_super: can't allocate /proc/self\n");
+		return PTR_ERR(self);
+	}
+	ns->proc_self = self;
+	return 0;
+}
+
+void __init proc_self_init(void)
+{
+	proc_alloc_inum(&self_inum);
+}
diff --git a/kernel/fs/proc/softirqs.c b/kernel/fs/proc/softirqs.c
new file mode 100644
index 000000000..ad8a77f94
--- /dev/null
+++ b/kernel/fs/proc/softirqs.c
@@ -0,0 +1,44 @@
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/softirqs  ... display the number of softirqs
+ */
+static int show_softirqs(struct seq_file *p, void *v)
+{
+	int i, j;
+
+	seq_puts(p, "                    ");
+	for_each_possible_cpu(i)
+		seq_printf(p, "CPU%-8d", i);
+	seq_putc(p, '\n');
+
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+		seq_printf(p, "%12s:", softirq_to_name[i]);
+		for_each_possible_cpu(j)
+			seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
+		seq_putc(p, '\n');
+	}
+	return 0;
+}
+
+static int softirqs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_softirqs, NULL);
+}
+
+static const struct file_operations proc_softirqs_operations = {
+	.open		= softirqs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_softirqs_init(void)
+{
+	proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
+	return 0;
+}
+fs_initcall(proc_softirqs_init);
diff --git a/kernel/fs/proc/stat.c b/kernel/fs/proc/stat.c
new file mode 100644
index 000000000..510413eb2
--- /dev/null
+++ b/kernel/fs/proc/stat.c
@@ -0,0 +1,206 @@
+#include <linux/cpumask.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/irqnr.h>
+#include <linux/cputime.h>
+#include <linux/tick.h>
+
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
+#ifdef arch_idle_time
+
+static cputime64_t get_idle_time(int cpu)
+{
+	cputime64_t idle;
+
+	idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+	if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
+		idle += arch_idle_time(cpu);
+	return idle;
+}
+
+static cputime64_t get_iowait_time(int cpu)
+{
+	cputime64_t iowait;
+
+	iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+	if (cpu_online(cpu) && nr_iowait_cpu(cpu))
+		iowait += arch_idle_time(cpu);
+	return iowait;
+}
+
+#else
+
+static u64 get_idle_time(int cpu)
+{
+	u64 idle, idle_time = -1ULL;
+
+	if (cpu_online(cpu))
+		idle_time = get_cpu_idle_time_us(cpu, NULL);
+
+	if (idle_time == -1ULL)
+		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
+		idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+	else
+		idle = usecs_to_cputime64(idle_time);
+
+	return idle;
+}
+
+static u64 get_iowait_time(int cpu)
+{
+	u64 iowait, iowait_time = -1ULL;
+
+	if (cpu_online(cpu))
+		iowait_time = get_cpu_iowait_time_us(cpu, NULL);
+
+	if (iowait_time == -1ULL)
+		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
+		iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+	else
+		iowait = usecs_to_cputime64(iowait_time);
+
+	return iowait;
+}
+
+#endif
+
+static int show_stat(struct seq_file *p, void *v)
+{
+	int i, j;
+	unsigned long jif;
+	u64 user, nice, system, idle, iowait, irq, softirq, steal;
+	u64 guest, guest_nice;
+	u64 sum = 0;
+	u64 sum_softirq = 0;
+	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
+	struct timespec boottime;
+
+	user = nice = system = idle = iowait =
+		irq = softirq = steal = 0;
+	guest = guest_nice = 0;
+	getboottime(&boottime);
+	jif = boottime.tv_sec;
+
+	for_each_possible_cpu(i) {
+		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
+		nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+		system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+		idle += get_idle_time(i);
+		iowait += get_iowait_time(i);
+		irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+		softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		sum += kstat_cpu_irqs_sum(i);
+		sum += arch_irq_stat_cpu(i);
+
+		for (j = 0; j < NR_SOFTIRQS; j++) {
+			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
+
+			per_softirq_sums[j] += softirq_stat;
+			sum_softirq += softirq_stat;
+		}
+	}
+	sum += arch_irq_stat();
+
+	seq_puts(p, "cpu ");
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
+	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+	seq_putc(p, '\n');
+
+	for_each_online_cpu(i) {
+		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
+		user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
+		nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+		system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+		idle = get_idle_time(i);
+		iowait = get_iowait_time(i);
+		irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+		softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+		steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+		guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		seq_printf(p, "cpu%d", i);
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
+		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+		seq_putc(p, '\n');
+	}
+	seq_printf(p, "intr %llu", (unsigned long long)sum);
+
+	/* sum again ? it could be updated? */
+	for_each_irq_nr(j)
+		seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j));
+
+	seq_printf(p,
+		"\nctxt %llu\n"
+		"btime %lu\n"
+		"processes %lu\n"
+		"procs_running %lu\n"
+		"procs_blocked %lu\n",
+		nr_context_switches(),
+		(unsigned long)jif,
+		total_forks,
+		nr_running(),
+		nr_iowait());
+
+	seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
+
+	for (i = 0; i < NR_SOFTIRQS; i++)
+		seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
+	seq_putc(p, '\n');
+
+	return 0;
+}
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+	size_t size = 1024 + 128 * num_online_cpus();
+
+	/* minimum size to display an interrupt count : 2 bytes */
+	size += 2 * nr_irqs;
+	return single_open_size(file, show_stat, NULL, size);
+}
+
+static const struct file_operations proc_stat_operations = {
+	.open		= stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_stat_init(void)
+{
+	proc_create("stat", 0, NULL, &proc_stat_operations);
+	return 0;
+}
+fs_initcall(proc_stat_init);
diff --git a/kernel/fs/proc/task_mmu.c b/kernel/fs/proc/task_mmu.c
new file mode 100644
index 000000000..6dee68d01
--- /dev/null
+++ b/kernel/fs/proc/task_mmu.c
@@ -0,0 +1,1625 @@
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+#include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/elf.h>
+#include <asm/uaccess.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+void task_mem(struct seq_file *m, struct mm_struct *mm)
+{
+	unsigned long data, text, lib, swap, ptes, pmds;
+	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+
+	/*
+	 * Note: to minimize their overhead, mm maintains hiwater_vm and
+	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
+	 * collector of these hiwater stats must therefore get total_vm
+	 * and rss too, which will usually be the higher.  Barriers? not
+	 * worth the effort, such snapshots can always be inconsistent.
+	 */
+	hiwater_vm = total_vm = mm->total_vm;
+	if (hiwater_vm < mm->hiwater_vm)
+		hiwater_vm = mm->hiwater_vm;
+	hiwater_rss = total_rss = get_mm_rss(mm);
+	if (hiwater_rss < mm->hiwater_rss)
+		hiwater_rss = mm->hiwater_rss;
+
+	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
+	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
+	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+	swap = get_mm_counter(mm, MM_SWAPENTS);
+	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
+	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
+	seq_printf(m,
+		"VmPeak:\t%8lu kB\n"
+		"VmSize:\t%8lu kB\n"
+		"VmLck:\t%8lu kB\n"
+		"VmPin:\t%8lu kB\n"
+		"VmHWM:\t%8lu kB\n"
+		"VmRSS:\t%8lu kB\n"
+		"VmData:\t%8lu kB\n"
+		"VmStk:\t%8lu kB\n"
+		"VmExe:\t%8lu kB\n"
+		"VmLib:\t%8lu kB\n"
+		"VmPTE:\t%8lu kB\n"
+		"VmPMD:\t%8lu kB\n"
+		"VmSwap:\t%8lu kB\n",
+		hiwater_vm << (PAGE_SHIFT-10),
+		total_vm << (PAGE_SHIFT-10),
+		mm->locked_vm << (PAGE_SHIFT-10),
+		mm->pinned_vm << (PAGE_SHIFT-10),
+		hiwater_rss << (PAGE_SHIFT-10),
+		total_rss << (PAGE_SHIFT-10),
+		data << (PAGE_SHIFT-10),
+		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
+		ptes >> 10,
+		pmds >> 10,
+		swap << (PAGE_SHIFT-10));
+}
+
+unsigned long task_vsize(struct mm_struct *mm)
+{
+	return PAGE_SIZE * mm->total_vm;
+}
+
+unsigned long task_statm(struct mm_struct *mm,
+			 unsigned long *shared, unsigned long *text,
+			 unsigned long *data, unsigned long *resident)
+{
+	*shared = get_mm_counter(mm, MM_FILEPAGES);
+	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
+								>> PAGE_SHIFT;
+	*data = mm->total_vm - mm->shared_vm;
+	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+	return mm->total_vm;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Save get_task_policy() for show_numa_map().
+ */
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+	struct task_struct *task = priv->task;
+
+	task_lock(task);
+	priv->task_mempolicy = get_task_policy(task);
+	mpol_get(priv->task_mempolicy);
+	task_unlock(task);
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+	mpol_put(priv->task_mempolicy);
+}
+#else
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+#endif
+
+static void vma_stop(struct proc_maps_private *priv)
+{
+	struct mm_struct *mm = priv->mm;
+
+	release_task_mempolicy(priv);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+}
+
+static struct vm_area_struct *
+m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
+{
+	if (vma == priv->tail_vma)
+		return NULL;
+	return vma->vm_next ?: priv->tail_vma;
+}
+
+static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
+{
+	if (m->count < m->size)	/* vma is copied successfully */
+		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
+{
+	struct proc_maps_private *priv = m->private;
+	unsigned long last_addr = m->version;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	unsigned int pos = *ppos;
+
+	/* See m_cache_vma(). Zero at the start or after lseek. */
+	if (last_addr == -1UL)
+		return NULL;
+
+	priv->task = get_proc_task(priv->inode);
+	if (!priv->task)
+		return ERR_PTR(-ESRCH);
+
+	mm = priv->mm;
+	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+		return NULL;
+
+	down_read(&mm->mmap_sem);
+	hold_task_mempolicy(priv);
+	priv->tail_vma = get_gate_vma(mm);
+
+	if (last_addr) {
+		vma = find_vma(mm, last_addr);
+		if (vma && (vma = m_next_vma(priv, vma)))
+			return vma;
+	}
+
+	m->version = 0;
+	if (pos < mm->map_count) {
+		for (vma = mm->mmap; pos; pos--) {
+			m->version = vma->vm_start;
+			vma = vma->vm_next;
+		}
+		return vma;
+	}
+
+	/* we do not bother to update m->version in this case */
+	if (pos == mm->map_count && priv->tail_vma)
+		return priv->tail_vma;
+
+	vma_stop(priv);
+	return NULL;
+}
+
+static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *next;
+
+	(*pos)++;
+	next = m_next_vma(priv, v);
+	if (!next)
+		vma_stop(priv);
+	return next;
+}
+
+static void m_stop(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+
+	if (!IS_ERR_OR_NULL(v))
+		vma_stop(priv);
+	if (priv->task) {
+		put_task_struct(priv->task);
+		priv->task = NULL;
+	}
+}
+
+static int proc_maps_open(struct inode *inode, struct file *file,
+			const struct seq_operations *ops, int psize)
+{
+	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
+
+	if (!priv)
+		return -ENOMEM;
+
+	priv->inode = inode;
+	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR(priv->mm)) {
+		int err = PTR_ERR(priv->mm);
+
+		seq_release_private(inode, file);
+		return err;
+	}
+
+	return 0;
+}
+
+static int proc_map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct proc_maps_private *priv = seq->private;
+
+	if (priv->mm)
+		mmdrop(priv->mm);
+
+	return seq_release_private(inode, file);
+}
+
+static int do_maps_open(struct inode *inode, struct file *file,
+			const struct seq_operations *ops)
+{
+	return proc_maps_open(inode, file, ops,
+				sizeof(struct proc_maps_private));
+}
+
+static pid_t pid_of_stack(struct proc_maps_private *priv,
+				struct vm_area_struct *vma, bool is_pid)
+{
+	struct inode *inode = priv->inode;
+	struct task_struct *task;
+	pid_t ret = 0;
+
+	rcu_read_lock();
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		task = task_of_stack(task, vma, is_pid);
+		if (task)
+			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static void
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct file *file = vma->vm_file;
+	struct proc_maps_private *priv = m->private;
+	vm_flags_t flags = vma->vm_flags;
+	unsigned long ino = 0;
+	unsigned long long pgoff = 0;
+	unsigned long start, end;
+	dev_t dev = 0;
+	const char *name = NULL;
+
+	if (file) {
+		struct inode *inode = file_inode(vma->vm_file);
+		dev = inode->i_sb->s_dev;
+		ino = inode->i_ino;
+		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
+	}
+
+	/* We don't show the stack guard page in /proc/maps */
+	start = vma->vm_start;
+	if (stack_guard_page_start(vma, start))
+		start += PAGE_SIZE;
+	end = vma->vm_end;
+	if (stack_guard_page_end(vma, end))
+		end -= PAGE_SIZE;
+
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+			start,
+			end,
+			flags & VM_READ ? 'r' : '-',
+			flags & VM_WRITE ? 'w' : '-',
+			flags & VM_EXEC ? 'x' : '-',
+			flags & VM_MAYSHARE ? 's' : 'p',
+			pgoff,
+			MAJOR(dev), MINOR(dev), ino);
+
+	/*
+	 * Print the dentry name for named mappings, and a
+	 * special [heap] marker for the heap:
+	 */
+	if (file) {
+		seq_pad(m, ' ');
+		seq_path(m, &file->f_path, "\n");
+		goto done;
+	}
+
+	if (vma->vm_ops && vma->vm_ops->name) {
+		name = vma->vm_ops->name(vma);
+		if (name)
+			goto done;
+	}
+
+	name = arch_vma_name(vma);
+	if (!name) {
+		pid_t tid;
+
+		if (!mm) {
+			name = "[vdso]";
+			goto done;
+		}
+
+		if (vma->vm_start <= mm->brk &&
+		    vma->vm_end >= mm->start_brk) {
+			name = "[heap]";
+			goto done;
+		}
+
+		tid = pid_of_stack(priv, vma, is_pid);
+		if (tid != 0) {
+			/*
+			 * Thread stack in /proc/PID/task/TID/maps or
+			 * the main process stack.
+			 */
+			if (!is_pid || (vma->vm_start <= mm->start_stack &&
+			    vma->vm_end >= mm->start_stack)) {
+				name = "[stack]";
+			} else {
+				/* Thread stack in /proc/PID/maps */
+				seq_pad(m, ' ');
+				seq_printf(m, "[stack:%d]", tid);
+			}
+		}
+	}
+
+done:
+	if (name) {
+		seq_pad(m, ' ');
+		seq_puts(m, name);
+	}
+	seq_putc(m, '\n');
+}
+
+static int show_map(struct seq_file *m, void *v, int is_pid)
+{
+	show_map_vma(m, v, is_pid);
+	m_cache_vma(m, v);
+	return 0;
+}
+
+static int show_pid_map(struct seq_file *m, void *v)
+{
+	return show_map(m, v, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *v)
+{
+	return show_map(m, v, 0);
+}
+
+static const struct seq_operations proc_pid_maps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_pid_map
+};
+
+static const struct seq_operations proc_tid_maps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_tid_map
+};
+
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_tid_maps_op);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+	.open		= pid_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_map_release,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+	.open		= tid_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_map_release,
+};
+
+/*
+ * Proportional Set Size(PSS): my share of RSS.
+ *
+ * PSS of a process is the count of pages it has in memory, where each
+ * page is divided by the number of processes sharing it.  So if a
+ * process has 1000 pages all to itself, and 1000 shared with one other
+ * process, its PSS will be 1500.
+ *
+ * To keep (accumulated) division errors low, we adopt a 64bit
+ * fixed-point pss counter to minimize division errors. So (pss >>
+ * PSS_SHIFT) would be the real byte count.
+ *
+ * A shift of 12 before division means (assuming 4K page size):
+ * 	- 1M 3-user-pages add up to 8KB errors;
+ * 	- supports mapcount up to 2^24, or 16M;
+ * 	- supports PSS up to 2^52 bytes, or 4PB.
+ */
+#define PSS_SHIFT 12
+
+#ifdef CONFIG_PROC_PAGE_MONITOR
+struct mem_size_stats {
+	unsigned long resident;
+	unsigned long shared_clean;
+	unsigned long shared_dirty;
+	unsigned long private_clean;
+	unsigned long private_dirty;
+	unsigned long referenced;
+	unsigned long anonymous;
+	unsigned long anonymous_thp;
+	unsigned long swap;
+	u64 pss;
+};
+
+static void smaps_account(struct mem_size_stats *mss, struct page *page,
+		unsigned long size, bool young, bool dirty)
+{
+	int mapcount;
+
+	if (PageAnon(page))
+		mss->anonymous += size;
+
+	mss->resident += size;
+	/* Accumulate the size in pages that have been accessed. */
+	if (young || PageReferenced(page))
+		mss->referenced += size;
+	mapcount = page_mapcount(page);
+	if (mapcount >= 2) {
+		u64 pss_delta;
+
+		if (dirty || PageDirty(page))
+			mss->shared_dirty += size;
+		else
+			mss->shared_clean += size;
+		pss_delta = (u64)size << PSS_SHIFT;
+		do_div(pss_delta, mapcount);
+		mss->pss += pss_delta;
+	} else {
+		if (dirty || PageDirty(page))
+			mss->private_dirty += size;
+		else
+			mss->private_clean += size;
+		mss->pss += (u64)size << PSS_SHIFT;
+	}
+}
+
+static void smaps_pte_entry(pte_t *pte, unsigned long addr,
+		struct mm_walk *walk)
+{
+	struct mem_size_stats *mss = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	struct page *page = NULL;
+
+	if (pte_present(*pte)) {
+		page = vm_normal_page(vma, addr, *pte);
+	} else if (is_swap_pte(*pte)) {
+		swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+		if (!non_swap_entry(swpent))
+			mss->swap += PAGE_SIZE;
+		else if (is_migration_entry(swpent))
+			page = migration_entry_to_page(swpent);
+	}
+
+	if (!page)
+		return;
+	smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
+		struct mm_walk *walk)
+{
+	struct mem_size_stats *mss = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	struct page *page;
+
+	/* FOLL_DUMP will return -EFAULT on huge zero page */
+	page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+	if (IS_ERR_OR_NULL(page))
+		return;
+	mss->anonymous_thp += HPAGE_PMD_SIZE;
+	smaps_account(mss, page, HPAGE_PMD_SIZE,
+			pmd_young(*pmd), pmd_dirty(*pmd));
+}
+#else
+static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
+		struct mm_walk *walk)
+{
+}
+#endif
+
+static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			   struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		smaps_pmd_entry(pmd, addr, walk);
+		spin_unlock(ptl);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+	/*
+	 * The mmap_sem held all the way back in m_start() is what
+	 * keeps khugepaged out of here and from collapsing things
+	 * in here.
+	 */
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE)
+		smaps_pte_entry(pte, addr, walk);
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+	return 0;
+}
+
+static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
+{
+	/*
+	 * Don't forget to update Documentation/ on changes.
+	 */
+	static const char mnemonics[BITS_PER_LONG][2] = {
+		/*
+		 * In case if we meet a flag we don't know about.
+		 */
+		[0 ... (BITS_PER_LONG-1)] = "??",
+
+		[ilog2(VM_READ)]	= "rd",
+		[ilog2(VM_WRITE)]	= "wr",
+		[ilog2(VM_EXEC)]	= "ex",
+		[ilog2(VM_SHARED)]	= "sh",
+		[ilog2(VM_MAYREAD)]	= "mr",
+		[ilog2(VM_MAYWRITE)]	= "mw",
+		[ilog2(VM_MAYEXEC)]	= "me",
+		[ilog2(VM_MAYSHARE)]	= "ms",
+		[ilog2(VM_GROWSDOWN)]	= "gd",
+		[ilog2(VM_PFNMAP)]	= "pf",
+		[ilog2(VM_DENYWRITE)]	= "dw",
+#ifdef CONFIG_X86_INTEL_MPX
+		[ilog2(VM_MPX)]		= "mp",
+#endif
+		[ilog2(VM_LOCKED)]	= "lo",
+		[ilog2(VM_IO)]		= "io",
+		[ilog2(VM_SEQ_READ)]	= "sr",
+		[ilog2(VM_RAND_READ)]	= "rr",
+		[ilog2(VM_DONTCOPY)]	= "dc",
+		[ilog2(VM_DONTEXPAND)]	= "de",
+		[ilog2(VM_ACCOUNT)]	= "ac",
+		[ilog2(VM_NORESERVE)]	= "nr",
+		[ilog2(VM_HUGETLB)]	= "ht",
+		[ilog2(VM_ARCH_1)]	= "ar",
+		[ilog2(VM_DONTDUMP)]	= "dd",
+#ifdef CONFIG_MEM_SOFT_DIRTY
+		[ilog2(VM_SOFTDIRTY)]	= "sd",
+#endif
+		[ilog2(VM_MIXEDMAP)]	= "mm",
+		[ilog2(VM_HUGEPAGE)]	= "hg",
+		[ilog2(VM_NOHUGEPAGE)]	= "nh",
+		[ilog2(VM_MERGEABLE)]	= "mg",
+	};
+	size_t i;
+
+	seq_puts(m, "VmFlags: ");
+	for (i = 0; i < BITS_PER_LONG; i++) {
+		if (vma->vm_flags & (1UL << i)) {
+			seq_printf(m, "%c%c ",
+				   mnemonics[i][0], mnemonics[i][1]);
+		}
+	}
+	seq_putc(m, '\n');
+}
+
+static int show_smap(struct seq_file *m, void *v, int is_pid)
+{
+	struct vm_area_struct *vma = v;
+	struct mem_size_stats mss;
+	struct mm_walk smaps_walk = {
+		.pmd_entry = smaps_pte_range,
+		.mm = vma->vm_mm,
+		.private = &mss,
+	};
+
+	memset(&mss, 0, sizeof mss);
+	/* mmap_sem is held in m_start */
+	walk_page_vma(vma, &smaps_walk);
+
+	show_map_vma(m, vma, is_pid);
+
+	seq_printf(m,
+		   "Size:           %8lu kB\n"
+		   "Rss:            %8lu kB\n"
+		   "Pss:            %8lu kB\n"
+		   "Shared_Clean:   %8lu kB\n"
+		   "Shared_Dirty:   %8lu kB\n"
+		   "Private_Clean:  %8lu kB\n"
+		   "Private_Dirty:  %8lu kB\n"
+		   "Referenced:     %8lu kB\n"
+		   "Anonymous:      %8lu kB\n"
+		   "AnonHugePages:  %8lu kB\n"
+		   "Swap:           %8lu kB\n"
+		   "KernelPageSize: %8lu kB\n"
+		   "MMUPageSize:    %8lu kB\n"
+		   "Locked:         %8lu kB\n",
+		   (vma->vm_end - vma->vm_start) >> 10,
+		   mss.resident >> 10,
+		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
+		   mss.shared_clean  >> 10,
+		   mss.shared_dirty  >> 10,
+		   mss.private_clean >> 10,
+		   mss.private_dirty >> 10,
+		   mss.referenced >> 10,
+		   mss.anonymous >> 10,
+		   mss.anonymous_thp >> 10,
+		   mss.swap >> 10,
+		   vma_kernel_pagesize(vma) >> 10,
+		   vma_mmu_pagesize(vma) >> 10,
+		   (vma->vm_flags & VM_LOCKED) ?
+			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
+
+	show_smap_vma_flags(m, vma);
+	m_cache_vma(m, vma);
+	return 0;
+}
+
+static int show_pid_smap(struct seq_file *m, void *v)
+{
+	return show_smap(m, v, 1);
+}
+
+static int show_tid_smap(struct seq_file *m, void *v)
+{
+	return show_smap(m, v, 0);
+}
+
+static const struct seq_operations proc_pid_smaps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_pid_smap
+};
+
+static const struct seq_operations proc_tid_smaps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_tid_smap
+};
+
+static int pid_smaps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_smaps_op);
+}
+
+static int tid_smaps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_tid_smaps_op);
+}
+
+const struct file_operations proc_pid_smaps_operations = {
+	.open		= pid_smaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_map_release,
+};
+
+const struct file_operations proc_tid_smaps_operations = {
+	.open		= tid_smaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_map_release,
+};
+
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots these bits remain what they are,
+ *    but a warning about future change is printed in log;
+ * 2. once anyone clears soft-dirty bits via clear_refs file,
+ *    these flag is set to denote, that user is aware of the
+ *    new API and those page-shift bits change their meaning.
+ *    The respective warning is printed in dmesg;
+ * 3. In a couple of releases we will remove all the mentions
+ *    of page-shift in pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
+enum clear_refs_types {
+	CLEAR_REFS_ALL = 1,
+	CLEAR_REFS_ANON,
+	CLEAR_REFS_MAPPED,
+	CLEAR_REFS_SOFT_DIRTY,
+	CLEAR_REFS_MM_HIWATER_RSS,
+	CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+	enum clear_refs_types type;
+};
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *pte)
+{
+	/*
+	 * The soft-dirty tracker uses #PF-s to catch writes
+	 * to pages, so write-protect the pte as well. See the
+	 * Documentation/vm/soft-dirty.txt for full description
+	 * of how soft-dirty works.
+	 */
+	pte_t ptent = *pte;
+
+	if (pte_present(ptent)) {
+		ptent = pte_wrprotect(ptent);
+		ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+	} else if (is_swap_pte(ptent)) {
+		ptent = pte_swp_clear_soft_dirty(ptent);
+	}
+
+	set_pte_at(vma->vm_mm, addr, pte, ptent);
+}
+
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd = *pmdp;
+
+	pmd = pmd_wrprotect(pmd);
+	pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+
+	if (vma->vm_flags & VM_SOFTDIRTY)
+		vma->vm_flags &= ~VM_SOFTDIRTY;
+
+	set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+}
+
+#else
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *pte)
+{
+}
+
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t *pmdp)
+{
+}
+#endif
+
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct clear_refs_private *cp = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+			clear_soft_dirty_pmd(vma, addr, pmd);
+			goto out;
+		}
+
+		page = pmd_page(*pmd);
+
+		/* Clear accessed and referenced bits. */
+		pmdp_test_and_clear_young(vma, addr, pmd);
+		ClearPageReferenced(page);
+out:
+		spin_unlock(ptl);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+
+		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+			clear_soft_dirty(vma, addr, pte);
+			continue;
+		}
+
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/* Clear accessed and referenced bits. */
+		ptep_test_and_clear_young(vma, addr, pte);
+		ClearPageReferenced(page);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+	return 0;
+}
+
+static int clear_refs_test_walk(unsigned long start, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct clear_refs_private *cp = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	/*
+	 * Writing 1 to /proc/pid/clear_refs affects all pages.
+	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
+	 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
+	 * Writing 4 to /proc/pid/clear_refs affects all pages.
+	 */
+	if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
+		return 1;
+	if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
+		return 1;
+	return 0;
+}
+
+static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	enum clear_refs_types type;
+	int itype;
+	int rv;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	rv = kstrtoint(strstrip(buffer), 10, &itype);
+	if (rv < 0)
+		return rv;
+	type = (enum clear_refs_types)itype;
+	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
+		return -EINVAL;
+
+	if (type == CLEAR_REFS_SOFT_DIRTY) {
+		soft_dirty_cleared = true;
+		pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
+			     " See the linux/Documentation/vm/pagemap.txt for "
+			     "details.\n");
+	}
+
+	task = get_proc_task(file_inode(file));
+	if (!task)
+		return -ESRCH;
+	mm = get_task_mm(task);
+	if (mm) {
+		struct clear_refs_private cp = {
+			.type = type,
+		};
+		struct mm_walk clear_refs_walk = {
+			.pmd_entry = clear_refs_pte_range,
+			.test_walk = clear_refs_test_walk,
+			.mm = mm,
+			.private = &cp,
+		};
+
+		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+			/*
+			 * Writing 5 to /proc/pid/clear_refs resets the peak
+			 * resident set size to this mm's current rss value.
+			 */
+			down_write(&mm->mmap_sem);
+			reset_mm_hiwater_rss(mm);
+			up_write(&mm->mmap_sem);
+			goto out_mm;
+		}
+
+		down_read(&mm->mmap_sem);
+		if (type == CLEAR_REFS_SOFT_DIRTY) {
+			for (vma = mm->mmap; vma; vma = vma->vm_next) {
+				if (!(vma->vm_flags & VM_SOFTDIRTY))
+					continue;
+				up_read(&mm->mmap_sem);
+				down_write(&mm->mmap_sem);
+				for (vma = mm->mmap; vma; vma = vma->vm_next) {
+					vma->vm_flags &= ~VM_SOFTDIRTY;
+					vma_set_page_prot(vma);
+				}
+				downgrade_write(&mm->mmap_sem);
+				break;
+			}
+			mmu_notifier_invalidate_range_start(mm, 0, -1);
+		}
+		walk_page_range(0, ~0UL, &clear_refs_walk);
+		if (type == CLEAR_REFS_SOFT_DIRTY)
+			mmu_notifier_invalidate_range_end(mm, 0, -1);
+		flush_tlb_mm(mm);
+		up_read(&mm->mmap_sem);
+out_mm:
+		mmput(mm);
+	}
+	put_task_struct(task);
+
+	return count;
+}
+
+const struct file_operations proc_clear_refs_operations = {
+	.write		= clear_refs_write,
+	.llseek		= noop_llseek,
+};
+
+typedef struct {
+	u64 pme;
+} pagemap_entry_t;
+
+struct pagemapread {
+	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
+	pagemap_entry_t *buffer;
+	bool v2;
+};
+
+#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
+#define PAGEMAP_WALK_MASK	(PMD_MASK)
+
+#define PM_ENTRY_BYTES      sizeof(pagemap_entry_t)
+#define PM_STATUS_BITS      3
+#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
+#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
+#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
+#define PM_PSHIFT_BITS      6
+#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
+#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
+#define __PM_PSHIFT(x)      (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
+#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+/* in "new" pagemap pshift bits are occupied with more status bits */
+#define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
+
+#define __PM_SOFT_DIRTY      (1LL)
+#define PM_PRESENT          PM_STATUS(4LL)
+#define PM_SWAP             PM_STATUS(2LL)
+#define PM_FILE             PM_STATUS(1LL)
+#define PM_NOT_PRESENT(v2)  PM_STATUS2(v2, 0)
+#define PM_END_OF_BUFFER    1
+
+static inline pagemap_entry_t make_pme(u64 val)
+{
+	return (pagemap_entry_t) { .pme = val };
+}
+
+static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
+			  struct pagemapread *pm)
+{
+	pm->buffer[pm->pos++] = *pme;
+	if (pm->pos >= pm->len)
+		return PM_END_OF_BUFFER;
+	return 0;
+}
+
+static int pagemap_pte_hole(unsigned long start, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct pagemapread *pm = walk->private;
+	unsigned long addr = start;
+	int err = 0;
+
+	while (addr < end) {
+		struct vm_area_struct *vma = find_vma(walk->mm, addr);
+		pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		/* End of address space hole, which we mark as non-present. */
+		unsigned long hole_end;
+
+		if (vma)
+			hole_end = min(end, vma->vm_start);
+		else
+			hole_end = end;
+
+		for (; addr < hole_end; addr += PAGE_SIZE) {
+			err = add_to_pagemap(addr, &pme, pm);
+			if (err)
+				goto out;
+		}
+
+		if (!vma)
+			break;
+
+		/* Addresses in the VMA. */
+		if (vma->vm_flags & VM_SOFTDIRTY)
+			pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
+			err = add_to_pagemap(addr, &pme, pm);
+			if (err)
+				goto out;
+		}
+	}
+out:
+	return err;
+}
+
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+	u64 frame, flags;
+	struct page *page = NULL;
+	int flags2 = 0;
+
+	if (pte_present(pte)) {
+		frame = pte_pfn(pte);
+		flags = PM_PRESENT;
+		page = vm_normal_page(vma, addr, pte);
+		if (pte_soft_dirty(pte))
+			flags2 |= __PM_SOFT_DIRTY;
+	} else if (is_swap_pte(pte)) {
+		swp_entry_t entry;
+		if (pte_swp_soft_dirty(pte))
+			flags2 |= __PM_SOFT_DIRTY;
+		entry = pte_to_swp_entry(pte);
+		frame = swp_type(entry) |
+			(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+		flags = PM_SWAP;
+		if (is_migration_entry(entry))
+			page = migration_entry_to_page(entry);
+	} else {
+		if (vma->vm_flags & VM_SOFTDIRTY)
+			flags2 |= __PM_SOFT_DIRTY;
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
+		return;
+	}
+
+	if (page && !PageAnon(page))
+		flags |= PM_FILE;
+	if ((vma->vm_flags & VM_SOFTDIRTY))
+		flags2 |= __PM_SOFT_DIRTY;
+
+	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+		pmd_t pmd, int offset, int pmd_flags2)
+{
+	/*
+	 * Currently pmd for thp is always present because thp can not be
+	 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
+	 * This if-check is just to prepare for future implementation.
+	 */
+	if (pmd_present(pmd))
+		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
+				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
+	else
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
+}
+#else
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+		pmd_t pmd, int offset, int pmd_flags2)
+{
+}
+#endif
+
+static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			     struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct pagemapread *pm = walk->private;
+	spinlock_t *ptl;
+	pte_t *pte, *orig_pte;
+	int err = 0;
+
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		int pmd_flags2;
+
+		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
+			pmd_flags2 = __PM_SOFT_DIRTY;
+		else
+			pmd_flags2 = 0;
+
+		for (; addr != end; addr += PAGE_SIZE) {
+			unsigned long offset;
+			pagemap_entry_t pme;
+
+			offset = (addr & ~PAGEMAP_WALK_MASK) >>
+					PAGE_SHIFT;
+			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
+			err = add_to_pagemap(addr, &pme, pm);
+			if (err)
+				break;
+		}
+		spin_unlock(ptl);
+		return err;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	/*
+	 * We can assume that @vma always points to a valid one and @end never
+	 * goes beyond vma->vm_end.
+	 */
+	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		pagemap_entry_t pme;
+
+		pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+		err = add_to_pagemap(addr, &pme, pm);
+		if (err)
+			break;
+	}
+	pte_unmap_unlock(orig_pte, ptl);
+
+	cond_resched();
+
+	return err;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+					pte_t pte, int offset, int flags2)
+{
+	if (pte_present(pte))
+		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)	|
+				PM_STATUS2(pm->v2, flags2)		|
+				PM_PRESENT);
+	else
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2)			|
+				PM_STATUS2(pm->v2, flags2));
+}
+
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
+{
+	struct pagemapread *pm = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	int err = 0;
+	int flags2;
+	pagemap_entry_t pme;
+
+	if (vma->vm_flags & VM_SOFTDIRTY)
+		flags2 = __PM_SOFT_DIRTY;
+	else
+		flags2 = 0;
+
+	for (; addr != end; addr += PAGE_SIZE) {
+		int offset = (addr & ~hmask) >> PAGE_SHIFT;
+		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+		err = add_to_pagemap(addr, &pme, pm);
+		if (err)
+			return err;
+	}
+
+	cond_resched();
+
+	return err;
+}
+#endif /* HUGETLB_PAGE */
+
+/*
+ * /proc/pid/pagemap - an array mapping virtual pages to pfns
+ *
+ * For each page in the address space, this file contains one 64-bit entry
+ * consisting of the following:
+ *
+ * Bits 0-54  page frame number (PFN) if present
+ * Bits 0-4   swap type if swapped
+ * Bits 5-54  swap offset if swapped
+ * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit  61    page is file-page or shared-anon
+ * Bit  62    page swapped
+ * Bit  63    page present
+ *
+ * If the page is not present but in swap, then the PFN contains an
+ * encoding of the swap file number and the page's offset into the
+ * swap. Unmapped pages return a null PFN. This allows determining
+ * precisely which pages are mapped (or in swap) and comparing mapped
+ * pages between processes.
+ *
+ * Efficient users of this interface will use /proc/pid/maps to
+ * determine which areas of memory are actually mapped and llseek to
+ * skip over unmapped regions.
+ */
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	struct mm_struct *mm;
+	struct pagemapread pm;
+	int ret = -ESRCH;
+	struct mm_walk pagemap_walk = {};
+	unsigned long src;
+	unsigned long svpfn;
+	unsigned long start_vaddr;
+	unsigned long end_vaddr;
+	int copied = 0;
+
+	if (!task)
+		goto out;
+
+	ret = -EINVAL;
+	/* file position must be aligned */
+	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
+		goto out_task;
+
+	ret = 0;
+	if (!count)
+		goto out_task;
+
+	pm.v2 = soft_dirty_cleared;
+	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+	pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
+	ret = -ENOMEM;
+	if (!pm.buffer)
+		goto out_task;
+
+	mm = mm_access(task, PTRACE_MODE_READ);
+	ret = PTR_ERR(mm);
+	if (!mm || IS_ERR(mm))
+		goto out_free;
+
+	pagemap_walk.pmd_entry = pagemap_pte_range;
+	pagemap_walk.pte_hole = pagemap_pte_hole;
+#ifdef CONFIG_HUGETLB_PAGE
+	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
+#endif
+	pagemap_walk.mm = mm;
+	pagemap_walk.private = &pm;
+
+	src = *ppos;
+	svpfn = src / PM_ENTRY_BYTES;
+	start_vaddr = svpfn << PAGE_SHIFT;
+	end_vaddr = TASK_SIZE_OF(task);
+
+	/* watch out for wraparound */
+	if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+		start_vaddr = end_vaddr;
+
+	/*
+	 * The odds are that this will stop walking way
+	 * before end_vaddr, because the length of the
+	 * user buffer is tracked in "pm", and the walk
+	 * will stop when we hit the end of the buffer.
+	 */
+	ret = 0;
+	while (count && (start_vaddr < end_vaddr)) {
+		int len;
+		unsigned long end;
+
+		pm.pos = 0;
+		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
+		/* overflow ? */
+		if (end < start_vaddr || end > end_vaddr)
+			end = end_vaddr;
+		down_read(&mm->mmap_sem);
+		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		up_read(&mm->mmap_sem);
+		start_vaddr = end;
+
+		len = min(count, PM_ENTRY_BYTES * pm.pos);
+		if (copy_to_user(buf, pm.buffer, len)) {
+			ret = -EFAULT;
+			goto out_mm;
+		}
+		copied += len;
+		buf += len;
+		count -= len;
+	}
+	*ppos += copied;
+	if (!ret || ret == PM_END_OF_BUFFER)
+		ret = copied;
+
+out_mm:
+	mmput(mm);
+out_free:
+	kfree(pm.buffer);
+out_task:
+	put_task_struct(task);
+out:
+	return ret;
+}
+
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+	/* do not disclose physical addresses: attack vector */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
+			"to stop being page-shift some time soon. See the "
+			"linux/Documentation/vm/pagemap.txt for details.\n");
+	return 0;
+}
+
+const struct file_operations proc_pagemap_operations = {
+	.llseek		= mem_lseek, /* borrow this */
+	.read		= pagemap_read,
+	.open		= pagemap_open,
+};
+#endif /* CONFIG_PROC_PAGE_MONITOR */
+
+#ifdef CONFIG_NUMA
+
+struct numa_maps {
+	unsigned long pages;
+	unsigned long anon;
+	unsigned long active;
+	unsigned long writeback;
+	unsigned long mapcount_max;
+	unsigned long dirty;
+	unsigned long swapcache;
+	unsigned long node[MAX_NUMNODES];
+};
+
+struct numa_maps_private {
+	struct proc_maps_private proc_maps;
+	struct numa_maps md;
+};
+
+static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
+			unsigned long nr_pages)
+{
+	int count = page_mapcount(page);
+
+	md->pages += nr_pages;
+	if (pte_dirty || PageDirty(page))
+		md->dirty += nr_pages;
+
+	if (PageSwapCache(page))
+		md->swapcache += nr_pages;
+
+	if (PageActive(page) || PageUnevictable(page))
+		md->active += nr_pages;
+
+	if (PageWriteback(page))
+		md->writeback += nr_pages;
+
+	if (PageAnon(page))
+		md->anon += nr_pages;
+
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
+	md->node[page_to_nid(page)] += nr_pages;
+}
+
+static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
+		unsigned long addr)
+{
+	struct page *page;
+	int nid;
+
+	if (!pte_present(pte))
+		return NULL;
+
+	page = vm_normal_page(vma, addr, pte);
+	if (!page)
+		return NULL;
+
+	if (PageReserved(page))
+		return NULL;
+
+	nid = page_to_nid(page);
+	if (!node_isset(nid, node_states[N_MEMORY]))
+		return NULL;
+
+	return page;
+}
+
+static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+		unsigned long end, struct mm_walk *walk)
+{
+	struct numa_maps *md = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	spinlock_t *ptl;
+	pte_t *orig_pte;
+	pte_t *pte;
+
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		pte_t huge_pte = *(pte_t *)pmd;
+		struct page *page;
+
+		page = can_gather_numa_stats(huge_pte, vma, addr);
+		if (page)
+			gather_stats(page, md, pte_dirty(huge_pte),
+				     HPAGE_PMD_SIZE/PAGE_SIZE);
+		spin_unlock(ptl);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	do {
+		struct page *page = can_gather_numa_stats(*pte, vma, addr);
+		if (!page)
+			continue;
+		gather_stats(page, md, pte_dirty(*pte), 1);
+
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap_unlock(orig_pte, ptl);
+	return 0;
+}
+#ifdef CONFIG_HUGETLB_PAGE
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
+		unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+	struct numa_maps *md;
+	struct page *page;
+
+	if (!pte_present(*pte))
+		return 0;
+
+	page = pte_page(*pte);
+	if (!page)
+		return 0;
+
+	md = walk->private;
+	gather_stats(page, md, pte_dirty(*pte), 1);
+	return 0;
+}
+
+#else
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
+		unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+static int show_numa_map(struct seq_file *m, void *v, int is_pid)
+{
+	struct numa_maps_private *numa_priv = m->private;
+	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
+	struct vm_area_struct *vma = v;
+	struct numa_maps *md = &numa_priv->md;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
+	struct mm_walk walk = {
+		.hugetlb_entry = gather_hugetlb_stats,
+		.pmd_entry = gather_pte_stats,
+		.private = md,
+		.mm = mm,
+	};
+	struct mempolicy *pol;
+	char buffer[64];
+	int nid;
+
+	if (!mm)
+		return 0;
+
+	/* Ensure we start with an empty set of numa_maps statistics. */
+	memset(md, 0, sizeof(*md));
+
+	pol = __get_vma_policy(vma, vma->vm_start);
+	if (pol) {
+		mpol_to_str(buffer, sizeof(buffer), pol);
+		mpol_cond_put(pol);
+	} else {
+		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
+	}
+
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+	if (file) {
+		seq_puts(m, " file=");
+		seq_path(m, &file->f_path, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_puts(m, " heap");
+	} else {
+		pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
+		if (tid != 0) {
+			/*
+			 * Thread stack in /proc/PID/task/TID/maps or
+			 * the main process stack.
+			 */
+			if (!is_pid || (vma->vm_start <= mm->start_stack &&
+			    vma->vm_end >= mm->start_stack))
+				seq_puts(m, " stack");
+			else
+				seq_printf(m, " stack:%d", tid);
+		}
+	}
+
+	if (is_vm_hugetlb_page(vma))
+		seq_puts(m, " huge");
+
+	/* mmap_sem is held by m_start */
+	walk_page_vma(vma, &walk);
+
+	if (!md->pages)
+		goto out;
+
+	if (md->anon)
+		seq_printf(m, " anon=%lu", md->anon);
+
+	if (md->dirty)
+		seq_printf(m, " dirty=%lu", md->dirty);
+
+	if (md->pages != md->anon && md->pages != md->dirty)
+		seq_printf(m, " mapped=%lu", md->pages);
+
+	if (md->mapcount_max > 1)
+		seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+	if (md->swapcache)
+		seq_printf(m, " swapcache=%lu", md->swapcache);
+
+	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+		seq_printf(m, " active=%lu", md->active);
+
+	if (md->writeback)
+		seq_printf(m, " writeback=%lu", md->writeback);
+
+	for_each_node_state(nid, N_MEMORY)
+		if (md->node[nid])
+			seq_printf(m, " N%d=%lu", nid, md->node[nid]);
+
+	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
+out:
+	seq_putc(m, '\n');
+	m_cache_vma(m, vma);
+	return 0;
+}
+
+static int show_pid_numa_map(struct seq_file *m, void *v)
+{
+	return show_numa_map(m, v, 1);
+}
+
+static int show_tid_numa_map(struct seq_file *m, void *v)
+{
+	return show_numa_map(m, v, 0);
+}
+
+static const struct seq_operations proc_pid_numa_maps_op = {
+	.start  = m_start,
+	.next   = m_next,
+	.stop   = m_stop,
+	.show   = show_pid_numa_map,
+};
+
+static const struct seq_operations proc_tid_numa_maps_op = {
+	.start  = m_start,
+	.next   = m_next,
+	.stop   = m_stop,
+	.show   = show_tid_numa_map,
+};
+
+static int numa_maps_open(struct inode *inode, struct file *file,
+			  const struct seq_operations *ops)
+{
+	return proc_maps_open(inode, file, ops,
+				sizeof(struct numa_maps_private));
+}
+
+static int pid_numa_maps_open(struct inode *inode, struct file *file)
+{
+	return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
+}
+
+static int tid_numa_maps_open(struct inode *inode, struct file *file)
+{
+	return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
+}
+
+const struct file_operations proc_pid_numa_maps_operations = {
+	.open		= pid_numa_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_map_release,
+};
+
+const struct file_operations proc_tid_numa_maps_operations = {
+	.open		= tid_numa_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_map_release,
+};
+#endif /* CONFIG_NUMA */
diff --git a/kernel/fs/proc/task_nommu.c b/kernel/fs/proc/task_nommu.c
new file mode 100644
index 000000000..599ec2e20
--- /dev/null
+++ b/kernel/fs/proc/task_nommu.c
@@ -0,0 +1,345 @@
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/fs_struct.h>
+#include <linux/mount.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * Logic: we've got two memory sums for each process, "shared", and
+ * "non-shared". Shared memory may get counted more than once, for
+ * each process that owns it. Non-shared memory is counted
+ * accurately.
+ */
+void task_mem(struct seq_file *m, struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	struct vm_region *region;
+	struct rb_node *p;
+	unsigned long bytes = 0, sbytes = 0, slack = 0, size;
+        
+	down_read(&mm->mmap_sem);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+
+		bytes += kobjsize(vma);
+
+		region = vma->vm_region;
+		if (region) {
+			size = kobjsize(region);
+			size += region->vm_end - region->vm_start;
+		} else {
+			size = vma->vm_end - vma->vm_start;
+		}
+
+		if (atomic_read(&mm->mm_count) > 1 ||
+		    vma->vm_flags & VM_MAYSHARE) {
+			sbytes += size;
+		} else {
+			bytes += size;
+			if (region)
+				slack = region->vm_end - vma->vm_end;
+		}
+	}
+
+	if (atomic_read(&mm->mm_count) > 1)
+		sbytes += kobjsize(mm);
+	else
+		bytes += kobjsize(mm);
+	
+	if (current->fs && current->fs->users > 1)
+		sbytes += kobjsize(current->fs);
+	else
+		bytes += kobjsize(current->fs);
+
+	if (current->files && atomic_read(&current->files->count) > 1)
+		sbytes += kobjsize(current->files);
+	else
+		bytes += kobjsize(current->files);
+
+	if (current->sighand && atomic_read(&current->sighand->count) > 1)
+		sbytes += kobjsize(current->sighand);
+	else
+		bytes += kobjsize(current->sighand);
+
+	bytes += kobjsize(current); /* includes kernel stack */
+
+	seq_printf(m,
+		"Mem:\t%8lu bytes\n"
+		"Slack:\t%8lu bytes\n"
+		"Shared:\t%8lu bytes\n",
+		bytes, slack, sbytes);
+
+	up_read(&mm->mmap_sem);
+}
+
+unsigned long task_vsize(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	struct rb_node *p;
+	unsigned long vsize = 0;
+
+	down_read(&mm->mmap_sem);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		vsize += vma->vm_end - vma->vm_start;
+	}
+	up_read(&mm->mmap_sem);
+	return vsize;
+}
+
+unsigned long task_statm(struct mm_struct *mm,
+			 unsigned long *shared, unsigned long *text,
+			 unsigned long *data, unsigned long *resident)
+{
+	struct vm_area_struct *vma;
+	struct vm_region *region;
+	struct rb_node *p;
+	unsigned long size = kobjsize(mm);
+
+	down_read(&mm->mmap_sem);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		size += kobjsize(vma);
+		region = vma->vm_region;
+		if (region) {
+			size += kobjsize(region);
+			size += region->vm_end - region->vm_start;
+		}
+	}
+
+	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
+		>> PAGE_SHIFT;
+	*data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK))
+		>> PAGE_SHIFT;
+	up_read(&mm->mmap_sem);
+	size >>= PAGE_SHIFT;
+	size += *text + *data;
+	*resident = size;
+	return size;
+}
+
+static pid_t pid_of_stack(struct proc_maps_private *priv,
+				struct vm_area_struct *vma, bool is_pid)
+{
+	struct inode *inode = priv->inode;
+	struct task_struct *task;
+	pid_t ret = 0;
+
+	rcu_read_lock();
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		task = task_of_stack(task, vma, is_pid);
+		if (task)
+			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * display a single VMA to a sequenced file
+ */
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
+			  int is_pid)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct proc_maps_private *priv = m->private;
+	unsigned long ino = 0;
+	struct file *file;
+	dev_t dev = 0;
+	int flags;
+	unsigned long long pgoff = 0;
+
+	flags = vma->vm_flags;
+	file = vma->vm_file;
+
+	if (file) {
+		struct inode *inode = file_inode(vma->vm_file);
+		dev = inode->i_sb->s_dev;
+		ino = inode->i_ino;
+		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+	}
+
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+	seq_printf(m,
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+		   vma->vm_start,
+		   vma->vm_end,
+		   flags & VM_READ ? 'r' : '-',
+		   flags & VM_WRITE ? 'w' : '-',
+		   flags & VM_EXEC ? 'x' : '-',
+		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+		   pgoff,
+		   MAJOR(dev), MINOR(dev), ino);
+
+	if (file) {
+		seq_pad(m, ' ');
+		seq_path(m, &file->f_path, "");
+	} else if (mm) {
+		pid_t tid = pid_of_stack(priv, vma, is_pid);
+
+		if (tid != 0) {
+			seq_pad(m, ' ');
+			/*
+			 * Thread stack in /proc/PID/task/TID/maps or
+			 * the main process stack.
+			 */
+			if (!is_pid || (vma->vm_start <= mm->start_stack &&
+			    vma->vm_end >= mm->start_stack))
+				seq_printf(m, "[stack]");
+			else
+				seq_printf(m, "[stack:%d]", tid);
+		}
+	}
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+/*
+ * display mapping lines for a particular process's /proc/pid/maps
+ */
+static int show_map(struct seq_file *m, void *_p, int is_pid)
+{
+	struct rb_node *p = _p;
+
+	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
+			      is_pid);
+}
+
+static int show_pid_map(struct seq_file *m, void *_p)
+{
+	return show_map(m, _p, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *_p)
+{
+	return show_map(m, _p, 0);
+}
+
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+	struct proc_maps_private *priv = m->private;
+	struct mm_struct *mm;
+	struct rb_node *p;
+	loff_t n = *pos;
+
+	/* pin the task and mm whilst we play with them */
+	priv->task = get_proc_task(priv->inode);
+	if (!priv->task)
+		return ERR_PTR(-ESRCH);
+
+	mm = priv->mm;
+	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+		return NULL;
+
+	down_read(&mm->mmap_sem);
+	/* start from the Nth VMA */
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
+		if (n-- == 0)
+			return p;
+
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	return NULL;
+}
+
+static void m_stop(struct seq_file *m, void *_vml)
+{
+	struct proc_maps_private *priv = m->private;
+
+	if (!IS_ERR_OR_NULL(_vml)) {
+		up_read(&priv->mm->mmap_sem);
+		mmput(priv->mm);
+	}
+	if (priv->task) {
+		put_task_struct(priv->task);
+		priv->task = NULL;
+	}
+}
+
+static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
+{
+	struct rb_node *p = _p;
+
+	(*pos)++;
+	return p ? rb_next(p) : NULL;
+}
+
+static const struct seq_operations proc_pid_maps_ops = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_pid_map
+};
+
+static const struct seq_operations proc_tid_maps_ops = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_tid_map
+};
+
+static int maps_open(struct inode *inode, struct file *file,
+		     const struct seq_operations *ops)
+{
+	struct proc_maps_private *priv;
+
+	priv = __seq_open_private(file, ops, sizeof(*priv));
+	if (!priv)
+		return -ENOMEM;
+
+	priv->inode = inode;
+	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR(priv->mm)) {
+		int err = PTR_ERR(priv->mm);
+
+		seq_release_private(inode, file);
+		return err;
+	}
+
+	return 0;
+}
+
+
+static int map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct proc_maps_private *priv = seq->private;
+
+	if (priv->mm)
+		mmdrop(priv->mm);
+
+	return seq_release_private(inode, file);
+}
+
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+	return maps_open(inode, file, &proc_pid_maps_ops);
+}
+
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+	return maps_open(inode, file, &proc_tid_maps_ops);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+	.open		= pid_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= map_release,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+	.open		= tid_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= map_release,
+};
+
diff --git a/kernel/fs/proc/thread_self.c b/kernel/fs/proc/thread_self.c
new file mode 100644
index 000000000..a8371993b
--- /dev/null
+++ b/kernel/fs/proc/thread_self.c
@@ -0,0 +1,85 @@
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+/*
+ * /proc/thread_self:
+ */
+static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
+			      int buflen)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	pid_t pid = task_pid_nr_ns(current, ns);
+	char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF];
+	if (!pid)
+		return -ENOENT;
+	sprintf(tmp, "%d/task/%d", tgid, pid);
+	return readlink_copy(buffer, buflen, tmp);
+}
+
+static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	pid_t pid = task_pid_nr_ns(current, ns);
+	char *name = ERR_PTR(-ENOENT);
+	if (pid) {
+		name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d/task/%d", tgid, pid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static const struct inode_operations proc_thread_self_inode_operations = {
+	.readlink	= proc_thread_self_readlink,
+	.follow_link	= proc_thread_self_follow_link,
+	.put_link	= kfree_put_link,
+};
+
+static unsigned thread_self_inum;
+
+int proc_setup_thread_self(struct super_block *s)
+{
+	struct inode *root_inode = d_inode(s->s_root);
+	struct pid_namespace *ns = s->s_fs_info;
+	struct dentry *thread_self;
+
+	mutex_lock(&root_inode->i_mutex);
+	thread_self = d_alloc_name(s->s_root, "thread-self");
+	if (thread_self) {
+		struct inode *inode = new_inode_pseudo(s);
+		if (inode) {
+			inode->i_ino = thread_self_inum;
+			inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+			inode->i_mode = S_IFLNK | S_IRWXUGO;
+			inode->i_uid = GLOBAL_ROOT_UID;
+			inode->i_gid = GLOBAL_ROOT_GID;
+			inode->i_op = &proc_thread_self_inode_operations;
+			d_add(thread_self, inode);
+		} else {
+			dput(thread_self);
+			thread_self = ERR_PTR(-ENOMEM);
+		}
+	} else {
+		thread_self = ERR_PTR(-ENOMEM);
+	}
+	mutex_unlock(&root_inode->i_mutex);
+	if (IS_ERR(thread_self)) {
+		pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
+		return PTR_ERR(thread_self);
+	}
+	ns->proc_thread_self = thread_self;
+	return 0;
+}
+
+void __init proc_thread_self_init(void)
+{
+	proc_alloc_inum(&thread_self_inum);
+}
diff --git a/kernel/fs/proc/uptime.c b/kernel/fs/proc/uptime.c
new file mode 100644
index 000000000..33de567c2
--- /dev/null
+++ b/kernel/fs/proc/uptime.c
@@ -0,0 +1,52 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/time.h>
+#include <linux/kernel_stat.h>
+#include <linux/cputime.h>
+
+static int uptime_proc_show(struct seq_file *m, void *v)
+{
+	struct timespec uptime;
+	struct timespec idle;
+	u64 idletime;
+	u64 nsec;
+	u32 rem;
+	int i;
+
+	idletime = 0;
+	for_each_possible_cpu(i)
+		idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+
+	get_monotonic_boottime(&uptime);
+	nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
+	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+	idle.tv_nsec = rem;
+	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
+			(unsigned long) uptime.tv_sec,
+			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
+			(unsigned long) idle.tv_sec,
+			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
+	return 0;
+}
+
+static int uptime_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, uptime_proc_show, NULL);
+}
+
+static const struct file_operations uptime_proc_fops = {
+	.open		= uptime_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_uptime_init(void)
+{
+	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	return 0;
+}
+fs_initcall(proc_uptime_init);
diff --git a/kernel/fs/proc/version.c b/kernel/fs/proc/version.c
new file mode 100644
index 000000000..d2154eb6d
--- /dev/null
+++ b/kernel/fs/proc/version.c
@@ -0,0 +1,34 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/utsname.h>
+
+static int version_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, linux_proc_banner,
+		utsname()->sysname,
+		utsname()->release,
+		utsname()->version);
+	return 0;
+}
+
+static int version_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, version_proc_show, NULL);
+}
+
+static const struct file_operations version_proc_fops = {
+	.open		= version_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_version_init(void)
+{
+	proc_create("version", 0, NULL, &version_proc_fops);
+	return 0;
+}
+fs_initcall(proc_version_init);
diff --git a/kernel/fs/proc/vmcore.c b/kernel/fs/proc/vmcore.c
new file mode 100644
index 000000000..4e61388ec
--- /dev/null
+++ b/kernel/fs/proc/vmcore.c
@@ -0,0 +1,1194 @@
+/*
+ *	fs/proc/vmcore.c Interface for accessing the crash
+ * 				 dump from the system's previous life.
+ * 	Heavily borrowed from fs/proc/kcore.c
+ *	Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *	Copyright (C) IBM Corporation, 2004. All rights reserved
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/kcore.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/printk.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/crash_dump.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include "internal.h"
+
+/* List representing chunks of contiguous memory areas and their offsets in
+ * vmcore file.
+ */
+static LIST_HEAD(vmcore_list);
+
+/* Stores the pointer to the buffer containing kernel elf core headers. */
+static char *elfcorebuf;
+static size_t elfcorebuf_sz;
+static size_t elfcorebuf_sz_orig;
+
+static char *elfnotes_buf;
+static size_t elfnotes_sz;
+
+/* Total size of vmcore file. */
+static u64 vmcore_size;
+
+static struct proc_dir_entry *proc_vmcore;
+
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * The called function has to take care of module refcounting.
+ */
+static int (*oldmem_pfn_is_ram)(unsigned long pfn);
+
+int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+	if (oldmem_pfn_is_ram)
+		return -EBUSY;
+	oldmem_pfn_is_ram = fn;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
+
+void unregister_oldmem_pfn_is_ram(void)
+{
+	oldmem_pfn_is_ram = NULL;
+	wmb();
+}
+EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
+
+static int pfn_is_ram(unsigned long pfn)
+{
+	int (*fn)(unsigned long pfn);
+	/* pfn is ram unless fn() checks pagetype */
+	int ret = 1;
+
+	/*
+	 * Ask hypervisor if the pfn is really ram.
+	 * A ballooned page contains no data and reading from such a page
+	 * will cause high load in the hypervisor.
+	 */
+	fn = oldmem_pfn_is_ram;
+	if (fn)
+		ret = fn(pfn);
+
+	return ret;
+}
+
+/* Reads a page from the oldmem device from given offset. */
+static ssize_t read_from_oldmem(char *buf, size_t count,
+				u64 *ppos, int userbuf)
+{
+	unsigned long pfn, offset;
+	size_t nr_bytes;
+	ssize_t read = 0, tmp;
+
+	if (!count)
+		return 0;
+
+	offset = (unsigned long)(*ppos % PAGE_SIZE);
+	pfn = (unsigned long)(*ppos / PAGE_SIZE);
+
+	do {
+		if (count > (PAGE_SIZE - offset))
+			nr_bytes = PAGE_SIZE - offset;
+		else
+			nr_bytes = count;
+
+		/* If pfn is not ram, return zeros for sparse dump files */
+		if (pfn_is_ram(pfn) == 0)
+			memset(buf, 0, nr_bytes);
+		else {
+			tmp = copy_oldmem_page(pfn, buf, nr_bytes,
+						offset, userbuf);
+			if (tmp < 0)
+				return tmp;
+		}
+		*ppos += nr_bytes;
+		count -= nr_bytes;
+		buf += nr_bytes;
+		read += nr_bytes;
+		++pfn;
+		offset = 0;
+	} while (count);
+
+	return read;
+}
+
+/*
+ * Architectures may override this function to allocate ELF header in 2nd kernel
+ */
+int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
+{
+	return 0;
+}
+
+/*
+ * Architectures may override this function to free header
+ */
+void __weak elfcorehdr_free(unsigned long long addr)
+{}
+
+/*
+ * Architectures may override this function to read from ELF header
+ */
+ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+	return read_from_oldmem(buf, count, ppos, 0);
+}
+
+/*
+ * Architectures may override this function to read from notes sections
+ */
+ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
+{
+	return read_from_oldmem(buf, count, ppos, 0);
+}
+
+/*
+ * Architectures may override this function to map oldmem
+ */
+int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
+				  unsigned long from, unsigned long pfn,
+				  unsigned long size, pgprot_t prot)
+{
+	return remap_pfn_range(vma, from, pfn, size, prot);
+}
+
+/*
+ * Copy to either kernel or user space
+ */
+static int copy_to(void *target, void *src, size_t size, int userbuf)
+{
+	if (userbuf) {
+		if (copy_to_user((char __user *) target, src, size))
+			return -EFAULT;
+	} else {
+		memcpy(target, src, size);
+	}
+	return 0;
+}
+
+/* Read from the ELF header and then the crash dump. On error, negative value is
+ * returned otherwise number of bytes read are returned.
+ */
+static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
+			     int userbuf)
+{
+	ssize_t acc = 0, tmp;
+	size_t tsz;
+	u64 start;
+	struct vmcore *m = NULL;
+
+	if (buflen == 0 || *fpos >= vmcore_size)
+		return 0;
+
+	/* trim buflen to not go beyond EOF */
+	if (buflen > vmcore_size - *fpos)
+		buflen = vmcore_size - *fpos;
+
+	/* Read ELF core header */
+	if (*fpos < elfcorebuf_sz) {
+		tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
+		if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
+			return -EFAULT;
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	}
+
+	/* Read Elf note segment */
+	if (*fpos < elfcorebuf_sz + elfnotes_sz) {
+		void *kaddr;
+
+		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+		kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+		if (copy_to(buffer, kaddr, tsz, userbuf))
+			return -EFAULT;
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	}
+
+	list_for_each_entry(m, &vmcore_list, list) {
+		if (*fpos < m->offset + m->size) {
+			tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
+			start = m->paddr + *fpos - m->offset;
+			tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
+			if (tmp < 0)
+				return tmp;
+			buflen -= tsz;
+			*fpos += tsz;
+			buffer += tsz;
+			acc += tsz;
+
+			/* leave now if filled buffer already */
+			if (buflen == 0)
+				return acc;
+		}
+	}
+
+	return acc;
+}
+
+static ssize_t read_vmcore(struct file *file, char __user *buffer,
+			   size_t buflen, loff_t *fpos)
+{
+	return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
+}
+
+/*
+ * The vmcore fault handler uses the page cache and fills data using the
+ * standard __vmcore_read() function.
+ *
+ * On s390 the fault handler is used for memory regions that can't be mapped
+ * directly with remap_pfn_range().
+ */
+static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+#ifdef CONFIG_S390
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	pgoff_t index = vmf->pgoff;
+	struct page *page;
+	loff_t offset;
+	char *buf;
+	int rc;
+
+	page = find_or_create_page(mapping, index, GFP_KERNEL);
+	if (!page)
+		return VM_FAULT_OOM;
+	if (!PageUptodate(page)) {
+		offset = (loff_t) index << PAGE_CACHE_SHIFT;
+		buf = __va((page_to_pfn(page) << PAGE_SHIFT));
+		rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
+		if (rc < 0) {
+			unlock_page(page);
+			page_cache_release(page);
+			return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+		}
+		SetPageUptodate(page);
+	}
+	unlock_page(page);
+	vmf->page = page;
+	return 0;
+#else
+	return VM_FAULT_SIGBUS;
+#endif
+}
+
+static const struct vm_operations_struct vmcore_mmap_ops = {
+	.fault = mmap_vmcore_fault,
+};
+
+/**
+ * alloc_elfnotes_buf - allocate buffer for ELF note segment in
+ *                      vmalloc memory
+ *
+ * @notes_sz: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *alloc_elfnotes_buf(size_t notes_sz)
+{
+#ifdef CONFIG_MMU
+	return vmalloc_user(notes_sz);
+#else
+	return vzalloc(notes_sz);
+#endif
+}
+
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+/*
+ * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
+ * reported as not being ram with the zero page.
+ *
+ * @vma: vm_area_struct describing requested mapping
+ * @from: start remapping from
+ * @pfn: page frame number to start remapping to
+ * @size: remapping size
+ * @prot: protection bits
+ *
+ * Returns zero on success, -EAGAIN on failure.
+ */
+static int remap_oldmem_pfn_checked(struct vm_area_struct *vma,
+				    unsigned long from, unsigned long pfn,
+				    unsigned long size, pgprot_t prot)
+{
+	unsigned long map_size;
+	unsigned long pos_start, pos_end, pos;
+	unsigned long zeropage_pfn = my_zero_pfn(0);
+	size_t len = 0;
+
+	pos_start = pfn;
+	pos_end = pfn + (size >> PAGE_SHIFT);
+
+	for (pos = pos_start; pos < pos_end; ++pos) {
+		if (!pfn_is_ram(pos)) {
+			/*
+			 * We hit a page which is not ram. Remap the continuous
+			 * region between pos_start and pos-1 and replace
+			 * the non-ram page at pos with the zero page.
+			 */
+			if (pos > pos_start) {
+				/* Remap continuous region */
+				map_size = (pos - pos_start) << PAGE_SHIFT;
+				if (remap_oldmem_pfn_range(vma, from + len,
+							   pos_start, map_size,
+							   prot))
+					goto fail;
+				len += map_size;
+			}
+			/* Remap the zero page */
+			if (remap_oldmem_pfn_range(vma, from + len,
+						   zeropage_pfn,
+						   PAGE_SIZE, prot))
+				goto fail;
+			len += PAGE_SIZE;
+			pos_start = pos + 1;
+		}
+	}
+	if (pos > pos_start) {
+		/* Remap the rest */
+		map_size = (pos - pos_start) << PAGE_SHIFT;
+		if (remap_oldmem_pfn_range(vma, from + len, pos_start,
+					   map_size, prot))
+			goto fail;
+	}
+	return 0;
+fail:
+	do_munmap(vma->vm_mm, from, len);
+	return -EAGAIN;
+}
+
+static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
+			    unsigned long from, unsigned long pfn,
+			    unsigned long size, pgprot_t prot)
+{
+	/*
+	 * Check if oldmem_pfn_is_ram was registered to avoid
+	 * looping over all pages without a reason.
+	 */
+	if (oldmem_pfn_is_ram)
+		return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
+	else
+		return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+}
+
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start;
+	u64 start, end, len, tsz;
+	struct vmcore *m;
+
+	start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+	end = start + size;
+
+	if (size > vmcore_size || end > vmcore_size)
+		return -EINVAL;
+
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+
+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+	vma->vm_flags |= VM_MIXEDMAP;
+	vma->vm_ops = &vmcore_mmap_ops;
+
+	len = 0;
+
+	if (start < elfcorebuf_sz) {
+		u64 pfn;
+
+		tsz = min(elfcorebuf_sz - (size_t)start, size);
+		pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
+		if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
+				    vma->vm_page_prot))
+			return -EAGAIN;
+		size -= tsz;
+		start += tsz;
+		len += tsz;
+
+		if (size == 0)
+			return 0;
+	}
+
+	if (start < elfcorebuf_sz + elfnotes_sz) {
+		void *kaddr;
+
+		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
+		kaddr = elfnotes_buf + start - elfcorebuf_sz;
+		if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
+						kaddr, tsz))
+			goto fail;
+		size -= tsz;
+		start += tsz;
+		len += tsz;
+
+		if (size == 0)
+			return 0;
+	}
+
+	list_for_each_entry(m, &vmcore_list, list) {
+		if (start < m->offset + m->size) {
+			u64 paddr = 0;
+
+			tsz = min_t(size_t, m->offset + m->size - start, size);
+			paddr = m->paddr + start - m->offset;
+			if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
+						    paddr >> PAGE_SHIFT, tsz,
+						    vma->vm_page_prot))
+				goto fail;
+			size -= tsz;
+			start += tsz;
+			len += tsz;
+
+			if (size == 0)
+				return 0;
+		}
+	}
+
+	return 0;
+fail:
+	do_munmap(vma->vm_mm, vma->vm_start, len);
+	return -EAGAIN;
+}
+#else
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+	return -ENOSYS;
+}
+#endif
+
+static const struct file_operations proc_vmcore_operations = {
+	.read		= read_vmcore,
+	.llseek		= default_llseek,
+	.mmap		= mmap_vmcore,
+};
+
+static struct vmcore* __init get_new_element(void)
+{
+	return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
+}
+
+static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+				  struct list_head *vc_list)
+{
+	u64 size;
+	struct vmcore *m;
+
+	size = elfsz + elfnotesegsz;
+	list_for_each_entry(m, vc_list, list) {
+		size += m->size;
+	}
+	return size;
+}
+
+/**
+ * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
+{
+	int i, rc=0;
+	Elf64_Phdr *phdr_ptr;
+	Elf64_Nhdr *nhdr_ptr;
+
+	phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		void *notes_section;
+		u64 offset, max_sz, sz, real_sz = 0;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		max_sz = phdr_ptr->p_memsz;
+		offset = phdr_ptr->p_offset;
+		notes_section = kmalloc(max_sz, GFP_KERNEL);
+		if (!notes_section)
+			return -ENOMEM;
+		rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
+		if (rc < 0) {
+			kfree(notes_section);
+			return rc;
+		}
+		nhdr_ptr = notes_section;
+		while (nhdr_ptr->n_namesz != 0) {
+			sz = sizeof(Elf64_Nhdr) +
+				(((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+				(((u64)nhdr_ptr->n_descsz + 3) & ~3);
+			if ((real_sz + sz) > max_sz) {
+				pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+					nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+				break;
+			}
+			real_sz += sz;
+			nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
+		}
+		kfree(notes_section);
+		phdr_ptr->p_memsz = real_sz;
+		if (real_sz == 0) {
+			pr_warn("Warning: Zero PT_NOTE entries found\n");
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * get_note_number_and_size_elf64 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
+						 int *nr_ptnote, u64 *sz_ptnote)
+{
+	int i;
+	Elf64_Phdr *phdr_ptr;
+
+	*nr_ptnote = *sz_ptnote = 0;
+
+	phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		*nr_ptnote += 1;
+		*sz_ptnote += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/**
+ * copy_notes_elf64 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
+{
+	int i, rc=0;
+	Elf64_Phdr *phdr_ptr;
+
+	phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 offset;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		offset = phdr_ptr->p_offset;
+		rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
+					   &offset);
+		if (rc < 0)
+			return rc;
+		notes_buf += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+					   char **notes_buf, size_t *notes_sz)
+{
+	int i, nr_ptnote=0, rc=0;
+	char *tmp;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr phdr;
+	u64 phdr_sz = 0, note_off;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+	rc = update_note_header_size_elf64(ehdr_ptr);
+	if (rc < 0)
+		return rc;
+
+	rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
+	if (rc < 0)
+		return rc;
+
+	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
+	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	if (!*notes_buf)
+		return -ENOMEM;
+
+	rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
+	if (rc < 0)
+		return rc;
+
+	/* Prepare merged PT_NOTE program header. */
+	phdr.p_type    = PT_NOTE;
+	phdr.p_flags   = 0;
+	note_off = sizeof(Elf64_Ehdr) +
+			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
+	phdr.p_offset  = roundup(note_off, PAGE_SIZE);
+	phdr.p_vaddr   = phdr.p_paddr = 0;
+	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
+	phdr.p_align   = 0;
+
+	/* Add merged PT_NOTE program header*/
+	tmp = elfptr + sizeof(Elf64_Ehdr);
+	memcpy(tmp, &phdr, sizeof(phdr));
+	tmp += sizeof(phdr);
+
+	/* Remove unwanted PT_NOTE program headers. */
+	i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
+	*elfsz = *elfsz - i;
+	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+	memset(elfptr + *elfsz, 0, i);
+	*elfsz = roundup(*elfsz, PAGE_SIZE);
+
+	/* Modify e_phnum to reflect merged headers. */
+	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+
+	return 0;
+}
+
+/**
+ * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
+{
+	int i, rc=0;
+	Elf32_Phdr *phdr_ptr;
+	Elf32_Nhdr *nhdr_ptr;
+
+	phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		void *notes_section;
+		u64 offset, max_sz, sz, real_sz = 0;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		max_sz = phdr_ptr->p_memsz;
+		offset = phdr_ptr->p_offset;
+		notes_section = kmalloc(max_sz, GFP_KERNEL);
+		if (!notes_section)
+			return -ENOMEM;
+		rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
+		if (rc < 0) {
+			kfree(notes_section);
+			return rc;
+		}
+		nhdr_ptr = notes_section;
+		while (nhdr_ptr->n_namesz != 0) {
+			sz = sizeof(Elf32_Nhdr) +
+				(((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+				(((u64)nhdr_ptr->n_descsz + 3) & ~3);
+			if ((real_sz + sz) > max_sz) {
+				pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+					nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+				break;
+			}
+			real_sz += sz;
+			nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
+		}
+		kfree(notes_section);
+		phdr_ptr->p_memsz = real_sz;
+		if (real_sz == 0) {
+			pr_warn("Warning: Zero PT_NOTE entries found\n");
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * get_note_number_and_size_elf32 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
+						 int *nr_ptnote, u64 *sz_ptnote)
+{
+	int i;
+	Elf32_Phdr *phdr_ptr;
+
+	*nr_ptnote = *sz_ptnote = 0;
+
+	phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		*nr_ptnote += 1;
+		*sz_ptnote += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/**
+ * copy_notes_elf32 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
+{
+	int i, rc=0;
+	Elf32_Phdr *phdr_ptr;
+
+	phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 offset;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		offset = phdr_ptr->p_offset;
+		rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
+					   &offset);
+		if (rc < 0)
+			return rc;
+		notes_buf += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
+					   char **notes_buf, size_t *notes_sz)
+{
+	int i, nr_ptnote=0, rc=0;
+	char *tmp;
+	Elf32_Ehdr *ehdr_ptr;
+	Elf32_Phdr phdr;
+	u64 phdr_sz = 0, note_off;
+
+	ehdr_ptr = (Elf32_Ehdr *)elfptr;
+
+	rc = update_note_header_size_elf32(ehdr_ptr);
+	if (rc < 0)
+		return rc;
+
+	rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
+	if (rc < 0)
+		return rc;
+
+	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
+	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	if (!*notes_buf)
+		return -ENOMEM;
+
+	rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
+	if (rc < 0)
+		return rc;
+
+	/* Prepare merged PT_NOTE program header. */
+	phdr.p_type    = PT_NOTE;
+	phdr.p_flags   = 0;
+	note_off = sizeof(Elf32_Ehdr) +
+			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
+	phdr.p_offset  = roundup(note_off, PAGE_SIZE);
+	phdr.p_vaddr   = phdr.p_paddr = 0;
+	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
+	phdr.p_align   = 0;
+
+	/* Add merged PT_NOTE program header*/
+	tmp = elfptr + sizeof(Elf32_Ehdr);
+	memcpy(tmp, &phdr, sizeof(phdr));
+	tmp += sizeof(phdr);
+
+	/* Remove unwanted PT_NOTE program headers. */
+	i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
+	*elfsz = *elfsz - i;
+	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
+	memset(elfptr + *elfsz, 0, i);
+	*elfsz = roundup(*elfsz, PAGE_SIZE);
+
+	/* Modify e_phnum to reflect merged headers. */
+	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+
+	return 0;
+}
+
+/* Add memory chunks represented by program headers to vmcore list. Also update
+ * the new offset fields of exported program headers. */
+static int __init process_ptload_program_headers_elf64(char *elfptr,
+						size_t elfsz,
+						size_t elfnotes_sz,
+						struct list_head *vc_list)
+{
+	int i;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr *phdr_ptr;
+	loff_t vmcore_off;
+	struct vmcore *new;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
+
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 paddr, start, end, size;
+
+		if (phdr_ptr->p_type != PT_LOAD)
+			continue;
+
+		paddr = phdr_ptr->p_offset;
+		start = rounddown(paddr, PAGE_SIZE);
+		end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+		size = end - start;
+
+		/* Add this contiguous chunk of memory to vmcore list.*/
+		new = get_new_element();
+		if (!new)
+			return -ENOMEM;
+		new->paddr = start;
+		new->size = size;
+		list_add_tail(&new->list, vc_list);
+
+		/* Update the program header offset. */
+		phdr_ptr->p_offset = vmcore_off + (paddr - start);
+		vmcore_off = vmcore_off + size;
+	}
+	return 0;
+}
+
+static int __init process_ptload_program_headers_elf32(char *elfptr,
+						size_t elfsz,
+						size_t elfnotes_sz,
+						struct list_head *vc_list)
+{
+	int i;
+	Elf32_Ehdr *ehdr_ptr;
+	Elf32_Phdr *phdr_ptr;
+	loff_t vmcore_off;
+	struct vmcore *new;
+
+	ehdr_ptr = (Elf32_Ehdr *)elfptr;
+	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
+
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 paddr, start, end, size;
+
+		if (phdr_ptr->p_type != PT_LOAD)
+			continue;
+
+		paddr = phdr_ptr->p_offset;
+		start = rounddown(paddr, PAGE_SIZE);
+		end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+		size = end - start;
+
+		/* Add this contiguous chunk of memory to vmcore list.*/
+		new = get_new_element();
+		if (!new)
+			return -ENOMEM;
+		new->paddr = start;
+		new->size = size;
+		list_add_tail(&new->list, vc_list);
+
+		/* Update the program header offset */
+		phdr_ptr->p_offset = vmcore_off + (paddr - start);
+		vmcore_off = vmcore_off + size;
+	}
+	return 0;
+}
+
+/* Sets offset fields of vmcore elements. */
+static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+					   struct list_head *vc_list)
+{
+	loff_t vmcore_off;
+	struct vmcore *m;
+
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
+
+	list_for_each_entry(m, vc_list, list) {
+		m->offset = vmcore_off;
+		vmcore_off += m->size;
+	}
+}
+
+static void free_elfcorebuf(void)
+{
+	free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+	elfcorebuf = NULL;
+	vfree(elfnotes_buf);
+	elfnotes_buf = NULL;
+}
+
+static int __init parse_crash_elf64_headers(void)
+{
+	int rc=0;
+	Elf64_Ehdr ehdr;
+	u64 addr;
+
+	addr = elfcorehdr_addr;
+
+	/* Read Elf header */
+	rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
+	if (rc < 0)
+		return rc;
+
+	/* Do some basic Verification. */
+	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
+		(ehdr.e_type != ET_CORE) ||
+		!vmcore_elf64_check_arch(&ehdr) ||
+		ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
+		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
+		ehdr.e_version != EV_CURRENT ||
+		ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||
+		ehdr.e_phentsize != sizeof(Elf64_Phdr) ||
+		ehdr.e_phnum == 0) {
+		pr_warn("Warning: Core image elf header is not sane\n");
+		return -EINVAL;
+	}
+
+	/* Read in all elf headers. */
+	elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
+				ehdr.e_phnum * sizeof(Elf64_Phdr);
+	elfcorebuf_sz = elfcorebuf_sz_orig;
+	elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					      get_order(elfcorebuf_sz_orig));
+	if (!elfcorebuf)
+		return -ENOMEM;
+	addr = elfcorehdr_addr;
+	rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
+	if (rc < 0)
+		goto fail;
+
+	/* Merge all PT_NOTE headers into one. */
+	rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
+				      &elfnotes_buf, &elfnotes_sz);
+	if (rc)
+		goto fail;
+	rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
+						  elfnotes_sz, &vmcore_list);
+	if (rc)
+		goto fail;
+	set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+	return 0;
+fail:
+	free_elfcorebuf();
+	return rc;
+}
+
+static int __init parse_crash_elf32_headers(void)
+{
+	int rc=0;
+	Elf32_Ehdr ehdr;
+	u64 addr;
+
+	addr = elfcorehdr_addr;
+
+	/* Read Elf header */
+	rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
+	if (rc < 0)
+		return rc;
+
+	/* Do some basic Verification. */
+	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
+		(ehdr.e_type != ET_CORE) ||
+		!elf_check_arch(&ehdr) ||
+		ehdr.e_ident[EI_CLASS] != ELFCLASS32||
+		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
+		ehdr.e_version != EV_CURRENT ||
+		ehdr.e_ehsize != sizeof(Elf32_Ehdr) ||
+		ehdr.e_phentsize != sizeof(Elf32_Phdr) ||
+		ehdr.e_phnum == 0) {
+		pr_warn("Warning: Core image elf header is not sane\n");
+		return -EINVAL;
+	}
+
+	/* Read in all elf headers. */
+	elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
+	elfcorebuf_sz = elfcorebuf_sz_orig;
+	elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					      get_order(elfcorebuf_sz_orig));
+	if (!elfcorebuf)
+		return -ENOMEM;
+	addr = elfcorehdr_addr;
+	rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
+	if (rc < 0)
+		goto fail;
+
+	/* Merge all PT_NOTE headers into one. */
+	rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
+				      &elfnotes_buf, &elfnotes_sz);
+	if (rc)
+		goto fail;
+	rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
+						  elfnotes_sz, &vmcore_list);
+	if (rc)
+		goto fail;
+	set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+	return 0;
+fail:
+	free_elfcorebuf();
+	return rc;
+}
+
+static int __init parse_crash_elf_headers(void)
+{
+	unsigned char e_ident[EI_NIDENT];
+	u64 addr;
+	int rc=0;
+
+	addr = elfcorehdr_addr;
+	rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr);
+	if (rc < 0)
+		return rc;
+	if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
+		pr_warn("Warning: Core image elf header not found\n");
+		return -EINVAL;
+	}
+
+	if (e_ident[EI_CLASS] == ELFCLASS64) {
+		rc = parse_crash_elf64_headers();
+		if (rc)
+			return rc;
+	} else if (e_ident[EI_CLASS] == ELFCLASS32) {
+		rc = parse_crash_elf32_headers();
+		if (rc)
+			return rc;
+	} else {
+		pr_warn("Warning: Core image elf header is not sane\n");
+		return -EINVAL;
+	}
+
+	/* Determine vmcore size. */
+	vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+				      &vmcore_list);
+
+	return 0;
+}
+
+/* Init function for vmcore module. */
+static int __init vmcore_init(void)
+{
+	int rc = 0;
+
+	/* Allow architectures to allocate ELF header in 2nd kernel */
+	rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size);
+	if (rc)
+		return rc;
+	/*
+	 * If elfcorehdr= has been passed in cmdline or created in 2nd kernel,
+	 * then capture the dump.
+	 */
+	if (!(is_vmcore_usable()))
+		return rc;
+	rc = parse_crash_elf_headers();
+	if (rc) {
+		pr_warn("Kdump: vmcore not initialized\n");
+		return rc;
+	}
+	elfcorehdr_free(elfcorehdr_addr);
+	elfcorehdr_addr = ELFCORE_ADDR_ERR;
+
+	proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
+	if (proc_vmcore)
+		proc_vmcore->size = vmcore_size;
+	return 0;
+}
+fs_initcall(vmcore_init);
+
+/* Cleanup function for vmcore module. */
+void vmcore_cleanup(void)
+{
+	struct list_head *pos, *next;
+
+	if (proc_vmcore) {
+		proc_remove(proc_vmcore);
+		proc_vmcore = NULL;
+	}
+
+	/* clear the vmcore list. */
+	list_for_each_safe(pos, next, &vmcore_list) {
+		struct vmcore *m;
+
+		m = list_entry(pos, struct vmcore, list);
+		list_del(&m->list);
+		kfree(m);
+	}
+	free_elfcorebuf();
+}
diff --git a/kernel/fs/proc_namespace.c b/kernel/fs/proc_namespace.c
new file mode 100644
index 000000000..8db932da4
--- /dev/null
+++ b/kernel/fs/proc_namespace.c
@@ -0,0 +1,335 @@
+/*
+ * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats}
+ *
+ * In fact, that's a piece of procfs; it's *almost* isolated from
+ * the rest of fs/proc, but has rather close relationships with
+ * fs/namespace.c, thus here instead of fs/proc
+ *
+ */
+#include <linux/mnt_namespace.h>
+#include <linux/nsproxy.h>
+#include <linux/security.h>
+#include <linux/fs_struct.h>
+#include "proc/internal.h" /* only for get_proc_task() in ->open() */
+
+#include "pnode.h"
+#include "internal.h"
+
+static unsigned mounts_poll(struct file *file, poll_table *wait)
+{
+	struct proc_mounts *p = proc_mounts(file->private_data);
+	struct mnt_namespace *ns = p->ns;
+	unsigned res = POLLIN | POLLRDNORM;
+	int event;
+
+	poll_wait(file, &p->ns->poll, wait);
+
+	event = ACCESS_ONCE(ns->event);
+	if (p->m.poll_event != event) {
+		p->m.poll_event = event;
+		res |= POLLERR | POLLPRI;
+	}
+
+	return res;
+}
+
+struct proc_fs_info {
+	int flag;
+	const char *str;
+};
+
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
+{
+	static const struct proc_fs_info fs_info[] = {
+		{ MS_SYNCHRONOUS, ",sync" },
+		{ MS_DIRSYNC, ",dirsync" },
+		{ MS_MANDLOCK, ",mand" },
+		{ MS_LAZYTIME, ",lazytime" },
+		{ 0, NULL }
+	};
+	const struct proc_fs_info *fs_infop;
+
+	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
+		if (sb->s_flags & fs_infop->flag)
+			seq_puts(m, fs_infop->str);
+	}
+
+	return security_sb_show_options(m, sb);
+}
+
+static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
+{
+	static const struct proc_fs_info mnt_info[] = {
+		{ MNT_NOSUID, ",nosuid" },
+		{ MNT_NODEV, ",nodev" },
+		{ MNT_NOEXEC, ",noexec" },
+		{ MNT_NOATIME, ",noatime" },
+		{ MNT_NODIRATIME, ",nodiratime" },
+		{ MNT_RELATIME, ",relatime" },
+		{ 0, NULL }
+	};
+	const struct proc_fs_info *fs_infop;
+
+	for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
+		if (mnt->mnt_flags & fs_infop->flag)
+			seq_puts(m, fs_infop->str);
+	}
+}
+
+static inline void mangle(struct seq_file *m, const char *s)
+{
+	seq_escape(m, s, " \t\n\\");
+}
+
+static void show_type(struct seq_file *m, struct super_block *sb)
+{
+	mangle(m, sb->s_type->name);
+	if (sb->s_subtype && sb->s_subtype[0]) {
+		seq_putc(m, '.');
+		mangle(m, sb->s_subtype);
+	}
+}
+
+static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct proc_mounts *p = proc_mounts(m);
+	struct mount *r = real_mount(mnt);
+	int err = 0;
+	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+	struct super_block *sb = mnt_path.dentry->d_sb;
+
+	if (sb->s_op->show_devname) {
+		err = sb->s_op->show_devname(m, mnt_path.dentry);
+		if (err)
+			goto out;
+	} else {
+		mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+	}
+	seq_putc(m, ' ');
+	/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
+	err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\");
+	if (err)
+		goto out;
+	seq_putc(m, ' ');
+	show_type(m, sb);
+	seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
+	err = show_sb_opts(m, sb);
+	if (err)
+		goto out;
+	show_mnt_opts(m, mnt);
+	if (sb->s_op->show_options)
+		err = sb->s_op->show_options(m, mnt_path.dentry);
+	seq_puts(m, " 0 0\n");
+out:
+	return err;
+}
+
+static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct proc_mounts *p = proc_mounts(m);
+	struct mount *r = real_mount(mnt);
+	struct super_block *sb = mnt->mnt_sb;
+	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+	int err = 0;
+
+	seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
+		   MAJOR(sb->s_dev), MINOR(sb->s_dev));
+	if (sb->s_op->show_path)
+		err = sb->s_op->show_path(m, mnt->mnt_root);
+	else
+		seq_dentry(m, mnt->mnt_root, " \t\n\\");
+	if (err)
+		goto out;
+	seq_putc(m, ' ');
+
+	/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
+	err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\");
+	if (err)
+		goto out;
+
+	seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
+	show_mnt_opts(m, mnt);
+
+	/* Tagged fields ("foo:X" or "bar") */
+	if (IS_MNT_SHARED(r))
+		seq_printf(m, " shared:%i", r->mnt_group_id);
+	if (IS_MNT_SLAVE(r)) {
+		int master = r->mnt_master->mnt_group_id;
+		int dom = get_dominating_id(r, &p->root);
+		seq_printf(m, " master:%i", master);
+		if (dom && dom != master)
+			seq_printf(m, " propagate_from:%i", dom);
+	}
+	if (IS_MNT_UNBINDABLE(r))
+		seq_puts(m, " unbindable");
+
+	/* Filesystem specific data */
+	seq_puts(m, " - ");
+	show_type(m, sb);
+	seq_putc(m, ' ');
+	if (sb->s_op->show_devname)
+		err = sb->s_op->show_devname(m, mnt->mnt_root);
+	else
+		mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+	if (err)
+		goto out;
+	seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
+	err = show_sb_opts(m, sb);
+	if (err)
+		goto out;
+	if (sb->s_op->show_options)
+		err = sb->s_op->show_options(m, mnt->mnt_root);
+	seq_putc(m, '\n');
+out:
+	return err;
+}
+
+static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct proc_mounts *p = proc_mounts(m);
+	struct mount *r = real_mount(mnt);
+	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+	struct super_block *sb = mnt_path.dentry->d_sb;
+	int err = 0;
+
+	/* device */
+	if (sb->s_op->show_devname) {
+		seq_puts(m, "device ");
+		err = sb->s_op->show_devname(m, mnt_path.dentry);
+	} else {
+		if (r->mnt_devname) {
+			seq_puts(m, "device ");
+			mangle(m, r->mnt_devname);
+		} else
+			seq_puts(m, "no device");
+	}
+
+	/* mount point */
+	seq_puts(m, " mounted on ");
+	/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
+	err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\");
+	if (err)
+		goto out;
+	seq_putc(m, ' ');
+
+	/* file system type */
+	seq_puts(m, "with fstype ");
+	show_type(m, sb);
+
+	/* optional statistics */
+	if (sb->s_op->show_stats) {
+		seq_putc(m, ' ');
+		if (!err)
+			err = sb->s_op->show_stats(m, mnt_path.dentry);
+	}
+
+	seq_putc(m, '\n');
+out:
+	return err;
+}
+
+static int mounts_open_common(struct inode *inode, struct file *file,
+			      int (*show)(struct seq_file *, struct vfsmount *))
+{
+	struct task_struct *task = get_proc_task(inode);
+	struct nsproxy *nsp;
+	struct mnt_namespace *ns = NULL;
+	struct path root;
+	struct proc_mounts *p;
+	int ret = -EINVAL;
+
+	if (!task)
+		goto err;
+
+	task_lock(task);
+	nsp = task->nsproxy;
+	if (!nsp || !nsp->mnt_ns) {
+		task_unlock(task);
+		put_task_struct(task);
+		goto err;
+	}
+	ns = nsp->mnt_ns;
+	get_mnt_ns(ns);
+	if (!task->fs) {
+		task_unlock(task);
+		put_task_struct(task);
+		ret = -ENOENT;
+		goto err_put_ns;
+	}
+	get_fs_root(task->fs, &root);
+	task_unlock(task);
+	put_task_struct(task);
+
+	ret = -ENOMEM;
+	p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
+	if (!p)
+		goto err_put_path;
+
+	file->private_data = &p->m;
+	ret = seq_open(file, &mounts_op);
+	if (ret)
+		goto err_free;
+
+	p->ns = ns;
+	p->root = root;
+	p->m.poll_event = ns->event;
+	p->show = show;
+	p->cached_event = ~0ULL;
+
+	return 0;
+
+ err_free:
+	kfree(p);
+ err_put_path:
+	path_put(&root);
+ err_put_ns:
+	put_mnt_ns(ns);
+ err:
+	return ret;
+}
+
+static int mounts_release(struct inode *inode, struct file *file)
+{
+	struct proc_mounts *p = proc_mounts(file->private_data);
+	path_put(&p->root);
+	put_mnt_ns(p->ns);
+	return seq_release(inode, file);
+}
+
+static int mounts_open(struct inode *inode, struct file *file)
+{
+	return mounts_open_common(inode, file, show_vfsmnt);
+}
+
+static int mountinfo_open(struct inode *inode, struct file *file)
+{
+	return mounts_open_common(inode, file, show_mountinfo);
+}
+
+static int mountstats_open(struct inode *inode, struct file *file)
+{
+	return mounts_open_common(inode, file, show_vfsstat);
+}
+
+const struct file_operations proc_mounts_operations = {
+	.open		= mounts_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= mounts_release,
+	.poll		= mounts_poll,
+};
+
+const struct file_operations proc_mountinfo_operations = {
+	.open		= mountinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= mounts_release,
+	.poll		= mounts_poll,
+};
+
+const struct file_operations proc_mountstats_operations = {
+	.open		= mountstats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= mounts_release,
+};
diff --git a/kernel/fs/pstore/Kconfig b/kernel/fs/pstore/Kconfig
new file mode 100644
index 000000000..916b8e23d
--- /dev/null
+++ b/kernel/fs/pstore/Kconfig
@@ -0,0 +1,62 @@
+config PSTORE
+	bool "Persistent store support"
+	default n
+	select ZLIB_DEFLATE
+	select ZLIB_INFLATE
+	help
+	   This option enables generic access to platform level
+	   persistent storage via "pstore" filesystem that can
+	   be mounted as /dev/pstore.  Only useful if you have
+	   a platform level driver that registers with pstore to
+	   provide the data, so you probably should just go say "Y"
+	   (or "M") to a platform specific persistent store driver
+	   (e.g. ACPI_APEI on X86) which will select this for you.
+	   If you don't have a platform persistent store driver,
+	   say N.
+
+config PSTORE_CONSOLE
+	bool "Log kernel console messages"
+	depends on PSTORE
+	help
+	  When the option is enabled, pstore will log all kernel
+	  messages, even if no oops or panic happened.
+
+config PSTORE_PMSG
+	bool "Log user space messages"
+	depends on PSTORE
+	help
+	  When the option is enabled, pstore will export a character
+	  interface /dev/pmsg0 to log user space messages. On reboot
+	  data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].
+
+	  If unsure, say N.
+
+config PSTORE_FTRACE
+	bool "Persistent function tracer"
+	depends on PSTORE
+	depends on FUNCTION_TRACER
+	depends on DEBUG_FS
+	help
+	  With this option kernel traces function calls into a persistent
+	  ram buffer that can be decoded and dumped after reboot through
+	  pstore filesystem. It can be used to determine what function
+	  was last called before a reset or panic.
+
+	  If unsure, say N.
+
+config PSTORE_RAM
+	tristate "Log panic/oops to a RAM buffer"
+	depends on PSTORE
+	depends on HAS_IOMEM
+	depends on HAVE_MEMBLOCK
+	select REED_SOLOMON
+	select REED_SOLOMON_ENC8
+	select REED_SOLOMON_DEC8
+	help
+	  This enables panic and oops messages to be logged to a circular
+	  buffer in RAM where it can be read back at some later point.
+
+	  Note that for historical reasons, the module will be named
+	  "ramoops.ko".
+
+	  For more information, see Documentation/ramoops.txt.
diff --git a/kernel/fs/pstore/Makefile b/kernel/fs/pstore/Makefile
new file mode 100644
index 000000000..e647d8e81
--- /dev/null
+++ b/kernel/fs/pstore/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for the linux pstorefs routines.
+#
+
+obj-y += pstore.o
+
+pstore-objs += inode.o platform.o
+obj-$(CONFIG_PSTORE_FTRACE)	+= ftrace.o
+
+obj-$(CONFIG_PSTORE_PMSG)	+= pmsg.o
+
+ramoops-objs += ram.o ram_core.o
+obj-$(CONFIG_PSTORE_RAM)	+= ramoops.o
diff --git a/kernel/fs/pstore/ftrace.c b/kernel/fs/pstore/ftrace.c
new file mode 100644
index 000000000..76a4eeb92
--- /dev/null
+++ b/kernel/fs/pstore/ftrace.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2012  Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <linux/irqflags.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+#include <linux/atomic.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/cache.h>
+#include <asm/barrier.h>
+#include "internal.h"
+
+static void notrace pstore_ftrace_call(unsigned long ip,
+				       unsigned long parent_ip,
+				       struct ftrace_ops *op,
+				       struct pt_regs *regs)
+{
+	unsigned long flags;
+	struct pstore_ftrace_record rec = {};
+
+	if (unlikely(oops_in_progress))
+		return;
+
+	local_irq_save(flags);
+
+	rec.ip = ip;
+	rec.parent_ip = parent_ip;
+	pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
+	psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
+			  0, sizeof(rec), psinfo);
+
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops pstore_ftrace_ops __read_mostly = {
+	.func	= pstore_ftrace_call,
+};
+
+static DEFINE_MUTEX(pstore_ftrace_lock);
+static bool pstore_ftrace_enabled;
+
+static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	u8 on;
+	ssize_t ret;
+
+	ret = kstrtou8_from_user(buf, count, 2, &on);
+	if (ret)
+		return ret;
+
+	mutex_lock(&pstore_ftrace_lock);
+
+	if (!on ^ pstore_ftrace_enabled)
+		goto out;
+
+	if (on)
+		ret = register_ftrace_function(&pstore_ftrace_ops);
+	else
+		ret = unregister_ftrace_function(&pstore_ftrace_ops);
+	if (ret) {
+		pr_err("%s: unable to %sregister ftrace ops: %zd\n",
+		       __func__, on ? "" : "un", ret);
+		goto err;
+	}
+
+	pstore_ftrace_enabled = on;
+out:
+	ret = count;
+err:
+	mutex_unlock(&pstore_ftrace_lock);
+
+	return ret;
+}
+
+static ssize_t pstore_ftrace_knob_read(struct file *f, char __user *buf,
+				       size_t count, loff_t *ppos)
+{
+	char val[] = { '0' + pstore_ftrace_enabled, '\n' };
+
+	return simple_read_from_buffer(buf, count, ppos, val, sizeof(val));
+}
+
+static const struct file_operations pstore_knob_fops = {
+	.open	= simple_open,
+	.read	= pstore_ftrace_knob_read,
+	.write	= pstore_ftrace_knob_write,
+};
+
+void pstore_register_ftrace(void)
+{
+	struct dentry *dir;
+	struct dentry *file;
+
+	if (!psinfo->write_buf)
+		return;
+
+	dir = debugfs_create_dir("pstore", NULL);
+	if (!dir) {
+		pr_err("%s: unable to create pstore directory\n", __func__);
+		return;
+	}
+
+	file = debugfs_create_file("record_ftrace", 0600, dir, NULL,
+				   &pstore_knob_fops);
+	if (!file) {
+		pr_err("%s: unable to create record_ftrace file\n", __func__);
+		goto err_file;
+	}
+
+	return;
+err_file:
+	debugfs_remove(dir);
+}
diff --git a/kernel/fs/pstore/inode.c b/kernel/fs/pstore/inode.c
new file mode 100644
index 000000000..3adcc4669
--- /dev/null
+++ b/kernel/fs/pstore/inode.c
@@ -0,0 +1,483 @@
+/*
+ * Persistent Storage - ramfs parts.
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/ramfs.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/magic.h>
+#include <linux/pstore.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/syslog.h>
+
+#include "internal.h"
+
+#define	PSTORE_NAMELEN	64
+
+static DEFINE_SPINLOCK(allpstore_lock);
+static LIST_HEAD(allpstore);
+
+struct pstore_private {
+	struct list_head list;
+	struct pstore_info *psi;
+	enum pstore_type_id type;
+	u64	id;
+	int	count;
+	ssize_t	size;
+	char	data[];
+};
+
+struct pstore_ftrace_seq_data {
+	const void *ptr;
+	size_t off;
+	size_t size;
+};
+
+#define REC_SIZE sizeof(struct pstore_ftrace_record)
+
+static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct pstore_private *ps = s->private;
+	struct pstore_ftrace_seq_data *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return NULL;
+
+	data->off = ps->size % REC_SIZE;
+	data->off += *pos * REC_SIZE;
+	if (data->off + REC_SIZE > ps->size) {
+		kfree(data);
+		return NULL;
+	}
+
+	return data;
+
+}
+
+static void pstore_ftrace_seq_stop(struct seq_file *s, void *v)
+{
+	kfree(v);
+}
+
+static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct pstore_private *ps = s->private;
+	struct pstore_ftrace_seq_data *data = v;
+
+	data->off += REC_SIZE;
+	if (data->off + REC_SIZE > ps->size)
+		return NULL;
+
+	(*pos)++;
+	return data;
+}
+
+static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
+{
+	struct pstore_private *ps = s->private;
+	struct pstore_ftrace_seq_data *data = v;
+	struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
+
+	seq_printf(s, "%d %08lx  %08lx  %pf <- %pF\n",
+		pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
+		(void *)rec->ip, (void *)rec->parent_ip);
+
+	return 0;
+}
+
+static const struct seq_operations pstore_ftrace_seq_ops = {
+	.start	= pstore_ftrace_seq_start,
+	.next	= pstore_ftrace_seq_next,
+	.stop	= pstore_ftrace_seq_stop,
+	.show	= pstore_ftrace_seq_show,
+};
+
+static int pstore_check_syslog_permissions(struct pstore_private *ps)
+{
+	switch (ps->type) {
+	case PSTORE_TYPE_DMESG:
+	case PSTORE_TYPE_CONSOLE:
+		return check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+			SYSLOG_FROM_READER);
+	default:
+		return 0;
+	}
+}
+
+static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
+						size_t count, loff_t *ppos)
+{
+	struct seq_file *sf = file->private_data;
+	struct pstore_private *ps = sf->private;
+
+	if (ps->type == PSTORE_TYPE_FTRACE)
+		return seq_read(file, userbuf, count, ppos);
+	return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
+}
+
+static int pstore_file_open(struct inode *inode, struct file *file)
+{
+	struct pstore_private *ps = inode->i_private;
+	struct seq_file *sf;
+	int err;
+	const struct seq_operations *sops = NULL;
+
+	err = pstore_check_syslog_permissions(ps);
+	if (err)
+		return err;
+
+	if (ps->type == PSTORE_TYPE_FTRACE)
+		sops = &pstore_ftrace_seq_ops;
+
+	err = seq_open(file, sops);
+	if (err < 0)
+		return err;
+
+	sf = file->private_data;
+	sf->private = ps;
+
+	return 0;
+}
+
+static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
+{
+	struct seq_file *sf = file->private_data;
+
+	if (sf->op)
+		return seq_lseek(file, off, whence);
+	return default_llseek(file, off, whence);
+}
+
+static const struct file_operations pstore_file_operations = {
+	.open		= pstore_file_open,
+	.read		= pstore_file_read,
+	.llseek		= pstore_file_llseek,
+	.release	= seq_release,
+};
+
+/*
+ * When a file is unlinked from our file system we call the
+ * platform driver to erase the record from persistent store.
+ */
+static int pstore_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct pstore_private *p = d_inode(dentry)->i_private;
+	int err;
+
+	err = pstore_check_syslog_permissions(p);
+	if (err)
+		return err;
+
+	if (p->psi->erase)
+		p->psi->erase(p->type, p->id, p->count,
+			      d_inode(dentry)->i_ctime, p->psi);
+	else
+		return -EPERM;
+
+	return simple_unlink(dir, dentry);
+}
+
+static void pstore_evict_inode(struct inode *inode)
+{
+	struct pstore_private	*p = inode->i_private;
+	unsigned long		flags;
+
+	clear_inode(inode);
+	if (p) {
+		spin_lock_irqsave(&allpstore_lock, flags);
+		list_del(&p->list);
+		spin_unlock_irqrestore(&allpstore_lock, flags);
+		kfree(p);
+	}
+}
+
+static const struct inode_operations pstore_dir_inode_operations = {
+	.lookup		= simple_lookup,
+	.unlink		= pstore_unlink,
+};
+
+static struct inode *pstore_get_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	}
+	return inode;
+}
+
+enum {
+	Opt_kmsg_bytes, Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_kmsg_bytes, "kmsg_bytes=%u"},
+	{Opt_err, NULL}
+};
+
+static void parse_options(char *options)
+{
+	char		*p;
+	substring_t	args[MAX_OPT_ARGS];
+	int		option;
+
+	if (!options)
+		return;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_kmsg_bytes:
+			if (!match_int(&args[0], &option))
+				pstore_set_kmsg_bytes(option);
+			break;
+		}
+	}
+}
+
+static int pstore_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	parse_options(data);
+
+	return 0;
+}
+
+static const struct super_operations pstore_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= pstore_evict_inode,
+	.remount_fs	= pstore_remount,
+	.show_options	= generic_show_options,
+};
+
+static struct super_block *pstore_sb;
+
+int pstore_is_mounted(void)
+{
+	return pstore_sb != NULL;
+}
+
+/*
+ * Make a regular file in the root directory of our file system.
+ * Load it up with "size" bytes of data from "buf".
+ * Set the mtime & ctime to the date that this record was originally stored.
+ */
+int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
+		  char *data, bool compressed, size_t size,
+		  struct timespec time, struct pstore_info *psi)
+{
+	struct dentry		*root = pstore_sb->s_root;
+	struct dentry		*dentry;
+	struct inode		*inode;
+	int			rc = 0;
+	char			name[PSTORE_NAMELEN];
+	struct pstore_private	*private, *pos;
+	unsigned long		flags;
+
+	spin_lock_irqsave(&allpstore_lock, flags);
+	list_for_each_entry(pos, &allpstore, list) {
+		if (pos->type == type &&
+		    pos->id == id &&
+		    pos->psi == psi) {
+			rc = -EEXIST;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&allpstore_lock, flags);
+	if (rc)
+		return rc;
+
+	rc = -ENOMEM;
+	inode = pstore_get_inode(pstore_sb);
+	if (!inode)
+		goto fail;
+	inode->i_mode = S_IFREG | 0444;
+	inode->i_fop = &pstore_file_operations;
+	private = kmalloc(sizeof *private + size, GFP_KERNEL);
+	if (!private)
+		goto fail_alloc;
+	private->type = type;
+	private->id = id;
+	private->count = count;
+	private->psi = psi;
+
+	switch (type) {
+	case PSTORE_TYPE_DMESG:
+		scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
+			  psname, id, compressed ? ".enc.z" : "");
+		break;
+	case PSTORE_TYPE_CONSOLE:
+		scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_FTRACE:
+		scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_MCE:
+		scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_PPC_RTAS:
+		scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_PPC_OF:
+		scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
+			  psname, id);
+		break;
+	case PSTORE_TYPE_PPC_COMMON:
+		scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
+			  psname, id);
+		break;
+	case PSTORE_TYPE_PMSG:
+		scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_PPC_OPAL:
+		sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_UNKNOWN:
+		scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
+		break;
+	default:
+		scnprintf(name, sizeof(name), "type%d-%s-%lld",
+			  type, psname, id);
+		break;
+	}
+
+	mutex_lock(&d_inode(root)->i_mutex);
+
+	dentry = d_alloc_name(root, name);
+	if (!dentry)
+		goto fail_lockedalloc;
+
+	memcpy(private->data, data, size);
+	inode->i_size = private->size = size;
+
+	inode->i_private = private;
+
+	if (time.tv_sec)
+		inode->i_mtime = inode->i_ctime = time;
+
+	d_add(dentry, inode);
+
+	spin_lock_irqsave(&allpstore_lock, flags);
+	list_add(&private->list, &allpstore);
+	spin_unlock_irqrestore(&allpstore_lock, flags);
+
+	mutex_unlock(&d_inode(root)->i_mutex);
+
+	return 0;
+
+fail_lockedalloc:
+	mutex_unlock(&d_inode(root)->i_mutex);
+	kfree(private);
+fail_alloc:
+	iput(inode);
+
+fail:
+	return rc;
+}
+
+static int pstore_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+
+	save_mount_options(sb, data);
+
+	pstore_sb = sb;
+
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_blocksize		= PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_magic		= PSTOREFS_MAGIC;
+	sb->s_op		= &pstore_ops;
+	sb->s_time_gran		= 1;
+
+	parse_options(data);
+
+	inode = pstore_get_inode(sb);
+	if (inode) {
+		inode->i_mode = S_IFDIR | 0755;
+		inode->i_op = &pstore_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+		inc_nlink(inode);
+	}
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		return -ENOMEM;
+
+	pstore_get_records(0);
+
+	return 0;
+}
+
+static struct dentry *pstore_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_single(fs_type, flags, data, pstore_fill_super);
+}
+
+static void pstore_kill_sb(struct super_block *sb)
+{
+	kill_litter_super(sb);
+	pstore_sb = NULL;
+}
+
+static struct file_system_type pstore_fs_type = {
+	.name		= "pstore",
+	.mount		= pstore_mount,
+	.kill_sb	= pstore_kill_sb,
+};
+
+static int __init init_pstore_fs(void)
+{
+	int err;
+
+	/* Create a convenient mount point for people to access pstore */
+	err = sysfs_create_mount_point(fs_kobj, "pstore");
+	if (err)
+		goto out;
+
+	err = register_filesystem(&pstore_fs_type);
+	if (err < 0)
+		sysfs_remove_mount_point(fs_kobj, "pstore");
+
+out:
+	return err;
+}
+module_init(init_pstore_fs)
+
+MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/pstore/internal.h b/kernel/fs/pstore/internal.h
new file mode 100644
index 000000000..c36ba2cd0
--- /dev/null
+++ b/kernel/fs/pstore/internal.h
@@ -0,0 +1,64 @@
+#ifndef __PSTORE_INTERNAL_H__
+#define __PSTORE_INTERNAL_H__
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/pstore.h>
+
+#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
+#define PSTORE_CPU_IN_IP 0x1
+#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
+#define PSTORE_CPU_IN_IP 0x3
+#endif
+
+struct pstore_ftrace_record {
+	unsigned long ip;
+	unsigned long parent_ip;
+#ifndef PSTORE_CPU_IN_IP
+	unsigned int cpu;
+#endif
+};
+
+static inline void
+pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
+{
+#ifndef PSTORE_CPU_IN_IP
+	rec->cpu = cpu;
+#else
+	rec->ip |= cpu;
+#endif
+}
+
+static inline unsigned int
+pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
+{
+#ifndef PSTORE_CPU_IN_IP
+	return rec->cpu;
+#else
+	return rec->ip & PSTORE_CPU_IN_IP;
+#endif
+}
+
+#ifdef CONFIG_PSTORE_FTRACE
+extern void pstore_register_ftrace(void);
+#else
+static inline void pstore_register_ftrace(void) {}
+#endif
+
+#ifdef CONFIG_PSTORE_PMSG
+extern void pstore_register_pmsg(void);
+#else
+static inline void pstore_register_pmsg(void) {}
+#endif
+
+extern struct pstore_info *psinfo;
+
+extern void	pstore_set_kmsg_bytes(int);
+extern void	pstore_get_records(int);
+extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
+			      int count, char *data, bool compressed,
+			      size_t size, struct timespec time,
+			      struct pstore_info *psi);
+extern int	pstore_is_mounted(void);
+
+#endif
diff --git a/kernel/fs/pstore/platform.c b/kernel/fs/pstore/platform.c
new file mode 100644
index 000000000..c4c9a10c5
--- /dev/null
+++ b/kernel/fs/pstore/platform.c
@@ -0,0 +1,547 @@
+/*
+ * Persistent Storage - platform driver interface parts.
+ *
+ * Copyright (C) 2007-2008 Google, Inc.
+ * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#define pr_fmt(fmt) "pstore: " fmt
+
+#include <linux/atomic.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/module.h>
+#include <linux/pstore.h>
+#include <linux/zlib.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+
+#include "internal.h"
+
+/*
+ * We defer making "oops" entries appear in pstore - see
+ * whether the system is actually still running well enough
+ * to let someone see the entry
+ */
+static int pstore_update_ms = -1;
+module_param_named(update_ms, pstore_update_ms, int, 0600);
+MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content "
+		 "(default is -1, which means runtime updates are disabled; "
+		 "enabling this option is not safe, it may lead to further "
+		 "corruption on Oopses)");
+
+static int pstore_new_entry;
+
+static void pstore_timefunc(unsigned long);
+static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0);
+
+static void pstore_dowork(struct work_struct *);
+static DECLARE_WORK(pstore_work, pstore_dowork);
+
+/*
+ * pstore_lock just protects "psinfo" during
+ * calls to pstore_register()
+ */
+static DEFINE_SPINLOCK(pstore_lock);
+struct pstore_info *psinfo;
+
+static char *backend;
+
+/* Compression parameters */
+#define COMPR_LEVEL 6
+#define WINDOW_BITS 12
+#define MEM_LEVEL 4
+static struct z_stream_s stream;
+
+static char *big_oops_buf;
+static size_t big_oops_buf_sz;
+
+/* How much of the console log to snapshot */
+static unsigned long kmsg_bytes = 10240;
+
+void pstore_set_kmsg_bytes(int bytes)
+{
+	kmsg_bytes = bytes;
+}
+
+/* Tag each group of saved records with a sequence number */
+static int	oopscount;
+
+static const char *get_reason_str(enum kmsg_dump_reason reason)
+{
+	switch (reason) {
+	case KMSG_DUMP_PANIC:
+		return "Panic";
+	case KMSG_DUMP_OOPS:
+		return "Oops";
+	case KMSG_DUMP_EMERG:
+		return "Emergency";
+	case KMSG_DUMP_RESTART:
+		return "Restart";
+	case KMSG_DUMP_HALT:
+		return "Halt";
+	case KMSG_DUMP_POWEROFF:
+		return "Poweroff";
+	default:
+		return "Unknown";
+	}
+}
+
+bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
+{
+	/*
+	 * In case of NMI path, pstore shouldn't be blocked
+	 * regardless of reason.
+	 */
+	if (in_nmi())
+		return true;
+
+	switch (reason) {
+	/* In panic case, other cpus are stopped by smp_send_stop(). */
+	case KMSG_DUMP_PANIC:
+	/* Emergency restart shouldn't be blocked by spin lock. */
+	case KMSG_DUMP_EMERG:
+		return true;
+	default:
+		return false;
+	}
+}
+EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
+
+/* Derived from logfs_compress() */
+static int pstore_compress(const void *in, void *out, size_t inlen,
+							size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+						MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	return ret;
+}
+
+/* Derived from logfs_uncompress */
+static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	err = zlib_inflateInit2(&stream, WINDOW_BITS);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_inflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	return ret;
+}
+
+static void allocate_buf_for_compression(void)
+{
+	size_t size;
+	size_t cmpr;
+
+	switch (psinfo->bufsize) {
+	/* buffer range for efivars */
+	case 1000 ... 2000:
+		cmpr = 56;
+		break;
+	case 2001 ... 3000:
+		cmpr = 54;
+		break;
+	case 3001 ... 3999:
+		cmpr = 52;
+		break;
+	/* buffer range for nvram, erst */
+	case 4000 ... 10000:
+		cmpr = 45;
+		break;
+	default:
+		cmpr = 60;
+		break;
+	}
+
+	big_oops_buf_sz = (psinfo->bufsize * 100) / cmpr;
+	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+	if (big_oops_buf) {
+		size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL),
+			zlib_inflate_workspacesize());
+		stream.workspace = kmalloc(size, GFP_KERNEL);
+		if (!stream.workspace) {
+			pr_err("No memory for compression workspace; skipping compression\n");
+			kfree(big_oops_buf);
+			big_oops_buf = NULL;
+		}
+	} else {
+		pr_err("No memory for uncompressed data; skipping compression\n");
+		stream.workspace = NULL;
+	}
+
+}
+
+/*
+ * Called when compression fails, since the printk buffer
+ * would be fetched for compression calling it again when
+ * compression fails would have moved the iterator of
+ * printk buffer which results in fetching old contents.
+ * Copy the recent messages from big_oops_buf to psinfo->buf
+ */
+static size_t copy_kmsg_to_buffer(int hsize, size_t len)
+{
+	size_t total_len;
+	size_t diff;
+
+	total_len = hsize + len;
+
+	if (total_len > psinfo->bufsize) {
+		diff = total_len - psinfo->bufsize + hsize;
+		memcpy(psinfo->buf, big_oops_buf, hsize);
+		memcpy(psinfo->buf + hsize, big_oops_buf + diff,
+					psinfo->bufsize - hsize);
+		total_len = psinfo->bufsize;
+	} else
+		memcpy(psinfo->buf, big_oops_buf, total_len);
+
+	return total_len;
+}
+
+/*
+ * callback from kmsg_dump. (s2,l2) has the most recently
+ * written bytes, older bytes are in (s1,l1). Save as much
+ * as we can from the end of the buffer.
+ */
+static void pstore_dump(struct kmsg_dumper *dumper,
+			enum kmsg_dump_reason reason)
+{
+	unsigned long	total = 0;
+	const char	*why;
+	u64		id;
+	unsigned int	part = 1;
+	unsigned long	flags = 0;
+	int		is_locked = 0;
+	int		ret;
+
+	why = get_reason_str(reason);
+
+	if (pstore_cannot_block_path(reason)) {
+		is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
+		if (!is_locked) {
+			pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
+				       , in_nmi() ? "NMI" : why);
+		}
+	} else
+		spin_lock_irqsave(&psinfo->buf_lock, flags);
+	oopscount++;
+	while (total < kmsg_bytes) {
+		char *dst;
+		unsigned long size;
+		int hsize;
+		int zipped_len = -1;
+		size_t len;
+		bool compressed;
+		size_t total_len;
+
+		if (big_oops_buf) {
+			dst = big_oops_buf;
+			hsize = sprintf(dst, "%s#%d Part%u\n", why,
+							oopscount, part);
+			size = big_oops_buf_sz - hsize;
+
+			if (!kmsg_dump_get_buffer(dumper, true, dst + hsize,
+								size, &len))
+				break;
+
+			zipped_len = pstore_compress(dst, psinfo->buf,
+						hsize + len, psinfo->bufsize);
+
+			if (zipped_len > 0) {
+				compressed = true;
+				total_len = zipped_len;
+			} else {
+				compressed = false;
+				total_len = copy_kmsg_to_buffer(hsize, len);
+			}
+		} else {
+			dst = psinfo->buf;
+			hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount,
+									part);
+			size = psinfo->bufsize - hsize;
+			dst += hsize;
+
+			if (!kmsg_dump_get_buffer(dumper, true, dst,
+								size, &len))
+				break;
+
+			compressed = false;
+			total_len = hsize + len;
+		}
+
+		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
+				    oopscount, compressed, total_len, psinfo);
+		if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
+			pstore_new_entry = 1;
+
+		total += total_len;
+		part++;
+	}
+	if (pstore_cannot_block_path(reason)) {
+		if (is_locked)
+			spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+	} else
+		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+}
+
+static struct kmsg_dumper pstore_dumper = {
+	.dump = pstore_dump,
+};
+
+#ifdef CONFIG_PSTORE_CONSOLE
+static void pstore_console_write(struct console *con, const char *s, unsigned c)
+{
+	const char *e = s + c;
+
+	while (s < e) {
+		unsigned long flags;
+		u64 id;
+
+		if (c > psinfo->bufsize)
+			c = psinfo->bufsize;
+
+		if (oops_in_progress) {
+			if (!spin_trylock_irqsave(&psinfo->buf_lock, flags))
+				break;
+		} else {
+			spin_lock_irqsave(&psinfo->buf_lock, flags);
+		}
+		memcpy(psinfo->buf, s, c);
+		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
+		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+		s += c;
+		c = e - s;
+	}
+}
+
+static struct console pstore_console = {
+	.name	= "pstore",
+	.write	= pstore_console_write,
+	.flags	= CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME,
+	.index	= -1,
+};
+
+static void pstore_register_console(void)
+{
+	register_console(&pstore_console);
+}
+#else
+static void pstore_register_console(void) {}
+#endif
+
+static int pstore_write_compat(enum pstore_type_id type,
+			       enum kmsg_dump_reason reason,
+			       u64 *id, unsigned int part, int count,
+			       bool compressed, size_t size,
+			       struct pstore_info *psi)
+{
+	return psi->write_buf(type, reason, id, part, psinfo->buf, compressed,
+			     size, psi);
+}
+
+/*
+ * platform specific persistent storage driver registers with
+ * us here. If pstore is already mounted, call the platform
+ * read function right away to populate the file system. If not
+ * then the pstore mount code will call us later to fill out
+ * the file system.
+ *
+ * Register with kmsg_dump to save last part of console log on panic.
+ */
+int pstore_register(struct pstore_info *psi)
+{
+	struct module *owner = psi->owner;
+
+	if (backend && strcmp(backend, psi->name))
+		return -EPERM;
+
+	spin_lock(&pstore_lock);
+	if (psinfo) {
+		spin_unlock(&pstore_lock);
+		return -EBUSY;
+	}
+
+	if (!psi->write)
+		psi->write = pstore_write_compat;
+	psinfo = psi;
+	mutex_init(&psinfo->read_mutex);
+	spin_unlock(&pstore_lock);
+
+	if (owner && !try_module_get(owner)) {
+		psinfo = NULL;
+		return -EINVAL;
+	}
+
+	allocate_buf_for_compression();
+
+	if (pstore_is_mounted())
+		pstore_get_records(0);
+
+	kmsg_dump_register(&pstore_dumper);
+
+	if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+		pstore_register_console();
+		pstore_register_ftrace();
+		pstore_register_pmsg();
+	}
+
+	if (pstore_update_ms >= 0) {
+		pstore_timer.expires = jiffies +
+			msecs_to_jiffies(pstore_update_ms);
+		add_timer(&pstore_timer);
+	}
+
+	pr_info("Registered %s as persistent store backend\n", psi->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_register);
+
+/*
+ * Read all the records from the persistent store. Create
+ * files in our filesystem.  Don't warn about -EEXIST errors
+ * when we are re-scanning the backing store looking to add new
+ * error records.
+ */
+void pstore_get_records(int quiet)
+{
+	struct pstore_info *psi = psinfo;
+	char			*buf = NULL;
+	ssize_t			size;
+	u64			id;
+	int			count;
+	enum pstore_type_id	type;
+	struct timespec		time;
+	int			failed = 0, rc;
+	bool			compressed;
+	int			unzipped_len = -1;
+
+	if (!psi)
+		return;
+
+	mutex_lock(&psi->read_mutex);
+	if (psi->open && psi->open(psi))
+		goto out;
+
+	while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed,
+				psi)) > 0) {
+		if (compressed && (type == PSTORE_TYPE_DMESG)) {
+			if (big_oops_buf)
+				unzipped_len = pstore_decompress(buf,
+							big_oops_buf, size,
+							big_oops_buf_sz);
+
+			if (unzipped_len > 0) {
+				kfree(buf);
+				buf = big_oops_buf;
+				size = unzipped_len;
+				compressed = false;
+			} else {
+				pr_err("decompression failed;returned %d\n",
+				       unzipped_len);
+				compressed = true;
+			}
+		}
+		rc = pstore_mkfile(type, psi->name, id, count, buf,
+				  compressed, (size_t)size, time, psi);
+		if (unzipped_len < 0) {
+			/* Free buffer other than big oops */
+			kfree(buf);
+			buf = NULL;
+		} else
+			unzipped_len = -1;
+		if (rc && (rc != -EEXIST || !quiet))
+			failed++;
+	}
+	if (psi->close)
+		psi->close(psi);
+out:
+	mutex_unlock(&psi->read_mutex);
+
+	if (failed)
+		pr_warn("failed to load %d record(s) from '%s'\n",
+			failed, psi->name);
+}
+
+static void pstore_dowork(struct work_struct *work)
+{
+	pstore_get_records(1);
+}
+
+static void pstore_timefunc(unsigned long dummy)
+{
+	if (pstore_new_entry) {
+		pstore_new_entry = 0;
+		schedule_work(&pstore_work);
+	}
+
+	mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
+}
+
+module_param(backend, charp, 0444);
+MODULE_PARM_DESC(backend, "Pstore backend to use");
diff --git a/kernel/fs/pstore/pmsg.c b/kernel/fs/pstore/pmsg.c
new file mode 100644
index 000000000..feb5dd294
--- /dev/null
+++ b/kernel/fs/pstore/pmsg.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2014  Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+
+static DEFINE_MUTEX(pmsg_lock);
+#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
+
+static ssize_t write_pmsg(struct file *file, const char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	size_t i, buffer_size;
+	char *buffer;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	buffer_size = count;
+	if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
+		buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
+	buffer = vmalloc(buffer_size);
+
+	mutex_lock(&pmsg_lock);
+	for (i = 0; i < count; ) {
+		size_t c = min(count - i, buffer_size);
+		u64 id;
+		long ret;
+
+		ret = __copy_from_user(buffer, buf + i, c);
+		if (unlikely(ret != 0)) {
+			mutex_unlock(&pmsg_lock);
+			vfree(buffer);
+			return -EFAULT;
+		}
+		psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
+				  psinfo);
+
+		i += c;
+	}
+
+	mutex_unlock(&pmsg_lock);
+	vfree(buffer);
+	return count;
+}
+
+static const struct file_operations pmsg_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+	.write		= write_pmsg,
+};
+
+static struct class *pmsg_class;
+static int pmsg_major;
+#define PMSG_NAME "pmsg"
+#undef pr_fmt
+#define pr_fmt(fmt) PMSG_NAME ": " fmt
+
+static char *pmsg_devnode(struct device *dev, umode_t *mode)
+{
+	if (mode)
+		*mode = 0220;
+	return NULL;
+}
+
+void pstore_register_pmsg(void)
+{
+	struct device *pmsg_device;
+
+	pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops);
+	if (pmsg_major < 0) {
+		pr_err("register_chrdev failed\n");
+		goto err;
+	}
+
+	pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
+	if (IS_ERR(pmsg_class)) {
+		pr_err("device class file already in use\n");
+		goto err_class;
+	}
+	pmsg_class->devnode = pmsg_devnode;
+
+	pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0),
+					NULL, "%s%d", PMSG_NAME, 0);
+	if (IS_ERR(pmsg_device)) {
+		pr_err("failed to create device\n");
+		goto err_device;
+	}
+	return;
+
+err_device:
+	class_destroy(pmsg_class);
+err_class:
+	unregister_chrdev(pmsg_major, PMSG_NAME);
+err:
+	return;
+}
diff --git a/kernel/fs/pstore/ram.c b/kernel/fs/pstore/ram.c
new file mode 100644
index 000000000..44a549bee
--- /dev/null
+++ b/kernel/fs/pstore/ram.c
@@ -0,0 +1,648 @@
+/*
+ * RAM Oops/Panic logger
+ *
+ * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com>
+ * Copyright (C) 2011 Kees Cook <keescook@chromium.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/pstore.h>
+#include <linux/time.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/compiler.h>
+#include <linux/pstore_ram.h>
+
+#define RAMOOPS_KERNMSG_HDR "===="
+#define MIN_MEM_SIZE 4096UL
+
+static ulong record_size = MIN_MEM_SIZE;
+module_param(record_size, ulong, 0400);
+MODULE_PARM_DESC(record_size,
+		"size of each dump done on oops/panic");
+
+static ulong ramoops_console_size = MIN_MEM_SIZE;
+module_param_named(console_size, ramoops_console_size, ulong, 0400);
+MODULE_PARM_DESC(console_size, "size of kernel console log");
+
+static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
+module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
+MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
+
+static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
+module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
+MODULE_PARM_DESC(pmsg_size, "size of user space message log");
+
+static ulong mem_address;
+module_param(mem_address, ulong, 0400);
+MODULE_PARM_DESC(mem_address,
+		"start of reserved RAM used to store oops/panic logs");
+
+static ulong mem_size;
+module_param(mem_size, ulong, 0400);
+MODULE_PARM_DESC(mem_size,
+		"size of reserved RAM used to store oops/panic logs");
+
+static unsigned int mem_type;
+module_param(mem_type, uint, 0600);
+MODULE_PARM_DESC(mem_type,
+		"set to 1 to try to use unbuffered memory (default 0)");
+
+static int dump_oops = 1;
+module_param(dump_oops, int, 0600);
+MODULE_PARM_DESC(dump_oops,
+		"set to 1 to dump oopses, 0 to only dump panics (default 1)");
+
+static int ramoops_ecc;
+module_param_named(ecc, ramoops_ecc, int, 0600);
+MODULE_PARM_DESC(ramoops_ecc,
+		"if non-zero, the option enables ECC support and specifies "
+		"ECC buffer size in bytes (1 is a special value, means 16 "
+		"bytes ECC)");
+
+struct ramoops_context {
+	struct persistent_ram_zone **przs;
+	struct persistent_ram_zone *cprz;
+	struct persistent_ram_zone *fprz;
+	struct persistent_ram_zone *mprz;
+	phys_addr_t phys_addr;
+	unsigned long size;
+	unsigned int memtype;
+	size_t record_size;
+	size_t console_size;
+	size_t ftrace_size;
+	size_t pmsg_size;
+	int dump_oops;
+	struct persistent_ram_ecc_info ecc_info;
+	unsigned int max_dump_cnt;
+	unsigned int dump_write_cnt;
+	/* _read_cnt need clear on ramoops_pstore_open */
+	unsigned int dump_read_cnt;
+	unsigned int console_read_cnt;
+	unsigned int ftrace_read_cnt;
+	unsigned int pmsg_read_cnt;
+	struct pstore_info pstore;
+};
+
+static struct platform_device *dummy;
+static struct ramoops_platform_data *dummy_data;
+
+static int ramoops_pstore_open(struct pstore_info *psi)
+{
+	struct ramoops_context *cxt = psi->data;
+
+	cxt->dump_read_cnt = 0;
+	cxt->console_read_cnt = 0;
+	cxt->ftrace_read_cnt = 0;
+	cxt->pmsg_read_cnt = 0;
+	return 0;
+}
+
+static struct persistent_ram_zone *
+ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
+		     u64 *id,
+		     enum pstore_type_id *typep, enum pstore_type_id type,
+		     bool update)
+{
+	struct persistent_ram_zone *prz;
+	int i = (*c)++;
+
+	if (i >= max)
+		return NULL;
+
+	prz = przs[i];
+	if (!prz)
+		return NULL;
+
+	/* Update old/shadowed buffer. */
+	if (update)
+		persistent_ram_save_old(prz);
+
+	if (!persistent_ram_old_size(prz))
+		return NULL;
+
+	*typep = type;
+	*id = i;
+
+	return prz;
+}
+
+static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
+				  bool *compressed)
+{
+	char data_type;
+	int header_length = 0;
+
+	if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n%n", &time->tv_sec,
+			&time->tv_nsec, &data_type, &header_length) == 3) {
+		if (data_type == 'C')
+			*compressed = true;
+		else
+			*compressed = false;
+	} else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n%n",
+			&time->tv_sec, &time->tv_nsec, &header_length) == 2) {
+			*compressed = false;
+	} else {
+		time->tv_sec = 0;
+		time->tv_nsec = 0;
+		*compressed = false;
+	}
+	return header_length;
+}
+
+static bool prz_ok(struct persistent_ram_zone *prz)
+{
+	return !!prz && !!(persistent_ram_old_size(prz) +
+			   persistent_ram_ecc_string(prz, NULL, 0));
+}
+
+static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
+				   int *count, struct timespec *time,
+				   char **buf, bool *compressed,
+				   struct pstore_info *psi)
+{
+	ssize_t size;
+	ssize_t ecc_notice_size;
+	struct ramoops_context *cxt = psi->data;
+	struct persistent_ram_zone *prz;
+	int header_length;
+
+	prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
+				   cxt->max_dump_cnt, id, type,
+				   PSTORE_TYPE_DMESG, 1);
+	if (!prz_ok(prz))
+		prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
+					   1, id, type, PSTORE_TYPE_CONSOLE, 0);
+	if (!prz_ok(prz))
+		prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
+					   1, id, type, PSTORE_TYPE_FTRACE, 0);
+	if (!prz_ok(prz))
+		prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
+					   1, id, type, PSTORE_TYPE_PMSG, 0);
+	if (!prz_ok(prz))
+		return 0;
+
+	if (!persistent_ram_old(prz))
+		return 0;
+
+	size = persistent_ram_old_size(prz);
+	header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), time,
+			compressed);
+	size -= header_length;
+
+	/* ECC correction notice */
+	ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
+
+	*buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL);
+	if (*buf == NULL)
+		return -ENOMEM;
+
+	memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
+	persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1);
+
+	return size + ecc_notice_size;
+}
+
+static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
+				     bool compressed)
+{
+	char *hdr;
+	struct timespec timestamp;
+	size_t len;
+
+	/* Report zeroed timestamp if called before timekeeping has resumed. */
+	if (__getnstimeofday(&timestamp)) {
+		timestamp.tv_sec = 0;
+		timestamp.tv_nsec = 0;
+	}
+	hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n",
+		(long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000),
+		compressed ? 'C' : 'D');
+	WARN_ON_ONCE(!hdr);
+	len = hdr ? strlen(hdr) : 0;
+	persistent_ram_write(prz, hdr, len);
+	kfree(hdr);
+
+	return len;
+}
+
+static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
+					    enum kmsg_dump_reason reason,
+					    u64 *id, unsigned int part,
+					    const char *buf,
+					    bool compressed, size_t size,
+					    struct pstore_info *psi)
+{
+	struct ramoops_context *cxt = psi->data;
+	struct persistent_ram_zone *prz;
+	size_t hlen;
+
+	if (type == PSTORE_TYPE_CONSOLE) {
+		if (!cxt->cprz)
+			return -ENOMEM;
+		persistent_ram_write(cxt->cprz, buf, size);
+		return 0;
+	} else if (type == PSTORE_TYPE_FTRACE) {
+		if (!cxt->fprz)
+			return -ENOMEM;
+		persistent_ram_write(cxt->fprz, buf, size);
+		return 0;
+	} else if (type == PSTORE_TYPE_PMSG) {
+		if (!cxt->mprz)
+			return -ENOMEM;
+		persistent_ram_write(cxt->mprz, buf, size);
+		return 0;
+	}
+
+	if (type != PSTORE_TYPE_DMESG)
+		return -EINVAL;
+
+	/* Out of the various dmesg dump types, ramoops is currently designed
+	 * to only store crash logs, rather than storing general kernel logs.
+	 */
+	if (reason != KMSG_DUMP_OOPS &&
+	    reason != KMSG_DUMP_PANIC)
+		return -EINVAL;
+
+	/* Skip Oopes when configured to do so. */
+	if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops)
+		return -EINVAL;
+
+	/* Explicitly only take the first part of any new crash.
+	 * If our buffer is larger than kmsg_bytes, this can never happen,
+	 * and if our buffer is smaller than kmsg_bytes, we don't want the
+	 * report split across multiple records.
+	 */
+	if (part != 1)
+		return -ENOSPC;
+
+	if (!cxt->przs)
+		return -ENOSPC;
+
+	prz = cxt->przs[cxt->dump_write_cnt];
+
+	hlen = ramoops_write_kmsg_hdr(prz, compressed);
+	if (size + hlen > prz->buffer_size)
+		size = prz->buffer_size - hlen;
+	persistent_ram_write(prz, buf, size);
+
+	cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt;
+
+	return 0;
+}
+
+static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
+				struct timespec time, struct pstore_info *psi)
+{
+	struct ramoops_context *cxt = psi->data;
+	struct persistent_ram_zone *prz;
+
+	switch (type) {
+	case PSTORE_TYPE_DMESG:
+		if (id >= cxt->max_dump_cnt)
+			return -EINVAL;
+		prz = cxt->przs[id];
+		break;
+	case PSTORE_TYPE_CONSOLE:
+		prz = cxt->cprz;
+		break;
+	case PSTORE_TYPE_FTRACE:
+		prz = cxt->fprz;
+		break;
+	case PSTORE_TYPE_PMSG:
+		prz = cxt->mprz;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	persistent_ram_free_old(prz);
+	persistent_ram_zap(prz);
+
+	return 0;
+}
+
+static struct ramoops_context oops_cxt = {
+	.pstore = {
+		.owner	= THIS_MODULE,
+		.name	= "ramoops",
+		.open	= ramoops_pstore_open,
+		.read	= ramoops_pstore_read,
+		.write_buf	= ramoops_pstore_write_buf,
+		.erase	= ramoops_pstore_erase,
+	},
+};
+
+static void ramoops_free_przs(struct ramoops_context *cxt)
+{
+	int i;
+
+	cxt->max_dump_cnt = 0;
+	if (!cxt->przs)
+		return;
+
+	for (i = 0; !IS_ERR_OR_NULL(cxt->przs[i]); i++)
+		persistent_ram_free(cxt->przs[i]);
+	kfree(cxt->przs);
+}
+
+static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
+			     phys_addr_t *paddr, size_t dump_mem_sz)
+{
+	int err = -ENOMEM;
+	int i;
+
+	if (!cxt->record_size)
+		return 0;
+
+	if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
+		dev_err(dev, "no room for dumps\n");
+		return -ENOMEM;
+	}
+
+	cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
+	if (!cxt->max_dump_cnt)
+		return -ENOMEM;
+
+	cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt,
+			     GFP_KERNEL);
+	if (!cxt->przs) {
+		dev_err(dev, "failed to initialize a prz array for dumps\n");
+		goto fail_prz;
+	}
+
+	for (i = 0; i < cxt->max_dump_cnt; i++) {
+		size_t sz = cxt->record_size;
+
+		cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
+						  &cxt->ecc_info,
+						  cxt->memtype);
+		if (IS_ERR(cxt->przs[i])) {
+			err = PTR_ERR(cxt->przs[i]);
+			dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
+				sz, (unsigned long long)*paddr, err);
+			goto fail_prz;
+		}
+		*paddr += sz;
+	}
+
+	return 0;
+fail_prz:
+	ramoops_free_przs(cxt);
+	return err;
+}
+
+static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
+			    struct persistent_ram_zone **prz,
+			    phys_addr_t *paddr, size_t sz, u32 sig)
+{
+	if (!sz)
+		return 0;
+
+	if (*paddr + sz - cxt->phys_addr > cxt->size) {
+		dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
+			sz, (unsigned long long)*paddr,
+			cxt->size, (unsigned long long)cxt->phys_addr);
+		return -ENOMEM;
+	}
+
+	*prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype);
+	if (IS_ERR(*prz)) {
+		int err = PTR_ERR(*prz);
+
+		dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
+			sz, (unsigned long long)*paddr, err);
+		return err;
+	}
+
+	persistent_ram_zap(*prz);
+
+	*paddr += sz;
+
+	return 0;
+}
+
+static int ramoops_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct ramoops_platform_data *pdata = pdev->dev.platform_data;
+	struct ramoops_context *cxt = &oops_cxt;
+	size_t dump_mem_sz;
+	phys_addr_t paddr;
+	int err = -EINVAL;
+
+	/* Only a single ramoops area allowed at a time, so fail extra
+	 * probes.
+	 */
+	if (cxt->max_dump_cnt)
+		goto fail_out;
+
+	if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
+			!pdata->ftrace_size && !pdata->pmsg_size)) {
+		pr_err("The memory size and the record/console size must be "
+			"non-zero\n");
+		goto fail_out;
+	}
+
+	if (pdata->record_size && !is_power_of_2(pdata->record_size))
+		pdata->record_size = rounddown_pow_of_two(pdata->record_size);
+	if (pdata->console_size && !is_power_of_2(pdata->console_size))
+		pdata->console_size = rounddown_pow_of_two(pdata->console_size);
+	if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
+		pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
+	if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
+		pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size);
+
+	cxt->size = pdata->mem_size;
+	cxt->phys_addr = pdata->mem_address;
+	cxt->memtype = pdata->mem_type;
+	cxt->record_size = pdata->record_size;
+	cxt->console_size = pdata->console_size;
+	cxt->ftrace_size = pdata->ftrace_size;
+	cxt->pmsg_size = pdata->pmsg_size;
+	cxt->dump_oops = pdata->dump_oops;
+	cxt->ecc_info = pdata->ecc_info;
+
+	paddr = cxt->phys_addr;
+
+	dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
+			- cxt->pmsg_size;
+	err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
+	if (err)
+		goto fail_out;
+
+	err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr,
+			       cxt->console_size, 0);
+	if (err)
+		goto fail_init_cprz;
+
+	err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size,
+			       LINUX_VERSION_CODE);
+	if (err)
+		goto fail_init_fprz;
+
+	err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
+	if (err)
+		goto fail_init_mprz;
+
+	cxt->pstore.data = cxt;
+	/*
+	 * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we
+	 * have to handle dumps, we must have at least record_size buffer. And
+	 * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be
+	 * ZERO_SIZE_PTR).
+	 */
+	if (cxt->console_size)
+		cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */
+	cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize);
+	cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL);
+	spin_lock_init(&cxt->pstore.buf_lock);
+	if (!cxt->pstore.buf) {
+		pr_err("cannot allocate pstore buffer\n");
+		err = -ENOMEM;
+		goto fail_clear;
+	}
+
+	err = pstore_register(&cxt->pstore);
+	if (err) {
+		pr_err("registering with pstore failed\n");
+		goto fail_buf;
+	}
+
+	/*
+	 * Update the module parameter variables as well so they are visible
+	 * through /sys/module/ramoops/parameters/
+	 */
+	mem_size = pdata->mem_size;
+	mem_address = pdata->mem_address;
+	record_size = pdata->record_size;
+	dump_oops = pdata->dump_oops;
+	ramoops_console_size = pdata->console_size;
+	ramoops_pmsg_size = pdata->pmsg_size;
+	ramoops_ftrace_size = pdata->ftrace_size;
+
+	pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
+		cxt->size, (unsigned long long)cxt->phys_addr,
+		cxt->ecc_info.ecc_size, cxt->ecc_info.block_size);
+
+	return 0;
+
+fail_buf:
+	kfree(cxt->pstore.buf);
+fail_clear:
+	cxt->pstore.bufsize = 0;
+	kfree(cxt->mprz);
+fail_init_mprz:
+	kfree(cxt->fprz);
+fail_init_fprz:
+	kfree(cxt->cprz);
+fail_init_cprz:
+	ramoops_free_przs(cxt);
+fail_out:
+	return err;
+}
+
+static int __exit ramoops_remove(struct platform_device *pdev)
+{
+#if 0
+	/* TODO(kees): We cannot unload ramoops since pstore doesn't support
+	 * unregistering yet.
+	 */
+	struct ramoops_context *cxt = &oops_cxt;
+
+	iounmap(cxt->virt_addr);
+	release_mem_region(cxt->phys_addr, cxt->size);
+	cxt->max_dump_cnt = 0;
+
+	/* TODO(kees): When pstore supports unregistering, call it here. */
+	kfree(cxt->pstore.buf);
+	cxt->pstore.bufsize = 0;
+
+	return 0;
+#endif
+	return -EBUSY;
+}
+
+static struct platform_driver ramoops_driver = {
+	.probe		= ramoops_probe,
+	.remove		= __exit_p(ramoops_remove),
+	.driver		= {
+		.name	= "ramoops",
+	},
+};
+
+static void ramoops_register_dummy(void)
+{
+	if (!mem_size)
+		return;
+
+	pr_info("using module parameters\n");
+
+	dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL);
+	if (!dummy_data) {
+		pr_info("could not allocate pdata\n");
+		return;
+	}
+
+	dummy_data->mem_size = mem_size;
+	dummy_data->mem_address = mem_address;
+	dummy_data->mem_type = 0;
+	dummy_data->record_size = record_size;
+	dummy_data->console_size = ramoops_console_size;
+	dummy_data->ftrace_size = ramoops_ftrace_size;
+	dummy_data->pmsg_size = ramoops_pmsg_size;
+	dummy_data->dump_oops = dump_oops;
+	/*
+	 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
+	 * (using 1 byte for ECC isn't much of use anyway).
+	 */
+	dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
+
+	dummy = platform_device_register_data(NULL, "ramoops", -1,
+			dummy_data, sizeof(struct ramoops_platform_data));
+	if (IS_ERR(dummy)) {
+		pr_info("could not create platform device: %ld\n",
+			PTR_ERR(dummy));
+	}
+}
+
+static int __init ramoops_init(void)
+{
+	ramoops_register_dummy();
+	return platform_driver_register(&ramoops_driver);
+}
+postcore_initcall(ramoops_init);
+
+static void __exit ramoops_exit(void)
+{
+	platform_driver_unregister(&ramoops_driver);
+	platform_device_unregister(dummy);
+	kfree(dummy_data);
+}
+module_exit(ramoops_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marco Stornelli <marco.stornelli@gmail.com>");
+MODULE_DESCRIPTION("RAM Oops/Panic logger/driver");
diff --git a/kernel/fs/pstore/ram_core.c b/kernel/fs/pstore/ram_core.c
new file mode 100644
index 000000000..76c3f80ef
--- /dev/null
+++ b/kernel/fs/pstore/ram_core.c
@@ -0,0 +1,539 @@
+/*
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) "persistent_ram: " fmt
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/rslib.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pstore_ram.h>
+#include <asm/page.h>
+
+struct persistent_ram_buffer {
+	uint32_t    sig;
+	atomic_t    start;
+	atomic_t    size;
+	uint8_t     data[0];
+};
+
+#define PERSISTENT_RAM_SIG (0x43474244) /* DBGC */
+
+static inline size_t buffer_size(struct persistent_ram_zone *prz)
+{
+	return atomic_read(&prz->buffer->size);
+}
+
+static inline size_t buffer_start(struct persistent_ram_zone *prz)
+{
+	return atomic_read(&prz->buffer->start);
+}
+
+/* increase and wrap the start pointer, returning the old value */
+static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
+{
+	int old;
+	int new;
+
+	do {
+		old = atomic_read(&prz->buffer->start);
+		new = old + a;
+		while (unlikely(new >= prz->buffer_size))
+			new -= prz->buffer_size;
+	} while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
+
+	return old;
+}
+
+/* increase the size counter until it hits the max size */
+static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
+{
+	size_t old;
+	size_t new;
+
+	if (atomic_read(&prz->buffer->size) == prz->buffer_size)
+		return;
+
+	do {
+		old = atomic_read(&prz->buffer->size);
+		new = old + a;
+		if (new > prz->buffer_size)
+			new = prz->buffer_size;
+	} while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
+}
+
+static DEFINE_RAW_SPINLOCK(buffer_lock);
+
+/* increase and wrap the start pointer, returning the old value */
+static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
+{
+	int old;
+	int new;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&buffer_lock, flags);
+
+	old = atomic_read(&prz->buffer->start);
+	new = old + a;
+	while (unlikely(new >= prz->buffer_size))
+		new -= prz->buffer_size;
+	atomic_set(&prz->buffer->start, new);
+
+	raw_spin_unlock_irqrestore(&buffer_lock, flags);
+
+	return old;
+}
+
+/* increase the size counter until it hits the max size */
+static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
+{
+	size_t old;
+	size_t new;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&buffer_lock, flags);
+
+	old = atomic_read(&prz->buffer->size);
+	if (old == prz->buffer_size)
+		goto exit;
+
+	new = old + a;
+	if (new > prz->buffer_size)
+		new = prz->buffer_size;
+	atomic_set(&prz->buffer->size, new);
+
+exit:
+	raw_spin_unlock_irqrestore(&buffer_lock, flags);
+}
+
+static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
+static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
+
+static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
+	uint8_t *data, size_t len, uint8_t *ecc)
+{
+	int i;
+	uint16_t par[prz->ecc_info.ecc_size];
+
+	/* Initialize the parity buffer */
+	memset(par, 0, sizeof(par));
+	encode_rs8(prz->rs_decoder, data, len, par, 0);
+	for (i = 0; i < prz->ecc_info.ecc_size; i++)
+		ecc[i] = par[i];
+}
+
+static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz,
+	void *data, size_t len, uint8_t *ecc)
+{
+	int i;
+	uint16_t par[prz->ecc_info.ecc_size];
+
+	for (i = 0; i < prz->ecc_info.ecc_size; i++)
+		par[i] = ecc[i];
+	return decode_rs8(prz->rs_decoder, data, par, len,
+				NULL, 0, NULL, 0, NULL);
+}
+
+static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz,
+	unsigned int start, unsigned int count)
+{
+	struct persistent_ram_buffer *buffer = prz->buffer;
+	uint8_t *buffer_end = buffer->data + prz->buffer_size;
+	uint8_t *block;
+	uint8_t *par;
+	int ecc_block_size = prz->ecc_info.block_size;
+	int ecc_size = prz->ecc_info.ecc_size;
+	int size = ecc_block_size;
+
+	if (!ecc_size)
+		return;
+
+	block = buffer->data + (start & ~(ecc_block_size - 1));
+	par = prz->par_buffer + (start / ecc_block_size) * ecc_size;
+
+	do {
+		if (block + ecc_block_size > buffer_end)
+			size = buffer_end - block;
+		persistent_ram_encode_rs8(prz, block, size, par);
+		block += ecc_block_size;
+		par += ecc_size;
+	} while (block < buffer->data + start + count);
+}
+
+static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz)
+{
+	struct persistent_ram_buffer *buffer = prz->buffer;
+
+	if (!prz->ecc_info.ecc_size)
+		return;
+
+	persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer),
+				  prz->par_header);
+}
+
+static void persistent_ram_ecc_old(struct persistent_ram_zone *prz)
+{
+	struct persistent_ram_buffer *buffer = prz->buffer;
+	uint8_t *block;
+	uint8_t *par;
+
+	if (!prz->ecc_info.ecc_size)
+		return;
+
+	block = buffer->data;
+	par = prz->par_buffer;
+	while (block < buffer->data + buffer_size(prz)) {
+		int numerr;
+		int size = prz->ecc_info.block_size;
+		if (block + size > buffer->data + prz->buffer_size)
+			size = buffer->data + prz->buffer_size - block;
+		numerr = persistent_ram_decode_rs8(prz, block, size, par);
+		if (numerr > 0) {
+			pr_devel("error in block %p, %d\n", block, numerr);
+			prz->corrected_bytes += numerr;
+		} else if (numerr < 0) {
+			pr_devel("uncorrectable error in block %p\n", block);
+			prz->bad_blocks++;
+		}
+		block += prz->ecc_info.block_size;
+		par += prz->ecc_info.ecc_size;
+	}
+}
+
+static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
+				   struct persistent_ram_ecc_info *ecc_info)
+{
+	int numerr;
+	struct persistent_ram_buffer *buffer = prz->buffer;
+	int ecc_blocks;
+	size_t ecc_total;
+
+	if (!ecc_info || !ecc_info->ecc_size)
+		return 0;
+
+	prz->ecc_info.block_size = ecc_info->block_size ?: 128;
+	prz->ecc_info.ecc_size = ecc_info->ecc_size ?: 16;
+	prz->ecc_info.symsize = ecc_info->symsize ?: 8;
+	prz->ecc_info.poly = ecc_info->poly ?: 0x11d;
+
+	ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_info.ecc_size,
+				  prz->ecc_info.block_size +
+				  prz->ecc_info.ecc_size);
+	ecc_total = (ecc_blocks + 1) * prz->ecc_info.ecc_size;
+	if (ecc_total >= prz->buffer_size) {
+		pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n",
+		       __func__, prz->ecc_info.ecc_size,
+		       ecc_total, prz->buffer_size);
+		return -EINVAL;
+	}
+
+	prz->buffer_size -= ecc_total;
+	prz->par_buffer = buffer->data + prz->buffer_size;
+	prz->par_header = prz->par_buffer +
+			  ecc_blocks * prz->ecc_info.ecc_size;
+
+	/*
+	 * first consecutive root is 0
+	 * primitive element to generate roots = 1
+	 */
+	prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly,
+				  0, 1, prz->ecc_info.ecc_size);
+	if (prz->rs_decoder == NULL) {
+		pr_info("init_rs failed\n");
+		return -EINVAL;
+	}
+
+	prz->corrected_bytes = 0;
+	prz->bad_blocks = 0;
+
+	numerr = persistent_ram_decode_rs8(prz, buffer, sizeof(*buffer),
+					   prz->par_header);
+	if (numerr > 0) {
+		pr_info("error in header, %d\n", numerr);
+		prz->corrected_bytes += numerr;
+	} else if (numerr < 0) {
+		pr_info("uncorrectable error in header\n");
+		prz->bad_blocks++;
+	}
+
+	return 0;
+}
+
+ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
+	char *str, size_t len)
+{
+	ssize_t ret;
+
+	if (!prz->ecc_info.ecc_size)
+		return 0;
+
+	if (prz->corrected_bytes || prz->bad_blocks)
+		ret = snprintf(str, len, ""
+			"\n%d Corrected bytes, %d unrecoverable blocks\n",
+			prz->corrected_bytes, prz->bad_blocks);
+	else
+		ret = snprintf(str, len, "\nNo errors detected\n");
+
+	return ret;
+}
+
+static void notrace persistent_ram_update(struct persistent_ram_zone *prz,
+	const void *s, unsigned int start, unsigned int count)
+{
+	struct persistent_ram_buffer *buffer = prz->buffer;
+	memcpy(buffer->data + start, s, count);
+	persistent_ram_update_ecc(prz, start, count);
+}
+
+void persistent_ram_save_old(struct persistent_ram_zone *prz)
+{
+	struct persistent_ram_buffer *buffer = prz->buffer;
+	size_t size = buffer_size(prz);
+	size_t start = buffer_start(prz);
+
+	if (!size)
+		return;
+
+	if (!prz->old_log) {
+		persistent_ram_ecc_old(prz);
+		prz->old_log = kmalloc(size, GFP_KERNEL);
+	}
+	if (!prz->old_log) {
+		pr_err("failed to allocate buffer\n");
+		return;
+	}
+
+	prz->old_log_size = size;
+	memcpy(prz->old_log, &buffer->data[start], size - start);
+	memcpy(prz->old_log + size - start, &buffer->data[0], start);
+}
+
+int notrace persistent_ram_write(struct persistent_ram_zone *prz,
+	const void *s, unsigned int count)
+{
+	int rem;
+	int c = count;
+	size_t start;
+
+	if (unlikely(c > prz->buffer_size)) {
+		s += c - prz->buffer_size;
+		c = prz->buffer_size;
+	}
+
+	buffer_size_add(prz, c);
+
+	start = buffer_start_add(prz, c);
+
+	rem = prz->buffer_size - start;
+	if (unlikely(rem < c)) {
+		persistent_ram_update(prz, s, start, rem);
+		s += rem;
+		c -= rem;
+		start = 0;
+	}
+	persistent_ram_update(prz, s, start, c);
+
+	persistent_ram_update_header_ecc(prz);
+
+	return count;
+}
+
+size_t persistent_ram_old_size(struct persistent_ram_zone *prz)
+{
+	return prz->old_log_size;
+}
+
+void *persistent_ram_old(struct persistent_ram_zone *prz)
+{
+	return prz->old_log;
+}
+
+void persistent_ram_free_old(struct persistent_ram_zone *prz)
+{
+	kfree(prz->old_log);
+	prz->old_log = NULL;
+	prz->old_log_size = 0;
+}
+
+void persistent_ram_zap(struct persistent_ram_zone *prz)
+{
+	atomic_set(&prz->buffer->start, 0);
+	atomic_set(&prz->buffer->size, 0);
+	persistent_ram_update_header_ecc(prz);
+}
+
+static void *persistent_ram_vmap(phys_addr_t start, size_t size,
+		unsigned int memtype)
+{
+	struct page **pages;
+	phys_addr_t page_start;
+	unsigned int page_count;
+	pgprot_t prot;
+	unsigned int i;
+	void *vaddr;
+
+	page_start = start - offset_in_page(start);
+	page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE);
+
+	if (memtype)
+		prot = pgprot_noncached(PAGE_KERNEL);
+	else
+		prot = pgprot_writecombine(PAGE_KERNEL);
+
+	pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		pr_err("%s: Failed to allocate array for %u pages\n",
+		       __func__, page_count);
+		return NULL;
+	}
+
+	for (i = 0; i < page_count; i++) {
+		phys_addr_t addr = page_start + i * PAGE_SIZE;
+		pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+	}
+	vaddr = vmap(pages, page_count, VM_MAP, prot);
+	kfree(pages);
+
+	return vaddr;
+}
+
+static void *persistent_ram_iomap(phys_addr_t start, size_t size,
+		unsigned int memtype)
+{
+	void *va;
+
+	if (!request_mem_region(start, size, "persistent_ram")) {
+		pr_err("request mem region (0x%llx@0x%llx) failed\n",
+			(unsigned long long)size, (unsigned long long)start);
+		return NULL;
+	}
+
+	buffer_start_add = buffer_start_add_locked;
+	buffer_size_add = buffer_size_add_locked;
+
+	if (memtype)
+		va = ioremap(start, size);
+	else
+		va = ioremap_wc(start, size);
+
+	return va;
+}
+
+static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
+		struct persistent_ram_zone *prz, int memtype)
+{
+	prz->paddr = start;
+	prz->size = size;
+
+	if (pfn_valid(start >> PAGE_SHIFT))
+		prz->vaddr = persistent_ram_vmap(start, size, memtype);
+	else
+		prz->vaddr = persistent_ram_iomap(start, size, memtype);
+
+	if (!prz->vaddr) {
+		pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__,
+			(unsigned long long)size, (unsigned long long)start);
+		return -ENOMEM;
+	}
+
+	prz->buffer = prz->vaddr + offset_in_page(start);
+	prz->buffer_size = size - sizeof(struct persistent_ram_buffer);
+
+	return 0;
+}
+
+static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
+				    struct persistent_ram_ecc_info *ecc_info)
+{
+	int ret;
+
+	ret = persistent_ram_init_ecc(prz, ecc_info);
+	if (ret)
+		return ret;
+
+	sig ^= PERSISTENT_RAM_SIG;
+
+	if (prz->buffer->sig == sig) {
+		if (buffer_size(prz) > prz->buffer_size ||
+		    buffer_start(prz) > buffer_size(prz))
+			pr_info("found existing invalid buffer, size %zu, start %zu\n",
+				buffer_size(prz), buffer_start(prz));
+		else {
+			pr_debug("found existing buffer, size %zu, start %zu\n",
+				 buffer_size(prz), buffer_start(prz));
+			persistent_ram_save_old(prz);
+			return 0;
+		}
+	} else {
+		pr_debug("no valid data in buffer (sig = 0x%08x)\n",
+			 prz->buffer->sig);
+	}
+
+	prz->buffer->sig = sig;
+	persistent_ram_zap(prz);
+
+	return 0;
+}
+
+void persistent_ram_free(struct persistent_ram_zone *prz)
+{
+	if (!prz)
+		return;
+
+	if (prz->vaddr) {
+		if (pfn_valid(prz->paddr >> PAGE_SHIFT)) {
+			vunmap(prz->vaddr);
+		} else {
+			iounmap(prz->vaddr);
+			release_mem_region(prz->paddr, prz->size);
+		}
+		prz->vaddr = NULL;
+	}
+	persistent_ram_free_old(prz);
+	kfree(prz);
+}
+
+struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
+			u32 sig, struct persistent_ram_ecc_info *ecc_info,
+			unsigned int memtype)
+{
+	struct persistent_ram_zone *prz;
+	int ret = -ENOMEM;
+
+	prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL);
+	if (!prz) {
+		pr_err("failed to allocate persistent ram zone\n");
+		goto err;
+	}
+
+	ret = persistent_ram_buffer_map(start, size, prz, memtype);
+	if (ret)
+		goto err;
+
+	ret = persistent_ram_post_init(prz, sig, ecc_info);
+	if (ret)
+		goto err;
+
+	return prz;
+err:
+	persistent_ram_free(prz);
+	return ERR_PTR(ret);
+}
diff --git a/kernel/fs/qnx4/Kconfig b/kernel/fs/qnx4/Kconfig
new file mode 100644
index 000000000..5f6089994
--- /dev/null
+++ b/kernel/fs/qnx4/Kconfig
@@ -0,0 +1,14 @@
+config QNX4FS_FS
+	tristate "QNX4 file system support (read only)"
+	depends on BLOCK
+	help
+	  This is the file system used by the real-time operating systems
+	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
+	  Further information is available at <http://www.qnx.com/>.
+	  Say Y if you intend to mount QNX hard disks or floppies.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called qnx4.
+
+	  If you don't know whether you need it, then you don't need it:
+	  answer N.
diff --git a/kernel/fs/qnx4/Makefile b/kernel/fs/qnx4/Makefile
new file mode 100644
index 000000000..4a283b3f8
--- /dev/null
+++ b/kernel/fs/qnx4/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux qnx4-filesystem routines.
+#
+
+obj-$(CONFIG_QNX4FS_FS) += qnx4.o
+
+qnx4-objs := inode.o dir.o namei.o bitmap.o
diff --git a/kernel/fs/qnx4/README b/kernel/fs/qnx4/README
new file mode 100644
index 000000000..1f1e320d9
--- /dev/null
+++ b/kernel/fs/qnx4/README
@@ -0,0 +1,9 @@
+
+  This is a snapshot of the QNX4 filesystem for Linux.
+  Please send diffs and remarks to <al@alarsen.net> .
+  
+Credits :
+
+Richard "Scuba" A. Frowijn     <scuba@wxs.nl>
+Frank "Jedi/Sector One" Denis  <j@pureftpd.org>
+Anders Larsen                  <al@alarsen.net> (Maintainer)
diff --git a/kernel/fs/qnx4/bitmap.c b/kernel/fs/qnx4/bitmap.c
new file mode 100644
index 000000000..76a7a697b
--- /dev/null
+++ b/kernel/fs/qnx4/bitmap.c
@@ -0,0 +1,44 @@
+/*
+ * QNX4 file system, Linux implementation.
+ *
+ * Version : 0.2.1
+ *
+ * Using parts of the xiafs filesystem.
+ *
+ * History :
+ *
+ * 28-05-1998 by Richard Frowijn : first release.
+ * 20-06-1998 by Frank Denis : basic optimisations.
+ * 25-06-1998 by Frank Denis : qnx4_is_free, qnx4_set_bitmap, qnx4_bmap .
+ * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include "qnx4.h"
+
+unsigned long qnx4_count_free_blocks(struct super_block *sb)
+{
+	int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
+	int total = 0;
+	int total_free = 0;
+	int offset = 0;
+	int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
+	struct buffer_head *bh;
+
+	while (total < size) {
+		int bytes = min(size - total, QNX4_BLOCK_SIZE);
+
+		if ((bh = sb_bread(sb, start + offset)) == NULL) {
+			printk(KERN_ERR "qnx4: I/O error in counting free blocks\n");
+			break;
+		}
+		total_free += bytes * BITS_PER_BYTE -
+				memweight(bh->b_data, bytes);
+		brelse(bh);
+		total += bytes;
+		offset++;
+	}
+
+	return total_free;
+}
diff --git a/kernel/fs/qnx4/dir.c b/kernel/fs/qnx4/dir.c
new file mode 100644
index 000000000..b218f9658
--- /dev/null
+++ b/kernel/fs/qnx4/dir.c
@@ -0,0 +1,81 @@
+/*
+ * QNX4 file system, Linux implementation.
+ *
+ * Version : 0.2.1
+ *
+ * Using parts of the xiafs filesystem.
+ *
+ * History :
+ *
+ * 28-05-1998 by Richard Frowijn : first release.
+ * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
+ */
+
+#include <linux/buffer_head.h>
+#include "qnx4.h"
+
+static int qnx4_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	unsigned int offset;
+	struct buffer_head *bh;
+	struct qnx4_inode_entry *de;
+	struct qnx4_link_info *le;
+	unsigned long blknum;
+	int ix, ino;
+	int size;
+
+	QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
+	QNX4DEBUG((KERN_INFO "pos                 = %ld\n", (long) ctx->pos));
+
+	while (ctx->pos < inode->i_size) {
+		blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS);
+		bh = sb_bread(inode->i_sb, blknum);
+		if (bh == NULL) {
+			printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum);
+			return 0;
+		}
+		ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
+		for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
+			offset = ix * QNX4_DIR_ENTRY_SIZE;
+			de = (struct qnx4_inode_entry *) (bh->b_data + offset);
+			if (!de->di_fname[0])
+				continue;
+			if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+				continue;
+			if (!(de->di_status & QNX4_FILE_LINK))
+				size = QNX4_SHORT_NAME_MAX;
+			else
+				size = QNX4_NAME_MAX;
+			size = strnlen(de->di_fname, size);
+			QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
+			if (!(de->di_status & QNX4_FILE_LINK))
+				ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
+			else {
+				le  = (struct qnx4_link_info*)de;
+				ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
+					QNX4_INODES_PER_BLOCK +
+					le->dl_inode_ndx;
+			}
+			if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
+				brelse(bh);
+				return 0;
+			}
+		}
+		brelse(bh);
+	}
+	return 0;
+}
+
+const struct file_operations qnx4_dir_operations =
+{
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= qnx4_readdir,
+	.fsync		= generic_file_fsync,
+};
+
+const struct inode_operations qnx4_dir_inode_operations =
+{
+	.lookup		= qnx4_lookup,
+};
diff --git a/kernel/fs/qnx4/inode.c b/kernel/fs/qnx4/inode.c
new file mode 100644
index 000000000..c4bcb7788
--- /dev/null
+++ b/kernel/fs/qnx4/inode.c
@@ -0,0 +1,426 @@
+/*
+ * QNX4 file system, Linux implementation.
+ *
+ * Version : 0.2.1
+ *
+ * Using parts of the xiafs filesystem.
+ *
+ * History :
+ *
+ * 01-06-1998 by Richard Frowijn : first release.
+ * 20-06-1998 by Frank Denis : Linux 2.1.99+ support, boot signature, misc.
+ * 30-06-1998 by Frank Denis : first step to write inodes.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include "qnx4.h"
+
+#define QNX4_VERSION  4
+#define QNX4_BMNAME   ".bitmap"
+
+static const struct super_operations qnx4_sops;
+
+static struct inode *qnx4_alloc_inode(struct super_block *sb);
+static void qnx4_destroy_inode(struct inode *inode);
+static int qnx4_remount(struct super_block *sb, int *flags, char *data);
+static int qnx4_statfs(struct dentry *, struct kstatfs *);
+
+static const struct super_operations qnx4_sops =
+{
+	.alloc_inode	= qnx4_alloc_inode,
+	.destroy_inode	= qnx4_destroy_inode,
+	.statfs		= qnx4_statfs,
+	.remount_fs	= qnx4_remount,
+};
+
+static int qnx4_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct qnx4_sb_info *qs;
+
+	sync_filesystem(sb);
+	qs = qnx4_sb(sb);
+	qs->Version = QNX4_VERSION;
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create )
+{
+	unsigned long phys;
+
+	QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
+
+	phys = qnx4_block_map( inode, iblock );
+	if ( phys ) {
+		// logical block is before EOF
+		map_bh(bh, inode->i_sb, phys);
+	}
+	return 0;
+}
+
+static inline u32 try_extent(qnx4_xtnt_t *extent, u32 *offset)
+{
+	u32 size = le32_to_cpu(extent->xtnt_size);
+	if (*offset < size)
+		return le32_to_cpu(extent->xtnt_blk) + *offset - 1;
+	*offset -= size;
+	return 0;
+}
+
+unsigned long qnx4_block_map( struct inode *inode, long iblock )
+{
+	int ix;
+	long i_xblk;
+	struct buffer_head *bh = NULL;
+	struct qnx4_xblk *xblk = NULL;
+	struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode);
+	u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts);
+	u32 offset = iblock;
+	u32 block = try_extent(&qnx4_inode->di_first_xtnt, &offset);
+
+	if (block) {
+		// iblock is in the first extent. This is easy.
+	} else {
+		// iblock is beyond first extent. We have to follow the extent chain.
+		i_xblk = le32_to_cpu(qnx4_inode->di_xblk);
+		ix = 0;
+		while ( --nxtnt > 0 ) {
+			if ( ix == 0 ) {
+				// read next xtnt block.
+				bh = sb_bread(inode->i_sb, i_xblk - 1);
+				if ( !bh ) {
+					QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
+					return -EIO;
+				}
+				xblk = (struct qnx4_xblk*)bh->b_data;
+				if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) {
+					QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
+					return -EIO;
+				}
+			}
+			block = try_extent(&xblk->xblk_xtnts[ix], &offset);
+			if (block) {
+				// got it!
+				break;
+			}
+			if ( ++ix >= xblk->xblk_num_xtnts ) {
+				i_xblk = le32_to_cpu(xblk->xblk_next_xblk);
+				ix = 0;
+				brelse( bh );
+				bh = NULL;
+			}
+		}
+		if ( bh )
+			brelse( bh );
+	}
+
+	QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
+	return block;
+}
+
+static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type    = sb->s_magic;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
+	buf->f_bfree   = qnx4_count_free_blocks(sb);
+	buf->f_bavail  = buf->f_bfree;
+	buf->f_namelen = QNX4_NAME_MAX;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+/*
+ * Check the root directory of the filesystem to make sure
+ * it really _is_ a qnx4 filesystem, and to check the size
+ * of the directory entry.
+ */
+static const char *qnx4_checkroot(struct super_block *sb,
+				  struct qnx4_super_block *s)
+{
+	struct buffer_head *bh;
+	struct qnx4_inode_entry *rootdir;
+	int rd, rl;
+	int i, j;
+
+	if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0')
+		return "no qnx4 filesystem (no root dir).";
+	QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
+	rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1;
+	rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size);
+	for (j = 0; j < rl; j++) {
+		bh = sb_bread(sb, rd + j);	/* root dir, first block */
+		if (bh == NULL)
+			return "unable to read root entry.";
+		rootdir = (struct qnx4_inode_entry *) bh->b_data;
+		for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) {
+			QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
+			if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0)
+				continue;
+			qnx4_sb(sb)->BitMap = kmemdup(rootdir,
+						      sizeof(struct qnx4_inode_entry),
+						      GFP_KERNEL);
+			brelse(bh);
+			if (!qnx4_sb(sb)->BitMap)
+				return "not enough memory for bitmap inode";
+			/* keep bitmap inode known */
+			return NULL;
+		}
+		brelse(bh);
+	}
+	return "bitmap file not found.";
+}
+
+static int qnx4_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct buffer_head *bh;
+	struct inode *root;
+	const char *errmsg;
+	struct qnx4_sb_info *qs;
+
+	qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
+	if (!qs)
+		return -ENOMEM;
+	s->s_fs_info = qs;
+
+	sb_set_blocksize(s, QNX4_BLOCK_SIZE);
+
+	s->s_op = &qnx4_sops;
+	s->s_magic = QNX4_SUPER_MAGIC;
+	s->s_flags |= MS_RDONLY;	/* Yup, read-only yet */
+
+	/* Check the superblock signature. Since the qnx4 code is
+	   dangerous, we should leave as quickly as possible
+	   if we don't belong here... */
+	bh = sb_bread(s, 1);
+	if (!bh) {
+		printk(KERN_ERR "qnx4: unable to read the superblock\n");
+		return -EINVAL;
+	}
+
+ 	/* check before allocating dentries, inodes, .. */
+	errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data);
+	brelse(bh);
+	if (errmsg != NULL) {
+ 		if (!silent)
+			printk(KERN_ERR "qnx4: %s\n", errmsg);
+		return -EINVAL;
+	}
+
+ 	/* does root not have inode number QNX4_ROOT_INO ?? */
+	root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
+	if (IS_ERR(root)) {
+		printk(KERN_ERR "qnx4: get inode failed\n");
+		return PTR_ERR(root);
+ 	}
+
+ 	s->s_root = d_make_root(root);
+ 	if (s->s_root == NULL)
+ 		return -ENOMEM;
+
+	return 0;
+}
+
+static void qnx4_kill_sb(struct super_block *sb)
+{
+	struct qnx4_sb_info *qs = qnx4_sb(sb);
+	kill_block_super(sb);
+	if (qs) {
+		kfree(qs->BitMap);
+		kfree(qs);
+	}
+}
+
+static int qnx4_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page,qnx4_get_block);
+}
+
+static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,qnx4_get_block);
+}
+static const struct address_space_operations qnx4_aops = {
+	.readpage	= qnx4_readpage,
+	.bmap		= qnx4_bmap
+};
+
+struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
+{
+	struct buffer_head *bh;
+	struct qnx4_inode_entry *raw_inode;
+	int block;
+	struct qnx4_inode_entry *qnx4_inode;
+	struct inode *inode;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	qnx4_inode = qnx4_raw_inode(inode);
+	inode->i_mode = 0;
+
+	QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino));
+	if (!ino) {
+		printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is "
+				"out of range\n",
+		       sb->s_id, ino);
+		iget_failed(inode);
+		return ERR_PTR(-EIO);
+	}
+	block = ino / QNX4_INODES_PER_BLOCK;
+
+	if (!(bh = sb_bread(sb, block))) {
+		printk(KERN_ERR "qnx4: major problem: unable to read inode from dev "
+		       "%s\n", sb->s_id);
+		iget_failed(inode);
+		return ERR_PTR(-EIO);
+	}
+	raw_inode = ((struct qnx4_inode_entry *) bh->b_data) +
+	    (ino % QNX4_INODES_PER_BLOCK);
+
+	inode->i_mode    = le16_to_cpu(raw_inode->di_mode);
+	i_uid_write(inode, (uid_t)le16_to_cpu(raw_inode->di_uid));
+	i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid));
+	set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
+	inode->i_size    = le32_to_cpu(raw_inode->di_size);
+	inode->i_mtime.tv_sec   = le32_to_cpu(raw_inode->di_mtime);
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_atime.tv_sec   = le32_to_cpu(raw_inode->di_atime);
+	inode->i_atime.tv_nsec = 0;
+	inode->i_ctime.tv_sec   = le32_to_cpu(raw_inode->di_ctime);
+	inode->i_ctime.tv_nsec = 0;
+	inode->i_blocks  = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
+
+	memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_fop = &generic_ro_fops;
+		inode->i_mapping->a_ops = &qnx4_aops;
+		qnx4_i(inode)->mmu_private = inode->i_size;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &qnx4_dir_inode_operations;
+		inode->i_fop = &qnx4_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_mapping->a_ops = &qnx4_aops;
+		qnx4_i(inode)->mmu_private = inode->i_size;
+	} else {
+		printk(KERN_ERR "qnx4: bad inode %lu on dev %s\n",
+			ino, sb->s_id);
+		iget_failed(inode);
+		brelse(bh);
+		return ERR_PTR(-EIO);
+	}
+	brelse(bh);
+	unlock_new_inode(inode);
+	return inode;
+}
+
+static struct kmem_cache *qnx4_inode_cachep;
+
+static struct inode *qnx4_alloc_inode(struct super_block *sb)
+{
+	struct qnx4_inode_info *ei;
+	ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void qnx4_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
+}
+
+static void qnx4_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, qnx4_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int init_inodecache(void)
+{
+	qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
+					     sizeof(struct qnx4_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (qnx4_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(qnx4_inode_cachep);
+}
+
+static struct dentry *qnx4_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
+}
+
+static struct file_system_type qnx4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "qnx4",
+	.mount		= qnx4_mount,
+	.kill_sb	= qnx4_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("qnx4");
+
+static int __init init_qnx4_fs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		return err;
+
+	err = register_filesystem(&qnx4_fs_type);
+	if (err) {
+		destroy_inodecache();
+		return err;
+	}
+
+	printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n");
+	return 0;
+}
+
+static void __exit exit_qnx4_fs(void)
+{
+	unregister_filesystem(&qnx4_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_qnx4_fs)
+module_exit(exit_qnx4_fs)
+MODULE_LICENSE("GPL");
+
diff --git a/kernel/fs/qnx4/namei.c b/kernel/fs/qnx4/namei.c
new file mode 100644
index 000000000..e62c81837
--- /dev/null
+++ b/kernel/fs/qnx4/namei.c
@@ -0,0 +1,125 @@
+/* 
+ * QNX4 file system, Linux implementation.
+ * 
+ * Version : 0.2.1
+ * 
+ * Using parts of the xiafs filesystem.
+ * 
+ * History :
+ * 
+ * 01-06-1998 by Richard Frowijn : first release.
+ * 21-06-1998 by Frank Denis : dcache support, fixed error codes.
+ * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
+ */
+
+#include <linux/buffer_head.h>
+#include "qnx4.h"
+
+
+/*
+ * check if the filename is correct. For some obscure reason, qnx writes a
+ * new file twice in the directory entry, first with all possible options at 0
+ * and for a second time the way it is, they want us not to access the qnx
+ * filesystem when whe are using linux.
+ */
+static int qnx4_match(int len, const char *name,
+		      struct buffer_head *bh, unsigned long *offset)
+{
+	struct qnx4_inode_entry *de;
+	int namelen, thislen;
+
+	if (bh == NULL) {
+		printk(KERN_WARNING "qnx4: matching unassigned buffer !\n");
+		return 0;
+	}
+	de = (struct qnx4_inode_entry *) (bh->b_data + *offset);
+	*offset += QNX4_DIR_ENTRY_SIZE;
+	if ((de->di_status & QNX4_FILE_LINK) != 0) {
+		namelen = QNX4_NAME_MAX;
+	} else {
+		namelen = QNX4_SHORT_NAME_MAX;
+	}
+	thislen = strlen( de->di_fname );
+	if ( thislen > namelen )
+		thislen = namelen;
+	if (len != thislen) {
+		return 0;
+	}
+	if (strncmp(name, de->di_fname, len) == 0) {
+		if ((de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)) != 0) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
+	   const char *name, struct qnx4_inode_entry **res_dir, int *ino)
+{
+	unsigned long block, offset, blkofs;
+	struct buffer_head *bh;
+
+	*res_dir = NULL;
+	bh = NULL;
+	block = offset = blkofs = 0;
+	while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) {
+		if (!bh) {
+			block = qnx4_block_map(dir, blkofs);
+			if (block)
+				bh = sb_bread(dir->i_sb, block);
+			if (!bh) {
+				blkofs++;
+				continue;
+			}
+		}
+		*res_dir = (struct qnx4_inode_entry *) (bh->b_data + offset);
+		if (qnx4_match(len, name, bh, &offset)) {
+			*ino = block * QNX4_INODES_PER_BLOCK +
+			    (offset / QNX4_DIR_ENTRY_SIZE) - 1;
+			return bh;
+		}
+		if (offset < bh->b_size) {
+			continue;
+		}
+		brelse(bh);
+		bh = NULL;
+		offset = 0;
+		blkofs++;
+	}
+	brelse(bh);
+	*res_dir = NULL;
+	return NULL;
+}
+
+struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	int ino;
+	struct qnx4_inode_entry *de;
+	struct qnx4_link_info *lnk;
+	struct buffer_head *bh;
+	const char *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+	struct inode *foundinode = NULL;
+
+	if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
+		goto out;
+	/* The entry is linked, let's get the real info */
+	if ((de->di_status & QNX4_FILE_LINK) == QNX4_FILE_LINK) {
+		lnk = (struct qnx4_link_info *) de;
+		ino = (le32_to_cpu(lnk->dl_inode_blk) - 1) *
+                    QNX4_INODES_PER_BLOCK +
+		    lnk->dl_inode_ndx;
+	}
+	brelse(bh);
+
+	foundinode = qnx4_iget(dir->i_sb, ino);
+	if (IS_ERR(foundinode)) {
+		QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
+			   PTR_ERR(foundinode)));
+		return ERR_CAST(foundinode);
+	}
+out:
+	d_add(dentry, foundinode);
+
+	return NULL;
+}
diff --git a/kernel/fs/qnx4/qnx4.h b/kernel/fs/qnx4/qnx4.h
new file mode 100644
index 000000000..c9b1be2c1
--- /dev/null
+++ b/kernel/fs/qnx4/qnx4.h
@@ -0,0 +1,45 @@
+#include <linux/fs.h>
+#include <linux/qnx4_fs.h>
+
+#define QNX4_DEBUG 0
+
+#if QNX4_DEBUG
+#define QNX4DEBUG(X) printk X
+#else
+#define QNX4DEBUG(X) (void) 0
+#endif
+
+struct qnx4_sb_info {
+	unsigned int		Version;	/* may be useful */
+	struct qnx4_inode_entry	*BitMap;	/* useful */
+};
+
+struct qnx4_inode_info {
+	struct qnx4_inode_entry raw;
+	loff_t mmu_private;
+	struct inode vfs_inode;
+};
+
+extern struct inode *qnx4_iget(struct super_block *, unsigned long);
+extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags);
+extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
+extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
+
+extern const struct inode_operations qnx4_dir_inode_operations;
+extern const struct file_operations qnx4_dir_operations;
+extern int qnx4_is_free(struct super_block *sb, long block);
+
+static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
+{
+	return container_of(inode, struct qnx4_inode_info, vfs_inode);
+}
+
+static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
+{
+	return &qnx4_i(inode)->raw;
+}
diff --git a/kernel/fs/qnx6/Kconfig b/kernel/fs/qnx6/Kconfig
new file mode 100644
index 000000000..edbba5c17
--- /dev/null
+++ b/kernel/fs/qnx6/Kconfig
@@ -0,0 +1,26 @@
+config QNX6FS_FS
+	tristate "QNX6 file system support (read only)"
+	depends on BLOCK && CRC32
+	help
+	  This is the file system used by the real-time operating systems
+	  QNX 6 (also called QNX RTP).
+	  Further information is available at <http://www.qnx.com/>.
+	  Say Y if you intend to mount QNX hard disks or floppies formatted
+          with a mkqnx6fs.
+	  However, keep in mind that this currently is a readonly driver!
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called qnx6.
+
+	  If you don't know whether you need it, then you don't need it:
+	  answer N.
+
+config QNX6FS_DEBUG
+	bool "QNX6 debugging information"
+	depends on QNX6FS_FS
+	help
+	  Turns on extended debugging output.
+
+	  If you are not a developer working on the QNX6FS, you probably don't
+	  want this:
+	  answer N.
diff --git a/kernel/fs/qnx6/Makefile b/kernel/fs/qnx6/Makefile
new file mode 100644
index 000000000..5e6bae6fa
--- /dev/null
+++ b/kernel/fs/qnx6/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the linux qnx4-filesystem routines.
+#
+
+obj-$(CONFIG_QNX6FS_FS) += qnx6.o
+
+qnx6-objs := inode.o dir.o namei.o super_mmi.o
+ccflags-$(CONFIG_QNX6FS_DEBUG)	+= -DDEBUG
diff --git a/kernel/fs/qnx6/README b/kernel/fs/qnx6/README
new file mode 100644
index 000000000..116d62202
--- /dev/null
+++ b/kernel/fs/qnx6/README
@@ -0,0 +1,8 @@
+
+  This is a snapshot of the QNX6 filesystem for Linux.
+  Please send diffs and remarks to <chaosman@ontika.net> .
+
+Credits :
+
+Al Viro		<viro@ZenIV.linux.org.uk> (endless patience with me & support ;))
+Kai Bankett	<chaosman@ontika.net> (Maintainer)
diff --git a/kernel/fs/qnx6/dir.c b/kernel/fs/qnx6/dir.c
new file mode 100644
index 000000000..8d64bb536
--- /dev/null
+++ b/kernel/fs/qnx6/dir.c
@@ -0,0 +1,286 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ * 16-02-2012 pagemap extension by Al Viro
+ *
+ */
+
+#include "qnx6.h"
+
+static unsigned qnx6_lfile_checksum(char *name, unsigned size)
+{
+	unsigned crc = 0;
+	char *end = name + size;
+	while (name < end) {
+		crc = ((crc >> 1) + *(name++)) ^
+			((crc & 0x00000001) ? 0x80000000 : 0);
+	}
+	return crc;
+}
+
+static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+	if (!IS_ERR(page))
+		kmap(page);
+	return page;
+}
+
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
+
+static unsigned last_entry(struct inode *inode, unsigned long page_nr)
+{
+	unsigned long last_byte = inode->i_size;
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte / QNX6_DIR_ENTRY_SIZE;
+}
+
+static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
+					 struct qnx6_long_dir_entry *de,
+					 struct page **p)
+{
+	struct qnx6_sb_info *sbi = QNX6_SB(sb);
+	u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */
+	u32 n = s >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* in pages */
+	/* within page */
+	u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_CACHE_MASK;
+	struct address_space *mapping = sbi->longfile->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+	if (IS_ERR(page))
+		return ERR_CAST(page);
+	kmap(*p = page);
+	return (struct qnx6_long_filename *)(page_address(page) + offs);
+}
+
+static int qnx6_dir_longfilename(struct inode *inode,
+			struct qnx6_long_dir_entry *de,
+			struct dir_context *ctx,
+			unsigned de_inode)
+{
+	struct qnx6_long_filename *lf;
+	struct super_block *s = inode->i_sb;
+	struct qnx6_sb_info *sbi = QNX6_SB(s);
+	struct page *page;
+	int lf_size;
+
+	if (de->de_size != 0xff) {
+		/* error - long filename entries always have size 0xff
+		   in direntry */
+		pr_err("invalid direntry size (%i).\n", de->de_size);
+		return 0;
+	}
+	lf = qnx6_longname(s, de, &page);
+	if (IS_ERR(lf)) {
+		pr_err("Error reading longname\n");
+		return 0;
+	}
+
+	lf_size = fs16_to_cpu(sbi, lf->lf_size);
+
+	if (lf_size > QNX6_LONG_NAME_MAX) {
+		pr_debug("file %s\n", lf->lf_fname);
+		pr_err("Filename too long (%i)\n", lf_size);
+		qnx6_put_page(page);
+		return 0;
+	}
+
+	/* calc & validate longfilename checksum
+	   mmi 3g filesystem does not have that checksum */
+	if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) !=
+			qnx6_lfile_checksum(lf->lf_fname, lf_size))
+		pr_info("long filename checksum error.\n");
+
+	pr_debug("qnx6_readdir:%.*s inode:%u\n",
+		 lf_size, lf->lf_fname, de_inode);
+	if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
+		qnx6_put_page(page);
+		return 0;
+	}
+
+	qnx6_put_page(page);
+	/* success */
+	return 1;
+}
+
+static int qnx6_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *s = inode->i_sb;
+	struct qnx6_sb_info *sbi = QNX6_SB(s);
+	loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
+	unsigned long npages = dir_pages(inode);
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
+	bool done = false;
+
+	ctx->pos = pos;
+	if (ctx->pos >= inode->i_size)
+		return 0;
+
+	for ( ; !done && n < npages; n++, start = 0) {
+		struct page *page = qnx6_get_page(inode, n);
+		int limit = last_entry(inode, n);
+		struct qnx6_dir_entry *de;
+		int i = start;
+
+		if (IS_ERR(page)) {
+			pr_err("%s(): read failed\n", __func__);
+			ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
+			return PTR_ERR(page);
+		}
+		de = ((struct qnx6_dir_entry *)page_address(page)) + start;
+		for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
+			int size = de->de_size;
+			u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
+
+			if (!no_inode || !size)
+				continue;
+
+			if (size > QNX6_SHORT_NAME_MAX) {
+				/* long filename detected
+				   get the filename from long filename
+				   structure / block */
+				if (!qnx6_dir_longfilename(inode,
+					(struct qnx6_long_dir_entry *)de,
+					ctx, no_inode)) {
+					done = true;
+					break;
+				}
+			} else {
+				pr_debug("%s():%.*s inode:%u\n",
+					 __func__, size, de->de_fname,
+					 no_inode);
+				if (!dir_emit(ctx, de->de_fname, size,
+				      no_inode, DT_UNKNOWN)) {
+					done = true;
+					break;
+				}
+			}
+		}
+		qnx6_put_page(page);
+	}
+	return 0;
+}
+
+/*
+ * check if the long filename is correct.
+ */
+static unsigned qnx6_long_match(int len, const char *name,
+			struct qnx6_long_dir_entry *de, struct inode *dir)
+{
+	struct super_block *s = dir->i_sb;
+	struct qnx6_sb_info *sbi = QNX6_SB(s);
+	struct page *page;
+	int thislen;
+	struct qnx6_long_filename *lf = qnx6_longname(s, de, &page);
+
+	if (IS_ERR(lf))
+		return 0;
+
+	thislen = fs16_to_cpu(sbi, lf->lf_size);
+	if (len != thislen) {
+		qnx6_put_page(page);
+		return 0;
+	}
+	if (memcmp(name, lf->lf_fname, len) == 0) {
+		qnx6_put_page(page);
+		return fs32_to_cpu(sbi, de->de_inode);
+	}
+	qnx6_put_page(page);
+	return 0;
+}
+
+/*
+ * check if the filename is correct.
+ */
+static unsigned qnx6_match(struct super_block *s, int len, const char *name,
+			struct qnx6_dir_entry *de)
+{
+	struct qnx6_sb_info *sbi = QNX6_SB(s);
+	if (memcmp(name, de->de_fname, len) == 0)
+		return fs32_to_cpu(sbi, de->de_inode);
+	return 0;
+}
+
+
+unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
+			 struct page **res_page)
+{
+	struct super_block *s = dir->i_sb;
+	struct qnx6_inode_info *ei = QNX6_I(dir);
+	struct page *page = NULL;
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	unsigned ino;
+	struct qnx6_dir_entry *de;
+	struct qnx6_long_dir_entry *lde;
+
+	*res_page = NULL;
+
+	if (npages == 0)
+		return 0;
+	start = ei->i_dir_start_lookup;
+	if (start >= npages)
+		start = 0;
+	n = start;
+
+	do {
+		page = qnx6_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			int limit = last_entry(dir, n);
+			int i;
+
+			de = (struct qnx6_dir_entry *)page_address(page);
+			for (i = 0; i < limit; i++, de++) {
+				if (len <= QNX6_SHORT_NAME_MAX) {
+					/* short filename */
+					if (len != de->de_size)
+						continue;
+					ino = qnx6_match(s, len, name, de);
+					if (ino)
+						goto found;
+				} else if (de->de_size == 0xff) {
+					/* deal with long filename */
+					lde = (struct qnx6_long_dir_entry *)de;
+					ino = qnx6_long_match(len,
+								name, lde, dir);
+					if (ino)
+						goto found;
+				} else
+					pr_err("undefined filename size in inode.\n");
+			}
+			qnx6_put_page(page);
+		}
+
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+	return 0;
+
+found:
+	*res_page = page;
+	ei->i_dir_start_lookup = n;
+	return ino;
+}
+
+const struct file_operations qnx6_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= qnx6_readdir,
+	.fsync		= generic_file_fsync,
+};
+
+const struct inode_operations qnx6_dir_inode_operations = {
+	.lookup		= qnx6_lookup,
+};
diff --git a/kernel/fs/qnx6/inode.c b/kernel/fs/qnx6/inode.c
new file mode 100644
index 000000000..32d2e1a97
--- /dev/null
+++ b/kernel/fs/qnx6/inode.c
@@ -0,0 +1,685 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ * 16-02-2012 pagemap extension by Al Viro
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/crc32.h>
+#include <linux/mpage.h>
+#include "qnx6.h"
+
+static const struct super_operations qnx6_sops;
+
+static void qnx6_put_super(struct super_block *sb);
+static struct inode *qnx6_alloc_inode(struct super_block *sb);
+static void qnx6_destroy_inode(struct inode *inode);
+static int qnx6_remount(struct super_block *sb, int *flags, char *data);
+static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int qnx6_show_options(struct seq_file *seq, struct dentry *root);
+
+static const struct super_operations qnx6_sops = {
+	.alloc_inode	= qnx6_alloc_inode,
+	.destroy_inode	= qnx6_destroy_inode,
+	.put_super	= qnx6_put_super,
+	.statfs		= qnx6_statfs,
+	.remount_fs	= qnx6_remount,
+	.show_options	= qnx6_show_options,
+};
+
+static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct qnx6_sb_info *sbi = QNX6_SB(sb);
+
+	if (sbi->s_mount_opt & QNX6_MOUNT_MMI_FS)
+		seq_puts(seq, ",mmi_fs");
+	return 0;
+}
+
+static int qnx6_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+static unsigned qnx6_get_devblock(struct super_block *sb, __fs32 block)
+{
+	struct qnx6_sb_info *sbi = QNX6_SB(sb);
+	return fs32_to_cpu(sbi, block) + sbi->s_blks_off;
+}
+
+static unsigned qnx6_block_map(struct inode *inode, unsigned iblock);
+
+static int qnx6_get_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh, int create)
+{
+	unsigned phys;
+
+	pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n",
+		 inode->i_ino, (unsigned long)iblock);
+
+	phys = qnx6_block_map(inode, iblock);
+	if (phys) {
+		/* logical block is before EOF */
+		map_bh(bh, inode->i_sb, phys);
+	}
+	return 0;
+}
+
+static int qnx6_check_blockptr(__fs32 ptr)
+{
+	if (ptr == ~(__fs32)0) {
+		pr_err("hit unused blockpointer.\n");
+		return 0;
+	}
+	return 1;
+}
+
+static int qnx6_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, qnx6_get_block);
+}
+
+static int qnx6_readpages(struct file *file, struct address_space *mapping,
+		   struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block);
+}
+
+/*
+ * returns the block number for the no-th element in the tree
+ * inodebits requred as there are multiple inodes in one inode block
+ */
+static unsigned qnx6_block_map(struct inode *inode, unsigned no)
+{
+	struct super_block *s = inode->i_sb;
+	struct qnx6_sb_info *sbi = QNX6_SB(s);
+	struct qnx6_inode_info *ei = QNX6_I(inode);
+	unsigned block = 0;
+	struct buffer_head *bh;
+	__fs32 ptr;
+	int levelptr;
+	int ptrbits = sbi->s_ptrbits;
+	int bitdelta;
+	u32 mask = (1 << ptrbits) - 1;
+	int depth = ei->di_filelevels;
+	int i;
+
+	bitdelta = ptrbits * depth;
+	levelptr = no >> bitdelta;
+
+	if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) {
+		pr_err("Requested file block number (%u) too big.", no);
+		return 0;
+	}
+
+	block = qnx6_get_devblock(s, ei->di_block_ptr[levelptr]);
+
+	for (i = 0; i < depth; i++) {
+		bh = sb_bread(s, block);
+		if (!bh) {
+			pr_err("Error reading block (%u)\n", block);
+			return 0;
+		}
+		bitdelta -= ptrbits;
+		levelptr = (no >> bitdelta) & mask;
+		ptr = ((__fs32 *)bh->b_data)[levelptr];
+
+		if (!qnx6_check_blockptr(ptr))
+			return 0;
+
+		block = qnx6_get_devblock(s, ptr);
+		brelse(bh);
+	}
+	return block;
+}
+
+static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct qnx6_sb_info *sbi = QNX6_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type    = sb->s_magic;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = fs32_to_cpu(sbi, sbi->sb->sb_num_blocks);
+	buf->f_bfree   = fs32_to_cpu(sbi, sbi->sb->sb_free_blocks);
+	buf->f_files   = fs32_to_cpu(sbi, sbi->sb->sb_num_inodes);
+	buf->f_ffree   = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes);
+	buf->f_bavail  = buf->f_bfree;
+	buf->f_namelen = QNX6_LONG_NAME_MAX;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+/*
+ * Check the root directory of the filesystem to make sure
+ * it really _is_ a qnx6 filesystem, and to check the size
+ * of the directory entry.
+ */
+static const char *qnx6_checkroot(struct super_block *s)
+{
+	static char match_root[2][3] = {".\0\0", "..\0"};
+	int i, error = 0;
+	struct qnx6_dir_entry *dir_entry;
+	struct inode *root = d_inode(s->s_root);
+	struct address_space *mapping = root->i_mapping;
+	struct page *page = read_mapping_page(mapping, 0, NULL);
+	if (IS_ERR(page))
+		return "error reading root directory";
+	kmap(page);
+	dir_entry = page_address(page);
+	for (i = 0; i < 2; i++) {
+		/* maximum 3 bytes - due to match_root limitation */
+		if (strncmp(dir_entry[i].de_fname, match_root[i], 3))
+			error = 1;
+	}
+	qnx6_put_page(page);
+	if (error)
+		return "error reading root directory.";
+	return NULL;
+}
+
+#ifdef CONFIG_QNX6FS_DEBUG
+void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s)
+{
+	struct qnx6_sb_info *sbi = QNX6_SB(s);
+
+	pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic));
+	pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum));
+	pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial));
+	pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags));
+	pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize));
+	pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes));
+	pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes));
+	pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks));
+	pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks));
+	pr_debug("inode_levels: %02x\n", sb->Inode.levels);
+}
+#endif
+
+enum {
+	Opt_mmifs,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_mmifs, "mmi_fs"},
+	{Opt_err, NULL}
+};
+
+static int qnx6_parse_options(char *options, struct super_block *sb)
+{
+	char *p;
+	struct qnx6_sb_info *sbi = QNX6_SB(sb);
+	substring_t args[MAX_OPT_ARGS];
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_mmifs:
+			set_opt(sbi->s_mount_opt, MMI_FS);
+			break;
+		default:
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
+				int offset, int silent)
+{
+	struct qnx6_sb_info *sbi = QNX6_SB(s);
+	struct buffer_head *bh;
+	struct qnx6_super_block *sb;
+
+	/* Check the superblock signatures
+	   start with the first superblock */
+	bh = sb_bread(s, offset);
+	if (!bh) {
+		pr_err("unable to read the first superblock\n");
+		return NULL;
+	}
+	sb = (struct qnx6_super_block *)bh->b_data;
+	if (fs32_to_cpu(sbi, sb->sb_magic) != QNX6_SUPER_MAGIC) {
+		sbi->s_bytesex = BYTESEX_BE;
+		if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) {
+			/* we got a big endian fs */
+			pr_debug("fs got different endianness.\n");
+			return bh;
+		} else
+			sbi->s_bytesex = BYTESEX_LE;
+		if (!silent) {
+			if (offset == 0) {
+				pr_err("wrong signature (magic) in superblock #1.\n");
+			} else {
+				pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n",
+					offset * s->s_blocksize);
+			}
+		}
+		brelse(bh);
+		return NULL;
+	}
+	return bh;
+}
+
+static struct inode *qnx6_private_inode(struct super_block *s,
+					struct qnx6_root_node *p);
+
+static int qnx6_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct buffer_head *bh1 = NULL, *bh2 = NULL;
+	struct qnx6_super_block *sb1 = NULL, *sb2 = NULL;
+	struct qnx6_sb_info *sbi;
+	struct inode *root;
+	const char *errmsg;
+	struct qnx6_sb_info *qs;
+	int ret = -EINVAL;
+	u64 offset;
+	int bootblock_offset = QNX6_BOOTBLOCK_SIZE;
+
+	qs = kzalloc(sizeof(struct qnx6_sb_info), GFP_KERNEL);
+	if (!qs)
+		return -ENOMEM;
+	s->s_fs_info = qs;
+
+	/* Superblock always is 512 Byte long */
+	if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) {
+		pr_err("unable to set blocksize\n");
+		goto outnobh;
+	}
+
+	/* parse the mount-options */
+	if (!qnx6_parse_options((char *) data, s)) {
+		pr_err("invalid mount options.\n");
+		goto outnobh;
+	}
+	if (test_opt(s, MMI_FS)) {
+		sb1 = qnx6_mmi_fill_super(s, silent);
+		if (sb1)
+			goto mmi_success;
+		else
+			goto outnobh;
+	}
+	sbi = QNX6_SB(s);
+	sbi->s_bytesex = BYTESEX_LE;
+	/* Check the superblock signatures
+	   start with the first superblock */
+	bh1 = qnx6_check_first_superblock(s,
+		bootblock_offset / QNX6_SUPERBLOCK_SIZE, silent);
+	if (!bh1) {
+		/* try again without bootblock offset */
+		bh1 = qnx6_check_first_superblock(s, 0, silent);
+		if (!bh1) {
+			pr_err("unable to read the first superblock\n");
+			goto outnobh;
+		}
+		/* seems that no bootblock at partition start */
+		bootblock_offset = 0;
+	}
+	sb1 = (struct qnx6_super_block *)bh1->b_data;
+
+#ifdef CONFIG_QNX6FS_DEBUG
+	qnx6_superblock_debug(sb1, s);
+#endif
+
+	/* checksum check - start at byte 8 and end at byte 512 */
+	if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
+			crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
+		pr_err("superblock #1 checksum error\n");
+		goto out;
+	}
+
+	/* set new blocksize */
+	if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
+		pr_err("unable to set blocksize\n");
+		goto out;
+	}
+	/* blocksize invalidates bh - pull it back in */
+	brelse(bh1);
+	bh1 = sb_bread(s, bootblock_offset >> s->s_blocksize_bits);
+	if (!bh1)
+		goto outnobh;
+	sb1 = (struct qnx6_super_block *)bh1->b_data;
+
+	/* calculate second superblock blocknumber */
+	offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) +
+		(bootblock_offset >> s->s_blocksize_bits) +
+		(QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits);
+
+	/* set bootblock offset */
+	sbi->s_blks_off = (bootblock_offset >> s->s_blocksize_bits) +
+			  (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits);
+
+	/* next the second superblock */
+	bh2 = sb_bread(s, offset);
+	if (!bh2) {
+		pr_err("unable to read the second superblock\n");
+		goto out;
+	}
+	sb2 = (struct qnx6_super_block *)bh2->b_data;
+	if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
+		if (!silent)
+			pr_err("wrong signature (magic) in superblock #2.\n");
+		goto out;
+	}
+
+	/* checksum check - start at byte 8 and end at byte 512 */
+	if (fs32_to_cpu(sbi, sb2->sb_checksum) !=
+				crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
+		pr_err("superblock #2 checksum error\n");
+		goto out;
+	}
+
+	if (fs64_to_cpu(sbi, sb1->sb_serial) >=
+					fs64_to_cpu(sbi, sb2->sb_serial)) {
+		/* superblock #1 active */
+		sbi->sb_buf = bh1;
+		sbi->sb = (struct qnx6_super_block *)bh1->b_data;
+		brelse(bh2);
+		pr_info("superblock #1 active\n");
+	} else {
+		/* superblock #2 active */
+		sbi->sb_buf = bh2;
+		sbi->sb = (struct qnx6_super_block *)bh2->b_data;
+		brelse(bh1);
+		pr_info("superblock #2 active\n");
+	}
+mmi_success:
+	/* sanity check - limit maximum indirect pointer levels */
+	if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) {
+		pr_err("too many inode levels (max %i, sb %i)\n",
+		       QNX6_PTR_MAX_LEVELS, sb1->Inode.levels);
+		goto out;
+	}
+	if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) {
+		pr_err("too many longfilename levels (max %i, sb %i)\n",
+		       QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels);
+		goto out;
+	}
+	s->s_op = &qnx6_sops;
+	s->s_magic = QNX6_SUPER_MAGIC;
+	s->s_flags |= MS_RDONLY;        /* Yup, read-only yet */
+
+	/* ease the later tree level calculations */
+	sbi = QNX6_SB(s);
+	sbi->s_ptrbits = ilog2(s->s_blocksize / 4);
+	sbi->inodes = qnx6_private_inode(s, &sb1->Inode);
+	if (!sbi->inodes)
+		goto out;
+	sbi->longfile = qnx6_private_inode(s, &sb1->Longfile);
+	if (!sbi->longfile)
+		goto out1;
+
+	/* prefetch root inode */
+	root = qnx6_iget(s, QNX6_ROOT_INO);
+	if (IS_ERR(root)) {
+		pr_err("get inode failed\n");
+		ret = PTR_ERR(root);
+		goto out2;
+	}
+
+	ret = -ENOMEM;
+	s->s_root = d_make_root(root);
+	if (!s->s_root)
+		goto out2;
+
+	ret = -EINVAL;
+	errmsg = qnx6_checkroot(s);
+	if (errmsg != NULL) {
+		if (!silent)
+			pr_err("%s\n", errmsg);
+		goto out3;
+	}
+	return 0;
+
+out3:
+	dput(s->s_root);
+	s->s_root = NULL;
+out2:
+	iput(sbi->longfile);
+out1:
+	iput(sbi->inodes);
+out:
+	if (bh1)
+		brelse(bh1);
+	if (bh2)
+		brelse(bh2);
+outnobh:
+	kfree(qs);
+	s->s_fs_info = NULL;
+	return ret;
+}
+
+static void qnx6_put_super(struct super_block *sb)
+{
+	struct qnx6_sb_info *qs = QNX6_SB(sb);
+	brelse(qs->sb_buf);
+	iput(qs->longfile);
+	iput(qs->inodes);
+	kfree(qs);
+	sb->s_fs_info = NULL;
+	return;
+}
+
+static sector_t qnx6_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, qnx6_get_block);
+}
+static const struct address_space_operations qnx6_aops = {
+	.readpage	= qnx6_readpage,
+	.readpages	= qnx6_readpages,
+	.bmap		= qnx6_bmap
+};
+
+static struct inode *qnx6_private_inode(struct super_block *s,
+					struct qnx6_root_node *p)
+{
+	struct inode *inode = new_inode(s);
+	if (inode) {
+		struct qnx6_inode_info *ei = QNX6_I(inode);
+		struct qnx6_sb_info *sbi = QNX6_SB(s);
+		inode->i_size = fs64_to_cpu(sbi, p->size);
+		memcpy(ei->di_block_ptr, p->ptr, sizeof(p->ptr));
+		ei->di_filelevels = p->levels;
+		inode->i_mode = S_IFREG | S_IRUSR; /* probably wrong */
+		inode->i_mapping->a_ops = &qnx6_aops;
+	}
+	return inode;
+}
+
+struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
+{
+	struct qnx6_sb_info *sbi = QNX6_SB(sb);
+	struct qnx6_inode_entry *raw_inode;
+	struct inode *inode;
+	struct qnx6_inode_info	*ei;
+	struct address_space *mapping;
+	struct page *page;
+	u32 n, offs;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	ei = QNX6_I(inode);
+
+	inode->i_mode = 0;
+
+	if (ino == 0) {
+		pr_err("bad inode number on dev %s: %u is out of range\n",
+		       sb->s_id, ino);
+		iget_failed(inode);
+		return ERR_PTR(-EIO);
+	}
+	n = (ino - 1) >> (PAGE_CACHE_SHIFT - QNX6_INODE_SIZE_BITS);
+	offs = (ino - 1) & (~PAGE_CACHE_MASK >> QNX6_INODE_SIZE_BITS);
+	mapping = sbi->inodes->i_mapping;
+	page = read_mapping_page(mapping, n, NULL);
+	if (IS_ERR(page)) {
+		pr_err("major problem: unable to read inode from dev %s\n",
+		       sb->s_id);
+		iget_failed(inode);
+		return ERR_CAST(page);
+	}
+	kmap(page);
+	raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs;
+
+	inode->i_mode    = fs16_to_cpu(sbi, raw_inode->di_mode);
+	i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid));
+	i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid));
+	inode->i_size    = fs64_to_cpu(sbi, raw_inode->di_size);
+	inode->i_mtime.tv_sec   = fs32_to_cpu(sbi, raw_inode->di_mtime);
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_atime.tv_sec   = fs32_to_cpu(sbi, raw_inode->di_atime);
+	inode->i_atime.tv_nsec = 0;
+	inode->i_ctime.tv_sec   = fs32_to_cpu(sbi, raw_inode->di_ctime);
+	inode->i_ctime.tv_nsec = 0;
+
+	/* calc blocks based on 512 byte blocksize */
+	inode->i_blocks = (inode->i_size + 511) >> 9;
+
+	memcpy(&ei->di_block_ptr, &raw_inode->di_block_ptr,
+				sizeof(raw_inode->di_block_ptr));
+	ei->di_filelevels = raw_inode->di_filelevels;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_fop = &generic_ro_fops;
+		inode->i_mapping->a_ops = &qnx6_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &qnx6_dir_inode_operations;
+		inode->i_fop = &qnx6_dir_operations;
+		inode->i_mapping->a_ops = &qnx6_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_mapping->a_ops = &qnx6_aops;
+	} else
+		init_special_inode(inode, inode->i_mode, 0);
+	qnx6_put_page(page);
+	unlock_new_inode(inode);
+	return inode;
+}
+
+static struct kmem_cache *qnx6_inode_cachep;
+
+static struct inode *qnx6_alloc_inode(struct super_block *sb)
+{
+	struct qnx6_inode_info *ei;
+	ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void qnx6_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode));
+}
+
+static void qnx6_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, qnx6_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int init_inodecache(void)
+{
+	qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
+					     sizeof(struct qnx6_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (!qnx6_inode_cachep)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(qnx6_inode_cachep);
+}
+
+static struct dentry *qnx6_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, qnx6_fill_super);
+}
+
+static struct file_system_type qnx6_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "qnx6",
+	.mount		= qnx6_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("qnx6");
+
+static int __init init_qnx6_fs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		return err;
+
+	err = register_filesystem(&qnx6_fs_type);
+	if (err) {
+		destroy_inodecache();
+		return err;
+	}
+
+	pr_info("QNX6 filesystem 1.0.0 registered.\n");
+	return 0;
+}
+
+static void __exit exit_qnx6_fs(void)
+{
+	unregister_filesystem(&qnx6_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_qnx6_fs)
+module_exit(exit_qnx6_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/qnx6/namei.c b/kernel/fs/qnx6/namei.c
new file mode 100644
index 000000000..6c1a32313
--- /dev/null
+++ b/kernel/fs/qnx6/namei.c
@@ -0,0 +1,42 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ * 16-02-2012 pagemap extension by Al Viro
+ *
+ */
+
+#include "qnx6.h"
+
+struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
+				unsigned int flags)
+{
+	unsigned ino;
+	struct page *page;
+	struct inode *foundinode = NULL;
+	const char *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+
+	if (len > QNX6_LONG_NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = qnx6_find_entry(len, dir, name, &page);
+	if (ino) {
+		foundinode = qnx6_iget(dir->i_sb, ino);
+		qnx6_put_page(page);
+		if (IS_ERR(foundinode)) {
+			pr_debug("lookup->iget ->  error %ld\n",
+				 PTR_ERR(foundinode));
+			return ERR_CAST(foundinode);
+		}
+	} else {
+		pr_debug("%s(): not found %s\n", __func__, name);
+		return NULL;
+	}
+	d_add(dentry, foundinode);
+	return NULL;
+}
diff --git a/kernel/fs/qnx6/qnx6.h b/kernel/fs/qnx6/qnx6.h
new file mode 100644
index 000000000..d3fb2b698
--- /dev/null
+++ b/kernel/fs/qnx6/qnx6.h
@@ -0,0 +1,135 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ * 16-02-2012 page map extension by Al Viro
+ *
+ */
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+
+typedef __u16 __bitwise __fs16;
+typedef __u32 __bitwise __fs32;
+typedef __u64 __bitwise __fs64;
+
+#include <linux/qnx6_fs.h>
+
+struct qnx6_sb_info {
+	struct buffer_head	*sb_buf;	/* superblock buffer */
+	struct qnx6_super_block	*sb;		/* our superblock */
+	int			s_blks_off;	/* blkoffset fs-startpoint */
+	int			s_ptrbits;	/* indirect pointer bitfield */
+	unsigned long		s_mount_opt;	/* all mount options */
+	int			s_bytesex;	/* holds endianess info */
+	struct inode *		inodes;
+	struct inode *		longfile;
+};
+
+struct qnx6_inode_info {
+	__fs32			di_block_ptr[QNX6_NO_DIRECT_POINTERS];
+	__u8			di_filelevels;
+	__u32			i_dir_start_lookup;
+	struct inode		vfs_inode;
+};
+
+extern struct inode *qnx6_iget(struct super_block *sb, unsigned ino);
+extern struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
+					unsigned int flags);
+
+#ifdef CONFIG_QNX6FS_DEBUG
+extern void qnx6_superblock_debug(struct qnx6_super_block *,
+						struct super_block *);
+#endif
+
+extern const struct inode_operations qnx6_dir_inode_operations;
+extern const struct file_operations qnx6_dir_operations;
+
+static inline struct qnx6_sb_info *QNX6_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct qnx6_inode_info *QNX6_I(struct inode *inode)
+{
+	return container_of(inode, struct qnx6_inode_info, vfs_inode);
+}
+
+#define clear_opt(o, opt)		(o &= ~(QNX6_MOUNT_##opt))
+#define set_opt(o, opt)			(o |= (QNX6_MOUNT_##opt))
+#define test_opt(sb, opt)		(QNX6_SB(sb)->s_mount_opt & \
+					 QNX6_MOUNT_##opt)
+enum {
+	BYTESEX_LE,
+	BYTESEX_BE,
+};
+
+static inline __u64 fs64_to_cpu(struct qnx6_sb_info *sbi, __fs64 n)
+{
+	if (sbi->s_bytesex == BYTESEX_LE)
+		return le64_to_cpu((__force __le64)n);
+	else
+		return be64_to_cpu((__force __be64)n);
+}
+
+static inline __fs64 cpu_to_fs64(struct qnx6_sb_info *sbi, __u64 n)
+{
+	if (sbi->s_bytesex == BYTESEX_LE)
+		return (__force __fs64)cpu_to_le64(n);
+	else
+		return (__force __fs64)cpu_to_be64(n);
+}
+
+static inline __u32 fs32_to_cpu(struct qnx6_sb_info *sbi, __fs32 n)
+{
+	if (sbi->s_bytesex == BYTESEX_LE)
+		return le32_to_cpu((__force __le32)n);
+	else
+		return be32_to_cpu((__force __be32)n);
+}
+
+static inline __fs32 cpu_to_fs32(struct qnx6_sb_info *sbi, __u32 n)
+{
+	if (sbi->s_bytesex == BYTESEX_LE)
+		return (__force __fs32)cpu_to_le32(n);
+	else
+		return (__force __fs32)cpu_to_be32(n);
+}
+
+static inline __u16 fs16_to_cpu(struct qnx6_sb_info *sbi, __fs16 n)
+{
+	if (sbi->s_bytesex == BYTESEX_LE)
+		return le16_to_cpu((__force __le16)n);
+	else
+		return be16_to_cpu((__force __be16)n);
+}
+
+static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n)
+{
+	if (sbi->s_bytesex == BYTESEX_LE)
+		return (__force __fs16)cpu_to_le16(n);
+	else
+		return (__force __fs16)cpu_to_be16(n);
+}
+
+extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s,
+						    int silent);
+
+static inline void qnx6_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
+				struct page **res_page);
diff --git a/kernel/fs/qnx6/super_mmi.c b/kernel/fs/qnx6/super_mmi.c
new file mode 100644
index 000000000..62aaf3e31
--- /dev/null
+++ b/kernel/fs/qnx6/super_mmi.c
@@ -0,0 +1,148 @@
+/*
+ * QNX6 file system, Linux implementation.
+ *
+ * Version : 1.0.0
+ *
+ * History :
+ *
+ * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include "qnx6.h"
+
+static void qnx6_mmi_copy_sb(struct qnx6_super_block *qsb,
+		struct qnx6_mmi_super_block *sb)
+{
+	qsb->sb_magic = sb->sb_magic;
+	qsb->sb_checksum = sb->sb_checksum;
+	qsb->sb_serial = sb->sb_serial;
+	qsb->sb_blocksize = sb->sb_blocksize;
+	qsb->sb_num_inodes = sb->sb_num_inodes;
+	qsb->sb_free_inodes = sb->sb_free_inodes;
+	qsb->sb_num_blocks = sb->sb_num_blocks;
+	qsb->sb_free_blocks = sb->sb_free_blocks;
+
+	/* the rest of the superblock is the same */
+	memcpy(&qsb->Inode, &sb->Inode, sizeof(sb->Inode));
+	memcpy(&qsb->Bitmap, &sb->Bitmap, sizeof(sb->Bitmap));
+	memcpy(&qsb->Longfile, &sb->Longfile, sizeof(sb->Longfile));
+}
+
+struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
+{
+	struct buffer_head *bh1, *bh2 = NULL;
+	struct qnx6_mmi_super_block *sb1, *sb2;
+	struct qnx6_super_block *qsb = NULL;
+	struct qnx6_sb_info *sbi;
+	__u64 offset;
+
+	/* Check the superblock signatures
+	   start with the first superblock */
+	bh1 = sb_bread(s, 0);
+	if (!bh1) {
+		pr_err("Unable to read first mmi superblock\n");
+		return NULL;
+	}
+	sb1 = (struct qnx6_mmi_super_block *)bh1->b_data;
+	sbi = QNX6_SB(s);
+	if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) {
+		if (!silent) {
+			pr_err("wrong signature (magic) in superblock #1.\n");
+			goto out;
+		}
+	}
+
+	/* checksum check - start at byte 8 and end at byte 512 */
+	if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
+				crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
+		pr_err("superblock #1 checksum error\n");
+		goto out;
+	}
+
+	/* calculate second superblock blocknumber */
+	offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + QNX6_SUPERBLOCK_AREA /
+					fs32_to_cpu(sbi, sb1->sb_blocksize);
+
+	/* set new blocksize */
+	if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
+		pr_err("unable to set blocksize\n");
+		goto out;
+	}
+	/* blocksize invalidates bh - pull it back in */
+	brelse(bh1);
+	bh1 = sb_bread(s, 0);
+	if (!bh1)
+		goto out;
+	sb1 = (struct qnx6_mmi_super_block *)bh1->b_data;
+
+	/* read second superblock */
+	bh2 = sb_bread(s, offset);
+	if (!bh2) {
+		pr_err("unable to read the second superblock\n");
+		goto out;
+	}
+	sb2 = (struct qnx6_mmi_super_block *)bh2->b_data;
+	if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
+		if (!silent)
+			pr_err("wrong signature (magic) in superblock #2.\n");
+		goto out;
+	}
+
+	/* checksum check - start at byte 8 and end at byte 512 */
+	if (fs32_to_cpu(sbi, sb2->sb_checksum)
+			!= crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
+		pr_err("superblock #1 checksum error\n");
+		goto out;
+	}
+
+	qsb = kmalloc(sizeof(*qsb), GFP_KERNEL);
+	if (!qsb) {
+		pr_err("unable to allocate memory.\n");
+		goto out;
+	}
+
+	if (fs64_to_cpu(sbi, sb1->sb_serial) >
+					fs64_to_cpu(sbi, sb2->sb_serial)) {
+		/* superblock #1 active */
+		qnx6_mmi_copy_sb(qsb, sb1);
+#ifdef CONFIG_QNX6FS_DEBUG
+		qnx6_superblock_debug(qsb, s);
+#endif
+		memcpy(bh1->b_data, qsb, sizeof(struct qnx6_super_block));
+
+		sbi->sb_buf = bh1;
+		sbi->sb = (struct qnx6_super_block *)bh1->b_data;
+		brelse(bh2);
+		pr_info("superblock #1 active\n");
+	} else {
+		/* superblock #2 active */
+		qnx6_mmi_copy_sb(qsb, sb2);
+#ifdef CONFIG_QNX6FS_DEBUG
+		qnx6_superblock_debug(qsb, s);
+#endif
+		memcpy(bh2->b_data, qsb, sizeof(struct qnx6_super_block));
+
+		sbi->sb_buf = bh2;
+		sbi->sb = (struct qnx6_super_block *)bh2->b_data;
+		brelse(bh1);
+		pr_info("superblock #2 active\n");
+	}
+	kfree(qsb);
+
+	/* offset for mmi_fs is just SUPERBLOCK_AREA bytes */
+	sbi->s_blks_off = QNX6_SUPERBLOCK_AREA / s->s_blocksize;
+
+	/* success */
+	return sbi->sb;
+
+out:
+	if (bh1 != NULL)
+		brelse(bh1);
+	if (bh2 != NULL)
+		brelse(bh2);
+	return NULL;
+}
diff --git a/kernel/fs/quota/Kconfig b/kernel/fs/quota/Kconfig
new file mode 100644
index 000000000..4a09975aa
--- /dev/null
+++ b/kernel/fs/quota/Kconfig
@@ -0,0 +1,76 @@
+#
+#  Quota configuration
+#
+
+config QUOTA
+	bool "Quota support"
+	select QUOTACTL
+	select SRCU
+	help
+	  If you say Y here, you will be able to set per user limits for disk
+	  usage (also called disk quotas). Currently, it works for the
+	  ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems.
+	  Note that gfs2 and xfs use their own quota system.
+	  Ext3, ext4 and reiserfs also support journaled quotas for which
+	  you don't need to run quotacheck(8) after an unclean shutdown.
+	  For further details, read the Quota mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>, or the documentation provided
+	  with the quota tools. Probably the quota support is only useful for
+	  multi user systems. If unsure, say N.
+
+config QUOTA_NETLINK_INTERFACE
+	bool "Report quota messages through netlink interface"
+	depends on QUOTACTL && NET
+	help
+	  If you say Y here, quota warnings (about exceeding softlimit, reaching
+	  hardlimit, etc.) will be reported through netlink interface. If unsure,
+	  say Y.
+
+config PRINT_QUOTA_WARNING
+	bool "Print quota warnings to console (OBSOLETE)"
+	depends on QUOTA
+	default y
+	help
+	  If you say Y here, quota warnings (about exceeding softlimit, reaching
+	  hardlimit, etc.) will be printed to the process' controlling terminal.
+	  Note that this behavior is currently deprecated and may go away in
+	  future. Please use notification via netlink socket instead.
+
+config QUOTA_DEBUG
+	bool "Additional quota sanity checks"
+	depends on QUOTA
+	default n
+	help
+	  If you say Y here, quota subsystem will perform some additional
+	  sanity checks of quota internal structures. If unsure, say N.
+
+# Generic support for tree structured quota files. Selected when needed.
+config QUOTA_TREE
+	 tristate
+
+config QFMT_V1
+	tristate "Old quota format support"
+	depends on QUOTA
+	help
+	  This quota format was (is) used by kernels earlier than 2.4.22. If
+	  you have quota working and you don't want to convert to new quota
+	  format say Y here.
+
+config QFMT_V2
+	tristate "Quota format vfsv0 and vfsv1 support"
+	depends on QUOTA
+	select QUOTA_TREE
+	help
+	  This config option enables kernel support for vfsv0 and vfsv1 quota
+	  formats. Both these formats support 32-bit UIDs/GIDs and vfsv1 format
+	  also supports 64-bit inode and block quota limits. If you need this
+	  functionality say Y here.
+
+config QUOTACTL
+	bool
+	default n
+
+config QUOTACTL_COMPAT
+	bool
+	depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
+	default y
diff --git a/kernel/fs/quota/Makefile b/kernel/fs/quota/Makefile
new file mode 100644
index 000000000..c66c37cda
--- /dev/null
+++ b/kernel/fs/quota/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_QUOTA)		+= dquot.o
+obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
+obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
+obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
+obj-$(CONFIG_QUOTACTL_COMPAT)	+= compat.o
+obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
diff --git a/kernel/fs/quota/compat.c b/kernel/fs/quota/compat.c
new file mode 100644
index 000000000..fb1892fe3
--- /dev/null
+++ b/kernel/fs/quota/compat.c
@@ -0,0 +1,118 @@
+
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/quotaops.h>
+
+/*
+ * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
+ * and is necessary due to alignment problems.
+ */
+struct compat_if_dqblk {
+	compat_u64 dqb_bhardlimit;
+	compat_u64 dqb_bsoftlimit;
+	compat_u64 dqb_curspace;
+	compat_u64 dqb_ihardlimit;
+	compat_u64 dqb_isoftlimit;
+	compat_u64 dqb_curinodes;
+	compat_u64 dqb_btime;
+	compat_u64 dqb_itime;
+	compat_uint_t dqb_valid;
+};
+
+/* XFS structures */
+struct compat_fs_qfilestat {
+	compat_u64 dqb_bhardlimit;
+	compat_u64 qfs_nblks;
+	compat_uint_t qfs_nextents;
+};
+
+struct compat_fs_quota_stat {
+	__s8		qs_version;
+	__u16		qs_flags;
+	__s8		qs_pad;
+	struct compat_fs_qfilestat	qs_uquota;
+	struct compat_fs_qfilestat	qs_gquota;
+	compat_uint_t	qs_incoredqs;
+	compat_int_t	qs_btimelimit;
+	compat_int_t	qs_itimelimit;
+	compat_int_t	qs_rtbtimelimit;
+	__u16		qs_bwarnlimit;
+	__u16		qs_iwarnlimit;
+};
+
+asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
+						qid_t id, void __user *addr)
+{
+	unsigned int cmds;
+	struct if_dqblk __user *dqblk;
+	struct compat_if_dqblk __user *compat_dqblk;
+	struct fs_quota_stat __user *fsqstat;
+	struct compat_fs_quota_stat __user *compat_fsqstat;
+	compat_uint_t data;
+	u16 xdata;
+	long ret;
+
+	cmds = cmd >> SUBCMDSHIFT;
+
+	switch (cmds) {
+	case Q_GETQUOTA:
+		dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
+		compat_dqblk = addr;
+		ret = sys_quotactl(cmd, special, id, dqblk);
+		if (ret)
+			break;
+		if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
+			get_user(data, &dqblk->dqb_valid) ||
+			put_user(data, &compat_dqblk->dqb_valid))
+			ret = -EFAULT;
+		break;
+	case Q_SETQUOTA:
+		dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
+		compat_dqblk = addr;
+		ret = -EFAULT;
+		if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
+			get_user(data, &compat_dqblk->dqb_valid) ||
+			put_user(data, &dqblk->dqb_valid))
+			break;
+		ret = sys_quotactl(cmd, special, id, dqblk);
+		break;
+	case Q_XGETQSTAT:
+		fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
+		compat_fsqstat = addr;
+		ret = sys_quotactl(cmd, special, id, fsqstat);
+		if (ret)
+			break;
+		ret = -EFAULT;
+		/* Copying qs_version, qs_flags, qs_pad */
+		if (copy_in_user(compat_fsqstat, fsqstat,
+			offsetof(struct compat_fs_quota_stat, qs_uquota)))
+			break;
+		/* Copying qs_uquota */
+		if (copy_in_user(&compat_fsqstat->qs_uquota,
+			&fsqstat->qs_uquota,
+			sizeof(compat_fsqstat->qs_uquota)) ||
+			get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
+			put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
+			break;
+		/* Copying qs_gquota */
+		if (copy_in_user(&compat_fsqstat->qs_gquota,
+			&fsqstat->qs_gquota,
+			sizeof(compat_fsqstat->qs_gquota)) ||
+			get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
+			put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
+			break;
+		/* Copying the rest */
+		if (copy_in_user(&compat_fsqstat->qs_incoredqs,
+			&fsqstat->qs_incoredqs,
+			sizeof(struct compat_fs_quota_stat) -
+			offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
+			get_user(xdata, &fsqstat->qs_iwarnlimit) ||
+			put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
+			break;
+		ret = 0;
+		break;
+	default:
+		ret = sys_quotactl(cmd, special, id, addr);
+	}
+	return ret;
+}
diff --git a/kernel/fs/quota/dquot.c b/kernel/fs/quota/dquot.c
new file mode 100644
index 000000000..20d1f7456
--- /dev/null
+++ b/kernel/fs/quota/dquot.c
@@ -0,0 +1,2889 @@
+/*
+ * Implementation of the diskquota system for the LINUX operating system. QUOTA
+ * is implemented using the BSD system call interface as the means of
+ * communication with the user level. This file contains the generic routines
+ * called by the different filesystems on allocation of an inode or block.
+ * These routines take care of the administration needed to have a consistent
+ * diskquota tracking system. The ideas of both user and group quotas are based
+ * on the Melbourne quota system as used on BSD derived systems. The internal
+ * implementation is based on one of the several variants of the LINUX
+ * inode-subsystem with added complexity of the diskquota system.
+ * 
+ * Author:	Marco van Wieringen <mvw@planets.elm.net>
+ *
+ * Fixes:   Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
+ *
+ *		Revised list management to avoid races
+ *		-- Bill Hawes, <whawes@star.net>, 9/98
+ *
+ *		Fixed races in dquot_transfer(), dqget() and dquot_alloc_...().
+ *		As the consequence the locking was moved from dquot_decr_...(),
+ *		dquot_incr_...() to calling functions.
+ *		invalidate_dquots() now writes modified dquots.
+ *		Serialized quota_off() and quota_on() for mount point.
+ *		Fixed a few bugs in grow_dquots().
+ *		Fixed deadlock in write_dquot() - we no longer account quotas on
+ *		quota files
+ *		remove_dquot_ref() moved to inode.c - it now traverses through inodes
+ *		add_dquot_ref() restarts after blocking
+ *		Added check for bogus uid and fixed check for group in quotactl.
+ *		Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99
+ *
+ *		Used struct list_head instead of own list struct
+ *		Invalidation of referenced dquots is no longer possible
+ *		Improved free_dquots list management
+ *		Quota and i_blocks are now updated in one place to avoid races
+ *		Warnings are now delayed so we won't block in critical section
+ *		Write updated not to require dquot lock
+ *		Jan Kara, <jack@suse.cz>, 9/2000
+ *
+ *		Added dynamic quota structure allocation
+ *		Jan Kara <jack@suse.cz> 12/2000
+ *
+ *		Rewritten quota interface. Implemented new quota format and
+ *		formats registering.
+ *		Jan Kara, <jack@suse.cz>, 2001,2002
+ *
+ *		New SMP locking.
+ *		Jan Kara, <jack@suse.cz>, 10/2002
+ *
+ *		Added journalled quota support, fix lock inversion problems
+ *		Jan Kara, <jack@suse.cz>, 2003,2004
+ *
+ * (C) Copyright 1994 - 1997 Marco van Wieringen 
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/tty.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/sched.h>
+#include <linux/kmod.h>
+#include <linux/namei.h>
+#include <linux/capability.h>
+#include <linux/quotaops.h>
+#include "../internal.h" /* ugh */
+
+#include <linux/uaccess.h>
+
+/*
+ * There are three quota SMP locks. dq_list_lock protects all lists with quotas
+ * and quota formats.
+ * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
+ * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
+ * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
+ * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects
+ * modifications of quota state (on quotaon and quotaoff) and readers who care
+ * about latest values take it as well.
+ *
+ * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock,
+ *   dq_list_lock > dq_state_lock
+ *
+ * Note that some things (eg. sb pointer, type, id) doesn't change during
+ * the life of the dquot structure and so needn't to be protected by a lock
+ *
+ * Operation accessing dquots via inode pointers are protected by dquot_srcu.
+ * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and
+ * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
+ * inode and before dropping dquot references to avoid use of dquots after
+ * they are freed. dq_data_lock is used to serialize the pointer setting and
+ * clearing operations.
+ * Special care needs to be taken about S_NOQUOTA inode flag (marking that
+ * inode is a quota file). Functions adding pointers from inode to dquots have
+ * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
+ * have to do all pointer modifications before dropping dq_data_lock. This makes
+ * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
+ * then drops all pointers to dquots from an inode.
+ *
+ * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
+ * from inodes (dquot_alloc_space() and such don't check the dq_lock).
+ * Currently dquot is locked only when it is being read to memory (or space for
+ * it is being allocated) on the first dqget() and when it is being released on
+ * the last dqput(). The allocation and release oparations are serialized by
+ * the dq_lock and by checking the use count in dquot_release().  Write
+ * operations on dquots don't hold dq_lock as they copy data under dq_data_lock
+ * spinlock to internal buffers before writing.
+ *
+ * Lock ordering (including related VFS locks) is the following:
+ *   dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
+ * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc.
+ */
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
+EXPORT_SYMBOL(dq_data_lock);
+DEFINE_STATIC_SRCU(dquot_srcu);
+
+void __quota_error(struct super_block *sb, const char *func,
+		   const char *fmt, ...)
+{
+	if (printk_ratelimit()) {
+		va_list args;
+		struct va_format vaf;
+
+		va_start(args, fmt);
+
+		vaf.fmt = fmt;
+		vaf.va = &args;
+
+		printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
+		       sb->s_id, func, &vaf);
+
+		va_end(args);
+	}
+}
+EXPORT_SYMBOL(__quota_error);
+
+#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
+static char *quotatypes[] = INITQFNAMES;
+#endif
+static struct quota_format_type *quota_formats;	/* List of registered formats */
+static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
+
+/* SLAB cache for dquot structures */
+static struct kmem_cache *dquot_cachep;
+
+int register_quota_format(struct quota_format_type *fmt)
+{
+	spin_lock(&dq_list_lock);
+	fmt->qf_next = quota_formats;
+	quota_formats = fmt;
+	spin_unlock(&dq_list_lock);
+	return 0;
+}
+EXPORT_SYMBOL(register_quota_format);
+
+void unregister_quota_format(struct quota_format_type *fmt)
+{
+	struct quota_format_type **actqf;
+
+	spin_lock(&dq_list_lock);
+	for (actqf = &quota_formats; *actqf && *actqf != fmt;
+	     actqf = &(*actqf)->qf_next)
+		;
+	if (*actqf)
+		*actqf = (*actqf)->qf_next;
+	spin_unlock(&dq_list_lock);
+}
+EXPORT_SYMBOL(unregister_quota_format);
+
+static struct quota_format_type *find_quota_format(int id)
+{
+	struct quota_format_type *actqf;
+
+	spin_lock(&dq_list_lock);
+	for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
+	     actqf = actqf->qf_next)
+		;
+	if (!actqf || !try_module_get(actqf->qf_owner)) {
+		int qm;
+
+		spin_unlock(&dq_list_lock);
+		
+		for (qm = 0; module_names[qm].qm_fmt_id &&
+			     module_names[qm].qm_fmt_id != id; qm++)
+			;
+		if (!module_names[qm].qm_fmt_id ||
+		    request_module(module_names[qm].qm_mod_name))
+			return NULL;
+
+		spin_lock(&dq_list_lock);
+		for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
+		     actqf = actqf->qf_next)
+			;
+		if (actqf && !try_module_get(actqf->qf_owner))
+			actqf = NULL;
+	}
+	spin_unlock(&dq_list_lock);
+	return actqf;
+}
+
+static void put_quota_format(struct quota_format_type *fmt)
+{
+	module_put(fmt->qf_owner);
+}
+
+/*
+ * Dquot List Management:
+ * The quota code uses three lists for dquot management: the inuse_list,
+ * free_dquots, and dquot_hash[] array. A single dquot structure may be
+ * on all three lists, depending on its current state.
+ *
+ * All dquots are placed to the end of inuse_list when first created, and this
+ * list is used for invalidate operation, which must look at every dquot.
+ *
+ * Unused dquots (dq_count == 0) are added to the free_dquots list when freed,
+ * and this list is searched whenever we need an available dquot.  Dquots are
+ * removed from the list as soon as they are used again, and
+ * dqstats.free_dquots gives the number of dquots on the list. When
+ * dquot is invalidated it's completely released from memory.
+ *
+ * Dquots with a specific identity (device, type and id) are placed on
+ * one of the dquot_hash[] hash chains. The provides an efficient search
+ * mechanism to locate a specific dquot.
+ */
+
+static LIST_HEAD(inuse_list);
+static LIST_HEAD(free_dquots);
+static unsigned int dq_hash_bits, dq_hash_mask;
+static struct hlist_head *dquot_hash;
+
+struct dqstats dqstats;
+EXPORT_SYMBOL(dqstats);
+
+static qsize_t inode_get_rsv_space(struct inode *inode);
+static void __dquot_initialize(struct inode *inode, int type);
+
+static inline unsigned int
+hashfn(const struct super_block *sb, struct kqid qid)
+{
+	unsigned int id = from_kqid(&init_user_ns, qid);
+	int type = qid.type;
+	unsigned long tmp;
+
+	tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type);
+	return (tmp + (tmp >> dq_hash_bits)) & dq_hash_mask;
+}
+
+/*
+ * Following list functions expect dq_list_lock to be held
+ */
+static inline void insert_dquot_hash(struct dquot *dquot)
+{
+	struct hlist_head *head;
+	head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id);
+	hlist_add_head(&dquot->dq_hash, head);
+}
+
+static inline void remove_dquot_hash(struct dquot *dquot)
+{
+	hlist_del_init(&dquot->dq_hash);
+}
+
+static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
+				struct kqid qid)
+{
+	struct hlist_node *node;
+	struct dquot *dquot;
+
+	hlist_for_each (node, dquot_hash+hashent) {
+		dquot = hlist_entry(node, struct dquot, dq_hash);
+		if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid))
+			return dquot;
+	}
+	return NULL;
+}
+
+/* Add a dquot to the tail of the free list */
+static inline void put_dquot_last(struct dquot *dquot)
+{
+	list_add_tail(&dquot->dq_free, &free_dquots);
+	dqstats_inc(DQST_FREE_DQUOTS);
+}
+
+static inline void remove_free_dquot(struct dquot *dquot)
+{
+	if (list_empty(&dquot->dq_free))
+		return;
+	list_del_init(&dquot->dq_free);
+	dqstats_dec(DQST_FREE_DQUOTS);
+}
+
+static inline void put_inuse(struct dquot *dquot)
+{
+	/* We add to the back of inuse list so we don't have to restart
+	 * when traversing this list and we block */
+	list_add_tail(&dquot->dq_inuse, &inuse_list);
+	dqstats_inc(DQST_ALLOC_DQUOTS);
+}
+
+static inline void remove_inuse(struct dquot *dquot)
+{
+	dqstats_dec(DQST_ALLOC_DQUOTS);
+	list_del(&dquot->dq_inuse);
+}
+/*
+ * End of list functions needing dq_list_lock
+ */
+
+static void wait_on_dquot(struct dquot *dquot)
+{
+	mutex_lock(&dquot->dq_lock);
+	mutex_unlock(&dquot->dq_lock);
+}
+
+static inline int dquot_dirty(struct dquot *dquot)
+{
+	return test_bit(DQ_MOD_B, &dquot->dq_flags);
+}
+
+static inline int mark_dquot_dirty(struct dquot *dquot)
+{
+	return dquot->dq_sb->dq_op->mark_dirty(dquot);
+}
+
+/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */
+int dquot_mark_dquot_dirty(struct dquot *dquot)
+{
+	int ret = 1;
+
+	/* If quota is dirty already, we don't have to acquire dq_list_lock */
+	if (test_bit(DQ_MOD_B, &dquot->dq_flags))
+		return 1;
+
+	spin_lock(&dq_list_lock);
+	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
+		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
+				info[dquot->dq_id.type].dqi_dirty_list);
+		ret = 0;
+	}
+	spin_unlock(&dq_list_lock);
+	return ret;
+}
+EXPORT_SYMBOL(dquot_mark_dquot_dirty);
+
+/* Dirtify all the dquots - this can block when journalling */
+static inline int mark_all_dquot_dirty(struct dquot * const *dquot)
+{
+	int ret, err, cnt;
+
+	ret = err = 0;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (dquot[cnt])
+			/* Even in case of error we have to continue */
+			ret = mark_dquot_dirty(dquot[cnt]);
+		if (!err)
+			err = ret;
+	}
+	return err;
+}
+
+static inline void dqput_all(struct dquot **dquot)
+{
+	unsigned int cnt;
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		dqput(dquot[cnt]);
+}
+
+/* This function needs dq_list_lock */
+static inline int clear_dquot_dirty(struct dquot *dquot)
+{
+	if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags))
+		return 0;
+	list_del_init(&dquot->dq_dirty);
+	return 1;
+}
+
+void mark_info_dirty(struct super_block *sb, int type)
+{
+	set_bit(DQF_INFO_DIRTY_B, &sb_dqopt(sb)->info[type].dqi_flags);
+}
+EXPORT_SYMBOL(mark_info_dirty);
+
+/*
+ *	Read dquot from disk and alloc space for it
+ */
+
+int dquot_acquire(struct dquot *dquot)
+{
+	int ret = 0, ret2 = 0;
+	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+
+	mutex_lock(&dquot->dq_lock);
+	mutex_lock(&dqopt->dqio_mutex);
+	if (!test_bit(DQ_READ_B, &dquot->dq_flags))
+		ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
+	if (ret < 0)
+		goto out_iolock;
+	set_bit(DQ_READ_B, &dquot->dq_flags);
+	/* Instantiate dquot if needed */
+	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
+		ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
+		/* Write the info if needed */
+		if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
+			ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
+					dquot->dq_sb, dquot->dq_id.type);
+		}
+		if (ret < 0)
+			goto out_iolock;
+		if (ret2 < 0) {
+			ret = ret2;
+			goto out_iolock;
+		}
+	}
+	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_iolock:
+	mutex_unlock(&dqopt->dqio_mutex);
+	mutex_unlock(&dquot->dq_lock);
+	return ret;
+}
+EXPORT_SYMBOL(dquot_acquire);
+
+/*
+ *	Write dquot to disk
+ */
+int dquot_commit(struct dquot *dquot)
+{
+	int ret = 0;
+	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+
+	mutex_lock(&dqopt->dqio_mutex);
+	spin_lock(&dq_list_lock);
+	if (!clear_dquot_dirty(dquot)) {
+		spin_unlock(&dq_list_lock);
+		goto out_sem;
+	}
+	spin_unlock(&dq_list_lock);
+	/* Inactive dquot can be only if there was error during read/init
+	 * => we have better not writing it */
+	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+		ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
+	else
+		ret = -EIO;
+out_sem:
+	mutex_unlock(&dqopt->dqio_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(dquot_commit);
+
+/*
+ *	Release dquot
+ */
+int dquot_release(struct dquot *dquot)
+{
+	int ret = 0, ret2 = 0;
+	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+
+	mutex_lock(&dquot->dq_lock);
+	/* Check whether we are not racing with some other dqget() */
+	if (atomic_read(&dquot->dq_count) > 1)
+		goto out_dqlock;
+	mutex_lock(&dqopt->dqio_mutex);
+	if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
+		ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
+		/* Write the info */
+		if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
+			ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
+						dquot->dq_sb, dquot->dq_id.type);
+		}
+		if (ret >= 0)
+			ret = ret2;
+	}
+	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+	mutex_unlock(&dqopt->dqio_mutex);
+out_dqlock:
+	mutex_unlock(&dquot->dq_lock);
+	return ret;
+}
+EXPORT_SYMBOL(dquot_release);
+
+void dquot_destroy(struct dquot *dquot)
+{
+	kmem_cache_free(dquot_cachep, dquot);
+}
+EXPORT_SYMBOL(dquot_destroy);
+
+static inline void do_destroy_dquot(struct dquot *dquot)
+{
+	dquot->dq_sb->dq_op->destroy_dquot(dquot);
+}
+
+/* Invalidate all dquots on the list. Note that this function is called after
+ * quota is disabled and pointers from inodes removed so there cannot be new
+ * quota users. There can still be some users of quotas due to inodes being
+ * just deleted or pruned by prune_icache() (those are not attached to any
+ * list) or parallel quotactl call. We have to wait for such users.
+ */
+static void invalidate_dquots(struct super_block *sb, int type)
+{
+	struct dquot *dquot, *tmp;
+
+restart:
+	spin_lock(&dq_list_lock);
+	list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
+		if (dquot->dq_sb != sb)
+			continue;
+		if (dquot->dq_id.type != type)
+			continue;
+		/* Wait for dquot users */
+		if (atomic_read(&dquot->dq_count)) {
+			DEFINE_WAIT(wait);
+
+			dqgrab(dquot);
+			prepare_to_wait(&dquot->dq_wait_unused, &wait,
+					TASK_UNINTERRUPTIBLE);
+			spin_unlock(&dq_list_lock);
+			/* Once dqput() wakes us up, we know it's time to free
+			 * the dquot.
+			 * IMPORTANT: we rely on the fact that there is always
+			 * at most one process waiting for dquot to free.
+			 * Otherwise dq_count would be > 1 and we would never
+			 * wake up.
+			 */
+			if (atomic_read(&dquot->dq_count) > 1)
+				schedule();
+			finish_wait(&dquot->dq_wait_unused, &wait);
+			dqput(dquot);
+			/* At this moment dquot() need not exist (it could be
+			 * reclaimed by prune_dqcache(). Hence we must
+			 * restart. */
+			goto restart;
+		}
+		/*
+		 * Quota now has no users and it has been written on last
+		 * dqput()
+		 */
+		remove_dquot_hash(dquot);
+		remove_free_dquot(dquot);
+		remove_inuse(dquot);
+		do_destroy_dquot(dquot);
+	}
+	spin_unlock(&dq_list_lock);
+}
+
+/* Call callback for every active dquot on given filesystem */
+int dquot_scan_active(struct super_block *sb,
+		      int (*fn)(struct dquot *dquot, unsigned long priv),
+		      unsigned long priv)
+{
+	struct dquot *dquot, *old_dquot = NULL;
+	int ret = 0;
+
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	spin_lock(&dq_list_lock);
+	list_for_each_entry(dquot, &inuse_list, dq_inuse) {
+		if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+			continue;
+		if (dquot->dq_sb != sb)
+			continue;
+		/* Now we have active dquot so we can just increase use count */
+		atomic_inc(&dquot->dq_count);
+		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_LOOKUPS);
+		dqput(old_dquot);
+		old_dquot = dquot;
+		/*
+		 * ->release_dquot() can be racing with us. Our reference
+		 * protects us from new calls to it so just wait for any
+		 * outstanding call and recheck the DQ_ACTIVE_B after that.
+		 */
+		wait_on_dquot(dquot);
+		if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+			ret = fn(dquot, priv);
+			if (ret < 0)
+				goto out;
+		}
+		spin_lock(&dq_list_lock);
+		/* We are safe to continue now because our dquot could not
+		 * be moved out of the inuse list while we hold the reference */
+	}
+	spin_unlock(&dq_list_lock);
+out:
+	dqput(old_dquot);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(dquot_scan_active);
+
+/* Write all dquot structures to quota files */
+int dquot_writeback_dquots(struct super_block *sb, int type)
+{
+	struct list_head *dirty;
+	struct dquot *dquot;
+	struct quota_info *dqopt = sb_dqopt(sb);
+	int cnt;
+	int err, ret = 0;
+
+	mutex_lock(&dqopt->dqonoff_mutex);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		spin_lock(&dq_list_lock);
+		dirty = &dqopt->info[cnt].dqi_dirty_list;
+		while (!list_empty(dirty)) {
+			dquot = list_first_entry(dirty, struct dquot,
+						 dq_dirty);
+			/* Dirty and inactive can be only bad dquot... */
+			if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+				clear_dquot_dirty(dquot);
+				continue;
+			}
+			/* Now we have active dquot from which someone is
+ 			 * holding reference so we can safely just increase
+			 * use count */
+			dqgrab(dquot);
+			spin_unlock(&dq_list_lock);
+			dqstats_inc(DQST_LOOKUPS);
+			err = sb->dq_op->write_dquot(dquot);
+			if (!ret && err)
+				ret = err;
+			dqput(dquot);
+			spin_lock(&dq_list_lock);
+		}
+		spin_unlock(&dq_list_lock);
+	}
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
+		    && info_dirty(&dqopt->info[cnt]))
+			sb->dq_op->write_info(sb, cnt);
+	dqstats_inc(DQST_SYNCS);
+	mutex_unlock(&dqopt->dqonoff_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL(dquot_writeback_dquots);
+
+/* Write all dquot structures to disk and make them visible from userspace */
+int dquot_quota_sync(struct super_block *sb, int type)
+{
+	struct quota_info *dqopt = sb_dqopt(sb);
+	int cnt;
+	int ret;
+
+	ret = dquot_writeback_dquots(sb, type);
+	if (ret)
+		return ret;
+	if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+		return 0;
+
+	/* This is not very clever (and fast) but currently I don't know about
+	 * any other simple way of getting quota data to disk and we must get
+	 * them there for userspace to be visible... */
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, 1);
+	sync_blockdev(sb->s_bdev);
+
+	/*
+	 * Now when everything is written we can discard the pagecache so
+	 * that userspace sees the changes.
+	 */
+	mutex_lock(&dqopt->dqonoff_mutex);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		mutex_lock(&dqopt->files[cnt]->i_mutex);
+		truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
+		mutex_unlock(&dqopt->files[cnt]->i_mutex);
+	}
+	mutex_unlock(&dqopt->dqonoff_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(dquot_quota_sync);
+
+static unsigned long
+dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct list_head *head;
+	struct dquot *dquot;
+	unsigned long freed = 0;
+
+	spin_lock(&dq_list_lock);
+	head = free_dquots.prev;
+	while (head != &free_dquots && sc->nr_to_scan) {
+		dquot = list_entry(head, struct dquot, dq_free);
+		remove_dquot_hash(dquot);
+		remove_free_dquot(dquot);
+		remove_inuse(dquot);
+		do_destroy_dquot(dquot);
+		sc->nr_to_scan--;
+		freed++;
+		head = free_dquots.prev;
+	}
+	spin_unlock(&dq_list_lock);
+	return freed;
+}
+
+static unsigned long
+dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(
+	percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
+}
+
+static struct shrinker dqcache_shrinker = {
+	.count_objects = dqcache_shrink_count,
+	.scan_objects = dqcache_shrink_scan,
+	.seeks = DEFAULT_SEEKS,
+};
+
+/*
+ * Put reference to dquot
+ */
+void dqput(struct dquot *dquot)
+{
+	int ret;
+
+	if (!dquot)
+		return;
+#ifdef CONFIG_QUOTA_DEBUG
+	if (!atomic_read(&dquot->dq_count)) {
+		quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
+			    quotatypes[dquot->dq_id.type],
+			    from_kqid(&init_user_ns, dquot->dq_id));
+		BUG();
+	}
+#endif
+	dqstats_inc(DQST_DROPS);
+we_slept:
+	spin_lock(&dq_list_lock);
+	if (atomic_read(&dquot->dq_count) > 1) {
+		/* We have more than one user... nothing to do */
+		atomic_dec(&dquot->dq_count);
+		/* Releasing dquot during quotaoff phase? */
+		if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) &&
+		    atomic_read(&dquot->dq_count) == 1)
+			wake_up(&dquot->dq_wait_unused);
+		spin_unlock(&dq_list_lock);
+		return;
+	}
+	/* Need to release dquot? */
+	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) {
+		spin_unlock(&dq_list_lock);
+		/* Commit dquot before releasing */
+		ret = dquot->dq_sb->dq_op->write_dquot(dquot);
+		if (ret < 0) {
+			quota_error(dquot->dq_sb, "Can't write quota structure"
+				    " (error %d). Quota may get out of sync!",
+				    ret);
+			/*
+			 * We clear dirty bit anyway, so that we avoid
+			 * infinite loop here
+			 */
+			spin_lock(&dq_list_lock);
+			clear_dquot_dirty(dquot);
+			spin_unlock(&dq_list_lock);
+		}
+		goto we_slept;
+	}
+	/* Clear flag in case dquot was inactive (something bad happened) */
+	clear_dquot_dirty(dquot);
+	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+		spin_unlock(&dq_list_lock);
+		dquot->dq_sb->dq_op->release_dquot(dquot);
+		goto we_slept;
+	}
+	atomic_dec(&dquot->dq_count);
+#ifdef CONFIG_QUOTA_DEBUG
+	/* sanity check */
+	BUG_ON(!list_empty(&dquot->dq_free));
+#endif
+	put_dquot_last(dquot);
+	spin_unlock(&dq_list_lock);
+}
+EXPORT_SYMBOL(dqput);
+
+struct dquot *dquot_alloc(struct super_block *sb, int type)
+{
+	return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+}
+EXPORT_SYMBOL(dquot_alloc);
+
+static struct dquot *get_empty_dquot(struct super_block *sb, int type)
+{
+	struct dquot *dquot;
+
+	dquot = sb->dq_op->alloc_dquot(sb, type);
+	if(!dquot)
+		return NULL;
+
+	mutex_init(&dquot->dq_lock);
+	INIT_LIST_HEAD(&dquot->dq_free);
+	INIT_LIST_HEAD(&dquot->dq_inuse);
+	INIT_HLIST_NODE(&dquot->dq_hash);
+	INIT_LIST_HEAD(&dquot->dq_dirty);
+	init_waitqueue_head(&dquot->dq_wait_unused);
+	dquot->dq_sb = sb;
+	dquot->dq_id = make_kqid_invalid(type);
+	atomic_set(&dquot->dq_count, 1);
+
+	return dquot;
+}
+
+/*
+ * Get reference to dquot
+ *
+ * Locking is slightly tricky here. We are guarded from parallel quotaoff()
+ * destroying our dquot by:
+ *   a) checking for quota flags under dq_list_lock and
+ *   b) getting a reference to dquot before we release dq_list_lock
+ */
+struct dquot *dqget(struct super_block *sb, struct kqid qid)
+{
+	unsigned int hashent = hashfn(sb, qid);
+	struct dquot *dquot = NULL, *empty = NULL;
+
+        if (!sb_has_quota_active(sb, qid.type))
+		return NULL;
+we_slept:
+	spin_lock(&dq_list_lock);
+	spin_lock(&dq_state_lock);
+	if (!sb_has_quota_active(sb, qid.type)) {
+		spin_unlock(&dq_state_lock);
+		spin_unlock(&dq_list_lock);
+		goto out;
+	}
+	spin_unlock(&dq_state_lock);
+
+	dquot = find_dquot(hashent, sb, qid);
+	if (!dquot) {
+		if (!empty) {
+			spin_unlock(&dq_list_lock);
+			empty = get_empty_dquot(sb, qid.type);
+			if (!empty)
+				schedule();	/* Try to wait for a moment... */
+			goto we_slept;
+		}
+		dquot = empty;
+		empty = NULL;
+		dquot->dq_id = qid;
+		/* all dquots go on the inuse_list */
+		put_inuse(dquot);
+		/* hash it first so it can be found */
+		insert_dquot_hash(dquot);
+		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_LOOKUPS);
+	} else {
+		if (!atomic_read(&dquot->dq_count))
+			remove_free_dquot(dquot);
+		atomic_inc(&dquot->dq_count);
+		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_CACHE_HITS);
+		dqstats_inc(DQST_LOOKUPS);
+	}
+	/* Wait for dq_lock - after this we know that either dquot_release() is
+	 * already finished or it will be canceled due to dq_count > 1 test */
+	wait_on_dquot(dquot);
+	/* Read the dquot / allocate space in quota file */
+	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) &&
+	    sb->dq_op->acquire_dquot(dquot) < 0) {
+		dqput(dquot);
+		dquot = NULL;
+		goto out;
+	}
+#ifdef CONFIG_QUOTA_DEBUG
+	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
+#endif
+out:
+	if (empty)
+		do_destroy_dquot(empty);
+
+	return dquot;
+}
+EXPORT_SYMBOL(dqget);
+
+static inline struct dquot **i_dquot(struct inode *inode)
+{
+	return inode->i_sb->s_op->get_dquots(inode);
+}
+
+static int dqinit_needed(struct inode *inode, int type)
+{
+	struct dquot * const *dquots;
+	int cnt;
+
+	if (IS_NOQUOTA(inode))
+		return 0;
+
+	dquots = i_dquot(inode);
+	if (type != -1)
+		return !dquots[type];
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (!dquots[cnt])
+			return 1;
+	return 0;
+}
+
+/* This routine is guarded by dqonoff_mutex mutex */
+static void add_dquot_ref(struct super_block *sb, int type)
+{
+	struct inode *inode, *old_inode = NULL;
+#ifdef CONFIG_QUOTA_DEBUG
+	int reserved = 0;
+#endif
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    !atomic_read(&inode->i_writecount) ||
+		    !dqinit_needed(inode, type)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
+
+#ifdef CONFIG_QUOTA_DEBUG
+		if (unlikely(inode_get_rsv_space(inode) > 0))
+			reserved = 1;
+#endif
+		iput(old_inode);
+		__dquot_initialize(inode, type);
+
+		/*
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock We cannot iput the inode now as we can be
+		 * holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
+		 */
+		old_inode = inode;
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+	iput(old_inode);
+
+#ifdef CONFIG_QUOTA_DEBUG
+	if (reserved) {
+		quota_error(sb, "Writes happened before quota was turned on "
+			"thus quota information is probably inconsistent. "
+			"Please run quotacheck(8)");
+	}
+#endif
+}
+
+/*
+ * Remove references to dquots from inode and add dquot to list for freeing
+ * if we have the last reference to dquot
+ */
+static void remove_inode_dquot_ref(struct inode *inode, int type,
+				   struct list_head *tofree_head)
+{
+	struct dquot **dquots = i_dquot(inode);
+	struct dquot *dquot = dquots[type];
+
+	if (!dquot)
+		return;
+
+	dquots[type] = NULL;
+	if (list_empty(&dquot->dq_free)) {
+		/*
+		 * The inode still has reference to dquot so it can't be in the
+		 * free list
+		 */
+		spin_lock(&dq_list_lock);
+		list_add(&dquot->dq_free, tofree_head);
+		spin_unlock(&dq_list_lock);
+	} else {
+		/*
+		 * Dquot is already in a list to put so we won't drop the last
+		 * reference here.
+		 */
+		dqput(dquot);
+	}
+}
+
+/*
+ * Free list of dquots
+ * Dquots are removed from inodes and no new references can be got so we are
+ * the only ones holding reference
+ */
+static void put_dquot_list(struct list_head *tofree_head)
+{
+	struct list_head *act_head;
+	struct dquot *dquot;
+
+	act_head = tofree_head->next;
+	while (act_head != tofree_head) {
+		dquot = list_entry(act_head, struct dquot, dq_free);
+		act_head = act_head->next;
+		/* Remove dquot from the list so we won't have problems... */
+		list_del_init(&dquot->dq_free);
+		dqput(dquot);
+	}
+}
+
+static void remove_dquot_ref(struct super_block *sb, int type,
+		struct list_head *tofree_head)
+{
+	struct inode *inode;
+	int reserved = 0;
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		/*
+		 *  We have to scan also I_NEW inodes because they can already
+		 *  have quota pointer initialized. Luckily, we need to touch
+		 *  only quota pointers and these have separate locking
+		 *  (dq_data_lock).
+		 */
+		spin_lock(&dq_data_lock);
+		if (!IS_NOQUOTA(inode)) {
+			if (unlikely(inode_get_rsv_space(inode) > 0))
+				reserved = 1;
+			remove_inode_dquot_ref(inode, type, tofree_head);
+		}
+		spin_unlock(&dq_data_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+#ifdef CONFIG_QUOTA_DEBUG
+	if (reserved) {
+		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
+			" was disabled thus quota information is probably "
+			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
+	}
+#endif
+}
+
+/* Gather all references from inodes and drop them */
+static void drop_dquot_ref(struct super_block *sb, int type)
+{
+	LIST_HEAD(tofree_head);
+
+	if (sb->dq_op) {
+		remove_dquot_ref(sb, type, &tofree_head);
+		synchronize_srcu(&dquot_srcu);
+		put_dquot_list(&tofree_head);
+	}
+}
+
+static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
+{
+	dquot->dq_dqb.dqb_curinodes += number;
+}
+
+static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
+{
+	dquot->dq_dqb.dqb_curspace += number;
+}
+
+static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
+{
+	dquot->dq_dqb.dqb_rsvspace += number;
+}
+
+/*
+ * Claim reserved quota space
+ */
+static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
+{
+	if (dquot->dq_dqb.dqb_rsvspace < number) {
+		WARN_ON_ONCE(1);
+		number = dquot->dq_dqb.dqb_rsvspace;
+	}
+	dquot->dq_dqb.dqb_curspace += number;
+	dquot->dq_dqb.dqb_rsvspace -= number;
+}
+
+static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number)
+{
+	if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
+		number = dquot->dq_dqb.dqb_curspace;
+	dquot->dq_dqb.dqb_rsvspace += number;
+	dquot->dq_dqb.dqb_curspace -= number;
+}
+
+static inline
+void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
+{
+	if (dquot->dq_dqb.dqb_rsvspace >= number)
+		dquot->dq_dqb.dqb_rsvspace -= number;
+	else {
+		WARN_ON_ONCE(1);
+		dquot->dq_dqb.dqb_rsvspace = 0;
+	}
+}
+
+static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
+{
+	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+	    dquot->dq_dqb.dqb_curinodes >= number)
+		dquot->dq_dqb.dqb_curinodes -= number;
+	else
+		dquot->dq_dqb.dqb_curinodes = 0;
+	if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+		dquot->dq_dqb.dqb_itime = (time_t) 0;
+	clear_bit(DQ_INODES_B, &dquot->dq_flags);
+}
+
+static void dquot_decr_space(struct dquot *dquot, qsize_t number)
+{
+	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+	    dquot->dq_dqb.dqb_curspace >= number)
+		dquot->dq_dqb.dqb_curspace -= number;
+	else
+		dquot->dq_dqb.dqb_curspace = 0;
+	if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
+		dquot->dq_dqb.dqb_btime = (time_t) 0;
+	clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+}
+
+struct dquot_warn {
+	struct super_block *w_sb;
+	struct kqid w_dq_id;
+	short w_type;
+};
+
+static int warning_issued(struct dquot *dquot, const int warntype)
+{
+	int flag = (warntype == QUOTA_NL_BHARDWARN ||
+		warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B :
+		((warntype == QUOTA_NL_IHARDWARN ||
+		warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0);
+
+	if (!flag)
+		return 0;
+	return test_and_set_bit(flag, &dquot->dq_flags);
+}
+
+#ifdef CONFIG_PRINT_QUOTA_WARNING
+static int flag_print_warnings = 1;
+
+static int need_print_warning(struct dquot_warn *warn)
+{
+	if (!flag_print_warnings)
+		return 0;
+
+	switch (warn->w_dq_id.type) {
+		case USRQUOTA:
+			return uid_eq(current_fsuid(), warn->w_dq_id.uid);
+		case GRPQUOTA:
+			return in_group_p(warn->w_dq_id.gid);
+		case PRJQUOTA:
+			return 1;
+	}
+	return 0;
+}
+
+/* Print warning to user which exceeded quota */
+static void print_warning(struct dquot_warn *warn)
+{
+	char *msg = NULL;
+	struct tty_struct *tty;
+	int warntype = warn->w_type;
+
+	if (warntype == QUOTA_NL_IHARDBELOW ||
+	    warntype == QUOTA_NL_ISOFTBELOW ||
+	    warntype == QUOTA_NL_BHARDBELOW ||
+	    warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn))
+		return;
+
+	tty = get_current_tty();
+	if (!tty)
+		return;
+	tty_write_message(tty, warn->w_sb->s_id);
+	if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
+		tty_write_message(tty, ": warning, ");
+	else
+		tty_write_message(tty, ": write failed, ");
+	tty_write_message(tty, quotatypes[warn->w_dq_id.type]);
+	switch (warntype) {
+		case QUOTA_NL_IHARDWARN:
+			msg = " file limit reached.\r\n";
+			break;
+		case QUOTA_NL_ISOFTLONGWARN:
+			msg = " file quota exceeded too long.\r\n";
+			break;
+		case QUOTA_NL_ISOFTWARN:
+			msg = " file quota exceeded.\r\n";
+			break;
+		case QUOTA_NL_BHARDWARN:
+			msg = " block limit reached.\r\n";
+			break;
+		case QUOTA_NL_BSOFTLONGWARN:
+			msg = " block quota exceeded too long.\r\n";
+			break;
+		case QUOTA_NL_BSOFTWARN:
+			msg = " block quota exceeded.\r\n";
+			break;
+	}
+	tty_write_message(tty, msg);
+	tty_kref_put(tty);
+}
+#endif
+
+static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
+			    int warntype)
+{
+	if (warning_issued(dquot, warntype))
+		return;
+	warn->w_type = warntype;
+	warn->w_sb = dquot->dq_sb;
+	warn->w_dq_id = dquot->dq_id;
+}
+
+/*
+ * Write warnings to the console and send warning messages over netlink.
+ *
+ * Note that this function can call into tty and networking code.
+ */
+static void flush_warnings(struct dquot_warn *warn)
+{
+	int i;
+
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (warn[i].w_type == QUOTA_NL_NOWARN)
+			continue;
+#ifdef CONFIG_PRINT_QUOTA_WARNING
+		print_warning(&warn[i]);
+#endif
+		quota_send_warning(warn[i].w_dq_id,
+				   warn[i].w_sb->s_dev, warn[i].w_type);
+	}
+}
+
+static int ignore_hardlimit(struct dquot *dquot)
+{
+	struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+
+	return capable(CAP_SYS_RESOURCE) &&
+	       (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+		!(info->dqi_flags & DQF_ROOT_SQUASH));
+}
+
+/* needs dq_data_lock */
+static int check_idq(struct dquot *dquot, qsize_t inodes,
+		     struct dquot_warn *warn)
+{
+	qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
+
+	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) ||
+	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
+		return 0;
+
+	if (dquot->dq_dqb.dqb_ihardlimit &&
+	    newinodes > dquot->dq_dqb.dqb_ihardlimit &&
+            !ignore_hardlimit(dquot)) {
+		prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
+		return -EDQUOT;
+	}
+
+	if (dquot->dq_dqb.dqb_isoftlimit &&
+	    newinodes > dquot->dq_dqb.dqb_isoftlimit &&
+	    dquot->dq_dqb.dqb_itime &&
+	    get_seconds() >= dquot->dq_dqb.dqb_itime &&
+            !ignore_hardlimit(dquot)) {
+		prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
+		return -EDQUOT;
+	}
+
+	if (dquot->dq_dqb.dqb_isoftlimit &&
+	    newinodes > dquot->dq_dqb.dqb_isoftlimit &&
+	    dquot->dq_dqb.dqb_itime == 0) {
+		prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
+		dquot->dq_dqb.dqb_itime = get_seconds() +
+		    sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
+	}
+
+	return 0;
+}
+
+/* needs dq_data_lock */
+static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
+		     struct dquot_warn *warn)
+{
+	qsize_t tspace;
+	struct super_block *sb = dquot->dq_sb;
+
+	if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) ||
+	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
+		return 0;
+
+	tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
+		+ space;
+
+	if (dquot->dq_dqb.dqb_bhardlimit &&
+	    tspace > dquot->dq_dqb.dqb_bhardlimit &&
+            !ignore_hardlimit(dquot)) {
+		if (!prealloc)
+			prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
+		return -EDQUOT;
+	}
+
+	if (dquot->dq_dqb.dqb_bsoftlimit &&
+	    tspace > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_btime &&
+	    get_seconds() >= dquot->dq_dqb.dqb_btime &&
+            !ignore_hardlimit(dquot)) {
+		if (!prealloc)
+			prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
+		return -EDQUOT;
+	}
+
+	if (dquot->dq_dqb.dqb_bsoftlimit &&
+	    tspace > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_btime == 0) {
+		if (!prealloc) {
+			prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
+			dquot->dq_dqb.dqb_btime = get_seconds() +
+			    sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
+		}
+		else
+			/*
+			 * We don't allow preallocation to exceed softlimit so exceeding will
+			 * be always printed
+			 */
+			return -EDQUOT;
+	}
+
+	return 0;
+}
+
+static int info_idq_free(struct dquot *dquot, qsize_t inodes)
+{
+	qsize_t newinodes;
+
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
+	    !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type))
+		return QUOTA_NL_NOWARN;
+
+	newinodes = dquot->dq_dqb.dqb_curinodes - inodes;
+	if (newinodes <= dquot->dq_dqb.dqb_isoftlimit)
+		return QUOTA_NL_ISOFTBELOW;
+	if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit &&
+	    newinodes < dquot->dq_dqb.dqb_ihardlimit)
+		return QUOTA_NL_IHARDBELOW;
+	return QUOTA_NL_NOWARN;
+}
+
+static int info_bdq_free(struct dquot *dquot, qsize_t space)
+{
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+	    dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
+		return QUOTA_NL_NOWARN;
+
+	if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
+		return QUOTA_NL_BSOFTBELOW;
+	if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
+	    dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
+		return QUOTA_NL_BHARDBELOW;
+	return QUOTA_NL_NOWARN;
+}
+
+static int dquot_active(const struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (IS_NOQUOTA(inode))
+		return 0;
+	return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
+}
+
+/*
+ * Initialize quota pointers in inode
+ *
+ * It is better to call this function outside of any transaction as it
+ * might need a lot of space in journal for dquot structure allocation.
+ */
+static void __dquot_initialize(struct inode *inode, int type)
+{
+	int cnt, init_needed = 0;
+	struct dquot **dquots, *got[MAXQUOTAS];
+	struct super_block *sb = inode->i_sb;
+	qsize_t rsv;
+
+	if (!dquot_active(inode))
+		return;
+
+	dquots = i_dquot(inode);
+
+	/* First get references to structures we might need. */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		struct kqid qid;
+		kprojid_t projid;
+		int rc;
+
+		got[cnt] = NULL;
+		if (type != -1 && cnt != type)
+			continue;
+		/*
+		 * The i_dquot should have been initialized in most cases,
+		 * we check it without locking here to avoid unnecessary
+		 * dqget()/dqput() calls.
+		 */
+		if (dquots[cnt])
+			continue;
+
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+
+		init_needed = 1;
+
+		switch (cnt) {
+		case USRQUOTA:
+			qid = make_kqid_uid(inode->i_uid);
+			break;
+		case GRPQUOTA:
+			qid = make_kqid_gid(inode->i_gid);
+			break;
+		case PRJQUOTA:
+			rc = inode->i_sb->dq_op->get_projid(inode, &projid);
+			if (rc)
+				continue;
+			qid = make_kqid_projid(projid);
+			break;
+		}
+		got[cnt] = dqget(sb, qid);
+	}
+
+	/* All required i_dquot has been initialized */
+	if (!init_needed)
+		return;
+
+	spin_lock(&dq_data_lock);
+	if (IS_NOQUOTA(inode))
+		goto out_err;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		/* Avoid races with quotaoff() */
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		/* We could race with quotaon or dqget() could have failed */
+		if (!got[cnt])
+			continue;
+		if (!dquots[cnt]) {
+			dquots[cnt] = got[cnt];
+			got[cnt] = NULL;
+			/*
+			 * Make quota reservation system happy if someone
+			 * did a write before quota was turned on
+			 */
+			rsv = inode_get_rsv_space(inode);
+			if (unlikely(rsv))
+				dquot_resv_space(dquots[cnt], rsv);
+		}
+	}
+out_err:
+	spin_unlock(&dq_data_lock);
+	/* Drop unused references */
+	dqput_all(got);
+}
+
+void dquot_initialize(struct inode *inode)
+{
+	__dquot_initialize(inode, -1);
+}
+EXPORT_SYMBOL(dquot_initialize);
+
+/*
+ * Release all quotas referenced by inode.
+ *
+ * This function only be called on inode free or converting
+ * a file to quota file, no other users for the i_dquot in
+ * both cases, so we needn't call synchronize_srcu() after
+ * clearing i_dquot.
+ */
+static void __dquot_drop(struct inode *inode)
+{
+	int cnt;
+	struct dquot **dquots = i_dquot(inode);
+	struct dquot *put[MAXQUOTAS];
+
+	spin_lock(&dq_data_lock);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		put[cnt] = dquots[cnt];
+		dquots[cnt] = NULL;
+	}
+	spin_unlock(&dq_data_lock);
+	dqput_all(put);
+}
+
+void dquot_drop(struct inode *inode)
+{
+	struct dquot * const *dquots;
+	int cnt;
+
+	if (IS_NOQUOTA(inode))
+		return;
+
+	/*
+	 * Test before calling to rule out calls from proc and such
+	 * where we are not allowed to block. Note that this is
+	 * actually reliable test even without the lock - the caller
+	 * must assure that nobody can come after the DQUOT_DROP and
+	 * add quota pointers back anyway.
+	 */
+	dquots = i_dquot(inode);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (dquots[cnt])
+			break;
+	}
+
+	if (cnt < MAXQUOTAS)
+		__dquot_drop(inode);
+}
+EXPORT_SYMBOL(dquot_drop);
+
+/*
+ * inode_reserved_space is managed internally by quota, and protected by
+ * i_lock similar to i_blocks+i_bytes.
+ */
+static qsize_t *inode_reserved_space(struct inode * inode)
+{
+	/* Filesystem must explicitly define it's own method in order to use
+	 * quota reservation interface */
+	BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
+	return inode->i_sb->dq_op->get_reserved_space(inode);
+}
+
+void inode_add_rsv_space(struct inode *inode, qsize_t number)
+{
+	spin_lock(&inode->i_lock);
+	*inode_reserved_space(inode) += number;
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(inode_add_rsv_space);
+
+void inode_claim_rsv_space(struct inode *inode, qsize_t number)
+{
+	spin_lock(&inode->i_lock);
+	*inode_reserved_space(inode) -= number;
+	__inode_add_bytes(inode, number);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(inode_claim_rsv_space);
+
+void inode_reclaim_rsv_space(struct inode *inode, qsize_t number)
+{
+	spin_lock(&inode->i_lock);
+	*inode_reserved_space(inode) += number;
+	__inode_sub_bytes(inode, number);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(inode_reclaim_rsv_space);
+
+void inode_sub_rsv_space(struct inode *inode, qsize_t number)
+{
+	spin_lock(&inode->i_lock);
+	*inode_reserved_space(inode) -= number;
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(inode_sub_rsv_space);
+
+static qsize_t inode_get_rsv_space(struct inode *inode)
+{
+	qsize_t ret;
+
+	if (!inode->i_sb->dq_op->get_reserved_space)
+		return 0;
+	spin_lock(&inode->i_lock);
+	ret = *inode_reserved_space(inode);
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+static void inode_incr_space(struct inode *inode, qsize_t number,
+				int reserve)
+{
+	if (reserve)
+		inode_add_rsv_space(inode, number);
+	else
+		inode_add_bytes(inode, number);
+}
+
+static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
+{
+	if (reserve)
+		inode_sub_rsv_space(inode, number);
+	else
+		inode_sub_bytes(inode, number);
+}
+
+/*
+ * This functions updates i_blocks+i_bytes fields and quota information
+ * (together with appropriate checks).
+ *
+ * NOTE: We absolutely rely on the fact that caller dirties the inode
+ * (usually helpers in quotaops.h care about this) and holds a handle for
+ * the current transaction so that dquot write and inode write go into the
+ * same transaction.
+ */
+
+/*
+ * This operation can block, but only after everything is updated
+ */
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
+{
+	int cnt, ret = 0, index;
+	struct dquot_warn warn[MAXQUOTAS];
+	int reserve = flags & DQUOT_SPACE_RESERVE;
+	struct dquot **dquots;
+
+	if (!dquot_active(inode)) {
+		inode_incr_space(inode, number, reserve);
+		goto out;
+	}
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		warn[cnt].w_type = QUOTA_NL_NOWARN;
+
+	dquots = i_dquot(inode);
+	index = srcu_read_lock(&dquot_srcu);
+	spin_lock(&dq_data_lock);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!dquots[cnt])
+			continue;
+		ret = check_bdq(dquots[cnt], number,
+				!(flags & DQUOT_SPACE_WARN), &warn[cnt]);
+		if (ret && !(flags & DQUOT_SPACE_NOFAIL)) {
+			spin_unlock(&dq_data_lock);
+			goto out_flush_warn;
+		}
+	}
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!dquots[cnt])
+			continue;
+		if (reserve)
+			dquot_resv_space(dquots[cnt], number);
+		else
+			dquot_incr_space(dquots[cnt], number);
+	}
+	inode_incr_space(inode, number, reserve);
+	spin_unlock(&dq_data_lock);
+
+	if (reserve)
+		goto out_flush_warn;
+	mark_all_dquot_dirty(dquots);
+out_flush_warn:
+	srcu_read_unlock(&dquot_srcu, index);
+	flush_warnings(warn);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(__dquot_alloc_space);
+
+/*
+ * This operation can block, but only after everything is updated
+ */
+int dquot_alloc_inode(struct inode *inode)
+{
+	int cnt, ret = 0, index;
+	struct dquot_warn warn[MAXQUOTAS];
+	struct dquot * const *dquots;
+
+	if (!dquot_active(inode))
+		return 0;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		warn[cnt].w_type = QUOTA_NL_NOWARN;
+
+	dquots = i_dquot(inode);
+	index = srcu_read_lock(&dquot_srcu);
+	spin_lock(&dq_data_lock);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!dquots[cnt])
+			continue;
+		ret = check_idq(dquots[cnt], 1, &warn[cnt]);
+		if (ret)
+			goto warn_put_all;
+	}
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!dquots[cnt])
+			continue;
+		dquot_incr_inodes(dquots[cnt], 1);
+	}
+
+warn_put_all:
+	spin_unlock(&dq_data_lock);
+	if (ret == 0)
+		mark_all_dquot_dirty(dquots);
+	srcu_read_unlock(&dquot_srcu, index);
+	flush_warnings(warn);
+	return ret;
+}
+EXPORT_SYMBOL(dquot_alloc_inode);
+
+/*
+ * Convert in-memory reserved quotas to real consumed quotas
+ */
+int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
+{
+	struct dquot **dquots;
+	int cnt, index;
+
+	if (!dquot_active(inode)) {
+		inode_claim_rsv_space(inode, number);
+		return 0;
+	}
+
+	dquots = i_dquot(inode);
+	index = srcu_read_lock(&dquot_srcu);
+	spin_lock(&dq_data_lock);
+	/* Claim reserved quotas to allocated quotas */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (dquots[cnt])
+			dquot_claim_reserved_space(dquots[cnt], number);
+	}
+	/* Update inode bytes */
+	inode_claim_rsv_space(inode, number);
+	spin_unlock(&dq_data_lock);
+	mark_all_dquot_dirty(dquots);
+	srcu_read_unlock(&dquot_srcu, index);
+	return 0;
+}
+EXPORT_SYMBOL(dquot_claim_space_nodirty);
+
+/*
+ * Convert allocated space back to in-memory reserved quotas
+ */
+void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
+{
+	struct dquot **dquots;
+	int cnt, index;
+
+	if (!dquot_active(inode)) {
+		inode_reclaim_rsv_space(inode, number);
+		return;
+	}
+
+	dquots = i_dquot(inode);
+	index = srcu_read_lock(&dquot_srcu);
+	spin_lock(&dq_data_lock);
+	/* Claim reserved quotas to allocated quotas */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (dquots[cnt])
+			dquot_reclaim_reserved_space(dquots[cnt], number);
+	}
+	/* Update inode bytes */
+	inode_reclaim_rsv_space(inode, number);
+	spin_unlock(&dq_data_lock);
+	mark_all_dquot_dirty(dquots);
+	srcu_read_unlock(&dquot_srcu, index);
+	return;
+}
+EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
+
+/*
+ * This operation can block, but only after everything is updated
+ */
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
+{
+	unsigned int cnt;
+	struct dquot_warn warn[MAXQUOTAS];
+	struct dquot **dquots;
+	int reserve = flags & DQUOT_SPACE_RESERVE, index;
+
+	if (!dquot_active(inode)) {
+		inode_decr_space(inode, number, reserve);
+		return;
+	}
+
+	dquots = i_dquot(inode);
+	index = srcu_read_lock(&dquot_srcu);
+	spin_lock(&dq_data_lock);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		int wtype;
+
+		warn[cnt].w_type = QUOTA_NL_NOWARN;
+		if (!dquots[cnt])
+			continue;
+		wtype = info_bdq_free(dquots[cnt], number);
+		if (wtype != QUOTA_NL_NOWARN)
+			prepare_warning(&warn[cnt], dquots[cnt], wtype);
+		if (reserve)
+			dquot_free_reserved_space(dquots[cnt], number);
+		else
+			dquot_decr_space(dquots[cnt], number);
+	}
+	inode_decr_space(inode, number, reserve);
+	spin_unlock(&dq_data_lock);
+
+	if (reserve)
+		goto out_unlock;
+	mark_all_dquot_dirty(dquots);
+out_unlock:
+	srcu_read_unlock(&dquot_srcu, index);
+	flush_warnings(warn);
+}
+EXPORT_SYMBOL(__dquot_free_space);
+
+/*
+ * This operation can block, but only after everything is updated
+ */
+void dquot_free_inode(struct inode *inode)
+{
+	unsigned int cnt;
+	struct dquot_warn warn[MAXQUOTAS];
+	struct dquot * const *dquots;
+	int index;
+
+	if (!dquot_active(inode))
+		return;
+
+	dquots = i_dquot(inode);
+	index = srcu_read_lock(&dquot_srcu);
+	spin_lock(&dq_data_lock);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		int wtype;
+
+		warn[cnt].w_type = QUOTA_NL_NOWARN;
+		if (!dquots[cnt])
+			continue;
+		wtype = info_idq_free(dquots[cnt], 1);
+		if (wtype != QUOTA_NL_NOWARN)
+			prepare_warning(&warn[cnt], dquots[cnt], wtype);
+		dquot_decr_inodes(dquots[cnt], 1);
+	}
+	spin_unlock(&dq_data_lock);
+	mark_all_dquot_dirty(dquots);
+	srcu_read_unlock(&dquot_srcu, index);
+	flush_warnings(warn);
+}
+EXPORT_SYMBOL(dquot_free_inode);
+
+/*
+ * Transfer the number of inode and blocks from one diskquota to an other.
+ * On success, dquot references in transfer_to are consumed and references
+ * to original dquots that need to be released are placed there. On failure,
+ * references are kept untouched.
+ *
+ * This operation can block, but only after everything is updated
+ * A transaction must be started when entering this function.
+ *
+ * We are holding reference on transfer_from & transfer_to, no need to
+ * protect them by srcu_read_lock().
+ */
+int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
+{
+	qsize_t space, cur_space;
+	qsize_t rsv_space = 0;
+	struct dquot *transfer_from[MAXQUOTAS] = {};
+	int cnt, ret = 0;
+	char is_valid[MAXQUOTAS] = {};
+	struct dquot_warn warn_to[MAXQUOTAS];
+	struct dquot_warn warn_from_inodes[MAXQUOTAS];
+	struct dquot_warn warn_from_space[MAXQUOTAS];
+
+	if (IS_NOQUOTA(inode))
+		return 0;
+	/* Initialize the arrays */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		warn_to[cnt].w_type = QUOTA_NL_NOWARN;
+		warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
+		warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
+	}
+
+	spin_lock(&dq_data_lock);
+	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
+		spin_unlock(&dq_data_lock);
+		return 0;
+	}
+	cur_space = inode_get_bytes(inode);
+	rsv_space = inode_get_rsv_space(inode);
+	space = cur_space + rsv_space;
+	/* Build the transfer_from list and check the limits */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		/*
+		 * Skip changes for same uid or gid or for turned off quota-type.
+		 */
+		if (!transfer_to[cnt])
+			continue;
+		/* Avoid races with quotaoff() */
+		if (!sb_has_quota_active(inode->i_sb, cnt))
+			continue;
+		is_valid[cnt] = 1;
+		transfer_from[cnt] = i_dquot(inode)[cnt];
+		ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
+		if (ret)
+			goto over_quota;
+		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
+		if (ret)
+			goto over_quota;
+	}
+
+	/*
+	 * Finally perform the needed transfer from transfer_from to transfer_to
+	 */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!is_valid[cnt])
+			continue;
+		/* Due to IO error we might not have transfer_from[] structure */
+		if (transfer_from[cnt]) {
+			int wtype;
+			wtype = info_idq_free(transfer_from[cnt], 1);
+			if (wtype != QUOTA_NL_NOWARN)
+				prepare_warning(&warn_from_inodes[cnt],
+						transfer_from[cnt], wtype);
+			wtype = info_bdq_free(transfer_from[cnt], space);
+			if (wtype != QUOTA_NL_NOWARN)
+				prepare_warning(&warn_from_space[cnt],
+						transfer_from[cnt], wtype);
+			dquot_decr_inodes(transfer_from[cnt], 1);
+			dquot_decr_space(transfer_from[cnt], cur_space);
+			dquot_free_reserved_space(transfer_from[cnt],
+						  rsv_space);
+		}
+
+		dquot_incr_inodes(transfer_to[cnt], 1);
+		dquot_incr_space(transfer_to[cnt], cur_space);
+		dquot_resv_space(transfer_to[cnt], rsv_space);
+
+		i_dquot(inode)[cnt] = transfer_to[cnt];
+	}
+	spin_unlock(&dq_data_lock);
+
+	mark_all_dquot_dirty(transfer_from);
+	mark_all_dquot_dirty(transfer_to);
+	flush_warnings(warn_to);
+	flush_warnings(warn_from_inodes);
+	flush_warnings(warn_from_space);
+	/* Pass back references to put */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (is_valid[cnt])
+			transfer_to[cnt] = transfer_from[cnt];
+	return 0;
+over_quota:
+	spin_unlock(&dq_data_lock);
+	flush_warnings(warn_to);
+	return ret;
+}
+EXPORT_SYMBOL(__dquot_transfer);
+
+/* Wrapper for transferring ownership of an inode for uid/gid only
+ * Called from FSXXX_setattr()
+ */
+int dquot_transfer(struct inode *inode, struct iattr *iattr)
+{
+	struct dquot *transfer_to[MAXQUOTAS] = {};
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	if (!dquot_active(inode))
+		return 0;
+
+	if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid))
+		transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid));
+	if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))
+		transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid));
+
+	ret = __dquot_transfer(inode, transfer_to);
+	dqput_all(transfer_to);
+	return ret;
+}
+EXPORT_SYMBOL(dquot_transfer);
+
+/*
+ * Write info of quota file to disk
+ */
+int dquot_commit_info(struct super_block *sb, int type)
+{
+	int ret;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	mutex_lock(&dqopt->dqio_mutex);
+	ret = dqopt->ops[type]->write_file_info(sb, type);
+	mutex_unlock(&dqopt->dqio_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(dquot_commit_info);
+
+/*
+ * Definitions of diskquota operations.
+ */
+const struct dquot_operations dquot_operations = {
+	.write_dquot	= dquot_commit,
+	.acquire_dquot	= dquot_acquire,
+	.release_dquot	= dquot_release,
+	.mark_dirty	= dquot_mark_dquot_dirty,
+	.write_info	= dquot_commit_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
+};
+EXPORT_SYMBOL(dquot_operations);
+
+/*
+ * Generic helper for ->open on filesystems supporting disk quotas.
+ */
+int dquot_file_open(struct inode *inode, struct file *file)
+{
+	int error;
+
+	error = generic_file_open(inode, file);
+	if (!error && (file->f_mode & FMODE_WRITE))
+		dquot_initialize(inode);
+	return error;
+}
+EXPORT_SYMBOL(dquot_file_open);
+
+/*
+ * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
+ */
+int dquot_disable(struct super_block *sb, int type, unsigned int flags)
+{
+	int cnt, ret = 0;
+	struct quota_info *dqopt = sb_dqopt(sb);
+	struct inode *toputinode[MAXQUOTAS];
+
+	/* Cannot turn off usage accounting without turning off limits, or
+	 * suspend quotas and simultaneously turn quotas off. */
+	if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
+	    || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
+	    DQUOT_USAGE_ENABLED)))
+		return -EINVAL;
+
+	/* We need to serialize quota_off() for device */
+	mutex_lock(&dqopt->dqonoff_mutex);
+
+	/*
+	 * Skip everything if there's nothing to do. We have to do this because
+	 * sometimes we are called when fill_super() failed and calling
+	 * sync_fs() in such cases does no good.
+	 */
+	if (!sb_any_quota_loaded(sb)) {
+		mutex_unlock(&dqopt->dqonoff_mutex);
+		return 0;
+	}
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		toputinode[cnt] = NULL;
+		if (type != -1 && cnt != type)
+			continue;
+		if (!sb_has_quota_loaded(sb, cnt))
+			continue;
+
+		if (flags & DQUOT_SUSPENDED) {
+			spin_lock(&dq_state_lock);
+			dqopt->flags |=
+				dquot_state_flag(DQUOT_SUSPENDED, cnt);
+			spin_unlock(&dq_state_lock);
+		} else {
+			spin_lock(&dq_state_lock);
+			dqopt->flags &= ~dquot_state_flag(flags, cnt);
+			/* Turning off suspended quotas? */
+			if (!sb_has_quota_loaded(sb, cnt) &&
+			    sb_has_quota_suspended(sb, cnt)) {
+				dqopt->flags &=	~dquot_state_flag(
+							DQUOT_SUSPENDED, cnt);
+				spin_unlock(&dq_state_lock);
+				iput(dqopt->files[cnt]);
+				dqopt->files[cnt] = NULL;
+				continue;
+			}
+			spin_unlock(&dq_state_lock);
+		}
+
+		/* We still have to keep quota loaded? */
+		if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
+			continue;
+
+		/* Note: these are blocking operations */
+		drop_dquot_ref(sb, cnt);
+		invalidate_dquots(sb, cnt);
+		/*
+		 * Now all dquots should be invalidated, all writes done so we
+		 * should be only users of the info. No locks needed.
+		 */
+		if (info_dirty(&dqopt->info[cnt]))
+			sb->dq_op->write_info(sb, cnt);
+		if (dqopt->ops[cnt]->free_file_info)
+			dqopt->ops[cnt]->free_file_info(sb, cnt);
+		put_quota_format(dqopt->info[cnt].dqi_format);
+
+		toputinode[cnt] = dqopt->files[cnt];
+		if (!sb_has_quota_loaded(sb, cnt))
+			dqopt->files[cnt] = NULL;
+		dqopt->info[cnt].dqi_flags = 0;
+		dqopt->info[cnt].dqi_igrace = 0;
+		dqopt->info[cnt].dqi_bgrace = 0;
+		dqopt->ops[cnt] = NULL;
+	}
+	mutex_unlock(&dqopt->dqonoff_mutex);
+
+	/* Skip syncing and setting flags if quota files are hidden */
+	if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+		goto put_inodes;
+
+	/* Sync the superblock so that buffers with quota data are written to
+	 * disk (and so userspace sees correct data afterwards). */
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, 1);
+	sync_blockdev(sb->s_bdev);
+	/* Now the quota files are just ordinary files and we can set the
+	 * inode flags back. Moreover we discard the pagecache so that
+	 * userspace sees the writes we did bypassing the pagecache. We
+	 * must also discard the blockdev buffers so that we see the
+	 * changes done by userspace on the next quotaon() */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (toputinode[cnt]) {
+			mutex_lock(&dqopt->dqonoff_mutex);
+			/* If quota was reenabled in the meantime, we have
+			 * nothing to do */
+			if (!sb_has_quota_loaded(sb, cnt)) {
+				mutex_lock(&toputinode[cnt]->i_mutex);
+				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
+				  S_NOATIME | S_NOQUOTA);
+				truncate_inode_pages(&toputinode[cnt]->i_data,
+						     0);
+				mutex_unlock(&toputinode[cnt]->i_mutex);
+				mark_inode_dirty_sync(toputinode[cnt]);
+			}
+			mutex_unlock(&dqopt->dqonoff_mutex);
+		}
+	if (sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
+put_inodes:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (toputinode[cnt]) {
+			/* On remount RO, we keep the inode pointer so that we
+			 * can reenable quota on the subsequent remount RW. We
+			 * have to check 'flags' variable and not use sb_has_
+			 * function because another quotaon / quotaoff could
+			 * change global state before we got here. We refuse
+			 * to suspend quotas when there is pending delete on
+			 * the quota file... */
+			if (!(flags & DQUOT_SUSPENDED))
+				iput(toputinode[cnt]);
+			else if (!toputinode[cnt]->i_nlink)
+				ret = -EBUSY;
+		}
+	return ret;
+}
+EXPORT_SYMBOL(dquot_disable);
+
+int dquot_quota_off(struct super_block *sb, int type)
+{
+	return dquot_disable(sb, type,
+			     DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+}
+EXPORT_SYMBOL(dquot_quota_off);
+
+/*
+ *	Turn quotas on on a device
+ */
+
+/*
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+	unsigned int flags)
+{
+	struct quota_format_type *fmt = find_quota_format(format_id);
+	struct super_block *sb = inode->i_sb;
+	struct quota_info *dqopt = sb_dqopt(sb);
+	int error;
+	int oldflags = -1;
+
+	if (!fmt)
+		return -ESRCH;
+	if (!S_ISREG(inode->i_mode)) {
+		error = -EACCES;
+		goto out_fmt;
+	}
+	if (IS_RDONLY(inode)) {
+		error = -EROFS;
+		goto out_fmt;
+	}
+	if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
+	    (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
+		error = -EINVAL;
+		goto out_fmt;
+	}
+	/* Usage always has to be set... */
+	if (!(flags & DQUOT_USAGE_ENABLED)) {
+		error = -EINVAL;
+		goto out_fmt;
+	}
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* As we bypass the pagecache we must now flush all the
+		 * dirty data and invalidate caches so that kernel sees
+		 * changes from userspace. It is not enough to just flush
+		 * the quota file since if blocksize < pagesize, invalidation
+		 * of the cache could fail because of other unrelated dirty
+		 * data */
+		sync_filesystem(sb);
+		invalidate_bdev(sb->s_bdev);
+	}
+	mutex_lock(&dqopt->dqonoff_mutex);
+	if (sb_has_quota_loaded(sb, type)) {
+		error = -EBUSY;
+		goto out_lock;
+	}
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* We don't want quota and atime on quota files (deadlocks
+		 * possible) Also nobody should write to the file - we use
+		 * special IO operations which ignore the immutable bit. */
+		mutex_lock(&inode->i_mutex);
+		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
+					     S_NOQUOTA);
+		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+		mutex_unlock(&inode->i_mutex);
+		/*
+		 * When S_NOQUOTA is set, remove dquot references as no more
+		 * references can be added
+		 */
+		__dquot_drop(inode);
+	}
+
+	error = -EIO;
+	dqopt->files[type] = igrab(inode);
+	if (!dqopt->files[type])
+		goto out_lock;
+	error = -EINVAL;
+	if (!fmt->qf_ops->check_quota_file(sb, type))
+		goto out_file_init;
+
+	dqopt->ops[type] = fmt->qf_ops;
+	dqopt->info[type].dqi_format = fmt;
+	dqopt->info[type].dqi_fmt_id = format_id;
+	INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
+	mutex_lock(&dqopt->dqio_mutex);
+	error = dqopt->ops[type]->read_file_info(sb, type);
+	if (error < 0) {
+		mutex_unlock(&dqopt->dqio_mutex);
+		goto out_file_init;
+	}
+	if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+		dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
+	mutex_unlock(&dqopt->dqio_mutex);
+	spin_lock(&dq_state_lock);
+	dqopt->flags |= dquot_state_flag(flags, type);
+	spin_unlock(&dq_state_lock);
+
+	add_dquot_ref(sb, type);
+	mutex_unlock(&dqopt->dqonoff_mutex);
+
+	return 0;
+
+out_file_init:
+	dqopt->files[type] = NULL;
+	iput(inode);
+out_lock:
+	if (oldflags != -1) {
+		mutex_lock(&inode->i_mutex);
+		/* Set the flags back (in the case of accidental quotaon()
+		 * on a wrong file we don't want to mess up the flags) */
+		inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
+		inode->i_flags |= oldflags;
+		mutex_unlock(&inode->i_mutex);
+	}
+	mutex_unlock(&dqopt->dqonoff_mutex);
+out_fmt:
+	put_quota_format(fmt);
+
+	return error; 
+}
+
+/* Reenable quotas on remount RW */
+int dquot_resume(struct super_block *sb, int type)
+{
+	struct quota_info *dqopt = sb_dqopt(sb);
+	struct inode *inode;
+	int ret = 0, cnt;
+	unsigned int flags;
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+
+		mutex_lock(&dqopt->dqonoff_mutex);
+		if (!sb_has_quota_suspended(sb, cnt)) {
+			mutex_unlock(&dqopt->dqonoff_mutex);
+			continue;
+		}
+		inode = dqopt->files[cnt];
+		dqopt->files[cnt] = NULL;
+		spin_lock(&dq_state_lock);
+		flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+							DQUOT_LIMITS_ENABLED,
+							cnt);
+		dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
+		spin_unlock(&dq_state_lock);
+		mutex_unlock(&dqopt->dqonoff_mutex);
+
+		flags = dquot_generic_flag(flags, cnt);
+		ret = vfs_load_quota_inode(inode, cnt,
+				dqopt->info[cnt].dqi_fmt_id, flags);
+		iput(inode);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(dquot_resume);
+
+int dquot_quota_on(struct super_block *sb, int type, int format_id,
+		   struct path *path)
+{
+	int error = security_quota_on(path->dentry);
+	if (error)
+		return error;
+	/* Quota file not on the same filesystem? */
+	if (path->dentry->d_sb != sb)
+		error = -EXDEV;
+	else
+		error = vfs_load_quota_inode(d_inode(path->dentry), type,
+					     format_id, DQUOT_USAGE_ENABLED |
+					     DQUOT_LIMITS_ENABLED);
+	return error;
+}
+EXPORT_SYMBOL(dquot_quota_on);
+
+/*
+ * More powerful function for turning on quotas allowing setting
+ * of individual quota flags
+ */
+int dquot_enable(struct inode *inode, int type, int format_id,
+		 unsigned int flags)
+{
+	int ret = 0;
+	struct super_block *sb = inode->i_sb;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	/* Just unsuspend quotas? */
+	BUG_ON(flags & DQUOT_SUSPENDED);
+
+	if (!flags)
+		return 0;
+	/* Just updating flags needed? */
+	if (sb_has_quota_loaded(sb, type)) {
+		mutex_lock(&dqopt->dqonoff_mutex);
+		/* Now do a reliable test... */
+		if (!sb_has_quota_loaded(sb, type)) {
+			mutex_unlock(&dqopt->dqonoff_mutex);
+			goto load_quota;
+		}
+		if (flags & DQUOT_USAGE_ENABLED &&
+		    sb_has_quota_usage_enabled(sb, type)) {
+			ret = -EBUSY;
+			goto out_lock;
+		}
+		if (flags & DQUOT_LIMITS_ENABLED &&
+		    sb_has_quota_limits_enabled(sb, type)) {
+			ret = -EBUSY;
+			goto out_lock;
+		}
+		spin_lock(&dq_state_lock);
+		sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+		spin_unlock(&dq_state_lock);
+out_lock:
+		mutex_unlock(&dqopt->dqonoff_mutex);
+		return ret;
+	}
+
+load_quota:
+	return vfs_load_quota_inode(inode, type, format_id, flags);
+}
+EXPORT_SYMBOL(dquot_enable);
+
+/*
+ * This function is used when filesystem needs to initialize quotas
+ * during mount time.
+ */
+int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
+		int format_id, int type)
+{
+	struct dentry *dentry;
+	int error;
+
+	mutex_lock(&d_inode(sb->s_root)->i_mutex);
+	dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
+	mutex_unlock(&d_inode(sb->s_root)->i_mutex);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	if (d_really_is_negative(dentry)) {
+		error = -ENOENT;
+		goto out;
+	}
+
+	error = security_quota_on(dentry);
+	if (!error)
+		error = vfs_load_quota_inode(d_inode(dentry), type, format_id,
+				DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
+out:
+	dput(dentry);
+	return error;
+}
+EXPORT_SYMBOL(dquot_quota_on_mount);
+
+static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
+{
+	int ret;
+	int type;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+		return -ENOSYS;
+	/* Accounting cannot be turned on while fs is mounted */
+	flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
+	if (!flags)
+		return -EINVAL;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!(flags & qtype_enforce_flag(type)))
+			continue;
+		/* Can't enforce without accounting */
+		if (!sb_has_quota_usage_enabled(sb, type))
+			return -EINVAL;
+		ret = dquot_enable(dqopt->files[type], type,
+				   dqopt->info[type].dqi_fmt_id,
+				   DQUOT_LIMITS_ENABLED);
+		if (ret < 0)
+			goto out_err;
+	}
+	return 0;
+out_err:
+	/* Backout enforcement enablement we already did */
+	for (type--; type >= 0; type--)  {
+		if (flags & qtype_enforce_flag(type))
+			dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+	}
+	/* Error code translation for better compatibility with XFS */
+	if (ret == -EBUSY)
+		ret = -EEXIST;
+	return ret;
+}
+
+static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
+{
+	int ret;
+	int type;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+		return -ENOSYS;
+	/*
+	 * We don't support turning off accounting via quotactl. In principle
+	 * quota infrastructure can do this but filesystems don't expect
+	 * userspace to be able to do it.
+	 */
+	if (flags &
+		  (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
+		return -EOPNOTSUPP;
+
+	/* Filter out limits not enabled */
+	for (type = 0; type < MAXQUOTAS; type++)
+		if (!sb_has_quota_limits_enabled(sb, type))
+			flags &= ~qtype_enforce_flag(type);
+	/* Nothing left? */
+	if (!flags)
+		return -EEXIST;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (flags & qtype_enforce_flag(type)) {
+			ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+			if (ret < 0)
+				goto out_err;
+		}
+	}
+	return 0;
+out_err:
+	/* Backout enforcement disabling we already did */
+	for (type--; type >= 0; type--)  {
+		if (flags & qtype_enforce_flag(type))
+			dquot_enable(dqopt->files[type], type,
+				     dqopt->info[type].dqi_fmt_id,
+				     DQUOT_LIMITS_ENABLED);
+	}
+	return ret;
+}
+
+/* Generic routine for getting common part of quota structure */
+static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
+{
+	struct mem_dqblk *dm = &dquot->dq_dqb;
+
+	memset(di, 0, sizeof(*di));
+	spin_lock(&dq_data_lock);
+	di->d_spc_hardlimit = dm->dqb_bhardlimit;
+	di->d_spc_softlimit = dm->dqb_bsoftlimit;
+	di->d_ino_hardlimit = dm->dqb_ihardlimit;
+	di->d_ino_softlimit = dm->dqb_isoftlimit;
+	di->d_space = dm->dqb_curspace + dm->dqb_rsvspace;
+	di->d_ino_count = dm->dqb_curinodes;
+	di->d_spc_timer = dm->dqb_btime;
+	di->d_ino_timer = dm->dqb_itime;
+	spin_unlock(&dq_data_lock);
+}
+
+int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
+		    struct qc_dqblk *di)
+{
+	struct dquot *dquot;
+
+	dquot = dqget(sb, qid);
+	if (!dquot)
+		return -ESRCH;
+	do_get_dqblk(dquot, di);
+	dqput(dquot);
+
+	return 0;
+}
+EXPORT_SYMBOL(dquot_get_dqblk);
+
+#define VFS_QC_MASK \
+	(QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
+	 QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
+	 QC_SPC_TIMER | QC_INO_TIMER)
+
+/* Generic routine for setting common part of quota structure */
+static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
+{
+	struct mem_dqblk *dm = &dquot->dq_dqb;
+	int check_blim = 0, check_ilim = 0;
+	struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+
+	if (di->d_fieldmask & ~VFS_QC_MASK)
+		return -EINVAL;
+
+	if (((di->d_fieldmask & QC_SPC_SOFT) &&
+	     di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
+	    ((di->d_fieldmask & QC_SPC_HARD) &&
+	     di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
+	    ((di->d_fieldmask & QC_INO_SOFT) &&
+	     (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
+	    ((di->d_fieldmask & QC_INO_HARD) &&
+	     (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
+		return -ERANGE;
+
+	spin_lock(&dq_data_lock);
+	if (di->d_fieldmask & QC_SPACE) {
+		dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
+		check_blim = 1;
+		set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+	}
+
+	if (di->d_fieldmask & QC_SPC_SOFT)
+		dm->dqb_bsoftlimit = di->d_spc_softlimit;
+	if (di->d_fieldmask & QC_SPC_HARD)
+		dm->dqb_bhardlimit = di->d_spc_hardlimit;
+	if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
+		check_blim = 1;
+		set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+	}
+
+	if (di->d_fieldmask & QC_INO_COUNT) {
+		dm->dqb_curinodes = di->d_ino_count;
+		check_ilim = 1;
+		set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+	}
+
+	if (di->d_fieldmask & QC_INO_SOFT)
+		dm->dqb_isoftlimit = di->d_ino_softlimit;
+	if (di->d_fieldmask & QC_INO_HARD)
+		dm->dqb_ihardlimit = di->d_ino_hardlimit;
+	if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
+		check_ilim = 1;
+		set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+	}
+
+	if (di->d_fieldmask & QC_SPC_TIMER) {
+		dm->dqb_btime = di->d_spc_timer;
+		check_blim = 1;
+		set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+	}
+
+	if (di->d_fieldmask & QC_INO_TIMER) {
+		dm->dqb_itime = di->d_ino_timer;
+		check_ilim = 1;
+		set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+	}
+
+	if (check_blim) {
+		if (!dm->dqb_bsoftlimit ||
+		    dm->dqb_curspace < dm->dqb_bsoftlimit) {
+			dm->dqb_btime = 0;
+			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+		} else if (!(di->d_fieldmask & QC_SPC_TIMER))
+			/* Set grace only if user hasn't provided his own... */
+			dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
+	}
+	if (check_ilim) {
+		if (!dm->dqb_isoftlimit ||
+		    dm->dqb_curinodes < dm->dqb_isoftlimit) {
+			dm->dqb_itime = 0;
+			clear_bit(DQ_INODES_B, &dquot->dq_flags);
+		} else if (!(di->d_fieldmask & QC_INO_TIMER))
+			/* Set grace only if user hasn't provided his own... */
+			dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
+	}
+	if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
+	    dm->dqb_isoftlimit)
+		clear_bit(DQ_FAKE_B, &dquot->dq_flags);
+	else
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+	spin_unlock(&dq_data_lock);
+	mark_dquot_dirty(dquot);
+
+	return 0;
+}
+
+int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
+		  struct qc_dqblk *di)
+{
+	struct dquot *dquot;
+	int rc;
+
+	dquot = dqget(sb, qid);
+	if (!dquot) {
+		rc = -ESRCH;
+		goto out;
+	}
+	rc = do_set_dqblk(dquot, di);
+	dqput(dquot);
+out:
+	return rc;
+}
+EXPORT_SYMBOL(dquot_set_dqblk);
+
+/* Generic routine for getting common part of quota file information */
+int dquot_get_state(struct super_block *sb, struct qc_state *state)
+{
+	struct mem_dqinfo *mi;
+	struct qc_type_state *tstate;
+	struct quota_info *dqopt = sb_dqopt(sb);
+	int type;
+  
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	memset(state, 0, sizeof(*state));
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!sb_has_quota_active(sb, type))
+			continue;
+		tstate = state->s_state + type;
+		mi = sb_dqopt(sb)->info + type;
+		tstate->flags = QCI_ACCT_ENABLED;
+		spin_lock(&dq_data_lock);
+		if (mi->dqi_flags & DQF_SYS_FILE)
+			tstate->flags |= QCI_SYSFILE;
+		if (mi->dqi_flags & DQF_ROOT_SQUASH)
+			tstate->flags |= QCI_ROOT_SQUASH;
+		if (sb_has_quota_limits_enabled(sb, type))
+			tstate->flags |= QCI_LIMITS_ENFORCED;
+		tstate->spc_timelimit = mi->dqi_bgrace;
+		tstate->ino_timelimit = mi->dqi_igrace;
+		tstate->ino = dqopt->files[type]->i_ino;
+		tstate->blocks = dqopt->files[type]->i_blocks;
+		tstate->nextents = 1;	/* We don't know... */
+		spin_unlock(&dq_data_lock);
+	}
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(dquot_get_state);
+
+/* Generic routine for setting common part of quota file information */
+int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
+{
+	struct mem_dqinfo *mi;
+	int err = 0;
+
+	if ((ii->i_fieldmask & QC_WARNS_MASK) ||
+	    (ii->i_fieldmask & QC_RT_SPC_TIMER))
+		return -EINVAL;
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	if (!sb_has_quota_active(sb, type)) {
+		err = -ESRCH;
+		goto out;
+	}
+	mi = sb_dqopt(sb)->info + type;
+	if (ii->i_fieldmask & QC_FLAGS) {
+		if ((ii->i_flags & QCI_ROOT_SQUASH &&
+		     mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+	spin_lock(&dq_data_lock);
+	if (ii->i_fieldmask & QC_SPC_TIMER)
+		mi->dqi_bgrace = ii->i_spc_timelimit;
+	if (ii->i_fieldmask & QC_INO_TIMER)
+		mi->dqi_igrace = ii->i_ino_timelimit;
+	if (ii->i_fieldmask & QC_FLAGS) {
+		if (ii->i_flags & QCI_ROOT_SQUASH)
+			mi->dqi_flags |= DQF_ROOT_SQUASH;
+		else
+			mi->dqi_flags &= ~DQF_ROOT_SQUASH;
+	}
+	spin_unlock(&dq_data_lock);
+	mark_info_dirty(sb, type);
+	/* Force write to disk */
+	sb->dq_op->write_info(sb, type);
+out:
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	return err;
+}
+EXPORT_SYMBOL(dquot_set_dqinfo);
+
+const struct quotactl_ops dquot_quotactl_ops = {
+	.quota_on	= dquot_quota_on,
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_state	= dquot_get_state,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
+};
+EXPORT_SYMBOL(dquot_quotactl_ops);
+
+const struct quotactl_ops dquot_quotactl_sysfile_ops = {
+	.quota_enable	= dquot_quota_enable,
+	.quota_disable	= dquot_quota_disable,
+	.quota_sync	= dquot_quota_sync,
+	.get_state	= dquot_get_state,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
+};
+EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
+
+static int do_proc_dqstats(struct ctl_table *table, int write,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	unsigned int type = (int *)table->data - dqstats.stat;
+
+	/* Update global table */
+	dqstats.stat[type] =
+			percpu_counter_sum_positive(&dqstats.counter[type]);
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table fs_dqstats_table[] = {
+	{
+		.procname	= "lookups",
+		.data		= &dqstats.stat[DQST_LOOKUPS],
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= do_proc_dqstats,
+	},
+	{
+		.procname	= "drops",
+		.data		= &dqstats.stat[DQST_DROPS],
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= do_proc_dqstats,
+	},
+	{
+		.procname	= "reads",
+		.data		= &dqstats.stat[DQST_READS],
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= do_proc_dqstats,
+	},
+	{
+		.procname	= "writes",
+		.data		= &dqstats.stat[DQST_WRITES],
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= do_proc_dqstats,
+	},
+	{
+		.procname	= "cache_hits",
+		.data		= &dqstats.stat[DQST_CACHE_HITS],
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= do_proc_dqstats,
+	},
+	{
+		.procname	= "allocated_dquots",
+		.data		= &dqstats.stat[DQST_ALLOC_DQUOTS],
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= do_proc_dqstats,
+	},
+	{
+		.procname	= "free_dquots",
+		.data		= &dqstats.stat[DQST_FREE_DQUOTS],
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= do_proc_dqstats,
+	},
+	{
+		.procname	= "syncs",
+		.data		= &dqstats.stat[DQST_SYNCS],
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= do_proc_dqstats,
+	},
+#ifdef CONFIG_PRINT_QUOTA_WARNING
+	{
+		.procname	= "warnings",
+		.data		= &flag_print_warnings,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
+	{ },
+};
+
+static struct ctl_table fs_table[] = {
+	{
+		.procname	= "quota",
+		.mode		= 0555,
+		.child		= fs_dqstats_table,
+	},
+	{ },
+};
+
+static struct ctl_table sys_table[] = {
+	{
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= fs_table,
+	},
+	{ },
+};
+
+static int __init dquot_init(void)
+{
+	int i, ret;
+	unsigned long nr_hash, order;
+
+	printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
+
+	register_sysctl_table(sys_table);
+
+	dquot_cachep = kmem_cache_create("dquot",
+			sizeof(struct dquot), sizeof(unsigned long) * 4,
+			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+				SLAB_MEM_SPREAD|SLAB_PANIC),
+			NULL);
+
+	order = 0;
+	dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order);
+	if (!dquot_hash)
+		panic("Cannot create dquot hash table");
+
+	for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
+		ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
+		if (ret)
+			panic("Cannot create dquot stat counters");
+	}
+
+	/* Find power-of-two hlist_heads which can fit into allocation */
+	nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
+	dq_hash_bits = 0;
+	do {
+		dq_hash_bits++;
+	} while (nr_hash >> dq_hash_bits);
+	dq_hash_bits--;
+
+	nr_hash = 1UL << dq_hash_bits;
+	dq_hash_mask = nr_hash - 1;
+	for (i = 0; i < nr_hash; i++)
+		INIT_HLIST_HEAD(dquot_hash + i);
+
+	pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
+		" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
+
+	register_shrinker(&dqcache_shrinker);
+
+	return 0;
+}
+module_init(dquot_init);
diff --git a/kernel/fs/quota/kqid.c b/kernel/fs/quota/kqid.c
new file mode 100644
index 000000000..ebc5e6285
--- /dev/null
+++ b/kernel/fs/quota/kqid.c
@@ -0,0 +1,132 @@
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/export.h>
+
+/**
+ *	qid_eq - Test to see if to kquid values are the same
+ *	@left: A qid value
+ *	@right: Another quid value
+ *
+ *	Return true if the two qid values are equal and false otherwise.
+ */
+bool qid_eq(struct kqid left, struct kqid right)
+{
+	if (left.type != right.type)
+		return false;
+	switch(left.type) {
+	case USRQUOTA:
+		return uid_eq(left.uid, right.uid);
+	case GRPQUOTA:
+		return gid_eq(left.gid, right.gid);
+	case PRJQUOTA:
+		return projid_eq(left.projid, right.projid);
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL(qid_eq);
+
+/**
+ *	qid_lt - Test to see if one qid value is less than another
+ *	@left: The possibly lesser qid value
+ *	@right: The possibly greater qid value
+ *
+ *	Return true if left is less than right and false otherwise.
+ */
+bool qid_lt(struct kqid left, struct kqid right)
+{
+	if (left.type < right.type)
+		return true;
+	if (left.type > right.type)
+		return false;
+	switch (left.type) {
+	case USRQUOTA:
+		return uid_lt(left.uid, right.uid);
+	case GRPQUOTA:
+		return gid_lt(left.gid, right.gid);
+	case PRJQUOTA:
+		return projid_lt(left.projid, right.projid);
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL(qid_lt);
+
+/**
+ *	from_kqid - Create a qid from a kqid user-namespace pair.
+ *	@targ: The user namespace we want a qid in.
+ *	@kqid: The kernel internal quota identifier to start with.
+ *
+ *	Map @kqid into the user-namespace specified by @targ and
+ *	return the resulting qid.
+ *
+ *	There is always a mapping into the initial user_namespace.
+ *
+ *	If @kqid has no mapping in @targ (qid_t)-1 is returned.
+ */
+qid_t from_kqid(struct user_namespace *targ, struct kqid kqid)
+{
+	switch (kqid.type) {
+	case USRQUOTA:
+		return from_kuid(targ, kqid.uid);
+	case GRPQUOTA:
+		return from_kgid(targ, kqid.gid);
+	case PRJQUOTA:
+		return from_kprojid(targ, kqid.projid);
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL(from_kqid);
+
+/**
+ *	from_kqid_munged - Create a qid from a kqid user-namespace pair.
+ *	@targ: The user namespace we want a qid in.
+ *	@kqid: The kernel internal quota identifier to start with.
+ *
+ *	Map @kqid into the user-namespace specified by @targ and
+ *	return the resulting qid.
+ *
+ *	There is always a mapping into the initial user_namespace.
+ *
+ *	Unlike from_kqid from_kqid_munged never fails and always
+ *	returns a valid projid.  This makes from_kqid_munged
+ *	appropriate for use in places where failing to provide
+ *	a qid_t is not a good option.
+ *
+ *	If @kqid has no mapping in @targ the kqid.type specific
+ *	overflow identifier is returned.
+ */
+qid_t from_kqid_munged(struct user_namespace *targ, struct kqid kqid)
+{
+	switch (kqid.type) {
+	case USRQUOTA:
+		return from_kuid_munged(targ, kqid.uid);
+	case GRPQUOTA:
+		return from_kgid_munged(targ, kqid.gid);
+	case PRJQUOTA:
+		return from_kprojid_munged(targ, kqid.projid);
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL(from_kqid_munged);
+
+/**
+ *	qid_valid - Report if a valid value is stored in a kqid.
+ *	@qid: The kernel internal quota identifier to test.
+ */
+bool qid_valid(struct kqid qid)
+{
+	switch (qid.type) {
+	case USRQUOTA:
+		return uid_valid(qid.uid);
+	case GRPQUOTA:
+		return gid_valid(qid.gid);
+	case PRJQUOTA:
+		return projid_valid(qid.projid);
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL(qid_valid);
diff --git a/kernel/fs/quota/netlink.c b/kernel/fs/quota/netlink.c
new file mode 100644
index 000000000..bb2869f5d
--- /dev/null
+++ b/kernel/fs/quota/netlink.c
@@ -0,0 +1,109 @@
+
+#include <linux/cred.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/quotaops.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+static const struct genl_multicast_group quota_mcgrps[] = {
+	{ .name = "events", },
+};
+
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+	/*
+	 * Needed due to multicast group ID abuse - old code assumed
+	 * the family ID was also a valid multicast group ID (which
+	 * isn't true) and userspace might thus rely on it. Assign a
+	 * static ID for this group to make dealing with that easier.
+	 */
+	.id = GENL_ID_VFS_DQUOT,
+	.hdrsize = 0,
+	.name = "VFS_DQUOT",
+	.version = 1,
+	.maxattr = QUOTA_NL_A_MAX,
+	.mcgrps = quota_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(quota_mcgrps),
+};
+
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @qid: The kernel internal quota identifier.
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+
+void quota_send_warning(struct kqid qid, dev_t dev,
+			const char warntype)
+{
+	static atomic_t seq;
+	struct sk_buff *skb;
+	void *msg_head;
+	int ret;
+	int msg_size = 4 * nla_total_size(sizeof(u32)) +
+		       2 * nla_total_size(sizeof(u64));
+
+	/* We have to allocate using GFP_NOFS as we are called from a
+	 * filesystem performing write and thus further recursion into
+	 * the fs to free some data could cause deadlocks. */
+	skb = genlmsg_new(msg_size, GFP_NOFS);
+	if (!skb) {
+		printk(KERN_ERR
+		  "VFS: Not enough memory to send quota warning.\n");
+		return;
+	}
+	msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+			&quota_genl_family, 0, QUOTA_NL_C_WARNING);
+	if (!msg_head) {
+		printk(KERN_ERR
+		  "VFS: Cannot store netlink header in quota warning.\n");
+		goto err_out;
+	}
+	ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID,
+			  from_kqid_munged(&init_user_ns, qid));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID,
+			  from_kuid_munged(&init_user_ns, current_uid()));
+	if (ret)
+		goto attr_err_out;
+	genlmsg_end(skb, msg_head);
+
+	genlmsg_multicast(&quota_genl_family, skb, 0, 0, GFP_NOFS);
+	return;
+attr_err_out:
+	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+
+static int __init quota_init(void)
+{
+	if (genl_register_family(&quota_genl_family) != 0)
+		printk(KERN_ERR
+		       "VFS: Failed to create quota netlink interface.\n");
+	return 0;
+};
+
+module_init(quota_init);
diff --git a/kernel/fs/quota/quota.c b/kernel/fs/quota/quota.c
new file mode 100644
index 000000000..86ded7375
--- /dev/null
+++ b/kernel/fs/quota/quota.c
@@ -0,0 +1,808 @@
+/*
+ * Quota code necessary even when VFS quota support is not compiled
+ * into the kernel.  The interesting stuff is over in dquot.c, here
+ * we have symbols for initial quotactl(2) handling, the sysctl(2)
+ * variables, etc - things needed even when quota support disabled.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <asm/current.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/capability.h>
+#include <linux/quotaops.h>
+#include <linux/types.h>
+#include <linux/writeback.h>
+
+static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
+				     qid_t id)
+{
+	switch (cmd) {
+	/* these commands do not require any special privilegues */
+	case Q_GETFMT:
+	case Q_SYNC:
+	case Q_GETINFO:
+	case Q_XGETQSTAT:
+	case Q_XGETQSTATV:
+	case Q_XQUOTASYNC:
+		break;
+	/* allow to query information for dquots we "own" */
+	case Q_GETQUOTA:
+	case Q_XGETQUOTA:
+		if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) ||
+		    (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id))))
+			break;
+		/*FALLTHROUGH*/
+	default:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+	}
+
+	return security_quotactl(cmd, type, id, sb);
+}
+
+static void quota_sync_one(struct super_block *sb, void *arg)
+{
+	int type = *(int *)arg;
+
+	if (sb->s_qcop && sb->s_qcop->quota_sync &&
+	    (sb->s_quota_types & (1 << type)))
+		sb->s_qcop->quota_sync(sb, type);
+}
+
+static int quota_sync_all(int type)
+{
+	int ret;
+
+	if (type >= MAXQUOTAS)
+		return -EINVAL;
+	ret = security_quotactl(Q_SYNC, type, 0, NULL);
+	if (!ret)
+		iterate_supers(quota_sync_one, &type);
+	return ret;
+}
+
+unsigned int qtype_enforce_flag(int type)
+{
+	switch (type) {
+	case USRQUOTA:
+		return FS_QUOTA_UDQ_ENFD;
+	case GRPQUOTA:
+		return FS_QUOTA_GDQ_ENFD;
+	case PRJQUOTA:
+		return FS_QUOTA_PDQ_ENFD;
+	}
+	return 0;
+}
+
+static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
+		         struct path *path)
+{
+	if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
+		return -ENOSYS;
+	if (sb->s_qcop->quota_enable)
+		return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type));
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	return sb->s_qcop->quota_on(sb, type, id, path);
+}
+
+static int quota_quotaoff(struct super_block *sb, int type)
+{
+	if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable)
+		return -ENOSYS;
+	if (sb->s_qcop->quota_disable)
+		return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type));
+	return sb->s_qcop->quota_off(sb, type);
+}
+
+static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
+{
+	__u32 fmt;
+
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	if (!sb_has_quota_active(sb, type)) {
+		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+		return -ESRCH;
+	}
+	fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	if (copy_to_user(addr, &fmt, sizeof(fmt)))
+		return -EFAULT;
+	return 0;
+}
+
+static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
+{
+	struct qc_state state;
+	struct qc_type_state *tstate;
+	struct if_dqinfo uinfo;
+	int ret;
+
+	/* This checks whether qc_state has enough entries... */
+	BUILD_BUG_ON(MAXQUOTAS > XQM_MAXQUOTAS);
+	if (!sb->s_qcop->get_state)
+		return -ENOSYS;
+	ret = sb->s_qcop->get_state(sb, &state);
+	if (ret)
+		return ret;
+	tstate = state.s_state + type;
+	if (!(tstate->flags & QCI_ACCT_ENABLED))
+		return -ESRCH;
+	memset(&uinfo, 0, sizeof(uinfo));
+	uinfo.dqi_bgrace = tstate->spc_timelimit;
+	uinfo.dqi_igrace = tstate->ino_timelimit;
+	if (tstate->flags & QCI_SYSFILE)
+		uinfo.dqi_flags |= DQF_SYS_FILE;
+	if (tstate->flags & QCI_ROOT_SQUASH)
+		uinfo.dqi_flags |= DQF_ROOT_SQUASH;
+	uinfo.dqi_valid = IIF_ALL;
+	if (!ret && copy_to_user(addr, &uinfo, sizeof(uinfo)))
+		return -EFAULT;
+	return ret;
+}
+
+static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
+{
+	struct if_dqinfo info;
+	struct qc_info qinfo;
+
+	if (copy_from_user(&info, addr, sizeof(info)))
+		return -EFAULT;
+	if (!sb->s_qcop->set_info)
+		return -ENOSYS;
+	if (info.dqi_valid & ~(IIF_FLAGS | IIF_BGRACE | IIF_IGRACE))
+		return -EINVAL;
+	memset(&qinfo, 0, sizeof(qinfo));
+	if (info.dqi_valid & IIF_FLAGS) {
+		if (info.dqi_flags & ~DQF_SETINFO_MASK)
+			return -EINVAL;
+		if (info.dqi_flags & DQF_ROOT_SQUASH)
+			qinfo.i_flags |= QCI_ROOT_SQUASH;
+		qinfo.i_fieldmask |= QC_FLAGS;
+	}
+	if (info.dqi_valid & IIF_BGRACE) {
+		qinfo.i_spc_timelimit = info.dqi_bgrace;
+		qinfo.i_fieldmask |= QC_SPC_TIMER;
+	}
+	if (info.dqi_valid & IIF_IGRACE) {
+		qinfo.i_ino_timelimit = info.dqi_igrace;
+		qinfo.i_fieldmask |= QC_INO_TIMER;
+	}
+	return sb->s_qcop->set_info(sb, type, &qinfo);
+}
+
+static inline qsize_t qbtos(qsize_t blocks)
+{
+	return blocks << QIF_DQBLKSIZE_BITS;
+}
+
+static inline qsize_t stoqb(qsize_t space)
+{
+	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
+static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src)
+{
+	memset(dst, 0, sizeof(*dst));
+	dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit);
+	dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit);
+	dst->dqb_curspace = src->d_space;
+	dst->dqb_ihardlimit = src->d_ino_hardlimit;
+	dst->dqb_isoftlimit = src->d_ino_softlimit;
+	dst->dqb_curinodes = src->d_ino_count;
+	dst->dqb_btime = src->d_spc_timer;
+	dst->dqb_itime = src->d_ino_timer;
+	dst->dqb_valid = QIF_ALL;
+}
+
+static int quota_getquota(struct super_block *sb, int type, qid_t id,
+			  void __user *addr)
+{
+	struct kqid qid;
+	struct qc_dqblk fdq;
+	struct if_dqblk idq;
+	int ret;
+
+	if (!sb->s_qcop->get_dqblk)
+		return -ENOSYS;
+	qid = make_kqid(current_user_ns(), type, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+	ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
+	if (ret)
+		return ret;
+	copy_to_if_dqblk(&idq, &fdq);
+	if (copy_to_user(addr, &idq, sizeof(idq)))
+		return -EFAULT;
+	return 0;
+}
+
+static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
+{
+	dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
+	dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit);
+	dst->d_space = src->dqb_curspace;
+	dst->d_ino_hardlimit = src->dqb_ihardlimit;
+	dst->d_ino_softlimit = src->dqb_isoftlimit;
+	dst->d_ino_count = src->dqb_curinodes;
+	dst->d_spc_timer = src->dqb_btime;
+	dst->d_ino_timer = src->dqb_itime;
+
+	dst->d_fieldmask = 0;
+	if (src->dqb_valid & QIF_BLIMITS)
+		dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD;
+	if (src->dqb_valid & QIF_SPACE)
+		dst->d_fieldmask |= QC_SPACE;
+	if (src->dqb_valid & QIF_ILIMITS)
+		dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD;
+	if (src->dqb_valid & QIF_INODES)
+		dst->d_fieldmask |= QC_INO_COUNT;
+	if (src->dqb_valid & QIF_BTIME)
+		dst->d_fieldmask |= QC_SPC_TIMER;
+	if (src->dqb_valid & QIF_ITIME)
+		dst->d_fieldmask |= QC_INO_TIMER;
+}
+
+static int quota_setquota(struct super_block *sb, int type, qid_t id,
+			  void __user *addr)
+{
+	struct qc_dqblk fdq;
+	struct if_dqblk idq;
+	struct kqid qid;
+
+	if (copy_from_user(&idq, addr, sizeof(idq)))
+		return -EFAULT;
+	if (!sb->s_qcop->set_dqblk)
+		return -ENOSYS;
+	qid = make_kqid(current_user_ns(), type, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+	copy_from_if_dqblk(&fdq, &idq);
+	return sb->s_qcop->set_dqblk(sb, qid, &fdq);
+}
+
+static int quota_enable(struct super_block *sb, void __user *addr)
+{
+	__u32 flags;
+
+	if (copy_from_user(&flags, addr, sizeof(flags)))
+		return -EFAULT;
+	if (!sb->s_qcop->quota_enable)
+		return -ENOSYS;
+	return sb->s_qcop->quota_enable(sb, flags);
+}
+
+static int quota_disable(struct super_block *sb, void __user *addr)
+{
+	__u32 flags;
+
+	if (copy_from_user(&flags, addr, sizeof(flags)))
+		return -EFAULT;
+	if (!sb->s_qcop->quota_disable)
+		return -ENOSYS;
+	return sb->s_qcop->quota_disable(sb, flags);
+}
+
+static int quota_state_to_flags(struct qc_state *state)
+{
+	int flags = 0;
+
+	if (state->s_state[USRQUOTA].flags & QCI_ACCT_ENABLED)
+		flags |= FS_QUOTA_UDQ_ACCT;
+	if (state->s_state[USRQUOTA].flags & QCI_LIMITS_ENFORCED)
+		flags |= FS_QUOTA_UDQ_ENFD;
+	if (state->s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)
+		flags |= FS_QUOTA_GDQ_ACCT;
+	if (state->s_state[GRPQUOTA].flags & QCI_LIMITS_ENFORCED)
+		flags |= FS_QUOTA_GDQ_ENFD;
+	if (state->s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED)
+		flags |= FS_QUOTA_PDQ_ACCT;
+	if (state->s_state[PRJQUOTA].flags & QCI_LIMITS_ENFORCED)
+		flags |= FS_QUOTA_PDQ_ENFD;
+	return flags;
+}
+
+static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs)
+{
+	int type;
+	struct qc_state state;
+	int ret;
+
+	ret = sb->s_qcop->get_state(sb, &state);
+	if (ret < 0)
+		return ret;
+
+	memset(fqs, 0, sizeof(*fqs));
+	fqs->qs_version = FS_QSTAT_VERSION;
+	fqs->qs_flags = quota_state_to_flags(&state);
+	/* No quota enabled? */
+	if (!fqs->qs_flags)
+		return -ENOSYS;
+	fqs->qs_incoredqs = state.s_incoredqs;
+	/*
+	 * GETXSTATE quotactl has space for just one set of time limits so
+	 * report them for the first enabled quota type
+	 */
+	for (type = 0; type < XQM_MAXQUOTAS; type++)
+		if (state.s_state[type].flags & QCI_ACCT_ENABLED)
+			break;
+	BUG_ON(type == XQM_MAXQUOTAS);
+	fqs->qs_btimelimit = state.s_state[type].spc_timelimit;
+	fqs->qs_itimelimit = state.s_state[type].ino_timelimit;
+	fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
+	fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
+	fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
+	if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) {
+		fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
+		fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
+		fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
+	}
+	if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) {
+		fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
+		fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
+		fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
+	}
+	if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) {
+		/*
+		 * Q_XGETQSTAT doesn't have room for both group and project
+		 * quotas.  So, allow the project quota values to be copied out
+		 * only if there is no group quota information available.
+		 */
+		if (!(state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)) {
+			fqs->qs_gquota.qfs_ino = state.s_state[PRJQUOTA].ino;
+			fqs->qs_gquota.qfs_nblks =
+					state.s_state[PRJQUOTA].blocks;
+			fqs->qs_gquota.qfs_nextents =
+					state.s_state[PRJQUOTA].nextents;
+		}
+	}
+	return 0;
+}
+
+static int quota_getxstate(struct super_block *sb, void __user *addr)
+{
+	struct fs_quota_stat fqs;
+	int ret;
+
+	if (!sb->s_qcop->get_state)
+		return -ENOSYS;
+	ret = quota_getstate(sb, &fqs);
+	if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+		return -EFAULT;
+	return ret;
+}
+
+static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs)
+{
+	int type;
+	struct qc_state state;
+	int ret;
+
+	ret = sb->s_qcop->get_state(sb, &state);
+	if (ret < 0)
+		return ret;
+
+	memset(fqs, 0, sizeof(*fqs));
+	fqs->qs_version = FS_QSTAT_VERSION;
+	fqs->qs_flags = quota_state_to_flags(&state);
+	/* No quota enabled? */
+	if (!fqs->qs_flags)
+		return -ENOSYS;
+	fqs->qs_incoredqs = state.s_incoredqs;
+	/*
+	 * GETXSTATV quotactl has space for just one set of time limits so
+	 * report them for the first enabled quota type
+	 */
+	for (type = 0; type < XQM_MAXQUOTAS; type++)
+		if (state.s_state[type].flags & QCI_ACCT_ENABLED)
+			break;
+	BUG_ON(type == XQM_MAXQUOTAS);
+	fqs->qs_btimelimit = state.s_state[type].spc_timelimit;
+	fqs->qs_itimelimit = state.s_state[type].ino_timelimit;
+	fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
+	fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
+	fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
+	if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) {
+		fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
+		fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
+		fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
+	}
+	if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) {
+		fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
+		fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
+		fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
+	}
+	if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) {
+		fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino;
+		fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks;
+		fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents;
+	}
+	return 0;
+}
+
+static int quota_getxstatev(struct super_block *sb, void __user *addr)
+{
+	struct fs_quota_statv fqs;
+	int ret;
+
+	if (!sb->s_qcop->get_state)
+		return -ENOSYS;
+
+	memset(&fqs, 0, sizeof(fqs));
+	if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */
+		return -EFAULT;
+
+	/* If this kernel doesn't support user specified version, fail */
+	switch (fqs.qs_version) {
+	case FS_QSTATV_VERSION1:
+		break;
+	default:
+		return -EINVAL;
+	}
+	ret = quota_getstatev(sb, &fqs);
+	if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+		return -EFAULT;
+	return ret;
+}
+
+/*
+ * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them
+ * out of there as xfsprogs rely on definitions being in that header file. So
+ * just define same functions here for quota purposes.
+ */
+#define XFS_BB_SHIFT 9
+
+static inline u64 quota_bbtob(u64 blocks)
+{
+	return blocks << XFS_BB_SHIFT;
+}
+
+static inline u64 quota_btobb(u64 bytes)
+{
+	return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT;
+}
+
+static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
+{
+	dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit);
+	dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit);
+	dst->d_ino_hardlimit = src->d_ino_hardlimit;
+	dst->d_ino_softlimit = src->d_ino_softlimit;
+	dst->d_space = quota_bbtob(src->d_bcount);
+	dst->d_ino_count = src->d_icount;
+	dst->d_ino_timer = src->d_itimer;
+	dst->d_spc_timer = src->d_btimer;
+	dst->d_ino_warns = src->d_iwarns;
+	dst->d_spc_warns = src->d_bwarns;
+	dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit);
+	dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit);
+	dst->d_rt_space = quota_bbtob(src->d_rtbcount);
+	dst->d_rt_spc_timer = src->d_rtbtimer;
+	dst->d_rt_spc_warns = src->d_rtbwarns;
+	dst->d_fieldmask = 0;
+	if (src->d_fieldmask & FS_DQ_ISOFT)
+		dst->d_fieldmask |= QC_INO_SOFT;
+	if (src->d_fieldmask & FS_DQ_IHARD)
+		dst->d_fieldmask |= QC_INO_HARD;
+	if (src->d_fieldmask & FS_DQ_BSOFT)
+		dst->d_fieldmask |= QC_SPC_SOFT;
+	if (src->d_fieldmask & FS_DQ_BHARD)
+		dst->d_fieldmask |= QC_SPC_HARD;
+	if (src->d_fieldmask & FS_DQ_RTBSOFT)
+		dst->d_fieldmask |= QC_RT_SPC_SOFT;
+	if (src->d_fieldmask & FS_DQ_RTBHARD)
+		dst->d_fieldmask |= QC_RT_SPC_HARD;
+	if (src->d_fieldmask & FS_DQ_BTIMER)
+		dst->d_fieldmask |= QC_SPC_TIMER;
+	if (src->d_fieldmask & FS_DQ_ITIMER)
+		dst->d_fieldmask |= QC_INO_TIMER;
+	if (src->d_fieldmask & FS_DQ_RTBTIMER)
+		dst->d_fieldmask |= QC_RT_SPC_TIMER;
+	if (src->d_fieldmask & FS_DQ_BWARNS)
+		dst->d_fieldmask |= QC_SPC_WARNS;
+	if (src->d_fieldmask & FS_DQ_IWARNS)
+		dst->d_fieldmask |= QC_INO_WARNS;
+	if (src->d_fieldmask & FS_DQ_RTBWARNS)
+		dst->d_fieldmask |= QC_RT_SPC_WARNS;
+	if (src->d_fieldmask & FS_DQ_BCOUNT)
+		dst->d_fieldmask |= QC_SPACE;
+	if (src->d_fieldmask & FS_DQ_ICOUNT)
+		dst->d_fieldmask |= QC_INO_COUNT;
+	if (src->d_fieldmask & FS_DQ_RTBCOUNT)
+		dst->d_fieldmask |= QC_RT_SPACE;
+}
+
+static void copy_qcinfo_from_xfs_dqblk(struct qc_info *dst,
+				       struct fs_disk_quota *src)
+{
+	memset(dst, 0, sizeof(*dst));
+	dst->i_spc_timelimit = src->d_btimer;
+	dst->i_ino_timelimit = src->d_itimer;
+	dst->i_rt_spc_timelimit = src->d_rtbtimer;
+	dst->i_ino_warnlimit = src->d_iwarns;
+	dst->i_spc_warnlimit = src->d_bwarns;
+	dst->i_rt_spc_warnlimit = src->d_rtbwarns;
+	if (src->d_fieldmask & FS_DQ_BWARNS)
+		dst->i_fieldmask |= QC_SPC_WARNS;
+	if (src->d_fieldmask & FS_DQ_IWARNS)
+		dst->i_fieldmask |= QC_INO_WARNS;
+	if (src->d_fieldmask & FS_DQ_RTBWARNS)
+		dst->i_fieldmask |= QC_RT_SPC_WARNS;
+	if (src->d_fieldmask & FS_DQ_BTIMER)
+		dst->i_fieldmask |= QC_SPC_TIMER;
+	if (src->d_fieldmask & FS_DQ_ITIMER)
+		dst->i_fieldmask |= QC_INO_TIMER;
+	if (src->d_fieldmask & FS_DQ_RTBTIMER)
+		dst->i_fieldmask |= QC_RT_SPC_TIMER;
+}
+
+static int quota_setxquota(struct super_block *sb, int type, qid_t id,
+			   void __user *addr)
+{
+	struct fs_disk_quota fdq;
+	struct qc_dqblk qdq;
+	struct kqid qid;
+
+	if (copy_from_user(&fdq, addr, sizeof(fdq)))
+		return -EFAULT;
+	if (!sb->s_qcop->set_dqblk)
+		return -ENOSYS;
+	qid = make_kqid(current_user_ns(), type, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+	/* Are we actually setting timer / warning limits for all users? */
+	if (from_kqid(&init_user_ns, qid) == 0 &&
+	    fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) {
+		struct qc_info qinfo;
+		int ret;
+
+		if (!sb->s_qcop->set_info)
+			return -EINVAL;
+		copy_qcinfo_from_xfs_dqblk(&qinfo, &fdq);
+		ret = sb->s_qcop->set_info(sb, type, &qinfo);
+		if (ret)
+			return ret;
+		/* These are already done */
+		fdq.d_fieldmask &= ~(FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK);
+	}
+	copy_from_xfs_dqblk(&qdq, &fdq);
+	return sb->s_qcop->set_dqblk(sb, qid, &qdq);
+}
+
+static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src,
+			      int type, qid_t id)
+{
+	memset(dst, 0, sizeof(*dst));
+	dst->d_version = FS_DQUOT_VERSION;
+	dst->d_id = id;
+	if (type == USRQUOTA)
+		dst->d_flags = FS_USER_QUOTA;
+	else if (type == PRJQUOTA)
+		dst->d_flags = FS_PROJ_QUOTA;
+	else
+		dst->d_flags = FS_GROUP_QUOTA;
+	dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit);
+	dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit);
+	dst->d_ino_hardlimit = src->d_ino_hardlimit;
+	dst->d_ino_softlimit = src->d_ino_softlimit;
+	dst->d_bcount = quota_btobb(src->d_space);
+	dst->d_icount = src->d_ino_count;
+	dst->d_itimer = src->d_ino_timer;
+	dst->d_btimer = src->d_spc_timer;
+	dst->d_iwarns = src->d_ino_warns;
+	dst->d_bwarns = src->d_spc_warns;
+	dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit);
+	dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit);
+	dst->d_rtbcount = quota_btobb(src->d_rt_space);
+	dst->d_rtbtimer = src->d_rt_spc_timer;
+	dst->d_rtbwarns = src->d_rt_spc_warns;
+}
+
+static int quota_getxquota(struct super_block *sb, int type, qid_t id,
+			   void __user *addr)
+{
+	struct fs_disk_quota fdq;
+	struct qc_dqblk qdq;
+	struct kqid qid;
+	int ret;
+
+	if (!sb->s_qcop->get_dqblk)
+		return -ENOSYS;
+	qid = make_kqid(current_user_ns(), type, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+	ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
+	if (ret)
+		return ret;
+	copy_to_xfs_dqblk(&fdq, &qdq, type, id);
+	if (copy_to_user(addr, &fdq, sizeof(fdq)))
+		return -EFAULT;
+	return ret;
+}
+
+static int quota_rmxquota(struct super_block *sb, void __user *addr)
+{
+	__u32 flags;
+
+	if (copy_from_user(&flags, addr, sizeof(flags)))
+		return -EFAULT;
+	if (!sb->s_qcop->rm_xquota)
+		return -ENOSYS;
+	return sb->s_qcop->rm_xquota(sb, flags);
+}
+
+/* Copy parameters and call proper function */
+static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
+		       void __user *addr, struct path *path)
+{
+	int ret;
+
+	if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
+		return -EINVAL;
+	/*
+	 * Quota not supported on this fs? Check this before s_quota_types
+	 * since they needn't be set if quota is not supported at all.
+	 */
+	if (!sb->s_qcop)
+		return -ENOSYS;
+	if (!(sb->s_quota_types & (1 << type)))
+		return -EINVAL;
+
+	ret = check_quotactl_permission(sb, type, cmd, id);
+	if (ret < 0)
+		return ret;
+
+	switch (cmd) {
+	case Q_QUOTAON:
+		return quota_quotaon(sb, type, cmd, id, path);
+	case Q_QUOTAOFF:
+		return quota_quotaoff(sb, type);
+	case Q_GETFMT:
+		return quota_getfmt(sb, type, addr);
+	case Q_GETINFO:
+		return quota_getinfo(sb, type, addr);
+	case Q_SETINFO:
+		return quota_setinfo(sb, type, addr);
+	case Q_GETQUOTA:
+		return quota_getquota(sb, type, id, addr);
+	case Q_SETQUOTA:
+		return quota_setquota(sb, type, id, addr);
+	case Q_SYNC:
+		if (!sb->s_qcop->quota_sync)
+			return -ENOSYS;
+		return sb->s_qcop->quota_sync(sb, type);
+	case Q_XQUOTAON:
+		return quota_enable(sb, addr);
+	case Q_XQUOTAOFF:
+		return quota_disable(sb, addr);
+	case Q_XQUOTARM:
+		return quota_rmxquota(sb, addr);
+	case Q_XGETQSTAT:
+		return quota_getxstate(sb, addr);
+	case Q_XGETQSTATV:
+		return quota_getxstatev(sb, addr);
+	case Q_XSETQLIM:
+		return quota_setxquota(sb, type, id, addr);
+	case Q_XGETQUOTA:
+		return quota_getxquota(sb, type, id, addr);
+	case Q_XQUOTASYNC:
+		if (sb->s_flags & MS_RDONLY)
+			return -EROFS;
+		/* XFS quotas are fully coherent now, making this call a noop */
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+#ifdef CONFIG_BLOCK
+
+/* Return 1 if 'cmd' will block on frozen filesystem */
+static int quotactl_cmd_write(int cmd)
+{
+	switch (cmd) {
+	case Q_GETFMT:
+	case Q_GETINFO:
+	case Q_SYNC:
+	case Q_XGETQSTAT:
+	case Q_XGETQSTATV:
+	case Q_XGETQUOTA:
+	case Q_XQUOTASYNC:
+		return 0;
+	}
+	return 1;
+}
+
+#endif /* CONFIG_BLOCK */
+
+/*
+ * look up a superblock on which quota ops will be performed
+ * - use the name of a block device to find the superblock thereon
+ */
+static struct super_block *quotactl_block(const char __user *special, int cmd)
+{
+#ifdef CONFIG_BLOCK
+	struct block_device *bdev;
+	struct super_block *sb;
+	struct filename *tmp = getname(special);
+
+	if (IS_ERR(tmp))
+		return ERR_CAST(tmp);
+	bdev = lookup_bdev(tmp->name);
+	putname(tmp);
+	if (IS_ERR(bdev))
+		return ERR_CAST(bdev);
+	if (quotactl_cmd_write(cmd))
+		sb = get_super_thawed(bdev);
+	else
+		sb = get_super(bdev);
+	bdput(bdev);
+	if (!sb)
+		return ERR_PTR(-ENODEV);
+
+	return sb;
+#else
+	return ERR_PTR(-ENODEV);
+#endif
+}
+
+/*
+ * This is the system call interface. This communicates with
+ * the user-level programs. Currently this only supports diskquota
+ * calls. Maybe we need to add the process quotas etc. in the future,
+ * but we probably should use rlimits for that.
+ */
+SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
+		qid_t, id, void __user *, addr)
+{
+	uint cmds, type;
+	struct super_block *sb = NULL;
+	struct path path, *pathp = NULL;
+	int ret;
+
+	cmds = cmd >> SUBCMDSHIFT;
+	type = cmd & SUBCMDMASK;
+
+	/*
+	 * As a special case Q_SYNC can be called without a specific device.
+	 * It will iterate all superblocks that have quota enabled and call
+	 * the sync action on each of them.
+	 */
+	if (!special) {
+		if (cmds == Q_SYNC)
+			return quota_sync_all(type);
+		return -ENODEV;
+	}
+
+	/*
+	 * Path for quotaon has to be resolved before grabbing superblock
+	 * because that gets s_umount sem which is also possibly needed by path
+	 * resolution (think about autofs) and thus deadlocks could arise.
+	 */
+	if (cmds == Q_QUOTAON) {
+		ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+		if (ret)
+			pathp = ERR_PTR(ret);
+		else
+			pathp = &path;
+	}
+
+	sb = quotactl_block(special, cmds);
+	if (IS_ERR(sb)) {
+		ret = PTR_ERR(sb);
+		goto out;
+	}
+
+	ret = do_quotactl(sb, type, cmds, id, addr, pathp);
+
+	drop_super(sb);
+out:
+	if (pathp && !IS_ERR(pathp))
+		path_put(pathp);
+	return ret;
+}
diff --git a/kernel/fs/quota/quota_tree.c b/kernel/fs/quota/quota_tree.c
new file mode 100644
index 000000000..58efb83de
--- /dev/null
+++ b/kernel/fs/quota/quota_tree.c
@@ -0,0 +1,670 @@
+/*
+ *	vfsv0 quota IO operations on file
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/dqblk_v2.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+
+#include <asm/byteorder.h>
+
+#include "quota_tree.h"
+
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Quota trie support");
+MODULE_LICENSE("GPL");
+
+#define __QUOTA_QT_PARANOIA
+
+static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
+{
+	unsigned int epb = info->dqi_usable_bs >> 2;
+	qid_t id = from_kqid(&init_user_ns, qid);
+
+	depth = info->dqi_qtree_depth - depth - 1;
+	while (depth--)
+		id /= epb;
+	return id % epb;
+}
+
+/* Number of entries in one blocks */
+static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+{
+	return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
+	       / info->dqi_entry_size;
+}
+
+static char *getdqbuf(size_t size)
+{
+	char *buf = kmalloc(size, GFP_NOFS);
+	if (!buf)
+		printk(KERN_WARNING
+		       "VFS: Not enough memory for quota buffers.\n");
+	return buf;
+}
+
+static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	memset(buf, 0, info->dqi_usable_bs);
+	return sb->s_op->quota_read(sb, info->dqi_type, buf,
+	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+
+static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
+{
+	struct super_block *sb = info->dqi_sb;
+	ssize_t ret;
+
+	ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
+	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+	if (ret != info->dqi_usable_bs) {
+		quota_error(sb, "dquota write failed");
+		if (ret >= 0)
+			ret = -EIO;
+	}
+	return ret;
+}
+
+/* Remove empty block from list and return it */
+static int get_free_dqblk(struct qtree_mem_dqinfo *info)
+{
+	char *buf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int ret, blk;
+
+	if (!buf)
+		return -ENOMEM;
+	if (info->dqi_free_blk) {
+		blk = info->dqi_free_blk;
+		ret = read_blk(info, blk, buf);
+		if (ret < 0)
+			goto out_buf;
+		info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
+	}
+	else {
+		memset(buf, 0, info->dqi_usable_bs);
+		/* Assure block allocation... */
+		ret = write_blk(info, info->dqi_blocks, buf);
+		if (ret < 0)
+			goto out_buf;
+		blk = info->dqi_blocks++;
+	}
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	ret = blk;
+out_buf:
+	kfree(buf);
+	return ret;
+}
+
+/* Insert empty block to the list */
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
+{
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int err;
+
+	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
+	dh->dqdh_prev_free = cpu_to_le32(0);
+	dh->dqdh_entries = cpu_to_le16(0);
+	err = write_blk(info, blk, buf);
+	if (err < 0)
+		return err;
+	info->dqi_free_blk = blk;
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	return 0;
+}
+
+/* Remove given block from the list of blocks with free entries */
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
+			       uint blk)
+{
+	char *tmpbuf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	uint nextblk = le32_to_cpu(dh->dqdh_next_free);
+	uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
+	int err;
+
+	if (!tmpbuf)
+		return -ENOMEM;
+	if (nextblk) {
+		err = read_blk(info, nextblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+							dh->dqdh_prev_free;
+		err = write_blk(info, nextblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	}
+	if (prevblk) {
+		err = read_blk(info, prevblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
+							dh->dqdh_next_free;
+		err = write_blk(info, prevblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	} else {
+		info->dqi_free_entry = nextblk;
+		mark_info_dirty(info->dqi_sb, info->dqi_type);
+	}
+	kfree(tmpbuf);
+	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
+	/* No matter whether write succeeds block is out of list */
+	if (write_blk(info, blk, buf) < 0)
+		quota_error(info->dqi_sb, "Can't write block (%u) "
+			    "with free entries", blk);
+	return 0;
+out_buf:
+	kfree(tmpbuf);
+	return err;
+}
+
+/* Insert given block to the beginning of list with free entries */
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
+			       uint blk)
+{
+	char *tmpbuf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int err;
+
+	if (!tmpbuf)
+		return -ENOMEM;
+	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
+	dh->dqdh_prev_free = cpu_to_le32(0);
+	err = write_blk(info, blk, buf);
+	if (err < 0)
+		goto out_buf;
+	if (info->dqi_free_entry) {
+		err = read_blk(info, info->dqi_free_entry, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+							cpu_to_le32(blk);
+		err = write_blk(info, info->dqi_free_entry, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	}
+	kfree(tmpbuf);
+	info->dqi_free_entry = blk;
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	return 0;
+out_buf:
+	kfree(tmpbuf);
+	return err;
+}
+
+/* Is the entry in the block free? */
+int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
+{
+	int i;
+
+	for (i = 0; i < info->dqi_entry_size; i++)
+		if (disk[i])
+			return 0;
+	return 1;
+}
+EXPORT_SYMBOL(qtree_entry_unused);
+
+/* Find space for dquot */
+static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+			      struct dquot *dquot, int *err)
+{
+	uint blk, i;
+	struct qt_disk_dqdbheader *dh;
+	char *buf = getdqbuf(info->dqi_usable_bs);
+	char *ddquot;
+
+	*err = 0;
+	if (!buf) {
+		*err = -ENOMEM;
+		return 0;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	if (info->dqi_free_entry) {
+		blk = info->dqi_free_entry;
+		*err = read_blk(info, blk, buf);
+		if (*err < 0)
+			goto out_buf;
+	} else {
+		blk = get_free_dqblk(info);
+		if ((int)blk < 0) {
+			*err = blk;
+			kfree(buf);
+			return 0;
+		}
+		memset(buf, 0, info->dqi_usable_bs);
+		/* This is enough as the block is already zeroed and the entry
+		 * list is empty... */
+		info->dqi_free_entry = blk;
+		mark_info_dirty(dquot->dq_sb, dquot->dq_id.type);
+	}
+	/* Block will be full? */
+	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
+		*err = remove_free_dqentry(info, buf, blk);
+		if (*err < 0) {
+			quota_error(dquot->dq_sb, "Can't remove block (%u) "
+				    "from entry free list", blk);
+			goto out_buf;
+		}
+	}
+	le16_add_cpu(&dh->dqdh_entries, 1);
+	/* Find free structure in block */
+	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
+	for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
+		if (qtree_entry_unused(info, ddquot))
+			break;
+		ddquot += info->dqi_entry_size;
+	}
+#ifdef __QUOTA_QT_PARANOIA
+	if (i == qtree_dqstr_in_blk(info)) {
+		quota_error(dquot->dq_sb, "Data block full but it shouldn't");
+		*err = -EIO;
+		goto out_buf;
+	}
+#endif
+	*err = write_blk(info, blk, buf);
+	if (*err < 0) {
+		quota_error(dquot->dq_sb, "Can't write quota data block %u",
+			    blk);
+		goto out_buf;
+	}
+	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
+			sizeof(struct qt_disk_dqdbheader) +
+			i * info->dqi_entry_size;
+	kfree(buf);
+	return blk;
+out_buf:
+	kfree(buf);
+	return 0;
+}
+
+/* Insert reference to structure into the trie */
+static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			  uint *treeblk, int depth)
+{
+	char *buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0, newson = 0, newact = 0;
+	__le32 *ref;
+	uint newblk;
+
+	if (!buf)
+		return -ENOMEM;
+	if (!*treeblk) {
+		ret = get_free_dqblk(info);
+		if (ret < 0)
+			goto out_buf;
+		*treeblk = ret;
+		memset(buf, 0, info->dqi_usable_bs);
+		newact = 1;
+	} else {
+		ret = read_blk(info, *treeblk, buf);
+		if (ret < 0) {
+			quota_error(dquot->dq_sb, "Can't read tree quota "
+				    "block %u", *treeblk);
+			goto out_buf;
+		}
+	}
+	ref = (__le32 *)buf;
+	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (!newblk)
+		newson = 1;
+	if (depth == info->dqi_qtree_depth - 1) {
+#ifdef __QUOTA_QT_PARANOIA
+		if (newblk) {
+			quota_error(dquot->dq_sb, "Inserting already present "
+				    "quota entry (block %u)",
+				    le32_to_cpu(ref[get_index(info,
+						dquot->dq_id, depth)]));
+			ret = -EIO;
+			goto out_buf;
+		}
+#endif
+		newblk = find_free_dqentry(info, dquot, &ret);
+	} else {
+		ret = do_insert_tree(info, dquot, &newblk, depth+1);
+	}
+	if (newson && ret >= 0) {
+		ref[get_index(info, dquot->dq_id, depth)] =
+							cpu_to_le32(newblk);
+		ret = write_blk(info, *treeblk, buf);
+	} else if (newact && ret < 0) {
+		put_free_dqblk(info, buf, *treeblk);
+	}
+out_buf:
+	kfree(buf);
+	return ret;
+}
+
+/* Wrapper for inserting quota structure into tree */
+static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot)
+{
+	int tmp = QT_TREEOFF;
+
+#ifdef __QUOTA_QT_PARANOIA
+	if (info->dqi_blocks <= QT_TREEOFF) {
+		quota_error(dquot->dq_sb, "Quota tree root isn't allocated!");
+		return -EIO;
+	}
+#endif
+	return do_insert_tree(info, dquot, &tmp, 0);
+}
+
+/*
+ * We don't have to be afraid of deadlocks as we never have quotas on quota
+ * files...
+ */
+int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	int type = dquot->dq_id.type;
+	struct super_block *sb = dquot->dq_sb;
+	ssize_t ret;
+	char *ddquot = getdqbuf(info->dqi_entry_size);
+
+	if (!ddquot)
+		return -ENOMEM;
+
+	/* dq_off is guarded by dqio_mutex */
+	if (!dquot->dq_off) {
+		ret = dq_insert_tree(info, dquot);
+		if (ret < 0) {
+			quota_error(sb, "Error %zd occurred while creating "
+				    "quota", ret);
+			kfree(ddquot);
+			return ret;
+		}
+	}
+	spin_lock(&dq_data_lock);
+	info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
+	spin_unlock(&dq_data_lock);
+	ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
+				    dquot->dq_off);
+	if (ret != info->dqi_entry_size) {
+		quota_error(sb, "dquota write failed");
+		if (ret >= 0)
+			ret = -ENOSPC;
+	} else {
+		ret = 0;
+	}
+	dqstats_inc(DQST_WRITES);
+	kfree(ddquot);
+
+	return ret;
+}
+EXPORT_SYMBOL(qtree_write_dquot);
+
+/* Free dquot entry in data block */
+static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			uint blk)
+{
+	struct qt_disk_dqdbheader *dh;
+	char *buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0;
+
+	if (!buf)
+		return -ENOMEM;
+	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
+		quota_error(dquot->dq_sb, "Quota structure has offset to "
+			"other block (%u) than it should (%u)", blk,
+			(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		goto out_buf;
+	}
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		quota_error(dquot->dq_sb, "Can't read quota data block %u",
+			    blk);
+		goto out_buf;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	le16_add_cpu(&dh->dqdh_entries, -1);
+	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
+		ret = remove_free_dqentry(info, buf, blk);
+		if (ret >= 0)
+			ret = put_free_dqblk(info, buf, blk);
+		if (ret < 0) {
+			quota_error(dquot->dq_sb, "Can't move quota data block "
+				    "(%u) to free list", blk);
+			goto out_buf;
+		}
+	} else {
+		memset(buf +
+		       (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
+		       0, info->dqi_entry_size);
+		if (le16_to_cpu(dh->dqdh_entries) ==
+		    qtree_dqstr_in_blk(info) - 1) {
+			/* Insert will write block itself */
+			ret = insert_free_dqentry(info, buf, blk);
+			if (ret < 0) {
+				quota_error(dquot->dq_sb, "Can't insert quota "
+				    "data block (%u) to free entry list", blk);
+				goto out_buf;
+			}
+		} else {
+			ret = write_blk(info, blk, buf);
+			if (ret < 0) {
+				quota_error(dquot->dq_sb, "Can't write quota "
+					    "data block %u", blk);
+				goto out_buf;
+			}
+		}
+	}
+	dquot->dq_off = 0;	/* Quota is now unattached */
+out_buf:
+	kfree(buf);
+	return ret;
+}
+
+/* Remove reference to dquot from tree */
+static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+		       uint *blk, int depth)
+{
+	char *buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0;
+	uint newblk;
+	__le32 *ref = (__le32 *)buf;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, *blk, buf);
+	if (ret < 0) {
+		quota_error(dquot->dq_sb, "Can't read quota data block %u",
+			    *blk);
+		goto out_buf;
+	}
+	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (depth == info->dqi_qtree_depth - 1) {
+		ret = free_dqentry(info, dquot, newblk);
+		newblk = 0;
+	} else {
+		ret = remove_tree(info, dquot, &newblk, depth+1);
+	}
+	if (ret >= 0 && !newblk) {
+		int i;
+		ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
+		/* Block got empty? */
+		for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++)
+			;
+		/* Don't put the root block into the free block list */
+		if (i == (info->dqi_usable_bs >> 2)
+		    && *blk != QT_TREEOFF) {
+			put_free_dqblk(info, buf, *blk);
+			*blk = 0;
+		} else {
+			ret = write_blk(info, *blk, buf);
+			if (ret < 0)
+				quota_error(dquot->dq_sb,
+					    "Can't write quota tree block %u",
+					    *blk);
+		}
+	}
+out_buf:
+	kfree(buf);
+	return ret;
+}
+
+/* Delete dquot from tree */
+int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	uint tmp = QT_TREEOFF;
+
+	if (!dquot->dq_off)	/* Even not allocated? */
+		return 0;
+	return remove_tree(info, dquot, &tmp, 0);
+}
+EXPORT_SYMBOL(qtree_delete_dquot);
+
+/* Find entry in block */
+static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot, uint blk)
+{
+	char *buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	int i;
+	char *ddquot;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		quota_error(dquot->dq_sb, "Can't read quota tree "
+			    "block %u", blk);
+		goto out_buf;
+	}
+	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
+	for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
+		if (info->dqi_ops->is_id(ddquot, dquot))
+			break;
+		ddquot += info->dqi_entry_size;
+	}
+	if (i == qtree_dqstr_in_blk(info)) {
+		quota_error(dquot->dq_sb,
+			    "Quota for id %u referenced but not present",
+			    from_kqid(&init_user_ns, dquot->dq_id));
+		ret = -EIO;
+		goto out_buf;
+	} else {
+		ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
+		  qt_disk_dqdbheader) + i * info->dqi_entry_size;
+	}
+out_buf:
+	kfree(buf);
+	return ret;
+}
+
+/* Find entry for given id in the tree */
+static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
+				struct dquot *dquot, uint blk, int depth)
+{
+	char *buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	__le32 *ref = (__le32 *)buf;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		quota_error(dquot->dq_sb, "Can't read quota tree block %u",
+			    blk);
+		goto out_buf;
+	}
+	ret = 0;
+	blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (!blk)	/* No reference? */
+		goto out_buf;
+	if (depth < info->dqi_qtree_depth - 1)
+		ret = find_tree_dqentry(info, dquot, blk, depth+1);
+	else
+		ret = find_block_dqentry(info, dquot, blk);
+out_buf:
+	kfree(buf);
+	return ret;
+}
+
+/* Find entry for given id in the tree - wrapper function */
+static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
+				  struct dquot *dquot)
+{
+	return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+}
+
+int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	int type = dquot->dq_id.type;
+	struct super_block *sb = dquot->dq_sb;
+	loff_t offset;
+	char *ddquot;
+	int ret = 0;
+
+#ifdef __QUOTA_QT_PARANOIA
+	/* Invalidated quota? */
+	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
+		quota_error(sb, "Quota invalidated while reading!");
+		return -EIO;
+	}
+#endif
+	/* Do we know offset of the dquot entry in the quota file? */
+	if (!dquot->dq_off) {
+		offset = find_dqentry(info, dquot);
+		if (offset <= 0) {	/* Entry not present? */
+			if (offset < 0)
+				quota_error(sb,"Can't read quota structure "
+					    "for id %u",
+					    from_kqid(&init_user_ns,
+						      dquot->dq_id));
+			dquot->dq_off = 0;
+			set_bit(DQ_FAKE_B, &dquot->dq_flags);
+			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+			ret = offset;
+			goto out;
+		}
+		dquot->dq_off = offset;
+	}
+	ddquot = getdqbuf(info->dqi_entry_size);
+	if (!ddquot)
+		return -ENOMEM;
+	ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
+				   dquot->dq_off);
+	if (ret != info->dqi_entry_size) {
+		if (ret >= 0)
+			ret = -EIO;
+		quota_error(sb, "Error while reading quota structure for id %u",
+			    from_kqid(&init_user_ns, dquot->dq_id));
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+		kfree(ddquot);
+		goto out;
+	}
+	spin_lock(&dq_data_lock);
+	info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
+	if (!dquot->dq_dqb.dqb_bhardlimit &&
+	    !dquot->dq_dqb.dqb_bsoftlimit &&
+	    !dquot->dq_dqb.dqb_ihardlimit &&
+	    !dquot->dq_dqb.dqb_isoftlimit)
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+	spin_unlock(&dq_data_lock);
+	kfree(ddquot);
+out:
+	dqstats_inc(DQST_READS);
+	return ret;
+}
+EXPORT_SYMBOL(qtree_read_dquot);
+
+/* Check whether dquot should not be deleted. We know we are
+ * the only one operating on dquot (thanks to dq_lock) */
+int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) &&
+	    !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
+		return qtree_delete_dquot(info, dquot);
+	return 0;
+}
+EXPORT_SYMBOL(qtree_release_dquot);
diff --git a/kernel/fs/quota/quota_tree.h b/kernel/fs/quota/quota_tree.h
new file mode 100644
index 000000000..a1ab8db81
--- /dev/null
+++ b/kernel/fs/quota/quota_tree.h
@@ -0,0 +1,25 @@
+/*
+ *	Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTA_TREE_H
+#define _LINUX_QUOTA_TREE_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ *  Structure of header of block with quota structures. It is padded to 16 bytes so
+ *  there will be space for exactly 21 quota-entries in a block
+ */
+struct qt_disk_dqdbheader {
+	__le32 dqdh_next_free;	/* Number of next block with free entry */
+	__le32 dqdh_prev_free;	/* Number of previous block with free entry */
+	__le16 dqdh_entries;	/* Number of valid entries in block */
+	__le16 dqdh_pad1;
+	__le32 dqdh_pad2;
+};
+
+#define QT_TREEOFF	1		/* Offset of tree in file in blocks */
+
+#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/kernel/fs/quota/quota_v1.c b/kernel/fs/quota/quota_v1.c
new file mode 100644
index 000000000..8fe79bece
--- /dev/null
+++ b/kernel/fs/quota/quota_v1.c
@@ -0,0 +1,235 @@
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_v1.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <asm/byteorder.h>
+
+#include "quotaio_v1.h"
+
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Old quota format support");
+MODULE_LICENSE("GPL");
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
+
+static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
+{
+	m->dqb_ihardlimit = d->dqb_ihardlimit;
+	m->dqb_isoftlimit = d->dqb_isoftlimit;
+	m->dqb_curinodes = d->dqb_curinodes;
+	m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
+	m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
+	m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
+	m->dqb_itime = d->dqb_itime;
+	m->dqb_btime = d->dqb_btime;
+}
+
+static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
+{
+	d->dqb_ihardlimit = m->dqb_ihardlimit;
+	d->dqb_isoftlimit = m->dqb_isoftlimit;
+	d->dqb_curinodes = m->dqb_curinodes;
+	d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
+	d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
+	d->dqb_itime = m->dqb_itime;
+	d->dqb_btime = m->dqb_btime;
+}
+
+static int v1_read_dqblk(struct dquot *dquot)
+{
+	int type = dquot->dq_id.type;
+	struct v1_disk_dqblk dqblk;
+
+	if (!sb_dqopt(dquot->dq_sb)->files[type])
+		return -EINVAL;
+
+	/* Set structure to 0s in case read fails/is after end of file */
+	memset(&dqblk, 0, sizeof(struct v1_disk_dqblk));
+	dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk,
+			sizeof(struct v1_disk_dqblk),
+			v1_dqoff(from_kqid(&init_user_ns, dquot->dq_id)));
+
+	v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk);
+	if (dquot->dq_dqb.dqb_bhardlimit == 0 &&
+	    dquot->dq_dqb.dqb_bsoftlimit == 0 &&
+	    dquot->dq_dqb.dqb_ihardlimit == 0 &&
+	    dquot->dq_dqb.dqb_isoftlimit == 0)
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+	dqstats_inc(DQST_READS);
+
+	return 0;
+}
+
+static int v1_commit_dqblk(struct dquot *dquot)
+{
+	short type = dquot->dq_id.type;
+	ssize_t ret;
+	struct v1_disk_dqblk dqblk;
+
+	v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb);
+	if (((type == USRQUOTA) && uid_eq(dquot->dq_id.uid, GLOBAL_ROOT_UID)) ||
+	    ((type == GRPQUOTA) && gid_eq(dquot->dq_id.gid, GLOBAL_ROOT_GID))) {
+		dqblk.dqb_btime =
+			sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace;
+		dqblk.dqb_itime =
+			sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace;
+	}
+	ret = 0;
+	if (sb_dqopt(dquot->dq_sb)->files[type])
+		ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
+			(char *)&dqblk, sizeof(struct v1_disk_dqblk),
+			v1_dqoff(from_kqid(&init_user_ns, dquot->dq_id)));
+	if (ret != sizeof(struct v1_disk_dqblk)) {
+		quota_error(dquot->dq_sb, "dquota write failed");
+		if (ret >= 0)
+			ret = -EIO;
+		goto out;
+	}
+	ret = 0;
+
+out:
+	dqstats_inc(DQST_WRITES);
+
+	return ret;
+}
+
+/* Magics of new quota format */
+#define V2_INITQMAGICS {\
+	0xd9c01f11,     /* USRQUOTA */\
+	0xd9c01927      /* GRPQUOTA */\
+}
+
+/* Header of new quota format */
+struct v2_disk_dqheader {
+	__le32 dqh_magic;        /* Magic number identifying file */
+	__le32 dqh_version;      /* File version */
+};
+
+static int v1_check_quota_file(struct super_block *sb, int type)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	ulong blocks;
+	size_t off; 
+	struct v2_disk_dqheader dqhead;
+	ssize_t size;
+	loff_t isize;
+	static const uint quota_magics[] = V2_INITQMAGICS;
+
+	isize = i_size_read(inode);
+	if (!isize)
+		return 0;
+	blocks = isize >> BLOCK_SIZE_BITS;
+	off = isize & (BLOCK_SIZE - 1);
+	if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) %
+	    sizeof(struct v1_disk_dqblk))
+		return 0;
+	/* Doublecheck whether we didn't get file with new format - with old
+	 * quotactl() this could happen */
+	size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
+				    sizeof(struct v2_disk_dqheader), 0);
+	if (size != sizeof(struct v2_disk_dqheader))
+		return 1;	/* Probably not new format */
+	if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type])
+		return 1;	/* Definitely not new format */
+	printk(KERN_INFO
+	       "VFS: %s: Refusing to turn on old quota format on given file."
+	       " It probably contains newer quota format.\n", sb->s_id);
+        return 0;		/* Seems like a new format file -> refuse it */
+}
+
+static int v1_read_file_info(struct super_block *sb, int type)
+{
+	struct quota_info *dqopt = sb_dqopt(sb);
+	struct v1_disk_dqblk dqblk;
+	int ret;
+
+	ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				sizeof(struct v1_disk_dqblk), v1_dqoff(0));
+	if (ret != sizeof(struct v1_disk_dqblk)) {
+		if (ret >= 0)
+			ret = -EIO;
+		goto out;
+	}
+	ret = 0;
+	/* limits are stored as unsigned 32-bit data */
+	dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
+	dqopt->info[type].dqi_max_ino_limit = 0xffffffff;
+	dqopt->info[type].dqi_igrace =
+			dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
+	dqopt->info[type].dqi_bgrace =
+			dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
+out:
+	return ret;
+}
+
+static int v1_write_file_info(struct super_block *sb, int type)
+{
+	struct quota_info *dqopt = sb_dqopt(sb);
+	struct v1_disk_dqblk dqblk;
+	int ret;
+
+	dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY;
+	ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				sizeof(struct v1_disk_dqblk), v1_dqoff(0));
+	if (ret != sizeof(struct v1_disk_dqblk)) {
+		if (ret >= 0)
+			ret = -EIO;
+		goto out;
+	}
+	dqblk.dqb_itime = dqopt->info[type].dqi_igrace;
+	dqblk.dqb_btime = dqopt->info[type].dqi_bgrace;
+	ret = sb->s_op->quota_write(sb, type, (char *)&dqblk,
+	      sizeof(struct v1_disk_dqblk), v1_dqoff(0));
+	if (ret == sizeof(struct v1_disk_dqblk))
+		ret = 0;
+	else if (ret > 0)
+		ret = -EIO;
+out:
+	return ret;
+}
+
+static const struct quota_format_ops v1_format_ops = {
+	.check_quota_file	= v1_check_quota_file,
+	.read_file_info		= v1_read_file_info,
+	.write_file_info	= v1_write_file_info,
+	.free_file_info		= NULL,
+	.read_dqblk		= v1_read_dqblk,
+	.commit_dqblk		= v1_commit_dqblk,
+};
+
+static struct quota_format_type v1_quota_format = {
+	.qf_fmt_id	= QFMT_VFS_OLD,
+	.qf_ops		= &v1_format_ops,
+	.qf_owner	= THIS_MODULE
+};
+
+static int __init init_v1_quota_format(void)
+{
+        return register_quota_format(&v1_quota_format);
+}
+
+static void __exit exit_v1_quota_format(void)
+{
+        unregister_quota_format(&v1_quota_format);
+}
+
+module_init(init_v1_quota_format);
+module_exit(exit_v1_quota_format);
+
diff --git a/kernel/fs/quota/quota_v2.c b/kernel/fs/quota/quota_v2.c
new file mode 100644
index 000000000..2aa012a68
--- /dev/null
+++ b/kernel/fs/quota/quota_v2.c
@@ -0,0 +1,346 @@
+/*
+ *	vfsv0 quota IO operations on file
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/dqblk_v2.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+
+#include <asm/byteorder.h>
+
+#include "quota_tree.h"
+#include "quotaio_v2.h"
+
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Quota format v2 support");
+MODULE_LICENSE("GPL");
+
+#define __QUOTA_V2_PARANOIA
+
+static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2r0_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2r0_is_id(void *dp, struct dquot *dquot);
+static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2r1_is_id(void *dp, struct dquot *dquot);
+
+static struct qtree_fmt_operations v2r0_qtree_ops = {
+	.mem2disk_dqblk = v2r0_mem2diskdqb,
+	.disk2mem_dqblk = v2r0_disk2memdqb,
+	.is_id = v2r0_is_id,
+};
+
+static struct qtree_fmt_operations v2r1_qtree_ops = {
+	.mem2disk_dqblk = v2r1_mem2diskdqb,
+	.disk2mem_dqblk = v2r1_disk2memdqb,
+	.is_id = v2r1_is_id,
+};
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v2_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
+
+static int v2_read_header(struct super_block *sb, int type,
+			  struct v2_disk_dqheader *dqhead)
+{
+	ssize_t size;
+
+	size = sb->s_op->quota_read(sb, type, (char *)dqhead,
+				    sizeof(struct v2_disk_dqheader), 0);
+	if (size != sizeof(struct v2_disk_dqheader)) {
+		quota_error(sb, "Failed header read: expected=%zd got=%zd",
+			    sizeof(struct v2_disk_dqheader), size);
+		return 0;
+	}
+	return 1;
+}
+
+/* Check whether given file is really vfsv0 quotafile */
+static int v2_check_quota_file(struct super_block *sb, int type)
+{
+	struct v2_disk_dqheader dqhead;
+	static const uint quota_magics[] = V2_INITQMAGICS;
+	static const uint quota_versions[] = V2_INITQVERSIONS;
+ 
+	if (!v2_read_header(sb, type, &dqhead))
+		return 0;
+	if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
+	    le32_to_cpu(dqhead.dqh_version) > quota_versions[type])
+		return 0;
+	return 1;
+}
+
+/* Read information header from quota file */
+static int v2_read_file_info(struct super_block *sb, int type)
+{
+	struct v2_disk_dqinfo dinfo;
+	struct v2_disk_dqheader dqhead;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo;
+	ssize_t size;
+	unsigned int version;
+
+	if (!v2_read_header(sb, type, &dqhead))
+		return -1;
+	version = le32_to_cpu(dqhead.dqh_version);
+	if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) ||
+	    (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1))
+		return -1;
+
+	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+	if (size != sizeof(struct v2_disk_dqinfo)) {
+		quota_error(sb, "Can't read info structure");
+		return -1;
+	}
+	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+	if (!info->dqi_priv) {
+		printk(KERN_WARNING
+		       "Not enough memory for quota information structure.\n");
+		return -ENOMEM;
+	}
+	qinfo = info->dqi_priv;
+	if (version == 0) {
+		/* limits are stored as unsigned 32-bit data */
+		info->dqi_max_spc_limit = 0xffffffffLL << QUOTABLOCK_BITS;
+		info->dqi_max_ino_limit = 0xffffffff;
+	} else {
+		/*
+		 * Used space is stored as unsigned 64-bit value in bytes but
+		 * quota core supports only signed 64-bit values so use that
+		 * as a limit
+		 */
+		info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
+		info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
+	}
+	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+	/* No flags currently supported */
+	info->dqi_flags = 0;
+	qinfo->dqi_sb = sb;
+	qinfo->dqi_type = type;
+	qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+	qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+	qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+	if (version == 0) {
+		qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk);
+		qinfo->dqi_ops = &v2r0_qtree_ops;
+	} else {
+		qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk);
+		qinfo->dqi_ops = &v2r1_qtree_ops;
+	}
+	return 0;
+}
+
+/* Write information header to quota file */
+static int v2_write_file_info(struct super_block *sb, int type)
+{
+	struct v2_disk_dqinfo dinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
+	ssize_t size;
+
+	spin_lock(&dq_data_lock);
+	info->dqi_flags &= ~DQF_INFO_DIRTY;
+	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+	/* No flags currently supported */
+	dinfo.dqi_flags = cpu_to_le32(0);
+	spin_unlock(&dq_data_lock);
+	dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
+	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+	if (size != sizeof(struct v2_disk_dqinfo)) {
+		quota_error(sb, "Can't write info structure");
+		return -1;
+	}
+	return 0;
+}
+
+static void v2r0_disk2memdqb(struct dquot *dquot, void *dp)
+{
+	struct v2r0_disk_dqblk *d = dp, empty;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
+	m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
+	m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
+	m->dqb_itime = le64_to_cpu(d->dqb_itime);
+	m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
+	m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
+	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+	m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	/* We need to escape back all-zero structure */
+	memset(&empty, 0, sizeof(struct v2r0_disk_dqblk));
+	empty.dqb_itime = cpu_to_le64(1);
+	if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk)))
+		m->dqb_itime = 0;
+}
+
+static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+	struct v2r0_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
+
+	d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
+	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
+	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
+	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+	d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
+	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+	d->dqb_btime = cpu_to_le64(m->dqb_btime);
+	d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
+	if (qtree_entry_unused(info, dp))
+		d->dqb_itime = cpu_to_le64(1);
+}
+
+static int v2r0_is_id(void *dp, struct dquot *dquot)
+{
+	struct v2r0_disk_dqblk *d = dp;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
+
+	if (qtree_entry_unused(info, dp))
+		return 0;
+	return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
+				le32_to_cpu(d->dqb_id)),
+		      dquot->dq_id);
+}
+
+static void v2r1_disk2memdqb(struct dquot *dquot, void *dp)
+{
+	struct v2r1_disk_dqblk *d = dp, empty;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
+	m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
+	m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
+	m->dqb_itime = le64_to_cpu(d->dqb_itime);
+	m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit));
+	m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit));
+	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+	m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	/* We need to escape back all-zero structure */
+	memset(&empty, 0, sizeof(struct v2r1_disk_dqblk));
+	empty.dqb_itime = cpu_to_le64(1);
+	if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk)))
+		m->dqb_itime = 0;
+}
+
+static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+	struct v2r1_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
+
+	d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
+	d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
+	d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
+	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+	d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit));
+	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+	d->dqb_btime = cpu_to_le64(m->dqb_btime);
+	d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
+	if (qtree_entry_unused(info, dp))
+		d->dqb_itime = cpu_to_le64(1);
+}
+
+static int v2r1_is_id(void *dp, struct dquot *dquot)
+{
+	struct v2r1_disk_dqblk *d = dp;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
+
+	if (qtree_entry_unused(info, dp))
+		return 0;
+	return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
+				le32_to_cpu(d->dqb_id)),
+		      dquot->dq_id);
+}
+
+static int v2_read_dquot(struct dquot *dquot)
+{
+	return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+}
+
+static int v2_write_dquot(struct dquot *dquot)
+{
+	return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+}
+
+static int v2_release_dquot(struct dquot *dquot)
+{
+	return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+}
+
+static int v2_free_file_info(struct super_block *sb, int type)
+{
+	kfree(sb_dqinfo(sb, type)->dqi_priv);
+	return 0;
+}
+
+static const struct quota_format_ops v2_format_ops = {
+	.check_quota_file	= v2_check_quota_file,
+	.read_file_info		= v2_read_file_info,
+	.write_file_info	= v2_write_file_info,
+	.free_file_info		= v2_free_file_info,
+	.read_dqblk		= v2_read_dquot,
+	.commit_dqblk		= v2_write_dquot,
+	.release_dqblk		= v2_release_dquot,
+};
+
+static struct quota_format_type v2r0_quota_format = {
+	.qf_fmt_id	= QFMT_VFS_V0,
+	.qf_ops		= &v2_format_ops,
+	.qf_owner	= THIS_MODULE
+};
+
+static struct quota_format_type v2r1_quota_format = {
+	.qf_fmt_id	= QFMT_VFS_V1,
+	.qf_ops		= &v2_format_ops,
+	.qf_owner	= THIS_MODULE
+};
+
+static int __init init_v2_quota_format(void)
+{
+	int ret;
+
+	ret = register_quota_format(&v2r0_quota_format);
+	if (ret)
+		return ret;
+	return register_quota_format(&v2r1_quota_format);
+}
+
+static void __exit exit_v2_quota_format(void)
+{
+	unregister_quota_format(&v2r0_quota_format);
+	unregister_quota_format(&v2r1_quota_format);
+}
+
+module_init(init_v2_quota_format);
+module_exit(exit_v2_quota_format);
diff --git a/kernel/fs/quota/quotaio_v1.h b/kernel/fs/quota/quotaio_v1.h
new file mode 100644
index 000000000..746654b5d
--- /dev/null
+++ b/kernel/fs/quota/quotaio_v1.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_QUOTAIO_V1_H
+#define _LINUX_QUOTAIO_V1_H
+
+#include <linux/types.h>
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit, it is reset when they go below their soft limit.
+ */
+#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
+#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is an array of these structures
+ * indexed by user or group number.
+ */
+struct v1_disk_dqblk {
+	__u32 dqb_bhardlimit;	/* absolute limit on disk blks alloc */
+	__u32 dqb_bsoftlimit;	/* preferred limit on disk blks */
+	__u32 dqb_curblocks;	/* current block count */
+	__u32 dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	__u32 dqb_isoftlimit;	/* preferred inode limit */
+	__u32 dqb_curinodes;	/* current # allocated inodes */
+	time_t dqb_btime;	/* time limit for excessive disk use */
+	time_t dqb_itime;	/* time limit for excessive inode use */
+};
+
+#define v1_dqoff(UID)      ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
+
+#endif	/* _LINUX_QUOTAIO_V1_H */
diff --git a/kernel/fs/quota/quotaio_v2.h b/kernel/fs/quota/quotaio_v2.h
new file mode 100644
index 000000000..4e9543009
--- /dev/null
+++ b/kernel/fs/quota/quotaio_v2.h
@@ -0,0 +1,75 @@
+/*
+ *	Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTAIO_V2_H
+#define _LINUX_QUOTAIO_V2_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ * Definitions of magics and versions of current quota files
+ */
+#define V2_INITQMAGICS {\
+	0xd9c01f11,	/* USRQUOTA */\
+	0xd9c01927,	/* GRPQUOTA */\
+	0xd9c03f14,	/* PRJQUOTA */\
+}
+
+#define V2_INITQVERSIONS {\
+	1,		/* USRQUOTA */\
+	1,		/* GRPQUOTA */\
+	1,		/* PRJQUOTA */\
+}
+
+/* First generic header */
+struct v2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* File version */
+};
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is a radix tree whose leaves point
+ * to blocks of these structures.
+ */
+struct v2r0_disk_dqblk {
+	__le32 dqb_id;		/* id this quota applies to */
+	__le32 dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	__le32 dqb_isoftlimit;	/* preferred inode limit */
+	__le32 dqb_curinodes;	/* current # allocated inodes */
+	__le32 dqb_bhardlimit;	/* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+	__le32 dqb_bsoftlimit;	/* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+	__le64 dqb_curspace;	/* current space occupied (in bytes) */
+	__le64 dqb_btime;	/* time limit for excessive disk use */
+	__le64 dqb_itime;	/* time limit for excessive inode use */
+};
+
+struct v2r1_disk_dqblk {
+	__le32 dqb_id;		/* id this quota applies to */
+	__le32 dqb_pad;
+	__le64 dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	__le64 dqb_isoftlimit;	/* preferred inode limit */
+	__le64 dqb_curinodes;	/* current # allocated inodes */
+	__le64 dqb_bhardlimit;	/* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+	__le64 dqb_bsoftlimit;	/* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+	__le64 dqb_curspace;	/* current space occupied (in bytes) */
+	__le64 dqb_btime;	/* time limit for excessive disk use */
+	__le64 dqb_itime;	/* time limit for excessive inode use */
+};
+
+/* Header with type and version specific information */
+struct v2_disk_dqinfo {
+	__le32 dqi_bgrace;	/* Time before block soft limit becomes hard limit */
+	__le32 dqi_igrace;	/* Time before inode soft limit becomes hard limit */
+	__le32 dqi_flags;	/* Flags for quotafile (DQF_*) */
+	__le32 dqi_blocks;	/* Number of blocks in file */
+	__le32 dqi_free_blk;	/* Number of first free block in the list */
+	__le32 dqi_free_entry;	/* Number of block with at least one free entry */
+};
+
+#define V2_DQINFOOFF	sizeof(struct v2_disk_dqheader)	/* Offset of info header in file */
+#define V2_DQBLKSIZE_BITS 10				/* Size of leaf block in tree */
+
+#endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/kernel/fs/ramfs/Makefile b/kernel/fs/ramfs/Makefile
new file mode 100644
index 000000000..c71e65dca
--- /dev/null
+++ b/kernel/fs/ramfs/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the linux ramfs routines.
+#
+
+obj-y += ramfs.o
+
+file-mmu-y := file-nommu.o
+file-mmu-$(CONFIG_MMU) := file-mmu.o
+ramfs-objs += inode.o $(file-mmu-y)
diff --git a/kernel/fs/ramfs/file-mmu.c b/kernel/fs/ramfs/file-mmu.c
new file mode 100644
index 000000000..183a21269
--- /dev/null
+++ b/kernel/fs/ramfs/file-mmu.c
@@ -0,0 +1,46 @@
+/* file-mmu.c: ramfs MMU-based file operations
+ *
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ *
+ * Usage limits added by David Gibson, Linuxcare Australia.
+ * This file is released under the GPL.
+ */
+
+/*
+ * NOTE! This filesystem is probably most useful
+ * not as a real filesystem, but as an example of
+ * how virtual filesystems can be written.
+ *
+ * It doesn't get much simpler than this. Consider
+ * that this file implements the full semantics of
+ * a POSIX-compliant read-write filesystem.
+ *
+ * Note in particular how the filesystem does not
+ * need to implement any data structures of its own
+ * to keep track of the virtual data: using the VFS
+ * caches is sufficient.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/ramfs.h>
+
+#include "internal.h"
+
+const struct file_operations ramfs_file_operations = {
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.fsync		= noop_fsync,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.llseek		= generic_file_llseek,
+};
+
+const struct inode_operations ramfs_file_inode_operations = {
+	.setattr	= simple_setattr,
+	.getattr	= simple_getattr,
+};
diff --git a/kernel/fs/ramfs/file-nommu.c b/kernel/fs/ramfs/file-nommu.c
new file mode 100644
index 000000000..ba1323a94
--- /dev/null
+++ b/kernel/fs/ramfs/file-nommu.c
@@ -0,0 +1,271 @@
+/* file-nommu.c: no-MMU version of ramfs
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+#include <linux/pagevec.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+						   unsigned long addr,
+						   unsigned long len,
+						   unsigned long pgoff,
+						   unsigned long flags);
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+
+static unsigned ramfs_mmap_capabilities(struct file *file)
+{
+	return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ |
+		NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
+const struct file_operations ramfs_file_operations = {
+	.mmap_capabilities	= ramfs_mmap_capabilities,
+	.mmap			= ramfs_nommu_mmap,
+	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
+	.read_iter		= generic_file_read_iter,
+	.write_iter		= generic_file_write_iter,
+	.fsync			= noop_fsync,
+	.splice_read		= generic_file_splice_read,
+	.splice_write		= iter_file_splice_write,
+	.llseek			= generic_file_llseek,
+};
+
+const struct inode_operations ramfs_file_inode_operations = {
+	.setattr		= ramfs_nommu_setattr,
+	.getattr		= simple_getattr,
+};
+
+/*****************************************************************************/
+/*
+ * add a contiguous set of pages into a ramfs inode when it's truncated from
+ * size 0 on the assumption that it's going to be used for an mmap of shared
+ * memory
+ */
+int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+{
+	unsigned long npages, xpages, loop;
+	struct page *pages;
+	unsigned order;
+	void *data;
+	int ret;
+
+	/* make various checks */
+	order = get_order(newsize);
+	if (unlikely(order >= MAX_ORDER))
+		return -EFBIG;
+
+	ret = inode_newsize_ok(inode, newsize);
+	if (ret)
+		return ret;
+
+	i_size_write(inode, newsize);
+
+	/* allocate enough contiguous pages to be able to satisfy the
+	 * request */
+	pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+	if (!pages)
+		return -ENOMEM;
+
+	/* split the high-order page into an array of single pages */
+	xpages = 1UL << order;
+	npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	split_page(pages, order);
+
+	/* trim off any pages we don't actually require */
+	for (loop = npages; loop < xpages; loop++)
+		__free_page(pages + loop);
+
+	/* clear the memory we allocated */
+	newsize = PAGE_SIZE * npages;
+	data = page_address(pages);
+	memset(data, 0, newsize);
+
+	/* attach all the pages to the inode's address space */
+	for (loop = 0; loop < npages; loop++) {
+		struct page *page = pages + loop;
+
+		ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
+					GFP_KERNEL);
+		if (ret < 0)
+			goto add_error;
+
+		/* prevent the page from being discarded on memory pressure */
+		SetPageDirty(page);
+		SetPageUptodate(page);
+
+		unlock_page(page);
+		put_page(page);
+	}
+
+	return 0;
+
+add_error:
+	while (loop < npages)
+		__free_page(pages + loop++);
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ *
+ */
+static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
+{
+	int ret;
+
+	/* assume a truncate from zero size is going to be for the purposes of
+	 * shared mmap */
+	if (size == 0) {
+		if (unlikely(newsize >> 32))
+			return -EFBIG;
+
+		return ramfs_nommu_expand_for_mapping(inode, newsize);
+	}
+
+	/* check that a decrease in size doesn't cut off any shared mappings */
+	if (newsize < size) {
+		ret = nommu_shrink_inode_mappings(inode, size, newsize);
+		if (ret < 0)
+			return ret;
+	}
+
+	truncate_setsize(inode, newsize);
+	return 0;
+}
+
+/*****************************************************************************/
+/*
+ * handle a change of attributes
+ * - we're specifically interested in a change of size
+ */
+static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	struct inode *inode = d_inode(dentry);
+	unsigned int old_ia_valid = ia->ia_valid;
+	int ret = 0;
+
+	/* POSIX UID/GID verification for setting inode attributes */
+	ret = inode_change_ok(inode, ia);
+	if (ret)
+		return ret;
+
+	/* pick out size-changing events */
+	if (ia->ia_valid & ATTR_SIZE) {
+		loff_t size = inode->i_size;
+
+		if (ia->ia_size != size) {
+			ret = ramfs_nommu_resize(inode, ia->ia_size, size);
+			if (ret < 0 || ia->ia_valid == ATTR_SIZE)
+				goto out;
+		} else {
+			/* we skipped the truncate but must still update
+			 * timestamps
+			 */
+			ia->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+		}
+	}
+
+	setattr_copy(inode, ia);
+ out:
+	ia->ia_valid = old_ia_valid;
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * try to determine where a shared mapping can be made
+ * - we require that:
+ *   - the pages to be mapped must exist
+ *   - the pages be physically contiguous in sequence
+ */
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+					    unsigned long addr, unsigned long len,
+					    unsigned long pgoff, unsigned long flags)
+{
+	unsigned long maxpages, lpages, nr, loop, ret;
+	struct inode *inode = file_inode(file);
+	struct page **pages = NULL, **ptr, *page;
+	loff_t isize;
+
+	if (!(flags & MAP_SHARED))
+		return addr;
+
+	/* the mapping mustn't extend beyond the EOF */
+	lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	isize = i_size_read(inode);
+
+	ret = -EINVAL;
+	maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= maxpages)
+		goto out;
+
+	if (maxpages - pgoff < lpages)
+		goto out;
+
+	/* gang-find the pages */
+	ret = -ENOMEM;
+	pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out_free;
+
+	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+	if (nr != lpages)
+		goto out_free_pages; /* leave if some pages were missing */
+
+	/* check the pages for physical adjacency */
+	ptr = pages;
+	page = *ptr++;
+	page++;
+	for (loop = lpages; loop > 1; loop--)
+		if (*ptr++ != page++)
+			goto out_free_pages;
+
+	/* okay - all conditions fulfilled */
+	ret = (unsigned long) page_address(pages[0]);
+
+out_free_pages:
+	ptr = pages;
+	for (loop = nr; loop > 0; loop--)
+		put_page(*ptr++);
+out_free:
+	kfree(pages);
+out:
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * set up a mapping for shared memory segments
+ */
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!(vma->vm_flags & VM_SHARED))
+		return -ENOSYS;
+
+	file_accessed(file);
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
diff --git a/kernel/fs/ramfs/inode.c b/kernel/fs/ramfs/inode.c
new file mode 100644
index 000000000..889d558b4
--- /dev/null
+++ b/kernel/fs/ramfs/inode.c
@@ -0,0 +1,266 @@
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ *
+ * Usage limits added by David Gibson, Linuxcare Australia.
+ * This file is released under the GPL.
+ */
+
+/*
+ * NOTE! This filesystem is probably most useful
+ * not as a real filesystem, but as an example of
+ * how virtual filesystems can be written.
+ *
+ * It doesn't get much simpler than this. Consider
+ * that this file implements the full semantics of
+ * a POSIX-compliant read-write filesystem.
+ *
+ * Note in particular how the filesystem does not
+ * need to implement any data structures of its own
+ * to keep track of the virtual data: using the VFS
+ * caches is sufficient.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+#include <linux/sched.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+#define RAMFS_DEFAULT_MODE	0755
+
+static const struct super_operations ramfs_ops;
+static const struct inode_operations ramfs_dir_inode_operations;
+
+static const struct address_space_operations ramfs_aops = {
+	.readpage	= simple_readpage,
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
+	.set_page_dirty	= __set_page_dirty_no_writeback,
+};
+
+struct inode *ramfs_get_inode(struct super_block *sb,
+				const struct inode *dir, umode_t mode, dev_t dev)
+{
+	struct inode * inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode_init_owner(inode, dir, mode);
+		inode->i_mapping->a_ops = &ramfs_aops;
+		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+		mapping_set_unevictable(inode->i_mapping);
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		switch (mode & S_IFMT) {
+		default:
+			init_special_inode(inode, mode, dev);
+			break;
+		case S_IFREG:
+			inode->i_op = &ramfs_file_inode_operations;
+			inode->i_fop = &ramfs_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_op = &ramfs_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+
+			/* directory inodes start off with i_nlink == 2 (for "." entry) */
+			inc_nlink(inode);
+			break;
+		case S_IFLNK:
+			inode->i_op = &page_symlink_inode_operations;
+			break;
+		}
+	}
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+/* SMP-safe */
+static int
+ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
+	int error = -ENOSPC;
+
+	if (inode) {
+		d_instantiate(dentry, inode);
+		dget(dentry);	/* Extra count - pin the dentry in core */
+		error = 0;
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	}
+	return error;
+}
+
+static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+{
+	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	if (!retval)
+		inc_nlink(dir);
+	return retval;
+}
+
+static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+{
+	return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+{
+	struct inode *inode;
+	int error = -ENOSPC;
+
+	inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+	if (inode) {
+		int l = strlen(symname)+1;
+		error = page_symlink(inode, symname, l);
+		if (!error) {
+			d_instantiate(dentry, inode);
+			dget(dentry);
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+		} else
+			iput(inode);
+	}
+	return error;
+}
+
+static const struct inode_operations ramfs_dir_inode_operations = {
+	.create		= ramfs_create,
+	.lookup		= simple_lookup,
+	.link		= simple_link,
+	.unlink		= simple_unlink,
+	.symlink	= ramfs_symlink,
+	.mkdir		= ramfs_mkdir,
+	.rmdir		= simple_rmdir,
+	.mknod		= ramfs_mknod,
+	.rename		= simple_rename,
+};
+
+static const struct super_operations ramfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.show_options	= generic_show_options,
+};
+
+struct ramfs_mount_opts {
+	umode_t mode;
+};
+
+enum {
+	Opt_mode,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_mode, "mode=%o"},
+	{Opt_err, NULL}
+};
+
+struct ramfs_fs_info {
+	struct ramfs_mount_opts mount_opts;
+};
+
+static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int token;
+	char *p;
+
+	opts->mode = RAMFS_DEFAULT_MODE;
+
+	while ((p = strsep(&data, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->mode = option & S_IALLUGO;
+			break;
+		/*
+		 * We might like to report bad mount options here;
+		 * but traditionally ramfs has ignored all mount options,
+		 * and as it is used as a !CONFIG_SHMEM simple substitute
+		 * for tmpfs, better continue to ignore other mount options.
+		 */
+		}
+	}
+
+	return 0;
+}
+
+int ramfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct ramfs_fs_info *fsi;
+	struct inode *inode;
+	int err;
+
+	save_mount_options(sb, data);
+
+	fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+	sb->s_fs_info = fsi;
+	if (!fsi)
+		return -ENOMEM;
+
+	err = ramfs_parse_options(data, &fsi->mount_opts);
+	if (err)
+		return err;
+
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_blocksize		= PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_magic		= RAMFS_MAGIC;
+	sb->s_op		= &ramfs_ops;
+	sb->s_time_gran		= 1;
+
+	inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+struct dentry *ramfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, ramfs_fill_super);
+}
+
+static void ramfs_kill_sb(struct super_block *sb)
+{
+	kfree(sb->s_fs_info);
+	kill_litter_super(sb);
+}
+
+static struct file_system_type ramfs_fs_type = {
+	.name		= "ramfs",
+	.mount		= ramfs_mount,
+	.kill_sb	= ramfs_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
+};
+
+int __init init_ramfs_fs(void)
+{
+	static unsigned long once;
+
+	if (test_and_set_bit(0, &once))
+		return 0;
+	return register_filesystem(&ramfs_fs_type);
+}
+fs_initcall(init_ramfs_fs);
diff --git a/kernel/fs/ramfs/internal.h b/kernel/fs/ramfs/internal.h
new file mode 100644
index 000000000..a9d8ae88f
--- /dev/null
+++ b/kernel/fs/ramfs/internal.h
@@ -0,0 +1,13 @@
+/* internal.h: ramfs internal definitions
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+extern const struct inode_operations ramfs_file_inode_operations;
diff --git a/kernel/fs/read_write.c b/kernel/fs/read_write.c
new file mode 100644
index 000000000..819ef3faf
--- /dev/null
+++ b/kernel/fs/read_write.c
@@ -0,0 +1,1329 @@
+/*
+ *  linux/fs/read_write.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/slab.h> 
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/fsnotify.h>
+#include <linux/security.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/splice.h>
+#include <linux/compat.h>
+#include "internal.h"
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
+typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
+
+const struct file_operations generic_ro_fops = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.mmap		= generic_file_readonly_mmap,
+	.splice_read	= generic_file_splice_read,
+};
+
+EXPORT_SYMBOL(generic_ro_fops);
+
+static inline int unsigned_offsets(struct file *file)
+{
+	return file->f_mode & FMODE_UNSIGNED_OFFSET;
+}
+
+/**
+ * vfs_setpos - update the file offset for lseek
+ * @file:	file structure in question
+ * @offset:	file offset to seek to
+ * @maxsize:	maximum file size
+ *
+ * This is a low-level filesystem helper for updating the file offset to
+ * the value specified by @offset if the given offset is valid and it is
+ * not equal to the current file offset.
+ *
+ * Return the specified offset on success and -EINVAL on invalid offset.
+ */
+loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
+{
+	if (offset < 0 && !unsigned_offsets(file))
+		return -EINVAL;
+	if (offset > maxsize)
+		return -EINVAL;
+
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+	return offset;
+}
+EXPORT_SYMBOL(vfs_setpos);
+
+/**
+ * generic_file_llseek_size - generic llseek implementation for regular files
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @whence:	type of seek
+ * @size:	max size of this file in file system
+ * @eof:	offset used for SEEK_END position
+ *
+ * This is a variant of generic_file_llseek that allows passing in a custom
+ * maximum file size and a custom EOF position, for e.g. hashed directories
+ *
+ * Synchronization:
+ * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
+ * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
+ * read/writes behave like SEEK_SET against seeks.
+ */
+loff_t
+generic_file_llseek_size(struct file *file, loff_t offset, int whence,
+		loff_t maxsize, loff_t eof)
+{
+	switch (whence) {
+	case SEEK_END:
+		offset += eof;
+		break;
+	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation.  Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0)
+			return file->f_pos;
+		/*
+		 * f_lock protects against read/modify/write race with other
+		 * SEEK_CURs. Note that parallel writes and reads behave
+		 * like SEEK_SET.
+		 */
+		spin_lock(&file->f_lock);
+		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
+		spin_unlock(&file->f_lock);
+		return offset;
+	case SEEK_DATA:
+		/*
+		 * In the generic case the entire file is data, so as long as
+		 * offset isn't at the end of the file then the offset is data.
+		 */
+		if (offset >= eof)
+			return -ENXIO;
+		break;
+	case SEEK_HOLE:
+		/*
+		 * There is a virtual hole at the end of the file, so as long as
+		 * offset isn't i_size or larger, return i_size.
+		 */
+		if (offset >= eof)
+			return -ENXIO;
+		offset = eof;
+		break;
+	}
+
+	return vfs_setpos(file, offset, maxsize);
+}
+EXPORT_SYMBOL(generic_file_llseek_size);
+
+/**
+ * generic_file_llseek - generic llseek implementation for regular files
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @whence:	type of seek
+ *
+ * This is a generic implemenation of ->llseek useable for all normal local
+ * filesystems.  It just updates the file offset to the value specified by
+ * @offset and @whence.
+ */
+loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	return generic_file_llseek_size(file, offset, whence,
+					inode->i_sb->s_maxbytes,
+					i_size_read(inode));
+}
+EXPORT_SYMBOL(generic_file_llseek);
+
+/**
+ * fixed_size_llseek - llseek implementation for fixed-sized devices
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @whence:	type of seek
+ * @size:	size of the file
+ *
+ */
+loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
+{
+	switch (whence) {
+	case SEEK_SET: case SEEK_CUR: case SEEK_END:
+		return generic_file_llseek_size(file, offset, whence,
+						size, size);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(fixed_size_llseek);
+
+/**
+ * noop_llseek - No Operation Performed llseek implementation
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @whence:	type of seek
+ *
+ * This is an implementation of ->llseek useable for the rare special case when
+ * userspace expects the seek to succeed but the (device) file is actually not
+ * able to perform the seek. In this case you use noop_llseek() instead of
+ * falling back to the default implementation of ->llseek.
+ */
+loff_t noop_llseek(struct file *file, loff_t offset, int whence)
+{
+	return file->f_pos;
+}
+EXPORT_SYMBOL(noop_llseek);
+
+loff_t no_llseek(struct file *file, loff_t offset, int whence)
+{
+	return -ESPIPE;
+}
+EXPORT_SYMBOL(no_llseek);
+
+loff_t default_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file_inode(file);
+	loff_t retval;
+
+	mutex_lock(&inode->i_mutex);
+	switch (whence) {
+		case SEEK_END:
+			offset += i_size_read(inode);
+			break;
+		case SEEK_CUR:
+			if (offset == 0) {
+				retval = file->f_pos;
+				goto out;
+			}
+			offset += file->f_pos;
+			break;
+		case SEEK_DATA:
+			/*
+			 * In the generic case the entire file is data, so as
+			 * long as offset isn't at the end of the file then the
+			 * offset is data.
+			 */
+			if (offset >= inode->i_size) {
+				retval = -ENXIO;
+				goto out;
+			}
+			break;
+		case SEEK_HOLE:
+			/*
+			 * There is a virtual hole at the end of the file, so
+			 * as long as offset isn't i_size or larger, return
+			 * i_size.
+			 */
+			if (offset >= inode->i_size) {
+				retval = -ENXIO;
+				goto out;
+			}
+			offset = inode->i_size;
+			break;
+	}
+	retval = -EINVAL;
+	if (offset >= 0 || unsigned_offsets(file)) {
+		if (offset != file->f_pos) {
+			file->f_pos = offset;
+			file->f_version = 0;
+		}
+		retval = offset;
+	}
+out:
+	mutex_unlock(&inode->i_mutex);
+	return retval;
+}
+EXPORT_SYMBOL(default_llseek);
+
+loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
+{
+	loff_t (*fn)(struct file *, loff_t, int);
+
+	fn = no_llseek;
+	if (file->f_mode & FMODE_LSEEK) {
+		if (file->f_op->llseek)
+			fn = file->f_op->llseek;
+	}
+	return fn(file, offset, whence);
+}
+EXPORT_SYMBOL(vfs_llseek);
+
+static inline struct fd fdget_pos(int fd)
+{
+	return __to_fd(__fdget_pos(fd));
+}
+
+static inline void fdput_pos(struct fd f)
+{
+	if (f.flags & FDPUT_POS_UNLOCK)
+		mutex_unlock(&f.file->f_pos_lock);
+	fdput(f);
+}
+
+SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
+{
+	off_t retval;
+	struct fd f = fdget_pos(fd);
+	if (!f.file)
+		return -EBADF;
+
+	retval = -EINVAL;
+	if (whence <= SEEK_MAX) {
+		loff_t res = vfs_llseek(f.file, offset, whence);
+		retval = res;
+		if (res != (loff_t)retval)
+			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
+	}
+	fdput_pos(f);
+	return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
+{
+	return sys_lseek(fd, offset, whence);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_LLSEEK
+SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
+		unsigned long, offset_low, loff_t __user *, result,
+		unsigned int, whence)
+{
+	int retval;
+	struct fd f = fdget_pos(fd);
+	loff_t offset;
+
+	if (!f.file)
+		return -EBADF;
+
+	retval = -EINVAL;
+	if (whence > SEEK_MAX)
+		goto out_putf;
+
+	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
+			whence);
+
+	retval = (int)offset;
+	if (offset >= 0) {
+		retval = -EFAULT;
+		if (!copy_to_user(result, &offset, sizeof(offset)))
+			retval = 0;
+	}
+out_putf:
+	fdput_pos(f);
+	return retval;
+}
+#endif
+
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->read_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+
+	iter->type |= READ;
+	ret = file->f_op->read_iter(&kiocb, iter);
+	BUG_ON(ret == -EIOCBQUEUED);
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_read);
+
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->write_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+
+	iter->type |= WRITE;
+	ret = file->f_op->write_iter(&kiocb, iter);
+	BUG_ON(ret == -EIOCBQUEUED);
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_write);
+
+/*
+ * rw_verify_area doesn't like huge counts. We limit
+ * them to something that fits in "int" so that others
+ * won't have to do range checks all the time.
+ */
+int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
+{
+	struct inode *inode;
+	loff_t pos;
+	int retval = -EINVAL;
+
+	inode = file_inode(file);
+	if (unlikely((ssize_t) count < 0))
+		return retval;
+	pos = *ppos;
+	if (unlikely(pos < 0)) {
+		if (!unsigned_offsets(file))
+			return retval;
+		if (count >= -pos) /* both values are in 0..LLONG_MAX */
+			return -EOVERFLOW;
+	} else if (unlikely((loff_t) (pos + count) < 0)) {
+		if (!unsigned_offsets(file))
+			return retval;
+	}
+
+	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
+		retval = locks_mandatory_area(
+			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
+			inode, file, pos, count);
+		if (retval < 0)
+			return retval;
+	}
+	retval = security_file_permission(file,
+				read_write == READ ? MAY_READ : MAY_WRITE);
+	if (retval)
+		return retval;
+	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
+}
+
+static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+	struct iovec iov = { .iov_base = buf, .iov_len = len };
+	struct kiocb kiocb;
+	struct iov_iter iter;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, filp);
+	kiocb.ki_pos = *ppos;
+	iov_iter_init(&iter, READ, &iov, 1, len);
+
+	ret = filp->f_op->read_iter(&kiocb, &iter);
+	BUG_ON(ret == -EIOCBQUEUED);
+	*ppos = kiocb.ki_pos;
+	return ret;
+}
+
+ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
+		   loff_t *pos)
+{
+	if (file->f_op->read)
+		return file->f_op->read(file, buf, count, pos);
+	else if (file->f_op->read_iter)
+		return new_sync_read(file, buf, count, pos);
+	else
+		return -EINVAL;
+}
+EXPORT_SYMBOL(__vfs_read);
+
+ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
+{
+	ssize_t ret;
+
+	if (!(file->f_mode & FMODE_READ))
+		return -EBADF;
+	if (!(file->f_mode & FMODE_CAN_READ))
+		return -EINVAL;
+	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
+		return -EFAULT;
+
+	ret = rw_verify_area(READ, file, pos, count);
+	if (ret >= 0) {
+		count = ret;
+		ret = __vfs_read(file, buf, count, pos);
+		if (ret > 0) {
+			fsnotify_access(file);
+			add_rchar(current, ret);
+		}
+		inc_syscr(current);
+	}
+
+	return ret;
+}
+
+EXPORT_SYMBOL(vfs_read);
+
+static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
+{
+	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
+	struct kiocb kiocb;
+	struct iov_iter iter;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, filp);
+	kiocb.ki_pos = *ppos;
+	iov_iter_init(&iter, WRITE, &iov, 1, len);
+
+	ret = filp->f_op->write_iter(&kiocb, &iter);
+	BUG_ON(ret == -EIOCBQUEUED);
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+
+ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
+		    loff_t *pos)
+{
+	if (file->f_op->write)
+		return file->f_op->write(file, p, count, pos);
+	else if (file->f_op->write_iter)
+		return new_sync_write(file, p, count, pos);
+	else
+		return -EINVAL;
+}
+EXPORT_SYMBOL(__vfs_write);
+
+ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
+{
+	mm_segment_t old_fs;
+	const char __user *p;
+	ssize_t ret;
+
+	if (!(file->f_mode & FMODE_CAN_WRITE))
+		return -EINVAL;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	p = (__force const char __user *)buf;
+	if (count > MAX_RW_COUNT)
+		count =  MAX_RW_COUNT;
+	ret = __vfs_write(file, p, count, pos);
+	set_fs(old_fs);
+	if (ret > 0) {
+		fsnotify_modify(file);
+		add_wchar(current, ret);
+	}
+	inc_syscw(current);
+	return ret;
+}
+
+EXPORT_SYMBOL(__kernel_write);
+
+ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
+{
+	ssize_t ret;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		return -EBADF;
+	if (!(file->f_mode & FMODE_CAN_WRITE))
+		return -EINVAL;
+	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
+		return -EFAULT;
+
+	ret = rw_verify_area(WRITE, file, pos, count);
+	if (ret >= 0) {
+		count = ret;
+		file_start_write(file);
+		ret = __vfs_write(file, buf, count, pos);
+		if (ret > 0) {
+			fsnotify_modify(file);
+			add_wchar(current, ret);
+		}
+		inc_syscw(current);
+		file_end_write(file);
+	}
+
+	return ret;
+}
+
+EXPORT_SYMBOL(vfs_write);
+
+static inline loff_t file_pos_read(struct file *file)
+{
+	return file->f_pos;
+}
+
+static inline void file_pos_write(struct file *file, loff_t pos)
+{
+	file->f_pos = pos;
+}
+
+SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
+{
+	struct fd f = fdget_pos(fd);
+	ssize_t ret = -EBADF;
+
+	if (f.file) {
+		loff_t pos = file_pos_read(f.file);
+		ret = vfs_read(f.file, buf, count, &pos);
+		if (ret >= 0)
+			file_pos_write(f.file, pos);
+		fdput_pos(f);
+	}
+	return ret;
+}
+
+SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
+		size_t, count)
+{
+	struct fd f = fdget_pos(fd);
+	ssize_t ret = -EBADF;
+
+	if (f.file) {
+		loff_t pos = file_pos_read(f.file);
+		ret = vfs_write(f.file, buf, count, &pos);
+		if (ret >= 0)
+			file_pos_write(f.file, pos);
+		fdput_pos(f);
+	}
+
+	return ret;
+}
+
+SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
+			size_t, count, loff_t, pos)
+{
+	struct fd f;
+	ssize_t ret = -EBADF;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (f.file) {
+		ret = -ESPIPE;
+		if (f.file->f_mode & FMODE_PREAD)
+			ret = vfs_read(f.file, buf, count, &pos);
+		fdput(f);
+	}
+
+	return ret;
+}
+
+SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
+			 size_t, count, loff_t, pos)
+{
+	struct fd f;
+	ssize_t ret = -EBADF;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (f.file) {
+		ret = -ESPIPE;
+		if (f.file->f_mode & FMODE_PWRITE)  
+			ret = vfs_write(f.file, buf, count, &pos);
+		fdput(f);
+	}
+
+	return ret;
+}
+
+/*
+ * Reduce an iovec's length in-place.  Return the resulting number of segments
+ */
+unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
+{
+	unsigned long seg = 0;
+	size_t len = 0;
+
+	while (seg < nr_segs) {
+		seg++;
+		if (len + iov->iov_len >= to) {
+			iov->iov_len = to - len;
+			break;
+		}
+		len += iov->iov_len;
+		iov++;
+	}
+	return seg;
+}
+EXPORT_SYMBOL(iov_shorten);
+
+static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
+		loff_t *ppos, iter_fn_t fn)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, filp);
+	kiocb.ki_pos = *ppos;
+
+	ret = fn(&kiocb, iter);
+	BUG_ON(ret == -EIOCBQUEUED);
+	*ppos = kiocb.ki_pos;
+	return ret;
+}
+
+/* Do it by hand, with file-ops */
+static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
+		loff_t *ppos, io_fn_t fn)
+{
+	ssize_t ret = 0;
+
+	while (iov_iter_count(iter)) {
+		struct iovec iovec = iov_iter_iovec(iter);
+		ssize_t nr;
+
+		nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
+
+		if (nr < 0) {
+			if (!ret)
+				ret = nr;
+			break;
+		}
+		ret += nr;
+		if (nr != iovec.iov_len)
+			break;
+		iov_iter_advance(iter, nr);
+	}
+
+	return ret;
+}
+
+/* A write operation does a read from user space and vice versa */
+#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+
+ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
+			      unsigned long nr_segs, unsigned long fast_segs,
+			      struct iovec *fast_pointer,
+			      struct iovec **ret_pointer)
+{
+	unsigned long seg;
+	ssize_t ret;
+	struct iovec *iov = fast_pointer;
+
+	/*
+	 * SuS says "The readv() function *may* fail if the iovcnt argument
+	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+	 * traditionally returned zero for zero segments, so...
+	 */
+	if (nr_segs == 0) {
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * First get the "struct iovec" from user memory and
+	 * verify all the pointers
+	 */
+	if (nr_segs > UIO_MAXIOV) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (nr_segs > fast_segs) {
+		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+		if (iov == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/*
+	 * According to the Single Unix Specification we should return EINVAL
+	 * if an element length is < 0 when cast to ssize_t or if the
+	 * total length would overflow the ssize_t return value of the
+	 * system call.
+	 *
+	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
+	 * overflow case.
+	 */
+	ret = 0;
+	for (seg = 0; seg < nr_segs; seg++) {
+		void __user *buf = iov[seg].iov_base;
+		ssize_t len = (ssize_t)iov[seg].iov_len;
+
+		/* see if we we're about to use an invalid len or if
+		 * it's about to overflow ssize_t */
+		if (len < 0) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (type >= 0
+		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (len > MAX_RW_COUNT - ret) {
+			len = MAX_RW_COUNT - ret;
+			iov[seg].iov_len = len;
+		}
+		ret += len;
+	}
+out:
+	*ret_pointer = iov;
+	return ret;
+}
+
+static ssize_t do_readv_writev(int type, struct file *file,
+			       const struct iovec __user * uvector,
+			       unsigned long nr_segs, loff_t *pos)
+{
+	size_t tot_len;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	struct iov_iter iter;
+	ssize_t ret;
+	io_fn_t fn;
+	iter_fn_t iter_fn;
+
+	ret = import_iovec(type, uvector, nr_segs,
+			   ARRAY_SIZE(iovstack), &iov, &iter);
+	if (ret < 0)
+		return ret;
+
+	tot_len = iov_iter_count(&iter);
+	if (!tot_len)
+		goto out;
+	ret = rw_verify_area(type, file, pos, tot_len);
+	if (ret < 0)
+		goto out;
+
+	if (type == READ) {
+		fn = file->f_op->read;
+		iter_fn = file->f_op->read_iter;
+	} else {
+		fn = (io_fn_t)file->f_op->write;
+		iter_fn = file->f_op->write_iter;
+		file_start_write(file);
+	}
+
+	if (iter_fn)
+		ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
+	else
+		ret = do_loop_readv_writev(file, &iter, pos, fn);
+
+	if (type != READ)
+		file_end_write(file);
+
+out:
+	kfree(iov);
+	if ((ret + (type == READ)) > 0) {
+		if (type == READ)
+			fsnotify_access(file);
+		else
+			fsnotify_modify(file);
+	}
+	return ret;
+}
+
+ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
+		  unsigned long vlen, loff_t *pos)
+{
+	if (!(file->f_mode & FMODE_READ))
+		return -EBADF;
+	if (!(file->f_mode & FMODE_CAN_READ))
+		return -EINVAL;
+
+	return do_readv_writev(READ, file, vec, vlen, pos);
+}
+
+EXPORT_SYMBOL(vfs_readv);
+
+ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
+		   unsigned long vlen, loff_t *pos)
+{
+	if (!(file->f_mode & FMODE_WRITE))
+		return -EBADF;
+	if (!(file->f_mode & FMODE_CAN_WRITE))
+		return -EINVAL;
+
+	return do_readv_writev(WRITE, file, vec, vlen, pos);
+}
+
+EXPORT_SYMBOL(vfs_writev);
+
+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen)
+{
+	struct fd f = fdget_pos(fd);
+	ssize_t ret = -EBADF;
+
+	if (f.file) {
+		loff_t pos = file_pos_read(f.file);
+		ret = vfs_readv(f.file, vec, vlen, &pos);
+		if (ret >= 0)
+			file_pos_write(f.file, pos);
+		fdput_pos(f);
+	}
+
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+	return ret;
+}
+
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen)
+{
+	struct fd f = fdget_pos(fd);
+	ssize_t ret = -EBADF;
+
+	if (f.file) {
+		loff_t pos = file_pos_read(f.file);
+		ret = vfs_writev(f.file, vec, vlen, &pos);
+		if (ret >= 0)
+			file_pos_write(f.file, pos);
+		fdput_pos(f);
+	}
+
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
+	return ret;
+}
+
+static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
+{
+#define HALF_LONG_BITS (BITS_PER_LONG / 2)
+	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
+}
+
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+	struct fd f;
+	ssize_t ret = -EBADF;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (f.file) {
+		ret = -ESPIPE;
+		if (f.file->f_mode & FMODE_PREAD)
+			ret = vfs_readv(f.file, vec, vlen, &pos);
+		fdput(f);
+	}
+
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+	return ret;
+}
+
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+	struct fd f;
+	ssize_t ret = -EBADF;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (f.file) {
+		ret = -ESPIPE;
+		if (f.file->f_mode & FMODE_PWRITE)
+			ret = vfs_writev(f.file, vec, vlen, &pos);
+		fdput(f);
+	}
+
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+
+static ssize_t compat_do_readv_writev(int type, struct file *file,
+			       const struct compat_iovec __user *uvector,
+			       unsigned long nr_segs, loff_t *pos)
+{
+	compat_ssize_t tot_len;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	struct iov_iter iter;
+	ssize_t ret;
+	io_fn_t fn;
+	iter_fn_t iter_fn;
+
+	ret = compat_import_iovec(type, uvector, nr_segs,
+				  UIO_FASTIOV, &iov, &iter);
+	if (ret < 0)
+		return ret;
+
+	tot_len = iov_iter_count(&iter);
+	if (!tot_len)
+		goto out;
+	ret = rw_verify_area(type, file, pos, tot_len);
+	if (ret < 0)
+		goto out;
+
+	if (type == READ) {
+		fn = file->f_op->read;
+		iter_fn = file->f_op->read_iter;
+	} else {
+		fn = (io_fn_t)file->f_op->write;
+		iter_fn = file->f_op->write_iter;
+		file_start_write(file);
+	}
+
+	if (iter_fn)
+		ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
+	else
+		ret = do_loop_readv_writev(file, &iter, pos, fn);
+
+	if (type != READ)
+		file_end_write(file);
+
+out:
+	kfree(iov);
+	if ((ret + (type == READ)) > 0) {
+		if (type == READ)
+			fsnotify_access(file);
+		else
+			fsnotify_modify(file);
+	}
+	return ret;
+}
+
+static size_t compat_readv(struct file *file,
+			   const struct compat_iovec __user *vec,
+			   unsigned long vlen, loff_t *pos)
+{
+	ssize_t ret = -EBADF;
+
+	if (!(file->f_mode & FMODE_READ))
+		goto out;
+
+	ret = -EINVAL;
+	if (!(file->f_mode & FMODE_CAN_READ))
+		goto out;
+
+	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
+
+out:
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
+		const struct compat_iovec __user *,vec,
+		compat_ulong_t, vlen)
+{
+	struct fd f = fdget_pos(fd);
+	ssize_t ret;
+	loff_t pos;
+
+	if (!f.file)
+		return -EBADF;
+	pos = f.file->f_pos;
+	ret = compat_readv(f.file, vec, vlen, &pos);
+	if (ret >= 0)
+		f.file->f_pos = pos;
+	fdput_pos(f);
+	return ret;
+}
+
+static long __compat_sys_preadv64(unsigned long fd,
+				  const struct compat_iovec __user *vec,
+				  unsigned long vlen, loff_t pos)
+{
+	struct fd f;
+	ssize_t ret;
+
+	if (pos < 0)
+		return -EINVAL;
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	ret = -ESPIPE;
+	if (f.file->f_mode & FMODE_PREAD)
+		ret = compat_readv(f.file, vec, vlen, &pos);
+	fdput(f);
+	return ret;
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
+COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
+		const struct compat_iovec __user *,vec,
+		unsigned long, vlen, loff_t, pos)
+{
+	return __compat_sys_preadv64(fd, vec, vlen, pos);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
+		const struct compat_iovec __user *,vec,
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
+{
+	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+
+	return __compat_sys_preadv64(fd, vec, vlen, pos);
+}
+
+static size_t compat_writev(struct file *file,
+			    const struct compat_iovec __user *vec,
+			    unsigned long vlen, loff_t *pos)
+{
+	ssize_t ret = -EBADF;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		goto out;
+
+	ret = -EINVAL;
+	if (!(file->f_mode & FMODE_CAN_WRITE))
+		goto out;
+
+	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
+
+out:
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
+		const struct compat_iovec __user *, vec,
+		compat_ulong_t, vlen)
+{
+	struct fd f = fdget_pos(fd);
+	ssize_t ret;
+	loff_t pos;
+
+	if (!f.file)
+		return -EBADF;
+	pos = f.file->f_pos;
+	ret = compat_writev(f.file, vec, vlen, &pos);
+	if (ret >= 0)
+		f.file->f_pos = pos;
+	fdput_pos(f);
+	return ret;
+}
+
+static long __compat_sys_pwritev64(unsigned long fd,
+				   const struct compat_iovec __user *vec,
+				   unsigned long vlen, loff_t pos)
+{
+	struct fd f;
+	ssize_t ret;
+
+	if (pos < 0)
+		return -EINVAL;
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	ret = -ESPIPE;
+	if (f.file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(f.file, vec, vlen, &pos);
+	fdput(f);
+	return ret;
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
+COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
+		const struct compat_iovec __user *,vec,
+		unsigned long, vlen, loff_t, pos)
+{
+	return __compat_sys_pwritev64(fd, vec, vlen, pos);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
+		const struct compat_iovec __user *,vec,
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
+{
+	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+
+	return __compat_sys_pwritev64(fd, vec, vlen, pos);
+}
+#endif
+
+static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
+		  	   size_t count, loff_t max)
+{
+	struct fd in, out;
+	struct inode *in_inode, *out_inode;
+	loff_t pos;
+	loff_t out_pos;
+	ssize_t retval;
+	int fl;
+
+	/*
+	 * Get input file, and verify that it is ok..
+	 */
+	retval = -EBADF;
+	in = fdget(in_fd);
+	if (!in.file)
+		goto out;
+	if (!(in.file->f_mode & FMODE_READ))
+		goto fput_in;
+	retval = -ESPIPE;
+	if (!ppos) {
+		pos = in.file->f_pos;
+	} else {
+		pos = *ppos;
+		if (!(in.file->f_mode & FMODE_PREAD))
+			goto fput_in;
+	}
+	retval = rw_verify_area(READ, in.file, &pos, count);
+	if (retval < 0)
+		goto fput_in;
+	count = retval;
+
+	/*
+	 * Get output file, and verify that it is ok..
+	 */
+	retval = -EBADF;
+	out = fdget(out_fd);
+	if (!out.file)
+		goto fput_in;
+	if (!(out.file->f_mode & FMODE_WRITE))
+		goto fput_out;
+	retval = -EINVAL;
+	in_inode = file_inode(in.file);
+	out_inode = file_inode(out.file);
+	out_pos = out.file->f_pos;
+	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
+	if (retval < 0)
+		goto fput_out;
+	count = retval;
+
+	if (!max)
+		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
+
+	if (unlikely(pos + count > max)) {
+		retval = -EOVERFLOW;
+		if (pos >= max)
+			goto fput_out;
+		count = max - pos;
+	}
+
+	fl = 0;
+#if 0
+	/*
+	 * We need to debate whether we can enable this or not. The
+	 * man page documents EAGAIN return for the output at least,
+	 * and the application is arguably buggy if it doesn't expect
+	 * EAGAIN on a non-blocking file descriptor.
+	 */
+	if (in.file->f_flags & O_NONBLOCK)
+		fl = SPLICE_F_NONBLOCK;
+#endif
+	file_start_write(out.file);
+	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
+	file_end_write(out.file);
+
+	if (retval > 0) {
+		add_rchar(current, retval);
+		add_wchar(current, retval);
+		fsnotify_access(in.file);
+		fsnotify_modify(out.file);
+		out.file->f_pos = out_pos;
+		if (ppos)
+			*ppos = pos;
+		else
+			in.file->f_pos = pos;
+	}
+
+	inc_syscr(current);
+	inc_syscw(current);
+	if (pos > max)
+		retval = -EOVERFLOW;
+
+fput_out:
+	fdput(out);
+fput_in:
+	fdput(in);
+out:
+	return retval;
+}
+
+SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
+{
+	loff_t pos;
+	off_t off;
+	ssize_t ret;
+
+	if (offset) {
+		if (unlikely(get_user(off, offset)))
+			return -EFAULT;
+		pos = off;
+		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
+		if (unlikely(put_user(pos, offset)))
+			return -EFAULT;
+		return ret;
+	}
+
+	return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+
+SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
+{
+	loff_t pos;
+	ssize_t ret;
+
+	if (offset) {
+		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
+			return -EFAULT;
+		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
+		if (unlikely(put_user(pos, offset)))
+			return -EFAULT;
+		return ret;
+	}
+
+	return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
+		compat_off_t __user *, offset, compat_size_t, count)
+{
+	loff_t pos;
+	off_t off;
+	ssize_t ret;
+
+	if (offset) {
+		if (unlikely(get_user(off, offset)))
+			return -EFAULT;
+		pos = off;
+		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
+		if (unlikely(put_user(pos, offset)))
+			return -EFAULT;
+		return ret;
+	}
+
+	return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+
+COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
+		compat_loff_t __user *, offset, compat_size_t, count)
+{
+	loff_t pos;
+	ssize_t ret;
+
+	if (offset) {
+		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
+			return -EFAULT;
+		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
+		if (unlikely(put_user(pos, offset)))
+			return -EFAULT;
+		return ret;
+	}
+
+	return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+#endif
diff --git a/kernel/fs/readdir.c b/kernel/fs/readdir.c
new file mode 100644
index 000000000..ced679179
--- /dev/null
+++ b/kernel/fs/readdir.c
@@ -0,0 +1,309 @@
+/*
+ *  linux/fs/readdir.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ */
+
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/dirent.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+
+#include <asm/uaccess.h>
+
+int iterate_dir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	int res = -ENOTDIR;
+	if (!file->f_op->iterate)
+		goto out;
+
+	res = security_file_permission(file, MAY_READ);
+	if (res)
+		goto out;
+
+	res = mutex_lock_killable(&inode->i_mutex);
+	if (res)
+		goto out;
+
+	res = -ENOENT;
+	if (!IS_DEADDIR(inode)) {
+		ctx->pos = file->f_pos;
+		res = file->f_op->iterate(file, ctx);
+		file->f_pos = ctx->pos;
+		fsnotify_access(file);
+		file_accessed(file);
+	}
+	mutex_unlock(&inode->i_mutex);
+out:
+	return res;
+}
+EXPORT_SYMBOL(iterate_dir);
+
+/*
+ * Traditional linux readdir() handling..
+ *
+ * "count=1" is a special case, meaning that the buffer is one
+ * dirent-structure in size and that the code can't handle more
+ * anyway. Thus the special "fillonedir()" function for that
+ * case (the low-level handlers don't need to care about this).
+ */
+
+#ifdef __ARCH_WANT_OLD_READDIR
+
+struct old_linux_dirent {
+	unsigned long	d_ino;
+	unsigned long	d_offset;
+	unsigned short	d_namlen;
+	char		d_name[1];
+};
+
+struct readdir_callback {
+	struct dir_context ctx;
+	struct old_linux_dirent __user * dirent;
+	int result;
+};
+
+static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
+		      loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct readdir_callback *buf =
+		container_of(ctx, struct readdir_callback, ctx);
+	struct old_linux_dirent __user * dirent;
+	unsigned long d_ino;
+
+	if (buf->result)
+		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->result = -EOVERFLOW;
+		return -EOVERFLOW;
+	}
+	buf->result++;
+	dirent = buf->dirent;
+	if (!access_ok(VERIFY_WRITE, dirent,
+			(unsigned long)(dirent->d_name + namlen + 1) -
+				(unsigned long)dirent))
+		goto efault;
+	if (	__put_user(d_ino, &dirent->d_ino) ||
+		__put_user(offset, &dirent->d_offset) ||
+		__put_user(namlen, &dirent->d_namlen) ||
+		__copy_to_user(dirent->d_name, name, namlen) ||
+		__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	return 0;
+efault:
+	buf->result = -EFAULT;
+	return -EFAULT;
+}
+
+SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+		struct old_linux_dirent __user *, dirent, unsigned int, count)
+{
+	int error;
+	struct fd f = fdget(fd);
+	struct readdir_callback buf = {
+		.ctx.actor = fillonedir,
+		.dirent = dirent
+	};
+
+	if (!f.file)
+		return -EBADF;
+
+	error = iterate_dir(f.file, &buf.ctx);
+	if (buf.result)
+		error = buf.result;
+
+	fdput(f);
+	return error;
+}
+
+#endif /* __ARCH_WANT_OLD_READDIR */
+
+/*
+ * New, all-improved, singing, dancing, iBCS2-compliant getdents()
+ * interface. 
+ */
+struct linux_dirent {
+	unsigned long	d_ino;
+	unsigned long	d_off;
+	unsigned short	d_reclen;
+	char		d_name[1];
+};
+
+struct getdents_callback {
+	struct dir_context ctx;
+	struct linux_dirent __user * current_dir;
+	struct linux_dirent __user * previous;
+	int count;
+	int error;
+};
+
+static int filldir(struct dir_context *ctx, const char *name, int namlen,
+		   loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct linux_dirent __user * dirent;
+	struct getdents_callback *buf =
+		container_of(ctx, struct getdents_callback, ctx);
+	unsigned long d_ino;
+	int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
+		sizeof(long));
+
+	buf->error = -EINVAL;	/* only used if we fail.. */
+	if (reclen > buf->count)
+		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->error = -EOVERFLOW;
+		return -EOVERFLOW;
+	}
+	dirent = buf->previous;
+	if (dirent) {
+		if (__put_user(offset, &dirent->d_off))
+			goto efault;
+	}
+	dirent = buf->current_dir;
+	if (__put_user(d_ino, &dirent->d_ino))
+		goto efault;
+	if (__put_user(reclen, &dirent->d_reclen))
+		goto efault;
+	if (copy_to_user(dirent->d_name, name, namlen))
+		goto efault;
+	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	if (__put_user(d_type, (char __user *) dirent + reclen - 1))
+		goto efault;
+	buf->previous = dirent;
+	dirent = (void __user *)dirent + reclen;
+	buf->current_dir = dirent;
+	buf->count -= reclen;
+	return 0;
+efault:
+	buf->error = -EFAULT;
+	return -EFAULT;
+}
+
+SYSCALL_DEFINE3(getdents, unsigned int, fd,
+		struct linux_dirent __user *, dirent, unsigned int, count)
+{
+	struct fd f;
+	struct linux_dirent __user * lastdirent;
+	struct getdents_callback buf = {
+		.ctx.actor = filldir,
+		.count = count,
+		.current_dir = dirent
+	};
+	int error;
+
+	if (!access_ok(VERIFY_WRITE, dirent, count))
+		return -EFAULT;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	error = iterate_dir(f.file, &buf.ctx);
+	if (error >= 0)
+		error = buf.error;
+	lastdirent = buf.previous;
+	if (lastdirent) {
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
+			error = -EFAULT;
+		else
+			error = count - buf.count;
+	}
+	fdput(f);
+	return error;
+}
+
+struct getdents_callback64 {
+	struct dir_context ctx;
+	struct linux_dirent64 __user * current_dir;
+	struct linux_dirent64 __user * previous;
+	int count;
+	int error;
+};
+
+static int filldir64(struct dir_context *ctx, const char *name, int namlen,
+		     loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct linux_dirent64 __user *dirent;
+	struct getdents_callback64 *buf =
+		container_of(ctx, struct getdents_callback64, ctx);
+	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
+		sizeof(u64));
+
+	buf->error = -EINVAL;	/* only used if we fail.. */
+	if (reclen > buf->count)
+		return -EINVAL;
+	dirent = buf->previous;
+	if (dirent) {
+		if (__put_user(offset, &dirent->d_off))
+			goto efault;
+	}
+	dirent = buf->current_dir;
+	if (__put_user(ino, &dirent->d_ino))
+		goto efault;
+	if (__put_user(0, &dirent->d_off))
+		goto efault;
+	if (__put_user(reclen, &dirent->d_reclen))
+		goto efault;
+	if (__put_user(d_type, &dirent->d_type))
+		goto efault;
+	if (copy_to_user(dirent->d_name, name, namlen))
+		goto efault;
+	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	buf->previous = dirent;
+	dirent = (void __user *)dirent + reclen;
+	buf->current_dir = dirent;
+	buf->count -= reclen;
+	return 0;
+efault:
+	buf->error = -EFAULT;
+	return -EFAULT;
+}
+
+SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+		struct linux_dirent64 __user *, dirent, unsigned int, count)
+{
+	struct fd f;
+	struct linux_dirent64 __user * lastdirent;
+	struct getdents_callback64 buf = {
+		.ctx.actor = filldir64,
+		.count = count,
+		.current_dir = dirent
+	};
+	int error;
+
+	if (!access_ok(VERIFY_WRITE, dirent, count))
+		return -EFAULT;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	error = iterate_dir(f.file, &buf.ctx);
+	if (error >= 0)
+		error = buf.error;
+	lastdirent = buf.previous;
+	if (lastdirent) {
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
+		if (__put_user(d_off, &lastdirent->d_off))
+			error = -EFAULT;
+		else
+			error = count - buf.count;
+	}
+	fdput(f);
+	return error;
+}
diff --git a/kernel/fs/reiserfs/Kconfig b/kernel/fs/reiserfs/Kconfig
new file mode 100644
index 000000000..7cd46666b
--- /dev/null
+++ b/kernel/fs/reiserfs/Kconfig
@@ -0,0 +1,88 @@
+config REISERFS_FS
+	tristate "Reiserfs support"
+	select CRC32
+	help
+	  Stores not just filenames but the files themselves in a balanced
+	  tree.  Uses journalling.
+
+	  Balanced trees are more efficient than traditional file system
+	  architectural foundations.
+
+	  In general, ReiserFS is as fast as ext2, but is very efficient with
+	  large directories and small files.  Additional patches are needed
+	  for NFS and quotas, please see 
+	  <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
+
+	  It is more easily extended to have features currently found in
+	  database and keyword search systems than block allocation based file
+	  systems are.  The next version will be so extended, and will support
+	  plugins consistent with our motto ``It takes more than a license to
+	  make source code open.''
+
+	  Read <https://reiser4.wiki.kernel.org/index.php/Main_Page> 
+	  to learn more about reiserfs.
+
+	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
+
+	  If you like it, you can pay us to add new features to it that you
+	  need, buy a support contract, or pay us to port it to another OS.
+
+config REISERFS_CHECK
+	bool "Enable reiserfs debug mode"
+	depends on REISERFS_FS
+	help
+	  If you set this to Y, then ReiserFS will perform every check it can
+	  possibly imagine of its internal consistency throughout its
+	  operation.  It will also go substantially slower.  More than once we
+	  have forgotten that this was on, and then gone despondent over the
+	  latest benchmarks.:-) Use of this option allows our team to go all
+	  out in checking for consistency when debugging without fear of its
+	  effect on end users.  If you are on the verge of sending in a bug
+	  report, say Y and you might get a useful error message.  Almost
+	  everyone should say N.
+
+config REISERFS_PROC_INFO
+	bool "Stats in /proc/fs/reiserfs"
+	depends on REISERFS_FS && PROC_FS
+	help
+	  Create under /proc/fs/reiserfs a hierarchy of files, displaying
+	  various ReiserFS statistics and internal data at the expense of
+	  making your kernel or module slightly larger (+8 KB). This also
+	  increases the amount of kernel memory required for each mount.
+	  Almost everyone but ReiserFS developers and people fine-tuning
+	  reiserfs or tracing problems should say N.
+
+config REISERFS_FS_XATTR
+	bool "ReiserFS extended attributes"
+	depends on REISERFS_FS
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config REISERFS_FS_POSIX_ACL
+	bool "ReiserFS POSIX Access Control Lists"
+	depends on REISERFS_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config REISERFS_FS_SECURITY
+	bool "ReiserFS Security Labels"
+	depends on REISERFS_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ReiserFS filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/kernel/fs/reiserfs/Makefile b/kernel/fs/reiserfs/Makefile
new file mode 100644
index 000000000..3c3b00165
--- /dev/null
+++ b/kernel/fs/reiserfs/Makefile
@@ -0,0 +1,38 @@
+#
+# Makefile for the linux reiser-filesystem routines.
+#
+
+obj-$(CONFIG_REISERFS_FS) += reiserfs.o
+
+reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
+		 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
+		 hashes.o tail_conversion.o journal.o resize.o \
+		 item_ops.o ioctl.o xattr.o lock.o
+
+ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
+reiserfs-objs += procfs.o
+endif
+
+ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
+reiserfs-objs += xattr_user.o xattr_trusted.o
+endif
+
+ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
+reiserfs-objs += xattr_security.o
+endif
+
+ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
+reiserfs-objs += xattr_acl.o
+endif
+
+# gcc -O2 (the kernel default)  is overaggressive on ppc32 when many inline
+# functions are used.  This causes the compiler to advance the stack
+# pointer out of the available stack space, corrupting kernel space,
+# and causing a panic. Since this behavior only affects ppc32, this ifeq
+# will work around it. If any other architecture displays this behavior,
+# add it here.
+ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
+
+TAGS:
+	etags *.c
+
diff --git a/kernel/fs/reiserfs/README b/kernel/fs/reiserfs/README
new file mode 100644
index 000000000..e2f7a264e
--- /dev/null
+++ b/kernel/fs/reiserfs/README
@@ -0,0 +1,161 @@
+[LICENSING]
+
+ReiserFS is hereby licensed under the GNU General
+Public License version 2.
+
+Source code files that contain the phrase "licensing governed by
+reiserfs/README" are "governed files" throughout this file.  Governed
+files are licensed under the GPL.  The portions of them owned by Hans
+Reiser, or authorized to be licensed by him, have been in the past,
+and likely will be in the future, licensed to other parties under
+other licenses.  If you add your code to governed files, and don't
+want it to be owned by Hans Reiser, put your copyright label on that
+code so the poor blight and his customers can keep things straight.
+All portions of governed files not labeled otherwise are owned by Hans
+Reiser, and by adding your code to it, widely distributing it to
+others or sending us a patch, and leaving the sentence in stating that
+licensing is governed by the statement in this file, you accept this.
+It will be a kindness if you identify whether Hans Reiser is allowed
+to license code labeled as owned by you on your behalf other than
+under the GPL, because he wants to know if it is okay to do so and put
+a check in the mail to you (for non-trivial improvements) when he
+makes his next sale.  He makes no guarantees as to the amount if any,
+though he feels motivated to motivate contributors, and you can surely
+discuss this with him before or after contributing.  You have the
+right to decline to allow him to license your code contribution other
+than under the GPL.
+
+Further licensing options are available for commercial and/or other
+interests directly from Hans Reiser: hans@reiser.to.  If you interpret
+the GPL as not allowing those additional licensing options, you read
+it wrongly, and Richard Stallman agrees with me, when carefully read
+you can see that those restrictions on additional terms do not apply
+to the owner of the copyright, and my interpretation of this shall
+govern for this license.
+
+Finally, nothing in this license shall be interpreted to allow you to
+fail to fairly credit me, or to remove my credits, without my
+permission, unless you are an end user not redistributing to others.
+If you have doubts about how to properly do that, or about what is
+fair, ask.  (Last I spoke with him Richard was contemplating how best
+to address the fair crediting issue in the next GPL version.)
+
+[END LICENSING]
+
+Reiserfs is a file system based on balanced tree algorithms, which is
+described at https://reiser4.wiki.kernel.org/index.php/Main_Page 
+
+Stop reading here.  Go there, then return.
+
+Send bug reports to yura@namesys.botik.ru.
+
+mkreiserfs and other utilities are in reiserfs/utils, or wherever your
+Linux provider put them.  There is some disagreement about how useful
+it is for users to get their fsck and mkreiserfs out of sync with the
+version of reiserfs that is in their kernel, with many important
+distributors wanting them out of sync.:-) Please try to remember to
+recompile and reinstall fsck and mkreiserfs with every update of
+reiserfs, this is a common source of confusion.  Note that some of the
+utilities cannot be compiled without accessing the balancing code
+which is in the kernel code, and relocating the utilities may require
+you to specify where that code can be found.
+
+Yes, if you update your reiserfs kernel module you do have to
+recompile your kernel, most of the time.  The errors you get will be
+quite cryptic if your forget to do so.
+
+Real users, as opposed to folks who want to hack and then understand
+what went wrong, will want REISERFS_CHECK off.
+
+Hideous Commercial Pitch: Spread your development costs across other OS
+vendors.  Select from the best in the world, not the best in your
+building, by buying from third party OS component suppliers.  Leverage
+the software component development power of the internet.  Be the most
+aggressive in taking advantage of the commercial possibilities of
+decentralized internet development, and add value through your branded
+integration that you sell as an operating system.  Let your competitors
+be the ones to compete against the entire internet by themselves.  Be
+hip, get with the new economic trend, before your competitors do.  Send
+email to hans@reiser.to.
+
+To understand the code, after reading the website, start reading the
+code by reading reiserfs_fs.h first.
+
+Hans Reiser was the project initiator, primary architect, source of all
+funding for the first 5.5 years, and one of the programmers.  He owns
+the copyright.
+
+Vladimir Saveljev was one of the programmers, and he worked long hours
+writing the cleanest code.  He always made the effort to be the best he
+could be, and to make his code the best that it could be.  What resulted
+was quite remarkable. I don't think that money can ever motivate someone
+to work the way he did, he is one of the most selfless men I know.
+
+Yura helps with benchmarking, coding hashes, and block pre-allocation
+code.
+
+Anatoly Pinchuk is a former member of our team who worked closely with
+Vladimir throughout the project's development.  He wrote a quite
+substantial portion of the total code.  He realized that there was a
+space problem with packing tails of files for files larger than a node
+that start on a node aligned boundary (there are reasons to want to node
+align files), and he invented and implemented indirect items and
+unformatted nodes as the solution.
+
+Konstantin Shvachko, with the help of the Russian version of a VC,
+tried to put me in a position where I was forced into giving control
+of the project to him.  (Fortunately, as the person paying the money
+for all salaries from my dayjob I owned all copyrights, and you can't
+really force takeovers of sole proprietorships.)  This was something
+curious, because he never really understood the value of our project,
+why we should do what we do, or why innovation was possible in
+general, but he was sure that he ought to be controlling it.  Every
+innovation had to be forced past him while he was with us.  He added
+two years to the time required to complete reiserfs, and was a net
+loss for me.  Mikhail Gilula was a brilliant innovator who also left
+in a destructive way that erased the value of his contributions, and
+that he was shown much generosity just makes it more painful.
+
+Grigory Zaigralin was an extremely effective system administrator for
+our group.
+
+Igor Krasheninnikov was wonderful at hardware procurement, repair, and
+network installation.
+
+Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a
+textbook he got the algorithm from in the code.  Note that his analysis
+of how we could use the hashing code in making 32 bit NFS cookies work
+was probably more important than the actual algorithm.  Colin Plumb also
+contributed to it.
+
+Chris Mason dived right into our code, and in just a few months produced
+the journaling code that dramatically increased the value of ReiserFS.
+He is just an amazing programmer.
+
+Igor Zagorovsky is writing much of the new item handler and extent code
+for our next major release.
+
+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
+resizer, and is hard at work on implementing allocate on flush.  SGI
+implemented allocate on flush before us for XFS, and generously took
+the time to convince me we should do it also.  They are great people,
+and a great company.
+
+Yuri Shevchuk and Nikita Danilov are doing squid cache optimization.
+
+Vitaly Fertman is doing fsck.
+
+Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably
+the endian safe patches which allow ReiserFS to run on any platform
+supported by the Linux kernel.
+
+SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the
+Alpha PC Company made it possible for me to not have a day job
+anymore, and to dramatically increase our staffing.  Ecila funded
+hypertext feature development, MP3.com funded journaling, SuSE funded
+core development, IntegratedLinux.com funded squid web cache
+appliances, bigstorage.com funded HSM, and the alpha PC company funded
+the alpha port.  Many of these tasks were helped by sponsors other
+than the ones just named.  SuSE has helped in much more than just
+funding....
+
diff --git a/kernel/fs/reiserfs/acl.h b/kernel/fs/reiserfs/acl.h
new file mode 100644
index 000000000..4a211f5b3
--- /dev/null
+++ b/kernel/fs/reiserfs/acl.h
@@ -0,0 +1,76 @@
+#include <linux/init.h>
+#include <linux/posix_acl.h>
+
+#define REISERFS_ACL_VERSION	0x0001
+
+typedef struct {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+} reiserfs_acl_entry;
+
+typedef struct {
+	__le16 e_tag;
+	__le16 e_perm;
+} reiserfs_acl_entry_short;
+
+typedef struct {
+	__le32 a_version;
+} reiserfs_acl_header;
+
+static inline size_t reiserfs_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(reiserfs_acl_header) +
+		    count * sizeof(reiserfs_acl_entry_short);
+	} else {
+		return sizeof(reiserfs_acl_header) +
+		    4 * sizeof(reiserfs_acl_entry_short) +
+		    (count - 4) * sizeof(reiserfs_acl_entry);
+	}
+}
+
+static inline int reiserfs_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(reiserfs_acl_header);
+	s = size - 4 * sizeof(reiserfs_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(reiserfs_acl_entry_short))
+			return -1;
+		return size / sizeof(reiserfs_acl_entry_short);
+	} else {
+		if (s % sizeof(reiserfs_acl_entry))
+			return -1;
+		return s / sizeof(reiserfs_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
+struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
+int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int reiserfs_acl_chmod(struct inode *inode);
+int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
+				 struct inode *dir, struct dentry *dentry,
+				 struct inode *inode);
+int reiserfs_cache_default_acl(struct inode *dir);
+
+#else
+
+#define reiserfs_cache_default_acl(inode) 0
+#define reiserfs_get_acl NULL
+#define reiserfs_set_acl NULL
+
+static inline int reiserfs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+static inline int
+reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
+			     const struct inode *dir, struct dentry *dentry,
+			     struct inode *inode)
+{
+	return 0;
+}
+#endif
diff --git a/kernel/fs/reiserfs/bitmap.c b/kernel/fs/reiserfs/bitmap.c
new file mode 100644
index 000000000..dc198bc64
--- /dev/null
+++ b/kernel/fs/reiserfs/bitmap.c
@@ -0,0 +1,1468 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+/* Reiserfs block (de)allocator, bitmap-based. */
+
+#include <linux/time.h>
+#include "reiserfs.h"
+#include <linux/errno.h>
+#include <linux/buffer_head.h>
+#include <linux/kernel.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+
+#define PREALLOCATION_SIZE 9
+
+/* different reiserfs block allocator options */
+
+#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)
+
+#define  _ALLOC_concentrating_formatted_nodes 0
+#define  _ALLOC_displacing_large_files 1
+#define  _ALLOC_displacing_new_packing_localities 2
+#define  _ALLOC_old_hashed_relocation 3
+#define  _ALLOC_new_hashed_relocation 4
+#define  _ALLOC_skip_busy 5
+#define  _ALLOC_displace_based_on_dirid 6
+#define  _ALLOC_hashed_formatted_nodes 7
+#define  _ALLOC_old_way 8
+#define  _ALLOC_hundredth_slices 9
+#define  _ALLOC_dirid_groups 10
+#define  _ALLOC_oid_groups 11
+#define  _ALLOC_packing_groups 12
+
+#define  concentrating_formatted_nodes(s)	test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
+#define  displacing_large_files(s)		test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
+#define  displacing_new_packing_localities(s)	test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))
+
+#define SET_OPTION(optname) \
+   do { \
+	reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \
+	set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
+    } while(0)
+#define TEST_OPTION(optname, s) \
+    test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
+
+static inline void get_bit_address(struct super_block *s,
+				   b_blocknr_t block,
+				   unsigned int *bmap_nr,
+				   unsigned int *offset)
+{
+	/*
+	 * It is in the bitmap block number equal to the block
+	 * number divided by the number of bits in a block.
+	 */
+	*bmap_nr = block >> (s->s_blocksize_bits + 3);
+	/* Within that bitmap block it is located at bit offset *offset. */
+	*offset = block & ((s->s_blocksize << 3) - 1);
+}
+
+int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
+{
+	unsigned int bmap, offset;
+	unsigned int bmap_count = reiserfs_bmap_count(s);
+
+	if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
+		reiserfs_error(s, "vs-4010",
+			       "block number is out of range %lu (%u)",
+			       block, SB_BLOCK_COUNT(s));
+		return 0;
+	}
+
+	get_bit_address(s, block, &bmap, &offset);
+
+	/*
+	 * Old format filesystem? Unlikely, but the bitmaps are all
+	 * up front so we need to account for it.
+	 */
+	if (unlikely(test_bit(REISERFS_OLD_FORMAT,
+			      &REISERFS_SB(s)->s_properties))) {
+		b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
+		if (block >= bmap1 &&
+		    block <= bmap1 + bmap_count) {
+			reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) "
+				       "can't be freed or reused",
+				       block, bmap_count);
+			return 0;
+		}
+	} else {
+		if (offset == 0) {
+			reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) "
+				       "can't be freed or reused",
+				       block, bmap_count);
+			return 0;
+		}
+	}
+
+	if (bmap >= bmap_count) {
+		reiserfs_error(s, "vs-4030", "bitmap for requested block "
+			       "is out of range: block=%lu, bitmap_nr=%u",
+			       block, bmap);
+		return 0;
+	}
+
+	if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
+		reiserfs_error(s, "vs-4050", "this is root block (%u), "
+			       "it must be busy", SB_ROOT_BLOCK(s));
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Searches in journal structures for a given block number (bmap, off).
+ * If block is found in reiserfs journal it suggests next free block
+ * candidate to test.
+ */
+static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
+				      int off, int *next)
+{
+	b_blocknr_t tmp;
+
+	if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) {
+		if (tmp) {	/* hint supplied */
+			*next = tmp;
+			PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
+		} else {
+			(*next) = off + 1;  /* inc offset to avoid looping. */
+			PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
+		}
+		PROC_INFO_INC(s, scan_bitmap.retry);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Searches for a window of zero bits with given minimum and maximum
+ * lengths in one bitmap block
+ */
+static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
+			     unsigned int bmap_n, int *beg, int boundary,
+			     int min, int max, int unfm)
+{
+	struct super_block *s = th->t_super;
+	struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
+	struct buffer_head *bh;
+	int end, next;
+	int org = *beg;
+
+	BUG_ON(!th->t_trans_id);
+	RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
+	       "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
+	PROC_INFO_INC(s, scan_bitmap.bmap);
+
+	if (!bi) {
+		reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
+			       "for bitmap %d", bmap_n);
+		return 0;
+	}
+
+	bh = reiserfs_read_bitmap_block(s, bmap_n);
+	if (bh == NULL)
+		return 0;
+
+	while (1) {
+cont:
+		if (bi->free_count < min) {
+			brelse(bh);
+			return 0;	/* No free blocks in this bitmap */
+		}
+
+		/* search for a first zero bit -- beginning of a window */
+		*beg = reiserfs_find_next_zero_le_bit
+		    ((unsigned long *)(bh->b_data), boundary, *beg);
+
+		/*
+		 * search for a zero bit fails or the rest of bitmap block
+		 * cannot contain a zero window of minimum size
+		 */
+		if (*beg + min > boundary) {
+			brelse(bh);
+			return 0;
+		}
+
+		if (unfm && is_block_in_journal(s, bmap_n, *beg, beg))
+			continue;
+		/* first zero bit found; we check next bits */
+		for (end = *beg + 1;; end++) {
+			if (end >= *beg + max || end >= boundary
+			    || reiserfs_test_le_bit(end, bh->b_data)) {
+				next = end;
+				break;
+			}
+
+			/*
+			 * finding the other end of zero bit window requires
+			 * looking into journal structures (in case of
+			 * searching for free blocks for unformatted nodes)
+			 */
+			if (unfm && is_block_in_journal(s, bmap_n, end, &next))
+				break;
+		}
+
+		/*
+		 * now (*beg) points to beginning of zero bits window,
+		 * (end) points to one bit after the window end
+		 */
+
+		/* found window of proper size */
+		if (end - *beg >= min) {
+			int i;
+			reiserfs_prepare_for_journal(s, bh, 1);
+			/*
+			 * try to set all blocks used checking are
+			 * they still free
+			 */
+			for (i = *beg; i < end; i++) {
+				/* Don't check in journal again. */
+				if (reiserfs_test_and_set_le_bit
+				    (i, bh->b_data)) {
+					/*
+					 * bit was set by another process while
+					 * we slept in prepare_for_journal()
+					 */
+					PROC_INFO_INC(s, scan_bitmap.stolen);
+
+					/*
+					 * we can continue with smaller set
+					 * of allocated blocks, if length of
+					 * this set is more or equal to `min'
+					 */
+					if (i >= *beg + min) {
+						end = i;
+						break;
+					}
+
+					/*
+					 * otherwise we clear all bit
+					 * were set ...
+					 */
+					while (--i >= *beg)
+						reiserfs_clear_le_bit
+						    (i, bh->b_data);
+					reiserfs_restore_prepared_buffer(s, bh);
+					*beg = org;
+
+					/*
+					 * Search again in current block
+					 * from beginning
+					 */
+					goto cont;
+				}
+			}
+			bi->free_count -= (end - *beg);
+			journal_mark_dirty(th, bh);
+			brelse(bh);
+
+			/* free block count calculation */
+			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
+						     1);
+			PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
+			journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
+
+			return end - (*beg);
+		} else {
+			*beg = next;
+		}
+	}
+}
+
+static int bmap_hash_id(struct super_block *s, u32 id)
+{
+	char *hash_in = NULL;
+	unsigned long hash;
+	unsigned bm;
+
+	if (id <= 2) {
+		bm = 1;
+	} else {
+		hash_in = (char *)(&id);
+		hash = keyed_hash(hash_in, 4);
+		bm = hash % reiserfs_bmap_count(s);
+		if (!bm)
+			bm = 1;
+	}
+	/* this can only be true when SB_BMAP_NR = 1 */
+	if (bm >= reiserfs_bmap_count(s))
+		bm = 0;
+	return bm;
+}
+
+/*
+ * hashes the id and then returns > 0 if the block group for the
+ * corresponding hash is full
+ */
+static inline int block_group_used(struct super_block *s, u32 id)
+{
+	int bm = bmap_hash_id(s, id);
+	struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
+
+	/*
+	 * If we don't have cached information on this bitmap block, we're
+	 * going to have to load it later anyway. Loading it here allows us
+	 * to make a better decision. This favors long-term performance gain
+	 * with a better on-disk layout vs. a short term gain of skipping the
+	 * read and potentially having a bad placement.
+	 */
+	if (info->free_count == UINT_MAX) {
+		struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
+		brelse(bh);
+	}
+
+	if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) {
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * the packing is returned in disk byte order
+ */
+__le32 reiserfs_choose_packing(struct inode * dir)
+{
+	__le32 packing;
+	if (TEST_OPTION(packing_groups, dir->i_sb)) {
+		u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
+		/*
+		 * some versions of reiserfsck expect packing locality 1 to be
+		 * special
+		 */
+		if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
+			packing = INODE_PKEY(dir)->k_objectid;
+		else
+			packing = INODE_PKEY(dir)->k_dir_id;
+	} else
+		packing = INODE_PKEY(dir)->k_objectid;
+	return packing;
+}
+
+/*
+ * Tries to find contiguous zero bit window (given size) in given region of
+ * bitmap and place new blocks there. Returns number of allocated blocks.
+ */
+static int scan_bitmap(struct reiserfs_transaction_handle *th,
+		       b_blocknr_t * start, b_blocknr_t finish,
+		       int min, int max, int unfm, sector_t file_block)
+{
+	int nr_allocated = 0;
+	struct super_block *s = th->t_super;
+	unsigned int bm, off;
+	unsigned int end_bm, end_off;
+	unsigned int off_max = s->s_blocksize << 3;
+
+	BUG_ON(!th->t_trans_id);
+	PROC_INFO_INC(s, scan_bitmap.call);
+
+	/* No point in looking for more free blocks */
+	if (SB_FREE_BLOCKS(s) <= 0)
+		return 0;
+
+	get_bit_address(s, *start, &bm, &off);
+	get_bit_address(s, finish, &end_bm, &end_off);
+	if (bm > reiserfs_bmap_count(s))
+		return 0;
+	if (end_bm > reiserfs_bmap_count(s))
+		end_bm = reiserfs_bmap_count(s);
+
+	/*
+	 * When the bitmap is more than 10% free, anyone can allocate.
+	 * When it's less than 10% free, only files that already use the
+	 * bitmap are allowed. Once we pass 80% full, this restriction
+	 * is lifted.
+	 *
+	 * We do this so that files that grow later still have space close to
+	 * their original allocation. This improves locality, and presumably
+	 * performance as a result.
+	 *
+	 * This is only an allocation policy and does not make up for getting a
+	 * bad hint. Decent hinting must be implemented for this to work well.
+	 */
+	if (TEST_OPTION(skip_busy, s)
+	    && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) {
+		for (; bm < end_bm; bm++, off = 0) {
+			if ((off && (!unfm || (file_block != 0)))
+			    || SB_AP_BITMAP(s)[bm].free_count >
+			    (s->s_blocksize << 3) / 10)
+				nr_allocated =
+				    scan_bitmap_block(th, bm, &off, off_max,
+						      min, max, unfm);
+			if (nr_allocated)
+				goto ret;
+		}
+		/* we know from above that start is a reasonable number */
+		get_bit_address(s, *start, &bm, &off);
+	}
+
+	for (; bm < end_bm; bm++, off = 0) {
+		nr_allocated =
+		    scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
+		if (nr_allocated)
+			goto ret;
+	}
+
+	nr_allocated =
+	    scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
+
+ret:
+	*start = bm * off_max + off;
+	return nr_allocated;
+
+}
+
+static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
+				 struct inode *inode, b_blocknr_t block,
+				 int for_unformatted)
+{
+	struct super_block *s = th->t_super;
+	struct reiserfs_super_block *rs;
+	struct buffer_head *sbh, *bmbh;
+	struct reiserfs_bitmap_info *apbi;
+	unsigned int nr, offset;
+
+	BUG_ON(!th->t_trans_id);
+	PROC_INFO_INC(s, free_block);
+	rs = SB_DISK_SUPER_BLOCK(s);
+	sbh = SB_BUFFER_WITH_SB(s);
+	apbi = SB_AP_BITMAP(s);
+
+	get_bit_address(s, block, &nr, &offset);
+
+	if (nr >= reiserfs_bmap_count(s)) {
+		reiserfs_error(s, "vs-4075", "block %lu is out of range",
+			       block);
+		return;
+	}
+
+	bmbh = reiserfs_read_bitmap_block(s, nr);
+	if (!bmbh)
+		return;
+
+	reiserfs_prepare_for_journal(s, bmbh, 1);
+
+	/* clear bit for the given block in bit map */
+	if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
+		reiserfs_error(s, "vs-4080",
+			       "block %lu: bit already cleared", block);
+	}
+	apbi[nr].free_count++;
+	journal_mark_dirty(th, bmbh);
+	brelse(bmbh);
+
+	reiserfs_prepare_for_journal(s, sbh, 1);
+	/* update super block */
+	set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
+
+	journal_mark_dirty(th, sbh);
+	if (for_unformatted) {
+		int depth = reiserfs_write_unlock_nested(s);
+		dquot_free_block_nodirty(inode, 1);
+		reiserfs_write_lock_nested(s, depth);
+	}
+}
+
+void reiserfs_free_block(struct reiserfs_transaction_handle *th,
+			 struct inode *inode, b_blocknr_t block,
+			 int for_unformatted)
+{
+	struct super_block *s = th->t_super;
+
+	BUG_ON(!th->t_trans_id);
+	RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
+	if (!is_reusable(s, block, 1))
+		return;
+
+	if (block > sb_block_count(REISERFS_SB(s)->s_rs)) {
+		reiserfs_error(th->t_super, "bitmap-4072",
+			       "Trying to free block outside file system "
+			       "boundaries (%lu > %lu)",
+			       block, sb_block_count(REISERFS_SB(s)->s_rs));
+		return;
+	}
+	/* mark it before we clear it, just in case */
+	journal_mark_freed(th, s, block);
+	_reiserfs_free_block(th, inode, block, for_unformatted);
+}
+
+/* preallocated blocks don't need to be run through journal_mark_freed */
+static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th,
+					 struct inode *inode, b_blocknr_t block)
+{
+	BUG_ON(!th->t_trans_id);
+	RFALSE(!th->t_super,
+	       "vs-4060: trying to free block on nonexistent device");
+	if (!is_reusable(th->t_super, block, 1))
+		return;
+	_reiserfs_free_block(th, inode, block, 1);
+}
+
+static void __discard_prealloc(struct reiserfs_transaction_handle *th,
+			       struct reiserfs_inode_info *ei)
+{
+	unsigned long save = ei->i_prealloc_block;
+	int dirty = 0;
+	struct inode *inode = &ei->vfs_inode;
+
+	BUG_ON(!th->t_trans_id);
+#ifdef CONFIG_REISERFS_CHECK
+	if (ei->i_prealloc_count < 0)
+		reiserfs_error(th->t_super, "zam-4001",
+			       "inode has negative prealloc blocks count.");
+#endif
+	while (ei->i_prealloc_count > 0) {
+		reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block);
+		ei->i_prealloc_block++;
+		ei->i_prealloc_count--;
+		dirty = 1;
+	}
+	if (dirty)
+		reiserfs_update_sd(th, inode);
+	ei->i_prealloc_block = save;
+	list_del_init(&ei->i_prealloc_list);
+}
+
+/* FIXME: It should be inline function */
+void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
+			       struct inode *inode)
+{
+	struct reiserfs_inode_info *ei = REISERFS_I(inode);
+
+	BUG_ON(!th->t_trans_id);
+	if (ei->i_prealloc_count)
+		__discard_prealloc(th, ei);
+}
+
+void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
+{
+	struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
+
+	BUG_ON(!th->t_trans_id);
+	while (!list_empty(plist)) {
+		struct reiserfs_inode_info *ei;
+		ei = list_entry(plist->next, struct reiserfs_inode_info,
+				i_prealloc_list);
+#ifdef CONFIG_REISERFS_CHECK
+		if (!ei->i_prealloc_count) {
+			reiserfs_error(th->t_super, "zam-4001",
+				       "inode is in prealloc list but has "
+				       "no preallocated blocks.");
+		}
+#endif
+		__discard_prealloc(th, ei);
+	}
+}
+
+void reiserfs_init_alloc_options(struct super_block *s)
+{
+	set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
+	set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
+	set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
+}
+
+/* block allocator related options are parsed here */
+int reiserfs_parse_alloc_options(struct super_block *s, char *options)
+{
+	char *this_char, *value;
+
+	/* clear default settings */
+	REISERFS_SB(s)->s_alloc_options.bits = 0;
+
+	while ((this_char = strsep(&options, ":")) != NULL) {
+		if ((value = strchr(this_char, '=')) != NULL)
+			*value++ = 0;
+
+		if (!strcmp(this_char, "concentrating_formatted_nodes")) {
+			int temp;
+			SET_OPTION(concentrating_formatted_nodes);
+			temp = (value
+				&& *value) ? simple_strtoul(value, &value,
+							    0) : 10;
+			if (temp <= 0 || temp > 100) {
+				REISERFS_SB(s)->s_alloc_options.border = 10;
+			} else {
+				REISERFS_SB(s)->s_alloc_options.border =
+				    100 / temp;
+			}
+			continue;
+		}
+		if (!strcmp(this_char, "displacing_large_files")) {
+			SET_OPTION(displacing_large_files);
+			REISERFS_SB(s)->s_alloc_options.large_file_size =
+			    (value
+			     && *value) ? simple_strtoul(value, &value, 0) : 16;
+			continue;
+		}
+		if (!strcmp(this_char, "displacing_new_packing_localities")) {
+			SET_OPTION(displacing_new_packing_localities);
+			continue;
+		}
+
+		if (!strcmp(this_char, "old_hashed_relocation")) {
+			SET_OPTION(old_hashed_relocation);
+			continue;
+		}
+
+		if (!strcmp(this_char, "new_hashed_relocation")) {
+			SET_OPTION(new_hashed_relocation);
+			continue;
+		}
+
+		if (!strcmp(this_char, "dirid_groups")) {
+			SET_OPTION(dirid_groups);
+			continue;
+		}
+		if (!strcmp(this_char, "oid_groups")) {
+			SET_OPTION(oid_groups);
+			continue;
+		}
+		if (!strcmp(this_char, "packing_groups")) {
+			SET_OPTION(packing_groups);
+			continue;
+		}
+		if (!strcmp(this_char, "hashed_formatted_nodes")) {
+			SET_OPTION(hashed_formatted_nodes);
+			continue;
+		}
+
+		if (!strcmp(this_char, "skip_busy")) {
+			SET_OPTION(skip_busy);
+			continue;
+		}
+
+		if (!strcmp(this_char, "hundredth_slices")) {
+			SET_OPTION(hundredth_slices);
+			continue;
+		}
+
+		if (!strcmp(this_char, "old_way")) {
+			SET_OPTION(old_way);
+			continue;
+		}
+
+		if (!strcmp(this_char, "displace_based_on_dirid")) {
+			SET_OPTION(displace_based_on_dirid);
+			continue;
+		}
+
+		if (!strcmp(this_char, "preallocmin")) {
+			REISERFS_SB(s)->s_alloc_options.preallocmin =
+			    (value
+			     && *value) ? simple_strtoul(value, &value, 0) : 4;
+			continue;
+		}
+
+		if (!strcmp(this_char, "preallocsize")) {
+			REISERFS_SB(s)->s_alloc_options.preallocsize =
+			    (value
+			     && *value) ? simple_strtoul(value, &value,
+							 0) :
+			    PREALLOCATION_SIZE;
+			continue;
+		}
+
+		reiserfs_warning(s, "zam-4001", "unknown option - %s",
+				 this_char);
+		return 1;
+	}
+
+	reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
+	return 0;
+}
+
+static void print_sep(struct seq_file *seq, int *first)
+{
+	if (!*first)
+		seq_puts(seq, ":");
+	else
+		*first = 0;
+}
+
+void show_alloc_options(struct seq_file *seq, struct super_block *s)
+{
+	int first = 1;
+
+	if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) |
+		(1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups)))
+		return;
+
+	seq_puts(seq, ",alloc=");
+
+	if (TEST_OPTION(concentrating_formatted_nodes, s)) {
+		print_sep(seq, &first);
+		if (REISERFS_SB(s)->s_alloc_options.border != 10) {
+			seq_printf(seq, "concentrating_formatted_nodes=%d",
+				100 / REISERFS_SB(s)->s_alloc_options.border);
+		} else
+			seq_puts(seq, "concentrating_formatted_nodes");
+	}
+	if (TEST_OPTION(displacing_large_files, s)) {
+		print_sep(seq, &first);
+		if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) {
+			seq_printf(seq, "displacing_large_files=%lu",
+			    REISERFS_SB(s)->s_alloc_options.large_file_size);
+		} else
+			seq_puts(seq, "displacing_large_files");
+	}
+	if (TEST_OPTION(displacing_new_packing_localities, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "displacing_new_packing_localities");
+	}
+	if (TEST_OPTION(old_hashed_relocation, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "old_hashed_relocation");
+	}
+	if (TEST_OPTION(new_hashed_relocation, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "new_hashed_relocation");
+	}
+	if (TEST_OPTION(dirid_groups, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "dirid_groups");
+	}
+	if (TEST_OPTION(oid_groups, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "oid_groups");
+	}
+	if (TEST_OPTION(packing_groups, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "packing_groups");
+	}
+	if (TEST_OPTION(hashed_formatted_nodes, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "hashed_formatted_nodes");
+	}
+	if (TEST_OPTION(skip_busy, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "skip_busy");
+	}
+	if (TEST_OPTION(hundredth_slices, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "hundredth_slices");
+	}
+	if (TEST_OPTION(old_way, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "old_way");
+	}
+	if (TEST_OPTION(displace_based_on_dirid, s)) {
+		print_sep(seq, &first);
+		seq_puts(seq, "displace_based_on_dirid");
+	}
+	if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) {
+		print_sep(seq, &first);
+		seq_printf(seq, "preallocmin=%d",
+				REISERFS_SB(s)->s_alloc_options.preallocmin);
+	}
+	if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) {
+		print_sep(seq, &first);
+		seq_printf(seq, "preallocsize=%d",
+				REISERFS_SB(s)->s_alloc_options.preallocsize);
+	}
+}
+
+static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
+{
+	char *hash_in;
+
+	if (hint->formatted_node) {
+		hash_in = (char *)&hint->key.k_dir_id;
+	} else {
+		if (!hint->inode) {
+			/*hint->search_start = hint->beg;*/
+			hash_in = (char *)&hint->key.k_dir_id;
+		} else
+		    if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+			hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
+		else
+			hash_in =
+			    (char *)(&INODE_PKEY(hint->inode)->k_objectid);
+	}
+
+	hint->search_start =
+	    hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
+}
+
+/*
+ * Relocation based on dirid, hashing them into a given bitmap block
+ * files. Formatted nodes are unaffected, a separate policy covers them
+ */
+static void dirid_groups(reiserfs_blocknr_hint_t * hint)
+{
+	unsigned long hash;
+	__u32 dirid = 0;
+	int bm = 0;
+	struct super_block *sb = hint->th->t_super;
+
+	if (hint->inode)
+		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+	else if (hint->formatted_node)
+		dirid = hint->key.k_dir_id;
+
+	if (dirid) {
+		bm = bmap_hash_id(sb, dirid);
+		hash = bm * (sb->s_blocksize << 3);
+		/* give a portion of the block group to metadata */
+		if (hint->inode)
+			hash += sb->s_blocksize / 2;
+		hint->search_start = hash;
+	}
+}
+
+/*
+ * Relocation based on oid, hashing them into a given bitmap block
+ * files. Formatted nodes are unaffected, a separate policy covers them
+ */
+static void oid_groups(reiserfs_blocknr_hint_t * hint)
+{
+	if (hint->inode) {
+		unsigned long hash;
+		__u32 oid;
+		__u32 dirid;
+		int bm;
+
+		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+
+		/*
+		 * keep the root dir and it's first set of subdirs close to
+		 * the start of the disk
+		 */
+		if (dirid <= 2)
+			hash = (hint->inode->i_sb->s_blocksize << 3);
+		else {
+			oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
+			bm = bmap_hash_id(hint->inode->i_sb, oid);
+			hash = bm * (hint->inode->i_sb->s_blocksize << 3);
+		}
+		hint->search_start = hash;
+	}
+}
+
+/*
+ * returns 1 if it finds an indirect item and gets valid hint info
+ * from it, otherwise 0
+ */
+static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
+{
+	struct treepath *path;
+	struct buffer_head *bh;
+	struct item_head *ih;
+	int pos_in_item;
+	__le32 *item;
+	int ret = 0;
+
+	/*
+	 * reiserfs code can call this function w/o pointer to path
+	 * structure supplied; then we rely on supplied search_start
+	 */
+	if (!hint->path)
+		return 0;
+
+	path = hint->path;
+	bh = get_last_bh(path);
+	RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
+	ih = tp_item_head(path);
+	pos_in_item = path->pos_in_item;
+	item = tp_item_body(path);
+
+	hint->search_start = bh->b_blocknr;
+
+	/*
+	 * for indirect item: go to left and look for the first non-hole entry
+	 * in the indirect item
+	 */
+	if (!hint->formatted_node && is_indirect_le_ih(ih)) {
+		if (pos_in_item == I_UNFM_NUM(ih))
+			pos_in_item--;
+		while (pos_in_item >= 0) {
+			int t = get_block_num(item, pos_in_item);
+			if (t) {
+				hint->search_start = t;
+				ret = 1;
+				break;
+			}
+			pos_in_item--;
+		}
+	}
+
+	/* does result value fit into specified region? */
+	return ret;
+}
+
+/*
+ * should be, if formatted node, then try to put on first part of the device
+ * specified as number of percent with mount option device, else try to put
+ * on last of device.  This is not to say it is good code to do so,
+ * but the effect should be measured.
+ */
+static inline void set_border_in_hint(struct super_block *s,
+				      reiserfs_blocknr_hint_t * hint)
+{
+	b_blocknr_t border =
+	    SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
+
+	if (hint->formatted_node)
+		hint->end = border - 1;
+	else
+		hint->beg = border;
+}
+
+static inline void displace_large_file(reiserfs_blocknr_hint_t * hint)
+{
+	if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+		hint->search_start =
+		    hint->beg +
+		    keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id),
+			       4) % (hint->end - hint->beg);
+	else
+		hint->search_start =
+		    hint->beg +
+		    keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid),
+			       4) % (hint->end - hint->beg);
+}
+
+static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint)
+{
+	char *hash_in;
+
+	if (!hint->inode)
+		hash_in = (char *)&hint->key.k_dir_id;
+	else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
+	else
+		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
+
+	hint->search_start =
+	    hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
+}
+
+static inline int
+this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *
+						   hint)
+{
+	return hint->block ==
+	    REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
+}
+
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint)
+{
+	struct in_core_key *key = &hint->key;
+
+	hint->th->displace_new_blocks = 0;
+	hint->search_start =
+	    hint->beg + keyed_hash((char *)(&key->k_objectid),
+				   4) % (hint->end - hint->beg);
+}
+#endif
+
+static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint)
+{
+	b_blocknr_t border;
+	u32 hash_in;
+
+	if (hint->formatted_node || hint->inode == NULL) {
+		return 0;
+	}
+
+	hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
+	border =
+	    hint->beg + (u32) keyed_hash(((char *)(&hash_in)),
+					 4) % (hint->end - hint->beg - 1);
+	if (border > hint->search_start)
+		hint->search_start = border;
+
+	return 1;
+}
+
+static inline int old_way(reiserfs_blocknr_hint_t * hint)
+{
+	b_blocknr_t border;
+
+	if (hint->formatted_node || hint->inode == NULL) {
+		return 0;
+	}
+
+	border =
+	    hint->beg +
+	    le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end -
+							      hint->beg);
+	if (border > hint->search_start)
+		hint->search_start = border;
+
+	return 1;
+}
+
+static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint)
+{
+	struct in_core_key *key = &hint->key;
+	b_blocknr_t slice_start;
+
+	slice_start =
+	    (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100);
+	if (slice_start > hint->search_start
+	    || slice_start + (hint->end / 100) <= hint->search_start) {
+		hint->search_start = slice_start;
+	}
+}
+
+static void determine_search_start(reiserfs_blocknr_hint_t * hint,
+				   int amount_needed)
+{
+	struct super_block *s = hint->th->t_super;
+	int unfm_hint;
+
+	hint->beg = 0;
+	hint->end = SB_BLOCK_COUNT(s) - 1;
+
+	/* This is former border algorithm. Now with tunable border offset */
+	if (concentrating_formatted_nodes(s))
+		set_border_in_hint(s, hint);
+
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+	/*
+	 * whenever we create a new directory, we displace it.  At first
+	 * we will hash for location, later we might look for a moderately
+	 * empty place for it
+	 */
+	if (displacing_new_packing_localities(s)
+	    && hint->th->displace_new_blocks) {
+		displace_new_packing_locality(hint);
+
+		/*
+		 * we do not continue determine_search_start,
+		 * if new packing locality is being displaced
+		 */
+		return;
+	}
+#endif
+
+	/*
+	 * all persons should feel encouraged to add more special cases
+	 * here and test them
+	 */
+
+	if (displacing_large_files(s) && !hint->formatted_node
+	    && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
+		displace_large_file(hint);
+		return;
+	}
+
+	/*
+	 * if none of our special cases is relevant, use the left
+	 * neighbor in the tree order of the new node we are allocating for
+	 */
+	if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
+		hash_formatted_node(hint);
+		return;
+	}
+
+	unfm_hint = get_left_neighbor(hint);
+
+	/*
+	 * Mimic old block allocator behaviour, that is if VFS allowed for
+	 * preallocation, new blocks are displaced based on directory ID.
+	 * Also, if suggested search_start is less than last preallocated
+	 * block, we start searching from it, assuming that HDD dataflow
+	 * is faster in forward direction
+	 */
+	if (TEST_OPTION(old_way, s)) {
+		if (!hint->formatted_node) {
+			if (!reiserfs_hashed_relocation(s))
+				old_way(hint);
+			else if (!reiserfs_no_unhashed_relocation(s))
+				old_hashed_relocation(hint);
+
+			if (hint->inode
+			    && hint->search_start <
+			    REISERFS_I(hint->inode)->i_prealloc_block)
+				hint->search_start =
+				    REISERFS_I(hint->inode)->i_prealloc_block;
+		}
+		return;
+	}
+
+	/* This is an approach proposed by Hans */
+	if (TEST_OPTION(hundredth_slices, s)
+	    && !(displacing_large_files(s) && !hint->formatted_node)) {
+		hundredth_slices(hint);
+		return;
+	}
+
+	/* old_hashed_relocation only works on unformatted */
+	if (!unfm_hint && !hint->formatted_node &&
+	    TEST_OPTION(old_hashed_relocation, s)) {
+		old_hashed_relocation(hint);
+	}
+
+	/* new_hashed_relocation works with both formatted/unformatted nodes */
+	if ((!unfm_hint || hint->formatted_node) &&
+	    TEST_OPTION(new_hashed_relocation, s)) {
+		new_hashed_relocation(hint);
+	}
+
+	/* dirid grouping works only on unformatted nodes */
+	if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
+		dirid_groups(hint);
+	}
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+	if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
+		dirid_groups(hint);
+	}
+#endif
+
+	/* oid grouping works only on unformatted nodes */
+	if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) {
+		oid_groups(hint);
+	}
+	return;
+}
+
+static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
+{
+	/* make minimum size a mount option and benchmark both ways */
+	/* we preallocate blocks only for regular files, specific size */
+	/* benchmark preallocating always and see what happens */
+
+	hint->prealloc_size = 0;
+
+	if (!hint->formatted_node && hint->preallocate) {
+		if (S_ISREG(hint->inode->i_mode)
+		    && hint->inode->i_size >=
+		    REISERFS_SB(hint->th->t_super)->s_alloc_options.
+		    preallocmin * hint->inode->i_sb->s_blocksize)
+			hint->prealloc_size =
+			    REISERFS_SB(hint->th->t_super)->s_alloc_options.
+			    preallocsize - 1;
+	}
+	return CARRY_ON;
+}
+
+static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
+						 b_blocknr_t * new_blocknrs,
+						 b_blocknr_t start,
+						 b_blocknr_t finish, int min,
+						 int amount_needed,
+						 int prealloc_size)
+{
+	int rest = amount_needed;
+	int nr_allocated;
+
+	while (rest > 0 && start <= finish) {
+		nr_allocated = scan_bitmap(hint->th, &start, finish, min,
+					   rest + prealloc_size,
+					   !hint->formatted_node, hint->block);
+
+		if (nr_allocated == 0)	/* no new blocks allocated, return */
+			break;
+
+		/* fill free_blocknrs array first */
+		while (rest > 0 && nr_allocated > 0) {
+			*new_blocknrs++ = start++;
+			rest--;
+			nr_allocated--;
+		}
+
+		/* do we have something to fill prealloc. array also ? */
+		if (nr_allocated > 0) {
+			/*
+			 * it means prealloc_size was greater that 0 and
+			 * we do preallocation
+			 */
+			list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
+				 &SB_JOURNAL(hint->th->t_super)->
+				 j_prealloc_list);
+			REISERFS_I(hint->inode)->i_prealloc_block = start;
+			REISERFS_I(hint->inode)->i_prealloc_count =
+			    nr_allocated;
+			break;
+		}
+	}
+
+	return (amount_needed - rest);
+}
+
+static inline int blocknrs_and_prealloc_arrays_from_search_start
+    (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs,
+     int amount_needed) {
+	struct super_block *s = hint->th->t_super;
+	b_blocknr_t start = hint->search_start;
+	b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
+	int passno = 0;
+	int nr_allocated = 0;
+	int depth;
+
+	determine_prealloc_size(hint);
+	if (!hint->formatted_node) {
+		int quota_ret;
+#ifdef REISERQUOTA_DEBUG
+		reiserfs_debug(s, REISERFS_DEBUG_CODE,
+			       "reiserquota: allocating %d blocks id=%u",
+			       amount_needed, hint->inode->i_uid);
+#endif
+		depth = reiserfs_write_unlock_nested(s);
+		quota_ret =
+		    dquot_alloc_block_nodirty(hint->inode, amount_needed);
+		if (quota_ret) {	/* Quota exceeded? */
+			reiserfs_write_lock_nested(s, depth);
+			return QUOTA_EXCEEDED;
+		}
+		if (hint->preallocate && hint->prealloc_size) {
+#ifdef REISERQUOTA_DEBUG
+			reiserfs_debug(s, REISERFS_DEBUG_CODE,
+				       "reiserquota: allocating (prealloc) %d blocks id=%u",
+				       hint->prealloc_size, hint->inode->i_uid);
+#endif
+			quota_ret = dquot_prealloc_block_nodirty(hint->inode,
+							 hint->prealloc_size);
+			if (quota_ret)
+				hint->preallocate = hint->prealloc_size = 0;
+		}
+		/* for unformatted nodes, force large allocations */
+		reiserfs_write_lock_nested(s, depth);
+	}
+
+	do {
+		switch (passno++) {
+		case 0:	/* Search from hint->search_start to end of disk */
+			start = hint->search_start;
+			finish = SB_BLOCK_COUNT(s) - 1;
+			break;
+		case 1:	/* Search from hint->beg to hint->search_start */
+			start = hint->beg;
+			finish = hint->search_start;
+			break;
+		case 2:	/* Last chance: Search from 0 to hint->beg */
+			start = 0;
+			finish = hint->beg;
+			break;
+		default:
+			/* We've tried searching everywhere, not enough space */
+			/* Free the blocks */
+			if (!hint->formatted_node) {
+#ifdef REISERQUOTA_DEBUG
+				reiserfs_debug(s, REISERFS_DEBUG_CODE,
+					       "reiserquota: freeing (nospace) %d blocks id=%u",
+					       amount_needed +
+					       hint->prealloc_size -
+					       nr_allocated,
+					       hint->inode->i_uid);
+#endif
+				/* Free not allocated blocks */
+				depth = reiserfs_write_unlock_nested(s);
+				dquot_free_block_nodirty(hint->inode,
+					amount_needed + hint->prealloc_size -
+					nr_allocated);
+				reiserfs_write_lock_nested(s, depth);
+			}
+			while (nr_allocated--)
+				reiserfs_free_block(hint->th, hint->inode,
+						    new_blocknrs[nr_allocated],
+						    !hint->formatted_node);
+
+			return NO_DISK_SPACE;
+		}
+	} while ((nr_allocated += allocate_without_wrapping_disk(hint,
+								 new_blocknrs +
+								 nr_allocated,
+								 start, finish,
+								 1,
+								 amount_needed -
+								 nr_allocated,
+								 hint->
+								 prealloc_size))
+		 < amount_needed);
+	if (!hint->formatted_node &&
+	    amount_needed + hint->prealloc_size >
+	    nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
+		/* Some of preallocation blocks were not allocated */
+#ifdef REISERQUOTA_DEBUG
+		reiserfs_debug(s, REISERFS_DEBUG_CODE,
+			       "reiserquota: freeing (failed prealloc) %d blocks id=%u",
+			       amount_needed + hint->prealloc_size -
+			       nr_allocated -
+			       REISERFS_I(hint->inode)->i_prealloc_count,
+			       hint->inode->i_uid);
+#endif
+
+		depth = reiserfs_write_unlock_nested(s);
+		dquot_free_block_nodirty(hint->inode, amount_needed +
+					 hint->prealloc_size - nr_allocated -
+					 REISERFS_I(hint->inode)->
+					 i_prealloc_count);
+		reiserfs_write_lock_nested(s, depth);
+	}
+
+	return CARRY_ON;
+}
+
+/* grab new blocknrs from preallocated list */
+/* return amount still needed after using them */
+static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
+					      b_blocknr_t * new_blocknrs,
+					      int amount_needed)
+{
+	struct inode *inode = hint->inode;
+
+	if (REISERFS_I(inode)->i_prealloc_count > 0) {
+		while (amount_needed) {
+
+			*new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++;
+			REISERFS_I(inode)->i_prealloc_count--;
+
+			amount_needed--;
+
+			if (REISERFS_I(inode)->i_prealloc_count <= 0) {
+				list_del(&REISERFS_I(inode)->i_prealloc_list);
+				break;
+			}
+		}
+	}
+	/* return amount still needed after using preallocated blocks */
+	return amount_needed;
+}
+
+int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
+			       b_blocknr_t *new_blocknrs,
+			       int amount_needed,
+			       /* Amount of blocks we have already reserved */
+			       int reserved_by_us)
+{
+	int initial_amount_needed = amount_needed;
+	int ret;
+	struct super_block *s = hint->th->t_super;
+
+	/* Check if there is enough space, taking into account reserved space */
+	if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
+	    amount_needed - reserved_by_us)
+		return NO_DISK_SPACE;
+	/* should this be if !hint->inode &&  hint->preallocate? */
+	/* do you mean hint->formatted_node can be removed ? - Zam */
+	/*
+	 * hint->formatted_node cannot be removed because we try to access
+	 * inode information here, and there is often no inode associated with
+	 * metadata allocations - green
+	 */
+
+	if (!hint->formatted_node && hint->preallocate) {
+		amount_needed = use_preallocated_list_if_available
+		    (hint, new_blocknrs, amount_needed);
+
+		/*
+		 * We have all the block numbers we need from the
+		 * prealloc list
+		 */
+		if (amount_needed == 0)
+			return CARRY_ON;
+		new_blocknrs += (initial_amount_needed - amount_needed);
+	}
+
+	/* find search start and save it in hint structure */
+	determine_search_start(hint, amount_needed);
+	if (hint->search_start >= SB_BLOCK_COUNT(s))
+		hint->search_start = SB_BLOCK_COUNT(s) - 1;
+
+	/* allocation itself; fill new_blocknrs and preallocation arrays */
+	ret = blocknrs_and_prealloc_arrays_from_search_start
+	    (hint, new_blocknrs, amount_needed);
+
+	/*
+	 * We used prealloc. list to fill (partially) new_blocknrs array.
+	 * If final allocation fails we need to return blocks back to
+	 * prealloc. list or just free them. -- Zam (I chose second
+	 * variant)
+	 */
+	if (ret != CARRY_ON) {
+		while (amount_needed++ < initial_amount_needed) {
+			reiserfs_free_block(hint->th, hint->inode,
+					    *(--new_blocknrs), 1);
+		}
+	}
+	return ret;
+}
+
+void reiserfs_cache_bitmap_metadata(struct super_block *sb,
+                                    struct buffer_head *bh,
+                                    struct reiserfs_bitmap_info *info)
+{
+	unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
+
+	/* The first bit must ALWAYS be 1 */
+	if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data))
+		reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is "
+			       "corrupted: first bit must be 1", bh->b_blocknr);
+
+	info->free_count = 0;
+
+	while (--cur >= (unsigned long *)bh->b_data) {
+		/* 0 and ~0 are special, we can optimize for them */
+		if (*cur == 0)
+			info->free_count += BITS_PER_LONG;
+		else if (*cur != ~0L)	/* A mix, investigate */
+			info->free_count += BITS_PER_LONG - hweight_long(*cur);
+	}
+}
+
+struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
+                                               unsigned int bitmap)
+{
+	b_blocknr_t block = (sb->s_blocksize << 3) * bitmap;
+	struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
+	struct buffer_head *bh;
+
+	/*
+	 * Way old format filesystems had the bitmaps packed up front.
+	 * I doubt there are any of these left, but just in case...
+	 */
+	if (unlikely(test_bit(REISERFS_OLD_FORMAT,
+			      &REISERFS_SB(sb)->s_properties)))
+		block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
+	else if (bitmap == 0)
+		block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
+
+	bh = sb_bread(sb, block);
+	if (bh == NULL)
+		reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
+		                 "reading failed", __func__, block);
+	else {
+		if (buffer_locked(bh)) {
+			int depth;
+			PROC_INFO_INC(sb, scan_bitmap.wait);
+			depth = reiserfs_write_unlock_nested(sb);
+			__wait_on_buffer(bh);
+			reiserfs_write_lock_nested(sb, depth);
+		}
+		BUG_ON(!buffer_uptodate(bh));
+		BUG_ON(atomic_read(&bh->b_count) == 0);
+
+		if (info->free_count == UINT_MAX)
+			reiserfs_cache_bitmap_metadata(sb, bh, info);
+	}
+
+	return bh;
+}
+
+int reiserfs_init_bitmap_cache(struct super_block *sb)
+{
+	struct reiserfs_bitmap_info *bitmap;
+	unsigned int bmap_nr = reiserfs_bmap_count(sb);
+
+	bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
+	if (bitmap == NULL)
+		return -ENOMEM;
+
+	memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr);
+
+	SB_AP_BITMAP(sb) = bitmap;
+
+	return 0;
+}
+
+void reiserfs_free_bitmap_cache(struct super_block *sb)
+{
+	if (SB_AP_BITMAP(sb)) {
+		vfree(SB_AP_BITMAP(sb));
+		SB_AP_BITMAP(sb) = NULL;
+	}
+}
diff --git a/kernel/fs/reiserfs/dir.c b/kernel/fs/reiserfs/dir.c
new file mode 100644
index 000000000..4a024e2ce
--- /dev/null
+++ b/kernel/fs/reiserfs/dir.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include "reiserfs.h"
+#include <linux/stat.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+extern const struct reiserfs_key MIN_KEY;
+
+static int reiserfs_readdir(struct file *, struct dir_context *);
+static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+			      int datasync);
+
+const struct file_operations reiserfs_dir_operations = {
+	.llseek = generic_file_llseek,
+	.read = generic_read_dir,
+	.iterate = reiserfs_readdir,
+	.fsync = reiserfs_dir_fsync,
+	.unlocked_ioctl = reiserfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = reiserfs_compat_ioctl,
+#endif
+};
+
+static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+			      int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int err;
+
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
+	reiserfs_write_lock(inode->i_sb);
+	err = reiserfs_commit_for_inode(inode);
+	reiserfs_write_unlock(inode->i_sb);
+	mutex_unlock(&inode->i_mutex);
+	if (err < 0)
+		return err;
+	return 0;
+}
+
+#define store_ih(where,what) copy_item_head (where, what)
+
+static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
+{
+	struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
+	return (d_really_is_positive(privroot) &&
+	        deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid);
+}
+
+int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
+{
+
+	/* key of current position in the directory (key of directory entry) */
+	struct cpu_key pos_key;
+
+	INITIALIZE_PATH(path_to_entry);
+	struct buffer_head *bh;
+	int item_num, entry_num;
+	const struct reiserfs_key *rkey;
+	struct item_head *ih, tmp_ih;
+	int search_res;
+	char *local_buf;
+	loff_t next_pos;
+	char small_buf[32];	/* avoid kmalloc if we can */
+	struct reiserfs_dir_entry de;
+	int ret = 0;
+	int depth;
+
+	reiserfs_write_lock(inode->i_sb);
+
+	reiserfs_check_lock_depth(inode->i_sb, "readdir");
+
+	/*
+	 * form key for search the next directory entry using
+	 * f_pos field of file structure
+	 */
+	make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
+	next_pos = cpu_key_k_offset(&pos_key);
+
+	path_to_entry.reada = PATH_READA;
+	while (1) {
+research:
+		/*
+		 * search the directory item, containing entry with
+		 * specified key
+		 */
+		search_res =
+		    search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
+					&de);
+		if (search_res == IO_ERROR) {
+			/*
+			 * FIXME: we could just skip part of directory
+			 * which could not be read
+			 */
+			ret = -EIO;
+			goto out;
+		}
+		entry_num = de.de_entry_num;
+		bh = de.de_bh;
+		item_num = de.de_item_num;
+		ih = de.de_ih;
+		store_ih(&tmp_ih, ih);
+
+		/* we must have found item, that is item of this directory, */
+		RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key),
+		       "vs-9000: found item %h does not match to dir we readdir %K",
+		       ih, &pos_key);
+		RFALSE(item_num > B_NR_ITEMS(bh) - 1,
+		       "vs-9005 item_num == %d, item amount == %d",
+		       item_num, B_NR_ITEMS(bh));
+
+		/*
+		 * and entry must be not more than number of entries
+		 * in the item
+		 */
+		RFALSE(ih_entry_count(ih) < entry_num,
+		       "vs-9010: entry number is too big %d (%d)",
+		       entry_num, ih_entry_count(ih));
+
+		/*
+		 * go through all entries in the directory item beginning
+		 * from the entry, that has been found
+		 */
+		if (search_res == POSITION_FOUND
+		    || entry_num < ih_entry_count(ih)) {
+			struct reiserfs_de_head *deh =
+			    B_I_DEH(bh, ih) + entry_num;
+
+			for (; entry_num < ih_entry_count(ih);
+			     entry_num++, deh++) {
+				int d_reclen;
+				char *d_name;
+				ino_t d_ino;
+				loff_t cur_pos = deh_offset(deh);
+
+				/* it is hidden entry */
+				if (!de_visible(deh))
+					continue;
+				d_reclen = entry_length(bh, ih, entry_num);
+				d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
+
+				if (d_reclen <= 0 ||
+				    d_name + d_reclen > bh->b_data + bh->b_size) {
+					/*
+					 * There is corrupted data in entry,
+					 * We'd better stop here
+					 */
+					pathrelse(&path_to_entry);
+					ret = -EIO;
+					goto out;
+				}
+
+				if (!d_name[d_reclen - 1])
+					d_reclen = strlen(d_name);
+
+				/* too big to send back to VFS */
+				if (d_reclen >
+				    REISERFS_MAX_NAME(inode->i_sb->
+						      s_blocksize)) {
+					continue;
+				}
+
+				/* Ignore the .reiserfs_priv entry */
+				if (is_privroot_deh(inode, deh))
+					continue;
+
+				ctx->pos = deh_offset(deh);
+				d_ino = deh_objectid(deh);
+				if (d_reclen <= 32) {
+					local_buf = small_buf;
+				} else {
+					local_buf = kmalloc(d_reclen,
+							    GFP_NOFS);
+					if (!local_buf) {
+						pathrelse(&path_to_entry);
+						ret = -ENOMEM;
+						goto out;
+					}
+					if (item_moved(&tmp_ih, &path_to_entry)) {
+						kfree(local_buf);
+						goto research;
+					}
+				}
+
+				/*
+				 * Note, that we copy name to user space via
+				 * temporary buffer (local_buf) because
+				 * filldir will block if user space buffer is
+				 * swapped out. At that time entry can move to
+				 * somewhere else
+				 */
+				memcpy(local_buf, d_name, d_reclen);
+
+				/*
+				 * Since filldir might sleep, we can release
+				 * the write lock here for other waiters
+				 */
+				depth = reiserfs_write_unlock_nested(inode->i_sb);
+				if (!dir_emit
+				    (ctx, local_buf, d_reclen, d_ino,
+				     DT_UNKNOWN)) {
+					reiserfs_write_lock_nested(inode->i_sb, depth);
+					if (local_buf != small_buf) {
+						kfree(local_buf);
+					}
+					goto end;
+				}
+				reiserfs_write_lock_nested(inode->i_sb, depth);
+				if (local_buf != small_buf) {
+					kfree(local_buf);
+				}
+
+				/* deh_offset(deh) may be invalid now. */
+				next_pos = cur_pos + 1;
+
+				if (item_moved(&tmp_ih, &path_to_entry)) {
+					set_cpu_key_k_offset(&pos_key,
+							     next_pos);
+					goto research;
+				}
+			}	/* for */
+		}
+
+		/* end of directory has been reached */
+		if (item_num != B_NR_ITEMS(bh) - 1)
+			goto end;
+
+		/*
+		 * item we went through is last item of node. Using right
+		 * delimiting key check is it directory end
+		 */
+		rkey = get_rkey(&path_to_entry, inode->i_sb);
+		if (!comp_le_keys(rkey, &MIN_KEY)) {
+			/*
+			 * set pos_key to key, that is the smallest and greater
+			 * that key of the last entry in the item
+			 */
+			set_cpu_key_k_offset(&pos_key, next_pos);
+			continue;
+		}
+
+		/* end of directory has been reached */
+		if (COMP_SHORT_KEYS(rkey, &pos_key)) {
+			goto end;
+		}
+
+		/* directory continues in the right neighboring block */
+		set_cpu_key_k_offset(&pos_key,
+				     le_key_k_offset(KEY_FORMAT_3_5, rkey));
+
+	}			/* while */
+
+end:
+	ctx->pos = next_pos;
+	pathrelse(&path_to_entry);
+	reiserfs_check_path(&path_to_entry);
+out:
+	reiserfs_write_unlock(inode->i_sb);
+	return ret;
+}
+
+static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	return reiserfs_readdir_inode(file_inode(file), ctx);
+}
+
+/*
+ * compose directory item containing "." and ".." entries (entries are
+ * not aligned to 4 byte boundary)
+ */
+void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
+			    __le32 par_dirid, __le32 par_objid)
+{
+	struct reiserfs_de_head *dot, *dotdot;
+
+	memset(body, 0, EMPTY_DIR_SIZE_V1);
+	dot = (struct reiserfs_de_head *)body;
+	dotdot = dot + 1;
+
+	/* direntry header of "." */
+	put_deh_offset(dot, DOT_OFFSET);
+	/* these two are from make_le_item_head, and are are LE */
+	dot->deh_dir_id = dirid;
+	dot->deh_objectid = objid;
+	dot->deh_state = 0;	/* Endian safe if 0 */
+	put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen("."));
+	mark_de_visible(dot);
+
+	/* direntry header of ".." */
+	put_deh_offset(dotdot, DOT_DOT_OFFSET);
+	/* key of ".." for the root directory */
+	/* these two are from the inode, and are are LE */
+	dotdot->deh_dir_id = par_dirid;
+	dotdot->deh_objectid = par_objid;
+	dotdot->deh_state = 0;	/* Endian safe if 0 */
+	put_deh_location(dotdot, deh_location(dot) - strlen(".."));
+	mark_de_visible(dotdot);
+
+	/* copy ".." and "." */
+	memcpy(body + deh_location(dot), ".", 1);
+	memcpy(body + deh_location(dotdot), "..", 2);
+}
+
+/* compose directory item containing "." and ".." entries */
+void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
+			 __le32 par_dirid, __le32 par_objid)
+{
+	struct reiserfs_de_head *dot, *dotdot;
+
+	memset(body, 0, EMPTY_DIR_SIZE);
+	dot = (struct reiserfs_de_head *)body;
+	dotdot = dot + 1;
+
+	/* direntry header of "." */
+	put_deh_offset(dot, DOT_OFFSET);
+	/* these two are from make_le_item_head, and are are LE */
+	dot->deh_dir_id = dirid;
+	dot->deh_objectid = objid;
+	dot->deh_state = 0;	/* Endian safe if 0 */
+	put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
+	mark_de_visible(dot);
+
+	/* direntry header of ".." */
+	put_deh_offset(dotdot, DOT_DOT_OFFSET);
+	/* key of ".." for the root directory */
+	/* these two are from the inode, and are are LE */
+	dotdot->deh_dir_id = par_dirid;
+	dotdot->deh_objectid = par_objid;
+	dotdot->deh_state = 0;	/* Endian safe if 0 */
+	put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen("..")));
+	mark_de_visible(dotdot);
+
+	/* copy ".." and "." */
+	memcpy(body + deh_location(dot), ".", 1);
+	memcpy(body + deh_location(dotdot), "..", 2);
+}
diff --git a/kernel/fs/reiserfs/do_balan.c b/kernel/fs/reiserfs/do_balan.c
new file mode 100644
index 000000000..9c02d96d3
--- /dev/null
+++ b/kernel/fs/reiserfs/do_balan.c
@@ -0,0 +1,1911 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+/*
+ * Now we have all buffers that must be used in balancing of the tree
+ * Further calculations can not cause schedule(), and thus the buffer
+ * tree will be stable until the balancing will be finished
+ * balance the tree according to the analysis made before,
+ * and using buffers obtained after all above.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/time.h>
+#include "reiserfs.h"
+#include <linux/buffer_head.h>
+#include <linux/kernel.h>
+
+static inline void buffer_info_init_left(struct tree_balance *tb,
+                                         struct buffer_info *bi)
+{
+	bi->tb          = tb;
+	bi->bi_bh       = tb->L[0];
+	bi->bi_parent   = tb->FL[0];
+	bi->bi_position = get_left_neighbor_position(tb, 0);
+}
+
+static inline void buffer_info_init_right(struct tree_balance *tb,
+                                          struct buffer_info *bi)
+{
+	bi->tb          = tb;
+	bi->bi_bh       = tb->R[0];
+	bi->bi_parent   = tb->FR[0];
+	bi->bi_position = get_right_neighbor_position(tb, 0);
+}
+
+static inline void buffer_info_init_tbS0(struct tree_balance *tb,
+                                         struct buffer_info *bi)
+{
+	bi->tb          = tb;
+	bi->bi_bh        = PATH_PLAST_BUFFER(tb->tb_path);
+	bi->bi_parent   = PATH_H_PPARENT(tb->tb_path, 0);
+	bi->bi_position = PATH_H_POSITION(tb->tb_path, 1);
+}
+
+static inline void buffer_info_init_bh(struct tree_balance *tb,
+                                       struct buffer_info *bi,
+                                       struct buffer_head *bh)
+{
+	bi->tb          = tb;
+	bi->bi_bh       = bh;
+	bi->bi_parent   = NULL;
+	bi->bi_position = 0;
+}
+
+inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
+				       struct buffer_head *bh, int flag)
+{
+	journal_mark_dirty(tb->transaction_handle, bh);
+}
+
+#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
+#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
+
+/*
+ * summary:
+ *  if deleting something ( tb->insert_size[0] < 0 )
+ *    return(balance_leaf_when_delete()); (flag d handled here)
+ *  else
+ *    if lnum is larger than 0 we put items into the left node
+ *    if rnum is larger than 0 we put items into the right node
+ *    if snum1 is larger than 0 we put items into the new node s1
+ *    if snum2 is larger than 0 we put items into the new node s2
+ * Note that all *num* count new items being created.
+ */
+
+static void balance_leaf_when_delete_del(struct tree_balance *tb)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int item_pos = PATH_LAST_POSITION(tb->tb_path);
+	struct buffer_info bi;
+#ifdef CONFIG_REISERFS_CHECK
+	struct item_head *ih = item_head(tbS0, item_pos);
+#endif
+
+	RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
+	       "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
+	       -tb->insert_size[0], ih);
+
+	buffer_info_init_tbS0(tb, &bi);
+	leaf_delete_items(&bi, 0, item_pos, 1, -1);
+
+	if (!item_pos && tb->CFL[0]) {
+		if (B_NR_ITEMS(tbS0)) {
+			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
+		} else {
+			if (!PATH_H_POSITION(tb->tb_path, 1))
+				replace_key(tb, tb->CFL[0], tb->lkey[0],
+					    PATH_H_PPARENT(tb->tb_path, 0), 0);
+		}
+	}
+
+	RFALSE(!item_pos && !tb->CFL[0],
+	       "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
+	       tb->L[0]);
+}
+
+/* cut item in S[0] */
+static void balance_leaf_when_delete_cut(struct tree_balance *tb)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int item_pos = PATH_LAST_POSITION(tb->tb_path);
+	struct item_head *ih = item_head(tbS0, item_pos);
+	int pos_in_item = tb->tb_path->pos_in_item;
+	struct buffer_info bi;
+	buffer_info_init_tbS0(tb, &bi);
+
+	if (is_direntry_le_ih(ih)) {
+		/*
+		 * UFS unlink semantics are such that you can only
+		 * delete one directory entry at a time.
+		 *
+		 * when we cut a directory tb->insert_size[0] means
+		 * number of entries to be cut (always 1)
+		 */
+		tb->insert_size[0] = -1;
+		leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
+				     -tb->insert_size[0]);
+
+		RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
+		       "PAP-12030: can not change delimiting key. CFL[0]=%p",
+		       tb->CFL[0]);
+
+		if (!item_pos && !pos_in_item && tb->CFL[0])
+			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
+	} else {
+		leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
+				     -tb->insert_size[0]);
+
+		RFALSE(!ih_item_len(ih),
+		       "PAP-12035: cut must leave non-zero dynamic "
+		       "length of item");
+	}
+}
+
+static int balance_leaf_when_delete_left(struct tree_balance *tb)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tbS0);
+
+	/* L[0] must be joined with S[0] */
+	if (tb->lnum[0] == -1) {
+		/* R[0] must be also joined with S[0] */
+		if (tb->rnum[0] == -1) {
+			if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
+				/*
+				 * all contents of all the
+				 * 3 buffers will be in L[0]
+				 */
+				if (PATH_H_POSITION(tb->tb_path, 1) == 0 &&
+				    1 < B_NR_ITEMS(tb->FR[0]))
+					replace_key(tb, tb->CFL[0],
+						    tb->lkey[0], tb->FR[0], 1);
+
+				leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1,
+						NULL);
+				leaf_move_items(LEAF_FROM_R_TO_L, tb,
+						B_NR_ITEMS(tb->R[0]), -1,
+						NULL);
+
+				reiserfs_invalidate_buffer(tb, tbS0);
+				reiserfs_invalidate_buffer(tb, tb->R[0]);
+
+				return 0;
+			}
+
+			/* all contents of all the 3 buffers will be in R[0] */
+			leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL);
+			leaf_move_items(LEAF_FROM_L_TO_R, tb,
+					B_NR_ITEMS(tb->L[0]), -1, NULL);
+
+			/* right_delimiting_key is correct in R[0] */
+			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+			reiserfs_invalidate_buffer(tb, tbS0);
+			reiserfs_invalidate_buffer(tb, tb->L[0]);
+
+			return -1;
+		}
+
+		RFALSE(tb->rnum[0] != 0,
+		       "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
+		/* all contents of L[0] and S[0] will be in L[0] */
+		leaf_shift_left(tb, n, -1);
+
+		reiserfs_invalidate_buffer(tb, tbS0);
+
+		return 0;
+	}
+
+	/*
+	 * a part of contents of S[0] will be in L[0] and
+	 * the rest part of S[0] will be in R[0]
+	 */
+
+	RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
+	       (tb->lnum[0] + tb->rnum[0] > n + 1),
+	       "PAP-12050: rnum(%d) and lnum(%d) and item "
+	       "number(%d) in S[0] are not consistent",
+	       tb->rnum[0], tb->lnum[0], n);
+	RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
+	       (tb->lbytes != -1 || tb->rbytes != -1),
+	       "PAP-12055: bad rbytes (%d)/lbytes (%d) "
+	       "parameters when items are not split",
+	       tb->rbytes, tb->lbytes);
+	RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
+	       (tb->lbytes < 1 || tb->rbytes != -1),
+	       "PAP-12060: bad rbytes (%d)/lbytes (%d) "
+	       "parameters when items are split",
+	       tb->rbytes, tb->lbytes);
+
+	leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+	leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+	reiserfs_invalidate_buffer(tb, tbS0);
+
+	return 0;
+}
+
+/*
+ * Balance leaf node in case of delete or cut: insert_size[0] < 0
+ *
+ * lnum, rnum can have values >= -1
+ *	-1 means that the neighbor must be joined with S
+ *	 0 means that nothing should be done with the neighbor
+ *	>0 means to shift entirely or partly the specified number of items
+ *         to the neighbor
+ */
+static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int item_pos = PATH_LAST_POSITION(tb->tb_path);
+	struct buffer_info bi;
+	int n;
+	struct item_head *ih;
+
+	RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
+	       "vs- 12000: level: wrong FR %z", tb->FR[0]);
+	RFALSE(tb->blknum[0] > 1,
+	       "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
+	RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
+	       "PAP-12010: tree can not be empty");
+
+	ih = item_head(tbS0, item_pos);
+	buffer_info_init_tbS0(tb, &bi);
+
+	/* Delete or truncate the item */
+
+	BUG_ON(flag != M_DELETE && flag != M_CUT);
+	if (flag == M_DELETE)
+		balance_leaf_when_delete_del(tb);
+	else /* M_CUT */
+		balance_leaf_when_delete_cut(tb);
+
+
+	/*
+	 * the rule is that no shifting occurs unless by shifting
+	 * a node can be freed
+	 */
+	n = B_NR_ITEMS(tbS0);
+
+
+	/* L[0] takes part in balancing */
+	if (tb->lnum[0])
+		return balance_leaf_when_delete_left(tb);
+
+	if (tb->rnum[0] == -1) {
+		/* all contents of R[0] and S[0] will be in R[0] */
+		leaf_shift_right(tb, n, -1);
+		reiserfs_invalidate_buffer(tb, tbS0);
+		return 0;
+	}
+
+	RFALSE(tb->rnum[0],
+	       "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
+	return 0;
+}
+
+static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
+					     struct item_head *const ih,
+					     const char * const body)
+{
+	int ret;
+	struct buffer_info bi;
+	int n = B_NR_ITEMS(tb->L[0]);
+	unsigned body_shift_bytes = 0;
+
+	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
+		/* part of new item falls into L[0] */
+		int new_item_len, shift;
+		int version;
+
+		ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
+
+		/* Calculate item length to insert to S[0] */
+		new_item_len = ih_item_len(ih) - tb->lbytes;
+
+		/* Calculate and check item length to insert to L[0] */
+		put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
+
+		RFALSE(ih_item_len(ih) <= 0,
+		       "PAP-12080: there is nothing to insert into L[0]: "
+		       "ih_item_len=%d", ih_item_len(ih));
+
+		/* Insert new item into L[0] */
+		buffer_info_init_left(tb, &bi);
+		leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
+			     min_t(int, tb->zeroes_num, ih_item_len(ih)));
+
+		version = ih_version(ih);
+
+		/*
+		 * Calculate key component, item length and body to
+		 * insert into S[0]
+		 */
+		shift = 0;
+		if (is_indirect_le_ih(ih))
+			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+
+		add_le_ih_k_offset(ih, tb->lbytes << shift);
+
+		put_ih_item_len(ih, new_item_len);
+		if (tb->lbytes > tb->zeroes_num) {
+			body_shift_bytes = tb->lbytes - tb->zeroes_num;
+			tb->zeroes_num = 0;
+		} else
+			tb->zeroes_num -= tb->lbytes;
+
+		RFALSE(ih_item_len(ih) <= 0,
+		       "PAP-12085: there is nothing to insert into S[0]: "
+		       "ih_item_len=%d", ih_item_len(ih));
+	} else {
+		/* new item in whole falls into L[0] */
+		/* Shift lnum[0]-1 items to L[0] */
+		ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
+
+		/* Insert new item into L[0] */
+		buffer_info_init_left(tb, &bi);
+		leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
+				     tb->zeroes_num);
+		tb->insert_size[0] = 0;
+		tb->zeroes_num = 0;
+	}
+	return body_shift_bytes;
+}
+
+static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
+						 struct item_head * const ih,
+						 const char * const body)
+{
+	int n = B_NR_ITEMS(tb->L[0]);
+	struct buffer_info bi;
+
+	RFALSE(tb->zeroes_num,
+	       "PAP-12090: invalid parameter in case of a directory");
+
+	/* directory item */
+	if (tb->lbytes > tb->pos_in_item) {
+		/* new directory entry falls into L[0] */
+		struct item_head *pasted;
+		int ret, l_pos_in_item = tb->pos_in_item;
+
+		/*
+		 * Shift lnum[0] - 1 items in whole.
+		 * Shift lbytes - 1 entries from given directory item
+		 */
+		ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
+		if (ret && !tb->item_pos) {
+			pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
+			l_pos_in_item += ih_entry_count(pasted) -
+					 (tb->lbytes - 1);
+		}
+
+		/* Append given directory entry to directory item */
+		buffer_info_init_left(tb, &bi);
+		leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
+				     l_pos_in_item, tb->insert_size[0],
+				     body, tb->zeroes_num);
+
+		/*
+		 * previous string prepared space for pasting new entry,
+		 * following string pastes this entry
+		 */
+
+		/*
+		 * when we have merge directory item, pos_in_item
+		 * has been changed too
+		 */
+
+		/* paste new directory entry. 1 is entry number */
+		leaf_paste_entries(&bi, n + tb->item_pos - ret,
+				   l_pos_in_item, 1,
+				   (struct reiserfs_de_head *) body,
+				   body + DEH_SIZE, tb->insert_size[0]);
+		tb->insert_size[0] = 0;
+	} else {
+		/* new directory item doesn't fall into L[0] */
+		/*
+		 * Shift lnum[0]-1 items in whole. Shift lbytes
+		 * directory entries from directory item number lnum[0]
+		 */
+		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+	}
+
+	/* Calculate new position to append in item body */
+	tb->pos_in_item -= tb->lbytes;
+}
+
+static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
+						  struct item_head * const ih,
+						  const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tb->L[0]);
+	struct buffer_info bi;
+	int body_shift_bytes = 0;
+
+	if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
+		balance_leaf_paste_left_shift_dirent(tb, ih, body);
+		return 0;
+	}
+
+	RFALSE(tb->lbytes <= 0,
+	       "PAP-12095: there is nothing to shift to L[0]. "
+	       "lbytes=%d", tb->lbytes);
+	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
+	       "PAP-12100: incorrect position to paste: "
+	       "item_len=%d, pos_in_item=%d",
+	       ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item);
+
+	/* appended item will be in L[0] in whole */
+	if (tb->lbytes >= tb->pos_in_item) {
+		struct item_head *tbS0_pos_ih, *tbL0_ih;
+		struct item_head *tbS0_0_ih;
+		struct reiserfs_key *left_delim_key;
+		int ret, l_n, version, temp_l;
+
+		tbS0_pos_ih = item_head(tbS0, tb->item_pos);
+		tbS0_0_ih = item_head(tbS0, 0);
+
+		/*
+		 * this bytes number must be appended
+		 * to the last item of L[h]
+		 */
+		l_n = tb->lbytes - tb->pos_in_item;
+
+		/* Calculate new insert_size[0] */
+		tb->insert_size[0] -= l_n;
+
+		RFALSE(tb->insert_size[0] <= 0,
+		       "PAP-12105: there is nothing to paste into "
+		       "L[0]. insert_size=%d", tb->insert_size[0]);
+
+		ret = leaf_shift_left(tb, tb->lnum[0],
+				      ih_item_len(tbS0_pos_ih));
+
+		tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret);
+
+		/* Append to body of item in L[0] */
+		buffer_info_init_left(tb, &bi);
+		leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
+				     ih_item_len(tbL0_ih), l_n, body,
+				     min_t(int, l_n, tb->zeroes_num));
+
+		/*
+		 * 0-th item in S0 can be only of DIRECT type
+		 * when l_n != 0
+		 */
+		temp_l = l_n;
+
+		RFALSE(ih_item_len(tbS0_0_ih),
+		       "PAP-12106: item length must be 0");
+		RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
+		       leaf_key(tb->L[0], n + tb->item_pos - ret)),
+		       "PAP-12107: items must be of the same file");
+
+		if (is_indirect_le_ih(tbL0_ih)) {
+			int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+			temp_l = l_n << shift;
+		}
+		/* update key of first item in S0 */
+		version = ih_version(tbS0_0_ih);
+		add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l);
+
+		/* update left delimiting key */
+		left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]);
+		add_le_key_k_offset(version, left_delim_key, temp_l);
+
+		/*
+		 * Calculate new body, position in item and
+		 * insert_size[0]
+		 */
+		if (l_n > tb->zeroes_num) {
+			body_shift_bytes = l_n - tb->zeroes_num;
+			tb->zeroes_num = 0;
+		} else
+			tb->zeroes_num -= l_n;
+		tb->pos_in_item = 0;
+
+		RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
+					  leaf_key(tb->L[0],
+						 B_NR_ITEMS(tb->L[0]) - 1)) ||
+		       !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) ||
+		       !op_is_left_mergeable(left_delim_key, tbS0->b_size),
+		       "PAP-12120: item must be merge-able with left "
+		       "neighboring item");
+	} else {
+		/* only part of the appended item will be in L[0] */
+
+		/* Calculate position in item for append in S[0] */
+		tb->pos_in_item -= tb->lbytes;
+
+		RFALSE(tb->pos_in_item <= 0,
+		       "PAP-12125: no place for paste. pos_in_item=%d",
+		       tb->pos_in_item);
+
+		/*
+		 * Shift lnum[0] - 1 items in whole.
+		 * Shift lbytes - 1 byte from item number lnum[0]
+		 */
+		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+	}
+	return body_shift_bytes;
+}
+
+
+/* appended item will be in L[0] in whole */
+static void balance_leaf_paste_left_whole(struct tree_balance *tb,
+					  struct item_head * const ih,
+					  const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tb->L[0]);
+	struct buffer_info bi;
+	struct item_head *pasted;
+	int ret;
+
+	/* if we paste into first item of S[0] and it is left mergable */
+	if (!tb->item_pos &&
+	    op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) {
+		/*
+		 * then increment pos_in_item by the size of the
+		 * last item in L[0]
+		 */
+		pasted = item_head(tb->L[0], n - 1);
+		if (is_direntry_le_ih(pasted))
+			tb->pos_in_item += ih_entry_count(pasted);
+		else
+			tb->pos_in_item += ih_item_len(pasted);
+	}
+
+	/*
+	 * Shift lnum[0] - 1 items in whole.
+	 * Shift lbytes - 1 byte from item number lnum[0]
+	 */
+	ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+
+	/* Append to body of item in L[0] */
+	buffer_info_init_left(tb, &bi);
+	leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item,
+			     tb->insert_size[0], body, tb->zeroes_num);
+
+	/* if appended item is directory, paste entry */
+	pasted = item_head(tb->L[0], n + tb->item_pos - ret);
+	if (is_direntry_le_ih(pasted))
+		leaf_paste_entries(&bi, n + tb->item_pos - ret,
+				   tb->pos_in_item, 1,
+				   (struct reiserfs_de_head *)body,
+				   body + DEH_SIZE, tb->insert_size[0]);
+
+	/*
+	 * if appended item is indirect item, put unformatted node
+	 * into un list
+	 */
+	if (is_indirect_le_ih(pasted))
+		set_ih_free_space(pasted, 0);
+
+	tb->insert_size[0] = 0;
+	tb->zeroes_num = 0;
+}
+
+static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
+					    struct item_head * const ih,
+					    const char * const body)
+{
+	/* we must shift the part of the appended item */
+	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
+		return balance_leaf_paste_left_shift(tb, ih, body);
+	else
+		balance_leaf_paste_left_whole(tb, ih, body);
+	return 0;
+}
+
+/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
+static unsigned int balance_leaf_left(struct tree_balance *tb,
+				      struct item_head * const ih,
+				      const char * const body, int flag)
+{
+	if (tb->lnum[0] <= 0)
+		return 0;
+
+	/* new item or it part falls to L[0], shift it too */
+	if (tb->item_pos < tb->lnum[0]) {
+		BUG_ON(flag != M_INSERT && flag != M_PASTE);
+
+		if (flag == M_INSERT)
+			return balance_leaf_insert_left(tb, ih, body);
+		else /* M_PASTE */
+			return balance_leaf_paste_left(tb, ih, body);
+	} else
+		/* new item doesn't fall into L[0] */
+		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+	return 0;
+}
+
+
+static void balance_leaf_insert_right(struct tree_balance *tb,
+				      struct item_head * const ih,
+				      const char * const body)
+{
+
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tbS0);
+	struct buffer_info bi;
+	int ret;
+
+	/* new item or part of it doesn't fall into R[0] */
+	if (n - tb->rnum[0] >= tb->item_pos) {
+		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+		return;
+	}
+
+	/* new item or its part falls to R[0] */
+
+	/* part of new item falls into R[0] */
+	if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
+		loff_t old_key_comp, old_len, r_zeroes_number;
+		const char *r_body;
+		int version, shift;
+		loff_t offset;
+
+		leaf_shift_right(tb, tb->rnum[0] - 1, -1);
+
+		version = ih_version(ih);
+
+		/* Remember key component and item length */
+		old_key_comp = le_ih_k_offset(ih);
+		old_len = ih_item_len(ih);
+
+		/*
+		 * Calculate key component and item length to insert
+		 * into R[0]
+		 */
+		shift = 0;
+		if (is_indirect_le_ih(ih))
+			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+		offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift);
+		set_le_ih_k_offset(ih, offset);
+		put_ih_item_len(ih, tb->rbytes);
+
+		/* Insert part of the item into R[0] */
+		buffer_info_init_right(tb, &bi);
+		if ((old_len - tb->rbytes) > tb->zeroes_num) {
+			r_zeroes_number = 0;
+			r_body = body + (old_len - tb->rbytes) - tb->zeroes_num;
+		} else {
+			r_body = body;
+			r_zeroes_number = tb->zeroes_num -
+					  (old_len - tb->rbytes);
+			tb->zeroes_num -= r_zeroes_number;
+		}
+
+		leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
+
+		/* Replace right delimiting key by first key in R[0] */
+		replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+		/*
+		 * Calculate key component and item length to
+		 * insert into S[0]
+		 */
+		set_le_ih_k_offset(ih, old_key_comp);
+		put_ih_item_len(ih, old_len - tb->rbytes);
+
+		tb->insert_size[0] -= tb->rbytes;
+
+	} else {
+		/* whole new item falls into R[0] */
+
+		/* Shift rnum[0]-1 items to R[0] */
+		ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
+
+		/* Insert new item into R[0] */
+		buffer_info_init_right(tb, &bi);
+		leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1,
+				     ih, body, tb->zeroes_num);
+
+		if (tb->item_pos - n + tb->rnum[0] - 1 == 0)
+			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+		tb->zeroes_num = tb->insert_size[0] = 0;
+	}
+}
+
+
+static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
+				     struct item_head * const ih,
+				     const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	struct buffer_info bi;
+	int entry_count;
+
+	RFALSE(tb->zeroes_num,
+	       "PAP-12145: invalid parameter in case of a directory");
+	entry_count = ih_entry_count(item_head(tbS0, tb->item_pos));
+
+	/* new directory entry falls into R[0] */
+	if (entry_count - tb->rbytes < tb->pos_in_item) {
+		int paste_entry_position;
+
+		RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0],
+		       "PAP-12150: no enough of entries to shift to R[0]: "
+		       "rbytes=%d, entry_count=%d", tb->rbytes, entry_count);
+
+		/*
+		 * Shift rnum[0]-1 items in whole.
+		 * Shift rbytes-1 directory entries from directory
+		 * item number rnum[0]
+		 */
+		leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
+
+		/* Paste given directory entry to directory item */
+		paste_entry_position = tb->pos_in_item - entry_count +
+				       tb->rbytes - 1;
+		buffer_info_init_right(tb, &bi);
+		leaf_paste_in_buffer(&bi, 0, paste_entry_position,
+				     tb->insert_size[0], body, tb->zeroes_num);
+
+		/* paste entry */
+		leaf_paste_entries(&bi, 0, paste_entry_position, 1,
+				   (struct reiserfs_de_head *) body,
+				   body + DEH_SIZE, tb->insert_size[0]);
+
+		/* change delimiting keys */
+		if (paste_entry_position == 0)
+			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+		tb->insert_size[0] = 0;
+		tb->pos_in_item++;
+	} else {
+		/* new directory entry doesn't fall into R[0] */
+		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+	}
+}
+
+static void balance_leaf_paste_right_shift(struct tree_balance *tb,
+				     struct item_head * const ih,
+				     const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n_shift, n_rem, r_zeroes_number, version;
+	unsigned long temp_rem;
+	const char *r_body;
+	struct buffer_info bi;
+
+	/* we append to directory item */
+	if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
+		balance_leaf_paste_right_shift_dirent(tb, ih, body);
+		return;
+	}
+
+	/* regular object */
+
+	/*
+	 * Calculate number of bytes which must be shifted
+	 * from appended item
+	 */
+	n_shift = tb->rbytes - tb->insert_size[0];
+	if (n_shift < 0)
+		n_shift = 0;
+
+	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
+	       "PAP-12155: invalid position to paste. ih_item_len=%d, "
+	       "pos_in_item=%d", tb->pos_in_item,
+	       ih_item_len(item_head(tbS0, tb->item_pos)));
+
+	leaf_shift_right(tb, tb->rnum[0], n_shift);
+
+	/*
+	 * Calculate number of bytes which must remain in body
+	 * after appending to R[0]
+	 */
+	n_rem = tb->insert_size[0] - tb->rbytes;
+	if (n_rem < 0)
+		n_rem = 0;
+
+	temp_rem = n_rem;
+
+	version = ih_version(item_head(tb->R[0], 0));
+
+	if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) {
+		int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+		temp_rem = n_rem << shift;
+	}
+
+	add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem);
+	add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]),
+			    temp_rem);
+
+	do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
+
+	/* Append part of body into R[0] */
+	buffer_info_init_right(tb, &bi);
+	if (n_rem > tb->zeroes_num) {
+		r_zeroes_number = 0;
+		r_body = body + n_rem - tb->zeroes_num;
+	} else {
+		r_body = body;
+		r_zeroes_number = tb->zeroes_num - n_rem;
+		tb->zeroes_num -= r_zeroes_number;
+	}
+
+	leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
+			     r_body, r_zeroes_number);
+
+	if (is_indirect_le_ih(item_head(tb->R[0], 0)))
+		set_ih_free_space(item_head(tb->R[0], 0), 0);
+
+	tb->insert_size[0] = n_rem;
+	if (!n_rem)
+		tb->pos_in_item++;
+}
+
+static void balance_leaf_paste_right_whole(struct tree_balance *tb,
+				     struct item_head * const ih,
+				     const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tbS0);
+	struct item_head *pasted;
+	struct buffer_info bi;
+
+							buffer_info_init_right(tb, &bi);
+	leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+	/* append item in R[0] */
+	if (tb->pos_in_item >= 0) {
+		buffer_info_init_right(tb, &bi);
+		leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0],
+				     tb->pos_in_item, tb->insert_size[0], body,
+				     tb->zeroes_num);
+	}
+
+	/* paste new entry, if item is directory item */
+	pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]);
+	if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) {
+		leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0],
+				   tb->pos_in_item, 1,
+				   (struct reiserfs_de_head *)body,
+				   body + DEH_SIZE, tb->insert_size[0]);
+
+		if (!tb->pos_in_item) {
+
+			RFALSE(tb->item_pos - n + tb->rnum[0],
+			       "PAP-12165: directory item must be first "
+			       "item of node when pasting is in 0th position");
+
+			/* update delimiting keys */
+			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+		}
+	}
+
+	if (is_indirect_le_ih(pasted))
+		set_ih_free_space(pasted, 0);
+	tb->zeroes_num = tb->insert_size[0] = 0;
+}
+
+static void balance_leaf_paste_right(struct tree_balance *tb,
+				     struct item_head * const ih,
+				     const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tbS0);
+
+	/* new item doesn't fall into R[0] */
+	if (n - tb->rnum[0] > tb->item_pos) {
+		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+		return;
+	}
+
+	/* pasted item or part of it falls to R[0] */
+
+	if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1)
+		/* we must shift the part of the appended item */
+		balance_leaf_paste_right_shift(tb, ih, body);
+	else
+		/* pasted item in whole falls into R[0] */
+		balance_leaf_paste_right_whole(tb, ih, body);
+}
+
+/* shift rnum[0] items from S[0] to the right neighbor R[0] */
+static void balance_leaf_right(struct tree_balance *tb,
+			       struct item_head * const ih,
+			       const char * const body, int flag)
+{
+	if (tb->rnum[0] <= 0)
+		return;
+
+	BUG_ON(flag != M_INSERT && flag != M_PASTE);
+
+	if (flag == M_INSERT)
+		balance_leaf_insert_right(tb, ih, body);
+	else /* M_PASTE */
+		balance_leaf_paste_right(tb, ih, body);
+}
+
+static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
+					  struct item_head * const ih,
+					  const char * const body,
+					  struct item_head *insert_key,
+					  struct buffer_head **insert_ptr,
+					  int i)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tbS0);
+	struct buffer_info bi;
+	int shift;
+
+	/* new item or it part don't falls into S_new[i] */
+	if (n - tb->snum[i] >= tb->item_pos) {
+		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+				tb->snum[i], tb->sbytes[i], tb->S_new[i]);
+		return;
+	}
+
+	/* new item or it's part falls to first new node S_new[i] */
+
+	/* part of new item falls into S_new[i] */
+	if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
+		int old_key_comp, old_len, r_zeroes_number;
+		const char *r_body;
+		int version;
+
+		/* Move snum[i]-1 items from S[0] to S_new[i] */
+		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
+				tb->S_new[i]);
+
+		/* Remember key component and item length */
+		version = ih_version(ih);
+		old_key_comp = le_ih_k_offset(ih);
+		old_len = ih_item_len(ih);
+
+		/*
+		 * Calculate key component and item length to insert
+		 * into S_new[i]
+		 */
+		shift = 0;
+		if (is_indirect_le_ih(ih))
+			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+		set_le_ih_k_offset(ih,
+				   le_ih_k_offset(ih) +
+				   ((old_len - tb->sbytes[i]) << shift));
+
+		put_ih_item_len(ih, tb->sbytes[i]);
+
+		/* Insert part of the item into S_new[i] before 0-th item */
+		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+
+		if ((old_len - tb->sbytes[i]) > tb->zeroes_num) {
+			r_zeroes_number = 0;
+			r_body = body + (old_len - tb->sbytes[i]) -
+					 tb->zeroes_num;
+		} else {
+			r_body = body;
+			r_zeroes_number = tb->zeroes_num - (old_len -
+					  tb->sbytes[i]);
+			tb->zeroes_num -= r_zeroes_number;
+		}
+
+		leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
+
+		/*
+		 * Calculate key component and item length to
+		 * insert into S[i]
+		 */
+		set_le_ih_k_offset(ih, old_key_comp);
+		put_ih_item_len(ih, old_len - tb->sbytes[i]);
+		tb->insert_size[0] -= tb->sbytes[i];
+	} else {
+		/* whole new item falls into S_new[i] */
+
+		/*
+		 * Shift snum[0] - 1 items to S_new[i]
+		 * (sbytes[i] of split item)
+		 */
+		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+				tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]);
+
+		/* Insert new item into S_new[i] */
+		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+		leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1,
+				     ih, body, tb->zeroes_num);
+
+		tb->zeroes_num = tb->insert_size[0] = 0;
+	}
+}
+
+/* we append to directory item */
+static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
+					 struct item_head * const ih,
+					 const char * const body,
+					 struct item_head *insert_key,
+					 struct buffer_head **insert_ptr,
+					 int i)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
+	int entry_count = ih_entry_count(aux_ih);
+	struct buffer_info bi;
+
+	if (entry_count - tb->sbytes[i] < tb->pos_in_item &&
+	    tb->pos_in_item <= entry_count) {
+		/* new directory entry falls into S_new[i] */
+
+		RFALSE(!tb->insert_size[0],
+		       "PAP-12215: insert_size is already 0");
+		RFALSE(tb->sbytes[i] - 1 >= entry_count,
+		       "PAP-12220: there are no so much entries (%d), only %d",
+		       tb->sbytes[i] - 1, entry_count);
+
+		/*
+		 * Shift snum[i]-1 items in whole.
+		 * Shift sbytes[i] directory entries
+		 * from directory item number snum[i]
+		 */
+		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+				tb->sbytes[i] - 1, tb->S_new[i]);
+
+		/*
+		 * Paste given directory entry to
+		 * directory item
+		 */
+		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+		leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count +
+				     tb->sbytes[i] - 1, tb->insert_size[0],
+				     body, tb->zeroes_num);
+
+		/* paste new directory entry */
+		leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count +
+				   tb->sbytes[i] - 1, 1,
+				   (struct reiserfs_de_head *) body,
+				   body + DEH_SIZE, tb->insert_size[0]);
+
+		tb->insert_size[0] = 0;
+		tb->pos_in_item++;
+	} else {
+		/* new directory entry doesn't fall into S_new[i] */
+		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+				tb->sbytes[i], tb->S_new[i]);
+	}
+
+}
+
+static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
+					 struct item_head * const ih,
+					 const char * const body,
+					 struct item_head *insert_key,
+					 struct buffer_head **insert_ptr,
+					 int i)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
+	int n_shift, n_rem, r_zeroes_number, shift;
+	const char *r_body;
+	struct item_head *tmp;
+	struct buffer_info bi;
+
+	RFALSE(ih, "PAP-12210: ih must be 0");
+
+	if (is_direntry_le_ih(aux_ih)) {
+		balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key,
+						    insert_ptr, i);
+		return;
+	}
+
+	/* regular object */
+
+
+	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) ||
+	       tb->insert_size[0] <= 0,
+	       "PAP-12225: item too short or insert_size <= 0");
+
+	/*
+	 * Calculate number of bytes which must be shifted from appended item
+	 */
+	n_shift = tb->sbytes[i] - tb->insert_size[0];
+	if (n_shift < 0)
+		n_shift = 0;
+	leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift,
+			tb->S_new[i]);
+
+	/*
+	 * Calculate number of bytes which must remain in body after
+	 * append to S_new[i]
+	 */
+	n_rem = tb->insert_size[0] - tb->sbytes[i];
+	if (n_rem < 0)
+		n_rem = 0;
+
+	/* Append part of body into S_new[0] */
+	buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+	if (n_rem > tb->zeroes_num) {
+		r_zeroes_number = 0;
+		r_body = body + n_rem - tb->zeroes_num;
+	} else {
+		r_body = body;
+		r_zeroes_number = tb->zeroes_num - n_rem;
+		tb->zeroes_num -= r_zeroes_number;
+	}
+
+	leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
+			     r_body, r_zeroes_number);
+
+	tmp = item_head(tb->S_new[i], 0);
+	shift = 0;
+	if (is_indirect_le_ih(tmp)) {
+		set_ih_free_space(tmp, 0);
+		shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+	}
+	add_le_ih_k_offset(tmp, n_rem << shift);
+
+	tb->insert_size[0] = n_rem;
+	if (!n_rem)
+		tb->pos_in_item++;
+}
+
+static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
+					       struct item_head * const ih,
+					       const char * const body,
+					       struct item_head *insert_key,
+					       struct buffer_head **insert_ptr,
+					       int i)
+
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tbS0);
+	int leaf_mi;
+	struct item_head *pasted;
+	struct buffer_info bi;
+
+#ifdef CONFIG_REISERFS_CHECK
+	struct item_head *ih_check = item_head(tbS0, tb->item_pos);
+
+	if (!is_direntry_le_ih(ih_check) &&
+	    (tb->pos_in_item != ih_item_len(ih_check) ||
+	    tb->insert_size[0] <= 0))
+		reiserfs_panic(tb->tb_sb,
+			     "PAP-12235",
+			     "pos_in_item must be equal to ih_item_len");
+#endif
+
+	leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+				  tb->sbytes[i], tb->S_new[i]);
+
+	RFALSE(leaf_mi,
+	       "PAP-12240: unexpected value returned by leaf_move_items (%d)",
+	       leaf_mi);
+
+	/* paste into item */
+	buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+	leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i],
+			     tb->pos_in_item, tb->insert_size[0],
+			     body, tb->zeroes_num);
+
+	pasted = item_head(tb->S_new[i], tb->item_pos - n +
+			   tb->snum[i]);
+	if (is_direntry_le_ih(pasted))
+		leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i],
+				   tb->pos_in_item, 1,
+				   (struct reiserfs_de_head *)body,
+				   body + DEH_SIZE, tb->insert_size[0]);
+
+	/* if we paste to indirect item update ih_free_space */
+	if (is_indirect_le_ih(pasted))
+		set_ih_free_space(pasted, 0);
+
+	tb->zeroes_num = tb->insert_size[0] = 0;
+
+}
+static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
+					 struct item_head * const ih,
+					 const char * const body,
+					 struct item_head *insert_key,
+					 struct buffer_head **insert_ptr,
+					 int i)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int n = B_NR_ITEMS(tbS0);
+
+	/* pasted item doesn't fall into S_new[i] */
+	if (n - tb->snum[i] > tb->item_pos) {
+		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+				tb->snum[i], tb->sbytes[i], tb->S_new[i]);
+		return;
+	}
+
+	/* pasted item or part if it falls to S_new[i] */
+
+	if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1)
+		/* we must shift part of the appended item */
+		balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key,
+						   insert_ptr, i);
+	else
+		/* item falls wholly into S_new[i] */
+		balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key,
+						   insert_ptr, i);
+}
+
+/* Fill new nodes that appear in place of S[0] */
+static void balance_leaf_new_nodes(struct tree_balance *tb,
+				   struct item_head * const ih,
+				   const char * const body,
+				   struct item_head *insert_key,
+				   struct buffer_head **insert_ptr,
+				   int flag)
+{
+	int i;
+	for (i = tb->blknum[0] - 2; i >= 0; i--) {
+		BUG_ON(flag != M_INSERT && flag != M_PASTE);
+
+		RFALSE(!tb->snum[i],
+		       "PAP-12200: snum[%d] == %d. Must be > 0", i,
+		       tb->snum[i]);
+
+		/* here we shift from S to S_new nodes */
+
+		tb->S_new[i] = get_FEB(tb);
+
+		/* initialized block type and tree level */
+		set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL);
+
+		if (flag == M_INSERT)
+			balance_leaf_new_nodes_insert(tb, ih, body, insert_key,
+						      insert_ptr, i);
+		else /* M_PASTE */
+			balance_leaf_new_nodes_paste(tb, ih, body, insert_key,
+						     insert_ptr, i);
+
+		memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE);
+		insert_ptr[i] = tb->S_new[i];
+
+		RFALSE(!buffer_journaled(tb->S_new[i])
+		       || buffer_journal_dirty(tb->S_new[i])
+		       || buffer_dirty(tb->S_new[i]),
+		       "PAP-12247: S_new[%d] : (%b)",
+		       i, tb->S_new[i]);
+	}
+}
+
+static void balance_leaf_finish_node_insert(struct tree_balance *tb,
+					    struct item_head * const ih,
+					    const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	struct buffer_info bi;
+	buffer_info_init_tbS0(tb, &bi);
+	leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num);
+
+	/* If we insert the first key change the delimiting key */
+	if (tb->item_pos == 0) {
+		if (tb->CFL[0])	/* can be 0 in reiserfsck */
+			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
+
+	}
+}
+
+static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
+						  struct item_head * const ih,
+						  const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	struct item_head *pasted = item_head(tbS0, tb->item_pos);
+	struct buffer_info bi;
+
+	if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) {
+		RFALSE(!tb->insert_size[0],
+		       "PAP-12260: insert_size is 0 already");
+
+		/* prepare space */
+		buffer_info_init_tbS0(tb, &bi);
+		leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item,
+				     tb->insert_size[0], body, tb->zeroes_num);
+
+		/* paste entry */
+		leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1,
+				   (struct reiserfs_de_head *)body,
+				   body + DEH_SIZE, tb->insert_size[0]);
+
+		if (!tb->item_pos && !tb->pos_in_item) {
+			RFALSE(!tb->CFL[0] || !tb->L[0],
+			       "PAP-12270: CFL[0]/L[0] must  be specified");
+			if (tb->CFL[0])
+				replace_key(tb, tb->CFL[0], tb->lkey[0],
+					    tbS0, 0);
+		}
+
+		tb->insert_size[0] = 0;
+	}
+}
+
+static void balance_leaf_finish_node_paste(struct tree_balance *tb,
+					   struct item_head * const ih,
+					   const char * const body)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	struct buffer_info bi;
+	struct item_head *pasted = item_head(tbS0, tb->item_pos);
+
+	/* when directory, may be new entry already pasted */
+	if (is_direntry_le_ih(pasted)) {
+		balance_leaf_finish_node_paste_dirent(tb, ih, body);
+		return;
+	}
+
+	/* regular object */
+
+	if (tb->pos_in_item == ih_item_len(pasted)) {
+		RFALSE(tb->insert_size[0] <= 0,
+		       "PAP-12275: insert size must not be %d",
+		       tb->insert_size[0]);
+		buffer_info_init_tbS0(tb, &bi);
+		leaf_paste_in_buffer(&bi, tb->item_pos,
+				     tb->pos_in_item, tb->insert_size[0], body,
+				     tb->zeroes_num);
+
+		if (is_indirect_le_ih(pasted))
+			set_ih_free_space(pasted, 0);
+
+		tb->insert_size[0] = 0;
+	}
+#ifdef CONFIG_REISERFS_CHECK
+	else if (tb->insert_size[0]) {
+		print_cur_tb("12285");
+		reiserfs_panic(tb->tb_sb, "PAP-12285",
+		    "insert_size must be 0 (%d)", tb->insert_size[0]);
+	}
+#endif
+}
+
+/*
+ * if the affected item was not wholly shifted then we
+ * perform all necessary operations on that part or whole
+ * of the affected item which remains in S
+ */
+static void balance_leaf_finish_node(struct tree_balance *tb,
+				      struct item_head * const ih,
+				      const char * const body, int flag)
+{
+	/* if we must insert or append into buffer S[0] */
+	if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
+		if (flag == M_INSERT)
+			balance_leaf_finish_node_insert(tb, ih, body);
+		else /* M_PASTE */
+			balance_leaf_finish_node_paste(tb, ih, body);
+	}
+}
+
+/**
+ * balance_leaf - reiserfs tree balancing algorithm
+ * @tb: tree balance state
+ * @ih: item header of inserted item (little endian)
+ * @body: body of inserted item or bytes to paste
+ * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance)
+ * passed back:
+ * @insert_key: key to insert new nodes
+ * @insert_ptr: array of nodes to insert at the next level
+ *
+ * In our processing of one level we sometimes determine what must be
+ * inserted into the next higher level.  This insertion consists of a
+ * key or two keys and their corresponding pointers.
+ */
+static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
+			const char *body, int flag,
+			struct item_head *insert_key,
+			struct buffer_head **insert_ptr)
+{
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+
+	PROC_INFO_INC(tb->tb_sb, balance_at[0]);
+
+	/* Make balance in case insert_size[0] < 0 */
+	if (tb->insert_size[0] < 0)
+		return balance_leaf_when_delete(tb, flag);
+
+	tb->item_pos = PATH_LAST_POSITION(tb->tb_path),
+	tb->pos_in_item = tb->tb_path->pos_in_item,
+	tb->zeroes_num = 0;
+	if (flag == M_INSERT && !body)
+		tb->zeroes_num = ih_item_len(ih);
+
+	/*
+	 * for indirect item pos_in_item is measured in unformatted node
+	 * pointers. Recalculate to bytes
+	 */
+	if (flag != M_INSERT
+	    && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
+		tb->pos_in_item *= UNFM_P_SIZE;
+
+	body += balance_leaf_left(tb, ih, body, flag);
+
+	/* tb->lnum[0] > 0 */
+	/* Calculate new item position */
+	tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
+
+	balance_leaf_right(tb, ih, body, flag);
+
+	/* tb->rnum[0] > 0 */
+	RFALSE(tb->blknum[0] > 3,
+	       "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
+	RFALSE(tb->blknum[0] < 0,
+	       "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
+
+	/*
+	 * if while adding to a node we discover that it is possible to split
+	 * it in two, and merge the left part into the left neighbor and the
+	 * right part into the right neighbor, eliminating the node
+	 */
+	if (tb->blknum[0] == 0) {	/* node S[0] is empty now */
+
+		RFALSE(!tb->lnum[0] || !tb->rnum[0],
+		       "PAP-12190: lnum and rnum must not be zero");
+		/*
+		 * if insertion was done before 0-th position in R[0], right
+		 * delimiting key of the tb->L[0]'s and left delimiting key are
+		 * not set correctly
+		 */
+		if (tb->CFL[0]) {
+			if (!tb->CFR[0])
+				reiserfs_panic(tb->tb_sb, "vs-12195",
+					       "CFR not initialized");
+			copy_key(internal_key(tb->CFL[0], tb->lkey[0]),
+				 internal_key(tb->CFR[0], tb->rkey[0]));
+			do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
+		}
+
+		reiserfs_invalidate_buffer(tb, tbS0);
+		return 0;
+	}
+
+	balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag);
+
+	balance_leaf_finish_node(tb, ih, body, flag);
+
+#ifdef CONFIG_REISERFS_CHECK
+	if (flag == M_PASTE && tb->insert_size[0]) {
+		print_cur_tb("12290");
+		reiserfs_panic(tb->tb_sb,
+			       "PAP-12290", "insert_size is still not 0 (%d)",
+			       tb->insert_size[0]);
+	}
+#endif
+
+	/* Leaf level of the tree is balanced (end of balance_leaf) */
+	return 0;
+}
+
+/* Make empty node */
+void make_empty_node(struct buffer_info *bi)
+{
+	struct block_head *blkh;
+
+	RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
+
+	blkh = B_BLK_HEAD(bi->bi_bh);
+	set_blkh_nr_item(blkh, 0);
+	set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh));
+
+	if (bi->bi_parent)
+		B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0;	/* Endian safe if 0 */
+}
+
+/* Get first empty buffer */
+struct buffer_head *get_FEB(struct tree_balance *tb)
+{
+	int i;
+	struct buffer_info bi;
+
+	for (i = 0; i < MAX_FEB_SIZE; i++)
+		if (tb->FEB[i] != NULL)
+			break;
+
+	if (i == MAX_FEB_SIZE)
+		reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty");
+
+	buffer_info_init_bh(tb, &bi, tb->FEB[i]);
+	make_empty_node(&bi);
+	set_buffer_uptodate(tb->FEB[i]);
+	tb->used[i] = tb->FEB[i];
+	tb->FEB[i] = NULL;
+
+	return tb->used[i];
+}
+
+/* This is now used because reiserfs_free_block has to be able to schedule. */
+static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
+{
+	int i;
+
+	if (buffer_dirty(bh))
+		reiserfs_warning(tb->tb_sb, "reiserfs-12320",
+				 "called with dirty buffer");
+	for (i = 0; i < ARRAY_SIZE(tb->thrown); i++)
+		if (!tb->thrown[i]) {
+			tb->thrown[i] = bh;
+			get_bh(bh);	/* free_thrown puts this */
+			return;
+		}
+	reiserfs_warning(tb->tb_sb, "reiserfs-12321",
+			 "too many thrown buffers");
+}
+
+static void free_thrown(struct tree_balance *tb)
+{
+	int i;
+	b_blocknr_t blocknr;
+	for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) {
+		if (tb->thrown[i]) {
+			blocknr = tb->thrown[i]->b_blocknr;
+			if (buffer_dirty(tb->thrown[i]))
+				reiserfs_warning(tb->tb_sb, "reiserfs-12322",
+						 "called with dirty buffer %d",
+						 blocknr);
+			brelse(tb->thrown[i]);	/* incremented in store_thrown */
+			reiserfs_free_block(tb->transaction_handle, NULL,
+					    blocknr, 0);
+		}
+	}
+}
+
+void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh)
+{
+	struct block_head *blkh;
+	blkh = B_BLK_HEAD(bh);
+	set_blkh_level(blkh, FREE_LEVEL);
+	set_blkh_nr_item(blkh, 0);
+
+	clear_buffer_dirty(bh);
+	store_thrown(tb, bh);
+}
+
+/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/
+void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
+		 struct buffer_head *src, int n_src)
+{
+
+	RFALSE(dest == NULL || src == NULL,
+	       "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
+	       src, dest);
+	RFALSE(!B_IS_KEYS_LEVEL(dest),
+	       "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
+	       dest);
+	RFALSE(n_dest < 0 || n_src < 0,
+	       "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
+	RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
+	       "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
+	       n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
+
+	if (B_IS_ITEMS_LEVEL(src))
+		/* source buffer contains leaf node */
+		memcpy(internal_key(dest, n_dest), item_head(src, n_src),
+		       KEY_SIZE);
+	else
+		memcpy(internal_key(dest, n_dest), internal_key(src, n_src),
+		       KEY_SIZE);
+
+	do_balance_mark_internal_dirty(tb, dest, 0);
+}
+
+int get_left_neighbor_position(struct tree_balance *tb, int h)
+{
+	int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
+
+	RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL,
+	       "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
+	       h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h));
+
+	if (Sh_position == 0)
+		return B_NR_ITEMS(tb->FL[h]);
+	else
+		return Sh_position - 1;
+}
+
+int get_right_neighbor_position(struct tree_balance *tb, int h)
+{
+	int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
+
+	RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL,
+	       "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
+	       h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]);
+
+	if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h)))
+		return 0;
+	else
+		return Sh_position + 1;
+}
+
+#ifdef CONFIG_REISERFS_CHECK
+
+int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
+static void check_internal_node(struct super_block *s, struct buffer_head *bh,
+				char *mes)
+{
+	struct disk_child *dc;
+	int i;
+
+	RFALSE(!bh, "PAP-12336: bh == 0");
+
+	if (!bh || !B_IS_IN_TREE(bh))
+		return;
+
+	RFALSE(!buffer_dirty(bh) &&
+	       !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
+	       "PAP-12337: buffer (%b) must be dirty", bh);
+	dc = B_N_CHILD(bh, 0);
+
+	for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
+		if (!is_reusable(s, dc_block_number(dc), 1)) {
+			print_cur_tb(mes);
+			reiserfs_panic(s, "PAP-12338",
+				       "invalid child pointer %y in %b",
+				       dc, bh);
+		}
+	}
+}
+
+static int locked_or_not_in_tree(struct tree_balance *tb,
+				  struct buffer_head *bh, char *which)
+{
+	if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
+	    !B_IS_IN_TREE(bh)) {
+		reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh);
+		return 1;
+	}
+	return 0;
+}
+
+static int check_before_balancing(struct tree_balance *tb)
+{
+	int retval = 0;
+
+	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
+		reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
+			       "occurred based on cur_tb not being null at "
+			       "this point in code. do_balance cannot properly "
+			       "handle concurrent tree accesses on a same "
+			       "mount point.");
+	}
+
+	/*
+	 * double check that buffers that we will modify are unlocked.
+	 * (fix_nodes should already have prepped all of these for us).
+	 */
+	if (tb->lnum[0]) {
+		retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
+		retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
+		retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]");
+		check_leaf(tb->L[0]);
+	}
+	if (tb->rnum[0]) {
+		retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]");
+		retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]");
+		retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]");
+		check_leaf(tb->R[0]);
+	}
+	retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path),
+					"S[0]");
+	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
+
+	return retval;
+}
+
+static void check_after_balance_leaf(struct tree_balance *tb)
+{
+	if (tb->lnum[0]) {
+		if (B_FREE_SPACE(tb->L[0]) !=
+		    MAX_CHILD_SIZE(tb->L[0]) -
+		    dc_size(B_N_CHILD
+			    (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
+			print_cur_tb("12221");
+			reiserfs_panic(tb->tb_sb, "PAP-12355",
+				       "shift to left was incorrect");
+		}
+	}
+	if (tb->rnum[0]) {
+		if (B_FREE_SPACE(tb->R[0]) !=
+		    MAX_CHILD_SIZE(tb->R[0]) -
+		    dc_size(B_N_CHILD
+			    (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
+			print_cur_tb("12222");
+			reiserfs_panic(tb->tb_sb, "PAP-12360",
+				       "shift to right was incorrect");
+		}
+	}
+	if (PATH_H_PBUFFER(tb->tb_path, 1) &&
+	    (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) !=
+	     (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
+	      dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
+				PATH_H_POSITION(tb->tb_path, 1)))))) {
+		int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0));
+		int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
+			     dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
+					       PATH_H_POSITION(tb->tb_path,
+							       1))));
+		print_cur_tb("12223");
+		reiserfs_warning(tb->tb_sb, "reiserfs-12363",
+				 "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
+				 "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
+				 left,
+				 MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)),
+				 PATH_H_PBUFFER(tb->tb_path, 1),
+				 PATH_H_POSITION(tb->tb_path, 1),
+				 dc_size(B_N_CHILD
+					 (PATH_H_PBUFFER(tb->tb_path, 1),
+					  PATH_H_POSITION(tb->tb_path, 1))),
+				 right);
+		reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect");
+	}
+}
+
+static void check_leaf_level(struct tree_balance *tb)
+{
+	check_leaf(tb->L[0]);
+	check_leaf(tb->R[0]);
+	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
+}
+
+static void check_internal_levels(struct tree_balance *tb)
+{
+	int h;
+
+	/* check all internal nodes */
+	for (h = 1; tb->insert_size[h]; h++) {
+		check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h),
+				    "BAD BUFFER ON PATH");
+		if (tb->lnum[h])
+			check_internal_node(tb->tb_sb, tb->L[h], "BAD L");
+		if (tb->rnum[h])
+			check_internal_node(tb->tb_sb, tb->R[h], "BAD R");
+	}
+
+}
+
+#endif
+
+/*
+ * Now we have all of the buffers that must be used in balancing of
+ * the tree.  We rely on the assumption that schedule() will not occur
+ * while do_balance works. ( Only interrupt handlers are acceptable.)
+ * We balance the tree according to the analysis made before this,
+ * using buffers already obtained.  For SMP support it will someday be
+ * necessary to add ordered locking of tb.
+ */
+
+/*
+ * Some interesting rules of balancing:
+ * we delete a maximum of two nodes per level per balancing: we never
+ * delete R, when we delete two of three nodes L, S, R then we move
+ * them into R.
+ *
+ * we only delete L if we are deleting two nodes, if we delete only
+ * one node we delete S
+ *
+ * if we shift leaves then we shift as much as we can: this is a
+ * deliberate policy of extremism in node packing which results in
+ * higher average utilization after repeated random balance operations
+ * at the cost of more memory copies and more balancing as a result of
+ * small insertions to full nodes.
+ *
+ * if we shift internal nodes we try to evenly balance the node
+ * utilization, with consequent less balancing at the cost of lower
+ * utilization.
+ *
+ * one could argue that the policy for directories in leaves should be
+ * that of internal nodes, but we will wait until another day to
+ * evaluate this....  It would be nice to someday measure and prove
+ * these assumptions as to what is optimal....
+ */
+
+static inline void do_balance_starts(struct tree_balance *tb)
+{
+	/* use print_cur_tb() to see initial state of struct tree_balance */
+
+	/* store_print_tb (tb); */
+
+	/* do not delete, just comment it out */
+	/*
+	print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
+		 tb->tb_path->pos_in_item, tb, "check");
+	*/
+	RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
+#ifdef CONFIG_REISERFS_CHECK
+	REISERFS_SB(tb->tb_sb)->cur_tb = tb;
+#endif
+}
+
+static inline void do_balance_completed(struct tree_balance *tb)
+{
+
+#ifdef CONFIG_REISERFS_CHECK
+	check_leaf_level(tb);
+	check_internal_levels(tb);
+	REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
+#endif
+
+	/*
+	 * reiserfs_free_block is no longer schedule safe.  So, we need to
+	 * put the buffers we want freed on the thrown list during do_balance,
+	 * and then free them now
+	 */
+
+	REISERFS_SB(tb->tb_sb)->s_do_balance++;
+
+	/* release all nodes hold to perform the balancing */
+	unfix_nodes(tb);
+
+	free_thrown(tb);
+}
+
+/*
+ * do_balance - balance the tree
+ *
+ * @tb: tree_balance structure
+ * @ih: item header of inserted item
+ * @body: body of inserted item or bytes to paste
+ * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste
+ *
+ * Cut means delete part of an item (includes removing an entry from a
+ * directory).
+ *
+ * Delete means delete whole item.
+ *
+ * Insert means add a new item into the tree.
+ *
+ * Paste means to append to the end of an existing file or to
+ * insert a directory entry.
+ */
+void do_balance(struct tree_balance *tb, struct item_head *ih,
+		const char *body, int flag)
+{
+	int child_pos;		/* position of a child node in its parent */
+	int h;			/* level of the tree being processed */
+
+	/*
+	 * in our processing of one level we sometimes determine what
+	 * must be inserted into the next higher level.  This insertion
+	 * consists of a key or two keys and their corresponding
+	 * pointers
+	 */
+	struct item_head insert_key[2];
+
+	/* inserted node-ptrs for the next level */
+	struct buffer_head *insert_ptr[2];
+
+	tb->tb_mode = flag;
+	tb->need_balance_dirty = 0;
+
+	if (FILESYSTEM_CHANGED_TB(tb)) {
+		reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has "
+			       "changed");
+	}
+	/* if we have no real work to do  */
+	if (!tb->insert_size[0]) {
+		reiserfs_warning(tb->tb_sb, "PAP-12350",
+				 "insert_size == 0, mode == %c", flag);
+		unfix_nodes(tb);
+		return;
+	}
+
+	atomic_inc(&fs_generation(tb->tb_sb));
+	do_balance_starts(tb);
+
+	/*
+	 * balance_leaf returns 0 except if combining L R and S into
+	 * one node.  see balance_internal() for explanation of this
+	 * line of code.
+	 */
+	child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
+	    balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
+
+#ifdef CONFIG_REISERFS_CHECK
+	check_after_balance_leaf(tb);
+#endif
+
+	/* Balance internal level of the tree. */
+	for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
+		child_pos = balance_internal(tb, h, child_pos, insert_key,
+					     insert_ptr);
+
+	do_balance_completed(tb);
+}
diff --git a/kernel/fs/reiserfs/file.c b/kernel/fs/reiserfs/file.c
new file mode 100644
index 000000000..96a1bcf33
--- /dev/null
+++ b/kernel/fs/reiserfs/file.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/time.h>
+#include "reiserfs.h"
+#include "acl.h"
+#include "xattr.h"
+#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/quotaops.h>
+
+/*
+ * We pack the tails of files on file close, not at the time they are written.
+ * This implies an unnecessary copy of the tail and an unnecessary indirect item
+ * insertion/balancing, for files that are written in one write.
+ * It avoids unnecessary tail packings (balances) for files that are written in
+ * multiple writes and are small enough to have tails.
+ *
+ * file_release is called by the VFS layer when the file is closed.  If
+ * this is the last open file descriptor, and the file
+ * small enough to have a tail, and the tail is currently in an
+ * unformatted node, the tail is converted back into a direct item.
+ *
+ * We use reiserfs_truncate_file to pack the tail, since it already has
+ * all the conditions coded.
+ */
+static int reiserfs_file_release(struct inode *inode, struct file *filp)
+{
+
+	struct reiserfs_transaction_handle th;
+	int err;
+	int jbegin_failure = 0;
+
+	BUG_ON(!S_ISREG(inode->i_mode));
+
+        if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
+		return 0;
+
+	mutex_lock(&REISERFS_I(inode)->tailpack);
+
+        if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
+		mutex_unlock(&REISERFS_I(inode)->tailpack);
+		return 0;
+	}
+
+	/* fast out for when nothing needs to be done */
+	if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
+	     !tail_has_to_be_packed(inode)) &&
+	    REISERFS_I(inode)->i_prealloc_count <= 0) {
+		mutex_unlock(&REISERFS_I(inode)->tailpack);
+		return 0;
+	}
+
+	reiserfs_write_lock(inode->i_sb);
+	/*
+	 * freeing preallocation only involves relogging blocks that
+	 * are already in the current transaction.  preallocation gets
+	 * freed at the end of each transaction, so it is impossible for
+	 * us to log any additional blocks (including quota blocks)
+	 */
+	err = journal_begin(&th, inode->i_sb, 1);
+	if (err) {
+		/*
+		 * uh oh, we can't allow the inode to go away while there
+		 * is still preallocation blocks pending.  Try to join the
+		 * aborted transaction
+		 */
+		jbegin_failure = err;
+		err = journal_join_abort(&th, inode->i_sb);
+
+		if (err) {
+			/*
+			 * hmpf, our choices here aren't good.  We can pin
+			 * the inode which will disallow unmount from ever
+			 * happening, we can do nothing, which will corrupt
+			 * random memory on unmount, or we can forcibly
+			 * remove the file from the preallocation list, which
+			 * will leak blocks on disk.  Lets pin the inode
+			 * and let the admin know what is going on.
+			 */
+			igrab(inode);
+			reiserfs_warning(inode->i_sb, "clm-9001",
+					 "pinning inode %lu because the "
+					 "preallocation can't be freed",
+					 inode->i_ino);
+			goto out;
+		}
+	}
+	reiserfs_update_inode_transaction(inode);
+
+#ifdef REISERFS_PREALLOCATE
+	reiserfs_discard_prealloc(&th, inode);
+#endif
+	err = journal_end(&th);
+
+	/* copy back the error code from journal_begin */
+	if (!err)
+		err = jbegin_failure;
+
+	if (!err &&
+	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
+	    tail_has_to_be_packed(inode)) {
+
+		/*
+		 * if regular file is released by last holder and it has been
+		 * appended (we append by unformatted node only) or its direct
+		 * item(s) had to be converted, then it may have to be
+		 * indirect2direct converted
+		 */
+		err = reiserfs_truncate_file(inode, 0);
+	}
+out:
+	reiserfs_write_unlock(inode->i_sb);
+	mutex_unlock(&REISERFS_I(inode)->tailpack);
+	return err;
+}
+
+static int reiserfs_file_open(struct inode *inode, struct file *file)
+{
+	int err = dquot_file_open(inode, file);
+
+	/* somebody might be tailpacking on final close; wait for it */
+        if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
+		mutex_lock(&REISERFS_I(inode)->tailpack);
+		atomic_inc(&REISERFS_I(inode)->openers);
+		mutex_unlock(&REISERFS_I(inode)->tailpack);
+	}
+	return err;
+}
+
+void reiserfs_vfs_truncate_file(struct inode *inode)
+{
+	mutex_lock(&REISERFS_I(inode)->tailpack);
+	reiserfs_truncate_file(inode, 1);
+	mutex_unlock(&REISERFS_I(inode)->tailpack);
+}
+
+/* Sync a reiserfs file. */
+
+/*
+ * FIXME: sync_mapping_buffers() never has anything to sync.  Can
+ * be removed...
+ */
+
+static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
+			      int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int err;
+	int barrier_done;
+
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
+	BUG_ON(!S_ISREG(inode->i_mode));
+	err = sync_mapping_buffers(inode->i_mapping);
+	reiserfs_write_lock(inode->i_sb);
+	barrier_done = reiserfs_commit_for_inode(inode);
+	reiserfs_write_unlock(inode->i_sb);
+	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+	mutex_unlock(&inode->i_mutex);
+	if (barrier_done < 0)
+		return barrier_done;
+	return (err < 0) ? -EIO : 0;
+}
+
+/* taken fs/buffer.c:__block_commit_write */
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+			 unsigned from, unsigned to)
+{
+	unsigned block_start, block_end;
+	int partial = 0;
+	unsigned blocksize;
+	struct buffer_head *bh, *head;
+	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int new;
+	int logit = reiserfs_file_data_log(inode);
+	struct super_block *s = inode->i_sb;
+	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+	struct reiserfs_transaction_handle th;
+	int ret = 0;
+
+	th.t_trans_id = 0;
+	blocksize = 1 << inode->i_blkbits;
+
+	if (logit) {
+		reiserfs_write_lock(s);
+		ret = journal_begin(&th, s, bh_per_page + 1);
+		if (ret)
+			goto drop_write_lock;
+		reiserfs_update_inode_transaction(inode);
+	}
+	for (bh = head = page_buffers(page), block_start = 0;
+	     bh != head || !block_start;
+	     block_start = block_end, bh = bh->b_this_page) {
+
+		new = buffer_new(bh);
+		clear_buffer_new(bh);
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+		} else {
+			set_buffer_uptodate(bh);
+			if (logit) {
+				reiserfs_prepare_for_journal(s, bh, 1);
+				journal_mark_dirty(&th, bh);
+			} else if (!buffer_dirty(bh)) {
+				mark_buffer_dirty(bh);
+				/*
+				 * do data=ordered on any page past the end
+				 * of file and any buffer marked BH_New.
+				 */
+				if (reiserfs_data_ordered(inode->i_sb) &&
+				    (new || page->index >= i_size_index)) {
+					reiserfs_add_ordered_list(inode, bh);
+				}
+			}
+		}
+	}
+	if (logit) {
+		ret = journal_end(&th);
+drop_write_lock:
+		reiserfs_write_unlock(s);
+	}
+	/*
+	 * If this is a partial write which happened to make all buffers
+	 * uptodate then we can optimize away a bogus readpage() for
+	 * the next read(). Here we 'discover' whether the page went
+	 * uptodate as a result of this (potentially partial) write.
+	 */
+	if (!partial)
+		SetPageUptodate(page);
+	return ret;
+}
+
+const struct file_operations reiserfs_file_operations = {
+	.unlocked_ioctl = reiserfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = reiserfs_compat_ioctl,
+#endif
+	.mmap = generic_file_mmap,
+	.open = reiserfs_file_open,
+	.release = reiserfs_file_release,
+	.fsync = reiserfs_sync_file,
+	.read_iter = generic_file_read_iter,
+	.write_iter = generic_file_write_iter,
+	.splice_read = generic_file_splice_read,
+	.splice_write = iter_file_splice_write,
+	.llseek = generic_file_llseek,
+};
+
+const struct inode_operations reiserfs_file_inode_operations = {
+	.setattr = reiserfs_setattr,
+	.setxattr = reiserfs_setxattr,
+	.getxattr = reiserfs_getxattr,
+	.listxattr = reiserfs_listxattr,
+	.removexattr = reiserfs_removexattr,
+	.permission = reiserfs_permission,
+	.get_acl = reiserfs_get_acl,
+	.set_acl = reiserfs_set_acl,
+};
diff --git a/kernel/fs/reiserfs/fix_node.c b/kernel/fs/reiserfs/fix_node.c
new file mode 100644
index 000000000..6b0ddb2a9
--- /dev/null
+++ b/kernel/fs/reiserfs/fix_node.c
@@ -0,0 +1,2825 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "reiserfs.h"
+#include <linux/buffer_head.h>
+
+/*
+ * To make any changes in the tree we find a node that contains item
+ * to be changed/deleted or position in the node we insert a new item
+ * to. We call this node S. To do balancing we need to decide what we
+ * will shift to left/right neighbor, or to a new node, where new item
+ * will be etc. To make this analysis simpler we build virtual
+ * node. Virtual node is an array of items, that will replace items of
+ * node S. (For instance if we are going to delete an item, virtual
+ * node does not contain it). Virtual node keeps information about
+ * item sizes and types, mergeability of first and last items, sizes
+ * of all entries in directory item. We use this array of items when
+ * calculating what we can shift to neighbors and how many nodes we
+ * have to have if we do not any shiftings, if we shift to left/right
+ * neighbor or to both.
+ */
+
+/*
+ * Takes item number in virtual node, returns number of item
+ * that it has in source buffer
+ */
+static inline int old_item_num(int new_num, int affected_item_num, int mode)
+{
+	if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
+		return new_num;
+
+	if (mode == M_INSERT) {
+
+		RFALSE(new_num == 0,
+		       "vs-8005: for INSERT mode and item number of inserted item");
+
+		return new_num - 1;
+	}
+
+	RFALSE(mode != M_DELETE,
+	       "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
+	       mode);
+	/* delete mode */
+	return new_num + 1;
+}
+
+static void create_virtual_node(struct tree_balance *tb, int h)
+{
+	struct item_head *ih;
+	struct virtual_node *vn = tb->tb_vn;
+	int new_num;
+	struct buffer_head *Sh;	/* this comes from tb->S[h] */
+
+	Sh = PATH_H_PBUFFER(tb->tb_path, h);
+
+	/* size of changed node */
+	vn->vn_size =
+	    MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
+
+	/* for internal nodes array if virtual items is not created */
+	if (h) {
+		vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
+		return;
+	}
+
+	/* number of items in virtual node  */
+	vn->vn_nr_item =
+	    B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
+	    ((vn->vn_mode == M_DELETE) ? 1 : 0);
+
+	/* first virtual item */
+	vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
+	memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
+	vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
+
+	/* first item in the node */
+	ih = item_head(Sh, 0);
+
+	/* define the mergeability for 0-th item (if it is not being deleted) */
+	if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
+	    && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
+		vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
+
+	/*
+	 * go through all items that remain in the virtual
+	 * node (except for the new (inserted) one)
+	 */
+	for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
+		int j;
+		struct virtual_item *vi = vn->vn_vi + new_num;
+		int is_affected =
+		    ((new_num != vn->vn_affected_item_num) ? 0 : 1);
+
+		if (is_affected && vn->vn_mode == M_INSERT)
+			continue;
+
+		/* get item number in source node */
+		j = old_item_num(new_num, vn->vn_affected_item_num,
+				 vn->vn_mode);
+
+		vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
+		vi->vi_ih = ih + j;
+		vi->vi_item = ih_item_body(Sh, ih + j);
+		vi->vi_uarea = vn->vn_free_ptr;
+
+		/*
+		 * FIXME: there is no check that item operation did not
+		 * consume too much memory
+		 */
+		vn->vn_free_ptr +=
+		    op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
+		if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
+			reiserfs_panic(tb->tb_sb, "vs-8030",
+				       "virtual node space consumed");
+
+		if (!is_affected)
+			/* this is not being changed */
+			continue;
+
+		if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
+			vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
+			/* pointer to data which is going to be pasted */
+			vi->vi_new_data = vn->vn_data;
+		}
+	}
+
+	/* virtual inserted item is not defined yet */
+	if (vn->vn_mode == M_INSERT) {
+		struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
+
+		RFALSE(vn->vn_ins_ih == NULL,
+		       "vs-8040: item header of inserted item is not specified");
+		vi->vi_item_len = tb->insert_size[0];
+		vi->vi_ih = vn->vn_ins_ih;
+		vi->vi_item = vn->vn_data;
+		vi->vi_uarea = vn->vn_free_ptr;
+
+		op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
+			     tb->insert_size[0]);
+	}
+
+	/*
+	 * set right merge flag we take right delimiting key and
+	 * check whether it is a mergeable item
+	 */
+	if (tb->CFR[0]) {
+		struct reiserfs_key *key;
+
+		key = internal_key(tb->CFR[0], tb->rkey[0]);
+		if (op_is_left_mergeable(key, Sh->b_size)
+		    && (vn->vn_mode != M_DELETE
+			|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
+			vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
+			    VI_TYPE_RIGHT_MERGEABLE;
+
+#ifdef CONFIG_REISERFS_CHECK
+		if (op_is_left_mergeable(key, Sh->b_size) &&
+		    !(vn->vn_mode != M_DELETE
+		      || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
+			/*
+			 * we delete last item and it could be merged
+			 * with right neighbor's first item
+			 */
+			if (!
+			    (B_NR_ITEMS(Sh) == 1
+			     && is_direntry_le_ih(item_head(Sh, 0))
+			     && ih_entry_count(item_head(Sh, 0)) == 1)) {
+				/*
+				 * node contains more than 1 item, or item
+				 * is not directory item, or this item
+				 * contains more than 1 entry
+				 */
+				print_block(Sh, 0, -1, -1);
+				reiserfs_panic(tb->tb_sb, "vs-8045",
+					       "rdkey %k, affected item==%d "
+					       "(mode==%c) Must be %c",
+					       key, vn->vn_affected_item_num,
+					       vn->vn_mode, M_DELETE);
+			}
+		}
+#endif
+
+	}
+}
+
+/*
+ * Using virtual node check, how many items can be
+ * shifted to left neighbor
+ */
+static void check_left(struct tree_balance *tb, int h, int cur_free)
+{
+	int i;
+	struct virtual_node *vn = tb->tb_vn;
+	struct virtual_item *vi;
+	int d_size, ih_size;
+
+	RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
+
+	/* internal level */
+	if (h > 0) {
+		tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
+		return;
+	}
+
+	/* leaf level */
+
+	if (!cur_free || !vn->vn_nr_item) {
+		/* no free space or nothing to move */
+		tb->lnum[h] = 0;
+		tb->lbytes = -1;
+		return;
+	}
+
+	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
+	       "vs-8055: parent does not exist or invalid");
+
+	vi = vn->vn_vi;
+	if ((unsigned int)cur_free >=
+	    (vn->vn_size -
+	     ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
+		/* all contents of S[0] fits into L[0] */
+
+		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
+		       "vs-8055: invalid mode or balance condition failed");
+
+		tb->lnum[0] = vn->vn_nr_item;
+		tb->lbytes = -1;
+		return;
+	}
+
+	d_size = 0, ih_size = IH_SIZE;
+
+	/* first item may be merge with last item in left neighbor */
+	if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
+		d_size = -((int)IH_SIZE), ih_size = 0;
+
+	tb->lnum[0] = 0;
+	for (i = 0; i < vn->vn_nr_item;
+	     i++, ih_size = IH_SIZE, d_size = 0, vi++) {
+		d_size += vi->vi_item_len;
+		if (cur_free >= d_size) {
+			/* the item can be shifted entirely */
+			cur_free -= d_size;
+			tb->lnum[0]++;
+			continue;
+		}
+
+		/* the item cannot be shifted entirely, try to split it */
+		/*
+		 * check whether L[0] can hold ih and at least one byte
+		 * of the item body
+		 */
+
+		/* cannot shift even a part of the current item */
+		if (cur_free <= ih_size) {
+			tb->lbytes = -1;
+			return;
+		}
+		cur_free -= ih_size;
+
+		tb->lbytes = op_check_left(vi, cur_free, 0, 0);
+		if (tb->lbytes != -1)
+			/* count partially shifted item */
+			tb->lnum[0]++;
+
+		break;
+	}
+
+	return;
+}
+
+/*
+ * Using virtual node check, how many items can be
+ * shifted to right neighbor
+ */
+static void check_right(struct tree_balance *tb, int h, int cur_free)
+{
+	int i;
+	struct virtual_node *vn = tb->tb_vn;
+	struct virtual_item *vi;
+	int d_size, ih_size;
+
+	RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
+
+	/* internal level */
+	if (h > 0) {
+		tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
+		return;
+	}
+
+	/* leaf level */
+
+	if (!cur_free || !vn->vn_nr_item) {
+		/* no free space  */
+		tb->rnum[h] = 0;
+		tb->rbytes = -1;
+		return;
+	}
+
+	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
+	       "vs-8075: parent does not exist or invalid");
+
+	vi = vn->vn_vi + vn->vn_nr_item - 1;
+	if ((unsigned int)cur_free >=
+	    (vn->vn_size -
+	     ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
+		/* all contents of S[0] fits into R[0] */
+
+		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
+		       "vs-8080: invalid mode or balance condition failed");
+
+		tb->rnum[h] = vn->vn_nr_item;
+		tb->rbytes = -1;
+		return;
+	}
+
+	d_size = 0, ih_size = IH_SIZE;
+
+	/* last item may be merge with first item in right neighbor */
+	if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
+		d_size = -(int)IH_SIZE, ih_size = 0;
+
+	tb->rnum[0] = 0;
+	for (i = vn->vn_nr_item - 1; i >= 0;
+	     i--, d_size = 0, ih_size = IH_SIZE, vi--) {
+		d_size += vi->vi_item_len;
+		if (cur_free >= d_size) {
+			/* the item can be shifted entirely */
+			cur_free -= d_size;
+			tb->rnum[0]++;
+			continue;
+		}
+
+		/*
+		 * check whether R[0] can hold ih and at least one
+		 * byte of the item body
+		 */
+
+		/* cannot shift even a part of the current item */
+		if (cur_free <= ih_size) {
+			tb->rbytes = -1;
+			return;
+		}
+
+		/*
+		 * R[0] can hold the header of the item and at least
+		 * one byte of its body
+		 */
+		cur_free -= ih_size;	/* cur_free is still > 0 */
+
+		tb->rbytes = op_check_right(vi, cur_free);
+		if (tb->rbytes != -1)
+			/* count partially shifted item */
+			tb->rnum[0]++;
+
+		break;
+	}
+
+	return;
+}
+
+/*
+ * from - number of items, which are shifted to left neighbor entirely
+ * to - number of item, which are shifted to right neighbor entirely
+ * from_bytes - number of bytes of boundary item (or directory entries)
+ *              which are shifted to left neighbor
+ * to_bytes - number of bytes of boundary item (or directory entries)
+ *            which are shifted to right neighbor
+ */
+static int get_num_ver(int mode, struct tree_balance *tb, int h,
+		       int from, int from_bytes,
+		       int to, int to_bytes, short *snum012, int flow)
+{
+	int i;
+	int cur_free;
+	int units;
+	struct virtual_node *vn = tb->tb_vn;
+	int total_node_size, max_node_size, current_item_size;
+	int needed_nodes;
+
+	/* position of item we start filling node from */
+	int start_item;
+
+	/* position of item we finish filling node by */
+	int end_item;
+
+	/*
+	 * number of first bytes (entries for directory) of start_item-th item
+	 * we do not include into node that is being filled
+	 */
+	int start_bytes;
+
+	/*
+	 * number of last bytes (entries for directory) of end_item-th item
+	 * we do node include into node that is being filled
+	 */
+	int end_bytes;
+
+	/*
+	 * these are positions in virtual item of items, that are split
+	 * between S[0] and S1new and S1new and S2new
+	 */
+	int split_item_positions[2];
+
+	split_item_positions[0] = -1;
+	split_item_positions[1] = -1;
+
+	/*
+	 * We only create additional nodes if we are in insert or paste mode
+	 * or we are in replace mode at the internal level. If h is 0 and
+	 * the mode is M_REPLACE then in fix_nodes we change the mode to
+	 * paste or insert before we get here in the code.
+	 */
+	RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
+	       "vs-8100: insert_size < 0 in overflow");
+
+	max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
+
+	/*
+	 * snum012 [0-2] - number of items, that lay
+	 * to S[0], first new node and second new node
+	 */
+	snum012[3] = -1;	/* s1bytes */
+	snum012[4] = -1;	/* s2bytes */
+
+	/* internal level */
+	if (h > 0) {
+		i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
+		if (i == max_node_size)
+			return 1;
+		return (i / max_node_size + 1);
+	}
+
+	/* leaf level */
+	needed_nodes = 1;
+	total_node_size = 0;
+	cur_free = max_node_size;
+
+	/* start from 'from'-th item */
+	start_item = from;
+	/* skip its first 'start_bytes' units */
+	start_bytes = ((from_bytes != -1) ? from_bytes : 0);
+
+	/* last included item is the 'end_item'-th one */
+	end_item = vn->vn_nr_item - to - 1;
+	/* do not count last 'end_bytes' units of 'end_item'-th item */
+	end_bytes = (to_bytes != -1) ? to_bytes : 0;
+
+	/*
+	 * go through all item beginning from the start_item-th item
+	 * and ending by the end_item-th item. Do not count first
+	 * 'start_bytes' units of 'start_item'-th item and last
+	 * 'end_bytes' of 'end_item'-th item
+	 */
+	for (i = start_item; i <= end_item; i++) {
+		struct virtual_item *vi = vn->vn_vi + i;
+		int skip_from_end = ((i == end_item) ? end_bytes : 0);
+
+		RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
+
+		/* get size of current item */
+		current_item_size = vi->vi_item_len;
+
+		/*
+		 * do not take in calculation head part (from_bytes)
+		 * of from-th item
+		 */
+		current_item_size -=
+		    op_part_size(vi, 0 /*from start */ , start_bytes);
+
+		/* do not take in calculation tail part of last item */
+		current_item_size -=
+		    op_part_size(vi, 1 /*from end */ , skip_from_end);
+
+		/* if item fits into current node entierly */
+		if (total_node_size + current_item_size <= max_node_size) {
+			snum012[needed_nodes - 1]++;
+			total_node_size += current_item_size;
+			start_bytes = 0;
+			continue;
+		}
+
+		/*
+		 * virtual item length is longer, than max size of item in
+		 * a node. It is impossible for direct item
+		 */
+		if (current_item_size > max_node_size) {
+			RFALSE(is_direct_le_ih(vi->vi_ih),
+			       "vs-8110: "
+			       "direct item length is %d. It can not be longer than %d",
+			       current_item_size, max_node_size);
+			/* we will try to split it */
+			flow = 1;
+		}
+
+		/* as we do not split items, take new node and continue */
+		if (!flow) {
+			needed_nodes++;
+			i--;
+			total_node_size = 0;
+			continue;
+		}
+
+		/*
+		 * calculate number of item units which fit into node being
+		 * filled
+		 */
+		{
+			int free_space;
+
+			free_space = max_node_size - total_node_size - IH_SIZE;
+			units =
+			    op_check_left(vi, free_space, start_bytes,
+					  skip_from_end);
+			/*
+			 * nothing fits into current node, take new
+			 * node and continue
+			 */
+			if (units == -1) {
+				needed_nodes++, i--, total_node_size = 0;
+				continue;
+			}
+		}
+
+		/* something fits into the current node */
+		start_bytes += units;
+		snum012[needed_nodes - 1 + 3] = units;
+
+		if (needed_nodes > 2)
+			reiserfs_warning(tb->tb_sb, "vs-8111",
+					 "split_item_position is out of range");
+		snum012[needed_nodes - 1]++;
+		split_item_positions[needed_nodes - 1] = i;
+		needed_nodes++;
+		/* continue from the same item with start_bytes != -1 */
+		start_item = i;
+		i--;
+		total_node_size = 0;
+	}
+
+	/*
+	 * sum012[4] (if it is not -1) contains number of units of which
+	 * are to be in S1new, snum012[3] - to be in S0. They are supposed
+	 * to be S1bytes and S2bytes correspondingly, so recalculate
+	 */
+	if (snum012[4] > 0) {
+		int split_item_num;
+		int bytes_to_r, bytes_to_l;
+		int bytes_to_S1new;
+
+		split_item_num = split_item_positions[1];
+		bytes_to_l =
+		    ((from == split_item_num
+		      && from_bytes != -1) ? from_bytes : 0);
+		bytes_to_r =
+		    ((end_item == split_item_num
+		      && end_bytes != -1) ? end_bytes : 0);
+		bytes_to_S1new =
+		    ((split_item_positions[0] ==
+		      split_item_positions[1]) ? snum012[3] : 0);
+
+		/* s2bytes */
+		snum012[4] =
+		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
+		    bytes_to_r - bytes_to_l - bytes_to_S1new;
+
+		if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
+		    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
+			reiserfs_warning(tb->tb_sb, "vs-8115",
+					 "not directory or indirect item");
+	}
+
+	/* now we know S2bytes, calculate S1bytes */
+	if (snum012[3] > 0) {
+		int split_item_num;
+		int bytes_to_r, bytes_to_l;
+		int bytes_to_S2new;
+
+		split_item_num = split_item_positions[0];
+		bytes_to_l =
+		    ((from == split_item_num
+		      && from_bytes != -1) ? from_bytes : 0);
+		bytes_to_r =
+		    ((end_item == split_item_num
+		      && end_bytes != -1) ? end_bytes : 0);
+		bytes_to_S2new =
+		    ((split_item_positions[0] == split_item_positions[1]
+		      && snum012[4] != -1) ? snum012[4] : 0);
+
+		/* s1bytes */
+		snum012[3] =
+		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
+		    bytes_to_r - bytes_to_l - bytes_to_S2new;
+	}
+
+	return needed_nodes;
+}
+
+
+/*
+ * Set parameters for balancing.
+ * Performs write of results of analysis of balancing into structure tb,
+ * where it will later be used by the functions that actually do the balancing.
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node;
+ *	lnum	number of items from S[h] that must be shifted to L[h];
+ *	rnum	number of items from S[h] that must be shifted to R[h];
+ *	blk_num	number of blocks that S[h] will be splitted into;
+ *	s012	number of items that fall into splitted nodes.
+ *	lbytes	number of bytes which flow to the left neighbor from the
+ *              item that is not not shifted entirely
+ *	rbytes	number of bytes which flow to the right neighbor from the
+ *              item that is not not shifted entirely
+ *	s1bytes	number of bytes which flow to the first  new node when
+ *              S[0] splits (this number is contained in s012 array)
+ */
+
+static void set_parameters(struct tree_balance *tb, int h, int lnum,
+			   int rnum, int blk_num, short *s012, int lb, int rb)
+{
+
+	tb->lnum[h] = lnum;
+	tb->rnum[h] = rnum;
+	tb->blknum[h] = blk_num;
+
+	/* only for leaf level */
+	if (h == 0) {
+		if (s012 != NULL) {
+			tb->s0num = *s012++;
+			tb->snum[0] = *s012++;
+			tb->snum[1] = *s012++;
+			tb->sbytes[0] = *s012++;
+			tb->sbytes[1] = *s012;
+		}
+		tb->lbytes = lb;
+		tb->rbytes = rb;
+	}
+	PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
+	PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
+
+	PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
+	PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
+}
+
+/*
+ * check if node disappears if we shift tb->lnum[0] items to left
+ * neighbor and tb->rnum[0] to the right one.
+ */
+static int is_leaf_removable(struct tree_balance *tb)
+{
+	struct virtual_node *vn = tb->tb_vn;
+	int to_left, to_right;
+	int size;
+	int remain_items;
+
+	/*
+	 * number of items that will be shifted to left (right) neighbor
+	 * entirely
+	 */
+	to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
+	to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
+	remain_items = vn->vn_nr_item;
+
+	/* how many items remain in S[0] after shiftings to neighbors */
+	remain_items -= (to_left + to_right);
+
+	/* all content of node can be shifted to neighbors */
+	if (remain_items < 1) {
+		set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
+			       NULL, -1, -1);
+		return 1;
+	}
+
+	/* S[0] is not removable */
+	if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
+		return 0;
+
+	/* check whether we can divide 1 remaining item between neighbors */
+
+	/* get size of remaining item (in item units) */
+	size = op_unit_num(&vn->vn_vi[to_left]);
+
+	if (tb->lbytes + tb->rbytes >= size) {
+		set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
+			       tb->lbytes, -1);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* check whether L, S, R can be joined in one node */
+static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
+{
+	struct virtual_node *vn = tb->tb_vn;
+	int ih_size;
+	struct buffer_head *S0;
+
+	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
+
+	ih_size = 0;
+	if (vn->vn_nr_item) {
+		if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
+			ih_size += IH_SIZE;
+
+		if (vn->vn_vi[vn->vn_nr_item - 1].
+		    vi_type & VI_TYPE_RIGHT_MERGEABLE)
+			ih_size += IH_SIZE;
+	} else {
+		/* there was only one item and it will be deleted */
+		struct item_head *ih;
+
+		RFALSE(B_NR_ITEMS(S0) != 1,
+		       "vs-8125: item number must be 1: it is %d",
+		       B_NR_ITEMS(S0));
+
+		ih = item_head(S0, 0);
+		if (tb->CFR[0]
+		    && !comp_short_le_keys(&ih->ih_key,
+					   internal_key(tb->CFR[0],
+							  tb->rkey[0])))
+			/*
+			 * Directory must be in correct state here: that is
+			 * somewhere at the left side should exist first
+			 * directory item. But the item being deleted can
+			 * not be that first one because its right neighbor
+			 * is item of the same directory. (But first item
+			 * always gets deleted in last turn). So, neighbors
+			 * of deleted item can be merged, so we can save
+			 * ih_size
+			 */
+			if (is_direntry_le_ih(ih)) {
+				ih_size = IH_SIZE;
+
+				/*
+				 * we might check that left neighbor exists
+				 * and is of the same directory
+				 */
+				RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
+				       "vs-8130: first directory item can not be removed until directory is not empty");
+			}
+
+	}
+
+	if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
+		set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
+		PROC_INFO_INC(tb->tb_sb, leaves_removable);
+		return 1;
+	}
+	return 0;
+
+}
+
+/* when we do not split item, lnum and rnum are numbers of entire items */
+#define SET_PAR_SHIFT_LEFT \
+if (h)\
+{\
+   int to_l;\
+   \
+   to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
+	      (MAX_NR_KEY(Sh) + 1 - lpar);\
+	      \
+	      set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
+}\
+else \
+{\
+   if (lset==LEFT_SHIFT_FLOW)\
+     set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
+		     tb->lbytes, -1);\
+   else\
+     set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
+		     -1, -1);\
+}
+
+#define SET_PAR_SHIFT_RIGHT \
+if (h)\
+{\
+   int to_r;\
+   \
+   to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
+   \
+   set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
+}\
+else \
+{\
+   if (rset==RIGHT_SHIFT_FLOW)\
+     set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
+		  -1, tb->rbytes);\
+   else\
+     set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
+		  -1, -1);\
+}
+
+static void free_buffers_in_tb(struct tree_balance *tb)
+{
+	int i;
+
+	pathrelse(tb->tb_path);
+
+	for (i = 0; i < MAX_HEIGHT; i++) {
+		brelse(tb->L[i]);
+		brelse(tb->R[i]);
+		brelse(tb->FL[i]);
+		brelse(tb->FR[i]);
+		brelse(tb->CFL[i]);
+		brelse(tb->CFR[i]);
+
+		tb->L[i] = NULL;
+		tb->R[i] = NULL;
+		tb->FL[i] = NULL;
+		tb->FR[i] = NULL;
+		tb->CFL[i] = NULL;
+		tb->CFR[i] = NULL;
+	}
+}
+
+/*
+ * Get new buffers for storing new nodes that are created while balancing.
+ * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
+ *	        CARRY_ON - schedule didn't occur while the function worked;
+ *	        NO_DISK_SPACE - no disk space.
+ */
+/* The function is NOT SCHEDULE-SAFE! */
+static int get_empty_nodes(struct tree_balance *tb, int h)
+{
+	struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
+	b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
+	int counter, number_of_freeblk;
+	int  amount_needed;	/* number of needed empty blocks */
+	int  retval = CARRY_ON;
+	struct super_block *sb = tb->tb_sb;
+
+	/*
+	 * number_of_freeblk is the number of empty blocks which have been
+	 * acquired for use by the balancing algorithm minus the number of
+	 * empty blocks used in the previous levels of the analysis,
+	 * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
+	 * occurs after empty blocks are acquired, and the balancing analysis
+	 * is then restarted, amount_needed is the number needed by this
+	 * level (h) of the balancing analysis.
+	 *
+	 * Note that for systems with many processes writing, it would be
+	 * more layout optimal to calculate the total number needed by all
+	 * levels and then to run reiserfs_new_blocks to get all of them at
+	 * once.
+	 */
+
+	/*
+	 * Initiate number_of_freeblk to the amount acquired prior to the
+	 * restart of the analysis or 0 if not restarted, then subtract the
+	 * amount needed by all of the levels of the tree below h.
+	 */
+	/* blknum includes S[h], so we subtract 1 in this calculation */
+	for (counter = 0, number_of_freeblk = tb->cur_blknum;
+	     counter < h; counter++)
+		number_of_freeblk -=
+		    (tb->blknum[counter]) ? (tb->blknum[counter] -
+						   1) : 0;
+
+	/* Allocate missing empty blocks. */
+	/* if Sh == 0  then we are getting a new root */
+	amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
+	/*
+	 * Amount_needed = the amount that we need more than the
+	 * amount that we have.
+	 */
+	if (amount_needed > number_of_freeblk)
+		amount_needed -= number_of_freeblk;
+	else	/* If we have enough already then there is nothing to do. */
+		return CARRY_ON;
+
+	/*
+	 * No need to check quota - is not allocated for blocks used
+	 * for formatted nodes
+	 */
+	if (reiserfs_new_form_blocknrs(tb, blocknrs,
+				       amount_needed) == NO_DISK_SPACE)
+		return NO_DISK_SPACE;
+
+	/* for each blocknumber we just got, get a buffer and stick it on FEB */
+	for (blocknr = blocknrs, counter = 0;
+	     counter < amount_needed; blocknr++, counter++) {
+
+		RFALSE(!*blocknr,
+		       "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
+
+		new_bh = sb_getblk(sb, *blocknr);
+		RFALSE(buffer_dirty(new_bh) ||
+		       buffer_journaled(new_bh) ||
+		       buffer_journal_dirty(new_bh),
+		       "PAP-8140: journaled or dirty buffer %b for the new block",
+		       new_bh);
+
+		/* Put empty buffers into the array. */
+		RFALSE(tb->FEB[tb->cur_blknum],
+		       "PAP-8141: busy slot for new buffer");
+
+		set_buffer_journal_new(new_bh);
+		tb->FEB[tb->cur_blknum++] = new_bh;
+	}
+
+	if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
+		retval = REPEAT_SEARCH;
+
+	return retval;
+}
+
+/*
+ * Get free space of the left neighbor, which is stored in the parent
+ * node of the left neighbor.
+ */
+static int get_lfree(struct tree_balance *tb, int h)
+{
+	struct buffer_head *l, *f;
+	int order;
+
+	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
+	    (l = tb->FL[h]) == NULL)
+		return 0;
+
+	if (f == l)
+		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
+	else {
+		order = B_NR_ITEMS(l);
+		f = l;
+	}
+
+	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
+}
+
+/*
+ * Get free space of the right neighbor,
+ * which is stored in the parent node of the right neighbor.
+ */
+static int get_rfree(struct tree_balance *tb, int h)
+{
+	struct buffer_head *r, *f;
+	int order;
+
+	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
+	    (r = tb->FR[h]) == NULL)
+		return 0;
+
+	if (f == r)
+		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
+	else {
+		order = 0;
+		f = r;
+	}
+
+	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
+
+}
+
+/* Check whether left neighbor is in memory. */
+static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
+{
+	struct buffer_head *father, *left;
+	struct super_block *sb = tb->tb_sb;
+	b_blocknr_t left_neighbor_blocknr;
+	int left_neighbor_position;
+
+	/* Father of the left neighbor does not exist. */
+	if (!tb->FL[h])
+		return 0;
+
+	/* Calculate father of the node to be balanced. */
+	father = PATH_H_PBUFFER(tb->tb_path, h + 1);
+
+	RFALSE(!father ||
+	       !B_IS_IN_TREE(father) ||
+	       !B_IS_IN_TREE(tb->FL[h]) ||
+	       !buffer_uptodate(father) ||
+	       !buffer_uptodate(tb->FL[h]),
+	       "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
+	       father, tb->FL[h]);
+
+	/*
+	 * Get position of the pointer to the left neighbor
+	 * into the left father.
+	 */
+	left_neighbor_position = (father == tb->FL[h]) ?
+	    tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
+	/* Get left neighbor block number. */
+	left_neighbor_blocknr =
+	    B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
+	/* Look for the left neighbor in the cache. */
+	if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
+
+		RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
+		       "vs-8170: left neighbor (%b %z) is not in the tree",
+		       left, left);
+		put_bh(left);
+		return 1;
+	}
+
+	return 0;
+}
+
+#define LEFT_PARENTS  'l'
+#define RIGHT_PARENTS 'r'
+
+static void decrement_key(struct cpu_key *key)
+{
+	/* call item specific function for this key */
+	item_ops[cpu_key_k_type(key)]->decrement_key(key);
+}
+
+/*
+ * Calculate far left/right parent of the left/right neighbor of the
+ * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
+ * of the parent F[h].
+ * Calculate left/right common parent of the current node and L[h]/R[h].
+ * Calculate left/right delimiting key position.
+ * Returns:	PATH_INCORRECT    - path in the tree is not correct
+ *		SCHEDULE_OCCURRED - schedule occurred while the function worked
+ *	        CARRY_ON          - schedule didn't occur while the function
+ *				    worked
+ */
+static int get_far_parent(struct tree_balance *tb,
+			  int h,
+			  struct buffer_head **pfather,
+			  struct buffer_head **pcom_father, char c_lr_par)
+{
+	struct buffer_head *parent;
+	INITIALIZE_PATH(s_path_to_neighbor_father);
+	struct treepath *path = tb->tb_path;
+	struct cpu_key s_lr_father_key;
+	int counter,
+	    position = INT_MAX,
+	    first_last_position = 0,
+	    path_offset = PATH_H_PATH_OFFSET(path, h);
+
+	/*
+	 * Starting from F[h] go upwards in the tree, and look for the common
+	 * ancestor of F[h], and its neighbor l/r, that should be obtained.
+	 */
+
+	counter = path_offset;
+
+	RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
+	       "PAP-8180: invalid path length");
+
+	for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
+		/*
+		 * Check whether parent of the current buffer in the path
+		 * is really parent in the tree.
+		 */
+		if (!B_IS_IN_TREE
+		    (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
+			return REPEAT_SEARCH;
+
+		/* Check whether position in the parent is correct. */
+		if ((position =
+		     PATH_OFFSET_POSITION(path,
+					  counter - 1)) >
+		    B_NR_ITEMS(parent))
+			return REPEAT_SEARCH;
+
+		/*
+		 * Check whether parent at the path really points
+		 * to the child.
+		 */
+		if (B_N_CHILD_NUM(parent, position) !=
+		    PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
+			return REPEAT_SEARCH;
+
+		/*
+		 * Return delimiting key if position in the parent is not
+		 * equal to first/last one.
+		 */
+		if (c_lr_par == RIGHT_PARENTS)
+			first_last_position = B_NR_ITEMS(parent);
+		if (position != first_last_position) {
+			*pcom_father = parent;
+			get_bh(*pcom_father);
+			/*(*pcom_father = parent)->b_count++; */
+			break;
+		}
+	}
+
+	/* if we are in the root of the tree, then there is no common father */
+	if (counter == FIRST_PATH_ELEMENT_OFFSET) {
+		/*
+		 * Check whether first buffer in the path is the
+		 * root of the tree.
+		 */
+		if (PATH_OFFSET_PBUFFER
+		    (tb->tb_path,
+		     FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
+		    SB_ROOT_BLOCK(tb->tb_sb)) {
+			*pfather = *pcom_father = NULL;
+			return CARRY_ON;
+		}
+		return REPEAT_SEARCH;
+	}
+
+	RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
+	       "PAP-8185: (%b %z) level too small",
+	       *pcom_father, *pcom_father);
+
+	/* Check whether the common parent is locked. */
+
+	if (buffer_locked(*pcom_father)) {
+
+		/* Release the write lock while the buffer is busy */
+		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
+		__wait_on_buffer(*pcom_father);
+		reiserfs_write_lock_nested(tb->tb_sb, depth);
+		if (FILESYSTEM_CHANGED_TB(tb)) {
+			brelse(*pcom_father);
+			return REPEAT_SEARCH;
+		}
+	}
+
+	/*
+	 * So, we got common parent of the current node and its
+	 * left/right neighbor.  Now we are getting the parent of the
+	 * left/right neighbor.
+	 */
+
+	/* Form key to get parent of the left/right neighbor. */
+	le_key2cpu_key(&s_lr_father_key,
+		       internal_key(*pcom_father,
+				      (c_lr_par ==
+				       LEFT_PARENTS) ? (tb->lkey[h - 1] =
+							position -
+							1) : (tb->rkey[h -
+									   1] =
+							      position)));
+
+	if (c_lr_par == LEFT_PARENTS)
+		decrement_key(&s_lr_father_key);
+
+	if (search_by_key
+	    (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
+	     h + 1) == IO_ERROR)
+		/* path is released */
+		return IO_ERROR;
+
+	if (FILESYSTEM_CHANGED_TB(tb)) {
+		pathrelse(&s_path_to_neighbor_father);
+		brelse(*pcom_father);
+		return REPEAT_SEARCH;
+	}
+
+	*pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
+
+	RFALSE(B_LEVEL(*pfather) != h + 1,
+	       "PAP-8190: (%b %z) level too small", *pfather, *pfather);
+	RFALSE(s_path_to_neighbor_father.path_length <
+	       FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
+
+	s_path_to_neighbor_father.path_length--;
+	pathrelse(&s_path_to_neighbor_father);
+	return CARRY_ON;
+}
+
+/*
+ * Get parents of neighbors of node in the path(S[path_offset]) and
+ * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
+ * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
+ * CFR[path_offset].
+ * Calculate numbers of left and right delimiting keys position:
+ * lkey[path_offset], rkey[path_offset].
+ * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked
+ *	        CARRY_ON - schedule didn't occur while the function worked
+ */
+static int get_parents(struct tree_balance *tb, int h)
+{
+	struct treepath *path = tb->tb_path;
+	int position,
+	    ret,
+	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
+	struct buffer_head *curf, *curcf;
+
+	/* Current node is the root of the tree or will be root of the tree */
+	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
+		/*
+		 * The root can not have parents.
+		 * Release nodes which previously were obtained as
+		 * parents of the current node neighbors.
+		 */
+		brelse(tb->FL[h]);
+		brelse(tb->CFL[h]);
+		brelse(tb->FR[h]);
+		brelse(tb->CFR[h]);
+		tb->FL[h]  = NULL;
+		tb->CFL[h] = NULL;
+		tb->FR[h]  = NULL;
+		tb->CFR[h] = NULL;
+		return CARRY_ON;
+	}
+
+	/* Get parent FL[path_offset] of L[path_offset]. */
+	position = PATH_OFFSET_POSITION(path, path_offset - 1);
+	if (position) {
+		/* Current node is not the first child of its parent. */
+		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
+		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
+		get_bh(curf);
+		get_bh(curf);
+		tb->lkey[h] = position - 1;
+	} else {
+		/*
+		 * Calculate current parent of L[path_offset], which is the
+		 * left neighbor of the current node.  Calculate current
+		 * common parent of L[path_offset] and the current node.
+		 * Note that CFL[path_offset] not equal FL[path_offset] and
+		 * CFL[path_offset] not equal F[path_offset].
+		 * Calculate lkey[path_offset].
+		 */
+		if ((ret = get_far_parent(tb, h + 1, &curf,
+						  &curcf,
+						  LEFT_PARENTS)) != CARRY_ON)
+			return ret;
+	}
+
+	brelse(tb->FL[h]);
+	tb->FL[h] = curf;	/* New initialization of FL[h]. */
+	brelse(tb->CFL[h]);
+	tb->CFL[h] = curcf;	/* New initialization of CFL[h]. */
+
+	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
+	       (curcf && !B_IS_IN_TREE(curcf)),
+	       "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
+
+	/* Get parent FR[h] of R[h]. */
+
+	/* Current node is the last child of F[h]. FR[h] != F[h]. */
+	if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
+		/*
+		 * Calculate current parent of R[h], which is the right
+		 * neighbor of F[h].  Calculate current common parent of
+		 * R[h] and current node. Note that CFR[h] not equal
+		 * FR[path_offset] and CFR[h] not equal F[h].
+		 */
+		if ((ret =
+		     get_far_parent(tb, h + 1, &curf, &curcf,
+				    RIGHT_PARENTS)) != CARRY_ON)
+			return ret;
+	} else {
+		/* Current node is not the last child of its parent F[h]. */
+		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
+		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
+		get_bh(curf);
+		get_bh(curf);
+		tb->rkey[h] = position;
+	}
+
+	brelse(tb->FR[h]);
+	/* New initialization of FR[path_offset]. */
+	tb->FR[h] = curf;
+
+	brelse(tb->CFR[h]);
+	/* New initialization of CFR[path_offset]. */
+	tb->CFR[h] = curcf;
+
+	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
+	       (curcf && !B_IS_IN_TREE(curcf)),
+	       "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
+
+	return CARRY_ON;
+}
+
+/*
+ * it is possible to remove node as result of shiftings to
+ * neighbors even when we insert or paste item.
+ */
+static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
+				      struct tree_balance *tb, int h)
+{
+	struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
+	int levbytes = tb->insert_size[h];
+	struct item_head *ih;
+	struct reiserfs_key *r_key = NULL;
+
+	ih = item_head(Sh, 0);
+	if (tb->CFR[h])
+		r_key = internal_key(tb->CFR[h], tb->rkey[h]);
+
+	if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
+	    /* shifting may merge items which might save space */
+	    -
+	    ((!h
+	      && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
+	    -
+	    ((!h && r_key
+	      && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
+	    + ((h) ? KEY_SIZE : 0)) {
+		/* node can not be removed */
+		if (sfree >= levbytes) {
+			/* new item fits into node S[h] without any shifting */
+			if (!h)
+				tb->s0num =
+				    B_NR_ITEMS(Sh) +
+				    ((mode == M_INSERT) ? 1 : 0);
+			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+			return NO_BALANCING_NEEDED;
+		}
+	}
+	PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
+	return !NO_BALANCING_NEEDED;
+}
+
+/*
+ * Check whether current node S[h] is balanced when increasing its size by
+ * Inserting or Pasting.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node;
+ *	inum	item number in S[h];
+ *	mode	i - insert, p - paste;
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ */
+/* ip means Inserting or Pasting */
+static int ip_check_balance(struct tree_balance *tb, int h)
+{
+	struct virtual_node *vn = tb->tb_vn;
+	/*
+	 * Number of bytes that must be inserted into (value is negative
+	 * if bytes are deleted) buffer which contains node being balanced.
+	 * The mnemonic is that the attempted change in node space used
+	 * level is levbytes bytes.
+	 */
+	int levbytes;
+	int ret;
+
+	int lfree, sfree, rfree /* free space in L, S and R */ ;
+
+	/*
+	 * nver is short for number of vertixes, and lnver is the number if
+	 * we shift to the left, rnver is the number if we shift to the
+	 * right, and lrnver is the number if we shift in both directions.
+	 * The goal is to minimize first the number of vertixes, and second,
+	 * the number of vertixes whose contents are changed by shifting,
+	 * and third the number of uncached vertixes whose contents are
+	 * changed by shifting and must be read from disk.
+	 */
+	int nver, lnver, rnver, lrnver;
+
+	/*
+	 * used at leaf level only, S0 = S[0] is the node being balanced,
+	 * sInum [ I = 0,1,2 ] is the number of items that will
+	 * remain in node SI after balancing.  S1 and S2 are new
+	 * nodes that might be created.
+	 */
+
+	/*
+	 * we perform 8 calls to get_num_ver().  For each call we
+	 * calculate five parameters.  where 4th parameter is s1bytes
+	 * and 5th - s2bytes
+	 *
+	 * s0num, s1num, s2num for 8 cases
+	 * 0,1 - do not shift and do not shift but bottle
+	 * 2   - shift only whole item to left
+	 * 3   - shift to left and bottle as much as possible
+	 * 4,5 - shift to right (whole items and as much as possible
+	 * 6,7 - shift to both directions (whole items and as much as possible)
+	 */
+	short snum012[40] = { 0, };
+
+	/* Sh is the node whose balance is currently being checked */
+	struct buffer_head *Sh;
+
+	Sh = PATH_H_PBUFFER(tb->tb_path, h);
+	levbytes = tb->insert_size[h];
+
+	/* Calculate balance parameters for creating new root. */
+	if (!Sh) {
+		if (!h)
+			reiserfs_panic(tb->tb_sb, "vs-8210",
+				       "S[0] can not be 0");
+		switch (ret = get_empty_nodes(tb, h)) {
+		/* no balancing for higher levels needed */
+		case CARRY_ON:
+			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+			return NO_BALANCING_NEEDED;
+
+		case NO_DISK_SPACE:
+		case REPEAT_SEARCH:
+			return ret;
+		default:
+			reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
+				       "return value of get_empty_nodes");
+		}
+	}
+
+	/* get parents of S[h] neighbors. */
+	ret = get_parents(tb, h);
+	if (ret != CARRY_ON)
+		return ret;
+
+	sfree = B_FREE_SPACE(Sh);
+
+	/* get free space of neighbors */
+	rfree = get_rfree(tb, h);
+	lfree = get_lfree(tb, h);
+
+	/* and new item fits into node S[h] without any shifting */
+	if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
+	    NO_BALANCING_NEEDED)
+		return NO_BALANCING_NEEDED;
+
+	create_virtual_node(tb, h);
+
+	/*
+	 * determine maximal number of items we can shift to the left
+	 * neighbor (in tb structure) and the maximal number of bytes
+	 * that can flow to the left neighbor from the left most liquid
+	 * item that cannot be shifted from S[0] entirely (returned value)
+	 */
+	check_left(tb, h, lfree);
+
+	/*
+	 * determine maximal number of items we can shift to the right
+	 * neighbor (in tb structure) and the maximal number of bytes
+	 * that can flow to the right neighbor from the right most liquid
+	 * item that cannot be shifted from S[0] entirely (returned value)
+	 */
+	check_right(tb, h, rfree);
+
+	/*
+	 * all contents of internal node S[h] can be moved into its
+	 * neighbors, S[h] will be removed after balancing
+	 */
+	if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
+		int to_r;
+
+		/*
+		 * Since we are working on internal nodes, and our internal
+		 * nodes have fixed size entries, then we can balance by the
+		 * number of items rather than the space they consume.  In this
+		 * routine we set the left node equal to the right node,
+		 * allowing a difference of less than or equal to 1 child
+		 * pointer.
+		 */
+		to_r =
+		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
+		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
+						tb->rnum[h]);
+		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
+			       -1, -1);
+		return CARRY_ON;
+	}
+
+	/*
+	 * this checks balance condition, that any two neighboring nodes
+	 * can not fit in one node
+	 */
+	RFALSE(h &&
+	       (tb->lnum[h] >= vn->vn_nr_item + 1 ||
+		tb->rnum[h] >= vn->vn_nr_item + 1),
+	       "vs-8220: tree is not balanced on internal level");
+	RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
+		      (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
+	       "vs-8225: tree is not balanced on leaf level");
+
+	/*
+	 * all contents of S[0] can be moved into its neighbors
+	 * S[0] will be removed after balancing.
+	 */
+	if (!h && is_leaf_removable(tb))
+		return CARRY_ON;
+
+	/*
+	 * why do we perform this check here rather than earlier??
+	 * Answer: we can win 1 node in some cases above. Moreover we
+	 * checked it above, when we checked, that S[0] is not removable
+	 * in principle
+	 */
+
+	 /* new item fits into node S[h] without any shifting */
+	if (sfree >= levbytes) {
+		if (!h)
+			tb->s0num = vn->vn_nr_item;
+		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+		return NO_BALANCING_NEEDED;
+	}
+
+	{
+		int lpar, rpar, nset, lset, rset, lrset;
+		/* regular overflowing of the node */
+
+		/*
+		 * get_num_ver works in 2 modes (FLOW & NO_FLOW)
+		 * lpar, rpar - number of items we can shift to left/right
+		 *              neighbor (including splitting item)
+		 * nset, lset, rset, lrset - shows, whether flowing items
+		 *                           give better packing
+		 */
+#define FLOW 1
+#define NO_FLOW 0		/* do not any splitting */
+
+		/* we choose one of the following */
+#define NOTHING_SHIFT_NO_FLOW	0
+#define NOTHING_SHIFT_FLOW	5
+#define LEFT_SHIFT_NO_FLOW	10
+#define LEFT_SHIFT_FLOW		15
+#define RIGHT_SHIFT_NO_FLOW	20
+#define RIGHT_SHIFT_FLOW	25
+#define LR_SHIFT_NO_FLOW	30
+#define LR_SHIFT_FLOW		35
+
+		lpar = tb->lnum[h];
+		rpar = tb->rnum[h];
+
+		/*
+		 * calculate number of blocks S[h] must be split into when
+		 * nothing is shifted to the neighbors, as well as number of
+		 * items in each part of the split node (s012 numbers),
+		 * and number of bytes (s1bytes) of the shared drop which
+		 * flow to S1 if any
+		 */
+		nset = NOTHING_SHIFT_NO_FLOW;
+		nver = get_num_ver(vn->vn_mode, tb, h,
+				   0, -1, h ? vn->vn_nr_item : 0, -1,
+				   snum012, NO_FLOW);
+
+		if (!h) {
+			int nver1;
+
+			/*
+			 * note, that in this case we try to bottle
+			 * between S[0] and S1 (S1 - the first new node)
+			 */
+			nver1 = get_num_ver(vn->vn_mode, tb, h,
+					    0, -1, 0, -1,
+					    snum012 + NOTHING_SHIFT_FLOW, FLOW);
+			if (nver > nver1)
+				nset = NOTHING_SHIFT_FLOW, nver = nver1;
+		}
+
+		/*
+		 * calculate number of blocks S[h] must be split into when
+		 * l_shift_num first items and l_shift_bytes of the right
+		 * most liquid item to be shifted are shifted to the left
+		 * neighbor, as well as number of items in each part of the
+		 * splitted node (s012 numbers), and number of bytes
+		 * (s1bytes) of the shared drop which flow to S1 if any
+		 */
+		lset = LEFT_SHIFT_NO_FLOW;
+		lnver = get_num_ver(vn->vn_mode, tb, h,
+				    lpar - ((h || tb->lbytes == -1) ? 0 : 1),
+				    -1, h ? vn->vn_nr_item : 0, -1,
+				    snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
+		if (!h) {
+			int lnver1;
+
+			lnver1 = get_num_ver(vn->vn_mode, tb, h,
+					     lpar -
+					     ((tb->lbytes != -1) ? 1 : 0),
+					     tb->lbytes, 0, -1,
+					     snum012 + LEFT_SHIFT_FLOW, FLOW);
+			if (lnver > lnver1)
+				lset = LEFT_SHIFT_FLOW, lnver = lnver1;
+		}
+
+		/*
+		 * calculate number of blocks S[h] must be split into when
+		 * r_shift_num first items and r_shift_bytes of the left most
+		 * liquid item to be shifted are shifted to the right neighbor,
+		 * as well as number of items in each part of the splitted
+		 * node (s012 numbers), and number of bytes (s1bytes) of the
+		 * shared drop which flow to S1 if any
+		 */
+		rset = RIGHT_SHIFT_NO_FLOW;
+		rnver = get_num_ver(vn->vn_mode, tb, h,
+				    0, -1,
+				    h ? (vn->vn_nr_item - rpar) : (rpar -
+								   ((tb->
+								     rbytes !=
+								     -1) ? 1 :
+								    0)), -1,
+				    snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
+		if (!h) {
+			int rnver1;
+
+			rnver1 = get_num_ver(vn->vn_mode, tb, h,
+					     0, -1,
+					     (rpar -
+					      ((tb->rbytes != -1) ? 1 : 0)),
+					     tb->rbytes,
+					     snum012 + RIGHT_SHIFT_FLOW, FLOW);
+
+			if (rnver > rnver1)
+				rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
+		}
+
+		/*
+		 * calculate number of blocks S[h] must be split into when
+		 * items are shifted in both directions, as well as number
+		 * of items in each part of the splitted node (s012 numbers),
+		 * and number of bytes (s1bytes) of the shared drop which
+		 * flow to S1 if any
+		 */
+		lrset = LR_SHIFT_NO_FLOW;
+		lrnver = get_num_ver(vn->vn_mode, tb, h,
+				     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
+				     -1,
+				     h ? (vn->vn_nr_item - rpar) : (rpar -
+								    ((tb->
+								      rbytes !=
+								      -1) ? 1 :
+								     0)), -1,
+				     snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
+		if (!h) {
+			int lrnver1;
+
+			lrnver1 = get_num_ver(vn->vn_mode, tb, h,
+					      lpar -
+					      ((tb->lbytes != -1) ? 1 : 0),
+					      tb->lbytes,
+					      (rpar -
+					       ((tb->rbytes != -1) ? 1 : 0)),
+					      tb->rbytes,
+					      snum012 + LR_SHIFT_FLOW, FLOW);
+			if (lrnver > lrnver1)
+				lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
+		}
+
+		/*
+		 * Our general shifting strategy is:
+		 * 1) to minimized number of new nodes;
+		 * 2) to minimized number of neighbors involved in shifting;
+		 * 3) to minimized number of disk reads;
+		 */
+
+		/* we can win TWO or ONE nodes by shifting in both directions */
+		if (lrnver < lnver && lrnver < rnver) {
+			RFALSE(h &&
+			       (tb->lnum[h] != 1 ||
+				tb->rnum[h] != 1 ||
+				lrnver != 1 || rnver != 2 || lnver != 2
+				|| h != 1), "vs-8230: bad h");
+			if (lrset == LR_SHIFT_FLOW)
+				set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
+					       lrnver, snum012 + lrset,
+					       tb->lbytes, tb->rbytes);
+			else
+				set_parameters(tb, h,
+					       tb->lnum[h] -
+					       ((tb->lbytes == -1) ? 0 : 1),
+					       tb->rnum[h] -
+					       ((tb->rbytes == -1) ? 0 : 1),
+					       lrnver, snum012 + lrset, -1, -1);
+
+			return CARRY_ON;
+		}
+
+		/*
+		 * if shifting doesn't lead to better packing
+		 * then don't shift
+		 */
+		if (nver == lrnver) {
+			set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
+				       -1);
+			return CARRY_ON;
+		}
+
+		/*
+		 * now we know that for better packing shifting in only one
+		 * direction either to the left or to the right is required
+		 */
+
+		/*
+		 * if shifting to the left is better than
+		 * shifting to the right
+		 */
+		if (lnver < rnver) {
+			SET_PAR_SHIFT_LEFT;
+			return CARRY_ON;
+		}
+
+		/*
+		 * if shifting to the right is better than
+		 * shifting to the left
+		 */
+		if (lnver > rnver) {
+			SET_PAR_SHIFT_RIGHT;
+			return CARRY_ON;
+		}
+
+		/*
+		 * now shifting in either direction gives the same number
+		 * of nodes and we can make use of the cached neighbors
+		 */
+		if (is_left_neighbor_in_cache(tb, h)) {
+			SET_PAR_SHIFT_LEFT;
+			return CARRY_ON;
+		}
+
+		/*
+		 * shift to the right independently on whether the
+		 * right neighbor in cache or not
+		 */
+		SET_PAR_SHIFT_RIGHT;
+		return CARRY_ON;
+	}
+}
+
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Cutting for INTERNAL node of S+tree.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node;
+ *	inum	item number in S[h];
+ *	mode	i - insert, p - paste;
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ *
+ * Note: Items of internal nodes have fixed size, so the balance condition for
+ * the internal part of S+tree is as for the B-trees.
+ */
+static int dc_check_balance_internal(struct tree_balance *tb, int h)
+{
+	struct virtual_node *vn = tb->tb_vn;
+
+	/*
+	 * Sh is the node whose balance is currently being checked,
+	 * and Fh is its father.
+	 */
+	struct buffer_head *Sh, *Fh;
+	int maxsize, ret;
+	int lfree, rfree /* free space in L and R */ ;
+
+	Sh = PATH_H_PBUFFER(tb->tb_path, h);
+	Fh = PATH_H_PPARENT(tb->tb_path, h);
+
+	maxsize = MAX_CHILD_SIZE(Sh);
+
+	/*
+	 * using tb->insert_size[h], which is negative in this case,
+	 * create_virtual_node calculates:
+	 * new_nr_item = number of items node would have if operation is
+	 * performed without balancing (new_nr_item);
+	 */
+	create_virtual_node(tb, h);
+
+	if (!Fh) {		/* S[h] is the root. */
+		/* no balancing for higher levels needed */
+		if (vn->vn_nr_item > 0) {
+			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+			return NO_BALANCING_NEEDED;
+		}
+		/*
+		 * new_nr_item == 0.
+		 * Current root will be deleted resulting in
+		 * decrementing the tree height.
+		 */
+		set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
+		return CARRY_ON;
+	}
+
+	if ((ret = get_parents(tb, h)) != CARRY_ON)
+		return ret;
+
+	/* get free space of neighbors */
+	rfree = get_rfree(tb, h);
+	lfree = get_lfree(tb, h);
+
+	/* determine maximal number of items we can fit into neighbors */
+	check_left(tb, h, lfree);
+	check_right(tb, h, rfree);
+
+	/*
+	 * Balance condition for the internal node is valid.
+	 * In this case we balance only if it leads to better packing.
+	 */
+	if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
+		/*
+		 * Here we join S[h] with one of its neighbors,
+		 * which is impossible with greater values of new_nr_item.
+		 */
+		if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
+			/* All contents of S[h] can be moved to L[h]. */
+			if (tb->lnum[h] >= vn->vn_nr_item + 1) {
+				int n;
+				int order_L;
+
+				order_L =
+				    ((n =
+				      PATH_H_B_ITEM_ORDER(tb->tb_path,
+							  h)) ==
+				     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
+				n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
+				    (DC_SIZE + KEY_SIZE);
+				set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
+					       -1);
+				return CARRY_ON;
+			}
+
+			/* All contents of S[h] can be moved to R[h]. */
+			if (tb->rnum[h] >= vn->vn_nr_item + 1) {
+				int n;
+				int order_R;
+
+				order_R =
+				    ((n =
+				      PATH_H_B_ITEM_ORDER(tb->tb_path,
+							  h)) ==
+				     B_NR_ITEMS(Fh)) ? 0 : n + 1;
+				n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
+				    (DC_SIZE + KEY_SIZE);
+				set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
+					       -1);
+				return CARRY_ON;
+			}
+		}
+
+		/*
+		 * All contents of S[h] can be moved to the neighbors
+		 * (L[h] & R[h]).
+		 */
+		if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
+			int to_r;
+
+			to_r =
+			    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
+			     tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
+			    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
+			set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
+				       0, NULL, -1, -1);
+			return CARRY_ON;
+		}
+
+		/* Balancing does not lead to better packing. */
+		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+		return NO_BALANCING_NEEDED;
+	}
+
+	/*
+	 * Current node contain insufficient number of items.
+	 * Balancing is required.
+	 */
+	/* Check whether we can merge S[h] with left neighbor. */
+	if (tb->lnum[h] >= vn->vn_nr_item + 1)
+		if (is_left_neighbor_in_cache(tb, h)
+		    || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
+			int n;
+			int order_L;
+
+			order_L =
+			    ((n =
+			      PATH_H_B_ITEM_ORDER(tb->tb_path,
+						  h)) ==
+			     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
+			n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
+								      KEY_SIZE);
+			set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
+			return CARRY_ON;
+		}
+
+	/* Check whether we can merge S[h] with right neighbor. */
+	if (tb->rnum[h] >= vn->vn_nr_item + 1) {
+		int n;
+		int order_R;
+
+		order_R =
+		    ((n =
+		      PATH_H_B_ITEM_ORDER(tb->tb_path,
+					  h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
+		n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
+							      KEY_SIZE);
+		set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
+		return CARRY_ON;
+	}
+
+	/* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
+	if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
+		int to_r;
+
+		to_r =
+		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
+		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
+						tb->rnum[h]);
+		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
+			       -1, -1);
+		return CARRY_ON;
+	}
+
+	/* For internal nodes try to borrow item from a neighbor */
+	RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
+
+	/* Borrow one or two items from caching neighbor */
+	if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
+		int from_l;
+
+		from_l =
+		    (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
+		     1) / 2 - (vn->vn_nr_item + 1);
+		set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
+		return CARRY_ON;
+	}
+
+	set_parameters(tb, h, 0,
+		       -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
+			  1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
+	return CARRY_ON;
+}
+
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Truncating for LEAF node of S+tree.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node;
+ *	inum	item number in S[h];
+ *	mode	i - insert, p - paste;
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ */
+static int dc_check_balance_leaf(struct tree_balance *tb, int h)
+{
+	struct virtual_node *vn = tb->tb_vn;
+
+	/*
+	 * Number of bytes that must be deleted from
+	 * (value is negative if bytes are deleted) buffer which
+	 * contains node being balanced.  The mnemonic is that the
+	 * attempted change in node space used level is levbytes bytes.
+	 */
+	int levbytes;
+
+	/* the maximal item size */
+	int maxsize, ret;
+
+	/*
+	 * S0 is the node whose balance is currently being checked,
+	 * and F0 is its father.
+	 */
+	struct buffer_head *S0, *F0;
+	int lfree, rfree /* free space in L and R */ ;
+
+	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
+	F0 = PATH_H_PPARENT(tb->tb_path, 0);
+
+	levbytes = tb->insert_size[h];
+
+	maxsize = MAX_CHILD_SIZE(S0);	/* maximal possible size of an item */
+
+	if (!F0) {		/* S[0] is the root now. */
+
+		RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
+		       "vs-8240: attempt to create empty buffer tree");
+
+		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+		return NO_BALANCING_NEEDED;
+	}
+
+	if ((ret = get_parents(tb, h)) != CARRY_ON)
+		return ret;
+
+	/* get free space of neighbors */
+	rfree = get_rfree(tb, h);
+	lfree = get_lfree(tb, h);
+
+	create_virtual_node(tb, h);
+
+	/* if 3 leaves can be merge to one, set parameters and return */
+	if (are_leaves_removable(tb, lfree, rfree))
+		return CARRY_ON;
+
+	/*
+	 * determine maximal number of items we can shift to the left/right
+	 * neighbor and the maximal number of bytes that can flow to the
+	 * left/right neighbor from the left/right most liquid item that
+	 * cannot be shifted from S[0] entirely
+	 */
+	check_left(tb, h, lfree);
+	check_right(tb, h, rfree);
+
+	/* check whether we can merge S with left neighbor. */
+	if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
+		if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||	/* S can not be merged with R */
+		    !tb->FR[h]) {
+
+			RFALSE(!tb->FL[h],
+			       "vs-8245: dc_check_balance_leaf: FL[h] must exist");
+
+			/* set parameter to merge S[0] with its left neighbor */
+			set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
+			return CARRY_ON;
+		}
+
+	/* check whether we can merge S[0] with right neighbor. */
+	if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
+		set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
+		return CARRY_ON;
+	}
+
+	/*
+	 * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
+	 * Set parameters and return
+	 */
+	if (is_leaf_removable(tb))
+		return CARRY_ON;
+
+	/* Balancing is not required. */
+	tb->s0num = vn->vn_nr_item;
+	set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+	return NO_BALANCING_NEEDED;
+}
+
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Cutting.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node;
+ *	inum	item number in S[h];
+ *	mode	d - delete, c - cut.
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ */
+static int dc_check_balance(struct tree_balance *tb, int h)
+{
+	RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
+	       "vs-8250: S is not initialized");
+
+	if (h)
+		return dc_check_balance_internal(tb, h);
+	else
+		return dc_check_balance_leaf(tb, h);
+}
+
+/*
+ * Check whether current node S[h] is balanced.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *
+ *	tb	tree_balance structure:
+ *
+ *              tb is a large structure that must be read about in the header
+ *		file at the same time as this procedure if the reader is
+ *		to successfully understand this procedure
+ *
+ *	h	current level of the node;
+ *	inum	item number in S[h];
+ *	mode	i - insert, p - paste, d - delete, c - cut.
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ */
+static int check_balance(int mode,
+			 struct tree_balance *tb,
+			 int h,
+			 int inum,
+			 int pos_in_item,
+			 struct item_head *ins_ih, const void *data)
+{
+	struct virtual_node *vn;
+
+	vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
+	vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
+	vn->vn_mode = mode;
+	vn->vn_affected_item_num = inum;
+	vn->vn_pos_in_item = pos_in_item;
+	vn->vn_ins_ih = ins_ih;
+	vn->vn_data = data;
+
+	RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
+	       "vs-8255: ins_ih can not be 0 in insert mode");
+
+	/* Calculate balance parameters when size of node is increasing. */
+	if (tb->insert_size[h] > 0)
+		return ip_check_balance(tb, h);
+
+	/* Calculate balance parameters when  size of node is decreasing. */
+	return dc_check_balance(tb, h);
+}
+
+/* Check whether parent at the path is the really parent of the current node.*/
+static int get_direct_parent(struct tree_balance *tb, int h)
+{
+	struct buffer_head *bh;
+	struct treepath *path = tb->tb_path;
+	int position,
+	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
+
+	/* We are in the root or in the new root. */
+	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
+
+		RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
+		       "PAP-8260: invalid offset in the path");
+
+		if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
+		    b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
+			/* Root is not changed. */
+			PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
+			PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
+			return CARRY_ON;
+		}
+		/* Root is changed and we must recalculate the path. */
+		return REPEAT_SEARCH;
+	}
+
+	/* Parent in the path is not in the tree. */
+	if (!B_IS_IN_TREE
+	    (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
+		return REPEAT_SEARCH;
+
+	if ((position =
+	     PATH_OFFSET_POSITION(path,
+				  path_offset - 1)) > B_NR_ITEMS(bh))
+		return REPEAT_SEARCH;
+
+	/* Parent in the path is not parent of the current node in the tree. */
+	if (B_N_CHILD_NUM(bh, position) !=
+	    PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
+		return REPEAT_SEARCH;
+
+	if (buffer_locked(bh)) {
+		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
+		__wait_on_buffer(bh);
+		reiserfs_write_lock_nested(tb->tb_sb, depth);
+		if (FILESYSTEM_CHANGED_TB(tb))
+			return REPEAT_SEARCH;
+	}
+
+	/*
+	 * Parent in the path is unlocked and really parent
+	 * of the current node.
+	 */
+	return CARRY_ON;
+}
+
+/*
+ * Using lnum[h] and rnum[h] we should determine what neighbors
+ * of S[h] we
+ * need in order to balance S[h], and get them if necessary.
+ * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
+ *	        CARRY_ON - schedule didn't occur while the function worked;
+ */
+static int get_neighbors(struct tree_balance *tb, int h)
+{
+	int child_position,
+	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
+	unsigned long son_number;
+	struct super_block *sb = tb->tb_sb;
+	struct buffer_head *bh;
+	int depth;
+
+	PROC_INFO_INC(sb, get_neighbors[h]);
+
+	if (tb->lnum[h]) {
+		/* We need left neighbor to balance S[h]. */
+		PROC_INFO_INC(sb, need_l_neighbor[h]);
+		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
+
+		RFALSE(bh == tb->FL[h] &&
+		       !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
+		       "PAP-8270: invalid position in the parent");
+
+		child_position =
+		    (bh ==
+		     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
+								       FL[h]);
+		son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
+		depth = reiserfs_write_unlock_nested(tb->tb_sb);
+		bh = sb_bread(sb, son_number);
+		reiserfs_write_lock_nested(tb->tb_sb, depth);
+		if (!bh)
+			return IO_ERROR;
+		if (FILESYSTEM_CHANGED_TB(tb)) {
+			brelse(bh);
+			PROC_INFO_INC(sb, get_neighbors_restart[h]);
+			return REPEAT_SEARCH;
+		}
+
+		RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
+		       child_position > B_NR_ITEMS(tb->FL[h]) ||
+		       B_N_CHILD_NUM(tb->FL[h], child_position) !=
+		       bh->b_blocknr, "PAP-8275: invalid parent");
+		RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
+		RFALSE(!h &&
+		       B_FREE_SPACE(bh) !=
+		       MAX_CHILD_SIZE(bh) -
+		       dc_size(B_N_CHILD(tb->FL[0], child_position)),
+		       "PAP-8290: invalid child size of left neighbor");
+
+		brelse(tb->L[h]);
+		tb->L[h] = bh;
+	}
+
+	/* We need right neighbor to balance S[path_offset]. */
+	if (tb->rnum[h]) {
+		PROC_INFO_INC(sb, need_r_neighbor[h]);
+		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
+
+		RFALSE(bh == tb->FR[h] &&
+		       PATH_OFFSET_POSITION(tb->tb_path,
+					    path_offset) >=
+		       B_NR_ITEMS(bh),
+		       "PAP-8295: invalid position in the parent");
+
+		child_position =
+		    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
+		son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
+		depth = reiserfs_write_unlock_nested(tb->tb_sb);
+		bh = sb_bread(sb, son_number);
+		reiserfs_write_lock_nested(tb->tb_sb, depth);
+		if (!bh)
+			return IO_ERROR;
+		if (FILESYSTEM_CHANGED_TB(tb)) {
+			brelse(bh);
+			PROC_INFO_INC(sb, get_neighbors_restart[h]);
+			return REPEAT_SEARCH;
+		}
+		brelse(tb->R[h]);
+		tb->R[h] = bh;
+
+		RFALSE(!h
+		       && B_FREE_SPACE(bh) !=
+		       MAX_CHILD_SIZE(bh) -
+		       dc_size(B_N_CHILD(tb->FR[0], child_position)),
+		       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
+		       B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
+		       dc_size(B_N_CHILD(tb->FR[0], child_position)));
+
+	}
+	return CARRY_ON;
+}
+
+static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
+{
+	int max_num_of_items;
+	int max_num_of_entries;
+	unsigned long blocksize = sb->s_blocksize;
+
+#define MIN_NAME_LEN 1
+
+	max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
+	max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
+	    (DEH_SIZE + MIN_NAME_LEN);
+
+	return sizeof(struct virtual_node) +
+	    max(max_num_of_items * sizeof(struct virtual_item),
+		sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
+		(max_num_of_entries - 1) * sizeof(__u16));
+}
+
+/*
+ * maybe we should fail balancing we are going to perform when kmalloc
+ * fails several times. But now it will loop until kmalloc gets
+ * required memory
+ */
+static int get_mem_for_virtual_node(struct tree_balance *tb)
+{
+	int check_fs = 0;
+	int size;
+	char *buf;
+
+	size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
+
+	/* we have to allocate more memory for virtual node */
+	if (size > tb->vn_buf_size) {
+		if (tb->vn_buf) {
+			/* free memory allocated before */
+			kfree(tb->vn_buf);
+			/* this is not needed if kfree is atomic */
+			check_fs = 1;
+		}
+
+		/* virtual node requires now more memory */
+		tb->vn_buf_size = size;
+
+		/* get memory for virtual item */
+		buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!buf) {
+			/*
+			 * getting memory with GFP_KERNEL priority may involve
+			 * balancing now (due to indirect_to_direct conversion
+			 * on dcache shrinking). So, release path and collected
+			 * resources here
+			 */
+			free_buffers_in_tb(tb);
+			buf = kmalloc(size, GFP_NOFS);
+			if (!buf) {
+				tb->vn_buf_size = 0;
+			}
+			tb->vn_buf = buf;
+			schedule();
+			return REPEAT_SEARCH;
+		}
+
+		tb->vn_buf = buf;
+	}
+
+	if (check_fs && FILESYSTEM_CHANGED_TB(tb))
+		return REPEAT_SEARCH;
+
+	return CARRY_ON;
+}
+
+#ifdef CONFIG_REISERFS_CHECK
+static void tb_buffer_sanity_check(struct super_block *sb,
+				   struct buffer_head *bh,
+				   const char *descr, int level)
+{
+	if (bh) {
+		if (atomic_read(&(bh->b_count)) <= 0)
+
+			reiserfs_panic(sb, "jmacd-1", "negative or zero "
+				       "reference counter for buffer %s[%d] "
+				       "(%b)", descr, level, bh);
+
+		if (!buffer_uptodate(bh))
+			reiserfs_panic(sb, "jmacd-2", "buffer is not up "
+				       "to date %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (!B_IS_IN_TREE(bh))
+			reiserfs_panic(sb, "jmacd-3", "buffer is not "
+				       "in tree %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_bdev != sb->s_bdev)
+			reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
+				       "device %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_size != sb->s_blocksize)
+			reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
+				       "blocksize %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
+			reiserfs_panic(sb, "jmacd-6", "buffer block "
+				       "number too high %s[%d] (%b)",
+				       descr, level, bh);
+	}
+}
+#else
+static void tb_buffer_sanity_check(struct super_block *sb,
+				   struct buffer_head *bh,
+				   const char *descr, int level)
+{;
+}
+#endif
+
+static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
+{
+	return reiserfs_prepare_for_journal(s, bh, 0);
+}
+
+static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
+{
+	struct buffer_head *locked;
+#ifdef CONFIG_REISERFS_CHECK
+	int repeat_counter = 0;
+#endif
+	int i;
+
+	do {
+
+		locked = NULL;
+
+		for (i = tb->tb_path->path_length;
+		     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
+			if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
+				/*
+				 * if I understand correctly, we can only
+				 * be sure the last buffer in the path is
+				 * in the tree --clm
+				 */
+#ifdef CONFIG_REISERFS_CHECK
+				if (PATH_PLAST_BUFFER(tb->tb_path) ==
+				    PATH_OFFSET_PBUFFER(tb->tb_path, i))
+					tb_buffer_sanity_check(tb->tb_sb,
+							       PATH_OFFSET_PBUFFER
+							       (tb->tb_path,
+								i), "S",
+							       tb->tb_path->
+							       path_length - i);
+#endif
+				if (!clear_all_dirty_bits(tb->tb_sb,
+							  PATH_OFFSET_PBUFFER
+							  (tb->tb_path,
+							   i))) {
+					locked =
+					    PATH_OFFSET_PBUFFER(tb->tb_path,
+								i);
+				}
+			}
+		}
+
+		for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
+		     i++) {
+
+			if (tb->lnum[i]) {
+
+				if (tb->L[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->L[i],
+							       "L", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->L[i]))
+						locked = tb->L[i];
+				}
+
+				if (!locked && tb->FL[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->FL[i],
+							       "FL", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->FL[i]))
+						locked = tb->FL[i];
+				}
+
+				if (!locked && tb->CFL[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->CFL[i],
+							       "CFL", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->CFL[i]))
+						locked = tb->CFL[i];
+				}
+
+			}
+
+			if (!locked && (tb->rnum[i])) {
+
+				if (tb->R[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->R[i],
+							       "R", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->R[i]))
+						locked = tb->R[i];
+				}
+
+				if (!locked && tb->FR[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->FR[i],
+							       "FR", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->FR[i]))
+						locked = tb->FR[i];
+				}
+
+				if (!locked && tb->CFR[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->CFR[i],
+							       "CFR", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->CFR[i]))
+						locked = tb->CFR[i];
+				}
+			}
+		}
+
+		/*
+		 * as far as I can tell, this is not required.  The FEB list
+		 * seems to be full of newly allocated nodes, which will
+		 * never be locked, dirty, or anything else.
+		 * To be safe, I'm putting in the checks and waits in.
+		 * For the moment, they are needed to keep the code in
+		 * journal.c from complaining about the buffer.
+		 * That code is inside CONFIG_REISERFS_CHECK as well.  --clm
+		 */
+		for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
+			if (tb->FEB[i]) {
+				if (!clear_all_dirty_bits
+				    (tb->tb_sb, tb->FEB[i]))
+					locked = tb->FEB[i];
+			}
+		}
+
+		if (locked) {
+			int depth;
+#ifdef CONFIG_REISERFS_CHECK
+			repeat_counter++;
+			if ((repeat_counter % 10000) == 0) {
+				reiserfs_warning(tb->tb_sb, "reiserfs-8200",
+						 "too many iterations waiting "
+						 "for buffer to unlock "
+						 "(%b)", locked);
+
+				/* Don't loop forever.  Try to recover from possible error. */
+
+				return (FILESYSTEM_CHANGED_TB(tb)) ?
+				    REPEAT_SEARCH : CARRY_ON;
+			}
+#endif
+			depth = reiserfs_write_unlock_nested(tb->tb_sb);
+			__wait_on_buffer(locked);
+			reiserfs_write_lock_nested(tb->tb_sb, depth);
+			if (FILESYSTEM_CHANGED_TB(tb))
+				return REPEAT_SEARCH;
+		}
+
+	} while (locked);
+
+	return CARRY_ON;
+}
+
+/*
+ * Prepare for balancing, that is
+ *	get all necessary parents, and neighbors;
+ *	analyze what and where should be moved;
+ *	get sufficient number of new nodes;
+ * Balancing will start only after all resources will be collected at a time.
+ *
+ * When ported to SMP kernels, only at the last moment after all needed nodes
+ * are collected in cache, will the resources be locked using the usual
+ * textbook ordered lock acquisition algorithms.  Note that ensuring that
+ * this code neither write locks what it does not need to write lock nor locks
+ * out of order will be a pain in the butt that could have been avoided.
+ * Grumble grumble. -Hans
+ *
+ * fix is meant in the sense of render unchanging
+ *
+ * Latency might be improved by first gathering a list of what buffers
+ * are needed and then getting as many of them in parallel as possible? -Hans
+ *
+ * Parameters:
+ *	op_mode	i - insert, d - delete, c - cut (truncate), p - paste (append)
+ *	tb	tree_balance structure;
+ *	inum	item number in S[h];
+ *      pos_in_item - comment this if you can
+ *      ins_ih	item head of item being inserted
+ *	data	inserted item or data to be pasted
+ * Returns:	1 - schedule occurred while the function worked;
+ *	        0 - schedule didn't occur while the function worked;
+ *             -1 - if no_disk_space
+ */
+
+int fix_nodes(int op_mode, struct tree_balance *tb,
+	      struct item_head *ins_ih, const void *data)
+{
+	int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
+	int pos_in_item;
+
+	/*
+	 * we set wait_tb_buffers_run when we have to restore any dirty
+	 * bits cleared during wait_tb_buffers_run
+	 */
+	int wait_tb_buffers_run = 0;
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+
+	++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
+
+	pos_in_item = tb->tb_path->pos_in_item;
+
+	tb->fs_gen = get_generation(tb->tb_sb);
+
+	/*
+	 * we prepare and log the super here so it will already be in the
+	 * transaction when do_balance needs to change it.
+	 * This way do_balance won't have to schedule when trying to prepare
+	 * the super for logging
+	 */
+	reiserfs_prepare_for_journal(tb->tb_sb,
+				     SB_BUFFER_WITH_SB(tb->tb_sb), 1);
+	journal_mark_dirty(tb->transaction_handle,
+			   SB_BUFFER_WITH_SB(tb->tb_sb));
+	if (FILESYSTEM_CHANGED_TB(tb))
+		return REPEAT_SEARCH;
+
+	/* if it possible in indirect_to_direct conversion */
+	if (buffer_locked(tbS0)) {
+		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
+		__wait_on_buffer(tbS0);
+		reiserfs_write_lock_nested(tb->tb_sb, depth);
+		if (FILESYSTEM_CHANGED_TB(tb))
+			return REPEAT_SEARCH;
+	}
+#ifdef CONFIG_REISERFS_CHECK
+	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
+		print_cur_tb("fix_nodes");
+		reiserfs_panic(tb->tb_sb, "PAP-8305",
+			       "there is pending do_balance");
+	}
+
+	if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
+		reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
+			       "not uptodate at the beginning of fix_nodes "
+			       "or not in tree (mode %c)",
+			       tbS0, tbS0, op_mode);
+
+	/* Check parameters. */
+	switch (op_mode) {
+	case M_INSERT:
+		if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
+			reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
+				       "item number %d (in S0 - %d) in case "
+				       "of insert", item_num,
+				       B_NR_ITEMS(tbS0));
+		break;
+	case M_PASTE:
+	case M_DELETE:
+	case M_CUT:
+		if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
+			print_block(tbS0, 0, -1, -1);
+			reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
+				       "item number(%d); mode = %c "
+				       "insert_size = %d",
+				       item_num, op_mode,
+				       tb->insert_size[0]);
+		}
+		break;
+	default:
+		reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
+			       "of operation");
+	}
+#endif
+
+	if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
+		/* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
+		return REPEAT_SEARCH;
+
+	/* Starting from the leaf level; for all levels h of the tree. */
+	for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
+		ret = get_direct_parent(tb, h);
+		if (ret != CARRY_ON)
+			goto repeat;
+
+		ret = check_balance(op_mode, tb, h, item_num,
+				    pos_in_item, ins_ih, data);
+		if (ret != CARRY_ON) {
+			if (ret == NO_BALANCING_NEEDED) {
+				/* No balancing for higher levels needed. */
+				ret = get_neighbors(tb, h);
+				if (ret != CARRY_ON)
+					goto repeat;
+				if (h != MAX_HEIGHT - 1)
+					tb->insert_size[h + 1] = 0;
+				/*
+				 * ok, analysis and resource gathering
+				 * are complete
+				 */
+				break;
+			}
+			goto repeat;
+		}
+
+		ret = get_neighbors(tb, h);
+		if (ret != CARRY_ON)
+			goto repeat;
+
+		/*
+		 * No disk space, or schedule occurred and analysis may be
+		 * invalid and needs to be redone.
+		 */
+		ret = get_empty_nodes(tb, h);
+		if (ret != CARRY_ON)
+			goto repeat;
+
+		/*
+		 * We have a positive insert size but no nodes exist on this
+		 * level, this means that we are creating a new root.
+		 */
+		if (!PATH_H_PBUFFER(tb->tb_path, h)) {
+
+			RFALSE(tb->blknum[h] != 1,
+			       "PAP-8350: creating new empty root");
+
+			if (h < MAX_HEIGHT - 1)
+				tb->insert_size[h + 1] = 0;
+		} else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
+			/*
+			 * The tree needs to be grown, so this node S[h]
+			 * which is the root node is split into two nodes,
+			 * and a new node (S[h+1]) will be created to
+			 * become the root node.
+			 */
+			if (tb->blknum[h] > 1) {
+
+				RFALSE(h == MAX_HEIGHT - 1,
+				       "PAP-8355: attempt to create too high of a tree");
+
+				tb->insert_size[h + 1] =
+				    (DC_SIZE +
+				     KEY_SIZE) * (tb->blknum[h] - 1) +
+				    DC_SIZE;
+			} else if (h < MAX_HEIGHT - 1)
+				tb->insert_size[h + 1] = 0;
+		} else
+			tb->insert_size[h + 1] =
+			    (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
+	}
+
+	ret = wait_tb_buffers_until_unlocked(tb);
+	if (ret == CARRY_ON) {
+		if (FILESYSTEM_CHANGED_TB(tb)) {
+			wait_tb_buffers_run = 1;
+			ret = REPEAT_SEARCH;
+			goto repeat;
+		} else {
+			return CARRY_ON;
+		}
+	} else {
+		wait_tb_buffers_run = 1;
+		goto repeat;
+	}
+
+repeat:
+	/*
+	 * fix_nodes was unable to perform its calculation due to
+	 * filesystem got changed under us, lack of free disk space or i/o
+	 * failure. If the first is the case - the search will be
+	 * repeated. For now - free all resources acquired so far except
+	 * for the new allocated nodes
+	 */
+	{
+		int i;
+
+		/* Release path buffers. */
+		if (wait_tb_buffers_run) {
+			pathrelse_and_restore(tb->tb_sb, tb->tb_path);
+		} else {
+			pathrelse(tb->tb_path);
+		}
+		/* brelse all resources collected for balancing */
+		for (i = 0; i < MAX_HEIGHT; i++) {
+			if (wait_tb_buffers_run) {
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->L[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->R[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->FL[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->FR[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->
+								 CFL[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->
+								 CFR[i]);
+			}
+
+			brelse(tb->L[i]);
+			brelse(tb->R[i]);
+			brelse(tb->FL[i]);
+			brelse(tb->FR[i]);
+			brelse(tb->CFL[i]);
+			brelse(tb->CFR[i]);
+
+			tb->L[i] = NULL;
+			tb->R[i] = NULL;
+			tb->FL[i] = NULL;
+			tb->FR[i] = NULL;
+			tb->CFL[i] = NULL;
+			tb->CFR[i] = NULL;
+		}
+
+		if (wait_tb_buffers_run) {
+			for (i = 0; i < MAX_FEB_SIZE; i++) {
+				if (tb->FEB[i])
+					reiserfs_restore_prepared_buffer
+					    (tb->tb_sb, tb->FEB[i]);
+			}
+		}
+		return ret;
+	}
+
+}
+
+void unfix_nodes(struct tree_balance *tb)
+{
+	int i;
+
+	/* Release path buffers. */
+	pathrelse_and_restore(tb->tb_sb, tb->tb_path);
+
+	/* brelse all resources collected for balancing */
+	for (i = 0; i < MAX_HEIGHT; i++) {
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
+
+		brelse(tb->L[i]);
+		brelse(tb->R[i]);
+		brelse(tb->FL[i]);
+		brelse(tb->FR[i]);
+		brelse(tb->CFL[i]);
+		brelse(tb->CFR[i]);
+	}
+
+	/* deal with list of allocated (used and unused) nodes */
+	for (i = 0; i < MAX_FEB_SIZE; i++) {
+		if (tb->FEB[i]) {
+			b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
+			/*
+			 * de-allocated block which was not used by
+			 * balancing and bforget about buffer for it
+			 */
+			brelse(tb->FEB[i]);
+			reiserfs_free_block(tb->transaction_handle, NULL,
+					    blocknr, 0);
+		}
+		if (tb->used[i]) {
+			/* release used as new nodes including a new root */
+			brelse(tb->used[i]);
+		}
+	}
+
+	kfree(tb->vn_buf);
+
+}
diff --git a/kernel/fs/reiserfs/hashes.c b/kernel/fs/reiserfs/hashes.c
new file mode 100644
index 000000000..7a26c4fe6
--- /dev/null
+++ b/kernel/fs/reiserfs/hashes.c
@@ -0,0 +1,177 @@
+
+/*
+ * Keyed 32-bit hash function using TEA in a Davis-Meyer function
+ *   H0 = Key
+ *   Hi = E Mi(Hi-1) + Hi-1
+ *
+ * (see Applied Cryptography, 2nd edition, p448).
+ *
+ * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
+ *
+ * Jeremy has agreed to the contents of reiserfs/README. -Hans
+ * Yura's function is added (04/07/2000)
+ */
+
+#include <linux/kernel.h>
+#include "reiserfs.h"
+#include <asm/types.h>
+
+#define DELTA 0x9E3779B9
+#define FULLROUNDS 10		/* 32 is overkill, 16 is strong crypto */
+#define PARTROUNDS 6		/* 6 gets complete mixing */
+
+/* a, b, c, d - data; h0, h1 - accumulated hash */
+#define TEACORE(rounds)							\
+	do {								\
+		u32 sum = 0;						\
+		int n = rounds;						\
+		u32 b0, b1;						\
+									\
+		b0 = h0;						\
+		b1 = h1;						\
+									\
+		do							\
+		{							\
+			sum += DELTA;					\
+			b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);	\
+			b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);	\
+		} while(--n);						\
+									\
+		h0 += b0;						\
+		h1 += b1;						\
+	} while(0)
+
+u32 keyed_hash(const signed char *msg, int len)
+{
+	u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 };
+
+	u32 h0 = k[0], h1 = k[1];
+	u32 a, b, c, d;
+	u32 pad;
+	int i;
+
+	/*      assert(len >= 0 && len < 256); */
+
+	pad = (u32) len | ((u32) len << 8);
+	pad |= pad << 16;
+
+	while (len >= 16) {
+		a = (u32) msg[0] |
+		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
+		b = (u32) msg[4] |
+		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
+		c = (u32) msg[8] |
+		    (u32) msg[9] << 8 |
+		    (u32) msg[10] << 16 | (u32) msg[11] << 24;
+		d = (u32) msg[12] |
+		    (u32) msg[13] << 8 |
+		    (u32) msg[14] << 16 | (u32) msg[15] << 24;
+
+		TEACORE(PARTROUNDS);
+
+		len -= 16;
+		msg += 16;
+	}
+
+	if (len >= 12) {
+		a = (u32) msg[0] |
+		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
+		b = (u32) msg[4] |
+		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
+		c = (u32) msg[8] |
+		    (u32) msg[9] << 8 |
+		    (u32) msg[10] << 16 | (u32) msg[11] << 24;
+
+		d = pad;
+		for (i = 12; i < len; i++) {
+			d <<= 8;
+			d |= msg[i];
+		}
+	} else if (len >= 8) {
+		a = (u32) msg[0] |
+		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
+		b = (u32) msg[4] |
+		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
+
+		c = d = pad;
+		for (i = 8; i < len; i++) {
+			c <<= 8;
+			c |= msg[i];
+		}
+	} else if (len >= 4) {
+		a = (u32) msg[0] |
+		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
+
+		b = c = d = pad;
+		for (i = 4; i < len; i++) {
+			b <<= 8;
+			b |= msg[i];
+		}
+	} else {
+		a = b = c = d = pad;
+		for (i = 0; i < len; i++) {
+			a <<= 8;
+			a |= msg[i];
+		}
+	}
+
+	TEACORE(FULLROUNDS);
+
+/*	return 0;*/
+	return h0 ^ h1;
+}
+
+/*
+ * What follows in this file is copyright 2000 by Hans Reiser, and the
+ * licensing of what follows is governed by reiserfs/README
+ */
+u32 yura_hash(const signed char *msg, int len)
+{
+	int j, pow;
+	u32 a, c;
+	int i;
+
+	for (pow = 1, i = 1; i < len; i++)
+		pow = pow * 10;
+
+	if (len == 1)
+		a = msg[0] - 48;
+	else
+		a = (msg[0] - 48) * pow;
+
+	for (i = 1; i < len; i++) {
+		c = msg[i] - 48;
+		for (pow = 1, j = i; j < len - 1; j++)
+			pow = pow * 10;
+		a = a + c * pow;
+	}
+
+	for (; i < 40; i++) {
+		c = '0' - 48;
+		for (pow = 1, j = i; j < len - 1; j++)
+			pow = pow * 10;
+		a = a + c * pow;
+	}
+
+	for (; i < 256; i++) {
+		c = i;
+		for (pow = 1, j = i; j < len - 1; j++)
+			pow = pow * 10;
+		a = a + c * pow;
+	}
+
+	a = a << 7;
+	return a;
+}
+
+u32 r5_hash(const signed char *msg, int len)
+{
+	u32 a = 0;
+	while (*msg) {
+		a += *msg << 4;
+		a += *msg >> 4;
+		a *= 11;
+		msg++;
+	}
+	return a;
+}
diff --git a/kernel/fs/reiserfs/ibalance.c b/kernel/fs/reiserfs/ibalance.c
new file mode 100644
index 000000000..b751eea32
--- /dev/null
+++ b/kernel/fs/reiserfs/ibalance.c
@@ -0,0 +1,1160 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include "reiserfs.h"
+#include <linux/buffer_head.h>
+
+/* this is one and only function that is used outside (do_balance.c) */
+int balance_internal(struct tree_balance *,
+		     int, int, struct item_head *, struct buffer_head **);
+
+/*
+ * modes of internal_shift_left, internal_shift_right and
+ * internal_insert_childs
+ */
+#define INTERNAL_SHIFT_FROM_S_TO_L 0
+#define INTERNAL_SHIFT_FROM_R_TO_S 1
+#define INTERNAL_SHIFT_FROM_L_TO_S 2
+#define INTERNAL_SHIFT_FROM_S_TO_R 3
+#define INTERNAL_INSERT_TO_S 4
+#define INTERNAL_INSERT_TO_L 5
+#define INTERNAL_INSERT_TO_R 6
+
+static void internal_define_dest_src_infos(int shift_mode,
+					   struct tree_balance *tb,
+					   int h,
+					   struct buffer_info *dest_bi,
+					   struct buffer_info *src_bi,
+					   int *d_key, struct buffer_head **cf)
+{
+	memset(dest_bi, 0, sizeof(struct buffer_info));
+	memset(src_bi, 0, sizeof(struct buffer_info));
+	/* define dest, src, dest parent, dest position */
+	switch (shift_mode) {
+
+	/* used in internal_shift_left */
+	case INTERNAL_SHIFT_FROM_S_TO_L:
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[h];
+		dest_bi->bi_parent = tb->FL[h];
+		dest_bi->bi_position = get_left_neighbor_position(tb, h);
+		*d_key = tb->lkey[h];
+		*cf = tb->CFL[h];
+		break;
+	case INTERNAL_SHIFT_FROM_L_TO_S:
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->L[h];
+		src_bi->bi_parent = tb->FL[h];
+		src_bi->bi_position = get_left_neighbor_position(tb, h);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		/* dest position is analog of dest->b_item_order */
+		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		*d_key = tb->lkey[h];
+		*cf = tb->CFL[h];
+		break;
+
+	/* used in internal_shift_left */
+	case INTERNAL_SHIFT_FROM_R_TO_S:
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->R[h];
+		src_bi->bi_parent = tb->FR[h];
+		src_bi->bi_position = get_right_neighbor_position(tb, h);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		*d_key = tb->rkey[h];
+		*cf = tb->CFR[h];
+		break;
+
+	case INTERNAL_SHIFT_FROM_S_TO_R:
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[h];
+		dest_bi->bi_parent = tb->FR[h];
+		dest_bi->bi_position = get_right_neighbor_position(tb, h);
+		*d_key = tb->rkey[h];
+		*cf = tb->CFR[h];
+		break;
+
+	case INTERNAL_INSERT_TO_L:
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[h];
+		dest_bi->bi_parent = tb->FL[h];
+		dest_bi->bi_position = get_left_neighbor_position(tb, h);
+		break;
+
+	case INTERNAL_INSERT_TO_S:
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		break;
+
+	case INTERNAL_INSERT_TO_R:
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[h];
+		dest_bi->bi_parent = tb->FR[h];
+		dest_bi->bi_position = get_right_neighbor_position(tb, h);
+		break;
+
+	default:
+		reiserfs_panic(tb->tb_sb, "ibalance-1",
+			       "shift type is unknown (%d)",
+			       shift_mode);
+	}
+}
+
+/*
+ * Insert count node pointers into buffer cur before position to + 1.
+ * Insert count items into buffer cur before position to.
+ * Items and node pointers are specified by inserted and bh respectively.
+ */
+static void internal_insert_childs(struct buffer_info *cur_bi,
+				   int to, int count,
+				   struct item_head *inserted,
+				   struct buffer_head **bh)
+{
+	struct buffer_head *cur = cur_bi->bi_bh;
+	struct block_head *blkh;
+	int nr;
+	struct reiserfs_key *ih;
+	struct disk_child new_dc[2];
+	struct disk_child *dc;
+	int i;
+
+	if (count <= 0)
+		return;
+
+	blkh = B_BLK_HEAD(cur);
+	nr = blkh_nr_item(blkh);
+
+	RFALSE(count > 2, "too many children (%d) are to be inserted", count);
+	RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE),
+	       "no enough free space (%d), needed %d bytes",
+	       B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE));
+
+	/* prepare space for count disk_child */
+	dc = B_N_CHILD(cur, to + 1);
+
+	memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE);
+
+	/* copy to_be_insert disk children */
+	for (i = 0; i < count; i++) {
+		put_dc_size(&new_dc[i],
+			    MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
+		put_dc_block_number(&new_dc[i], bh[i]->b_blocknr);
+	}
+	memcpy(dc, new_dc, DC_SIZE * count);
+
+	/* prepare space for count items  */
+	ih = internal_key(cur, ((to == -1) ? 0 : to));
+
+	memmove(ih + count, ih,
+		(nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
+
+	/* copy item headers (keys) */
+	memcpy(ih, inserted, KEY_SIZE);
+	if (count > 1)
+		memcpy(ih + 1, inserted + 1, KEY_SIZE);
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count);
+	set_blkh_free_space(blkh,
+			    blkh_free_space(blkh) - count * (DC_SIZE +
+							     KEY_SIZE));
+
+	do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
+
+	/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	check_internal(cur);
+	/*&&&&&&&&&&&&&&&&&&&&&&&& */
+
+	if (cur_bi->bi_parent) {
+		struct disk_child *t_dc =
+		    B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
+		do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
+					       0);
+
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+		check_internal(cur_bi->bi_parent);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	}
+
+}
+
+/*
+ * Delete del_num items and node pointers from buffer cur starting from
+ * the first_i'th item and first_p'th pointers respectively.
+ */
+static void internal_delete_pointers_items(struct buffer_info *cur_bi,
+					   int first_p,
+					   int first_i, int del_num)
+{
+	struct buffer_head *cur = cur_bi->bi_bh;
+	int nr;
+	struct block_head *blkh;
+	struct reiserfs_key *key;
+	struct disk_child *dc;
+
+	RFALSE(cur == NULL, "buffer is 0");
+	RFALSE(del_num < 0,
+	       "negative number of items (%d) can not be deleted", del_num);
+	RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1
+	       || first_i < 0,
+	       "first pointer order (%d) < 0 or "
+	       "no so many pointers (%d), only (%d) or "
+	       "first key order %d < 0", first_p, first_p + del_num,
+	       B_NR_ITEMS(cur) + 1, first_i);
+	if (del_num == 0)
+		return;
+
+	blkh = B_BLK_HEAD(cur);
+	nr = blkh_nr_item(blkh);
+
+	if (first_p == 0 && del_num == nr + 1) {
+		RFALSE(first_i != 0,
+		       "1st deleted key must have order 0, not %d", first_i);
+		make_empty_node(cur_bi);
+		return;
+	}
+
+	RFALSE(first_i + del_num > B_NR_ITEMS(cur),
+	       "first_i = %d del_num = %d "
+	       "no so many keys (%d) in the node (%b)(%z)",
+	       first_i, del_num, first_i + del_num, cur, cur);
+
+	/* deleting */
+	dc = B_N_CHILD(cur, first_p);
+
+	memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
+	key = internal_key(cur, first_i);
+	memmove(key, key + del_num,
+		(nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
+						       del_num) * DC_SIZE);
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
+	set_blkh_free_space(blkh,
+			    blkh_free_space(blkh) +
+			    (del_num * (KEY_SIZE + DC_SIZE)));
+
+	do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
+	/*&&&&&&&&&&&&&&&&&&&&&&& */
+	check_internal(cur);
+	/*&&&&&&&&&&&&&&&&&&&&&&& */
+
+	if (cur_bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE)));
+
+		do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
+					       0);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+		check_internal(cur_bi->bi_parent);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	}
+}
+
+/* delete n node pointers and items starting from given position */
+static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
+{
+	int i_from;
+
+	i_from = (from == 0) ? from : from - 1;
+
+	/*
+	 * delete n pointers starting from `from' position in CUR;
+	 * delete n keys starting from 'i_from' position in CUR;
+	 */
+	internal_delete_pointers_items(cur_bi, from, i_from, n);
+}
+
+/*
+ * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer
+ * dest
+ * last_first == FIRST_TO_LAST means that we copy first items
+ *                             from src to tail of dest
+ * last_first == LAST_TO_FIRST means that we copy last items
+ *                             from src to head of dest
+ */
+static void internal_copy_pointers_items(struct buffer_info *dest_bi,
+					 struct buffer_head *src,
+					 int last_first, int cpy_num)
+{
+	/*
+	 * ATTENTION! Number of node pointers in DEST is equal to number
+	 * of items in DEST  as delimiting key have already inserted to
+	 * buffer dest.
+	 */
+	struct buffer_head *dest = dest_bi->bi_bh;
+	int nr_dest, nr_src;
+	int dest_order, src_order;
+	struct block_head *blkh;
+	struct reiserfs_key *key;
+	struct disk_child *dc;
+
+	nr_src = B_NR_ITEMS(src);
+
+	RFALSE(dest == NULL || src == NULL,
+	       "src (%p) or dest (%p) buffer is 0", src, dest);
+	RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
+	       "invalid last_first parameter (%d)", last_first);
+	RFALSE(nr_src < cpy_num - 1,
+	       "no so many items (%d) in src (%d)", cpy_num, nr_src);
+	RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
+	RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
+	       "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
+	       cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
+
+	if (cpy_num == 0)
+		return;
+
+	/* coping */
+	blkh = B_BLK_HEAD(dest);
+	nr_dest = blkh_nr_item(blkh);
+
+	/*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */
+	/*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */
+	(last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order =
+					 nr_src - cpy_num + 1) : (dest_order =
+								  nr_dest,
+								  src_order =
+								  0);
+
+	/* prepare space for cpy_num pointers */
+	dc = B_N_CHILD(dest, dest_order);
+
+	memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
+
+	/* insert pointers */
+	memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
+
+	/* prepare space for cpy_num - 1 item headers */
+	key = internal_key(dest, dest_order);
+	memmove(key + cpy_num - 1, key,
+		KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
+							       cpy_num));
+
+	/* insert headers */
+	memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1));
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
+	set_blkh_free_space(blkh,
+			    blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) +
+						     DC_SIZE * cpy_num));
+
+	do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
+
+	/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	check_internal(dest);
+	/*&&&&&&&&&&&&&&&&&&&&&&&& */
+
+	if (dest_bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) +
+					     DC_SIZE * cpy_num));
+
+		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
+					       0);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+		check_internal(dest_bi->bi_parent);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	}
+
+}
+
+/*
+ * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to
+ * buffer dest.
+ * Delete cpy_num - del_par items and node pointers from buffer src.
+ * last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
+ * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
+ */
+static void internal_move_pointers_items(struct buffer_info *dest_bi,
+					 struct buffer_info *src_bi,
+					 int last_first, int cpy_num,
+					 int del_par)
+{
+	int first_pointer;
+	int first_item;
+
+	internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first,
+				     cpy_num);
+
+	if (last_first == FIRST_TO_LAST) {	/* shift_left occurs */
+		first_pointer = 0;
+		first_item = 0;
+		/*
+		 * delete cpy_num - del_par pointers and keys starting for
+		 * pointers with first_pointer, for key - with first_item
+		 */
+		internal_delete_pointers_items(src_bi, first_pointer,
+					       first_item, cpy_num - del_par);
+	} else {		/* shift_right occurs */
+		int i, j;
+
+		i = (cpy_num - del_par ==
+		     (j =
+		      B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num +
+		    del_par;
+
+		internal_delete_pointers_items(src_bi,
+					       j + 1 - cpy_num + del_par, i,
+					       cpy_num - del_par);
+	}
+}
+
+/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
+static void internal_insert_key(struct buffer_info *dest_bi,
+				/* insert key before key with n_dest number */
+				int dest_position_before,
+				struct buffer_head *src, int src_position)
+{
+	struct buffer_head *dest = dest_bi->bi_bh;
+	int nr;
+	struct block_head *blkh;
+	struct reiserfs_key *key;
+
+	RFALSE(dest == NULL || src == NULL,
+	       "source(%p) or dest(%p) buffer is 0", src, dest);
+	RFALSE(dest_position_before < 0 || src_position < 0,
+	       "source(%d) or dest(%d) key number less than 0",
+	       src_position, dest_position_before);
+	RFALSE(dest_position_before > B_NR_ITEMS(dest) ||
+	       src_position >= B_NR_ITEMS(src),
+	       "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
+	       dest_position_before, B_NR_ITEMS(dest),
+	       src_position, B_NR_ITEMS(src));
+	RFALSE(B_FREE_SPACE(dest) < KEY_SIZE,
+	       "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest));
+
+	blkh = B_BLK_HEAD(dest);
+	nr = blkh_nr_item(blkh);
+
+	/* prepare space for inserting key */
+	key = internal_key(dest, dest_position_before);
+	memmove(key + 1, key,
+		(nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
+
+	/* insert key */
+	memcpy(key, internal_key(src, src_position), KEY_SIZE);
+
+	/* Change dirt, free space, item number fields. */
+
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
+	set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE);
+
+	do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
+
+	if (dest_bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
+		put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE);
+
+		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
+					       0);
+	}
+}
+
+/*
+ * Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
+ * Copy pointer_amount node pointers and pointer_amount - 1 items from
+ * buffer src to buffer dest.
+ * Replace  d_key'th key in buffer cfl.
+ * Delete pointer_amount items and node pointers from buffer src.
+ */
+/* this can be invoked both to shift from S to L and from R to S */
+static void internal_shift_left(
+				/*
+				 * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S
+				 */
+				int mode,
+				struct tree_balance *tb,
+				int h, int pointer_amount)
+{
+	struct buffer_info dest_bi, src_bi;
+	struct buffer_head *cf;
+	int d_key_position;
+
+	internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
+				       &d_key_position, &cf);
+
+	/*printk("pointer_amount = %d\n",pointer_amount); */
+
+	if (pointer_amount) {
+		/*
+		 * insert delimiting key from common father of dest and
+		 * src to node dest into position B_NR_ITEM(dest)
+		 */
+		internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
+				    d_key_position);
+
+		if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
+			if (src_bi.bi_position /*src->b_item_order */  == 0)
+				replace_key(tb, cf, d_key_position,
+					    src_bi.
+					    bi_parent /*src->b_parent */ , 0);
+		} else
+			replace_key(tb, cf, d_key_position, src_bi.bi_bh,
+				    pointer_amount - 1);
+	}
+	/* last parameter is del_parameter */
+	internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
+				     pointer_amount, 0);
+
+}
+
+/*
+ * Insert delimiting key to L[h].
+ * Copy n node pointers and n - 1 items from buffer S[h] to L[h].
+ * Delete n - 1 items and node pointers from buffer S[h].
+ */
+/* it always shifts from S[h] to L[h] */
+static void internal_shift1_left(struct tree_balance *tb,
+				 int h, int pointer_amount)
+{
+	struct buffer_info dest_bi, src_bi;
+	struct buffer_head *cf;
+	int d_key_position;
+
+	internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
+				       &dest_bi, &src_bi, &d_key_position, &cf);
+
+	/* insert lkey[h]-th key  from CFL[h] to left neighbor L[h] */
+	if (pointer_amount > 0)
+		internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
+				    d_key_position);
+
+	/* last parameter is del_parameter */
+	internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
+				     pointer_amount, 1);
+}
+
+/*
+ * Insert d_key'th (delimiting) key from buffer cfr to head of dest.
+ * Copy n node pointers and n - 1 items from buffer src to buffer dest.
+ * Replace  d_key'th key in buffer cfr.
+ * Delete n items and node pointers from buffer src.
+ */
+static void internal_shift_right(
+				 /*
+				  * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S
+				  */
+				 int mode,
+				 struct tree_balance *tb,
+				 int h, int pointer_amount)
+{
+	struct buffer_info dest_bi, src_bi;
+	struct buffer_head *cf;
+	int d_key_position;
+	int nr;
+
+	internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
+				       &d_key_position, &cf);
+
+	nr = B_NR_ITEMS(src_bi.bi_bh);
+
+	if (pointer_amount > 0) {
+		/*
+		 * insert delimiting key from common father of dest
+		 * and src to dest node into position 0
+		 */
+		internal_insert_key(&dest_bi, 0, cf, d_key_position);
+		if (nr == pointer_amount - 1) {
+			RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
+			       dest_bi.bi_bh != tb->R[h],
+			       "src (%p) must be == tb->S[h](%p) when it disappears",
+			       src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h));
+			/* when S[h] disappers replace left delemiting key as well */
+			if (tb->CFL[h])
+				replace_key(tb, cf, d_key_position, tb->CFL[h],
+					    tb->lkey[h]);
+		} else
+			replace_key(tb, cf, d_key_position, src_bi.bi_bh,
+				    nr - pointer_amount);
+	}
+
+	/* last parameter is del_parameter */
+	internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
+				     pointer_amount, 0);
+}
+
+/*
+ * Insert delimiting key to R[h].
+ * Copy n node pointers and n - 1 items from buffer S[h] to R[h].
+ * Delete n - 1 items and node pointers from buffer S[h].
+ */
+/* it always shift from S[h] to R[h] */
+static void internal_shift1_right(struct tree_balance *tb,
+				  int h, int pointer_amount)
+{
+	struct buffer_info dest_bi, src_bi;
+	struct buffer_head *cf;
+	int d_key_position;
+
+	internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
+				       &dest_bi, &src_bi, &d_key_position, &cf);
+
+	/* insert rkey from CFR[h] to right neighbor R[h] */
+	if (pointer_amount > 0)
+		internal_insert_key(&dest_bi, 0, cf, d_key_position);
+
+	/* last parameter is del_parameter */
+	internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
+				     pointer_amount, 1);
+}
+
+/*
+ * Delete insert_num node pointers together with their left items
+ * and balance current node.
+ */
+static void balance_internal_when_delete(struct tree_balance *tb,
+					 int h, int child_pos)
+{
+	int insert_num;
+	int n;
+	struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
+	struct buffer_info bi;
+
+	insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
+
+	/* delete child-node-pointer(s) together with their left item(s) */
+	bi.tb = tb;
+	bi.bi_bh = tbSh;
+	bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+	bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+
+	internal_delete_childs(&bi, child_pos, -insert_num);
+
+	RFALSE(tb->blknum[h] > 1,
+	       "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
+
+	n = B_NR_ITEMS(tbSh);
+
+	if (tb->lnum[h] == 0 && tb->rnum[h] == 0) {
+		if (tb->blknum[h] == 0) {
+			/* node S[h] (root of the tree) is empty now */
+			struct buffer_head *new_root;
+
+			RFALSE(n
+			       || B_FREE_SPACE(tbSh) !=
+			       MAX_CHILD_SIZE(tbSh) - DC_SIZE,
+			       "buffer must have only 0 keys (%d)", n);
+			RFALSE(bi.bi_parent, "root has parent (%p)",
+			       bi.bi_parent);
+
+			/* choose a new root */
+			if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1]))
+				new_root = tb->R[h - 1];
+			else
+				new_root = tb->L[h - 1];
+			/*
+			 * switch super block's tree root block
+			 * number to the new value */
+			PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
+			/*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */
+			PUT_SB_TREE_HEIGHT(tb->tb_sb,
+					   SB_TREE_HEIGHT(tb->tb_sb) - 1);
+
+			do_balance_mark_sb_dirty(tb,
+						 REISERFS_SB(tb->tb_sb)->s_sbh,
+						 1);
+			/*&&&&&&&&&&&&&&&&&&&&&& */
+			/* use check_internal if new root is an internal node */
+			if (h > 1)
+				check_internal(new_root);
+			/*&&&&&&&&&&&&&&&&&&&&&& */
+
+			/* do what is needed for buffer thrown from tree */
+			reiserfs_invalidate_buffer(tb, tbSh);
+			return;
+		}
+		return;
+	}
+
+	/* join S[h] with L[h] */
+	if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {
+
+		RFALSE(tb->rnum[h] != 0,
+		       "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
+		       h, tb->rnum[h]);
+
+		internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
+		reiserfs_invalidate_buffer(tb, tbSh);
+
+		return;
+	}
+
+	/* join S[h] with R[h] */
+	if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {
+		RFALSE(tb->lnum[h] != 0,
+		       "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
+		       h, tb->lnum[h]);
+
+		internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
+
+		reiserfs_invalidate_buffer(tb, tbSh);
+		return;
+	}
+
+	/* borrow from left neighbor L[h] */
+	if (tb->lnum[h] < 0) {
+		RFALSE(tb->rnum[h] != 0,
+		       "wrong tb->rnum[%d]==%d when borrow from L[h]", h,
+		       tb->rnum[h]);
+		internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
+				     -tb->lnum[h]);
+		return;
+	}
+
+	/* borrow from right neighbor R[h] */
+	if (tb->rnum[h] < 0) {
+		RFALSE(tb->lnum[h] != 0,
+		       "invalid tb->lnum[%d]==%d when borrow from R[h]",
+		       h, tb->lnum[h]);
+		internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);	/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */
+		return;
+	}
+
+	/* split S[h] into two parts and put them into neighbors */
+	if (tb->lnum[h] > 0) {
+		RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
+		       "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
+		       h, tb->lnum[h], h, tb->rnum[h], n);
+
+		internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);	/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */
+		internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
+				     tb->rnum[h]);
+
+		reiserfs_invalidate_buffer(tb, tbSh);
+
+		return;
+	}
+	reiserfs_panic(tb->tb_sb, "ibalance-2",
+		       "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
+		       h, tb->lnum[h], h, tb->rnum[h]);
+}
+
+/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/
+static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
+{
+	RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL,
+	       "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
+	       tb->L[h], tb->CFL[h]);
+
+	if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
+		return;
+
+	memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
+
+	do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
+}
+
+/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/
+static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
+{
+	RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL,
+	       "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
+	       tb->R[h], tb->CFR[h]);
+	RFALSE(B_NR_ITEMS(tb->R[h]) == 0,
+	       "R[h] can not be empty if it exists (item number=%d)",
+	       B_NR_ITEMS(tb->R[h]));
+
+	memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
+
+	do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
+}
+
+
+/*
+ * if inserting/pasting {
+ *   child_pos is the position of the node-pointer in S[h] that
+ *   pointed to S[h-1] before balancing of the h-1 level;
+ *   this means that new pointers and items must be inserted AFTER
+ *   child_pos
+ * } else {
+ *   it is the position of the leftmost pointer that must be deleted
+ *   (together with its corresponding key to the left of the pointer)
+ *   as a result of the previous level's balancing.
+ * }
+ */
+
+int balance_internal(struct tree_balance *tb,
+		     int h,	/* level of the tree */
+		     int child_pos,
+		     /* key for insertion on higher level    */
+		     struct item_head *insert_key,
+		     /* node for insertion on higher level */
+		     struct buffer_head **insert_ptr)
+{
+	struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
+	struct buffer_info bi;
+
+	/*
+	 * we return this: it is 0 if there is no S[h],
+	 * else it is tb->S[h]->b_item_order
+	 */
+	int order;
+	int insert_num, n, k;
+	struct buffer_head *S_new;
+	struct item_head new_insert_key;
+	struct buffer_head *new_insert_ptr = NULL;
+	struct item_head *new_insert_key_addr = insert_key;
+
+	RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h);
+
+	PROC_INFO_INC(tb->tb_sb, balance_at[h]);
+
+	order =
+	    (tbSh) ? PATH_H_POSITION(tb->tb_path,
+				     h + 1) /*tb->S[h]->b_item_order */ : 0;
+
+	/*
+	 * Using insert_size[h] calculate the number insert_num of items
+	 * that must be inserted to or deleted from S[h].
+	 */
+	insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
+
+	/* Check whether insert_num is proper * */
+	RFALSE(insert_num < -2 || insert_num > 2,
+	       "incorrect number of items inserted to the internal node (%d)",
+	       insert_num);
+	RFALSE(h > 1 && (insert_num > 1 || insert_num < -1),
+	       "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
+	       insert_num, h);
+
+	/* Make balance in case insert_num < 0 */
+	if (insert_num < 0) {
+		balance_internal_when_delete(tb, h, child_pos);
+		return order;
+	}
+
+	k = 0;
+	if (tb->lnum[h] > 0) {
+		/*
+		 * shift lnum[h] items from S[h] to the left neighbor L[h].
+		 * check how many of new items fall into L[h] or CFL[h] after
+		 * shifting
+		 */
+		n = B_NR_ITEMS(tb->L[h]);	/* number of items in L[h] */
+		if (tb->lnum[h] <= child_pos) {
+			/* new items don't fall into L[h] or CFL[h] */
+			internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
+					    tb->lnum[h]);
+			child_pos -= tb->lnum[h];
+		} else if (tb->lnum[h] > child_pos + insert_num) {
+			/* all new items fall into L[h] */
+			internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
+					    tb->lnum[h] - insert_num);
+			/* insert insert_num keys and node-pointers into L[h] */
+			bi.tb = tb;
+			bi.bi_bh = tb->L[h];
+			bi.bi_parent = tb->FL[h];
+			bi.bi_position = get_left_neighbor_position(tb, h);
+			internal_insert_childs(&bi,
+					       /*tb->L[h], tb->S[h-1]->b_next */
+					       n + child_pos + 1,
+					       insert_num, insert_key,
+					       insert_ptr);
+
+			insert_num = 0;
+		} else {
+			struct disk_child *dc;
+
+			/*
+			 * some items fall into L[h] or CFL[h],
+			 * but some don't fall
+			 */
+			internal_shift1_left(tb, h, child_pos + 1);
+			/* calculate number of new items that fall into L[h] */
+			k = tb->lnum[h] - child_pos - 1;
+			bi.tb = tb;
+			bi.bi_bh = tb->L[h];
+			bi.bi_parent = tb->FL[h];
+			bi.bi_position = get_left_neighbor_position(tb, h);
+			internal_insert_childs(&bi,
+					       /*tb->L[h], tb->S[h-1]->b_next, */
+					       n + child_pos + 1, k,
+					       insert_key, insert_ptr);
+
+			replace_lkey(tb, h, insert_key + k);
+
+			/*
+			 * replace the first node-ptr in S[h] by
+			 * node-ptr to insert_ptr[k]
+			 */
+			dc = B_N_CHILD(tbSh, 0);
+			put_dc_size(dc,
+				    MAX_CHILD_SIZE(insert_ptr[k]) -
+				    B_FREE_SPACE(insert_ptr[k]));
+			put_dc_block_number(dc, insert_ptr[k]->b_blocknr);
+
+			do_balance_mark_internal_dirty(tb, tbSh, 0);
+
+			k++;
+			insert_key += k;
+			insert_ptr += k;
+			insert_num -= k;
+			child_pos = 0;
+		}
+	}
+	/* tb->lnum[h] > 0 */
+	if (tb->rnum[h] > 0) {
+		/*shift rnum[h] items from S[h] to the right neighbor R[h] */
+		/*
+		 * check how many of new items fall into R or CFR
+		 * after shifting
+		 */
+		n = B_NR_ITEMS(tbSh);	/* number of items in S[h] */
+		if (n - tb->rnum[h] >= child_pos)
+			/* new items fall into S[h] */
+			internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
+					     tb->rnum[h]);
+		else if (n + insert_num - tb->rnum[h] < child_pos) {
+			/* all new items fall into R[h] */
+			internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
+					     tb->rnum[h] - insert_num);
+
+			/* insert insert_num keys and node-pointers into R[h] */
+			bi.tb = tb;
+			bi.bi_bh = tb->R[h];
+			bi.bi_parent = tb->FR[h];
+			bi.bi_position = get_right_neighbor_position(tb, h);
+			internal_insert_childs(&bi,
+					       /*tb->R[h],tb->S[h-1]->b_next */
+					       child_pos - n - insert_num +
+					       tb->rnum[h] - 1,
+					       insert_num, insert_key,
+					       insert_ptr);
+			insert_num = 0;
+		} else {
+			struct disk_child *dc;
+
+			/* one of the items falls into CFR[h] */
+			internal_shift1_right(tb, h, n - child_pos + 1);
+			/* calculate number of new items that fall into R[h] */
+			k = tb->rnum[h] - n + child_pos - 1;
+			bi.tb = tb;
+			bi.bi_bh = tb->R[h];
+			bi.bi_parent = tb->FR[h];
+			bi.bi_position = get_right_neighbor_position(tb, h);
+			internal_insert_childs(&bi,
+					       /*tb->R[h], tb->R[h]->b_child, */
+					       0, k, insert_key + 1,
+					       insert_ptr + 1);
+
+			replace_rkey(tb, h, insert_key + insert_num - k - 1);
+
+			/*
+			 * replace the first node-ptr in R[h] by
+			 * node-ptr insert_ptr[insert_num-k-1]
+			 */
+			dc = B_N_CHILD(tb->R[h], 0);
+			put_dc_size(dc,
+				    MAX_CHILD_SIZE(insert_ptr
+						   [insert_num - k - 1]) -
+				    B_FREE_SPACE(insert_ptr
+						 [insert_num - k - 1]));
+			put_dc_block_number(dc,
+					    insert_ptr[insert_num - k -
+						       1]->b_blocknr);
+
+			do_balance_mark_internal_dirty(tb, tb->R[h], 0);
+
+			insert_num -= (k + 1);
+		}
+	}
+
+	/** Fill new node that appears instead of S[h] **/
+	RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
+	RFALSE(tb->blknum[h] < 0, "blknum can not be < 0");
+
+	if (!tb->blknum[h]) {	/* node S[h] is empty now */
+		RFALSE(!tbSh, "S[h] is equal NULL");
+
+		/* do what is needed for buffer thrown from tree */
+		reiserfs_invalidate_buffer(tb, tbSh);
+		return order;
+	}
+
+	if (!tbSh) {
+		/* create new root */
+		struct disk_child *dc;
+		struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1);
+		struct block_head *blkh;
+
+		if (tb->blknum[h] != 1)
+			reiserfs_panic(NULL, "ibalance-3", "One new node "
+				       "required for creating the new root");
+		/* S[h] = empty buffer from the list FEB. */
+		tbSh = get_FEB(tb);
+		blkh = B_BLK_HEAD(tbSh);
+		set_blkh_level(blkh, h + 1);
+
+		/* Put the unique node-pointer to S[h] that points to S[h-1]. */
+
+		dc = B_N_CHILD(tbSh, 0);
+		put_dc_block_number(dc, tbSh_1->b_blocknr);
+		put_dc_size(dc,
+			    (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1)));
+
+		tb->insert_size[h] -= DC_SIZE;
+		set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE);
+
+		do_balance_mark_internal_dirty(tb, tbSh, 0);
+
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+		check_internal(tbSh);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+
+		/* put new root into path structure */
+		PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) =
+		    tbSh;
+
+		/* Change root in structure super block. */
+		PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr);
+		PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1);
+		do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
+	}
+
+	if (tb->blknum[h] == 2) {
+		int snum;
+		struct buffer_info dest_bi, src_bi;
+
+		/* S_new = free buffer from list FEB */
+		S_new = get_FEB(tb);
+
+		set_blkh_level(B_BLK_HEAD(S_new), h + 1);
+
+		dest_bi.tb = tb;
+		dest_bi.bi_bh = S_new;
+		dest_bi.bi_parent = NULL;
+		dest_bi.bi_position = 0;
+		src_bi.tb = tb;
+		src_bi.bi_bh = tbSh;
+		src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+
+		n = B_NR_ITEMS(tbSh);	/* number of items in S[h] */
+		snum = (insert_num + n + 1) / 2;
+		if (n - snum >= child_pos) {
+			/* new items don't fall into S_new */
+			/*  store the delimiting key for the next level */
+			/* new_insert_key = (n - snum)'th key in S[h] */
+			memcpy(&new_insert_key, internal_key(tbSh, n - snum),
+			       KEY_SIZE);
+			/* last parameter is del_par */
+			internal_move_pointers_items(&dest_bi, &src_bi,
+						     LAST_TO_FIRST, snum, 0);
+		} else if (n + insert_num - snum < child_pos) {
+			/* all new items fall into S_new */
+			/*  store the delimiting key for the next level */
+			/*
+			 * new_insert_key = (n + insert_item - snum)'th
+			 * key in S[h]
+			 */
+			memcpy(&new_insert_key,
+			       internal_key(tbSh, n + insert_num - snum),
+			       KEY_SIZE);
+			/* last parameter is del_par */
+			internal_move_pointers_items(&dest_bi, &src_bi,
+						     LAST_TO_FIRST,
+						     snum - insert_num, 0);
+
+			/*
+			 * insert insert_num keys and node-pointers
+			 * into S_new
+			 */
+			internal_insert_childs(&dest_bi,
+					       /*S_new,tb->S[h-1]->b_next, */
+					       child_pos - n - insert_num +
+					       snum - 1,
+					       insert_num, insert_key,
+					       insert_ptr);
+
+			insert_num = 0;
+		} else {
+			struct disk_child *dc;
+
+			/* some items fall into S_new, but some don't fall */
+			/* last parameter is del_par */
+			internal_move_pointers_items(&dest_bi, &src_bi,
+						     LAST_TO_FIRST,
+						     n - child_pos + 1, 1);
+			/* calculate number of new items that fall into S_new */
+			k = snum - n + child_pos - 1;
+
+			internal_insert_childs(&dest_bi, /*S_new, */ 0, k,
+					       insert_key + 1, insert_ptr + 1);
+
+			/* new_insert_key = insert_key[insert_num - k - 1] */
+			memcpy(&new_insert_key, insert_key + insert_num - k - 1,
+			       KEY_SIZE);
+			/*
+			 * replace first node-ptr in S_new by node-ptr
+			 * to insert_ptr[insert_num-k-1]
+			 */
+
+			dc = B_N_CHILD(S_new, 0);
+			put_dc_size(dc,
+				    (MAX_CHILD_SIZE
+				     (insert_ptr[insert_num - k - 1]) -
+				     B_FREE_SPACE(insert_ptr
+						  [insert_num - k - 1])));
+			put_dc_block_number(dc,
+					    insert_ptr[insert_num - k -
+						       1]->b_blocknr);
+
+			do_balance_mark_internal_dirty(tb, S_new, 0);
+
+			insert_num -= (k + 1);
+		}
+		/* new_insert_ptr = node_pointer to S_new */
+		new_insert_ptr = S_new;
+
+		RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new)
+		       || buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
+		       S_new);
+
+		/* S_new is released in unfix_nodes */
+	}
+
+	n = B_NR_ITEMS(tbSh);	/*number of items in S[h] */
+
+	if (0 <= child_pos && child_pos <= n && insert_num > 0) {
+		bi.tb = tb;
+		bi.bi_bh = tbSh;
+		bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		internal_insert_childs(&bi,	/*tbSh, */
+				       /*          ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next :  tb->S[h]->b_child->b_next, */
+				       child_pos, insert_num, insert_key,
+				       insert_ptr);
+	}
+
+	memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
+	insert_ptr[0] = new_insert_ptr;
+
+	return order;
+}
diff --git a/kernel/fs/reiserfs/inode.c b/kernel/fs/reiserfs/inode.c
new file mode 100644
index 000000000..f6f2fbad9
--- /dev/null
+++ b/kernel/fs/reiserfs/inode.c
@@ -0,0 +1,3461 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include "reiserfs.h"
+#include "acl.h"
+#include "xattr.h"
+#include <linux/exportfs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/quotaops.h>
+#include <linux/swap.h>
+#include <linux/uio.h>
+
+int reiserfs_commit_write(struct file *f, struct page *page,
+			  unsigned from, unsigned to);
+
+void reiserfs_evict_inode(struct inode *inode)
+{
+	/*
+	 * We need blocks for transaction + (user+group) quota
+	 * update (possibly delete)
+	 */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 2 +
+	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
+	struct reiserfs_transaction_handle th;
+	int err;
+
+	if (!inode->i_nlink && !is_bad_inode(inode))
+		dquot_initialize(inode);
+
+	truncate_inode_pages_final(&inode->i_data);
+	if (inode->i_nlink)
+		goto no_delete;
+
+	/*
+	 * The = 0 happens when we abort creating a new inode
+	 * for some reason like lack of space..
+	 * also handles bad_inode case
+	 */
+	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
+
+		reiserfs_delete_xattrs(inode);
+
+		reiserfs_write_lock(inode->i_sb);
+
+		if (journal_begin(&th, inode->i_sb, jbegin_count))
+			goto out;
+		reiserfs_update_inode_transaction(inode);
+
+		reiserfs_discard_prealloc(&th, inode);
+
+		err = reiserfs_delete_object(&th, inode);
+
+		/*
+		 * Do quota update inside a transaction for journaled quotas.
+		 * We must do that after delete_object so that quota updates
+		 * go into the same transaction as stat data deletion
+		 */
+		if (!err) {
+			int depth = reiserfs_write_unlock_nested(inode->i_sb);
+			dquot_free_inode(inode);
+			reiserfs_write_lock_nested(inode->i_sb, depth);
+		}
+
+		if (journal_end(&th))
+			goto out;
+
+		/*
+		 * check return value from reiserfs_delete_object after
+		 * ending the transaction
+		 */
+		if (err)
+		    goto out;
+
+		/*
+		 * all items of file are deleted, so we can remove
+		 * "save" link
+		 * we can't do anything about an error here
+		 */
+		remove_save_link(inode, 0 /* not truncate */);
+out:
+		reiserfs_write_unlock(inode->i_sb);
+	} else {
+		/* no object items are in the tree */
+		;
+	}
+
+	/* note this must go after the journal_end to prevent deadlock */
+	clear_inode(inode);
+
+	dquot_drop(inode);
+	inode->i_blocks = 0;
+	return;
+
+no_delete:
+	clear_inode(inode);
+	dquot_drop(inode);
+}
+
+static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
+			  __u32 objectid, loff_t offset, int type, int length)
+{
+	key->version = version;
+
+	key->on_disk_key.k_dir_id = dirid;
+	key->on_disk_key.k_objectid = objectid;
+	set_cpu_key_k_offset(key, offset);
+	set_cpu_key_k_type(key, type);
+	key->key_length = length;
+}
+
+/*
+ * take base of inode_key (it comes from inode always) (dirid, objectid)
+ * and version from an inode, set offset and type of key
+ */
+void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
+		  int type, int length)
+{
+	_make_cpu_key(key, get_inode_item_key_version(inode),
+		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
+		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
+		      length);
+}
+
+/* when key is 0, do not set version and short key */
+inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
+			      int version,
+			      loff_t offset, int type, int length,
+			      int entry_count /*or ih_free_space */ )
+{
+	if (key) {
+		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
+		ih->ih_key.k_objectid =
+		    cpu_to_le32(key->on_disk_key.k_objectid);
+	}
+	put_ih_version(ih, version);
+	set_le_ih_k_offset(ih, offset);
+	set_le_ih_k_type(ih, type);
+	put_ih_item_len(ih, length);
+	/*    set_ih_free_space (ih, 0); */
+	/*
+	 * for directory items it is entry count, for directs and stat
+	 * datas - 0xffff, for indirects - 0
+	 */
+	put_ih_entry_count(ih, entry_count);
+}
+
+/*
+ * FIXME: we might cache recently accessed indirect item
+ * Ugh.  Not too eager for that....
+ * I cut the code until such time as I see a convincing argument (benchmark).
+ * I don't want a bloated inode struct..., and I don't like code complexity....
+ */
+
+/*
+ * cutting the code is fine, since it really isn't in use yet and is easy
+ * to add back in.  But, Vladimir has a really good idea here.  Think
+ * about what happens for reading a file.  For each page,
+ * The VFS layer calls reiserfs_readpage, who searches the tree to find
+ * an indirect item.  This indirect item has X number of pointers, where
+ * X is a big number if we've done the block allocation right.  But,
+ * we only use one or two of these pointers during each call to readpage,
+ * needlessly researching again later on.
+ *
+ * The size of the cache could be dynamic based on the size of the file.
+ *
+ * I'd also like to see us cache the location the stat data item, since
+ * we are needlessly researching for that frequently.
+ *
+ * --chris
+ */
+
+/*
+ * If this page has a file tail in it, and
+ * it was read in by get_block_create_0, the page data is valid,
+ * but tail is still sitting in a direct item, and we can't write to
+ * it.  So, look through this page, and check all the mapped buffers
+ * to make sure they have valid block numbers.  Any that don't need
+ * to be unmapped, so that __block_write_begin will correctly call
+ * reiserfs_get_block to convert the tail into an unformatted node
+ */
+static inline void fix_tail_page_for_writing(struct page *page)
+{
+	struct buffer_head *head, *next, *bh;
+
+	if (page && page_has_buffers(page)) {
+		head = page_buffers(page);
+		bh = head;
+		do {
+			next = bh->b_this_page;
+			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
+				reiserfs_unmap_buffer(bh);
+			}
+			bh = next;
+		} while (bh != head);
+	}
+}
+
+/*
+ * reiserfs_get_block does not need to allocate a block only if it has been
+ * done already or non-hole position has been found in the indirect item
+ */
+static inline int allocation_needed(int retval, b_blocknr_t allocated,
+				    struct item_head *ih,
+				    __le32 * item, int pos_in_item)
+{
+	if (allocated)
+		return 0;
+	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
+	    get_block_num(item, pos_in_item))
+		return 0;
+	return 1;
+}
+
+static inline int indirect_item_found(int retval, struct item_head *ih)
+{
+	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
+}
+
+static inline void set_block_dev_mapped(struct buffer_head *bh,
+					b_blocknr_t block, struct inode *inode)
+{
+	map_bh(bh, inode->i_sb, block);
+}
+
+/*
+ * files which were created in the earlier version can not be longer,
+ * than 2 gb
+ */
+static int file_capable(struct inode *inode, sector_t block)
+{
+	/* it is new file. */
+	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
+	    /* old file, but 'block' is inside of 2gb */
+	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
+		return 1;
+
+	return 0;
+}
+
+static int restart_transaction(struct reiserfs_transaction_handle *th,
+			       struct inode *inode, struct treepath *path)
+{
+	struct super_block *s = th->t_super;
+	int err;
+
+	BUG_ON(!th->t_trans_id);
+	BUG_ON(!th->t_refcount);
+
+	pathrelse(path);
+
+	/* we cannot restart while nested */
+	if (th->t_refcount > 1) {
+		return 0;
+	}
+	reiserfs_update_sd(th, inode);
+	err = journal_end(th);
+	if (!err) {
+		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
+		if (!err)
+			reiserfs_update_inode_transaction(inode);
+	}
+	return err;
+}
+
+/*
+ * it is called by get_block when create == 0. Returns block number
+ * for 'block'-th logical block of file. When it hits direct item it
+ * returns 0 (being called from bmap) or read direct item into piece
+ * of page (bh_result)
+ * Please improve the english/clarity in the comment above, as it is
+ * hard to understand.
+ */
+static int _get_block_create_0(struct inode *inode, sector_t block,
+			       struct buffer_head *bh_result, int args)
+{
+	INITIALIZE_PATH(path);
+	struct cpu_key key;
+	struct buffer_head *bh;
+	struct item_head *ih, tmp_ih;
+	b_blocknr_t blocknr;
+	char *p = NULL;
+	int chars;
+	int ret;
+	int result;
+	int done = 0;
+	unsigned long offset;
+
+	/* prepare the key to look for the 'block'-th block of file */
+	make_cpu_key(&key, inode,
+		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
+		     3);
+
+	result = search_for_position_by_key(inode->i_sb, &key, &path);
+	if (result != POSITION_FOUND) {
+		pathrelse(&path);
+		if (p)
+			kunmap(bh_result->b_page);
+		if (result == IO_ERROR)
+			return -EIO;
+		/*
+		 * We do not return -ENOENT if there is a hole but page is
+		 * uptodate, because it means that there is some MMAPED data
+		 * associated with it that is yet to be written to disk.
+		 */
+		if ((args & GET_BLOCK_NO_HOLE)
+		    && !PageUptodate(bh_result->b_page)) {
+			return -ENOENT;
+		}
+		return 0;
+	}
+
+	bh = get_last_bh(&path);
+	ih = tp_item_head(&path);
+	if (is_indirect_le_ih(ih)) {
+		__le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
+
+		/*
+		 * FIXME: here we could cache indirect item or part of it in
+		 * the inode to avoid search_by_key in case of subsequent
+		 * access to file
+		 */
+		blocknr = get_block_num(ind_item, path.pos_in_item);
+		ret = 0;
+		if (blocknr) {
+			map_bh(bh_result, inode->i_sb, blocknr);
+			if (path.pos_in_item ==
+			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
+				set_buffer_boundary(bh_result);
+			}
+		} else
+			/*
+			 * We do not return -ENOENT if there is a hole but
+			 * page is uptodate, because it means that there is
+			 * some MMAPED data associated with it that is
+			 * yet to be written to disk.
+			 */
+		if ((args & GET_BLOCK_NO_HOLE)
+			    && !PageUptodate(bh_result->b_page)) {
+			ret = -ENOENT;
+		}
+
+		pathrelse(&path);
+		if (p)
+			kunmap(bh_result->b_page);
+		return ret;
+	}
+	/* requested data are in direct item(s) */
+	if (!(args & GET_BLOCK_READ_DIRECT)) {
+		/*
+		 * we are called by bmap. FIXME: we can not map block of file
+		 * when it is stored in direct item(s)
+		 */
+		pathrelse(&path);
+		if (p)
+			kunmap(bh_result->b_page);
+		return -ENOENT;
+	}
+
+	/*
+	 * if we've got a direct item, and the buffer or page was uptodate,
+	 * we don't want to pull data off disk again.  skip to the
+	 * end, where we map the buffer and return
+	 */
+	if (buffer_uptodate(bh_result)) {
+		goto finished;
+	} else
+		/*
+		 * grab_tail_page can trigger calls to reiserfs_get_block on
+		 * up to date pages without any buffers.  If the page is up
+		 * to date, we don't want read old data off disk.  Set the up
+		 * to date bit on the buffer instead and jump to the end
+		 */
+	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
+		set_buffer_uptodate(bh_result);
+		goto finished;
+	}
+	/* read file tail into part of page */
+	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
+	copy_item_head(&tmp_ih, ih);
+
+	/*
+	 * we only want to kmap if we are reading the tail into the page.
+	 * this is not the common case, so we don't kmap until we are
+	 * sure we need to.  But, this means the item might move if
+	 * kmap schedules
+	 */
+	if (!p)
+		p = (char *)kmap(bh_result->b_page);
+
+	p += offset;
+	memset(p, 0, inode->i_sb->s_blocksize);
+	do {
+		if (!is_direct_le_ih(ih)) {
+			BUG();
+		}
+		/*
+		 * make sure we don't read more bytes than actually exist in
+		 * the file.  This can happen in odd cases where i_size isn't
+		 * correct, and when direct item padding results in a few
+		 * extra bytes at the end of the direct item
+		 */
+		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
+			break;
+		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
+			chars =
+			    inode->i_size - (le_ih_k_offset(ih) - 1) -
+			    path.pos_in_item;
+			done = 1;
+		} else {
+			chars = ih_item_len(ih) - path.pos_in_item;
+		}
+		memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
+
+		if (done)
+			break;
+
+		p += chars;
+
+		/*
+		 * we done, if read direct item is not the last item of
+		 * node FIXME: we could try to check right delimiting key
+		 * to see whether direct item continues in the right
+		 * neighbor or rely on i_size
+		 */
+		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
+			break;
+
+		/* update key to look for the next piece */
+		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
+		result = search_for_position_by_key(inode->i_sb, &key, &path);
+		if (result != POSITION_FOUND)
+			/* i/o error most likely */
+			break;
+		bh = get_last_bh(&path);
+		ih = tp_item_head(&path);
+	} while (1);
+
+	flush_dcache_page(bh_result->b_page);
+	kunmap(bh_result->b_page);
+
+finished:
+	pathrelse(&path);
+
+	if (result == IO_ERROR)
+		return -EIO;
+
+	/*
+	 * this buffer has valid data, but isn't valid for io.  mapping it to
+	 * block #0 tells the rest of reiserfs it just has a tail in it
+	 */
+	map_bh(bh_result, inode->i_sb, 0);
+	set_buffer_uptodate(bh_result);
+	return 0;
+}
+
+/*
+ * this is called to create file map. So, _get_block_create_0 will not
+ * read direct item
+ */
+static int reiserfs_bmap(struct inode *inode, sector_t block,
+			 struct buffer_head *bh_result, int create)
+{
+	if (!file_capable(inode, block))
+		return -EFBIG;
+
+	reiserfs_write_lock(inode->i_sb);
+	/* do not read the direct item */
+	_get_block_create_0(inode, block, bh_result, 0);
+	reiserfs_write_unlock(inode->i_sb);
+	return 0;
+}
+
+/*
+ * special version of get_block that is only used by grab_tail_page right
+ * now.  It is sent to __block_write_begin, and when you try to get a
+ * block past the end of the file (or a block from a hole) it returns
+ * -ENOENT instead of a valid buffer.  __block_write_begin expects to
+ * be able to do i/o on the buffers returned, unless an error value
+ * is also returned.
+ *
+ * So, this allows __block_write_begin to be used for reading a single block
+ * in a page.  Where it does not produce a valid page for holes, or past the
+ * end of the file.  This turns out to be exactly what we need for reading
+ * tails for conversion.
+ *
+ * The point of the wrapper is forcing a certain value for create, even
+ * though the VFS layer is calling this function with create==1.  If you
+ * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
+ * don't use this function.
+*/
+static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
+				       struct buffer_head *bh_result,
+				       int create)
+{
+	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
+}
+
+/*
+ * This is special helper for reiserfs_get_block in case we are executing
+ * direct_IO request.
+ */
+static int reiserfs_get_blocks_direct_io(struct inode *inode,
+					 sector_t iblock,
+					 struct buffer_head *bh_result,
+					 int create)
+{
+	int ret;
+
+	bh_result->b_page = NULL;
+
+	/*
+	 * We set the b_size before reiserfs_get_block call since it is
+	 * referenced in convert_tail_for_hole() that may be called from
+	 * reiserfs_get_block()
+	 */
+	bh_result->b_size = (1 << inode->i_blkbits);
+
+	ret = reiserfs_get_block(inode, iblock, bh_result,
+				 create | GET_BLOCK_NO_DANGLE);
+	if (ret)
+		goto out;
+
+	/* don't allow direct io onto tail pages */
+	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
+		/*
+		 * make sure future calls to the direct io funcs for this
+		 * offset in the file fail by unmapping the buffer
+		 */
+		clear_buffer_mapped(bh_result);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Possible unpacked tail. Flush the data before pages have
+	 * disappeared
+	 */
+	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
+		int err;
+
+		reiserfs_write_lock(inode->i_sb);
+
+		err = reiserfs_commit_for_inode(inode);
+		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+
+		reiserfs_write_unlock(inode->i_sb);
+
+		if (err < 0)
+			ret = err;
+	}
+out:
+	return ret;
+}
+
+/*
+ * helper function for when reiserfs_get_block is called for a hole
+ * but the file tail is still in a direct item
+ * bh_result is the buffer head for the hole
+ * tail_offset is the offset of the start of the tail in the file
+ *
+ * This calls prepare_write, which will start a new transaction
+ * you should not be in a transaction, or have any paths held when you
+ * call this.
+ */
+static int convert_tail_for_hole(struct inode *inode,
+				 struct buffer_head *bh_result,
+				 loff_t tail_offset)
+{
+	unsigned long index;
+	unsigned long tail_end;
+	unsigned long tail_start;
+	struct page *tail_page;
+	struct page *hole_page = bh_result->b_page;
+	int retval = 0;
+
+	if ((tail_offset & (bh_result->b_size - 1)) != 1)
+		return -EIO;
+
+	/* always try to read until the end of the block */
+	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
+	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
+
+	index = tail_offset >> PAGE_CACHE_SHIFT;
+	/*
+	 * hole_page can be zero in case of direct_io, we are sure
+	 * that we cannot get here if we write with O_DIRECT into tail page
+	 */
+	if (!hole_page || index != hole_page->index) {
+		tail_page = grab_cache_page(inode->i_mapping, index);
+		retval = -ENOMEM;
+		if (!tail_page) {
+			goto out;
+		}
+	} else {
+		tail_page = hole_page;
+	}
+
+	/*
+	 * we don't have to make sure the conversion did not happen while
+	 * we were locking the page because anyone that could convert
+	 * must first take i_mutex.
+	 *
+	 * We must fix the tail page for writing because it might have buffers
+	 * that are mapped, but have a block number of 0.  This indicates tail
+	 * data that has been read directly into the page, and
+	 * __block_write_begin won't trigger a get_block in this case.
+	 */
+	fix_tail_page_for_writing(tail_page);
+	retval = __reiserfs_write_begin(tail_page, tail_start,
+				      tail_end - tail_start);
+	if (retval)
+		goto unlock;
+
+	/* tail conversion might change the data in the page */
+	flush_dcache_page(tail_page);
+
+	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
+
+unlock:
+	if (tail_page != hole_page) {
+		unlock_page(tail_page);
+		page_cache_release(tail_page);
+	}
+out:
+	return retval;
+}
+
+static inline int _allocate_block(struct reiserfs_transaction_handle *th,
+				  sector_t block,
+				  struct inode *inode,
+				  b_blocknr_t * allocated_block_nr,
+				  struct treepath *path, int flags)
+{
+	BUG_ON(!th->t_trans_id);
+
+#ifdef REISERFS_PREALLOCATE
+	if (!(flags & GET_BLOCK_NO_IMUX)) {
+		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
+						  path, block);
+	}
+#endif
+	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
+					 block);
+}
+
+int reiserfs_get_block(struct inode *inode, sector_t block,
+		       struct buffer_head *bh_result, int create)
+{
+	int repeat, retval = 0;
+	/* b_blocknr_t is (unsigned) 32 bit int*/
+	b_blocknr_t allocated_block_nr = 0;
+	INITIALIZE_PATH(path);
+	int pos_in_item;
+	struct cpu_key key;
+	struct buffer_head *bh, *unbh = NULL;
+	struct item_head *ih, tmp_ih;
+	__le32 *item;
+	int done;
+	int fs_gen;
+	struct reiserfs_transaction_handle *th = NULL;
+	/*
+	 * space reserved in transaction batch:
+	 * . 3 balancings in direct->indirect conversion
+	 * . 1 block involved into reiserfs_update_sd()
+	 * XXX in practically impossible worst case direct2indirect()
+	 * can incur (much) more than 3 balancings.
+	 * quota update for user, group
+	 */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
+	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
+	int version;
+	int dangle = 1;
+	loff_t new_offset =
+	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
+
+	reiserfs_write_lock(inode->i_sb);
+	version = get_inode_item_key_version(inode);
+
+	if (!file_capable(inode, block)) {
+		reiserfs_write_unlock(inode->i_sb);
+		return -EFBIG;
+	}
+
+	/*
+	 * if !create, we aren't changing the FS, so we don't need to
+	 * log anything, so we don't need to start a transaction
+	 */
+	if (!(create & GET_BLOCK_CREATE)) {
+		int ret;
+		/* find number of block-th logical block of the file */
+		ret = _get_block_create_0(inode, block, bh_result,
+					  create | GET_BLOCK_READ_DIRECT);
+		reiserfs_write_unlock(inode->i_sb);
+		return ret;
+	}
+
+	/*
+	 * if we're already in a transaction, make sure to close
+	 * any new transactions we start in this func
+	 */
+	if ((create & GET_BLOCK_NO_DANGLE) ||
+	    reiserfs_transaction_running(inode->i_sb))
+		dangle = 0;
+
+	/*
+	 * If file is of such a size, that it might have a tail and
+	 * tails are enabled  we should mark it as possibly needing
+	 * tail packing on close
+	 */
+	if ((have_large_tails(inode->i_sb)
+	     && inode->i_size < i_block_size(inode) * 4)
+	    || (have_small_tails(inode->i_sb)
+		&& inode->i_size < i_block_size(inode)))
+		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
+
+	/* set the key of the first byte in the 'block'-th block of file */
+	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
+	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
+start_trans:
+		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
+		if (!th) {
+			retval = -ENOMEM;
+			goto failure;
+		}
+		reiserfs_update_inode_transaction(inode);
+	}
+research:
+
+	retval = search_for_position_by_key(inode->i_sb, &key, &path);
+	if (retval == IO_ERROR) {
+		retval = -EIO;
+		goto failure;
+	}
+
+	bh = get_last_bh(&path);
+	ih = tp_item_head(&path);
+	item = tp_item_body(&path);
+	pos_in_item = path.pos_in_item;
+
+	fs_gen = get_generation(inode->i_sb);
+	copy_item_head(&tmp_ih, ih);
+
+	if (allocation_needed
+	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
+		/* we have to allocate block for the unformatted node */
+		if (!th) {
+			pathrelse(&path);
+			goto start_trans;
+		}
+
+		repeat =
+		    _allocate_block(th, block, inode, &allocated_block_nr,
+				    &path, create);
+
+		/*
+		 * restart the transaction to give the journal a chance to free
+		 * some blocks.  releases the path, so we have to go back to
+		 * research if we succeed on the second try
+		 */
+		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
+			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
+			retval = restart_transaction(th, inode, &path);
+			if (retval)
+				goto failure;
+			repeat =
+			    _allocate_block(th, block, inode,
+					    &allocated_block_nr, NULL, create);
+
+			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
+				goto research;
+			}
+			if (repeat == QUOTA_EXCEEDED)
+				retval = -EDQUOT;
+			else
+				retval = -ENOSPC;
+			goto failure;
+		}
+
+		if (fs_changed(fs_gen, inode->i_sb)
+		    && item_moved(&tmp_ih, &path)) {
+			goto research;
+		}
+	}
+
+	if (indirect_item_found(retval, ih)) {
+		b_blocknr_t unfm_ptr;
+		/*
+		 * 'block'-th block is in the file already (there is
+		 * corresponding cell in some indirect item). But it may be
+		 * zero unformatted node pointer (hole)
+		 */
+		unfm_ptr = get_block_num(item, pos_in_item);
+		if (unfm_ptr == 0) {
+			/* use allocated block to plug the hole */
+			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
+			if (fs_changed(fs_gen, inode->i_sb)
+			    && item_moved(&tmp_ih, &path)) {
+				reiserfs_restore_prepared_buffer(inode->i_sb,
+								 bh);
+				goto research;
+			}
+			set_buffer_new(bh_result);
+			if (buffer_dirty(bh_result)
+			    && reiserfs_data_ordered(inode->i_sb))
+				reiserfs_add_ordered_list(inode, bh_result);
+			put_block_num(item, pos_in_item, allocated_block_nr);
+			unfm_ptr = allocated_block_nr;
+			journal_mark_dirty(th, bh);
+			reiserfs_update_sd(th, inode);
+		}
+		set_block_dev_mapped(bh_result, unfm_ptr, inode);
+		pathrelse(&path);
+		retval = 0;
+		if (!dangle && th)
+			retval = reiserfs_end_persistent_transaction(th);
+
+		reiserfs_write_unlock(inode->i_sb);
+
+		/*
+		 * the item was found, so new blocks were not added to the file
+		 * there is no need to make sure the inode is updated with this
+		 * transaction
+		 */
+		return retval;
+	}
+
+	if (!th) {
+		pathrelse(&path);
+		goto start_trans;
+	}
+
+	/*
+	 * desired position is not found or is in the direct item. We have
+	 * to append file with holes up to 'block'-th block converting
+	 * direct items to indirect one if necessary
+	 */
+	done = 0;
+	do {
+		if (is_statdata_le_ih(ih)) {
+			__le32 unp = 0;
+			struct cpu_key tmp_key;
+
+			/* indirect item has to be inserted */
+			make_le_item_head(&tmp_ih, &key, version, 1,
+					  TYPE_INDIRECT, UNFM_P_SIZE,
+					  0 /* free_space */ );
+
+			/*
+			 * we are going to add 'block'-th block to the file.
+			 * Use allocated block for that
+			 */
+			if (cpu_key_k_offset(&key) == 1) {
+				unp = cpu_to_le32(allocated_block_nr);
+				set_block_dev_mapped(bh_result,
+						     allocated_block_nr, inode);
+				set_buffer_new(bh_result);
+				done = 1;
+			}
+			tmp_key = key;	/* ;) */
+			set_cpu_key_k_offset(&tmp_key, 1);
+			PATH_LAST_POSITION(&path)++;
+
+			retval =
+			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
+						 inode, (char *)&unp);
+			if (retval) {
+				reiserfs_free_block(th, inode,
+						    allocated_block_nr, 1);
+				/*
+				 * retval == -ENOSPC, -EDQUOT or -EIO
+				 * or -EEXIST
+				 */
+				goto failure;
+			}
+		} else if (is_direct_le_ih(ih)) {
+			/* direct item has to be converted */
+			loff_t tail_offset;
+
+			tail_offset =
+			    ((le_ih_k_offset(ih) -
+			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
+
+			/*
+			 * direct item we just found fits into block we have
+			 * to map. Convert it into unformatted node: use
+			 * bh_result for the conversion
+			 */
+			if (tail_offset == cpu_key_k_offset(&key)) {
+				set_block_dev_mapped(bh_result,
+						     allocated_block_nr, inode);
+				unbh = bh_result;
+				done = 1;
+			} else {
+				/*
+				 * we have to pad file tail stored in direct
+				 * item(s) up to block size and convert it
+				 * to unformatted node. FIXME: this should
+				 * also get into page cache
+				 */
+
+				pathrelse(&path);
+				/*
+				 * ugly, but we can only end the transaction if
+				 * we aren't nested
+				 */
+				BUG_ON(!th->t_refcount);
+				if (th->t_refcount == 1) {
+					retval =
+					    reiserfs_end_persistent_transaction
+					    (th);
+					th = NULL;
+					if (retval)
+						goto failure;
+				}
+
+				retval =
+				    convert_tail_for_hole(inode, bh_result,
+							  tail_offset);
+				if (retval) {
+					if (retval != -ENOSPC)
+						reiserfs_error(inode->i_sb,
+							"clm-6004",
+							"convert tail failed "
+							"inode %lu, error %d",
+							inode->i_ino,
+							retval);
+					if (allocated_block_nr) {
+						/*
+						 * the bitmap, the super,
+						 * and the stat data == 3
+						 */
+						if (!th)
+							th = reiserfs_persistent_transaction(inode->i_sb, 3);
+						if (th)
+							reiserfs_free_block(th,
+									    inode,
+									    allocated_block_nr,
+									    1);
+					}
+					goto failure;
+				}
+				goto research;
+			}
+			retval =
+			    direct2indirect(th, inode, &path, unbh,
+					    tail_offset);
+			if (retval) {
+				reiserfs_unmap_buffer(unbh);
+				reiserfs_free_block(th, inode,
+						    allocated_block_nr, 1);
+				goto failure;
+			}
+			/*
+			 * it is important the set_buffer_uptodate is done
+			 * after the direct2indirect.  The buffer might
+			 * contain valid data newer than the data on disk
+			 * (read by readpage, changed, and then sent here by
+			 * writepage).  direct2indirect needs to know if unbh
+			 * was already up to date, so it can decide if the
+			 * data in unbh needs to be replaced with data from
+			 * the disk
+			 */
+			set_buffer_uptodate(unbh);
+
+			/*
+			 * unbh->b_page == NULL in case of DIRECT_IO request,
+			 * this means buffer will disappear shortly, so it
+			 * should not be added to
+			 */
+			if (unbh->b_page) {
+				/*
+				 * we've converted the tail, so we must
+				 * flush unbh before the transaction commits
+				 */
+				reiserfs_add_tail_list(inode, unbh);
+
+				/*
+				 * mark it dirty now to prevent commit_write
+				 * from adding this buffer to the inode's
+				 * dirty buffer list
+				 */
+				/*
+				 * AKPM: changed __mark_buffer_dirty to
+				 * mark_buffer_dirty().  It's still atomic,
+				 * but it sets the page dirty too, which makes
+				 * it eligible for writeback at any time by the
+				 * VM (which was also the case with
+				 * __mark_buffer_dirty())
+				 */
+				mark_buffer_dirty(unbh);
+			}
+		} else {
+			/*
+			 * append indirect item with holes if needed, when
+			 * appending pointer to 'block'-th block use block,
+			 * which is already allocated
+			 */
+			struct cpu_key tmp_key;
+			/*
+			 * We use this in case we need to allocate
+			 * only one block which is a fastpath
+			 */
+			unp_t unf_single = 0;
+			unp_t *un;
+			__u64 max_to_insert =
+			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
+			    UNFM_P_SIZE;
+			__u64 blocks_needed;
+
+			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
+			       "vs-804: invalid position for append");
+			/*
+			 * indirect item has to be appended,
+			 * set up key of that position
+			 * (key type is unimportant)
+			 */
+			make_cpu_key(&tmp_key, inode,
+				     le_key_k_offset(version,
+						     &ih->ih_key) +
+				     op_bytes_number(ih,
+						     inode->i_sb->s_blocksize),
+				     TYPE_INDIRECT, 3);
+
+			RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
+			       "green-805: invalid offset");
+			blocks_needed =
+			    1 +
+			    ((cpu_key_k_offset(&key) -
+			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
+			     s_blocksize_bits);
+
+			if (blocks_needed == 1) {
+				un = &unf_single;
+			} else {
+				un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
+				if (!un) {
+					un = &unf_single;
+					blocks_needed = 1;
+					max_to_insert = 0;
+				}
+			}
+			if (blocks_needed <= max_to_insert) {
+				/*
+				 * we are going to add target block to
+				 * the file. Use allocated block for that
+				 */
+				un[blocks_needed - 1] =
+				    cpu_to_le32(allocated_block_nr);
+				set_block_dev_mapped(bh_result,
+						     allocated_block_nr, inode);
+				set_buffer_new(bh_result);
+				done = 1;
+			} else {
+				/* paste hole to the indirect item */
+				/*
+				 * If kmalloc failed, max_to_insert becomes
+				 * zero and it means we only have space for
+				 * one block
+				 */
+				blocks_needed =
+				    max_to_insert ? max_to_insert : 1;
+			}
+			retval =
+			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
+						     (char *)un,
+						     UNFM_P_SIZE *
+						     blocks_needed);
+
+			if (blocks_needed != 1)
+				kfree(un);
+
+			if (retval) {
+				reiserfs_free_block(th, inode,
+						    allocated_block_nr, 1);
+				goto failure;
+			}
+			if (!done) {
+				/*
+				 * We need to mark new file size in case
+				 * this function will be interrupted/aborted
+				 * later on. And we may do this only for
+				 * holes.
+				 */
+				inode->i_size +=
+				    inode->i_sb->s_blocksize * blocks_needed;
+			}
+		}
+
+		if (done == 1)
+			break;
+
+		/*
+		 * this loop could log more blocks than we had originally
+		 * asked for.  So, we have to allow the transaction to end
+		 * if it is too big or too full.  Update the inode so things
+		 * are consistent if we crash before the function returns
+		 * release the path so that anybody waiting on the path before
+		 * ending their transaction will be able to continue.
+		 */
+		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+			retval = restart_transaction(th, inode, &path);
+			if (retval)
+				goto failure;
+		}
+		/*
+		 * inserting indirect pointers for a hole can take a
+		 * long time.  reschedule if needed and also release the write
+		 * lock for others.
+		 */
+		reiserfs_cond_resched(inode->i_sb);
+
+		retval = search_for_position_by_key(inode->i_sb, &key, &path);
+		if (retval == IO_ERROR) {
+			retval = -EIO;
+			goto failure;
+		}
+		if (retval == POSITION_FOUND) {
+			reiserfs_warning(inode->i_sb, "vs-825",
+					 "%K should not be found", &key);
+			retval = -EEXIST;
+			if (allocated_block_nr)
+				reiserfs_free_block(th, inode,
+						    allocated_block_nr, 1);
+			pathrelse(&path);
+			goto failure;
+		}
+		bh = get_last_bh(&path);
+		ih = tp_item_head(&path);
+		item = tp_item_body(&path);
+		pos_in_item = path.pos_in_item;
+	} while (1);
+
+	retval = 0;
+
+failure:
+	if (th && (!dangle || (retval && !th->t_trans_id))) {
+		int err;
+		if (th->t_trans_id)
+			reiserfs_update_sd(th, inode);
+		err = reiserfs_end_persistent_transaction(th);
+		if (err)
+			retval = err;
+	}
+
+	reiserfs_write_unlock(inode->i_sb);
+	reiserfs_check_path(&path);
+	return retval;
+}
+
+static int
+reiserfs_readpages(struct file *file, struct address_space *mapping,
+		   struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
+}
+
+/*
+ * Compute real number of used bytes by file
+ * Following three functions can go away when we'll have enough space in
+ * stat item
+ */
+static int real_space_diff(struct inode *inode, int sd_size)
+{
+	int bytes;
+	loff_t blocksize = inode->i_sb->s_blocksize;
+
+	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
+		return sd_size;
+
+	/*
+	 * End of file is also in full block with indirect reference, so round
+	 * up to the next block.
+	 *
+	 * there is just no way to know if the tail is actually packed
+	 * on the file, so we have to assume it isn't.  When we pack the
+	 * tail, we add 4 bytes to pretend there really is an unformatted
+	 * node pointer
+	 */
+	bytes =
+	    ((inode->i_size +
+	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
+	    sd_size;
+	return bytes;
+}
+
+static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
+					int sd_size)
+{
+	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+		return inode->i_size +
+		    (loff_t) (real_space_diff(inode, sd_size));
+	}
+	return ((loff_t) real_space_diff(inode, sd_size)) +
+	    (((loff_t) blocks) << 9);
+}
+
+/* Compute number of blocks used by file in ReiserFS counting */
+static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
+{
+	loff_t bytes = inode_get_bytes(inode);
+	loff_t real_space = real_space_diff(inode, sd_size);
+
+	/* keeps fsck and non-quota versions of reiserfs happy */
+	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+		bytes += (loff_t) 511;
+	}
+
+	/*
+	 * files from before the quota patch might i_blocks such that
+	 * bytes < real_space.  Deal with that here to prevent it from
+	 * going negative.
+	 */
+	if (bytes < real_space)
+		return 0;
+	return (bytes - real_space) >> 9;
+}
+
+/*
+ * BAD: new directories have stat data of new type and all other items
+ * of old type. Version stored in the inode says about body items, so
+ * in update_stat_data we can not rely on inode, but have to check
+ * item version directly
+ */
+
+/* called by read_locked_inode */
+static void init_inode(struct inode *inode, struct treepath *path)
+{
+	struct buffer_head *bh;
+	struct item_head *ih;
+	__u32 rdev;
+
+	bh = PATH_PLAST_BUFFER(path);
+	ih = tp_item_head(path);
+
+	copy_key(INODE_PKEY(inode), &ih->ih_key);
+
+	INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
+	REISERFS_I(inode)->i_flags = 0;
+	REISERFS_I(inode)->i_prealloc_block = 0;
+	REISERFS_I(inode)->i_prealloc_count = 0;
+	REISERFS_I(inode)->i_trans_id = 0;
+	REISERFS_I(inode)->i_jl = NULL;
+	reiserfs_init_xattr_rwsem(inode);
+
+	if (stat_data_v1(ih)) {
+		struct stat_data_v1 *sd =
+		    (struct stat_data_v1 *)ih_item_body(bh, ih);
+		unsigned long blocks;
+
+		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
+		set_inode_sd_version(inode, STAT_DATA_V1);
+		inode->i_mode = sd_v1_mode(sd);
+		set_nlink(inode, sd_v1_nlink(sd));
+		i_uid_write(inode, sd_v1_uid(sd));
+		i_gid_write(inode, sd_v1_gid(sd));
+		inode->i_size = sd_v1_size(sd);
+		inode->i_atime.tv_sec = sd_v1_atime(sd);
+		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
+		inode->i_ctime.tv_sec = sd_v1_ctime(sd);
+		inode->i_atime.tv_nsec = 0;
+		inode->i_ctime.tv_nsec = 0;
+		inode->i_mtime.tv_nsec = 0;
+
+		inode->i_blocks = sd_v1_blocks(sd);
+		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
+		blocks = (inode->i_size + 511) >> 9;
+		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
+
+		/*
+		 * there was a bug in <=3.5.23 when i_blocks could take
+		 * negative values. Starting from 3.5.17 this value could
+		 * even be stored in stat data. For such files we set
+		 * i_blocks based on file size. Just 2 notes: this can be
+		 * wrong for sparse files. On-disk value will be only
+		 * updated if file's inode will ever change
+		 */
+		if (inode->i_blocks > blocks) {
+			inode->i_blocks = blocks;
+		}
+
+		rdev = sd_v1_rdev(sd);
+		REISERFS_I(inode)->i_first_direct_byte =
+		    sd_v1_first_direct_byte(sd);
+
+		/*
+		 * an early bug in the quota code can give us an odd
+		 * number for the block count.  This is incorrect, fix it here.
+		 */
+		if (inode->i_blocks & 1) {
+			inode->i_blocks++;
+		}
+		inode_set_bytes(inode,
+				to_real_used_space(inode, inode->i_blocks,
+						   SD_V1_SIZE));
+		/*
+		 * nopack is initially zero for v1 objects. For v2 objects,
+		 * nopack is initialised from sd_attrs
+		 */
+		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
+	} else {
+		/*
+		 * new stat data found, but object may have old items
+		 * (directories and symlinks)
+		 */
+		struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
+
+		inode->i_mode = sd_v2_mode(sd);
+		set_nlink(inode, sd_v2_nlink(sd));
+		i_uid_write(inode, sd_v2_uid(sd));
+		inode->i_size = sd_v2_size(sd);
+		i_gid_write(inode, sd_v2_gid(sd));
+		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
+		inode->i_atime.tv_sec = sd_v2_atime(sd);
+		inode->i_ctime.tv_sec = sd_v2_ctime(sd);
+		inode->i_ctime.tv_nsec = 0;
+		inode->i_mtime.tv_nsec = 0;
+		inode->i_atime.tv_nsec = 0;
+		inode->i_blocks = sd_v2_blocks(sd);
+		rdev = sd_v2_rdev(sd);
+		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+			inode->i_generation =
+			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
+		else
+			inode->i_generation = sd_v2_generation(sd);
+
+		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
+		else
+			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
+		REISERFS_I(inode)->i_first_direct_byte = 0;
+		set_inode_sd_version(inode, STAT_DATA_V2);
+		inode_set_bytes(inode,
+				to_real_used_space(inode, inode->i_blocks,
+						   SD_V2_SIZE));
+		/*
+		 * read persistent inode attributes from sd and initialise
+		 * generic inode flags from them
+		 */
+		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
+		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
+	}
+
+	pathrelse(path);
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &reiserfs_file_inode_operations;
+		inode->i_fop = &reiserfs_file_operations;
+		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &reiserfs_dir_inode_operations;
+		inode->i_fop = &reiserfs_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &reiserfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+	} else {
+		inode->i_blocks = 0;
+		inode->i_op = &reiserfs_special_inode_operations;
+		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+	}
+}
+
+/* update new stat data with inode fields */
+static void inode2sd(void *sd, struct inode *inode, loff_t size)
+{
+	struct stat_data *sd_v2 = (struct stat_data *)sd;
+	__u16 flags;
+
+	set_sd_v2_mode(sd_v2, inode->i_mode);
+	set_sd_v2_nlink(sd_v2, inode->i_nlink);
+	set_sd_v2_uid(sd_v2, i_uid_read(inode));
+	set_sd_v2_size(sd_v2, size);
+	set_sd_v2_gid(sd_v2, i_gid_read(inode));
+	set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
+	set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
+	set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
+	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
+	else
+		set_sd_v2_generation(sd_v2, inode->i_generation);
+	flags = REISERFS_I(inode)->i_attrs;
+	i_attrs_to_sd_attrs(inode, &flags);
+	set_sd_v2_attrs(sd_v2, flags);
+}
+
+/* used to copy inode's fields to old stat data */
+static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
+{
+	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
+
+	set_sd_v1_mode(sd_v1, inode->i_mode);
+	set_sd_v1_uid(sd_v1, i_uid_read(inode));
+	set_sd_v1_gid(sd_v1, i_gid_read(inode));
+	set_sd_v1_nlink(sd_v1, inode->i_nlink);
+	set_sd_v1_size(sd_v1, size);
+	set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
+	set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
+	set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
+	else
+		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
+
+	/* Sigh. i_first_direct_byte is back */
+	set_sd_v1_first_direct_byte(sd_v1,
+				    REISERFS_I(inode)->i_first_direct_byte);
+}
+
+/*
+ * NOTE, you must prepare the buffer head before sending it here,
+ * and then log it after the call
+ */
+static void update_stat_data(struct treepath *path, struct inode *inode,
+			     loff_t size)
+{
+	struct buffer_head *bh;
+	struct item_head *ih;
+
+	bh = PATH_PLAST_BUFFER(path);
+	ih = tp_item_head(path);
+
+	if (!is_statdata_le_ih(ih))
+		reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
+			       INODE_PKEY(inode), ih);
+
+	/* path points to old stat data */
+	if (stat_data_v1(ih)) {
+		inode2sd_v1(ih_item_body(bh, ih), inode, size);
+	} else {
+		inode2sd(ih_item_body(bh, ih), inode, size);
+	}
+
+	return;
+}
+
+void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
+			     struct inode *inode, loff_t size)
+{
+	struct cpu_key key;
+	INITIALIZE_PATH(path);
+	struct buffer_head *bh;
+	int fs_gen;
+	struct item_head *ih, tmp_ih;
+	int retval;
+
+	BUG_ON(!th->t_trans_id);
+
+	/* key type is unimportant */
+	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
+
+	for (;;) {
+		int pos;
+		/* look for the object's stat data */
+		retval = search_item(inode->i_sb, &key, &path);
+		if (retval == IO_ERROR) {
+			reiserfs_error(inode->i_sb, "vs-13050",
+				       "i/o failure occurred trying to "
+				       "update %K stat data", &key);
+			return;
+		}
+		if (retval == ITEM_NOT_FOUND) {
+			pos = PATH_LAST_POSITION(&path);
+			pathrelse(&path);
+			if (inode->i_nlink == 0) {
+				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
+				return;
+			}
+			reiserfs_warning(inode->i_sb, "vs-13060",
+					 "stat data of object %k (nlink == %d) "
+					 "not found (pos %d)",
+					 INODE_PKEY(inode), inode->i_nlink,
+					 pos);
+			reiserfs_check_path(&path);
+			return;
+		}
+
+		/*
+		 * sigh, prepare_for_journal might schedule.  When it
+		 * schedules the FS might change.  We have to detect that,
+		 * and loop back to the search if the stat data item has moved
+		 */
+		bh = get_last_bh(&path);
+		ih = tp_item_head(&path);
+		copy_item_head(&tmp_ih, ih);
+		fs_gen = get_generation(inode->i_sb);
+		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
+
+		/* Stat_data item has been moved after scheduling. */
+		if (fs_changed(fs_gen, inode->i_sb)
+		    && item_moved(&tmp_ih, &path)) {
+			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
+			continue;
+		}
+		break;
+	}
+	update_stat_data(&path, inode, size);
+	journal_mark_dirty(th, bh);
+	pathrelse(&path);
+	return;
+}
+
+/*
+ * reiserfs_read_locked_inode is called to read the inode off disk, and it
+ * does a make_bad_inode when things go wrong.  But, we need to make sure
+ * and clear the key in the private portion of the inode, otherwise a
+ * corresponding iput might try to delete whatever object the inode last
+ * represented.
+ */
+static void reiserfs_make_bad_inode(struct inode *inode)
+{
+	memset(INODE_PKEY(inode), 0, KEY_SIZE);
+	make_bad_inode(inode);
+}
+
+/*
+ * initially this function was derived from minix or ext2's analog and
+ * evolved as the prototype did
+ */
+int reiserfs_init_locked_inode(struct inode *inode, void *p)
+{
+	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
+	inode->i_ino = args->objectid;
+	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
+	return 0;
+}
+
+/*
+ * looks for stat data in the tree, and fills up the fields of in-core
+ * inode stat data fields
+ */
+void reiserfs_read_locked_inode(struct inode *inode,
+				struct reiserfs_iget_args *args)
+{
+	INITIALIZE_PATH(path_to_sd);
+	struct cpu_key key;
+	unsigned long dirino;
+	int retval;
+
+	dirino = args->dirid;
+
+	/*
+	 * set version 1, version 2 could be used too, because stat data
+	 * key is the same in both versions
+	 */
+	key.version = KEY_FORMAT_3_5;
+	key.on_disk_key.k_dir_id = dirino;
+	key.on_disk_key.k_objectid = inode->i_ino;
+	key.on_disk_key.k_offset = 0;
+	key.on_disk_key.k_type = 0;
+
+	/* look for the object's stat data */
+	retval = search_item(inode->i_sb, &key, &path_to_sd);
+	if (retval == IO_ERROR) {
+		reiserfs_error(inode->i_sb, "vs-13070",
+			       "i/o failure occurred trying to find "
+			       "stat data of %K", &key);
+		reiserfs_make_bad_inode(inode);
+		return;
+	}
+
+	/* a stale NFS handle can trigger this without it being an error */
+	if (retval != ITEM_FOUND) {
+		pathrelse(&path_to_sd);
+		reiserfs_make_bad_inode(inode);
+		clear_nlink(inode);
+		return;
+	}
+
+	init_inode(inode, &path_to_sd);
+
+	/*
+	 * It is possible that knfsd is trying to access inode of a file
+	 * that is being removed from the disk by some other thread. As we
+	 * update sd on unlink all that is required is to check for nlink
+	 * here. This bug was first found by Sizif when debugging
+	 * SquidNG/Butterfly, forgotten, and found again after Philippe
+	 * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
+
+	 * More logical fix would require changes in fs/inode.c:iput() to
+	 * remove inode from hash-table _after_ fs cleaned disk stuff up and
+	 * in iget() to return NULL if I_FREEING inode is found in
+	 * hash-table.
+	 */
+
+	/*
+	 * Currently there is one place where it's ok to meet inode with
+	 * nlink==0: processing of open-unlinked and half-truncated files
+	 * during mount (fs/reiserfs/super.c:finish_unfinished()).
+	 */
+	if ((inode->i_nlink == 0) &&
+	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
+		reiserfs_warning(inode->i_sb, "vs-13075",
+				 "dead inode read from disk %K. "
+				 "This is likely to be race with knfsd. Ignore",
+				 &key);
+		reiserfs_make_bad_inode(inode);
+	}
+
+	/* init inode should be relsing */
+	reiserfs_check_path(&path_to_sd);
+
+	/*
+	 * Stat data v1 doesn't support ACLs.
+	 */
+	if (get_inode_sd_version(inode) == STAT_DATA_V1)
+		cache_no_acl(inode);
+}
+
+/*
+ * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
+ *
+ * @inode:    inode from hash table to check
+ * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
+ *
+ * This function is called by iget5_locked() to distinguish reiserfs inodes
+ * having the same inode numbers. Such inodes can only exist due to some
+ * error condition. One of them should be bad. Inodes with identical
+ * inode numbers (objectids) are distinguished by parent directory ids.
+ *
+ */
+int reiserfs_find_actor(struct inode *inode, void *opaque)
+{
+	struct reiserfs_iget_args *args;
+
+	args = opaque;
+	/* args is already in CPU order */
+	return (inode->i_ino == args->objectid) &&
+	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
+}
+
+struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
+{
+	struct inode *inode;
+	struct reiserfs_iget_args args;
+	int depth;
+
+	args.objectid = key->on_disk_key.k_objectid;
+	args.dirid = key->on_disk_key.k_dir_id;
+	depth = reiserfs_write_unlock_nested(s);
+	inode = iget5_locked(s, key->on_disk_key.k_objectid,
+			     reiserfs_find_actor, reiserfs_init_locked_inode,
+			     (void *)(&args));
+	reiserfs_write_lock_nested(s, depth);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		reiserfs_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
+
+	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
+		/* either due to i/o error or a stale NFS handle */
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+static struct dentry *reiserfs_get_dentry(struct super_block *sb,
+	u32 objectid, u32 dir_id, u32 generation)
+
+{
+	struct cpu_key key;
+	struct inode *inode;
+
+	key.on_disk_key.k_objectid = objectid;
+	key.on_disk_key.k_dir_id = dir_id;
+	reiserfs_write_lock(sb);
+	inode = reiserfs_iget(sb, &key);
+	if (inode && !IS_ERR(inode) && generation != 0 &&
+	    generation != inode->i_generation) {
+		iput(inode);
+		inode = NULL;
+	}
+	reiserfs_write_unlock(sb);
+
+	return d_obtain_alias(inode);
+}
+
+struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	/*
+	 * fhtype happens to reflect the number of u32s encoded.
+	 * due to a bug in earlier code, fhtype might indicate there
+	 * are more u32s then actually fitted.
+	 * so if fhtype seems to be more than len, reduce fhtype.
+	 * Valid types are:
+	 *   2 - objectid + dir_id - legacy support
+	 *   3 - objectid + dir_id + generation
+	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
+	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
+	 *   6 - as above plus generation of directory
+	 * 6 does not fit in NFSv2 handles
+	 */
+	if (fh_type > fh_len) {
+		if (fh_type != 6 || fh_len != 5)
+			reiserfs_warning(sb, "reiserfs-13077",
+				"nfsd/reiserfs, fhtype=%d, len=%d - odd",
+				fh_type, fh_len);
+		fh_type = fh_len;
+	}
+	if (fh_len < 2)
+		return NULL;
+
+	return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
+		(fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
+}
+
+struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	if (fh_type > fh_len)
+		fh_type = fh_len;
+	if (fh_type < 4)
+		return NULL;
+
+	return reiserfs_get_dentry(sb,
+		(fh_type >= 5) ? fid->raw[3] : fid->raw[2],
+		(fh_type >= 5) ? fid->raw[4] : fid->raw[3],
+		(fh_type == 6) ? fid->raw[5] : 0);
+}
+
+int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
+		       struct inode *parent)
+{
+	int maxlen = *lenp;
+
+	if (parent && (maxlen < 5)) {
+		*lenp = 5;
+		return FILEID_INVALID;
+	} else if (maxlen < 3) {
+		*lenp = 3;
+		return FILEID_INVALID;
+	}
+
+	data[0] = inode->i_ino;
+	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
+	data[2] = inode->i_generation;
+	*lenp = 3;
+	if (parent) {
+		data[3] = parent->i_ino;
+		data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
+		*lenp = 5;
+		if (maxlen >= 6) {
+			data[5] = parent->i_generation;
+			*lenp = 6;
+		}
+	}
+	return *lenp;
+}
+
+/*
+ * looks for stat data, then copies fields to it, marks the buffer
+ * containing stat data as dirty
+ */
+/*
+ * reiserfs inodes are never really dirty, since the dirty inode call
+ * always logs them.  This call allows the VFS inode marking routines
+ * to properly mark inodes for datasync and such, but only actually
+ * does something when called for a synchronous update.
+ */
+int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct reiserfs_transaction_handle th;
+	int jbegin_count = 1;
+
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	/*
+	 * memory pressure can sometimes initiate write_inode calls with
+	 * sync == 1,
+	 * these cases are just when the system needs ram, not when the
+	 * inode needs to reach disk for safety, and they can safely be
+	 * ignored because the altered inode has already been logged.
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
+		reiserfs_write_lock(inode->i_sb);
+		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
+			reiserfs_update_sd(&th, inode);
+			journal_end_sync(&th);
+		}
+		reiserfs_write_unlock(inode->i_sb);
+	}
+	return 0;
+}
+
+/*
+ * stat data of new object is inserted already, this inserts the item
+ * containing "." and ".." entries
+ */
+static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
+				  struct inode *inode,
+				  struct item_head *ih, struct treepath *path,
+				  struct inode *dir)
+{
+	struct super_block *sb = th->t_super;
+	char empty_dir[EMPTY_DIR_SIZE];
+	char *body = empty_dir;
+	struct cpu_key key;
+	int retval;
+
+	BUG_ON(!th->t_trans_id);
+
+	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
+		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
+		      TYPE_DIRENTRY, 3 /*key length */ );
+
+	/*
+	 * compose item head for new item. Directories consist of items of
+	 * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
+	 * is done by reiserfs_new_inode
+	 */
+	if (old_format_only(sb)) {
+		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
+				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
+
+		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
+				       ih->ih_key.k_objectid,
+				       INODE_PKEY(dir)->k_dir_id,
+				       INODE_PKEY(dir)->k_objectid);
+	} else {
+		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
+				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
+
+		make_empty_dir_item(body, ih->ih_key.k_dir_id,
+				    ih->ih_key.k_objectid,
+				    INODE_PKEY(dir)->k_dir_id,
+				    INODE_PKEY(dir)->k_objectid);
+	}
+
+	/* look for place in the tree for new item */
+	retval = search_item(sb, &key, path);
+	if (retval == IO_ERROR) {
+		reiserfs_error(sb, "vs-13080",
+			       "i/o failure occurred creating new directory");
+		return -EIO;
+	}
+	if (retval == ITEM_FOUND) {
+		pathrelse(path);
+		reiserfs_warning(sb, "vs-13070",
+				 "object with this key exists (%k)",
+				 &(ih->ih_key));
+		return -EEXIST;
+	}
+
+	/* insert item, that is empty directory item */
+	return reiserfs_insert_item(th, path, &key, ih, inode, body);
+}
+
+/*
+ * stat data of object has been inserted, this inserts the item
+ * containing the body of symlink
+ */
+static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
+				struct inode *inode,
+				struct item_head *ih,
+				struct treepath *path, const char *symname,
+				int item_len)
+{
+	struct super_block *sb = th->t_super;
+	struct cpu_key key;
+	int retval;
+
+	BUG_ON(!th->t_trans_id);
+
+	_make_cpu_key(&key, KEY_FORMAT_3_5,
+		      le32_to_cpu(ih->ih_key.k_dir_id),
+		      le32_to_cpu(ih->ih_key.k_objectid),
+		      1, TYPE_DIRECT, 3 /*key length */ );
+
+	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
+			  0 /*free_space */ );
+
+	/* look for place in the tree for new item */
+	retval = search_item(sb, &key, path);
+	if (retval == IO_ERROR) {
+		reiserfs_error(sb, "vs-13080",
+			       "i/o failure occurred creating new symlink");
+		return -EIO;
+	}
+	if (retval == ITEM_FOUND) {
+		pathrelse(path);
+		reiserfs_warning(sb, "vs-13080",
+				 "object with this key exists (%k)",
+				 &(ih->ih_key));
+		return -EEXIST;
+	}
+
+	/* insert item, that is body of symlink */
+	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
+}
+
+/*
+ * inserts the stat data into the tree, and then calls
+ * reiserfs_new_directory (to insert ".", ".." item if new object is
+ * directory) or reiserfs_new_symlink (to insert symlink body if new
+ * object is symlink) or nothing (if new object is regular file)
+
+ * NOTE! uid and gid must already be set in the inode.  If we return
+ * non-zero due to an error, we have to drop the quota previously allocated
+ * for the fresh inode.  This can only be done outside a transaction, so
+ * if we return non-zero, we also end the transaction.
+ *
+ * @th: active transaction handle
+ * @dir: parent directory for new inode
+ * @mode: mode of new inode
+ * @symname: symlink contents if inode is symlink
+ * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for
+ *         symlinks
+ * @inode: inode to be filled
+ * @security: optional security context to associate with this inode
+ */
+int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
+		       struct inode *dir, umode_t mode, const char *symname,
+		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
+		          strlen (symname) for symlinks) */
+		       loff_t i_size, struct dentry *dentry,
+		       struct inode *inode,
+		       struct reiserfs_security_handle *security)
+{
+	struct super_block *sb = dir->i_sb;
+	struct reiserfs_iget_args args;
+	INITIALIZE_PATH(path_to_key);
+	struct cpu_key key;
+	struct item_head ih;
+	struct stat_data sd;
+	int retval;
+	int err;
+	int depth;
+
+	BUG_ON(!th->t_trans_id);
+
+	depth = reiserfs_write_unlock_nested(sb);
+	err = dquot_alloc_inode(inode);
+	reiserfs_write_lock_nested(sb, depth);
+	if (err)
+		goto out_end_trans;
+	if (!dir->i_nlink) {
+		err = -EPERM;
+		goto out_bad_inode;
+	}
+
+	/* item head of new item */
+	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
+	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
+	if (!ih.ih_key.k_objectid) {
+		err = -ENOMEM;
+		goto out_bad_inode;
+	}
+	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+	if (old_format_only(sb))
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+	else
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
+	memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
+	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
+
+	depth = reiserfs_write_unlock_nested(inode->i_sb);
+	err = insert_inode_locked4(inode, args.objectid,
+			     reiserfs_find_actor, &args);
+	reiserfs_write_lock_nested(inode->i_sb, depth);
+	if (err) {
+		err = -EINVAL;
+		goto out_bad_inode;
+	}
+
+	if (old_format_only(sb))
+		/*
+		 * not a perfect generation count, as object ids can be reused,
+		 * but this is as good as reiserfs can do right now.
+		 * note that the private part of inode isn't filled in yet,
+		 * we have to use the directory.
+		 */
+		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
+	else
+#if defined( USE_INODE_GENERATION_COUNTER )
+		inode->i_generation =
+		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
+#else
+		inode->i_generation = ++event;
+#endif
+
+	/* fill stat data */
+	set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
+
+	/* uid and gid must already be set by the caller for quota init */
+
+	/* symlink cannot be immutable or append only, right? */
+	if (S_ISLNK(inode->i_mode))
+		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
+
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_size = i_size;
+	inode->i_blocks = 0;
+	inode->i_bytes = 0;
+	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
+	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
+
+	INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
+	REISERFS_I(inode)->i_flags = 0;
+	REISERFS_I(inode)->i_prealloc_block = 0;
+	REISERFS_I(inode)->i_prealloc_count = 0;
+	REISERFS_I(inode)->i_trans_id = 0;
+	REISERFS_I(inode)->i_jl = NULL;
+	REISERFS_I(inode)->i_attrs =
+	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
+	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
+	reiserfs_init_xattr_rwsem(inode);
+
+	/* key to search for correct place for new stat data */
+	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
+		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
+		      TYPE_STAT_DATA, 3 /*key length */ );
+
+	/* find proper place for inserting of stat data */
+	retval = search_item(sb, &key, &path_to_key);
+	if (retval == IO_ERROR) {
+		err = -EIO;
+		goto out_bad_inode;
+	}
+	if (retval == ITEM_FOUND) {
+		pathrelse(&path_to_key);
+		err = -EEXIST;
+		goto out_bad_inode;
+	}
+	if (old_format_only(sb)) {
+		/* i_uid or i_gid is too big to be stored in stat data v3.5 */
+		if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
+			pathrelse(&path_to_key);
+			err = -EINVAL;
+			goto out_bad_inode;
+		}
+		inode2sd_v1(&sd, inode, inode->i_size);
+	} else {
+		inode2sd(&sd, inode, inode->i_size);
+	}
+	/*
+	 * store in in-core inode the key of stat data and version all
+	 * object items will have (directory items will have old offset
+	 * format, other new objects will consist of new items)
+	 */
+	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
+		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
+	else
+		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
+	if (old_format_only(sb))
+		set_inode_sd_version(inode, STAT_DATA_V1);
+	else
+		set_inode_sd_version(inode, STAT_DATA_V2);
+
+	/* insert the stat data into the tree */
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+	if (REISERFS_I(dir)->new_packing_locality)
+		th->displace_new_blocks = 1;
+#endif
+	retval =
+	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
+				 (char *)(&sd));
+	if (retval) {
+		err = retval;
+		reiserfs_check_path(&path_to_key);
+		goto out_bad_inode;
+	}
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+	if (!th->displace_new_blocks)
+		REISERFS_I(dir)->new_packing_locality = 0;
+#endif
+	if (S_ISDIR(mode)) {
+		/* insert item with "." and ".." */
+		retval =
+		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
+	}
+
+	if (S_ISLNK(mode)) {
+		/* insert body of symlink */
+		if (!old_format_only(sb))
+			i_size = ROUND_UP(i_size);
+		retval =
+		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
+					 i_size);
+	}
+	if (retval) {
+		err = retval;
+		reiserfs_check_path(&path_to_key);
+		journal_end(th);
+		goto out_inserted_sd;
+	}
+
+	if (reiserfs_posixacl(inode->i_sb)) {
+		reiserfs_write_unlock(inode->i_sb);
+		retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
+		reiserfs_write_lock(inode->i_sb);
+		if (retval) {
+			err = retval;
+			reiserfs_check_path(&path_to_key);
+			journal_end(th);
+			goto out_inserted_sd;
+		}
+	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
+		reiserfs_warning(inode->i_sb, "jdm-13090",
+				 "ACLs aren't enabled in the fs, "
+				 "but vfs thinks they are!");
+	} else if (IS_PRIVATE(dir))
+		inode->i_flags |= S_PRIVATE;
+
+	if (security->name) {
+		reiserfs_write_unlock(inode->i_sb);
+		retval = reiserfs_security_write(th, inode, security);
+		reiserfs_write_lock(inode->i_sb);
+		if (retval) {
+			err = retval;
+			reiserfs_check_path(&path_to_key);
+			retval = journal_end(th);
+			if (retval)
+				err = retval;
+			goto out_inserted_sd;
+		}
+	}
+
+	reiserfs_update_sd(th, inode);
+	reiserfs_check_path(&path_to_key);
+
+	return 0;
+
+out_bad_inode:
+	/* Invalidate the object, nothing was inserted yet */
+	INODE_PKEY(inode)->k_objectid = 0;
+
+	/* Quota change must be inside a transaction for journaling */
+	depth = reiserfs_write_unlock_nested(inode->i_sb);
+	dquot_free_inode(inode);
+	reiserfs_write_lock_nested(inode->i_sb, depth);
+
+out_end_trans:
+	journal_end(th);
+	/*
+	 * Drop can be outside and it needs more credits so it's better
+	 * to have it outside
+	 */
+	depth = reiserfs_write_unlock_nested(inode->i_sb);
+	dquot_drop(inode);
+	reiserfs_write_lock_nested(inode->i_sb, depth);
+	inode->i_flags |= S_NOQUOTA;
+	make_bad_inode(inode);
+
+out_inserted_sd:
+	clear_nlink(inode);
+	th->t_trans_id = 0;	/* so the caller can't use this handle later */
+	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
+	iput(inode);
+	return err;
+}
+
+/*
+ * finds the tail page in the page cache,
+ * reads the last block in.
+ *
+ * On success, page_result is set to a locked, pinned page, and bh_result
+ * is set to an up to date buffer for the last block in the file.  returns 0.
+ *
+ * tail conversion is not done, so bh_result might not be valid for writing
+ * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
+ * trying to write the block.
+ *
+ * on failure, nonzero is returned, page_result and bh_result are untouched.
+ */
+static int grab_tail_page(struct inode *inode,
+			  struct page **page_result,
+			  struct buffer_head **bh_result)
+{
+
+	/*
+	 * we want the page with the last byte in the file,
+	 * not the page that will hold the next byte for appending
+	 */
+	unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long pos = 0;
+	unsigned long start = 0;
+	unsigned long blocksize = inode->i_sb->s_blocksize;
+	unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	struct page *page;
+	int error;
+
+	/*
+	 * we know that we are only called with inode->i_size > 0.
+	 * we also know that a file tail can never be as big as a block
+	 * If i_size % blocksize == 0, our file is currently block aligned
+	 * and it won't need converting or zeroing after a truncate.
+	 */
+	if ((offset & (blocksize - 1)) == 0) {
+		return -ENOENT;
+	}
+	page = grab_cache_page(inode->i_mapping, index);
+	error = -ENOMEM;
+	if (!page) {
+		goto out;
+	}
+	/* start within the page of the last block in the file */
+	start = (offset / blocksize) * blocksize;
+
+	error = __block_write_begin(page, start, offset - start,
+				    reiserfs_get_block_create_0);
+	if (error)
+		goto unlock;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (pos >= start) {
+			break;
+		}
+		bh = bh->b_this_page;
+		pos += blocksize;
+	} while (bh != head);
+
+	if (!buffer_uptodate(bh)) {
+		/*
+		 * note, this should never happen, prepare_write should be
+		 * taking care of this for us.  If the buffer isn't up to
+		 * date, I've screwed up the code to find the buffer, or the
+		 * code to call prepare_write
+		 */
+		reiserfs_error(inode->i_sb, "clm-6000",
+			       "error reading block %lu", bh->b_blocknr);
+		error = -EIO;
+		goto unlock;
+	}
+	*bh_result = bh;
+	*page_result = page;
+
+out:
+	return error;
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return error;
+}
+
+/*
+ * vfs version of truncate file.  Must NOT be called with
+ * a transaction already started.
+ *
+ * some code taken from block_truncate_page
+ */
+int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
+{
+	struct reiserfs_transaction_handle th;
+	/* we want the offset for the first byte after the end of the file */
+	unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	unsigned length;
+	struct page *page = NULL;
+	int error;
+	struct buffer_head *bh = NULL;
+	int err2;
+
+	reiserfs_write_lock(inode->i_sb);
+
+	if (inode->i_size > 0) {
+		error = grab_tail_page(inode, &page, &bh);
+		if (error) {
+			/*
+			 * -ENOENT means we truncated past the end of the
+			 * file, and get_block_create_0 could not find a
+			 * block to read in, which is ok.
+			 */
+			if (error != -ENOENT)
+				reiserfs_error(inode->i_sb, "clm-6001",
+					       "grab_tail_page failed %d",
+					       error);
+			page = NULL;
+			bh = NULL;
+		}
+	}
+
+	/*
+	 * so, if page != NULL, we have a buffer head for the offset at
+	 * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
+	 * then we have an unformatted node.  Otherwise, we have a direct item,
+	 * and no zeroing is required on disk.  We zero after the truncate,
+	 * because the truncate might pack the item anyway
+	 * (it will unmap bh if it packs).
+	 *
+	 * it is enough to reserve space in transaction for 2 balancings:
+	 * one for "save" link adding and another for the first
+	 * cut_from_item. 1 is for update_sd
+	 */
+	error = journal_begin(&th, inode->i_sb,
+			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
+	if (error)
+		goto out;
+	reiserfs_update_inode_transaction(inode);
+	if (update_timestamps)
+		/*
+		 * we are doing real truncate: if the system crashes
+		 * before the last transaction of truncating gets committed
+		 * - on reboot the file either appears truncated properly
+		 * or not truncated at all
+		 */
+		add_save_link(&th, inode, 1);
+	err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
+	error = journal_end(&th);
+	if (error)
+		goto out;
+
+	/* check reiserfs_do_truncate after ending the transaction */
+	if (err2) {
+		error = err2;
+  		goto out;
+	}
+	
+	if (update_timestamps) {
+		error = remove_save_link(inode, 1 /* truncate */);
+		if (error)
+			goto out;
+	}
+
+	if (page) {
+		length = offset & (blocksize - 1);
+		/* if we are not on a block boundary */
+		if (length) {
+			length = blocksize - length;
+			zero_user(page, offset, length);
+			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
+				mark_buffer_dirty(bh);
+			}
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	reiserfs_write_unlock(inode->i_sb);
+
+	return 0;
+out:
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	reiserfs_write_unlock(inode->i_sb);
+
+	return error;
+}
+
+static int map_block_for_writepage(struct inode *inode,
+				   struct buffer_head *bh_result,
+				   unsigned long block)
+{
+	struct reiserfs_transaction_handle th;
+	int fs_gen;
+	struct item_head tmp_ih;
+	struct item_head *ih;
+	struct buffer_head *bh;
+	__le32 *item;
+	struct cpu_key key;
+	INITIALIZE_PATH(path);
+	int pos_in_item;
+	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
+	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
+	int retval;
+	int use_get_block = 0;
+	int bytes_copied = 0;
+	int copy_size;
+	int trans_running = 0;
+
+	/*
+	 * catch places below that try to log something without
+	 * starting a trans
+	 */
+	th.t_trans_id = 0;
+
+	if (!buffer_uptodate(bh_result)) {
+		return -EIO;
+	}
+
+	kmap(bh_result->b_page);
+start_over:
+	reiserfs_write_lock(inode->i_sb);
+	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
+
+research:
+	retval = search_for_position_by_key(inode->i_sb, &key, &path);
+	if (retval != POSITION_FOUND) {
+		use_get_block = 1;
+		goto out;
+	}
+
+	bh = get_last_bh(&path);
+	ih = tp_item_head(&path);
+	item = tp_item_body(&path);
+	pos_in_item = path.pos_in_item;
+
+	/* we've found an unformatted node */
+	if (indirect_item_found(retval, ih)) {
+		if (bytes_copied > 0) {
+			reiserfs_warning(inode->i_sb, "clm-6002",
+					 "bytes_copied %d", bytes_copied);
+		}
+		if (!get_block_num(item, pos_in_item)) {
+			/* crap, we are writing to a hole */
+			use_get_block = 1;
+			goto out;
+		}
+		set_block_dev_mapped(bh_result,
+				     get_block_num(item, pos_in_item), inode);
+	} else if (is_direct_le_ih(ih)) {
+		char *p;
+		p = page_address(bh_result->b_page);
+		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
+		copy_size = ih_item_len(ih) - pos_in_item;
+
+		fs_gen = get_generation(inode->i_sb);
+		copy_item_head(&tmp_ih, ih);
+
+		if (!trans_running) {
+			/* vs-3050 is gone, no need to drop the path */
+			retval = journal_begin(&th, inode->i_sb, jbegin_count);
+			if (retval)
+				goto out;
+			reiserfs_update_inode_transaction(inode);
+			trans_running = 1;
+			if (fs_changed(fs_gen, inode->i_sb)
+			    && item_moved(&tmp_ih, &path)) {
+				reiserfs_restore_prepared_buffer(inode->i_sb,
+								 bh);
+				goto research;
+			}
+		}
+
+		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
+
+		if (fs_changed(fs_gen, inode->i_sb)
+		    && item_moved(&tmp_ih, &path)) {
+			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
+			goto research;
+		}
+
+		memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
+		       copy_size);
+
+		journal_mark_dirty(&th, bh);
+		bytes_copied += copy_size;
+		set_block_dev_mapped(bh_result, 0, inode);
+
+		/* are there still bytes left? */
+		if (bytes_copied < bh_result->b_size &&
+		    (byte_offset + bytes_copied) < inode->i_size) {
+			set_cpu_key_k_offset(&key,
+					     cpu_key_k_offset(&key) +
+					     copy_size);
+			goto research;
+		}
+	} else {
+		reiserfs_warning(inode->i_sb, "clm-6003",
+				 "bad item inode %lu", inode->i_ino);
+		retval = -EIO;
+		goto out;
+	}
+	retval = 0;
+
+out:
+	pathrelse(&path);
+	if (trans_running) {
+		int err = journal_end(&th);
+		if (err)
+			retval = err;
+		trans_running = 0;
+	}
+	reiserfs_write_unlock(inode->i_sb);
+
+	/* this is where we fill in holes in the file. */
+	if (use_get_block) {
+		retval = reiserfs_get_block(inode, block, bh_result,
+					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
+					    | GET_BLOCK_NO_DANGLE);
+		if (!retval) {
+			if (!buffer_mapped(bh_result)
+			    || bh_result->b_blocknr == 0) {
+				/* get_block failed to find a mapped unformatted node. */
+				use_get_block = 0;
+				goto start_over;
+			}
+		}
+	}
+	kunmap(bh_result->b_page);
+
+	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
+		/*
+		 * we've copied data from the page into the direct item, so the
+		 * buffer in the page is now clean, mark it to reflect that.
+		 */
+		lock_buffer(bh_result);
+		clear_buffer_dirty(bh_result);
+		unlock_buffer(bh_result);
+	}
+	return retval;
+}
+
+/*
+ * mason@suse.com: updated in 2.5.54 to follow the same general io
+ * start/recovery path as __block_write_full_page, along with special
+ * code to handle reiserfs tails.
+ */
+static int reiserfs_write_full_page(struct page *page,
+				    struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int error = 0;
+	unsigned long block;
+	sector_t last_block;
+	struct buffer_head *head, *bh;
+	int partial = 0;
+	int nr = 0;
+	int checked = PageChecked(page);
+	struct reiserfs_transaction_handle th;
+	struct super_block *s = inode->i_sb;
+	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+	th.t_trans_id = 0;
+
+	/* no logging allowed when nonblocking or from PF_MEMALLOC */
+	if (checked && (current->flags & PF_MEMALLOC)) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return 0;
+	}
+
+	/*
+	 * The page dirty bit is cleared before writepage is called, which
+	 * means we have to tell create_empty_buffers to make dirty buffers
+	 * The page really should be up to date at this point, so tossing
+	 * in the BH_Uptodate is just a sanity check.
+	 */
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, s->s_blocksize,
+				     (1 << BH_Dirty) | (1 << BH_Uptodate));
+	}
+	head = page_buffers(page);
+
+	/*
+	 * last page in the file, zero out any contents past the
+	 * last byte in the file
+	 */
+	if (page->index >= end_index) {
+		unsigned last_offset;
+
+		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+		/* no file contents in this page */
+		if (page->index >= end_index + 1 || !last_offset) {
+			unlock_page(page);
+			return 0;
+		}
+		zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
+	}
+	bh = head;
+	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
+	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
+	/* first map all the buffers, logging any direct items we find */
+	do {
+		if (block > last_block) {
+			/*
+			 * This can happen when the block size is less than
+			 * the page size.  The corresponding bytes in the page
+			 * were zero filled above
+			 */
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+		} else if ((checked || buffer_dirty(bh)) &&
+		           (!buffer_mapped(bh) || (buffer_mapped(bh)
+						       && bh->b_blocknr ==
+						       0))) {
+			/*
+			 * not mapped yet, or it points to a direct item, search
+			 * the btree for the mapping info, and log any direct
+			 * items found
+			 */
+			if ((error = map_block_for_writepage(inode, bh, block))) {
+				goto fail;
+			}
+		}
+		bh = bh->b_this_page;
+		block++;
+	} while (bh != head);
+
+	/*
+	 * we start the transaction after map_block_for_writepage,
+	 * because it can create holes in the file (an unbounded operation).
+	 * starting it here, we can make a reliable estimate for how many
+	 * blocks we're going to log
+	 */
+	if (checked) {
+		ClearPageChecked(page);
+		reiserfs_write_lock(s);
+		error = journal_begin(&th, s, bh_per_page + 1);
+		if (error) {
+			reiserfs_write_unlock(s);
+			goto fail;
+		}
+		reiserfs_update_inode_transaction(inode);
+	}
+	/* now go through and lock any dirty buffers on the page */
+	do {
+		get_bh(bh);
+		if (!buffer_mapped(bh))
+			continue;
+		if (buffer_mapped(bh) && bh->b_blocknr == 0)
+			continue;
+
+		if (checked) {
+			reiserfs_prepare_for_journal(s, bh, 1);
+			journal_mark_dirty(&th, bh);
+			continue;
+		}
+		/*
+		 * from this point on, we know the buffer is mapped to a
+		 * real block and not a direct item
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE) {
+			lock_buffer(bh);
+		} else {
+			if (!trylock_buffer(bh)) {
+				redirty_page_for_writepage(wbc, page);
+				continue;
+			}
+		}
+		if (test_clear_buffer_dirty(bh)) {
+			mark_buffer_async_write(bh);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	if (checked) {
+		error = journal_end(&th);
+		reiserfs_write_unlock(s);
+		if (error)
+			goto fail;
+	}
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
+
+	/*
+	 * since any buffer might be the only dirty buffer on the page,
+	 * the first submit_bh can bring the page out of writeback.
+	 * be careful with the buffers.
+	 */
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(WRITE, bh);
+			nr++;
+		}
+		put_bh(bh);
+		bh = next;
+	} while (bh != head);
+
+	error = 0;
+done:
+	if (nr == 0) {
+		/*
+		 * if this page only had a direct item, it is very possible for
+		 * no io to be required without there being an error.  Or,
+		 * someone else could have locked them and sent them down the
+		 * pipe without locking the page
+		 */
+		bh = head;
+		do {
+			if (!buffer_uptodate(bh)) {
+				partial = 1;
+				break;
+			}
+			bh = bh->b_this_page;
+		} while (bh != head);
+		if (!partial)
+			SetPageUptodate(page);
+		end_page_writeback(page);
+	}
+	return error;
+
+fail:
+	/*
+	 * catches various errors, we need to make sure any valid dirty blocks
+	 * get to the media.  The page is currently locked and not marked for
+	 * writeback
+	 */
+	ClearPageUptodate(page);
+	bh = head;
+	do {
+		get_bh(bh);
+		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
+			lock_buffer(bh);
+			mark_buffer_async_write(bh);
+		} else {
+			/*
+			 * clear any dirty bits that might have come from
+			 * getting attached to a dirty page
+			 */
+			clear_buffer_dirty(bh);
+		}
+		bh = bh->b_this_page;
+	} while (bh != head);
+	SetPageError(page);
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			clear_buffer_dirty(bh);
+			submit_bh(WRITE, bh);
+			nr++;
+		}
+		put_bh(bh);
+		bh = next;
+	} while (bh != head);
+	goto done;
+}
+
+static int reiserfs_readpage(struct file *f, struct page *page)
+{
+	return block_read_full_page(page, reiserfs_get_block);
+}
+
+static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	reiserfs_wait_on_write_block(inode->i_sb);
+	return reiserfs_write_full_page(page, wbc);
+}
+
+static void reiserfs_truncate_failed_write(struct inode *inode)
+{
+	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	reiserfs_truncate_file(inode, 0);
+}
+
+static int reiserfs_write_begin(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	struct inode *inode;
+	struct page *page;
+	pgoff_t index;
+	int ret;
+	int old_ref = 0;
+
+ 	inode = mapping->host;
+	*fsdata = NULL;
+ 	if (flags & AOP_FLAG_CONT_EXPAND &&
+ 	    (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
+ 		pos ++;
+		*fsdata = (void *)(unsigned long)flags;
+	}
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	reiserfs_wait_on_write_block(inode->i_sb);
+	fix_tail_page_for_writing(page);
+	if (reiserfs_transaction_running(inode->i_sb)) {
+		struct reiserfs_transaction_handle *th;
+		th = (struct reiserfs_transaction_handle *)current->
+		    journal_info;
+		BUG_ON(!th->t_refcount);
+		BUG_ON(!th->t_trans_id);
+		old_ref = th->t_refcount;
+		th->t_refcount++;
+	}
+	ret = __block_write_begin(page, pos, len, reiserfs_get_block);
+	if (ret && reiserfs_transaction_running(inode->i_sb)) {
+		struct reiserfs_transaction_handle *th = current->journal_info;
+		/*
+		 * this gets a little ugly.  If reiserfs_get_block returned an
+		 * error and left a transacstion running, we've got to close
+		 * it, and we've got to free handle if it was a persistent
+		 * transaction.
+		 *
+		 * But, if we had nested into an existing transaction, we need
+		 * to just drop the ref count on the handle.
+		 *
+		 * If old_ref == 0, the transaction is from reiserfs_get_block,
+		 * and it was a persistent trans.  Otherwise, it was nested
+		 * above.
+		 */
+		if (th->t_refcount > old_ref) {
+			if (old_ref)
+				th->t_refcount--;
+			else {
+				int err;
+				reiserfs_write_lock(inode->i_sb);
+				err = reiserfs_end_persistent_transaction(th);
+				reiserfs_write_unlock(inode->i_sb);
+				if (err)
+					ret = err;
+			}
+		}
+	}
+	if (ret) {
+		unlock_page(page);
+		page_cache_release(page);
+		/* Truncate allocated blocks */
+		reiserfs_truncate_failed_write(inode);
+	}
+	return ret;
+}
+
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+	int old_ref = 0;
+	int depth;
+
+	depth = reiserfs_write_unlock_nested(inode->i_sb);
+	reiserfs_wait_on_write_block(inode->i_sb);
+	reiserfs_write_lock_nested(inode->i_sb, depth);
+
+	fix_tail_page_for_writing(page);
+	if (reiserfs_transaction_running(inode->i_sb)) {
+		struct reiserfs_transaction_handle *th;
+		th = (struct reiserfs_transaction_handle *)current->
+		    journal_info;
+		BUG_ON(!th->t_refcount);
+		BUG_ON(!th->t_trans_id);
+		old_ref = th->t_refcount;
+		th->t_refcount++;
+	}
+
+	ret = __block_write_begin(page, from, len, reiserfs_get_block);
+	if (ret && reiserfs_transaction_running(inode->i_sb)) {
+		struct reiserfs_transaction_handle *th = current->journal_info;
+		/*
+		 * this gets a little ugly.  If reiserfs_get_block returned an
+		 * error and left a transacstion running, we've got to close
+		 * it, and we've got to free handle if it was a persistent
+		 * transaction.
+		 *
+		 * But, if we had nested into an existing transaction, we need
+		 * to just drop the ref count on the handle.
+		 *
+		 * If old_ref == 0, the transaction is from reiserfs_get_block,
+		 * and it was a persistent trans.  Otherwise, it was nested
+		 * above.
+		 */
+		if (th->t_refcount > old_ref) {
+			if (old_ref)
+				th->t_refcount--;
+			else {
+				int err;
+				reiserfs_write_lock(inode->i_sb);
+				err = reiserfs_end_persistent_transaction(th);
+				reiserfs_write_unlock(inode->i_sb);
+				if (err)
+					ret = err;
+			}
+		}
+	}
+	return ret;
+
+}
+
+static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
+{
+	return generic_block_bmap(as, block, reiserfs_bmap);
+}
+
+static int reiserfs_write_end(struct file *file, struct address_space *mapping,
+			      loff_t pos, unsigned len, unsigned copied,
+			      struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+	int ret = 0;
+	int update_sd = 0;
+	struct reiserfs_transaction_handle *th;
+	unsigned start;
+	bool locked = false;
+
+	if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
+		pos ++;
+
+	reiserfs_wait_on_write_block(inode->i_sb);
+	if (reiserfs_transaction_running(inode->i_sb))
+		th = current->journal_info;
+	else
+		th = NULL;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	if (unlikely(copied < len)) {
+		if (!PageUptodate(page))
+			copied = 0;
+
+		page_zero_new_buffers(page, start + copied, start + len);
+	}
+	flush_dcache_page(page);
+
+	reiserfs_commit_page(inode, page, start, start + copied);
+
+	/*
+	 * generic_commit_write does this for us, but does not update the
+	 * transaction tracking stuff when the size changes.  So, we have
+	 * to do the i_size updates here.
+	 */
+	if (pos + copied > inode->i_size) {
+		struct reiserfs_transaction_handle myth;
+		reiserfs_write_lock(inode->i_sb);
+		locked = true;
+		/*
+		 * If the file have grown beyond the border where it
+		 * can have a tail, unmark it as needing a tail
+		 * packing
+		 */
+		if ((have_large_tails(inode->i_sb)
+		     && inode->i_size > i_block_size(inode) * 4)
+		    || (have_small_tails(inode->i_sb)
+			&& inode->i_size > i_block_size(inode)))
+			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+
+		ret = journal_begin(&myth, inode->i_sb, 1);
+		if (ret)
+			goto journal_error;
+
+		reiserfs_update_inode_transaction(inode);
+		inode->i_size = pos + copied;
+		/*
+		 * this will just nest into our transaction.  It's important
+		 * to use mark_inode_dirty so the inode gets pushed around on
+		 * the dirty lists, and so that O_SYNC works as expected
+		 */
+		mark_inode_dirty(inode);
+		reiserfs_update_sd(&myth, inode);
+		update_sd = 1;
+		ret = journal_end(&myth);
+		if (ret)
+			goto journal_error;
+	}
+	if (th) {
+		if (!locked) {
+			reiserfs_write_lock(inode->i_sb);
+			locked = true;
+		}
+		if (!update_sd)
+			mark_inode_dirty(inode);
+		ret = reiserfs_end_persistent_transaction(th);
+		if (ret)
+			goto out;
+	}
+
+out:
+	if (locked)
+		reiserfs_write_unlock(inode->i_sb);
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (pos + len > inode->i_size)
+		reiserfs_truncate_failed_write(inode);
+
+	return ret == 0 ? copied : ret;
+
+journal_error:
+	reiserfs_write_unlock(inode->i_sb);
+	locked = false;
+	if (th) {
+		if (!update_sd)
+			reiserfs_update_sd(th, inode);
+		ret = reiserfs_end_persistent_transaction(th);
+	}
+	goto out;
+}
+
+int reiserfs_commit_write(struct file *f, struct page *page,
+			  unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
+	int ret = 0;
+	int update_sd = 0;
+	struct reiserfs_transaction_handle *th = NULL;
+	int depth;
+
+	depth = reiserfs_write_unlock_nested(inode->i_sb);
+	reiserfs_wait_on_write_block(inode->i_sb);
+	reiserfs_write_lock_nested(inode->i_sb, depth);
+
+	if (reiserfs_transaction_running(inode->i_sb)) {
+		th = current->journal_info;
+	}
+	reiserfs_commit_page(inode, page, from, to);
+
+	/*
+	 * generic_commit_write does this for us, but does not update the
+	 * transaction tracking stuff when the size changes.  So, we have
+	 * to do the i_size updates here.
+	 */
+	if (pos > inode->i_size) {
+		struct reiserfs_transaction_handle myth;
+		/*
+		 * If the file have grown beyond the border where it
+		 * can have a tail, unmark it as needing a tail
+		 * packing
+		 */
+		if ((have_large_tails(inode->i_sb)
+		     && inode->i_size > i_block_size(inode) * 4)
+		    || (have_small_tails(inode->i_sb)
+			&& inode->i_size > i_block_size(inode)))
+			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+
+		ret = journal_begin(&myth, inode->i_sb, 1);
+		if (ret)
+			goto journal_error;
+
+		reiserfs_update_inode_transaction(inode);
+		inode->i_size = pos;
+		/*
+		 * this will just nest into our transaction.  It's important
+		 * to use mark_inode_dirty so the inode gets pushed around
+		 * on the dirty lists, and so that O_SYNC works as expected
+		 */
+		mark_inode_dirty(inode);
+		reiserfs_update_sd(&myth, inode);
+		update_sd = 1;
+		ret = journal_end(&myth);
+		if (ret)
+			goto journal_error;
+	}
+	if (th) {
+		if (!update_sd)
+			mark_inode_dirty(inode);
+		ret = reiserfs_end_persistent_transaction(th);
+		if (ret)
+			goto out;
+	}
+
+out:
+	return ret;
+
+journal_error:
+	if (th) {
+		if (!update_sd)
+			reiserfs_update_sd(th, inode);
+		ret = reiserfs_end_persistent_transaction(th);
+	}
+
+	return ret;
+}
+
+void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
+{
+	if (reiserfs_attrs(inode->i_sb)) {
+		if (sd_attrs & REISERFS_SYNC_FL)
+			inode->i_flags |= S_SYNC;
+		else
+			inode->i_flags &= ~S_SYNC;
+		if (sd_attrs & REISERFS_IMMUTABLE_FL)
+			inode->i_flags |= S_IMMUTABLE;
+		else
+			inode->i_flags &= ~S_IMMUTABLE;
+		if (sd_attrs & REISERFS_APPEND_FL)
+			inode->i_flags |= S_APPEND;
+		else
+			inode->i_flags &= ~S_APPEND;
+		if (sd_attrs & REISERFS_NOATIME_FL)
+			inode->i_flags |= S_NOATIME;
+		else
+			inode->i_flags &= ~S_NOATIME;
+		if (sd_attrs & REISERFS_NOTAIL_FL)
+			REISERFS_I(inode)->i_flags |= i_nopack_mask;
+		else
+			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
+	}
+}
+
+void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
+{
+	if (reiserfs_attrs(inode->i_sb)) {
+		if (inode->i_flags & S_IMMUTABLE)
+			*sd_attrs |= REISERFS_IMMUTABLE_FL;
+		else
+			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
+		if (inode->i_flags & S_SYNC)
+			*sd_attrs |= REISERFS_SYNC_FL;
+		else
+			*sd_attrs &= ~REISERFS_SYNC_FL;
+		if (inode->i_flags & S_NOATIME)
+			*sd_attrs |= REISERFS_NOATIME_FL;
+		else
+			*sd_attrs &= ~REISERFS_NOATIME_FL;
+		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
+			*sd_attrs |= REISERFS_NOTAIL_FL;
+		else
+			*sd_attrs &= ~REISERFS_NOTAIL_FL;
+	}
+}
+
+/*
+ * decide if this buffer needs to stay around for data logging or ordered
+ * write purposes
+ */
+static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+{
+	int ret = 1;
+	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
+
+	lock_buffer(bh);
+	spin_lock(&j->j_dirty_buffers_lock);
+	if (!buffer_mapped(bh)) {
+		goto free_jh;
+	}
+	/*
+	 * the page is locked, and the only places that log a data buffer
+	 * also lock the page.
+	 */
+	if (reiserfs_file_data_log(inode)) {
+		/*
+		 * very conservative, leave the buffer pinned if
+		 * anyone might need it.
+		 */
+		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+			ret = 0;
+		}
+	} else  if (buffer_dirty(bh)) {
+		struct reiserfs_journal_list *jl;
+		struct reiserfs_jh *jh = bh->b_private;
+
+		/*
+		 * why is this safe?
+		 * reiserfs_setattr updates i_size in the on disk
+		 * stat data before allowing vmtruncate to be called.
+		 *
+		 * If buffer was put onto the ordered list for this
+		 * transaction, we know for sure either this transaction
+		 * or an older one already has updated i_size on disk,
+		 * and this ordered data won't be referenced in the file
+		 * if we crash.
+		 *
+		 * if the buffer was put onto the ordered list for an older
+		 * transaction, we need to leave it around
+		 */
+		if (jh && (jl = jh->jl)
+		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
+			ret = 0;
+	}
+free_jh:
+	if (ret && bh->b_private) {
+		reiserfs_free_jh(bh);
+	}
+	spin_unlock(&j->j_dirty_buffers_lock);
+	unlock_buffer(bh);
+	return ret;
+}
+
+/* clm -- taken from fs/buffer.c:block_invalidate_page */
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
+{
+	struct buffer_head *head, *bh, *next;
+	struct inode *inode = page->mapping->host;
+	unsigned int curr_off = 0;
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
+	int ret = 1;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!partial_page)
+		ClearPageChecked(page);
+
+	if (!page_has_buffers(page))
+		goto out;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
+
+		if (next_off > stop)
+			goto out;
+
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= curr_off) {
+			if (invalidatepage_can_drop(inode, bh))
+				reiserfs_unmap_buffer(bh);
+			else
+				ret = 0;
+		}
+		curr_off = next_off;
+		bh = next;
+	} while (bh != head);
+
+	/*
+	 * We release buffers only if the entire page is being invalidated.
+	 * The get_block cached value has been unconditionally invalidated,
+	 * so real IO is not possible anymore.
+	 */
+	if (!partial_page && ret) {
+		ret = try_to_release_page(page, 0);
+		/* maybe should BUG_ON(!ret); - neilb */
+	}
+out:
+	return;
+}
+
+static int reiserfs_set_page_dirty(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	if (reiserfs_file_data_log(inode)) {
+		SetPageChecked(page);
+		return __set_page_dirty_nobuffers(page);
+	}
+	return __set_page_dirty_buffers(page);
+}
+
+/*
+ * Returns 1 if the page's buffers were dropped.  The page is locked.
+ *
+ * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
+ * in the buffers at page_buffers(page).
+ *
+ * even in -o notail mode, we can't be sure an old mount without -o notail
+ * didn't create files with tails.
+ */
+static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+{
+	struct inode *inode = page->mapping->host;
+	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
+	struct buffer_head *head;
+	struct buffer_head *bh;
+	int ret = 1;
+
+	WARN_ON(PageChecked(page));
+	spin_lock(&j->j_dirty_buffers_lock);
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (bh->b_private) {
+			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
+				reiserfs_free_jh(bh);
+			} else {
+				ret = 0;
+				break;
+			}
+		}
+		bh = bh->b_this_page;
+	} while (bh != head);
+	if (ret)
+		ret = try_to_free_buffers(page);
+	spin_unlock(&j->j_dirty_buffers_lock);
+	return ret;
+}
+
+/*
+ * We thank Mingming Cao for helping us understand in great detail what
+ * to do in this section of the code.
+ */
+static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+				  loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, offset,
+				 reiserfs_get_blocks_direct_io);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + count;
+
+		if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
+			truncate_setsize(inode, isize);
+			reiserfs_vfs_truncate_file(inode);
+		}
+	}
+
+	return ret;
+}
+
+int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	unsigned int ia_valid;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	/* must be turned off for recursive notify_change calls */
+	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
+
+	if (is_quota_modification(inode, attr))
+		dquot_initialize(inode);
+	reiserfs_write_lock(inode->i_sb);
+	if (attr->ia_valid & ATTR_SIZE) {
+		/*
+		 * version 2 items will be caught by the s_maxbytes check
+		 * done for us in vmtruncate
+		 */
+		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
+		    attr->ia_size > MAX_NON_LFS) {
+			reiserfs_write_unlock(inode->i_sb);
+			error = -EFBIG;
+			goto out;
+		}
+
+		inode_dio_wait(inode);
+
+		/* fill in hole pointers in the expanding truncate case. */
+		if (attr->ia_size > inode->i_size) {
+			error = generic_cont_expand_simple(inode, attr->ia_size);
+			if (REISERFS_I(inode)->i_prealloc_count > 0) {
+				int err;
+				struct reiserfs_transaction_handle th;
+				/* we're changing at most 2 bitmaps, inode + super */
+				err = journal_begin(&th, inode->i_sb, 4);
+				if (!err) {
+					reiserfs_discard_prealloc(&th, inode);
+					err = journal_end(&th);
+				}
+				if (err)
+					error = err;
+			}
+			if (error) {
+				reiserfs_write_unlock(inode->i_sb);
+				goto out;
+			}
+			/*
+			 * file size is changed, ctime and mtime are
+			 * to be updated
+			 */
+			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
+		}
+	}
+	reiserfs_write_unlock(inode->i_sb);
+
+	if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
+	     ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
+	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
+		/* stat data of format v3.5 has 16 bit uid and gid */
+		error = -EINVAL;
+		goto out;
+	}
+
+	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+		struct reiserfs_transaction_handle th;
+		int jbegin_count =
+		    2 *
+		    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
+		     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
+		    2;
+
+		error = reiserfs_chown_xattrs(inode, attr);
+
+		if (error)
+			return error;
+
+		/*
+		 * (user+group)*(old+new) structure - we count quota
+		 * info and , inode write (sb, inode)
+		 */
+		reiserfs_write_lock(inode->i_sb);
+		error = journal_begin(&th, inode->i_sb, jbegin_count);
+		reiserfs_write_unlock(inode->i_sb);
+		if (error)
+			goto out;
+		error = dquot_transfer(inode, attr);
+		reiserfs_write_lock(inode->i_sb);
+		if (error) {
+			journal_end(&th);
+			reiserfs_write_unlock(inode->i_sb);
+			goto out;
+		}
+
+		/*
+		 * Update corresponding info in inode so that everything
+		 * is in one transaction
+		 */
+		if (attr->ia_valid & ATTR_UID)
+			inode->i_uid = attr->ia_uid;
+		if (attr->ia_valid & ATTR_GID)
+			inode->i_gid = attr->ia_gid;
+		mark_inode_dirty(inode);
+		error = journal_end(&th);
+		reiserfs_write_unlock(inode->i_sb);
+		if (error)
+			goto out;
+	}
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (!error) {
+			/*
+			 * Could race against reiserfs_file_release
+			 * if called from NFS, so take tailpack mutex.
+			 */
+			mutex_lock(&REISERFS_I(inode)->tailpack);
+			truncate_setsize(inode, attr->ia_size);
+			reiserfs_truncate_file(inode, 1);
+			mutex_unlock(&REISERFS_I(inode)->tailpack);
+		}
+	}
+
+	if (!error) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+	}
+
+	if (!error && reiserfs_posixacl(inode->i_sb)) {
+		if (attr->ia_valid & ATTR_MODE)
+			error = reiserfs_acl_chmod(inode);
+	}
+
+out:
+	return error;
+}
+
+const struct address_space_operations reiserfs_address_space_operations = {
+	.writepage = reiserfs_writepage,
+	.readpage = reiserfs_readpage,
+	.readpages = reiserfs_readpages,
+	.releasepage = reiserfs_releasepage,
+	.invalidatepage = reiserfs_invalidatepage,
+	.write_begin = reiserfs_write_begin,
+	.write_end = reiserfs_write_end,
+	.bmap = reiserfs_aop_bmap,
+	.direct_IO = reiserfs_direct_IO,
+	.set_page_dirty = reiserfs_set_page_dirty,
+};
diff --git a/kernel/fs/reiserfs/ioctl.c b/kernel/fs/reiserfs/ioctl.c
new file mode 100644
index 000000000..6ec8a30a0
--- /dev/null
+++ b/kernel/fs/reiserfs/ioctl.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include "reiserfs.h"
+#include <linux/time.h>
+#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/compat.h>
+
+/*
+ * reiserfs_ioctl - handler for ioctl for inode
+ * supported commands:
+ *  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
+ *                           and prevent packing file (argument arg has t
+ *			      be non-zero)
+ *  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
+ *  3) That's all for a while ...
+ */
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	unsigned int flags;
+	int err = 0;
+
+	reiserfs_write_lock(inode->i_sb);
+
+	switch (cmd) {
+	case REISERFS_IOC_UNPACK:
+		if (S_ISREG(inode->i_mode)) {
+			if (arg)
+				err = reiserfs_unpack(inode, filp);
+		} else
+			err = -ENOTTY;
+		break;
+		/*
+		 * following two cases are taken from fs/ext2/ioctl.c by Remy
+		 * Card (card@masi.ibp.fr)
+		 */
+	case REISERFS_IOC_GETFLAGS:
+		if (!reiserfs_attrs(inode->i_sb)) {
+			err = -ENOTTY;
+			break;
+		}
+
+		flags = REISERFS_I(inode)->i_attrs;
+		i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
+		err = put_user(flags, (int __user *)arg);
+		break;
+	case REISERFS_IOC_SETFLAGS:{
+			if (!reiserfs_attrs(inode->i_sb)) {
+				err = -ENOTTY;
+				break;
+			}
+
+			err = mnt_want_write_file(filp);
+			if (err)
+				break;
+
+			if (!inode_owner_or_capable(inode)) {
+				err = -EPERM;
+				goto setflags_out;
+			}
+			if (get_user(flags, (int __user *)arg)) {
+				err = -EFAULT;
+				goto setflags_out;
+			}
+			/*
+			 * Is it quota file? Do not allow user to mess with it
+			 */
+			if (IS_NOQUOTA(inode)) {
+				err = -EPERM;
+				goto setflags_out;
+			}
+			if (((flags ^ REISERFS_I(inode)->
+			      i_attrs) & (REISERFS_IMMUTABLE_FL |
+					  REISERFS_APPEND_FL))
+			    && !capable(CAP_LINUX_IMMUTABLE)) {
+				err = -EPERM;
+				goto setflags_out;
+			}
+			if ((flags & REISERFS_NOTAIL_FL) &&
+			    S_ISREG(inode->i_mode)) {
+				int result;
+
+				result = reiserfs_unpack(inode, filp);
+				if (result) {
+					err = result;
+					goto setflags_out;
+				}
+			}
+			sd_attrs_to_i_attrs(flags, inode);
+			REISERFS_I(inode)->i_attrs = flags;
+			inode->i_ctime = CURRENT_TIME_SEC;
+			mark_inode_dirty(inode);
+setflags_out:
+			mnt_drop_write_file(filp);
+			break;
+		}
+	case REISERFS_IOC_GETVERSION:
+		err = put_user(inode->i_generation, (int __user *)arg);
+		break;
+	case REISERFS_IOC_SETVERSION:
+		if (!inode_owner_or_capable(inode)) {
+			err = -EPERM;
+			break;
+		}
+		err = mnt_want_write_file(filp);
+		if (err)
+			break;
+		if (get_user(inode->i_generation, (int __user *)arg)) {
+			err = -EFAULT;
+			goto setversion_out;
+		}
+		inode->i_ctime = CURRENT_TIME_SEC;
+		mark_inode_dirty(inode);
+setversion_out:
+		mnt_drop_write_file(filp);
+		break;
+	default:
+		err = -ENOTTY;
+	}
+
+	reiserfs_write_unlock(inode->i_sb);
+
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	/*
+	 * These are just misnamed, they actually
+	 * get/put from/to user an int
+	 */
+	switch (cmd) {
+	case REISERFS_IOC32_UNPACK:
+		cmd = REISERFS_IOC_UNPACK;
+		break;
+	case REISERFS_IOC32_GETFLAGS:
+		cmd = REISERFS_IOC_GETFLAGS;
+		break;
+	case REISERFS_IOC32_SETFLAGS:
+		cmd = REISERFS_IOC_SETFLAGS;
+		break;
+	case REISERFS_IOC32_GETVERSION:
+		cmd = REISERFS_IOC_GETVERSION;
+		break;
+	case REISERFS_IOC32_SETVERSION:
+		cmd = REISERFS_IOC_SETVERSION;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+int reiserfs_commit_write(struct file *f, struct page *page,
+			  unsigned from, unsigned to);
+/*
+ * reiserfs_unpack
+ * Function try to convert tail from direct item into indirect.
+ * It set up nopack attribute in the REISERFS_I(inode)->nopack
+ */
+int reiserfs_unpack(struct inode *inode, struct file *filp)
+{
+	int retval = 0;
+	int index;
+	struct page *page;
+	struct address_space *mapping;
+	unsigned long write_from;
+	unsigned long blocksize = inode->i_sb->s_blocksize;
+
+	if (inode->i_size == 0) {
+		REISERFS_I(inode)->i_flags |= i_nopack_mask;
+		return 0;
+	}
+	/* ioctl already done */
+	if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
+		return 0;
+	}
+
+	/* we need to make sure nobody is changing the file size beneath us */
+	reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
+
+	reiserfs_write_lock(inode->i_sb);
+
+	write_from = inode->i_size & (blocksize - 1);
+	/* if we are on a block boundary, we are already unpacked.  */
+	if (write_from == 0) {
+		REISERFS_I(inode)->i_flags |= i_nopack_mask;
+		goto out;
+	}
+
+	/*
+	 * we unpack by finding the page with the tail, and calling
+	 * __reiserfs_write_begin on that page.  This will force a
+	 * reiserfs_get_block to unpack the tail for us.
+	 */
+	index = inode->i_size >> PAGE_CACHE_SHIFT;
+	mapping = inode->i_mapping;
+	page = grab_cache_page(mapping, index);
+	retval = -ENOMEM;
+	if (!page) {
+		goto out;
+	}
+	retval = __reiserfs_write_begin(page, write_from, 0);
+	if (retval)
+		goto out_unlock;
+
+	/* conversion can change page contents, must flush */
+	flush_dcache_page(page);
+	retval = reiserfs_commit_write(NULL, page, write_from, write_from);
+	REISERFS_I(inode)->i_flags |= i_nopack_mask;
+
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	reiserfs_write_unlock(inode->i_sb);
+	return retval;
+}
diff --git a/kernel/fs/reiserfs/item_ops.c b/kernel/fs/reiserfs/item_ops.c
new file mode 100644
index 000000000..aca73dd73
--- /dev/null
+++ b/kernel/fs/reiserfs/item_ops.c
@@ -0,0 +1,752 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/time.h>
+#include "reiserfs.h"
+
+/*
+ * this contains item handlers for old item types: sd, direct,
+ * indirect, directory
+ */
+
+/*
+ * and where are the comments? how about saying where we can find an
+ * explanation of each item handler method? -Hans
+ */
+
+/* stat data functions */
+static int sd_bytes_number(struct item_head *ih, int block_size)
+{
+	return 0;
+}
+
+static void sd_decrement_key(struct cpu_key *key)
+{
+	key->on_disk_key.k_objectid--;
+	set_cpu_key_k_type(key, TYPE_ANY);
+	set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1));
+}
+
+static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
+{
+	return 0;
+}
+
+static char *print_time(time_t t)
+{
+	static char timebuf[256];
+
+	sprintf(timebuf, "%ld", t);
+	return timebuf;
+}
+
+static void sd_print_item(struct item_head *ih, char *item)
+{
+	printk("\tmode | size | nlinks | first direct | mtime\n");
+	if (stat_data_v1(ih)) {
+		struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
+
+		printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd),
+		       sd_v1_size(sd), sd_v1_nlink(sd),
+		       sd_v1_first_direct_byte(sd),
+		       print_time(sd_v1_mtime(sd)));
+	} else {
+		struct stat_data *sd = (struct stat_data *)item;
+
+		printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd),
+		       (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
+		       sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
+	}
+}
+
+static void sd_check_item(struct item_head *ih, char *item)
+{
+	/* unused */
+}
+
+static int sd_create_vi(struct virtual_node *vn,
+			struct virtual_item *vi,
+			int is_affected, int insert_size)
+{
+	vi->vi_index = TYPE_STAT_DATA;
+	return 0;
+}
+
+static int sd_check_left(struct virtual_item *vi, int free,
+			 int start_skip, int end_skip)
+{
+	BUG_ON(start_skip || end_skip);
+	return -1;
+}
+
+static int sd_check_right(struct virtual_item *vi, int free)
+{
+	return -1;
+}
+
+static int sd_part_size(struct virtual_item *vi, int first, int count)
+{
+	BUG_ON(count);
+	return 0;
+}
+
+static int sd_unit_num(struct virtual_item *vi)
+{
+	return vi->vi_item_len - IH_SIZE;
+}
+
+static void sd_print_vi(struct virtual_item *vi)
+{
+	reiserfs_warning(NULL, "reiserfs-16100",
+			 "STATDATA, index %d, type 0x%x, %h",
+			 vi->vi_index, vi->vi_type, vi->vi_ih);
+}
+
+static struct item_operations stat_data_ops = {
+	.bytes_number = sd_bytes_number,
+	.decrement_key = sd_decrement_key,
+	.is_left_mergeable = sd_is_left_mergeable,
+	.print_item = sd_print_item,
+	.check_item = sd_check_item,
+
+	.create_vi = sd_create_vi,
+	.check_left = sd_check_left,
+	.check_right = sd_check_right,
+	.part_size = sd_part_size,
+	.unit_num = sd_unit_num,
+	.print_vi = sd_print_vi
+};
+
+/* direct item functions */
+static int direct_bytes_number(struct item_head *ih, int block_size)
+{
+	return ih_item_len(ih);
+}
+
+/* FIXME: this should probably switch to indirect as well */
+static void direct_decrement_key(struct cpu_key *key)
+{
+	cpu_key_k_offset_dec(key);
+	if (cpu_key_k_offset(key) == 0)
+		set_cpu_key_k_type(key, TYPE_STAT_DATA);
+}
+
+static int direct_is_left_mergeable(struct reiserfs_key *key,
+				    unsigned long bsize)
+{
+	int version = le_key_version(key);
+	return ((le_key_k_offset(version, key) & (bsize - 1)) != 1);
+}
+
+static void direct_print_item(struct item_head *ih, char *item)
+{
+	int j = 0;
+
+/*    return; */
+	printk("\"");
+	while (j < ih_item_len(ih))
+		printk("%c", item[j++]);
+	printk("\"\n");
+}
+
+static void direct_check_item(struct item_head *ih, char *item)
+{
+	/* unused */
+}
+
+static int direct_create_vi(struct virtual_node *vn,
+			    struct virtual_item *vi,
+			    int is_affected, int insert_size)
+{
+	vi->vi_index = TYPE_DIRECT;
+	return 0;
+}
+
+static int direct_check_left(struct virtual_item *vi, int free,
+			     int start_skip, int end_skip)
+{
+	int bytes;
+
+	bytes = free - free % 8;
+	return bytes ? : -1;
+}
+
+static int direct_check_right(struct virtual_item *vi, int free)
+{
+	return direct_check_left(vi, free, 0, 0);
+}
+
+static int direct_part_size(struct virtual_item *vi, int first, int count)
+{
+	return count;
+}
+
+static int direct_unit_num(struct virtual_item *vi)
+{
+	return vi->vi_item_len - IH_SIZE;
+}
+
+static void direct_print_vi(struct virtual_item *vi)
+{
+	reiserfs_warning(NULL, "reiserfs-16101",
+			 "DIRECT, index %d, type 0x%x, %h",
+			 vi->vi_index, vi->vi_type, vi->vi_ih);
+}
+
+static struct item_operations direct_ops = {
+	.bytes_number = direct_bytes_number,
+	.decrement_key = direct_decrement_key,
+	.is_left_mergeable = direct_is_left_mergeable,
+	.print_item = direct_print_item,
+	.check_item = direct_check_item,
+
+	.create_vi = direct_create_vi,
+	.check_left = direct_check_left,
+	.check_right = direct_check_right,
+	.part_size = direct_part_size,
+	.unit_num = direct_unit_num,
+	.print_vi = direct_print_vi
+};
+
+/* indirect item functions */
+static int indirect_bytes_number(struct item_head *ih, int block_size)
+{
+	return ih_item_len(ih) / UNFM_P_SIZE * block_size;
+}
+
+/* decrease offset, if it becomes 0, change type to stat data */
+static void indirect_decrement_key(struct cpu_key *key)
+{
+	cpu_key_k_offset_dec(key);
+	if (cpu_key_k_offset(key) == 0)
+		set_cpu_key_k_type(key, TYPE_STAT_DATA);
+}
+
+/* if it is not first item of the body, then it is mergeable */
+static int indirect_is_left_mergeable(struct reiserfs_key *key,
+				      unsigned long bsize)
+{
+	int version = le_key_version(key);
+	return (le_key_k_offset(version, key) != 1);
+}
+
+/* printing of indirect item */
+static void start_new_sequence(__u32 * start, int *len, __u32 new)
+{
+	*start = new;
+	*len = 1;
+}
+
+static int sequence_finished(__u32 start, int *len, __u32 new)
+{
+	if (start == INT_MAX)
+		return 1;
+
+	if (start == 0 && new == 0) {
+		(*len)++;
+		return 0;
+	}
+	if (start != 0 && (start + *len) == new) {
+		(*len)++;
+		return 0;
+	}
+	return 1;
+}
+
+static void print_sequence(__u32 start, int len)
+{
+	if (start == INT_MAX)
+		return;
+
+	if (len == 1)
+		printk(" %d", start);
+	else
+		printk(" %d(%d)", start, len);
+}
+
+static void indirect_print_item(struct item_head *ih, char *item)
+{
+	int j;
+	__le32 *unp;
+	__u32 prev = INT_MAX;
+	int num = 0;
+
+	unp = (__le32 *) item;
+
+	if (ih_item_len(ih) % UNFM_P_SIZE)
+		reiserfs_warning(NULL, "reiserfs-16102", "invalid item len");
+
+	printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
+	for (j = 0; j < I_UNFM_NUM(ih); j++) {
+		if (sequence_finished(prev, &num, get_block_num(unp, j))) {
+			print_sequence(prev, num);
+			start_new_sequence(&prev, &num, get_block_num(unp, j));
+		}
+	}
+	print_sequence(prev, num);
+	printk("]\n");
+}
+
+static void indirect_check_item(struct item_head *ih, char *item)
+{
+	/* unused */
+}
+
+static int indirect_create_vi(struct virtual_node *vn,
+			      struct virtual_item *vi,
+			      int is_affected, int insert_size)
+{
+	vi->vi_index = TYPE_INDIRECT;
+	return 0;
+}
+
+static int indirect_check_left(struct virtual_item *vi, int free,
+			       int start_skip, int end_skip)
+{
+	int bytes;
+
+	bytes = free - free % UNFM_P_SIZE;
+	return bytes ? : -1;
+}
+
+static int indirect_check_right(struct virtual_item *vi, int free)
+{
+	return indirect_check_left(vi, free, 0, 0);
+}
+
+/*
+ * return size in bytes of 'units' units. If first == 0 - calculate
+ * from the head (left), otherwise - from tail (right)
+ */
+static int indirect_part_size(struct virtual_item *vi, int first, int units)
+{
+	/* unit of indirect item is byte (yet) */
+	return units;
+}
+
+static int indirect_unit_num(struct virtual_item *vi)
+{
+	/* unit of indirect item is byte (yet) */
+	return vi->vi_item_len - IH_SIZE;
+}
+
+static void indirect_print_vi(struct virtual_item *vi)
+{
+	reiserfs_warning(NULL, "reiserfs-16103",
+			 "INDIRECT, index %d, type 0x%x, %h",
+			 vi->vi_index, vi->vi_type, vi->vi_ih);
+}
+
+static struct item_operations indirect_ops = {
+	.bytes_number = indirect_bytes_number,
+	.decrement_key = indirect_decrement_key,
+	.is_left_mergeable = indirect_is_left_mergeable,
+	.print_item = indirect_print_item,
+	.check_item = indirect_check_item,
+
+	.create_vi = indirect_create_vi,
+	.check_left = indirect_check_left,
+	.check_right = indirect_check_right,
+	.part_size = indirect_part_size,
+	.unit_num = indirect_unit_num,
+	.print_vi = indirect_print_vi
+};
+
+/* direntry functions */
+static int direntry_bytes_number(struct item_head *ih, int block_size)
+{
+	reiserfs_warning(NULL, "vs-16090",
+			 "bytes number is asked for direntry");
+	return 0;
+}
+
+static void direntry_decrement_key(struct cpu_key *key)
+{
+	cpu_key_k_offset_dec(key);
+	if (cpu_key_k_offset(key) == 0)
+		set_cpu_key_k_type(key, TYPE_STAT_DATA);
+}
+
+static int direntry_is_left_mergeable(struct reiserfs_key *key,
+				      unsigned long bsize)
+{
+	if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET)
+		return 0;
+	return 1;
+
+}
+
+static void direntry_print_item(struct item_head *ih, char *item)
+{
+	int i;
+	int namelen;
+	struct reiserfs_de_head *deh;
+	char *name;
+	static char namebuf[80];
+
+	printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name",
+	       "Key of pointed object", "Hash", "Gen number", "Status");
+
+	deh = (struct reiserfs_de_head *)item;
+
+	for (i = 0; i < ih_entry_count(ih); i++, deh++) {
+		namelen =
+		    (i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
+		    deh_location(deh);
+		name = item + deh_location(deh);
+		if (name[namelen - 1] == 0)
+			namelen = strlen(name);
+		namebuf[0] = '"';
+		if (namelen > sizeof(namebuf) - 3) {
+			strncpy(namebuf + 1, name, sizeof(namebuf) - 3);
+			namebuf[sizeof(namebuf) - 2] = '"';
+			namebuf[sizeof(namebuf) - 1] = 0;
+		} else {
+			memcpy(namebuf + 1, name, namelen);
+			namebuf[namelen + 1] = '"';
+			namebuf[namelen + 2] = 0;
+		}
+
+		printk("%d:  %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
+		       i, namebuf,
+		       deh_dir_id(deh), deh_objectid(deh),
+		       GET_HASH_VALUE(deh_offset(deh)),
+		       GET_GENERATION_NUMBER((deh_offset(deh))),
+		       (de_hidden(deh)) ? "HIDDEN" : "VISIBLE");
+	}
+}
+
+static void direntry_check_item(struct item_head *ih, char *item)
+{
+	int i;
+	struct reiserfs_de_head *deh;
+
+	/* unused */
+	deh = (struct reiserfs_de_head *)item;
+	for (i = 0; i < ih_entry_count(ih); i++, deh++) {
+		;
+	}
+}
+
+#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
+
+/*
+ * function returns old entry number in directory item in real node
+ * using new entry number in virtual item in virtual node
+ */
+static inline int old_entry_num(int is_affected, int virtual_entry_num,
+				int pos_in_item, int mode)
+{
+	if (mode == M_INSERT || mode == M_DELETE)
+		return virtual_entry_num;
+
+	if (!is_affected)
+		/* cut or paste is applied to another item */
+		return virtual_entry_num;
+
+	if (virtual_entry_num < pos_in_item)
+		return virtual_entry_num;
+
+	if (mode == M_CUT)
+		return virtual_entry_num + 1;
+
+	RFALSE(mode != M_PASTE || virtual_entry_num == 0,
+	       "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'",
+	       mode);
+
+	return virtual_entry_num - 1;
+}
+
+/*
+ * Create an array of sizes of directory entries for virtual
+ * item. Return space used by an item. FIXME: no control over
+ * consuming of space used by this item handler
+ */
+static int direntry_create_vi(struct virtual_node *vn,
+			      struct virtual_item *vi,
+			      int is_affected, int insert_size)
+{
+	struct direntry_uarea *dir_u = vi->vi_uarea;
+	int i, j;
+	int size = sizeof(struct direntry_uarea);
+	struct reiserfs_de_head *deh;
+
+	vi->vi_index = TYPE_DIRENTRY;
+
+	BUG_ON(!(vi->vi_ih) || !vi->vi_item);
+
+	dir_u->flags = 0;
+	if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET)
+		dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
+
+	deh = (struct reiserfs_de_head *)(vi->vi_item);
+
+	/* virtual directory item have this amount of entry after */
+	dir_u->entry_count = ih_entry_count(vi->vi_ih) +
+	    ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
+			      (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
+
+	for (i = 0; i < dir_u->entry_count; i++) {
+		j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
+				  vn->vn_mode);
+		dir_u->entry_sizes[i] =
+		    (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) -
+		    deh_location(&deh[j]) + DEH_SIZE;
+	}
+
+	size += (dir_u->entry_count * sizeof(short));
+
+	/* set size of pasted entry */
+	if (is_affected && vn->vn_mode == M_PASTE)
+		dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
+
+#ifdef CONFIG_REISERFS_CHECK
+	/* compare total size of entries with item length */
+	{
+		int k, l;
+
+		l = 0;
+		for (k = 0; k < dir_u->entry_count; k++)
+			l += dir_u->entry_sizes[k];
+
+		if (l + IH_SIZE != vi->vi_item_len +
+		    ((is_affected
+		      && (vn->vn_mode == M_PASTE
+			  || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
+			reiserfs_panic(NULL, "vs-8025", "(mode==%c, "
+				       "insert_size==%d), invalid length of "
+				       "directory item",
+				       vn->vn_mode, insert_size);
+		}
+	}
+#endif
+
+	return size;
+
+}
+
+/*
+ * return number of entries which may fit into specified amount of
+ * free space, or -1 if free space is not enough even for 1 entry
+ */
+static int direntry_check_left(struct virtual_item *vi, int free,
+			       int start_skip, int end_skip)
+{
+	int i;
+	int entries = 0;
+	struct direntry_uarea *dir_u = vi->vi_uarea;
+
+	for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
+		/* i-th entry doesn't fit into the remaining free space */
+		if (dir_u->entry_sizes[i] > free)
+			break;
+
+		free -= dir_u->entry_sizes[i];
+		entries++;
+	}
+
+	if (entries == dir_u->entry_count) {
+		reiserfs_panic(NULL, "item_ops-1",
+			       "free space %d, entry_count %d", free,
+			       dir_u->entry_count);
+	}
+
+	/* "." and ".." can not be separated from each other */
+	if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
+	    && entries < 2)
+		entries = 0;
+
+	return entries ? : -1;
+}
+
+static int direntry_check_right(struct virtual_item *vi, int free)
+{
+	int i;
+	int entries = 0;
+	struct direntry_uarea *dir_u = vi->vi_uarea;
+
+	for (i = dir_u->entry_count - 1; i >= 0; i--) {
+		/* i-th entry doesn't fit into the remaining free space */
+		if (dir_u->entry_sizes[i] > free)
+			break;
+
+		free -= dir_u->entry_sizes[i];
+		entries++;
+	}
+	BUG_ON(entries == dir_u->entry_count);
+
+	/* "." and ".." can not be separated from each other */
+	if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
+	    && entries > dir_u->entry_count - 2)
+		entries = dir_u->entry_count - 2;
+
+	return entries ? : -1;
+}
+
+/* sum of entry sizes between from-th and to-th entries including both edges */
+static int direntry_part_size(struct virtual_item *vi, int first, int count)
+{
+	int i, retval;
+	int from, to;
+	struct direntry_uarea *dir_u = vi->vi_uarea;
+
+	retval = 0;
+	if (first == 0)
+		from = 0;
+	else
+		from = dir_u->entry_count - count;
+	to = from + count - 1;
+
+	for (i = from; i <= to; i++)
+		retval += dir_u->entry_sizes[i];
+
+	return retval;
+}
+
+static int direntry_unit_num(struct virtual_item *vi)
+{
+	struct direntry_uarea *dir_u = vi->vi_uarea;
+
+	return dir_u->entry_count;
+}
+
+static void direntry_print_vi(struct virtual_item *vi)
+{
+	int i;
+	struct direntry_uarea *dir_u = vi->vi_uarea;
+
+	reiserfs_warning(NULL, "reiserfs-16104",
+			 "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
+			 vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
+	printk("%d entries: ", dir_u->entry_count);
+	for (i = 0; i < dir_u->entry_count; i++)
+		printk("%d ", dir_u->entry_sizes[i]);
+	printk("\n");
+}
+
+static struct item_operations direntry_ops = {
+	.bytes_number = direntry_bytes_number,
+	.decrement_key = direntry_decrement_key,
+	.is_left_mergeable = direntry_is_left_mergeable,
+	.print_item = direntry_print_item,
+	.check_item = direntry_check_item,
+
+	.create_vi = direntry_create_vi,
+	.check_left = direntry_check_left,
+	.check_right = direntry_check_right,
+	.part_size = direntry_part_size,
+	.unit_num = direntry_unit_num,
+	.print_vi = direntry_print_vi
+};
+
+/* Error catching functions to catch errors caused by incorrect item types. */
+static int errcatch_bytes_number(struct item_head *ih, int block_size)
+{
+	reiserfs_warning(NULL, "green-16001",
+			 "Invalid item type observed, run fsck ASAP");
+	return 0;
+}
+
+static void errcatch_decrement_key(struct cpu_key *key)
+{
+	reiserfs_warning(NULL, "green-16002",
+			 "Invalid item type observed, run fsck ASAP");
+}
+
+static int errcatch_is_left_mergeable(struct reiserfs_key *key,
+				      unsigned long bsize)
+{
+	reiserfs_warning(NULL, "green-16003",
+			 "Invalid item type observed, run fsck ASAP");
+	return 0;
+}
+
+static void errcatch_print_item(struct item_head *ih, char *item)
+{
+	reiserfs_warning(NULL, "green-16004",
+			 "Invalid item type observed, run fsck ASAP");
+}
+
+static void errcatch_check_item(struct item_head *ih, char *item)
+{
+	reiserfs_warning(NULL, "green-16005",
+			 "Invalid item type observed, run fsck ASAP");
+}
+
+static int errcatch_create_vi(struct virtual_node *vn,
+			      struct virtual_item *vi,
+			      int is_affected, int insert_size)
+{
+	reiserfs_warning(NULL, "green-16006",
+			 "Invalid item type observed, run fsck ASAP");
+	/*
+	 * We might return -1 here as well, but it won't help as
+	 * create_virtual_node() from where this operation is called
+	 * from is of return type void.
+	 */
+	return 0;
+}
+
+static int errcatch_check_left(struct virtual_item *vi, int free,
+			       int start_skip, int end_skip)
+{
+	reiserfs_warning(NULL, "green-16007",
+			 "Invalid item type observed, run fsck ASAP");
+	return -1;
+}
+
+static int errcatch_check_right(struct virtual_item *vi, int free)
+{
+	reiserfs_warning(NULL, "green-16008",
+			 "Invalid item type observed, run fsck ASAP");
+	return -1;
+}
+
+static int errcatch_part_size(struct virtual_item *vi, int first, int count)
+{
+	reiserfs_warning(NULL, "green-16009",
+			 "Invalid item type observed, run fsck ASAP");
+	return 0;
+}
+
+static int errcatch_unit_num(struct virtual_item *vi)
+{
+	reiserfs_warning(NULL, "green-16010",
+			 "Invalid item type observed, run fsck ASAP");
+	return 0;
+}
+
+static void errcatch_print_vi(struct virtual_item *vi)
+{
+	reiserfs_warning(NULL, "green-16011",
+			 "Invalid item type observed, run fsck ASAP");
+}
+
+static struct item_operations errcatch_ops = {
+	errcatch_bytes_number,
+	errcatch_decrement_key,
+	errcatch_is_left_mergeable,
+	errcatch_print_item,
+	errcatch_check_item,
+
+	errcatch_create_vi,
+	errcatch_check_left,
+	errcatch_check_right,
+	errcatch_part_size,
+	errcatch_unit_num,
+	errcatch_print_vi
+};
+
+#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
+#error Item types must use disk-format assigned values.
+#endif
+
+struct item_operations *item_ops[TYPE_ANY + 1] = {
+	&stat_data_ops,
+	&indirect_ops,
+	&direct_ops,
+	&direntry_ops,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	&errcatch_ops		/* This is to catch errors with invalid type (15th entry for TYPE_ANY) */
+};
diff --git a/kernel/fs/reiserfs/journal.c b/kernel/fs/reiserfs/journal.c
new file mode 100644
index 000000000..9d6486d41
--- /dev/null
+++ b/kernel/fs/reiserfs/journal.c
@@ -0,0 +1,4403 @@
+/*
+ * Write ahead logging implementation copyright Chris Mason 2000
+ *
+ * The background commits make this code very interrelated, and
+ * overly complex.  I need to rethink things a bit....The major players:
+ *
+ * journal_begin -- call with the number of blocks you expect to log.
+ *                  If the current transaction is too
+ *		    old, it will block until the current transaction is
+ *		    finished, and then start a new one.
+ *		    Usually, your transaction will get joined in with
+ *                  previous ones for speed.
+ *
+ * journal_join  -- same as journal_begin, but won't block on the current
+ *                  transaction regardless of age.  Don't ever call
+ *                  this.  Ever.  There are only two places it should be
+ *                  called from, and they are both inside this file.
+ *
+ * journal_mark_dirty -- adds blocks into this transaction.  clears any flags
+ *                       that might make them get sent to disk
+ *                       and then marks them BH_JDirty.  Puts the buffer head
+ *                       into the current transaction hash.
+ *
+ * journal_end -- if the current transaction is batchable, it does nothing
+ *                   otherwise, it could do an async/synchronous commit, or
+ *                   a full flush of all log and real blocks in the
+ *                   transaction.
+ *
+ * flush_old_commits -- if the current transaction is too old, it is ended and
+ *                      commit blocks are sent to disk.  Forces commit blocks
+ *                      to disk for all backgrounded commits that have been
+ *                      around too long.
+ *		     -- Note, if you call this as an immediate flush from
+ *		        from within kupdate, it will ignore the immediate flag
+ */
+
+#include <linux/time.h>
+#include <linux/semaphore.h>
+#include <linux/vmalloc.h>
+#include "reiserfs.h"
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+
+/* gets a struct reiserfs_journal_list * from a list head */
+#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_list))
+#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_working_list))
+
+/* must be correct to keep the desc and commit structs at 4k */
+#define JOURNAL_TRANS_HALF 1018
+#define BUFNR 64		/*read ahead */
+
+/* cnode stat bits.  Move these into reiserfs_fs.h */
+
+/* this block was freed, and can't be written.  */
+#define BLOCK_FREED 2
+/* this block was freed during this transaction, and can't be written */
+#define BLOCK_FREED_HOLDER 3
+
+/* used in flush_journal_list */
+#define BLOCK_NEEDS_FLUSH 4
+#define BLOCK_DIRTIED 5
+
+/* journal list state bits */
+#define LIST_TOUCHED 1
+#define LIST_DIRTY   2
+#define LIST_COMMIT_PENDING  4	/* someone will commit this list */
+
+/* flags for do_journal_end */
+#define FLUSH_ALL   1		/* flush commit and real blocks */
+#define COMMIT_NOW  2		/* end and commit this transaction */
+#define WAIT        4		/* wait for the log blocks to hit the disk */
+
+static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
+static int flush_journal_list(struct super_block *s,
+			      struct reiserfs_journal_list *jl, int flushall);
+static int flush_commit_list(struct super_block *s,
+			     struct reiserfs_journal_list *jl, int flushall);
+static int can_dirty(struct reiserfs_journal_cnode *cn);
+static int journal_join(struct reiserfs_transaction_handle *th,
+			struct super_block *sb);
+static void release_journal_dev(struct super_block *super,
+			       struct reiserfs_journal *journal);
+static int dirty_one_transaction(struct super_block *s,
+				 struct reiserfs_journal_list *jl);
+static void flush_async_commits(struct work_struct *work);
+static void queue_log_writer(struct super_block *s);
+
+/* values for join in do_journal_begin_r */
+enum {
+	JBEGIN_REG = 0,		/* regular journal begin */
+	/* join the running transaction if at all possible */
+	JBEGIN_JOIN = 1,
+	/* called from cleanup code, ignores aborted flag */
+	JBEGIN_ABORT = 2,
+};
+
+static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
+			      struct super_block *sb,
+			      unsigned long nblocks, int join);
+
+static void init_journal_hash(struct super_block *sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	memset(journal->j_hash_table, 0,
+	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
+}
+
+/*
+ * clears BH_Dirty and sticks the buffer on the clean list.  Called because
+ * I can't allow refile_buffer to make schedule happen after I've freed a
+ * block.  Look at remove_from_transaction and journal_mark_freed for
+ * more details.
+ */
+static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
+{
+	if (bh) {
+		clear_buffer_dirty(bh);
+		clear_buffer_journal_test(bh);
+	}
+	return 0;
+}
+
+static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
+							 *sb)
+{
+	struct reiserfs_bitmap_node *bn;
+	static int id;
+
+	bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
+	if (!bn) {
+		return NULL;
+	}
+	bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
+	if (!bn->data) {
+		kfree(bn);
+		return NULL;
+	}
+	bn->id = id++;
+	INIT_LIST_HEAD(&bn->list);
+	return bn;
+}
+
+static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_bitmap_node *bn = NULL;
+	struct list_head *entry = journal->j_bitmap_nodes.next;
+
+	journal->j_used_bitmap_nodes++;
+repeat:
+
+	if (entry != &journal->j_bitmap_nodes) {
+		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
+		list_del(entry);
+		memset(bn->data, 0, sb->s_blocksize);
+		journal->j_free_bitmap_nodes--;
+		return bn;
+	}
+	bn = allocate_bitmap_node(sb);
+	if (!bn) {
+		yield();
+		goto repeat;
+	}
+	return bn;
+}
+static inline void free_bitmap_node(struct super_block *sb,
+				    struct reiserfs_bitmap_node *bn)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	journal->j_used_bitmap_nodes--;
+	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
+		kfree(bn->data);
+		kfree(bn);
+	} else {
+		list_add(&bn->list, &journal->j_bitmap_nodes);
+		journal->j_free_bitmap_nodes++;
+	}
+}
+
+static void allocate_bitmap_nodes(struct super_block *sb)
+{
+	int i;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_bitmap_node *bn = NULL;
+	for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
+		bn = allocate_bitmap_node(sb);
+		if (bn) {
+			list_add(&bn->list, &journal->j_bitmap_nodes);
+			journal->j_free_bitmap_nodes++;
+		} else {
+			/* this is ok, we'll try again when more are needed */
+			break;
+		}
+	}
+}
+
+static int set_bit_in_list_bitmap(struct super_block *sb,
+				  b_blocknr_t block,
+				  struct reiserfs_list_bitmap *jb)
+{
+	unsigned int bmap_nr = block / (sb->s_blocksize << 3);
+	unsigned int bit_nr = block % (sb->s_blocksize << 3);
+
+	if (!jb->bitmaps[bmap_nr]) {
+		jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
+	}
+	set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
+	return 0;
+}
+
+static void cleanup_bitmap_list(struct super_block *sb,
+				struct reiserfs_list_bitmap *jb)
+{
+	int i;
+	if (jb->bitmaps == NULL)
+		return;
+
+	for (i = 0; i < reiserfs_bmap_count(sb); i++) {
+		if (jb->bitmaps[i]) {
+			free_bitmap_node(sb, jb->bitmaps[i]);
+			jb->bitmaps[i] = NULL;
+		}
+	}
+}
+
+/*
+ * only call this on FS unmount.
+ */
+static int free_list_bitmaps(struct super_block *sb,
+			     struct reiserfs_list_bitmap *jb_array)
+{
+	int i;
+	struct reiserfs_list_bitmap *jb;
+	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
+		jb = jb_array + i;
+		jb->journal_list = NULL;
+		cleanup_bitmap_list(sb, jb);
+		vfree(jb->bitmaps);
+		jb->bitmaps = NULL;
+	}
+	return 0;
+}
+
+static int free_bitmap_nodes(struct super_block *sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct list_head *next = journal->j_bitmap_nodes.next;
+	struct reiserfs_bitmap_node *bn;
+
+	while (next != &journal->j_bitmap_nodes) {
+		bn = list_entry(next, struct reiserfs_bitmap_node, list);
+		list_del(next);
+		kfree(bn->data);
+		kfree(bn);
+		next = journal->j_bitmap_nodes.next;
+		journal->j_free_bitmap_nodes--;
+	}
+
+	return 0;
+}
+
+/*
+ * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
+ * jb_array is the array to be filled in.
+ */
+int reiserfs_allocate_list_bitmaps(struct super_block *sb,
+				   struct reiserfs_list_bitmap *jb_array,
+				   unsigned int bmap_nr)
+{
+	int i;
+	int failed = 0;
+	struct reiserfs_list_bitmap *jb;
+	int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
+
+	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
+		jb = jb_array + i;
+		jb->journal_list = NULL;
+		jb->bitmaps = vzalloc(mem);
+		if (!jb->bitmaps) {
+			reiserfs_warning(sb, "clm-2000", "unable to "
+					 "allocate bitmaps for journal lists");
+			failed = 1;
+			break;
+		}
+	}
+	if (failed) {
+		free_list_bitmaps(sb, jb_array);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * find an available list bitmap.  If you can't find one, flush a commit list
+ * and try again
+ */
+static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
+						    struct reiserfs_journal_list
+						    *jl)
+{
+	int i, j;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_list_bitmap *jb = NULL;
+
+	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
+		i = journal->j_list_bitmap_index;
+		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
+		jb = journal->j_list_bitmap + i;
+		if (journal->j_list_bitmap[i].journal_list) {
+			flush_commit_list(sb,
+					  journal->j_list_bitmap[i].
+					  journal_list, 1);
+			if (!journal->j_list_bitmap[i].journal_list) {
+				break;
+			}
+		} else {
+			break;
+		}
+	}
+	/* double check to make sure if flushed correctly */
+	if (jb->journal_list)
+		return NULL;
+	jb->journal_list = jl;
+	return jb;
+}
+
+/*
+ * allocates a new chunk of X nodes, and links them all together as a list.
+ * Uses the cnode->next and cnode->prev pointers
+ * returns NULL on failure
+ */
+static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
+{
+	struct reiserfs_journal_cnode *head;
+	int i;
+	if (num_cnodes <= 0) {
+		return NULL;
+	}
+	head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
+	if (!head) {
+		return NULL;
+	}
+	head[0].prev = NULL;
+	head[0].next = head + 1;
+	for (i = 1; i < num_cnodes; i++) {
+		head[i].prev = head + (i - 1);
+		head[i].next = head + (i + 1);	/* if last one, overwrite it after the if */
+	}
+	head[num_cnodes - 1].next = NULL;
+	return head;
+}
+
+/* pulls a cnode off the free list, or returns NULL on failure */
+static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
+{
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	reiserfs_check_lock_depth(sb, "get_cnode");
+
+	if (journal->j_cnode_free <= 0) {
+		return NULL;
+	}
+	journal->j_cnode_used++;
+	journal->j_cnode_free--;
+	cn = journal->j_cnode_free_list;
+	if (!cn) {
+		return cn;
+	}
+	if (cn->next) {
+		cn->next->prev = NULL;
+	}
+	journal->j_cnode_free_list = cn->next;
+	memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
+	return cn;
+}
+
+/*
+ * returns a cnode to the free list
+ */
+static void free_cnode(struct super_block *sb,
+		       struct reiserfs_journal_cnode *cn)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	reiserfs_check_lock_depth(sb, "free_cnode");
+
+	journal->j_cnode_used--;
+	journal->j_cnode_free++;
+	/* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
+	cn->next = journal->j_cnode_free_list;
+	if (journal->j_cnode_free_list) {
+		journal->j_cnode_free_list->prev = cn;
+	}
+	cn->prev = NULL;	/* not needed with the memset, but I might kill the memset, and forget to do this */
+	journal->j_cnode_free_list = cn;
+}
+
+static void clear_prepared_bits(struct buffer_head *bh)
+{
+	clear_buffer_journal_prepared(bh);
+	clear_buffer_journal_restore_dirty(bh);
+}
+
+/*
+ * return a cnode with same dev, block number and size in table,
+ * or null if not found
+ */
+static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
+								  super_block
+								  *sb,
+								  struct
+								  reiserfs_journal_cnode
+								  **table,
+								  long bl)
+{
+	struct reiserfs_journal_cnode *cn;
+	cn = journal_hash(table, sb, bl);
+	while (cn) {
+		if (cn->blocknr == bl && cn->sb == sb)
+			return cn;
+		cn = cn->hnext;
+	}
+	return (struct reiserfs_journal_cnode *)0;
+}
+
+/*
+ * this actually means 'can this block be reallocated yet?'.  If you set
+ * search_all, a block can only be allocated if it is not in the current
+ * transaction, was not freed by the current transaction, and has no chance
+ * of ever being overwritten by a replay after crashing.
+ *
+ * If you don't set search_all, a block can only be allocated if it is not
+ * in the current transaction.  Since deleting a block removes it from the
+ * current transaction, this case should never happen.  If you don't set
+ * search_all, make sure you never write the block without logging it.
+ *
+ * next_zero_bit is a suggestion about the next block to try for find_forward.
+ * when bl is rejected because it is set in a journal list bitmap, we search
+ * for the next zero bit in the bitmap that rejected bl.  Then, we return
+ * that through next_zero_bit for find_forward to try.
+ *
+ * Just because we return something in next_zero_bit does not mean we won't
+ * reject it on the next call to reiserfs_in_journal
+ */
+int reiserfs_in_journal(struct super_block *sb,
+			unsigned int bmap_nr, int bit_nr, int search_all,
+			b_blocknr_t * next_zero_bit)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_list_bitmap *jb;
+	int i;
+	unsigned long bl;
+
+	*next_zero_bit = 0;	/* always start this at zero. */
+
+	PROC_INFO_INC(sb, journal.in_journal);
+	/*
+	 * If we aren't doing a search_all, this is a metablock, and it
+	 * will be logged before use.  if we crash before the transaction
+	 * that freed it commits,  this transaction won't have committed
+	 * either, and the block will never be written
+	 */
+	if (search_all) {
+		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
+			PROC_INFO_INC(sb, journal.in_journal_bitmap);
+			jb = journal->j_list_bitmap + i;
+			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
+			    test_bit(bit_nr,
+				     (unsigned long *)jb->bitmaps[bmap_nr]->
+				     data)) {
+				*next_zero_bit =
+				    find_next_zero_bit((unsigned long *)
+						       (jb->bitmaps[bmap_nr]->
+							data),
+						       sb->s_blocksize << 3,
+						       bit_nr + 1);
+				return 1;
+			}
+		}
+	}
+
+	bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
+	/* is it in any old transactions? */
+	if (search_all
+	    && (cn =
+		get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
+		return 1;
+	}
+
+	/* is it in the current transaction.  This should never happen */
+	if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
+		BUG();
+		return 1;
+	}
+
+	PROC_INFO_INC(sb, journal.in_journal_reusable);
+	/* safe for reuse */
+	return 0;
+}
+
+/* insert cn into table */
+static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
+				       struct reiserfs_journal_cnode *cn)
+{
+	struct reiserfs_journal_cnode *cn_orig;
+
+	cn_orig = journal_hash(table, cn->sb, cn->blocknr);
+	cn->hnext = cn_orig;
+	cn->hprev = NULL;
+	if (cn_orig) {
+		cn_orig->hprev = cn;
+	}
+	journal_hash(table, cn->sb, cn->blocknr) = cn;
+}
+
+/* lock the current transaction */
+static inline void lock_journal(struct super_block *sb)
+{
+	PROC_INFO_INC(sb, journal.lock_journal);
+
+	reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
+}
+
+/* unlock the current transaction */
+static inline void unlock_journal(struct super_block *sb)
+{
+	mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
+}
+
+static inline void get_journal_list(struct reiserfs_journal_list *jl)
+{
+	jl->j_refcount++;
+}
+
+static inline void put_journal_list(struct super_block *s,
+				    struct reiserfs_journal_list *jl)
+{
+	if (jl->j_refcount < 1) {
+		reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
+			       jl->j_trans_id, jl->j_refcount);
+	}
+	if (--jl->j_refcount == 0)
+		kfree(jl);
+}
+
+/*
+ * this used to be much more involved, and I'm keeping it just in case
+ * things get ugly again.  it gets called by flush_commit_list, and
+ * cleans up any data stored about blocks freed during a transaction.
+ */
+static void cleanup_freed_for_journal_list(struct super_block *sb,
+					   struct reiserfs_journal_list *jl)
+{
+
+	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
+	if (jb) {
+		cleanup_bitmap_list(sb, jb);
+	}
+	jl->j_list_bitmap->journal_list = NULL;
+	jl->j_list_bitmap = NULL;
+}
+
+static int journal_list_still_alive(struct super_block *s,
+				    unsigned int trans_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	struct list_head *entry = &journal->j_journal_list;
+	struct reiserfs_journal_list *jl;
+
+	if (!list_empty(entry)) {
+		jl = JOURNAL_LIST_ENTRY(entry->next);
+		if (jl->j_trans_id <= trans_id) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * If page->mapping was null, we failed to truncate this page for
+ * some reason.  Most likely because it was truncated after being
+ * logged via data=journal.
+ *
+ * This does a check to see if the buffer belongs to one of these
+ * lost pages before doing the final put_bh.  If page->mapping was
+ * null, it tries to free buffers on the page, which should make the
+ * final page_cache_release drop the page from the lru.
+ */
+static void release_buffer_page(struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+	if (!page->mapping && trylock_page(page)) {
+		page_cache_get(page);
+		put_bh(bh);
+		if (!page->mapping)
+			try_to_free_buffers(page);
+		unlock_page(page);
+		page_cache_release(page);
+	} else {
+		put_bh(bh);
+	}
+}
+
+static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	char b[BDEVNAME_SIZE];
+
+	if (buffer_journaled(bh)) {
+		reiserfs_warning(NULL, "clm-2084",
+				 "pinned buffer %lu:%s sent to disk",
+				 bh->b_blocknr, bdevname(bh->b_bdev, b));
+	}
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+
+	unlock_buffer(bh);
+	release_buffer_page(bh);
+}
+
+static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+static void submit_logged_buffer(struct buffer_head *bh)
+{
+	get_bh(bh);
+	bh->b_end_io = reiserfs_end_buffer_io_sync;
+	clear_buffer_journal_new(bh);
+	clear_buffer_dirty(bh);
+	if (!test_clear_buffer_journal_test(bh))
+		BUG();
+	if (!buffer_uptodate(bh))
+		BUG();
+	submit_bh(WRITE, bh);
+}
+
+static void submit_ordered_buffer(struct buffer_head *bh)
+{
+	get_bh(bh);
+	bh->b_end_io = reiserfs_end_ordered_io;
+	clear_buffer_dirty(bh);
+	if (!buffer_uptodate(bh))
+		BUG();
+	submit_bh(WRITE, bh);
+}
+
+#define CHUNK_SIZE 32
+struct buffer_chunk {
+	struct buffer_head *bh[CHUNK_SIZE];
+	int nr;
+};
+
+static void write_chunk(struct buffer_chunk *chunk)
+{
+	int i;
+	for (i = 0; i < chunk->nr; i++) {
+		submit_logged_buffer(chunk->bh[i]);
+	}
+	chunk->nr = 0;
+}
+
+static void write_ordered_chunk(struct buffer_chunk *chunk)
+{
+	int i;
+	for (i = 0; i < chunk->nr; i++) {
+		submit_ordered_buffer(chunk->bh[i]);
+	}
+	chunk->nr = 0;
+}
+
+static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
+			spinlock_t * lock, void (fn) (struct buffer_chunk *))
+{
+	int ret = 0;
+	BUG_ON(chunk->nr >= CHUNK_SIZE);
+	chunk->bh[chunk->nr++] = bh;
+	if (chunk->nr >= CHUNK_SIZE) {
+		ret = 1;
+		if (lock) {
+			spin_unlock(lock);
+			fn(chunk);
+			spin_lock(lock);
+		} else {
+			fn(chunk);
+		}
+	}
+	return ret;
+}
+
+static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
+static struct reiserfs_jh *alloc_jh(void)
+{
+	struct reiserfs_jh *jh;
+	while (1) {
+		jh = kmalloc(sizeof(*jh), GFP_NOFS);
+		if (jh) {
+			atomic_inc(&nr_reiserfs_jh);
+			return jh;
+		}
+		yield();
+	}
+}
+
+/*
+ * we want to free the jh when the buffer has been written
+ * and waited on
+ */
+void reiserfs_free_jh(struct buffer_head *bh)
+{
+	struct reiserfs_jh *jh;
+
+	jh = bh->b_private;
+	if (jh) {
+		bh->b_private = NULL;
+		jh->bh = NULL;
+		list_del_init(&jh->list);
+		kfree(jh);
+		if (atomic_read(&nr_reiserfs_jh) <= 0)
+			BUG();
+		atomic_dec(&nr_reiserfs_jh);
+		put_bh(bh);
+	}
+}
+
+static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
+			   int tail)
+{
+	struct reiserfs_jh *jh;
+
+	if (bh->b_private) {
+		spin_lock(&j->j_dirty_buffers_lock);
+		if (!bh->b_private) {
+			spin_unlock(&j->j_dirty_buffers_lock);
+			goto no_jh;
+		}
+		jh = bh->b_private;
+		list_del_init(&jh->list);
+	} else {
+no_jh:
+		get_bh(bh);
+		jh = alloc_jh();
+		spin_lock(&j->j_dirty_buffers_lock);
+		/*
+		 * buffer must be locked for __add_jh, should be able to have
+		 * two adds at the same time
+		 */
+		BUG_ON(bh->b_private);
+		jh->bh = bh;
+		bh->b_private = jh;
+	}
+	jh->jl = j->j_current_jl;
+	if (tail)
+		list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
+	else {
+		list_add_tail(&jh->list, &jh->jl->j_bh_list);
+	}
+	spin_unlock(&j->j_dirty_buffers_lock);
+	return 0;
+}
+
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
+{
+	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
+}
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
+{
+	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
+}
+
+#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
+static int write_ordered_buffers(spinlock_t * lock,
+				 struct reiserfs_journal *j,
+				 struct reiserfs_journal_list *jl,
+				 struct list_head *list)
+{
+	struct buffer_head *bh;
+	struct reiserfs_jh *jh;
+	int ret = j->j_errno;
+	struct buffer_chunk chunk;
+	struct list_head tmp;
+	INIT_LIST_HEAD(&tmp);
+
+	chunk.nr = 0;
+	spin_lock(lock);
+	while (!list_empty(list)) {
+		jh = JH_ENTRY(list->next);
+		bh = jh->bh;
+		get_bh(bh);
+		if (!trylock_buffer(bh)) {
+			if (!buffer_dirty(bh)) {
+				list_move(&jh->list, &tmp);
+				goto loop_next;
+			}
+			spin_unlock(lock);
+			if (chunk.nr)
+				write_ordered_chunk(&chunk);
+			wait_on_buffer(bh);
+			cond_resched();
+			spin_lock(lock);
+			goto loop_next;
+		}
+		/*
+		 * in theory, dirty non-uptodate buffers should never get here,
+		 * but the upper layer io error paths still have a few quirks.
+		 * Handle them here as gracefully as we can
+		 */
+		if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
+			clear_buffer_dirty(bh);
+			ret = -EIO;
+		}
+		if (buffer_dirty(bh)) {
+			list_move(&jh->list, &tmp);
+			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
+		} else {
+			reiserfs_free_jh(bh);
+			unlock_buffer(bh);
+		}
+loop_next:
+		put_bh(bh);
+		cond_resched_lock(lock);
+	}
+	if (chunk.nr) {
+		spin_unlock(lock);
+		write_ordered_chunk(&chunk);
+		spin_lock(lock);
+	}
+	while (!list_empty(&tmp)) {
+		jh = JH_ENTRY(tmp.prev);
+		bh = jh->bh;
+		get_bh(bh);
+		reiserfs_free_jh(bh);
+
+		if (buffer_locked(bh)) {
+			spin_unlock(lock);
+			wait_on_buffer(bh);
+			spin_lock(lock);
+		}
+		if (!buffer_uptodate(bh)) {
+			ret = -EIO;
+		}
+		/*
+		 * ugly interaction with invalidatepage here.
+		 * reiserfs_invalidate_page will pin any buffer that has a
+		 * valid journal head from an older transaction.  If someone
+		 * else sets our buffer dirty after we write it in the first
+		 * loop, and then someone truncates the page away, nobody
+		 * will ever write the buffer. We're safe if we write the
+		 * page one last time after freeing the journal header.
+		 */
+		if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
+			spin_unlock(lock);
+			ll_rw_block(WRITE, 1, &bh);
+			spin_lock(lock);
+		}
+		put_bh(bh);
+		cond_resched_lock(lock);
+	}
+	spin_unlock(lock);
+	return ret;
+}
+
+static int flush_older_commits(struct super_block *s,
+			       struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	struct reiserfs_journal_list *other_jl;
+	struct reiserfs_journal_list *first_jl;
+	struct list_head *entry;
+	unsigned int trans_id = jl->j_trans_id;
+	unsigned int other_trans_id;
+	unsigned int first_trans_id;
+
+find_first:
+	/*
+	 * first we walk backwards to find the oldest uncommitted transation
+	 */
+	first_jl = jl;
+	entry = jl->j_list.prev;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		if (entry == &journal->j_journal_list ||
+		    atomic_read(&other_jl->j_older_commits_done))
+			break;
+
+		first_jl = other_jl;
+		entry = other_jl->j_list.prev;
+	}
+
+	/* if we didn't find any older uncommitted transactions, return now */
+	if (first_jl == jl) {
+		return 0;
+	}
+
+	first_trans_id = first_jl->j_trans_id;
+
+	entry = &first_jl->j_list;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		other_trans_id = other_jl->j_trans_id;
+
+		if (other_trans_id < trans_id) {
+			if (atomic_read(&other_jl->j_commit_left) != 0) {
+				flush_commit_list(s, other_jl, 0);
+
+				/* list we were called with is gone, return */
+				if (!journal_list_still_alive(s, trans_id))
+					return 1;
+
+				/*
+				 * the one we just flushed is gone, this means
+				 * all older lists are also gone, so first_jl
+				 * is no longer valid either.  Go back to the
+				 * beginning.
+				 */
+				if (!journal_list_still_alive
+				    (s, other_trans_id)) {
+					goto find_first;
+				}
+			}
+			entry = entry->next;
+			if (entry == &journal->j_journal_list)
+				return 0;
+		} else {
+			return 0;
+		}
+	}
+	return 0;
+}
+
+static int reiserfs_async_progress_wait(struct super_block *s)
+{
+	struct reiserfs_journal *j = SB_JOURNAL(s);
+
+	if (atomic_read(&j->j_async_throttle)) {
+		int depth;
+
+		depth = reiserfs_write_unlock_nested(s);
+		congestion_wait(BLK_RW_ASYNC, HZ / 10);
+		reiserfs_write_lock_nested(s, depth);
+	}
+
+	return 0;
+}
+
+/*
+ * if this journal list still has commit blocks unflushed, send them to disk.
+ *
+ * log areas must be flushed in order (transaction 2 can't commit before
+ * transaction 1) Before the commit block can by written, every other log
+ * block must be safely on disk
+ */
+static int flush_commit_list(struct super_block *s,
+			     struct reiserfs_journal_list *jl, int flushall)
+{
+	int i;
+	b_blocknr_t bn;
+	struct buffer_head *tbh = NULL;
+	unsigned int trans_id = jl->j_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	int retval = 0;
+	int write_len;
+	int depth;
+
+	reiserfs_check_lock_depth(s, "flush_commit_list");
+
+	if (atomic_read(&jl->j_older_commits_done)) {
+		return 0;
+	}
+
+	/*
+	 * before we can put our commit blocks on disk, we have to make
+	 * sure everyone older than us is on disk too
+	 */
+	BUG_ON(jl->j_len <= 0);
+	BUG_ON(trans_id == journal->j_trans_id);
+
+	get_journal_list(jl);
+	if (flushall) {
+		if (flush_older_commits(s, jl) == 1) {
+			/*
+			 * list disappeared during flush_older_commits.
+			 * return
+			 */
+			goto put_jl;
+		}
+	}
+
+	/* make sure nobody is trying to flush this one at the same time */
+	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
+
+	if (!journal_list_still_alive(s, trans_id)) {
+		mutex_unlock(&jl->j_commit_mutex);
+		goto put_jl;
+	}
+	BUG_ON(jl->j_trans_id == 0);
+
+	/* this commit is done, exit */
+	if (atomic_read(&jl->j_commit_left) <= 0) {
+		if (flushall) {
+			atomic_set(&jl->j_older_commits_done, 1);
+		}
+		mutex_unlock(&jl->j_commit_mutex);
+		goto put_jl;
+	}
+
+	if (!list_empty(&jl->j_bh_list)) {
+		int ret;
+
+		/*
+		 * We might sleep in numerous places inside
+		 * write_ordered_buffers. Relax the write lock.
+		 */
+		depth = reiserfs_write_unlock_nested(s);
+		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
+					    journal, jl, &jl->j_bh_list);
+		if (ret < 0 && retval == 0)
+			retval = ret;
+		reiserfs_write_lock_nested(s, depth);
+	}
+	BUG_ON(!list_empty(&jl->j_bh_list));
+	/*
+	 * for the description block and all the log blocks, submit any buffers
+	 * that haven't already reached the disk.  Try to write at least 256
+	 * log blocks. later on, we will only wait on blocks that correspond
+	 * to this transaction, but while we're unplugging we might as well
+	 * get a chunk of data on there.
+	 */
+	atomic_inc(&journal->j_async_throttle);
+	write_len = jl->j_len + 1;
+	if (write_len < 256)
+		write_len = 256;
+	for (i = 0 ; i < write_len ; i++) {
+		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
+		    SB_ONDISK_JOURNAL_SIZE(s);
+		tbh = journal_find_get_block(s, bn);
+		if (tbh) {
+			if (buffer_dirty(tbh)) {
+		            depth = reiserfs_write_unlock_nested(s);
+			    ll_rw_block(WRITE, 1, &tbh);
+			    reiserfs_write_lock_nested(s, depth);
+			}
+			put_bh(tbh) ;
+		}
+	}
+	atomic_dec(&journal->j_async_throttle);
+
+	for (i = 0; i < (jl->j_len + 1); i++) {
+		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
+		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
+		tbh = journal_find_get_block(s, bn);
+
+		depth = reiserfs_write_unlock_nested(s);
+		__wait_on_buffer(tbh);
+		reiserfs_write_lock_nested(s, depth);
+		/*
+		 * since we're using ll_rw_blk above, it might have skipped
+		 * over a locked buffer.  Double check here
+		 */
+		/* redundant, sync_dirty_buffer() checks */
+		if (buffer_dirty(tbh)) {
+			depth = reiserfs_write_unlock_nested(s);
+			sync_dirty_buffer(tbh);
+			reiserfs_write_lock_nested(s, depth);
+		}
+		if (unlikely(!buffer_uptodate(tbh))) {
+#ifdef CONFIG_REISERFS_CHECK
+			reiserfs_warning(s, "journal-601",
+					 "buffer write failed");
+#endif
+			retval = -EIO;
+		}
+		/* once for journal_find_get_block */
+		put_bh(tbh);
+		/* once due to original getblk in do_journal_end */
+		put_bh(tbh);
+		atomic_dec(&jl->j_commit_left);
+	}
+
+	BUG_ON(atomic_read(&jl->j_commit_left) != 1);
+
+	/*
+	 * If there was a write error in the journal - we can't commit
+	 * this transaction - it will be invalid and, if successful,
+	 * will just end up propagating the write error out to
+	 * the file system.
+	 */
+	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
+		if (buffer_dirty(jl->j_commit_bh))
+			BUG();
+		mark_buffer_dirty(jl->j_commit_bh) ;
+		depth = reiserfs_write_unlock_nested(s);
+		if (reiserfs_barrier_flush(s))
+			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(jl->j_commit_bh);
+		reiserfs_write_lock_nested(s, depth);
+	}
+
+	/*
+	 * If there was a write error in the journal - we can't commit this
+	 * transaction - it will be invalid and, if successful, will just end
+	 * up propagating the write error out to the filesystem.
+	 */
+	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
+#ifdef CONFIG_REISERFS_CHECK
+		reiserfs_warning(s, "journal-615", "buffer write failed");
+#endif
+		retval = -EIO;
+	}
+	bforget(jl->j_commit_bh);
+	if (journal->j_last_commit_id != 0 &&
+	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
+		reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
+				 journal->j_last_commit_id, jl->j_trans_id);
+	}
+	journal->j_last_commit_id = jl->j_trans_id;
+
+	/*
+	 * now, every commit block is on the disk.  It is safe to allow
+	 * blocks freed during this transaction to be reallocated
+	 */
+	cleanup_freed_for_journal_list(s, jl);
+
+	retval = retval ? retval : journal->j_errno;
+
+	/* mark the metadata dirty */
+	if (!retval)
+		dirty_one_transaction(s, jl);
+	atomic_dec(&jl->j_commit_left);
+
+	if (flushall) {
+		atomic_set(&jl->j_older_commits_done, 1);
+	}
+	mutex_unlock(&jl->j_commit_mutex);
+put_jl:
+	put_journal_list(s, jl);
+
+	if (retval)
+		reiserfs_abort(s, retval, "Journal write error in %s",
+			       __func__);
+	return retval;
+}
+
+/*
+ * flush_journal_list frequently needs to find a newer transaction for a
+ * given block.  This does that, or returns NULL if it can't find anything
+ */
+static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
+							  reiserfs_journal_cnode
+							  *cn)
+{
+	struct super_block *sb = cn->sb;
+	b_blocknr_t blocknr = cn->blocknr;
+
+	cn = cn->hprev;
+	while (cn) {
+		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
+			return cn->jlist;
+		}
+		cn = cn->hprev;
+	}
+	return NULL;
+}
+
+static void remove_journal_hash(struct super_block *,
+				struct reiserfs_journal_cnode **,
+				struct reiserfs_journal_list *, unsigned long,
+				int);
+
+/*
+ * once all the real blocks have been flushed, it is safe to remove them
+ * from the journal list for this transaction.  Aside from freeing the
+ * cnode, this also allows the block to be reallocated for data blocks
+ * if it had been deleted.
+ */
+static void remove_all_from_journal_list(struct super_block *sb,
+					 struct reiserfs_journal_list *jl,
+					 int debug)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_cnode *cn, *last;
+	cn = jl->j_realblock;
+
+	/*
+	 * which is better, to lock once around the whole loop, or
+	 * to lock for each call to remove_journal_hash?
+	 */
+	while (cn) {
+		if (cn->blocknr != 0) {
+			if (debug) {
+				reiserfs_warning(sb, "reiserfs-2201",
+						 "block %u, bh is %d, state %ld",
+						 cn->blocknr, cn->bh ? 1 : 0,
+						 cn->state);
+			}
+			cn->state = 0;
+			remove_journal_hash(sb, journal->j_list_hash_table,
+					    jl, cn->blocknr, 1);
+		}
+		last = cn;
+		cn = cn->next;
+		free_cnode(sb, last);
+	}
+	jl->j_realblock = NULL;
+}
+
+/*
+ * if this timestamp is greater than the timestamp we wrote last to the
+ * header block, write it to the header block.  once this is done, I can
+ * safely say the log area for this transaction won't ever be replayed,
+ * and I can start releasing blocks in this transaction for reuse as data
+ * blocks.  called by flush_journal_list, before it calls
+ * remove_all_from_journal_list
+ */
+static int _update_journal_header_block(struct super_block *sb,
+					unsigned long offset,
+					unsigned int trans_id)
+{
+	struct reiserfs_journal_header *jh;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	int depth;
+
+	if (reiserfs_is_journal_aborted(journal))
+		return -EIO;
+
+	if (trans_id >= journal->j_last_flush_trans_id) {
+		if (buffer_locked((journal->j_header_bh))) {
+			depth = reiserfs_write_unlock_nested(sb);
+			__wait_on_buffer(journal->j_header_bh);
+			reiserfs_write_lock_nested(sb, depth);
+			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
+#ifdef CONFIG_REISERFS_CHECK
+				reiserfs_warning(sb, "journal-699",
+						 "buffer write failed");
+#endif
+				return -EIO;
+			}
+		}
+		journal->j_last_flush_trans_id = trans_id;
+		journal->j_first_unflushed_offset = offset;
+		jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
+							b_data);
+		jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
+		jh->j_first_unflushed_offset = cpu_to_le32(offset);
+		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
+
+		set_buffer_dirty(journal->j_header_bh);
+		depth = reiserfs_write_unlock_nested(sb);
+
+		if (reiserfs_barrier_flush(sb))
+			__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(journal->j_header_bh);
+
+		reiserfs_write_lock_nested(sb, depth);
+		if (!buffer_uptodate(journal->j_header_bh)) {
+			reiserfs_warning(sb, "journal-837",
+					 "IO error during journal replay");
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
+static int update_journal_header_block(struct super_block *sb,
+				       unsigned long offset,
+				       unsigned int trans_id)
+{
+	return _update_journal_header_block(sb, offset, trans_id);
+}
+
+/*
+** flush any and all journal lists older than you are
+** can only be called from flush_journal_list
+*/
+static int flush_older_journal_lists(struct super_block *sb,
+				     struct reiserfs_journal_list *jl)
+{
+	struct list_head *entry;
+	struct reiserfs_journal_list *other_jl;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	unsigned int trans_id = jl->j_trans_id;
+
+	/*
+	 * we know we are the only ones flushing things, no extra race
+	 * protection is required.
+	 */
+restart:
+	entry = journal->j_journal_list.next;
+	/* Did we wrap? */
+	if (entry == &journal->j_journal_list)
+		return 0;
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	if (other_jl->j_trans_id < trans_id) {
+		BUG_ON(other_jl->j_refcount <= 0);
+		/* do not flush all */
+		flush_journal_list(sb, other_jl, 0);
+
+		/* other_jl is now deleted from the list */
+		goto restart;
+	}
+	return 0;
+}
+
+static void del_from_work_list(struct super_block *s,
+			       struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	if (!list_empty(&jl->j_working_list)) {
+		list_del_init(&jl->j_working_list);
+		journal->j_num_work_lists--;
+	}
+}
+
+/*
+ * flush a journal list, both commit and real blocks
+ *
+ * always set flushall to 1, unless you are calling from inside
+ * flush_journal_list
+ *
+ * IMPORTANT.  This can only be called while there are no journal writers,
+ * and the journal is locked.  That means it can only be called from
+ * do_journal_end, or by journal_release
+ */
+static int flush_journal_list(struct super_block *s,
+			      struct reiserfs_journal_list *jl, int flushall)
+{
+	struct reiserfs_journal_list *pjl;
+	struct reiserfs_journal_cnode *cn, *last;
+	int count;
+	int was_jwait = 0;
+	int was_dirty = 0;
+	struct buffer_head *saved_bh;
+	unsigned long j_len_saved = jl->j_len;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	int err = 0;
+	int depth;
+
+	BUG_ON(j_len_saved <= 0);
+
+	if (atomic_read(&journal->j_wcount) != 0) {
+		reiserfs_warning(s, "clm-2048", "called with wcount %d",
+				 atomic_read(&journal->j_wcount));
+	}
+
+	/* if flushall == 0, the lock is already held */
+	if (flushall) {
+		reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
+	} else if (mutex_trylock(&journal->j_flush_mutex)) {
+		BUG();
+	}
+
+	count = 0;
+	if (j_len_saved > journal->j_trans_max) {
+		reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
+			       j_len_saved, jl->j_trans_id);
+		return 0;
+	}
+
+	/* if all the work is already done, get out of here */
+	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
+	    atomic_read(&jl->j_commit_left) <= 0) {
+		goto flush_older_and_return;
+	}
+
+	/*
+	 * start by putting the commit list on disk.  This will also flush
+	 * the commit lists of any olders transactions
+	 */
+	flush_commit_list(s, jl, 1);
+
+	if (!(jl->j_state & LIST_DIRTY)
+	    && !reiserfs_is_journal_aborted(journal))
+		BUG();
+
+	/* are we done now? */
+	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
+	    atomic_read(&jl->j_commit_left) <= 0) {
+		goto flush_older_and_return;
+	}
+
+	/*
+	 * loop through each cnode, see if we need to write it,
+	 * or wait on a more recent transaction, or just ignore it
+	 */
+	if (atomic_read(&journal->j_wcount) != 0) {
+		reiserfs_panic(s, "journal-844", "journal list is flushing, "
+			       "wcount is not 0");
+	}
+	cn = jl->j_realblock;
+	while (cn) {
+		was_jwait = 0;
+		was_dirty = 0;
+		saved_bh = NULL;
+		/* blocknr of 0 is no longer in the hash, ignore it */
+		if (cn->blocknr == 0) {
+			goto free_cnode;
+		}
+
+		/*
+		 * This transaction failed commit.
+		 * Don't write out to the disk
+		 */
+		if (!(jl->j_state & LIST_DIRTY))
+			goto free_cnode;
+
+		pjl = find_newer_jl_for_cn(cn);
+		/*
+		 * the order is important here.  We check pjl to make sure we
+		 * don't clear BH_JDirty_wait if we aren't the one writing this
+		 * block to disk
+		 */
+		if (!pjl && cn->bh) {
+			saved_bh = cn->bh;
+
+			/*
+			 * we do this to make sure nobody releases the
+			 * buffer while we are working with it
+			 */
+			get_bh(saved_bh);
+
+			if (buffer_journal_dirty(saved_bh)) {
+				BUG_ON(!can_dirty(cn));
+				was_jwait = 1;
+				was_dirty = 1;
+			} else if (can_dirty(cn)) {
+				/*
+				 * everything with !pjl && jwait
+				 * should be writable
+				 */
+				BUG();
+			}
+		}
+
+		/*
+		 * if someone has this block in a newer transaction, just make
+		 * sure they are committed, and don't try writing it to disk
+		 */
+		if (pjl) {
+			if (atomic_read(&pjl->j_commit_left))
+				flush_commit_list(s, pjl, 1);
+			goto free_cnode;
+		}
+
+		/*
+		 * bh == NULL when the block got to disk on its own, OR,
+		 * the block got freed in a future transaction
+		 */
+		if (saved_bh == NULL) {
+			goto free_cnode;
+		}
+
+		/*
+		 * this should never happen.  kupdate_one_transaction has
+		 * this list locked while it works, so we should never see a
+		 * buffer here that is not marked JDirty_wait
+		 */
+		if ((!was_jwait) && !buffer_locked(saved_bh)) {
+			reiserfs_warning(s, "journal-813",
+					 "BAD! buffer %llu %cdirty %cjwait, "
+					 "not in a newer tranasction",
+					 (unsigned long long)saved_bh->
+					 b_blocknr, was_dirty ? ' ' : '!',
+					 was_jwait ? ' ' : '!');
+		}
+		if (was_dirty) {
+			/*
+			 * we inc again because saved_bh gets decremented
+			 * at free_cnode
+			 */
+			get_bh(saved_bh);
+			set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
+			lock_buffer(saved_bh);
+			BUG_ON(cn->blocknr != saved_bh->b_blocknr);
+			if (buffer_dirty(saved_bh))
+				submit_logged_buffer(saved_bh);
+			else
+				unlock_buffer(saved_bh);
+			count++;
+		} else {
+			reiserfs_warning(s, "clm-2082",
+					 "Unable to flush buffer %llu in %s",
+					 (unsigned long long)saved_bh->
+					 b_blocknr, __func__);
+		}
+free_cnode:
+		last = cn;
+		cn = cn->next;
+		if (saved_bh) {
+			/*
+			 * we incremented this to keep others from
+			 * taking the buffer head away
+			 */
+			put_bh(saved_bh);
+			if (atomic_read(&saved_bh->b_count) < 0) {
+				reiserfs_warning(s, "journal-945",
+						 "saved_bh->b_count < 0");
+			}
+		}
+	}
+	if (count > 0) {
+		cn = jl->j_realblock;
+		while (cn) {
+			if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
+				if (!cn->bh) {
+					reiserfs_panic(s, "journal-1011",
+						       "cn->bh is NULL");
+				}
+
+				depth = reiserfs_write_unlock_nested(s);
+				__wait_on_buffer(cn->bh);
+				reiserfs_write_lock_nested(s, depth);
+
+				if (!cn->bh) {
+					reiserfs_panic(s, "journal-1012",
+						       "cn->bh is NULL");
+				}
+				if (unlikely(!buffer_uptodate(cn->bh))) {
+#ifdef CONFIG_REISERFS_CHECK
+					reiserfs_warning(s, "journal-949",
+							 "buffer write failed");
+#endif
+					err = -EIO;
+				}
+				/*
+				 * note, we must clear the JDirty_wait bit
+				 * after the up to date check, otherwise we
+				 * race against our flushpage routine
+				 */
+				BUG_ON(!test_clear_buffer_journal_dirty
+				       (cn->bh));
+
+				/* drop one ref for us */
+				put_bh(cn->bh);
+				/* drop one ref for journal_mark_dirty */
+				release_buffer_page(cn->bh);
+			}
+			cn = cn->next;
+		}
+	}
+
+	if (err)
+		reiserfs_abort(s, -EIO,
+			       "Write error while pushing transaction to disk in %s",
+			       __func__);
+flush_older_and_return:
+
+	/*
+	 * before we can update the journal header block, we _must_ flush all
+	 * real blocks from all older transactions to disk.  This is because
+	 * once the header block is updated, this transaction will not be
+	 * replayed after a crash
+	 */
+	if (flushall) {
+		flush_older_journal_lists(s, jl);
+	}
+
+	err = journal->j_errno;
+	/*
+	 * before we can remove everything from the hash tables for this
+	 * transaction, we must make sure it can never be replayed
+	 *
+	 * since we are only called from do_journal_end, we know for sure there
+	 * are no allocations going on while we are flushing journal lists.  So,
+	 * we only need to update the journal header block for the last list
+	 * being flushed
+	 */
+	if (!err && flushall) {
+		err =
+		    update_journal_header_block(s,
+						(jl->j_start + jl->j_len +
+						 2) % SB_ONDISK_JOURNAL_SIZE(s),
+						jl->j_trans_id);
+		if (err)
+			reiserfs_abort(s, -EIO,
+				       "Write error while updating journal header in %s",
+				       __func__);
+	}
+	remove_all_from_journal_list(s, jl, 0);
+	list_del_init(&jl->j_list);
+	journal->j_num_lists--;
+	del_from_work_list(s, jl);
+
+	if (journal->j_last_flush_id != 0 &&
+	    (jl->j_trans_id - journal->j_last_flush_id) != 1) {
+		reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
+				 journal->j_last_flush_id, jl->j_trans_id);
+	}
+	journal->j_last_flush_id = jl->j_trans_id;
+
+	/*
+	 * not strictly required since we are freeing the list, but it should
+	 * help find code using dead lists later on
+	 */
+	jl->j_len = 0;
+	atomic_set(&jl->j_nonzerolen, 0);
+	jl->j_start = 0;
+	jl->j_realblock = NULL;
+	jl->j_commit_bh = NULL;
+	jl->j_trans_id = 0;
+	jl->j_state = 0;
+	put_journal_list(s, jl);
+	if (flushall)
+		mutex_unlock(&journal->j_flush_mutex);
+	return err;
+}
+
+static int write_one_transaction(struct super_block *s,
+				 struct reiserfs_journal_list *jl,
+				 struct buffer_chunk *chunk)
+{
+	struct reiserfs_journal_cnode *cn;
+	int ret = 0;
+
+	jl->j_state |= LIST_TOUCHED;
+	del_from_work_list(s, jl);
+	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
+		return 0;
+	}
+
+	cn = jl->j_realblock;
+	while (cn) {
+		/*
+		 * if the blocknr == 0, this has been cleared from the hash,
+		 * skip it
+		 */
+		if (cn->blocknr == 0) {
+			goto next;
+		}
+		if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
+			struct buffer_head *tmp_bh;
+			/*
+			 * we can race against journal_mark_freed when we try
+			 * to lock_buffer(cn->bh), so we have to inc the buffer
+			 * count, and recheck things after locking
+			 */
+			tmp_bh = cn->bh;
+			get_bh(tmp_bh);
+			lock_buffer(tmp_bh);
+			if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
+				if (!buffer_journal_dirty(tmp_bh) ||
+				    buffer_journal_prepared(tmp_bh))
+					BUG();
+				add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
+				ret++;
+			} else {
+				/* note, cn->bh might be null now */
+				unlock_buffer(tmp_bh);
+			}
+			put_bh(tmp_bh);
+		}
+next:
+		cn = cn->next;
+		cond_resched();
+	}
+	return ret;
+}
+
+/* used by flush_commit_list */
+static int dirty_one_transaction(struct super_block *s,
+				 struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_journal_list *pjl;
+	int ret = 0;
+
+	jl->j_state |= LIST_DIRTY;
+	cn = jl->j_realblock;
+	while (cn) {
+		/*
+		 * look for a more recent transaction that logged this
+		 * buffer.  Only the most recent transaction with a buffer in
+		 * it is allowed to send that buffer to disk
+		 */
+		pjl = find_newer_jl_for_cn(cn);
+		if (!pjl && cn->blocknr && cn->bh
+		    && buffer_journal_dirty(cn->bh)) {
+			BUG_ON(!can_dirty(cn));
+			/*
+			 * if the buffer is prepared, it will either be logged
+			 * or restored.  If restored, we need to make sure
+			 * it actually gets marked dirty
+			 */
+			clear_buffer_journal_new(cn->bh);
+			if (buffer_journal_prepared(cn->bh)) {
+				set_buffer_journal_restore_dirty(cn->bh);
+			} else {
+				set_buffer_journal_test(cn->bh);
+				mark_buffer_dirty(cn->bh);
+			}
+		}
+		cn = cn->next;
+	}
+	return ret;
+}
+
+static int kupdate_transactions(struct super_block *s,
+				struct reiserfs_journal_list *jl,
+				struct reiserfs_journal_list **next_jl,
+				unsigned int *next_trans_id,
+				int num_blocks, int num_trans)
+{
+	int ret = 0;
+	int written = 0;
+	int transactions_flushed = 0;
+	unsigned int orig_trans_id = jl->j_trans_id;
+	struct buffer_chunk chunk;
+	struct list_head *entry;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	chunk.nr = 0;
+
+	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
+	if (!journal_list_still_alive(s, orig_trans_id)) {
+		goto done;
+	}
+
+	/*
+	 * we've got j_flush_mutex held, nobody is going to delete any
+	 * of these lists out from underneath us
+	 */
+	while ((num_trans && transactions_flushed < num_trans) ||
+	       (!num_trans && written < num_blocks)) {
+
+		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
+		    atomic_read(&jl->j_commit_left)
+		    || !(jl->j_state & LIST_DIRTY)) {
+			del_from_work_list(s, jl);
+			break;
+		}
+		ret = write_one_transaction(s, jl, &chunk);
+
+		if (ret < 0)
+			goto done;
+		transactions_flushed++;
+		written += ret;
+		entry = jl->j_list.next;
+
+		/* did we wrap? */
+		if (entry == &journal->j_journal_list) {
+			break;
+		}
+		jl = JOURNAL_LIST_ENTRY(entry);
+
+		/* don't bother with older transactions */
+		if (jl->j_trans_id <= orig_trans_id)
+			break;
+	}
+	if (chunk.nr) {
+		write_chunk(&chunk);
+	}
+
+done:
+	mutex_unlock(&journal->j_flush_mutex);
+	return ret;
+}
+
+/*
+ * for o_sync and fsync heavy applications, they tend to use
+ * all the journa list slots with tiny transactions.  These
+ * trigger lots and lots of calls to update the header block, which
+ * adds seeks and slows things down.
+ *
+ * This function tries to clear out a large chunk of the journal lists
+ * at once, which makes everything faster since only the newest journal
+ * list updates the header block
+ */
+static int flush_used_journal_lists(struct super_block *s,
+				    struct reiserfs_journal_list *jl)
+{
+	unsigned long len = 0;
+	unsigned long cur_len;
+	int ret;
+	int i;
+	int limit = 256;
+	struct reiserfs_journal_list *tjl;
+	struct reiserfs_journal_list *flush_jl;
+	unsigned int trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+
+	flush_jl = tjl = jl;
+
+	/* in data logging mode, try harder to flush a lot of blocks */
+	if (reiserfs_data_log(s))
+		limit = 1024;
+	/* flush for 256 transactions or limit blocks, whichever comes first */
+	for (i = 0; i < 256 && len < limit; i++) {
+		if (atomic_read(&tjl->j_commit_left) ||
+		    tjl->j_trans_id < jl->j_trans_id) {
+			break;
+		}
+		cur_len = atomic_read(&tjl->j_nonzerolen);
+		if (cur_len > 0) {
+			tjl->j_state &= ~LIST_TOUCHED;
+		}
+		len += cur_len;
+		flush_jl = tjl;
+		if (tjl->j_list.next == &journal->j_journal_list)
+			break;
+		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+	}
+	get_journal_list(jl);
+	get_journal_list(flush_jl);
+
+	/*
+	 * try to find a group of blocks we can flush across all the
+	 * transactions, but only bother if we've actually spanned
+	 * across multiple lists
+	 */
+	if (flush_jl != jl) {
+		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+	}
+	flush_journal_list(s, flush_jl, 1);
+	put_journal_list(s, flush_jl);
+	put_journal_list(s, jl);
+	return 0;
+}
+
+/*
+ * removes any nodes in table with name block and dev as bh.
+ * only touchs the hnext and hprev pointers.
+ */
+void remove_journal_hash(struct super_block *sb,
+			 struct reiserfs_journal_cnode **table,
+			 struct reiserfs_journal_list *jl,
+			 unsigned long block, int remove_freed)
+{
+	struct reiserfs_journal_cnode *cur;
+	struct reiserfs_journal_cnode **head;
+
+	head = &(journal_hash(table, sb, block));
+	if (!head) {
+		return;
+	}
+	cur = *head;
+	while (cur) {
+		if (cur->blocknr == block && cur->sb == sb
+		    && (jl == NULL || jl == cur->jlist)
+		    && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
+			if (cur->hnext) {
+				cur->hnext->hprev = cur->hprev;
+			}
+			if (cur->hprev) {
+				cur->hprev->hnext = cur->hnext;
+			} else {
+				*head = cur->hnext;
+			}
+			cur->blocknr = 0;
+			cur->sb = NULL;
+			cur->state = 0;
+			/*
+			 * anybody who clears the cur->bh will also
+			 * dec the nonzerolen
+			 */
+			if (cur->bh && cur->jlist)
+				atomic_dec(&cur->jlist->j_nonzerolen);
+			cur->bh = NULL;
+			cur->jlist = NULL;
+		}
+		cur = cur->hnext;
+	}
+}
+
+static void free_journal_ram(struct super_block *sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	kfree(journal->j_current_jl);
+	journal->j_num_lists--;
+
+	vfree(journal->j_cnode_free_orig);
+	free_list_bitmaps(sb, journal->j_list_bitmap);
+	free_bitmap_nodes(sb);	/* must be after free_list_bitmaps */
+	if (journal->j_header_bh) {
+		brelse(journal->j_header_bh);
+	}
+	/*
+	 * j_header_bh is on the journal dev, make sure
+	 * not to release the journal dev until we brelse j_header_bh
+	 */
+	release_journal_dev(sb, journal);
+	vfree(journal);
+}
+
+/*
+ * call on unmount.  Only set error to 1 if you haven't made your way out
+ * of read_super() yet.  Any other caller must keep error at 0.
+ */
+static int do_journal_release(struct reiserfs_transaction_handle *th,
+			      struct super_block *sb, int error)
+{
+	struct reiserfs_transaction_handle myth;
+	int flushed = 0;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	/*
+	 * we only want to flush out transactions if we were
+	 * called with error == 0
+	 */
+	if (!error && !(sb->s_flags & MS_RDONLY)) {
+		/* end the current trans */
+		BUG_ON(!th->t_trans_id);
+		do_journal_end(th, FLUSH_ALL);
+
+		/*
+		 * make sure something gets logged to force
+		 * our way into the flush code
+		 */
+		if (!journal_join(&myth, sb)) {
+			reiserfs_prepare_for_journal(sb,
+						     SB_BUFFER_WITH_SB(sb),
+						     1);
+			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
+			do_journal_end(&myth, FLUSH_ALL);
+			flushed = 1;
+		}
+	}
+
+	/* this also catches errors during the do_journal_end above */
+	if (!error && reiserfs_is_journal_aborted(journal)) {
+		memset(&myth, 0, sizeof(myth));
+		if (!journal_join_abort(&myth, sb)) {
+			reiserfs_prepare_for_journal(sb,
+						     SB_BUFFER_WITH_SB(sb),
+						     1);
+			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
+			do_journal_end(&myth, FLUSH_ALL);
+		}
+	}
+
+
+	/*
+	 * We must release the write lock here because
+	 * the workqueue job (flush_async_commit) needs this lock
+	 */
+	reiserfs_write_unlock(sb);
+
+	/*
+	 * Cancel flushing of old commits. Note that neither of these works
+	 * will be requeued because superblock is being shutdown and doesn't
+	 * have MS_ACTIVE set.
+	 */
+	cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
+	/* wait for all commits to finish */
+	cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
+
+	free_journal_ram(sb);
+
+	reiserfs_write_lock(sb);
+
+	return 0;
+}
+
+/* * call on unmount.  flush all journal trans, release all alloc'd ram */
+int journal_release(struct reiserfs_transaction_handle *th,
+		    struct super_block *sb)
+{
+	return do_journal_release(th, sb, 0);
+}
+
+/* only call from an error condition inside reiserfs_read_super!  */
+int journal_release_error(struct reiserfs_transaction_handle *th,
+			  struct super_block *sb)
+{
+	return do_journal_release(th, sb, 1);
+}
+
+/*
+ * compares description block with commit block.
+ * returns 1 if they differ, 0 if they are the same
+ */
+static int journal_compare_desc_commit(struct super_block *sb,
+				       struct reiserfs_journal_desc *desc,
+				       struct reiserfs_journal_commit *commit)
+{
+	if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
+	    get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
+	    get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
+	    get_commit_trans_len(commit) <= 0) {
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * returns 0 if it did not find a description block
+ * returns -1 if it found a corrupt commit block
+ * returns 1 if both desc and commit were valid
+ * NOTE: only called during fs mount
+ */
+static int journal_transaction_is_valid(struct super_block *sb,
+					struct buffer_head *d_bh,
+					unsigned int *oldest_invalid_trans_id,
+					unsigned long *newest_mount_id)
+{
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	struct buffer_head *c_bh;
+	unsigned long offset;
+
+	if (!d_bh)
+		return 0;
+
+	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+	if (get_desc_trans_len(desc) > 0
+	    && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
+		if (oldest_invalid_trans_id && *oldest_invalid_trans_id
+		    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
+			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+				       "journal-986: transaction "
+				       "is valid returning because trans_id %d is greater than "
+				       "oldest_invalid %lu",
+				       get_desc_trans_id(desc),
+				       *oldest_invalid_trans_id);
+			return 0;
+		}
+		if (newest_mount_id
+		    && *newest_mount_id > get_desc_mount_id(desc)) {
+			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+				       "journal-1087: transaction "
+				       "is valid returning because mount_id %d is less than "
+				       "newest_mount_id %lu",
+				       get_desc_mount_id(desc),
+				       *newest_mount_id);
+			return -1;
+		}
+		if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
+			reiserfs_warning(sb, "journal-2018",
+					 "Bad transaction length %d "
+					 "encountered, ignoring transaction",
+					 get_desc_trans_len(desc));
+			return -1;
+		}
+		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+
+		/*
+		 * ok, we have a journal description block,
+		 * let's see if the transaction was valid
+		 */
+		c_bh =
+		    journal_bread(sb,
+				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				  ((offset + get_desc_trans_len(desc) +
+				    1) % SB_ONDISK_JOURNAL_SIZE(sb)));
+		if (!c_bh)
+			return 0;
+		commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+		if (journal_compare_desc_commit(sb, desc, commit)) {
+			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+				       "journal_transaction_is_valid, commit offset %ld had bad "
+				       "time %d or length %d",
+				       c_bh->b_blocknr -
+				       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+				       get_commit_trans_id(commit),
+				       get_commit_trans_len(commit));
+			brelse(c_bh);
+			if (oldest_invalid_trans_id) {
+				*oldest_invalid_trans_id =
+				    get_desc_trans_id(desc);
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+					       "journal-1004: "
+					       "transaction_is_valid setting oldest invalid trans_id "
+					       "to %d",
+					       get_desc_trans_id(desc));
+			}
+			return -1;
+		}
+		brelse(c_bh);
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal-1006: found valid "
+			       "transaction start offset %llu, len %d id %d",
+			       d_bh->b_blocknr -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+			       get_desc_trans_len(desc),
+			       get_desc_trans_id(desc));
+		return 1;
+	} else {
+		return 0;
+	}
+}
+
+static void brelse_array(struct buffer_head **heads, int num)
+{
+	int i;
+	for (i = 0; i < num; i++) {
+		brelse(heads[i]);
+	}
+}
+
+/*
+ * given the start, and values for the oldest acceptable transactions,
+ * this either reads in a replays a transaction, or returns because the
+ * transaction is invalid, or too old.
+ * NOTE: only called during fs mount
+ */
+static int journal_read_transaction(struct super_block *sb,
+				    unsigned long cur_dblock,
+				    unsigned long oldest_start,
+				    unsigned int oldest_trans_id,
+				    unsigned long newest_mount_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	unsigned int trans_id = 0;
+	struct buffer_head *c_bh;
+	struct buffer_head *d_bh;
+	struct buffer_head **log_blocks = NULL;
+	struct buffer_head **real_blocks = NULL;
+	unsigned int trans_offset;
+	int i;
+	int trans_half;
+
+	d_bh = journal_bread(sb, cur_dblock);
+	if (!d_bh)
+		return 1;
+	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
+		       "journal_read_transaction, offset %llu, len %d mount_id %d",
+		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+		       get_desc_trans_len(desc), get_desc_mount_id(desc));
+	if (get_desc_trans_id(desc) < oldest_trans_id) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
+			       "journal_read_trans skipping because %lu is too old",
+			       cur_dblock -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
+		brelse(d_bh);
+		return 1;
+	}
+	if (get_desc_mount_id(desc) != newest_mount_id) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
+			       "journal_read_trans skipping because %d is != "
+			       "newest_mount_id %lu", get_desc_mount_id(desc),
+			       newest_mount_id);
+		brelse(d_bh);
+		return 1;
+	}
+	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			     ((trans_offset + get_desc_trans_len(desc) + 1) %
+			      SB_ONDISK_JOURNAL_SIZE(sb)));
+	if (!c_bh) {
+		brelse(d_bh);
+		return 1;
+	}
+	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+	if (journal_compare_desc_commit(sb, desc, commit)) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal_read_transaction, "
+			       "commit offset %llu had bad time %d or length %d",
+			       c_bh->b_blocknr -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+			       get_commit_trans_id(commit),
+			       get_commit_trans_len(commit));
+		brelse(c_bh);
+		brelse(d_bh);
+		return 1;
+	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
+	trans_id = get_desc_trans_id(desc);
+	/*
+	 * now we know we've got a good transaction, and it was
+	 * inside the valid time ranges
+	 */
+	log_blocks = kmalloc(get_desc_trans_len(desc) *
+			     sizeof(struct buffer_head *), GFP_NOFS);
+	real_blocks = kmalloc(get_desc_trans_len(desc) *
+			      sizeof(struct buffer_head *), GFP_NOFS);
+	if (!log_blocks || !real_blocks) {
+		brelse(c_bh);
+		brelse(d_bh);
+		kfree(log_blocks);
+		kfree(real_blocks);
+		reiserfs_warning(sb, "journal-1169",
+				 "kmalloc failed, unable to mount FS");
+		return -1;
+	}
+	/* get all the buffer heads */
+	trans_half = journal_trans_half(sb->s_blocksize);
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		log_blocks[i] =
+		    journal_getblk(sb,
+				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				   (trans_offset + 1 +
+				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
+		if (i < trans_half) {
+			real_blocks[i] =
+			    sb_getblk(sb,
+				      le32_to_cpu(desc->j_realblock[i]));
+		} else {
+			real_blocks[i] =
+			    sb_getblk(sb,
+				      le32_to_cpu(commit->
+						  j_realblock[i - trans_half]));
+		}
+		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
+			reiserfs_warning(sb, "journal-1207",
+					 "REPLAY FAILURE fsck required! "
+					 "Block to replay is outside of "
+					 "filesystem");
+			goto abort_replay;
+		}
+		/* make sure we don't try to replay onto log or reserved area */
+		if (is_block_in_log_or_reserved_area
+		    (sb, real_blocks[i]->b_blocknr)) {
+			reiserfs_warning(sb, "journal-1204",
+					 "REPLAY FAILURE fsck required! "
+					 "Trying to replay onto a log block");
+abort_replay:
+			brelse_array(log_blocks, i);
+			brelse_array(real_blocks, i);
+			brelse(c_bh);
+			brelse(d_bh);
+			kfree(log_blocks);
+			kfree(real_blocks);
+			return -1;
+		}
+	}
+	/* read in the log blocks, memcpy to the corresponding real block */
+	ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+
+		wait_on_buffer(log_blocks[i]);
+		if (!buffer_uptodate(log_blocks[i])) {
+			reiserfs_warning(sb, "journal-1212",
+					 "REPLAY FAILURE fsck required! "
+					 "buffer write failed");
+			brelse_array(log_blocks + i,
+				     get_desc_trans_len(desc) - i);
+			brelse_array(real_blocks, get_desc_trans_len(desc));
+			brelse(c_bh);
+			brelse(d_bh);
+			kfree(log_blocks);
+			kfree(real_blocks);
+			return -1;
+		}
+		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
+		       real_blocks[i]->b_size);
+		set_buffer_uptodate(real_blocks[i]);
+		brelse(log_blocks[i]);
+	}
+	/* flush out the real blocks */
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		set_buffer_dirty(real_blocks[i]);
+		write_dirty_buffer(real_blocks[i], WRITE);
+	}
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		wait_on_buffer(real_blocks[i]);
+		if (!buffer_uptodate(real_blocks[i])) {
+			reiserfs_warning(sb, "journal-1226",
+					 "REPLAY FAILURE, fsck required! "
+					 "buffer write failed");
+			brelse_array(real_blocks + i,
+				     get_desc_trans_len(desc) - i);
+			brelse(c_bh);
+			brelse(d_bh);
+			kfree(log_blocks);
+			kfree(real_blocks);
+			return -1;
+		}
+		brelse(real_blocks[i]);
+	}
+	cur_dblock =
+	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+	    ((trans_offset + get_desc_trans_len(desc) +
+	      2) % SB_ONDISK_JOURNAL_SIZE(sb));
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+		       "journal-1095: setting journal " "start to offset %ld",
+		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
+
+	/*
+	 * init starting values for the first transaction, in case
+	 * this is the last transaction to be replayed.
+	 */
+	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+	journal->j_last_flush_trans_id = trans_id;
+	journal->j_trans_id = trans_id + 1;
+	/* check for trans_id overflow */
+	if (journal->j_trans_id == 0)
+		journal->j_trans_id = 10;
+	brelse(c_bh);
+	brelse(d_bh);
+	kfree(log_blocks);
+	kfree(real_blocks);
+	return 0;
+}
+
+/*
+ * This function reads blocks starting from block and to max_block of bufsize
+ * size (but no more than BUFNR blocks at a time). This proved to improve
+ * mounting speed on self-rebuilding raid5 arrays at least.
+ * Right now it is only used from journal code. But later we might use it
+ * from other places.
+ * Note: Do not use journal_getblk/sb_getblk functions here!
+ */
+static struct buffer_head *reiserfs_breada(struct block_device *dev,
+					   b_blocknr_t block, int bufsize,
+					   b_blocknr_t max_block)
+{
+	struct buffer_head *bhlist[BUFNR];
+	unsigned int blocks = BUFNR;
+	struct buffer_head *bh;
+	int i, j;
+
+	bh = __getblk(dev, block, bufsize);
+	if (buffer_uptodate(bh))
+		return (bh);
+
+	if (block + BUFNR > max_block) {
+		blocks = max_block - block;
+	}
+	bhlist[0] = bh;
+	j = 1;
+	for (i = 1; i < blocks; i++) {
+		bh = __getblk(dev, block + i, bufsize);
+		if (buffer_uptodate(bh)) {
+			brelse(bh);
+			break;
+		} else
+			bhlist[j++] = bh;
+	}
+	ll_rw_block(READ, j, bhlist);
+	for (i = 1; i < j; i++)
+		brelse(bhlist[i]);
+	bh = bhlist[0];
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return bh;
+	brelse(bh);
+	return NULL;
+}
+
+/*
+ * read and replay the log
+ * on a clean unmount, the journal header's next unflushed pointer will be
+ * to an invalid transaction.  This tests that before finding all the
+ * transactions in the log, which makes normal mount times fast.
+ *
+ * After a crash, this starts with the next unflushed transaction, and
+ * replays until it finds one too old, or invalid.
+ *
+ * On exit, it sets things up so the first transaction will work correctly.
+ * NOTE: only called during fs mount
+ */
+static int journal_read(struct super_block *sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_desc *desc;
+	unsigned int oldest_trans_id = 0;
+	unsigned int oldest_invalid_trans_id = 0;
+	time_t start;
+	unsigned long oldest_start = 0;
+	unsigned long cur_dblock = 0;
+	unsigned long newest_mount_id = 9;
+	struct buffer_head *d_bh;
+	struct reiserfs_journal_header *jh;
+	int valid_journal_header = 0;
+	int replay_count = 0;
+	int continue_replay = 1;
+	int ret;
+	char b[BDEVNAME_SIZE];
+
+	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+	reiserfs_info(sb, "checking transaction log (%s)\n",
+		      bdevname(journal->j_dev_bd, b));
+	start = get_seconds();
+
+	/*
+	 * step 1, read in the journal header block.  Check the transaction
+	 * it says is the first unflushed, and if that transaction is not
+	 * valid, replay is done
+	 */
+	journal->j_header_bh = journal_bread(sb,
+					     SB_ONDISK_JOURNAL_1st_BLOCK(sb)
+					     + SB_ONDISK_JOURNAL_SIZE(sb));
+	if (!journal->j_header_bh) {
+		return 1;
+	}
+	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
+	if (le32_to_cpu(jh->j_first_unflushed_offset) <
+	    SB_ONDISK_JOURNAL_SIZE(sb)
+	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
+		oldest_start =
+		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+		    le32_to_cpu(jh->j_first_unflushed_offset);
+		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
+		newest_mount_id = le32_to_cpu(jh->j_mount_id);
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal-1153: found in "
+			       "header: first_unflushed_offset %d, last_flushed_trans_id "
+			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
+			       le32_to_cpu(jh->j_last_flush_trans_id));
+		valid_journal_header = 1;
+
+		/*
+		 * now, we try to read the first unflushed offset.  If it
+		 * is not valid, there is nothing more we can do, and it
+		 * makes no sense to read through the whole log.
+		 */
+		d_bh =
+		    journal_bread(sb,
+				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				  le32_to_cpu(jh->j_first_unflushed_offset));
+		ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
+		if (!ret) {
+			continue_replay = 0;
+		}
+		brelse(d_bh);
+		goto start_log_replay;
+	}
+
+	/*
+	 * ok, there are transactions that need to be replayed.  start
+	 * with the first log block, find all the valid transactions, and
+	 * pick out the oldest.
+	 */
+	while (continue_replay
+	       && cur_dblock <
+	       (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+		SB_ONDISK_JOURNAL_SIZE(sb))) {
+		/*
+		 * Note that it is required for blocksize of primary fs
+		 * device and journal device to be the same
+		 */
+		d_bh =
+		    reiserfs_breada(journal->j_dev_bd, cur_dblock,
+				    sb->s_blocksize,
+				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				    SB_ONDISK_JOURNAL_SIZE(sb));
+		ret =
+		    journal_transaction_is_valid(sb, d_bh,
+						 &oldest_invalid_trans_id,
+						 &newest_mount_id);
+		if (ret == 1) {
+			desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+			if (oldest_start == 0) {	/* init all oldest_ values */
+				oldest_trans_id = get_desc_trans_id(desc);
+				oldest_start = d_bh->b_blocknr;
+				newest_mount_id = get_desc_mount_id(desc);
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+					       "journal-1179: Setting "
+					       "oldest_start to offset %llu, trans_id %lu",
+					       oldest_start -
+					       SB_ONDISK_JOURNAL_1st_BLOCK
+					       (sb), oldest_trans_id);
+			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
+				/* one we just read was older */
+				oldest_trans_id = get_desc_trans_id(desc);
+				oldest_start = d_bh->b_blocknr;
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+					       "journal-1180: Resetting "
+					       "oldest_start to offset %lu, trans_id %lu",
+					       oldest_start -
+					       SB_ONDISK_JOURNAL_1st_BLOCK
+					       (sb), oldest_trans_id);
+			}
+			if (newest_mount_id < get_desc_mount_id(desc)) {
+				newest_mount_id = get_desc_mount_id(desc);
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+					       "journal-1299: Setting "
+					       "newest_mount_id to %d",
+					       get_desc_mount_id(desc));
+			}
+			cur_dblock += get_desc_trans_len(desc) + 2;
+		} else {
+			cur_dblock++;
+		}
+		brelse(d_bh);
+	}
+
+start_log_replay:
+	cur_dblock = oldest_start;
+	if (oldest_trans_id) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal-1206: Starting replay "
+			       "from offset %llu, trans_id %lu",
+			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+			       oldest_trans_id);
+
+	}
+	replay_count = 0;
+	while (continue_replay && oldest_trans_id > 0) {
+		ret =
+		    journal_read_transaction(sb, cur_dblock, oldest_start,
+					     oldest_trans_id, newest_mount_id);
+		if (ret < 0) {
+			return ret;
+		} else if (ret != 0) {
+			break;
+		}
+		cur_dblock =
+		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
+		replay_count++;
+		if (cur_dblock == oldest_start)
+			break;
+	}
+
+	if (oldest_trans_id == 0) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal-1225: No valid " "transactions found");
+	}
+	/*
+	 * j_start does not get set correctly if we don't replay any
+	 * transactions.  if we had a valid journal_header, set j_start
+	 * to the first unflushed transaction value, copy the trans_id
+	 * from the header
+	 */
+	if (valid_journal_header && replay_count == 0) {
+		journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
+		journal->j_trans_id =
+		    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
+		/* check for trans_id overflow */
+		if (journal->j_trans_id == 0)
+			journal->j_trans_id = 10;
+		journal->j_last_flush_trans_id =
+		    le32_to_cpu(jh->j_last_flush_trans_id);
+		journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
+	} else {
+		journal->j_mount_id = newest_mount_id + 1;
+	}
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
+		       "newest_mount_id to %lu", journal->j_mount_id);
+	journal->j_first_unflushed_offset = journal->j_start;
+	if (replay_count > 0) {
+		reiserfs_info(sb,
+			      "replayed %d transactions in %lu seconds\n",
+			      replay_count, get_seconds() - start);
+	}
+	/* needed to satisfy the locking in _update_journal_header_block */
+	reiserfs_write_lock(sb);
+	if (!bdev_read_only(sb->s_bdev) &&
+	    _update_journal_header_block(sb, journal->j_start,
+					 journal->j_last_flush_trans_id)) {
+		reiserfs_write_unlock(sb);
+		/*
+		 * replay failed, caller must call free_journal_ram and abort
+		 * the mount
+		 */
+		return -1;
+	}
+	reiserfs_write_unlock(sb);
+	return 0;
+}
+
+static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
+{
+	struct reiserfs_journal_list *jl;
+	jl = kzalloc(sizeof(struct reiserfs_journal_list),
+		     GFP_NOFS | __GFP_NOFAIL);
+	INIT_LIST_HEAD(&jl->j_list);
+	INIT_LIST_HEAD(&jl->j_working_list);
+	INIT_LIST_HEAD(&jl->j_tail_bh_list);
+	INIT_LIST_HEAD(&jl->j_bh_list);
+	mutex_init(&jl->j_commit_mutex);
+	SB_JOURNAL(s)->j_num_lists++;
+	get_journal_list(jl);
+	return jl;
+}
+
+static void journal_list_init(struct super_block *sb)
+{
+	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
+}
+
+static void release_journal_dev(struct super_block *super,
+			       struct reiserfs_journal *journal)
+{
+	if (journal->j_dev_bd != NULL) {
+		blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
+		journal->j_dev_bd = NULL;
+	}
+}
+
+static int journal_init_dev(struct super_block *super,
+			    struct reiserfs_journal *journal,
+			    const char *jdev_name)
+{
+	int result;
+	dev_t jdev;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
+	char b[BDEVNAME_SIZE];
+
+	result = 0;
+
+	journal->j_dev_bd = NULL;
+	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
+	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
+
+	if (bdev_read_only(super->s_bdev))
+		blkdev_mode = FMODE_READ;
+
+	/* there is no "jdev" option and journal is on separate device */
+	if ((!jdev_name || !jdev_name[0])) {
+		if (jdev == super->s_dev)
+			blkdev_mode &= ~FMODE_EXCL;
+		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
+						      journal);
+		journal->j_dev_mode = blkdev_mode;
+		if (IS_ERR(journal->j_dev_bd)) {
+			result = PTR_ERR(journal->j_dev_bd);
+			journal->j_dev_bd = NULL;
+			reiserfs_warning(super, "sh-458",
+					 "cannot init journal device '%s': %i",
+					 __bdevname(jdev, b), result);
+			return result;
+		} else if (jdev != super->s_dev)
+			set_blocksize(journal->j_dev_bd, super->s_blocksize);
+
+		return 0;
+	}
+
+	journal->j_dev_mode = blkdev_mode;
+	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
+	if (IS_ERR(journal->j_dev_bd)) {
+		result = PTR_ERR(journal->j_dev_bd);
+		journal->j_dev_bd = NULL;
+		reiserfs_warning(super,
+				 "journal_init_dev: Cannot open '%s': %i",
+				 jdev_name, result);
+		return result;
+	}
+
+	set_blocksize(journal->j_dev_bd, super->s_blocksize);
+	reiserfs_info(super,
+		      "journal_init_dev: journal device: %s\n",
+		      bdevname(journal->j_dev_bd, b));
+	return 0;
+}
+
+/*
+ * When creating/tuning a file system user can assign some
+ * journal params within boundaries which depend on the ratio
+ * blocksize/standard_blocksize.
+ *
+ * For blocks >= standard_blocksize transaction size should
+ * be not less then JOURNAL_TRANS_MIN_DEFAULT, and not more
+ * then JOURNAL_TRANS_MAX_DEFAULT.
+ *
+ * For blocks < standard_blocksize these boundaries should be
+ * decreased proportionally.
+ */
+#define REISERFS_STANDARD_BLKSIZE (4096)
+
+static int check_advise_trans_params(struct super_block *sb,
+				     struct reiserfs_journal *journal)
+{
+        if (journal->j_trans_max) {
+		/* Non-default journal params.  Do sanity check for them. */
+	        int ratio = 1;
+		if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
+		        ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
+
+		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
+		    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
+		    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
+		    JOURNAL_MIN_RATIO) {
+			reiserfs_warning(sb, "sh-462",
+					 "bad transaction max size (%u). "
+					 "FSCK?", journal->j_trans_max);
+			return 1;
+		}
+		if (journal->j_max_batch != (journal->j_trans_max) *
+		        JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
+			reiserfs_warning(sb, "sh-463",
+					 "bad transaction max batch (%u). "
+					 "FSCK?", journal->j_max_batch);
+			return 1;
+		}
+	} else {
+		/*
+		 * Default journal params.
+		 * The file system was created by old version
+		 * of mkreiserfs, so some fields contain zeros,
+		 * and we need to advise proper values for them
+		 */
+		if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
+			reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
+					 sb->s_blocksize);
+			return 1;
+		}
+		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
+		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
+		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
+	}
+	return 0;
+}
+
+/* must be called once on fs mount.  calls journal_read for you */
+int journal_init(struct super_block *sb, const char *j_dev_name,
+		 int old_format, unsigned int commit_max_age)
+{
+	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
+	struct buffer_head *bhjh;
+	struct reiserfs_super_block *rs;
+	struct reiserfs_journal_header *jh;
+	struct reiserfs_journal *journal;
+	struct reiserfs_journal_list *jl;
+	char b[BDEVNAME_SIZE];
+	int ret;
+
+	journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
+	if (!journal) {
+		reiserfs_warning(sb, "journal-1256",
+				 "unable to get memory for journal structure");
+		return 1;
+	}
+	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
+	INIT_LIST_HEAD(&journal->j_prealloc_list);
+	INIT_LIST_HEAD(&journal->j_working_list);
+	INIT_LIST_HEAD(&journal->j_journal_list);
+	journal->j_persistent_trans = 0;
+	if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
+					   reiserfs_bmap_count(sb)))
+		goto free_and_return;
+
+	allocate_bitmap_nodes(sb);
+
+	/* reserved for journal area support */
+	SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
+						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
+						 / sb->s_blocksize +
+						 reiserfs_bmap_count(sb) +
+						 1 :
+						 REISERFS_DISK_OFFSET_IN_BYTES /
+						 sb->s_blocksize + 2);
+
+	/*
+	 * Sanity check to see is the standard journal fitting
+	 * within first bitmap (actual for small blocksizes)
+	 */
+	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
+	    (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
+	     SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
+		reiserfs_warning(sb, "journal-1393",
+				 "journal does not fit for area addressed "
+				 "by first of bitmap blocks. It starts at "
+				 "%u and its size is %u. Block size %ld",
+				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
+				 SB_ONDISK_JOURNAL_SIZE(sb),
+				 sb->s_blocksize);
+		goto free_and_return;
+	}
+
+	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
+		reiserfs_warning(sb, "sh-462",
+				 "unable to initialize journal device");
+		goto free_and_return;
+	}
+
+	rs = SB_DISK_SUPER_BLOCK(sb);
+
+	/* read journal header */
+	bhjh = journal_bread(sb,
+			     SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			     SB_ONDISK_JOURNAL_SIZE(sb));
+	if (!bhjh) {
+		reiserfs_warning(sb, "sh-459",
+				 "unable to read journal header");
+		goto free_and_return;
+	}
+	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
+
+	/* make sure that journal matches to the super block */
+	if (is_reiserfs_jr(rs)
+	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
+		sb_jp_journal_magic(rs))) {
+		reiserfs_warning(sb, "sh-460",
+				 "journal header magic %x (device %s) does "
+				 "not match to magic found in super block %x",
+				 jh->jh_journal.jp_journal_magic,
+				 bdevname(journal->j_dev_bd, b),
+				 sb_jp_journal_magic(rs));
+		brelse(bhjh);
+		goto free_and_return;
+	}
+
+	journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
+	journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
+	journal->j_max_commit_age =
+	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
+	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
+
+	if (check_advise_trans_params(sb, journal) != 0)
+	        goto free_and_return;
+	journal->j_default_max_commit_age = journal->j_max_commit_age;
+
+	if (commit_max_age != 0) {
+		journal->j_max_commit_age = commit_max_age;
+		journal->j_max_trans_age = commit_max_age;
+	}
+
+	reiserfs_info(sb, "journal params: device %s, size %u, "
+		      "journal first block %u, max trans len %u, max batch %u, "
+		      "max commit age %u, max trans age %u\n",
+		      bdevname(journal->j_dev_bd, b),
+		      SB_ONDISK_JOURNAL_SIZE(sb),
+		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+		      journal->j_trans_max,
+		      journal->j_max_batch,
+		      journal->j_max_commit_age, journal->j_max_trans_age);
+
+	brelse(bhjh);
+
+	journal->j_list_bitmap_index = 0;
+	journal_list_init(sb);
+
+	memset(journal->j_list_hash_table, 0,
+	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
+
+	INIT_LIST_HEAD(&journal->j_dirty_buffers);
+	spin_lock_init(&journal->j_dirty_buffers_lock);
+
+	journal->j_start = 0;
+	journal->j_len = 0;
+	journal->j_len_alloc = 0;
+	atomic_set(&journal->j_wcount, 0);
+	atomic_set(&journal->j_async_throttle, 0);
+	journal->j_bcount = 0;
+	journal->j_trans_start_time = 0;
+	journal->j_last = NULL;
+	journal->j_first = NULL;
+	init_waitqueue_head(&journal->j_join_wait);
+	mutex_init(&journal->j_mutex);
+	mutex_init(&journal->j_flush_mutex);
+
+	journal->j_trans_id = 10;
+	journal->j_mount_id = 10;
+	journal->j_state = 0;
+	atomic_set(&journal->j_jlock, 0);
+	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
+	journal->j_cnode_free_orig = journal->j_cnode_free_list;
+	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
+	journal->j_cnode_used = 0;
+	journal->j_must_wait = 0;
+
+	if (journal->j_cnode_free == 0) {
+		reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
+		                 "allocation failed (%ld bytes). Journal is "
+		                 "too large for available memory. Usually "
+		                 "this is due to a journal that is too large.",
+		                 sizeof (struct reiserfs_journal_cnode) * num_cnodes);
+        	goto free_and_return;
+	}
+
+	init_journal_hash(sb);
+	jl = journal->j_current_jl;
+
+	/*
+	 * get_list_bitmap() may call flush_commit_list() which
+	 * requires the lock. Calling flush_commit_list() shouldn't happen
+	 * this early but I like to be paranoid.
+	 */
+	reiserfs_write_lock(sb);
+	jl->j_list_bitmap = get_list_bitmap(sb, jl);
+	reiserfs_write_unlock(sb);
+	if (!jl->j_list_bitmap) {
+		reiserfs_warning(sb, "journal-2005",
+				 "get_list_bitmap failed for journal list 0");
+		goto free_and_return;
+	}
+
+	ret = journal_read(sb);
+	if (ret < 0) {
+		reiserfs_warning(sb, "reiserfs-2006",
+				 "Replay Failure, unable to mount");
+		goto free_and_return;
+	}
+
+	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
+	journal->j_work_sb = sb;
+	return 0;
+free_and_return:
+	free_journal_ram(sb);
+	return 1;
+}
+
+/*
+ * test for a polite end of the current transaction.  Used by file_write,
+ * and should be used by delete to make sure they don't write more than
+ * can fit inside a single transaction
+ */
+int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
+				   int new_alloc)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
+	time_t now = get_seconds();
+	/* cannot restart while nested */
+	BUG_ON(!th->t_trans_id);
+	if (th->t_refcount > 1)
+		return 0;
+	if (journal->j_must_wait > 0 ||
+	    (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
+	    atomic_read(&journal->j_jlock) ||
+	    (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
+	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
+		return 1;
+	}
+
+	journal->j_len_alloc += new_alloc;
+	th->t_blocks_allocated += new_alloc ;
+	return 0;
+}
+
+/* this must be called inside a transaction */
+void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
+	BUG_ON(!th->t_trans_id);
+	journal->j_must_wait = 1;
+	set_bit(J_WRITERS_BLOCKED, &journal->j_state);
+	return;
+}
+
+/* this must be called without a transaction started */
+void reiserfs_allow_writes(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
+	wake_up(&journal->j_join_wait);
+}
+
+/* this must be called without a transaction started */
+void reiserfs_wait_on_write_block(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	wait_event(journal->j_join_wait,
+		   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
+}
+
+static void queue_log_writer(struct super_block *s)
+{
+	wait_queue_t wait;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	set_bit(J_WRITERS_QUEUED, &journal->j_state);
+
+	/*
+	 * we don't want to use wait_event here because
+	 * we only want to wait once.
+	 */
+	init_waitqueue_entry(&wait, current);
+	add_wait_queue(&journal->j_join_wait, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
+		int depth = reiserfs_write_unlock_nested(s);
+		schedule();
+		reiserfs_write_lock_nested(s, depth);
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&journal->j_join_wait, &wait);
+}
+
+static void wake_queued_writers(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
+		wake_up(&journal->j_join_wait);
+}
+
+static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	unsigned long bcount = journal->j_bcount;
+	while (1) {
+		int depth;
+
+		depth = reiserfs_write_unlock_nested(sb);
+		schedule_timeout_uninterruptible(1);
+		reiserfs_write_lock_nested(sb, depth);
+
+		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
+		while ((atomic_read(&journal->j_wcount) > 0 ||
+			atomic_read(&journal->j_jlock)) &&
+		       journal->j_trans_id == trans_id) {
+			queue_log_writer(sb);
+		}
+		if (journal->j_trans_id != trans_id)
+			break;
+		if (bcount == journal->j_bcount)
+			break;
+		bcount = journal->j_bcount;
+	}
+}
+
+/*
+ * join == true if you must join an existing transaction.
+ * join == false if you can deal with waiting for others to finish
+ *
+ * this will block until the transaction is joinable.  send the number of
+ * blocks you expect to use in nblocks.
+*/
+static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
+			      struct super_block *sb, unsigned long nblocks,
+			      int join)
+{
+	time_t now = get_seconds();
+	unsigned int old_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_transaction_handle myth;
+	int sched_count = 0;
+	int retval;
+	int depth;
+
+	reiserfs_check_lock_depth(sb, "journal_begin");
+	BUG_ON(nblocks > journal->j_trans_max);
+
+	PROC_INFO_INC(sb, journal.journal_being);
+	/* set here for journal_join */
+	th->t_refcount = 1;
+	th->t_super = sb;
+
+relock:
+	lock_journal(sb);
+	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
+		unlock_journal(sb);
+		retval = journal->j_errno;
+		goto out_fail;
+	}
+	journal->j_bcount++;
+
+	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
+		unlock_journal(sb);
+		depth = reiserfs_write_unlock_nested(sb);
+		reiserfs_wait_on_write_block(sb);
+		reiserfs_write_lock_nested(sb, depth);
+		PROC_INFO_INC(sb, journal.journal_relock_writers);
+		goto relock;
+	}
+	now = get_seconds();
+
+	/*
+	 * if there is no room in the journal OR
+	 * if this transaction is too old, and we weren't called joinable,
+	 * wait for it to finish before beginning we don't sleep if there
+	 * aren't other writers
+	 */
+
+	if ((!join && journal->j_must_wait > 0) ||
+	    (!join
+	     && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
+	    || (!join && atomic_read(&journal->j_wcount) > 0
+		&& journal->j_trans_start_time > 0
+		&& (now - journal->j_trans_start_time) >
+		journal->j_max_trans_age) || (!join
+					      && atomic_read(&journal->j_jlock))
+	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
+
+		old_trans_id = journal->j_trans_id;
+		/* allow others to finish this transaction */
+		unlock_journal(sb);
+
+		if (!join && (journal->j_len_alloc + nblocks + 2) >=
+		    journal->j_max_batch &&
+		    ((journal->j_len + nblocks + 2) * 100) <
+		    (journal->j_len_alloc * 75)) {
+			if (atomic_read(&journal->j_wcount) > 10) {
+				sched_count++;
+				queue_log_writer(sb);
+				goto relock;
+			}
+		}
+		/*
+		 * don't mess with joining the transaction if all we
+		 * have to do is wait for someone else to do a commit
+		 */
+		if (atomic_read(&journal->j_jlock)) {
+			while (journal->j_trans_id == old_trans_id &&
+			       atomic_read(&journal->j_jlock)) {
+				queue_log_writer(sb);
+			}
+			goto relock;
+		}
+		retval = journal_join(&myth, sb);
+		if (retval)
+			goto out_fail;
+
+		/* someone might have ended the transaction while we joined */
+		if (old_trans_id != journal->j_trans_id) {
+			retval = do_journal_end(&myth, 0);
+		} else {
+			retval = do_journal_end(&myth, COMMIT_NOW);
+		}
+
+		if (retval)
+			goto out_fail;
+
+		PROC_INFO_INC(sb, journal.journal_relock_wcount);
+		goto relock;
+	}
+	/* we are the first writer, set trans_id */
+	if (journal->j_trans_start_time == 0) {
+		journal->j_trans_start_time = get_seconds();
+	}
+	atomic_inc(&journal->j_wcount);
+	journal->j_len_alloc += nblocks;
+	th->t_blocks_logged = 0;
+	th->t_blocks_allocated = nblocks;
+	th->t_trans_id = journal->j_trans_id;
+	unlock_journal(sb);
+	INIT_LIST_HEAD(&th->t_list);
+	return 0;
+
+out_fail:
+	memset(th, 0, sizeof(*th));
+	/*
+	 * Re-set th->t_super, so we can properly keep track of how many
+	 * persistent transactions there are. We need to do this so if this
+	 * call is part of a failed restart_transaction, we can free it later
+	 */
+	th->t_super = sb;
+	return retval;
+}
+
+struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
+								    super_block
+								    *s,
+								    int nblocks)
+{
+	int ret;
+	struct reiserfs_transaction_handle *th;
+
+	/*
+	 * if we're nesting into an existing transaction.  It will be
+	 * persistent on its own
+	 */
+	if (reiserfs_transaction_running(s)) {
+		th = current->journal_info;
+		th->t_refcount++;
+		BUG_ON(th->t_refcount < 2);
+
+		return th;
+	}
+	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
+	if (!th)
+		return NULL;
+	ret = journal_begin(th, s, nblocks);
+	if (ret) {
+		kfree(th);
+		return NULL;
+	}
+
+	SB_JOURNAL(s)->j_persistent_trans++;
+	return th;
+}
+
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
+{
+	struct super_block *s = th->t_super;
+	int ret = 0;
+	if (th->t_trans_id)
+		ret = journal_end(th);
+	else
+		ret = -EIO;
+	if (th->t_refcount == 0) {
+		SB_JOURNAL(s)->j_persistent_trans--;
+		kfree(th);
+	}
+	return ret;
+}
+
+static int journal_join(struct reiserfs_transaction_handle *th,
+			struct super_block *sb)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+	/*
+	 * this keeps do_journal_end from NULLing out the
+	 * current->journal_info pointer
+	 */
+	th->t_handle_save = cur_th;
+	BUG_ON(cur_th && cur_th->t_refcount > 1);
+	return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
+}
+
+int journal_join_abort(struct reiserfs_transaction_handle *th,
+		       struct super_block *sb)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+	/*
+	 * this keeps do_journal_end from NULLing out the
+	 * current->journal_info pointer
+	 */
+	th->t_handle_save = cur_th;
+	BUG_ON(cur_th && cur_th->t_refcount > 1);
+	return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
+}
+
+int journal_begin(struct reiserfs_transaction_handle *th,
+		  struct super_block *sb, unsigned long nblocks)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+	int ret;
+
+	th->t_handle_save = NULL;
+	if (cur_th) {
+		/* we are nesting into the current transaction */
+		if (cur_th->t_super == sb) {
+			BUG_ON(!cur_th->t_refcount);
+			cur_th->t_refcount++;
+			memcpy(th, cur_th, sizeof(*th));
+			if (th->t_refcount <= 1)
+				reiserfs_warning(sb, "reiserfs-2005",
+						 "BAD: refcount <= 1, but "
+						 "journal_info != 0");
+			return 0;
+		} else {
+			/*
+			 * we've ended up with a handle from a different
+			 * filesystem.  save it and restore on journal_end.
+			 * This should never really happen...
+			 */
+			reiserfs_warning(sb, "clm-2100",
+					 "nesting info a different FS");
+			th->t_handle_save = current->journal_info;
+			current->journal_info = th;
+		}
+	} else {
+		current->journal_info = th;
+	}
+	ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
+	BUG_ON(current->journal_info != th);
+
+	/*
+	 * I guess this boils down to being the reciprocal of clm-2100 above.
+	 * If do_journal_begin_r fails, we need to put it back, since
+	 * journal_end won't be called to do it. */
+	if (ret)
+		current->journal_info = th->t_handle_save;
+	else
+		BUG_ON(!th->t_refcount);
+
+	return ret;
+}
+
+/*
+ * puts bh into the current transaction.  If it was already there, reorders
+ * removes the old pointers from the hash, and puts new ones in (to make
+ * sure replay happen in the right order).
+ *
+ * if it was dirty, cleans and files onto the clean list.  I can't let it
+ * be dirty again until the transaction is committed.
+ *
+ * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
+ */
+int journal_mark_dirty(struct reiserfs_transaction_handle *th,
+		       struct buffer_head *bh)
+{
+	struct super_block *sb = th->t_super;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_cnode *cn = NULL;
+	int count_already_incd = 0;
+	int prepared = 0;
+	BUG_ON(!th->t_trans_id);
+
+	PROC_INFO_INC(sb, journal.mark_dirty);
+	if (th->t_trans_id != journal->j_trans_id) {
+		reiserfs_panic(th->t_super, "journal-1577",
+			       "handle trans id %ld != current trans id %ld",
+			       th->t_trans_id, journal->j_trans_id);
+	}
+
+	prepared = test_clear_buffer_journal_prepared(bh);
+	clear_buffer_journal_restore_dirty(bh);
+	/* already in this transaction, we are done */
+	if (buffer_journaled(bh)) {
+		PROC_INFO_INC(sb, journal.mark_dirty_already);
+		return 0;
+	}
+
+	/*
+	 * this must be turned into a panic instead of a warning.  We can't
+	 * allow a dirty or journal_dirty or locked buffer to be logged, as
+	 * some changes could get to disk too early.  NOT GOOD.
+	 */
+	if (!prepared || buffer_dirty(bh)) {
+		reiserfs_warning(sb, "journal-1777",
+				 "buffer %llu bad state "
+				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
+				 (unsigned long long)bh->b_blocknr,
+				 prepared ? ' ' : '!',
+				 buffer_locked(bh) ? ' ' : '!',
+				 buffer_dirty(bh) ? ' ' : '!',
+				 buffer_journal_dirty(bh) ? ' ' : '!');
+	}
+
+	if (atomic_read(&journal->j_wcount) <= 0) {
+		reiserfs_warning(sb, "journal-1409",
+				 "returning because j_wcount was %d",
+				 atomic_read(&journal->j_wcount));
+		return 1;
+	}
+	/*
+	 * this error means I've screwed up, and we've overflowed
+	 * the transaction.  Nothing can be done here, except make the
+	 * FS readonly or panic.
+	 */
+	if (journal->j_len >= journal->j_trans_max) {
+		reiserfs_panic(th->t_super, "journal-1413",
+			       "j_len (%lu) is too big",
+			       journal->j_len);
+	}
+
+	if (buffer_journal_dirty(bh)) {
+		count_already_incd = 1;
+		PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
+		clear_buffer_journal_dirty(bh);
+	}
+
+	if (journal->j_len > journal->j_len_alloc) {
+		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
+	}
+
+	set_buffer_journaled(bh);
+
+	/* now put this guy on the end */
+	if (!cn) {
+		cn = get_cnode(sb);
+		if (!cn) {
+			reiserfs_panic(sb, "journal-4", "get_cnode failed!");
+		}
+
+		if (th->t_blocks_logged == th->t_blocks_allocated) {
+			th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
+			journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
+		}
+		th->t_blocks_logged++;
+		journal->j_len++;
+
+		cn->bh = bh;
+		cn->blocknr = bh->b_blocknr;
+		cn->sb = sb;
+		cn->jlist = NULL;
+		insert_journal_hash(journal->j_hash_table, cn);
+		if (!count_already_incd) {
+			get_bh(bh);
+		}
+	}
+	cn->next = NULL;
+	cn->prev = journal->j_last;
+	cn->bh = bh;
+	if (journal->j_last) {
+		journal->j_last->next = cn;
+		journal->j_last = cn;
+	} else {
+		journal->j_first = cn;
+		journal->j_last = cn;
+	}
+	reiserfs_schedule_old_flush(sb);
+	return 0;
+}
+
+int journal_end(struct reiserfs_transaction_handle *th)
+{
+	struct super_block *sb = th->t_super;
+	if (!current->journal_info && th->t_refcount > 1)
+		reiserfs_warning(sb, "REISER-NESTING",
+				 "th NULL, refcount %d", th->t_refcount);
+
+	if (!th->t_trans_id) {
+		WARN_ON(1);
+		return -EIO;
+	}
+
+	th->t_refcount--;
+	if (th->t_refcount > 0) {
+		struct reiserfs_transaction_handle *cur_th =
+		    current->journal_info;
+
+		/*
+		 * we aren't allowed to close a nested transaction on a
+		 * different filesystem from the one in the task struct
+		 */
+		BUG_ON(cur_th->t_super != th->t_super);
+
+		if (th != cur_th) {
+			memcpy(current->journal_info, th, sizeof(*th));
+			th->t_trans_id = 0;
+		}
+		return 0;
+	} else {
+		return do_journal_end(th, 0);
+	}
+}
+
+/*
+ * removes from the current transaction, relsing and descrementing any counters.
+ * also files the removed buffer directly onto the clean list
+ *
+ * called by journal_mark_freed when a block has been deleted
+ *
+ * returns 1 if it cleaned and relsed the buffer. 0 otherwise
+ */
+static int remove_from_transaction(struct super_block *sb,
+				   b_blocknr_t blocknr, int already_cleaned)
+{
+	struct buffer_head *bh;
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	int ret = 0;
+
+	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
+	if (!cn || !cn->bh) {
+		return ret;
+	}
+	bh = cn->bh;
+	if (cn->prev) {
+		cn->prev->next = cn->next;
+	}
+	if (cn->next) {
+		cn->next->prev = cn->prev;
+	}
+	if (cn == journal->j_first) {
+		journal->j_first = cn->next;
+	}
+	if (cn == journal->j_last) {
+		journal->j_last = cn->prev;
+	}
+	if (bh)
+		remove_journal_hash(sb, journal->j_hash_table, NULL,
+				    bh->b_blocknr, 0);
+	clear_buffer_journaled(bh);	/* don't log this one */
+
+	if (!already_cleaned) {
+		clear_buffer_journal_dirty(bh);
+		clear_buffer_dirty(bh);
+		clear_buffer_journal_test(bh);
+		put_bh(bh);
+		if (atomic_read(&bh->b_count) < 0) {
+			reiserfs_warning(sb, "journal-1752",
+					 "b_count < 0");
+		}
+		ret = 1;
+	}
+	journal->j_len--;
+	journal->j_len_alloc--;
+	free_cnode(sb, cn);
+	return ret;
+}
+
+/*
+ * for any cnode in a journal list, it can only be dirtied of all the
+ * transactions that include it are committed to disk.
+ * this checks through each transaction, and returns 1 if you are allowed
+ * to dirty, and 0 if you aren't
+ *
+ * it is called by dirty_journal_list, which is called after
+ * flush_commit_list has gotten all the log blocks for a given
+ * transaction on disk
+ *
+ */
+static int can_dirty(struct reiserfs_journal_cnode *cn)
+{
+	struct super_block *sb = cn->sb;
+	b_blocknr_t blocknr = cn->blocknr;
+	struct reiserfs_journal_cnode *cur = cn->hprev;
+	int can_dirty = 1;
+
+	/*
+	 * first test hprev.  These are all newer than cn, so any node here
+	 * with the same block number and dev means this node can't be sent
+	 * to disk right now.
+	 */
+	while (cur && can_dirty) {
+		if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
+		    cur->blocknr == blocknr) {
+			can_dirty = 0;
+		}
+		cur = cur->hprev;
+	}
+	/*
+	 * then test hnext.  These are all older than cn.  As long as they
+	 * are committed to the log, it is safe to write cn to disk
+	 */
+	cur = cn->hnext;
+	while (cur && can_dirty) {
+		if (cur->jlist && cur->jlist->j_len > 0 &&
+		    atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
+		    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
+			can_dirty = 0;
+		}
+		cur = cur->hnext;
+	}
+	return can_dirty;
+}
+
+/*
+ * syncs the commit blocks, but does not force the real buffers to disk
+ * will wait until the current transaction is done/committed before returning
+ */
+int journal_end_sync(struct reiserfs_transaction_handle *th)
+{
+	struct super_block *sb = th->t_super;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	BUG_ON(!th->t_trans_id);
+	/* you can sync while nested, very, very bad */
+	BUG_ON(th->t_refcount > 1);
+	if (journal->j_len == 0) {
+		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
+					     1);
+		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
+	}
+	return do_journal_end(th, COMMIT_NOW | WAIT);
+}
+
+/* writeback the pending async commits to disk */
+static void flush_async_commits(struct work_struct *work)
+{
+	struct reiserfs_journal *journal =
+		container_of(work, struct reiserfs_journal, j_work.work);
+	struct super_block *sb = journal->j_work_sb;
+	struct reiserfs_journal_list *jl;
+	struct list_head *entry;
+
+	reiserfs_write_lock(sb);
+	if (!list_empty(&journal->j_journal_list)) {
+		/* last entry is the youngest, commit it and you get everything */
+		entry = journal->j_journal_list.prev;
+		jl = JOURNAL_LIST_ENTRY(entry);
+		flush_commit_list(sb, jl, 1);
+	}
+	reiserfs_write_unlock(sb);
+}
+
+/*
+ * flushes any old transactions to disk
+ * ends the current transaction if it is too old
+ */
+void reiserfs_flush_old_commits(struct super_block *sb)
+{
+	time_t now;
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	now = get_seconds();
+	/*
+	 * safety check so we don't flush while we are replaying the log during
+	 * mount
+	 */
+	if (list_empty(&journal->j_journal_list))
+		return;
+
+	/*
+	 * check the current transaction.  If there are no writers, and it is
+	 * too old, finish it, and force the commit blocks to disk
+	 */
+	if (atomic_read(&journal->j_wcount) <= 0 &&
+	    journal->j_trans_start_time > 0 &&
+	    journal->j_len > 0 &&
+	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
+		if (!journal_join(&th, sb)) {
+			reiserfs_prepare_for_journal(sb,
+						     SB_BUFFER_WITH_SB(sb),
+						     1);
+			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
+
+			/*
+			 * we're only being called from kreiserfsd, it makes
+			 * no sense to do an async commit so that kreiserfsd
+			 * can do it later
+			 */
+			do_journal_end(&th, COMMIT_NOW | WAIT);
+		}
+	}
+}
+
+/*
+ * returns 0 if do_journal_end should return right away, returns 1 if
+ * do_journal_end should finish the commit
+ *
+ * if the current transaction is too old, but still has writers, this will
+ * wait on j_join_wait until all the writers are done.  By the time it
+ * wakes up, the transaction it was called has already ended, so it just
+ * flushes the commit list and returns 0.
+ *
+ * Won't batch when flush or commit_now is set.  Also won't batch when
+ * others are waiting on j_join_wait.
+ *
+ * Note, we can't allow the journal_end to proceed while there are still
+ * writers in the log.
+ */
+static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
+{
+
+	time_t now;
+	int flush = flags & FLUSH_ALL;
+	int commit_now = flags & COMMIT_NOW;
+	int wait_on_commit = flags & WAIT;
+	struct reiserfs_journal_list *jl;
+	struct super_block *sb = th->t_super;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	BUG_ON(!th->t_trans_id);
+
+	if (th->t_trans_id != journal->j_trans_id) {
+		reiserfs_panic(th->t_super, "journal-1577",
+			       "handle trans id %ld != current trans id %ld",
+			       th->t_trans_id, journal->j_trans_id);
+	}
+
+	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
+	/* <= 0 is allowed.  unmounting might not call begin */
+	if (atomic_read(&journal->j_wcount) > 0)
+		atomic_dec(&journal->j_wcount);
+
+	/*
+	 * BUG, deal with case where j_len is 0, but people previously
+	 * freed blocks need to be released will be dealt with by next
+	 * transaction that actually writes something, but should be taken
+	 * care of in this trans
+	 */
+	BUG_ON(journal->j_len == 0);
+
+	/*
+	 * if wcount > 0, and we are called to with flush or commit_now,
+	 * we wait on j_join_wait.  We will wake up when the last writer has
+	 * finished the transaction, and started it on its way to the disk.
+	 * Then, we flush the commit or journal list, and just return 0
+	 * because the rest of journal end was already done for this
+	 * transaction.
+	 */
+	if (atomic_read(&journal->j_wcount) > 0) {
+		if (flush || commit_now) {
+			unsigned trans_id;
+
+			jl = journal->j_current_jl;
+			trans_id = jl->j_trans_id;
+			if (wait_on_commit)
+				jl->j_state |= LIST_COMMIT_PENDING;
+			atomic_set(&journal->j_jlock, 1);
+			if (flush) {
+				journal->j_next_full_flush = 1;
+			}
+			unlock_journal(sb);
+
+			/*
+			 * sleep while the current transaction is
+			 * still j_jlocked
+			 */
+			while (journal->j_trans_id == trans_id) {
+				if (atomic_read(&journal->j_jlock)) {
+					queue_log_writer(sb);
+				} else {
+					lock_journal(sb);
+					if (journal->j_trans_id == trans_id) {
+						atomic_set(&journal->j_jlock,
+							   1);
+					}
+					unlock_journal(sb);
+				}
+			}
+			BUG_ON(journal->j_trans_id == trans_id);
+
+			if (commit_now
+			    && journal_list_still_alive(sb, trans_id)
+			    && wait_on_commit) {
+				flush_commit_list(sb, jl, 1);
+			}
+			return 0;
+		}
+		unlock_journal(sb);
+		return 0;
+	}
+
+	/* deal with old transactions where we are the last writers */
+	now = get_seconds();
+	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
+		commit_now = 1;
+		journal->j_next_async_flush = 1;
+	}
+	/* don't batch when someone is waiting on j_join_wait */
+	/* don't batch when syncing the commit or flushing the whole trans */
+	if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
+	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
+	    && journal->j_len_alloc < journal->j_max_batch
+	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
+		journal->j_bcount++;
+		unlock_journal(sb);
+		return 0;
+	}
+
+	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
+		reiserfs_panic(sb, "journal-003",
+			       "j_start (%ld) is too high",
+			       journal->j_start);
+	}
+	return 1;
+}
+
+/*
+ * Does all the work that makes deleting blocks safe.
+ * when deleting a block mark BH_JNew, just remove it from the current
+ * transaction, clean it's buffer_head and move on.
+ *
+ * otherwise:
+ * set a bit for the block in the journal bitmap.  That will prevent it from
+ * being allocated for unformatted nodes before this transaction has finished.
+ *
+ * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
+ * That will prevent any old transactions with this block from trying to flush
+ * to the real location.  Since we aren't removing the cnode from the
+ * journal_list_hash, *the block can't be reallocated yet.
+ *
+ * Then remove it from the current transaction, decrementing any counters and
+ * filing it on the clean list.
+ */
+int journal_mark_freed(struct reiserfs_transaction_handle *th,
+		       struct super_block *sb, b_blocknr_t blocknr)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_cnode *cn = NULL;
+	struct buffer_head *bh = NULL;
+	struct reiserfs_list_bitmap *jb = NULL;
+	int cleaned = 0;
+	BUG_ON(!th->t_trans_id);
+
+	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
+	if (cn && cn->bh) {
+		bh = cn->bh;
+		get_bh(bh);
+	}
+	/* if it is journal new, we just remove it from this transaction */
+	if (bh && buffer_journal_new(bh)) {
+		clear_buffer_journal_new(bh);
+		clear_prepared_bits(bh);
+		reiserfs_clean_and_file_buffer(bh);
+		cleaned = remove_from_transaction(sb, blocknr, cleaned);
+	} else {
+		/*
+		 * set the bit for this block in the journal bitmap
+		 * for this transaction
+		 */
+		jb = journal->j_current_jl->j_list_bitmap;
+		if (!jb) {
+			reiserfs_panic(sb, "journal-1702",
+				       "journal_list_bitmap is NULL");
+		}
+		set_bit_in_list_bitmap(sb, blocknr, jb);
+
+		/* Note, the entire while loop is not allowed to schedule.  */
+
+		if (bh) {
+			clear_prepared_bits(bh);
+			reiserfs_clean_and_file_buffer(bh);
+		}
+		cleaned = remove_from_transaction(sb, blocknr, cleaned);
+
+		/*
+		 * find all older transactions with this block,
+		 * make sure they don't try to write it out
+		 */
+		cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
+					  blocknr);
+		while (cn) {
+			if (sb == cn->sb && blocknr == cn->blocknr) {
+				set_bit(BLOCK_FREED, &cn->state);
+				if (cn->bh) {
+					/*
+					 * remove_from_transaction will brelse
+					 * the buffer if it was in the current
+					 * trans
+					 */
+					if (!cleaned) {
+						clear_buffer_journal_dirty(cn->
+									   bh);
+						clear_buffer_dirty(cn->bh);
+						clear_buffer_journal_test(cn->
+									  bh);
+						cleaned = 1;
+						put_bh(cn->bh);
+						if (atomic_read
+						    (&cn->bh->b_count) < 0) {
+							reiserfs_warning(sb,
+								 "journal-2138",
+								 "cn->bh->b_count < 0");
+						}
+					}
+					/*
+					 * since we are clearing the bh,
+					 * we MUST dec nonzerolen
+					 */
+					if (cn->jlist) {
+						atomic_dec(&cn->jlist->
+							   j_nonzerolen);
+					}
+					cn->bh = NULL;
+				}
+			}
+			cn = cn->hnext;
+		}
+	}
+
+	if (bh)
+		release_buffer_page(bh); /* get_hash grabs the buffer */
+	return 0;
+}
+
+void reiserfs_update_inode_transaction(struct inode *inode)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
+	REISERFS_I(inode)->i_jl = journal->j_current_jl;
+	REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
+}
+
+/*
+ * returns -1 on error, 0 if no commits/barriers were done and 1
+ * if a transaction was actually committed and the barrier was done
+ */
+static int __commit_trans_jl(struct inode *inode, unsigned long id,
+			     struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_transaction_handle th;
+	struct super_block *sb = inode->i_sb;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	int ret = 0;
+
+	/*
+	 * is it from the current transaction,
+	 * or from an unknown transaction?
+	 */
+	if (id == journal->j_trans_id) {
+		jl = journal->j_current_jl;
+		/*
+		 * try to let other writers come in and
+		 * grow this transaction
+		 */
+		let_transaction_grow(sb, id);
+		if (journal->j_trans_id != id) {
+			goto flush_commit_only;
+		}
+
+		ret = journal_begin(&th, sb, 1);
+		if (ret)
+			return ret;
+
+		/* someone might have ended this transaction while we joined */
+		if (journal->j_trans_id != id) {
+			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
+						     1);
+			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
+			ret = journal_end(&th);
+			goto flush_commit_only;
+		}
+
+		ret = journal_end_sync(&th);
+		if (!ret)
+			ret = 1;
+
+	} else {
+		/*
+		 * this gets tricky, we have to make sure the journal list in
+		 * the inode still exists.  We know the list is still around
+		 * if we've got a larger transaction id than the oldest list
+		 */
+flush_commit_only:
+		if (journal_list_still_alive(inode->i_sb, id)) {
+			/*
+			 * we only set ret to 1 when we know for sure
+			 * the barrier hasn't been started yet on the commit
+			 * block.
+			 */
+			if (atomic_read(&jl->j_commit_left) > 1)
+				ret = 1;
+			flush_commit_list(sb, jl, 1);
+			if (journal->j_errno)
+				ret = journal->j_errno;
+		}
+	}
+	/* otherwise the list is gone, and long since committed */
+	return ret;
+}
+
+int reiserfs_commit_for_inode(struct inode *inode)
+{
+	unsigned int id = REISERFS_I(inode)->i_trans_id;
+	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
+
+	/*
+	 * for the whole inode, assume unset id means it was
+	 * changed in the current transaction.  More conservative
+	 */
+	if (!id || !jl) {
+		reiserfs_update_inode_transaction(inode);
+		id = REISERFS_I(inode)->i_trans_id;
+		/* jl will be updated in __commit_trans_jl */
+	}
+
+	return __commit_trans_jl(inode, id, jl);
+}
+
+void reiserfs_restore_prepared_buffer(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	PROC_INFO_INC(sb, journal.restore_prepared);
+	if (!bh) {
+		return;
+	}
+	if (test_clear_buffer_journal_restore_dirty(bh) &&
+	    buffer_journal_dirty(bh)) {
+		struct reiserfs_journal_cnode *cn;
+		reiserfs_write_lock(sb);
+		cn = get_journal_hash_dev(sb,
+					  journal->j_list_hash_table,
+					  bh->b_blocknr);
+		if (cn && can_dirty(cn)) {
+			set_buffer_journal_test(bh);
+			mark_buffer_dirty(bh);
+		}
+		reiserfs_write_unlock(sb);
+	}
+	clear_buffer_journal_prepared(bh);
+}
+
+extern struct tree_balance *cur_tb;
+/*
+ * before we can change a metadata block, we have to make sure it won't
+ * be written to disk while we are altering it.  So, we must:
+ * clean it
+ * wait on it.
+ */
+int reiserfs_prepare_for_journal(struct super_block *sb,
+				 struct buffer_head *bh, int wait)
+{
+	PROC_INFO_INC(sb, journal.prepare);
+
+	if (!trylock_buffer(bh)) {
+		if (!wait)
+			return 0;
+		lock_buffer(bh);
+	}
+	set_buffer_journal_prepared(bh);
+	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
+		clear_buffer_journal_test(bh);
+		set_buffer_journal_restore_dirty(bh);
+	}
+	unlock_buffer(bh);
+	return 1;
+}
+
+/*
+ * long and ugly.  If flush, will not return until all commit
+ * blocks and all real buffers in the trans are on disk.
+ * If no_async, won't return until all commit blocks are on disk.
+ *
+ * keep reading, there are comments as you go along
+ *
+ * If the journal is aborted, we just clean up. Things like flushing
+ * journal lists, etc just won't happen.
+ */
+static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
+{
+	struct super_block *sb = th->t_super;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
+	struct reiserfs_journal_cnode *last_cn = NULL;
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	struct buffer_head *c_bh;	/* commit bh */
+	struct buffer_head *d_bh;	/* desc bh */
+	int cur_write_start = 0;	/* start index of current log write */
+	int old_start;
+	int i;
+	int flush;
+	int wait_on_commit;
+	struct reiserfs_journal_list *jl, *temp_jl;
+	struct list_head *entry, *safe;
+	unsigned long jindex;
+	unsigned int commit_trans_id;
+	int trans_half;
+	int depth;
+
+	BUG_ON(th->t_refcount > 1);
+	BUG_ON(!th->t_trans_id);
+	BUG_ON(!th->t_super);
+
+	/*
+	 * protect flush_older_commits from doing mistakes if the
+	 * transaction ID counter gets overflowed.
+	 */
+	if (th->t_trans_id == ~0U)
+		flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
+	flush = flags & FLUSH_ALL;
+	wait_on_commit = flags & WAIT;
+
+	current->journal_info = th->t_handle_save;
+	reiserfs_check_lock_depth(sb, "journal end");
+	if (journal->j_len == 0) {
+		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
+					     1);
+		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
+	}
+
+	lock_journal(sb);
+	if (journal->j_next_full_flush) {
+		flags |= FLUSH_ALL;
+		flush = 1;
+	}
+	if (journal->j_next_async_flush) {
+		flags |= COMMIT_NOW | WAIT;
+		wait_on_commit = 1;
+	}
+
+	/*
+	 * check_journal_end locks the journal, and unlocks if it does
+	 * not return 1 it tells us if we should continue with the
+	 * journal_end, or just return
+	 */
+	if (!check_journal_end(th, flags)) {
+		reiserfs_schedule_old_flush(sb);
+		wake_queued_writers(sb);
+		reiserfs_async_progress_wait(sb);
+		goto out;
+	}
+
+	/* check_journal_end might set these, check again */
+	if (journal->j_next_full_flush) {
+		flush = 1;
+	}
+
+	/*
+	 * j must wait means we have to flush the log blocks, and the
+	 * real blocks for this transaction
+	 */
+	if (journal->j_must_wait > 0) {
+		flush = 1;
+	}
+#ifdef REISERFS_PREALLOCATE
+	/*
+	 * quota ops might need to nest, setup the journal_info pointer
+	 * for them and raise the refcount so that it is > 0.
+	 */
+	current->journal_info = th;
+	th->t_refcount++;
+
+	/* it should not involve new blocks into the transaction */
+	reiserfs_discard_all_prealloc(th);
+
+	th->t_refcount--;
+	current->journal_info = th->t_handle_save;
+#endif
+
+	/* setup description block */
+	d_bh =
+	    journal_getblk(sb,
+			   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			   journal->j_start);
+	set_buffer_uptodate(d_bh);
+	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
+	memset(d_bh->b_data, 0, d_bh->b_size);
+	memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
+	set_desc_trans_id(desc, journal->j_trans_id);
+
+	/*
+	 * setup commit block.  Don't write (keep it clean too) this one
+	 * until after everyone else is written
+	 */
+	c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			      ((journal->j_start + journal->j_len +
+				1) % SB_ONDISK_JOURNAL_SIZE(sb)));
+	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+	memset(c_bh->b_data, 0, c_bh->b_size);
+	set_commit_trans_id(commit, journal->j_trans_id);
+	set_buffer_uptodate(c_bh);
+
+	/* init this journal list */
+	jl = journal->j_current_jl;
+
+	/*
+	 * we lock the commit before doing anything because
+	 * we want to make sure nobody tries to run flush_commit_list until
+	 * the new transaction is fully setup, and we've already flushed the
+	 * ordered bh list
+	 */
+	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
+
+	/* save the transaction id in case we need to commit it later */
+	commit_trans_id = jl->j_trans_id;
+
+	atomic_set(&jl->j_older_commits_done, 0);
+	jl->j_trans_id = journal->j_trans_id;
+	jl->j_timestamp = journal->j_trans_start_time;
+	jl->j_commit_bh = c_bh;
+	jl->j_start = journal->j_start;
+	jl->j_len = journal->j_len;
+	atomic_set(&jl->j_nonzerolen, journal->j_len);
+	atomic_set(&jl->j_commit_left, journal->j_len + 2);
+	jl->j_realblock = NULL;
+
+	/*
+	 * The ENTIRE FOR LOOP MUST not cause schedule to occur.
+	 * for each real block, add it to the journal list hash,
+	 * copy into real block index array in the commit or desc block
+	 */
+	trans_half = journal_trans_half(sb->s_blocksize);
+	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
+		if (buffer_journaled(cn->bh)) {
+			jl_cn = get_cnode(sb);
+			if (!jl_cn) {
+				reiserfs_panic(sb, "journal-1676",
+					       "get_cnode returned NULL");
+			}
+			if (i == 0) {
+				jl->j_realblock = jl_cn;
+			}
+			jl_cn->prev = last_cn;
+			jl_cn->next = NULL;
+			if (last_cn) {
+				last_cn->next = jl_cn;
+			}
+			last_cn = jl_cn;
+			/*
+			 * make sure the block we are trying to log
+			 * is not a block of journal or reserved area
+			 */
+			if (is_block_in_log_or_reserved_area
+			    (sb, cn->bh->b_blocknr)) {
+				reiserfs_panic(sb, "journal-2332",
+					       "Trying to log block %lu, "
+					       "which is a log block",
+					       cn->bh->b_blocknr);
+			}
+			jl_cn->blocknr = cn->bh->b_blocknr;
+			jl_cn->state = 0;
+			jl_cn->sb = sb;
+			jl_cn->bh = cn->bh;
+			jl_cn->jlist = jl;
+			insert_journal_hash(journal->j_list_hash_table, jl_cn);
+			if (i < trans_half) {
+				desc->j_realblock[i] =
+				    cpu_to_le32(cn->bh->b_blocknr);
+			} else {
+				commit->j_realblock[i - trans_half] =
+				    cpu_to_le32(cn->bh->b_blocknr);
+			}
+		} else {
+			i--;
+		}
+	}
+	set_desc_trans_len(desc, journal->j_len);
+	set_desc_mount_id(desc, journal->j_mount_id);
+	set_desc_trans_id(desc, journal->j_trans_id);
+	set_commit_trans_len(commit, journal->j_len);
+
+	/*
+	 * special check in case all buffers in the journal
+	 * were marked for not logging
+	 */
+	BUG_ON(journal->j_len == 0);
+
+	/*
+	 * we're about to dirty all the log blocks, mark the description block
+	 * dirty now too.  Don't mark the commit block dirty until all the
+	 * others are on disk
+	 */
+	mark_buffer_dirty(d_bh);
+
+	/*
+	 * first data block is j_start + 1, so add one to
+	 * cur_write_start wherever you use it
+	 */
+	cur_write_start = journal->j_start;
+	cn = journal->j_first;
+	jindex = 1;	/* start at one so we don't get the desc again */
+	while (cn) {
+		clear_buffer_journal_new(cn->bh);
+		/* copy all the real blocks into log area.  dirty log blocks */
+		if (buffer_journaled(cn->bh)) {
+			struct buffer_head *tmp_bh;
+			char *addr;
+			struct page *page;
+			tmp_bh =
+			    journal_getblk(sb,
+					   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+					   ((cur_write_start +
+					     jindex) %
+					    SB_ONDISK_JOURNAL_SIZE(sb)));
+			set_buffer_uptodate(tmp_bh);
+			page = cn->bh->b_page;
+			addr = kmap(page);
+			memcpy(tmp_bh->b_data,
+			       addr + offset_in_page(cn->bh->b_data),
+			       cn->bh->b_size);
+			kunmap(page);
+			mark_buffer_dirty(tmp_bh);
+			jindex++;
+			set_buffer_journal_dirty(cn->bh);
+			clear_buffer_journaled(cn->bh);
+		} else {
+			/*
+			 * JDirty cleared sometime during transaction.
+			 * don't log this one
+			 */
+			reiserfs_warning(sb, "journal-2048",
+					 "BAD, buffer in journal hash, "
+					 "but not JDirty!");
+			brelse(cn->bh);
+		}
+		next = cn->next;
+		free_cnode(sb, cn);
+		cn = next;
+		reiserfs_cond_resched(sb);
+	}
+
+	/*
+	 * we are done with both the c_bh and d_bh, but
+	 * c_bh must be written after all other commit blocks,
+	 * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
+	 */
+
+	journal->j_current_jl = alloc_journal_list(sb);
+
+	/* now it is safe to insert this transaction on the main list */
+	list_add_tail(&jl->j_list, &journal->j_journal_list);
+	list_add_tail(&jl->j_working_list, &journal->j_working_list);
+	journal->j_num_work_lists++;
+
+	/* reset journal values for the next transaction */
+	old_start = journal->j_start;
+	journal->j_start =
+	    (journal->j_start + journal->j_len +
+	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
+	atomic_set(&journal->j_wcount, 0);
+	journal->j_bcount = 0;
+	journal->j_last = NULL;
+	journal->j_first = NULL;
+	journal->j_len = 0;
+	journal->j_trans_start_time = 0;
+	/* check for trans_id overflow */
+	if (++journal->j_trans_id == 0)
+		journal->j_trans_id = 10;
+	journal->j_current_jl->j_trans_id = journal->j_trans_id;
+	journal->j_must_wait = 0;
+	journal->j_len_alloc = 0;
+	journal->j_next_full_flush = 0;
+	journal->j_next_async_flush = 0;
+	init_journal_hash(sb);
+
+	/*
+	 * make sure reiserfs_add_jh sees the new current_jl before we
+	 * write out the tails
+	 */
+	smp_mb();
+
+	/*
+	 * tail conversion targets have to hit the disk before we end the
+	 * transaction.  Otherwise a later transaction might repack the tail
+	 * before this transaction commits, leaving the data block unflushed
+	 * and clean, if we crash before the later transaction commits, the
+	 * data block is lost.
+	 */
+	if (!list_empty(&jl->j_tail_bh_list)) {
+		depth = reiserfs_write_unlock_nested(sb);
+		write_ordered_buffers(&journal->j_dirty_buffers_lock,
+				      journal, jl, &jl->j_tail_bh_list);
+		reiserfs_write_lock_nested(sb, depth);
+	}
+	BUG_ON(!list_empty(&jl->j_tail_bh_list));
+	mutex_unlock(&jl->j_commit_mutex);
+
+	/*
+	 * honor the flush wishes from the caller, simple commits can
+	 * be done outside the journal lock, they are done below
+	 *
+	 * if we don't flush the commit list right now, we put it into
+	 * the work queue so the people waiting on the async progress work
+	 * queue don't wait for this proc to flush journal lists and such.
+	 */
+	if (flush) {
+		flush_commit_list(sb, jl, 1);
+		flush_journal_list(sb, jl, 1);
+	} else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
+		/*
+		 * Avoid queueing work when sb is being shut down. Transaction
+		 * will be flushed on journal shutdown.
+		 */
+		if (sb->s_flags & MS_ACTIVE)
+			queue_delayed_work(REISERFS_SB(sb)->commit_wq,
+					   &journal->j_work, HZ / 10);
+	}
+
+	/*
+	 * if the next transaction has any chance of wrapping, flush
+	 * transactions that might get overwritten.  If any journal lists
+	 * are very old flush them as well.
+	 */
+first_jl:
+	list_for_each_safe(entry, safe, &journal->j_journal_list) {
+		temp_jl = JOURNAL_LIST_ENTRY(entry);
+		if (journal->j_start <= temp_jl->j_start) {
+			if ((journal->j_start + journal->j_trans_max + 1) >=
+			    temp_jl->j_start) {
+				flush_used_journal_lists(sb, temp_jl);
+				goto first_jl;
+			} else if ((journal->j_start +
+				    journal->j_trans_max + 1) <
+				   SB_ONDISK_JOURNAL_SIZE(sb)) {
+				/*
+				 * if we don't cross into the next
+				 * transaction and we don't wrap, there is
+				 * no way we can overlap any later transactions
+				 * break now
+				 */
+				break;
+			}
+		} else if ((journal->j_start +
+			    journal->j_trans_max + 1) >
+			   SB_ONDISK_JOURNAL_SIZE(sb)) {
+			if (((journal->j_start + journal->j_trans_max + 1) %
+			     SB_ONDISK_JOURNAL_SIZE(sb)) >=
+			    temp_jl->j_start) {
+				flush_used_journal_lists(sb, temp_jl);
+				goto first_jl;
+			} else {
+				/*
+				* we don't overlap anything from out start
+				* to the end of the log, and our wrapped
+				* portion doesn't overlap anything at
+				* the start of the log.  We can break
+				*/
+				break;
+			}
+		}
+	}
+
+	journal->j_current_jl->j_list_bitmap =
+	    get_list_bitmap(sb, journal->j_current_jl);
+
+	if (!(journal->j_current_jl->j_list_bitmap)) {
+		reiserfs_panic(sb, "journal-1996",
+			       "could not get a list bitmap");
+	}
+
+	atomic_set(&journal->j_jlock, 0);
+	unlock_journal(sb);
+	/* wake up any body waiting to join. */
+	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
+	wake_up(&journal->j_join_wait);
+
+	if (!flush && wait_on_commit &&
+	    journal_list_still_alive(sb, commit_trans_id)) {
+		flush_commit_list(sb, jl, 1);
+	}
+out:
+	reiserfs_check_lock_depth(sb, "journal end2");
+
+	memset(th, 0, sizeof(*th));
+	/*
+	 * Re-set th->t_super, so we can properly keep track of how many
+	 * persistent transactions there are. We need to do this so if this
+	 * call is part of a failed restart_transaction, we can free it later
+	 */
+	th->t_super = sb;
+
+	return journal->j_errno;
+}
+
+/* Send the file system read only and refuse new transactions */
+void reiserfs_abort_journal(struct super_block *sb, int errno)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	if (test_bit(J_ABORTED, &journal->j_state))
+		return;
+
+	if (!journal->j_errno)
+		journal->j_errno = errno;
+
+	sb->s_flags |= MS_RDONLY;
+	set_bit(J_ABORTED, &journal->j_state);
+
+#ifdef CONFIG_REISERFS_CHECK
+	dump_stack();
+#endif
+}
diff --git a/kernel/fs/reiserfs/lbalance.c b/kernel/fs/reiserfs/lbalance.c
new file mode 100644
index 000000000..249594a82
--- /dev/null
+++ b/kernel/fs/reiserfs/lbalance.c
@@ -0,0 +1,1427 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include "reiserfs.h"
+#include <linux/buffer_head.h>
+
+/*
+ * copy copy_count entries from source directory item to dest buffer
+ * (creating new item if needed)
+ */
+static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
+				  struct buffer_head *source, int last_first,
+				  int item_num, int from, int copy_count)
+{
+	struct buffer_head *dest = dest_bi->bi_bh;
+	/*
+	 * either the number of target item, or if we must create a
+	 * new item, the number of the item we will create it next to
+	 */
+	int item_num_in_dest;
+
+	struct item_head *ih;
+	struct reiserfs_de_head *deh;
+	int copy_records_len;	/* length of all records in item to be copied */
+	char *records;
+
+	ih = item_head(source, item_num);
+
+	RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
+
+	/*
+	 * length of all record to be copied and first byte of
+	 * the last of them
+	 */
+	deh = B_I_DEH(source, ih);
+	if (copy_count) {
+		copy_records_len = (from ? deh_location(&deh[from - 1]) :
+				    ih_item_len(ih)) -
+		    deh_location(&deh[from + copy_count - 1]);
+		records =
+		    source->b_data + ih_location(ih) +
+		    deh_location(&deh[from + copy_count - 1]);
+	} else {
+		copy_records_len = 0;
+		records = NULL;
+	}
+
+	/* when copy last to first, dest buffer can contain 0 items */
+	item_num_in_dest =
+	    (last_first ==
+	     LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
+							       - 1);
+
+	/*
+	 * if there are no items in dest or the first/last item in
+	 * dest is not item of the same directory
+	 */
+	if ((item_num_in_dest == -1) ||
+	    (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
+	    (last_first == LAST_TO_FIRST
+	     && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
+							 leaf_key(dest,
+								  item_num_in_dest))))
+	{
+		/* create new item in dest */
+		struct item_head new_ih;
+
+		/* form item header */
+		memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
+		put_ih_version(&new_ih, KEY_FORMAT_3_5);
+		/* calculate item len */
+		put_ih_item_len(&new_ih,
+				DEH_SIZE * copy_count + copy_records_len);
+		put_ih_entry_count(&new_ih, 0);
+
+		if (last_first == LAST_TO_FIRST) {
+			/* form key by the following way */
+			if (from < ih_entry_count(ih)) {
+				set_le_ih_k_offset(&new_ih,
+						   deh_offset(&deh[from]));
+			} else {
+				/*
+				 * no entries will be copied to this
+				 * item in this function
+				 */
+				set_le_ih_k_offset(&new_ih, U32_MAX);
+				/*
+				 * this item is not yet valid, but we
+				 * want I_IS_DIRECTORY_ITEM to return 1
+				 * for it, so we -1
+				 */
+			}
+			set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
+					  TYPE_DIRENTRY);
+		}
+
+		/* insert item into dest buffer */
+		leaf_insert_into_buf(dest_bi,
+				     (last_first ==
+				      LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest),
+				     &new_ih, NULL, 0);
+	} else {
+		/* prepare space for entries */
+		leaf_paste_in_buffer(dest_bi,
+				     (last_first ==
+				      FIRST_TO_LAST) ? (B_NR_ITEMS(dest) -
+							1) : 0, MAX_US_INT,
+				     DEH_SIZE * copy_count + copy_records_len,
+				     records, 0);
+	}
+
+	item_num_in_dest =
+	    (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
+
+	leaf_paste_entries(dest_bi, item_num_in_dest,
+			   (last_first ==
+			    FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
+									  item_num_in_dest))
+			   : 0, copy_count, deh + from, records,
+			   DEH_SIZE * copy_count + copy_records_len);
+}
+
+/*
+ * Copy the first (if last_first == FIRST_TO_LAST) or last
+ * (last_first == LAST_TO_FIRST) item or part of it or nothing
+ * (see the return 0 below) from SOURCE to the end (if last_first)
+ * or beginning (!last_first) of the DEST
+ */
+/* returns 1 if anything was copied, else 0 */
+static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
+				   struct buffer_head *src, int last_first,
+				   int bytes_or_entries)
+{
+	struct buffer_head *dest = dest_bi->bi_bh;
+	/* number of items in the source and destination buffers */
+	int dest_nr_item, src_nr_item;
+	struct item_head *ih;
+	struct item_head *dih;
+
+	dest_nr_item = B_NR_ITEMS(dest);
+
+	/*
+	 * if ( DEST is empty or first item of SOURCE and last item of
+	 * DEST are the items of different objects or of different types )
+	 * then there is no need to treat this item differently from the
+	 * other items that we copy, so we return
+	 */
+	if (last_first == FIRST_TO_LAST) {
+		ih = item_head(src, 0);
+		dih = item_head(dest, dest_nr_item - 1);
+
+		/* there is nothing to merge */
+		if (!dest_nr_item
+		    || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
+			return 0;
+
+		RFALSE(!ih_item_len(ih),
+		       "vs-10010: item can not have empty length");
+
+		if (is_direntry_le_ih(ih)) {
+			if (bytes_or_entries == -1)
+				/* copy all entries to dest */
+				bytes_or_entries = ih_entry_count(ih);
+			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0,
+					      bytes_or_entries);
+			return 1;
+		}
+
+		/*
+		 * copy part of the body of the first item of SOURCE
+		 * to the end of the body of the last item of the DEST
+		 * part defined by 'bytes_or_entries'; if bytes_or_entries
+		 * == -1 copy whole body; don't create new item header
+		 */
+		if (bytes_or_entries == -1)
+			bytes_or_entries = ih_item_len(ih);
+
+#ifdef CONFIG_REISERFS_CHECK
+		else {
+			if (bytes_or_entries == ih_item_len(ih)
+			    && is_indirect_le_ih(ih))
+				if (get_ih_free_space(ih))
+					reiserfs_panic(sb_from_bi(dest_bi),
+						       "vs-10020",
+						       "last unformatted node "
+						       "must be filled "
+						       "entirely (%h)", ih);
+		}
+#endif
+
+		/*
+		 * merge first item (or its part) of src buffer with the last
+		 * item of dest buffer. Both are of the same file
+		 */
+		leaf_paste_in_buffer(dest_bi,
+				     dest_nr_item - 1, ih_item_len(dih),
+				     bytes_or_entries, ih_item_body(src, ih), 0);
+
+		if (is_indirect_le_ih(dih)) {
+			RFALSE(get_ih_free_space(dih),
+			       "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
+			       ih);
+			if (bytes_or_entries == ih_item_len(ih))
+				set_ih_free_space(dih, get_ih_free_space(ih));
+		}
+
+		return 1;
+	}
+
+	/* copy boundary item to right (last_first == LAST_TO_FIRST) */
+
+	/*
+	 * (DEST is empty or last item of SOURCE and first item of DEST
+	 * are the items of different object or of different types)
+	 */
+	src_nr_item = B_NR_ITEMS(src);
+	ih = item_head(src, src_nr_item - 1);
+	dih = item_head(dest, 0);
+
+	if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
+		return 0;
+
+	if (is_direntry_le_ih(ih)) {
+		/*
+		 * bytes_or_entries = entries number in last
+		 * item body of SOURCE
+		 */
+		if (bytes_or_entries == -1)
+			bytes_or_entries = ih_entry_count(ih);
+
+		leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
+				      src_nr_item - 1,
+				      ih_entry_count(ih) - bytes_or_entries,
+				      bytes_or_entries);
+		return 1;
+	}
+
+	/*
+	 * copy part of the body of the last item of SOURCE to the
+	 * begin of the body of the first item of the DEST; part defined
+	 * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body;
+	 * change first item key of the DEST; don't create new item header
+	 */
+
+	RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
+	       "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
+	       ih);
+
+	if (bytes_or_entries == -1) {
+		/* bytes_or_entries = length of last item body of SOURCE */
+		bytes_or_entries = ih_item_len(ih);
+
+		RFALSE(le_ih_k_offset(dih) !=
+		       le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size),
+		       "vs-10050: items %h and %h do not match", ih, dih);
+
+		/* change first item key of the DEST */
+		set_le_ih_k_offset(dih, le_ih_k_offset(ih));
+
+		/* item becomes non-mergeable */
+		/* or mergeable if left item was */
+		set_le_ih_k_type(dih, le_ih_k_type(ih));
+	} else {
+		/* merge to right only part of item */
+		RFALSE(ih_item_len(ih) <= bytes_or_entries,
+		       "vs-10060: no so much bytes %lu (needed %lu)",
+		       (unsigned long)ih_item_len(ih),
+		       (unsigned long)bytes_or_entries);
+
+		/* change first item key of the DEST */
+		if (is_direct_le_ih(dih)) {
+			RFALSE(le_ih_k_offset(dih) <=
+			       (unsigned long)bytes_or_entries,
+			       "vs-10070: dih %h, bytes_or_entries(%d)", dih,
+			       bytes_or_entries);
+			set_le_ih_k_offset(dih,
+					   le_ih_k_offset(dih) -
+					   bytes_or_entries);
+		} else {
+			RFALSE(le_ih_k_offset(dih) <=
+			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
+			       "vs-10080: dih %h, bytes_or_entries(%d)",
+			       dih,
+			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size);
+			set_le_ih_k_offset(dih,
+					   le_ih_k_offset(dih) -
+					   ((bytes_or_entries / UNFM_P_SIZE) *
+					    dest->b_size));
+		}
+	}
+
+	leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
+			     ih_item_body(src,
+				       ih) + ih_item_len(ih) - bytes_or_entries,
+			     0);
+	return 1;
+}
+
+/*
+ * copy cpy_mun items from buffer src to buffer dest
+ * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning
+ *                             from first-th item in src to tail of dest
+ * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning
+ *                             from first-th item in src to head of dest
+ */
+static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
+				     struct buffer_head *src, int last_first,
+				     int first, int cpy_num)
+{
+	struct buffer_head *dest;
+	int nr, free_space;
+	int dest_before;
+	int last_loc, last_inserted_loc, location;
+	int i, j;
+	struct block_head *blkh;
+	struct item_head *ih;
+
+	RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
+	       "vs-10090: bad last_first parameter %d", last_first);
+	RFALSE(B_NR_ITEMS(src) - first < cpy_num,
+	       "vs-10100: too few items in source %d, required %d from %d",
+	       B_NR_ITEMS(src), cpy_num, first);
+	RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items");
+	RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items");
+
+	dest = dest_bi->bi_bh;
+
+	RFALSE(!dest, "vs-10130: can not copy negative amount of items");
+
+	if (cpy_num == 0)
+		return;
+
+	blkh = B_BLK_HEAD(dest);
+	nr = blkh_nr_item(blkh);
+	free_space = blkh_free_space(blkh);
+
+	/*
+	 * we will insert items before 0-th or nr-th item in dest buffer.
+	 * It depends of last_first parameter
+	 */
+	dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
+
+	/* location of head of first new item */
+	ih = item_head(dest, dest_before);
+
+	RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
+	       "vs-10140: not enough free space for headers %d (needed %d)",
+	       B_FREE_SPACE(dest), cpy_num * IH_SIZE);
+
+	/* prepare space for headers */
+	memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
+
+	/* copy item headers */
+	memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);
+
+	free_space -= (IH_SIZE * cpy_num);
+	set_blkh_free_space(blkh, free_space);
+
+	/* location of unmovable item */
+	j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1);
+	for (i = dest_before; i < nr + cpy_num; i++) {
+		location -= ih_item_len(ih + i - dest_before);
+		put_ih_location(ih + i - dest_before, location);
+	}
+
+	/* prepare space for items */
+	last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
+	last_inserted_loc = ih_location(&ih[cpy_num - 1]);
+
+	/* check free space */
+	RFALSE(free_space < j - last_inserted_loc,
+	       "vs-10150: not enough free space for items %d (needed %d)",
+	       free_space, j - last_inserted_loc);
+
+	memmove(dest->b_data + last_loc,
+		dest->b_data + last_loc + j - last_inserted_loc,
+		last_inserted_loc - last_loc);
+
+	/* copy items */
+	memcpy(dest->b_data + last_inserted_loc,
+	       item_body(src, (first + cpy_num - 1)),
+	       j - last_inserted_loc);
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, nr + cpy_num);
+	set_blkh_free_space(blkh, free_space - (j - last_inserted_loc));
+
+	do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0);
+
+	if (dest_bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
+		RFALSE(dc_block_number(t_dc) != dest->b_blocknr,
+		       "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
+		       (long unsigned)dest->b_blocknr,
+		       (long unsigned)dc_block_number(t_dc));
+		put_dc_size(t_dc,
+			    dc_size(t_dc) + (j - last_inserted_loc +
+					     IH_SIZE * cpy_num));
+
+		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
+					       0);
+	}
+}
+
+/*
+ * This function splits the (liquid) item into two items (useful when
+ * shifting part of an item into another node.)
+ */
+static void leaf_item_bottle(struct buffer_info *dest_bi,
+			     struct buffer_head *src, int last_first,
+			     int item_num, int cpy_bytes)
+{
+	struct buffer_head *dest = dest_bi->bi_bh;
+	struct item_head *ih;
+
+	RFALSE(cpy_bytes == -1,
+	       "vs-10170: bytes == - 1 means: do not split item");
+
+	if (last_first == FIRST_TO_LAST) {
+		/*
+		 * if ( if item in position item_num in buffer SOURCE
+		 * is directory item )
+		 */
+		ih = item_head(src, item_num);
+		if (is_direntry_le_ih(ih))
+			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
+					      item_num, 0, cpy_bytes);
+		else {
+			struct item_head n_ih;
+
+			/*
+			 * copy part of the body of the item number 'item_num'
+			 * of SOURCE to the end of the DEST part defined by
+			 * 'cpy_bytes'; create new item header; change old
+			 * item_header (????); n_ih = new item_header;
+			 */
+			memcpy(&n_ih, ih, IH_SIZE);
+			put_ih_item_len(&n_ih, cpy_bytes);
+			if (is_indirect_le_ih(ih)) {
+				RFALSE(cpy_bytes == ih_item_len(ih)
+				       && get_ih_free_space(ih),
+				       "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
+				       (long unsigned)get_ih_free_space(ih));
+				set_ih_free_space(&n_ih, 0);
+			}
+
+			RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size),
+			       "vs-10190: bad mergeability of item %h", ih);
+			n_ih.ih_version = ih->ih_version;	/* JDM Endian safe, both le */
+			leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
+					     item_body(src, item_num), 0);
+		}
+	} else {
+		/*
+		 * if ( if item in position item_num in buffer
+		 * SOURCE is directory item )
+		 */
+		ih = item_head(src, item_num);
+		if (is_direntry_le_ih(ih))
+			leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
+					      item_num,
+					      ih_entry_count(ih) - cpy_bytes,
+					      cpy_bytes);
+		else {
+			struct item_head n_ih;
+
+			/*
+			 * copy part of the body of the item number 'item_num'
+			 * of SOURCE to the begin of the DEST part defined by
+			 * 'cpy_bytes'; create new item header;
+			 * n_ih = new item_header;
+			 */
+			memcpy(&n_ih, ih, SHORT_KEY_SIZE);
+
+			/* Endian safe, both le */
+			n_ih.ih_version = ih->ih_version;
+
+			if (is_direct_le_ih(ih)) {
+				set_le_ih_k_offset(&n_ih,
+						   le_ih_k_offset(ih) +
+						   ih_item_len(ih) - cpy_bytes);
+				set_le_ih_k_type(&n_ih, TYPE_DIRECT);
+				set_ih_free_space(&n_ih, MAX_US_INT);
+			} else {
+				/* indirect item */
+				RFALSE(!cpy_bytes && get_ih_free_space(ih),
+				       "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
+				set_le_ih_k_offset(&n_ih,
+						   le_ih_k_offset(ih) +
+						   (ih_item_len(ih) -
+						    cpy_bytes) / UNFM_P_SIZE *
+						   dest->b_size);
+				set_le_ih_k_type(&n_ih, TYPE_INDIRECT);
+				set_ih_free_space(&n_ih, get_ih_free_space(ih));
+			}
+
+			/* set item length */
+			put_ih_item_len(&n_ih, cpy_bytes);
+
+			/* Endian safe, both le */
+			n_ih.ih_version = ih->ih_version;
+
+			leaf_insert_into_buf(dest_bi, 0, &n_ih,
+					     item_body(src, item_num) +
+						ih_item_len(ih) - cpy_bytes, 0);
+		}
+	}
+}
+
+/*
+ * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE
+ * to DEST.  If cpy_bytes not equal to minus one than copy cpy_num-1 whole
+ * items from SOURCE to DEST.  From last item copy cpy_num bytes for regular
+ * item and cpy_num directory entries for directory item.
+ */
+static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
+			   int last_first, int cpy_num, int cpy_bytes)
+{
+	struct buffer_head *dest;
+	int pos, i, src_nr_item, bytes;
+
+	dest = dest_bi->bi_bh;
+	RFALSE(!dest || !src, "vs-10210: !dest || !src");
+	RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
+	       "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
+	RFALSE(B_NR_ITEMS(src) < cpy_num,
+	       "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src),
+	       cpy_num);
+	RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num);
+
+	if (cpy_num == 0)
+		return 0;
+
+	if (last_first == FIRST_TO_LAST) {
+		/* copy items to left */
+		pos = 0;
+		if (cpy_num == 1)
+			bytes = cpy_bytes;
+		else
+			bytes = -1;
+
+		/*
+		 * copy the first item or it part or nothing to the end of
+		 * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes))
+		 */
+		i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
+		cpy_num -= i;
+		if (cpy_num == 0)
+			return i;
+		pos += i;
+		if (cpy_bytes == -1)
+			/*
+			 * copy first cpy_num items starting from position
+			 * 'pos' of SOURCE to end of DEST
+			 */
+			leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
+						 pos, cpy_num);
+		else {
+			/*
+			 * copy first cpy_num-1 items starting from position
+			 * 'pos-1' of the SOURCE to the end of the DEST
+			 */
+			leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
+						 pos, cpy_num - 1);
+
+			/*
+			 * copy part of the item which number is
+			 * cpy_num+pos-1 to the end of the DEST
+			 */
+			leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
+					 cpy_num + pos - 1, cpy_bytes);
+		}
+	} else {
+		/* copy items to right */
+		src_nr_item = B_NR_ITEMS(src);
+		if (cpy_num == 1)
+			bytes = cpy_bytes;
+		else
+			bytes = -1;
+
+		/*
+		 * copy the last item or it part or nothing to the
+		 * begin of the DEST
+		 * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes));
+		 */
+		i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
+
+		cpy_num -= i;
+		if (cpy_num == 0)
+			return i;
+
+		pos = src_nr_item - cpy_num - i;
+		if (cpy_bytes == -1) {
+			/*
+			 * starting from position 'pos' copy last cpy_num
+			 * items of SOURCE to begin of DEST
+			 */
+			leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
+						 pos, cpy_num);
+		} else {
+			/*
+			 * copy last cpy_num-1 items starting from position
+			 * 'pos+1' of the SOURCE to the begin of the DEST;
+			 */
+			leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
+						 pos + 1, cpy_num - 1);
+
+			/*
+			 * copy part of the item which number is pos to
+			 * the begin of the DEST
+			 */
+			leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
+					 cpy_bytes);
+		}
+	}
+	return i;
+}
+
+/*
+ * there are types of coping: from S[0] to L[0], from S[0] to R[0],
+ * from R[0] to L[0]. for each of these we have to define parent and
+ * positions of destination and source buffers
+ */
+static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
+				       struct buffer_info *dest_bi,
+				       struct buffer_info *src_bi,
+				       int *first_last,
+				       struct buffer_head *Snew)
+{
+	memset(dest_bi, 0, sizeof(struct buffer_info));
+	memset(src_bi, 0, sizeof(struct buffer_info));
+
+	/* define dest, src, dest parent, dest position */
+	switch (shift_mode) {
+	case LEAF_FROM_S_TO_L:	/* it is used in leaf_shift_left */
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+
+		/* src->b_item_order */
+		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[0];
+		dest_bi->bi_parent = tb->FL[0];
+		dest_bi->bi_position = get_left_neighbor_position(tb, 0);
+		*first_last = FIRST_TO_LAST;
+		break;
+
+	case LEAF_FROM_S_TO_R:	/* it is used in leaf_shift_right */
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[0];
+		dest_bi->bi_parent = tb->FR[0];
+		dest_bi->bi_position = get_right_neighbor_position(tb, 0);
+		*first_last = LAST_TO_FIRST;
+		break;
+
+	case LEAF_FROM_R_TO_L:	/* it is used in balance_leaf_when_delete */
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->R[0];
+		src_bi->bi_parent = tb->FR[0];
+		src_bi->bi_position = get_right_neighbor_position(tb, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[0];
+		dest_bi->bi_parent = tb->FL[0];
+		dest_bi->bi_position = get_left_neighbor_position(tb, 0);
+		*first_last = FIRST_TO_LAST;
+		break;
+
+	case LEAF_FROM_L_TO_R:	/* it is used in balance_leaf_when_delete */
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->L[0];
+		src_bi->bi_parent = tb->FL[0];
+		src_bi->bi_position = get_left_neighbor_position(tb, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[0];
+		dest_bi->bi_parent = tb->FR[0];
+		dest_bi->bi_position = get_right_neighbor_position(tb, 0);
+		*first_last = LAST_TO_FIRST;
+		break;
+
+	case LEAF_FROM_S_TO_SNEW:
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = Snew;
+		dest_bi->bi_parent = NULL;
+		dest_bi->bi_position = 0;
+		*first_last = LAST_TO_FIRST;
+		break;
+
+	default:
+		reiserfs_panic(sb_from_bi(src_bi), "vs-10250",
+			       "shift type is unknown (%d)", shift_mode);
+	}
+	RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
+	       "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
+	       shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
+}
+
+/*
+ * copy mov_num items and mov_bytes of the (mov_num-1)th item to
+ * neighbor. Delete them from source
+ */
+int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
+		    int mov_bytes, struct buffer_head *Snew)
+{
+	int ret_value;
+	struct buffer_info dest_bi, src_bi;
+	int first_last;
+
+	leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi,
+				   &first_last, Snew);
+
+	ret_value =
+	    leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num,
+			    mov_bytes);
+
+	leaf_delete_items(&src_bi, first_last,
+			  (first_last ==
+			   FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) -
+						 mov_num), mov_num, mov_bytes);
+
+	return ret_value;
+}
+
+/*
+ * Shift shift_num items (and shift_bytes of last shifted item if
+ * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key
+ */
+int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
+{
+	struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int i;
+
+	/*
+	 * move shift_num (and shift_bytes bytes) items from S[0]
+	 * to left neighbor L[0]
+	 */
+	i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
+
+	if (shift_num) {
+		/* number of items in S[0] == 0 */
+		if (B_NR_ITEMS(S0) == 0) {
+
+			RFALSE(shift_bytes != -1,
+			       "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
+			       shift_bytes);
+#ifdef CONFIG_REISERFS_CHECK
+			if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
+				print_cur_tb("vs-10275");
+				reiserfs_panic(tb->tb_sb, "vs-10275",
+					       "balance condition corrupted "
+					       "(%c)", tb->tb_mode);
+			}
+#endif
+
+			if (PATH_H_POSITION(tb->tb_path, 1) == 0)
+				replace_key(tb, tb->CFL[0], tb->lkey[0],
+					    PATH_H_PPARENT(tb->tb_path, 0), 0);
+
+		} else {
+			/* replace lkey in CFL[0] by 0-th key from S[0]; */
+			replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
+
+			RFALSE((shift_bytes != -1 &&
+				!(is_direntry_le_ih(item_head(S0, 0))
+				  && !ih_entry_count(item_head(S0, 0)))) &&
+			       (!op_is_left_mergeable
+				(leaf_key(S0, 0), S0->b_size)),
+			       "vs-10280: item must be mergeable");
+		}
+	}
+
+	return i;
+}
+
+/* CLEANING STOPPED HERE */
+
+/*
+ * Shift shift_num (shift_bytes) items from S[0] to the right neighbor,
+ * and replace the delimiting key
+ */
+int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
+{
+	int ret_value;
+
+	/*
+	 * move shift_num (and shift_bytes) items from S[0] to
+	 * right neighbor R[0]
+	 */
+	ret_value =
+	    leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
+
+	/* replace rkey in CFR[0] by the 0-th key from R[0] */
+	if (shift_num) {
+		replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+	}
+
+	return ret_value;
+}
+
+static void leaf_delete_items_entirely(struct buffer_info *bi,
+				       int first, int del_num);
+/*
+ * If del_bytes == -1, starting from position 'first' delete del_num
+ * items in whole in buffer CUR.
+ *   If not.
+ *   If last_first == 0. Starting from position 'first' delete del_num-1
+ *   items in whole. Delete part of body of the first item. Part defined by
+ *   del_bytes. Don't delete first item header
+ *   If last_first == 1. Starting from position 'first+1' delete del_num-1
+ *   items in whole. Delete part of body of the last item . Part defined by
+ *   del_bytes. Don't delete last item header.
+*/
+void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
+		       int first, int del_num, int del_bytes)
+{
+	struct buffer_head *bh;
+	int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh);
+
+	RFALSE(!bh, "10155: bh is not defined");
+	RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d",
+	       del_num);
+	RFALSE(first < 0
+	       || first + del_num > item_amount,
+	       "10165: invalid number of first item to be deleted (%d) or "
+	       "no so much items (%d) to delete (only %d)", first,
+	       first + del_num, item_amount);
+
+	if (del_num == 0)
+		return;
+
+	if (first == 0 && del_num == item_amount && del_bytes == -1) {
+		make_empty_node(cur_bi);
+		do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0);
+		return;
+	}
+
+	if (del_bytes == -1)
+		/* delete del_num items beginning from item in position first */
+		leaf_delete_items_entirely(cur_bi, first, del_num);
+	else {
+		if (last_first == FIRST_TO_LAST) {
+			/*
+			 * delete del_num-1 items beginning from
+			 * item in position first
+			 */
+			leaf_delete_items_entirely(cur_bi, first, del_num - 1);
+
+			/*
+			 * delete the part of the first item of the bh
+			 * do not delete item header
+			 */
+			leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
+		} else {
+			struct item_head *ih;
+			int len;
+
+			/*
+			 * delete del_num-1 items beginning from
+			 * item in position first+1
+			 */
+			leaf_delete_items_entirely(cur_bi, first + 1,
+						   del_num - 1);
+
+			ih = item_head(bh, B_NR_ITEMS(bh) - 1);
+			if (is_direntry_le_ih(ih))
+				/* the last item is directory  */
+				/*
+				 * len = numbers of directory entries
+				 * in this item
+				 */
+				len = ih_entry_count(ih);
+			else
+				/* len = body len of item */
+				len = ih_item_len(ih);
+
+			/*
+			 * delete the part of the last item of the bh
+			 * do not delete item header
+			 */
+			leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
+					     len - del_bytes, del_bytes);
+		}
+	}
+}
+
+/* insert item into the leaf node in position before */
+void leaf_insert_into_buf(struct buffer_info *bi, int before,
+			  struct item_head * const inserted_item_ih,
+			  const char * const inserted_item_body,
+			  int zeros_number)
+{
+	struct buffer_head *bh = bi->bi_bh;
+	int nr, free_space;
+	struct block_head *blkh;
+	struct item_head *ih;
+	int i;
+	int last_loc, unmoved_loc;
+	char *to;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+	free_space = blkh_free_space(blkh);
+
+	/* check free space */
+	RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
+	       "vs-10170: not enough free space in block %z, new item %h",
+	       bh, inserted_item_ih);
+	RFALSE(zeros_number > ih_item_len(inserted_item_ih),
+	       "vs-10172: zero number == %d, item length == %d",
+	       zeros_number, ih_item_len(inserted_item_ih));
+
+	/* get item new item must be inserted before */
+	ih = item_head(bh, before);
+
+	/* prepare space for the body of new item */
+	last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size;
+	unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
+
+	memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
+		bh->b_data + last_loc, unmoved_loc - last_loc);
+
+	to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
+	memset(to, 0, zeros_number);
+	to += zeros_number;
+
+	/* copy body to prepared space */
+	if (inserted_item_body)
+		memmove(to, inserted_item_body,
+			ih_item_len(inserted_item_ih) - zeros_number);
+	else
+		memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
+
+	/* insert item header */
+	memmove(ih + 1, ih, IH_SIZE * (nr - before));
+	memmove(ih, inserted_item_ih, IH_SIZE);
+
+	/* change locations */
+	for (i = before; i < nr + 1; i++) {
+		unmoved_loc -= ih_item_len(&ih[i - before]);
+		put_ih_location(&ih[i - before], unmoved_loc);
+	}
+
+	/* sizes, free space, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
+	set_blkh_free_space(blkh,
+			    free_space - (IH_SIZE +
+					  ih_item_len(inserted_item_ih)));
+	do_balance_mark_leaf_dirty(bi->tb, bh, 1);
+
+	if (bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) + (IH_SIZE +
+					     ih_item_len(inserted_item_ih)));
+		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
+	}
+}
+
+/*
+ * paste paste_size bytes to affected_item_num-th item.
+ * When item is a directory, this only prepare space for new entries
+ */
+void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
+			  int pos_in_item, int paste_size,
+			  const char *body, int zeros_number)
+{
+	struct buffer_head *bh = bi->bi_bh;
+	int nr, free_space;
+	struct block_head *blkh;
+	struct item_head *ih;
+	int i;
+	int last_loc, unmoved_loc;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+	free_space = blkh_free_space(blkh);
+
+	/* check free space */
+	RFALSE(free_space < paste_size,
+	       "vs-10175: not enough free space: needed %d, available %d",
+	       paste_size, free_space);
+
+#ifdef CONFIG_REISERFS_CHECK
+	if (zeros_number > paste_size) {
+		struct super_block *sb = NULL;
+		if (bi && bi->tb)
+			sb = bi->tb->tb_sb;
+		print_cur_tb("10177");
+		reiserfs_panic(sb, "vs-10177",
+			       "zeros_number == %d, paste_size == %d",
+			       zeros_number, paste_size);
+	}
+#endif				/* CONFIG_REISERFS_CHECK */
+
+	/* item to be appended */
+	ih = item_head(bh, affected_item_num);
+
+	last_loc = ih_location(&ih[nr - affected_item_num - 1]);
+	unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
+
+	/* prepare space */
+	memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
+		unmoved_loc - last_loc);
+
+	/* change locations */
+	for (i = affected_item_num; i < nr; i++)
+		put_ih_location(&ih[i - affected_item_num],
+				ih_location(&ih[i - affected_item_num]) -
+				paste_size);
+
+	if (body) {
+		if (!is_direntry_le_ih(ih)) {
+			if (!pos_in_item) {
+				/* shift data to right */
+				memmove(bh->b_data + ih_location(ih) +
+					paste_size,
+					bh->b_data + ih_location(ih),
+					ih_item_len(ih));
+				/* paste data in the head of item */
+				memset(bh->b_data + ih_location(ih), 0,
+				       zeros_number);
+				memcpy(bh->b_data + ih_location(ih) +
+				       zeros_number, body,
+				       paste_size - zeros_number);
+			} else {
+				memset(bh->b_data + unmoved_loc - paste_size, 0,
+				       zeros_number);
+				memcpy(bh->b_data + unmoved_loc - paste_size +
+				       zeros_number, body,
+				       paste_size - zeros_number);
+			}
+		}
+	} else
+		memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
+
+	put_ih_item_len(ih, ih_item_len(ih) + paste_size);
+
+	/* change free space */
+	set_blkh_free_space(blkh, free_space - paste_size);
+
+	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
+
+	if (bi->bi_parent) {
+		struct disk_child *t_dc =
+		    B_N_CHILD(bi->bi_parent, bi->bi_position);
+		put_dc_size(t_dc, dc_size(t_dc) + paste_size);
+		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
+	}
+}
+
+/*
+ * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
+ * does not have free space, so it moves DEHs and remaining records as
+ * necessary. Return value is size of removed part of directory item
+ * in bytes.
+ */
+static int leaf_cut_entries(struct buffer_head *bh,
+			    struct item_head *ih, int from, int del_count)
+{
+	char *item;
+	struct reiserfs_de_head *deh;
+	int prev_record_offset;	/* offset of record, that is (from-1)th */
+	char *prev_record;	/* */
+	int cut_records_len;	/* length of all removed records */
+	int i;
+
+	/*
+	 * make sure that item is directory and there are enough entries to
+	 * remove
+	 */
+	RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
+	RFALSE(ih_entry_count(ih) < from + del_count,
+	       "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
+	       ih_entry_count(ih), from, del_count);
+
+	if (del_count == 0)
+		return 0;
+
+	/* first byte of item */
+	item = bh->b_data + ih_location(ih);
+
+	/* entry head array */
+	deh = B_I_DEH(bh, ih);
+
+	/*
+	 * first byte of remaining entries, those are BEFORE cut entries
+	 * (prev_record) and length of all removed records (cut_records_len)
+	 */
+	prev_record_offset =
+	    (from ? deh_location(&deh[from - 1]) : ih_item_len(ih));
+	cut_records_len = prev_record_offset /*from_record */  -
+	    deh_location(&deh[from + del_count - 1]);
+	prev_record = item + prev_record_offset;
+
+	/* adjust locations of remaining entries */
+	for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--)
+		put_deh_location(&deh[i],
+				 deh_location(&deh[i]) -
+				 (DEH_SIZE * del_count));
+
+	for (i = 0; i < from; i++)
+		put_deh_location(&deh[i],
+				 deh_location(&deh[i]) - (DEH_SIZE * del_count +
+							  cut_records_len));
+
+	put_ih_entry_count(ih, ih_entry_count(ih) - del_count);
+
+	/* shift entry head array and entries those are AFTER removed entries */
+	memmove((char *)(deh + from),
+		deh + from + del_count,
+		prev_record - cut_records_len - (char *)(deh + from +
+							 del_count));
+
+	/* shift records, those are BEFORE removed entries */
+	memmove(prev_record - cut_records_len - DEH_SIZE * del_count,
+		prev_record, item + ih_item_len(ih) - prev_record);
+
+	return DEH_SIZE * del_count + cut_records_len;
+}
+
+/*
+ * when cut item is part of regular file
+ *      pos_in_item - first byte that must be cut
+ *      cut_size - number of bytes to be cut beginning from pos_in_item
+ *
+ * when cut item is part of directory
+ *      pos_in_item - number of first deleted entry
+ *      cut_size - count of deleted entries
+ */
+void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
+			  int pos_in_item, int cut_size)
+{
+	int nr;
+	struct buffer_head *bh = bi->bi_bh;
+	struct block_head *blkh;
+	struct item_head *ih;
+	int last_loc, unmoved_loc;
+	int i;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+
+	/* item head of truncated item */
+	ih = item_head(bh, cut_item_num);
+
+	if (is_direntry_le_ih(ih)) {
+		/* first cut entry () */
+		cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size);
+		if (pos_in_item == 0) {
+			/* change key */
+			RFALSE(cut_item_num,
+			       "when 0-th enrty of item is cut, that item must be first in the node, not %d-th",
+			       cut_item_num);
+			/* change item key by key of first entry in the item */
+			set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
+		}
+	} else {
+		/* item is direct or indirect */
+		RFALSE(is_statdata_le_ih(ih), "10195: item is stat data");
+		RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
+		       "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
+		       (long unsigned)pos_in_item, (long unsigned)cut_size,
+		       (long unsigned)ih_item_len(ih));
+
+		/* shift item body to left if cut is from the head of item */
+		if (pos_in_item == 0) {
+			memmove(bh->b_data + ih_location(ih),
+				bh->b_data + ih_location(ih) + cut_size,
+				ih_item_len(ih) - cut_size);
+
+			/* change key of item */
+			if (is_direct_le_ih(ih))
+				set_le_ih_k_offset(ih,
+						   le_ih_k_offset(ih) +
+						   cut_size);
+			else {
+				set_le_ih_k_offset(ih,
+						   le_ih_k_offset(ih) +
+						   (cut_size / UNFM_P_SIZE) *
+						   bh->b_size);
+				RFALSE(ih_item_len(ih) == cut_size
+				       && get_ih_free_space(ih),
+				       "10205: invalid ih_free_space (%h)", ih);
+			}
+		}
+	}
+
+	/* location of the last item */
+	last_loc = ih_location(&ih[nr - cut_item_num - 1]);
+
+	/* location of the item, which is remaining at the same place */
+	unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
+
+	/* shift */
+	memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
+		unmoved_loc - last_loc - cut_size);
+
+	/* change item length */
+	put_ih_item_len(ih, ih_item_len(ih) - cut_size);
+
+	if (is_indirect_le_ih(ih)) {
+		if (pos_in_item)
+			set_ih_free_space(ih, 0);
+	}
+
+	/* change locations */
+	for (i = cut_item_num; i < nr; i++)
+		put_ih_location(&ih[i - cut_item_num],
+				ih_location(&ih[i - cut_item_num]) + cut_size);
+
+	/* size, free space */
+	set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size);
+
+	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
+
+	if (bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
+		put_dc_size(t_dc, dc_size(t_dc) - cut_size);
+		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
+	}
+}
+
+/* delete del_num items from buffer starting from the first'th item */
+static void leaf_delete_items_entirely(struct buffer_info *bi,
+				       int first, int del_num)
+{
+	struct buffer_head *bh = bi->bi_bh;
+	int nr;
+	int i, j;
+	int last_loc, last_removed_loc;
+	struct block_head *blkh;
+	struct item_head *ih;
+
+	RFALSE(bh == NULL, "10210: buffer is 0");
+	RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num);
+
+	if (del_num == 0)
+		return;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+
+	RFALSE(first < 0 || first + del_num > nr,
+	       "10220: first=%d, number=%d, there is %d items", first, del_num,
+	       nr);
+
+	if (first == 0 && del_num == nr) {
+		/* this does not work */
+		make_empty_node(bi);
+
+		do_balance_mark_leaf_dirty(bi->tb, bh, 0);
+		return;
+	}
+
+	ih = item_head(bh, first);
+
+	/* location of unmovable item */
+	j = (first == 0) ? bh->b_size : ih_location(ih - 1);
+
+	/* delete items */
+	last_loc = ih_location(&ih[nr - 1 - first]);
+	last_removed_loc = ih_location(&ih[del_num - 1]);
+
+	memmove(bh->b_data + last_loc + j - last_removed_loc,
+		bh->b_data + last_loc, last_removed_loc - last_loc);
+
+	/* delete item headers */
+	memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
+
+	/* change item location */
+	for (i = first; i < nr - del_num; i++)
+		put_ih_location(&ih[i - first],
+				ih_location(&ih[i - first]) + (j -
+								 last_removed_loc));
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
+	set_blkh_free_space(blkh,
+			    blkh_free_space(blkh) + (j - last_removed_loc +
+						     IH_SIZE * del_num));
+
+	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
+
+	if (bi->bi_parent) {
+		struct disk_child *t_dc =
+		    B_N_CHILD(bi->bi_parent, bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) - (j - last_removed_loc +
+					     IH_SIZE * del_num));
+		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
+	}
+}
+
+/*
+ * paste new_entry_count entries (new_dehs, records) into position
+ * before to item_num-th item
+ */
+void leaf_paste_entries(struct buffer_info *bi,
+			int item_num,
+			int before,
+			int new_entry_count,
+			struct reiserfs_de_head *new_dehs,
+			const char *records, int paste_size)
+{
+	struct item_head *ih;
+	char *item;
+	struct reiserfs_de_head *deh;
+	char *insert_point;
+	int i, old_entry_num;
+	struct buffer_head *bh = bi->bi_bh;
+
+	if (new_entry_count == 0)
+		return;
+
+	ih = item_head(bh, item_num);
+
+	/*
+	 * make sure, that item is directory, and there are enough
+	 * records in it
+	 */
+	RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
+	RFALSE(ih_entry_count(ih) < before,
+	       "10230: there are no entry we paste entries before. entry_count = %d, before = %d",
+	       ih_entry_count(ih), before);
+
+	/* first byte of dest item */
+	item = bh->b_data + ih_location(ih);
+
+	/* entry head array */
+	deh = B_I_DEH(bh, ih);
+
+	/* new records will be pasted at this point */
+	insert_point =
+	    item +
+	    (before ? deh_location(&deh[before - 1])
+	     : (ih_item_len(ih) - paste_size));
+
+	/* adjust locations of records that will be AFTER new records */
+	for (i = ih_entry_count(ih) - 1; i >= before; i--)
+		put_deh_location(&deh[i],
+				 deh_location(&deh[i]) +
+				 (DEH_SIZE * new_entry_count));
+
+	/* adjust locations of records that will be BEFORE new records */
+	for (i = 0; i < before; i++)
+		put_deh_location(&deh[i],
+				 deh_location(&deh[i]) + paste_size);
+
+	old_entry_num = ih_entry_count(ih);
+	put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
+
+	/* prepare space for pasted records */
+	memmove(insert_point + paste_size, insert_point,
+		item + (ih_item_len(ih) - paste_size) - insert_point);
+
+	/* copy new records */
+	memcpy(insert_point + DEH_SIZE * new_entry_count, records,
+	       paste_size - DEH_SIZE * new_entry_count);
+
+	/* prepare space for new entry heads */
+	deh += before;
+	memmove((char *)(deh + new_entry_count), deh,
+		insert_point - (char *)deh);
+
+	/* copy new entry heads */
+	deh = (struct reiserfs_de_head *)((char *)deh);
+	memcpy(deh, new_dehs, DEH_SIZE * new_entry_count);
+
+	/* set locations of new records */
+	for (i = 0; i < new_entry_count; i++) {
+		put_deh_location(&deh[i],
+				 deh_location(&deh[i]) +
+				 (-deh_location
+				  (&new_dehs[new_entry_count - 1]) +
+				  insert_point + DEH_SIZE * new_entry_count -
+				  item));
+	}
+
+	/* change item key if necessary (when we paste before 0-th entry */
+	if (!before) {
+		set_le_ih_k_offset(ih, deh_offset(new_dehs));
+	}
+#ifdef CONFIG_REISERFS_CHECK
+	{
+		int prev, next;
+		/* check record locations */
+		deh = B_I_DEH(bh, ih);
+		for (i = 0; i < ih_entry_count(ih); i++) {
+			next =
+			    (i <
+			     ih_entry_count(ih) -
+			     1) ? deh_location(&deh[i + 1]) : 0;
+			prev = (i != 0) ? deh_location(&deh[i - 1]) : 0;
+
+			if (prev && prev <= deh_location(&deh[i]))
+				reiserfs_error(sb_from_bi(bi), "vs-10240",
+					       "directory item (%h) "
+					       "corrupted (prev %a, "
+					       "cur(%d) %a)",
+					       ih, deh + i - 1, i, deh + i);
+			if (next && next >= deh_location(&deh[i]))
+				reiserfs_error(sb_from_bi(bi), "vs-10250",
+					       "directory item (%h) "
+					       "corrupted (cur(%d) %a, "
+					       "next %a)",
+					       ih, i, deh + i, deh + i + 1);
+		}
+	}
+#endif
+
+}
diff --git a/kernel/fs/reiserfs/lock.c b/kernel/fs/reiserfs/lock.c
new file mode 100644
index 000000000..045b83ef9
--- /dev/null
+++ b/kernel/fs/reiserfs/lock.c
@@ -0,0 +1,100 @@
+#include "reiserfs.h"
+#include <linux/mutex.h>
+
+/*
+ * The previous reiserfs locking scheme was heavily based on
+ * the tricky properties of the Bkl:
+ *
+ * - it was acquired recursively by a same task
+ * - the performances relied on the release-while-schedule() property
+ *
+ * Now that we replace it by a mutex, we still want to keep the same
+ * recursive property to avoid big changes in the code structure.
+ * We use our own lock_owner here because the owner field on a mutex
+ * is only available in SMP or mutex debugging, also we only need this field
+ * for this mutex, no need for a system wide mutex facility.
+ *
+ * Also this lock is often released before a call that could block because
+ * reiserfs performances were partially based on the release while schedule()
+ * property of the Bkl.
+ */
+void reiserfs_write_lock(struct super_block *s)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+	if (sb_i->lock_owner != current) {
+		mutex_lock(&sb_i->lock);
+		sb_i->lock_owner = current;
+	}
+
+	/* No need to protect it, only the current task touches it */
+	sb_i->lock_depth++;
+}
+
+void reiserfs_write_unlock(struct super_block *s)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+	/*
+	 * Are we unlocking without even holding the lock?
+	 * Such a situation must raise a BUG() if we don't want
+	 * to corrupt the data.
+	 */
+	BUG_ON(sb_i->lock_owner != current);
+
+	if (--sb_i->lock_depth == -1) {
+		sb_i->lock_owner = NULL;
+		mutex_unlock(&sb_i->lock);
+	}
+}
+
+int __must_check reiserfs_write_unlock_nested(struct super_block *s)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+	int depth;
+
+	/* this can happen when the lock isn't always held */
+	if (sb_i->lock_owner != current)
+		return -1;
+
+	depth = sb_i->lock_depth;
+
+	sb_i->lock_depth = -1;
+	sb_i->lock_owner = NULL;
+	mutex_unlock(&sb_i->lock);
+
+	return depth;
+}
+
+void reiserfs_write_lock_nested(struct super_block *s, int depth)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+	/* this can happen when the lock isn't always held */
+	if (depth == -1)
+		return;
+
+	mutex_lock(&sb_i->lock);
+	sb_i->lock_owner = current;
+	sb_i->lock_depth = depth;
+}
+
+/*
+ * Utility function to force a BUG if it is called without the superblock
+ * write lock held.  caller is the string printed just before calling BUG()
+ */
+void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
+
+	WARN_ON(sb_i->lock_depth < 0);
+}
+
+#ifdef CONFIG_REISERFS_CHECK
+void reiserfs_lock_check_recursive(struct super_block *sb)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
+
+	WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
+}
+#endif
diff --git a/kernel/fs/reiserfs/namei.c b/kernel/fs/reiserfs/namei.c
new file mode 100644
index 000000000..b55a07465
--- /dev/null
+++ b/kernel/fs/reiserfs/namei.c
@@ -0,0 +1,1659 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ *
+ * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility
+ *
+ * Trivial Changes:
+ * Rights granted to Hans Reiser to redistribute under other terms providing
+ * he accepts all liability including but not limited to patent, fitness
+ * for purpose, and direct or indirect claims arising from failure to perform.
+ *
+ * NO WARRANTY
+ */
+
+#include <linux/time.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include "reiserfs.h"
+#include "acl.h"
+#include "xattr.h"
+#include <linux/quotaops.h>
+
+#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
+#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
+
+/*
+ * directory item contains array of entry headers. This performs
+ * binary search through that array
+ */
+static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
+{
+	struct item_head *ih = de->de_ih;
+	struct reiserfs_de_head *deh = de->de_deh;
+	int rbound, lbound, j;
+
+	lbound = 0;
+	rbound = ih_entry_count(ih) - 1;
+
+	for (j = (rbound + lbound) / 2; lbound <= rbound;
+	     j = (rbound + lbound) / 2) {
+		if (off < deh_offset(deh + j)) {
+			rbound = j - 1;
+			continue;
+		}
+		if (off > deh_offset(deh + j)) {
+			lbound = j + 1;
+			continue;
+		}
+		/* this is not name found, but matched third key component */
+		de->de_entry_num = j;
+		return NAME_FOUND;
+	}
+
+	de->de_entry_num = lbound;
+	return NAME_NOT_FOUND;
+}
+
+/*
+ * comment?  maybe something like set de to point to what the path points to?
+ */
+static inline void set_de_item_location(struct reiserfs_dir_entry *de,
+					struct treepath *path)
+{
+	de->de_bh = get_last_bh(path);
+	de->de_ih = tp_item_head(path);
+	de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
+	de->de_item_num = PATH_LAST_POSITION(path);
+}
+
+/*
+ * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
+ */
+inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
+{
+	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
+
+	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
+
+	de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
+	de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
+	de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh);
+	if (de->de_name[de->de_namelen - 1] == 0)
+		de->de_namelen = strlen(de->de_name);
+}
+
+/* what entry points to */
+static inline void set_de_object_key(struct reiserfs_dir_entry *de)
+{
+	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
+	de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]);
+	de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]);
+}
+
+static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
+{
+	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
+
+	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
+
+	/* store key of the found entry */
+	de->de_entry_key.version = KEY_FORMAT_3_5;
+	de->de_entry_key.on_disk_key.k_dir_id =
+	    le32_to_cpu(de->de_ih->ih_key.k_dir_id);
+	de->de_entry_key.on_disk_key.k_objectid =
+	    le32_to_cpu(de->de_ih->ih_key.k_objectid);
+	set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh));
+	set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY);
+}
+
+/*
+ * We assign a key to each directory item, and place multiple entries in a
+ * single directory item.  A directory item has a key equal to the key of
+ * the first directory entry in it.
+
+ * This function first calls search_by_key, then, if item whose first entry
+ * matches is not found it looks for the entry inside directory item found
+ * by search_by_key. Fills the path to the entry, and to the entry position
+ * in the item
+ */
+/* The function is NOT SCHEDULE-SAFE! */
+int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
+			struct treepath *path, struct reiserfs_dir_entry *de)
+{
+	int retval;
+
+	retval = search_item(sb, key, path);
+	switch (retval) {
+	case ITEM_NOT_FOUND:
+		if (!PATH_LAST_POSITION(path)) {
+			reiserfs_error(sb, "vs-7000", "search_by_key "
+				       "returned item position == 0");
+			pathrelse(path);
+			return IO_ERROR;
+		}
+		PATH_LAST_POSITION(path)--;
+
+	case ITEM_FOUND:
+		break;
+
+	case IO_ERROR:
+		return retval;
+
+	default:
+		pathrelse(path);
+		reiserfs_error(sb, "vs-7002", "no path to here");
+		return IO_ERROR;
+	}
+
+	set_de_item_location(de, path);
+
+#ifdef CONFIG_REISERFS_CHECK
+	if (!is_direntry_le_ih(de->de_ih) ||
+	    COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
+		print_block(de->de_bh, 0, -1, -1);
+		reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
+			       "item or does not belong to the same directory "
+			       "as key %K", de->de_ih, key);
+	}
+#endif				/* CONFIG_REISERFS_CHECK */
+
+	/*
+	 * binary search in directory item by third component of the
+	 * key. sets de->de_entry_num of de
+	 */
+	retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
+	path->pos_in_item = de->de_entry_num;
+	if (retval != NAME_NOT_FOUND) {
+		/*
+		 * ugly, but rename needs de_bh, de_deh, de_name,
+		 * de_namelen, de_objectid set
+		 */
+		set_de_name_and_namelen(de);
+		set_de_object_key(de);
+	}
+	return retval;
+}
+
+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
+
+/*
+ * The third component is hashed, and you can choose from more than
+ * one hash function.  Per directory hashes are not yet implemented
+ * but are thought about. This function should be moved to hashes.c
+ * Jedi, please do so.  -Hans
+ */
+static __u32 get_third_component(struct super_block *s,
+				 const char *name, int len)
+{
+	__u32 res;
+
+	if (!len || (len == 1 && name[0] == '.'))
+		return DOT_OFFSET;
+	if (len == 2 && name[0] == '.' && name[1] == '.')
+		return DOT_DOT_OFFSET;
+
+	res = REISERFS_SB(s)->s_hash_function(name, len);
+
+	/* take bits from 7-th to 30-th including both bounds */
+	res = GET_HASH_VALUE(res);
+	if (res == 0)
+		/*
+		 * needed to have no names before "." and ".." those have hash
+		 * value == 0 and generation conters 1 and 2 accordingly
+		 */
+		res = 128;
+	return res + MAX_GENERATION_NUMBER;
+}
+
+static int reiserfs_match(struct reiserfs_dir_entry *de,
+			  const char *name, int namelen)
+{
+	int retval = NAME_NOT_FOUND;
+
+	if ((namelen == de->de_namelen) &&
+	    !memcmp(de->de_name, name, de->de_namelen))
+		retval =
+		    (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND :
+		     NAME_FOUND_INVISIBLE);
+
+	return retval;
+}
+
+/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
+
+/* used when hash collisions exist */
+
+static int linear_search_in_dir_item(struct cpu_key *key,
+				     struct reiserfs_dir_entry *de,
+				     const char *name, int namelen)
+{
+	struct reiserfs_de_head *deh = de->de_deh;
+	int retval;
+	int i;
+
+	i = de->de_entry_num;
+
+	if (i == ih_entry_count(de->de_ih) ||
+	    GET_HASH_VALUE(deh_offset(deh + i)) !=
+	    GET_HASH_VALUE(cpu_key_k_offset(key))) {
+		i--;
+	}
+
+	RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih),
+	       "vs-7010: array of entry headers not found");
+
+	deh += i;
+
+	for (; i >= 0; i--, deh--) {
+		/* hash value does not match, no need to check whole name */
+		if (GET_HASH_VALUE(deh_offset(deh)) !=
+		    GET_HASH_VALUE(cpu_key_k_offset(key))) {
+			return NAME_NOT_FOUND;
+		}
+
+		/* mark that this generation number is used */
+		if (de->de_gen_number_bit_string)
+			set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
+				de->de_gen_number_bit_string);
+
+		/* calculate pointer to name and namelen */
+		de->de_entry_num = i;
+		set_de_name_and_namelen(de);
+
+		/*
+		 * de's de_name, de_namelen, de_recordlen are set.
+		 * Fill the rest.
+		 */
+		if ((retval =
+		     reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
+
+			/* key of pointed object */
+			set_de_object_key(de);
+
+			store_de_entry_key(de);
+
+			/* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */
+			return retval;
+		}
+	}
+
+	if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
+		/*
+		 * we have reached left most entry in the node. In common we
+		 * have to go to the left neighbor, but if generation counter
+		 * is 0 already, we know for sure, that there is no name with
+		 * the same hash value
+		 */
+		/*
+		 * FIXME: this work correctly only because hash value can not
+		 *  be 0. Btw, in case of Yura's hash it is probably possible,
+		 * so, this is a bug
+		 */
+		return NAME_NOT_FOUND;
+
+	RFALSE(de->de_item_num,
+	       "vs-7015: two diritems of the same directory in one node?");
+
+	return GOTO_PREVIOUS_ITEM;
+}
+
+/*
+ * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
+ * FIXME: should add something like IOERROR
+ */
+static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
+			       struct treepath *path_to_entry,
+			       struct reiserfs_dir_entry *de)
+{
+	struct cpu_key key_to_search;
+	int retval;
+
+	if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
+		return NAME_NOT_FOUND;
+
+	/* we will search for this key in the tree */
+	make_cpu_key(&key_to_search, dir,
+		     get_third_component(dir->i_sb, name, namelen),
+		     TYPE_DIRENTRY, 3);
+
+	while (1) {
+		retval =
+		    search_by_entry_key(dir->i_sb, &key_to_search,
+					path_to_entry, de);
+		if (retval == IO_ERROR) {
+			reiserfs_error(dir->i_sb, "zam-7001", "io error");
+			return IO_ERROR;
+		}
+
+		/* compare names for all entries having given hash value */
+		retval =
+		    linear_search_in_dir_item(&key_to_search, de, name,
+					      namelen);
+		/*
+		 * there is no need to scan directory anymore.
+		 * Given entry found or does not exist
+		 */
+		if (retval != GOTO_PREVIOUS_ITEM) {
+			path_to_entry->pos_in_item = de->de_entry_num;
+			return retval;
+		}
+
+		/*
+		 * there is left neighboring item of this directory
+		 * and given entry can be there
+		 */
+		set_cpu_key_k_offset(&key_to_search,
+				     le_ih_k_offset(de->de_ih) - 1);
+		pathrelse(path_to_entry);
+
+	}			/* while (1) */
+}
+
+static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
+				      unsigned int flags)
+{
+	int retval;
+	struct inode *inode = NULL;
+	struct reiserfs_dir_entry de;
+	INITIALIZE_PATH(path_to_entry);
+
+	if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	reiserfs_write_lock(dir->i_sb);
+
+	de.de_gen_number_bit_string = NULL;
+	retval =
+	    reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
+				&path_to_entry, &de);
+	pathrelse(&path_to_entry);
+	if (retval == NAME_FOUND) {
+		inode = reiserfs_iget(dir->i_sb,
+				      (struct cpu_key *)&de.de_dir_id);
+		if (!inode || IS_ERR(inode)) {
+			reiserfs_write_unlock(dir->i_sb);
+			return ERR_PTR(-EACCES);
+		}
+
+		/*
+		 * Propagate the private flag so we know we're
+		 * in the priv tree
+		 */
+		if (IS_PRIVATE(dir))
+			inode->i_flags |= S_PRIVATE;
+	}
+	reiserfs_write_unlock(dir->i_sb);
+	if (retval == IO_ERROR) {
+		return ERR_PTR(-EIO);
+	}
+
+	return d_splice_alias(inode, dentry);
+}
+
+/*
+ * looks up the dentry of the parent directory for child.
+ * taken from ext2_get_parent
+ */
+struct dentry *reiserfs_get_parent(struct dentry *child)
+{
+	int retval;
+	struct inode *inode = NULL;
+	struct reiserfs_dir_entry de;
+	INITIALIZE_PATH(path_to_entry);
+	struct inode *dir = d_inode(child);
+
+	if (dir->i_nlink == 0) {
+		return ERR_PTR(-ENOENT);
+	}
+	de.de_gen_number_bit_string = NULL;
+
+	reiserfs_write_lock(dir->i_sb);
+	retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de);
+	pathrelse(&path_to_entry);
+	if (retval != NAME_FOUND) {
+		reiserfs_write_unlock(dir->i_sb);
+		return ERR_PTR(-ENOENT);
+	}
+	inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id);
+	reiserfs_write_unlock(dir->i_sb);
+
+	return d_obtain_alias(inode);
+}
+
+/* add entry to the directory (entry can be hidden).
+
+insert definition of when hidden directories are used here -Hans
+
+ Does not mark dir   inode dirty, do it after successesfull call to it */
+
+static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
+			      struct inode *dir, const char *name, int namelen,
+			      struct inode *inode, int visible)
+{
+	struct cpu_key entry_key;
+	struct reiserfs_de_head *deh;
+	INITIALIZE_PATH(path);
+	struct reiserfs_dir_entry de;
+	DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
+	int gen_number;
+
+	/*
+	 * 48 bytes now and we avoid kmalloc if we
+	 * create file with short name
+	 */
+	char small_buf[32 + DEH_SIZE];
+
+	char *buffer;
+	int buflen, paste_size;
+	int retval;
+
+	BUG_ON(!th->t_trans_id);
+
+	/* cannot allow items to be added into a busy deleted directory */
+	if (!namelen)
+		return -EINVAL;
+
+	if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
+		return -ENAMETOOLONG;
+
+	/* each entry has unique key. compose it */
+	make_cpu_key(&entry_key, dir,
+		     get_third_component(dir->i_sb, name, namelen),
+		     TYPE_DIRENTRY, 3);
+
+	/* get memory for composing the entry */
+	buflen = DEH_SIZE + ROUND_UP(namelen);
+	if (buflen > sizeof(small_buf)) {
+		buffer = kmalloc(buflen, GFP_NOFS);
+		if (!buffer)
+			return -ENOMEM;
+	} else
+		buffer = small_buf;
+
+	paste_size =
+	    (get_inode_sd_version(dir) ==
+	     STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
+
+	/*
+	 * fill buffer : directory entry head, name[, dir objectid | ,
+	 * stat data | ,stat data, dir objectid ]
+	 */
+	deh = (struct reiserfs_de_head *)buffer;
+	deh->deh_location = 0;	/* JDM Endian safe if 0 */
+	put_deh_offset(deh, cpu_key_k_offset(&entry_key));
+	deh->deh_state = 0;	/* JDM Endian safe if 0 */
+	/* put key (ino analog) to de */
+
+	/* safe: k_dir_id is le */
+	deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;
+	/* safe: k_objectid is le */
+	deh->deh_objectid = INODE_PKEY(inode)->k_objectid;
+
+	/* copy name */
+	memcpy((char *)(deh + 1), name, namelen);
+	/* padd by 0s to the 4 byte boundary */
+	padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
+
+	/*
+	 * entry is ready to be pasted into tree, set 'visibility'
+	 * and 'stat data in entry' attributes
+	 */
+	mark_de_without_sd(deh);
+	visible ? mark_de_visible(deh) : mark_de_hidden(deh);
+
+	/* find the proper place for the new entry */
+	memset(bit_string, 0, sizeof(bit_string));
+	de.de_gen_number_bit_string = bit_string;
+	retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
+	if (retval != NAME_NOT_FOUND) {
+		if (buffer != small_buf)
+			kfree(buffer);
+		pathrelse(&path);
+
+		if (retval == IO_ERROR) {
+			return -EIO;
+		}
+
+		if (retval != NAME_FOUND) {
+			reiserfs_error(dir->i_sb, "zam-7002",
+				       "reiserfs_find_entry() returned "
+				       "unexpected value (%d)", retval);
+		}
+
+		return -EEXIST;
+	}
+
+	gen_number =
+	    find_first_zero_bit(bit_string,
+				MAX_GENERATION_NUMBER + 1);
+	if (gen_number > MAX_GENERATION_NUMBER) {
+		/* there is no free generation number */
+		reiserfs_warning(dir->i_sb, "reiserfs-7010",
+				 "Congratulations! we have got hash function "
+				 "screwed up");
+		if (buffer != small_buf)
+			kfree(buffer);
+		pathrelse(&path);
+		return -EBUSY;
+	}
+	/* adjust offset of directory enrty */
+	put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
+	set_cpu_key_k_offset(&entry_key, deh_offset(deh));
+
+	/* update max-hash-collisions counter in reiserfs_sb_info */
+	PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
+
+	/* we need to re-search for the insertion point */
+	if (gen_number != 0) {
+		if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
+		    NAME_NOT_FOUND) {
+			reiserfs_warning(dir->i_sb, "vs-7032",
+					 "entry with this key (%K) already "
+					 "exists", &entry_key);
+
+			if (buffer != small_buf)
+				kfree(buffer);
+			pathrelse(&path);
+			return -EBUSY;
+		}
+	}
+
+	/* perform the insertion of the entry that we have prepared */
+	retval =
+	    reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
+				     paste_size);
+	if (buffer != small_buf)
+		kfree(buffer);
+	if (retval) {
+		reiserfs_check_path(&path);
+		return retval;
+	}
+
+	dir->i_size += paste_size;
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	if (!S_ISDIR(inode->i_mode) && visible)
+		/* reiserfs_mkdir or reiserfs_rename will do that by itself */
+		reiserfs_update_sd(th, dir);
+
+	reiserfs_check_path(&path);
+	return 0;
+}
+
+/*
+ * quota utility function, call if you've had to abort after calling
+ * new_inode_init, and have not called reiserfs_new_inode yet.
+ * This should only be called on inodes that do not have stat data
+ * inserted into the tree yet.
+ */
+static int drop_new_inode(struct inode *inode)
+{
+	dquot_drop(inode);
+	make_bad_inode(inode);
+	inode->i_flags |= S_NOQUOTA;
+	iput(inode);
+	return 0;
+}
+
+/*
+ * utility function that does setup for reiserfs_new_inode.
+ * dquot_initialize needs lots of credits so it's better to have it
+ * outside of a transaction, so we had to pull some bits of
+ * reiserfs_new_inode out into this func.
+ */
+static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
+{
+	/*
+	 * Make inode invalid - just in case we are going to drop it before
+	 * the initialization happens
+	 */
+	INODE_PKEY(inode)->k_objectid = 0;
+
+	/*
+	 * the quota init calls have to know who to charge the quota to, so
+	 * we have to set uid and gid here
+	 */
+	inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
+	return 0;
+}
+
+static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			   bool excl)
+{
+	int retval;
+	struct inode *inode;
+	/*
+	 * We need blocks for transaction + (user+group)*(quotas
+	 * for new inode + update of quota for directory owner)
+	 */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 2 +
+	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
+		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_security_handle security;
+
+	dquot_initialize(dir);
+
+	if (!(inode = new_inode(dir->i_sb))) {
+		return -ENOMEM;
+	}
+	new_inode_init(inode, dir, mode);
+
+	jbegin_count += reiserfs_cache_default_acl(dir);
+	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
+	if (retval < 0) {
+		drop_new_inode(inode);
+		return retval;
+	}
+	jbegin_count += retval;
+	reiserfs_write_lock(dir->i_sb);
+
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_new_inode(inode);
+		goto out_failed;
+	}
+
+	retval =
+	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
+			       inode, &security);
+	if (retval)
+		goto out_failed;
+
+	inode->i_op = &reiserfs_file_inode_operations;
+	inode->i_fop = &reiserfs_file_operations;
+	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+
+	retval =
+	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
+			       dentry->d_name.len, inode, 1 /*visible */ );
+	if (retval) {
+		int err;
+		drop_nlink(inode);
+		reiserfs_update_sd(&th, inode);
+		err = journal_end(&th);
+		if (err)
+			retval = err;
+		unlock_new_inode(inode);
+		iput(inode);
+		goto out_failed;
+	}
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th);
+
+out_failed:
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+}
+
+static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+			  dev_t rdev)
+{
+	int retval;
+	struct inode *inode;
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_security_handle security;
+	/*
+	 * We need blocks for transaction + (user+group)*(quotas
+	 * for new inode + update of quota for directory owner)
+	 */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 +
+	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
+		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	dquot_initialize(dir);
+
+	if (!(inode = new_inode(dir->i_sb))) {
+		return -ENOMEM;
+	}
+	new_inode_init(inode, dir, mode);
+
+	jbegin_count += reiserfs_cache_default_acl(dir);
+	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
+	if (retval < 0) {
+		drop_new_inode(inode);
+		return retval;
+	}
+	jbegin_count += retval;
+	reiserfs_write_lock(dir->i_sb);
+
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_new_inode(inode);
+		goto out_failed;
+	}
+
+	retval =
+	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
+			       inode, &security);
+	if (retval) {
+		goto out_failed;
+	}
+
+	inode->i_op = &reiserfs_special_inode_operations;
+	init_special_inode(inode, inode->i_mode, rdev);
+
+	/* FIXME: needed for block and char devices only */
+	reiserfs_update_sd(&th, inode);
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	retval =
+	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
+			       dentry->d_name.len, inode, 1 /*visible */ );
+	if (retval) {
+		int err;
+		drop_nlink(inode);
+		reiserfs_update_sd(&th, inode);
+		err = journal_end(&th);
+		if (err)
+			retval = err;
+		unlock_new_inode(inode);
+		iput(inode);
+		goto out_failed;
+	}
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th);
+
+out_failed:
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+}
+
+static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int retval;
+	struct inode *inode;
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_security_handle security;
+	/*
+	 * We need blocks for transaction + (user+group)*(quotas
+	 * for new inode + update of quota for directory owner)
+	 */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 +
+	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
+		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
+
+	dquot_initialize(dir);
+
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+	/*
+	 * set flag that new packing locality created and new blocks
+	 * for the content of that directory are not displaced yet
+	 */
+	REISERFS_I(dir)->new_packing_locality = 1;
+#endif
+	mode = S_IFDIR | mode;
+	if (!(inode = new_inode(dir->i_sb))) {
+		return -ENOMEM;
+	}
+	new_inode_init(inode, dir, mode);
+
+	jbegin_count += reiserfs_cache_default_acl(dir);
+	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
+	if (retval < 0) {
+		drop_new_inode(inode);
+		return retval;
+	}
+	jbegin_count += retval;
+	reiserfs_write_lock(dir->i_sb);
+
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_new_inode(inode);
+		goto out_failed;
+	}
+
+	/*
+	 * inc the link count now, so another writer doesn't overflow
+	 * it while we sleep later on.
+	 */
+	INC_DIR_INODE_NLINK(dir)
+
+	    retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ ,
+					old_format_only(dir->i_sb) ?
+					EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
+					dentry, inode, &security);
+	if (retval) {
+		DEC_DIR_INODE_NLINK(dir)
+		goto out_failed;
+	}
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	inode->i_op = &reiserfs_dir_inode_operations;
+	inode->i_fop = &reiserfs_dir_operations;
+
+	/* note, _this_ add_entry will not update dir's stat data */
+	retval =
+	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
+			       dentry->d_name.len, inode, 1 /*visible */ );
+	if (retval) {
+		int err;
+		clear_nlink(inode);
+		DEC_DIR_INODE_NLINK(dir);
+		reiserfs_update_sd(&th, inode);
+		err = journal_end(&th);
+		if (err)
+			retval = err;
+		unlock_new_inode(inode);
+		iput(inode);
+		goto out_failed;
+	}
+	/* the above add_entry did not update dir's stat data */
+	reiserfs_update_sd(&th, dir);
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th);
+out_failed:
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+}
+
+static inline int reiserfs_empty_dir(struct inode *inode)
+{
+	/*
+	 * we can cheat because an old format dir cannot have
+	 * EMPTY_DIR_SIZE, and a new format dir cannot have
+	 * EMPTY_DIR_SIZE_V1.  So, if the inode is either size,
+	 * regardless of disk format version, the directory is empty.
+	 */
+	if (inode->i_size != EMPTY_DIR_SIZE &&
+	    inode->i_size != EMPTY_DIR_SIZE_V1) {
+		return 0;
+	}
+	return 1;
+}
+
+static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int retval, err;
+	struct inode *inode;
+	struct reiserfs_transaction_handle th;
+	int jbegin_count;
+	INITIALIZE_PATH(path);
+	struct reiserfs_dir_entry de;
+
+	/*
+	 * we will be doing 2 balancings and update 2 stat data, we
+	 * change quotas of the owner of the directory and of the owner
+	 * of the parent directory.  The quota structure is possibly
+	 * deleted only on last iput => outside of this transaction
+	 */
+	jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
+	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+
+	dquot_initialize(dir);
+
+	reiserfs_write_lock(dir->i_sb);
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval)
+		goto out_rmdir;
+
+	de.de_gen_number_bit_string = NULL;
+	if ((retval =
+	     reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
+				 &path, &de)) == NAME_NOT_FOUND) {
+		retval = -ENOENT;
+		goto end_rmdir;
+	} else if (retval == IO_ERROR) {
+		retval = -EIO;
+		goto end_rmdir;
+	}
+
+	inode = d_inode(dentry);
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	if (de.de_objectid != inode->i_ino) {
+		/*
+		 * FIXME: compare key of an object and a key found in the entry
+		 */
+		retval = -EIO;
+		goto end_rmdir;
+	}
+	if (!reiserfs_empty_dir(inode)) {
+		retval = -ENOTEMPTY;
+		goto end_rmdir;
+	}
+
+	/* cut entry from dir directory */
+	retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key,
+					dir, NULL,	/* page */
+					0 /*new file size - not used here */ );
+	if (retval < 0)
+		goto end_rmdir;
+
+	if (inode->i_nlink != 2 && inode->i_nlink != 1)
+		reiserfs_error(inode->i_sb, "reiserfs-7040",
+			       "empty directory has nlink != 2 (%d)",
+			       inode->i_nlink);
+
+	clear_nlink(inode);
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	reiserfs_update_sd(&th, inode);
+
+	DEC_DIR_INODE_NLINK(dir)
+	    dir->i_size -= (DEH_SIZE + de.de_entrylen);
+	reiserfs_update_sd(&th, dir);
+
+	/* prevent empty directory from getting lost */
+	add_save_link(&th, inode, 0 /* not truncate */ );
+
+	retval = journal_end(&th);
+	reiserfs_check_path(&path);
+out_rmdir:
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+
+end_rmdir:
+	/*
+	 * we must release path, because we did not call
+	 * reiserfs_cut_from_item, or reiserfs_cut_from_item does not
+	 * release path if operation was not complete
+	 */
+	pathrelse(&path);
+	err = journal_end(&th);
+	reiserfs_write_unlock(dir->i_sb);
+	return err ? err : retval;
+}
+
+static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int retval, err;
+	struct inode *inode;
+	struct reiserfs_dir_entry de;
+	INITIALIZE_PATH(path);
+	struct reiserfs_transaction_handle th;
+	int jbegin_count;
+	unsigned long savelink;
+
+	dquot_initialize(dir);
+
+	inode = d_inode(dentry);
+
+	/*
+	 * in this transaction we can be doing at max two balancings and
+	 * update two stat datas, we change quotas of the owner of the
+	 * directory and of the owner of the parent directory. The quota
+	 * structure is possibly deleted only on iput => outside of
+	 * this transaction
+	 */
+	jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
+	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+
+	reiserfs_write_lock(dir->i_sb);
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval)
+		goto out_unlink;
+
+	de.de_gen_number_bit_string = NULL;
+	if ((retval =
+	     reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
+				 &path, &de)) == NAME_NOT_FOUND) {
+		retval = -ENOENT;
+		goto end_unlink;
+	} else if (retval == IO_ERROR) {
+		retval = -EIO;
+		goto end_unlink;
+	}
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	if (de.de_objectid != inode->i_ino) {
+		/*
+		 * FIXME: compare key of an object and a key found in the entry
+		 */
+		retval = -EIO;
+		goto end_unlink;
+	}
+
+	if (!inode->i_nlink) {
+		reiserfs_warning(inode->i_sb, "reiserfs-7042",
+				 "deleting nonexistent file (%lu), %d",
+				 inode->i_ino, inode->i_nlink);
+		set_nlink(inode, 1);
+	}
+
+	drop_nlink(inode);
+
+	/*
+	 * we schedule before doing the add_save_link call, save the link
+	 * count so we don't race
+	 */
+	savelink = inode->i_nlink;
+
+	retval =
+	    reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL,
+				   0);
+	if (retval < 0) {
+		inc_nlink(inode);
+		goto end_unlink;
+	}
+	inode->i_ctime = CURRENT_TIME_SEC;
+	reiserfs_update_sd(&th, inode);
+
+	dir->i_size -= (de.de_entrylen + DEH_SIZE);
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	reiserfs_update_sd(&th, dir);
+
+	if (!savelink)
+		/* prevent file from getting lost */
+		add_save_link(&th, inode, 0 /* not truncate */ );
+
+	retval = journal_end(&th);
+	reiserfs_check_path(&path);
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+
+end_unlink:
+	pathrelse(&path);
+	err = journal_end(&th);
+	reiserfs_check_path(&path);
+	if (err)
+		retval = err;
+out_unlink:
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+}
+
+static int reiserfs_symlink(struct inode *parent_dir,
+			    struct dentry *dentry, const char *symname)
+{
+	int retval;
+	struct inode *inode;
+	char *name;
+	int item_len;
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_security_handle security;
+	int mode = S_IFLNK | S_IRWXUGO;
+	/*
+	 * We need blocks for transaction + (user+group)*(quotas for
+	 * new inode + update of quota for directory owner)
+	 */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 +
+	    2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
+		 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
+
+	dquot_initialize(parent_dir);
+
+	if (!(inode = new_inode(parent_dir->i_sb))) {
+		return -ENOMEM;
+	}
+	new_inode_init(inode, parent_dir, mode);
+
+	retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
+					&security);
+	if (retval < 0) {
+		drop_new_inode(inode);
+		return retval;
+	}
+	jbegin_count += retval;
+
+	reiserfs_write_lock(parent_dir->i_sb);
+	item_len = ROUND_UP(strlen(symname));
+	if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
+		retval = -ENAMETOOLONG;
+		drop_new_inode(inode);
+		goto out_failed;
+	}
+
+	name = kmalloc(item_len, GFP_NOFS);
+	if (!name) {
+		drop_new_inode(inode);
+		retval = -ENOMEM;
+		goto out_failed;
+	}
+	memcpy(name, symname, strlen(symname));
+	padd_item(name, item_len, strlen(symname));
+
+	retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_new_inode(inode);
+		kfree(name);
+		goto out_failed;
+	}
+
+	retval =
+	    reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
+			       dentry, inode, &security);
+	kfree(name);
+	if (retval) {		/* reiserfs_new_inode iputs for us */
+		goto out_failed;
+	}
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(parent_dir);
+
+	inode->i_op = &reiserfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+
+	retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
+				    dentry->d_name.len, inode, 1 /*visible */ );
+	if (retval) {
+		int err;
+		drop_nlink(inode);
+		reiserfs_update_sd(&th, inode);
+		err = journal_end(&th);
+		if (err)
+			retval = err;
+		unlock_new_inode(inode);
+		iput(inode);
+		goto out_failed;
+	}
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th);
+out_failed:
+	reiserfs_write_unlock(parent_dir->i_sb);
+	return retval;
+}
+
+static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
+			 struct dentry *dentry)
+{
+	int retval;
+	struct inode *inode = d_inode(old_dentry);
+	struct reiserfs_transaction_handle th;
+	/*
+	 * We need blocks for transaction + update of quotas for
+	 * the owners of the directory
+	 */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 +
+	    2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+
+	dquot_initialize(dir);
+
+	reiserfs_write_lock(dir->i_sb);
+	if (inode->i_nlink >= REISERFS_LINK_MAX) {
+		/* FIXME: sd_nlink is 32 bit for new files */
+		reiserfs_write_unlock(dir->i_sb);
+		return -EMLINK;
+	}
+
+	/* inc before scheduling so reiserfs_unlink knows we are here */
+	inc_nlink(inode);
+
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_nlink(inode);
+		reiserfs_write_unlock(dir->i_sb);
+		return retval;
+	}
+
+	/* create new entry */
+	retval =
+	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
+			       dentry->d_name.len, inode, 1 /*visible */ );
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	if (retval) {
+		int err;
+		drop_nlink(inode);
+		err = journal_end(&th);
+		reiserfs_write_unlock(dir->i_sb);
+		return err ? err : retval;
+	}
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	reiserfs_update_sd(&th, inode);
+
+	ihold(inode);
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th);
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+}
+
+/* de contains information pointing to an entry which */
+static int de_still_valid(const char *name, int len,
+			  struct reiserfs_dir_entry *de)
+{
+	struct reiserfs_dir_entry tmp = *de;
+
+	/* recalculate pointer to name and name length */
+	set_de_name_and_namelen(&tmp);
+	/* FIXME: could check more */
+	if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
+		return 0;
+	return 1;
+}
+
+static int entry_points_to_object(const char *name, int len,
+				  struct reiserfs_dir_entry *de,
+				  struct inode *inode)
+{
+	if (!de_still_valid(name, len, de))
+		return 0;
+
+	if (inode) {
+		if (!de_visible(de->de_deh + de->de_entry_num))
+			reiserfs_panic(inode->i_sb, "vs-7042",
+				       "entry must be visible");
+		return (de->de_objectid == inode->i_ino) ? 1 : 0;
+	}
+
+	/* this must be added hidden entry */
+	if (de_visible(de->de_deh + de->de_entry_num))
+		reiserfs_panic(NULL, "vs-7043", "entry must be visible");
+
+	return 1;
+}
+
+/* sets key of objectid the entry has to point to */
+static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
+				 struct reiserfs_key *key)
+{
+	/* JDM These operations are endian safe - both are le */
+	de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
+	de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
+}
+
+/*
+ * process, that is going to call fix_nodes/do_balance must hold only
+ * one path. If it holds 2 or more, it can get into endless waiting in
+ * get_empty_nodes or its clones
+ */
+static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry)
+{
+	int retval;
+	INITIALIZE_PATH(old_entry_path);
+	INITIALIZE_PATH(new_entry_path);
+	INITIALIZE_PATH(dot_dot_entry_path);
+	struct item_head new_entry_ih, old_entry_ih, dot_dot_ih;
+	struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
+	struct inode *old_inode, *new_dentry_inode;
+	struct reiserfs_transaction_handle th;
+	int jbegin_count;
+	umode_t old_inode_mode;
+	unsigned long savelink = 1;
+	struct timespec ctime;
+
+	/*
+	 * three balancings: (1) old name removal, (2) new name insertion
+	 * and (3) maybe "save" link insertion
+	 * stat data updates: (1) old directory,
+	 * (2) new directory and (3) maybe old object stat data (when it is
+	 * directory) and (4) maybe stat data of object to which new entry
+	 * pointed initially and (5) maybe block containing ".." of
+	 * renamed directory
+	 * quota updates: two parent directories
+	 */
+	jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 + 5 +
+	    4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
+
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
+	old_inode = d_inode(old_dentry);
+	new_dentry_inode = d_inode(new_dentry);
+
+	/*
+	 * make sure that oldname still exists and points to an object we
+	 * are going to rename
+	 */
+	old_de.de_gen_number_bit_string = NULL;
+	reiserfs_write_lock(old_dir->i_sb);
+	retval =
+	    reiserfs_find_entry(old_dir, old_dentry->d_name.name,
+				old_dentry->d_name.len, &old_entry_path,
+				&old_de);
+	pathrelse(&old_entry_path);
+	if (retval == IO_ERROR) {
+		reiserfs_write_unlock(old_dir->i_sb);
+		return -EIO;
+	}
+
+	if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
+		reiserfs_write_unlock(old_dir->i_sb);
+		return -ENOENT;
+	}
+
+	old_inode_mode = old_inode->i_mode;
+	if (S_ISDIR(old_inode_mode)) {
+		/*
+		 * make sure that directory being renamed has correct ".."
+		 * and that its new parent directory has not too many links
+		 * already
+		 */
+		if (new_dentry_inode) {
+			if (!reiserfs_empty_dir(new_dentry_inode)) {
+				reiserfs_write_unlock(old_dir->i_sb);
+				return -ENOTEMPTY;
+			}
+		}
+
+		/*
+		 * directory is renamed, its parent directory will be changed,
+		 * so find ".." entry
+		 */
+		dot_dot_de.de_gen_number_bit_string = NULL;
+		retval =
+		    reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path,
+					&dot_dot_de);
+		pathrelse(&dot_dot_entry_path);
+		if (retval != NAME_FOUND) {
+			reiserfs_write_unlock(old_dir->i_sb);
+			return -EIO;
+		}
+
+		/* inode number of .. must equal old_dir->i_ino */
+		if (dot_dot_de.de_objectid != old_dir->i_ino) {
+			reiserfs_write_unlock(old_dir->i_sb);
+			return -EIO;
+		}
+	}
+
+	retval = journal_begin(&th, old_dir->i_sb, jbegin_count);
+	if (retval) {
+		reiserfs_write_unlock(old_dir->i_sb);
+		return retval;
+	}
+
+	/* add new entry (or find the existing one) */
+	retval =
+	    reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name,
+			       new_dentry->d_name.len, old_inode, 0);
+	if (retval == -EEXIST) {
+		if (!new_dentry_inode) {
+			reiserfs_panic(old_dir->i_sb, "vs-7050",
+				       "new entry is found, new inode == 0");
+		}
+	} else if (retval) {
+		int err = journal_end(&th);
+		reiserfs_write_unlock(old_dir->i_sb);
+		return err ? err : retval;
+	}
+
+	reiserfs_update_inode_transaction(old_dir);
+	reiserfs_update_inode_transaction(new_dir);
+
+	/*
+	 * this makes it so an fsync on an open fd for the old name will
+	 * commit the rename operation
+	 */
+	reiserfs_update_inode_transaction(old_inode);
+
+	if (new_dentry_inode)
+		reiserfs_update_inode_transaction(new_dentry_inode);
+
+	while (1) {
+		/*
+		 * look for old name using corresponding entry key
+		 * (found by reiserfs_find_entry)
+		 */
+		if ((retval =
+		     search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
+					 &old_entry_path,
+					 &old_de)) != NAME_FOUND) {
+			pathrelse(&old_entry_path);
+			journal_end(&th);
+			reiserfs_write_unlock(old_dir->i_sb);
+			return -EIO;
+		}
+
+		copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path));
+
+		reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
+
+		/* look for new name by reiserfs_find_entry */
+		new_de.de_gen_number_bit_string = NULL;
+		retval =
+		    reiserfs_find_entry(new_dir, new_dentry->d_name.name,
+					new_dentry->d_name.len, &new_entry_path,
+					&new_de);
+		/*
+		 * reiserfs_add_entry should not return IO_ERROR,
+		 * because it is called with essentially same parameters from
+		 * reiserfs_add_entry above, and we'll catch any i/o errors
+		 * before we get here.
+		 */
+		if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
+			pathrelse(&new_entry_path);
+			pathrelse(&old_entry_path);
+			journal_end(&th);
+			reiserfs_write_unlock(old_dir->i_sb);
+			return -EIO;
+		}
+
+		copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path));
+
+		reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
+
+		if (S_ISDIR(old_inode->i_mode)) {
+			if ((retval =
+			     search_by_entry_key(new_dir->i_sb,
+						 &dot_dot_de.de_entry_key,
+						 &dot_dot_entry_path,
+						 &dot_dot_de)) != NAME_FOUND) {
+				pathrelse(&dot_dot_entry_path);
+				pathrelse(&new_entry_path);
+				pathrelse(&old_entry_path);
+				journal_end(&th);
+				reiserfs_write_unlock(old_dir->i_sb);
+				return -EIO;
+			}
+			copy_item_head(&dot_dot_ih,
+				       tp_item_head(&dot_dot_entry_path));
+			/* node containing ".." gets into transaction */
+			reiserfs_prepare_for_journal(old_inode->i_sb,
+						     dot_dot_de.de_bh, 1);
+		}
+		/*
+		 * we should check seals here, not do
+		 * this stuff, yes? Then, having
+		 * gathered everything into RAM we
+		 * should lock the buffers, yes?  -Hans
+		 */
+		/*
+		 * probably.  our rename needs to hold more
+		 * than one path at once.  The seals would
+		 * have to be written to deal with multi-path
+		 * issues -chris
+		 */
+		/*
+		 * sanity checking before doing the rename - avoid races many
+		 * of the above checks could have scheduled.  We have to be
+		 * sure our items haven't been shifted by another process.
+		 */
+		if (item_moved(&new_entry_ih, &new_entry_path) ||
+		    !entry_points_to_object(new_dentry->d_name.name,
+					    new_dentry->d_name.len,
+					    &new_de, new_dentry_inode) ||
+		    item_moved(&old_entry_ih, &old_entry_path) ||
+		    !entry_points_to_object(old_dentry->d_name.name,
+					    old_dentry->d_name.len,
+					    &old_de, old_inode)) {
+			reiserfs_restore_prepared_buffer(old_inode->i_sb,
+							 new_de.de_bh);
+			reiserfs_restore_prepared_buffer(old_inode->i_sb,
+							 old_de.de_bh);
+			if (S_ISDIR(old_inode_mode))
+				reiserfs_restore_prepared_buffer(old_inode->
+								 i_sb,
+								 dot_dot_de.
+								 de_bh);
+			continue;
+		}
+		if (S_ISDIR(old_inode_mode)) {
+			if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
+			    !entry_points_to_object("..", 2, &dot_dot_de,
+						    old_dir)) {
+				reiserfs_restore_prepared_buffer(old_inode->
+								 i_sb,
+								 old_de.de_bh);
+				reiserfs_restore_prepared_buffer(old_inode->
+								 i_sb,
+								 new_de.de_bh);
+				reiserfs_restore_prepared_buffer(old_inode->
+								 i_sb,
+								 dot_dot_de.
+								 de_bh);
+				continue;
+			}
+		}
+
+		RFALSE(S_ISDIR(old_inode_mode) &&
+		       !buffer_journal_prepared(dot_dot_de.de_bh), "");
+
+		break;
+	}
+
+	/*
+	 * ok, all the changes can be done in one fell swoop when we
+	 * have claimed all the buffers needed.
+	 */
+
+	mark_de_visible(new_de.de_deh + new_de.de_entry_num);
+	set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
+	journal_mark_dirty(&th, new_de.de_bh);
+
+	mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
+	journal_mark_dirty(&th, old_de.de_bh);
+	ctime = CURRENT_TIME_SEC;
+	old_dir->i_ctime = old_dir->i_mtime = ctime;
+	new_dir->i_ctime = new_dir->i_mtime = ctime;
+	/*
+	 * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
+	 * which adds ctime update of renamed object
+	 */
+	old_inode->i_ctime = ctime;
+
+	if (new_dentry_inode) {
+		/* adjust link number of the victim */
+		if (S_ISDIR(new_dentry_inode->i_mode)) {
+			clear_nlink(new_dentry_inode);
+		} else {
+			drop_nlink(new_dentry_inode);
+		}
+		new_dentry_inode->i_ctime = ctime;
+		savelink = new_dentry_inode->i_nlink;
+	}
+
+	if (S_ISDIR(old_inode_mode)) {
+		/* adjust ".." of renamed directory */
+		set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
+		journal_mark_dirty(&th, dot_dot_de.de_bh);
+
+		/*
+		 * there (in new_dir) was no directory, so it got new link
+		 * (".."  of renamed directory)
+		 */
+		if (!new_dentry_inode)
+			INC_DIR_INODE_NLINK(new_dir);
+
+		/* old directory lost one link - ".. " of renamed directory */
+		DEC_DIR_INODE_NLINK(old_dir);
+	}
+	/*
+	 * looks like in 2.3.99pre3 brelse is atomic.
+	 * so we can use pathrelse
+	 */
+	pathrelse(&new_entry_path);
+	pathrelse(&dot_dot_entry_path);
+
+	/*
+	 * FIXME: this reiserfs_cut_from_item's return value may screw up
+	 * anybody, but it will panic if will not be able to find the
+	 * entry. This needs one more clean up
+	 */
+	if (reiserfs_cut_from_item
+	    (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL,
+	     0) < 0)
+		reiserfs_error(old_dir->i_sb, "vs-7060",
+			       "couldn't not cut old name. Fsck later?");
+
+	old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
+
+	reiserfs_update_sd(&th, old_dir);
+	reiserfs_update_sd(&th, new_dir);
+	reiserfs_update_sd(&th, old_inode);
+
+	if (new_dentry_inode) {
+		if (savelink == 0)
+			add_save_link(&th, new_dentry_inode,
+				      0 /* not truncate */ );
+		reiserfs_update_sd(&th, new_dentry_inode);
+	}
+
+	retval = journal_end(&th);
+	reiserfs_write_unlock(old_dir->i_sb);
+	return retval;
+}
+
+/* directories can handle most operations...  */
+const struct inode_operations reiserfs_dir_inode_operations = {
+	.create = reiserfs_create,
+	.lookup = reiserfs_lookup,
+	.link = reiserfs_link,
+	.unlink = reiserfs_unlink,
+	.symlink = reiserfs_symlink,
+	.mkdir = reiserfs_mkdir,
+	.rmdir = reiserfs_rmdir,
+	.mknod = reiserfs_mknod,
+	.rename = reiserfs_rename,
+	.setattr = reiserfs_setattr,
+	.setxattr = reiserfs_setxattr,
+	.getxattr = reiserfs_getxattr,
+	.listxattr = reiserfs_listxattr,
+	.removexattr = reiserfs_removexattr,
+	.permission = reiserfs_permission,
+	.get_acl = reiserfs_get_acl,
+	.set_acl = reiserfs_set_acl,
+};
+
+/*
+ * symlink operations.. same as page_symlink_inode_operations, with xattr
+ * stuff added
+ */
+const struct inode_operations reiserfs_symlink_inode_operations = {
+	.readlink = generic_readlink,
+	.follow_link = page_follow_link_light,
+	.put_link = page_put_link,
+	.setattr = reiserfs_setattr,
+	.setxattr = reiserfs_setxattr,
+	.getxattr = reiserfs_getxattr,
+	.listxattr = reiserfs_listxattr,
+	.removexattr = reiserfs_removexattr,
+	.permission = reiserfs_permission,
+};
+
+/*
+ * special file operations.. just xattr/acl stuff
+ */
+const struct inode_operations reiserfs_special_inode_operations = {
+	.setattr = reiserfs_setattr,
+	.setxattr = reiserfs_setxattr,
+	.getxattr = reiserfs_getxattr,
+	.listxattr = reiserfs_listxattr,
+	.removexattr = reiserfs_removexattr,
+	.permission = reiserfs_permission,
+	.get_acl = reiserfs_get_acl,
+	.set_acl = reiserfs_set_acl,
+};
diff --git a/kernel/fs/reiserfs/objectid.c b/kernel/fs/reiserfs/objectid.c
new file mode 100644
index 000000000..99a5d5dae
--- /dev/null
+++ b/kernel/fs/reiserfs/objectid.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/string.h>
+#include <linux/random.h>
+#include <linux/time.h>
+#include "reiserfs.h"
+
+/* find where objectid map starts */
+#define objectid_map(s,rs) (old_format_only (s) ? \
+                         (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
+			 (__le32 *)((rs) + 1))
+
+#ifdef CONFIG_REISERFS_CHECK
+
+static void check_objectid_map(struct super_block *s, __le32 * map)
+{
+	if (le32_to_cpu(map[0]) != 1)
+		reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
+			       (long unsigned int)le32_to_cpu(map[0]));
+
+	/* FIXME: add something else here */
+}
+
+#else
+static void check_objectid_map(struct super_block *s, __le32 * map)
+{;
+}
+#endif
+
+/*
+ * When we allocate objectids we allocate the first unused objectid.
+ * Each sequence of objectids in use (the odd sequences) is followed
+ * by a sequence of objectids not in use (the even sequences).  We
+ * only need to record the last objectid in each of these sequences
+ * (both the odd and even sequences) in order to fully define the
+ * boundaries of the sequences.  A consequence of allocating the first
+ * objectid not in use is that under most conditions this scheme is
+ * extremely compact.  The exception is immediately after a sequence
+ * of operations which deletes a large number of objects of
+ * non-sequential objectids, and even then it will become compact
+ * again as soon as more objects are created.  Note that many
+ * interesting optimizations of layout could result from complicating
+ * objectid assignment, but we have deferred making them for now.
+ */
+
+/* get unique object identifier */
+__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
+{
+	struct super_block *s = th->t_super;
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	__le32 *map = objectid_map(s, rs);
+	__u32 unused_objectid;
+
+	BUG_ON(!th->t_trans_id);
+
+	check_objectid_map(s, map);
+
+	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+	/* comment needed -Hans */
+	unused_objectid = le32_to_cpu(map[1]);
+	if (unused_objectid == U32_MAX) {
+		reiserfs_warning(s, "reiserfs-15100", "no more object ids");
+		reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
+		return 0;
+	}
+
+	/*
+	 * This incrementation allocates the first unused objectid. That
+	 * is to say, the first entry on the objectid map is the first
+	 * unused objectid, and by incrementing it we use it.  See below
+	 * where we check to see if we eliminated a sequence of unused
+	 * objectids....
+	 */
+	map[1] = cpu_to_le32(unused_objectid + 1);
+
+	/*
+	 * Now we check to see if we eliminated the last remaining member of
+	 * the first even sequence (and can eliminate the sequence by
+	 * eliminating its last objectid from oids), and can collapse the
+	 * first two odd sequences into one sequence.  If so, then the net
+	 * result is to eliminate a pair of objectids from oids.  We do this
+	 * by shifting the entire map to the left.
+	 */
+	if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
+		memmove(map + 1, map + 3,
+			(sb_oid_cursize(rs) - 3) * sizeof(__u32));
+		set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
+	}
+
+	journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
+	return unused_objectid;
+}
+
+/* makes object identifier unused */
+void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
+			       __u32 objectid_to_release)
+{
+	struct super_block *s = th->t_super;
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	__le32 *map = objectid_map(s, rs);
+	int i = 0;
+
+	BUG_ON(!th->t_trans_id);
+	/*return; */
+	check_objectid_map(s, map);
+
+	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+	journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
+
+	/*
+	 * start at the beginning of the objectid map (i = 0) and go to
+	 * the end of it (i = disk_sb->s_oid_cursize).  Linear search is
+	 * what we use, though it is possible that binary search would be
+	 * more efficient after performing lots of deletions (which is
+	 * when oids is large.)  We only check even i's.
+	 */
+	while (i < sb_oid_cursize(rs)) {
+		if (objectid_to_release == le32_to_cpu(map[i])) {
+			/* This incrementation unallocates the objectid. */
+			le32_add_cpu(&map[i], 1);
+
+			/*
+			 * Did we unallocate the last member of an
+			 * odd sequence, and can shrink oids?
+			 */
+			if (map[i] == map[i + 1]) {
+				/* shrink objectid map */
+				memmove(map + i, map + i + 2,
+					(sb_oid_cursize(rs) - i -
+					 2) * sizeof(__u32));
+				set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
+
+				RFALSE(sb_oid_cursize(rs) < 2 ||
+				       sb_oid_cursize(rs) > sb_oid_maxsize(rs),
+				       "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
+				       sb_oid_cursize(rs), sb_oid_maxsize(rs));
+			}
+			return;
+		}
+
+		if (objectid_to_release > le32_to_cpu(map[i]) &&
+		    objectid_to_release < le32_to_cpu(map[i + 1])) {
+			/* size of objectid map is not changed */
+			if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
+				le32_add_cpu(&map[i + 1], -1);
+				return;
+			}
+
+			/*
+			 * JDM comparing two little-endian values for
+			 * equality -- safe
+			 */
+			/*
+			 * objectid map must be expanded, but
+			 * there is no space
+			 */
+			if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
+				PROC_INFO_INC(s, leaked_oid);
+				return;
+			}
+
+			/* expand the objectid map */
+			memmove(map + i + 3, map + i + 1,
+				(sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
+			map[i + 1] = cpu_to_le32(objectid_to_release);
+			map[i + 2] = cpu_to_le32(objectid_to_release + 1);
+			set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2);
+			return;
+		}
+		i += 2;
+	}
+
+	reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)",
+		       (long unsigned)objectid_to_release);
+}
+
+int reiserfs_convert_objectid_map_v1(struct super_block *s)
+{
+	struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s);
+	int cur_size = sb_oid_cursize(disk_sb);
+	int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
+	int old_max = sb_oid_maxsize(disk_sb);
+	struct reiserfs_super_block_v1 *disk_sb_v1;
+	__le32 *objectid_map, *new_objectid_map;
+	int i;
+
+	disk_sb_v1 =
+	    (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
+	objectid_map = (__le32 *) (disk_sb_v1 + 1);
+	new_objectid_map = (__le32 *) (disk_sb + 1);
+
+	if (cur_size > new_size) {
+		/*
+		 * mark everyone used that was listed as free at
+		 * the end of the objectid map
+		 */
+		objectid_map[new_size - 1] = objectid_map[cur_size - 1];
+		set_sb_oid_cursize(disk_sb, new_size);
+	}
+	/* move the smaller objectid map past the end of the new super */
+	for (i = new_size - 1; i >= 0; i--) {
+		objectid_map[i + (old_max - new_size)] = objectid_map[i];
+	}
+
+	/* set the max size so we don't overflow later */
+	set_sb_oid_maxsize(disk_sb, new_size);
+
+	/* Zero out label and generate random UUID */
+	memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label));
+	generate_random_uuid(disk_sb->s_uuid);
+
+	/* finally, zero out the unused chunk of the new super */
+	memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused));
+	return 0;
+}
diff --git a/kernel/fs/reiserfs/prints.c b/kernel/fs/reiserfs/prints.c
new file mode 100644
index 000000000..ae1dc841d
--- /dev/null
+++ b/kernel/fs/reiserfs/prints.c
@@ -0,0 +1,777 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include "reiserfs.h"
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+
+#include <stdarg.h>
+
+static char error_buf[1024];
+static char fmt_buf[1024];
+static char off_buf[80];
+
+static char *reiserfs_cpu_offset(struct cpu_key *key)
+{
+	if (cpu_key_k_type(key) == TYPE_DIRENTRY)
+		sprintf(off_buf, "%llu(%llu)",
+			(unsigned long long)
+			GET_HASH_VALUE(cpu_key_k_offset(key)),
+			(unsigned long long)
+			GET_GENERATION_NUMBER(cpu_key_k_offset(key)));
+	else
+		sprintf(off_buf, "0x%Lx",
+			(unsigned long long)cpu_key_k_offset(key));
+	return off_buf;
+}
+
+static char *le_offset(struct reiserfs_key *key)
+{
+	int version;
+
+	version = le_key_version(key);
+	if (le_key_k_type(version, key) == TYPE_DIRENTRY)
+		sprintf(off_buf, "%llu(%llu)",
+			(unsigned long long)
+			GET_HASH_VALUE(le_key_k_offset(version, key)),
+			(unsigned long long)
+			GET_GENERATION_NUMBER(le_key_k_offset(version, key)));
+	else
+		sprintf(off_buf, "0x%Lx",
+			(unsigned long long)le_key_k_offset(version, key));
+	return off_buf;
+}
+
+static char *cpu_type(struct cpu_key *key)
+{
+	if (cpu_key_k_type(key) == TYPE_STAT_DATA)
+		return "SD";
+	if (cpu_key_k_type(key) == TYPE_DIRENTRY)
+		return "DIR";
+	if (cpu_key_k_type(key) == TYPE_DIRECT)
+		return "DIRECT";
+	if (cpu_key_k_type(key) == TYPE_INDIRECT)
+		return "IND";
+	return "UNKNOWN";
+}
+
+static char *le_type(struct reiserfs_key *key)
+{
+	int version;
+
+	version = le_key_version(key);
+
+	if (le_key_k_type(version, key) == TYPE_STAT_DATA)
+		return "SD";
+	if (le_key_k_type(version, key) == TYPE_DIRENTRY)
+		return "DIR";
+	if (le_key_k_type(version, key) == TYPE_DIRECT)
+		return "DIRECT";
+	if (le_key_k_type(version, key) == TYPE_INDIRECT)
+		return "IND";
+	return "UNKNOWN";
+}
+
+/* %k */
+static void sprintf_le_key(char *buf, struct reiserfs_key *key)
+{
+	if (key)
+		sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id),
+			le32_to_cpu(key->k_objectid), le_offset(key),
+			le_type(key));
+	else
+		sprintf(buf, "[NULL]");
+}
+
+/* %K */
+static void sprintf_cpu_key(char *buf, struct cpu_key *key)
+{
+	if (key)
+		sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id,
+			key->on_disk_key.k_objectid, reiserfs_cpu_offset(key),
+			cpu_type(key));
+	else
+		sprintf(buf, "[NULL]");
+}
+
+static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh)
+{
+	if (deh)
+		sprintf(buf,
+			"[offset=%d dir_id=%d objectid=%d location=%d state=%04x]",
+			deh_offset(deh), deh_dir_id(deh), deh_objectid(deh),
+			deh_location(deh), deh_state(deh));
+	else
+		sprintf(buf, "[NULL]");
+
+}
+
+static void sprintf_item_head(char *buf, struct item_head *ih)
+{
+	if (ih) {
+		strcpy(buf,
+		       (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*");
+		sprintf_le_key(buf + strlen(buf), &(ih->ih_key));
+		sprintf(buf + strlen(buf), ", item_len %d, item_location %d, "
+			"free_space(entry_count) %d",
+			ih_item_len(ih), ih_location(ih), ih_free_space(ih));
+	} else
+		sprintf(buf, "[NULL]");
+}
+
+static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de)
+{
+	char name[20];
+
+	memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
+	name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
+	sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid);
+}
+
+static void sprintf_block_head(char *buf, struct buffer_head *bh)
+{
+	sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ",
+		B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh));
+}
+
+static void sprintf_buffer_head(char *buf, struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+
+	sprintf(buf,
+		"dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+		bdevname(bh->b_bdev, b), bh->b_size,
+		(unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)),
+		bh->b_state, bh->b_page,
+		buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
+		buffer_dirty(bh) ? "DIRTY" : "CLEAN",
+		buffer_locked(bh) ? "LOCKED" : "UNLOCKED");
+}
+
+static void sprintf_disk_child(char *buf, struct disk_child *dc)
+{
+	sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc),
+		dc_size(dc));
+}
+
+static char *is_there_reiserfs_struct(char *fmt, int *what)
+{
+	char *k = fmt;
+
+	while ((k = strchr(k, '%')) != NULL) {
+		if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
+		    k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
+			*what = k[1];
+			break;
+		}
+		k++;
+	}
+	return k;
+}
+
+/*
+ * debugging reiserfs we used to print out a lot of different
+ * variables, like keys, item headers, buffer heads etc. Values of
+ * most fields matter. So it took a long time just to write
+ * appropriative printk. With this reiserfs_warning you can use format
+ * specification for complex structures like you used to do with
+ * printfs for integers, doubles and pointers. For instance, to print
+ * out key structure you have to write just:
+ * reiserfs_warning ("bad key %k", key);
+ * instead of
+ * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
+ *         key->k_offset, key->k_uniqueness);
+ */
+static DEFINE_SPINLOCK(error_lock);
+static void prepare_error_buf(const char *fmt, va_list args)
+{
+	char *fmt1 = fmt_buf;
+	char *k;
+	char *p = error_buf;
+	int what;
+
+	spin_lock(&error_lock);
+
+	strcpy(fmt1, fmt);
+
+	while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) {
+		*k = 0;
+
+		p += vsprintf(p, fmt1, args);
+
+		switch (what) {
+		case 'k':
+			sprintf_le_key(p, va_arg(args, struct reiserfs_key *));
+			break;
+		case 'K':
+			sprintf_cpu_key(p, va_arg(args, struct cpu_key *));
+			break;
+		case 'h':
+			sprintf_item_head(p, va_arg(args, struct item_head *));
+			break;
+		case 't':
+			sprintf_direntry(p,
+					 va_arg(args,
+						struct reiserfs_dir_entry *));
+			break;
+		case 'y':
+			sprintf_disk_child(p,
+					   va_arg(args, struct disk_child *));
+			break;
+		case 'z':
+			sprintf_block_head(p,
+					   va_arg(args, struct buffer_head *));
+			break;
+		case 'b':
+			sprintf_buffer_head(p,
+					    va_arg(args, struct buffer_head *));
+			break;
+		case 'a':
+			sprintf_de_head(p,
+					va_arg(args,
+					       struct reiserfs_de_head *));
+			break;
+		}
+
+		p += strlen(p);
+		fmt1 = k + 2;
+	}
+	vsprintf(p, fmt1, args);
+	spin_unlock(&error_lock);
+
+}
+
+/*
+ * in addition to usual conversion specifiers this accepts reiserfs
+ * specific conversion specifiers:
+ * %k to print little endian key,
+ * %K to print cpu key,
+ * %h to print item_head,
+ * %t to print directory entry
+ * %z to print block head (arg must be struct buffer_head *
+ * %b to print buffer_head
+ */
+
+#define do_reiserfs_warning(fmt)\
+{\
+    va_list args;\
+    va_start( args, fmt );\
+    prepare_error_buf( fmt, args );\
+    va_end( args );\
+}
+
+void __reiserfs_warning(struct super_block *sb, const char *id,
+			 const char *function, const char *fmt, ...)
+{
+	do_reiserfs_warning(fmt);
+	if (sb)
+		printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: "
+		       "%s\n", sb->s_id, id ? id : "", id ? " " : "",
+		       function, error_buf);
+	else
+		printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n",
+		       id ? id : "", id ? " " : "", function, error_buf);
+}
+
+/* No newline.. reiserfs_info calls can be followed by printk's */
+void reiserfs_info(struct super_block *sb, const char *fmt, ...)
+{
+	do_reiserfs_warning(fmt);
+	if (sb)
+		printk(KERN_NOTICE "REISERFS (device %s): %s",
+		       sb->s_id, error_buf);
+	else
+		printk(KERN_NOTICE "REISERFS %s:", error_buf);
+}
+
+/* No newline.. reiserfs_printk calls can be followed by printk's */
+static void reiserfs_printk(const char *fmt, ...)
+{
+	do_reiserfs_warning(fmt);
+	printk(error_buf);
+}
+
+void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
+{
+#ifdef CONFIG_REISERFS_CHECK
+	do_reiserfs_warning(fmt);
+	if (s)
+		printk(KERN_DEBUG "REISERFS debug (device %s): %s\n",
+		       s->s_id, error_buf);
+	else
+		printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf);
+#endif
+}
+
+/*
+ * The format:
+ *
+ *          maintainer-errorid: [function-name:] message
+ *
+ *   where errorid is unique to the maintainer and function-name is
+ *   optional, is recommended, so that anyone can easily find the bug
+ *   with a simple grep for the short to type string
+ *   maintainer-errorid.  Don't bother with reusing errorids, there are
+ *   lots of numbers out there.
+ *
+ *   Example:
+ *
+ *   reiserfs_panic(
+ *     p_sb, "reiser-29: reiserfs_new_blocknrs: "
+ *     "one of search_start or rn(%d) is equal to MAX_B_NUM,"
+ *     "which means that we are optimizing location based on the "
+ *     "bogus location of a temp buffer (%p).",
+ *     rn, bh
+ *   );
+ *
+ *   Regular panic()s sometimes clear the screen before the message can
+ *   be read, thus the need for the while loop.
+ *
+ *   Numbering scheme for panic used by Vladimir and Anatoly( Hans completely
+ *   ignores this scheme, and considers it pointless complexity):
+ *
+ *   panics in reiserfs_fs.h have numbers from 1000 to 1999
+ *   super.c			2000 to 2999
+ *   preserve.c (unused)	3000 to 3999
+ *   bitmap.c			4000 to 4999
+ *   stree.c			5000 to 5999
+ *   prints.c			6000 to 6999
+ *   namei.c			7000 to 7999
+ *   fix_nodes.c		8000 to 8999
+ *   dir.c			9000 to 9999
+ *   lbalance.c			10000 to 10999
+ *   ibalance.c			11000 to 11999 not ready
+ *   do_balan.c			12000 to 12999
+ *   inode.c			13000 to 13999
+ *   file.c			14000 to 14999
+ *   objectid.c			15000 - 15999
+ *   buffer.c			16000 - 16999
+ *   symlink.c			17000 - 17999
+ *
+ *  .  */
+
+void __reiserfs_panic(struct super_block *sb, const char *id,
+		      const char *function, const char *fmt, ...)
+{
+	do_reiserfs_warning(fmt);
+
+#ifdef CONFIG_REISERFS_CHECK
+	dump_stack();
+#endif
+	if (sb)
+		printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
+		      sb->s_id, id ? id : "", id ? " " : "",
+		      function, error_buf);
+	else
+		printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
+		      id ? id : "", id ? " " : "", function, error_buf);
+	BUG();
+}
+
+void __reiserfs_error(struct super_block *sb, const char *id,
+		      const char *function, const char *fmt, ...)
+{
+	do_reiserfs_warning(fmt);
+
+	BUG_ON(sb == NULL);
+
+	if (reiserfs_error_panic(sb))
+		__reiserfs_panic(sb, id, function, error_buf);
+
+	if (id && id[0])
+		printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n",
+		       sb->s_id, id, function, error_buf);
+	else
+		printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n",
+		       sb->s_id, function, error_buf);
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	reiserfs_info(sb, "Remounting filesystem read-only\n");
+	sb->s_flags |= MS_RDONLY;
+	reiserfs_abort_journal(sb, -EIO);
+}
+
+void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
+{
+	do_reiserfs_warning(fmt);
+
+	if (reiserfs_error_panic(sb)) {
+		panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id,
+		      error_buf);
+	}
+
+	if (reiserfs_is_journal_aborted(SB_JOURNAL(sb)))
+		return;
+
+	printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
+	       error_buf);
+
+	sb->s_flags |= MS_RDONLY;
+	reiserfs_abort_journal(sb, errno);
+}
+
+/*
+ * this prints internal nodes (4 keys/items in line) (dc_number,
+ * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
+ * dc_size)...
+ */
+static int print_internal(struct buffer_head *bh, int first, int last)
+{
+	struct reiserfs_key *key;
+	struct disk_child *dc;
+	int i;
+	int from, to;
+
+	if (!B_IS_KEYS_LEVEL(bh))
+		return 1;
+
+	check_internal(bh);
+
+	if (first == -1) {
+		from = 0;
+		to = B_NR_ITEMS(bh);
+	} else {
+		from = first;
+		to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh);
+	}
+
+	reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
+
+	dc = B_N_CHILD(bh, from);
+	reiserfs_printk("PTR %d: %y ", from, dc);
+
+	for (i = from, key = internal_key(bh, from), dc++; i < to;
+	     i++, key++, dc++) {
+		reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
+		if (i && i % 4 == 0)
+			printk("\n");
+	}
+	printk("\n");
+	return 0;
+}
+
+static int print_leaf(struct buffer_head *bh, int print_mode, int first,
+		      int last)
+{
+	struct block_head *blkh;
+	struct item_head *ih;
+	int i, nr;
+	int from, to;
+
+	if (!B_IS_ITEMS_LEVEL(bh))
+		return 1;
+
+	check_leaf(bh);
+
+	blkh = B_BLK_HEAD(bh);
+	ih = item_head(bh, 0);
+	nr = blkh_nr_item(blkh);
+
+	printk
+	    ("\n===================================================================\n");
+	reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
+
+	if (!(print_mode & PRINT_LEAF_ITEMS)) {
+		reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
+				&(ih->ih_key), &((ih + nr - 1)->ih_key));
+		return 0;
+	}
+
+	if (first < 0 || first > nr - 1)
+		from = 0;
+	else
+		from = first;
+
+	if (last < 0 || last > nr)
+		to = nr;
+	else
+		to = last;
+
+	ih += from;
+	printk
+	    ("-------------------------------------------------------------------------------\n");
+	printk
+	    ("|##|   type    |           key           | ilen | free_space | version | loc  |\n");
+	for (i = from; i < to; i++, ih++) {
+		printk
+		    ("-------------------------------------------------------------------------------\n");
+		reiserfs_printk("|%2d| %h |\n", i, ih);
+		if (print_mode & PRINT_LEAF_ITEMS)
+			op_print_item(ih, ih_item_body(bh, ih));
+	}
+
+	printk
+	    ("===================================================================\n");
+
+	return 0;
+}
+
+char *reiserfs_hashname(int code)
+{
+	if (code == YURA_HASH)
+		return "rupasov";
+	if (code == TEA_HASH)
+		return "tea";
+	if (code == R5_HASH)
+		return "r5";
+
+	return "unknown";
+}
+
+/* return 1 if this is not super block */
+static int print_super_block(struct buffer_head *bh)
+{
+	struct reiserfs_super_block *rs =
+	    (struct reiserfs_super_block *)(bh->b_data);
+	int skipped, data_blocks;
+	char *version;
+	char b[BDEVNAME_SIZE];
+
+	if (is_reiserfs_3_5(rs)) {
+		version = "3.5";
+	} else if (is_reiserfs_3_6(rs)) {
+		version = "3.6";
+	} else if (is_reiserfs_jr(rs)) {
+		version = ((sb_version(rs) == REISERFS_VERSION_2) ?
+			   "3.6" : "3.5");
+	} else {
+		return 1;
+	}
+
+	printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b),
+	       (unsigned long long)bh->b_blocknr);
+	printk("Reiserfs version %s\n", version);
+	printk("Block count %u\n", sb_block_count(rs));
+	printk("Blocksize %d\n", sb_blocksize(rs));
+	printk("Free blocks %u\n", sb_free_blocks(rs));
+	/*
+	 * FIXME: this would be confusing if
+	 * someone stores reiserfs super block in some data block ;)
+//    skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
+	 */
+	skipped = bh->b_blocknr;
+	data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
+	    (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
+	     1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs);
+	printk
+	    ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n"
+	     "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs),
+	     (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
+	      sb_reserved_for_journal(rs)), data_blocks);
+	printk("Root block %u\n", sb_root_block(rs));
+	printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
+	printk("Journal dev %d\n", sb_jp_journal_dev(rs));
+	printk("Journal orig size %d\n", sb_jp_journal_size(rs));
+	printk("FS state %d\n", sb_fs_state(rs));
+	printk("Hash function \"%s\"\n",
+	       reiserfs_hashname(sb_hash_function_code(rs)));
+
+	printk("Tree height %d\n", sb_tree_height(rs));
+	return 0;
+}
+
+static int print_desc_block(struct buffer_head *bh)
+{
+	struct reiserfs_journal_desc *desc;
+
+	if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8))
+		return 1;
+
+	desc = (struct reiserfs_journal_desc *)(bh->b_data);
+	printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
+	       (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc),
+	       get_desc_mount_id(desc), get_desc_trans_len(desc));
+
+	return 0;
+}
+/* ..., int print_mode, int first, int last) */
+void print_block(struct buffer_head *bh, ...)
+{
+	va_list args;
+	int mode, first, last;
+
+	if (!bh) {
+		printk("print_block: buffer is NULL\n");
+		return;
+	}
+
+	va_start(args, bh);
+
+	mode = va_arg(args, int);
+	first = va_arg(args, int);
+	last = va_arg(args, int);
+	if (print_leaf(bh, mode, first, last))
+		if (print_internal(bh, first, last))
+			if (print_super_block(bh))
+				if (print_desc_block(bh))
+					printk
+					    ("Block %llu contains unformatted data\n",
+					     (unsigned long long)bh->b_blocknr);
+
+	va_end(args);
+}
+
+static char print_tb_buf[2048];
+
+/* this stores initial state of tree balance in the print_tb_buf */
+void store_print_tb(struct tree_balance *tb)
+{
+	int h = 0;
+	int i;
+	struct buffer_head *tbSh, *tbFh;
+
+	if (!tb)
+		return;
+
+	sprintf(print_tb_buf, "\n"
+		"BALANCING %d\n"
+		"MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n"
+		"=====================================================================\n"
+		"* h *    S    *    L    *    R    *   F   *   FL  *   FR  *  CFL  *  CFR  *\n",
+		REISERFS_SB(tb->tb_sb)->s_do_balance,
+		tb->tb_mode, PATH_LAST_POSITION(tb->tb_path),
+		tb->tb_path->pos_in_item);
+
+	for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) {
+		if (PATH_H_PATH_OFFSET(tb->tb_path, h) <=
+		    tb->tb_path->path_length
+		    && PATH_H_PATH_OFFSET(tb->tb_path,
+					  h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
+			tbSh = PATH_H_PBUFFER(tb->tb_path, h);
+			tbFh = PATH_H_PPARENT(tb->tb_path, h);
+		} else {
+			tbSh = NULL;
+			tbFh = NULL;
+		}
+		sprintf(print_tb_buf + strlen(print_tb_buf),
+			"* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
+			h,
+			(tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
+			(tbSh) ? atomic_read(&tbSh->b_count) : -1,
+			(tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
+			(tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1,
+			(tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
+			(tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1,
+			(tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
+			(tb->FL[h]) ? (long long)(tb->FL[h]->
+						  b_blocknr) : (-1LL),
+			(tb->FR[h]) ? (long long)(tb->FR[h]->
+						  b_blocknr) : (-1LL),
+			(tb->CFL[h]) ? (long long)(tb->CFL[h]->
+						   b_blocknr) : (-1LL),
+			(tb->CFR[h]) ? (long long)(tb->CFR[h]->
+						   b_blocknr) : (-1LL));
+	}
+
+	sprintf(print_tb_buf + strlen(print_tb_buf),
+		"=====================================================================\n"
+		"* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
+		"* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
+		tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
+		tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0],
+		tb->sbytes[0], tb->snum[1], tb->sbytes[1],
+		tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
+
+	/* this prints balance parameters for non-leaf levels */
+	h = 0;
+	do {
+		h++;
+		sprintf(print_tb_buf + strlen(print_tb_buf),
+			"* %d * %4d * %2d *    * %2d *    * %2d *\n",
+			h, tb->insert_size[h], tb->lnum[h], tb->rnum[h],
+			tb->blknum[h]);
+	} while (tb->insert_size[h]);
+
+	sprintf(print_tb_buf + strlen(print_tb_buf),
+		"=====================================================================\n"
+		"FEB list: ");
+
+	/* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */
+	h = 0;
+	for (i = 0; i < ARRAY_SIZE(tb->FEB); i++)
+		sprintf(print_tb_buf + strlen(print_tb_buf),
+			"%p (%llu %d)%s", tb->FEB[i],
+			tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
+			b_blocknr : 0ULL,
+			tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0,
+			(i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", ");
+
+	sprintf(print_tb_buf + strlen(print_tb_buf),
+		"======================== the end ====================================\n");
+}
+
+void print_cur_tb(char *mes)
+{
+	printk("%s\n%s", mes, print_tb_buf);
+}
+
+static void check_leaf_block_head(struct buffer_head *bh)
+{
+	struct block_head *blkh;
+	int nr;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+	if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
+		reiserfs_panic(NULL, "vs-6010", "invalid item number %z",
+			       bh);
+	if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
+		reiserfs_panic(NULL, "vs-6020", "invalid free space %z",
+			       bh);
+
+}
+
+static void check_internal_block_head(struct buffer_head *bh)
+{
+	struct block_head *blkh;
+
+	blkh = B_BLK_HEAD(bh);
+	if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
+		reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
+
+	if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
+		reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh);
+
+	if (B_FREE_SPACE(bh) !=
+	    bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
+	    DC_SIZE * (B_NR_ITEMS(bh) + 1))
+		reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh);
+
+}
+
+void check_leaf(struct buffer_head *bh)
+{
+	int i;
+	struct item_head *ih;
+
+	if (!bh)
+		return;
+	check_leaf_block_head(bh);
+	for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
+		op_check_item(ih, ih_item_body(bh, ih));
+}
+
+void check_internal(struct buffer_head *bh)
+{
+	if (!bh)
+		return;
+	check_internal_block_head(bh);
+}
+
+void print_statistics(struct super_block *s)
+{
+
+	/*
+	   printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
+	   bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
+	   REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
+	   REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
+	   REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
+	 */
+
+}
diff --git a/kernel/fs/reiserfs/procfs.c b/kernel/fs/reiserfs/procfs.c
new file mode 100644
index 000000000..621b9f381
--- /dev/null
+++ b/kernel/fs/reiserfs/procfs.c
@@ -0,0 +1,508 @@
+/* -*- linux-c -*- */
+
+/* fs/reiserfs/procfs.c */
+
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+/* proc info support a la one created by Sizif@Botik.RU for PGC */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include "reiserfs.h"
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+
+/*
+ * LOCKING:
+ *
+ * These guys are evicted from procfs as the very first step in ->kill_sb().
+ *
+ */
+
+static int show_version(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	char *format;
+
+	if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
+		format = "3.6";
+	} else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) {
+		format = "3.5";
+	} else {
+		format = "unknown";
+	}
+
+	seq_printf(m, "%s format\twith checks %s\n", format,
+#if defined( CONFIG_REISERFS_CHECK )
+		   "on"
+#else
+		   "off"
+#endif
+	    );
+	return 0;
+}
+
+#define SF( x ) ( r -> x )
+#define SFP( x ) SF( s_proc_info_data.x )
+#define SFPL( x ) SFP( x[ level ] )
+#define SFPF( x ) SFP( scan_bitmap.x )
+#define SFPJ( x ) SFP( journal.x )
+
+#define D2C( x ) le16_to_cpu( x )
+#define D4C( x ) le32_to_cpu( x )
+#define DF( x ) D2C( rs -> s_v1.x )
+#define DFL( x ) D4C( rs -> s_v1.x )
+
+#define objectid_map( s, rs ) (old_format_only (s) ?				\
+                         (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) :	\
+			 (__le32 *)(rs + 1))
+#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
+
+#define DJF( x ) le32_to_cpu( rs -> x )
+#define DJV( x ) le32_to_cpu( s_v1 -> x )
+#define DJP( x ) le32_to_cpu( jp -> x )
+#define JF( x ) ( r -> s_journal -> x )
+
+static int show_super(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *r = REISERFS_SB(sb);
+
+	seq_printf(m, "state: \t%s\n"
+		   "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
+		   "gen. counter: \t%i\n"
+		   "s_disk_reads: \t%i\n"
+		   "s_disk_writes: \t%i\n"
+		   "s_fix_nodes: \t%i\n"
+		   "s_do_balance: \t%i\n"
+		   "s_unneeded_left_neighbor: \t%i\n"
+		   "s_good_search_by_key_reada: \t%i\n"
+		   "s_bmaps: \t%i\n"
+		   "s_bmaps_without_search: \t%i\n"
+		   "s_direct2indirect: \t%i\n"
+		   "s_indirect2direct: \t%i\n"
+		   "\n"
+		   "max_hash_collisions: \t%i\n"
+		   "breads: \t%lu\n"
+		   "bread_misses: \t%lu\n"
+		   "search_by_key: \t%lu\n"
+		   "search_by_key_fs_changed: \t%lu\n"
+		   "search_by_key_restarted: \t%lu\n"
+		   "insert_item_restarted: \t%lu\n"
+		   "paste_into_item_restarted: \t%lu\n"
+		   "cut_from_item_restarted: \t%lu\n"
+		   "delete_solid_item_restarted: \t%lu\n"
+		   "delete_item_restarted: \t%lu\n"
+		   "leaked_oid: \t%lu\n"
+		   "leaves_removable: \t%lu\n",
+		   SF(s_mount_state) == REISERFS_VALID_FS ?
+		   "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
+		   reiserfs_r5_hash(sb) ? "FORCE_R5 " : "",
+		   reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "",
+		   reiserfs_tea_hash(sb) ? "FORCE_TEA " : "",
+		   reiserfs_hash_detect(sb) ? "DETECT_HASH " : "",
+		   reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ",
+		   reiserfs_no_unhashed_relocation(sb) ?
+		   "NO_UNHASHED_RELOCATION " : "",
+		   reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "",
+		   reiserfs_test4(sb) ? "TEST4 " : "",
+		   have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ?
+		   "SMALL_TAILS " : "NO_TAILS ",
+		   replay_only(sb) ? "REPLAY_ONLY " : "",
+		   convert_reiserfs(sb) ? "CONV " : "",
+		   atomic_read(&r->s_generation_counter),
+		   SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
+		   SF(s_do_balance), SF(s_unneeded_left_neighbor),
+		   SF(s_good_search_by_key_reada), SF(s_bmaps),
+		   SF(s_bmaps_without_search), SF(s_direct2indirect),
+		   SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads),
+		   SFP(bread_miss), SFP(search_by_key),
+		   SFP(search_by_key_fs_changed), SFP(search_by_key_restarted),
+		   SFP(insert_item_restarted), SFP(paste_into_item_restarted),
+		   SFP(cut_from_item_restarted),
+		   SFP(delete_solid_item_restarted), SFP(delete_item_restarted),
+		   SFP(leaked_oid), SFP(leaves_removable));
+
+	return 0;
+}
+
+static int show_per_level(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *r = REISERFS_SB(sb);
+	int level;
+
+	seq_printf(m, "level\t"
+		   "     balances"
+		   " [sbk:  reads"
+		   "   fs_changed"
+		   "   restarted]"
+		   "   free space"
+		   "        items"
+		   "   can_remove"
+		   "         lnum"
+		   "         rnum"
+		   "       lbytes"
+		   "       rbytes"
+		   "     get_neig"
+		   " get_neig_res" "  need_l_neig" "  need_r_neig" "\n");
+
+	for (level = 0; level < MAX_HEIGHT; ++level) {
+		seq_printf(m, "%i\t"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12li"
+			   " %12li"
+			   " %12li"
+			   " %12li"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   "\n",
+			   level,
+			   SFPL(balance_at),
+			   SFPL(sbk_read_at),
+			   SFPL(sbk_fs_changed),
+			   SFPL(sbk_restarted),
+			   SFPL(free_at),
+			   SFPL(items_at),
+			   SFPL(can_node_be_removed),
+			   SFPL(lnum),
+			   SFPL(rnum),
+			   SFPL(lbytes),
+			   SFPL(rbytes),
+			   SFPL(get_neighbors),
+			   SFPL(get_neighbors_restart),
+			   SFPL(need_l_neighbor), SFPL(need_r_neighbor)
+		    );
+	}
+	return 0;
+}
+
+static int show_bitmap(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *r = REISERFS_SB(sb);
+
+	seq_printf(m, "free_block: %lu\n"
+		   "  scan_bitmap:"
+		   "          wait"
+		   "          bmap"
+		   "         retry"
+		   "        stolen"
+		   "  journal_hint"
+		   "journal_nohint"
+		   "\n"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   "\n",
+		   SFP(free_block),
+		   SFPF(call),
+		   SFPF(wait),
+		   SFPF(bmap),
+		   SFPF(retry),
+		   SFPF(stolen),
+		   SFPF(in_journal_hint), SFPF(in_journal_nohint));
+
+	return 0;
+}
+
+static int show_on_disk_super(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
+	struct reiserfs_super_block *rs = sb_info->s_rs;
+	int hash_code = DFL(s_hash_function_code);
+	__u32 flags = DJF(s_flags);
+
+	seq_printf(m, "block_count: \t%i\n"
+		   "free_blocks: \t%i\n"
+		   "root_block: \t%i\n"
+		   "blocksize: \t%i\n"
+		   "oid_maxsize: \t%i\n"
+		   "oid_cursize: \t%i\n"
+		   "umount_state: \t%i\n"
+		   "magic: \t%10.10s\n"
+		   "fs_state: \t%i\n"
+		   "hash: \t%s\n"
+		   "tree_height: \t%i\n"
+		   "bmap_nr: \t%i\n"
+		   "version: \t%i\n"
+		   "flags: \t%x[%s]\n"
+		   "reserved_for_journal: \t%i\n",
+		   DFL(s_block_count),
+		   DFL(s_free_blocks),
+		   DFL(s_root_block),
+		   DF(s_blocksize),
+		   DF(s_oid_maxsize),
+		   DF(s_oid_cursize),
+		   DF(s_umount_state),
+		   rs->s_v1.s_magic,
+		   DF(s_fs_state),
+		   hash_code == TEA_HASH ? "tea" :
+		   (hash_code == YURA_HASH) ? "rupasov" :
+		   (hash_code == R5_HASH) ? "r5" :
+		   (hash_code == UNSET_HASH) ? "unset" : "unknown",
+		   DF(s_tree_height),
+		   DF(s_bmap_nr),
+		   DF(s_version), flags, (flags & reiserfs_attrs_cleared)
+		   ? "attrs_cleared" : "", DF(s_reserved_for_journal));
+
+	return 0;
+}
+
+static int show_oidmap(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
+	struct reiserfs_super_block *rs = sb_info->s_rs;
+	unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
+	unsigned long total_used = 0;
+	int i;
+
+	for (i = 0; i < mapsize; ++i) {
+		__u32 right;
+
+		right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1);
+		seq_printf(m, "%s: [ %x .. %x )\n",
+			   (i & 1) ? "free" : "used", MAP(i), right);
+		if (!(i & 1)) {
+			total_used += right - MAP(i);
+		}
+	}
+#if defined( REISERFS_USE_OIDMAPF )
+	if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
+		loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
+		total_used += size / sizeof(reiserfs_oidinterval_d_t);
+	}
+#endif
+	seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
+		   mapsize,
+		   mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used);
+	return 0;
+}
+
+static int show_journal(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *r = REISERFS_SB(sb);
+	struct reiserfs_super_block *rs = r->s_rs;
+	struct journal_params *jp = &rs->s_v1.s_journal;
+	char b[BDEVNAME_SIZE];
+
+	seq_printf(m,		/* on-disk fields */
+		   "jp_journal_1st_block: \t%i\n"
+		   "jp_journal_dev: \t%s[%x]\n"
+		   "jp_journal_size: \t%i\n"
+		   "jp_journal_trans_max: \t%i\n"
+		   "jp_journal_magic: \t%i\n"
+		   "jp_journal_max_batch: \t%i\n"
+		   "jp_journal_max_commit_age: \t%i\n"
+		   "jp_journal_max_trans_age: \t%i\n"
+		   /* incore fields */
+		   "j_1st_reserved_block: \t%i\n"
+		   "j_state: \t%li\n"
+		   "j_trans_id: \t%u\n"
+		   "j_mount_id: \t%lu\n"
+		   "j_start: \t%lu\n"
+		   "j_len: \t%lu\n"
+		   "j_len_alloc: \t%lu\n"
+		   "j_wcount: \t%i\n"
+		   "j_bcount: \t%lu\n"
+		   "j_first_unflushed_offset: \t%lu\n"
+		   "j_last_flush_trans_id: \t%u\n"
+		   "j_trans_start_time: \t%li\n"
+		   "j_list_bitmap_index: \t%i\n"
+		   "j_must_wait: \t%i\n"
+		   "j_next_full_flush: \t%i\n"
+		   "j_next_async_flush: \t%i\n"
+		   "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n"
+		   /* reiserfs_proc_info_data_t.journal fields */
+		   "in_journal: \t%12lu\n"
+		   "in_journal_bitmap: \t%12lu\n"
+		   "in_journal_reusable: \t%12lu\n"
+		   "lock_journal: \t%12lu\n"
+		   "lock_journal_wait: \t%12lu\n"
+		   "journal_begin: \t%12lu\n"
+		   "journal_relock_writers: \t%12lu\n"
+		   "journal_relock_wcount: \t%12lu\n"
+		   "mark_dirty: \t%12lu\n"
+		   "mark_dirty_already: \t%12lu\n"
+		   "mark_dirty_notjournal: \t%12lu\n"
+		   "restore_prepared: \t%12lu\n"
+		   "prepare: \t%12lu\n"
+		   "prepare_retry: \t%12lu\n",
+		   DJP(jp_journal_1st_block),
+		   bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
+		   DJP(jp_journal_dev),
+		   DJP(jp_journal_size),
+		   DJP(jp_journal_trans_max),
+		   DJP(jp_journal_magic),
+		   DJP(jp_journal_max_batch),
+		   SB_JOURNAL(sb)->j_max_commit_age,
+		   DJP(jp_journal_max_trans_age),
+		   JF(j_1st_reserved_block),
+		   JF(j_state),
+		   JF(j_trans_id),
+		   JF(j_mount_id),
+		   JF(j_start),
+		   JF(j_len),
+		   JF(j_len_alloc),
+		   atomic_read(&r->s_journal->j_wcount),
+		   JF(j_bcount),
+		   JF(j_first_unflushed_offset),
+		   JF(j_last_flush_trans_id),
+		   JF(j_trans_start_time),
+		   JF(j_list_bitmap_index),
+		   JF(j_must_wait),
+		   JF(j_next_full_flush),
+		   JF(j_next_async_flush),
+		   JF(j_cnode_used),
+		   JF(j_cnode_free),
+		   SFPJ(in_journal),
+		   SFPJ(in_journal_bitmap),
+		   SFPJ(in_journal_reusable),
+		   SFPJ(lock_journal),
+		   SFPJ(lock_journal_wait),
+		   SFPJ(journal_being),
+		   SFPJ(journal_relock_writers),
+		   SFPJ(journal_relock_wcount),
+		   SFPJ(mark_dirty),
+		   SFPJ(mark_dirty_already),
+		   SFPJ(mark_dirty_notjournal),
+		   SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry)
+	    );
+	return 0;
+}
+
+static int r_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, PDE_DATA(inode), 
+				proc_get_parent_data(inode));
+}
+
+static const struct file_operations r_file_operations = {
+	.open = r_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static struct proc_dir_entry *proc_info_root = NULL;
+static const char proc_info_root_name[] = "fs/reiserfs";
+
+static void add_file(struct super_block *sb, char *name,
+		     int (*func) (struct seq_file *, void *))
+{
+	proc_create_data(name, 0, REISERFS_SB(sb)->procdir,
+			 &r_file_operations, func);
+}
+
+int reiserfs_proc_info_init(struct super_block *sb)
+{
+	char b[BDEVNAME_SIZE];
+	char *s;
+
+	/* Some block devices use /'s */
+	strlcpy(b, sb->s_id, BDEVNAME_SIZE);
+	s = strchr(b, '/');
+	if (s)
+		*s = '!';
+
+	spin_lock_init(&__PINFO(sb).lock);
+	REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
+	if (REISERFS_SB(sb)->procdir) {
+		add_file(sb, "version", show_version);
+		add_file(sb, "super", show_super);
+		add_file(sb, "per-level", show_per_level);
+		add_file(sb, "bitmap", show_bitmap);
+		add_file(sb, "on-disk-super", show_on_disk_super);
+		add_file(sb, "oidmap", show_oidmap);
+		add_file(sb, "journal", show_journal);
+		return 0;
+	}
+	reiserfs_warning(sb, "cannot create /proc/%s/%s",
+			 proc_info_root_name, b);
+	return 1;
+}
+
+int reiserfs_proc_info_done(struct super_block *sb)
+{
+	struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
+	if (de) {
+		char b[BDEVNAME_SIZE];
+		char *s;
+
+		/* Some block devices use /'s */
+		strlcpy(b, sb->s_id, BDEVNAME_SIZE);
+		s = strchr(b, '/');
+		if (s)
+			*s = '!';
+
+		remove_proc_subtree(b, proc_info_root);
+		REISERFS_SB(sb)->procdir = NULL;
+	}
+	return 0;
+}
+
+int reiserfs_proc_info_global_init(void)
+{
+	if (proc_info_root == NULL) {
+		proc_info_root = proc_mkdir(proc_info_root_name, NULL);
+		if (!proc_info_root) {
+			reiserfs_warning(NULL, "cannot create /proc/%s",
+					 proc_info_root_name);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int reiserfs_proc_info_global_done(void)
+{
+	if (proc_info_root != NULL) {
+		proc_info_root = NULL;
+		remove_proc_entry(proc_info_root_name, NULL);
+	}
+	return 0;
+}
+/*
+ * Revision 1.1.8.2  2001/07/15 17:08:42  god
+ *  . use get_super() in procfs.c
+ *  . remove remove_save_link() from reiserfs_do_truncate()
+ *
+ * I accept terms and conditions stated in the Legal Agreement
+ * (available at http://www.namesys.com/legalese.html)
+ *
+ * Revision 1.1.8.1  2001/07/11 16:48:50  god
+ * proc info support
+ *
+ * I accept terms and conditions stated in the Legal Agreement
+ * (available at http://www.namesys.com/legalese.html)
+ *
+ */
+
+/*
+ * Make Linus happy.
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * End:
+ */
diff --git a/kernel/fs/reiserfs/reiserfs.h b/kernel/fs/reiserfs/reiserfs.h
new file mode 100644
index 000000000..2adcde137
--- /dev/null
+++ b/kernel/fs/reiserfs/reiserfs.h
@@ -0,0 +1,3411 @@
+/*
+ * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
+ * licensing and copyright details
+ */
+
+#include <linux/reiserfs_fs.h>
+
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <linux/workqueue.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
+#include <linux/proc_fs.h>
+#include <linux/buffer_head.h>
+
+/* the 32 bit compat definitions with int argument */
+#define REISERFS_IOC32_UNPACK		_IOW(0xCD, 1, int)
+#define REISERFS_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
+#define REISERFS_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
+#define REISERFS_IOC32_GETVERSION	FS_IOC32_GETVERSION
+#define REISERFS_IOC32_SETVERSION	FS_IOC32_SETVERSION
+
+struct reiserfs_journal_list;
+
+/* bitmasks for i_flags field in reiserfs-specific part of inode */
+typedef enum {
+	/*
+	 * this says what format of key do all items (but stat data) of
+	 * an object have.  If this is set, that format is 3.6 otherwise - 3.5
+	 */
+	i_item_key_version_mask = 0x0001,
+
+	/*
+	 * If this is unset, object has 3.5 stat data, otherwise,
+	 * it has 3.6 stat data with 64bit size, 32bit nlink etc.
+	 */
+	i_stat_data_version_mask = 0x0002,
+
+	/* file might need tail packing on close */
+	i_pack_on_close_mask = 0x0004,
+
+	/* don't pack tail of file */
+	i_nopack_mask = 0x0008,
+
+	/*
+	 * If either of these are set, "safe link" was created for this
+	 * file during truncate or unlink. Safe link is used to avoid
+	 * leakage of disk space on crash with some files open, but unlinked.
+	 */
+	i_link_saved_unlink_mask = 0x0010,
+	i_link_saved_truncate_mask = 0x0020,
+
+	i_has_xattr_dir = 0x0040,
+	i_data_log = 0x0080,
+} reiserfs_inode_flags;
+
+struct reiserfs_inode_info {
+	__u32 i_key[4];		/* key is still 4 32 bit integers */
+
+	/*
+	 * transient inode flags that are never stored on disk. Bitmasks
+	 * for this field are defined above.
+	 */
+	__u32 i_flags;
+
+	/* offset of first byte stored in direct item. */
+	__u32 i_first_direct_byte;
+
+	/* copy of persistent inode flags read from sd_attrs. */
+	__u32 i_attrs;
+
+	/* first unused block of a sequence of unused blocks */
+	int i_prealloc_block;
+	int i_prealloc_count;	/* length of that sequence */
+
+	/* per-transaction list of inodes which  have preallocated blocks */
+	struct list_head i_prealloc_list;
+
+	/*
+	 * new_packing_locality is created; new blocks for the contents
+	 * of this directory should be displaced
+	 */
+	unsigned new_packing_locality:1;
+
+	/*
+	 * we use these for fsync or O_SYNC to decide which transaction
+	 * needs to be committed in order for this inode to be properly
+	 * flushed
+	 */
+	unsigned int i_trans_id;
+
+	struct reiserfs_journal_list *i_jl;
+	atomic_t openers;
+	struct mutex tailpack;
+#ifdef CONFIG_REISERFS_FS_XATTR
+	struct rw_semaphore i_xattr_sem;
+#endif
+#ifdef CONFIG_QUOTA
+	struct dquot *i_dquot[MAXQUOTAS];
+#endif
+
+	struct inode vfs_inode;
+};
+
+typedef enum {
+	reiserfs_attrs_cleared = 0x00000001,
+} reiserfs_super_block_flags;
+
+/*
+ * struct reiserfs_super_block accessors/mutators since this is a disk
+ * structure, it will always be in little endian format.
+ */
+#define sb_block_count(sbp)         (le32_to_cpu((sbp)->s_v1.s_block_count))
+#define set_sb_block_count(sbp,v)   ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
+#define sb_free_blocks(sbp)         (le32_to_cpu((sbp)->s_v1.s_free_blocks))
+#define set_sb_free_blocks(sbp,v)   ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v))
+#define sb_root_block(sbp)          (le32_to_cpu((sbp)->s_v1.s_root_block))
+#define set_sb_root_block(sbp,v)    ((sbp)->s_v1.s_root_block = cpu_to_le32(v))
+
+#define sb_jp_journal_1st_block(sbp)  \
+              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block))
+#define set_sb_jp_journal_1st_block(sbp,v) \
+              ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v))
+#define sb_jp_journal_dev(sbp) \
+              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev))
+#define set_sb_jp_journal_dev(sbp,v) \
+              ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v))
+#define sb_jp_journal_size(sbp) \
+              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size))
+#define set_sb_jp_journal_size(sbp,v) \
+              ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v))
+#define sb_jp_journal_trans_max(sbp) \
+              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max))
+#define set_sb_jp_journal_trans_max(sbp,v) \
+              ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v))
+#define sb_jp_journal_magic(sbp) \
+              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic))
+#define set_sb_jp_journal_magic(sbp,v) \
+              ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v))
+#define sb_jp_journal_max_batch(sbp) \
+              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch))
+#define set_sb_jp_journal_max_batch(sbp,v) \
+              ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v))
+#define sb_jp_jourmal_max_commit_age(sbp) \
+              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age))
+#define set_sb_jp_journal_max_commit_age(sbp,v) \
+              ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v))
+
+#define sb_blocksize(sbp)          (le16_to_cpu((sbp)->s_v1.s_blocksize))
+#define set_sb_blocksize(sbp,v)    ((sbp)->s_v1.s_blocksize = cpu_to_le16(v))
+#define sb_oid_maxsize(sbp)        (le16_to_cpu((sbp)->s_v1.s_oid_maxsize))
+#define set_sb_oid_maxsize(sbp,v)  ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v))
+#define sb_oid_cursize(sbp)        (le16_to_cpu((sbp)->s_v1.s_oid_cursize))
+#define set_sb_oid_cursize(sbp,v)  ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v))
+#define sb_umount_state(sbp)       (le16_to_cpu((sbp)->s_v1.s_umount_state))
+#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v))
+#define sb_fs_state(sbp)           (le16_to_cpu((sbp)->s_v1.s_fs_state))
+#define set_sb_fs_state(sbp,v)     ((sbp)->s_v1.s_fs_state = cpu_to_le16(v))
+#define sb_hash_function_code(sbp) \
+              (le32_to_cpu((sbp)->s_v1.s_hash_function_code))
+#define set_sb_hash_function_code(sbp,v) \
+              ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v))
+#define sb_tree_height(sbp)        (le16_to_cpu((sbp)->s_v1.s_tree_height))
+#define set_sb_tree_height(sbp,v)  ((sbp)->s_v1.s_tree_height = cpu_to_le16(v))
+#define sb_bmap_nr(sbp)            (le16_to_cpu((sbp)->s_v1.s_bmap_nr))
+#define set_sb_bmap_nr(sbp,v)      ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v))
+#define sb_version(sbp)            (le16_to_cpu((sbp)->s_v1.s_version))
+#define set_sb_version(sbp,v)      ((sbp)->s_v1.s_version = cpu_to_le16(v))
+
+#define sb_mnt_count(sbp)	   (le16_to_cpu((sbp)->s_mnt_count))
+#define set_sb_mnt_count(sbp, v)   ((sbp)->s_mnt_count = cpu_to_le16(v))
+
+#define sb_reserved_for_journal(sbp) \
+              (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal))
+#define set_sb_reserved_for_journal(sbp,v) \
+              ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v))
+
+/* LOGGING -- */
+
+/*
+ * These all interelate for performance.
+ *
+ * If the journal block count is smaller than n transactions, you lose speed.
+ * I don't know what n is yet, I'm guessing 8-16.
+ *
+ * typical transaction size depends on the application, how often fsync is
+ * called, and how many metadata blocks you dirty in a 30 second period.
+ * The more small files (<16k) you use, the larger your transactions will
+ * be.
+ *
+ * If your journal fills faster than dirty buffers get flushed to disk, it
+ * must flush them before allowing the journal to wrap, which slows things
+ * down.  If you need high speed meta data updates, the journal should be
+ * big enough to prevent wrapping before dirty meta blocks get to disk.
+ *
+ * If the batch max is smaller than the transaction max, you'll waste space
+ * at the end of the journal because journal_end sets the next transaction
+ * to start at 0 if the next transaction has any chance of wrapping.
+ *
+ * The large the batch max age, the better the speed, and the more meta
+ * data changes you'll lose after a crash.
+ */
+
+/* don't mess with these for a while */
+/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
+#define JOURNAL_BLOCK_SIZE  4096	/* BUG gotta get rid of this */
+#define JOURNAL_MAX_CNODE   1500	/* max cnodes to allocate. */
+#define JOURNAL_HASH_SIZE 8192
+
+/* number of copies of the bitmaps to have floating.  Must be >= 2 */
+#define JOURNAL_NUM_BITMAPS 5
+
+/*
+ * One of these for every block in every transaction
+ * Each one is in two hash tables.  First, a hash of the current transaction,
+ * and after journal_end, a hash of all the in memory transactions.
+ * next and prev are used by the current transaction (journal_hash).
+ * hnext and hprev are used by journal_list_hash.  If a block is in more
+ * than one transaction, the journal_list_hash links it in multiple times.
+ * This allows flush_journal_list to remove just the cnode belonging to a
+ * given transaction.
+ */
+struct reiserfs_journal_cnode {
+	struct buffer_head *bh;	/* real buffer head */
+	struct super_block *sb;	/* dev of real buffer head */
+
+	/* block number of real buffer head, == 0 when buffer on disk */
+	__u32 blocknr;
+
+	unsigned long state;
+
+	/* journal list this cnode lives in */
+	struct reiserfs_journal_list *jlist;
+
+	struct reiserfs_journal_cnode *next;	/* next in transaction list */
+	struct reiserfs_journal_cnode *prev;	/* prev in transaction list */
+	struct reiserfs_journal_cnode *hprev;	/* prev in hash list */
+	struct reiserfs_journal_cnode *hnext;	/* next in hash list */
+};
+
+struct reiserfs_bitmap_node {
+	int id;
+	char *data;
+	struct list_head list;
+};
+
+struct reiserfs_list_bitmap {
+	struct reiserfs_journal_list *journal_list;
+	struct reiserfs_bitmap_node **bitmaps;
+};
+
+/*
+ * one of these for each transaction.  The most important part here is the
+ * j_realblock.  this list of cnodes is used to hash all the blocks in all
+ * the commits, to mark all the real buffer heads dirty once all the commits
+ * hit the disk, and to make sure every real block in a transaction is on
+ * disk before allowing the log area to be overwritten
+ */
+struct reiserfs_journal_list {
+	unsigned long j_start;
+	unsigned long j_state;
+	unsigned long j_len;
+	atomic_t j_nonzerolen;
+	atomic_t j_commit_left;
+
+	/* all commits older than this on disk */
+	atomic_t j_older_commits_done;
+
+	struct mutex j_commit_mutex;
+	unsigned int j_trans_id;
+	time_t j_timestamp;
+	struct reiserfs_list_bitmap *j_list_bitmap;
+	struct buffer_head *j_commit_bh;	/* commit buffer head */
+	struct reiserfs_journal_cnode *j_realblock;
+	struct reiserfs_journal_cnode *j_freedlist;	/* list of buffers that were freed during this trans.  free each of these on flush */
+	/* time ordered list of all active transactions */
+	struct list_head j_list;
+
+	/*
+	 * time ordered list of all transactions we haven't tried
+	 * to flush yet
+	 */
+	struct list_head j_working_list;
+
+	/* list of tail conversion targets in need of flush before commit */
+	struct list_head j_tail_bh_list;
+
+	/* list of data=ordered buffers in need of flush before commit */
+	struct list_head j_bh_list;
+	int j_refcount;
+};
+
+struct reiserfs_journal {
+	struct buffer_head **j_ap_blocks;	/* journal blocks on disk */
+	/* newest journal block */
+	struct reiserfs_journal_cnode *j_last;
+
+	/* oldest journal block.  start here for traverse */
+	struct reiserfs_journal_cnode *j_first;
+
+	struct block_device *j_dev_bd;
+	fmode_t j_dev_mode;
+
+	/* first block on s_dev of reserved area journal */
+	int j_1st_reserved_block;
+
+	unsigned long j_state;
+	unsigned int j_trans_id;
+	unsigned long j_mount_id;
+
+	/* start of current waiting commit (index into j_ap_blocks) */
+	unsigned long j_start;
+	unsigned long j_len;	/* length of current waiting commit */
+
+	/* number of buffers requested by journal_begin() */
+	unsigned long j_len_alloc;
+
+	atomic_t j_wcount;	/* count of writers for current commit */
+
+	/* batch count. allows turning X transactions into 1 */
+	unsigned long j_bcount;
+
+	/* first unflushed transactions offset */
+	unsigned long j_first_unflushed_offset;
+
+	/* last fully flushed journal timestamp */
+	unsigned j_last_flush_trans_id;
+
+	struct buffer_head *j_header_bh;
+
+	time_t j_trans_start_time;	/* time this transaction started */
+	struct mutex j_mutex;
+	struct mutex j_flush_mutex;
+
+	/* wait for current transaction to finish before starting new one */
+	wait_queue_head_t j_join_wait;
+
+	atomic_t j_jlock;		/* lock for j_join_wait */
+	int j_list_bitmap_index;	/* number of next list bitmap to use */
+
+	/* no more journal begins allowed. MUST sleep on j_join_wait */
+	int j_must_wait;
+
+	/* next journal_end will flush all journal list */
+	int j_next_full_flush;
+
+	/* next journal_end will flush all async commits */
+	int j_next_async_flush;
+
+	int j_cnode_used;	/* number of cnodes on the used list */
+	int j_cnode_free;	/* number of cnodes on the free list */
+
+	/* max number of blocks in a transaction.  */
+	unsigned int j_trans_max;
+
+	/* max number of blocks to batch into a trans */
+	unsigned int j_max_batch;
+
+	/* in seconds, how old can an async commit be */
+	unsigned int j_max_commit_age;
+
+	/* in seconds, how old can a transaction be */
+	unsigned int j_max_trans_age;
+
+	/* the default for the max commit age */
+	unsigned int j_default_max_commit_age;
+
+	struct reiserfs_journal_cnode *j_cnode_free_list;
+
+	/* orig pointer returned from vmalloc */
+	struct reiserfs_journal_cnode *j_cnode_free_orig;
+
+	struct reiserfs_journal_list *j_current_jl;
+	int j_free_bitmap_nodes;
+	int j_used_bitmap_nodes;
+
+	int j_num_lists;	/* total number of active transactions */
+	int j_num_work_lists;	/* number that need attention from kreiserfsd */
+
+	/* debugging to make sure things are flushed in order */
+	unsigned int j_last_flush_id;
+
+	/* debugging to make sure things are committed in order */
+	unsigned int j_last_commit_id;
+
+	struct list_head j_bitmap_nodes;
+	struct list_head j_dirty_buffers;
+	spinlock_t j_dirty_buffers_lock;	/* protects j_dirty_buffers */
+
+	/* list of all active transactions */
+	struct list_head j_journal_list;
+
+	/* lists that haven't been touched by writeback attempts */
+	struct list_head j_working_list;
+
+	/* hash table for real buffer heads in current trans */
+	struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];
+
+	/* hash table for all the real buffer heads in all the transactions */
+	struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];
+
+	/* array of bitmaps to record the deleted blocks */
+	struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];
+
+	/* list of inodes which have preallocated blocks */
+	struct list_head j_prealloc_list;
+	int j_persistent_trans;
+	unsigned long j_max_trans_size;
+	unsigned long j_max_batch_size;
+
+	int j_errno;
+
+	/* when flushing ordered buffers, throttle new ordered writers */
+	struct delayed_work j_work;
+	struct super_block *j_work_sb;
+	atomic_t j_async_throttle;
+};
+
+enum journal_state_bits {
+	J_WRITERS_BLOCKED = 1,	/* set when new writers not allowed */
+	J_WRITERS_QUEUED,    /* set when log is full due to too many writers */
+	J_ABORTED,           /* set when log is aborted */
+};
+
+/* ick.  magic string to find desc blocks in the journal */
+#define JOURNAL_DESC_MAGIC "ReIsErLB"
+
+typedef __u32(*hashf_t) (const signed char *, int);
+
+struct reiserfs_bitmap_info {
+	__u32 free_count;
+};
+
+struct proc_dir_entry;
+
+#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
+typedef unsigned long int stat_cnt_t;
+typedef struct reiserfs_proc_info_data {
+	spinlock_t lock;
+	int exiting;
+	int max_hash_collisions;
+
+	stat_cnt_t breads;
+	stat_cnt_t bread_miss;
+	stat_cnt_t search_by_key;
+	stat_cnt_t search_by_key_fs_changed;
+	stat_cnt_t search_by_key_restarted;
+
+	stat_cnt_t insert_item_restarted;
+	stat_cnt_t paste_into_item_restarted;
+	stat_cnt_t cut_from_item_restarted;
+	stat_cnt_t delete_solid_item_restarted;
+	stat_cnt_t delete_item_restarted;
+
+	stat_cnt_t leaked_oid;
+	stat_cnt_t leaves_removable;
+
+	/*
+	 * balances per level.
+	 * Use explicit 5 as MAX_HEIGHT is not visible yet.
+	 */
+	stat_cnt_t balance_at[5];	/* XXX */
+	/* sbk == search_by_key */
+	stat_cnt_t sbk_read_at[5];	/* XXX */
+	stat_cnt_t sbk_fs_changed[5];
+	stat_cnt_t sbk_restarted[5];
+	stat_cnt_t items_at[5];	/* XXX */
+	stat_cnt_t free_at[5];	/* XXX */
+	stat_cnt_t can_node_be_removed[5];	/* XXX */
+	long int lnum[5];	/* XXX */
+	long int rnum[5];	/* XXX */
+	long int lbytes[5];	/* XXX */
+	long int rbytes[5];	/* XXX */
+	stat_cnt_t get_neighbors[5];
+	stat_cnt_t get_neighbors_restart[5];
+	stat_cnt_t need_l_neighbor[5];
+	stat_cnt_t need_r_neighbor[5];
+
+	stat_cnt_t free_block;
+	struct __scan_bitmap_stats {
+		stat_cnt_t call;
+		stat_cnt_t wait;
+		stat_cnt_t bmap;
+		stat_cnt_t retry;
+		stat_cnt_t in_journal_hint;
+		stat_cnt_t in_journal_nohint;
+		stat_cnt_t stolen;
+	} scan_bitmap;
+	struct __journal_stats {
+		stat_cnt_t in_journal;
+		stat_cnt_t in_journal_bitmap;
+		stat_cnt_t in_journal_reusable;
+		stat_cnt_t lock_journal;
+		stat_cnt_t lock_journal_wait;
+		stat_cnt_t journal_being;
+		stat_cnt_t journal_relock_writers;
+		stat_cnt_t journal_relock_wcount;
+		stat_cnt_t mark_dirty;
+		stat_cnt_t mark_dirty_already;
+		stat_cnt_t mark_dirty_notjournal;
+		stat_cnt_t restore_prepared;
+		stat_cnt_t prepare;
+		stat_cnt_t prepare_retry;
+	} journal;
+} reiserfs_proc_info_data_t;
+#else
+typedef struct reiserfs_proc_info_data {
+} reiserfs_proc_info_data_t;
+#endif
+
+/* Number of quota types we support */
+#define REISERFS_MAXQUOTAS 2
+
+/* reiserfs union of in-core super block data */
+struct reiserfs_sb_info {
+	/* Buffer containing the super block */
+	struct buffer_head *s_sbh;
+
+	/* Pointer to the on-disk super block in the buffer */
+	struct reiserfs_super_block *s_rs;
+	struct reiserfs_bitmap_info *s_ap_bitmap;
+
+	/* pointer to journal information */
+	struct reiserfs_journal *s_journal;
+
+	unsigned short s_mount_state;	/* reiserfs state (valid, invalid) */
+
+	/* Serialize writers access, replace the old bkl */
+	struct mutex lock;
+
+	/* Owner of the lock (can be recursive) */
+	struct task_struct *lock_owner;
+
+	/* Depth of the lock, start from -1 like the bkl */
+	int lock_depth;
+
+	struct workqueue_struct *commit_wq;
+
+	/* Comment? -Hans */
+	void (*end_io_handler) (struct buffer_head *, int);
+
+	/*
+	 * pointer to function which is used to sort names in directory.
+	 * Set on mount
+	 */
+	hashf_t s_hash_function;
+
+	/* reiserfs's mount options are set here */
+	unsigned long s_mount_opt;
+
+	/* This is a structure that describes block allocator options */
+	struct {
+		/* Bitfield for enable/disable kind of options */
+		unsigned long bits;
+
+		/*
+		 * size started from which we consider file
+		 * to be a large one (in blocks)
+		 */
+		unsigned long large_file_size;
+
+		int border;	/* percentage of disk, border takes */
+
+		/*
+		 * Minimal file size (in blocks) starting
+		 * from which we do preallocations
+		 */
+		int preallocmin;
+
+		/*
+		 * Number of blocks we try to prealloc when file
+		 * reaches preallocmin size (in blocks) or prealloc_list
+		 is empty.
+		 */
+		int preallocsize;
+	} s_alloc_options;
+
+	/* Comment? -Hans */
+	wait_queue_head_t s_wait;
+	/* increased by one every time the  tree gets re-balanced */
+	atomic_t s_generation_counter;
+
+	/* File system properties. Currently holds on-disk FS format */
+	unsigned long s_properties;
+
+	/* session statistics */
+	int s_disk_reads;
+	int s_disk_writes;
+	int s_fix_nodes;
+	int s_do_balance;
+	int s_unneeded_left_neighbor;
+	int s_good_search_by_key_reada;
+	int s_bmaps;
+	int s_bmaps_without_search;
+	int s_direct2indirect;
+	int s_indirect2direct;
+
+	/*
+	 * set up when it's ok for reiserfs_read_inode2() to read from
+	 * disk inode with nlink==0. Currently this is only used during
+	 * finish_unfinished() processing at mount time
+	 */
+	int s_is_unlinked_ok;
+
+	reiserfs_proc_info_data_t s_proc_info_data;
+	struct proc_dir_entry *procdir;
+
+	/* amount of blocks reserved for further allocations */
+	int reserved_blocks;
+
+
+	/* this lock on now only used to protect reserved_blocks variable */
+	spinlock_t bitmap_lock;
+	struct dentry *priv_root;	/* root of /.reiserfs_priv */
+	struct dentry *xattr_root;	/* root of /.reiserfs_priv/xattrs */
+	int j_errno;
+
+	int work_queued;              /* non-zero delayed work is queued */
+	struct delayed_work old_work; /* old transactions flush delayed work */
+	spinlock_t old_work_lock;     /* protects old_work and work_queued */
+
+#ifdef CONFIG_QUOTA
+	char *s_qf_names[REISERFS_MAXQUOTAS];
+	int s_jquota_fmt;
+#endif
+	char *s_jdev;		/* Stored jdev for mount option showing */
+#ifdef CONFIG_REISERFS_CHECK
+
+	/*
+	 * Detects whether more than one copy of tb exists per superblock
+	 * as a means of checking whether do_balance is executing
+	 * concurrently against another tree reader/writer on a same
+	 * mount point.
+	 */
+	struct tree_balance *cur_tb;
+#endif
+};
+
+/* Definitions of reiserfs on-disk properties: */
+#define REISERFS_3_5 0
+#define REISERFS_3_6 1
+#define REISERFS_OLD_FORMAT 2
+
+/* Mount options */
+enum reiserfs_mount_options {
+	/* large tails will be created in a session */
+	REISERFS_LARGETAIL,
+	/*
+	 * small (for files less than block size) tails will
+	 * be created in a session
+	 */
+	REISERFS_SMALLTAIL,
+
+	/* replay journal and return 0. Use by fsck */
+	REPLAYONLY,
+
+	/*
+	 * -o conv: causes conversion of old format super block to the
+	 * new format. If not specified - old partition will be dealt
+	 * with in a manner of 3.5.x
+	 */
+	REISERFS_CONVERT,
+
+	/*
+	 * -o hash={tea, rupasov, r5, detect} is meant for properly mounting
+	 * reiserfs disks from 3.5.19 or earlier.  99% of the time, this
+	 * option is not required.  If the normal autodection code can't
+	 * determine which hash to use (because both hashes had the same
+	 * value for a file) use this option to force a specific hash.
+	 * It won't allow you to override the existing hash on the FS, so
+	 * if you have a tea hash disk, and mount with -o hash=rupasov,
+	 * the mount will fail.
+	 */
+	FORCE_TEA_HASH,		/* try to force tea hash on mount */
+	FORCE_RUPASOV_HASH,	/* try to force rupasov hash on mount */
+	FORCE_R5_HASH,		/* try to force rupasov hash on mount */
+	FORCE_HASH_DETECT,	/* try to detect hash function on mount */
+
+	REISERFS_DATA_LOG,
+	REISERFS_DATA_ORDERED,
+	REISERFS_DATA_WRITEBACK,
+
+	/*
+	 * used for testing experimental features, makes benchmarking new
+	 * features with and without more convenient, should never be used by
+	 * users in any code shipped to users (ideally)
+	 */
+
+	REISERFS_NO_BORDER,
+	REISERFS_NO_UNHASHED_RELOCATION,
+	REISERFS_HASHED_RELOCATION,
+	REISERFS_ATTRS,
+	REISERFS_XATTRS_USER,
+	REISERFS_POSIXACL,
+	REISERFS_EXPOSE_PRIVROOT,
+	REISERFS_BARRIER_NONE,
+	REISERFS_BARRIER_FLUSH,
+
+	/* Actions on error */
+	REISERFS_ERROR_PANIC,
+	REISERFS_ERROR_RO,
+	REISERFS_ERROR_CONTINUE,
+
+	REISERFS_USRQUOTA,	/* User quota option specified */
+	REISERFS_GRPQUOTA,	/* Group quota option specified */
+
+	REISERFS_TEST1,
+	REISERFS_TEST2,
+	REISERFS_TEST3,
+	REISERFS_TEST4,
+	REISERFS_UNSUPPORTED_OPT,
+};
+
+#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
+#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
+#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH))
+#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT))
+#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER))
+#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
+#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
+#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4))
+
+#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
+#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
+#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
+#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
+#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
+#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
+#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
+#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
+#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
+#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
+#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
+#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
+#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
+#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
+#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
+
+#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC))
+#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO))
+
+void reiserfs_file_buffer(struct buffer_head *bh, int list);
+extern struct file_system_type reiserfs_fs_type;
+int reiserfs_resize(struct super_block *, unsigned long);
+
+#define CARRY_ON                0
+#define SCHEDULE_OCCURRED       1
+
+#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
+#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
+#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
+#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
+#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
+
+#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->)
+
+#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
+static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
+						*journal)
+{
+	return test_bit(J_ABORTED, &journal->j_state);
+}
+
+/*
+ * Locking primitives. The write lock is a per superblock
+ * special mutex that has properties close to the Big Kernel Lock
+ * which was used in the previous locking scheme.
+ */
+void reiserfs_write_lock(struct super_block *s);
+void reiserfs_write_unlock(struct super_block *s);
+int __must_check reiserfs_write_unlock_nested(struct super_block *s);
+void reiserfs_write_lock_nested(struct super_block *s, int depth);
+
+#ifdef CONFIG_REISERFS_CHECK
+void reiserfs_lock_check_recursive(struct super_block *s);
+#else
+static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
+#endif
+
+/*
+ * Several mutexes depend on the write lock.
+ * However sometimes we want to relax the write lock while we hold
+ * these mutexes, according to the release/reacquire on schedule()
+ * properties of the Bkl that were used.
+ * Reiserfs performances and locking were based on this scheme.
+ * Now that the write lock is a mutex and not the bkl anymore, doing so
+ * may result in a deadlock:
+ *
+ * A acquire write_lock
+ * A acquire j_commit_mutex
+ * A release write_lock and wait for something
+ * B acquire write_lock
+ * B can't acquire j_commit_mutex and sleep
+ * A can't acquire write lock anymore
+ * deadlock
+ *
+ * What we do here is avoiding such deadlock by playing the same game
+ * than the Bkl: if we can't acquire a mutex that depends on the write lock,
+ * we release the write lock, wait a bit and then retry.
+ *
+ * The mutexes concerned by this hack are:
+ * - The commit mutex of a journal list
+ * - The flush mutex
+ * - The journal lock
+ * - The inode mutex
+ */
+static inline void reiserfs_mutex_lock_safe(struct mutex *m,
+					    struct super_block *s)
+{
+	int depth;
+
+	depth = reiserfs_write_unlock_nested(s);
+	mutex_lock(m);
+	reiserfs_write_lock_nested(s, depth);
+}
+
+static inline void
+reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
+				struct super_block *s)
+{
+	int depth;
+
+	depth = reiserfs_write_unlock_nested(s);
+	mutex_lock_nested(m, subclass);
+	reiserfs_write_lock_nested(s, depth);
+}
+
+static inline void
+reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
+{
+       int depth;
+       depth = reiserfs_write_unlock_nested(s);
+       down_read(sem);
+       reiserfs_write_lock_nested(s, depth);
+}
+
+/*
+ * When we schedule, we usually want to also release the write lock,
+ * according to the previous bkl based locking scheme of reiserfs.
+ */
+static inline void reiserfs_cond_resched(struct super_block *s)
+{
+	if (need_resched()) {
+		int depth;
+
+		depth = reiserfs_write_unlock_nested(s);
+		schedule();
+		reiserfs_write_lock_nested(s, depth);
+	}
+}
+
+struct fid;
+
+/*
+ * in reading the #defines, it may help to understand that they employ
+ *  the following abbreviations:
+ *
+ *  B = Buffer
+ *  I = Item header
+ *  H = Height within the tree (should be changed to LEV)
+ *  N = Number of the item in the node
+ *  STAT = stat data
+ *  DEH = Directory Entry Header
+ *  EC = Entry Count
+ *  E = Entry number
+ *  UL = Unsigned Long
+ *  BLKH = BLocK Header
+ *  UNFM = UNForMatted node
+ *  DC = Disk Child
+ *  P = Path
+ *
+ *  These #defines are named by concatenating these abbreviations,
+ *  where first comes the arguments, and last comes the return value,
+ *  of the macro.
+ */
+
+#define USE_INODE_GENERATION_COUNTER
+
+#define REISERFS_PREALLOCATE
+#define DISPLACE_NEW_PACKING_LOCALITIES
+#define PREALLOCATION_SIZE 9
+
+/* n must be power of 2 */
+#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
+
+/*
+ * to be ok for alpha and others we have to align structures to 8 byte
+ * boundary.
+ * FIXME: do not change 4 by anything else: there is code which relies on that
+ */
+#define ROUND_UP(x) _ROUND_UP(x,8LL)
+
+/*
+ * debug levels.  Right now, CONFIG_REISERFS_CHECK means print all debug
+ * messages.
+ */
+#define REISERFS_DEBUG_CODE 5	/* extra messages to help find/debug errors */
+
+void __reiserfs_warning(struct super_block *s, const char *id,
+			 const char *func, const char *fmt, ...);
+#define reiserfs_warning(s, id, fmt, args...) \
+	 __reiserfs_warning(s, id, __func__, fmt, ##args)
+/* assertions handling */
+
+/* always check a condition and panic if it's false. */
+#define __RASSERT(cond, scond, format, args...)			\
+do {									\
+	if (!(cond))							\
+		reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \
+			       __FILE__ ":%i:%s: " format "\n",		\
+			       __LINE__, __func__ , ##args);		\
+} while (0)
+
+#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args)
+
+#if defined( CONFIG_REISERFS_CHECK )
+#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args)
+#else
+#define RFALSE( cond, format, args... ) do {;} while( 0 )
+#endif
+
+#define CONSTF __attribute_const__
+/*
+ * Disk Data Structures
+ */
+
+/***************************************************************************
+ *                             SUPER BLOCK                                 *
+ ***************************************************************************/
+
+/*
+ * Structure of super block on disk, a version of which in RAM is often
+ * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger
+ * structure containing fields never written to disk.
+ */
+#define UNSET_HASH 0	/* Detect hash on disk */
+#define TEA_HASH  1
+#define YURA_HASH 2
+#define R5_HASH   3
+#define DEFAULT_HASH R5_HASH
+
+struct journal_params {
+	/* where does journal start from on its * device */
+	__le32 jp_journal_1st_block;
+
+	/* journal device st_rdev */
+	__le32 jp_journal_dev;
+
+	/* size of the journal */
+	__le32 jp_journal_size;
+
+	/* max number of blocks in a transaction. */
+	__le32 jp_journal_trans_max;
+
+	/*
+	 * random value made on fs creation
+	 * (this was sb_journal_block_count)
+	 */
+	__le32 jp_journal_magic;
+
+	/* max number of blocks to batch into a trans */
+	__le32 jp_journal_max_batch;
+
+	/* in seconds, how old can an async  commit be */
+	__le32 jp_journal_max_commit_age;
+
+	/* in seconds, how old can a transaction be */
+	__le32 jp_journal_max_trans_age;
+};
+
+/* this is the super from 3.5.X, where X >= 10 */
+struct reiserfs_super_block_v1 {
+	__le32 s_block_count;	/* blocks count         */
+	__le32 s_free_blocks;	/* free blocks count    */
+	__le32 s_root_block;	/* root block number    */
+	struct journal_params s_journal;
+	__le16 s_blocksize;	/* block size */
+
+	/* max size of object id array, see get_objectid() commentary  */
+	__le16 s_oid_maxsize;
+	__le16 s_oid_cursize;	/* current size of object id array */
+
+	/* this is set to 1 when filesystem was umounted, to 2 - when not */
+	__le16 s_umount_state;
+
+	/*
+	 * reiserfs magic string indicates that file system is reiserfs:
+	 * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs"
+	 */
+	char s_magic[10];
+
+	/*
+	 * it is set to used by fsck to mark which
+	 * phase of rebuilding is done
+	 */
+	__le16 s_fs_state;
+	/*
+	 * indicate, what hash function is being use
+	 * to sort names in a directory
+	 */
+	__le32 s_hash_function_code;
+	__le16 s_tree_height;	/* height of disk tree */
+
+	/*
+	 * amount of bitmap blocks needed to address
+	 * each block of file system
+	 */
+	__le16 s_bmap_nr;
+
+	/*
+	 * this field is only reliable on filesystem with non-standard journal
+	 */
+	__le16 s_version;
+
+	/*
+	 * size in blocks of journal area on main device, we need to
+	 * keep after making fs with non-standard journal
+	 */
+	__le16 s_reserved_for_journal;
+} __attribute__ ((__packed__));
+
+#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
+
+/* this is the on disk super block */
+struct reiserfs_super_block {
+	struct reiserfs_super_block_v1 s_v1;
+	__le32 s_inode_generation;
+
+	/* Right now used only by inode-attributes, if enabled */
+	__le32 s_flags;
+
+	unsigned char s_uuid[16];	/* filesystem unique identifier */
+	unsigned char s_label[16];	/* filesystem volume label */
+	__le16 s_mnt_count;		/* Count of mounts since last fsck */
+	__le16 s_max_mnt_count;		/* Maximum mounts before check */
+	__le32 s_lastcheck;		/* Timestamp of last fsck */
+	__le32 s_check_interval;	/* Interval between checks */
+
+	/*
+	 * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1()
+	 * so any additions must be updated there as well. */
+	char s_unused[76];
+} __attribute__ ((__packed__));
+
+#define SB_SIZE (sizeof(struct reiserfs_super_block))
+
+#define REISERFS_VERSION_1 0
+#define REISERFS_VERSION_2 2
+
+/* on-disk super block fields converted to cpu form */
+#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
+#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
+#define SB_BLOCKSIZE(s) \
+        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize))
+#define SB_BLOCK_COUNT(s) \
+        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count))
+#define SB_FREE_BLOCKS(s) \
+        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks))
+#define SB_REISERFS_MAGIC(s) \
+        (SB_V1_DISK_SUPER_BLOCK(s)->s_magic)
+#define SB_ROOT_BLOCK(s) \
+        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block))
+#define SB_TREE_HEIGHT(s) \
+        le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height))
+#define SB_REISERFS_STATE(s) \
+        le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state))
+#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version))
+#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr))
+
+#define PUT_SB_BLOCK_COUNT(s, val) \
+   do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0)
+#define PUT_SB_FREE_BLOCKS(s, val) \
+   do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0)
+#define PUT_SB_ROOT_BLOCK(s, val) \
+   do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0)
+#define PUT_SB_TREE_HEIGHT(s, val) \
+   do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0)
+#define PUT_SB_REISERFS_STATE(s, val) \
+   do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0)
+#define PUT_SB_VERSION(s, val) \
+   do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0)
+#define PUT_SB_BMAP_NR(s, val) \
+   do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0)
+
+#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal)
+#define SB_ONDISK_JOURNAL_SIZE(s) \
+         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size))
+#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \
+         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block))
+#define SB_ONDISK_JOURNAL_DEVICE(s) \
+         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev))
+#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \
+         le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal))
+
+#define is_block_in_log_or_reserved_area(s, block) \
+         block >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \
+         && block < SB_JOURNAL_1st_RESERVED_BLOCK(s) +  \
+         ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \
+         SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s)))
+
+int is_reiserfs_3_5(struct reiserfs_super_block *rs);
+int is_reiserfs_3_6(struct reiserfs_super_block *rs);
+int is_reiserfs_jr(struct reiserfs_super_block *rs);
+
+/*
+ * ReiserFS leaves the first 64k unused, so that partition labels have
+ * enough space.  If someone wants to write a fancy bootloader that
+ * needs more than 64k, let us know, and this will be increased in size.
+ * This number must be larger than than the largest block size on any
+ * platform, or code will break.  -Hans
+ */
+#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
+#define REISERFS_FIRST_BLOCK unused_define
+#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
+
+/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */
+#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024)
+
+/* reiserfs internal error code (used by search_by_key and fix_nodes)) */
+#define CARRY_ON      0
+#define REPEAT_SEARCH -1
+#define IO_ERROR      -2
+#define NO_DISK_SPACE -3
+#define NO_BALANCING_NEEDED  (-4)
+#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
+#define QUOTA_EXCEEDED -6
+
+typedef __u32 b_blocknr_t;
+typedef __le32 unp_t;
+
+struct unfm_nodeinfo {
+	unp_t unfm_nodenum;
+	unsigned short unfm_freespace;
+};
+
+/* there are two formats of keys: 3.5 and 3.6 */
+#define KEY_FORMAT_3_5 0
+#define KEY_FORMAT_3_6 1
+
+/* there are two stat datas */
+#define STAT_DATA_V1 0
+#define STAT_DATA_V2 1
+
+static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
+{
+	return container_of(inode, struct reiserfs_inode_info, vfs_inode);
+}
+
+static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/*
+ * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
+ * which overflows on large file systems.
+ */
+static inline __u32 reiserfs_bmap_count(struct super_block *sb)
+{
+	return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
+}
+
+static inline int bmap_would_wrap(unsigned bmap_nr)
+{
+	return bmap_nr > ((1LL << 16) - 1);
+}
+
+/*
+ * this says about version of key of all items (but stat data) the
+ * object consists of
+ */
+#define get_inode_item_key_version( inode )                                    \
+    ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
+
+#define set_inode_item_key_version( inode, version )                           \
+         ({ if((version)==KEY_FORMAT_3_6)                                      \
+                REISERFS_I(inode)->i_flags |= i_item_key_version_mask;      \
+            else                                                               \
+                REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; })
+
+#define get_inode_sd_version(inode)                                            \
+    ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1)
+
+#define set_inode_sd_version(inode, version)                                   \
+         ({ if((version)==STAT_DATA_V2)                                        \
+                REISERFS_I(inode)->i_flags |= i_stat_data_version_mask;     \
+            else                                                               \
+                REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
+
+/*
+ * This is an aggressive tail suppression policy, I am hoping it
+ * improves our benchmarks. The principle behind it is that percentage
+ * space saving is what matters, not absolute space saving.  This is
+ * non-intuitive, but it helps to understand it if you consider that the
+ * cost to access 4 blocks is not much more than the cost to access 1
+ * block, if you have to do a seek and rotate.  A tail risks a
+ * non-linear disk access that is significant as a percentage of total
+ * time cost for a 4 block file and saves an amount of space that is
+ * less significant as a percentage of space, or so goes the hypothesis.
+ * -Hans
+ */
+#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
+(\
+  (!(n_tail_size)) || \
+  (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \
+   ( (n_file_size) >= (n_block_size) * 4 ) || \
+   ( ( (n_file_size) >= (n_block_size) * 3 ) && \
+     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \
+   ( ( (n_file_size) >= (n_block_size) * 2 ) && \
+     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \
+   ( ( (n_file_size) >= (n_block_size) ) && \
+     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
+)
+
+/*
+ * Another strategy for tails, this one means only create a tail if all the
+ * file would fit into one DIRECT item.
+ * Primary intention for this one is to increase performance by decreasing
+ * seeking.
+*/
+#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
+(\
+  (!(n_tail_size)) || \
+  (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \
+)
+
+/*
+ * values for s_umount_state field
+ */
+#define REISERFS_VALID_FS    1
+#define REISERFS_ERROR_FS    2
+
+/*
+ * there are 5 item types currently
+ */
+#define TYPE_STAT_DATA 0
+#define TYPE_INDIRECT 1
+#define TYPE_DIRECT 2
+#define TYPE_DIRENTRY 3
+#define TYPE_MAXTYPE 3
+#define TYPE_ANY 15		/* FIXME: comment is required */
+
+/***************************************************************************
+ *                       KEY & ITEM HEAD                                   *
+ ***************************************************************************/
+
+/* * directories use this key as well as old files */
+struct offset_v1 {
+	__le32 k_offset;
+	__le32 k_uniqueness;
+} __attribute__ ((__packed__));
+
+struct offset_v2 {
+	__le64 v;
+} __attribute__ ((__packed__));
+
+static inline __u16 offset_v2_k_type(const struct offset_v2 *v2)
+{
+	__u8 type = le64_to_cpu(v2->v) >> 60;
+	return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY;
+}
+
+static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type)
+{
+	v2->v =
+	    (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60);
+}
+
+static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2)
+{
+	return le64_to_cpu(v2->v) & (~0ULL >> 4);
+}
+
+static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
+{
+	offset &= (~0ULL >> 4);
+	v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
+}
+
+/*
+ * Key of an item determines its location in the S+tree, and
+ * is composed of 4 components
+ */
+struct reiserfs_key {
+	/* packing locality: by default parent directory object id */
+	__le32 k_dir_id;
+
+	__le32 k_objectid;	/* object identifier */
+	union {
+		struct offset_v1 k_offset_v1;
+		struct offset_v2 k_offset_v2;
+	} __attribute__ ((__packed__)) u;
+} __attribute__ ((__packed__));
+
+struct in_core_key {
+	/* packing locality: by default parent directory object id */
+	__u32 k_dir_id;
+	__u32 k_objectid;	/* object identifier */
+	__u64 k_offset;
+	__u8 k_type;
+};
+
+struct cpu_key {
+	struct in_core_key on_disk_key;
+	int version;
+	/* 3 in all cases but direct2indirect and indirect2direct conversion */
+	int key_length;
+};
+
+/*
+ * Our function for comparing keys can compare keys of different
+ * lengths.  It takes as a parameter the length of the keys it is to
+ * compare.  These defines are used in determining what is to be passed
+ * to it as that parameter.
+ */
+#define REISERFS_FULL_KEY_LEN     4
+#define REISERFS_SHORT_KEY_LEN    2
+
+/* The result of the key compare */
+#define FIRST_GREATER 1
+#define SECOND_GREATER -1
+#define KEYS_IDENTICAL 0
+#define KEY_FOUND 1
+#define KEY_NOT_FOUND 0
+
+#define KEY_SIZE (sizeof(struct reiserfs_key))
+#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32))
+
+/* return values for search_by_key and clones */
+#define ITEM_FOUND 1
+#define ITEM_NOT_FOUND 0
+#define ENTRY_FOUND 1
+#define ENTRY_NOT_FOUND 0
+#define DIRECTORY_NOT_FOUND -1
+#define REGULAR_FILE_FOUND -2
+#define DIRECTORY_FOUND -3
+#define BYTE_FOUND 1
+#define BYTE_NOT_FOUND 0
+#define FILE_NOT_FOUND -1
+
+#define POSITION_FOUND 1
+#define POSITION_NOT_FOUND 0
+
+/* return values for reiserfs_find_entry and search_by_entry_key */
+#define NAME_FOUND 1
+#define NAME_NOT_FOUND 0
+#define GOTO_PREVIOUS_ITEM 2
+#define NAME_FOUND_INVISIBLE 3
+
+/*
+ * Everything in the filesystem is stored as a set of items.  The
+ * item head contains the key of the item, its free space (for
+ * indirect items) and specifies the location of the item itself
+ * within the block.
+ */
+
+struct item_head {
+	/*
+	 * Everything in the tree is found by searching for it based on
+	 * its key.
+	 */
+	struct reiserfs_key ih_key;
+	union {
+		/*
+		 * The free space in the last unformatted node of an
+		 * indirect item if this is an indirect item.  This
+		 * equals 0xFFFF iff this is a direct item or stat data
+		 * item. Note that the key, not this field, is used to
+		 * determine the item type, and thus which field this
+		 * union contains.
+		 */
+		__le16 ih_free_space_reserved;
+
+		/*
+		 * Iff this is a directory item, this field equals the
+		 * number of directory entries in the directory item.
+		 */
+		__le16 ih_entry_count;
+	} __attribute__ ((__packed__)) u;
+	__le16 ih_item_len;	/* total size of the item body */
+
+	/* an offset to the item body within the block */
+	__le16 ih_item_location;
+
+	/*
+	 * 0 for all old items, 2 for new ones. Highest bit is set by fsck
+	 * temporary, cleaned after all done
+	 */
+	__le16 ih_version;
+} __attribute__ ((__packed__));
+/* size of item header     */
+#define IH_SIZE (sizeof(struct item_head))
+
+#define ih_free_space(ih)            le16_to_cpu((ih)->u.ih_free_space_reserved)
+#define ih_version(ih)               le16_to_cpu((ih)->ih_version)
+#define ih_entry_count(ih)           le16_to_cpu((ih)->u.ih_entry_count)
+#define ih_location(ih)              le16_to_cpu((ih)->ih_item_location)
+#define ih_item_len(ih)              le16_to_cpu((ih)->ih_item_len)
+
+#define put_ih_free_space(ih, val)   do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0)
+#define put_ih_version(ih, val)      do { (ih)->ih_version = cpu_to_le16(val); } while (0)
+#define put_ih_entry_count(ih, val)  do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0)
+#define put_ih_location(ih, val)     do { (ih)->ih_item_location = cpu_to_le16(val); } while (0)
+#define put_ih_item_len(ih, val)     do { (ih)->ih_item_len = cpu_to_le16(val); } while (0)
+
+#define unreachable_item(ih) (ih_version(ih) & (1 << 15))
+
+#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
+#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
+
+/*
+ * these operate on indirect items, where you've got an array of ints
+ * at a possibly unaligned location.  These are a noop on ia32
+ *
+ * p is the array of __u32, i is the index into the array, v is the value
+ * to store there.
+ */
+#define get_block_num(p, i) get_unaligned_le32((p) + (i))
+#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
+
+/* * in old version uniqueness field shows key type */
+#define V1_SD_UNIQUENESS 0
+#define V1_INDIRECT_UNIQUENESS 0xfffffffe
+#define V1_DIRECT_UNIQUENESS 0xffffffff
+#define V1_DIRENTRY_UNIQUENESS 500
+#define V1_ANY_UNIQUENESS 555	/* FIXME: comment is required */
+
+/* here are conversion routines */
+static inline int uniqueness2type(__u32 uniqueness) CONSTF;
+static inline int uniqueness2type(__u32 uniqueness)
+{
+	switch ((int)uniqueness) {
+	case V1_SD_UNIQUENESS:
+		return TYPE_STAT_DATA;
+	case V1_INDIRECT_UNIQUENESS:
+		return TYPE_INDIRECT;
+	case V1_DIRECT_UNIQUENESS:
+		return TYPE_DIRECT;
+	case V1_DIRENTRY_UNIQUENESS:
+		return TYPE_DIRENTRY;
+	case V1_ANY_UNIQUENESS:
+	default:
+		return TYPE_ANY;
+	}
+}
+
+static inline __u32 type2uniqueness(int type) CONSTF;
+static inline __u32 type2uniqueness(int type)
+{
+	switch (type) {
+	case TYPE_STAT_DATA:
+		return V1_SD_UNIQUENESS;
+	case TYPE_INDIRECT:
+		return V1_INDIRECT_UNIQUENESS;
+	case TYPE_DIRECT:
+		return V1_DIRECT_UNIQUENESS;
+	case TYPE_DIRENTRY:
+		return V1_DIRENTRY_UNIQUENESS;
+	case TYPE_ANY:
+	default:
+		return V1_ANY_UNIQUENESS;
+	}
+}
+
+/*
+ * key is pointer to on disk key which is stored in le, result is cpu,
+ * there is no way to get version of object from key, so, provide
+ * version to these defines
+ */
+static inline loff_t le_key_k_offset(int version,
+				     const struct reiserfs_key *key)
+{
+	return (version == KEY_FORMAT_3_5) ?
+	    le32_to_cpu(key->u.k_offset_v1.k_offset) :
+	    offset_v2_k_offset(&(key->u.k_offset_v2));
+}
+
+static inline loff_t le_ih_k_offset(const struct item_head *ih)
+{
+	return le_key_k_offset(ih_version(ih), &(ih->ih_key));
+}
+
+static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
+{
+	if (version == KEY_FORMAT_3_5) {
+		loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness);
+		return uniqueness2type(val);
+	} else
+		return offset_v2_k_type(&(key->u.k_offset_v2));
+}
+
+static inline loff_t le_ih_k_type(const struct item_head *ih)
+{
+	return le_key_k_type(ih_version(ih), &(ih->ih_key));
+}
+
+static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
+				       loff_t offset)
+{
+	if (version == KEY_FORMAT_3_5)
+		key->u.k_offset_v1.k_offset = cpu_to_le32(offset);
+	else
+		set_offset_v2_k_offset(&key->u.k_offset_v2, offset);
+}
+
+static inline void add_le_key_k_offset(int version, struct reiserfs_key *key,
+				       loff_t offset)
+{
+	set_le_key_k_offset(version, key,
+			    le_key_k_offset(version, key) + offset);
+}
+
+static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset)
+{
+	add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
+}
+
+static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
+{
+	set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
+}
+
+static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
+				     int type)
+{
+	if (version == KEY_FORMAT_3_5) {
+		type = type2uniqueness(type);
+		key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type);
+	} else
+	       set_offset_v2_k_type(&key->u.k_offset_v2, type);
+}
+
+static inline void set_le_ih_k_type(struct item_head *ih, int type)
+{
+	set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
+}
+
+static inline int is_direntry_le_key(int version, struct reiserfs_key *key)
+{
+	return le_key_k_type(version, key) == TYPE_DIRENTRY;
+}
+
+static inline int is_direct_le_key(int version, struct reiserfs_key *key)
+{
+	return le_key_k_type(version, key) == TYPE_DIRECT;
+}
+
+static inline int is_indirect_le_key(int version, struct reiserfs_key *key)
+{
+	return le_key_k_type(version, key) == TYPE_INDIRECT;
+}
+
+static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
+{
+	return le_key_k_type(version, key) == TYPE_STAT_DATA;
+}
+
+/* item header has version.  */
+static inline int is_direntry_le_ih(struct item_head *ih)
+{
+	return is_direntry_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_direct_le_ih(struct item_head *ih)
+{
+	return is_direct_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_indirect_le_ih(struct item_head *ih)
+{
+	return is_indirect_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_statdata_le_ih(struct item_head *ih)
+{
+	return is_statdata_le_key(ih_version(ih), &ih->ih_key);
+}
+
+/* key is pointer to cpu key, result is cpu */
+static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
+{
+	return key->on_disk_key.k_offset;
+}
+
+static inline loff_t cpu_key_k_type(const struct cpu_key *key)
+{
+	return key->on_disk_key.k_type;
+}
+
+static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset)
+{
+	key->on_disk_key.k_offset = offset;
+}
+
+static inline void set_cpu_key_k_type(struct cpu_key *key, int type)
+{
+	key->on_disk_key.k_type = type;
+}
+
+static inline void cpu_key_k_offset_dec(struct cpu_key *key)
+{
+	key->on_disk_key.k_offset--;
+}
+
+#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY)
+#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT)
+#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT)
+#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA)
+
+/* are these used ? */
+#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key)))
+#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key)))
+#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key)))
+#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key)))
+
+#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \
+    (!COMP_SHORT_KEYS(ih, key) && \
+	  I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize))
+
+/* maximal length of item */
+#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE)
+#define MIN_ITEM_LEN 1
+
+/* object identifier for root dir */
+#define REISERFS_ROOT_OBJECTID 2
+#define REISERFS_ROOT_PARENT_OBJECTID 1
+
+extern struct reiserfs_key root_key;
+
+/*
+ * Picture represents a leaf of the S+tree
+ *  ______________________________________________________
+ * |      |  Array of     |                   |           |
+ * |Block |  Object-Item  |      F r e e      |  Objects- |
+ * | head |  Headers      |     S p a c e     |   Items   |
+ * |______|_______________|___________________|___________|
+ */
+
+/*
+ * Header of a disk block.  More precisely, header of a formatted leaf
+ * or internal node, and not the header of an unformatted node.
+ */
+struct block_head {
+	__le16 blk_level;	/* Level of a block in the tree. */
+	__le16 blk_nr_item;	/* Number of keys/items in a block. */
+	__le16 blk_free_space;	/* Block free space in bytes. */
+	__le16 blk_reserved;
+	/* dump this in v4/planA */
+
+	/* kept only for compatibility */
+	struct reiserfs_key blk_right_delim_key;
+};
+
+#define BLKH_SIZE                     (sizeof(struct block_head))
+#define blkh_level(p_blkh)            (le16_to_cpu((p_blkh)->blk_level))
+#define blkh_nr_item(p_blkh)          (le16_to_cpu((p_blkh)->blk_nr_item))
+#define blkh_free_space(p_blkh)       (le16_to_cpu((p_blkh)->blk_free_space))
+#define blkh_reserved(p_blkh)         (le16_to_cpu((p_blkh)->blk_reserved))
+#define set_blkh_level(p_blkh,val)    ((p_blkh)->blk_level = cpu_to_le16(val))
+#define set_blkh_nr_item(p_blkh,val)  ((p_blkh)->blk_nr_item = cpu_to_le16(val))
+#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val))
+#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val))
+#define blkh_right_delim_key(p_blkh)  ((p_blkh)->blk_right_delim_key)
+#define set_blkh_right_delim_key(p_blkh,val)  ((p_blkh)->blk_right_delim_key = val)
+
+/* values for blk_level field of the struct block_head */
+
+/*
+ * When node gets removed from the tree its blk_level is set to FREE_LEVEL.
+ * It is then  used to see whether the node is still in the tree
+ */
+#define FREE_LEVEL 0
+
+#define DISK_LEAF_NODE_LEVEL  1	/* Leaf node level. */
+
+/*
+ * Given the buffer head of a formatted node, resolve to the
+ * block head of that node.
+ */
+#define B_BLK_HEAD(bh)			((struct block_head *)((bh)->b_data))
+/* Number of items that are in buffer. */
+#define B_NR_ITEMS(bh)			(blkh_nr_item(B_BLK_HEAD(bh)))
+#define B_LEVEL(bh)			(blkh_level(B_BLK_HEAD(bh)))
+#define B_FREE_SPACE(bh)		(blkh_free_space(B_BLK_HEAD(bh)))
+
+#define PUT_B_NR_ITEMS(bh, val)		do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0)
+#define PUT_B_LEVEL(bh, val)		do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0)
+#define PUT_B_FREE_SPACE(bh, val)	do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0)
+
+/* Get right delimiting key. -- little endian */
+#define B_PRIGHT_DELIM_KEY(bh)		(&(blk_right_delim_key(B_BLK_HEAD(bh))))
+
+/* Does the buffer contain a disk leaf. */
+#define B_IS_ITEMS_LEVEL(bh)		(B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL)
+
+/* Does the buffer contain a disk internal node */
+#define B_IS_KEYS_LEVEL(bh)      (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
+					    && B_LEVEL(bh) <= MAX_HEIGHT)
+
+/***************************************************************************
+ *                             STAT DATA                                   *
+ ***************************************************************************/
+
+/*
+ * old stat data is 32 bytes long. We are going to distinguish new one by
+ * different size
+*/
+struct stat_data_v1 {
+	__le16 sd_mode;		/* file type, permissions */
+	__le16 sd_nlink;	/* number of hard links */
+	__le16 sd_uid;		/* owner */
+	__le16 sd_gid;		/* group */
+	__le32 sd_size;		/* file size */
+	__le32 sd_atime;	/* time of last access */
+	__le32 sd_mtime;	/* time file was last modified  */
+
+	/*
+	 * time inode (stat data) was last changed
+	 * (except changes to sd_atime and sd_mtime)
+	 */
+	__le32 sd_ctime;
+	union {
+		__le32 sd_rdev;
+		__le32 sd_blocks;	/* number of blocks file uses */
+	} __attribute__ ((__packed__)) u;
+
+	/*
+	 * first byte of file which is stored in a direct item: except that if
+	 * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no
+	 * direct item.  The existence of this field really grates on me.
+	 * Let's replace it with a macro based on sd_size and our tail
+	 * suppression policy.  Someday.  -Hans
+	 */
+	__le32 sd_first_direct_byte;
+} __attribute__ ((__packed__));
+
+#define SD_V1_SIZE              (sizeof(struct stat_data_v1))
+#define stat_data_v1(ih)        (ih_version (ih) == KEY_FORMAT_3_5)
+#define sd_v1_mode(sdp)         (le16_to_cpu((sdp)->sd_mode))
+#define set_sd_v1_mode(sdp,v)   ((sdp)->sd_mode = cpu_to_le16(v))
+#define sd_v1_nlink(sdp)        (le16_to_cpu((sdp)->sd_nlink))
+#define set_sd_v1_nlink(sdp,v)  ((sdp)->sd_nlink = cpu_to_le16(v))
+#define sd_v1_uid(sdp)          (le16_to_cpu((sdp)->sd_uid))
+#define set_sd_v1_uid(sdp,v)    ((sdp)->sd_uid = cpu_to_le16(v))
+#define sd_v1_gid(sdp)          (le16_to_cpu((sdp)->sd_gid))
+#define set_sd_v1_gid(sdp,v)    ((sdp)->sd_gid = cpu_to_le16(v))
+#define sd_v1_size(sdp)         (le32_to_cpu((sdp)->sd_size))
+#define set_sd_v1_size(sdp,v)   ((sdp)->sd_size = cpu_to_le32(v))
+#define sd_v1_atime(sdp)        (le32_to_cpu((sdp)->sd_atime))
+#define set_sd_v1_atime(sdp,v)  ((sdp)->sd_atime = cpu_to_le32(v))
+#define sd_v1_mtime(sdp)        (le32_to_cpu((sdp)->sd_mtime))
+#define set_sd_v1_mtime(sdp,v)  ((sdp)->sd_mtime = cpu_to_le32(v))
+#define sd_v1_ctime(sdp)        (le32_to_cpu((sdp)->sd_ctime))
+#define set_sd_v1_ctime(sdp,v)  ((sdp)->sd_ctime = cpu_to_le32(v))
+#define sd_v1_rdev(sdp)         (le32_to_cpu((sdp)->u.sd_rdev))
+#define set_sd_v1_rdev(sdp,v)   ((sdp)->u.sd_rdev = cpu_to_le32(v))
+#define sd_v1_blocks(sdp)       (le32_to_cpu((sdp)->u.sd_blocks))
+#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v))
+#define sd_v1_first_direct_byte(sdp) \
+                                (le32_to_cpu((sdp)->sd_first_direct_byte))
+#define set_sd_v1_first_direct_byte(sdp,v) \
+                                ((sdp)->sd_first_direct_byte = cpu_to_le32(v))
+
+/* inode flags stored in sd_attrs (nee sd_reserved) */
+
+/*
+ * we want common flags to have the same values as in ext2,
+ * so chattr(1) will work without problems
+ */
+#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
+#define REISERFS_APPEND_FL    FS_APPEND_FL
+#define REISERFS_SYNC_FL      FS_SYNC_FL
+#define REISERFS_NOATIME_FL   FS_NOATIME_FL
+#define REISERFS_NODUMP_FL    FS_NODUMP_FL
+#define REISERFS_SECRM_FL     FS_SECRM_FL
+#define REISERFS_UNRM_FL      FS_UNRM_FL
+#define REISERFS_COMPR_FL     FS_COMPR_FL
+#define REISERFS_NOTAIL_FL    FS_NOTAIL_FL
+
+/* persistent flags that file inherits from the parent directory */
+#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL |	\
+				REISERFS_SYNC_FL |	\
+				REISERFS_NOATIME_FL |	\
+				REISERFS_NODUMP_FL |	\
+				REISERFS_SECRM_FL |	\
+				REISERFS_COMPR_FL |	\
+				REISERFS_NOTAIL_FL )
+
+/*
+ * Stat Data on disk (reiserfs version of UFS disk inode minus the
+ * address blocks)
+ */
+struct stat_data {
+	__le16 sd_mode;		/* file type, permissions */
+	__le16 sd_attrs;	/* persistent inode flags */
+	__le32 sd_nlink;	/* number of hard links */
+	__le64 sd_size;		/* file size */
+	__le32 sd_uid;		/* owner */
+	__le32 sd_gid;		/* group */
+	__le32 sd_atime;	/* time of last access */
+	__le32 sd_mtime;	/* time file was last modified  */
+
+	/*
+	 * time inode (stat data) was last changed
+	 * (except changes to sd_atime and sd_mtime)
+	 */
+	__le32 sd_ctime;
+	__le32 sd_blocks;
+	union {
+		__le32 sd_rdev;
+		__le32 sd_generation;
+	} __attribute__ ((__packed__)) u;
+} __attribute__ ((__packed__));
+
+/* this is 44 bytes long */
+#define SD_SIZE (sizeof(struct stat_data))
+#define SD_V2_SIZE              SD_SIZE
+#define stat_data_v2(ih)        (ih_version (ih) == KEY_FORMAT_3_6)
+#define sd_v2_mode(sdp)         (le16_to_cpu((sdp)->sd_mode))
+#define set_sd_v2_mode(sdp,v)   ((sdp)->sd_mode = cpu_to_le16(v))
+/* sd_reserved */
+/* set_sd_reserved */
+#define sd_v2_nlink(sdp)        (le32_to_cpu((sdp)->sd_nlink))
+#define set_sd_v2_nlink(sdp,v)  ((sdp)->sd_nlink = cpu_to_le32(v))
+#define sd_v2_size(sdp)         (le64_to_cpu((sdp)->sd_size))
+#define set_sd_v2_size(sdp,v)   ((sdp)->sd_size = cpu_to_le64(v))
+#define sd_v2_uid(sdp)          (le32_to_cpu((sdp)->sd_uid))
+#define set_sd_v2_uid(sdp,v)    ((sdp)->sd_uid = cpu_to_le32(v))
+#define sd_v2_gid(sdp)          (le32_to_cpu((sdp)->sd_gid))
+#define set_sd_v2_gid(sdp,v)    ((sdp)->sd_gid = cpu_to_le32(v))
+#define sd_v2_atime(sdp)        (le32_to_cpu((sdp)->sd_atime))
+#define set_sd_v2_atime(sdp,v)  ((sdp)->sd_atime = cpu_to_le32(v))
+#define sd_v2_mtime(sdp)        (le32_to_cpu((sdp)->sd_mtime))
+#define set_sd_v2_mtime(sdp,v)  ((sdp)->sd_mtime = cpu_to_le32(v))
+#define sd_v2_ctime(sdp)        (le32_to_cpu((sdp)->sd_ctime))
+#define set_sd_v2_ctime(sdp,v)  ((sdp)->sd_ctime = cpu_to_le32(v))
+#define sd_v2_blocks(sdp)       (le32_to_cpu((sdp)->sd_blocks))
+#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v))
+#define sd_v2_rdev(sdp)         (le32_to_cpu((sdp)->u.sd_rdev))
+#define set_sd_v2_rdev(sdp,v)   ((sdp)->u.sd_rdev = cpu_to_le32(v))
+#define sd_v2_generation(sdp)   (le32_to_cpu((sdp)->u.sd_generation))
+#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v))
+#define sd_v2_attrs(sdp)         (le16_to_cpu((sdp)->sd_attrs))
+#define set_sd_v2_attrs(sdp,v)   ((sdp)->sd_attrs = cpu_to_le16(v))
+
+/***************************************************************************
+ *                      DIRECTORY STRUCTURE                                *
+ ***************************************************************************/
+/*
+ * Picture represents the structure of directory items
+ * ________________________________________________
+ * |  Array of     |   |     |        |       |   |
+ * | directory     |N-1| N-2 | ....   |   1st |0th|
+ * | entry headers |   |     |        |       |   |
+ * |_______________|___|_____|________|_______|___|
+ *                  <----   directory entries         ------>
+ *
+ * First directory item has k_offset component 1. We store "." and ".."
+ * in one item, always, we never split "." and ".." into differing
+ * items.  This makes, among other things, the code for removing
+ * directories simpler.
+ */
+#define SD_OFFSET  0
+#define SD_UNIQUENESS 0
+#define DOT_OFFSET 1
+#define DOT_DOT_OFFSET 2
+#define DIRENTRY_UNIQUENESS 500
+
+#define FIRST_ITEM_OFFSET 1
+
+/*
+ * Q: How to get key of object pointed to by entry from entry?
+ *
+ * A: Each directory entry has its header. This header has deh_dir_id
+ *    and deh_objectid fields, those are key of object, entry points to
+ */
+
+/*
+ * NOT IMPLEMENTED:
+ * Directory will someday contain stat data of object
+ */
+
+struct reiserfs_de_head {
+	__le32 deh_offset;	/* third component of the directory entry key */
+
+	/*
+	 * objectid of the parent directory of the object, that is referenced
+	 * by directory entry
+	 */
+	__le32 deh_dir_id;
+
+	/* objectid of the object, that is referenced by directory entry */
+	__le32 deh_objectid;
+	__le16 deh_location;	/* offset of name in the whole item */
+
+	/*
+	 * whether 1) entry contains stat data (for future), and
+	 * 2) whether entry is hidden (unlinked)
+	 */
+	__le16 deh_state;
+} __attribute__ ((__packed__));
+#define DEH_SIZE                  sizeof(struct reiserfs_de_head)
+#define deh_offset(p_deh)         (le32_to_cpu((p_deh)->deh_offset))
+#define deh_dir_id(p_deh)         (le32_to_cpu((p_deh)->deh_dir_id))
+#define deh_objectid(p_deh)       (le32_to_cpu((p_deh)->deh_objectid))
+#define deh_location(p_deh)       (le16_to_cpu((p_deh)->deh_location))
+#define deh_state(p_deh)          (le16_to_cpu((p_deh)->deh_state))
+
+#define put_deh_offset(p_deh,v)   ((p_deh)->deh_offset = cpu_to_le32((v)))
+#define put_deh_dir_id(p_deh,v)   ((p_deh)->deh_dir_id = cpu_to_le32((v)))
+#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v)))
+#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v)))
+#define put_deh_state(p_deh,v)    ((p_deh)->deh_state = cpu_to_le16((v)))
+
+/* empty directory contains two entries "." and ".." and their headers */
+#define EMPTY_DIR_SIZE \
+(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen ("..")))
+
+/* old format directories have this size when empty */
+#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
+
+#define DEH_Statdata 0		/* not used now */
+#define DEH_Visible 2
+
+/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */
+#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__)
+#   define ADDR_UNALIGNED_BITS  (3)
+#endif
+
+/*
+ * These are only used to manipulate deh_state.
+ * Because of this, we'll use the ext2_ bit routines,
+ * since they are little endian
+ */
+#ifdef ADDR_UNALIGNED_BITS
+
+#   define aligned_address(addr)           ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
+#   define unaligned_offset(addr)          (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3)
+
+#   define set_bit_unaligned(nr, addr)	\
+	__test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
+#   define clear_bit_unaligned(nr, addr)	\
+	__test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
+#   define test_bit_unaligned(nr, addr)	\
+	test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
+
+#else
+
+#   define set_bit_unaligned(nr, addr)	__test_and_set_bit_le(nr, addr)
+#   define clear_bit_unaligned(nr, addr)	__test_and_clear_bit_le(nr, addr)
+#   define test_bit_unaligned(nr, addr)	test_bit_le(nr, addr)
+
+#endif
+
+#define mark_de_with_sd(deh)        set_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
+#define mark_de_without_sd(deh)     clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
+#define mark_de_visible(deh)	    set_bit_unaligned (DEH_Visible, &((deh)->deh_state))
+#define mark_de_hidden(deh)	    clear_bit_unaligned (DEH_Visible, &((deh)->deh_state))
+
+#define de_with_sd(deh)		    test_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
+#define de_visible(deh)	    	    test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
+#define de_hidden(deh)	    	    !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
+
+extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
+				   __le32 par_dirid, __le32 par_objid);
+extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
+				__le32 par_dirid, __le32 par_objid);
+
+/* two entries per block (at least) */
+#define REISERFS_MAX_NAME(block_size) 255
+
+/*
+ * this structure is used for operations on directory entries. It is
+ * not a disk structure.
+ *
+ * When reiserfs_find_entry or search_by_entry_key find directory
+ * entry, they return filled reiserfs_dir_entry structure
+ */
+struct reiserfs_dir_entry {
+	struct buffer_head *de_bh;
+	int de_item_num;
+	struct item_head *de_ih;
+	int de_entry_num;
+	struct reiserfs_de_head *de_deh;
+	int de_entrylen;
+	int de_namelen;
+	char *de_name;
+	unsigned long *de_gen_number_bit_string;
+
+	__u32 de_dir_id;
+	__u32 de_objectid;
+
+	struct cpu_key de_entry_key;
+};
+
+/*
+ * these defines are useful when a particular member of
+ * a reiserfs_dir_entry is needed
+ */
+
+/* pointer to file name, stored in entry */
+#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \
+				(ih_item_body(bh, ih) + deh_location(deh))
+
+/* length of name */
+#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
+(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0))
+
+/* hash value occupies bits from 7 up to 30 */
+#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL)
+/* generation number occupies 7 bits starting from 0 up to 6 */
+#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL)
+#define MAX_GENERATION_NUMBER  127
+
+#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number))
+
+/*
+ * Picture represents an internal node of the reiserfs tree
+ *  ______________________________________________________
+ * |      |  Array of     |  Array of         |  Free     |
+ * |block |    keys       |  pointers         | space     |
+ * | head |      N        |      N+1          |           |
+ * |______|_______________|___________________|___________|
+ */
+
+/***************************************************************************
+ *                      DISK CHILD                                         *
+ ***************************************************************************/
+/*
+ * Disk child pointer:
+ * The pointer from an internal node of the tree to a node that is on disk.
+ */
+struct disk_child {
+	__le32 dc_block_number;	/* Disk child's block number. */
+	__le16 dc_size;		/* Disk child's used space.   */
+	__le16 dc_reserved;
+};
+
+#define DC_SIZE (sizeof(struct disk_child))
+#define dc_block_number(dc_p)	(le32_to_cpu((dc_p)->dc_block_number))
+#define dc_size(dc_p)		(le16_to_cpu((dc_p)->dc_size))
+#define put_dc_block_number(dc_p, val)   do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0)
+#define put_dc_size(dc_p, val)   do { (dc_p)->dc_size = cpu_to_le16(val); } while(0)
+
+/* Get disk child by buffer header and position in the tree node. */
+#define B_N_CHILD(bh, n_pos)  ((struct disk_child *)\
+((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos)))
+
+/* Get disk child number by buffer header and position in the tree node. */
+#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos)))
+#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \
+				(put_dc_block_number(B_N_CHILD(bh, n_pos), val))
+
+ /* maximal value of field child_size in structure disk_child */
+ /* child size is the combined size of all items and their headers */
+#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE ))
+
+/* amount of used space in buffer (not including block head) */
+#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur)))
+
+/* max and min number of keys in internal node */
+#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
+#define MIN_NR_KEY(bh)    (MAX_NR_KEY(bh)/2)
+
+/***************************************************************************
+ *                      PATH STRUCTURES AND DEFINES                        *
+ ***************************************************************************/
+
+/*
+ * search_by_key fills up the path from the root to the leaf as it descends
+ * the tree looking for the key.  It uses reiserfs_bread to try to find
+ * buffers in the cache given their block number.  If it does not find
+ * them in the cache it reads them from disk.  For each node search_by_key
+ * finds using reiserfs_bread it then uses bin_search to look through that
+ * node.  bin_search will find the position of the block_number of the next
+ * node if it is looking through an internal node.  If it is looking through
+ * a leaf node bin_search will find the position of the item which has key
+ * either equal to given key, or which is the maximal key less than the
+ * given key.
+ */
+
+struct path_element {
+	/* Pointer to the buffer at the path in the tree. */
+	struct buffer_head *pe_buffer;
+	/* Position in the tree node which is placed in the buffer above. */
+	int pe_position;
+};
+
+/*
+ * maximal height of a tree. don't change this without
+ * changing JOURNAL_PER_BALANCE_CNT
+ */
+#define MAX_HEIGHT 5
+
+/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
+#define EXTENDED_MAX_HEIGHT         7
+
+/* Must be equal to at least 2. */
+#define FIRST_PATH_ELEMENT_OFFSET   2
+
+/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
+#define ILLEGAL_PATH_ELEMENT_OFFSET 1
+
+/* this MUST be MAX_HEIGHT + 1. See about FEB below */
+#define MAX_FEB_SIZE 6
+
+/*
+ * We need to keep track of who the ancestors of nodes are.  When we
+ * perform a search we record which nodes were visited while
+ * descending the tree looking for the node we searched for. This list
+ * of nodes is called the path.  This information is used while
+ * performing balancing.  Note that this path information may become
+ * invalid, and this means we must check it when using it to see if it
+ * is still valid. You'll need to read search_by_key and the comments
+ * in it, especially about decrement_counters_in_path(), to understand
+ * this structure.
+ *
+ * Paths make the code so much harder to work with and debug.... An
+ * enormous number of bugs are due to them, and trying to write or modify
+ * code that uses them just makes my head hurt.  They are based on an
+ * excessive effort to avoid disturbing the precious VFS code.:-( The
+ * gods only know how we are going to SMP the code that uses them.
+ * znodes are the way!
+ */
+
+#define PATH_READA	0x1	/* do read ahead */
+#define PATH_READA_BACK 0x2	/* read backwards */
+
+struct treepath {
+	int path_length;	/* Length of the array above.   */
+	int reada;
+	/* Array of the path elements.  */
+	struct path_element path_elements[EXTENDED_MAX_HEIGHT];
+	int pos_in_item;
+};
+
+#define pos_in_item(path) ((path)->pos_in_item)
+
+#define INITIALIZE_PATH(var) \
+struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
+
+/* Get path element by path and path position. */
+#define PATH_OFFSET_PELEMENT(path, n_offset)  ((path)->path_elements + (n_offset))
+
+/* Get buffer header at the path by path and path position. */
+#define PATH_OFFSET_PBUFFER(path, n_offset)   (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer)
+
+/* Get position in the element at the path by path and path position. */
+#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
+
+#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
+
+/*
+ * you know, to the person who didn't write this the macro name does not
+ * at first suggest what it does.  Maybe POSITION_FROM_PATH_END? Or
+ * maybe we should just focus on dumping paths... -Hans
+ */
+#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
+
+/*
+ * in do_balance leaf has h == 0 in contrast with path structure,
+ * where root has level == 0. That is why we need these defines
+ */
+
+/* tb->S[h] */
+#define PATH_H_PBUFFER(path, h) \
+			PATH_OFFSET_PBUFFER(path, path->path_length - (h))
+
+/* tb->F[h] or tb->S[0]->b_parent */
+#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1)
+
+#define PATH_H_POSITION(path, h) \
+			PATH_OFFSET_POSITION(path, path->path_length - (h))
+
+/* tb->S[h]->b_item_order */
+#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)
+
+#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
+
+static inline void *reiserfs_node_data(const struct buffer_head *bh)
+{
+	return bh->b_data + sizeof(struct block_head);
+}
+
+/* get key from internal node */
+static inline struct reiserfs_key *internal_key(struct buffer_head *bh,
+						int item_num)
+{
+	struct reiserfs_key *key = reiserfs_node_data(bh);
+
+	return &key[item_num];
+}
+
+/* get the item header from leaf node */
+static inline struct item_head *item_head(const struct buffer_head *bh,
+					  int item_num)
+{
+	struct item_head *ih = reiserfs_node_data(bh);
+
+	return &ih[item_num];
+}
+
+/* get the key from leaf node */
+static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh,
+					    int item_num)
+{
+	return &item_head(bh, item_num)->ih_key;
+}
+
+static inline void *ih_item_body(const struct buffer_head *bh,
+				 const struct item_head *ih)
+{
+	return bh->b_data + ih_location(ih);
+}
+
+/* get item body from leaf node */
+static inline void *item_body(const struct buffer_head *bh, int item_num)
+{
+	return ih_item_body(bh, item_head(bh, item_num));
+}
+
+static inline struct item_head *tp_item_head(const struct treepath *path)
+{
+	return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
+}
+
+static inline void *tp_item_body(const struct treepath *path)
+{
+	return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
+}
+
+#define get_last_bh(path) PATH_PLAST_BUFFER(path)
+#define get_item_pos(path) PATH_LAST_POSITION(path)
+#define item_moved(ih,path) comp_items(ih, path)
+#define path_changed(ih,path) comp_items (ih, path)
+
+/* array of the entry headers */
+ /* get item body */
+#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih)))
+
+/*
+ * length of the directory entry in directory item. This define
+ * calculates length of i-th directory entry using directory entry
+ * locations from dir entry head. When it calculates length of 0-th
+ * directory entry, it uses length of whole item in place of entry
+ * location of the non-existent following entry in the calculation.
+ * See picture above.
+ */
+static inline int entry_length(const struct buffer_head *bh,
+			       const struct item_head *ih, int pos_in_item)
+{
+	struct reiserfs_de_head *deh;
+
+	deh = B_I_DEH(bh, ih) + pos_in_item;
+	if (pos_in_item)
+		return deh_location(deh - 1) - deh_location(deh);
+
+	return ih_item_len(ih) - deh_location(deh);
+}
+
+/***************************************************************************
+ *                       MISC                                              *
+ ***************************************************************************/
+
+/* Size of pointer to the unformatted node. */
+#define UNFM_P_SIZE (sizeof(unp_t))
+#define UNFM_P_SHIFT 2
+
+/* in in-core inode key is stored on le form */
+#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
+
+#define MAX_UL_INT 0xffffffff
+#define MAX_INT    0x7ffffff
+#define MAX_US_INT 0xffff
+
+// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
+static inline loff_t max_reiserfs_offset(struct inode *inode)
+{
+	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
+		return (loff_t) U32_MAX;
+
+	return (loff_t) ((~(__u64) 0) >> 4);
+}
+
+#define MAX_KEY_OBJECTID	MAX_UL_INT
+
+#define MAX_B_NUM  MAX_UL_INT
+#define MAX_FC_NUM MAX_US_INT
+
+/* the purpose is to detect overflow of an unsigned short */
+#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
+
+/*
+ * The following defines are used in reiserfs_insert_item
+ * and reiserfs_append_item
+ */
+#define REISERFS_KERNEL_MEM		0	/* kernel memory mode */
+#define REISERFS_USER_MEM		1	/* user memory mode */
+
+#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
+#define get_generation(s) atomic_read (&fs_generation(s))
+#define FILESYSTEM_CHANGED_TB(tb)  (get_generation((tb)->tb_sb) != (tb)->fs_gen)
+#define __fs_changed(gen,s) (gen != get_generation (s))
+#define fs_changed(gen,s)		\
+({					\
+	reiserfs_cond_resched(s);	\
+	__fs_changed(gen, s);		\
+})
+
+/***************************************************************************
+ *                  FIXATE NODES                                           *
+ ***************************************************************************/
+
+#define VI_TYPE_LEFT_MERGEABLE 1
+#define VI_TYPE_RIGHT_MERGEABLE 2
+
+/*
+ * To make any changes in the tree we always first find node, that
+ * contains item to be changed/deleted or place to insert a new
+ * item. We call this node S. To do balancing we need to decide what
+ * we will shift to left/right neighbor, or to a new node, where new
+ * item will be etc. To make this analysis simpler we build virtual
+ * node. Virtual node is an array of items, that will replace items of
+ * node S. (For instance if we are going to delete an item, virtual
+ * node does not contain it). Virtual node keeps information about
+ * item sizes and types, mergeability of first and last items, sizes
+ * of all entries in directory item. We use this array of items when
+ * calculating what we can shift to neighbors and how many nodes we
+ * have to have if we do not any shiftings, if we shift to left/right
+ * neighbor or to both.
+ */
+struct virtual_item {
+	int vi_index;		/* index in the array of item operations */
+	unsigned short vi_type;	/* left/right mergeability */
+
+	/* length of item that it will have after balancing */
+	unsigned short vi_item_len;
+
+	struct item_head *vi_ih;
+	const char *vi_item;	/* body of item (old or new) */
+	const void *vi_new_data;	/* 0 always but paste mode */
+	void *vi_uarea;		/* item specific area */
+};
+
+struct virtual_node {
+	/* this is a pointer to the free space in the buffer */
+	char *vn_free_ptr;
+
+	unsigned short vn_nr_item;	/* number of items in virtual node */
+
+	/*
+	 * size of node , that node would have if it has
+	 * unlimited size and no balancing is performed
+	 */
+	short vn_size;
+
+	/* mode of balancing (paste, insert, delete, cut) */
+	short vn_mode;
+
+	short vn_affected_item_num;
+	short vn_pos_in_item;
+
+	/* item header of inserted item, 0 for other modes */
+	struct item_head *vn_ins_ih;
+	const void *vn_data;
+
+	/* array of items (including a new one, excluding item to be deleted) */
+	struct virtual_item *vn_vi;
+};
+
+/* used by directory items when creating virtual nodes */
+struct direntry_uarea {
+	int flags;
+	__u16 entry_count;
+	__u16 entry_sizes[1];
+} __attribute__ ((__packed__));
+
+/***************************************************************************
+ *                  TREE BALANCE                                           *
+ ***************************************************************************/
+
+/*
+ * This temporary structure is used in tree balance algorithms, and
+ * constructed as we go to the extent that its various parts are
+ * needed.  It contains arrays of nodes that can potentially be
+ * involved in the balancing of node S, and parameters that define how
+ * each of the nodes must be balanced.  Note that in these algorithms
+ * for balancing the worst case is to need to balance the current node
+ * S and the left and right neighbors and all of their parents plus
+ * create a new node.  We implement S1 balancing for the leaf nodes
+ * and S0 balancing for the internal nodes (S1 and S0 are defined in
+ * our papers.)
+ */
+
+/* size of the array of buffers to free at end of do_balance */
+#define MAX_FREE_BLOCK 7
+
+/* maximum number of FEB blocknrs on a single level */
+#define MAX_AMOUNT_NEEDED 2
+
+/* someday somebody will prefix every field in this struct with tb_ */
+struct tree_balance {
+	int tb_mode;
+	int need_balance_dirty;
+	struct super_block *tb_sb;
+	struct reiserfs_transaction_handle *transaction_handle;
+	struct treepath *tb_path;
+
+	/* array of left neighbors of nodes in the path */
+	struct buffer_head *L[MAX_HEIGHT];
+
+	/* array of right neighbors of nodes in the path */
+	struct buffer_head *R[MAX_HEIGHT];
+
+	/* array of fathers of the left neighbors */
+	struct buffer_head *FL[MAX_HEIGHT];
+
+	/* array of fathers of the right neighbors */
+	struct buffer_head *FR[MAX_HEIGHT];
+	/* array of common parents of center node and its left neighbor */
+	struct buffer_head *CFL[MAX_HEIGHT];
+
+	/* array of common parents of center node and its right neighbor */
+	struct buffer_head *CFR[MAX_HEIGHT];
+
+	/*
+	 * array of empty buffers. Number of buffers in array equals
+	 * cur_blknum.
+	 */
+	struct buffer_head *FEB[MAX_FEB_SIZE];
+	struct buffer_head *used[MAX_FEB_SIZE];
+	struct buffer_head *thrown[MAX_FEB_SIZE];
+
+	/*
+	 * array of number of items which must be shifted to the left in
+	 * order to balance the current node; for leaves includes item that
+	 * will be partially shifted; for internal nodes, it is the number
+	 * of child pointers rather than items. It includes the new item
+	 * being created. The code sometimes subtracts one to get the
+	 * number of wholly shifted items for other purposes.
+	 */
+	int lnum[MAX_HEIGHT];
+
+	/* substitute right for left in comment above */
+	int rnum[MAX_HEIGHT];
+
+	/*
+	 * array indexed by height h mapping the key delimiting L[h] and
+	 * S[h] to its item number within the node CFL[h]
+	 */
+	int lkey[MAX_HEIGHT];
+
+	/* substitute r for l in comment above */
+	int rkey[MAX_HEIGHT];
+
+	/*
+	 * the number of bytes by we are trying to add or remove from
+	 * S[h]. A negative value means removing.
+	 */
+	int insert_size[MAX_HEIGHT];
+
+	/*
+	 * number of nodes that will replace node S[h] after balancing
+	 * on the level h of the tree.  If 0 then S is being deleted,
+	 * if 1 then S is remaining and no new nodes are being created,
+	 * if 2 or 3 then 1 or 2 new nodes is being created
+	 */
+	int blknum[MAX_HEIGHT];
+
+	/* fields that are used only for balancing leaves of the tree */
+
+	/* number of empty blocks having been already allocated */
+	int cur_blknum;
+
+	/* number of items that fall into left most node when S[0] splits */
+	int s0num;
+
+	/*
+	 * number of bytes which can flow to the left neighbor from the left
+	 * most liquid item that cannot be shifted from S[0] entirely
+	 * if -1 then nothing will be partially shifted
+	 */
+	int lbytes;
+
+	/*
+	 * number of bytes which will flow to the right neighbor from the right
+	 * most liquid item that cannot be shifted from S[0] entirely
+	 * if -1 then nothing will be partially shifted
+	 */
+	int rbytes;
+
+
+	/*
+	 * index into the array of item headers in
+	 * S[0] of the affected item
+	 */
+	int item_pos;
+
+	/* new nodes allocated to hold what could not fit into S */
+	struct buffer_head *S_new[2];
+
+	/*
+	 * number of items that will be placed into nodes in S_new
+	 * when S[0] splits
+	 */
+	int snum[2];
+
+	/*
+	 * number of bytes which flow to nodes in S_new when S[0] splits
+	 * note: if S[0] splits into 3 nodes, then items do not need to be cut
+	 */
+	int sbytes[2];
+
+	int pos_in_item;
+	int zeroes_num;
+
+	/*
+	 * buffers which are to be freed after do_balance finishes
+	 * by unfix_nodes
+	 */
+	struct buffer_head *buf_to_free[MAX_FREE_BLOCK];
+
+	/*
+	 * kmalloced memory. Used to create virtual node and keep
+	 * map of dirtied bitmap blocks
+	 */
+	char *vn_buf;
+
+	int vn_buf_size;	/* size of the vn_buf */
+
+	/* VN starts after bitmap of bitmap blocks */
+	struct virtual_node *tb_vn;
+
+	/*
+	 * saved value of `reiserfs_generation' counter see
+	 * FILESYSTEM_CHANGED() macro in reiserfs_fs.h
+	 */
+	int fs_gen;
+
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+	/*
+	 * key pointer, to pass to block allocator or
+	 * another low-level subsystem
+	 */
+	struct in_core_key key;
+#endif
+};
+
+/* These are modes of balancing */
+
+/* When inserting an item. */
+#define M_INSERT	'i'
+/*
+ * When inserting into (directories only) or appending onto an already
+ * existent item.
+ */
+#define M_PASTE		'p'
+/* When deleting an item. */
+#define M_DELETE	'd'
+/* When truncating an item or removing an entry from a (directory) item. */
+#define M_CUT		'c'
+
+/* used when balancing on leaf level skipped (in reiserfsck) */
+#define M_INTERNAL	'n'
+
+/*
+ * When further balancing is not needed, then do_balance does not need
+ * to be called.
+ */
+#define M_SKIP_BALANCING		's'
+#define M_CONVERT	'v'
+
+/* modes of leaf_move_items */
+#define LEAF_FROM_S_TO_L 0
+#define LEAF_FROM_S_TO_R 1
+#define LEAF_FROM_R_TO_L 2
+#define LEAF_FROM_L_TO_R 3
+#define LEAF_FROM_S_TO_SNEW 4
+
+#define FIRST_TO_LAST 0
+#define LAST_TO_FIRST 1
+
+/*
+ * used in do_balance for passing parent of node information that has
+ * been gotten from tb struct
+ */
+struct buffer_info {
+	struct tree_balance *tb;
+	struct buffer_head *bi_bh;
+	struct buffer_head *bi_parent;
+	int bi_position;
+};
+
+static inline struct super_block *sb_from_tb(struct tree_balance *tb)
+{
+	return tb ? tb->tb_sb : NULL;
+}
+
+static inline struct super_block *sb_from_bi(struct buffer_info *bi)
+{
+	return bi ? sb_from_tb(bi->tb) : NULL;
+}
+
+/*
+ * there are 4 types of items: stat data, directory item, indirect, direct.
+ * +-------------------+------------+--------------+------------+
+ * |                   |  k_offset  | k_uniqueness | mergeable? |
+ * +-------------------+------------+--------------+------------+
+ * |     stat data     |     0      |      0       |   no       |
+ * +-------------------+------------+--------------+------------+
+ * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. |   no       |
+ * | non 1st directory | hash value | UNIQUENESS   |   yes      |
+ * |     item          |            |              |            |
+ * +-------------------+------------+--------------+------------+
+ * | indirect item     | offset + 1 |TYPE_INDIRECT |    [1]	|
+ * +-------------------+------------+--------------+------------+
+ * | direct item       | offset + 1 |TYPE_DIRECT   |    [2]     |
+ * +-------------------+------------+--------------+------------+
+ *
+ * [1] if this is not the first indirect item of the object
+ * [2] if this is not the first direct item of the object
+*/
+
+struct item_operations {
+	int (*bytes_number) (struct item_head * ih, int block_size);
+	void (*decrement_key) (struct cpu_key *);
+	int (*is_left_mergeable) (struct reiserfs_key * ih,
+				  unsigned long bsize);
+	void (*print_item) (struct item_head *, char *item);
+	void (*check_item) (struct item_head *, char *item);
+
+	int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi,
+			  int is_affected, int insert_size);
+	int (*check_left) (struct virtual_item * vi, int free,
+			   int start_skip, int end_skip);
+	int (*check_right) (struct virtual_item * vi, int free);
+	int (*part_size) (struct virtual_item * vi, int from, int to);
+	int (*unit_num) (struct virtual_item * vi);
+	void (*print_vi) (struct virtual_item * vi);
+};
+
+extern struct item_operations *item_ops[TYPE_ANY + 1];
+
+#define op_bytes_number(ih,bsize)                    item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
+#define op_is_left_mergeable(key,bsize)              item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
+#define op_print_item(ih,item)                       item_ops[le_ih_k_type (ih)]->print_item (ih, item)
+#define op_check_item(ih,item)                       item_ops[le_ih_k_type (ih)]->check_item (ih, item)
+#define op_create_vi(vn,vi,is_affected,insert_size)  item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size)
+#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip)
+#define op_check_right(vi,free)                      item_ops[(vi)->vi_index]->check_right (vi, free)
+#define op_part_size(vi,from,to)                     item_ops[(vi)->vi_index]->part_size (vi, from, to)
+#define op_unit_num(vi)				     item_ops[(vi)->vi_index]->unit_num (vi)
+#define op_print_vi(vi)                              item_ops[(vi)->vi_index]->print_vi (vi)
+
+#define COMP_SHORT_KEYS comp_short_keys
+
+/* number of blocks pointed to by the indirect item */
+#define I_UNFM_NUM(ih)	(ih_item_len(ih) / UNFM_P_SIZE)
+
+/*
+ * the used space within the unformatted node corresponding
+ * to pos within the item pointed to by ih
+ */
+#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
+
+/*
+ * number of bytes contained by the direct item or the
+ * unformatted nodes the indirect item points to
+ */
+
+/* following defines use reiserfs buffer header and item header */
+
+/* get stat-data */
+#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
+
+/* this is 3976 for size==4096 */
+#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
+
+/*
+ * indirect items consist of entries which contain blocknrs, pos
+ * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
+ * blocknr contained by the entry pos points to
+ */
+#define B_I_POS_UNFM_POINTER(bh, ih, pos)				\
+	le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos)))
+#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val)			\
+	(*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val))
+
+struct reiserfs_iget_args {
+	__u32 objectid;
+	__u32 dirid;
+};
+
+/***************************************************************************
+ *                    FUNCTION DECLARATIONS                                *
+ ***************************************************************************/
+
+#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
+
+#define journal_trans_half(blocksize) \
+	((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32))
+
+/* journal.c see journal.c for all the comments here */
+
+/* first block written in a commit.  */
+struct reiserfs_journal_desc {
+	__le32 j_trans_id;	/* id of commit */
+
+	/* length of commit. len +1 is the commit block */
+	__le32 j_len;
+
+	__le32 j_mount_id;	/* mount id of this trans */
+	__le32 j_realblock[1];	/* real locations for each block */
+};
+
+#define get_desc_trans_id(d)   le32_to_cpu((d)->j_trans_id)
+#define get_desc_trans_len(d)  le32_to_cpu((d)->j_len)
+#define get_desc_mount_id(d)   le32_to_cpu((d)->j_mount_id)
+
+#define set_desc_trans_id(d,val)       do { (d)->j_trans_id = cpu_to_le32 (val); } while (0)
+#define set_desc_trans_len(d,val)      do { (d)->j_len = cpu_to_le32 (val); } while (0)
+#define set_desc_mount_id(d,val)       do { (d)->j_mount_id = cpu_to_le32 (val); } while (0)
+
+/* last block written in a commit */
+struct reiserfs_journal_commit {
+	__le32 j_trans_id;	/* must match j_trans_id from the desc block */
+	__le32 j_len;		/* ditto */
+	__le32 j_realblock[1];	/* real locations for each block */
+};
+
+#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
+#define get_commit_trans_len(c)        le32_to_cpu((c)->j_len)
+#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id)
+
+#define set_commit_trans_id(c,val)     do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
+#define set_commit_trans_len(c,val)    do { (c)->j_len = cpu_to_le32 (val); } while (0)
+
+/*
+ * this header block gets written whenever a transaction is considered
+ * fully flushed, and is more recent than the last fully flushed transaction.
+ * fully flushed means all the log blocks and all the real blocks are on
+ * disk, and this transaction does not need to be replayed.
+ */
+struct reiserfs_journal_header {
+	/* id of last fully flushed transaction */
+	__le32 j_last_flush_trans_id;
+
+	/* offset in the log of where to start replay after a crash */
+	__le32 j_first_unflushed_offset;
+
+	__le32 j_mount_id;
+	/* 12 */ struct journal_params jh_journal;
+};
+
+/* biggest tunable defines are right here */
+#define JOURNAL_BLOCK_COUNT 8192	/* number of blocks in the journal */
+
+/* biggest possible single transaction, don't change for now (8/3/99) */
+#define JOURNAL_TRANS_MAX_DEFAULT 1024
+#define JOURNAL_TRANS_MIN_DEFAULT 256
+
+/*
+ * max blocks to batch into one transaction,
+ * don't make this any bigger than 900
+ */
+#define JOURNAL_MAX_BATCH_DEFAULT   900
+#define JOURNAL_MIN_RATIO 2
+#define JOURNAL_MAX_COMMIT_AGE 30
+#define JOURNAL_MAX_TRANS_AGE 30
+#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
+#define JOURNAL_BLOCKS_PER_OBJECT(sb)  (JOURNAL_PER_BALANCE_CNT * 3 + \
+					 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \
+					      REISERFS_QUOTA_TRANS_BLOCKS(sb)))
+
+#ifdef CONFIG_QUOTA
+#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA))
+/* We need to update data and inode (atime) */
+#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0)
+/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
+#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
+(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0)
+/* same as with INIT */
+#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
+(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0)
+#else
+#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0
+#define REISERFS_QUOTA_INIT_BLOCKS(s) 0
+#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
+#endif
+
+/*
+ * both of these can be as low as 1, or as high as you want.  The min is the
+ * number of 4k bitmap nodes preallocated on mount. New nodes are allocated
+ * as needed, and released when transactions are committed.  On release, if
+ * the current number of nodes is > max, the node is freed, otherwise,
+ * it is put on a free list for faster use later.
+*/
+#define REISERFS_MIN_BITMAP_NODES 10
+#define REISERFS_MAX_BITMAP_NODES 100
+
+/* these are based on journal hash size of 8192 */
+#define JBH_HASH_SHIFT 13
+#define JBH_HASH_MASK 8191
+
+#define _jhashfn(sb,block)	\
+	(((unsigned long)sb>>L1_CACHE_SHIFT) ^ \
+	 (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
+#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
+
+/* We need these to make journal.c code more readable */
+#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+
+enum reiserfs_bh_state_bits {
+	BH_JDirty = BH_PrivateStart,	/* buffer is in current transaction */
+	BH_JDirty_wait,
+	/*
+	 * disk block was taken off free list before being in a
+	 * finished transaction, or written to disk. Can be reused immed.
+	 */
+	BH_JNew,
+	BH_JPrepared,
+	BH_JRestore_dirty,
+	BH_JTest,		/* debugging only will go away */
+};
+
+BUFFER_FNS(JDirty, journaled);
+TAS_BUFFER_FNS(JDirty, journaled);
+BUFFER_FNS(JDirty_wait, journal_dirty);
+TAS_BUFFER_FNS(JDirty_wait, journal_dirty);
+BUFFER_FNS(JNew, journal_new);
+TAS_BUFFER_FNS(JNew, journal_new);
+BUFFER_FNS(JPrepared, journal_prepared);
+TAS_BUFFER_FNS(JPrepared, journal_prepared);
+BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
+TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
+BUFFER_FNS(JTest, journal_test);
+TAS_BUFFER_FNS(JTest, journal_test);
+
+/* transaction handle which is passed around for all journal calls */
+struct reiserfs_transaction_handle {
+	/*
+	 * super for this FS when journal_begin was called. saves calls to
+	 * reiserfs_get_super also used by nested transactions to make
+	 * sure they are nesting on the right FS _must_ be first
+	 * in the handle
+	 */
+	struct super_block *t_super;
+
+	int t_refcount;
+	int t_blocks_logged;	/* number of blocks this writer has logged */
+	int t_blocks_allocated;	/* number of blocks this writer allocated */
+
+	/* sanity check, equals the current trans id */
+	unsigned int t_trans_id;
+
+	void *t_handle_save;	/* save existing current->journal_info */
+
+	/*
+	 * if new block allocation occurres, that block
+	 * should be displaced from others
+	 */
+	unsigned displace_new_blocks:1;
+
+	struct list_head t_list;
+};
+
+/*
+ * used to keep track of ordered and tail writes, attached to the buffer
+ * head through b_journal_head.
+ */
+struct reiserfs_jh {
+	struct reiserfs_journal_list *jl;
+	struct buffer_head *bh;
+	struct list_head list;
+};
+
+void reiserfs_free_jh(struct buffer_head *bh);
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
+int journal_mark_dirty(struct reiserfs_transaction_handle *,
+		       struct buffer_head *bh);
+
+static inline int reiserfs_file_data_log(struct inode *inode)
+{
+	if (reiserfs_data_log(inode->i_sb) ||
+	    (REISERFS_I(inode)->i_flags & i_data_log))
+		return 1;
+	return 0;
+}
+
+static inline int reiserfs_transaction_running(struct super_block *s)
+{
+	struct reiserfs_transaction_handle *th = current->journal_info;
+	if (th && th->t_super == s)
+		return 1;
+	if (th && th->t_super == NULL)
+		BUG();
+	return 0;
+}
+
+static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th)
+{
+	return th->t_blocks_allocated - th->t_blocks_logged;
+}
+
+struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
+								    super_block
+								    *,
+								    int count);
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+void reiserfs_vfs_truncate_file(struct inode *inode);
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+			 unsigned from, unsigned to);
+void reiserfs_flush_old_commits(struct super_block *);
+int reiserfs_commit_for_inode(struct inode *);
+int reiserfs_inode_needs_commit(struct inode *);
+void reiserfs_update_inode_transaction(struct inode *);
+void reiserfs_wait_on_write_block(struct super_block *s);
+void reiserfs_block_writes(struct reiserfs_transaction_handle *th);
+void reiserfs_allow_writes(struct super_block *s);
+void reiserfs_check_lock_depth(struct super_block *s, char *caller);
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh,
+				 int wait);
+void reiserfs_restore_prepared_buffer(struct super_block *,
+				      struct buffer_head *bh);
+int journal_init(struct super_block *, const char *j_dev_name, int old_format,
+		 unsigned int);
+int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
+int journal_release_error(struct reiserfs_transaction_handle *,
+			  struct super_block *);
+int journal_end(struct reiserfs_transaction_handle *);
+int journal_end_sync(struct reiserfs_transaction_handle *);
+int journal_mark_freed(struct reiserfs_transaction_handle *,
+		       struct super_block *, b_blocknr_t blocknr);
+int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
+int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
+			 int bit_nr, int searchall, b_blocknr_t *next);
+int journal_begin(struct reiserfs_transaction_handle *,
+		  struct super_block *sb, unsigned long);
+int journal_join_abort(struct reiserfs_transaction_handle *,
+		       struct super_block *sb);
+void reiserfs_abort_journal(struct super_block *sb, int errno);
+void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
+int reiserfs_allocate_list_bitmaps(struct super_block *s,
+				   struct reiserfs_list_bitmap *, unsigned int);
+
+void reiserfs_schedule_old_flush(struct super_block *s);
+void add_save_link(struct reiserfs_transaction_handle *th,
+		   struct inode *inode, int truncate);
+int remove_save_link(struct inode *inode, int truncate);
+
+/* objectid.c */
+__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th);
+void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
+			       __u32 objectid_to_release);
+int reiserfs_convert_objectid_map_v1(struct super_block *);
+
+/* stree.c */
+int B_IS_IN_TREE(const struct buffer_head *);
+extern void copy_item_head(struct item_head *to,
+			   const struct item_head *from);
+
+/* first key is in cpu form, second - le */
+extern int comp_short_keys(const struct reiserfs_key *le_key,
+			   const struct cpu_key *cpu_key);
+extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
+
+/* both are in le form */
+extern int comp_le_keys(const struct reiserfs_key *,
+			const struct reiserfs_key *);
+extern int comp_short_le_keys(const struct reiserfs_key *,
+			      const struct reiserfs_key *);
+
+/* * get key version from on disk key - kludge */
+static inline int le_key_version(const struct reiserfs_key *key)
+{
+	int type;
+
+	type = offset_v2_k_type(&(key->u.k_offset_v2));
+	if (type != TYPE_DIRECT && type != TYPE_INDIRECT
+	    && type != TYPE_DIRENTRY)
+		return KEY_FORMAT_3_5;
+
+	return KEY_FORMAT_3_6;
+
+}
+
+static inline void copy_key(struct reiserfs_key *to,
+			    const struct reiserfs_key *from)
+{
+	memcpy(to, from, KEY_SIZE);
+}
+
+int comp_items(const struct item_head *stored_ih, const struct treepath *path);
+const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
+				    const struct super_block *sb);
+int search_by_key(struct super_block *, const struct cpu_key *,
+		  struct treepath *, int);
+#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL)
+int search_for_position_by_key(struct super_block *sb,
+			       const struct cpu_key *cpu_key,
+			       struct treepath *search_path);
+extern void decrement_bcount(struct buffer_head *bh);
+void decrement_counters_in_path(struct treepath *search_path);
+void pathrelse(struct treepath *search_path);
+int reiserfs_check_path(struct treepath *p);
+void pathrelse_and_restore(struct super_block *s, struct treepath *search_path);
+
+int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
+			 struct treepath *path,
+			 const struct cpu_key *key,
+			 struct item_head *ih,
+			 struct inode *inode, const char *body);
+
+int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
+			     struct treepath *path,
+			     const struct cpu_key *key,
+			     struct inode *inode,
+			     const char *body, int paste_size);
+
+int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
+			   struct treepath *path,
+			   struct cpu_key *key,
+			   struct inode *inode,
+			   struct page *page, loff_t new_file_size);
+
+int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
+			 struct treepath *path,
+			 const struct cpu_key *key,
+			 struct inode *inode, struct buffer_head *un_bh);
+
+void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
+				struct inode *inode, struct reiserfs_key *key);
+int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
+			   struct inode *inode);
+int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
+			 struct inode *inode, struct page *,
+			 int update_timestamps);
+
+#define i_block_size(inode) ((inode)->i_sb->s_blocksize)
+#define file_size(inode) ((inode)->i_size)
+#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1))
+
+#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\
+!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 )
+
+void padd_item(char *item, int total_length, int length);
+
+/* inode.c */
+/* args for the create parameter of reiserfs_get_block */
+#define GET_BLOCK_NO_CREATE 0	 /* don't create new blocks or convert tails */
+#define GET_BLOCK_CREATE 1	 /* add anything you need to find block */
+#define GET_BLOCK_NO_HOLE 2	 /* return -ENOENT for file holes */
+#define GET_BLOCK_READ_DIRECT 4	 /* read the tail if indirect item not found */
+#define GET_BLOCK_NO_IMUX     8	 /* i_mutex is not held, don't preallocate */
+#define GET_BLOCK_NO_DANGLE   16 /* don't leave any transactions running */
+
+void reiserfs_read_locked_inode(struct inode *inode,
+				struct reiserfs_iget_args *args);
+int reiserfs_find_actor(struct inode *inode, void *p);
+int reiserfs_init_locked_inode(struct inode *inode, void *p);
+void reiserfs_evict_inode(struct inode *inode);
+int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
+int reiserfs_get_block(struct inode *inode, sector_t block,
+		       struct buffer_head *bh_result, int create);
+struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+				     int fh_len, int fh_type);
+struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+				     int fh_len, int fh_type);
+int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
+		       struct inode *parent);
+
+int reiserfs_truncate_file(struct inode *, int update_timestamps);
+void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
+		  int type, int key_length);
+void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
+		       int version,
+		       loff_t offset, int type, int length, int entry_count);
+struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key);
+
+struct reiserfs_security_handle;
+int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
+		       struct inode *dir, umode_t mode,
+		       const char *symname, loff_t i_size,
+		       struct dentry *dentry, struct inode *inode,
+		       struct reiserfs_security_handle *security);
+
+void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
+			     struct inode *inode, loff_t size);
+
+static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
+				      struct inode *inode)
+{
+	reiserfs_update_sd_size(th, inode, inode->i_size);
+}
+
+void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
+void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs);
+int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
+
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
+
+/* namei.c */
+void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
+int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
+			struct treepath *path, struct reiserfs_dir_entry *de);
+struct dentry *reiserfs_get_parent(struct dentry *);
+
+#ifdef CONFIG_REISERFS_PROC_INFO
+int reiserfs_proc_info_init(struct super_block *sb);
+int reiserfs_proc_info_done(struct super_block *sb);
+int reiserfs_proc_info_global_init(void);
+int reiserfs_proc_info_global_done(void);
+
+#define PROC_EXP( e )   e
+
+#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data
+#define PROC_INFO_MAX( sb, field, value )								\
+    __PINFO( sb ).field =												\
+        max( REISERFS_SB( sb ) -> s_proc_info_data.field, value )
+#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) )
+#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) )
+#define PROC_INFO_BH_STAT( sb, bh, level )							\
+    PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] );						\
+    PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) );	\
+    PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) )
+#else
+static inline int reiserfs_proc_info_init(struct super_block *sb)
+{
+	return 0;
+}
+
+static inline int reiserfs_proc_info_done(struct super_block *sb)
+{
+	return 0;
+}
+
+static inline int reiserfs_proc_info_global_init(void)
+{
+	return 0;
+}
+
+static inline int reiserfs_proc_info_global_done(void)
+{
+	return 0;
+}
+
+#define PROC_EXP( e )
+#define VOID_V ( ( void ) 0 )
+#define PROC_INFO_MAX( sb, field, value ) VOID_V
+#define PROC_INFO_INC( sb, field ) VOID_V
+#define PROC_INFO_ADD( sb, field, val ) VOID_V
+#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V
+#endif
+
+/* dir.c */
+extern const struct inode_operations reiserfs_dir_inode_operations;
+extern const struct inode_operations reiserfs_symlink_inode_operations;
+extern const struct inode_operations reiserfs_special_inode_operations;
+extern const struct file_operations reiserfs_dir_operations;
+int reiserfs_readdir_inode(struct inode *, struct dir_context *);
+
+/* tail_conversion.c */
+int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
+		    struct treepath *, struct buffer_head *, loff_t);
+int indirect2direct(struct reiserfs_transaction_handle *, struct inode *,
+		    struct page *, struct treepath *, const struct cpu_key *,
+		    loff_t, char *);
+void reiserfs_unmap_buffer(struct buffer_head *);
+
+/* file.c */
+extern const struct inode_operations reiserfs_file_inode_operations;
+extern const struct file_operations reiserfs_file_operations;
+extern const struct address_space_operations reiserfs_address_space_operations;
+
+/* fix_nodes.c */
+
+int fix_nodes(int n_op_mode, struct tree_balance *tb,
+	      struct item_head *ins_ih, const void *);
+void unfix_nodes(struct tree_balance *);
+
+/* prints.c */
+void __reiserfs_panic(struct super_block *s, const char *id,
+		      const char *function, const char *fmt, ...)
+    __attribute__ ((noreturn));
+#define reiserfs_panic(s, id, fmt, args...) \
+	__reiserfs_panic(s, id, __func__, fmt, ##args)
+void __reiserfs_error(struct super_block *s, const char *id,
+		      const char *function, const char *fmt, ...);
+#define reiserfs_error(s, id, fmt, args...) \
+	 __reiserfs_error(s, id, __func__, fmt, ##args)
+void reiserfs_info(struct super_block *s, const char *fmt, ...);
+void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...);
+void print_indirect_item(struct buffer_head *bh, int item_num);
+void store_print_tb(struct tree_balance *tb);
+void print_cur_tb(char *mes);
+void print_de(struct reiserfs_dir_entry *de);
+void print_bi(struct buffer_info *bi, char *mes);
+#define PRINT_LEAF_ITEMS 1	/* print all items */
+#define PRINT_DIRECTORY_ITEMS 2	/* print directory items */
+#define PRINT_DIRECT_ITEMS 4	/* print contents of direct items */
+void print_block(struct buffer_head *bh, ...);
+void print_bmap(struct super_block *s, int silent);
+void print_bmap_block(int i, char *data, int size, int silent);
+/*void print_super_block (struct super_block * s, char * mes);*/
+void print_objectid_map(struct super_block *s);
+void print_block_head(struct buffer_head *bh, char *mes);
+void check_leaf(struct buffer_head *bh);
+void check_internal(struct buffer_head *bh);
+void print_statistics(struct super_block *s);
+char *reiserfs_hashname(int code);
+
+/* lbalance.c */
+int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
+		    int mov_bytes, struct buffer_head *Snew);
+int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes);
+int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
+void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
+		       int del_num, int del_bytes);
+void leaf_insert_into_buf(struct buffer_info *bi, int before,
+			  struct item_head * const inserted_item_ih,
+			  const char * const inserted_item_body,
+			  int zeros_number);
+void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
+			  int pos_in_item, int paste_size,
+			  const char * const body, int zeros_number);
+void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
+			  int pos_in_item, int cut_size);
+void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
+			int new_entry_count, struct reiserfs_de_head *new_dehs,
+			const char *records, int paste_size);
+/* ibalance.c */
+int balance_internal(struct tree_balance *, int, int, struct item_head *,
+		     struct buffer_head **);
+
+/* do_balance.c */
+void do_balance_mark_leaf_dirty(struct tree_balance *tb,
+				struct buffer_head *bh, int flag);
+#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
+#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
+
+void do_balance(struct tree_balance *tb, struct item_head *ih,
+		const char *body, int flag);
+void reiserfs_invalidate_buffer(struct tree_balance *tb,
+				struct buffer_head *bh);
+
+int get_left_neighbor_position(struct tree_balance *tb, int h);
+int get_right_neighbor_position(struct tree_balance *tb, int h);
+void replace_key(struct tree_balance *tb, struct buffer_head *, int,
+		 struct buffer_head *, int);
+void make_empty_node(struct buffer_info *);
+struct buffer_head *get_FEB(struct tree_balance *);
+
+/* bitmap.c */
+
+/*
+ * structure contains hints for block allocator, and it is a container for
+ * arguments, such as node, search path, transaction_handle, etc.
+ */
+struct __reiserfs_blocknr_hint {
+	/* inode passed to allocator, if we allocate unf. nodes */
+	struct inode *inode;
+
+	sector_t block;		/* file offset, in blocks */
+	struct in_core_key key;
+
+	/*
+	 * search path, used by allocator to deternine search_start by
+	 * various ways
+	 */
+	struct treepath *path;
+
+	/*
+	 * transaction handle is needed to log super blocks
+	 * and bitmap blocks changes
+	 */
+	struct reiserfs_transaction_handle *th;
+
+	b_blocknr_t beg, end;
+
+	/*
+	 * a field used to transfer search start value (block number)
+	 * between different block allocator procedures
+	 * (determine_search_start() and others)
+	 */
+	b_blocknr_t search_start;
+
+	/*
+	 * is set in determine_prealloc_size() function,
+	 * used by underlayed function that do actual allocation
+	 */
+	int prealloc_size;
+
+	/*
+	 * the allocator uses different polices for getting disk
+	 * space for formatted/unformatted blocks with/without preallocation
+	 */
+	unsigned formatted_node:1;
+	unsigned preallocate:1;
+};
+
+typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
+
+int reiserfs_parse_alloc_options(struct super_block *, char *);
+void reiserfs_init_alloc_options(struct super_block *s);
+
+/*
+ * given a directory, this will tell you what packing locality
+ * to use for a new object underneat it.  The locality is returned
+ * in disk byte order (le).
+ */
+__le32 reiserfs_choose_packing(struct inode *dir);
+
+void show_alloc_options(struct seq_file *seq, struct super_block *s);
+int reiserfs_init_bitmap_cache(struct super_block *sb);
+void reiserfs_free_bitmap_cache(struct super_block *sb);
+void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
+struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap);
+int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
+void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *,
+			 b_blocknr_t, int for_unformatted);
+int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int,
+			       int);
+static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb,
+					     b_blocknr_t * new_blocknrs,
+					     int amount_needed)
+{
+	reiserfs_blocknr_hint_t hint = {
+		.th = tb->transaction_handle,
+		.path = tb->tb_path,
+		.inode = NULL,
+		.key = tb->key,
+		.block = 0,
+		.formatted_node = 1
+	};
+	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed,
+					  0);
+}
+
+static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle
+					    *th, struct inode *inode,
+					    b_blocknr_t * new_blocknrs,
+					    struct treepath *path,
+					    sector_t block)
+{
+	reiserfs_blocknr_hint_t hint = {
+		.th = th,
+		.path = path,
+		.inode = inode,
+		.block = block,
+		.formatted_node = 0,
+		.preallocate = 0
+	};
+	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
+}
+
+#ifdef REISERFS_PREALLOCATE
+static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle
+					     *th, struct inode *inode,
+					     b_blocknr_t * new_blocknrs,
+					     struct treepath *path,
+					     sector_t block)
+{
+	reiserfs_blocknr_hint_t hint = {
+		.th = th,
+		.path = path,
+		.inode = inode,
+		.block = block,
+		.formatted_node = 0,
+		.preallocate = 1
+	};
+	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
+}
+
+void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
+			       struct inode *inode);
+void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th);
+#endif
+
+/* hashes.c */
+__u32 keyed_hash(const signed char *msg, int len);
+__u32 yura_hash(const signed char *msg, int len);
+__u32 r5_hash(const signed char *msg, int len);
+
+#define reiserfs_set_le_bit		__set_bit_le
+#define reiserfs_test_and_set_le_bit	__test_and_set_bit_le
+#define reiserfs_clear_le_bit		__clear_bit_le
+#define reiserfs_test_and_clear_le_bit	__test_and_clear_bit_le
+#define reiserfs_test_le_bit		test_bit_le
+#define reiserfs_find_next_zero_le_bit	find_next_zero_bit_le
+
+/*
+ * sometimes reiserfs_truncate may require to allocate few new blocks
+ * to perform indirect2direct conversion. People probably used to
+ * think, that truncate should work without problems on a filesystem
+ * without free disk space. They may complain that they can not
+ * truncate due to lack of free disk space. This spare space allows us
+ * to not worry about it. 500 is probably too much, but it should be
+ * absolutely safe
+ */
+#define SPARE_SPACE 500
+
+/* prototypes from ioctl.c */
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long reiserfs_compat_ioctl(struct file *filp,
+		   unsigned int cmd, unsigned long arg);
+int reiserfs_unpack(struct inode *inode, struct file *filp);
diff --git a/kernel/fs/reiserfs/resize.c b/kernel/fs/reiserfs/resize.c
new file mode 100644
index 000000000..6052d323b
--- /dev/null
+++ b/kernel/fs/reiserfs/resize.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+/*
+ * Written by Alexander Zarochentcev.
+ *
+ * The kernel part of the (on-line) reiserfs resizer.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include "reiserfs.h"
+#include <linux/buffer_head.h>
+
+int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
+{
+	int err = 0;
+	struct reiserfs_super_block *sb;
+	struct reiserfs_bitmap_info *bitmap;
+	struct reiserfs_bitmap_info *info;
+	struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
+	struct buffer_head *bh;
+	struct reiserfs_transaction_handle th;
+	unsigned int bmap_nr_new, bmap_nr;
+	unsigned int block_r_new, block_r;
+
+	struct reiserfs_list_bitmap *jb;
+	struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
+
+	unsigned long int block_count, free_blocks;
+	int i;
+	int copy_size;
+	int depth;
+
+	sb = SB_DISK_SUPER_BLOCK(s);
+
+	if (SB_BLOCK_COUNT(s) >= block_count_new) {
+		printk("can\'t shrink filesystem on-line\n");
+		return -EINVAL;
+	}
+
+	/* check the device size */
+	depth = reiserfs_write_unlock_nested(s);
+	bh = sb_bread(s, block_count_new - 1);
+	reiserfs_write_lock_nested(s, depth);
+	if (!bh) {
+		printk("reiserfs_resize: can\'t read last block\n");
+		return -EINVAL;
+	}
+	bforget(bh);
+
+	/*
+	 * old disk layout detection; those partitions can be mounted, but
+	 * cannot be resized
+	 */
+	if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
+	    != REISERFS_DISK_OFFSET_IN_BYTES) {
+		printk
+		    ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
+		return -ENOTSUPP;
+	}
+
+	/* count used bits in last bitmap block */
+	block_r = SB_BLOCK_COUNT(s) -
+			(reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8;
+
+	/* count bitmap blocks in new fs */
+	bmap_nr_new = block_count_new / (s->s_blocksize * 8);
+	block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
+	if (block_r_new)
+		bmap_nr_new++;
+	else
+		block_r_new = s->s_blocksize * 8;
+
+	/* save old values */
+	block_count = SB_BLOCK_COUNT(s);
+	bmap_nr = reiserfs_bmap_count(s);
+
+	/* resizing of reiserfs bitmaps (journal and real), if needed */
+	if (bmap_nr_new > bmap_nr) {
+		/* reallocate journal bitmaps */
+		if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
+			printk
+			    ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
+			return -ENOMEM;
+		}
+		/*
+		 * the new journal bitmaps are zero filled, now we copy i
+		 * the bitmap node pointers from the old journal bitmap
+		 * structs, and then transfer the new data structures
+		 * into the journal struct.
+		 *
+		 * using the copy_size var below allows this code to work for
+		 * both shrinking and expanding the FS.
+		 */
+		copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr;
+		copy_size =
+		    copy_size * sizeof(struct reiserfs_list_bitmap_node *);
+		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
+			struct reiserfs_bitmap_node **node_tmp;
+			jb = SB_JOURNAL(s)->j_list_bitmap + i;
+			memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
+
+			/*
+			 * just in case vfree schedules on us, copy the new
+			 * pointer into the journal struct before freeing the
+			 * old one
+			 */
+			node_tmp = jb->bitmaps;
+			jb->bitmaps = jbitmap[i].bitmaps;
+			vfree(node_tmp);
+		}
+
+		/*
+		 * allocate additional bitmap blocks, reallocate
+		 * array of bitmap block pointers
+		 */
+		bitmap =
+		    vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
+		if (!bitmap) {
+			/*
+			 * Journal bitmaps are still supersized, but the
+			 * memory isn't leaked, so I guess it's ok
+			 */
+			printk("reiserfs_resize: unable to allocate memory.\n");
+			return -ENOMEM;
+		}
+		for (i = 0; i < bmap_nr; i++)
+			bitmap[i] = old_bitmap[i];
+
+		/*
+		 * This doesn't go through the journal, but it doesn't have to.
+		 * The changes are still atomic: We're synced up when the
+		 * journal transaction begins, and the new bitmaps don't
+		 * matter if the transaction fails.
+		 */
+		for (i = bmap_nr; i < bmap_nr_new; i++) {
+			int depth;
+			/*
+			 * don't use read_bitmap_block since it will cache
+			 * the uninitialized bitmap
+			 */
+			depth = reiserfs_write_unlock_nested(s);
+			bh = sb_bread(s, i * s->s_blocksize * 8);
+			reiserfs_write_lock_nested(s, depth);
+			if (!bh) {
+				vfree(bitmap);
+				return -EIO;
+			}
+			memset(bh->b_data, 0, sb_blocksize(sb));
+			reiserfs_set_le_bit(0, bh->b_data);
+			reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
+
+			set_buffer_uptodate(bh);
+			mark_buffer_dirty(bh);
+			depth = reiserfs_write_unlock_nested(s);
+			sync_dirty_buffer(bh);
+			reiserfs_write_lock_nested(s, depth);
+			/* update bitmap_info stuff */
+			bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
+			brelse(bh);
+		}
+		/* free old bitmap blocks array */
+		SB_AP_BITMAP(s) = bitmap;
+		vfree(old_bitmap);
+	}
+
+	/*
+	 * begin transaction, if there was an error, it's fine. Yes, we have
+	 * incorrect bitmaps now, but none of it is ever going to touch the
+	 * disk anyway.
+	 */
+	err = journal_begin(&th, s, 10);
+	if (err)
+		return err;
+
+	/* Extend old last bitmap block - new blocks have been made available */
+	info = SB_AP_BITMAP(s) + bmap_nr - 1;
+	bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
+	if (!bh) {
+		int jerr = journal_end(&th);
+		if (jerr)
+			return jerr;
+		return -EIO;
+	}
+
+	reiserfs_prepare_for_journal(s, bh, 1);
+	for (i = block_r; i < s->s_blocksize * 8; i++)
+		reiserfs_clear_le_bit(i, bh->b_data);
+	info->free_count += s->s_blocksize * 8 - block_r;
+
+	journal_mark_dirty(&th, bh);
+	brelse(bh);
+
+	/* Correct new last bitmap block - It may not be full */
+	info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
+	bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
+	if (!bh) {
+		int jerr = journal_end(&th);
+		if (jerr)
+			return jerr;
+		return -EIO;
+	}
+
+	reiserfs_prepare_for_journal(s, bh, 1);
+	for (i = block_r_new; i < s->s_blocksize * 8; i++)
+		reiserfs_set_le_bit(i, bh->b_data);
+	journal_mark_dirty(&th, bh);
+	brelse(bh);
+
+	info->free_count -= s->s_blocksize * 8 - block_r_new;
+	/* update super */
+	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+	free_blocks = SB_FREE_BLOCKS(s);
+	PUT_SB_FREE_BLOCKS(s,
+			   free_blocks + (block_count_new - block_count -
+					  (bmap_nr_new - bmap_nr)));
+	PUT_SB_BLOCK_COUNT(s, block_count_new);
+	PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
+
+	journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
+
+	SB_JOURNAL(s)->j_must_wait = 1;
+	return journal_end(&th);
+}
diff --git a/kernel/fs/reiserfs/stree.c b/kernel/fs/reiserfs/stree.c
new file mode 100644
index 000000000..24cbe0132
--- /dev/null
+++ b/kernel/fs/reiserfs/stree.c
@@ -0,0 +1,2262 @@
+/*
+ *  Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+/*
+ *  Written by Anatoly P. Pinchuk pap@namesys.botik.ru
+ *  Programm System Institute
+ *  Pereslavl-Zalessky Russia
+ */
+
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include "reiserfs.h"
+#include <linux/buffer_head.h>
+#include <linux/quotaops.h>
+
+/* Does the buffer contain a disk block which is in the tree. */
+inline int B_IS_IN_TREE(const struct buffer_head *bh)
+{
+
+	RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
+	       "PAP-1010: block (%b) has too big level (%z)", bh, bh);
+
+	return (B_LEVEL(bh) != FREE_LEVEL);
+}
+
+/* to get item head in le form */
+inline void copy_item_head(struct item_head *to,
+			   const struct item_head *from)
+{
+	memcpy(to, from, IH_SIZE);
+}
+
+/*
+ * k1 is pointer to on-disk structure which is stored in little-endian
+ * form. k2 is pointer to cpu variable. For key of items of the same
+ * object this returns 0.
+ * Returns: -1 if key1 < key2
+ * 0 if key1 == key2
+ * 1 if key1 > key2
+ */
+inline int comp_short_keys(const struct reiserfs_key *le_key,
+			   const struct cpu_key *cpu_key)
+{
+	__u32 n;
+	n = le32_to_cpu(le_key->k_dir_id);
+	if (n < cpu_key->on_disk_key.k_dir_id)
+		return -1;
+	if (n > cpu_key->on_disk_key.k_dir_id)
+		return 1;
+	n = le32_to_cpu(le_key->k_objectid);
+	if (n < cpu_key->on_disk_key.k_objectid)
+		return -1;
+	if (n > cpu_key->on_disk_key.k_objectid)
+		return 1;
+	return 0;
+}
+
+/*
+ * k1 is pointer to on-disk structure which is stored in little-endian
+ * form. k2 is pointer to cpu variable.
+ * Compare keys using all 4 key fields.
+ * Returns: -1 if key1 < key2 0
+ * if key1 = key2 1 if key1 > key2
+ */
+static inline int comp_keys(const struct reiserfs_key *le_key,
+			    const struct cpu_key *cpu_key)
+{
+	int retval;
+
+	retval = comp_short_keys(le_key, cpu_key);
+	if (retval)
+		return retval;
+	if (le_key_k_offset(le_key_version(le_key), le_key) <
+	    cpu_key_k_offset(cpu_key))
+		return -1;
+	if (le_key_k_offset(le_key_version(le_key), le_key) >
+	    cpu_key_k_offset(cpu_key))
+		return 1;
+
+	if (cpu_key->key_length == 3)
+		return 0;
+
+	/* this part is needed only when tail conversion is in progress */
+	if (le_key_k_type(le_key_version(le_key), le_key) <
+	    cpu_key_k_type(cpu_key))
+		return -1;
+
+	if (le_key_k_type(le_key_version(le_key), le_key) >
+	    cpu_key_k_type(cpu_key))
+		return 1;
+
+	return 0;
+}
+
+inline int comp_short_le_keys(const struct reiserfs_key *key1,
+			      const struct reiserfs_key *key2)
+{
+	__u32 *k1_u32, *k2_u32;
+	int key_length = REISERFS_SHORT_KEY_LEN;
+
+	k1_u32 = (__u32 *) key1;
+	k2_u32 = (__u32 *) key2;
+	for (; key_length--; ++k1_u32, ++k2_u32) {
+		if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
+			return -1;
+		if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
+			return 1;
+	}
+	return 0;
+}
+
+inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
+{
+	int version;
+	to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
+	to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
+
+	/* find out version of the key */
+	version = le_key_version(from);
+	to->version = version;
+	to->on_disk_key.k_offset = le_key_k_offset(version, from);
+	to->on_disk_key.k_type = le_key_k_type(version, from);
+}
+
+/*
+ * this does not say which one is bigger, it only returns 1 if keys
+ * are not equal, 0 otherwise
+ */
+inline int comp_le_keys(const struct reiserfs_key *k1,
+			const struct reiserfs_key *k2)
+{
+	return memcmp(k1, k2, sizeof(struct reiserfs_key));
+}
+
+/**************************************************************************
+ *  Binary search toolkit function                                        *
+ *  Search for an item in the array by the item key                       *
+ *  Returns:    1 if found,  0 if not found;                              *
+ *        *pos = number of the searched element if found, else the        *
+ *        number of the first element that is larger than key.            *
+ **************************************************************************/
+/*
+ * For those not familiar with binary search: lbound is the leftmost item
+ * that it could be, rbound the rightmost item that it could be.  We examine
+ * the item halfway between lbound and rbound, and that tells us either
+ * that we can increase lbound, or decrease rbound, or that we have found it,
+ * or if lbound <= rbound that there are no possible items, and we have not
+ * found it. With each examination we cut the number of possible items it
+ * could be by one more than half rounded down, or we find it.
+ */
+static inline int bin_search(const void *key,	/* Key to search for. */
+			     const void *base,	/* First item in the array. */
+			     int num,	/* Number of items in the array. */
+			     /*
+			      * Item size in the array.  searched. Lest the
+			      * reader be confused, note that this is crafted
+			      * as a general function, and when it is applied
+			      * specifically to the array of item headers in a
+			      * node, width is actually the item header size
+			      * not the item size.
+			      */
+			     int width,
+			     int *pos /* Number of the searched for element. */
+    )
+{
+	int rbound, lbound, j;
+
+	for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
+	     lbound <= rbound; j = (rbound + lbound) / 2)
+		switch (comp_keys
+			((struct reiserfs_key *)((char *)base + j * width),
+			 (struct cpu_key *)key)) {
+		case -1:
+			lbound = j + 1;
+			continue;
+		case 1:
+			rbound = j - 1;
+			continue;
+		case 0:
+			*pos = j;
+			return ITEM_FOUND;	/* Key found in the array.  */
+		}
+
+	/*
+	 * bin_search did not find given key, it returns position of key,
+	 * that is minimal and greater than the given one.
+	 */
+	*pos = lbound;
+	return ITEM_NOT_FOUND;
+}
+
+
+/* Minimal possible key. It is never in the tree. */
+const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
+
+/* Maximal possible key. It is never in the tree. */
+static const struct reiserfs_key MAX_KEY = {
+	cpu_to_le32(0xffffffff),
+	cpu_to_le32(0xffffffff),
+	{{cpu_to_le32(0xffffffff),
+	  cpu_to_le32(0xffffffff)},}
+};
+
+/*
+ * Get delimiting key of the buffer by looking for it in the buffers in the
+ * path, starting from the bottom of the path, and going upwards.  We must
+ * check the path's validity at each step.  If the key is not in the path,
+ * there is no delimiting key in the tree (buffer is first or last buffer
+ * in tree), and in this case we return a special key, either MIN_KEY or
+ * MAX_KEY.
+ */
+static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
+						  const struct super_block *sb)
+{
+	int position, path_offset = chk_path->path_length;
+	struct buffer_head *parent;
+
+	RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
+	       "PAP-5010: invalid offset in the path");
+
+	/* While not higher in path than first element. */
+	while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
+
+		RFALSE(!buffer_uptodate
+		       (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
+		       "PAP-5020: parent is not uptodate");
+
+		/* Parent at the path is not in the tree now. */
+		if (!B_IS_IN_TREE
+		    (parent =
+		     PATH_OFFSET_PBUFFER(chk_path, path_offset)))
+			return &MAX_KEY;
+		/* Check whether position in the parent is correct. */
+		if ((position =
+		     PATH_OFFSET_POSITION(chk_path,
+					  path_offset)) >
+		    B_NR_ITEMS(parent))
+			return &MAX_KEY;
+		/* Check whether parent at the path really points to the child. */
+		if (B_N_CHILD_NUM(parent, position) !=
+		    PATH_OFFSET_PBUFFER(chk_path,
+					path_offset + 1)->b_blocknr)
+			return &MAX_KEY;
+		/*
+		 * Return delimiting key if position in the parent
+		 * is not equal to zero.
+		 */
+		if (position)
+			return internal_key(parent, position - 1);
+	}
+	/* Return MIN_KEY if we are in the root of the buffer tree. */
+	if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
+	    b_blocknr == SB_ROOT_BLOCK(sb))
+		return &MIN_KEY;
+	return &MAX_KEY;
+}
+
+/* Get delimiting key of the buffer at the path and its right neighbor. */
+inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
+					   const struct super_block *sb)
+{
+	int position, path_offset = chk_path->path_length;
+	struct buffer_head *parent;
+
+	RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
+	       "PAP-5030: invalid offset in the path");
+
+	while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
+
+		RFALSE(!buffer_uptodate
+		       (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
+		       "PAP-5040: parent is not uptodate");
+
+		/* Parent at the path is not in the tree now. */
+		if (!B_IS_IN_TREE
+		    (parent =
+		     PATH_OFFSET_PBUFFER(chk_path, path_offset)))
+			return &MIN_KEY;
+		/* Check whether position in the parent is correct. */
+		if ((position =
+		     PATH_OFFSET_POSITION(chk_path,
+					  path_offset)) >
+		    B_NR_ITEMS(parent))
+			return &MIN_KEY;
+		/*
+		 * Check whether parent at the path really points
+		 * to the child.
+		 */
+		if (B_N_CHILD_NUM(parent, position) !=
+		    PATH_OFFSET_PBUFFER(chk_path,
+					path_offset + 1)->b_blocknr)
+			return &MIN_KEY;
+
+		/*
+		 * Return delimiting key if position in the parent
+		 * is not the last one.
+		 */
+		if (position != B_NR_ITEMS(parent))
+			return internal_key(parent, position);
+	}
+
+	/* Return MAX_KEY if we are in the root of the buffer tree. */
+	if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
+	    b_blocknr == SB_ROOT_BLOCK(sb))
+		return &MAX_KEY;
+	return &MIN_KEY;
+}
+
+/*
+ * Check whether a key is contained in the tree rooted from a buffer at a path.
+ * This works by looking at the left and right delimiting keys for the buffer
+ * in the last path_element in the path.  These delimiting keys are stored
+ * at least one level above that buffer in the tree. If the buffer is the
+ * first or last node in the tree order then one of the delimiting keys may
+ * be absent, and in this case get_lkey and get_rkey return a special key
+ * which is MIN_KEY or MAX_KEY.
+ */
+static inline int key_in_buffer(
+				/* Path which should be checked. */
+				struct treepath *chk_path,
+				/* Key which should be checked. */
+				const struct cpu_key *key,
+				struct super_block *sb
+    )
+{
+
+	RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
+	       || chk_path->path_length > MAX_HEIGHT,
+	       "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
+	       key, chk_path->path_length);
+	RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
+	       "PAP-5060: device must not be NODEV");
+
+	if (comp_keys(get_lkey(chk_path, sb), key) == 1)
+		/* left delimiting key is bigger, that the key we look for */
+		return 0;
+	/*  if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
+	if (comp_keys(get_rkey(chk_path, sb), key) != 1)
+		/* key must be less than right delimitiing key */
+		return 0;
+	return 1;
+}
+
+int reiserfs_check_path(struct treepath *p)
+{
+	RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
+	       "path not properly relsed");
+	return 0;
+}
+
+/*
+ * Drop the reference to each buffer in a path and restore
+ * dirty bits clean when preparing the buffer for the log.
+ * This version should only be called from fix_nodes()
+ */
+void pathrelse_and_restore(struct super_block *sb,
+			   struct treepath *search_path)
+{
+	int path_offset = search_path->path_length;
+
+	RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
+	       "clm-4000: invalid path offset");
+
+	while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
+		struct buffer_head *bh;
+		bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
+		reiserfs_restore_prepared_buffer(sb, bh);
+		brelse(bh);
+	}
+	search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
+}
+
+/* Drop the reference to each buffer in a path */
+void pathrelse(struct treepath *search_path)
+{
+	int path_offset = search_path->path_length;
+
+	RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
+	       "PAP-5090: invalid path offset");
+
+	while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
+		brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
+
+	search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
+}
+
+static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
+{
+	struct block_head *blkh;
+	struct item_head *ih;
+	int used_space;
+	int prev_location;
+	int i;
+	int nr;
+
+	blkh = (struct block_head *)buf;
+	if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
+		reiserfs_warning(NULL, "reiserfs-5080",
+				 "this should be caught earlier");
+		return 0;
+	}
+
+	nr = blkh_nr_item(blkh);
+	if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
+		/* item number is too big or too small */
+		reiserfs_warning(NULL, "reiserfs-5081",
+				 "nr_item seems wrong: %z", bh);
+		return 0;
+	}
+	ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
+	used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
+
+	/* free space does not match to calculated amount of use space */
+	if (used_space != blocksize - blkh_free_space(blkh)) {
+		reiserfs_warning(NULL, "reiserfs-5082",
+				 "free space seems wrong: %z", bh);
+		return 0;
+	}
+	/*
+	 * FIXME: it is_leaf will hit performance too much - we may have
+	 * return 1 here
+	 */
+
+	/* check tables of item heads */
+	ih = (struct item_head *)(buf + BLKH_SIZE);
+	prev_location = blocksize;
+	for (i = 0; i < nr; i++, ih++) {
+		if (le_ih_k_type(ih) == TYPE_ANY) {
+			reiserfs_warning(NULL, "reiserfs-5083",
+					 "wrong item type for item %h",
+					 ih);
+			return 0;
+		}
+		if (ih_location(ih) >= blocksize
+		    || ih_location(ih) < IH_SIZE * nr) {
+			reiserfs_warning(NULL, "reiserfs-5084",
+					 "item location seems wrong: %h",
+					 ih);
+			return 0;
+		}
+		if (ih_item_len(ih) < 1
+		    || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
+			reiserfs_warning(NULL, "reiserfs-5085",
+					 "item length seems wrong: %h",
+					 ih);
+			return 0;
+		}
+		if (prev_location - ih_location(ih) != ih_item_len(ih)) {
+			reiserfs_warning(NULL, "reiserfs-5086",
+					 "item location seems wrong "
+					 "(second one): %h", ih);
+			return 0;
+		}
+		prev_location = ih_location(ih);
+	}
+
+	/* one may imagine many more checks */
+	return 1;
+}
+
+/* returns 1 if buf looks like an internal node, 0 otherwise */
+static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
+{
+	struct block_head *blkh;
+	int nr;
+	int used_space;
+
+	blkh = (struct block_head *)buf;
+	nr = blkh_level(blkh);
+	if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
+		/* this level is not possible for internal nodes */
+		reiserfs_warning(NULL, "reiserfs-5087",
+				 "this should be caught earlier");
+		return 0;
+	}
+
+	nr = blkh_nr_item(blkh);
+	/* for internal which is not root we might check min number of keys */
+	if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
+		reiserfs_warning(NULL, "reiserfs-5088",
+				 "number of key seems wrong: %z", bh);
+		return 0;
+	}
+
+	used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
+	if (used_space != blocksize - blkh_free_space(blkh)) {
+		reiserfs_warning(NULL, "reiserfs-5089",
+				 "free space seems wrong: %z", bh);
+		return 0;
+	}
+
+	/* one may imagine many more checks */
+	return 1;
+}
+
+/*
+ * make sure that bh contains formatted node of reiserfs tree of
+ * 'level'-th level
+ */
+static int is_tree_node(struct buffer_head *bh, int level)
+{
+	if (B_LEVEL(bh) != level) {
+		reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
+				 "not match to the expected one %d",
+				 B_LEVEL(bh), level);
+		return 0;
+	}
+	if (level == DISK_LEAF_NODE_LEVEL)
+		return is_leaf(bh->b_data, bh->b_size, bh);
+
+	return is_internal(bh->b_data, bh->b_size, bh);
+}
+
+#define SEARCH_BY_KEY_READA 16
+
+/*
+ * The function is NOT SCHEDULE-SAFE!
+ * It might unlock the write lock if we needed to wait for a block
+ * to be read. Note that in this case it won't recover the lock to avoid
+ * high contention resulting from too much lock requests, especially
+ * the caller (search_by_key) will perform other schedule-unsafe
+ * operations just after calling this function.
+ *
+ * @return depth of lock to be restored after read completes
+ */
+static int search_by_key_reada(struct super_block *s,
+				struct buffer_head **bh,
+				b_blocknr_t *b, int num)
+{
+	int i, j;
+	int depth = -1;
+
+	for (i = 0; i < num; i++) {
+		bh[i] = sb_getblk(s, b[i]);
+	}
+	/*
+	 * We are going to read some blocks on which we
+	 * have a reference. It's safe, though we might be
+	 * reading blocks concurrently changed if we release
+	 * the lock. But it's still fine because we check later
+	 * if the tree changed
+	 */
+	for (j = 0; j < i; j++) {
+		/*
+		 * note, this needs attention if we are getting rid of the BKL
+		 * you have to make sure the prepared bit isn't set on this
+		 * buffer
+		 */
+		if (!buffer_uptodate(bh[j])) {
+			if (depth == -1)
+				depth = reiserfs_write_unlock_nested(s);
+			ll_rw_block(READA, 1, bh + j);
+		}
+		brelse(bh[j]);
+	}
+	return depth;
+}
+
+/*
+ * This function fills up the path from the root to the leaf as it
+ * descends the tree looking for the key.  It uses reiserfs_bread to
+ * try to find buffers in the cache given their block number.  If it
+ * does not find them in the cache it reads them from disk.  For each
+ * node search_by_key finds using reiserfs_bread it then uses
+ * bin_search to look through that node.  bin_search will find the
+ * position of the block_number of the next node if it is looking
+ * through an internal node.  If it is looking through a leaf node
+ * bin_search will find the position of the item which has key either
+ * equal to given key, or which is the maximal key less than the given
+ * key.  search_by_key returns a path that must be checked for the
+ * correctness of the top of the path but need not be checked for the
+ * correctness of the bottom of the path
+ */
+/*
+ * search_by_key - search for key (and item) in stree
+ * @sb: superblock
+ * @key: pointer to key to search for
+ * @search_path: Allocated and initialized struct treepath; Returned filled
+ *		 on success.
+ * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to
+ *		stop at leaf level.
+ *
+ * The function is NOT SCHEDULE-SAFE!
+ */
+int search_by_key(struct super_block *sb, const struct cpu_key *key,
+		  struct treepath *search_path, int stop_level)
+{
+	b_blocknr_t block_number;
+	int expected_level;
+	struct buffer_head *bh;
+	struct path_element *last_element;
+	int node_level, retval;
+	int right_neighbor_of_leaf_node;
+	int fs_gen;
+	struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
+	b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
+	int reada_count = 0;
+
+#ifdef CONFIG_REISERFS_CHECK
+	int repeat_counter = 0;
+#endif
+
+	PROC_INFO_INC(sb, search_by_key);
+
+	/*
+	 * As we add each node to a path we increase its count.  This means
+	 * that we must be careful to release all nodes in a path before we
+	 * either discard the path struct or re-use the path struct, as we
+	 * do here.
+	 */
+
+	pathrelse(search_path);
+
+	right_neighbor_of_leaf_node = 0;
+
+	/*
+	 * With each iteration of this loop we search through the items in the
+	 * current node, and calculate the next current node(next path element)
+	 * for the next iteration of this loop..
+	 */
+	block_number = SB_ROOT_BLOCK(sb);
+	expected_level = -1;
+	while (1) {
+
+#ifdef CONFIG_REISERFS_CHECK
+		if (!(++repeat_counter % 50000))
+			reiserfs_warning(sb, "PAP-5100",
+					 "%s: there were %d iterations of "
+					 "while loop looking for key %K",
+					 current->comm, repeat_counter,
+					 key);
+#endif
+
+		/* prep path to have another element added to it. */
+		last_element =
+		    PATH_OFFSET_PELEMENT(search_path,
+					 ++search_path->path_length);
+		fs_gen = get_generation(sb);
+
+		/*
+		 * Read the next tree node, and set the last element
+		 * in the path to have a pointer to it.
+		 */
+		if ((bh = last_element->pe_buffer =
+		     sb_getblk(sb, block_number))) {
+
+			/*
+			 * We'll need to drop the lock if we encounter any
+			 * buffers that need to be read. If all of them are
+			 * already up to date, we don't need to drop the lock.
+			 */
+			int depth = -1;
+
+			if (!buffer_uptodate(bh) && reada_count > 1)
+				depth = search_by_key_reada(sb, reada_bh,
+						    reada_blocks, reada_count);
+
+			if (!buffer_uptodate(bh) && depth == -1)
+				depth = reiserfs_write_unlock_nested(sb);
+
+			ll_rw_block(READ, 1, &bh);
+			wait_on_buffer(bh);
+
+			if (depth != -1)
+				reiserfs_write_lock_nested(sb, depth);
+			if (!buffer_uptodate(bh))
+				goto io_error;
+		} else {
+io_error:
+			search_path->path_length--;
+			pathrelse(search_path);
+			return IO_ERROR;
+		}
+		reada_count = 0;
+		if (expected_level == -1)
+			expected_level = SB_TREE_HEIGHT(sb);
+		expected_level--;
+
+		/*
+		 * It is possible that schedule occurred. We must check
+		 * whether the key to search is still in the tree rooted
+		 * from the current buffer. If not then repeat search
+		 * from the root.
+		 */
+		if (fs_changed(fs_gen, sb) &&
+		    (!B_IS_IN_TREE(bh) ||
+		     B_LEVEL(bh) != expected_level ||
+		     !key_in_buffer(search_path, key, sb))) {
+			PROC_INFO_INC(sb, search_by_key_fs_changed);
+			PROC_INFO_INC(sb, search_by_key_restarted);
+			PROC_INFO_INC(sb,
+				      sbk_restarted[expected_level - 1]);
+			pathrelse(search_path);
+
+			/*
+			 * Get the root block number so that we can
+			 * repeat the search starting from the root.
+			 */
+			block_number = SB_ROOT_BLOCK(sb);
+			expected_level = -1;
+			right_neighbor_of_leaf_node = 0;
+
+			/* repeat search from the root */
+			continue;
+		}
+
+		/*
+		 * only check that the key is in the buffer if key is not
+		 * equal to the MAX_KEY. Latter case is only possible in
+		 * "finish_unfinished()" processing during mount.
+		 */
+		RFALSE(comp_keys(&MAX_KEY, key) &&
+		       !key_in_buffer(search_path, key, sb),
+		       "PAP-5130: key is not in the buffer");
+#ifdef CONFIG_REISERFS_CHECK
+		if (REISERFS_SB(sb)->cur_tb) {
+			print_cur_tb("5140");
+			reiserfs_panic(sb, "PAP-5140",
+				       "schedule occurred in do_balance!");
+		}
+#endif
+
+		/*
+		 * make sure, that the node contents look like a node of
+		 * certain level
+		 */
+		if (!is_tree_node(bh, expected_level)) {
+			reiserfs_error(sb, "vs-5150",
+				       "invalid format found in block %ld. "
+				       "Fsck?", bh->b_blocknr);
+			pathrelse(search_path);
+			return IO_ERROR;
+		}
+
+		/* ok, we have acquired next formatted node in the tree */
+		node_level = B_LEVEL(bh);
+
+		PROC_INFO_BH_STAT(sb, bh, node_level - 1);
+
+		RFALSE(node_level < stop_level,
+		       "vs-5152: tree level (%d) is less than stop level (%d)",
+		       node_level, stop_level);
+
+		retval = bin_search(key, item_head(bh, 0),
+				      B_NR_ITEMS(bh),
+				      (node_level ==
+				       DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
+				      KEY_SIZE,
+				      &last_element->pe_position);
+		if (node_level == stop_level) {
+			return retval;
+		}
+
+		/* we are not in the stop level */
+		/*
+		 * item has been found, so we choose the pointer which
+		 * is to the right of the found one
+		 */
+		if (retval == ITEM_FOUND)
+			last_element->pe_position++;
+
+		/*
+		 * if item was not found we choose the position which is to
+		 * the left of the found item. This requires no code,
+		 * bin_search did it already.
+		 */
+
+		/*
+		 * So we have chosen a position in the current node which is
+		 * an internal node.  Now we calculate child block number by
+		 * position in the node.
+		 */
+		block_number =
+		    B_N_CHILD_NUM(bh, last_element->pe_position);
+
+		/*
+		 * if we are going to read leaf nodes, try for read
+		 * ahead as well
+		 */
+		if ((search_path->reada & PATH_READA) &&
+		    node_level == DISK_LEAF_NODE_LEVEL + 1) {
+			int pos = last_element->pe_position;
+			int limit = B_NR_ITEMS(bh);
+			struct reiserfs_key *le_key;
+
+			if (search_path->reada & PATH_READA_BACK)
+				limit = 0;
+			while (reada_count < SEARCH_BY_KEY_READA) {
+				if (pos == limit)
+					break;
+				reada_blocks[reada_count++] =
+				    B_N_CHILD_NUM(bh, pos);
+				if (search_path->reada & PATH_READA_BACK)
+					pos--;
+				else
+					pos++;
+
+				/*
+				 * check to make sure we're in the same object
+				 */
+				le_key = internal_key(bh, pos);
+				if (le32_to_cpu(le_key->k_objectid) !=
+				    key->on_disk_key.k_objectid) {
+					break;
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Form the path to an item and position in this item which contains
+ * file byte defined by key. If there is no such item
+ * corresponding to the key, we point the path to the item with
+ * maximal key less than key, and *pos_in_item is set to one
+ * past the last entry/byte in the item.  If searching for entry in a
+ * directory item, and it is not found, *pos_in_item is set to one
+ * entry more than the entry with maximal key which is less than the
+ * sought key.
+ *
+ * Note that if there is no entry in this same node which is one more,
+ * then we point to an imaginary entry.  for direct items, the
+ * position is in units of bytes, for indirect items the position is
+ * in units of blocknr entries, for directory items the position is in
+ * units of directory entries.
+ */
+/* The function is NOT SCHEDULE-SAFE! */
+int search_for_position_by_key(struct super_block *sb,
+			       /* Key to search (cpu variable) */
+			       const struct cpu_key *p_cpu_key,
+			       /* Filled up by this function. */
+			       struct treepath *search_path)
+{
+	struct item_head *p_le_ih;	/* pointer to on-disk structure */
+	int blk_size;
+	loff_t item_offset, offset;
+	struct reiserfs_dir_entry de;
+	int retval;
+
+	/* If searching for directory entry. */
+	if (is_direntry_cpu_key(p_cpu_key))
+		return search_by_entry_key(sb, p_cpu_key, search_path,
+					   &de);
+
+	/* If not searching for directory entry. */
+
+	/* If item is found. */
+	retval = search_item(sb, p_cpu_key, search_path);
+	if (retval == IO_ERROR)
+		return retval;
+	if (retval == ITEM_FOUND) {
+
+		RFALSE(!ih_item_len
+		       (item_head
+			(PATH_PLAST_BUFFER(search_path),
+			 PATH_LAST_POSITION(search_path))),
+		       "PAP-5165: item length equals zero");
+
+		pos_in_item(search_path) = 0;
+		return POSITION_FOUND;
+	}
+
+	RFALSE(!PATH_LAST_POSITION(search_path),
+	       "PAP-5170: position equals zero");
+
+	/* Item is not found. Set path to the previous item. */
+	p_le_ih =
+	    item_head(PATH_PLAST_BUFFER(search_path),
+			   --PATH_LAST_POSITION(search_path));
+	blk_size = sb->s_blocksize;
+
+	if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
+		return FILE_NOT_FOUND;
+
+	/* FIXME: quite ugly this far */
+
+	item_offset = le_ih_k_offset(p_le_ih);
+	offset = cpu_key_k_offset(p_cpu_key);
+
+	/* Needed byte is contained in the item pointed to by the path. */
+	if (item_offset <= offset &&
+	    item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
+		pos_in_item(search_path) = offset - item_offset;
+		if (is_indirect_le_ih(p_le_ih)) {
+			pos_in_item(search_path) /= blk_size;
+		}
+		return POSITION_FOUND;
+	}
+
+	/*
+	 * Needed byte is not contained in the item pointed to by the
+	 * path. Set pos_in_item out of the item.
+	 */
+	if (is_indirect_le_ih(p_le_ih))
+		pos_in_item(search_path) =
+		    ih_item_len(p_le_ih) / UNFM_P_SIZE;
+	else
+		pos_in_item(search_path) = ih_item_len(p_le_ih);
+
+	return POSITION_NOT_FOUND;
+}
+
+/* Compare given item and item pointed to by the path. */
+int comp_items(const struct item_head *stored_ih, const struct treepath *path)
+{
+	struct buffer_head *bh = PATH_PLAST_BUFFER(path);
+	struct item_head *ih;
+
+	/* Last buffer at the path is not in the tree. */
+	if (!B_IS_IN_TREE(bh))
+		return 1;
+
+	/* Last path position is invalid. */
+	if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
+		return 1;
+
+	/* we need only to know, whether it is the same item */
+	ih = tp_item_head(path);
+	return memcmp(stored_ih, ih, IH_SIZE);
+}
+
+/* unformatted nodes are not logged anymore, ever.  This is safe now */
+#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1)
+
+/* block can not be forgotten as it is in I/O or held by someone */
+#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
+
+/* prepare for delete or cut of direct item */
+static inline int prepare_for_direct_item(struct treepath *path,
+					  struct item_head *le_ih,
+					  struct inode *inode,
+					  loff_t new_file_length, int *cut_size)
+{
+	loff_t round_len;
+
+	if (new_file_length == max_reiserfs_offset(inode)) {
+		/* item has to be deleted */
+		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+		return M_DELETE;
+	}
+	/* new file gets truncated */
+	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
+		round_len = ROUND_UP(new_file_length);
+		/* this was new_file_length < le_ih ... */
+		if (round_len < le_ih_k_offset(le_ih)) {
+			*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+			return M_DELETE;	/* Delete this item. */
+		}
+		/* Calculate first position and size for cutting from item. */
+		pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
+		*cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
+
+		return M_CUT;	/* Cut from this item. */
+	}
+
+	/* old file: items may have any length */
+
+	if (new_file_length < le_ih_k_offset(le_ih)) {
+		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+		return M_DELETE;	/* Delete this item. */
+	}
+
+	/* Calculate first position and size for cutting from item. */
+	*cut_size = -(ih_item_len(le_ih) -
+		      (pos_in_item(path) =
+		       new_file_length + 1 - le_ih_k_offset(le_ih)));
+	return M_CUT;		/* Cut from this item. */
+}
+
+static inline int prepare_for_direntry_item(struct treepath *path,
+					    struct item_head *le_ih,
+					    struct inode *inode,
+					    loff_t new_file_length,
+					    int *cut_size)
+{
+	if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
+	    new_file_length == max_reiserfs_offset(inode)) {
+		RFALSE(ih_entry_count(le_ih) != 2,
+		       "PAP-5220: incorrect empty directory item (%h)", le_ih);
+		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+		/* Delete the directory item containing "." and ".." entry. */
+		return M_DELETE;
+	}
+
+	if (ih_entry_count(le_ih) == 1) {
+		/*
+		 * Delete the directory item such as there is one record only
+		 * in this item
+		 */
+		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+		return M_DELETE;
+	}
+
+	/* Cut one record from the directory item. */
+	*cut_size =
+	    -(DEH_SIZE +
+	      entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
+	return M_CUT;
+}
+
+#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
+
+/*
+ * If the path points to a directory or direct item, calculate mode
+ * and the size cut, for balance.
+ * If the path points to an indirect item, remove some number of its
+ * unformatted nodes.
+ * In case of file truncate calculate whether this item must be
+ * deleted/truncated or last unformatted node of this item will be
+ * converted to a direct item.
+ * This function returns a determination of what balance mode the
+ * calling function should employ.
+ */
+static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
+				      struct inode *inode,
+				      struct treepath *path,
+				      const struct cpu_key *item_key,
+				      /*
+				       * Number of unformatted nodes
+				       * which were removed from end
+				       * of the file.
+				       */
+				      int *removed,
+				      int *cut_size,
+				      /* MAX_KEY_OFFSET in case of delete. */
+				      unsigned long long new_file_length
+    )
+{
+	struct super_block *sb = inode->i_sb;
+	struct item_head *p_le_ih = tp_item_head(path);
+	struct buffer_head *bh = PATH_PLAST_BUFFER(path);
+
+	BUG_ON(!th->t_trans_id);
+
+	/* Stat_data item. */
+	if (is_statdata_le_ih(p_le_ih)) {
+
+		RFALSE(new_file_length != max_reiserfs_offset(inode),
+		       "PAP-5210: mode must be M_DELETE");
+
+		*cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
+		return M_DELETE;
+	}
+
+	/* Directory item. */
+	if (is_direntry_le_ih(p_le_ih))
+		return prepare_for_direntry_item(path, p_le_ih, inode,
+						 new_file_length,
+						 cut_size);
+
+	/* Direct item. */
+	if (is_direct_le_ih(p_le_ih))
+		return prepare_for_direct_item(path, p_le_ih, inode,
+					       new_file_length, cut_size);
+
+	/* Case of an indirect item. */
+	{
+	    int blk_size = sb->s_blocksize;
+	    struct item_head s_ih;
+	    int need_re_search;
+	    int delete = 0;
+	    int result = M_CUT;
+	    int pos = 0;
+
+	    if ( new_file_length == max_reiserfs_offset (inode) ) {
+		/*
+		 * prepare_for_delete_or_cut() is called by
+		 * reiserfs_delete_item()
+		 */
+		new_file_length = 0;
+		delete = 1;
+	    }
+
+	    do {
+		need_re_search = 0;
+		*cut_size = 0;
+		bh = PATH_PLAST_BUFFER(path);
+		copy_item_head(&s_ih, tp_item_head(path));
+		pos = I_UNFM_NUM(&s_ih);
+
+		while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
+		    __le32 *unfm;
+		    __u32 block;
+
+		    /*
+		     * Each unformatted block deletion may involve
+		     * one additional bitmap block into the transaction,
+		     * thereby the initial journal space reservation
+		     * might not be enough.
+		     */
+		    if (!delete && (*cut_size) != 0 &&
+			reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
+			break;
+
+		    unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
+		    block = get_block_num(unfm, 0);
+
+		    if (block != 0) {
+			reiserfs_prepare_for_journal(sb, bh, 1);
+			put_block_num(unfm, 0, 0);
+			journal_mark_dirty(th, bh);
+			reiserfs_free_block(th, inode, block, 1);
+		    }
+
+		    reiserfs_cond_resched(sb);
+
+		    if (item_moved (&s_ih, path))  {
+			need_re_search = 1;
+			break;
+		    }
+
+		    pos --;
+		    (*removed)++;
+		    (*cut_size) -= UNFM_P_SIZE;
+
+		    if (pos == 0) {
+			(*cut_size) -= IH_SIZE;
+			result = M_DELETE;
+			break;
+		    }
+		}
+		/*
+		 * a trick.  If the buffer has been logged, this will
+		 * do nothing.  If we've broken the loop without logging
+		 * it, it will restore the buffer
+		 */
+		reiserfs_restore_prepared_buffer(sb, bh);
+	    } while (need_re_search &&
+		     search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
+	    pos_in_item(path) = pos * UNFM_P_SIZE;
+
+	    if (*cut_size == 0) {
+		/*
+		 * Nothing was cut. maybe convert last unformatted node to the
+		 * direct item?
+		 */
+		result = M_CONVERT;
+	    }
+	    return result;
+	}
+}
+
+/* Calculate number of bytes which will be deleted or cut during balance */
+static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
+{
+	int del_size;
+	struct item_head *p_le_ih = tp_item_head(tb->tb_path);
+
+	if (is_statdata_le_ih(p_le_ih))
+		return 0;
+
+	del_size =
+	    (mode ==
+	     M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
+	if (is_direntry_le_ih(p_le_ih)) {
+		/*
+		 * return EMPTY_DIR_SIZE; We delete emty directories only.
+		 * we can't use EMPTY_DIR_SIZE, as old format dirs have a
+		 * different empty size.  ick. FIXME, is this right?
+		 */
+		return del_size;
+	}
+
+	if (is_indirect_le_ih(p_le_ih))
+		del_size = (del_size / UNFM_P_SIZE) *
+				(PATH_PLAST_BUFFER(tb->tb_path)->b_size);
+	return del_size;
+}
+
+static void init_tb_struct(struct reiserfs_transaction_handle *th,
+			   struct tree_balance *tb,
+			   struct super_block *sb,
+			   struct treepath *path, int size)
+{
+
+	BUG_ON(!th->t_trans_id);
+
+	memset(tb, '\0', sizeof(struct tree_balance));
+	tb->transaction_handle = th;
+	tb->tb_sb = sb;
+	tb->tb_path = path;
+	PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
+	PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
+	tb->insert_size[0] = size;
+}
+
+void padd_item(char *item, int total_length, int length)
+{
+	int i;
+
+	for (i = total_length; i > length;)
+		item[--i] = 0;
+}
+
+#ifdef REISERQUOTA_DEBUG
+char key2type(struct reiserfs_key *ih)
+{
+	if (is_direntry_le_key(2, ih))
+		return 'd';
+	if (is_direct_le_key(2, ih))
+		return 'D';
+	if (is_indirect_le_key(2, ih))
+		return 'i';
+	if (is_statdata_le_key(2, ih))
+		return 's';
+	return 'u';
+}
+
+char head2type(struct item_head *ih)
+{
+	if (is_direntry_le_ih(ih))
+		return 'd';
+	if (is_direct_le_ih(ih))
+		return 'D';
+	if (is_indirect_le_ih(ih))
+		return 'i';
+	if (is_statdata_le_ih(ih))
+		return 's';
+	return 'u';
+}
+#endif
+
+/*
+ * Delete object item.
+ * th       - active transaction handle
+ * path     - path to the deleted item
+ * item_key - key to search for the deleted item
+ * indode   - used for updating i_blocks and quotas
+ * un_bh    - NULL or unformatted node pointer
+ */
+int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
+			 struct treepath *path, const struct cpu_key *item_key,
+			 struct inode *inode, struct buffer_head *un_bh)
+{
+	struct super_block *sb = inode->i_sb;
+	struct tree_balance s_del_balance;
+	struct item_head s_ih;
+	struct item_head *q_ih;
+	int quota_cut_bytes;
+	int ret_value, del_size, removed;
+	int depth;
+
+#ifdef CONFIG_REISERFS_CHECK
+	char mode;
+	int iter = 0;
+#endif
+
+	BUG_ON(!th->t_trans_id);
+
+	init_tb_struct(th, &s_del_balance, sb, path,
+		       0 /*size is unknown */ );
+
+	while (1) {
+		removed = 0;
+
+#ifdef CONFIG_REISERFS_CHECK
+		iter++;
+		mode =
+#endif
+		    prepare_for_delete_or_cut(th, inode, path,
+					      item_key, &removed,
+					      &del_size,
+					      max_reiserfs_offset(inode));
+
+		RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
+
+		copy_item_head(&s_ih, tp_item_head(path));
+		s_del_balance.insert_size[0] = del_size;
+
+		ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
+		if (ret_value != REPEAT_SEARCH)
+			break;
+
+		PROC_INFO_INC(sb, delete_item_restarted);
+
+		/* file system changed, repeat search */
+		ret_value =
+		    search_for_position_by_key(sb, item_key, path);
+		if (ret_value == IO_ERROR)
+			break;
+		if (ret_value == FILE_NOT_FOUND) {
+			reiserfs_warning(sb, "vs-5340",
+					 "no items of the file %K found",
+					 item_key);
+			break;
+		}
+	}			/* while (1) */
+
+	if (ret_value != CARRY_ON) {
+		unfix_nodes(&s_del_balance);
+		return 0;
+	}
+
+	/* reiserfs_delete_item returns item length when success */
+	ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
+	q_ih = tp_item_head(path);
+	quota_cut_bytes = ih_item_len(q_ih);
+
+	/*
+	 * hack so the quota code doesn't have to guess if the file has a
+	 * tail.  On tail insert, we allocate quota for 1 unformatted node.
+	 * We test the offset because the tail might have been
+	 * split into multiple items, and we only want to decrement for
+	 * the unfm node once
+	 */
+	if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
+		if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
+			quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
+		} else {
+			quota_cut_bytes = 0;
+		}
+	}
+
+	if (un_bh) {
+		int off;
+		char *data;
+
+		/*
+		 * We are in direct2indirect conversion, so move tail contents
+		 * to the unformatted node
+		 */
+		/*
+		 * note, we do the copy before preparing the buffer because we
+		 * don't care about the contents of the unformatted node yet.
+		 * the only thing we really care about is the direct item's
+		 * data is in the unformatted node.
+		 *
+		 * Otherwise, we would have to call
+		 * reiserfs_prepare_for_journal on the unformatted node,
+		 * which might schedule, meaning we'd have to loop all the
+		 * way back up to the start of the while loop.
+		 *
+		 * The unformatted node must be dirtied later on.  We can't be
+		 * sure here if the entire tail has been deleted yet.
+		 *
+		 * un_bh is from the page cache (all unformatted nodes are
+		 * from the page cache) and might be a highmem page.  So, we
+		 * can't use un_bh->b_data.
+		 * -clm
+		 */
+
+		data = kmap_atomic(un_bh->b_page);
+		off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
+		memcpy(data + off,
+		       ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
+		       ret_value);
+		kunmap_atomic(data);
+	}
+
+	/* Perform balancing after all resources have been collected at once. */
+	do_balance(&s_del_balance, NULL, NULL, M_DELETE);
+
+#ifdef REISERQUOTA_DEBUG
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+		       "reiserquota delete_item(): freeing %u, id=%u type=%c",
+		       quota_cut_bytes, inode->i_uid, head2type(&s_ih));
+#endif
+	depth = reiserfs_write_unlock_nested(inode->i_sb);
+	dquot_free_space_nodirty(inode, quota_cut_bytes);
+	reiserfs_write_lock_nested(inode->i_sb, depth);
+
+	/* Return deleted body length */
+	return ret_value;
+}
+
+/*
+ * Summary Of Mechanisms For Handling Collisions Between Processes:
+ *
+ *  deletion of the body of the object is performed by iput(), with the
+ *  result that if multiple processes are operating on a file, the
+ *  deletion of the body of the file is deferred until the last process
+ *  that has an open inode performs its iput().
+ *
+ *  writes and truncates are protected from collisions by use of
+ *  semaphores.
+ *
+ *  creates, linking, and mknod are protected from collisions with other
+ *  processes by making the reiserfs_add_entry() the last step in the
+ *  creation, and then rolling back all changes if there was a collision.
+ *  - Hans
+*/
+
+/* this deletes item which never gets split */
+void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
+				struct inode *inode, struct reiserfs_key *key)
+{
+	struct super_block *sb = th->t_super;
+	struct tree_balance tb;
+	INITIALIZE_PATH(path);
+	int item_len = 0;
+	int tb_init = 0;
+	struct cpu_key cpu_key;
+	int retval;
+	int quota_cut_bytes = 0;
+
+	BUG_ON(!th->t_trans_id);
+
+	le_key2cpu_key(&cpu_key, key);
+
+	while (1) {
+		retval = search_item(th->t_super, &cpu_key, &path);
+		if (retval == IO_ERROR) {
+			reiserfs_error(th->t_super, "vs-5350",
+				       "i/o failure occurred trying "
+				       "to delete %K", &cpu_key);
+			break;
+		}
+		if (retval != ITEM_FOUND) {
+			pathrelse(&path);
+			/*
+			 * No need for a warning, if there is just no free
+			 * space to insert '..' item into the
+			 * newly-created subdir
+			 */
+			if (!
+			    ((unsigned long long)
+			     GET_HASH_VALUE(le_key_k_offset
+					    (le_key_version(key), key)) == 0
+			     && (unsigned long long)
+			     GET_GENERATION_NUMBER(le_key_k_offset
+						   (le_key_version(key),
+						    key)) == 1))
+				reiserfs_warning(th->t_super, "vs-5355",
+						 "%k not found", key);
+			break;
+		}
+		if (!tb_init) {
+			tb_init = 1;
+			item_len = ih_item_len(tp_item_head(&path));
+			init_tb_struct(th, &tb, th->t_super, &path,
+				       -(IH_SIZE + item_len));
+		}
+		quota_cut_bytes = ih_item_len(tp_item_head(&path));
+
+		retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
+		if (retval == REPEAT_SEARCH) {
+			PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
+			continue;
+		}
+
+		if (retval == CARRY_ON) {
+			do_balance(&tb, NULL, NULL, M_DELETE);
+			/*
+			 * Should we count quota for item? (we don't
+			 * count quotas for save-links)
+			 */
+			if (inode) {
+				int depth;
+#ifdef REISERQUOTA_DEBUG
+				reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
+					       "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
+					       quota_cut_bytes, inode->i_uid,
+					       key2type(key));
+#endif
+				depth = reiserfs_write_unlock_nested(sb);
+				dquot_free_space_nodirty(inode,
+							 quota_cut_bytes);
+				reiserfs_write_lock_nested(sb, depth);
+			}
+			break;
+		}
+
+		/* IO_ERROR, NO_DISK_SPACE, etc */
+		reiserfs_warning(th->t_super, "vs-5360",
+				 "could not delete %K due to fix_nodes failure",
+				 &cpu_key);
+		unfix_nodes(&tb);
+		break;
+	}
+
+	reiserfs_check_path(&path);
+}
+
+int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
+			   struct inode *inode)
+{
+	int err;
+	inode->i_size = 0;
+	BUG_ON(!th->t_trans_id);
+
+	/* for directory this deletes item containing "." and ".." */
+	err =
+	    reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
+	if (err)
+		return err;
+
+#if defined( USE_INODE_GENERATION_COUNTER )
+	if (!old_format_only(th->t_super)) {
+		__le32 *inode_generation;
+
+		inode_generation =
+		    &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
+		le32_add_cpu(inode_generation, 1);
+	}
+/* USE_INODE_GENERATION_COUNTER */
+#endif
+	reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
+
+	return err;
+}
+
+static void unmap_buffers(struct page *page, loff_t pos)
+{
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	struct buffer_head *next;
+	unsigned long tail_index;
+	unsigned long cur_index;
+
+	if (page) {
+		if (page_has_buffers(page)) {
+			tail_index = pos & (PAGE_CACHE_SIZE - 1);
+			cur_index = 0;
+			head = page_buffers(page);
+			bh = head;
+			do {
+				next = bh->b_this_page;
+
+				/*
+				 * we want to unmap the buffers that contain
+				 * the tail, and all the buffers after it
+				 * (since the tail must be at the end of the
+				 * file).  We don't want to unmap file data
+				 * before the tail, since it might be dirty
+				 * and waiting to reach disk
+				 */
+				cur_index += bh->b_size;
+				if (cur_index > tail_index) {
+					reiserfs_unmap_buffer(bh);
+				}
+				bh = next;
+			} while (bh != head);
+		}
+	}
+}
+
+static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
+				    struct inode *inode,
+				    struct page *page,
+				    struct treepath *path,
+				    const struct cpu_key *item_key,
+				    loff_t new_file_size, char *mode)
+{
+	struct super_block *sb = inode->i_sb;
+	int block_size = sb->s_blocksize;
+	int cut_bytes;
+	BUG_ON(!th->t_trans_id);
+	BUG_ON(new_file_size != inode->i_size);
+
+	/*
+	 * the page being sent in could be NULL if there was an i/o error
+	 * reading in the last block.  The user will hit problems trying to
+	 * read the file, but for now we just skip the indirect2direct
+	 */
+	if (atomic_read(&inode->i_count) > 1 ||
+	    !tail_has_to_be_packed(inode) ||
+	    !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
+		/* leave tail in an unformatted node */
+		*mode = M_SKIP_BALANCING;
+		cut_bytes =
+		    block_size - (new_file_size & (block_size - 1));
+		pathrelse(path);
+		return cut_bytes;
+	}
+
+	/* Perform the conversion to a direct_item. */
+	return indirect2direct(th, inode, page, path, item_key,
+			       new_file_size, mode);
+}
+
+/*
+ * we did indirect_to_direct conversion. And we have inserted direct
+ * item successesfully, but there were no disk space to cut unfm
+ * pointer being converted. Therefore we have to delete inserted
+ * direct item(s)
+ */
+static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
+					 struct inode *inode, struct treepath *path)
+{
+	struct cpu_key tail_key;
+	int tail_len;
+	int removed;
+	BUG_ON(!th->t_trans_id);
+
+	make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
+	tail_key.key_length = 4;
+
+	tail_len =
+	    (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
+	while (tail_len) {
+		/* look for the last byte of the tail */
+		if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
+		    POSITION_NOT_FOUND)
+			reiserfs_panic(inode->i_sb, "vs-5615",
+				       "found invalid item");
+		RFALSE(path->pos_in_item !=
+		       ih_item_len(tp_item_head(path)) - 1,
+		       "vs-5616: appended bytes found");
+		PATH_LAST_POSITION(path)--;
+
+		removed =
+		    reiserfs_delete_item(th, path, &tail_key, inode,
+					 NULL /*unbh not needed */ );
+		RFALSE(removed <= 0
+		       || removed > tail_len,
+		       "vs-5617: there was tail %d bytes, removed item length %d bytes",
+		       tail_len, removed);
+		tail_len -= removed;
+		set_cpu_key_k_offset(&tail_key,
+				     cpu_key_k_offset(&tail_key) - removed);
+	}
+	reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
+			 "conversion has been rolled back due to "
+			 "lack of disk space");
+	mark_inode_dirty(inode);
+}
+
+/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
+int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
+			   struct treepath *path,
+			   struct cpu_key *item_key,
+			   struct inode *inode,
+			   struct page *page, loff_t new_file_size)
+{
+	struct super_block *sb = inode->i_sb;
+	/*
+	 * Every function which is going to call do_balance must first
+	 * create a tree_balance structure.  Then it must fill up this
+	 * structure by using the init_tb_struct and fix_nodes functions.
+	 * After that we can make tree balancing.
+	 */
+	struct tree_balance s_cut_balance;
+	struct item_head *p_le_ih;
+	int cut_size = 0;	/* Amount to be cut. */
+	int ret_value = CARRY_ON;
+	int removed = 0;	/* Number of the removed unformatted nodes. */
+	int is_inode_locked = 0;
+	char mode;		/* Mode of the balance. */
+	int retval2 = -1;
+	int quota_cut_bytes;
+	loff_t tail_pos = 0;
+	int depth;
+
+	BUG_ON(!th->t_trans_id);
+
+	init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
+		       cut_size);
+
+	/*
+	 * Repeat this loop until we either cut the item without needing
+	 * to balance, or we fix_nodes without schedule occurring
+	 */
+	while (1) {
+		/*
+		 * Determine the balance mode, position of the first byte to
+		 * be cut, and size to be cut.  In case of the indirect item
+		 * free unformatted nodes which are pointed to by the cut
+		 * pointers.
+		 */
+
+		mode =
+		    prepare_for_delete_or_cut(th, inode, path,
+					      item_key, &removed,
+					      &cut_size, new_file_size);
+		if (mode == M_CONVERT) {
+			/*
+			 * convert last unformatted node to direct item or
+			 * leave tail in the unformatted node
+			 */
+			RFALSE(ret_value != CARRY_ON,
+			       "PAP-5570: can not convert twice");
+
+			ret_value =
+			    maybe_indirect_to_direct(th, inode, page,
+						     path, item_key,
+						     new_file_size, &mode);
+			if (mode == M_SKIP_BALANCING)
+				/* tail has been left in the unformatted node */
+				return ret_value;
+
+			is_inode_locked = 1;
+
+			/*
+			 * removing of last unformatted node will
+			 * change value we have to return to truncate.
+			 * Save it
+			 */
+			retval2 = ret_value;
+
+			/*
+			 * So, we have performed the first part of the
+			 * conversion:
+			 * inserting the new direct item.  Now we are
+			 * removing the last unformatted node pointer.
+			 * Set key to search for it.
+			 */
+			set_cpu_key_k_type(item_key, TYPE_INDIRECT);
+			item_key->key_length = 4;
+			new_file_size -=
+			    (new_file_size & (sb->s_blocksize - 1));
+			tail_pos = new_file_size;
+			set_cpu_key_k_offset(item_key, new_file_size + 1);
+			if (search_for_position_by_key
+			    (sb, item_key,
+			     path) == POSITION_NOT_FOUND) {
+				print_block(PATH_PLAST_BUFFER(path), 3,
+					    PATH_LAST_POSITION(path) - 1,
+					    PATH_LAST_POSITION(path) + 1);
+				reiserfs_panic(sb, "PAP-5580", "item to "
+					       "convert does not exist (%K)",
+					       item_key);
+			}
+			continue;
+		}
+		if (cut_size == 0) {
+			pathrelse(path);
+			return 0;
+		}
+
+		s_cut_balance.insert_size[0] = cut_size;
+
+		ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
+		if (ret_value != REPEAT_SEARCH)
+			break;
+
+		PROC_INFO_INC(sb, cut_from_item_restarted);
+
+		ret_value =
+		    search_for_position_by_key(sb, item_key, path);
+		if (ret_value == POSITION_FOUND)
+			continue;
+
+		reiserfs_warning(sb, "PAP-5610", "item %K not found",
+				 item_key);
+		unfix_nodes(&s_cut_balance);
+		return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
+	}			/* while */
+
+	/* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
+	if (ret_value != CARRY_ON) {
+		if (is_inode_locked) {
+			/*
+			 * FIXME: this seems to be not needed: we are always
+			 * able to cut item
+			 */
+			indirect_to_direct_roll_back(th, inode, path);
+		}
+		if (ret_value == NO_DISK_SPACE)
+			reiserfs_warning(sb, "reiserfs-5092",
+					 "NO_DISK_SPACE");
+		unfix_nodes(&s_cut_balance);
+		return -EIO;
+	}
+
+	/* go ahead and perform balancing */
+
+	RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
+
+	/* Calculate number of bytes that need to be cut from the item. */
+	quota_cut_bytes =
+	    (mode ==
+	     M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
+	    insert_size[0];
+	if (retval2 == -1)
+		ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
+	else
+		ret_value = retval2;
+
+	/*
+	 * For direct items, we only change the quota when deleting the last
+	 * item.
+	 */
+	p_le_ih = tp_item_head(s_cut_balance.tb_path);
+	if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
+		if (mode == M_DELETE &&
+		    (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
+		    1) {
+			/* FIXME: this is to keep 3.5 happy */
+			REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
+			quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
+		} else {
+			quota_cut_bytes = 0;
+		}
+	}
+#ifdef CONFIG_REISERFS_CHECK
+	if (is_inode_locked) {
+		struct item_head *le_ih =
+		    tp_item_head(s_cut_balance.tb_path);
+		/*
+		 * we are going to complete indirect2direct conversion. Make
+		 * sure, that we exactly remove last unformatted node pointer
+		 * of the item
+		 */
+		if (!is_indirect_le_ih(le_ih))
+			reiserfs_panic(sb, "vs-5652",
+				       "item must be indirect %h", le_ih);
+
+		if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
+			reiserfs_panic(sb, "vs-5653", "completing "
+				       "indirect2direct conversion indirect "
+				       "item %h being deleted must be of "
+				       "4 byte long", le_ih);
+
+		if (mode == M_CUT
+		    && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
+			reiserfs_panic(sb, "vs-5654", "can not complete "
+				       "indirect2direct conversion of %h "
+				       "(CUT, insert_size==%d)",
+				       le_ih, s_cut_balance.insert_size[0]);
+		}
+		/*
+		 * it would be useful to make sure, that right neighboring
+		 * item is direct item of this file
+		 */
+	}
+#endif
+
+	do_balance(&s_cut_balance, NULL, NULL, mode);
+	if (is_inode_locked) {
+		/*
+		 * we've done an indirect->direct conversion.  when the
+		 * data block was freed, it was removed from the list of
+		 * blocks that must be flushed before the transaction
+		 * commits, make sure to unmap and invalidate it
+		 */
+		unmap_buffers(page, tail_pos);
+		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+	}
+#ifdef REISERQUOTA_DEBUG
+	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
+		       "reiserquota cut_from_item(): freeing %u id=%u type=%c",
+		       quota_cut_bytes, inode->i_uid, '?');
+#endif
+	depth = reiserfs_write_unlock_nested(sb);
+	dquot_free_space_nodirty(inode, quota_cut_bytes);
+	reiserfs_write_lock_nested(sb, depth);
+	return ret_value;
+}
+
+static void truncate_directory(struct reiserfs_transaction_handle *th,
+			       struct inode *inode)
+{
+	BUG_ON(!th->t_trans_id);
+	if (inode->i_nlink)
+		reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
+
+	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
+	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
+	reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
+	reiserfs_update_sd(th, inode);
+	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
+	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
+}
+
+/*
+ * Truncate file to the new size. Note, this must be called with a
+ * transaction already started
+ */
+int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
+			 struct inode *inode,	/* ->i_size contains new size */
+			 struct page *page,	/* up to date for last block */
+			 /*
+			  * when it is called by file_release to convert
+			  * the tail - no timestamps should be updated
+			  */
+			 int update_timestamps
+    )
+{
+	INITIALIZE_PATH(s_search_path);	/* Path to the current object item. */
+	struct item_head *p_le_ih;	/* Pointer to an item header. */
+
+	/* Key to search for a previous file item. */
+	struct cpu_key s_item_key;
+	loff_t file_size,	/* Old file size. */
+	 new_file_size;	/* New file size. */
+	int deleted;		/* Number of deleted or truncated bytes. */
+	int retval;
+	int err = 0;
+
+	BUG_ON(!th->t_trans_id);
+	if (!
+	    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+	     || S_ISLNK(inode->i_mode)))
+		return 0;
+
+	/* deletion of directory - no need to update timestamps */
+	if (S_ISDIR(inode->i_mode)) {
+		truncate_directory(th, inode);
+		return 0;
+	}
+
+	/* Get new file size. */
+	new_file_size = inode->i_size;
+
+	/* FIXME: note, that key type is unimportant here */
+	make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
+		     TYPE_DIRECT, 3);
+
+	retval =
+	    search_for_position_by_key(inode->i_sb, &s_item_key,
+				       &s_search_path);
+	if (retval == IO_ERROR) {
+		reiserfs_error(inode->i_sb, "vs-5657",
+			       "i/o failure occurred trying to truncate %K",
+			       &s_item_key);
+		err = -EIO;
+		goto out;
+	}
+	if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
+		reiserfs_error(inode->i_sb, "PAP-5660",
+			       "wrong result %d of search for %K", retval,
+			       &s_item_key);
+
+		err = -EIO;
+		goto out;
+	}
+
+	s_search_path.pos_in_item--;
+
+	/* Get real file size (total length of all file items) */
+	p_le_ih = tp_item_head(&s_search_path);
+	if (is_statdata_le_ih(p_le_ih))
+		file_size = 0;
+	else {
+		loff_t offset = le_ih_k_offset(p_le_ih);
+		int bytes =
+		    op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
+
+		/*
+		 * this may mismatch with real file size: if last direct item
+		 * had no padding zeros and last unformatted node had no free
+		 * space, this file would have this file size
+		 */
+		file_size = offset + bytes - 1;
+	}
+	/*
+	 * are we doing a full truncate or delete, if so
+	 * kick in the reada code
+	 */
+	if (new_file_size == 0)
+		s_search_path.reada = PATH_READA | PATH_READA_BACK;
+
+	if (file_size == 0 || file_size < new_file_size) {
+		goto update_and_out;
+	}
+
+	/* Update key to search for the last file item. */
+	set_cpu_key_k_offset(&s_item_key, file_size);
+
+	do {
+		/* Cut or delete file item. */
+		deleted =
+		    reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
+					   inode, page, new_file_size);
+		if (deleted < 0) {
+			reiserfs_warning(inode->i_sb, "vs-5665",
+					 "reiserfs_cut_from_item failed");
+			reiserfs_check_path(&s_search_path);
+			return 0;
+		}
+
+		RFALSE(deleted > file_size,
+		       "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
+		       deleted, file_size, &s_item_key);
+
+		/* Change key to search the last file item. */
+		file_size -= deleted;
+
+		set_cpu_key_k_offset(&s_item_key, file_size);
+
+		/*
+		 * While there are bytes to truncate and previous
+		 * file item is presented in the tree.
+		 */
+
+		/*
+		 * This loop could take a really long time, and could log
+		 * many more blocks than a transaction can hold.  So, we do
+		 * a polite journal end here, and if the transaction needs
+		 * ending, we make sure the file is consistent before ending
+		 * the current trans and starting a new one
+		 */
+		if (journal_transaction_should_end(th, 0) ||
+		    reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
+			pathrelse(&s_search_path);
+
+			if (update_timestamps) {
+				inode->i_mtime = CURRENT_TIME_SEC;
+				inode->i_ctime = CURRENT_TIME_SEC;
+			}
+			reiserfs_update_sd(th, inode);
+
+			err = journal_end(th);
+			if (err)
+				goto out;
+			err = journal_begin(th, inode->i_sb,
+					    JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
+			if (err)
+				goto out;
+			reiserfs_update_inode_transaction(inode);
+		}
+	} while (file_size > ROUND_UP(new_file_size) &&
+		 search_for_position_by_key(inode->i_sb, &s_item_key,
+					    &s_search_path) == POSITION_FOUND);
+
+	RFALSE(file_size > ROUND_UP(new_file_size),
+	       "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
+	       new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
+
+update_and_out:
+	if (update_timestamps) {
+		/* this is truncate, not file closing */
+		inode->i_mtime = CURRENT_TIME_SEC;
+		inode->i_ctime = CURRENT_TIME_SEC;
+	}
+	reiserfs_update_sd(th, inode);
+
+out:
+	pathrelse(&s_search_path);
+	return err;
+}
+
+#ifdef CONFIG_REISERFS_CHECK
+/* this makes sure, that we __append__, not overwrite or add holes */
+static void check_research_for_paste(struct treepath *path,
+				     const struct cpu_key *key)
+{
+	struct item_head *found_ih = tp_item_head(path);
+
+	if (is_direct_le_ih(found_ih)) {
+		if (le_ih_k_offset(found_ih) +
+		    op_bytes_number(found_ih,
+				    get_last_bh(path)->b_size) !=
+		    cpu_key_k_offset(key)
+		    || op_bytes_number(found_ih,
+				       get_last_bh(path)->b_size) !=
+		    pos_in_item(path))
+			reiserfs_panic(NULL, "PAP-5720", "found direct item "
+				       "%h or position (%d) does not match "
+				       "to key %K", found_ih,
+				       pos_in_item(path), key);
+	}
+	if (is_indirect_le_ih(found_ih)) {
+		if (le_ih_k_offset(found_ih) +
+		    op_bytes_number(found_ih,
+				    get_last_bh(path)->b_size) !=
+		    cpu_key_k_offset(key)
+		    || I_UNFM_NUM(found_ih) != pos_in_item(path)
+		    || get_ih_free_space(found_ih) != 0)
+			reiserfs_panic(NULL, "PAP-5730", "found indirect "
+				       "item (%h) or position (%d) does not "
+				       "match to key (%K)",
+				       found_ih, pos_in_item(path), key);
+	}
+}
+#endif				/* config reiserfs check */
+
+/*
+ * Paste bytes to the existing item.
+ * Returns bytes number pasted into the item.
+ */
+int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
+			     /* Path to the pasted item. */
+			     struct treepath *search_path,
+			     /* Key to search for the needed item. */
+			     const struct cpu_key *key,
+			     /* Inode item belongs to */
+			     struct inode *inode,
+			     /* Pointer to the bytes to paste. */
+			     const char *body,
+			     /* Size of pasted bytes. */
+			     int pasted_size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct tree_balance s_paste_balance;
+	int retval;
+	int fs_gen;
+	int depth;
+
+	BUG_ON(!th->t_trans_id);
+
+	fs_gen = get_generation(inode->i_sb);
+
+#ifdef REISERQUOTA_DEBUG
+	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
+		       "reiserquota paste_into_item(): allocating %u id=%u type=%c",
+		       pasted_size, inode->i_uid,
+		       key2type(&key->on_disk_key));
+#endif
+
+	depth = reiserfs_write_unlock_nested(sb);
+	retval = dquot_alloc_space_nodirty(inode, pasted_size);
+	reiserfs_write_lock_nested(sb, depth);
+	if (retval) {
+		pathrelse(search_path);
+		return retval;
+	}
+	init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
+		       pasted_size);
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+	s_paste_balance.key = key->on_disk_key;
+#endif
+
+	/* DQUOT_* can schedule, must check before the fix_nodes */
+	if (fs_changed(fs_gen, inode->i_sb)) {
+		goto search_again;
+	}
+
+	while ((retval =
+		fix_nodes(M_PASTE, &s_paste_balance, NULL,
+			  body)) == REPEAT_SEARCH) {
+search_again:
+		/* file system changed while we were in the fix_nodes */
+		PROC_INFO_INC(th->t_super, paste_into_item_restarted);
+		retval =
+		    search_for_position_by_key(th->t_super, key,
+					       search_path);
+		if (retval == IO_ERROR) {
+			retval = -EIO;
+			goto error_out;
+		}
+		if (retval == POSITION_FOUND) {
+			reiserfs_warning(inode->i_sb, "PAP-5710",
+					 "entry or pasted byte (%K) exists",
+					 key);
+			retval = -EEXIST;
+			goto error_out;
+		}
+#ifdef CONFIG_REISERFS_CHECK
+		check_research_for_paste(search_path, key);
+#endif
+	}
+
+	/*
+	 * Perform balancing after all resources are collected by fix_nodes,
+	 * and accessing them will not risk triggering schedule.
+	 */
+	if (retval == CARRY_ON) {
+		do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
+		return 0;
+	}
+	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
+error_out:
+	/* this also releases the path */
+	unfix_nodes(&s_paste_balance);
+#ifdef REISERQUOTA_DEBUG
+	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
+		       "reiserquota paste_into_item(): freeing %u id=%u type=%c",
+		       pasted_size, inode->i_uid,
+		       key2type(&key->on_disk_key));
+#endif
+	depth = reiserfs_write_unlock_nested(sb);
+	dquot_free_space_nodirty(inode, pasted_size);
+	reiserfs_write_lock_nested(sb, depth);
+	return retval;
+}
+
+/*
+ * Insert new item into the buffer at the path.
+ * th   - active transaction handle
+ * path - path to the inserted item
+ * ih   - pointer to the item header to insert
+ * body - pointer to the bytes to insert
+ */
+int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
+			 struct treepath *path, const struct cpu_key *key,
+			 struct item_head *ih, struct inode *inode,
+			 const char *body)
+{
+	struct tree_balance s_ins_balance;
+	int retval;
+	int fs_gen = 0;
+	int quota_bytes = 0;
+
+	BUG_ON(!th->t_trans_id);
+
+	if (inode) {		/* Do we count quotas for item? */
+		int depth;
+		fs_gen = get_generation(inode->i_sb);
+		quota_bytes = ih_item_len(ih);
+
+		/*
+		 * hack so the quota code doesn't have to guess
+		 * if the file has a tail, links are always tails,
+		 * so there's no guessing needed
+		 */
+		if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
+			quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
+#ifdef REISERQUOTA_DEBUG
+		reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
+			       "reiserquota insert_item(): allocating %u id=%u type=%c",
+			       quota_bytes, inode->i_uid, head2type(ih));
+#endif
+		/*
+		 * We can't dirty inode here. It would be immediately
+		 * written but appropriate stat item isn't inserted yet...
+		 */
+		depth = reiserfs_write_unlock_nested(inode->i_sb);
+		retval = dquot_alloc_space_nodirty(inode, quota_bytes);
+		reiserfs_write_lock_nested(inode->i_sb, depth);
+		if (retval) {
+			pathrelse(path);
+			return retval;
+		}
+	}
+	init_tb_struct(th, &s_ins_balance, th->t_super, path,
+		       IH_SIZE + ih_item_len(ih));
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+	s_ins_balance.key = key->on_disk_key;
+#endif
+	/*
+	 * DQUOT_* can schedule, must check to be sure calling
+	 * fix_nodes is safe
+	 */
+	if (inode && fs_changed(fs_gen, inode->i_sb)) {
+		goto search_again;
+	}
+
+	while ((retval =
+		fix_nodes(M_INSERT, &s_ins_balance, ih,
+			  body)) == REPEAT_SEARCH) {
+search_again:
+		/* file system changed while we were in the fix_nodes */
+		PROC_INFO_INC(th->t_super, insert_item_restarted);
+		retval = search_item(th->t_super, key, path);
+		if (retval == IO_ERROR) {
+			retval = -EIO;
+			goto error_out;
+		}
+		if (retval == ITEM_FOUND) {
+			reiserfs_warning(th->t_super, "PAP-5760",
+					 "key %K already exists in the tree",
+					 key);
+			retval = -EEXIST;
+			goto error_out;
+		}
+	}
+
+	/* make balancing after all resources will be collected at a time */
+	if (retval == CARRY_ON) {
+		do_balance(&s_ins_balance, ih, body, M_INSERT);
+		return 0;
+	}
+
+	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
+error_out:
+	/* also releases the path */
+	unfix_nodes(&s_ins_balance);
+#ifdef REISERQUOTA_DEBUG
+	reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
+		       "reiserquota insert_item(): freeing %u id=%u type=%c",
+		       quota_bytes, inode->i_uid, head2type(ih));
+#endif
+	if (inode) {
+		int depth = reiserfs_write_unlock_nested(inode->i_sb);
+		dquot_free_space_nodirty(inode, quota_bytes);
+		reiserfs_write_lock_nested(inode->i_sb, depth);
+	}
+	return retval;
+}
diff --git a/kernel/fs/reiserfs/super.c b/kernel/fs/reiserfs/super.c
new file mode 100644
index 000000000..0111ad046
--- /dev/null
+++ b/kernel/fs/reiserfs/super.c
@@ -0,0 +1,2563 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ *
+ * Trivial changes by Alan Cox to add the LFS fixes
+ *
+ * Trivial Changes:
+ * Rights granted to Hans Reiser to redistribute under other terms providing
+ * he accepts all liability including but not limited to patent, fitness
+ * for purpose, and direct or indirect claims arising from failure to perform.
+ *
+ * NO WARRANTY
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/time.h>
+#include <linux/uaccess.h>
+#include "reiserfs.h"
+#include "acl.h"
+#include "xattr.h"
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/quotaops.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/crc32.h>
+#include <linux/seq_file.h>
+
+struct file_system_type reiserfs_fs_type;
+
+static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING;
+static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING;
+static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING;
+
+int is_reiserfs_3_5(struct reiserfs_super_block *rs)
+{
+	return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string,
+			strlen(reiserfs_3_5_magic_string));
+}
+
+int is_reiserfs_3_6(struct reiserfs_super_block *rs)
+{
+	return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string,
+			strlen(reiserfs_3_6_magic_string));
+}
+
+int is_reiserfs_jr(struct reiserfs_super_block *rs)
+{
+	return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string,
+			strlen(reiserfs_jr_magic_string));
+}
+
+static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
+{
+	return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) ||
+		is_reiserfs_jr(rs));
+}
+
+static int reiserfs_remount(struct super_block *s, int *flags, char *data);
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
+
+static int reiserfs_sync_fs(struct super_block *s, int wait)
+{
+	struct reiserfs_transaction_handle th;
+
+	/*
+	 * Writeback quota in non-journalled quota case - journalled quota has
+	 * no dirty dquots
+	 */
+	dquot_writeback_dquots(s, -1);
+	reiserfs_write_lock(s);
+	if (!journal_begin(&th, s, 1))
+		if (!journal_end_sync(&th))
+			reiserfs_flush_old_commits(s);
+	reiserfs_write_unlock(s);
+	return 0;
+}
+
+static void flush_old_commits(struct work_struct *work)
+{
+	struct reiserfs_sb_info *sbi;
+	struct super_block *s;
+
+	sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
+	s = sbi->s_journal->j_work_sb;
+
+	spin_lock(&sbi->old_work_lock);
+	sbi->work_queued = 0;
+	spin_unlock(&sbi->old_work_lock);
+
+	reiserfs_sync_fs(s, 1);
+}
+
+void reiserfs_schedule_old_flush(struct super_block *s)
+{
+	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+	unsigned long delay;
+
+	/*
+	 * Avoid scheduling flush when sb is being shut down. It can race
+	 * with journal shutdown and free still queued delayed work.
+	 */
+	if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE))
+		return;
+
+	spin_lock(&sbi->old_work_lock);
+	if (!sbi->work_queued) {
+		delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+		queue_delayed_work(system_long_wq, &sbi->old_work, delay);
+		sbi->work_queued = 1;
+	}
+	spin_unlock(&sbi->old_work_lock);
+}
+
+static void cancel_old_flush(struct super_block *s)
+{
+	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
+	cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+	spin_lock(&sbi->old_work_lock);
+	sbi->work_queued = 0;
+	spin_unlock(&sbi->old_work_lock);
+}
+
+static int reiserfs_freeze(struct super_block *s)
+{
+	struct reiserfs_transaction_handle th;
+
+	cancel_old_flush(s);
+
+	reiserfs_write_lock(s);
+	if (!(s->s_flags & MS_RDONLY)) {
+		int err = journal_begin(&th, s, 1);
+		if (err) {
+			reiserfs_block_writes(&th);
+		} else {
+			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
+						     1);
+			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
+			reiserfs_block_writes(&th);
+			journal_end_sync(&th);
+		}
+	}
+	reiserfs_write_unlock(s);
+	return 0;
+}
+
+static int reiserfs_unfreeze(struct super_block *s)
+{
+	reiserfs_allow_writes(s);
+	return 0;
+}
+
+extern const struct in_core_key MAX_IN_CORE_KEY;
+
+/*
+ * this is used to delete "save link" when there are no items of a
+ * file it points to. It can either happen if unlink is completed but
+ * "save unlink" removal, or if file has both unlink and truncate
+ * pending and as unlink completes first (because key of "save link"
+ * protecting unlink is bigger that a key lf "save link" which
+ * protects truncate), so there left no items to make truncate
+ * completion on
+ */
+static int remove_save_link_only(struct super_block *s,
+				 struct reiserfs_key *key, int oid_free)
+{
+	struct reiserfs_transaction_handle th;
+	int err;
+
+	/* we are going to do one balancing */
+	err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT);
+	if (err)
+		return err;
+
+	reiserfs_delete_solid_item(&th, NULL, key);
+	if (oid_free)
+		/* removals are protected by direct items */
+		reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
+
+	return journal_end(&th);
+}
+
+#ifdef CONFIG_QUOTA
+static int reiserfs_quota_on_mount(struct super_block *, int);
+#endif
+
+/* look for uncompleted unlinks and truncates and complete them */
+static int finish_unfinished(struct super_block *s)
+{
+	INITIALIZE_PATH(path);
+	struct cpu_key max_cpu_key, obj_key;
+	struct reiserfs_key save_link_key, last_inode_key;
+	int retval = 0;
+	struct item_head *ih;
+	struct buffer_head *bh;
+	int item_pos;
+	char *item;
+	int done;
+	struct inode *inode;
+	int truncate;
+#ifdef CONFIG_QUOTA
+	int i;
+	int ms_active_set;
+	int quota_enabled[REISERFS_MAXQUOTAS];
+#endif
+
+	/* compose key to look for "save" links */
+	max_cpu_key.version = KEY_FORMAT_3_5;
+	max_cpu_key.on_disk_key.k_dir_id = ~0U;
+	max_cpu_key.on_disk_key.k_objectid = ~0U;
+	set_cpu_key_k_offset(&max_cpu_key, ~0U);
+	max_cpu_key.key_length = 3;
+
+	memset(&last_inode_key, 0, sizeof(last_inode_key));
+
+#ifdef CONFIG_QUOTA
+	/* Needed for iput() to work correctly and not trash data */
+	if (s->s_flags & MS_ACTIVE) {
+		ms_active_set = 0;
+	} else {
+		ms_active_set = 1;
+		s->s_flags |= MS_ACTIVE;
+	}
+	/* Turn on quotas so that they are updated correctly */
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
+		quota_enabled[i] = 1;
+		if (REISERFS_SB(s)->s_qf_names[i]) {
+			int ret;
+
+			if (sb_has_quota_active(s, i)) {
+				quota_enabled[i] = 0;
+				continue;
+			}
+			ret = reiserfs_quota_on_mount(s, i);
+			if (ret < 0)
+				reiserfs_warning(s, "reiserfs-2500",
+						 "cannot turn on journaled "
+						 "quota: error %d", ret);
+		}
+	}
+#endif
+
+	done = 0;
+	REISERFS_SB(s)->s_is_unlinked_ok = 1;
+	while (!retval) {
+		int depth;
+		retval = search_item(s, &max_cpu_key, &path);
+		if (retval != ITEM_NOT_FOUND) {
+			reiserfs_error(s, "vs-2140",
+				       "search_by_key returned %d", retval);
+			break;
+		}
+
+		bh = get_last_bh(&path);
+		item_pos = get_item_pos(&path);
+		if (item_pos != B_NR_ITEMS(bh)) {
+			reiserfs_warning(s, "vs-2060",
+					 "wrong position found");
+			break;
+		}
+		item_pos--;
+		ih = item_head(bh, item_pos);
+
+		if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
+			/* there are no "save" links anymore */
+			break;
+
+		save_link_key = ih->ih_key;
+		if (is_indirect_le_ih(ih))
+			truncate = 1;
+		else
+			truncate = 0;
+
+		/* reiserfs_iget needs k_dirid and k_objectid only */
+		item = ih_item_body(bh, ih);
+		obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
+		obj_key.on_disk_key.k_objectid =
+		    le32_to_cpu(ih->ih_key.k_objectid);
+		obj_key.on_disk_key.k_offset = 0;
+		obj_key.on_disk_key.k_type = 0;
+
+		pathrelse(&path);
+
+		inode = reiserfs_iget(s, &obj_key);
+		if (!inode) {
+			/*
+			 * the unlink almost completed, it just did not
+			 * manage to remove "save" link and release objectid
+			 */
+			reiserfs_warning(s, "vs-2180", "iget failed for %K",
+					 &obj_key);
+			retval = remove_save_link_only(s, &save_link_key, 1);
+			continue;
+		}
+
+		if (!truncate && inode->i_nlink) {
+			/* file is not unlinked */
+			reiserfs_warning(s, "vs-2185",
+					 "file %K is not unlinked",
+					 &obj_key);
+			retval = remove_save_link_only(s, &save_link_key, 0);
+			continue;
+		}
+		depth = reiserfs_write_unlock_nested(inode->i_sb);
+		dquot_initialize(inode);
+		reiserfs_write_lock_nested(inode->i_sb, depth);
+
+		if (truncate && S_ISDIR(inode->i_mode)) {
+			/*
+			 * We got a truncate request for a dir which
+			 * is impossible.  The only imaginable way is to
+			 * execute unfinished truncate request then boot
+			 * into old kernel, remove the file and create dir
+			 * with the same key.
+			 */
+			reiserfs_warning(s, "green-2101",
+					 "impossible truncate on a "
+					 "directory %k. Please report",
+					 INODE_PKEY(inode));
+			retval = remove_save_link_only(s, &save_link_key, 0);
+			truncate = 0;
+			iput(inode);
+			continue;
+		}
+
+		if (truncate) {
+			REISERFS_I(inode)->i_flags |=
+			    i_link_saved_truncate_mask;
+			/*
+			 * not completed truncate found. New size was
+			 * committed together with "save" link
+			 */
+			reiserfs_info(s, "Truncating %k to %lld ..",
+				      INODE_PKEY(inode), inode->i_size);
+
+			/* don't update modification time */
+			reiserfs_truncate_file(inode, 0);
+
+			retval = remove_save_link(inode, truncate);
+		} else {
+			REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
+			/* not completed unlink (rmdir) found */
+			reiserfs_info(s, "Removing %k..", INODE_PKEY(inode));
+			if (memcmp(&last_inode_key, INODE_PKEY(inode),
+					sizeof(last_inode_key))){
+				last_inode_key = *INODE_PKEY(inode);
+				/* removal gets completed in iput */
+				retval = 0;
+			} else {
+				reiserfs_warning(s, "super-2189", "Dead loop "
+						 "in finish_unfinished "
+						 "detected, just remove "
+						 "save link\n");
+				retval = remove_save_link_only(s,
+							&save_link_key, 0);
+			}
+		}
+
+		iput(inode);
+		printk("done\n");
+		done++;
+	}
+	REISERFS_SB(s)->s_is_unlinked_ok = 0;
+
+#ifdef CONFIG_QUOTA
+	/* Turn quotas off */
+	reiserfs_write_unlock(s);
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
+		if (sb_dqopt(s)->files[i] && quota_enabled[i])
+			dquot_quota_off(s, i);
+	}
+	reiserfs_write_lock(s);
+	if (ms_active_set)
+		/* Restore the flag back */
+		s->s_flags &= ~MS_ACTIVE;
+#endif
+	pathrelse(&path);
+	if (done)
+		reiserfs_info(s, "There were %d uncompleted unlinks/truncates. "
+			      "Completed\n", done);
+	return retval;
+}
+
+/*
+ * to protect file being unlinked from getting lost we "safe" link files
+ * being unlinked. This link will be deleted in the same transaction with last
+ * item of file. mounting the filesystem we scan all these links and remove
+ * files which almost got lost
+ */
+void add_save_link(struct reiserfs_transaction_handle *th,
+		   struct inode *inode, int truncate)
+{
+	INITIALIZE_PATH(path);
+	int retval;
+	struct cpu_key key;
+	struct item_head ih;
+	__le32 link;
+
+	BUG_ON(!th->t_trans_id);
+
+	/* file can only get one "save link" of each kind */
+	RFALSE(truncate &&
+	       (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask),
+	       "saved link already exists for truncated inode %lx",
+	       (long)inode->i_ino);
+	RFALSE(!truncate &&
+	       (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask),
+	       "saved link already exists for unlinked inode %lx",
+	       (long)inode->i_ino);
+
+	/* setup key of "save" link */
+	key.version = KEY_FORMAT_3_5;
+	key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
+	key.on_disk_key.k_objectid = inode->i_ino;
+	if (!truncate) {
+		/* unlink, rmdir, rename */
+		set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize);
+		set_cpu_key_k_type(&key, TYPE_DIRECT);
+
+		/* item head of "safe" link */
+		make_le_item_head(&ih, &key, key.version,
+				  1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
+				  4 /*length */ , 0xffff /*free space */ );
+	} else {
+		/* truncate */
+		if (S_ISDIR(inode->i_mode))
+			reiserfs_warning(inode->i_sb, "green-2102",
+					 "Adding a truncate savelink for "
+					 "a directory %k! Please report",
+					 INODE_PKEY(inode));
+		set_cpu_key_k_offset(&key, 1);
+		set_cpu_key_k_type(&key, TYPE_INDIRECT);
+
+		/* item head of "safe" link */
+		make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT,
+				  4 /*length */ , 0 /*free space */ );
+	}
+	key.key_length = 3;
+
+	/* look for its place in the tree */
+	retval = search_item(inode->i_sb, &key, &path);
+	if (retval != ITEM_NOT_FOUND) {
+		if (retval != -ENOSPC)
+			reiserfs_error(inode->i_sb, "vs-2100",
+				       "search_by_key (%K) returned %d", &key,
+				       retval);
+		pathrelse(&path);
+		return;
+	}
+
+	/* body of "save" link */
+	link = INODE_PKEY(inode)->k_dir_id;
+
+	/* put "save" link into tree, don't charge quota to anyone */
+	retval =
+	    reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
+	if (retval) {
+		if (retval != -ENOSPC)
+			reiserfs_error(inode->i_sb, "vs-2120",
+				       "insert_item returned %d", retval);
+	} else {
+		if (truncate)
+			REISERFS_I(inode)->i_flags |=
+			    i_link_saved_truncate_mask;
+		else
+			REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
+	}
+}
+
+/* this opens transaction unlike add_save_link */
+int remove_save_link(struct inode *inode, int truncate)
+{
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_key key;
+	int err;
+
+	/* we are going to do one balancing only */
+	err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
+	if (err)
+		return err;
+
+	/* setup key of "save" link */
+	key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID);
+	key.k_objectid = INODE_PKEY(inode)->k_objectid;
+	if (!truncate) {
+		/* unlink, rmdir, rename */
+		set_le_key_k_offset(KEY_FORMAT_3_5, &key,
+				    1 + inode->i_sb->s_blocksize);
+		set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT);
+	} else {
+		/* truncate */
+		set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1);
+		set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
+	}
+
+	if ((truncate &&
+	     (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) ||
+	    (!truncate &&
+	     (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask)))
+		/* don't take quota bytes from anywhere */
+		reiserfs_delete_solid_item(&th, NULL, &key);
+	if (!truncate) {
+		reiserfs_release_objectid(&th, inode->i_ino);
+		REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask;
+	} else
+		REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
+
+	return journal_end(&th);
+}
+
+static void reiserfs_kill_sb(struct super_block *s)
+{
+	if (REISERFS_SB(s)) {
+		reiserfs_proc_info_done(s);
+		/*
+		 * Force any pending inode evictions to occur now. Any
+		 * inodes to be removed that have extended attributes
+		 * associated with them need to clean them up before
+		 * we can release the extended attribute root dentries.
+		 * shrink_dcache_for_umount will BUG if we don't release
+		 * those before it's called so ->put_super is too late.
+		 */
+		shrink_dcache_sb(s);
+
+		dput(REISERFS_SB(s)->xattr_root);
+		REISERFS_SB(s)->xattr_root = NULL;
+		dput(REISERFS_SB(s)->priv_root);
+		REISERFS_SB(s)->priv_root = NULL;
+	}
+
+	kill_block_super(s);
+}
+
+static void reiserfs_put_super(struct super_block *s)
+{
+	struct reiserfs_transaction_handle th;
+	th.t_trans_id = 0;
+
+	dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
+	reiserfs_write_lock(s);
+
+	/*
+	 * change file system state to current state if it was mounted
+	 * with read-write permissions
+	 */
+	if (!(s->s_flags & MS_RDONLY)) {
+		if (!journal_begin(&th, s, 10)) {
+			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
+						     1);
+			set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
+					    REISERFS_SB(s)->s_mount_state);
+			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
+		}
+	}
+
+	/*
+	 * note, journal_release checks for readonly mount, and can
+	 * decide not to do a journal_end
+	 */
+	journal_release(&th, s);
+
+	reiserfs_free_bitmap_cache(s);
+
+	brelse(SB_BUFFER_WITH_SB(s));
+
+	print_statistics(s);
+
+	if (REISERFS_SB(s)->reserved_blocks != 0) {
+		reiserfs_warning(s, "green-2005", "reserved blocks left %d",
+				 REISERFS_SB(s)->reserved_blocks);
+	}
+
+	reiserfs_write_unlock(s);
+	mutex_destroy(&REISERFS_SB(s)->lock);
+	destroy_workqueue(REISERFS_SB(s)->commit_wq);
+	kfree(s->s_fs_info);
+	s->s_fs_info = NULL;
+}
+
+static struct kmem_cache *reiserfs_inode_cachep;
+
+static struct inode *reiserfs_alloc_inode(struct super_block *sb)
+{
+	struct reiserfs_inode_info *ei;
+	ei = (struct reiserfs_inode_info *)
+	    kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	atomic_set(&ei->openers, 0);
+	mutex_init(&ei->tailpack);
+#ifdef CONFIG_QUOTA
+	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
+#endif
+
+	return &ei->vfs_inode;
+}
+
+static void reiserfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
+}
+
+static void reiserfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, reiserfs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
+
+	INIT_LIST_HEAD(&ei->i_prealloc_list);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
+						  sizeof(struct
+							 reiserfs_inode_info),
+						  0, (SLAB_RECLAIM_ACCOUNT|
+							SLAB_MEM_SPREAD),
+						  init_once);
+	if (reiserfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(reiserfs_inode_cachep);
+}
+
+/* we don't mark inodes dirty, we just log them */
+static void reiserfs_dirty_inode(struct inode *inode, int flags)
+{
+	struct reiserfs_transaction_handle th;
+
+	int err = 0;
+
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		reiserfs_warning(inode->i_sb, "clm-6006",
+				 "writing inode %lu on readonly FS",
+				 inode->i_ino);
+		return;
+	}
+	reiserfs_write_lock(inode->i_sb);
+
+	/*
+	 * this is really only used for atime updates, so they don't have
+	 * to be included in O_SYNC or fsync
+	 */
+	err = journal_begin(&th, inode->i_sb, 1);
+	if (err)
+		goto out;
+
+	reiserfs_update_sd(&th, inode);
+	journal_end(&th);
+
+out:
+	reiserfs_write_unlock(inode->i_sb);
+}
+
+static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *s = root->d_sb;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	long opts = REISERFS_SB(s)->s_mount_opt;
+
+	if (opts & (1 << REISERFS_LARGETAIL))
+		seq_puts(seq, ",tails=on");
+	else if (!(opts & (1 << REISERFS_SMALLTAIL)))
+		seq_puts(seq, ",notail");
+	/* tails=small is default so we don't show it */
+
+	if (!(opts & (1 << REISERFS_BARRIER_FLUSH)))
+		seq_puts(seq, ",barrier=none");
+	/* barrier=flush is default so we don't show it */
+
+	if (opts & (1 << REISERFS_ERROR_CONTINUE))
+		seq_puts(seq, ",errors=continue");
+	else if (opts & (1 << REISERFS_ERROR_PANIC))
+		seq_puts(seq, ",errors=panic");
+	/* errors=ro is default so we don't show it */
+
+	if (opts & (1 << REISERFS_DATA_LOG))
+		seq_puts(seq, ",data=journal");
+	else if (opts & (1 << REISERFS_DATA_WRITEBACK))
+		seq_puts(seq, ",data=writeback");
+	/* data=ordered is default so we don't show it */
+
+	if (opts & (1 << REISERFS_ATTRS))
+		seq_puts(seq, ",attrs");
+
+	if (opts & (1 << REISERFS_XATTRS_USER))
+		seq_puts(seq, ",user_xattr");
+
+	if (opts & (1 << REISERFS_EXPOSE_PRIVROOT))
+		seq_puts(seq, ",expose_privroot");
+
+	if (opts & (1 << REISERFS_POSIXACL))
+		seq_puts(seq, ",acl");
+
+	if (REISERFS_SB(s)->s_jdev)
+		seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev);
+
+	if (journal->j_max_commit_age != journal->j_default_max_commit_age)
+		seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
+
+#ifdef CONFIG_QUOTA
+	if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
+		seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]);
+	else if (opts & (1 << REISERFS_USRQUOTA))
+		seq_puts(seq, ",usrquota");
+	if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
+		seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
+	else if (opts & (1 << REISERFS_GRPQUOTA))
+		seq_puts(seq, ",grpquota");
+	if (REISERFS_SB(s)->s_jquota_fmt) {
+		if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD)
+			seq_puts(seq, ",jqfmt=vfsold");
+		else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0)
+			seq_puts(seq, ",jqfmt=vfsv0");
+	}
+#endif
+
+	/* Block allocator options */
+	if (opts & (1 << REISERFS_NO_BORDER))
+		seq_puts(seq, ",block-allocator=noborder");
+	if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION))
+		seq_puts(seq, ",block-allocator=no_unhashed_relocation");
+	if (opts & (1 << REISERFS_HASHED_RELOCATION))
+		seq_puts(seq, ",block-allocator=hashed_relocation");
+	if (opts & (1 << REISERFS_TEST4))
+		seq_puts(seq, ",block-allocator=test4");
+	show_alloc_options(seq, s);
+	return 0;
+}
+
+#ifdef CONFIG_QUOTA
+static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
+				    size_t, loff_t);
+static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
+				   loff_t);
+
+static struct dquot **reiserfs_get_dquots(struct inode *inode)
+{
+	return REISERFS_I(inode)->i_dquot;
+}
+#endif
+
+static const struct super_operations reiserfs_sops = {
+	.alloc_inode = reiserfs_alloc_inode,
+	.destroy_inode = reiserfs_destroy_inode,
+	.write_inode = reiserfs_write_inode,
+	.dirty_inode = reiserfs_dirty_inode,
+	.evict_inode = reiserfs_evict_inode,
+	.put_super = reiserfs_put_super,
+	.sync_fs = reiserfs_sync_fs,
+	.freeze_fs = reiserfs_freeze,
+	.unfreeze_fs = reiserfs_unfreeze,
+	.statfs = reiserfs_statfs,
+	.remount_fs = reiserfs_remount,
+	.show_options = reiserfs_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read = reiserfs_quota_read,
+	.quota_write = reiserfs_quota_write,
+	.get_dquots = reiserfs_get_dquots,
+#endif
+};
+
+#ifdef CONFIG_QUOTA
+#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
+
+static int reiserfs_write_dquot(struct dquot *);
+static int reiserfs_acquire_dquot(struct dquot *);
+static int reiserfs_release_dquot(struct dquot *);
+static int reiserfs_mark_dquot_dirty(struct dquot *);
+static int reiserfs_write_info(struct super_block *, int);
+static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
+
+static const struct dquot_operations reiserfs_quota_operations = {
+	.write_dquot = reiserfs_write_dquot,
+	.acquire_dquot = reiserfs_acquire_dquot,
+	.release_dquot = reiserfs_release_dquot,
+	.mark_dirty = reiserfs_mark_dquot_dirty,
+	.write_info = reiserfs_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
+};
+
+static const struct quotactl_ops reiserfs_qctl_operations = {
+	.quota_on = reiserfs_quota_on,
+	.quota_off = dquot_quota_off,
+	.quota_sync = dquot_quota_sync,
+	.get_state = dquot_get_state,
+	.set_info = dquot_set_dqinfo,
+	.get_dqblk = dquot_get_dqblk,
+	.set_dqblk = dquot_set_dqblk,
+};
+#endif
+
+static const struct export_operations reiserfs_export_ops = {
+	.encode_fh = reiserfs_encode_fh,
+	.fh_to_dentry = reiserfs_fh_to_dentry,
+	.fh_to_parent = reiserfs_fh_to_parent,
+	.get_parent = reiserfs_get_parent,
+};
+
+/*
+ * this struct is used in reiserfs_getopt () for containing the value for
+ * those mount options that have values rather than being toggles.
+ */
+typedef struct {
+	char *value;
+	/*
+	 * bitmask which is to set on mount_options bitmask
+	 * when this value is found, 0 is no bits are to be changed.
+	 */
+	int setmask;
+	/*
+	 * bitmask which is to clear on mount_options bitmask
+	 * when this value is found, 0 is no bits are to be changed.
+	 * This is applied BEFORE setmask
+	 */
+	int clrmask;
+} arg_desc_t;
+
+/* Set this bit in arg_required to allow empty arguments */
+#define REISERFS_OPT_ALLOWEMPTY 31
+
+/*
+ * this struct is used in reiserfs_getopt() for describing the
+ * set of reiserfs mount options
+ */
+typedef struct {
+	char *option_name;
+
+	/* 0 if argument is not required, not 0 otherwise */
+	int arg_required;
+
+	/* list of values accepted by an option */
+	const arg_desc_t *values;
+
+	/*
+	 * bitmask which is to set on mount_options bitmask
+	 * when this value is found, 0 is no bits are to be changed.
+	 */
+	int setmask;
+
+	/*
+	 * bitmask which is to clear on mount_options bitmask
+	 * when this value is found, 0 is no bits are to be changed.
+	 * This is applied BEFORE setmask
+	 */
+	int clrmask;
+} opt_desc_t;
+
+/* possible values for -o data= */
+static const arg_desc_t logging_mode[] = {
+	{"ordered", 1 << REISERFS_DATA_ORDERED,
+	 (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)},
+	{"journal", 1 << REISERFS_DATA_LOG,
+	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)},
+	{"writeback", 1 << REISERFS_DATA_WRITEBACK,
+	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)},
+	{.value = NULL}
+};
+
+/* possible values for -o barrier= */
+static const arg_desc_t barrier_mode[] = {
+	{"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH},
+	{"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE},
+	{.value = NULL}
+};
+
+/*
+ * possible values for "-o block-allocator=" and bits which are to be set in
+ * s_mount_opt of reiserfs specific part of in-core super block
+ */
+static const arg_desc_t balloc[] = {
+	{"noborder", 1 << REISERFS_NO_BORDER, 0},
+	{"border", 0, 1 << REISERFS_NO_BORDER},
+	{"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0},
+	{"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0},
+	{"test4", 1 << REISERFS_TEST4, 0},
+	{"notest4", 0, 1 << REISERFS_TEST4},
+	{NULL, 0, 0}
+};
+
+static const arg_desc_t tails[] = {
+	{"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL},
+	{"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
+	{"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL},
+	{NULL, 0, 0}
+};
+
+static const arg_desc_t error_actions[] = {
+	{"panic", 1 << REISERFS_ERROR_PANIC,
+	 (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
+	{"ro-remount", 1 << REISERFS_ERROR_RO,
+	 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
+#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG
+	{"continue", 1 << REISERFS_ERROR_CONTINUE,
+	 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
+#endif
+	{NULL, 0, 0},
+};
+
+/*
+ * proceed only one option from a list *cur - string containing of mount
+ * options
+ * opts - array of options which are accepted
+ * opt_arg - if option is found and requires an argument and if it is specifed
+ * in the input - pointer to the argument is stored here
+ * bit_flags - if option requires to set a certain bit - it is set here
+ * return -1 if unknown option is found, opt->arg_required otherwise
+ */
+static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
+			   char **opt_arg, unsigned long *bit_flags)
+{
+	char *p;
+	/*
+	 * foo=bar,
+	 * ^   ^  ^
+	 * |   |  +-- option_end
+	 * |   +-- arg_start
+	 * +-- option_start
+	 */
+	const opt_desc_t *opt;
+	const arg_desc_t *arg;
+
+	p = *cur;
+
+	/* assume argument cannot contain commas */
+	*cur = strchr(p, ',');
+	if (*cur) {
+		*(*cur) = '\0';
+		(*cur)++;
+	}
+
+	if (!strncmp(p, "alloc=", 6)) {
+		/*
+		 * Ugly special case, probably we should redo options
+		 * parser so that it can understand several arguments for
+		 * some options, also so that it can fill several bitfields
+		 * with option values.
+		 */
+		if (reiserfs_parse_alloc_options(s, p + 6)) {
+			return -1;
+		} else {
+			return 0;
+		}
+	}
+
+	/* for every option in the list */
+	for (opt = opts; opt->option_name; opt++) {
+		if (!strncmp(p, opt->option_name, strlen(opt->option_name))) {
+			if (bit_flags) {
+				if (opt->clrmask ==
+				    (1 << REISERFS_UNSUPPORTED_OPT))
+					reiserfs_warning(s, "super-6500",
+							 "%s not supported.\n",
+							 p);
+				else
+					*bit_flags &= ~opt->clrmask;
+				if (opt->setmask ==
+				    (1 << REISERFS_UNSUPPORTED_OPT))
+					reiserfs_warning(s, "super-6501",
+							 "%s not supported.\n",
+							 p);
+				else
+					*bit_flags |= opt->setmask;
+			}
+			break;
+		}
+	}
+	if (!opt->option_name) {
+		reiserfs_warning(s, "super-6502",
+				 "unknown mount option \"%s\"", p);
+		return -1;
+	}
+
+	p += strlen(opt->option_name);
+	switch (*p) {
+	case '=':
+		if (!opt->arg_required) {
+			reiserfs_warning(s, "super-6503",
+					 "the option \"%s\" does not "
+					 "require an argument\n",
+					 opt->option_name);
+			return -1;
+		}
+		break;
+
+	case 0:
+		if (opt->arg_required) {
+			reiserfs_warning(s, "super-6504",
+					 "the option \"%s\" requires an "
+					 "argument\n", opt->option_name);
+			return -1;
+		}
+		break;
+	default:
+		reiserfs_warning(s, "super-6505",
+				 "head of option \"%s\" is only correct\n",
+				 opt->option_name);
+		return -1;
+	}
+
+	/*
+	 * move to the argument, or to next option if argument is not
+	 * required
+	 */
+	p++;
+
+	if (opt->arg_required
+	    && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
+	    && !strlen(p)) {
+		/* this catches "option=," if not allowed */
+		reiserfs_warning(s, "super-6506",
+				 "empty argument for \"%s\"\n",
+				 opt->option_name);
+		return -1;
+	}
+
+	if (!opt->values) {
+		/* *=NULLopt_arg contains pointer to argument */
+		*opt_arg = p;
+		return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY);
+	}
+
+	/* values possible for this option are listed in opt->values */
+	for (arg = opt->values; arg->value; arg++) {
+		if (!strcmp(p, arg->value)) {
+			if (bit_flags) {
+				*bit_flags &= ~arg->clrmask;
+				*bit_flags |= arg->setmask;
+			}
+			return opt->arg_required;
+		}
+	}
+
+	reiserfs_warning(s, "super-6506",
+			 "bad value \"%s\" for option \"%s\"\n", p,
+			 opt->option_name);
+	return -1;
+}
+
+/* returns 0 if something is wrong in option string, 1 - otherwise */
+static int reiserfs_parse_options(struct super_block *s,
+
+				  /* string given via mount's -o */
+				  char *options,
+
+				  /*
+				   * after the parsing phase, contains the
+				   * collection of bitflags defining what
+				   * mount options were selected.
+				   */
+				  unsigned long *mount_options,
+
+				  /* strtol-ed from NNN of resize=NNN */
+				  unsigned long *blocks,
+				  char **jdev_name,
+				  unsigned int *commit_max_age,
+				  char **qf_names,
+				  unsigned int *qfmt)
+{
+	int c;
+	char *arg = NULL;
+	char *pos;
+	opt_desc_t opts[] = {
+		/*
+		 * Compatibility stuff, so that -o notail for old
+		 * setups still work
+		 */
+		{"tails",.arg_required = 't',.values = tails},
+		{"notail",.clrmask =
+		 (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
+		{"conv",.setmask = 1 << REISERFS_CONVERT},
+		{"attrs",.setmask = 1 << REISERFS_ATTRS},
+		{"noattrs",.clrmask = 1 << REISERFS_ATTRS},
+		{"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
+#ifdef CONFIG_REISERFS_FS_XATTR
+		{"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
+		{"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
+#else
+		{"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
+		{"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
+#endif
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
+		{"acl",.setmask = 1 << REISERFS_POSIXACL},
+		{"noacl",.clrmask = 1 << REISERFS_POSIXACL},
+#else
+		{"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
+		{"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
+#endif
+		{.option_name = "nolog"},
+		{"replayonly",.setmask = 1 << REPLAYONLY},
+		{"block-allocator",.arg_required = 'a',.values = balloc},
+		{"data",.arg_required = 'd',.values = logging_mode},
+		{"barrier",.arg_required = 'b',.values = barrier_mode},
+		{"resize",.arg_required = 'r',.values = NULL},
+		{"jdev",.arg_required = 'j',.values = NULL},
+		{"nolargeio",.arg_required = 'w',.values = NULL},
+		{"commit",.arg_required = 'c',.values = NULL},
+		{"usrquota",.setmask = 1 << REISERFS_USRQUOTA},
+		{"grpquota",.setmask = 1 << REISERFS_GRPQUOTA},
+		{"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA},
+		{"errors",.arg_required = 'e',.values = error_actions},
+		{"usrjquota",.arg_required =
+		 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
+		{"grpjquota",.arg_required =
+		 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
+		{"jqfmt",.arg_required = 'f',.values = NULL},
+		{.option_name = NULL}
+	};
+
+	*blocks = 0;
+	if (!options || !*options)
+		/*
+		 * use default configuration: create tails, journaling on, no
+		 * conversion to newest format
+		 */
+		return 1;
+
+	for (pos = options; pos;) {
+		c = reiserfs_getopt(s, &pos, opts, &arg, mount_options);
+		if (c == -1)
+			/* wrong option is given */
+			return 0;
+
+		if (c == 'r') {
+			char *p;
+
+			p = NULL;
+			/* "resize=NNN" or "resize=auto" */
+
+			if (!strcmp(arg, "auto")) {
+				/* From JFS code, to auto-get the size. */
+				*blocks =
+				    s->s_bdev->bd_inode->i_size >> s->
+				    s_blocksize_bits;
+			} else {
+				*blocks = simple_strtoul(arg, &p, 0);
+				if (*p != '\0') {
+					/* NNN does not look like a number */
+					reiserfs_warning(s, "super-6507",
+							 "bad value %s for "
+							 "-oresize\n", arg);
+					return 0;
+				}
+			}
+		}
+
+		if (c == 'c') {
+			char *p = NULL;
+			unsigned long val = simple_strtoul(arg, &p, 0);
+			/* commit=NNN (time in seconds) */
+			if (*p != '\0' || val >= (unsigned int)-1) {
+				reiserfs_warning(s, "super-6508",
+						 "bad value %s for -ocommit\n",
+						 arg);
+				return 0;
+			}
+			*commit_max_age = (unsigned int)val;
+		}
+
+		if (c == 'w') {
+			reiserfs_warning(s, "super-6509", "nolargeio option "
+					 "is no longer supported");
+			return 0;
+		}
+
+		if (c == 'j') {
+			if (arg && *arg && jdev_name) {
+				/* Hm, already assigned? */
+				if (*jdev_name) {
+					reiserfs_warning(s, "super-6510",
+							 "journal device was "
+							 "already specified to "
+							 "be %s", *jdev_name);
+					return 0;
+				}
+				*jdev_name = arg;
+			}
+		}
+#ifdef CONFIG_QUOTA
+		if (c == 'u' || c == 'g') {
+			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
+
+			if (sb_any_quota_loaded(s) &&
+			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
+				reiserfs_warning(s, "super-6511",
+						 "cannot change journaled "
+						 "quota options when quota "
+						 "turned on.");
+				return 0;
+			}
+			if (*arg) {	/* Some filename specified? */
+				if (REISERFS_SB(s)->s_qf_names[qtype]
+				    && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
+					      arg)) {
+					reiserfs_warning(s, "super-6512",
+							 "%s quota file "
+							 "already specified.",
+							 QTYPE2NAME(qtype));
+					return 0;
+				}
+				if (strchr(arg, '/')) {
+					reiserfs_warning(s, "super-6513",
+							 "quotafile must be "
+							 "on filesystem root.");
+					return 0;
+				}
+				qf_names[qtype] = kstrdup(arg, GFP_KERNEL);
+				if (!qf_names[qtype]) {
+					reiserfs_warning(s, "reiserfs-2502",
+							 "not enough memory "
+							 "for storing "
+							 "quotafile name.");
+					return 0;
+				}
+				if (qtype == USRQUOTA)
+					*mount_options |= 1 << REISERFS_USRQUOTA;
+				else
+					*mount_options |= 1 << REISERFS_GRPQUOTA;
+			} else {
+				if (qf_names[qtype] !=
+				    REISERFS_SB(s)->s_qf_names[qtype])
+					kfree(qf_names[qtype]);
+				qf_names[qtype] = NULL;
+				if (qtype == USRQUOTA)
+					*mount_options &= ~(1 << REISERFS_USRQUOTA);
+				else
+					*mount_options &= ~(1 << REISERFS_GRPQUOTA);
+			}
+		}
+		if (c == 'f') {
+			if (!strcmp(arg, "vfsold"))
+				*qfmt = QFMT_VFS_OLD;
+			else if (!strcmp(arg, "vfsv0"))
+				*qfmt = QFMT_VFS_V0;
+			else {
+				reiserfs_warning(s, "super-6514",
+						 "unknown quota format "
+						 "specified.");
+				return 0;
+			}
+			if (sb_any_quota_loaded(s) &&
+			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
+				reiserfs_warning(s, "super-6515",
+						 "cannot change journaled "
+						 "quota options when quota "
+						 "turned on.");
+				return 0;
+			}
+		}
+#else
+		if (c == 'u' || c == 'g' || c == 'f') {
+			reiserfs_warning(s, "reiserfs-2503", "journaled "
+					 "quota options not supported.");
+			return 0;
+		}
+#endif
+	}
+
+#ifdef CONFIG_QUOTA
+	if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
+	    && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
+		reiserfs_warning(s, "super-6515",
+				 "journaled quota format not specified.");
+		return 0;
+	}
+	if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) &&
+	       sb_has_quota_loaded(s, USRQUOTA)) ||
+	    (!(*mount_options & (1 << REISERFS_GRPQUOTA)) &&
+	       sb_has_quota_loaded(s, GRPQUOTA))) {
+		reiserfs_warning(s, "super-6516", "quota options must "
+				 "be present when quota is turned on.");
+		return 0;
+	}
+#endif
+
+	return 1;
+}
+
+static void switch_data_mode(struct super_block *s, unsigned long mode)
+{
+	REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
+					 (1 << REISERFS_DATA_ORDERED) |
+					 (1 << REISERFS_DATA_WRITEBACK));
+	REISERFS_SB(s)->s_mount_opt |= (1 << mode);
+}
+
+static void handle_data_mode(struct super_block *s, unsigned long mount_options)
+{
+	if (mount_options & (1 << REISERFS_DATA_LOG)) {
+		if (!reiserfs_data_log(s)) {
+			switch_data_mode(s, REISERFS_DATA_LOG);
+			reiserfs_info(s, "switching to journaled data mode\n");
+		}
+	} else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
+		if (!reiserfs_data_ordered(s)) {
+			switch_data_mode(s, REISERFS_DATA_ORDERED);
+			reiserfs_info(s, "switching to ordered data mode\n");
+		}
+	} else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
+		if (!reiserfs_data_writeback(s)) {
+			switch_data_mode(s, REISERFS_DATA_WRITEBACK);
+			reiserfs_info(s, "switching to writeback data mode\n");
+		}
+	}
+}
+
+static void handle_barrier_mode(struct super_block *s, unsigned long bits)
+{
+	int flush = (1 << REISERFS_BARRIER_FLUSH);
+	int none = (1 << REISERFS_BARRIER_NONE);
+	int all_barrier = flush | none;
+
+	if (bits & all_barrier) {
+		REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
+		if (bits & flush) {
+			REISERFS_SB(s)->s_mount_opt |= flush;
+			printk("reiserfs: enabling write barrier flush mode\n");
+		} else if (bits & none) {
+			REISERFS_SB(s)->s_mount_opt |= none;
+			printk("reiserfs: write barriers turned off\n");
+		}
+	}
+}
+
+static void handle_attrs(struct super_block *s)
+{
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+
+	if (reiserfs_attrs(s)) {
+		if (old_format_only(s)) {
+			reiserfs_warning(s, "super-6517", "cannot support "
+					 "attributes on 3.5.x disk format");
+			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
+			return;
+		}
+		if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
+			reiserfs_warning(s, "super-6518", "cannot support "
+					 "attributes until flag is set in "
+					 "super-block");
+			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
+		}
+	}
+}
+
+#ifdef CONFIG_QUOTA
+static void handle_quota_files(struct super_block *s, char **qf_names,
+			       unsigned int *qfmt)
+{
+	int i;
+
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
+		if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
+			kfree(REISERFS_SB(s)->s_qf_names[i]);
+		REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
+	}
+	if (*qfmt)
+		REISERFS_SB(s)->s_jquota_fmt = *qfmt;
+}
+#endif
+
+static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
+{
+	struct reiserfs_super_block *rs;
+	struct reiserfs_transaction_handle th;
+	unsigned long blocks;
+	unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
+	unsigned long safe_mask = 0;
+	unsigned int commit_max_age = (unsigned int)-1;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	char *new_opts = kstrdup(arg, GFP_KERNEL);
+	int err;
+	char *qf_names[REISERFS_MAXQUOTAS];
+	unsigned int qfmt = 0;
+#ifdef CONFIG_QUOTA
+	int i;
+#endif
+
+	sync_filesystem(s);
+	reiserfs_write_lock(s);
+
+#ifdef CONFIG_QUOTA
+	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
+#endif
+
+	rs = SB_DISK_SUPER_BLOCK(s);
+
+	if (!reiserfs_parse_options
+	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
+	    qf_names, &qfmt)) {
+#ifdef CONFIG_QUOTA
+		for (i = 0; i < REISERFS_MAXQUOTAS; i++)
+			if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
+				kfree(qf_names[i]);
+#endif
+		err = -EINVAL;
+		goto out_err_unlock;
+	}
+#ifdef CONFIG_QUOTA
+	handle_quota_files(s, qf_names, &qfmt);
+#endif
+
+	handle_attrs(s);
+
+	/* Add options that are safe here */
+	safe_mask |= 1 << REISERFS_SMALLTAIL;
+	safe_mask |= 1 << REISERFS_LARGETAIL;
+	safe_mask |= 1 << REISERFS_NO_BORDER;
+	safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
+	safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
+	safe_mask |= 1 << REISERFS_TEST4;
+	safe_mask |= 1 << REISERFS_ATTRS;
+	safe_mask |= 1 << REISERFS_XATTRS_USER;
+	safe_mask |= 1 << REISERFS_POSIXACL;
+	safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
+	safe_mask |= 1 << REISERFS_BARRIER_NONE;
+	safe_mask |= 1 << REISERFS_ERROR_RO;
+	safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
+	safe_mask |= 1 << REISERFS_ERROR_PANIC;
+	safe_mask |= 1 << REISERFS_USRQUOTA;
+	safe_mask |= 1 << REISERFS_GRPQUOTA;
+
+	/*
+	 * Update the bitmask, taking care to keep
+	 * the bits we're not allowed to change here
+	 */
+	REISERFS_SB(s)->s_mount_opt =
+	    (REISERFS_SB(s)->
+	     s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
+
+	if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
+		journal->j_max_commit_age = commit_max_age;
+		journal->j_max_trans_age = commit_max_age;
+	} else if (commit_max_age == 0) {
+		/* 0 means restore defaults. */
+		journal->j_max_commit_age = journal->j_default_max_commit_age;
+		journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
+	}
+
+	if (blocks) {
+		err = reiserfs_resize(s, blocks);
+		if (err != 0)
+			goto out_err_unlock;
+	}
+
+	if (*mount_flags & MS_RDONLY) {
+		reiserfs_write_unlock(s);
+		reiserfs_xattr_init(s, *mount_flags);
+		/* remount read-only */
+		if (s->s_flags & MS_RDONLY)
+			/* it is read-only already */
+			goto out_ok_unlocked;
+
+		err = dquot_suspend(s, -1);
+		if (err < 0)
+			goto out_err;
+
+		/* try to remount file system with read-only permissions */
+		if (sb_umount_state(rs) == REISERFS_VALID_FS
+		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
+			goto out_ok_unlocked;
+		}
+
+		reiserfs_write_lock(s);
+
+		err = journal_begin(&th, s, 10);
+		if (err)
+			goto out_err_unlock;
+
+		/* Mounting a rw partition read-only. */
+		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+		set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
+		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
+	} else {
+		/* remount read-write */
+		if (!(s->s_flags & MS_RDONLY)) {
+			reiserfs_write_unlock(s);
+			reiserfs_xattr_init(s, *mount_flags);
+			goto out_ok_unlocked;	/* We are read-write already */
+		}
+
+		if (reiserfs_is_journal_aborted(journal)) {
+			err = journal->j_errno;
+			goto out_err_unlock;
+		}
+
+		handle_data_mode(s, mount_options);
+		handle_barrier_mode(s, mount_options);
+		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
+
+		/* now it is safe to call journal_begin */
+		s->s_flags &= ~MS_RDONLY;
+		err = journal_begin(&th, s, 10);
+		if (err)
+			goto out_err_unlock;
+
+		/* Mount a partition which is read-only, read-write */
+		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
+		s->s_flags &= ~MS_RDONLY;
+		set_sb_umount_state(rs, REISERFS_ERROR_FS);
+		if (!old_format_only(s))
+			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
+		/* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
+		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
+		REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
+	}
+	/* this will force a full flush of all journal lists */
+	SB_JOURNAL(s)->j_must_wait = 1;
+	err = journal_end(&th);
+	if (err)
+		goto out_err_unlock;
+
+	reiserfs_write_unlock(s);
+	if (!(*mount_flags & MS_RDONLY)) {
+		dquot_resume(s, -1);
+		reiserfs_write_lock(s);
+		finish_unfinished(s);
+		reiserfs_write_unlock(s);
+		reiserfs_xattr_init(s, *mount_flags);
+	}
+
+out_ok_unlocked:
+	replace_mount_options(s, new_opts);
+	return 0;
+
+out_err_unlock:
+	reiserfs_write_unlock(s);
+out_err:
+	kfree(new_opts);
+	return err;
+}
+
+static int read_super_block(struct super_block *s, int offset)
+{
+	struct buffer_head *bh;
+	struct reiserfs_super_block *rs;
+	int fs_blocksize;
+
+	bh = sb_bread(s, offset / s->s_blocksize);
+	if (!bh) {
+		reiserfs_warning(s, "sh-2006",
+				 "bread failed (dev %s, block %lu, size %lu)",
+				 s->s_id, offset / s->s_blocksize,
+				 s->s_blocksize);
+		return 1;
+	}
+
+	rs = (struct reiserfs_super_block *)bh->b_data;
+	if (!is_any_reiserfs_magic_string(rs)) {
+		brelse(bh);
+		return 1;
+	}
+	/*
+	 * ok, reiserfs signature (old or new) found in at the given offset
+	 */
+	fs_blocksize = sb_blocksize(rs);
+	brelse(bh);
+	sb_set_blocksize(s, fs_blocksize);
+
+	bh = sb_bread(s, offset / s->s_blocksize);
+	if (!bh) {
+		reiserfs_warning(s, "sh-2007",
+				 "bread failed (dev %s, block %lu, size %lu)",
+				 s->s_id, offset / s->s_blocksize,
+				 s->s_blocksize);
+		return 1;
+	}
+
+	rs = (struct reiserfs_super_block *)bh->b_data;
+	if (sb_blocksize(rs) != s->s_blocksize) {
+		reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
+				 "filesystem on (dev %s, block %llu, size %lu)",
+				 s->s_id,
+				 (unsigned long long)bh->b_blocknr,
+				 s->s_blocksize);
+		brelse(bh);
+		return 1;
+	}
+
+	if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
+		brelse(bh);
+		reiserfs_warning(s, "super-6519", "Unfinished reiserfsck "
+				 "--rebuild-tree run detected. Please run\n"
+				 "reiserfsck --rebuild-tree and wait for a "
+				 "completion. If that fails\n"
+				 "get newer reiserfsprogs package");
+		return 1;
+	}
+
+	SB_BUFFER_WITH_SB(s) = bh;
+	SB_DISK_SUPER_BLOCK(s) = rs;
+
+	/*
+	 * magic is of non-standard journal filesystem, look at s_version to
+	 * find which format is in use
+	 */
+	if (is_reiserfs_jr(rs)) {
+		if (sb_version(rs) == REISERFS_VERSION_2)
+			reiserfs_info(s, "found reiserfs format \"3.6\""
+				      " with non-standard journal\n");
+		else if (sb_version(rs) == REISERFS_VERSION_1)
+			reiserfs_info(s, "found reiserfs format \"3.5\""
+				      " with non-standard journal\n");
+		else {
+			reiserfs_warning(s, "sh-2012", "found unknown "
+					 "format \"%u\" of reiserfs with "
+					 "non-standard magic", sb_version(rs));
+			return 1;
+		}
+	} else
+		/*
+		 * s_version of standard format may contain incorrect
+		 * information, so we just look at the magic string
+		 */
+		reiserfs_info(s,
+			      "found reiserfs format \"%s\" with standard journal\n",
+			      is_reiserfs_3_5(rs) ? "3.5" : "3.6");
+
+	s->s_op = &reiserfs_sops;
+	s->s_export_op = &reiserfs_export_ops;
+#ifdef CONFIG_QUOTA
+	s->s_qcop = &reiserfs_qctl_operations;
+	s->dq_op = &reiserfs_quota_operations;
+	s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+#endif
+
+	/*
+	 * new format is limited by the 32 bit wide i_blocks field, want to
+	 * be one full block below that.
+	 */
+	s->s_maxbytes = (512LL << 32) - s->s_blocksize;
+	return 0;
+}
+
+/* after journal replay, reread all bitmap and super blocks */
+static int reread_meta_blocks(struct super_block *s)
+{
+	ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s));
+	wait_on_buffer(SB_BUFFER_WITH_SB(s));
+	if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
+		reiserfs_warning(s, "reiserfs-2504", "error reading the super");
+		return 1;
+	}
+
+	return 0;
+}
+
+/* hash detection stuff */
+
+/*
+ * if root directory is empty - we set default - Yura's - hash and
+ * warn about it
+ * FIXME: we look for only one name in a directory. If tea and yura
+ * both have the same value - we ask user to send report to the
+ * mailing list
+ */
+static __u32 find_hash_out(struct super_block *s)
+{
+	int retval;
+	struct inode *inode;
+	struct cpu_key key;
+	INITIALIZE_PATH(path);
+	struct reiserfs_dir_entry de;
+	struct reiserfs_de_head *deh;
+	__u32 hash = DEFAULT_HASH;
+	__u32 deh_hashval, teahash, r5hash, yurahash;
+
+	inode = d_inode(s->s_root);
+
+	make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
+	retval = search_by_entry_key(s, &key, &path, &de);
+	if (retval == IO_ERROR) {
+		pathrelse(&path);
+		return UNSET_HASH;
+	}
+	if (retval == NAME_NOT_FOUND)
+		de.de_entry_num--;
+
+	set_de_name_and_namelen(&de);
+	deh = de.de_deh + de.de_entry_num;
+
+	if (deh_offset(deh) == DOT_DOT_OFFSET) {
+		/* allow override in this case */
+		if (reiserfs_rupasov_hash(s))
+			hash = YURA_HASH;
+		reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n");
+		goto out;
+	}
+
+	deh_hashval = GET_HASH_VALUE(deh_offset(deh));
+	r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
+	teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
+	yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
+
+	if ((teahash == r5hash && deh_hashval == r5hash) ||
+	    (teahash == yurahash && deh_hashval == yurahash) ||
+	    (r5hash == yurahash && deh_hashval == yurahash)) {
+		reiserfs_warning(s, "reiserfs-2506",
+				 "Unable to automatically detect hash "
+				 "function. Please mount with -o "
+				 "hash={tea,rupasov,r5}");
+		hash = UNSET_HASH;
+		goto out;
+	}
+
+	if (deh_hashval == yurahash)
+		hash = YURA_HASH;
+	else if (deh_hashval == teahash)
+		hash = TEA_HASH;
+	else if (deh_hashval == r5hash)
+		hash = R5_HASH;
+	else {
+		reiserfs_warning(s, "reiserfs-2506",
+				 "Unrecognised hash function");
+		hash = UNSET_HASH;
+	}
+out:
+	pathrelse(&path);
+	return hash;
+}
+
+/* finds out which hash names are sorted with */
+static int what_hash(struct super_block *s)
+{
+	__u32 code;
+
+	code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
+
+	/*
+	 * reiserfs_hash_detect() == true if any of the hash mount options
+	 * were used.  We must check them to make sure the user isn't
+	 * using a bad hash value
+	 */
+	if (code == UNSET_HASH || reiserfs_hash_detect(s))
+		code = find_hash_out(s);
+
+	if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
+		/*
+		 * detection has found the hash, and we must check against the
+		 * mount options
+		 */
+		if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
+			reiserfs_warning(s, "reiserfs-2507",
+					 "Error, %s hash detected, "
+					 "unable to force rupasov hash",
+					 reiserfs_hashname(code));
+			code = UNSET_HASH;
+		} else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
+			reiserfs_warning(s, "reiserfs-2508",
+					 "Error, %s hash detected, "
+					 "unable to force tea hash",
+					 reiserfs_hashname(code));
+			code = UNSET_HASH;
+		} else if (reiserfs_r5_hash(s) && code != R5_HASH) {
+			reiserfs_warning(s, "reiserfs-2509",
+					 "Error, %s hash detected, "
+					 "unable to force r5 hash",
+					 reiserfs_hashname(code));
+			code = UNSET_HASH;
+		}
+	} else {
+		/*
+		 * find_hash_out was not called or
+		 * could not determine the hash
+		 */
+		if (reiserfs_rupasov_hash(s)) {
+			code = YURA_HASH;
+		} else if (reiserfs_tea_hash(s)) {
+			code = TEA_HASH;
+		} else if (reiserfs_r5_hash(s)) {
+			code = R5_HASH;
+		}
+	}
+
+	/*
+	 * if we are mounted RW, and we have a new valid hash code, update
+	 * the super
+	 */
+	if (code != UNSET_HASH &&
+	    !(s->s_flags & MS_RDONLY) &&
+	    code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
+		set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
+	}
+	return code;
+}
+
+/* return pointer to appropriate function */
+static hashf_t hash_function(struct super_block *s)
+{
+	switch (what_hash(s)) {
+	case TEA_HASH:
+		reiserfs_info(s, "Using tea hash to sort names\n");
+		return keyed_hash;
+	case YURA_HASH:
+		reiserfs_info(s, "Using rupasov hash to sort names\n");
+		return yura_hash;
+	case R5_HASH:
+		reiserfs_info(s, "Using r5 hash to sort names\n");
+		return r5_hash;
+	}
+	return NULL;
+}
+
+/* this is used to set up correct value for old partitions */
+static int function2code(hashf_t func)
+{
+	if (func == keyed_hash)
+		return TEA_HASH;
+	if (func == yura_hash)
+		return YURA_HASH;
+	if (func == r5_hash)
+		return R5_HASH;
+
+	BUG();			/* should never happen */
+
+	return 0;
+}
+
+#define SWARN(silent, s, id, ...)			\
+	if (!(silent))				\
+		reiserfs_warning(s, id, __VA_ARGS__)
+
+static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct inode *root_inode;
+	struct reiserfs_transaction_handle th;
+	int old_format = 0;
+	unsigned long blocks;
+	unsigned int commit_max_age = 0;
+	int jinit_done = 0;
+	struct reiserfs_iget_args args;
+	struct reiserfs_super_block *rs;
+	char *jdev_name;
+	struct reiserfs_sb_info *sbi;
+	int errval = -EINVAL;
+	char *qf_names[REISERFS_MAXQUOTAS] = {};
+	unsigned int qfmt = 0;
+
+	save_mount_options(s, data);
+
+	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	s->s_fs_info = sbi;
+	/* Set default values for options: non-aggressive tails, RO on errors */
+	sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
+	sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
+	sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
+	/* no preallocation minimum, be smart in reiserfs_file_write instead */
+	sbi->s_alloc_options.preallocmin = 0;
+	/* Preallocate by 16 blocks (17-1) at once */
+	sbi->s_alloc_options.preallocsize = 17;
+	/* setup default block allocator options */
+	reiserfs_init_alloc_options(s);
+
+	spin_lock_init(&sbi->old_work_lock);
+	INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
+	mutex_init(&sbi->lock);
+	sbi->lock_depth = -1;
+
+	sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0,
+					 s->s_id);
+	if (!sbi->commit_wq) {
+		SWARN(silent, s, "", "Cannot allocate commit workqueue");
+		errval = -ENOMEM;
+		goto error_unlocked;
+	}
+
+	jdev_name = NULL;
+	if (reiserfs_parse_options
+	    (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name,
+	     &commit_max_age, qf_names, &qfmt) == 0) {
+		goto error_unlocked;
+	}
+	if (jdev_name && jdev_name[0]) {
+		sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
+		if (!sbi->s_jdev) {
+			SWARN(silent, s, "", "Cannot allocate memory for "
+				"journal device name");
+			goto error;
+		}
+	}
+#ifdef CONFIG_QUOTA
+	handle_quota_files(s, qf_names, &qfmt);
+#endif
+
+	if (blocks) {
+		SWARN(silent, s, "jmacd-7", "resize option for remount only");
+		goto error_unlocked;
+	}
+
+	/*
+	 * try old format (undistributed bitmap, super block in 8-th 1k
+	 * block of a device)
+	 */
+	if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
+		old_format = 1;
+
+	/*
+	 * try new format (64-th 1k block), which can contain reiserfs
+	 * super block
+	 */
+	else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
+		SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
+		      s->s_id);
+		goto error_unlocked;
+	}
+
+	rs = SB_DISK_SUPER_BLOCK(s);
+	/*
+	 * Let's do basic sanity check to verify that underlying device is not
+	 * smaller than the filesystem. If the check fails then abort and
+	 * scream, because bad stuff will happen otherwise.
+	 */
+	if (s->s_bdev && s->s_bdev->bd_inode
+	    && i_size_read(s->s_bdev->bd_inode) <
+	    sb_block_count(rs) * sb_blocksize(rs)) {
+		SWARN(silent, s, "", "Filesystem cannot be "
+		      "mounted because it is bigger than the device");
+		SWARN(silent, s, "", "You may need to run fsck "
+		      "or increase size of your LVM partition");
+		SWARN(silent, s, "", "Or may be you forgot to "
+		      "reboot after fdisk when it told you to");
+		goto error_unlocked;
+	}
+
+	sbi->s_mount_state = SB_REISERFS_STATE(s);
+	sbi->s_mount_state = REISERFS_VALID_FS;
+
+	if ((errval = reiserfs_init_bitmap_cache(s))) {
+		SWARN(silent, s, "jmacd-8", "unable to read bitmap");
+		goto error_unlocked;
+	}
+
+	errval = -EINVAL;
+#ifdef CONFIG_REISERFS_CHECK
+	SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
+	SWARN(silent, s, "", "- it is slow mode for debugging.");
+#endif
+
+	/* make data=ordered the default */
+	if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
+	    !reiserfs_data_writeback(s)) {
+		sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
+	}
+
+	if (reiserfs_data_log(s)) {
+		reiserfs_info(s, "using journaled data mode\n");
+	} else if (reiserfs_data_ordered(s)) {
+		reiserfs_info(s, "using ordered data mode\n");
+	} else {
+		reiserfs_info(s, "using writeback data mode\n");
+	}
+	if (reiserfs_barrier_flush(s)) {
+		printk("reiserfs: using flush barriers\n");
+	}
+
+	if (journal_init(s, jdev_name, old_format, commit_max_age)) {
+		SWARN(silent, s, "sh-2022",
+		      "unable to initialize journal space");
+		goto error_unlocked;
+	} else {
+		/*
+		 * once this is set, journal_release must be called
+		 * if we error out of the mount
+		 */
+		jinit_done = 1;
+	}
+
+	if (reread_meta_blocks(s)) {
+		SWARN(silent, s, "jmacd-9",
+		      "unable to reread meta blocks after journal init");
+		goto error_unlocked;
+	}
+
+	if (replay_only(s))
+		goto error_unlocked;
+
+	if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
+		SWARN(silent, s, "clm-7000",
+		      "Detected readonly device, marking FS readonly");
+		s->s_flags |= MS_RDONLY;
+	}
+	args.objectid = REISERFS_ROOT_OBJECTID;
+	args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
+	root_inode =
+	    iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
+			 reiserfs_init_locked_inode, (void *)&args);
+	if (!root_inode) {
+		SWARN(silent, s, "jmacd-10", "get root inode failed");
+		goto error_unlocked;
+	}
+
+	/*
+	 * This path assumed to be called with the BKL in the old times.
+	 * Now we have inherited the big reiserfs lock from it and many
+	 * reiserfs helpers called in the mount path and elsewhere require
+	 * this lock to be held even if it's not always necessary. Let's be
+	 * conservative and hold it early. The window can be reduced after
+	 * careful review of the code.
+	 */
+	reiserfs_write_lock(s);
+
+	if (root_inode->i_state & I_NEW) {
+		reiserfs_read_locked_inode(root_inode, &args);
+		unlock_new_inode(root_inode);
+	}
+
+	s->s_root = d_make_root(root_inode);
+	if (!s->s_root)
+		goto error;
+	/* define and initialize hash function */
+	sbi->s_hash_function = hash_function(s);
+	if (sbi->s_hash_function == NULL) {
+		dput(s->s_root);
+		s->s_root = NULL;
+		goto error;
+	}
+
+	if (is_reiserfs_3_5(rs)
+	    || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
+		set_bit(REISERFS_3_5, &sbi->s_properties);
+	else if (old_format)
+		set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties);
+	else
+		set_bit(REISERFS_3_6, &sbi->s_properties);
+
+	if (!(s->s_flags & MS_RDONLY)) {
+
+		errval = journal_begin(&th, s, 1);
+		if (errval) {
+			dput(s->s_root);
+			s->s_root = NULL;
+			goto error;
+		}
+		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+
+		set_sb_umount_state(rs, REISERFS_ERROR_FS);
+		set_sb_fs_state(rs, 0);
+
+		/*
+		 * Clear out s_bmap_nr if it would wrap. We can handle this
+		 * case, but older revisions can't. This will cause the
+		 * file system to fail mount on those older implementations,
+		 * avoiding corruption. -jeffm
+		 */
+		if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
+		    sb_bmap_nr(rs) != 0) {
+			reiserfs_warning(s, "super-2030", "This file system "
+					"claims to use %u bitmap blocks in "
+					"its super block, but requires %u. "
+					"Clearing to zero.", sb_bmap_nr(rs),
+					reiserfs_bmap_count(s));
+
+			set_sb_bmap_nr(rs, 0);
+		}
+
+		if (old_format_only(s)) {
+			/*
+			 * filesystem of format 3.5 either with standard
+			 * or non-standard journal
+			 */
+			if (convert_reiserfs(s)) {
+				/* and -o conv is given */
+				if (!silent)
+					reiserfs_info(s,
+						      "converting 3.5 filesystem to the 3.6 format");
+
+				if (is_reiserfs_3_5(rs))
+					/*
+					 * put magic string of 3.6 format.
+					 * 2.2 will not be able to
+					 * mount this filesystem anymore
+					 */
+					memcpy(rs->s_v1.s_magic,
+					       reiserfs_3_6_magic_string,
+					       sizeof
+					       (reiserfs_3_6_magic_string));
+
+				set_sb_version(rs, REISERFS_VERSION_2);
+				reiserfs_convert_objectid_map_v1(s);
+				set_bit(REISERFS_3_6, &sbi->s_properties);
+				clear_bit(REISERFS_3_5, &sbi->s_properties);
+			} else if (!silent) {
+				reiserfs_info(s, "using 3.5.x disk format\n");
+			}
+		} else
+			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
+
+
+		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
+		errval = journal_end(&th);
+		if (errval) {
+			dput(s->s_root);
+			s->s_root = NULL;
+			goto error;
+		}
+
+		reiserfs_write_unlock(s);
+		if ((errval = reiserfs_lookup_privroot(s)) ||
+		    (errval = reiserfs_xattr_init(s, s->s_flags))) {
+			dput(s->s_root);
+			s->s_root = NULL;
+			goto error_unlocked;
+		}
+		reiserfs_write_lock(s);
+
+		/*
+		 * look for files which were to be removed in previous session
+		 */
+		finish_unfinished(s);
+	} else {
+		if (old_format_only(s) && !silent) {
+			reiserfs_info(s, "using 3.5.x disk format\n");
+		}
+
+		reiserfs_write_unlock(s);
+		if ((errval = reiserfs_lookup_privroot(s)) ||
+		    (errval = reiserfs_xattr_init(s, s->s_flags))) {
+			dput(s->s_root);
+			s->s_root = NULL;
+			goto error_unlocked;
+		}
+		reiserfs_write_lock(s);
+	}
+	/*
+	 * mark hash in super block: it could be unset. overwrite should be ok
+	 */
+	set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
+
+	handle_attrs(s);
+
+	reiserfs_proc_info_init(s);
+
+	init_waitqueue_head(&(sbi->s_wait));
+	spin_lock_init(&sbi->bitmap_lock);
+
+	reiserfs_write_unlock(s);
+
+	return (0);
+
+error:
+	reiserfs_write_unlock(s);
+
+error_unlocked:
+	/* kill the commit thread, free journal ram */
+	if (jinit_done) {
+		reiserfs_write_lock(s);
+		journal_release_error(NULL, s);
+		reiserfs_write_unlock(s);
+	}
+
+	if (sbi->commit_wq)
+		destroy_workqueue(sbi->commit_wq);
+
+	cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+
+	reiserfs_free_bitmap_cache(s);
+	if (SB_BUFFER_WITH_SB(s))
+		brelse(SB_BUFFER_WITH_SB(s));
+#ifdef CONFIG_QUOTA
+	{
+		int j;
+		for (j = 0; j < REISERFS_MAXQUOTAS; j++)
+			kfree(qf_names[j]);
+	}
+#endif
+	kfree(sbi);
+
+	s->s_fs_info = NULL;
+	return errval;
+}
+
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
+
+	buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
+	buf->f_bfree = sb_free_blocks(rs);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
+	/* changed to accommodate gcc folks. */
+	buf->f_type = REISERFS_SUPER_MAGIC;
+	buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
+	buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
+				sizeof(rs->s_uuid)/2);
+
+	return 0;
+}
+
+#ifdef CONFIG_QUOTA
+static int reiserfs_write_dquot(struct dquot *dquot)
+{
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+	int depth;
+
+	reiserfs_write_lock(dquot->dq_sb);
+	ret =
+	    journal_begin(&th, dquot->dq_sb,
+			  REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+	if (ret)
+		goto out;
+	depth = reiserfs_write_unlock_nested(dquot->dq_sb);
+	ret = dquot_commit(dquot);
+	reiserfs_write_lock_nested(dquot->dq_sb, depth);
+	err = journal_end(&th);
+	if (!ret && err)
+		ret = err;
+out:
+	reiserfs_write_unlock(dquot->dq_sb);
+	return ret;
+}
+
+static int reiserfs_acquire_dquot(struct dquot *dquot)
+{
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+	int depth;
+
+	reiserfs_write_lock(dquot->dq_sb);
+	ret =
+	    journal_begin(&th, dquot->dq_sb,
+			  REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+	if (ret)
+		goto out;
+	depth = reiserfs_write_unlock_nested(dquot->dq_sb);
+	ret = dquot_acquire(dquot);
+	reiserfs_write_lock_nested(dquot->dq_sb, depth);
+	err = journal_end(&th);
+	if (!ret && err)
+		ret = err;
+out:
+	reiserfs_write_unlock(dquot->dq_sb);
+	return ret;
+}
+
+static int reiserfs_release_dquot(struct dquot *dquot)
+{
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+
+	reiserfs_write_lock(dquot->dq_sb);
+	ret =
+	    journal_begin(&th, dquot->dq_sb,
+			  REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	reiserfs_write_unlock(dquot->dq_sb);
+	if (ret) {
+		/* Release dquot anyway to avoid endless cycle in dqput() */
+		dquot_release(dquot);
+		goto out;
+	}
+	ret = dquot_release(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
+	err = journal_end(&th);
+	if (!ret && err)
+		ret = err;
+	reiserfs_write_unlock(dquot->dq_sb);
+out:
+	return ret;
+}
+
+static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
+{
+	/* Are we journaling quotas? */
+	if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
+	    REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+		dquot_mark_dquot_dirty(dquot);
+		return reiserfs_write_dquot(dquot);
+	} else
+		return dquot_mark_dquot_dirty(dquot);
+}
+
+static int reiserfs_write_info(struct super_block *sb, int type)
+{
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+	int depth;
+
+	/* Data block + inode block */
+	reiserfs_write_lock(sb);
+	ret = journal_begin(&th, sb, 2);
+	if (ret)
+		goto out;
+	depth = reiserfs_write_unlock_nested(sb);
+	ret = dquot_commit_info(sb, type);
+	reiserfs_write_lock_nested(sb, depth);
+	err = journal_end(&th);
+	if (!ret && err)
+		ret = err;
+out:
+	reiserfs_write_unlock(sb);
+	return ret;
+}
+
+/*
+ * Turn on quotas during mount time - we need to find the quota file and such...
+ */
+static int reiserfs_quota_on_mount(struct super_block *sb, int type)
+{
+	return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+					REISERFS_SB(sb)->s_jquota_fmt, type);
+}
+
+/*
+ * Standard function to be called on quota_on
+ */
+static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
+			     struct path *path)
+{
+	int err;
+	struct inode *inode;
+	struct reiserfs_transaction_handle th;
+	int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
+
+	reiserfs_write_lock(sb);
+	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Quotafile not on the same filesystem? */
+	if (path->dentry->d_sb != sb) {
+		err = -EXDEV;
+		goto out;
+	}
+	inode = d_inode(path->dentry);
+	/*
+	 * We must not pack tails for quota files on reiserfs for quota
+	 * IO to work
+	 */
+	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
+		err = reiserfs_unpack(inode, NULL);
+		if (err) {
+			reiserfs_warning(sb, "super-6520",
+				"Unpacking tail of quota file failed"
+				" (%d). Cannot turn on quotas.", err);
+			err = -EINVAL;
+			goto out;
+		}
+		mark_inode_dirty(inode);
+	}
+	/* Journaling quota? */
+	if (REISERFS_SB(sb)->s_qf_names[type]) {
+		/* Quotafile not of fs root? */
+		if (path->dentry->d_parent != sb->s_root)
+			reiserfs_warning(sb, "super-6521",
+				 "Quota file not on filesystem root. "
+				 "Journalled quota will not work.");
+	}
+
+	/*
+	 * When we journal data on quota file, we have to flush journal to see
+	 * all updates to the file when we bypass pagecache...
+	 */
+	if (reiserfs_file_data_log(inode)) {
+		/* Just start temporary transaction and finish it */
+		err = journal_begin(&th, sb, 1);
+		if (err)
+			goto out;
+		err = journal_end_sync(&th);
+		if (err)
+			goto out;
+	}
+	reiserfs_write_unlock(sb);
+	return dquot_quota_on(sb, type, format_id, path);
+out:
+	reiserfs_write_unlock(sb);
+	return err;
+}
+
+/*
+ * Read data from quotafile - avoid pagecache and such because we cannot afford
+ * acquiring the locks... As quota files are never truncated and quota code
+ * itself serializes the operations (and no one else should touch the files)
+ * we don't have to be afraid of races
+ */
+static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
+				   size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	unsigned long blk = off >> sb->s_blocksize_bits;
+	int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
+	size_t toread;
+	struct buffer_head tmp_bh, *bh;
+	loff_t i_size = i_size_read(inode);
+
+	if (off > i_size)
+		return 0;
+	if (off + len > i_size)
+		len = i_size - off;
+	toread = len;
+	while (toread > 0) {
+		tocopy =
+		    sb->s_blocksize - offset <
+		    toread ? sb->s_blocksize - offset : toread;
+		tmp_bh.b_state = 0;
+		/*
+		 * Quota files are without tails so we can safely
+		 * use this function
+		 */
+		reiserfs_write_lock(sb);
+		err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
+		reiserfs_write_unlock(sb);
+		if (err)
+			return err;
+		if (!buffer_mapped(&tmp_bh))	/* A hole? */
+			memset(data, 0, tocopy);
+		else {
+			bh = sb_bread(sb, tmp_bh.b_blocknr);
+			if (!bh)
+				return -EIO;
+			memcpy(data, bh->b_data + offset, tocopy);
+			brelse(bh);
+		}
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/*
+ * Write to quotafile (we know the transaction is already started and has
+ * enough credits)
+ */
+static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
+				    const char *data, size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	unsigned long blk = off >> sb->s_blocksize_bits;
+	int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
+	int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
+	size_t towrite = len;
+	struct buffer_head tmp_bh, *bh;
+
+	if (!current->journal_info) {
+		printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	while (towrite > 0) {
+		tocopy = sb->s_blocksize - offset < towrite ?
+		    sb->s_blocksize - offset : towrite;
+		tmp_bh.b_state = 0;
+		reiserfs_write_lock(sb);
+		err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
+		reiserfs_write_unlock(sb);
+		if (err)
+			goto out;
+		if (offset || tocopy != sb->s_blocksize)
+			bh = sb_bread(sb, tmp_bh.b_blocknr);
+		else
+			bh = sb_getblk(sb, tmp_bh.b_blocknr);
+		if (!bh) {
+			err = -EIO;
+			goto out;
+		}
+		lock_buffer(bh);
+		memcpy(bh->b_data + offset, data, tocopy);
+		flush_dcache_page(bh->b_page);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		reiserfs_write_lock(sb);
+		reiserfs_prepare_for_journal(sb, bh, 1);
+		journal_mark_dirty(current->journal_info, bh);
+		if (!journal_quota)
+			reiserfs_add_ordered_list(inode, bh);
+		reiserfs_write_unlock(sb);
+		brelse(bh);
+		offset = 0;
+		towrite -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+out:
+	if (len == towrite)
+		return err;
+	if (inode->i_size < off + len - towrite)
+		i_size_write(inode, off + len - towrite);
+	inode->i_version++;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	return len - towrite;
+}
+
+#endif
+
+static struct dentry *get_super_block(struct file_system_type *fs_type,
+			   int flags, const char *dev_name,
+			   void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
+}
+
+static int __init init_reiserfs_fs(void)
+{
+	int ret;
+
+	ret = init_inodecache();
+	if (ret)
+		return ret;
+
+	reiserfs_proc_info_global_init();
+
+	ret = register_filesystem(&reiserfs_fs_type);
+	if (ret)
+		goto out;
+
+	return 0;
+out:
+	reiserfs_proc_info_global_done();
+	destroy_inodecache();
+
+	return ret;
+}
+
+static void __exit exit_reiserfs_fs(void)
+{
+	reiserfs_proc_info_global_done();
+	unregister_filesystem(&reiserfs_fs_type);
+	destroy_inodecache();
+}
+
+struct file_system_type reiserfs_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "reiserfs",
+	.mount = get_super_block,
+	.kill_sb = reiserfs_kill_sb,
+	.fs_flags = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("reiserfs");
+
+MODULE_DESCRIPTION("ReiserFS journaled filesystem");
+MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
+MODULE_LICENSE("GPL");
+
+module_init(init_reiserfs_fs);
+module_exit(exit_reiserfs_fs);
diff --git a/kernel/fs/reiserfs/tail_conversion.c b/kernel/fs/reiserfs/tail_conversion.c
new file mode 100644
index 000000000..f41e19b4b
--- /dev/null
+++ b/kernel/fs/reiserfs/tail_conversion.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
+ * details
+ */
+
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include "reiserfs.h"
+
+/*
+ * access to tail : when one is going to read tail it must make sure, that is
+ * not running.  direct2indirect and indirect2direct can not run concurrently
+ */
+
+/*
+ * Converts direct items to an unformatted node. Panics if file has no
+ * tail. -ENOSPC if no disk space for conversion
+ */
+/*
+ * path points to first direct item of the file regardless of how many of
+ * them are there
+ */
+int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
+		    struct treepath *path, struct buffer_head *unbh,
+		    loff_t tail_offset)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *up_to_date_bh;
+	struct item_head *p_le_ih = tp_item_head(path);
+	unsigned long total_tail = 0;
+
+	/* Key to search for the last byte of the converted item. */
+	struct cpu_key end_key;
+
+	/*
+	 * new indirect item to be inserted or key
+	 * of unfm pointer to be pasted
+	 */
+	struct item_head ind_ih;
+	int blk_size;
+	/* returned value for reiserfs_insert_item and clones */
+	int  retval;
+	/* Handle on an unformatted node that will be inserted in the tree. */
+	unp_t unfm_ptr;
+
+	BUG_ON(!th->t_trans_id);
+
+	REISERFS_SB(sb)->s_direct2indirect++;
+
+	blk_size = sb->s_blocksize;
+
+	/*
+	 * and key to search for append or insert pointer to the new
+	 * unformatted node.
+	 */
+	copy_item_head(&ind_ih, p_le_ih);
+	set_le_ih_k_offset(&ind_ih, tail_offset);
+	set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
+
+	/* Set the key to search for the place for new unfm pointer */
+	make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
+
+	/* FIXME: we could avoid this */
+	if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
+		reiserfs_error(sb, "PAP-14030",
+			       "pasted or inserted byte exists in "
+			       "the tree %K. Use fsck to repair.", &end_key);
+		pathrelse(path);
+		return -EIO;
+	}
+
+	p_le_ih = tp_item_head(path);
+
+	unfm_ptr = cpu_to_le32(unbh->b_blocknr);
+
+	if (is_statdata_le_ih(p_le_ih)) {
+		/* Insert new indirect item. */
+		set_ih_free_space(&ind_ih, 0);	/* delete at nearest future */
+		put_ih_item_len(&ind_ih, UNFM_P_SIZE);
+		PATH_LAST_POSITION(path)++;
+		retval =
+		    reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
+					 (char *)&unfm_ptr);
+	} else {
+		/* Paste into last indirect item of an object. */
+		retval = reiserfs_paste_into_item(th, path, &end_key, inode,
+						    (char *)&unfm_ptr,
+						    UNFM_P_SIZE);
+	}
+	if (retval) {
+		return retval;
+	}
+	/*
+	 * note: from here there are two keys which have matching first
+	 *  three key components. They only differ by the fourth one.
+	 */
+
+	/* Set the key to search for the direct items of the file */
+	make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
+		     4);
+
+	/*
+	 * Move bytes from the direct items to the new unformatted node
+	 * and delete them.
+	 */
+	while (1) {
+		int tail_size;
+
+		/*
+		 * end_key.k_offset is set so, that we will always have found
+		 * last item of the file
+		 */
+		if (search_for_position_by_key(sb, &end_key, path) ==
+		    POSITION_FOUND)
+			reiserfs_panic(sb, "PAP-14050",
+				       "direct item (%K) not found", &end_key);
+		p_le_ih = tp_item_head(path);
+		RFALSE(!is_direct_le_ih(p_le_ih),
+		       "vs-14055: direct item expected(%K), found %h",
+		       &end_key, p_le_ih);
+		tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
+		    + ih_item_len(p_le_ih) - 1;
+
+		/*
+		 * we only send the unbh pointer if the buffer is not
+		 * up to date.  this avoids overwriting good data from
+		 * writepage() with old data from the disk or buffer cache
+		 * Special case: unbh->b_page will be NULL if we are coming
+		 * through DIRECT_IO handler here.
+		 */
+		if (!unbh->b_page || buffer_uptodate(unbh)
+		    || PageUptodate(unbh->b_page)) {
+			up_to_date_bh = NULL;
+		} else {
+			up_to_date_bh = unbh;
+		}
+		retval = reiserfs_delete_item(th, path, &end_key, inode,
+						up_to_date_bh);
+
+		total_tail += retval;
+
+		/* done: file does not have direct items anymore */
+		if (tail_size == retval)
+			break;
+
+	}
+	/*
+	 * if we've copied bytes from disk into the page, we need to zero
+	 * out the unused part of the block (it was not up to date before)
+	 */
+	if (up_to_date_bh) {
+		unsigned pgoff =
+		    (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
+		char *kaddr = kmap_atomic(up_to_date_bh->b_page);
+		memset(kaddr + pgoff, 0, blk_size - total_tail);
+		kunmap_atomic(kaddr);
+	}
+
+	REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
+
+	return 0;
+}
+
+/* stolen from fs/buffer.c */
+void reiserfs_unmap_buffer(struct buffer_head *bh)
+{
+	lock_buffer(bh);
+	if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+		BUG();
+	}
+	clear_buffer_dirty(bh);
+	/*
+	 * Remove the buffer from whatever list it belongs to. We are mostly
+	 * interested in removing it from per-sb j_dirty_buffers list, to avoid
+	 * BUG() on attempt to write not mapped buffer
+	 */
+	if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
+		struct inode *inode = bh->b_page->mapping->host;
+		struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
+		spin_lock(&j->j_dirty_buffers_lock);
+		list_del_init(&bh->b_assoc_buffers);
+		reiserfs_free_jh(bh);
+		spin_unlock(&j->j_dirty_buffers_lock);
+	}
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	bh->b_bdev = NULL;
+	unlock_buffer(bh);
+}
+
+/*
+ * this first locks inode (neither reads nor sync are permitted),
+ * reads tail through page cache, insert direct item. When direct item
+ * inserted successfully inode is left locked. Return value is always
+ * what we expect from it (number of cut bytes). But when tail remains
+ * in the unformatted node, we set mode to SKIP_BALANCING and unlock
+ * inode
+ */
+int indirect2direct(struct reiserfs_transaction_handle *th,
+		    struct inode *inode, struct page *page,
+		    struct treepath *path,	/* path to the indirect item. */
+		    const struct cpu_key *item_key,	/* Key to look for
+							 * unformatted node
+							 * pointer to be cut. */
+		    loff_t n_new_file_size,	/* New file size. */
+		    char *mode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct item_head s_ih;
+	unsigned long block_size = sb->s_blocksize;
+	char *tail;
+	int tail_len, round_tail_len;
+	loff_t pos, pos1;	/* position of first byte of the tail */
+	struct cpu_key key;
+
+	BUG_ON(!th->t_trans_id);
+
+	REISERFS_SB(sb)->s_indirect2direct++;
+
+	*mode = M_SKIP_BALANCING;
+
+	/* store item head path points to. */
+	copy_item_head(&s_ih, tp_item_head(path));
+
+	tail_len = (n_new_file_size & (block_size - 1));
+	if (get_inode_sd_version(inode) == STAT_DATA_V2)
+		round_tail_len = ROUND_UP(tail_len);
+	else
+		round_tail_len = tail_len;
+
+	pos =
+	    le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
+					 1) * sb->s_blocksize;
+	pos1 = pos;
+
+	/*
+	 * we are protected by i_mutex. The tail can not disapper, not
+	 * append can be done either
+	 * we are in truncate or packing tail in file_release
+	 */
+
+	tail = (char *)kmap(page);	/* this can schedule */
+
+	if (path_changed(&s_ih, path)) {
+		/* re-search indirect item */
+		if (search_for_position_by_key(sb, item_key, path)
+		    == POSITION_NOT_FOUND)
+			reiserfs_panic(sb, "PAP-5520",
+				       "item to be converted %K does not exist",
+				       item_key);
+		copy_item_head(&s_ih, tp_item_head(path));
+#ifdef CONFIG_REISERFS_CHECK
+		pos = le_ih_k_offset(&s_ih) - 1 +
+		    (ih_item_len(&s_ih) / UNFM_P_SIZE -
+		     1) * sb->s_blocksize;
+		if (pos != pos1)
+			reiserfs_panic(sb, "vs-5530", "tail position "
+				       "changed while we were reading it");
+#endif
+	}
+
+	/* Set direct item header to insert. */
+	make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode),
+			  pos1 + 1, TYPE_DIRECT, round_tail_len,
+			  0xffff /*ih_free_space */ );
+
+	/*
+	 * we want a pointer to the first byte of the tail in the page.
+	 * the page was locked and this part of the page was up to date when
+	 * indirect2direct was called, so we know the bytes are still valid
+	 */
+	tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
+
+	PATH_LAST_POSITION(path)++;
+
+	key = *item_key;
+	set_cpu_key_k_type(&key, TYPE_DIRECT);
+	key.key_length = 4;
+	/* Insert tail as new direct item in the tree */
+	if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
+				 tail ? tail : NULL) < 0) {
+		/*
+		 * No disk memory. So we can not convert last unformatted node
+		 * to the direct item.  In this case we used to adjust
+		 * indirect items's ih_free_space. Now ih_free_space is not
+		 * used, it would be ideal to write zeros to corresponding
+		 * unformatted node. For now i_size is considered as guard for
+		 * going out of file size
+		 */
+		kunmap(page);
+		return block_size - round_tail_len;
+	}
+	kunmap(page);
+
+	/* make sure to get the i_blocks changes from reiserfs_insert_item */
+	reiserfs_update_sd(th, inode);
+
+	/*
+	 * note: we have now the same as in above direct2indirect
+	 * conversion: there are two keys which have matching first three
+	 * key components. They only differ by the fourth one.
+	 */
+
+	/*
+	 * We have inserted new direct item and must remove last
+	 * unformatted node.
+	 */
+	*mode = M_CUT;
+
+	/* we store position of first direct item in the in-core inode */
+	/* mark_file_with_tail (inode, pos1 + 1); */
+	REISERFS_I(inode)->i_first_direct_byte = pos1 + 1;
+
+	return block_size - round_tail_len;
+}
diff --git a/kernel/fs/reiserfs/xattr.c b/kernel/fs/reiserfs/xattr.c
new file mode 100644
index 000000000..e87f9b52b
--- /dev/null
+++ b/kernel/fs/reiserfs/xattr.c
@@ -0,0 +1,1064 @@
+/*
+ * linux/fs/reiserfs/xattr.c
+ *
+ * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com>
+ *
+ */
+
+/*
+ * In order to implement EA/ACLs in a clean, backwards compatible manner,
+ * they are implemented as files in a "private" directory.
+ * Each EA is in it's own file, with the directory layout like so (/ is assumed
+ * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory,
+ * directories named using the capital-hex form of the objectid and
+ * generation number are used. Inside each directory are individual files
+ * named with the name of the extended attribute.
+ *
+ * So, for objectid 12648430, we could have:
+ * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access
+ * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default
+ * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type
+ * .. or similar.
+ *
+ * The file contents are the text of the EA. The size is known based on the
+ * stat data describing the file.
+ *
+ * In the case of system.posix_acl_access and system.posix_acl_default, since
+ * these are special cases for filesystem ACLs, they are interpreted by the
+ * kernel, in addition, they are negatively and positively cached and attached
+ * to the inode so that unnecessary lookups are avoided.
+ *
+ * Locking works like so:
+ * Directory components (xattr root, xattr dir) are protectd by their i_mutex.
+ * The xattrs themselves are protected by the xattr_sem.
+ */
+
+#include "reiserfs.h"
+#include <linux/capability.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include <linux/uaccess.h>
+#include <net/checksum.h>
+#include <linux/stat.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+
+#define PRIVROOT_NAME ".reiserfs_priv"
+#define XAROOT_NAME   "xattrs"
+
+
+/*
+ * Helpers for inode ops. We do this so that we don't have all the VFS
+ * overhead and also for proper i_mutex annotation.
+ * dir->i_mutex must be held for all of them.
+ */
+#ifdef CONFIG_REISERFS_FS_XATTR
+static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
+{
+	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	return dir->i_op->create(dir, dentry, mode, true);
+}
+#endif
+
+static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	return dir->i_op->mkdir(dir, dentry, mode);
+}
+
+/*
+ * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
+ * mutation ops aren't called during rename or splace, which are the
+ * only other users of I_MUTEX_CHILD. It violates the ordering, but that's
+ * better than allocating another subclass just for this code.
+ */
+static int xattr_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int error;
+
+	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+
+	mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+	error = dir->i_op->unlink(dir, dentry);
+	mutex_unlock(&d_inode(dentry)->i_mutex);
+
+	if (!error)
+		d_delete(dentry);
+	return error;
+}
+
+static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int error;
+
+	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+
+	mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+	error = dir->i_op->rmdir(dir, dentry);
+	if (!error)
+		d_inode(dentry)->i_flags |= S_DEAD;
+	mutex_unlock(&d_inode(dentry)->i_mutex);
+	if (!error)
+		d_delete(dentry);
+
+	return error;
+}
+
+#define xattr_may_create(flags)	(!flags || flags & XATTR_CREATE)
+
+static struct dentry *open_xa_root(struct super_block *sb, int flags)
+{
+	struct dentry *privroot = REISERFS_SB(sb)->priv_root;
+	struct dentry *xaroot;
+
+	if (d_really_is_negative(privroot))
+		return ERR_PTR(-ENODATA);
+
+	mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR);
+
+	xaroot = dget(REISERFS_SB(sb)->xattr_root);
+	if (!xaroot)
+		xaroot = ERR_PTR(-ENODATA);
+	else if (d_really_is_negative(xaroot)) {
+		int err = -ENODATA;
+
+		if (xattr_may_create(flags))
+			err = xattr_mkdir(d_inode(privroot), xaroot, 0700);
+		if (err) {
+			dput(xaroot);
+			xaroot = ERR_PTR(err);
+		}
+	}
+
+	mutex_unlock(&d_inode(privroot)->i_mutex);
+	return xaroot;
+}
+
+static struct dentry *open_xa_dir(const struct inode *inode, int flags)
+{
+	struct dentry *xaroot, *xadir;
+	char namebuf[17];
+
+	xaroot = open_xa_root(inode->i_sb, flags);
+	if (IS_ERR(xaroot))
+		return xaroot;
+
+	snprintf(namebuf, sizeof(namebuf), "%X.%X",
+		 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
+		 inode->i_generation);
+
+	mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR);
+
+	xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
+	if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
+		int err = -ENODATA;
+
+		if (xattr_may_create(flags))
+			err = xattr_mkdir(d_inode(xaroot), xadir, 0700);
+		if (err) {
+			dput(xadir);
+			xadir = ERR_PTR(err);
+		}
+	}
+
+	mutex_unlock(&d_inode(xaroot)->i_mutex);
+	dput(xaroot);
+	return xadir;
+}
+
+/*
+ * The following are side effects of other operations that aren't explicitly
+ * modifying extended attributes. This includes operations such as permissions
+ * or ownership changes, object deletions, etc.
+ */
+struct reiserfs_dentry_buf {
+	struct dir_context ctx;
+	struct dentry *xadir;
+	int count;
+	struct dentry *dentries[8];
+};
+
+static int
+fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
+		   loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct reiserfs_dentry_buf *dbuf =
+		container_of(ctx, struct reiserfs_dentry_buf, ctx);
+	struct dentry *dentry;
+
+	WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex));
+
+	if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
+		return -ENOSPC;
+
+	if (name[0] == '.' && (namelen < 2 ||
+			       (namelen == 2 && name[1] == '.')))
+		return 0;
+
+	dentry = lookup_one_len(name, dbuf->xadir, namelen);
+	if (IS_ERR(dentry)) {
+		return PTR_ERR(dentry);
+	} else if (d_really_is_negative(dentry)) {
+		/* A directory entry exists, but no file? */
+		reiserfs_error(dentry->d_sb, "xattr-20003",
+			       "Corrupted directory: xattr %pd listed but "
+			       "not found for file %pd.\n",
+			       dentry, dbuf->xadir);
+		dput(dentry);
+		return -EIO;
+	}
+
+	dbuf->dentries[dbuf->count++] = dentry;
+	return 0;
+}
+
+static void
+cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
+{
+	int i;
+
+	for (i = 0; i < buf->count; i++)
+		if (buf->dentries[i])
+			dput(buf->dentries[i]);
+}
+
+static int reiserfs_for_each_xattr(struct inode *inode,
+				   int (*action)(struct dentry *, void *),
+				   void *data)
+{
+	struct dentry *dir;
+	int i, err = 0;
+	struct reiserfs_dentry_buf buf = {
+		.ctx.actor = fill_with_dentries,
+	};
+
+	/* Skip out, an xattr has no xattrs associated with it */
+	if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
+		return 0;
+
+	dir = open_xa_dir(inode, XATTR_REPLACE);
+	if (IS_ERR(dir)) {
+		err = PTR_ERR(dir);
+		goto out;
+	} else if (d_really_is_negative(dir)) {
+		err = 0;
+		goto out_dir;
+	}
+
+	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+
+	buf.xadir = dir;
+	while (1) {
+		err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
+		if (err)
+			break;
+		if (!buf.count)
+			break;
+		for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
+			struct dentry *dentry = buf.dentries[i];
+
+			if (!d_is_dir(dentry))
+				err = action(dentry, data);
+
+			dput(dentry);
+			buf.dentries[i] = NULL;
+		}
+		if (err)
+			break;
+		buf.count = 0;
+	}
+	mutex_unlock(&d_inode(dir)->i_mutex);
+
+	cleanup_dentry_buf(&buf);
+
+	if (!err) {
+		/*
+		 * We start a transaction here to avoid a ABBA situation
+		 * between the xattr root's i_mutex and the journal lock.
+		 * This doesn't incur much additional overhead since the
+		 * new transaction will just nest inside the
+		 * outer transaction.
+		 */
+		int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
+			     4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
+		struct reiserfs_transaction_handle th;
+
+		reiserfs_write_lock(inode->i_sb);
+		err = journal_begin(&th, inode->i_sb, blocks);
+		reiserfs_write_unlock(inode->i_sb);
+		if (!err) {
+			int jerror;
+
+			mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex,
+					  I_MUTEX_XATTR);
+			err = action(dir, data);
+			reiserfs_write_lock(inode->i_sb);
+			jerror = journal_end(&th);
+			reiserfs_write_unlock(inode->i_sb);
+			mutex_unlock(&d_inode(dir->d_parent)->i_mutex);
+			err = jerror ?: err;
+		}
+	}
+out_dir:
+	dput(dir);
+out:
+	/* -ENODATA isn't an error */
+	if (err == -ENODATA)
+		err = 0;
+	return err;
+}
+
+static int delete_one_xattr(struct dentry *dentry, void *data)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+
+	/* This is the xattr dir, handle specially. */
+	if (d_is_dir(dentry))
+		return xattr_rmdir(dir, dentry);
+
+	return xattr_unlink(dir, dentry);
+}
+
+static int chown_one_xattr(struct dentry *dentry, void *data)
+{
+	struct iattr *attrs = data;
+	int ia_valid = attrs->ia_valid;
+	int err;
+
+	/*
+	 * We only want the ownership bits. Otherwise, we'll do
+	 * things like change a directory to a regular file if
+	 * ATTR_MODE is set.
+	 */
+	attrs->ia_valid &= (ATTR_UID|ATTR_GID);
+	err = reiserfs_setattr(dentry, attrs);
+	attrs->ia_valid = ia_valid;
+
+	return err;
+}
+
+/* No i_mutex, but the inode is unconnected. */
+int reiserfs_delete_xattrs(struct inode *inode)
+{
+	int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
+
+	if (err)
+		reiserfs_warning(inode->i_sb, "jdm-20004",
+				 "Couldn't delete all xattrs (%d)\n", err);
+	return err;
+}
+
+/* inode->i_mutex: down */
+int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
+{
+	int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
+
+	if (err)
+		reiserfs_warning(inode->i_sb, "jdm-20007",
+				 "Couldn't chown all xattrs (%d)\n", err);
+	return err;
+}
+
+#ifdef CONFIG_REISERFS_FS_XATTR
+/*
+ * Returns a dentry corresponding to a specific extended attribute file
+ * for the inode. If flags allow, the file is created. Otherwise, a
+ * valid or negative dentry, or an error is returned.
+ */
+static struct dentry *xattr_lookup(struct inode *inode, const char *name,
+				    int flags)
+{
+	struct dentry *xadir, *xafile;
+	int err = 0;
+
+	xadir = open_xa_dir(inode, flags);
+	if (IS_ERR(xadir))
+		return ERR_CAST(xadir);
+
+	mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+	xafile = lookup_one_len(name, xadir, strlen(name));
+	if (IS_ERR(xafile)) {
+		err = PTR_ERR(xafile);
+		goto out;
+	}
+
+	if (d_really_is_positive(xafile) && (flags & XATTR_CREATE))
+		err = -EEXIST;
+
+	if (d_really_is_negative(xafile)) {
+		err = -ENODATA;
+		if (xattr_may_create(flags))
+			err = xattr_create(d_inode(xadir), xafile,
+					      0700|S_IFREG);
+	}
+
+	if (err)
+		dput(xafile);
+out:
+	mutex_unlock(&d_inode(xadir)->i_mutex);
+	dput(xadir);
+	if (err)
+		return ERR_PTR(err);
+	return xafile;
+}
+
+/* Internal operations on file data */
+static inline void reiserfs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+static struct page *reiserfs_get_page(struct inode *dir, size_t n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page;
+	/*
+	 * We can deadlock if we try to free dentries,
+	 * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
+	 */
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	reiserfs_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+static inline __u32 xattr_hash(const char *msg, int len)
+{
+	return csum_partial(msg, len, 0);
+}
+
+int reiserfs_commit_write(struct file *f, struct page *page,
+			  unsigned from, unsigned to);
+
+static void update_ctime(struct inode *inode)
+{
+	struct timespec now = current_fs_time(inode->i_sb);
+
+	if (inode_unhashed(inode) || !inode->i_nlink ||
+	    timespec_equal(&inode->i_ctime, &now))
+		return;
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+}
+
+static int lookup_and_delete_xattr(struct inode *inode, const char *name)
+{
+	int err = 0;
+	struct dentry *dentry, *xadir;
+
+	xadir = open_xa_dir(inode, XATTR_REPLACE);
+	if (IS_ERR(xadir))
+		return PTR_ERR(xadir);
+
+	mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+	dentry = lookup_one_len(name, xadir, strlen(name));
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_dput;
+	}
+
+	if (d_really_is_positive(dentry)) {
+		err = xattr_unlink(d_inode(xadir), dentry);
+		update_ctime(inode);
+	}
+
+	dput(dentry);
+out_dput:
+	mutex_unlock(&d_inode(xadir)->i_mutex);
+	dput(xadir);
+	return err;
+}
+
+
+/* Generic extended attribute operations that can be used by xa plugins */
+
+/*
+ * inode->i_mutex: down
+ */
+int
+reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
+			  struct inode *inode, const char *name,
+			  const void *buffer, size_t buffer_size, int flags)
+{
+	int err = 0;
+	struct dentry *dentry;
+	struct page *page;
+	char *data;
+	size_t file_pos = 0;
+	size_t buffer_pos = 0;
+	size_t new_size;
+	__u32 xahash = 0;
+
+	if (get_inode_sd_version(inode) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	if (!buffer) {
+		err = lookup_and_delete_xattr(inode, name);
+		return err;
+	}
+
+	dentry = xattr_lookup(inode, name, flags);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	down_write(&REISERFS_I(inode)->i_xattr_sem);
+
+	xahash = xattr_hash(buffer, buffer_size);
+	while (buffer_pos < buffer_size || buffer_pos == 0) {
+		size_t chunk;
+		size_t skip = 0;
+		size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
+
+		if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
+			chunk = PAGE_CACHE_SIZE;
+		else
+			chunk = buffer_size - buffer_pos;
+
+		page = reiserfs_get_page(d_inode(dentry), file_pos);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto out_unlock;
+		}
+
+		lock_page(page);
+		data = page_address(page);
+
+		if (file_pos == 0) {
+			struct reiserfs_xattr_header *rxh;
+
+			skip = file_pos = sizeof(struct reiserfs_xattr_header);
+			if (chunk + skip > PAGE_CACHE_SIZE)
+				chunk = PAGE_CACHE_SIZE - skip;
+			rxh = (struct reiserfs_xattr_header *)data;
+			rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
+			rxh->h_hash = cpu_to_le32(xahash);
+		}
+
+		reiserfs_write_lock(inode->i_sb);
+		err = __reiserfs_write_begin(page, page_offset, chunk + skip);
+		if (!err) {
+			if (buffer)
+				memcpy(data + skip, buffer + buffer_pos, chunk);
+			err = reiserfs_commit_write(NULL, page, page_offset,
+						    page_offset + chunk +
+						    skip);
+		}
+		reiserfs_write_unlock(inode->i_sb);
+		unlock_page(page);
+		reiserfs_put_page(page);
+		buffer_pos += chunk;
+		file_pos += chunk;
+		skip = 0;
+		if (err || buffer_size == 0 || !buffer)
+			break;
+	}
+
+	new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
+	if (!err && new_size < i_size_read(d_inode(dentry))) {
+		struct iattr newattrs = {
+			.ia_ctime = current_fs_time(inode->i_sb),
+			.ia_size = new_size,
+			.ia_valid = ATTR_SIZE | ATTR_CTIME,
+		};
+
+		mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR);
+		inode_dio_wait(d_inode(dentry));
+
+		err = reiserfs_setattr(dentry, &newattrs);
+		mutex_unlock(&d_inode(dentry)->i_mutex);
+	} else
+		update_ctime(inode);
+out_unlock:
+	up_write(&REISERFS_I(inode)->i_xattr_sem);
+	dput(dentry);
+	return err;
+}
+
+/* We need to start a transaction to maintain lock ordering */
+int reiserfs_xattr_set(struct inode *inode, const char *name,
+		       const void *buffer, size_t buffer_size, int flags)
+{
+
+	struct reiserfs_transaction_handle th;
+	int error, error2;
+	size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
+
+	if (!(flags & XATTR_REPLACE))
+		jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
+
+	reiserfs_write_lock(inode->i_sb);
+	error = journal_begin(&th, inode->i_sb, jbegin_count);
+	reiserfs_write_unlock(inode->i_sb);
+	if (error) {
+		return error;
+	}
+
+	error = reiserfs_xattr_set_handle(&th, inode, name,
+					  buffer, buffer_size, flags);
+
+	reiserfs_write_lock(inode->i_sb);
+	error2 = journal_end(&th);
+	reiserfs_write_unlock(inode->i_sb);
+	if (error == 0)
+		error = error2;
+
+	return error;
+}
+
+/*
+ * inode->i_mutex: down
+ */
+int
+reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
+		   size_t buffer_size)
+{
+	ssize_t err = 0;
+	struct dentry *dentry;
+	size_t isize;
+	size_t file_pos = 0;
+	size_t buffer_pos = 0;
+	struct page *page;
+	__u32 hash = 0;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	/*
+	 * We can't have xattrs attached to v1 items since they don't have
+	 * generation numbers
+	 */
+	if (get_inode_sd_version(inode) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	dentry = xattr_lookup(inode, name, XATTR_REPLACE);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out;
+	}
+
+	down_read(&REISERFS_I(inode)->i_xattr_sem);
+
+	isize = i_size_read(d_inode(dentry));
+
+	/* Just return the size needed */
+	if (buffer == NULL) {
+		err = isize - sizeof(struct reiserfs_xattr_header);
+		goto out_unlock;
+	}
+
+	if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
+		err = -ERANGE;
+		goto out_unlock;
+	}
+
+	while (file_pos < isize) {
+		size_t chunk;
+		char *data;
+		size_t skip = 0;
+
+		if (isize - file_pos > PAGE_CACHE_SIZE)
+			chunk = PAGE_CACHE_SIZE;
+		else
+			chunk = isize - file_pos;
+
+		page = reiserfs_get_page(d_inode(dentry), file_pos);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto out_unlock;
+		}
+
+		lock_page(page);
+		data = page_address(page);
+		if (file_pos == 0) {
+			struct reiserfs_xattr_header *rxh =
+			    (struct reiserfs_xattr_header *)data;
+			skip = file_pos = sizeof(struct reiserfs_xattr_header);
+			chunk -= skip;
+			/* Magic doesn't match up.. */
+			if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
+				unlock_page(page);
+				reiserfs_put_page(page);
+				reiserfs_warning(inode->i_sb, "jdm-20001",
+						 "Invalid magic for xattr (%s) "
+						 "associated with %k", name,
+						 INODE_PKEY(inode));
+				err = -EIO;
+				goto out_unlock;
+			}
+			hash = le32_to_cpu(rxh->h_hash);
+		}
+		memcpy(buffer + buffer_pos, data + skip, chunk);
+		unlock_page(page);
+		reiserfs_put_page(page);
+		file_pos += chunk;
+		buffer_pos += chunk;
+		skip = 0;
+	}
+	err = isize - sizeof(struct reiserfs_xattr_header);
+
+	if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
+	    hash) {
+		reiserfs_warning(inode->i_sb, "jdm-20002",
+				 "Invalid hash for xattr (%s) associated "
+				 "with %k", name, INODE_PKEY(inode));
+		err = -EIO;
+	}
+
+out_unlock:
+	up_read(&REISERFS_I(inode)->i_xattr_sem);
+	dput(dentry);
+
+out:
+	return err;
+}
+
+/*
+ * In order to implement different sets of xattr operations for each xattr
+ * prefix with the generic xattr API, a filesystem should create a
+ * null-terminated array of struct xattr_handler (one for each prefix) and
+ * hang a pointer to it off of the s_xattr field of the superblock.
+ *
+ * The generic_fooxattr() functions will use this list to dispatch xattr
+ * operations to the correct xattr_handler.
+ */
+#define for_each_xattr_handler(handlers, handler)		\
+		for ((handler) = *(handlers)++;			\
+			(handler) != NULL;			\
+			(handler) = *(handlers)++)
+
+/* This is the implementation for the xattr plugin infrastructure */
+static inline const struct xattr_handler *
+find_xattr_handler_prefix(const struct xattr_handler **handlers,
+			   const char *name)
+{
+	const struct xattr_handler *xah;
+
+	if (!handlers)
+		return NULL;
+
+	for_each_xattr_handler(handlers, xah) {
+		if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0)
+			break;
+	}
+
+	return xah;
+}
+
+
+/*
+ * Inode operation getxattr()
+ */
+ssize_t
+reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
+		  size_t size)
+{
+	const struct xattr_handler *handler;
+
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
+
+	if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	return handler->get(dentry, name, buffer, size, handler->flags);
+}
+
+/*
+ * Inode operation setxattr()
+ *
+ * d_inode(dentry)->i_mutex down
+ */
+int
+reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		  size_t size, int flags)
+{
+	const struct xattr_handler *handler;
+
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
+
+	if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	return handler->set(dentry, name, value, size, flags, handler->flags);
+}
+
+/*
+ * Inode operation removexattr()
+ *
+ * d_inode(dentry)->i_mutex down
+ */
+int reiserfs_removexattr(struct dentry *dentry, const char *name)
+{
+	const struct xattr_handler *handler;
+
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
+
+	if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
+}
+
+struct listxattr_buf {
+	struct dir_context ctx;
+	size_t size;
+	size_t pos;
+	char *buf;
+	struct dentry *dentry;
+};
+
+static int listxattr_filler(struct dir_context *ctx, const char *name,
+			    int namelen, loff_t offset, u64 ino,
+			    unsigned int d_type)
+{
+	struct listxattr_buf *b =
+		container_of(ctx, struct listxattr_buf, ctx);
+	size_t size;
+
+	if (name[0] != '.' ||
+	    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
+		const struct xattr_handler *handler;
+
+		handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
+						    name);
+		if (!handler)	/* Unsupported xattr name */
+			return 0;
+		if (b->buf) {
+			size = handler->list(b->dentry, b->buf + b->pos,
+					 b->size, name, namelen,
+					 handler->flags);
+			if (size > b->size)
+				return -ERANGE;
+		} else {
+			size = handler->list(b->dentry, NULL, 0, name,
+					     namelen, handler->flags);
+		}
+
+		b->pos += size;
+	}
+	return 0;
+}
+
+/*
+ * Inode operation listxattr()
+ *
+ * We totally ignore the generic listxattr here because it would be stupid
+ * not to. Since the xattrs are organized in a directory, we can just
+ * readdir to find them.
+ */
+ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
+{
+	struct dentry *dir;
+	int err = 0;
+	struct listxattr_buf buf = {
+		.ctx.actor = listxattr_filler,
+		.dentry = dentry,
+		.buf = buffer,
+		.size = buffer ? size : 0,
+	};
+
+	if (d_really_is_negative(dentry))
+		return -EINVAL;
+
+	if (!dentry->d_sb->s_xattr ||
+	    get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
+	if (IS_ERR(dir)) {
+		err = PTR_ERR(dir);
+		if (err == -ENODATA)
+			err = 0;  /* Not an error if there aren't any xattrs */
+		goto out;
+	}
+
+	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+	err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
+	mutex_unlock(&d_inode(dir)->i_mutex);
+
+	if (!err)
+		err = buf.pos;
+
+	dput(dir);
+out:
+	return err;
+}
+
+static int create_privroot(struct dentry *dentry)
+{
+	int err;
+	struct inode *inode = d_inode(dentry->d_parent);
+
+	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+
+	err = xattr_mkdir(inode, dentry, 0700);
+	if (err || d_really_is_negative(dentry)) {
+		reiserfs_warning(dentry->d_sb, "jdm-20006",
+				 "xattrs/ACLs enabled and couldn't "
+				 "find/create .reiserfs_priv. "
+				 "Failing mount.");
+		return -EOPNOTSUPP;
+	}
+
+	d_inode(dentry)->i_flags |= S_PRIVATE;
+	reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
+		      "storage.\n", PRIVROOT_NAME);
+
+	return 0;
+}
+
+#else
+int __init reiserfs_xattr_register_handlers(void) { return 0; }
+void reiserfs_xattr_unregister_handlers(void) {}
+static int create_privroot(struct dentry *dentry) { return 0; }
+#endif
+
+/* Actual operations that are exported to VFS-land */
+static const struct xattr_handler *reiserfs_xattr_handlers[] = {
+#ifdef CONFIG_REISERFS_FS_XATTR
+	&reiserfs_xattr_user_handler,
+	&reiserfs_xattr_trusted_handler,
+#endif
+#ifdef CONFIG_REISERFS_FS_SECURITY
+	&reiserfs_xattr_security_handler,
+#endif
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	NULL
+};
+
+static int xattr_mount_check(struct super_block *s)
+{
+	/*
+	 * We need generation numbers to ensure that the oid mapping is correct
+	 * v3.5 filesystems don't have them.
+	 */
+	if (old_format_only(s)) {
+		if (reiserfs_xattrs_optional(s)) {
+			/*
+			 * Old format filesystem, but optional xattrs have
+			 * been enabled. Error out.
+			 */
+			reiserfs_warning(s, "jdm-2005",
+					 "xattrs/ACLs not supported "
+					 "on pre-v3.6 format filesystems. "
+					 "Failing mount.");
+			return -EOPNOTSUPP;
+		}
+	}
+
+	return 0;
+}
+
+int reiserfs_permission(struct inode *inode, int mask)
+{
+	/*
+	 * We don't do permission checks on the internal objects.
+	 * Permissions are determined by the "owning" object.
+	 */
+	if (IS_PRIVATE(inode))
+		return 0;
+
+	return generic_permission(inode, mask);
+}
+
+static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	return -EPERM;
+}
+
+static const struct dentry_operations xattr_lookup_poison_ops = {
+	.d_revalidate = xattr_hide_revalidate,
+};
+
+int reiserfs_lookup_privroot(struct super_block *s)
+{
+	struct dentry *dentry;
+	int err = 0;
+
+	/* If we don't have the privroot located yet - go find it */
+	mutex_lock(&d_inode(s->s_root)->i_mutex);
+	dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
+				strlen(PRIVROOT_NAME));
+	if (!IS_ERR(dentry)) {
+		REISERFS_SB(s)->priv_root = dentry;
+		d_set_d_op(dentry, &xattr_lookup_poison_ops);
+		if (d_really_is_positive(dentry))
+			d_inode(dentry)->i_flags |= S_PRIVATE;
+	} else
+		err = PTR_ERR(dentry);
+	mutex_unlock(&d_inode(s->s_root)->i_mutex);
+
+	return err;
+}
+
+/*
+ * We need to take a copy of the mount flags since things like
+ * MS_RDONLY don't get set until *after* we're called.
+ * mount_flags != mount_options
+ */
+int reiserfs_xattr_init(struct super_block *s, int mount_flags)
+{
+	int err = 0;
+	struct dentry *privroot = REISERFS_SB(s)->priv_root;
+
+	err = xattr_mount_check(s);
+	if (err)
+		goto error;
+
+	if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) {
+		mutex_lock(&d_inode(s->s_root)->i_mutex);
+		err = create_privroot(REISERFS_SB(s)->priv_root);
+		mutex_unlock(&d_inode(s->s_root)->i_mutex);
+	}
+
+	if (d_really_is_positive(privroot)) {
+		s->s_xattr = reiserfs_xattr_handlers;
+		mutex_lock(&d_inode(privroot)->i_mutex);
+		if (!REISERFS_SB(s)->xattr_root) {
+			struct dentry *dentry;
+
+			dentry = lookup_one_len(XAROOT_NAME, privroot,
+						strlen(XAROOT_NAME));
+			if (!IS_ERR(dentry))
+				REISERFS_SB(s)->xattr_root = dentry;
+			else
+				err = PTR_ERR(dentry);
+		}
+		mutex_unlock(&d_inode(privroot)->i_mutex);
+	}
+
+error:
+	if (err) {
+		clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt);
+		clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
+	}
+
+	/* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
+	if (reiserfs_posixacl(s))
+		s->s_flags |= MS_POSIXACL;
+	else
+		s->s_flags &= ~MS_POSIXACL;
+
+	return err;
+}
diff --git a/kernel/fs/reiserfs/xattr.h b/kernel/fs/reiserfs/xattr.h
new file mode 100644
index 000000000..15dde6262
--- /dev/null
+++ b/kernel/fs/reiserfs/xattr.h
@@ -0,0 +1,122 @@
+#include <linux/reiserfs_xattr.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+
+struct inode;
+struct dentry;
+struct iattr;
+struct super_block;
+
+int reiserfs_xattr_register_handlers(void) __init;
+void reiserfs_xattr_unregister_handlers(void);
+int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
+int reiserfs_lookup_privroot(struct super_block *sb);
+int reiserfs_delete_xattrs(struct inode *inode);
+int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
+int reiserfs_permission(struct inode *inode, int mask);
+
+#ifdef CONFIG_REISERFS_FS_XATTR
+#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
+ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name,
+			  void *buffer, size_t size);
+int reiserfs_setxattr(struct dentry *dentry, const char *name,
+		      const void *value, size_t size, int flags);
+ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int reiserfs_removexattr(struct dentry *dentry, const char *name);
+
+int reiserfs_xattr_get(struct inode *, const char *, void *, size_t);
+int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
+int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *,
+			      struct inode *, const char *, const void *,
+			      size_t, int);
+
+extern const struct xattr_handler reiserfs_xattr_user_handler;
+extern const struct xattr_handler reiserfs_xattr_trusted_handler;
+extern const struct xattr_handler reiserfs_xattr_security_handler;
+#ifdef CONFIG_REISERFS_FS_SECURITY
+int reiserfs_security_init(struct inode *dir, struct inode *inode,
+			   const struct qstr *qstr,
+			   struct reiserfs_security_handle *sec);
+int reiserfs_security_write(struct reiserfs_transaction_handle *th,
+			    struct inode *inode,
+			    struct reiserfs_security_handle *sec);
+void reiserfs_security_free(struct reiserfs_security_handle *sec);
+#endif
+
+static inline int reiserfs_xattrs_initialized(struct super_block *sb)
+{
+	return REISERFS_SB(sb)->priv_root != NULL;
+}
+
+#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
+static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
+{
+	loff_t ret = 0;
+	if (reiserfs_file_data_log(inode)) {
+		ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize);
+		ret >>= inode->i_sb->s_blocksize_bits;
+	}
+	return ret;
+}
+
+/*
+ * We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
+ * Let's try to be smart about it.
+ * xattr root: We cache it. If it's not cached, we may need to create it.
+ * xattr dir: If anything has been loaded for this inode, we can set a flag
+ *            saying so.
+ * xattr file: Since we don't cache xattrs, we can't tell. We always include
+ *             blocks for it.
+ *
+ * However, since root and dir can be created between calls - YOU MUST SAVE
+ * THIS VALUE.
+ */
+static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode)
+{
+	size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
+
+	if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) {
+		nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
+		if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root))
+			nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
+	}
+
+	return nblocks;
+}
+
+static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
+{
+	init_rwsem(&REISERFS_I(inode)->i_xattr_sem);
+}
+
+#else
+
+#define reiserfs_getxattr NULL
+#define reiserfs_setxattr NULL
+#define reiserfs_listxattr NULL
+#define reiserfs_removexattr NULL
+
+static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
+{
+}
+#endif  /*  CONFIG_REISERFS_FS_XATTR  */
+
+#ifndef CONFIG_REISERFS_FS_SECURITY
+static inline int reiserfs_security_init(struct inode *dir,
+					 struct inode *inode,
+					 const struct qstr *qstr,
+					 struct reiserfs_security_handle *sec)
+{
+	return 0;
+}
+static inline int
+reiserfs_security_write(struct reiserfs_transaction_handle *th,
+			struct inode *inode,
+			struct reiserfs_security_handle *sec)
+{
+	return 0;
+}
+static inline void reiserfs_security_free(struct reiserfs_security_handle *sec)
+{}
+#endif
diff --git a/kernel/fs/reiserfs/xattr_acl.c b/kernel/fs/reiserfs/xattr_acl.c
new file mode 100644
index 000000000..4b34b9dc0
--- /dev/null
+++ b/kernel/fs/reiserfs/xattr_acl.c
@@ -0,0 +1,407 @@
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
+#include "reiserfs.h"
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include <linux/uaccess.h>
+
+static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
+			    struct inode *inode, int type,
+			    struct posix_acl *acl);
+
+
+int
+reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int error, error2;
+	struct reiserfs_transaction_handle th;
+	size_t jcreate_blocks;
+	int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
+
+
+	/*
+	 * Pessimism: We can't assume that anything from the xattr root up
+	 * has been created.
+	 */
+
+	jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
+			 reiserfs_xattr_nblocks(inode, size) * 2;
+
+	reiserfs_write_lock(inode->i_sb);
+	error = journal_begin(&th, inode->i_sb, jcreate_blocks);
+	reiserfs_write_unlock(inode->i_sb);
+	if (error == 0) {
+		error = __reiserfs_set_acl(&th, inode, type, acl);
+		reiserfs_write_lock(inode->i_sb);
+		error2 = journal_end(&th);
+		reiserfs_write_unlock(inode->i_sb);
+		if (error2)
+			error = error2;
+	}
+
+	return error;
+}
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
+{
+	const char *end = (char *)value + size;
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(reiserfs_acl_header))
+		return ERR_PTR(-EINVAL);
+	if (((reiserfs_acl_header *) value)->a_version !=
+	    cpu_to_le32(REISERFS_ACL_VERSION))
+		return ERR_PTR(-EINVAL);
+	value = (char *)value + sizeof(reiserfs_acl_header);
+	count = reiserfs_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value;
+		if ((char *)value + sizeof(reiserfs_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		switch (acl->a_entries[n].e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			value = (char *)value +
+			    sizeof(reiserfs_acl_entry_short);
+			break;
+
+		case ACL_USER:
+			value = (char *)value + sizeof(reiserfs_acl_entry);
+			if ((char *)value > end)
+				goto fail;
+			acl->a_entries[n].e_uid = 
+				make_kuid(&init_user_ns,
+					  le32_to_cpu(entry->e_id));
+			break;
+		case ACL_GROUP:
+			value = (char *)value + sizeof(reiserfs_acl_entry);
+			if ((char *)value > end)
+				goto fail;
+			acl->a_entries[n].e_gid =
+				make_kgid(&init_user_ns,
+					  le32_to_cpu(entry->e_id));
+			break;
+
+		default:
+			goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
+{
+	reiserfs_acl_header *ext_acl;
+	char *e;
+	int n;
+
+	*size = reiserfs_acl_size(acl->a_count);
+	ext_acl = kmalloc(sizeof(reiserfs_acl_header) +
+						  acl->a_count *
+						  sizeof(reiserfs_acl_entry),
+						  GFP_NOFS);
+	if (!ext_acl)
+		return ERR_PTR(-ENOMEM);
+	ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
+	e = (char *)ext_acl + sizeof(reiserfs_acl_header);
+	for (n = 0; n < acl->a_count; n++) {
+		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+		reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e;
+		entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		switch (acl->a_entries[n].e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+				from_kuid(&init_user_ns, acl_e->e_uid));
+			e += sizeof(reiserfs_acl_entry);
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+				from_kgid(&init_user_ns, acl_e->e_gid));
+			e += sizeof(reiserfs_acl_entry);
+			break;
+
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			e += sizeof(reiserfs_acl_entry_short);
+			break;
+
+		default:
+			goto fail;
+		}
+	}
+	return (char *)ext_acl;
+
+fail:
+	kfree(ext_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Inode operation get_posix_acl().
+ *
+ * inode->i_mutex: down
+ * BKL held [before 2.5.x]
+ */
+struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
+{
+	char *name, *value;
+	struct posix_acl *acl;
+	int size;
+	int retval;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+
+	size = reiserfs_xattr_get(inode, name, NULL, 0);
+	if (size < 0) {
+		if (size == -ENODATA || size == -ENOSYS) {
+			set_cached_acl(inode, type, NULL);
+			return NULL;
+		}
+		return ERR_PTR(size);
+	}
+
+	value = kmalloc(size, GFP_NOFS);
+	if (!value)
+		return ERR_PTR(-ENOMEM);
+
+	retval = reiserfs_xattr_get(inode, name, value, size);
+	if (retval == -ENODATA || retval == -ENOSYS) {
+		/*
+		 * This shouldn't actually happen as it should have
+		 * been caught above.. but just in case
+		 */
+		acl = NULL;
+	} else if (retval < 0) {
+		acl = ERR_PTR(retval);
+	} else {
+		acl = reiserfs_posix_acl_from_disk(value, retval);
+	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	kfree(value);
+	return acl;
+}
+
+/*
+ * Inode operation set_posix_acl().
+ *
+ * inode->i_mutex: down
+ * BKL held [before 2.5.x]
+ */
+static int
+__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
+		 int type, struct posix_acl *acl)
+{
+	char *name;
+	void *value = NULL;
+	size_t size = 0;
+	int error;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			error = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (error < 0)
+				return error;
+			else {
+				if (error == 0)
+					acl = NULL;
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = reiserfs_posix_acl_to_disk(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0);
+
+	/*
+	 * Ensure that the inode gets dirtied if we're only using
+	 * the mode bits and an old ACL didn't exist. We don't need
+	 * to check if the inode is hashed here since we won't get
+	 * called by reiserfs_inherit_default_acl().
+	 */
+	if (error == -ENODATA) {
+		error = 0;
+		if (type == ACL_TYPE_ACCESS) {
+			inode->i_ctime = CURRENT_TIME_SEC;
+			mark_inode_dirty(inode);
+		}
+	}
+
+	kfree(value);
+
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
+	return error;
+}
+
+/*
+ * dir->i_mutex: locked,
+ * inode is new and not released into the wild yet
+ */
+int
+reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
+			     struct inode *dir, struct dentry *dentry,
+			     struct inode *inode)
+{
+	struct posix_acl *default_acl, *acl;
+	int err = 0;
+
+	/* ACLs only get applied to files and directories */
+	if (S_ISLNK(inode->i_mode))
+		return 0;
+
+	/*
+	 * ACLs can only be used on "new" objects, so if it's an old object
+	 * there is nothing to inherit from
+	 */
+	if (get_inode_sd_version(dir) == STAT_DATA_V1)
+		goto apply_umask;
+
+	/*
+	 * Don't apply ACLs to objects in the .reiserfs_priv tree.. This
+	 * would be useless since permissions are ignored, and a pain because
+	 * it introduces locking cycles
+	 */
+	if (IS_PRIVATE(dir)) {
+		inode->i_flags |= S_PRIVATE;
+		goto apply_umask;
+	}
+
+	err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (err)
+		return err;
+
+	if (default_acl) {
+		err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
+					 default_acl);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!err)
+			err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
+						 acl);
+		posix_acl_release(acl);
+	}
+
+	return err;
+
+apply_umask:
+	/* no ACL, apply umask */
+	inode->i_mode &= ~current_umask();
+	return err;
+}
+
+/* This is used to cache the default acl before a new object is created.
+ * The biggest reason for this is to get an idea of how many blocks will
+ * actually be required for the create operation if we must inherit an ACL.
+ * An ACL write can add up to 3 object creations and an additional file write
+ * so we'd prefer not to reserve that many blocks in the journal if we can.
+ * It also has the advantage of not loading the ACL with a transaction open,
+ * this may seem silly, but if the owner of the directory is doing the
+ * creation, the ACL may not be loaded since the permissions wouldn't require
+ * it.
+ * We return the number of blocks required for the transaction.
+ */
+int reiserfs_cache_default_acl(struct inode *inode)
+{
+	struct posix_acl *acl;
+	int nblocks = 0;
+
+	if (IS_PRIVATE(inode))
+		return 0;
+
+	acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
+
+	if (acl && !IS_ERR(acl)) {
+		int size = reiserfs_acl_size(acl->a_count);
+
+		/* Other xattrs can be created during inode creation. We don't
+		 * want to claim too many blocks, so we check to see if we
+		 * we need to create the tree to the xattrs, and then we
+		 * just want two files. */
+		nblocks = reiserfs_xattr_jcreate_nblocks(inode);
+		nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
+
+		REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
+
+		/* We need to account for writes + bitmaps for two files */
+		nblocks += reiserfs_xattr_nblocks(inode, size) * 4;
+		posix_acl_release(acl);
+	}
+
+	return nblocks;
+}
+
+/*
+ * Called under i_mutex
+ */
+int reiserfs_acl_chmod(struct inode *inode)
+{
+	if (IS_PRIVATE(inode))
+		return 0;
+	if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
+	    !reiserfs_posixacl(inode->i_sb))
+		return 0;
+
+	return posix_acl_chmod(inode, inode->i_mode);
+}
diff --git a/kernel/fs/reiserfs/xattr_security.c b/kernel/fs/reiserfs/xattr_security.c
new file mode 100644
index 000000000..9a3b0616f
--- /dev/null
+++ b/kernel/fs/reiserfs/xattr_security.c
@@ -0,0 +1,120 @@
+#include "reiserfs.h"
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+#include "xattr.h"
+#include <linux/security.h>
+#include <linux/uaccess.h>
+
+static int
+security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+		int handler_flags)
+{
+	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
+		return -EINVAL;
+
+	if (IS_PRIVATE(d_inode(dentry)))
+		return -EPERM;
+
+	return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
+}
+
+static int
+security_set(struct dentry *dentry, const char *name, const void *buffer,
+	     size_t size, int flags, int handler_flags)
+{
+	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
+		return -EINVAL;
+
+	if (IS_PRIVATE(d_inode(dentry)))
+		return -EPERM;
+
+	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
+}
+
+static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
+			    const char *name, size_t namelen, int handler_flags)
+{
+	const size_t len = namelen + 1;
+
+	if (IS_PRIVATE(d_inode(dentry)))
+		return 0;
+
+	if (list && len <= list_len) {
+		memcpy(list, name, namelen);
+		list[namelen] = '\0';
+	}
+
+	return len;
+}
+
+/* Initializes the security context for a new inode and returns the number
+ * of blocks needed for the transaction. If successful, reiserfs_security
+ * must be released using reiserfs_security_free when the caller is done. */
+int reiserfs_security_init(struct inode *dir, struct inode *inode,
+			   const struct qstr *qstr,
+			   struct reiserfs_security_handle *sec)
+{
+	int blocks = 0;
+	int error;
+
+	sec->name = NULL;
+
+	/* Don't add selinux attributes on xattrs - they'll never get used */
+	if (IS_PRIVATE(dir))
+		return 0;
+
+	error = security_old_inode_init_security(inode, dir, qstr, &sec->name,
+						 &sec->value, &sec->length);
+	if (error) {
+		if (error == -EOPNOTSUPP)
+			error = 0;
+
+		sec->name = NULL;
+		sec->value = NULL;
+		sec->length = 0;
+		return error;
+	}
+
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
+		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
+			 reiserfs_xattr_nblocks(inode, sec->length);
+		/* We don't want to count the directories twice if we have
+		 * a default ACL. */
+		REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
+	}
+	return blocks;
+}
+
+int reiserfs_security_write(struct reiserfs_transaction_handle *th,
+			    struct inode *inode,
+			    struct reiserfs_security_handle *sec)
+{
+	int error;
+	if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX))
+		return -EINVAL;
+
+	error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value,
+					  sec->length, XATTR_CREATE);
+	if (error == -ENODATA || error == -EOPNOTSUPP)
+		error = 0;
+
+	return error;
+}
+
+void reiserfs_security_free(struct reiserfs_security_handle *sec)
+{
+	kfree(sec->name);
+	kfree(sec->value);
+	sec->name = NULL;
+	sec->value = NULL;
+}
+
+const struct xattr_handler reiserfs_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.get = security_get,
+	.set = security_set,
+	.list = security_list,
+};
diff --git a/kernel/fs/reiserfs/xattr_trusted.c b/kernel/fs/reiserfs/xattr_trusted.c
new file mode 100644
index 000000000..e4f134371
--- /dev/null
+++ b/kernel/fs/reiserfs/xattr_trusted.c
@@ -0,0 +1,56 @@
+#include "reiserfs.h"
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/xattr.h>
+#include "xattr.h"
+#include <linux/uaccess.h>
+
+static int
+trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+	    int handler_flags)
+{
+	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+		return -EPERM;
+
+	return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
+}
+
+static int
+trusted_set(struct dentry *dentry, const char *name, const void *buffer,
+	    size_t size, int flags, int handler_flags)
+{
+	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+		return -EPERM;
+
+	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
+}
+
+static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
+			   const char *name, size_t name_len, int handler_flags)
+{
+	const size_t len = name_len + 1;
+
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+		return 0;
+
+	if (list && len <= list_size) {
+		memcpy(list, name, name_len);
+		list[name_len] = '\0';
+	}
+	return len;
+}
+
+const struct xattr_handler reiserfs_xattr_trusted_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.get = trusted_get,
+	.set = trusted_set,
+	.list = trusted_list,
+};
diff --git a/kernel/fs/reiserfs/xattr_user.c b/kernel/fs/reiserfs/xattr_user.c
new file mode 100644
index 000000000..d0b08d3e5
--- /dev/null
+++ b/kernel/fs/reiserfs/xattr_user.c
@@ -0,0 +1,52 @@
+#include "reiserfs.h"
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/xattr.h>
+#include "xattr.h"
+#include <linux/uaccess.h>
+
+static int
+user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+	 int handler_flags)
+{
+
+	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
+		return -EINVAL;
+	if (!reiserfs_xattrs_user(dentry->d_sb))
+		return -EOPNOTSUPP;
+	return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
+}
+
+static int
+user_set(struct dentry *dentry, const char *name, const void *buffer,
+	 size_t size, int flags, int handler_flags)
+{
+	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
+		return -EINVAL;
+
+	if (!reiserfs_xattrs_user(dentry->d_sb))
+		return -EOPNOTSUPP;
+	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
+}
+
+static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len, int handler_flags)
+{
+	const size_t len = name_len + 1;
+
+	if (!reiserfs_xattrs_user(dentry->d_sb))
+		return 0;
+	if (list && len <= list_size) {
+		memcpy(list, name, name_len);
+		list[name_len] = '\0';
+	}
+	return len;
+}
+
+const struct xattr_handler reiserfs_xattr_user_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.get = user_get,
+	.set = user_set,
+	.list = user_list,
+};
diff --git a/kernel/fs/romfs/Kconfig b/kernel/fs/romfs/Kconfig
new file mode 100644
index 000000000..ce2d6bcc6
--- /dev/null
+++ b/kernel/fs/romfs/Kconfig
@@ -0,0 +1,62 @@
+config ROMFS_FS
+	tristate "ROM file system support"
+	depends on BLOCK || MTD
+	---help---
+	  This is a very small read-only file system mainly intended for
+	  initial ram disks of installation disks, but it could be used for
+	  other read-only media as well.  Read
+	  <file:Documentation/filesystems/romfs.txt> for details.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called romfs.  Note that the file system of your
+	  root partition (the one containing the directory /) cannot be a
+	  module.
+
+	  If you don't know whether you need it, then you don't need it:
+	  answer N.
+
+#
+# Select the backing stores to be supported
+#
+choice
+	prompt "RomFS backing stores"
+	depends on ROMFS_FS
+	default ROMFS_BACKED_BY_BLOCK
+	help
+	  Select the backing stores to be supported.
+
+config ROMFS_BACKED_BY_BLOCK
+	bool "Block device-backed ROM file system support"
+	depends on BLOCK
+	help
+	  This permits ROMFS to use block devices buffered through the page
+	  cache as the medium from which to retrieve data.  It does not allow
+	  direct mapping of the medium.
+
+	  If unsure, answer Y.
+
+config ROMFS_BACKED_BY_MTD
+	bool "MTD-backed ROM file system support"
+	depends on MTD=y || (ROMFS_FS=m && MTD)
+	help
+	  This permits ROMFS to use MTD based devices directly, without the
+	  intercession of the block layer (which may have been disabled).  It
+	  also allows direct mapping of MTD devices through romfs files under
+	  NOMMU conditions if the underlying device is directly addressable by
+	  the CPU.
+
+	  If unsure, answer Y.
+
+config ROMFS_BACKED_BY_BOTH
+	bool "Both the above"
+	depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
+endchoice
+
+
+config ROMFS_ON_BLOCK
+	bool
+	default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
+
+config ROMFS_ON_MTD
+	bool
+	default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/kernel/fs/romfs/Makefile b/kernel/fs/romfs/Makefile
new file mode 100644
index 000000000..420beb7d4
--- /dev/null
+++ b/kernel/fs/romfs/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the linux RomFS filesystem routines.
+#
+
+obj-$(CONFIG_ROMFS_FS) += romfs.o
+
+romfs-y := storage.o super.o
+
+ifneq ($(CONFIG_MMU),y)
+romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
+endif
+
diff --git a/kernel/fs/romfs/internal.h b/kernel/fs/romfs/internal.h
new file mode 100644
index 000000000..95217b830
--- /dev/null
+++ b/kernel/fs/romfs/internal.h
@@ -0,0 +1,47 @@
+/* RomFS internal definitions
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/romfs_fs.h>
+
+struct romfs_inode_info {
+	struct inode	vfs_inode;
+	unsigned long	i_metasize;	/* size of non-data area */
+	unsigned long	i_dataoffset;	/* from the start of fs */
+};
+
+static inline size_t romfs_maxsize(struct super_block *sb)
+{
+	return (size_t) (unsigned long) sb->s_fs_info;
+}
+
+static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
+{
+	return container_of(inode, struct romfs_inode_info, vfs_inode);
+}
+
+/*
+ * mmap-nommu.c
+ */
+#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
+extern const struct file_operations romfs_ro_fops;
+#else
+#define romfs_ro_fops	generic_ro_fops
+#endif
+
+/*
+ * storage.c
+ */
+extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen);
+extern ssize_t romfs_dev_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t maxlen);
+extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
+			    const char *str, size_t size);
diff --git a/kernel/fs/romfs/mmap-nommu.c b/kernel/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000..1118a0dc6
--- /dev/null
+++ b/kernel/fs/romfs/mmap-nommu.c
@@ -0,0 +1,89 @@
+/* NOMMU mmap support for RomFS on MTD devices
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/mtd/super.h>
+#include "internal.h"
+
+/*
+ * try to determine where a shared mapping can be made
+ * - only supported for NOMMU at the moment (MMU can't doesn't copy private
+ *   mappings)
+ * - attempts to map through to the underlying MTD device
+ */
+static unsigned long romfs_get_unmapped_area(struct file *file,
+					     unsigned long addr,
+					     unsigned long len,
+					     unsigned long pgoff,
+					     unsigned long flags)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct mtd_info *mtd = inode->i_sb->s_mtd;
+	unsigned long isize, offset, maxpages, lpages;
+	int ret;
+
+	if (!mtd)
+		return (unsigned long) -ENOSYS;
+
+	/* the mapping mustn't extend beyond the EOF */
+	lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	isize = i_size_read(inode);
+	offset = pgoff << PAGE_SHIFT;
+
+	maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if ((pgoff >= maxpages) || (maxpages - pgoff < lpages))
+		return (unsigned long) -EINVAL;
+
+	if (addr != 0)
+		return (unsigned long) -EINVAL;
+
+	if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
+		return (unsigned long) -EINVAL;
+
+	offset += ROMFS_I(inode)->i_dataoffset;
+	if (offset >= mtd->size)
+		return (unsigned long) -EINVAL;
+	/* the mapping mustn't extend beyond the EOF */
+	if ((offset + len) > mtd->size)
+		len = mtd->size - offset;
+
+	ret = mtd_get_unmapped_area(mtd, len, offset, flags);
+	if (ret == -EOPNOTSUPP)
+		ret = -ENOSYS;
+	return (unsigned long) ret;
+}
+
+/*
+ * permit a R/O mapping to be made directly through onto an MTD device if
+ * possible
+ */
+static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
+}
+
+static unsigned romfs_mmap_capabilities(struct file *file)
+{
+	struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
+
+	if (!mtd)
+		return NOMMU_MAP_COPY;
+	return mtd_mmap_capabilities(mtd);
+}
+
+const struct file_operations romfs_ro_fops = {
+	.llseek			= generic_file_llseek,
+	.read_iter		= generic_file_read_iter,
+	.splice_read		= generic_file_splice_read,
+	.mmap			= romfs_mmap,
+	.get_unmapped_area	= romfs_get_unmapped_area,
+	.mmap_capabilities	= romfs_mmap_capabilities,
+};
diff --git a/kernel/fs/romfs/storage.c b/kernel/fs/romfs/storage.c
new file mode 100644
index 000000000..f86f51f99
--- /dev/null
+++ b/kernel/fs/romfs/storage.c
@@ -0,0 +1,293 @@
+/* RomFS storage access routines
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/mtd/super.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
+#error no ROMFS backing store interface configured
+#endif
+
+#ifdef CONFIG_ROMFS_ON_MTD
+#define ROMFS_MTD_READ(sb, ...) mtd_read((sb)->s_mtd, ##__VA_ARGS__)
+
+/*
+ * read data from an romfs image on an MTD device
+ */
+static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen)
+{
+	size_t rlen;
+	int ret;
+
+	ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
+	return (ret < 0 || rlen != buflen) ? -EIO : 0;
+}
+
+/*
+ * determine the length of a string in a romfs image on an MTD device
+ */
+static ssize_t romfs_mtd_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t maxlen)
+{
+	ssize_t n = 0;
+	size_t segment;
+	u_char buf[16], *p;
+	size_t len;
+	int ret;
+
+	/* scan the string up to 16 bytes at a time */
+	while (maxlen > 0) {
+		segment = min_t(size_t, maxlen, 16);
+		ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
+		if (ret < 0)
+			return ret;
+		p = memchr(buf, 0, len);
+		if (p)
+			return n + (p - buf);
+		maxlen -= len;
+		pos += len;
+		n += len;
+	}
+
+	return n;
+}
+
+/*
+ * compare a string to one in a romfs image on MTD
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos,
+			    const char *str, size_t size)
+{
+	u_char buf[17];
+	size_t len, segment;
+	int ret;
+
+	/* scan the string up to 16 bytes at a time, and attempt to grab the
+	 * trailing NUL whilst we're at it */
+	buf[0] = 0xff;
+
+	while (size > 0) {
+		segment = min_t(size_t, size + 1, 17);
+		ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
+		if (ret < 0)
+			return ret;
+		len--;
+		if (memcmp(buf, str, len) != 0)
+			return 0;
+		buf[0] = buf[len];
+		size -= len;
+		pos += len;
+		str += len;
+	}
+
+	/* check the trailing NUL was */
+	if (buf[0])
+		return 0;
+
+	return 1;
+}
+#endif /* CONFIG_ROMFS_ON_MTD */
+
+#ifdef CONFIG_ROMFS_ON_BLOCK
+/*
+ * read data from an romfs image on a block device
+ */
+static int romfs_blk_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	size_t segment;
+
+	/* copy the string up to blocksize bytes at a time */
+	while (buflen > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, buflen, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		memcpy(buf, bh->b_data + offset, segment);
+		brelse(bh);
+		buf += segment;
+		buflen -= segment;
+		pos += segment;
+	}
+
+	return 0;
+}
+
+/*
+ * determine the length of a string in romfs on a block device
+ */
+static ssize_t romfs_blk_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t limit)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	ssize_t n = 0;
+	size_t segment;
+	u_char *buf, *p;
+
+	/* scan the string up to blocksize bytes at a time */
+	while (limit > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, limit, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		buf = bh->b_data + offset;
+		p = memchr(buf, 0, segment);
+		brelse(bh);
+		if (p)
+			return n + (p - buf);
+		limit -= segment;
+		pos += segment;
+		n += segment;
+	}
+
+	return n;
+}
+
+/*
+ * compare a string to one in a romfs image on a block device
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos,
+			    const char *str, size_t size)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	size_t segment;
+	bool matched, terminated = false;
+
+	/* compare string up to a block at a time */
+	while (size > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, size, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		matched = (memcmp(bh->b_data + offset, str, segment) == 0);
+
+		size -= segment;
+		pos += segment;
+		str += segment;
+		if (matched && size == 0 && offset + segment < ROMBSIZE) {
+			if (!bh->b_data[offset + segment])
+				terminated = true;
+			else
+				matched = false;
+		}
+		brelse(bh);
+		if (!matched)
+			return 0;
+	}
+
+	if (!terminated) {
+		/* the terminating NUL must be on the first byte of the next
+		 * block */
+		BUG_ON((pos & (ROMBSIZE - 1)) != 0);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		matched = !bh->b_data[0];
+		brelse(bh);
+		if (!matched)
+			return 0;
+	}
+
+	return 1;
+}
+#endif /* CONFIG_ROMFS_ON_BLOCK */
+
+/*
+ * read data from the romfs image
+ */
+int romfs_dev_read(struct super_block *sb, unsigned long pos,
+		   void *buf, size_t buflen)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (buflen > limit - pos)
+		buflen = limit - pos;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_read(sb, pos, buf, buflen);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_read(sb, pos, buf, buflen);
+#endif
+	return -EIO;
+}
+
+/*
+ * determine the length of a string in romfs
+ */
+ssize_t romfs_dev_strnlen(struct super_block *sb,
+			  unsigned long pos, size_t maxlen)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (maxlen > limit - pos)
+		maxlen = limit - pos;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_strnlen(sb, pos, maxlen);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_strnlen(sb, pos, maxlen);
+#endif
+	return -EIO;
+}
+
+/*
+ * compare a string to one in romfs
+ * - the string to be compared to, str, may not be NUL-terminated; instead the
+ *   string is of the specified size
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
+		     const char *str, size_t size)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (size > ROMFS_MAXFN)
+		return -ENAMETOOLONG;
+	if (size + 1 > limit - pos)
+		return -EIO;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_strcmp(sb, pos, str, size);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_strcmp(sb, pos, str, size);
+#endif
+	return -EIO;
+}
diff --git a/kernel/fs/romfs/super.c b/kernel/fs/romfs/super.c
new file mode 100644
index 000000000..268733cda
--- /dev/null
+++ b/kernel/fs/romfs/super.c
@@ -0,0 +1,659 @@
+/* Block- or MTD-based romfs
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * Derived from: ROMFS file system, Linux implementation
+ *
+ * Copyright © 1997-1999  Janos Farkas <chexum@shadow.banki.hu>
+ *
+ * Using parts of the minix filesystem
+ * Copyright © 1991, 1992  Linus Torvalds
+ *
+ * and parts of the affs filesystem additionally
+ * Copyright © 1993  Ray Burr
+ * Copyright © 1996  Hans-Joachim Widmaier
+ *
+ * Changes
+ *					Changed for 2.1.19 modules
+ *	Jan 1997			Initial release
+ *	Jun 1997			2.1.43+ changes
+ *					Proper page locking in readpage
+ *					Changed to work with 2.1.45+ fs
+ *	Jul 1997			Fixed follow_link
+ *			2.1.47
+ *					lookup shouldn't return -ENOENT
+ *					from Horst von Brand:
+ *					  fail on wrong checksum
+ *					  double unlock_super was possible
+ *					  correct namelen for statfs
+ *					spotted by Bill Hawes:
+ *					  readlink shouldn't iput()
+ *	Jun 1998	2.1.106		from Avery Pennarun: glibc scandir()
+ *					  exposed a problem in readdir
+ *			2.1.107		code-freeze spellchecker run
+ *	Aug 1998			2.1.118+ VFS changes
+ *	Sep 1998	2.1.122		another VFS change (follow_link)
+ *	Apr 1999	2.2.7		no more EBADF checking in
+ *					  lookup/readdir, use ERR_PTR
+ *	Jun 1999	2.3.6		d_alloc_root use changed
+ *			2.3.9		clean up usage of ENOENT/negative
+ *					  dentries in lookup
+ *					clean up page flags setting
+ *					  (error, uptodate, locking) in
+ *					  in readpage
+ *					use init_special_inode for
+ *					  fifos/sockets (and streamline) in
+ *					  read_inode, fix _ops table order
+ *	Aug 1999	2.3.16		__initfunc() => __init change
+ *	Oct 1999	2.3.24		page->owner hack obsoleted
+ *	Nov 1999	2.3.27		2.3.25+ page->offset => index change
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/mtd/super.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include "internal.h"
+
+static struct kmem_cache *romfs_inode_cachep;
+
+static const umode_t romfs_modemap[8] = {
+	0,			/* hard link */
+	S_IFDIR  | 0644,	/* directory */
+	S_IFREG  | 0644,	/* regular file */
+	S_IFLNK  | 0777,	/* symlink */
+	S_IFBLK  | 0600,	/* blockdev */
+	S_IFCHR  | 0600,	/* chardev */
+	S_IFSOCK | 0644,	/* socket */
+	S_IFIFO  | 0644		/* FIFO */
+};
+
+static const unsigned char romfs_dtype_table[] = {
+	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
+};
+
+static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
+
+/*
+ * read a page worth of data from the image
+ */
+static int romfs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t offset, size;
+	unsigned long fillsize, pos;
+	void *buf;
+	int ret;
+
+	buf = kmap(page);
+	if (!buf)
+		return -ENOMEM;
+
+	/* 32 bit warning -- but not for us :) */
+	offset = page_offset(page);
+	size = i_size_read(inode);
+	fillsize = 0;
+	ret = 0;
+	if (offset < size) {
+		size -= offset;
+		fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+		pos = ROMFS_I(inode)->i_dataoffset + offset;
+
+		ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
+		if (ret < 0) {
+			SetPageError(page);
+			fillsize = 0;
+			ret = -EIO;
+		}
+	}
+
+	if (fillsize < PAGE_SIZE)
+		memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
+	if (ret == 0)
+		SetPageUptodate(page);
+
+	flush_dcache_page(page);
+	kunmap(page);
+	unlock_page(page);
+	return ret;
+}
+
+static const struct address_space_operations romfs_aops = {
+	.readpage	= romfs_readpage
+};
+
+/*
+ * read the entries from a directory
+ */
+static int romfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *i = file_inode(file);
+	struct romfs_inode ri;
+	unsigned long offset, maxoff;
+	int j, ino, nextfh;
+	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
+	int ret;
+
+	maxoff = romfs_maxsize(i->i_sb);
+
+	offset = ctx->pos;
+	if (!offset) {
+		offset = i->i_ino & ROMFH_MASK;
+		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+		if (ret < 0)
+			goto out;
+		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+	}
+
+	/* Not really failsafe, but we are read-only... */
+	for (;;) {
+		if (!offset || offset >= maxoff) {
+			offset = maxoff;
+			ctx->pos = offset;
+			goto out;
+		}
+		ctx->pos = offset;
+
+		/* Fetch inode info */
+		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+		if (ret < 0)
+			goto out;
+
+		j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
+				      sizeof(fsname) - 1);
+		if (j < 0)
+			goto out;
+
+		ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
+		if (ret < 0)
+			goto out;
+		fsname[j] = '\0';
+
+		ino = offset;
+		nextfh = be32_to_cpu(ri.next);
+		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
+			ino = be32_to_cpu(ri.spec);
+		if (!dir_emit(ctx, fsname, j, ino,
+			    romfs_dtype_table[nextfh & ROMFH_TYPE]))
+			goto out;
+
+		offset = nextfh & ROMFH_MASK;
+	}
+out:
+	return 0;
+}
+
+/*
+ * look up an entry in a directory
+ */
+static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	unsigned long offset, maxoff;
+	struct inode *inode;
+	struct romfs_inode ri;
+	const char *name;		/* got from dentry */
+	int len, ret;
+
+	offset = dir->i_ino & ROMFH_MASK;
+	ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
+	if (ret < 0)
+		goto error;
+
+	/* search all the file entries in the list starting from the one
+	 * pointed to by the directory's special data */
+	maxoff = romfs_maxsize(dir->i_sb);
+	offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+
+	name = dentry->d_name.name;
+	len = dentry->d_name.len;
+
+	for (;;) {
+		if (!offset || offset >= maxoff)
+			goto out0;
+
+		ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
+		if (ret < 0)
+			goto error;
+
+		/* try to match the first 16 bytes of name */
+		ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name,
+				       len);
+		if (ret < 0)
+			goto error;
+		if (ret == 1)
+			break;
+
+		/* next entry */
+		offset = be32_to_cpu(ri.next) & ROMFH_MASK;
+	}
+
+	/* Hard link handling */
+	if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
+		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+
+	inode = romfs_iget(dir->i_sb, offset);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto error;
+	}
+	goto outi;
+
+	/*
+	 * it's a bit funky, _lookup needs to return an error code
+	 * (negative) or a NULL, both as a dentry.  ENOENT should not
+	 * be returned, instead we need to create a negative dentry by
+	 * d_add(dentry, NULL); and return 0 as no error.
+	 * (Although as I see, it only matters on writable file
+	 * systems).
+	 */
+out0:
+	inode = NULL;
+outi:
+	d_add(dentry, inode);
+	ret = 0;
+error:
+	return ERR_PTR(ret);
+}
+
+static const struct file_operations romfs_dir_operations = {
+	.read		= generic_read_dir,
+	.iterate	= romfs_readdir,
+	.llseek		= default_llseek,
+};
+
+static const struct inode_operations romfs_dir_inode_operations = {
+	.lookup		= romfs_lookup,
+};
+
+/*
+ * get a romfs inode based on its position in the image (which doubles as the
+ * inode number)
+ */
+static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
+{
+	struct romfs_inode_info *inode;
+	struct romfs_inode ri;
+	struct inode *i;
+	unsigned long nlen;
+	unsigned nextfh;
+	int ret;
+	umode_t mode;
+
+	/* we might have to traverse a chain of "hard link" file entries to get
+	 * to the actual file */
+	for (;;) {
+		ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
+		if (ret < 0)
+			goto error;
+
+		/* XXX: do romfs_checksum here too (with name) */
+
+		nextfh = be32_to_cpu(ri.next);
+		if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
+			break;
+
+		pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
+	}
+
+	/* determine the length of the filename */
+	nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
+	if (IS_ERR_VALUE(nlen))
+		goto eio;
+
+	/* get an inode for this image position */
+	i = iget_locked(sb, pos);
+	if (!i)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(i->i_state & I_NEW))
+		return i;
+
+	/* precalculate the data offset */
+	inode = ROMFS_I(i);
+	inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
+	inode->i_dataoffset = pos + inode->i_metasize;
+
+	set_nlink(i, 1);		/* Hard to decide.. */
+	i->i_size = be32_to_cpu(ri.size);
+	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
+	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
+
+	/* set up mode and ops */
+	mode = romfs_modemap[nextfh & ROMFH_TYPE];
+
+	switch (nextfh & ROMFH_TYPE) {
+	case ROMFH_DIR:
+		i->i_size = ROMFS_I(i)->i_metasize;
+		i->i_op = &romfs_dir_inode_operations;
+		i->i_fop = &romfs_dir_operations;
+		if (nextfh & ROMFH_EXEC)
+			mode |= S_IXUGO;
+		break;
+	case ROMFH_REG:
+		i->i_fop = &romfs_ro_fops;
+		i->i_data.a_ops = &romfs_aops;
+		if (nextfh & ROMFH_EXEC)
+			mode |= S_IXUGO;
+		break;
+	case ROMFH_SYM:
+		i->i_op = &page_symlink_inode_operations;
+		i->i_data.a_ops = &romfs_aops;
+		mode |= S_IRWXUGO;
+		break;
+	default:
+		/* depending on MBZ for sock/fifos */
+		nextfh = be32_to_cpu(ri.spec);
+		init_special_inode(i, mode, MKDEV(nextfh >> 16,
+						  nextfh & 0xffff));
+		break;
+	}
+
+	i->i_mode = mode;
+
+	unlock_new_inode(i);
+	return i;
+
+eio:
+	ret = -EIO;
+error:
+	pr_err("read error for inode 0x%lx\n", pos);
+	return ERR_PTR(ret);
+}
+
+/*
+ * allocate a new inode
+ */
+static struct inode *romfs_alloc_inode(struct super_block *sb)
+{
+	struct romfs_inode_info *inode;
+
+	inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
+	return inode ? &inode->vfs_inode : NULL;
+}
+
+/*
+ * return a spent inode to the slab cache
+ */
+static void romfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+
+	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
+}
+
+static void romfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, romfs_i_callback);
+}
+
+/*
+ * get filesystem statistics
+ */
+static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type = ROMFS_MAGIC;
+	buf->f_namelen = ROMFS_MAXFN;
+	buf->f_bsize = ROMBSIZE;
+	buf->f_bfree = buf->f_bavail = buf->f_ffree;
+	buf->f_blocks =
+		(romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	return 0;
+}
+
+/*
+ * remounting must involve read-only
+ */
+static int romfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+static const struct super_operations romfs_super_ops = {
+	.alloc_inode	= romfs_alloc_inode,
+	.destroy_inode	= romfs_destroy_inode,
+	.statfs		= romfs_statfs,
+	.remount_fs	= romfs_remount,
+};
+
+/*
+ * checksum check on part of a romfs filesystem
+ */
+static __u32 romfs_checksum(const void *data, int size)
+{
+	const __be32 *ptr = data;
+	__u32 sum;
+
+	sum = 0;
+	size >>= 2;
+	while (size > 0) {
+		sum += be32_to_cpu(*ptr++);
+		size--;
+	}
+	return sum;
+}
+
+/*
+ * fill in the superblock
+ */
+static int romfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct romfs_super_block *rsb;
+	struct inode *root;
+	unsigned long pos, img_size;
+	const char *storage;
+	size_t len;
+	int ret;
+
+#ifdef CONFIG_BLOCK
+	if (!sb->s_mtd) {
+		sb_set_blocksize(sb, ROMBSIZE);
+	} else {
+		sb->s_blocksize = ROMBSIZE;
+		sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
+	}
+#endif
+
+	sb->s_maxbytes = 0xFFFFFFFF;
+	sb->s_magic = ROMFS_MAGIC;
+	sb->s_flags |= MS_RDONLY | MS_NOATIME;
+	sb->s_op = &romfs_super_ops;
+
+	/* read the image superblock and check it */
+	rsb = kmalloc(512, GFP_KERNEL);
+	if (!rsb)
+		return -ENOMEM;
+
+	sb->s_fs_info = (void *) 512;
+	ret = romfs_dev_read(sb, 0, rsb, 512);
+	if (ret < 0)
+		goto error_rsb;
+
+	img_size = be32_to_cpu(rsb->size);
+
+	if (sb->s_mtd && img_size > sb->s_mtd->size)
+		goto error_rsb_inval;
+
+	sb->s_fs_info = (void *) img_size;
+
+	if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
+	    img_size < ROMFH_SIZE) {
+		if (!silent)
+			pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n",
+			       sb->s_id);
+		goto error_rsb_inval;
+	}
+
+	if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
+		pr_err("bad initial checksum on dev %s.\n", sb->s_id);
+		goto error_rsb_inval;
+	}
+
+	storage = sb->s_mtd ? "MTD" : "the block layer";
+
+	len = strnlen(rsb->name, ROMFS_MAXFN);
+	if (!silent)
+		pr_notice("Mounting image '%*.*s' through %s\n",
+			  (unsigned) len, (unsigned) len, rsb->name, storage);
+
+	kfree(rsb);
+	rsb = NULL;
+
+	/* find the root directory */
+	pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
+
+	root = romfs_iget(sb, pos);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root)
+		return -ENOMEM;
+
+	return 0;
+
+error_rsb_inval:
+	ret = -EINVAL;
+error_rsb:
+	kfree(rsb);
+	return ret;
+}
+
+/*
+ * get a superblock for mounting
+ */
+static struct dentry *romfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data)
+{
+	struct dentry *ret = ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (ret == ERR_PTR(-EINVAL))
+		ret = mount_bdev(fs_type, flags, dev_name, data,
+				  romfs_fill_super);
+#endif
+	return ret;
+}
+
+/*
+ * destroy a romfs superblock in the appropriate manner
+ */
+static void romfs_kill_sb(struct super_block *sb)
+{
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd) {
+		kill_mtd_super(sb);
+		return;
+	}
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev) {
+		kill_block_super(sb);
+		return;
+	}
+#endif
+}
+
+static struct file_system_type romfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "romfs",
+	.mount		= romfs_mount,
+	.kill_sb	= romfs_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("romfs");
+
+/*
+ * inode storage initialiser
+ */
+static void romfs_i_init_once(void *_inode)
+{
+	struct romfs_inode_info *inode = _inode;
+
+	inode_init_once(&inode->vfs_inode);
+}
+
+/*
+ * romfs module initialisation
+ */
+static int __init init_romfs_fs(void)
+{
+	int ret;
+
+	pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n");
+
+	romfs_inode_cachep =
+		kmem_cache_create("romfs_i",
+				  sizeof(struct romfs_inode_info), 0,
+				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				  romfs_i_init_once);
+
+	if (!romfs_inode_cachep) {
+		pr_err("Failed to initialise inode cache\n");
+		return -ENOMEM;
+	}
+	ret = register_filesystem(&romfs_fs_type);
+	if (ret) {
+		pr_err("Failed to register filesystem\n");
+		goto error_register;
+	}
+	return 0;
+
+error_register:
+	kmem_cache_destroy(romfs_inode_cachep);
+	return ret;
+}
+
+/*
+ * romfs module removal
+ */
+static void __exit exit_romfs_fs(void)
+{
+	unregister_filesystem(&romfs_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(romfs_inode_cachep);
+}
+
+module_init(init_romfs_fs);
+module_exit(exit_romfs_fs);
+
+MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/kernel/fs/select.c b/kernel/fs/select.c
new file mode 100644
index 000000000..f684c750e
--- /dev/null
+++ b/kernel/fs/select.c
@@ -0,0 +1,1040 @@
+/*
+ * This file contains the procedures for the handling of select and poll
+ *
+ * Created for Linux based loosely upon Mathius Lattner's minix
+ * patches by Peter MacDonald. Heavily edited by Linus.
+ *
+ *  4 February 1994
+ *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
+ *     flag set in its personality we do *not* modify the given timeout
+ *     parameter to reflect time remaining.
+ *
+ *  24 January 2000
+ *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
+ *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/personality.h> /* for STICKY_TIMEOUTS */
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/fs.h>
+#include <linux/rcupdate.h>
+#include <linux/hrtimer.h>
+#include <linux/sched/rt.h>
+#include <linux/freezer.h>
+#include <net/busy_poll.h>
+
+#include <asm/uaccess.h>
+
+
+/*
+ * Estimate expected accuracy in ns from a timeval.
+ *
+ * After quite a bit of churning around, we've settled on
+ * a simple thing of taking 0.1% of the timeout as the
+ * slack, with a cap of 100 msec.
+ * "nice" tasks get a 0.5% slack instead.
+ *
+ * Consider this comment an open invitation to come up with even
+ * better solutions..
+ */
+
+#define MAX_SLACK	(100 * NSEC_PER_MSEC)
+
+static long __estimate_accuracy(struct timespec *tv)
+{
+	long slack;
+	int divfactor = 1000;
+
+	if (tv->tv_sec < 0)
+		return 0;
+
+	if (task_nice(current) > 0)
+		divfactor = divfactor / 5;
+
+	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
+		return MAX_SLACK;
+
+	slack = tv->tv_nsec / divfactor;
+	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
+
+	if (slack > MAX_SLACK)
+		return MAX_SLACK;
+
+	return slack;
+}
+
+long select_estimate_accuracy(struct timespec *tv)
+{
+	unsigned long ret;
+	struct timespec now;
+
+	/*
+	 * Realtime tasks get a slack of 0 for obvious reasons.
+	 */
+
+	if (rt_task(current))
+		return 0;
+
+	ktime_get_ts(&now);
+	now = timespec_sub(*tv, now);
+	ret = __estimate_accuracy(&now);
+	if (ret < current->timer_slack_ns)
+		return current->timer_slack_ns;
+	return ret;
+}
+
+
+
+struct poll_table_page {
+	struct poll_table_page * next;
+	struct poll_table_entry * entry;
+	struct poll_table_entry entries[0];
+};
+
+#define POLL_TABLE_FULL(table) \
+	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+
+/*
+ * Ok, Peter made a complicated, but straightforward multiple_wait() function.
+ * I have rewritten this, taking some shortcuts: This code may not be easy to
+ * follow, but it should be free of race-conditions, and it's practical. If you
+ * understand what I'm doing here, then you understand how the linux
+ * sleep/wakeup mechanism works.
+ *
+ * Two very simple procedures, poll_wait() and poll_freewait() make all the
+ * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
+ * as all select/poll functions have to call it to add an entry to the
+ * poll table.
+ */
+static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
+		       poll_table *p);
+
+void poll_initwait(struct poll_wqueues *pwq)
+{
+	init_poll_funcptr(&pwq->pt, __pollwait);
+	pwq->polling_task = current;
+	pwq->triggered = 0;
+	pwq->error = 0;
+	pwq->table = NULL;
+	pwq->inline_index = 0;
+}
+EXPORT_SYMBOL(poll_initwait);
+
+static void free_poll_entry(struct poll_table_entry *entry)
+{
+	remove_wait_queue(entry->wait_address, &entry->wait);
+	fput(entry->filp);
+}
+
+void poll_freewait(struct poll_wqueues *pwq)
+{
+	struct poll_table_page * p = pwq->table;
+	int i;
+	for (i = 0; i < pwq->inline_index; i++)
+		free_poll_entry(pwq->inline_entries + i);
+	while (p) {
+		struct poll_table_entry * entry;
+		struct poll_table_page *old;
+
+		entry = p->entry;
+		do {
+			entry--;
+			free_poll_entry(entry);
+		} while (entry > p->entries);
+		old = p;
+		p = p->next;
+		free_page((unsigned long) old);
+	}
+}
+EXPORT_SYMBOL(poll_freewait);
+
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
+{
+	struct poll_table_page *table = p->table;
+
+	if (p->inline_index < N_INLINE_POLL_ENTRIES)
+		return p->inline_entries + p->inline_index++;
+
+	if (!table || POLL_TABLE_FULL(table)) {
+		struct poll_table_page *new_table;
+
+		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
+		if (!new_table) {
+			p->error = -ENOMEM;
+			return NULL;
+		}
+		new_table->entry = new_table->entries;
+		new_table->next = table;
+		p->table = new_table;
+		table = new_table;
+	}
+
+	return table->entry++;
+}
+
+static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_wqueues *pwq = wait->private;
+	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+	/*
+	 * Although this function is called under waitqueue lock, LOCK
+	 * doesn't imply write barrier and the users expect write
+	 * barrier semantics on wakeup functions.  The following
+	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+	 * and is paired with set_mb() in poll_schedule_timeout.
+	 */
+	smp_wmb();
+	pwq->triggered = 1;
+
+	/*
+	 * Perform the default wake up operation using a dummy
+	 * waitqueue.
+	 *
+	 * TODO: This is hacky but there currently is no interface to
+	 * pass in @sync.  @sync is scheduled to be removed and once
+	 * that happens, wake_up_process() can be used directly.
+	 */
+	return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_table_entry *entry;
+
+	entry = container_of(wait, struct poll_table_entry, wait);
+	if (key && !((unsigned long)key & entry->key))
+		return 0;
+	return __pollwake(wait, mode, sync, key);
+}
+
+/* Add a new entry */
+static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
+				poll_table *p)
+{
+	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+	struct poll_table_entry *entry = poll_get_entry(pwq);
+	if (!entry)
+		return;
+	entry->filp = get_file(filp);
+	entry->wait_address = wait_address;
+	entry->key = p->_key;
+	init_waitqueue_func_entry(&entry->wait, pollwake);
+	entry->wait.private = pwq;
+	add_wait_queue(wait_address, &entry->wait);
+}
+
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+			  ktime_t *expires, unsigned long slack)
+{
+	int rc = -EINTR;
+
+	set_current_state(state);
+	if (!pwq->triggered)
+		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+	__set_current_state(TASK_RUNNING);
+
+	/*
+	 * Prepare for the next iteration.
+	 *
+	 * The following set_mb() serves two purposes.  First, it's
+	 * the counterpart rmb of the wmb in pollwake() such that data
+	 * written before wake up is always visible after wake up.
+	 * Second, the full barrier guarantees that triggered clearing
+	 * doesn't pass event check of the next iteration.  Note that
+	 * this problem doesn't exist for the first iteration as
+	 * add_wait_queue() has full barrier semantics.
+	 */
+	set_mb(pwq->triggered, 0);
+
+	return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
+/**
+ * poll_select_set_timeout - helper function to setup the timeout value
+ * @to:		pointer to timespec variable for the final timeout
+ * @sec:	seconds (from user space)
+ * @nsec:	nanoseconds (from user space)
+ *
+ * Note, we do not use a timespec for the user space value here, That
+ * way we can use the function for timeval and compat interfaces as well.
+ *
+ * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
+ */
+int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+{
+	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+
+	if (!timespec_valid(&ts))
+		return -EINVAL;
+
+	/* Optimize for the zero timeout value here */
+	if (!sec && !nsec) {
+		to->tv_sec = to->tv_nsec = 0;
+	} else {
+		ktime_get_ts(to);
+		*to = timespec_add_safe(*to, ts);
+	}
+	return 0;
+}
+
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec rts;
+	struct timeval rtv;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&rts);
+	rts = timespec_sub(*end_time, rts);
+	if (rts.tv_sec < 0)
+		rts.tv_sec = rts.tv_nsec = 0;
+
+	if (timeval) {
+		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
+			memset(&rtv, 0, sizeof(rtv));
+		rtv.tv_sec = rts.tv_sec;
+		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+
+	} else if (!copy_to_user(p, &rts, sizeof(rts)))
+		return ret;
+
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
+#define FDS_IN(fds, n)		(fds->in + n)
+#define FDS_OUT(fds, n)		(fds->out + n)
+#define FDS_EX(fds, n)		(fds->ex + n)
+
+#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))
+
+static int max_select_fd(unsigned long n, fd_set_bits *fds)
+{
+	unsigned long *open_fds;
+	unsigned long set;
+	int max;
+	struct fdtable *fdt;
+
+	/* handle last in-complete long-word first */
+	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
+	n /= BITS_PER_LONG;
+	fdt = files_fdtable(current->files);
+	open_fds = fdt->open_fds + n;
+	max = 0;
+	if (set) {
+		set &= BITS(fds, n);
+		if (set) {
+			if (!(set & ~*open_fds))
+				goto get_max;
+			return -EBADF;
+		}
+	}
+	while (n) {
+		open_fds--;
+		n--;
+		set = BITS(fds, n);
+		if (!set)
+			continue;
+		if (set & ~*open_fds)
+			return -EBADF;
+		if (max)
+			continue;
+get_max:
+		do {
+			max++;
+			set >>= 1;
+		} while (set);
+		max += n * BITS_PER_LONG;
+	}
+
+	return max;
+}
+
+#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
+#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
+#define POLLEX_SET (POLLPRI)
+
+static inline void wait_key_set(poll_table *wait, unsigned long in,
+				unsigned long out, unsigned long bit,
+				unsigned int ll_flag)
+{
+	wait->_key = POLLEX_SET | ll_flag;
+	if (in & bit)
+		wait->_key |= POLLIN_SET;
+	if (out & bit)
+		wait->_key |= POLLOUT_SET;
+}
+
+int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
+{
+	ktime_t expire, *to = NULL;
+	struct poll_wqueues table;
+	poll_table *wait;
+	int retval, i, timed_out = 0;
+	unsigned long slack = 0;
+	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+	unsigned long busy_end = 0;
+
+	rcu_read_lock();
+	retval = max_select_fd(n, fds);
+	rcu_read_unlock();
+
+	if (retval < 0)
+		return retval;
+	n = retval;
+
+	poll_initwait(&table);
+	wait = &table.pt;
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
+		wait->_qproc = NULL;
+		timed_out = 1;
+	}
+
+	if (end_time && !timed_out)
+		slack = select_estimate_accuracy(end_time);
+
+	retval = 0;
+	for (;;) {
+		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+		bool can_busy_loop = false;
+
+		inp = fds->in; outp = fds->out; exp = fds->ex;
+		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
+
+		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
+			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
+			unsigned long res_in = 0, res_out = 0, res_ex = 0;
+
+			in = *inp++; out = *outp++; ex = *exp++;
+			all_bits = in | out | ex;
+			if (all_bits == 0) {
+				i += BITS_PER_LONG;
+				continue;
+			}
+
+			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
+				struct fd f;
+				if (i >= n)
+					break;
+				if (!(bit & all_bits))
+					continue;
+				f = fdget(i);
+				if (f.file) {
+					const struct file_operations *f_op;
+					f_op = f.file->f_op;
+					mask = DEFAULT_POLLMASK;
+					if (f_op->poll) {
+						wait_key_set(wait, in, out,
+							     bit, busy_flag);
+						mask = (*f_op->poll)(f.file, wait);
+					}
+					fdput(f);
+					if ((mask & POLLIN_SET) && (in & bit)) {
+						res_in |= bit;
+						retval++;
+						wait->_qproc = NULL;
+					}
+					if ((mask & POLLOUT_SET) && (out & bit)) {
+						res_out |= bit;
+						retval++;
+						wait->_qproc = NULL;
+					}
+					if ((mask & POLLEX_SET) && (ex & bit)) {
+						res_ex |= bit;
+						retval++;
+						wait->_qproc = NULL;
+					}
+					/* got something, stop busy polling */
+					if (retval) {
+						can_busy_loop = false;
+						busy_flag = 0;
+
+					/*
+					 * only remember a returned
+					 * POLL_BUSY_LOOP if we asked for it
+					 */
+					} else if (busy_flag & mask)
+						can_busy_loop = true;
+
+				}
+			}
+			if (res_in)
+				*rinp = res_in;
+			if (res_out)
+				*routp = res_out;
+			if (res_ex)
+				*rexp = res_ex;
+			cond_resched();
+		}
+		wait->_qproc = NULL;
+		if (retval || timed_out || signal_pending(current))
+			break;
+		if (table.error) {
+			retval = table.error;
+			break;
+		}
+
+		/* only if found POLL_BUSY_LOOP sockets && not out of time */
+		if (can_busy_loop && !need_resched()) {
+			if (!busy_end) {
+				busy_end = busy_loop_end_time();
+				continue;
+			}
+			if (!busy_loop_timeout(busy_end))
+				continue;
+		}
+		busy_flag = 0;
+
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
+		}
+
+		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+					   to, slack))
+			timed_out = 1;
+	}
+
+	poll_freewait(&table);
+
+	return retval;
+}
+
+/*
+ * We can actually return ERESTARTSYS instead of EINTR, but I'd
+ * like to be certain this leads to no problems. So I return
+ * EINTR just for safety.
+ *
+ * Update: ERESTARTSYS breaks at least the xview clock binary, so
+ * I'm trying ERESTARTNOHAND which restart only when you want to.
+ */
+int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+			   fd_set __user *exp, struct timespec *end_time)
+{
+	fd_set_bits fds;
+	void *bits;
+	int ret, max_fds;
+	unsigned int size;
+	struct fdtable *fdt;
+	/* Allocate small arguments on the stack to save memory and be faster */
+	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
+
+	ret = -EINVAL;
+	if (n < 0)
+		goto out_nofds;
+
+	/* max_fds can increase, so grab it once to avoid race */
+	rcu_read_lock();
+	fdt = files_fdtable(current->files);
+	max_fds = fdt->max_fds;
+	rcu_read_unlock();
+	if (n > max_fds)
+		n = max_fds;
+
+	/*
+	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
+	 * since we used fdset we need to allocate memory in units of
+	 * long-words. 
+	 */
+	size = FDS_BYTES(n);
+	bits = stack_fds;
+	if (size > sizeof(stack_fds) / 6) {
+		/* Not enough space in on-stack array; must use kmalloc */
+		ret = -ENOMEM;
+		bits = kmalloc(6 * size, GFP_KERNEL);
+		if (!bits)
+			goto out_nofds;
+	}
+	fds.in      = bits;
+	fds.out     = bits +   size;
+	fds.ex      = bits + 2*size;
+	fds.res_in  = bits + 3*size;
+	fds.res_out = bits + 4*size;
+	fds.res_ex  = bits + 5*size;
+
+	if ((ret = get_fd_set(n, inp, fds.in)) ||
+	    (ret = get_fd_set(n, outp, fds.out)) ||
+	    (ret = get_fd_set(n, exp, fds.ex)))
+		goto out;
+	zero_fd_set(n, fds.res_in);
+	zero_fd_set(n, fds.res_out);
+	zero_fd_set(n, fds.res_ex);
+
+	ret = do_select(n, &fds, end_time);
+
+	if (ret < 0)
+		goto out;
+	if (!ret) {
+		ret = -ERESTARTNOHAND;
+		if (signal_pending(current))
+			goto out;
+		ret = 0;
+	}
+
+	if (set_fd_set(n, inp, fds.res_in) ||
+	    set_fd_set(n, outp, fds.res_out) ||
+	    set_fd_set(n, exp, fds.res_ex))
+		ret = -EFAULT;
+
+out:
+	if (bits != stack_fds)
+		kfree(bits);
+out_nofds:
+	return ret;
+}
+
+SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
+		fd_set __user *, exp, struct timeval __user *, tvp)
+{
+	struct timespec end_time, *to = NULL;
+	struct timeval tv;
+	int ret;
+
+	if (tvp) {
+		if (copy_from_user(&tv, tvp, sizeof(tv)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to,
+				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
+			return -EINVAL;
+	}
+
+	ret = core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
+
+	return ret;
+}
+
+static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
+		       fd_set __user *exp, struct timespec __user *tsp,
+		       const sigset_t __user *sigmask, size_t sigsetsize)
+{
+	sigset_t ksigmask, sigsaved;
+	struct timespec ts, end_time, *to = NULL;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
+	}
+
+	if (sigmask) {
+		/* XXX: Don't preclude handling different sized sigset_t's.  */
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+	if (ret == -ERESTARTNOHAND) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_restore_sigmask();
+		}
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return ret;
+}
+
+/*
+ * Most architectures can't handle 7-argument syscalls. So we provide a
+ * 6-argument version where the sixth argument is a pointer to a structure
+ * which has a pointer to the sigset_t itself followed by a size_t containing
+ * the sigset size.
+ */
+SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
+		fd_set __user *, exp, struct timespec __user *, tsp,
+		void __user *, sig)
+{
+	size_t sigsetsize = 0;
+	sigset_t __user *up = NULL;
+
+	if (sig) {
+		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
+		    || __get_user(up, (sigset_t __user * __user *)sig)
+		    || __get_user(sigsetsize,
+				(size_t __user *)(sig+sizeof(void *))))
+			return -EFAULT;
+	}
+
+	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_SELECT
+struct sel_arg_struct {
+	unsigned long n;
+	fd_set __user *inp, *outp, *exp;
+	struct timeval __user *tvp;
+};
+
+SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
+{
+	struct sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+#endif
+
+struct poll_list {
+	struct poll_list *next;
+	int len;
+	struct pollfd entries[0];
+};
+
+#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
+
+/*
+ * Fish for pollable events on the pollfd->fd file descriptor. We're only
+ * interested in events matching the pollfd->events mask, and the result
+ * matching that mask is both recorded in pollfd->revents and returned. The
+ * pwait poll_table will be used by the fd-provided poll handler for waiting,
+ * if pwait->_qproc is non-NULL.
+ */
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+				     bool *can_busy_poll,
+				     unsigned int busy_flag)
+{
+	unsigned int mask;
+	int fd;
+
+	mask = 0;
+	fd = pollfd->fd;
+	if (fd >= 0) {
+		struct fd f = fdget(fd);
+		mask = POLLNVAL;
+		if (f.file) {
+			mask = DEFAULT_POLLMASK;
+			if (f.file->f_op->poll) {
+				pwait->_key = pollfd->events|POLLERR|POLLHUP;
+				pwait->_key |= busy_flag;
+				mask = f.file->f_op->poll(f.file, pwait);
+				if (mask & busy_flag)
+					*can_busy_poll = true;
+			}
+			/* Mask out unneeded events. */
+			mask &= pollfd->events | POLLERR | POLLHUP;
+			fdput(f);
+		}
+	}
+	pollfd->revents = mask;
+
+	return mask;
+}
+
+static int do_poll(unsigned int nfds,  struct poll_list *list,
+		   struct poll_wqueues *wait, struct timespec *end_time)
+{
+	poll_table* pt = &wait->pt;
+	ktime_t expire, *to = NULL;
+	int timed_out = 0, count = 0;
+	unsigned long slack = 0;
+	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+	unsigned long busy_end = 0;
+
+	/* Optimise the no-wait case */
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
+		pt->_qproc = NULL;
+		timed_out = 1;
+	}
+
+	if (end_time && !timed_out)
+		slack = select_estimate_accuracy(end_time);
+
+	for (;;) {
+		struct poll_list *walk;
+		bool can_busy_loop = false;
+
+		for (walk = list; walk != NULL; walk = walk->next) {
+			struct pollfd * pfd, * pfd_end;
+
+			pfd = walk->entries;
+			pfd_end = pfd + walk->len;
+			for (; pfd != pfd_end; pfd++) {
+				/*
+				 * Fish for events. If we found one, record it
+				 * and kill poll_table->_qproc, so we don't
+				 * needlessly register any other waiters after
+				 * this. They'll get immediately deregistered
+				 * when we break out and return.
+				 */
+				if (do_pollfd(pfd, pt, &can_busy_loop,
+					      busy_flag)) {
+					count++;
+					pt->_qproc = NULL;
+					/* found something, stop busy polling */
+					busy_flag = 0;
+					can_busy_loop = false;
+				}
+			}
+		}
+		/*
+		 * All waiters have already been registered, so don't provide
+		 * a poll_table->_qproc to them on the next loop iteration.
+		 */
+		pt->_qproc = NULL;
+		if (!count) {
+			count = wait->error;
+			if (signal_pending(current))
+				count = -EINTR;
+		}
+		if (count || timed_out)
+			break;
+
+		/* only if found POLL_BUSY_LOOP sockets && not out of time */
+		if (can_busy_loop && !need_resched()) {
+			if (!busy_end) {
+				busy_end = busy_loop_end_time();
+				continue;
+			}
+			if (!busy_loop_timeout(busy_end))
+				continue;
+		}
+		busy_flag = 0;
+
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
+		}
+
+		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
+			timed_out = 1;
+	}
+	return count;
+}
+
+#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
+			sizeof(struct pollfd))
+
+int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+		struct timespec *end_time)
+{
+	struct poll_wqueues table;
+ 	int err = -EFAULT, fdcount, len, size;
+	/* Allocate small arguments on the stack to save memory and be
+	   faster - use long to make sure the buffer is aligned properly
+	   on 64 bit archs to avoid unaligned access */
+	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
+	struct poll_list *const head = (struct poll_list *)stack_pps;
+ 	struct poll_list *walk = head;
+ 	unsigned long todo = nfds;
+
+	if (nfds > rlimit(RLIMIT_NOFILE))
+		return -EINVAL;
+
+	len = min_t(unsigned int, nfds, N_STACK_PPS);
+	for (;;) {
+		walk->next = NULL;
+		walk->len = len;
+		if (!len)
+			break;
+
+		if (copy_from_user(walk->entries, ufds + nfds-todo,
+					sizeof(struct pollfd) * walk->len))
+			goto out_fds;
+
+		todo -= walk->len;
+		if (!todo)
+			break;
+
+		len = min(todo, POLLFD_PER_PAGE);
+		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
+		walk = walk->next = kmalloc(size, GFP_KERNEL);
+		if (!walk) {
+			err = -ENOMEM;
+			goto out_fds;
+		}
+	}
+
+	poll_initwait(&table);
+	fdcount = do_poll(nfds, head, &table, end_time);
+	poll_freewait(&table);
+
+	for (walk = head; walk; walk = walk->next) {
+		struct pollfd *fds = walk->entries;
+		int j;
+
+		for (j = 0; j < walk->len; j++, ufds++)
+			if (__put_user(fds[j].revents, &ufds->revents))
+				goto out_fds;
+  	}
+
+	err = fdcount;
+out_fds:
+	walk = head->next;
+	while (walk) {
+		struct poll_list *pos = walk;
+		walk = walk->next;
+		kfree(pos);
+	}
+
+	return err;
+}
+
+static long do_restart_poll(struct restart_block *restart_block)
+{
+	struct pollfd __user *ufds = restart_block->poll.ufds;
+	int nfds = restart_block->poll.nfds;
+	struct timespec *to = NULL, end_time;
+	int ret;
+
+	if (restart_block->poll.has_timeout) {
+		end_time.tv_sec = restart_block->poll.tv_sec;
+		end_time.tv_nsec = restart_block->poll.tv_nsec;
+		to = &end_time;
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
+	if (ret == -EINTR) {
+		restart_block->fn = do_restart_poll;
+		ret = -ERESTART_RESTARTBLOCK;
+	}
+	return ret;
+}
+
+SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
+		int, timeout_msecs)
+{
+	struct timespec end_time, *to = NULL;
+	int ret;
+
+	if (timeout_msecs >= 0) {
+		to = &end_time;
+		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
+			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
+	if (ret == -EINTR) {
+		struct restart_block *restart_block;
+
+		restart_block = &current->restart_block;
+		restart_block->fn = do_restart_poll;
+		restart_block->poll.ufds = ufds;
+		restart_block->poll.nfds = nfds;
+
+		if (timeout_msecs >= 0) {
+			restart_block->poll.tv_sec = end_time.tv_sec;
+			restart_block->poll.tv_nsec = end_time.tv_nsec;
+			restart_block->poll.has_timeout = 1;
+		} else
+			restart_block->poll.has_timeout = 0;
+
+		ret = -ERESTART_RESTARTBLOCK;
+	}
+	return ret;
+}
+
+SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
+		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
+{
+	sigset_t ksigmask, sigsaved;
+	struct timespec ts, end_time, *to = NULL;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
+	}
+
+	if (sigmask) {
+		/* XXX: Don't preclude handling different sized sigset_t's.  */
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
+	/* We can restart this syscall, usually */
+	if (ret == -EINTR) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_restore_sigmask();
+		}
+		ret = -ERESTARTNOHAND;
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+	return ret;
+}
diff --git a/kernel/fs/seq_file.c b/kernel/fs/seq_file.c
new file mode 100644
index 000000000..555f82155
--- /dev/null
+++ b/kernel/fs/seq_file.c
@@ -0,0 +1,968 @@
+/*
+ * linux/fs/seq_file.c
+ *
+ * helper functions for making synthetic files from sequences of records.
+ * initial implementation -- AV, Oct 2001.
+ */
+
+#include <linux/fs.h>
+#include <linux/export.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/mm.h>
+
+#include <asm/uaccess.h>
+#include <asm/page.h>
+
+static void seq_set_overflow(struct seq_file *m)
+{
+	m->count = m->size;
+}
+
+static void *seq_buf_alloc(unsigned long size)
+{
+	void *buf;
+
+	/*
+	 * __GFP_NORETRY to avoid oom-killings with high-order allocations -
+	 * it's better to fall back to vmalloc() than to kill things.
+	 */
+	buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+	if (!buf && size > PAGE_SIZE)
+		buf = vmalloc(size);
+	return buf;
+}
+
+/**
+ *	seq_open -	initialize sequential file
+ *	@file: file we initialize
+ *	@op: method table describing the sequence
+ *
+ *	seq_open() sets @file, associating it with a sequence described
+ *	by @op.  @op->start() sets the iterator up and returns the first
+ *	element of sequence. @op->stop() shuts it down.  @op->next()
+ *	returns the next element of sequence.  @op->show() prints element
+ *	into the buffer.  In case of error ->start() and ->next() return
+ *	ERR_PTR(error).  In the end of sequence they return %NULL. ->show()
+ *	returns 0 in case of success and negative number in case of error.
+ *	Returning SEQ_SKIP means "discard this element and move on".
+ */
+int seq_open(struct file *file, const struct seq_operations *op)
+{
+	struct seq_file *p = file->private_data;
+
+	if (!p) {
+		p = kmalloc(sizeof(*p), GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+		file->private_data = p;
+	}
+	memset(p, 0, sizeof(*p));
+	mutex_init(&p->lock);
+	p->op = op;
+#ifdef CONFIG_USER_NS
+	p->user_ns = file->f_cred->user_ns;
+#endif
+
+	/*
+	 * Wrappers around seq_open(e.g. swaps_open) need to be
+	 * aware of this. If they set f_version themselves, they
+	 * should call seq_open first and then set f_version.
+	 */
+	file->f_version = 0;
+
+	/*
+	 * seq_files support lseek() and pread().  They do not implement
+	 * write() at all, but we clear FMODE_PWRITE here for historical
+	 * reasons.
+	 *
+	 * If a client of seq_files a) implements file.write() and b) wishes to
+	 * support pwrite() then that client will need to implement its own
+	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
+	 */
+	file->f_mode &= ~FMODE_PWRITE;
+	return 0;
+}
+EXPORT_SYMBOL(seq_open);
+
+static int traverse(struct seq_file *m, loff_t offset)
+{
+	loff_t pos = 0, index;
+	int error = 0;
+	void *p;
+
+	m->version = 0;
+	index = 0;
+	m->count = m->from = 0;
+	if (!offset) {
+		m->index = index;
+		return 0;
+	}
+	if (!m->buf) {
+		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
+		if (!m->buf)
+			return -ENOMEM;
+	}
+	p = m->op->start(m, &index);
+	while (p) {
+		error = PTR_ERR(p);
+		if (IS_ERR(p))
+			break;
+		error = m->op->show(m, p);
+		if (error < 0)
+			break;
+		if (unlikely(error)) {
+			error = 0;
+			m->count = 0;
+		}
+		if (seq_has_overflowed(m))
+			goto Eoverflow;
+		if (pos + m->count > offset) {
+			m->from = offset - pos;
+			m->count -= m->from;
+			m->index = index;
+			break;
+		}
+		pos += m->count;
+		m->count = 0;
+		if (pos == offset) {
+			index++;
+			m->index = index;
+			break;
+		}
+		p = m->op->next(m, p, &index);
+	}
+	m->op->stop(m, p);
+	m->index = index;
+	return error;
+
+Eoverflow:
+	m->op->stop(m, p);
+	kvfree(m->buf);
+	m->count = 0;
+	m->buf = seq_buf_alloc(m->size <<= 1);
+	return !m->buf ? -ENOMEM : -EAGAIN;
+}
+
+/**
+ *	seq_read -	->read() method for sequential files.
+ *	@file: the file to read from
+ *	@buf: the buffer to read to
+ *	@size: the maximum number of bytes to read
+ *	@ppos: the current position in the file
+ *
+ *	Ready-made ->f_op->read()
+ */
+ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+{
+	struct seq_file *m = file->private_data;
+	size_t copied = 0;
+	loff_t pos;
+	size_t n;
+	void *p;
+	int err = 0;
+
+	mutex_lock(&m->lock);
+
+	/*
+	 * seq_file->op->..m_start/m_stop/m_next may do special actions
+	 * or optimisations based on the file->f_version, so we want to
+	 * pass the file->f_version to those methods.
+	 *
+	 * seq_file->version is just copy of f_version, and seq_file
+	 * methods can treat it simply as file version.
+	 * It is copied in first and copied out after all operations.
+	 * It is convenient to have it as  part of structure to avoid the
+	 * need of passing another argument to all the seq_file methods.
+	 */
+	m->version = file->f_version;
+
+	/* Don't assume *ppos is where we left it */
+	if (unlikely(*ppos != m->read_pos)) {
+		while ((err = traverse(m, *ppos)) == -EAGAIN)
+			;
+		if (err) {
+			/* With prejudice... */
+			m->read_pos = 0;
+			m->version = 0;
+			m->index = 0;
+			m->count = 0;
+			goto Done;
+		} else {
+			m->read_pos = *ppos;
+		}
+	}
+
+	/* grab buffer if we didn't have one */
+	if (!m->buf) {
+		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
+		if (!m->buf)
+			goto Enomem;
+	}
+	/* if not empty - flush it first */
+	if (m->count) {
+		n = min(m->count, size);
+		err = copy_to_user(buf, m->buf + m->from, n);
+		if (err)
+			goto Efault;
+		m->count -= n;
+		m->from += n;
+		size -= n;
+		buf += n;
+		copied += n;
+		if (!m->count)
+			m->index++;
+		if (!size)
+			goto Done;
+	}
+	/* we need at least one record in buffer */
+	pos = m->index;
+	p = m->op->start(m, &pos);
+	while (1) {
+		err = PTR_ERR(p);
+		if (!p || IS_ERR(p))
+			break;
+		err = m->op->show(m, p);
+		if (err < 0)
+			break;
+		if (unlikely(err))
+			m->count = 0;
+		if (unlikely(!m->count)) {
+			p = m->op->next(m, p, &pos);
+			m->index = pos;
+			continue;
+		}
+		if (m->count < m->size)
+			goto Fill;
+		m->op->stop(m, p);
+		kvfree(m->buf);
+		m->count = 0;
+		m->buf = seq_buf_alloc(m->size <<= 1);
+		if (!m->buf)
+			goto Enomem;
+		m->version = 0;
+		pos = m->index;
+		p = m->op->start(m, &pos);
+	}
+	m->op->stop(m, p);
+	m->count = 0;
+	goto Done;
+Fill:
+	/* they want more? let's try to get some more */
+	while (m->count < size) {
+		size_t offs = m->count;
+		loff_t next = pos;
+		p = m->op->next(m, p, &next);
+		if (!p || IS_ERR(p)) {
+			err = PTR_ERR(p);
+			break;
+		}
+		err = m->op->show(m, p);
+		if (seq_has_overflowed(m) || err) {
+			m->count = offs;
+			if (likely(err <= 0))
+				break;
+		}
+		pos = next;
+	}
+	m->op->stop(m, p);
+	n = min(m->count, size);
+	err = copy_to_user(buf, m->buf, n);
+	if (err)
+		goto Efault;
+	copied += n;
+	m->count -= n;
+	if (m->count)
+		m->from = n;
+	else
+		pos++;
+	m->index = pos;
+Done:
+	if (!copied)
+		copied = err;
+	else {
+		*ppos += copied;
+		m->read_pos += copied;
+	}
+	file->f_version = m->version;
+	mutex_unlock(&m->lock);
+	return copied;
+Enomem:
+	err = -ENOMEM;
+	goto Done;
+Efault:
+	err = -EFAULT;
+	goto Done;
+}
+EXPORT_SYMBOL(seq_read);
+
+/**
+ *	seq_lseek -	->llseek() method for sequential files.
+ *	@file: the file in question
+ *	@offset: new position
+ *	@whence: 0 for absolute, 1 for relative position
+ *
+ *	Ready-made ->f_op->llseek()
+ */
+loff_t seq_lseek(struct file *file, loff_t offset, int whence)
+{
+	struct seq_file *m = file->private_data;
+	loff_t retval = -EINVAL;
+
+	mutex_lock(&m->lock);
+	m->version = file->f_version;
+	switch (whence) {
+	case SEEK_CUR:
+		offset += file->f_pos;
+	case SEEK_SET:
+		if (offset < 0)
+			break;
+		retval = offset;
+		if (offset != m->read_pos) {
+			while ((retval = traverse(m, offset)) == -EAGAIN)
+				;
+			if (retval) {
+				/* with extreme prejudice... */
+				file->f_pos = 0;
+				m->read_pos = 0;
+				m->version = 0;
+				m->index = 0;
+				m->count = 0;
+			} else {
+				m->read_pos = offset;
+				retval = file->f_pos = offset;
+			}
+		} else {
+			file->f_pos = offset;
+		}
+	}
+	file->f_version = m->version;
+	mutex_unlock(&m->lock);
+	return retval;
+}
+EXPORT_SYMBOL(seq_lseek);
+
+/**
+ *	seq_release -	free the structures associated with sequential file.
+ *	@file: file in question
+ *	@inode: its inode
+ *
+ *	Frees the structures associated with sequential file; can be used
+ *	as ->f_op->release() if you don't have private data to destroy.
+ */
+int seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	kvfree(m->buf);
+	kfree(m);
+	return 0;
+}
+EXPORT_SYMBOL(seq_release);
+
+/**
+ *	seq_escape -	print string into buffer, escaping some characters
+ *	@m:	target buffer
+ *	@s:	string
+ *	@esc:	set of characters that need escaping
+ *
+ *	Puts string into buffer, replacing each occurrence of character from
+ *	@esc with usual octal escape.  Returns 0 in case of success, -1 - in
+ *	case of overflow.
+ */
+int seq_escape(struct seq_file *m, const char *s, const char *esc)
+{
+	char *end = m->buf + m->size;
+        char *p;
+	char c;
+
+        for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
+		if (!strchr(esc, c)) {
+			*p++ = c;
+			continue;
+		}
+		if (p + 3 < end) {
+			*p++ = '\\';
+			*p++ = '0' + ((c & 0300) >> 6);
+			*p++ = '0' + ((c & 070) >> 3);
+			*p++ = '0' + (c & 07);
+			continue;
+		}
+		seq_set_overflow(m);
+		return -1;
+        }
+	m->count = p - m->buf;
+        return 0;
+}
+EXPORT_SYMBOL(seq_escape);
+
+int seq_vprintf(struct seq_file *m, const char *f, va_list args)
+{
+	int len;
+
+	if (m->count < m->size) {
+		len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return 0;
+		}
+	}
+	seq_set_overflow(m);
+	return -1;
+}
+EXPORT_SYMBOL(seq_vprintf);
+
+int seq_printf(struct seq_file *m, const char *f, ...)
+{
+	int ret;
+	va_list args;
+
+	va_start(args, f);
+	ret = seq_vprintf(m, f, args);
+	va_end(args);
+
+	return ret;
+}
+EXPORT_SYMBOL(seq_printf);
+
+/**
+ *	mangle_path -	mangle and copy path to buffer beginning
+ *	@s: buffer start
+ *	@p: beginning of path in above buffer
+ *	@esc: set of characters that need escaping
+ *
+ *      Copy the path from @p to @s, replacing each occurrence of character from
+ *      @esc with usual octal escape.
+ *      Returns pointer past last written character in @s, or NULL in case of
+ *      failure.
+ */
+char *mangle_path(char *s, const char *p, const char *esc)
+{
+	while (s <= p) {
+		char c = *p++;
+		if (!c) {
+			return s;
+		} else if (!strchr(esc, c)) {
+			*s++ = c;
+		} else if (s + 4 > p) {
+			break;
+		} else {
+			*s++ = '\\';
+			*s++ = '0' + ((c & 0300) >> 6);
+			*s++ = '0' + ((c & 070) >> 3);
+			*s++ = '0' + (c & 07);
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(mangle_path);
+
+/**
+ * seq_path - seq_file interface to print a pathname
+ * @m: the seq_file handle
+ * @path: the struct path to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path of 'path', as represented by the
+ * dentry / mnt pair in the path parameter.
+ */
+int seq_path(struct seq_file *m, const struct path *path, const char *esc)
+{
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -1;
+
+	if (size) {
+		char *p = d_path(path, buf, size);
+		if (!IS_ERR(p)) {
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
+		}
+	}
+	seq_commit(m, res);
+
+	return res;
+}
+EXPORT_SYMBOL(seq_path);
+
+/*
+ * Same as seq_path, but relative to supplied root.
+ */
+int seq_path_root(struct seq_file *m, const struct path *path,
+		  const struct path *root, const char *esc)
+{
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -ENAMETOOLONG;
+
+	if (size) {
+		char *p;
+
+		p = __d_path(path, root, buf, size);
+		if (!p)
+			return SEQ_SKIP;
+		res = PTR_ERR(p);
+		if (!IS_ERR(p)) {
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
+			else
+				res = -ENAMETOOLONG;
+		}
+	}
+	seq_commit(m, res);
+
+	return res < 0 && res != -ENAMETOOLONG ? res : 0;
+}
+
+/*
+ * returns the path of the 'dentry' from the root of its filesystem.
+ */
+int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
+{
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -1;
+
+	if (size) {
+		char *p = dentry_path(dentry, buf, size);
+		if (!IS_ERR(p)) {
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
+		}
+	}
+	seq_commit(m, res);
+
+	return res;
+}
+
+static void *single_start(struct seq_file *p, loff_t *pos)
+{
+	return NULL + (*pos == 0);
+}
+
+static void *single_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	++*pos;
+	return NULL;
+}
+
+static void single_stop(struct seq_file *p, void *v)
+{
+}
+
+int single_open(struct file *file, int (*show)(struct seq_file *, void *),
+		void *data)
+{
+	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
+	int res = -ENOMEM;
+
+	if (op) {
+		op->start = single_start;
+		op->next = single_next;
+		op->stop = single_stop;
+		op->show = show;
+		res = seq_open(file, op);
+		if (!res)
+			((struct seq_file *)file->private_data)->private = data;
+		else
+			kfree(op);
+	}
+	return res;
+}
+EXPORT_SYMBOL(single_open);
+
+int single_open_size(struct file *file, int (*show)(struct seq_file *, void *),
+		void *data, size_t size)
+{
+	char *buf = seq_buf_alloc(size);
+	int ret;
+	if (!buf)
+		return -ENOMEM;
+	ret = single_open(file, show, data);
+	if (ret) {
+		kvfree(buf);
+		return ret;
+	}
+	((struct seq_file *)file->private_data)->buf = buf;
+	((struct seq_file *)file->private_data)->size = size;
+	return 0;
+}
+EXPORT_SYMBOL(single_open_size);
+
+int single_release(struct inode *inode, struct file *file)
+{
+	const struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
+	int res = seq_release(inode, file);
+	kfree(op);
+	return res;
+}
+EXPORT_SYMBOL(single_release);
+
+int seq_release_private(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+
+	kfree(seq->private);
+	seq->private = NULL;
+	return seq_release(inode, file);
+}
+EXPORT_SYMBOL(seq_release_private);
+
+void *__seq_open_private(struct file *f, const struct seq_operations *ops,
+		int psize)
+{
+	int rc;
+	void *private;
+	struct seq_file *seq;
+
+	private = kzalloc(psize, GFP_KERNEL);
+	if (private == NULL)
+		goto out;
+
+	rc = seq_open(f, ops);
+	if (rc < 0)
+		goto out_free;
+
+	seq = f->private_data;
+	seq->private = private;
+	return private;
+
+out_free:
+	kfree(private);
+out:
+	return NULL;
+}
+EXPORT_SYMBOL(__seq_open_private);
+
+int seq_open_private(struct file *filp, const struct seq_operations *ops,
+		int psize)
+{
+	return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(seq_open_private);
+
+int seq_putc(struct seq_file *m, char c)
+{
+	if (m->count < m->size) {
+		m->buf[m->count++] = c;
+		return 0;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(seq_putc);
+
+int seq_puts(struct seq_file *m, const char *s)
+{
+	int len = strlen(s);
+	if (m->count + len < m->size) {
+		memcpy(m->buf + m->count, s, len);
+		m->count += len;
+		return 0;
+	}
+	seq_set_overflow(m);
+	return -1;
+}
+EXPORT_SYMBOL(seq_puts);
+
+/*
+ * A helper routine for putting decimal numbers without rich format of printf().
+ * only 'unsigned long long' is supported.
+ * This routine will put one byte delimiter + number into seq_file.
+ * This routine is very quick when you show lots of numbers.
+ * In usual cases, it will be better to use seq_printf(). It's easier to read.
+ */
+int seq_put_decimal_ull(struct seq_file *m, char delimiter,
+			unsigned long long num)
+{
+	int len;
+
+	if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
+		goto overflow;
+
+	if (delimiter)
+		m->buf[m->count++] = delimiter;
+
+	if (num < 10) {
+		m->buf[m->count++] = num + '0';
+		return 0;
+	}
+
+	len = num_to_str(m->buf + m->count, m->size - m->count, num);
+	if (!len)
+		goto overflow;
+	m->count += len;
+	return 0;
+overflow:
+	seq_set_overflow(m);
+	return -1;
+}
+EXPORT_SYMBOL(seq_put_decimal_ull);
+
+int seq_put_decimal_ll(struct seq_file *m, char delimiter,
+			long long num)
+{
+	if (num < 0) {
+		if (m->count + 3 >= m->size) {
+			seq_set_overflow(m);
+			return -1;
+		}
+		if (delimiter)
+			m->buf[m->count++] = delimiter;
+		num = -num;
+		delimiter = '-';
+	}
+	return seq_put_decimal_ull(m, delimiter, num);
+
+}
+EXPORT_SYMBOL(seq_put_decimal_ll);
+
+/**
+ * seq_write - write arbitrary data to buffer
+ * @seq: seq_file identifying the buffer to which data should be written
+ * @data: data address
+ * @len: number of bytes
+ *
+ * Return 0 on success, non-zero otherwise.
+ */
+int seq_write(struct seq_file *seq, const void *data, size_t len)
+{
+	if (seq->count + len < seq->size) {
+		memcpy(seq->buf + seq->count, data, len);
+		seq->count += len;
+		return 0;
+	}
+	seq_set_overflow(seq);
+	return -1;
+}
+EXPORT_SYMBOL(seq_write);
+
+/**
+ * seq_pad - write padding spaces to buffer
+ * @m: seq_file identifying the buffer to which data should be written
+ * @c: the byte to append after padding if non-zero
+ */
+void seq_pad(struct seq_file *m, char c)
+{
+	int size = m->pad_until - m->count;
+	if (size > 0)
+		seq_printf(m, "%*s", size, "");
+	if (c)
+		seq_putc(m, c);
+}
+EXPORT_SYMBOL(seq_pad);
+
+struct list_head *seq_list_start(struct list_head *head, loff_t pos)
+{
+	struct list_head *lh;
+
+	list_for_each(lh, head)
+		if (pos-- == 0)
+			return lh;
+
+	return NULL;
+}
+EXPORT_SYMBOL(seq_list_start);
+
+struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
+{
+	if (!pos)
+		return head;
+
+	return seq_list_start(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_list_start_head);
+
+struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
+{
+	struct list_head *lh;
+
+	lh = ((struct list_head *)v)->next;
+	++*ppos;
+	return lh == head ? NULL : lh;
+}
+EXPORT_SYMBOL(seq_list_next);
+
+/**
+ * seq_hlist_start - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
+{
+	struct hlist_node *node;
+
+	hlist_for_each(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start);
+
+/**
+ * seq_hlist_start_head - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ */
+struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head);
+
+/**
+ * seq_hlist_next - move to the next position of the hlist
+ * @v:    the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
+				  loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return head->first;
+	else
+		return node->next;
+}
+EXPORT_SYMBOL(seq_hlist_next);
+
+/**
+ * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
+				       loff_t pos)
+{
+	struct hlist_node *node;
+
+	__hlist_for_each_rcu(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_rcu);
+
+/**
+ * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
+					    loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head_rcu);
+
+/**
+ * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
+ * @v:    the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_next_rcu(void *v,
+				      struct hlist_head *head,
+				      loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return rcu_dereference(head->first);
+	else
+		return rcu_dereference(node->next);
+}
+EXPORT_SYMBOL(seq_hlist_next_rcu);
+
+/**
+ * seq_hlist_start_precpu - start an iteration of a percpu hlist array
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *
+seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
+{
+	struct hlist_node *node;
+
+	for_each_possible_cpu(*cpu) {
+		hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
+			if (pos-- == 0)
+				return node;
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_percpu);
+
+/**
+ * seq_hlist_next_percpu - move to the next position of the percpu hlist array
+ * @v:    pointer to current hlist_node
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *
+seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
+			int *cpu, loff_t *pos)
+{
+	struct hlist_node *node = v;
+
+	++*pos;
+
+	if (node->next)
+		return node->next;
+
+	for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
+	     *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
+		struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
+
+		if (!hlist_empty(bucket))
+			return bucket->first;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_next_percpu);
diff --git a/kernel/fs/signalfd.c b/kernel/fs/signalfd.c
new file mode 100644
index 000000000..7e412ad74
--- /dev/null
+++ b/kernel/fs/signalfd.c
@@ -0,0 +1,342 @@
+/*
+ *  fs/signalfd.c
+ *
+ *  Copyright (C) 2003  Linus Torvalds
+ *
+ *  Mon Mar 5, 2007: Davide Libenzi <davidel@xmailserver.org>
+ *      Changed ->read() to return a siginfo strcture instead of signal number.
+ *      Fixed locking in ->poll().
+ *      Added sighand-detach notification.
+ *      Added fd re-use in sys_signalfd() syscall.
+ *      Now using anonymous inode source.
+ *      Thanks to Oleg Nesterov for useful code review and suggestions.
+ *      More comments and suggestions from Arnd Bergmann.
+ *  Sat May 19, 2007: Davi E. M. Arnaut <davi@haxent.com.br>
+ *      Retrieve multiple signals with one read() call
+ *  Sun Jul 15, 2007: Davide Libenzi <davidel@xmailserver.org>
+ *      Attach to the sighand only during read() and poll().
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/signalfd.h>
+#include <linux/syscalls.h>
+#include <linux/proc_fs.h>
+#include <linux/compat.h>
+
+void signalfd_cleanup(struct sighand_struct *sighand)
+{
+	wait_queue_head_t *wqh = &sighand->signalfd_wqh;
+	/*
+	 * The lockless check can race with remove_wait_queue() in progress,
+	 * but in this case its caller should run under rcu_read_lock() and
+	 * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
+	 */
+	if (likely(!waitqueue_active(wqh)))
+		return;
+
+	/* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+	wake_up_poll(wqh, POLLHUP | POLLFREE);
+}
+
+struct signalfd_ctx {
+	sigset_t sigmask;
+};
+
+static int signalfd_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static unsigned int signalfd_poll(struct file *file, poll_table *wait)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	unsigned int events = 0;
+
+	poll_wait(file, &current->sighand->signalfd_wqh, wait);
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (next_signal(&current->pending, &ctx->sigmask) ||
+	    next_signal(&current->signal->shared_pending,
+			&ctx->sigmask))
+		events |= POLLIN;
+	spin_unlock_irq(&current->sighand->siglock);
+
+	return events;
+}
+
+/*
+ * Copied from copy_siginfo_to_user() in kernel/signal.c
+ */
+static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
+			     siginfo_t const *kinfo)
+{
+	long err;
+
+	BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
+
+	/*
+	 * Unused members should be zero ...
+	 */
+	err = __clear_user(uinfo, sizeof(*uinfo));
+
+	/*
+	 * If you change siginfo_t structure, please be sure
+	 * this code is fixed accordingly.
+	 */
+	err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo);
+	err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno);
+	err |= __put_user((short) kinfo->si_code, &uinfo->ssi_code);
+	switch (kinfo->si_code & __SI_MASK) {
+	case __SI_KILL:
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		break;
+	case __SI_TIMER:
+		 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
+		 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
+		 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	case __SI_POLL:
+		err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
+		err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd);
+		break;
+	case __SI_FAULT:
+		err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
+#ifdef __ARCH_SI_TRAPNO
+		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
+#endif
+#ifdef BUS_MCEERR_AO
+		/* 
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
+#endif
+		break;
+	case __SI_CHLD:
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user(kinfo->si_status, &uinfo->ssi_status);
+		err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime);
+		err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime);
+		break;
+	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	default:
+		/*
+		 * This case catches also the signals queued by sigqueue().
+		 */
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	}
+
+	return err ? -EFAULT: sizeof(*uinfo);
+}
+
+static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
+				int nonblock)
+{
+	ssize_t ret;
+	DECLARE_WAITQUEUE(wait, current);
+
+	spin_lock_irq(&current->sighand->siglock);
+	ret = dequeue_signal(current, &ctx->sigmask, info);
+	switch (ret) {
+	case 0:
+		if (!nonblock)
+			break;
+		ret = -EAGAIN;
+	default:
+		spin_unlock_irq(&current->sighand->siglock);
+		return ret;
+	}
+
+	add_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		ret = dequeue_signal(current, &ctx->sigmask, info);
+		if (ret != 0)
+			break;
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		spin_unlock_irq(&current->sighand->siglock);
+		schedule();
+		spin_lock_irq(&current->sighand->siglock);
+	}
+	spin_unlock_irq(&current->sighand->siglock);
+
+	remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	__set_current_state(TASK_RUNNING);
+
+	return ret;
+}
+
+/*
+ * Returns a multiple of the size of a "struct signalfd_siginfo", or a negative
+ * error code. The "count" parameter must be at least the size of a
+ * "struct signalfd_siginfo".
+ */
+static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
+			     loff_t *ppos)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	struct signalfd_siginfo __user *siginfo;
+	int nonblock = file->f_flags & O_NONBLOCK;
+	ssize_t ret, total = 0;
+	siginfo_t info;
+
+	count /= sizeof(struct signalfd_siginfo);
+	if (!count)
+		return -EINVAL;
+
+	siginfo = (struct signalfd_siginfo __user *) buf;
+	do {
+		ret = signalfd_dequeue(ctx, &info, nonblock);
+		if (unlikely(ret <= 0))
+			break;
+		ret = signalfd_copyinfo(siginfo, &info);
+		if (ret < 0)
+			break;
+		siginfo++;
+		total += ret;
+		nonblock = 1;
+	} while (--count);
+
+	return total ? total: ret;
+}
+
+#ifdef CONFIG_PROC_FS
+static void signalfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct signalfd_ctx *ctx = f->private_data;
+	sigset_t sigmask;
+
+	sigmask = ctx->sigmask;
+	signotset(&sigmask);
+	render_sigset_t(m, "sigmask:\t", &sigmask);
+}
+#endif
+
+static const struct file_operations signalfd_fops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= signalfd_show_fdinfo,
+#endif
+	.release	= signalfd_release,
+	.poll		= signalfd_poll,
+	.read		= signalfd_read,
+	.llseek		= noop_llseek,
+};
+
+SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask, int, flags)
+{
+	sigset_t sigmask;
+	struct signalfd_ctx *ctx;
+
+	/* Check the SFD_* constants for consistency.  */
+	BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
+		return -EINVAL;
+
+	if (sizemask != sizeof(sigset_t) ||
+	    copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
+		return -EINVAL;
+	sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+	signotset(&sigmask);
+
+	if (ufd == -1) {
+		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+		if (!ctx)
+			return -ENOMEM;
+
+		ctx->sigmask = sigmask;
+
+		/*
+		 * When we call this, the initialization must be complete, since
+		 * anon_inode_getfd() will install the fd.
+		 */
+		ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
+				       O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
+		if (ufd < 0)
+			kfree(ctx);
+	} else {
+		struct fd f = fdget(ufd);
+		if (!f.file)
+			return -EBADF;
+		ctx = f.file->private_data;
+		if (f.file->f_op != &signalfd_fops) {
+			fdput(f);
+			return -EINVAL;
+		}
+		spin_lock_irq(&current->sighand->siglock);
+		ctx->sigmask = sigmask;
+		spin_unlock_irq(&current->sighand->siglock);
+
+		wake_up(&current->sighand->signalfd_wqh);
+		fdput(f);
+	}
+
+	return ufd;
+}
+
+SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask)
+{
+	return sys_signalfd4(ufd, user_mask, sizemask, 0);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
+		     const compat_sigset_t __user *,sigmask,
+		     compat_size_t, sigsetsize,
+		     int, flags)
+{
+	compat_sigset_t ss32;
+	sigset_t tmp;
+	sigset_t __user *ksigmask;
+
+	if (sigsetsize != sizeof(compat_sigset_t))
+		return -EINVAL;
+	if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		return -EFAULT;
+	sigset_from_compat(&tmp, &ss32);
+	ksigmask = compat_alloc_user_space(sizeof(sigset_t));
+	if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
+		return -EFAULT;
+
+	return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
+}
+
+COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd,
+		     const compat_sigset_t __user *,sigmask,
+		     compat_size_t, sigsetsize)
+{
+	return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
+}
+#endif
diff --git a/kernel/fs/splice.c b/kernel/fs/splice.c
new file mode 100644
index 000000000..bfe62ae40
--- /dev/null
+++ b/kernel/fs/splice.c
@@ -0,0 +1,2034 @@
+/*
+ * "splice": joining two ropes together by interweaving their strands.
+ *
+ * This is the "extended pipe" functionality, where a pipe is used as
+ * an arbitrary in-memory buffer. Think of a pipe as a small kernel
+ * buffer that you can use to transfer data from one end to the other.
+ *
+ * The traditional unix read/write is extended with a "splice()" operation
+ * that transfers data buffers to or from a pipe buffer.
+ *
+ * Named by Larry McVoy, original implementation from Linus, extended by
+ * Jens to support splicing to files, network, direct splicing, etc and
+ * fixing lots of bugs.
+ *
+ * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
+ * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
+ * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
+ *
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/splice.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/uio.h>
+#include <linux/security.h>
+#include <linux/gfp.h>
+#include <linux/socket.h>
+#include <linux/compat.h>
+#include "internal.h"
+
+/*
+ * Attempt to steal a page from a pipe buffer. This should perhaps go into
+ * a vm helper function, it's already simplified quite a bit by the
+ * addition of remove_mapping(). If success is returned, the caller may
+ * attempt to reuse this page for another destination.
+ */
+static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+	struct address_space *mapping;
+
+	lock_page(page);
+
+	mapping = page_mapping(page);
+	if (mapping) {
+		WARN_ON(!PageUptodate(page));
+
+		/*
+		 * At least for ext2 with nobh option, we need to wait on
+		 * writeback completing on this page, since we'll remove it
+		 * from the pagecache.  Otherwise truncate wont wait on the
+		 * page, allowing the disk blocks to be reused by someone else
+		 * before we actually wrote our data to them. fs corruption
+		 * ensues.
+		 */
+		wait_on_page_writeback(page);
+
+		if (page_has_private(page) &&
+		    !try_to_release_page(page, GFP_KERNEL))
+			goto out_unlock;
+
+		/*
+		 * If we succeeded in removing the mapping, set LRU flag
+		 * and return good.
+		 */
+		if (remove_mapping(mapping, page)) {
+			buf->flags |= PIPE_BUF_FLAG_LRU;
+			return 0;
+		}
+	}
+
+	/*
+	 * Raced with truncate or failed to remove page from current
+	 * address space, unlock and return failure.
+	 */
+out_unlock:
+	unlock_page(page);
+	return 1;
+}
+
+static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
+					struct pipe_buffer *buf)
+{
+	page_cache_release(buf->page);
+	buf->flags &= ~PIPE_BUF_FLAG_LRU;
+}
+
+/*
+ * Check whether the contents of buf is OK to access. Since the content
+ * is a page cache page, IO may be in flight.
+ */
+static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
+				       struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+	int err;
+
+	if (!PageUptodate(page)) {
+		lock_page(page);
+
+		/*
+		 * Page got truncated/unhashed. This will cause a 0-byte
+		 * splice, if this is the first page.
+		 */
+		if (!page->mapping) {
+			err = -ENODATA;
+			goto error;
+		}
+
+		/*
+		 * Uh oh, read-error from disk.
+		 */
+		if (!PageUptodate(page)) {
+			err = -EIO;
+			goto error;
+		}
+
+		/*
+		 * Page is ok afterall, we are done.
+		 */
+		unlock_page(page);
+	}
+
+	return 0;
+error:
+	unlock_page(page);
+	return err;
+}
+
+const struct pipe_buf_operations page_cache_pipe_buf_ops = {
+	.can_merge = 0,
+	.confirm = page_cache_pipe_buf_confirm,
+	.release = page_cache_pipe_buf_release,
+	.steal = page_cache_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
+		return 1;
+
+	buf->flags |= PIPE_BUF_FLAG_LRU;
+	return generic_pipe_buf_steal(pipe, buf);
+}
+
+static const struct pipe_buf_operations user_page_pipe_buf_ops = {
+	.can_merge = 0,
+	.confirm = generic_pipe_buf_confirm,
+	.release = page_cache_pipe_buf_release,
+	.steal = user_page_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
+{
+	smp_mb();
+	if (waitqueue_active(&pipe->wait))
+		wake_up_interruptible(&pipe->wait);
+	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+}
+
+/**
+ * splice_to_pipe - fill passed data into a pipe
+ * @pipe:	pipe to fill
+ * @spd:	data to fill
+ *
+ * Description:
+ *    @spd contains a map of pages and len/offset tuples, along with
+ *    the struct pipe_buf_operations associated with these pages. This
+ *    function will link that data to the pipe.
+ *
+ */
+ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+		       struct splice_pipe_desc *spd)
+{
+	unsigned int spd_pages = spd->nr_pages;
+	int ret, do_wakeup, page_nr;
+
+	ret = 0;
+	do_wakeup = 0;
+	page_nr = 0;
+
+	pipe_lock(pipe);
+
+	for (;;) {
+		if (!pipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		if (pipe->nrbufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+			struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+			buf->page = spd->pages[page_nr];
+			buf->offset = spd->partial[page_nr].offset;
+			buf->len = spd->partial[page_nr].len;
+			buf->private = spd->partial[page_nr].private;
+			buf->ops = spd->ops;
+			if (spd->flags & SPLICE_F_GIFT)
+				buf->flags |= PIPE_BUF_FLAG_GIFT;
+
+			pipe->nrbufs++;
+			page_nr++;
+			ret += buf->len;
+
+			if (pipe->files)
+				do_wakeup = 1;
+
+			if (!--spd->nr_pages)
+				break;
+			if (pipe->nrbufs < pipe->buffers)
+				continue;
+
+			break;
+		}
+
+		if (spd->flags & SPLICE_F_NONBLOCK) {
+			if (!ret)
+				ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (do_wakeup) {
+			smp_mb();
+			if (waitqueue_active(&pipe->wait))
+				wake_up_interruptible_sync(&pipe->wait);
+			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+			do_wakeup = 0;
+		}
+
+		pipe->waiting_writers++;
+		pipe_wait(pipe);
+		pipe->waiting_writers--;
+	}
+
+	pipe_unlock(pipe);
+
+	if (do_wakeup)
+		wakeup_pipe_readers(pipe);
+
+	while (page_nr < spd_pages)
+		spd->spd_release(spd, page_nr++);
+
+	return ret;
+}
+
+void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
+{
+	page_cache_release(spd->pages[i]);
+}
+
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+	unsigned int buffers = ACCESS_ONCE(pipe->buffers);
+
+	spd->nr_pages_max = buffers;
+	if (buffers <= PIPE_DEF_BUFFERS)
+		return 0;
+
+	spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
+	spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+	if (spd->pages && spd->partial)
+		return 0;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+	return -ENOMEM;
+}
+
+void splice_shrink_spd(struct splice_pipe_desc *spd)
+{
+	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
+		return;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+}
+
+static int
+__generic_file_splice_read(struct file *in, loff_t *ppos,
+			   struct pipe_inode_info *pipe, size_t len,
+			   unsigned int flags)
+{
+	struct address_space *mapping = in->f_mapping;
+	unsigned int loff, nr_pages, req_pages;
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct page *page;
+	pgoff_t index, end_index;
+	loff_t isize;
+	int error, page_nr;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.nr_pages_max = PIPE_DEF_BUFFERS,
+		.flags = flags,
+		.ops = &page_cache_pipe_buf_ops,
+		.spd_release = spd_release_page,
+	};
+
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	loff = *ppos & ~PAGE_CACHE_MASK;
+	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	nr_pages = min(req_pages, spd.nr_pages_max);
+
+	/*
+	 * Lookup the (hopefully) full range of pages we need.
+	 */
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
+	index += spd.nr_pages;
+
+	/*
+	 * If find_get_pages_contig() returned fewer pages than we needed,
+	 * readahead/allocate the rest and fill in the holes.
+	 */
+	if (spd.nr_pages < nr_pages)
+		page_cache_sync_readahead(mapping, &in->f_ra, in,
+				index, req_pages - spd.nr_pages);
+
+	error = 0;
+	while (spd.nr_pages < nr_pages) {
+		/*
+		 * Page could be there, find_get_pages_contig() breaks on
+		 * the first hole.
+		 */
+		page = find_get_page(mapping, index);
+		if (!page) {
+			/*
+			 * page didn't exist, allocate one.
+			 */
+			page = page_cache_alloc_cold(mapping);
+			if (!page)
+				break;
+
+			error = add_to_page_cache_lru(page, mapping, index,
+						GFP_KERNEL);
+			if (unlikely(error)) {
+				page_cache_release(page);
+				if (error == -EEXIST)
+					continue;
+				break;
+			}
+			/*
+			 * add_to_page_cache() locks the page, unlock it
+			 * to avoid convoluting the logic below even more.
+			 */
+			unlock_page(page);
+		}
+
+		spd.pages[spd.nr_pages++] = page;
+		index++;
+	}
+
+	/*
+	 * Now loop over the map and see if we need to start IO on any
+	 * pages, fill in the partial map, etc.
+	 */
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	nr_pages = spd.nr_pages;
+	spd.nr_pages = 0;
+	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
+		unsigned int this_len;
+
+		if (!len)
+			break;
+
+		/*
+		 * this_len is the max we'll use from this page
+		 */
+		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+		page = spd.pages[page_nr];
+
+		if (PageReadahead(page))
+			page_cache_async_readahead(mapping, &in->f_ra, in,
+					page, index, req_pages - page_nr);
+
+		/*
+		 * If the page isn't uptodate, we may need to start io on it
+		 */
+		if (!PageUptodate(page)) {
+			lock_page(page);
+
+			/*
+			 * Page was truncated, or invalidated by the
+			 * filesystem.  Redo the find/create, but this time the
+			 * page is kept locked, so there's no chance of another
+			 * race with truncate/invalidate.
+			 */
+			if (!page->mapping) {
+				unlock_page(page);
+				page = find_or_create_page(mapping, index,
+						mapping_gfp_mask(mapping));
+
+				if (!page) {
+					error = -ENOMEM;
+					break;
+				}
+				page_cache_release(spd.pages[page_nr]);
+				spd.pages[page_nr] = page;
+			}
+			/*
+			 * page was already under io and is now done, great
+			 */
+			if (PageUptodate(page)) {
+				unlock_page(page);
+				goto fill_it;
+			}
+
+			/*
+			 * need to read in the page
+			 */
+			error = mapping->a_ops->readpage(in, page);
+			if (unlikely(error)) {
+				/*
+				 * We really should re-lookup the page here,
+				 * but it complicates things a lot. Instead
+				 * lets just do what we already stored, and
+				 * we'll get it the next time we are called.
+				 */
+				if (error == AOP_TRUNCATED_PAGE)
+					error = 0;
+
+				break;
+			}
+		}
+fill_it:
+		/*
+		 * i_size must be checked after PageUptodate.
+		 */
+		isize = i_size_read(mapping->host);
+		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+		if (unlikely(!isize || index > end_index))
+			break;
+
+		/*
+		 * if this is the last page, see if we need to shrink
+		 * the length and stop
+		 */
+		if (end_index == index) {
+			unsigned int plen;
+
+			/*
+			 * max good bytes in this page
+			 */
+			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			if (plen <= loff)
+				break;
+
+			/*
+			 * force quit after adding this page
+			 */
+			this_len = min(this_len, plen - loff);
+			len = this_len;
+		}
+
+		spd.partial[page_nr].offset = loff;
+		spd.partial[page_nr].len = this_len;
+		len -= this_len;
+		loff = 0;
+		spd.nr_pages++;
+		index++;
+	}
+
+	/*
+	 * Release any pages at the end, if we quit early. 'page_nr' is how far
+	 * we got, 'nr_pages' is how many pages are in the map.
+	 */
+	while (page_nr < nr_pages)
+		page_cache_release(spd.pages[page_nr++]);
+	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+
+	if (spd.nr_pages)
+		error = splice_to_pipe(pipe, &spd);
+
+	splice_shrink_spd(&spd);
+	return error;
+}
+
+/**
+ * generic_file_splice_read - splice data from file to a pipe
+ * @in:		file to splice from
+ * @ppos:	position in @in
+ * @pipe:	pipe to splice to
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will read pages from given file and fill them into a pipe. Can be
+ *    used as long as the address_space operations for the source implements
+ *    a readpage() hook.
+ *
+ */
+ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
+{
+	loff_t isize, left;
+	int ret;
+
+	if (IS_DAX(in->f_mapping->host))
+		return default_file_splice_read(in, ppos, pipe, len, flags);
+
+	isize = i_size_read(in->f_mapping->host);
+	if (unlikely(*ppos >= isize))
+		return 0;
+
+	left = isize - *ppos;
+	if (unlikely(left < len))
+		len = left;
+
+	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
+	if (ret > 0) {
+		*ppos += ret;
+		file_accessed(in);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(generic_file_splice_read);
+
+static const struct pipe_buf_operations default_pipe_buf_ops = {
+	.can_merge = 0,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+/* Pipe buffer operations for a socket and similar. */
+const struct pipe_buf_operations nosteal_pipe_buf_ops = {
+	.can_merge = 0,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = generic_pipe_buf_nosteal,
+	.get = generic_pipe_buf_get,
+};
+EXPORT_SYMBOL(nosteal_pipe_buf_ops);
+
+static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+			    unsigned long vlen, loff_t offset)
+{
+	mm_segment_t old_fs;
+	loff_t pos = offset;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+	set_fs(old_fs);
+
+	return res;
+}
+
+ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+			    loff_t pos)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_write(file, (__force const char __user *)buf, count, &pos);
+	set_fs(old_fs);
+
+	return res;
+}
+EXPORT_SYMBOL(kernel_write);
+
+ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
+{
+	unsigned int nr_pages;
+	unsigned int nr_freed;
+	size_t offset;
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
+	ssize_t res;
+	size_t this_len;
+	int error;
+	int i;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.nr_pages_max = PIPE_DEF_BUFFERS,
+		.flags = flags,
+		.ops = &default_pipe_buf_ops,
+		.spd_release = spd_release_page,
+	};
+
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	res = -ENOMEM;
+	vec = __vec;
+	if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
+		vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
+		if (!vec)
+			goto shrink_ret;
+	}
+
+	offset = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
+		struct page *page;
+
+		page = alloc_page(GFP_USER);
+		error = -ENOMEM;
+		if (!page)
+			goto err;
+
+		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+		vec[i].iov_base = (void __user *) page_address(page);
+		vec[i].iov_len = this_len;
+		spd.pages[i] = page;
+		spd.nr_pages++;
+		len -= this_len;
+		offset = 0;
+	}
+
+	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
+	if (res < 0) {
+		error = res;
+		goto err;
+	}
+
+	error = 0;
+	if (!res)
+		goto err;
+
+	nr_freed = 0;
+	for (i = 0; i < spd.nr_pages; i++) {
+		this_len = min_t(size_t, vec[i].iov_len, res);
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = this_len;
+		if (!this_len) {
+			__free_page(spd.pages[i]);
+			spd.pages[i] = NULL;
+			nr_freed++;
+		}
+		res -= this_len;
+	}
+	spd.nr_pages -= nr_freed;
+
+	res = splice_to_pipe(pipe, &spd);
+	if (res > 0)
+		*ppos += res;
+
+shrink_ret:
+	if (vec != __vec)
+		kfree(vec);
+	splice_shrink_spd(&spd);
+	return res;
+
+err:
+	for (i = 0; i < spd.nr_pages; i++)
+		__free_page(spd.pages[i]);
+
+	res = error;
+	goto shrink_ret;
+}
+EXPORT_SYMBOL(default_file_splice_read);
+
+/*
+ * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
+ * using sendpage(). Return the number of bytes sent.
+ */
+static int pipe_to_sendpage(struct pipe_inode_info *pipe,
+			    struct pipe_buffer *buf, struct splice_desc *sd)
+{
+	struct file *file = sd->u.file;
+	loff_t pos = sd->pos;
+	int more;
+
+	if (!likely(file->f_op->sendpage))
+		return -EINVAL;
+
+	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
+
+	if (sd->len < sd->total_len && pipe->nrbufs > 1)
+		more |= MSG_SENDPAGE_NOTLAST;
+
+	return file->f_op->sendpage(file, buf->page, buf->offset,
+				    sd->len, &pos, more);
+}
+
+static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
+{
+	smp_mb();
+	if (waitqueue_active(&pipe->wait))
+		wake_up_interruptible(&pipe->wait);
+	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+}
+
+/**
+ * splice_from_pipe_feed - feed available data from a pipe to a file
+ * @pipe:	pipe to splice from
+ * @sd:		information to @actor
+ * @actor:	handler that splices the data
+ *
+ * Description:
+ *    This function loops over the pipe and calls @actor to do the
+ *    actual moving of a single struct pipe_buffer to the desired
+ *    destination.  It returns when there's no more buffers left in
+ *    the pipe or if the requested number of bytes (@sd->total_len)
+ *    have been copied.  It returns a positive number (one) if the
+ *    pipe needs to be filled with more data, zero if the required
+ *    number of bytes have been copied and -errno on error.
+ *
+ *    This, together with splice_from_pipe_{begin,end,next}, may be
+ *    used to implement the functionality of __splice_from_pipe() when
+ *    locking is required around copying the pipe buffers to the
+ *    destination.
+ */
+static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
+			  splice_actor *actor)
+{
+	int ret;
+
+	while (pipe->nrbufs) {
+		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+		const struct pipe_buf_operations *ops = buf->ops;
+
+		sd->len = buf->len;
+		if (sd->len > sd->total_len)
+			sd->len = sd->total_len;
+
+		ret = buf->ops->confirm(pipe, buf);
+		if (unlikely(ret)) {
+			if (ret == -ENODATA)
+				ret = 0;
+			return ret;
+		}
+
+		ret = actor(pipe, buf, sd);
+		if (ret <= 0)
+			return ret;
+
+		buf->offset += ret;
+		buf->len -= ret;
+
+		sd->num_spliced += ret;
+		sd->len -= ret;
+		sd->pos += ret;
+		sd->total_len -= ret;
+
+		if (!buf->len) {
+			buf->ops = NULL;
+			ops->release(pipe, buf);
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+			pipe->nrbufs--;
+			if (pipe->files)
+				sd->need_wakeup = true;
+		}
+
+		if (!sd->total_len)
+			return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * splice_from_pipe_next - wait for some data to splice from
+ * @pipe:	pipe to splice from
+ * @sd:		information about the splice operation
+ *
+ * Description:
+ *    This function will wait for some data and return a positive
+ *    value (one) if pipe buffers are available.  It will return zero
+ *    or -errno if no more data needs to be spliced.
+ */
+static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
+{
+	while (!pipe->nrbufs) {
+		if (!pipe->writers)
+			return 0;
+
+		if (!pipe->waiting_writers && sd->num_spliced)
+			return 0;
+
+		if (sd->flags & SPLICE_F_NONBLOCK)
+			return -EAGAIN;
+
+		if (signal_pending(current))
+			return -ERESTARTSYS;
+
+		if (sd->need_wakeup) {
+			wakeup_pipe_writers(pipe);
+			sd->need_wakeup = false;
+		}
+
+		pipe_wait(pipe);
+	}
+
+	return 1;
+}
+
+/**
+ * splice_from_pipe_begin - start splicing from pipe
+ * @sd:		information about the splice operation
+ *
+ * Description:
+ *    This function should be called before a loop containing
+ *    splice_from_pipe_next() and splice_from_pipe_feed() to
+ *    initialize the necessary fields of @sd.
+ */
+static void splice_from_pipe_begin(struct splice_desc *sd)
+{
+	sd->num_spliced = 0;
+	sd->need_wakeup = false;
+}
+
+/**
+ * splice_from_pipe_end - finish splicing from pipe
+ * @pipe:	pipe to splice from
+ * @sd:		information about the splice operation
+ *
+ * Description:
+ *    This function will wake up pipe writers if necessary.  It should
+ *    be called after a loop containing splice_from_pipe_next() and
+ *    splice_from_pipe_feed().
+ */
+static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
+{
+	if (sd->need_wakeup)
+		wakeup_pipe_writers(pipe);
+}
+
+/**
+ * __splice_from_pipe - splice data from a pipe to given actor
+ * @pipe:	pipe to splice from
+ * @sd:		information to @actor
+ * @actor:	handler that splices the data
+ *
+ * Description:
+ *    This function does little more than loop over the pipe and call
+ *    @actor to do the actual moving of a single struct pipe_buffer to
+ *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
+ *    pipe_to_user.
+ *
+ */
+ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
+			   splice_actor *actor)
+{
+	int ret;
+
+	splice_from_pipe_begin(sd);
+	do {
+		ret = splice_from_pipe_next(pipe, sd);
+		if (ret > 0)
+			ret = splice_from_pipe_feed(pipe, sd, actor);
+	} while (ret > 0);
+	splice_from_pipe_end(pipe, sd);
+
+	return sd->num_spliced ? sd->num_spliced : ret;
+}
+EXPORT_SYMBOL(__splice_from_pipe);
+
+/**
+ * splice_from_pipe - splice data from a pipe to a file
+ * @pipe:	pipe to splice from
+ * @out:	file to splice to
+ * @ppos:	position in @out
+ * @len:	how many bytes to splice
+ * @flags:	splice modifier flags
+ * @actor:	handler that splices the data
+ *
+ * Description:
+ *    See __splice_from_pipe. This function locks the pipe inode,
+ *    otherwise it's identical to __splice_from_pipe().
+ *
+ */
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags,
+			 splice_actor *actor)
+{
+	ssize_t ret;
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags,
+		.pos = *ppos,
+		.u.file = out,
+	};
+
+	pipe_lock(pipe);
+	ret = __splice_from_pipe(pipe, &sd, actor);
+	pipe_unlock(pipe);
+
+	return ret;
+}
+
+/**
+ * iter_file_splice_write - splice data from a pipe to a file
+ * @pipe:	pipe info
+ * @out:	file to write to
+ * @ppos:	position in @out
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will either move or copy pages (determined by @flags options) from
+ *    the given pipe inode to the given file.
+ *    This one is ->write_iter-based.
+ *
+ */
+ssize_t
+iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
+			  loff_t *ppos, size_t len, unsigned int flags)
+{
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags,
+		.pos = *ppos,
+		.u.file = out,
+	};
+	int nbufs = pipe->buffers;
+	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
+					GFP_KERNEL);
+	ssize_t ret;
+
+	if (unlikely(!array))
+		return -ENOMEM;
+
+	pipe_lock(pipe);
+
+	splice_from_pipe_begin(&sd);
+	while (sd.total_len) {
+		struct iov_iter from;
+		size_t left;
+		int n, idx;
+
+		ret = splice_from_pipe_next(pipe, &sd);
+		if (ret <= 0)
+			break;
+
+		if (unlikely(nbufs < pipe->buffers)) {
+			kfree(array);
+			nbufs = pipe->buffers;
+			array = kcalloc(nbufs, sizeof(struct bio_vec),
+					GFP_KERNEL);
+			if (!array) {
+				ret = -ENOMEM;
+				break;
+			}
+		}
+
+		/* build the vector */
+		left = sd.total_len;
+		for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
+			struct pipe_buffer *buf = pipe->bufs + idx;
+			size_t this_len = buf->len;
+
+			if (this_len > left)
+				this_len = left;
+
+			if (idx == pipe->buffers - 1)
+				idx = -1;
+
+			ret = buf->ops->confirm(pipe, buf);
+			if (unlikely(ret)) {
+				if (ret == -ENODATA)
+					ret = 0;
+				goto done;
+			}
+
+			array[n].bv_page = buf->page;
+			array[n].bv_len = this_len;
+			array[n].bv_offset = buf->offset;
+			left -= this_len;
+		}
+
+		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
+			      sd.total_len - left);
+		ret = vfs_iter_write(out, &from, &sd.pos);
+		if (ret <= 0)
+			break;
+
+		sd.num_spliced += ret;
+		sd.total_len -= ret;
+		*ppos = sd.pos;
+
+		/* dismiss the fully eaten buffers, adjust the partial one */
+		while (ret) {
+			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+			if (ret >= buf->len) {
+				const struct pipe_buf_operations *ops = buf->ops;
+				ret -= buf->len;
+				buf->len = 0;
+				buf->ops = NULL;
+				ops->release(pipe, buf);
+				pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+				pipe->nrbufs--;
+				if (pipe->files)
+					sd.need_wakeup = true;
+			} else {
+				buf->offset += ret;
+				buf->len -= ret;
+				ret = 0;
+			}
+		}
+	}
+done:
+	kfree(array);
+	splice_from_pipe_end(pipe, &sd);
+
+	pipe_unlock(pipe);
+
+	if (sd.num_spliced)
+		ret = sd.num_spliced;
+
+	return ret;
+}
+
+EXPORT_SYMBOL(iter_file_splice_write);
+
+static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			  struct splice_desc *sd)
+{
+	int ret;
+	void *data;
+	loff_t tmp = sd->pos;
+
+	data = kmap(buf->page);
+	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
+	kunmap(buf->page);
+
+	return ret;
+}
+
+static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
+					 struct file *out, loff_t *ppos,
+					 size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
+	if (ret > 0)
+		*ppos += ret;
+
+	return ret;
+}
+
+/**
+ * generic_splice_sendpage - splice data from a pipe to a socket
+ * @pipe:	pipe to splice from
+ * @out:	socket to write to
+ * @ppos:	position in @out
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will send @len bytes from the pipe to a network socket. No data copying
+ *    is involved.
+ *
+ */
+ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
+				loff_t *ppos, size_t len, unsigned int flags)
+{
+	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
+}
+
+EXPORT_SYMBOL(generic_splice_sendpage);
+
+/*
+ * Attempt to initiate a splice from pipe to file.
+ */
+static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+			   loff_t *ppos, size_t len, unsigned int flags)
+{
+	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+				loff_t *, size_t, unsigned int);
+
+	if (out->f_op->splice_write)
+		splice_write = out->f_op->splice_write;
+	else
+		splice_write = default_file_splice_write;
+
+	return splice_write(pipe, out, ppos, len, flags);
+}
+
+/*
+ * Attempt to initiate a splice from a file to a pipe.
+ */
+static long do_splice_to(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags)
+{
+	ssize_t (*splice_read)(struct file *, loff_t *,
+			       struct pipe_inode_info *, size_t, unsigned int);
+	int ret;
+
+	if (unlikely(!(in->f_mode & FMODE_READ)))
+		return -EBADF;
+
+	ret = rw_verify_area(READ, in, ppos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	if (in->f_op->splice_read)
+		splice_read = in->f_op->splice_read;
+	else
+		splice_read = default_file_splice_read;
+
+	return splice_read(in, ppos, pipe, len, flags);
+}
+
+/**
+ * splice_direct_to_actor - splices data directly between two non-pipes
+ * @in:		file to splice from
+ * @sd:		actor information on where to splice to
+ * @actor:	handles the data splicing
+ *
+ * Description:
+ *    This is a special case helper to splice directly between two
+ *    points, without requiring an explicit pipe. Internally an allocated
+ *    pipe is cached in the process, and reused during the lifetime of
+ *    that process.
+ *
+ */
+ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
+			       splice_direct_actor *actor)
+{
+	struct pipe_inode_info *pipe;
+	long ret, bytes;
+	umode_t i_mode;
+	size_t len;
+	int i, flags, more;
+
+	/*
+	 * We require the input being a regular file, as we don't want to
+	 * randomly drop data for eg socket -> socket splicing. Use the
+	 * piped splicing for that!
+	 */
+	i_mode = file_inode(in)->i_mode;
+	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
+		return -EINVAL;
+
+	/*
+	 * neither in nor out is a pipe, setup an internal pipe attached to
+	 * 'out' and transfer the wanted data from 'in' to 'out' through that
+	 */
+	pipe = current->splice_pipe;
+	if (unlikely(!pipe)) {
+		pipe = alloc_pipe_info();
+		if (!pipe)
+			return -ENOMEM;
+
+		/*
+		 * We don't have an immediate reader, but we'll read the stuff
+		 * out of the pipe right after the splice_to_pipe(). So set
+		 * PIPE_READERS appropriately.
+		 */
+		pipe->readers = 1;
+
+		current->splice_pipe = pipe;
+	}
+
+	/*
+	 * Do the splice.
+	 */
+	ret = 0;
+	bytes = 0;
+	len = sd->total_len;
+	flags = sd->flags;
+
+	/*
+	 * Don't block on output, we have to drain the direct pipe.
+	 */
+	sd->flags &= ~SPLICE_F_NONBLOCK;
+	more = sd->flags & SPLICE_F_MORE;
+
+	while (len) {
+		size_t read_len;
+		loff_t pos = sd->pos, prev_pos = pos;
+
+		ret = do_splice_to(in, &pos, pipe, len, flags);
+		if (unlikely(ret <= 0))
+			goto out_release;
+
+		read_len = ret;
+		sd->total_len = read_len;
+
+		/*
+		 * If more data is pending, set SPLICE_F_MORE
+		 * If this is the last data and SPLICE_F_MORE was not set
+		 * initially, clears it.
+		 */
+		if (read_len < len)
+			sd->flags |= SPLICE_F_MORE;
+		else if (!more)
+			sd->flags &= ~SPLICE_F_MORE;
+		/*
+		 * NOTE: nonblocking mode only applies to the input. We
+		 * must not do the output in nonblocking mode as then we
+		 * could get stuck data in the internal pipe:
+		 */
+		ret = actor(pipe, sd);
+		if (unlikely(ret <= 0)) {
+			sd->pos = prev_pos;
+			goto out_release;
+		}
+
+		bytes += ret;
+		len -= ret;
+		sd->pos = pos;
+
+		if (ret < read_len) {
+			sd->pos = prev_pos + ret;
+			goto out_release;
+		}
+	}
+
+done:
+	pipe->nrbufs = pipe->curbuf = 0;
+	file_accessed(in);
+	return bytes;
+
+out_release:
+	/*
+	 * If we did an incomplete transfer we must release
+	 * the pipe buffers in question:
+	 */
+	for (i = 0; i < pipe->buffers; i++) {
+		struct pipe_buffer *buf = pipe->bufs + i;
+
+		if (buf->ops) {
+			buf->ops->release(pipe, buf);
+			buf->ops = NULL;
+		}
+	}
+
+	if (!bytes)
+		bytes = ret;
+
+	goto done;
+}
+EXPORT_SYMBOL(splice_direct_to_actor);
+
+static int direct_splice_actor(struct pipe_inode_info *pipe,
+			       struct splice_desc *sd)
+{
+	struct file *file = sd->u.file;
+
+	return do_splice_from(pipe, file, sd->opos, sd->total_len,
+			      sd->flags);
+}
+
+/**
+ * do_splice_direct - splices data directly between two files
+ * @in:		file to splice from
+ * @ppos:	input file offset
+ * @out:	file to splice to
+ * @opos:	output file offset
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    For use by do_sendfile(). splice can easily emulate sendfile, but
+ *    doing it in the application would incur an extra system call
+ *    (splice in + splice out, as compared to just sendfile()). So this helper
+ *    can splice directly through a process-private pipe.
+ *
+ */
+long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+		      loff_t *opos, size_t len, unsigned int flags)
+{
+	struct splice_desc sd = {
+		.len		= len,
+		.total_len	= len,
+		.flags		= flags,
+		.pos		= *ppos,
+		.u.file		= out,
+		.opos		= opos,
+	};
+	long ret;
+
+	if (unlikely(!(out->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	if (unlikely(out->f_flags & O_APPEND))
+		return -EINVAL;
+
+	ret = rw_verify_area(WRITE, out, opos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
+	if (ret > 0)
+		*ppos = sd.pos;
+
+	return ret;
+}
+EXPORT_SYMBOL(do_splice_direct);
+
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags);
+
+/*
+ * Determine where to splice to/from.
+ */
+static long do_splice(struct file *in, loff_t __user *off_in,
+		      struct file *out, loff_t __user *off_out,
+		      size_t len, unsigned int flags)
+{
+	struct pipe_inode_info *ipipe;
+	struct pipe_inode_info *opipe;
+	loff_t offset;
+	long ret;
+
+	ipipe = get_pipe_info(in);
+	opipe = get_pipe_info(out);
+
+	if (ipipe && opipe) {
+		if (off_in || off_out)
+			return -ESPIPE;
+
+		if (!(in->f_mode & FMODE_READ))
+			return -EBADF;
+
+		if (!(out->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		/* Splicing to self would be fun, but... */
+		if (ipipe == opipe)
+			return -EINVAL;
+
+		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
+	}
+
+	if (ipipe) {
+		if (off_in)
+			return -ESPIPE;
+		if (off_out) {
+			if (!(out->f_mode & FMODE_PWRITE))
+				return -EINVAL;
+			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
+				return -EFAULT;
+		} else {
+			offset = out->f_pos;
+		}
+
+		if (unlikely(!(out->f_mode & FMODE_WRITE)))
+			return -EBADF;
+
+		if (unlikely(out->f_flags & O_APPEND))
+			return -EINVAL;
+
+		ret = rw_verify_area(WRITE, out, &offset, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		file_start_write(out);
+		ret = do_splice_from(ipipe, out, &offset, len, flags);
+		file_end_write(out);
+
+		if (!off_out)
+			out->f_pos = offset;
+		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
+			ret = -EFAULT;
+
+		return ret;
+	}
+
+	if (opipe) {
+		if (off_out)
+			return -ESPIPE;
+		if (off_in) {
+			if (!(in->f_mode & FMODE_PREAD))
+				return -EINVAL;
+			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
+				return -EFAULT;
+		} else {
+			offset = in->f_pos;
+		}
+
+		ret = do_splice_to(in, &offset, opipe, len, flags);
+
+		if (!off_in)
+			in->f_pos = offset;
+		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
+			ret = -EFAULT;
+
+		return ret;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Map an iov into an array of pages and offset/length tupples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our ones pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+				unsigned int nr_vecs, struct page **pages,
+				struct partial_page *partial, bool aligned,
+				unsigned int pipe_buffers)
+{
+	int buffers = 0, error = 0;
+
+	while (nr_vecs) {
+		unsigned long off, npages;
+		struct iovec entry;
+		void __user *base;
+		size_t len;
+		int i;
+
+		error = -EFAULT;
+		if (copy_from_user(&entry, iov, sizeof(entry)))
+			break;
+
+		base = entry.iov_base;
+		len = entry.iov_len;
+
+		/*
+		 * Sanity check this iovec. 0 read succeeds.
+		 */
+		error = 0;
+		if (unlikely(!len))
+			break;
+		error = -EFAULT;
+		if (!access_ok(VERIFY_READ, base, len))
+			break;
+
+		/*
+		 * Get this base offset and number of pages, then map
+		 * in the user pages.
+		 */
+		off = (unsigned long) base & ~PAGE_MASK;
+
+		/*
+		 * If asked for alignment, the offset must be zero and the
+		 * length a multiple of the PAGE_SIZE.
+		 */
+		error = -EINVAL;
+		if (aligned && (off || len & ~PAGE_MASK))
+			break;
+
+		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (npages > pipe_buffers - buffers)
+			npages = pipe_buffers - buffers;
+
+		error = get_user_pages_fast((unsigned long)base, npages,
+					0, &pages[buffers]);
+
+		if (unlikely(error <= 0))
+			break;
+
+		/*
+		 * Fill this contiguous range into the partial page map.
+		 */
+		for (i = 0; i < error; i++) {
+			const int plen = min_t(size_t, len, PAGE_SIZE - off);
+
+			partial[buffers].offset = off;
+			partial[buffers].len = plen;
+
+			off = 0;
+			len -= plen;
+			buffers++;
+		}
+
+		/*
+		 * We didn't complete this iov, stop here since it probably
+		 * means we have to move some of this into a pipe to
+		 * be able to continue.
+		 */
+		if (len)
+			break;
+
+		/*
+		 * Don't continue if we mapped fewer pages than we asked for,
+		 * or if we mapped the max number of pages that we have
+		 * room for.
+		 */
+		if (error < npages || buffers == pipe_buffers)
+			break;
+
+		nr_vecs--;
+		iov++;
+	}
+
+	if (buffers)
+		return buffers;
+
+	return error;
+}
+
+static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			struct splice_desc *sd)
+{
+	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
+	return n == sd->len ? n : -EFAULT;
+}
+
+/*
+ * For lack of a better implementation, implement vmsplice() to userspace
+ * as a simple copy of the pipes pages to the user iov.
+ */
+static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
+			     unsigned long nr_segs, unsigned int flags)
+{
+	struct pipe_inode_info *pipe;
+	struct splice_desc sd;
+	long ret;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	struct iov_iter iter;
+
+	pipe = get_pipe_info(file);
+	if (!pipe)
+		return -EBADF;
+
+	ret = import_iovec(READ, uiov, nr_segs,
+			   ARRAY_SIZE(iovstack), &iov, &iter);
+	if (ret < 0)
+		return ret;
+
+	sd.total_len = iov_iter_count(&iter);
+	sd.len = 0;
+	sd.flags = flags;
+	sd.u.data = &iter;
+	sd.pos = 0;
+
+	if (sd.total_len) {
+		pipe_lock(pipe);
+		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
+		pipe_unlock(pipe);
+	}
+
+	kfree(iov);
+	return ret;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ */
+static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags)
+{
+	struct pipe_inode_info *pipe;
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.nr_pages_max = PIPE_DEF_BUFFERS,
+		.flags = flags,
+		.ops = &user_page_pipe_buf_ops,
+		.spd_release = spd_release_page,
+	};
+	long ret;
+
+	pipe = get_pipe_info(file);
+	if (!pipe)
+		return -EBADF;
+
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+					    spd.partial, false,
+					    spd.nr_pages_max);
+	if (spd.nr_pages <= 0)
+		ret = spd.nr_pages;
+	else
+		ret = splice_to_pipe(pipe, &spd);
+
+	splice_shrink_spd(&spd);
+	return ret;
+}
+
+/*
+ * Note that vmsplice only really supports true splicing _from_ user memory
+ * to a pipe, not the other way around. Splicing from user memory is a simple
+ * operation that can be supported without any funky alignment restrictions
+ * or nasty vm tricks. We simply map in the user memory and fill them into
+ * a pipe. The reverse isn't quite as easy, though. There are two possible
+ * solutions for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Currently we punt and implement it as a normal copy, see pipe_to_user().
+ *
+ */
+SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
+		unsigned long, nr_segs, unsigned int, flags)
+{
+	struct fd f;
+	long error;
+
+	if (unlikely(nr_segs > UIO_MAXIOV))
+		return -EINVAL;
+	else if (unlikely(!nr_segs))
+		return 0;
+
+	error = -EBADF;
+	f = fdget(fd);
+	if (f.file) {
+		if (f.file->f_mode & FMODE_WRITE)
+			error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
+		else if (f.file->f_mode & FMODE_READ)
+			error = vmsplice_to_user(f.file, iov, nr_segs, flags);
+
+		fdput(f);
+	}
+
+	return error;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
+		    unsigned int, nr_segs, unsigned int, flags)
+{
+	unsigned i;
+	struct iovec __user *iov;
+	if (nr_segs > UIO_MAXIOV)
+		return -EINVAL;
+	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
+	for (i = 0; i < nr_segs; i++) {
+		struct compat_iovec v;
+		if (get_user(v.iov_base, &iov32[i].iov_base) ||
+		    get_user(v.iov_len, &iov32[i].iov_len) ||
+		    put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
+		    put_user(v.iov_len, &iov[i].iov_len))
+			return -EFAULT;
+	}
+	return sys_vmsplice(fd, iov, nr_segs, flags);
+}
+#endif
+
+SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
+		int, fd_out, loff_t __user *, off_out,
+		size_t, len, unsigned int, flags)
+{
+	struct fd in, out;
+	long error;
+
+	if (unlikely(!len))
+		return 0;
+
+	error = -EBADF;
+	in = fdget(fd_in);
+	if (in.file) {
+		if (in.file->f_mode & FMODE_READ) {
+			out = fdget(fd_out);
+			if (out.file) {
+				if (out.file->f_mode & FMODE_WRITE)
+					error = do_splice(in.file, off_in,
+							  out.file, off_out,
+							  len, flags);
+				fdput(out);
+			}
+		}
+		fdput(in);
+	}
+	return error;
+}
+
+/*
+ * Make sure there's data to read. Wait for input if we can, otherwise
+ * return an appropriate error.
+ */
+static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+{
+	int ret;
+
+	/*
+	 * Check ->nrbufs without the inode lock first. This function
+	 * is speculative anyways, so missing one is ok.
+	 */
+	if (pipe->nrbufs)
+		return 0;
+
+	ret = 0;
+	pipe_lock(pipe);
+
+	while (!pipe->nrbufs) {
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		if (!pipe->writers)
+			break;
+		if (!pipe->waiting_writers) {
+			if (flags & SPLICE_F_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+		}
+		pipe_wait(pipe);
+	}
+
+	pipe_unlock(pipe);
+	return ret;
+}
+
+/*
+ * Make sure there's writeable room. Wait for room if we can, otherwise
+ * return an appropriate error.
+ */
+static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+{
+	int ret;
+
+	/*
+	 * Check ->nrbufs without the inode lock first. This function
+	 * is speculative anyways, so missing one is ok.
+	 */
+	if (pipe->nrbufs < pipe->buffers)
+		return 0;
+
+	ret = 0;
+	pipe_lock(pipe);
+
+	while (pipe->nrbufs >= pipe->buffers) {
+		if (!pipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			ret = -EPIPE;
+			break;
+		}
+		if (flags & SPLICE_F_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		pipe->waiting_writers++;
+		pipe_wait(pipe);
+		pipe->waiting_writers--;
+	}
+
+	pipe_unlock(pipe);
+	return ret;
+}
+
+/*
+ * Splice contents of ipipe to opipe.
+ */
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags)
+{
+	struct pipe_buffer *ibuf, *obuf;
+	int ret = 0, nbuf;
+	bool input_wakeup = false;
+
+
+retry:
+	ret = ipipe_prep(ipipe, flags);
+	if (ret)
+		return ret;
+
+	ret = opipe_prep(opipe, flags);
+	if (ret)
+		return ret;
+
+	/*
+	 * Potential ABBA deadlock, work around it by ordering lock
+	 * grabbing by pipe info address. Otherwise two different processes
+	 * could deadlock (one doing tee from A -> B, the other from B -> A).
+	 */
+	pipe_double_lock(ipipe, opipe);
+
+	do {
+		if (!opipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		if (!ipipe->nrbufs && !ipipe->writers)
+			break;
+
+		/*
+		 * Cannot make any progress, because either the input
+		 * pipe is empty or the output pipe is full.
+		 */
+		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
+			/* Already processed some buffers, break */
+			if (ret)
+				break;
+
+			if (flags & SPLICE_F_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			/*
+			 * We raced with another reader/writer and haven't
+			 * managed to process any buffers.  A zero return
+			 * value means EOF, so retry instead.
+			 */
+			pipe_unlock(ipipe);
+			pipe_unlock(opipe);
+			goto retry;
+		}
+
+		ibuf = ipipe->bufs + ipipe->curbuf;
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
+		obuf = opipe->bufs + nbuf;
+
+		if (len >= ibuf->len) {
+			/*
+			 * Simply move the whole buffer from ipipe to opipe
+			 */
+			*obuf = *ibuf;
+			ibuf->ops = NULL;
+			opipe->nrbufs++;
+			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
+			ipipe->nrbufs--;
+			input_wakeup = true;
+		} else {
+			/*
+			 * Get a reference to this pipe buffer,
+			 * so we can copy the contents over.
+			 */
+			ibuf->ops->get(ipipe, ibuf);
+			*obuf = *ibuf;
+
+			/*
+			 * Don't inherit the gift flag, we need to
+			 * prevent multiple steals of this page.
+			 */
+			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
+			obuf->len = len;
+			opipe->nrbufs++;
+			ibuf->offset += obuf->len;
+			ibuf->len -= obuf->len;
+		}
+		ret += obuf->len;
+		len -= obuf->len;
+	} while (len);
+
+	pipe_unlock(ipipe);
+	pipe_unlock(opipe);
+
+	/*
+	 * If we put data in the output pipe, wakeup any potential readers.
+	 */
+	if (ret > 0)
+		wakeup_pipe_readers(opipe);
+
+	if (input_wakeup)
+		wakeup_pipe_writers(ipipe);
+
+	return ret;
+}
+
+/*
+ * Link contents of ipipe to opipe.
+ */
+static int link_pipe(struct pipe_inode_info *ipipe,
+		     struct pipe_inode_info *opipe,
+		     size_t len, unsigned int flags)
+{
+	struct pipe_buffer *ibuf, *obuf;
+	int ret = 0, i = 0, nbuf;
+
+	/*
+	 * Potential ABBA deadlock, work around it by ordering lock
+	 * grabbing by pipe info address. Otherwise two different processes
+	 * could deadlock (one doing tee from A -> B, the other from B -> A).
+	 */
+	pipe_double_lock(ipipe, opipe);
+
+	do {
+		if (!opipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		/*
+		 * If we have iterated all input buffers or ran out of
+		 * output room, break.
+		 */
+		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
+			break;
+
+		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
+
+		/*
+		 * Get a reference to this pipe buffer,
+		 * so we can copy the contents over.
+		 */
+		ibuf->ops->get(ipipe, ibuf);
+
+		obuf = opipe->bufs + nbuf;
+		*obuf = *ibuf;
+
+		/*
+		 * Don't inherit the gift flag, we need to
+		 * prevent multiple steals of this page.
+		 */
+		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
+		if (obuf->len > len)
+			obuf->len = len;
+
+		opipe->nrbufs++;
+		ret += obuf->len;
+		len -= obuf->len;
+		i++;
+	} while (len);
+
+	/*
+	 * return EAGAIN if we have the potential of some data in the
+	 * future, otherwise just return 0
+	 */
+	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
+		ret = -EAGAIN;
+
+	pipe_unlock(ipipe);
+	pipe_unlock(opipe);
+
+	/*
+	 * If we put data in the output pipe, wakeup any potential readers.
+	 */
+	if (ret > 0)
+		wakeup_pipe_readers(opipe);
+
+	return ret;
+}
+
+/*
+ * This is a tee(1) implementation that works on pipes. It doesn't copy
+ * any data, it simply references the 'in' pages on the 'out' pipe.
+ * The 'flags' used are the SPLICE_F_* variants, currently the only
+ * applicable one is SPLICE_F_NONBLOCK.
+ */
+static long do_tee(struct file *in, struct file *out, size_t len,
+		   unsigned int flags)
+{
+	struct pipe_inode_info *ipipe = get_pipe_info(in);
+	struct pipe_inode_info *opipe = get_pipe_info(out);
+	int ret = -EINVAL;
+
+	/*
+	 * Duplicate the contents of ipipe to opipe without actually
+	 * copying the data.
+	 */
+	if (ipipe && opipe && ipipe != opipe) {
+		/*
+		 * Keep going, unless we encounter an error. The ipipe/opipe
+		 * ordering doesn't really matter.
+		 */
+		ret = ipipe_prep(ipipe, flags);
+		if (!ret) {
+			ret = opipe_prep(opipe, flags);
+			if (!ret)
+				ret = link_pipe(ipipe, opipe, len, flags);
+		}
+	}
+
+	return ret;
+}
+
+SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
+{
+	struct fd in;
+	int error;
+
+	if (unlikely(!len))
+		return 0;
+
+	error = -EBADF;
+	in = fdget(fdin);
+	if (in.file) {
+		if (in.file->f_mode & FMODE_READ) {
+			struct fd out = fdget(fdout);
+			if (out.file) {
+				if (out.file->f_mode & FMODE_WRITE)
+					error = do_tee(in.file, out.file,
+							len, flags);
+				fdput(out);
+			}
+		}
+ 		fdput(in);
+ 	}
+
+	return error;
+}
diff --git a/kernel/fs/squashfs/Kconfig b/kernel/fs/squashfs/Kconfig
new file mode 100644
index 000000000..ffb093e72
--- /dev/null
+++ b/kernel/fs/squashfs/Kconfig
@@ -0,0 +1,210 @@
+config SQUASHFS
+	tristate "SquashFS 4.0 - Squashed file system support"
+	depends on BLOCK
+	help
+	  Saying Y here includes support for SquashFS 4.0 (a Compressed
+	  Read-Only File System).  Squashfs is a highly compressed read-only
+	  filesystem for Linux.  It uses zlib, lzo or xz compression to
+	  compress both files, inodes and directories.  Inodes in the system
+	  are very small and all blocks are packed to minimise data overhead.
+	  Block sizes greater than 4K are supported up to a maximum of 1 Mbytes
+	  (default block size 128K).  SquashFS 4.0 supports 64 bit filesystems
+	  and files (larger than 4GB), full uid/gid information, hard links and
+	  timestamps.
+
+	  Squashfs is intended for general read-only filesystem use, for
+	  archival use (i.e. in cases where a .tar.gz file may be used), and in
+	  embedded systems where low overhead is needed.  Further information
+	  and tools are available from http://squashfs.sourceforge.net.
+
+	  If you want to compile this as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want),
+	  say M here.  The module will be called squashfs.  Note that the root
+	  file system (the one containing the directory /) cannot be compiled
+	  as a module.
+
+	  If unsure, say N.
+
+choice
+	prompt "File decompression options"
+	depends on SQUASHFS
+	help
+	  Squashfs now supports two options for decompressing file
+	  data.  Traditionally Squashfs has decompressed into an
+	  intermediate buffer and then memcopied it into the page cache.
+	  Squashfs now supports the ability to decompress directly into
+	  the page cache.
+
+	  If unsure, select "Decompress file data into an intermediate buffer"
+
+config SQUASHFS_FILE_CACHE
+	bool "Decompress file data into an intermediate buffer"
+	help
+	  Decompress file data into an intermediate buffer and then
+	  memcopy it into the page cache.
+
+config SQUASHFS_FILE_DIRECT
+	bool "Decompress files directly into the page cache"
+	help
+	  Directly decompress file data into the page cache.
+	  Doing so can significantly improve performance because
+	  it eliminates a memcpy and it also removes the lock contention
+	  on the single buffer.
+
+endchoice
+
+choice
+	prompt "Decompressor parallelisation options"
+	depends on SQUASHFS
+	help
+	  Squashfs now supports three parallelisation options for
+	  decompression.  Each one exhibits various trade-offs between
+	  decompression performance and CPU and memory usage.
+
+	  If in doubt, select "Single threaded compression"
+
+config SQUASHFS_DECOMP_SINGLE
+	bool "Single threaded compression"
+	help
+	  Traditionally Squashfs has used single-threaded decompression.
+	  Only one block (data or metadata) can be decompressed at any
+	  one time.  This limits CPU and memory usage to a minimum.
+
+config SQUASHFS_DECOMP_MULTI
+	bool "Use multiple decompressors for parallel I/O"
+	help
+	  By default Squashfs uses a single decompressor but it gives
+	  poor performance on parallel I/O workloads when using multiple CPU
+	  machines due to waiting on decompressor availability.
+
+	  If you have a parallel I/O workload and your system has enough memory,
+	  using this option may improve overall I/O performance.
+
+	  This decompressor implementation uses up to two parallel
+	  decompressors per core.  It dynamically allocates decompressors
+	  on a demand basis.
+
+config SQUASHFS_DECOMP_MULTI_PERCPU
+	bool "Use percpu multiple decompressors for parallel I/O"
+	help
+	  By default Squashfs uses a single decompressor but it gives
+	  poor performance on parallel I/O workloads when using multiple CPU
+	  machines due to waiting on decompressor availability.
+
+	  This decompressor implementation uses a maximum of one
+	  decompressor per core.  It uses percpu variables to ensure
+	  decompression is load-balanced across the cores.
+
+endchoice
+
+config SQUASHFS_XATTR
+	bool "Squashfs XATTR support"
+	depends on SQUASHFS
+	help
+	  Saying Y here includes support for extended attributes (xattrs).
+	  Xattrs are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page).
+
+	  If unsure, say N.
+
+config SQUASHFS_ZLIB
+	bool "Include support for ZLIB compressed file systems"
+	depends on SQUASHFS
+	select ZLIB_INFLATE
+	default y
+	help
+	  ZLIB compression is the standard compression used by Squashfs
+	  file systems.  It offers a good trade-off between compression
+	  achieved and the amount of CPU time and memory necessary to
+	  compress and decompress.
+
+	  If unsure, say Y.
+
+config SQUASHFS_LZ4
+	bool "Include support for LZ4 compressed file systems"
+	depends on SQUASHFS
+	select LZ4_DECOMPRESS
+	help
+	  Saying Y here includes support for reading Squashfs file systems
+	  compressed with LZ4 compression.  LZ4 compression is mainly
+	  aimed at embedded systems with slower CPUs where the overheads
+	  of zlib are too high.
+
+	  LZ4 is not the standard compression used in Squashfs and so most
+	  file systems will be readable without selecting this option.
+
+	  If unsure, say N.
+
+config SQUASHFS_LZO
+	bool "Include support for LZO compressed file systems"
+	depends on SQUASHFS
+	select LZO_DECOMPRESS
+	help
+	  Saying Y here includes support for reading Squashfs file systems
+	  compressed with LZO compression.  LZO compression is mainly
+	  aimed at embedded systems with slower CPUs where the overheads
+	  of zlib are too high.
+
+	  LZO is not the standard compression used in Squashfs and so most
+	  file systems will be readable without selecting this option.
+
+	  If unsure, say N.
+
+config SQUASHFS_XZ
+	bool "Include support for XZ compressed file systems"
+	depends on SQUASHFS
+	select XZ_DEC
+	help
+	  Saying Y here includes support for reading Squashfs file systems
+	  compressed with XZ compression.  XZ gives better compression than
+	  the default zlib compression, at the expense of greater CPU and
+	  memory overhead.
+
+	  XZ is not the standard compression used in Squashfs and so most
+	  file systems will be readable without selecting this option.
+
+	  If unsure, say N.
+
+config SQUASHFS_4K_DEVBLK_SIZE
+	bool "Use 4K device block size?"
+	depends on SQUASHFS
+	help
+	  By default Squashfs sets the dev block size (sb_min_blocksize)
+	  to 1K or the smallest block size supported by the block device
+	  (if larger).  This, because blocks are packed together and
+	  unaligned in Squashfs, should reduce latency.
+
+	  This, however, gives poor performance on MTD NAND devices where
+	  the optimal I/O size is 4K (even though the devices can support
+	  smaller block sizes).
+
+	  Using a 4K device block size may also improve overall I/O
+	  performance for some file access patterns (e.g. sequential
+	  accesses of files in filesystem order) on all media.
+
+	  Setting this option will force Squashfs to use a 4K device block
+	  size by default.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+	bool "Additional option for memory-constrained systems"
+	depends on SQUASHFS
+	help
+	  Saying Y here allows you to specify cache size.
+
+	  If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+	int "Number of fragments cached" if SQUASHFS_EMBEDDED
+	depends on SQUASHFS
+	default "3"
+	help
+	  By default SquashFS caches the last 3 fragments read from
+	  the filesystem.  Increasing this amount may mean SquashFS
+	  has to re-read fragments less often from disk, at the expense
+	  of extra system memory.  Decreasing this amount will mean
+	  SquashFS uses less memory at the expense of extra reads from disk.
+
+	  Note there must be at least one cached fragment.  Anything
+	  much more than three will probably not make much difference.
diff --git a/kernel/fs/squashfs/Makefile b/kernel/fs/squashfs/Makefile
new file mode 100644
index 000000000..246a6f329
--- /dev/null
+++ b/kernel/fs/squashfs/Makefile
@@ -0,0 +1,17 @@
+#
+# Makefile for the linux squashfs routines.
+#
+
+obj-$(CONFIG_SQUASHFS) += squashfs.o
+squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
+squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
+squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
+squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o
+squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
+squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
+squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o
diff --git a/kernel/fs/squashfs/block.c b/kernel/fs/squashfs/block.c
new file mode 100644
index 000000000..0cea9b923
--- /dev/null
+++ b/kernel/fs/squashfs/block.c
@@ -0,0 +1,214 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * block.c
+ */
+
+/*
+ * This file implements the low-level routines to read and decompress
+ * datablocks and metadata blocks.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "decompressor.h"
+#include "page_actor.h"
+
+/*
+ * Read the metadata block length, this is stored in the first two
+ * bytes of the metadata block.
+ */
+static struct buffer_head *get_block_length(struct super_block *sb,
+			u64 *cur_index, int *offset, int *length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	struct buffer_head *bh;
+
+	bh = sb_bread(sb, *cur_index);
+	if (bh == NULL)
+		return NULL;
+
+	if (msblk->devblksize - *offset == 1) {
+		*length = (unsigned char) bh->b_data[*offset];
+		put_bh(bh);
+		bh = sb_bread(sb, ++(*cur_index));
+		if (bh == NULL)
+			return NULL;
+		*length |= (unsigned char) bh->b_data[0] << 8;
+		*offset = 1;
+	} else {
+		*length = (unsigned char) bh->b_data[*offset] |
+			(unsigned char) bh->b_data[*offset + 1] << 8;
+		*offset += 2;
+
+		if (*offset == msblk->devblksize) {
+			put_bh(bh);
+			bh = sb_bread(sb, ++(*cur_index));
+			if (bh == NULL)
+				return NULL;
+			*offset = 0;
+		}
+	}
+
+	return bh;
+}
+
+
+/*
+ * Read and decompress a metadata block or datablock.  Length is non-zero
+ * if a datablock is being read (the size is stored elsewhere in the
+ * filesystem), otherwise the length is obtained from the first two bytes of
+ * the metadata block.  A bit in the length field indicates if the block
+ * is stored uncompressed in the filesystem (usually because compression
+ * generated a larger block - this does occasionally happen with compression
+ * algorithms).
+ */
+int squashfs_read_data(struct super_block *sb, u64 index, int length,
+		u64 *next_index, struct squashfs_page_actor *output)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	struct buffer_head **bh;
+	int offset = index & ((1 << msblk->devblksize_log2) - 1);
+	u64 cur_index = index >> msblk->devblksize_log2;
+	int bytes, compressed, b = 0, k = 0, avail, i;
+
+	bh = kcalloc(((output->length + msblk->devblksize - 1)
+		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
+	if (bh == NULL)
+		return -ENOMEM;
+
+	if (length) {
+		/*
+		 * Datablock.
+		 */
+		bytes = -offset;
+		compressed = SQUASHFS_COMPRESSED_BLOCK(length);
+		length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
+		if (next_index)
+			*next_index = index + length;
+
+		TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
+			index, compressed ? "" : "un", length, output->length);
+
+		if (length < 0 || length > output->length ||
+				(index + length) > msblk->bytes_used)
+			goto read_failure;
+
+		for (b = 0; bytes < length; b++, cur_index++) {
+			bh[b] = sb_getblk(sb, cur_index);
+			if (bh[b] == NULL)
+				goto block_release;
+			bytes += msblk->devblksize;
+		}
+		ll_rw_block(READ, b, bh);
+	} else {
+		/*
+		 * Metadata block.
+		 */
+		if ((index + 2) > msblk->bytes_used)
+			goto read_failure;
+
+		bh[0] = get_block_length(sb, &cur_index, &offset, &length);
+		if (bh[0] == NULL)
+			goto read_failure;
+		b = 1;
+
+		bytes = msblk->devblksize - offset;
+		compressed = SQUASHFS_COMPRESSED(length);
+		length = SQUASHFS_COMPRESSED_SIZE(length);
+		if (next_index)
+			*next_index = index + length + 2;
+
+		TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
+				compressed ? "" : "un", length);
+
+		if (length < 0 || length > output->length ||
+					(index + length) > msblk->bytes_used)
+			goto block_release;
+
+		for (; bytes < length; b++) {
+			bh[b] = sb_getblk(sb, ++cur_index);
+			if (bh[b] == NULL)
+				goto block_release;
+			bytes += msblk->devblksize;
+		}
+		ll_rw_block(READ, b - 1, bh + 1);
+	}
+
+	for (i = 0; i < b; i++) {
+		wait_on_buffer(bh[i]);
+		if (!buffer_uptodate(bh[i]))
+			goto block_release;
+	}
+
+	if (compressed) {
+		length = squashfs_decompress(msblk, bh, b, offset, length,
+			output);
+		if (length < 0)
+			goto read_failure;
+	} else {
+		/*
+		 * Block is uncompressed.
+		 */
+		int in, pg_offset = 0;
+		void *data = squashfs_first_page(output);
+
+		for (bytes = length; k < b; k++) {
+			in = min(bytes, msblk->devblksize - offset);
+			bytes -= in;
+			while (in) {
+				if (pg_offset == PAGE_CACHE_SIZE) {
+					data = squashfs_next_page(output);
+					pg_offset = 0;
+				}
+				avail = min_t(int, in, PAGE_CACHE_SIZE -
+						pg_offset);
+				memcpy(data + pg_offset, bh[k]->b_data + offset,
+						avail);
+				in -= avail;
+				pg_offset += avail;
+				offset += avail;
+			}
+			offset = 0;
+			put_bh(bh[k]);
+		}
+		squashfs_finish_page(output);
+	}
+
+	kfree(bh);
+	return length;
+
+block_release:
+	for (; k < b; k++)
+		put_bh(bh[k]);
+
+read_failure:
+	ERROR("squashfs_read_data failed to read block 0x%llx\n",
+					(unsigned long long) index);
+	kfree(bh);
+	return -EIO;
+}
diff --git a/kernel/fs/squashfs/cache.c b/kernel/fs/squashfs/cache.c
new file mode 100644
index 000000000..1cb70a0b2
--- /dev/null
+++ b/kernel/fs/squashfs/cache.c
@@ -0,0 +1,458 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * cache.c
+ */
+
+/*
+ * Blocks in Squashfs are compressed.  To avoid repeatedly decompressing
+ * recently accessed data Squashfs uses two small metadata and fragment caches.
+ *
+ * This file implements a generic cache implementation used for both caches,
+ * plus functions layered ontop of the generic cache implementation to
+ * access the metadata and fragment caches.
+ *
+ * To avoid out of memory and fragmentation issues with vmalloc the cache
+ * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
+ *
+ * It should be noted that the cache is not used for file datablocks, these
+ * are decompressed and cached in the page-cache in the normal way.  The
+ * cache is only used to temporarily cache fragment and metadata blocks
+ * which have been read as as a result of a metadata (i.e. inode or
+ * directory) or fragment access.  Because metadata and fragments are packed
+ * together into blocks (to gain greater compression) the read of a particular
+ * piece of metadata or fragment will retrieve other metadata/fragments which
+ * have been packed with it, these because of locality-of-reference may be read
+ * in the near future. Temporarily caching them ensures they are available for
+ * near future access without requiring an additional read and decompress.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/pagemap.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "page_actor.h"
+
+/*
+ * Look-up block in cache, and increment usage count.  If not in cache, read
+ * and decompress it from disk.
+ */
+struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
+	struct squashfs_cache *cache, u64 block, int length)
+{
+	int i, n;
+	struct squashfs_cache_entry *entry;
+
+	spin_lock(&cache->lock);
+
+	while (1) {
+		for (i = cache->curr_blk, n = 0; n < cache->entries; n++) {
+			if (cache->entry[i].block == block) {
+				cache->curr_blk = i;
+				break;
+			}
+			i = (i + 1) % cache->entries;
+		}
+
+		if (n == cache->entries) {
+			/*
+			 * Block not in cache, if all cache entries are used
+			 * go to sleep waiting for one to become available.
+			 */
+			if (cache->unused == 0) {
+				cache->num_waiters++;
+				spin_unlock(&cache->lock);
+				wait_event(cache->wait_queue, cache->unused);
+				spin_lock(&cache->lock);
+				cache->num_waiters--;
+				continue;
+			}
+
+			/*
+			 * At least one unused cache entry.  A simple
+			 * round-robin strategy is used to choose the entry to
+			 * be evicted from the cache.
+			 */
+			i = cache->next_blk;
+			for (n = 0; n < cache->entries; n++) {
+				if (cache->entry[i].refcount == 0)
+					break;
+				i = (i + 1) % cache->entries;
+			}
+
+			cache->next_blk = (i + 1) % cache->entries;
+			entry = &cache->entry[i];
+
+			/*
+			 * Initialise chosen cache entry, and fill it in from
+			 * disk.
+			 */
+			cache->unused--;
+			entry->block = block;
+			entry->refcount = 1;
+			entry->pending = 1;
+			entry->num_waiters = 0;
+			entry->error = 0;
+			spin_unlock(&cache->lock);
+
+			entry->length = squashfs_read_data(sb, block, length,
+				&entry->next_index, entry->actor);
+
+			spin_lock(&cache->lock);
+
+			if (entry->length < 0)
+				entry->error = entry->length;
+
+			entry->pending = 0;
+
+			/*
+			 * While filling this entry one or more other processes
+			 * have looked it up in the cache, and have slept
+			 * waiting for it to become available.
+			 */
+			if (entry->num_waiters) {
+				spin_unlock(&cache->lock);
+				wake_up_all(&entry->wait_queue);
+			} else
+				spin_unlock(&cache->lock);
+
+			goto out;
+		}
+
+		/*
+		 * Block already in cache.  Increment refcount so it doesn't
+		 * get reused until we're finished with it, if it was
+		 * previously unused there's one less cache entry available
+		 * for reuse.
+		 */
+		entry = &cache->entry[i];
+		if (entry->refcount == 0)
+			cache->unused--;
+		entry->refcount++;
+
+		/*
+		 * If the entry is currently being filled in by another process
+		 * go to sleep waiting for it to become available.
+		 */
+		if (entry->pending) {
+			entry->num_waiters++;
+			spin_unlock(&cache->lock);
+			wait_event(entry->wait_queue, !entry->pending);
+		} else
+			spin_unlock(&cache->lock);
+
+		goto out;
+	}
+
+out:
+	TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
+		cache->name, i, entry->block, entry->refcount, entry->error);
+
+	if (entry->error)
+		ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
+							block);
+	return entry;
+}
+
+
+/*
+ * Release cache entry, once usage count is zero it can be reused.
+ */
+void squashfs_cache_put(struct squashfs_cache_entry *entry)
+{
+	struct squashfs_cache *cache = entry->cache;
+
+	spin_lock(&cache->lock);
+	entry->refcount--;
+	if (entry->refcount == 0) {
+		cache->unused++;
+		/*
+		 * If there's any processes waiting for a block to become
+		 * available, wake one up.
+		 */
+		if (cache->num_waiters) {
+			spin_unlock(&cache->lock);
+			wake_up(&cache->wait_queue);
+			return;
+		}
+	}
+	spin_unlock(&cache->lock);
+}
+
+/*
+ * Delete cache reclaiming all kmalloced buffers.
+ */
+void squashfs_cache_delete(struct squashfs_cache *cache)
+{
+	int i, j;
+
+	if (cache == NULL)
+		return;
+
+	for (i = 0; i < cache->entries; i++) {
+		if (cache->entry[i].data) {
+			for (j = 0; j < cache->pages; j++)
+				kfree(cache->entry[i].data[j]);
+			kfree(cache->entry[i].data);
+		}
+		kfree(cache->entry[i].actor);
+	}
+
+	kfree(cache->entry);
+	kfree(cache);
+}
+
+
+/*
+ * Initialise cache allocating the specified number of entries, each of
+ * size block_size.  To avoid vmalloc fragmentation issues each entry
+ * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
+ */
+struct squashfs_cache *squashfs_cache_init(char *name, int entries,
+	int block_size)
+{
+	int i, j;
+	struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+
+	if (cache == NULL) {
+		ERROR("Failed to allocate %s cache\n", name);
+		return NULL;
+	}
+
+	cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
+	if (cache->entry == NULL) {
+		ERROR("Failed to allocate %s cache\n", name);
+		goto cleanup;
+	}
+
+	cache->curr_blk = 0;
+	cache->next_blk = 0;
+	cache->unused = entries;
+	cache->entries = entries;
+	cache->block_size = block_size;
+	cache->pages = block_size >> PAGE_CACHE_SHIFT;
+	cache->pages = cache->pages ? cache->pages : 1;
+	cache->name = name;
+	cache->num_waiters = 0;
+	spin_lock_init(&cache->lock);
+	init_waitqueue_head(&cache->wait_queue);
+
+	for (i = 0; i < entries; i++) {
+		struct squashfs_cache_entry *entry = &cache->entry[i];
+
+		init_waitqueue_head(&cache->entry[i].wait_queue);
+		entry->cache = cache;
+		entry->block = SQUASHFS_INVALID_BLK;
+		entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
+		if (entry->data == NULL) {
+			ERROR("Failed to allocate %s cache entry\n", name);
+			goto cleanup;
+		}
+
+		for (j = 0; j < cache->pages; j++) {
+			entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+			if (entry->data[j] == NULL) {
+				ERROR("Failed to allocate %s buffer\n", name);
+				goto cleanup;
+			}
+		}
+
+		entry->actor = squashfs_page_actor_init(entry->data,
+						cache->pages, 0);
+		if (entry->actor == NULL) {
+			ERROR("Failed to allocate %s cache entry\n", name);
+			goto cleanup;
+		}
+	}
+
+	return cache;
+
+cleanup:
+	squashfs_cache_delete(cache);
+	return NULL;
+}
+
+
+/*
+ * Copy up to length bytes from cache entry to buffer starting at offset bytes
+ * into the cache entry.  If there's not length bytes then copy the number of
+ * bytes available.  In all cases return the number of bytes copied.
+ */
+int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
+		int offset, int length)
+{
+	int remaining = length;
+
+	if (length == 0)
+		return 0;
+	else if (buffer == NULL)
+		return min(length, entry->length - offset);
+
+	while (offset < entry->length) {
+		void *buff = entry->data[offset / PAGE_CACHE_SIZE]
+				+ (offset % PAGE_CACHE_SIZE);
+		int bytes = min_t(int, entry->length - offset,
+				PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
+
+		if (bytes >= remaining) {
+			memcpy(buffer, buff, remaining);
+			remaining = 0;
+			break;
+		}
+
+		memcpy(buffer, buff, bytes);
+		buffer += bytes;
+		remaining -= bytes;
+		offset += bytes;
+	}
+
+	return length - remaining;
+}
+
+
+/*
+ * Read length bytes from metadata position <block, offset> (block is the
+ * start of the compressed block on disk, and offset is the offset into
+ * the block once decompressed).  Data is packed into consecutive blocks,
+ * and length bytes may require reading more than one block.
+ */
+int squashfs_read_metadata(struct super_block *sb, void *buffer,
+		u64 *block, int *offset, int length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int bytes, res = length;
+	struct squashfs_cache_entry *entry;
+
+	TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
+
+	while (length) {
+		entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
+		if (entry->error) {
+			res = entry->error;
+			goto error;
+		} else if (*offset >= entry->length) {
+			res = -EIO;
+			goto error;
+		}
+
+		bytes = squashfs_copy_data(buffer, entry, *offset, length);
+		if (buffer)
+			buffer += bytes;
+		length -= bytes;
+		*offset += bytes;
+
+		if (*offset == entry->length) {
+			*block = entry->next_index;
+			*offset = 0;
+		}
+
+		squashfs_cache_put(entry);
+	}
+
+	return res;
+
+error:
+	squashfs_cache_put(entry);
+	return res;
+}
+
+
+/*
+ * Look-up in the fragmment cache the fragment located at <start_block> in the
+ * filesystem.  If necessary read and decompress it from disk.
+ */
+struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb,
+				u64 start_block, int length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+
+	return squashfs_cache_get(sb, msblk->fragment_cache, start_block,
+		length);
+}
+
+
+/*
+ * Read and decompress the datablock located at <start_block> in the
+ * filesystem.  The cache is used here to avoid duplicating locking and
+ * read/decompress code.
+ */
+struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
+				u64 start_block, int length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+
+	return squashfs_cache_get(sb, msblk->read_page, start_block, length);
+}
+
+
+/*
+ * Read a filesystem table (uncompressed sequence of bytes) from disk
+ */
+void *squashfs_read_table(struct super_block *sb, u64 block, int length)
+{
+	int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int i, res;
+	void *table, *buffer, **data;
+	struct squashfs_page_actor *actor;
+
+	table = buffer = kmalloc(length, GFP_KERNEL);
+	if (table == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+	if (data == NULL) {
+		res = -ENOMEM;
+		goto failed;
+	}
+
+	actor = squashfs_page_actor_init(data, pages, length);
+	if (actor == NULL) {
+		res = -ENOMEM;
+		goto failed2;
+	}
+
+	for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
+		data[i] = buffer;
+
+	res = squashfs_read_data(sb, block, length |
+		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor);
+
+	kfree(data);
+	kfree(actor);
+
+	if (res < 0)
+		goto failed;
+
+	return table;
+
+failed2:
+	kfree(data);
+failed:
+	kfree(table);
+	return ERR_PTR(res);
+}
diff --git a/kernel/fs/squashfs/decompressor.c b/kernel/fs/squashfs/decompressor.c
new file mode 100644
index 000000000..e9034bf6e
--- /dev/null
+++ b/kernel/fs/squashfs/decompressor.c
@@ -0,0 +1,148 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.c
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+#include "page_actor.h"
+
+/*
+ * This file (and decompressor.h) implements a decompressor framework for
+ * Squashfs, allowing multiple decompressors to be easily supported
+ */
+
+static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
+	NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+};
+
+#ifndef CONFIG_SQUASHFS_LZ4
+static const struct squashfs_decompressor squashfs_lz4_comp_ops = {
+	NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0
+};
+#endif
+
+#ifndef CONFIG_SQUASHFS_LZO
+static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
+	NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+};
+#endif
+
+#ifndef CONFIG_SQUASHFS_XZ
+static const struct squashfs_decompressor squashfs_xz_comp_ops = {
+	NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
+};
+#endif
+
+#ifndef CONFIG_SQUASHFS_ZLIB
+static const struct squashfs_decompressor squashfs_zlib_comp_ops = {
+	NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0
+};
+#endif
+
+static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
+	NULL, NULL, NULL, NULL, 0, "unknown", 0
+};
+
+static const struct squashfs_decompressor *decompressor[] = {
+	&squashfs_zlib_comp_ops,
+	&squashfs_lz4_comp_ops,
+	&squashfs_lzo_comp_ops,
+	&squashfs_xz_comp_ops,
+	&squashfs_lzma_unsupported_comp_ops,
+	&squashfs_unknown_comp_ops
+};
+
+
+const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
+{
+	int i;
+
+	for (i = 0; decompressor[i]->id; i++)
+		if (id == decompressor[i]->id)
+			break;
+
+	return decompressor[i];
+}
+
+
+static void *get_comp_opts(struct super_block *sb, unsigned short flags)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	void *buffer = NULL, *comp_opts;
+	struct squashfs_page_actor *actor = NULL;
+	int length = 0;
+
+	/*
+	 * Read decompressor specific options from file system if present
+	 */
+	if (SQUASHFS_COMP_OPTS(flags)) {
+		buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+		if (buffer == NULL) {
+			comp_opts = ERR_PTR(-ENOMEM);
+			goto out;
+		}
+
+		actor = squashfs_page_actor_init(&buffer, 1, 0);
+		if (actor == NULL) {
+			comp_opts = ERR_PTR(-ENOMEM);
+			goto out;
+		}
+
+		length = squashfs_read_data(sb,
+			sizeof(struct squashfs_super_block), 0, NULL, actor);
+
+		if (length < 0) {
+			comp_opts = ERR_PTR(length);
+			goto out;
+		}
+	}
+
+	comp_opts = squashfs_comp_opts(msblk, buffer, length);
+
+out:
+	kfree(actor);
+	kfree(buffer);
+	return comp_opts;
+}
+
+
+void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	void *stream, *comp_opts = get_comp_opts(sb, flags);
+
+	if (IS_ERR(comp_opts))
+		return comp_opts;
+
+	stream = squashfs_decompressor_create(msblk, comp_opts);
+	if (IS_ERR(stream))
+		kfree(comp_opts);
+
+	return stream;
+}
diff --git a/kernel/fs/squashfs/decompressor.h b/kernel/fs/squashfs/decompressor.h
new file mode 100644
index 000000000..a25713c03
--- /dev/null
+++ b/kernel/fs/squashfs/decompressor.h
@@ -0,0 +1,61 @@
+#ifndef DECOMPRESSOR_H
+#define DECOMPRESSOR_H
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.h
+ */
+
+struct squashfs_decompressor {
+	void	*(*init)(struct squashfs_sb_info *, void *);
+	void	*(*comp_opts)(struct squashfs_sb_info *, void *, int);
+	void	(*free)(void *);
+	int	(*decompress)(struct squashfs_sb_info *, void *,
+		struct buffer_head **, int, int, int,
+		struct squashfs_page_actor *);
+	int	id;
+	char	*name;
+	int	supported;
+};
+
+static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
+							void *buff, int length)
+{
+	return msblk->decompressor->comp_opts ?
+		msblk->decompressor->comp_opts(msblk, buff, length) : NULL;
+}
+
+#ifdef CONFIG_SQUASHFS_XZ
+extern const struct squashfs_decompressor squashfs_xz_comp_ops;
+#endif
+
+#ifdef CONFIG_SQUASHFS_LZ4
+extern const struct squashfs_decompressor squashfs_lz4_comp_ops;
+#endif
+
+#ifdef CONFIG_SQUASHFS_LZO
+extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
+#endif
+
+#ifdef CONFIG_SQUASHFS_ZLIB
+extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
+#endif
+
+#endif
diff --git a/kernel/fs/squashfs/decompressor_multi.c b/kernel/fs/squashfs/decompressor_multi.c
new file mode 100644
index 000000000..d6008a636
--- /dev/null
+++ b/kernel/fs/squashfs/decompressor_multi.c
@@ -0,0 +1,198 @@
+/*
+ *  Copyright (c) 2013
+ *  Minchan Kim <minchan@kernel.org>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/cpumask.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression in the
+ * decompressor framework
+ */
+
+
+/*
+ * The reason that multiply two is that a CPU can request new I/O
+ * while it is waiting previous request.
+ */
+#define MAX_DECOMPRESSOR	(num_online_cpus() * 2)
+
+
+int squashfs_max_decompressors(void)
+{
+	return MAX_DECOMPRESSOR;
+}
+
+
+struct squashfs_stream {
+	void			*comp_opts;
+	struct list_head	strm_list;
+	struct mutex		mutex;
+	int			avail_decomp;
+	wait_queue_head_t	wait;
+};
+
+
+struct decomp_stream {
+	void *stream;
+	struct list_head list;
+};
+
+
+static void put_decomp_stream(struct decomp_stream *decomp_strm,
+				struct squashfs_stream *stream)
+{
+	mutex_lock(&stream->mutex);
+	list_add(&decomp_strm->list, &stream->strm_list);
+	mutex_unlock(&stream->mutex);
+	wake_up(&stream->wait);
+}
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+				void *comp_opts)
+{
+	struct squashfs_stream *stream;
+	struct decomp_stream *decomp_strm = NULL;
+	int err = -ENOMEM;
+
+	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+	if (!stream)
+		goto out;
+
+	stream->comp_opts = comp_opts;
+	mutex_init(&stream->mutex);
+	INIT_LIST_HEAD(&stream->strm_list);
+	init_waitqueue_head(&stream->wait);
+
+	/*
+	 * We should have a decompressor at least as default
+	 * so if we fail to allocate new decompressor dynamically,
+	 * we could always fall back to default decompressor and
+	 * file system works.
+	 */
+	decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+	if (!decomp_strm)
+		goto out;
+
+	decomp_strm->stream = msblk->decompressor->init(msblk,
+						stream->comp_opts);
+	if (IS_ERR(decomp_strm->stream)) {
+		err = PTR_ERR(decomp_strm->stream);
+		goto out;
+	}
+
+	list_add(&decomp_strm->list, &stream->strm_list);
+	stream->avail_decomp = 1;
+	return stream;
+
+out:
+	kfree(decomp_strm);
+	kfree(stream);
+	return ERR_PTR(err);
+}
+
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+	struct squashfs_stream *stream = msblk->stream;
+	if (stream) {
+		struct decomp_stream *decomp_strm;
+
+		while (!list_empty(&stream->strm_list)) {
+			decomp_strm = list_entry(stream->strm_list.prev,
+						struct decomp_stream, list);
+			list_del(&decomp_strm->list);
+			msblk->decompressor->free(decomp_strm->stream);
+			kfree(decomp_strm);
+			stream->avail_decomp--;
+		}
+		WARN_ON(stream->avail_decomp);
+		kfree(stream->comp_opts);
+		kfree(stream);
+	}
+}
+
+
+static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
+					struct squashfs_stream *stream)
+{
+	struct decomp_stream *decomp_strm;
+
+	while (1) {
+		mutex_lock(&stream->mutex);
+
+		/* There is available decomp_stream */
+		if (!list_empty(&stream->strm_list)) {
+			decomp_strm = list_entry(stream->strm_list.prev,
+				struct decomp_stream, list);
+			list_del(&decomp_strm->list);
+			mutex_unlock(&stream->mutex);
+			break;
+		}
+
+		/*
+		 * If there is no available decomp and already full,
+		 * let's wait for releasing decomp from other users.
+		 */
+		if (stream->avail_decomp >= MAX_DECOMPRESSOR)
+			goto wait;
+
+		/* Let's allocate new decomp */
+		decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+		if (!decomp_strm)
+			goto wait;
+
+		decomp_strm->stream = msblk->decompressor->init(msblk,
+						stream->comp_opts);
+		if (IS_ERR(decomp_strm->stream)) {
+			kfree(decomp_strm);
+			goto wait;
+		}
+
+		stream->avail_decomp++;
+		WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
+
+		mutex_unlock(&stream->mutex);
+		break;
+wait:
+		/*
+		 * If system memory is tough, let's for other's
+		 * releasing instead of hurting VM because it could
+		 * make page cache thrashing.
+		 */
+		mutex_unlock(&stream->mutex);
+		wait_event(stream->wait,
+			!list_empty(&stream->strm_list));
+	}
+
+	return decomp_strm;
+}
+
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+	int b, int offset, int length, struct squashfs_page_actor *output)
+{
+	int res;
+	struct squashfs_stream *stream = msblk->stream;
+	struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
+	res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
+		bh, b, offset, length, output);
+	put_decomp_stream(decomp_stream, stream);
+	if (res < 0)
+		ERROR("%s decompression failed, data probably corrupt\n",
+			msblk->decompressor->name);
+	return res;
+}
diff --git a/kernel/fs/squashfs/decompressor_multi_percpu.c b/kernel/fs/squashfs/decompressor_multi_percpu.c
new file mode 100644
index 000000000..23a9c28ad
--- /dev/null
+++ b/kernel/fs/squashfs/decompressor_multi_percpu.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression using percpu
+ * variables, one thread per cpu core.
+ */
+
+struct squashfs_stream {
+	void		*stream;
+};
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+						void *comp_opts)
+{
+	struct squashfs_stream *stream;
+	struct squashfs_stream __percpu *percpu;
+	int err, cpu;
+
+	percpu = alloc_percpu(struct squashfs_stream);
+	if (percpu == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	for_each_possible_cpu(cpu) {
+		stream = per_cpu_ptr(percpu, cpu);
+		stream->stream = msblk->decompressor->init(msblk, comp_opts);
+		if (IS_ERR(stream->stream)) {
+			err = PTR_ERR(stream->stream);
+			goto out;
+		}
+	}
+
+	kfree(comp_opts);
+	return (__force void *) percpu;
+
+out:
+	for_each_possible_cpu(cpu) {
+		stream = per_cpu_ptr(percpu, cpu);
+		if (!IS_ERR_OR_NULL(stream->stream))
+			msblk->decompressor->free(stream->stream);
+	}
+	free_percpu(percpu);
+	return ERR_PTR(err);
+}
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+	struct squashfs_stream __percpu *percpu =
+			(struct squashfs_stream __percpu *) msblk->stream;
+	struct squashfs_stream *stream;
+	int cpu;
+
+	if (msblk->stream) {
+		for_each_possible_cpu(cpu) {
+			stream = per_cpu_ptr(percpu, cpu);
+			msblk->decompressor->free(stream->stream);
+		}
+		free_percpu(percpu);
+	}
+}
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+	int b, int offset, int length, struct squashfs_page_actor *output)
+{
+	struct squashfs_stream __percpu *percpu =
+			(struct squashfs_stream __percpu *) msblk->stream;
+	struct squashfs_stream *stream = get_cpu_ptr(percpu);
+	int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+		offset, length, output);
+	put_cpu_ptr(stream);
+
+	if (res < 0)
+		ERROR("%s decompression failed, data probably corrupt\n",
+			msblk->decompressor->name);
+
+	return res;
+}
+
+int squashfs_max_decompressors(void)
+{
+	return num_possible_cpus();
+}
diff --git a/kernel/fs/squashfs/decompressor_single.c b/kernel/fs/squashfs/decompressor_single.c
new file mode 100644
index 000000000..a6c75929a
--- /dev/null
+++ b/kernel/fs/squashfs/decompressor_single.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements single-threaded decompression in the
+ * decompressor framework
+ */
+
+struct squashfs_stream {
+	void		*stream;
+	struct mutex	mutex;
+};
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+						void *comp_opts)
+{
+	struct squashfs_stream *stream;
+	int err = -ENOMEM;
+
+	stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto out;
+
+	stream->stream = msblk->decompressor->init(msblk, comp_opts);
+	if (IS_ERR(stream->stream)) {
+		err = PTR_ERR(stream->stream);
+		goto out;
+	}
+
+	kfree(comp_opts);
+	mutex_init(&stream->mutex);
+	return stream;
+
+out:
+	kfree(stream);
+	return ERR_PTR(err);
+}
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+	struct squashfs_stream *stream = msblk->stream;
+
+	if (stream) {
+		msblk->decompressor->free(stream->stream);
+		kfree(stream);
+	}
+}
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+	int b, int offset, int length, struct squashfs_page_actor *output)
+{
+	int res;
+	struct squashfs_stream *stream = msblk->stream;
+
+	mutex_lock(&stream->mutex);
+	res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+		offset, length, output);
+	mutex_unlock(&stream->mutex);
+
+	if (res < 0)
+		ERROR("%s decompression failed, data probably corrupt\n",
+			msblk->decompressor->name);
+
+	return res;
+}
+
+int squashfs_max_decompressors(void)
+{
+	return 1;
+}
diff --git a/kernel/fs/squashfs/dir.c b/kernel/fs/squashfs/dir.c
new file mode 100644
index 000000000..d8c2d747b
--- /dev/null
+++ b/kernel/fs/squashfs/dir.c
@@ -0,0 +1,236 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * dir.c
+ */
+
+/*
+ * This file implements code to read directories from disk.
+ *
+ * See namei.c for a description of directory organisation on disk.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static const unsigned char squashfs_filetype_table[] = {
+	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK
+};
+
+/*
+ * Lookup offset (f_pos) in the directory index, returning the
+ * metadata block containing it.
+ *
+ * If we get an error reading the index then return the part of the index
+ * (if any) we have managed to read - the index isn't essential, just
+ * quicker.
+ */
+static int get_dir_index_using_offset(struct super_block *sb,
+	u64 *next_block, int *next_offset, u64 index_start, int index_offset,
+	int i_count, u64 f_pos)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int err, i, index, length = 0;
+	unsigned int size;
+	struct squashfs_dir_index dir_index;
+
+	TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
+					i_count, f_pos);
+
+	/*
+	 * Translate from external f_pos to the internal f_pos.  This
+	 * is offset by 3 because we invent "." and ".." entries which are
+	 * not actually stored in the directory.
+	 */
+	if (f_pos <= 3)
+		return f_pos;
+	f_pos -= 3;
+
+	for (i = 0; i < i_count; i++) {
+		err = squashfs_read_metadata(sb, &dir_index, &index_start,
+				&index_offset, sizeof(dir_index));
+		if (err < 0)
+			break;
+
+		index = le32_to_cpu(dir_index.index);
+		if (index > f_pos)
+			/*
+			 * Found the index we're looking for.
+			 */
+			break;
+
+		size = le32_to_cpu(dir_index.size) + 1;
+
+		/* size should never be larger than SQUASHFS_NAME_LEN */
+		if (size > SQUASHFS_NAME_LEN)
+			break;
+
+		err = squashfs_read_metadata(sb, NULL, &index_start,
+				&index_offset, size);
+		if (err < 0)
+			break;
+
+		length = index;
+		*next_block = le32_to_cpu(dir_index.start_block) +
+					msblk->directory_table;
+	}
+
+	*next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
+
+	/*
+	 * Translate back from internal f_pos to external f_pos.
+	 */
+	return length + 3;
+}
+
+
+static int squashfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	u64 block = squashfs_i(inode)->start + msblk->directory_table;
+	int offset = squashfs_i(inode)->offset, length, err;
+	unsigned int inode_number, dir_count, size, type;
+	struct squashfs_dir_header dirh;
+	struct squashfs_dir_entry *dire;
+
+	TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);
+
+	dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
+	if (dire == NULL) {
+		ERROR("Failed to allocate squashfs_dir_entry\n");
+		goto finish;
+	}
+
+	/*
+	 * Return "." and  ".." entries as the first two filenames in the
+	 * directory.  To maximise compression these two entries are not
+	 * stored in the directory, and so we invent them here.
+	 *
+	 * It also means that the external f_pos is offset by 3 from the
+	 * on-disk directory f_pos.
+	 */
+	while (ctx->pos < 3) {
+		char *name;
+		int i_ino;
+
+		if (ctx->pos == 0) {
+			name = ".";
+			size = 1;
+			i_ino = inode->i_ino;
+		} else {
+			name = "..";
+			size = 2;
+			i_ino = squashfs_i(inode)->parent;
+		}
+
+		if (!dir_emit(ctx, name, size, i_ino,
+				squashfs_filetype_table[1]))
+			goto finish;
+
+		ctx->pos += size;
+	}
+
+	length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
+				squashfs_i(inode)->dir_idx_start,
+				squashfs_i(inode)->dir_idx_offset,
+				squashfs_i(inode)->dir_idx_cnt,
+				ctx->pos);
+
+	while (length < i_size_read(inode)) {
+		/*
+		 * Read directory header
+		 */
+		err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
+					&offset, sizeof(dirh));
+		if (err < 0)
+			goto failed_read;
+
+		length += sizeof(dirh);
+
+		dir_count = le32_to_cpu(dirh.count) + 1;
+
+		if (dir_count > SQUASHFS_DIR_COUNT)
+			goto failed_read;
+
+		while (dir_count--) {
+			/*
+			 * Read directory entry.
+			 */
+			err = squashfs_read_metadata(inode->i_sb, dire, &block,
+					&offset, sizeof(*dire));
+			if (err < 0)
+				goto failed_read;
+
+			size = le16_to_cpu(dire->size) + 1;
+
+			/* size should never be larger than SQUASHFS_NAME_LEN */
+			if (size > SQUASHFS_NAME_LEN)
+				goto failed_read;
+
+			err = squashfs_read_metadata(inode->i_sb, dire->name,
+					&block, &offset, size);
+			if (err < 0)
+				goto failed_read;
+
+			length += sizeof(*dire) + size;
+
+			if (ctx->pos >= length)
+				continue;
+
+			dire->name[size] = '\0';
+			inode_number = le32_to_cpu(dirh.inode_number) +
+				((short) le16_to_cpu(dire->inode_number));
+			type = le16_to_cpu(dire->type);
+
+			if (type > SQUASHFS_MAX_DIR_TYPE)
+				goto failed_read;
+
+			if (!dir_emit(ctx, dire->name, size,
+					inode_number,
+					squashfs_filetype_table[type]))
+				goto finish;
+
+			ctx->pos = length;
+		}
+	}
+
+finish:
+	kfree(dire);
+	return 0;
+
+failed_read:
+	ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
+	kfree(dire);
+	return 0;
+}
+
+
+const struct file_operations squashfs_dir_ops = {
+	.read = generic_read_dir,
+	.iterate = squashfs_readdir,
+	.llseek = default_llseek,
+};
diff --git a/kernel/fs/squashfs/export.c b/kernel/fs/squashfs/export.c
new file mode 100644
index 000000000..8073b6532
--- /dev/null
+++ b/kernel/fs/squashfs/export.c
@@ -0,0 +1,163 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * export.c
+ */
+
+/*
+ * This file implements code to make Squashfs filesystems exportable (NFS etc.)
+ *
+ * The export code uses an inode lookup table to map inode numbers passed in
+ * filehandles to an inode location on disk.  This table is stored compressed
+ * into metadata blocks.  A second index table is used to locate these.  This
+ * second index table for speed of access (and because it is small) is read at
+ * mount time and cached in memory.
+ *
+ * The inode lookup table is used only by the export code, inode disk
+ * locations are directly encoded in directories, enabling direct access
+ * without an intermediate lookup for all operations except the export ops.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/dcache.h>
+#include <linux/exportfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look-up inode number (ino) in table, returning the inode location.
+ */
+static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
+	int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
+	u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+	__le64 ino;
+	int err;
+
+	TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
+
+	err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
+	if (err < 0)
+		return err;
+
+	TRACE("squashfs_inode_lookup, inode = 0x%llx\n",
+		(u64) le64_to_cpu(ino));
+
+	return le64_to_cpu(ino);
+}
+
+
+static struct dentry *squashfs_export_iget(struct super_block *sb,
+	unsigned int ino_num)
+{
+	long long ino;
+	struct dentry *dentry = ERR_PTR(-ENOENT);
+
+	TRACE("Entered squashfs_export_iget\n");
+
+	ino = squashfs_inode_lookup(sb, ino_num);
+	if (ino >= 0)
+		dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num));
+
+	return dentry;
+}
+
+
+static struct dentry *squashfs_fh_to_dentry(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT)
+			|| fh_len < 2)
+		return NULL;
+
+	return squashfs_export_iget(sb, fid->i32.ino);
+}
+
+
+static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4)
+		return NULL;
+
+	return squashfs_export_iget(sb, fid->i32.parent_ino);
+}
+
+
+static struct dentry *squashfs_get_parent(struct dentry *child)
+{
+	struct inode *inode = d_inode(child);
+	unsigned int parent_ino = squashfs_i(inode)->parent;
+
+	return squashfs_export_iget(inode->i_sb, parent_ino);
+}
+
+
+/*
+ * Read uncompressed inode lookup table indexes off disk into memory
+ */
+__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
+		u64 lookup_table_start, u64 next_table, unsigned int inodes)
+{
+	unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
+	__le64 *table;
+
+	TRACE("In read_inode_lookup_table, length %d\n", length);
+
+	/* Sanity check values */
+
+	/* there should always be at least one inode */
+	if (inodes == 0)
+		return ERR_PTR(-EINVAL);
+
+	/* length bytes should not extend into the next table - this check
+	 * also traps instances where lookup_table_start is incorrectly larger
+	 * than the next table start
+	 */
+	if (lookup_table_start + length > next_table)
+		return ERR_PTR(-EINVAL);
+
+	table = squashfs_read_table(sb, lookup_table_start, length);
+
+	/*
+	 * table[0] points to the first inode lookup table metadata block,
+	 * this should be less than lookup_table_start
+	 */
+	if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
+		kfree(table);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return table;
+}
+
+
+const struct export_operations squashfs_export_ops = {
+	.fh_to_dentry = squashfs_fh_to_dentry,
+	.fh_to_parent = squashfs_fh_to_parent,
+	.get_parent = squashfs_get_parent
+};
diff --git a/kernel/fs/squashfs/file.c b/kernel/fs/squashfs/file.c
new file mode 100644
index 000000000..e5c968906
--- /dev/null
+++ b/kernel/fs/squashfs/file.c
@@ -0,0 +1,503 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * file.c
+ */
+
+/*
+ * This file contains code for handling regular files.  A regular file
+ * consists of a sequence of contiguous compressed blocks, and/or a
+ * compressed fragment block (tail-end packed block).   The compressed size
+ * of each datablock is stored in a block list contained within the
+ * file inode (itself stored in one or more compressed metadata blocks).
+ *
+ * To speed up access to datablocks when reading 'large' files (256 Mbytes or
+ * larger), the code implements an index cache that caches the mapping from
+ * block index to datablock location on disk.
+ *
+ * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
+ * retaining a simple and space-efficient block list on disk.  The cache
+ * is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
+ * Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
+ * The index cache is designed to be memory efficient, and by default uses
+ * 16 KiB.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Locate cache slot in range [offset, index] for specified inode.  If
+ * there's more than one return the slot closest to index.
+ */
+static struct meta_index *locate_meta_index(struct inode *inode, int offset,
+				int index)
+{
+	struct meta_index *meta = NULL;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	int i;
+
+	mutex_lock(&msblk->meta_index_mutex);
+
+	TRACE("locate_meta_index: index %d, offset %d\n", index, offset);
+
+	if (msblk->meta_index == NULL)
+		goto not_allocated;
+
+	for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
+		if (msblk->meta_index[i].inode_number == inode->i_ino &&
+				msblk->meta_index[i].offset >= offset &&
+				msblk->meta_index[i].offset <= index &&
+				msblk->meta_index[i].locked == 0) {
+			TRACE("locate_meta_index: entry %d, offset %d\n", i,
+					msblk->meta_index[i].offset);
+			meta = &msblk->meta_index[i];
+			offset = meta->offset;
+		}
+	}
+
+	if (meta)
+		meta->locked = 1;
+
+not_allocated:
+	mutex_unlock(&msblk->meta_index_mutex);
+
+	return meta;
+}
+
+
+/*
+ * Find and initialise an empty cache slot for index offset.
+ */
+static struct meta_index *empty_meta_index(struct inode *inode, int offset,
+				int skip)
+{
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	struct meta_index *meta = NULL;
+	int i;
+
+	mutex_lock(&msblk->meta_index_mutex);
+
+	TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip);
+
+	if (msblk->meta_index == NULL) {
+		/*
+		 * First time cache index has been used, allocate and
+		 * initialise.  The cache index could be allocated at
+		 * mount time but doing it here means it is allocated only
+		 * if a 'large' file is read.
+		 */
+		msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS,
+			sizeof(*(msblk->meta_index)), GFP_KERNEL);
+		if (msblk->meta_index == NULL) {
+			ERROR("Failed to allocate meta_index\n");
+			goto failed;
+		}
+		for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
+			msblk->meta_index[i].inode_number = 0;
+			msblk->meta_index[i].locked = 0;
+		}
+		msblk->next_meta_index = 0;
+	}
+
+	for (i = SQUASHFS_META_SLOTS; i &&
+			msblk->meta_index[msblk->next_meta_index].locked; i--)
+		msblk->next_meta_index = (msblk->next_meta_index + 1) %
+			SQUASHFS_META_SLOTS;
+
+	if (i == 0) {
+		TRACE("empty_meta_index: failed!\n");
+		goto failed;
+	}
+
+	TRACE("empty_meta_index: returned meta entry %d, %p\n",
+			msblk->next_meta_index,
+			&msblk->meta_index[msblk->next_meta_index]);
+
+	meta = &msblk->meta_index[msblk->next_meta_index];
+	msblk->next_meta_index = (msblk->next_meta_index + 1) %
+			SQUASHFS_META_SLOTS;
+
+	meta->inode_number = inode->i_ino;
+	meta->offset = offset;
+	meta->skip = skip;
+	meta->entries = 0;
+	meta->locked = 1;
+
+failed:
+	mutex_unlock(&msblk->meta_index_mutex);
+	return meta;
+}
+
+
+static void release_meta_index(struct inode *inode, struct meta_index *meta)
+{
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	mutex_lock(&msblk->meta_index_mutex);
+	meta->locked = 0;
+	mutex_unlock(&msblk->meta_index_mutex);
+}
+
+
+/*
+ * Read the next n blocks from the block list, starting from
+ * metadata block <start_block, offset>.
+ */
+static long long read_indexes(struct super_block *sb, int n,
+				u64 *start_block, int *offset)
+{
+	int err, i;
+	long long block = 0;
+	__le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+
+	if (blist == NULL) {
+		ERROR("read_indexes: Failed to allocate block_list\n");
+		return -ENOMEM;
+	}
+
+	while (n) {
+		int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
+
+		err = squashfs_read_metadata(sb, blist, start_block,
+				offset, blocks << 2);
+		if (err < 0) {
+			ERROR("read_indexes: reading block [%llx:%x]\n",
+				*start_block, *offset);
+			goto failure;
+		}
+
+		for (i = 0; i < blocks; i++) {
+			int size = le32_to_cpu(blist[i]);
+			block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size);
+		}
+		n -= blocks;
+	}
+
+	kfree(blist);
+	return block;
+
+failure:
+	kfree(blist);
+	return err;
+}
+
+
+/*
+ * Each cache index slot has SQUASHFS_META_ENTRIES, each of which
+ * can cache one index -> datablock/blocklist-block mapping.  We wish
+ * to distribute these over the length of the file, entry[0] maps index x,
+ * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on.
+ * The larger the file, the greater the skip factor.  The skip factor is
+ * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure
+ * the number of metadata blocks that need to be read fits into the cache.
+ * If the skip factor is limited in this way then the file will use multiple
+ * slots.
+ */
+static inline int calculate_skip(int blocks)
+{
+	int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
+		 * SQUASHFS_META_INDEXES);
+	return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
+}
+
+
+/*
+ * Search and grow the index cache for the specified inode, returning the
+ * on-disk locations of the datablock and block list metadata block
+ * <index_block, index_offset> for index (scaled to nearest cache index).
+ */
+static int fill_meta_index(struct inode *inode, int index,
+		u64 *index_block, int *index_offset, u64 *data_block)
+{
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	int skip = calculate_skip(i_size_read(inode) >> msblk->block_log);
+	int offset = 0;
+	struct meta_index *meta;
+	struct meta_entry *meta_entry;
+	u64 cur_index_block = squashfs_i(inode)->block_list_start;
+	int cur_offset = squashfs_i(inode)->offset;
+	u64 cur_data_block = squashfs_i(inode)->start;
+	int err, i;
+
+	/*
+	 * Scale index to cache index (cache slot entry)
+	 */
+	index /= SQUASHFS_META_INDEXES * skip;
+
+	while (offset < index) {
+		meta = locate_meta_index(inode, offset + 1, index);
+
+		if (meta == NULL) {
+			meta = empty_meta_index(inode, offset + 1, skip);
+			if (meta == NULL)
+				goto all_done;
+		} else {
+			offset = index < meta->offset + meta->entries ? index :
+				meta->offset + meta->entries - 1;
+			meta_entry = &meta->meta_entry[offset - meta->offset];
+			cur_index_block = meta_entry->index_block +
+				msblk->inode_table;
+			cur_offset = meta_entry->offset;
+			cur_data_block = meta_entry->data_block;
+			TRACE("get_meta_index: offset %d, meta->offset %d, "
+				"meta->entries %d\n", offset, meta->offset,
+				meta->entries);
+			TRACE("get_meta_index: index_block 0x%llx, offset 0x%x"
+				" data_block 0x%llx\n", cur_index_block,
+				cur_offset, cur_data_block);
+		}
+
+		/*
+		 * If necessary grow cache slot by reading block list.  Cache
+		 * slot is extended up to index or to the end of the slot, in
+		 * which case further slots will be used.
+		 */
+		for (i = meta->offset + meta->entries; i <= index &&
+				i < meta->offset + SQUASHFS_META_ENTRIES; i++) {
+			int blocks = skip * SQUASHFS_META_INDEXES;
+			long long res = read_indexes(inode->i_sb, blocks,
+					&cur_index_block, &cur_offset);
+
+			if (res < 0) {
+				if (meta->entries == 0)
+					/*
+					 * Don't leave an empty slot on read
+					 * error allocated to this inode...
+					 */
+					meta->inode_number = 0;
+				err = res;
+				goto failed;
+			}
+
+			cur_data_block += res;
+			meta_entry = &meta->meta_entry[i - meta->offset];
+			meta_entry->index_block = cur_index_block -
+				msblk->inode_table;
+			meta_entry->offset = cur_offset;
+			meta_entry->data_block = cur_data_block;
+			meta->entries++;
+			offset++;
+		}
+
+		TRACE("get_meta_index: meta->offset %d, meta->entries %d\n",
+				meta->offset, meta->entries);
+
+		release_meta_index(inode, meta);
+	}
+
+all_done:
+	*index_block = cur_index_block;
+	*index_offset = cur_offset;
+	*data_block = cur_data_block;
+
+	/*
+	 * Scale cache index (cache slot entry) to index
+	 */
+	return offset * SQUASHFS_META_INDEXES * skip;
+
+failed:
+	release_meta_index(inode, meta);
+	return err;
+}
+
+
+/*
+ * Get the on-disk location and compressed size of the datablock
+ * specified by index.  Fill_meta_index() does most of the work.
+ */
+static int read_blocklist(struct inode *inode, int index, u64 *block)
+{
+	u64 start;
+	long long blks;
+	int offset;
+	__le32 size;
+	int res = fill_meta_index(inode, index, &start, &offset, block);
+
+	TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
+		       " 0x%x, block 0x%llx\n", res, index, start, offset,
+			*block);
+
+	if (res < 0)
+		return res;
+
+	/*
+	 * res contains the index of the mapping returned by fill_meta_index(),
+	 * this will likely be less than the desired index (because the
+	 * meta_index cache works at a higher granularity).  Read any
+	 * extra block indexes needed.
+	 */
+	if (res < index) {
+		blks = read_indexes(inode->i_sb, index - res, &start, &offset);
+		if (blks < 0)
+			return (int) blks;
+		*block += blks;
+	}
+
+	/*
+	 * Read length of block specified by index.
+	 */
+	res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
+			sizeof(size));
+	if (res < 0)
+		return res;
+	return le32_to_cpu(size);
+}
+
+/* Copy data into page cache  */
+void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
+	int bytes, int offset)
+{
+	struct inode *inode = page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	void *pageaddr;
+	int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+	int start_index = page->index & ~mask, end_index = start_index | mask;
+
+	/*
+	 * Loop copying datablock into pages.  As the datablock likely covers
+	 * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
+	 * grab the pages from the page cache, except for the page that we've
+	 * been called to fill.
+	 */
+	for (i = start_index; i <= end_index && bytes > 0; i++,
+			bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+		struct page *push_page;
+		int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0;
+
+		TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
+
+		push_page = (i == page->index) ? page :
+			grab_cache_page_nowait(page->mapping, i);
+
+		if (!push_page)
+			continue;
+
+		if (PageUptodate(push_page))
+			goto skip_page;
+
+		pageaddr = kmap_atomic(push_page);
+		squashfs_copy_data(pageaddr, buffer, offset, avail);
+		memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+		kunmap_atomic(pageaddr);
+		flush_dcache_page(push_page);
+		SetPageUptodate(push_page);
+skip_page:
+		unlock_page(push_page);
+		if (i != page->index)
+			page_cache_release(push_page);
+	}
+}
+
+/* Read datablock stored packed inside a fragment (tail-end packed block) */
+static int squashfs_readpage_fragment(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
+		squashfs_i(inode)->fragment_block,
+		squashfs_i(inode)->fragment_size);
+	int res = buffer->error;
+
+	if (res)
+		ERROR("Unable to read page, block %llx, size %x\n",
+			squashfs_i(inode)->fragment_block,
+			squashfs_i(inode)->fragment_size);
+	else
+		squashfs_copy_cache(page, buffer, i_size_read(inode) &
+			(msblk->block_size - 1),
+			squashfs_i(inode)->fragment_offset);
+
+	squashfs_cache_put(buffer);
+	return res;
+}
+
+static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
+{
+	struct inode *inode = page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	int bytes = index == file_end ?
+			(i_size_read(inode) & (msblk->block_size - 1)) :
+			 msblk->block_size;
+
+	squashfs_copy_cache(page, NULL, bytes, 0);
+	return 0;
+}
+
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+	int file_end = i_size_read(inode) >> msblk->block_log;
+	int res;
+	void *pageaddr;
+
+	TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+				page->index, squashfs_i(inode)->start);
+
+	if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT))
+		goto out;
+
+	if (index < file_end || squashfs_i(inode)->fragment_block ==
+					SQUASHFS_INVALID_BLK) {
+		u64 block = 0;
+		int bsize = read_blocklist(inode, index, &block);
+		if (bsize < 0)
+			goto error_out;
+
+		if (bsize == 0)
+			res = squashfs_readpage_sparse(page, index, file_end);
+		else
+			res = squashfs_readpage_block(page, block, bsize);
+	} else
+		res = squashfs_readpage_fragment(page);
+
+	if (!res)
+		return 0;
+
+error_out:
+	SetPageError(page);
+out:
+	pageaddr = kmap_atomic(page);
+	memset(pageaddr, 0, PAGE_CACHE_SIZE);
+	kunmap_atomic(pageaddr);
+	flush_dcache_page(page);
+	if (!PageError(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+
+	return 0;
+}
+
+
+const struct address_space_operations squashfs_aops = {
+	.readpage = squashfs_readpage
+};
diff --git a/kernel/fs/squashfs/file_cache.c b/kernel/fs/squashfs/file_cache.c
new file mode 100644
index 000000000..f2310d2a2
--- /dev/null
+++ b/kernel/fs/squashfs/file_cache.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/* Read separately compressed datablock and memcopy into page cache */
+int squashfs_readpage_block(struct page *page, u64 block, int bsize)
+{
+	struct inode *i = page->mapping->host;
+	struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
+		block, bsize);
+	int res = buffer->error;
+
+	if (res)
+		ERROR("Unable to read page, block %llx, size %x\n", block,
+			bsize);
+	else
+		squashfs_copy_cache(page, buffer, buffer->length, 0);
+
+	squashfs_cache_put(buffer);
+	return res;
+}
diff --git a/kernel/fs/squashfs/file_direct.c b/kernel/fs/squashfs/file_direct.c
new file mode 100644
index 000000000..43e7a7edd
--- /dev/null
+++ b/kernel/fs/squashfs/file_direct.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "page_actor.h"
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+	int pages, struct page **page);
+
+/* Read separately compressed datablock directly into page cache */
+int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
+
+{
+	struct inode *inode = target_page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+
+	int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+	int start_index = target_page->index & ~mask;
+	int end_index = start_index | mask;
+	int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+	struct page **page;
+	struct squashfs_page_actor *actor;
+	void *pageaddr;
+
+	if (end_index > file_end)
+		end_index = file_end;
+
+	pages = end_index - start_index + 1;
+
+	page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);
+	if (page == NULL)
+		return res;
+
+	/*
+	 * Create a "page actor" which will kmap and kunmap the
+	 * page cache pages appropriately within the decompressor
+	 */
+	actor = squashfs_page_actor_init_special(page, pages, 0);
+	if (actor == NULL)
+		goto out;
+
+	/* Try to grab all the pages covered by the Squashfs block */
+	for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+		page[i] = (n == target_page->index) ? target_page :
+			grab_cache_page_nowait(target_page->mapping, n);
+
+		if (page[i] == NULL) {
+			missing_pages++;
+			continue;
+		}
+
+		if (PageUptodate(page[i])) {
+			unlock_page(page[i]);
+			page_cache_release(page[i]);
+			page[i] = NULL;
+			missing_pages++;
+		}
+	}
+
+	if (missing_pages) {
+		/*
+		 * Couldn't get one or more pages, this page has either
+		 * been VM reclaimed, but others are still in the page cache
+		 * and uptodate, or we're racing with another thread in
+		 * squashfs_readpage also trying to grab them.  Fall back to
+		 * using an intermediate buffer.
+		 */
+		res = squashfs_read_cache(target_page, block, bsize, pages,
+								page);
+		if (res < 0)
+			goto mark_errored;
+
+		goto out;
+	}
+
+	/* Decompress directly into the page cache buffers */
+	res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+	if (res < 0)
+		goto mark_errored;
+
+	/* Last page may have trailing bytes not filled */
+	bytes = res % PAGE_CACHE_SIZE;
+	if (bytes) {
+		pageaddr = kmap_atomic(page[pages - 1]);
+		memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+		kunmap_atomic(pageaddr);
+	}
+
+	/* Mark pages as uptodate, unlock and release */
+	for (i = 0; i < pages; i++) {
+		flush_dcache_page(page[i]);
+		SetPageUptodate(page[i]);
+		unlock_page(page[i]);
+		if (page[i] != target_page)
+			page_cache_release(page[i]);
+	}
+
+	kfree(actor);
+	kfree(page);
+
+	return 0;
+
+mark_errored:
+	/* Decompression failed, mark pages as errored.  Target_page is
+	 * dealt with by the caller
+	 */
+	for (i = 0; i < pages; i++) {
+		if (page[i] == NULL || page[i] == target_page)
+			continue;
+		flush_dcache_page(page[i]);
+		SetPageError(page[i]);
+		unlock_page(page[i]);
+		page_cache_release(page[i]);
+	}
+
+out:
+	kfree(actor);
+	kfree(page);
+	return res;
+}
+
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+	int pages, struct page **page)
+{
+	struct inode *i = target_page->mapping->host;
+	struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
+						 block, bsize);
+	int bytes = buffer->length, res = buffer->error, n, offset = 0;
+	void *pageaddr;
+
+	if (res) {
+		ERROR("Unable to read page, block %llx, size %x\n", block,
+			bsize);
+		goto out;
+	}
+
+	for (n = 0; n < pages && bytes > 0; n++,
+			bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+		int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+
+		if (page[n] == NULL)
+			continue;
+
+		pageaddr = kmap_atomic(page[n]);
+		squashfs_copy_data(pageaddr, buffer, offset, avail);
+		memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+		kunmap_atomic(pageaddr);
+		flush_dcache_page(page[n]);
+		SetPageUptodate(page[n]);
+		unlock_page(page[n]);
+		if (page[n] != target_page)
+			page_cache_release(page[n]);
+	}
+
+out:
+	squashfs_cache_put(buffer);
+	return res;
+}
diff --git a/kernel/fs/squashfs/fragment.c b/kernel/fs/squashfs/fragment.c
new file mode 100644
index 000000000..0ed6edbc5
--- /dev/null
+++ b/kernel/fs/squashfs/fragment.c
@@ -0,0 +1,99 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * fragment.c
+ */
+
+/*
+ * This file implements code to handle compressed fragments (tail-end packed
+ * datablocks).
+ *
+ * Regular files contain a fragment index which is mapped to a fragment
+ * location on disk and compressed size using a fragment lookup table.
+ * Like everything in Squashfs this fragment lookup table is itself stored
+ * compressed into metadata blocks.  A second index table is used to locate
+ * these.  This second index table for speed of access (and because it
+ * is small) is read at mount time and cached in memory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+
+/*
+ * Look-up fragment using the fragment index table.  Return the on disk
+ * location of the fragment and its compressed size
+ */
+int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
+				u64 *fragment_block)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int block = SQUASHFS_FRAGMENT_INDEX(fragment);
+	int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment);
+	u64 start_block = le64_to_cpu(msblk->fragment_index[block]);
+	struct squashfs_fragment_entry fragment_entry;
+	int size;
+
+	size = squashfs_read_metadata(sb, &fragment_entry, &start_block,
+					&offset, sizeof(fragment_entry));
+	if (size < 0)
+		return size;
+
+	*fragment_block = le64_to_cpu(fragment_entry.start_block);
+	size = le32_to_cpu(fragment_entry.size);
+
+	return size;
+}
+
+
+/*
+ * Read the uncompressed fragment lookup table indexes off disk into memory
+ */
+__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
+	u64 fragment_table_start, u64 next_table, unsigned int fragments)
+{
+	unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
+	__le64 *table;
+
+	/*
+	 * Sanity check, length bytes should not extend into the next table -
+	 * this check also traps instances where fragment_table_start is
+	 * incorrectly larger than the next table start
+	 */
+	if (fragment_table_start + length > next_table)
+		return ERR_PTR(-EINVAL);
+
+	table = squashfs_read_table(sb, fragment_table_start, length);
+
+	/*
+	 * table[0] points to the first fragment table metadata block, this
+	 * should be less than fragment_table_start
+	 */
+	if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) {
+		kfree(table);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return table;
+}
diff --git a/kernel/fs/squashfs/id.c b/kernel/fs/squashfs/id.c
new file mode 100644
index 000000000..d38ea3dab
--- /dev/null
+++ b/kernel/fs/squashfs/id.c
@@ -0,0 +1,102 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * id.c
+ */
+
+/*
+ * This file implements code to handle uids and gids.
+ *
+ * For space efficiency regular files store uid and gid indexes, which are
+ * converted to 32-bit uids/gids using an id look up table.  This table is
+ * stored compressed into metadata blocks.  A second index table is used to
+ * locate these.  This second index table for speed of access (and because it
+ * is small) is read at mount time and cached in memory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+
+/*
+ * Map uid/gid index into real 32-bit uid/gid using the id look up table
+ */
+int squashfs_get_id(struct super_block *sb, unsigned int index,
+					unsigned int *id)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int block = SQUASHFS_ID_BLOCK(index);
+	int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
+	u64 start_block = le64_to_cpu(msblk->id_table[block]);
+	__le32 disk_id;
+	int err;
+
+	err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
+							sizeof(disk_id));
+	if (err < 0)
+		return err;
+
+	*id = le32_to_cpu(disk_id);
+	return 0;
+}
+
+
+/*
+ * Read uncompressed id lookup table indexes from disk into memory
+ */
+__le64 *squashfs_read_id_index_table(struct super_block *sb,
+		u64 id_table_start, u64 next_table, unsigned short no_ids)
+{
+	unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
+	__le64 *table;
+
+	TRACE("In read_id_index_table, length %d\n", length);
+
+	/* Sanity check values */
+
+	/* there should always be at least one id */
+	if (no_ids == 0)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * length bytes should not extend into the next table - this check
+	 * also traps instances where id_table_start is incorrectly larger
+	 * than the next table start
+	 */
+	if (id_table_start + length > next_table)
+		return ERR_PTR(-EINVAL);
+
+	table = squashfs_read_table(sb, id_table_start, length);
+
+	/*
+	 * table[0] points to the first id lookup table metadata block, this
+	 * should be less than id_table_start
+	 */
+	if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
+		kfree(table);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return table;
+}
diff --git a/kernel/fs/squashfs/inode.c b/kernel/fs/squashfs/inode.c
new file mode 100644
index 000000000..a1ce5ce60
--- /dev/null
+++ b/kernel/fs/squashfs/inode.c
@@ -0,0 +1,429 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * inode.c
+ */
+
+/*
+ * This file implements code to create and read inodes from disk.
+ *
+ * Inodes in Squashfs are identified by a 48-bit inode which encodes the
+ * location of the compressed metadata block containing the inode, and the byte
+ * offset into that block where the inode is placed (<block, offset>).
+ *
+ * To maximise compression there are different inodes for each file type
+ * (regular file, directory, device, etc.), the inode contents and length
+ * varying with the type.
+ *
+ * To further maximise compression, two types of regular file inode and
+ * directory inode are defined: inodes optimised for frequently occurring
+ * regular files and directories, and extended types where extra
+ * information has to be stored.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/xattr.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "xattr.h"
+
+/*
+ * Initialise VFS inode with the base inode information common to all
+ * Squashfs inode types.  Sqsh_ino contains the unswapped base inode
+ * off disk.
+ */
+static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
+				struct squashfs_base_inode *sqsh_ino)
+{
+	uid_t i_uid;
+	gid_t i_gid;
+	int err;
+
+	err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid);
+	if (err)
+		return err;
+
+	err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &i_gid);
+	if (err)
+		return err;
+
+	i_uid_write(inode, i_uid);
+	i_gid_write(inode, i_gid);
+	inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+	inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
+	inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
+	inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
+	inode->i_mode = le16_to_cpu(sqsh_ino->mode);
+	inode->i_size = 0;
+
+	return err;
+}
+
+
+struct inode *squashfs_iget(struct super_block *sb, long long ino,
+				unsigned int ino_number)
+{
+	struct inode *inode = iget_locked(sb, ino_number);
+	int err;
+
+	TRACE("Entered squashfs_iget\n");
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = squashfs_read_inode(inode, ino);
+	if (err) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+
+	unlock_new_inode(inode);
+	return inode;
+}
+
+
+/*
+ * Initialise VFS inode by reading inode from inode table (compressed
+ * metadata).  The format and amount of data read depends on type.
+ */
+int squashfs_read_inode(struct inode *inode, long long ino)
+{
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
+	int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
+	union squashfs_inode squashfs_ino;
+	struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+	int xattr_id = SQUASHFS_INVALID_XATTR;
+
+	TRACE("Entered squashfs_read_inode\n");
+
+	/*
+	 * Read inode base common to all inode types.
+	 */
+	err = squashfs_read_metadata(sb, sqshb_ino, &block,
+				&offset, sizeof(*sqshb_ino));
+	if (err < 0)
+		goto failed_read;
+
+	err = squashfs_new_inode(sb, inode, sqshb_ino);
+	if (err)
+		goto failed_read;
+
+	block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
+	offset = SQUASHFS_INODE_OFFSET(ino);
+
+	type = le16_to_cpu(sqshb_ino->inode_type);
+	switch (type) {
+	case SQUASHFS_REG_TYPE: {
+		unsigned int frag_offset, frag;
+		int frag_size;
+		u64 frag_blk;
+		struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+							sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		frag = le32_to_cpu(sqsh_ino->fragment);
+		if (frag != SQUASHFS_INVALID_FRAG) {
+			frag_offset = le32_to_cpu(sqsh_ino->offset);
+			frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
+			if (frag_size < 0) {
+				err = frag_size;
+				goto failed_read;
+			}
+		} else {
+			frag_blk = SQUASHFS_INVALID_BLK;
+			frag_size = 0;
+			frag_offset = 0;
+		}
+
+		set_nlink(inode, 1);
+		inode->i_size = le32_to_cpu(sqsh_ino->file_size);
+		inode->i_fop = &generic_ro_fops;
+		inode->i_mode |= S_IFREG;
+		inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
+		squashfs_i(inode)->fragment_block = frag_blk;
+		squashfs_i(inode)->fragment_size = frag_size;
+		squashfs_i(inode)->fragment_offset = frag_offset;
+		squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+		squashfs_i(inode)->block_list_start = block;
+		squashfs_i(inode)->offset = offset;
+		inode->i_data.a_ops = &squashfs_aops;
+
+		TRACE("File inode %x:%x, start_block %llx, block_list_start "
+			"%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
+			offset, squashfs_i(inode)->start, block, offset);
+		break;
+	}
+	case SQUASHFS_LREG_TYPE: {
+		unsigned int frag_offset, frag;
+		int frag_size;
+		u64 frag_blk;
+		struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+							sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		frag = le32_to_cpu(sqsh_ino->fragment);
+		if (frag != SQUASHFS_INVALID_FRAG) {
+			frag_offset = le32_to_cpu(sqsh_ino->offset);
+			frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
+			if (frag_size < 0) {
+				err = frag_size;
+				goto failed_read;
+			}
+		} else {
+			frag_blk = SQUASHFS_INVALID_BLK;
+			frag_size = 0;
+			frag_offset = 0;
+		}
+
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
+		inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+		inode->i_op = &squashfs_inode_ops;
+		inode->i_fop = &generic_ro_fops;
+		inode->i_mode |= S_IFREG;
+		inode->i_blocks = (inode->i_size -
+				le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
+
+		squashfs_i(inode)->fragment_block = frag_blk;
+		squashfs_i(inode)->fragment_size = frag_size;
+		squashfs_i(inode)->fragment_offset = frag_offset;
+		squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
+		squashfs_i(inode)->block_list_start = block;
+		squashfs_i(inode)->offset = offset;
+		inode->i_data.a_ops = &squashfs_aops;
+
+		TRACE("File inode %x:%x, start_block %llx, block_list_start "
+			"%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
+			offset, squashfs_i(inode)->start, block, offset);
+		break;
+	}
+	case SQUASHFS_DIR_TYPE: {
+		struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
+		inode->i_size = le16_to_cpu(sqsh_ino->file_size);
+		inode->i_op = &squashfs_dir_inode_ops;
+		inode->i_fop = &squashfs_dir_ops;
+		inode->i_mode |= S_IFDIR;
+		squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+		squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
+		squashfs_i(inode)->dir_idx_cnt = 0;
+		squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
+
+		TRACE("Directory inode %x:%x, start_block %llx, offset %x\n",
+				SQUASHFS_INODE_BLK(ino), offset,
+				squashfs_i(inode)->start,
+				le16_to_cpu(sqsh_ino->offset));
+		break;
+	}
+	case SQUASHFS_LDIR_TYPE: {
+		struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
+		inode->i_size = le32_to_cpu(sqsh_ino->file_size);
+		inode->i_op = &squashfs_dir_inode_ops;
+		inode->i_fop = &squashfs_dir_ops;
+		inode->i_mode |= S_IFDIR;
+		squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+		squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
+		squashfs_i(inode)->dir_idx_start = block;
+		squashfs_i(inode)->dir_idx_offset = offset;
+		squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count);
+		squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
+
+		TRACE("Long directory inode %x:%x, start_block %llx, offset "
+				"%x\n", SQUASHFS_INODE_BLK(ino), offset,
+				squashfs_i(inode)->start,
+				le16_to_cpu(sqsh_ino->offset));
+		break;
+	}
+	case SQUASHFS_SYMLINK_TYPE:
+	case SQUASHFS_LSYMLINK_TYPE: {
+		struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
+		inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
+		inode->i_op = &squashfs_symlink_inode_ops;
+		inode->i_data.a_ops = &squashfs_symlink_aops;
+		inode->i_mode |= S_IFLNK;
+		squashfs_i(inode)->start = block;
+		squashfs_i(inode)->offset = offset;
+
+		if (type == SQUASHFS_LSYMLINK_TYPE) {
+			__le32 xattr;
+
+			err = squashfs_read_metadata(sb, NULL, &block,
+						&offset, inode->i_size);
+			if (err < 0)
+				goto failed_read;
+			err = squashfs_read_metadata(sb, &xattr, &block,
+						&offset, sizeof(xattr));
+			if (err < 0)
+				goto failed_read;
+			xattr_id = le32_to_cpu(xattr);
+		}
+
+		TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
+				"%x\n", SQUASHFS_INODE_BLK(ino), offset,
+				block, offset);
+		break;
+	}
+	case SQUASHFS_BLKDEV_TYPE:
+	case SQUASHFS_CHRDEV_TYPE: {
+		struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
+		unsigned int rdev;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_CHRDEV_TYPE)
+			inode->i_mode |= S_IFCHR;
+		else
+			inode->i_mode |= S_IFBLK;
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
+		rdev = le32_to_cpu(sqsh_ino->rdev);
+		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+
+		TRACE("Device inode %x:%x, rdev %x\n",
+				SQUASHFS_INODE_BLK(ino), offset, rdev);
+		break;
+	}
+	case SQUASHFS_LBLKDEV_TYPE:
+	case SQUASHFS_LCHRDEV_TYPE: {
+		struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
+		unsigned int rdev;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_LCHRDEV_TYPE)
+			inode->i_mode |= S_IFCHR;
+		else
+			inode->i_mode |= S_IFBLK;
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
+		inode->i_op = &squashfs_inode_ops;
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
+		rdev = le32_to_cpu(sqsh_ino->rdev);
+		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+
+		TRACE("Device inode %x:%x, rdev %x\n",
+				SQUASHFS_INODE_BLK(ino), offset, rdev);
+		break;
+	}
+	case SQUASHFS_FIFO_TYPE:
+	case SQUASHFS_SOCKET_TYPE: {
+		struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_FIFO_TYPE)
+			inode->i_mode |= S_IFIFO;
+		else
+			inode->i_mode |= S_IFSOCK;
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
+		init_special_inode(inode, inode->i_mode, 0);
+		break;
+	}
+	case SQUASHFS_LFIFO_TYPE:
+	case SQUASHFS_LSOCKET_TYPE: {
+		struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_LFIFO_TYPE)
+			inode->i_mode |= S_IFIFO;
+		else
+			inode->i_mode |= S_IFSOCK;
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
+		inode->i_op = &squashfs_inode_ops;
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
+		init_special_inode(inode, inode->i_mode, 0);
+		break;
+	}
+	default:
+		ERROR("Unknown inode type %d in squashfs_iget!\n", type);
+		return -EINVAL;
+	}
+
+	if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
+		err = squashfs_xattr_lookup(sb, xattr_id,
+					&squashfs_i(inode)->xattr_count,
+					&squashfs_i(inode)->xattr_size,
+					&squashfs_i(inode)->xattr);
+		if (err < 0)
+			goto failed_read;
+		inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
+				+ 1;
+	} else
+		squashfs_i(inode)->xattr_count = 0;
+
+	return 0;
+
+failed_read:
+	ERROR("Unable to read inode 0x%llx\n", ino);
+	return err;
+}
+
+
+const struct inode_operations squashfs_inode_ops = {
+	.getxattr = generic_getxattr,
+	.listxattr = squashfs_listxattr
+};
+
diff --git a/kernel/fs/squashfs/lz4_wrapper.c b/kernel/fs/squashfs/lz4_wrapper.c
new file mode 100644
index 000000000..c31e2bc9c
--- /dev/null
+++ b/kernel/fs/squashfs/lz4_wrapper.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2013, 2014
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "decompressor.h"
+#include "page_actor.h"
+
+#define LZ4_LEGACY	1
+
+struct lz4_comp_opts {
+	__le32 version;
+	__le32 flags;
+};
+
+struct squashfs_lz4 {
+	void *input;
+	void *output;
+};
+
+
+static void *lz4_comp_opts(struct squashfs_sb_info *msblk,
+	void *buff, int len)
+{
+	struct lz4_comp_opts *comp_opts = buff;
+
+	/* LZ4 compressed filesystems always have compression options */
+	if (comp_opts == NULL || len < sizeof(*comp_opts))
+		return ERR_PTR(-EIO);
+
+	if (le32_to_cpu(comp_opts->version) != LZ4_LEGACY) {
+		/* LZ4 format currently used by the kernel is the 'legacy'
+		 * format */
+		ERROR("Unknown LZ4 version\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return NULL;
+}
+
+
+static void *lz4_init(struct squashfs_sb_info *msblk, void *buff)
+{
+	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+	struct squashfs_lz4 *stream;
+
+	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto failed;
+	stream->input = vmalloc(block_size);
+	if (stream->input == NULL)
+		goto failed2;
+	stream->output = vmalloc(block_size);
+	if (stream->output == NULL)
+		goto failed3;
+
+	return stream;
+
+failed3:
+	vfree(stream->input);
+failed2:
+	kfree(stream);
+failed:
+	ERROR("Failed to initialise LZ4 decompressor\n");
+	return ERR_PTR(-ENOMEM);
+}
+
+
+static void lz4_free(void *strm)
+{
+	struct squashfs_lz4 *stream = strm;
+
+	if (stream) {
+		vfree(stream->input);
+		vfree(stream->output);
+	}
+	kfree(stream);
+}
+
+
+static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
+	struct buffer_head **bh, int b, int offset, int length,
+	struct squashfs_page_actor *output)
+{
+	struct squashfs_lz4 *stream = strm;
+	void *buff = stream->input, *data;
+	int avail, i, bytes = length, res;
+	size_t dest_len = output->length;
+
+	for (i = 0; i < b; i++) {
+		avail = min(bytes, msblk->devblksize - offset);
+		memcpy(buff, bh[i]->b_data + offset, avail);
+		buff += avail;
+		bytes -= avail;
+		offset = 0;
+		put_bh(bh[i]);
+	}
+
+	res = lz4_decompress_unknownoutputsize(stream->input, length,
+					stream->output, &dest_len);
+	if (res)
+		return -EIO;
+
+	bytes = dest_len;
+	data = squashfs_first_page(output);
+	buff = stream->output;
+	while (data) {
+		if (bytes <= PAGE_CACHE_SIZE) {
+			memcpy(data, buff, bytes);
+			break;
+		}
+		memcpy(data, buff, PAGE_CACHE_SIZE);
+		buff += PAGE_CACHE_SIZE;
+		bytes -= PAGE_CACHE_SIZE;
+		data = squashfs_next_page(output);
+	}
+	squashfs_finish_page(output);
+
+	return dest_len;
+}
+
+const struct squashfs_decompressor squashfs_lz4_comp_ops = {
+	.init = lz4_init,
+	.comp_opts = lz4_comp_opts,
+	.free = lz4_free,
+	.decompress = lz4_uncompress,
+	.id = LZ4_COMPRESSION,
+	.name = "lz4",
+	.supported = 1
+};
diff --git a/kernel/fs/squashfs/lzo_wrapper.c b/kernel/fs/squashfs/lzo_wrapper.c
new file mode 100644
index 000000000..244b9fbff
--- /dev/null
+++ b/kernel/fs/squashfs/lzo_wrapper.c
@@ -0,0 +1,130 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010 LG Electronics
+ * Chan Jeong <chan.jeong@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * lzo_wrapper.c
+ */
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/lzo.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "decompressor.h"
+#include "page_actor.h"
+
+struct squashfs_lzo {
+	void	*input;
+	void	*output;
+};
+
+static void *lzo_init(struct squashfs_sb_info *msblk, void *buff)
+{
+	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+
+	struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto failed;
+	stream->input = vmalloc(block_size);
+	if (stream->input == NULL)
+		goto failed;
+	stream->output = vmalloc(block_size);
+	if (stream->output == NULL)
+		goto failed2;
+
+	return stream;
+
+failed2:
+	vfree(stream->input);
+failed:
+	ERROR("Failed to allocate lzo workspace\n");
+	kfree(stream);
+	return ERR_PTR(-ENOMEM);
+}
+
+
+static void lzo_free(void *strm)
+{
+	struct squashfs_lzo *stream = strm;
+
+	if (stream) {
+		vfree(stream->input);
+		vfree(stream->output);
+	}
+	kfree(stream);
+}
+
+
+static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
+	struct buffer_head **bh, int b, int offset, int length,
+	struct squashfs_page_actor *output)
+{
+	struct squashfs_lzo *stream = strm;
+	void *buff = stream->input, *data;
+	int avail, i, bytes = length, res;
+	size_t out_len = output->length;
+
+	for (i = 0; i < b; i++) {
+		avail = min(bytes, msblk->devblksize - offset);
+		memcpy(buff, bh[i]->b_data + offset, avail);
+		buff += avail;
+		bytes -= avail;
+		offset = 0;
+		put_bh(bh[i]);
+	}
+
+	res = lzo1x_decompress_safe(stream->input, (size_t)length,
+					stream->output, &out_len);
+	if (res != LZO_E_OK)
+		goto failed;
+
+	res = bytes = (int)out_len;
+	data = squashfs_first_page(output);
+	buff = stream->output;
+	while (data) {
+		if (bytes <= PAGE_CACHE_SIZE) {
+			memcpy(data, buff, bytes);
+			break;
+		} else {
+			memcpy(data, buff, PAGE_CACHE_SIZE);
+			buff += PAGE_CACHE_SIZE;
+			bytes -= PAGE_CACHE_SIZE;
+			data = squashfs_next_page(output);
+		}
+	}
+	squashfs_finish_page(output);
+
+	return res;
+
+failed:
+	return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_lzo_comp_ops = {
+	.init = lzo_init,
+	.free = lzo_free,
+	.decompress = lzo_uncompress,
+	.id = LZO_COMPRESSION,
+	.name = "lzo",
+	.supported = 1
+};
diff --git a/kernel/fs/squashfs/namei.c b/kernel/fs/squashfs/namei.c
new file mode 100644
index 000000000..67cad77fe
--- /dev/null
+++ b/kernel/fs/squashfs/namei.c
@@ -0,0 +1,252 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * namei.c
+ */
+
+/*
+ * This file implements code to do filename lookup in directories.
+ *
+ * Like inodes, directories are packed into compressed metadata blocks, stored
+ * in a directory table.  Directories are accessed using the start address of
+ * the metablock containing the directory and the offset into the
+ * decompressed block (<block, offset>).
+ *
+ * Directories are organised in a slightly complex way, and are not simply
+ * a list of file names.  The organisation takes advantage of the
+ * fact that (in most cases) the inodes of the files will be in the same
+ * compressed metadata block, and therefore, can share the start block.
+ * Directories are therefore organised in a two level list, a directory
+ * header containing the shared start block value, and a sequence of directory
+ * entries, each of which share the shared start block.  A new directory header
+ * is written once/if the inode start block changes.  The directory
+ * header/directory entry list is repeated as many times as necessary.
+ *
+ * Directories are sorted, and can contain a directory index to speed up
+ * file lookup.  Directory indexes store one entry per metablock, each entry
+ * storing the index/filename mapping to the first directory header
+ * in each metadata block.  Directories are sorted in alphabetical order,
+ * and at lookup the index is scanned linearly looking for the first filename
+ * alphabetically larger than the filename being looked up.  At this point the
+ * location of the metadata block the filename is in has been found.
+ * The general idea of the index is ensure only one metadata block needs to be
+ * decompressed to do a lookup irrespective of the length of the directory.
+ * This scheme has the advantage that it doesn't require extra memory overhead
+ * and doesn't require much extra storage on disk.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dcache.h>
+#include <linux/xattr.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "xattr.h"
+
+/*
+ * Lookup name in the directory index, returning the location of the metadata
+ * block containing it, and the directory index this represents.
+ *
+ * If we get an error reading the index then return the part of the index
+ * (if any) we have managed to read - the index isn't essential, just
+ * quicker.
+ */
+static int get_dir_index_using_name(struct super_block *sb,
+			u64 *next_block, int *next_offset, u64 index_start,
+			int index_offset, int i_count, const char *name,
+			int len)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int i, length = 0, err;
+	unsigned int size;
+	struct squashfs_dir_index *index;
+	char *str;
+
+	TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
+
+	index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
+	if (index == NULL) {
+		ERROR("Failed to allocate squashfs_dir_index\n");
+		goto out;
+	}
+
+	str = &index->name[SQUASHFS_NAME_LEN + 1];
+	strncpy(str, name, len);
+	str[len] = '\0';
+
+	for (i = 0; i < i_count; i++) {
+		err = squashfs_read_metadata(sb, index, &index_start,
+					&index_offset, sizeof(*index));
+		if (err < 0)
+			break;
+
+
+		size = le32_to_cpu(index->size) + 1;
+		if (size > SQUASHFS_NAME_LEN)
+			break;
+
+		err = squashfs_read_metadata(sb, index->name, &index_start,
+					&index_offset, size);
+		if (err < 0)
+			break;
+
+		index->name[size] = '\0';
+
+		if (strcmp(index->name, str) > 0)
+			break;
+
+		length = le32_to_cpu(index->index);
+		*next_block = le32_to_cpu(index->start_block) +
+					msblk->directory_table;
+	}
+
+	*next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
+	kfree(index);
+
+out:
+	/*
+	 * Return index (f_pos) of the looked up metadata block.  Translate
+	 * from internal f_pos to external f_pos which is offset by 3 because
+	 * we invent "." and ".." entries which are not actually stored in the
+	 * directory.
+	 */
+	return length + 3;
+}
+
+
+static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
+				 unsigned int flags)
+{
+	const unsigned char *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+	struct inode *inode = NULL;
+	struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info;
+	struct squashfs_dir_header dirh;
+	struct squashfs_dir_entry *dire;
+	u64 block = squashfs_i(dir)->start + msblk->directory_table;
+	int offset = squashfs_i(dir)->offset;
+	int err, length;
+	unsigned int dir_count, size;
+
+	TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
+
+	dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
+	if (dire == NULL) {
+		ERROR("Failed to allocate squashfs_dir_entry\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (len > SQUASHFS_NAME_LEN) {
+		err = -ENAMETOOLONG;
+		goto failed;
+	}
+
+	length = get_dir_index_using_name(dir->i_sb, &block, &offset,
+				squashfs_i(dir)->dir_idx_start,
+				squashfs_i(dir)->dir_idx_offset,
+				squashfs_i(dir)->dir_idx_cnt, name, len);
+
+	while (length < i_size_read(dir)) {
+		/*
+		 * Read directory header.
+		 */
+		err = squashfs_read_metadata(dir->i_sb, &dirh, &block,
+				&offset, sizeof(dirh));
+		if (err < 0)
+			goto read_failure;
+
+		length += sizeof(dirh);
+
+		dir_count = le32_to_cpu(dirh.count) + 1;
+
+		if (dir_count > SQUASHFS_DIR_COUNT)
+			goto data_error;
+
+		while (dir_count--) {
+			/*
+			 * Read directory entry.
+			 */
+			err = squashfs_read_metadata(dir->i_sb, dire, &block,
+					&offset, sizeof(*dire));
+			if (err < 0)
+				goto read_failure;
+
+			size = le16_to_cpu(dire->size) + 1;
+
+			/* size should never be larger than SQUASHFS_NAME_LEN */
+			if (size > SQUASHFS_NAME_LEN)
+				goto data_error;
+
+			err = squashfs_read_metadata(dir->i_sb, dire->name,
+					&block, &offset, size);
+			if (err < 0)
+				goto read_failure;
+
+			length += sizeof(*dire) + size;
+
+			if (name[0] < dire->name[0])
+				goto exit_lookup;
+
+			if (len == size && !strncmp(name, dire->name, len)) {
+				unsigned int blk, off, ino_num;
+				long long ino;
+				blk = le32_to_cpu(dirh.start_block);
+				off = le16_to_cpu(dire->offset);
+				ino_num = le32_to_cpu(dirh.inode_number) +
+					(short) le16_to_cpu(dire->inode_number);
+				ino = SQUASHFS_MKINODE(blk, off);
+
+				TRACE("calling squashfs_iget for directory "
+					"entry %s, inode  %x:%x, %d\n", name,
+					blk, off, ino_num);
+
+				inode = squashfs_iget(dir->i_sb, ino, ino_num);
+				goto exit_lookup;
+			}
+		}
+	}
+
+exit_lookup:
+	kfree(dire);
+	return d_splice_alias(inode, dentry);
+
+data_error:
+	err = -EIO;
+
+read_failure:
+	ERROR("Unable to read directory block [%llx:%x]\n",
+		squashfs_i(dir)->start + msblk->directory_table,
+		squashfs_i(dir)->offset);
+failed:
+	kfree(dire);
+	return ERR_PTR(err);
+}
+
+
+const struct inode_operations squashfs_dir_inode_ops = {
+	.lookup = squashfs_lookup,
+	.getxattr = generic_getxattr,
+	.listxattr = squashfs_listxattr
+};
diff --git a/kernel/fs/squashfs/page_actor.c b/kernel/fs/squashfs/page_actor.c
new file mode 100644
index 000000000..5a1c11f56
--- /dev/null
+++ b/kernel/fs/squashfs/page_actor.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include "page_actor.h"
+
+/*
+ * This file contains implementations of page_actor for decompressing into
+ * an intermediate buffer, and for decompressing directly into the
+ * page cache.
+ *
+ * Calling code should avoid sleeping between calls to squashfs_first_page()
+ * and squashfs_finish_page().
+ */
+
+/* Implementation of page_actor for decompressing into intermediate buffer */
+static void *cache_first_page(struct squashfs_page_actor *actor)
+{
+	actor->next_page = 1;
+	return actor->buffer[0];
+}
+
+static void *cache_next_page(struct squashfs_page_actor *actor)
+{
+	if (actor->next_page == actor->pages)
+		return NULL;
+
+	return actor->buffer[actor->next_page++];
+}
+
+static void cache_finish_page(struct squashfs_page_actor *actor)
+{
+	/* empty */
+}
+
+struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
+	int pages, int length)
+{
+	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+	if (actor == NULL)
+		return NULL;
+
+	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->buffer = buffer;
+	actor->pages = pages;
+	actor->next_page = 0;
+	actor->squashfs_first_page = cache_first_page;
+	actor->squashfs_next_page = cache_next_page;
+	actor->squashfs_finish_page = cache_finish_page;
+	return actor;
+}
+
+/* Implementation of page_actor for decompressing directly into page cache. */
+static void *direct_first_page(struct squashfs_page_actor *actor)
+{
+	actor->next_page = 1;
+	return actor->pageaddr = kmap_atomic(actor->page[0]);
+}
+
+static void *direct_next_page(struct squashfs_page_actor *actor)
+{
+	if (actor->pageaddr)
+		kunmap_atomic(actor->pageaddr);
+
+	return actor->pageaddr = actor->next_page == actor->pages ? NULL :
+		kmap_atomic(actor->page[actor->next_page++]);
+}
+
+static void direct_finish_page(struct squashfs_page_actor *actor)
+{
+	if (actor->pageaddr)
+		kunmap_atomic(actor->pageaddr);
+}
+
+struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
+	int pages, int length)
+{
+	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+	if (actor == NULL)
+		return NULL;
+
+	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->page = page;
+	actor->pages = pages;
+	actor->next_page = 0;
+	actor->pageaddr = NULL;
+	actor->squashfs_first_page = direct_first_page;
+	actor->squashfs_next_page = direct_next_page;
+	actor->squashfs_finish_page = direct_finish_page;
+	return actor;
+}
diff --git a/kernel/fs/squashfs/page_actor.h b/kernel/fs/squashfs/page_actor.h
new file mode 100644
index 000000000..26dd82008
--- /dev/null
+++ b/kernel/fs/squashfs/page_actor.h
@@ -0,0 +1,81 @@
+#ifndef PAGE_ACTOR_H
+#define PAGE_ACTOR_H
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef CONFIG_SQUASHFS_FILE_DIRECT
+struct squashfs_page_actor {
+	void	**page;
+	int	pages;
+	int	length;
+	int	next_page;
+};
+
+static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
+	int pages, int length)
+{
+	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+	if (actor == NULL)
+		return NULL;
+
+	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->page = page;
+	actor->pages = pages;
+	actor->next_page = 0;
+	return actor;
+}
+
+static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
+{
+	actor->next_page = 1;
+	return actor->page[0];
+}
+
+static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
+{
+	return actor->next_page == actor->pages ? NULL :
+		actor->page[actor->next_page++];
+}
+
+static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
+{
+	/* empty */
+}
+#else
+struct squashfs_page_actor {
+	union {
+		void		**buffer;
+		struct page	**page;
+	};
+	void	*pageaddr;
+	void    *(*squashfs_first_page)(struct squashfs_page_actor *);
+	void    *(*squashfs_next_page)(struct squashfs_page_actor *);
+	void    (*squashfs_finish_page)(struct squashfs_page_actor *);
+	int	pages;
+	int	length;
+	int	next_page;
+};
+
+extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
+extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
+							 **, int, int);
+static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
+{
+	return actor->squashfs_first_page(actor);
+}
+static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
+{
+	return actor->squashfs_next_page(actor);
+}
+static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
+{
+	actor->squashfs_finish_page(actor);
+}
+#endif
+#endif
diff --git a/kernel/fs/squashfs/squashfs.h b/kernel/fs/squashfs/squashfs.h
new file mode 100644
index 000000000..887d6d270
--- /dev/null
+++ b/kernel/fs/squashfs/squashfs.h
@@ -0,0 +1,113 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs.h
+ */
+
+#define TRACE(s, args...)	pr_debug("SQUASHFS: "s, ## args)
+
+#define ERROR(s, args...)	pr_err("SQUASHFS error: "s, ## args)
+
+#define WARNING(s, args...)	pr_warn("SQUASHFS: "s, ## args)
+
+/* block.c */
+extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
+				struct squashfs_page_actor *);
+
+/* cache.c */
+extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
+extern void squashfs_cache_delete(struct squashfs_cache *);
+extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *,
+				struct squashfs_cache *, u64, int);
+extern void squashfs_cache_put(struct squashfs_cache_entry *);
+extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int);
+extern int squashfs_read_metadata(struct super_block *, void *, u64 *,
+				int *, int);
+extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
+				u64, int);
+extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
+				u64, int);
+extern void *squashfs_read_table(struct super_block *, u64, int);
+
+/* decompressor.c */
+extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
+extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
+
+/* decompressor_xxx.c */
+extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
+extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
+extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **,
+	int, int, int, struct squashfs_page_actor *);
+extern int squashfs_max_decompressors(void);
+
+/* export.c */
+extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
+				unsigned int);
+
+/* fragment.c */
+extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
+extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
+				u64, u64, unsigned int);
+
+/* file.c */
+void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
+				int);
+
+/* file_xxx.c */
+extern int squashfs_readpage_block(struct page *, u64, int);
+
+/* id.c */
+extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
+extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64,
+				unsigned short);
+
+/* inode.c */
+extern struct inode *squashfs_iget(struct super_block *, long long,
+				unsigned int);
+extern int squashfs_read_inode(struct inode *, long long);
+
+/* xattr.c */
+extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
+
+/*
+ * Inodes, files,  decompressor and xattr operations
+ */
+
+/* dir.c */
+extern const struct file_operations squashfs_dir_ops;
+
+/* export.c */
+extern const struct export_operations squashfs_export_ops;
+
+/* file.c */
+extern const struct address_space_operations squashfs_aops;
+
+/* inode.c */
+extern const struct inode_operations squashfs_inode_ops;
+
+/* namei.c */
+extern const struct inode_operations squashfs_dir_inode_ops;
+
+/* symlink.c */
+extern const struct address_space_operations squashfs_symlink_aops;
+extern const struct inode_operations squashfs_symlink_inode_ops;
+
+/* xattr.c */
+extern const struct xattr_handler *squashfs_xattr_handlers[];
diff --git a/kernel/fs/squashfs/squashfs_fs.h b/kernel/fs/squashfs/squashfs_fs.h
new file mode 100644
index 000000000..506f4ba5b
--- /dev/null
+++ b/kernel/fs/squashfs/squashfs_fs.h
@@ -0,0 +1,457 @@
+#ifndef SQUASHFS_FS
+#define SQUASHFS_FS
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs.h
+ */
+
+#define SQUASHFS_CACHED_FRAGMENTS	CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
+#define SQUASHFS_MAJOR			4
+#define SQUASHFS_MINOR			0
+#define SQUASHFS_START			0
+
+/* size of metadata (inode and directory) blocks */
+#define SQUASHFS_METADATA_SIZE		8192
+
+/* default size of block device I/O */
+#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
+#define SQUASHFS_DEVBLK_SIZE 4096
+#else
+#define SQUASHFS_DEVBLK_SIZE 1024
+#endif
+
+#define SQUASHFS_FILE_MAX_SIZE		1048576
+#define SQUASHFS_FILE_MAX_LOG		20
+
+/* Max length of filename (not 255) */
+#define SQUASHFS_NAME_LEN		256
+
+/* Max value for directory header count*/
+#define SQUASHFS_DIR_COUNT		256
+
+#define SQUASHFS_INVALID_FRAG		(0xffffffffU)
+#define SQUASHFS_INVALID_XATTR		(0xffffffffU)
+#define SQUASHFS_INVALID_BLK		(-1LL)
+
+/* Filesystem flags */
+#define SQUASHFS_NOI			0
+#define SQUASHFS_NOD			1
+#define SQUASHFS_NOF			3
+#define SQUASHFS_NO_FRAG		4
+#define SQUASHFS_ALWAYS_FRAG		5
+#define SQUASHFS_DUPLICATE		6
+#define SQUASHFS_EXPORT			7
+#define SQUASHFS_COMP_OPT		10
+
+#define SQUASHFS_BIT(flag, bit)		((flag >> bit) & 1)
+
+#define SQUASHFS_UNCOMPRESSED_INODES(flags)	SQUASHFS_BIT(flags, \
+						SQUASHFS_NOI)
+
+#define SQUASHFS_UNCOMPRESSED_DATA(flags)	SQUASHFS_BIT(flags, \
+						SQUASHFS_NOD)
+
+#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags)	SQUASHFS_BIT(flags, \
+						SQUASHFS_NOF)
+
+#define SQUASHFS_NO_FRAGMENTS(flags)		SQUASHFS_BIT(flags, \
+						SQUASHFS_NO_FRAG)
+
+#define SQUASHFS_ALWAYS_FRAGMENTS(flags)	SQUASHFS_BIT(flags, \
+						SQUASHFS_ALWAYS_FRAG)
+
+#define SQUASHFS_DUPLICATES(flags)		SQUASHFS_BIT(flags, \
+						SQUASHFS_DUPLICATE)
+
+#define SQUASHFS_EXPORTABLE(flags)		SQUASHFS_BIT(flags, \
+						SQUASHFS_EXPORT)
+
+#define SQUASHFS_COMP_OPTS(flags)		SQUASHFS_BIT(flags, \
+						SQUASHFS_COMP_OPT)
+
+/* Inode types including extended types */
+#define SQUASHFS_DIR_TYPE		1
+#define SQUASHFS_REG_TYPE		2
+#define SQUASHFS_SYMLINK_TYPE		3
+#define SQUASHFS_BLKDEV_TYPE		4
+#define SQUASHFS_CHRDEV_TYPE		5
+#define SQUASHFS_FIFO_TYPE		6
+#define SQUASHFS_SOCKET_TYPE		7
+#define SQUASHFS_LDIR_TYPE		8
+#define SQUASHFS_LREG_TYPE		9
+#define SQUASHFS_LSYMLINK_TYPE		10
+#define SQUASHFS_LBLKDEV_TYPE		11
+#define SQUASHFS_LCHRDEV_TYPE		12
+#define SQUASHFS_LFIFO_TYPE		13
+#define SQUASHFS_LSOCKET_TYPE		14
+
+/* Max type value stored in directory entry */
+#define SQUASHFS_MAX_DIR_TYPE		7
+
+/* Xattr types */
+#define SQUASHFS_XATTR_USER             0
+#define SQUASHFS_XATTR_TRUSTED          1
+#define SQUASHFS_XATTR_SECURITY         2
+#define SQUASHFS_XATTR_VALUE_OOL        256
+#define SQUASHFS_XATTR_PREFIX_MASK      0xff
+
+/* Flag whether block is compressed or uncompressed, bit is set if block is
+ * uncompressed */
+#define SQUASHFS_COMPRESSED_BIT		(1 << 15)
+
+#define SQUASHFS_COMPRESSED_SIZE(B)	(((B) & ~SQUASHFS_COMPRESSED_BIT) ? \
+		(B) & ~SQUASHFS_COMPRESSED_BIT :  SQUASHFS_COMPRESSED_BIT)
+
+#define SQUASHFS_COMPRESSED(B)		(!((B) & SQUASHFS_COMPRESSED_BIT))
+
+#define SQUASHFS_COMPRESSED_BIT_BLOCK	(1 << 24)
+
+#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B)	((B) & \
+						~SQUASHFS_COMPRESSED_BIT_BLOCK)
+
+#define SQUASHFS_COMPRESSED_BLOCK(B)	(!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK))
+
+/*
+ * Inode number ops.  Inodes consist of a compressed block number, and an
+ * uncompressed offset within that block
+ */
+#define SQUASHFS_INODE_BLK(A)		((unsigned int) ((A) >> 16))
+
+#define SQUASHFS_INODE_OFFSET(A)	((unsigned int) ((A) & 0xffff))
+
+#define SQUASHFS_MKINODE(A, B)		((long long)(((long long) (A)\
+					<< 16) + (B)))
+
+/* fragment and fragment table defines */
+#define SQUASHFS_FRAGMENT_BYTES(A)	\
+				((A) * sizeof(struct squashfs_fragment_entry))
+
+#define SQUASHFS_FRAGMENT_INDEX(A)	(SQUASHFS_FRAGMENT_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A)	(SQUASHFS_FRAGMENT_BYTES(A) % \
+						SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEXES(A)	((SQUASHFS_FRAGMENT_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEX_BYTES(A)	(SQUASHFS_FRAGMENT_INDEXES(A) *\
+						sizeof(u64))
+
+/* inode lookup table defines */
+#define SQUASHFS_LOOKUP_BYTES(A)	((A) * sizeof(u64))
+
+#define SQUASHFS_LOOKUP_BLOCK(A)	(SQUASHFS_LOOKUP_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A)	(SQUASHFS_LOOKUP_BYTES(A) % \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCKS(A)	((SQUASHFS_LOOKUP_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCK_BYTES(A)	(SQUASHFS_LOOKUP_BLOCKS(A) *\
+					sizeof(u64))
+
+/* uid/gid lookup table defines */
+#define SQUASHFS_ID_BYTES(A)		((A) * sizeof(unsigned int))
+
+#define SQUASHFS_ID_BLOCK(A)		(SQUASHFS_ID_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCK_OFFSET(A)	(SQUASHFS_ID_BYTES(A) % \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCKS(A)		((SQUASHFS_ID_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCK_BYTES(A)	(SQUASHFS_ID_BLOCKS(A) *\
+					sizeof(u64))
+/* xattr id lookup table defines */
+#define SQUASHFS_XATTR_BYTES(A)		((A) * sizeof(struct squashfs_xattr_id))
+
+#define SQUASHFS_XATTR_BLOCK(A)		(SQUASHFS_XATTR_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_OFFSET(A)	(SQUASHFS_XATTR_BYTES(A) % \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCKS(A)	((SQUASHFS_XATTR_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_BYTES(A)	(SQUASHFS_XATTR_BLOCKS(A) *\
+					sizeof(u64))
+#define SQUASHFS_XATTR_BLK(A)		((unsigned int) ((A) >> 16))
+
+#define SQUASHFS_XATTR_OFFSET(A)	((unsigned int) ((A) & 0xffff))
+
+/* cached data constants for filesystem */
+#define SQUASHFS_CACHED_BLKS		8
+
+/* meta index cache */
+#define SQUASHFS_META_INDEXES	(SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
+#define SQUASHFS_META_ENTRIES	127
+#define SQUASHFS_META_SLOTS	8
+
+struct meta_entry {
+	u64			data_block;
+	unsigned int		index_block;
+	unsigned short		offset;
+	unsigned short		pad;
+};
+
+struct meta_index {
+	unsigned int		inode_number;
+	unsigned int		offset;
+	unsigned short		entries;
+	unsigned short		skip;
+	unsigned short		locked;
+	unsigned short		pad;
+	struct meta_entry	meta_entry[SQUASHFS_META_ENTRIES];
+};
+
+
+/*
+ * definitions for structures on disk
+ */
+#define ZLIB_COMPRESSION	1
+#define LZMA_COMPRESSION	2
+#define LZO_COMPRESSION		3
+#define XZ_COMPRESSION		4
+#define LZ4_COMPRESSION		5
+
+struct squashfs_super_block {
+	__le32			s_magic;
+	__le32			inodes;
+	__le32			mkfs_time;
+	__le32			block_size;
+	__le32			fragments;
+	__le16			compression;
+	__le16			block_log;
+	__le16			flags;
+	__le16			no_ids;
+	__le16			s_major;
+	__le16			s_minor;
+	__le64			root_inode;
+	__le64			bytes_used;
+	__le64			id_table_start;
+	__le64			xattr_id_table_start;
+	__le64			inode_table_start;
+	__le64			directory_table_start;
+	__le64			fragment_table_start;
+	__le64			lookup_table_start;
+};
+
+struct squashfs_dir_index {
+	__le32			index;
+	__le32			start_block;
+	__le32			size;
+	unsigned char		name[0];
+};
+
+struct squashfs_base_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+};
+
+struct squashfs_ipc_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			nlink;
+};
+
+struct squashfs_lipc_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			nlink;
+	__le32			xattr;
+};
+
+struct squashfs_dev_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			nlink;
+	__le32			rdev;
+};
+
+struct squashfs_ldev_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			nlink;
+	__le32			rdev;
+	__le32			xattr;
+};
+
+struct squashfs_symlink_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			nlink;
+	__le32			symlink_size;
+	char			symlink[0];
+};
+
+struct squashfs_reg_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			start_block;
+	__le32			fragment;
+	__le32			offset;
+	__le32			file_size;
+	__le16			block_list[0];
+};
+
+struct squashfs_lreg_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le64			start_block;
+	__le64			file_size;
+	__le64			sparse;
+	__le32			nlink;
+	__le32			fragment;
+	__le32			offset;
+	__le32			xattr;
+	__le16			block_list[0];
+};
+
+struct squashfs_dir_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			start_block;
+	__le32			nlink;
+	__le16			file_size;
+	__le16			offset;
+	__le32			parent_inode;
+};
+
+struct squashfs_ldir_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			nlink;
+	__le32			file_size;
+	__le32			start_block;
+	__le32			parent_inode;
+	__le16			i_count;
+	__le16			offset;
+	__le32			xattr;
+	struct squashfs_dir_index	index[0];
+};
+
+union squashfs_inode {
+	struct squashfs_base_inode		base;
+	struct squashfs_dev_inode		dev;
+	struct squashfs_ldev_inode		ldev;
+	struct squashfs_symlink_inode		symlink;
+	struct squashfs_reg_inode		reg;
+	struct squashfs_lreg_inode		lreg;
+	struct squashfs_dir_inode		dir;
+	struct squashfs_ldir_inode		ldir;
+	struct squashfs_ipc_inode		ipc;
+	struct squashfs_lipc_inode		lipc;
+};
+
+struct squashfs_dir_entry {
+	__le16			offset;
+	__le16			inode_number;
+	__le16			type;
+	__le16			size;
+	char			name[0];
+};
+
+struct squashfs_dir_header {
+	__le32			count;
+	__le32			start_block;
+	__le32			inode_number;
+};
+
+struct squashfs_fragment_entry {
+	__le64			start_block;
+	__le32			size;
+	unsigned int		unused;
+};
+
+struct squashfs_xattr_entry {
+	__le16			type;
+	__le16			size;
+	char			data[0];
+};
+
+struct squashfs_xattr_val {
+	__le32			vsize;
+	char			value[0];
+};
+
+struct squashfs_xattr_id {
+	__le64			xattr;
+	__le32			count;
+	__le32			size;
+};
+
+struct squashfs_xattr_id_table {
+	__le64			xattr_table_start;
+	__le32			xattr_ids;
+	__le32			unused;
+};
+
+#endif
diff --git a/kernel/fs/squashfs/squashfs_fs_i.h b/kernel/fs/squashfs/squashfs_fs_i.h
new file mode 100644
index 000000000..73588e770
--- /dev/null
+++ b/kernel/fs/squashfs/squashfs_fs_i.h
@@ -0,0 +1,54 @@
+#ifndef SQUASHFS_FS_I
+#define SQUASHFS_FS_I
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs_i.h
+ */
+
+struct squashfs_inode_info {
+	u64		start;
+	int		offset;
+	u64		xattr;
+	unsigned int	xattr_size;
+	int		xattr_count;
+	union {
+		struct {
+			u64		fragment_block;
+			int		fragment_size;
+			int		fragment_offset;
+			u64		block_list_start;
+		};
+		struct {
+			u64		dir_idx_start;
+			int		dir_idx_offset;
+			int		dir_idx_cnt;
+			int		parent;
+		};
+	};
+	struct inode	vfs_inode;
+};
+
+
+static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
+{
+	return list_entry(inode, struct squashfs_inode_info, vfs_inode);
+}
+#endif
diff --git a/kernel/fs/squashfs/squashfs_fs_sb.h b/kernel/fs/squashfs/squashfs_fs_sb.h
new file mode 100644
index 000000000..1da565cb5
--- /dev/null
+++ b/kernel/fs/squashfs/squashfs_fs_sb.h
@@ -0,0 +1,80 @@
+#ifndef SQUASHFS_FS_SB
+#define SQUASHFS_FS_SB
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs_sb.h
+ */
+
+#include "squashfs_fs.h"
+
+struct squashfs_cache {
+	char			*name;
+	int			entries;
+	int			curr_blk;
+	int			next_blk;
+	int			num_waiters;
+	int			unused;
+	int			block_size;
+	int			pages;
+	spinlock_t		lock;
+	wait_queue_head_t	wait_queue;
+	struct squashfs_cache_entry *entry;
+};
+
+struct squashfs_cache_entry {
+	u64			block;
+	int			length;
+	int			refcount;
+	u64			next_index;
+	int			pending;
+	int			error;
+	int			num_waiters;
+	wait_queue_head_t	wait_queue;
+	struct squashfs_cache	*cache;
+	void			**data;
+	struct squashfs_page_actor	*actor;
+};
+
+struct squashfs_sb_info {
+	const struct squashfs_decompressor	*decompressor;
+	int					devblksize;
+	int					devblksize_log2;
+	struct squashfs_cache			*block_cache;
+	struct squashfs_cache			*fragment_cache;
+	struct squashfs_cache			*read_page;
+	int					next_meta_index;
+	__le64					*id_table;
+	__le64					*fragment_index;
+	__le64					*xattr_id_table;
+	struct mutex				meta_index_mutex;
+	struct meta_index			*meta_index;
+	struct squashfs_stream			*stream;
+	__le64					*inode_lookup_table;
+	u64					inode_table;
+	u64					directory_table;
+	u64					xattr_table;
+	unsigned int				block_size;
+	unsigned short				block_log;
+	long long				bytes_used;
+	unsigned int				inodes;
+	int					xattr_ids;
+};
+#endif
diff --git a/kernel/fs/squashfs/super.c b/kernel/fs/squashfs/super.c
new file mode 100644
index 000000000..5056babe0
--- /dev/null
+++ b/kernel/fs/squashfs/super.c
@@ -0,0 +1,508 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * super.c
+ */
+
+/*
+ * This file implements code to read the superblock, read and initialise
+ * in-memory structures at mount time, and all the VFS glue code to register
+ * the filesystem.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/xattr.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+#include "xattr.h"
+
+static struct file_system_type squashfs_fs_type;
+static const struct super_operations squashfs_super_ops;
+
+static const struct squashfs_decompressor *supported_squashfs_filesystem(short
+	major, short minor, short id)
+{
+	const struct squashfs_decompressor *decompressor;
+
+	if (major < SQUASHFS_MAJOR) {
+		ERROR("Major/Minor mismatch, older Squashfs %d.%d "
+			"filesystems are unsupported\n", major, minor);
+		return NULL;
+	} else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
+		ERROR("Major/Minor mismatch, trying to mount newer "
+			"%d.%d filesystem\n", major, minor);
+		ERROR("Please update your kernel\n");
+		return NULL;
+	}
+
+	decompressor = squashfs_lookup_decompressor(id);
+	if (!decompressor->supported) {
+		ERROR("Filesystem uses \"%s\" compression. This is not "
+			"supported\n", decompressor->name);
+		return NULL;
+	}
+
+	return decompressor;
+}
+
+
+static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct squashfs_sb_info *msblk;
+	struct squashfs_super_block *sblk = NULL;
+	char b[BDEVNAME_SIZE];
+	struct inode *root;
+	long long root_inode;
+	unsigned short flags;
+	unsigned int fragments;
+	u64 lookup_table_start, xattr_id_table_start, next_table;
+	int err;
+
+	TRACE("Entered squashfs_fill_superblock\n");
+
+	sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
+	if (sb->s_fs_info == NULL) {
+		ERROR("Failed to allocate squashfs_sb_info\n");
+		return -ENOMEM;
+	}
+	msblk = sb->s_fs_info;
+
+	msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
+	msblk->devblksize_log2 = ffz(~msblk->devblksize);
+
+	mutex_init(&msblk->meta_index_mutex);
+
+	/*
+	 * msblk->bytes_used is checked in squashfs_read_table to ensure reads
+	 * are not beyond filesystem end.  But as we're using
+	 * squashfs_read_table here to read the superblock (including the value
+	 * of bytes_used) we need to set it to an initial sensible dummy value
+	 */
+	msblk->bytes_used = sizeof(*sblk);
+	sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk));
+
+	if (IS_ERR(sblk)) {
+		ERROR("unable to read squashfs_super_block\n");
+		err = PTR_ERR(sblk);
+		sblk = NULL;
+		goto failed_mount;
+	}
+
+	err = -EINVAL;
+
+	/* Check it is a SQUASHFS superblock */
+	sb->s_magic = le32_to_cpu(sblk->s_magic);
+	if (sb->s_magic != SQUASHFS_MAGIC) {
+		if (!silent)
+			ERROR("Can't find a SQUASHFS superblock on %s\n",
+						bdevname(sb->s_bdev, b));
+		goto failed_mount;
+	}
+
+	/* Check the MAJOR & MINOR versions and lookup compression type */
+	msblk->decompressor = supported_squashfs_filesystem(
+			le16_to_cpu(sblk->s_major),
+			le16_to_cpu(sblk->s_minor),
+			le16_to_cpu(sblk->compression));
+	if (msblk->decompressor == NULL)
+		goto failed_mount;
+
+	/* Check the filesystem does not extend beyond the end of the
+	   block device */
+	msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
+	if (msblk->bytes_used < 0 || msblk->bytes_used >
+			i_size_read(sb->s_bdev->bd_inode))
+		goto failed_mount;
+
+	/* Check block size for sanity */
+	msblk->block_size = le32_to_cpu(sblk->block_size);
+	if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
+		goto failed_mount;
+
+	/*
+	 * Check the system page size is not larger than the filesystem
+	 * block size (by default 128K).  This is currently not supported.
+	 */
+	if (PAGE_CACHE_SIZE > msblk->block_size) {
+		ERROR("Page size > filesystem block size (%d).  This is "
+			"currently not supported!\n", msblk->block_size);
+		goto failed_mount;
+	}
+
+	/* Check block log for sanity */
+	msblk->block_log = le16_to_cpu(sblk->block_log);
+	if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
+		goto failed_mount;
+
+	/* Check that block_size and block_log match */
+	if (msblk->block_size != (1 << msblk->block_log))
+		goto failed_mount;
+
+	/* Check the root inode for sanity */
+	root_inode = le64_to_cpu(sblk->root_inode);
+	if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
+		goto failed_mount;
+
+	msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
+	msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
+	msblk->inodes = le32_to_cpu(sblk->inodes);
+	flags = le16_to_cpu(sblk->flags);
+
+	TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
+	TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
+				? "un" : "");
+	TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
+				? "un" : "");
+	TRACE("Filesystem size %lld bytes\n", msblk->bytes_used);
+	TRACE("Block size %d\n", msblk->block_size);
+	TRACE("Number of inodes %d\n", msblk->inodes);
+	TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments));
+	TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
+	TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
+	TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
+	TRACE("sblk->fragment_table_start %llx\n",
+		(u64) le64_to_cpu(sblk->fragment_table_start));
+	TRACE("sblk->id_table_start %llx\n",
+		(u64) le64_to_cpu(sblk->id_table_start));
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_flags |= MS_RDONLY;
+	sb->s_op = &squashfs_super_ops;
+
+	err = -ENOMEM;
+
+	msblk->block_cache = squashfs_cache_init("metadata",
+			SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
+	if (msblk->block_cache == NULL)
+		goto failed_mount;
+
+	/* Allocate read_page block */
+	msblk->read_page = squashfs_cache_init("data",
+		squashfs_max_decompressors(), msblk->block_size);
+	if (msblk->read_page == NULL) {
+		ERROR("Failed to allocate read_page block\n");
+		goto failed_mount;
+	}
+
+	msblk->stream = squashfs_decompressor_setup(sb, flags);
+	if (IS_ERR(msblk->stream)) {
+		err = PTR_ERR(msblk->stream);
+		msblk->stream = NULL;
+		goto failed_mount;
+	}
+
+	/* Handle xattrs */
+	sb->s_xattr = squashfs_xattr_handlers;
+	xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
+	if (xattr_id_table_start == SQUASHFS_INVALID_BLK) {
+		next_table = msblk->bytes_used;
+		goto allocate_id_index_table;
+	}
+
+	/* Allocate and read xattr id lookup table */
+	msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
+		xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
+	if (IS_ERR(msblk->xattr_id_table)) {
+		ERROR("unable to read xattr id index table\n");
+		err = PTR_ERR(msblk->xattr_id_table);
+		msblk->xattr_id_table = NULL;
+		if (err != -ENOTSUPP)
+			goto failed_mount;
+	}
+	next_table = msblk->xattr_table;
+
+allocate_id_index_table:
+	/* Allocate and read id index table */
+	msblk->id_table = squashfs_read_id_index_table(sb,
+		le64_to_cpu(sblk->id_table_start), next_table,
+		le16_to_cpu(sblk->no_ids));
+	if (IS_ERR(msblk->id_table)) {
+		ERROR("unable to read id index table\n");
+		err = PTR_ERR(msblk->id_table);
+		msblk->id_table = NULL;
+		goto failed_mount;
+	}
+	next_table = le64_to_cpu(msblk->id_table[0]);
+
+	/* Handle inode lookup table */
+	lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
+	if (lookup_table_start == SQUASHFS_INVALID_BLK)
+		goto handle_fragments;
+
+	/* Allocate and read inode lookup table */
+	msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
+		lookup_table_start, next_table, msblk->inodes);
+	if (IS_ERR(msblk->inode_lookup_table)) {
+		ERROR("unable to read inode lookup table\n");
+		err = PTR_ERR(msblk->inode_lookup_table);
+		msblk->inode_lookup_table = NULL;
+		goto failed_mount;
+	}
+	next_table = le64_to_cpu(msblk->inode_lookup_table[0]);
+
+	sb->s_export_op = &squashfs_export_ops;
+
+handle_fragments:
+	fragments = le32_to_cpu(sblk->fragments);
+	if (fragments == 0)
+		goto check_directory_table;
+
+	msblk->fragment_cache = squashfs_cache_init("fragment",
+		SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
+	if (msblk->fragment_cache == NULL) {
+		err = -ENOMEM;
+		goto failed_mount;
+	}
+
+	/* Allocate and read fragment index table */
+	msblk->fragment_index = squashfs_read_fragment_index_table(sb,
+		le64_to_cpu(sblk->fragment_table_start), next_table, fragments);
+	if (IS_ERR(msblk->fragment_index)) {
+		ERROR("unable to read fragment index table\n");
+		err = PTR_ERR(msblk->fragment_index);
+		msblk->fragment_index = NULL;
+		goto failed_mount;
+	}
+	next_table = le64_to_cpu(msblk->fragment_index[0]);
+
+check_directory_table:
+	/* Sanity check directory_table */
+	if (msblk->directory_table > next_table) {
+		err = -EINVAL;
+		goto failed_mount;
+	}
+
+	/* Sanity check inode_table */
+	if (msblk->inode_table >= msblk->directory_table) {
+		err = -EINVAL;
+		goto failed_mount;
+	}
+
+	/* allocate root */
+	root = new_inode(sb);
+	if (!root) {
+		err = -ENOMEM;
+		goto failed_mount;
+	}
+
+	err = squashfs_read_inode(root, root_inode);
+	if (err) {
+		make_bad_inode(root);
+		iput(root);
+		goto failed_mount;
+	}
+	insert_inode_hash(root);
+
+	sb->s_root = d_make_root(root);
+	if (sb->s_root == NULL) {
+		ERROR("Root inode create failed\n");
+		err = -ENOMEM;
+		goto failed_mount;
+	}
+
+	TRACE("Leaving squashfs_fill_super\n");
+	kfree(sblk);
+	return 0;
+
+failed_mount:
+	squashfs_cache_delete(msblk->block_cache);
+	squashfs_cache_delete(msblk->fragment_cache);
+	squashfs_cache_delete(msblk->read_page);
+	squashfs_decompressor_destroy(msblk);
+	kfree(msblk->inode_lookup_table);
+	kfree(msblk->fragment_index);
+	kfree(msblk->id_table);
+	kfree(msblk->xattr_id_table);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+	kfree(sblk);
+	return err;
+}
+
+
+static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
+	u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
+
+	TRACE("Entered squashfs_statfs\n");
+
+	buf->f_type = SQUASHFS_MAGIC;
+	buf->f_bsize = msblk->block_size;
+	buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1;
+	buf->f_bfree = buf->f_bavail = 0;
+	buf->f_files = msblk->inodes;
+	buf->f_ffree = 0;
+	buf->f_namelen = SQUASHFS_NAME_LEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+
+static int squashfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+
+static void squashfs_put_super(struct super_block *sb)
+{
+	if (sb->s_fs_info) {
+		struct squashfs_sb_info *sbi = sb->s_fs_info;
+		squashfs_cache_delete(sbi->block_cache);
+		squashfs_cache_delete(sbi->fragment_cache);
+		squashfs_cache_delete(sbi->read_page);
+		squashfs_decompressor_destroy(sbi);
+		kfree(sbi->id_table);
+		kfree(sbi->fragment_index);
+		kfree(sbi->meta_index);
+		kfree(sbi->inode_lookup_table);
+		kfree(sbi->xattr_id_table);
+		kfree(sb->s_fs_info);
+		sb->s_fs_info = NULL;
+	}
+}
+
+
+static struct dentry *squashfs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
+}
+
+
+static struct kmem_cache *squashfs_inode_cachep;
+
+
+static void init_once(void *foo)
+{
+	struct squashfs_inode_info *ei = foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+
+static int __init init_inodecache(void)
+{
+	squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
+		sizeof(struct squashfs_inode_info), 0,
+		SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+
+	return squashfs_inode_cachep ? 0 : -ENOMEM;
+}
+
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(squashfs_inode_cachep);
+}
+
+
+static int __init init_squashfs_fs(void)
+{
+	int err = init_inodecache();
+
+	if (err)
+		return err;
+
+	err = register_filesystem(&squashfs_fs_type);
+	if (err) {
+		destroy_inodecache();
+		return err;
+	}
+
+	pr_info("version 4.0 (2009/01/31) Phillip Lougher\n");
+
+	return 0;
+}
+
+
+static void __exit exit_squashfs_fs(void)
+{
+	unregister_filesystem(&squashfs_fs_type);
+	destroy_inodecache();
+}
+
+
+static struct inode *squashfs_alloc_inode(struct super_block *sb)
+{
+	struct squashfs_inode_info *ei =
+		kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
+
+	return ei ? &ei->vfs_inode : NULL;
+}
+
+
+static void squashfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
+}
+
+static void squashfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, squashfs_i_callback);
+}
+
+
+static struct file_system_type squashfs_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "squashfs",
+	.mount = squashfs_mount,
+	.kill_sb = kill_block_super,
+	.fs_flags = FS_REQUIRES_DEV
+};
+MODULE_ALIAS_FS("squashfs");
+
+static const struct super_operations squashfs_super_ops = {
+	.alloc_inode = squashfs_alloc_inode,
+	.destroy_inode = squashfs_destroy_inode,
+	.statfs = squashfs_statfs,
+	.put_super = squashfs_put_super,
+	.remount_fs = squashfs_remount
+};
+
+module_init(init_squashfs_fs);
+module_exit(exit_squashfs_fs);
+MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
+MODULE_AUTHOR("Phillip Lougher <phillip@squashfs.org.uk>");
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/squashfs/symlink.c b/kernel/fs/squashfs/symlink.c
new file mode 100644
index 000000000..12806dffb
--- /dev/null
+++ b/kernel/fs/squashfs/symlink.c
@@ -0,0 +1,127 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * symlink.c
+ */
+
+/*
+ * This file implements code to handle symbolic links.
+ *
+ * The data contents of symbolic links are stored inside the symbolic
+ * link inode within the inode table.  This allows the normally small symbolic
+ * link to be compressed as part of the inode table, achieving much greater
+ * compression than if the symbolic link was compressed individually.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/xattr.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "xattr.h"
+
+static int squashfs_symlink_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int index = page->index << PAGE_CACHE_SHIFT;
+	u64 block = squashfs_i(inode)->start;
+	int offset = squashfs_i(inode)->offset;
+	int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
+	int bytes, copied;
+	void *pageaddr;
+	struct squashfs_cache_entry *entry;
+
+	TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
+			"%llx, offset %x\n", page->index, block, offset);
+
+	/*
+	 * Skip index bytes into symlink metadata.
+	 */
+	if (index) {
+		bytes = squashfs_read_metadata(sb, NULL, &block, &offset,
+								index);
+		if (bytes < 0) {
+			ERROR("Unable to read symlink [%llx:%x]\n",
+				squashfs_i(inode)->start,
+				squashfs_i(inode)->offset);
+			goto error_out;
+		}
+	}
+
+	/*
+	 * Read length bytes from symlink metadata.  Squashfs_read_metadata
+	 * is not used here because it can sleep and we want to use
+	 * kmap_atomic to map the page.  Instead call the underlying
+	 * squashfs_cache_get routine.  As length bytes may overlap metadata
+	 * blocks, we may need to call squashfs_cache_get multiple times.
+	 */
+	for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
+		entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
+		if (entry->error) {
+			ERROR("Unable to read symlink [%llx:%x]\n",
+				squashfs_i(inode)->start,
+				squashfs_i(inode)->offset);
+			squashfs_cache_put(entry);
+			goto error_out;
+		}
+
+		pageaddr = kmap_atomic(page);
+		copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
+								length - bytes);
+		if (copied == length - bytes)
+			memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
+		else
+			block = entry->next_index;
+		kunmap_atomic(pageaddr);
+		squashfs_cache_put(entry);
+	}
+
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	unlock_page(page);
+	return 0;
+
+error_out:
+	SetPageError(page);
+	unlock_page(page);
+	return 0;
+}
+
+
+const struct address_space_operations squashfs_symlink_aops = {
+	.readpage = squashfs_symlink_readpage
+};
+
+const struct inode_operations squashfs_symlink_inode_ops = {
+	.readlink = generic_readlink,
+	.follow_link = page_follow_link_light,
+	.put_link = page_put_link,
+	.getxattr = generic_getxattr,
+	.listxattr = squashfs_listxattr
+};
+
diff --git a/kernel/fs/squashfs/xattr.c b/kernel/fs/squashfs/xattr.c
new file mode 100644
index 000000000..e5e0ddf5b
--- /dev/null
+++ b/kernel/fs/squashfs/xattr.c
@@ -0,0 +1,324 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.c
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static const struct xattr_handler *squashfs_xattr_handler(int);
+
+ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
+	size_t buffer_size)
+{
+	struct inode *inode = d_inode(d);
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+						 + msblk->xattr_table;
+	int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+	int count = squashfs_i(inode)->xattr_count;
+	size_t rest = buffer_size;
+	int err;
+
+	/* check that the file system has xattrs */
+	if (msblk->xattr_id_table == NULL)
+		return -EOPNOTSUPP;
+
+	/* loop reading each xattr name */
+	while (count--) {
+		struct squashfs_xattr_entry entry;
+		struct squashfs_xattr_val val;
+		const struct xattr_handler *handler;
+		int name_size, prefix_size = 0;
+
+		err = squashfs_read_metadata(sb, &entry, &start, &offset,
+							sizeof(entry));
+		if (err < 0)
+			goto failed;
+
+		name_size = le16_to_cpu(entry.size);
+		handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
+		if (handler)
+			prefix_size = handler->list(d, buffer, rest, NULL,
+				name_size, handler->flags);
+		if (prefix_size) {
+			if (buffer) {
+				if (prefix_size + name_size + 1 > rest) {
+					err = -ERANGE;
+					goto failed;
+				}
+				buffer += prefix_size;
+			}
+			err = squashfs_read_metadata(sb, buffer, &start,
+				&offset, name_size);
+			if (err < 0)
+				goto failed;
+			if (buffer) {
+				buffer[name_size] = '\0';
+				buffer += name_size + 1;
+			}
+			rest -= prefix_size + name_size + 1;
+		} else  {
+			/* no handler or insuffficient privileges, so skip */
+			err = squashfs_read_metadata(sb, NULL, &start,
+				&offset, name_size);
+			if (err < 0)
+				goto failed;
+		}
+
+
+		/* skip remaining xattr entry */
+		err = squashfs_read_metadata(sb, &val, &start, &offset,
+						sizeof(val));
+		if (err < 0)
+			goto failed;
+
+		err = squashfs_read_metadata(sb, NULL, &start, &offset,
+						le32_to_cpu(val.vsize));
+		if (err < 0)
+			goto failed;
+	}
+	err = buffer_size - rest;
+
+failed:
+	return err;
+}
+
+
+static int squashfs_xattr_get(struct inode *inode, int name_index,
+	const char *name, void *buffer, size_t buffer_size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+						 + msblk->xattr_table;
+	int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+	int count = squashfs_i(inode)->xattr_count;
+	int name_len = strlen(name);
+	int err, vsize;
+	char *target = kmalloc(name_len, GFP_KERNEL);
+
+	if (target == NULL)
+		return  -ENOMEM;
+
+	/* loop reading each xattr name */
+	for (; count; count--) {
+		struct squashfs_xattr_entry entry;
+		struct squashfs_xattr_val val;
+		int type, prefix, name_size;
+
+		err = squashfs_read_metadata(sb, &entry, &start, &offset,
+							sizeof(entry));
+		if (err < 0)
+			goto failed;
+
+		name_size = le16_to_cpu(entry.size);
+		type = le16_to_cpu(entry.type);
+		prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
+
+		if (prefix == name_index && name_size == name_len)
+			err = squashfs_read_metadata(sb, target, &start,
+						&offset, name_size);
+		else
+			err = squashfs_read_metadata(sb, NULL, &start,
+						&offset, name_size);
+		if (err < 0)
+			goto failed;
+
+		if (prefix == name_index && name_size == name_len &&
+					strncmp(target, name, name_size) == 0) {
+			/* found xattr */
+			if (type & SQUASHFS_XATTR_VALUE_OOL) {
+				__le64 xattr_val;
+				u64 xattr;
+				/* val is a reference to the real location */
+				err = squashfs_read_metadata(sb, &val, &start,
+						&offset, sizeof(val));
+				if (err < 0)
+					goto failed;
+				err = squashfs_read_metadata(sb, &xattr_val,
+					&start, &offset, sizeof(xattr_val));
+				if (err < 0)
+					goto failed;
+				xattr = le64_to_cpu(xattr_val);
+				start = SQUASHFS_XATTR_BLK(xattr) +
+							msblk->xattr_table;
+				offset = SQUASHFS_XATTR_OFFSET(xattr);
+			}
+			/* read xattr value */
+			err = squashfs_read_metadata(sb, &val, &start, &offset,
+							sizeof(val));
+			if (err < 0)
+				goto failed;
+
+			vsize = le32_to_cpu(val.vsize);
+			if (buffer) {
+				if (vsize > buffer_size) {
+					err = -ERANGE;
+					goto failed;
+				}
+				err = squashfs_read_metadata(sb, buffer, &start,
+					 &offset, vsize);
+				if (err < 0)
+					goto failed;
+			}
+			break;
+		}
+
+		/* no match, skip remaining xattr entry */
+		err = squashfs_read_metadata(sb, &val, &start, &offset,
+							sizeof(val));
+		if (err < 0)
+			goto failed;
+		err = squashfs_read_metadata(sb, NULL, &start, &offset,
+						le32_to_cpu(val.vsize));
+		if (err < 0)
+			goto failed;
+	}
+	err = count ? vsize : -ENODATA;
+
+failed:
+	kfree(target);
+	return err;
+}
+
+
+/*
+ * User namespace support
+ */
+static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
+	const char *name, size_t name_len, int type)
+{
+	if (list && XATTR_USER_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+	return XATTR_USER_PREFIX_LEN;
+}
+
+static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
+	size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;
+
+	return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= squashfs_user_list,
+	.get	= squashfs_user_get
+};
+
+/*
+ * Trusted namespace support
+ */
+static size_t squashfs_trusted_list(struct dentry *d, char *list,
+	size_t list_size, const char *name, size_t name_len, int type)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+	return XATTR_TRUSTED_PREFIX_LEN;
+}
+
+static int squashfs_trusted_get(struct dentry *d, const char *name,
+	void *buffer, size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;
+
+	return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= squashfs_trusted_list,
+	.get	= squashfs_trusted_get
+};
+
+/*
+ * Security namespace support
+ */
+static size_t squashfs_security_list(struct dentry *d, char *list,
+	size_t list_size, const char *name, size_t name_len, int type)
+{
+	if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+	return XATTR_SECURITY_PREFIX_LEN;
+}
+
+static int squashfs_security_get(struct dentry *d, const char *name,
+	void *buffer, size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;
+
+	return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= squashfs_security_list,
+	.get	= squashfs_security_get
+};
+
+static const struct xattr_handler *squashfs_xattr_handler(int type)
+{
+	if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
+		/* ignore unrecognised type */
+		return NULL;
+
+	switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
+	case SQUASHFS_XATTR_USER:
+		return &squashfs_xattr_user_handler;
+	case SQUASHFS_XATTR_TRUSTED:
+		return &squashfs_xattr_trusted_handler;
+	case SQUASHFS_XATTR_SECURITY:
+		return &squashfs_xattr_security_handler;
+	default:
+		/* ignore unrecognised type */
+		return NULL;
+	}
+}
+
+const struct xattr_handler *squashfs_xattr_handlers[] = {
+	&squashfs_xattr_user_handler,
+	&squashfs_xattr_trusted_handler,
+	&squashfs_xattr_security_handler,
+	NULL
+};
+
diff --git a/kernel/fs/squashfs/xattr.h b/kernel/fs/squashfs/xattr.h
new file mode 100644
index 000000000..c83f5d9ec
--- /dev/null
+++ b/kernel/fs/squashfs/xattr.h
@@ -0,0 +1,47 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.h
+ */
+
+#ifdef CONFIG_SQUASHFS_XATTR
+extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
+		u64 *, int *);
+extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
+		unsigned int *, unsigned long long *);
+#else
+static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
+		u64 start, u64 *xattr_table_start, int *xattr_ids)
+{
+	ERROR("Xattrs in filesystem, these will be ignored\n");
+	*xattr_table_start = start;
+	return ERR_PTR(-ENOTSUPP);
+}
+
+static inline int squashfs_xattr_lookup(struct super_block *sb,
+		unsigned int index, int *count, unsigned int *size,
+		unsigned long long *xattr)
+{
+	return 0;
+}
+#define squashfs_listxattr NULL
+#define generic_getxattr NULL
+#define squashfs_xattr_handlers NULL
+#endif
diff --git a/kernel/fs/squashfs/xattr_id.c b/kernel/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000..c89607d69
--- /dev/null
+++ b/kernel/fs/squashfs/xattr_id.c
@@ -0,0 +1,95 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr_id.c
+ */
+
+/*
+ * This file implements code to map the 32-bit xattr id stored in the inode
+ * into the on disk location of the xattr data.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "xattr.h"
+
+/*
+ * Map xattr id using the xattr id look up table
+ */
+int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
+		int *count, unsigned int *size, unsigned long long *xattr)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int block = SQUASHFS_XATTR_BLOCK(index);
+	int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
+	u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+	struct squashfs_xattr_id id;
+	int err;
+
+	err = squashfs_read_metadata(sb, &id, &start_block, &offset,
+							sizeof(id));
+	if (err < 0)
+		return err;
+
+	*xattr = le64_to_cpu(id.xattr);
+	*size = le32_to_cpu(id.size);
+	*count = le32_to_cpu(id.count);
+	return 0;
+}
+
+
+/*
+ * Read uncompressed xattr id lookup table indexes from disk into memory
+ */
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+		u64 *xattr_table_start, int *xattr_ids)
+{
+	unsigned int len;
+	struct squashfs_xattr_id_table *id_table;
+
+	id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+	if (IS_ERR(id_table))
+		return (__le64 *) id_table;
+
+	*xattr_table_start = le64_to_cpu(id_table->xattr_table_start);
+	*xattr_ids = le32_to_cpu(id_table->xattr_ids);
+	kfree(id_table);
+
+	/* Sanity check values */
+
+	/* there is always at least one xattr id */
+	if (*xattr_ids == 0)
+		return ERR_PTR(-EINVAL);
+
+	/* xattr_table should be less than start */
+	if (*xattr_table_start >= start)
+		return ERR_PTR(-EINVAL);
+
+	len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+
+	TRACE("In read_xattr_index_table, length %d\n", len);
+
+	return squashfs_read_table(sb, start + sizeof(*id_table), len);
+}
diff --git a/kernel/fs/squashfs/xz_wrapper.c b/kernel/fs/squashfs/xz_wrapper.c
new file mode 100644
index 000000000..c609624e4
--- /dev/null
+++ b/kernel/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,193 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xz_wrapper.c
+ */
+
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/xz.h>
+#include <linux/bitops.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "decompressor.h"
+#include "page_actor.h"
+
+struct squashfs_xz {
+	struct xz_dec *state;
+	struct xz_buf buf;
+};
+
+struct disk_comp_opts {
+	__le32 dictionary_size;
+	__le32 flags;
+};
+
+struct comp_opts {
+	int dict_size;
+};
+
+static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk,
+	void *buff, int len)
+{
+	struct disk_comp_opts *comp_opts = buff;
+	struct comp_opts *opts;
+	int err = 0, n;
+
+	opts = kmalloc(sizeof(*opts), GFP_KERNEL);
+	if (opts == NULL) {
+		err = -ENOMEM;
+		goto out2;
+	}
+
+	if (comp_opts) {
+		/* check compressor options are the expected length */
+		if (len < sizeof(*comp_opts)) {
+			err = -EIO;
+			goto out;
+		}
+
+		opts->dict_size = le32_to_cpu(comp_opts->dictionary_size);
+
+		/* the dictionary size should be 2^n or 2^n+2^(n+1) */
+		n = ffs(opts->dict_size) - 1;
+		if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) +
+						(1 << (n + 1))) {
+			err = -EIO;
+			goto out;
+		}
+	} else
+		/* use defaults */
+		opts->dict_size = max_t(int, msblk->block_size,
+							SQUASHFS_METADATA_SIZE);
+
+	return opts;
+
+out:
+	kfree(opts);
+out2:
+	return ERR_PTR(err);
+}
+
+
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff)
+{
+	struct comp_opts *comp_opts = buff;
+	struct squashfs_xz *stream;
+	int err;
+
+	stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size);
+	if (stream->state == NULL) {
+		kfree(stream);
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	return stream;
+
+failed:
+	ERROR("Failed to initialise xz decompressor\n");
+	return ERR_PTR(err);
+}
+
+
+static void squashfs_xz_free(void *strm)
+{
+	struct squashfs_xz *stream = strm;
+
+	if (stream) {
+		xz_dec_end(stream->state);
+		kfree(stream);
+	}
+}
+
+
+static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
+	struct buffer_head **bh, int b, int offset, int length,
+	struct squashfs_page_actor *output)
+{
+	enum xz_ret xz_err;
+	int avail, total = 0, k = 0;
+	struct squashfs_xz *stream = strm;
+
+	xz_dec_reset(stream->state);
+	stream->buf.in_pos = 0;
+	stream->buf.in_size = 0;
+	stream->buf.out_pos = 0;
+	stream->buf.out_size = PAGE_CACHE_SIZE;
+	stream->buf.out = squashfs_first_page(output);
+
+	do {
+		if (stream->buf.in_pos == stream->buf.in_size && k < b) {
+			avail = min(length, msblk->devblksize - offset);
+			length -= avail;
+			stream->buf.in = bh[k]->b_data + offset;
+			stream->buf.in_size = avail;
+			stream->buf.in_pos = 0;
+			offset = 0;
+		}
+
+		if (stream->buf.out_pos == stream->buf.out_size) {
+			stream->buf.out = squashfs_next_page(output);
+			if (stream->buf.out != NULL) {
+				stream->buf.out_pos = 0;
+				total += PAGE_CACHE_SIZE;
+			}
+		}
+
+		xz_err = xz_dec_run(stream->state, &stream->buf);
+
+		if (stream->buf.in_pos == stream->buf.in_size && k < b)
+			put_bh(bh[k++]);
+	} while (xz_err == XZ_OK);
+
+	squashfs_finish_page(output);
+
+	if (xz_err != XZ_STREAM_END || k < b)
+		goto out;
+
+	return total + stream->buf.out_pos;
+
+out:
+	for (; k < b; k++)
+		put_bh(bh[k]);
+
+	return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_xz_comp_ops = {
+	.init = squashfs_xz_init,
+	.comp_opts = squashfs_xz_comp_opts,
+	.free = squashfs_xz_free,
+	.decompress = squashfs_xz_uncompress,
+	.id = XZ_COMPRESSION,
+	.name = "xz",
+	.supported = 1
+};
diff --git a/kernel/fs/squashfs/zlib_wrapper.c b/kernel/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000..8727caba6
--- /dev/null
+++ b/kernel/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,135 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * zlib_wrapper.c
+ */
+
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/vmalloc.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "decompressor.h"
+#include "page_actor.h"
+
+static void *zlib_init(struct squashfs_sb_info *dummy, void *buff)
+{
+	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto failed;
+	stream->workspace = vmalloc(zlib_inflate_workspacesize());
+	if (stream->workspace == NULL)
+		goto failed;
+
+	return stream;
+
+failed:
+	ERROR("Failed to allocate zlib workspace\n");
+	kfree(stream);
+	return ERR_PTR(-ENOMEM);
+}
+
+
+static void zlib_free(void *strm)
+{
+	z_stream *stream = strm;
+
+	if (stream)
+		vfree(stream->workspace);
+	kfree(stream);
+}
+
+
+static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
+	struct buffer_head **bh, int b, int offset, int length,
+	struct squashfs_page_actor *output)
+{
+	int zlib_err, zlib_init = 0, k = 0;
+	z_stream *stream = strm;
+
+	stream->avail_out = PAGE_CACHE_SIZE;
+	stream->next_out = squashfs_first_page(output);
+	stream->avail_in = 0;
+
+	do {
+		if (stream->avail_in == 0 && k < b) {
+			int avail = min(length, msblk->devblksize - offset);
+			length -= avail;
+			stream->next_in = bh[k]->b_data + offset;
+			stream->avail_in = avail;
+			offset = 0;
+		}
+
+		if (stream->avail_out == 0) {
+			stream->next_out = squashfs_next_page(output);
+			if (stream->next_out != NULL)
+				stream->avail_out = PAGE_CACHE_SIZE;
+		}
+
+		if (!zlib_init) {
+			zlib_err = zlib_inflateInit(stream);
+			if (zlib_err != Z_OK) {
+				squashfs_finish_page(output);
+				goto out;
+			}
+			zlib_init = 1;
+		}
+
+		zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
+
+		if (stream->avail_in == 0 && k < b)
+			put_bh(bh[k++]);
+	} while (zlib_err == Z_OK);
+
+	squashfs_finish_page(output);
+
+	if (zlib_err != Z_STREAM_END)
+		goto out;
+
+	zlib_err = zlib_inflateEnd(stream);
+	if (zlib_err != Z_OK)
+		goto out;
+
+	if (k < b)
+		goto out;
+
+	return stream->total_out;
+
+out:
+	for (; k < b; k++)
+		put_bh(bh[k]);
+
+	return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_zlib_comp_ops = {
+	.init = zlib_init,
+	.free = zlib_free,
+	.decompress = zlib_uncompress,
+	.id = ZLIB_COMPRESSION,
+	.name = "zlib",
+	.supported = 1
+};
+
diff --git a/kernel/fs/stack.c b/kernel/fs/stack.c
new file mode 100644
index 000000000..a54e33ed1
--- /dev/null
+++ b/kernel/fs/stack.c
@@ -0,0 +1,76 @@
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/fs_stack.h>
+
+/* does _NOT_ require i_mutex to be held.
+ *
+ * This function cannot be inlined since i_size_{read,write} is rather
+ * heavy-weight on 32-bit systems
+ */
+void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
+{
+	loff_t i_size;
+	blkcnt_t i_blocks;
+
+	/*
+	 * i_size_read() includes its own seqlocking and protection from
+	 * preemption (see include/linux/fs.h): we need nothing extra for
+	 * that here, and prefer to avoid nesting locks than attempt to keep
+	 * i_size and i_blocks in sync together.
+	 */
+	i_size = i_size_read(src);
+
+	/*
+	 * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to
+	 * keep the two halves of i_blocks in sync despite SMP or PREEMPT -
+	 * though stat's generic_fillattr() doesn't bother, and we won't be
+	 * applying quotas (where i_blocks does become important) at the
+	 * upper level.
+	 *
+	 * We don't actually know what locking is used at the lower level;
+	 * but if it's a filesystem that supports quotas, it will be using
+	 * i_lock as in inode_add_bytes().
+	 */
+	if (sizeof(i_blocks) > sizeof(long))
+		spin_lock(&src->i_lock);
+	i_blocks = src->i_blocks;
+	if (sizeof(i_blocks) > sizeof(long))
+		spin_unlock(&src->i_lock);
+
+	/*
+	 * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for
+	 * fsstack_copy_inode_size() to hold some lock around
+	 * i_size_write(), otherwise i_size_read() may spin forever (see
+	 * include/linux/fs.h).  We don't necessarily hold i_mutex when this
+	 * is called, so take i_lock for that case.
+	 *
+	 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
+	 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
+	 * for that case too, and do both at once by combining the tests.
+	 *
+	 * There is none of this locking overhead in the 64-bit case.
+	 */
+	if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long))
+		spin_lock(&dst->i_lock);
+	i_size_write(dst, i_size);
+	dst->i_blocks = i_blocks;
+	if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long))
+		spin_unlock(&dst->i_lock);
+}
+EXPORT_SYMBOL_GPL(fsstack_copy_inode_size);
+
+/* copy all attributes */
+void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
+{
+	dest->i_mode = src->i_mode;
+	dest->i_uid = src->i_uid;
+	dest->i_gid = src->i_gid;
+	dest->i_rdev = src->i_rdev;
+	dest->i_atime = src->i_atime;
+	dest->i_mtime = src->i_mtime;
+	dest->i_ctime = src->i_ctime;
+	dest->i_blkbits = src->i_blkbits;
+	dest->i_flags = src->i_flags;
+	set_nlink(dest, src->i_nlink);
+}
+EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
diff --git a/kernel/fs/stat.c b/kernel/fs/stat.c
new file mode 100644
index 000000000..cccc1aab9
--- /dev/null
+++ b/kernel/fs/stat.c
@@ -0,0 +1,511 @@
+/*
+ *  linux/fs/stat.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/highuid.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+void generic_fillattr(struct inode *inode, struct kstat *stat)
+{
+	stat->dev = inode->i_sb->s_dev;
+	stat->ino = inode->i_ino;
+	stat->mode = inode->i_mode;
+	stat->nlink = inode->i_nlink;
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
+	stat->rdev = inode->i_rdev;
+	stat->size = i_size_read(inode);
+	stat->atime = inode->i_atime;
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode->i_ctime;
+	stat->blksize = (1 << inode->i_blkbits);
+	stat->blocks = inode->i_blocks;
+}
+
+EXPORT_SYMBOL(generic_fillattr);
+
+/**
+ * vfs_getattr_nosec - getattr without security checks
+ * @path: file to get attributes from
+ * @stat: structure to return attributes in
+ *
+ * Get attributes without calling security_inode_getattr.
+ *
+ * Currently the only caller other than vfs_getattr is internal to the
+ * filehandle lookup code, which uses only the inode number and returns
+ * no attributes to any user.  Any other code probably wants
+ * vfs_getattr.
+ */
+int vfs_getattr_nosec(struct path *path, struct kstat *stat)
+{
+	struct inode *inode = d_backing_inode(path->dentry);
+
+	if (inode->i_op->getattr)
+		return inode->i_op->getattr(path->mnt, path->dentry, stat);
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+EXPORT_SYMBOL(vfs_getattr_nosec);
+
+int vfs_getattr(struct path *path, struct kstat *stat)
+{
+	int retval;
+
+	retval = security_inode_getattr(path);
+	if (retval)
+		return retval;
+	return vfs_getattr_nosec(path, stat);
+}
+
+EXPORT_SYMBOL(vfs_getattr);
+
+int vfs_fstat(unsigned int fd, struct kstat *stat)
+{
+	struct fd f = fdget_raw(fd);
+	int error = -EBADF;
+
+	if (f.file) {
+		error = vfs_getattr(&f.file->f_path, stat);
+		fdput(f);
+	}
+	return error;
+}
+EXPORT_SYMBOL(vfs_fstat);
+
+int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
+		int flag)
+{
+	struct path path;
+	int error = -EINVAL;
+	unsigned int lookup_flags = 0;
+
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+		      AT_EMPTY_PATH)) != 0)
+		goto out;
+
+	if (!(flag & AT_SYMLINK_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+retry:
+	error = user_path_at(dfd, filename, lookup_flags, &path);
+	if (error)
+		goto out;
+
+	error = vfs_getattr(&path, stat);
+	path_put(&path);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+out:
+	return error;
+}
+EXPORT_SYMBOL(vfs_fstatat);
+
+int vfs_stat(const char __user *name, struct kstat *stat)
+{
+	return vfs_fstatat(AT_FDCWD, name, stat, 0);
+}
+EXPORT_SYMBOL(vfs_stat);
+
+int vfs_lstat(const char __user *name, struct kstat *stat)
+{
+	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
+}
+EXPORT_SYMBOL(vfs_lstat);
+
+
+#ifdef __ARCH_WANT_OLD_STAT
+
+/*
+ * For backward compatibility?  Maybe this should be moved
+ * into arch/i386 instead?
+ */
+static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf)
+{
+	static int warncount = 5;
+	struct __old_kernel_stat tmp;
+	
+	if (warncount > 0) {
+		warncount--;
+		printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n",
+			current->comm);
+	} else if (warncount < 0) {
+		/* it's laughable, but... */
+		warncount = 0;
+	}
+
+	memset(&tmp, 0, sizeof(struct __old_kernel_stat));
+	tmp.st_dev = old_encode_dev(stat->dev);
+	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+		return -EOVERFLOW;
+	tmp.st_mode = stat->mode;
+	tmp.st_nlink = stat->nlink;
+	if (tmp.st_nlink != stat->nlink)
+		return -EOVERFLOW;
+	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+	tmp.st_rdev = old_encode_dev(stat->rdev);
+#if BITS_PER_LONG == 32
+	if (stat->size > MAX_NON_LFS)
+		return -EOVERFLOW;
+#endif	
+	tmp.st_size = stat->size;
+	tmp.st_atime = stat->atime.tv_sec;
+	tmp.st_mtime = stat->mtime.tv_sec;
+	tmp.st_ctime = stat->ctime.tv_sec;
+	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
+}
+
+SYSCALL_DEFINE2(stat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_stat(filename, &stat);
+	if (error)
+		return error;
+
+	return cp_old_stat(&stat, statbuf);
+}
+
+SYSCALL_DEFINE2(lstat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
+
+	return cp_old_stat(&stat, statbuf);
+}
+
+SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error = vfs_fstat(fd, &stat);
+
+	if (!error)
+		error = cp_old_stat(&stat, statbuf);
+
+	return error;
+}
+
+#endif /* __ARCH_WANT_OLD_STAT */
+
+#if BITS_PER_LONG == 32
+#  define choose_32_64(a,b) a
+#else
+#  define choose_32_64(a,b) b
+#endif
+
+#define valid_dev(x)  choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
+
+#ifndef INIT_STRUCT_STAT_PADDING
+#  define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
+#endif
+
+static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
+{
+	struct stat tmp;
+
+	if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
+		return -EOVERFLOW;
+#if BITS_PER_LONG == 32
+	if (stat->size > MAX_NON_LFS)
+		return -EOVERFLOW;
+#endif
+
+	INIT_STRUCT_STAT_PADDING(tmp);
+	tmp.st_dev = encode_dev(stat->dev);
+	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+		return -EOVERFLOW;
+	tmp.st_mode = stat->mode;
+	tmp.st_nlink = stat->nlink;
+	if (tmp.st_nlink != stat->nlink)
+		return -EOVERFLOW;
+	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+	tmp.st_rdev = encode_dev(stat->rdev);
+	tmp.st_size = stat->size;
+	tmp.st_atime = stat->atime.tv_sec;
+	tmp.st_mtime = stat->mtime.tv_sec;
+	tmp.st_ctime = stat->ctime.tv_sec;
+#ifdef STAT_HAVE_NSEC
+	tmp.st_atime_nsec = stat->atime.tv_nsec;
+	tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+	tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+#endif
+	tmp.st_blocks = stat->blocks;
+	tmp.st_blksize = stat->blksize;
+	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
+}
+
+SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		struct stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error = vfs_stat(filename, &stat);
+
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
+}
+
+SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		struct stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
+
+	return cp_new_stat(&stat, statbuf);
+}
+
+#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
+SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
+		struct stat __user *, statbuf, int, flag)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
+}
+#endif
+
+SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error = vfs_fstat(fd, &stat);
+
+	if (!error)
+		error = cp_new_stat(&stat, statbuf);
+
+	return error;
+}
+
+SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
+		char __user *, buf, int, bufsiz)
+{
+	struct path path;
+	int error;
+	int empty = 0;
+	unsigned int lookup_flags = LOOKUP_EMPTY;
+
+	if (bufsiz <= 0)
+		return -EINVAL;
+
+retry:
+	error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
+	if (!error) {
+		struct inode *inode = d_backing_inode(path.dentry);
+
+		error = empty ? -ENOENT : -EINVAL;
+		if (inode->i_op->readlink) {
+			error = security_inode_readlink(path.dentry);
+			if (!error) {
+				touch_atime(&path);
+				error = inode->i_op->readlink(path.dentry,
+							      buf, bufsiz);
+			}
+		}
+		path_put(&path);
+		if (retry_estale(error, lookup_flags)) {
+			lookup_flags |= LOOKUP_REVAL;
+			goto retry;
+		}
+	}
+	return error;
+}
+
+SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
+		int, bufsiz)
+{
+	return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
+}
+
+
+/* ---------- LFS-64 ----------- */
+#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
+
+#ifndef INIT_STRUCT_STAT64_PADDING
+#  define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st))
+#endif
+
+static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
+{
+	struct stat64 tmp;
+
+	INIT_STRUCT_STAT64_PADDING(tmp);
+#ifdef CONFIG_MIPS
+	/* mips has weird padding, so we don't get 64 bits there */
+	if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
+		return -EOVERFLOW;
+	tmp.st_dev = new_encode_dev(stat->dev);
+	tmp.st_rdev = new_encode_dev(stat->rdev);
+#else
+	tmp.st_dev = huge_encode_dev(stat->dev);
+	tmp.st_rdev = huge_encode_dev(stat->rdev);
+#endif
+	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+		return -EOVERFLOW;
+#ifdef STAT64_HAS_BROKEN_ST_INO
+	tmp.__st_ino = stat->ino;
+#endif
+	tmp.st_mode = stat->mode;
+	tmp.st_nlink = stat->nlink;
+	tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
+	tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid);
+	tmp.st_atime = stat->atime.tv_sec;
+	tmp.st_atime_nsec = stat->atime.tv_nsec;
+	tmp.st_mtime = stat->mtime.tv_sec;
+	tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+	tmp.st_ctime = stat->ctime.tv_sec;
+	tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+	tmp.st_size = stat->size;
+	tmp.st_blocks = stat->blocks;
+	tmp.st_blksize = stat->blksize;
+	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
+}
+
+SYSCALL_DEFINE2(stat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
+{
+	struct kstat stat;
+	int error = vfs_stat(filename, &stat);
+
+	if (!error)
+		error = cp_new_stat64(&stat, statbuf);
+
+	return error;
+}
+
+SYSCALL_DEFINE2(lstat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
+{
+	struct kstat stat;
+	int error = vfs_lstat(filename, &stat);
+
+	if (!error)
+		error = cp_new_stat64(&stat, statbuf);
+
+	return error;
+}
+
+SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
+{
+	struct kstat stat;
+	int error = vfs_fstat(fd, &stat);
+
+	if (!error)
+		error = cp_new_stat64(&stat, statbuf);
+
+	return error;
+}
+
+SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
+		struct stat64 __user *, statbuf, int, flag)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat64(&stat, statbuf);
+}
+#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
+
+/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
+void __inode_add_bytes(struct inode *inode, loff_t bytes)
+{
+	inode->i_blocks += bytes >> 9;
+	bytes &= 511;
+	inode->i_bytes += bytes;
+	if (inode->i_bytes >= 512) {
+		inode->i_blocks++;
+		inode->i_bytes -= 512;
+	}
+}
+
+void inode_add_bytes(struct inode *inode, loff_t bytes)
+{
+	spin_lock(&inode->i_lock);
+	__inode_add_bytes(inode, bytes);
+	spin_unlock(&inode->i_lock);
+}
+
+EXPORT_SYMBOL(inode_add_bytes);
+
+void __inode_sub_bytes(struct inode *inode, loff_t bytes)
+{
+	inode->i_blocks -= bytes >> 9;
+	bytes &= 511;
+	if (inode->i_bytes < bytes) {
+		inode->i_blocks--;
+		inode->i_bytes += 512;
+	}
+	inode->i_bytes -= bytes;
+}
+
+EXPORT_SYMBOL(__inode_sub_bytes);
+
+void inode_sub_bytes(struct inode *inode, loff_t bytes)
+{
+	spin_lock(&inode->i_lock);
+	__inode_sub_bytes(inode, bytes);
+	spin_unlock(&inode->i_lock);
+}
+
+EXPORT_SYMBOL(inode_sub_bytes);
+
+loff_t inode_get_bytes(struct inode *inode)
+{
+	loff_t ret;
+
+	spin_lock(&inode->i_lock);
+	ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+EXPORT_SYMBOL(inode_get_bytes);
+
+void inode_set_bytes(struct inode *inode, loff_t bytes)
+{
+	/* Caller is here responsible for sufficient locking
+	 * (ie. inode->i_lock) */
+	inode->i_blocks = bytes >> 9;
+	inode->i_bytes = bytes & 511;
+}
+
+EXPORT_SYMBOL(inode_set_bytes);
diff --git a/kernel/fs/statfs.c b/kernel/fs/statfs.c
new file mode 100644
index 000000000..083dc0ac9
--- /dev/null
+++ b/kernel/fs/statfs.c
@@ -0,0 +1,241 @@
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+#include "internal.h"
+
+static int flags_by_mnt(int mnt_flags)
+{
+	int flags = 0;
+
+	if (mnt_flags & MNT_READONLY)
+		flags |= ST_RDONLY;
+	if (mnt_flags & MNT_NOSUID)
+		flags |= ST_NOSUID;
+	if (mnt_flags & MNT_NODEV)
+		flags |= ST_NODEV;
+	if (mnt_flags & MNT_NOEXEC)
+		flags |= ST_NOEXEC;
+	if (mnt_flags & MNT_NOATIME)
+		flags |= ST_NOATIME;
+	if (mnt_flags & MNT_NODIRATIME)
+		flags |= ST_NODIRATIME;
+	if (mnt_flags & MNT_RELATIME)
+		flags |= ST_RELATIME;
+	return flags;
+}
+
+static int flags_by_sb(int s_flags)
+{
+	int flags = 0;
+	if (s_flags & MS_SYNCHRONOUS)
+		flags |= ST_SYNCHRONOUS;
+	if (s_flags & MS_MANDLOCK)
+		flags |= ST_MANDLOCK;
+	return flags;
+}
+
+static int calculate_f_flags(struct vfsmount *mnt)
+{
+	return ST_VALID | flags_by_mnt(mnt->mnt_flags) |
+		flags_by_sb(mnt->mnt_sb->s_flags);
+}
+
+static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval;
+
+	if (!dentry->d_sb->s_op->statfs)
+		return -ENOSYS;
+
+	memset(buf, 0, sizeof(*buf));
+	retval = security_sb_statfs(dentry);
+	if (retval)
+		return retval;
+	retval = dentry->d_sb->s_op->statfs(dentry, buf);
+	if (retval == 0 && buf->f_frsize == 0)
+		buf->f_frsize = buf->f_bsize;
+	return retval;
+}
+
+int vfs_statfs(struct path *path, struct kstatfs *buf)
+{
+	int error;
+
+	error = statfs_by_dentry(path->dentry, buf);
+	if (!error)
+		buf->f_flags = calculate_f_flags(path->mnt);
+	return error;
+}
+EXPORT_SYMBOL(vfs_statfs);
+
+int user_statfs(const char __user *pathname, struct kstatfs *st)
+{
+	struct path path;
+	int error;
+	unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT;
+retry:
+	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+	if (!error) {
+		error = vfs_statfs(&path, st);
+		path_put(&path);
+		if (retry_estale(error, lookup_flags)) {
+			lookup_flags |= LOOKUP_REVAL;
+			goto retry;
+		}
+	}
+	return error;
+}
+
+int fd_statfs(int fd, struct kstatfs *st)
+{
+	struct fd f = fdget_raw(fd);
+	int error = -EBADF;
+	if (f.file) {
+		error = vfs_statfs(&f.file->f_path, st);
+		fdput(f);
+	}
+	return error;
+}
+
+static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
+{
+	struct statfs buf;
+
+	if (sizeof(buf) == sizeof(*st))
+		memcpy(&buf, st, sizeof(*st));
+	else {
+		if (sizeof buf.f_blocks == 4) {
+			if ((st->f_blocks | st->f_bfree | st->f_bavail |
+			     st->f_bsize | st->f_frsize) &
+			    0xffffffff00000000ULL)
+				return -EOVERFLOW;
+			/*
+			 * f_files and f_ffree may be -1; it's okay to stuff
+			 * that into 32 bits
+			 */
+			if (st->f_files != -1 &&
+			    (st->f_files & 0xffffffff00000000ULL))
+				return -EOVERFLOW;
+			if (st->f_ffree != -1 &&
+			    (st->f_ffree & 0xffffffff00000000ULL))
+				return -EOVERFLOW;
+		}
+
+		buf.f_type = st->f_type;
+		buf.f_bsize = st->f_bsize;
+		buf.f_blocks = st->f_blocks;
+		buf.f_bfree = st->f_bfree;
+		buf.f_bavail = st->f_bavail;
+		buf.f_files = st->f_files;
+		buf.f_ffree = st->f_ffree;
+		buf.f_fsid = st->f_fsid;
+		buf.f_namelen = st->f_namelen;
+		buf.f_frsize = st->f_frsize;
+		buf.f_flags = st->f_flags;
+		memset(buf.f_spare, 0, sizeof(buf.f_spare));
+	}
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
+	return 0;
+}
+
+static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
+{
+	struct statfs64 buf;
+	if (sizeof(buf) == sizeof(*st))
+		memcpy(&buf, st, sizeof(*st));
+	else {
+		buf.f_type = st->f_type;
+		buf.f_bsize = st->f_bsize;
+		buf.f_blocks = st->f_blocks;
+		buf.f_bfree = st->f_bfree;
+		buf.f_bavail = st->f_bavail;
+		buf.f_files = st->f_files;
+		buf.f_ffree = st->f_ffree;
+		buf.f_fsid = st->f_fsid;
+		buf.f_namelen = st->f_namelen;
+		buf.f_frsize = st->f_frsize;
+		buf.f_flags = st->f_flags;
+		memset(buf.f_spare, 0, sizeof(buf.f_spare));
+	}
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
+	return 0;
+}
+
+SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
+{
+	struct kstatfs st;
+	int error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
+	return error;
+}
+
+SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
+{
+	struct kstatfs st;
+	int error;
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+	error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
+	return error;
+}
+
+SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
+{
+	struct kstatfs st;
+	int error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
+	return error;
+}
+
+SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
+{
+	struct kstatfs st;
+	int error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+
+	error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
+	return error;
+}
+
+int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
+{
+	struct super_block *s = user_get_super(dev);
+	int err;
+	if (!s)
+		return -EINVAL;
+
+	err = statfs_by_dentry(s->s_root, sbuf);
+	drop_super(s);
+	return err;
+}
+
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
+{
+	struct ustat tmp;
+	struct kstatfs sbuf;
+	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	if (err)
+		return err;
+
+	memset(&tmp,0,sizeof(struct ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+
+	return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
+}
diff --git a/kernel/fs/super.c b/kernel/fs/super.c
new file mode 100644
index 000000000..928c20f47
--- /dev/null
+++ b/kernel/fs/super.c
@@ -0,0 +1,1396 @@
+/*
+ *  linux/fs/super.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  super.c contains code to handle: - mount structures
+ *                                   - super-block tables
+ *                                   - filesystem drivers list
+ *                                   - mount system call
+ *                                   - umount system call
+ *                                   - ustat system call
+ *
+ * GK 2/5/95  -  Changed to support mounting the root fs via NFS
+ *
+ *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
+ *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
+ *  Added options to /proc/mounts:
+ *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
+ *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
+ *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/writeback.h>		/* for the emergency remount stuff */
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/backing-dev.h>
+#include <linux/rculist_bl.h>
+#include <linux/cleancache.h>
+#include <linux/fsnotify.h>
+#include <linux/lockdep.h>
+#include "internal.h"
+
+
+static LIST_HEAD(super_blocks);
+static DEFINE_SPINLOCK(sb_lock);
+
+static char *sb_writers_name[SB_FREEZE_LEVELS] = {
+	"sb_writers",
+	"sb_pagefaults",
+	"sb_internal",
+};
+
+/*
+ * One thing we have to be careful of with a per-sb shrinker is that we don't
+ * drop the last active reference to the superblock from within the shrinker.
+ * If that happens we could trigger unregistering the shrinker from within the
+ * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * take a passive reference to the superblock to avoid this from occurring.
+ */
+static unsigned long super_cache_scan(struct shrinker *shrink,
+				      struct shrink_control *sc)
+{
+	struct super_block *sb;
+	long	fs_objects = 0;
+	long	total_objects;
+	long	freed = 0;
+	long	dentries;
+	long	inodes;
+
+	sb = container_of(shrink, struct super_block, s_shrink);
+
+	/*
+	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
+	 * to recurse into the FS that called us in clear_inode() and friends..
+	 */
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+
+	if (!trylock_super(sb))
+		return SHRINK_STOP;
+
+	if (sb->s_op->nr_cached_objects)
+		fs_objects = sb->s_op->nr_cached_objects(sb, sc);
+
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects = dentries + inodes + fs_objects + 1;
+	if (!total_objects)
+		total_objects = 1;
+
+	/* proportion the scan between the caches */
+	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
+	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
+
+	/*
+	 * prune the dcache first as the icache is pinned by it, then
+	 * prune the icache, followed by the filesystem specific caches
+	 *
+	 * Ensure that we always scan at least one object - memcg kmem
+	 * accounting uses this to fully empty the caches.
+	 */
+	sc->nr_to_scan = dentries + 1;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes + 1;
+	freed += prune_icache_sb(sb, sc);
+
+	if (fs_objects) {
+		sc->nr_to_scan = fs_objects + 1;
+		freed += sb->s_op->free_cached_objects(sb, sc);
+	}
+
+	up_read(&sb->s_umount);
+	return freed;
+}
+
+static unsigned long super_cache_count(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct super_block *sb;
+	long	total_objects = 0;
+
+	sb = container_of(shrink, struct super_block, s_shrink);
+
+	/*
+	 * Don't call trylock_super as it is a potential
+	 * scalability bottleneck. The counts could get updated
+	 * between super_cache_count and super_cache_scan anyway.
+	 * Call to super_cache_count with shrinker_rwsem held
+	 * ensures the safety of call to list_lru_shrink_count() and
+	 * s_op->nr_cached_objects().
+	 */
+	if (sb->s_op && sb->s_op->nr_cached_objects)
+		total_objects = sb->s_op->nr_cached_objects(sb, sc);
+
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
+
+	total_objects = vfs_pressure_ratio(total_objects);
+	return total_objects;
+}
+
+/**
+ *	destroy_super	-	frees a superblock
+ *	@s: superblock to free
+ *
+ *	Frees a superblock.
+ */
+static void destroy_super(struct super_block *s)
+{
+	int i;
+	list_lru_destroy(&s->s_dentry_lru);
+	list_lru_destroy(&s->s_inode_lru);
+	for (i = 0; i < SB_FREEZE_LEVELS; i++)
+		percpu_counter_destroy(&s->s_writers.counter[i]);
+	security_sb_free(s);
+	WARN_ON(!list_empty(&s->s_mounts));
+	kfree(s->s_subtype);
+	kfree(s->s_options);
+	kfree_rcu(s, rcu);
+}
+
+/**
+ *	alloc_super	-	create new superblock
+ *	@type:	filesystem type superblock should belong to
+ *	@flags: the mount flags
+ *
+ *	Allocates and initializes a new &struct super_block.  alloc_super()
+ *	returns a pointer new superblock or %NULL if allocation had failed.
+ */
+static struct super_block *alloc_super(struct file_system_type *type, int flags)
+{
+	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
+	static const struct super_operations default_op;
+	int i;
+
+	if (!s)
+		return NULL;
+
+	INIT_LIST_HEAD(&s->s_mounts);
+
+	if (security_sb_alloc(s))
+		goto fail;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+		if (percpu_counter_init(&s->s_writers.counter[i], 0,
+					GFP_KERNEL) < 0)
+			goto fail;
+		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
+				 &type->s_writers_key[i], 0);
+	}
+	init_waitqueue_head(&s->s_writers.wait);
+	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	s->s_bdi = &noop_backing_dev_info;
+	s->s_flags = flags;
+	INIT_HLIST_NODE(&s->s_instances);
+	INIT_HLIST_BL_HEAD(&s->s_anon);
+	INIT_LIST_HEAD(&s->s_inodes);
+
+	if (list_lru_init_memcg(&s->s_dentry_lru))
+		goto fail;
+	if (list_lru_init_memcg(&s->s_inode_lru))
+		goto fail;
+
+	init_rwsem(&s->s_umount);
+	lockdep_set_class(&s->s_umount, &type->s_umount_key);
+	/*
+	 * sget() can have s_umount recursion.
+	 *
+	 * When it cannot find a suitable sb, it allocates a new
+	 * one (this one), and tries again to find a suitable old
+	 * one.
+	 *
+	 * In case that succeeds, it will acquire the s_umount
+	 * lock of the old one. Since these are clearly distrinct
+	 * locks, and this object isn't exposed yet, there's no
+	 * risk of deadlocks.
+	 *
+	 * Annotate this by putting this lock in a different
+	 * subclass.
+	 */
+	down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
+	s->s_count = 1;
+	atomic_set(&s->s_active, 1);
+	mutex_init(&s->s_vfs_rename_mutex);
+	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
+	mutex_init(&s->s_dquot.dqio_mutex);
+	mutex_init(&s->s_dquot.dqonoff_mutex);
+	s->s_maxbytes = MAX_NON_LFS;
+	s->s_op = &default_op;
+	s->s_time_gran = 1000000000;
+	s->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+	s->s_shrink.seeks = DEFAULT_SEEKS;
+	s->s_shrink.scan_objects = super_cache_scan;
+	s->s_shrink.count_objects = super_cache_count;
+	s->s_shrink.batch = 1024;
+	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
+	return s;
+
+fail:
+	destroy_super(s);
+	return NULL;
+}
+
+/* Superblock refcounting  */
+
+/*
+ * Drop a superblock's refcount.  The caller must hold sb_lock.
+ */
+static void __put_super(struct super_block *sb)
+{
+	if (!--sb->s_count) {
+		list_del_init(&sb->s_list);
+		destroy_super(sb);
+	}
+}
+
+/**
+ *	put_super	-	drop a temporary reference to superblock
+ *	@sb: superblock in question
+ *
+ *	Drops a temporary reference, frees superblock if there's no
+ *	references left.
+ */
+static void put_super(struct super_block *sb)
+{
+	spin_lock(&sb_lock);
+	__put_super(sb);
+	spin_unlock(&sb_lock);
+}
+
+
+/**
+ *	deactivate_locked_super	-	drop an active reference to superblock
+ *	@s: superblock to deactivate
+ *
+ *	Drops an active reference to superblock, converting it into a temprory
+ *	one if there is no other active references left.  In that case we
+ *	tell fs driver to shut it down and drop the temporary reference we
+ *	had just acquired.
+ *
+ *	Caller holds exclusive lock on superblock; that lock is released.
+ */
+void deactivate_locked_super(struct super_block *s)
+{
+	struct file_system_type *fs = s->s_type;
+	if (atomic_dec_and_test(&s->s_active)) {
+		cleancache_invalidate_fs(s);
+		unregister_shrinker(&s->s_shrink);
+		fs->kill_sb(s);
+
+		/*
+		 * Since list_lru_destroy() may sleep, we cannot call it from
+		 * put_super(), where we hold the sb_lock. Therefore we destroy
+		 * the lru lists right now.
+		 */
+		list_lru_destroy(&s->s_dentry_lru);
+		list_lru_destroy(&s->s_inode_lru);
+
+		put_filesystem(fs);
+		put_super(s);
+	} else {
+		up_write(&s->s_umount);
+	}
+}
+
+EXPORT_SYMBOL(deactivate_locked_super);
+
+/**
+ *	deactivate_super	-	drop an active reference to superblock
+ *	@s: superblock to deactivate
+ *
+ *	Variant of deactivate_locked_super(), except that superblock is *not*
+ *	locked by caller.  If we are going to drop the final active reference,
+ *	lock will be acquired prior to that.
+ */
+void deactivate_super(struct super_block *s)
+{
+        if (!atomic_add_unless(&s->s_active, -1, 1)) {
+		down_write(&s->s_umount);
+		deactivate_locked_super(s);
+	}
+}
+
+EXPORT_SYMBOL(deactivate_super);
+
+/**
+ *	grab_super - acquire an active reference
+ *	@s: reference we are trying to make active
+ *
+ *	Tries to acquire an active reference.  grab_super() is used when we
+ * 	had just found a superblock in super_blocks or fs_type->fs_supers
+ *	and want to turn it into a full-blown active reference.  grab_super()
+ *	is called with sb_lock held and drops it.  Returns 1 in case of
+ *	success, 0 if we had failed (superblock contents was already dead or
+ *	dying when grab_super() had been called).  Note that this is only
+ *	called for superblocks not in rundown mode (== ones still on ->fs_supers
+ *	of their type), so increment of ->s_count is OK here.
+ */
+static int grab_super(struct super_block *s) __releases(sb_lock)
+{
+	s->s_count++;
+	spin_unlock(&sb_lock);
+	down_write(&s->s_umount);
+	if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
+		put_super(s);
+		return 1;
+	}
+	up_write(&s->s_umount);
+	put_super(s);
+	return 0;
+}
+
+/*
+ *	trylock_super - try to grab ->s_umount shared
+ *	@sb: reference we are trying to grab
+ *
+ *	Try to prevent fs shutdown.  This is used in places where we
+ *	cannot take an active reference but we need to ensure that the
+ *	filesystem is not shut down while we are working on it. It returns
+ *	false if we cannot acquire s_umount or if we lose the race and
+ *	filesystem already got into shutdown, and returns true with the s_umount
+ *	lock held in read mode in case of success. On successful return,
+ *	the caller must drop the s_umount lock when done.
+ *
+ *	Note that unlike get_super() et.al. this one does *not* bump ->s_count.
+ *	The reason why it's safe is that we are OK with doing trylock instead
+ *	of down_read().  There's a couple of places that are OK with that, but
+ *	it's very much not a general-purpose interface.
+ */
+bool trylock_super(struct super_block *sb)
+{
+	if (down_read_trylock(&sb->s_umount)) {
+		if (!hlist_unhashed(&sb->s_instances) &&
+		    sb->s_root && (sb->s_flags & MS_BORN))
+			return true;
+		up_read(&sb->s_umount);
+	}
+
+	return false;
+}
+
+/**
+ *	generic_shutdown_super	-	common helper for ->kill_sb()
+ *	@sb: superblock to kill
+ *
+ *	generic_shutdown_super() does all fs-independent work on superblock
+ *	shutdown.  Typical ->kill_sb() should pick all fs-specific objects
+ *	that need destruction out of superblock, call generic_shutdown_super()
+ *	and release aforementioned objects.  Note: dentries and inodes _are_
+ *	taken care of and do not need specific handling.
+ *
+ *	Upon calling this function, the filesystem may no longer alter or
+ *	rearrange the set of dentries belonging to this super_block, nor may it
+ *	change the attachments of dentries to inodes.
+ */
+void generic_shutdown_super(struct super_block *sb)
+{
+	const struct super_operations *sop = sb->s_op;
+
+	if (sb->s_root) {
+		shrink_dcache_for_umount(sb);
+		sync_filesystem(sb);
+		sb->s_flags &= ~MS_ACTIVE;
+
+		fsnotify_unmount_inodes(&sb->s_inodes);
+
+		evict_inodes(sb);
+
+		if (sb->s_dio_done_wq) {
+			destroy_workqueue(sb->s_dio_done_wq);
+			sb->s_dio_done_wq = NULL;
+		}
+
+		if (sop->put_super)
+			sop->put_super(sb);
+
+		if (!list_empty(&sb->s_inodes)) {
+			printk("VFS: Busy inodes after unmount of %s. "
+			   "Self-destruct in 5 seconds.  Have a nice day...\n",
+			   sb->s_id);
+		}
+	}
+	spin_lock(&sb_lock);
+	/* should be initialized for __put_super_and_need_restart() */
+	hlist_del_init(&sb->s_instances);
+	spin_unlock(&sb_lock);
+	up_write(&sb->s_umount);
+}
+
+EXPORT_SYMBOL(generic_shutdown_super);
+
+/**
+ *	sget	-	find or create a superblock
+ *	@type:	filesystem type superblock should belong to
+ *	@test:	comparison callback
+ *	@set:	setup callback
+ *	@flags:	mount flags
+ *	@data:	argument to each of them
+ */
+struct super_block *sget(struct file_system_type *type,
+			int (*test)(struct super_block *,void *),
+			int (*set)(struct super_block *,void *),
+			int flags,
+			void *data)
+{
+	struct super_block *s = NULL;
+	struct super_block *old;
+	int err;
+
+retry:
+	spin_lock(&sb_lock);
+	if (test) {
+		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
+			if (!test(old, data))
+				continue;
+			if (!grab_super(old))
+				goto retry;
+			if (s) {
+				up_write(&s->s_umount);
+				destroy_super(s);
+				s = NULL;
+			}
+			return old;
+		}
+	}
+	if (!s) {
+		spin_unlock(&sb_lock);
+		s = alloc_super(type, flags);
+		if (!s)
+			return ERR_PTR(-ENOMEM);
+		goto retry;
+	}
+		
+	err = set(s, data);
+	if (err) {
+		spin_unlock(&sb_lock);
+		up_write(&s->s_umount);
+		destroy_super(s);
+		return ERR_PTR(err);
+	}
+	s->s_type = type;
+	strlcpy(s->s_id, type->name, sizeof(s->s_id));
+	list_add_tail(&s->s_list, &super_blocks);
+	hlist_add_head(&s->s_instances, &type->fs_supers);
+	spin_unlock(&sb_lock);
+	get_filesystem(type);
+	register_shrinker(&s->s_shrink);
+	return s;
+}
+
+EXPORT_SYMBOL(sget);
+
+void drop_super(struct super_block *sb)
+{
+	up_read(&sb->s_umount);
+	put_super(sb);
+}
+
+EXPORT_SYMBOL(drop_super);
+
+/**
+ *	iterate_supers - call function for all active superblocks
+ *	@f: function to call
+ *	@arg: argument to pass to it
+ *
+ *	Scans the superblock list and calls given function, passing it
+ *	locked superblock and given argument.
+ */
+void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+{
+	struct super_block *sb, *p = NULL;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (hlist_unhashed(&sb->s_instances))
+			continue;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (sb->s_root && (sb->s_flags & MS_BORN))
+			f(sb, arg);
+		up_read(&sb->s_umount);
+
+		spin_lock(&sb_lock);
+		if (p)
+			__put_super(p);
+		p = sb;
+	}
+	if (p)
+		__put_super(p);
+	spin_unlock(&sb_lock);
+}
+
+/**
+ *	iterate_supers_type - call function for superblocks of given type
+ *	@type: fs type
+ *	@f: function to call
+ *	@arg: argument to pass to it
+ *
+ *	Scans the superblock list and calls given function, passing it
+ *	locked superblock and given argument.
+ */
+void iterate_supers_type(struct file_system_type *type,
+	void (*f)(struct super_block *, void *), void *arg)
+{
+	struct super_block *sb, *p = NULL;
+
+	spin_lock(&sb_lock);
+	hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (sb->s_root && (sb->s_flags & MS_BORN))
+			f(sb, arg);
+		up_read(&sb->s_umount);
+
+		spin_lock(&sb_lock);
+		if (p)
+			__put_super(p);
+		p = sb;
+	}
+	if (p)
+		__put_super(p);
+	spin_unlock(&sb_lock);
+}
+
+EXPORT_SYMBOL(iterate_supers_type);
+
+/**
+ *	get_super - get the superblock of a device
+ *	@bdev: device to get the superblock for
+ *	
+ *	Scans the superblock list and finds the superblock of the file system
+ *	mounted on the device given. %NULL is returned if no match is found.
+ */
+
+struct super_block *get_super(struct block_device *bdev)
+{
+	struct super_block *sb;
+
+	if (!bdev)
+		return NULL;
+
+	spin_lock(&sb_lock);
+rescan:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (hlist_unhashed(&sb->s_instances))
+			continue;
+		if (sb->s_bdev == bdev) {
+			sb->s_count++;
+			spin_unlock(&sb_lock);
+			down_read(&sb->s_umount);
+			/* still alive? */
+			if (sb->s_root && (sb->s_flags & MS_BORN))
+				return sb;
+			up_read(&sb->s_umount);
+			/* nope, got unmounted */
+			spin_lock(&sb_lock);
+			__put_super(sb);
+			goto rescan;
+		}
+	}
+	spin_unlock(&sb_lock);
+	return NULL;
+}
+
+EXPORT_SYMBOL(get_super);
+
+/**
+ *	get_super_thawed - get thawed superblock of a device
+ *	@bdev: device to get the superblock for
+ *
+ *	Scans the superblock list and finds the superblock of the file system
+ *	mounted on the device. The superblock is returned once it is thawed
+ *	(or immediately if it was not frozen). %NULL is returned if no match
+ *	is found.
+ */
+struct super_block *get_super_thawed(struct block_device *bdev)
+{
+	while (1) {
+		struct super_block *s = get_super(bdev);
+		if (!s || s->s_writers.frozen == SB_UNFROZEN)
+			return s;
+		up_read(&s->s_umount);
+		wait_event(s->s_writers.wait_unfrozen,
+			   s->s_writers.frozen == SB_UNFROZEN);
+		put_super(s);
+	}
+}
+EXPORT_SYMBOL(get_super_thawed);
+
+/**
+ * get_active_super - get an active reference to the superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device given.  Returns the superblock with an active
+ * reference or %NULL if none was found.
+ */
+struct super_block *get_active_super(struct block_device *bdev)
+{
+	struct super_block *sb;
+
+	if (!bdev)
+		return NULL;
+
+restart:
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (hlist_unhashed(&sb->s_instances))
+			continue;
+		if (sb->s_bdev == bdev) {
+			if (!grab_super(sb))
+				goto restart;
+			up_write(&sb->s_umount);
+			return sb;
+		}
+	}
+	spin_unlock(&sb_lock);
+	return NULL;
+}
+ 
+struct super_block *user_get_super(dev_t dev)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+rescan:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (hlist_unhashed(&sb->s_instances))
+			continue;
+		if (sb->s_dev ==  dev) {
+			sb->s_count++;
+			spin_unlock(&sb_lock);
+			down_read(&sb->s_umount);
+			/* still alive? */
+			if (sb->s_root && (sb->s_flags & MS_BORN))
+				return sb;
+			up_read(&sb->s_umount);
+			/* nope, got unmounted */
+			spin_lock(&sb_lock);
+			__put_super(sb);
+			goto rescan;
+		}
+	}
+	spin_unlock(&sb_lock);
+	return NULL;
+}
+
+/**
+ *	do_remount_sb - asks filesystem to change mount options.
+ *	@sb:	superblock in question
+ *	@flags:	numeric part of options
+ *	@data:	the rest of options
+ *      @force: whether or not to force the change
+ *
+ *	Alters the mount options of a mounted file system.
+ */
+int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+{
+	int retval;
+	int remount_ro;
+
+	if (sb->s_writers.frozen != SB_UNFROZEN)
+		return -EBUSY;
+
+#ifdef CONFIG_BLOCK
+	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
+		return -EACCES;
+#endif
+
+	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+
+	if (remount_ro) {
+		if (!hlist_empty(&sb->s_pins)) {
+			up_write(&sb->s_umount);
+			group_pin_kill(&sb->s_pins);
+			down_write(&sb->s_umount);
+			if (!sb->s_root)
+				return 0;
+			if (sb->s_writers.frozen != SB_UNFROZEN)
+				return -EBUSY;
+			remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+		}
+	}
+	shrink_dcache_sb(sb);
+
+	/* If we are remounting RDONLY and current sb is read/write,
+	   make sure there are no rw files opened */
+	if (remount_ro) {
+		if (force) {
+			sb->s_readonly_remount = 1;
+			smp_wmb();
+		} else {
+			retval = sb_prepare_remount_readonly(sb);
+			if (retval)
+				return retval;
+		}
+	}
+
+	if (sb->s_op->remount_fs) {
+		retval = sb->s_op->remount_fs(sb, &flags, data);
+		if (retval) {
+			if (!force)
+				goto cancel_readonly;
+			/* If forced remount, go ahead despite any errors */
+			WARN(1, "forced remount of a %s fs returned %i\n",
+			     sb->s_type->name, retval);
+		}
+	}
+	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+	/* Needs to be ordered wrt mnt_is_readonly() */
+	smp_wmb();
+	sb->s_readonly_remount = 0;
+
+	/*
+	 * Some filesystems modify their metadata via some other path than the
+	 * bdev buffer cache (eg. use a private mapping, or directories in
+	 * pagecache, etc). Also file data modifications go via their own
+	 * mappings. So If we try to mount readonly then copy the filesystem
+	 * from bdev, we could get stale data, so invalidate it to give a best
+	 * effort at coherency.
+	 */
+	if (remount_ro && sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
+	return 0;
+
+cancel_readonly:
+	sb->s_readonly_remount = 0;
+	return retval;
+}
+
+static void do_emergency_remount(struct work_struct *work)
+{
+	struct super_block *sb, *p = NULL;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (hlist_unhashed(&sb->s_instances))
+			continue;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_write(&sb->s_umount);
+		if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) &&
+		    !(sb->s_flags & MS_RDONLY)) {
+			/*
+			 * What lock protects sb->s_flags??
+			 */
+			do_remount_sb(sb, MS_RDONLY, NULL, 1);
+		}
+		up_write(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (p)
+			__put_super(p);
+		p = sb;
+	}
+	if (p)
+		__put_super(p);
+	spin_unlock(&sb_lock);
+	kfree(work);
+	printk("Emergency Remount complete\n");
+}
+
+void emergency_remount(void)
+{
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_emergency_remount);
+		schedule_work(work);
+	}
+}
+
+/*
+ * Unnamed block devices are dummy devices used by virtual
+ * filesystems which don't use real block-devices.  -- jrs
+ */
+
+static DEFINE_IDA(unnamed_dev_ida);
+static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
+/* Many userspace utilities consider an FSID of 0 invalid.
+ * Always return at least 1 from get_anon_bdev.
+ */
+static int unnamed_dev_start = 1;
+
+int get_anon_bdev(dev_t *p)
+{
+	int dev;
+	int error;
+
+ retry:
+	if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
+		return -ENOMEM;
+	spin_lock(&unnamed_dev_lock);
+	error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
+	if (!error)
+		unnamed_dev_start = dev + 1;
+	spin_unlock(&unnamed_dev_lock);
+	if (error == -EAGAIN)
+		/* We raced and lost with another CPU. */
+		goto retry;
+	else if (error)
+		return -EAGAIN;
+
+	if (dev == (1 << MINORBITS)) {
+		spin_lock(&unnamed_dev_lock);
+		ida_remove(&unnamed_dev_ida, dev);
+		if (unnamed_dev_start > dev)
+			unnamed_dev_start = dev;
+		spin_unlock(&unnamed_dev_lock);
+		return -EMFILE;
+	}
+	*p = MKDEV(0, dev & MINORMASK);
+	return 0;
+}
+EXPORT_SYMBOL(get_anon_bdev);
+
+void free_anon_bdev(dev_t dev)
+{
+	int slot = MINOR(dev);
+	spin_lock(&unnamed_dev_lock);
+	ida_remove(&unnamed_dev_ida, slot);
+	if (slot < unnamed_dev_start)
+		unnamed_dev_start = slot;
+	spin_unlock(&unnamed_dev_lock);
+}
+EXPORT_SYMBOL(free_anon_bdev);
+
+int set_anon_super(struct super_block *s, void *data)
+{
+	return get_anon_bdev(&s->s_dev);
+}
+
+EXPORT_SYMBOL(set_anon_super);
+
+void kill_anon_super(struct super_block *sb)
+{
+	dev_t dev = sb->s_dev;
+	generic_shutdown_super(sb);
+	free_anon_bdev(dev);
+}
+
+EXPORT_SYMBOL(kill_anon_super);
+
+void kill_litter_super(struct super_block *sb)
+{
+	if (sb->s_root)
+		d_genocide(sb->s_root);
+	kill_anon_super(sb);
+}
+
+EXPORT_SYMBOL(kill_litter_super);
+
+static int ns_test_super(struct super_block *sb, void *data)
+{
+	return sb->s_fs_info == data;
+}
+
+static int ns_set_super(struct super_block *sb, void *data)
+{
+	sb->s_fs_info = data;
+	return set_anon_super(sb, NULL);
+}
+
+struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
+	void *data, int (*fill_super)(struct super_block *, void *, int))
+{
+	struct super_block *sb;
+
+	sb = sget(fs_type, ns_test_super, ns_set_super, flags, data);
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
+
+	if (!sb->s_root) {
+		int err;
+		err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+		if (err) {
+			deactivate_locked_super(sb);
+			return ERR_PTR(err);
+		}
+
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	return dget(sb->s_root);
+}
+
+EXPORT_SYMBOL(mount_ns);
+
+#ifdef CONFIG_BLOCK
+static int set_bdev_super(struct super_block *s, void *data)
+{
+	s->s_bdev = data;
+	s->s_dev = s->s_bdev->bd_dev;
+
+	/*
+	 * We set the bdi here to the queue backing, file systems can
+	 * overwrite this in ->fill_super()
+	 */
+	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+	return 0;
+}
+
+static int test_bdev_super(struct super_block *s, void *data)
+{
+	return (void *)s->s_bdev == data;
+}
+
+struct dentry *mount_bdev(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data,
+	int (*fill_super)(struct super_block *, void *, int))
+{
+	struct block_device *bdev;
+	struct super_block *s;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
+	int error = 0;
+
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
+	if (IS_ERR(bdev))
+		return ERR_CAST(bdev);
+
+	/*
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
+	 */
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		error = -EBUSY;
+		goto error_bdev;
+	}
+	s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC,
+		 bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	if (IS_ERR(s))
+		goto error_s;
+
+	if (s->s_root) {
+		if ((flags ^ s->s_flags) & MS_RDONLY) {
+			deactivate_locked_super(s);
+			error = -EBUSY;
+			goto error_bdev;
+		}
+
+		/*
+		 * s_umount nests inside bd_mutex during
+		 * __invalidate_device().  blkdev_put() acquires
+		 * bd_mutex and can't be called under s_umount.  Drop
+		 * s_umount temporarily.  This is safe as we're
+		 * holding an active reference.
+		 */
+		up_write(&s->s_umount);
+		blkdev_put(bdev, mode);
+		down_write(&s->s_umount);
+	} else {
+		char b[BDEVNAME_SIZE];
+
+		s->s_mode = mode;
+		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		sb_set_blocksize(s, block_size(bdev));
+		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (error) {
+			deactivate_locked_super(s);
+			goto error;
+		}
+
+		s->s_flags |= MS_ACTIVE;
+		bdev->bd_super = s;
+	}
+
+	return dget(s->s_root);
+
+error_s:
+	error = PTR_ERR(s);
+error_bdev:
+	blkdev_put(bdev, mode);
+error:
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL(mount_bdev);
+
+void kill_block_super(struct super_block *sb)
+{
+	struct block_device *bdev = sb->s_bdev;
+	fmode_t mode = sb->s_mode;
+
+	bdev->bd_super = NULL;
+	generic_shutdown_super(sb);
+	sync_blockdev(bdev);
+	WARN_ON_ONCE(!(mode & FMODE_EXCL));
+	blkdev_put(bdev, mode | FMODE_EXCL);
+}
+
+EXPORT_SYMBOL(kill_block_super);
+#endif
+
+struct dentry *mount_nodev(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int))
+{
+	int error;
+	struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);
+
+	if (IS_ERR(s))
+		return ERR_CAST(s);
+
+	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+	if (error) {
+		deactivate_locked_super(s);
+		return ERR_PTR(error);
+	}
+	s->s_flags |= MS_ACTIVE;
+	return dget(s->s_root);
+}
+EXPORT_SYMBOL(mount_nodev);
+
+static int compare_single(struct super_block *s, void *p)
+{
+	return 1;
+}
+
+struct dentry *mount_single(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int))
+{
+	struct super_block *s;
+	int error;
+
+	s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
+	if (IS_ERR(s))
+		return ERR_CAST(s);
+	if (!s->s_root) {
+		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (error) {
+			deactivate_locked_super(s);
+			return ERR_PTR(error);
+		}
+		s->s_flags |= MS_ACTIVE;
+	} else {
+		do_remount_sb(s, flags, data, 0);
+	}
+	return dget(s->s_root);
+}
+EXPORT_SYMBOL(mount_single);
+
+struct dentry *
+mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+{
+	struct dentry *root;
+	struct super_block *sb;
+	char *secdata = NULL;
+	int error = -ENOMEM;
+
+	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
+		secdata = alloc_secdata();
+		if (!secdata)
+			goto out;
+
+		error = security_sb_copy_data(data, secdata);
+		if (error)
+			goto out_free_secdata;
+	}
+
+	root = type->mount(type, flags, name, data);
+	if (IS_ERR(root)) {
+		error = PTR_ERR(root);
+		goto out_free_secdata;
+	}
+	sb = root->d_sb;
+	BUG_ON(!sb);
+	WARN_ON(!sb->s_bdi);
+	sb->s_flags |= MS_BORN;
+
+	error = security_sb_kern_mount(sb, flags, secdata);
+	if (error)
+		goto out_sb;
+
+	/*
+	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
+	 * but s_maxbytes was an unsigned long long for many releases. Throw
+	 * this warning for a little while to try and catch filesystems that
+	 * violate this rule.
+	 */
+	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
+		"negative value (%lld)\n", type->name, sb->s_maxbytes);
+
+	up_write(&sb->s_umount);
+	free_secdata(secdata);
+	return root;
+out_sb:
+	dput(root);
+	deactivate_locked_super(sb);
+out_free_secdata:
+	free_secdata(secdata);
+out:
+	return ERR_PTR(error);
+}
+
+/*
+ * This is an internal function, please use sb_end_{write,pagefault,intwrite}
+ * instead.
+ */
+void __sb_end_write(struct super_block *sb, int level)
+{
+	percpu_counter_dec(&sb->s_writers.counter[level-1]);
+	/*
+	 * Make sure s_writers are updated before we wake up waiters in
+	 * freeze_super().
+	 */
+	smp_mb();
+	if (waitqueue_active(&sb->s_writers.wait))
+		wake_up(&sb->s_writers.wait);
+	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+}
+EXPORT_SYMBOL(__sb_end_write);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * We want lockdep to tell us about possible deadlocks with freezing but
+ * it's it bit tricky to properly instrument it. Getting a freeze protection
+ * works as getting a read lock but there are subtle problems. XFS for example
+ * gets freeze protection on internal level twice in some cases, which is OK
+ * only because we already hold a freeze protection also on higher level. Due
+ * to these cases we have to tell lockdep we are doing trylock when we
+ * already hold a freeze protection for a higher freeze level.
+ */
+static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
+				unsigned long ip)
+{
+	int i;
+
+	if (!trylock) {
+		for (i = 0; i < level - 1; i++)
+			if (lock_is_held(&sb->s_writers.lock_map[i])) {
+				trylock = true;
+				break;
+			}
+	}
+	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
+}
+#endif
+
+/*
+ * This is an internal function, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+int __sb_start_write(struct super_block *sb, int level, bool wait)
+{
+retry:
+	if (unlikely(sb->s_writers.frozen >= level)) {
+		if (!wait)
+			return 0;
+		wait_event(sb->s_writers.wait_unfrozen,
+			   sb->s_writers.frozen < level);
+	}
+
+#ifdef CONFIG_LOCKDEP
+	acquire_freeze_lock(sb, level, !wait, _RET_IP_);
+#endif
+	percpu_counter_inc(&sb->s_writers.counter[level-1]);
+	/*
+	 * Make sure counter is updated before we check for frozen.
+	 * freeze_super() first sets frozen and then checks the counter.
+	 */
+	smp_mb();
+	if (unlikely(sb->s_writers.frozen >= level)) {
+		__sb_end_write(sb, level);
+		goto retry;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(__sb_start_write);
+
+/**
+ * sb_wait_write - wait until all writers to given file system finish
+ * @sb: the super for which we wait
+ * @level: type of writers we wait for (normal vs page fault)
+ *
+ * This function waits until there are no writers of given type to given file
+ * system. Caller of this function should make sure there can be no new writers
+ * of type @level before calling this function. Otherwise this function can
+ * livelock.
+ */
+static void sb_wait_write(struct super_block *sb, int level)
+{
+	s64 writers;
+
+	/*
+	 * We just cycle-through lockdep here so that it does not complain
+	 * about returning with lock to userspace
+	 */
+	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
+
+	do {
+		DEFINE_WAIT(wait);
+
+		/*
+		 * We use a barrier in prepare_to_wait() to separate setting
+		 * of frozen and checking of the counter
+		 */
+		prepare_to_wait(&sb->s_writers.wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+
+		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
+		if (writers)
+			schedule();
+
+		finish_wait(&sb->s_writers.wait, &wait);
+	} while (writers);
+}
+
+/**
+ * freeze_super - lock the filesystem and force it into a consistent state
+ * @sb: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs.  Subsequent calls to this without first thawing the fs will return
+ * -EBUSY.
+ *
+ * During this function, sb->s_writers.frozen goes through these values:
+ *
+ * SB_UNFROZEN: File system is normal, all writes progress as usual.
+ *
+ * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
+ * writes should be blocked, though page faults are still allowed. We wait for
+ * all writes to complete and then proceed to the next stage.
+ *
+ * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
+ * but internal fs threads can still modify the filesystem (although they
+ * should not dirty new pages or inodes), writeback can run etc. After waiting
+ * for all running page faults we sync the filesystem which will clean all
+ * dirty pages and inodes (no new dirty pages or inodes can be created when
+ * sync is running).
+ *
+ * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
+ * modification are blocked (e.g. XFS preallocation truncation on inode
+ * reclaim). This is usually implemented by blocking new transactions for
+ * filesystems that have them and need this additional guard. After all
+ * internal writers are finished we call ->freeze_fs() to finish filesystem
+ * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
+ * mostly auxiliary for filesystems to verify they do not modify frozen fs.
+ *
+ * sb->s_writers.frozen is protected by sb->s_umount.
+ */
+int freeze_super(struct super_block *sb)
+{
+	int ret;
+
+	atomic_inc(&sb->s_active);
+	down_write(&sb->s_umount);
+	if (sb->s_writers.frozen != SB_UNFROZEN) {
+		deactivate_locked_super(sb);
+		return -EBUSY;
+	}
+
+	if (!(sb->s_flags & MS_BORN)) {
+		up_write(&sb->s_umount);
+		return 0;	/* sic - it's "nothing to do" */
+	}
+
+	if (sb->s_flags & MS_RDONLY) {
+		/* Nothing to do really... */
+		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
+		up_write(&sb->s_umount);
+		return 0;
+	}
+
+	/* From now on, no new normal writers can start */
+	sb->s_writers.frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
+	up_write(&sb->s_umount);
+
+	sb_wait_write(sb, SB_FREEZE_WRITE);
+
+	/* Now we go and block page faults... */
+	down_write(&sb->s_umount);
+	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
+	smp_wmb();
+
+	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
+
+	/* All writers are done so after syncing there won't be dirty data */
+	sync_filesystem(sb);
+
+	/* Now wait for internal filesystem counter */
+	sb->s_writers.frozen = SB_FREEZE_FS;
+	smp_wmb();
+	sb_wait_write(sb, SB_FREEZE_FS);
+
+	if (sb->s_op->freeze_fs) {
+		ret = sb->s_op->freeze_fs(sb);
+		if (ret) {
+			printk(KERN_ERR
+				"VFS:Filesystem freeze failed\n");
+			sb->s_writers.frozen = SB_UNFROZEN;
+			smp_wmb();
+			wake_up(&sb->s_writers.wait_unfrozen);
+			deactivate_locked_super(sb);
+			return ret;
+		}
+	}
+	/*
+	 * This is just for debugging purposes so that fs can warn if it
+	 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
+	 */
+	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
+	up_write(&sb->s_umount);
+	return 0;
+}
+EXPORT_SYMBOL(freeze_super);
+
+/**
+ * thaw_super -- unlock filesystem
+ * @sb: the super to thaw
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_super().
+ */
+int thaw_super(struct super_block *sb)
+{
+	int error;
+
+	down_write(&sb->s_umount);
+	if (sb->s_writers.frozen == SB_UNFROZEN) {
+		up_write(&sb->s_umount);
+		return -EINVAL;
+	}
+
+	if (sb->s_flags & MS_RDONLY)
+		goto out;
+
+	if (sb->s_op->unfreeze_fs) {
+		error = sb->s_op->unfreeze_fs(sb);
+		if (error) {
+			printk(KERN_ERR
+				"VFS:Filesystem thaw failed\n");
+			up_write(&sb->s_umount);
+			return error;
+		}
+	}
+
+out:
+	sb->s_writers.frozen = SB_UNFROZEN;
+	smp_wmb();
+	wake_up(&sb->s_writers.wait_unfrozen);
+	deactivate_locked_super(sb);
+
+	return 0;
+}
+EXPORT_SYMBOL(thaw_super);
diff --git a/kernel/fs/sync.c b/kernel/fs/sync.c
new file mode 100644
index 000000000..fbc98ee62
--- /dev/null
+++ b/kernel/fs/sync.c
@@ -0,0 +1,366 @@
+/*
+ * High-level sync()-related operations
+ */
+
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/syscalls.h>
+#include <linux/linkage.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/backing-dev.h>
+#include "internal.h"
+
+#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
+			SYNC_FILE_RANGE_WAIT_AFTER)
+
+/*
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
+ * submit IO for these buffers via __sync_blockdev(). This also speeds up the
+ * wait == 1 case since in that case write_inode() functions do
+ * sync_dirty_buffer() and thus effectively write one block at a time.
+ */
+static int __sync_filesystem(struct super_block *sb, int wait)
+{
+	if (wait)
+		sync_inodes_sb(sb);
+	else
+		writeback_inodes_sb(sb, WB_REASON_SYNC);
+
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, wait);
+	return __sync_blockdev(sb->s_bdev, wait);
+}
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int sync_filesystem(struct super_block *sb)
+{
+	int ret;
+
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	/*
+	 * No point in syncing out anything if the filesystem is read-only.
+	 */
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	ret = __sync_filesystem(sb, 0);
+	if (ret < 0)
+		return ret;
+	return __sync_filesystem(sb, 1);
+}
+EXPORT_SYMBOL(sync_filesystem);
+
+static void sync_inodes_one_sb(struct super_block *sb, void *arg)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		sync_inodes_sb(sb);
+}
+
+static void sync_fs_one_sb(struct super_block *sb, void *arg)
+{
+	if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, *(int *)arg);
+}
+
+static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
+{
+	filemap_fdatawrite(bdev->bd_inode->i_mapping);
+}
+
+static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
+{
+	filemap_fdatawait(bdev->bd_inode->i_mapping);
+}
+
+/*
+ * Sync everything. We start by waking flusher threads so that most of
+ * writeback runs on all devices in parallel. Then we sync all inodes reliably
+ * which effectively also waits for all flusher threads to finish doing
+ * writeback. At this point all data is on disk so metadata should be stable
+ * and we tell filesystems to sync their metadata via ->sync_fs() calls.
+ * Finally, we writeout all block devices because some filesystems (e.g. ext2)
+ * just write metadata (such as inodes or bitmaps) to block device page cache
+ * and do not sync it on their own in ->sync_fs().
+ */
+SYSCALL_DEFINE0(sync)
+{
+	int nowait = 0, wait = 1;
+
+	wakeup_flusher_threads(0, WB_REASON_SYNC);
+	iterate_supers(sync_inodes_one_sb, NULL);
+	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_fs_one_sb, &wait);
+	iterate_bdevs(fdatawrite_one_bdev, NULL);
+	iterate_bdevs(fdatawait_one_bdev, NULL);
+	if (unlikely(laptop_mode))
+		laptop_sync_completion();
+	return 0;
+}
+
+static void do_sync_work(struct work_struct *work)
+{
+	int nowait = 0;
+
+	/*
+	 * Sync twice to reduce the possibility we skipped some inodes / pages
+	 * because they were temporarily locked
+	 */
+	iterate_supers(sync_inodes_one_sb, &nowait);
+	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_bdevs(fdatawrite_one_bdev, NULL);
+	iterate_supers(sync_inodes_one_sb, &nowait);
+	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_bdevs(fdatawrite_one_bdev, NULL);
+	printk("Emergency Sync complete\n");
+	kfree(work);
+}
+
+void emergency_sync(void)
+{
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_sync_work);
+		schedule_work(work);
+	}
+}
+
+/*
+ * sync a single super
+ */
+SYSCALL_DEFINE1(syncfs, int, fd)
+{
+	struct fd f = fdget(fd);
+	struct super_block *sb;
+	int ret;
+
+	if (!f.file)
+		return -EBADF;
+	sb = f.file->f_path.dentry->d_sb;
+
+	down_read(&sb->s_umount);
+	ret = sync_filesystem(sb);
+	up_read(&sb->s_umount);
+
+	fdput(f);
+	return ret;
+}
+
+/**
+ * vfs_fsync_range - helper to sync a range of data & metadata to disk
+ * @file:		file to sync
+ * @start:		offset in bytes of the beginning of data range to sync
+ * @end:		offset in bytes of the end of data range (inclusive)
+ * @datasync:		perform only datasync
+ *
+ * Write back data in range @start..@end and metadata for @file to disk.  If
+ * @datasync is set only metadata needed to access modified file data is
+ * written.
+ */
+int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	if (!file->f_op->fsync)
+		return -EINVAL;
+	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+		mark_inode_dirty_sync(inode);
+	}
+	return file->f_op->fsync(file, start, end, datasync);
+}
+EXPORT_SYMBOL(vfs_fsync_range);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @datasync:		only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ */
+int vfs_fsync(struct file *file, int datasync)
+{
+	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
+}
+EXPORT_SYMBOL(vfs_fsync);
+
+static int do_fsync(unsigned int fd, int datasync)
+{
+	struct fd f = fdget(fd);
+	int ret = -EBADF;
+
+	if (f.file) {
+		ret = vfs_fsync(f.file, datasync);
+		fdput(f);
+	}
+	return ret;
+}
+
+SYSCALL_DEFINE1(fsync, unsigned int, fd)
+{
+	return do_fsync(fd, 0);
+}
+
+SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
+{
+	return do_fsync(fd, 1);
+}
+
+/*
+ * sys_sync_file_range() permits finely controlled syncing over a segment of
+ * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
+ * zero then sys_sync_file_range() will operate from offset out to EOF.
+ *
+ * The flag bits are:
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
+ * before performing the write.
+ *
+ * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
+ * range which are not presently under writeback. Note that this may block for
+ * significant periods due to exhaustion of disk request structures.
+ *
+ * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
+ * after performing the write.
+ *
+ * Useful combinations of the flag bits are:
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
+ * in the range which were dirty on entry to sys_sync_file_range() are placed
+ * under writeout.  This is a start-write-for-data-integrity operation.
+ *
+ * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
+ * are not presently under writeout.  This is an asynchronous flush-to-disk
+ * operation.  Not suitable for data integrity operations.
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
+ * completion of writeout of all pages in the range.  This will be used after an
+ * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
+ * for that operation to complete and to return the result.
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
+ * a traditional sync() operation.  This is a write-for-data-integrity operation
+ * which will ensure that all pages in the range which were dirty on entry to
+ * sys_sync_file_range() are committed to disk.
+ *
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
+ * I/O errors or ENOSPC conditions and will return those to the caller, after
+ * clearing the EIO and ENOSPC flags in the address_space.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata.  So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
+ */
+SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
+				unsigned int, flags)
+{
+	int ret;
+	struct fd f;
+	struct address_space *mapping;
+	loff_t endbyte;			/* inclusive */
+	umode_t i_mode;
+
+	ret = -EINVAL;
+	if (flags & ~VALID_FLAGS)
+		goto out;
+
+	endbyte = offset + nbytes;
+
+	if ((s64)offset < 0)
+		goto out;
+	if ((s64)endbyte < 0)
+		goto out;
+	if (endbyte < offset)
+		goto out;
+
+	if (sizeof(pgoff_t) == 4) {
+		if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
+			/*
+			 * The range starts outside a 32 bit machine's
+			 * pagecache addressing capabilities.  Let it "succeed"
+			 */
+			ret = 0;
+			goto out;
+		}
+		if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
+			/*
+			 * Out to EOF
+			 */
+			nbytes = 0;
+		}
+	}
+
+	if (nbytes == 0)
+		endbyte = LLONG_MAX;
+	else
+		endbyte--;		/* inclusive */
+
+	ret = -EBADF;
+	f = fdget(fd);
+	if (!f.file)
+		goto out;
+
+	i_mode = file_inode(f.file)->i_mode;
+	ret = -ESPIPE;
+	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
+			!S_ISLNK(i_mode))
+		goto out_put;
+
+	mapping = f.file->f_mapping;
+	if (!mapping) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	ret = 0;
+	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
+		ret = filemap_fdatawait_range(mapping, offset, endbyte);
+		if (ret < 0)
+			goto out_put;
+	}
+
+	if (flags & SYNC_FILE_RANGE_WRITE) {
+		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+		if (ret < 0)
+			goto out_put;
+	}
+
+	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
+		ret = filemap_fdatawait_range(mapping, offset, endbyte);
+
+out_put:
+	fdput(f);
+out:
+	return ret;
+}
+
+/* It would be nice if people remember that not all the world's an i386
+   when they introduce new system calls */
+SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
+				 loff_t, offset, loff_t, nbytes)
+{
+	return sys_sync_file_range(fd, offset, nbytes, flags);
+}
diff --git a/kernel/fs/sysfs/Kconfig b/kernel/fs/sysfs/Kconfig
new file mode 100644
index 000000000..b27560145
--- /dev/null
+++ b/kernel/fs/sysfs/Kconfig
@@ -0,0 +1,24 @@
+config SYSFS
+	bool "sysfs file system support" if EXPERT
+	default y
+	select KERNFS
+	help
+	The sysfs filesystem is a virtual filesystem that the kernel uses to
+	export internal kernel objects, their attributes, and their
+	relationships to one another.
+
+	Users can use sysfs to ascertain useful information about the running
+	kernel, such as the devices the kernel has discovered on each bus and
+	which driver each is bound to. sysfs can also be used to tune devices
+	and other kernel subsystems.
+
+	Some system agents rely on the information in sysfs to operate.
+	/sbin/hotplug uses device and object attributes in sysfs to assist in
+	delegating policy decisions, like persistently naming devices.
+
+	sysfs is currently used by the block subsystem to mount the root
+	partition.  If sysfs is disabled you must specify the boot device on
+	the kernel boot command line via its major and minor numbers.  For
+	example, "root=03:01" for /dev/hda1.
+
+	Designers of embedded systems may wish to say N here to conserve space.
diff --git a/kernel/fs/sysfs/Makefile b/kernel/fs/sysfs/Makefile
new file mode 100644
index 000000000..6eff6e120
--- /dev/null
+++ b/kernel/fs/sysfs/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the sysfs virtual filesystem
+#
+
+obj-y		:= file.o dir.o symlink.o mount.o group.o
diff --git a/kernel/fs/sysfs/dir.c b/kernel/fs/sysfs/dir.c
new file mode 100644
index 000000000..94374e435
--- /dev/null
+++ b/kernel/fs/sysfs/dir.c
@@ -0,0 +1,157 @@
+/*
+ * fs/sysfs/dir.c - sysfs core and dir operation implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
+ */
+
+#undef DEBUG
+
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+#include "sysfs.h"
+
+DEFINE_SPINLOCK(sysfs_symlink_target_lock);
+
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
+{
+	char *buf, *path = NULL;
+
+	buf = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (buf)
+		path = kernfs_path(parent, buf, PATH_MAX);
+
+	WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n",
+	     path, name);
+
+	kfree(buf);
+}
+
+/**
+ * sysfs_create_dir_ns - create a directory for an object with a namespace tag
+ * @kobj: object we're creating directory for
+ * @ns: the namespace tag to use
+ */
+int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
+{
+	struct kernfs_node *parent, *kn;
+
+	BUG_ON(!kobj);
+
+	if (kobj->parent)
+		parent = kobj->parent->sd;
+	else
+		parent = sysfs_root_kn;
+
+	if (!parent)
+		return -ENOENT;
+
+	kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
+				  S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, kobject_name(kobj));
+		return PTR_ERR(kn);
+	}
+
+	kobj->sd = kn;
+	return 0;
+}
+
+/**
+ *	sysfs_remove_dir - remove an object's directory.
+ *	@kobj:	object.
+ *
+ *	The only thing special about this is that we remove any files in
+ *	the directory before we remove the directory, and we've inlined
+ *	what used to be sysfs_rmdir() below, instead of calling separately.
+ */
+void sysfs_remove_dir(struct kobject *kobj)
+{
+	struct kernfs_node *kn = kobj->sd;
+
+	/*
+	 * In general, kboject owner is responsible for ensuring removal
+	 * doesn't race with other operations and sysfs doesn't provide any
+	 * protection; however, when @kobj is used as a symlink target, the
+	 * symlinking entity usually doesn't own @kobj and thus has no
+	 * control over removal.  @kobj->sd may be removed anytime
+	 * and symlink code may end up dereferencing an already freed node.
+	 *
+	 * sysfs_symlink_target_lock synchronizes @kobj->sd
+	 * disassociation against symlink operations so that symlink code
+	 * can safely dereference @kobj->sd.
+	 */
+	spin_lock(&sysfs_symlink_target_lock);
+	kobj->sd = NULL;
+	spin_unlock(&sysfs_symlink_target_lock);
+
+	if (kn) {
+		WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
+		kernfs_remove(kn);
+	}
+}
+
+int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
+			const void *new_ns)
+{
+	struct kernfs_node *parent;
+	int ret;
+
+	parent = kernfs_get_parent(kobj->sd);
+	ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
+	kernfs_put(parent);
+	return ret;
+}
+
+int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
+		      const void *new_ns)
+{
+	struct kernfs_node *kn = kobj->sd;
+	struct kernfs_node *new_parent;
+
+	new_parent = new_parent_kobj && new_parent_kobj->sd ?
+		new_parent_kobj->sd : sysfs_root_kn;
+
+	return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
+}
+
+/**
+ * sysfs_create_mount_point - create an always empty directory
+ * @parent_kobj:  kobject that will contain this always empty directory
+ * @name: The name of the always empty directory to add
+ */
+int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name)
+{
+	struct kernfs_node *kn, *parent = parent_kobj->sd;
+
+	kn = kernfs_create_empty_dir(parent, name);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, name);
+		return PTR_ERR(kn);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sysfs_create_mount_point);
+
+/**
+ *	sysfs_remove_mount_point - remove an always empty directory.
+ *	@parent_kobj: kobject that will contain this always empty directory
+ *	@name: The name of the always empty directory to remove
+ *
+ */
+void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name)
+{
+	struct kernfs_node *parent = parent_kobj->sd;
+
+	kernfs_remove_by_name_ns(parent, name, NULL);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);
diff --git a/kernel/fs/sysfs/file.c b/kernel/fs/sysfs/file.c
new file mode 100644
index 000000000..7c2867b44
--- /dev/null
+++ b/kernel/fs/sysfs/file.c
@@ -0,0 +1,497 @@
+/*
+ * fs/sysfs/file.c - sysfs regular (text) file implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
+ */
+
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/kallsyms.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+
+#include "sysfs.h"
+#include "../kernfs/kernfs-internal.h"
+
+/*
+ * Determine ktype->sysfs_ops for the given kernfs_node.  This function
+ * must be called while holding an active reference.
+ */
+static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
+{
+	struct kobject *kobj = kn->parent->priv;
+
+	if (kn->flags & KERNFS_LOCKDEP)
+		lockdep_assert_held(kn);
+	return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
+}
+
+/*
+ * Reads on sysfs are handled through seq_file, which takes care of hairy
+ * details like buffering and seeking.  The following function pipes
+ * sysfs_ops->show() result through seq_file.
+ */
+static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+	struct kobject *kobj = of->kn->parent->priv;
+	const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
+	ssize_t count;
+	char *buf;
+
+	/* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
+	count = seq_get_buf(sf, &buf);
+	if (count < PAGE_SIZE) {
+		seq_commit(sf, -1);
+		return 0;
+	}
+	memset(buf, 0, PAGE_SIZE);
+
+	/*
+	 * Invoke show().  Control may reach here via seq file lseek even
+	 * if @ops->show() isn't implemented.
+	 */
+	if (ops->show) {
+		count = ops->show(kobj, of->kn->priv, buf);
+		if (count < 0)
+			return count;
+	}
+
+	/*
+	 * The code works fine with PAGE_SIZE return but it's likely to
+	 * indicate truncated result or overflow in normal use cases.
+	 */
+	if (count >= (ssize_t)PAGE_SIZE) {
+		print_symbol("fill_read_buffer: %s returned bad count\n",
+			(unsigned long)ops->show);
+		/* Try to struggle along */
+		count = PAGE_SIZE - 1;
+	}
+	seq_commit(sf, count);
+	return 0;
+}
+
+static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
+				 size_t count, loff_t pos)
+{
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
+	loff_t size = file_inode(of->file)->i_size;
+
+	if (!count)
+		return 0;
+
+	if (size) {
+		if (pos > size)
+			return 0;
+		if (pos + count > size)
+			count = size - pos;
+	}
+
+	if (!battr->read)
+		return -EIO;
+
+	return battr->read(of->file, kobj, battr, buf, pos, count);
+}
+
+/* kernfs read callback for regular sysfs files with pre-alloc */
+static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
+			     size_t count, loff_t pos)
+{
+	const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
+	struct kobject *kobj = of->kn->parent->priv;
+
+	/*
+	 * If buf != of->prealloc_buf, we don't know how
+	 * large it is, so cannot safely pass it to ->show
+	 */
+	if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
+		return 0;
+	return ops->show(kobj, of->kn->priv, buf);
+}
+
+/* kernfs write callback for regular sysfs files */
+static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
+			      size_t count, loff_t pos)
+{
+	const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
+	struct kobject *kobj = of->kn->parent->priv;
+
+	if (!count)
+		return 0;
+
+	return ops->store(kobj, of->kn->priv, buf, count);
+}
+
+/* kernfs write callback for bin sysfs files */
+static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
+				  size_t count, loff_t pos)
+{
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
+	loff_t size = file_inode(of->file)->i_size;
+
+	if (size) {
+		if (size <= pos)
+			return -EFBIG;
+		count = min_t(ssize_t, count, size - pos);
+	}
+	if (!count)
+		return 0;
+
+	if (!battr->write)
+		return -EIO;
+
+	return battr->write(of->file, kobj, battr, buf, pos, count);
+}
+
+static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
+			     struct vm_area_struct *vma)
+{
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
+
+	return battr->mmap(of->file, kobj, battr, vma);
+}
+
+void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
+{
+	struct kernfs_node *kn = kobj->sd, *tmp;
+
+	if (kn && dir)
+		kn = kernfs_find_and_get(kn, dir);
+	else
+		kernfs_get(kn);
+
+	if (kn && attr) {
+		tmp = kernfs_find_and_get(kn, attr);
+		kernfs_put(kn);
+		kn = tmp;
+	}
+
+	if (kn) {
+		kernfs_notify(kn);
+		kernfs_put(kn);
+	}
+}
+EXPORT_SYMBOL_GPL(sysfs_notify);
+
+static const struct kernfs_ops sysfs_file_kfops_empty = {
+};
+
+static const struct kernfs_ops sysfs_file_kfops_ro = {
+	.seq_show	= sysfs_kf_seq_show,
+};
+
+static const struct kernfs_ops sysfs_file_kfops_wo = {
+	.write		= sysfs_kf_write,
+};
+
+static const struct kernfs_ops sysfs_file_kfops_rw = {
+	.seq_show	= sysfs_kf_seq_show,
+	.write		= sysfs_kf_write,
+};
+
+static const struct kernfs_ops sysfs_prealloc_kfops_ro = {
+	.read		= sysfs_kf_read,
+	.prealloc	= true,
+};
+
+static const struct kernfs_ops sysfs_prealloc_kfops_wo = {
+	.write		= sysfs_kf_write,
+	.prealloc	= true,
+};
+
+static const struct kernfs_ops sysfs_prealloc_kfops_rw = {
+	.read		= sysfs_kf_read,
+	.write		= sysfs_kf_write,
+	.prealloc	= true,
+};
+
+static const struct kernfs_ops sysfs_bin_kfops_ro = {
+	.read		= sysfs_kf_bin_read,
+};
+
+static const struct kernfs_ops sysfs_bin_kfops_wo = {
+	.write		= sysfs_kf_bin_write,
+};
+
+static const struct kernfs_ops sysfs_bin_kfops_rw = {
+	.read		= sysfs_kf_bin_read,
+	.write		= sysfs_kf_bin_write,
+};
+
+static const struct kernfs_ops sysfs_bin_kfops_mmap = {
+	.read		= sysfs_kf_bin_read,
+	.write		= sysfs_kf_bin_write,
+	.mmap		= sysfs_kf_bin_mmap,
+};
+
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+			   const struct attribute *attr, bool is_bin,
+			   umode_t mode, const void *ns)
+{
+	struct lock_class_key *key = NULL;
+	const struct kernfs_ops *ops;
+	struct kernfs_node *kn;
+	loff_t size;
+
+	if (!is_bin) {
+		struct kobject *kobj = parent->priv;
+		const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
+
+		/* every kobject with an attribute needs a ktype assigned */
+		if (WARN(!sysfs_ops, KERN_ERR
+			 "missing sysfs attribute operations for kobject: %s\n",
+			 kobject_name(kobj)))
+			return -EINVAL;
+
+		if (sysfs_ops->show && sysfs_ops->store) {
+			if (mode & SYSFS_PREALLOC)
+				ops = &sysfs_prealloc_kfops_rw;
+			else
+				ops = &sysfs_file_kfops_rw;
+		} else if (sysfs_ops->show) {
+			if (mode & SYSFS_PREALLOC)
+				ops = &sysfs_prealloc_kfops_ro;
+			else
+				ops = &sysfs_file_kfops_ro;
+		} else if (sysfs_ops->store) {
+			if (mode & SYSFS_PREALLOC)
+				ops = &sysfs_prealloc_kfops_wo;
+			else
+				ops = &sysfs_file_kfops_wo;
+		} else
+			ops = &sysfs_file_kfops_empty;
+
+		size = PAGE_SIZE;
+	} else {
+		struct bin_attribute *battr = (void *)attr;
+
+		if (battr->mmap)
+			ops = &sysfs_bin_kfops_mmap;
+		else if (battr->read && battr->write)
+			ops = &sysfs_bin_kfops_rw;
+		else if (battr->read)
+			ops = &sysfs_bin_kfops_ro;
+		else if (battr->write)
+			ops = &sysfs_bin_kfops_wo;
+		else
+			ops = &sysfs_file_kfops_empty;
+
+		size = battr->size;
+	}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (!attr->ignore_lockdep)
+		key = attr->key ?: (struct lock_class_key *)&attr->skey;
+#endif
+	kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
+				  (void *)attr, ns, key);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, attr->name);
+		return PTR_ERR(kn);
+	}
+	return 0;
+}
+
+int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
+		   bool is_bin)
+{
+	return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
+}
+
+/**
+ * sysfs_create_file_ns - create an attribute file for an object with custom ns
+ * @kobj: object we're creating for
+ * @attr: attribute descriptor
+ * @ns: namespace the new file should belong to
+ */
+int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
+			 const void *ns)
+{
+	BUG_ON(!kobj || !kobj->sd || !attr);
+
+	return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
+
+}
+EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
+
+int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
+{
+	int err = 0;
+	int i;
+
+	for (i = 0; ptr[i] && !err; i++)
+		err = sysfs_create_file(kobj, ptr[i]);
+	if (err)
+		while (--i >= 0)
+			sysfs_remove_file(kobj, ptr[i]);
+	return err;
+}
+EXPORT_SYMBOL_GPL(sysfs_create_files);
+
+/**
+ * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
+ * @kobj: object we're acting for.
+ * @attr: attribute descriptor.
+ * @group: group name.
+ */
+int sysfs_add_file_to_group(struct kobject *kobj,
+		const struct attribute *attr, const char *group)
+{
+	struct kernfs_node *parent;
+	int error;
+
+	if (group) {
+		parent = kernfs_find_and_get(kobj->sd, group);
+	} else {
+		parent = kobj->sd;
+		kernfs_get(parent);
+	}
+
+	if (!parent)
+		return -ENOENT;
+
+	error = sysfs_add_file(parent, attr, false);
+	kernfs_put(parent);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
+
+/**
+ * sysfs_chmod_file - update the modified mode value on an object attribute.
+ * @kobj: object we're acting for.
+ * @attr: attribute descriptor.
+ * @mode: file permissions.
+ *
+ */
+int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
+		     umode_t mode)
+{
+	struct kernfs_node *kn;
+	struct iattr newattrs;
+	int rc;
+
+	kn = kernfs_find_and_get(kobj->sd, attr->name);
+	if (!kn)
+		return -ENOENT;
+
+	newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
+	newattrs.ia_valid = ATTR_MODE;
+
+	rc = kernfs_setattr(kn, &newattrs);
+
+	kernfs_put(kn);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(sysfs_chmod_file);
+
+/**
+ * sysfs_remove_file_ns - remove an object attribute with a custom ns tag
+ * @kobj: object we're acting for
+ * @attr: attribute descriptor
+ * @ns: namespace tag of the file to remove
+ *
+ * Hash the attribute name and namespace tag and kill the victim.
+ */
+void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
+			  const void *ns)
+{
+	struct kernfs_node *parent = kobj->sd;
+
+	kernfs_remove_by_name_ns(parent, attr->name, ns);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
+
+/**
+ * sysfs_remove_file_self - remove an object attribute from its own method
+ * @kobj: object we're acting for
+ * @attr: attribute descriptor
+ *
+ * See kernfs_remove_self() for details.
+ */
+bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
+{
+	struct kernfs_node *parent = kobj->sd;
+	struct kernfs_node *kn;
+	bool ret;
+
+	kn = kernfs_find_and_get(parent, attr->name);
+	if (WARN_ON_ONCE(!kn))
+		return false;
+
+	ret = kernfs_remove_self(kn);
+
+	kernfs_put(kn);
+	return ret;
+}
+
+void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
+{
+	int i;
+	for (i = 0; ptr[i]; i++)
+		sysfs_remove_file(kobj, ptr[i]);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_files);
+
+/**
+ * sysfs_remove_file_from_group - remove an attribute file from a group.
+ * @kobj: object we're acting for.
+ * @attr: attribute descriptor.
+ * @group: group name.
+ */
+void sysfs_remove_file_from_group(struct kobject *kobj,
+		const struct attribute *attr, const char *group)
+{
+	struct kernfs_node *parent;
+
+	if (group) {
+		parent = kernfs_find_and_get(kobj->sd, group);
+	} else {
+		parent = kobj->sd;
+		kernfs_get(parent);
+	}
+
+	if (parent) {
+		kernfs_remove_by_name(parent, attr->name);
+		kernfs_put(parent);
+	}
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
+
+/**
+ *	sysfs_create_bin_file - create binary file for object.
+ *	@kobj:	object.
+ *	@attr:	attribute descriptor.
+ */
+int sysfs_create_bin_file(struct kobject *kobj,
+			  const struct bin_attribute *attr)
+{
+	BUG_ON(!kobj || !kobj->sd || !attr);
+
+	return sysfs_add_file(kobj->sd, &attr->attr, true);
+}
+EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
+
+/**
+ *	sysfs_remove_bin_file - remove binary file for object.
+ *	@kobj:	object.
+ *	@attr:	attribute descriptor.
+ */
+void sysfs_remove_bin_file(struct kobject *kobj,
+			   const struct bin_attribute *attr)
+{
+	kernfs_remove_by_name(kobj->sd, attr->attr.name);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
diff --git a/kernel/fs/sysfs/group.c b/kernel/fs/sysfs/group.c
new file mode 100644
index 000000000..b400c0437
--- /dev/null
+++ b/kernel/fs/sysfs/group.c
@@ -0,0 +1,354 @@
+/*
+ * fs/sysfs/group.c - Operations for adding/removing multiple files at once.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2013 Greg Kroah-Hartman
+ * Copyright (c) 2013 The Linux Foundation
+ *
+ * This file is released undert the GPL v2.
+ *
+ */
+
+#include <linux/kobject.h>
+#include <linux/module.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/err.h>
+#include "sysfs.h"
+
+
+static void remove_files(struct kernfs_node *parent,
+			 const struct attribute_group *grp)
+{
+	struct attribute *const *attr;
+	struct bin_attribute *const *bin_attr;
+
+	if (grp->attrs)
+		for (attr = grp->attrs; *attr; attr++)
+			kernfs_remove_by_name(parent, (*attr)->name);
+	if (grp->bin_attrs)
+		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
+			kernfs_remove_by_name(parent, (*bin_attr)->attr.name);
+}
+
+static int create_files(struct kernfs_node *parent, struct kobject *kobj,
+			const struct attribute_group *grp, int update)
+{
+	struct attribute *const *attr;
+	struct bin_attribute *const *bin_attr;
+	int error = 0, i;
+
+	if (grp->attrs) {
+		for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
+			umode_t mode = (*attr)->mode;
+
+			/*
+			 * In update mode, we're changing the permissions or
+			 * visibility.  Do this by first removing then
+			 * re-adding (if required) the file.
+			 */
+			if (update)
+				kernfs_remove_by_name(parent, (*attr)->name);
+			if (grp->is_visible) {
+				mode = grp->is_visible(kobj, *attr, i);
+				if (!mode)
+					continue;
+			}
+
+			WARN(mode & ~(SYSFS_PREALLOC | 0664),
+			     "Attribute %s: Invalid permissions 0%o\n",
+			     (*attr)->name, mode);
+
+			mode &= SYSFS_PREALLOC | 0664;
+			error = sysfs_add_file_mode_ns(parent, *attr, false,
+						       mode, NULL);
+			if (unlikely(error))
+				break;
+		}
+		if (error) {
+			remove_files(parent, grp);
+			goto exit;
+		}
+	}
+
+	if (grp->bin_attrs) {
+		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+			if (update)
+				kernfs_remove_by_name(parent,
+						(*bin_attr)->attr.name);
+			error = sysfs_add_file_mode_ns(parent,
+					&(*bin_attr)->attr, true,
+					(*bin_attr)->attr.mode, NULL);
+			if (error)
+				break;
+		}
+		if (error)
+			remove_files(parent, grp);
+	}
+exit:
+	return error;
+}
+
+
+static int internal_create_group(struct kobject *kobj, int update,
+				 const struct attribute_group *grp)
+{
+	struct kernfs_node *kn;
+	int error;
+
+	BUG_ON(!kobj || (!update && !kobj->sd));
+
+	/* Updates may happen before the object has been instantiated */
+	if (unlikely(update && !kobj->sd))
+		return -EINVAL;
+	if (!grp->attrs && !grp->bin_attrs) {
+		WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
+			kobj->name, grp->name ?: "");
+		return -EINVAL;
+	}
+	if (grp->name) {
+		kn = kernfs_create_dir(kobj->sd, grp->name,
+				       S_IRWXU | S_IRUGO | S_IXUGO, kobj);
+		if (IS_ERR(kn)) {
+			if (PTR_ERR(kn) == -EEXIST)
+				sysfs_warn_dup(kobj->sd, grp->name);
+			return PTR_ERR(kn);
+		}
+	} else
+		kn = kobj->sd;
+	kernfs_get(kn);
+	error = create_files(kn, kobj, grp, update);
+	if (error) {
+		if (grp->name)
+			kernfs_remove(kn);
+	}
+	kernfs_put(kn);
+	return error;
+}
+
+/**
+ * sysfs_create_group - given a directory kobject, create an attribute group
+ * @kobj:	The kobject to create the group on
+ * @grp:	The attribute group to create
+ *
+ * This function creates a group for the first time.  It will explicitly
+ * warn and error if any of the attribute files being created already exist.
+ *
+ * Returns 0 on success or error.
+ */
+int sysfs_create_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	return internal_create_group(kobj, 0, grp);
+}
+EXPORT_SYMBOL_GPL(sysfs_create_group);
+
+/**
+ * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups
+ * @kobj:	The kobject to create the group on
+ * @groups:	The attribute groups to create, NULL terminated
+ *
+ * This function creates a bunch of attribute groups.  If an error occurs when
+ * creating a group, all previously created groups will be removed, unwinding
+ * everything back to the original state when this function was called.
+ * It will explicitly warn and error if any of the attribute files being
+ * created already exist.
+ *
+ * Returns 0 on success or error code from sysfs_create_group on error.
+ */
+int sysfs_create_groups(struct kobject *kobj,
+			const struct attribute_group **groups)
+{
+	int error = 0;
+	int i;
+
+	if (!groups)
+		return 0;
+
+	for (i = 0; groups[i]; i++) {
+		error = sysfs_create_group(kobj, groups[i]);
+		if (error) {
+			while (--i >= 0)
+				sysfs_remove_group(kobj, groups[i]);
+			break;
+		}
+	}
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_create_groups);
+
+/**
+ * sysfs_update_group - given a directory kobject, update an attribute group
+ * @kobj:	The kobject to update the group on
+ * @grp:	The attribute group to update
+ *
+ * This function updates an attribute group.  Unlike
+ * sysfs_create_group(), it will explicitly not warn or error if any
+ * of the attribute files being created already exist.  Furthermore,
+ * if the visibility of the files has changed through the is_visible()
+ * callback, it will update the permissions and add or remove the
+ * relevant files.
+ *
+ * The primary use for this function is to call it after making a change
+ * that affects group visibility.
+ *
+ * Returns 0 on success or error.
+ */
+int sysfs_update_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	return internal_create_group(kobj, 1, grp);
+}
+EXPORT_SYMBOL_GPL(sysfs_update_group);
+
+/**
+ * sysfs_remove_group: remove a group from a kobject
+ * @kobj:	kobject to remove the group from
+ * @grp:	group to remove
+ *
+ * This function removes a group of attributes from a kobject.  The attributes
+ * previously have to have been created for this group, otherwise it will fail.
+ */
+void sysfs_remove_group(struct kobject *kobj,
+			const struct attribute_group *grp)
+{
+	struct kernfs_node *parent = kobj->sd;
+	struct kernfs_node *kn;
+
+	if (grp->name) {
+		kn = kernfs_find_and_get(parent, grp->name);
+		if (!kn) {
+			WARN(!kn, KERN_WARNING
+			     "sysfs group %p not found for kobject '%s'\n",
+			     grp, kobject_name(kobj));
+			return;
+		}
+	} else {
+		kn = parent;
+		kernfs_get(kn);
+	}
+
+	remove_files(kn, grp);
+	if (grp->name)
+		kernfs_remove(kn);
+
+	kernfs_put(kn);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_group);
+
+/**
+ * sysfs_remove_groups - remove a list of groups
+ *
+ * @kobj:	The kobject for the groups to be removed from
+ * @groups:	NULL terminated list of groups to be removed
+ *
+ * If groups is not NULL, remove the specified groups from the kobject.
+ */
+void sysfs_remove_groups(struct kobject *kobj,
+			 const struct attribute_group **groups)
+{
+	int i;
+
+	if (!groups)
+		return;
+	for (i = 0; groups[i]; i++)
+		sysfs_remove_group(kobj, groups[i]);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_groups);
+
+/**
+ * sysfs_merge_group - merge files into a pre-existing attribute group.
+ * @kobj:	The kobject containing the group.
+ * @grp:	The files to create and the attribute group they belong to.
+ *
+ * This function returns an error if the group doesn't exist or any of the
+ * files already exist in that group, in which case none of the new files
+ * are created.
+ */
+int sysfs_merge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	struct kernfs_node *parent;
+	int error = 0;
+	struct attribute *const *attr;
+	int i;
+
+	parent = kernfs_find_and_get(kobj->sd, grp->name);
+	if (!parent)
+		return -ENOENT;
+
+	for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
+		error = sysfs_add_file(parent, *attr, false);
+	if (error) {
+		while (--i >= 0)
+			kernfs_remove_by_name(parent, (*--attr)->name);
+	}
+	kernfs_put(parent);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_merge_group);
+
+/**
+ * sysfs_unmerge_group - remove files from a pre-existing attribute group.
+ * @kobj:	The kobject containing the group.
+ * @grp:	The files to remove and the attribute group they belong to.
+ */
+void sysfs_unmerge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	struct kernfs_node *parent;
+	struct attribute *const *attr;
+
+	parent = kernfs_find_and_get(kobj->sd, grp->name);
+	if (parent) {
+		for (attr = grp->attrs; *attr; ++attr)
+			kernfs_remove_by_name(parent, (*attr)->name);
+		kernfs_put(parent);
+	}
+}
+EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
+
+/**
+ * sysfs_add_link_to_group - add a symlink to an attribute group.
+ * @kobj:	The kobject containing the group.
+ * @group_name:	The name of the group.
+ * @target:	The target kobject of the symlink to create.
+ * @link_name:	The name of the symlink to create.
+ */
+int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
+			    struct kobject *target, const char *link_name)
+{
+	struct kernfs_node *parent;
+	int error = 0;
+
+	parent = kernfs_find_and_get(kobj->sd, group_name);
+	if (!parent)
+		return -ENOENT;
+
+	error = sysfs_create_link_sd(parent, target, link_name);
+	kernfs_put(parent);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
+
+/**
+ * sysfs_remove_link_from_group - remove a symlink from an attribute group.
+ * @kobj:	The kobject containing the group.
+ * @group_name:	The name of the group.
+ * @link_name:	The name of the symlink to remove.
+ */
+void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
+				  const char *link_name)
+{
+	struct kernfs_node *parent;
+
+	parent = kernfs_find_and_get(kobj->sd, group_name);
+	if (parent) {
+		kernfs_remove_by_name(parent, link_name);
+		kernfs_put(parent);
+	}
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
diff --git a/kernel/fs/sysfs/mount.c b/kernel/fs/sysfs/mount.c
new file mode 100644
index 000000000..1c6ac6fce
--- /dev/null
+++ b/kernel/fs/sysfs/mount.c
@@ -0,0 +1,79 @@
+/*
+ * fs/sysfs/symlink.c - operations for initializing and mounting sysfs
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
+ */
+
+#define DEBUG
+
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/user_namespace.h>
+
+#include "sysfs.h"
+
+static struct kernfs_root *sysfs_root;
+struct kernfs_node *sysfs_root_kn;
+
+static struct dentry *sysfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	struct dentry *root;
+	void *ns;
+	bool new_sb;
+
+	if (!(flags & MS_KERNMOUNT)) {
+		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
+			return ERR_PTR(-EPERM);
+	}
+
+	ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+	root = kernfs_mount_ns(fs_type, flags, sysfs_root,
+				SYSFS_MAGIC, &new_sb, ns);
+	if (IS_ERR(root) || !new_sb)
+		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
+	return root;
+}
+
+static void sysfs_kill_sb(struct super_block *sb)
+{
+	void *ns = (void *)kernfs_super_ns(sb);
+
+	kernfs_kill_sb(sb);
+	kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
+}
+
+static struct file_system_type sysfs_fs_type = {
+	.name		= "sysfs",
+	.mount		= sysfs_mount,
+	.kill_sb	= sysfs_kill_sb,
+	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+};
+
+int __init sysfs_init(void)
+{
+	int err;
+
+	sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
+					NULL);
+	if (IS_ERR(sysfs_root))
+		return PTR_ERR(sysfs_root);
+
+	sysfs_root_kn = sysfs_root->kn;
+
+	err = register_filesystem(&sysfs_fs_type);
+	if (err) {
+		kernfs_destroy_root(sysfs_root);
+		return err;
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/sysfs/symlink.c b/kernel/fs/sysfs/symlink.c
new file mode 100644
index 000000000..aecb15f84
--- /dev/null
+++ b/kernel/fs/sysfs/symlink.c
@@ -0,0 +1,197 @@
+/*
+ * fs/sysfs/symlink.c - sysfs symlink implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/mutex.h>
+#include <linux/security.h>
+
+#include "sysfs.h"
+
+static int sysfs_do_create_link_sd(struct kernfs_node *parent,
+				   struct kobject *target_kobj,
+				   const char *name, int warn)
+{
+	struct kernfs_node *kn, *target = NULL;
+
+	BUG_ON(!name || !parent);
+
+	/*
+	 * We don't own @target_kobj and it may be removed at any time.
+	 * Synchronize using sysfs_symlink_target_lock.  See
+	 * sysfs_remove_dir() for details.
+	 */
+	spin_lock(&sysfs_symlink_target_lock);
+	if (target_kobj->sd) {
+		target = target_kobj->sd;
+		kernfs_get(target);
+	}
+	spin_unlock(&sysfs_symlink_target_lock);
+
+	if (!target)
+		return -ENOENT;
+
+	kn = kernfs_create_link(parent, name, target);
+	kernfs_put(target);
+
+	if (!IS_ERR(kn))
+		return 0;
+
+	if (warn && PTR_ERR(kn) == -EEXIST)
+		sysfs_warn_dup(parent, name);
+	return PTR_ERR(kn);
+}
+
+/**
+ *	sysfs_create_link_sd - create symlink to a given object.
+ *	@kn:		directory we're creating the link in.
+ *	@target:	object we're pointing to.
+ *	@name:		name of the symlink.
+ */
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
+			 const char *name)
+{
+	return sysfs_do_create_link_sd(kn, target, name, 1);
+}
+
+static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
+				const char *name, int warn)
+{
+	struct kernfs_node *parent = NULL;
+
+	if (!kobj)
+		parent = sysfs_root_kn;
+	else
+		parent = kobj->sd;
+
+	if (!parent)
+		return -EFAULT;
+
+	return sysfs_do_create_link_sd(parent, target, name, warn);
+}
+
+/**
+ *	sysfs_create_link - create symlink between two objects.
+ *	@kobj:	object whose directory we're creating the link in.
+ *	@target:	object we're pointing to.
+ *	@name:		name of the symlink.
+ */
+int sysfs_create_link(struct kobject *kobj, struct kobject *target,
+		      const char *name)
+{
+	return sysfs_do_create_link(kobj, target, name, 1);
+}
+EXPORT_SYMBOL_GPL(sysfs_create_link);
+
+/**
+ *	sysfs_create_link_nowarn - create symlink between two objects.
+ *	@kobj:	object whose directory we're creating the link in.
+ *	@target:	object we're pointing to.
+ *	@name:		name of the symlink.
+ *
+ *	This function does the same as sysfs_create_link(), but it
+ *	doesn't warn if the link already exists.
+ */
+int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
+			     const char *name)
+{
+	return sysfs_do_create_link(kobj, target, name, 0);
+}
+
+/**
+ *	sysfs_delete_link - remove symlink in object's directory.
+ *	@kobj:	object we're acting for.
+ *	@targ:	object we're pointing to.
+ *	@name:	name of the symlink to remove.
+ *
+ *	Unlike sysfs_remove_link sysfs_delete_link has enough information
+ *	to successfully delete symlinks in tagged directories.
+ */
+void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
+			const char *name)
+{
+	const void *ns = NULL;
+
+	/*
+	 * We don't own @target and it may be removed at any time.
+	 * Synchronize using sysfs_symlink_target_lock.  See
+	 * sysfs_remove_dir() for details.
+	 */
+	spin_lock(&sysfs_symlink_target_lock);
+	if (targ->sd && kernfs_ns_enabled(kobj->sd))
+		ns = targ->sd->ns;
+	spin_unlock(&sysfs_symlink_target_lock);
+	kernfs_remove_by_name_ns(kobj->sd, name, ns);
+}
+
+/**
+ *	sysfs_remove_link - remove symlink in object's directory.
+ *	@kobj:	object we're acting for.
+ *	@name:	name of the symlink to remove.
+ */
+void sysfs_remove_link(struct kobject *kobj, const char *name)
+{
+	struct kernfs_node *parent = NULL;
+
+	if (!kobj)
+		parent = sysfs_root_kn;
+	else
+		parent = kobj->sd;
+
+	kernfs_remove_by_name(parent, name);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_link);
+
+/**
+ *	sysfs_rename_link_ns - rename symlink in object's directory.
+ *	@kobj:	object we're acting for.
+ *	@targ:	object we're pointing to.
+ *	@old:	previous name of the symlink.
+ *	@new:	new name of the symlink.
+ *	@new_ns: new namespace of the symlink.
+ *
+ *	A helper function for the common rename symlink idiom.
+ */
+int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
+			 const char *old, const char *new, const void *new_ns)
+{
+	struct kernfs_node *parent, *kn = NULL;
+	const void *old_ns = NULL;
+	int result;
+
+	if (!kobj)
+		parent = sysfs_root_kn;
+	else
+		parent = kobj->sd;
+
+	if (targ->sd)
+		old_ns = targ->sd->ns;
+
+	result = -ENOENT;
+	kn = kernfs_find_and_get_ns(parent, old, old_ns);
+	if (!kn)
+		goto out;
+
+	result = -EINVAL;
+	if (kernfs_type(kn) != KERNFS_LINK)
+		goto out;
+	if (kn->symlink.target_kn->priv != targ)
+		goto out;
+
+	result = kernfs_rename_ns(kn, parent, new, new_ns);
+
+out:
+	kernfs_put(kn);
+	return result;
+}
+EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
diff --git a/kernel/fs/sysfs/sysfs.h b/kernel/fs/sysfs/sysfs.h
new file mode 100644
index 000000000..0e2f1cccb
--- /dev/null
+++ b/kernel/fs/sysfs/sysfs.h
@@ -0,0 +1,43 @@
+/*
+ * fs/sysfs/sysfs.h - sysfs internal header file
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef __SYSFS_INTERNAL_H
+#define __SYSFS_INTERNAL_H
+
+#include <linux/sysfs.h>
+
+/*
+ * mount.c
+ */
+extern struct kernfs_node *sysfs_root_kn;
+
+/*
+ * dir.c
+ */
+extern spinlock_t sysfs_symlink_target_lock;
+
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
+
+/*
+ * file.c
+ */
+int sysfs_add_file(struct kernfs_node *parent,
+		   const struct attribute *attr, bool is_bin);
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+			   const struct attribute *attr, bool is_bin,
+			   umode_t amode, const void *ns);
+
+/*
+ * symlink.c
+ */
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
+			 const char *name);
+
+#endif	/* __SYSFS_INTERNAL_H */
diff --git a/kernel/fs/sysv/Kconfig b/kernel/fs/sysv/Kconfig
new file mode 100644
index 000000000..33aeb4b75
--- /dev/null
+++ b/kernel/fs/sysv/Kconfig
@@ -0,0 +1,36 @@
+config SYSV_FS
+	tristate "System V/Xenix/V7/Coherent file system support"
+	depends on BLOCK
+	help
+	  SCO, Xenix and Coherent are commercial Unix systems for Intel
+	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
+	  here would allow you to read from their floppies and hard disk
+	  partitions.
+
+	  If you have floppies or hard disk partitions like that, it is likely
+	  that they contain binaries from those other Unix systems; in order
+	  to run these binaries, you will want to install linux-abi which is
+	  a set of kernel modules that lets you run SCO, Xenix, Wyse,
+	  UnixWare, Dell Unix and System V programs under Linux.  It is
+	  available via FTP (user: ftp) from
+	  <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
+	  NOTE: that will work only for binaries from Intel-based systems;
+	  PDP ones will have to wait until somebody ports Linux to -11 ;-)
+
+	  If you only intend to mount files from some other Unix over the
+	  network using NFS, you don't need the System V file system support
+	  (but you need NFS file system support obviously).
+
+	  Note that this option is generally not needed for floppies, since a
+	  good portable way to transport files and directories between unixes
+	  (and even other operating systems) is given by the tar program ("man
+	  tar" or preferably "info tar").  Note also that this option has
+	  nothing whatsoever to do with the option "System V IPC". Read about
+	  the System V file system in
+	  <file:Documentation/filesystems/sysv-fs.txt>.
+	  Saying Y here will enlarge your kernel by about 27 KB.
+
+	  To compile this as a module, choose M here: the module will be called
+	  sysv.
+
+	  If you haven't heard about all of this before, it's safe to say N.
diff --git a/kernel/fs/sysv/Makefile b/kernel/fs/sysv/Makefile
new file mode 100644
index 000000000..3591f9d7a
--- /dev/null
+++ b/kernel/fs/sysv/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux SystemV/Coherent filesystem routines.
+#
+
+obj-$(CONFIG_SYSV_FS) += sysv.o
+
+sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \
+	     namei.o super.o symlink.o
diff --git a/kernel/fs/sysv/balloc.c b/kernel/fs/sysv/balloc.c
new file mode 100644
index 000000000..921c053fc
--- /dev/null
+++ b/kernel/fs/sysv/balloc.c
@@ -0,0 +1,239 @@
+/*
+ *  linux/fs/sysv/balloc.c
+ *
+ *  minix/bitmap.c
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext/freelists.c
+ *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
+ *
+ *  xenix/alloc.c
+ *  Copyright (C) 1992  Doug Evans
+ *
+ *  coh/alloc.c
+ *  Copyright (C) 1993  Pascal Haible, Bruno Haible
+ *
+ *  sysv/balloc.c
+ *  Copyright (C) 1993  Bruno Haible
+ *
+ *  This file contains code for allocating/freeing blocks.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include "sysv.h"
+
+/* We don't trust the value of
+   sb->sv_sbd2->s_tfree = *sb->sv_free_blocks
+   but we nevertheless keep it up to date. */
+
+static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh)
+{
+	char *bh_data = bh->b_data;
+
+	if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4)
+		return (sysv_zone_t*)(bh_data+4);
+	else
+		return (sysv_zone_t*)(bh_data+2);
+}
+
+/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */
+
+void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
+{
+	struct sysv_sb_info * sbi = SYSV_SB(sb);
+	struct buffer_head * bh;
+	sysv_zone_t *blocks = sbi->s_bcache;
+	unsigned count;
+	unsigned block = fs32_to_cpu(sbi, nr);
+
+	/*
+	 * This code does not work at all for AFS (it has a bitmap
+	 * free list).  As AFS is supposed to be read-only no one
+	 * should call this for an AFS filesystem anyway...
+	 */
+	if (sbi->s_type == FSTYPE_AFS)
+		return;
+
+	if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
+		printk("sysv_free_block: trying to free block not in datazone\n");
+		return;
+	}
+
+	mutex_lock(&sbi->s_lock);
+	count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
+
+	if (count > sbi->s_flc_size) {
+		printk("sysv_free_block: flc_count > flc_size\n");
+		mutex_unlock(&sbi->s_lock);
+		return;
+	}
+	/* If the free list head in super-block is full, it is copied
+	 * into this block being freed, ditto if it's completely empty
+	 * (applies only on Coherent).
+	 */
+	if (count == sbi->s_flc_size || count == 0) {
+		block += sbi->s_block_base;
+		bh = sb_getblk(sb, block);
+		if (!bh) {
+			printk("sysv_free_block: getblk() failed\n");
+			mutex_unlock(&sbi->s_lock);
+			return;
+		}
+		memset(bh->b_data, 0, sb->s_blocksize);
+		*(__fs16*)bh->b_data = cpu_to_fs16(sbi, count);
+		memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t));
+		mark_buffer_dirty(bh);
+		set_buffer_uptodate(bh);
+		brelse(bh);
+		count = 0;
+	}
+	sbi->s_bcache[count++] = nr;
+
+	*sbi->s_bcache_count = cpu_to_fs16(sbi, count);
+	fs32_add(sbi, sbi->s_free_blocks, 1);
+	dirty_sb(sb);
+	mutex_unlock(&sbi->s_lock);
+}
+
+sysv_zone_t sysv_new_block(struct super_block * sb)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	unsigned int block;
+	sysv_zone_t nr;
+	struct buffer_head * bh;
+	unsigned count;
+
+	mutex_lock(&sbi->s_lock);
+	count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
+
+	if (count == 0) /* Applies only to Coherent FS */
+		goto Enospc;
+	nr = sbi->s_bcache[--count];
+	if (nr == 0)  /* Applies only to Xenix FS, SystemV FS */
+		goto Enospc;
+
+	block = fs32_to_cpu(sbi, nr);
+
+	*sbi->s_bcache_count = cpu_to_fs16(sbi, count);
+
+	if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
+		printk("sysv_new_block: new block %d is not in data zone\n",
+			block);
+		goto Enospc;
+	}
+
+	if (count == 0) { /* the last block continues the free list */
+		unsigned count;
+
+		block += sbi->s_block_base;
+		if (!(bh = sb_bread(sb, block))) {
+			printk("sysv_new_block: cannot read free-list block\n");
+			/* retry this same block next time */
+			*sbi->s_bcache_count = cpu_to_fs16(sbi, 1);
+			goto Enospc;
+		}
+		count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data);
+		if (count > sbi->s_flc_size) {
+			printk("sysv_new_block: free-list block with >flc_size entries\n");
+			brelse(bh);
+			goto Enospc;
+		}
+		*sbi->s_bcache_count = cpu_to_fs16(sbi, count);
+		memcpy(sbi->s_bcache, get_chunk(sb, bh),
+				count * sizeof(sysv_zone_t));
+		brelse(bh);
+	}
+	/* Now the free list head in the superblock is valid again. */
+	fs32_add(sbi, sbi->s_free_blocks, -1);
+	dirty_sb(sb);
+	mutex_unlock(&sbi->s_lock);
+	return nr;
+
+Enospc:
+	mutex_unlock(&sbi->s_lock);
+	return 0;
+}
+
+unsigned long sysv_count_free_blocks(struct super_block * sb)
+{
+	struct sysv_sb_info * sbi = SYSV_SB(sb);
+	int sb_count;
+	int count;
+	struct buffer_head * bh = NULL;
+	sysv_zone_t *blocks;
+	unsigned block;
+	int n;
+
+	/*
+	 * This code does not work at all for AFS (it has a bitmap
+	 * free list).  As AFS is supposed to be read-only we just
+	 * lie and say it has no free block at all.
+	 */
+	if (sbi->s_type == FSTYPE_AFS)
+		return 0;
+
+	mutex_lock(&sbi->s_lock);
+	sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks);
+
+	if (0)
+		goto trust_sb;
+
+	/* this causes a lot of disk traffic ... */
+	count = 0;
+	n = fs16_to_cpu(sbi, *sbi->s_bcache_count);
+	blocks = sbi->s_bcache;
+	while (1) {
+		sysv_zone_t zone;
+		if (n > sbi->s_flc_size)
+			goto E2big;
+		zone = 0;
+		while (n && (zone = blocks[--n]) != 0)
+			count++;
+		if (zone == 0)
+			break;
+
+		block = fs32_to_cpu(sbi, zone);
+		if (bh)
+			brelse(bh);
+
+		if (block < sbi->s_firstdatazone || block >= sbi->s_nzones)
+			goto Einval;
+		block += sbi->s_block_base;
+		bh = sb_bread(sb, block);
+		if (!bh)
+			goto Eio;
+		n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data);
+		blocks = get_chunk(sb, bh);
+	}
+	if (bh)
+		brelse(bh);
+	if (count != sb_count)
+		goto Ecount;
+done:
+	mutex_unlock(&sbi->s_lock);
+	return count;
+
+Einval:
+	printk("sysv_count_free_blocks: new block %d is not in data zone\n",
+		block);
+	goto trust_sb;
+Eio:
+	printk("sysv_count_free_blocks: cannot read free-list block\n");
+	goto trust_sb;
+E2big:
+	printk("sysv_count_free_blocks: >flc_size entries in free-list block\n");
+	if (bh)
+		brelse(bh);
+trust_sb:
+	count = sb_count;
+	goto done;
+Ecount:
+	printk("sysv_count_free_blocks: free block count was %d, "
+		"correcting to %d\n", sb_count, count);
+	if (!(sb->s_flags & MS_RDONLY)) {
+		*sbi->s_free_blocks = cpu_to_fs32(sbi, count);
+		dirty_sb(sb);
+	}
+	goto done;
+}
diff --git a/kernel/fs/sysv/dir.c b/kernel/fs/sysv/dir.c
new file mode 100644
index 000000000..8f3555f00
--- /dev/null
+++ b/kernel/fs/sysv/dir.c
@@ -0,0 +1,372 @@
+/*
+ *  linux/fs/sysv/dir.c
+ *
+ *  minix/dir.c
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  coh/dir.c
+ *  Copyright (C) 1993  Pascal Haible, Bruno Haible
+ *
+ *  sysv/dir.c
+ *  Copyright (C) 1993  Bruno Haible
+ *
+ *  SystemV/Coherent directory handling functions
+ */
+
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include "sysv.h"
+
+static int sysv_readdir(struct file *, struct dir_context *);
+
+const struct file_operations sysv_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= sysv_readdir,
+	.fsync		= generic_file_fsync,
+};
+
+static inline void dir_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
+
+static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	int err = 0;
+
+	block_write_end(NULL, mapping, pos, len, len, page, NULL);
+	if (pos+len > dir->i_size) {
+		i_size_write(dir, pos+len);
+		mark_inode_dirty(dir);
+	}
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+	return err;
+}
+
+static struct page * dir_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+	if (!IS_ERR(page))
+		kmap(page);
+	return page;
+}
+
+static int sysv_readdir(struct file *file, struct dir_context *ctx)
+{
+	unsigned long pos = ctx->pos;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	unsigned long npages = dir_pages(inode);
+	unsigned offset;
+	unsigned long n;
+
+	ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
+	if (pos >= inode->i_size)
+		return 0;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+	n = pos >> PAGE_CACHE_SHIFT;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct sysv_dir_entry *de;
+		struct page *page = dir_get_page(inode, n);
+
+		if (IS_ERR(page))
+			continue;
+		kaddr = (char *)page_address(page);
+		de = (struct sysv_dir_entry *)(kaddr+offset);
+		limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+		for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
+			char *name = de->name;
+
+			if (!de->inode)
+				continue;
+
+			if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
+					fs16_to_cpu(SYSV_SB(sb), de->inode),
+					DT_UNKNOWN)) {
+				dir_put_page(page);
+				return 0;
+			}
+		}
+		dir_put_page(page);
+	}
+	return 0;
+}
+
+/* compare strings: name[0..len-1] (not zero-terminated) and
+ * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1])
+ */
+static inline int namecompare(int len, int maxlen,
+	const char * name, const char * buffer)
+{
+	if (len < maxlen && buffer[len])
+		return 0;
+	return !memcmp(name, buffer, len);
+}
+
+/*
+ *	sysv_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the cache buffer in which the entry was found, and the entry
+ * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * entry - you'll have to do that yourself if you want to.
+ */
+struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page)
+{
+	const char * name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct inode * dir = d_inode(dentry->d_parent);
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	struct sysv_dir_entry *de;
+
+	*res_page = NULL;
+
+	start = SYSV_I(dir)->i_dir_start_lookup;
+	if (start >= npages)
+		start = 0;
+	n = start;
+
+	do {
+		char *kaddr;
+		page = dir_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = (char*)page_address(page);
+			de = (struct sysv_dir_entry *) kaddr;
+			kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+			for ( ; (char *) de <= kaddr ; de++) {
+				if (!de->inode)
+					continue;
+				if (namecompare(namelen, SYSV_NAMELEN,
+							name, de->name))
+					goto found;
+			}
+			dir_put_page(page);
+		}
+
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+
+	return NULL;
+
+found:
+	SYSV_I(dir)->i_dir_start_lookup = n;
+	*res_page = page;
+	return de;
+}
+
+int sysv_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	const char * name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct page *page = NULL;
+	struct sysv_dir_entry * de;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	loff_t pos;
+	int err;
+
+	/* We take care of directory expansion in the same loop */
+	for (n = 0; n <= npages; n++) {
+		page = dir_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		kaddr = (char*)page_address(page);
+		de = (struct sysv_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+		while ((char *)de <= kaddr) {
+			if (!de->inode)
+				goto got_it;
+			err = -EEXIST;
+			if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) 
+				goto out_page;
+			de++;
+		}
+		dir_put_page(page);
+	}
+	BUG();
+	return -EINVAL;
+
+got_it:
+	pos = page_offset(page) +
+			(char*)de - (char*)page_address(page);
+	lock_page(page);
+	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+	if (err)
+		goto out_unlock;
+	memcpy (de->name, name, namelen);
+	memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
+	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
+	err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+out_page:
+	dir_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_page;
+}
+
+int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	char *kaddr = (char*)page_address(page);
+	loff_t pos = page_offset(page) + (char *)de - kaddr;
+	int err;
+
+	lock_page(page);
+	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+	BUG_ON(err);
+	de->inode = 0;
+	err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+	dir_put_page(page);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+	return err;
+}
+
+int sysv_make_empty(struct inode *inode, struct inode *dir)
+{
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
+	struct sysv_dir_entry * de;
+	char *base;
+	int err;
+
+	if (!page)
+		return -ENOMEM;
+	err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+	kmap(page);
+
+	base = (char*)page_address(page);
+	memset(base, 0, PAGE_CACHE_SIZE);
+
+	de = (struct sysv_dir_entry *) base;
+	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
+	strcpy(de->name,".");
+	de++;
+	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino);
+	strcpy(de->name,"..");
+
+	kunmap(page);
+	err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int sysv_empty_dir(struct inode * inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct sysv_dir_entry * de;
+		page = dir_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = (char *)page_address(page);
+		de = (struct sysv_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE-SYSV_DIRSIZE;
+
+		for ( ;(char *)de <= kaddr; de++) {
+			if (!de->inode)
+				continue;
+			/* check for . and .. */
+			if (de->name[0] != '.')
+				goto not_empty;
+			if (!de->name[1]) {
+				if (de->inode == cpu_to_fs16(SYSV_SB(sb),
+							inode->i_ino))
+					continue;
+				goto not_empty;
+			}
+			if (de->name[1] != '.' || de->name[2])
+				goto not_empty;
+		}
+		dir_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	dir_put_page(page);
+	return 0;
+}
+
+/* Releases the page */
+void sysv_set_link(struct sysv_dir_entry *de, struct page *page,
+	struct inode *inode)
+{
+	struct inode *dir = page->mapping->host;
+	loff_t pos = page_offset(page) +
+			(char *)de-(char*)page_address(page);
+	int err;
+
+	lock_page(page);
+	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+	BUG_ON(err);
+	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
+	err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+	dir_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+}
+
+struct sysv_dir_entry * sysv_dotdot (struct inode *dir, struct page **p)
+{
+	struct page *page = dir_get_page(dir, 0);
+	struct sysv_dir_entry *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = (struct sysv_dir_entry*) page_address(page) + 1;
+		*p = page;
+	}
+	return de;
+}
+
+ino_t sysv_inode_by_name(struct dentry *dentry)
+{
+	struct page *page;
+	struct sysv_dir_entry *de = sysv_find_entry (dentry, &page);
+	ino_t res = 0;
+	
+	if (de) {
+		res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode);
+		dir_put_page(page);
+	}
+	return res;
+}
diff --git a/kernel/fs/sysv/file.c b/kernel/fs/sysv/file.c
new file mode 100644
index 000000000..82ddc0906
--- /dev/null
+++ b/kernel/fs/sysv/file.c
@@ -0,0 +1,57 @@
+/*
+ *  linux/fs/sysv/file.c
+ *
+ *  minix/file.c
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  coh/file.c
+ *  Copyright (C) 1993  Pascal Haible, Bruno Haible
+ *
+ *  sysv/file.c
+ *  Copyright (C) 1993  Bruno Haible
+ *
+ *  SystemV/Coherent regular file handling primitives
+ */
+
+#include "sysv.h"
+
+/*
+ * We have mostly NULLs here: the current defaults are OK for
+ * the coh filesystem.
+ */
+const struct file_operations sysv_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.fsync		= generic_file_fsync,
+	.splice_read	= generic_file_splice_read,
+};
+
+static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+		truncate_setsize(inode, attr->ia_size);
+		sysv_truncate(inode);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+const struct inode_operations sysv_file_inode_operations = {
+	.setattr	= sysv_setattr,
+	.getattr	= sysv_getattr,
+};
diff --git a/kernel/fs/sysv/ialloc.c b/kernel/fs/sysv/ialloc.c
new file mode 100644
index 000000000..f9db4eb31
--- /dev/null
+++ b/kernel/fs/sysv/ialloc.c
@@ -0,0 +1,234 @@
+/*
+ *  linux/fs/sysv/ialloc.c
+ *
+ *  minix/bitmap.c
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext/freelists.c
+ *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
+ *
+ *  xenix/alloc.c
+ *  Copyright (C) 1992  Doug Evans
+ *
+ *  coh/alloc.c
+ *  Copyright (C) 1993  Pascal Haible, Bruno Haible
+ *
+ *  sysv/ialloc.c
+ *  Copyright (C) 1993  Bruno Haible
+ *
+ *  This file contains code for allocating/freeing inodes.
+ */
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/sched.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include "sysv.h"
+
+/* We don't trust the value of
+   sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes
+   but we nevertheless keep it up to date. */
+
+/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. */
+
+/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */
+static inline sysv_ino_t *
+sv_sb_fic_inode(struct super_block * sb, unsigned int i)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+
+	if (sbi->s_bh1 == sbi->s_bh2)
+		return &sbi->s_sb_fic_inodes[i];
+	else {
+		/* 512 byte Xenix FS */
+		unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]);
+		if (offset < 512)
+			return (sysv_ino_t*)(sbi->s_sbd1 + offset);
+		else
+			return (sysv_ino_t*)(sbi->s_sbd2 + offset);
+	}
+}
+
+struct sysv_inode *
+sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	struct sysv_inode *res;
+	int block = sbi->s_firstinodezone + sbi->s_block_base;
+
+	block += (ino-1) >> sbi->s_inodes_per_block_bits;
+	*bh = sb_bread(sb, block);
+	if (!*bh)
+		return NULL;
+	res = (struct sysv_inode *)(*bh)->b_data;
+	return res + ((ino-1) & sbi->s_inodes_per_block_1);
+}
+
+static int refill_free_cache(struct super_block *sb)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	struct buffer_head * bh;
+	struct sysv_inode * raw_inode;
+	int i = 0, ino;
+
+	ino = SYSV_ROOT_INO+1;
+	raw_inode = sysv_raw_inode(sb, ino, &bh);
+	if (!raw_inode)
+		goto out;
+	while (ino <= sbi->s_ninodes) {
+		if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) {
+			*sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino);
+			if (i == sbi->s_fic_size)
+				break;
+		}
+		if ((ino++ & sbi->s_inodes_per_block_1) == 0) {
+			brelse(bh);
+			raw_inode = sysv_raw_inode(sb, ino, &bh);
+			if (!raw_inode)
+				goto out;
+		} else
+			raw_inode++;
+	}
+	brelse(bh);
+out:
+	return i;
+}
+
+void sysv_free_inode(struct inode * inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	unsigned int ino;
+	struct buffer_head * bh;
+	struct sysv_inode * raw_inode;
+	unsigned count;
+
+	sb = inode->i_sb;
+	ino = inode->i_ino;
+	if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) {
+		printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n");
+		return;
+	}
+	raw_inode = sysv_raw_inode(sb, ino, &bh);
+	if (!raw_inode) {
+		printk("sysv_free_inode: unable to read inode block on device "
+		       "%s\n", inode->i_sb->s_id);
+		return;
+	}
+	mutex_lock(&sbi->s_lock);
+	count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
+	if (count < sbi->s_fic_size) {
+		*sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino);
+		*sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
+	}
+	fs16_add(sbi, sbi->s_sb_total_free_inodes, 1);
+	dirty_sb(sb);
+	memset(raw_inode, 0, sizeof(struct sysv_inode));
+	mark_buffer_dirty(bh);
+	mutex_unlock(&sbi->s_lock);
+	brelse(bh);
+}
+
+struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	struct inode *inode;
+	sysv_ino_t ino;
+	unsigned count;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE
+	};
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&sbi->s_lock);
+	count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
+	if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) {
+		count = refill_free_cache(sb);
+		if (count == 0) {
+			iput(inode);
+			mutex_unlock(&sbi->s_lock);
+			return ERR_PTR(-ENOSPC);
+		}
+	}
+	/* Now count > 0. */
+	ino = *sv_sb_fic_inode(sb,--count);
+	*sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
+	fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
+	dirty_sb(sb);
+	inode_init_owner(inode, dir, mode);
+	inode->i_ino = fs16_to_cpu(sbi, ino);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_blocks = 0;
+	memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
+	SYSV_I(inode)->i_dir_start_lookup = 0;
+	insert_inode_hash(inode);
+	mark_inode_dirty(inode);
+
+	sysv_write_inode(inode, &wbc);	/* ensure inode not allocated again */
+	mark_inode_dirty(inode);	/* cleared by sysv_write_inode() */
+	/* That's it. */
+	mutex_unlock(&sbi->s_lock);
+	return inode;
+}
+
+unsigned long sysv_count_free_inodes(struct super_block * sb)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	struct buffer_head * bh;
+	struct sysv_inode * raw_inode;
+	int ino, count, sb_count;
+
+	mutex_lock(&sbi->s_lock);
+
+	sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes);
+
+	if (0)
+		goto trust_sb;
+
+	/* this causes a lot of disk traffic ... */
+	count = 0;
+	ino = SYSV_ROOT_INO+1;
+	raw_inode = sysv_raw_inode(sb, ino, &bh);
+	if (!raw_inode)
+		goto Eio;
+	while (ino <= sbi->s_ninodes) {
+		if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0)
+			count++;
+		if ((ino++ & sbi->s_inodes_per_block_1) == 0) {
+			brelse(bh);
+			raw_inode = sysv_raw_inode(sb, ino, &bh);
+			if (!raw_inode)
+				goto Eio;
+		} else
+			raw_inode++;
+	}
+	brelse(bh);
+	if (count != sb_count)
+		goto Einval;
+out:
+	mutex_unlock(&sbi->s_lock);
+	return count;
+
+Einval:
+	printk("sysv_count_free_inodes: "
+		"free inode count was %d, correcting to %d\n",
+		sb_count, count);
+	if (!(sb->s_flags & MS_RDONLY)) {
+		*sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count);
+		dirty_sb(sb);
+	}
+	goto out;
+
+Eio:
+	printk("sysv_count_free_inodes: unable to read inode table\n");
+trust_sb:
+	count = sb_count;
+	goto out;
+}
diff --git a/kernel/fs/sysv/inode.c b/kernel/fs/sysv/inode.c
new file mode 100644
index 000000000..88956309c
--- /dev/null
+++ b/kernel/fs/sysv/inode.c
@@ -0,0 +1,370 @@
+/*
+ *  linux/fs/sysv/inode.c
+ *
+ *  minix/inode.c
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  xenix/inode.c
+ *  Copyright (C) 1992  Doug Evans
+ *
+ *  coh/inode.c
+ *  Copyright (C) 1993  Pascal Haible, Bruno Haible
+ *
+ *  sysv/inode.c
+ *  Copyright (C) 1993  Paul B. Monday
+ *
+ *  sysv/inode.c
+ *  Copyright (C) 1993  Bruno Haible
+ *  Copyright (C) 1997, 1998  Krzysztof G. Baranowski
+ *
+ *  This file contains code for allocating/freeing inodes and for read/writing
+ *  the superblock.
+ */
+
+#include <linux/highuid.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/writeback.h>
+#include <linux/namei.h>
+#include <asm/byteorder.h>
+#include "sysv.h"
+
+static int sysv_sync_fs(struct super_block *sb, int wait)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	unsigned long time = get_seconds(), old_time;
+
+	mutex_lock(&sbi->s_lock);
+
+	/*
+	 * If we are going to write out the super block,
+	 * then attach current time stamp.
+	 * But if the filesystem was marked clean, keep it clean.
+	 */
+	old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
+	if (sbi->s_type == FSTYPE_SYSV4) {
+		if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
+			*sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38 - time);
+		*sbi->s_sb_time = cpu_to_fs32(sbi, time);
+		mark_buffer_dirty(sbi->s_bh2);
+	}
+
+	mutex_unlock(&sbi->s_lock);
+
+	return 0;
+}
+
+static int sysv_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+
+	sync_filesystem(sb);
+	if (sbi->s_forced_ro)
+		*flags |= MS_RDONLY;
+	return 0;
+}
+
+static void sysv_put_super(struct super_block *sb)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/* XXX ext2 also updates the state here */
+		mark_buffer_dirty(sbi->s_bh1);
+		if (sbi->s_bh1 != sbi->s_bh2)
+			mark_buffer_dirty(sbi->s_bh2);
+	}
+
+	brelse(sbi->s_bh1);
+	if (sbi->s_bh1 != sbi->s_bh2)
+		brelse(sbi->s_bh2);
+
+	kfree(sbi);
+}
+
+static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = sbi->s_ndatazones;
+	buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb);
+	buf->f_files = sbi->s_ninodes;
+	buf->f_ffree = sysv_count_free_inodes(sb);
+	buf->f_namelen = SYSV_NAMELEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	return 0;
+}
+
+/* 
+ * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32
+ */
+static inline void read3byte(struct sysv_sb_info *sbi,
+	unsigned char * from, unsigned char * to)
+{
+	if (sbi->s_bytesex == BYTESEX_PDP) {
+		to[0] = from[0];
+		to[1] = 0;
+		to[2] = from[1];
+		to[3] = from[2];
+	} else if (sbi->s_bytesex == BYTESEX_LE) {
+		to[0] = from[0];
+		to[1] = from[1];
+		to[2] = from[2];
+		to[3] = 0;
+	} else {
+		to[0] = 0;
+		to[1] = from[0];
+		to[2] = from[1];
+		to[3] = from[2];
+	}
+}
+
+static inline void write3byte(struct sysv_sb_info *sbi,
+	unsigned char * from, unsigned char * to)
+{
+	if (sbi->s_bytesex == BYTESEX_PDP) {
+		to[0] = from[0];
+		to[1] = from[2];
+		to[2] = from[3];
+	} else if (sbi->s_bytesex == BYTESEX_LE) {
+		to[0] = from[0];
+		to[1] = from[1];
+		to[2] = from[2];
+	} else {
+		to[0] = from[1];
+		to[1] = from[2];
+		to[2] = from[3];
+	}
+}
+
+static const struct inode_operations sysv_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.getattr	= sysv_getattr,
+};
+
+void sysv_set_inode(struct inode *inode, dev_t rdev)
+{
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &sysv_file_inode_operations;
+		inode->i_fop = &sysv_file_operations;
+		inode->i_mapping->a_ops = &sysv_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &sysv_dir_inode_operations;
+		inode->i_fop = &sysv_dir_operations;
+		inode->i_mapping->a_ops = &sysv_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (inode->i_blocks) {
+			inode->i_op = &sysv_symlink_inode_operations;
+			inode->i_mapping->a_ops = &sysv_aops;
+		} else {
+			inode->i_op = &sysv_fast_symlink_inode_operations;
+			nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
+				sizeof(SYSV_I(inode)->i_data) - 1);
+		}
+	} else
+		init_special_inode(inode, inode->i_mode, rdev);
+}
+
+struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
+{
+	struct sysv_sb_info * sbi = SYSV_SB(sb);
+	struct buffer_head * bh;
+	struct sysv_inode * raw_inode;
+	struct sysv_inode_info * si;
+	struct inode *inode;
+	unsigned int block;
+
+	if (!ino || ino > sbi->s_ninodes) {
+		printk("Bad inode number on dev %s: %d is out of range\n",
+		       sb->s_id, ino);
+		return ERR_PTR(-EIO);
+	}
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	raw_inode = sysv_raw_inode(sb, ino, &bh);
+	if (!raw_inode) {
+		printk("Major problem: unable to read inode from dev %s\n",
+		       inode->i_sb->s_id);
+		goto bad_inode;
+	}
+	/* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */
+	inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode);
+	i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid));
+	i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
+	set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
+	inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
+	inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
+	inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
+	inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime);
+	inode->i_ctime.tv_nsec = 0;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_blocks = 0;
+
+	si = SYSV_I(inode);
+	for (block = 0; block < 10+1+1+1; block++)
+		read3byte(sbi, &raw_inode->i_data[3*block],
+				(u8 *)&si->i_data[block]);
+	brelse(bh);
+	si->i_dir_start_lookup = 0;
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		sysv_set_inode(inode,
+			       old_decode_dev(fs32_to_cpu(sbi, si->i_data[0])));
+	else
+		sysv_set_inode(inode, 0);
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	iget_failed(inode);
+	return ERR_PTR(-EIO);
+}
+
+static int __sysv_write_inode(struct inode *inode, int wait)
+{
+	struct super_block * sb = inode->i_sb;
+	struct sysv_sb_info * sbi = SYSV_SB(sb);
+	struct buffer_head * bh;
+	struct sysv_inode * raw_inode;
+	struct sysv_inode_info * si;
+	unsigned int ino, block;
+	int err = 0;
+
+	ino = inode->i_ino;
+	if (!ino || ino > sbi->s_ninodes) {
+		printk("Bad inode number on dev %s: %d is out of range\n",
+		       inode->i_sb->s_id, ino);
+		return -EIO;
+	}
+	raw_inode = sysv_raw_inode(sb, ino, &bh);
+	if (!raw_inode) {
+		printk("unable to read i-node block\n");
+		return -EIO;
+	}
+
+	raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
+	raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode)));
+	raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
+	raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
+	raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
+	raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec);
+	raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec);
+	raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec);
+
+	si = SYSV_I(inode);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev));
+	for (block = 0; block < 10+1+1+1; block++)
+		write3byte(sbi, (u8 *)&si->i_data[block],
+			&raw_inode->i_data[3*block]);
+	mark_buffer_dirty(bh);
+	if (wait) {
+                sync_dirty_buffer(bh);
+                if (buffer_req(bh) && !buffer_uptodate(bh)) {
+                        printk ("IO error syncing sysv inode [%s:%08x]\n",
+                                sb->s_id, ino);
+                        err = -EIO;
+                }
+        }
+	brelse(bh);
+	return 0;
+}
+
+int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
+int sysv_sync_inode(struct inode *inode)
+{
+	return __sysv_write_inode(inode, 1);
+}
+
+static void sysv_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		sysv_truncate(inode);
+	}
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+	if (!inode->i_nlink)
+		sysv_free_inode(inode);
+}
+
+static struct kmem_cache *sysv_inode_cachep;
+
+static struct inode *sysv_alloc_inode(struct super_block *sb)
+{
+	struct sysv_inode_info *si;
+
+	si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL);
+	if (!si)
+		return NULL;
+	return &si->vfs_inode;
+}
+
+static void sysv_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
+}
+
+static void sysv_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, sysv_i_callback);
+}
+
+static void init_once(void *p)
+{
+	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
+
+	inode_init_once(&si->vfs_inode);
+}
+
+const struct super_operations sysv_sops = {
+	.alloc_inode	= sysv_alloc_inode,
+	.destroy_inode	= sysv_destroy_inode,
+	.write_inode	= sysv_write_inode,
+	.evict_inode	= sysv_evict_inode,
+	.put_super	= sysv_put_super,
+	.sync_fs	= sysv_sync_fs,
+	.remount_fs	= sysv_remount,
+	.statfs		= sysv_statfs,
+};
+
+int __init sysv_init_icache(void)
+{
+	sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
+			sizeof(struct sysv_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			init_once);
+	if (!sysv_inode_cachep)
+		return -ENOMEM;
+	return 0;
+}
+
+void sysv_destroy_icache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(sysv_inode_cachep);
+}
diff --git a/kernel/fs/sysv/itree.c b/kernel/fs/sysv/itree.c
new file mode 100644
index 000000000..2fde40acf
--- /dev/null
+++ b/kernel/fs/sysv/itree.c
@@ -0,0 +1,501 @@
+/*
+ *  linux/fs/sysv/itree.c
+ *
+ *  Handling of indirect blocks' trees.
+ *  AV, Sep--Dec 2000
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mount.h>
+#include <linux/string.h>
+#include "sysv.h"
+
+enum {DIRECT = 10, DEPTH = 4};	/* Have triple indirect */
+
+static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode)
+{
+	mark_buffer_dirty_inode(bh, inode);
+	if (IS_SYNC(inode))
+		sync_dirty_buffer(bh);
+}
+
+static int block_to_path(struct inode *inode, long block, int offsets[DEPTH])
+{
+	struct super_block *sb = inode->i_sb;
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	int ptrs_bits = sbi->s_ind_per_block_bits;
+	unsigned long	indirect_blocks = sbi->s_ind_per_block,
+			double_blocks = sbi->s_ind_per_block_2;
+	int n = 0;
+
+	if (block < 0) {
+		printk("sysv_block_map: block < 0\n");
+	} else if (block < DIRECT) {
+		offsets[n++] = block;
+	} else if ( (block -= DIRECT) < indirect_blocks) {
+		offsets[n++] = DIRECT;
+		offsets[n++] = block;
+	} else if ((block -= indirect_blocks) < double_blocks) {
+		offsets[n++] = DIRECT+1;
+		offsets[n++] = block >> ptrs_bits;
+		offsets[n++] = block & (indirect_blocks - 1);
+	} else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) {
+		offsets[n++] = DIRECT+2;
+		offsets[n++] = block >> (ptrs_bits * 2);
+		offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1);
+		offsets[n++] = block & (indirect_blocks - 1);
+	} else {
+		/* nothing */;
+	}
+	return n;
+}
+
+static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr)
+{
+	return sbi->s_block_base + fs32_to_cpu(sbi, nr);
+}
+
+typedef struct {
+	sysv_zone_t     *p;
+	sysv_zone_t     key;
+	struct buffer_head *bh;
+} Indirect;
+
+static DEFINE_RWLOCK(pointers_lock);
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v)
+{
+	p->key = *(p->p = v);
+	p->bh = bh;
+}
+
+static inline int verify_chain(Indirect *from, Indirect *to)
+{
+	while (from <= to && from->key == *from->p)
+		from++;
+	return (from > to);
+}
+
+static inline sysv_zone_t *block_end(struct buffer_head *bh)
+{
+	return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
+}
+
+/*
+ * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock)
+ */
+static Indirect *get_branch(struct inode *inode,
+			    int depth,
+			    int offsets[],
+			    Indirect chain[],
+			    int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	Indirect *p = chain;
+	struct buffer_head *bh;
+
+	*err = 0;
+	add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets);
+	if (!p->key)
+		goto no_block;
+	while (--depth) {
+		int block = block_to_cpu(SYSV_SB(sb), p->key);
+		bh = sb_bread(sb, block);
+		if (!bh)
+			goto failure;
+		if (!verify_chain(chain, p))
+			goto changed;
+		add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
+		if (!p->key)
+			goto no_block;
+	}
+	return NULL;
+
+changed:
+	brelse(bh);
+	*err = -EAGAIN;
+	goto no_block;
+failure:
+	*err = -EIO;
+no_block:
+	return p;
+}
+
+static int alloc_branch(struct inode *inode,
+			int num,
+			int *offsets,
+			Indirect *branch)
+{
+	int blocksize = inode->i_sb->s_blocksize;
+	int n = 0;
+	int i;
+
+	branch[0].key = sysv_new_block(inode->i_sb);
+	if (branch[0].key) for (n = 1; n < num; n++) {
+		struct buffer_head *bh;
+		int parent;
+		/* Allocate the next block */
+		branch[n].key = sysv_new_block(inode->i_sb);
+		if (!branch[n].key)
+			break;
+		/*
+		 * Get buffer_head for parent block, zero it out and set 
+		 * the pointer to new one, then send parent to disk.
+		 */
+		parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key);
+		bh = sb_getblk(inode->i_sb, parent);
+		lock_buffer(bh);
+		memset(bh->b_data, 0, blocksize);
+		branch[n].bh = bh;
+		branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n];
+		*branch[n].p = branch[n].key;
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		dirty_indirect(bh, inode);
+	}
+	if (n == num)
+		return 0;
+
+	/* Allocation failed, free what we already allocated */
+	for (i = 1; i < n; i++)
+		bforget(branch[i].bh);
+	for (i = 0; i < n; i++)
+		sysv_free_block(inode->i_sb, branch[i].key);
+	return -ENOSPC;
+}
+
+static inline int splice_branch(struct inode *inode,
+				Indirect chain[],
+				Indirect *where,
+				int num)
+{
+	int i;
+
+	/* Verify that place we are splicing to is still there and vacant */
+	write_lock(&pointers_lock);
+	if (!verify_chain(chain, where-1) || *where->p)
+		goto changed;
+	*where->p = where->key;
+	write_unlock(&pointers_lock);
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+
+	/* had we spliced it onto indirect block? */
+	if (where->bh)
+		dirty_indirect(where->bh, inode);
+
+	if (IS_SYNC(inode))
+		sysv_sync_inode(inode);
+	else
+		mark_inode_dirty(inode);
+	return 0;
+
+changed:
+	write_unlock(&pointers_lock);
+	for (i = 1; i < num; i++)
+		bforget(where[i].bh);
+	for (i = 0; i < num; i++)
+		sysv_free_block(inode->i_sb, where[i].key);
+	return -EAGAIN;
+}
+
+static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int offsets[DEPTH];
+	Indirect chain[DEPTH];
+	struct super_block *sb = inode->i_sb;
+	Indirect *partial;
+	int left;
+	int depth = block_to_path(inode, iblock, offsets);
+
+	if (depth == 0)
+		goto out;
+
+reread:
+	read_lock(&pointers_lock);
+	partial = get_branch(inode, depth, offsets, chain, &err);
+	read_unlock(&pointers_lock);
+
+	/* Simplest case - block found, no allocation needed */
+	if (!partial) {
+got_it:
+		map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb),
+					chain[depth-1].key));
+		/* Clean up and exit */
+		partial = chain+depth-1; /* the whole chain */
+		goto cleanup;
+	}
+
+	/* Next simple case - plain lookup or failed read of indirect block */
+	if (!create || err == -EIO) {
+cleanup:
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+out:
+		return err;
+	}
+
+	/*
+	 * Indirect block might be removed by truncate while we were
+	 * reading it. Handling of that case (forget what we've got and
+	 * reread) is taken out of the main path.
+	 */
+	if (err == -EAGAIN)
+		goto changed;
+
+	left = (chain + depth) - partial;
+	err = alloc_branch(inode, left, offsets+(partial-chain), partial);
+	if (err)
+		goto cleanup;
+
+	if (splice_branch(inode, chain, partial, left) < 0)
+		goto changed;
+
+	set_buffer_new(bh_result);
+	goto got_it;
+
+changed:
+	while (partial > chain) {
+		brelse(partial->bh);
+		partial--;
+	}
+	goto reread;
+}
+
+static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q)
+{
+	while (p < q)
+		if (*p++)
+			return 0;
+	return 1;
+}
+
+static Indirect *find_shared(struct inode *inode,
+				int depth,
+				int offsets[],
+				Indirect chain[],
+				sysv_zone_t *top)
+{
+	Indirect *partial, *p;
+	int k, err;
+
+	*top = 0;
+	for (k = depth; k > 1 && !offsets[k-1]; k--)
+		;
+
+	write_lock(&pointers_lock);
+	partial = get_branch(inode, k, offsets, chain, &err);
+	if (!partial)
+		partial = chain + k-1;
+	/*
+	 * If the branch acquired continuation since we've looked at it -
+	 * fine, it should all survive and (new) top doesn't belong to us.
+	 */
+	if (!partial->key && *partial->p) {
+		write_unlock(&pointers_lock);
+		goto no_top;
+	}
+	for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--)
+		;
+	/*
+	 * OK, we've found the last block that must survive. The rest of our
+	 * branch should be detached before unlocking. However, if that rest
+	 * of branch is all ours and does not grow immediately from the inode
+	 * it's easier to cheat and just decrement partial->p.
+	 */
+	if (p == chain + k - 1 && p > chain) {
+		p->p--;
+	} else {
+		*top = *p->p;
+		*p->p = 0;
+	}
+	write_unlock(&pointers_lock);
+
+	while (partial > p) {
+		brelse(partial->bh);
+		partial--;
+	}
+no_top:
+	return partial;
+}
+
+static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q)
+{
+	for ( ; p < q ; p++) {
+		sysv_zone_t nr = *p;
+		if (nr) {
+			*p = 0;
+			sysv_free_block(inode->i_sb, nr);
+			mark_inode_dirty(inode);
+		}
+	}
+}
+
+static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth)
+{
+	struct buffer_head * bh;
+	struct super_block *sb = inode->i_sb;
+
+	if (depth--) {
+		for ( ; p < q ; p++) {
+			int block;
+			sysv_zone_t nr = *p;
+			if (!nr)
+				continue;
+			*p = 0;
+			block = block_to_cpu(SYSV_SB(sb), nr);
+			bh = sb_bread(sb, block);
+			if (!bh)
+				continue;
+			free_branches(inode, (sysv_zone_t*)bh->b_data,
+					block_end(bh), depth);
+			bforget(bh);
+			sysv_free_block(sb, nr);
+			mark_inode_dirty(inode);
+		}
+	} else
+		free_data(inode, p, q);
+}
+
+void sysv_truncate (struct inode * inode)
+{
+	sysv_zone_t *i_data = SYSV_I(inode)->i_data;
+	int offsets[DEPTH];
+	Indirect chain[DEPTH];
+	Indirect *partial;
+	sysv_zone_t nr = 0;
+	int n;
+	long iblock;
+	unsigned blocksize;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return;
+
+	blocksize = inode->i_sb->s_blocksize;
+	iblock = (inode->i_size + blocksize-1)
+					>> inode->i_sb->s_blocksize_bits;
+
+	block_truncate_page(inode->i_mapping, inode->i_size, get_block);
+
+	n = block_to_path(inode, iblock, offsets);
+	if (n == 0)
+		return;
+
+	if (n == 1) {
+		free_data(inode, i_data+offsets[0], i_data + DIRECT);
+		goto do_indirects;
+	}
+
+	partial = find_shared(inode, n, offsets, chain, &nr);
+	/* Kill the top of shared branch (already detached) */
+	if (nr) {
+		if (partial == chain)
+			mark_inode_dirty(inode);
+		else
+			dirty_indirect(partial->bh, inode);
+		free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
+	}
+	/* Clear the ends of indirect blocks on the shared branch */
+	while (partial > chain) {
+		free_branches(inode, partial->p + 1, block_end(partial->bh),
+				(chain+n-1) - partial);
+		dirty_indirect(partial->bh, inode);
+		brelse (partial->bh);
+		partial--;
+	}
+do_indirects:
+	/* Kill the remaining (whole) subtrees (== subtrees deeper than...) */
+	while (n < DEPTH) {
+		nr = i_data[DIRECT + n - 1];
+		if (nr) {
+			i_data[DIRECT + n - 1] = 0;
+			mark_inode_dirty(inode);
+			free_branches(inode, &nr, &nr+1, n);
+		}
+		n++;
+	}
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	if (IS_SYNC(inode))
+		sysv_sync_inode (inode);
+	else
+		mark_inode_dirty(inode);
+}
+
+static unsigned sysv_nblocks(struct super_block *s, loff_t size)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(s);
+	int ptrs_bits = sbi->s_ind_per_block_bits;
+	unsigned blocks, res, direct = DIRECT, i = DEPTH;
+	blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits;
+	res = blocks;
+	while (--i && blocks > direct) {
+		blocks = ((blocks - direct - 1) >> ptrs_bits) + 1;
+		res += blocks;
+		direct = 1;
+	}
+	return blocks;
+}
+
+int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct super_block *s = dentry->d_sb;
+	generic_fillattr(d_inode(dentry), stat);
+	stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
+	stat->blksize = s->s_blocksize;
+	return 0;
+}
+
+static int sysv_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page,get_block,wbc);
+}
+
+static int sysv_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page,get_block);
+}
+
+int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	return __block_write_begin(page, pos, len, get_block);
+}
+
+static void sysv_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		sysv_truncate(inode);
+	}
+}
+
+static int sysv_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
+	if (unlikely(ret))
+		sysv_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,get_block);
+}
+
+const struct address_space_operations sysv_aops = {
+	.readpage = sysv_readpage,
+	.writepage = sysv_writepage,
+	.write_begin = sysv_write_begin,
+	.write_end = generic_write_end,
+	.bmap = sysv_bmap
+};
diff --git a/kernel/fs/sysv/namei.c b/kernel/fs/sysv/namei.c
new file mode 100644
index 000000000..11e83ed0b
--- /dev/null
+++ b/kernel/fs/sysv/namei.c
@@ -0,0 +1,290 @@
+/*
+ *  linux/fs/sysv/namei.c
+ *
+ *  minix/namei.c
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  coh/namei.c
+ *  Copyright (C) 1993  Pascal Haible, Bruno Haible
+ *
+ *  sysv/namei.c
+ *  Copyright (C) 1993  Bruno Haible
+ *  Copyright (C) 1997, 1998  Krzysztof G. Baranowski
+ */
+
+#include <linux/pagemap.h>
+#include "sysv.h"
+
+static int add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = sysv_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
+}
+
+static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	/* Truncate the name in place, avoids having to define a compare
+	   function. */
+	if (qstr->len > SYSV_NAMELEN) {
+		qstr->len = SYSV_NAMELEN;
+		qstr->hash = full_name_hash(qstr->name, qstr->len);
+	}
+	return 0;
+}
+
+const struct dentry_operations sysv_dentry_operations = {
+	.d_hash		= sysv_hash,
+};
+
+static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
+{
+	struct inode * inode = NULL;
+	ino_t ino;
+
+	if (dentry->d_name.len > SYSV_NAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+	ino = sysv_inode_by_name(dentry);
+
+	if (ino) {
+		inode = sysv_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	d_add(dentry, inode);
+	return NULL;
+}
+
+static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev)
+{
+	struct inode * inode;
+	int err;
+
+	if (!old_valid_dev(rdev))
+		return -EINVAL;
+
+	inode = sysv_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+
+	if (!IS_ERR(inode)) {
+		sysv_set_inode(inode, rdev);
+		mark_inode_dirty(inode);
+		err = add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, bool excl)
+{
+	return sysv_mknod(dir, dentry, mode, 0);
+}
+
+static int sysv_symlink(struct inode * dir, struct dentry * dentry, 
+	const char * symname)
+{
+	int err = -ENAMETOOLONG;
+	int l = strlen(symname)+1;
+	struct inode * inode;
+
+	if (l > dir->i_sb->s_blocksize)
+		goto out;
+
+	inode = sysv_new_inode(dir, S_IFLNK|0777);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+	
+	sysv_set_inode(inode, 0);
+	err = page_symlink(inode, symname, l);
+	if (err)
+		goto out_fail;
+
+	mark_inode_dirty(inode);
+	err = add_nondir(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	iput(inode);
+	goto out;
+}
+
+static int sysv_link(struct dentry * old_dentry, struct inode * dir, 
+	struct dentry * dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	inode_inc_link_count(inode);
+	ihold(inode);
+
+	return add_nondir(dentry, inode);
+}
+
+static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode * inode;
+	int err;
+
+	inode_inc_link_count(dir);
+
+	inode = sysv_new_inode(dir, S_IFDIR|mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	sysv_set_inode(inode, 0);
+
+	inode_inc_link_count(inode);
+
+	err = sysv_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = sysv_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+        d_instantiate(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	inode_dec_link_count(inode);
+	iput(inode);
+out_dir:
+	inode_dec_link_count(dir);
+	goto out;
+}
+
+static int sysv_unlink(struct inode * dir, struct dentry * dentry)
+{
+	struct inode * inode = d_inode(dentry);
+	struct page * page;
+	struct sysv_dir_entry * de;
+	int err = -ENOENT;
+
+	de = sysv_find_entry(dentry, &page);
+	if (!de)
+		goto out;
+
+	err = sysv_delete_entry (de, page);
+	if (err)
+		goto out;
+
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+out:
+	return err;
+}
+
+static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	int err = -ENOTEMPTY;
+
+	if (sysv_empty_dir(inode)) {
+		err = sysv_unlink(dir, dentry);
+		if (!err) {
+			inode->i_size = 0;
+			inode_dec_link_count(inode);
+			inode_dec_link_count(dir);
+		}
+	}
+	return err;
+}
+
+/*
+ * Anybody can rename anything with this: the permission checks are left to the
+ * higher-level routines.
+ */
+static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
+		  struct inode * new_dir, struct dentry * new_dentry)
+{
+	struct inode * old_inode = d_inode(old_dentry);
+	struct inode * new_inode = d_inode(new_dentry);
+	struct page * dir_page = NULL;
+	struct sysv_dir_entry * dir_de = NULL;
+	struct page * old_page;
+	struct sysv_dir_entry * old_de;
+	int err = -ENOENT;
+
+	old_de = sysv_find_entry(old_dentry, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = sysv_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page * new_page;
+		struct sysv_dir_entry * new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !sysv_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = sysv_find_entry(new_dentry, &new_page);
+		if (!new_de)
+			goto out_dir;
+		sysv_set_link(new_de, new_page, old_inode);
+		new_inode->i_ctime = CURRENT_TIME_SEC;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+	} else {
+		err = sysv_add_link(new_dentry, old_inode);
+		if (err)
+			goto out_dir;
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	sysv_delete_entry(old_de, old_page);
+	mark_inode_dirty(old_inode);
+
+	if (dir_de) {
+		sysv_set_link(dir_de, dir_page, new_dir);
+		inode_dec_link_count(old_dir);
+	}
+	return 0;
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	return err;
+}
+
+/*
+ * directories can handle most operations...
+ */
+const struct inode_operations sysv_dir_inode_operations = {
+	.create		= sysv_create,
+	.lookup		= sysv_lookup,
+	.link		= sysv_link,
+	.unlink		= sysv_unlink,
+	.symlink	= sysv_symlink,
+	.mkdir		= sysv_mkdir,
+	.rmdir		= sysv_rmdir,
+	.mknod		= sysv_mknod,
+	.rename		= sysv_rename,
+	.getattr	= sysv_getattr,
+};
diff --git a/kernel/fs/sysv/super.c b/kernel/fs/sysv/super.c
new file mode 100644
index 000000000..eda109597
--- /dev/null
+++ b/kernel/fs/sysv/super.c
@@ -0,0 +1,593 @@
+/*
+ *  linux/fs/sysv/inode.c
+ *
+ *  minix/inode.c
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  xenix/inode.c
+ *  Copyright (C) 1992  Doug Evans
+ *
+ *  coh/inode.c
+ *  Copyright (C) 1993  Pascal Haible, Bruno Haible
+ *
+ *  sysv/inode.c
+ *  Copyright (C) 1993  Paul B. Monday
+ *
+ *  sysv/inode.c
+ *  Copyright (C) 1993  Bruno Haible
+ *  Copyright (C) 1997, 1998  Krzysztof G. Baranowski
+ *
+ *  This file contains code for read/parsing the superblock.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include "sysv.h"
+
+/*
+ * The following functions try to recognize specific filesystems.
+ *
+ * We recognize:
+ * - Xenix FS by its magic number.
+ * - SystemV FS by its magic number.
+ * - Coherent FS by its funny fname/fpack field.
+ * - SCO AFS by s_nfree == 0xffff
+ * - V7 FS has no distinguishing features.
+ *
+ * We discriminate among SystemV4 and SystemV2 FS by the assumption that
+ * the time stamp is not < 01-01-1980.
+ */
+
+enum {
+	JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60
+};
+
+static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links)
+{
+	struct buffer_head *bh1 = sbi->s_bh1;
+	struct buffer_head *bh2 = sbi->s_bh2;
+	struct xenix_super_block * sbd1;
+	struct xenix_super_block * sbd2;
+
+	if (bh1 != bh2)
+		sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data;
+	else {
+		/* block size = 512, so bh1 != bh2 */
+		sbd1 = (struct xenix_super_block *) bh1->b_data;
+		sbd2 = (struct xenix_super_block *) (bh2->b_data - 512);
+	}
+
+	*max_links = XENIX_LINK_MAX;
+	sbi->s_fic_size = XENIX_NICINOD;
+	sbi->s_flc_size = XENIX_NICFREE;
+	sbi->s_sbd1 = (char *)sbd1;
+	sbi->s_sbd2 = (char *)sbd2;
+	sbi->s_sb_fic_count = &sbd1->s_ninode;
+	sbi->s_sb_fic_inodes = &sbd1->s_inode[0];
+	sbi->s_sb_total_free_inodes = &sbd2->s_tinode;
+	sbi->s_bcache_count = &sbd1->s_nfree;
+	sbi->s_bcache = &sbd1->s_free[0];
+	sbi->s_free_blocks = &sbd2->s_tfree;
+	sbi->s_sb_time = &sbd2->s_time;
+	sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize);
+	sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize);
+}
+
+static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links)
+{
+	struct sysv4_super_block * sbd;
+	struct buffer_head *bh1 = sbi->s_bh1;
+	struct buffer_head *bh2 = sbi->s_bh2;
+
+	if (bh1 == bh2)
+		sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2);
+	else
+		sbd = (struct sysv4_super_block *) bh2->b_data;
+
+	*max_links = SYSV_LINK_MAX;
+	sbi->s_fic_size = SYSV_NICINOD;
+	sbi->s_flc_size = SYSV_NICFREE;
+	sbi->s_sbd1 = (char *)sbd;
+	sbi->s_sbd2 = (char *)sbd;
+	sbi->s_sb_fic_count = &sbd->s_ninode;
+	sbi->s_sb_fic_inodes = &sbd->s_inode[0];
+	sbi->s_sb_total_free_inodes = &sbd->s_tinode;
+	sbi->s_bcache_count = &sbd->s_nfree;
+	sbi->s_bcache = &sbd->s_free[0];
+	sbi->s_free_blocks = &sbd->s_tfree;
+	sbi->s_sb_time = &sbd->s_time;
+	sbi->s_sb_state = &sbd->s_state;
+	sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
+	sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
+}
+
+static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links)
+{
+	struct sysv2_super_block *sbd;
+	struct buffer_head *bh1 = sbi->s_bh1;
+	struct buffer_head *bh2 = sbi->s_bh2;
+
+	if (bh1 == bh2)
+		sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2);
+	else
+		sbd = (struct sysv2_super_block *) bh2->b_data;
+
+	*max_links = SYSV_LINK_MAX;
+	sbi->s_fic_size = SYSV_NICINOD;
+	sbi->s_flc_size = SYSV_NICFREE;
+	sbi->s_sbd1 = (char *)sbd;
+	sbi->s_sbd2 = (char *)sbd;
+	sbi->s_sb_fic_count = &sbd->s_ninode;
+	sbi->s_sb_fic_inodes = &sbd->s_inode[0];
+	sbi->s_sb_total_free_inodes = &sbd->s_tinode;
+	sbi->s_bcache_count = &sbd->s_nfree;
+	sbi->s_bcache = &sbd->s_free[0];
+	sbi->s_free_blocks = &sbd->s_tfree;
+	sbi->s_sb_time = &sbd->s_time;
+	sbi->s_sb_state = &sbd->s_state;
+	sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
+	sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
+}
+
+static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links)
+{
+	struct coh_super_block * sbd;
+	struct buffer_head *bh1 = sbi->s_bh1;
+
+	sbd = (struct coh_super_block *) bh1->b_data;
+
+	*max_links = COH_LINK_MAX;
+	sbi->s_fic_size = COH_NICINOD;
+	sbi->s_flc_size = COH_NICFREE;
+	sbi->s_sbd1 = (char *)sbd;
+	sbi->s_sbd2 = (char *)sbd;
+	sbi->s_sb_fic_count = &sbd->s_ninode;
+	sbi->s_sb_fic_inodes = &sbd->s_inode[0];
+	sbi->s_sb_total_free_inodes = &sbd->s_tinode;
+	sbi->s_bcache_count = &sbd->s_nfree;
+	sbi->s_bcache = &sbd->s_free[0];
+	sbi->s_free_blocks = &sbd->s_tfree;
+	sbi->s_sb_time = &sbd->s_time;
+	sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
+	sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
+}
+
+static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links)
+{
+	struct buffer_head *bh2 = sbi->s_bh2;
+	struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data;
+
+	*max_links = V7_LINK_MAX;
+	sbi->s_fic_size = V7_NICINOD;
+	sbi->s_flc_size = V7_NICFREE;
+	sbi->s_sbd1 = (char *)sbd;
+	sbi->s_sbd2 = (char *)sbd;
+	sbi->s_sb_fic_count = &sbd->s_ninode;
+	sbi->s_sb_fic_inodes = &sbd->s_inode[0];
+	sbi->s_sb_total_free_inodes = &sbd->s_tinode;
+	sbi->s_bcache_count = &sbd->s_nfree;
+	sbi->s_bcache = &sbd->s_free[0];
+	sbi->s_free_blocks = &sbd->s_tfree;
+	sbi->s_sb_time = &sbd->s_time;
+	sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
+	sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
+}
+
+static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh)
+{
+	struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data;
+	if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544))
+		sbi->s_bytesex = BYTESEX_LE;
+	else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544))
+		sbi->s_bytesex = BYTESEX_BE;
+	else
+		return 0;
+	switch (fs32_to_cpu(sbi, sbd->s_type)) {
+	case 1:
+		sbi->s_type = FSTYPE_XENIX;
+		return 1;
+	case 2:
+		sbi->s_type = FSTYPE_XENIX;
+		return 2;
+	default:
+		return 0;
+	}
+}
+
+static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh)
+{
+	struct super_block *sb = sbi->s_sb;
+	/* All relevant fields are at the same offsets in R2 and R4 */
+	struct sysv4_super_block * sbd;
+	u32 type;
+
+	sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2);
+	if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20))
+		sbi->s_bytesex = BYTESEX_LE;
+	else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20))
+		sbi->s_bytesex = BYTESEX_BE;
+	else
+		return 0;
+
+	type = fs32_to_cpu(sbi, sbd->s_type);
+ 
+ 	if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) {
+ 		sbi->s_type = FSTYPE_AFS;
+		sbi->s_forced_ro = 1;
+ 		if (!(sb->s_flags & MS_RDONLY)) {
+ 			printk("SysV FS: SCO EAFS on %s detected, " 
+ 				"forcing read-only mode.\n", 
+ 				sb->s_id);
+ 		}
+ 		return type;
+ 	}
+ 
+	if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) {
+		/* this is likely to happen on SystemV2 FS */
+		if (type > 3 || type < 1)
+			return 0;
+		sbi->s_type = FSTYPE_SYSV2;
+		return type;
+	}
+	if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10))
+		return 0;
+
+	/* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10,
+	   0x20 or 0x30 indicates that symbolic links and the 14-character
+	   filename limit is gone. Due to lack of information about this
+           feature read-only mode seems to be a reasonable approach... -KGB */
+
+	if (type >= 0x10) {
+		printk("SysV FS: can't handle long file names on %s, "
+		       "forcing read-only mode.\n", sb->s_id);
+		sbi->s_forced_ro = 1;
+	}
+
+	sbi->s_type = FSTYPE_SYSV4;
+	return type >= 0x10 ? type >> 4 : type;
+}
+
+static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh)
+{
+	struct coh_super_block * sbd;
+
+	sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2);
+	if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6))
+	    || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6)))
+		return 0;
+	sbi->s_bytesex = BYTESEX_PDP;
+	sbi->s_type = FSTYPE_COH;
+	return 1;
+}
+
+static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh)
+{
+	int size = detect_sysv(sbi, bh);
+
+	return size>2 ? 0 : size;
+}
+
+static struct {
+	int block;
+	int (*test)(struct sysv_sb_info *, struct buffer_head *);
+} flavours[] = {
+	{1, detect_xenix},
+	{0, detect_sysv},
+	{0, detect_coherent},
+	{9, detect_sysv_odd},
+	{15,detect_sysv_odd},
+	{18,detect_sysv},
+};
+
+static char *flavour_names[] = {
+	[FSTYPE_XENIX]	= "Xenix",
+	[FSTYPE_SYSV4]	= "SystemV",
+	[FSTYPE_SYSV2]	= "SystemV Release 2",
+	[FSTYPE_COH]	= "Coherent",
+	[FSTYPE_V7]	= "V7",
+	[FSTYPE_AFS]	= "AFS",
+};
+
+static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = {
+	[FSTYPE_XENIX]	= detected_xenix,
+	[FSTYPE_SYSV4]	= detected_sysv4,
+	[FSTYPE_SYSV2]	= detected_sysv2,
+	[FSTYPE_COH]	= detected_coherent,
+	[FSTYPE_V7]	= detected_v7,
+	[FSTYPE_AFS]	= detected_sysv4,
+};
+
+static int complete_read_super(struct super_block *sb, int silent, int size)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	struct inode *root_inode;
+	char *found = flavour_names[sbi->s_type];
+	u_char n_bits = size+8;
+	int bsize = 1 << n_bits;
+	int bsize_4 = bsize >> 2;
+
+	sbi->s_firstinodezone = 2;
+
+	flavour_setup[sbi->s_type](sbi, &sb->s_max_links);
+	
+	sbi->s_truncate = 1;
+	sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone;
+	sbi->s_inodes_per_block = bsize >> 6;
+	sbi->s_inodes_per_block_1 = (bsize >> 6)-1;
+	sbi->s_inodes_per_block_bits = n_bits-6;
+	sbi->s_ind_per_block = bsize_4;
+	sbi->s_ind_per_block_2 = bsize_4*bsize_4;
+	sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4));
+	sbi->s_ind_per_block_bits = n_bits-2;
+
+	sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone)
+		<< sbi->s_inodes_per_block_bits;
+
+	if (!silent)
+		printk("VFS: Found a %s FS (block size = %ld) on device %s\n",
+		       found, sb->s_blocksize, sb->s_id);
+
+	sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
+	/* set up enough so that it can read an inode */
+	sb->s_op = &sysv_sops;
+	if (sbi->s_forced_ro)
+		sb->s_flags |= MS_RDONLY;
+	if (sbi->s_truncate)
+		sb->s_d_op = &sysv_dentry_operations;
+	root_inode = sysv_iget(sb, SYSV_ROOT_INO);
+	if (IS_ERR(root_inode)) {
+		printk("SysV FS: get root inode failed\n");
+		return 0;
+	}
+	sb->s_root = d_make_root(root_inode);
+	if (!sb->s_root) {
+		printk("SysV FS: get root dentry failed\n");
+		return 0;
+	}
+	return 1;
+}
+
+static int sysv_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct buffer_head *bh1, *bh = NULL;
+	struct sysv_sb_info *sbi;
+	unsigned long blocknr;
+	int size = 0, i;
+	
+	BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
+	BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
+	BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
+	BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
+	BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
+
+	sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sbi->s_sb = sb;
+	sbi->s_block_base = 0;
+	mutex_init(&sbi->s_lock);
+	sb->s_fs_info = sbi;
+
+	sb_set_blocksize(sb, BLOCK_SIZE);
+
+	for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) {
+		brelse(bh);
+		bh = sb_bread(sb, flavours[i].block);
+		if (!bh)
+			continue;
+		size = flavours[i].test(SYSV_SB(sb), bh);
+	}
+
+	if (!size)
+		goto Eunknown;
+
+	switch (size) {
+		case 1:
+			blocknr = bh->b_blocknr << 1;
+			brelse(bh);
+			sb_set_blocksize(sb, 512);
+			bh1 = sb_bread(sb, blocknr);
+			bh = sb_bread(sb, blocknr + 1);
+			break;
+		case 2:
+			bh1 = bh;
+			break;
+		case 3:
+			blocknr = bh->b_blocknr >> 1;
+			brelse(bh);
+			sb_set_blocksize(sb, 2048);
+			bh1 = bh = sb_bread(sb, blocknr);
+			break;
+		default:
+			goto Ebadsize;
+	}
+
+	if (bh && bh1) {
+		sbi->s_bh1 = bh1;
+		sbi->s_bh2 = bh;
+		if (complete_read_super(sb, silent, size))
+			return 0;
+	}
+
+	brelse(bh1);
+	brelse(bh);
+	sb_set_blocksize(sb, BLOCK_SIZE);
+	printk("oldfs: cannot read superblock\n");
+failed:
+	kfree(sbi);
+	return -EINVAL;
+
+Eunknown:
+	brelse(bh);
+	if (!silent)
+		printk("VFS: unable to find oldfs superblock on device %s\n",
+			sb->s_id);
+	goto failed;
+Ebadsize:
+	brelse(bh);
+	if (!silent)
+		printk("VFS: oldfs: unsupported block size (%dKb)\n",
+			1<<(size-2));
+	goto failed;
+}
+
+static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
+{
+	struct v7_super_block *v7sb;
+	struct sysv_inode *v7i;
+	struct buffer_head *bh2;
+	struct sysv_sb_info *sbi;
+
+	sbi = sb->s_fs_info;
+
+	/* plausibility check on superblock */
+	v7sb = (struct v7_super_block *) bh->b_data;
+	if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
+	    fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
+	    fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
+		return 0;
+
+	/* plausibility check on root inode: it is a directory,
+	   with a nonzero size that is a multiple of 16 */
+	bh2 = sb_bread(sb, 2);
+	if (bh2 == NULL)
+		return 0;
+
+	v7i = (struct sysv_inode *)(bh2->b_data + 64);
+	if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
+	    (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
+	    (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
+	    (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
+	     sizeof(struct sysv_dir_entry))) {
+		brelse(bh2);
+		return 0;
+	}
+
+	brelse(bh2);
+	return 1;
+}
+
+static int v7_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct sysv_sb_info *sbi;
+	struct buffer_head *bh;
+
+	if (440 != sizeof (struct v7_super_block))
+		panic("V7 FS: bad super-block size");
+	if (64 != sizeof (struct sysv_inode))
+		panic("sysv fs: bad i-node size");
+
+	sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sbi->s_sb = sb;
+	sbi->s_block_base = 0;
+	sbi->s_type = FSTYPE_V7;
+	mutex_init(&sbi->s_lock);
+	sb->s_fs_info = sbi;
+	
+	sb_set_blocksize(sb, 512);
+
+	if ((bh = sb_bread(sb, 1)) == NULL) {
+		if (!silent)
+			printk("VFS: unable to read V7 FS superblock on "
+			       "device %s.\n", sb->s_id);
+		goto failed;
+	}
+
+	/* Try PDP-11 UNIX */
+	sbi->s_bytesex = BYTESEX_PDP;
+	if (v7_sanity_check(sb, bh))
+		goto detected;
+
+	/* Try PC/IX, v7/x86 */
+	sbi->s_bytesex = BYTESEX_LE;
+	if (v7_sanity_check(sb, bh))
+		goto detected;
+
+	goto failed;
+
+detected:
+	sbi->s_bh1 = bh;
+	sbi->s_bh2 = bh;
+	if (complete_read_super(sb, silent, 1))
+		return 0;
+
+failed:
+	printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
+		sb->s_id);
+	brelse(bh);
+	kfree(sbi);
+	return -EINVAL;
+}
+
+/* Every kernel module contains stuff like this. */
+
+static struct dentry *sysv_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
+}
+
+static struct dentry *v7_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
+}
+
+static struct file_system_type sysv_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "sysv",
+	.mount		= sysv_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("sysv");
+
+static struct file_system_type v7_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "v7",
+	.mount		= v7_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("v7");
+MODULE_ALIAS("v7");
+
+static int __init init_sysv_fs(void)
+{
+	int error;
+
+	error = sysv_init_icache();
+	if (error)
+		goto out;
+	error = register_filesystem(&sysv_fs_type);
+	if (error)
+		goto destroy_icache;
+	error = register_filesystem(&v7_fs_type);
+	if (error)
+		goto unregister;
+	return 0;
+
+unregister:
+	unregister_filesystem(&sysv_fs_type);
+destroy_icache:
+	sysv_destroy_icache();
+out:
+	return error;
+}
+
+static void __exit exit_sysv_fs(void)
+{
+	unregister_filesystem(&sysv_fs_type);
+	unregister_filesystem(&v7_fs_type);
+	sysv_destroy_icache();
+}
+
+module_init(init_sysv_fs)
+module_exit(exit_sysv_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/sysv/symlink.c b/kernel/fs/sysv/symlink.c
new file mode 100644
index 000000000..d3fa0d703
--- /dev/null
+++ b/kernel/fs/sysv/symlink.c
@@ -0,0 +1,20 @@
+/*
+ *  linux/fs/sysv/symlink.c
+ *
+ *  Handling of System V filesystem fast symlinks extensions.
+ *  Aug 2001, Christoph Hellwig (hch@infradead.org)
+ */
+
+#include "sysv.h"
+#include <linux/namei.h>
+
+static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	nd_set_link(nd, (char *)SYSV_I(d_inode(dentry))->i_data);
+	return NULL;
+}
+
+const struct inode_operations sysv_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= sysv_follow_link,
+};
diff --git a/kernel/fs/sysv/sysv.h b/kernel/fs/sysv/sysv.h
new file mode 100644
index 000000000..69d488986
--- /dev/null
+++ b/kernel/fs/sysv/sysv.h
@@ -0,0 +1,247 @@
+#ifndef _SYSV_H
+#define _SYSV_H
+
+#include <linux/buffer_head.h>
+
+typedef __u16 __bitwise __fs16;
+typedef __u32 __bitwise __fs32;
+
+#include <linux/sysv_fs.h>
+
+/*
+ * SystemV/V7/Coherent super-block data in memory
+ *
+ * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified
+ * while the system is running). This is in contrast to the Minix and Berkeley
+ * filesystems (where the superblock is never modified). This affects the
+ * sync() operation: we must keep the superblock in a disk buffer and use this
+ * one as our "working copy".
+ */
+
+struct sysv_sb_info {
+	struct super_block *s_sb;	/* VFS superblock */
+	int	       s_type;		/* file system type: FSTYPE_{XENIX|SYSV|COH} */
+	char	       s_bytesex;	/* bytesex (le/be/pdp) */
+	char	       s_truncate;	/* if 1: names > SYSV_NAMELEN chars are truncated */
+					/* if 0: they are disallowed (ENAMETOOLONG) */
+	unsigned int   s_inodes_per_block;	/* number of inodes per block */
+	unsigned int   s_inodes_per_block_1;	/* inodes_per_block - 1 */
+	unsigned int   s_inodes_per_block_bits;	/* log2(inodes_per_block) */
+	unsigned int   s_ind_per_block;		/* number of indirections per block */
+	unsigned int   s_ind_per_block_bits;	/* log2(ind_per_block) */
+	unsigned int   s_ind_per_block_2;	/* ind_per_block ^ 2 */
+	unsigned int   s_toobig_block;		/* 10 + ipb + ipb^2 + ipb^3 */
+	unsigned int   s_block_base;	/* physical block number of block 0 */
+	unsigned short s_fic_size;	/* free inode cache size, NICINOD */
+	unsigned short s_flc_size;	/* free block list chunk size, NICFREE */
+	/* The superblock is kept in one or two disk buffers: */
+	struct buffer_head *s_bh1;
+	struct buffer_head *s_bh2;
+	/* These are pointers into the disk buffer, to compensate for
+	   different superblock layout. */
+	char *         s_sbd1;		/* entire superblock data, for part 1 */
+	char *         s_sbd2;		/* entire superblock data, for part 2 */
+	__fs16         *s_sb_fic_count;	/* pointer to s_sbd->s_ninode */
+        sysv_ino_t     *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */
+	__fs16         *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */
+	__fs16         *s_bcache_count;	/* pointer to s_sbd->s_nfree */
+	sysv_zone_t    *s_bcache;	/* pointer to s_sbd->s_free */
+	__fs32         *s_free_blocks;	/* pointer to s_sbd->s_tfree */
+	__fs32         *s_sb_time;	/* pointer to s_sbd->s_time */
+	__fs32         *s_sb_state;	/* pointer to s_sbd->s_state, only FSTYPE_SYSV */
+	/* We keep those superblock entities that don't change here;
+	   this saves us an indirection and perhaps a conversion. */
+	u32            s_firstinodezone; /* index of first inode zone */
+	u32            s_firstdatazone;	/* same as s_sbd->s_isize */
+	u32            s_ninodes;	/* total number of inodes */
+	u32            s_ndatazones;	/* total number of data zones */
+	u32            s_nzones;	/* same as s_sbd->s_fsize */
+	u16	       s_namelen;       /* max length of dir entry */
+	int	       s_forced_ro;
+	struct mutex s_lock;
+};
+
+/*
+ * SystemV/V7/Coherent FS inode data in memory
+ */
+struct sysv_inode_info {
+	__fs32		i_data[13];
+	u32		i_dir_start_lookup;
+	struct inode	vfs_inode;
+};
+
+
+static inline struct sysv_inode_info *SYSV_I(struct inode *inode)
+{
+	return list_entry(inode, struct sysv_inode_info, vfs_inode);
+}
+
+static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+
+/* identify the FS in memory */
+enum {
+	FSTYPE_NONE = 0,
+	FSTYPE_XENIX,
+	FSTYPE_SYSV4,
+	FSTYPE_SYSV2,
+	FSTYPE_COH,
+	FSTYPE_V7,
+	FSTYPE_AFS,
+	FSTYPE_END,
+};
+
+#define SYSV_MAGIC_BASE		0x012FF7B3
+
+#define XENIX_SUPER_MAGIC	(SYSV_MAGIC_BASE+FSTYPE_XENIX)
+#define SYSV4_SUPER_MAGIC	(SYSV_MAGIC_BASE+FSTYPE_SYSV4)
+#define SYSV2_SUPER_MAGIC	(SYSV_MAGIC_BASE+FSTYPE_SYSV2)
+#define COH_SUPER_MAGIC		(SYSV_MAGIC_BASE+FSTYPE_COH)
+
+
+/* Admissible values for i_nlink: 0.._LINK_MAX */
+enum {
+	XENIX_LINK_MAX	=	126,	/* ?? */
+	SYSV_LINK_MAX	=	126,	/* 127? 251? */
+	V7_LINK_MAX     =	126,	/* ?? */
+	COH_LINK_MAX	=	10000,
+};
+
+
+static inline void dirty_sb(struct super_block *sb)
+{
+	struct sysv_sb_info *sbi = SYSV_SB(sb);
+
+	mark_buffer_dirty(sbi->s_bh1);
+	if (sbi->s_bh1 != sbi->s_bh2)
+		mark_buffer_dirty(sbi->s_bh2);
+}
+
+
+/* ialloc.c */
+extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned,
+			struct buffer_head **);
+extern struct inode * sysv_new_inode(const struct inode *, umode_t);
+extern void sysv_free_inode(struct inode *);
+extern unsigned long sysv_count_free_inodes(struct super_block *);
+
+/* balloc.c */
+extern sysv_zone_t sysv_new_block(struct super_block *);
+extern void sysv_free_block(struct super_block *, sysv_zone_t);
+extern unsigned long sysv_count_free_blocks(struct super_block *);
+
+/* itree.c */
+extern void sysv_truncate(struct inode *);
+extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len);
+
+/* inode.c */
+extern struct inode *sysv_iget(struct super_block *, unsigned int);
+extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
+extern int sysv_sync_inode(struct inode *);
+extern void sysv_set_inode(struct inode *, dev_t);
+extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int sysv_init_icache(void);
+extern void sysv_destroy_icache(void);
+
+
+/* dir.c */
+extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **);
+extern int sysv_add_link(struct dentry *, struct inode *);
+extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *);
+extern int sysv_make_empty(struct inode *, struct inode *);
+extern int sysv_empty_dir(struct inode *);
+extern void sysv_set_link(struct sysv_dir_entry *, struct page *,
+			struct inode *);
+extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **);
+extern ino_t sysv_inode_by_name(struct dentry *);
+
+
+extern const struct inode_operations sysv_file_inode_operations;
+extern const struct inode_operations sysv_dir_inode_operations;
+extern const struct inode_operations sysv_fast_symlink_inode_operations;
+extern const struct file_operations sysv_file_operations;
+extern const struct file_operations sysv_dir_operations;
+extern const struct address_space_operations sysv_aops;
+extern const struct super_operations sysv_sops;
+extern const struct dentry_operations sysv_dentry_operations;
+
+
+enum {
+	BYTESEX_LE,
+	BYTESEX_PDP,
+	BYTESEX_BE,
+};
+
+static inline u32 PDP_swab(u32 x)
+{
+#ifdef __LITTLE_ENDIAN
+	return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16);
+#else
+#ifdef __BIG_ENDIAN
+	return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8);
+#else
+#error BYTESEX
+#endif
+#endif
+}
+
+static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n)
+{
+	if (sbi->s_bytesex == BYTESEX_PDP)
+		return PDP_swab((__force __u32)n);
+	else if (sbi->s_bytesex == BYTESEX_LE)
+		return le32_to_cpu((__force __le32)n);
+	else
+		return be32_to_cpu((__force __be32)n);
+}
+
+static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n)
+{
+	if (sbi->s_bytesex == BYTESEX_PDP)
+		return (__force __fs32)PDP_swab(n);
+	else if (sbi->s_bytesex == BYTESEX_LE)
+		return (__force __fs32)cpu_to_le32(n);
+	else
+		return (__force __fs32)cpu_to_be32(n);
+}
+
+static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d)
+{
+	if (sbi->s_bytesex == BYTESEX_PDP)
+		*(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d);
+	else if (sbi->s_bytesex == BYTESEX_LE)
+		le32_add_cpu((__le32 *)n, d);
+	else
+		be32_add_cpu((__be32 *)n, d);
+	return *n;
+}
+
+static inline __u16 fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n)
+{
+	if (sbi->s_bytesex != BYTESEX_BE)
+		return le16_to_cpu((__force __le16)n);
+	else
+		return be16_to_cpu((__force __be16)n);
+}
+
+static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n)
+{
+	if (sbi->s_bytesex != BYTESEX_BE)
+		return (__force __fs16)cpu_to_le16(n);
+	else
+		return (__force __fs16)cpu_to_be16(n);
+}
+
+static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d)
+{
+	if (sbi->s_bytesex != BYTESEX_BE)
+		le16_add_cpu((__le16 *)n, d);
+	else
+		be16_add_cpu((__be16 *)n, d);
+	return *n;
+}
+
+#endif /* _SYSV_H */
diff --git a/kernel/fs/timerfd.c b/kernel/fs/timerfd.c
new file mode 100644
index 000000000..64fb86066
--- /dev/null
+++ b/kernel/fs/timerfd.c
@@ -0,0 +1,571 @@
+/*
+ *  fs/timerfd.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ *
+ *  Thanks to Thomas Gleixner for code reviews and useful comments.
+ *
+ */
+
+#include <linux/alarmtimer.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/time.h>
+#include <linux/hrtimer.h>
+#include <linux/anon_inodes.h>
+#include <linux/timerfd.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/rcupdate.h>
+
+struct timerfd_ctx {
+	union {
+		struct hrtimer tmr;
+		struct alarm alarm;
+	} t;
+	ktime_t tintv;
+	ktime_t moffs;
+	wait_queue_head_t wqh;
+	u64 ticks;
+	int clockid;
+	short unsigned expired;
+	short unsigned settime_flags;	/* to show in fdinfo */
+	struct rcu_head rcu;
+	struct list_head clist;
+	bool might_cancel;
+};
+
+static LIST_HEAD(cancel_list);
+static DEFINE_SPINLOCK(cancel_lock);
+
+static inline bool isalarm(struct timerfd_ctx *ctx)
+{
+	return ctx->clockid == CLOCK_REALTIME_ALARM ||
+		ctx->clockid == CLOCK_BOOTTIME_ALARM;
+}
+
+/*
+ * This gets called when the timer event triggers. We set the "expired"
+ * flag, but we do not re-arm the timer (in case it's necessary,
+ * tintv.tv64 != 0) until the timer is accessed.
+ */
+static void timerfd_triggered(struct timerfd_ctx *ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	ctx->expired = 1;
+	ctx->ticks++;
+	wake_up_locked(&ctx->wqh);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+}
+
+static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+{
+	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
+					       t.tmr);
+	timerfd_triggered(ctx);
+	return HRTIMER_NORESTART;
+}
+
+static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
+	ktime_t now)
+{
+	struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
+					       t.alarm);
+	timerfd_triggered(ctx);
+	return ALARMTIMER_NORESTART;
+}
+
+/*
+ * Called when the clock was set to cancel the timers in the cancel
+ * list. This will wake up processes waiting on these timers. The
+ * wake-up requires ctx->ticks to be non zero, therefore we increment
+ * it before calling wake_up_locked().
+ */
+void timerfd_clock_was_set(void)
+{
+	ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
+	struct timerfd_ctx *ctx;
+	unsigned long flags;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &cancel_list, clist) {
+		if (!ctx->might_cancel)
+			continue;
+		spin_lock_irqsave(&ctx->wqh.lock, flags);
+		if (ctx->moffs.tv64 != moffs.tv64) {
+			ctx->moffs.tv64 = KTIME_MAX;
+			ctx->ticks++;
+			wake_up_locked(&ctx->wqh);
+		}
+		spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+	}
+	rcu_read_unlock();
+}
+
+static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+{
+	if (ctx->might_cancel) {
+		ctx->might_cancel = false;
+		spin_lock(&cancel_lock);
+		list_del_rcu(&ctx->clist);
+		spin_unlock(&cancel_lock);
+	}
+}
+
+static bool timerfd_canceled(struct timerfd_ctx *ctx)
+{
+	if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
+		return false;
+	ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
+	return true;
+}
+
+static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
+{
+	if ((ctx->clockid == CLOCK_REALTIME ||
+	     ctx->clockid == CLOCK_REALTIME_ALARM) &&
+	    (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
+		if (!ctx->might_cancel) {
+			ctx->might_cancel = true;
+			spin_lock(&cancel_lock);
+			list_add_rcu(&ctx->clist, &cancel_list);
+			spin_unlock(&cancel_lock);
+		}
+	} else if (ctx->might_cancel) {
+		timerfd_remove_cancel(ctx);
+	}
+}
+
+static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
+{
+	ktime_t remaining;
+
+	if (isalarm(ctx))
+		remaining = alarm_expires_remaining(&ctx->t.alarm);
+	else
+		remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+
+	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
+}
+
+static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
+			 const struct itimerspec *ktmr)
+{
+	enum hrtimer_mode htmode;
+	ktime_t texp;
+	int clockid = ctx->clockid;
+
+	htmode = (flags & TFD_TIMER_ABSTIME) ?
+		HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
+
+	texp = timespec_to_ktime(ktmr->it_value);
+	ctx->expired = 0;
+	ctx->ticks = 0;
+	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
+
+	if (isalarm(ctx)) {
+		alarm_init(&ctx->t.alarm,
+			   ctx->clockid == CLOCK_REALTIME_ALARM ?
+			   ALARM_REALTIME : ALARM_BOOTTIME,
+			   timerfd_alarmproc);
+	} else {
+		hrtimer_init(&ctx->t.tmr, clockid, htmode);
+		hrtimer_set_expires(&ctx->t.tmr, texp);
+		ctx->t.tmr.function = timerfd_tmrproc;
+	}
+
+	if (texp.tv64 != 0) {
+		if (isalarm(ctx)) {
+			if (flags & TFD_TIMER_ABSTIME)
+				alarm_start(&ctx->t.alarm, texp);
+			else
+				alarm_start_relative(&ctx->t.alarm, texp);
+		} else {
+			hrtimer_start(&ctx->t.tmr, texp, htmode);
+		}
+
+		if (timerfd_canceled(ctx))
+			return -ECANCELED;
+	}
+
+	ctx->settime_flags = flags & TFD_SETTIME_FLAGS;
+	return 0;
+}
+
+static int timerfd_release(struct inode *inode, struct file *file)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+
+	timerfd_remove_cancel(ctx);
+
+	if (isalarm(ctx))
+		alarm_cancel(&ctx->t.alarm);
+	else
+		hrtimer_cancel(&ctx->t.tmr);
+	kfree_rcu(ctx, rcu);
+	return 0;
+}
+
+static unsigned int timerfd_poll(struct file *file, poll_table *wait)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+	unsigned int events = 0;
+	unsigned long flags;
+
+	poll_wait(file, &ctx->wqh, wait);
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	if (ctx->ticks)
+		events |= POLLIN;
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return events;
+}
+
+static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	u64 ticks = 0;
+
+	if (count < sizeof(ticks))
+		return -EINVAL;
+	spin_lock_irq(&ctx->wqh.lock);
+	if (file->f_flags & O_NONBLOCK)
+		res = -EAGAIN;
+	else
+		res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
+
+	/*
+	 * If clock has changed, we do not care about the
+	 * ticks and we do not rearm the timer. Userspace must
+	 * reevaluate anyway.
+	 */
+	if (timerfd_canceled(ctx)) {
+		ctx->ticks = 0;
+		ctx->expired = 0;
+		res = -ECANCELED;
+	}
+
+	if (ctx->ticks) {
+		ticks = ctx->ticks;
+
+		if (ctx->expired && ctx->tintv.tv64) {
+			/*
+			 * If tintv.tv64 != 0, this is a periodic timer that
+			 * needs to be re-armed. We avoid doing it in the timer
+			 * callback to avoid DoS attacks specifying a very
+			 * short timer period.
+			 */
+			if (isalarm(ctx)) {
+				ticks += alarm_forward_now(
+					&ctx->t.alarm, ctx->tintv) - 1;
+				alarm_restart(&ctx->t.alarm);
+			} else {
+				ticks += hrtimer_forward_now(&ctx->t.tmr,
+							     ctx->tintv) - 1;
+				hrtimer_restart(&ctx->t.tmr);
+			}
+		}
+		ctx->expired = 0;
+		ctx->ticks = 0;
+	}
+	spin_unlock_irq(&ctx->wqh.lock);
+	if (ticks)
+		res = put_user(ticks, (u64 __user *) buf) ? -EFAULT: sizeof(ticks);
+	return res;
+}
+
+#ifdef CONFIG_PROC_FS
+static void timerfd_show(struct seq_file *m, struct file *file)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+	struct itimerspec t;
+
+	spin_lock_irq(&ctx->wqh.lock);
+	t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+	t.it_interval = ktime_to_timespec(ctx->tintv);
+	spin_unlock_irq(&ctx->wqh.lock);
+
+	seq_printf(m,
+		   "clockid: %d\n"
+		   "ticks: %llu\n"
+		   "settime flags: 0%o\n"
+		   "it_value: (%llu, %llu)\n"
+		   "it_interval: (%llu, %llu)\n",
+		   ctx->clockid,
+		   (unsigned long long)ctx->ticks,
+		   ctx->settime_flags,
+		   (unsigned long long)t.it_value.tv_sec,
+		   (unsigned long long)t.it_value.tv_nsec,
+		   (unsigned long long)t.it_interval.tv_sec,
+		   (unsigned long long)t.it_interval.tv_nsec);
+}
+#else
+#define timerfd_show NULL
+#endif
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+	int ret = 0;
+
+	switch (cmd) {
+	case TFD_IOC_SET_TICKS: {
+		u64 ticks;
+
+		if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks)))
+			return -EFAULT;
+		if (!ticks)
+			return -EINVAL;
+
+		spin_lock_irq(&ctx->wqh.lock);
+		if (!timerfd_canceled(ctx)) {
+			ctx->ticks = ticks;
+			wake_up_locked(&ctx->wqh);
+		} else
+			ret = -ECANCELED;
+		spin_unlock_irq(&ctx->wqh.lock);
+		break;
+	}
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	return ret;
+}
+#else
+#define timerfd_ioctl NULL
+#endif
+
+static const struct file_operations timerfd_fops = {
+	.release	= timerfd_release,
+	.poll		= timerfd_poll,
+	.read		= timerfd_read,
+	.llseek		= noop_llseek,
+	.show_fdinfo	= timerfd_show,
+	.unlocked_ioctl	= timerfd_ioctl,
+};
+
+static int timerfd_fget(int fd, struct fd *p)
+{
+	struct fd f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	if (f.file->f_op != &timerfd_fops) {
+		fdput(f);
+		return -EINVAL;
+	}
+	*p = f;
+	return 0;
+}
+
+SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
+{
+	int ufd;
+	struct timerfd_ctx *ctx;
+
+	/* Check the TFD_* constants for consistency.  */
+	BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
+
+	if ((flags & ~TFD_CREATE_FLAGS) ||
+	    (clockid != CLOCK_MONOTONIC &&
+	     clockid != CLOCK_REALTIME &&
+	     clockid != CLOCK_REALTIME_ALARM &&
+	     clockid != CLOCK_BOOTTIME &&
+	     clockid != CLOCK_BOOTTIME_ALARM))
+		return -EINVAL;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	init_waitqueue_head(&ctx->wqh);
+	ctx->clockid = clockid;
+
+	if (isalarm(ctx))
+		alarm_init(&ctx->t.alarm,
+			   ctx->clockid == CLOCK_REALTIME_ALARM ?
+			   ALARM_REALTIME : ALARM_BOOTTIME,
+			   timerfd_alarmproc);
+	else
+		hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
+
+	ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
+
+	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
+			       O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
+	if (ufd < 0)
+		kfree(ctx);
+
+	return ufd;
+}
+
+static int do_timerfd_settime(int ufd, int flags, 
+		const struct itimerspec *new,
+		struct itimerspec *old)
+{
+	struct fd f;
+	struct timerfd_ctx *ctx;
+	int ret;
+
+	if ((flags & ~TFD_SETTIME_FLAGS) ||
+	    !timespec_valid(&new->it_value) ||
+	    !timespec_valid(&new->it_interval))
+		return -EINVAL;
+
+	ret = timerfd_fget(ufd, &f);
+	if (ret)
+		return ret;
+	ctx = f.file->private_data;
+
+	timerfd_setup_cancel(ctx, flags);
+
+	/*
+	 * We need to stop the existing timer before reprogramming
+	 * it to the new values.
+	 */
+	for (;;) {
+		spin_lock_irq(&ctx->wqh.lock);
+
+		if (isalarm(ctx)) {
+			if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
+				break;
+		} else {
+			if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
+				break;
+		}
+		spin_unlock_irq(&ctx->wqh.lock);
+		if (isalarm(ctx))
+			hrtimer_wait_for_timer(&ctx->t.alarm.timer);
+		else
+			hrtimer_wait_for_timer(&ctx->t.tmr);
+	}
+
+	/*
+	 * If the timer is expired and it's periodic, we need to advance it
+	 * because the caller may want to know the previous expiration time.
+	 * We do not update "ticks" and "expired" since the timer will be
+	 * re-programmed again in the following timerfd_setup() call.
+	 */
+	if (ctx->expired && ctx->tintv.tv64) {
+		if (isalarm(ctx))
+			alarm_forward_now(&ctx->t.alarm, ctx->tintv);
+		else
+			hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
+	}
+
+	old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+	old->it_interval = ktime_to_timespec(ctx->tintv);
+
+	/*
+	 * Re-program the timer to the new value ...
+	 */
+	ret = timerfd_setup(ctx, flags, new);
+
+	spin_unlock_irq(&ctx->wqh.lock);
+	fdput(f);
+	return ret;
+}
+
+static int do_timerfd_gettime(int ufd, struct itimerspec *t)
+{
+	struct fd f;
+	struct timerfd_ctx *ctx;
+	int ret = timerfd_fget(ufd, &f);
+	if (ret)
+		return ret;
+	ctx = f.file->private_data;
+
+	spin_lock_irq(&ctx->wqh.lock);
+	if (ctx->expired && ctx->tintv.tv64) {
+		ctx->expired = 0;
+
+		if (isalarm(ctx)) {
+			ctx->ticks +=
+				alarm_forward_now(
+					&ctx->t.alarm, ctx->tintv) - 1;
+			alarm_restart(&ctx->t.alarm);
+		} else {
+			ctx->ticks +=
+				hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
+				- 1;
+			hrtimer_restart(&ctx->t.tmr);
+		}
+	}
+	t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+	t->it_interval = ktime_to_timespec(ctx->tintv);
+	spin_unlock_irq(&ctx->wqh.lock);
+	fdput(f);
+	return 0;
+}
+
+SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+		const struct itimerspec __user *, utmr,
+		struct itimerspec __user *, otmr)
+{
+	struct itimerspec new, old;
+	int ret;
+
+	if (copy_from_user(&new, utmr, sizeof(new)))
+		return -EFAULT;
+	ret = do_timerfd_settime(ufd, flags, &new, &old);
+	if (ret)
+		return ret;
+	if (otmr && copy_to_user(otmr, &old, sizeof(old)))
+		return -EFAULT;
+
+	return ret;
+}
+
+SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
+{
+	struct itimerspec kotmr;
+	int ret = do_timerfd_gettime(ufd, &kotmr);
+	if (ret)
+		return ret;
+	return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+		const struct compat_itimerspec __user *, utmr,
+		struct compat_itimerspec __user *, otmr)
+{
+	struct itimerspec new, old;
+	int ret;
+
+	if (get_compat_itimerspec(&new, utmr))
+		return -EFAULT;
+	ret = do_timerfd_settime(ufd, flags, &new, &old);
+	if (ret)
+		return ret;
+	if (otmr && put_compat_itimerspec(otmr, &old))
+		return -EFAULT;
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
+		struct compat_itimerspec __user *, otmr)
+{
+	struct itimerspec kotmr;
+	int ret = do_timerfd_gettime(ufd, &kotmr);
+	if (ret)
+		return ret;
+	return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0;
+}
+#endif
diff --git a/kernel/fs/tracefs/Makefile b/kernel/fs/tracefs/Makefile
new file mode 100644
index 000000000..82fa35b65
--- /dev/null
+++ b/kernel/fs/tracefs/Makefile
@@ -0,0 +1,4 @@
+tracefs-objs	:= inode.o
+
+obj-$(CONFIG_TRACING)	+= tracefs.o
+
diff --git a/kernel/fs/tracefs/inode.c b/kernel/fs/tracefs/inode.c
new file mode 100644
index 000000000..a43df11a1
--- /dev/null
+++ b/kernel/fs/tracefs/inode.c
@@ -0,0 +1,648 @@
+/*
+ *  inode.c - part of tracefs, a pseudo file system for activating tracing
+ *
+ * Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com>
+ *
+ *  Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License version
+ *	2 as published by the Free Software Foundation.
+ *
+ * tracefs is the file system that is used by the tracing infrastructure.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kobject.h>
+#include <linux/namei.h>
+#include <linux/tracefs.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+
+#define TRACEFS_DEFAULT_MODE	0700
+
+static struct vfsmount *tracefs_mount;
+static int tracefs_mount_count;
+static bool tracefs_registered;
+
+static ssize_t default_read_file(struct file *file, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	return 0;
+}
+
+static ssize_t default_write_file(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	return count;
+}
+
+static const struct file_operations tracefs_file_operations = {
+	.read =		default_read_file,
+	.write =	default_write_file,
+	.open =		simple_open,
+	.llseek =	noop_llseek,
+};
+
+static struct tracefs_dir_ops {
+	int (*mkdir)(const char *name);
+	int (*rmdir)(const char *name);
+} tracefs_ops;
+
+static char *get_dname(struct dentry *dentry)
+{
+	const char *dname;
+	char *name;
+	int len = dentry->d_name.len;
+
+	dname = dentry->d_name.name;
+	name = kmalloc(len + 1, GFP_KERNEL);
+	if (!name)
+		return NULL;
+	memcpy(name, dname, len);
+	name[len] = 0;
+	return name;
+}
+
+static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode)
+{
+	char *name;
+	int ret;
+
+	name = get_dname(dentry);
+	if (!name)
+		return -ENOMEM;
+
+	/*
+	 * The mkdir call can call the generic functions that create
+	 * the files within the tracefs system. It is up to the individual
+	 * mkdir routine to handle races.
+	 */
+	mutex_unlock(&inode->i_mutex);
+	ret = tracefs_ops.mkdir(name);
+	mutex_lock(&inode->i_mutex);
+
+	kfree(name);
+
+	return ret;
+}
+
+static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
+{
+	char *name;
+	int ret;
+
+	name = get_dname(dentry);
+	if (!name)
+		return -ENOMEM;
+
+	/*
+	 * The rmdir call can call the generic functions that create
+	 * the files within the tracefs system. It is up to the individual
+	 * rmdir routine to handle races.
+	 * This time we need to unlock not only the parent (inode) but
+	 * also the directory that is being deleted.
+	 */
+	mutex_unlock(&inode->i_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+
+	ret = tracefs_ops.rmdir(name);
+
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+	mutex_lock(&dentry->d_inode->i_mutex);
+
+	kfree(name);
+
+	return ret;
+}
+
+static const struct inode_operations tracefs_dir_inode_operations = {
+	.lookup		= simple_lookup,
+	.mkdir		= tracefs_syscall_mkdir,
+	.rmdir		= tracefs_syscall_rmdir,
+};
+
+static struct inode *tracefs_get_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	}
+	return inode;
+}
+
+struct tracefs_mount_opts {
+	kuid_t uid;
+	kgid_t gid;
+	umode_t mode;
+};
+
+enum {
+	Opt_uid,
+	Opt_gid,
+	Opt_mode,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_mode, "mode=%o"},
+	{Opt_err, NULL}
+};
+
+struct tracefs_fs_info {
+	struct tracefs_mount_opts mount_opts;
+};
+
+static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int token;
+	kuid_t uid;
+	kgid_t gid;
+	char *p;
+
+	opts->mode = TRACEFS_DEFAULT_MODE;
+
+	while ((p = strsep(&data, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(uid))
+				return -EINVAL;
+			opts->uid = uid;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(gid))
+				return -EINVAL;
+			opts->gid = gid;
+			break;
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->mode = option & S_IALLUGO;
+			break;
+		/*
+		 * We might like to report bad mount options here;
+		 * but traditionally tracefs has ignored all mount options
+		 */
+		}
+	}
+
+	return 0;
+}
+
+static int tracefs_apply_options(struct super_block *sb)
+{
+	struct tracefs_fs_info *fsi = sb->s_fs_info;
+	struct inode *inode = sb->s_root->d_inode;
+	struct tracefs_mount_opts *opts = &fsi->mount_opts;
+
+	inode->i_mode &= ~S_IALLUGO;
+	inode->i_mode |= opts->mode;
+
+	inode->i_uid = opts->uid;
+	inode->i_gid = opts->gid;
+
+	return 0;
+}
+
+static int tracefs_remount(struct super_block *sb, int *flags, char *data)
+{
+	int err;
+	struct tracefs_fs_info *fsi = sb->s_fs_info;
+
+	sync_filesystem(sb);
+	err = tracefs_parse_options(data, &fsi->mount_opts);
+	if (err)
+		goto fail;
+
+	tracefs_apply_options(sb);
+
+fail:
+	return err;
+}
+
+static int tracefs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct tracefs_fs_info *fsi = root->d_sb->s_fs_info;
+	struct tracefs_mount_opts *opts = &fsi->mount_opts;
+
+	if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, opts->uid));
+	if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, opts->gid));
+	if (opts->mode != TRACEFS_DEFAULT_MODE)
+		seq_printf(m, ",mode=%o", opts->mode);
+
+	return 0;
+}
+
+static const struct super_operations tracefs_super_operations = {
+	.statfs		= simple_statfs,
+	.remount_fs	= tracefs_remount,
+	.show_options	= tracefs_show_options,
+};
+
+static int trace_fill_super(struct super_block *sb, void *data, int silent)
+{
+	static struct tree_descr trace_files[] = {{""}};
+	struct tracefs_fs_info *fsi;
+	int err;
+
+	save_mount_options(sb, data);
+
+	fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL);
+	sb->s_fs_info = fsi;
+	if (!fsi) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = tracefs_parse_options(data, &fsi->mount_opts);
+	if (err)
+		goto fail;
+
+	err  =  simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
+	if (err)
+		goto fail;
+
+	sb->s_op = &tracefs_super_operations;
+
+	tracefs_apply_options(sb);
+
+	return 0;
+
+fail:
+	kfree(fsi);
+	sb->s_fs_info = NULL;
+	return err;
+}
+
+static struct dentry *trace_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data)
+{
+	return mount_single(fs_type, flags, data, trace_fill_super);
+}
+
+static struct file_system_type trace_fs_type = {
+	.owner =	THIS_MODULE,
+	.name =		"tracefs",
+	.mount =	trace_mount,
+	.kill_sb =	kill_litter_super,
+};
+MODULE_ALIAS_FS("tracefs");
+
+static struct dentry *start_creating(const char *name, struct dentry *parent)
+{
+	struct dentry *dentry;
+	int error;
+
+	pr_debug("tracefs: creating file '%s'\n",name);
+
+	error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
+			      &tracefs_mount_count);
+	if (error)
+		return ERR_PTR(error);
+
+	/* If the parent is not specified, we create it in the root.
+	 * We need the root dentry to do this, which is in the super
+	 * block. A pointer to that is in the struct vfsmount that we
+	 * have around.
+	 */
+	if (!parent)
+		parent = tracefs_mount->mnt_root;
+
+	mutex_lock(&parent->d_inode->i_mutex);
+	dentry = lookup_one_len(name, parent, strlen(name));
+	if (!IS_ERR(dentry) && dentry->d_inode) {
+		dput(dentry);
+		dentry = ERR_PTR(-EEXIST);
+	}
+	if (IS_ERR(dentry))
+		mutex_unlock(&parent->d_inode->i_mutex);
+	return dentry;
+}
+
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+	dput(dentry);
+	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+	return NULL;
+}
+
+static struct dentry *end_creating(struct dentry *dentry)
+{
+	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+	return dentry;
+}
+
+/**
+ * tracefs_create_file - create a file in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the tracefs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *        on.  The inode.i_private pointer will point to this value on
+ *        the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ *        this file.
+ *
+ * This is the basic "create a file" function for tracefs.  It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the tracefs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If tracefs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_file(const char *name, umode_t mode,
+				   struct dentry *parent, void *data,
+				   const struct file_operations *fops)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	if (!(mode & S_IFMT))
+		mode |= S_IFREG;
+	BUG_ON(!S_ISREG(mode));
+	dentry = start_creating(name, parent);
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = tracefs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = mode;
+	inode->i_fop = fops ? fops : &tracefs_file_operations;
+	inode->i_private = data;
+	d_instantiate(dentry, inode);
+	fsnotify_create(dentry->d_parent->d_inode, dentry);
+	return end_creating(dentry);
+}
+
+static struct dentry *__create_dir(const char *name, struct dentry *parent,
+				   const struct inode_operations *ops)
+{
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = tracefs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_op = ops;
+	inode->i_fop = &simple_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inc_nlink(inode);
+	d_instantiate(dentry, inode);
+	inc_nlink(dentry->d_parent->d_inode);
+	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+	return end_creating(dentry);
+}
+
+/**
+ * tracefs_create_dir - create a directory in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the directory to
+ *        create.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          directory will be created in the root of the tracefs filesystem.
+ *
+ * This function creates a directory in tracefs with the given name.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed. If an error occurs, %NULL will be returned.
+ *
+ * If tracing is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
+{
+	return __create_dir(name, parent, &simple_dir_inode_operations);
+}
+
+/**
+ * tracefs_create_instance_dir - create the tracing instances directory
+ * @name: The name of the instances directory to create
+ * @parent: The parent directory that the instances directory will exist
+ * @mkdir: The function to call when a mkdir is performed.
+ * @rmdir: The function to call when a rmdir is performed.
+ *
+ * Only one instances directory is allowed.
+ *
+ * The instances directory is special as it allows for mkdir and rmdir to
+ * to be done by userspace. When a mkdir or rmdir is performed, the inode
+ * locks are released and the methhods passed in (@mkdir and @rmdir) are
+ * called without locks and with the name of the directory being created
+ * within the instances directory.
+ *
+ * Returns the dentry of the instances directory.
+ */
+struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
+					  int (*mkdir)(const char *name),
+					  int (*rmdir)(const char *name))
+{
+	struct dentry *dentry;
+
+	/* Only allow one instance of the instances directory. */
+	if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
+		return NULL;
+
+	dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
+	if (!dentry)
+		return NULL;
+
+	tracefs_ops.mkdir = mkdir;
+	tracefs_ops.rmdir = rmdir;
+
+	return dentry;
+}
+
+static inline int tracefs_positive(struct dentry *dentry)
+{
+	return dentry->d_inode && !d_unhashed(dentry);
+}
+
+static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
+{
+	int ret = 0;
+
+	if (tracefs_positive(dentry)) {
+		if (dentry->d_inode) {
+			dget(dentry);
+			switch (dentry->d_inode->i_mode & S_IFMT) {
+			case S_IFDIR:
+				ret = simple_rmdir(parent->d_inode, dentry);
+				break;
+			default:
+				simple_unlink(parent->d_inode, dentry);
+				break;
+			}
+			if (!ret)
+				d_delete(dentry);
+			dput(dentry);
+		}
+	}
+	return ret;
+}
+
+/**
+ * tracefs_remove - removes a file or directory from the tracefs filesystem
+ * @dentry: a pointer to a the dentry of the file or directory to be
+ *          removed.
+ *
+ * This function removes a file or directory in tracefs that was previously
+ * created with a call to another tracefs function (like
+ * tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove(struct dentry *dentry)
+{
+	struct dentry *parent;
+	int ret;
+
+	if (IS_ERR_OR_NULL(dentry))
+		return;
+
+	parent = dentry->d_parent;
+	if (!parent || !parent->d_inode)
+		return;
+
+	mutex_lock(&parent->d_inode->i_mutex);
+	ret = __tracefs_remove(dentry, parent);
+	mutex_unlock(&parent->d_inode->i_mutex);
+	if (!ret)
+		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+}
+
+/**
+ * tracefs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to a the dentry of the directory to be removed.
+ *
+ * This function recursively removes a directory tree in tracefs that
+ * was previously created with a call to another tracefs function
+ * (like tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove_recursive(struct dentry *dentry)
+{
+	struct dentry *child, *parent;
+
+	if (IS_ERR_OR_NULL(dentry))
+		return;
+
+	parent = dentry->d_parent;
+	if (!parent || !parent->d_inode)
+		return;
+
+	parent = dentry;
+ down:
+	mutex_lock(&parent->d_inode->i_mutex);
+ loop:
+	/*
+	 * The parent->d_subdirs is protected by the d_lock. Outside that
+	 * lock, the child can be unlinked and set to be freed which can
+	 * use the d_u.d_child as the rcu head and corrupt this list.
+	 */
+	spin_lock(&parent->d_lock);
+	list_for_each_entry(child, &parent->d_subdirs, d_child) {
+		if (!tracefs_positive(child))
+			continue;
+
+		/* perhaps simple_empty(child) makes more sense */
+		if (!list_empty(&child->d_subdirs)) {
+			spin_unlock(&parent->d_lock);
+			mutex_unlock(&parent->d_inode->i_mutex);
+			parent = child;
+			goto down;
+		}
+
+		spin_unlock(&parent->d_lock);
+
+		if (!__tracefs_remove(child, parent))
+			simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+
+		/*
+		 * The parent->d_lock protects agaist child from unlinking
+		 * from d_subdirs. When releasing the parent->d_lock we can
+		 * no longer trust that the next pointer is valid.
+		 * Restart the loop. We'll skip this one with the
+		 * tracefs_positive() check.
+		 */
+		goto loop;
+	}
+	spin_unlock(&parent->d_lock);
+
+	mutex_unlock(&parent->d_inode->i_mutex);
+	child = parent;
+	parent = parent->d_parent;
+	mutex_lock(&parent->d_inode->i_mutex);
+
+	if (child != dentry)
+		/* go up */
+		goto loop;
+
+	if (!__tracefs_remove(child, parent))
+		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+	mutex_unlock(&parent->d_inode->i_mutex);
+}
+
+/**
+ * tracefs_initialized - Tells whether tracefs has been registered
+ */
+bool tracefs_initialized(void)
+{
+	return tracefs_registered;
+}
+
+static int __init tracefs_init(void)
+{
+	int retval;
+
+	retval = sysfs_create_mount_point(kernel_kobj, "tracing");
+	if (retval)
+		return -EINVAL;
+
+	retval = register_filesystem(&trace_fs_type);
+	if (!retval)
+		tracefs_registered = true;
+
+	return retval;
+}
+core_initcall(tracefs_init);
diff --git a/kernel/fs/ubifs/Kconfig b/kernel/fs/ubifs/Kconfig
new file mode 100644
index 000000000..ba66d5080
--- /dev/null
+++ b/kernel/fs/ubifs/Kconfig
@@ -0,0 +1,37 @@
+config UBIFS_FS
+	tristate "UBIFS file system support"
+	select CRC16
+	select CRC32
+	select CRYPTO if UBIFS_FS_ADVANCED_COMPR
+	select CRYPTO if UBIFS_FS_LZO
+	select CRYPTO if UBIFS_FS_ZLIB
+	select CRYPTO_LZO if UBIFS_FS_LZO
+	select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
+	depends on MTD_UBI
+	help
+	  UBIFS is a file system for flash devices which works on top of UBI.
+
+config UBIFS_FS_ADVANCED_COMPR
+	bool "Advanced compression options"
+	depends on UBIFS_FS
+	help
+	  This option allows to explicitly choose which compressions, if any,
+	  are enabled in UBIFS. Removing compressors means inability to read
+	  existing file systems.
+
+	  If unsure, say 'N'.
+
+config UBIFS_FS_LZO
+	bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR
+	depends on UBIFS_FS
+	default y
+	help
+	   LZO compressor is generally faster than zlib but compresses worse.
+	   Say 'Y' if unsure.
+
+config UBIFS_FS_ZLIB
+	bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR
+	depends on UBIFS_FS
+	default y
+	help
+	  Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
diff --git a/kernel/fs/ubifs/Makefile b/kernel/fs/ubifs/Makefile
new file mode 100644
index 000000000..2c6f0cb81
--- /dev/null
+++ b/kernel/fs/ubifs/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_UBIFS_FS) += ubifs.o
+
+ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
+ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
+ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
+ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
diff --git a/kernel/fs/ubifs/budget.c b/kernel/fs/ubifs/budget.c
new file mode 100644
index 000000000..11a11b32a
--- /dev/null
+++ b/kernel/fs/ubifs/budget.c
@@ -0,0 +1,730 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the budgeting sub-system which is responsible for UBIFS
+ * space management.
+ *
+ * Factors such as compression, wasted space at the ends of LEBs, space in other
+ * journal heads, the effect of updates on the index, and so on, make it
+ * impossible to accurately predict the amount of space needed. Consequently
+ * approximations are used.
+ */
+
+#include "ubifs.h"
+#include <linux/writeback.h>
+#include <linux/math64.h>
+
+/*
+ * When pessimistic budget calculations say that there is no enough space,
+ * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
+ * or committing. The below constant defines maximum number of times UBIFS
+ * repeats the operations.
+ */
+#define MAX_MKSPC_RETRIES 3
+
+/*
+ * The below constant defines amount of dirty pages which should be written
+ * back at when trying to shrink the liability.
+ */
+#define NR_TO_WRITE 16
+
+/**
+ * shrink_liability - write-back some dirty pages/inodes.
+ * @c: UBIFS file-system description object
+ * @nr_to_write: how many dirty pages to write-back
+ *
+ * This function shrinks UBIFS liability by means of writing back some amount
+ * of dirty inodes and their pages.
+ *
+ * Note, this function synchronizes even VFS inodes which are locked
+ * (@i_mutex) by the caller of the budgeting function, because write-back does
+ * not touch @i_mutex.
+ */
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
+{
+	down_read(&c->vfs_sb->s_umount);
+	writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
+	up_read(&c->vfs_sb->s_umount);
+}
+
+/**
+ * run_gc - run garbage collector.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs garbage collector to make some more free space. Returns
+ * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a
+ * negative error code in case of failure.
+ */
+static int run_gc(struct ubifs_info *c)
+{
+	int err, lnum;
+
+	/* Make some free space by garbage-collecting dirty space */
+	down_read(&c->commit_sem);
+	lnum = ubifs_garbage_collect(c, 1);
+	up_read(&c->commit_sem);
+	if (lnum < 0)
+		return lnum;
+
+	/* GC freed one LEB, return it to lprops */
+	dbg_budg("GC freed LEB %d", lnum);
+	err = ubifs_return_leb(c, lnum);
+	if (err)
+		return err;
+	return 0;
+}
+
+/**
+ * get_liability - calculate current liability.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns current UBIFS liability, i.e. the
+ * amount of bytes UBIFS has "promised" to write to the media.
+ */
+static long long get_liability(struct ubifs_info *c)
+{
+	long long liab;
+
+	spin_lock(&c->space_lock);
+	liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth;
+	spin_unlock(&c->space_lock);
+	return liab;
+}
+
+/**
+ * make_free_space - make more free space on the file-system.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called when an operation cannot be budgeted because there
+ * is supposedly no free space. But in most cases there is some free space:
+ *   o budgeting is pessimistic, so it always budgets more than it is actually
+ *     needed, so shrinking the liability is one way to make free space - the
+ *     cached data will take less space then it was budgeted for;
+ *   o GC may turn some dark space into free space (budgeting treats dark space
+ *     as not available);
+ *   o commit may free some LEB, i.e., turn freeable LEBs into free LEBs.
+ *
+ * So this function tries to do the above. Returns %-EAGAIN if some free space
+ * was presumably made and the caller has to re-try budgeting the operation.
+ * Returns %-ENOSPC if it couldn't do more free space, and other negative error
+ * codes on failures.
+ */
+static int make_free_space(struct ubifs_info *c)
+{
+	int err, retries = 0;
+	long long liab1, liab2;
+
+	do {
+		liab1 = get_liability(c);
+		/*
+		 * We probably have some dirty pages or inodes (liability), try
+		 * to write them back.
+		 */
+		dbg_budg("liability %lld, run write-back", liab1);
+		shrink_liability(c, NR_TO_WRITE);
+
+		liab2 = get_liability(c);
+		if (liab2 < liab1)
+			return -EAGAIN;
+
+		dbg_budg("new liability %lld (not shrunk)", liab2);
+
+		/* Liability did not shrink again, try GC */
+		dbg_budg("Run GC");
+		err = run_gc(c);
+		if (!err)
+			return -EAGAIN;
+
+		if (err != -EAGAIN && err != -ENOSPC)
+			/* Some real error happened */
+			return err;
+
+		dbg_budg("Run commit (retries %d)", retries);
+		err = ubifs_run_commit(c);
+		if (err)
+			return err;
+	} while (retries++ < MAX_MKSPC_RETRIES);
+
+	return -ENOSPC;
+}
+
+/**
+ * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns the number of LEBs which should be kept
+ * for index usage.
+ */
+int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
+{
+	int idx_lebs;
+	long long idx_size;
+
+	idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx;
+	/* And make sure we have thrice the index size of space reserved */
+	idx_size += idx_size << 1;
+	/*
+	 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
+	 * pair, nor similarly the two variables for the new index size, so we
+	 * have to do this costly 64-bit division on fast-path.
+	 */
+	idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
+	/*
+	 * The index head is not available for the in-the-gaps method, so add an
+	 * extra LEB to compensate.
+	 */
+	idx_lebs += 1;
+	if (idx_lebs < MIN_INDEX_LEBS)
+		idx_lebs = MIN_INDEX_LEBS;
+	return idx_lebs;
+}
+
+/**
+ * ubifs_calc_available - calculate available FS space.
+ * @c: UBIFS file-system description object
+ * @min_idx_lebs: minimum number of LEBs reserved for the index
+ *
+ * This function calculates and returns amount of FS space available for use.
+ */
+long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
+{
+	int subtract_lebs;
+	long long available;
+
+	available = c->main_bytes - c->lst.total_used;
+
+	/*
+	 * Now 'available' contains theoretically available flash space
+	 * assuming there is no index, so we have to subtract the space which
+	 * is reserved for the index.
+	 */
+	subtract_lebs = min_idx_lebs;
+
+	/* Take into account that GC reserves one LEB for its own needs */
+	subtract_lebs += 1;
+
+	/*
+	 * The GC journal head LEB is not really accessible. And since
+	 * different write types go to different heads, we may count only on
+	 * one head's space.
+	 */
+	subtract_lebs += c->jhead_cnt - 1;
+
+	/* We also reserve one LEB for deletions, which bypass budgeting */
+	subtract_lebs += 1;
+
+	available -= (long long)subtract_lebs * c->leb_size;
+
+	/* Subtract the dead space which is not available for use */
+	available -= c->lst.total_dead;
+
+	/*
+	 * Subtract dark space, which might or might not be usable - it depends
+	 * on the data which we have on the media and which will be written. If
+	 * this is a lot of uncompressed or not-compressible data, the dark
+	 * space cannot be used.
+	 */
+	available -= c->lst.total_dark;
+
+	/*
+	 * However, there is more dark space. The index may be bigger than
+	 * @min_idx_lebs. Those extra LEBs are assumed to be available, but
+	 * their dark space is not included in total_dark, so it is subtracted
+	 * here.
+	 */
+	if (c->lst.idx_lebs > min_idx_lebs) {
+		subtract_lebs = c->lst.idx_lebs - min_idx_lebs;
+		available -= subtract_lebs * c->dark_wm;
+	}
+
+	/* The calculations are rough and may end up with a negative number */
+	return available > 0 ? available : 0;
+}
+
+/**
+ * can_use_rp - check whether the user is allowed to use reserved pool.
+ * @c: UBIFS file-system description object
+ *
+ * UBIFS has so-called "reserved pool" which is flash space reserved
+ * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock.
+ * This function checks whether current user is allowed to use reserved pool.
+ * Returns %1  current user is allowed to use reserved pool and %0 otherwise.
+ */
+static int can_use_rp(struct ubifs_info *c)
+{
+	if (uid_eq(current_fsuid(), c->rp_uid) || capable(CAP_SYS_RESOURCE) ||
+	    (!gid_eq(c->rp_gid, GLOBAL_ROOT_GID) && in_group_p(c->rp_gid)))
+		return 1;
+	return 0;
+}
+
+/**
+ * do_budget_space - reserve flash space for index and data growth.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure UBIFS has enough free LEBs for index growth and
+ * data.
+ *
+ * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
+ * would take if it was consolidated and written to the flash. This guarantees
+ * that the "in-the-gaps" commit method always succeeds and UBIFS will always
+ * be able to commit dirty index. So this function basically adds amount of
+ * budgeted index space to the size of the current index, multiplies this by 3,
+ * and makes sure this does not exceed the amount of free LEBs.
+ *
+ * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables:
+ * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
+ *    be large, because UBIFS does not do any index consolidation as long as
+ *    there is free space. IOW, the index may take a lot of LEBs, but the LEBs
+ *    will contain a lot of dirt.
+ * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW,
+ *    the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs.
+ *
+ * This function returns zero in case of success, and %-ENOSPC in case of
+ * failure.
+ */
+static int do_budget_space(struct ubifs_info *c)
+{
+	long long outstanding, available;
+	int lebs, rsvd_idx_lebs, min_idx_lebs;
+
+	/* First budget index space */
+	min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+	/* Now 'min_idx_lebs' contains number of LEBs to reserve */
+	if (min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+	else
+		rsvd_idx_lebs = 0;
+
+	/*
+	 * The number of LEBs that are available to be used by the index is:
+	 *
+	 *    @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt -
+	 *    @c->lst.taken_empty_lebs
+	 *
+	 * @c->lst.empty_lebs are available because they are empty.
+	 * @c->freeable_cnt are available because they contain only free and
+	 * dirty space, @c->idx_gc_cnt are available because they are index
+	 * LEBs that have been garbage collected and are awaiting the commit
+	 * before they can be used. And the in-the-gaps method will grab these
+	 * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have
+	 * already been allocated for some purpose.
+	 *
+	 * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because
+	 * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they
+	 * are taken until after the commit).
+	 *
+	 * Note, @c->lst.taken_empty_lebs may temporarily be higher by one
+	 * because of the way we serialize LEB allocations and budgeting. See a
+	 * comment in 'ubifs_find_free_space()'.
+	 */
+	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+	       c->lst.taken_empty_lebs;
+	if (unlikely(rsvd_idx_lebs > lebs)) {
+		dbg_budg("out of indexing space: min_idx_lebs %d (old %d), rsvd_idx_lebs %d",
+			 min_idx_lebs, c->bi.min_idx_lebs, rsvd_idx_lebs);
+		return -ENOSPC;
+	}
+
+	available = ubifs_calc_available(c, min_idx_lebs);
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
+
+	if (unlikely(available < outstanding)) {
+		dbg_budg("out of data space: available %lld, outstanding %lld",
+			 available, outstanding);
+		return -ENOSPC;
+	}
+
+	if (available - outstanding <= c->rp_size && !can_use_rp(c))
+		return -ENOSPC;
+
+	c->bi.min_idx_lebs = min_idx_lebs;
+	return 0;
+}
+
+/**
+ * calc_idx_growth - calculate approximate index growth from budgeting request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ *
+ * For now we assume each new node adds one znode. But this is rather poor
+ * approximation, though.
+ */
+static int calc_idx_growth(const struct ubifs_info *c,
+			   const struct ubifs_budget_req *req)
+{
+	int znodes;
+
+	znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) +
+		 req->new_dent;
+	return znodes * c->max_idx_node_sz;
+}
+
+/**
+ * calc_data_growth - calculate approximate amount of new data from budgeting
+ * request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ */
+static int calc_data_growth(const struct ubifs_info *c,
+			    const struct ubifs_budget_req *req)
+{
+	int data_growth;
+
+	data_growth = req->new_ino  ? c->bi.inode_budget : 0;
+	if (req->new_page)
+		data_growth += c->bi.page_budget;
+	if (req->new_dent)
+		data_growth += c->bi.dent_budget;
+	data_growth += req->new_ino_d;
+	return data_growth;
+}
+
+/**
+ * calc_dd_growth - calculate approximate amount of data which makes other data
+ * dirty from budgeting request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ */
+static int calc_dd_growth(const struct ubifs_info *c,
+			  const struct ubifs_budget_req *req)
+{
+	int dd_growth;
+
+	dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
+
+	if (req->dirtied_ino)
+		dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
+	if (req->mod_dent)
+		dd_growth += c->bi.dent_budget;
+	dd_growth += req->dirtied_ino_d;
+	return dd_growth;
+}
+
+/**
+ * ubifs_budget_space - ensure there is enough space to complete an operation.
+ * @c: UBIFS file-system description object
+ * @req: budget request
+ *
+ * This function allocates budget for an operation. It uses pessimistic
+ * approximation of how much flash space the operation needs. The goal of this
+ * function is to make sure UBIFS always has flash space to flush all dirty
+ * pages, dirty inodes, and dirty znodes (liability). This function may force
+ * commit, garbage-collection or write-back. Returns zero in case of success,
+ * %-ENOSPC if there is no free space and other negative error codes in case of
+ * failures.
+ */
+int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
+{
+	int err, idx_growth, data_growth, dd_growth, retried = 0;
+
+	ubifs_assert(req->new_page <= 1);
+	ubifs_assert(req->dirtied_page <= 1);
+	ubifs_assert(req->new_dent <= 1);
+	ubifs_assert(req->mod_dent <= 1);
+	ubifs_assert(req->new_ino <= 1);
+	ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
+	ubifs_assert(req->dirtied_ino <= 4);
+	ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+	ubifs_assert(!(req->new_ino_d & 7));
+	ubifs_assert(!(req->dirtied_ino_d & 7));
+
+	data_growth = calc_data_growth(c, req);
+	dd_growth = calc_dd_growth(c, req);
+	if (!data_growth && !dd_growth)
+		return 0;
+	idx_growth = calc_idx_growth(c, req);
+
+again:
+	spin_lock(&c->space_lock);
+	ubifs_assert(c->bi.idx_growth >= 0);
+	ubifs_assert(c->bi.data_growth >= 0);
+	ubifs_assert(c->bi.dd_growth >= 0);
+
+	if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) {
+		dbg_budg("no space");
+		spin_unlock(&c->space_lock);
+		return -ENOSPC;
+	}
+
+	c->bi.idx_growth += idx_growth;
+	c->bi.data_growth += data_growth;
+	c->bi.dd_growth += dd_growth;
+
+	err = do_budget_space(c);
+	if (likely(!err)) {
+		req->idx_growth = idx_growth;
+		req->data_growth = data_growth;
+		req->dd_growth = dd_growth;
+		spin_unlock(&c->space_lock);
+		return 0;
+	}
+
+	/* Restore the old values */
+	c->bi.idx_growth -= idx_growth;
+	c->bi.data_growth -= data_growth;
+	c->bi.dd_growth -= dd_growth;
+	spin_unlock(&c->space_lock);
+
+	if (req->fast) {
+		dbg_budg("no space for fast budgeting");
+		return err;
+	}
+
+	err = make_free_space(c);
+	cond_resched();
+	if (err == -EAGAIN) {
+		dbg_budg("try again");
+		goto again;
+	} else if (err == -ENOSPC) {
+		if (!retried) {
+			retried = 1;
+			dbg_budg("-ENOSPC, but anyway try once again");
+			goto again;
+		}
+		dbg_budg("FS is full, -ENOSPC");
+		c->bi.nospace = 1;
+		if (can_use_rp(c) || c->rp_size == 0)
+			c->bi.nospace_rp = 1;
+		smp_wmb();
+	} else
+		ubifs_err(c, "cannot budget space, error %d", err);
+	return err;
+}
+
+/**
+ * ubifs_release_budget - release budgeted free space.
+ * @c: UBIFS file-system description object
+ * @req: budget request
+ *
+ * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
+ * since the index changes (which were budgeted for in @req->idx_growth) will
+ * only be written to the media on commit, this function moves the index budget
+ * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed
+ * by the commit operation.
+ */
+void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
+{
+	ubifs_assert(req->new_page <= 1);
+	ubifs_assert(req->dirtied_page <= 1);
+	ubifs_assert(req->new_dent <= 1);
+	ubifs_assert(req->mod_dent <= 1);
+	ubifs_assert(req->new_ino <= 1);
+	ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
+	ubifs_assert(req->dirtied_ino <= 4);
+	ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+	ubifs_assert(!(req->new_ino_d & 7));
+	ubifs_assert(!(req->dirtied_ino_d & 7));
+	if (!req->recalculate) {
+		ubifs_assert(req->idx_growth >= 0);
+		ubifs_assert(req->data_growth >= 0);
+		ubifs_assert(req->dd_growth >= 0);
+	}
+
+	if (req->recalculate) {
+		req->data_growth = calc_data_growth(c, req);
+		req->dd_growth = calc_dd_growth(c, req);
+		req->idx_growth = calc_idx_growth(c, req);
+	}
+
+	if (!req->data_growth && !req->dd_growth)
+		return;
+
+	c->bi.nospace = c->bi.nospace_rp = 0;
+	smp_wmb();
+
+	spin_lock(&c->space_lock);
+	c->bi.idx_growth -= req->idx_growth;
+	c->bi.uncommitted_idx += req->idx_growth;
+	c->bi.data_growth -= req->data_growth;
+	c->bi.dd_growth -= req->dd_growth;
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+	ubifs_assert(c->bi.idx_growth >= 0);
+	ubifs_assert(c->bi.data_growth >= 0);
+	ubifs_assert(c->bi.dd_growth >= 0);
+	ubifs_assert(c->bi.min_idx_lebs < c->main_lebs);
+	ubifs_assert(!(c->bi.idx_growth & 7));
+	ubifs_assert(!(c->bi.data_growth & 7));
+	ubifs_assert(!(c->bi.dd_growth & 7));
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_convert_page_budget - convert budget of a new page.
+ * @c: UBIFS file-system description object
+ *
+ * This function converts budget which was allocated for a new page of data to
+ * the budget of changing an existing page of data. The latter is smaller than
+ * the former, so this function only does simple re-calculation and does not
+ * involve any write-back.
+ */
+void ubifs_convert_page_budget(struct ubifs_info *c)
+{
+	spin_lock(&c->space_lock);
+	/* Release the index growth reservation */
+	c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	/* Release the data growth reservation */
+	c->bi.data_growth -= c->bi.page_budget;
+	/* Increase the dirty data growth reservation instead */
+	c->bi.dd_growth += c->bi.page_budget;
+	/* And re-calculate the indexing space reservation */
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_release_dirty_inode_budget - release dirty inode budget.
+ * @c: UBIFS file-system description object
+ * @ui: UBIFS inode to release the budget for
+ *
+ * This function releases budget corresponding to a dirty inode. It is usually
+ * called when after the inode has been written to the media and marked as
+ * clean. It also causes the "no space" flags to be cleared.
+ */
+void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
+				      struct ubifs_inode *ui)
+{
+	struct ubifs_budget_req req;
+
+	memset(&req, 0, sizeof(struct ubifs_budget_req));
+	/* The "no space" flags will be cleared because dd_growth is > 0 */
+	req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8);
+	ubifs_release_budget(c, &req);
+}
+
+/**
+ * ubifs_reported_space - calculate reported free space.
+ * @c: the UBIFS file-system description object
+ * @free: amount of free space
+ *
+ * This function calculates amount of free space which will be reported to
+ * user-space. User-space application tend to expect that if the file-system
+ * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
+ * are able to write a file of size N. UBIFS attaches node headers to each data
+ * node and it has to write indexing nodes as well. This introduces additional
+ * overhead, and UBIFS has to report slightly less free space to meet the above
+ * expectations.
+ *
+ * This function assumes free space is made up of uncompressed data nodes and
+ * full index nodes (one per data node, tripled because we always allow enough
+ * space to write the index thrice).
+ *
+ * Note, the calculation is pessimistic, which means that most of the time
+ * UBIFS reports less space than it actually has.
+ */
+long long ubifs_reported_space(const struct ubifs_info *c, long long free)
+{
+	int divisor, factor, f;
+
+	/*
+	 * Reported space size is @free * X, where X is UBIFS block size
+	 * divided by UBIFS block size + all overhead one data block
+	 * introduces. The overhead is the node header + indexing overhead.
+	 *
+	 * Indexing overhead calculations are based on the following formula:
+	 * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
+	 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
+	 * as less than maximum fanout, we assume that each data node
+	 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
+	 * Note, the multiplier 3 is because UBIFS reserves thrice as more space
+	 * for the index.
+	 */
+	f = c->fanout > 3 ? c->fanout >> 1 : 2;
+	factor = UBIFS_BLOCK_SIZE;
+	divisor = UBIFS_MAX_DATA_NODE_SZ;
+	divisor += (c->max_idx_node_sz * 3) / (f - 1);
+	free *= factor;
+	return div_u64(free, divisor);
+}
+
+/**
+ * ubifs_get_free_space_nolock - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates amount of free space to report to user-space.
+ *
+ * Because UBIFS may introduce substantial overhead (the index, node headers,
+ * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
+ * free flash space it has (well, because not all dirty space is reclaimable,
+ * UBIFS does not actually know the real amount). If UBIFS did so, it would
+ * bread user expectations about what free space is. Users seem to accustomed
+ * to assume that if the file-system reports N bytes of free space, they would
+ * be able to fit a file of N bytes to the FS. This almost works for
+ * traditional file-systems, because they have way less overhead than UBIFS.
+ * So, to keep users happy, UBIFS tries to take the overhead into account.
+ */
+long long ubifs_get_free_space_nolock(struct ubifs_info *c)
+{
+	int rsvd_idx_lebs, lebs;
+	long long available, outstanding, free;
+
+	ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
+	available = ubifs_calc_available(c, c->bi.min_idx_lebs);
+
+	/*
+	 * When reporting free space to user-space, UBIFS guarantees that it is
+	 * possible to write a file of free space size. This means that for
+	 * empty LEBs we may use more precise calculations than
+	 * 'ubifs_calc_available()' is using. Namely, we know that in empty
+	 * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
+	 * Thus, amend the available space.
+	 *
+	 * Note, the calculations below are similar to what we have in
+	 * 'do_budget_space()', so refer there for comments.
+	 */
+	if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
+	else
+		rsvd_idx_lebs = 0;
+	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+	       c->lst.taken_empty_lebs;
+	lebs -= rsvd_idx_lebs;
+	available += lebs * (c->dark_wm - c->leb_overhead);
+
+	if (available > outstanding)
+		free = ubifs_reported_space(c, available - outstanding);
+	else
+		free = 0;
+	return free;
+}
+
+/**
+ * ubifs_get_free_space - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns amount of free space to report to
+ * user-space.
+ */
+long long ubifs_get_free_space(struct ubifs_info *c)
+{
+	long long free;
+
+	spin_lock(&c->space_lock);
+	free = ubifs_get_free_space_nolock(c);
+	spin_unlock(&c->space_lock);
+
+	return free;
+}
diff --git a/kernel/fs/ubifs/commit.c b/kernel/fs/ubifs/commit.c
new file mode 100644
index 000000000..63f566199
--- /dev/null
+++ b/kernel/fs/ubifs/commit.c
@@ -0,0 +1,734 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements functions that manage the running of the commit process.
+ * Each affected module has its own functions to accomplish their part in the
+ * commit and those functions are called here.
+ *
+ * The commit is the process whereby all updates to the index and LEB properties
+ * are written out together and the journal becomes empty. This keeps the
+ * file system consistent - at all times the state can be recreated by reading
+ * the index and LEB properties and then replaying the journal.
+ *
+ * The commit is split into two parts named "commit start" and "commit end".
+ * During commit start, the commit process has exclusive access to the journal
+ * by holding the commit semaphore down for writing. As few I/O operations as
+ * possible are performed during commit start, instead the nodes that are to be
+ * written are merely identified. During commit end, the commit semaphore is no
+ * longer held and the journal is again in operation, allowing users to continue
+ * to use the file system while the bulk of the commit I/O is performed. The
+ * purpose of this two-step approach is to prevent the commit from causing any
+ * latency blips. Note that in any case, the commit does not prevent lookups
+ * (as permitted by the TNC mutex), or access to VFS data structures e.g. page
+ * cache.
+ */
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include "ubifs.h"
+
+/*
+ * nothing_to_commit - check if there is nothing to commit.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which checks if there is anything to commit. It is
+ * used as an optimization to avoid starting the commit if it is not really
+ * necessary. Indeed, the commit operation always assumes flash I/O (e.g.,
+ * writing the commit start node to the log), and it is better to avoid doing
+ * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is
+ * nothing to commit, it is more optimal to avoid any flash I/O.
+ *
+ * This function has to be called with @c->commit_sem locked for writing -
+ * this function does not take LPT/TNC locks because the @c->commit_sem
+ * guarantees that we have exclusive access to the TNC and LPT data structures.
+ *
+ * This function returns %1 if there is nothing to commit and %0 otherwise.
+ */
+static int nothing_to_commit(struct ubifs_info *c)
+{
+	/*
+	 * During mounting or remounting from R/O mode to R/W mode we may
+	 * commit for various recovery-related reasons.
+	 */
+	if (c->mounting || c->remounting_rw)
+		return 0;
+
+	/*
+	 * If the root TNC node is dirty, we definitely have something to
+	 * commit.
+	 */
+	if (c->zroot.znode && ubifs_zn_dirty(c->zroot.znode))
+		return 0;
+
+	/*
+	 * Even though the TNC is clean, the LPT tree may have dirty nodes. For
+	 * example, this may happen if the budgeting subsystem invoked GC to
+	 * make some free space, and the GC found an LEB with only dirty and
+	 * free space. In this case GC would just change the lprops of this
+	 * LEB (by turning all space into free space) and unmap it.
+	 */
+	if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
+		return 0;
+
+	ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
+	ubifs_assert(c->dirty_pn_cnt == 0);
+	ubifs_assert(c->dirty_nn_cnt == 0);
+
+	return 1;
+}
+
+/**
+ * do_commit - commit the journal.
+ * @c: UBIFS file-system description object
+ *
+ * This function implements UBIFS commit. It has to be called with commit lock
+ * locked. Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int do_commit(struct ubifs_info *c)
+{
+	int err, new_ltail_lnum, old_ltail_lnum, i;
+	struct ubifs_zbranch zroot;
+	struct ubifs_lp_stats lst;
+
+	dbg_cmt("start");
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+
+	if (c->ro_error) {
+		err = -EROFS;
+		goto out_up;
+	}
+
+	if (nothing_to_commit(c)) {
+		up_write(&c->commit_sem);
+		err = 0;
+		goto out_cancel;
+	}
+
+	/* Sync all write buffers (necessary for recovery) */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			goto out_up;
+	}
+
+	c->cmt_no += 1;
+	err = ubifs_gc_start_commit(c);
+	if (err)
+		goto out_up;
+	err = dbg_check_lprops(c);
+	if (err)
+		goto out_up;
+	err = ubifs_log_start_commit(c, &new_ltail_lnum);
+	if (err)
+		goto out_up;
+	err = ubifs_tnc_start_commit(c, &zroot);
+	if (err)
+		goto out_up;
+	err = ubifs_lpt_start_commit(c);
+	if (err)
+		goto out_up;
+	err = ubifs_orphan_start_commit(c);
+	if (err)
+		goto out_up;
+
+	ubifs_get_lp_stats(c, &lst);
+
+	up_write(&c->commit_sem);
+
+	err = ubifs_tnc_end_commit(c);
+	if (err)
+		goto out;
+	err = ubifs_lpt_end_commit(c);
+	if (err)
+		goto out;
+	err = ubifs_orphan_end_commit(c);
+	if (err)
+		goto out;
+	err = dbg_check_old_index(c, &zroot);
+	if (err)
+		goto out;
+
+	c->mst_node->cmt_no      = cpu_to_le64(c->cmt_no);
+	c->mst_node->log_lnum    = cpu_to_le32(new_ltail_lnum);
+	c->mst_node->root_lnum   = cpu_to_le32(zroot.lnum);
+	c->mst_node->root_offs   = cpu_to_le32(zroot.offs);
+	c->mst_node->root_len    = cpu_to_le32(zroot.len);
+	c->mst_node->ihead_lnum  = cpu_to_le32(c->ihead_lnum);
+	c->mst_node->ihead_offs  = cpu_to_le32(c->ihead_offs);
+	c->mst_node->index_size  = cpu_to_le64(c->bi.old_idx_sz);
+	c->mst_node->lpt_lnum    = cpu_to_le32(c->lpt_lnum);
+	c->mst_node->lpt_offs    = cpu_to_le32(c->lpt_offs);
+	c->mst_node->nhead_lnum  = cpu_to_le32(c->nhead_lnum);
+	c->mst_node->nhead_offs  = cpu_to_le32(c->nhead_offs);
+	c->mst_node->ltab_lnum   = cpu_to_le32(c->ltab_lnum);
+	c->mst_node->ltab_offs   = cpu_to_le32(c->ltab_offs);
+	c->mst_node->lsave_lnum  = cpu_to_le32(c->lsave_lnum);
+	c->mst_node->lsave_offs  = cpu_to_le32(c->lsave_offs);
+	c->mst_node->lscan_lnum  = cpu_to_le32(c->lscan_lnum);
+	c->mst_node->empty_lebs  = cpu_to_le32(lst.empty_lebs);
+	c->mst_node->idx_lebs    = cpu_to_le32(lst.idx_lebs);
+	c->mst_node->total_free  = cpu_to_le64(lst.total_free);
+	c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty);
+	c->mst_node->total_used  = cpu_to_le64(lst.total_used);
+	c->mst_node->total_dead  = cpu_to_le64(lst.total_dead);
+	c->mst_node->total_dark  = cpu_to_le64(lst.total_dark);
+	if (c->no_orphs)
+		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+	else
+		c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
+
+	old_ltail_lnum = c->ltail_lnum;
+	err = ubifs_log_end_commit(c, new_ltail_lnum);
+	if (err)
+		goto out;
+
+	err = ubifs_log_post_commit(c, old_ltail_lnum);
+	if (err)
+		goto out;
+	err = ubifs_gc_end_commit(c);
+	if (err)
+		goto out;
+	err = ubifs_lpt_post_commit(c);
+	if (err)
+		goto out;
+
+out_cancel:
+	spin_lock(&c->cs_lock);
+	c->cmt_state = COMMIT_RESTING;
+	wake_up(&c->cmt_wq);
+	dbg_cmt("commit end");
+	spin_unlock(&c->cs_lock);
+	return 0;
+
+out_up:
+	up_write(&c->commit_sem);
+out:
+	ubifs_err(c, "commit failed, error %d", err);
+	spin_lock(&c->cs_lock);
+	c->cmt_state = COMMIT_BROKEN;
+	wake_up(&c->cmt_wq);
+	spin_unlock(&c->cs_lock);
+	ubifs_ro_mode(c, err);
+	return err;
+}
+
+/**
+ * run_bg_commit - run background commit if it is needed.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs background commit if it is needed. Returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+static int run_bg_commit(struct ubifs_info *c)
+{
+	spin_lock(&c->cs_lock);
+	/*
+	 * Run background commit only if background commit was requested or if
+	 * commit is required.
+	 */
+	if (c->cmt_state != COMMIT_BACKGROUND &&
+	    c->cmt_state != COMMIT_REQUIRED)
+		goto out;
+	spin_unlock(&c->cs_lock);
+
+	down_write(&c->commit_sem);
+	spin_lock(&c->cs_lock);
+	if (c->cmt_state == COMMIT_REQUIRED)
+		c->cmt_state = COMMIT_RUNNING_REQUIRED;
+	else if (c->cmt_state == COMMIT_BACKGROUND)
+		c->cmt_state = COMMIT_RUNNING_BACKGROUND;
+	else
+		goto out_cmt_unlock;
+	spin_unlock(&c->cs_lock);
+
+	return do_commit(c);
+
+out_cmt_unlock:
+	up_write(&c->commit_sem);
+out:
+	spin_unlock(&c->cs_lock);
+	return 0;
+}
+
+/**
+ * ubifs_bg_thread - UBIFS background thread function.
+ * @info: points to the file-system description object
+ *
+ * This function implements various file-system background activities:
+ * o when a write-buffer timer expires it synchronizes the appropriate
+ *   write-buffer;
+ * o when the journal is about to be full, it starts in-advance commit.
+ *
+ * Note, other stuff like background garbage collection may be added here in
+ * future.
+ */
+int ubifs_bg_thread(void *info)
+{
+	int err;
+	struct ubifs_info *c = info;
+
+	ubifs_msg(c, "background thread \"%s\" started, PID %d",
+		  c->bgt_name, current->pid);
+	set_freezable();
+
+	while (1) {
+		if (kthread_should_stop())
+			break;
+
+		if (try_to_freeze())
+			continue;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		/* Check if there is something to do */
+		if (!c->need_bgt) {
+			/*
+			 * Nothing prevents us from going sleep now and
+			 * be never woken up and block the task which
+			 * could wait in 'kthread_stop()' forever.
+			 */
+			if (kthread_should_stop())
+				break;
+			schedule();
+			continue;
+		} else
+			__set_current_state(TASK_RUNNING);
+
+		c->need_bgt = 0;
+		err = ubifs_bg_wbufs_sync(c);
+		if (err)
+			ubifs_ro_mode(c, err);
+
+		run_bg_commit(c);
+		cond_resched();
+	}
+
+	ubifs_msg(c, "background thread \"%s\" stops", c->bgt_name);
+	return 0;
+}
+
+/**
+ * ubifs_commit_required - set commit state to "required".
+ * @c: UBIFS file-system description object
+ *
+ * This function is called if a commit is required but cannot be done from the
+ * calling function, so it is just flagged instead.
+ */
+void ubifs_commit_required(struct ubifs_info *c)
+{
+	spin_lock(&c->cs_lock);
+	switch (c->cmt_state) {
+	case COMMIT_RESTING:
+	case COMMIT_BACKGROUND:
+		dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
+			dbg_cstate(COMMIT_REQUIRED));
+		c->cmt_state = COMMIT_REQUIRED;
+		break;
+	case COMMIT_RUNNING_BACKGROUND:
+		dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
+			dbg_cstate(COMMIT_RUNNING_REQUIRED));
+		c->cmt_state = COMMIT_RUNNING_REQUIRED;
+		break;
+	case COMMIT_REQUIRED:
+	case COMMIT_RUNNING_REQUIRED:
+	case COMMIT_BROKEN:
+		break;
+	}
+	spin_unlock(&c->cs_lock);
+}
+
+/**
+ * ubifs_request_bg_commit - notify the background thread to do a commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called if the journal is full enough to make a commit
+ * worthwhile, so background thread is kicked to start it.
+ */
+void ubifs_request_bg_commit(struct ubifs_info *c)
+{
+	spin_lock(&c->cs_lock);
+	if (c->cmt_state == COMMIT_RESTING) {
+		dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
+			dbg_cstate(COMMIT_BACKGROUND));
+		c->cmt_state = COMMIT_BACKGROUND;
+		spin_unlock(&c->cs_lock);
+		ubifs_wake_up_bgt(c);
+	} else
+		spin_unlock(&c->cs_lock);
+}
+
+/**
+ * wait_for_commit - wait for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function sleeps until the commit operation is no longer running.
+ */
+static int wait_for_commit(struct ubifs_info *c)
+{
+	dbg_cmt("pid %d goes sleep", current->pid);
+
+	/*
+	 * The following sleeps if the condition is false, and will be woken
+	 * when the commit ends. It is possible, although very unlikely, that we
+	 * will wake up and see the subsequent commit running, rather than the
+	 * one we were waiting for, and go back to sleep.  However, we will be
+	 * woken again, so there is no danger of sleeping forever.
+	 */
+	wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND &&
+			      c->cmt_state != COMMIT_RUNNING_REQUIRED);
+	dbg_cmt("commit finished, pid %d woke up", current->pid);
+	return 0;
+}
+
+/**
+ * ubifs_run_commit - run or wait for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs commit and returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+int ubifs_run_commit(struct ubifs_info *c)
+{
+	int err = 0;
+
+	spin_lock(&c->cs_lock);
+	if (c->cmt_state == COMMIT_BROKEN) {
+		err = -EROFS;
+		goto out;
+	}
+
+	if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
+		/*
+		 * We set the commit state to 'running required' to indicate
+		 * that we want it to complete as quickly as possible.
+		 */
+		c->cmt_state = COMMIT_RUNNING_REQUIRED;
+
+	if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+		spin_unlock(&c->cs_lock);
+		return wait_for_commit(c);
+	}
+	spin_unlock(&c->cs_lock);
+
+	/* Ok, the commit is indeed needed */
+
+	down_write(&c->commit_sem);
+	spin_lock(&c->cs_lock);
+	/*
+	 * Since we unlocked 'c->cs_lock', the state may have changed, so
+	 * re-check it.
+	 */
+	if (c->cmt_state == COMMIT_BROKEN) {
+		err = -EROFS;
+		goto out_cmt_unlock;
+	}
+
+	if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
+		c->cmt_state = COMMIT_RUNNING_REQUIRED;
+
+	if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+		up_write(&c->commit_sem);
+		spin_unlock(&c->cs_lock);
+		return wait_for_commit(c);
+	}
+	c->cmt_state = COMMIT_RUNNING_REQUIRED;
+	spin_unlock(&c->cs_lock);
+
+	err = do_commit(c);
+	return err;
+
+out_cmt_unlock:
+	up_write(&c->commit_sem);
+out:
+	spin_unlock(&c->cs_lock);
+	return err;
+}
+
+/**
+ * ubifs_gc_should_commit - determine if it is time for GC to run commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called by garbage collection to determine if commit should
+ * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal
+ * is full enough to start commit, this function returns true. It is not
+ * absolutely necessary to commit yet, but it feels like this should be better
+ * then to keep doing GC. This function returns %1 if GC has to initiate commit
+ * and %0 if not.
+ */
+int ubifs_gc_should_commit(struct ubifs_info *c)
+{
+	int ret = 0;
+
+	spin_lock(&c->cs_lock);
+	if (c->cmt_state == COMMIT_BACKGROUND) {
+		dbg_cmt("commit required now");
+		c->cmt_state = COMMIT_REQUIRED;
+	} else
+		dbg_cmt("commit not requested");
+	if (c->cmt_state == COMMIT_REQUIRED)
+		ret = 1;
+	spin_unlock(&c->cs_lock);
+	return ret;
+}
+
+/*
+ * Everything below is related to debugging.
+ */
+
+/**
+ * struct idx_node - hold index nodes during index tree traversal.
+ * @list: list
+ * @iip: index in parent (slot number of this indexing node in the parent
+ *       indexing node)
+ * @upper_key: all keys in this indexing node have to be less or equivalent to
+ *             this key
+ * @idx: index node (8-byte aligned because all node structures must be 8-byte
+ *       aligned)
+ */
+struct idx_node {
+	struct list_head list;
+	int iip;
+	union ubifs_key upper_key;
+	struct ubifs_idx_node idx __aligned(8);
+};
+
+/**
+ * dbg_old_index_check_init - get information for the next old index check.
+ * @c: UBIFS file-system description object
+ * @zroot: root of the index
+ *
+ * This function records information about the index that will be needed for the
+ * next old index check i.e. 'dbg_check_old_index()'.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+	struct ubifs_idx_node *idx;
+	int lnum, offs, len, err = 0;
+	struct ubifs_debug_info *d = c->dbg;
+
+	d->old_zroot = *zroot;
+	lnum = d->old_zroot.lnum;
+	offs = d->old_zroot.offs;
+	len = d->old_zroot.len;
+
+	idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
+	if (!idx)
+		return -ENOMEM;
+
+	err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
+	if (err)
+		goto out;
+
+	d->old_zroot_level = le16_to_cpu(idx->level);
+	d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
+out:
+	kfree(idx);
+	return err;
+}
+
+/**
+ * dbg_check_old_index - check the old copy of the index.
+ * @c: UBIFS file-system description object
+ * @zroot: root of the new index
+ *
+ * In order to be able to recover from an unclean unmount, a complete copy of
+ * the index must exist on flash. This is the "old" index. The commit process
+ * must write the "new" index to flash without overwriting or destroying any
+ * part of the old index. This function is run at commit end in order to check
+ * that the old index does indeed exist completely intact.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+	int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
+	int first = 1, iip;
+	struct ubifs_debug_info *d = c->dbg;
+	union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
+	unsigned long long uninitialized_var(last_sqnum);
+	struct ubifs_idx_node *idx;
+	struct list_head list;
+	struct idx_node *i;
+	size_t sz;
+
+	if (!dbg_is_chk_index(c))
+		return 0;
+
+	INIT_LIST_HEAD(&list);
+
+	sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) -
+	     UBIFS_IDX_NODE_SZ;
+
+	/* Start at the old zroot */
+	lnum = d->old_zroot.lnum;
+	offs = d->old_zroot.offs;
+	len = d->old_zroot.len;
+	iip = 0;
+
+	/*
+	 * Traverse the index tree preorder depth-first i.e. do a node and then
+	 * its subtrees from left to right.
+	 */
+	while (1) {
+		struct ubifs_branch *br;
+
+		/* Get the next index node */
+		i = kmalloc(sz, GFP_NOFS);
+		if (!i) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		i->iip = iip;
+		/* Keep the index nodes on our path in a linked list */
+		list_add_tail(&i->list, &list);
+		/* Read the index node */
+		idx = &i->idx;
+		err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
+		if (err)
+			goto out_free;
+		/* Validate index node */
+		child_cnt = le16_to_cpu(idx->child_cnt);
+		if (child_cnt < 1 || child_cnt > c->fanout) {
+			err = 1;
+			goto out_dump;
+		}
+		if (first) {
+			first = 0;
+			/* Check root level and sqnum */
+			if (le16_to_cpu(idx->level) != d->old_zroot_level) {
+				err = 2;
+				goto out_dump;
+			}
+			if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) {
+				err = 3;
+				goto out_dump;
+			}
+			/* Set last values as though root had a parent */
+			last_level = le16_to_cpu(idx->level) + 1;
+			last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1;
+			key_read(c, ubifs_idx_key(c, idx), &lower_key);
+			highest_ino_key(c, &upper_key, INUM_WATERMARK);
+		}
+		key_copy(c, &upper_key, &i->upper_key);
+		if (le16_to_cpu(idx->level) != last_level - 1) {
+			err = 3;
+			goto out_dump;
+		}
+		/*
+		 * The index is always written bottom up hence a child's sqnum
+		 * is always less than the parents.
+		 */
+		if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) {
+			err = 4;
+			goto out_dump;
+		}
+		/* Check key range */
+		key_read(c, ubifs_idx_key(c, idx), &l_key);
+		br = ubifs_idx_branch(c, idx, child_cnt - 1);
+		key_read(c, &br->key, &u_key);
+		if (keys_cmp(c, &lower_key, &l_key) > 0) {
+			err = 5;
+			goto out_dump;
+		}
+		if (keys_cmp(c, &upper_key, &u_key) < 0) {
+			err = 6;
+			goto out_dump;
+		}
+		if (keys_cmp(c, &upper_key, &u_key) == 0)
+			if (!is_hash_key(c, &u_key)) {
+				err = 7;
+				goto out_dump;
+			}
+		/* Go to next index node */
+		if (le16_to_cpu(idx->level) == 0) {
+			/* At the bottom, so go up until can go right */
+			while (1) {
+				/* Drop the bottom of the list */
+				list_del(&i->list);
+				kfree(i);
+				/* No more list means we are done */
+				if (list_empty(&list))
+					goto out;
+				/* Look at the new bottom */
+				i = list_entry(list.prev, struct idx_node,
+					       list);
+				idx = &i->idx;
+				/* Can we go right */
+				if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
+					iip = iip + 1;
+					break;
+				} else
+					/* Nope, so go up again */
+					iip = i->iip;
+			}
+		} else
+			/* Go down left */
+			iip = 0;
+		/*
+		 * We have the parent in 'idx' and now we set up for reading the
+		 * child pointed to by slot 'iip'.
+		 */
+		last_level = le16_to_cpu(idx->level);
+		last_sqnum = le64_to_cpu(idx->ch.sqnum);
+		br = ubifs_idx_branch(c, idx, iip);
+		lnum = le32_to_cpu(br->lnum);
+		offs = le32_to_cpu(br->offs);
+		len = le32_to_cpu(br->len);
+		key_read(c, &br->key, &lower_key);
+		if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
+			br = ubifs_idx_branch(c, idx, iip + 1);
+			key_read(c, &br->key, &upper_key);
+		} else
+			key_copy(c, &i->upper_key, &upper_key);
+	}
+out:
+	err = dbg_old_index_check_init(c, zroot);
+	if (err)
+		goto out_free;
+
+	return 0;
+
+out_dump:
+	ubifs_err(c, "dumping index node (iip=%d)", i->iip);
+	ubifs_dump_node(c, idx);
+	list_del(&i->list);
+	kfree(i);
+	if (!list_empty(&list)) {
+		i = list_entry(list.prev, struct idx_node, list);
+		ubifs_err(c, "dumping parent index node");
+		ubifs_dump_node(c, &i->idx);
+	}
+out_free:
+	while (!list_empty(&list)) {
+		i = list_entry(list.next, struct idx_node, list);
+		list_del(&i->list);
+		kfree(i);
+	}
+	ubifs_err(c, "failed, error %d", err);
+	if (err > 0)
+		err = -EINVAL;
+	return err;
+}
diff --git a/kernel/fs/ubifs/compress.c b/kernel/fs/ubifs/compress.c
new file mode 100644
index 000000000..565cb56d7
--- /dev/null
+++ b/kernel/fs/ubifs/compress.c
@@ -0,0 +1,250 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ *          Zoltan Sogor
+ */
+
+/*
+ * This file provides a single place to access to compression and
+ * decompression.
+ */
+
+#include <linux/crypto.h>
+#include "ubifs.h"
+
+/* Fake description object for the "none" compressor */
+static struct ubifs_compressor none_compr = {
+	.compr_type = UBIFS_COMPR_NONE,
+	.name = "none",
+	.capi_name = "",
+};
+
+#ifdef CONFIG_UBIFS_FS_LZO
+static DEFINE_MUTEX(lzo_mutex);
+
+static struct ubifs_compressor lzo_compr = {
+	.compr_type = UBIFS_COMPR_LZO,
+	.comp_mutex = &lzo_mutex,
+	.name = "lzo",
+	.capi_name = "lzo",
+};
+#else
+static struct ubifs_compressor lzo_compr = {
+	.compr_type = UBIFS_COMPR_LZO,
+	.name = "lzo",
+};
+#endif
+
+#ifdef CONFIG_UBIFS_FS_ZLIB
+static DEFINE_MUTEX(deflate_mutex);
+static DEFINE_MUTEX(inflate_mutex);
+
+static struct ubifs_compressor zlib_compr = {
+	.compr_type = UBIFS_COMPR_ZLIB,
+	.comp_mutex = &deflate_mutex,
+	.decomp_mutex = &inflate_mutex,
+	.name = "zlib",
+	.capi_name = "deflate",
+};
+#else
+static struct ubifs_compressor zlib_compr = {
+	.compr_type = UBIFS_COMPR_ZLIB,
+	.name = "zlib",
+};
+#endif
+
+/* All UBIFS compressors */
+struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+
+/**
+ * ubifs_compress - compress data.
+ * @in_buf: data to compress
+ * @in_len: length of the data to compress
+ * @out_buf: output buffer where compressed data should be stored
+ * @out_len: output buffer length is returned here
+ * @compr_type: type of compression to use on enter, actually used compression
+ *              type on exit
+ *
+ * This function compresses input buffer @in_buf of length @in_len and stores
+ * the result in the output buffer @out_buf and the resulting length in
+ * @out_len. If the input buffer does not compress, it is just copied to the
+ * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if
+ * compression error occurred.
+ *
+ * Note, if the input buffer was not compressed, it is copied to the output
+ * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
+ */
+void ubifs_compress(const struct ubifs_info *c, const void *in_buf,
+		    int in_len, void *out_buf, int *out_len, int *compr_type)
+{
+	int err;
+	struct ubifs_compressor *compr = ubifs_compressors[*compr_type];
+
+	if (*compr_type == UBIFS_COMPR_NONE)
+		goto no_compr;
+
+	/* If the input data is small, do not even try to compress it */
+	if (in_len < UBIFS_MIN_COMPR_LEN)
+		goto no_compr;
+
+	if (compr->comp_mutex)
+		mutex_lock(compr->comp_mutex);
+	err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
+				   (unsigned int *)out_len);
+	if (compr->comp_mutex)
+		mutex_unlock(compr->comp_mutex);
+	if (unlikely(err)) {
+		ubifs_warn(c, "cannot compress %d bytes, compressor %s, error %d, leave data uncompressed",
+			   in_len, compr->name, err);
+		goto no_compr;
+	}
+
+	/*
+	 * If the data compressed only slightly, it is better to leave it
+	 * uncompressed to improve read speed.
+	 */
+	if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
+		goto no_compr;
+
+	return;
+
+no_compr:
+	memcpy(out_buf, in_buf, in_len);
+	*out_len = in_len;
+	*compr_type = UBIFS_COMPR_NONE;
+}
+
+/**
+ * ubifs_decompress - decompress data.
+ * @in_buf: data to decompress
+ * @in_len: length of the data to decompress
+ * @out_buf: output buffer where decompressed data should
+ * @out_len: output length is returned here
+ * @compr_type: type of compression
+ *
+ * This function decompresses data from buffer @in_buf into buffer @out_buf.
+ * The length of the uncompressed data is returned in @out_len. This functions
+ * returns %0 on success or a negative error code on failure.
+ */
+int ubifs_decompress(const struct ubifs_info *c, const void *in_buf,
+		     int in_len, void *out_buf, int *out_len, int compr_type)
+{
+	int err;
+	struct ubifs_compressor *compr;
+
+	if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) {
+		ubifs_err(c, "invalid compression type %d", compr_type);
+		return -EINVAL;
+	}
+
+	compr = ubifs_compressors[compr_type];
+
+	if (unlikely(!compr->capi_name)) {
+		ubifs_err(c, "%s compression is not compiled in", compr->name);
+		return -EINVAL;
+	}
+
+	if (compr_type == UBIFS_COMPR_NONE) {
+		memcpy(out_buf, in_buf, in_len);
+		*out_len = in_len;
+		return 0;
+	}
+
+	if (compr->decomp_mutex)
+		mutex_lock(compr->decomp_mutex);
+	err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
+				     (unsigned int *)out_len);
+	if (compr->decomp_mutex)
+		mutex_unlock(compr->decomp_mutex);
+	if (err)
+		ubifs_err(c, "cannot decompress %d bytes, compressor %s, error %d",
+			  in_len, compr->name, err);
+
+	return err;
+}
+
+/**
+ * compr_init - initialize a compressor.
+ * @compr: compressor description object
+ *
+ * This function initializes the requested compressor and returns zero in case
+ * of success or a negative error code in case of failure.
+ */
+static int __init compr_init(struct ubifs_compressor *compr)
+{
+	if (compr->capi_name) {
+		compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0);
+		if (IS_ERR(compr->cc)) {
+			pr_err("UBIFS error (pid %d): cannot initialize compressor %s, error %ld",
+			       current->pid, compr->name, PTR_ERR(compr->cc));
+			return PTR_ERR(compr->cc);
+		}
+	}
+
+	ubifs_compressors[compr->compr_type] = compr;
+	return 0;
+}
+
+/**
+ * compr_exit - de-initialize a compressor.
+ * @compr: compressor description object
+ */
+static void compr_exit(struct ubifs_compressor *compr)
+{
+	if (compr->capi_name)
+		crypto_free_comp(compr->cc);
+	return;
+}
+
+/**
+ * ubifs_compressors_init - initialize UBIFS compressors.
+ *
+ * This function initializes the compressor which were compiled in. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+int __init ubifs_compressors_init(void)
+{
+	int err;
+
+	err = compr_init(&lzo_compr);
+	if (err)
+		return err;
+
+	err = compr_init(&zlib_compr);
+	if (err)
+		goto out_lzo;
+
+	ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr;
+	return 0;
+
+out_lzo:
+	compr_exit(&lzo_compr);
+	return err;
+}
+
+/**
+ * ubifs_compressors_exit - de-initialize UBIFS compressors.
+ */
+void ubifs_compressors_exit(void)
+{
+	compr_exit(&lzo_compr);
+	compr_exit(&zlib_compr);
+}
diff --git a/kernel/fs/ubifs/debug.c b/kernel/fs/ubifs/debug.c
new file mode 100644
index 000000000..4c46a9865
--- /dev/null
+++ b/kernel/fs/ubifs/debug.c
@@ -0,0 +1,3104 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file implements most of the debugging stuff which is compiled in only
+ * when it is enabled. But some debugging check functions are implemented in
+ * corresponding subsystem, just because they are closely related and utilize
+ * various local functions of those subsystems.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/math64.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include "ubifs.h"
+
+static DEFINE_SPINLOCK(dbg_lock);
+
+static const char *get_key_fmt(int fmt)
+{
+	switch (fmt) {
+	case UBIFS_SIMPLE_KEY_FMT:
+		return "simple";
+	default:
+		return "unknown/invalid format";
+	}
+}
+
+static const char *get_key_hash(int hash)
+{
+	switch (hash) {
+	case UBIFS_KEY_HASH_R5:
+		return "R5";
+	case UBIFS_KEY_HASH_TEST:
+		return "test";
+	default:
+		return "unknown/invalid name hash";
+	}
+}
+
+static const char *get_key_type(int type)
+{
+	switch (type) {
+	case UBIFS_INO_KEY:
+		return "inode";
+	case UBIFS_DENT_KEY:
+		return "direntry";
+	case UBIFS_XENT_KEY:
+		return "xentry";
+	case UBIFS_DATA_KEY:
+		return "data";
+	case UBIFS_TRUN_KEY:
+		return "truncate";
+	default:
+		return "unknown/invalid key";
+	}
+}
+
+static const char *get_dent_type(int type)
+{
+	switch (type) {
+	case UBIFS_ITYPE_REG:
+		return "file";
+	case UBIFS_ITYPE_DIR:
+		return "dir";
+	case UBIFS_ITYPE_LNK:
+		return "symlink";
+	case UBIFS_ITYPE_BLK:
+		return "blkdev";
+	case UBIFS_ITYPE_CHR:
+		return "char dev";
+	case UBIFS_ITYPE_FIFO:
+		return "fifo";
+	case UBIFS_ITYPE_SOCK:
+		return "socket";
+	default:
+		return "unknown/invalid type";
+	}
+}
+
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+			     const union ubifs_key *key, char *buffer, int len)
+{
+	char *p = buffer;
+	int type = key_type(c, key);
+
+	if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
+		switch (type) {
+		case UBIFS_INO_KEY:
+			len -= snprintf(p, len, "(%lu, %s)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type));
+			break;
+		case UBIFS_DENT_KEY:
+		case UBIFS_XENT_KEY:
+			len -= snprintf(p, len, "(%lu, %s, %#08x)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type), key_hash(c, key));
+			break;
+		case UBIFS_DATA_KEY:
+			len -= snprintf(p, len, "(%lu, %s, %u)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type), key_block(c, key));
+			break;
+		case UBIFS_TRUN_KEY:
+			len -= snprintf(p, len, "(%lu, %s)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type));
+			break;
+		default:
+			len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
+					key->u32[0], key->u32[1]);
+		}
+	} else
+		len -= snprintf(p, len, "bad key format %d", c->key_fmt);
+	ubifs_assert(len > 0);
+	return p;
+}
+
+const char *dbg_ntype(int type)
+{
+	switch (type) {
+	case UBIFS_PAD_NODE:
+		return "padding node";
+	case UBIFS_SB_NODE:
+		return "superblock node";
+	case UBIFS_MST_NODE:
+		return "master node";
+	case UBIFS_REF_NODE:
+		return "reference node";
+	case UBIFS_INO_NODE:
+		return "inode node";
+	case UBIFS_DENT_NODE:
+		return "direntry node";
+	case UBIFS_XENT_NODE:
+		return "xentry node";
+	case UBIFS_DATA_NODE:
+		return "data node";
+	case UBIFS_TRUN_NODE:
+		return "truncate node";
+	case UBIFS_IDX_NODE:
+		return "indexing node";
+	case UBIFS_CS_NODE:
+		return "commit start node";
+	case UBIFS_ORPH_NODE:
+		return "orphan node";
+	default:
+		return "unknown node";
+	}
+}
+
+static const char *dbg_gtype(int type)
+{
+	switch (type) {
+	case UBIFS_NO_NODE_GROUP:
+		return "no node group";
+	case UBIFS_IN_NODE_GROUP:
+		return "in node group";
+	case UBIFS_LAST_OF_NODE_GROUP:
+		return "last of node group";
+	default:
+		return "unknown";
+	}
+}
+
+const char *dbg_cstate(int cmt_state)
+{
+	switch (cmt_state) {
+	case COMMIT_RESTING:
+		return "commit resting";
+	case COMMIT_BACKGROUND:
+		return "background commit requested";
+	case COMMIT_REQUIRED:
+		return "commit required";
+	case COMMIT_RUNNING_BACKGROUND:
+		return "BACKGROUND commit running";
+	case COMMIT_RUNNING_REQUIRED:
+		return "commit running and required";
+	case COMMIT_BROKEN:
+		return "broken commit";
+	default:
+		return "unknown commit state";
+	}
+}
+
+const char *dbg_jhead(int jhead)
+{
+	switch (jhead) {
+	case GCHD:
+		return "0 (GC)";
+	case BASEHD:
+		return "1 (base)";
+	case DATAHD:
+		return "2 (data)";
+	default:
+		return "unknown journal head";
+	}
+}
+
+static void dump_ch(const struct ubifs_ch *ch)
+{
+	pr_err("\tmagic          %#x\n", le32_to_cpu(ch->magic));
+	pr_err("\tcrc            %#x\n", le32_to_cpu(ch->crc));
+	pr_err("\tnode_type      %d (%s)\n", ch->node_type,
+	       dbg_ntype(ch->node_type));
+	pr_err("\tgroup_type     %d (%s)\n", ch->group_type,
+	       dbg_gtype(ch->group_type));
+	pr_err("\tsqnum          %llu\n",
+	       (unsigned long long)le64_to_cpu(ch->sqnum));
+	pr_err("\tlen            %u\n", le32_to_cpu(ch->len));
+}
+
+void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
+{
+	const struct ubifs_inode *ui = ubifs_inode(inode);
+	struct qstr nm = { .name = NULL };
+	union ubifs_key key;
+	struct ubifs_dent_node *dent, *pdent = NULL;
+	int count = 2;
+
+	pr_err("Dump in-memory inode:");
+	pr_err("\tinode          %lu\n", inode->i_ino);
+	pr_err("\tsize           %llu\n",
+	       (unsigned long long)i_size_read(inode));
+	pr_err("\tnlink          %u\n", inode->i_nlink);
+	pr_err("\tuid            %u\n", (unsigned int)i_uid_read(inode));
+	pr_err("\tgid            %u\n", (unsigned int)i_gid_read(inode));
+	pr_err("\tatime          %u.%u\n",
+	       (unsigned int)inode->i_atime.tv_sec,
+	       (unsigned int)inode->i_atime.tv_nsec);
+	pr_err("\tmtime          %u.%u\n",
+	       (unsigned int)inode->i_mtime.tv_sec,
+	       (unsigned int)inode->i_mtime.tv_nsec);
+	pr_err("\tctime          %u.%u\n",
+	       (unsigned int)inode->i_ctime.tv_sec,
+	       (unsigned int)inode->i_ctime.tv_nsec);
+	pr_err("\tcreat_sqnum    %llu\n", ui->creat_sqnum);
+	pr_err("\txattr_size     %u\n", ui->xattr_size);
+	pr_err("\txattr_cnt      %u\n", ui->xattr_cnt);
+	pr_err("\txattr_names    %u\n", ui->xattr_names);
+	pr_err("\tdirty          %u\n", ui->dirty);
+	pr_err("\txattr          %u\n", ui->xattr);
+	pr_err("\tbulk_read      %u\n", ui->xattr);
+	pr_err("\tsynced_i_size  %llu\n",
+	       (unsigned long long)ui->synced_i_size);
+	pr_err("\tui_size        %llu\n",
+	       (unsigned long long)ui->ui_size);
+	pr_err("\tflags          %d\n", ui->flags);
+	pr_err("\tcompr_type     %d\n", ui->compr_type);
+	pr_err("\tlast_page_read %lu\n", ui->last_page_read);
+	pr_err("\tread_in_a_row  %lu\n", ui->read_in_a_row);
+	pr_err("\tdata_len       %d\n", ui->data_len);
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	pr_err("List of directory entries:\n");
+	ubifs_assert(!mutex_is_locked(&c->tnc_mutex));
+
+	lowest_dent_key(c, &key, inode->i_ino);
+	while (1) {
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			if (PTR_ERR(dent) != -ENOENT)
+				pr_err("error %ld\n", PTR_ERR(dent));
+			break;
+		}
+
+		pr_err("\t%d: %s (%s)\n",
+		       count++, dent->name, get_dent_type(dent->type));
+
+		nm.name = dent->name;
+		nm.len = le16_to_cpu(dent->nlen);
+		kfree(pdent);
+		pdent = dent;
+		key_read(c, &dent->key, &key);
+	}
+	kfree(pdent);
+}
+
+void ubifs_dump_node(const struct ubifs_info *c, const void *node)
+{
+	int i, n;
+	union ubifs_key key;
+	const struct ubifs_ch *ch = node;
+	char key_buf[DBG_KEY_BUF_LEN];
+
+	/* If the magic is incorrect, just hexdump the first bytes */
+	if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
+		pr_err("Not a node, first %zu bytes:", UBIFS_CH_SZ);
+		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1,
+			       (void *)node, UBIFS_CH_SZ, 1);
+		return;
+	}
+
+	spin_lock(&dbg_lock);
+	dump_ch(node);
+
+	switch (ch->node_type) {
+	case UBIFS_PAD_NODE:
+	{
+		const struct ubifs_pad_node *pad = node;
+
+		pr_err("\tpad_len        %u\n", le32_to_cpu(pad->pad_len));
+		break;
+	}
+	case UBIFS_SB_NODE:
+	{
+		const struct ubifs_sb_node *sup = node;
+		unsigned int sup_flags = le32_to_cpu(sup->flags);
+
+		pr_err("\tkey_hash       %d (%s)\n",
+		       (int)sup->key_hash, get_key_hash(sup->key_hash));
+		pr_err("\tkey_fmt        %d (%s)\n",
+		       (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
+		pr_err("\tflags          %#x\n", sup_flags);
+		pr_err("\tbig_lpt        %u\n",
+		       !!(sup_flags & UBIFS_FLG_BIGLPT));
+		pr_err("\tspace_fixup    %u\n",
+		       !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
+		pr_err("\tmin_io_size    %u\n", le32_to_cpu(sup->min_io_size));
+		pr_err("\tleb_size       %u\n", le32_to_cpu(sup->leb_size));
+		pr_err("\tleb_cnt        %u\n", le32_to_cpu(sup->leb_cnt));
+		pr_err("\tmax_leb_cnt    %u\n", le32_to_cpu(sup->max_leb_cnt));
+		pr_err("\tmax_bud_bytes  %llu\n",
+		       (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
+		pr_err("\tlog_lebs       %u\n", le32_to_cpu(sup->log_lebs));
+		pr_err("\tlpt_lebs       %u\n", le32_to_cpu(sup->lpt_lebs));
+		pr_err("\torph_lebs      %u\n", le32_to_cpu(sup->orph_lebs));
+		pr_err("\tjhead_cnt      %u\n", le32_to_cpu(sup->jhead_cnt));
+		pr_err("\tfanout         %u\n", le32_to_cpu(sup->fanout));
+		pr_err("\tlsave_cnt      %u\n", le32_to_cpu(sup->lsave_cnt));
+		pr_err("\tdefault_compr  %u\n",
+		       (int)le16_to_cpu(sup->default_compr));
+		pr_err("\trp_size        %llu\n",
+		       (unsigned long long)le64_to_cpu(sup->rp_size));
+		pr_err("\trp_uid         %u\n", le32_to_cpu(sup->rp_uid));
+		pr_err("\trp_gid         %u\n", le32_to_cpu(sup->rp_gid));
+		pr_err("\tfmt_version    %u\n", le32_to_cpu(sup->fmt_version));
+		pr_err("\ttime_gran      %u\n", le32_to_cpu(sup->time_gran));
+		pr_err("\tUUID           %pUB\n", sup->uuid);
+		break;
+	}
+	case UBIFS_MST_NODE:
+	{
+		const struct ubifs_mst_node *mst = node;
+
+		pr_err("\thighest_inum   %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->highest_inum));
+		pr_err("\tcommit number  %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->cmt_no));
+		pr_err("\tflags          %#x\n", le32_to_cpu(mst->flags));
+		pr_err("\tlog_lnum       %u\n", le32_to_cpu(mst->log_lnum));
+		pr_err("\troot_lnum      %u\n", le32_to_cpu(mst->root_lnum));
+		pr_err("\troot_offs      %u\n", le32_to_cpu(mst->root_offs));
+		pr_err("\troot_len       %u\n", le32_to_cpu(mst->root_len));
+		pr_err("\tgc_lnum        %u\n", le32_to_cpu(mst->gc_lnum));
+		pr_err("\tihead_lnum     %u\n", le32_to_cpu(mst->ihead_lnum));
+		pr_err("\tihead_offs     %u\n", le32_to_cpu(mst->ihead_offs));
+		pr_err("\tindex_size     %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->index_size));
+		pr_err("\tlpt_lnum       %u\n", le32_to_cpu(mst->lpt_lnum));
+		pr_err("\tlpt_offs       %u\n", le32_to_cpu(mst->lpt_offs));
+		pr_err("\tnhead_lnum     %u\n", le32_to_cpu(mst->nhead_lnum));
+		pr_err("\tnhead_offs     %u\n", le32_to_cpu(mst->nhead_offs));
+		pr_err("\tltab_lnum      %u\n", le32_to_cpu(mst->ltab_lnum));
+		pr_err("\tltab_offs      %u\n", le32_to_cpu(mst->ltab_offs));
+		pr_err("\tlsave_lnum     %u\n", le32_to_cpu(mst->lsave_lnum));
+		pr_err("\tlsave_offs     %u\n", le32_to_cpu(mst->lsave_offs));
+		pr_err("\tlscan_lnum     %u\n", le32_to_cpu(mst->lscan_lnum));
+		pr_err("\tleb_cnt        %u\n", le32_to_cpu(mst->leb_cnt));
+		pr_err("\tempty_lebs     %u\n", le32_to_cpu(mst->empty_lebs));
+		pr_err("\tidx_lebs       %u\n", le32_to_cpu(mst->idx_lebs));
+		pr_err("\ttotal_free     %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->total_free));
+		pr_err("\ttotal_dirty    %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->total_dirty));
+		pr_err("\ttotal_used     %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->total_used));
+		pr_err("\ttotal_dead     %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->total_dead));
+		pr_err("\ttotal_dark     %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->total_dark));
+		break;
+	}
+	case UBIFS_REF_NODE:
+	{
+		const struct ubifs_ref_node *ref = node;
+
+		pr_err("\tlnum           %u\n", le32_to_cpu(ref->lnum));
+		pr_err("\toffs           %u\n", le32_to_cpu(ref->offs));
+		pr_err("\tjhead          %u\n", le32_to_cpu(ref->jhead));
+		break;
+	}
+	case UBIFS_INO_NODE:
+	{
+		const struct ubifs_ino_node *ino = node;
+
+		key_read(c, &ino->key, &key);
+		pr_err("\tkey            %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+		pr_err("\tcreat_sqnum    %llu\n",
+		       (unsigned long long)le64_to_cpu(ino->creat_sqnum));
+		pr_err("\tsize           %llu\n",
+		       (unsigned long long)le64_to_cpu(ino->size));
+		pr_err("\tnlink          %u\n", le32_to_cpu(ino->nlink));
+		pr_err("\tatime          %lld.%u\n",
+		       (long long)le64_to_cpu(ino->atime_sec),
+		       le32_to_cpu(ino->atime_nsec));
+		pr_err("\tmtime          %lld.%u\n",
+		       (long long)le64_to_cpu(ino->mtime_sec),
+		       le32_to_cpu(ino->mtime_nsec));
+		pr_err("\tctime          %lld.%u\n",
+		       (long long)le64_to_cpu(ino->ctime_sec),
+		       le32_to_cpu(ino->ctime_nsec));
+		pr_err("\tuid            %u\n", le32_to_cpu(ino->uid));
+		pr_err("\tgid            %u\n", le32_to_cpu(ino->gid));
+		pr_err("\tmode           %u\n", le32_to_cpu(ino->mode));
+		pr_err("\tflags          %#x\n", le32_to_cpu(ino->flags));
+		pr_err("\txattr_cnt      %u\n", le32_to_cpu(ino->xattr_cnt));
+		pr_err("\txattr_size     %u\n", le32_to_cpu(ino->xattr_size));
+		pr_err("\txattr_names    %u\n", le32_to_cpu(ino->xattr_names));
+		pr_err("\tcompr_type     %#x\n",
+		       (int)le16_to_cpu(ino->compr_type));
+		pr_err("\tdata len       %u\n", le32_to_cpu(ino->data_len));
+		break;
+	}
+	case UBIFS_DENT_NODE:
+	case UBIFS_XENT_NODE:
+	{
+		const struct ubifs_dent_node *dent = node;
+		int nlen = le16_to_cpu(dent->nlen);
+
+		key_read(c, &dent->key, &key);
+		pr_err("\tkey            %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+		pr_err("\tinum           %llu\n",
+		       (unsigned long long)le64_to_cpu(dent->inum));
+		pr_err("\ttype           %d\n", (int)dent->type);
+		pr_err("\tnlen           %d\n", nlen);
+		pr_err("\tname           ");
+
+		if (nlen > UBIFS_MAX_NLEN)
+			pr_err("(bad name length, not printing, bad or corrupted node)");
+		else {
+			for (i = 0; i < nlen && dent->name[i]; i++)
+				pr_cont("%c", dent->name[i]);
+		}
+		pr_cont("\n");
+
+		break;
+	}
+	case UBIFS_DATA_NODE:
+	{
+		const struct ubifs_data_node *dn = node;
+		int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
+
+		key_read(c, &dn->key, &key);
+		pr_err("\tkey            %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+		pr_err("\tsize           %u\n", le32_to_cpu(dn->size));
+		pr_err("\tcompr_typ      %d\n",
+		       (int)le16_to_cpu(dn->compr_type));
+		pr_err("\tdata size      %d\n", dlen);
+		pr_err("\tdata:\n");
+		print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
+			       (void *)&dn->data, dlen, 0);
+		break;
+	}
+	case UBIFS_TRUN_NODE:
+	{
+		const struct ubifs_trun_node *trun = node;
+
+		pr_err("\tinum           %u\n", le32_to_cpu(trun->inum));
+		pr_err("\told_size       %llu\n",
+		       (unsigned long long)le64_to_cpu(trun->old_size));
+		pr_err("\tnew_size       %llu\n",
+		       (unsigned long long)le64_to_cpu(trun->new_size));
+		break;
+	}
+	case UBIFS_IDX_NODE:
+	{
+		const struct ubifs_idx_node *idx = node;
+
+		n = le16_to_cpu(idx->child_cnt);
+		pr_err("\tchild_cnt      %d\n", n);
+		pr_err("\tlevel          %d\n", (int)le16_to_cpu(idx->level));
+		pr_err("\tBranches:\n");
+
+		for (i = 0; i < n && i < c->fanout - 1; i++) {
+			const struct ubifs_branch *br;
+
+			br = ubifs_idx_branch(c, idx, i);
+			key_read(c, &br->key, &key);
+			pr_err("\t%d: LEB %d:%d len %d key %s\n",
+			       i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
+			       le32_to_cpu(br->len),
+			       dbg_snprintf_key(c, &key, key_buf,
+						DBG_KEY_BUF_LEN));
+		}
+		break;
+	}
+	case UBIFS_CS_NODE:
+		break;
+	case UBIFS_ORPH_NODE:
+	{
+		const struct ubifs_orph_node *orph = node;
+
+		pr_err("\tcommit number  %llu\n",
+		       (unsigned long long)
+				le64_to_cpu(orph->cmt_no) & LLONG_MAX);
+		pr_err("\tlast node flag %llu\n",
+		       (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
+		n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
+		pr_err("\t%d orphan inode numbers:\n", n);
+		for (i = 0; i < n; i++)
+			pr_err("\t  ino %llu\n",
+			       (unsigned long long)le64_to_cpu(orph->inos[i]));
+		break;
+	}
+	default:
+		pr_err("node type %d was not recognized\n",
+		       (int)ch->node_type);
+	}
+	spin_unlock(&dbg_lock);
+}
+
+void ubifs_dump_budget_req(const struct ubifs_budget_req *req)
+{
+	spin_lock(&dbg_lock);
+	pr_err("Budgeting request: new_ino %d, dirtied_ino %d\n",
+	       req->new_ino, req->dirtied_ino);
+	pr_err("\tnew_ino_d   %d, dirtied_ino_d %d\n",
+	       req->new_ino_d, req->dirtied_ino_d);
+	pr_err("\tnew_page    %d, dirtied_page %d\n",
+	       req->new_page, req->dirtied_page);
+	pr_err("\tnew_dent    %d, mod_dent     %d\n",
+	       req->new_dent, req->mod_dent);
+	pr_err("\tidx_growth  %d\n", req->idx_growth);
+	pr_err("\tdata_growth %d dd_growth     %d\n",
+	       req->data_growth, req->dd_growth);
+	spin_unlock(&dbg_lock);
+}
+
+void ubifs_dump_lstats(const struct ubifs_lp_stats *lst)
+{
+	spin_lock(&dbg_lock);
+	pr_err("(pid %d) Lprops statistics: empty_lebs %d, idx_lebs  %d\n",
+	       current->pid, lst->empty_lebs, lst->idx_lebs);
+	pr_err("\ttaken_empty_lebs %d, total_free %lld, total_dirty %lld\n",
+	       lst->taken_empty_lebs, lst->total_free, lst->total_dirty);
+	pr_err("\ttotal_used %lld, total_dark %lld, total_dead %lld\n",
+	       lst->total_used, lst->total_dark, lst->total_dead);
+	spin_unlock(&dbg_lock);
+}
+
+void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
+{
+	int i;
+	struct rb_node *rb;
+	struct ubifs_bud *bud;
+	struct ubifs_gced_idx_leb *idx_gc;
+	long long available, outstanding, free;
+
+	spin_lock(&c->space_lock);
+	spin_lock(&dbg_lock);
+	pr_err("(pid %d) Budgeting info: data budget sum %lld, total budget sum %lld\n",
+	       current->pid, bi->data_growth + bi->dd_growth,
+	       bi->data_growth + bi->dd_growth + bi->idx_growth);
+	pr_err("\tbudg_data_growth %lld, budg_dd_growth %lld, budg_idx_growth %lld\n",
+	       bi->data_growth, bi->dd_growth, bi->idx_growth);
+	pr_err("\tmin_idx_lebs %d, old_idx_sz %llu, uncommitted_idx %lld\n",
+	       bi->min_idx_lebs, bi->old_idx_sz, bi->uncommitted_idx);
+	pr_err("\tpage_budget %d, inode_budget %d, dent_budget %d\n",
+	       bi->page_budget, bi->inode_budget, bi->dent_budget);
+	pr_err("\tnospace %u, nospace_rp %u\n", bi->nospace, bi->nospace_rp);
+	pr_err("\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
+
+	if (bi != &c->bi)
+		/*
+		 * If we are dumping saved budgeting data, do not print
+		 * additional information which is about the current state, not
+		 * the old one which corresponded to the saved budgeting data.
+		 */
+		goto out_unlock;
+
+	pr_err("\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
+	       c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
+	pr_err("\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, clean_zn_cnt %ld\n",
+	       atomic_long_read(&c->dirty_pg_cnt),
+	       atomic_long_read(&c->dirty_zn_cnt),
+	       atomic_long_read(&c->clean_zn_cnt));
+	pr_err("\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum);
+
+	/* If we are in R/O mode, journal heads do not exist */
+	if (c->jheads)
+		for (i = 0; i < c->jhead_cnt; i++)
+			pr_err("\tjhead %s\t LEB %d\n",
+			       dbg_jhead(c->jheads[i].wbuf.jhead),
+			       c->jheads[i].wbuf.lnum);
+	for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
+		bud = rb_entry(rb, struct ubifs_bud, rb);
+		pr_err("\tbud LEB %d\n", bud->lnum);
+	}
+	list_for_each_entry(bud, &c->old_buds, list)
+		pr_err("\told bud LEB %d\n", bud->lnum);
+	list_for_each_entry(idx_gc, &c->idx_gc, list)
+		pr_err("\tGC'ed idx LEB %d unmap %d\n",
+		       idx_gc->lnum, idx_gc->unmap);
+	pr_err("\tcommit state %d\n", c->cmt_state);
+
+	/* Print budgeting predictions */
+	available = ubifs_calc_available(c, c->bi.min_idx_lebs);
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
+	free = ubifs_get_free_space_nolock(c);
+	pr_err("Budgeting predictions:\n");
+	pr_err("\tavailable: %lld, outstanding %lld, free %lld\n",
+	       available, outstanding, free);
+out_unlock:
+	spin_unlock(&dbg_lock);
+	spin_unlock(&c->space_lock);
+}
+
+void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
+{
+	int i, spc, dark = 0, dead = 0;
+	struct rb_node *rb;
+	struct ubifs_bud *bud;
+
+	spc = lp->free + lp->dirty;
+	if (spc < c->dead_wm)
+		dead = spc;
+	else
+		dark = ubifs_calc_dark(c, spc);
+
+	if (lp->flags & LPROPS_INDEX)
+		pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d flags %#x (",
+		       lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc,
+		       lp->flags);
+	else
+		pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d flags %#-4x (",
+		       lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc,
+		       dark, dead, (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
+
+	if (lp->flags & LPROPS_TAKEN) {
+		if (lp->flags & LPROPS_INDEX)
+			pr_cont("index, taken");
+		else
+			pr_cont("taken");
+	} else {
+		const char *s;
+
+		if (lp->flags & LPROPS_INDEX) {
+			switch (lp->flags & LPROPS_CAT_MASK) {
+			case LPROPS_DIRTY_IDX:
+				s = "dirty index";
+				break;
+			case LPROPS_FRDI_IDX:
+				s = "freeable index";
+				break;
+			default:
+				s = "index";
+			}
+		} else {
+			switch (lp->flags & LPROPS_CAT_MASK) {
+			case LPROPS_UNCAT:
+				s = "not categorized";
+				break;
+			case LPROPS_DIRTY:
+				s = "dirty";
+				break;
+			case LPROPS_FREE:
+				s = "free";
+				break;
+			case LPROPS_EMPTY:
+				s = "empty";
+				break;
+			case LPROPS_FREEABLE:
+				s = "freeable";
+				break;
+			default:
+				s = NULL;
+				break;
+			}
+		}
+		pr_cont("%s", s);
+	}
+
+	for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
+		bud = rb_entry(rb, struct ubifs_bud, rb);
+		if (bud->lnum == lp->lnum) {
+			int head = 0;
+			for (i = 0; i < c->jhead_cnt; i++) {
+				/*
+				 * Note, if we are in R/O mode or in the middle
+				 * of mounting/re-mounting, the write-buffers do
+				 * not exist.
+				 */
+				if (c->jheads &&
+				    lp->lnum == c->jheads[i].wbuf.lnum) {
+					pr_cont(", jhead %s", dbg_jhead(i));
+					head = 1;
+				}
+			}
+			if (!head)
+				pr_cont(", bud of jhead %s",
+				       dbg_jhead(bud->jhead));
+		}
+	}
+	if (lp->lnum == c->gc_lnum)
+		pr_cont(", GC LEB");
+	pr_cont(")\n");
+}
+
+void ubifs_dump_lprops(struct ubifs_info *c)
+{
+	int lnum, err;
+	struct ubifs_lprops lp;
+	struct ubifs_lp_stats lst;
+
+	pr_err("(pid %d) start dumping LEB properties\n", current->pid);
+	ubifs_get_lp_stats(c, &lst);
+	ubifs_dump_lstats(&lst);
+
+	for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
+		err = ubifs_read_one_lp(c, lnum, &lp);
+		if (err) {
+			ubifs_err(c, "cannot read lprops for LEB %d", lnum);
+			continue;
+		}
+
+		ubifs_dump_lprop(c, &lp);
+	}
+	pr_err("(pid %d) finish dumping LEB properties\n", current->pid);
+}
+
+void ubifs_dump_lpt_info(struct ubifs_info *c)
+{
+	int i;
+
+	spin_lock(&dbg_lock);
+	pr_err("(pid %d) dumping LPT information\n", current->pid);
+	pr_err("\tlpt_sz:        %lld\n", c->lpt_sz);
+	pr_err("\tpnode_sz:      %d\n", c->pnode_sz);
+	pr_err("\tnnode_sz:      %d\n", c->nnode_sz);
+	pr_err("\tltab_sz:       %d\n", c->ltab_sz);
+	pr_err("\tlsave_sz:      %d\n", c->lsave_sz);
+	pr_err("\tbig_lpt:       %d\n", c->big_lpt);
+	pr_err("\tlpt_hght:      %d\n", c->lpt_hght);
+	pr_err("\tpnode_cnt:     %d\n", c->pnode_cnt);
+	pr_err("\tnnode_cnt:     %d\n", c->nnode_cnt);
+	pr_err("\tdirty_pn_cnt:  %d\n", c->dirty_pn_cnt);
+	pr_err("\tdirty_nn_cnt:  %d\n", c->dirty_nn_cnt);
+	pr_err("\tlsave_cnt:     %d\n", c->lsave_cnt);
+	pr_err("\tspace_bits:    %d\n", c->space_bits);
+	pr_err("\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
+	pr_err("\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
+	pr_err("\tlpt_spc_bits:  %d\n", c->lpt_spc_bits);
+	pr_err("\tpcnt_bits:     %d\n", c->pcnt_bits);
+	pr_err("\tlnum_bits:     %d\n", c->lnum_bits);
+	pr_err("\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
+	pr_err("\tLPT head is at %d:%d\n",
+	       c->nhead_lnum, c->nhead_offs);
+	pr_err("\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		pr_err("\tLPT lsave is at %d:%d\n",
+		       c->lsave_lnum, c->lsave_offs);
+	for (i = 0; i < c->lpt_lebs; i++)
+		pr_err("\tLPT LEB %d free %d dirty %d tgc %d cmt %d\n",
+		       i + c->lpt_first, c->ltab[i].free, c->ltab[i].dirty,
+		       c->ltab[i].tgc, c->ltab[i].cmt);
+	spin_unlock(&dbg_lock);
+}
+
+void ubifs_dump_sleb(const struct ubifs_info *c,
+		     const struct ubifs_scan_leb *sleb, int offs)
+{
+	struct ubifs_scan_node *snod;
+
+	pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n",
+	       current->pid, sleb->lnum, offs);
+
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		cond_resched();
+		pr_err("Dumping node at LEB %d:%d len %d\n",
+		       sleb->lnum, snod->offs, snod->len);
+		ubifs_dump_node(c, snod->node);
+	}
+}
+
+void ubifs_dump_leb(const struct ubifs_info *c, int lnum)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	void *buf;
+
+	pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
+
+	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum);
+		return;
+	}
+
+	sleb = ubifs_scan(c, lnum, 0, buf, 0);
+	if (IS_ERR(sleb)) {
+		ubifs_err(c, "scan error %d", (int)PTR_ERR(sleb));
+		goto out;
+	}
+
+	pr_err("LEB %d has %d nodes ending at %d\n", lnum,
+	       sleb->nodes_cnt, sleb->endpt);
+
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		cond_resched();
+		pr_err("Dumping node at LEB %d:%d len %d\n", lnum,
+		       snod->offs, snod->len);
+		ubifs_dump_node(c, snod->node);
+	}
+
+	pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum);
+	ubifs_scan_destroy(sleb);
+
+out:
+	vfree(buf);
+	return;
+}
+
+void ubifs_dump_znode(const struct ubifs_info *c,
+		      const struct ubifs_znode *znode)
+{
+	int n;
+	const struct ubifs_zbranch *zbr;
+	char key_buf[DBG_KEY_BUF_LEN];
+
+	spin_lock(&dbg_lock);
+	if (znode->parent)
+		zbr = &znode->parent->zbranch[znode->iip];
+	else
+		zbr = &c->zroot;
+
+	pr_err("znode %p, LEB %d:%d len %d parent %p iip %d level %d child_cnt %d flags %lx\n",
+	       znode, zbr->lnum, zbr->offs, zbr->len, znode->parent, znode->iip,
+	       znode->level, znode->child_cnt, znode->flags);
+
+	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
+		spin_unlock(&dbg_lock);
+		return;
+	}
+
+	pr_err("zbranches:\n");
+	for (n = 0; n < znode->child_cnt; n++) {
+		zbr = &znode->zbranch[n];
+		if (znode->level > 0)
+			pr_err("\t%d: znode %p LEB %d:%d len %d key %s\n",
+			       n, zbr->znode, zbr->lnum, zbr->offs, zbr->len,
+			       dbg_snprintf_key(c, &zbr->key, key_buf,
+						DBG_KEY_BUF_LEN));
+		else
+			pr_err("\t%d: LNC %p LEB %d:%d len %d key %s\n",
+			       n, zbr->znode, zbr->lnum, zbr->offs, zbr->len,
+			       dbg_snprintf_key(c, &zbr->key, key_buf,
+						DBG_KEY_BUF_LEN));
+	}
+	spin_unlock(&dbg_lock);
+}
+
+void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
+{
+	int i;
+
+	pr_err("(pid %d) start dumping heap cat %d (%d elements)\n",
+	       current->pid, cat, heap->cnt);
+	for (i = 0; i < heap->cnt; i++) {
+		struct ubifs_lprops *lprops = heap->arr[i];
+
+		pr_err("\t%d. LEB %d hpos %d free %d dirty %d flags %d\n",
+		       i, lprops->lnum, lprops->hpos, lprops->free,
+		       lprops->dirty, lprops->flags);
+	}
+	pr_err("(pid %d) finish dumping heap\n", current->pid);
+}
+
+void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+		      struct ubifs_nnode *parent, int iip)
+{
+	int i;
+
+	pr_err("(pid %d) dumping pnode:\n", current->pid);
+	pr_err("\taddress %zx parent %zx cnext %zx\n",
+	       (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
+	pr_err("\tflags %lu iip %d level %d num %d\n",
+	       pnode->flags, iip, pnode->level, pnode->num);
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		struct ubifs_lprops *lp = &pnode->lprops[i];
+
+		pr_err("\t%d: free %d dirty %d flags %d lnum %d\n",
+		       i, lp->free, lp->dirty, lp->flags, lp->lnum);
+	}
+}
+
+void ubifs_dump_tnc(struct ubifs_info *c)
+{
+	struct ubifs_znode *znode;
+	int level;
+
+	pr_err("\n");
+	pr_err("(pid %d) start dumping TNC tree\n", current->pid);
+	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
+	level = znode->level;
+	pr_err("== Level %d ==\n", level);
+	while (znode) {
+		if (level != znode->level) {
+			level = znode->level;
+			pr_err("== Level %d ==\n", level);
+		}
+		ubifs_dump_znode(c, znode);
+		znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
+	}
+	pr_err("(pid %d) finish dumping TNC tree\n", current->pid);
+}
+
+static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
+		      void *priv)
+{
+	ubifs_dump_znode(c, znode);
+	return 0;
+}
+
+/**
+ * ubifs_dump_index - dump the on-flash index.
+ * @c: UBIFS file-system description object
+ *
+ * This function dumps whole UBIFS indexing B-tree, unlike 'ubifs_dump_tnc()'
+ * which dumps only in-memory znodes and does not read znodes which from flash.
+ */
+void ubifs_dump_index(struct ubifs_info *c)
+{
+	dbg_walk_index(c, NULL, dump_znode, NULL);
+}
+
+/**
+ * dbg_save_space_info - save information about flash space.
+ * @c: UBIFS file-system description object
+ *
+ * This function saves information about UBIFS free space, dirty space, etc, in
+ * order to check it later.
+ */
+void dbg_save_space_info(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+	int freeable_cnt;
+
+	spin_lock(&c->space_lock);
+	memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
+	memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info));
+	d->saved_idx_gc_cnt = c->idx_gc_cnt;
+
+	/*
+	 * We use a dirty hack here and zero out @c->freeable_cnt, because it
+	 * affects the free space calculations, and UBIFS might not know about
+	 * all freeable eraseblocks. Indeed, we know about freeable eraseblocks
+	 * only when we read their lprops, and we do this only lazily, upon the
+	 * need. So at any given point of time @c->freeable_cnt might be not
+	 * exactly accurate.
+	 *
+	 * Just one example about the issue we hit when we did not zero
+	 * @c->freeable_cnt.
+	 * 1. The file-system is mounted R/O, c->freeable_cnt is %0. We save the
+	 *    amount of free space in @d->saved_free
+	 * 2. We re-mount R/W, which makes UBIFS to read the "lsave"
+	 *    information from flash, where we cache LEBs from various
+	 *    categories ('ubifs_remount_fs()' -> 'ubifs_lpt_init()'
+	 *    -> 'lpt_init_wr()' -> 'read_lsave()' -> 'ubifs_lpt_lookup()'
+	 *    -> 'ubifs_get_pnode()' -> 'update_cats()'
+	 *    -> 'ubifs_add_to_cat()').
+	 * 3. Lsave contains a freeable eraseblock, and @c->freeable_cnt
+	 *    becomes %1.
+	 * 4. We calculate the amount of free space when the re-mount is
+	 *    finished in 'dbg_check_space_info()' and it does not match
+	 *    @d->saved_free.
+	 */
+	freeable_cnt = c->freeable_cnt;
+	c->freeable_cnt = 0;
+	d->saved_free = ubifs_get_free_space_nolock(c);
+	c->freeable_cnt = freeable_cnt;
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * dbg_check_space_info - check flash space information.
+ * @c: UBIFS file-system description object
+ *
+ * This function compares current flash space information with the information
+ * which was saved when the 'dbg_save_space_info()' function was called.
+ * Returns zero if the information has not changed, and %-EINVAL it it has
+ * changed.
+ */
+int dbg_check_space_info(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+	struct ubifs_lp_stats lst;
+	long long free;
+	int freeable_cnt;
+
+	spin_lock(&c->space_lock);
+	freeable_cnt = c->freeable_cnt;
+	c->freeable_cnt = 0;
+	free = ubifs_get_free_space_nolock(c);
+	c->freeable_cnt = freeable_cnt;
+	spin_unlock(&c->space_lock);
+
+	if (free != d->saved_free) {
+		ubifs_err(c, "free space changed from %lld to %lld",
+			  d->saved_free, free);
+		goto out;
+	}
+
+	return 0;
+
+out:
+	ubifs_msg(c, "saved lprops statistics dump");
+	ubifs_dump_lstats(&d->saved_lst);
+	ubifs_msg(c, "saved budgeting info dump");
+	ubifs_dump_budg(c, &d->saved_bi);
+	ubifs_msg(c, "saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
+	ubifs_msg(c, "current lprops statistics dump");
+	ubifs_get_lp_stats(c, &lst);
+	ubifs_dump_lstats(&lst);
+	ubifs_msg(c, "current budgeting info dump");
+	ubifs_dump_budg(c, &c->bi);
+	dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_synced_i_size - check synchronized inode size.
+ * @c: UBIFS file-system description object
+ * @inode: inode to check
+ *
+ * If inode is clean, synchronized inode size has to be equivalent to current
+ * inode size. This function has to be called only for locked inodes (@i_mutex
+ * has to be locked). Returns %0 if synchronized inode size if correct, and
+ * %-EINVAL if not.
+ */
+int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode)
+{
+	int err = 0;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	if (!dbg_is_chk_gen(c))
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+
+	mutex_lock(&ui->ui_mutex);
+	spin_lock(&ui->ui_lock);
+	if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
+		ubifs_err(c, "ui_size is %lld, synced_i_size is %lld, but inode is clean",
+			  ui->ui_size, ui->synced_i_size);
+		ubifs_err(c, "i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
+			  inode->i_mode, i_size_read(inode));
+		dump_stack();
+		err = -EINVAL;
+	}
+	spin_unlock(&ui->ui_lock);
+	mutex_unlock(&ui->ui_mutex);
+	return err;
+}
+
+/*
+ * dbg_check_dir - check directory inode size and link count.
+ * @c: UBIFS file-system description object
+ * @dir: the directory to calculate size for
+ * @size: the result is returned here
+ *
+ * This function makes sure that directory size and link count are correct.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Note, it is good idea to make sure the @dir->i_mutex is locked before
+ * calling this function.
+ */
+int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
+{
+	unsigned int nlink = 2;
+	union ubifs_key key;
+	struct ubifs_dent_node *dent, *pdent = NULL;
+	struct qstr nm = { .name = NULL };
+	loff_t size = UBIFS_INO_NODE_SZ;
+
+	if (!dbg_is_chk_gen(c))
+		return 0;
+
+	if (!S_ISDIR(dir->i_mode))
+		return 0;
+
+	lowest_dent_key(c, &key, dir->i_ino);
+	while (1) {
+		int err;
+
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			if (err == -ENOENT)
+				break;
+			return err;
+		}
+
+		nm.name = dent->name;
+		nm.len = le16_to_cpu(dent->nlen);
+		size += CALC_DENT_SIZE(nm.len);
+		if (dent->type == UBIFS_ITYPE_DIR)
+			nlink += 1;
+		kfree(pdent);
+		pdent = dent;
+		key_read(c, &dent->key, &key);
+	}
+	kfree(pdent);
+
+	if (i_size_read(dir) != size) {
+		ubifs_err(c, "directory inode %lu has size %llu, but calculated size is %llu",
+			  dir->i_ino, (unsigned long long)i_size_read(dir),
+			  (unsigned long long)size);
+		ubifs_dump_inode(c, dir);
+		dump_stack();
+		return -EINVAL;
+	}
+	if (dir->i_nlink != nlink) {
+		ubifs_err(c, "directory inode %lu has nlink %u, but calculated nlink is %u",
+			  dir->i_ino, dir->i_nlink, nlink);
+		ubifs_dump_inode(c, dir);
+		dump_stack();
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * dbg_check_key_order - make sure that colliding keys are properly ordered.
+ * @c: UBIFS file-system description object
+ * @zbr1: first zbranch
+ * @zbr2: following zbranch
+ *
+ * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of
+ * names of the direntries/xentries which are referred by the keys. This
+ * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes
+ * sure the name of direntry/xentry referred by @zbr1 is less than
+ * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not,
+ * and a negative error code in case of failure.
+ */
+static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
+			       struct ubifs_zbranch *zbr2)
+{
+	int err, nlen1, nlen2, cmp;
+	struct ubifs_dent_node *dent1, *dent2;
+	union ubifs_key key;
+	char key_buf[DBG_KEY_BUF_LEN];
+
+	ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
+	dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent1)
+		return -ENOMEM;
+	dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent2) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	err = ubifs_tnc_read_node(c, zbr1, dent1);
+	if (err)
+		goto out_free;
+	err = ubifs_validate_entry(c, dent1);
+	if (err)
+		goto out_free;
+
+	err = ubifs_tnc_read_node(c, zbr2, dent2);
+	if (err)
+		goto out_free;
+	err = ubifs_validate_entry(c, dent2);
+	if (err)
+		goto out_free;
+
+	/* Make sure node keys are the same as in zbranch */
+	err = 1;
+	key_read(c, &dent1->key, &key);
+	if (keys_cmp(c, &zbr1->key, &key)) {
+		ubifs_err(c, "1st entry at %d:%d has key %s", zbr1->lnum,
+			  zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+						       DBG_KEY_BUF_LEN));
+		ubifs_err(c, "but it should have key %s according to tnc",
+			  dbg_snprintf_key(c, &zbr1->key, key_buf,
+					   DBG_KEY_BUF_LEN));
+		ubifs_dump_node(c, dent1);
+		goto out_free;
+	}
+
+	key_read(c, &dent2->key, &key);
+	if (keys_cmp(c, &zbr2->key, &key)) {
+		ubifs_err(c, "2nd entry at %d:%d has key %s", zbr1->lnum,
+			  zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+						       DBG_KEY_BUF_LEN));
+		ubifs_err(c, "but it should have key %s according to tnc",
+			  dbg_snprintf_key(c, &zbr2->key, key_buf,
+					   DBG_KEY_BUF_LEN));
+		ubifs_dump_node(c, dent2);
+		goto out_free;
+	}
+
+	nlen1 = le16_to_cpu(dent1->nlen);
+	nlen2 = le16_to_cpu(dent2->nlen);
+
+	cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
+	if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
+		err = 0;
+		goto out_free;
+	}
+	if (cmp == 0 && nlen1 == nlen2)
+		ubifs_err(c, "2 xent/dent nodes with the same name");
+	else
+		ubifs_err(c, "bad order of colliding key %s",
+			  dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+
+	ubifs_msg(c, "first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+	ubifs_dump_node(c, dent1);
+	ubifs_msg(c, "second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+	ubifs_dump_node(c, dent2);
+
+out_free:
+	kfree(dent2);
+	kfree(dent1);
+	return err;
+}
+
+/**
+ * dbg_check_znode - check if znode is all right.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch which points to this znode
+ *
+ * This function makes sure that znode referred to by @zbr is all right.
+ * Returns zero if it is, and %-EINVAL if it is not.
+ */
+static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
+{
+	struct ubifs_znode *znode = zbr->znode;
+	struct ubifs_znode *zp = znode->parent;
+	int n, err, cmp;
+
+	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
+		err = 1;
+		goto out;
+	}
+	if (znode->level < 0) {
+		err = 2;
+		goto out;
+	}
+	if (znode->iip < 0 || znode->iip >= c->fanout) {
+		err = 3;
+		goto out;
+	}
+
+	if (zbr->len == 0)
+		/* Only dirty zbranch may have no on-flash nodes */
+		if (!ubifs_zn_dirty(znode)) {
+			err = 4;
+			goto out;
+		}
+
+	if (ubifs_zn_dirty(znode)) {
+		/*
+		 * If znode is dirty, its parent has to be dirty as well. The
+		 * order of the operation is important, so we have to have
+		 * memory barriers.
+		 */
+		smp_mb();
+		if (zp && !ubifs_zn_dirty(zp)) {
+			/*
+			 * The dirty flag is atomic and is cleared outside the
+			 * TNC mutex, so znode's dirty flag may now have
+			 * been cleared. The child is always cleared before the
+			 * parent, so we just need to check again.
+			 */
+			smp_mb();
+			if (ubifs_zn_dirty(znode)) {
+				err = 5;
+				goto out;
+			}
+		}
+	}
+
+	if (zp) {
+		const union ubifs_key *min, *max;
+
+		if (znode->level != zp->level - 1) {
+			err = 6;
+			goto out;
+		}
+
+		/* Make sure the 'parent' pointer in our znode is correct */
+		err = ubifs_search_zbranch(c, zp, &zbr->key, &n);
+		if (!err) {
+			/* This zbranch does not exist in the parent */
+			err = 7;
+			goto out;
+		}
+
+		if (znode->iip >= zp->child_cnt) {
+			err = 8;
+			goto out;
+		}
+
+		if (znode->iip != n) {
+			/* This may happen only in case of collisions */
+			if (keys_cmp(c, &zp->zbranch[n].key,
+				     &zp->zbranch[znode->iip].key)) {
+				err = 9;
+				goto out;
+			}
+			n = znode->iip;
+		}
+
+		/*
+		 * Make sure that the first key in our znode is greater than or
+		 * equal to the key in the pointing zbranch.
+		 */
+		min = &zbr->key;
+		cmp = keys_cmp(c, min, &znode->zbranch[0].key);
+		if (cmp == 1) {
+			err = 10;
+			goto out;
+		}
+
+		if (n + 1 < zp->child_cnt) {
+			max = &zp->zbranch[n + 1].key;
+
+			/*
+			 * Make sure the last key in our znode is less or
+			 * equivalent than the key in the zbranch which goes
+			 * after our pointing zbranch.
+			 */
+			cmp = keys_cmp(c, max,
+				&znode->zbranch[znode->child_cnt - 1].key);
+			if (cmp == -1) {
+				err = 11;
+				goto out;
+			}
+		}
+	} else {
+		/* This may only be root znode */
+		if (zbr != &c->zroot) {
+			err = 12;
+			goto out;
+		}
+	}
+
+	/*
+	 * Make sure that next key is greater or equivalent then the previous
+	 * one.
+	 */
+	for (n = 1; n < znode->child_cnt; n++) {
+		cmp = keys_cmp(c, &znode->zbranch[n - 1].key,
+			       &znode->zbranch[n].key);
+		if (cmp > 0) {
+			err = 13;
+			goto out;
+		}
+		if (cmp == 0) {
+			/* This can only be keys with colliding hash */
+			if (!is_hash_key(c, &znode->zbranch[n].key)) {
+				err = 14;
+				goto out;
+			}
+
+			if (znode->level != 0 || c->replaying)
+				continue;
+
+			/*
+			 * Colliding keys should follow binary order of
+			 * corresponding xentry/dentry names.
+			 */
+			err = dbg_check_key_order(c, &znode->zbranch[n - 1],
+						  &znode->zbranch[n]);
+			if (err < 0)
+				return err;
+			if (err) {
+				err = 15;
+				goto out;
+			}
+		}
+	}
+
+	for (n = 0; n < znode->child_cnt; n++) {
+		if (!znode->zbranch[n].znode &&
+		    (znode->zbranch[n].lnum == 0 ||
+		     znode->zbranch[n].len == 0)) {
+			err = 16;
+			goto out;
+		}
+
+		if (znode->zbranch[n].lnum != 0 &&
+		    znode->zbranch[n].len == 0) {
+			err = 17;
+			goto out;
+		}
+
+		if (znode->zbranch[n].lnum == 0 &&
+		    znode->zbranch[n].len != 0) {
+			err = 18;
+			goto out;
+		}
+
+		if (znode->zbranch[n].lnum == 0 &&
+		    znode->zbranch[n].offs != 0) {
+			err = 19;
+			goto out;
+		}
+
+		if (znode->level != 0 && znode->zbranch[n].znode)
+			if (znode->zbranch[n].znode->parent != znode) {
+				err = 20;
+				goto out;
+			}
+	}
+
+	return 0;
+
+out:
+	ubifs_err(c, "failed, error %d", err);
+	ubifs_msg(c, "dump of the znode");
+	ubifs_dump_znode(c, znode);
+	if (zp) {
+		ubifs_msg(c, "dump of the parent znode");
+		ubifs_dump_znode(c, zp);
+	}
+	dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_tnc - check TNC tree.
+ * @c: UBIFS file-system description object
+ * @extra: do extra checks that are possible at start commit
+ *
+ * This function traverses whole TNC tree and checks every znode. Returns zero
+ * if everything is all right and %-EINVAL if something is wrong with TNC.
+ */
+int dbg_check_tnc(struct ubifs_info *c, int extra)
+{
+	struct ubifs_znode *znode;
+	long clean_cnt = 0, dirty_cnt = 0;
+	int err, last;
+
+	if (!dbg_is_chk_index(c))
+		return 0;
+
+	ubifs_assert(mutex_is_locked(&c->tnc_mutex));
+	if (!c->zroot.znode)
+		return 0;
+
+	znode = ubifs_tnc_postorder_first(c->zroot.znode);
+	while (1) {
+		struct ubifs_znode *prev;
+		struct ubifs_zbranch *zbr;
+
+		if (!znode->parent)
+			zbr = &c->zroot;
+		else
+			zbr = &znode->parent->zbranch[znode->iip];
+
+		err = dbg_check_znode(c, zbr);
+		if (err)
+			return err;
+
+		if (extra) {
+			if (ubifs_zn_dirty(znode))
+				dirty_cnt += 1;
+			else
+				clean_cnt += 1;
+		}
+
+		prev = znode;
+		znode = ubifs_tnc_postorder_next(znode);
+		if (!znode)
+			break;
+
+		/*
+		 * If the last key of this znode is equivalent to the first key
+		 * of the next znode (collision), then check order of the keys.
+		 */
+		last = prev->child_cnt - 1;
+		if (prev->level == 0 && znode->level == 0 && !c->replaying &&
+		    !keys_cmp(c, &prev->zbranch[last].key,
+			      &znode->zbranch[0].key)) {
+			err = dbg_check_key_order(c, &prev->zbranch[last],
+						  &znode->zbranch[0]);
+			if (err < 0)
+				return err;
+			if (err) {
+				ubifs_msg(c, "first znode");
+				ubifs_dump_znode(c, prev);
+				ubifs_msg(c, "second znode");
+				ubifs_dump_znode(c, znode);
+				return -EINVAL;
+			}
+		}
+	}
+
+	if (extra) {
+		if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) {
+			ubifs_err(c, "incorrect clean_zn_cnt %ld, calculated %ld",
+				  atomic_long_read(&c->clean_zn_cnt),
+				  clean_cnt);
+			return -EINVAL;
+		}
+		if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) {
+			ubifs_err(c, "incorrect dirty_zn_cnt %ld, calculated %ld",
+				  atomic_long_read(&c->dirty_zn_cnt),
+				  dirty_cnt);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * dbg_walk_index - walk the on-flash index.
+ * @c: UBIFS file-system description object
+ * @leaf_cb: called for each leaf node
+ * @znode_cb: called for each indexing node
+ * @priv: private data which is passed to callbacks
+ *
+ * This function walks the UBIFS index and calls the @leaf_cb for each leaf
+ * node and @znode_cb for each indexing node. Returns zero in case of success
+ * and a negative error code in case of failure.
+ *
+ * It would be better if this function removed every znode it pulled to into
+ * the TNC, so that the behavior more closely matched the non-debugging
+ * behavior.
+ */
+int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
+		   dbg_znode_callback znode_cb, void *priv)
+{
+	int err;
+	struct ubifs_zbranch *zbr;
+	struct ubifs_znode *znode, *child;
+
+	mutex_lock(&c->tnc_mutex);
+	/* If the root indexing node is not in TNC - pull it */
+	if (!c->zroot.znode) {
+		c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+		if (IS_ERR(c->zroot.znode)) {
+			err = PTR_ERR(c->zroot.znode);
+			c->zroot.znode = NULL;
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * We are going to traverse the indexing tree in the postorder manner.
+	 * Go down and find the leftmost indexing node where we are going to
+	 * start from.
+	 */
+	znode = c->zroot.znode;
+	while (znode->level > 0) {
+		zbr = &znode->zbranch[0];
+		child = zbr->znode;
+		if (!child) {
+			child = ubifs_load_znode(c, zbr, znode, 0);
+			if (IS_ERR(child)) {
+				err = PTR_ERR(child);
+				goto out_unlock;
+			}
+			zbr->znode = child;
+		}
+
+		znode = child;
+	}
+
+	/* Iterate over all indexing nodes */
+	while (1) {
+		int idx;
+
+		cond_resched();
+
+		if (znode_cb) {
+			err = znode_cb(c, znode, priv);
+			if (err) {
+				ubifs_err(c, "znode checking function returned error %d",
+					  err);
+				ubifs_dump_znode(c, znode);
+				goto out_dump;
+			}
+		}
+		if (leaf_cb && znode->level == 0) {
+			for (idx = 0; idx < znode->child_cnt; idx++) {
+				zbr = &znode->zbranch[idx];
+				err = leaf_cb(c, zbr, priv);
+				if (err) {
+					ubifs_err(c, "leaf checking function returned error %d, for leaf at LEB %d:%d",
+						  err, zbr->lnum, zbr->offs);
+					goto out_dump;
+				}
+			}
+		}
+
+		if (!znode->parent)
+			break;
+
+		idx = znode->iip + 1;
+		znode = znode->parent;
+		if (idx < znode->child_cnt) {
+			/* Switch to the next index in the parent */
+			zbr = &znode->zbranch[idx];
+			child = zbr->znode;
+			if (!child) {
+				child = ubifs_load_znode(c, zbr, znode, idx);
+				if (IS_ERR(child)) {
+					err = PTR_ERR(child);
+					goto out_unlock;
+				}
+				zbr->znode = child;
+			}
+			znode = child;
+		} else
+			/*
+			 * This is the last child, switch to the parent and
+			 * continue.
+			 */
+			continue;
+
+		/* Go to the lowest leftmost znode in the new sub-tree */
+		while (znode->level > 0) {
+			zbr = &znode->zbranch[0];
+			child = zbr->znode;
+			if (!child) {
+				child = ubifs_load_znode(c, zbr, znode, 0);
+				if (IS_ERR(child)) {
+					err = PTR_ERR(child);
+					goto out_unlock;
+				}
+				zbr->znode = child;
+			}
+			znode = child;
+		}
+	}
+
+	mutex_unlock(&c->tnc_mutex);
+	return 0;
+
+out_dump:
+	if (znode->parent)
+		zbr = &znode->parent->zbranch[znode->iip];
+	else
+		zbr = &c->zroot;
+	ubifs_msg(c, "dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
+	ubifs_dump_znode(c, znode);
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * add_size - add znode size to partially calculated index size.
+ * @c: UBIFS file-system description object
+ * @znode: znode to add size for
+ * @priv: partially calculated index size
+ *
+ * This is a helper function for 'dbg_check_idx_size()' which is called for
+ * every indexing node and adds its size to the 'long long' variable pointed to
+ * by @priv.
+ */
+static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv)
+{
+	long long *idx_size = priv;
+	int add;
+
+	add = ubifs_idx_node_sz(c, znode->child_cnt);
+	add = ALIGN(add, 8);
+	*idx_size += add;
+	return 0;
+}
+
+/**
+ * dbg_check_idx_size - check index size.
+ * @c: UBIFS file-system description object
+ * @idx_size: size to check
+ *
+ * This function walks the UBIFS index, calculates its size and checks that the
+ * size is equivalent to @idx_size. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
+{
+	int err;
+	long long calc = 0;
+
+	if (!dbg_is_chk_index(c))
+		return 0;
+
+	err = dbg_walk_index(c, NULL, add_size, &calc);
+	if (err) {
+		ubifs_err(c, "error %d while walking the index", err);
+		return err;
+	}
+
+	if (calc != idx_size) {
+		ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld",
+			  calc, idx_size);
+		dump_stack();
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * struct fsck_inode - information about an inode used when checking the file-system.
+ * @rb: link in the RB-tree of inodes
+ * @inum: inode number
+ * @mode: inode type, permissions, etc
+ * @nlink: inode link count
+ * @xattr_cnt: count of extended attributes
+ * @references: how many directory/xattr entries refer this inode (calculated
+ *              while walking the index)
+ * @calc_cnt: for directory inode count of child directories
+ * @size: inode size (read from on-flash inode)
+ * @xattr_sz: summary size of all extended attributes (read from on-flash
+ *            inode)
+ * @calc_sz: for directories calculated directory size
+ * @calc_xcnt: count of extended attributes
+ * @calc_xsz: calculated summary size of all extended attributes
+ * @xattr_nms: sum of lengths of all extended attribute names belonging to this
+ *             inode (read from on-flash inode)
+ * @calc_xnms: calculated sum of lengths of all extended attribute names
+ */
+struct fsck_inode {
+	struct rb_node rb;
+	ino_t inum;
+	umode_t mode;
+	unsigned int nlink;
+	unsigned int xattr_cnt;
+	int references;
+	int calc_cnt;
+	long long size;
+	unsigned int xattr_sz;
+	long long calc_sz;
+	long long calc_xcnt;
+	long long calc_xsz;
+	unsigned int xattr_nms;
+	long long calc_xnms;
+};
+
+/**
+ * struct fsck_data - private FS checking information.
+ * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects)
+ */
+struct fsck_data {
+	struct rb_root inodes;
+};
+
+/**
+ * add_inode - add inode information to RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @ino: raw UBIFS inode to add
+ *
+ * This is a helper function for 'check_leaf()' which adds information about
+ * inode @ino to the RB-tree of inodes. Returns inode information pointer in
+ * case of success and a negative error code in case of failure.
+ */
+static struct fsck_inode *add_inode(struct ubifs_info *c,
+				    struct fsck_data *fsckd,
+				    struct ubifs_ino_node *ino)
+{
+	struct rb_node **p, *parent = NULL;
+	struct fsck_inode *fscki;
+	ino_t inum = key_inum_flash(c, &ino->key);
+	struct inode *inode;
+	struct ubifs_inode *ui;
+
+	p = &fsckd->inodes.rb_node;
+	while (*p) {
+		parent = *p;
+		fscki = rb_entry(parent, struct fsck_inode, rb);
+		if (inum < fscki->inum)
+			p = &(*p)->rb_left;
+		else if (inum > fscki->inum)
+			p = &(*p)->rb_right;
+		else
+			return fscki;
+	}
+
+	if (inum > c->highest_inum) {
+		ubifs_err(c, "too high inode number, max. is %lu",
+			  (unsigned long)c->highest_inum);
+		return ERR_PTR(-EINVAL);
+	}
+
+	fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS);
+	if (!fscki)
+		return ERR_PTR(-ENOMEM);
+
+	inode = ilookup(c->vfs_sb, inum);
+
+	fscki->inum = inum;
+	/*
+	 * If the inode is present in the VFS inode cache, use it instead of
+	 * the on-flash inode which might be out-of-date. E.g., the size might
+	 * be out-of-date. If we do not do this, the following may happen, for
+	 * example:
+	 *   1. A power cut happens
+	 *   2. We mount the file-system R/O, the replay process fixes up the
+	 *      inode size in the VFS cache, but on on-flash.
+	 *   3. 'check_leaf()' fails because it hits a data node beyond inode
+	 *      size.
+	 */
+	if (!inode) {
+		fscki->nlink = le32_to_cpu(ino->nlink);
+		fscki->size = le64_to_cpu(ino->size);
+		fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+		fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
+		fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
+		fscki->mode = le32_to_cpu(ino->mode);
+	} else {
+		ui = ubifs_inode(inode);
+		fscki->nlink = inode->i_nlink;
+		fscki->size = inode->i_size;
+		fscki->xattr_cnt = ui->xattr_cnt;
+		fscki->xattr_sz = ui->xattr_size;
+		fscki->xattr_nms = ui->xattr_names;
+		fscki->mode = inode->i_mode;
+		iput(inode);
+	}
+
+	if (S_ISDIR(fscki->mode)) {
+		fscki->calc_sz = UBIFS_INO_NODE_SZ;
+		fscki->calc_cnt = 2;
+	}
+
+	rb_link_node(&fscki->rb, parent, p);
+	rb_insert_color(&fscki->rb, &fsckd->inodes);
+
+	return fscki;
+}
+
+/**
+ * search_inode - search inode in the RB-tree of inodes.
+ * @fsckd: FS checking information
+ * @inum: inode number to search
+ *
+ * This is a helper function for 'check_leaf()' which searches inode @inum in
+ * the RB-tree of inodes and returns an inode information pointer or %NULL if
+ * the inode was not found.
+ */
+static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum)
+{
+	struct rb_node *p;
+	struct fsck_inode *fscki;
+
+	p = fsckd->inodes.rb_node;
+	while (p) {
+		fscki = rb_entry(p, struct fsck_inode, rb);
+		if (inum < fscki->inum)
+			p = p->rb_left;
+		else if (inum > fscki->inum)
+			p = p->rb_right;
+		else
+			return fscki;
+	}
+	return NULL;
+}
+
+/**
+ * read_add_inode - read inode node and add it to RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @inum: inode number to read
+ *
+ * This is a helper function for 'check_leaf()' which finds inode node @inum in
+ * the index, reads it, and adds it to the RB-tree of inodes. Returns inode
+ * information pointer in case of success and a negative error code in case of
+ * failure.
+ */
+static struct fsck_inode *read_add_inode(struct ubifs_info *c,
+					 struct fsck_data *fsckd, ino_t inum)
+{
+	int n, err;
+	union ubifs_key key;
+	struct ubifs_znode *znode;
+	struct ubifs_zbranch *zbr;
+	struct ubifs_ino_node *ino;
+	struct fsck_inode *fscki;
+
+	fscki = search_inode(fsckd, inum);
+	if (fscki)
+		return fscki;
+
+	ino_key_init(c, &key, inum);
+	err = ubifs_lookup_level0(c, &key, &znode, &n);
+	if (!err) {
+		ubifs_err(c, "inode %lu not found in index", (unsigned long)inum);
+		return ERR_PTR(-ENOENT);
+	} else if (err < 0) {
+		ubifs_err(c, "error %d while looking up inode %lu",
+			  err, (unsigned long)inum);
+		return ERR_PTR(err);
+	}
+
+	zbr = &znode->zbranch[n];
+	if (zbr->len < UBIFS_INO_NODE_SZ) {
+		ubifs_err(c, "bad node %lu node length %d",
+			  (unsigned long)inum, zbr->len);
+		return ERR_PTR(-EINVAL);
+	}
+
+	ino = kmalloc(zbr->len, GFP_NOFS);
+	if (!ino)
+		return ERR_PTR(-ENOMEM);
+
+	err = ubifs_tnc_read_node(c, zbr, ino);
+	if (err) {
+		ubifs_err(c, "cannot read inode node at LEB %d:%d, error %d",
+			  zbr->lnum, zbr->offs, err);
+		kfree(ino);
+		return ERR_PTR(err);
+	}
+
+	fscki = add_inode(c, fsckd, ino);
+	kfree(ino);
+	if (IS_ERR(fscki)) {
+		ubifs_err(c, "error %ld while adding inode %lu node",
+			  PTR_ERR(fscki), (unsigned long)inum);
+		return fscki;
+	}
+
+	return fscki;
+}
+
+/**
+ * check_leaf - check leaf node.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of the leaf node to check
+ * @priv: FS checking information
+ *
+ * This is a helper function for 'dbg_check_filesystem()' which is called for
+ * every single leaf node while walking the indexing tree. It checks that the
+ * leaf node referred from the indexing tree exists, has correct CRC, and does
+ * some other basic validation. This function is also responsible for building
+ * an RB-tree of inodes - it adds all inodes into the RB-tree. It also
+ * calculates reference count, size, etc for each inode in order to later
+ * compare them to the information stored inside the inodes and detect possible
+ * inconsistencies. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+		      void *priv)
+{
+	ino_t inum;
+	void *node;
+	struct ubifs_ch *ch;
+	int err, type = key_type(c, &zbr->key);
+	struct fsck_inode *fscki;
+
+	if (zbr->len < UBIFS_CH_SZ) {
+		ubifs_err(c, "bad leaf length %d (LEB %d:%d)",
+			  zbr->len, zbr->lnum, zbr->offs);
+		return -EINVAL;
+	}
+
+	node = kmalloc(zbr->len, GFP_NOFS);
+	if (!node)
+		return -ENOMEM;
+
+	err = ubifs_tnc_read_node(c, zbr, node);
+	if (err) {
+		ubifs_err(c, "cannot read leaf node at LEB %d:%d, error %d",
+			  zbr->lnum, zbr->offs, err);
+		goto out_free;
+	}
+
+	/* If this is an inode node, add it to RB-tree of inodes */
+	if (type == UBIFS_INO_KEY) {
+		fscki = add_inode(c, priv, node);
+		if (IS_ERR(fscki)) {
+			err = PTR_ERR(fscki);
+			ubifs_err(c, "error %d while adding inode node", err);
+			goto out_dump;
+		}
+		goto out;
+	}
+
+	if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY &&
+	    type != UBIFS_DATA_KEY) {
+		ubifs_err(c, "unexpected node type %d at LEB %d:%d",
+			  type, zbr->lnum, zbr->offs);
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	ch = node;
+	if (le64_to_cpu(ch->sqnum) > c->max_sqnum) {
+		ubifs_err(c, "too high sequence number, max. is %llu",
+			  c->max_sqnum);
+		err = -EINVAL;
+		goto out_dump;
+	}
+
+	if (type == UBIFS_DATA_KEY) {
+		long long blk_offs;
+		struct ubifs_data_node *dn = node;
+
+		ubifs_assert(zbr->len >= UBIFS_DATA_NODE_SZ);
+
+		/*
+		 * Search the inode node this data node belongs to and insert
+		 * it to the RB-tree of inodes.
+		 */
+		inum = key_inum_flash(c, &dn->key);
+		fscki = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki)) {
+			err = PTR_ERR(fscki);
+			ubifs_err(c, "error %d while processing data node and trying to find inode node %lu",
+				  err, (unsigned long)inum);
+			goto out_dump;
+		}
+
+		/* Make sure the data node is within inode size */
+		blk_offs = key_block_flash(c, &dn->key);
+		blk_offs <<= UBIFS_BLOCK_SHIFT;
+		blk_offs += le32_to_cpu(dn->size);
+		if (blk_offs > fscki->size) {
+			ubifs_err(c, "data node at LEB %d:%d is not within inode size %lld",
+				  zbr->lnum, zbr->offs, fscki->size);
+			err = -EINVAL;
+			goto out_dump;
+		}
+	} else {
+		int nlen;
+		struct ubifs_dent_node *dent = node;
+		struct fsck_inode *fscki1;
+
+		ubifs_assert(zbr->len >= UBIFS_DENT_NODE_SZ);
+
+		err = ubifs_validate_entry(c, dent);
+		if (err)
+			goto out_dump;
+
+		/*
+		 * Search the inode node this entry refers to and the parent
+		 * inode node and insert them to the RB-tree of inodes.
+		 */
+		inum = le64_to_cpu(dent->inum);
+		fscki = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki)) {
+			err = PTR_ERR(fscki);
+			ubifs_err(c, "error %d while processing entry node and trying to find inode node %lu",
+				  err, (unsigned long)inum);
+			goto out_dump;
+		}
+
+		/* Count how many direntries or xentries refers this inode */
+		fscki->references += 1;
+
+		inum = key_inum_flash(c, &dent->key);
+		fscki1 = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki1)) {
+			err = PTR_ERR(fscki1);
+			ubifs_err(c, "error %d while processing entry node and trying to find parent inode node %lu",
+				  err, (unsigned long)inum);
+			goto out_dump;
+		}
+
+		nlen = le16_to_cpu(dent->nlen);
+		if (type == UBIFS_XENT_KEY) {
+			fscki1->calc_xcnt += 1;
+			fscki1->calc_xsz += CALC_DENT_SIZE(nlen);
+			fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size);
+			fscki1->calc_xnms += nlen;
+		} else {
+			fscki1->calc_sz += CALC_DENT_SIZE(nlen);
+			if (dent->type == UBIFS_ITYPE_DIR)
+				fscki1->calc_cnt += 1;
+		}
+	}
+
+out:
+	kfree(node);
+	return 0;
+
+out_dump:
+	ubifs_msg(c, "dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
+	ubifs_dump_node(c, node);
+out_free:
+	kfree(node);
+	return err;
+}
+
+/**
+ * free_inodes - free RB-tree of inodes.
+ * @fsckd: FS checking information
+ */
+static void free_inodes(struct fsck_data *fsckd)
+{
+	struct fsck_inode *fscki, *n;
+
+	rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb)
+		kfree(fscki);
+}
+
+/**
+ * check_inodes - checks all inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ *
+ * This is a helper function for 'dbg_check_filesystem()' which walks the
+ * RB-tree of inodes after the index scan has been finished, and checks that
+ * inode nlink, size, etc are correct. Returns zero if inodes are fine,
+ * %-EINVAL if not, and a negative error code in case of failure.
+ */
+static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
+{
+	int n, err;
+	union ubifs_key key;
+	struct ubifs_znode *znode;
+	struct ubifs_zbranch *zbr;
+	struct ubifs_ino_node *ino;
+	struct fsck_inode *fscki;
+	struct rb_node *this = rb_first(&fsckd->inodes);
+
+	while (this) {
+		fscki = rb_entry(this, struct fsck_inode, rb);
+		this = rb_next(this);
+
+		if (S_ISDIR(fscki->mode)) {
+			/*
+			 * Directories have to have exactly one reference (they
+			 * cannot have hardlinks), although root inode is an
+			 * exception.
+			 */
+			if (fscki->inum != UBIFS_ROOT_INO &&
+			    fscki->references != 1) {
+				ubifs_err(c, "directory inode %lu has %d direntries which refer it, but should be 1",
+					  (unsigned long)fscki->inum,
+					  fscki->references);
+				goto out_dump;
+			}
+			if (fscki->inum == UBIFS_ROOT_INO &&
+			    fscki->references != 0) {
+				ubifs_err(c, "root inode %lu has non-zero (%d) direntries which refer it",
+					  (unsigned long)fscki->inum,
+					  fscki->references);
+				goto out_dump;
+			}
+			if (fscki->calc_sz != fscki->size) {
+				ubifs_err(c, "directory inode %lu size is %lld, but calculated size is %lld",
+					  (unsigned long)fscki->inum,
+					  fscki->size, fscki->calc_sz);
+				goto out_dump;
+			}
+			if (fscki->calc_cnt != fscki->nlink) {
+				ubifs_err(c, "directory inode %lu nlink is %d, but calculated nlink is %d",
+					  (unsigned long)fscki->inum,
+					  fscki->nlink, fscki->calc_cnt);
+				goto out_dump;
+			}
+		} else {
+			if (fscki->references != fscki->nlink) {
+				ubifs_err(c, "inode %lu nlink is %d, but calculated nlink is %d",
+					  (unsigned long)fscki->inum,
+					  fscki->nlink, fscki->references);
+				goto out_dump;
+			}
+		}
+		if (fscki->xattr_sz != fscki->calc_xsz) {
+			ubifs_err(c, "inode %lu has xattr size %u, but calculated size is %lld",
+				  (unsigned long)fscki->inum, fscki->xattr_sz,
+				  fscki->calc_xsz);
+			goto out_dump;
+		}
+		if (fscki->xattr_cnt != fscki->calc_xcnt) {
+			ubifs_err(c, "inode %lu has %u xattrs, but calculated count is %lld",
+				  (unsigned long)fscki->inum,
+				  fscki->xattr_cnt, fscki->calc_xcnt);
+			goto out_dump;
+		}
+		if (fscki->xattr_nms != fscki->calc_xnms) {
+			ubifs_err(c, "inode %lu has xattr names' size %u, but calculated names' size is %lld",
+				  (unsigned long)fscki->inum, fscki->xattr_nms,
+				  fscki->calc_xnms);
+			goto out_dump;
+		}
+	}
+
+	return 0;
+
+out_dump:
+	/* Read the bad inode and dump it */
+	ino_key_init(c, &key, fscki->inum);
+	err = ubifs_lookup_level0(c, &key, &znode, &n);
+	if (!err) {
+		ubifs_err(c, "inode %lu not found in index",
+			  (unsigned long)fscki->inum);
+		return -ENOENT;
+	} else if (err < 0) {
+		ubifs_err(c, "error %d while looking up inode %lu",
+			  err, (unsigned long)fscki->inum);
+		return err;
+	}
+
+	zbr = &znode->zbranch[n];
+	ino = kmalloc(zbr->len, GFP_NOFS);
+	if (!ino)
+		return -ENOMEM;
+
+	err = ubifs_tnc_read_node(c, zbr, ino);
+	if (err) {
+		ubifs_err(c, "cannot read inode node at LEB %d:%d, error %d",
+			  zbr->lnum, zbr->offs, err);
+		kfree(ino);
+		return err;
+	}
+
+	ubifs_msg(c, "dump of the inode %lu sitting in LEB %d:%d",
+		  (unsigned long)fscki->inum, zbr->lnum, zbr->offs);
+	ubifs_dump_node(c, ino);
+	kfree(ino);
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_filesystem - check the file-system.
+ * @c: UBIFS file-system description object
+ *
+ * This function checks the file system, namely:
+ * o makes sure that all leaf nodes exist and their CRCs are correct;
+ * o makes sure inode nlink, size, xattr size/count are correct (for all
+ *   inodes).
+ *
+ * The function reads whole indexing tree and all nodes, so it is pretty
+ * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if
+ * not, and a negative error code in case of failure.
+ */
+int dbg_check_filesystem(struct ubifs_info *c)
+{
+	int err;
+	struct fsck_data fsckd;
+
+	if (!dbg_is_chk_fs(c))
+		return 0;
+
+	fsckd.inodes = RB_ROOT;
+	err = dbg_walk_index(c, check_leaf, NULL, &fsckd);
+	if (err)
+		goto out_free;
+
+	err = check_inodes(c, &fsckd);
+	if (err)
+		goto out_free;
+
+	free_inodes(&fsckd);
+	return 0;
+
+out_free:
+	ubifs_err(c, "file-system check failed with error %d", err);
+	dump_stack();
+	free_inodes(&fsckd);
+	return err;
+}
+
+/**
+ * dbg_check_data_nodes_order - check that list of data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!dbg_is_chk_gen(c))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t blka, blkb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_DATA_NODE) {
+			ubifs_err(c, "bad node type %d", sa->type);
+			ubifs_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sb->type != UBIFS_DATA_NODE) {
+			ubifs_err(c, "bad node type %d", sb->type);
+			ubifs_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err(c, "larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		blka = key_block(c, &sa->key);
+		blkb = key_block(c, &sb->key);
+
+		if (blka > blkb) {
+			ubifs_err(c, "larger block %u goes before %u", blka, blkb);
+			goto error_dump;
+		}
+		if (blka == blkb) {
+			ubifs_err(c, "two data nodes for the same block");
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	ubifs_dump_node(c, sa->node);
+	ubifs_dump_node(c, sb->node);
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_nondata_nodes_order - check that list of data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of non-data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!dbg_is_chk_gen(c))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t hasha, hashb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+		    sa->type != UBIFS_XENT_NODE) {
+			ubifs_err(c, "bad node type %d", sa->type);
+			ubifs_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+		    sa->type != UBIFS_XENT_NODE) {
+			ubifs_err(c, "bad node type %d", sb->type);
+			ubifs_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			ubifs_err(c, "non-inode node goes before inode node");
+			goto error_dump;
+		}
+
+		if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
+			continue;
+
+		if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			/* Inode nodes are sorted in descending size order */
+			if (sa->len < sb->len) {
+				ubifs_err(c, "smaller inode node goes first");
+				goto error_dump;
+			}
+			continue;
+		}
+
+		/*
+		 * This is either a dentry or xentry, which should be sorted in
+		 * ascending (parent ino, hash) order.
+		 */
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err(c, "larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		hasha = key_block(c, &sa->key);
+		hashb = key_block(c, &sb->key);
+
+		if (hasha > hashb) {
+			ubifs_err(c, "larger hash %u goes before %u",
+				  hasha, hashb);
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	ubifs_msg(c, "dumping first node");
+	ubifs_dump_node(c, sa->node);
+	ubifs_msg(c, "dumping second node");
+	ubifs_dump_node(c, sb->node);
+	return -EINVAL;
+	return 0;
+}
+
+static inline int chance(unsigned int n, unsigned int out_of)
+{
+	return !!((prandom_u32() % out_of) + 1 <= n);
+
+}
+
+static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
+{
+	struct ubifs_debug_info *d = c->dbg;
+
+	ubifs_assert(dbg_is_tst_rcvry(c));
+
+	if (!d->pc_cnt) {
+		/* First call - decide delay to the power cut */
+		if (chance(1, 2)) {
+			unsigned long delay;
+
+			if (chance(1, 2)) {
+				d->pc_delay = 1;
+				/* Fail within 1 minute */
+				delay = prandom_u32() % 60000;
+				d->pc_timeout = jiffies;
+				d->pc_timeout += msecs_to_jiffies(delay);
+				ubifs_warn(c, "failing after %lums", delay);
+			} else {
+				d->pc_delay = 2;
+				delay = prandom_u32() % 10000;
+				/* Fail within 10000 operations */
+				d->pc_cnt_max = delay;
+				ubifs_warn(c, "failing after %lu calls", delay);
+			}
+		}
+
+		d->pc_cnt += 1;
+	}
+
+	/* Determine if failure delay has expired */
+	if (d->pc_delay == 1 && time_before(jiffies, d->pc_timeout))
+			return 0;
+	if (d->pc_delay == 2 && d->pc_cnt++ < d->pc_cnt_max)
+			return 0;
+
+	if (lnum == UBIFS_SB_LNUM) {
+		if (write && chance(1, 2))
+			return 0;
+		if (chance(19, 20))
+			return 0;
+		ubifs_warn(c, "failing in super block LEB %d", lnum);
+	} else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) {
+		if (chance(19, 20))
+			return 0;
+		ubifs_warn(c, "failing in master LEB %d", lnum);
+	} else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) {
+		if (write && chance(99, 100))
+			return 0;
+		if (chance(399, 400))
+			return 0;
+		ubifs_warn(c, "failing in log LEB %d", lnum);
+	} else if (lnum >= c->lpt_first && lnum <= c->lpt_last) {
+		if (write && chance(7, 8))
+			return 0;
+		if (chance(19, 20))
+			return 0;
+		ubifs_warn(c, "failing in LPT LEB %d", lnum);
+	} else if (lnum >= c->orph_first && lnum <= c->orph_last) {
+		if (write && chance(1, 2))
+			return 0;
+		if (chance(9, 10))
+			return 0;
+		ubifs_warn(c, "failing in orphan LEB %d", lnum);
+	} else if (lnum == c->ihead_lnum) {
+		if (chance(99, 100))
+			return 0;
+		ubifs_warn(c, "failing in index head LEB %d", lnum);
+	} else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) {
+		if (chance(9, 10))
+			return 0;
+		ubifs_warn(c, "failing in GC head LEB %d", lnum);
+	} else if (write && !RB_EMPTY_ROOT(&c->buds) &&
+		   !ubifs_search_bud(c, lnum)) {
+		if (chance(19, 20))
+			return 0;
+		ubifs_warn(c, "failing in non-bud LEB %d", lnum);
+	} else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND ||
+		   c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+		if (chance(999, 1000))
+			return 0;
+		ubifs_warn(c, "failing in bud LEB %d commit running", lnum);
+	} else {
+		if (chance(9999, 10000))
+			return 0;
+		ubifs_warn(c, "failing in bud LEB %d commit not running", lnum);
+	}
+
+	d->pc_happened = 1;
+	ubifs_warn(c, "========== Power cut emulated ==========");
+	dump_stack();
+	return 1;
+}
+
+static int corrupt_data(const struct ubifs_info *c, const void *buf,
+			unsigned int len)
+{
+	unsigned int from, to, ffs = chance(1, 2);
+	unsigned char *p = (void *)buf;
+
+	from = prandom_u32() % len;
+	/* Corruption span max to end of write unit */
+	to = min(len, ALIGN(from + 1, c->max_write_size));
+
+	ubifs_warn(c, "filled bytes %u-%u with %s", from, to - 1,
+		   ffs ? "0xFFs" : "random data");
+
+	if (ffs)
+		memset(p + from, 0xFF, to - from);
+	else
+		prandom_bytes(p + from, to - from);
+
+	return to;
+}
+
+int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
+		  int offs, int len)
+{
+	int err, failing;
+
+	if (c->dbg->pc_happened)
+		return -EROFS;
+
+	failing = power_cut_emulated(c, lnum, 1);
+	if (failing) {
+		len = corrupt_data(c, buf, len);
+		ubifs_warn(c, "actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
+			   len, lnum, offs);
+	}
+	err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
+	if (err)
+		return err;
+	if (failing)
+		return -EROFS;
+	return 0;
+}
+
+int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
+		   int len)
+{
+	int err;
+
+	if (c->dbg->pc_happened)
+		return -EROFS;
+	if (power_cut_emulated(c, lnum, 1))
+		return -EROFS;
+	err = ubi_leb_change(c->ubi, lnum, buf, len);
+	if (err)
+		return err;
+	if (power_cut_emulated(c, lnum, 1))
+		return -EROFS;
+	return 0;
+}
+
+int dbg_leb_unmap(struct ubifs_info *c, int lnum)
+{
+	int err;
+
+	if (c->dbg->pc_happened)
+		return -EROFS;
+	if (power_cut_emulated(c, lnum, 0))
+		return -EROFS;
+	err = ubi_leb_unmap(c->ubi, lnum);
+	if (err)
+		return err;
+	if (power_cut_emulated(c, lnum, 0))
+		return -EROFS;
+	return 0;
+}
+
+int dbg_leb_map(struct ubifs_info *c, int lnum)
+{
+	int err;
+
+	if (c->dbg->pc_happened)
+		return -EROFS;
+	if (power_cut_emulated(c, lnum, 0))
+		return -EROFS;
+	err = ubi_leb_map(c->ubi, lnum);
+	if (err)
+		return err;
+	if (power_cut_emulated(c, lnum, 0))
+		return -EROFS;
+	return 0;
+}
+
+/*
+ * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
+ * contain the stuff specific to particular file-system mounts.
+ */
+static struct dentry *dfs_rootdir;
+
+static int dfs_file_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return nonseekable_open(inode, file);
+}
+
+/**
+ * provide_user_output - provide output to the user reading a debugfs file.
+ * @val: boolean value for the answer
+ * @u: the buffer to store the answer at
+ * @count: size of the buffer
+ * @ppos: position in the @u output buffer
+ *
+ * This is a simple helper function which stores @val boolean value in the user
+ * buffer when the user reads one of UBIFS debugfs files. Returns amount of
+ * bytes written to @u in case of success and a negative error code in case of
+ * failure.
+ */
+static int provide_user_output(int val, char __user *u, size_t count,
+			       loff_t *ppos)
+{
+	char buf[3];
+
+	if (val)
+		buf[0] = '1';
+	else
+		buf[0] = '0';
+	buf[1] = '\n';
+	buf[2] = 0x00;
+
+	return simple_read_from_buffer(u, count, ppos, buf, 2);
+}
+
+static ssize_t dfs_file_read(struct file *file, char __user *u, size_t count,
+			     loff_t *ppos)
+{
+	struct dentry *dent = file->f_path.dentry;
+	struct ubifs_info *c = file->private_data;
+	struct ubifs_debug_info *d = c->dbg;
+	int val;
+
+	if (dent == d->dfs_chk_gen)
+		val = d->chk_gen;
+	else if (dent == d->dfs_chk_index)
+		val = d->chk_index;
+	else if (dent == d->dfs_chk_orph)
+		val = d->chk_orph;
+	else if (dent == d->dfs_chk_lprops)
+		val = d->chk_lprops;
+	else if (dent == d->dfs_chk_fs)
+		val = d->chk_fs;
+	else if (dent == d->dfs_tst_rcvry)
+		val = d->tst_rcvry;
+	else if (dent == d->dfs_ro_error)
+		val = c->ro_error;
+	else
+		return -EINVAL;
+
+	return provide_user_output(val, u, count, ppos);
+}
+
+/**
+ * interpret_user_input - interpret user debugfs file input.
+ * @u: user-provided buffer with the input
+ * @count: buffer size
+ *
+ * This is a helper function which interpret user input to a boolean UBIFS
+ * debugfs file. Returns %0 or %1 in case of success and a negative error code
+ * in case of failure.
+ */
+static int interpret_user_input(const char __user *u, size_t count)
+{
+	size_t buf_size;
+	char buf[8];
+
+	buf_size = min_t(size_t, count, (sizeof(buf) - 1));
+	if (copy_from_user(buf, u, buf_size))
+		return -EFAULT;
+
+	if (buf[0] == '1')
+		return 1;
+	else if (buf[0] == '0')
+		return 0;
+
+	return -EINVAL;
+}
+
+static ssize_t dfs_file_write(struct file *file, const char __user *u,
+			      size_t count, loff_t *ppos)
+{
+	struct ubifs_info *c = file->private_data;
+	struct ubifs_debug_info *d = c->dbg;
+	struct dentry *dent = file->f_path.dentry;
+	int val;
+
+	/*
+	 * TODO: this is racy - the file-system might have already been
+	 * unmounted and we'd oops in this case. The plan is to fix it with
+	 * help of 'iterate_supers_type()' which we should have in v3.0: when
+	 * a debugfs opened, we rember FS's UUID in file->private_data. Then
+	 * whenever we access the FS via a debugfs file, we iterate all UBIFS
+	 * superblocks and fine the one with the same UUID, and take the
+	 * locking right.
+	 *
+	 * The other way to go suggested by Al Viro is to create a separate
+	 * 'ubifs-debug' file-system instead.
+	 */
+	if (file->f_path.dentry == d->dfs_dump_lprops) {
+		ubifs_dump_lprops(c);
+		return count;
+	}
+	if (file->f_path.dentry == d->dfs_dump_budg) {
+		ubifs_dump_budg(c, &c->bi);
+		return count;
+	}
+	if (file->f_path.dentry == d->dfs_dump_tnc) {
+		mutex_lock(&c->tnc_mutex);
+		ubifs_dump_tnc(c);
+		mutex_unlock(&c->tnc_mutex);
+		return count;
+	}
+
+	val = interpret_user_input(u, count);
+	if (val < 0)
+		return val;
+
+	if (dent == d->dfs_chk_gen)
+		d->chk_gen = val;
+	else if (dent == d->dfs_chk_index)
+		d->chk_index = val;
+	else if (dent == d->dfs_chk_orph)
+		d->chk_orph = val;
+	else if (dent == d->dfs_chk_lprops)
+		d->chk_lprops = val;
+	else if (dent == d->dfs_chk_fs)
+		d->chk_fs = val;
+	else if (dent == d->dfs_tst_rcvry)
+		d->tst_rcvry = val;
+	else if (dent == d->dfs_ro_error)
+		c->ro_error = !!val;
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static const struct file_operations dfs_fops = {
+	.open = dfs_file_open,
+	.read = dfs_file_read,
+	.write = dfs_file_write,
+	.owner = THIS_MODULE,
+	.llseek = no_llseek,
+};
+
+/**
+ * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
+ * @c: UBIFS file-system description object
+ *
+ * This function creates all debugfs files for this instance of UBIFS. Returns
+ * zero in case of success and a negative error code in case of failure.
+ *
+ * Note, the only reason we have not merged this function with the
+ * 'ubifs_debugging_init()' function is because it is better to initialize
+ * debugfs interfaces at the very end of the mount process, and remove them at
+ * the very beginning of the mount process.
+ */
+int dbg_debugfs_init_fs(struct ubifs_info *c)
+{
+	int err, n;
+	const char *fname;
+	struct dentry *dent;
+	struct ubifs_debug_info *d = c->dbg;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
+
+	n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
+		     c->vi.ubi_num, c->vi.vol_id);
+	if (n == UBIFS_DFS_DIR_LEN) {
+		/* The array size is too small */
+		fname = UBIFS_DFS_DIR_NAME;
+		dent = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	fname = d->dfs_dir_name;
+	dent = debugfs_create_dir(fname, dfs_rootdir);
+	if (IS_ERR_OR_NULL(dent))
+		goto out;
+	d->dfs_dir = dent;
+
+	fname = "dump_lprops";
+	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_dump_lprops = dent;
+
+	fname = "dump_budg";
+	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_dump_budg = dent;
+
+	fname = "dump_tnc";
+	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_dump_tnc = dent;
+
+	fname = "chk_general";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+				   &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_chk_gen = dent;
+
+	fname = "chk_index";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+				   &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_chk_index = dent;
+
+	fname = "chk_orphans";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+				   &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_chk_orph = dent;
+
+	fname = "chk_lprops";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+				   &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_chk_lprops = dent;
+
+	fname = "chk_fs";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+				   &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_chk_fs = dent;
+
+	fname = "tst_recovery";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+				   &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_tst_rcvry = dent;
+
+	fname = "ro_error";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+				   &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_ro_error = dent;
+
+	return 0;
+
+out_remove:
+	debugfs_remove_recursive(d->dfs_dir);
+out:
+	err = dent ? PTR_ERR(dent) : -ENODEV;
+	ubifs_err(c, "cannot create \"%s\" debugfs file or directory, error %d\n",
+		  fname, err);
+	return err;
+}
+
+/**
+ * dbg_debugfs_exit_fs - remove all debugfs files.
+ * @c: UBIFS file-system description object
+ */
+void dbg_debugfs_exit_fs(struct ubifs_info *c)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		debugfs_remove_recursive(c->dbg->dfs_dir);
+}
+
+struct ubifs_global_debug_info ubifs_dbg;
+
+static struct dentry *dfs_chk_gen;
+static struct dentry *dfs_chk_index;
+static struct dentry *dfs_chk_orph;
+static struct dentry *dfs_chk_lprops;
+static struct dentry *dfs_chk_fs;
+static struct dentry *dfs_tst_rcvry;
+
+static ssize_t dfs_global_file_read(struct file *file, char __user *u,
+				    size_t count, loff_t *ppos)
+{
+	struct dentry *dent = file->f_path.dentry;
+	int val;
+
+	if (dent == dfs_chk_gen)
+		val = ubifs_dbg.chk_gen;
+	else if (dent == dfs_chk_index)
+		val = ubifs_dbg.chk_index;
+	else if (dent == dfs_chk_orph)
+		val = ubifs_dbg.chk_orph;
+	else if (dent == dfs_chk_lprops)
+		val = ubifs_dbg.chk_lprops;
+	else if (dent == dfs_chk_fs)
+		val = ubifs_dbg.chk_fs;
+	else if (dent == dfs_tst_rcvry)
+		val = ubifs_dbg.tst_rcvry;
+	else
+		return -EINVAL;
+
+	return provide_user_output(val, u, count, ppos);
+}
+
+static ssize_t dfs_global_file_write(struct file *file, const char __user *u,
+				     size_t count, loff_t *ppos)
+{
+	struct dentry *dent = file->f_path.dentry;
+	int val;
+
+	val = interpret_user_input(u, count);
+	if (val < 0)
+		return val;
+
+	if (dent == dfs_chk_gen)
+		ubifs_dbg.chk_gen = val;
+	else if (dent == dfs_chk_index)
+		ubifs_dbg.chk_index = val;
+	else if (dent == dfs_chk_orph)
+		ubifs_dbg.chk_orph = val;
+	else if (dent == dfs_chk_lprops)
+		ubifs_dbg.chk_lprops = val;
+	else if (dent == dfs_chk_fs)
+		ubifs_dbg.chk_fs = val;
+	else if (dent == dfs_tst_rcvry)
+		ubifs_dbg.tst_rcvry = val;
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static const struct file_operations dfs_global_fops = {
+	.read = dfs_global_file_read,
+	.write = dfs_global_file_write,
+	.owner = THIS_MODULE,
+	.llseek = no_llseek,
+};
+
+/**
+ * dbg_debugfs_init - initialize debugfs file-system.
+ *
+ * UBIFS uses debugfs file-system to expose various debugging knobs to
+ * user-space. This function creates "ubifs" directory in the debugfs
+ * file-system. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int dbg_debugfs_init(void)
+{
+	int err;
+	const char *fname;
+	struct dentry *dent;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
+
+	fname = "ubifs";
+	dent = debugfs_create_dir(fname, NULL);
+	if (IS_ERR_OR_NULL(dent))
+		goto out;
+	dfs_rootdir = dent;
+
+	fname = "chk_general";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_gen = dent;
+
+	fname = "chk_index";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_index = dent;
+
+	fname = "chk_orphans";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_orph = dent;
+
+	fname = "chk_lprops";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_lprops = dent;
+
+	fname = "chk_fs";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_fs = dent;
+
+	fname = "tst_recovery";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_tst_rcvry = dent;
+
+	return 0;
+
+out_remove:
+	debugfs_remove_recursive(dfs_rootdir);
+out:
+	err = dent ? PTR_ERR(dent) : -ENODEV;
+	pr_err("UBIFS error (pid %d): cannot create \"%s\" debugfs file or directory, error %d\n",
+	       current->pid, fname, err);
+	return err;
+}
+
+/**
+ * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
+ */
+void dbg_debugfs_exit(void)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		debugfs_remove_recursive(dfs_rootdir);
+}
+
+/**
+ * ubifs_debugging_init - initialize UBIFS debugging.
+ * @c: UBIFS file-system description object
+ *
+ * This function initializes debugging-related data for the file system.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_debugging_init(struct ubifs_info *c)
+{
+	c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
+	if (!c->dbg)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * ubifs_debugging_exit - free debugging data.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_debugging_exit(struct ubifs_info *c)
+{
+	kfree(c->dbg);
+}
diff --git a/kernel/fs/ubifs/debug.h b/kernel/fs/ubifs/debug.h
new file mode 100644
index 000000000..e03d51797
--- /dev/null
+++ b/kernel/fs/ubifs/debug.h
@@ -0,0 +1,315 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+#ifndef __UBIFS_DEBUG_H__
+#define __UBIFS_DEBUG_H__
+
+/* Checking helper functions */
+typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
+				 struct ubifs_zbranch *zbr, void *priv);
+typedef int (*dbg_znode_callback)(struct ubifs_info *c,
+				  struct ubifs_znode *znode, void *priv);
+
+/*
+ * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi"
+ * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte.
+ */
+#define UBIFS_DFS_DIR_NAME "ubi%d_%d"
+#define UBIFS_DFS_DIR_LEN  (3 + 1 + 2*2 + 1)
+
+/**
+ * ubifs_debug_info - per-FS debugging information.
+ * @old_zroot: old index root - used by 'dbg_check_old_index()'
+ * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
+ * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
+ *
+ * @pc_happened: non-zero if an emulated power cut happened
+ * @pc_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
+ * @pc_timeout: time in jiffies when delay of failure mode expires
+ * @pc_cnt: current number of calls to failure mode I/O functions
+ * @pc_cnt_max: number of calls by which to delay failure mode
+ *
+ * @chk_lpt_sz: used by LPT tree size checker
+ * @chk_lpt_sz2: used by LPT tree size checker
+ * @chk_lpt_wastage: used by LPT tree size checker
+ * @chk_lpt_lebs: used by LPT tree size checker
+ * @new_nhead_offs: used by LPT tree size checker
+ * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
+ * @new_ihead_offs: used by debugging to check @c->ihead_offs
+ *
+ * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
+ * @saved_bi: saved budgeting information
+ * @saved_free: saved amount of free space
+ * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
+ *
+ * @chk_gen: if general extra checks are enabled
+ * @chk_index: if index xtra checks are enabled
+ * @chk_orph: if orphans extra checks are enabled
+ * @chk_lprops: if lprops extra checks are enabled
+ * @chk_fs: if UBIFS contents extra checks are enabled
+ * @tst_rcvry: if UBIFS recovery testing mode enabled
+ *
+ * @dfs_dir_name: name of debugfs directory containing this file-system's files
+ * @dfs_dir: direntry object of the file-system debugfs directory
+ * @dfs_dump_lprops: "dump lprops" debugfs knob
+ * @dfs_dump_budg: "dump budgeting information" debugfs knob
+ * @dfs_dump_tnc: "dump TNC" debugfs knob
+ * @dfs_chk_gen: debugfs knob to enable UBIFS general extra checks
+ * @dfs_chk_index: debugfs knob to enable UBIFS index extra checks
+ * @dfs_chk_orph: debugfs knob to enable UBIFS orphans extra checks
+ * @dfs_chk_lprops: debugfs knob to enable UBIFS LEP properties extra checks
+ * @dfs_chk_fs: debugfs knob to enable UBIFS contents extra checks
+ * @dfs_tst_rcvry: debugfs knob to enable UBIFS recovery testing
+ * @dfs_ro_error: debugfs knob to switch UBIFS to R/O mode (different to
+ *                re-mounting to R/O mode because it does not flush any buffers
+ *                and UBIFS just starts returning -EROFS on all write
+ *               operations)
+ */
+struct ubifs_debug_info {
+	struct ubifs_zbranch old_zroot;
+	int old_zroot_level;
+	unsigned long long old_zroot_sqnum;
+
+	int pc_happened;
+	int pc_delay;
+	unsigned long pc_timeout;
+	unsigned int pc_cnt;
+	unsigned int pc_cnt_max;
+
+	long long chk_lpt_sz;
+	long long chk_lpt_sz2;
+	long long chk_lpt_wastage;
+	int chk_lpt_lebs;
+	int new_nhead_offs;
+	int new_ihead_lnum;
+	int new_ihead_offs;
+
+	struct ubifs_lp_stats saved_lst;
+	struct ubifs_budg_info saved_bi;
+	long long saved_free;
+	int saved_idx_gc_cnt;
+
+	unsigned int chk_gen:1;
+	unsigned int chk_index:1;
+	unsigned int chk_orph:1;
+	unsigned int chk_lprops:1;
+	unsigned int chk_fs:1;
+	unsigned int tst_rcvry:1;
+
+	char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1];
+	struct dentry *dfs_dir;
+	struct dentry *dfs_dump_lprops;
+	struct dentry *dfs_dump_budg;
+	struct dentry *dfs_dump_tnc;
+	struct dentry *dfs_chk_gen;
+	struct dentry *dfs_chk_index;
+	struct dentry *dfs_chk_orph;
+	struct dentry *dfs_chk_lprops;
+	struct dentry *dfs_chk_fs;
+	struct dentry *dfs_tst_rcvry;
+	struct dentry *dfs_ro_error;
+};
+
+/**
+ * ubifs_global_debug_info - global (not per-FS) UBIFS debugging information.
+ *
+ * @chk_gen: if general extra checks are enabled
+ * @chk_index: if index xtra checks are enabled
+ * @chk_orph: if orphans extra checks are enabled
+ * @chk_lprops: if lprops extra checks are enabled
+ * @chk_fs: if UBIFS contents extra checks are enabled
+ * @tst_rcvry: if UBIFS recovery testing mode enabled
+ */
+struct ubifs_global_debug_info {
+	unsigned int chk_gen:1;
+	unsigned int chk_index:1;
+	unsigned int chk_orph:1;
+	unsigned int chk_lprops:1;
+	unsigned int chk_fs:1;
+	unsigned int tst_rcvry:1;
+};
+
+#define ubifs_assert(expr) do {                                                \
+	if (unlikely(!(expr))) {                                               \
+		pr_crit("UBIFS assert failed in %s at %u (pid %d)\n",          \
+		       __func__, __LINE__, current->pid);                      \
+		dump_stack();                                                  \
+	}                                                                      \
+} while (0)
+
+#define ubifs_assert_cmt_locked(c) do {                                        \
+	if (unlikely(down_write_trylock(&(c)->commit_sem))) {                  \
+		up_write(&(c)->commit_sem);                                    \
+		pr_crit("commit lock is not locked!\n");                       \
+		ubifs_assert(0);                                               \
+	}                                                                      \
+} while (0)
+
+#define ubifs_dbg_msg(type, fmt, ...) \
+	pr_debug("UBIFS DBG " type " (pid %d): " fmt "\n", current->pid,       \
+		 ##__VA_ARGS__)
+
+#define DBG_KEY_BUF_LEN 48
+#define ubifs_dbg_msg_key(type, key, fmt, ...) do {                            \
+	char __tmp_key_buf[DBG_KEY_BUF_LEN];                                   \
+	pr_debug("UBIFS DBG " type " (pid %d): " fmt "%s\n", current->pid,     \
+		 ##__VA_ARGS__,                                                \
+		 dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN));    \
+} while (0)
+
+/* General messages */
+#define dbg_gen(fmt, ...)   ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
+/* Additional journal messages */
+#define dbg_jnl(fmt, ...)   ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...) \
+	ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
+/* Additional TNC messages */
+#define dbg_tnc(fmt, ...)   ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...) \
+	ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
+/* Additional lprops messages */
+#define dbg_lp(fmt, ...)    ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
+/* Additional LEB find messages */
+#define dbg_find(fmt, ...)  ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
+/* Additional mount messages */
+#define dbg_mnt(fmt, ...)   ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...) \
+	ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
+/* Additional I/O messages */
+#define dbg_io(fmt, ...)    ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
+/* Additional commit messages */
+#define dbg_cmt(fmt, ...)   ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__)
+/* Additional budgeting messages */
+#define dbg_budg(fmt, ...)  ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__)
+/* Additional log messages */
+#define dbg_log(fmt, ...)   ubifs_dbg_msg("log", fmt, ##__VA_ARGS__)
+/* Additional gc messages */
+#define dbg_gc(fmt, ...)    ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__)
+/* Additional scan messages */
+#define dbg_scan(fmt, ...)  ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__)
+/* Additional recovery messages */
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
+
+extern struct ubifs_global_debug_info ubifs_dbg;
+
+static inline int dbg_is_chk_gen(const struct ubifs_info *c)
+{
+	return !!(ubifs_dbg.chk_gen || c->dbg->chk_gen);
+}
+static inline int dbg_is_chk_index(const struct ubifs_info *c)
+{
+	return !!(ubifs_dbg.chk_index || c->dbg->chk_index);
+}
+static inline int dbg_is_chk_orph(const struct ubifs_info *c)
+{
+	return !!(ubifs_dbg.chk_orph || c->dbg->chk_orph);
+}
+static inline int dbg_is_chk_lprops(const struct ubifs_info *c)
+{
+	return !!(ubifs_dbg.chk_lprops || c->dbg->chk_lprops);
+}
+static inline int dbg_is_chk_fs(const struct ubifs_info *c)
+{
+	return !!(ubifs_dbg.chk_fs || c->dbg->chk_fs);
+}
+static inline int dbg_is_tst_rcvry(const struct ubifs_info *c)
+{
+	return !!(ubifs_dbg.tst_rcvry || c->dbg->tst_rcvry);
+}
+static inline int dbg_is_power_cut(const struct ubifs_info *c)
+{
+	return !!c->dbg->pc_happened;
+}
+
+int ubifs_debugging_init(struct ubifs_info *c);
+void ubifs_debugging_exit(struct ubifs_info *c);
+
+/* Dump functions */
+const char *dbg_ntype(int type);
+const char *dbg_cstate(int cmt_state);
+const char *dbg_jhead(int jhead);
+const char *dbg_get_key_dump(const struct ubifs_info *c,
+			     const union ubifs_key *key);
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+			     const union ubifs_key *key, char *buffer, int len);
+void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode);
+void ubifs_dump_node(const struct ubifs_info *c, const void *node);
+void ubifs_dump_budget_req(const struct ubifs_budget_req *req);
+void ubifs_dump_lstats(const struct ubifs_lp_stats *lst);
+void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
+void ubifs_dump_lprop(const struct ubifs_info *c,
+		      const struct ubifs_lprops *lp);
+void ubifs_dump_lprops(struct ubifs_info *c);
+void ubifs_dump_lpt_info(struct ubifs_info *c);
+void ubifs_dump_leb(const struct ubifs_info *c, int lnum);
+void ubifs_dump_sleb(const struct ubifs_info *c,
+		     const struct ubifs_scan_leb *sleb, int offs);
+void ubifs_dump_znode(const struct ubifs_info *c,
+		      const struct ubifs_znode *znode);
+void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+		     int cat);
+void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+		      struct ubifs_nnode *parent, int iip);
+void ubifs_dump_tnc(struct ubifs_info *c);
+void ubifs_dump_index(struct ubifs_info *c);
+void ubifs_dump_lpt_lebs(const struct ubifs_info *c);
+
+int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
+		   dbg_znode_callback znode_cb, void *priv);
+
+/* Checking functions */
+void dbg_save_space_info(struct ubifs_info *c);
+int dbg_check_space_info(struct ubifs_info *c);
+int dbg_check_lprops(struct ubifs_info *c);
+int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+int dbg_check_cats(struct ubifs_info *c);
+int dbg_check_ltab(struct ubifs_info *c);
+int dbg_chk_lpt_free_spc(struct ubifs_info *c);
+int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len);
+int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode);
+int dbg_check_dir(struct ubifs_info *c, const struct inode *dir);
+int dbg_check_tnc(struct ubifs_info *c, int extra);
+int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
+int dbg_check_filesystem(struct ubifs_info *c);
+void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
+		    int add_pos);
+int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
+			int row, int col);
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+			 loff_t size);
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
+
+int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
+		  int len);
+int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len);
+int dbg_leb_unmap(struct ubifs_info *c, int lnum);
+int dbg_leb_map(struct ubifs_info *c, int lnum);
+
+/* Debugfs-related stuff */
+int dbg_debugfs_init(void);
+void dbg_debugfs_exit(void);
+int dbg_debugfs_init_fs(struct ubifs_info *c);
+void dbg_debugfs_exit_fs(struct ubifs_info *c);
+
+#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/kernel/fs/ubifs/dir.c b/kernel/fs/ubifs/dir.c
new file mode 100644
index 000000000..27060fc85
--- /dev/null
+++ b/kernel/fs/ubifs/dir.c
@@ -0,0 +1,1202 @@
+/* * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ *          Zoltan Sogor
+ */
+
+/*
+ * This file implements directory operations.
+ *
+ * All FS operations in this file allocate budget before writing anything to the
+ * media. If they fail to allocate it, the error is returned. The only
+ * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even
+ * if they unable to allocate the budget, because deletion %-ENOSPC failure is
+ * not what users are usually ready to get. UBIFS budgeting subsystem has some
+ * space reserved for these purposes.
+ *
+ * All operations in this file write all inodes which they change straight
+ * away, instead of marking them dirty. For example, 'ubifs_link()' changes
+ * @i_size of the parent inode and writes the parent inode together with the
+ * target inode. This was done to simplify file-system recovery which would
+ * otherwise be very difficult to do. The only exception is rename which marks
+ * the re-named inode dirty (because its @i_ctime is updated) but does not
+ * write it, but just marks it as dirty.
+ */
+
+#include "ubifs.h"
+
+/**
+ * inherit_flags - inherit flags of the parent inode.
+ * @dir: parent inode
+ * @mode: new inode mode flags
+ *
+ * This is a helper function for 'ubifs_new_inode()' which inherits flag of the
+ * parent directory inode @dir. UBIFS inodes inherit the following flags:
+ * o %UBIFS_COMPR_FL, which is useful to switch compression on/of on
+ *   sub-directory basis;
+ * o %UBIFS_SYNC_FL - useful for the same reasons;
+ * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories.
+ *
+ * This function returns the inherited flags.
+ */
+static int inherit_flags(const struct inode *dir, umode_t mode)
+{
+	int flags;
+	const struct ubifs_inode *ui = ubifs_inode(dir);
+
+	if (!S_ISDIR(dir->i_mode))
+		/*
+		 * The parent is not a directory, which means that an extended
+		 * attribute inode is being created. No flags.
+		 */
+		return 0;
+
+	flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL);
+	if (!S_ISDIR(mode))
+		/* The "DIRSYNC" flag only applies to directories */
+		flags &= ~UBIFS_DIRSYNC_FL;
+	return flags;
+}
+
+/**
+ * ubifs_new_inode - allocate new UBIFS inode object.
+ * @c: UBIFS file-system description object
+ * @dir: parent directory inode
+ * @mode: inode mode flags
+ *
+ * This function finds an unused inode number, allocates new inode and
+ * initializes it. Returns new inode in case of success and an error code in
+ * case of failure.
+ */
+struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+			      umode_t mode)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+
+	inode = new_inode(c->vfs_sb);
+	ui = ubifs_inode(inode);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Set 'S_NOCMTIME' to prevent VFS form updating [mc]time of inodes and
+	 * marking them dirty in file write path (see 'file_update_time()').
+	 * UBIFS has to fully control "clean <-> dirty" transitions of inodes
+	 * to make budgeting work.
+	 */
+	inode->i_flags |= S_NOCMTIME;
+
+	inode_init_owner(inode, dir, mode);
+	inode->i_mtime = inode->i_atime = inode->i_ctime =
+			 ubifs_current_time(inode);
+	inode->i_mapping->nrpages = 0;
+
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_mapping->a_ops = &ubifs_file_address_operations;
+		inode->i_op = &ubifs_file_inode_operations;
+		inode->i_fop = &ubifs_file_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op  = &ubifs_dir_inode_operations;
+		inode->i_fop = &ubifs_dir_operations;
+		inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
+		break;
+	case S_IFLNK:
+		inode->i_op = &ubifs_symlink_inode_operations;
+		break;
+	case S_IFSOCK:
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+		inode->i_op  = &ubifs_file_inode_operations;
+		break;
+	default:
+		BUG();
+	}
+
+	ui->flags = inherit_flags(dir, mode);
+	ubifs_set_inode_flags(inode);
+	if (S_ISREG(mode))
+		ui->compr_type = c->default_compr;
+	else
+		ui->compr_type = UBIFS_COMPR_NONE;
+	ui->synced_i_size = 0;
+
+	spin_lock(&c->cnt_lock);
+	/* Inode number overflow is currently not supported */
+	if (c->highest_inum >= INUM_WARN_WATERMARK) {
+		if (c->highest_inum >= INUM_WATERMARK) {
+			spin_unlock(&c->cnt_lock);
+			ubifs_err(c, "out of inode numbers");
+			make_bad_inode(inode);
+			iput(inode);
+			return ERR_PTR(-EINVAL);
+		}
+		ubifs_warn(c, "running out of inode numbers (current %lu, max %u)",
+			   (unsigned long)c->highest_inum, INUM_WATERMARK);
+	}
+
+	inode->i_ino = ++c->highest_inum;
+	/*
+	 * The creation sequence number remains with this inode for its
+	 * lifetime. All nodes for this inode have a greater sequence number,
+	 * and so it is possible to distinguish obsolete nodes belonging to a
+	 * previous incarnation of the same inode number - for example, for the
+	 * purpose of rebuilding the index.
+	 */
+	ui->creat_sqnum = ++c->max_sqnum;
+	spin_unlock(&c->cnt_lock);
+	return inode;
+}
+
+static int dbg_check_name(const struct ubifs_info *c,
+			  const struct ubifs_dent_node *dent,
+			  const struct qstr *nm)
+{
+	if (!dbg_is_chk_gen(c))
+		return 0;
+	if (le16_to_cpu(dent->nlen) != nm->len)
+		return -EINVAL;
+	if (memcmp(dent->name, nm->name, nm->len))
+		return -EINVAL;
+	return 0;
+}
+
+static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	int err;
+	union ubifs_key key;
+	struct inode *inode = NULL;
+	struct ubifs_dent_node *dent;
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+
+	dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
+
+	if (dentry->d_name.len > UBIFS_MAX_NLEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent)
+		return ERR_PTR(-ENOMEM);
+
+	dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
+
+	err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
+	if (err) {
+		if (err == -ENOENT) {
+			dbg_gen("not found");
+			goto done;
+		}
+		goto out;
+	}
+
+	if (dbg_check_name(c, dent, &dentry->d_name)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
+	if (IS_ERR(inode)) {
+		/*
+		 * This should not happen. Probably the file-system needs
+		 * checking.
+		 */
+		err = PTR_ERR(inode);
+		ubifs_err(c, "dead directory entry '%pd', error %d",
+			  dentry, err);
+		ubifs_ro_mode(c, err);
+		goto out;
+	}
+
+done:
+	kfree(dent);
+	/*
+	 * Note, d_splice_alias() would be required instead if we supported
+	 * NFS.
+	 */
+	d_add(dentry, inode);
+	return NULL;
+
+out:
+	kfree(dent);
+	return ERR_PTR(err);
+}
+
+static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			bool excl)
+{
+	struct inode *inode;
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.dirtied_ino = 1 };
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+
+	/*
+	 * Budget request settings: new inode, new direntry, changing the
+	 * parent directory inode.
+	 */
+
+	dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
+		dentry, mode, dir->i_ino);
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, dir, mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_inode;
+
+	mutex_lock(&dir_ui->ui_mutex);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	ubifs_err(c, "cannot create regular file, error %d", err);
+	return err;
+}
+
+/**
+ * vfs_dent_type - get VFS directory entry type.
+ * @type: UBIFS directory entry type
+ *
+ * This function converts UBIFS directory entry type into VFS directory entry
+ * type.
+ */
+static unsigned int vfs_dent_type(uint8_t type)
+{
+	switch (type) {
+	case UBIFS_ITYPE_REG:
+		return DT_REG;
+	case UBIFS_ITYPE_DIR:
+		return DT_DIR;
+	case UBIFS_ITYPE_LNK:
+		return DT_LNK;
+	case UBIFS_ITYPE_BLK:
+		return DT_BLK;
+	case UBIFS_ITYPE_CHR:
+		return DT_CHR;
+	case UBIFS_ITYPE_FIFO:
+		return DT_FIFO;
+	case UBIFS_ITYPE_SOCK:
+		return DT_SOCK;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+/*
+ * The classical Unix view for directory is that it is a linear array of
+ * (name, inode number) entries. Linux/VFS assumes this model as well.
+ * Particularly, 'readdir()' call wants us to return a directory entry offset
+ * which later may be used to continue 'readdir()'ing the directory or to
+ * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this
+ * model because directory entries are identified by keys, which may collide.
+ *
+ * UBIFS uses directory entry hash value for directory offsets, so
+ * 'seekdir()'/'telldir()' may not always work because of possible key
+ * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
+ * properly by means of saving full directory entry name in the private field
+ * of the file description object.
+ *
+ * This means that UBIFS cannot support NFS which requires full
+ * 'seekdir()'/'telldir()' support.
+ */
+static int ubifs_readdir(struct file *file, struct dir_context *ctx)
+{
+	int err;
+	struct qstr nm;
+	union ubifs_key key;
+	struct ubifs_dent_node *dent;
+	struct inode *dir = file_inode(file);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+
+	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
+
+	if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
+		/*
+		 * The directory was seek'ed to a senseless position or there
+		 * are no more entries.
+		 */
+		return 0;
+
+	if (file->f_version == 0) {
+		/*
+		 * The file was seek'ed, which means that @file->private_data
+		 * is now invalid. This may also be just the first
+		 * 'ubifs_readdir()' invocation, in which case
+		 * @file->private_data is NULL, and the below code is
+		 * basically a no-op.
+		 */
+		kfree(file->private_data);
+		file->private_data = NULL;
+	}
+
+	/*
+	 * 'generic_file_llseek()' unconditionally sets @file->f_version to
+	 * zero, and we use this for detecting whether the file was seek'ed.
+	 */
+	file->f_version = 1;
+
+	/* File positions 0 and 1 correspond to "." and ".." */
+	if (ctx->pos < 2) {
+		ubifs_assert(!file->private_data);
+		if (!dir_emit_dots(file, ctx))
+			return 0;
+
+		/* Find the first entry in TNC and save it */
+		lowest_dent_key(c, &key, dir->i_ino);
+		nm.name = NULL;
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			goto out;
+		}
+
+		ctx->pos = key_hash_flash(c, &dent->key);
+		file->private_data = dent;
+	}
+
+	dent = file->private_data;
+	if (!dent) {
+		/*
+		 * The directory was seek'ed to and is now readdir'ed.
+		 * Find the entry corresponding to @ctx->pos or the closest one.
+		 */
+		dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
+		nm.name = NULL;
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			goto out;
+		}
+		ctx->pos = key_hash_flash(c, &dent->key);
+		file->private_data = dent;
+	}
+
+	while (1) {
+		dbg_gen("feed '%s', ino %llu, new f_pos %#x",
+			dent->name, (unsigned long long)le64_to_cpu(dent->inum),
+			key_hash_flash(c, &dent->key));
+		ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
+			     ubifs_inode(dir)->creat_sqnum);
+
+		nm.len = le16_to_cpu(dent->nlen);
+		if (!dir_emit(ctx, dent->name, nm.len,
+			       le64_to_cpu(dent->inum),
+			       vfs_dent_type(dent->type)))
+			return 0;
+
+		/* Switch to the next entry */
+		key_read(c, &dent->key, &key);
+		nm.name = dent->name;
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			goto out;
+		}
+
+		kfree(file->private_data);
+		ctx->pos = key_hash_flash(c, &dent->key);
+		file->private_data = dent;
+		cond_resched();
+	}
+
+out:
+	if (err != -ENOENT) {
+		ubifs_err(c, "cannot find next direntry, error %d", err);
+		return err;
+	}
+
+	kfree(file->private_data);
+	file->private_data = NULL;
+	/* 2 is a special value indicating that there are no more direntries */
+	ctx->pos = 2;
+	return 0;
+}
+
+/* Free saved readdir() state when the directory is closed */
+static int ubifs_dir_release(struct inode *dir, struct file *file)
+{
+	kfree(file->private_data);
+	file->private_data = NULL;
+	return 0;
+}
+
+/**
+ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
+ */
+static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+}
+
+/**
+ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ */
+static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+	mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
+}
+
+static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	struct inode *inode = d_inode(old_dentry);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
+				.dirtied_ino_d = ALIGN(ui->data_len, 8) };
+
+	/*
+	 * Budget request settings: new direntry, changing the target inode,
+	 * changing the parent inode.
+	 */
+
+	dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu",
+		dentry, inode->i_ino,
+		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+
+	err = dbg_check_synced_i_size(c, inode);
+	if (err)
+		return err;
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	lock_2_inodes(dir, inode);
+	inc_nlink(inode);
+	ihold(inode);
+	inode->i_ctime = ubifs_current_time(inode);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	unlock_2_inodes(dir, inode);
+
+	ubifs_release_budget(c, &req);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	drop_nlink(inode);
+	unlock_2_inodes(dir, inode);
+	ubifs_release_budget(c, &req);
+	iput(inode);
+	return err;
+}
+
+static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	struct inode *inode = d_inode(dentry);
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	int err, budgeted = 1;
+	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+	unsigned int saved_nlink = inode->i_nlink;
+
+	/*
+	 * Budget request settings: deletion direntry, deletion inode (+1 for
+	 * @dirtied_ino), changing the parent directory inode. If budgeting
+	 * fails, go ahead anyway because we have extra space reserved for
+	 * deletions.
+	 */
+
+	dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
+		dentry, inode->i_ino,
+		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+	err = dbg_check_synced_i_size(c, inode);
+	if (err)
+		return err;
+
+	err = ubifs_budget_space(c, &req);
+	if (err) {
+		if (err != -ENOSPC)
+			return err;
+		budgeted = 0;
+	}
+
+	lock_2_inodes(dir, inode);
+	inode->i_ctime = ubifs_current_time(dir);
+	drop_nlink(inode);
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+	if (err)
+		goto out_cancel;
+	unlock_2_inodes(dir, inode);
+
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	else {
+		/* We've deleted something - clean the "no space" flags */
+		c->bi.nospace = c->bi.nospace_rp = 0;
+		smp_wmb();
+	}
+	return 0;
+
+out_cancel:
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	set_nlink(inode, saved_nlink);
+	unlock_2_inodes(dir, inode);
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * check_dir_empty - check if a directory is empty or not.
+ * @c: UBIFS file-system description object
+ * @dir: VFS inode object of the directory to check
+ *
+ * This function checks if directory @dir is empty. Returns zero if the
+ * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
+ * in case of of errors.
+ */
+static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
+{
+	struct qstr nm = { .name = NULL };
+	struct ubifs_dent_node *dent;
+	union ubifs_key key;
+	int err;
+
+	lowest_dent_key(c, &key, dir->i_ino);
+	dent = ubifs_tnc_next_ent(c, &key, &nm);
+	if (IS_ERR(dent)) {
+		err = PTR_ERR(dent);
+		if (err == -ENOENT)
+			err = 0;
+	} else {
+		kfree(dent);
+		err = -ENOTEMPTY;
+	}
+	return err;
+}
+
+static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	struct inode *inode = d_inode(dentry);
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	int err, budgeted = 1;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+
+	/*
+	 * Budget request settings: deletion direntry, deletion inode and
+	 * changing the parent inode. If budgeting fails, go ahead anyway
+	 * because we have extra space reserved for deletions.
+	 */
+
+	dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry,
+		inode->i_ino, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+	err = check_dir_empty(c, d_inode(dentry));
+	if (err)
+		return err;
+
+	err = ubifs_budget_space(c, &req);
+	if (err) {
+		if (err != -ENOSPC)
+			return err;
+		budgeted = 0;
+	}
+
+	lock_2_inodes(dir, inode);
+	inode->i_ctime = ubifs_current_time(dir);
+	clear_nlink(inode);
+	drop_nlink(dir);
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+	if (err)
+		goto out_cancel;
+	unlock_2_inodes(dir, inode);
+
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	else {
+		/* We've deleted something - clean the "no space" flags */
+		c->bi.nospace = c->bi.nospace_rp = 0;
+		smp_wmb();
+	}
+	return 0;
+
+out_cancel:
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	inc_nlink(dir);
+	set_nlink(inode, 2);
+	unlock_2_inodes(dir, inode);
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	return err;
+}
+
+static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory inode.
+	 */
+
+	dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
+		dentry, mode, dir->i_ino);
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_inode;
+
+	mutex_lock(&dir_ui->ui_mutex);
+	insert_inode_hash(inode);
+	inc_nlink(inode);
+	inc_nlink(dir);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err) {
+		ubifs_err(c, "cannot create directory, error %d", err);
+		goto out_cancel;
+	}
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	drop_nlink(dir);
+	mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
+		       umode_t mode, dev_t rdev)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	union ubifs_dev_desc *dev = NULL;
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	int err, devlen = 0;
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.new_ino_d = ALIGN(devlen, 8),
+					.dirtied_ino = 1 };
+
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory inode.
+	 */
+
+	dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino);
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode)) {
+		dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+		if (!dev)
+			return -ENOMEM;
+		devlen = ubifs_encode_dev(dev, rdev);
+	}
+
+	err = ubifs_budget_space(c, &req);
+	if (err) {
+		kfree(dev);
+		return err;
+	}
+
+	inode = ubifs_new_inode(c, dir, mode);
+	if (IS_ERR(inode)) {
+		kfree(dev);
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	init_special_inode(inode, inode->i_mode, rdev);
+	inode->i_size = ubifs_inode(inode)->ui_size = devlen;
+	ui = ubifs_inode(inode);
+	ui->data = dev;
+	ui->data_len = devlen;
+
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_inode;
+
+	mutex_lock(&dir_ui->ui_mutex);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	int err, len = strlen(symname);
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.new_ino_d = ALIGN(len, 8),
+					.dirtied_ino = 1 };
+
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory inode.
+	 */
+
+	dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry,
+		symname, dir->i_ino);
+
+	if (len > UBIFS_MAX_INO_DATA)
+		return -ENAMETOOLONG;
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	ui = ubifs_inode(inode);
+	ui->data = kmalloc(len + 1, GFP_NOFS);
+	if (!ui->data) {
+		err = -ENOMEM;
+		goto out_inode;
+	}
+
+	memcpy(ui->data, symname, len);
+	((char *)ui->data)[len] = '\0';
+	/*
+	 * The terminating zero byte is not written to the flash media and it
+	 * is put just to make later in-memory string processing simpler. Thus,
+	 * data length is @len, not @len + %1.
+	 */
+	ui->data_len = len;
+	inode->i_size = ubifs_inode(inode)->ui_size = len;
+
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_inode;
+
+	mutex_lock(&dir_ui->ui_mutex);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * lock_3_inodes - a wrapper for locking three UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ * @inode3: third inode
+ *
+ * This function is used for 'ubifs_rename()' and @inode1 may be the same as
+ * @inode2 whereas @inode3 may be %NULL.
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
+ */
+static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
+			  struct inode *inode3)
+{
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	if (inode2 != inode1)
+		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+	if (inode3)
+		mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
+}
+
+/**
+ * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
+ * @inode1: first inode
+ * @inode2: second inode
+ * @inode3: third inode
+ */
+static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
+			    struct inode *inode3)
+{
+	if (inode3)
+		mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
+	if (inode1 != inode2)
+		mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
+}
+
+static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct ubifs_info *c = old_dir->i_sb->s_fs_info;
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
+	int err, release, sync = 0, move = (new_dir != old_dir);
+	int is_dir = S_ISDIR(old_inode->i_mode);
+	int unlink = !!new_inode;
+	int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
+	int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
+					.dirtied_ino = 3 };
+	struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
+			.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
+	struct timespec time;
+	unsigned int uninitialized_var(saved_nlink);
+
+	/*
+	 * Budget request settings: deletion direntry, new direntry, removing
+	 * the old inode, and changing old and new parent directory inodes.
+	 *
+	 * However, this operation also marks the target inode as dirty and
+	 * does not write it, so we allocate budget for the target inode
+	 * separately.
+	 */
+
+	dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu",
+		old_dentry, old_inode->i_ino, old_dir->i_ino,
+		new_dentry, new_dir->i_ino);
+	ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+	if (unlink)
+		ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
+
+
+	if (unlink && is_dir) {
+		err = check_dir_empty(c, new_inode);
+		if (err)
+			return err;
+	}
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+	err = ubifs_budget_space(c, &ino_req);
+	if (err) {
+		ubifs_release_budget(c, &req);
+		return err;
+	}
+
+	lock_3_inodes(old_dir, new_dir, new_inode);
+
+	/*
+	 * Like most other Unix systems, set the @i_ctime for inodes on a
+	 * rename.
+	 */
+	time = ubifs_current_time(old_dir);
+	old_inode->i_ctime = time;
+
+	/* We must adjust parent link count when renaming directories */
+	if (is_dir) {
+		if (move) {
+			/*
+			 * @old_dir loses a link because we are moving
+			 * @old_inode to a different directory.
+			 */
+			drop_nlink(old_dir);
+			/*
+			 * @new_dir only gains a link if we are not also
+			 * overwriting an existing directory.
+			 */
+			if (!unlink)
+				inc_nlink(new_dir);
+		} else {
+			/*
+			 * @old_inode is not moving to a different directory,
+			 * but @old_dir still loses a link if we are
+			 * overwriting an existing directory.
+			 */
+			if (unlink)
+				drop_nlink(old_dir);
+		}
+	}
+
+	old_dir->i_size -= old_sz;
+	ubifs_inode(old_dir)->ui_size = old_dir->i_size;
+	old_dir->i_mtime = old_dir->i_ctime = time;
+	new_dir->i_mtime = new_dir->i_ctime = time;
+
+	/*
+	 * And finally, if we unlinked a direntry which happened to have the
+	 * same name as the moved direntry, we have to decrement @i_nlink of
+	 * the unlinked inode and change its ctime.
+	 */
+	if (unlink) {
+		/*
+		 * Directories cannot have hard-links, so if this is a
+		 * directory, just clear @i_nlink.
+		 */
+		saved_nlink = new_inode->i_nlink;
+		if (is_dir)
+			clear_nlink(new_inode);
+		else
+			drop_nlink(new_inode);
+		new_inode->i_ctime = time;
+	} else {
+		new_dir->i_size += new_sz;
+		ubifs_inode(new_dir)->ui_size = new_dir->i_size;
+	}
+
+	/*
+	 * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode
+	 * is dirty, because this will be done later on at the end of
+	 * 'ubifs_rename()'.
+	 */
+	if (IS_SYNC(old_inode)) {
+		sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
+		if (unlink && IS_SYNC(new_inode))
+			sync = 1;
+	}
+	err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry,
+			       sync);
+	if (err)
+		goto out_cancel;
+
+	unlock_3_inodes(old_dir, new_dir, new_inode);
+	ubifs_release_budget(c, &req);
+
+	mutex_lock(&old_inode_ui->ui_mutex);
+	release = old_inode_ui->dirty;
+	mark_inode_dirty_sync(old_inode);
+	mutex_unlock(&old_inode_ui->ui_mutex);
+
+	if (release)
+		ubifs_release_budget(c, &ino_req);
+	if (IS_SYNC(old_inode))
+		err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
+	return err;
+
+out_cancel:
+	if (unlink) {
+		set_nlink(new_inode, saved_nlink);
+	} else {
+		new_dir->i_size -= new_sz;
+		ubifs_inode(new_dir)->ui_size = new_dir->i_size;
+	}
+	old_dir->i_size += old_sz;
+	ubifs_inode(old_dir)->ui_size = old_dir->i_size;
+	if (is_dir) {
+		if (move) {
+			inc_nlink(old_dir);
+			if (!unlink)
+				drop_nlink(new_dir);
+		} else {
+			if (unlink)
+				inc_nlink(old_dir);
+		}
+	}
+	unlock_3_inodes(old_dir, new_dir, new_inode);
+	ubifs_release_budget(c, &ino_req);
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat)
+{
+	loff_t size;
+	struct inode *inode = d_inode(dentry);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	mutex_lock(&ui->ui_mutex);
+	generic_fillattr(inode, stat);
+	stat->blksize = UBIFS_BLOCK_SIZE;
+	stat->size = ui->ui_size;
+
+	/*
+	 * Unfortunately, the 'stat()' system call was designed for block
+	 * device based file systems, and it is not appropriate for UBIFS,
+	 * because UBIFS does not have notion of "block". For example, it is
+	 * difficult to tell how many block a directory takes - it actually
+	 * takes less than 300 bytes, but we have to round it to block size,
+	 * which introduces large mistake. This makes utilities like 'du' to
+	 * report completely senseless numbers. This is the reason why UBIFS
+	 * goes the same way as JFFS2 - it reports zero blocks for everything
+	 * but regular files, which makes more sense than reporting completely
+	 * wrong sizes.
+	 */
+	if (S_ISREG(inode->i_mode)) {
+		size = ui->xattr_size;
+		size += stat->size;
+		size = ALIGN(size, UBIFS_BLOCK_SIZE);
+		/*
+		 * Note, user-space expects 512-byte blocks count irrespectively
+		 * of what was reported in @stat->size.
+		 */
+		stat->blocks = size >> 9;
+	} else
+		stat->blocks = 0;
+	mutex_unlock(&ui->ui_mutex);
+	return 0;
+}
+
+const struct inode_operations ubifs_dir_inode_operations = {
+	.lookup      = ubifs_lookup,
+	.create      = ubifs_create,
+	.link        = ubifs_link,
+	.symlink     = ubifs_symlink,
+	.unlink      = ubifs_unlink,
+	.mkdir       = ubifs_mkdir,
+	.rmdir       = ubifs_rmdir,
+	.mknod       = ubifs_mknod,
+	.rename      = ubifs_rename,
+	.setattr     = ubifs_setattr,
+	.getattr     = ubifs_getattr,
+	.setxattr    = ubifs_setxattr,
+	.getxattr    = ubifs_getxattr,
+	.listxattr   = ubifs_listxattr,
+	.removexattr = ubifs_removexattr,
+};
+
+const struct file_operations ubifs_dir_operations = {
+	.llseek         = generic_file_llseek,
+	.release        = ubifs_dir_release,
+	.read           = generic_read_dir,
+	.iterate        = ubifs_readdir,
+	.fsync          = ubifs_fsync,
+	.unlocked_ioctl = ubifs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ubifs_compat_ioctl,
+#endif
+};
diff --git a/kernel/fs/ubifs/file.c b/kernel/fs/ubifs/file.c
new file mode 100644
index 000000000..35efc103c
--- /dev/null
+++ b/kernel/fs/ubifs/file.c
@@ -0,0 +1,1594 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file implements VFS file and inode operations for regular files, device
+ * nodes and symlinks as well as address space operations.
+ *
+ * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if
+ * the page is dirty and is used for optimization purposes - dirty pages are
+ * not budgeted so the flag shows that 'ubifs_write_end()' should not release
+ * the budget for this page. The @PG_checked flag is set if full budgeting is
+ * required for the page e.g., when it corresponds to a file hole or it is
+ * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because
+ * it is OK to fail in this function, and the budget is released in
+ * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry
+ * information about how the page was budgeted, to make it possible to release
+ * the budget properly.
+ *
+ * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
+ * implement. However, this is not true for 'ubifs_writepage()', which may be
+ * called with @i_mutex unlocked. For example, when flusher thread is doing
+ * background write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex.
+ * At "normal" work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g.
+ * in the "sys_write -> alloc_pages -> direct reclaim path". So, in
+ * 'ubifs_writepage()' we are only guaranteed that the page is locked.
+ *
+ * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
+ * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
+ * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not
+ * set as well. However, UBIFS disables readahead.
+ */
+
+#include "ubifs.h"
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+static int read_block(struct inode *inode, void *addr, unsigned int block,
+		      struct ubifs_data_node *dn)
+{
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	int err, len, out_len;
+	union ubifs_key key;
+	unsigned int dlen;
+
+	data_key_init(c, &key, inode->i_ino, block);
+	err = ubifs_tnc_lookup(c, &key, dn);
+	if (err) {
+		if (err == -ENOENT)
+			/* Not found, so it must be a hole */
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
+		return err;
+	}
+
+	ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
+		     ubifs_inode(inode)->creat_sqnum);
+	len = le32_to_cpu(dn->size);
+	if (len <= 0 || len > UBIFS_BLOCK_SIZE)
+		goto dump;
+
+	dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+	out_len = UBIFS_BLOCK_SIZE;
+	err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
+			       le16_to_cpu(dn->compr_type));
+	if (err || len != out_len)
+		goto dump;
+
+	/*
+	 * Data length can be less than a full block, even for blocks that are
+	 * not the last in the file (e.g., as a result of making a hole and
+	 * appending data). Ensure that the remainder is zeroed out.
+	 */
+	if (len < UBIFS_BLOCK_SIZE)
+		memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+
+	return 0;
+
+dump:
+	ubifs_err(c, "bad data node (block %u, inode %lu)",
+		  block, inode->i_ino);
+	ubifs_dump_node(c, dn);
+	return -EINVAL;
+}
+
+static int do_readpage(struct page *page)
+{
+	void *addr;
+	int err = 0, i;
+	unsigned int block, beyond;
+	struct ubifs_data_node *dn;
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+
+	dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
+		inode->i_ino, page->index, i_size, page->flags);
+	ubifs_assert(!PageChecked(page));
+	ubifs_assert(!PagePrivate(page));
+
+	addr = kmap(page);
+
+	block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
+	if (block >= beyond) {
+		/* Reading beyond inode */
+		SetPageChecked(page);
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		goto out;
+	}
+
+	dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS);
+	if (!dn) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	i = 0;
+	while (1) {
+		int ret;
+
+		if (block >= beyond) {
+			/* Reading beyond inode */
+			err = -ENOENT;
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
+		} else {
+			ret = read_block(inode, addr, block, dn);
+			if (ret) {
+				err = ret;
+				if (err != -ENOENT)
+					break;
+			} else if (block + 1 == beyond) {
+				int dlen = le32_to_cpu(dn->size);
+				int ilen = i_size & (UBIFS_BLOCK_SIZE - 1);
+
+				if (ilen && ilen < dlen)
+					memset(addr + ilen, 0, dlen - ilen);
+			}
+		}
+		if (++i >= UBIFS_BLOCKS_PER_PAGE)
+			break;
+		block += 1;
+		addr += UBIFS_BLOCK_SIZE;
+	}
+	if (err) {
+		struct ubifs_info *c = inode->i_sb->s_fs_info;
+		if (err == -ENOENT) {
+			/* Not found, so it must be a hole */
+			SetPageChecked(page);
+			dbg_gen("hole");
+			goto out_free;
+		}
+		ubifs_err(c, "cannot read page %lu of inode %lu, error %d",
+			  page->index, inode->i_ino, err);
+		goto error;
+	}
+
+out_free:
+	kfree(dn);
+out:
+	SetPageUptodate(page);
+	ClearPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	return 0;
+
+error:
+	kfree(dn);
+	ClearPageUptodate(page);
+	SetPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	return err;
+}
+
+/**
+ * release_new_page_budget - release budget of a new page.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which releases budget corresponding to the budget
+ * of one new page of data.
+ */
+static void release_new_page_budget(struct ubifs_info *c)
+{
+	struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 };
+
+	ubifs_release_budget(c, &req);
+}
+
+/**
+ * release_existing_page_budget - release budget of an existing page.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which releases budget corresponding to the budget
+ * of changing one one page of data which already exists on the flash media.
+ */
+static void release_existing_page_budget(struct ubifs_info *c)
+{
+	struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget};
+
+	ubifs_release_budget(c, &req);
+}
+
+static int write_begin_slow(struct address_space *mapping,
+			    loff_t pos, unsigned len, struct page **pagep,
+			    unsigned flags)
+{
+	struct inode *inode = mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct ubifs_budget_req req = { .new_page = 1 };
+	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	struct page *page;
+
+	dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
+		inode->i_ino, pos, len, inode->i_size);
+
+	/*
+	 * At the slow path we have to budget before locking the page, because
+	 * budgeting may force write-back, which would wait on locked pages and
+	 * deadlock if we had the page locked. At this point we do not know
+	 * anything about the page, so assume that this is a new page which is
+	 * written to a hole. This corresponds to largest budget. Later the
+	 * budget will be amended if this is not true.
+	 */
+	if (appending)
+		/* We are appending data, budget for inode change */
+		req.dirtied_ino = 1;
+
+	err = ubifs_budget_space(c, &req);
+	if (unlikely(err))
+		return err;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (unlikely(!page)) {
+		ubifs_release_budget(c, &req);
+		return -ENOMEM;
+	}
+
+	if (!PageUptodate(page)) {
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+			SetPageChecked(page);
+		else {
+			err = do_readpage(page);
+			if (err) {
+				unlock_page(page);
+				page_cache_release(page);
+				ubifs_release_budget(c, &req);
+				return err;
+			}
+		}
+
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+
+	if (PagePrivate(page))
+		/*
+		 * The page is dirty, which means it was budgeted twice:
+		 *   o first time the budget was allocated by the task which
+		 *     made the page dirty and set the PG_private flag;
+		 *   o and then we budgeted for it for the second time at the
+		 *     very beginning of this function.
+		 *
+		 * So what we have to do is to release the page budget we
+		 * allocated.
+		 */
+		release_new_page_budget(c);
+	else if (!PageChecked(page))
+		/*
+		 * We are changing a page which already exists on the media.
+		 * This means that changing the page does not make the amount
+		 * of indexing information larger, and this part of the budget
+		 * which we have already acquired may be released.
+		 */
+		ubifs_convert_page_budget(c);
+
+	if (appending) {
+		struct ubifs_inode *ui = ubifs_inode(inode);
+
+		/*
+		 * 'ubifs_write_end()' is optimized from the fast-path part of
+		 * 'ubifs_write_begin()' and expects the @ui_mutex to be locked
+		 * if data is appended.
+		 */
+		mutex_lock(&ui->ui_mutex);
+		if (ui->dirty)
+			/*
+			 * The inode is dirty already, so we may free the
+			 * budget we allocated.
+			 */
+			ubifs_release_dirty_inode_budget(c, ui);
+	}
+
+	*pagep = page;
+	return 0;
+}
+
+/**
+ * allocate_budget - allocate budget for 'ubifs_write_begin()'.
+ * @c: UBIFS file-system description object
+ * @page: page to allocate budget for
+ * @ui: UBIFS inode object the page belongs to
+ * @appending: non-zero if the page is appended
+ *
+ * This is a helper function for 'ubifs_write_begin()' which allocates budget
+ * for the operation. The budget is allocated differently depending on whether
+ * this is appending, whether the page is dirty or not, and so on. This
+ * function leaves the @ui->ui_mutex locked in case of appending. Returns zero
+ * in case of success and %-ENOSPC in case of failure.
+ */
+static int allocate_budget(struct ubifs_info *c, struct page *page,
+			   struct ubifs_inode *ui, int appending)
+{
+	struct ubifs_budget_req req = { .fast = 1 };
+
+	if (PagePrivate(page)) {
+		if (!appending)
+			/*
+			 * The page is dirty and we are not appending, which
+			 * means no budget is needed at all.
+			 */
+			return 0;
+
+		mutex_lock(&ui->ui_mutex);
+		if (ui->dirty)
+			/*
+			 * The page is dirty and we are appending, so the inode
+			 * has to be marked as dirty. However, it is already
+			 * dirty, so we do not need any budget. We may return,
+			 * but @ui->ui_mutex hast to be left locked because we
+			 * should prevent write-back from flushing the inode
+			 * and freeing the budget. The lock will be released in
+			 * 'ubifs_write_end()'.
+			 */
+			return 0;
+
+		/*
+		 * The page is dirty, we are appending, the inode is clean, so
+		 * we need to budget the inode change.
+		 */
+		req.dirtied_ino = 1;
+	} else {
+		if (PageChecked(page))
+			/*
+			 * The page corresponds to a hole and does not
+			 * exist on the media. So changing it makes
+			 * make the amount of indexing information
+			 * larger, and we have to budget for a new
+			 * page.
+			 */
+			req.new_page = 1;
+		else
+			/*
+			 * Not a hole, the change will not add any new
+			 * indexing information, budget for page
+			 * change.
+			 */
+			req.dirtied_page = 1;
+
+		if (appending) {
+			mutex_lock(&ui->ui_mutex);
+			if (!ui->dirty)
+				/*
+				 * The inode is clean but we will have to mark
+				 * it as dirty because we are appending. This
+				 * needs a budget.
+				 */
+				req.dirtied_ino = 1;
+		}
+	}
+
+	return ubifs_budget_space(c, &req);
+}
+
+/*
+ * This function is called when a page of data is going to be written. Since
+ * the page of data will not necessarily go to the flash straight away, UBIFS
+ * has to reserve space on the media for it, which is done by means of
+ * budgeting.
+ *
+ * This is the hot-path of the file-system and we are trying to optimize it as
+ * much as possible. For this reasons it is split on 2 parts - slow and fast.
+ *
+ * There many budgeting cases:
+ *     o a new page is appended - we have to budget for a new page and for
+ *       changing the inode; however, if the inode is already dirty, there is
+ *       no need to budget for it;
+ *     o an existing clean page is changed - we have budget for it; if the page
+ *       does not exist on the media (a hole), we have to budget for a new
+ *       page; otherwise, we may budget for changing an existing page; the
+ *       difference between these cases is that changing an existing page does
+ *       not introduce anything new to the FS indexing information, so it does
+ *       not grow, and smaller budget is acquired in this case;
+ *     o an existing dirty page is changed - no need to budget at all, because
+ *       the page budget has been acquired by earlier, when the page has been
+ *       marked dirty.
+ *
+ * UBIFS budgeting sub-system may force write-back if it thinks there is no
+ * space to reserve. This imposes some locking restrictions and makes it
+ * impossible to take into account the above cases, and makes it impossible to
+ * optimize budgeting.
+ *
+ * The solution for this is that the fast path of 'ubifs_write_begin()' assumes
+ * there is a plenty of flash space and the budget will be acquired quickly,
+ * without forcing write-back. The slow path does not make this assumption.
+ */
+static int ubifs_write_begin(struct file *file, struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	int skipped_read = 0;
+	struct page *page;
+
+	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+
+	if (unlikely(c->ro_error))
+		return -EROFS;
+
+	/* Try out the fast-path part first */
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	if (!PageUptodate(page)) {
+		/* The page is not loaded from the flash */
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
+			/*
+			 * We change whole page so no need to load it. But we
+			 * do not know whether this page exists on the media or
+			 * not, so we assume the latter because it requires
+			 * larger budget. The assumption is that it is better
+			 * to budget a bit more than to read the page from the
+			 * media. Thus, we are setting the @PG_checked flag
+			 * here.
+			 */
+			SetPageChecked(page);
+			skipped_read = 1;
+		} else {
+			err = do_readpage(page);
+			if (err) {
+				unlock_page(page);
+				page_cache_release(page);
+				return err;
+			}
+		}
+
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+
+	err = allocate_budget(c, page, ui, appending);
+	if (unlikely(err)) {
+		ubifs_assert(err == -ENOSPC);
+		/*
+		 * If we skipped reading the page because we were going to
+		 * write all of it, then it is not up to date.
+		 */
+		if (skipped_read) {
+			ClearPageChecked(page);
+			ClearPageUptodate(page);
+		}
+		/*
+		 * Budgeting failed which means it would have to force
+		 * write-back but didn't, because we set the @fast flag in the
+		 * request. Write-back cannot be done now, while we have the
+		 * page locked, because it would deadlock. Unlock and free
+		 * everything and fall-back to slow-path.
+		 */
+		if (appending) {
+			ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+			mutex_unlock(&ui->ui_mutex);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+
+		return write_begin_slow(mapping, pos, len, pagep, flags);
+	}
+
+	/*
+	 * Whee, we acquired budgeting quickly - without involving
+	 * garbage-collection, committing or forcing write-back. We return
+	 * with @ui->ui_mutex locked if we are appending pages, and unlocked
+	 * otherwise. This is an optimization (slightly hacky though).
+	 */
+	*pagep = page;
+	return 0;
+
+}
+
+/**
+ * cancel_budget - cancel budget.
+ * @c: UBIFS file-system description object
+ * @page: page to cancel budget for
+ * @ui: UBIFS inode object the page belongs to
+ * @appending: non-zero if the page is appended
+ *
+ * This is a helper function for a page write operation. It unlocks the
+ * @ui->ui_mutex in case of appending.
+ */
+static void cancel_budget(struct ubifs_info *c, struct page *page,
+			  struct ubifs_inode *ui, int appending)
+{
+	if (appending) {
+		if (!ui->dirty)
+			ubifs_release_dirty_inode_budget(c, ui);
+		mutex_unlock(&ui->ui_mutex);
+	}
+	if (!PagePrivate(page)) {
+		if (PageChecked(page))
+			release_new_page_budget(c);
+		else
+			release_existing_page_budget(c);
+	}
+}
+
+static int ubifs_write_end(struct file *file, struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	loff_t end_pos = pos + len;
+	int appending = !!(end_pos > inode->i_size);
+
+	dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
+		inode->i_ino, pos, page->index, len, copied, inode->i_size);
+
+	if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) {
+		/*
+		 * VFS copied less data to the page that it intended and
+		 * declared in its '->write_begin()' call via the @len
+		 * argument. If the page was not up-to-date, and @len was
+		 * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did
+		 * not load it from the media (for optimization reasons). This
+		 * means that part of the page contains garbage. So read the
+		 * page now.
+		 */
+		dbg_gen("copied %d instead of %d, read page and repeat",
+			copied, len);
+		cancel_budget(c, page, ui, appending);
+		ClearPageChecked(page);
+
+		/*
+		 * Return 0 to force VFS to repeat the whole operation, or the
+		 * error code if 'do_readpage()' fails.
+		 */
+		copied = do_readpage(page);
+		goto out;
+	}
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		atomic_long_inc(&c->dirty_pg_cnt);
+		__set_page_dirty_nobuffers(page);
+	}
+
+	if (appending) {
+		i_size_write(inode, end_pos);
+		ui->ui_size = end_pos;
+		/*
+		 * Note, we do not set @I_DIRTY_PAGES (which means that the
+		 * inode has dirty pages), this has been done in
+		 * '__set_page_dirty_nobuffers()'.
+		 */
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+		mutex_unlock(&ui->ui_mutex);
+	}
+
+out:
+	unlock_page(page);
+	page_cache_release(page);
+	return copied;
+}
+
+/**
+ * populate_page - copy data nodes into a page for bulk-read.
+ * @c: UBIFS file-system description object
+ * @page: page
+ * @bu: bulk-read information
+ * @n: next zbranch slot
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int populate_page(struct ubifs_info *c, struct page *page,
+			 struct bu_info *bu, int *n)
+{
+	int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0;
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	unsigned int page_block;
+	void *addr, *zaddr;
+	pgoff_t end_index;
+
+	dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
+		inode->i_ino, page->index, i_size, page->flags);
+
+	addr = zaddr = kmap(page);
+
+	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+	if (!i_size || page->index > end_index) {
+		hole = 1;
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		goto out_hole;
+	}
+
+	page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	while (1) {
+		int err, len, out_len, dlen;
+
+		if (nn >= bu->cnt) {
+			hole = 1;
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
+		} else if (key_block(c, &bu->zbranch[nn].key) == page_block) {
+			struct ubifs_data_node *dn;
+
+			dn = bu->buf + (bu->zbranch[nn].offs - offs);
+
+			ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
+				     ubifs_inode(inode)->creat_sqnum);
+
+			len = le32_to_cpu(dn->size);
+			if (len <= 0 || len > UBIFS_BLOCK_SIZE)
+				goto out_err;
+
+			dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+			out_len = UBIFS_BLOCK_SIZE;
+			err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
+					       le16_to_cpu(dn->compr_type));
+			if (err || len != out_len)
+				goto out_err;
+
+			if (len < UBIFS_BLOCK_SIZE)
+				memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+
+			nn += 1;
+			read = (i << UBIFS_BLOCK_SHIFT) + len;
+		} else if (key_block(c, &bu->zbranch[nn].key) < page_block) {
+			nn += 1;
+			continue;
+		} else {
+			hole = 1;
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
+		}
+		if (++i >= UBIFS_BLOCKS_PER_PAGE)
+			break;
+		addr += UBIFS_BLOCK_SIZE;
+		page_block += 1;
+	}
+
+	if (end_index == page->index) {
+		int len = i_size & (PAGE_CACHE_SIZE - 1);
+
+		if (len && len < read)
+			memset(zaddr + len, 0, read - len);
+	}
+
+out_hole:
+	if (hole) {
+		SetPageChecked(page);
+		dbg_gen("hole");
+	}
+
+	SetPageUptodate(page);
+	ClearPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	*n = nn;
+	return 0;
+
+out_err:
+	ClearPageUptodate(page);
+	SetPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	ubifs_err(c, "bad data node (block %u, inode %lu)",
+		  page_block, inode->i_ino);
+	return -EINVAL;
+}
+
+/**
+ * ubifs_do_bulk_read - do bulk-read.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read information
+ * @page1: first page to read
+ *
+ * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
+ */
+static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
+			      struct page *page1)
+{
+	pgoff_t offset = page1->index, end_index;
+	struct address_space *mapping = page1->mapping;
+	struct inode *inode = mapping->host;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	int err, page_idx, page_cnt, ret = 0, n = 0;
+	int allocate = bu->buf ? 0 : 1;
+	loff_t isize;
+
+	err = ubifs_tnc_get_bu_keys(c, bu);
+	if (err)
+		goto out_warn;
+
+	if (bu->eof) {
+		/* Turn off bulk-read at the end of the file */
+		ui->read_in_a_row = 1;
+		ui->bulk_read = 0;
+	}
+
+	page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	if (!page_cnt) {
+		/*
+		 * This happens when there are multiple blocks per page and the
+		 * blocks for the first page we are looking for, are not
+		 * together. If all the pages were like this, bulk-read would
+		 * reduce performance, so we turn it off for a while.
+		 */
+		goto out_bu_off;
+	}
+
+	if (bu->cnt) {
+		if (allocate) {
+			/*
+			 * Allocate bulk-read buffer depending on how many data
+			 * nodes we are going to read.
+			 */
+			bu->buf_len = bu->zbranch[bu->cnt - 1].offs +
+				      bu->zbranch[bu->cnt - 1].len -
+				      bu->zbranch[0].offs;
+			ubifs_assert(bu->buf_len > 0);
+			ubifs_assert(bu->buf_len <= c->leb_size);
+			bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN);
+			if (!bu->buf)
+				goto out_bu_off;
+		}
+
+		err = ubifs_tnc_bulk_read(c, bu);
+		if (err)
+			goto out_warn;
+	}
+
+	err = populate_page(c, page1, bu, &n);
+	if (err)
+		goto out_warn;
+
+	unlock_page(page1);
+	ret = 1;
+
+	isize = i_size_read(inode);
+	if (isize == 0)
+		goto out_free;
+	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+	for (page_idx = 1; page_idx < page_cnt; page_idx++) {
+		pgoff_t page_offset = offset + page_idx;
+		struct page *page;
+
+		if (page_offset > end_index)
+			break;
+		page = find_or_create_page(mapping, page_offset,
+					   GFP_NOFS | __GFP_COLD);
+		if (!page)
+			break;
+		if (!PageUptodate(page))
+			err = populate_page(c, page, bu, &n);
+		unlock_page(page);
+		page_cache_release(page);
+		if (err)
+			break;
+	}
+
+	ui->last_page_read = offset + page_idx - 1;
+
+out_free:
+	if (allocate)
+		kfree(bu->buf);
+	return ret;
+
+out_warn:
+	ubifs_warn(c, "ignoring error %d and skipping bulk-read", err);
+	goto out_free;
+
+out_bu_off:
+	ui->read_in_a_row = ui->bulk_read = 0;
+	goto out_free;
+}
+
+/**
+ * ubifs_bulk_read - determine whether to bulk-read and, if so, do it.
+ * @page: page from which to start bulk-read.
+ *
+ * Some flash media are capable of reading sequentially at faster rates. UBIFS
+ * bulk-read facility is designed to take advantage of that, by reading in one
+ * go consecutive data nodes that are also located consecutively in the same
+ * LEB. This function returns %1 if a bulk-read is done and %0 otherwise.
+ */
+static int ubifs_bulk_read(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	pgoff_t index = page->index, last_page_read = ui->last_page_read;
+	struct bu_info *bu;
+	int err = 0, allocated = 0;
+
+	ui->last_page_read = index;
+	if (!c->bulk_read)
+		return 0;
+
+	/*
+	 * Bulk-read is protected by @ui->ui_mutex, but it is an optimization,
+	 * so don't bother if we cannot lock the mutex.
+	 */
+	if (!mutex_trylock(&ui->ui_mutex))
+		return 0;
+
+	if (index != last_page_read + 1) {
+		/* Turn off bulk-read if we stop reading sequentially */
+		ui->read_in_a_row = 1;
+		if (ui->bulk_read)
+			ui->bulk_read = 0;
+		goto out_unlock;
+	}
+
+	if (!ui->bulk_read) {
+		ui->read_in_a_row += 1;
+		if (ui->read_in_a_row < 3)
+			goto out_unlock;
+		/* Three reads in a row, so switch on bulk-read */
+		ui->bulk_read = 1;
+	}
+
+	/*
+	 * If possible, try to use pre-allocated bulk-read information, which
+	 * is protected by @c->bu_mutex.
+	 */
+	if (mutex_trylock(&c->bu_mutex))
+		bu = &c->bu;
+	else {
+		bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
+		if (!bu)
+			goto out_unlock;
+
+		bu->buf = NULL;
+		allocated = 1;
+	}
+
+	bu->buf_len = c->max_bu_buf_len;
+	data_key_init(c, &bu->key, inode->i_ino,
+		      page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+	err = ubifs_do_bulk_read(c, bu, page);
+
+	if (!allocated)
+		mutex_unlock(&c->bu_mutex);
+	else
+		kfree(bu);
+
+out_unlock:
+	mutex_unlock(&ui->ui_mutex);
+	return err;
+}
+
+static int ubifs_readpage(struct file *file, struct page *page)
+{
+	if (ubifs_bulk_read(page))
+		return 0;
+	do_readpage(page);
+	unlock_page(page);
+	return 0;
+}
+
+static int do_writepage(struct page *page, int len)
+{
+	int err = 0, i, blen;
+	unsigned int block;
+	void *addr;
+	union ubifs_key key;
+	struct inode *inode = page->mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+#ifdef UBIFS_DEBUG
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	spin_lock(&ui->ui_lock);
+	ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
+	spin_unlock(&ui->ui_lock);
+#endif
+
+	/* Update radix tree tags */
+	set_page_writeback(page);
+
+	addr = kmap(page);
+	block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	i = 0;
+	while (len) {
+		blen = min_t(int, len, UBIFS_BLOCK_SIZE);
+		data_key_init(c, &key, inode->i_ino, block);
+		err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
+		if (err)
+			break;
+		if (++i >= UBIFS_BLOCKS_PER_PAGE)
+			break;
+		block += 1;
+		addr += blen;
+		len -= blen;
+	}
+	if (err) {
+		SetPageError(page);
+		ubifs_err(c, "cannot write page %lu of inode %lu, error %d",
+			  page->index, inode->i_ino, err);
+		ubifs_ro_mode(c, err);
+	}
+
+	ubifs_assert(PagePrivate(page));
+	if (PageChecked(page))
+		release_new_page_budget(c);
+	else
+		release_existing_page_budget(c);
+
+	atomic_long_dec(&c->dirty_pg_cnt);
+	ClearPagePrivate(page);
+	ClearPageChecked(page);
+
+	kunmap(page);
+	unlock_page(page);
+	end_page_writeback(page);
+	return err;
+}
+
+/*
+ * When writing-back dirty inodes, VFS first writes-back pages belonging to the
+ * inode, then the inode itself. For UBIFS this may cause a problem. Consider a
+ * situation when a we have an inode with size 0, then a megabyte of data is
+ * appended to the inode, then write-back starts and flushes some amount of the
+ * dirty pages, the journal becomes full, commit happens and finishes, and then
+ * an unclean reboot happens. When the file system is mounted next time, the
+ * inode size would still be 0, but there would be many pages which are beyond
+ * the inode size, they would be indexed and consume flash space. Because the
+ * journal has been committed, the replay would not be able to detect this
+ * situation and correct the inode size. This means UBIFS would have to scan
+ * whole index and correct all inode sizes, which is long an unacceptable.
+ *
+ * To prevent situations like this, UBIFS writes pages back only if they are
+ * within the last synchronized inode size, i.e. the size which has been
+ * written to the flash media last time. Otherwise, UBIFS forces inode
+ * write-back, thus making sure the on-flash inode contains current inode size,
+ * and then keeps writing pages back.
+ *
+ * Some locking issues explanation. 'ubifs_writepage()' first is called with
+ * the page locked, and it locks @ui_mutex. However, write-back does take inode
+ * @i_mutex, which means other VFS operations may be run on this inode at the
+ * same time. And the problematic one is truncation to smaller size, from where
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size,
+ * then drops the truncated pages. And while dropping the pages, it takes the
+ * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()'
+ * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'.
+ * This means that @inode->i_size is changed while @ui_mutex is unlocked.
+ *
+ * XXX(truncate): with the new truncate sequence this is not true anymore,
+ * and the calls to truncate_setsize can be move around freely.  They should
+ * be moved to the very end of the truncate sequence.
+ *
+ * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
+ * inode size. How do we do this if @inode->i_size may became smaller while we
+ * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
+ * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size
+ * internally and updates it under @ui_mutex.
+ *
+ * Q: why we do not worry that if we race with truncation, we may end up with a
+ * situation when the inode is truncated while we are in the middle of
+ * 'do_writepage()', so we do write beyond inode size?
+ * A: If we are in the middle of 'do_writepage()', truncation would be locked
+ * on the page lock and it would not write the truncated inode node to the
+ * journal before we have finished.
+ */
+static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	loff_t i_size =  i_size_read(inode), synced_i_size;
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	int err, len = i_size & (PAGE_CACHE_SIZE - 1);
+	void *kaddr;
+
+	dbg_gen("ino %lu, pg %lu, pg flags %#lx",
+		inode->i_ino, page->index, page->flags);
+	ubifs_assert(PagePrivate(page));
+
+	/* Is the page fully outside @i_size? (truncate in progress) */
+	if (page->index > end_index || (page->index == end_index && !len)) {
+		err = 0;
+		goto out_unlock;
+	}
+
+	spin_lock(&ui->ui_lock);
+	synced_i_size = ui->synced_i_size;
+	spin_unlock(&ui->ui_lock);
+
+	/* Is the page fully inside @i_size? */
+	if (page->index < end_index) {
+		if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
+			err = inode->i_sb->s_op->write_inode(inode, NULL);
+			if (err)
+				goto out_unlock;
+			/*
+			 * The inode has been written, but the write-buffer has
+			 * not been synchronized, so in case of an unclean
+			 * reboot we may end up with some pages beyond inode
+			 * size, but they would be in the journal (because
+			 * commit flushes write buffers) and recovery would deal
+			 * with this.
+			 */
+		}
+		return do_writepage(page, PAGE_CACHE_SIZE);
+	}
+
+	/*
+	 * The page straddles @i_size. It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped. "A file is mapped
+	 * in multiples of the page size. For a file that is not a multiple of
+	 * the page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	kaddr = kmap_atomic(page);
+	memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+
+	if (i_size > synced_i_size) {
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
+		if (err)
+			goto out_unlock;
+	}
+
+	return do_writepage(page, len);
+
+out_unlock:
+	unlock_page(page);
+	return err;
+}
+
+/**
+ * do_attr_changes - change inode attributes.
+ * @inode: inode to change attributes for
+ * @attr: describes attributes to change
+ */
+static void do_attr_changes(struct inode *inode, const struct iattr *attr)
+{
+	if (attr->ia_valid & ATTR_UID)
+		inode->i_uid = attr->ia_uid;
+	if (attr->ia_valid & ATTR_GID)
+		inode->i_gid = attr->ia_gid;
+	if (attr->ia_valid & ATTR_ATIME)
+		inode->i_atime = timespec_trunc(attr->ia_atime,
+						inode->i_sb->s_time_gran);
+	if (attr->ia_valid & ATTR_MTIME)
+		inode->i_mtime = timespec_trunc(attr->ia_mtime,
+						inode->i_sb->s_time_gran);
+	if (attr->ia_valid & ATTR_CTIME)
+		inode->i_ctime = timespec_trunc(attr->ia_ctime,
+						inode->i_sb->s_time_gran);
+	if (attr->ia_valid & ATTR_MODE) {
+		umode_t mode = attr->ia_mode;
+
+		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+			mode &= ~S_ISGID;
+		inode->i_mode = mode;
+	}
+}
+
+/**
+ * do_truncation - truncate an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to truncate
+ * @attr: inode attribute changes description
+ *
+ * This function implements VFS '->setattr()' call when the inode is truncated
+ * to a smaller size. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int do_truncation(struct ubifs_info *c, struct inode *inode,
+			 const struct iattr *attr)
+{
+	int err;
+	struct ubifs_budget_req req;
+	loff_t old_size = inode->i_size, new_size = attr->ia_size;
+	int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
+	memset(&req, 0, sizeof(struct ubifs_budget_req));
+
+	/*
+	 * If this is truncation to a smaller size, and we do not truncate on a
+	 * block boundary, budget for changing one data block, because the last
+	 * block will be re-written.
+	 */
+	if (new_size & (UBIFS_BLOCK_SIZE - 1))
+		req.dirtied_page = 1;
+
+	req.dirtied_ino = 1;
+	/* A funny way to budget for truncation node */
+	req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
+	err = ubifs_budget_space(c, &req);
+	if (err) {
+		/*
+		 * Treat truncations to zero as deletion and always allow them,
+		 * just like we do for '->unlink()'.
+		 */
+		if (new_size || err != -ENOSPC)
+			return err;
+		budgeted = 0;
+	}
+
+	truncate_setsize(inode, new_size);
+
+	if (offset) {
+		pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
+		struct page *page;
+
+		page = find_lock_page(inode->i_mapping, index);
+		if (page) {
+			if (PageDirty(page)) {
+				/*
+				 * 'ubifs_jnl_truncate()' will try to truncate
+				 * the last data node, but it contains
+				 * out-of-date data because the page is dirty.
+				 * Write the page now, so that
+				 * 'ubifs_jnl_truncate()' will see an already
+				 * truncated (and up to date) data node.
+				 */
+				ubifs_assert(PagePrivate(page));
+
+				clear_page_dirty_for_io(page);
+				if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
+					offset = new_size &
+						 (PAGE_CACHE_SIZE - 1);
+				err = do_writepage(page, offset);
+				page_cache_release(page);
+				if (err)
+					goto out_budg;
+				/*
+				 * We could now tell 'ubifs_jnl_truncate()' not
+				 * to read the last block.
+				 */
+			} else {
+				/*
+				 * We could 'kmap()' the page and pass the data
+				 * to 'ubifs_jnl_truncate()' to save it from
+				 * having to read it.
+				 */
+				unlock_page(page);
+				page_cache_release(page);
+			}
+		}
+	}
+
+	mutex_lock(&ui->ui_mutex);
+	ui->ui_size = inode->i_size;
+	/* Truncation changes inode [mc]time */
+	inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+	/* Other attributes may be changed at the same time as well */
+	do_attr_changes(inode, attr);
+	err = ubifs_jnl_truncate(c, inode, old_size, new_size);
+	mutex_unlock(&ui->ui_mutex);
+
+out_budg:
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	else {
+		c->bi.nospace = c->bi.nospace_rp = 0;
+		smp_wmb();
+	}
+	return err;
+}
+
+/**
+ * do_setattr - change inode attributes.
+ * @c: UBIFS file-system description object
+ * @inode: inode to change attributes for
+ * @attr: inode attribute changes description
+ *
+ * This function implements VFS '->setattr()' call for all cases except
+ * truncations to smaller size. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+static int do_setattr(struct ubifs_info *c, struct inode *inode,
+		      const struct iattr *attr)
+{
+	int err, release;
+	loff_t new_size = attr->ia_size;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_budget_req req = { .dirtied_ino = 1,
+				.dirtied_ino_d = ALIGN(ui->data_len, 8) };
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		dbg_gen("size %lld -> %lld", inode->i_size, new_size);
+		truncate_setsize(inode, new_size);
+	}
+
+	mutex_lock(&ui->ui_mutex);
+	if (attr->ia_valid & ATTR_SIZE) {
+		/* Truncation changes inode [mc]time */
+		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+		/* 'truncate_setsize()' changed @i_size, update @ui_size */
+		ui->ui_size = inode->i_size;
+	}
+
+	do_attr_changes(inode, attr);
+
+	release = ui->dirty;
+	if (attr->ia_valid & ATTR_SIZE)
+		/*
+		 * Inode length changed, so we have to make sure
+		 * @I_DIRTY_DATASYNC is set.
+		 */
+		 __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	else
+		mark_inode_dirty_sync(inode);
+	mutex_unlock(&ui->ui_mutex);
+
+	if (release)
+		ubifs_release_budget(c, &req);
+	if (IS_SYNC(inode))
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
+	return err;
+}
+
+int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int err;
+	struct inode *inode = d_inode(dentry);
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+	dbg_gen("ino %lu, mode %#x, ia_valid %#x",
+		inode->i_ino, inode->i_mode, attr->ia_valid);
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	err = dbg_check_synced_i_size(c, inode);
+	if (err)
+		return err;
+
+	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size)
+		/* Truncation to a smaller size */
+		err = do_truncation(c, inode, attr);
+	else
+		err = do_setattr(c, inode, attr);
+
+	return err;
+}
+
+static void ubifs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
+{
+	struct inode *inode = page->mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+	ubifs_assert(PagePrivate(page));
+	if (offset || length < PAGE_CACHE_SIZE)
+		/* Partial page remains dirty */
+		return;
+
+	if (PageChecked(page))
+		release_new_page_budget(c);
+	else
+		release_existing_page_budget(c);
+
+	atomic_long_dec(&c->dirty_pg_cnt);
+	ClearPagePrivate(page);
+	ClearPageChecked(page);
+}
+
+static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ubifs_inode *ui = ubifs_inode(d_inode(dentry));
+
+	nd_set_link(nd, ui->data);
+	return NULL;
+}
+
+int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	int err;
+
+	dbg_gen("syncing inode %lu", inode->i_ino);
+
+	if (c->ro_mount)
+		/*
+		 * For some really strange reasons VFS does not filter out
+		 * 'fsync()' for R/O mounted file-systems as per 2.6.39.
+		 */
+		return 0;
+
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&inode->i_mutex);
+
+	/* Synchronize the inode unless this is a 'datasync()' call. */
+	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
+		if (err)
+			goto out;
+	}
+
+	/*
+	 * Nodes related to this inode may still sit in a write-buffer. Flush
+	 * them.
+	 */
+	err = ubifs_sync_wbufs_by_inode(c, inode);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return err;
+}
+
+/**
+ * mctime_update_needed - check if mtime or ctime update is needed.
+ * @inode: the inode to do the check for
+ * @now: current time
+ *
+ * This helper function checks if the inode mtime/ctime should be updated or
+ * not. If current values of the time-stamps are within the UBIFS inode time
+ * granularity, they are not updated. This is an optimization.
+ */
+static inline int mctime_update_needed(const struct inode *inode,
+				       const struct timespec *now)
+{
+	if (!timespec_equal(&inode->i_mtime, now) ||
+	    !timespec_equal(&inode->i_ctime, now))
+		return 1;
+	return 0;
+}
+
+/**
+ * update_ctime - update mtime and ctime of an inode.
+ * @inode: inode to update
+ *
+ * This function updates mtime and ctime of the inode if it is not equivalent to
+ * current time. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int update_mctime(struct inode *inode)
+{
+	struct timespec now = ubifs_current_time(inode);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+	if (mctime_update_needed(inode, &now)) {
+		int err, release;
+		struct ubifs_budget_req req = { .dirtied_ino = 1,
+				.dirtied_ino_d = ALIGN(ui->data_len, 8) };
+
+		err = ubifs_budget_space(c, &req);
+		if (err)
+			return err;
+
+		mutex_lock(&ui->ui_mutex);
+		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+		release = ui->dirty;
+		mark_inode_dirty_sync(inode);
+		mutex_unlock(&ui->ui_mutex);
+		if (release)
+			ubifs_release_budget(c, &req);
+	}
+
+	return 0;
+}
+
+static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	int err = update_mctime(file_inode(iocb->ki_filp));
+	if (err)
+		return err;
+
+	return generic_file_write_iter(iocb, from);
+}
+
+static int ubifs_set_page_dirty(struct page *page)
+{
+	int ret;
+
+	ret = __set_page_dirty_nobuffers(page);
+	/*
+	 * An attempt to dirty a page without budgeting for it - should not
+	 * happen.
+	 */
+	ubifs_assert(ret == 0);
+	return ret;
+}
+
+static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+{
+	/*
+	 * An attempt to release a dirty page without budgeting for it - should
+	 * not happen.
+	 */
+	if (PageWriteback(page))
+		return 0;
+	ubifs_assert(PagePrivate(page));
+	ubifs_assert(0);
+	ClearPagePrivate(page);
+	ClearPageChecked(page);
+	return 1;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable.
+ * UBIFS must ensure page is budgeted for.
+ */
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
+				 struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct timespec now = ubifs_current_time(inode);
+	struct ubifs_budget_req req = { .new_page = 1 };
+	int err, update_time;
+
+	dbg_gen("ino %lu, pg %lu, i_size %lld",	inode->i_ino, page->index,
+		i_size_read(inode));
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+
+	if (unlikely(c->ro_error))
+		return VM_FAULT_SIGBUS; /* -EROFS */
+
+	/*
+	 * We have not locked @page so far so we may budget for changing the
+	 * page. Note, we cannot do this after we locked the page, because
+	 * budgeting may cause write-back which would cause deadlock.
+	 *
+	 * At the moment we do not know whether the page is dirty or not, so we
+	 * assume that it is not and budget for a new page. We could look at
+	 * the @PG_private flag and figure this out, but we may race with write
+	 * back and the page state may change by the time we lock it, so this
+	 * would need additional care. We do not bother with this at the
+	 * moment, although it might be good idea to do. Instead, we allocate
+	 * budget for a new page and amend it later on if the page was in fact
+	 * dirty.
+	 *
+	 * The budgeting-related logic of this function is similar to what we
+	 * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there
+	 * for more comments.
+	 */
+	update_time = mctime_update_needed(inode, &now);
+	if (update_time)
+		/*
+		 * We have to change inode time stamp which requires extra
+		 * budgeting.
+		 */
+		req.dirtied_ino = 1;
+
+	err = ubifs_budget_space(c, &req);
+	if (unlikely(err)) {
+		if (err == -ENOSPC)
+			ubifs_warn(c, "out of space for mmapped file (inode number %lu)",
+				   inode->i_ino);
+		return VM_FAULT_SIGBUS;
+	}
+
+	lock_page(page);
+	if (unlikely(page->mapping != inode->i_mapping ||
+		     page_offset(page) > i_size_read(inode))) {
+		/* Page got truncated out from underneath us */
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (PagePrivate(page))
+		release_new_page_budget(c);
+	else {
+		if (!PageChecked(page))
+			ubifs_convert_page_budget(c);
+		SetPagePrivate(page);
+		atomic_long_inc(&c->dirty_pg_cnt);
+		__set_page_dirty_nobuffers(page);
+	}
+
+	if (update_time) {
+		int release;
+		struct ubifs_inode *ui = ubifs_inode(inode);
+
+		mutex_lock(&ui->ui_mutex);
+		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+		release = ui->dirty;
+		mark_inode_dirty_sync(inode);
+		mutex_unlock(&ui->ui_mutex);
+		if (release)
+			ubifs_release_dirty_inode_budget(c, ui);
+	}
+
+	wait_for_stable_page(page);
+	return VM_FAULT_LOCKED;
+
+out_unlock:
+	unlock_page(page);
+	ubifs_release_budget(c, &req);
+	if (err)
+		err = VM_FAULT_SIGBUS;
+	return err;
+}
+
+static const struct vm_operations_struct ubifs_file_vm_ops = {
+	.fault        = filemap_fault,
+	.map_pages = filemap_map_pages,
+	.page_mkwrite = ubifs_vm_page_mkwrite,
+};
+
+static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int err;
+
+	err = generic_file_mmap(file, vma);
+	if (err)
+		return err;
+	vma->vm_ops = &ubifs_file_vm_ops;
+	return 0;
+}
+
+const struct address_space_operations ubifs_file_address_operations = {
+	.readpage       = ubifs_readpage,
+	.writepage      = ubifs_writepage,
+	.write_begin    = ubifs_write_begin,
+	.write_end      = ubifs_write_end,
+	.invalidatepage = ubifs_invalidatepage,
+	.set_page_dirty = ubifs_set_page_dirty,
+	.releasepage    = ubifs_releasepage,
+};
+
+const struct inode_operations ubifs_file_inode_operations = {
+	.setattr     = ubifs_setattr,
+	.getattr     = ubifs_getattr,
+	.setxattr    = ubifs_setxattr,
+	.getxattr    = ubifs_getxattr,
+	.listxattr   = ubifs_listxattr,
+	.removexattr = ubifs_removexattr,
+};
+
+const struct inode_operations ubifs_symlink_inode_operations = {
+	.readlink    = generic_readlink,
+	.follow_link = ubifs_follow_link,
+	.setattr     = ubifs_setattr,
+	.getattr     = ubifs_getattr,
+	.setxattr    = ubifs_setxattr,
+	.getxattr    = ubifs_getxattr,
+	.listxattr   = ubifs_listxattr,
+	.removexattr = ubifs_removexattr,
+};
+
+const struct file_operations ubifs_file_operations = {
+	.llseek         = generic_file_llseek,
+	.read_iter      = generic_file_read_iter,
+	.write_iter     = ubifs_write_iter,
+	.mmap           = ubifs_file_mmap,
+	.fsync          = ubifs_fsync,
+	.unlocked_ioctl = ubifs_ioctl,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ubifs_compat_ioctl,
+#endif
+};
diff --git a/kernel/fs/ubifs/find.c b/kernel/fs/ubifs/find.c
new file mode 100644
index 000000000..2dcf3d473
--- /dev/null
+++ b/kernel/fs/ubifs/find.c
@@ -0,0 +1,985 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file contains functions for finding LEBs for various purposes e.g.
+ * garbage collection. In general, lprops category heaps and lists are used
+ * for fast access, falling back on scanning the LPT as a last resort.
+ */
+
+#include <linux/sort.h>
+#include "ubifs.h"
+
+/**
+ * struct scan_data - data provided to scan callback functions
+ * @min_space: minimum number of bytes for which to scan
+ * @pick_free: whether it is OK to scan for empty LEBs
+ * @lnum: LEB number found is returned here
+ * @exclude_index: whether to exclude index LEBs
+ */
+struct scan_data {
+	int min_space;
+	int pick_free;
+	int lnum;
+	int exclude_index;
+};
+
+/**
+ * valuable - determine whether LEB properties are valuable.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties
+ *
+ * This function return %1 if the LEB properties should be added to the LEB
+ * properties tree in memory. Otherwise %0 is returned.
+ */
+static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
+{
+	int n, cat = lprops->flags & LPROPS_CAT_MASK;
+	struct ubifs_lpt_heap *heap;
+
+	switch (cat) {
+	case LPROPS_DIRTY:
+	case LPROPS_DIRTY_IDX:
+	case LPROPS_FREE:
+		heap = &c->lpt_heap[cat - 1];
+		if (heap->cnt < heap->max_cnt)
+			return 1;
+		if (lprops->free + lprops->dirty >= c->dark_wm)
+			return 1;
+		return 0;
+	case LPROPS_EMPTY:
+		n = c->lst.empty_lebs + c->freeable_cnt -
+		    c->lst.taken_empty_lebs;
+		if (n < c->lsave_cnt)
+			return 1;
+		return 0;
+	case LPROPS_FREEABLE:
+		return 1;
+	case LPROPS_FRDI_IDX:
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * scan_for_dirty_cb - dirty space scan callback.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_for_dirty_cb(struct ubifs_info *c,
+			     const struct ubifs_lprops *lprops, int in_tree,
+			     struct scan_data *data)
+{
+	int ret = LPT_SCAN_CONTINUE;
+
+	/* Exclude LEBs that are currently in use */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+	/* Determine whether to add these LEB properties to the tree */
+	if (!in_tree && valuable(c, lprops))
+		ret |= LPT_SCAN_ADD;
+	/* Exclude LEBs with too little space */
+	if (lprops->free + lprops->dirty < data->min_space)
+		return ret;
+	/* If specified, exclude index LEBs */
+	if (data->exclude_index && lprops->flags & LPROPS_INDEX)
+		return ret;
+	/* If specified, exclude empty or freeable LEBs */
+	if (lprops->free + lprops->dirty == c->leb_size) {
+		if (!data->pick_free)
+			return ret;
+	/* Exclude LEBs with too little dirty space (unless it is empty) */
+	} else if (lprops->dirty < c->dead_wm)
+		return ret;
+	/* Finally we found space */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * scan_for_dirty - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount free plus dirty space the returned LEB has to
+ *             have
+ * @pick_free: if it is OK to return a free or freeable LEB
+ * @exclude_index: whether to exclude index LEBs
+ *
+ * This function returns a pointer to the LEB properties found or a negative
+ * error code.
+ */
+static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
+						 int min_space, int pick_free,
+						 int exclude_index)
+{
+	const struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+	struct scan_data data;
+	int err, i;
+
+	/* There may be an LEB with enough dirty space on the free heap */
+	heap = &c->lpt_heap[LPROPS_FREE - 1];
+	for (i = 0; i < heap->cnt; i++) {
+		lprops = heap->arr[i];
+		if (lprops->free + lprops->dirty < min_space)
+			continue;
+		if (lprops->dirty < c->dead_wm)
+			continue;
+		return lprops;
+	}
+	/*
+	 * A LEB may have fallen off of the bottom of the dirty heap, and ended
+	 * up as uncategorized even though it has enough dirty space for us now,
+	 * so check the uncategorized list. N.B. neither empty nor freeable LEBs
+	 * can end up as uncategorized because they are kept on lists not
+	 * finite-sized heaps.
+	 */
+	list_for_each_entry(lprops, &c->uncat_list, list) {
+		if (lprops->flags & LPROPS_TAKEN)
+			continue;
+		if (lprops->free + lprops->dirty < min_space)
+			continue;
+		if (exclude_index && (lprops->flags & LPROPS_INDEX))
+			continue;
+		if (lprops->dirty < c->dead_wm)
+			continue;
+		return lprops;
+	}
+	/* We have looked everywhere in main memory, now scan the flash */
+	if (c->pnodes_have >= c->pnode_cnt)
+		/* All pnodes are in memory, so skip scan */
+		return ERR_PTR(-ENOSPC);
+	data.min_space = min_space;
+	data.pick_free = pick_free;
+	data.lnum = -1;
+	data.exclude_index = exclude_index;
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+				    (ubifs_lpt_scan_callback)scan_for_dirty_cb,
+				    &data);
+	if (err)
+		return ERR_PTR(err);
+	ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+	c->lscan_lnum = data.lnum;
+	lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+	if (IS_ERR(lprops))
+		return lprops;
+	ubifs_assert(lprops->lnum == data.lnum);
+	ubifs_assert(lprops->free + lprops->dirty >= min_space);
+	ubifs_assert(lprops->dirty >= c->dead_wm ||
+		     (pick_free &&
+		      lprops->free + lprops->dirty == c->leb_size));
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX));
+	return lprops;
+}
+
+/**
+ * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector.
+ * @c: the UBIFS file-system description object
+ * @ret_lp: LEB properties are returned here on exit
+ * @min_space: minimum amount free plus dirty space the returned LEB has to
+ *             have
+ * @pick_free: controls whether it is OK to pick empty or index LEBs
+ *
+ * This function tries to find a dirty logical eraseblock which has at least
+ * @min_space free and dirty space. It prefers to take an LEB from the dirty or
+ * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
+ * or do not have an LEB which satisfies the @min_space criteria.
+ *
+ * Note, LEBs which have less than dead watermark of free + dirty space are
+ * never picked by this function.
+ *
+ * The additional @pick_free argument controls if this function has to return a
+ * free or freeable LEB if one is present. For example, GC must to set it to %1,
+ * when called from the journal space reservation function, because the
+ * appearance of free space may coincide with the loss of enough dirty space
+ * for GC to succeed anyway.
+ *
+ * In contrast, if the Garbage Collector is called from budgeting, it should
+ * just make free space, not return LEBs which are already free or freeable.
+ *
+ * In addition @pick_free is set to %2 by the recovery process in order to
+ * recover gc_lnum in which case an index LEB must not be returned.
+ *
+ * This function returns zero and the LEB properties of found dirty LEB in case
+ * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
+ * case of other failures. The returned LEB is marked as "taken".
+ */
+int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
+			 int min_space, int pick_free)
+{
+	int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0;
+	const struct ubifs_lprops *lp = NULL, *idx_lp = NULL;
+	struct ubifs_lpt_heap *heap, *idx_heap;
+
+	ubifs_get_lprops(c);
+
+	if (pick_free) {
+		int lebs, rsvd_idx_lebs = 0;
+
+		spin_lock(&c->space_lock);
+		lebs = c->lst.empty_lebs + c->idx_gc_cnt;
+		lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
+
+		/*
+		 * Note, the index may consume more LEBs than have been reserved
+		 * for it. It is OK because it might be consolidated by GC.
+		 * But if the index takes fewer LEBs than it is reserved for it,
+		 * this function must avoid picking those reserved LEBs.
+		 */
+		if (c->bi.min_idx_lebs >= c->lst.idx_lebs) {
+			rsvd_idx_lebs = c->bi.min_idx_lebs -  c->lst.idx_lebs;
+			exclude_index = 1;
+		}
+		spin_unlock(&c->space_lock);
+
+		/* Check if there are enough free LEBs for the index */
+		if (rsvd_idx_lebs < lebs) {
+			/* OK, try to find an empty LEB */
+			lp = ubifs_fast_find_empty(c);
+			if (lp)
+				goto found;
+
+			/* Or a freeable LEB */
+			lp = ubifs_fast_find_freeable(c);
+			if (lp)
+				goto found;
+		} else
+			/*
+			 * We cannot pick free/freeable LEBs in the below code.
+			 */
+			pick_free = 0;
+	} else {
+		spin_lock(&c->space_lock);
+		exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs);
+		spin_unlock(&c->space_lock);
+	}
+
+	/* Look on the dirty and dirty index heaps */
+	heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+	idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+
+	if (idx_heap->cnt && !exclude_index) {
+		idx_lp = idx_heap->arr[0];
+		sum = idx_lp->free + idx_lp->dirty;
+		/*
+		 * Since we reserve thrice as much space for the index than it
+		 * actually takes, it does not make sense to pick indexing LEBs
+		 * with less than, say, half LEB of dirty space. May be half is
+		 * not the optimal boundary - this should be tested and
+		 * checked. This boundary should determine how much we use
+		 * in-the-gaps to consolidate the index comparing to how much
+		 * we use garbage collector to consolidate it. The "half"
+		 * criteria just feels to be fine.
+		 */
+		if (sum < min_space || sum < c->half_leb_size)
+			idx_lp = NULL;
+	}
+
+	if (heap->cnt) {
+		lp = heap->arr[0];
+		if (lp->dirty + lp->free < min_space)
+			lp = NULL;
+	}
+
+	/* Pick the LEB with most space */
+	if (idx_lp && lp) {
+		if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty)
+			lp = idx_lp;
+	} else if (idx_lp && !lp)
+		lp = idx_lp;
+
+	if (lp) {
+		ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
+		goto found;
+	}
+
+	/* Did not find a dirty LEB on the dirty heaps, have to scan */
+	dbg_find("scanning LPT for a dirty LEB");
+	lp = scan_for_dirty(c, min_space, pick_free, exclude_index);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+	ubifs_assert(lp->dirty >= c->dead_wm ||
+		     (pick_free && lp->free + lp->dirty == c->leb_size));
+
+found:
+	dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
+		 lp->lnum, lp->free, lp->dirty, lp->flags);
+
+	lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+			     lp->flags | LPROPS_TAKEN, 0);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	memcpy(ret_lp, lp, sizeof(struct ubifs_lprops));
+
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * scan_for_free_cb - free space scan callback.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_for_free_cb(struct ubifs_info *c,
+			    const struct ubifs_lprops *lprops, int in_tree,
+			    struct scan_data *data)
+{
+	int ret = LPT_SCAN_CONTINUE;
+
+	/* Exclude LEBs that are currently in use */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+	/* Determine whether to add these LEB properties to the tree */
+	if (!in_tree && valuable(c, lprops))
+		ret |= LPT_SCAN_ADD;
+	/* Exclude index LEBs */
+	if (lprops->flags & LPROPS_INDEX)
+		return ret;
+	/* Exclude LEBs with too little space */
+	if (lprops->free < data->min_space)
+		return ret;
+	/* If specified, exclude empty LEBs */
+	if (!data->pick_free && lprops->free == c->leb_size)
+		return ret;
+	/*
+	 * LEBs that have only free and dirty space must not be allocated
+	 * because they may have been unmapped already or they may have data
+	 * that is obsolete only because of nodes that are still sitting in a
+	 * wbuf.
+	 */
+	if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0)
+		return ret;
+	/* Finally we found space */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * do_find_free_space - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount of free space required
+ * @pick_free: whether it is OK to scan for empty LEBs
+ * @squeeze: whether to try to find space in a non-empty LEB first
+ *
+ * This function returns a pointer to the LEB properties found or a negative
+ * error code.
+ */
+static
+const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
+					      int min_space, int pick_free,
+					      int squeeze)
+{
+	const struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+	struct scan_data data;
+	int err, i;
+
+	if (squeeze) {
+		lprops = ubifs_fast_find_free(c);
+		if (lprops && lprops->free >= min_space)
+			return lprops;
+	}
+	if (pick_free) {
+		lprops = ubifs_fast_find_empty(c);
+		if (lprops)
+			return lprops;
+	}
+	if (!squeeze) {
+		lprops = ubifs_fast_find_free(c);
+		if (lprops && lprops->free >= min_space)
+			return lprops;
+	}
+	/* There may be an LEB with enough free space on the dirty heap */
+	heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+	for (i = 0; i < heap->cnt; i++) {
+		lprops = heap->arr[i];
+		if (lprops->free >= min_space)
+			return lprops;
+	}
+	/*
+	 * A LEB may have fallen off of the bottom of the free heap, and ended
+	 * up as uncategorized even though it has enough free space for us now,
+	 * so check the uncategorized list. N.B. neither empty nor freeable LEBs
+	 * can end up as uncategorized because they are kept on lists not
+	 * finite-sized heaps.
+	 */
+	list_for_each_entry(lprops, &c->uncat_list, list) {
+		if (lprops->flags & LPROPS_TAKEN)
+			continue;
+		if (lprops->flags & LPROPS_INDEX)
+			continue;
+		if (lprops->free >= min_space)
+			return lprops;
+	}
+	/* We have looked everywhere in main memory, now scan the flash */
+	if (c->pnodes_have >= c->pnode_cnt)
+		/* All pnodes are in memory, so skip scan */
+		return ERR_PTR(-ENOSPC);
+	data.min_space = min_space;
+	data.pick_free = pick_free;
+	data.lnum = -1;
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+				    (ubifs_lpt_scan_callback)scan_for_free_cb,
+				    &data);
+	if (err)
+		return ERR_PTR(err);
+	ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+	c->lscan_lnum = data.lnum;
+	lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+	if (IS_ERR(lprops))
+		return lprops;
+	ubifs_assert(lprops->lnum == data.lnum);
+	ubifs_assert(lprops->free >= min_space);
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+	return lprops;
+}
+
+/**
+ * ubifs_find_free_space - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount of required free space
+ * @offs: contains offset of where free space starts on exit
+ * @squeeze: whether to try to find space in a non-empty LEB first
+ *
+ * This function looks for an LEB with at least @min_space bytes of free space.
+ * It tries to find an empty LEB if possible. If no empty LEBs are available,
+ * this function searches for a non-empty data LEB. The returned LEB is marked
+ * as "taken".
+ *
+ * This function returns found LEB number in case of success, %-ENOSPC if it
+ * failed to find a LEB with @min_space bytes of free space and other a negative
+ * error codes in case of failure.
+ */
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
+			  int squeeze)
+{
+	const struct ubifs_lprops *lprops;
+	int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags;
+
+	dbg_find("min_space %d", min_space);
+	ubifs_get_lprops(c);
+
+	/* Check if there are enough empty LEBs for commit */
+	spin_lock(&c->space_lock);
+	if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->bi.min_idx_lebs -  c->lst.idx_lebs;
+	else
+		rsvd_idx_lebs = 0;
+	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+	       c->lst.taken_empty_lebs;
+	if (rsvd_idx_lebs < lebs)
+		/*
+		 * OK to allocate an empty LEB, but we still don't want to go
+		 * looking for one if there aren't any.
+		 */
+		if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+			pick_free = 1;
+			/*
+			 * Because we release the space lock, we must account
+			 * for this allocation here. After the LEB properties
+			 * flags have been updated, we subtract one. Note, the
+			 * result of this is that lprops also decreases
+			 * @taken_empty_lebs in 'ubifs_change_lp()', so it is
+			 * off by one for a short period of time which may
+			 * introduce a small disturbance to budgeting
+			 * calculations, but this is harmless because at the
+			 * worst case this would make the budgeting subsystem
+			 * be more pessimistic than needed.
+			 *
+			 * Fundamentally, this is about serialization of the
+			 * budgeting and lprops subsystems. We could make the
+			 * @space_lock a mutex and avoid dropping it before
+			 * calling 'ubifs_change_lp()', but mutex is more
+			 * heavy-weight, and we want budgeting to be as fast as
+			 * possible.
+			 */
+			c->lst.taken_empty_lebs += 1;
+		}
+	spin_unlock(&c->space_lock);
+
+	lprops = do_find_free_space(c, min_space, pick_free, squeeze);
+	if (IS_ERR(lprops)) {
+		err = PTR_ERR(lprops);
+		goto out;
+	}
+
+	lnum = lprops->lnum;
+	flags = lprops->flags | LPROPS_TAKEN;
+
+	lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0);
+	if (IS_ERR(lprops)) {
+		err = PTR_ERR(lprops);
+		goto out;
+	}
+
+	if (pick_free) {
+		spin_lock(&c->space_lock);
+		c->lst.taken_empty_lebs -= 1;
+		spin_unlock(&c->space_lock);
+	}
+
+	*offs = c->leb_size - lprops->free;
+	ubifs_release_lprops(c);
+
+	if (*offs == 0) {
+		/*
+		 * Ensure that empty LEBs have been unmapped. They may not have
+		 * been, for example, because of an unclean unmount.  Also
+		 * LEBs that were freeable LEBs (free + dirty == leb_size) will
+		 * not have been unmapped.
+		 */
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	}
+
+	dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
+	ubifs_assert(*offs <= c->leb_size - min_space);
+	return lnum;
+
+out:
+	if (pick_free) {
+		spin_lock(&c->space_lock);
+		c->lst.taken_empty_lebs -= 1;
+		spin_unlock(&c->space_lock);
+	}
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * scan_for_idx_cb - callback used by the scan for a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_for_idx_cb(struct ubifs_info *c,
+			   const struct ubifs_lprops *lprops, int in_tree,
+			   struct scan_data *data)
+{
+	int ret = LPT_SCAN_CONTINUE;
+
+	/* Exclude LEBs that are currently in use */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+	/* Determine whether to add these LEB properties to the tree */
+	if (!in_tree && valuable(c, lprops))
+		ret |= LPT_SCAN_ADD;
+	/* Exclude index LEBS */
+	if (lprops->flags & LPROPS_INDEX)
+		return ret;
+	/* Exclude LEBs that cannot be made empty */
+	if (lprops->free + lprops->dirty != c->leb_size)
+		return ret;
+	/*
+	 * We are allocating for the index so it is safe to allocate LEBs with
+	 * only free and dirty space, because write buffers are sync'd at commit
+	 * start.
+	 */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * scan_for_leb_for_idx - scan for a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ */
+static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+	struct scan_data data;
+	int err;
+
+	data.lnum = -1;
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+				    (ubifs_lpt_scan_callback)scan_for_idx_cb,
+				    &data);
+	if (err)
+		return ERR_PTR(err);
+	ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+	c->lscan_lnum = data.lnum;
+	lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+	if (IS_ERR(lprops))
+		return lprops;
+	ubifs_assert(lprops->lnum == data.lnum);
+	ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+	return lprops;
+}
+
+/**
+ * ubifs_find_free_leb_for_idx - find a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ *
+ * This function looks for a free LEB and returns that LEB number. The returned
+ * LEB is marked as "taken", "index".
+ *
+ * Only empty LEBs are allocated. This is for two reasons. First, the commit
+ * calculates the number of LEBs to allocate based on the assumption that they
+ * will be empty. Secondly, free space at the end of an index LEB is not
+ * guaranteed to be empty because it may have been used by the in-the-gaps
+ * method prior to an unclean unmount.
+ *
+ * If no LEB is found %-ENOSPC is returned. For other failures another negative
+ * error code is returned.
+ */
+int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lprops;
+	int lnum = -1, err, flags;
+
+	ubifs_get_lprops(c);
+
+	lprops = ubifs_fast_find_empty(c);
+	if (!lprops) {
+		lprops = ubifs_fast_find_freeable(c);
+		if (!lprops) {
+			/*
+			 * The first condition means the following: go scan the
+			 * LPT if there are uncategorized lprops, which means
+			 * there may be freeable LEBs there (UBIFS does not
+			 * store the information about freeable LEBs in the
+			 * master node).
+			 */
+			if (c->in_a_category_cnt != c->main_lebs ||
+			    c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+				ubifs_assert(c->freeable_cnt == 0);
+				lprops = scan_for_leb_for_idx(c);
+				if (IS_ERR(lprops)) {
+					err = PTR_ERR(lprops);
+					goto out;
+				}
+			}
+		}
+	}
+
+	if (!lprops) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	lnum = lprops->lnum;
+
+	dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
+		 lnum, lprops->free, lprops->dirty, lprops->flags);
+
+	flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX;
+	lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0);
+	if (IS_ERR(lprops)) {
+		err = PTR_ERR(lprops);
+		goto out;
+	}
+
+	ubifs_release_lprops(c);
+
+	/*
+	 * Ensure that empty LEBs have been unmapped. They may not have been,
+	 * for example, because of an unclean unmount. Also LEBs that were
+	 * freeable LEBs (free + dirty == leb_size) will not have been unmapped.
+	 */
+	err = ubifs_leb_unmap(c, lnum);
+	if (err) {
+		ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+				    LPROPS_TAKEN | LPROPS_INDEX, 0);
+		return err;
+	}
+
+	return lnum;
+
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+static int cmp_dirty_idx(const struct ubifs_lprops **a,
+			 const struct ubifs_lprops **b)
+{
+	const struct ubifs_lprops *lpa = *a;
+	const struct ubifs_lprops *lpb = *b;
+
+	return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
+}
+
+static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b,
+			   int size)
+{
+	struct ubifs_lprops *t = *a;
+
+	*a = *b;
+	*b = t;
+}
+
+/**
+ * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos.
+ * @c: the UBIFS file-system description object
+ *
+ * This function is called each commit to create an array of LEB numbers of
+ * dirty index LEBs sorted in order of dirty and free space.  This is used by
+ * the in-the-gaps method of TNC commit.
+ */
+int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
+{
+	int i;
+
+	ubifs_get_lprops(c);
+	/* Copy the LPROPS_DIRTY_IDX heap */
+	c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt;
+	memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr,
+	       sizeof(void *) * c->dirty_idx.cnt);
+	/* Sort it so that the dirtiest is now at the end */
+	sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
+	     (int (*)(const void *, const void *))cmp_dirty_idx,
+	     (void (*)(void *, void *, int))swap_dirty_idx);
+	dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
+	if (c->dirty_idx.cnt)
+		dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum,
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty,
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free);
+	/* Replace the lprops pointers with LEB numbers */
+	for (i = 0; i < c->dirty_idx.cnt; i++)
+		c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum;
+	ubifs_release_lprops(c);
+	return 0;
+}
+
+/**
+ * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_dirty_idx_cb(struct ubifs_info *c,
+			   const struct ubifs_lprops *lprops, int in_tree,
+			   struct scan_data *data)
+{
+	int ret = LPT_SCAN_CONTINUE;
+
+	/* Exclude LEBs that are currently in use */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+	/* Determine whether to add these LEB properties to the tree */
+	if (!in_tree && valuable(c, lprops))
+		ret |= LPT_SCAN_ADD;
+	/* Exclude non-index LEBs */
+	if (!(lprops->flags & LPROPS_INDEX))
+		return ret;
+	/* Exclude LEBs with too little space */
+	if (lprops->free + lprops->dirty < c->min_idx_node_sz)
+		return ret;
+	/* Finally we found space */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * find_dirty_idx_leb - find a dirty index LEB.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB number upon success and a negative error code upon
+ * failure.  In particular, -ENOSPC is returned if a dirty index LEB is not
+ * found.
+ *
+ * Note that this function scans the entire LPT but it is called very rarely.
+ */
+static int find_dirty_idx_leb(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+	struct scan_data data;
+	int err, i, ret;
+
+	/* Check all structures in memory first */
+	data.lnum = -1;
+	heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+	for (i = 0; i < heap->cnt; i++) {
+		lprops = heap->arr[i];
+		ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+		if (ret & LPT_SCAN_STOP)
+			goto found;
+	}
+	list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+		ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+		if (ret & LPT_SCAN_STOP)
+			goto found;
+	}
+	list_for_each_entry(lprops, &c->uncat_list, list) {
+		ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+		if (ret & LPT_SCAN_STOP)
+			goto found;
+	}
+	if (c->pnodes_have >= c->pnode_cnt)
+		/* All pnodes are in memory, so skip scan */
+		return -ENOSPC;
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+				    (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
+				    &data);
+	if (err)
+		return err;
+found:
+	ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+	c->lscan_lnum = data.lnum;
+	lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+	if (IS_ERR(lprops))
+		return PTR_ERR(lprops);
+	ubifs_assert(lprops->lnum == data.lnum);
+	ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz);
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert((lprops->flags & LPROPS_INDEX));
+
+	dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x",
+		 lprops->lnum, lprops->free, lprops->dirty, lprops->flags);
+
+	lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC,
+				 lprops->flags | LPROPS_TAKEN, 0);
+	if (IS_ERR(lprops))
+		return PTR_ERR(lprops);
+
+	return lprops->lnum;
+}
+
+/**
+ * get_idx_gc_leb - try to get a LEB number from trivial GC.
+ * @c: the UBIFS file-system description object
+ */
+static int get_idx_gc_leb(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lp;
+	int err, lnum;
+
+	err = ubifs_get_idx_gc_leb(c);
+	if (err < 0)
+		return err;
+	lnum = err;
+	/*
+	 * The LEB was due to be unmapped after the commit but
+	 * it is needed now for this commit.
+	 */
+	lp = ubifs_lpt_lookup_dirty(c, lnum);
+	if (IS_ERR(lp))
+		return PTR_ERR(lp);
+	lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+			     lp->flags | LPROPS_INDEX, -1);
+	if (IS_ERR(lp))
+		return PTR_ERR(lp);
+	dbg_find("LEB %d, dirty %d and free %d flags %#x",
+		 lp->lnum, lp->dirty, lp->free, lp->flags);
+	return lnum;
+}
+
+/**
+ * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array.
+ * @c: the UBIFS file-system description object
+ */
+static int find_dirtiest_idx_leb(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lp;
+	int lnum;
+
+	while (1) {
+		if (!c->dirty_idx.cnt)
+			return -ENOSPC;
+		/* The lprops pointers were replaced by LEB numbers */
+		lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt];
+		lp = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lp))
+			return PTR_ERR(lp);
+		if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX))
+			continue;
+		lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+				     lp->flags | LPROPS_TAKEN, 0);
+		if (IS_ERR(lp))
+			return PTR_ERR(lp);
+		break;
+	}
+	dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty,
+		 lp->free, lp->flags);
+	ubifs_assert(lp->flags & LPROPS_TAKEN);
+	ubifs_assert(lp->flags & LPROPS_INDEX);
+	return lnum;
+}
+
+/**
+ * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit.
+ * @c: the UBIFS file-system description object
+ *
+ * This function attempts to find an untaken index LEB with the most free and
+ * dirty space that can be used without overwriting index nodes that were in the
+ * last index committed.
+ */
+int ubifs_find_dirty_idx_leb(struct ubifs_info *c)
+{
+	int err;
+
+	ubifs_get_lprops(c);
+
+	/*
+	 * We made an array of the dirtiest index LEB numbers as at the start of
+	 * last commit.  Try that array first.
+	 */
+	err = find_dirtiest_idx_leb(c);
+
+	/* Next try scanning the entire LPT */
+	if (err == -ENOSPC)
+		err = find_dirty_idx_leb(c);
+
+	/* Finally take any index LEBs awaiting trivial GC */
+	if (err == -ENOSPC)
+		err = get_idx_gc_leb(c);
+
+	ubifs_release_lprops(c);
+	return err;
+}
diff --git a/kernel/fs/ubifs/gc.c b/kernel/fs/ubifs/gc.c
new file mode 100644
index 000000000..9718da86a
--- /dev/null
+++ b/kernel/fs/ubifs/gc.c
@@ -0,0 +1,984 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements garbage collection. The procedure for garbage collection
+ * is different depending on whether a LEB as an index LEB (contains index
+ * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
+ * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
+ * nodes to the journal, at which point the garbage-collected LEB is free to be
+ * reused. For index LEBs, garbage collection marks the non-obsolete index nodes
+ * dirty in the TNC, and after the next commit, the garbage-collected LEB is
+ * to be reused. Garbage collection will cause the number of dirty index nodes
+ * to grow, however sufficient space is reserved for the index to ensure the
+ * commit will never run out of space.
+ *
+ * Notes about dead watermark. At current UBIFS implementation we assume that
+ * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
+ * and not worth garbage-collecting. The dead watermark is one min. I/O unit
+ * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
+ * Garbage Collector has to synchronize the GC head's write buffer before
+ * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
+ * actually reclaim even very small pieces of dirty space by garbage collecting
+ * enough dirty LEBs, but we do not bother doing this at this implementation.
+ *
+ * Notes about dark watermark. The results of GC work depends on how big are
+ * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
+ * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
+ * have to waste large pieces of free space at the end of LEB B, because nodes
+ * from LEB A would not fit. And the worst situation is when all nodes are of
+ * maximum size. So dark watermark is the amount of free + dirty space in LEB
+ * which are guaranteed to be reclaimable. If LEB has less space, the GC might
+ * be unable to reclaim it. So, LEBs with free + dirty greater than dark
+ * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
+ * good, and GC takes extra care when moving them.
+ */
+
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/list_sort.h>
+#include "ubifs.h"
+
+/*
+ * GC may need to move more than one LEB to make progress. The below constants
+ * define "soft" and "hard" limits on the number of LEBs the garbage collector
+ * may move.
+ */
+#define SOFT_LEBS_LIMIT 4
+#define HARD_LEBS_LIMIT 32
+
+/**
+ * switch_gc_head - switch the garbage collection journal head.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to write
+ * @len: length of the buffer to write
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ *
+ * This function switch the GC head to the next LEB which is reserved in
+ * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
+ * and other negative error code in case of failures.
+ */
+static int switch_gc_head(struct ubifs_info *c)
+{
+	int err, gc_lnum = c->gc_lnum;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	ubifs_assert(gc_lnum != -1);
+	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
+	       wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
+	       c->leb_size - wbuf->offs - wbuf->used);
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
+	/*
+	 * The GC write-buffer was synchronized, we may safely unmap
+	 * 'c->gc_lnum'.
+	 */
+	err = ubifs_leb_unmap(c, gc_lnum);
+	if (err)
+		return err;
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
+	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
+	if (err)
+		return err;
+
+	c->gc_lnum = -1;
+	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0);
+	return err;
+}
+
+/**
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @a: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	if (a == b)
+		return 0;
+
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+	ubifs_assert(sa->type == UBIFS_DATA_NODE);
+	ubifs_assert(sb->type == UBIFS_DATA_NODE);
+
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		unsigned int blka = key_block(c, &sa->key);
+		unsigned int blkb = key_block(c, &sb->key);
+
+		if (blka <= blkb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/*
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @a: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and sorted by length in descending order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash valuer order.
+ */
+static int nondata_nodes_cmp(void *priv, struct list_head *a,
+			     struct list_head *b)
+{
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	if (a == b)
+		return 0;
+
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+
+	ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
+		     key_type(c, &sb->key) != UBIFS_DATA_KEY);
+	ubifs_assert(sa->type != UBIFS_DATA_NODE &&
+		     sb->type != UBIFS_DATA_NODE);
+
+	/* Inodes go before directory entries */
+	if (sa->type == UBIFS_INO_NODE) {
+		if (sb->type == UBIFS_INO_NODE)
+			return sb->len - sa->len;
+		return -1;
+	}
+	if (sb->type == UBIFS_INO_NODE)
+		return 1;
+
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
+		     key_type(c, &sa->key) == UBIFS_XENT_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
+		     key_type(c, &sb->key) == UBIFS_XENT_KEY);
+	ubifs_assert(sa->type == UBIFS_DENT_NODE ||
+		     sa->type == UBIFS_XENT_NODE);
+	ubifs_assert(sb->type == UBIFS_DENT_NODE ||
+		     sb->type == UBIFS_XENT_NODE);
+
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		uint32_t hasha = key_hash(c, &sa->key);
+		uint32_t hashb = key_hash(c, &sb->key);
+
+		if (hasha <= hashb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * sort_nodes - sort nodes for GC.
+ * @c: UBIFS file-system description object
+ * @sleb: describes nodes to sort and contains the result on exit
+ * @nondata: contains non-data nodes on exit
+ * @min: minimum node size is returned here
+ *
+ * This function sorts the list of inodes to garbage collect. First of all, it
+ * kills obsolete nodes and separates data and non-data nodes to the
+ * @sleb->nodes and @nondata lists correspondingly.
+ *
+ * Data nodes are then sorted in block number order - this is important for
+ * bulk-read; data nodes with lower inode number go before data nodes with
+ * higher inode number, and data nodes with lower block number go before data
+ * nodes with higher block number;
+ *
+ * Non-data nodes are sorted as follows.
+ *   o First go inode nodes - they are sorted in descending length order.
+ *   o Then go directory entry nodes - they are sorted in hash order, which
+ *     should supposedly optimize 'readdir()'. Direntry nodes with lower parent
+ *     inode number go before direntry nodes with higher parent inode number,
+ *     and direntry nodes with lower name hash values go before direntry nodes
+ *     with higher name hash values.
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		      struct list_head *nondata, int *min)
+{
+	int err;
+	struct ubifs_scan_node *snod, *tmp;
+
+	*min = INT_MAX;
+
+	/* Separate data nodes and non-data nodes */
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		ubifs_assert(snod->type == UBIFS_INO_NODE  ||
+			     snod->type == UBIFS_DATA_NODE ||
+			     snod->type == UBIFS_DENT_NODE ||
+			     snod->type == UBIFS_XENT_NODE ||
+			     snod->type == UBIFS_TRUN_NODE);
+
+		if (snod->type != UBIFS_INO_NODE  &&
+		    snod->type != UBIFS_DATA_NODE &&
+		    snod->type != UBIFS_DENT_NODE &&
+		    snod->type != UBIFS_XENT_NODE) {
+			/* Probably truncation node, zap it */
+			list_del(&snod->list);
+			kfree(snod);
+			continue;
+		}
+
+		ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
+			     key_type(c, &snod->key) == UBIFS_INO_KEY  ||
+			     key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+			     key_type(c, &snod->key) == UBIFS_XENT_KEY);
+
+		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
+					 snod->offs, 0);
+		if (err < 0)
+			return err;
+
+		if (!err) {
+			/* The node is obsolete, remove it from the list */
+			list_del(&snod->list);
+			kfree(snod);
+			continue;
+		}
+
+		if (snod->len < *min)
+			*min = snod->len;
+
+		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
+			list_move_tail(&snod->list, nondata);
+	}
+
+	/* Sort data and non-data nodes */
+	list_sort(c, &sleb->nodes, &data_nodes_cmp);
+	list_sort(c, nondata, &nondata_nodes_cmp);
+
+	err = dbg_check_data_nodes_order(c, &sleb->nodes);
+	if (err)
+		return err;
+	err = dbg_check_nondata_nodes_order(c, nondata);
+	if (err)
+		return err;
+	return 0;
+}
+
+/**
+ * move_node - move a node.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ * @snod: the mode to move
+ * @wbuf: write-buffer to move node to
+ *
+ * This function moves node @snod to @wbuf, changes TNC correspondingly, and
+ * destroys @snod. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
+{
+	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
+
+	cond_resched();
+	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
+	if (err)
+		return err;
+
+	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+				snod->offs, new_lnum, new_offs,
+				snod->len);
+	list_del(&snod->list);
+	kfree(snod);
+	return err;
+}
+
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. This function returns zero in case of success, %-EAGAIN if
+ * commit is required, and other negative error codes in case of other
+ * failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+	int err, min;
+	LIST_HEAD(nondata);
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	if (wbuf->lnum == -1) {
+		/*
+		 * The GC journal head is not set, because it is the first GC
+		 * invocation since mount.
+		 */
+		err = switch_gc_head(c);
+		if (err)
+			return err;
+	}
+
+	err = sort_nodes(c, sleb, &nondata, &min);
+	if (err)
+		goto out;
+
+	/* Write nodes to their new location. Use the first-fit strategy */
+	while (1) {
+		int avail;
+		struct ubifs_scan_node *snod, *tmp;
+
+		/* Move data nodes */
+		list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
+			if  (snod->len > avail)
+				/*
+				 * Do not skip data nodes in order to optimize
+				 * bulk-read.
+				 */
+				break;
+
+			err = move_node(c, sleb, snod, wbuf);
+			if (err)
+				goto out;
+		}
+
+		/* Move non-data nodes */
+		list_for_each_entry_safe(snod, tmp, &nondata, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
+			if (avail < min)
+				break;
+
+			if  (snod->len > avail) {
+				/*
+				 * Keep going only if this is an inode with
+				 * some data. Otherwise stop and switch the GC
+				 * head. IOW, we assume that data-less inode
+				 * nodes and direntry nodes are roughly of the
+				 * same size.
+				 */
+				if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+				    snod->len == UBIFS_INO_NODE_SZ)
+					break;
+				continue;
+			}
+
+			err = move_node(c, sleb, snod, wbuf);
+			if (err)
+				goto out;
+		}
+
+		if (list_empty(&sleb->nodes) && list_empty(&nondata))
+			break;
+
+		/*
+		 * Waste the rest of the space in the LEB and switch to the
+		 * next LEB.
+		 */
+		err = switch_gc_head(c);
+		if (err)
+			goto out;
+	}
+
+	return 0;
+
+out:
+	list_splice_tail(&nondata, &sleb->nodes);
+	return err;
+}
+
+/**
+ * gc_sync_wbufs - sync write-buffers for GC.
+ * @c: UBIFS file-system description object
+ *
+ * We must guarantee that obsoleting nodes are on flash. Unfortunately they may
+ * be in a write-buffer instead. That is, a node could be written to a
+ * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
+ * erased before the write-buffer is sync'd and then there is an unclean
+ * unmount, then an existing node is lost. To avoid this, we sync all
+ * write-buffers.
+ *
+ * This function returns %0 on success or a negative error code on failure.
+ */
+static int gc_sync_wbufs(struct ubifs_info *c)
+{
+	int err, i;
+
+	for (i = 0; i < c->jhead_cnt; i++) {
+		if (i == GCHD)
+			continue;
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/**
+ * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lp: describes the LEB to garbage collect
+ *
+ * This function garbage-collects an LEB and returns one of the @LEB_FREED,
+ * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
+ * required, and other negative error codes in case of failures.
+ */
+int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+	int err = 0, lnum = lp->lnum;
+
+	ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
+		     c->need_recovery);
+	ubifs_assert(c->gc_lnum != lnum);
+	ubifs_assert(wbuf->lnum != lnum);
+
+	if (lp->free + lp->dirty == c->leb_size) {
+		/* Special case - a free LEB  */
+		dbg_gc("LEB %d is free, return it", lp->lnum);
+		ubifs_assert(!(lp->flags & LPROPS_INDEX));
+
+		if (lp->free != c->leb_size) {
+			/*
+			 * Write buffers must be sync'd before unmapping
+			 * freeable LEBs, because one of them may contain data
+			 * which obsoletes something in 'lp->pnum'.
+			 */
+			err = gc_sync_wbufs(c);
+			if (err)
+				return err;
+			err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
+						  0, 0, 0, 0);
+			if (err)
+				return err;
+		}
+		err = ubifs_leb_unmap(c, lp->lnum);
+		if (err)
+			return err;
+
+		if (c->gc_lnum == -1) {
+			c->gc_lnum = lnum;
+			return LEB_RETAINED;
+		}
+
+		return LEB_FREED;
+	}
+
+	/*
+	 * We scan the entire LEB even though we only really need to scan up to
+	 * (c->leb_size - lp->free).
+	 */
+	sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
+	if (IS_ERR(sleb))
+		return PTR_ERR(sleb);
+
+	ubifs_assert(!list_empty(&sleb->nodes));
+	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
+
+	if (snod->type == UBIFS_IDX_NODE) {
+		struct ubifs_gced_idx_leb *idx_gc;
+
+		dbg_gc("indexing LEB %d (free %d, dirty %d)",
+		       lnum, lp->free, lp->dirty);
+		list_for_each_entry(snod, &sleb->nodes, list) {
+			struct ubifs_idx_node *idx = snod->node;
+			int level = le16_to_cpu(idx->level);
+
+			ubifs_assert(snod->type == UBIFS_IDX_NODE);
+			key_read(c, ubifs_idx_key(c, idx), &snod->key);
+			err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
+						   snod->offs);
+			if (err)
+				goto out;
+		}
+
+		idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
+		if (!idx_gc) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		idx_gc->lnum = lnum;
+		idx_gc->unmap = 0;
+		list_add(&idx_gc->list, &c->idx_gc);
+
+		/*
+		 * Don't release the LEB until after the next commit, because
+		 * it may contain data which is needed for recovery. So
+		 * although we freed this LEB, it will become usable only after
+		 * the commit.
+		 */
+		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
+					  LPROPS_INDEX, 1);
+		if (err)
+			goto out;
+		err = LEB_FREED_IDX;
+	} else {
+		dbg_gc("data LEB %d (free %d, dirty %d)",
+		       lnum, lp->free, lp->dirty);
+
+		err = move_nodes(c, sleb);
+		if (err)
+			goto out_inc_seq;
+
+		err = gc_sync_wbufs(c);
+		if (err)
+			goto out_inc_seq;
+
+		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
+		if (err)
+			goto out_inc_seq;
+
+		/* Allow for races with TNC */
+		c->gced_lnum = lnum;
+		smp_wmb();
+		c->gc_seq += 1;
+		smp_wmb();
+
+		if (c->gc_lnum == -1) {
+			c->gc_lnum = lnum;
+			err = LEB_RETAINED;
+		} else {
+			err = ubifs_wbuf_sync_nolock(wbuf);
+			if (err)
+				goto out;
+
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				goto out;
+
+			err = LEB_FREED;
+		}
+	}
+
+out:
+	ubifs_scan_destroy(sleb);
+	return err;
+
+out_inc_seq:
+	/* We may have moved at least some nodes so allow for races with TNC */
+	c->gced_lnum = lnum;
+	smp_wmb();
+	c->gc_seq += 1;
+	smp_wmb();
+	goto out;
+}
+
+/**
+ * ubifs_garbage_collect - UBIFS garbage collector.
+ * @c: UBIFS file-system description object
+ * @anyway: do GC even if there are free LEBs
+ *
+ * This function does out-of-place garbage collection. The return codes are:
+ *   o positive LEB number if the LEB has been freed and may be used;
+ *   o %-EAGAIN if the caller has to run commit;
+ *   o %-ENOSPC if GC failed to make any progress;
+ *   o other negative error codes in case of other errors.
+ *
+ * Garbage collector writes data to the journal when GC'ing data LEBs, and just
+ * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
+ * commit may be required. But commit cannot be run from inside GC, because the
+ * caller might be holding the commit lock, so %-EAGAIN is returned instead;
+ * And this error code means that the caller has to run commit, and re-run GC
+ * if there is still no free space.
+ *
+ * There are many reasons why this function may return %-EAGAIN:
+ * o the log is full and there is no space to write an LEB reference for
+ *   @c->gc_lnum;
+ * o the journal is too large and exceeds size limitations;
+ * o GC moved indexing LEBs, but they can be used only after the commit;
+ * o the shrinker fails to find clean znodes to free and requests the commit;
+ * o etc.
+ *
+ * Note, if the file-system is close to be full, this function may return
+ * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
+ * the function. E.g., this happens if the limits on the journal size are too
+ * tough and GC writes too much to the journal before an LEB is freed. This
+ * might also mean that the journal is too large, and the TNC becomes to big,
+ * so that the shrinker is constantly called, finds not clean znodes to free,
+ * and requests commit. Well, this may also happen if the journal is all right,
+ * but another kernel process consumes too much memory. Anyway, infinite
+ * %-EAGAIN may happen, but in some extreme/misconfiguration cases.
+ */
+int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
+{
+	int i, err, ret, min_space = c->dead_wm;
+	struct ubifs_lprops lp;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	ubifs_assert_cmt_locked(c);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+
+	if (ubifs_gc_should_commit(c))
+		return -EAGAIN;
+
+	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+
+	if (c->ro_error) {
+		ret = -EROFS;
+		goto out_unlock;
+	}
+
+	/* We expect the write-buffer to be empty on entry */
+	ubifs_assert(!wbuf->used);
+
+	for (i = 0; ; i++) {
+		int space_before, space_after;
+
+		cond_resched();
+
+		/* Give the commit an opportunity to run */
+		if (ubifs_gc_should_commit(c)) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
+			/*
+			 * We've done enough iterations. Indexing LEBs were
+			 * moved and will be available after the commit.
+			 */
+			dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
+			ubifs_commit_required(c);
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (i > HARD_LEBS_LIMIT) {
+			/*
+			 * We've moved too many LEBs and have not made
+			 * progress, give up.
+			 */
+			dbg_gc("hard limit, -ENOSPC");
+			ret = -ENOSPC;
+			break;
+		}
+
+		/*
+		 * Empty and freeable LEBs can turn up while we waited for
+		 * the wbuf lock, or while we have been running GC. In that
+		 * case, we should just return one of those instead of
+		 * continuing to GC dirty LEBs. Hence we request
+		 * 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
+		 */
+		ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
+		if (ret) {
+			if (ret == -ENOSPC)
+				dbg_gc("no more dirty LEBs");
+			break;
+		}
+
+		dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)",
+		       lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty,
+		       min_space);
+
+		space_before = c->leb_size - wbuf->offs - wbuf->used;
+		if (wbuf->lnum == -1)
+			space_before = 0;
+
+		ret = ubifs_garbage_collect_leb(c, &lp);
+		if (ret < 0) {
+			if (ret == -EAGAIN) {
+				/*
+				 * This is not error, so we have to return the
+				 * LEB to lprops. But if 'ubifs_return_leb()'
+				 * fails, its failure code is propagated to the
+				 * caller instead of the original '-EAGAIN'.
+				 */
+				err = ubifs_return_leb(c, lp.lnum);
+				if (err)
+					ret = err;
+				break;
+			}
+			goto out;
+		}
+
+		if (ret == LEB_FREED) {
+			/* An LEB has been freed and is ready for use */
+			dbg_gc("LEB %d freed, return", lp.lnum);
+			ret = lp.lnum;
+			break;
+		}
+
+		if (ret == LEB_FREED_IDX) {
+			/*
+			 * This was an indexing LEB and it cannot be
+			 * immediately used. And instead of requesting the
+			 * commit straight away, we try to garbage collect some
+			 * more.
+			 */
+			dbg_gc("indexing LEB %d freed, continue", lp.lnum);
+			continue;
+		}
+
+		ubifs_assert(ret == LEB_RETAINED);
+		space_after = c->leb_size - wbuf->offs - wbuf->used;
+		dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
+		       space_after - space_before);
+
+		if (space_after > space_before) {
+			/* GC makes progress, keep working */
+			min_space >>= 1;
+			if (min_space < c->dead_wm)
+				min_space = c->dead_wm;
+			continue;
+		}
+
+		dbg_gc("did not make progress");
+
+		/*
+		 * GC moved an LEB bud have not done any progress. This means
+		 * that the previous GC head LEB contained too few free space
+		 * and the LEB which was GC'ed contained only large nodes which
+		 * did not fit that space.
+		 *
+		 * We can do 2 things:
+		 * 1. pick another LEB in a hope it'll contain a small node
+		 *    which will fit the space we have at the end of current GC
+		 *    head LEB, but there is no guarantee, so we try this out
+		 *    unless we have already been working for too long;
+		 * 2. request an LEB with more dirty space, which will force
+		 *    'ubifs_find_dirty_leb()' to start scanning the lprops
+		 *    table, instead of just picking one from the heap
+		 *    (previously it already picked the dirtiest LEB).
+		 */
+		if (i < SOFT_LEBS_LIMIT) {
+			dbg_gc("try again");
+			continue;
+		}
+
+		min_space <<= 1;
+		if (min_space > c->dark_wm)
+			min_space = c->dark_wm;
+		dbg_gc("set min. space to %d", min_space);
+	}
+
+	if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
+		dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
+		ubifs_commit_required(c);
+		ret = -EAGAIN;
+	}
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (!err)
+		err = ubifs_leb_unmap(c, c->gc_lnum);
+	if (err) {
+		ret = err;
+		goto out;
+	}
+out_unlock:
+	mutex_unlock(&wbuf->io_mutex);
+	return ret;
+
+out:
+	ubifs_assert(ret < 0);
+	ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
+	ubifs_wbuf_sync_nolock(wbuf);
+	ubifs_ro_mode(c, ret);
+	mutex_unlock(&wbuf->io_mutex);
+	ubifs_return_leb(c, lp.lnum);
+	return ret;
+}
+
+/**
+ * ubifs_gc_start_commit - garbage collection at start of commit.
+ * @c: UBIFS file-system description object
+ *
+ * If a LEB has only dirty and free space, then we may safely unmap it and make
+ * it free.  Note, we cannot do this with indexing LEBs because dirty space may
+ * correspond index nodes that are required for recovery.  In that case, the
+ * LEB cannot be unmapped until after the next commit.
+ *
+ * This function returns %0 upon success and a negative error code upon failure.
+ */
+int ubifs_gc_start_commit(struct ubifs_info *c)
+{
+	struct ubifs_gced_idx_leb *idx_gc;
+	const struct ubifs_lprops *lp;
+	int err = 0, flags;
+
+	ubifs_get_lprops(c);
+
+	/*
+	 * Unmap (non-index) freeable LEBs. Note that recovery requires that all
+	 * wbufs are sync'd before this, which is done in 'do_commit()'.
+	 */
+	while (1) {
+		lp = ubifs_fast_find_freeable(c);
+		if (IS_ERR(lp)) {
+			err = PTR_ERR(lp);
+			goto out;
+		}
+		if (!lp)
+			break;
+		ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+		ubifs_assert(!(lp->flags & LPROPS_INDEX));
+		err = ubifs_leb_unmap(c, lp->lnum);
+		if (err)
+			goto out;
+		lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
+		if (IS_ERR(lp)) {
+			err = PTR_ERR(lp);
+			goto out;
+		}
+		ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+		ubifs_assert(!(lp->flags & LPROPS_INDEX));
+	}
+
+	/* Mark GC'd index LEBs OK to unmap after this commit finishes */
+	list_for_each_entry(idx_gc, &c->idx_gc, list)
+		idx_gc->unmap = 1;
+
+	/* Record index freeable LEBs for unmapping after commit */
+	while (1) {
+		lp = ubifs_fast_find_frdi_idx(c);
+		if (IS_ERR(lp)) {
+			err = PTR_ERR(lp);
+			goto out;
+		}
+		if (!lp)
+			break;
+		idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
+		if (!idx_gc) {
+			err = -ENOMEM;
+			goto out;
+		}
+		ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+		ubifs_assert(lp->flags & LPROPS_INDEX);
+		/* Don't release the LEB until after the next commit */
+		flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
+		lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
+		if (IS_ERR(lp)) {
+			err = PTR_ERR(lp);
+			kfree(idx_gc);
+			goto out;
+		}
+		ubifs_assert(lp->flags & LPROPS_TAKEN);
+		ubifs_assert(!(lp->flags & LPROPS_INDEX));
+		idx_gc->lnum = lp->lnum;
+		idx_gc->unmap = 1;
+		list_add(&idx_gc->list, &c->idx_gc);
+	}
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * ubifs_gc_end_commit - garbage collection at end of commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function completes out-of-place garbage collection of index LEBs.
+ */
+int ubifs_gc_end_commit(struct ubifs_info *c)
+{
+	struct ubifs_gced_idx_leb *idx_gc, *tmp;
+	struct ubifs_wbuf *wbuf;
+	int err = 0;
+
+	wbuf = &c->jheads[GCHD].wbuf;
+	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+	list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
+		if (idx_gc->unmap) {
+			dbg_gc("LEB %d", idx_gc->lnum);
+			err = ubifs_leb_unmap(c, idx_gc->lnum);
+			if (err)
+				goto out;
+			err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
+					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
+			if (err)
+				goto out;
+			list_del(&idx_gc->list);
+			kfree(idx_gc);
+		}
+out:
+	mutex_unlock(&wbuf->io_mutex);
+	return err;
+}
+
+/**
+ * ubifs_destroy_idx_gc - destroy idx_gc list.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys the @c->idx_gc list. It is called when unmounting
+ * so locks are not needed. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+void ubifs_destroy_idx_gc(struct ubifs_info *c)
+{
+	while (!list_empty(&c->idx_gc)) {
+		struct ubifs_gced_idx_leb *idx_gc;
+
+		idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
+				    list);
+		c->idx_gc_cnt -= 1;
+		list_del(&idx_gc->list);
+		kfree(idx_gc);
+	}
+}
+
+/**
+ * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
+ * @c: UBIFS file-system description object
+ *
+ * Called during start commit so locks are not needed.
+ */
+int ubifs_get_idx_gc_leb(struct ubifs_info *c)
+{
+	struct ubifs_gced_idx_leb *idx_gc;
+	int lnum;
+
+	if (list_empty(&c->idx_gc))
+		return -ENOSPC;
+	idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
+	lnum = idx_gc->lnum;
+	/* c->idx_gc_cnt is updated by the caller when lprops are updated */
+	list_del(&idx_gc->list);
+	kfree(idx_gc);
+	return lnum;
+}
diff --git a/kernel/fs/ubifs/io.c b/kernel/fs/ubifs/io.c
new file mode 100644
index 000000000..97be41215
--- /dev/null
+++ b/kernel/fs/ubifs/io.c
@@ -0,0 +1,1150 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ *          Zoltan Sogor
+ */
+
+/*
+ * This file implements UBIFS I/O subsystem which provides various I/O-related
+ * helper functions (reading/writing/checking/validating nodes) and implements
+ * write-buffering support. Write buffers help to save space which otherwise
+ * would have been wasted for padding to the nearest minimal I/O unit boundary.
+ * Instead, data first goes to the write-buffer and is flushed when the
+ * buffer is full or when it is not used for some time (by timer). This is
+ * similar to the mechanism is used by JFFS2.
+ *
+ * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum
+ * write size (@c->max_write_size). The latter is the maximum amount of bytes
+ * the underlying flash is able to program at a time, and writing in
+ * @c->max_write_size units should presumably be faster. Obviously,
+ * @c->min_io_size <= @c->max_write_size. Write-buffers are of
+ * @c->max_write_size bytes in size for maximum performance. However, when a
+ * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size
+ * boundary) which contains data is written, not the whole write-buffer,
+ * because this is more space-efficient.
+ *
+ * This optimization adds few complications to the code. Indeed, on the one
+ * hand, we want to write in optimal @c->max_write_size bytes chunks, which
+ * also means aligning writes at the @c->max_write_size bytes offsets. On the
+ * other hand, we do not want to waste space when synchronizing the write
+ * buffer, so during synchronization we writes in smaller chunks. And this makes
+ * the next write offset to be not aligned to @c->max_write_size bytes. So the
+ * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned
+ * to @c->max_write_size bytes again. We do this by temporarily shrinking
+ * write-buffer size (@wbuf->size).
+ *
+ * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
+ * mutexes defined inside these objects. Since sometimes upper-level code
+ * has to lock the write-buffer (e.g. journal space reservation code), many
+ * functions related to write-buffers have "nolock" suffix which means that the
+ * caller has to lock the write-buffer before calling this function.
+ *
+ * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not
+ * aligned, UBIFS starts the next node from the aligned address, and the padded
+ * bytes may contain any rubbish. In other words, UBIFS does not put padding
+ * bytes in those small gaps. Common headers of nodes store real node lengths,
+ * not aligned lengths. Indexing nodes also store real lengths in branches.
+ *
+ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
+ * uses padding nodes or padding bytes, if the padding node does not fit.
+ *
+ * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
+ * they are read from the flash media.
+ */
+
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include "ubifs.h"
+
+/**
+ * ubifs_ro_mode - switch UBIFS to read read-only mode.
+ * @c: UBIFS file-system description object
+ * @err: error code which is the reason of switching to R/O mode
+ */
+void ubifs_ro_mode(struct ubifs_info *c, int err)
+{
+	if (!c->ro_error) {
+		c->ro_error = 1;
+		c->no_chk_data_crc = 0;
+		c->vfs_sb->s_flags |= MS_RDONLY;
+		ubifs_warn(c, "switched to read-only mode, error %d", err);
+		dump_stack();
+	}
+}
+
+/*
+ * Below are simple wrappers over UBI I/O functions which include some
+ * additional checks and UBIFS debugging stuff. See corresponding UBI function
+ * for more information.
+ */
+
+int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
+		   int len, int even_ebadmsg)
+{
+	int err;
+
+	err = ubi_read(c->ubi, lnum, buf, offs, len);
+	/*
+	 * In case of %-EBADMSG print the error message only if the
+	 * @even_ebadmsg is true.
+	 */
+	if (err && (err != -EBADMSG || even_ebadmsg)) {
+		ubifs_err(c, "reading %d bytes from LEB %d:%d failed, error %d",
+			  len, lnum, offs, err);
+		dump_stack();
+	}
+	return err;
+}
+
+int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
+		    int len)
+{
+	int err;
+
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
+		return -EROFS;
+	if (!dbg_is_tst_rcvry(c))
+		err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
+	else
+		err = dbg_leb_write(c, lnum, buf, offs, len);
+	if (err) {
+		ubifs_err(c, "writing %d bytes to LEB %d:%d failed, error %d",
+			  len, lnum, offs, err);
+		ubifs_ro_mode(c, err);
+		dump_stack();
+	}
+	return err;
+}
+
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len)
+{
+	int err;
+
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
+		return -EROFS;
+	if (!dbg_is_tst_rcvry(c))
+		err = ubi_leb_change(c->ubi, lnum, buf, len);
+	else
+		err = dbg_leb_change(c, lnum, buf, len);
+	if (err) {
+		ubifs_err(c, "changing %d bytes in LEB %d failed, error %d",
+			  len, lnum, err);
+		ubifs_ro_mode(c, err);
+		dump_stack();
+	}
+	return err;
+}
+
+int ubifs_leb_unmap(struct ubifs_info *c, int lnum)
+{
+	int err;
+
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
+		return -EROFS;
+	if (!dbg_is_tst_rcvry(c))
+		err = ubi_leb_unmap(c->ubi, lnum);
+	else
+		err = dbg_leb_unmap(c, lnum);
+	if (err) {
+		ubifs_err(c, "unmap LEB %d failed, error %d", lnum, err);
+		ubifs_ro_mode(c, err);
+		dump_stack();
+	}
+	return err;
+}
+
+int ubifs_leb_map(struct ubifs_info *c, int lnum)
+{
+	int err;
+
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
+		return -EROFS;
+	if (!dbg_is_tst_rcvry(c))
+		err = ubi_leb_map(c->ubi, lnum);
+	else
+		err = dbg_leb_map(c, lnum);
+	if (err) {
+		ubifs_err(c, "mapping LEB %d failed, error %d", lnum, err);
+		ubifs_ro_mode(c, err);
+		dump_stack();
+	}
+	return err;
+}
+
+int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
+{
+	int err;
+
+	err = ubi_is_mapped(c->ubi, lnum);
+	if (err < 0) {
+		ubifs_err(c, "ubi_is_mapped failed for LEB %d, error %d",
+			  lnum, err);
+		dump_stack();
+	}
+	return err;
+}
+
+/**
+ * ubifs_check_node - check node.
+ * @c: UBIFS file-system description object
+ * @buf: node to check
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ * @must_chk_crc: indicates whether to always check the CRC
+ *
+ * This function checks node magic number and CRC checksum. This function also
+ * validates node length to prevent UBIFS from becoming crazy when an attacker
+ * feeds it a file-system image with incorrect nodes. For example, too large
+ * node length in the common header could cause UBIFS to read memory outside of
+ * allocated buffer when checking the CRC checksum.
+ *
+ * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
+ * true, which is controlled by corresponding UBIFS mount option. However, if
+ * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
+ * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
+ * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC
+ * is checked. This is because during mounting or re-mounting from R/O mode to
+ * R/W mode we may read journal nodes (when replying the journal or doing the
+ * recovery) and the journal nodes may potentially be corrupted, so checking is
+ * required.
+ *
+ * This function returns zero in case of success and %-EUCLEAN in case of bad
+ * CRC or magic.
+ */
+int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
+		     int offs, int quiet, int must_chk_crc)
+{
+	int err = -EINVAL, type, node_len;
+	uint32_t crc, node_crc, magic;
+	const struct ubifs_ch *ch = buf;
+
+	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(!(offs & 7) && offs < c->leb_size);
+
+	magic = le32_to_cpu(ch->magic);
+	if (magic != UBIFS_NODE_MAGIC) {
+		if (!quiet)
+			ubifs_err(c, "bad magic %#08x, expected %#08x",
+				  magic, UBIFS_NODE_MAGIC);
+		err = -EUCLEAN;
+		goto out;
+	}
+
+	type = ch->node_type;
+	if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) {
+		if (!quiet)
+			ubifs_err(c, "bad node type %d", type);
+		goto out;
+	}
+
+	node_len = le32_to_cpu(ch->len);
+	if (node_len + offs > c->leb_size)
+		goto out_len;
+
+	if (c->ranges[type].max_len == 0) {
+		if (node_len != c->ranges[type].len)
+			goto out_len;
+	} else if (node_len < c->ranges[type].min_len ||
+		   node_len > c->ranges[type].max_len)
+		goto out_len;
+
+	if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting &&
+	    !c->remounting_rw && c->no_chk_data_crc)
+		return 0;
+
+	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
+	node_crc = le32_to_cpu(ch->crc);
+	if (crc != node_crc) {
+		if (!quiet)
+			ubifs_err(c, "bad CRC: calculated %#08x, read %#08x",
+				  crc, node_crc);
+		err = -EUCLEAN;
+		goto out;
+	}
+
+	return 0;
+
+out_len:
+	if (!quiet)
+		ubifs_err(c, "bad node length %d", node_len);
+out:
+	if (!quiet) {
+		ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
+		ubifs_dump_node(c, buf);
+		dump_stack();
+	}
+	return err;
+}
+
+/**
+ * ubifs_pad - pad flash space.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to put padding to
+ * @pad: how many bytes to pad
+ *
+ * The flash media obliges us to write only in chunks of %c->min_io_size and
+ * when we have to write less data we add padding node to the write-buffer and
+ * pad it to the next minimal I/O unit's boundary. Padding nodes help when the
+ * media is being scanned. If the amount of wasted space is not enough to fit a
+ * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes
+ * pattern (%UBIFS_PADDING_BYTE).
+ *
+ * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is
+ * used.
+ */
+void ubifs_pad(const struct ubifs_info *c, void *buf, int pad)
+{
+	uint32_t crc;
+
+	ubifs_assert(pad >= 0 && !(pad & 7));
+
+	if (pad >= UBIFS_PAD_NODE_SZ) {
+		struct ubifs_ch *ch = buf;
+		struct ubifs_pad_node *pad_node = buf;
+
+		ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+		ch->node_type = UBIFS_PAD_NODE;
+		ch->group_type = UBIFS_NO_NODE_GROUP;
+		ch->padding[0] = ch->padding[1] = 0;
+		ch->sqnum = 0;
+		ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ);
+		pad -= UBIFS_PAD_NODE_SZ;
+		pad_node->pad_len = cpu_to_le32(pad);
+		crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8);
+		ch->crc = cpu_to_le32(crc);
+		memset(buf + UBIFS_PAD_NODE_SZ, 0, pad);
+	} else if (pad > 0)
+		/* Too little space, padding node won't fit */
+		memset(buf, UBIFS_PADDING_BYTE, pad);
+}
+
+/**
+ * next_sqnum - get next sequence number.
+ * @c: UBIFS file-system description object
+ */
+static unsigned long long next_sqnum(struct ubifs_info *c)
+{
+	unsigned long long sqnum;
+
+	spin_lock(&c->cnt_lock);
+	sqnum = ++c->max_sqnum;
+	spin_unlock(&c->cnt_lock);
+
+	if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) {
+		if (sqnum >= SQNUM_WATERMARK) {
+			ubifs_err(c, "sequence number overflow %llu, end of life",
+				  sqnum);
+			ubifs_ro_mode(c, -EINVAL);
+		}
+		ubifs_warn(c, "running out of sequence numbers, end of life soon");
+	}
+
+	return sqnum;
+}
+
+/**
+ * ubifs_prepare_node - prepare node to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to pad
+ * @len: node length
+ * @pad: if the buffer has to be padded
+ *
+ * This function prepares node at @node to be written to the media - it
+ * calculates node CRC, fills the common header, and adds proper padding up to
+ * the next minimum I/O unit if @pad is not zero.
+ */
+void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
+{
+	uint32_t crc;
+	struct ubifs_ch *ch = node;
+	unsigned long long sqnum = next_sqnum(c);
+
+	ubifs_assert(len >= UBIFS_CH_SZ);
+
+	ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+	ch->len = cpu_to_le32(len);
+	ch->group_type = UBIFS_NO_NODE_GROUP;
+	ch->sqnum = cpu_to_le64(sqnum);
+	ch->padding[0] = ch->padding[1] = 0;
+	crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
+	ch->crc = cpu_to_le32(crc);
+
+	if (pad) {
+		len = ALIGN(len, 8);
+		pad = ALIGN(len, c->min_io_size) - len;
+		ubifs_pad(c, node + len, pad);
+	}
+}
+
+/**
+ * ubifs_prep_grp_node - prepare node of a group to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to pad
+ * @len: node length
+ * @last: indicates the last node of the group
+ *
+ * This function prepares node at @node to be written to the media - it
+ * calculates node CRC and fills the common header.
+ */
+void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
+{
+	uint32_t crc;
+	struct ubifs_ch *ch = node;
+	unsigned long long sqnum = next_sqnum(c);
+
+	ubifs_assert(len >= UBIFS_CH_SZ);
+
+	ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+	ch->len = cpu_to_le32(len);
+	if (last)
+		ch->group_type = UBIFS_LAST_OF_NODE_GROUP;
+	else
+		ch->group_type = UBIFS_IN_NODE_GROUP;
+	ch->sqnum = cpu_to_le64(sqnum);
+	ch->padding[0] = ch->padding[1] = 0;
+	crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
+	ch->crc = cpu_to_le32(crc);
+}
+
+/**
+ * wbuf_timer_callback - write-buffer timer callback function.
+ * @timer: timer data (write-buffer descriptor)
+ *
+ * This function is called when the write-buffer timer expires.
+ */
+static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
+{
+	struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
+
+	dbg_io("jhead %s", dbg_jhead(wbuf->jhead));
+	wbuf->need_sync = 1;
+	wbuf->c->need_wbuf_sync = 1;
+	ubifs_wake_up_bgt(wbuf->c);
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * new_wbuf_timer - start new write-buffer timer.
+ * @wbuf: write-buffer descriptor
+ */
+static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
+{
+	ubifs_assert(!hrtimer_active(&wbuf->timer));
+
+	if (wbuf->no_timer)
+		return;
+	dbg_io("set timer for jhead %s, %llu-%llu millisecs",
+	       dbg_jhead(wbuf->jhead),
+	       div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
+	       div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
+		       USEC_PER_SEC));
+	hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta,
+			       HRTIMER_MODE_REL);
+}
+
+/**
+ * cancel_wbuf_timer - cancel write-buffer timer.
+ * @wbuf: write-buffer descriptor
+ */
+static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
+{
+	if (wbuf->no_timer)
+		return;
+	wbuf->need_sync = 0;
+	hrtimer_cancel(&wbuf->timer);
+}
+
+/**
+ * ubifs_wbuf_sync_nolock - synchronize write-buffer.
+ * @wbuf: write-buffer to synchronize
+ *
+ * This function synchronizes write-buffer @buf and returns zero in case of
+ * success or a negative error code in case of failure.
+ *
+ * Note, although write-buffers are of @c->max_write_size, this function does
+ * not necessarily writes all @c->max_write_size bytes to the flash. Instead,
+ * if the write-buffer is only partially filled with data, only the used part
+ * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized.
+ * This way we waste less space.
+ */
+int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
+{
+	struct ubifs_info *c = wbuf->c;
+	int err, dirt, sync_len;
+
+	cancel_wbuf_timer_nolock(wbuf);
+	if (!wbuf->used || wbuf->lnum == -1)
+		/* Write-buffer is empty or not seeked */
+		return 0;
+
+	dbg_io("LEB %d:%d, %d bytes, jhead %s",
+	       wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
+	ubifs_assert(!(wbuf->avail & 7));
+	ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size);
+	ubifs_assert(wbuf->size >= c->min_io_size);
+	ubifs_assert(wbuf->size <= c->max_write_size);
+	ubifs_assert(wbuf->size % c->min_io_size == 0);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->leb_size - wbuf->offs >= c->max_write_size)
+		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
+
+	if (c->ro_error)
+		return -EROFS;
+
+	/*
+	 * Do not write whole write buffer but write only the minimum necessary
+	 * amount of min. I/O units.
+	 */
+	sync_len = ALIGN(wbuf->used, c->min_io_size);
+	dirt = sync_len - wbuf->used;
+	if (dirt)
+		ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
+	err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len);
+	if (err)
+		return err;
+
+	spin_lock(&wbuf->lock);
+	wbuf->offs += sync_len;
+	/*
+	 * Now @wbuf->offs is not necessarily aligned to @c->max_write_size.
+	 * But our goal is to optimize writes and make sure we write in
+	 * @c->max_write_size chunks and to @c->max_write_size-aligned offset.
+	 * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make
+	 * sure that @wbuf->offs + @wbuf->size is aligned to
+	 * @c->max_write_size. This way we make sure that after next
+	 * write-buffer flush we are again at the optimal offset (aligned to
+	 * @c->max_write_size).
+	 */
+	if (c->leb_size - wbuf->offs < c->max_write_size)
+		wbuf->size = c->leb_size - wbuf->offs;
+	else if (wbuf->offs & (c->max_write_size - 1))
+		wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
+	else
+		wbuf->size = c->max_write_size;
+	wbuf->avail = wbuf->size;
+	wbuf->used = 0;
+	wbuf->next_ino = 0;
+	spin_unlock(&wbuf->lock);
+
+	if (wbuf->sync_callback)
+		err = wbuf->sync_callback(c, wbuf->lnum,
+					  c->leb_size - wbuf->offs, dirt);
+	return err;
+}
+
+/**
+ * ubifs_wbuf_seek_nolock - seek write-buffer.
+ * @wbuf: write-buffer
+ * @lnum: logical eraseblock number to seek to
+ * @offs: logical eraseblock offset to seek to
+ *
+ * This function targets the write-buffer to logical eraseblock @lnum:@offs.
+ * The write-buffer has to be empty. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs)
+{
+	const struct ubifs_info *c = wbuf->c;
+
+	dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead));
+	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
+	ubifs_assert(offs >= 0 && offs <= c->leb_size);
+	ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
+	ubifs_assert(lnum != wbuf->lnum);
+	ubifs_assert(wbuf->used == 0);
+
+	spin_lock(&wbuf->lock);
+	wbuf->lnum = lnum;
+	wbuf->offs = offs;
+	if (c->leb_size - wbuf->offs < c->max_write_size)
+		wbuf->size = c->leb_size - wbuf->offs;
+	else if (wbuf->offs & (c->max_write_size - 1))
+		wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
+	else
+		wbuf->size = c->max_write_size;
+	wbuf->avail = wbuf->size;
+	wbuf->used = 0;
+	spin_unlock(&wbuf->lock);
+
+	return 0;
+}
+
+/**
+ * ubifs_bg_wbufs_sync - synchronize write-buffers.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called by background thread to synchronize write-buffers.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_bg_wbufs_sync(struct ubifs_info *c)
+{
+	int err, i;
+
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (!c->need_wbuf_sync)
+		return 0;
+	c->need_wbuf_sync = 0;
+
+	if (c->ro_error) {
+		err = -EROFS;
+		goto out_timers;
+	}
+
+	dbg_io("synchronize");
+	for (i = 0; i < c->jhead_cnt; i++) {
+		struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+		cond_resched();
+
+		/*
+		 * If the mutex is locked then wbuf is being changed, so
+		 * synchronization is not necessary.
+		 */
+		if (mutex_is_locked(&wbuf->io_mutex))
+			continue;
+
+		mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+		if (!wbuf->need_sync) {
+			mutex_unlock(&wbuf->io_mutex);
+			continue;
+		}
+
+		err = ubifs_wbuf_sync_nolock(wbuf);
+		mutex_unlock(&wbuf->io_mutex);
+		if (err) {
+			ubifs_err(c, "cannot sync write-buffer, error %d", err);
+			ubifs_ro_mode(c, err);
+			goto out_timers;
+		}
+	}
+
+	return 0;
+
+out_timers:
+	/* Cancel all timers to prevent repeated errors */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+		mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+		cancel_wbuf_timer_nolock(wbuf);
+		mutex_unlock(&wbuf->io_mutex);
+	}
+	return err;
+}
+
+/**
+ * ubifs_wbuf_write_nolock - write data to flash via write-buffer.
+ * @wbuf: write-buffer
+ * @buf: node to write
+ * @len: node length
+ *
+ * This function writes data to flash via write-buffer @wbuf. This means that
+ * the last piece of the node won't reach the flash media immediately if it
+ * does not take whole max. write unit (@c->max_write_size). Instead, the node
+ * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or
+ * because more data are appended to the write-buffer).
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure. If the node cannot be written because there is no more
+ * space in this logical eraseblock, %-ENOSPC is returned.
+ */
+int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
+{
+	struct ubifs_info *c = wbuf->c;
+	int err, written, n, aligned_len = ALIGN(len, 8);
+
+	dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
+	       dbg_ntype(((struct ubifs_ch *)buf)->node_type),
+	       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used);
+	ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
+	ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
+	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
+	ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size);
+	ubifs_assert(wbuf->size >= c->min_io_size);
+	ubifs_assert(wbuf->size <= c->max_write_size);
+	ubifs_assert(wbuf->size % c->min_io_size == 0);
+	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	ubifs_assert(!c->space_fixup);
+	if (c->leb_size - wbuf->offs >= c->max_write_size)
+		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
+
+	if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	cancel_wbuf_timer_nolock(wbuf);
+
+	if (c->ro_error)
+		return -EROFS;
+
+	if (aligned_len <= wbuf->avail) {
+		/*
+		 * The node is not very large and fits entirely within
+		 * write-buffer.
+		 */
+		memcpy(wbuf->buf + wbuf->used, buf, len);
+
+		if (aligned_len == wbuf->avail) {
+			dbg_io("flush jhead %s wbuf to LEB %d:%d",
+			       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
+			err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf,
+					      wbuf->offs, wbuf->size);
+			if (err)
+				goto out;
+
+			spin_lock(&wbuf->lock);
+			wbuf->offs += wbuf->size;
+			if (c->leb_size - wbuf->offs >= c->max_write_size)
+				wbuf->size = c->max_write_size;
+			else
+				wbuf->size = c->leb_size - wbuf->offs;
+			wbuf->avail = wbuf->size;
+			wbuf->used = 0;
+			wbuf->next_ino = 0;
+			spin_unlock(&wbuf->lock);
+		} else {
+			spin_lock(&wbuf->lock);
+			wbuf->avail -= aligned_len;
+			wbuf->used += aligned_len;
+			spin_unlock(&wbuf->lock);
+		}
+
+		goto exit;
+	}
+
+	written = 0;
+
+	if (wbuf->used) {
+		/*
+		 * The node is large enough and does not fit entirely within
+		 * current available space. We have to fill and flush
+		 * write-buffer and switch to the next max. write unit.
+		 */
+		dbg_io("flush jhead %s wbuf to LEB %d:%d",
+		       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
+		memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
+		err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs,
+				      wbuf->size);
+		if (err)
+			goto out;
+
+		wbuf->offs += wbuf->size;
+		len -= wbuf->avail;
+		aligned_len -= wbuf->avail;
+		written += wbuf->avail;
+	} else if (wbuf->offs & (c->max_write_size - 1)) {
+		/*
+		 * The write-buffer offset is not aligned to
+		 * @c->max_write_size and @wbuf->size is less than
+		 * @c->max_write_size. Write @wbuf->size bytes to make sure the
+		 * following writes are done in optimal @c->max_write_size
+		 * chunks.
+		 */
+		dbg_io("write %d bytes to LEB %d:%d",
+		       wbuf->size, wbuf->lnum, wbuf->offs);
+		err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs,
+				      wbuf->size);
+		if (err)
+			goto out;
+
+		wbuf->offs += wbuf->size;
+		len -= wbuf->size;
+		aligned_len -= wbuf->size;
+		written += wbuf->size;
+	}
+
+	/*
+	 * The remaining data may take more whole max. write units, so write the
+	 * remains multiple to max. write unit size directly to the flash media.
+	 * We align node length to 8-byte boundary because we anyway flash wbuf
+	 * if the remaining space is less than 8 bytes.
+	 */
+	n = aligned_len >> c->max_write_shift;
+	if (n) {
+		n <<= c->max_write_shift;
+		dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
+		       wbuf->offs);
+		err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+				      wbuf->offs, n);
+		if (err)
+			goto out;
+		wbuf->offs += n;
+		aligned_len -= n;
+		len -= n;
+		written += n;
+	}
+
+	spin_lock(&wbuf->lock);
+	if (aligned_len)
+		/*
+		 * And now we have what's left and what does not take whole
+		 * max. write unit, so write it to the write-buffer and we are
+		 * done.
+		 */
+		memcpy(wbuf->buf, buf + written, len);
+
+	if (c->leb_size - wbuf->offs >= c->max_write_size)
+		wbuf->size = c->max_write_size;
+	else
+		wbuf->size = c->leb_size - wbuf->offs;
+	wbuf->avail = wbuf->size - aligned_len;
+	wbuf->used = aligned_len;
+	wbuf->next_ino = 0;
+	spin_unlock(&wbuf->lock);
+
+exit:
+	if (wbuf->sync_callback) {
+		int free = c->leb_size - wbuf->offs - wbuf->used;
+
+		err = wbuf->sync_callback(c, wbuf->lnum, free, 0);
+		if (err)
+			goto out;
+	}
+
+	if (wbuf->used)
+		new_wbuf_timer_nolock(wbuf);
+
+	return 0;
+
+out:
+	ubifs_err(c, "cannot write %d bytes to LEB %d:%d, error %d",
+		  len, wbuf->lnum, wbuf->offs, err);
+	ubifs_dump_node(c, buf);
+	dump_stack();
+	ubifs_dump_leb(c, wbuf->lnum);
+	return err;
+}
+
+/**
+ * ubifs_write_node - write node to the media.
+ * @c: UBIFS file-system description object
+ * @buf: the node to write
+ * @len: node length
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ *
+ * This function automatically fills node magic number, assigns sequence
+ * number, and calculates node CRC checksum. The length of the @buf buffer has
+ * to be aligned to the minimal I/O unit size. This function automatically
+ * appends padding node and padding bytes if needed. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
+		     int offs)
+{
+	int err, buf_len = ALIGN(len, c->min_io_size);
+
+	dbg_io("LEB %d:%d, %s, length %d (aligned %d)",
+	       lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len,
+	       buf_len);
+	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	ubifs_assert(!c->space_fixup);
+
+	if (c->ro_error)
+		return -EROFS;
+
+	ubifs_prepare_node(c, buf, len, 1);
+	err = ubifs_leb_write(c, lnum, buf, offs, buf_len);
+	if (err)
+		ubifs_dump_node(c, buf);
+
+	return err;
+}
+
+/**
+ * ubifs_read_node_wbuf - read node from the media or write-buffer.
+ * @wbuf: wbuf to check for un-written data
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ *
+ * This function reads a node of known type and length, checks it and stores
+ * in @buf. If the node partially or fully sits in the write-buffer, this
+ * function takes data from the buffer, otherwise it reads the flash media.
+ * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative
+ * error code in case of failure.
+ */
+int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
+			 int lnum, int offs)
+{
+	const struct ubifs_info *c = wbuf->c;
+	int err, rlen, overlap;
+	struct ubifs_ch *ch = buf;
+
+	dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs,
+	       dbg_ntype(type), len, dbg_jhead(wbuf->jhead));
+	ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(!(offs & 7) && offs < c->leb_size);
+	ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
+
+	spin_lock(&wbuf->lock);
+	overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
+	if (!overlap) {
+		/* We may safely unlock the write-buffer and read the data */
+		spin_unlock(&wbuf->lock);
+		return ubifs_read_node(c, buf, type, len, lnum, offs);
+	}
+
+	/* Don't read under wbuf */
+	rlen = wbuf->offs - offs;
+	if (rlen < 0)
+		rlen = 0;
+
+	/* Copy the rest from the write-buffer */
+	memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
+	spin_unlock(&wbuf->lock);
+
+	if (rlen > 0) {
+		/* Read everything that goes before write-buffer */
+		err = ubifs_leb_read(c, lnum, buf, offs, rlen, 0);
+		if (err && err != -EBADMSG)
+			return err;
+	}
+
+	if (type != ch->node_type) {
+		ubifs_err(c, "bad node type (%d but expected %d)",
+			  ch->node_type, type);
+		goto out;
+	}
+
+	err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
+	if (err) {
+		ubifs_err(c, "expected node type %d", type);
+		return err;
+	}
+
+	rlen = le32_to_cpu(ch->len);
+	if (rlen != len) {
+		ubifs_err(c, "bad node length %d, expected %d", rlen, len);
+		goto out;
+	}
+
+	return 0;
+
+out:
+	ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
+	ubifs_dump_node(c, buf);
+	dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * ubifs_read_node - read node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length (not aligned)
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ *
+ * This function reads a node of known type and and length, checks it and
+ * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched
+ * and a negative error code in case of failure.
+ */
+int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
+		    int lnum, int offs)
+{
+	int err, l;
+	struct ubifs_ch *ch = buf;
+
+	dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size);
+	ubifs_assert(!(offs & 7) && offs < c->leb_size);
+	ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
+
+	err = ubifs_leb_read(c, lnum, buf, offs, len, 0);
+	if (err && err != -EBADMSG)
+		return err;
+
+	if (type != ch->node_type) {
+		ubifs_errc(c, "bad node type (%d but expected %d)",
+			   ch->node_type, type);
+		goto out;
+	}
+
+	err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
+	if (err) {
+		ubifs_errc(c, "expected node type %d", type);
+		return err;
+	}
+
+	l = le32_to_cpu(ch->len);
+	if (l != len) {
+		ubifs_errc(c, "bad node length %d, expected %d", l, len);
+		goto out;
+	}
+
+	return 0;
+
+out:
+	ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum,
+		   offs, ubi_is_mapped(c->ubi, lnum));
+	if (!c->probing) {
+		ubifs_dump_node(c, buf);
+		dump_stack();
+	}
+	return -EINVAL;
+}
+
+/**
+ * ubifs_wbuf_init - initialize write-buffer.
+ * @c: UBIFS file-system description object
+ * @wbuf: write-buffer to initialize
+ *
+ * This function initializes write-buffer. Returns zero in case of success
+ * %-ENOMEM in case of failure.
+ */
+int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
+{
+	size_t size;
+
+	wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL);
+	if (!wbuf->buf)
+		return -ENOMEM;
+
+	size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
+	wbuf->inodes = kmalloc(size, GFP_KERNEL);
+	if (!wbuf->inodes) {
+		kfree(wbuf->buf);
+		wbuf->buf = NULL;
+		return -ENOMEM;
+	}
+
+	wbuf->used = 0;
+	wbuf->lnum = wbuf->offs = -1;
+	/*
+	 * If the LEB starts at the max. write size aligned address, then
+	 * write-buffer size has to be set to @c->max_write_size. Otherwise,
+	 * set it to something smaller so that it ends at the closest max.
+	 * write size boundary.
+	 */
+	size = c->max_write_size - (c->leb_start % c->max_write_size);
+	wbuf->avail = wbuf->size = size;
+	wbuf->sync_callback = NULL;
+	mutex_init(&wbuf->io_mutex);
+	spin_lock_init(&wbuf->lock);
+	wbuf->c = c;
+	wbuf->next_ino = 0;
+
+	hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	wbuf->timer.function = wbuf_timer_callback_nolock;
+	wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0);
+	wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT;
+	wbuf->delta *= 1000000000ULL;
+	ubifs_assert(wbuf->delta <= ULONG_MAX);
+	return 0;
+}
+
+/**
+ * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array.
+ * @wbuf: the write-buffer where to add
+ * @inum: the inode number
+ *
+ * This function adds an inode number to the inode array of the write-buffer.
+ */
+void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum)
+{
+	if (!wbuf->buf)
+		/* NOR flash or something similar */
+		return;
+
+	spin_lock(&wbuf->lock);
+	if (wbuf->used)
+		wbuf->inodes[wbuf->next_ino++] = inum;
+	spin_unlock(&wbuf->lock);
+}
+
+/**
+ * wbuf_has_ino - returns if the wbuf contains data from the inode.
+ * @wbuf: the write-buffer
+ * @inum: the inode number
+ *
+ * This function returns with %1 if the write-buffer contains some data from the
+ * given inode otherwise it returns with %0.
+ */
+static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum)
+{
+	int i, ret = 0;
+
+	spin_lock(&wbuf->lock);
+	for (i = 0; i < wbuf->next_ino; i++)
+		if (inum == wbuf->inodes[i]) {
+			ret = 1;
+			break;
+		}
+	spin_unlock(&wbuf->lock);
+
+	return ret;
+}
+
+/**
+ * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to synchronize
+ *
+ * This function synchronizes write-buffers which contain nodes belonging to
+ * @inode. Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode)
+{
+	int i, err = 0;
+
+	for (i = 0; i < c->jhead_cnt; i++) {
+		struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+		if (i == GCHD)
+			/*
+			 * GC head is special, do not look at it. Even if the
+			 * head contains something related to this inode, it is
+			 * a _copy_ of corresponding on-flash node which sits
+			 * somewhere else.
+			 */
+			continue;
+
+		if (!wbuf_has_ino(wbuf, inode->i_ino))
+			continue;
+
+		mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+		if (wbuf_has_ino(wbuf, inode->i_ino))
+			err = ubifs_wbuf_sync_nolock(wbuf);
+		mutex_unlock(&wbuf->io_mutex);
+
+		if (err) {
+			ubifs_ro_mode(c, err);
+			return err;
+		}
+	}
+	return 0;
+}
diff --git a/kernel/fs/ubifs/ioctl.c b/kernel/fs/ubifs/ioctl.c
new file mode 100644
index 000000000..3c7b29de0
--- /dev/null
+++ b/kernel/fs/ubifs/ioctl.c
@@ -0,0 +1,205 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Zoltan Sogor
+ *          Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/* This file implements EXT2-compatible extended attribute ioctl() calls */
+
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include "ubifs.h"
+
+/**
+ * ubifs_set_inode_flags - set VFS inode flags.
+ * @inode: VFS inode to set flags for
+ *
+ * This function propagates flags from UBIFS inode object to VFS inode object.
+ */
+void ubifs_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = ubifs_inode(inode)->flags;
+
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC);
+	if (flags & UBIFS_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+	if (flags & UBIFS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (flags & UBIFS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (flags & UBIFS_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+}
+
+/*
+ * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags.
+ * @ioctl_flags: flags to convert
+ *
+ * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags
+ * (@UBIFS_COMPR_FL, etc).
+ */
+static int ioctl2ubifs(int ioctl_flags)
+{
+	int ubifs_flags = 0;
+
+	if (ioctl_flags & FS_COMPR_FL)
+		ubifs_flags |= UBIFS_COMPR_FL;
+	if (ioctl_flags & FS_SYNC_FL)
+		ubifs_flags |= UBIFS_SYNC_FL;
+	if (ioctl_flags & FS_APPEND_FL)
+		ubifs_flags |= UBIFS_APPEND_FL;
+	if (ioctl_flags & FS_IMMUTABLE_FL)
+		ubifs_flags |= UBIFS_IMMUTABLE_FL;
+	if (ioctl_flags & FS_DIRSYNC_FL)
+		ubifs_flags |= UBIFS_DIRSYNC_FL;
+
+	return ubifs_flags;
+}
+
+/*
+ * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags.
+ * @ubifs_flags: flags to convert
+ *
+ * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags
+ * (@FS_COMPR_FL, etc).
+ */
+static int ubifs2ioctl(int ubifs_flags)
+{
+	int ioctl_flags = 0;
+
+	if (ubifs_flags & UBIFS_COMPR_FL)
+		ioctl_flags |= FS_COMPR_FL;
+	if (ubifs_flags & UBIFS_SYNC_FL)
+		ioctl_flags |= FS_SYNC_FL;
+	if (ubifs_flags & UBIFS_APPEND_FL)
+		ioctl_flags |= FS_APPEND_FL;
+	if (ubifs_flags & UBIFS_IMMUTABLE_FL)
+		ioctl_flags |= FS_IMMUTABLE_FL;
+	if (ubifs_flags & UBIFS_DIRSYNC_FL)
+		ioctl_flags |= FS_DIRSYNC_FL;
+
+	return ioctl_flags;
+}
+
+static int setflags(struct inode *inode, int flags)
+{
+	int oldflags, err, release;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_budget_req req = { .dirtied_ino = 1,
+					.dirtied_ino_d = ui->data_len };
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+	 * the relevant capability.
+	 */
+	mutex_lock(&ui->ui_mutex);
+	oldflags = ubifs2ioctl(ui->flags);
+	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			err = -EPERM;
+			goto out_unlock;
+		}
+	}
+
+	ui->flags = ioctl2ubifs(flags);
+	ubifs_set_inode_flags(inode);
+	inode->i_ctime = ubifs_current_time(inode);
+	release = ui->dirty;
+	mark_inode_dirty_sync(inode);
+	mutex_unlock(&ui->ui_mutex);
+
+	if (release)
+		ubifs_release_budget(c, &req);
+	if (IS_SYNC(inode))
+		err = write_inode_now(inode, 1);
+	return err;
+
+out_unlock:
+	ubifs_err(c, "can't modify inode %lu attributes", inode->i_ino);
+	mutex_unlock(&ui->ui_mutex);
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int flags, err;
+	struct inode *inode = file_inode(file);
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		flags = ubifs2ioctl(ubifs_inode(inode)->flags);
+
+		dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
+		return put_user(flags, (int __user *) arg);
+
+	case FS_IOC_SETFLAGS: {
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		if (!S_ISDIR(inode->i_mode))
+			flags &= ~FS_DIRSYNC_FL;
+
+		/*
+		 * Make sure the file-system is read-write and make sure it
+		 * will not become read-only while we are changing the flags.
+		 */
+		err = mnt_want_write_file(file);
+		if (err)
+			return err;
+		dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
+		err = setflags(inode, flags);
+		mnt_drop_write_file(file);
+		return err;
+	}
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
diff --git a/kernel/fs/ubifs/journal.c b/kernel/fs/ubifs/journal.c
new file mode 100644
index 000000000..0b9da5b6e
--- /dev/null
+++ b/kernel/fs/ubifs/journal.c
@@ -0,0 +1,1466 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS journal.
+ *
+ * The journal consists of 2 parts - the log and bud LEBs. The log has fixed
+ * length and position, while a bud logical eraseblock is any LEB in the main
+ * area. Buds contain file system data - data nodes, inode nodes, etc. The log
+ * contains only references to buds and some other stuff like commit
+ * start node. The idea is that when we commit the journal, we do
+ * not copy the data, the buds just become indexed. Since after the commit the
+ * nodes in bud eraseblocks become leaf nodes of the file system index tree, we
+ * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will
+ * become leafs in the future.
+ *
+ * The journal is multi-headed because we want to write data to the journal as
+ * optimally as possible. It is nice to have nodes belonging to the same inode
+ * in one LEB, so we may write data owned by different inodes to different
+ * journal heads, although at present only one data head is used.
+ *
+ * For recovery reasons, the base head contains all inode nodes, all directory
+ * entry nodes and all truncate nodes. This means that the other heads contain
+ * only data nodes.
+ *
+ * Bud LEBs may be half-indexed. For example, if the bud was not full at the
+ * time of commit, the bud is retained to continue to be used in the journal,
+ * even though the "front" of the LEB is now indexed. In that case, the log
+ * reference contains the offset where the bud starts for the purposes of the
+ * journal.
+ *
+ * The journal size has to be limited, because the larger is the journal, the
+ * longer it takes to mount UBIFS (scanning the journal) and the more memory it
+ * takes (indexing in the TNC).
+ *
+ * All the journal write operations like 'ubifs_jnl_update()' here, which write
+ * multiple UBIFS nodes to the journal at one go, are atomic with respect to
+ * unclean reboots. Should the unclean reboot happen, the recovery code drops
+ * all the nodes.
+ */
+
+#include "ubifs.h"
+
+/**
+ * zero_ino_node_unused - zero out unused fields of an on-flash inode node.
+ * @ino: the inode to zero out
+ */
+static inline void zero_ino_node_unused(struct ubifs_ino_node *ino)
+{
+	memset(ino->padding1, 0, 4);
+	memset(ino->padding2, 0, 26);
+}
+
+/**
+ * zero_dent_node_unused - zero out unused fields of an on-flash directory
+ *                         entry node.
+ * @dent: the directory entry to zero out
+ */
+static inline void zero_dent_node_unused(struct ubifs_dent_node *dent)
+{
+	dent->padding1 = 0;
+	memset(dent->padding2, 0, 4);
+}
+
+/**
+ * zero_data_node_unused - zero out unused fields of an on-flash data node.
+ * @data: the data node to zero out
+ */
+static inline void zero_data_node_unused(struct ubifs_data_node *data)
+{
+	memset(data->padding, 0, 2);
+}
+
+/**
+ * zero_trun_node_unused - zero out unused fields of an on-flash truncation
+ *                         node.
+ * @trun: the truncation node to zero out
+ */
+static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
+{
+	memset(trun->padding, 0, 12);
+}
+
+/**
+ * reserve_space - reserve space in the journal.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head number
+ * @len: node length
+ *
+ * This function reserves space in journal head @head. If the reservation
+ * succeeded, the journal head stays locked and later has to be unlocked using
+ * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock
+ * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and
+ * other negative error codes in case of other failures.
+ */
+static int reserve_space(struct ubifs_info *c, int jhead, int len)
+{
+	int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
+	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+	/*
+	 * Typically, the base head has smaller nodes written to it, so it is
+	 * better to try to allocate space at the ends of eraseblocks. This is
+	 * what the squeeze parameter does.
+	 */
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	squeeze = (jhead == BASEHD);
+again:
+	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+
+	if (c->ro_error) {
+		err = -EROFS;
+		goto out_unlock;
+	}
+
+	avail = c->leb_size - wbuf->offs - wbuf->used;
+	if (wbuf->lnum != -1 && avail >= len)
+		return 0;
+
+	/*
+	 * Write buffer wasn't seek'ed or there is no enough space - look for an
+	 * LEB with some empty space.
+	 */
+	lnum = ubifs_find_free_space(c, len, &offs, squeeze);
+	if (lnum >= 0)
+		goto out;
+
+	err = lnum;
+	if (err != -ENOSPC)
+		goto out_unlock;
+
+	/*
+	 * No free space, we have to run garbage collector to make
+	 * some. But the write-buffer mutex has to be unlocked because
+	 * GC also takes it.
+	 */
+	dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead));
+	mutex_unlock(&wbuf->io_mutex);
+
+	lnum = ubifs_garbage_collect(c, 0);
+	if (lnum < 0) {
+		err = lnum;
+		if (err != -ENOSPC)
+			return err;
+
+		/*
+		 * GC could not make a free LEB. But someone else may
+		 * have allocated new bud for this journal head,
+		 * because we dropped @wbuf->io_mutex, so try once
+		 * again.
+		 */
+		dbg_jnl("GC couldn't make a free LEB for jhead %s",
+			dbg_jhead(jhead));
+		if (retries++ < 2) {
+			dbg_jnl("retry (%d)", retries);
+			goto again;
+		}
+
+		dbg_jnl("return -ENOSPC");
+		return err;
+	}
+
+	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+	dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead));
+	avail = c->leb_size - wbuf->offs - wbuf->used;
+
+	if (wbuf->lnum != -1 && avail >= len) {
+		/*
+		 * Someone else has switched the journal head and we have
+		 * enough space now. This happens when more than one process is
+		 * trying to write to the same journal head at the same time.
+		 */
+		dbg_jnl("return LEB %d back, already have LEB %d:%d",
+			lnum, wbuf->lnum, wbuf->offs + wbuf->used);
+		err = ubifs_return_leb(c, lnum);
+		if (err)
+			goto out_unlock;
+		return 0;
+	}
+
+	offs = 0;
+
+out:
+	/*
+	 * Make sure we synchronize the write-buffer before we add the new bud
+	 * to the log. Otherwise we may have a power cut after the log
+	 * reference node for the last bud (@lnum) is written but before the
+	 * write-buffer data are written to the next-to-last bud
+	 * (@wbuf->lnum). And the effect would be that the recovery would see
+	 * that there is corruption in the next-to-last bud.
+	 */
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		goto out_return;
+	err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
+	if (err)
+		goto out_return;
+	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs);
+	if (err)
+		goto out_unlock;
+
+	return 0;
+
+out_unlock:
+	mutex_unlock(&wbuf->io_mutex);
+	return err;
+
+out_return:
+	/* An error occurred and the LEB has to be returned to lprops */
+	ubifs_assert(err < 0);
+	err1 = ubifs_return_leb(c, lnum);
+	if (err1 && err == -EAGAIN)
+		/*
+		 * Return original error code only if it is not %-EAGAIN,
+		 * which is not really an error. Otherwise, return the error
+		 * code of 'ubifs_return_leb()'.
+		 */
+		err = err1;
+	mutex_unlock(&wbuf->io_mutex);
+	return err;
+}
+
+/**
+ * write_node - write node to a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @node: node to write
+ * @len: node length
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ *
+ * This function writes a node to reserved space of journal head @jhead.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
+		      int *lnum, int *offs)
+{
+	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+	ubifs_assert(jhead != GCHD);
+
+	*lnum = c->jheads[jhead].wbuf.lnum;
+	*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
+
+	dbg_jnl("jhead %s, LEB %d:%d, len %d",
+		dbg_jhead(jhead), *lnum, *offs, len);
+	ubifs_prepare_node(c, node, len, 0);
+
+	return ubifs_wbuf_write_nolock(wbuf, node, len);
+}
+
+/**
+ * write_head - write data to a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @buf: buffer to write
+ * @len: length to write
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ * @sync: non-zero if the write-buffer has to by synchronized
+ *
+ * This function is the same as 'write_node()' but it does not assume the
+ * buffer it is writing is a node, so it does not prepare it (which means
+ * initializing common header and calculating CRC).
+ */
+static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
+		      int *lnum, int *offs, int sync)
+{
+	int err;
+	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+	ubifs_assert(jhead != GCHD);
+
+	*lnum = c->jheads[jhead].wbuf.lnum;
+	*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
+	dbg_jnl("jhead %s, LEB %d:%d, len %d",
+		dbg_jhead(jhead), *lnum, *offs, len);
+
+	err = ubifs_wbuf_write_nolock(wbuf, buf, len);
+	if (err)
+		return err;
+	if (sync)
+		err = ubifs_wbuf_sync_nolock(wbuf);
+	return err;
+}
+
+/**
+ * make_reservation - reserve journal space.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @len: how many bytes to reserve
+ *
+ * This function makes space reservation in journal head @jhead. The function
+ * takes the commit lock and locks the journal head, and the caller has to
+ * unlock the head and finish the reservation with 'finish_reservation()'.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Note, the journal head may be unlocked as soon as the data is written, while
+ * the commit lock has to be released after the data has been added to the
+ * TNC.
+ */
+static int make_reservation(struct ubifs_info *c, int jhead, int len)
+{
+	int err, cmt_retries = 0, nospc_retries = 0;
+
+again:
+	down_read(&c->commit_sem);
+	err = reserve_space(c, jhead, len);
+	if (!err)
+		return 0;
+	up_read(&c->commit_sem);
+
+	if (err == -ENOSPC) {
+		/*
+		 * GC could not make any progress. We should try to commit
+		 * once because it could make some dirty space and GC would
+		 * make progress, so make the error -EAGAIN so that the below
+		 * will commit and re-try.
+		 */
+		if (nospc_retries++ < 2) {
+			dbg_jnl("no space, retry");
+			err = -EAGAIN;
+		}
+
+		/*
+		 * This means that the budgeting is incorrect. We always have
+		 * to be able to write to the media, because all operations are
+		 * budgeted. Deletions are not budgeted, though, but we reserve
+		 * an extra LEB for them.
+		 */
+	}
+
+	if (err != -EAGAIN)
+		goto out;
+
+	/*
+	 * -EAGAIN means that the journal is full or too large, or the above
+	 * code wants to do one commit. Do this and re-try.
+	 */
+	if (cmt_retries > 128) {
+		/*
+		 * This should not happen unless the journal size limitations
+		 * are too tough.
+		 */
+		ubifs_err(c, "stuck in space allocation");
+		err = -ENOSPC;
+		goto out;
+	} else if (cmt_retries > 32)
+		ubifs_warn(c, "too many space allocation re-tries (%d)",
+			   cmt_retries);
+
+	dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
+		cmt_retries);
+	cmt_retries += 1;
+
+	err = ubifs_run_commit(c);
+	if (err)
+		return err;
+	goto again;
+
+out:
+	ubifs_err(c, "cannot reserve %d bytes in jhead %d, error %d",
+		  len, jhead, err);
+	if (err == -ENOSPC) {
+		/* This are some budgeting problems, print useful information */
+		down_write(&c->commit_sem);
+		dump_stack();
+		ubifs_dump_budg(c, &c->bi);
+		ubifs_dump_lprops(c);
+		cmt_retries = dbg_check_lprops(c);
+		up_write(&c->commit_sem);
+	}
+	return err;
+}
+
+/**
+ * release_head - release a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ *
+ * This function releases journal head @jhead which was locked by
+ * the 'make_reservation()' function. It has to be called after each successful
+ * 'make_reservation()' invocation.
+ */
+static inline void release_head(struct ubifs_info *c, int jhead)
+{
+	mutex_unlock(&c->jheads[jhead].wbuf.io_mutex);
+}
+
+/**
+ * finish_reservation - finish a reservation.
+ * @c: UBIFS file-system description object
+ *
+ * This function finishes journal space reservation. It must be called after
+ * 'make_reservation()'.
+ */
+static void finish_reservation(struct ubifs_info *c)
+{
+	up_read(&c->commit_sem);
+}
+
+/**
+ * get_dent_type - translate VFS inode mode to UBIFS directory entry type.
+ * @mode: inode mode
+ */
+static int get_dent_type(int mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		return UBIFS_ITYPE_REG;
+	case S_IFDIR:
+		return UBIFS_ITYPE_DIR;
+	case S_IFLNK:
+		return UBIFS_ITYPE_LNK;
+	case S_IFBLK:
+		return UBIFS_ITYPE_BLK;
+	case S_IFCHR:
+		return UBIFS_ITYPE_CHR;
+	case S_IFIFO:
+		return UBIFS_ITYPE_FIFO;
+	case S_IFSOCK:
+		return UBIFS_ITYPE_SOCK;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+/**
+ * pack_inode - pack an inode node.
+ * @c: UBIFS file-system description object
+ * @ino: buffer in which to pack inode node
+ * @inode: inode to pack
+ * @last: indicates the last node of the group
+ */
+static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
+		       const struct inode *inode, int last)
+{
+	int data_len = 0, last_reference = !inode->i_nlink;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	ino->ch.node_type = UBIFS_INO_NODE;
+	ino_key_init_flash(c, &ino->key, inode->i_ino);
+	ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
+	ino->atime_sec  = cpu_to_le64(inode->i_atime.tv_sec);
+	ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	ino->ctime_sec  = cpu_to_le64(inode->i_ctime.tv_sec);
+	ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ino->mtime_sec  = cpu_to_le64(inode->i_mtime.tv_sec);
+	ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	ino->uid   = cpu_to_le32(i_uid_read(inode));
+	ino->gid   = cpu_to_le32(i_gid_read(inode));
+	ino->mode  = cpu_to_le32(inode->i_mode);
+	ino->flags = cpu_to_le32(ui->flags);
+	ino->size  = cpu_to_le64(ui->ui_size);
+	ino->nlink = cpu_to_le32(inode->i_nlink);
+	ino->compr_type  = cpu_to_le16(ui->compr_type);
+	ino->data_len    = cpu_to_le32(ui->data_len);
+	ino->xattr_cnt   = cpu_to_le32(ui->xattr_cnt);
+	ino->xattr_size  = cpu_to_le32(ui->xattr_size);
+	ino->xattr_names = cpu_to_le32(ui->xattr_names);
+	zero_ino_node_unused(ino);
+
+	/*
+	 * Drop the attached data if this is a deletion inode, the data is not
+	 * needed anymore.
+	 */
+	if (!last_reference) {
+		memcpy(ino->data, ui->data, ui->data_len);
+		data_len = ui->data_len;
+	}
+
+	ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last);
+}
+
+/**
+ * mark_inode_clean - mark UBIFS inode as clean.
+ * @c: UBIFS file-system description object
+ * @ui: UBIFS inode to mark as clean
+ *
+ * This helper function marks UBIFS inode @ui as clean by cleaning the
+ * @ui->dirty flag and releasing its budget. Note, VFS may still treat the
+ * inode as dirty and try to write it back, but 'ubifs_write_inode()' would
+ * just do nothing.
+ */
+static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
+{
+	if (ui->dirty)
+		ubifs_release_dirty_inode_budget(c, ui);
+	ui->dirty = 0;
+}
+
+/**
+ * ubifs_jnl_update - update inode.
+ * @c: UBIFS file-system description object
+ * @dir: parent inode or host inode in case of extended attributes
+ * @nm: directory entry name
+ * @inode: inode to update
+ * @deletion: indicates a directory entry deletion i.e unlink or rmdir
+ * @xent: non-zero if the directory entry is an extended attribute entry
+ *
+ * This function updates an inode by writing a directory entry (or extended
+ * attribute entry), the inode itself, and the parent directory inode (or the
+ * host inode) to the journal.
+ *
+ * The function writes the host inode @dir last, which is important in case of
+ * extended attributes. Indeed, then we guarantee that if the host inode gets
+ * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed,
+ * the extended attribute inode gets flushed too. And this is exactly what the
+ * user expects - synchronizing the host inode synchronizes its extended
+ * attributes. Similarly, this guarantees that if @dir is synchronized, its
+ * directory entry corresponding to @nm gets synchronized too.
+ *
+ * If the inode (@inode) or the parent directory (@dir) are synchronous, this
+ * function synchronizes the write-buffer.
+ *
+ * This function marks the @dir and @inode inodes as clean and returns zero on
+ * success. In case of failure, a negative error code is returned.
+ */
+int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
+		     const struct qstr *nm, const struct inode *inode,
+		     int deletion, int xent)
+{
+	int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
+	int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
+	int last_reference = !!(deletion && inode->i_nlink == 0);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_inode *host_ui = ubifs_inode(dir);
+	struct ubifs_dent_node *dent;
+	struct ubifs_ino_node *ino;
+	union ubifs_key dent_key, ino_key;
+
+	dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
+		inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
+
+	dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+	ilen = UBIFS_INO_NODE_SZ;
+
+	/*
+	 * If the last reference to the inode is being deleted, then there is
+	 * no need to attach and write inode data, it is being deleted anyway.
+	 * And if the inode is being deleted, no need to synchronize
+	 * write-buffer even if the inode is synchronous.
+	 */
+	if (!last_reference) {
+		ilen += ui->data_len;
+		sync |= IS_SYNC(inode);
+	}
+
+	aligned_dlen = ALIGN(dlen, 8);
+	aligned_ilen = ALIGN(ilen, 8);
+
+	len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
+	/* Make sure to also account for extended attributes */
+	len += host_ui->data_len;
+
+	dent = kmalloc(len, GFP_NOFS);
+	if (!dent)
+		return -ENOMEM;
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, BASEHD, len);
+	if (err)
+		goto out_free;
+
+	if (!xent) {
+		dent->ch.node_type = UBIFS_DENT_NODE;
+		dent_key_init(c, &dent_key, dir->i_ino, nm);
+	} else {
+		dent->ch.node_type = UBIFS_XENT_NODE;
+		xent_key_init(c, &dent_key, dir->i_ino, nm);
+	}
+
+	key_write(c, &dent_key, dent->key);
+	dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino);
+	dent->type = get_dent_type(inode->i_mode);
+	dent->nlen = cpu_to_le16(nm->len);
+	memcpy(dent->name, nm->name, nm->len);
+	dent->name[nm->len] = '\0';
+	zero_dent_node_unused(dent);
+	ubifs_prep_grp_node(c, dent, dlen, 0);
+
+	ino = (void *)dent + aligned_dlen;
+	pack_inode(c, ino, inode, 0);
+	ino = (void *)ino + aligned_ilen;
+	pack_inode(c, ino, dir, 1);
+
+	if (last_reference) {
+		err = ubifs_add_orphan(c, inode->i_ino);
+		if (err) {
+			release_head(c, BASEHD);
+			goto out_finish;
+		}
+		ui->del_cmtno = c->cmt_no;
+	}
+
+	err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
+	if (err)
+		goto out_release;
+	if (!sync) {
+		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+		ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
+		ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino);
+	}
+	release_head(c, BASEHD);
+	kfree(dent);
+
+	if (deletion) {
+		err = ubifs_tnc_remove_nm(c, &dent_key, nm);
+		if (err)
+			goto out_ro;
+		err = ubifs_add_dirt(c, lnum, dlen);
+	} else
+		err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm);
+	if (err)
+		goto out_ro;
+
+	/*
+	 * Note, we do not remove the inode from TNC even if the last reference
+	 * to it has just been deleted, because the inode may still be opened.
+	 * Instead, the inode has been added to orphan lists and the orphan
+	 * subsystem will take further care about it.
+	 */
+	ino_key_init(c, &ino_key, inode->i_ino);
+	ino_offs = dent_offs + aligned_dlen;
+	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen);
+	if (err)
+		goto out_ro;
+
+	ino_key_init(c, &ino_key, dir->i_ino);
+	ino_offs += aligned_ilen;
+	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs,
+			    UBIFS_INO_NODE_SZ + host_ui->data_len);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	spin_lock(&ui->ui_lock);
+	ui->synced_i_size = ui->ui_size;
+	spin_unlock(&ui->ui_lock);
+	mark_inode_clean(c, ui);
+	mark_inode_clean(c, host_ui);
+	return 0;
+
+out_finish:
+	finish_reservation(c);
+out_free:
+	kfree(dent);
+	return err;
+
+out_release:
+	release_head(c, BASEHD);
+	kfree(dent);
+out_ro:
+	ubifs_ro_mode(c, err);
+	if (last_reference)
+		ubifs_delete_orphan(c, inode->i_ino);
+	finish_reservation(c);
+	return err;
+}
+
+/**
+ * ubifs_jnl_write_data - write a data node to the journal.
+ * @c: UBIFS file-system description object
+ * @inode: inode the data node belongs to
+ * @key: node key
+ * @buf: buffer to write
+ * @len: data length (must not exceed %UBIFS_BLOCK_SIZE)
+ *
+ * This function writes a data node to the journal. Returns %0 if the data node
+ * was successfully written, and a negative error code in case of failure.
+ */
+int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
+			 const union ubifs_key *key, const void *buf, int len)
+{
+	struct ubifs_data_node *data;
+	int err, lnum, offs, compr_type, out_len;
+	int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
+		(unsigned long)key_inum(c, key), key_block(c, key), len);
+	ubifs_assert(len <= UBIFS_BLOCK_SIZE);
+
+	data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
+	if (!data) {
+		/*
+		 * Fall-back to the write reserve buffer. Note, we might be
+		 * currently on the memory reclaim path, when the kernel is
+		 * trying to free some memory by writing out dirty pages. The
+		 * write reserve buffer helps us to guarantee that we are
+		 * always able to write the data.
+		 */
+		allocated = 0;
+		mutex_lock(&c->write_reserve_mutex);
+		data = c->write_reserve_buf;
+	}
+
+	data->ch.node_type = UBIFS_DATA_NODE;
+	key_write(c, key, &data->key);
+	data->size = cpu_to_le32(len);
+	zero_data_node_unused(data);
+
+	if (!(ui->flags & UBIFS_COMPR_FL))
+		/* Compression is disabled for this inode */
+		compr_type = UBIFS_COMPR_NONE;
+	else
+		compr_type = ui->compr_type;
+
+	out_len = dlen - UBIFS_DATA_NODE_SZ;
+	ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type);
+	ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
+
+	dlen = UBIFS_DATA_NODE_SZ + out_len;
+	data->compr_type = cpu_to_le16(compr_type);
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, DATAHD, dlen);
+	if (err)
+		goto out_free;
+
+	err = write_node(c, DATAHD, data, dlen, &lnum, &offs);
+	if (err)
+		goto out_release;
+	ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key));
+	release_head(c, DATAHD);
+
+	err = ubifs_tnc_add(c, key, lnum, offs, dlen);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	if (!allocated)
+		mutex_unlock(&c->write_reserve_mutex);
+	else
+		kfree(data);
+	return 0;
+
+out_release:
+	release_head(c, DATAHD);
+out_ro:
+	ubifs_ro_mode(c, err);
+	finish_reservation(c);
+out_free:
+	if (!allocated)
+		mutex_unlock(&c->write_reserve_mutex);
+	else
+		kfree(data);
+	return err;
+}
+
+/**
+ * ubifs_jnl_write_inode - flush inode to the journal.
+ * @c: UBIFS file-system description object
+ * @inode: inode to flush
+ *
+ * This function writes inode @inode to the journal. If the inode is
+ * synchronous, it also synchronizes the write-buffer. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
+{
+	int err, lnum, offs;
+	struct ubifs_ino_node *ino;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink;
+
+	dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
+
+	/*
+	 * If the inode is being deleted, do not write the attached data. No
+	 * need to synchronize the write-buffer either.
+	 */
+	if (!last_reference) {
+		len += ui->data_len;
+		sync = IS_SYNC(inode);
+	}
+	ino = kmalloc(len, GFP_NOFS);
+	if (!ino)
+		return -ENOMEM;
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, BASEHD, len);
+	if (err)
+		goto out_free;
+
+	pack_inode(c, ino, inode, 1);
+	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
+	if (err)
+		goto out_release;
+	if (!sync)
+		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+					  inode->i_ino);
+	release_head(c, BASEHD);
+
+	if (last_reference) {
+		err = ubifs_tnc_remove_ino(c, inode->i_ino);
+		if (err)
+			goto out_ro;
+		ubifs_delete_orphan(c, inode->i_ino);
+		err = ubifs_add_dirt(c, lnum, len);
+	} else {
+		union ubifs_key key;
+
+		ino_key_init(c, &key, inode->i_ino);
+		err = ubifs_tnc_add(c, &key, lnum, offs, len);
+	}
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	spin_lock(&ui->ui_lock);
+	ui->synced_i_size = ui->ui_size;
+	spin_unlock(&ui->ui_lock);
+	kfree(ino);
+	return 0;
+
+out_release:
+	release_head(c, BASEHD);
+out_ro:
+	ubifs_ro_mode(c, err);
+	finish_reservation(c);
+out_free:
+	kfree(ino);
+	return err;
+}
+
+/**
+ * ubifs_jnl_delete_inode - delete an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to delete
+ *
+ * This function deletes inode @inode which includes removing it from orphans,
+ * deleting it from TNC and, in some cases, writing a deletion inode to the
+ * journal.
+ *
+ * When regular file inodes are unlinked or a directory inode is removed, the
+ * 'ubifs_jnl_update()' function writes a corresponding deletion inode and
+ * direntry to the media, and adds the inode to orphans. After this, when the
+ * last reference to this inode has been dropped, this function is called. In
+ * general, it has to write one more deletion inode to the media, because if
+ * a commit happened between 'ubifs_jnl_update()' and
+ * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal
+ * anymore, and in fact it might not be on the flash anymore, because it might
+ * have been garbage-collected already. And for optimization reasons UBIFS does
+ * not read the orphan area if it has been unmounted cleanly, so it would have
+ * no indication in the journal that there is a deleted inode which has to be
+ * removed from TNC.
+ *
+ * However, if there was no commit between 'ubifs_jnl_update()' and
+ * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion
+ * inode to the media for the second time. And this is quite a typical case.
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
+{
+	int err;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	ubifs_assert(inode->i_nlink == 0);
+
+	if (ui->del_cmtno != c->cmt_no)
+		/* A commit happened for sure */
+		return ubifs_jnl_write_inode(c, inode);
+
+	down_read(&c->commit_sem);
+	/*
+	 * Check commit number again, because the first test has been done
+	 * without @c->commit_sem, so a commit might have happened.
+	 */
+	if (ui->del_cmtno != c->cmt_no) {
+		up_read(&c->commit_sem);
+		return ubifs_jnl_write_inode(c, inode);
+	}
+
+	err = ubifs_tnc_remove_ino(c, inode->i_ino);
+	if (err)
+		ubifs_ro_mode(c, err);
+	else
+		ubifs_delete_orphan(c, inode->i_ino);
+	up_read(&c->commit_sem);
+	return err;
+}
+
+/**
+ * ubifs_jnl_rename - rename a directory entry.
+ * @c: UBIFS file-system description object
+ * @old_dir: parent inode of directory entry to rename
+ * @old_dentry: directory entry to rename
+ * @new_dir: parent inode of directory entry to rename
+ * @new_dentry: new directory entry (or directory entry to replace)
+ * @sync: non-zero if the write-buffer has to be synchronized
+ *
+ * This function implements the re-name operation which may involve writing up
+ * to 3 inodes and 2 directory entries. It marks the written inodes as clean
+ * and returns zero on success. In case of failure, a negative error code is
+ * returned.
+ */
+int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
+		     const struct dentry *old_dentry,
+		     const struct inode *new_dir,
+		     const struct dentry *new_dentry, int sync)
+{
+	void *p;
+	union ubifs_key key;
+	struct ubifs_dent_node *dent, *dent2;
+	int err, dlen1, dlen2, ilen, lnum, offs, len;
+	const struct inode *old_inode = d_inode(old_dentry);
+	const struct inode *new_inode = d_inode(new_dentry);
+	int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
+	int last_reference = !!(new_inode && new_inode->i_nlink == 0);
+	int move = (old_dir != new_dir);
+	struct ubifs_inode *uninitialized_var(new_ui);
+
+	dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu",
+		old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino);
+	ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
+	ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
+	ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
+	ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));
+
+	dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
+	dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
+	if (new_inode) {
+		new_ui = ubifs_inode(new_inode);
+		ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
+		ilen = UBIFS_INO_NODE_SZ;
+		if (!last_reference)
+			ilen += new_ui->data_len;
+	} else
+		ilen = 0;
+
+	aligned_dlen1 = ALIGN(dlen1, 8);
+	aligned_dlen2 = ALIGN(dlen2, 8);
+	len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
+	if (old_dir != new_dir)
+		len += plen;
+	dent = kmalloc(len, GFP_NOFS);
+	if (!dent)
+		return -ENOMEM;
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, BASEHD, len);
+	if (err)
+		goto out_free;
+
+	/* Make new dent */
+	dent->ch.node_type = UBIFS_DENT_NODE;
+	dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
+	dent->inum = cpu_to_le64(old_inode->i_ino);
+	dent->type = get_dent_type(old_inode->i_mode);
+	dent->nlen = cpu_to_le16(new_dentry->d_name.len);
+	memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
+	dent->name[new_dentry->d_name.len] = '\0';
+	zero_dent_node_unused(dent);
+	ubifs_prep_grp_node(c, dent, dlen1, 0);
+
+	/* Make deletion dent */
+	dent2 = (void *)dent + aligned_dlen1;
+	dent2->ch.node_type = UBIFS_DENT_NODE;
+	dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
+			    &old_dentry->d_name);
+	dent2->inum = 0;
+	dent2->type = DT_UNKNOWN;
+	dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
+	memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
+	dent2->name[old_dentry->d_name.len] = '\0';
+	zero_dent_node_unused(dent2);
+	ubifs_prep_grp_node(c, dent2, dlen2, 0);
+
+	p = (void *)dent2 + aligned_dlen2;
+	if (new_inode) {
+		pack_inode(c, p, new_inode, 0);
+		p += ALIGN(ilen, 8);
+	}
+
+	if (!move)
+		pack_inode(c, p, old_dir, 1);
+	else {
+		pack_inode(c, p, old_dir, 0);
+		p += ALIGN(plen, 8);
+		pack_inode(c, p, new_dir, 1);
+	}
+
+	if (last_reference) {
+		err = ubifs_add_orphan(c, new_inode->i_ino);
+		if (err) {
+			release_head(c, BASEHD);
+			goto out_finish;
+		}
+		new_ui->del_cmtno = c->cmt_no;
+	}
+
+	err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
+	if (err)
+		goto out_release;
+	if (!sync) {
+		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+		ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino);
+		ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino);
+		if (new_inode)
+			ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+						  new_inode->i_ino);
+	}
+	release_head(c, BASEHD);
+
+	dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
+	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
+	if (err)
+		goto out_ro;
+
+	err = ubifs_add_dirt(c, lnum, dlen2);
+	if (err)
+		goto out_ro;
+
+	dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
+	err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
+	if (err)
+		goto out_ro;
+
+	offs += aligned_dlen1 + aligned_dlen2;
+	if (new_inode) {
+		ino_key_init(c, &key, new_inode->i_ino);
+		err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
+		if (err)
+			goto out_ro;
+		offs += ALIGN(ilen, 8);
+	}
+
+	ino_key_init(c, &key, old_dir->i_ino);
+	err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+	if (err)
+		goto out_ro;
+
+	if (old_dir != new_dir) {
+		offs += ALIGN(plen, 8);
+		ino_key_init(c, &key, new_dir->i_ino);
+		err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+		if (err)
+			goto out_ro;
+	}
+
+	finish_reservation(c);
+	if (new_inode) {
+		mark_inode_clean(c, new_ui);
+		spin_lock(&new_ui->ui_lock);
+		new_ui->synced_i_size = new_ui->ui_size;
+		spin_unlock(&new_ui->ui_lock);
+	}
+	mark_inode_clean(c, ubifs_inode(old_dir));
+	if (move)
+		mark_inode_clean(c, ubifs_inode(new_dir));
+	kfree(dent);
+	return 0;
+
+out_release:
+	release_head(c, BASEHD);
+out_ro:
+	ubifs_ro_mode(c, err);
+	if (last_reference)
+		ubifs_delete_orphan(c, new_inode->i_ino);
+out_finish:
+	finish_reservation(c);
+out_free:
+	kfree(dent);
+	return err;
+}
+
+/**
+ * recomp_data_node - re-compress a truncated data node.
+ * @dn: data node to re-compress
+ * @new_len: new length
+ *
+ * This function is used when an inode is truncated and the last data node of
+ * the inode has to be re-compressed and re-written.
+ */
+static int recomp_data_node(const struct ubifs_info *c,
+			    struct ubifs_data_node *dn, int *new_len)
+{
+	void *buf;
+	int err, len, compr_type, out_len;
+
+	out_len = le32_to_cpu(dn->size);
+	buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
+	len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+	compr_type = le16_to_cpu(dn->compr_type);
+	err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type);
+	if (err)
+		goto out;
+
+	ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type);
+	ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
+	dn->compr_type = cpu_to_le16(compr_type);
+	dn->size = cpu_to_le32(*new_len);
+	*new_len = UBIFS_DATA_NODE_SZ + out_len;
+out:
+	kfree(buf);
+	return err;
+}
+
+/**
+ * ubifs_jnl_truncate - update the journal for a truncation.
+ * @c: UBIFS file-system description object
+ * @inode: inode to truncate
+ * @old_size: old size
+ * @new_size: new size
+ *
+ * When the size of a file decreases due to truncation, a truncation node is
+ * written, the journal tree is updated, and the last data block is re-written
+ * if it has been affected. The inode is also updated in order to synchronize
+ * the new inode size.
+ *
+ * This function marks the inode as clean and returns zero on success. In case
+ * of failure, a negative error code is returned.
+ */
+int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
+		       loff_t old_size, loff_t new_size)
+{
+	union ubifs_key key, to_key;
+	struct ubifs_ino_node *ino;
+	struct ubifs_trun_node *trun;
+	struct ubifs_data_node *uninitialized_var(dn);
+	int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	ino_t inum = inode->i_ino;
+	unsigned int blk;
+
+	dbg_jnl("ino %lu, size %lld -> %lld",
+		(unsigned long)inum, old_size, new_size);
+	ubifs_assert(!ui->data_len);
+	ubifs_assert(S_ISREG(inode->i_mode));
+	ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+
+	sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
+	     UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
+	ino = kmalloc(sz, GFP_NOFS);
+	if (!ino)
+		return -ENOMEM;
+
+	trun = (void *)ino + UBIFS_INO_NODE_SZ;
+	trun->ch.node_type = UBIFS_TRUN_NODE;
+	trun->inum = cpu_to_le32(inum);
+	trun->old_size = cpu_to_le64(old_size);
+	trun->new_size = cpu_to_le64(new_size);
+	zero_trun_node_unused(trun);
+
+	dlen = new_size & (UBIFS_BLOCK_SIZE - 1);
+	if (dlen) {
+		/* Get last data block so it can be truncated */
+		dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
+		blk = new_size >> UBIFS_BLOCK_SHIFT;
+		data_key_init(c, &key, inum, blk);
+		dbg_jnlk(&key, "last block key ");
+		err = ubifs_tnc_lookup(c, &key, dn);
+		if (err == -ENOENT)
+			dlen = 0; /* Not found (so it is a hole) */
+		else if (err)
+			goto out_free;
+		else {
+			if (le32_to_cpu(dn->size) <= dlen)
+				dlen = 0; /* Nothing to do */
+			else {
+				int compr_type = le16_to_cpu(dn->compr_type);
+
+				if (compr_type != UBIFS_COMPR_NONE) {
+					err = recomp_data_node(c, dn, &dlen);
+					if (err)
+						goto out_free;
+				} else {
+					dn->size = cpu_to_le32(dlen);
+					dlen += UBIFS_DATA_NODE_SZ;
+				}
+				zero_data_node_unused(dn);
+			}
+		}
+	}
+
+	/* Must make reservation before allocating sequence numbers */
+	len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
+	if (dlen)
+		len += dlen;
+	err = make_reservation(c, BASEHD, len);
+	if (err)
+		goto out_free;
+
+	pack_inode(c, ino, inode, 0);
+	ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
+	if (dlen)
+		ubifs_prep_grp_node(c, dn, dlen, 1);
+
+	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
+	if (err)
+		goto out_release;
+	if (!sync)
+		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
+	release_head(c, BASEHD);
+
+	if (dlen) {
+		sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
+		err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
+		if (err)
+			goto out_ro;
+	}
+
+	ino_key_init(c, &key, inum);
+	err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
+	if (err)
+		goto out_ro;
+
+	err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ);
+	if (err)
+		goto out_ro;
+
+	bit = new_size & (UBIFS_BLOCK_SIZE - 1);
+	blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0);
+	data_key_init(c, &key, inum, blk);
+
+	bit = old_size & (UBIFS_BLOCK_SIZE - 1);
+	blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
+	data_key_init(c, &to_key, inum, blk);
+
+	err = ubifs_tnc_remove_range(c, &key, &to_key);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	spin_lock(&ui->ui_lock);
+	ui->synced_i_size = ui->ui_size;
+	spin_unlock(&ui->ui_lock);
+	mark_inode_clean(c, ui);
+	kfree(ino);
+	return 0;
+
+out_release:
+	release_head(c, BASEHD);
+out_ro:
+	ubifs_ro_mode(c, err);
+	finish_reservation(c);
+out_free:
+	kfree(ino);
+	return err;
+}
+
+
+/**
+ * ubifs_jnl_delete_xattr - delete an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @nm: extended attribute entry name
+ *
+ * This function delete an extended attribute which is very similar to
+ * un-linking regular files - it writes a deletion xentry, a deletion inode and
+ * updates the target inode. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
+			   const struct inode *inode, const struct qstr *nm)
+{
+	int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
+	struct ubifs_dent_node *xent;
+	struct ubifs_ino_node *ino;
+	union ubifs_key xent_key, key1, key2;
+	int sync = IS_DIRSYNC(host);
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+
+	dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
+		host->i_ino, inode->i_ino, nm->name,
+		ubifs_inode(inode)->data_len);
+	ubifs_assert(inode->i_nlink == 0);
+	ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
+
+	/*
+	 * Since we are deleting the inode, we do not bother to attach any data
+	 * to it and assume its length is %UBIFS_INO_NODE_SZ.
+	 */
+	xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+	aligned_xlen = ALIGN(xlen, 8);
+	hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
+	len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
+
+	xent = kmalloc(len, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, BASEHD, len);
+	if (err) {
+		kfree(xent);
+		return err;
+	}
+
+	xent->ch.node_type = UBIFS_XENT_NODE;
+	xent_key_init(c, &xent_key, host->i_ino, nm);
+	key_write(c, &xent_key, xent->key);
+	xent->inum = 0;
+	xent->type = get_dent_type(inode->i_mode);
+	xent->nlen = cpu_to_le16(nm->len);
+	memcpy(xent->name, nm->name, nm->len);
+	xent->name[nm->len] = '\0';
+	zero_dent_node_unused(xent);
+	ubifs_prep_grp_node(c, xent, xlen, 0);
+
+	ino = (void *)xent + aligned_xlen;
+	pack_inode(c, ino, inode, 0);
+	ino = (void *)ino + UBIFS_INO_NODE_SZ;
+	pack_inode(c, ino, host, 1);
+
+	err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
+	if (!sync && !err)
+		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
+	release_head(c, BASEHD);
+	kfree(xent);
+	if (err)
+		goto out_ro;
+
+	/* Remove the extended attribute entry from TNC */
+	err = ubifs_tnc_remove_nm(c, &xent_key, nm);
+	if (err)
+		goto out_ro;
+	err = ubifs_add_dirt(c, lnum, xlen);
+	if (err)
+		goto out_ro;
+
+	/*
+	 * Remove all nodes belonging to the extended attribute inode from TNC.
+	 * Well, there actually must be only one node - the inode itself.
+	 */
+	lowest_ino_key(c, &key1, inode->i_ino);
+	highest_ino_key(c, &key2, inode->i_ino);
+	err = ubifs_tnc_remove_range(c, &key1, &key2);
+	if (err)
+		goto out_ro;
+	err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ);
+	if (err)
+		goto out_ro;
+
+	/* And update TNC with the new host inode position */
+	ino_key_init(c, &key1, host->i_ino);
+	err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	spin_lock(&host_ui->ui_lock);
+	host_ui->synced_i_size = host_ui->ui_size;
+	spin_unlock(&host_ui->ui_lock);
+	mark_inode_clean(c, host_ui);
+	return 0;
+
+out_ro:
+	ubifs_ro_mode(c, err);
+	finish_reservation(c);
+	return err;
+}
+
+/**
+ * ubifs_jnl_change_xattr - change an extended attribute.
+ * @c: UBIFS file-system description object
+ * @inode: extended attribute inode
+ * @host: host inode
+ *
+ * This function writes the updated version of an extended attribute inode and
+ * the host inode to the journal (to the base head). The host inode is written
+ * after the extended attribute inode in order to guarantee that the extended
+ * attribute will be flushed when the inode is synchronized by 'fsync()' and
+ * consequently, the write-buffer is synchronized. This function returns zero
+ * in case of success and a negative error code in case of failure.
+ */
+int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
+			   const struct inode *host)
+{
+	int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_ino_node *ino;
+	union ubifs_key key;
+	int sync = IS_DIRSYNC(host);
+
+	dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
+	ubifs_assert(host->i_nlink > 0);
+	ubifs_assert(inode->i_nlink > 0);
+	ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
+
+	len1 = UBIFS_INO_NODE_SZ + host_ui->data_len;
+	len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len;
+	aligned_len1 = ALIGN(len1, 8);
+	aligned_len = aligned_len1 + ALIGN(len2, 8);
+
+	ino = kmalloc(aligned_len, GFP_NOFS);
+	if (!ino)
+		return -ENOMEM;
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, BASEHD, aligned_len);
+	if (err)
+		goto out_free;
+
+	pack_inode(c, ino, host, 0);
+	pack_inode(c, (void *)ino + aligned_len1, inode, 1);
+
+	err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
+	if (!sync && !err) {
+		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+		ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino);
+		ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
+	}
+	release_head(c, BASEHD);
+	if (err)
+		goto out_ro;
+
+	ino_key_init(c, &key, host->i_ino);
+	err = ubifs_tnc_add(c, &key, lnum, offs, len1);
+	if (err)
+		goto out_ro;
+
+	ino_key_init(c, &key, inode->i_ino);
+	err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	spin_lock(&host_ui->ui_lock);
+	host_ui->synced_i_size = host_ui->ui_size;
+	spin_unlock(&host_ui->ui_lock);
+	mark_inode_clean(c, host_ui);
+	kfree(ino);
+	return 0;
+
+out_ro:
+	ubifs_ro_mode(c, err);
+	finish_reservation(c);
+out_free:
+	kfree(ino);
+	return err;
+}
+
diff --git a/kernel/fs/ubifs/key.h b/kernel/fs/ubifs/key.h
new file mode 100644
index 000000000..92a8491a8
--- /dev/null
+++ b/kernel/fs/ubifs/key.h
@@ -0,0 +1,548 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This header contains various key-related definitions and helper function.
+ * UBIFS allows several key schemes, so we access key fields only via these
+ * helpers. At the moment only one key scheme is supported.
+ *
+ * Simple key scheme
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * Keys are 64-bits long. First 32-bits are inode number (parent inode number
+ * in case of direntry key). Next 3 bits are node type. The last 29 bits are
+ * 4KiB offset in case of inode node, and direntry hash in case of a direntry
+ * node. We use "r5" hash borrowed from reiserfs.
+ */
+
+#ifndef __UBIFS_KEY_H__
+#define __UBIFS_KEY_H__
+
+/**
+ * key_mask_hash - mask a valid hash value.
+ * @val: value to be masked
+ *
+ * We use hash values as offset in directories, so values %0 and %1 are
+ * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This
+ * function makes sure the reserved values are not used.
+ */
+static inline uint32_t key_mask_hash(uint32_t hash)
+{
+	hash &= UBIFS_S_KEY_HASH_MASK;
+	if (unlikely(hash <= 2))
+		hash += 3;
+	return hash;
+}
+
+/**
+ * key_r5_hash - R5 hash function (borrowed from reiserfs).
+ * @s: direntry name
+ * @len: name length
+ */
+static inline uint32_t key_r5_hash(const char *s, int len)
+{
+	uint32_t a = 0;
+	const signed char *str = (const signed char *)s;
+
+	while (*str) {
+		a += *str << 4;
+		a += *str >> 4;
+		a *= 11;
+		str++;
+	}
+
+	return key_mask_hash(a);
+}
+
+/**
+ * key_test_hash - testing hash function.
+ * @str: direntry name
+ * @len: name length
+ */
+static inline uint32_t key_test_hash(const char *str, int len)
+{
+	uint32_t a = 0;
+
+	len = min_t(uint32_t, len, 4);
+	memcpy(&a, str, len);
+	return key_mask_hash(a);
+}
+
+/**
+ * ino_key_init - initialize inode key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ */
+static inline void ino_key_init(const struct ubifs_info *c,
+				union ubifs_key *key, ino_t inum)
+{
+	key->u32[0] = inum;
+	key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * ino_key_init_flash - initialize on-flash inode key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: inode number
+ */
+static inline void ino_key_init_flash(const struct ubifs_info *c, void *k,
+				      ino_t inum)
+{
+	union ubifs_key *key = k;
+
+	key->j32[0] = cpu_to_le32(inum);
+	key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS);
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * lowest_ino_key - get the lowest possible inode key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ */
+static inline void lowest_ino_key(const struct ubifs_info *c,
+				union ubifs_key *key, ino_t inum)
+{
+	key->u32[0] = inum;
+	key->u32[1] = 0;
+}
+
+/**
+ * highest_ino_key - get the highest possible inode key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ */
+static inline void highest_ino_key(const struct ubifs_info *c,
+				union ubifs_key *key, ino_t inum)
+{
+	key->u32[0] = inum;
+	key->u32[1] = 0xffffffff;
+}
+
+/**
+ * dent_key_init - initialize directory entry key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: parent inode number
+ * @nm: direntry name and length
+ */
+static inline void dent_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum,
+				 const struct qstr *nm)
+{
+	uint32_t hash = c->key_hash(nm->name, nm->len);
+
+	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+	key->u32[0] = inum;
+	key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
+}
+
+/**
+ * dent_key_init_hash - initialize directory entry key without re-calculating
+ *                      hash function.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: parent inode number
+ * @hash: direntry name hash
+ */
+static inline void dent_key_init_hash(const struct ubifs_info *c,
+				      union ubifs_key *key, ino_t inum,
+				      uint32_t hash)
+{
+	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+	key->u32[0] = inum;
+	key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
+}
+
+/**
+ * dent_key_init_flash - initialize on-flash directory entry key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: parent inode number
+ * @nm: direntry name and length
+ */
+static inline void dent_key_init_flash(const struct ubifs_info *c, void *k,
+				       ino_t inum, const struct qstr *nm)
+{
+	union ubifs_key *key = k;
+	uint32_t hash = c->key_hash(nm->name, nm->len);
+
+	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+	key->j32[0] = cpu_to_le32(inum);
+	key->j32[1] = cpu_to_le32(hash |
+				  (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS));
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * lowest_dent_key - get the lowest possible directory entry key.
+ * @c: UBIFS file-system description object
+ * @key: where to store the lowest key
+ * @inum: parent inode number
+ */
+static inline void lowest_dent_key(const struct ubifs_info *c,
+				   union ubifs_key *key, ino_t inum)
+{
+	key->u32[0] = inum;
+	key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS;
+}
+
+/**
+ * xent_key_init - initialize extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: host inode number
+ * @nm: extended attribute entry name and length
+ */
+static inline void xent_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum,
+				 const struct qstr *nm)
+{
+	uint32_t hash = c->key_hash(nm->name, nm->len);
+
+	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+	key->u32[0] = inum;
+	key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
+}
+
+/**
+ * xent_key_init_flash - initialize on-flash extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: host inode number
+ * @nm: extended attribute entry name and length
+ */
+static inline void xent_key_init_flash(const struct ubifs_info *c, void *k,
+				       ino_t inum, const struct qstr *nm)
+{
+	union ubifs_key *key = k;
+	uint32_t hash = c->key_hash(nm->name, nm->len);
+
+	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+	key->j32[0] = cpu_to_le32(inum);
+	key->j32[1] = cpu_to_le32(hash |
+				  (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS));
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * lowest_xent_key - get the lowest possible extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @key: where to store the lowest key
+ * @inum: host inode number
+ */
+static inline void lowest_xent_key(const struct ubifs_info *c,
+				   union ubifs_key *key, ino_t inum)
+{
+	key->u32[0] = inum;
+	key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+}
+
+/**
+ * data_key_init - initialize data key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ * @block: block number
+ */
+static inline void data_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum,
+				 unsigned int block)
+{
+	ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
+	key->u32[0] = inum;
+	key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS);
+}
+
+/**
+ * highest_data_key - get the highest possible data key for an inode.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ */
+static inline void highest_data_key(const struct ubifs_info *c,
+				   union ubifs_key *key, ino_t inum)
+{
+	data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK);
+}
+
+/**
+ * trun_key_init - initialize truncation node key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ *
+ * Note, UBIFS does not have truncation keys on the media and this function is
+ * only used for purposes of replay.
+ */
+static inline void trun_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum)
+{
+	key->u32[0] = inum;
+	key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * invalid_key_init - initialize invalid node key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ *
+ * This is a helper function which marks a @key object as invalid.
+ */
+static inline void invalid_key_init(const struct ubifs_info *c,
+				    union ubifs_key *key)
+{
+	key->u32[0] = 0xDEADBEAF;
+	key->u32[1] = UBIFS_INVALID_KEY;
+}
+
+/**
+ * key_type - get key type.
+ * @c: UBIFS file-system description object
+ * @key: key to get type of
+ */
+static inline int key_type(const struct ubifs_info *c,
+			   const union ubifs_key *key)
+{
+	return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * key_type_flash - get type of a on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: key to get type of
+ */
+static inline int key_type_flash(const struct ubifs_info *c, const void *k)
+{
+	const union ubifs_key *key = k;
+
+	return le32_to_cpu(key->j32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * key_inum - fetch inode number from key.
+ * @c: UBIFS file-system description object
+ * @k: key to fetch inode number from
+ */
+static inline ino_t key_inum(const struct ubifs_info *c, const void *k)
+{
+	const union ubifs_key *key = k;
+
+	return key->u32[0];
+}
+
+/**
+ * key_inum_flash - fetch inode number from an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: key to fetch inode number from
+ */
+static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
+{
+	const union ubifs_key *key = k;
+
+	return le32_to_cpu(key->j32[0]);
+}
+
+/**
+ * key_hash - get directory entry hash.
+ * @c: UBIFS file-system description object
+ * @key: the key to get hash from
+ */
+static inline uint32_t key_hash(const struct ubifs_info *c,
+				const union ubifs_key *key)
+{
+	return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
+}
+
+/**
+ * key_hash_flash - get directory entry hash from an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: the key to get hash from
+ */
+static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
+{
+	const union ubifs_key *key = k;
+
+	return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK;
+}
+
+/**
+ * key_block - get data block number.
+ * @c: UBIFS file-system description object
+ * @key: the key to get the block number from
+ */
+static inline unsigned int key_block(const struct ubifs_info *c,
+				     const union ubifs_key *key)
+{
+	return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK;
+}
+
+/**
+ * key_block_flash - get data block number from an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: the key to get the block number from
+ */
+static inline unsigned int key_block_flash(const struct ubifs_info *c,
+					   const void *k)
+{
+	const union ubifs_key *key = k;
+
+	return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_BLOCK_MASK;
+}
+
+/**
+ * key_read - transform a key to in-memory format.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ */
+static inline void key_read(const struct ubifs_info *c, const void *from,
+			    union ubifs_key *to)
+{
+	const union ubifs_key *f = from;
+
+	to->u32[0] = le32_to_cpu(f->j32[0]);
+	to->u32[1] = le32_to_cpu(f->j32[1]);
+}
+
+/**
+ * key_write - transform a key from in-memory format.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ */
+static inline void key_write(const struct ubifs_info *c,
+			     const union ubifs_key *from, void *to)
+{
+	union ubifs_key *t = to;
+
+	t->j32[0] = cpu_to_le32(from->u32[0]);
+	t->j32[1] = cpu_to_le32(from->u32[1]);
+	memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * key_write_idx - transform a key from in-memory format for the index.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ */
+static inline void key_write_idx(const struct ubifs_info *c,
+				 const union ubifs_key *from, void *to)
+{
+	union ubifs_key *t = to;
+
+	t->j32[0] = cpu_to_le32(from->u32[0]);
+	t->j32[1] = cpu_to_le32(from->u32[1]);
+}
+
+/**
+ * key_copy - copy a key.
+ * @c: UBIFS file-system description object
+ * @from: the key to copy from
+ * @to: the key to copy to
+ */
+static inline void key_copy(const struct ubifs_info *c,
+			    const union ubifs_key *from, union ubifs_key *to)
+{
+	to->u64[0] = from->u64[0];
+}
+
+/**
+ * keys_cmp - compare keys.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * This function compares 2 keys and returns %-1 if @key1 is less than
+ * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2.
+ */
+static inline int keys_cmp(const struct ubifs_info *c,
+			   const union ubifs_key *key1,
+			   const union ubifs_key *key2)
+{
+	if (key1->u32[0] < key2->u32[0])
+		return -1;
+	if (key1->u32[0] > key2->u32[0])
+		return 1;
+	if (key1->u32[1] < key2->u32[1])
+		return -1;
+	if (key1->u32[1] > key2->u32[1])
+		return 1;
+
+	return 0;
+}
+
+/**
+ * keys_eq - determine if keys are equivalent.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and
+ * %0 if not.
+ */
+static inline int keys_eq(const struct ubifs_info *c,
+			  const union ubifs_key *key1,
+			  const union ubifs_key *key2)
+{
+	if (key1->u32[0] != key2->u32[0])
+		return 0;
+	if (key1->u32[1] != key2->u32[1])
+		return 0;
+	return 1;
+}
+
+/**
+ * is_hash_key - is a key vulnerable to hash collisions.
+ * @c: UBIFS file-system description object
+ * @key: key
+ *
+ * This function returns %1 if @key is a hashed key or %0 otherwise.
+ */
+static inline int is_hash_key(const struct ubifs_info *c,
+			      const union ubifs_key *key)
+{
+	int type = key_type(c, key);
+
+	return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY;
+}
+
+/**
+ * key_max_inode_size - get maximum file size allowed by current key format.
+ * @c: UBIFS file-system description object
+ */
+static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
+{
+	switch (c->key_fmt) {
+	case UBIFS_SIMPLE_KEY_FMT:
+		return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE;
+	default:
+		return 0;
+	}
+}
+
+#endif /* !__UBIFS_KEY_H__ */
diff --git a/kernel/fs/ubifs/log.c b/kernel/fs/ubifs/log.c
new file mode 100644
index 000000000..8c795e639
--- /dev/null
+++ b/kernel/fs/ubifs/log.c
@@ -0,0 +1,753 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file is a part of UBIFS journal implementation and contains various
+ * functions which manipulate the log. The log is a fixed area on the flash
+ * which does not contain any data but refers to buds. The log is a part of the
+ * journal.
+ */
+
+#include "ubifs.h"
+
+static int dbg_check_bud_bytes(struct ubifs_info *c);
+
+/**
+ * ubifs_search_bud - search bud LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number to search
+ *
+ * This function searches bud LEB @lnum. Returns bud description object in case
+ * of success and %NULL if there is no bud with this LEB number.
+ */
+struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum)
+{
+	struct rb_node *p;
+	struct ubifs_bud *bud;
+
+	spin_lock(&c->buds_lock);
+	p = c->buds.rb_node;
+	while (p) {
+		bud = rb_entry(p, struct ubifs_bud, rb);
+		if (lnum < bud->lnum)
+			p = p->rb_left;
+		else if (lnum > bud->lnum)
+			p = p->rb_right;
+		else {
+			spin_unlock(&c->buds_lock);
+			return bud;
+		}
+	}
+	spin_unlock(&c->buds_lock);
+	return NULL;
+}
+
+/**
+ * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number to search
+ *
+ * This functions returns the wbuf for @lnum or %NULL if there is not one.
+ */
+struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
+{
+	struct rb_node *p;
+	struct ubifs_bud *bud;
+	int jhead;
+
+	if (!c->jheads)
+		return NULL;
+
+	spin_lock(&c->buds_lock);
+	p = c->buds.rb_node;
+	while (p) {
+		bud = rb_entry(p, struct ubifs_bud, rb);
+		if (lnum < bud->lnum)
+			p = p->rb_left;
+		else if (lnum > bud->lnum)
+			p = p->rb_right;
+		else {
+			jhead = bud->jhead;
+			spin_unlock(&c->buds_lock);
+			return &c->jheads[jhead].wbuf;
+		}
+	}
+	spin_unlock(&c->buds_lock);
+	return NULL;
+}
+
+/**
+ * empty_log_bytes - calculate amount of empty space in the log.
+ * @c: UBIFS file-system description object
+ */
+static inline long long empty_log_bytes(const struct ubifs_info *c)
+{
+	long long h, t;
+
+	h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
+	t = (long long)c->ltail_lnum * c->leb_size;
+
+	if (h > t)
+		return c->log_bytes - h + t;
+	else if (h != t)
+		return t - h;
+	else if (c->lhead_lnum != c->ltail_lnum)
+		return 0;
+	else
+		return c->log_bytes;
+}
+
+/**
+ * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list.
+ * @c: UBIFS file-system description object
+ * @bud: the bud to add
+ */
+void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+	struct rb_node **p, *parent = NULL;
+	struct ubifs_bud *b;
+	struct ubifs_jhead *jhead;
+
+	spin_lock(&c->buds_lock);
+	p = &c->buds.rb_node;
+	while (*p) {
+		parent = *p;
+		b = rb_entry(parent, struct ubifs_bud, rb);
+		ubifs_assert(bud->lnum != b->lnum);
+		if (bud->lnum < b->lnum)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&bud->rb, parent, p);
+	rb_insert_color(&bud->rb, &c->buds);
+	if (c->jheads) {
+		jhead = &c->jheads[bud->jhead];
+		list_add_tail(&bud->list, &jhead->buds_list);
+	} else
+		ubifs_assert(c->replaying && c->ro_mount);
+
+	/*
+	 * Note, although this is a new bud, we anyway account this space now,
+	 * before any data has been written to it, because this is about to
+	 * guarantee fixed mount time, and this bud will anyway be read and
+	 * scanned.
+	 */
+	c->bud_bytes += c->leb_size - bud->start;
+
+	dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum,
+		bud->start, dbg_jhead(bud->jhead), c->bud_bytes);
+	spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_add_bud_to_log - add a new bud to the log.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head the bud belongs to
+ * @lnum: LEB number of the bud
+ * @offs: starting offset of the bud
+ *
+ * This function writes reference node for the new bud LEB @lnum it to the log,
+ * and adds it to the buds tress. It also makes sure that log size does not
+ * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success,
+ * %-EAGAIN if commit is required, and a negative error codes in case of
+ * failure.
+ */
+int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
+{
+	int err;
+	struct ubifs_bud *bud;
+	struct ubifs_ref_node *ref;
+
+	bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS);
+	if (!bud)
+		return -ENOMEM;
+	ref = kzalloc(c->ref_node_alsz, GFP_NOFS);
+	if (!ref) {
+		kfree(bud);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&c->log_mutex);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error) {
+		err = -EROFS;
+		goto out_unlock;
+	}
+
+	/* Make sure we have enough space in the log */
+	if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) {
+		dbg_log("not enough log space - %lld, required %d",
+			empty_log_bytes(c), c->min_log_bytes);
+		ubifs_commit_required(c);
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	/*
+	 * Make sure the amount of space in buds will not exceed the
+	 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
+	 * limits.
+	 *
+	 * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes
+	 * because we are holding @c->log_mutex. All @c->bud_bytes take place
+	 * when both @c->log_mutex and @c->bud_bytes are locked.
+	 */
+	if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) {
+		dbg_log("bud bytes %lld (%lld max), require commit",
+			c->bud_bytes, c->max_bud_bytes);
+		ubifs_commit_required(c);
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	/*
+	 * If the journal is full enough - start background commit. Note, it is
+	 * OK to read 'c->cmt_state' without spinlock because integer reads
+	 * are atomic in the kernel.
+	 */
+	if (c->bud_bytes >= c->bg_bud_bytes &&
+	    c->cmt_state == COMMIT_RESTING) {
+		dbg_log("bud bytes %lld (%lld max), initiate BG commit",
+			c->bud_bytes, c->max_bud_bytes);
+		ubifs_request_bg_commit(c);
+	}
+
+	bud->lnum = lnum;
+	bud->start = offs;
+	bud->jhead = jhead;
+
+	ref->ch.node_type = UBIFS_REF_NODE;
+	ref->lnum = cpu_to_le32(bud->lnum);
+	ref->offs = cpu_to_le32(bud->start);
+	ref->jhead = cpu_to_le32(jhead);
+
+	if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
+		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+		ubifs_assert(c->lhead_lnum != c->ltail_lnum);
+		c->lhead_offs = 0;
+	}
+
+	if (c->lhead_offs == 0) {
+		/* Must ensure next log LEB has been unmapped */
+		err = ubifs_leb_unmap(c, c->lhead_lnum);
+		if (err)
+			goto out_unlock;
+	}
+
+	if (bud->start == 0) {
+		/*
+		 * Before writing the LEB reference which refers an empty LEB
+		 * to the log, we have to make sure it is mapped, because
+		 * otherwise we'd risk to refer an LEB with garbage in case of
+		 * an unclean reboot, because the target LEB might have been
+		 * unmapped, but not yet physically erased.
+		 */
+		err = ubifs_leb_map(c, bud->lnum);
+		if (err)
+			goto out_unlock;
+	}
+
+	dbg_log("write ref LEB %d:%d",
+		c->lhead_lnum, c->lhead_offs);
+	err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
+			       c->lhead_offs);
+	if (err)
+		goto out_unlock;
+
+	c->lhead_offs += c->ref_node_alsz;
+
+	ubifs_add_bud(c, bud);
+
+	mutex_unlock(&c->log_mutex);
+	kfree(ref);
+	return 0;
+
+out_unlock:
+	mutex_unlock(&c->log_mutex);
+	kfree(ref);
+	kfree(bud);
+	return err;
+}
+
+/**
+ * remove_buds - remove used buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function removes use buds from the buds tree. It does not remove the
+ * buds which are pointed to by journal heads.
+ */
+static void remove_buds(struct ubifs_info *c)
+{
+	struct rb_node *p;
+
+	ubifs_assert(list_empty(&c->old_buds));
+	c->cmt_bud_bytes = 0;
+	spin_lock(&c->buds_lock);
+	p = rb_first(&c->buds);
+	while (p) {
+		struct rb_node *p1 = p;
+		struct ubifs_bud *bud;
+		struct ubifs_wbuf *wbuf;
+
+		p = rb_next(p);
+		bud = rb_entry(p1, struct ubifs_bud, rb);
+		wbuf = &c->jheads[bud->jhead].wbuf;
+
+		if (wbuf->lnum == bud->lnum) {
+			/*
+			 * Do not remove buds which are pointed to by journal
+			 * heads (non-closed buds).
+			 */
+			c->cmt_bud_bytes += wbuf->offs - bud->start;
+			dbg_log("preserve %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld",
+				bud->lnum, bud->start, dbg_jhead(bud->jhead),
+				wbuf->offs - bud->start, c->cmt_bud_bytes);
+			bud->start = wbuf->offs;
+		} else {
+			c->cmt_bud_bytes += c->leb_size - bud->start;
+			dbg_log("remove %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld",
+				bud->lnum, bud->start, dbg_jhead(bud->jhead),
+				c->leb_size - bud->start, c->cmt_bud_bytes);
+			rb_erase(p1, &c->buds);
+			/*
+			 * If the commit does not finish, the recovery will need
+			 * to replay the journal, in which case the old buds
+			 * must be unchanged. Do not release them until post
+			 * commit i.e. do not allow them to be garbage
+			 * collected.
+			 */
+			list_move(&bud->list, &c->old_buds);
+		}
+	}
+	spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_log_start_commit - start commit.
+ * @c: UBIFS file-system description object
+ * @ltail_lnum: return new log tail LEB number
+ *
+ * The commit operation starts with writing "commit start" node to the log and
+ * reference nodes for all journal heads which will define new journal after
+ * the commit has been finished. The commit start and reference nodes are
+ * written in one go to the nearest empty log LEB (hence, when commit is
+ * finished UBIFS may safely unmap all the previous log LEBs). This function
+ * returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
+{
+	void *buf;
+	struct ubifs_cs_node *cs;
+	struct ubifs_ref_node *ref;
+	int err, i, max_len, len;
+
+	err = dbg_check_bud_bytes(c);
+	if (err)
+		return err;
+
+	max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ;
+	max_len = ALIGN(max_len, c->min_io_size);
+	buf = cs = kmalloc(max_len, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
+	cs->ch.node_type = UBIFS_CS_NODE;
+	cs->cmt_no = cpu_to_le64(c->cmt_no);
+	ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
+
+	/*
+	 * Note, we do not lock 'c->log_mutex' because this is the commit start
+	 * phase and we are exclusively using the log. And we do not lock
+	 * write-buffer because nobody can write to the file-system at this
+	 * phase.
+	 */
+
+	len = UBIFS_CS_NODE_SZ;
+	for (i = 0; i < c->jhead_cnt; i++) {
+		int lnum = c->jheads[i].wbuf.lnum;
+		int offs = c->jheads[i].wbuf.offs;
+
+		if (lnum == -1 || offs == c->leb_size)
+			continue;
+
+		dbg_log("add ref to LEB %d:%d for jhead %s",
+			lnum, offs, dbg_jhead(i));
+		ref = buf + len;
+		ref->ch.node_type = UBIFS_REF_NODE;
+		ref->lnum = cpu_to_le32(lnum);
+		ref->offs = cpu_to_le32(offs);
+		ref->jhead = cpu_to_le32(i);
+
+		ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
+		len += UBIFS_REF_NODE_SZ;
+	}
+
+	ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);
+
+	/* Switch to the next log LEB */
+	if (c->lhead_offs) {
+		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+		ubifs_assert(c->lhead_lnum != c->ltail_lnum);
+		c->lhead_offs = 0;
+	}
+
+	/* Must ensure next LEB has been unmapped */
+	err = ubifs_leb_unmap(c, c->lhead_lnum);
+	if (err)
+		goto out;
+
+	len = ALIGN(len, c->min_io_size);
+	dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
+	err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len);
+	if (err)
+		goto out;
+
+	*ltail_lnum = c->lhead_lnum;
+
+	c->lhead_offs += len;
+	if (c->lhead_offs == c->leb_size) {
+		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+		c->lhead_offs = 0;
+	}
+
+	remove_buds(c);
+
+	/*
+	 * We have started the commit and now users may use the rest of the log
+	 * for new writes.
+	 */
+	c->min_log_bytes = 0;
+
+out:
+	kfree(buf);
+	return err;
+}
+
+/**
+ * ubifs_log_end_commit - end commit.
+ * @c: UBIFS file-system description object
+ * @ltail_lnum: new log tail LEB number
+ *
+ * This function is called on when the commit operation was finished. It
+ * moves log tail to new position and updates the master node so that it stores
+ * the new log tail LEB number. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
+{
+	int err;
+
+	/*
+	 * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS
+	 * writes during commit. Its only short "commit" start phase when
+	 * writers are blocked.
+	 */
+	mutex_lock(&c->log_mutex);
+
+	dbg_log("old tail was LEB %d:0, new tail is LEB %d:0",
+		c->ltail_lnum, ltail_lnum);
+
+	c->ltail_lnum = ltail_lnum;
+	/*
+	 * The commit is finished and from now on it must be guaranteed that
+	 * there is always enough space for the next commit.
+	 */
+	c->min_log_bytes = c->leb_size;
+
+	spin_lock(&c->buds_lock);
+	c->bud_bytes -= c->cmt_bud_bytes;
+	spin_unlock(&c->buds_lock);
+
+	err = dbg_check_bud_bytes(c);
+	if (err)
+		goto out;
+
+	err = ubifs_write_master(c);
+
+out:
+	mutex_unlock(&c->log_mutex);
+	return err;
+}
+
+/**
+ * ubifs_log_post_commit - things to do after commit is completed.
+ * @c: UBIFS file-system description object
+ * @old_ltail_lnum: old log tail LEB number
+ *
+ * Release buds only after commit is completed, because they must be unchanged
+ * if recovery is needed.
+ *
+ * Unmap log LEBs only after commit is completed, because they may be needed for
+ * recovery.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
+{
+	int lnum, err = 0;
+
+	while (!list_empty(&c->old_buds)) {
+		struct ubifs_bud *bud;
+
+		bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
+		err = ubifs_return_leb(c, bud->lnum);
+		if (err)
+			return err;
+		list_del(&bud->list);
+		kfree(bud);
+	}
+	mutex_lock(&c->log_mutex);
+	for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
+	     lnum = ubifs_next_log_lnum(c, lnum)) {
+		dbg_log("unmap log LEB %d", lnum);
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			goto out;
+	}
+out:
+	mutex_unlock(&c->log_mutex);
+	return err;
+}
+
+/**
+ * struct done_ref - references that have been done.
+ * @rb: rb-tree node
+ * @lnum: LEB number
+ */
+struct done_ref {
+	struct rb_node rb;
+	int lnum;
+};
+
+/**
+ * done_already - determine if a reference has been done already.
+ * @done_tree: rb-tree to store references that have been done
+ * @lnum: LEB number of reference
+ *
+ * This function returns %1 if the reference has been done, %0 if not, otherwise
+ * a negative error code is returned.
+ */
+static int done_already(struct rb_root *done_tree, int lnum)
+{
+	struct rb_node **p = &done_tree->rb_node, *parent = NULL;
+	struct done_ref *dr;
+
+	while (*p) {
+		parent = *p;
+		dr = rb_entry(parent, struct done_ref, rb);
+		if (lnum < dr->lnum)
+			p = &(*p)->rb_left;
+		else if (lnum > dr->lnum)
+			p = &(*p)->rb_right;
+		else
+			return 1;
+	}
+
+	dr = kzalloc(sizeof(struct done_ref), GFP_NOFS);
+	if (!dr)
+		return -ENOMEM;
+
+	dr->lnum = lnum;
+
+	rb_link_node(&dr->rb, parent, p);
+	rb_insert_color(&dr->rb, done_tree);
+
+	return 0;
+}
+
+/**
+ * destroy_done_tree - destroy the done tree.
+ * @done_tree: done tree to destroy
+ */
+static void destroy_done_tree(struct rb_root *done_tree)
+{
+	struct done_ref *dr, *n;
+
+	rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb)
+		kfree(dr);
+}
+
+/**
+ * add_node - add a node to the consolidated log.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to which to add
+ * @lnum: LEB number to which to write is passed and returned here
+ * @offs: offset to where to write is passed and returned here
+ * @node: node to add
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
+		    void *node)
+{
+	struct ubifs_ch *ch = node;
+	int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs;
+
+	if (len > remains) {
+		int sz = ALIGN(*offs, c->min_io_size), err;
+
+		ubifs_pad(c, buf + *offs, sz - *offs);
+		err = ubifs_leb_change(c, *lnum, buf, sz);
+		if (err)
+			return err;
+		*lnum = ubifs_next_log_lnum(c, *lnum);
+		*offs = 0;
+	}
+	memcpy(buf + *offs, node, len);
+	*offs += ALIGN(len, 8);
+	return 0;
+}
+
+/**
+ * ubifs_consolidate_log - consolidate the log.
+ * @c: UBIFS file-system description object
+ *
+ * Repeated failed commits could cause the log to be full, but at least 1 LEB is
+ * needed for commit. This function rewrites the reference nodes in the log
+ * omitting duplicates, and failed CS nodes, and leaving no gaps.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_consolidate_log(struct ubifs_info *c)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	struct rb_root done_tree = RB_ROOT;
+	int lnum, err, first = 1, write_lnum, offs = 0;
+	void *buf;
+
+	dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum,
+		  c->lhead_lnum);
+	buf = vmalloc(c->leb_size);
+	if (!buf)
+		return -ENOMEM;
+	lnum = c->ltail_lnum;
+	write_lnum = lnum;
+	while (1) {
+		sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
+		if (IS_ERR(sleb)) {
+			err = PTR_ERR(sleb);
+			goto out_free;
+		}
+		list_for_each_entry(snod, &sleb->nodes, list) {
+			switch (snod->type) {
+			case UBIFS_REF_NODE: {
+				struct ubifs_ref_node *ref = snod->node;
+				int ref_lnum = le32_to_cpu(ref->lnum);
+
+				err = done_already(&done_tree, ref_lnum);
+				if (err < 0)
+					goto out_scan;
+				if (err != 1) {
+					err = add_node(c, buf, &write_lnum,
+						       &offs, snod->node);
+					if (err)
+						goto out_scan;
+				}
+				break;
+			}
+			case UBIFS_CS_NODE:
+				if (!first)
+					break;
+				err = add_node(c, buf, &write_lnum, &offs,
+					       snod->node);
+				if (err)
+					goto out_scan;
+				first = 0;
+				break;
+			}
+		}
+		ubifs_scan_destroy(sleb);
+		if (lnum == c->lhead_lnum)
+			break;
+		lnum = ubifs_next_log_lnum(c, lnum);
+	}
+	if (offs) {
+		int sz = ALIGN(offs, c->min_io_size);
+
+		ubifs_pad(c, buf + offs, sz - offs);
+		err = ubifs_leb_change(c, write_lnum, buf, sz);
+		if (err)
+			goto out_free;
+		offs = ALIGN(offs, c->min_io_size);
+	}
+	destroy_done_tree(&done_tree);
+	vfree(buf);
+	if (write_lnum == c->lhead_lnum) {
+		ubifs_err(c, "log is too full");
+		return -EINVAL;
+	}
+	/* Unmap remaining LEBs */
+	lnum = write_lnum;
+	do {
+		lnum = ubifs_next_log_lnum(c, lnum);
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	} while (lnum != c->lhead_lnum);
+	c->lhead_lnum = write_lnum;
+	c->lhead_offs = offs;
+	dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs);
+	return 0;
+
+out_scan:
+	ubifs_scan_destroy(sleb);
+out_free:
+	destroy_done_tree(&done_tree);
+	vfree(buf);
+	return err;
+}
+
+/**
+ * dbg_check_bud_bytes - make sure bud bytes calculation are all right.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure the amount of flash space used by closed buds
+ * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in
+ * case of failure.
+ */
+static int dbg_check_bud_bytes(struct ubifs_info *c)
+{
+	int i, err = 0;
+	struct ubifs_bud *bud;
+	long long bud_bytes = 0;
+
+	if (!dbg_is_chk_gen(c))
+		return 0;
+
+	spin_lock(&c->buds_lock);
+	for (i = 0; i < c->jhead_cnt; i++)
+		list_for_each_entry(bud, &c->jheads[i].buds_list, list)
+			bud_bytes += c->leb_size - bud->start;
+
+	if (c->bud_bytes != bud_bytes) {
+		ubifs_err(c, "bad bud_bytes %lld, calculated %lld",
+			  c->bud_bytes, bud_bytes);
+		err = -EINVAL;
+	}
+	spin_unlock(&c->buds_lock);
+
+	return err;
+}
diff --git a/kernel/fs/ubifs/lprops.c b/kernel/fs/ubifs/lprops.c
new file mode 100644
index 000000000..a0011aa3a
--- /dev/null
+++ b/kernel/fs/ubifs/lprops.c
@@ -0,0 +1,1321 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the functions that access LEB properties and their
+ * categories. LEBs are categorized based on the needs of UBIFS, and the
+ * categories are stored as either heaps or lists to provide a fast way of
+ * finding a LEB in a particular category. For example, UBIFS may need to find
+ * an empty LEB for the journal, or a very dirty LEB for garbage collection.
+ */
+
+#include "ubifs.h"
+
+/**
+ * get_heap_comp_val - get the LEB properties value for heap comparisons.
+ * @lprops: LEB properties
+ * @cat: LEB category
+ */
+static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat)
+{
+	switch (cat) {
+	case LPROPS_FREE:
+		return lprops->free;
+	case LPROPS_DIRTY_IDX:
+		return lprops->free + lprops->dirty;
+	default:
+		return lprops->dirty;
+	}
+}
+
+/**
+ * move_up_lpt_heap - move a new heap entry up as far as possible.
+ * @c: UBIFS file-system description object
+ * @heap: LEB category heap
+ * @lprops: LEB properties to move
+ * @cat: LEB category
+ *
+ * New entries to a heap are added at the bottom and then moved up until the
+ * parent's value is greater.  In the case of LPT's category heaps, the value
+ * is either the amount of free space or the amount of dirty space, depending
+ * on the category.
+ */
+static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+			     struct ubifs_lprops *lprops, int cat)
+{
+	int val1, val2, hpos;
+
+	hpos = lprops->hpos;
+	if (!hpos)
+		return; /* Already top of the heap */
+	val1 = get_heap_comp_val(lprops, cat);
+	/* Compare to parent and, if greater, move up the heap */
+	do {
+		int ppos = (hpos - 1) / 2;
+
+		val2 = get_heap_comp_val(heap->arr[ppos], cat);
+		if (val2 >= val1)
+			return;
+		/* Greater than parent so move up */
+		heap->arr[ppos]->hpos = hpos;
+		heap->arr[hpos] = heap->arr[ppos];
+		heap->arr[ppos] = lprops;
+		lprops->hpos = ppos;
+		hpos = ppos;
+	} while (hpos);
+}
+
+/**
+ * adjust_lpt_heap - move a changed heap entry up or down the heap.
+ * @c: UBIFS file-system description object
+ * @heap: LEB category heap
+ * @lprops: LEB properties to move
+ * @hpos: heap position of @lprops
+ * @cat: LEB category
+ *
+ * Changed entries in a heap are moved up or down until the parent's value is
+ * greater.  In the case of LPT's category heaps, the value is either the amount
+ * of free space or the amount of dirty space, depending on the category.
+ */
+static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+			    struct ubifs_lprops *lprops, int hpos, int cat)
+{
+	int val1, val2, val3, cpos;
+
+	val1 = get_heap_comp_val(lprops, cat);
+	/* Compare to parent and, if greater than parent, move up the heap */
+	if (hpos) {
+		int ppos = (hpos - 1) / 2;
+
+		val2 = get_heap_comp_val(heap->arr[ppos], cat);
+		if (val1 > val2) {
+			/* Greater than parent so move up */
+			while (1) {
+				heap->arr[ppos]->hpos = hpos;
+				heap->arr[hpos] = heap->arr[ppos];
+				heap->arr[ppos] = lprops;
+				lprops->hpos = ppos;
+				hpos = ppos;
+				if (!hpos)
+					return;
+				ppos = (hpos - 1) / 2;
+				val2 = get_heap_comp_val(heap->arr[ppos], cat);
+				if (val1 <= val2)
+					return;
+				/* Still greater than parent so keep going */
+			}
+		}
+	}
+
+	/* Not greater than parent, so compare to children */
+	while (1) {
+		/* Compare to left child */
+		cpos = hpos * 2 + 1;
+		if (cpos >= heap->cnt)
+			return;
+		val2 = get_heap_comp_val(heap->arr[cpos], cat);
+		if (val1 < val2) {
+			/* Less than left child, so promote biggest child */
+			if (cpos + 1 < heap->cnt) {
+				val3 = get_heap_comp_val(heap->arr[cpos + 1],
+							 cat);
+				if (val3 > val2)
+					cpos += 1; /* Right child is bigger */
+			}
+			heap->arr[cpos]->hpos = hpos;
+			heap->arr[hpos] = heap->arr[cpos];
+			heap->arr[cpos] = lprops;
+			lprops->hpos = cpos;
+			hpos = cpos;
+			continue;
+		}
+		/* Compare to right child */
+		cpos += 1;
+		if (cpos >= heap->cnt)
+			return;
+		val3 = get_heap_comp_val(heap->arr[cpos], cat);
+		if (val1 < val3) {
+			/* Less than right child, so promote right child */
+			heap->arr[cpos]->hpos = hpos;
+			heap->arr[hpos] = heap->arr[cpos];
+			heap->arr[cpos] = lprops;
+			lprops->hpos = cpos;
+			hpos = cpos;
+			continue;
+		}
+		return;
+	}
+}
+
+/**
+ * add_to_lpt_heap - add LEB properties to a LEB category heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to add
+ * @cat: LEB category
+ *
+ * This function returns %1 if @lprops is added to the heap for LEB category
+ * @cat, otherwise %0 is returned because the heap is full.
+ */
+static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops,
+			   int cat)
+{
+	struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+
+	if (heap->cnt >= heap->max_cnt) {
+		const int b = LPT_HEAP_SZ / 2 - 1;
+		int cpos, val1, val2;
+
+		/* Compare to some other LEB on the bottom of heap */
+		/* Pick a position kind of randomly */
+		cpos = (((size_t)lprops >> 4) & b) + b;
+		ubifs_assert(cpos >= b);
+		ubifs_assert(cpos < LPT_HEAP_SZ);
+		ubifs_assert(cpos < heap->cnt);
+
+		val1 = get_heap_comp_val(lprops, cat);
+		val2 = get_heap_comp_val(heap->arr[cpos], cat);
+		if (val1 > val2) {
+			struct ubifs_lprops *lp;
+
+			lp = heap->arr[cpos];
+			lp->flags &= ~LPROPS_CAT_MASK;
+			lp->flags |= LPROPS_UNCAT;
+			list_add(&lp->list, &c->uncat_list);
+			lprops->hpos = cpos;
+			heap->arr[cpos] = lprops;
+			move_up_lpt_heap(c, heap, lprops, cat);
+			dbg_check_heap(c, heap, cat, lprops->hpos);
+			return 1; /* Added to heap */
+		}
+		dbg_check_heap(c, heap, cat, -1);
+		return 0; /* Not added to heap */
+	} else {
+		lprops->hpos = heap->cnt++;
+		heap->arr[lprops->hpos] = lprops;
+		move_up_lpt_heap(c, heap, lprops, cat);
+		dbg_check_heap(c, heap, cat, lprops->hpos);
+		return 1; /* Added to heap */
+	}
+}
+
+/**
+ * remove_from_lpt_heap - remove LEB properties from a LEB category heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to remove
+ * @cat: LEB category
+ */
+static void remove_from_lpt_heap(struct ubifs_info *c,
+				 struct ubifs_lprops *lprops, int cat)
+{
+	struct ubifs_lpt_heap *heap;
+	int hpos = lprops->hpos;
+
+	heap = &c->lpt_heap[cat - 1];
+	ubifs_assert(hpos >= 0 && hpos < heap->cnt);
+	ubifs_assert(heap->arr[hpos] == lprops);
+	heap->cnt -= 1;
+	if (hpos < heap->cnt) {
+		heap->arr[hpos] = heap->arr[heap->cnt];
+		heap->arr[hpos]->hpos = hpos;
+		adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat);
+	}
+	dbg_check_heap(c, heap, cat, -1);
+}
+
+/**
+ * lpt_heap_replace - replace lprops in a category heap.
+ * @c: UBIFS file-system description object
+ * @old_lprops: LEB properties to replace
+ * @new_lprops: LEB properties with which to replace
+ * @cat: LEB category
+ *
+ * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
+ * and the lprops that the pnode contains.  When that happens, references in
+ * the category heaps to those lprops must be updated to point to the new
+ * lprops.  This function does that.
+ */
+static void lpt_heap_replace(struct ubifs_info *c,
+			     struct ubifs_lprops *old_lprops,
+			     struct ubifs_lprops *new_lprops, int cat)
+{
+	struct ubifs_lpt_heap *heap;
+	int hpos = new_lprops->hpos;
+
+	heap = &c->lpt_heap[cat - 1];
+	heap->arr[hpos] = new_lprops;
+}
+
+/**
+ * ubifs_add_to_cat - add LEB properties to a category list or heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to add
+ * @cat: LEB category to which to add
+ *
+ * LEB properties are categorized to enable fast find operations.
+ */
+void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
+		      int cat)
+{
+	switch (cat) {
+	case LPROPS_DIRTY:
+	case LPROPS_DIRTY_IDX:
+	case LPROPS_FREE:
+		if (add_to_lpt_heap(c, lprops, cat))
+			break;
+		/* No more room on heap so make it un-categorized */
+		cat = LPROPS_UNCAT;
+		/* Fall through */
+	case LPROPS_UNCAT:
+		list_add(&lprops->list, &c->uncat_list);
+		break;
+	case LPROPS_EMPTY:
+		list_add(&lprops->list, &c->empty_list);
+		break;
+	case LPROPS_FREEABLE:
+		list_add(&lprops->list, &c->freeable_list);
+		c->freeable_cnt += 1;
+		break;
+	case LPROPS_FRDI_IDX:
+		list_add(&lprops->list, &c->frdi_idx_list);
+		break;
+	default:
+		ubifs_assert(0);
+	}
+
+	lprops->flags &= ~LPROPS_CAT_MASK;
+	lprops->flags |= cat;
+	c->in_a_category_cnt += 1;
+	ubifs_assert(c->in_a_category_cnt <= c->main_lebs);
+}
+
+/**
+ * ubifs_remove_from_cat - remove LEB properties from a category list or heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to remove
+ * @cat: LEB category from which to remove
+ *
+ * LEB properties are categorized to enable fast find operations.
+ */
+static void ubifs_remove_from_cat(struct ubifs_info *c,
+				  struct ubifs_lprops *lprops, int cat)
+{
+	switch (cat) {
+	case LPROPS_DIRTY:
+	case LPROPS_DIRTY_IDX:
+	case LPROPS_FREE:
+		remove_from_lpt_heap(c, lprops, cat);
+		break;
+	case LPROPS_FREEABLE:
+		c->freeable_cnt -= 1;
+		ubifs_assert(c->freeable_cnt >= 0);
+		/* Fall through */
+	case LPROPS_UNCAT:
+	case LPROPS_EMPTY:
+	case LPROPS_FRDI_IDX:
+		ubifs_assert(!list_empty(&lprops->list));
+		list_del(&lprops->list);
+		break;
+	default:
+		ubifs_assert(0);
+	}
+
+	c->in_a_category_cnt -= 1;
+	ubifs_assert(c->in_a_category_cnt >= 0);
+}
+
+/**
+ * ubifs_replace_cat - replace lprops in a category list or heap.
+ * @c: UBIFS file-system description object
+ * @old_lprops: LEB properties to replace
+ * @new_lprops: LEB properties with which to replace
+ *
+ * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
+ * and the lprops that the pnode contains. When that happens, references in
+ * category lists and heaps must be replaced. This function does that.
+ */
+void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
+		       struct ubifs_lprops *new_lprops)
+{
+	int cat;
+
+	cat = new_lprops->flags & LPROPS_CAT_MASK;
+	switch (cat) {
+	case LPROPS_DIRTY:
+	case LPROPS_DIRTY_IDX:
+	case LPROPS_FREE:
+		lpt_heap_replace(c, old_lprops, new_lprops, cat);
+		break;
+	case LPROPS_UNCAT:
+	case LPROPS_EMPTY:
+	case LPROPS_FREEABLE:
+	case LPROPS_FRDI_IDX:
+		list_replace(&old_lprops->list, &new_lprops->list);
+		break;
+	default:
+		ubifs_assert(0);
+	}
+}
+
+/**
+ * ubifs_ensure_cat - ensure LEB properties are categorized.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties
+ *
+ * A LEB may have fallen off of the bottom of a heap, and ended up as
+ * un-categorized even though it has enough space for us now. If that is the
+ * case this function will put the LEB back onto a heap.
+ */
+void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+	int cat = lprops->flags & LPROPS_CAT_MASK;
+
+	if (cat != LPROPS_UNCAT)
+		return;
+	cat = ubifs_categorize_lprops(c, lprops);
+	if (cat == LPROPS_UNCAT)
+		return;
+	ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT);
+	ubifs_add_to_cat(c, lprops, cat);
+}
+
+/**
+ * ubifs_categorize_lprops - categorize LEB properties.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to categorize
+ *
+ * LEB properties are categorized to enable fast find operations. This function
+ * returns the LEB category to which the LEB properties belong. Note however
+ * that if the LEB category is stored as a heap and the heap is full, the
+ * LEB properties may have their category changed to %LPROPS_UNCAT.
+ */
+int ubifs_categorize_lprops(const struct ubifs_info *c,
+			    const struct ubifs_lprops *lprops)
+{
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPROPS_UNCAT;
+
+	if (lprops->free == c->leb_size) {
+		ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+		return LPROPS_EMPTY;
+	}
+
+	if (lprops->free + lprops->dirty == c->leb_size) {
+		if (lprops->flags & LPROPS_INDEX)
+			return LPROPS_FRDI_IDX;
+		else
+			return LPROPS_FREEABLE;
+	}
+
+	if (lprops->flags & LPROPS_INDEX) {
+		if (lprops->dirty + lprops->free >= c->min_idx_node_sz)
+			return LPROPS_DIRTY_IDX;
+	} else {
+		if (lprops->dirty >= c->dead_wm &&
+		    lprops->dirty > lprops->free)
+			return LPROPS_DIRTY;
+		if (lprops->free > 0)
+			return LPROPS_FREE;
+	}
+
+	return LPROPS_UNCAT;
+}
+
+/**
+ * change_category - change LEB properties category.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to re-categorize
+ *
+ * LEB properties are categorized to enable fast find operations. When the LEB
+ * properties change they must be re-categorized.
+ */
+static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+	int old_cat = lprops->flags & LPROPS_CAT_MASK;
+	int new_cat = ubifs_categorize_lprops(c, lprops);
+
+	if (old_cat == new_cat) {
+		struct ubifs_lpt_heap *heap;
+
+		/* lprops on a heap now must be moved up or down */
+		if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT)
+			return; /* Not on a heap */
+		heap = &c->lpt_heap[new_cat - 1];
+		adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat);
+	} else {
+		ubifs_remove_from_cat(c, lprops, old_cat);
+		ubifs_add_to_cat(c, lprops, new_cat);
+	}
+}
+
+/**
+ * ubifs_calc_dark - calculate LEB dark space size.
+ * @c: the UBIFS file-system description object
+ * @spc: amount of free and dirty space in the LEB
+ *
+ * This function calculates and returns amount of dark space in an LEB which
+ * has @spc bytes of free and dirty space.
+ *
+ * UBIFS is trying to account the space which might not be usable, and this
+ * space is called "dark space". For example, if an LEB has only %512 free
+ * bytes, it is dark space, because it cannot fit a large data node.
+ */
+int ubifs_calc_dark(const struct ubifs_info *c, int spc)
+{
+	ubifs_assert(!(spc & 7));
+
+	if (spc < c->dark_wm)
+		return spc;
+
+	/*
+	 * If we have slightly more space then the dark space watermark, we can
+	 * anyway safely assume it we'll be able to write a node of the
+	 * smallest size there.
+	 */
+	if (spc - c->dark_wm < MIN_WRITE_SZ)
+		return spc - MIN_WRITE_SZ;
+
+	return c->dark_wm;
+}
+
+/**
+ * is_lprops_dirty - determine if LEB properties are dirty.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to test
+ */
+static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+	struct ubifs_pnode *pnode;
+	int pos;
+
+	pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1);
+	pnode = (struct ubifs_pnode *)container_of(lprops - pos,
+						   struct ubifs_pnode,
+						   lprops[0]);
+	return !test_bit(COW_CNODE, &pnode->flags) &&
+	       test_bit(DIRTY_CNODE, &pnode->flags);
+}
+
+/**
+ * ubifs_change_lp - change LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lp: LEB properties to change
+ * @free: new free space amount
+ * @dirty: new dirty space amount
+ * @flags: new flags
+ * @idx_gc_cnt: change to the count of @idx_gc list
+ *
+ * This function changes LEB properties (@free, @dirty or @flag). However, the
+ * property which has the %LPROPS_NC value is not changed. Returns a pointer to
+ * the updated LEB properties on success and a negative error code on failure.
+ *
+ * Note, the LEB properties may have had to be copied (due to COW) and
+ * consequently the pointer returned may not be the same as the pointer
+ * passed.
+ */
+const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
+					   const struct ubifs_lprops *lp,
+					   int free, int dirty, int flags,
+					   int idx_gc_cnt)
+{
+	/*
+	 * This is the only function that is allowed to change lprops, so we
+	 * discard the "const" qualifier.
+	 */
+	struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
+
+	dbg_lp("LEB %d, free %d, dirty %d, flags %d",
+	       lprops->lnum, free, dirty, flags);
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+	ubifs_assert(c->lst.empty_lebs >= 0 &&
+		     c->lst.empty_lebs <= c->main_lebs);
+	ubifs_assert(c->freeable_cnt >= 0);
+	ubifs_assert(c->freeable_cnt <= c->main_lebs);
+	ubifs_assert(c->lst.taken_empty_lebs >= 0);
+	ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs);
+	ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7));
+	ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7));
+	ubifs_assert(!(c->lst.total_used & 7));
+	ubifs_assert(free == LPROPS_NC || free >= 0);
+	ubifs_assert(dirty == LPROPS_NC || dirty >= 0);
+
+	if (!is_lprops_dirty(c, lprops)) {
+		lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum);
+		if (IS_ERR(lprops))
+			return lprops;
+	} else
+		ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum));
+
+	ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));
+
+	spin_lock(&c->space_lock);
+	if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
+		c->lst.taken_empty_lebs -= 1;
+
+	if (!(lprops->flags & LPROPS_INDEX)) {
+		int old_spc;
+
+		old_spc = lprops->free + lprops->dirty;
+		if (old_spc < c->dead_wm)
+			c->lst.total_dead -= old_spc;
+		else
+			c->lst.total_dark -= ubifs_calc_dark(c, old_spc);
+
+		c->lst.total_used -= c->leb_size - old_spc;
+	}
+
+	if (free != LPROPS_NC) {
+		free = ALIGN(free, 8);
+		c->lst.total_free += free - lprops->free;
+
+		/* Increase or decrease empty LEBs counter if needed */
+		if (free == c->leb_size) {
+			if (lprops->free != c->leb_size)
+				c->lst.empty_lebs += 1;
+		} else if (lprops->free == c->leb_size)
+			c->lst.empty_lebs -= 1;
+		lprops->free = free;
+	}
+
+	if (dirty != LPROPS_NC) {
+		dirty = ALIGN(dirty, 8);
+		c->lst.total_dirty += dirty - lprops->dirty;
+		lprops->dirty = dirty;
+	}
+
+	if (flags != LPROPS_NC) {
+		/* Take care about indexing LEBs counter if needed */
+		if ((lprops->flags & LPROPS_INDEX)) {
+			if (!(flags & LPROPS_INDEX))
+				c->lst.idx_lebs -= 1;
+		} else if (flags & LPROPS_INDEX)
+			c->lst.idx_lebs += 1;
+		lprops->flags = flags;
+	}
+
+	if (!(lprops->flags & LPROPS_INDEX)) {
+		int new_spc;
+
+		new_spc = lprops->free + lprops->dirty;
+		if (new_spc < c->dead_wm)
+			c->lst.total_dead += new_spc;
+		else
+			c->lst.total_dark += ubifs_calc_dark(c, new_spc);
+
+		c->lst.total_used += c->leb_size - new_spc;
+	}
+
+	if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
+		c->lst.taken_empty_lebs += 1;
+
+	change_category(c, lprops);
+	c->idx_gc_cnt += idx_gc_cnt;
+	spin_unlock(&c->space_lock);
+	return lprops;
+}
+
+/**
+ * ubifs_get_lp_stats - get lprops statistics.
+ * @c: UBIFS file-system description object
+ * @st: return statistics
+ */
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
+{
+	spin_lock(&c->space_lock);
+	memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_change_one_lp - change LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to change properties for
+ * @free: amount of free space
+ * @dirty: amount of dirty space
+ * @flags_set: flags to set
+ * @flags_clean: flags to clean
+ * @idx_gc_cnt: change to the count of idx_gc list
+ *
+ * This function changes properties of LEB @lnum. It is a helper wrapper over
+ * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the
+ * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and
+ * a negative error code in case of failure.
+ */
+int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+			int flags_set, int flags_clean, int idx_gc_cnt)
+{
+	int err = 0, flags;
+	const struct ubifs_lprops *lp;
+
+	ubifs_get_lprops(c);
+
+	lp = ubifs_lpt_lookup_dirty(c, lnum);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	flags = (lp->flags | flags_set) & ~flags_clean;
+	lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt);
+	if (IS_ERR(lp))
+		err = PTR_ERR(lp);
+
+out:
+	ubifs_release_lprops(c);
+	if (err)
+		ubifs_err(c, "cannot change properties of LEB %d, error %d",
+			  lnum, err);
+	return err;
+}
+
+/**
+ * ubifs_update_one_lp - update LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to change properties for
+ * @free: amount of free space
+ * @dirty: amount of dirty space to add
+ * @flags_set: flags to set
+ * @flags_clean: flags to clean
+ *
+ * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to
+ * current dirty space, not substitutes it.
+ */
+int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+			int flags_set, int flags_clean)
+{
+	int err = 0, flags;
+	const struct ubifs_lprops *lp;
+
+	ubifs_get_lprops(c);
+
+	lp = ubifs_lpt_lookup_dirty(c, lnum);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	flags = (lp->flags | flags_set) & ~flags_clean;
+	lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0);
+	if (IS_ERR(lp))
+		err = PTR_ERR(lp);
+
+out:
+	ubifs_release_lprops(c);
+	if (err)
+		ubifs_err(c, "cannot update properties of LEB %d, error %d",
+			  lnum, err);
+	return err;
+}
+
+/**
+ * ubifs_read_one_lp - read LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to read properties for
+ * @lp: where to store read properties
+ *
+ * This helper function reads properties of a LEB @lnum and stores them in @lp.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
+{
+	int err = 0;
+	const struct ubifs_lprops *lpp;
+
+	ubifs_get_lprops(c);
+
+	lpp = ubifs_lpt_lookup(c, lnum);
+	if (IS_ERR(lpp)) {
+		err = PTR_ERR(lpp);
+		ubifs_err(c, "cannot read properties of LEB %d, error %d",
+			  lnum, err);
+		goto out;
+	}
+
+	memcpy(lp, lpp, sizeof(struct ubifs_lprops));
+
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * ubifs_fast_find_free - try to find a LEB with free space quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB properties for a LEB with free space or %NULL if
+ * the function is unable to find a LEB quickly.
+ */
+const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+	heap = &c->lpt_heap[LPROPS_FREE - 1];
+	if (heap->cnt == 0)
+		return NULL;
+
+	lprops = heap->arr[0];
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+	return lprops;
+}
+
+/**
+ * ubifs_fast_find_empty - try to find an empty LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB properties for an empty LEB or %NULL if the
+ * function is unable to find an empty LEB quickly.
+ */
+const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+	if (list_empty(&c->empty_list))
+		return NULL;
+
+	lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list);
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+	ubifs_assert(lprops->free == c->leb_size);
+	return lprops;
+}
+
+/**
+ * ubifs_fast_find_freeable - try to find a freeable LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB properties for a freeable LEB or %NULL if the
+ * function is unable to find a freeable LEB quickly.
+ */
+const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+	if (list_empty(&c->freeable_list))
+		return NULL;
+
+	lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list);
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+	ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
+	ubifs_assert(c->freeable_cnt > 0);
+	return lprops;
+}
+
+/**
+ * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB properties for a freeable index LEB or %NULL if the
+ * function is unable to find a freeable index LEB quickly.
+ */
+const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+	if (list_empty(&c->frdi_idx_list))
+		return NULL;
+
+	lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list);
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert((lprops->flags & LPROPS_INDEX));
+	ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
+	return lprops;
+}
+
+/*
+ * Everything below is related to debugging.
+ */
+
+/**
+ * dbg_check_cats - check category heaps and lists.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_cats(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+	struct list_head *pos;
+	int i, cat;
+
+	if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c))
+		return 0;
+
+	list_for_each_entry(lprops, &c->empty_list, list) {
+		if (lprops->free != c->leb_size) {
+			ubifs_err(c, "non-empty LEB %d on empty list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
+			return -EINVAL;
+		}
+		if (lprops->flags & LPROPS_TAKEN) {
+			ubifs_err(c, "taken LEB %d on empty list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
+			return -EINVAL;
+		}
+	}
+
+	i = 0;
+	list_for_each_entry(lprops, &c->freeable_list, list) {
+		if (lprops->free + lprops->dirty != c->leb_size) {
+			ubifs_err(c, "non-freeable LEB %d on freeable list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
+			return -EINVAL;
+		}
+		if (lprops->flags & LPROPS_TAKEN) {
+			ubifs_err(c, "taken LEB %d on freeable list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
+			return -EINVAL;
+		}
+		i += 1;
+	}
+	if (i != c->freeable_cnt) {
+		ubifs_err(c, "freeable list count %d expected %d", i,
+			  c->freeable_cnt);
+		return -EINVAL;
+	}
+
+	i = 0;
+	list_for_each(pos, &c->idx_gc)
+		i += 1;
+	if (i != c->idx_gc_cnt) {
+		ubifs_err(c, "idx_gc list count %d expected %d", i,
+			  c->idx_gc_cnt);
+		return -EINVAL;
+	}
+
+	list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+		if (lprops->free + lprops->dirty != c->leb_size) {
+			ubifs_err(c, "non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
+			return -EINVAL;
+		}
+		if (lprops->flags & LPROPS_TAKEN) {
+			ubifs_err(c, "taken LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
+			return -EINVAL;
+		}
+		if (!(lprops->flags & LPROPS_INDEX)) {
+			ubifs_err(c, "non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
+			return -EINVAL;
+		}
+	}
+
+	for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) {
+		struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+
+		for (i = 0; i < heap->cnt; i++) {
+			lprops = heap->arr[i];
+			if (!lprops) {
+				ubifs_err(c, "null ptr in LPT heap cat %d", cat);
+				return -EINVAL;
+			}
+			if (lprops->hpos != i) {
+				ubifs_err(c, "bad ptr in LPT heap cat %d", cat);
+				return -EINVAL;
+			}
+			if (lprops->flags & LPROPS_TAKEN) {
+				ubifs_err(c, "taken LEB in LPT heap cat %d", cat);
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
+		    int add_pos)
+{
+	int i = 0, j, err = 0;
+
+	if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c))
+		return;
+
+	for (i = 0; i < heap->cnt; i++) {
+		struct ubifs_lprops *lprops = heap->arr[i];
+		struct ubifs_lprops *lp;
+
+		if (i != add_pos)
+			if ((lprops->flags & LPROPS_CAT_MASK) != cat) {
+				err = 1;
+				goto out;
+			}
+		if (lprops->hpos != i) {
+			err = 2;
+			goto out;
+		}
+		lp = ubifs_lpt_lookup(c, lprops->lnum);
+		if (IS_ERR(lp)) {
+			err = 3;
+			goto out;
+		}
+		if (lprops != lp) {
+			ubifs_err(c, "lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
+				  (size_t)lprops, (size_t)lp, lprops->lnum,
+				  lp->lnum);
+			err = 4;
+			goto out;
+		}
+		for (j = 0; j < i; j++) {
+			lp = heap->arr[j];
+			if (lp == lprops) {
+				err = 5;
+				goto out;
+			}
+			if (lp->lnum == lprops->lnum) {
+				err = 6;
+				goto out;
+			}
+		}
+	}
+out:
+	if (err) {
+		ubifs_err(c, "failed cat %d hpos %d err %d", cat, i, err);
+		dump_stack();
+		ubifs_dump_heap(c, heap, cat);
+	}
+}
+
+/**
+ * scan_check_cb - scan callback.
+ * @c: the UBIFS file-system description object
+ * @lp: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @lst: lprops statistics to update
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_check_cb(struct ubifs_info *c,
+			 const struct ubifs_lprops *lp, int in_tree,
+			 struct ubifs_lp_stats *lst)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
+	void *buf = NULL;
+
+	cat = lp->flags & LPROPS_CAT_MASK;
+	if (cat != LPROPS_UNCAT) {
+		cat = ubifs_categorize_lprops(c, lp);
+		if (cat != (lp->flags & LPROPS_CAT_MASK)) {
+			ubifs_err(c, "bad LEB category %d expected %d",
+				  (lp->flags & LPROPS_CAT_MASK), cat);
+			return -EINVAL;
+		}
+	}
+
+	/* Check lp is on its category list (if it has one) */
+	if (in_tree) {
+		struct list_head *list = NULL;
+
+		switch (cat) {
+		case LPROPS_EMPTY:
+			list = &c->empty_list;
+			break;
+		case LPROPS_FREEABLE:
+			list = &c->freeable_list;
+			break;
+		case LPROPS_FRDI_IDX:
+			list = &c->frdi_idx_list;
+			break;
+		case LPROPS_UNCAT:
+			list = &c->uncat_list;
+			break;
+		}
+		if (list) {
+			struct ubifs_lprops *lprops;
+			int found = 0;
+
+			list_for_each_entry(lprops, list, list) {
+				if (lprops == lp) {
+					found = 1;
+					break;
+				}
+			}
+			if (!found) {
+				ubifs_err(c, "bad LPT list (category %d)", cat);
+				return -EINVAL;
+			}
+		}
+	}
+
+	/* Check lp is on its category heap (if it has one) */
+	if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) {
+		struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+
+		if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
+		    lp != heap->arr[lp->hpos]) {
+			ubifs_err(c, "bad LPT heap (category %d)", cat);
+			return -EINVAL;
+		}
+	}
+
+	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/*
+	 * After an unclean unmount, empty and freeable LEBs
+	 * may contain garbage - do not scan them.
+	 */
+	if (lp->free == c->leb_size) {
+		lst->empty_lebs += 1;
+		lst->total_free += c->leb_size;
+		lst->total_dark += ubifs_calc_dark(c, c->leb_size);
+		return LPT_SCAN_CONTINUE;
+	}
+	if (lp->free + lp->dirty == c->leb_size &&
+	    !(lp->flags & LPROPS_INDEX)) {
+		lst->total_free  += lp->free;
+		lst->total_dirty += lp->dirty;
+		lst->total_dark  +=  ubifs_calc_dark(c, c->leb_size);
+		return LPT_SCAN_CONTINUE;
+	}
+
+	sleb = ubifs_scan(c, lnum, 0, buf, 0);
+	if (IS_ERR(sleb)) {
+		ret = PTR_ERR(sleb);
+		if (ret == -EUCLEAN) {
+			ubifs_dump_lprops(c);
+			ubifs_dump_budg(c, &c->bi);
+		}
+		goto out;
+	}
+
+	is_idx = -1;
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		int found, level = 0;
+
+		cond_resched();
+
+		if (is_idx == -1)
+			is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0;
+
+		if (is_idx && snod->type != UBIFS_IDX_NODE) {
+			ubifs_err(c, "indexing node in data LEB %d:%d",
+				  lnum, snod->offs);
+			goto out_destroy;
+		}
+
+		if (snod->type == UBIFS_IDX_NODE) {
+			struct ubifs_idx_node *idx = snod->node;
+
+			key_read(c, ubifs_idx_key(c, idx), &snod->key);
+			level = le16_to_cpu(idx->level);
+		}
+
+		found = ubifs_tnc_has_node(c, &snod->key, level, lnum,
+					   snod->offs, is_idx);
+		if (found) {
+			if (found < 0)
+				goto out_destroy;
+			used += ALIGN(snod->len, 8);
+		}
+	}
+
+	free = c->leb_size - sleb->endpt;
+	dirty = sleb->endpt - used;
+
+	if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
+	    dirty < 0) {
+		ubifs_err(c, "bad calculated accounting for LEB %d: free %d, dirty %d",
+			  lnum, free, dirty);
+		goto out_destroy;
+	}
+
+	if (lp->free + lp->dirty == c->leb_size &&
+	    free + dirty == c->leb_size)
+		if ((is_idx && !(lp->flags & LPROPS_INDEX)) ||
+		    (!is_idx && free == c->leb_size) ||
+		    lp->free == c->leb_size) {
+			/*
+			 * Empty or freeable LEBs could contain index
+			 * nodes from an uncompleted commit due to an
+			 * unclean unmount. Or they could be empty for
+			 * the same reason. Or it may simply not have been
+			 * unmapped.
+			 */
+			free = lp->free;
+			dirty = lp->dirty;
+			is_idx = 0;
+		    }
+
+	if (is_idx && lp->free + lp->dirty == free + dirty &&
+	    lnum != c->ihead_lnum) {
+		/*
+		 * After an unclean unmount, an index LEB could have a different
+		 * amount of free space than the value recorded by lprops. That
+		 * is because the in-the-gaps method may use free space or
+		 * create free space (as a side-effect of using ubi_leb_change
+		 * and not writing the whole LEB). The incorrect free space
+		 * value is not a problem because the index is only ever
+		 * allocated empty LEBs, so there will never be an attempt to
+		 * write to the free space at the end of an index LEB - except
+		 * by the in-the-gaps method for which it is not a problem.
+		 */
+		free = lp->free;
+		dirty = lp->dirty;
+	}
+
+	if (lp->free != free || lp->dirty != dirty)
+		goto out_print;
+
+	if (is_idx && !(lp->flags & LPROPS_INDEX)) {
+		if (free == c->leb_size)
+			/* Free but not unmapped LEB, it's fine */
+			is_idx = 0;
+		else {
+			ubifs_err(c, "indexing node without indexing flag");
+			goto out_print;
+		}
+	}
+
+	if (!is_idx && (lp->flags & LPROPS_INDEX)) {
+		ubifs_err(c, "data node with indexing flag");
+		goto out_print;
+	}
+
+	if (free == c->leb_size)
+		lst->empty_lebs += 1;
+
+	if (is_idx)
+		lst->idx_lebs += 1;
+
+	if (!(lp->flags & LPROPS_INDEX))
+		lst->total_used += c->leb_size - free - dirty;
+	lst->total_free += free;
+	lst->total_dirty += dirty;
+
+	if (!(lp->flags & LPROPS_INDEX)) {
+		int spc = free + dirty;
+
+		if (spc < c->dead_wm)
+			lst->total_dead += spc;
+		else
+			lst->total_dark += ubifs_calc_dark(c, spc);
+	}
+
+	ubifs_scan_destroy(sleb);
+	vfree(buf);
+	return LPT_SCAN_CONTINUE;
+
+out_print:
+	ubifs_err(c, "bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d",
+		  lnum, lp->free, lp->dirty, lp->flags, free, dirty);
+	ubifs_dump_leb(c, lnum);
+out_destroy:
+	ubifs_scan_destroy(sleb);
+	ret = -EINVAL;
+out:
+	vfree(buf);
+	return ret;
+}
+
+/**
+ * dbg_check_lprops - check all LEB properties.
+ * @c: UBIFS file-system description object
+ *
+ * This function checks all LEB properties and makes sure they are all correct.
+ * It returns zero if everything is fine, %-EINVAL if there is an inconsistency
+ * and other negative error codes in case of other errors. This function is
+ * called while the file system is locked (because of commit start), so no
+ * additional locking is required. Note that locking the LPT mutex would cause
+ * a circular lock dependency with the TNC mutex.
+ */
+int dbg_check_lprops(struct ubifs_info *c)
+{
+	int i, err;
+	struct ubifs_lp_stats lst;
+
+	if (!dbg_is_chk_lprops(c))
+		return 0;
+
+	/*
+	 * As we are going to scan the media, the write buffers have to be
+	 * synchronized.
+	 */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+
+	memset(&lst, 0, sizeof(struct ubifs_lp_stats));
+	err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
+				    (ubifs_lpt_scan_callback)scan_check_cb,
+				    &lst);
+	if (err && err != -ENOSPC)
+		goto out;
+
+	if (lst.empty_lebs != c->lst.empty_lebs ||
+	    lst.idx_lebs != c->lst.idx_lebs ||
+	    lst.total_free != c->lst.total_free ||
+	    lst.total_dirty != c->lst.total_dirty ||
+	    lst.total_used != c->lst.total_used) {
+		ubifs_err(c, "bad overall accounting");
+		ubifs_err(c, "calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
+			  lst.empty_lebs, lst.idx_lebs, lst.total_free,
+			  lst.total_dirty, lst.total_used);
+		ubifs_err(c, "read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
+			  c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
+			  c->lst.total_dirty, c->lst.total_used);
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (lst.total_dead != c->lst.total_dead ||
+	    lst.total_dark != c->lst.total_dark) {
+		ubifs_err(c, "bad dead/dark space accounting");
+		ubifs_err(c, "calculated: total_dead %lld, total_dark %lld",
+			  lst.total_dead, lst.total_dark);
+		ubifs_err(c, "read from lprops: total_dead %lld, total_dark %lld",
+			  c->lst.total_dead, c->lst.total_dark);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = dbg_check_cats(c);
+out:
+	return err;
+}
diff --git a/kernel/fs/ubifs/lpt.c b/kernel/fs/ubifs/lpt.c
new file mode 100644
index 000000000..dc9f27e9d
--- /dev/null
+++ b/kernel/fs/ubifs/lpt.c
@@ -0,0 +1,2278 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the LEB properties tree (LPT) area. The LPT area
+ * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and
+ * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits
+ * between the log and the orphan area.
+ *
+ * The LPT area is like a miniature self-contained file system. It is required
+ * that it never runs out of space, is fast to access and update, and scales
+ * logarithmically. The LEB properties tree is implemented as a wandering tree
+ * much like the TNC, and the LPT area has its own garbage collection.
+ *
+ * The LPT has two slightly different forms called the "small model" and the
+ * "big model". The small model is used when the entire LEB properties table
+ * can be written into a single eraseblock. In that case, garbage collection
+ * consists of just writing the whole table, which therefore makes all other
+ * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
+ * selected for garbage collection, which consists of marking the clean nodes in
+ * that LEB as dirty, and then only the dirty nodes are written out. Also, in
+ * the case of the big model, a table of LEB numbers is saved so that the entire
+ * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first
+ * mounted.
+ */
+
+#include "ubifs.h"
+#include <linux/crc16.h>
+#include <linux/math64.h>
+#include <linux/slab.h>
+
+/**
+ * do_calc_lpt_geom - calculate sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * Calculate the sizes of LPT bit fields, nodes, and tree, based on the
+ * properties of the flash and whether LPT is "big" (c->big_lpt).
+ */
+static void do_calc_lpt_geom(struct ubifs_info *c)
+{
+	int i, n, bits, per_leb_wastage, max_pnode_cnt;
+	long long sz, tot_wastage;
+
+	n = c->main_lebs + c->max_leb_cnt - c->leb_cnt;
+	max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
+
+	c->lpt_hght = 1;
+	n = UBIFS_LPT_FANOUT;
+	while (n < max_pnode_cnt) {
+		c->lpt_hght += 1;
+		n <<= UBIFS_LPT_FANOUT_SHIFT;
+	}
+
+	c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+
+	n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT);
+	c->nnode_cnt = n;
+	for (i = 1; i < c->lpt_hght; i++) {
+		n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
+		c->nnode_cnt += n;
+	}
+
+	c->space_bits = fls(c->leb_size) - 3;
+	c->lpt_lnum_bits = fls(c->lpt_lebs);
+	c->lpt_offs_bits = fls(c->leb_size - 1);
+	c->lpt_spc_bits = fls(c->leb_size);
+
+	n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT);
+	c->pcnt_bits = fls(n - 1);
+
+	c->lnum_bits = fls(c->max_leb_cnt - 1);
+
+	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+	       (c->big_lpt ? c->pcnt_bits : 0) +
+	       (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT;
+	c->pnode_sz = (bits + 7) / 8;
+
+	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+	       (c->big_lpt ? c->pcnt_bits : 0) +
+	       (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT;
+	c->nnode_sz = (bits + 7) / 8;
+
+	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+	       c->lpt_lebs * c->lpt_spc_bits * 2;
+	c->ltab_sz = (bits + 7) / 8;
+
+	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+	       c->lnum_bits * c->lsave_cnt;
+	c->lsave_sz = (bits + 7) / 8;
+
+	/* Calculate the minimum LPT size */
+	c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
+	c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
+	c->lpt_sz += c->ltab_sz;
+	if (c->big_lpt)
+		c->lpt_sz += c->lsave_sz;
+
+	/* Add wastage */
+	sz = c->lpt_sz;
+	per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz);
+	sz += per_leb_wastage;
+	tot_wastage = per_leb_wastage;
+	while (sz > c->leb_size) {
+		sz += per_leb_wastage;
+		sz -= c->leb_size;
+		tot_wastage += per_leb_wastage;
+	}
+	tot_wastage += ALIGN(sz, c->min_io_size) - sz;
+	c->lpt_sz += tot_wastage;
+}
+
+/**
+ * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_calc_lpt_geom(struct ubifs_info *c)
+{
+	int lebs_needed;
+	long long sz;
+
+	do_calc_lpt_geom(c);
+
+	/* Verify that lpt_lebs is big enough */
+	sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
+	lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
+	if (lebs_needed > c->lpt_lebs) {
+		ubifs_err(c, "too few LPT LEBs");
+		return -EINVAL;
+	}
+
+	/* Verify that ltab fits in a single LEB (since ltab is a single node */
+	if (c->ltab_sz > c->leb_size) {
+		ubifs_err(c, "LPT ltab too big");
+		return -EINVAL;
+	}
+
+	c->check_lpt_free = c->big_lpt;
+	return 0;
+}
+
+/**
+ * calc_dflt_lpt_geom - calculate default LPT geometry.
+ * @c: the UBIFS file-system description object
+ * @main_lebs: number of main area LEBs is passed and returned here
+ * @big_lpt: whether the LPT area is "big" is returned here
+ *
+ * The size of the LPT area depends on parameters that themselves are dependent
+ * on the size of the LPT area. This function, successively recalculates the LPT
+ * area geometry until the parameters and resultant geometry are consistent.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
+			      int *big_lpt)
+{
+	int i, lebs_needed;
+	long long sz;
+
+	/* Start by assuming the minimum number of LPT LEBs */
+	c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
+	c->main_lebs = *main_lebs - c->lpt_lebs;
+	if (c->main_lebs <= 0)
+		return -EINVAL;
+
+	/* And assume we will use the small LPT model */
+	c->big_lpt = 0;
+
+	/*
+	 * Calculate the geometry based on assumptions above and then see if it
+	 * makes sense
+	 */
+	do_calc_lpt_geom(c);
+
+	/* Small LPT model must have lpt_sz < leb_size */
+	if (c->lpt_sz > c->leb_size) {
+		/* Nope, so try again using big LPT model */
+		c->big_lpt = 1;
+		do_calc_lpt_geom(c);
+	}
+
+	/* Now check there are enough LPT LEBs */
+	for (i = 0; i < 64 ; i++) {
+		sz = c->lpt_sz * 4; /* Allow 4 times the size */
+		lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
+		if (lebs_needed > c->lpt_lebs) {
+			/* Not enough LPT LEBs so try again with more */
+			c->lpt_lebs = lebs_needed;
+			c->main_lebs = *main_lebs - c->lpt_lebs;
+			if (c->main_lebs <= 0)
+				return -EINVAL;
+			do_calc_lpt_geom(c);
+			continue;
+		}
+		if (c->ltab_sz > c->leb_size) {
+			ubifs_err(c, "LPT ltab too big");
+			return -EINVAL;
+		}
+		*main_lebs = c->main_lebs;
+		*big_lpt = c->big_lpt;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/**
+ * pack_bits - pack bit fields end-to-end.
+ * @addr: address at which to pack (passed and next address returned)
+ * @pos: bit position at which to pack (passed and next position returned)
+ * @val: value to pack
+ * @nrbits: number of bits of value to pack (1-32)
+ */
+static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits)
+{
+	uint8_t *p = *addr;
+	int b = *pos;
+
+	ubifs_assert(nrbits > 0);
+	ubifs_assert(nrbits <= 32);
+	ubifs_assert(*pos >= 0);
+	ubifs_assert(*pos < 8);
+	ubifs_assert((val >> nrbits) == 0 || nrbits == 32);
+	if (b) {
+		*p |= ((uint8_t)val) << b;
+		nrbits += b;
+		if (nrbits > 8) {
+			*++p = (uint8_t)(val >>= (8 - b));
+			if (nrbits > 16) {
+				*++p = (uint8_t)(val >>= 8);
+				if (nrbits > 24) {
+					*++p = (uint8_t)(val >>= 8);
+					if (nrbits > 32)
+						*++p = (uint8_t)(val >>= 8);
+				}
+			}
+		}
+	} else {
+		*p = (uint8_t)val;
+		if (nrbits > 8) {
+			*++p = (uint8_t)(val >>= 8);
+			if (nrbits > 16) {
+				*++p = (uint8_t)(val >>= 8);
+				if (nrbits > 24)
+					*++p = (uint8_t)(val >>= 8);
+			}
+		}
+	}
+	b = nrbits & 7;
+	if (b == 0)
+		p++;
+	*addr = p;
+	*pos = b;
+}
+
+/**
+ * ubifs_unpack_bits - unpack bit fields.
+ * @addr: address at which to unpack (passed and next address returned)
+ * @pos: bit position at which to unpack (passed and next position returned)
+ * @nrbits: number of bits of value to unpack (1-32)
+ *
+ * This functions returns the value unpacked.
+ */
+uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
+{
+	const int k = 32 - nrbits;
+	uint8_t *p = *addr;
+	int b = *pos;
+	uint32_t uninitialized_var(val);
+	const int bytes = (nrbits + b + 7) >> 3;
+
+	ubifs_assert(nrbits > 0);
+	ubifs_assert(nrbits <= 32);
+	ubifs_assert(*pos >= 0);
+	ubifs_assert(*pos < 8);
+	if (b) {
+		switch (bytes) {
+		case 2:
+			val = p[1];
+			break;
+		case 3:
+			val = p[1] | ((uint32_t)p[2] << 8);
+			break;
+		case 4:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+				     ((uint32_t)p[3] << 16);
+			break;
+		case 5:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+				     ((uint32_t)p[3] << 16) |
+				     ((uint32_t)p[4] << 24);
+		}
+		val <<= (8 - b);
+		val |= *p >> b;
+		nrbits += b;
+	} else {
+		switch (bytes) {
+		case 1:
+			val = p[0];
+			break;
+		case 2:
+			val = p[0] | ((uint32_t)p[1] << 8);
+			break;
+		case 3:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+				     ((uint32_t)p[2] << 16);
+			break;
+		case 4:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+				     ((uint32_t)p[2] << 16) |
+				     ((uint32_t)p[3] << 24);
+			break;
+		}
+	}
+	val <<= k;
+	val >>= k;
+	b = nrbits & 7;
+	p += nrbits >> 3;
+	*addr = p;
+	*pos = b;
+	ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32);
+	return val;
+}
+
+/**
+ * ubifs_pack_pnode - pack all the bit fields of a pnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @pnode: pnode to pack
+ */
+void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
+		      struct ubifs_pnode *pnode)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0;
+	uint16_t crc;
+
+	pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS);
+	if (c->big_lpt)
+		pack_bits(&addr, &pos, pnode->num, c->pcnt_bits);
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		pack_bits(&addr, &pos, pnode->lprops[i].free >> 3,
+			  c->space_bits);
+		pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3,
+			  c->space_bits);
+		if (pnode->lprops[i].flags & LPROPS_INDEX)
+			pack_bits(&addr, &pos, 1, 1);
+		else
+			pack_bits(&addr, &pos, 0, 1);
+	}
+	crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+		    c->pnode_sz - UBIFS_LPT_CRC_BYTES);
+	addr = buf;
+	pos = 0;
+	pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_nnode - pack all the bit fields of a nnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @nnode: nnode to pack
+ */
+void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
+		      struct ubifs_nnode *nnode)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0;
+	uint16_t crc;
+
+	pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS);
+	if (c->big_lpt)
+		pack_bits(&addr, &pos, nnode->num, c->pcnt_bits);
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		int lnum = nnode->nbranch[i].lnum;
+
+		if (lnum == 0)
+			lnum = c->lpt_last + 1;
+		pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits);
+		pack_bits(&addr, &pos, nnode->nbranch[i].offs,
+			  c->lpt_offs_bits);
+	}
+	crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+		    c->nnode_sz - UBIFS_LPT_CRC_BYTES);
+	addr = buf;
+	pos = 0;
+	pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_ltab - pack the LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @ltab: LPT's own lprops table to pack
+ */
+void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
+		     struct ubifs_lpt_lprops *ltab)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0;
+	uint16_t crc;
+
+	pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS);
+	for (i = 0; i < c->lpt_lebs; i++) {
+		pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits);
+		pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits);
+	}
+	crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+		    c->ltab_sz - UBIFS_LPT_CRC_BYTES);
+	addr = buf;
+	pos = 0;
+	pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_lsave - pack the LPT's save table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @lsave: LPT's save table to pack
+ */
+void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0;
+	uint16_t crc;
+
+	pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS);
+	for (i = 0; i < c->lsave_cnt; i++)
+		pack_bits(&addr, &pos, lsave[i], c->lnum_bits);
+	crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+		    c->lsave_sz - UBIFS_LPT_CRC_BYTES);
+	addr = buf;
+	pos = 0;
+	pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to which to add dirty space
+ * @dirty: amount of dirty space to add
+ */
+void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty)
+{
+	if (!dirty || !lnum)
+		return;
+	dbg_lp("LEB %d add %d to %d",
+	       lnum, dirty, c->ltab[lnum - c->lpt_first].dirty);
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	c->ltab[lnum - c->lpt_first].dirty += dirty;
+}
+
+/**
+ * set_ltab - set LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @free: amount of free space
+ * @dirty: amount of dirty space
+ */
+static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
+{
+	dbg_lp("LEB %d free %d dirty %d to %d %d",
+	       lnum, c->ltab[lnum - c->lpt_first].free,
+	       c->ltab[lnum - c->lpt_first].dirty, free, dirty);
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	c->ltab[lnum - c->lpt_first].free = free;
+	c->ltab[lnum - c->lpt_first].dirty = dirty;
+}
+
+/**
+ * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode for which to add dirt
+ */
+void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode)
+{
+	struct ubifs_nnode *np = nnode->parent;
+
+	if (np)
+		ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum,
+				   c->nnode_sz);
+	else {
+		ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz);
+		if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
+			c->lpt_drty_flgs |= LTAB_DIRTY;
+			ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
+		}
+	}
+}
+
+/**
+ * add_pnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode for which to add dirt
+ */
+static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
+			   c->pnode_sz);
+}
+
+/**
+ * calc_nnode_num - calculate nnode number.
+ * @row: the row in the tree (root is zero)
+ * @col: the column in the row (leftmost is zero)
+ *
+ * The nnode number is a number that uniquely identifies a nnode and can be used
+ * easily to traverse the tree from the root to that nnode.
+ *
+ * This function calculates and returns the nnode number for the nnode at @row
+ * and @col.
+ */
+static int calc_nnode_num(int row, int col)
+{
+	int num, bits;
+
+	num = 1;
+	while (row--) {
+		bits = (col & (UBIFS_LPT_FANOUT - 1));
+		col >>= UBIFS_LPT_FANOUT_SHIFT;
+		num <<= UBIFS_LPT_FANOUT_SHIFT;
+		num |= bits;
+	}
+	return num;
+}
+
+/**
+ * calc_nnode_num_from_parent - calculate nnode number.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * The nnode number is a number that uniquely identifies a nnode and can be used
+ * easily to traverse the tree from the root to that nnode.
+ *
+ * This function calculates and returns the nnode number based on the parent's
+ * nnode number and the index in parent.
+ */
+static int calc_nnode_num_from_parent(const struct ubifs_info *c,
+				      struct ubifs_nnode *parent, int iip)
+{
+	int num, shft;
+
+	if (!parent)
+		return 1;
+	shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT;
+	num = parent->num ^ (1 << shft);
+	num |= (UBIFS_LPT_FANOUT + iip) << shft;
+	return num;
+}
+
+/**
+ * calc_pnode_num_from_parent - calculate pnode number.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * The pnode number is a number that uniquely identifies a pnode and can be used
+ * easily to traverse the tree from the root to that pnode.
+ *
+ * This function calculates and returns the pnode number based on the parent's
+ * nnode number and the index in parent.
+ */
+static int calc_pnode_num_from_parent(const struct ubifs_info *c,
+				      struct ubifs_nnode *parent, int iip)
+{
+	int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
+
+	for (i = 0; i < n; i++) {
+		num <<= UBIFS_LPT_FANOUT_SHIFT;
+		num |= pnum & (UBIFS_LPT_FANOUT - 1);
+		pnum >>= UBIFS_LPT_FANOUT_SHIFT;
+	}
+	num <<= UBIFS_LPT_FANOUT_SHIFT;
+	num |= iip;
+	return num;
+}
+
+/**
+ * ubifs_create_dflt_lpt - create default LPT.
+ * @c: UBIFS file-system description object
+ * @main_lebs: number of main area LEBs is passed and returned here
+ * @lpt_first: LEB number of first LPT LEB
+ * @lpt_lebs: number of LEBs for LPT is passed and returned here
+ * @big_lpt: use big LPT model is passed and returned here
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
+			  int *lpt_lebs, int *big_lpt)
+{
+	int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row;
+	int blnum, boffs, bsz, bcnt;
+	struct ubifs_pnode *pnode = NULL;
+	struct ubifs_nnode *nnode = NULL;
+	void *buf = NULL, *p;
+	struct ubifs_lpt_lprops *ltab = NULL;
+	int *lsave = NULL;
+
+	err = calc_dflt_lpt_geom(c, main_lebs, big_lpt);
+	if (err)
+		return err;
+	*lpt_lebs = c->lpt_lebs;
+
+	/* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */
+	c->lpt_first = lpt_first;
+	/* Needed by 'set_ltab()' */
+	c->lpt_last = lpt_first + c->lpt_lebs - 1;
+	/* Needed by 'ubifs_pack_lsave()' */
+	c->main_first = c->leb_cnt - *main_lebs;
+
+	lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL);
+	pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL);
+	nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL);
+	buf = vmalloc(c->leb_size);
+	ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	if (!pnode || !nnode || !buf || !ltab || !lsave) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	ubifs_assert(!c->ltab);
+	c->ltab = ltab; /* Needed by set_ltab */
+
+	/* Initialize LPT's own lprops */
+	for (i = 0; i < c->lpt_lebs; i++) {
+		ltab[i].free = c->leb_size;
+		ltab[i].dirty = 0;
+		ltab[i].tgc = 0;
+		ltab[i].cmt = 0;
+	}
+
+	lnum = lpt_first;
+	p = buf;
+	/* Number of leaf nodes (pnodes) */
+	cnt = c->pnode_cnt;
+
+	/*
+	 * The first pnode contains the LEB properties for the LEBs that contain
+	 * the root inode node and the root index node of the index tree.
+	 */
+	node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8);
+	iopos = ALIGN(node_sz, c->min_io_size);
+	pnode->lprops[0].free = c->leb_size - iopos;
+	pnode->lprops[0].dirty = iopos - node_sz;
+	pnode->lprops[0].flags = LPROPS_INDEX;
+
+	node_sz = UBIFS_INO_NODE_SZ;
+	iopos = ALIGN(node_sz, c->min_io_size);
+	pnode->lprops[1].free = c->leb_size - iopos;
+	pnode->lprops[1].dirty = iopos - node_sz;
+
+	for (i = 2; i < UBIFS_LPT_FANOUT; i++)
+		pnode->lprops[i].free = c->leb_size;
+
+	/* Add first pnode */
+	ubifs_pack_pnode(c, p, pnode);
+	p += c->pnode_sz;
+	len = c->pnode_sz;
+	pnode->num += 1;
+
+	/* Reset pnode values for remaining pnodes */
+	pnode->lprops[0].free = c->leb_size;
+	pnode->lprops[0].dirty = 0;
+	pnode->lprops[0].flags = 0;
+
+	pnode->lprops[1].free = c->leb_size;
+	pnode->lprops[1].dirty = 0;
+
+	/*
+	 * To calculate the internal node branches, we keep information about
+	 * the level below.
+	 */
+	blnum = lnum; /* LEB number of level below */
+	boffs = 0; /* Offset of level below */
+	bcnt = cnt; /* Number of nodes in level below */
+	bsz = c->pnode_sz; /* Size of nodes in level below */
+
+	/* Add all remaining pnodes */
+	for (i = 1; i < cnt; i++) {
+		if (len + c->pnode_sz > c->leb_size) {
+			alen = ALIGN(len, c->min_io_size);
+			set_ltab(c, lnum, c->leb_size - alen, alen - len);
+			memset(p, 0xff, alen - len);
+			err = ubifs_leb_change(c, lnum++, buf, alen);
+			if (err)
+				goto out;
+			p = buf;
+			len = 0;
+		}
+		ubifs_pack_pnode(c, p, pnode);
+		p += c->pnode_sz;
+		len += c->pnode_sz;
+		/*
+		 * pnodes are simply numbered left to right starting at zero,
+		 * which means the pnode number can be used easily to traverse
+		 * down the tree to the corresponding pnode.
+		 */
+		pnode->num += 1;
+	}
+
+	row = 0;
+	for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT)
+		row += 1;
+	/* Add all nnodes, one level at a time */
+	while (1) {
+		/* Number of internal nodes (nnodes) at next level */
+		cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
+		for (i = 0; i < cnt; i++) {
+			if (len + c->nnode_sz > c->leb_size) {
+				alen = ALIGN(len, c->min_io_size);
+				set_ltab(c, lnum, c->leb_size - alen,
+					    alen - len);
+				memset(p, 0xff, alen - len);
+				err = ubifs_leb_change(c, lnum++, buf, alen);
+				if (err)
+					goto out;
+				p = buf;
+				len = 0;
+			}
+			/* Only 1 nnode at this level, so it is the root */
+			if (cnt == 1) {
+				c->lpt_lnum = lnum;
+				c->lpt_offs = len;
+			}
+			/* Set branches to the level below */
+			for (j = 0; j < UBIFS_LPT_FANOUT; j++) {
+				if (bcnt) {
+					if (boffs + bsz > c->leb_size) {
+						blnum += 1;
+						boffs = 0;
+					}
+					nnode->nbranch[j].lnum = blnum;
+					nnode->nbranch[j].offs = boffs;
+					boffs += bsz;
+					bcnt--;
+				} else {
+					nnode->nbranch[j].lnum = 0;
+					nnode->nbranch[j].offs = 0;
+				}
+			}
+			nnode->num = calc_nnode_num(row, i);
+			ubifs_pack_nnode(c, p, nnode);
+			p += c->nnode_sz;
+			len += c->nnode_sz;
+		}
+		/* Only 1 nnode at this level, so it is the root */
+		if (cnt == 1)
+			break;
+		/* Update the information about the level below */
+		bcnt = cnt;
+		bsz = c->nnode_sz;
+		row -= 1;
+	}
+
+	if (*big_lpt) {
+		/* Need to add LPT's save table */
+		if (len + c->lsave_sz > c->leb_size) {
+			alen = ALIGN(len, c->min_io_size);
+			set_ltab(c, lnum, c->leb_size - alen, alen - len);
+			memset(p, 0xff, alen - len);
+			err = ubifs_leb_change(c, lnum++, buf, alen);
+			if (err)
+				goto out;
+			p = buf;
+			len = 0;
+		}
+
+		c->lsave_lnum = lnum;
+		c->lsave_offs = len;
+
+		for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++)
+			lsave[i] = c->main_first + i;
+		for (; i < c->lsave_cnt; i++)
+			lsave[i] = c->main_first;
+
+		ubifs_pack_lsave(c, p, lsave);
+		p += c->lsave_sz;
+		len += c->lsave_sz;
+	}
+
+	/* Need to add LPT's own LEB properties table */
+	if (len + c->ltab_sz > c->leb_size) {
+		alen = ALIGN(len, c->min_io_size);
+		set_ltab(c, lnum, c->leb_size - alen, alen - len);
+		memset(p, 0xff, alen - len);
+		err = ubifs_leb_change(c, lnum++, buf, alen);
+		if (err)
+			goto out;
+		p = buf;
+		len = 0;
+	}
+
+	c->ltab_lnum = lnum;
+	c->ltab_offs = len;
+
+	/* Update ltab before packing it */
+	len += c->ltab_sz;
+	alen = ALIGN(len, c->min_io_size);
+	set_ltab(c, lnum, c->leb_size - alen, alen - len);
+
+	ubifs_pack_ltab(c, p, ltab);
+	p += c->ltab_sz;
+
+	/* Write remaining buffer */
+	memset(p, 0xff, alen - len);
+	err = ubifs_leb_change(c, lnum, buf, alen);
+	if (err)
+		goto out;
+
+	c->nhead_lnum = lnum;
+	c->nhead_offs = ALIGN(len, c->min_io_size);
+
+	dbg_lp("space_bits %d", c->space_bits);
+	dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
+	dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
+	dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
+	dbg_lp("pcnt_bits %d", c->pcnt_bits);
+	dbg_lp("lnum_bits %d", c->lnum_bits);
+	dbg_lp("pnode_sz %d", c->pnode_sz);
+	dbg_lp("nnode_sz %d", c->nnode_sz);
+	dbg_lp("ltab_sz %d", c->ltab_sz);
+	dbg_lp("lsave_sz %d", c->lsave_sz);
+	dbg_lp("lsave_cnt %d", c->lsave_cnt);
+	dbg_lp("lpt_hght %d", c->lpt_hght);
+	dbg_lp("big_lpt %d", c->big_lpt);
+	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+out:
+	c->ltab = NULL;
+	kfree(lsave);
+	vfree(ltab);
+	vfree(buf);
+	kfree(nnode);
+	kfree(pnode);
+	return err;
+}
+
+/**
+ * update_cats - add LEB properties of a pnode to LEB category lists and heaps.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode
+ *
+ * When a pnode is loaded into memory, the LEB properties it contains are added,
+ * by this function, to the LEB category lists and heaps.
+ */
+static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	int i;
+
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK;
+		int lnum = pnode->lprops[i].lnum;
+
+		if (!lnum)
+			return;
+		ubifs_add_to_cat(c, &pnode->lprops[i], cat);
+	}
+}
+
+/**
+ * replace_cats - add LEB properties of a pnode to LEB category lists and heaps.
+ * @c: UBIFS file-system description object
+ * @old_pnode: pnode copied
+ * @new_pnode: pnode copy
+ *
+ * During commit it is sometimes necessary to copy a pnode
+ * (see dirty_cow_pnode).  When that happens, references in
+ * category lists and heaps must be replaced.  This function does that.
+ */
+static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode,
+			 struct ubifs_pnode *new_pnode)
+{
+	int i;
+
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		if (!new_pnode->lprops[i].lnum)
+			return;
+		ubifs_replace_cat(c, &old_pnode->lprops[i],
+				  &new_pnode->lprops[i]);
+	}
+}
+
+/**
+ * check_lpt_crc - check LPT node crc is correct.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing node
+ * @len: length of node
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int check_lpt_crc(const struct ubifs_info *c, void *buf, int len)
+{
+	int pos = 0;
+	uint8_t *addr = buf;
+	uint16_t crc, calc_crc;
+
+	crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
+	calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+			 len - UBIFS_LPT_CRC_BYTES);
+	if (crc != calc_crc) {
+		ubifs_err(c, "invalid crc in LPT node: crc %hx calc %hx",
+			  crc, calc_crc);
+		dump_stack();
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * check_lpt_type - check LPT node type is correct.
+ * @c: UBIFS file-system description object
+ * @addr: address of type bit field is passed and returned updated here
+ * @pos: position of type bit field is passed and returned updated here
+ * @type: expected type
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int check_lpt_type(const struct ubifs_info *c, uint8_t **addr,
+			  int *pos, int type)
+{
+	int node_type;
+
+	node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS);
+	if (node_type != type) {
+		ubifs_err(c, "invalid type (%d) in LPT node type %d",
+			  node_type, type);
+		dump_stack();
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * unpack_pnode - unpack a pnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing packed pnode to unpack
+ * @pnode: pnode structure to fill
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_pnode(const struct ubifs_info *c, void *buf,
+			struct ubifs_pnode *pnode)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0, err;
+
+	err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_PNODE);
+	if (err)
+		return err;
+	if (c->big_lpt)
+		pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+		lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits);
+		lprops->free <<= 3;
+		lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits);
+		lprops->dirty <<= 3;
+
+		if (ubifs_unpack_bits(&addr, &pos, 1))
+			lprops->flags = LPROPS_INDEX;
+		else
+			lprops->flags = 0;
+		lprops->flags |= ubifs_categorize_lprops(c, lprops);
+	}
+	err = check_lpt_crc(c, buf, c->pnode_sz);
+	return err;
+}
+
+/**
+ * ubifs_unpack_nnode - unpack a nnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing packed nnode to unpack
+ * @nnode: nnode structure to fill
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
+		       struct ubifs_nnode *nnode)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0, err;
+
+	err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_NNODE);
+	if (err)
+		return err;
+	if (c->big_lpt)
+		nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		int lnum;
+
+		lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) +
+		       c->lpt_first;
+		if (lnum == c->lpt_last + 1)
+			lnum = 0;
+		nnode->nbranch[i].lnum = lnum;
+		nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos,
+						     c->lpt_offs_bits);
+	}
+	err = check_lpt_crc(c, buf, c->nnode_sz);
+	return err;
+}
+
+/**
+ * unpack_ltab - unpack the LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer from which to unpack
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_ltab(const struct ubifs_info *c, void *buf)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0, err;
+
+	err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_LTAB);
+	if (err)
+		return err;
+	for (i = 0; i < c->lpt_lebs; i++) {
+		int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
+		int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
+
+		if (free < 0 || free > c->leb_size || dirty < 0 ||
+		    dirty > c->leb_size || free + dirty > c->leb_size)
+			return -EINVAL;
+
+		c->ltab[i].free = free;
+		c->ltab[i].dirty = dirty;
+		c->ltab[i].tgc = 0;
+		c->ltab[i].cmt = 0;
+	}
+	err = check_lpt_crc(c, buf, c->ltab_sz);
+	return err;
+}
+
+/**
+ * unpack_lsave - unpack the LPT's save table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer from which to unpack
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_lsave(const struct ubifs_info *c, void *buf)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0, err;
+
+	err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_LSAVE);
+	if (err)
+		return err;
+	for (i = 0; i < c->lsave_cnt; i++) {
+		int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits);
+
+		if (lnum < c->main_first || lnum >= c->leb_cnt)
+			return -EINVAL;
+		c->lsave[i] = lnum;
+	}
+	err = check_lpt_crc(c, buf, c->lsave_sz);
+	return err;
+}
+
+/**
+ * validate_nnode - validate a nnode.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode to validate
+ * @parent: parent nnode (or NULL for the root nnode)
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
+			  struct ubifs_nnode *parent, int iip)
+{
+	int i, lvl, max_offs;
+
+	if (c->big_lpt) {
+		int num = calc_nnode_num_from_parent(c, parent, iip);
+
+		if (nnode->num != num)
+			return -EINVAL;
+	}
+	lvl = parent ? parent->level - 1 : c->lpt_hght;
+	if (lvl < 1)
+		return -EINVAL;
+	if (lvl == 1)
+		max_offs = c->leb_size - c->pnode_sz;
+	else
+		max_offs = c->leb_size - c->nnode_sz;
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		int lnum = nnode->nbranch[i].lnum;
+		int offs = nnode->nbranch[i].offs;
+
+		if (lnum == 0) {
+			if (offs != 0)
+				return -EINVAL;
+			continue;
+		}
+		if (lnum < c->lpt_first || lnum > c->lpt_last)
+			return -EINVAL;
+		if (offs < 0 || offs > max_offs)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * validate_pnode - validate a pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to validate
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
+			  struct ubifs_nnode *parent, int iip)
+{
+	int i;
+
+	if (c->big_lpt) {
+		int num = calc_pnode_num_from_parent(c, parent, iip);
+
+		if (pnode->num != num)
+			return -EINVAL;
+	}
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		int free = pnode->lprops[i].free;
+		int dirty = pnode->lprops[i].dirty;
+
+		if (free < 0 || free > c->leb_size || free % c->min_io_size ||
+		    (free & 7))
+			return -EINVAL;
+		if (dirty < 0 || dirty > c->leb_size || (dirty & 7))
+			return -EINVAL;
+		if (dirty + free > c->leb_size)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * set_pnode_lnum - set LEB numbers on a pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to update
+ *
+ * This function calculates the LEB numbers for the LEB properties it contains
+ * based on the pnode number.
+ */
+static void set_pnode_lnum(const struct ubifs_info *c,
+			   struct ubifs_pnode *pnode)
+{
+	int i, lnum;
+
+	lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first;
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		if (lnum >= c->leb_cnt)
+			return;
+		pnode->lprops[i].lnum = lnum++;
+	}
+}
+
+/**
+ * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (or NULL for the root)
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch = NULL;
+	struct ubifs_nnode *nnode = NULL;
+	void *buf = c->lpt_nod_buf;
+	int err, lnum, offs;
+
+	if (parent) {
+		branch = &parent->nbranch[iip];
+		lnum = branch->lnum;
+		offs = branch->offs;
+	} else {
+		lnum = c->lpt_lnum;
+		offs = c->lpt_offs;
+	}
+	nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+	if (!nnode) {
+		err = -ENOMEM;
+		goto out;
+	}
+	if (lnum == 0) {
+		/*
+		 * This nnode was not written which just means that the LEB
+		 * properties in the subtree below it describe empty LEBs. We
+		 * make the nnode as though we had read it, which in fact means
+		 * doing almost nothing.
+		 */
+		if (c->big_lpt)
+			nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+	} else {
+		err = ubifs_leb_read(c, lnum, buf, offs, c->nnode_sz, 1);
+		if (err)
+			goto out;
+		err = ubifs_unpack_nnode(c, buf, nnode);
+		if (err)
+			goto out;
+	}
+	err = validate_nnode(c, nnode, parent, iip);
+	if (err)
+		goto out;
+	if (!c->big_lpt)
+		nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+	if (parent) {
+		branch->nnode = nnode;
+		nnode->level = parent->level - 1;
+	} else {
+		c->nroot = nnode;
+		nnode->level = c->lpt_hght;
+	}
+	nnode->parent = parent;
+	nnode->iip = iip;
+	return 0;
+
+out:
+	ubifs_err(c, "error %d reading nnode at %d:%d", err, lnum, offs);
+	dump_stack();
+	kfree(nnode);
+	return err;
+}
+
+/**
+ * read_pnode - read a pnode from flash and link it to the tree in memory.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_pnode *pnode = NULL;
+	void *buf = c->lpt_nod_buf;
+	int err, lnum, offs;
+
+	branch = &parent->nbranch[iip];
+	lnum = branch->lnum;
+	offs = branch->offs;
+	pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+	if (!pnode)
+		return -ENOMEM;
+
+	if (lnum == 0) {
+		/*
+		 * This pnode was not written which just means that the LEB
+		 * properties in it describe empty LEBs. We make the pnode as
+		 * though we had read it.
+		 */
+		int i;
+
+		if (c->big_lpt)
+			pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+			struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+			lprops->free = c->leb_size;
+			lprops->flags = ubifs_categorize_lprops(c, lprops);
+		}
+	} else {
+		err = ubifs_leb_read(c, lnum, buf, offs, c->pnode_sz, 1);
+		if (err)
+			goto out;
+		err = unpack_pnode(c, buf, pnode);
+		if (err)
+			goto out;
+	}
+	err = validate_pnode(c, pnode, parent, iip);
+	if (err)
+		goto out;
+	if (!c->big_lpt)
+		pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+	branch->pnode = pnode;
+	pnode->parent = parent;
+	pnode->iip = iip;
+	set_pnode_lnum(c, pnode);
+	c->pnodes_have += 1;
+	return 0;
+
+out:
+	ubifs_err(c, "error %d reading pnode at %d:%d", err, lnum, offs);
+	ubifs_dump_pnode(c, pnode, parent, iip);
+	dump_stack();
+	ubifs_err(c, "calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
+	kfree(pnode);
+	return err;
+}
+
+/**
+ * read_ltab - read LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_ltab(struct ubifs_info *c)
+{
+	int err;
+	void *buf;
+
+	buf = vmalloc(c->ltab_sz);
+	if (!buf)
+		return -ENOMEM;
+	err = ubifs_leb_read(c, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz, 1);
+	if (err)
+		goto out;
+	err = unpack_ltab(c, buf);
+out:
+	vfree(buf);
+	return err;
+}
+
+/**
+ * read_lsave - read LPT's save table.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_lsave(struct ubifs_info *c)
+{
+	int err, i;
+	void *buf;
+
+	buf = vmalloc(c->lsave_sz);
+	if (!buf)
+		return -ENOMEM;
+	err = ubifs_leb_read(c, c->lsave_lnum, buf, c->lsave_offs,
+			     c->lsave_sz, 1);
+	if (err)
+		goto out;
+	err = unpack_lsave(c, buf);
+	if (err)
+		goto out;
+	for (i = 0; i < c->lsave_cnt; i++) {
+		int lnum = c->lsave[i];
+		struct ubifs_lprops *lprops;
+
+		/*
+		 * Due to automatic resizing, the values in the lsave table
+		 * could be beyond the volume size - just ignore them.
+		 */
+		if (lnum >= c->leb_cnt)
+			continue;
+		lprops = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lprops)) {
+			err = PTR_ERR(lprops);
+			goto out;
+		}
+	}
+out:
+	vfree(buf);
+	return err;
+}
+
+/**
+ * ubifs_get_nnode - get a nnode.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (or NULL for the root)
+ * @iip: index in parent
+ *
+ * This function returns a pointer to the nnode on success or a negative error
+ * code on failure.
+ */
+struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
+				    struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_nnode *nnode;
+	int err;
+
+	branch = &parent->nbranch[iip];
+	nnode = branch->nnode;
+	if (nnode)
+		return nnode;
+	err = ubifs_read_nnode(c, parent, iip);
+	if (err)
+		return ERR_PTR(err);
+	return branch->nnode;
+}
+
+/**
+ * ubifs_get_pnode - get a pnode.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns a pointer to the pnode on success or a negative error
+ * code on failure.
+ */
+struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
+				    struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_pnode *pnode;
+	int err;
+
+	branch = &parent->nbranch[iip];
+	pnode = branch->pnode;
+	if (pnode)
+		return pnode;
+	err = read_pnode(c, parent, iip);
+	if (err)
+		return ERR_PTR(err);
+	update_cats(c, branch->pnode);
+	return branch->pnode;
+}
+
+/**
+ * ubifs_lpt_lookup - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
+{
+	int err, i, h, iip, shft;
+	struct ubifs_nnode *nnode;
+	struct ubifs_pnode *pnode;
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return ERR_PTR(err);
+	}
+	nnode = c->nroot;
+	i = lnum - c->main_first;
+	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+	for (h = 1; h < c->lpt_hght; h++) {
+		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+		shft -= UBIFS_LPT_FANOUT_SHIFT;
+		nnode = ubifs_get_nnode(c, nnode, iip);
+		if (IS_ERR(nnode))
+			return ERR_CAST(nnode);
+	}
+	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+	pnode = ubifs_get_pnode(c, nnode, iip);
+	if (IS_ERR(pnode))
+		return ERR_CAST(pnode);
+	iip = (i & (UBIFS_LPT_FANOUT - 1));
+	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
+	       pnode->lprops[iip].free, pnode->lprops[iip].dirty,
+	       pnode->lprops[iip].flags);
+	return &pnode->lprops[iip];
+}
+
+/**
+ * dirty_cow_nnode - ensure a nnode is not being committed.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode to check
+ *
+ * Returns dirtied nnode on success or negative error code on failure.
+ */
+static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
+					   struct ubifs_nnode *nnode)
+{
+	struct ubifs_nnode *n;
+	int i;
+
+	if (!test_bit(COW_CNODE, &nnode->flags)) {
+		/* nnode is not being committed */
+		if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+			c->dirty_nn_cnt += 1;
+			ubifs_add_nnode_dirt(c, nnode);
+		}
+		return nnode;
+	}
+
+	/* nnode is being committed, so copy it */
+	n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+	if (unlikely(!n))
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(n, nnode, sizeof(struct ubifs_nnode));
+	n->cnext = NULL;
+	__set_bit(DIRTY_CNODE, &n->flags);
+	__clear_bit(COW_CNODE, &n->flags);
+
+	/* The children now have new parent */
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		struct ubifs_nbranch *branch = &n->nbranch[i];
+
+		if (branch->cnode)
+			branch->cnode->parent = n;
+	}
+
+	ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags));
+	__set_bit(OBSOLETE_CNODE, &nnode->flags);
+
+	c->dirty_nn_cnt += 1;
+	ubifs_add_nnode_dirt(c, nnode);
+	if (nnode->parent)
+		nnode->parent->nbranch[n->iip].nnode = n;
+	else
+		c->nroot = n;
+	return n;
+}
+
+/**
+ * dirty_cow_pnode - ensure a pnode is not being committed.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to check
+ *
+ * Returns dirtied pnode on success or negative error code on failure.
+ */
+static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
+					   struct ubifs_pnode *pnode)
+{
+	struct ubifs_pnode *p;
+
+	if (!test_bit(COW_CNODE, &pnode->flags)) {
+		/* pnode is not being committed */
+		if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
+			c->dirty_pn_cnt += 1;
+			add_pnode_dirt(c, pnode);
+		}
+		return pnode;
+	}
+
+	/* pnode is being committed, so copy it */
+	p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+	if (unlikely(!p))
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(p, pnode, sizeof(struct ubifs_pnode));
+	p->cnext = NULL;
+	__set_bit(DIRTY_CNODE, &p->flags);
+	__clear_bit(COW_CNODE, &p->flags);
+	replace_cats(c, pnode, p);
+
+	ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags));
+	__set_bit(OBSOLETE_CNODE, &pnode->flags);
+
+	c->dirty_pn_cnt += 1;
+	add_pnode_dirt(c, pnode);
+	pnode->parent->nbranch[p->iip].pnode = p;
+	return p;
+}
+
+/**
+ * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
+{
+	int err, i, h, iip, shft;
+	struct ubifs_nnode *nnode;
+	struct ubifs_pnode *pnode;
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return ERR_PTR(err);
+	}
+	nnode = c->nroot;
+	nnode = dirty_cow_nnode(c, nnode);
+	if (IS_ERR(nnode))
+		return ERR_CAST(nnode);
+	i = lnum - c->main_first;
+	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+	for (h = 1; h < c->lpt_hght; h++) {
+		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+		shft -= UBIFS_LPT_FANOUT_SHIFT;
+		nnode = ubifs_get_nnode(c, nnode, iip);
+		if (IS_ERR(nnode))
+			return ERR_CAST(nnode);
+		nnode = dirty_cow_nnode(c, nnode);
+		if (IS_ERR(nnode))
+			return ERR_CAST(nnode);
+	}
+	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+	pnode = ubifs_get_pnode(c, nnode, iip);
+	if (IS_ERR(pnode))
+		return ERR_CAST(pnode);
+	pnode = dirty_cow_pnode(c, pnode);
+	if (IS_ERR(pnode))
+		return ERR_CAST(pnode);
+	iip = (i & (UBIFS_LPT_FANOUT - 1));
+	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
+	       pnode->lprops[iip].free, pnode->lprops[iip].dirty,
+	       pnode->lprops[iip].flags);
+	ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags));
+	return &pnode->lprops[iip];
+}
+
+/**
+ * lpt_init_rd - initialize the LPT for reading.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_init_rd(struct ubifs_info *c)
+{
+	int err, i;
+
+	c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	if (!c->ltab)
+		return -ENOMEM;
+
+	i = max_t(int, c->nnode_sz, c->pnode_sz);
+	c->lpt_nod_buf = kmalloc(i, GFP_KERNEL);
+	if (!c->lpt_nod_buf)
+		return -ENOMEM;
+
+	for (i = 0; i < LPROPS_HEAP_CNT; i++) {
+		c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ,
+					     GFP_KERNEL);
+		if (!c->lpt_heap[i].arr)
+			return -ENOMEM;
+		c->lpt_heap[i].cnt = 0;
+		c->lpt_heap[i].max_cnt = LPT_HEAP_SZ;
+	}
+
+	c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL);
+	if (!c->dirty_idx.arr)
+		return -ENOMEM;
+	c->dirty_idx.cnt = 0;
+	c->dirty_idx.max_cnt = LPT_HEAP_SZ;
+
+	err = read_ltab(c);
+	if (err)
+		return err;
+
+	dbg_lp("space_bits %d", c->space_bits);
+	dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
+	dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
+	dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
+	dbg_lp("pcnt_bits %d", c->pcnt_bits);
+	dbg_lp("lnum_bits %d", c->lnum_bits);
+	dbg_lp("pnode_sz %d", c->pnode_sz);
+	dbg_lp("nnode_sz %d", c->nnode_sz);
+	dbg_lp("ltab_sz %d", c->ltab_sz);
+	dbg_lp("lsave_sz %d", c->lsave_sz);
+	dbg_lp("lsave_cnt %d", c->lsave_cnt);
+	dbg_lp("lpt_hght %d", c->lpt_hght);
+	dbg_lp("big_lpt %d", c->big_lpt);
+	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
+	return 0;
+}
+
+/**
+ * lpt_init_wr - initialize the LPT for writing.
+ * @c: UBIFS file-system description object
+ *
+ * 'lpt_init_rd()' must have been called already.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_init_wr(struct ubifs_info *c)
+{
+	int err, i;
+
+	c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	if (!c->ltab_cmt)
+		return -ENOMEM;
+
+	c->lpt_buf = vmalloc(c->leb_size);
+	if (!c->lpt_buf)
+		return -ENOMEM;
+
+	if (c->big_lpt) {
+		c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS);
+		if (!c->lsave)
+			return -ENOMEM;
+		err = read_lsave(c);
+		if (err)
+			return err;
+	}
+
+	for (i = 0; i < c->lpt_lebs; i++)
+		if (c->ltab[i].free == c->leb_size) {
+			err = ubifs_leb_unmap(c, i + c->lpt_first);
+			if (err)
+				return err;
+		}
+
+	return 0;
+}
+
+/**
+ * ubifs_lpt_init - initialize the LPT.
+ * @c: UBIFS file-system description object
+ * @rd: whether to initialize lpt for reading
+ * @wr: whether to initialize lpt for writing
+ *
+ * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true
+ * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is
+ * true.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
+{
+	int err;
+
+	if (rd) {
+		err = lpt_init_rd(c);
+		if (err)
+			goto out_err;
+	}
+
+	if (wr) {
+		err = lpt_init_wr(c);
+		if (err)
+			goto out_err;
+	}
+
+	return 0;
+
+out_err:
+	if (wr)
+		ubifs_lpt_free(c, 1);
+	if (rd)
+		ubifs_lpt_free(c, 0);
+	return err;
+}
+
+/**
+ * struct lpt_scan_node - somewhere to put nodes while we scan LPT.
+ * @nnode: where to keep a nnode
+ * @pnode: where to keep a pnode
+ * @cnode: where to keep a cnode
+ * @in_tree: is the node in the tree in memory
+ * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in
+ * the tree
+ * @ptr.pnode: ditto for pnode
+ * @ptr.cnode: ditto for cnode
+ */
+struct lpt_scan_node {
+	union {
+		struct ubifs_nnode nnode;
+		struct ubifs_pnode pnode;
+		struct ubifs_cnode cnode;
+	};
+	int in_tree;
+	union {
+		struct ubifs_nnode *nnode;
+		struct ubifs_pnode *pnode;
+		struct ubifs_cnode *cnode;
+	} ptr;
+};
+
+/**
+ * scan_get_nnode - for the scan, get a nnode from either the tree or flash.
+ * @c: the UBIFS file-system description object
+ * @path: where to put the nnode
+ * @parent: parent of the nnode
+ * @iip: index in parent of the nnode
+ *
+ * This function returns a pointer to the nnode on success or a negative error
+ * code on failure.
+ */
+static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
+					  struct lpt_scan_node *path,
+					  struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_nnode *nnode;
+	void *buf = c->lpt_nod_buf;
+	int err;
+
+	branch = &parent->nbranch[iip];
+	nnode = branch->nnode;
+	if (nnode) {
+		path->in_tree = 1;
+		path->ptr.nnode = nnode;
+		return nnode;
+	}
+	nnode = &path->nnode;
+	path->in_tree = 0;
+	path->ptr.nnode = nnode;
+	memset(nnode, 0, sizeof(struct ubifs_nnode));
+	if (branch->lnum == 0) {
+		/*
+		 * This nnode was not written which just means that the LEB
+		 * properties in the subtree below it describe empty LEBs. We
+		 * make the nnode as though we had read it, which in fact means
+		 * doing almost nothing.
+		 */
+		if (c->big_lpt)
+			nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+	} else {
+		err = ubifs_leb_read(c, branch->lnum, buf, branch->offs,
+				     c->nnode_sz, 1);
+		if (err)
+			return ERR_PTR(err);
+		err = ubifs_unpack_nnode(c, buf, nnode);
+		if (err)
+			return ERR_PTR(err);
+	}
+	err = validate_nnode(c, nnode, parent, iip);
+	if (err)
+		return ERR_PTR(err);
+	if (!c->big_lpt)
+		nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+	nnode->level = parent->level - 1;
+	nnode->parent = parent;
+	nnode->iip = iip;
+	return nnode;
+}
+
+/**
+ * scan_get_pnode - for the scan, get a pnode from either the tree or flash.
+ * @c: the UBIFS file-system description object
+ * @path: where to put the pnode
+ * @parent: parent of the pnode
+ * @iip: index in parent of the pnode
+ *
+ * This function returns a pointer to the pnode on success or a negative error
+ * code on failure.
+ */
+static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c,
+					  struct lpt_scan_node *path,
+					  struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_pnode *pnode;
+	void *buf = c->lpt_nod_buf;
+	int err;
+
+	branch = &parent->nbranch[iip];
+	pnode = branch->pnode;
+	if (pnode) {
+		path->in_tree = 1;
+		path->ptr.pnode = pnode;
+		return pnode;
+	}
+	pnode = &path->pnode;
+	path->in_tree = 0;
+	path->ptr.pnode = pnode;
+	memset(pnode, 0, sizeof(struct ubifs_pnode));
+	if (branch->lnum == 0) {
+		/*
+		 * This pnode was not written which just means that the LEB
+		 * properties in it describe empty LEBs. We make the pnode as
+		 * though we had read it.
+		 */
+		int i;
+
+		if (c->big_lpt)
+			pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+			struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+			lprops->free = c->leb_size;
+			lprops->flags = ubifs_categorize_lprops(c, lprops);
+		}
+	} else {
+		ubifs_assert(branch->lnum >= c->lpt_first &&
+			     branch->lnum <= c->lpt_last);
+		ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size);
+		err = ubifs_leb_read(c, branch->lnum, buf, branch->offs,
+				     c->pnode_sz, 1);
+		if (err)
+			return ERR_PTR(err);
+		err = unpack_pnode(c, buf, pnode);
+		if (err)
+			return ERR_PTR(err);
+	}
+	err = validate_pnode(c, pnode, parent, iip);
+	if (err)
+		return ERR_PTR(err);
+	if (!c->big_lpt)
+		pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+	pnode->parent = parent;
+	pnode->iip = iip;
+	set_pnode_lnum(c, pnode);
+	return pnode;
+}
+
+/**
+ * ubifs_lpt_scan_nolock - scan the LPT.
+ * @c: the UBIFS file-system description object
+ * @start_lnum: LEB number from which to start scanning
+ * @end_lnum: LEB number at which to stop scanning
+ * @scan_cb: callback function called for each lprops
+ * @data: data to be passed to the callback function
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
+			  ubifs_lpt_scan_callback scan_cb, void *data)
+{
+	int err = 0, i, h, iip, shft;
+	struct ubifs_nnode *nnode;
+	struct ubifs_pnode *pnode;
+	struct lpt_scan_node *path;
+
+	if (start_lnum == -1) {
+		start_lnum = end_lnum + 1;
+		if (start_lnum >= c->leb_cnt)
+			start_lnum = c->main_first;
+	}
+
+	ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt);
+	ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt);
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return err;
+	}
+
+	path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1),
+		       GFP_NOFS);
+	if (!path)
+		return -ENOMEM;
+
+	path[0].ptr.nnode = c->nroot;
+	path[0].in_tree = 1;
+again:
+	/* Descend to the pnode containing start_lnum */
+	nnode = c->nroot;
+	i = start_lnum - c->main_first;
+	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+	for (h = 1; h < c->lpt_hght; h++) {
+		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+		shft -= UBIFS_LPT_FANOUT_SHIFT;
+		nnode = scan_get_nnode(c, path + h, nnode, iip);
+		if (IS_ERR(nnode)) {
+			err = PTR_ERR(nnode);
+			goto out;
+		}
+	}
+	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+	pnode = scan_get_pnode(c, path + h, nnode, iip);
+	if (IS_ERR(pnode)) {
+		err = PTR_ERR(pnode);
+		goto out;
+	}
+	iip = (i & (UBIFS_LPT_FANOUT - 1));
+
+	/* Loop for each lprops */
+	while (1) {
+		struct ubifs_lprops *lprops = &pnode->lprops[iip];
+		int ret, lnum = lprops->lnum;
+
+		ret = scan_cb(c, lprops, path[h].in_tree, data);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret & LPT_SCAN_ADD) {
+			/* Add all the nodes in path to the tree in memory */
+			for (h = 1; h < c->lpt_hght; h++) {
+				const size_t sz = sizeof(struct ubifs_nnode);
+				struct ubifs_nnode *parent;
+
+				if (path[h].in_tree)
+					continue;
+				nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS);
+				if (!nnode) {
+					err = -ENOMEM;
+					goto out;
+				}
+				parent = nnode->parent;
+				parent->nbranch[nnode->iip].nnode = nnode;
+				path[h].ptr.nnode = nnode;
+				path[h].in_tree = 1;
+				path[h + 1].cnode.parent = nnode;
+			}
+			if (path[h].in_tree)
+				ubifs_ensure_cat(c, lprops);
+			else {
+				const size_t sz = sizeof(struct ubifs_pnode);
+				struct ubifs_nnode *parent;
+
+				pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS);
+				if (!pnode) {
+					err = -ENOMEM;
+					goto out;
+				}
+				parent = pnode->parent;
+				parent->nbranch[pnode->iip].pnode = pnode;
+				path[h].ptr.pnode = pnode;
+				path[h].in_tree = 1;
+				update_cats(c, pnode);
+				c->pnodes_have += 1;
+			}
+			err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)
+						  c->nroot, 0, 0);
+			if (err)
+				goto out;
+			err = dbg_check_cats(c);
+			if (err)
+				goto out;
+		}
+		if (ret & LPT_SCAN_STOP) {
+			err = 0;
+			break;
+		}
+		/* Get the next lprops */
+		if (lnum == end_lnum) {
+			/*
+			 * We got to the end without finding what we were
+			 * looking for
+			 */
+			err = -ENOSPC;
+			goto out;
+		}
+		if (lnum + 1 >= c->leb_cnt) {
+			/* Wrap-around to the beginning */
+			start_lnum = c->main_first;
+			goto again;
+		}
+		if (iip + 1 < UBIFS_LPT_FANOUT) {
+			/* Next lprops is in the same pnode */
+			iip += 1;
+			continue;
+		}
+		/* We need to get the next pnode. Go up until we can go right */
+		iip = pnode->iip;
+		while (1) {
+			h -= 1;
+			ubifs_assert(h >= 0);
+			nnode = path[h].ptr.nnode;
+			if (iip + 1 < UBIFS_LPT_FANOUT)
+				break;
+			iip = nnode->iip;
+		}
+		/* Go right */
+		iip += 1;
+		/* Descend to the pnode */
+		h += 1;
+		for (; h < c->lpt_hght; h++) {
+			nnode = scan_get_nnode(c, path + h, nnode, iip);
+			if (IS_ERR(nnode)) {
+				err = PTR_ERR(nnode);
+				goto out;
+			}
+			iip = 0;
+		}
+		pnode = scan_get_pnode(c, path + h, nnode, iip);
+		if (IS_ERR(pnode)) {
+			err = PTR_ERR(pnode);
+			goto out;
+		}
+		iip = 0;
+	}
+out:
+	kfree(path);
+	return err;
+}
+
+/**
+ * dbg_chk_pnode - check a pnode.
+ * @c: the UBIFS file-system description object
+ * @pnode: pnode to check
+ * @col: pnode column
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+			 int col)
+{
+	int i;
+
+	if (pnode->num != col) {
+		ubifs_err(c, "pnode num %d expected %d parent num %d iip %d",
+			  pnode->num, col, pnode->parent->num, pnode->iip);
+		return -EINVAL;
+	}
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		struct ubifs_lprops *lp, *lprops = &pnode->lprops[i];
+		int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i +
+			   c->main_first;
+		int found, cat = lprops->flags & LPROPS_CAT_MASK;
+		struct ubifs_lpt_heap *heap;
+		struct list_head *list = NULL;
+
+		if (lnum >= c->leb_cnt)
+			continue;
+		if (lprops->lnum != lnum) {
+			ubifs_err(c, "bad LEB number %d expected %d",
+				  lprops->lnum, lnum);
+			return -EINVAL;
+		}
+		if (lprops->flags & LPROPS_TAKEN) {
+			if (cat != LPROPS_UNCAT) {
+				ubifs_err(c, "LEB %d taken but not uncat %d",
+					  lprops->lnum, cat);
+				return -EINVAL;
+			}
+			continue;
+		}
+		if (lprops->flags & LPROPS_INDEX) {
+			switch (cat) {
+			case LPROPS_UNCAT:
+			case LPROPS_DIRTY_IDX:
+			case LPROPS_FRDI_IDX:
+				break;
+			default:
+				ubifs_err(c, "LEB %d index but cat %d",
+					  lprops->lnum, cat);
+				return -EINVAL;
+			}
+		} else {
+			switch (cat) {
+			case LPROPS_UNCAT:
+			case LPROPS_DIRTY:
+			case LPROPS_FREE:
+			case LPROPS_EMPTY:
+			case LPROPS_FREEABLE:
+				break;
+			default:
+				ubifs_err(c, "LEB %d not index but cat %d",
+					  lprops->lnum, cat);
+				return -EINVAL;
+			}
+		}
+		switch (cat) {
+		case LPROPS_UNCAT:
+			list = &c->uncat_list;
+			break;
+		case LPROPS_EMPTY:
+			list = &c->empty_list;
+			break;
+		case LPROPS_FREEABLE:
+			list = &c->freeable_list;
+			break;
+		case LPROPS_FRDI_IDX:
+			list = &c->frdi_idx_list;
+			break;
+		}
+		found = 0;
+		switch (cat) {
+		case LPROPS_DIRTY:
+		case LPROPS_DIRTY_IDX:
+		case LPROPS_FREE:
+			heap = &c->lpt_heap[cat - 1];
+			if (lprops->hpos < heap->cnt &&
+			    heap->arr[lprops->hpos] == lprops)
+				found = 1;
+			break;
+		case LPROPS_UNCAT:
+		case LPROPS_EMPTY:
+		case LPROPS_FREEABLE:
+		case LPROPS_FRDI_IDX:
+			list_for_each_entry(lp, list, list)
+				if (lprops == lp) {
+					found = 1;
+					break;
+				}
+			break;
+		}
+		if (!found) {
+			ubifs_err(c, "LEB %d cat %d not found in cat heap/list",
+				  lprops->lnum, cat);
+			return -EINVAL;
+		}
+		switch (cat) {
+		case LPROPS_EMPTY:
+			if (lprops->free != c->leb_size) {
+				ubifs_err(c, "LEB %d cat %d free %d dirty %d",
+					  lprops->lnum, cat, lprops->free,
+					  lprops->dirty);
+				return -EINVAL;
+			}
+			break;
+		case LPROPS_FREEABLE:
+		case LPROPS_FRDI_IDX:
+			if (lprops->free + lprops->dirty != c->leb_size) {
+				ubifs_err(c, "LEB %d cat %d free %d dirty %d",
+					  lprops->lnum, cat, lprops->free,
+					  lprops->dirty);
+				return -EINVAL;
+			}
+			break;
+		}
+	}
+	return 0;
+}
+
+/**
+ * dbg_check_lpt_nodes - check nnodes and pnodes.
+ * @c: the UBIFS file-system description object
+ * @cnode: next cnode (nnode or pnode) to check
+ * @row: row of cnode (root is zero)
+ * @col: column of cnode (leftmost is zero)
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
+			int row, int col)
+{
+	struct ubifs_nnode *nnode, *nn;
+	struct ubifs_cnode *cn;
+	int num, iip = 0, err;
+
+	if (!dbg_is_chk_lprops(c))
+		return 0;
+
+	while (cnode) {
+		ubifs_assert(row >= 0);
+		nnode = cnode->parent;
+		if (cnode->level) {
+			/* cnode is a nnode */
+			num = calc_nnode_num(row, col);
+			if (cnode->num != num) {
+				ubifs_err(c, "nnode num %d expected %d parent num %d iip %d",
+					  cnode->num, num,
+					  (nnode ? nnode->num : 0), cnode->iip);
+				return -EINVAL;
+			}
+			nn = (struct ubifs_nnode *)cnode;
+			while (iip < UBIFS_LPT_FANOUT) {
+				cn = nn->nbranch[iip].cnode;
+				if (cn) {
+					/* Go down */
+					row += 1;
+					col <<= UBIFS_LPT_FANOUT_SHIFT;
+					col += iip;
+					iip = 0;
+					cnode = cn;
+					break;
+				}
+				/* Go right */
+				iip += 1;
+			}
+			if (iip < UBIFS_LPT_FANOUT)
+				continue;
+		} else {
+			struct ubifs_pnode *pnode;
+
+			/* cnode is a pnode */
+			pnode = (struct ubifs_pnode *)cnode;
+			err = dbg_chk_pnode(c, pnode, col);
+			if (err)
+				return err;
+		}
+		/* Go up and to the right */
+		row -= 1;
+		col >>= UBIFS_LPT_FANOUT_SHIFT;
+		iip = cnode->iip + 1;
+		cnode = (struct ubifs_cnode *)nnode;
+	}
+	return 0;
+}
diff --git a/kernel/fs/ubifs/lpt_commit.c b/kernel/fs/ubifs/lpt_commit.c
new file mode 100644
index 000000000..ce89bdc3e
--- /dev/null
+++ b/kernel/fs/ubifs/lpt_commit.c
@@ -0,0 +1,2037 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements commit-related functionality of the LEB properties
+ * subsystem.
+ */
+
+#include <linux/crc16.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include "ubifs.h"
+
+static int dbg_populate_lsave(struct ubifs_info *c);
+
+/**
+ * first_dirty_cnode - find first dirty cnode.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode at which to start
+ *
+ * This function returns the first dirty cnode or %NULL if there is not one.
+ */
+static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode)
+{
+	ubifs_assert(nnode);
+	while (1) {
+		int i, cont = 0;
+
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+			struct ubifs_cnode *cnode;
+
+			cnode = nnode->nbranch[i].cnode;
+			if (cnode &&
+			    test_bit(DIRTY_CNODE, &cnode->flags)) {
+				if (cnode->level == 0)
+					return cnode;
+				nnode = (struct ubifs_nnode *)cnode;
+				cont = 1;
+				break;
+			}
+		}
+		if (!cont)
+			return (struct ubifs_cnode *)nnode;
+	}
+}
+
+/**
+ * next_dirty_cnode - find next dirty cnode.
+ * @cnode: cnode from which to begin searching
+ *
+ * This function returns the next dirty cnode or %NULL if there is not one.
+ */
+static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode)
+{
+	struct ubifs_nnode *nnode;
+	int i;
+
+	ubifs_assert(cnode);
+	nnode = cnode->parent;
+	if (!nnode)
+		return NULL;
+	for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) {
+		cnode = nnode->nbranch[i].cnode;
+		if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) {
+			if (cnode->level == 0)
+				return cnode; /* cnode is a pnode */
+			/* cnode is a nnode */
+			return first_dirty_cnode((struct ubifs_nnode *)cnode);
+		}
+	}
+	return (struct ubifs_cnode *)nnode;
+}
+
+/**
+ * get_cnodes_to_commit - create list of dirty cnodes to commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of cnodes to commit.
+ */
+static int get_cnodes_to_commit(struct ubifs_info *c)
+{
+	struct ubifs_cnode *cnode, *cnext;
+	int cnt = 0;
+
+	if (!c->nroot)
+		return 0;
+
+	if (!test_bit(DIRTY_CNODE, &c->nroot->flags))
+		return 0;
+
+	c->lpt_cnext = first_dirty_cnode(c->nroot);
+	cnode = c->lpt_cnext;
+	if (!cnode)
+		return 0;
+	cnt += 1;
+	while (1) {
+		ubifs_assert(!test_bit(COW_CNODE, &cnode->flags));
+		__set_bit(COW_CNODE, &cnode->flags);
+		cnext = next_dirty_cnode(cnode);
+		if (!cnext) {
+			cnode->cnext = c->lpt_cnext;
+			break;
+		}
+		cnode->cnext = cnext;
+		cnode = cnext;
+		cnt += 1;
+	}
+	dbg_cmt("committing %d cnodes", cnt);
+	dbg_lp("committing %d cnodes", cnt);
+	ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt);
+	return cnt;
+}
+
+/**
+ * upd_ltab - update LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @free: amount of free space
+ * @dirty: amount of dirty space to add
+ */
+static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
+{
+	dbg_lp("LEB %d free %d dirty %d to %d +%d",
+	       lnum, c->ltab[lnum - c->lpt_first].free,
+	       c->ltab[lnum - c->lpt_first].dirty, free, dirty);
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	c->ltab[lnum - c->lpt_first].free = free;
+	c->ltab[lnum - c->lpt_first].dirty += dirty;
+}
+
+/**
+ * alloc_lpt_leb - allocate an LPT LEB that is empty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number is passed and returned here
+ *
+ * This function finds the next empty LEB in the ltab starting from @lnum. If a
+ * an empty LEB is found it is returned in @lnum and the function returns %0.
+ * Otherwise the function returns -ENOSPC.  Note however, that LPT is designed
+ * never to run out of space.
+ */
+static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
+{
+	int i, n;
+
+	n = *lnum - c->lpt_first + 1;
+	for (i = n; i < c->lpt_lebs; i++) {
+		if (c->ltab[i].tgc || c->ltab[i].cmt)
+			continue;
+		if (c->ltab[i].free == c->leb_size) {
+			c->ltab[i].cmt = 1;
+			*lnum = i + c->lpt_first;
+			return 0;
+		}
+	}
+
+	for (i = 0; i < n; i++) {
+		if (c->ltab[i].tgc || c->ltab[i].cmt)
+			continue;
+		if (c->ltab[i].free == c->leb_size) {
+			c->ltab[i].cmt = 1;
+			*lnum = i + c->lpt_first;
+			return 0;
+		}
+	}
+	return -ENOSPC;
+}
+
+/**
+ * layout_cnodes - layout cnodes for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_cnodes(struct ubifs_info *c)
+{
+	int lnum, offs, len, alen, done_lsave, done_ltab, err;
+	struct ubifs_cnode *cnode;
+
+	err = dbg_chk_lpt_sz(c, 0, 0);
+	if (err)
+		return err;
+	cnode = c->lpt_cnext;
+	if (!cnode)
+		return 0;
+	lnum = c->nhead_lnum;
+	offs = c->nhead_offs;
+	/* Try to place lsave and ltab nicely */
+	done_lsave = !c->big_lpt;
+	done_ltab = 0;
+	if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
+		done_lsave = 1;
+		c->lsave_lnum = lnum;
+		c->lsave_offs = offs;
+		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+	}
+
+	if (offs + c->ltab_sz <= c->leb_size) {
+		done_ltab = 1;
+		c->ltab_lnum = lnum;
+		c->ltab_offs = offs;
+		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+	}
+
+	do {
+		if (cnode->level) {
+			len = c->nnode_sz;
+			c->dirty_nn_cnt -= 1;
+		} else {
+			len = c->pnode_sz;
+			c->dirty_pn_cnt -= 1;
+		}
+		while (offs + len > c->leb_size) {
+			alen = ALIGN(offs, c->min_io_size);
+			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = alloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			/* Try to place lsave and ltab nicely */
+			if (!done_lsave) {
+				done_lsave = 1;
+				c->lsave_lnum = lnum;
+				c->lsave_offs = offs;
+				offs += c->lsave_sz;
+				dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+				continue;
+			}
+			if (!done_ltab) {
+				done_ltab = 1;
+				c->ltab_lnum = lnum;
+				c->ltab_offs = offs;
+				offs += c->ltab_sz;
+				dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+				continue;
+			}
+			break;
+		}
+		if (cnode->parent) {
+			cnode->parent->nbranch[cnode->iip].lnum = lnum;
+			cnode->parent->nbranch[cnode->iip].offs = offs;
+		} else {
+			c->lpt_lnum = lnum;
+			c->lpt_offs = offs;
+		}
+		offs += len;
+		dbg_chk_lpt_sz(c, 1, len);
+		cnode = cnode->cnext;
+	} while (cnode && cnode != c->lpt_cnext);
+
+	/* Make sure to place LPT's save table */
+	if (!done_lsave) {
+		if (offs + c->lsave_sz > c->leb_size) {
+			alen = ALIGN(offs, c->min_io_size);
+			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = alloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+		}
+		done_lsave = 1;
+		c->lsave_lnum = lnum;
+		c->lsave_offs = offs;
+		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+	}
+
+	/* Make sure to place LPT's own lprops table */
+	if (!done_ltab) {
+		if (offs + c->ltab_sz > c->leb_size) {
+			alen = ALIGN(offs, c->min_io_size);
+			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = alloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+		}
+		c->ltab_lnum = lnum;
+		c->ltab_offs = offs;
+		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+	}
+
+	alen = ALIGN(offs, c->min_io_size);
+	upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+	dbg_chk_lpt_sz(c, 4, alen - offs);
+	err = dbg_chk_lpt_sz(c, 3, alen);
+	if (err)
+		return err;
+	return 0;
+
+no_space:
+	ubifs_err(c, "LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
+		  lnum, offs, len, done_ltab, done_lsave);
+	ubifs_dump_lpt_info(c);
+	ubifs_dump_lpt_lebs(c);
+	dump_stack();
+	return err;
+}
+
+/**
+ * realloc_lpt_leb - allocate an LPT LEB that is empty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number is passed and returned here
+ *
+ * This function duplicates exactly the results of the function alloc_lpt_leb.
+ * It is used during end commit to reallocate the same LEB numbers that were
+ * allocated by alloc_lpt_leb during start commit.
+ *
+ * This function finds the next LEB that was allocated by the alloc_lpt_leb
+ * function starting from @lnum. If a LEB is found it is returned in @lnum and
+ * the function returns %0. Otherwise the function returns -ENOSPC.
+ * Note however, that LPT is designed never to run out of space.
+ */
+static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
+{
+	int i, n;
+
+	n = *lnum - c->lpt_first + 1;
+	for (i = n; i < c->lpt_lebs; i++)
+		if (c->ltab[i].cmt) {
+			c->ltab[i].cmt = 0;
+			*lnum = i + c->lpt_first;
+			return 0;
+		}
+
+	for (i = 0; i < n; i++)
+		if (c->ltab[i].cmt) {
+			c->ltab[i].cmt = 0;
+			*lnum = i + c->lpt_first;
+			return 0;
+		}
+	return -ENOSPC;
+}
+
+/**
+ * write_cnodes - write cnodes for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int write_cnodes(struct ubifs_info *c)
+{
+	int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave;
+	struct ubifs_cnode *cnode;
+	void *buf = c->lpt_buf;
+
+	cnode = c->lpt_cnext;
+	if (!cnode)
+		return 0;
+	lnum = c->nhead_lnum;
+	offs = c->nhead_offs;
+	from = offs;
+	/* Ensure empty LEB is unmapped */
+	if (offs == 0) {
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	}
+	/* Try to place lsave and ltab nicely */
+	done_lsave = !c->big_lpt;
+	done_ltab = 0;
+	if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
+		done_lsave = 1;
+		ubifs_pack_lsave(c, buf + offs, c->lsave);
+		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+	}
+
+	if (offs + c->ltab_sz <= c->leb_size) {
+		done_ltab = 1;
+		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+	}
+
+	/* Loop for each cnode */
+	do {
+		if (cnode->level)
+			len = c->nnode_sz;
+		else
+			len = c->pnode_sz;
+		while (offs + len > c->leb_size) {
+			wlen = offs - from;
+			if (wlen) {
+				alen = ALIGN(wlen, c->min_io_size);
+				memset(buf + offs, 0xff, alen - wlen);
+				err = ubifs_leb_write(c, lnum, buf + from, from,
+						       alen);
+				if (err)
+					return err;
+			}
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+			/* Try to place lsave and ltab nicely */
+			if (!done_lsave) {
+				done_lsave = 1;
+				ubifs_pack_lsave(c, buf + offs, c->lsave);
+				offs += c->lsave_sz;
+				dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+				continue;
+			}
+			if (!done_ltab) {
+				done_ltab = 1;
+				ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+				offs += c->ltab_sz;
+				dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+				continue;
+			}
+			break;
+		}
+		if (cnode->level)
+			ubifs_pack_nnode(c, buf + offs,
+					 (struct ubifs_nnode *)cnode);
+		else
+			ubifs_pack_pnode(c, buf + offs,
+					 (struct ubifs_pnode *)cnode);
+		/*
+		 * The reason for the barriers is the same as in case of TNC.
+		 * See comment in 'write_index()'. 'dirty_cow_nnode()' and
+		 * 'dirty_cow_pnode()' are the functions for which this is
+		 * important.
+		 */
+		clear_bit(DIRTY_CNODE, &cnode->flags);
+		smp_mb__before_atomic();
+		clear_bit(COW_CNODE, &cnode->flags);
+		smp_mb__after_atomic();
+		offs += len;
+		dbg_chk_lpt_sz(c, 1, len);
+		cnode = cnode->cnext;
+	} while (cnode && cnode != c->lpt_cnext);
+
+	/* Make sure to place LPT's save table */
+	if (!done_lsave) {
+		if (offs + c->lsave_sz > c->leb_size) {
+			wlen = offs - from;
+			alen = ALIGN(wlen, c->min_io_size);
+			memset(buf + offs, 0xff, alen - wlen);
+			err = ubifs_leb_write(c, lnum, buf + from, from, alen);
+			if (err)
+				return err;
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		}
+		done_lsave = 1;
+		ubifs_pack_lsave(c, buf + offs, c->lsave);
+		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+	}
+
+	/* Make sure to place LPT's own lprops table */
+	if (!done_ltab) {
+		if (offs + c->ltab_sz > c->leb_size) {
+			wlen = offs - from;
+			alen = ALIGN(wlen, c->min_io_size);
+			memset(buf + offs, 0xff, alen - wlen);
+			err = ubifs_leb_write(c, lnum, buf + from, from, alen);
+			if (err)
+				return err;
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		}
+		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+	}
+
+	/* Write remaining data in buffer */
+	wlen = offs - from;
+	alen = ALIGN(wlen, c->min_io_size);
+	memset(buf + offs, 0xff, alen - wlen);
+	err = ubifs_leb_write(c, lnum, buf + from, from, alen);
+	if (err)
+		return err;
+
+	dbg_chk_lpt_sz(c, 4, alen - wlen);
+	err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size));
+	if (err)
+		return err;
+
+	c->nhead_lnum = lnum;
+	c->nhead_offs = ALIGN(offs, c->min_io_size);
+
+	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
+	return 0;
+
+no_space:
+	ubifs_err(c, "LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
+		  lnum, offs, len, done_ltab, done_lsave);
+	ubifs_dump_lpt_info(c);
+	ubifs_dump_lpt_lebs(c);
+	dump_stack();
+	return err;
+}
+
+/**
+ * next_pnode_to_dirty - find next pnode to dirty.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode
+ *
+ * This function returns the next pnode to dirty or %NULL if there are no more
+ * pnodes.  Note that pnodes that have never been written (lnum == 0) are
+ * skipped.
+ */
+static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
+					       struct ubifs_pnode *pnode)
+{
+	struct ubifs_nnode *nnode;
+	int iip;
+
+	/* Try to go right */
+	nnode = pnode->parent;
+	for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
+		if (nnode->nbranch[iip].lnum)
+			return ubifs_get_pnode(c, nnode, iip);
+	}
+
+	/* Go up while can't go right */
+	do {
+		iip = nnode->iip + 1;
+		nnode = nnode->parent;
+		if (!nnode)
+			return NULL;
+		for (; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+	} while (iip >= UBIFS_LPT_FANOUT);
+
+	/* Go right */
+	nnode = ubifs_get_nnode(c, nnode, iip);
+	if (IS_ERR(nnode))
+		return (void *)nnode;
+
+	/* Go down to level 1 */
+	while (nnode->level > 1) {
+		for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+		if (iip >= UBIFS_LPT_FANOUT) {
+			/*
+			 * Should not happen, but we need to keep going
+			 * if it does.
+			 */
+			iip = 0;
+		}
+		nnode = ubifs_get_nnode(c, nnode, iip);
+		if (IS_ERR(nnode))
+			return (void *)nnode;
+	}
+
+	for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
+		if (nnode->nbranch[iip].lnum)
+			break;
+	if (iip >= UBIFS_LPT_FANOUT)
+		/* Should not happen, but we need to keep going if it does */
+		iip = 0;
+	return ubifs_get_pnode(c, nnode, iip);
+}
+
+/**
+ * pnode_lookup - lookup a pnode in the LPT.
+ * @c: UBIFS file-system description object
+ * @i: pnode number (0 to main_lebs - 1)
+ *
+ * This function returns a pointer to the pnode on success or a negative
+ * error code on failure.
+ */
+static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
+{
+	int err, h, iip, shft;
+	struct ubifs_nnode *nnode;
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return ERR_PTR(err);
+	}
+	i <<= UBIFS_LPT_FANOUT_SHIFT;
+	nnode = c->nroot;
+	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+	for (h = 1; h < c->lpt_hght; h++) {
+		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+		shft -= UBIFS_LPT_FANOUT_SHIFT;
+		nnode = ubifs_get_nnode(c, nnode, iip);
+		if (IS_ERR(nnode))
+			return ERR_CAST(nnode);
+	}
+	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+	return ubifs_get_pnode(c, nnode, iip);
+}
+
+/**
+ * add_pnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode for which to add dirt
+ */
+static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
+			   c->pnode_sz);
+}
+
+/**
+ * do_make_pnode_dirty - mark a pnode dirty.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to mark dirty
+ */
+static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	/* Assumes cnext list is empty i.e. not called during commit */
+	if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
+		struct ubifs_nnode *nnode;
+
+		c->dirty_pn_cnt += 1;
+		add_pnode_dirt(c, pnode);
+		/* Mark parent and ancestors dirty too */
+		nnode = pnode->parent;
+		while (nnode) {
+			if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+				c->dirty_nn_cnt += 1;
+				ubifs_add_nnode_dirt(c, nnode);
+				nnode = nnode->parent;
+			} else
+				break;
+		}
+	}
+}
+
+/**
+ * make_tree_dirty - mark the entire LEB properties tree dirty.
+ * @c: UBIFS file-system description object
+ *
+ * This function is used by the "small" LPT model to cause the entire LEB
+ * properties tree to be written.  The "small" LPT model does not use LPT
+ * garbage collection because it is more efficient to write the entire tree
+ * (because it is small).
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_tree_dirty(struct ubifs_info *c)
+{
+	struct ubifs_pnode *pnode;
+
+	pnode = pnode_lookup(c, 0);
+	if (IS_ERR(pnode))
+		return PTR_ERR(pnode);
+
+	while (pnode) {
+		do_make_pnode_dirty(c, pnode);
+		pnode = next_pnode_to_dirty(c, pnode);
+		if (IS_ERR(pnode))
+			return PTR_ERR(pnode);
+	}
+	return 0;
+}
+
+/**
+ * need_write_all - determine if the LPT area is running out of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %1 if the LPT area is running out of free space and %0
+ * if it is not.
+ */
+static int need_write_all(struct ubifs_info *c)
+{
+	long long free = 0;
+	int i;
+
+	for (i = 0; i < c->lpt_lebs; i++) {
+		if (i + c->lpt_first == c->nhead_lnum)
+			free += c->leb_size - c->nhead_offs;
+		else if (c->ltab[i].free == c->leb_size)
+			free += c->leb_size;
+		else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size)
+			free += c->leb_size;
+	}
+	/* Less than twice the size left */
+	if (free <= c->lpt_sz * 2)
+		return 1;
+	return 0;
+}
+
+/**
+ * lpt_tgc_start - start trivial garbage collection of LPT LEBs.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial garbage collection is where a LPT LEB contains only dirty and
+ * free space and so may be reused as soon as the next commit is completed.
+ * This function is called during start commit to mark LPT LEBs for trivial GC.
+ */
+static void lpt_tgc_start(struct ubifs_info *c)
+{
+	int i;
+
+	for (i = 0; i < c->lpt_lebs; i++) {
+		if (i + c->lpt_first == c->nhead_lnum)
+			continue;
+		if (c->ltab[i].dirty > 0 &&
+		    c->ltab[i].free + c->ltab[i].dirty == c->leb_size) {
+			c->ltab[i].tgc = 1;
+			c->ltab[i].free = c->leb_size;
+			c->ltab[i].dirty = 0;
+			dbg_lp("LEB %d", i + c->lpt_first);
+		}
+	}
+}
+
+/**
+ * lpt_tgc_end - end trivial garbage collection of LPT LEBs.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial garbage collection is where a LPT LEB contains only dirty and
+ * free space and so may be reused as soon as the next commit is completed.
+ * This function is called after the commit is completed (master node has been
+ * written) and un-maps LPT LEBs that were marked for trivial GC.
+ */
+static int lpt_tgc_end(struct ubifs_info *c)
+{
+	int i, err;
+
+	for (i = 0; i < c->lpt_lebs; i++)
+		if (c->ltab[i].tgc) {
+			err = ubifs_leb_unmap(c, i + c->lpt_first);
+			if (err)
+				return err;
+			c->ltab[i].tgc = 0;
+			dbg_lp("LEB %d", i + c->lpt_first);
+		}
+	return 0;
+}
+
+/**
+ * populate_lsave - fill the lsave array with important LEB numbers.
+ * @c: the UBIFS file-system description object
+ *
+ * This function is only called for the "big" model. It records a small number
+ * of LEB numbers of important LEBs.  Important LEBs are ones that are (from
+ * most important to least important): empty, freeable, freeable index, dirty
+ * index, dirty or free. Upon mount, we read this list of LEB numbers and bring
+ * their pnodes into memory.  That will stop us from having to scan the LPT
+ * straight away. For the "small" model we assume that scanning the LPT is no
+ * big deal.
+ */
+static void populate_lsave(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+	int i, cnt = 0;
+
+	ubifs_assert(c->big_lpt);
+	if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
+		c->lpt_drty_flgs |= LSAVE_DIRTY;
+		ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
+	}
+
+	if (dbg_populate_lsave(c))
+		return;
+
+	list_for_each_entry(lprops, &c->empty_list, list) {
+		c->lsave[cnt++] = lprops->lnum;
+		if (cnt >= c->lsave_cnt)
+			return;
+	}
+	list_for_each_entry(lprops, &c->freeable_list, list) {
+		c->lsave[cnt++] = lprops->lnum;
+		if (cnt >= c->lsave_cnt)
+			return;
+	}
+	list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+		c->lsave[cnt++] = lprops->lnum;
+		if (cnt >= c->lsave_cnt)
+			return;
+	}
+	heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+	for (i = 0; i < heap->cnt; i++) {
+		c->lsave[cnt++] = heap->arr[i]->lnum;
+		if (cnt >= c->lsave_cnt)
+			return;
+	}
+	heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+	for (i = 0; i < heap->cnt; i++) {
+		c->lsave[cnt++] = heap->arr[i]->lnum;
+		if (cnt >= c->lsave_cnt)
+			return;
+	}
+	heap = &c->lpt_heap[LPROPS_FREE - 1];
+	for (i = 0; i < heap->cnt; i++) {
+		c->lsave[cnt++] = heap->arr[i]->lnum;
+		if (cnt >= c->lsave_cnt)
+			return;
+	}
+	/* Fill it up completely */
+	while (cnt < c->lsave_cnt)
+		c->lsave[cnt++] = c->main_first;
+}
+
+/**
+ * nnode_lookup - lookup a nnode in the LPT.
+ * @c: UBIFS file-system description object
+ * @i: nnode number
+ *
+ * This function returns a pointer to the nnode on success or a negative
+ * error code on failure.
+ */
+static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i)
+{
+	int err, iip;
+	struct ubifs_nnode *nnode;
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return ERR_PTR(err);
+	}
+	nnode = c->nroot;
+	while (1) {
+		iip = i & (UBIFS_LPT_FANOUT - 1);
+		i >>= UBIFS_LPT_FANOUT_SHIFT;
+		if (!i)
+			break;
+		nnode = ubifs_get_nnode(c, nnode, iip);
+		if (IS_ERR(nnode))
+			return nnode;
+	}
+	return nnode;
+}
+
+/**
+ * make_nnode_dirty - find a nnode and, if found, make it dirty.
+ * @c: UBIFS file-system description object
+ * @node_num: nnode number of nnode to make dirty
+ * @lnum: LEB number where nnode was written
+ * @offs: offset where nnode was written
+ *
+ * This function is used by LPT garbage collection.  LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1).  Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty.  The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum,
+			    int offs)
+{
+	struct ubifs_nnode *nnode;
+
+	nnode = nnode_lookup(c, node_num);
+	if (IS_ERR(nnode))
+		return PTR_ERR(nnode);
+	if (nnode->parent) {
+		struct ubifs_nbranch *branch;
+
+		branch = &nnode->parent->nbranch[nnode->iip];
+		if (branch->lnum != lnum || branch->offs != offs)
+			return 0; /* nnode is obsolete */
+	} else if (c->lpt_lnum != lnum || c->lpt_offs != offs)
+			return 0; /* nnode is obsolete */
+	/* Assumes cnext list is empty i.e. not called during commit */
+	if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+		c->dirty_nn_cnt += 1;
+		ubifs_add_nnode_dirt(c, nnode);
+		/* Mark parent and ancestors dirty too */
+		nnode = nnode->parent;
+		while (nnode) {
+			if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+				c->dirty_nn_cnt += 1;
+				ubifs_add_nnode_dirt(c, nnode);
+				nnode = nnode->parent;
+			} else
+				break;
+		}
+	}
+	return 0;
+}
+
+/**
+ * make_pnode_dirty - find a pnode and, if found, make it dirty.
+ * @c: UBIFS file-system description object
+ * @node_num: pnode number of pnode to make dirty
+ * @lnum: LEB number where pnode was written
+ * @offs: offset where pnode was written
+ *
+ * This function is used by LPT garbage collection.  LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1).  Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty.  The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum,
+			    int offs)
+{
+	struct ubifs_pnode *pnode;
+	struct ubifs_nbranch *branch;
+
+	pnode = pnode_lookup(c, node_num);
+	if (IS_ERR(pnode))
+		return PTR_ERR(pnode);
+	branch = &pnode->parent->nbranch[pnode->iip];
+	if (branch->lnum != lnum || branch->offs != offs)
+		return 0;
+	do_make_pnode_dirty(c, pnode);
+	return 0;
+}
+
+/**
+ * make_ltab_dirty - make ltab node dirty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number where ltab was written
+ * @offs: offset where ltab was written
+ *
+ * This function is used by LPT garbage collection.  LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1).  Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty.  The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+	if (lnum != c->ltab_lnum || offs != c->ltab_offs)
+		return 0; /* This ltab node is obsolete */
+	if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
+		c->lpt_drty_flgs |= LTAB_DIRTY;
+		ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
+	}
+	return 0;
+}
+
+/**
+ * make_lsave_dirty - make lsave node dirty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number where lsave was written
+ * @offs: offset where lsave was written
+ *
+ * This function is used by LPT garbage collection.  LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1).  Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty.  The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+	if (lnum != c->lsave_lnum || offs != c->lsave_offs)
+		return 0; /* This lsave node is obsolete */
+	if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
+		c->lpt_drty_flgs |= LSAVE_DIRTY;
+		ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
+	}
+	return 0;
+}
+
+/**
+ * make_node_dirty - make node dirty.
+ * @c: UBIFS file-system description object
+ * @node_type: LPT node type
+ * @node_num: node number
+ * @lnum: LEB number where node was written
+ * @offs: offset where node was written
+ *
+ * This function is used by LPT garbage collection.  LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1).  Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty.  The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
+			   int lnum, int offs)
+{
+	switch (node_type) {
+	case UBIFS_LPT_NNODE:
+		return make_nnode_dirty(c, node_num, lnum, offs);
+	case UBIFS_LPT_PNODE:
+		return make_pnode_dirty(c, node_num, lnum, offs);
+	case UBIFS_LPT_LTAB:
+		return make_ltab_dirty(c, lnum, offs);
+	case UBIFS_LPT_LSAVE:
+		return make_lsave_dirty(c, lnum, offs);
+	}
+	return -EINVAL;
+}
+
+/**
+ * get_lpt_node_len - return the length of a node based on its type.
+ * @c: UBIFS file-system description object
+ * @node_type: LPT node type
+ */
+static int get_lpt_node_len(const struct ubifs_info *c, int node_type)
+{
+	switch (node_type) {
+	case UBIFS_LPT_NNODE:
+		return c->nnode_sz;
+	case UBIFS_LPT_PNODE:
+		return c->pnode_sz;
+	case UBIFS_LPT_LTAB:
+		return c->ltab_sz;
+	case UBIFS_LPT_LSAVE:
+		return c->lsave_sz;
+	}
+	return 0;
+}
+
+/**
+ * get_pad_len - return the length of padding in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer
+ * @len: length of buffer
+ */
+static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len)
+{
+	int offs, pad_len;
+
+	if (c->min_io_size == 1)
+		return 0;
+	offs = c->leb_size - len;
+	pad_len = ALIGN(offs, c->min_io_size) - offs;
+	return pad_len;
+}
+
+/**
+ * get_lpt_node_type - return type (and node number) of a node in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer
+ * @node_num: node number is returned here
+ */
+static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf,
+			     int *node_num)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int pos = 0, node_type;
+
+	node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
+	*node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
+	return node_type;
+}
+
+/**
+ * is_a_node - determine if a buffer contains a node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer
+ * @len: length of buffer
+ *
+ * This function returns %1 if the buffer contains a node or %0 if it does not.
+ */
+static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int pos = 0, node_type, node_len;
+	uint16_t crc, calc_crc;
+
+	if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8)
+		return 0;
+	node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
+	if (node_type == UBIFS_LPT_NOT_A_NODE)
+		return 0;
+	node_len = get_lpt_node_len(c, node_type);
+	if (!node_len || node_len > len)
+		return 0;
+	pos = 0;
+	addr = buf;
+	crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
+	calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+			 node_len - UBIFS_LPT_CRC_BYTES);
+	if (crc != calc_crc)
+		return 0;
+	return 1;
+}
+
+/**
+ * lpt_gc_lnum - garbage collect a LPT LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to garbage collect
+ *
+ * LPT garbage collection is used only for the "big" LPT model
+ * (c->big_lpt == 1).  Garbage collection simply involves marking all the nodes
+ * in the LEB being garbage-collected as dirty.  The dirty nodes are written
+ * next commit, after which the LEB is free to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_gc_lnum(struct ubifs_info *c, int lnum)
+{
+	int err, len = c->leb_size, node_type, node_num, node_len, offs;
+	void *buf = c->lpt_buf;
+
+	dbg_lp("LEB %d", lnum);
+
+	err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1);
+	if (err)
+		return err;
+
+	while (1) {
+		if (!is_a_node(c, buf, len)) {
+			int pad_len;
+
+			pad_len = get_pad_len(c, buf, len);
+			if (pad_len) {
+				buf += pad_len;
+				len -= pad_len;
+				continue;
+			}
+			return 0;
+		}
+		node_type = get_lpt_node_type(c, buf, &node_num);
+		node_len = get_lpt_node_len(c, node_type);
+		offs = c->leb_size - len;
+		ubifs_assert(node_len != 0);
+		mutex_lock(&c->lp_mutex);
+		err = make_node_dirty(c, node_type, node_num, lnum, offs);
+		mutex_unlock(&c->lp_mutex);
+		if (err)
+			return err;
+		buf += node_len;
+		len -= node_len;
+	}
+	return 0;
+}
+
+/**
+ * lpt_gc - LPT garbage collection.
+ * @c: UBIFS file-system description object
+ *
+ * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'.
+ * Returns %0 on success and a negative error code on failure.
+ */
+static int lpt_gc(struct ubifs_info *c)
+{
+	int i, lnum = -1, dirty = 0;
+
+	mutex_lock(&c->lp_mutex);
+	for (i = 0; i < c->lpt_lebs; i++) {
+		ubifs_assert(!c->ltab[i].tgc);
+		if (i + c->lpt_first == c->nhead_lnum ||
+		    c->ltab[i].free + c->ltab[i].dirty == c->leb_size)
+			continue;
+		if (c->ltab[i].dirty > dirty) {
+			dirty = c->ltab[i].dirty;
+			lnum = i + c->lpt_first;
+		}
+	}
+	mutex_unlock(&c->lp_mutex);
+	if (lnum == -1)
+		return -ENOSPC;
+	return lpt_gc_lnum(c, lnum);
+}
+
+/**
+ * ubifs_lpt_start_commit - UBIFS commit starts.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called when UBIFS starts the commit operation.
+ * This function "freezes" all currently dirty LEB properties and does not
+ * change them anymore. Further changes are saved and tracked separately
+ * because they are not part of this commit. This function returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+int ubifs_lpt_start_commit(struct ubifs_info *c)
+{
+	int err, cnt;
+
+	dbg_lp("");
+
+	mutex_lock(&c->lp_mutex);
+	err = dbg_chk_lpt_free_spc(c);
+	if (err)
+		goto out;
+	err = dbg_check_ltab(c);
+	if (err)
+		goto out;
+
+	if (c->check_lpt_free) {
+		/*
+		 * We ensure there is enough free space in
+		 * ubifs_lpt_post_commit() by marking nodes dirty. That
+		 * information is lost when we unmount, so we also need
+		 * to check free space once after mounting also.
+		 */
+		c->check_lpt_free = 0;
+		while (need_write_all(c)) {
+			mutex_unlock(&c->lp_mutex);
+			err = lpt_gc(c);
+			if (err)
+				return err;
+			mutex_lock(&c->lp_mutex);
+		}
+	}
+
+	lpt_tgc_start(c);
+
+	if (!c->dirty_pn_cnt) {
+		dbg_cmt("no cnodes to commit");
+		err = 0;
+		goto out;
+	}
+
+	if (!c->big_lpt && need_write_all(c)) {
+		/* If needed, write everything */
+		err = make_tree_dirty(c);
+		if (err)
+			goto out;
+		lpt_tgc_start(c);
+	}
+
+	if (c->big_lpt)
+		populate_lsave(c);
+
+	cnt = get_cnodes_to_commit(c);
+	ubifs_assert(cnt != 0);
+
+	err = layout_cnodes(c);
+	if (err)
+		goto out;
+
+	/* Copy the LPT's own lprops for end commit to write */
+	memcpy(c->ltab_cmt, c->ltab,
+	       sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY);
+
+out:
+	mutex_unlock(&c->lp_mutex);
+	return err;
+}
+
+/**
+ * free_obsolete_cnodes - free obsolete cnodes for commit end.
+ * @c: UBIFS file-system description object
+ */
+static void free_obsolete_cnodes(struct ubifs_info *c)
+{
+	struct ubifs_cnode *cnode, *cnext;
+
+	cnext = c->lpt_cnext;
+	if (!cnext)
+		return;
+	do {
+		cnode = cnext;
+		cnext = cnode->cnext;
+		if (test_bit(OBSOLETE_CNODE, &cnode->flags))
+			kfree(cnode);
+		else
+			cnode->cnext = NULL;
+	} while (cnext != c->lpt_cnext);
+	c->lpt_cnext = NULL;
+}
+
+/**
+ * ubifs_lpt_end_commit - finish the commit operation.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called when the commit operation finishes. It
+ * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to
+ * the media. Returns zero in case of success and a negative error code in case
+ * of failure.
+ */
+int ubifs_lpt_end_commit(struct ubifs_info *c)
+{
+	int err;
+
+	dbg_lp("");
+
+	if (!c->lpt_cnext)
+		return 0;
+
+	err = write_cnodes(c);
+	if (err)
+		return err;
+
+	mutex_lock(&c->lp_mutex);
+	free_obsolete_cnodes(c);
+	mutex_unlock(&c->lp_mutex);
+
+	return 0;
+}
+
+/**
+ * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial GC is completed after a commit. Also LPT GC is done after a
+ * commit for the "big" LPT model.
+ */
+int ubifs_lpt_post_commit(struct ubifs_info *c)
+{
+	int err;
+
+	mutex_lock(&c->lp_mutex);
+	err = lpt_tgc_end(c);
+	if (err)
+		goto out;
+	if (c->big_lpt)
+		while (need_write_all(c)) {
+			mutex_unlock(&c->lp_mutex);
+			err = lpt_gc(c);
+			if (err)
+				return err;
+			mutex_lock(&c->lp_mutex);
+		}
+out:
+	mutex_unlock(&c->lp_mutex);
+	return err;
+}
+
+/**
+ * first_nnode - find the first nnode in memory.
+ * @c: UBIFS file-system description object
+ * @hght: height of tree where nnode found is returned here
+ *
+ * This function returns a pointer to the nnode found or %NULL if no nnode is
+ * found. This function is a helper to 'ubifs_lpt_free()'.
+ */
+static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght)
+{
+	struct ubifs_nnode *nnode;
+	int h, i, found;
+
+	nnode = c->nroot;
+	*hght = 0;
+	if (!nnode)
+		return NULL;
+	for (h = 1; h < c->lpt_hght; h++) {
+		found = 0;
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+			if (nnode->nbranch[i].nnode) {
+				found = 1;
+				nnode = nnode->nbranch[i].nnode;
+				*hght = h;
+				break;
+			}
+		}
+		if (!found)
+			break;
+	}
+	return nnode;
+}
+
+/**
+ * next_nnode - find the next nnode in memory.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode from which to start.
+ * @hght: height of tree where nnode is, is passed and returned here
+ *
+ * This function returns a pointer to the nnode found or %NULL if no nnode is
+ * found. This function is a helper to 'ubifs_lpt_free()'.
+ */
+static struct ubifs_nnode *next_nnode(struct ubifs_info *c,
+				      struct ubifs_nnode *nnode, int *hght)
+{
+	struct ubifs_nnode *parent;
+	int iip, h, i, found;
+
+	parent = nnode->parent;
+	if (!parent)
+		return NULL;
+	if (nnode->iip == UBIFS_LPT_FANOUT - 1) {
+		*hght -= 1;
+		return parent;
+	}
+	for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
+		nnode = parent->nbranch[iip].nnode;
+		if (nnode)
+			break;
+	}
+	if (!nnode) {
+		*hght -= 1;
+		return parent;
+	}
+	for (h = *hght + 1; h < c->lpt_hght; h++) {
+		found = 0;
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+			if (nnode->nbranch[i].nnode) {
+				found = 1;
+				nnode = nnode->nbranch[i].nnode;
+				*hght = h;
+				break;
+			}
+		}
+		if (!found)
+			break;
+	}
+	return nnode;
+}
+
+/**
+ * ubifs_lpt_free - free resources owned by the LPT.
+ * @c: UBIFS file-system description object
+ * @wr_only: free only resources used for writing
+ */
+void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
+{
+	struct ubifs_nnode *nnode;
+	int i, hght;
+
+	/* Free write-only things first */
+
+	free_obsolete_cnodes(c); /* Leftover from a failed commit */
+
+	vfree(c->ltab_cmt);
+	c->ltab_cmt = NULL;
+	vfree(c->lpt_buf);
+	c->lpt_buf = NULL;
+	kfree(c->lsave);
+	c->lsave = NULL;
+
+	if (wr_only)
+		return;
+
+	/* Now free the rest */
+
+	nnode = first_nnode(c, &hght);
+	while (nnode) {
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++)
+			kfree(nnode->nbranch[i].nnode);
+		nnode = next_nnode(c, nnode, &hght);
+	}
+	for (i = 0; i < LPROPS_HEAP_CNT; i++)
+		kfree(c->lpt_heap[i].arr);
+	kfree(c->dirty_idx.arr);
+	kfree(c->nroot);
+	vfree(c->ltab);
+	kfree(c->lpt_nod_buf);
+}
+
+/*
+ * Everything below is related to debugging.
+ */
+
+/**
+ * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
+ * @buf: buffer
+ * @len: buffer length
+ */
+static int dbg_is_all_ff(uint8_t *buf, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		if (buf[i] != 0xff)
+			return 0;
+	return 1;
+}
+
+/**
+ * dbg_is_nnode_dirty - determine if a nnode is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where nnode was written
+ * @offs: offset where nnode was written
+ */
+static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+	struct ubifs_nnode *nnode;
+	int hght;
+
+	/* Entire tree is in memory so first_nnode / next_nnode are OK */
+	nnode = first_nnode(c, &hght);
+	for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
+		struct ubifs_nbranch *branch;
+
+		cond_resched();
+		if (nnode->parent) {
+			branch = &nnode->parent->nbranch[nnode->iip];
+			if (branch->lnum != lnum || branch->offs != offs)
+				continue;
+			if (test_bit(DIRTY_CNODE, &nnode->flags))
+				return 1;
+			return 0;
+		} else {
+			if (c->lpt_lnum != lnum || c->lpt_offs != offs)
+				continue;
+			if (test_bit(DIRTY_CNODE, &nnode->flags))
+				return 1;
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/**
+ * dbg_is_pnode_dirty - determine if a pnode is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where pnode was written
+ * @offs: offset where pnode was written
+ */
+static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+	int i, cnt;
+
+	cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+	for (i = 0; i < cnt; i++) {
+		struct ubifs_pnode *pnode;
+		struct ubifs_nbranch *branch;
+
+		cond_resched();
+		pnode = pnode_lookup(c, i);
+		if (IS_ERR(pnode))
+			return PTR_ERR(pnode);
+		branch = &pnode->parent->nbranch[pnode->iip];
+		if (branch->lnum != lnum || branch->offs != offs)
+			continue;
+		if (test_bit(DIRTY_CNODE, &pnode->flags))
+			return 1;
+		return 0;
+	}
+	return 1;
+}
+
+/**
+ * dbg_is_ltab_dirty - determine if a ltab node is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where ltab node was written
+ * @offs: offset where ltab node was written
+ */
+static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+	if (lnum != c->ltab_lnum || offs != c->ltab_offs)
+		return 1;
+	return (c->lpt_drty_flgs & LTAB_DIRTY) != 0;
+}
+
+/**
+ * dbg_is_lsave_dirty - determine if a lsave node is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where lsave node was written
+ * @offs: offset where lsave node was written
+ */
+static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+	if (lnum != c->lsave_lnum || offs != c->lsave_offs)
+		return 1;
+	return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0;
+}
+
+/**
+ * dbg_is_node_dirty - determine if a node is dirty.
+ * @c: the UBIFS file-system description object
+ * @node_type: node type
+ * @lnum: LEB number where node was written
+ * @offs: offset where node was written
+ */
+static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum,
+			     int offs)
+{
+	switch (node_type) {
+	case UBIFS_LPT_NNODE:
+		return dbg_is_nnode_dirty(c, lnum, offs);
+	case UBIFS_LPT_PNODE:
+		return dbg_is_pnode_dirty(c, lnum, offs);
+	case UBIFS_LPT_LTAB:
+		return dbg_is_ltab_dirty(c, lnum, offs);
+	case UBIFS_LPT_LSAVE:
+		return dbg_is_lsave_dirty(c, lnum, offs);
+	}
+	return 1;
+}
+
+/**
+ * dbg_check_ltab_lnum - check the ltab for a LPT LEB number.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where node was written
+ * @offs: offset where node was written
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
+{
+	int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
+	int ret;
+	void *buf, *p;
+
+	if (!dbg_is_chk_lprops(c))
+		return 0;
+
+	buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err(c, "cannot allocate memory for ltab checking");
+		return 0;
+	}
+
+	dbg_lp("LEB %d", lnum);
+
+	err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1);
+	if (err)
+		goto out;
+
+	while (1) {
+		if (!is_a_node(c, p, len)) {
+			int i, pad_len;
+
+			pad_len = get_pad_len(c, p, len);
+			if (pad_len) {
+				p += pad_len;
+				len -= pad_len;
+				dirty += pad_len;
+				continue;
+			}
+			if (!dbg_is_all_ff(p, len)) {
+				ubifs_err(c, "invalid empty space in LEB %d at %d",
+					  lnum, c->leb_size - len);
+				err = -EINVAL;
+			}
+			i = lnum - c->lpt_first;
+			if (len != c->ltab[i].free) {
+				ubifs_err(c, "invalid free space in LEB %d (free %d, expected %d)",
+					  lnum, len, c->ltab[i].free);
+				err = -EINVAL;
+			}
+			if (dirty != c->ltab[i].dirty) {
+				ubifs_err(c, "invalid dirty space in LEB %d (dirty %d, expected %d)",
+					  lnum, dirty, c->ltab[i].dirty);
+				err = -EINVAL;
+			}
+			goto out;
+		}
+		node_type = get_lpt_node_type(c, p, &node_num);
+		node_len = get_lpt_node_len(c, node_type);
+		ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
+		if (ret == 1)
+			dirty += node_len;
+		p += node_len;
+		len -= node_len;
+	}
+
+	err = 0;
+out:
+	vfree(buf);
+	return err;
+}
+
+/**
+ * dbg_check_ltab - check the free and dirty space in the ltab.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_ltab(struct ubifs_info *c)
+{
+	int lnum, err, i, cnt;
+
+	if (!dbg_is_chk_lprops(c))
+		return 0;
+
+	/* Bring the entire tree into memory */
+	cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+	for (i = 0; i < cnt; i++) {
+		struct ubifs_pnode *pnode;
+
+		pnode = pnode_lookup(c, i);
+		if (IS_ERR(pnode))
+			return PTR_ERR(pnode);
+		cond_resched();
+	}
+
+	/* Check nodes */
+	err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0);
+	if (err)
+		return err;
+
+	/* Check each LEB */
+	for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
+		err = dbg_check_ltab_lnum(c, lnum);
+		if (err) {
+			ubifs_err(c, "failed at LEB %d", lnum);
+			return err;
+		}
+	}
+
+	dbg_lp("succeeded");
+	return 0;
+}
+
+/**
+ * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_chk_lpt_free_spc(struct ubifs_info *c)
+{
+	long long free = 0;
+	int i;
+
+	if (!dbg_is_chk_lprops(c))
+		return 0;
+
+	for (i = 0; i < c->lpt_lebs; i++) {
+		if (c->ltab[i].tgc || c->ltab[i].cmt)
+			continue;
+		if (i + c->lpt_first == c->nhead_lnum)
+			free += c->leb_size - c->nhead_offs;
+		else if (c->ltab[i].free == c->leb_size)
+			free += c->leb_size;
+	}
+	if (free < c->lpt_sz) {
+		ubifs_err(c, "LPT space error: free %lld lpt_sz %lld",
+			  free, c->lpt_sz);
+		ubifs_dump_lpt_info(c);
+		ubifs_dump_lpt_lebs(c);
+		dump_stack();
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
+ * @c: the UBIFS file-system description object
+ * @action: what to do
+ * @len: length written
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ * The @action argument may be one of:
+ *   o %0 - LPT debugging checking starts, initialize debugging variables;
+ *   o %1 - wrote an LPT node, increase LPT size by @len bytes;
+ *   o %2 - switched to a different LEB and wasted @len bytes;
+ *   o %3 - check that we've written the right number of bytes.
+ *   o %4 - wasted @len bytes;
+ */
+int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
+{
+	struct ubifs_debug_info *d = c->dbg;
+	long long chk_lpt_sz, lpt_sz;
+	int err = 0;
+
+	if (!dbg_is_chk_lprops(c))
+		return 0;
+
+	switch (action) {
+	case 0:
+		d->chk_lpt_sz = 0;
+		d->chk_lpt_sz2 = 0;
+		d->chk_lpt_lebs = 0;
+		d->chk_lpt_wastage = 0;
+		if (c->dirty_pn_cnt > c->pnode_cnt) {
+			ubifs_err(c, "dirty pnodes %d exceed max %d",
+				  c->dirty_pn_cnt, c->pnode_cnt);
+			err = -EINVAL;
+		}
+		if (c->dirty_nn_cnt > c->nnode_cnt) {
+			ubifs_err(c, "dirty nnodes %d exceed max %d",
+				  c->dirty_nn_cnt, c->nnode_cnt);
+			err = -EINVAL;
+		}
+		return err;
+	case 1:
+		d->chk_lpt_sz += len;
+		return 0;
+	case 2:
+		d->chk_lpt_sz += len;
+		d->chk_lpt_wastage += len;
+		d->chk_lpt_lebs += 1;
+		return 0;
+	case 3:
+		chk_lpt_sz = c->leb_size;
+		chk_lpt_sz *= d->chk_lpt_lebs;
+		chk_lpt_sz += len - c->nhead_offs;
+		if (d->chk_lpt_sz != chk_lpt_sz) {
+			ubifs_err(c, "LPT wrote %lld but space used was %lld",
+				  d->chk_lpt_sz, chk_lpt_sz);
+			err = -EINVAL;
+		}
+		if (d->chk_lpt_sz > c->lpt_sz) {
+			ubifs_err(c, "LPT wrote %lld but lpt_sz is %lld",
+				  d->chk_lpt_sz, c->lpt_sz);
+			err = -EINVAL;
+		}
+		if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
+			ubifs_err(c, "LPT layout size %lld but wrote %lld",
+				  d->chk_lpt_sz, d->chk_lpt_sz2);
+			err = -EINVAL;
+		}
+		if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
+			ubifs_err(c, "LPT new nhead offs: expected %d was %d",
+				  d->new_nhead_offs, len);
+			err = -EINVAL;
+		}
+		lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
+		lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
+		lpt_sz += c->ltab_sz;
+		if (c->big_lpt)
+			lpt_sz += c->lsave_sz;
+		if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
+			ubifs_err(c, "LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
+				  d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
+			err = -EINVAL;
+		}
+		if (err) {
+			ubifs_dump_lpt_info(c);
+			ubifs_dump_lpt_lebs(c);
+			dump_stack();
+		}
+		d->chk_lpt_sz2 = d->chk_lpt_sz;
+		d->chk_lpt_sz = 0;
+		d->chk_lpt_wastage = 0;
+		d->chk_lpt_lebs = 0;
+		d->new_nhead_offs = len;
+		return err;
+	case 4:
+		d->chk_lpt_sz += len;
+		d->chk_lpt_wastage += len;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * ubifs_dump_lpt_leb - dump an LPT LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to dump
+ *
+ * This function dumps an LEB from LPT area. Nodes in this area are very
+ * different to nodes in the main area (e.g., they do not have common headers,
+ * they do not have 8-byte alignments, etc), so we have a separate function to
+ * dump LPT area LEBs. Note, LPT has to be locked by the caller.
+ */
+static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
+{
+	int err, len = c->leb_size, node_type, node_num, node_len, offs;
+	void *buf, *p;
+
+	pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
+	buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err(c, "cannot allocate memory to dump LPT");
+		return;
+	}
+
+	err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1);
+	if (err)
+		goto out;
+
+	while (1) {
+		offs = c->leb_size - len;
+		if (!is_a_node(c, p, len)) {
+			int pad_len;
+
+			pad_len = get_pad_len(c, p, len);
+			if (pad_len) {
+				pr_err("LEB %d:%d, pad %d bytes\n",
+				       lnum, offs, pad_len);
+				p += pad_len;
+				len -= pad_len;
+				continue;
+			}
+			if (len)
+				pr_err("LEB %d:%d, free %d bytes\n",
+				       lnum, offs, len);
+			break;
+		}
+
+		node_type = get_lpt_node_type(c, p, &node_num);
+		switch (node_type) {
+		case UBIFS_LPT_PNODE:
+		{
+			node_len = c->pnode_sz;
+			if (c->big_lpt)
+				pr_err("LEB %d:%d, pnode num %d\n",
+				       lnum, offs, node_num);
+			else
+				pr_err("LEB %d:%d, pnode\n", lnum, offs);
+			break;
+		}
+		case UBIFS_LPT_NNODE:
+		{
+			int i;
+			struct ubifs_nnode nnode;
+
+			node_len = c->nnode_sz;
+			if (c->big_lpt)
+				pr_err("LEB %d:%d, nnode num %d, ",
+				       lnum, offs, node_num);
+			else
+				pr_err("LEB %d:%d, nnode, ",
+				       lnum, offs);
+			err = ubifs_unpack_nnode(c, p, &nnode);
+			if (err) {
+				pr_err("failed to unpack_node, error %d\n",
+				       err);
+				break;
+			}
+			for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+				pr_cont("%d:%d", nnode.nbranch[i].lnum,
+				       nnode.nbranch[i].offs);
+				if (i != UBIFS_LPT_FANOUT - 1)
+					pr_cont(", ");
+			}
+			pr_cont("\n");
+			break;
+		}
+		case UBIFS_LPT_LTAB:
+			node_len = c->ltab_sz;
+			pr_err("LEB %d:%d, ltab\n", lnum, offs);
+			break;
+		case UBIFS_LPT_LSAVE:
+			node_len = c->lsave_sz;
+			pr_err("LEB %d:%d, lsave len\n", lnum, offs);
+			break;
+		default:
+			ubifs_err(c, "LPT node type %d not recognized", node_type);
+			goto out;
+		}
+
+		p += node_len;
+		len -= node_len;
+	}
+
+	pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum);
+out:
+	vfree(buf);
+	return;
+}
+
+/**
+ * ubifs_dump_lpt_lebs - dump LPT lebs.
+ * @c: UBIFS file-system description object
+ *
+ * This function dumps all LPT LEBs. The caller has to make sure the LPT is
+ * locked.
+ */
+void ubifs_dump_lpt_lebs(const struct ubifs_info *c)
+{
+	int i;
+
+	pr_err("(pid %d) start dumping all LPT LEBs\n", current->pid);
+	for (i = 0; i < c->lpt_lebs; i++)
+		dump_lpt_leb(c, i + c->lpt_first);
+	pr_err("(pid %d) finish dumping all LPT LEBs\n", current->pid);
+}
+
+/**
+ * dbg_populate_lsave - debugging version of 'populate_lsave()'
+ * @c: UBIFS file-system description object
+ *
+ * This is a debugging version for 'populate_lsave()' which populates lsave
+ * with random LEBs instead of useful LEBs, which is good for test coverage.
+ * Returns zero if lsave has not been populated (this debugging feature is
+ * disabled) an non-zero if lsave has been populated.
+ */
+static int dbg_populate_lsave(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+	int i;
+
+	if (!dbg_is_chk_gen(c))
+		return 0;
+	if (prandom_u32() & 3)
+		return 0;
+
+	for (i = 0; i < c->lsave_cnt; i++)
+		c->lsave[i] = c->main_first;
+
+	list_for_each_entry(lprops, &c->empty_list, list)
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+	list_for_each_entry(lprops, &c->freeable_list, list)
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+	list_for_each_entry(lprops, &c->frdi_idx_list, list)
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+
+	heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+	for (i = 0; i < heap->cnt; i++)
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+	heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+	for (i = 0; i < heap->cnt; i++)
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+	heap = &c->lpt_heap[LPROPS_FREE - 1];
+	for (i = 0; i < heap->cnt; i++)
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+
+	return 1;
+}
diff --git a/kernel/fs/ubifs/master.c b/kernel/fs/ubifs/master.c
new file mode 100644
index 000000000..c6a5e39e2
--- /dev/null
+++ b/kernel/fs/ubifs/master.c
@@ -0,0 +1,395 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/* This file implements reading and writing the master node */
+
+#include "ubifs.h"
+
+/**
+ * scan_for_master - search the valid master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function scans the master node LEBs and search for the latest master
+ * node. Returns zero in case of success, %-EUCLEAN if there master area is
+ * corrupted and requires recovery, and a negative error code in case of
+ * failure.
+ */
+static int scan_for_master(struct ubifs_info *c)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	int lnum, offs = 0, nodes_cnt;
+
+	lnum = UBIFS_MST_LNUM;
+
+	sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
+	if (IS_ERR(sleb))
+		return PTR_ERR(sleb);
+	nodes_cnt = sleb->nodes_cnt;
+	if (nodes_cnt > 0) {
+		snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+				  list);
+		if (snod->type != UBIFS_MST_NODE)
+			goto out_dump;
+		memcpy(c->mst_node, snod->node, snod->len);
+		offs = snod->offs;
+	}
+	ubifs_scan_destroy(sleb);
+
+	lnum += 1;
+
+	sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
+	if (IS_ERR(sleb))
+		return PTR_ERR(sleb);
+	if (sleb->nodes_cnt != nodes_cnt)
+		goto out;
+	if (!sleb->nodes_cnt)
+		goto out;
+	snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
+	if (snod->type != UBIFS_MST_NODE)
+		goto out_dump;
+	if (snod->offs != offs)
+		goto out;
+	if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
+		   (void *)snod->node + UBIFS_CH_SZ,
+		   UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+		goto out;
+	c->mst_offs = offs;
+	ubifs_scan_destroy(sleb);
+	return 0;
+
+out:
+	ubifs_scan_destroy(sleb);
+	return -EUCLEAN;
+
+out_dump:
+	ubifs_err(c, "unexpected node type %d master LEB %d:%d",
+		  snod->type, lnum, snod->offs);
+	ubifs_scan_destroy(sleb);
+	return -EINVAL;
+}
+
+/**
+ * validate_master - validate master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function validates data which was read from master node. Returns zero
+ * if the data is all right and %-EINVAL if not.
+ */
+static int validate_master(const struct ubifs_info *c)
+{
+	long long main_sz;
+	int err;
+
+	if (c->max_sqnum >= SQNUM_WATERMARK) {
+		err = 1;
+		goto out;
+	}
+
+	if (c->cmt_no >= c->max_sqnum) {
+		err = 2;
+		goto out;
+	}
+
+	if (c->highest_inum >= INUM_WATERMARK) {
+		err = 3;
+		goto out;
+	}
+
+	if (c->lhead_lnum < UBIFS_LOG_LNUM ||
+	    c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs ||
+	    c->lhead_offs < 0 || c->lhead_offs >= c->leb_size ||
+	    c->lhead_offs & (c->min_io_size - 1)) {
+		err = 4;
+		goto out;
+	}
+
+	if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first ||
+	    c->zroot.offs >= c->leb_size || c->zroot.offs & 7) {
+		err = 5;
+		goto out;
+	}
+
+	if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len ||
+	    c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) {
+		err = 6;
+		goto out;
+	}
+
+	if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) {
+		err = 7;
+		goto out;
+	}
+
+	if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first ||
+	    c->ihead_offs % c->min_io_size || c->ihead_offs < 0 ||
+	    c->ihead_offs > c->leb_size || c->ihead_offs & 7) {
+		err = 8;
+		goto out;
+	}
+
+	main_sz = (long long)c->main_lebs * c->leb_size;
+	if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) {
+		err = 9;
+		goto out;
+	}
+
+	if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last ||
+	    c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) {
+		err = 10;
+		goto out;
+	}
+
+	if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last ||
+	    c->nhead_offs < 0 || c->nhead_offs % c->min_io_size ||
+	    c->nhead_offs > c->leb_size) {
+		err = 11;
+		goto out;
+	}
+
+	if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last ||
+	    c->ltab_offs < 0 ||
+	    c->ltab_offs + c->ltab_sz > c->leb_size) {
+		err = 12;
+		goto out;
+	}
+
+	if (c->big_lpt && (c->lsave_lnum < c->lpt_first ||
+	    c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 ||
+	    c->lsave_offs + c->lsave_sz > c->leb_size)) {
+		err = 13;
+		goto out;
+	}
+
+	if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) {
+		err = 14;
+		goto out;
+	}
+
+	if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) {
+		err = 15;
+		goto out;
+	}
+
+	if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) {
+		err = 16;
+		goto out;
+	}
+
+	if (c->lst.total_free < 0 || c->lst.total_free > main_sz ||
+	    c->lst.total_free & 7) {
+		err = 17;
+		goto out;
+	}
+
+	if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) {
+		err = 18;
+		goto out;
+	}
+
+	if (c->lst.total_used < 0 || (c->lst.total_used & 7)) {
+		err = 19;
+		goto out;
+	}
+
+	if (c->lst.total_free + c->lst.total_dirty +
+	    c->lst.total_used > main_sz) {
+		err = 20;
+		goto out;
+	}
+
+	if (c->lst.total_dead + c->lst.total_dark +
+	    c->lst.total_used + c->bi.old_idx_sz > main_sz) {
+		err = 21;
+		goto out;
+	}
+
+	if (c->lst.total_dead < 0 ||
+	    c->lst.total_dead > c->lst.total_free + c->lst.total_dirty ||
+	    c->lst.total_dead & 7) {
+		err = 22;
+		goto out;
+	}
+
+	if (c->lst.total_dark < 0 ||
+	    c->lst.total_dark > c->lst.total_free + c->lst.total_dirty ||
+	    c->lst.total_dark & 7) {
+		err = 23;
+		goto out;
+	}
+
+	return 0;
+
+out:
+	ubifs_err(c, "bad master node at offset %d error %d", c->mst_offs, err);
+	ubifs_dump_node(c, c->mst_node);
+	return -EINVAL;
+}
+
+/**
+ * ubifs_read_master - read master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function finds and reads the master node during file-system mount. If
+ * the flash is empty, it creates default master node as well. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+int ubifs_read_master(struct ubifs_info *c)
+{
+	int err, old_leb_cnt;
+
+	c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL);
+	if (!c->mst_node)
+		return -ENOMEM;
+
+	err = scan_for_master(c);
+	if (err) {
+		if (err == -EUCLEAN)
+			err = ubifs_recover_master_node(c);
+		if (err)
+			/*
+			 * Note, we do not free 'c->mst_node' here because the
+			 * unmount routine will take care of this.
+			 */
+			return err;
+	}
+
+	/* Make sure that the recovery flag is clear */
+	c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY);
+
+	c->max_sqnum       = le64_to_cpu(c->mst_node->ch.sqnum);
+	c->highest_inum    = le64_to_cpu(c->mst_node->highest_inum);
+	c->cmt_no          = le64_to_cpu(c->mst_node->cmt_no);
+	c->zroot.lnum      = le32_to_cpu(c->mst_node->root_lnum);
+	c->zroot.offs      = le32_to_cpu(c->mst_node->root_offs);
+	c->zroot.len       = le32_to_cpu(c->mst_node->root_len);
+	c->lhead_lnum      = le32_to_cpu(c->mst_node->log_lnum);
+	c->gc_lnum         = le32_to_cpu(c->mst_node->gc_lnum);
+	c->ihead_lnum      = le32_to_cpu(c->mst_node->ihead_lnum);
+	c->ihead_offs      = le32_to_cpu(c->mst_node->ihead_offs);
+	c->bi.old_idx_sz   = le64_to_cpu(c->mst_node->index_size);
+	c->lpt_lnum        = le32_to_cpu(c->mst_node->lpt_lnum);
+	c->lpt_offs        = le32_to_cpu(c->mst_node->lpt_offs);
+	c->nhead_lnum      = le32_to_cpu(c->mst_node->nhead_lnum);
+	c->nhead_offs      = le32_to_cpu(c->mst_node->nhead_offs);
+	c->ltab_lnum       = le32_to_cpu(c->mst_node->ltab_lnum);
+	c->ltab_offs       = le32_to_cpu(c->mst_node->ltab_offs);
+	c->lsave_lnum      = le32_to_cpu(c->mst_node->lsave_lnum);
+	c->lsave_offs      = le32_to_cpu(c->mst_node->lsave_offs);
+	c->lscan_lnum      = le32_to_cpu(c->mst_node->lscan_lnum);
+	c->lst.empty_lebs  = le32_to_cpu(c->mst_node->empty_lebs);
+	c->lst.idx_lebs    = le32_to_cpu(c->mst_node->idx_lebs);
+	old_leb_cnt        = le32_to_cpu(c->mst_node->leb_cnt);
+	c->lst.total_free  = le64_to_cpu(c->mst_node->total_free);
+	c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty);
+	c->lst.total_used  = le64_to_cpu(c->mst_node->total_used);
+	c->lst.total_dead  = le64_to_cpu(c->mst_node->total_dead);
+	c->lst.total_dark  = le64_to_cpu(c->mst_node->total_dark);
+
+	c->calc_idx_sz = c->bi.old_idx_sz;
+
+	if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
+		c->no_orphs = 1;
+
+	if (old_leb_cnt != c->leb_cnt) {
+		/* The file system has been resized */
+		int growth = c->leb_cnt - old_leb_cnt;
+
+		if (c->leb_cnt < old_leb_cnt ||
+		    c->leb_cnt < UBIFS_MIN_LEB_CNT) {
+			ubifs_err(c, "bad leb_cnt on master node");
+			ubifs_dump_node(c, c->mst_node);
+			return -EINVAL;
+		}
+
+		dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs",
+			old_leb_cnt, c->leb_cnt);
+		c->lst.empty_lebs += growth;
+		c->lst.total_free += growth * (long long)c->leb_size;
+		c->lst.total_dark += growth * (long long)c->dark_wm;
+
+		/*
+		 * Reflect changes back onto the master node. N.B. the master
+		 * node gets written immediately whenever mounting (or
+		 * remounting) in read-write mode, so we do not need to write it
+		 * here.
+		 */
+		c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt);
+		c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs);
+		c->mst_node->total_free = cpu_to_le64(c->lst.total_free);
+		c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark);
+	}
+
+	err = validate_master(c);
+	if (err)
+		return err;
+
+	err = dbg_old_index_check_init(c, &c->zroot);
+
+	return err;
+}
+
+/**
+ * ubifs_write_master - write master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes the master node. Returns zero in case of success and a
+ * negative error code in case of failure. The master node is written twice to
+ * enable recovery.
+ */
+int ubifs_write_master(struct ubifs_info *c)
+{
+	int err, lnum, offs, len;
+
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
+		return -EROFS;
+
+	lnum = UBIFS_MST_LNUM;
+	offs = c->mst_offs + c->mst_node_alsz;
+	len = UBIFS_MST_NODE_SZ;
+
+	if (offs + UBIFS_MST_NODE_SZ > c->leb_size) {
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+		offs = 0;
+	}
+
+	c->mst_offs = offs;
+	c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
+
+	err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
+	if (err)
+		return err;
+
+	lnum += 1;
+
+	if (offs == 0) {
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	}
+	err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
+
+	return err;
+}
diff --git a/kernel/fs/ubifs/misc.h b/kernel/fs/ubifs/misc.h
new file mode 100644
index 000000000..ee7cb5ebb
--- /dev/null
+++ b/kernel/fs/ubifs/misc.h
@@ -0,0 +1,303 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file contains miscellaneous helper functions.
+ */
+
+#ifndef __UBIFS_MISC_H__
+#define __UBIFS_MISC_H__
+
+/**
+ * ubifs_zn_dirty - check if znode is dirty.
+ * @znode: znode to check
+ *
+ * This helper function returns %1 if @znode is dirty and %0 otherwise.
+ */
+static inline int ubifs_zn_dirty(const struct ubifs_znode *znode)
+{
+	return !!test_bit(DIRTY_ZNODE, &znode->flags);
+}
+
+/**
+ * ubifs_zn_obsolete - check if znode is obsolete.
+ * @znode: znode to check
+ *
+ * This helper function returns %1 if @znode is obsolete and %0 otherwise.
+ */
+static inline int ubifs_zn_obsolete(const struct ubifs_znode *znode)
+{
+	return !!test_bit(OBSOLETE_ZNODE, &znode->flags);
+}
+
+/**
+ * ubifs_zn_cow - check if znode has to be copied on write.
+ * @znode: znode to check
+ *
+ * This helper function returns %1 if @znode is has COW flag set and %0
+ * otherwise.
+ */
+static inline int ubifs_zn_cow(const struct ubifs_znode *znode)
+{
+	return !!test_bit(COW_ZNODE, &znode->flags);
+}
+
+/**
+ * ubifs_wake_up_bgt - wake up background thread.
+ * @c: UBIFS file-system description object
+ */
+static inline void ubifs_wake_up_bgt(struct ubifs_info *c)
+{
+	if (c->bgt && !c->need_bgt) {
+		c->need_bgt = 1;
+		wake_up_process(c->bgt);
+	}
+}
+
+/**
+ * ubifs_tnc_find_child - find next child in znode.
+ * @znode: znode to search at
+ * @start: the zbranch index to start at
+ *
+ * This helper function looks for znode child starting at index @start. Returns
+ * the child or %NULL if no children were found.
+ */
+static inline struct ubifs_znode *
+ubifs_tnc_find_child(struct ubifs_znode *znode, int start)
+{
+	while (start < znode->child_cnt) {
+		if (znode->zbranch[start].znode)
+			return znode->zbranch[start].znode;
+		start += 1;
+	}
+
+	return NULL;
+}
+
+/**
+ * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object.
+ * @inode: the VFS 'struct inode' pointer
+ */
+static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
+{
+	return container_of(inode, struct ubifs_inode, vfs_inode);
+}
+
+/**
+ * ubifs_compr_present - check if compressor was compiled in.
+ * @compr_type: compressor type to check
+ *
+ * This function returns %1 of compressor of type @compr_type is present, and
+ * %0 if not.
+ */
+static inline int ubifs_compr_present(int compr_type)
+{
+	ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
+	return !!ubifs_compressors[compr_type]->capi_name;
+}
+
+/**
+ * ubifs_compr_name - get compressor name string by its type.
+ * @compr_type: compressor type
+ *
+ * This function returns compressor type string.
+ */
+static inline const char *ubifs_compr_name(int compr_type)
+{
+	ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
+	return ubifs_compressors[compr_type]->name;
+}
+
+/**
+ * ubifs_wbuf_sync - synchronize write-buffer.
+ * @wbuf: write-buffer to synchronize
+ *
+ * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume
+ * that the write-buffer is already locked.
+ */
+static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
+{
+	int err;
+
+	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	mutex_unlock(&wbuf->io_mutex);
+	return err;
+}
+
+/**
+ * ubifs_encode_dev - encode device node IDs.
+ * @dev: UBIFS device node information
+ * @rdev: device IDs to encode
+ *
+ * This is a helper function which encodes major/minor numbers of a device node
+ * into UBIFS device node description. We use standard Linux "new" and "huge"
+ * encodings.
+ */
+static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
+{
+	if (new_valid_dev(rdev)) {
+		dev->new = cpu_to_le32(new_encode_dev(rdev));
+		return sizeof(dev->new);
+	} else {
+		dev->huge = cpu_to_le64(huge_encode_dev(rdev));
+		return sizeof(dev->huge);
+	}
+}
+
+/**
+ * ubifs_add_dirt - add dirty space to LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to add dirty space for
+ * @dirty: dirty space to add
+ *
+ * This is a helper function which increased amount of dirty LEB space. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty)
+{
+	return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0);
+}
+
+/**
+ * ubifs_return_leb - return LEB to lprops.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to return
+ *
+ * This helper function cleans the "taken" flag of a logical eraseblock in the
+ * lprops. Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static inline int ubifs_return_leb(struct ubifs_info *c, int lnum)
+{
+	return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+				   LPROPS_TAKEN, 0);
+}
+
+/**
+ * ubifs_idx_node_sz - return index node size.
+ * @c: the UBIFS file-system description object
+ * @child_cnt: number of children of this index node
+ */
+static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt)
+{
+	return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt;
+}
+
+/**
+ * ubifs_idx_branch - return pointer to an index branch.
+ * @c: the UBIFS file-system description object
+ * @idx: index node
+ * @bnum: branch number
+ */
+static inline
+struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c,
+				      const struct ubifs_idx_node *idx,
+				      int bnum)
+{
+	return (struct ubifs_branch *)((void *)idx->branches +
+				       (UBIFS_BRANCH_SZ + c->key_len) * bnum);
+}
+
+/**
+ * ubifs_idx_key - return pointer to an index key.
+ * @c: the UBIFS file-system description object
+ * @idx: index node
+ */
+static inline void *ubifs_idx_key(const struct ubifs_info *c,
+				  const struct ubifs_idx_node *idx)
+{
+	return (void *)((struct ubifs_branch *)idx->branches)->key;
+}
+
+/**
+ * ubifs_current_time - round current time to time granularity.
+ * @inode: inode
+ */
+static inline struct timespec ubifs_current_time(struct inode *inode)
+{
+	return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
+		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+}
+
+/**
+ * ubifs_tnc_lookup - look up a file-system node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ *
+ * This function look up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure.
+ */
+static inline int ubifs_tnc_lookup(struct ubifs_info *c,
+				   const union ubifs_key *key, void *node)
+{
+	return ubifs_tnc_locate(c, key, node, NULL, NULL);
+}
+
+/**
+ * ubifs_get_lprops - get reference to LEB properties.
+ * @c: the UBIFS file-system description object
+ *
+ * This function locks lprops. Lprops have to be unlocked by
+ * 'ubifs_release_lprops()'.
+ */
+static inline void ubifs_get_lprops(struct ubifs_info *c)
+{
+	mutex_lock(&c->lp_mutex);
+}
+
+/**
+ * ubifs_release_lprops - release lprops lock.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called after each 'ubifs_get_lprops()' call to
+ * unlock lprops.
+ */
+static inline void ubifs_release_lprops(struct ubifs_info *c)
+{
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+	ubifs_assert(c->lst.empty_lebs >= 0 &&
+		     c->lst.empty_lebs <= c->main_lebs);
+	mutex_unlock(&c->lp_mutex);
+}
+
+/**
+ * ubifs_next_log_lnum - switch to the next log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: current log LEB
+ *
+ * This helper function returns the log LEB number which goes next after LEB
+ * 'lnum'.
+ */
+static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum)
+{
+	lnum += 1;
+	if (lnum > c->log_last)
+		lnum = UBIFS_LOG_LNUM;
+
+	return lnum;
+}
+
+#endif /* __UBIFS_MISC_H__ */
diff --git a/kernel/fs/ubifs/orphan.c b/kernel/fs/ubifs/orphan.c
new file mode 100644
index 000000000..caf2d123e
--- /dev/null
+++ b/kernel/fs/ubifs/orphan.c
@@ -0,0 +1,956 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Author: Adrian Hunter
+ */
+
+#include "ubifs.h"
+
+/*
+ * An orphan is an inode number whose inode node has been committed to the index
+ * with a link count of zero. That happens when an open file is deleted
+ * (unlinked) and then a commit is run. In the normal course of events the inode
+ * would be deleted when the file is closed. However in the case of an unclean
+ * unmount, orphans need to be accounted for. After an unclean unmount, the
+ * orphans' inodes must be deleted which means either scanning the entire index
+ * looking for them, or keeping a list on flash somewhere. This unit implements
+ * the latter approach.
+ *
+ * The orphan area is a fixed number of LEBs situated between the LPT area and
+ * the main area. The number of orphan area LEBs is specified when the file
+ * system is created. The minimum number is 1. The size of the orphan area
+ * should be so that it can hold the maximum number of orphans that are expected
+ * to ever exist at one time.
+ *
+ * The number of orphans that can fit in a LEB is:
+ *
+ *         (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)
+ *
+ * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough.
+ *
+ * Orphans are accumulated in a rb-tree. When an inode's link count drops to
+ * zero, the inode number is added to the rb-tree. It is removed from the tree
+ * when the inode is deleted.  Any new orphans that are in the orphan tree when
+ * the commit is run, are written to the orphan area in 1 or more orphan nodes.
+ * If the orphan area is full, it is consolidated to make space.  There is
+ * always enough space because validation prevents the user from creating more
+ * than the maximum number of orphans allowed.
+ */
+
+static int dbg_check_orphans(struct ubifs_info *c);
+
+/**
+ * ubifs_add_orphan - add an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * Add an orphan. This function is called when an inodes link count drops to
+ * zero.
+ */
+int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct ubifs_orphan *orphan, *o;
+	struct rb_node **p, *parent = NULL;
+
+	orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);
+	if (!orphan)
+		return -ENOMEM;
+	orphan->inum = inum;
+	orphan->new = 1;
+
+	spin_lock(&c->orphan_lock);
+	if (c->tot_orphans >= c->max_orphans) {
+		spin_unlock(&c->orphan_lock);
+		kfree(orphan);
+		return -ENFILE;
+	}
+	p = &c->orph_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		o = rb_entry(parent, struct ubifs_orphan, rb);
+		if (inum < o->inum)
+			p = &(*p)->rb_left;
+		else if (inum > o->inum)
+			p = &(*p)->rb_right;
+		else {
+			ubifs_err(c, "orphaned twice");
+			spin_unlock(&c->orphan_lock);
+			kfree(orphan);
+			return 0;
+		}
+	}
+	c->tot_orphans += 1;
+	c->new_orphans += 1;
+	rb_link_node(&orphan->rb, parent, p);
+	rb_insert_color(&orphan->rb, &c->orph_tree);
+	list_add_tail(&orphan->list, &c->orph_list);
+	list_add_tail(&orphan->new_list, &c->orph_new);
+	spin_unlock(&c->orphan_lock);
+	dbg_gen("ino %lu", (unsigned long)inum);
+	return 0;
+}
+
+/**
+ * ubifs_delete_orphan - delete an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * Delete an orphan. This function is called when an inode is deleted.
+ */
+void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct ubifs_orphan *o;
+	struct rb_node *p;
+
+	spin_lock(&c->orphan_lock);
+	p = c->orph_tree.rb_node;
+	while (p) {
+		o = rb_entry(p, struct ubifs_orphan, rb);
+		if (inum < o->inum)
+			p = p->rb_left;
+		else if (inum > o->inum)
+			p = p->rb_right;
+		else {
+			if (o->del) {
+				spin_unlock(&c->orphan_lock);
+				dbg_gen("deleted twice ino %lu",
+					(unsigned long)inum);
+				return;
+			}
+			if (o->cmt) {
+				o->del = 1;
+				o->dnext = c->orph_dnext;
+				c->orph_dnext = o;
+				spin_unlock(&c->orphan_lock);
+				dbg_gen("delete later ino %lu",
+					(unsigned long)inum);
+				return;
+			}
+			rb_erase(p, &c->orph_tree);
+			list_del(&o->list);
+			c->tot_orphans -= 1;
+			if (o->new) {
+				list_del(&o->new_list);
+				c->new_orphans -= 1;
+			}
+			spin_unlock(&c->orphan_lock);
+			kfree(o);
+			dbg_gen("inum %lu", (unsigned long)inum);
+			return;
+		}
+	}
+	spin_unlock(&c->orphan_lock);
+	ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum);
+	dump_stack();
+}
+
+/**
+ * ubifs_orphan_start_commit - start commit of orphans.
+ * @c: UBIFS file-system description object
+ *
+ * Start commit of orphans.
+ */
+int ubifs_orphan_start_commit(struct ubifs_info *c)
+{
+	struct ubifs_orphan *orphan, **last;
+
+	spin_lock(&c->orphan_lock);
+	last = &c->orph_cnext;
+	list_for_each_entry(orphan, &c->orph_new, new_list) {
+		ubifs_assert(orphan->new);
+		ubifs_assert(!orphan->cmt);
+		orphan->new = 0;
+		orphan->cmt = 1;
+		*last = orphan;
+		last = &orphan->cnext;
+	}
+	*last = NULL;
+	c->cmt_orphans = c->new_orphans;
+	c->new_orphans = 0;
+	dbg_cmt("%d orphans to commit", c->cmt_orphans);
+	INIT_LIST_HEAD(&c->orph_new);
+	if (c->tot_orphans == 0)
+		c->no_orphs = 1;
+	else
+		c->no_orphs = 0;
+	spin_unlock(&c->orphan_lock);
+	return 0;
+}
+
+/**
+ * avail_orphs - calculate available space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of orphans that can be written in the
+ * available space.
+ */
+static int avail_orphs(struct ubifs_info *c)
+{
+	int avail_lebs, avail, gap;
+
+	avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1;
+	avail = avail_lebs *
+	       ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
+	gap = c->leb_size - c->ohead_offs;
+	if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64))
+		avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
+	return avail;
+}
+
+/**
+ * tot_avail_orphs - calculate total space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of orphans that can be written in half
+ * the total space. That leaves half the space for adding new orphans.
+ */
+static int tot_avail_orphs(struct ubifs_info *c)
+{
+	int avail_lebs, avail;
+
+	avail_lebs = c->orph_lebs;
+	avail = avail_lebs *
+	       ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
+	return avail / 2;
+}
+
+/**
+ * do_write_orph_node - write a node to the orphan head.
+ * @c: UBIFS file-system description object
+ * @len: length of node
+ * @atomic: write atomically
+ *
+ * This function writes a node to the orphan head from the orphan buffer. If
+ * %atomic is not zero, then the write is done atomically. On success, %0 is
+ * returned, otherwise a negative error code is returned.
+ */
+static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
+{
+	int err = 0;
+
+	if (atomic) {
+		ubifs_assert(c->ohead_offs == 0);
+		ubifs_prepare_node(c, c->orph_buf, len, 1);
+		len = ALIGN(len, c->min_io_size);
+		err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len);
+	} else {
+		if (c->ohead_offs == 0) {
+			/* Ensure LEB has been unmapped */
+			err = ubifs_leb_unmap(c, c->ohead_lnum);
+			if (err)
+				return err;
+		}
+		err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum,
+				       c->ohead_offs);
+	}
+	return err;
+}
+
+/**
+ * write_orph_node - write an orphan node.
+ * @c: UBIFS file-system description object
+ * @atomic: write atomically
+ *
+ * This function builds an orphan node from the cnext list and writes it to the
+ * orphan head. On success, %0 is returned, otherwise a negative error code
+ * is returned.
+ */
+static int write_orph_node(struct ubifs_info *c, int atomic)
+{
+	struct ubifs_orphan *orphan, *cnext;
+	struct ubifs_orph_node *orph;
+	int gap, err, len, cnt, i;
+
+	ubifs_assert(c->cmt_orphans > 0);
+	gap = c->leb_size - c->ohead_offs;
+	if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) {
+		c->ohead_lnum += 1;
+		c->ohead_offs = 0;
+		gap = c->leb_size;
+		if (c->ohead_lnum > c->orph_last) {
+			/*
+			 * We limit the number of orphans so that this should
+			 * never happen.
+			 */
+			ubifs_err(c, "out of space in orphan area");
+			return -EINVAL;
+		}
+	}
+	cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
+	if (cnt > c->cmt_orphans)
+		cnt = c->cmt_orphans;
+	len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64);
+	ubifs_assert(c->orph_buf);
+	orph = c->orph_buf;
+	orph->ch.node_type = UBIFS_ORPH_NODE;
+	spin_lock(&c->orphan_lock);
+	cnext = c->orph_cnext;
+	for (i = 0; i < cnt; i++) {
+		orphan = cnext;
+		ubifs_assert(orphan->cmt);
+		orph->inos[i] = cpu_to_le64(orphan->inum);
+		orphan->cmt = 0;
+		cnext = orphan->cnext;
+		orphan->cnext = NULL;
+	}
+	c->orph_cnext = cnext;
+	c->cmt_orphans -= cnt;
+	spin_unlock(&c->orphan_lock);
+	if (c->cmt_orphans)
+		orph->cmt_no = cpu_to_le64(c->cmt_no);
+	else
+		/* Mark the last node of the commit */
+		orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63));
+	ubifs_assert(c->ohead_offs + len <= c->leb_size);
+	ubifs_assert(c->ohead_lnum >= c->orph_first);
+	ubifs_assert(c->ohead_lnum <= c->orph_last);
+	err = do_write_orph_node(c, len, atomic);
+	c->ohead_offs += ALIGN(len, c->min_io_size);
+	c->ohead_offs = ALIGN(c->ohead_offs, 8);
+	return err;
+}
+
+/**
+ * write_orph_nodes - write orphan nodes until there are no more to commit.
+ * @c: UBIFS file-system description object
+ * @atomic: write atomically
+ *
+ * This function writes orphan nodes for all the orphans to commit. On success,
+ * %0 is returned, otherwise a negative error code is returned.
+ */
+static int write_orph_nodes(struct ubifs_info *c, int atomic)
+{
+	int err;
+
+	while (c->cmt_orphans > 0) {
+		err = write_orph_node(c, atomic);
+		if (err)
+			return err;
+	}
+	if (atomic) {
+		int lnum;
+
+		/* Unmap any unused LEBs after consolidation */
+		for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+/**
+ * consolidate - consolidate the orphan area.
+ * @c: UBIFS file-system description object
+ *
+ * This function enables consolidation by putting all the orphans into the list
+ * to commit. The list is in the order that the orphans were added, and the
+ * LEBs are written atomically in order, so at no time can orphans be lost by
+ * an unclean unmount.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int consolidate(struct ubifs_info *c)
+{
+	int tot_avail = tot_avail_orphs(c), err = 0;
+
+	spin_lock(&c->orphan_lock);
+	dbg_cmt("there is space for %d orphans and there are %d",
+		tot_avail, c->tot_orphans);
+	if (c->tot_orphans - c->new_orphans <= tot_avail) {
+		struct ubifs_orphan *orphan, **last;
+		int cnt = 0;
+
+		/* Change the cnext list to include all non-new orphans */
+		last = &c->orph_cnext;
+		list_for_each_entry(orphan, &c->orph_list, list) {
+			if (orphan->new)
+				continue;
+			orphan->cmt = 1;
+			*last = orphan;
+			last = &orphan->cnext;
+			cnt += 1;
+		}
+		*last = NULL;
+		ubifs_assert(cnt == c->tot_orphans - c->new_orphans);
+		c->cmt_orphans = cnt;
+		c->ohead_lnum = c->orph_first;
+		c->ohead_offs = 0;
+	} else {
+		/*
+		 * We limit the number of orphans so that this should
+		 * never happen.
+		 */
+		ubifs_err(c, "out of space in orphan area");
+		err = -EINVAL;
+	}
+	spin_unlock(&c->orphan_lock);
+	return err;
+}
+
+/**
+ * commit_orphans - commit orphans.
+ * @c: UBIFS file-system description object
+ *
+ * This function commits orphans to flash. On success, %0 is returned,
+ * otherwise a negative error code is returned.
+ */
+static int commit_orphans(struct ubifs_info *c)
+{
+	int avail, atomic = 0, err;
+
+	ubifs_assert(c->cmt_orphans > 0);
+	avail = avail_orphs(c);
+	if (avail < c->cmt_orphans) {
+		/* Not enough space to write new orphans, so consolidate */
+		err = consolidate(c);
+		if (err)
+			return err;
+		atomic = 1;
+	}
+	err = write_orph_nodes(c, atomic);
+	return err;
+}
+
+/**
+ * erase_deleted - erase the orphans marked for deletion.
+ * @c: UBIFS file-system description object
+ *
+ * During commit, the orphans being committed cannot be deleted, so they are
+ * marked for deletion and deleted by this function. Also, the recovery
+ * adds killed orphans to the deletion list, and therefore they are deleted
+ * here too.
+ */
+static void erase_deleted(struct ubifs_info *c)
+{
+	struct ubifs_orphan *orphan, *dnext;
+
+	spin_lock(&c->orphan_lock);
+	dnext = c->orph_dnext;
+	while (dnext) {
+		orphan = dnext;
+		dnext = orphan->dnext;
+		ubifs_assert(!orphan->new);
+		ubifs_assert(orphan->del);
+		rb_erase(&orphan->rb, &c->orph_tree);
+		list_del(&orphan->list);
+		c->tot_orphans -= 1;
+		dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum);
+		kfree(orphan);
+	}
+	c->orph_dnext = NULL;
+	spin_unlock(&c->orphan_lock);
+}
+
+/**
+ * ubifs_orphan_end_commit - end commit of orphans.
+ * @c: UBIFS file-system description object
+ *
+ * End commit of orphans.
+ */
+int ubifs_orphan_end_commit(struct ubifs_info *c)
+{
+	int err;
+
+	if (c->cmt_orphans != 0) {
+		err = commit_orphans(c);
+		if (err)
+			return err;
+	}
+	erase_deleted(c);
+	err = dbg_check_orphans(c);
+	return err;
+}
+
+/**
+ * ubifs_clear_orphans - erase all LEBs used for orphans.
+ * @c: UBIFS file-system description object
+ *
+ * If recovery is not required, then the orphans from the previous session
+ * are not needed. This function locates the LEBs used to record
+ * orphans, and un-maps them.
+ */
+int ubifs_clear_orphans(struct ubifs_info *c)
+{
+	int lnum, err;
+
+	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	}
+	c->ohead_lnum = c->orph_first;
+	c->ohead_offs = 0;
+	return 0;
+}
+
+/**
+ * insert_dead_orphan - insert an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * This function is a helper to the 'do_kill_orphans()' function. The orphan
+ * must be kept until the next commit, so it is added to the rb-tree and the
+ * deletion list.
+ */
+static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct ubifs_orphan *orphan, *o;
+	struct rb_node **p, *parent = NULL;
+
+	orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL);
+	if (!orphan)
+		return -ENOMEM;
+	orphan->inum = inum;
+
+	p = &c->orph_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		o = rb_entry(parent, struct ubifs_orphan, rb);
+		if (inum < o->inum)
+			p = &(*p)->rb_left;
+		else if (inum > o->inum)
+			p = &(*p)->rb_right;
+		else {
+			/* Already added - no problem */
+			kfree(orphan);
+			return 0;
+		}
+	}
+	c->tot_orphans += 1;
+	rb_link_node(&orphan->rb, parent, p);
+	rb_insert_color(&orphan->rb, &c->orph_tree);
+	list_add_tail(&orphan->list, &c->orph_list);
+	orphan->del = 1;
+	orphan->dnext = c->orph_dnext;
+	c->orph_dnext = orphan;
+	dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum,
+		c->new_orphans, c->tot_orphans);
+	return 0;
+}
+
+/**
+ * do_kill_orphans - remove orphan inodes from the index.
+ * @c: UBIFS file-system description object
+ * @sleb: scanned LEB
+ * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
+ * @outofdate: whether the LEB is out of date is returned here
+ * @last_flagged: whether the end orphan node is encountered
+ *
+ * This function is a helper to the 'kill_orphans()' function. It goes through
+ * every orphan node in a LEB and for every inode number recorded, removes
+ * all keys for that inode from the TNC.
+ */
+static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+			   unsigned long long *last_cmt_no, int *outofdate,
+			   int *last_flagged)
+{
+	struct ubifs_scan_node *snod;
+	struct ubifs_orph_node *orph;
+	unsigned long long cmt_no;
+	ino_t inum;
+	int i, n, err, first = 1;
+
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		if (snod->type != UBIFS_ORPH_NODE) {
+			ubifs_err(c, "invalid node type %d in orphan area at %d:%d",
+				  snod->type, sleb->lnum, snod->offs);
+			ubifs_dump_node(c, snod->node);
+			return -EINVAL;
+		}
+
+		orph = snod->node;
+
+		/* Check commit number */
+		cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX;
+		/*
+		 * The commit number on the master node may be less, because
+		 * of a failed commit. If there are several failed commits in a
+		 * row, the commit number written on orphan nodes will continue
+		 * to increase (because the commit number is adjusted here) even
+		 * though the commit number on the master node stays the same
+		 * because the master node has not been re-written.
+		 */
+		if (cmt_no > c->cmt_no)
+			c->cmt_no = cmt_no;
+		if (cmt_no < *last_cmt_no && *last_flagged) {
+			/*
+			 * The last orphan node had a higher commit number and
+			 * was flagged as the last written for that commit
+			 * number. That makes this orphan node, out of date.
+			 */
+			if (!first) {
+				ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d",
+					  cmt_no, sleb->lnum, snod->offs);
+				ubifs_dump_node(c, snod->node);
+				return -EINVAL;
+			}
+			dbg_rcvry("out of date LEB %d", sleb->lnum);
+			*outofdate = 1;
+			return 0;
+		}
+
+		if (first)
+			first = 0;
+
+		n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
+		for (i = 0; i < n; i++) {
+			inum = le64_to_cpu(orph->inos[i]);
+			dbg_rcvry("deleting orphaned inode %lu",
+				  (unsigned long)inum);
+			err = ubifs_tnc_remove_ino(c, inum);
+			if (err)
+				return err;
+			err = insert_dead_orphan(c, inum);
+			if (err)
+				return err;
+		}
+
+		*last_cmt_no = cmt_no;
+		if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) {
+			dbg_rcvry("last orph node for commit %llu at %d:%d",
+				  cmt_no, sleb->lnum, snod->offs);
+			*last_flagged = 1;
+		} else
+			*last_flagged = 0;
+	}
+
+	return 0;
+}
+
+/**
+ * kill_orphans - remove all orphan inodes from the index.
+ * @c: UBIFS file-system description object
+ *
+ * If recovery is required, then orphan inodes recorded during the previous
+ * session (which ended with an unclean unmount) must be deleted from the index.
+ * This is done by updating the TNC, but since the index is not updated until
+ * the next commit, the LEBs where the orphan information is recorded are not
+ * erased until the next commit.
+ */
+static int kill_orphans(struct ubifs_info *c)
+{
+	unsigned long long last_cmt_no = 0;
+	int lnum, err = 0, outofdate = 0, last_flagged = 0;
+
+	c->ohead_lnum = c->orph_first;
+	c->ohead_offs = 0;
+	/* Check no-orphans flag and skip this if no orphans */
+	if (c->no_orphs) {
+		dbg_rcvry("no orphans");
+		return 0;
+	}
+	/*
+	 * Orph nodes always start at c->orph_first and are written to each
+	 * successive LEB in turn. Generally unused LEBs will have been unmapped
+	 * but may contain out of date orphan nodes if the unmap didn't go
+	 * through. In addition, the last orphan node written for each commit is
+	 * marked (top bit of orph->cmt_no is set to 1). It is possible that
+	 * there are orphan nodes from the next commit (i.e. the commit did not
+	 * complete successfully). In that case, no orphans will have been lost
+	 * due to the way that orphans are written, and any orphans added will
+	 * be valid orphans anyway and so can be deleted.
+	 */
+	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+		struct ubifs_scan_leb *sleb;
+
+		dbg_rcvry("LEB %d", lnum);
+		sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
+		if (IS_ERR(sleb)) {
+			if (PTR_ERR(sleb) == -EUCLEAN)
+				sleb = ubifs_recover_leb(c, lnum, 0,
+							 c->sbuf, -1);
+			if (IS_ERR(sleb)) {
+				err = PTR_ERR(sleb);
+				break;
+			}
+		}
+		err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate,
+				      &last_flagged);
+		if (err || outofdate) {
+			ubifs_scan_destroy(sleb);
+			break;
+		}
+		if (sleb->endpt) {
+			c->ohead_lnum = lnum;
+			c->ohead_offs = sleb->endpt;
+		}
+		ubifs_scan_destroy(sleb);
+	}
+	return err;
+}
+
+/**
+ * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them.
+ * @c: UBIFS file-system description object
+ * @unclean: indicates recovery from unclean unmount
+ * @read_only: indicates read only mount
+ *
+ * This function is called when mounting to erase orphans from the previous
+ * session. If UBIFS was not unmounted cleanly, then the inodes recorded as
+ * orphans are deleted.
+ */
+int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
+{
+	int err = 0;
+
+	c->max_orphans = tot_avail_orphs(c);
+
+	if (!read_only) {
+		c->orph_buf = vmalloc(c->leb_size);
+		if (!c->orph_buf)
+			return -ENOMEM;
+	}
+
+	if (unclean)
+		err = kill_orphans(c);
+	else if (!read_only)
+		err = ubifs_clear_orphans(c);
+
+	return err;
+}
+
+/*
+ * Everything below is related to debugging.
+ */
+
+struct check_orphan {
+	struct rb_node rb;
+	ino_t inum;
+};
+
+struct check_info {
+	unsigned long last_ino;
+	unsigned long tot_inos;
+	unsigned long missing;
+	unsigned long long leaf_cnt;
+	struct ubifs_ino_node *node;
+	struct rb_root root;
+};
+
+static int dbg_find_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct ubifs_orphan *o;
+	struct rb_node *p;
+
+	spin_lock(&c->orphan_lock);
+	p = c->orph_tree.rb_node;
+	while (p) {
+		o = rb_entry(p, struct ubifs_orphan, rb);
+		if (inum < o->inum)
+			p = p->rb_left;
+		else if (inum > o->inum)
+			p = p->rb_right;
+		else {
+			spin_unlock(&c->orphan_lock);
+			return 1;
+		}
+	}
+	spin_unlock(&c->orphan_lock);
+	return 0;
+}
+
+static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum)
+{
+	struct check_orphan *orphan, *o;
+	struct rb_node **p, *parent = NULL;
+
+	orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS);
+	if (!orphan)
+		return -ENOMEM;
+	orphan->inum = inum;
+
+	p = &root->rb_node;
+	while (*p) {
+		parent = *p;
+		o = rb_entry(parent, struct check_orphan, rb);
+		if (inum < o->inum)
+			p = &(*p)->rb_left;
+		else if (inum > o->inum)
+			p = &(*p)->rb_right;
+		else {
+			kfree(orphan);
+			return 0;
+		}
+	}
+	rb_link_node(&orphan->rb, parent, p);
+	rb_insert_color(&orphan->rb, root);
+	return 0;
+}
+
+static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
+{
+	struct check_orphan *o;
+	struct rb_node *p;
+
+	p = root->rb_node;
+	while (p) {
+		o = rb_entry(p, struct check_orphan, rb);
+		if (inum < o->inum)
+			p = p->rb_left;
+		else if (inum > o->inum)
+			p = p->rb_right;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+static void dbg_free_check_tree(struct rb_root *root)
+{
+	struct check_orphan *o, *n;
+
+	rbtree_postorder_for_each_entry_safe(o, n, root, rb)
+		kfree(o);
+}
+
+static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			    void *priv)
+{
+	struct check_info *ci = priv;
+	ino_t inum;
+	int err;
+
+	inum = key_inum(c, &zbr->key);
+	if (inum != ci->last_ino) {
+		/* Lowest node type is the inode node, so it comes first */
+		if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
+			ubifs_err(c, "found orphan node ino %lu, type %d",
+				  (unsigned long)inum, key_type(c, &zbr->key));
+		ci->last_ino = inum;
+		ci->tot_inos += 1;
+		err = ubifs_tnc_read_node(c, zbr, ci->node);
+		if (err) {
+			ubifs_err(c, "node read failed, error %d", err);
+			return err;
+		}
+		if (ci->node->nlink == 0)
+			/* Must be recorded as an orphan */
+			if (!dbg_find_check_orphan(&ci->root, inum) &&
+			    !dbg_find_orphan(c, inum)) {
+				ubifs_err(c, "missing orphan, ino %lu",
+					  (unsigned long)inum);
+				ci->missing += 1;
+			}
+	}
+	ci->leaf_cnt += 1;
+	return 0;
+}
+
+static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
+{
+	struct ubifs_scan_node *snod;
+	struct ubifs_orph_node *orph;
+	ino_t inum;
+	int i, n, err;
+
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		cond_resched();
+		if (snod->type != UBIFS_ORPH_NODE)
+			continue;
+		orph = snod->node;
+		n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
+		for (i = 0; i < n; i++) {
+			inum = le64_to_cpu(orph->inos[i]);
+			err = dbg_ins_check_orphan(&ci->root, inum);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
+{
+	int lnum, err = 0;
+	void *buf;
+
+	/* Check no-orphans flag and skip this if no orphans */
+	if (c->no_orphs)
+		return 0;
+
+	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err(c, "cannot allocate memory to check orphans");
+		return 0;
+	}
+
+	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+		struct ubifs_scan_leb *sleb;
+
+		sleb = ubifs_scan(c, lnum, 0, buf, 0);
+		if (IS_ERR(sleb)) {
+			err = PTR_ERR(sleb);
+			break;
+		}
+
+		err = dbg_read_orphans(ci, sleb);
+		ubifs_scan_destroy(sleb);
+		if (err)
+			break;
+	}
+
+	vfree(buf);
+	return err;
+}
+
+static int dbg_check_orphans(struct ubifs_info *c)
+{
+	struct check_info ci;
+	int err;
+
+	if (!dbg_is_chk_orph(c))
+		return 0;
+
+	ci.last_ino = 0;
+	ci.tot_inos = 0;
+	ci.missing  = 0;
+	ci.leaf_cnt = 0;
+	ci.root = RB_ROOT;
+	ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
+	if (!ci.node) {
+		ubifs_err(c, "out of memory");
+		return -ENOMEM;
+	}
+
+	err = dbg_scan_orphans(c, &ci);
+	if (err)
+		goto out;
+
+	err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci);
+	if (err) {
+		ubifs_err(c, "cannot scan TNC, error %d", err);
+		goto out;
+	}
+
+	if (ci.missing) {
+		ubifs_err(c, "%lu missing orphan(s)", ci.missing);
+		err = -EINVAL;
+		goto out;
+	}
+
+	dbg_cmt("last inode number is %lu", ci.last_ino);
+	dbg_cmt("total number of inodes is %lu", ci.tot_inos);
+	dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt);
+
+out:
+	dbg_free_check_tree(&ci.root);
+	kfree(ci.node);
+	return err;
+}
diff --git a/kernel/fs/ubifs/recovery.c b/kernel/fs/ubifs/recovery.c
new file mode 100644
index 000000000..695fc71d5
--- /dev/null
+++ b/kernel/fs/ubifs/recovery.c
@@ -0,0 +1,1547 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements functions needed to recover from unclean un-mounts.
+ * When UBIFS is mounted, it checks a flag on the master node to determine if
+ * an un-mount was completed successfully. If not, the process of mounting
+ * incorporates additional checking and fixing of on-flash data structures.
+ * UBIFS always cleans away all remnants of an unclean un-mount, so that
+ * errors do not accumulate. However UBIFS defers recovery if it is mounted
+ * read-only, and the flash is not modified in that case.
+ *
+ * The general UBIFS approach to the recovery is that it recovers from
+ * corruptions which could be caused by power cuts, but it refuses to recover
+ * from corruption caused by other reasons. And UBIFS tries to distinguish
+ * between these 2 reasons of corruptions and silently recover in the former
+ * case and loudly complain in the latter case.
+ *
+ * UBIFS writes only to erased LEBs, so it writes only to the flash space
+ * containing only 0xFFs. UBIFS also always writes strictly from the beginning
+ * of the LEB to the end. And UBIFS assumes that the underlying flash media
+ * writes in @c->max_write_size bytes at a time.
+ *
+ * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
+ * I/O unit corresponding to offset X to contain corrupted data, all the
+ * following min. I/O units have to contain empty space (all 0xFFs). If this is
+ * not true, the corruption cannot be the result of a power cut, and UBIFS
+ * refuses to mount.
+ */
+
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include "ubifs.h"
+
+/**
+ * is_empty - determine whether a buffer is empty (contains all 0xff).
+ * @buf: buffer to clean
+ * @len: length of buffer
+ *
+ * This function returns %1 if the buffer is empty (contains all 0xff) otherwise
+ * %0 is returned.
+ */
+static int is_empty(void *buf, int len)
+{
+	uint8_t *p = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		if (*p++ != 0xff)
+			return 0;
+	return 1;
+}
+
+/**
+ * first_non_ff - find offset of the first non-0xff byte.
+ * @buf: buffer to search in
+ * @len: length of buffer
+ *
+ * This function returns offset of the first non-0xff byte in @buf or %-1 if
+ * the buffer contains only 0xff bytes.
+ */
+static int first_non_ff(void *buf, int len)
+{
+	uint8_t *p = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		if (*p++ != 0xff)
+			return i;
+	return -1;
+}
+
+/**
+ * get_master_node - get the last valid master node allowing for corruption.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @pbuf: buffer containing the LEB read, is returned here
+ * @mst: master node, if found, is returned here
+ * @cor: corruption, if found, is returned here
+ *
+ * This function allocates a buffer, reads the LEB into it, and finds and
+ * returns the last valid master node allowing for one area of corruption.
+ * The corrupt area, if there is one, must be consistent with the assumption
+ * that it is the result of an unclean unmount while the master node was being
+ * written. Under those circumstances, it is valid to use the previously written
+ * master node.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf,
+			   struct ubifs_mst_node **mst, void **cor)
+{
+	const int sz = c->mst_node_alsz;
+	int err, offs, len;
+	void *sbuf, *buf;
+
+	sbuf = vmalloc(c->leb_size);
+	if (!sbuf)
+		return -ENOMEM;
+
+	err = ubifs_leb_read(c, lnum, sbuf, 0, c->leb_size, 0);
+	if (err && err != -EBADMSG)
+		goto out_free;
+
+	/* Find the first position that is definitely not a node */
+	offs = 0;
+	buf = sbuf;
+	len = c->leb_size;
+	while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) {
+		struct ubifs_ch *ch = buf;
+
+		if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
+			break;
+		offs += sz;
+		buf  += sz;
+		len  -= sz;
+	}
+	/* See if there was a valid master node before that */
+	if (offs) {
+		int ret;
+
+		offs -= sz;
+		buf  -= sz;
+		len  += sz;
+		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+		if (ret != SCANNED_A_NODE && offs) {
+			/* Could have been corruption so check one place back */
+			offs -= sz;
+			buf  -= sz;
+			len  += sz;
+			ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+			if (ret != SCANNED_A_NODE)
+				/*
+				 * We accept only one area of corruption because
+				 * we are assuming that it was caused while
+				 * trying to write a master node.
+				 */
+				goto out_err;
+		}
+		if (ret == SCANNED_A_NODE) {
+			struct ubifs_ch *ch = buf;
+
+			if (ch->node_type != UBIFS_MST_NODE)
+				goto out_err;
+			dbg_rcvry("found a master node at %d:%d", lnum, offs);
+			*mst = buf;
+			offs += sz;
+			buf  += sz;
+			len  -= sz;
+		}
+	}
+	/* Check for corruption */
+	if (offs < c->leb_size) {
+		if (!is_empty(buf, min_t(int, len, sz))) {
+			*cor = buf;
+			dbg_rcvry("found corruption at %d:%d", lnum, offs);
+		}
+		offs += sz;
+		buf  += sz;
+		len  -= sz;
+	}
+	/* Check remaining empty space */
+	if (offs < c->leb_size)
+		if (!is_empty(buf, len))
+			goto out_err;
+	*pbuf = sbuf;
+	return 0;
+
+out_err:
+	err = -EINVAL;
+out_free:
+	vfree(sbuf);
+	*mst = NULL;
+	*cor = NULL;
+	return err;
+}
+
+/**
+ * write_rcvrd_mst_node - write recovered master node.
+ * @c: UBIFS file-system description object
+ * @mst: master node
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int write_rcvrd_mst_node(struct ubifs_info *c,
+				struct ubifs_mst_node *mst)
+{
+	int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz;
+	__le32 save_flags;
+
+	dbg_rcvry("recovery");
+
+	save_flags = mst->flags;
+	mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
+
+	ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
+	err = ubifs_leb_change(c, lnum, mst, sz);
+	if (err)
+		goto out;
+	err = ubifs_leb_change(c, lnum + 1, mst, sz);
+	if (err)
+		goto out;
+out:
+	mst->flags = save_flags;
+	return err;
+}
+
+/**
+ * ubifs_recover_master_node - recover the master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function recovers the master node from corruption that may occur due to
+ * an unclean unmount.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_master_node(struct ubifs_info *c)
+{
+	void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL;
+	struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst;
+	const int sz = c->mst_node_alsz;
+	int err, offs1, offs2;
+
+	dbg_rcvry("recovery");
+
+	err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1);
+	if (err)
+		goto out_free;
+
+	err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2);
+	if (err)
+		goto out_free;
+
+	if (mst1) {
+		offs1 = (void *)mst1 - buf1;
+		if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) &&
+		    (offs1 == 0 && !cor1)) {
+			/*
+			 * mst1 was written by recovery at offset 0 with no
+			 * corruption.
+			 */
+			dbg_rcvry("recovery recovery");
+			mst = mst1;
+		} else if (mst2) {
+			offs2 = (void *)mst2 - buf2;
+			if (offs1 == offs2) {
+				/* Same offset, so must be the same */
+				if (memcmp((void *)mst1 + UBIFS_CH_SZ,
+					   (void *)mst2 + UBIFS_CH_SZ,
+					   UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+					goto out_err;
+				mst = mst1;
+			} else if (offs2 + sz == offs1) {
+				/* 1st LEB was written, 2nd was not */
+				if (cor1)
+					goto out_err;
+				mst = mst1;
+			} else if (offs1 == 0 &&
+				   c->leb_size - offs2 - sz < sz) {
+				/* 1st LEB was unmapped and written, 2nd not */
+				if (cor1)
+					goto out_err;
+				mst = mst1;
+			} else
+				goto out_err;
+		} else {
+			/*
+			 * 2nd LEB was unmapped and about to be written, so
+			 * there must be only one master node in the first LEB
+			 * and no corruption.
+			 */
+			if (offs1 != 0 || cor1)
+				goto out_err;
+			mst = mst1;
+		}
+	} else {
+		if (!mst2)
+			goto out_err;
+		/*
+		 * 1st LEB was unmapped and about to be written, so there must
+		 * be no room left in 2nd LEB.
+		 */
+		offs2 = (void *)mst2 - buf2;
+		if (offs2 + sz + sz <= c->leb_size)
+			goto out_err;
+		mst = mst2;
+	}
+
+	ubifs_msg(c, "recovered master node from LEB %d",
+		  (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
+
+	memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
+
+	if (c->ro_mount) {
+		/* Read-only mode. Keep a copy for switching to rw mode */
+		c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
+		if (!c->rcvrd_mst_node) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ);
+
+		/*
+		 * We had to recover the master node, which means there was an
+		 * unclean reboot. However, it is possible that the master node
+		 * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set.
+		 * E.g., consider the following chain of events:
+		 *
+		 * 1. UBIFS was cleanly unmounted, so the master node is clean
+		 * 2. UBIFS is being mounted R/W and starts changing the master
+		 *    node in the first (%UBIFS_MST_LNUM). A power cut happens,
+		 *    so this LEB ends up with some amount of garbage at the
+		 *    end.
+		 * 3. UBIFS is being mounted R/O. We reach this place and
+		 *    recover the master node from the second LEB
+		 *    (%UBIFS_MST_LNUM + 1). But we cannot update the media
+		 *    because we are being mounted R/O. We have to defer the
+		 *    operation.
+		 * 4. However, this master node (@c->mst_node) is marked as
+		 *    clean (since the step 1). And if we just return, the
+		 *    mount code will be confused and won't recover the master
+		 *    node when it is re-mounter R/W later.
+		 *
+		 *    Thus, to force the recovery by marking the master node as
+		 *    dirty.
+		 */
+		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+	} else {
+		/* Write the recovered master node */
+		c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1;
+		err = write_rcvrd_mst_node(c, c->mst_node);
+		if (err)
+			goto out_free;
+	}
+
+	vfree(buf2);
+	vfree(buf1);
+
+	return 0;
+
+out_err:
+	err = -EINVAL;
+out_free:
+	ubifs_err(c, "failed to recover master node");
+	if (mst1) {
+		ubifs_err(c, "dumping first master node");
+		ubifs_dump_node(c, mst1);
+	}
+	if (mst2) {
+		ubifs_err(c, "dumping second master node");
+		ubifs_dump_node(c, mst2);
+	}
+	vfree(buf2);
+	vfree(buf1);
+	return err;
+}
+
+/**
+ * ubifs_write_rcvrd_mst_node - write the recovered master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes the master node that was recovered during mounting in
+ * read-only mode and must now be written because we are remounting rw.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
+{
+	int err;
+
+	if (!c->rcvrd_mst_node)
+		return 0;
+	c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+	err = write_rcvrd_mst_node(c, c->rcvrd_mst_node);
+	if (err)
+		return err;
+	kfree(c->rcvrd_mst_node);
+	c->rcvrd_mst_node = NULL;
+	return 0;
+}
+
+/**
+ * is_last_write - determine if an offset was in the last write to a LEB.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to check
+ * @offs: offset to check
+ *
+ * This function returns %1 if @offs was in the last write to the LEB whose data
+ * is in @buf, otherwise %0 is returned. The determination is made by checking
+ * for subsequent empty space starting from the next @c->max_write_size
+ * boundary.
+ */
+static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
+{
+	int empty_offs, check_len;
+	uint8_t *p;
+
+	/*
+	 * Round up to the next @c->max_write_size boundary i.e. @offs is in
+	 * the last wbuf written. After that should be empty space.
+	 */
+	empty_offs = ALIGN(offs + 1, c->max_write_size);
+	check_len = c->leb_size - empty_offs;
+	p = buf + empty_offs - offs;
+	return is_empty(p, check_len);
+}
+
+/**
+ * clean_buf - clean the data from an LEB sitting in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to clean
+ * @lnum: LEB number to clean
+ * @offs: offset from which to clean
+ * @len: length of buffer
+ *
+ * This function pads up to the next min_io_size boundary (if there is one) and
+ * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
+ * @c->min_io_size boundary.
+ */
+static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
+		      int *offs, int *len)
+{
+	int empty_offs, pad_len;
+
+	lnum = lnum;
+	dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
+
+	ubifs_assert(!(*offs & 7));
+	empty_offs = ALIGN(*offs, c->min_io_size);
+	pad_len = empty_offs - *offs;
+	ubifs_pad(c, *buf, pad_len);
+	*offs += pad_len;
+	*buf += pad_len;
+	*len -= pad_len;
+	memset(*buf, 0xff, c->leb_size - empty_offs);
+}
+
+/**
+ * no_more_nodes - determine if there are no more nodes in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to check
+ * @len: length of buffer
+ * @lnum: LEB number of the LEB from which @buf was read
+ * @offs: offset from which @buf was read
+ *
+ * This function ensures that the corrupted node at @offs is the last thing
+ * written to a LEB. This function returns %1 if more data is not found and
+ * %0 if more data is found.
+ */
+static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
+			int lnum, int offs)
+{
+	struct ubifs_ch *ch = buf;
+	int skip, dlen = le32_to_cpu(ch->len);
+
+	/* Check for empty space after the corrupt node's common header */
+	skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs;
+	if (is_empty(buf + skip, len - skip))
+		return 1;
+	/*
+	 * The area after the common header size is not empty, so the common
+	 * header must be intact. Check it.
+	 */
+	if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
+		dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
+		return 0;
+	}
+	/* Now we know the corrupt node's length we can skip over it */
+	skip = ALIGN(offs + dlen, c->max_write_size) - offs;
+	/* After which there should be empty space */
+	if (is_empty(buf + skip, len - skip))
+		return 1;
+	dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
+	return 0;
+}
+
+/**
+ * fix_unclean_leb - fix an unclean LEB.
+ * @c: UBIFS file-system description object
+ * @sleb: scanned LEB information
+ * @start: offset where scan started
+ */
+static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+			   int start)
+{
+	int lnum = sleb->lnum, endpt = start;
+
+	/* Get the end offset of the last node we are keeping */
+	if (!list_empty(&sleb->nodes)) {
+		struct ubifs_scan_node *snod;
+
+		snod = list_entry(sleb->nodes.prev,
+				  struct ubifs_scan_node, list);
+		endpt = snod->offs + snod->len;
+	}
+
+	if (c->ro_mount && !c->remounting_rw) {
+		/* Add to recovery list */
+		struct ubifs_unclean_leb *ucleb;
+
+		dbg_rcvry("need to fix LEB %d start %d endpt %d",
+			  lnum, start, sleb->endpt);
+		ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS);
+		if (!ucleb)
+			return -ENOMEM;
+		ucleb->lnum = lnum;
+		ucleb->endpt = endpt;
+		list_add_tail(&ucleb->list, &c->unclean_leb_list);
+	} else {
+		/* Write the fixed LEB back to flash */
+		int err;
+
+		dbg_rcvry("fixing LEB %d start %d endpt %d",
+			  lnum, start, sleb->endpt);
+		if (endpt == 0) {
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		} else {
+			int len = ALIGN(endpt, c->min_io_size);
+
+			if (start) {
+				err = ubifs_leb_read(c, lnum, sleb->buf, 0,
+						     start, 1);
+				if (err)
+					return err;
+			}
+			/* Pad to min_io_size */
+			if (len > endpt) {
+				int pad_len = len - ALIGN(endpt, 8);
+
+				if (pad_len > 0) {
+					void *buf = sleb->buf + len - pad_len;
+
+					ubifs_pad(c, buf, pad_len);
+				}
+			}
+			err = ubifs_leb_change(c, lnum, sleb->buf, len);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+/**
+ * drop_last_group - drop the last group of nodes.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * group of nodes of the scanned LEB.
+ */
+static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
+{
+	while (!list_empty(&sleb->nodes)) {
+		struct ubifs_scan_node *snod;
+		struct ubifs_ch *ch;
+
+		snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+				  list);
+		ch = snod->node;
+		if (ch->group_type != UBIFS_IN_NODE_GROUP)
+			break;
+
+		dbg_rcvry("dropping grouped node at %d:%d",
+			  sleb->lnum, snod->offs);
+		*offs = snod->offs;
+		list_del(&snod->list);
+		kfree(snod);
+		sleb->nodes_cnt -= 1;
+	}
+}
+
+/**
+ * drop_last_node - drop the last node.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB.
+ */
+static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
+{
+	struct ubifs_scan_node *snod;
+
+	if (!list_empty(&sleb->nodes)) {
+		snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+				  list);
+
+		dbg_rcvry("dropping last node at %d:%d",
+			  sleb->lnum, snod->offs);
+		*offs = snod->offs;
+		list_del(&snod->list);
+		kfree(snod);
+		sleb->nodes_cnt -= 1;
+	}
+}
+
+/**
+ * ubifs_recover_leb - scan and recover a LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @offs: offset
+ * @sbuf: LEB-sized buffer to use
+ * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not
+ *         belong to any journal head)
+ *
+ * This function does a scan of a LEB, but caters for errors that might have
+ * been caused by the unclean unmount from which we are attempting to recover.
+ * Returns the scanned information on success and a negative error code on
+ * failure.
+ */
+struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
+					 int offs, void *sbuf, int jhead)
+{
+	int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
+	int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped;
+	struct ubifs_scan_leb *sleb;
+	void *buf = sbuf + offs;
+
+	dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped);
+
+	sleb = ubifs_start_scan(c, lnum, offs, sbuf);
+	if (IS_ERR(sleb))
+		return sleb;
+
+	ubifs_assert(len >= 8);
+	while (len >= 8) {
+		dbg_scan("look at LEB %d:%d (%d bytes left)",
+			 lnum, offs, len);
+
+		cond_resched();
+
+		/*
+		 * Scan quietly until there is an error from which we cannot
+		 * recover
+		 */
+		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+		if (ret == SCANNED_A_NODE) {
+			/* A valid node, and not a padding node */
+			struct ubifs_ch *ch = buf;
+			int node_len;
+
+			err = ubifs_add_snod(c, sleb, buf, offs);
+			if (err)
+				goto error;
+			node_len = ALIGN(le32_to_cpu(ch->len), 8);
+			offs += node_len;
+			buf += node_len;
+			len -= node_len;
+		} else if (ret > 0) {
+			/* Padding bytes or a valid padding node */
+			offs += ret;
+			buf += ret;
+			len -= ret;
+		} else if (ret == SCANNED_EMPTY_SPACE ||
+			   ret == SCANNED_GARBAGE     ||
+			   ret == SCANNED_A_BAD_PAD_NODE ||
+			   ret == SCANNED_A_CORRUPT_NODE) {
+			dbg_rcvry("found corruption (%d) at %d:%d",
+				  ret, lnum, offs);
+			break;
+		} else {
+			ubifs_err(c, "unexpected return value %d", ret);
+			err = -EINVAL;
+			goto error;
+		}
+	}
+
+	if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
+		if (!is_last_write(c, buf, offs))
+			goto corrupted_rescan;
+	} else if (ret == SCANNED_A_CORRUPT_NODE) {
+		if (!no_more_nodes(c, buf, len, lnum, offs))
+			goto corrupted_rescan;
+	} else if (!is_empty(buf, len)) {
+		if (!is_last_write(c, buf, offs)) {
+			int corruption = first_non_ff(buf, len);
+
+			/*
+			 * See header comment for this file for more
+			 * explanations about the reasons we have this check.
+			 */
+			ubifs_err(c, "corrupt empty space LEB %d:%d, corruption starts at %d",
+				  lnum, offs, corruption);
+			/* Make sure we dump interesting non-0xFF data */
+			offs += corruption;
+			buf += corruption;
+			goto corrupted;
+		}
+	}
+
+	min_io_unit = round_down(offs, c->min_io_size);
+	if (grouped)
+		/*
+		 * If nodes are grouped, always drop the incomplete group at
+		 * the end.
+		 */
+		drop_last_group(sleb, &offs);
+
+	if (jhead == GCHD) {
+		/*
+		 * If this LEB belongs to the GC head then while we are in the
+		 * middle of the same min. I/O unit keep dropping nodes. So
+		 * basically, what we want is to make sure that the last min.
+		 * I/O unit where we saw the corruption is dropped completely
+		 * with all the uncorrupted nodes which may possibly sit there.
+		 *
+		 * In other words, let's name the min. I/O unit where the
+		 * corruption starts B, and the previous min. I/O unit A. The
+		 * below code tries to deal with a situation when half of B
+		 * contains valid nodes or the end of a valid node, and the
+		 * second half of B contains corrupted data or garbage. This
+		 * means that UBIFS had been writing to B just before the power
+		 * cut happened. I do not know how realistic is this scenario
+		 * that half of the min. I/O unit had been written successfully
+		 * and the other half not, but this is possible in our 'failure
+		 * mode emulation' infrastructure at least.
+		 *
+		 * So what is the problem, why we need to drop those nodes? Why
+		 * can't we just clean-up the second half of B by putting a
+		 * padding node there? We can, and this works fine with one
+		 * exception which was reproduced with power cut emulation
+		 * testing and happens extremely rarely.
+		 *
+		 * Imagine the file-system is full, we run GC which starts
+		 * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
+		 * the current GC head LEB). The @c->gc_lnum is -1, which means
+		 * that GC will retain LEB X and will try to continue. Imagine
+		 * that LEB X is currently the dirtiest LEB, and the amount of
+		 * used space in LEB Y is exactly the same as amount of free
+		 * space in LEB X.
+		 *
+		 * And a power cut happens when nodes are moved from LEB X to
+		 * LEB Y. We are here trying to recover LEB Y which is the GC
+		 * head LEB. We find the min. I/O unit B as described above.
+		 * Then we clean-up LEB Y by padding min. I/O unit. And later
+		 * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
+		 * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
+		 * does not match because the amount of valid nodes there does
+		 * not fit the free space in LEB Y any more! And this is
+		 * because of the padding node which we added to LEB Y. The
+		 * user-visible effect of this which I once observed and
+		 * analysed is that we cannot mount the file-system with
+		 * -ENOSPC error.
+		 *
+		 * So obviously, to make sure that situation does not happen we
+		 * should free min. I/O unit B in LEB Y completely and the last
+		 * used min. I/O unit in LEB Y should be A. This is basically
+		 * what the below code tries to do.
+		 */
+		while (offs > min_io_unit)
+			drop_last_node(sleb, &offs);
+	}
+
+	buf = sbuf + offs;
+	len = c->leb_size - offs;
+
+	clean_buf(c, &buf, lnum, &offs, &len);
+	ubifs_end_scan(c, sleb, lnum, offs);
+
+	err = fix_unclean_leb(c, sleb, start);
+	if (err)
+		goto error;
+
+	return sleb;
+
+corrupted_rescan:
+	/* Re-scan the corrupted data with verbose messages */
+	ubifs_err(c, "corruption %d", ret);
+	ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+corrupted:
+	ubifs_scanned_corruption(c, lnum, offs, buf);
+	err = -EUCLEAN;
+error:
+	ubifs_err(c, "LEB %d scanning failed", lnum);
+	ubifs_scan_destroy(sleb);
+	return ERR_PTR(err);
+}
+
+/**
+ * get_cs_sqnum - get commit start sequence number.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of commit start node
+ * @offs: offset of commit start node
+ * @cs_sqnum: commit start sequence number is returned here
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
+			unsigned long long *cs_sqnum)
+{
+	struct ubifs_cs_node *cs_node = NULL;
+	int err, ret;
+
+	dbg_rcvry("at %d:%d", lnum, offs);
+	cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL);
+	if (!cs_node)
+		return -ENOMEM;
+	if (c->leb_size - offs < UBIFS_CS_NODE_SZ)
+		goto out_err;
+	err = ubifs_leb_read(c, lnum, (void *)cs_node, offs,
+			     UBIFS_CS_NODE_SZ, 0);
+	if (err && err != -EBADMSG)
+		goto out_free;
+	ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
+	if (ret != SCANNED_A_NODE) {
+		ubifs_err(c, "Not a valid node");
+		goto out_err;
+	}
+	if (cs_node->ch.node_type != UBIFS_CS_NODE) {
+		ubifs_err(c, "Node a CS node, type is %d", cs_node->ch.node_type);
+		goto out_err;
+	}
+	if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
+		ubifs_err(c, "CS node cmt_no %llu != current cmt_no %llu",
+			  (unsigned long long)le64_to_cpu(cs_node->cmt_no),
+			  c->cmt_no);
+		goto out_err;
+	}
+	*cs_sqnum = le64_to_cpu(cs_node->ch.sqnum);
+	dbg_rcvry("commit start sqnum %llu", *cs_sqnum);
+	kfree(cs_node);
+	return 0;
+
+out_err:
+	err = -EINVAL;
+out_free:
+	ubifs_err(c, "failed to get CS sqnum");
+	kfree(cs_node);
+	return err;
+}
+
+/**
+ * ubifs_recover_log_leb - scan and recover a log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @offs: offset
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function does a scan of a LEB, but caters for errors that might have
+ * been caused by unclean reboots from which we are attempting to recover
+ * (assume that only the last log LEB can be corrupted by an unclean reboot).
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
+					     int offs, void *sbuf)
+{
+	struct ubifs_scan_leb *sleb;
+	int next_lnum;
+
+	dbg_rcvry("LEB %d", lnum);
+	next_lnum = lnum + 1;
+	if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+		next_lnum = UBIFS_LOG_LNUM;
+	if (next_lnum != c->ltail_lnum) {
+		/*
+		 * We can only recover at the end of the log, so check that the
+		 * next log LEB is empty or out of date.
+		 */
+		sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0);
+		if (IS_ERR(sleb))
+			return sleb;
+		if (sleb->nodes_cnt) {
+			struct ubifs_scan_node *snod;
+			unsigned long long cs_sqnum = c->cs_sqnum;
+
+			snod = list_entry(sleb->nodes.next,
+					  struct ubifs_scan_node, list);
+			if (cs_sqnum == 0) {
+				int err;
+
+				err = get_cs_sqnum(c, lnum, offs, &cs_sqnum);
+				if (err) {
+					ubifs_scan_destroy(sleb);
+					return ERR_PTR(err);
+				}
+			}
+			if (snod->sqnum > cs_sqnum) {
+				ubifs_err(c, "unrecoverable log corruption in LEB %d",
+					  lnum);
+				ubifs_scan_destroy(sleb);
+				return ERR_PTR(-EUCLEAN);
+			}
+		}
+		ubifs_scan_destroy(sleb);
+	}
+	return ubifs_recover_leb(c, lnum, offs, sbuf, -1);
+}
+
+/**
+ * recover_head - recover a head.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of head to recover
+ * @offs: offset of head to recover
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function ensures that there is no data on the flash at a head location.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf)
+{
+	int len = c->max_write_size, err;
+
+	if (offs + len > c->leb_size)
+		len = c->leb_size - offs;
+
+	if (!len)
+		return 0;
+
+	/* Read at the head location and check it is empty flash */
+	err = ubifs_leb_read(c, lnum, sbuf, offs, len, 1);
+	if (err || !is_empty(sbuf, len)) {
+		dbg_rcvry("cleaning head at %d:%d", lnum, offs);
+		if (offs == 0)
+			return ubifs_leb_unmap(c, lnum);
+		err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1);
+		if (err)
+			return err;
+		return ubifs_leb_change(c, lnum, sbuf, offs);
+	}
+
+	return 0;
+}
+
+/**
+ * ubifs_recover_inl_heads - recover index and LPT heads.
+ * @c: UBIFS file-system description object
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function ensures that there is no data on the flash at the index and
+ * LPT head locations.
+ *
+ * This deals with the recovery of a half-completed journal commit. UBIFS is
+ * careful never to overwrite the last version of the index or the LPT. Because
+ * the index and LPT are wandering trees, data from a half-completed commit will
+ * not be referenced anywhere in UBIFS. The data will be either in LEBs that are
+ * assumed to be empty and will be unmapped anyway before use, or in the index
+ * and LPT heads.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
+{
+	int err;
+
+	ubifs_assert(!c->ro_mount || c->remounting_rw);
+
+	dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
+	err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
+	if (err)
+		return err;
+
+	dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs);
+
+	return recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf);
+}
+
+/**
+ * clean_an_unclean_leb - read and write a LEB to remove corruption.
+ * @c: UBIFS file-system description object
+ * @ucleb: unclean LEB information
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function reads a LEB up to a point pre-determined by the mount recovery,
+ * checks the nodes, and writes the result back to the flash, thereby cleaning
+ * off any following corruption, or non-fatal ECC errors.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int clean_an_unclean_leb(struct ubifs_info *c,
+				struct ubifs_unclean_leb *ucleb, void *sbuf)
+{
+	int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1;
+	void *buf = sbuf;
+
+	dbg_rcvry("LEB %d len %d", lnum, len);
+
+	if (len == 0) {
+		/* Nothing to read, just unmap it */
+		return ubifs_leb_unmap(c, lnum);
+	}
+
+	err = ubifs_leb_read(c, lnum, buf, offs, len, 0);
+	if (err && err != -EBADMSG)
+		return err;
+
+	while (len >= 8) {
+		int ret;
+
+		cond_resched();
+
+		/* Scan quietly until there is an error */
+		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
+
+		if (ret == SCANNED_A_NODE) {
+			/* A valid node, and not a padding node */
+			struct ubifs_ch *ch = buf;
+			int node_len;
+
+			node_len = ALIGN(le32_to_cpu(ch->len), 8);
+			offs += node_len;
+			buf += node_len;
+			len -= node_len;
+			continue;
+		}
+
+		if (ret > 0) {
+			/* Padding bytes or a valid padding node */
+			offs += ret;
+			buf += ret;
+			len -= ret;
+			continue;
+		}
+
+		if (ret == SCANNED_EMPTY_SPACE) {
+			ubifs_err(c, "unexpected empty space at %d:%d",
+				  lnum, offs);
+			return -EUCLEAN;
+		}
+
+		if (quiet) {
+			/* Redo the last scan but noisily */
+			quiet = 0;
+			continue;
+		}
+
+		ubifs_scanned_corruption(c, lnum, offs, buf);
+		return -EUCLEAN;
+	}
+
+	/* Pad to min_io_size */
+	len = ALIGN(ucleb->endpt, c->min_io_size);
+	if (len > ucleb->endpt) {
+		int pad_len = len - ALIGN(ucleb->endpt, 8);
+
+		if (pad_len > 0) {
+			buf = c->sbuf + len - pad_len;
+			ubifs_pad(c, buf, pad_len);
+		}
+	}
+
+	/* Write back the LEB atomically */
+	err = ubifs_leb_change(c, lnum, sbuf, len);
+	if (err)
+		return err;
+
+	dbg_rcvry("cleaned LEB %d", lnum);
+
+	return 0;
+}
+
+/**
+ * ubifs_clean_lebs - clean LEBs recovered during read-only mount.
+ * @c: UBIFS file-system description object
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function cleans a LEB identified during recovery that needs to be
+ * written but was not because UBIFS was mounted read-only. This happens when
+ * remounting to read-write mode.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf)
+{
+	dbg_rcvry("recovery");
+	while (!list_empty(&c->unclean_leb_list)) {
+		struct ubifs_unclean_leb *ucleb;
+		int err;
+
+		ucleb = list_entry(c->unclean_leb_list.next,
+				   struct ubifs_unclean_leb, list);
+		err = clean_an_unclean_leb(c, ucleb, sbuf);
+		if (err)
+			return err;
+		list_del(&ucleb->list);
+		kfree(ucleb);
+	}
+	return 0;
+}
+
+/**
+ * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
+ * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+static int grab_empty_leb(struct ubifs_info *c)
+{
+	int lnum, err;
+
+	/*
+	 * Note, it is very important to first search for an empty LEB and then
+	 * run the commit, not vice-versa. The reason is that there might be
+	 * only one empty LEB at the moment, the one which has been the
+	 * @c->gc_lnum just before the power cut happened. During the regular
+	 * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
+	 * one but GC can grab it. But at this moment this single empty LEB is
+	 * not marked as taken, so if we run commit - what happens? Right, the
+	 * commit will grab it and write the index there. Remember that the
+	 * index always expands as long as there is free space, and it only
+	 * starts consolidating when we run out of space.
+	 *
+	 * IOW, if we run commit now, we might not be able to find a free LEB
+	 * after this.
+	 */
+	lnum = ubifs_find_free_leb_for_idx(c);
+	if (lnum < 0) {
+		ubifs_err(c, "could not find an empty LEB");
+		ubifs_dump_lprops(c);
+		ubifs_dump_budg(c, &c->bi);
+		return lnum;
+	}
+
+	/* Reset the index flag */
+	err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+				  LPROPS_INDEX, 0);
+	if (err)
+		return err;
+
+	c->gc_lnum = lnum;
+	dbg_rcvry("found empty LEB %d, run commit", lnum);
+
+	return ubifs_run_commit(c);
+}
+
+/**
+ * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
+ * @c: UBIFS file-system description object
+ *
+ * Out-of-place garbage collection requires always one empty LEB with which to
+ * start garbage collection. The LEB number is recorded in c->gc_lnum and is
+ * written to the master node on unmounting. In the case of an unclean unmount
+ * the value of gc_lnum recorded in the master node is out of date and cannot
+ * be used. Instead, recovery must allocate an empty LEB for this purpose.
+ * However, there may not be enough empty space, in which case it must be
+ * possible to GC the dirtiest LEB into the GC head LEB.
+ *
+ * This function also runs the commit which causes the TNC updates from
+ * size-recovery and orphans to be written to the flash. That is important to
+ * ensure correct replay order for subsequent mounts.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_rcvry_gc_commit(struct ubifs_info *c)
+{
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+	struct ubifs_lprops lp;
+	int err;
+
+	dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
+
+	c->gc_lnum = -1;
+	if (wbuf->lnum == -1 || wbuf->offs == c->leb_size)
+		return grab_empty_leb(c);
+
+	err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
+	if (err) {
+		if (err != -ENOSPC)
+			return err;
+
+		dbg_rcvry("could not find a dirty LEB");
+		return grab_empty_leb(c);
+	}
+
+	ubifs_assert(!(lp.flags & LPROPS_INDEX));
+	ubifs_assert(lp.free + lp.dirty >= wbuf->offs);
+
+	/*
+	 * We run the commit before garbage collection otherwise subsequent
+	 * mounts will see the GC and orphan deletion in a different order.
+	 */
+	dbg_rcvry("committing");
+	err = ubifs_run_commit(c);
+	if (err)
+		return err;
+
+	dbg_rcvry("GC'ing LEB %d", lp.lnum);
+	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+	err = ubifs_garbage_collect_leb(c, &lp);
+	if (err >= 0) {
+		int err2 = ubifs_wbuf_sync_nolock(wbuf);
+
+		if (err2)
+			err = err2;
+	}
+	mutex_unlock(&wbuf->io_mutex);
+	if (err < 0) {
+		ubifs_err(c, "GC failed, error %d", err);
+		if (err == -EAGAIN)
+			err = -EINVAL;
+		return err;
+	}
+
+	ubifs_assert(err == LEB_RETAINED);
+	if (err != LEB_RETAINED)
+		return -EINVAL;
+
+	err = ubifs_leb_unmap(c, c->gc_lnum);
+	if (err)
+		return err;
+
+	dbg_rcvry("allocated LEB %d for GC", lp.lnum);
+	return 0;
+}
+
+/**
+ * struct size_entry - inode size information for recovery.
+ * @rb: link in the RB-tree of sizes
+ * @inum: inode number
+ * @i_size: size on inode
+ * @d_size: maximum size based on data nodes
+ * @exists: indicates whether the inode exists
+ * @inode: inode if pinned in memory awaiting rw mode to fix it
+ */
+struct size_entry {
+	struct rb_node rb;
+	ino_t inum;
+	loff_t i_size;
+	loff_t d_size;
+	int exists;
+	struct inode *inode;
+};
+
+/**
+ * add_ino - add an entry to the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ * @i_size: size on inode
+ * @d_size: maximum size based on data nodes
+ * @exists: indicates whether the inode exists
+ */
+static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size,
+		   loff_t d_size, int exists)
+{
+	struct rb_node **p = &c->size_tree.rb_node, *parent = NULL;
+	struct size_entry *e;
+
+	while (*p) {
+		parent = *p;
+		e = rb_entry(parent, struct size_entry, rb);
+		if (inum < e->inum)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	e = kzalloc(sizeof(struct size_entry), GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+
+	e->inum = inum;
+	e->i_size = i_size;
+	e->d_size = d_size;
+	e->exists = exists;
+
+	rb_link_node(&e->rb, parent, p);
+	rb_insert_color(&e->rb, &c->size_tree);
+
+	return 0;
+}
+
+/**
+ * find_ino - find an entry on the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ */
+static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum)
+{
+	struct rb_node *p = c->size_tree.rb_node;
+	struct size_entry *e;
+
+	while (p) {
+		e = rb_entry(p, struct size_entry, rb);
+		if (inum < e->inum)
+			p = p->rb_left;
+		else if (inum > e->inum)
+			p = p->rb_right;
+		else
+			return e;
+	}
+	return NULL;
+}
+
+/**
+ * remove_ino - remove an entry from the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ */
+static void remove_ino(struct ubifs_info *c, ino_t inum)
+{
+	struct size_entry *e = find_ino(c, inum);
+
+	if (!e)
+		return;
+	rb_erase(&e->rb, &c->size_tree);
+	kfree(e);
+}
+
+/**
+ * ubifs_destroy_size_tree - free resources related to the size tree.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_destroy_size_tree(struct ubifs_info *c)
+{
+	struct size_entry *e, *n;
+
+	rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
+		if (e->inode)
+			iput(e->inode);
+		kfree(e);
+	}
+
+	c->size_tree = RB_ROOT;
+}
+
+/**
+ * ubifs_recover_size_accum - accumulate inode sizes for recovery.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @deletion: node is for a deletion
+ * @new_size: inode size
+ *
+ * This function has two purposes:
+ *     1) to ensure there are no data nodes that fall outside the inode size
+ *     2) to ensure there are no data nodes for inodes that do not exist
+ * To accomplish those purposes, a rb-tree is constructed containing an entry
+ * for each inode number in the journal that has not been deleted, and recording
+ * the size from the inode node, the maximum size of any data node (also altered
+ * by truncations) and a flag indicating a inode number for which no inode node
+ * was present in the journal.
+ *
+ * Note that there is still the possibility that there are data nodes that have
+ * been committed that are beyond the inode size, however the only way to find
+ * them would be to scan the entire index. Alternatively, some provision could
+ * be made to record the size of inodes at the start of commit, which would seem
+ * very cumbersome for a scenario that is quite unlikely and the only negative
+ * consequence of which is wasted space.
+ *
+ * This functions returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
+			     int deletion, loff_t new_size)
+{
+	ino_t inum = key_inum(c, key);
+	struct size_entry *e;
+	int err;
+
+	switch (key_type(c, key)) {
+	case UBIFS_INO_KEY:
+		if (deletion)
+			remove_ino(c, inum);
+		else {
+			e = find_ino(c, inum);
+			if (e) {
+				e->i_size = new_size;
+				e->exists = 1;
+			} else {
+				err = add_ino(c, inum, new_size, 0, 1);
+				if (err)
+					return err;
+			}
+		}
+		break;
+	case UBIFS_DATA_KEY:
+		e = find_ino(c, inum);
+		if (e) {
+			if (new_size > e->d_size)
+				e->d_size = new_size;
+		} else {
+			err = add_ino(c, inum, 0, new_size, 0);
+			if (err)
+				return err;
+		}
+		break;
+	case UBIFS_TRUN_KEY:
+		e = find_ino(c, inum);
+		if (e)
+			e->d_size = new_size;
+		break;
+	}
+	return 0;
+}
+
+/**
+ * fix_size_in_place - fix inode size in place on flash.
+ * @c: UBIFS file-system description object
+ * @e: inode size information for recovery
+ */
+static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
+{
+	struct ubifs_ino_node *ino = c->sbuf;
+	unsigned char *p;
+	union ubifs_key key;
+	int err, lnum, offs, len;
+	loff_t i_size;
+	uint32_t crc;
+
+	/* Locate the inode node LEB number and offset */
+	ino_key_init(c, &key, e->inum);
+	err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs);
+	if (err)
+		goto out;
+	/*
+	 * If the size recorded on the inode node is greater than the size that
+	 * was calculated from nodes in the journal then don't change the inode.
+	 */
+	i_size = le64_to_cpu(ino->size);
+	if (i_size >= e->d_size)
+		return 0;
+	/* Read the LEB */
+	err = ubifs_leb_read(c, lnum, c->sbuf, 0, c->leb_size, 1);
+	if (err)
+		goto out;
+	/* Change the size field and recalculate the CRC */
+	ino = c->sbuf + offs;
+	ino->size = cpu_to_le64(e->d_size);
+	len = le32_to_cpu(ino->ch.len);
+	crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8);
+	ino->ch.crc = cpu_to_le32(crc);
+	/* Work out where data in the LEB ends and free space begins */
+	p = c->sbuf;
+	len = c->leb_size - 1;
+	while (p[len] == 0xff)
+		len -= 1;
+	len = ALIGN(len + 1, c->min_io_size);
+	/* Atomically write the fixed LEB back again */
+	err = ubifs_leb_change(c, lnum, c->sbuf, len);
+	if (err)
+		goto out;
+	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
+		  (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
+	return 0;
+
+out:
+	ubifs_warn(c, "inode %lu failed to fix size %lld -> %lld error %d",
+		   (unsigned long)e->inum, e->i_size, e->d_size, err);
+	return err;
+}
+
+/**
+ * ubifs_recover_size - recover inode size.
+ * @c: UBIFS file-system description object
+ *
+ * This function attempts to fix inode size discrepancies identified by the
+ * 'ubifs_recover_size_accum()' function.
+ *
+ * This functions returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_size(struct ubifs_info *c)
+{
+	struct rb_node *this = rb_first(&c->size_tree);
+
+	while (this) {
+		struct size_entry *e;
+		int err;
+
+		e = rb_entry(this, struct size_entry, rb);
+		if (!e->exists) {
+			union ubifs_key key;
+
+			ino_key_init(c, &key, e->inum);
+			err = ubifs_tnc_lookup(c, &key, c->sbuf);
+			if (err && err != -ENOENT)
+				return err;
+			if (err == -ENOENT) {
+				/* Remove data nodes that have no inode */
+				dbg_rcvry("removing ino %lu",
+					  (unsigned long)e->inum);
+				err = ubifs_tnc_remove_ino(c, e->inum);
+				if (err)
+					return err;
+			} else {
+				struct ubifs_ino_node *ino = c->sbuf;
+
+				e->exists = 1;
+				e->i_size = le64_to_cpu(ino->size);
+			}
+		}
+
+		if (e->exists && e->i_size < e->d_size) {
+			if (c->ro_mount) {
+				/* Fix the inode size and pin it in memory */
+				struct inode *inode;
+				struct ubifs_inode *ui;
+
+				ubifs_assert(!e->inode);
+
+				inode = ubifs_iget(c->vfs_sb, e->inum);
+				if (IS_ERR(inode))
+					return PTR_ERR(inode);
+
+				ui = ubifs_inode(inode);
+				if (inode->i_size < e->d_size) {
+					dbg_rcvry("ino %lu size %lld -> %lld",
+						  (unsigned long)e->inum,
+						  inode->i_size, e->d_size);
+					inode->i_size = e->d_size;
+					ui->ui_size = e->d_size;
+					ui->synced_i_size = e->d_size;
+					e->inode = inode;
+					this = rb_next(this);
+					continue;
+				}
+				iput(inode);
+			} else {
+				/* Fix the size in place */
+				err = fix_size_in_place(c, e);
+				if (err)
+					return err;
+				if (e->inode)
+					iput(e->inode);
+			}
+		}
+
+		this = rb_next(this);
+		rb_erase(&e->rb, &c->size_tree);
+		kfree(e);
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/ubifs/replay.c b/kernel/fs/ubifs/replay.c
new file mode 100644
index 000000000..3ca454013
--- /dev/null
+++ b/kernel/fs/ubifs/replay.c
@@ -0,0 +1,1082 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file contains journal replay code. It runs when the file-system is being
+ * mounted and requires no locking.
+ *
+ * The larger is the journal, the longer it takes to scan it, so the longer it
+ * takes to mount UBIFS. This is why the journal has limited size which may be
+ * changed depending on the system requirements. But a larger journal gives
+ * faster I/O speed because it writes the index less frequently. So this is a
+ * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the
+ * larger is the journal, the more memory its index may consume.
+ */
+
+#include "ubifs.h"
+#include <linux/list_sort.h>
+
+/**
+ * struct replay_entry - replay list entry.
+ * @lnum: logical eraseblock number of the node
+ * @offs: node offset
+ * @len: node length
+ * @deletion: non-zero if this entry corresponds to a node deletion
+ * @sqnum: node sequence number
+ * @list: links the replay list
+ * @key: node key
+ * @nm: directory entry name
+ * @old_size: truncation old size
+ * @new_size: truncation new size
+ *
+ * The replay process first scans all buds and builds the replay list, then
+ * sorts the replay list in nodes sequence number order, and then inserts all
+ * the replay entries to the TNC.
+ */
+struct replay_entry {
+	int lnum;
+	int offs;
+	int len;
+	unsigned int deletion:1;
+	unsigned long long sqnum;
+	struct list_head list;
+	union ubifs_key key;
+	union {
+		struct qstr nm;
+		struct {
+			loff_t old_size;
+			loff_t new_size;
+		};
+	};
+};
+
+/**
+ * struct bud_entry - entry in the list of buds to replay.
+ * @list: next bud in the list
+ * @bud: bud description object
+ * @sqnum: reference node sequence number
+ * @free: free bytes in the bud
+ * @dirty: dirty bytes in the bud
+ */
+struct bud_entry {
+	struct list_head list;
+	struct ubifs_bud *bud;
+	unsigned long long sqnum;
+	int free;
+	int dirty;
+};
+
+/**
+ * set_bud_lprops - set free and dirty space used by a bud.
+ * @c: UBIFS file-system description object
+ * @b: bud entry which describes the bud
+ *
+ * This function makes sure the LEB properties of bud @b are set correctly
+ * after the replay. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
+{
+	const struct ubifs_lprops *lp;
+	int err = 0, dirty;
+
+	ubifs_get_lprops(c);
+
+	lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	dirty = lp->dirty;
+	if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+		/*
+		 * The LEB was added to the journal with a starting offset of
+		 * zero which means the LEB must have been empty. The LEB
+		 * property values should be @lp->free == @c->leb_size and
+		 * @lp->dirty == 0, but that is not the case. The reason is that
+		 * the LEB had been garbage collected before it became the bud,
+		 * and there was not commit inbetween. The garbage collector
+		 * resets the free and dirty space without recording it
+		 * anywhere except lprops, so if there was no commit then
+		 * lprops does not have that information.
+		 *
+		 * We do not need to adjust free space because the scan has told
+		 * us the exact value which is recorded in the replay entry as
+		 * @b->free.
+		 *
+		 * However we do need to subtract from the dirty space the
+		 * amount of space that the garbage collector reclaimed, which
+		 * is the whole LEB minus the amount of space that was free.
+		 */
+		dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
+			lp->free, lp->dirty);
+		dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
+			lp->free, lp->dirty);
+		dirty -= c->leb_size - lp->free;
+		/*
+		 * If the replay order was perfect the dirty space would now be
+		 * zero. The order is not perfect because the journal heads
+		 * race with each other. This is not a problem but is does mean
+		 * that the dirty space may temporarily exceed c->leb_size
+		 * during the replay.
+		 */
+		if (dirty != 0)
+			dbg_mnt("LEB %d lp: %d free %d dirty replay: %d free %d dirty",
+				b->bud->lnum, lp->free, lp->dirty, b->free,
+				b->dirty);
+	}
+	lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
+			     lp->flags | LPROPS_TAKEN, 0);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	/* Make sure the journal head points to the latest bud */
+	err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
+				     b->bud->lnum, c->leb_size - b->free);
+
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * set_buds_lprops - set free and dirty space for all replayed buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function sets LEB properties for all replayed buds. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int set_buds_lprops(struct ubifs_info *c)
+{
+	struct bud_entry *b;
+	int err;
+
+	list_for_each_entry(b, &c->replay_buds, list) {
+		err = set_bud_lprops(c, b);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/**
+ * trun_remove_range - apply a replay entry for a truncation to the TNC.
+ * @c: UBIFS file-system description object
+ * @r: replay entry of truncation
+ */
+static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
+{
+	unsigned min_blk, max_blk;
+	union ubifs_key min_key, max_key;
+	ino_t ino;
+
+	min_blk = r->new_size / UBIFS_BLOCK_SIZE;
+	if (r->new_size & (UBIFS_BLOCK_SIZE - 1))
+		min_blk += 1;
+
+	max_blk = r->old_size / UBIFS_BLOCK_SIZE;
+	if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0)
+		max_blk -= 1;
+
+	ino = key_inum(c, &r->key);
+
+	data_key_init(c, &min_key, ino, min_blk);
+	data_key_init(c, &max_key, ino, max_blk);
+
+	return ubifs_tnc_remove_range(c, &min_key, &max_key);
+}
+
+/**
+ * apply_replay_entry - apply a replay entry to the TNC.
+ * @c: UBIFS file-system description object
+ * @r: replay entry to apply
+ *
+ * Apply a replay entry to the TNC.
+ */
+static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
+{
+	int err;
+
+	dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
+		 r->lnum, r->offs, r->len, r->deletion, r->sqnum);
+
+	/* Set c->replay_sqnum to help deal with dangling branches. */
+	c->replay_sqnum = r->sqnum;
+
+	if (is_hash_key(c, &r->key)) {
+		if (r->deletion)
+			err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
+		else
+			err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
+					       r->len, &r->nm);
+	} else {
+		if (r->deletion)
+			switch (key_type(c, &r->key)) {
+			case UBIFS_INO_KEY:
+			{
+				ino_t inum = key_inum(c, &r->key);
+
+				err = ubifs_tnc_remove_ino(c, inum);
+				break;
+			}
+			case UBIFS_TRUN_KEY:
+				err = trun_remove_range(c, r);
+				break;
+			default:
+				err = ubifs_tnc_remove(c, &r->key);
+				break;
+			}
+		else
+			err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
+					    r->len);
+		if (err)
+			return err;
+
+		if (c->need_recovery)
+			err = ubifs_recover_size_accum(c, &r->key, r->deletion,
+						       r->new_size);
+	}
+
+	return err;
+}
+
+/**
+ * replay_entries_cmp - compare 2 replay entries.
+ * @priv: UBIFS file-system description object
+ * @a: first replay entry
+ * @a: second replay entry
+ *
+ * This is a comparios function for 'list_sort()' which compares 2 replay
+ * entries @a and @b by comparing their sequence numer.  Returns %1 if @a has
+ * greater sequence number and %-1 otherwise.
+ */
+static int replay_entries_cmp(void *priv, struct list_head *a,
+			      struct list_head *b)
+{
+	struct replay_entry *ra, *rb;
+
+	cond_resched();
+	if (a == b)
+		return 0;
+
+	ra = list_entry(a, struct replay_entry, list);
+	rb = list_entry(b, struct replay_entry, list);
+	ubifs_assert(ra->sqnum != rb->sqnum);
+	if (ra->sqnum > rb->sqnum)
+		return 1;
+	return -1;
+}
+
+/**
+ * apply_replay_list - apply the replay list to the TNC.
+ * @c: UBIFS file-system description object
+ *
+ * Apply all entries in the replay list to the TNC. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+static int apply_replay_list(struct ubifs_info *c)
+{
+	struct replay_entry *r;
+	int err;
+
+	list_sort(c, &c->replay_list, &replay_entries_cmp);
+
+	list_for_each_entry(r, &c->replay_list, list) {
+		cond_resched();
+
+		err = apply_replay_entry(c, r);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/**
+ * destroy_replay_list - destroy the replay.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy the replay list.
+ */
+static void destroy_replay_list(struct ubifs_info *c)
+{
+	struct replay_entry *r, *tmp;
+
+	list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
+		if (is_hash_key(c, &r->key))
+			kfree(r->nm.name);
+		list_del(&r->list);
+		kfree(r);
+	}
+}
+
+/**
+ * insert_node - insert a node to the replay list
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @len: node length
+ * @key: node key
+ * @sqnum: sequence number
+ * @deletion: non-zero if this is a deletion
+ * @used: number of bytes in use in a LEB
+ * @old_size: truncation old size
+ * @new_size: truncation new size
+ *
+ * This function inserts a scanned non-direntry node to the replay list. The
+ * replay list contains @struct replay_entry elements, and we sort this list in
+ * sequence number order before applying it. The replay list is applied at the
+ * very end of the replay process. Since the list is sorted in sequence number
+ * order, the older modifications are applied first. This function returns zero
+ * in case of success and a negative error code in case of failure.
+ */
+static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
+		       union ubifs_key *key, unsigned long long sqnum,
+		       int deletion, int *used, loff_t old_size,
+		       loff_t new_size)
+{
+	struct replay_entry *r;
+
+	dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
+
+	if (key_inum(c, key) >= c->highest_inum)
+		c->highest_inum = key_inum(c, key);
+
+	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+
+	if (!deletion)
+		*used += ALIGN(len, 8);
+	r->lnum = lnum;
+	r->offs = offs;
+	r->len = len;
+	r->deletion = !!deletion;
+	r->sqnum = sqnum;
+	key_copy(c, key, &r->key);
+	r->old_size = old_size;
+	r->new_size = new_size;
+
+	list_add_tail(&r->list, &c->replay_list);
+	return 0;
+}
+
+/**
+ * insert_dent - insert a directory entry node into the replay list.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @len: node length
+ * @key: node key
+ * @name: directory entry name
+ * @nlen: directory entry name length
+ * @sqnum: sequence number
+ * @deletion: non-zero if this is a deletion
+ * @used: number of bytes in use in a LEB
+ *
+ * This function inserts a scanned directory entry node or an extended
+ * attribute entry to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
+		       union ubifs_key *key, const char *name, int nlen,
+		       unsigned long long sqnum, int deletion, int *used)
+{
+	struct replay_entry *r;
+	char *nbuf;
+
+	dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
+	if (key_inum(c, key) >= c->highest_inum)
+		c->highest_inum = key_inum(c, key);
+
+	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+
+	nbuf = kmalloc(nlen + 1, GFP_KERNEL);
+	if (!nbuf) {
+		kfree(r);
+		return -ENOMEM;
+	}
+
+	if (!deletion)
+		*used += ALIGN(len, 8);
+	r->lnum = lnum;
+	r->offs = offs;
+	r->len = len;
+	r->deletion = !!deletion;
+	r->sqnum = sqnum;
+	key_copy(c, key, &r->key);
+	r->nm.len = nlen;
+	memcpy(nbuf, name, nlen);
+	nbuf[nlen] = '\0';
+	r->nm.name = nbuf;
+
+	list_add_tail(&r->list, &c->replay_list);
+	return 0;
+}
+
+/**
+ * ubifs_validate_entry - validate directory or extended attribute entry node.
+ * @c: UBIFS file-system description object
+ * @dent: the node to validate
+ *
+ * This function validates directory or extended attribute entry node @dent.
+ * Returns zero if the node is all right and a %-EINVAL if not.
+ */
+int ubifs_validate_entry(struct ubifs_info *c,
+			 const struct ubifs_dent_node *dent)
+{
+	int key_type = key_type_flash(c, dent->key);
+	int nlen = le16_to_cpu(dent->nlen);
+
+	if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 ||
+	    dent->type >= UBIFS_ITYPES_CNT ||
+	    nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
+	    strnlen(dent->name, nlen) != nlen ||
+	    le64_to_cpu(dent->inum) > MAX_INUM) {
+		ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ?
+			  "directory entry" : "extended attribute entry");
+		return -EINVAL;
+	}
+
+	if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) {
+		ubifs_err(c, "bad key type %d", key_type);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * is_last_bud - check if the bud is the last in the journal head.
+ * @c: UBIFS file-system description object
+ * @bud: bud description object
+ *
+ * This function checks if bud @bud is the last bud in its journal head. This
+ * information is then used by 'replay_bud()' to decide whether the bud can
+ * have corruptions or not. Indeed, only last buds can be corrupted by power
+ * cuts. Returns %1 if this is the last bud, and %0 if not.
+ */
+static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+	struct ubifs_jhead *jh = &c->jheads[bud->jhead];
+	struct ubifs_bud *next;
+	uint32_t data;
+	int err;
+
+	if (list_is_last(&bud->list, &jh->buds_list))
+		return 1;
+
+	/*
+	 * The following is a quirk to make sure we work correctly with UBIFS
+	 * images used with older UBIFS.
+	 *
+	 * Normally, the last bud will be the last in the journal head's list
+	 * of bud. However, there is one exception if the UBIFS image belongs
+	 * to older UBIFS. This is fairly unlikely: one would need to use old
+	 * UBIFS, then have a power cut exactly at the right point, and then
+	 * try to mount this image with new UBIFS.
+	 *
+	 * The exception is: it is possible to have 2 buds A and B, A goes
+	 * before B, and B is the last, bud B is contains no data, and bud A is
+	 * corrupted at the end. The reason is that in older versions when the
+	 * journal code switched the next bud (from A to B), it first added a
+	 * log reference node for the new bud (B), and only after this it
+	 * synchronized the write-buffer of current bud (A). But later this was
+	 * changed and UBIFS started to always synchronize the write-buffer of
+	 * the bud (A) before writing the log reference for the new bud (B).
+	 *
+	 * But because older UBIFS always synchronized A's write-buffer before
+	 * writing to B, we can recognize this exceptional situation but
+	 * checking the contents of bud B - if it is empty, then A can be
+	 * treated as the last and we can recover it.
+	 *
+	 * TODO: remove this piece of code in a couple of years (today it is
+	 * 16.05.2011).
+	 */
+	next = list_entry(bud->list.next, struct ubifs_bud, list);
+	if (!list_is_last(&next->list, &jh->buds_list))
+		return 0;
+
+	err = ubifs_leb_read(c, next->lnum, (char *)&data, next->start, 4, 1);
+	if (err)
+		return 0;
+
+	return data == 0xFFFFFFFF;
+}
+
+/**
+ * replay_bud - replay a bud logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @b: bud entry which describes the bud
+ *
+ * This function replays bud @bud, recovers it if needed, and adds all nodes
+ * from this bud to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
+{
+	int is_last = is_last_bud(c, b->bud);
+	int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+
+	dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d",
+		lnum, b->bud->jhead, offs, is_last);
+
+	if (c->need_recovery && is_last)
+		/*
+		 * Recover only last LEBs in the journal heads, because power
+		 * cuts may cause corruptions only in these LEBs, because only
+		 * these LEBs could possibly be written to at the power cut
+		 * time.
+		 */
+		sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead);
+	else
+		sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
+	if (IS_ERR(sleb))
+		return PTR_ERR(sleb);
+
+	/*
+	 * The bud does not have to start from offset zero - the beginning of
+	 * the 'lnum' LEB may contain previously committed data. One of the
+	 * things we have to do in replay is to correctly update lprops with
+	 * newer information about this LEB.
+	 *
+	 * At this point lprops thinks that this LEB has 'c->leb_size - offs'
+	 * bytes of free space because it only contain information about
+	 * committed data.
+	 *
+	 * But we know that real amount of free space is 'c->leb_size -
+	 * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and
+	 * 'sleb->endpt' is used by bud data. We have to correctly calculate
+	 * how much of these data are dirty and update lprops with this
+	 * information.
+	 *
+	 * The dirt in that LEB region is comprised of padding nodes, deletion
+	 * nodes, truncation nodes and nodes which are obsoleted by subsequent
+	 * nodes in this LEB. So instead of calculating clean space, we
+	 * calculate used space ('used' variable).
+	 */
+
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		int deletion = 0;
+
+		cond_resched();
+
+		if (snod->sqnum >= SQNUM_WATERMARK) {
+			ubifs_err(c, "file system's life ended");
+			goto out_dump;
+		}
+
+		if (snod->sqnum > c->max_sqnum)
+			c->max_sqnum = snod->sqnum;
+
+		switch (snod->type) {
+		case UBIFS_INO_NODE:
+		{
+			struct ubifs_ino_node *ino = snod->node;
+			loff_t new_size = le64_to_cpu(ino->size);
+
+			if (le32_to_cpu(ino->nlink) == 0)
+				deletion = 1;
+			err = insert_node(c, lnum, snod->offs, snod->len,
+					  &snod->key, snod->sqnum, deletion,
+					  &used, 0, new_size);
+			break;
+		}
+		case UBIFS_DATA_NODE:
+		{
+			struct ubifs_data_node *dn = snod->node;
+			loff_t new_size = le32_to_cpu(dn->size) +
+					  key_block(c, &snod->key) *
+					  UBIFS_BLOCK_SIZE;
+
+			err = insert_node(c, lnum, snod->offs, snod->len,
+					  &snod->key, snod->sqnum, deletion,
+					  &used, 0, new_size);
+			break;
+		}
+		case UBIFS_DENT_NODE:
+		case UBIFS_XENT_NODE:
+		{
+			struct ubifs_dent_node *dent = snod->node;
+
+			err = ubifs_validate_entry(c, dent);
+			if (err)
+				goto out_dump;
+
+			err = insert_dent(c, lnum, snod->offs, snod->len,
+					  &snod->key, dent->name,
+					  le16_to_cpu(dent->nlen), snod->sqnum,
+					  !le64_to_cpu(dent->inum), &used);
+			break;
+		}
+		case UBIFS_TRUN_NODE:
+		{
+			struct ubifs_trun_node *trun = snod->node;
+			loff_t old_size = le64_to_cpu(trun->old_size);
+			loff_t new_size = le64_to_cpu(trun->new_size);
+			union ubifs_key key;
+
+			/* Validate truncation node */
+			if (old_size < 0 || old_size > c->max_inode_sz ||
+			    new_size < 0 || new_size > c->max_inode_sz ||
+			    old_size <= new_size) {
+				ubifs_err(c, "bad truncation node");
+				goto out_dump;
+			}
+
+			/*
+			 * Create a fake truncation key just to use the same
+			 * functions which expect nodes to have keys.
+			 */
+			trun_key_init(c, &key, le32_to_cpu(trun->inum));
+			err = insert_node(c, lnum, snod->offs, snod->len,
+					  &key, snod->sqnum, 1, &used,
+					  old_size, new_size);
+			break;
+		}
+		default:
+			ubifs_err(c, "unexpected node type %d in bud LEB %d:%d",
+				  snod->type, lnum, snod->offs);
+			err = -EINVAL;
+			goto out_dump;
+		}
+		if (err)
+			goto out;
+	}
+
+	ubifs_assert(ubifs_search_bud(c, lnum));
+	ubifs_assert(sleb->endpt - offs >= used);
+	ubifs_assert(sleb->endpt % c->min_io_size == 0);
+
+	b->dirty = sleb->endpt - offs - used;
+	b->free = c->leb_size - sleb->endpt;
+	dbg_mnt("bud LEB %d replied: dirty %d, free %d",
+		lnum, b->dirty, b->free);
+
+out:
+	ubifs_scan_destroy(sleb);
+	return err;
+
+out_dump:
+	ubifs_err(c, "bad node is at LEB %d:%d", lnum, snod->offs);
+	ubifs_dump_node(c, snod->node);
+	ubifs_scan_destroy(sleb);
+	return -EINVAL;
+}
+
+/**
+ * replay_buds - replay all buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int replay_buds(struct ubifs_info *c)
+{
+	struct bud_entry *b;
+	int err;
+	unsigned long long prev_sqnum = 0;
+
+	list_for_each_entry(b, &c->replay_buds, list) {
+		err = replay_bud(c, b);
+		if (err)
+			return err;
+
+		ubifs_assert(b->sqnum > prev_sqnum);
+		prev_sqnum = b->sqnum;
+	}
+
+	return 0;
+}
+
+/**
+ * destroy_bud_list - destroy the list of buds to replay.
+ * @c: UBIFS file-system description object
+ */
+static void destroy_bud_list(struct ubifs_info *c)
+{
+	struct bud_entry *b;
+
+	while (!list_empty(&c->replay_buds)) {
+		b = list_entry(c->replay_buds.next, struct bud_entry, list);
+		list_del(&b->list);
+		kfree(b);
+	}
+}
+
+/**
+ * add_replay_bud - add a bud to the list of buds to replay.
+ * @c: UBIFS file-system description object
+ * @lnum: bud logical eraseblock number to replay
+ * @offs: bud start offset
+ * @jhead: journal head to which this bud belongs
+ * @sqnum: reference node sequence number
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
+			  unsigned long long sqnum)
+{
+	struct ubifs_bud *bud;
+	struct bud_entry *b;
+
+	dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead);
+
+	bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL);
+	if (!bud)
+		return -ENOMEM;
+
+	b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL);
+	if (!b) {
+		kfree(bud);
+		return -ENOMEM;
+	}
+
+	bud->lnum = lnum;
+	bud->start = offs;
+	bud->jhead = jhead;
+	ubifs_add_bud(c, bud);
+
+	b->bud = bud;
+	b->sqnum = sqnum;
+	list_add_tail(&b->list, &c->replay_buds);
+
+	return 0;
+}
+
+/**
+ * validate_ref - validate a reference node.
+ * @c: UBIFS file-system description object
+ * @ref: the reference node to validate
+ * @ref_lnum: LEB number of the reference node
+ * @ref_offs: reference node offset
+ *
+ * This function returns %1 if a bud reference already exists for the LEB. %0 is
+ * returned if the reference node is new, otherwise %-EINVAL is returned if
+ * validation failed.
+ */
+static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
+{
+	struct ubifs_bud *bud;
+	int lnum = le32_to_cpu(ref->lnum);
+	unsigned int offs = le32_to_cpu(ref->offs);
+	unsigned int jhead = le32_to_cpu(ref->jhead);
+
+	/*
+	 * ref->offs may point to the end of LEB when the journal head points
+	 * to the end of LEB and we write reference node for it during commit.
+	 * So this is why we require 'offs > c->leb_size'.
+	 */
+	if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt ||
+	    lnum < c->main_first || offs > c->leb_size ||
+	    offs & (c->min_io_size - 1))
+		return -EINVAL;
+
+	/* Make sure we have not already looked at this bud */
+	bud = ubifs_search_bud(c, lnum);
+	if (bud) {
+		if (bud->jhead == jhead && bud->start <= offs)
+			return 1;
+		ubifs_err(c, "bud at LEB %d:%d was already referred", lnum, offs);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * replay_log_leb - replay a log logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: log logical eraseblock to replay
+ * @offs: offset to start replaying from
+ * @sbuf: scan buffer
+ *
+ * This function replays a log LEB and returns zero in case of success, %1 if
+ * this is the last LEB in the log, and a negative error code in case of
+ * failure.
+ */
+static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
+{
+	int err;
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	const struct ubifs_cs_node *node;
+
+	dbg_mnt("replay log LEB %d:%d", lnum, offs);
+	sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery);
+	if (IS_ERR(sleb)) {
+		if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
+			return PTR_ERR(sleb);
+		/*
+		 * Note, the below function will recover this log LEB only if
+		 * it is the last, because unclean reboots can possibly corrupt
+		 * only the tail of the log.
+		 */
+		sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
+		if (IS_ERR(sleb))
+			return PTR_ERR(sleb);
+	}
+
+	if (sleb->nodes_cnt == 0) {
+		err = 1;
+		goto out;
+	}
+
+	node = sleb->buf;
+	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
+	if (c->cs_sqnum == 0) {
+		/*
+		 * This is the first log LEB we are looking at, make sure that
+		 * the first node is a commit start node. Also record its
+		 * sequence number so that UBIFS can determine where the log
+		 * ends, because all nodes which were have higher sequence
+		 * numbers.
+		 */
+		if (snod->type != UBIFS_CS_NODE) {
+			ubifs_err(c, "first log node at LEB %d:%d is not CS node",
+				  lnum, offs);
+			goto out_dump;
+		}
+		if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
+			ubifs_err(c, "first CS node at LEB %d:%d has wrong commit number %llu expected %llu",
+				  lnum, offs,
+				  (unsigned long long)le64_to_cpu(node->cmt_no),
+				  c->cmt_no);
+			goto out_dump;
+		}
+
+		c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
+		dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
+	}
+
+	if (snod->sqnum < c->cs_sqnum) {
+		/*
+		 * This means that we reached end of log and now
+		 * look to the older log data, which was already
+		 * committed but the eraseblock was not erased (UBIFS
+		 * only un-maps it). So this basically means we have to
+		 * exit with "end of log" code.
+		 */
+		err = 1;
+		goto out;
+	}
+
+	/* Make sure the first node sits at offset zero of the LEB */
+	if (snod->offs != 0) {
+		ubifs_err(c, "first node is not at zero offset");
+		goto out_dump;
+	}
+
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		cond_resched();
+
+		if (snod->sqnum >= SQNUM_WATERMARK) {
+			ubifs_err(c, "file system's life ended");
+			goto out_dump;
+		}
+
+		if (snod->sqnum < c->cs_sqnum) {
+			ubifs_err(c, "bad sqnum %llu, commit sqnum %llu",
+				  snod->sqnum, c->cs_sqnum);
+			goto out_dump;
+		}
+
+		if (snod->sqnum > c->max_sqnum)
+			c->max_sqnum = snod->sqnum;
+
+		switch (snod->type) {
+		case UBIFS_REF_NODE: {
+			const struct ubifs_ref_node *ref = snod->node;
+
+			err = validate_ref(c, ref);
+			if (err == 1)
+				break; /* Already have this bud */
+			if (err)
+				goto out_dump;
+
+			err = add_replay_bud(c, le32_to_cpu(ref->lnum),
+					     le32_to_cpu(ref->offs),
+					     le32_to_cpu(ref->jhead),
+					     snod->sqnum);
+			if (err)
+				goto out;
+
+			break;
+		}
+		case UBIFS_CS_NODE:
+			/* Make sure it sits at the beginning of LEB */
+			if (snod->offs != 0) {
+				ubifs_err(c, "unexpected node in log");
+				goto out_dump;
+			}
+			break;
+		default:
+			ubifs_err(c, "unexpected node in log");
+			goto out_dump;
+		}
+	}
+
+	if (sleb->endpt || c->lhead_offs >= c->leb_size) {
+		c->lhead_lnum = lnum;
+		c->lhead_offs = sleb->endpt;
+	}
+
+	err = !sleb->endpt;
+out:
+	ubifs_scan_destroy(sleb);
+	return err;
+
+out_dump:
+	ubifs_err(c, "log error detected while replaying the log at LEB %d:%d",
+		  lnum, offs + snod->offs);
+	ubifs_dump_node(c, snod->node);
+	ubifs_scan_destroy(sleb);
+	return -EINVAL;
+}
+
+/**
+ * take_ihead - update the status of the index head in lprops to 'taken'.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the amount of free space in the index head LEB or a
+ * negative error code.
+ */
+static int take_ihead(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lp;
+	int err, free;
+
+	ubifs_get_lprops(c);
+
+	lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	free = lp->free;
+
+	lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+			     lp->flags | LPROPS_TAKEN, 0);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	err = free;
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * ubifs_replay_journal - replay journal.
+ * @c: UBIFS file-system description object
+ *
+ * This function scans the journal, replays and cleans it up. It makes sure all
+ * memory data structures related to uncommitted journal are built (dirty TNC
+ * tree, tree of buds, modified lprops, etc).
+ */
+int ubifs_replay_journal(struct ubifs_info *c)
+{
+	int err, lnum, free;
+
+	BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
+
+	/* Update the status of the index head in lprops to 'taken' */
+	free = take_ihead(c);
+	if (free < 0)
+		return free; /* Error code */
+
+	if (c->ihead_offs != c->leb_size - free) {
+		ubifs_err(c, "bad index head LEB %d:%d", c->ihead_lnum,
+			  c->ihead_offs);
+		return -EINVAL;
+	}
+
+	dbg_mnt("start replaying the journal");
+	c->replaying = 1;
+	lnum = c->ltail_lnum = c->lhead_lnum;
+
+	do {
+		err = replay_log_leb(c, lnum, 0, c->sbuf);
+		if (err == 1) {
+			if (lnum != c->lhead_lnum)
+				/* We hit the end of the log */
+				break;
+
+			/*
+			 * The head of the log must always start with the
+			 * "commit start" node on a properly formatted UBIFS.
+			 * But we found no nodes at all, which means that
+			 * someting went wrong and we cannot proceed mounting
+			 * the file-system.
+			 */
+			ubifs_err(c, "no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted",
+				  lnum, 0);
+			err = -EINVAL;
+		}
+		if (err)
+			goto out;
+		lnum = ubifs_next_log_lnum(c, lnum);
+	} while (lnum != c->ltail_lnum);
+
+	err = replay_buds(c);
+	if (err)
+		goto out;
+
+	err = apply_replay_list(c);
+	if (err)
+		goto out;
+
+	err = set_buds_lprops(c);
+	if (err)
+		goto out;
+
+	/*
+	 * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
+	 * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
+	 * depend on it. This means we have to initialize it to make sure
+	 * budgeting works properly.
+	 */
+	c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+	c->bi.uncommitted_idx *= c->max_idx_node_sz;
+
+	ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
+	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, highest_inum %lu",
+		c->lhead_lnum, c->lhead_offs, c->max_sqnum,
+		(unsigned long)c->highest_inum);
+out:
+	destroy_replay_list(c);
+	destroy_bud_list(c);
+	c->replaying = 0;
+	return err;
+}
diff --git a/kernel/fs/ubifs/sb.c b/kernel/fs/ubifs/sb.c
new file mode 100644
index 000000000..f4fbc7b6b
--- /dev/null
+++ b/kernel/fs/ubifs/sb.c
@@ -0,0 +1,809 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS superblock. The superblock is stored at the first
+ * LEB of the volume and is never changed by UBIFS. Only user-space tools may
+ * change it. The superblock node mostly contains geometry information.
+ */
+
+#include "ubifs.h"
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/math64.h>
+
+/*
+ * Default journal size in logical eraseblocks as a percent of total
+ * flash size.
+ */
+#define DEFAULT_JNL_PERCENT 5
+
+/* Default maximum journal size in bytes */
+#define DEFAULT_MAX_JNL (32*1024*1024)
+
+/* Default indexing tree fanout */
+#define DEFAULT_FANOUT 8
+
+/* Default number of data journal heads */
+#define DEFAULT_JHEADS_CNT 1
+
+/* Default positions of different LEBs in the main area */
+#define DEFAULT_IDX_LEB  0
+#define DEFAULT_DATA_LEB 1
+#define DEFAULT_GC_LEB   2
+
+/* Default number of LEB numbers in LPT's save table */
+#define DEFAULT_LSAVE_CNT 256
+
+/* Default reserved pool size as a percent of maximum free space */
+#define DEFAULT_RP_PERCENT 5
+
+/* The default maximum size of reserved pool in bytes */
+#define DEFAULT_MAX_RP_SIZE (5*1024*1024)
+
+/* Default time granularity in nanoseconds */
+#define DEFAULT_TIME_GRAN 1000000000
+
+/**
+ * create_default_filesystem - format empty UBI volume.
+ * @c: UBIFS file-system description object
+ *
+ * This function creates default empty file-system. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+static int create_default_filesystem(struct ubifs_info *c)
+{
+	struct ubifs_sb_node *sup;
+	struct ubifs_mst_node *mst;
+	struct ubifs_idx_node *idx;
+	struct ubifs_branch *br;
+	struct ubifs_ino_node *ino;
+	struct ubifs_cs_node *cs;
+	union ubifs_key key;
+	int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
+	int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
+	int min_leb_cnt = UBIFS_MIN_LEB_CNT;
+	long long tmp64, main_bytes;
+	__le64 tmp_le64;
+
+	/* Some functions called from here depend on the @c->key_len filed */
+	c->key_len = UBIFS_SK_LEN;
+
+	/*
+	 * First of all, we have to calculate default file-system geometry -
+	 * log size, journal size, etc.
+	 */
+	if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT)
+		/* We can first multiply then divide and have no overflow */
+		jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100;
+	else
+		jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT;
+
+	if (jnl_lebs < UBIFS_MIN_JNL_LEBS)
+		jnl_lebs = UBIFS_MIN_JNL_LEBS;
+	if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL)
+		jnl_lebs = DEFAULT_MAX_JNL / c->leb_size;
+
+	/*
+	 * The log should be large enough to fit reference nodes for all bud
+	 * LEBs. Because buds do not have to start from the beginning of LEBs
+	 * (half of the LEB may contain committed data), the log should
+	 * generally be larger, make it twice as large.
+	 */
+	tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1;
+	log_lebs = tmp / c->leb_size;
+	/* Plus one LEB reserved for commit */
+	log_lebs += 1;
+	if (c->leb_cnt - min_leb_cnt > 8) {
+		/* And some extra space to allow writes while committing */
+		log_lebs += 1;
+		min_leb_cnt += 1;
+	}
+
+	max_buds = jnl_lebs - log_lebs;
+	if (max_buds < UBIFS_MIN_BUD_LEBS)
+		max_buds = UBIFS_MIN_BUD_LEBS;
+
+	/*
+	 * Orphan nodes are stored in a separate area. One node can store a lot
+	 * of orphan inode numbers, but when new orphan comes we just add a new
+	 * orphan node. At some point the nodes are consolidated into one
+	 * orphan node.
+	 */
+	orph_lebs = UBIFS_MIN_ORPH_LEBS;
+	if (c->leb_cnt - min_leb_cnt > 1)
+		/*
+		 * For debugging purposes it is better to have at least 2
+		 * orphan LEBs, because the orphan subsystem would need to do
+		 * consolidations and would be stressed more.
+		 */
+		orph_lebs += 1;
+
+	main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;
+	main_lebs -= orph_lebs;
+
+	lpt_first = UBIFS_LOG_LNUM + log_lebs;
+	c->lsave_cnt = DEFAULT_LSAVE_CNT;
+	c->max_leb_cnt = c->leb_cnt;
+	err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,
+				    &big_lpt);
+	if (err)
+		return err;
+
+	dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first,
+		lpt_first + lpt_lebs - 1);
+
+	main_first = c->leb_cnt - main_lebs;
+
+	/* Create default superblock */
+	tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
+	sup = kzalloc(tmp, GFP_KERNEL);
+	if (!sup)
+		return -ENOMEM;
+
+	tmp64 = (long long)max_buds * c->leb_size;
+	if (big_lpt)
+		sup_flags |= UBIFS_FLG_BIGLPT;
+
+	sup->ch.node_type  = UBIFS_SB_NODE;
+	sup->key_hash      = UBIFS_KEY_HASH_R5;
+	sup->flags         = cpu_to_le32(sup_flags);
+	sup->min_io_size   = cpu_to_le32(c->min_io_size);
+	sup->leb_size      = cpu_to_le32(c->leb_size);
+	sup->leb_cnt       = cpu_to_le32(c->leb_cnt);
+	sup->max_leb_cnt   = cpu_to_le32(c->max_leb_cnt);
+	sup->max_bud_bytes = cpu_to_le64(tmp64);
+	sup->log_lebs      = cpu_to_le32(log_lebs);
+	sup->lpt_lebs      = cpu_to_le32(lpt_lebs);
+	sup->orph_lebs     = cpu_to_le32(orph_lebs);
+	sup->jhead_cnt     = cpu_to_le32(DEFAULT_JHEADS_CNT);
+	sup->fanout        = cpu_to_le32(DEFAULT_FANOUT);
+	sup->lsave_cnt     = cpu_to_le32(c->lsave_cnt);
+	sup->fmt_version   = cpu_to_le32(UBIFS_FORMAT_VERSION);
+	sup->time_gran     = cpu_to_le32(DEFAULT_TIME_GRAN);
+	if (c->mount_opts.override_compr)
+		sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
+	else
+		sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
+
+	generate_random_uuid(sup->uuid);
+
+	main_bytes = (long long)main_lebs * c->leb_size;
+	tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
+	if (tmp64 > DEFAULT_MAX_RP_SIZE)
+		tmp64 = DEFAULT_MAX_RP_SIZE;
+	sup->rp_size = cpu_to_le64(tmp64);
+	sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
+
+	err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0);
+	kfree(sup);
+	if (err)
+		return err;
+
+	dbg_gen("default superblock created at LEB 0:0");
+
+	/* Create default master node */
+	mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
+	if (!mst)
+		return -ENOMEM;
+
+	mst->ch.node_type = UBIFS_MST_NODE;
+	mst->log_lnum     = cpu_to_le32(UBIFS_LOG_LNUM);
+	mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO);
+	mst->cmt_no       = 0;
+	mst->root_lnum    = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
+	mst->root_offs    = 0;
+	tmp = ubifs_idx_node_sz(c, 1);
+	mst->root_len     = cpu_to_le32(tmp);
+	mst->gc_lnum      = cpu_to_le32(main_first + DEFAULT_GC_LEB);
+	mst->ihead_lnum   = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
+	mst->ihead_offs   = cpu_to_le32(ALIGN(tmp, c->min_io_size));
+	mst->index_size   = cpu_to_le64(ALIGN(tmp, 8));
+	mst->lpt_lnum     = cpu_to_le32(c->lpt_lnum);
+	mst->lpt_offs     = cpu_to_le32(c->lpt_offs);
+	mst->nhead_lnum   = cpu_to_le32(c->nhead_lnum);
+	mst->nhead_offs   = cpu_to_le32(c->nhead_offs);
+	mst->ltab_lnum    = cpu_to_le32(c->ltab_lnum);
+	mst->ltab_offs    = cpu_to_le32(c->ltab_offs);
+	mst->lsave_lnum   = cpu_to_le32(c->lsave_lnum);
+	mst->lsave_offs   = cpu_to_le32(c->lsave_offs);
+	mst->lscan_lnum   = cpu_to_le32(main_first);
+	mst->empty_lebs   = cpu_to_le32(main_lebs - 2);
+	mst->idx_lebs     = cpu_to_le32(1);
+	mst->leb_cnt      = cpu_to_le32(c->leb_cnt);
+
+	/* Calculate lprops statistics */
+	tmp64 = main_bytes;
+	tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
+	tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
+	mst->total_free = cpu_to_le64(tmp64);
+
+	tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
+	ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) -
+			  UBIFS_INO_NODE_SZ;
+	tmp64 += ino_waste;
+	tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8);
+	mst->total_dirty = cpu_to_le64(tmp64);
+
+	/*  The indexing LEB does not contribute to dark space */
+	tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm);
+	mst->total_dark = cpu_to_le64(tmp64);
+
+	mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
+
+	err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0);
+	if (err) {
+		kfree(mst);
+		return err;
+	}
+	err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1,
+			       0);
+	kfree(mst);
+	if (err)
+		return err;
+
+	dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM);
+
+	/* Create the root indexing node */
+	tmp = ubifs_idx_node_sz(c, 1);
+	idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
+	if (!idx)
+		return -ENOMEM;
+
+	c->key_fmt = UBIFS_SIMPLE_KEY_FMT;
+	c->key_hash = key_r5_hash;
+
+	idx->ch.node_type = UBIFS_IDX_NODE;
+	idx->child_cnt = cpu_to_le16(1);
+	ino_key_init(c, &key, UBIFS_ROOT_INO);
+	br = ubifs_idx_branch(c, idx, 0);
+	key_write_idx(c, &key, &br->key);
+	br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
+	br->len  = cpu_to_le32(UBIFS_INO_NODE_SZ);
+	err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0);
+	kfree(idx);
+	if (err)
+		return err;
+
+	dbg_gen("default root indexing node created LEB %d:0",
+		main_first + DEFAULT_IDX_LEB);
+
+	/* Create default root inode */
+	tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
+	ino = kzalloc(tmp, GFP_KERNEL);
+	if (!ino)
+		return -ENOMEM;
+
+	ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);
+	ino->ch.node_type = UBIFS_INO_NODE;
+	ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
+	ino->nlink = cpu_to_le32(2);
+	tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
+	ino->atime_sec   = tmp_le64;
+	ino->ctime_sec   = tmp_le64;
+	ino->mtime_sec   = tmp_le64;
+	ino->atime_nsec  = 0;
+	ino->ctime_nsec  = 0;
+	ino->mtime_nsec  = 0;
+	ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
+	ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
+
+	/* Set compression enabled by default */
+	ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
+
+	err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
+			       main_first + DEFAULT_DATA_LEB, 0);
+	kfree(ino);
+	if (err)
+		return err;
+
+	dbg_gen("root inode created at LEB %d:0",
+		main_first + DEFAULT_DATA_LEB);
+
+	/*
+	 * The first node in the log has to be the commit start node. This is
+	 * always the case during normal file-system operation. Write a fake
+	 * commit start node to the log.
+	 */
+	tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);
+	cs = kzalloc(tmp, GFP_KERNEL);
+	if (!cs)
+		return -ENOMEM;
+
+	cs->ch.node_type = UBIFS_CS_NODE;
+	err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
+	kfree(cs);
+	if (err)
+		return err;
+
+	ubifs_msg(c, "default file-system created");
+	return 0;
+}
+
+/**
+ * validate_sb - validate superblock node.
+ * @c: UBIFS file-system description object
+ * @sup: superblock node
+ *
+ * This function validates superblock node @sup. Since most of data was read
+ * from the superblock and stored in @c, the function validates fields in @c
+ * instead. Returns zero in case of success and %-EINVAL in case of validation
+ * failure.
+ */
+static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
+{
+	long long max_bytes;
+	int err = 1, min_leb_cnt;
+
+	if (!c->key_hash) {
+		err = 2;
+		goto failed;
+	}
+
+	if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) {
+		err = 3;
+		goto failed;
+	}
+
+	if (le32_to_cpu(sup->min_io_size) != c->min_io_size) {
+		ubifs_err(c, "min. I/O unit mismatch: %d in superblock, %d real",
+			  le32_to_cpu(sup->min_io_size), c->min_io_size);
+		goto failed;
+	}
+
+	if (le32_to_cpu(sup->leb_size) != c->leb_size) {
+		ubifs_err(c, "LEB size mismatch: %d in superblock, %d real",
+			  le32_to_cpu(sup->leb_size), c->leb_size);
+		goto failed;
+	}
+
+	if (c->log_lebs < UBIFS_MIN_LOG_LEBS ||
+	    c->lpt_lebs < UBIFS_MIN_LPT_LEBS ||
+	    c->orph_lebs < UBIFS_MIN_ORPH_LEBS ||
+	    c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
+		err = 4;
+		goto failed;
+	}
+
+	/*
+	 * Calculate minimum allowed amount of main area LEBs. This is very
+	 * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we
+	 * have just read from the superblock.
+	 */
+	min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs;
+	min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;
+
+	if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
+		ubifs_err(c, "bad LEB count: %d in superblock, %d on UBI volume, %d minimum required",
+			  c->leb_cnt, c->vi.size, min_leb_cnt);
+		goto failed;
+	}
+
+	if (c->max_leb_cnt < c->leb_cnt) {
+		ubifs_err(c, "max. LEB count %d less than LEB count %d",
+			  c->max_leb_cnt, c->leb_cnt);
+		goto failed;
+	}
+
+	if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
+		ubifs_err(c, "too few main LEBs count %d, must be at least %d",
+			  c->main_lebs, UBIFS_MIN_MAIN_LEBS);
+		goto failed;
+	}
+
+	max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS;
+	if (c->max_bud_bytes < max_bytes) {
+		ubifs_err(c, "too small journal (%lld bytes), must be at least %lld bytes",
+			  c->max_bud_bytes, max_bytes);
+		goto failed;
+	}
+
+	max_bytes = (long long)c->leb_size * c->main_lebs;
+	if (c->max_bud_bytes > max_bytes) {
+		ubifs_err(c, "too large journal size (%lld bytes), only %lld bytes available in the main area",
+			  c->max_bud_bytes, max_bytes);
+		goto failed;
+	}
+
+	if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 ||
+	    c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) {
+		err = 9;
+		goto failed;
+	}
+
+	if (c->fanout < UBIFS_MIN_FANOUT ||
+	    ubifs_idx_node_sz(c, c->fanout) > c->leb_size) {
+		err = 10;
+		goto failed;
+	}
+
+	if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT &&
+	    c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS -
+	    c->log_lebs - c->lpt_lebs - c->orph_lebs)) {
+		err = 11;
+		goto failed;
+	}
+
+	if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs +
+	    c->orph_lebs + c->main_lebs != c->leb_cnt) {
+		err = 12;
+		goto failed;
+	}
+
+	if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
+		err = 13;
+		goto failed;
+	}
+
+	if (c->rp_size < 0 || max_bytes < c->rp_size) {
+		err = 14;
+		goto failed;
+	}
+
+	if (le32_to_cpu(sup->time_gran) > 1000000000 ||
+	    le32_to_cpu(sup->time_gran) < 1) {
+		err = 15;
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	ubifs_err(c, "bad superblock, error %d", err);
+	ubifs_dump_node(c, sup);
+	return -EINVAL;
+}
+
+/**
+ * ubifs_read_sb_node - read superblock node.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns a pointer to the superblock node or a negative error
+ * code. Note, the user of this function is responsible of kfree()'ing the
+ * returned superblock buffer.
+ */
+struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
+{
+	struct ubifs_sb_node *sup;
+	int err;
+
+	sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS);
+	if (!sup)
+		return ERR_PTR(-ENOMEM);
+
+	err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ,
+			      UBIFS_SB_LNUM, 0);
+	if (err) {
+		kfree(sup);
+		return ERR_PTR(err);
+	}
+
+	return sup;
+}
+
+/**
+ * ubifs_write_sb_node - write superblock node.
+ * @c: UBIFS file-system description object
+ * @sup: superblock node read with 'ubifs_read_sb_node()'
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup)
+{
+	int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
+
+	ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
+	return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len);
+}
+
+/**
+ * ubifs_read_superblock - read superblock.
+ * @c: UBIFS file-system description object
+ *
+ * This function finds, reads and checks the superblock. If an empty UBI volume
+ * is being mounted, this function creates default superblock. Returns zero in
+ * case of success, and a negative error code in case of failure.
+ */
+int ubifs_read_superblock(struct ubifs_info *c)
+{
+	int err, sup_flags;
+	struct ubifs_sb_node *sup;
+
+	if (c->empty) {
+		err = create_default_filesystem(c);
+		if (err)
+			return err;
+	}
+
+	sup = ubifs_read_sb_node(c);
+	if (IS_ERR(sup))
+		return PTR_ERR(sup);
+
+	c->fmt_version = le32_to_cpu(sup->fmt_version);
+	c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
+
+	/*
+	 * The software supports all previous versions but not future versions,
+	 * due to the unavailability of time-travelling equipment.
+	 */
+	if (c->fmt_version > UBIFS_FORMAT_VERSION) {
+		ubifs_assert(!c->ro_media || c->ro_mount);
+		if (!c->ro_mount ||
+		    c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
+			ubifs_err(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
+				  c->fmt_version, c->ro_compat_version,
+				  UBIFS_FORMAT_VERSION,
+				  UBIFS_RO_COMPAT_VERSION);
+			if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
+				ubifs_msg(c, "only R/O mounting is possible");
+				err = -EROFS;
+			} else
+				err = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * The FS is mounted R/O, and the media format is
+		 * R/O-compatible with the UBIFS implementation, so we can
+		 * mount.
+		 */
+		c->rw_incompat = 1;
+	}
+
+	if (c->fmt_version < 3) {
+		ubifs_err(c, "on-flash format version %d is not supported",
+			  c->fmt_version);
+		err = -EINVAL;
+		goto out;
+	}
+
+	switch (sup->key_hash) {
+	case UBIFS_KEY_HASH_R5:
+		c->key_hash = key_r5_hash;
+		c->key_hash_type = UBIFS_KEY_HASH_R5;
+		break;
+
+	case UBIFS_KEY_HASH_TEST:
+		c->key_hash = key_test_hash;
+		c->key_hash_type = UBIFS_KEY_HASH_TEST;
+		break;
+	};
+
+	c->key_fmt = sup->key_fmt;
+
+	switch (c->key_fmt) {
+	case UBIFS_SIMPLE_KEY_FMT:
+		c->key_len = UBIFS_SK_LEN;
+		break;
+	default:
+		ubifs_err(c, "unsupported key format");
+		err = -EINVAL;
+		goto out;
+	}
+
+	c->leb_cnt       = le32_to_cpu(sup->leb_cnt);
+	c->max_leb_cnt   = le32_to_cpu(sup->max_leb_cnt);
+	c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes);
+	c->log_lebs      = le32_to_cpu(sup->log_lebs);
+	c->lpt_lebs      = le32_to_cpu(sup->lpt_lebs);
+	c->orph_lebs     = le32_to_cpu(sup->orph_lebs);
+	c->jhead_cnt     = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
+	c->fanout        = le32_to_cpu(sup->fanout);
+	c->lsave_cnt     = le32_to_cpu(sup->lsave_cnt);
+	c->rp_size       = le64_to_cpu(sup->rp_size);
+	c->rp_uid        = make_kuid(&init_user_ns, le32_to_cpu(sup->rp_uid));
+	c->rp_gid        = make_kgid(&init_user_ns, le32_to_cpu(sup->rp_gid));
+	sup_flags        = le32_to_cpu(sup->flags);
+	if (!c->mount_opts.override_compr)
+		c->default_compr = le16_to_cpu(sup->default_compr);
+
+	c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
+	memcpy(&c->uuid, &sup->uuid, 16);
+	c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
+	c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
+
+	/* Automatically increase file system size to the maximum size */
+	c->old_leb_cnt = c->leb_cnt;
+	if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
+		c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
+		if (c->ro_mount)
+			dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
+				c->old_leb_cnt,	c->leb_cnt);
+		else {
+			dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs",
+				c->old_leb_cnt, c->leb_cnt);
+			sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+			err = ubifs_write_sb_node(c, sup);
+			if (err)
+				goto out;
+			c->old_leb_cnt = c->leb_cnt;
+		}
+	}
+
+	c->log_bytes = (long long)c->log_lebs * c->leb_size;
+	c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1;
+	c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs;
+	c->lpt_last = c->lpt_first + c->lpt_lebs - 1;
+	c->orph_first = c->lpt_last + 1;
+	c->orph_last = c->orph_first + c->orph_lebs - 1;
+	c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
+	c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
+	c->main_first = c->leb_cnt - c->main_lebs;
+
+	err = validate_sb(c, sup);
+out:
+	kfree(sup);
+	return err;
+}
+
+/**
+ * fixup_leb - fixup/unmap an LEB containing free space.
+ * @c: UBIFS file-system description object
+ * @lnum: the LEB number to fix up
+ * @len: number of used bytes in LEB (starting at offset 0)
+ *
+ * This function reads the contents of the given LEB number @lnum, then fixes
+ * it up, so that empty min. I/O units in the end of LEB are actually erased on
+ * flash (rather than being just all-0xff real data). If the LEB is completely
+ * empty, it is simply unmapped.
+ */
+static int fixup_leb(struct ubifs_info *c, int lnum, int len)
+{
+	int err;
+
+	ubifs_assert(len >= 0);
+	ubifs_assert(len % c->min_io_size == 0);
+	ubifs_assert(len < c->leb_size);
+
+	if (len == 0) {
+		dbg_mnt("unmap empty LEB %d", lnum);
+		return ubifs_leb_unmap(c, lnum);
+	}
+
+	dbg_mnt("fixup LEB %d, data len %d", lnum, len);
+	err = ubifs_leb_read(c, lnum, c->sbuf, 0, len, 1);
+	if (err)
+		return err;
+
+	return ubifs_leb_change(c, lnum, c->sbuf, len);
+}
+
+/**
+ * fixup_free_space - find & remap all LEBs containing free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function walks through all LEBs in the filesystem and fiexes up those
+ * containing free/empty space.
+ */
+static int fixup_free_space(struct ubifs_info *c)
+{
+	int lnum, err = 0;
+	struct ubifs_lprops *lprops;
+
+	ubifs_get_lprops(c);
+
+	/* Fixup LEBs in the master area */
+	for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) {
+		err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz);
+		if (err)
+			goto out;
+	}
+
+	/* Unmap unused log LEBs */
+	lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+	while (lnum != c->ltail_lnum) {
+		err = fixup_leb(c, lnum, 0);
+		if (err)
+			goto out;
+		lnum = ubifs_next_log_lnum(c, lnum);
+	}
+
+	/*
+	 * Fixup the log head which contains the only a CS node at the
+	 * beginning.
+	 */
+	err = fixup_leb(c, c->lhead_lnum,
+			ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size));
+	if (err)
+		goto out;
+
+	/* Fixup LEBs in the LPT area */
+	for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
+		int free = c->ltab[lnum - c->lpt_first].free;
+
+		if (free > 0) {
+			err = fixup_leb(c, lnum, c->leb_size - free);
+			if (err)
+				goto out;
+		}
+	}
+
+	/* Unmap LEBs in the orphans area */
+	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+		err = fixup_leb(c, lnum, 0);
+		if (err)
+			goto out;
+	}
+
+	/* Fixup LEBs in the main area */
+	for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
+		lprops = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lprops)) {
+			err = PTR_ERR(lprops);
+			goto out;
+		}
+
+		if (lprops->free > 0) {
+			err = fixup_leb(c, lnum, c->leb_size - lprops->free);
+			if (err)
+				goto out;
+		}
+	}
+
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * ubifs_fixup_free_space - find & fix all LEBs with free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function fixes up LEBs containing free space on first mount, if the
+ * appropriate flag was set when the FS was created. Each LEB with one or more
+ * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure
+ * the free space is actually erased. E.g., this is necessary for some NAND
+ * chips, since the free space may have been programmed like real "0xff" data
+ * (generating a non-0xff ECC), causing future writes to the not-really-erased
+ * NAND pages to behave badly. After the space is fixed up, the superblock flag
+ * is cleared, so that this is skipped for all future mounts.
+ */
+int ubifs_fixup_free_space(struct ubifs_info *c)
+{
+	int err;
+	struct ubifs_sb_node *sup;
+
+	ubifs_assert(c->space_fixup);
+	ubifs_assert(!c->ro_mount);
+
+	ubifs_msg(c, "start fixing up free space");
+
+	err = fixup_free_space(c);
+	if (err)
+		return err;
+
+	sup = ubifs_read_sb_node(c);
+	if (IS_ERR(sup))
+		return PTR_ERR(sup);
+
+	/* Free-space fixup is no longer required */
+	c->space_fixup = 0;
+	sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
+
+	err = ubifs_write_sb_node(c, sup);
+	kfree(sup);
+	if (err)
+		return err;
+
+	ubifs_msg(c, "free space fixup complete");
+	return err;
+}
diff --git a/kernel/fs/ubifs/scan.c b/kernel/fs/ubifs/scan.c
new file mode 100644
index 000000000..aab87340d
--- /dev/null
+++ b/kernel/fs/ubifs/scan.c
@@ -0,0 +1,379 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the scan which is a general-purpose function for
+ * determining what nodes are in an eraseblock. The scan is used to replay the
+ * journal, to do garbage collection. for the TNC in-the-gaps method, and by
+ * debugging functions.
+ */
+
+#include "ubifs.h"
+
+/**
+ * scan_padding_bytes - scan for padding bytes.
+ * @buf: buffer to scan
+ * @len: length of buffer
+ *
+ * This function returns the number of padding bytes on success and
+ * %SCANNED_GARBAGE on failure.
+ */
+static int scan_padding_bytes(void *buf, int len)
+{
+	int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len);
+	uint8_t *p = buf;
+
+	dbg_scan("not a node");
+
+	while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE)
+		pad_len += 1;
+
+	if (!pad_len || (pad_len & 7))
+		return SCANNED_GARBAGE;
+
+	dbg_scan("%d padding bytes", pad_len);
+
+	return pad_len;
+}
+
+/**
+ * ubifs_scan_a_node - scan for a node or padding.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to scan
+ * @len: length of buffer
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ *
+ * This function returns a scanning code to indicate what was scanned.
+ */
+int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
+		      int offs, int quiet)
+{
+	struct ubifs_ch *ch = buf;
+	uint32_t magic;
+
+	magic = le32_to_cpu(ch->magic);
+
+	if (magic == 0xFFFFFFFF) {
+		dbg_scan("hit empty space at LEB %d:%d", lnum, offs);
+		return SCANNED_EMPTY_SPACE;
+	}
+
+	if (magic != UBIFS_NODE_MAGIC)
+		return scan_padding_bytes(buf, len);
+
+	if (len < UBIFS_CH_SZ)
+		return SCANNED_GARBAGE;
+
+	dbg_scan("scanning %s at LEB %d:%d",
+		 dbg_ntype(ch->node_type), lnum, offs);
+
+	if (ubifs_check_node(c, buf, lnum, offs, quiet, 1))
+		return SCANNED_A_CORRUPT_NODE;
+
+	if (ch->node_type == UBIFS_PAD_NODE) {
+		struct ubifs_pad_node *pad = buf;
+		int pad_len = le32_to_cpu(pad->pad_len);
+		int node_len = le32_to_cpu(ch->len);
+
+		/* Validate the padding node */
+		if (pad_len < 0 ||
+		    offs + node_len + pad_len > c->leb_size) {
+			if (!quiet) {
+				ubifs_err(c, "bad pad node at LEB %d:%d",
+					  lnum, offs);
+				ubifs_dump_node(c, pad);
+			}
+			return SCANNED_A_BAD_PAD_NODE;
+		}
+
+		/* Make the node pads to 8-byte boundary */
+		if ((node_len + pad_len) & 7) {
+			if (!quiet)
+				ubifs_err(c, "bad padding length %d - %d",
+					  offs, offs + node_len + pad_len);
+			return SCANNED_A_BAD_PAD_NODE;
+		}
+
+		dbg_scan("%d bytes padded at LEB %d:%d, offset now %d", pad_len,
+			 lnum, offs, ALIGN(offs + node_len + pad_len, 8));
+
+		return node_len + pad_len;
+	}
+
+	return SCANNED_A_NODE;
+}
+
+/**
+ * ubifs_start_scan - create LEB scanning information at start of scan.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number
+ * @offs: offset to start at (usually zero)
+ * @sbuf: scan buffer (must be c->leb_size)
+ *
+ * This function returns the scanned information on success and a negative error
+ * code on failure.
+ */
+struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
+					int offs, void *sbuf)
+{
+	struct ubifs_scan_leb *sleb;
+	int err;
+
+	dbg_scan("scan LEB %d:%d", lnum, offs);
+
+	sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS);
+	if (!sleb)
+		return ERR_PTR(-ENOMEM);
+
+	sleb->lnum = lnum;
+	INIT_LIST_HEAD(&sleb->nodes);
+	sleb->buf = sbuf;
+
+	err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0);
+	if (err && err != -EBADMSG) {
+		ubifs_err(c, "cannot read %d bytes from LEB %d:%d, error %d",
+			  c->leb_size - offs, lnum, offs, err);
+		kfree(sleb);
+		return ERR_PTR(err);
+	}
+
+	/*
+	 * Note, we ignore integrity errors (EBASMSG) because all the nodes are
+	 * protected by CRC checksums.
+	 */
+	return sleb;
+}
+
+/**
+ * ubifs_end_scan - update LEB scanning information at end of scan.
+ * @c: UBIFS file-system description object
+ * @sleb: scanning information
+ * @lnum: logical eraseblock number
+ * @offs: offset to start at (usually zero)
+ */
+void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		    int lnum, int offs)
+{
+	lnum = lnum;
+	dbg_scan("stop scanning LEB %d at offset %d", lnum, offs);
+	ubifs_assert(offs % c->min_io_size == 0);
+
+	sleb->endpt = ALIGN(offs, c->min_io_size);
+}
+
+/**
+ * ubifs_add_snod - add a scanned node to LEB scanning information.
+ * @c: UBIFS file-system description object
+ * @sleb: scanning information
+ * @buf: buffer containing node
+ * @offs: offset of node on flash
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		   void *buf, int offs)
+{
+	struct ubifs_ch *ch = buf;
+	struct ubifs_ino_node *ino = buf;
+	struct ubifs_scan_node *snod;
+
+	snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
+	if (!snod)
+		return -ENOMEM;
+
+	snod->sqnum = le64_to_cpu(ch->sqnum);
+	snod->type = ch->node_type;
+	snod->offs = offs;
+	snod->len = le32_to_cpu(ch->len);
+	snod->node = buf;
+
+	switch (ch->node_type) {
+	case UBIFS_INO_NODE:
+	case UBIFS_DENT_NODE:
+	case UBIFS_XENT_NODE:
+	case UBIFS_DATA_NODE:
+		/*
+		 * The key is in the same place in all keyed
+		 * nodes.
+		 */
+		key_read(c, &ino->key, &snod->key);
+		break;
+	default:
+		invalid_key_init(c, &snod->key);
+		break;
+	}
+	list_add_tail(&snod->list, &sleb->nodes);
+	sleb->nodes_cnt += 1;
+	return 0;
+}
+
+/**
+ * ubifs_scanned_corruption - print information after UBIFS scanned corruption.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of corruption
+ * @offs: offset of corruption
+ * @buf: buffer containing corruption
+ */
+void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
+			      void *buf)
+{
+	int len;
+
+	ubifs_err(c, "corruption at LEB %d:%d", lnum, offs);
+	len = c->leb_size - offs;
+	if (len > 8192)
+		len = 8192;
+	ubifs_err(c, "first %d bytes from LEB %d:%d", len, lnum, offs);
+	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
+}
+
+/**
+ * ubifs_scan - scan a logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number
+ * @offs: offset to start at (usually zero)
+ * @sbuf: scan buffer (must be of @c->leb_size bytes in size)
+ * @quiet: print no messages
+ *
+ * This function scans LEB number @lnum and returns complete information about
+ * its contents. Returns the scanned information in case of success and,
+ * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
+ * of failure.
+ *
+ * If @quiet is non-zero, this function does not print large and scary
+ * error messages and flash dumps in case of errors.
+ */
+struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
+				  int offs, void *sbuf, int quiet)
+{
+	void *buf = sbuf + offs;
+	int err, len = c->leb_size - offs;
+	struct ubifs_scan_leb *sleb;
+
+	sleb = ubifs_start_scan(c, lnum, offs, sbuf);
+	if (IS_ERR(sleb))
+		return sleb;
+
+	while (len >= 8) {
+		struct ubifs_ch *ch = buf;
+		int node_len, ret;
+
+		dbg_scan("look at LEB %d:%d (%d bytes left)",
+			 lnum, offs, len);
+
+		cond_resched();
+
+		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
+		if (ret > 0) {
+			/* Padding bytes or a valid padding node */
+			offs += ret;
+			buf += ret;
+			len -= ret;
+			continue;
+		}
+
+		if (ret == SCANNED_EMPTY_SPACE)
+			/* Empty space is checked later */
+			break;
+
+		switch (ret) {
+		case SCANNED_GARBAGE:
+			ubifs_err(c, "garbage");
+			goto corrupted;
+		case SCANNED_A_NODE:
+			break;
+		case SCANNED_A_CORRUPT_NODE:
+		case SCANNED_A_BAD_PAD_NODE:
+			ubifs_err(c, "bad node");
+			goto corrupted;
+		default:
+			ubifs_err(c, "unknown");
+			err = -EINVAL;
+			goto error;
+		}
+
+		err = ubifs_add_snod(c, sleb, buf, offs);
+		if (err)
+			goto error;
+
+		node_len = ALIGN(le32_to_cpu(ch->len), 8);
+		offs += node_len;
+		buf += node_len;
+		len -= node_len;
+	}
+
+	if (offs % c->min_io_size) {
+		if (!quiet)
+			ubifs_err(c, "empty space starts at non-aligned offset %d",
+				  offs);
+		goto corrupted;
+	}
+
+	ubifs_end_scan(c, sleb, lnum, offs);
+
+	for (; len > 4; offs += 4, buf = buf + 4, len -= 4)
+		if (*(uint32_t *)buf != 0xffffffff)
+			break;
+	for (; len; offs++, buf++, len--)
+		if (*(uint8_t *)buf != 0xff) {
+			if (!quiet)
+				ubifs_err(c, "corrupt empty space at LEB %d:%d",
+					  lnum, offs);
+			goto corrupted;
+		}
+
+	return sleb;
+
+corrupted:
+	if (!quiet) {
+		ubifs_scanned_corruption(c, lnum, offs, buf);
+		ubifs_err(c, "LEB %d scanning failed", lnum);
+	}
+	err = -EUCLEAN;
+	ubifs_scan_destroy(sleb);
+	return ERR_PTR(err);
+
+error:
+	ubifs_err(c, "LEB %d scanning failed, error %d", lnum, err);
+	ubifs_scan_destroy(sleb);
+	return ERR_PTR(err);
+}
+
+/**
+ * ubifs_scan_destroy - destroy LEB scanning information.
+ * @sleb: scanning information to free
+ */
+void ubifs_scan_destroy(struct ubifs_scan_leb *sleb)
+{
+	struct ubifs_scan_node *node;
+	struct list_head *head;
+
+	head = &sleb->nodes;
+	while (!list_empty(head)) {
+		node = list_entry(head->next, struct ubifs_scan_node, list);
+		list_del(&node->list);
+		kfree(node);
+	}
+	kfree(sleb);
+}
diff --git a/kernel/fs/ubifs/shrinker.c b/kernel/fs/ubifs/shrinker.c
new file mode 100644
index 000000000..9a9fb94a4
--- /dev/null
+++ b/kernel/fs/ubifs/shrinker.c
@@ -0,0 +1,331 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS shrinker which evicts clean znodes from the TNC
+ * tree when Linux VM needs more RAM.
+ *
+ * We do not implement any LRU lists to find oldest znodes to free because it
+ * would add additional overhead to the file system fast paths. So the shrinker
+ * just walks the TNC tree when searching for znodes to free.
+ *
+ * If the root of a TNC sub-tree is clean and old enough, then the children are
+ * also clean and old enough. So the shrinker walks the TNC in level order and
+ * dumps entire sub-trees.
+ *
+ * The age of znodes is just the time-stamp when they were last looked at.
+ * The current shrinker first tries to evict old znodes, then young ones.
+ *
+ * Since the shrinker is global, it has to protect against races with FS
+ * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'.
+ */
+
+#include "ubifs.h"
+
+/* List of all UBIFS file-system instances */
+LIST_HEAD(ubifs_infos);
+
+/*
+ * We number each shrinker run and record the number on the ubifs_info structure
+ * so that we can easily work out which ubifs_info structures have already been
+ * done by the current run.
+ */
+static unsigned int shrinker_run_no;
+
+/* Protects 'ubifs_infos' list */
+DEFINE_SPINLOCK(ubifs_infos_lock);
+
+/* Global clean znode counter (for all mounted UBIFS instances) */
+atomic_long_t ubifs_clean_zn_cnt;
+
+/**
+ * shrink_tnc - shrink TNC tree.
+ * @c: UBIFS file-system description object
+ * @nr: number of znodes to free
+ * @age: the age of znodes to free
+ * @contention: if any contention, this is set to %1
+ *
+ * This function traverses TNC tree and frees clean znodes. It does not free
+ * clean znodes which younger then @age. Returns number of freed znodes.
+ */
+static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
+{
+	int total_freed = 0;
+	struct ubifs_znode *znode, *zprev;
+	int time = get_seconds();
+
+	ubifs_assert(mutex_is_locked(&c->umount_mutex));
+	ubifs_assert(mutex_is_locked(&c->tnc_mutex));
+
+	if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0)
+		return 0;
+
+	/*
+	 * Traverse the TNC tree in levelorder manner, so that it is possible
+	 * to destroy large sub-trees. Indeed, if a znode is old, then all its
+	 * children are older or of the same age.
+	 *
+	 * Note, we are holding 'c->tnc_mutex', so we do not have to lock the
+	 * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is
+	 * changed only when the 'c->tnc_mutex' is held.
+	 */
+	zprev = NULL;
+	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
+	while (znode && total_freed < nr &&
+	       atomic_long_read(&c->clean_zn_cnt) > 0) {
+		int freed;
+
+		/*
+		 * If the znode is clean, but it is in the 'c->cnext' list, this
+		 * means that this znode has just been written to flash as a
+		 * part of commit and was marked clean. They will be removed
+		 * from the list at end commit. We cannot change the list,
+		 * because it is not protected by any mutex (design decision to
+		 * make commit really independent and parallel to main I/O). So
+		 * we just skip these znodes.
+		 *
+		 * Note, the 'clean_zn_cnt' counters are not updated until
+		 * after the commit, so the UBIFS shrinker does not report
+		 * the znodes which are in the 'c->cnext' list as freeable.
+		 *
+		 * Also note, if the root of a sub-tree is not in 'c->cnext',
+		 * then the whole sub-tree is not in 'c->cnext' as well, so it
+		 * is safe to dump whole sub-tree.
+		 */
+
+		if (znode->cnext) {
+			/*
+			 * Very soon these znodes will be removed from the list
+			 * and become freeable.
+			 */
+			*contention = 1;
+		} else if (!ubifs_zn_dirty(znode) &&
+			   abs(time - znode->time) >= age) {
+			if (znode->parent)
+				znode->parent->zbranch[znode->iip].znode = NULL;
+			else
+				c->zroot.znode = NULL;
+
+			freed = ubifs_destroy_tnc_subtree(znode);
+			atomic_long_sub(freed, &ubifs_clean_zn_cnt);
+			atomic_long_sub(freed, &c->clean_zn_cnt);
+			total_freed += freed;
+			znode = zprev;
+		}
+
+		if (unlikely(!c->zroot.znode))
+			break;
+
+		zprev = znode;
+		znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
+		cond_resched();
+	}
+
+	return total_freed;
+}
+
+/**
+ * shrink_tnc_trees - shrink UBIFS TNC trees.
+ * @nr: number of znodes to free
+ * @age: the age of znodes to free
+ * @contention: if any contention, this is set to %1
+ *
+ * This function walks the list of mounted UBIFS file-systems and frees clean
+ * znodes which are older than @age, until at least @nr znodes are freed.
+ * Returns the number of freed znodes.
+ */
+static int shrink_tnc_trees(int nr, int age, int *contention)
+{
+	struct ubifs_info *c;
+	struct list_head *p;
+	unsigned int run_no;
+	int freed = 0;
+
+	spin_lock(&ubifs_infos_lock);
+	do {
+		run_no = ++shrinker_run_no;
+	} while (run_no == 0);
+	/* Iterate over all mounted UBIFS file-systems and try to shrink them */
+	p = ubifs_infos.next;
+	while (p != &ubifs_infos) {
+		c = list_entry(p, struct ubifs_info, infos_list);
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (c->shrinker_run_no == run_no)
+			break;
+		if (!mutex_trylock(&c->umount_mutex)) {
+			/* Some un-mount is in progress, try next FS */
+			*contention = 1;
+			p = p->next;
+			continue;
+		}
+		/*
+		 * We're holding 'c->umount_mutex', so the file-system won't go
+		 * away.
+		 */
+		if (!mutex_trylock(&c->tnc_mutex)) {
+			mutex_unlock(&c->umount_mutex);
+			*contention = 1;
+			p = p->next;
+			continue;
+		}
+		spin_unlock(&ubifs_infos_lock);
+		/*
+		 * OK, now we have TNC locked, the file-system cannot go away -
+		 * it is safe to reap the cache.
+		 */
+		c->shrinker_run_no = run_no;
+		freed += shrink_tnc(c, nr, age, contention);
+		mutex_unlock(&c->tnc_mutex);
+		spin_lock(&ubifs_infos_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&c->infos_list, &ubifs_infos);
+		mutex_unlock(&c->umount_mutex);
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&ubifs_infos_lock);
+	return freed;
+}
+
+/**
+ * kick_a_thread - kick a background thread to start commit.
+ *
+ * This function kicks a background thread to start background commit. Returns
+ * %-1 if a thread was kicked or there is another reason to assume the memory
+ * will soon be freed or become freeable. If there are no dirty znodes, returns
+ * %0.
+ */
+static int kick_a_thread(void)
+{
+	int i;
+	struct ubifs_info *c;
+
+	/*
+	 * Iterate over all mounted UBIFS file-systems and find out if there is
+	 * already an ongoing commit operation there. If no, then iterate for
+	 * the second time and initiate background commit.
+	 */
+	spin_lock(&ubifs_infos_lock);
+	for (i = 0; i < 2; i++) {
+		list_for_each_entry(c, &ubifs_infos, infos_list) {
+			long dirty_zn_cnt;
+
+			if (!mutex_trylock(&c->umount_mutex)) {
+				/*
+				 * Some un-mount is in progress, it will
+				 * certainly free memory, so just return.
+				 */
+				spin_unlock(&ubifs_infos_lock);
+				return -1;
+			}
+
+			dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
+
+			if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
+			    c->ro_mount || c->ro_error) {
+				mutex_unlock(&c->umount_mutex);
+				continue;
+			}
+
+			if (c->cmt_state != COMMIT_RESTING) {
+				spin_unlock(&ubifs_infos_lock);
+				mutex_unlock(&c->umount_mutex);
+				return -1;
+			}
+
+			if (i == 1) {
+				list_move_tail(&c->infos_list, &ubifs_infos);
+				spin_unlock(&ubifs_infos_lock);
+
+				ubifs_request_bg_commit(c);
+				mutex_unlock(&c->umount_mutex);
+				return -1;
+			}
+			mutex_unlock(&c->umount_mutex);
+		}
+	}
+	spin_unlock(&ubifs_infos_lock);
+
+	return 0;
+}
+
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
+{
+	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
+
+	/*
+	 * Due to the way UBIFS updates the clean znode counter it may
+	 * temporarily be negative.
+	 */
+	return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+}
+
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	unsigned long nr = sc->nr_to_scan;
+	int contention = 0;
+	unsigned long freed;
+	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
+
+	if (!clean_zn_cnt) {
+		/*
+		 * No clean znodes, nothing to reap. All we can do in this case
+		 * is to kick background threads to start commit, which will
+		 * probably make clean znodes which, in turn, will be freeable.
+		 * And we return -1 which means will make VM call us again
+		 * later.
+		 */
+		dbg_tnc("no clean znodes, kick a thread");
+		return kick_a_thread();
+	}
+
+	freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention);
+	if (freed >= nr)
+		goto out;
+
+	dbg_tnc("not enough old znodes, try to free young ones");
+	freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention);
+	if (freed >= nr)
+		goto out;
+
+	dbg_tnc("not enough young znodes, free all");
+	freed += shrink_tnc_trees(nr - freed, 0, &contention);
+
+	if (!freed && contention) {
+		dbg_tnc("freed nothing, but contention");
+		return SHRINK_STOP;
+	}
+
+out:
+	dbg_tnc("%lu znodes were freed, requested %lu", freed, nr);
+	return freed;
+}
diff --git a/kernel/fs/ubifs/super.c b/kernel/fs/ubifs/super.c
new file mode 100644
index 000000000..75e6f04bb
--- /dev/null
+++ b/kernel/fs/ubifs/super.c
@@ -0,0 +1,2300 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS initialization and VFS superblock operations. Some
+ * initialization stuff which is rather large and complex is placed at
+ * corresponding subsystems, but most of it is here.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/kthread.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/math64.h>
+#include <linux/writeback.h>
+#include "ubifs.h"
+
+/*
+ * Maximum amount of memory we may 'kmalloc()' without worrying that we are
+ * allocating too much.
+ */
+#define UBIFS_KMALLOC_OK (128*1024)
+
+/* Slab cache for UBIFS inodes */
+struct kmem_cache *ubifs_inode_slab;
+
+/* UBIFS TNC shrinker description */
+static struct shrinker ubifs_shrinker_info = {
+	.scan_objects = ubifs_shrink_scan,
+	.count_objects = ubifs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
+/**
+ * validate_inode - validate inode.
+ * @c: UBIFS file-system description object
+ * @inode: the inode to validate
+ *
+ * This is a helper function for 'ubifs_iget()' which validates various fields
+ * of a newly built inode to make sure they contain sane values and prevent
+ * possible vulnerabilities. Returns zero if the inode is all right and
+ * a non-zero error code if not.
+ */
+static int validate_inode(struct ubifs_info *c, const struct inode *inode)
+{
+	int err;
+	const struct ubifs_inode *ui = ubifs_inode(inode);
+
+	if (inode->i_size > c->max_inode_sz) {
+		ubifs_err(c, "inode is too large (%lld)",
+			  (long long)inode->i_size);
+		return 1;
+	}
+
+	if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
+		ubifs_err(c, "unknown compression type %d", ui->compr_type);
+		return 2;
+	}
+
+	if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX)
+		return 3;
+
+	if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA)
+		return 4;
+
+	if (ui->xattr && !S_ISREG(inode->i_mode))
+		return 5;
+
+	if (!ubifs_compr_present(ui->compr_type)) {
+		ubifs_warn(c, "inode %lu uses '%s' compression, but it was not compiled in",
+			   inode->i_ino, ubifs_compr_name(ui->compr_type));
+	}
+
+	err = dbg_check_dir(c, inode);
+	return err;
+}
+
+struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
+{
+	int err;
+	union ubifs_key key;
+	struct ubifs_ino_node *ino;
+	struct ubifs_info *c = sb->s_fs_info;
+	struct inode *inode;
+	struct ubifs_inode *ui;
+
+	dbg_gen("inode %lu", inum);
+
+	inode = iget_locked(sb, inum);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	ui = ubifs_inode(inode);
+
+	ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
+	if (!ino) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	ino_key_init(c, &key, inode->i_ino);
+
+	err = ubifs_tnc_lookup(c, &key, ino);
+	if (err)
+		goto out_ino;
+
+	inode->i_flags |= (S_NOCMTIME | S_NOATIME);
+	set_nlink(inode, le32_to_cpu(ino->nlink));
+	i_uid_write(inode, le32_to_cpu(ino->uid));
+	i_gid_write(inode, le32_to_cpu(ino->gid));
+	inode->i_atime.tv_sec  = (int64_t)le64_to_cpu(ino->atime_sec);
+	inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
+	inode->i_mtime.tv_sec  = (int64_t)le64_to_cpu(ino->mtime_sec);
+	inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
+	inode->i_ctime.tv_sec  = (int64_t)le64_to_cpu(ino->ctime_sec);
+	inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec);
+	inode->i_mode = le32_to_cpu(ino->mode);
+	inode->i_size = le64_to_cpu(ino->size);
+
+	ui->data_len    = le32_to_cpu(ino->data_len);
+	ui->flags       = le32_to_cpu(ino->flags);
+	ui->compr_type  = le16_to_cpu(ino->compr_type);
+	ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum);
+	ui->xattr_cnt   = le32_to_cpu(ino->xattr_cnt);
+	ui->xattr_size  = le32_to_cpu(ino->xattr_size);
+	ui->xattr_names = le32_to_cpu(ino->xattr_names);
+	ui->synced_i_size = ui->ui_size = inode->i_size;
+
+	ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0;
+
+	err = validate_inode(c, inode);
+	if (err)
+		goto out_invalid;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_mapping->a_ops = &ubifs_file_address_operations;
+		inode->i_op = &ubifs_file_inode_operations;
+		inode->i_fop = &ubifs_file_operations;
+		if (ui->xattr) {
+			ui->data = kmalloc(ui->data_len + 1, GFP_NOFS);
+			if (!ui->data) {
+				err = -ENOMEM;
+				goto out_ino;
+			}
+			memcpy(ui->data, ino->data, ui->data_len);
+			((char *)ui->data)[ui->data_len] = '\0';
+		} else if (ui->data_len != 0) {
+			err = 10;
+			goto out_invalid;
+		}
+		break;
+	case S_IFDIR:
+		inode->i_op  = &ubifs_dir_inode_operations;
+		inode->i_fop = &ubifs_dir_operations;
+		if (ui->data_len != 0) {
+			err = 11;
+			goto out_invalid;
+		}
+		break;
+	case S_IFLNK:
+		inode->i_op = &ubifs_symlink_inode_operations;
+		if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) {
+			err = 12;
+			goto out_invalid;
+		}
+		ui->data = kmalloc(ui->data_len + 1, GFP_NOFS);
+		if (!ui->data) {
+			err = -ENOMEM;
+			goto out_ino;
+		}
+		memcpy(ui->data, ino->data, ui->data_len);
+		((char *)ui->data)[ui->data_len] = '\0';
+		break;
+	case S_IFBLK:
+	case S_IFCHR:
+	{
+		dev_t rdev;
+		union ubifs_dev_desc *dev;
+
+		ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+		if (!ui->data) {
+			err = -ENOMEM;
+			goto out_ino;
+		}
+
+		dev = (union ubifs_dev_desc *)ino->data;
+		if (ui->data_len == sizeof(dev->new))
+			rdev = new_decode_dev(le32_to_cpu(dev->new));
+		else if (ui->data_len == sizeof(dev->huge))
+			rdev = huge_decode_dev(le64_to_cpu(dev->huge));
+		else {
+			err = 13;
+			goto out_invalid;
+		}
+		memcpy(ui->data, ino->data, ui->data_len);
+		inode->i_op = &ubifs_file_inode_operations;
+		init_special_inode(inode, inode->i_mode, rdev);
+		break;
+	}
+	case S_IFSOCK:
+	case S_IFIFO:
+		inode->i_op = &ubifs_file_inode_operations;
+		init_special_inode(inode, inode->i_mode, 0);
+		if (ui->data_len != 0) {
+			err = 14;
+			goto out_invalid;
+		}
+		break;
+	default:
+		err = 15;
+		goto out_invalid;
+	}
+
+	kfree(ino);
+	ubifs_set_inode_flags(inode);
+	unlock_new_inode(inode);
+	return inode;
+
+out_invalid:
+	ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err);
+	ubifs_dump_node(c, ino);
+	ubifs_dump_inode(c, inode);
+	err = -EINVAL;
+out_ino:
+	kfree(ino);
+out:
+	ubifs_err(c, "failed to read inode %lu, error %d", inode->i_ino, err);
+	iget_failed(inode);
+	return ERR_PTR(err);
+}
+
+static struct inode *ubifs_alloc_inode(struct super_block *sb)
+{
+	struct ubifs_inode *ui;
+
+	ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS);
+	if (!ui)
+		return NULL;
+
+	memset((void *)ui + sizeof(struct inode), 0,
+	       sizeof(struct ubifs_inode) - sizeof(struct inode));
+	mutex_init(&ui->ui_mutex);
+	spin_lock_init(&ui->ui_lock);
+	return &ui->vfs_inode;
+};
+
+static void ubifs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	kmem_cache_free(ubifs_inode_slab, ui);
+}
+
+static void ubifs_destroy_inode(struct inode *inode)
+{
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	kfree(ui->data);
+	call_rcu(&inode->i_rcu, ubifs_i_callback);
+}
+
+/*
+ * Note, Linux write-back code calls this without 'i_mutex'.
+ */
+static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int err = 0;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	ubifs_assert(!ui->xattr);
+	if (is_bad_inode(inode))
+		return 0;
+
+	mutex_lock(&ui->ui_mutex);
+	/*
+	 * Due to races between write-back forced by budgeting
+	 * (see 'sync_some_inodes()') and background write-back, the inode may
+	 * have already been synchronized, do not do this again. This might
+	 * also happen if it was synchronized in an VFS operation, e.g.
+	 * 'ubifs_link()'.
+	 */
+	if (!ui->dirty) {
+		mutex_unlock(&ui->ui_mutex);
+		return 0;
+	}
+
+	/*
+	 * As an optimization, do not write orphan inodes to the media just
+	 * because this is not needed.
+	 */
+	dbg_gen("inode %lu, mode %#x, nlink %u",
+		inode->i_ino, (int)inode->i_mode, inode->i_nlink);
+	if (inode->i_nlink) {
+		err = ubifs_jnl_write_inode(c, inode);
+		if (err)
+			ubifs_err(c, "can't write inode %lu, error %d",
+				  inode->i_ino, err);
+		else
+			err = dbg_check_inode_size(c, inode, ui->ui_size);
+	}
+
+	ui->dirty = 0;
+	mutex_unlock(&ui->ui_mutex);
+	ubifs_release_dirty_inode_budget(c, ui);
+	return err;
+}
+
+static void ubifs_evict_inode(struct inode *inode)
+{
+	int err;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	if (ui->xattr)
+		/*
+		 * Extended attribute inode deletions are fully handled in
+		 * 'ubifs_removexattr()'. These inodes are special and have
+		 * limited usage, so there is nothing to do here.
+		 */
+		goto out;
+
+	dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
+	ubifs_assert(!atomic_read(&inode->i_count));
+
+	truncate_inode_pages_final(&inode->i_data);
+
+	if (inode->i_nlink)
+		goto done;
+
+	if (is_bad_inode(inode))
+		goto out;
+
+	ui->ui_size = inode->i_size = 0;
+	err = ubifs_jnl_delete_inode(c, inode);
+	if (err)
+		/*
+		 * Worst case we have a lost orphan inode wasting space, so a
+		 * simple error message is OK here.
+		 */
+		ubifs_err(c, "can't delete inode %lu, error %d",
+			  inode->i_ino, err);
+
+out:
+	if (ui->dirty)
+		ubifs_release_dirty_inode_budget(c, ui);
+	else {
+		/* We've deleted something - clean the "no space" flags */
+		c->bi.nospace = c->bi.nospace_rp = 0;
+		smp_wmb();
+	}
+done:
+	clear_inode(inode);
+}
+
+static void ubifs_dirty_inode(struct inode *inode, int flags)
+{
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+	if (!ui->dirty) {
+		ui->dirty = 1;
+		dbg_gen("inode %lu",  inode->i_ino);
+	}
+}
+
+static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct ubifs_info *c = dentry->d_sb->s_fs_info;
+	unsigned long long free;
+	__le32 *uuid = (__le32 *)c->uuid;
+
+	free = ubifs_get_free_space(c);
+	dbg_gen("free space %lld bytes (%lld blocks)",
+		free, free >> UBIFS_BLOCK_SHIFT);
+
+	buf->f_type = UBIFS_SUPER_MAGIC;
+	buf->f_bsize = UBIFS_BLOCK_SIZE;
+	buf->f_blocks = c->block_cnt;
+	buf->f_bfree = free >> UBIFS_BLOCK_SHIFT;
+	if (free > c->report_rp_size)
+		buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT;
+	else
+		buf->f_bavail = 0;
+	buf->f_files = 0;
+	buf->f_ffree = 0;
+	buf->f_namelen = UBIFS_MAX_NLEN;
+	buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
+	buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
+	ubifs_assert(buf->f_bfree <= c->block_cnt);
+	return 0;
+}
+
+static int ubifs_show_options(struct seq_file *s, struct dentry *root)
+{
+	struct ubifs_info *c = root->d_sb->s_fs_info;
+
+	if (c->mount_opts.unmount_mode == 2)
+		seq_puts(s, ",fast_unmount");
+	else if (c->mount_opts.unmount_mode == 1)
+		seq_puts(s, ",norm_unmount");
+
+	if (c->mount_opts.bulk_read == 2)
+		seq_puts(s, ",bulk_read");
+	else if (c->mount_opts.bulk_read == 1)
+		seq_puts(s, ",no_bulk_read");
+
+	if (c->mount_opts.chk_data_crc == 2)
+		seq_puts(s, ",chk_data_crc");
+	else if (c->mount_opts.chk_data_crc == 1)
+		seq_puts(s, ",no_chk_data_crc");
+
+	if (c->mount_opts.override_compr) {
+		seq_printf(s, ",compr=%s",
+			   ubifs_compr_name(c->mount_opts.compr_type));
+	}
+
+	return 0;
+}
+
+static int ubifs_sync_fs(struct super_block *sb, int wait)
+{
+	int i, err;
+	struct ubifs_info *c = sb->s_fs_info;
+
+	/*
+	 * Zero @wait is just an advisory thing to help the file system shove
+	 * lots of data into the queues, and there will be the second
+	 * '->sync_fs()' call, with non-zero @wait.
+	 */
+	if (!wait)
+		return 0;
+
+	/*
+	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
+	 * do this if it waits for an already running commit.
+	 */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+
+	/*
+	 * Strictly speaking, it is not necessary to commit the journal here,
+	 * synchronizing write-buffers would be enough. But committing makes
+	 * UBIFS free space predictions much more accurate, so we want to let
+	 * the user be able to get more accurate results of 'statfs()' after
+	 * they synchronize the file system.
+	 */
+	err = ubifs_run_commit(c);
+	if (err)
+		return err;
+
+	return ubi_sync(c->vi.ubi_num);
+}
+
+/**
+ * init_constants_early - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This function initialize UBIFS constants which do not need the superblock to
+ * be read. It also checks that the UBI volume satisfies basic UBIFS
+ * requirements. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int init_constants_early(struct ubifs_info *c)
+{
+	if (c->vi.corrupted) {
+		ubifs_warn(c, "UBI volume is corrupted - read-only mode");
+		c->ro_media = 1;
+	}
+
+	if (c->di.ro_mode) {
+		ubifs_msg(c, "read-only UBI device");
+		c->ro_media = 1;
+	}
+
+	if (c->vi.vol_type == UBI_STATIC_VOLUME) {
+		ubifs_msg(c, "static UBI volume - read-only mode");
+		c->ro_media = 1;
+	}
+
+	c->leb_cnt = c->vi.size;
+	c->leb_size = c->vi.usable_leb_size;
+	c->leb_start = c->di.leb_start;
+	c->half_leb_size = c->leb_size / 2;
+	c->min_io_size = c->di.min_io_size;
+	c->min_io_shift = fls(c->min_io_size) - 1;
+	c->max_write_size = c->di.max_write_size;
+	c->max_write_shift = fls(c->max_write_size) - 1;
+
+	if (c->leb_size < UBIFS_MIN_LEB_SZ) {
+		ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes",
+			  c->leb_size, UBIFS_MIN_LEB_SZ);
+		return -EINVAL;
+	}
+
+	if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
+		ubifs_err(c, "too few LEBs (%d), min. is %d",
+			  c->leb_cnt, UBIFS_MIN_LEB_CNT);
+		return -EINVAL;
+	}
+
+	if (!is_power_of_2(c->min_io_size)) {
+		ubifs_err(c, "bad min. I/O size %d", c->min_io_size);
+		return -EINVAL;
+	}
+
+	/*
+	 * Maximum write size has to be greater or equivalent to min. I/O
+	 * size, and be multiple of min. I/O size.
+	 */
+	if (c->max_write_size < c->min_io_size ||
+	    c->max_write_size % c->min_io_size ||
+	    !is_power_of_2(c->max_write_size)) {
+		ubifs_err(c, "bad write buffer size %d for %d min. I/O unit",
+			  c->max_write_size, c->min_io_size);
+		return -EINVAL;
+	}
+
+	/*
+	 * UBIFS aligns all node to 8-byte boundary, so to make function in
+	 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
+	 * less than 8.
+	 */
+	if (c->min_io_size < 8) {
+		c->min_io_size = 8;
+		c->min_io_shift = 3;
+		if (c->max_write_size < c->min_io_size) {
+			c->max_write_size = c->min_io_size;
+			c->max_write_shift = c->min_io_shift;
+		}
+	}
+
+	c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
+	c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size);
+
+	/*
+	 * Initialize node length ranges which are mostly needed for node
+	 * length validation.
+	 */
+	c->ranges[UBIFS_PAD_NODE].len  = UBIFS_PAD_NODE_SZ;
+	c->ranges[UBIFS_SB_NODE].len   = UBIFS_SB_NODE_SZ;
+	c->ranges[UBIFS_MST_NODE].len  = UBIFS_MST_NODE_SZ;
+	c->ranges[UBIFS_REF_NODE].len  = UBIFS_REF_NODE_SZ;
+	c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
+	c->ranges[UBIFS_CS_NODE].len   = UBIFS_CS_NODE_SZ;
+
+	c->ranges[UBIFS_INO_NODE].min_len  = UBIFS_INO_NODE_SZ;
+	c->ranges[UBIFS_INO_NODE].max_len  = UBIFS_MAX_INO_NODE_SZ;
+	c->ranges[UBIFS_ORPH_NODE].min_len =
+				UBIFS_ORPH_NODE_SZ + sizeof(__le64);
+	c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size;
+	c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ;
+	c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ;
+	c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ;
+	c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ;
+	c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ;
+	c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ;
+	/*
+	 * Minimum indexing node size is amended later when superblock is
+	 * read and the key length is known.
+	 */
+	c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ;
+	/*
+	 * Maximum indexing node size is amended later when superblock is
+	 * read and the fanout is known.
+	 */
+	c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
+
+	/*
+	 * Initialize dead and dark LEB space watermarks. See gc.c for comments
+	 * about these values.
+	 */
+	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
+	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
+
+	/*
+	 * Calculate how many bytes would be wasted at the end of LEB if it was
+	 * fully filled with data nodes of maximum size. This is used in
+	 * calculations when reporting free space.
+	 */
+	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
+
+	/* Buffer size for bulk-reads */
+	c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
+	if (c->max_bu_buf_len > c->leb_size)
+		c->max_bu_buf_len = c->leb_size;
+	return 0;
+}
+
+/**
+ * bud_wbuf_callback - bud LEB write-buffer synchronization call-back.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB the write-buffer was synchronized to
+ * @free: how many free bytes left in this LEB
+ * @pad: how many bytes were padded
+ *
+ * This is a callback function which is called by the I/O unit when the
+ * write-buffer is synchronized. We need this to correctly maintain space
+ * accounting in bud logical eraseblocks. This function returns zero in case of
+ * success and a negative error code in case of failure.
+ *
+ * This function actually belongs to the journal, but we keep it here because
+ * we want to keep it static.
+ */
+static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
+{
+	return ubifs_update_one_lp(c, lnum, free, pad, 0, 0);
+}
+
+/*
+ * init_constants_sb - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which initializes various UBIFS constants after
+ * the superblock has been read. It also checks various UBIFS parameters and
+ * makes sure they are all right. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+static int init_constants_sb(struct ubifs_info *c)
+{
+	int tmp, err;
+	long long tmp64;
+
+	c->main_bytes = (long long)c->main_lebs * c->leb_size;
+	c->max_znode_sz = sizeof(struct ubifs_znode) +
+				c->fanout * sizeof(struct ubifs_zbranch);
+
+	tmp = ubifs_idx_node_sz(c, 1);
+	c->ranges[UBIFS_IDX_NODE].min_len = tmp;
+	c->min_idx_node_sz = ALIGN(tmp, 8);
+
+	tmp = ubifs_idx_node_sz(c, c->fanout);
+	c->ranges[UBIFS_IDX_NODE].max_len = tmp;
+	c->max_idx_node_sz = ALIGN(tmp, 8);
+
+	/* Make sure LEB size is large enough to fit full commit */
+	tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
+	tmp = ALIGN(tmp, c->min_io_size);
+	if (tmp > c->leb_size) {
+		ubifs_err(c, "too small LEB size %d, at least %d needed",
+			  c->leb_size, tmp);
+		return -EINVAL;
+	}
+
+	/*
+	 * Make sure that the log is large enough to fit reference nodes for
+	 * all buds plus one reserved LEB.
+	 */
+	tmp64 = c->max_bud_bytes + c->leb_size - 1;
+	c->max_bud_cnt = div_u64(tmp64, c->leb_size);
+	tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
+	tmp /= c->leb_size;
+	tmp += 1;
+	if (c->log_lebs < tmp) {
+		ubifs_err(c, "too small log %d LEBs, required min. %d LEBs",
+			  c->log_lebs, tmp);
+		return -EINVAL;
+	}
+
+	/*
+	 * When budgeting we assume worst-case scenarios when the pages are not
+	 * be compressed and direntries are of the maximum size.
+	 *
+	 * Note, data, which may be stored in inodes is budgeted separately, so
+	 * it is not included into 'c->bi.inode_budget'.
+	 */
+	c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
+	c->bi.inode_budget = UBIFS_INO_NODE_SZ;
+	c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;
+
+	/*
+	 * When the amount of flash space used by buds becomes
+	 * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit.
+	 * The writers are unblocked when the commit is finished. To avoid
+	 * writers to be blocked UBIFS initiates background commit in advance,
+	 * when number of bud bytes becomes above the limit defined below.
+	 */
+	c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4;
+
+	/*
+	 * Ensure minimum journal size. All the bytes in the journal heads are
+	 * considered to be used, when calculating the current journal usage.
+	 * Consequently, if the journal is too small, UBIFS will treat it as
+	 * always full.
+	 */
+	tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
+	if (c->bg_bud_bytes < tmp64)
+		c->bg_bud_bytes = tmp64;
+	if (c->max_bud_bytes < tmp64 + c->leb_size)
+		c->max_bud_bytes = tmp64 + c->leb_size;
+
+	err = ubifs_calc_lpt_geom(c);
+	if (err)
+		return err;
+
+	/* Initialize effective LEB size used in budgeting calculations */
+	c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
+	return 0;
+}
+
+/*
+ * init_constants_master - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which initializes various UBIFS constants after
+ * the master node has been read. It also checks various UBIFS parameters and
+ * makes sure they are all right.
+ */
+static void init_constants_master(struct ubifs_info *c)
+{
+	long long tmp64;
+
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	c->report_rp_size = ubifs_reported_space(c, c->rp_size);
+
+	/*
+	 * Calculate total amount of FS blocks. This number is not used
+	 * internally because it does not make much sense for UBIFS, but it is
+	 * necessary to report something for the 'statfs()' call.
+	 *
+	 * Subtract the LEB reserved for GC, the LEB which is reserved for
+	 * deletions, minimum LEBs for the index, and assume only one journal
+	 * head is available.
+	 */
+	tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
+	tmp64 *= (long long)c->leb_size - c->leb_overhead;
+	tmp64 = ubifs_reported_space(c, tmp64);
+	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
+}
+
+/**
+ * take_gc_lnum - reserve GC LEB.
+ * @c: UBIFS file-system description object
+ *
+ * This function ensures that the LEB reserved for garbage collection is marked
+ * as "taken" in lprops. We also have to set free space to LEB size and dirty
+ * space to zero, because lprops may contain out-of-date information if the
+ * file-system was un-mounted before it has been committed. This function
+ * returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int take_gc_lnum(struct ubifs_info *c)
+{
+	int err;
+
+	if (c->gc_lnum == -1) {
+		ubifs_err(c, "no LEB for GC");
+		return -EINVAL;
+	}
+
+	/* And we have to tell lprops that this LEB is taken */
+	err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
+				  LPROPS_TAKEN, 0, 0);
+	return err;
+}
+
+/**
+ * alloc_wbufs - allocate write-buffers.
+ * @c: UBIFS file-system description object
+ *
+ * This helper function allocates and initializes UBIFS write-buffers. Returns
+ * zero in case of success and %-ENOMEM in case of failure.
+ */
+static int alloc_wbufs(struct ubifs_info *c)
+{
+	int i, err;
+
+	c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead),
+			    GFP_KERNEL);
+	if (!c->jheads)
+		return -ENOMEM;
+
+	/* Initialize journal heads */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		INIT_LIST_HEAD(&c->jheads[i].buds_list);
+		err = ubifs_wbuf_init(c, &c->jheads[i].wbuf);
+		if (err)
+			return err;
+
+		c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
+		c->jheads[i].wbuf.jhead = i;
+		c->jheads[i].grouped = 1;
+	}
+
+	/*
+	 * Garbage Collector head does not need to be synchronized by timer.
+	 * Also GC head nodes are not grouped.
+	 */
+	c->jheads[GCHD].wbuf.no_timer = 1;
+	c->jheads[GCHD].grouped = 0;
+
+	return 0;
+}
+
+/**
+ * free_wbufs - free write-buffers.
+ * @c: UBIFS file-system description object
+ */
+static void free_wbufs(struct ubifs_info *c)
+{
+	int i;
+
+	if (c->jheads) {
+		for (i = 0; i < c->jhead_cnt; i++) {
+			kfree(c->jheads[i].wbuf.buf);
+			kfree(c->jheads[i].wbuf.inodes);
+		}
+		kfree(c->jheads);
+		c->jheads = NULL;
+	}
+}
+
+/**
+ * free_orphans - free orphans.
+ * @c: UBIFS file-system description object
+ */
+static void free_orphans(struct ubifs_info *c)
+{
+	struct ubifs_orphan *orph;
+
+	while (c->orph_dnext) {
+		orph = c->orph_dnext;
+		c->orph_dnext = orph->dnext;
+		list_del(&orph->list);
+		kfree(orph);
+	}
+
+	while (!list_empty(&c->orph_list)) {
+		orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
+		list_del(&orph->list);
+		kfree(orph);
+		ubifs_err(c, "orphan list not empty at unmount");
+	}
+
+	vfree(c->orph_buf);
+	c->orph_buf = NULL;
+}
+
+/**
+ * free_buds - free per-bud objects.
+ * @c: UBIFS file-system description object
+ */
+static void free_buds(struct ubifs_info *c)
+{
+	struct ubifs_bud *bud, *n;
+
+	rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb)
+		kfree(bud);
+}
+
+/**
+ * check_volume_empty - check if the UBI volume is empty.
+ * @c: UBIFS file-system description object
+ *
+ * This function checks if the UBIFS volume is empty by looking if its LEBs are
+ * mapped or not. The result of checking is stored in the @c->empty variable.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int check_volume_empty(struct ubifs_info *c)
+{
+	int lnum, err;
+
+	c->empty = 1;
+	for (lnum = 0; lnum < c->leb_cnt; lnum++) {
+		err = ubifs_is_mapped(c, lnum);
+		if (unlikely(err < 0))
+			return err;
+		if (err == 1) {
+			c->empty = 0;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	return 0;
+}
+
+/*
+ * UBIFS mount options.
+ *
+ * Opt_fast_unmount: do not run a journal commit before un-mounting
+ * Opt_norm_unmount: run a journal commit before un-mounting
+ * Opt_bulk_read: enable bulk-reads
+ * Opt_no_bulk_read: disable bulk-reads
+ * Opt_chk_data_crc: check CRCs when reading data nodes
+ * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
+ * Opt_override_compr: override default compressor
+ * Opt_err: just end of array marker
+ */
+enum {
+	Opt_fast_unmount,
+	Opt_norm_unmount,
+	Opt_bulk_read,
+	Opt_no_bulk_read,
+	Opt_chk_data_crc,
+	Opt_no_chk_data_crc,
+	Opt_override_compr,
+	Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_fast_unmount, "fast_unmount"},
+	{Opt_norm_unmount, "norm_unmount"},
+	{Opt_bulk_read, "bulk_read"},
+	{Opt_no_bulk_read, "no_bulk_read"},
+	{Opt_chk_data_crc, "chk_data_crc"},
+	{Opt_no_chk_data_crc, "no_chk_data_crc"},
+	{Opt_override_compr, "compr=%s"},
+	{Opt_err, NULL},
+};
+
+/**
+ * parse_standard_option - parse a standard mount option.
+ * @option: the option to parse
+ *
+ * Normally, standard mount options like "sync" are passed to file-systems as
+ * flags. However, when a "rootflags=" kernel boot parameter is used, they may
+ * be present in the options string. This function tries to deal with this
+ * situation and parse standard options. Returns 0 if the option was not
+ * recognized, and the corresponding integer flag if it was.
+ *
+ * UBIFS is only interested in the "sync" option, so do not check for anything
+ * else.
+ */
+static int parse_standard_option(const char *option)
+{
+
+	pr_notice("UBIFS: parse %s\n", option);
+	if (!strcmp(option, "sync"))
+		return MS_SYNCHRONOUS;
+	return 0;
+}
+
+/**
+ * ubifs_parse_options - parse mount parameters.
+ * @c: UBIFS file-system description object
+ * @options: parameters to parse
+ * @is_remount: non-zero if this is FS re-mount
+ *
+ * This function parses UBIFS mount options and returns zero in case success
+ * and a negative error code in case of failure.
+ */
+static int ubifs_parse_options(struct ubifs_info *c, char *options,
+			       int is_remount)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ","))) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		/*
+		 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
+		 * We accept them in order to be backward-compatible. But this
+		 * should be removed at some point.
+		 */
+		case Opt_fast_unmount:
+			c->mount_opts.unmount_mode = 2;
+			break;
+		case Opt_norm_unmount:
+			c->mount_opts.unmount_mode = 1;
+			break;
+		case Opt_bulk_read:
+			c->mount_opts.bulk_read = 2;
+			c->bulk_read = 1;
+			break;
+		case Opt_no_bulk_read:
+			c->mount_opts.bulk_read = 1;
+			c->bulk_read = 0;
+			break;
+		case Opt_chk_data_crc:
+			c->mount_opts.chk_data_crc = 2;
+			c->no_chk_data_crc = 0;
+			break;
+		case Opt_no_chk_data_crc:
+			c->mount_opts.chk_data_crc = 1;
+			c->no_chk_data_crc = 1;
+			break;
+		case Opt_override_compr:
+		{
+			char *name = match_strdup(&args[0]);
+
+			if (!name)
+				return -ENOMEM;
+			if (!strcmp(name, "none"))
+				c->mount_opts.compr_type = UBIFS_COMPR_NONE;
+			else if (!strcmp(name, "lzo"))
+				c->mount_opts.compr_type = UBIFS_COMPR_LZO;
+			else if (!strcmp(name, "zlib"))
+				c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
+			else {
+				ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready?
+				kfree(name);
+				return -EINVAL;
+			}
+			kfree(name);
+			c->mount_opts.override_compr = 1;
+			c->default_compr = c->mount_opts.compr_type;
+			break;
+		}
+		default:
+		{
+			unsigned long flag;
+			struct super_block *sb = c->vfs_sb;
+
+			flag = parse_standard_option(p);
+			if (!flag) {
+				ubifs_err(c, "unrecognized mount option \"%s\" or missing value",
+					  p);
+				return -EINVAL;
+			}
+			sb->s_flags |= flag;
+			break;
+		}
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * destroy_journal - destroy journal data structures.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys journal data structures including those that may have
+ * been created by recovery functions.
+ */
+static void destroy_journal(struct ubifs_info *c)
+{
+	while (!list_empty(&c->unclean_leb_list)) {
+		struct ubifs_unclean_leb *ucleb;
+
+		ucleb = list_entry(c->unclean_leb_list.next,
+				   struct ubifs_unclean_leb, list);
+		list_del(&ucleb->list);
+		kfree(ucleb);
+	}
+	while (!list_empty(&c->old_buds)) {
+		struct ubifs_bud *bud;
+
+		bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
+		list_del(&bud->list);
+		kfree(bud);
+	}
+	ubifs_destroy_idx_gc(c);
+	ubifs_destroy_size_tree(c);
+	ubifs_tnc_close(c);
+	free_buds(c);
+}
+
+/**
+ * bu_init - initialize bulk-read information.
+ * @c: UBIFS file-system description object
+ */
+static void bu_init(struct ubifs_info *c)
+{
+	ubifs_assert(c->bulk_read == 1);
+
+	if (c->bu.buf)
+		return; /* Already initialized */
+
+again:
+	c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN);
+	if (!c->bu.buf) {
+		if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) {
+			c->max_bu_buf_len = UBIFS_KMALLOC_OK;
+			goto again;
+		}
+
+		/* Just disable bulk-read */
+		ubifs_warn(c, "cannot allocate %d bytes of memory for bulk-read, disabling it",
+			   c->max_bu_buf_len);
+		c->mount_opts.bulk_read = 1;
+		c->bulk_read = 0;
+		return;
+	}
+}
+
+/**
+ * check_free_space - check if there is enough free space to mount.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure UBIFS has enough free space to be mounted in
+ * read/write mode. UBIFS must always have some free space to allow deletions.
+ */
+static int check_free_space(struct ubifs_info *c)
+{
+	ubifs_assert(c->dark_wm > 0);
+	if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
+		ubifs_err(c, "insufficient free space to mount in R/W mode");
+		ubifs_dump_budg(c, &c->bi);
+		ubifs_dump_lprops(c);
+		return -ENOSPC;
+	}
+	return 0;
+}
+
+/**
+ * mount_ubifs - mount UBIFS file-system.
+ * @c: UBIFS file-system description object
+ *
+ * This function mounts UBIFS file system. Returns zero in case of success and
+ * a negative error code in case of failure.
+ */
+static int mount_ubifs(struct ubifs_info *c)
+{
+	int err;
+	long long x, y;
+	size_t sz;
+
+	c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
+	/* Suppress error messages while probing if MS_SILENT is set */
+	c->probing = !!(c->vfs_sb->s_flags & MS_SILENT);
+
+	err = init_constants_early(c);
+	if (err)
+		return err;
+
+	err = ubifs_debugging_init(c);
+	if (err)
+		return err;
+
+	err = check_volume_empty(c);
+	if (err)
+		goto out_free;
+
+	if (c->empty && (c->ro_mount || c->ro_media)) {
+		/*
+		 * This UBI volume is empty, and read-only, or the file system
+		 * is mounted read-only - we cannot format it.
+		 */
+		ubifs_err(c, "can't format empty UBI volume: read-only %s",
+			  c->ro_media ? "UBI volume" : "mount");
+		err = -EROFS;
+		goto out_free;
+	}
+
+	if (c->ro_media && !c->ro_mount) {
+		ubifs_err(c, "cannot mount read-write - read-only media");
+		err = -EROFS;
+		goto out_free;
+	}
+
+	/*
+	 * The requirement for the buffer is that it should fit indexing B-tree
+	 * height amount of integers. We assume the height if the TNC tree will
+	 * never exceed 64.
+	 */
+	err = -ENOMEM;
+	c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL);
+	if (!c->bottom_up_buf)
+		goto out_free;
+
+	c->sbuf = vmalloc(c->leb_size);
+	if (!c->sbuf)
+		goto out_free;
+
+	if (!c->ro_mount) {
+		c->ileb_buf = vmalloc(c->leb_size);
+		if (!c->ileb_buf)
+			goto out_free;
+	}
+
+	if (c->bulk_read == 1)
+		bu_init(c);
+
+	if (!c->ro_mount) {
+		c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
+					       GFP_KERNEL);
+		if (!c->write_reserve_buf)
+			goto out_free;
+	}
+
+	c->mounting = 1;
+
+	err = ubifs_read_superblock(c);
+	if (err)
+		goto out_free;
+
+	c->probing = 0;
+
+	/*
+	 * Make sure the compressor which is set as default in the superblock
+	 * or overridden by mount options is actually compiled in.
+	 */
+	if (!ubifs_compr_present(c->default_compr)) {
+		ubifs_err(c, "'compressor \"%s\" is not compiled in",
+			  ubifs_compr_name(c->default_compr));
+		err = -ENOTSUPP;
+		goto out_free;
+	}
+
+	err = init_constants_sb(c);
+	if (err)
+		goto out_free;
+
+	sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
+	sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
+	c->cbuf = kmalloc(sz, GFP_NOFS);
+	if (!c->cbuf) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	err = alloc_wbufs(c);
+	if (err)
+		goto out_cbuf;
+
+	sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
+	if (!c->ro_mount) {
+		/* Create background thread */
+		c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
+		if (IS_ERR(c->bgt)) {
+			err = PTR_ERR(c->bgt);
+			c->bgt = NULL;
+			ubifs_err(c, "cannot spawn \"%s\", error %d",
+				  c->bgt_name, err);
+			goto out_wbufs;
+		}
+		wake_up_process(c->bgt);
+	}
+
+	err = ubifs_read_master(c);
+	if (err)
+		goto out_master;
+
+	init_constants_master(c);
+
+	if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
+		ubifs_msg(c, "recovery needed");
+		c->need_recovery = 1;
+	}
+
+	if (c->need_recovery && !c->ro_mount) {
+		err = ubifs_recover_inl_heads(c, c->sbuf);
+		if (err)
+			goto out_master;
+	}
+
+	err = ubifs_lpt_init(c, 1, !c->ro_mount);
+	if (err)
+		goto out_master;
+
+	if (!c->ro_mount && c->space_fixup) {
+		err = ubifs_fixup_free_space(c);
+		if (err)
+			goto out_lpt;
+	}
+
+	if (!c->ro_mount && !c->need_recovery) {
+		/*
+		 * Set the "dirty" flag so that if we reboot uncleanly we
+		 * will notice this immediately on the next mount.
+		 */
+		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+		err = ubifs_write_master(c);
+		if (err)
+			goto out_lpt;
+	}
+
+	err = dbg_check_idx_size(c, c->bi.old_idx_sz);
+	if (err)
+		goto out_lpt;
+
+	err = ubifs_replay_journal(c);
+	if (err)
+		goto out_journal;
+
+	/* Calculate 'min_idx_lebs' after journal replay */
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+	err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
+	if (err)
+		goto out_orphans;
+
+	if (!c->ro_mount) {
+		int lnum;
+
+		err = check_free_space(c);
+		if (err)
+			goto out_orphans;
+
+		/* Check for enough log space */
+		lnum = c->lhead_lnum + 1;
+		if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+			lnum = UBIFS_LOG_LNUM;
+		if (lnum == c->ltail_lnum) {
+			err = ubifs_consolidate_log(c);
+			if (err)
+				goto out_orphans;
+		}
+
+		if (c->need_recovery) {
+			err = ubifs_recover_size(c);
+			if (err)
+				goto out_orphans;
+			err = ubifs_rcvry_gc_commit(c);
+			if (err)
+				goto out_orphans;
+		} else {
+			err = take_gc_lnum(c);
+			if (err)
+				goto out_orphans;
+
+			/*
+			 * GC LEB may contain garbage if there was an unclean
+			 * reboot, and it should be un-mapped.
+			 */
+			err = ubifs_leb_unmap(c, c->gc_lnum);
+			if (err)
+				goto out_orphans;
+		}
+
+		err = dbg_check_lprops(c);
+		if (err)
+			goto out_orphans;
+	} else if (c->need_recovery) {
+		err = ubifs_recover_size(c);
+		if (err)
+			goto out_orphans;
+	} else {
+		/*
+		 * Even if we mount read-only, we have to set space in GC LEB
+		 * to proper value because this affects UBIFS free space
+		 * reporting. We do not want to have a situation when
+		 * re-mounting from R/O to R/W changes amount of free space.
+		 */
+		err = take_gc_lnum(c);
+		if (err)
+			goto out_orphans;
+	}
+
+	spin_lock(&ubifs_infos_lock);
+	list_add_tail(&c->infos_list, &ubifs_infos);
+	spin_unlock(&ubifs_infos_lock);
+
+	if (c->need_recovery) {
+		if (c->ro_mount)
+			ubifs_msg(c, "recovery deferred");
+		else {
+			c->need_recovery = 0;
+			ubifs_msg(c, "recovery completed");
+			/*
+			 * GC LEB has to be empty and taken at this point. But
+			 * the journal head LEBs may also be accounted as
+			 * "empty taken" if they are empty.
+			 */
+			ubifs_assert(c->lst.taken_empty_lebs > 0);
+		}
+	} else
+		ubifs_assert(c->lst.taken_empty_lebs > 0);
+
+	err = dbg_check_filesystem(c);
+	if (err)
+		goto out_infos;
+
+	err = dbg_debugfs_init_fs(c);
+	if (err)
+		goto out_infos;
+
+	c->mounting = 0;
+
+	ubifs_msg(c, "UBIFS: mounted UBI device %d, volume %d, name \"%s\"%s",
+		  c->vi.ubi_num, c->vi.vol_id, c->vi.name,
+		  c->ro_mount ? ", R/O mode" : "");
+	x = (long long)c->main_lebs * c->leb_size;
+	y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
+	ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
+		  c->leb_size, c->leb_size >> 10, c->min_io_size,
+		  c->max_write_size);
+	ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)",
+		  x, x >> 20, c->main_lebs,
+		  y, y >> 20, c->log_lebs + c->max_bud_cnt);
+	ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)",
+		  c->report_rp_size, c->report_rp_size >> 10);
+	ubifs_msg(c, "media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s",
+		  c->fmt_version, c->ro_compat_version,
+		  UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid,
+		  c->big_lpt ? ", big LPT model" : ", small LPT model");
+
+	dbg_gen("default compressor:  %s", ubifs_compr_name(c->default_compr));
+	dbg_gen("data journal heads:  %d",
+		c->jhead_cnt - NONDATA_JHEADS_CNT);
+	dbg_gen("log LEBs:            %d (%d - %d)",
+		c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
+	dbg_gen("LPT area LEBs:       %d (%d - %d)",
+		c->lpt_lebs, c->lpt_first, c->lpt_last);
+	dbg_gen("orphan area LEBs:    %d (%d - %d)",
+		c->orph_lebs, c->orph_first, c->orph_last);
+	dbg_gen("main area LEBs:      %d (%d - %d)",
+		c->main_lebs, c->main_first, c->leb_cnt - 1);
+	dbg_gen("index LEBs:          %d", c->lst.idx_lebs);
+	dbg_gen("total index bytes:   %lld (%lld KiB, %lld MiB)",
+		c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
+		c->bi.old_idx_sz >> 20);
+	dbg_gen("key hash type:       %d", c->key_hash_type);
+	dbg_gen("tree fanout:         %d", c->fanout);
+	dbg_gen("reserved GC LEB:     %d", c->gc_lnum);
+	dbg_gen("max. znode size      %d", c->max_znode_sz);
+	dbg_gen("max. index node size %d", c->max_idx_node_sz);
+	dbg_gen("node sizes:          data %zu, inode %zu, dentry %zu",
+		UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
+	dbg_gen("node sizes:          trun %zu, sb %zu, master %zu",
+		UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
+	dbg_gen("node sizes:          ref %zu, cmt. start %zu, orph %zu",
+		UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
+	dbg_gen("max. node sizes:     data %zu, inode %zu dentry %zu, idx %d",
+		UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
+		UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
+	dbg_gen("dead watermark:      %d", c->dead_wm);
+	dbg_gen("dark watermark:      %d", c->dark_wm);
+	dbg_gen("LEB overhead:        %d", c->leb_overhead);
+	x = (long long)c->main_lebs * c->dark_wm;
+	dbg_gen("max. dark space:     %lld (%lld KiB, %lld MiB)",
+		x, x >> 10, x >> 20);
+	dbg_gen("maximum bud bytes:   %lld (%lld KiB, %lld MiB)",
+		c->max_bud_bytes, c->max_bud_bytes >> 10,
+		c->max_bud_bytes >> 20);
+	dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
+		c->bg_bud_bytes, c->bg_bud_bytes >> 10,
+		c->bg_bud_bytes >> 20);
+	dbg_gen("current bud bytes    %lld (%lld KiB, %lld MiB)",
+		c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
+	dbg_gen("max. seq. number:    %llu", c->max_sqnum);
+	dbg_gen("commit number:       %llu", c->cmt_no);
+
+	return 0;
+
+out_infos:
+	spin_lock(&ubifs_infos_lock);
+	list_del(&c->infos_list);
+	spin_unlock(&ubifs_infos_lock);
+out_orphans:
+	free_orphans(c);
+out_journal:
+	destroy_journal(c);
+out_lpt:
+	ubifs_lpt_free(c, 0);
+out_master:
+	kfree(c->mst_node);
+	kfree(c->rcvrd_mst_node);
+	if (c->bgt)
+		kthread_stop(c->bgt);
+out_wbufs:
+	free_wbufs(c);
+out_cbuf:
+	kfree(c->cbuf);
+out_free:
+	kfree(c->write_reserve_buf);
+	kfree(c->bu.buf);
+	vfree(c->ileb_buf);
+	vfree(c->sbuf);
+	kfree(c->bottom_up_buf);
+	ubifs_debugging_exit(c);
+	return err;
+}
+
+/**
+ * ubifs_umount - un-mount UBIFS file-system.
+ * @c: UBIFS file-system description object
+ *
+ * Note, this function is called to free allocated resourced when un-mounting,
+ * as well as free resources when an error occurred while we were half way
+ * through mounting (error path cleanup function). So it has to make sure the
+ * resource was actually allocated before freeing it.
+ */
+static void ubifs_umount(struct ubifs_info *c)
+{
+	dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
+		c->vi.vol_id);
+
+	dbg_debugfs_exit_fs(c);
+	spin_lock(&ubifs_infos_lock);
+	list_del(&c->infos_list);
+	spin_unlock(&ubifs_infos_lock);
+
+	if (c->bgt)
+		kthread_stop(c->bgt);
+
+	destroy_journal(c);
+	free_wbufs(c);
+	free_orphans(c);
+	ubifs_lpt_free(c, 0);
+
+	kfree(c->cbuf);
+	kfree(c->rcvrd_mst_node);
+	kfree(c->mst_node);
+	kfree(c->write_reserve_buf);
+	kfree(c->bu.buf);
+	vfree(c->ileb_buf);
+	vfree(c->sbuf);
+	kfree(c->bottom_up_buf);
+	ubifs_debugging_exit(c);
+}
+
+/**
+ * ubifs_remount_rw - re-mount in read-write mode.
+ * @c: UBIFS file-system description object
+ *
+ * UBIFS avoids allocating many unnecessary resources when mounted in read-only
+ * mode. This function allocates the needed resources and re-mounts UBIFS in
+ * read-write mode.
+ */
+static int ubifs_remount_rw(struct ubifs_info *c)
+{
+	int err, lnum;
+
+	if (c->rw_incompat) {
+		ubifs_err(c, "the file-system is not R/W-compatible");
+		ubifs_msg(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
+			  c->fmt_version, c->ro_compat_version,
+			  UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
+		return -EROFS;
+	}
+
+	mutex_lock(&c->umount_mutex);
+	dbg_save_space_info(c);
+	c->remounting_rw = 1;
+	c->ro_mount = 0;
+
+	if (c->space_fixup) {
+		err = ubifs_fixup_free_space(c);
+		if (err)
+			goto out;
+	}
+
+	err = check_free_space(c);
+	if (err)
+		goto out;
+
+	if (c->old_leb_cnt != c->leb_cnt) {
+		struct ubifs_sb_node *sup;
+
+		sup = ubifs_read_sb_node(c);
+		if (IS_ERR(sup)) {
+			err = PTR_ERR(sup);
+			goto out;
+		}
+		sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+		err = ubifs_write_sb_node(c, sup);
+		kfree(sup);
+		if (err)
+			goto out;
+	}
+
+	if (c->need_recovery) {
+		ubifs_msg(c, "completing deferred recovery");
+		err = ubifs_write_rcvrd_mst_node(c);
+		if (err)
+			goto out;
+		err = ubifs_recover_size(c);
+		if (err)
+			goto out;
+		err = ubifs_clean_lebs(c, c->sbuf);
+		if (err)
+			goto out;
+		err = ubifs_recover_inl_heads(c, c->sbuf);
+		if (err)
+			goto out;
+	} else {
+		/* A readonly mount is not allowed to have orphans */
+		ubifs_assert(c->tot_orphans == 0);
+		err = ubifs_clear_orphans(c);
+		if (err)
+			goto out;
+	}
+
+	if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
+		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+		err = ubifs_write_master(c);
+		if (err)
+			goto out;
+	}
+
+	c->ileb_buf = vmalloc(c->leb_size);
+	if (!c->ileb_buf) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
+	if (!c->write_reserve_buf) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = ubifs_lpt_init(c, 0, 1);
+	if (err)
+		goto out;
+
+	/* Create background thread */
+	c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
+	if (IS_ERR(c->bgt)) {
+		err = PTR_ERR(c->bgt);
+		c->bgt = NULL;
+		ubifs_err(c, "cannot spawn \"%s\", error %d",
+			  c->bgt_name, err);
+		goto out;
+	}
+	wake_up_process(c->bgt);
+
+	c->orph_buf = vmalloc(c->leb_size);
+	if (!c->orph_buf) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Check for enough log space */
+	lnum = c->lhead_lnum + 1;
+	if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+		lnum = UBIFS_LOG_LNUM;
+	if (lnum == c->ltail_lnum) {
+		err = ubifs_consolidate_log(c);
+		if (err)
+			goto out;
+	}
+
+	if (c->need_recovery)
+		err = ubifs_rcvry_gc_commit(c);
+	else
+		err = ubifs_leb_unmap(c, c->gc_lnum);
+	if (err)
+		goto out;
+
+	dbg_gen("re-mounted read-write");
+	c->remounting_rw = 0;
+
+	if (c->need_recovery) {
+		c->need_recovery = 0;
+		ubifs_msg(c, "deferred recovery completed");
+	} else {
+		/*
+		 * Do not run the debugging space check if the were doing
+		 * recovery, because when we saved the information we had the
+		 * file-system in a state where the TNC and lprops has been
+		 * modified in memory, but all the I/O operations (including a
+		 * commit) were deferred. So the file-system was in
+		 * "non-committed" state. Now the file-system is in committed
+		 * state, and of course the amount of free space will change
+		 * because, for example, the old index size was imprecise.
+		 */
+		err = dbg_check_space_info(c);
+	}
+
+	mutex_unlock(&c->umount_mutex);
+	return err;
+
+out:
+	c->ro_mount = 1;
+	vfree(c->orph_buf);
+	c->orph_buf = NULL;
+	if (c->bgt) {
+		kthread_stop(c->bgt);
+		c->bgt = NULL;
+	}
+	free_wbufs(c);
+	kfree(c->write_reserve_buf);
+	c->write_reserve_buf = NULL;
+	vfree(c->ileb_buf);
+	c->ileb_buf = NULL;
+	ubifs_lpt_free(c, 1);
+	c->remounting_rw = 0;
+	mutex_unlock(&c->umount_mutex);
+	return err;
+}
+
+/**
+ * ubifs_remount_ro - re-mount in read-only mode.
+ * @c: UBIFS file-system description object
+ *
+ * We assume VFS has stopped writing. Possibly the background thread could be
+ * running a commit, however kthread_stop will wait in that case.
+ */
+static void ubifs_remount_ro(struct ubifs_info *c)
+{
+	int i, err;
+
+	ubifs_assert(!c->need_recovery);
+	ubifs_assert(!c->ro_mount);
+
+	mutex_lock(&c->umount_mutex);
+	if (c->bgt) {
+		kthread_stop(c->bgt);
+		c->bgt = NULL;
+	}
+
+	dbg_save_space_info(c);
+
+	for (i = 0; i < c->jhead_cnt; i++)
+		ubifs_wbuf_sync(&c->jheads[i].wbuf);
+
+	c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+	c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+	err = ubifs_write_master(c);
+	if (err)
+		ubifs_ro_mode(c, err);
+
+	vfree(c->orph_buf);
+	c->orph_buf = NULL;
+	kfree(c->write_reserve_buf);
+	c->write_reserve_buf = NULL;
+	vfree(c->ileb_buf);
+	c->ileb_buf = NULL;
+	ubifs_lpt_free(c, 1);
+	c->ro_mount = 1;
+	err = dbg_check_space_info(c);
+	if (err)
+		ubifs_ro_mode(c, err);
+	mutex_unlock(&c->umount_mutex);
+}
+
+static void ubifs_put_super(struct super_block *sb)
+{
+	int i;
+	struct ubifs_info *c = sb->s_fs_info;
+
+	ubifs_msg(c, "un-mount UBI device %d", c->vi.ubi_num);
+
+	/*
+	 * The following asserts are only valid if there has not been a failure
+	 * of the media. For example, there will be dirty inodes if we failed
+	 * to write them back because of I/O errors.
+	 */
+	if (!c->ro_error) {
+		ubifs_assert(c->bi.idx_growth == 0);
+		ubifs_assert(c->bi.dd_growth == 0);
+		ubifs_assert(c->bi.data_growth == 0);
+	}
+
+	/*
+	 * The 'c->umount_lock' prevents races between UBIFS memory shrinker
+	 * and file system un-mount. Namely, it prevents the shrinker from
+	 * picking this superblock for shrinking - it will be just skipped if
+	 * the mutex is locked.
+	 */
+	mutex_lock(&c->umount_mutex);
+	if (!c->ro_mount) {
+		/*
+		 * First of all kill the background thread to make sure it does
+		 * not interfere with un-mounting and freeing resources.
+		 */
+		if (c->bgt) {
+			kthread_stop(c->bgt);
+			c->bgt = NULL;
+		}
+
+		/*
+		 * On fatal errors c->ro_error is set to 1, in which case we do
+		 * not write the master node.
+		 */
+		if (!c->ro_error) {
+			int err;
+
+			/* Synchronize write-buffers */
+			for (i = 0; i < c->jhead_cnt; i++)
+				ubifs_wbuf_sync(&c->jheads[i].wbuf);
+
+			/*
+			 * We are being cleanly unmounted which means the
+			 * orphans were killed - indicate this in the master
+			 * node. Also save the reserved GC LEB number.
+			 */
+			c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+			c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+			c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+			err = ubifs_write_master(c);
+			if (err)
+				/*
+				 * Recovery will attempt to fix the master area
+				 * next mount, so we just print a message and
+				 * continue to unmount normally.
+				 */
+				ubifs_err(c, "failed to write master node, error %d",
+					  err);
+		} else {
+			for (i = 0; i < c->jhead_cnt; i++)
+				/* Make sure write-buffer timers are canceled */
+				hrtimer_cancel(&c->jheads[i].wbuf.timer);
+		}
+	}
+
+	ubifs_umount(c);
+	bdi_destroy(&c->bdi);
+	ubi_close_volume(c->ubi);
+	mutex_unlock(&c->umount_mutex);
+}
+
+static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	int err;
+	struct ubifs_info *c = sb->s_fs_info;
+
+	sync_filesystem(sb);
+	dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
+
+	err = ubifs_parse_options(c, data, 1);
+	if (err) {
+		ubifs_err(c, "invalid or unknown remount parameter");
+		return err;
+	}
+
+	if (c->ro_mount && !(*flags & MS_RDONLY)) {
+		if (c->ro_error) {
+			ubifs_msg(c, "cannot re-mount R/W due to prior errors");
+			return -EROFS;
+		}
+		if (c->ro_media) {
+			ubifs_msg(c, "cannot re-mount R/W - UBI volume is R/O");
+			return -EROFS;
+		}
+		err = ubifs_remount_rw(c);
+		if (err)
+			return err;
+	} else if (!c->ro_mount && (*flags & MS_RDONLY)) {
+		if (c->ro_error) {
+			ubifs_msg(c, "cannot re-mount R/O due to prior errors");
+			return -EROFS;
+		}
+		ubifs_remount_ro(c);
+	}
+
+	if (c->bulk_read == 1)
+		bu_init(c);
+	else {
+		dbg_gen("disable bulk-read");
+		kfree(c->bu.buf);
+		c->bu.buf = NULL;
+	}
+
+	ubifs_assert(c->lst.taken_empty_lebs > 0);
+	return 0;
+}
+
+const struct super_operations ubifs_super_operations = {
+	.alloc_inode   = ubifs_alloc_inode,
+	.destroy_inode = ubifs_destroy_inode,
+	.put_super     = ubifs_put_super,
+	.write_inode   = ubifs_write_inode,
+	.evict_inode   = ubifs_evict_inode,
+	.statfs        = ubifs_statfs,
+	.dirty_inode   = ubifs_dirty_inode,
+	.remount_fs    = ubifs_remount_fs,
+	.show_options  = ubifs_show_options,
+	.sync_fs       = ubifs_sync_fs,
+};
+
+/**
+ * open_ubi - parse UBI device name string and open the UBI device.
+ * @name: UBI volume name
+ * @mode: UBI volume open mode
+ *
+ * The primary method of mounting UBIFS is by specifying the UBI volume
+ * character device node path. However, UBIFS may also be mounted withoug any
+ * character device node using one of the following methods:
+ *
+ * o ubiX_Y    - mount UBI device number X, volume Y;
+ * o ubiY      - mount UBI device number 0, volume Y;
+ * o ubiX:NAME - mount UBI device X, volume with name NAME;
+ * o ubi:NAME  - mount UBI device 0, volume with name NAME.
+ *
+ * Alternative '!' separator may be used instead of ':' (because some shells
+ * like busybox may interpret ':' as an NFS host name separator). This function
+ * returns UBI volume description object in case of success and a negative
+ * error code in case of failure.
+ */
+static struct ubi_volume_desc *open_ubi(const char *name, int mode)
+{
+	struct ubi_volume_desc *ubi;
+	int dev, vol;
+	char *endptr;
+
+	/* First, try to open using the device node path method */
+	ubi = ubi_open_volume_path(name, mode);
+	if (!IS_ERR(ubi))
+		return ubi;
+
+	/* Try the "nodev" method */
+	if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
+		return ERR_PTR(-EINVAL);
+
+	/* ubi:NAME method */
+	if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
+		return ubi_open_volume_nm(0, name + 4, mode);
+
+	if (!isdigit(name[3]))
+		return ERR_PTR(-EINVAL);
+
+	dev = simple_strtoul(name + 3, &endptr, 0);
+
+	/* ubiY method */
+	if (*endptr == '\0')
+		return ubi_open_volume(0, dev, mode);
+
+	/* ubiX_Y method */
+	if (*endptr == '_' && isdigit(endptr[1])) {
+		vol = simple_strtoul(endptr + 1, &endptr, 0);
+		if (*endptr != '\0')
+			return ERR_PTR(-EINVAL);
+		return ubi_open_volume(dev, vol, mode);
+	}
+
+	/* ubiX:NAME method */
+	if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0')
+		return ubi_open_volume_nm(dev, ++endptr, mode);
+
+	return ERR_PTR(-EINVAL);
+}
+
+static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
+{
+	struct ubifs_info *c;
+
+	c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL);
+	if (c) {
+		spin_lock_init(&c->cnt_lock);
+		spin_lock_init(&c->cs_lock);
+		spin_lock_init(&c->buds_lock);
+		spin_lock_init(&c->space_lock);
+		spin_lock_init(&c->orphan_lock);
+		init_rwsem(&c->commit_sem);
+		mutex_init(&c->lp_mutex);
+		mutex_init(&c->tnc_mutex);
+		mutex_init(&c->log_mutex);
+		mutex_init(&c->umount_mutex);
+		mutex_init(&c->bu_mutex);
+		mutex_init(&c->write_reserve_mutex);
+		init_waitqueue_head(&c->cmt_wq);
+		c->buds = RB_ROOT;
+		c->old_idx = RB_ROOT;
+		c->size_tree = RB_ROOT;
+		c->orph_tree = RB_ROOT;
+		INIT_LIST_HEAD(&c->infos_list);
+		INIT_LIST_HEAD(&c->idx_gc);
+		INIT_LIST_HEAD(&c->replay_list);
+		INIT_LIST_HEAD(&c->replay_buds);
+		INIT_LIST_HEAD(&c->uncat_list);
+		INIT_LIST_HEAD(&c->empty_list);
+		INIT_LIST_HEAD(&c->freeable_list);
+		INIT_LIST_HEAD(&c->frdi_idx_list);
+		INIT_LIST_HEAD(&c->unclean_leb_list);
+		INIT_LIST_HEAD(&c->old_buds);
+		INIT_LIST_HEAD(&c->orph_list);
+		INIT_LIST_HEAD(&c->orph_new);
+		c->no_chk_data_crc = 1;
+
+		c->highest_inum = UBIFS_FIRST_INO;
+		c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
+
+		ubi_get_volume_info(ubi, &c->vi);
+		ubi_get_device_info(c->vi.ubi_num, &c->di);
+	}
+	return c;
+}
+
+static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct ubifs_info *c = sb->s_fs_info;
+	struct inode *root;
+	int err;
+
+	c->vfs_sb = sb;
+	/* Re-open the UBI device in read-write mode */
+	c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE);
+	if (IS_ERR(c->ubi)) {
+		err = PTR_ERR(c->ubi);
+		goto out;
+	}
+
+	/*
+	 * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
+	 * UBIFS, I/O is not deferred, it is done immediately in readpage,
+	 * which means the user would have to wait not just for their own I/O
+	 * but the read-ahead I/O as well i.e. completely pointless.
+	 *
+	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+	 */
+	c->bdi.name = "ubifs",
+	c->bdi.capabilities = 0;
+	err  = bdi_init(&c->bdi);
+	if (err)
+		goto out_close;
+	err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
+			   c->vi.ubi_num, c->vi.vol_id);
+	if (err)
+		goto out_bdi;
+
+	err = ubifs_parse_options(c, data, 0);
+	if (err)
+		goto out_bdi;
+
+	sb->s_bdi = &c->bdi;
+	sb->s_fs_info = c;
+	sb->s_magic = UBIFS_SUPER_MAGIC;
+	sb->s_blocksize = UBIFS_BLOCK_SIZE;
+	sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
+	sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
+	if (c->max_inode_sz > MAX_LFS_FILESIZE)
+		sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
+	sb->s_op = &ubifs_super_operations;
+	sb->s_xattr = ubifs_xattr_handlers;
+
+	mutex_lock(&c->umount_mutex);
+	err = mount_ubifs(c);
+	if (err) {
+		ubifs_assert(err < 0);
+		goto out_unlock;
+	}
+
+	/* Read the root inode */
+	root = ubifs_iget(sb, UBIFS_ROOT_INO);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto out_umount;
+	}
+
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto out_umount;
+	}
+
+	mutex_unlock(&c->umount_mutex);
+	return 0;
+
+out_umount:
+	ubifs_umount(c);
+out_unlock:
+	mutex_unlock(&c->umount_mutex);
+out_bdi:
+	bdi_destroy(&c->bdi);
+out_close:
+	ubi_close_volume(c->ubi);
+out:
+	return err;
+}
+
+static int sb_test(struct super_block *sb, void *data)
+{
+	struct ubifs_info *c1 = data;
+	struct ubifs_info *c = sb->s_fs_info;
+
+	return c->vi.cdev == c1->vi.cdev;
+}
+
+static int sb_set(struct super_block *sb, void *data)
+{
+	sb->s_fs_info = data;
+	return set_anon_super(sb, NULL);
+}
+
+static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
+			const char *name, void *data)
+{
+	struct ubi_volume_desc *ubi;
+	struct ubifs_info *c;
+	struct super_block *sb;
+	int err;
+
+	dbg_gen("name %s, flags %#x", name, flags);
+
+	/*
+	 * Get UBI device number and volume ID. Mount it read-only so far
+	 * because this might be a new mount point, and UBI allows only one
+	 * read-write user at a time.
+	 */
+	ubi = open_ubi(name, UBI_READONLY);
+	if (IS_ERR(ubi)) {
+		pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
+		       current->pid, name, (int)PTR_ERR(ubi));
+		return ERR_CAST(ubi);
+	}
+
+	c = alloc_ubifs_info(ubi);
+	if (!c) {
+		err = -ENOMEM;
+		goto out_close;
+	}
+
+	dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+
+	sb = sget(fs_type, sb_test, sb_set, flags, c);
+	if (IS_ERR(sb)) {
+		err = PTR_ERR(sb);
+		kfree(c);
+		goto out_close;
+	}
+
+	if (sb->s_root) {
+		struct ubifs_info *c1 = sb->s_fs_info;
+		kfree(c);
+		/* A new mount point for already mounted UBIFS */
+		dbg_gen("this ubi volume is already mounted");
+		if (!!(flags & MS_RDONLY) != c1->ro_mount) {
+			err = -EBUSY;
+			goto out_deact;
+		}
+	} else {
+		err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+		if (err)
+			goto out_deact;
+		/* We do not support atime */
+		sb->s_flags |= MS_ACTIVE | MS_NOATIME;
+	}
+
+	/* 'fill_super()' opens ubi again so we must close it here */
+	ubi_close_volume(ubi);
+
+	return dget(sb->s_root);
+
+out_deact:
+	deactivate_locked_super(sb);
+out_close:
+	ubi_close_volume(ubi);
+	return ERR_PTR(err);
+}
+
+static void kill_ubifs_super(struct super_block *s)
+{
+	struct ubifs_info *c = s->s_fs_info;
+	kill_anon_super(s);
+	kfree(c);
+}
+
+static struct file_system_type ubifs_fs_type = {
+	.name    = "ubifs",
+	.owner   = THIS_MODULE,
+	.mount   = ubifs_mount,
+	.kill_sb = kill_ubifs_super,
+};
+MODULE_ALIAS_FS("ubifs");
+
+/*
+ * Inode slab cache constructor.
+ */
+static void inode_slab_ctor(void *obj)
+{
+	struct ubifs_inode *ui = obj;
+	inode_init_once(&ui->vfs_inode);
+}
+
+static int __init ubifs_init(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);
+
+	/* Make sure node sizes are 8-byte aligned */
+	BUILD_BUG_ON(UBIFS_CH_SZ        & 7);
+	BUILD_BUG_ON(UBIFS_INO_NODE_SZ  & 7);
+	BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_SB_NODE_SZ   & 7);
+	BUILD_BUG_ON(UBIFS_MST_NODE_SZ  & 7);
+	BUILD_BUG_ON(UBIFS_REF_NODE_SZ  & 7);
+	BUILD_BUG_ON(UBIFS_CS_NODE_SZ   & 7);
+	BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7);
+
+	BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ  & 7);
+	BUILD_BUG_ON(UBIFS_MAX_NODE_SZ      & 7);
+	BUILD_BUG_ON(MIN_WRITE_SZ           & 7);
+
+	/* Check min. node size */
+	BUILD_BUG_ON(UBIFS_INO_NODE_SZ  < MIN_WRITE_SZ);
+	BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ);
+	BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ);
+	BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ);
+
+	BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
+	BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
+	BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ);
+	BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ  > UBIFS_MAX_NODE_SZ);
+
+	/* Defined node sizes */
+	BUILD_BUG_ON(UBIFS_SB_NODE_SZ  != 4096);
+	BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512);
+	BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160);
+	BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
+
+	/*
+	 * We use 2 bit wide bit-fields to store compression type, which should
+	 * be amended if more compressors are added. The bit-fields are:
+	 * @compr_type in 'struct ubifs_inode', @default_compr in
+	 * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
+	 */
+	BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
+
+	/*
+	 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
+	 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
+	 */
+	if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
+		pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
+		       current->pid, (unsigned int)PAGE_CACHE_SIZE);
+		return -EINVAL;
+	}
+
+	ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
+				sizeof(struct ubifs_inode), 0,
+				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
+				&inode_slab_ctor);
+	if (!ubifs_inode_slab)
+		return -ENOMEM;
+
+	register_shrinker(&ubifs_shrinker_info);
+
+	err = ubifs_compressors_init();
+	if (err)
+		goto out_shrinker;
+
+	err = dbg_debugfs_init();
+	if (err)
+		goto out_compr;
+
+	err = register_filesystem(&ubifs_fs_type);
+	if (err) {
+		pr_err("UBIFS error (pid %d): cannot register file system, error %d",
+		       current->pid, err);
+		goto out_dbg;
+	}
+	return 0;
+
+out_dbg:
+	dbg_debugfs_exit();
+out_compr:
+	ubifs_compressors_exit();
+out_shrinker:
+	unregister_shrinker(&ubifs_shrinker_info);
+	kmem_cache_destroy(ubifs_inode_slab);
+	return err;
+}
+/* late_initcall to let compressors initialize first */
+late_initcall(ubifs_init);
+
+static void __exit ubifs_exit(void)
+{
+	ubifs_assert(list_empty(&ubifs_infos));
+	ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
+
+	dbg_debugfs_exit();
+	ubifs_compressors_exit();
+	unregister_shrinker(&ubifs_shrinker_info);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ubifs_inode_slab);
+	unregister_filesystem(&ubifs_fs_type);
+}
+module_exit(ubifs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(__stringify(UBIFS_VERSION));
+MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter");
+MODULE_DESCRIPTION("UBIFS - UBI File System");
diff --git a/kernel/fs/ubifs/tnc.c b/kernel/fs/ubifs/tnc.c
new file mode 100644
index 000000000..957f5757f
--- /dev/null
+++ b/kernel/fs/ubifs/tnc.c
@@ -0,0 +1,3327 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements TNC (Tree Node Cache) which caches indexing nodes of
+ * the UBIFS B-tree.
+ *
+ * At the moment the locking rules of the TNC tree are quite simple and
+ * straightforward. We just have a mutex and lock it when we traverse the
+ * tree. If a znode is not in memory, we read it from flash while still having
+ * the mutex locked.
+ */
+
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include "ubifs.h"
+
+/*
+ * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions.
+ * @NAME_LESS: name corresponding to the first argument is less than second
+ * @NAME_MATCHES: names match
+ * @NAME_GREATER: name corresponding to the second argument is greater than
+ *                first
+ * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media
+ *
+ * These constants were introduce to improve readability.
+ */
+enum {
+	NAME_LESS    = 0,
+	NAME_MATCHES = 1,
+	NAME_GREATER = 2,
+	NOT_ON_MEDIA = 3,
+};
+
+/**
+ * insert_old_idx - record an index node obsoleted since the last commit start.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ *
+ * Returns %0 on success, and a negative error code on failure.
+ *
+ * For recovery, there must always be a complete intact version of the index on
+ * flash at all times. That is called the "old index". It is the index as at the
+ * time of the last successful commit. Many of the index nodes in the old index
+ * may be dirty, but they must not be erased until the next successful commit
+ * (at which point that index becomes the old index).
+ *
+ * That means that the garbage collection and the in-the-gaps method of
+ * committing must be able to determine if an index node is in the old index.
+ * Most of the old index nodes can be found by looking up the TNC using the
+ * 'lookup_znode()' function. However, some of the old index nodes may have
+ * been deleted from the current index or may have been changed so much that
+ * they cannot be easily found. In those cases, an entry is added to an RB-tree.
+ * That is what this function does. The RB-tree is ordered by LEB number and
+ * offset because they uniquely identify the old index node.
+ */
+static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
+{
+	struct ubifs_old_idx *old_idx, *o;
+	struct rb_node **p, *parent = NULL;
+
+	old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
+	if (unlikely(!old_idx))
+		return -ENOMEM;
+	old_idx->lnum = lnum;
+	old_idx->offs = offs;
+
+	p = &c->old_idx.rb_node;
+	while (*p) {
+		parent = *p;
+		o = rb_entry(parent, struct ubifs_old_idx, rb);
+		if (lnum < o->lnum)
+			p = &(*p)->rb_left;
+		else if (lnum > o->lnum)
+			p = &(*p)->rb_right;
+		else if (offs < o->offs)
+			p = &(*p)->rb_left;
+		else if (offs > o->offs)
+			p = &(*p)->rb_right;
+		else {
+			ubifs_err(c, "old idx added twice!");
+			kfree(old_idx);
+			return 0;
+		}
+	}
+	rb_link_node(&old_idx->rb, parent, p);
+	rb_insert_color(&old_idx->rb, &c->old_idx);
+	return 0;
+}
+
+/**
+ * insert_old_idx_znode - record a znode obsoleted since last commit start.
+ * @c: UBIFS file-system description object
+ * @znode: znode of obsoleted index node
+ *
+ * Returns %0 on success, and a negative error code on failure.
+ */
+int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode)
+{
+	if (znode->parent) {
+		struct ubifs_zbranch *zbr;
+
+		zbr = &znode->parent->zbranch[znode->iip];
+		if (zbr->len)
+			return insert_old_idx(c, zbr->lnum, zbr->offs);
+	} else
+		if (c->zroot.len)
+			return insert_old_idx(c, c->zroot.lnum,
+					      c->zroot.offs);
+	return 0;
+}
+
+/**
+ * ins_clr_old_idx_znode - record a znode obsoleted since last commit start.
+ * @c: UBIFS file-system description object
+ * @znode: znode of obsoleted index node
+ *
+ * Returns %0 on success, and a negative error code on failure.
+ */
+static int ins_clr_old_idx_znode(struct ubifs_info *c,
+				 struct ubifs_znode *znode)
+{
+	int err;
+
+	if (znode->parent) {
+		struct ubifs_zbranch *zbr;
+
+		zbr = &znode->parent->zbranch[znode->iip];
+		if (zbr->len) {
+			err = insert_old_idx(c, zbr->lnum, zbr->offs);
+			if (err)
+				return err;
+			zbr->lnum = 0;
+			zbr->offs = 0;
+			zbr->len = 0;
+		}
+	} else
+		if (c->zroot.len) {
+			err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs);
+			if (err)
+				return err;
+			c->zroot.lnum = 0;
+			c->zroot.offs = 0;
+			c->zroot.len = 0;
+		}
+	return 0;
+}
+
+/**
+ * destroy_old_idx - destroy the old_idx RB-tree.
+ * @c: UBIFS file-system description object
+ *
+ * During start commit, the old_idx RB-tree is used to avoid overwriting index
+ * nodes that were in the index last commit but have since been deleted.  This
+ * is necessary for recovery i.e. the old index must be kept intact until the
+ * new index is successfully written.  The old-idx RB-tree is used for the
+ * in-the-gaps method of writing index nodes and is destroyed every commit.
+ */
+void destroy_old_idx(struct ubifs_info *c)
+{
+	struct ubifs_old_idx *old_idx, *n;
+
+	rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb)
+		kfree(old_idx);
+
+	c->old_idx = RB_ROOT;
+}
+
+/**
+ * copy_znode - copy a dirty znode.
+ * @c: UBIFS file-system description object
+ * @znode: znode to copy
+ *
+ * A dirty znode being committed may not be changed, so it is copied.
+ */
+static struct ubifs_znode *copy_znode(struct ubifs_info *c,
+				      struct ubifs_znode *znode)
+{
+	struct ubifs_znode *zn;
+
+	zn = kmalloc(c->max_znode_sz, GFP_NOFS);
+	if (unlikely(!zn))
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(zn, znode, c->max_znode_sz);
+	zn->cnext = NULL;
+	__set_bit(DIRTY_ZNODE, &zn->flags);
+	__clear_bit(COW_ZNODE, &zn->flags);
+
+	ubifs_assert(!ubifs_zn_obsolete(znode));
+	__set_bit(OBSOLETE_ZNODE, &znode->flags);
+
+	if (znode->level != 0) {
+		int i;
+		const int n = zn->child_cnt;
+
+		/* The children now have new parent */
+		for (i = 0; i < n; i++) {
+			struct ubifs_zbranch *zbr = &zn->zbranch[i];
+
+			if (zbr->znode)
+				zbr->znode->parent = zn;
+		}
+	}
+
+	atomic_long_inc(&c->dirty_zn_cnt);
+	return zn;
+}
+
+/**
+ * add_idx_dirt - add dirt due to a dirty znode.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of index node
+ * @dirt: size of index node
+ *
+ * This function updates lprops dirty space and the new size of the index.
+ */
+static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
+{
+	c->calc_idx_sz -= ALIGN(dirt, 8);
+	return ubifs_add_dirt(c, lnum, dirt);
+}
+
+/**
+ * dirty_cow_znode - ensure a znode is not being committed.
+ * @c: UBIFS file-system description object
+ * @zbr: branch of znode to check
+ *
+ * Returns dirtied znode on success or negative error code on failure.
+ */
+static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
+					   struct ubifs_zbranch *zbr)
+{
+	struct ubifs_znode *znode = zbr->znode;
+	struct ubifs_znode *zn;
+	int err;
+
+	if (!ubifs_zn_cow(znode)) {
+		/* znode is not being committed */
+		if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) {
+			atomic_long_inc(&c->dirty_zn_cnt);
+			atomic_long_dec(&c->clean_zn_cnt);
+			atomic_long_dec(&ubifs_clean_zn_cnt);
+			err = add_idx_dirt(c, zbr->lnum, zbr->len);
+			if (unlikely(err))
+				return ERR_PTR(err);
+		}
+		return znode;
+	}
+
+	zn = copy_znode(c, znode);
+	if (IS_ERR(zn))
+		return zn;
+
+	if (zbr->len) {
+		err = insert_old_idx(c, zbr->lnum, zbr->offs);
+		if (unlikely(err))
+			return ERR_PTR(err);
+		err = add_idx_dirt(c, zbr->lnum, zbr->len);
+	} else
+		err = 0;
+
+	zbr->znode = zn;
+	zbr->lnum = 0;
+	zbr->offs = 0;
+	zbr->len = 0;
+
+	if (unlikely(err))
+		return ERR_PTR(err);
+	return zn;
+}
+
+/**
+ * lnc_add - add a leaf node to the leaf node cache.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ *
+ * Leaf nodes are non-index nodes directory entry nodes or data nodes. The
+ * purpose of the leaf node cache is to save re-reading the same leaf node over
+ * and over again. Most things are cached by VFS, however the file system must
+ * cache directory entries for readdir and for resolving hash collisions. The
+ * present implementation of the leaf node cache is extremely simple, and
+ * allows for error returns that are not used but that may be needed if a more
+ * complex implementation is created.
+ *
+ * Note, this function does not add the @node object to LNC directly, but
+ * allocates a copy of the object and adds the copy to LNC. The reason for this
+ * is that @node has been allocated outside of the TNC subsystem and will be
+ * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC
+ * may be changed at any time, e.g. freed by the shrinker.
+ */
+static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+		   const void *node)
+{
+	int err;
+	void *lnc_node;
+	const struct ubifs_dent_node *dent = node;
+
+	ubifs_assert(!zbr->leaf);
+	ubifs_assert(zbr->len != 0);
+	ubifs_assert(is_hash_key(c, &zbr->key));
+
+	err = ubifs_validate_entry(c, dent);
+	if (err) {
+		dump_stack();
+		ubifs_dump_node(c, dent);
+		return err;
+	}
+
+	lnc_node = kmemdup(node, zbr->len, GFP_NOFS);
+	if (!lnc_node)
+		/* We don't have to have the cache, so no error */
+		return 0;
+
+	zbr->leaf = lnc_node;
+	return 0;
+}
+
+ /**
+ * lnc_add_directly - add a leaf node to the leaf-node-cache.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ *
+ * This function is similar to 'lnc_add()', but it does not create a copy of
+ * @node but inserts @node to TNC directly.
+ */
+static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			    void *node)
+{
+	int err;
+
+	ubifs_assert(!zbr->leaf);
+	ubifs_assert(zbr->len != 0);
+
+	err = ubifs_validate_entry(c, node);
+	if (err) {
+		dump_stack();
+		ubifs_dump_node(c, node);
+		return err;
+	}
+
+	zbr->leaf = node;
+	return 0;
+}
+
+/**
+ * lnc_free - remove a leaf node from the leaf node cache.
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ */
+static void lnc_free(struct ubifs_zbranch *zbr)
+{
+	if (!zbr->leaf)
+		return;
+	kfree(zbr->leaf);
+	zbr->leaf = NULL;
+}
+
+/**
+ * tnc_read_node_nm - read a "hashed" leaf node.
+ * @c: UBIFS file-system description object
+ * @zbr: key and position of the node
+ * @node: node is returned here
+ *
+ * This function reads a "hashed" node defined by @zbr from the leaf node cache
+ * (in it is there) or from the hash media, in which case the node is also
+ * added to LNC. Returns zero in case of success or a negative negative error
+ * code in case of failure.
+ */
+static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			    void *node)
+{
+	int err;
+
+	ubifs_assert(is_hash_key(c, &zbr->key));
+
+	if (zbr->leaf) {
+		/* Read from the leaf node cache */
+		ubifs_assert(zbr->len != 0);
+		memcpy(node, zbr->leaf, zbr->len);
+		return 0;
+	}
+
+	err = ubifs_tnc_read_node(c, zbr, node);
+	if (err)
+		return err;
+
+	/* Add the node to the leaf node cache */
+	err = lnc_add(c, zbr, node);
+	return err;
+}
+
+/**
+ * try_read_node - read a node if it is a node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length (not aligned)
+ * @lnum: LEB number of node to read
+ * @offs: offset of node to read
+ *
+ * This function tries to read a node of known type and length, checks it and
+ * stores it in @buf. This function returns %1 if a node is present and %0 if
+ * a node is not present. A negative error code is returned for I/O errors.
+ * This function performs that same function as ubifs_read_node except that
+ * it does not require that there is actually a node present and instead
+ * the return code indicates if a node was read.
+ *
+ * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
+ * is true (it is controlled by corresponding mount option). However, if
+ * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to
+ * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is
+ * because during mounting or re-mounting from R/O mode to R/W mode we may read
+ * journal nodes (when replying the journal or doing the recovery) and the
+ * journal nodes may potentially be corrupted, so checking is required.
+ */
+static int try_read_node(const struct ubifs_info *c, void *buf, int type,
+			 int len, int lnum, int offs)
+{
+	int err, node_len;
+	struct ubifs_ch *ch = buf;
+	uint32_t crc, node_crc;
+
+	dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+
+	err = ubifs_leb_read(c, lnum, buf, offs, len, 1);
+	if (err) {
+		ubifs_err(c, "cannot read node type %d from LEB %d:%d, error %d",
+			  type, lnum, offs, err);
+		return err;
+	}
+
+	if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
+		return 0;
+
+	if (ch->node_type != type)
+		return 0;
+
+	node_len = le32_to_cpu(ch->len);
+	if (node_len != len)
+		return 0;
+
+	if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting &&
+	    !c->remounting_rw)
+		return 1;
+
+	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
+	node_crc = le32_to_cpu(ch->crc);
+	if (crc != node_crc)
+		return 0;
+
+	return 1;
+}
+
+/**
+ * fallible_read_node - try to read a leaf node.
+ * @c: UBIFS file-system description object
+ * @key:  key of node to read
+ * @zbr:  position of node
+ * @node: node returned
+ *
+ * This function tries to read a node and returns %1 if the node is read, %0
+ * if the node is not present, and a negative error code in the case of error.
+ */
+static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
+			      struct ubifs_zbranch *zbr, void *node)
+{
+	int ret;
+
+	dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
+
+	ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
+			    zbr->offs);
+	if (ret == 1) {
+		union ubifs_key node_key;
+		struct ubifs_dent_node *dent = node;
+
+		/* All nodes have key in the same place */
+		key_read(c, &dent->key, &node_key);
+		if (keys_cmp(c, key, &node_key) != 0)
+			ret = 0;
+	}
+	if (ret == 0 && c->replaying)
+		dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
+			zbr->lnum, zbr->offs, zbr->len);
+	return ret;
+}
+
+/**
+ * matches_name - determine if a direntry or xattr entry matches a given name.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of dent
+ * @nm: name to match
+ *
+ * This function checks if xentry/direntry referred by zbranch @zbr matches name
+ * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by
+ * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case
+ * of failure, a negative error code is returned.
+ */
+static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			const struct qstr *nm)
+{
+	struct ubifs_dent_node *dent;
+	int nlen, err;
+
+	/* If possible, match against the dent in the leaf node cache */
+	if (!zbr->leaf) {
+		dent = kmalloc(zbr->len, GFP_NOFS);
+		if (!dent)
+			return -ENOMEM;
+
+		err = ubifs_tnc_read_node(c, zbr, dent);
+		if (err)
+			goto out_free;
+
+		/* Add the node to the leaf node cache */
+		err = lnc_add_directly(c, zbr, dent);
+		if (err)
+			goto out_free;
+	} else
+		dent = zbr->leaf;
+
+	nlen = le16_to_cpu(dent->nlen);
+	err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+	if (err == 0) {
+		if (nlen == nm->len)
+			return NAME_MATCHES;
+		else if (nlen < nm->len)
+			return NAME_LESS;
+		else
+			return NAME_GREATER;
+	} else if (err < 0)
+		return NAME_LESS;
+	else
+		return NAME_GREATER;
+
+out_free:
+	kfree(dent);
+	return err;
+}
+
+/**
+ * get_znode - get a TNC znode that may not be loaded yet.
+ * @c: UBIFS file-system description object
+ * @znode: parent znode
+ * @n: znode branch slot number
+ *
+ * This function returns the znode or a negative error code.
+ */
+static struct ubifs_znode *get_znode(struct ubifs_info *c,
+				     struct ubifs_znode *znode, int n)
+{
+	struct ubifs_zbranch *zbr;
+
+	zbr = &znode->zbranch[n];
+	if (zbr->znode)
+		znode = zbr->znode;
+	else
+		znode = ubifs_load_znode(c, zbr, znode, n);
+	return znode;
+}
+
+/**
+ * tnc_next - find next TNC entry.
+ * @c: UBIFS file-system description object
+ * @zn: znode is passed and returned here
+ * @n: znode branch slot number is passed and returned here
+ *
+ * This function returns %0 if the next TNC entry is found, %-ENOENT if there is
+ * no next entry, or a negative error code otherwise.
+ */
+static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
+{
+	struct ubifs_znode *znode = *zn;
+	int nn = *n;
+
+	nn += 1;
+	if (nn < znode->child_cnt) {
+		*n = nn;
+		return 0;
+	}
+	while (1) {
+		struct ubifs_znode *zp;
+
+		zp = znode->parent;
+		if (!zp)
+			return -ENOENT;
+		nn = znode->iip + 1;
+		znode = zp;
+		if (nn < znode->child_cnt) {
+			znode = get_znode(c, znode, nn);
+			if (IS_ERR(znode))
+				return PTR_ERR(znode);
+			while (znode->level != 0) {
+				znode = get_znode(c, znode, 0);
+				if (IS_ERR(znode))
+					return PTR_ERR(znode);
+			}
+			nn = 0;
+			break;
+		}
+	}
+	*zn = znode;
+	*n = nn;
+	return 0;
+}
+
+/**
+ * tnc_prev - find previous TNC entry.
+ * @c: UBIFS file-system description object
+ * @zn: znode is returned here
+ * @n: znode branch slot number is passed and returned here
+ *
+ * This function returns %0 if the previous TNC entry is found, %-ENOENT if
+ * there is no next entry, or a negative error code otherwise.
+ */
+static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
+{
+	struct ubifs_znode *znode = *zn;
+	int nn = *n;
+
+	if (nn > 0) {
+		*n = nn - 1;
+		return 0;
+	}
+	while (1) {
+		struct ubifs_znode *zp;
+
+		zp = znode->parent;
+		if (!zp)
+			return -ENOENT;
+		nn = znode->iip - 1;
+		znode = zp;
+		if (nn >= 0) {
+			znode = get_znode(c, znode, nn);
+			if (IS_ERR(znode))
+				return PTR_ERR(znode);
+			while (znode->level != 0) {
+				nn = znode->child_cnt - 1;
+				znode = get_znode(c, znode, nn);
+				if (IS_ERR(znode))
+					return PTR_ERR(znode);
+			}
+			nn = znode->child_cnt - 1;
+			break;
+		}
+	}
+	*zn = znode;
+	*n = nn;
+	return 0;
+}
+
+/**
+ * resolve_collision - resolve a collision.
+ * @c: UBIFS file-system description object
+ * @key: key of a directory or extended attribute entry
+ * @zn: znode is returned here
+ * @n: zbranch number is passed and returned here
+ * @nm: name of the entry
+ *
+ * This function is called for "hashed" keys to make sure that the found key
+ * really corresponds to the looked up node (directory or extended attribute
+ * entry). It returns %1 and sets @zn and @n if the collision is resolved.
+ * %0 is returned if @nm is not found and @zn and @n are set to the previous
+ * entry, i.e. to the entry after which @nm could follow if it were in TNC.
+ * This means that @n may be set to %-1 if the leftmost key in @zn is the
+ * previous one. A negative error code is returned on failures.
+ */
+static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
+			     struct ubifs_znode **zn, int *n,
+			     const struct qstr *nm)
+{
+	int err;
+
+	err = matches_name(c, &(*zn)->zbranch[*n], nm);
+	if (unlikely(err < 0))
+		return err;
+	if (err == NAME_MATCHES)
+		return 1;
+
+	if (err == NAME_GREATER) {
+		/* Look left */
+		while (1) {
+			err = tnc_prev(c, zn, n);
+			if (err == -ENOENT) {
+				ubifs_assert(*n == 0);
+				*n = -1;
+				return 0;
+			}
+			if (err < 0)
+				return err;
+			if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
+				/*
+				 * We have found the branch after which we would
+				 * like to insert, but inserting in this znode
+				 * may still be wrong. Consider the following 3
+				 * znodes, in the case where we are resolving a
+				 * collision with Key2.
+				 *
+				 *                  znode zp
+				 *            ----------------------
+				 * level 1     |  Key0  |  Key1  |
+				 *            -----------------------
+				 *                 |            |
+				 *       znode za  |            |  znode zb
+				 *          ------------      ------------
+				 * level 0  |  Key0  |        |  Key2  |
+				 *          ------------      ------------
+				 *
+				 * The lookup finds Key2 in znode zb. Lets say
+				 * there is no match and the name is greater so
+				 * we look left. When we find Key0, we end up
+				 * here. If we return now, we will insert into
+				 * znode za at slot n = 1.  But that is invalid
+				 * according to the parent's keys.  Key2 must
+				 * be inserted into znode zb.
+				 *
+				 * Note, this problem is not relevant for the
+				 * case when we go right, because
+				 * 'tnc_insert()' would correct the parent key.
+				 */
+				if (*n == (*zn)->child_cnt - 1) {
+					err = tnc_next(c, zn, n);
+					if (err) {
+						/* Should be impossible */
+						ubifs_assert(0);
+						if (err == -ENOENT)
+							err = -EINVAL;
+						return err;
+					}
+					ubifs_assert(*n == 0);
+					*n = -1;
+				}
+				return 0;
+			}
+			err = matches_name(c, &(*zn)->zbranch[*n], nm);
+			if (err < 0)
+				return err;
+			if (err == NAME_LESS)
+				return 0;
+			if (err == NAME_MATCHES)
+				return 1;
+			ubifs_assert(err == NAME_GREATER);
+		}
+	} else {
+		int nn = *n;
+		struct ubifs_znode *znode = *zn;
+
+		/* Look right */
+		while (1) {
+			err = tnc_next(c, &znode, &nn);
+			if (err == -ENOENT)
+				return 0;
+			if (err < 0)
+				return err;
+			if (keys_cmp(c, &znode->zbranch[nn].key, key))
+				return 0;
+			err = matches_name(c, &znode->zbranch[nn], nm);
+			if (err < 0)
+				return err;
+			if (err == NAME_GREATER)
+				return 0;
+			*zn = znode;
+			*n = nn;
+			if (err == NAME_MATCHES)
+				return 1;
+			ubifs_assert(err == NAME_LESS);
+		}
+	}
+}
+
+/**
+ * fallible_matches_name - determine if a dent matches a given name.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of dent
+ * @nm: name to match
+ *
+ * This is a "fallible" version of 'matches_name()' function which does not
+ * panic if the direntry/xentry referred by @zbr does not exist on the media.
+ *
+ * This function checks if xentry/direntry referred by zbranch @zbr matches name
+ * @nm. Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr
+ * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA
+ * if xentry/direntry referred by @zbr does not exist on the media. A negative
+ * error code is returned in case of failure.
+ */
+static int fallible_matches_name(struct ubifs_info *c,
+				 struct ubifs_zbranch *zbr,
+				 const struct qstr *nm)
+{
+	struct ubifs_dent_node *dent;
+	int nlen, err;
+
+	/* If possible, match against the dent in the leaf node cache */
+	if (!zbr->leaf) {
+		dent = kmalloc(zbr->len, GFP_NOFS);
+		if (!dent)
+			return -ENOMEM;
+
+		err = fallible_read_node(c, &zbr->key, zbr, dent);
+		if (err < 0)
+			goto out_free;
+		if (err == 0) {
+			/* The node was not present */
+			err = NOT_ON_MEDIA;
+			goto out_free;
+		}
+		ubifs_assert(err == 1);
+
+		err = lnc_add_directly(c, zbr, dent);
+		if (err)
+			goto out_free;
+	} else
+		dent = zbr->leaf;
+
+	nlen = le16_to_cpu(dent->nlen);
+	err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+	if (err == 0) {
+		if (nlen == nm->len)
+			return NAME_MATCHES;
+		else if (nlen < nm->len)
+			return NAME_LESS;
+		else
+			return NAME_GREATER;
+	} else if (err < 0)
+		return NAME_LESS;
+	else
+		return NAME_GREATER;
+
+out_free:
+	kfree(dent);
+	return err;
+}
+
+/**
+ * fallible_resolve_collision - resolve a collision even if nodes are missing.
+ * @c: UBIFS file-system description object
+ * @key: key
+ * @zn: znode is returned here
+ * @n: branch number is passed and returned here
+ * @nm: name of directory entry
+ * @adding: indicates caller is adding a key to the TNC
+ *
+ * This is a "fallible" version of the 'resolve_collision()' function which
+ * does not panic if one of the nodes referred to by TNC does not exist on the
+ * media. This may happen when replaying the journal if a deleted node was
+ * Garbage-collected and the commit was not done. A branch that refers to a node
+ * that is not present is called a dangling branch. The following are the return
+ * codes for this function:
+ *  o if @nm was found, %1 is returned and @zn and @n are set to the found
+ *    branch;
+ *  o if we are @adding and @nm was not found, %0 is returned;
+ *  o if we are not @adding and @nm was not found, but a dangling branch was
+ *    found, then %1 is returned and @zn and @n are set to the dangling branch;
+ *  o a negative error code is returned in case of failure.
+ */
+static int fallible_resolve_collision(struct ubifs_info *c,
+				      const union ubifs_key *key,
+				      struct ubifs_znode **zn, int *n,
+				      const struct qstr *nm, int adding)
+{
+	struct ubifs_znode *o_znode = NULL, *znode = *zn;
+	int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
+
+	cmp = fallible_matches_name(c, &znode->zbranch[nn], nm);
+	if (unlikely(cmp < 0))
+		return cmp;
+	if (cmp == NAME_MATCHES)
+		return 1;
+	if (cmp == NOT_ON_MEDIA) {
+		o_znode = znode;
+		o_n = nn;
+		/*
+		 * We are unlucky and hit a dangling branch straight away.
+		 * Now we do not really know where to go to find the needed
+		 * branch - to the left or to the right. Well, let's try left.
+		 */
+		unsure = 1;
+	} else if (!adding)
+		unsure = 1; /* Remove a dangling branch wherever it is */
+
+	if (cmp == NAME_GREATER || unsure) {
+		/* Look left */
+		while (1) {
+			err = tnc_prev(c, zn, n);
+			if (err == -ENOENT) {
+				ubifs_assert(*n == 0);
+				*n = -1;
+				break;
+			}
+			if (err < 0)
+				return err;
+			if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
+				/* See comments in 'resolve_collision()' */
+				if (*n == (*zn)->child_cnt - 1) {
+					err = tnc_next(c, zn, n);
+					if (err) {
+						/* Should be impossible */
+						ubifs_assert(0);
+						if (err == -ENOENT)
+							err = -EINVAL;
+						return err;
+					}
+					ubifs_assert(*n == 0);
+					*n = -1;
+				}
+				break;
+			}
+			err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm);
+			if (err < 0)
+				return err;
+			if (err == NAME_MATCHES)
+				return 1;
+			if (err == NOT_ON_MEDIA) {
+				o_znode = *zn;
+				o_n = *n;
+				continue;
+			}
+			if (!adding)
+				continue;
+			if (err == NAME_LESS)
+				break;
+			else
+				unsure = 0;
+		}
+	}
+
+	if (cmp == NAME_LESS || unsure) {
+		/* Look right */
+		*zn = znode;
+		*n = nn;
+		while (1) {
+			err = tnc_next(c, &znode, &nn);
+			if (err == -ENOENT)
+				break;
+			if (err < 0)
+				return err;
+			if (keys_cmp(c, &znode->zbranch[nn].key, key))
+				break;
+			err = fallible_matches_name(c, &znode->zbranch[nn], nm);
+			if (err < 0)
+				return err;
+			if (err == NAME_GREATER)
+				break;
+			*zn = znode;
+			*n = nn;
+			if (err == NAME_MATCHES)
+				return 1;
+			if (err == NOT_ON_MEDIA) {
+				o_znode = znode;
+				o_n = nn;
+			}
+		}
+	}
+
+	/* Never match a dangling branch when adding */
+	if (adding || !o_znode)
+		return 0;
+
+	dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
+		o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
+		o_znode->zbranch[o_n].len);
+	*zn = o_znode;
+	*n = o_n;
+	return 1;
+}
+
+/**
+ * matches_position - determine if a zbranch matches a given position.
+ * @zbr: zbranch of dent
+ * @lnum: LEB number of dent to match
+ * @offs: offset of dent to match
+ *
+ * This function returns %1 if @lnum:@offs matches, and %0 otherwise.
+ */
+static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs)
+{
+	if (zbr->lnum == lnum && zbr->offs == offs)
+		return 1;
+	else
+		return 0;
+}
+
+/**
+ * resolve_collision_directly - resolve a collision directly.
+ * @c: UBIFS file-system description object
+ * @key: key of directory entry
+ * @zn: znode is passed and returned here
+ * @n: zbranch number is passed and returned here
+ * @lnum: LEB number of dent node to match
+ * @offs: offset of dent node to match
+ *
+ * This function is used for "hashed" keys to make sure the found directory or
+ * extended attribute entry node is what was looked for. It is used when the
+ * flash address of the right node is known (@lnum:@offs) which makes it much
+ * easier to resolve collisions (no need to read entries and match full
+ * names). This function returns %1 and sets @zn and @n if the collision is
+ * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the
+ * previous directory entry. Otherwise a negative error code is returned.
+ */
+static int resolve_collision_directly(struct ubifs_info *c,
+				      const union ubifs_key *key,
+				      struct ubifs_znode **zn, int *n,
+				      int lnum, int offs)
+{
+	struct ubifs_znode *znode;
+	int nn, err;
+
+	znode = *zn;
+	nn = *n;
+	if (matches_position(&znode->zbranch[nn], lnum, offs))
+		return 1;
+
+	/* Look left */
+	while (1) {
+		err = tnc_prev(c, &znode, &nn);
+		if (err == -ENOENT)
+			break;
+		if (err < 0)
+			return err;
+		if (keys_cmp(c, &znode->zbranch[nn].key, key))
+			break;
+		if (matches_position(&znode->zbranch[nn], lnum, offs)) {
+			*zn = znode;
+			*n = nn;
+			return 1;
+		}
+	}
+
+	/* Look right */
+	znode = *zn;
+	nn = *n;
+	while (1) {
+		err = tnc_next(c, &znode, &nn);
+		if (err == -ENOENT)
+			return 0;
+		if (err < 0)
+			return err;
+		if (keys_cmp(c, &znode->zbranch[nn].key, key))
+			return 0;
+		*zn = znode;
+		*n = nn;
+		if (matches_position(&znode->zbranch[nn], lnum, offs))
+			return 1;
+	}
+}
+
+/**
+ * dirty_cow_bottom_up - dirty a znode and its ancestors.
+ * @c: UBIFS file-system description object
+ * @znode: znode to dirty
+ *
+ * If we do not have a unique key that resides in a znode, then we cannot
+ * dirty that znode from the top down (i.e. by using lookup_level0_dirty)
+ * This function records the path back to the last dirty ancestor, and then
+ * dirties the znodes on that path.
+ */
+static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
+					       struct ubifs_znode *znode)
+{
+	struct ubifs_znode *zp;
+	int *path = c->bottom_up_buf, p = 0;
+
+	ubifs_assert(c->zroot.znode);
+	ubifs_assert(znode);
+	if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) {
+		kfree(c->bottom_up_buf);
+		c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int),
+					   GFP_NOFS);
+		if (!c->bottom_up_buf)
+			return ERR_PTR(-ENOMEM);
+		path = c->bottom_up_buf;
+	}
+	if (c->zroot.znode->level) {
+		/* Go up until parent is dirty */
+		while (1) {
+			int n;
+
+			zp = znode->parent;
+			if (!zp)
+				break;
+			n = znode->iip;
+			ubifs_assert(p < c->zroot.znode->level);
+			path[p++] = n;
+			if (!zp->cnext && ubifs_zn_dirty(znode))
+				break;
+			znode = zp;
+		}
+	}
+
+	/* Come back down, dirtying as we go */
+	while (1) {
+		struct ubifs_zbranch *zbr;
+
+		zp = znode->parent;
+		if (zp) {
+			ubifs_assert(path[p - 1] >= 0);
+			ubifs_assert(path[p - 1] < zp->child_cnt);
+			zbr = &zp->zbranch[path[--p]];
+			znode = dirty_cow_znode(c, zbr);
+		} else {
+			ubifs_assert(znode == c->zroot.znode);
+			znode = dirty_cow_znode(c, &c->zroot);
+		}
+		if (IS_ERR(znode) || !p)
+			break;
+		ubifs_assert(path[p - 1] >= 0);
+		ubifs_assert(path[p - 1] < znode->child_cnt);
+		znode = znode->zbranch[path[p - 1]].znode;
+	}
+
+	return znode;
+}
+
+/**
+ * ubifs_lookup_level0 - search for zero-level znode.
+ * @c: UBIFS file-system description object
+ * @key:  key to lookup
+ * @zn: znode is returned here
+ * @n: znode branch slot number is returned here
+ *
+ * This function looks up the TNC tree and search for zero-level znode which
+ * refers key @key. The found zero-level znode is returned in @zn. There are 3
+ * cases:
+ *   o exact match, i.e. the found zero-level znode contains key @key, then %1
+ *     is returned and slot number of the matched branch is stored in @n;
+ *   o not exact match, which means that zero-level znode does not contain
+ *     @key, then %0 is returned and slot number of the closest branch is stored
+ *     in @n;
+ *   o @key is so small that it is even less than the lowest key of the
+ *     leftmost zero-level node, then %0 is returned and %0 is stored in @n.
+ *
+ * Note, when the TNC tree is traversed, some znodes may be absent, then this
+ * function reads corresponding indexing nodes and inserts them to TNC. In
+ * case of failure, a negative error code is returned.
+ */
+int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
+			struct ubifs_znode **zn, int *n)
+{
+	int err, exact;
+	struct ubifs_znode *znode;
+	unsigned long time = get_seconds();
+
+	dbg_tnck(key, "search key ");
+	ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
+
+	znode = c->zroot.znode;
+	if (unlikely(!znode)) {
+		znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+		if (IS_ERR(znode))
+			return PTR_ERR(znode);
+	}
+
+	znode->time = time;
+
+	while (1) {
+		struct ubifs_zbranch *zbr;
+
+		exact = ubifs_search_zbranch(c, znode, key, n);
+
+		if (znode->level == 0)
+			break;
+
+		if (*n < 0)
+			*n = 0;
+		zbr = &znode->zbranch[*n];
+
+		if (zbr->znode) {
+			znode->time = time;
+			znode = zbr->znode;
+			continue;
+		}
+
+		/* znode is not in TNC cache, load it from the media */
+		znode = ubifs_load_znode(c, zbr, znode, *n);
+		if (IS_ERR(znode))
+			return PTR_ERR(znode);
+	}
+
+	*zn = znode;
+	if (exact || !is_hash_key(c, key) || *n != -1) {
+		dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
+		return exact;
+	}
+
+	/*
+	 * Here is a tricky place. We have not found the key and this is a
+	 * "hashed" key, which may collide. The rest of the code deals with
+	 * situations like this:
+	 *
+	 *                  | 3 | 5 |
+	 *                  /       \
+	 *          | 3 | 5 |      | 6 | 7 | (x)
+	 *
+	 * Or more a complex example:
+	 *
+	 *                | 1 | 5 |
+	 *                /       \
+	 *       | 1 | 3 |         | 5 | 8 |
+	 *              \           /
+	 *          | 5 | 5 |   | 6 | 7 | (x)
+	 *
+	 * In the examples, if we are looking for key "5", we may reach nodes
+	 * marked with "(x)". In this case what we have do is to look at the
+	 * left and see if there is "5" key there. If there is, we have to
+	 * return it.
+	 *
+	 * Note, this whole situation is possible because we allow to have
+	 * elements which are equivalent to the next key in the parent in the
+	 * children of current znode. For example, this happens if we split a
+	 * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something
+	 * like this:
+	 *                      | 3 | 5 |
+	 *                       /     \
+	 *                | 3 | 5 |   | 5 | 6 | 7 |
+	 *                              ^
+	 * And this becomes what is at the first "picture" after key "5" marked
+	 * with "^" is removed. What could be done is we could prohibit
+	 * splitting in the middle of the colliding sequence. Also, when
+	 * removing the leftmost key, we would have to correct the key of the
+	 * parent node, which would introduce additional complications. Namely,
+	 * if we changed the leftmost key of the parent znode, the garbage
+	 * collector would be unable to find it (GC is doing this when GC'ing
+	 * indexing LEBs). Although we already have an additional RB-tree where
+	 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
+	 * after the commit. But anyway, this does not look easy to implement
+	 * so we did not try this.
+	 */
+	err = tnc_prev(c, &znode, n);
+	if (err == -ENOENT) {
+		dbg_tnc("found 0, lvl %d, n -1", znode->level);
+		*n = -1;
+		return 0;
+	}
+	if (unlikely(err < 0))
+		return err;
+	if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
+		dbg_tnc("found 0, lvl %d, n -1", znode->level);
+		*n = -1;
+		return 0;
+	}
+
+	dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
+	*zn = znode;
+	return 1;
+}
+
+/**
+ * lookup_level0_dirty - search for zero-level znode dirtying.
+ * @c: UBIFS file-system description object
+ * @key:  key to lookup
+ * @zn: znode is returned here
+ * @n: znode branch slot number is returned here
+ *
+ * This function looks up the TNC tree and search for zero-level znode which
+ * refers key @key. The found zero-level znode is returned in @zn. There are 3
+ * cases:
+ *   o exact match, i.e. the found zero-level znode contains key @key, then %1
+ *     is returned and slot number of the matched branch is stored in @n;
+ *   o not exact match, which means that zero-level znode does not contain @key
+ *     then %0 is returned and slot number of the closed branch is stored in
+ *     @n;
+ *   o @key is so small that it is even less than the lowest key of the
+ *     leftmost zero-level node, then %0 is returned and %-1 is stored in @n.
+ *
+ * Additionally all znodes in the path from the root to the located zero-level
+ * znode are marked as dirty.
+ *
+ * Note, when the TNC tree is traversed, some znodes may be absent, then this
+ * function reads corresponding indexing nodes and inserts them to TNC. In
+ * case of failure, a negative error code is returned.
+ */
+static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
+			       struct ubifs_znode **zn, int *n)
+{
+	int err, exact;
+	struct ubifs_znode *znode;
+	unsigned long time = get_seconds();
+
+	dbg_tnck(key, "search and dirty key ");
+
+	znode = c->zroot.znode;
+	if (unlikely(!znode)) {
+		znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+		if (IS_ERR(znode))
+			return PTR_ERR(znode);
+	}
+
+	znode = dirty_cow_znode(c, &c->zroot);
+	if (IS_ERR(znode))
+		return PTR_ERR(znode);
+
+	znode->time = time;
+
+	while (1) {
+		struct ubifs_zbranch *zbr;
+
+		exact = ubifs_search_zbranch(c, znode, key, n);
+
+		if (znode->level == 0)
+			break;
+
+		if (*n < 0)
+			*n = 0;
+		zbr = &znode->zbranch[*n];
+
+		if (zbr->znode) {
+			znode->time = time;
+			znode = dirty_cow_znode(c, zbr);
+			if (IS_ERR(znode))
+				return PTR_ERR(znode);
+			continue;
+		}
+
+		/* znode is not in TNC cache, load it from the media */
+		znode = ubifs_load_znode(c, zbr, znode, *n);
+		if (IS_ERR(znode))
+			return PTR_ERR(znode);
+		znode = dirty_cow_znode(c, zbr);
+		if (IS_ERR(znode))
+			return PTR_ERR(znode);
+	}
+
+	*zn = znode;
+	if (exact || !is_hash_key(c, key) || *n != -1) {
+		dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
+		return exact;
+	}
+
+	/*
+	 * See huge comment at 'lookup_level0_dirty()' what is the rest of the
+	 * code.
+	 */
+	err = tnc_prev(c, &znode, n);
+	if (err == -ENOENT) {
+		*n = -1;
+		dbg_tnc("found 0, lvl %d, n -1", znode->level);
+		return 0;
+	}
+	if (unlikely(err < 0))
+		return err;
+	if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
+		*n = -1;
+		dbg_tnc("found 0, lvl %d, n -1", znode->level);
+		return 0;
+	}
+
+	if (znode->cnext || !ubifs_zn_dirty(znode)) {
+		znode = dirty_cow_bottom_up(c, znode);
+		if (IS_ERR(znode))
+			return PTR_ERR(znode);
+	}
+
+	dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
+	*zn = znode;
+	return 1;
+}
+
+/**
+ * maybe_leb_gced - determine if a LEB may have been garbage collected.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @gc_seq1: garbage collection sequence number
+ *
+ * This function determines if @lnum may have been garbage collected since
+ * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
+ * %0 is returned.
+ */
+static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
+{
+	int gc_seq2, gced_lnum;
+
+	gced_lnum = c->gced_lnum;
+	smp_rmb();
+	gc_seq2 = c->gc_seq;
+	/* Same seq means no GC */
+	if (gc_seq1 == gc_seq2)
+		return 0;
+	/* Different by more than 1 means we don't know */
+	if (gc_seq1 + 1 != gc_seq2)
+		return 1;
+	/*
+	 * We have seen the sequence number has increased by 1. Now we need to
+	 * be sure we read the right LEB number, so read it again.
+	 */
+	smp_rmb();
+	if (gced_lnum != c->gced_lnum)
+		return 1;
+	/* Finally we can check lnum */
+	if (gced_lnum == lnum)
+		return 1;
+	return 0;
+}
+
+/**
+ * ubifs_tnc_locate - look up a file-system node and return it and its location.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @lnum: LEB number is returned here
+ * @offs: offset is returned here
+ *
+ * This function looks up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure. The node location can be returned in @lnum and @offs.
+ */
+int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
+		     void *node, int *lnum, int *offs)
+{
+	int found, n, err, safely = 0, gc_seq1;
+	struct ubifs_znode *znode;
+	struct ubifs_zbranch zbr, *zt;
+
+again:
+	mutex_lock(&c->tnc_mutex);
+	found = ubifs_lookup_level0(c, key, &znode, &n);
+	if (!found) {
+		err = -ENOENT;
+		goto out;
+	} else if (found < 0) {
+		err = found;
+		goto out;
+	}
+	zt = &znode->zbranch[n];
+	if (lnum) {
+		*lnum = zt->lnum;
+		*offs = zt->offs;
+	}
+	if (is_hash_key(c, key)) {
+		/*
+		 * In this case the leaf node cache gets used, so we pass the
+		 * address of the zbranch and keep the mutex locked
+		 */
+		err = tnc_read_node_nm(c, zt, node);
+		goto out;
+	}
+	if (safely) {
+		err = ubifs_tnc_read_node(c, zt, node);
+		goto out;
+	}
+	/* Drop the TNC mutex prematurely and race with garbage collection */
+	zbr = znode->zbranch[n];
+	gc_seq1 = c->gc_seq;
+	mutex_unlock(&c->tnc_mutex);
+
+	if (ubifs_get_wbuf(c, zbr.lnum)) {
+		/* We do not GC journal heads */
+		err = ubifs_tnc_read_node(c, &zbr, node);
+		return err;
+	}
+
+	err = fallible_read_node(c, key, &zbr, node);
+	if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
+		/*
+		 * The node may have been GC'ed out from under us so try again
+		 * while keeping the TNC mutex locked.
+		 */
+		safely = 1;
+		goto again;
+	}
+	return 0;
+
+out:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * ubifs_tnc_get_bu_keys - lookup keys for bulk-read.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read parameters and results
+ *
+ * Lookup consecutive data node keys for the same inode that reside
+ * consecutively in the same LEB. This function returns zero in case of success
+ * and a negative error code in case of failure.
+ *
+ * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
+ * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
+ * maximum possible amount of nodes for bulk-read.
+ */
+int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
+{
+	int n, err = 0, lnum = -1, uninitialized_var(offs);
+	int uninitialized_var(len);
+	unsigned int block = key_block(c, &bu->key);
+	struct ubifs_znode *znode;
+
+	bu->cnt = 0;
+	bu->blk_cnt = 0;
+	bu->eof = 0;
+
+	mutex_lock(&c->tnc_mutex);
+	/* Find first key */
+	err = ubifs_lookup_level0(c, &bu->key, &znode, &n);
+	if (err < 0)
+		goto out;
+	if (err) {
+		/* Key found */
+		len = znode->zbranch[n].len;
+		/* The buffer must be big enough for at least 1 node */
+		if (len > bu->buf_len) {
+			err = -EINVAL;
+			goto out;
+		}
+		/* Add this key */
+		bu->zbranch[bu->cnt++] = znode->zbranch[n];
+		bu->blk_cnt += 1;
+		lnum = znode->zbranch[n].lnum;
+		offs = ALIGN(znode->zbranch[n].offs + len, 8);
+	}
+	while (1) {
+		struct ubifs_zbranch *zbr;
+		union ubifs_key *key;
+		unsigned int next_block;
+
+		/* Find next key */
+		err = tnc_next(c, &znode, &n);
+		if (err)
+			goto out;
+		zbr = &znode->zbranch[n];
+		key = &zbr->key;
+		/* See if there is another data key for this file */
+		if (key_inum(c, key) != key_inum(c, &bu->key) ||
+		    key_type(c, key) != UBIFS_DATA_KEY) {
+			err = -ENOENT;
+			goto out;
+		}
+		if (lnum < 0) {
+			/* First key found */
+			lnum = zbr->lnum;
+			offs = ALIGN(zbr->offs + zbr->len, 8);
+			len = zbr->len;
+			if (len > bu->buf_len) {
+				err = -EINVAL;
+				goto out;
+			}
+		} else {
+			/*
+			 * The data nodes must be in consecutive positions in
+			 * the same LEB.
+			 */
+			if (zbr->lnum != lnum || zbr->offs != offs)
+				goto out;
+			offs += ALIGN(zbr->len, 8);
+			len = ALIGN(len, 8) + zbr->len;
+			/* Must not exceed buffer length */
+			if (len > bu->buf_len)
+				goto out;
+		}
+		/* Allow for holes */
+		next_block = key_block(c, key);
+		bu->blk_cnt += (next_block - block - 1);
+		if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
+		block = next_block;
+		/* Add this key */
+		bu->zbranch[bu->cnt++] = *zbr;
+		bu->blk_cnt += 1;
+		/* See if we have room for more */
+		if (bu->cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
+		if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
+	}
+out:
+	if (err == -ENOENT) {
+		bu->eof = 1;
+		err = 0;
+	}
+	bu->gc_seq = c->gc_seq;
+	mutex_unlock(&c->tnc_mutex);
+	if (err)
+		return err;
+	/*
+	 * An enormous hole could cause bulk-read to encompass too many
+	 * page cache pages, so limit the number here.
+	 */
+	if (bu->blk_cnt > UBIFS_MAX_BULK_READ)
+		bu->blk_cnt = UBIFS_MAX_BULK_READ;
+	/*
+	 * Ensure that bulk-read covers a whole number of page cache
+	 * pages.
+	 */
+	if (UBIFS_BLOCKS_PER_PAGE == 1 ||
+	    !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1)))
+		return 0;
+	if (bu->eof) {
+		/* At the end of file we can round up */
+		bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1;
+		return 0;
+	}
+	/* Exclude data nodes that do not make up a whole page cache page */
+	block = key_block(c, &bu->key) + bu->blk_cnt;
+	block &= ~(UBIFS_BLOCKS_PER_PAGE - 1);
+	while (bu->cnt) {
+		if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block)
+			break;
+		bu->cnt -= 1;
+	}
+	return 0;
+}
+
+/**
+ * read_wbuf - bulk-read from a LEB with a wbuf.
+ * @wbuf: wbuf that may overlap the read
+ * @buf: buffer into which to read
+ * @len: read length
+ * @lnum: LEB number from which to read
+ * @offs: offset from which to read
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum,
+		     int offs)
+{
+	const struct ubifs_info *c = wbuf->c;
+	int rlen, overlap;
+
+	dbg_io("LEB %d:%d, length %d", lnum, offs, len);
+	ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(!(offs & 7) && offs < c->leb_size);
+	ubifs_assert(offs + len <= c->leb_size);
+
+	spin_lock(&wbuf->lock);
+	overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
+	if (!overlap) {
+		/* We may safely unlock the write-buffer and read the data */
+		spin_unlock(&wbuf->lock);
+		return ubifs_leb_read(c, lnum, buf, offs, len, 0);
+	}
+
+	/* Don't read under wbuf */
+	rlen = wbuf->offs - offs;
+	if (rlen < 0)
+		rlen = 0;
+
+	/* Copy the rest from the write-buffer */
+	memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
+	spin_unlock(&wbuf->lock);
+
+	if (rlen > 0)
+		/* Read everything that goes before write-buffer */
+		return ubifs_leb_read(c, lnum, buf, offs, rlen, 0);
+
+	return 0;
+}
+
+/**
+ * validate_data_node - validate data nodes for bulk-read.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing data node to validate
+ * @zbr: zbranch of data node to validate
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+static int validate_data_node(struct ubifs_info *c, void *buf,
+			      struct ubifs_zbranch *zbr)
+{
+	union ubifs_key key1;
+	struct ubifs_ch *ch = buf;
+	int err, len;
+
+	if (ch->node_type != UBIFS_DATA_NODE) {
+		ubifs_err(c, "bad node type (%d but expected %d)",
+			  ch->node_type, UBIFS_DATA_NODE);
+		goto out_err;
+	}
+
+	err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0);
+	if (err) {
+		ubifs_err(c, "expected node type %d", UBIFS_DATA_NODE);
+		goto out;
+	}
+
+	len = le32_to_cpu(ch->len);
+	if (len != zbr->len) {
+		ubifs_err(c, "bad node length %d, expected %d", len, zbr->len);
+		goto out_err;
+	}
+
+	/* Make sure the key of the read node is correct */
+	key_read(c, buf + UBIFS_KEY_OFFSET, &key1);
+	if (!keys_eq(c, &zbr->key, &key1)) {
+		ubifs_err(c, "bad key in node at LEB %d:%d",
+			  zbr->lnum, zbr->offs);
+		dbg_tnck(&zbr->key, "looked for key ");
+		dbg_tnck(&key1, "found node's key ");
+		goto out_err;
+	}
+
+	return 0;
+
+out_err:
+	err = -EINVAL;
+out:
+	ubifs_err(c, "bad node at LEB %d:%d", zbr->lnum, zbr->offs);
+	ubifs_dump_node(c, buf);
+	dump_stack();
+	return err;
+}
+
+/**
+ * ubifs_tnc_bulk_read - read a number of data nodes in one go.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read parameters and results
+ *
+ * This functions reads and validates the data nodes that were identified by the
+ * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success,
+ * -EAGAIN to indicate a race with GC, or another negative error code on
+ * failure.
+ */
+int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
+{
+	int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i;
+	struct ubifs_wbuf *wbuf;
+	void *buf;
+
+	len = bu->zbranch[bu->cnt - 1].offs;
+	len += bu->zbranch[bu->cnt - 1].len - offs;
+	if (len > bu->buf_len) {
+		ubifs_err(c, "buffer too small %d vs %d", bu->buf_len, len);
+		return -EINVAL;
+	}
+
+	/* Do the read */
+	wbuf = ubifs_get_wbuf(c, lnum);
+	if (wbuf)
+		err = read_wbuf(wbuf, bu->buf, len, lnum, offs);
+	else
+		err = ubifs_leb_read(c, lnum, bu->buf, offs, len, 0);
+
+	/* Check for a race with GC */
+	if (maybe_leb_gced(c, lnum, bu->gc_seq))
+		return -EAGAIN;
+
+	if (err && err != -EBADMSG) {
+		ubifs_err(c, "failed to read from LEB %d:%d, error %d",
+			  lnum, offs, err);
+		dump_stack();
+		dbg_tnck(&bu->key, "key ");
+		return err;
+	}
+
+	/* Validate the nodes read */
+	buf = bu->buf;
+	for (i = 0; i < bu->cnt; i++) {
+		err = validate_data_node(c, buf, &bu->zbranch[i]);
+		if (err)
+			return err;
+		buf = buf + ALIGN(bu->zbranch[i].len, 8);
+	}
+
+	return 0;
+}
+
+/**
+ * do_lookup_nm- look up a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @nm: node name
+ *
+ * This function look up and reads a node which contains name hash in the key.
+ * Since the hash may have collisions, there may be many nodes with the same
+ * key, so we have to sequentially look to all of them until the needed one is
+ * found. This function returns zero in case of success, %-ENOENT if the node
+ * was not found, and a negative error code in case of failure.
+ */
+static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+			void *node, const struct qstr *nm)
+{
+	int found, n, err;
+	struct ubifs_znode *znode;
+
+	dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
+	mutex_lock(&c->tnc_mutex);
+	found = ubifs_lookup_level0(c, key, &znode, &n);
+	if (!found) {
+		err = -ENOENT;
+		goto out_unlock;
+	} else if (found < 0) {
+		err = found;
+		goto out_unlock;
+	}
+
+	ubifs_assert(n >= 0);
+
+	err = resolve_collision(c, key, &znode, &n, nm);
+	dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n);
+	if (unlikely(err < 0))
+		goto out_unlock;
+	if (err == 0) {
+		err = -ENOENT;
+		goto out_unlock;
+	}
+
+	err = tnc_read_node_nm(c, &znode->zbranch[n], node);
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * ubifs_tnc_lookup_nm - look up a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @nm: node name
+ *
+ * This function look up and reads a node which contains name hash in the key.
+ * Since the hash may have collisions, there may be many nodes with the same
+ * key, so we have to sequentially look to all of them until the needed one is
+ * found. This function returns zero in case of success, %-ENOENT if the node
+ * was not found, and a negative error code in case of failure.
+ */
+int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+			void *node, const struct qstr *nm)
+{
+	int err, len;
+	const struct ubifs_dent_node *dent = node;
+
+	/*
+	 * We assume that in most of the cases there are no name collisions and
+	 * 'ubifs_tnc_lookup()' returns us the right direntry.
+	 */
+	err = ubifs_tnc_lookup(c, key, node);
+	if (err)
+		return err;
+
+	len = le16_to_cpu(dent->nlen);
+	if (nm->len == len && !memcmp(dent->name, nm->name, len))
+		return 0;
+
+	/*
+	 * Unluckily, there are hash collisions and we have to iterate over
+	 * them look at each direntry with colliding name hash sequentially.
+	 */
+	return do_lookup_nm(c, key, node, nm);
+}
+
+/**
+ * correct_parent_keys - correct parent znodes' keys.
+ * @c: UBIFS file-system description object
+ * @znode: znode to correct parent znodes for
+ *
+ * This is a helper function for 'tnc_insert()'. When the key of the leftmost
+ * zbranch changes, keys of parent znodes have to be corrected. This helper
+ * function is called in such situations and corrects the keys if needed.
+ */
+static void correct_parent_keys(const struct ubifs_info *c,
+				struct ubifs_znode *znode)
+{
+	union ubifs_key *key, *key1;
+
+	ubifs_assert(znode->parent);
+	ubifs_assert(znode->iip == 0);
+
+	key = &znode->zbranch[0].key;
+	key1 = &znode->parent->zbranch[0].key;
+
+	while (keys_cmp(c, key, key1) < 0) {
+		key_copy(c, key, key1);
+		znode = znode->parent;
+		znode->alt = 1;
+		if (!znode->parent || znode->iip)
+			break;
+		key1 = &znode->parent->zbranch[0].key;
+	}
+}
+
+/**
+ * insert_zbranch - insert a zbranch into a znode.
+ * @znode: znode into which to insert
+ * @zbr: zbranch to insert
+ * @n: slot number to insert to
+ *
+ * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in
+ * znode's array of zbranches and keeps zbranches consolidated, so when a new
+ * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th
+ * slot, zbranches starting from @n have to be moved right.
+ */
+static void insert_zbranch(struct ubifs_znode *znode,
+			   const struct ubifs_zbranch *zbr, int n)
+{
+	int i;
+
+	ubifs_assert(ubifs_zn_dirty(znode));
+
+	if (znode->level) {
+		for (i = znode->child_cnt; i > n; i--) {
+			znode->zbranch[i] = znode->zbranch[i - 1];
+			if (znode->zbranch[i].znode)
+				znode->zbranch[i].znode->iip = i;
+		}
+		if (zbr->znode)
+			zbr->znode->iip = n;
+	} else
+		for (i = znode->child_cnt; i > n; i--)
+			znode->zbranch[i] = znode->zbranch[i - 1];
+
+	znode->zbranch[n] = *zbr;
+	znode->child_cnt += 1;
+
+	/*
+	 * After inserting at slot zero, the lower bound of the key range of
+	 * this znode may have changed. If this znode is subsequently split
+	 * then the upper bound of the key range may change, and furthermore
+	 * it could change to be lower than the original lower bound. If that
+	 * happens, then it will no longer be possible to find this znode in the
+	 * TNC using the key from the index node on flash. That is bad because
+	 * if it is not found, we will assume it is obsolete and may overwrite
+	 * it. Then if there is an unclean unmount, we will start using the
+	 * old index which will be broken.
+	 *
+	 * So we first mark znodes that have insertions at slot zero, and then
+	 * if they are split we add their lnum/offs to the old_idx tree.
+	 */
+	if (n == 0)
+		znode->alt = 1;
+}
+
+/**
+ * tnc_insert - insert a node into TNC.
+ * @c: UBIFS file-system description object
+ * @znode: znode to insert into
+ * @zbr: branch to insert
+ * @n: slot number to insert new zbranch to
+ *
+ * This function inserts a new node described by @zbr into znode @znode. If
+ * znode does not have a free slot for new zbranch, it is split. Parent znodes
+ * are splat as well if needed. Returns zero in case of success or a negative
+ * error code in case of failure.
+ */
+static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode,
+		      struct ubifs_zbranch *zbr, int n)
+{
+	struct ubifs_znode *zn, *zi, *zp;
+	int i, keep, move, appending = 0;
+	union ubifs_key *key = &zbr->key, *key1;
+
+	ubifs_assert(n >= 0 && n <= c->fanout);
+
+	/* Implement naive insert for now */
+again:
+	zp = znode->parent;
+	if (znode->child_cnt < c->fanout) {
+		ubifs_assert(n != c->fanout);
+		dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
+
+		insert_zbranch(znode, zbr, n);
+
+		/* Ensure parent's key is correct */
+		if (n == 0 && zp && znode->iip == 0)
+			correct_parent_keys(c, znode);
+
+		return 0;
+	}
+
+	/*
+	 * Unfortunately, @znode does not have more empty slots and we have to
+	 * split it.
+	 */
+	dbg_tnck(key, "splitting level %d, key ", znode->level);
+
+	if (znode->alt)
+		/*
+		 * We can no longer be sure of finding this znode by key, so we
+		 * record it in the old_idx tree.
+		 */
+		ins_clr_old_idx_znode(c, znode);
+
+	zn = kzalloc(c->max_znode_sz, GFP_NOFS);
+	if (!zn)
+		return -ENOMEM;
+	zn->parent = zp;
+	zn->level = znode->level;
+
+	/* Decide where to split */
+	if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) {
+		/* Try not to split consecutive data keys */
+		if (n == c->fanout) {
+			key1 = &znode->zbranch[n - 1].key;
+			if (key_inum(c, key1) == key_inum(c, key) &&
+			    key_type(c, key1) == UBIFS_DATA_KEY)
+				appending = 1;
+		} else
+			goto check_split;
+	} else if (appending && n != c->fanout) {
+		/* Try not to split consecutive data keys */
+		appending = 0;
+check_split:
+		if (n >= (c->fanout + 1) / 2) {
+			key1 = &znode->zbranch[0].key;
+			if (key_inum(c, key1) == key_inum(c, key) &&
+			    key_type(c, key1) == UBIFS_DATA_KEY) {
+				key1 = &znode->zbranch[n].key;
+				if (key_inum(c, key1) != key_inum(c, key) ||
+				    key_type(c, key1) != UBIFS_DATA_KEY) {
+					keep = n;
+					move = c->fanout - keep;
+					zi = znode;
+					goto do_split;
+				}
+			}
+		}
+	}
+
+	if (appending) {
+		keep = c->fanout;
+		move = 0;
+	} else {
+		keep = (c->fanout + 1) / 2;
+		move = c->fanout - keep;
+	}
+
+	/*
+	 * Although we don't at present, we could look at the neighbors and see
+	 * if we can move some zbranches there.
+	 */
+
+	if (n < keep) {
+		/* Insert into existing znode */
+		zi = znode;
+		move += 1;
+		keep -= 1;
+	} else {
+		/* Insert into new znode */
+		zi = zn;
+		n -= keep;
+		/* Re-parent */
+		if (zn->level != 0)
+			zbr->znode->parent = zn;
+	}
+
+do_split:
+
+	__set_bit(DIRTY_ZNODE, &zn->flags);
+	atomic_long_inc(&c->dirty_zn_cnt);
+
+	zn->child_cnt = move;
+	znode->child_cnt = keep;
+
+	dbg_tnc("moving %d, keeping %d", move, keep);
+
+	/* Move zbranch */
+	for (i = 0; i < move; i++) {
+		zn->zbranch[i] = znode->zbranch[keep + i];
+		/* Re-parent */
+		if (zn->level != 0)
+			if (zn->zbranch[i].znode) {
+				zn->zbranch[i].znode->parent = zn;
+				zn->zbranch[i].znode->iip = i;
+			}
+	}
+
+	/* Insert new key and branch */
+	dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
+
+	insert_zbranch(zi, zbr, n);
+
+	/* Insert new znode (produced by spitting) into the parent */
+	if (zp) {
+		if (n == 0 && zi == znode && znode->iip == 0)
+			correct_parent_keys(c, znode);
+
+		/* Locate insertion point */
+		n = znode->iip + 1;
+
+		/* Tail recursion */
+		zbr->key = zn->zbranch[0].key;
+		zbr->znode = zn;
+		zbr->lnum = 0;
+		zbr->offs = 0;
+		zbr->len = 0;
+		znode = zp;
+
+		goto again;
+	}
+
+	/* We have to split root znode */
+	dbg_tnc("creating new zroot at level %d", znode->level + 1);
+
+	zi = kzalloc(c->max_znode_sz, GFP_NOFS);
+	if (!zi)
+		return -ENOMEM;
+
+	zi->child_cnt = 2;
+	zi->level = znode->level + 1;
+
+	__set_bit(DIRTY_ZNODE, &zi->flags);
+	atomic_long_inc(&c->dirty_zn_cnt);
+
+	zi->zbranch[0].key = znode->zbranch[0].key;
+	zi->zbranch[0].znode = znode;
+	zi->zbranch[0].lnum = c->zroot.lnum;
+	zi->zbranch[0].offs = c->zroot.offs;
+	zi->zbranch[0].len = c->zroot.len;
+	zi->zbranch[1].key = zn->zbranch[0].key;
+	zi->zbranch[1].znode = zn;
+
+	c->zroot.lnum = 0;
+	c->zroot.offs = 0;
+	c->zroot.len = 0;
+	c->zroot.znode = zi;
+
+	zn->parent = zi;
+	zn->iip = 1;
+	znode->parent = zi;
+	znode->iip = 0;
+
+	return 0;
+}
+
+/**
+ * ubifs_tnc_add - add a node to TNC.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ *
+ * This function adds a node with key @key to TNC. The node may be new or it may
+ * obsolete some existing one. Returns %0 on success or negative error code on
+ * failure.
+ */
+int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
+		  int offs, int len)
+{
+	int found, n, err = 0;
+	struct ubifs_znode *znode;
+
+	mutex_lock(&c->tnc_mutex);
+	dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
+	found = lookup_level0_dirty(c, key, &znode, &n);
+	if (!found) {
+		struct ubifs_zbranch zbr;
+
+		zbr.znode = NULL;
+		zbr.lnum = lnum;
+		zbr.offs = offs;
+		zbr.len = len;
+		key_copy(c, key, &zbr.key);
+		err = tnc_insert(c, znode, &zbr, n + 1);
+	} else if (found == 1) {
+		struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+		lnc_free(zbr);
+		err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+		zbr->lnum = lnum;
+		zbr->offs = offs;
+		zbr->len = len;
+	} else
+		err = found;
+	if (!err)
+		err = dbg_check_tnc(c, 0);
+	mutex_unlock(&c->tnc_mutex);
+
+	return err;
+}
+
+/**
+ * ubifs_tnc_replace - replace a node in the TNC only if the old node is found.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @old_lnum: LEB number of old node
+ * @old_offs: old node offset
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ *
+ * This function replaces a node with key @key in the TNC only if the old node
+ * is found.  This function is called by garbage collection when node are moved.
+ * Returns %0 on success or negative error code on failure.
+ */
+int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
+		      int old_lnum, int old_offs, int lnum, int offs, int len)
+{
+	int found, n, err = 0;
+	struct ubifs_znode *znode;
+
+	mutex_lock(&c->tnc_mutex);
+	dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
+		 old_offs, lnum, offs, len);
+	found = lookup_level0_dirty(c, key, &znode, &n);
+	if (found < 0) {
+		err = found;
+		goto out_unlock;
+	}
+
+	if (found == 1) {
+		struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+		found = 0;
+		if (zbr->lnum == old_lnum && zbr->offs == old_offs) {
+			lnc_free(zbr);
+			err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+			if (err)
+				goto out_unlock;
+			zbr->lnum = lnum;
+			zbr->offs = offs;
+			zbr->len = len;
+			found = 1;
+		} else if (is_hash_key(c, key)) {
+			found = resolve_collision_directly(c, key, &znode, &n,
+							   old_lnum, old_offs);
+			dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d",
+				found, znode, n, old_lnum, old_offs);
+			if (found < 0) {
+				err = found;
+				goto out_unlock;
+			}
+
+			if (found) {
+				/* Ensure the znode is dirtied */
+				if (znode->cnext || !ubifs_zn_dirty(znode)) {
+					znode = dirty_cow_bottom_up(c, znode);
+					if (IS_ERR(znode)) {
+						err = PTR_ERR(znode);
+						goto out_unlock;
+					}
+				}
+				zbr = &znode->zbranch[n];
+				lnc_free(zbr);
+				err = ubifs_add_dirt(c, zbr->lnum,
+						     zbr->len);
+				if (err)
+					goto out_unlock;
+				zbr->lnum = lnum;
+				zbr->offs = offs;
+				zbr->len = len;
+			}
+		}
+	}
+
+	if (!found)
+		err = ubifs_add_dirt(c, lnum, len);
+
+	if (!err)
+		err = dbg_check_tnc(c, 0);
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * ubifs_tnc_add_nm - add a "hashed" node to TNC.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ * @nm: node name
+ *
+ * This is the same as 'ubifs_tnc_add()' but it should be used with keys which
+ * may have collisions, like directory entry keys.
+ */
+int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
+		     int lnum, int offs, int len, const struct qstr *nm)
+{
+	int found, n, err = 0;
+	struct ubifs_znode *znode;
+
+	mutex_lock(&c->tnc_mutex);
+	dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
+		 lnum, offs, nm->len, nm->name);
+	found = lookup_level0_dirty(c, key, &znode, &n);
+	if (found < 0) {
+		err = found;
+		goto out_unlock;
+	}
+
+	if (found == 1) {
+		if (c->replaying)
+			found = fallible_resolve_collision(c, key, &znode, &n,
+							   nm, 1);
+		else
+			found = resolve_collision(c, key, &znode, &n, nm);
+		dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n);
+		if (found < 0) {
+			err = found;
+			goto out_unlock;
+		}
+
+		/* Ensure the znode is dirtied */
+		if (znode->cnext || !ubifs_zn_dirty(znode)) {
+			znode = dirty_cow_bottom_up(c, znode);
+			if (IS_ERR(znode)) {
+				err = PTR_ERR(znode);
+				goto out_unlock;
+			}
+		}
+
+		if (found == 1) {
+			struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+			lnc_free(zbr);
+			err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+			zbr->lnum = lnum;
+			zbr->offs = offs;
+			zbr->len = len;
+			goto out_unlock;
+		}
+	}
+
+	if (!found) {
+		struct ubifs_zbranch zbr;
+
+		zbr.znode = NULL;
+		zbr.lnum = lnum;
+		zbr.offs = offs;
+		zbr.len = len;
+		key_copy(c, key, &zbr.key);
+		err = tnc_insert(c, znode, &zbr, n + 1);
+		if (err)
+			goto out_unlock;
+		if (c->replaying) {
+			/*
+			 * We did not find it in the index so there may be a
+			 * dangling branch still in the index. So we remove it
+			 * by passing 'ubifs_tnc_remove_nm()' the same key but
+			 * an unmatchable name.
+			 */
+			struct qstr noname = { .name = "" };
+
+			err = dbg_check_tnc(c, 0);
+			mutex_unlock(&c->tnc_mutex);
+			if (err)
+				return err;
+			return ubifs_tnc_remove_nm(c, key, &noname);
+		}
+	}
+
+out_unlock:
+	if (!err)
+		err = dbg_check_tnc(c, 0);
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * tnc_delete - delete a znode form TNC.
+ * @c: UBIFS file-system description object
+ * @znode: znode to delete from
+ * @n: zbranch slot number to delete
+ *
+ * This function deletes a leaf node from @n-th slot of @znode. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
+{
+	struct ubifs_zbranch *zbr;
+	struct ubifs_znode *zp;
+	int i, err;
+
+	/* Delete without merge for now */
+	ubifs_assert(znode->level == 0);
+	ubifs_assert(n >= 0 && n < c->fanout);
+	dbg_tnck(&znode->zbranch[n].key, "deleting key ");
+
+	zbr = &znode->zbranch[n];
+	lnc_free(zbr);
+
+	err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+	if (err) {
+		ubifs_dump_znode(c, znode);
+		return err;
+	}
+
+	/* We do not "gap" zbranch slots */
+	for (i = n; i < znode->child_cnt - 1; i++)
+		znode->zbranch[i] = znode->zbranch[i + 1];
+	znode->child_cnt -= 1;
+
+	if (znode->child_cnt > 0)
+		return 0;
+
+	/*
+	 * This was the last zbranch, we have to delete this znode from the
+	 * parent.
+	 */
+
+	do {
+		ubifs_assert(!ubifs_zn_obsolete(znode));
+		ubifs_assert(ubifs_zn_dirty(znode));
+
+		zp = znode->parent;
+		n = znode->iip;
+
+		atomic_long_dec(&c->dirty_zn_cnt);
+
+		err = insert_old_idx_znode(c, znode);
+		if (err)
+			return err;
+
+		if (znode->cnext) {
+			__set_bit(OBSOLETE_ZNODE, &znode->flags);
+			atomic_long_inc(&c->clean_zn_cnt);
+			atomic_long_inc(&ubifs_clean_zn_cnt);
+		} else
+			kfree(znode);
+		znode = zp;
+	} while (znode->child_cnt == 1); /* while removing last child */
+
+	/* Remove from znode, entry n - 1 */
+	znode->child_cnt -= 1;
+	ubifs_assert(znode->level != 0);
+	for (i = n; i < znode->child_cnt; i++) {
+		znode->zbranch[i] = znode->zbranch[i + 1];
+		if (znode->zbranch[i].znode)
+			znode->zbranch[i].znode->iip = i;
+	}
+
+	/*
+	 * If this is the root and it has only 1 child then
+	 * collapse the tree.
+	 */
+	if (!znode->parent) {
+		while (znode->child_cnt == 1 && znode->level != 0) {
+			zp = znode;
+			zbr = &znode->zbranch[0];
+			znode = get_znode(c, znode, 0);
+			if (IS_ERR(znode))
+				return PTR_ERR(znode);
+			znode = dirty_cow_znode(c, zbr);
+			if (IS_ERR(znode))
+				return PTR_ERR(znode);
+			znode->parent = NULL;
+			znode->iip = 0;
+			if (c->zroot.len) {
+				err = insert_old_idx(c, c->zroot.lnum,
+						     c->zroot.offs);
+				if (err)
+					return err;
+			}
+			c->zroot.lnum = zbr->lnum;
+			c->zroot.offs = zbr->offs;
+			c->zroot.len = zbr->len;
+			c->zroot.znode = znode;
+			ubifs_assert(!ubifs_zn_obsolete(zp));
+			ubifs_assert(ubifs_zn_dirty(zp));
+			atomic_long_dec(&c->dirty_zn_cnt);
+
+			if (zp->cnext) {
+				__set_bit(OBSOLETE_ZNODE, &zp->flags);
+				atomic_long_inc(&c->clean_zn_cnt);
+				atomic_long_inc(&ubifs_clean_zn_cnt);
+			} else
+				kfree(zp);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ubifs_tnc_remove - remove an index entry of a node.
+ * @c: UBIFS file-system description object
+ * @key: key of node
+ *
+ * Returns %0 on success or negative error code on failure.
+ */
+int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
+{
+	int found, n, err = 0;
+	struct ubifs_znode *znode;
+
+	mutex_lock(&c->tnc_mutex);
+	dbg_tnck(key, "key ");
+	found = lookup_level0_dirty(c, key, &znode, &n);
+	if (found < 0) {
+		err = found;
+		goto out_unlock;
+	}
+	if (found == 1)
+		err = tnc_delete(c, znode, n);
+	if (!err)
+		err = dbg_check_tnc(c, 0);
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: key of node
+ * @nm: directory entry name
+ *
+ * Returns %0 on success or negative error code on failure.
+ */
+int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
+			const struct qstr *nm)
+{
+	int n, err;
+	struct ubifs_znode *znode;
+
+	mutex_lock(&c->tnc_mutex);
+	dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
+	err = lookup_level0_dirty(c, key, &znode, &n);
+	if (err < 0)
+		goto out_unlock;
+
+	if (err) {
+		if (c->replaying)
+			err = fallible_resolve_collision(c, key, &znode, &n,
+							 nm, 0);
+		else
+			err = resolve_collision(c, key, &znode, &n, nm);
+		dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n);
+		if (err < 0)
+			goto out_unlock;
+		if (err) {
+			/* Ensure the znode is dirtied */
+			if (znode->cnext || !ubifs_zn_dirty(znode)) {
+				znode = dirty_cow_bottom_up(c, znode);
+				if (IS_ERR(znode)) {
+					err = PTR_ERR(znode);
+					goto out_unlock;
+				}
+			}
+			err = tnc_delete(c, znode, n);
+		}
+	}
+
+out_unlock:
+	if (!err)
+		err = dbg_check_tnc(c, 0);
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * key_in_range - determine if a key falls within a range of keys.
+ * @c: UBIFS file-system description object
+ * @key: key to check
+ * @from_key: lowest key in range
+ * @to_key: highest key in range
+ *
+ * This function returns %1 if the key is in range and %0 otherwise.
+ */
+static int key_in_range(struct ubifs_info *c, union ubifs_key *key,
+			union ubifs_key *from_key, union ubifs_key *to_key)
+{
+	if (keys_cmp(c, key, from_key) < 0)
+		return 0;
+	if (keys_cmp(c, key, to_key) > 0)
+		return 0;
+	return 1;
+}
+
+/**
+ * ubifs_tnc_remove_range - remove index entries in range.
+ * @c: UBIFS file-system description object
+ * @from_key: lowest key to remove
+ * @to_key: highest key to remove
+ *
+ * This function removes index entries starting at @from_key and ending at
+ * @to_key.  This function returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
+			   union ubifs_key *to_key)
+{
+	int i, n, k, err = 0;
+	struct ubifs_znode *znode;
+	union ubifs_key *key;
+
+	mutex_lock(&c->tnc_mutex);
+	while (1) {
+		/* Find first level 0 znode that contains keys to remove */
+		err = ubifs_lookup_level0(c, from_key, &znode, &n);
+		if (err < 0)
+			goto out_unlock;
+
+		if (err)
+			key = from_key;
+		else {
+			err = tnc_next(c, &znode, &n);
+			if (err == -ENOENT) {
+				err = 0;
+				goto out_unlock;
+			}
+			if (err < 0)
+				goto out_unlock;
+			key = &znode->zbranch[n].key;
+			if (!key_in_range(c, key, from_key, to_key)) {
+				err = 0;
+				goto out_unlock;
+			}
+		}
+
+		/* Ensure the znode is dirtied */
+		if (znode->cnext || !ubifs_zn_dirty(znode)) {
+			znode = dirty_cow_bottom_up(c, znode);
+			if (IS_ERR(znode)) {
+				err = PTR_ERR(znode);
+				goto out_unlock;
+			}
+		}
+
+		/* Remove all keys in range except the first */
+		for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) {
+			key = &znode->zbranch[i].key;
+			if (!key_in_range(c, key, from_key, to_key))
+				break;
+			lnc_free(&znode->zbranch[i]);
+			err = ubifs_add_dirt(c, znode->zbranch[i].lnum,
+					     znode->zbranch[i].len);
+			if (err) {
+				ubifs_dump_znode(c, znode);
+				goto out_unlock;
+			}
+			dbg_tnck(key, "removing key ");
+		}
+		if (k) {
+			for (i = n + 1 + k; i < znode->child_cnt; i++)
+				znode->zbranch[i - k] = znode->zbranch[i];
+			znode->child_cnt -= k;
+		}
+
+		/* Now delete the first */
+		err = tnc_delete(c, znode, n);
+		if (err)
+			goto out_unlock;
+	}
+
+out_unlock:
+	if (!err)
+		err = dbg_check_tnc(c, 0);
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * ubifs_tnc_remove_ino - remove an inode from TNC.
+ * @c: UBIFS file-system description object
+ * @inum: inode number to remove
+ *
+ * This function remove inode @inum and all the extended attributes associated
+ * with the anode from TNC and returns zero in case of success or a negative
+ * error code in case of failure.
+ */
+int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
+{
+	union ubifs_key key1, key2;
+	struct ubifs_dent_node *xent, *pxent = NULL;
+	struct qstr nm = { .name = NULL };
+
+	dbg_tnc("ino %lu", (unsigned long)inum);
+
+	/*
+	 * Walk all extended attribute entries and remove them together with
+	 * corresponding extended attribute inodes.
+	 */
+	lowest_xent_key(c, &key1, inum);
+	while (1) {
+		ino_t xattr_inum;
+		int err;
+
+		xent = ubifs_tnc_next_ent(c, &key1, &nm);
+		if (IS_ERR(xent)) {
+			err = PTR_ERR(xent);
+			if (err == -ENOENT)
+				break;
+			return err;
+		}
+
+		xattr_inum = le64_to_cpu(xent->inum);
+		dbg_tnc("xent '%s', ino %lu", xent->name,
+			(unsigned long)xattr_inum);
+
+		nm.name = xent->name;
+		nm.len = le16_to_cpu(xent->nlen);
+		err = ubifs_tnc_remove_nm(c, &key1, &nm);
+		if (err) {
+			kfree(xent);
+			return err;
+		}
+
+		lowest_ino_key(c, &key1, xattr_inum);
+		highest_ino_key(c, &key2, xattr_inum);
+		err = ubifs_tnc_remove_range(c, &key1, &key2);
+		if (err) {
+			kfree(xent);
+			return err;
+		}
+
+		kfree(pxent);
+		pxent = xent;
+		key_read(c, &xent->key, &key1);
+	}
+
+	kfree(pxent);
+	lowest_ino_key(c, &key1, inum);
+	highest_ino_key(c, &key2, inum);
+
+	return ubifs_tnc_remove_range(c, &key1, &key2);
+}
+
+/**
+ * ubifs_tnc_next_ent - walk directory or extended attribute entries.
+ * @c: UBIFS file-system description object
+ * @key: key of last entry
+ * @nm: name of last entry found or %NULL
+ *
+ * This function finds and reads the next directory or extended attribute entry
+ * after the given key (@key) if there is one. @nm is used to resolve
+ * collisions.
+ *
+ * If the name of the current entry is not known and only the key is known,
+ * @nm->name has to be %NULL. In this case the semantics of this function is a
+ * little bit different and it returns the entry corresponding to this key, not
+ * the next one. If the key was not found, the closest "right" entry is
+ * returned.
+ *
+ * If the fist entry has to be found, @key has to contain the lowest possible
+ * key value for this inode and @name has to be %NULL.
+ *
+ * This function returns the found directory or extended attribute entry node
+ * in case of success, %-ENOENT is returned if no entry was found, and a
+ * negative error code is returned in case of failure.
+ */
+struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
+					   union ubifs_key *key,
+					   const struct qstr *nm)
+{
+	int n, err, type = key_type(c, key);
+	struct ubifs_znode *znode;
+	struct ubifs_dent_node *dent;
+	struct ubifs_zbranch *zbr;
+	union ubifs_key *dkey;
+
+	dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
+	ubifs_assert(is_hash_key(c, key));
+
+	mutex_lock(&c->tnc_mutex);
+	err = ubifs_lookup_level0(c, key, &znode, &n);
+	if (unlikely(err < 0))
+		goto out_unlock;
+
+	if (nm->name) {
+		if (err) {
+			/* Handle collisions */
+			err = resolve_collision(c, key, &znode, &n, nm);
+			dbg_tnc("rc returned %d, znode %p, n %d",
+				err, znode, n);
+			if (unlikely(err < 0))
+				goto out_unlock;
+		}
+
+		/* Now find next entry */
+		err = tnc_next(c, &znode, &n);
+		if (unlikely(err))
+			goto out_unlock;
+	} else {
+		/*
+		 * The full name of the entry was not given, in which case the
+		 * behavior of this function is a little different and it
+		 * returns current entry, not the next one.
+		 */
+		if (!err) {
+			/*
+			 * However, the given key does not exist in the TNC
+			 * tree and @znode/@n variables contain the closest
+			 * "preceding" element. Switch to the next one.
+			 */
+			err = tnc_next(c, &znode, &n);
+			if (err)
+				goto out_unlock;
+		}
+	}
+
+	zbr = &znode->zbranch[n];
+	dent = kmalloc(zbr->len, GFP_NOFS);
+	if (unlikely(!dent)) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	/*
+	 * The above 'tnc_next()' call could lead us to the next inode, check
+	 * this.
+	 */
+	dkey = &zbr->key;
+	if (key_inum(c, dkey) != key_inum(c, key) ||
+	    key_type(c, dkey) != type) {
+		err = -ENOENT;
+		goto out_free;
+	}
+
+	err = tnc_read_node_nm(c, zbr, dent);
+	if (unlikely(err))
+		goto out_free;
+
+	mutex_unlock(&c->tnc_mutex);
+	return dent;
+
+out_free:
+	kfree(dent);
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return ERR_PTR(err);
+}
+
+/**
+ * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy left-over obsolete znodes from a failed commit.
+ */
+static void tnc_destroy_cnext(struct ubifs_info *c)
+{
+	struct ubifs_znode *cnext;
+
+	if (!c->cnext)
+		return;
+	ubifs_assert(c->cmt_state == COMMIT_BROKEN);
+	cnext = c->cnext;
+	do {
+		struct ubifs_znode *znode = cnext;
+
+		cnext = cnext->cnext;
+		if (ubifs_zn_obsolete(znode))
+			kfree(znode);
+	} while (cnext && cnext != c->cnext);
+}
+
+/**
+ * ubifs_tnc_close - close TNC subsystem and free all related resources.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_tnc_close(struct ubifs_info *c)
+{
+	tnc_destroy_cnext(c);
+	if (c->zroot.znode) {
+		long n, freed;
+
+		n = atomic_long_read(&c->clean_zn_cnt);
+		freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
+		ubifs_assert(freed == n);
+		atomic_long_sub(n, &ubifs_clean_zn_cnt);
+	}
+	kfree(c->gap_lebs);
+	kfree(c->ilebs);
+	destroy_old_idx(c);
+}
+
+/**
+ * left_znode - get the znode to the left.
+ * @c: UBIFS file-system description object
+ * @znode: znode
+ *
+ * This function returns a pointer to the znode to the left of @znode or NULL if
+ * there is not one. A negative error code is returned on failure.
+ */
+static struct ubifs_znode *left_znode(struct ubifs_info *c,
+				      struct ubifs_znode *znode)
+{
+	int level = znode->level;
+
+	while (1) {
+		int n = znode->iip - 1;
+
+		/* Go up until we can go left */
+		znode = znode->parent;
+		if (!znode)
+			return NULL;
+		if (n >= 0) {
+			/* Now go down the rightmost branch to 'level' */
+			znode = get_znode(c, znode, n);
+			if (IS_ERR(znode))
+				return znode;
+			while (znode->level != level) {
+				n = znode->child_cnt - 1;
+				znode = get_znode(c, znode, n);
+				if (IS_ERR(znode))
+					return znode;
+			}
+			break;
+		}
+	}
+	return znode;
+}
+
+/**
+ * right_znode - get the znode to the right.
+ * @c: UBIFS file-system description object
+ * @znode: znode
+ *
+ * This function returns a pointer to the znode to the right of @znode or NULL
+ * if there is not one. A negative error code is returned on failure.
+ */
+static struct ubifs_znode *right_znode(struct ubifs_info *c,
+				       struct ubifs_znode *znode)
+{
+	int level = znode->level;
+
+	while (1) {
+		int n = znode->iip + 1;
+
+		/* Go up until we can go right */
+		znode = znode->parent;
+		if (!znode)
+			return NULL;
+		if (n < znode->child_cnt) {
+			/* Now go down the leftmost branch to 'level' */
+			znode = get_znode(c, znode, n);
+			if (IS_ERR(znode))
+				return znode;
+			while (znode->level != level) {
+				znode = get_znode(c, znode, 0);
+				if (IS_ERR(znode))
+					return znode;
+			}
+			break;
+		}
+	}
+	return znode;
+}
+
+/**
+ * lookup_znode - find a particular indexing node from TNC.
+ * @c: UBIFS file-system description object
+ * @key: index node key to lookup
+ * @level: index node level
+ * @lnum: index node LEB number
+ * @offs: index node offset
+ *
+ * This function searches an indexing node by its first key @key and its
+ * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
+ * nodes it traverses to TNC. This function is called for indexing nodes which
+ * were found on the media by scanning, for example when garbage-collecting or
+ * when doing in-the-gaps commit. This means that the indexing node which is
+ * looked for does not have to have exactly the same leftmost key @key, because
+ * the leftmost key may have been changed, in which case TNC will contain a
+ * dirty znode which still refers the same @lnum:@offs. This function is clever
+ * enough to recognize such indexing nodes.
+ *
+ * Note, if a znode was deleted or changed too much, then this function will
+ * not find it. For situations like this UBIFS has the old index RB-tree
+ * (indexed by @lnum:@offs).
+ *
+ * This function returns a pointer to the znode found or %NULL if it is not
+ * found. A negative error code is returned on failure.
+ */
+static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
+					union ubifs_key *key, int level,
+					int lnum, int offs)
+{
+	struct ubifs_znode *znode, *zn;
+	int n, nn;
+
+	ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
+
+	/*
+	 * The arguments have probably been read off flash, so don't assume
+	 * they are valid.
+	 */
+	if (level < 0)
+		return ERR_PTR(-EINVAL);
+
+	/* Get the root znode */
+	znode = c->zroot.znode;
+	if (!znode) {
+		znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+		if (IS_ERR(znode))
+			return znode;
+	}
+	/* Check if it is the one we are looking for */
+	if (c->zroot.lnum == lnum && c->zroot.offs == offs)
+		return znode;
+	/* Descend to the parent level i.e. (level + 1) */
+	if (level >= znode->level)
+		return NULL;
+	while (1) {
+		ubifs_search_zbranch(c, znode, key, &n);
+		if (n < 0) {
+			/*
+			 * We reached a znode where the leftmost key is greater
+			 * than the key we are searching for. This is the same
+			 * situation as the one described in a huge comment at
+			 * the end of the 'ubifs_lookup_level0()' function. And
+			 * for exactly the same reasons we have to try to look
+			 * left before giving up.
+			 */
+			znode = left_znode(c, znode);
+			if (!znode)
+				return NULL;
+			if (IS_ERR(znode))
+				return znode;
+			ubifs_search_zbranch(c, znode, key, &n);
+			ubifs_assert(n >= 0);
+		}
+		if (znode->level == level + 1)
+			break;
+		znode = get_znode(c, znode, n);
+		if (IS_ERR(znode))
+			return znode;
+	}
+	/* Check if the child is the one we are looking for */
+	if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs)
+		return get_znode(c, znode, n);
+	/* If the key is unique, there is nowhere else to look */
+	if (!is_hash_key(c, key))
+		return NULL;
+	/*
+	 * The key is not unique and so may be also in the znodes to either
+	 * side.
+	 */
+	zn = znode;
+	nn = n;
+	/* Look left */
+	while (1) {
+		/* Move one branch to the left */
+		if (n)
+			n -= 1;
+		else {
+			znode = left_znode(c, znode);
+			if (!znode)
+				break;
+			if (IS_ERR(znode))
+				return znode;
+			n = znode->child_cnt - 1;
+		}
+		/* Check it */
+		if (znode->zbranch[n].lnum == lnum &&
+		    znode->zbranch[n].offs == offs)
+			return get_znode(c, znode, n);
+		/* Stop if the key is less than the one we are looking for */
+		if (keys_cmp(c, &znode->zbranch[n].key, key) < 0)
+			break;
+	}
+	/* Back to the middle */
+	znode = zn;
+	n = nn;
+	/* Look right */
+	while (1) {
+		/* Move one branch to the right */
+		if (++n >= znode->child_cnt) {
+			znode = right_znode(c, znode);
+			if (!znode)
+				break;
+			if (IS_ERR(znode))
+				return znode;
+			n = 0;
+		}
+		/* Check it */
+		if (znode->zbranch[n].lnum == lnum &&
+		    znode->zbranch[n].offs == offs)
+			return get_znode(c, znode, n);
+		/* Stop if the key is greater than the one we are looking for */
+		if (keys_cmp(c, &znode->zbranch[n].key, key) > 0)
+			break;
+	}
+	return NULL;
+}
+
+/**
+ * is_idx_node_in_tnc - determine if an index node is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: key of index node
+ * @level: index node level
+ * @lnum: LEB number of index node
+ * @offs: offset of index node
+ *
+ * This function returns %0 if the index node is not referred to in the TNC, %1
+ * if the index node is referred to in the TNC and the corresponding znode is
+ * dirty, %2 if an index node is referred to in the TNC and the corresponding
+ * znode is clean, and a negative error code in case of failure.
+ *
+ * Note, the @key argument has to be the key of the first child. Also note,
+ * this function relies on the fact that 0:0 is never a valid LEB number and
+ * offset for a main-area node.
+ */
+int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
+		       int lnum, int offs)
+{
+	struct ubifs_znode *znode;
+
+	znode = lookup_znode(c, key, level, lnum, offs);
+	if (!znode)
+		return 0;
+	if (IS_ERR(znode))
+		return PTR_ERR(znode);
+
+	return ubifs_zn_dirty(znode) ? 1 : 2;
+}
+
+/**
+ * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @lnum: node LEB number
+ * @offs: node offset
+ *
+ * This function returns %1 if the node is referred to in the TNC, %0 if it is
+ * not, and a negative error code in case of failure.
+ *
+ * Note, this function relies on the fact that 0:0 is never a valid LEB number
+ * and offset for a main-area node.
+ */
+static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key,
+			       int lnum, int offs)
+{
+	struct ubifs_zbranch *zbr;
+	struct ubifs_znode *znode, *zn;
+	int n, found, err, nn;
+	const int unique = !is_hash_key(c, key);
+
+	found = ubifs_lookup_level0(c, key, &znode, &n);
+	if (found < 0)
+		return found; /* Error code */
+	if (!found)
+		return 0;
+	zbr = &znode->zbranch[n];
+	if (lnum == zbr->lnum && offs == zbr->offs)
+		return 1; /* Found it */
+	if (unique)
+		return 0;
+	/*
+	 * Because the key is not unique, we have to look left
+	 * and right as well
+	 */
+	zn = znode;
+	nn = n;
+	/* Look left */
+	while (1) {
+		err = tnc_prev(c, &znode, &n);
+		if (err == -ENOENT)
+			break;
+		if (err)
+			return err;
+		if (keys_cmp(c, key, &znode->zbranch[n].key))
+			break;
+		zbr = &znode->zbranch[n];
+		if (lnum == zbr->lnum && offs == zbr->offs)
+			return 1; /* Found it */
+	}
+	/* Look right */
+	znode = zn;
+	n = nn;
+	while (1) {
+		err = tnc_next(c, &znode, &n);
+		if (err) {
+			if (err == -ENOENT)
+				return 0;
+			return err;
+		}
+		if (keys_cmp(c, key, &znode->zbranch[n].key))
+			break;
+		zbr = &znode->zbranch[n];
+		if (lnum == zbr->lnum && offs == zbr->offs)
+			return 1; /* Found it */
+	}
+	return 0;
+}
+
+/**
+ * ubifs_tnc_has_node - determine whether a node is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @level: index node level (if it is an index node)
+ * @lnum: node LEB number
+ * @offs: node offset
+ * @is_idx: non-zero if the node is an index node
+ *
+ * This function returns %1 if the node is in the TNC, %0 if it is not, and a
+ * negative error code in case of failure. For index nodes, @key has to be the
+ * key of the first child. An index node is considered to be in the TNC only if
+ * the corresponding znode is clean or has not been loaded.
+ */
+int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
+		       int lnum, int offs, int is_idx)
+{
+	int err;
+
+	mutex_lock(&c->tnc_mutex);
+	if (is_idx) {
+		err = is_idx_node_in_tnc(c, key, level, lnum, offs);
+		if (err < 0)
+			goto out_unlock;
+		if (err == 1)
+			/* The index node was found but it was dirty */
+			err = 0;
+		else if (err == 2)
+			/* The index node was found and it was clean */
+			err = 1;
+		else
+			BUG_ON(err != 0);
+	} else
+		err = is_leaf_node_in_tnc(c, key, lnum, offs);
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * ubifs_dirty_idx_node - dirty an index node.
+ * @c: UBIFS file-system description object
+ * @key: index node key
+ * @level: index node level
+ * @lnum: index node LEB number
+ * @offs: index node offset
+ *
+ * This function loads and dirties an index node so that it can be garbage
+ * collected. The @key argument has to be the key of the first child. This
+ * function relies on the fact that 0:0 is never a valid LEB number and offset
+ * for a main-area node. Returns %0 on success and a negative error code on
+ * failure.
+ */
+int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
+			 int lnum, int offs)
+{
+	struct ubifs_znode *znode;
+	int err = 0;
+
+	mutex_lock(&c->tnc_mutex);
+	znode = lookup_znode(c, key, level, lnum, offs);
+	if (!znode)
+		goto out_unlock;
+	if (IS_ERR(znode)) {
+		err = PTR_ERR(znode);
+		goto out_unlock;
+	}
+	znode = dirty_cow_bottom_up(c, znode);
+	if (IS_ERR(znode)) {
+		err = PTR_ERR(znode);
+		goto out_unlock;
+	}
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * dbg_check_inode_size - check if inode size is correct.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ * @size: inode size
+ *
+ * This function makes sure that the inode size (@size) is correct and it does
+ * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL
+ * if it has a data page beyond @size, and other negative error code in case of
+ * other errors.
+ */
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+			 loff_t size)
+{
+	int err, n;
+	union ubifs_key from_key, to_key, *key;
+	struct ubifs_znode *znode;
+	unsigned int block;
+
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (!dbg_is_chk_gen(c))
+		return 0;
+
+	block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
+	data_key_init(c, &from_key, inode->i_ino, block);
+	highest_data_key(c, &to_key, inode->i_ino);
+
+	mutex_lock(&c->tnc_mutex);
+	err = ubifs_lookup_level0(c, &from_key, &znode, &n);
+	if (err < 0)
+		goto out_unlock;
+
+	if (err) {
+		key = &from_key;
+		goto out_dump;
+	}
+
+	err = tnc_next(c, &znode, &n);
+	if (err == -ENOENT) {
+		err = 0;
+		goto out_unlock;
+	}
+	if (err < 0)
+		goto out_unlock;
+
+	ubifs_assert(err == 0);
+	key = &znode->zbranch[n].key;
+	if (!key_in_range(c, key, &from_key, &to_key))
+		goto out_unlock;
+
+out_dump:
+	block = key_block(c, key);
+	ubifs_err(c, "inode %lu has size %lld, but there are data at offset %lld",
+		  (unsigned long)inode->i_ino, size,
+		  ((loff_t)block) << UBIFS_BLOCK_SHIFT);
+	mutex_unlock(&c->tnc_mutex);
+	ubifs_dump_inode(c, inode);
+	dump_stack();
+	return -EINVAL;
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
diff --git a/kernel/fs/ubifs/tnc_commit.c b/kernel/fs/ubifs/tnc_commit.c
new file mode 100644
index 000000000..b45345d70
--- /dev/null
+++ b/kernel/fs/ubifs/tnc_commit.c
@@ -0,0 +1,1071 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/* This file implements TNC functions for committing */
+
+#include <linux/random.h>
+#include "ubifs.h"
+
+/**
+ * make_idx_node - make an index node for fill-the-gaps method of TNC commit.
+ * @c: UBIFS file-system description object
+ * @idx: buffer in which to place new index node
+ * @znode: znode from which to make new index node
+ * @lnum: LEB number where new index node will be written
+ * @offs: offset where new index node will be written
+ * @len: length of new index node
+ */
+static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
+			 struct ubifs_znode *znode, int lnum, int offs, int len)
+{
+	struct ubifs_znode *zp;
+	int i, err;
+
+	/* Make index node */
+	idx->ch.node_type = UBIFS_IDX_NODE;
+	idx->child_cnt = cpu_to_le16(znode->child_cnt);
+	idx->level = cpu_to_le16(znode->level);
+	for (i = 0; i < znode->child_cnt; i++) {
+		struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+		struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+		key_write_idx(c, &zbr->key, &br->key);
+		br->lnum = cpu_to_le32(zbr->lnum);
+		br->offs = cpu_to_le32(zbr->offs);
+		br->len = cpu_to_le32(zbr->len);
+		if (!zbr->lnum || !zbr->len) {
+			ubifs_err(c, "bad ref in znode");
+			ubifs_dump_znode(c, znode);
+			if (zbr->znode)
+				ubifs_dump_znode(c, zbr->znode);
+		}
+	}
+	ubifs_prepare_node(c, idx, len, 0);
+
+	znode->lnum = lnum;
+	znode->offs = offs;
+	znode->len = len;
+
+	err = insert_old_idx_znode(c, znode);
+
+	/* Update the parent */
+	zp = znode->parent;
+	if (zp) {
+		struct ubifs_zbranch *zbr;
+
+		zbr = &zp->zbranch[znode->iip];
+		zbr->lnum = lnum;
+		zbr->offs = offs;
+		zbr->len = len;
+	} else {
+		c->zroot.lnum = lnum;
+		c->zroot.offs = offs;
+		c->zroot.len = len;
+	}
+	c->calc_idx_sz += ALIGN(len, 8);
+
+	atomic_long_dec(&c->dirty_zn_cnt);
+
+	ubifs_assert(ubifs_zn_dirty(znode));
+	ubifs_assert(ubifs_zn_cow(znode));
+
+	/*
+	 * Note, unlike 'write_index()' we do not add memory barriers here
+	 * because this function is called with @c->tnc_mutex locked.
+	 */
+	__clear_bit(DIRTY_ZNODE, &znode->flags);
+	__clear_bit(COW_ZNODE, &znode->flags);
+
+	return err;
+}
+
+/**
+ * fill_gap - make index nodes in gaps in dirty index LEBs.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number that gap appears in
+ * @gap_start: offset of start of gap
+ * @gap_end: offset of end of gap
+ * @dirt: adds dirty space to this
+ *
+ * This function returns the number of index nodes written into the gap.
+ */
+static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end,
+		    int *dirt)
+{
+	int len, gap_remains, gap_pos, written, pad_len;
+
+	ubifs_assert((gap_start & 7) == 0);
+	ubifs_assert((gap_end & 7) == 0);
+	ubifs_assert(gap_end >= gap_start);
+
+	gap_remains = gap_end - gap_start;
+	if (!gap_remains)
+		return 0;
+	gap_pos = gap_start;
+	written = 0;
+	while (c->enext) {
+		len = ubifs_idx_node_sz(c, c->enext->child_cnt);
+		if (len < gap_remains) {
+			struct ubifs_znode *znode = c->enext;
+			const int alen = ALIGN(len, 8);
+			int err;
+
+			ubifs_assert(alen <= gap_remains);
+			err = make_idx_node(c, c->ileb_buf + gap_pos, znode,
+					    lnum, gap_pos, len);
+			if (err)
+				return err;
+			gap_remains -= alen;
+			gap_pos += alen;
+			c->enext = znode->cnext;
+			if (c->enext == c->cnext)
+				c->enext = NULL;
+			written += 1;
+		} else
+			break;
+	}
+	if (gap_end == c->leb_size) {
+		c->ileb_len = ALIGN(gap_pos, c->min_io_size);
+		/* Pad to end of min_io_size */
+		pad_len = c->ileb_len - gap_pos;
+	} else
+		/* Pad to end of gap */
+		pad_len = gap_remains;
+	dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d",
+	       lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len);
+	ubifs_pad(c, c->ileb_buf + gap_pos, pad_len);
+	*dirt += pad_len;
+	return written;
+}
+
+/**
+ * find_old_idx - find an index node obsoleted since the last commit start.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ *
+ * Returns %1 if found and %0 otherwise.
+ */
+static int find_old_idx(struct ubifs_info *c, int lnum, int offs)
+{
+	struct ubifs_old_idx *o;
+	struct rb_node *p;
+
+	p = c->old_idx.rb_node;
+	while (p) {
+		o = rb_entry(p, struct ubifs_old_idx, rb);
+		if (lnum < o->lnum)
+			p = p->rb_left;
+		else if (lnum > o->lnum)
+			p = p->rb_right;
+		else if (offs < o->offs)
+			p = p->rb_left;
+		else if (offs > o->offs)
+			p = p->rb_right;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+/**
+ * is_idx_node_in_use - determine if an index node can be overwritten.
+ * @c: UBIFS file-system description object
+ * @key: key of index node
+ * @level: index node level
+ * @lnum: LEB number of index node
+ * @offs: offset of index node
+ *
+ * If @key / @lnum / @offs identify an index node that was not part of the old
+ * index, then this function returns %0 (obsolete).  Else if the index node was
+ * part of the old index but is now dirty %1 is returned, else if it is clean %2
+ * is returned. A negative error code is returned on failure.
+ */
+static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
+			      int level, int lnum, int offs)
+{
+	int ret;
+
+	ret = is_idx_node_in_tnc(c, key, level, lnum, offs);
+	if (ret < 0)
+		return ret; /* Error code */
+	if (ret == 0)
+		if (find_old_idx(c, lnum, offs))
+			return 1;
+	return ret;
+}
+
+/**
+ * layout_leb_in_gaps - layout index nodes using in-the-gaps method.
+ * @c: UBIFS file-system description object
+ * @p: return LEB number here
+ *
+ * This function lays out new index nodes for dirty znodes using in-the-gaps
+ * method of TNC commit.
+ * This function merely puts the next znode into the next gap, making no attempt
+ * to try to maximise the number of znodes that fit.
+ * This function returns the number of index nodes written into the gaps, or a
+ * negative error code on failure.
+ */
+static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written;
+
+	tot_written = 0;
+	/* Get an index LEB with lots of obsolete index nodes */
+	lnum = ubifs_find_dirty_idx_leb(c);
+	if (lnum < 0)
+		/*
+		 * There also may be dirt in the index head that could be
+		 * filled, however we do not check there at present.
+		 */
+		return lnum; /* Error code */
+	*p = lnum;
+	dbg_gc("LEB %d", lnum);
+	/*
+	 * Scan the index LEB.  We use the generic scan for this even though
+	 * it is more comprehensive and less efficient than is needed for this
+	 * purpose.
+	 */
+	sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0);
+	c->ileb_len = 0;
+	if (IS_ERR(sleb))
+		return PTR_ERR(sleb);
+	gap_start = 0;
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		struct ubifs_idx_node *idx;
+		int in_use, level;
+
+		ubifs_assert(snod->type == UBIFS_IDX_NODE);
+		idx = snod->node;
+		key_read(c, ubifs_idx_key(c, idx), &snod->key);
+		level = le16_to_cpu(idx->level);
+		/* Determine if the index node is in use (not obsolete) */
+		in_use = is_idx_node_in_use(c, &snod->key, level, lnum,
+					    snod->offs);
+		if (in_use < 0) {
+			ubifs_scan_destroy(sleb);
+			return in_use; /* Error code */
+		}
+		if (in_use) {
+			if (in_use == 1)
+				dirt += ALIGN(snod->len, 8);
+			/*
+			 * The obsolete index nodes form gaps that can be
+			 * overwritten.  This gap has ended because we have
+			 * found an index node that is still in use
+			 * i.e. not obsolete
+			 */
+			gap_end = snod->offs;
+			/* Try to fill gap */
+			written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
+			if (written < 0) {
+				ubifs_scan_destroy(sleb);
+				return written; /* Error code */
+			}
+			tot_written += written;
+			gap_start = ALIGN(snod->offs + snod->len, 8);
+		}
+	}
+	ubifs_scan_destroy(sleb);
+	c->ileb_len = c->leb_size;
+	gap_end = c->leb_size;
+	/* Try to fill gap */
+	written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
+	if (written < 0)
+		return written; /* Error code */
+	tot_written += written;
+	if (tot_written == 0) {
+		struct ubifs_lprops lp;
+
+		dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
+		err = ubifs_read_one_lp(c, lnum, &lp);
+		if (err)
+			return err;
+		if (lp.free == c->leb_size) {
+			/*
+			 * We must have snatched this LEB from the idx_gc list
+			 * so we need to correct the free and dirty space.
+			 */
+			err = ubifs_change_one_lp(c, lnum,
+						  c->leb_size - c->ileb_len,
+						  dirt, 0, 0, 0);
+			if (err)
+				return err;
+		}
+		return 0;
+	}
+	err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt,
+				  0, 0, 0);
+	if (err)
+		return err;
+	err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len);
+	if (err)
+		return err;
+	dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
+	return tot_written;
+}
+
+/**
+ * get_leb_cnt - calculate the number of empty LEBs needed to commit.
+ * @c: UBIFS file-system description object
+ * @cnt: number of znodes to commit
+ *
+ * This function returns the number of empty LEBs needed to commit @cnt znodes
+ * to the current index head.  The number is not exact and may be more than
+ * needed.
+ */
+static int get_leb_cnt(struct ubifs_info *c, int cnt)
+{
+	int d;
+
+	/* Assume maximum index node size (i.e. overestimate space needed) */
+	cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz;
+	if (cnt < 0)
+		cnt = 0;
+	d = c->leb_size / c->max_idx_node_sz;
+	return DIV_ROUND_UP(cnt, d);
+}
+
+/**
+ * layout_in_gaps - in-the-gaps method of committing TNC.
+ * @c: UBIFS file-system description object
+ * @cnt: number of dirty znodes to commit.
+ *
+ * This function lays out new index nodes for dirty znodes using in-the-gaps
+ * method of TNC commit.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_in_gaps(struct ubifs_info *c, int cnt)
+{
+	int err, leb_needed_cnt, written, *p;
+
+	dbg_gc("%d znodes to write", cnt);
+
+	c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS);
+	if (!c->gap_lebs)
+		return -ENOMEM;
+
+	p = c->gap_lebs;
+	do {
+		ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs);
+		written = layout_leb_in_gaps(c, p);
+		if (written < 0) {
+			err = written;
+			if (err != -ENOSPC) {
+				kfree(c->gap_lebs);
+				c->gap_lebs = NULL;
+				return err;
+			}
+			if (!dbg_is_chk_index(c)) {
+				/*
+				 * Do not print scary warnings if the debugging
+				 * option which forces in-the-gaps is enabled.
+				 */
+				ubifs_warn(c, "out of space");
+				ubifs_dump_budg(c, &c->bi);
+				ubifs_dump_lprops(c);
+			}
+			/* Try to commit anyway */
+			break;
+		}
+		p++;
+		cnt -= written;
+		leb_needed_cnt = get_leb_cnt(c, cnt);
+		dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt,
+		       leb_needed_cnt, c->ileb_cnt);
+	} while (leb_needed_cnt > c->ileb_cnt);
+
+	*p = -1;
+	return 0;
+}
+
+/**
+ * layout_in_empty_space - layout index nodes in empty space.
+ * @c: UBIFS file-system description object
+ *
+ * This function lays out new index nodes for dirty znodes using empty LEBs.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_in_empty_space(struct ubifs_info *c)
+{
+	struct ubifs_znode *znode, *cnext, *zp;
+	int lnum, offs, len, next_len, buf_len, buf_offs, used, avail;
+	int wlen, blen, err;
+
+	cnext = c->enext;
+	if (!cnext)
+		return 0;
+
+	lnum = c->ihead_lnum;
+	buf_offs = c->ihead_offs;
+
+	buf_len = ubifs_idx_node_sz(c, c->fanout);
+	buf_len = ALIGN(buf_len, c->min_io_size);
+	used = 0;
+	avail = buf_len;
+
+	/* Ensure there is enough room for first write */
+	next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+	if (buf_offs + next_len > c->leb_size)
+		lnum = -1;
+
+	while (1) {
+		znode = cnext;
+
+		len = ubifs_idx_node_sz(c, znode->child_cnt);
+
+		/* Determine the index node position */
+		if (lnum == -1) {
+			if (c->ileb_nxt >= c->ileb_cnt) {
+				ubifs_err(c, "out of space");
+				return -ENOSPC;
+			}
+			lnum = c->ilebs[c->ileb_nxt++];
+			buf_offs = 0;
+			used = 0;
+			avail = buf_len;
+		}
+
+		offs = buf_offs + used;
+
+		znode->lnum = lnum;
+		znode->offs = offs;
+		znode->len = len;
+
+		/* Update the parent */
+		zp = znode->parent;
+		if (zp) {
+			struct ubifs_zbranch *zbr;
+			int i;
+
+			i = znode->iip;
+			zbr = &zp->zbranch[i];
+			zbr->lnum = lnum;
+			zbr->offs = offs;
+			zbr->len = len;
+		} else {
+			c->zroot.lnum = lnum;
+			c->zroot.offs = offs;
+			c->zroot.len = len;
+		}
+		c->calc_idx_sz += ALIGN(len, 8);
+
+		/*
+		 * Once lprops is updated, we can decrease the dirty znode count
+		 * but it is easier to just do it here.
+		 */
+		atomic_long_dec(&c->dirty_zn_cnt);
+
+		/*
+		 * Calculate the next index node length to see if there is
+		 * enough room for it
+		 */
+		cnext = znode->cnext;
+		if (cnext == c->cnext)
+			next_len = 0;
+		else
+			next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+
+		/* Update buffer positions */
+		wlen = used + len;
+		used += ALIGN(len, 8);
+		avail -= ALIGN(len, 8);
+
+		if (next_len != 0 &&
+		    buf_offs + used + next_len <= c->leb_size &&
+		    avail > 0)
+			continue;
+
+		if (avail <= 0 && next_len &&
+		    buf_offs + used + next_len <= c->leb_size)
+			blen = buf_len;
+		else
+			blen = ALIGN(wlen, c->min_io_size);
+
+		/* The buffer is full or there are no more znodes to do */
+		buf_offs += blen;
+		if (next_len) {
+			if (buf_offs + next_len > c->leb_size) {
+				err = ubifs_update_one_lp(c, lnum,
+					c->leb_size - buf_offs, blen - used,
+					0, 0);
+				if (err)
+					return err;
+				lnum = -1;
+			}
+			used -= blen;
+			if (used < 0)
+				used = 0;
+			avail = buf_len - used;
+			continue;
+		}
+		err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs,
+					  blen - used, 0, 0);
+		if (err)
+			return err;
+		break;
+	}
+
+	c->dbg->new_ihead_lnum = lnum;
+	c->dbg->new_ihead_offs = buf_offs;
+
+	return 0;
+}
+
+/**
+ * layout_commit - determine positions of index nodes to commit.
+ * @c: UBIFS file-system description object
+ * @no_space: indicates that insufficient empty LEBs were allocated
+ * @cnt: number of znodes to commit
+ *
+ * Calculate and update the positions of index nodes to commit.  If there were
+ * an insufficient number of empty LEBs allocated, then index nodes are placed
+ * into the gaps created by obsolete index nodes in non-empty index LEBs.  For
+ * this purpose, an obsolete index node is one that was not in the index as at
+ * the end of the last commit.  To write "in-the-gaps" requires that those index
+ * LEBs are updated atomically in-place.
+ */
+static int layout_commit(struct ubifs_info *c, int no_space, int cnt)
+{
+	int err;
+
+	if (no_space) {
+		err = layout_in_gaps(c, cnt);
+		if (err)
+			return err;
+	}
+	err = layout_in_empty_space(c);
+	return err;
+}
+
+/**
+ * find_first_dirty - find first dirty znode.
+ * @znode: znode to begin searching from
+ */
+static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode)
+{
+	int i, cont;
+
+	if (!znode)
+		return NULL;
+
+	while (1) {
+		if (znode->level == 0) {
+			if (ubifs_zn_dirty(znode))
+				return znode;
+			return NULL;
+		}
+		cont = 0;
+		for (i = 0; i < znode->child_cnt; i++) {
+			struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+			if (zbr->znode && ubifs_zn_dirty(zbr->znode)) {
+				znode = zbr->znode;
+				cont = 1;
+				break;
+			}
+		}
+		if (!cont) {
+			if (ubifs_zn_dirty(znode))
+				return znode;
+			return NULL;
+		}
+	}
+}
+
+/**
+ * find_next_dirty - find next dirty znode.
+ * @znode: znode to begin searching from
+ */
+static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode)
+{
+	int n = znode->iip + 1;
+
+	znode = znode->parent;
+	if (!znode)
+		return NULL;
+	for (; n < znode->child_cnt; n++) {
+		struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+		if (zbr->znode && ubifs_zn_dirty(zbr->znode))
+			return find_first_dirty(zbr->znode);
+	}
+	return znode;
+}
+
+/**
+ * get_znodes_to_commit - create list of dirty znodes to commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of znodes to commit.
+ */
+static int get_znodes_to_commit(struct ubifs_info *c)
+{
+	struct ubifs_znode *znode, *cnext;
+	int cnt = 0;
+
+	c->cnext = find_first_dirty(c->zroot.znode);
+	znode = c->enext = c->cnext;
+	if (!znode) {
+		dbg_cmt("no znodes to commit");
+		return 0;
+	}
+	cnt += 1;
+	while (1) {
+		ubifs_assert(!ubifs_zn_cow(znode));
+		__set_bit(COW_ZNODE, &znode->flags);
+		znode->alt = 0;
+		cnext = find_next_dirty(znode);
+		if (!cnext) {
+			znode->cnext = c->cnext;
+			break;
+		}
+		znode->cnext = cnext;
+		znode = cnext;
+		cnt += 1;
+	}
+	dbg_cmt("committing %d znodes", cnt);
+	ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt));
+	return cnt;
+}
+
+/**
+ * alloc_idx_lebs - allocate empty LEBs to be used to commit.
+ * @c: UBIFS file-system description object
+ * @cnt: number of znodes to commit
+ *
+ * This function returns %-ENOSPC if it cannot allocate a sufficient number of
+ * empty LEBs.  %0 is returned on success, otherwise a negative error code
+ * is returned.
+ */
+static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
+{
+	int i, leb_cnt, lnum;
+
+	c->ileb_cnt = 0;
+	c->ileb_nxt = 0;
+	leb_cnt = get_leb_cnt(c, cnt);
+	dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt);
+	if (!leb_cnt)
+		return 0;
+	c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS);
+	if (!c->ilebs)
+		return -ENOMEM;
+	for (i = 0; i < leb_cnt; i++) {
+		lnum = ubifs_find_free_leb_for_idx(c);
+		if (lnum < 0)
+			return lnum;
+		c->ilebs[c->ileb_cnt++] = lnum;
+		dbg_cmt("LEB %d", lnum);
+	}
+	if (dbg_is_chk_index(c) && !(prandom_u32() & 7))
+		return -ENOSPC;
+	return 0;
+}
+
+/**
+ * free_unused_idx_lebs - free unused LEBs that were allocated for the commit.
+ * @c: UBIFS file-system description object
+ *
+ * It is possible that we allocate more empty LEBs for the commit than we need.
+ * This functions frees the surplus.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int free_unused_idx_lebs(struct ubifs_info *c)
+{
+	int i, err = 0, lnum, er;
+
+	for (i = c->ileb_nxt; i < c->ileb_cnt; i++) {
+		lnum = c->ilebs[i];
+		dbg_cmt("LEB %d", lnum);
+		er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+					 LPROPS_INDEX | LPROPS_TAKEN, 0);
+		if (!err)
+			err = er;
+	}
+	return err;
+}
+
+/**
+ * free_idx_lebs - free unused LEBs after commit end.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int free_idx_lebs(struct ubifs_info *c)
+{
+	int err;
+
+	err = free_unused_idx_lebs(c);
+	kfree(c->ilebs);
+	c->ilebs = NULL;
+	return err;
+}
+
+/**
+ * ubifs_tnc_start_commit - start TNC commit.
+ * @c: UBIFS file-system description object
+ * @zroot: new index root position is returned here
+ *
+ * This function prepares the list of indexing nodes to commit and lays out
+ * their positions on flash. If there is not enough free space it uses the
+ * in-gap commit method. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+	int err = 0, cnt;
+
+	mutex_lock(&c->tnc_mutex);
+	err = dbg_check_tnc(c, 1);
+	if (err)
+		goto out;
+	cnt = get_znodes_to_commit(c);
+	if (cnt != 0) {
+		int no_space = 0;
+
+		err = alloc_idx_lebs(c, cnt);
+		if (err == -ENOSPC)
+			no_space = 1;
+		else if (err)
+			goto out_free;
+		err = layout_commit(c, no_space, cnt);
+		if (err)
+			goto out_free;
+		ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
+		err = free_unused_idx_lebs(c);
+		if (err)
+			goto out;
+	}
+	destroy_old_idx(c);
+	memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch));
+
+	err = ubifs_save_dirty_idx_lnums(c);
+	if (err)
+		goto out;
+
+	spin_lock(&c->space_lock);
+	/*
+	 * Although we have not finished committing yet, update size of the
+	 * committed index ('c->bi.old_idx_sz') and zero out the index growth
+	 * budget. It is OK to do this now, because we've reserved all the
+	 * space which is needed to commit the index, and it is save for the
+	 * budgeting subsystem to assume the index is already committed,
+	 * even though it is not.
+	 */
+	ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+	c->bi.old_idx_sz = c->calc_idx_sz;
+	c->bi.uncommitted_idx = 0;
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	spin_unlock(&c->space_lock);
+	mutex_unlock(&c->tnc_mutex);
+
+	dbg_cmt("number of index LEBs %d", c->lst.idx_lebs);
+	dbg_cmt("size of index %llu", c->calc_idx_sz);
+	return err;
+
+out_free:
+	free_idx_lebs(c);
+out:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * write_index - write index nodes.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes the index nodes whose positions were laid out in the
+ * layout_in_empty_space function.
+ */
+static int write_index(struct ubifs_info *c)
+{
+	struct ubifs_idx_node *idx;
+	struct ubifs_znode *znode, *cnext;
+	int i, lnum, offs, len, next_len, buf_len, buf_offs, used;
+	int avail, wlen, err, lnum_pos = 0, blen, nxt_offs;
+
+	cnext = c->enext;
+	if (!cnext)
+		return 0;
+
+	/*
+	 * Always write index nodes to the index head so that index nodes and
+	 * other types of nodes are never mixed in the same erase block.
+	 */
+	lnum = c->ihead_lnum;
+	buf_offs = c->ihead_offs;
+
+	/* Allocate commit buffer */
+	buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size);
+	used = 0;
+	avail = buf_len;
+
+	/* Ensure there is enough room for first write */
+	next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+	if (buf_offs + next_len > c->leb_size) {
+		err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0,
+					  LPROPS_TAKEN);
+		if (err)
+			return err;
+		lnum = -1;
+	}
+
+	while (1) {
+		cond_resched();
+
+		znode = cnext;
+		idx = c->cbuf + used;
+
+		/* Make index node */
+		idx->ch.node_type = UBIFS_IDX_NODE;
+		idx->child_cnt = cpu_to_le16(znode->child_cnt);
+		idx->level = cpu_to_le16(znode->level);
+		for (i = 0; i < znode->child_cnt; i++) {
+			struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+			struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+			key_write_idx(c, &zbr->key, &br->key);
+			br->lnum = cpu_to_le32(zbr->lnum);
+			br->offs = cpu_to_le32(zbr->offs);
+			br->len = cpu_to_le32(zbr->len);
+			if (!zbr->lnum || !zbr->len) {
+				ubifs_err(c, "bad ref in znode");
+				ubifs_dump_znode(c, znode);
+				if (zbr->znode)
+					ubifs_dump_znode(c, zbr->znode);
+			}
+		}
+		len = ubifs_idx_node_sz(c, znode->child_cnt);
+		ubifs_prepare_node(c, idx, len, 0);
+
+		/* Determine the index node position */
+		if (lnum == -1) {
+			lnum = c->ilebs[lnum_pos++];
+			buf_offs = 0;
+			used = 0;
+			avail = buf_len;
+		}
+		offs = buf_offs + used;
+
+		if (lnum != znode->lnum || offs != znode->offs ||
+		    len != znode->len) {
+			ubifs_err(c, "inconsistent znode posn");
+			return -EINVAL;
+		}
+
+		/* Grab some stuff from znode while we still can */
+		cnext = znode->cnext;
+
+		ubifs_assert(ubifs_zn_dirty(znode));
+		ubifs_assert(ubifs_zn_cow(znode));
+
+		/*
+		 * It is important that other threads should see %DIRTY_ZNODE
+		 * flag cleared before %COW_ZNODE. Specifically, it matters in
+		 * the 'dirty_cow_znode()' function. This is the reason for the
+		 * first barrier. Also, we want the bit changes to be seen to
+		 * other threads ASAP, to avoid unnecesarry copying, which is
+		 * the reason for the second barrier.
+		 */
+		clear_bit(DIRTY_ZNODE, &znode->flags);
+		smp_mb__before_atomic();
+		clear_bit(COW_ZNODE, &znode->flags);
+		smp_mb__after_atomic();
+
+		/*
+		 * We have marked the znode as clean but have not updated the
+		 * @c->clean_zn_cnt counter. If this znode becomes dirty again
+		 * before 'free_obsolete_znodes()' is called, then
+		 * @c->clean_zn_cnt will be decremented before it gets
+		 * incremented (resulting in 2 decrements for the same znode).
+		 * This means that @c->clean_zn_cnt may become negative for a
+		 * while.
+		 *
+		 * Q: why we cannot increment @c->clean_zn_cnt?
+		 * A: because we do not have the @c->tnc_mutex locked, and the
+		 *    following code would be racy and buggy:
+		 *
+		 *    if (!ubifs_zn_obsolete(znode)) {
+		 *            atomic_long_inc(&c->clean_zn_cnt);
+		 *            atomic_long_inc(&ubifs_clean_zn_cnt);
+		 *    }
+		 *
+		 *    Thus, we just delay the @c->clean_zn_cnt update until we
+		 *    have the mutex locked.
+		 */
+
+		/* Do not access znode from this point on */
+
+		/* Update buffer positions */
+		wlen = used + len;
+		used += ALIGN(len, 8);
+		avail -= ALIGN(len, 8);
+
+		/*
+		 * Calculate the next index node length to see if there is
+		 * enough room for it
+		 */
+		if (cnext == c->cnext)
+			next_len = 0;
+		else
+			next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+
+		nxt_offs = buf_offs + used + next_len;
+		if (next_len && nxt_offs <= c->leb_size) {
+			if (avail > 0)
+				continue;
+			else
+				blen = buf_len;
+		} else {
+			wlen = ALIGN(wlen, 8);
+			blen = ALIGN(wlen, c->min_io_size);
+			ubifs_pad(c, c->cbuf + wlen, blen - wlen);
+		}
+
+		/* The buffer is full or there are no more znodes to do */
+		err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen);
+		if (err)
+			return err;
+		buf_offs += blen;
+		if (next_len) {
+			if (nxt_offs > c->leb_size) {
+				err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0,
+							  0, LPROPS_TAKEN);
+				if (err)
+					return err;
+				lnum = -1;
+			}
+			used -= blen;
+			if (used < 0)
+				used = 0;
+			avail = buf_len - used;
+			memmove(c->cbuf, c->cbuf + blen, used);
+			continue;
+		}
+		break;
+	}
+
+	if (lnum != c->dbg->new_ihead_lnum ||
+	    buf_offs != c->dbg->new_ihead_offs) {
+		ubifs_err(c, "inconsistent ihead");
+		return -EINVAL;
+	}
+
+	c->ihead_lnum = lnum;
+	c->ihead_offs = buf_offs;
+
+	return 0;
+}
+
+/**
+ * free_obsolete_znodes - free obsolete znodes.
+ * @c: UBIFS file-system description object
+ *
+ * At the end of commit end, obsolete znodes are freed.
+ */
+static void free_obsolete_znodes(struct ubifs_info *c)
+{
+	struct ubifs_znode *znode, *cnext;
+
+	cnext = c->cnext;
+	do {
+		znode = cnext;
+		cnext = znode->cnext;
+		if (ubifs_zn_obsolete(znode))
+			kfree(znode);
+		else {
+			znode->cnext = NULL;
+			atomic_long_inc(&c->clean_zn_cnt);
+			atomic_long_inc(&ubifs_clean_zn_cnt);
+		}
+	} while (cnext != c->cnext);
+}
+
+/**
+ * return_gap_lebs - return LEBs used by the in-gap commit method.
+ * @c: UBIFS file-system description object
+ *
+ * This function clears the "taken" flag for the LEBs which were used by the
+ * "commit in-the-gaps" method.
+ */
+static int return_gap_lebs(struct ubifs_info *c)
+{
+	int *p, err;
+
+	if (!c->gap_lebs)
+		return 0;
+
+	dbg_cmt("");
+	for (p = c->gap_lebs; *p != -1; p++) {
+		err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0,
+					  LPROPS_TAKEN, 0);
+		if (err)
+			return err;
+	}
+
+	kfree(c->gap_lebs);
+	c->gap_lebs = NULL;
+	return 0;
+}
+
+/**
+ * ubifs_tnc_end_commit - update the TNC for commit end.
+ * @c: UBIFS file-system description object
+ *
+ * Write the dirty znodes.
+ */
+int ubifs_tnc_end_commit(struct ubifs_info *c)
+{
+	int err;
+
+	if (!c->cnext)
+		return 0;
+
+	err = return_gap_lebs(c);
+	if (err)
+		return err;
+
+	err = write_index(c);
+	if (err)
+		return err;
+
+	mutex_lock(&c->tnc_mutex);
+
+	dbg_cmt("TNC height is %d", c->zroot.znode->level + 1);
+
+	free_obsolete_znodes(c);
+
+	c->cnext = NULL;
+	kfree(c->ilebs);
+	c->ilebs = NULL;
+
+	mutex_unlock(&c->tnc_mutex);
+
+	return 0;
+}
diff --git a/kernel/fs/ubifs/tnc_misc.c b/kernel/fs/ubifs/tnc_misc.c
new file mode 100644
index 000000000..93f5b7859
--- /dev/null
+++ b/kernel/fs/ubifs/tnc_misc.c
@@ -0,0 +1,494 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file contains miscelanious TNC-related functions shared betweend
+ * different files. This file does not form any logically separate TNC
+ * sub-system. The file was created because there is a lot of TNC code and
+ * putting it all in one file would make that file too big and unreadable.
+ */
+
+#include "ubifs.h"
+
+/**
+ * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal.
+ * @zr: root of the subtree to traverse
+ * @znode: previous znode
+ *
+ * This function implements levelorder TNC traversal. The LNC is ignored.
+ * Returns the next element or %NULL if @znode is already the last one.
+ */
+struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
+					      struct ubifs_znode *znode)
+{
+	int level, iip, level_search = 0;
+	struct ubifs_znode *zn;
+
+	ubifs_assert(zr);
+
+	if (unlikely(!znode))
+		return zr;
+
+	if (unlikely(znode == zr)) {
+		if (znode->level == 0)
+			return NULL;
+		return ubifs_tnc_find_child(zr, 0);
+	}
+
+	level = znode->level;
+
+	iip = znode->iip;
+	while (1) {
+		ubifs_assert(znode->level <= zr->level);
+
+		/*
+		 * First walk up until there is a znode with next branch to
+		 * look at.
+		 */
+		while (znode->parent != zr && iip >= znode->parent->child_cnt) {
+			znode = znode->parent;
+			iip = znode->iip;
+		}
+
+		if (unlikely(znode->parent == zr &&
+			     iip >= znode->parent->child_cnt)) {
+			/* This level is done, switch to the lower one */
+			level -= 1;
+			if (level_search || level < 0)
+				/*
+				 * We were already looking for znode at lower
+				 * level ('level_search'). As we are here
+				 * again, it just does not exist. Or all levels
+				 * were finished ('level < 0').
+				 */
+				return NULL;
+
+			level_search = 1;
+			iip = -1;
+			znode = ubifs_tnc_find_child(zr, 0);
+			ubifs_assert(znode);
+		}
+
+		/* Switch to the next index */
+		zn = ubifs_tnc_find_child(znode->parent, iip + 1);
+		if (!zn) {
+			/* No more children to look at, we have walk up */
+			iip = znode->parent->child_cnt;
+			continue;
+		}
+
+		/* Walk back down to the level we came from ('level') */
+		while (zn->level != level) {
+			znode = zn;
+			zn = ubifs_tnc_find_child(zn, 0);
+			if (!zn) {
+				/*
+				 * This path is not too deep so it does not
+				 * reach 'level'. Try next path.
+				 */
+				iip = znode->iip;
+				break;
+			}
+		}
+
+		if (zn) {
+			ubifs_assert(zn->level >= 0);
+			return zn;
+		}
+	}
+}
+
+/**
+ * ubifs_search_zbranch - search znode branch.
+ * @c: UBIFS file-system description object
+ * @znode: znode to search in
+ * @key: key to search for
+ * @n: znode branch slot number is returned here
+ *
+ * This is a helper function which search branch with key @key in @znode using
+ * binary search. The result of the search may be:
+ *   o exact match, then %1 is returned, and the slot number of the branch is
+ *     stored in @n;
+ *   o no exact match, then %0 is returned and the slot number of the left
+ *     closest branch is returned in @n; the slot if all keys in this znode are
+ *     greater than @key, then %-1 is returned in @n.
+ */
+int ubifs_search_zbranch(const struct ubifs_info *c,
+			 const struct ubifs_znode *znode,
+			 const union ubifs_key *key, int *n)
+{
+	int beg = 0, end = znode->child_cnt, uninitialized_var(mid);
+	int uninitialized_var(cmp);
+	const struct ubifs_zbranch *zbr = &znode->zbranch[0];
+
+	ubifs_assert(end > beg);
+
+	while (end > beg) {
+		mid = (beg + end) >> 1;
+		cmp = keys_cmp(c, key, &zbr[mid].key);
+		if (cmp > 0)
+			beg = mid + 1;
+		else if (cmp < 0)
+			end = mid;
+		else {
+			*n = mid;
+			return 1;
+		}
+	}
+
+	*n = end - 1;
+
+	/* The insert point is after *n */
+	ubifs_assert(*n >= -1 && *n < znode->child_cnt);
+	if (*n == -1)
+		ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0);
+	else
+		ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0);
+	if (*n + 1 < znode->child_cnt)
+		ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0);
+
+	return 0;
+}
+
+/**
+ * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal.
+ * @znode: znode to start at (root of the sub-tree to traverse)
+ *
+ * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is
+ * ignored.
+ */
+struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode)
+{
+	if (unlikely(!znode))
+		return NULL;
+
+	while (znode->level > 0) {
+		struct ubifs_znode *child;
+
+		child = ubifs_tnc_find_child(znode, 0);
+		if (!child)
+			return znode;
+		znode = child;
+	}
+
+	return znode;
+}
+
+/**
+ * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal.
+ * @znode: previous znode
+ *
+ * This function implements postorder TNC traversal. The LNC is ignored.
+ * Returns the next element or %NULL if @znode is already the last one.
+ */
+struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode)
+{
+	struct ubifs_znode *zn;
+
+	ubifs_assert(znode);
+	if (unlikely(!znode->parent))
+		return NULL;
+
+	/* Switch to the next index in the parent */
+	zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1);
+	if (!zn)
+		/* This is in fact the last child, return parent */
+		return znode->parent;
+
+	/* Go to the first znode in this new subtree */
+	return ubifs_tnc_postorder_first(zn);
+}
+
+/**
+ * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree.
+ * @znode: znode defining subtree to destroy
+ *
+ * This function destroys subtree of the TNC tree. Returns number of clean
+ * znodes in the subtree.
+ */
+long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode)
+{
+	struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode);
+	long clean_freed = 0;
+	int n;
+
+	ubifs_assert(zn);
+	while (1) {
+		for (n = 0; n < zn->child_cnt; n++) {
+			if (!zn->zbranch[n].znode)
+				continue;
+
+			if (zn->level > 0 &&
+			    !ubifs_zn_dirty(zn->zbranch[n].znode))
+				clean_freed += 1;
+
+			cond_resched();
+			kfree(zn->zbranch[n].znode);
+		}
+
+		if (zn == znode) {
+			if (!ubifs_zn_dirty(zn))
+				clean_freed += 1;
+			kfree(zn);
+			return clean_freed;
+		}
+
+		zn = ubifs_tnc_postorder_next(zn);
+	}
+}
+
+/**
+ * read_znode - read an indexing node from flash and fill znode.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB of the indexing node to read
+ * @offs: node offset
+ * @len: node length
+ * @znode: znode to read to
+ *
+ * This function reads an indexing node from the flash media and fills znode
+ * with the read data. Returns zero in case of success and a negative error
+ * code in case of failure. The read indexing node is validated and if anything
+ * is wrong with it, this function prints complaint messages and returns
+ * %-EINVAL.
+ */
+static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
+		      struct ubifs_znode *znode)
+{
+	int i, err, type, cmp;
+	struct ubifs_idx_node *idx;
+
+	idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
+	if (!idx)
+		return -ENOMEM;
+
+	err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
+	if (err < 0) {
+		kfree(idx);
+		return err;
+	}
+
+	znode->child_cnt = le16_to_cpu(idx->child_cnt);
+	znode->level = le16_to_cpu(idx->level);
+
+	dbg_tnc("LEB %d:%d, level %d, %d branch",
+		lnum, offs, znode->level, znode->child_cnt);
+
+	if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) {
+		ubifs_err(c, "current fanout %d, branch count %d",
+			  c->fanout, znode->child_cnt);
+		ubifs_err(c, "max levels %d, znode level %d",
+			  UBIFS_MAX_LEVELS, znode->level);
+		err = 1;
+		goto out_dump;
+	}
+
+	for (i = 0; i < znode->child_cnt; i++) {
+		const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+		struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+		key_read(c, &br->key, &zbr->key);
+		zbr->lnum = le32_to_cpu(br->lnum);
+		zbr->offs = le32_to_cpu(br->offs);
+		zbr->len  = le32_to_cpu(br->len);
+		zbr->znode = NULL;
+
+		/* Validate branch */
+
+		if (zbr->lnum < c->main_first ||
+		    zbr->lnum >= c->leb_cnt || zbr->offs < 0 ||
+		    zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) {
+			ubifs_err(c, "bad branch %d", i);
+			err = 2;
+			goto out_dump;
+		}
+
+		switch (key_type(c, &zbr->key)) {
+		case UBIFS_INO_KEY:
+		case UBIFS_DATA_KEY:
+		case UBIFS_DENT_KEY:
+		case UBIFS_XENT_KEY:
+			break;
+		default:
+			ubifs_err(c, "bad key type at slot %d: %d",
+				  i, key_type(c, &zbr->key));
+			err = 3;
+			goto out_dump;
+		}
+
+		if (znode->level)
+			continue;
+
+		type = key_type(c, &zbr->key);
+		if (c->ranges[type].max_len == 0) {
+			if (zbr->len != c->ranges[type].len) {
+				ubifs_err(c, "bad target node (type %d) length (%d)",
+					  type, zbr->len);
+				ubifs_err(c, "have to be %d", c->ranges[type].len);
+				err = 4;
+				goto out_dump;
+			}
+		} else if (zbr->len < c->ranges[type].min_len ||
+			   zbr->len > c->ranges[type].max_len) {
+			ubifs_err(c, "bad target node (type %d) length (%d)",
+				  type, zbr->len);
+			ubifs_err(c, "have to be in range of %d-%d",
+				  c->ranges[type].min_len,
+				  c->ranges[type].max_len);
+			err = 5;
+			goto out_dump;
+		}
+	}
+
+	/*
+	 * Ensure that the next key is greater or equivalent to the
+	 * previous one.
+	 */
+	for (i = 0; i < znode->child_cnt - 1; i++) {
+		const union ubifs_key *key1, *key2;
+
+		key1 = &znode->zbranch[i].key;
+		key2 = &znode->zbranch[i + 1].key;
+
+		cmp = keys_cmp(c, key1, key2);
+		if (cmp > 0) {
+			ubifs_err(c, "bad key order (keys %d and %d)", i, i + 1);
+			err = 6;
+			goto out_dump;
+		} else if (cmp == 0 && !is_hash_key(c, key1)) {
+			/* These can only be keys with colliding hash */
+			ubifs_err(c, "keys %d and %d are not hashed but equivalent",
+				  i, i + 1);
+			err = 7;
+			goto out_dump;
+		}
+	}
+
+	kfree(idx);
+	return 0;
+
+out_dump:
+	ubifs_err(c, "bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
+	ubifs_dump_node(c, idx);
+	kfree(idx);
+	return -EINVAL;
+}
+
+/**
+ * ubifs_load_znode - load znode to TNC cache.
+ * @c: UBIFS file-system description object
+ * @zbr: znode branch
+ * @parent: znode's parent
+ * @iip: index in parent
+ *
+ * This function loads znode pointed to by @zbr into the TNC cache and
+ * returns pointer to it in case of success and a negative error code in case
+ * of failure.
+ */
+struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
+				     struct ubifs_zbranch *zbr,
+				     struct ubifs_znode *parent, int iip)
+{
+	int err;
+	struct ubifs_znode *znode;
+
+	ubifs_assert(!zbr->znode);
+	/*
+	 * A slab cache is not presently used for znodes because the znode size
+	 * depends on the fanout which is stored in the superblock.
+	 */
+	znode = kzalloc(c->max_znode_sz, GFP_NOFS);
+	if (!znode)
+		return ERR_PTR(-ENOMEM);
+
+	err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode);
+	if (err)
+		goto out;
+
+	atomic_long_inc(&c->clean_zn_cnt);
+
+	/*
+	 * Increment the global clean znode counter as well. It is OK that
+	 * global and per-FS clean znode counters may be inconsistent for some
+	 * short time (because we might be preempted at this point), the global
+	 * one is only used in shrinker.
+	 */
+	atomic_long_inc(&ubifs_clean_zn_cnt);
+
+	zbr->znode = znode;
+	znode->parent = parent;
+	znode->time = get_seconds();
+	znode->iip = iip;
+
+	return znode;
+
+out:
+	kfree(znode);
+	return ERR_PTR(err);
+}
+
+/**
+ * ubifs_tnc_read_node - read a leaf node from the flash media.
+ * @c: UBIFS file-system description object
+ * @zbr: key and position of the node
+ * @node: node is returned here
+ *
+ * This function reads a node defined by @zbr from the flash media. Returns
+ * zero in case of success or a negative negative error code in case of
+ * failure.
+ */
+int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			void *node)
+{
+	union ubifs_key key1, *key = &zbr->key;
+	int err, type = key_type(c, key);
+	struct ubifs_wbuf *wbuf;
+
+	/*
+	 * 'zbr' has to point to on-flash node. The node may sit in a bud and
+	 * may even be in a write buffer, so we have to take care about this.
+	 */
+	wbuf = ubifs_get_wbuf(c, zbr->lnum);
+	if (wbuf)
+		err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len,
+					   zbr->lnum, zbr->offs);
+	else
+		err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum,
+				      zbr->offs);
+
+	if (err) {
+		dbg_tnck(key, "key ");
+		return err;
+	}
+
+	/* Make sure the key of the read node is correct */
+	key_read(c, node + UBIFS_KEY_OFFSET, &key1);
+	if (!keys_eq(c, key, &key1)) {
+		ubifs_err(c, "bad key in node at LEB %d:%d",
+			  zbr->lnum, zbr->offs);
+		dbg_tnck(key, "looked for key ");
+		dbg_tnck(&key1, "but found node's key ");
+		ubifs_dump_node(c, node);
+		return -EINVAL;
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/ubifs/ubifs-media.h b/kernel/fs/ubifs/ubifs-media.h
new file mode 100644
index 000000000..e24380cf4
--- /dev/null
+++ b/kernel/fs/ubifs/ubifs-media.h
@@ -0,0 +1,784 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file describes UBIFS on-flash format and contains definitions of all the
+ * relevant data structures and constants.
+ *
+ * All UBIFS on-flash objects are stored in the form of nodes. All nodes start
+ * with the UBIFS node magic number and have the same common header. Nodes
+ * always sit at 8-byte aligned positions on the media and node header sizes are
+ * also 8-byte aligned (except for the indexing node and the padding node).
+ */
+
+#ifndef __UBIFS_MEDIA_H__
+#define __UBIFS_MEDIA_H__
+
+/* UBIFS node magic number (must not have the padding byte first or last) */
+#define UBIFS_NODE_MAGIC  0x06101831
+
+/*
+ * UBIFS on-flash format version. This version is increased when the on-flash
+ * format is changing. If this happens, UBIFS is will support older versions as
+ * well. But older UBIFS code will not support newer formats. Format changes
+ * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
+ * a new feature.
+ *
+ * UBIFS went into mainline kernel with format version 4. The older formats
+ * were development formats.
+ */
+#define UBIFS_FORMAT_VERSION 4
+
+/*
+ * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
+ * implementations will not be able to mount newer formats in read-write mode.
+ * However, depending on the change, it may be possible to mount newer formats
+ * in R/O mode. This is indicated by the R/O compatibility version which is
+ * stored in the super-block.
+ *
+ * This is needed to support boot-loaders which only need R/O mounting. With
+ * this flag it is possible to do UBIFS format changes without a need to update
+ * boot-loaders.
+ */
+#define UBIFS_RO_COMPAT_VERSION 0
+
+/* Minimum logical eraseblock size in bytes */
+#define UBIFS_MIN_LEB_SZ (15*1024)
+
+/* Initial CRC32 value used when calculating CRC checksums */
+#define UBIFS_CRC32_INIT 0xFFFFFFFFU
+
+/*
+ * UBIFS does not try to compress data if its length is less than the below
+ * constant.
+ */
+#define UBIFS_MIN_COMPR_LEN 128
+
+/*
+ * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
+ * shorter than uncompressed data length, UBIFS prefers to leave this data
+ * node uncompress, because it'll be read faster.
+ */
+#define UBIFS_MIN_COMPRESS_DIFF 64
+
+/* Root inode number */
+#define UBIFS_ROOT_INO 1
+
+/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */
+#define UBIFS_FIRST_INO 64
+
+/*
+ * Maximum file name and extended attribute length (must be a multiple of 8,
+ * minus 1).
+ */
+#define UBIFS_MAX_NLEN 255
+
+/* Maximum number of data journal heads */
+#define UBIFS_MAX_JHEADS 1
+
+/*
+ * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system,
+ * which means that it does not treat the underlying media as consisting of
+ * blocks like in case of hard drives. Do not be confused. UBIFS block is just
+ * the maximum amount of data which one data node can have or which can be
+ * attached to an inode node.
+ */
+#define UBIFS_BLOCK_SIZE  4096
+#define UBIFS_BLOCK_SHIFT 12
+
+/* UBIFS padding byte pattern (must not be first or last byte of node magic) */
+#define UBIFS_PADDING_BYTE 0xCE
+
+/* Maximum possible key length */
+#define UBIFS_MAX_KEY_LEN 16
+
+/* Key length ("simple" format) */
+#define UBIFS_SK_LEN 8
+
+/* Minimum index tree fanout */
+#define UBIFS_MIN_FANOUT 3
+
+/* Maximum number of levels in UBIFS indexing B-tree */
+#define UBIFS_MAX_LEVELS 512
+
+/* Maximum amount of data attached to an inode in bytes */
+#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE
+
+/* LEB Properties Tree fanout (must be power of 2) and fanout shift */
+#define UBIFS_LPT_FANOUT 4
+#define UBIFS_LPT_FANOUT_SHIFT 2
+
+/* LEB Properties Tree bit field sizes */
+#define UBIFS_LPT_CRC_BITS 16
+#define UBIFS_LPT_CRC_BYTES 2
+#define UBIFS_LPT_TYPE_BITS 4
+
+/* The key is always at the same position in all keyed nodes */
+#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
+
+/* Garbage collector journal head number */
+#define UBIFS_GC_HEAD   0
+/* Base journal head number */
+#define UBIFS_BASE_HEAD 1
+/* Data journal head number */
+#define UBIFS_DATA_HEAD 2
+
+/*
+ * LEB Properties Tree node types.
+ *
+ * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties)
+ * UBIFS_LPT_NNODE: LPT internal node
+ * UBIFS_LPT_LTAB: LPT's own lprops table
+ * UBIFS_LPT_LSAVE: LPT's save table (big model only)
+ * UBIFS_LPT_NODE_CNT: count of LPT node types
+ * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type
+ */
+enum {
+	UBIFS_LPT_PNODE,
+	UBIFS_LPT_NNODE,
+	UBIFS_LPT_LTAB,
+	UBIFS_LPT_LSAVE,
+	UBIFS_LPT_NODE_CNT,
+	UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1,
+};
+
+/*
+ * UBIFS inode types.
+ *
+ * UBIFS_ITYPE_REG: regular file
+ * UBIFS_ITYPE_DIR: directory
+ * UBIFS_ITYPE_LNK: soft link
+ * UBIFS_ITYPE_BLK: block device node
+ * UBIFS_ITYPE_CHR: character device node
+ * UBIFS_ITYPE_FIFO: fifo
+ * UBIFS_ITYPE_SOCK: socket
+ * UBIFS_ITYPES_CNT: count of supported file types
+ */
+enum {
+	UBIFS_ITYPE_REG,
+	UBIFS_ITYPE_DIR,
+	UBIFS_ITYPE_LNK,
+	UBIFS_ITYPE_BLK,
+	UBIFS_ITYPE_CHR,
+	UBIFS_ITYPE_FIFO,
+	UBIFS_ITYPE_SOCK,
+	UBIFS_ITYPES_CNT,
+};
+
+/*
+ * Supported key hash functions.
+ *
+ * UBIFS_KEY_HASH_R5: R5 hash
+ * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name
+ */
+enum {
+	UBIFS_KEY_HASH_R5,
+	UBIFS_KEY_HASH_TEST,
+};
+
+/*
+ * Supported key formats.
+ *
+ * UBIFS_SIMPLE_KEY_FMT: simple key format
+ */
+enum {
+	UBIFS_SIMPLE_KEY_FMT,
+};
+
+/*
+ * The simple key format uses 29 bits for storing UBIFS block number and hash
+ * value.
+ */
+#define UBIFS_S_KEY_BLOCK_BITS 29
+#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF
+#define UBIFS_S_KEY_HASH_BITS  UBIFS_S_KEY_BLOCK_BITS
+#define UBIFS_S_KEY_HASH_MASK  UBIFS_S_KEY_BLOCK_MASK
+
+/*
+ * Key types.
+ *
+ * UBIFS_INO_KEY: inode node key
+ * UBIFS_DATA_KEY: data node key
+ * UBIFS_DENT_KEY: directory entry node key
+ * UBIFS_XENT_KEY: extended attribute entry key
+ * UBIFS_KEY_TYPES_CNT: number of supported key types
+ */
+enum {
+	UBIFS_INO_KEY,
+	UBIFS_DATA_KEY,
+	UBIFS_DENT_KEY,
+	UBIFS_XENT_KEY,
+	UBIFS_KEY_TYPES_CNT,
+};
+
+/* Count of LEBs reserved for the superblock area */
+#define UBIFS_SB_LEBS 1
+/* Count of LEBs reserved for the master area */
+#define UBIFS_MST_LEBS 2
+
+/* First LEB of the superblock area */
+#define UBIFS_SB_LNUM 0
+/* First LEB of the master area */
+#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS)
+/* First LEB of the log area */
+#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS)
+
+/*
+ * The below constants define the absolute minimum values for various UBIFS
+ * media areas. Many of them actually depend of flash geometry and the FS
+ * configuration (number of journal heads, orphan LEBs, etc). This means that
+ * the smallest volume size which can be used for UBIFS cannot be pre-defined
+ * by these constants. The file-system that meets the below limitation will not
+ * necessarily mount. UBIFS does run-time calculations and validates the FS
+ * size.
+ */
+
+/* Minimum number of logical eraseblocks in the log */
+#define UBIFS_MIN_LOG_LEBS 2
+/* Minimum number of bud logical eraseblocks (one for each head) */
+#define UBIFS_MIN_BUD_LEBS 3
+/* Minimum number of journal logical eraseblocks */
+#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS)
+/* Minimum number of LPT area logical eraseblocks */
+#define UBIFS_MIN_LPT_LEBS 2
+/* Minimum number of orphan area logical eraseblocks */
+#define UBIFS_MIN_ORPH_LEBS 1
+/*
+ * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1
+ * for GC, 1 for deletions, and at least 1 for committed data).
+ */
+#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6)
+
+/* Minimum number of logical eraseblocks */
+#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
+			   UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \
+			   UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS)
+
+/* Node sizes (N.B. these are guaranteed to be multiples of 8) */
+#define UBIFS_CH_SZ        sizeof(struct ubifs_ch)
+#define UBIFS_INO_NODE_SZ  sizeof(struct ubifs_ino_node)
+#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node)
+#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node)
+#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node)
+#define UBIFS_PAD_NODE_SZ  sizeof(struct ubifs_pad_node)
+#define UBIFS_SB_NODE_SZ   sizeof(struct ubifs_sb_node)
+#define UBIFS_MST_NODE_SZ  sizeof(struct ubifs_mst_node)
+#define UBIFS_REF_NODE_SZ  sizeof(struct ubifs_ref_node)
+#define UBIFS_IDX_NODE_SZ  sizeof(struct ubifs_idx_node)
+#define UBIFS_CS_NODE_SZ   sizeof(struct ubifs_cs_node)
+#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node)
+/* Extended attribute entry nodes are identical to directory entry nodes */
+#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ
+/* Only this does not have to be multiple of 8 bytes */
+#define UBIFS_BRANCH_SZ    sizeof(struct ubifs_branch)
+
+/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */
+#define UBIFS_MAX_DATA_NODE_SZ  (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE)
+#define UBIFS_MAX_INO_NODE_SZ   (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA)
+#define UBIFS_MAX_DENT_NODE_SZ  (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1)
+#define UBIFS_MAX_XENT_NODE_SZ  UBIFS_MAX_DENT_NODE_SZ
+
+/* The largest UBIFS node */
+#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ
+
+/*
+ * On-flash inode flags.
+ *
+ * UBIFS_COMPR_FL: use compression for this inode
+ * UBIFS_SYNC_FL:  I/O on this inode has to be synchronous
+ * UBIFS_IMMUTABLE_FL: inode is immutable
+ * UBIFS_APPEND_FL: writes to the inode may only append data
+ * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous
+ * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value
+ *
+ * Note, these are on-flash flags which correspond to ioctl flags
+ * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not
+ * have to be the same.
+ */
+enum {
+	UBIFS_COMPR_FL     = 0x01,
+	UBIFS_SYNC_FL      = 0x02,
+	UBIFS_IMMUTABLE_FL = 0x04,
+	UBIFS_APPEND_FL    = 0x08,
+	UBIFS_DIRSYNC_FL   = 0x10,
+	UBIFS_XATTR_FL     = 0x20,
+};
+
+/* Inode flag bits used by UBIFS */
+#define UBIFS_FL_MASK 0x0000001F
+
+/*
+ * UBIFS compression algorithms.
+ *
+ * UBIFS_COMPR_NONE: no compression
+ * UBIFS_COMPR_LZO: LZO compression
+ * UBIFS_COMPR_ZLIB: ZLIB compression
+ * UBIFS_COMPR_TYPES_CNT: count of supported compression types
+ */
+enum {
+	UBIFS_COMPR_NONE,
+	UBIFS_COMPR_LZO,
+	UBIFS_COMPR_ZLIB,
+	UBIFS_COMPR_TYPES_CNT,
+};
+
+/*
+ * UBIFS node types.
+ *
+ * UBIFS_INO_NODE: inode node
+ * UBIFS_DATA_NODE: data node
+ * UBIFS_DENT_NODE: directory entry node
+ * UBIFS_XENT_NODE: extended attribute node
+ * UBIFS_TRUN_NODE: truncation node
+ * UBIFS_PAD_NODE: padding node
+ * UBIFS_SB_NODE: superblock node
+ * UBIFS_MST_NODE: master node
+ * UBIFS_REF_NODE: LEB reference node
+ * UBIFS_IDX_NODE: index node
+ * UBIFS_CS_NODE: commit start node
+ * UBIFS_ORPH_NODE: orphan node
+ * UBIFS_NODE_TYPES_CNT: count of supported node types
+ *
+ * Note, we index arrays by these numbers, so keep them low and contiguous.
+ * Node type constants for inodes, direntries and so on have to be the same as
+ * corresponding key type constants.
+ */
+enum {
+	UBIFS_INO_NODE,
+	UBIFS_DATA_NODE,
+	UBIFS_DENT_NODE,
+	UBIFS_XENT_NODE,
+	UBIFS_TRUN_NODE,
+	UBIFS_PAD_NODE,
+	UBIFS_SB_NODE,
+	UBIFS_MST_NODE,
+	UBIFS_REF_NODE,
+	UBIFS_IDX_NODE,
+	UBIFS_CS_NODE,
+	UBIFS_ORPH_NODE,
+	UBIFS_NODE_TYPES_CNT,
+};
+
+/*
+ * Master node flags.
+ *
+ * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty
+ * UBIFS_MST_NO_ORPHS: no orphan inodes present
+ * UBIFS_MST_RCVRY: written by recovery
+ */
+enum {
+	UBIFS_MST_DIRTY = 1,
+	UBIFS_MST_NO_ORPHS = 2,
+	UBIFS_MST_RCVRY = 4,
+};
+
+/*
+ * Node group type (used by recovery to recover whole group or none).
+ *
+ * UBIFS_NO_NODE_GROUP: this node is not part of a group
+ * UBIFS_IN_NODE_GROUP: this node is a part of a group
+ * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group
+ */
+enum {
+	UBIFS_NO_NODE_GROUP = 0,
+	UBIFS_IN_NODE_GROUP,
+	UBIFS_LAST_OF_NODE_GROUP,
+};
+
+/*
+ * Superblock flags.
+ *
+ * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
+ * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
+ */
+enum {
+	UBIFS_FLG_BIGLPT = 0x02,
+	UBIFS_FLG_SPACE_FIXUP = 0x04,
+};
+
+/**
+ * struct ubifs_ch - common header node.
+ * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC)
+ * @crc: CRC-32 checksum of the node header
+ * @sqnum: sequence number
+ * @len: full node length
+ * @node_type: node type
+ * @group_type: node group type
+ * @padding: reserved for future, zeroes
+ *
+ * Every UBIFS node starts with this common part. If the node has a key, the
+ * key always goes next.
+ */
+struct ubifs_ch {
+	__le32 magic;
+	__le32 crc;
+	__le64 sqnum;
+	__le32 len;
+	__u8 node_type;
+	__u8 group_type;
+	__u8 padding[2];
+} __packed;
+
+/**
+ * union ubifs_dev_desc - device node descriptor.
+ * @new: new type device descriptor
+ * @huge: huge type device descriptor
+ *
+ * This data structure describes major/minor numbers of a device node. In an
+ * inode is a device node then its data contains an object of this type. UBIFS
+ * uses standard Linux "new" and "huge" device node encodings.
+ */
+union ubifs_dev_desc {
+	__le32 new;
+	__le64 huge;
+} __packed;
+
+/**
+ * struct ubifs_ino_node - inode node.
+ * @ch: common header
+ * @key: node key
+ * @creat_sqnum: sequence number at time of creation
+ * @size: inode size in bytes (amount of uncompressed data)
+ * @atime_sec: access time seconds
+ * @ctime_sec: creation time seconds
+ * @mtime_sec: modification time seconds
+ * @atime_nsec: access time nanoseconds
+ * @ctime_nsec: creation time nanoseconds
+ * @mtime_nsec: modification time nanoseconds
+ * @nlink: number of hard links
+ * @uid: owner ID
+ * @gid: group ID
+ * @mode: access flags
+ * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc)
+ * @data_len: inode data length
+ * @xattr_cnt: count of extended attributes this inode has
+ * @xattr_size: summarized size of all extended attributes in bytes
+ * @padding1: reserved for future, zeroes
+ * @xattr_names: sum of lengths of all extended attribute names belonging to
+ *               this inode
+ * @compr_type: compression type used for this inode
+ * @padding2: reserved for future, zeroes
+ * @data: data attached to the inode
+ *
+ * Note, even though inode compression type is defined by @compr_type, some
+ * nodes of this inode may be compressed with different compressor - this
+ * happens if compression type is changed while the inode already has data
+ * nodes. But @compr_type will be use for further writes to the inode.
+ *
+ * Note, do not forget to amend 'zero_ino_node_unused()' function when changing
+ * the padding fields.
+ */
+struct ubifs_ino_node {
+	struct ubifs_ch ch;
+	__u8 key[UBIFS_MAX_KEY_LEN];
+	__le64 creat_sqnum;
+	__le64 size;
+	__le64 atime_sec;
+	__le64 ctime_sec;
+	__le64 mtime_sec;
+	__le32 atime_nsec;
+	__le32 ctime_nsec;
+	__le32 mtime_nsec;
+	__le32 nlink;
+	__le32 uid;
+	__le32 gid;
+	__le32 mode;
+	__le32 flags;
+	__le32 data_len;
+	__le32 xattr_cnt;
+	__le32 xattr_size;
+	__u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */
+	__le32 xattr_names;
+	__le16 compr_type;
+	__u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
+	__u8 data[];
+} __packed;
+
+/**
+ * struct ubifs_dent_node - directory entry node.
+ * @ch: common header
+ * @key: node key
+ * @inum: target inode number
+ * @padding1: reserved for future, zeroes
+ * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc)
+ * @nlen: name length
+ * @padding2: reserved for future, zeroes
+ * @name: zero-terminated name
+ *
+ * Note, do not forget to amend 'zero_dent_node_unused()' function when
+ * changing the padding fields.
+ */
+struct ubifs_dent_node {
+	struct ubifs_ch ch;
+	__u8 key[UBIFS_MAX_KEY_LEN];
+	__le64 inum;
+	__u8 padding1;
+	__u8 type;
+	__le16 nlen;
+	__u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
+	__u8 name[];
+} __packed;
+
+/**
+ * struct ubifs_data_node - data node.
+ * @ch: common header
+ * @key: node key
+ * @size: uncompressed data size in bytes
+ * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc)
+ * @padding: reserved for future, zeroes
+ * @data: data
+ *
+ * Note, do not forget to amend 'zero_data_node_unused()' function when
+ * changing the padding fields.
+ */
+struct ubifs_data_node {
+	struct ubifs_ch ch;
+	__u8 key[UBIFS_MAX_KEY_LEN];
+	__le32 size;
+	__le16 compr_type;
+	__u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
+	__u8 data[];
+} __packed;
+
+/**
+ * struct ubifs_trun_node - truncation node.
+ * @ch: common header
+ * @inum: truncated inode number
+ * @padding: reserved for future, zeroes
+ * @old_size: size before truncation
+ * @new_size: size after truncation
+ *
+ * This node exists only in the journal and never goes to the main area. Note,
+ * do not forget to amend 'zero_trun_node_unused()' function when changing the
+ * padding fields.
+ */
+struct ubifs_trun_node {
+	struct ubifs_ch ch;
+	__le32 inum;
+	__u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
+	__le64 old_size;
+	__le64 new_size;
+} __packed;
+
+/**
+ * struct ubifs_pad_node - padding node.
+ * @ch: common header
+ * @pad_len: how many bytes after this node are unused (because padded)
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_pad_node {
+	struct ubifs_ch ch;
+	__le32 pad_len;
+} __packed;
+
+/**
+ * struct ubifs_sb_node - superblock node.
+ * @ch: common header
+ * @padding: reserved for future, zeroes
+ * @key_hash: type of hash function used in keys
+ * @key_fmt: format of the key
+ * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc)
+ * @min_io_size: minimal input/output unit size
+ * @leb_size: logical eraseblock size in bytes
+ * @leb_cnt: count of LEBs used by file-system
+ * @max_leb_cnt: maximum count of LEBs used by file-system
+ * @max_bud_bytes: maximum amount of data stored in buds
+ * @log_lebs: log size in logical eraseblocks
+ * @lpt_lebs: number of LEBs used for lprops table
+ * @orph_lebs: number of LEBs used for recording orphans
+ * @jhead_cnt: count of journal heads
+ * @fanout: tree fanout (max. number of links per indexing node)
+ * @lsave_cnt: number of LEB numbers in LPT's save table
+ * @fmt_version: UBIFS on-flash format version
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ * @padding1: reserved for future, zeroes
+ * @rp_uid: reserve pool UID
+ * @rp_gid: reserve pool GID
+ * @rp_size: size of the reserved pool in bytes
+ * @padding2: reserved for future, zeroes
+ * @time_gran: time granularity in nanoseconds
+ * @uuid: UUID generated when the file system image was created
+ * @ro_compat_version: UBIFS R/O compatibility version
+ */
+struct ubifs_sb_node {
+	struct ubifs_ch ch;
+	__u8 padding[2];
+	__u8 key_hash;
+	__u8 key_fmt;
+	__le32 flags;
+	__le32 min_io_size;
+	__le32 leb_size;
+	__le32 leb_cnt;
+	__le32 max_leb_cnt;
+	__le64 max_bud_bytes;
+	__le32 log_lebs;
+	__le32 lpt_lebs;
+	__le32 orph_lebs;
+	__le32 jhead_cnt;
+	__le32 fanout;
+	__le32 lsave_cnt;
+	__le32 fmt_version;
+	__le16 default_compr;
+	__u8 padding1[2];
+	__le32 rp_uid;
+	__le32 rp_gid;
+	__le64 rp_size;
+	__le32 time_gran;
+	__u8 uuid[16];
+	__le32 ro_compat_version;
+	__u8 padding2[3968];
+} __packed;
+
+/**
+ * struct ubifs_mst_node - master node.
+ * @ch: common header
+ * @highest_inum: highest inode number in the committed index
+ * @cmt_no: commit number
+ * @flags: various flags (%UBIFS_MST_DIRTY, etc)
+ * @log_lnum: start of the log
+ * @root_lnum: LEB number of the root indexing node
+ * @root_offs: offset within @root_lnum
+ * @root_len: root indexing node length
+ * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was
+ * not reserved and should be reserved on mount)
+ * @ihead_lnum: LEB number of index head
+ * @ihead_offs: offset of index head
+ * @index_size: size of index on flash
+ * @total_free: total free space in bytes
+ * @total_dirty: total dirty space in bytes
+ * @total_used: total used space in bytes (includes only data LEBs)
+ * @total_dead: total dead space in bytes (includes only data LEBs)
+ * @total_dark: total dark space in bytes (includes only data LEBs)
+ * @lpt_lnum: LEB number of LPT root nnode
+ * @lpt_offs: offset of LPT root nnode
+ * @nhead_lnum: LEB number of LPT head
+ * @nhead_offs: offset of LPT head
+ * @ltab_lnum: LEB number of LPT's own lprops table
+ * @ltab_offs: offset of LPT's own lprops table
+ * @lsave_lnum: LEB number of LPT's save table (big model only)
+ * @lsave_offs: offset of LPT's save table (big model only)
+ * @lscan_lnum: LEB number of last LPT scan
+ * @empty_lebs: number of empty logical eraseblocks
+ * @idx_lebs: number of indexing logical eraseblocks
+ * @leb_cnt: count of LEBs used by file-system
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_mst_node {
+	struct ubifs_ch ch;
+	__le64 highest_inum;
+	__le64 cmt_no;
+	__le32 flags;
+	__le32 log_lnum;
+	__le32 root_lnum;
+	__le32 root_offs;
+	__le32 root_len;
+	__le32 gc_lnum;
+	__le32 ihead_lnum;
+	__le32 ihead_offs;
+	__le64 index_size;
+	__le64 total_free;
+	__le64 total_dirty;
+	__le64 total_used;
+	__le64 total_dead;
+	__le64 total_dark;
+	__le32 lpt_lnum;
+	__le32 lpt_offs;
+	__le32 nhead_lnum;
+	__le32 nhead_offs;
+	__le32 ltab_lnum;
+	__le32 ltab_offs;
+	__le32 lsave_lnum;
+	__le32 lsave_offs;
+	__le32 lscan_lnum;
+	__le32 empty_lebs;
+	__le32 idx_lebs;
+	__le32 leb_cnt;
+	__u8 padding[344];
+} __packed;
+
+/**
+ * struct ubifs_ref_node - logical eraseblock reference node.
+ * @ch: common header
+ * @lnum: the referred logical eraseblock number
+ * @offs: start offset in the referred LEB
+ * @jhead: journal head number
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_ref_node {
+	struct ubifs_ch ch;
+	__le32 lnum;
+	__le32 offs;
+	__le32 jhead;
+	__u8 padding[28];
+} __packed;
+
+/**
+ * struct ubifs_branch - key/reference/length branch
+ * @lnum: LEB number of the target node
+ * @offs: offset within @lnum
+ * @len: target node length
+ * @key: key
+ */
+struct ubifs_branch {
+	__le32 lnum;
+	__le32 offs;
+	__le32 len;
+	__u8 key[];
+} __packed;
+
+/**
+ * struct ubifs_idx_node - indexing node.
+ * @ch: common header
+ * @child_cnt: number of child index nodes
+ * @level: tree level
+ * @branches: LEB number / offset / length / key branches
+ */
+struct ubifs_idx_node {
+	struct ubifs_ch ch;
+	__le16 child_cnt;
+	__le16 level;
+	__u8 branches[];
+} __packed;
+
+/**
+ * struct ubifs_cs_node - commit start node.
+ * @ch: common header
+ * @cmt_no: commit number
+ */
+struct ubifs_cs_node {
+	struct ubifs_ch ch;
+	__le64 cmt_no;
+} __packed;
+
+/**
+ * struct ubifs_orph_node - orphan node.
+ * @ch: common header
+ * @cmt_no: commit number (also top bit is set on the last node of the commit)
+ * @inos: inode numbers of orphans
+ */
+struct ubifs_orph_node {
+	struct ubifs_ch ch;
+	__le64 cmt_no;
+	__le64 inos[];
+} __packed;
+
+#endif /* __UBIFS_MEDIA_H__ */
diff --git a/kernel/fs/ubifs/ubifs.h b/kernel/fs/ubifs/ubifs.h
new file mode 100644
index 000000000..de759022f
--- /dev/null
+++ b/kernel/fs/ubifs/ubifs.h
@@ -0,0 +1,1803 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+#ifndef __UBIFS_H__
+#define __UBIFS_H__
+
+#include <asm/div64.h>
+#include <linux/statfs.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/mtd/ubi.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/security.h>
+#include "ubifs-media.h"
+
+/* Version of this UBIFS implementation */
+#define UBIFS_VERSION 1
+
+/* Normal UBIFS messages */
+#define ubifs_msg(c, fmt, ...)                                      \
+	pr_notice("UBIFS (ubi%d:%d): " fmt "\n",                    \
+		  (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__)
+/* UBIFS error messages */
+#define ubifs_err(c, fmt, ...)                                      \
+	pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n",      \
+	       (c)->vi.ubi_num, (c)->vi.vol_id, current->pid,       \
+	       __func__, ##__VA_ARGS__)
+/* UBIFS warning messages */
+#define ubifs_warn(c, fmt, ...)                                     \
+	pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n",   \
+		(c)->vi.ubi_num, (c)->vi.vol_id, current->pid,      \
+		__func__, ##__VA_ARGS__)
+/*
+ * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
+ * object as an argument.
+ */
+#define ubifs_errc(c, fmt, ...)                                     \
+	do {                                                        \
+		if (!(c)->probing)                                  \
+			ubifs_err(c, fmt, ##__VA_ARGS__);           \
+	} while (0)
+
+/* UBIFS file system VFS magic number */
+#define UBIFS_SUPER_MAGIC 0x24051905
+
+/* Number of UBIFS blocks per VFS page */
+#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE)
+#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT)
+
+/* "File system end of life" sequence number watermark */
+#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
+#define SQNUM_WATERMARK      0xFFFFFFFFFF000000ULL
+
+/*
+ * Minimum amount of LEBs reserved for the index. At present the index needs at
+ * least 2 LEBs: one for the index head and one for in-the-gaps method (which
+ * currently does not cater for the index head and so excludes it from
+ * consideration).
+ */
+#define MIN_INDEX_LEBS 2
+
+/* Minimum amount of data UBIFS writes to the flash */
+#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
+
+/*
+ * Currently we do not support inode number overlapping and re-using, so this
+ * watermark defines dangerous inode number level. This should be fixed later,
+ * although it is difficult to exceed current limit. Another option is to use
+ * 64-bit inode numbers, but this means more overhead.
+ */
+#define INUM_WARN_WATERMARK 0xFFF00000
+#define INUM_WATERMARK      0xFFFFFF00
+
+/* Maximum number of entries in each LPT (LEB category) heap */
+#define LPT_HEAP_SZ 256
+
+/*
+ * Background thread name pattern. The numbers are UBI device and volume
+ * numbers.
+ */
+#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
+
+/* Write-buffer synchronization timeout interval in seconds */
+#define WBUF_TIMEOUT_SOFTLIMIT 3
+#define WBUF_TIMEOUT_HARDLIMIT 5
+
+/* Maximum possible inode number (only 32-bit inodes are supported now) */
+#define MAX_INUM 0xFFFFFFFF
+
+/* Number of non-data journal heads */
+#define NONDATA_JHEADS_CNT 2
+
+/* Shorter names for journal head numbers for internal usage */
+#define GCHD   UBIFS_GC_HEAD
+#define BASEHD UBIFS_BASE_HEAD
+#define DATAHD UBIFS_DATA_HEAD
+
+/* 'No change' value for 'ubifs_change_lp()' */
+#define LPROPS_NC 0x80000001
+
+/*
+ * There is no notion of truncation key because truncation nodes do not exist
+ * in TNC. However, when replaying, it is handy to introduce fake "truncation"
+ * keys for truncation nodes because the code becomes simpler. So we define
+ * %UBIFS_TRUN_KEY type.
+ *
+ * But otherwise, out of the journal reply scope, the truncation keys are
+ * invalid.
+ */
+#define UBIFS_TRUN_KEY    UBIFS_KEY_TYPES_CNT
+#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
+
+/*
+ * How much a directory entry/extended attribute entry adds to the parent/host
+ * inode.
+ */
+#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8)
+
+/* How much an extended attribute adds to the host inode */
+#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8)
+
+/*
+ * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered
+ * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are
+ * considered "young". This is used by shrinker when selecting znode to trim
+ * off.
+ */
+#define OLD_ZNODE_AGE 20
+#define YOUNG_ZNODE_AGE 5
+
+/*
+ * Some compressors, like LZO, may end up with more data then the input buffer.
+ * So UBIFS always allocates larger output buffer, to be sure the compressor
+ * will not corrupt memory in case of worst case compression.
+ */
+#define WORST_COMPR_FACTOR 2
+
+/*
+ * How much memory is needed for a buffer where we compress a data node.
+ */
+#define COMPRESSED_DATA_NODE_BUF_SZ \
+	(UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
+
+/* Maximum expected tree height for use by bottom_up_buf */
+#define BOTTOM_UP_HEIGHT 64
+
+/* Maximum number of data nodes to bulk-read */
+#define UBIFS_MAX_BULK_READ 32
+
+/*
+ * Lockdep classes for UBIFS inode @ui_mutex.
+ */
+enum {
+	WB_MUTEX_1 = 0,
+	WB_MUTEX_2 = 1,
+	WB_MUTEX_3 = 2,
+};
+
+/*
+ * Znode flags (actually, bit numbers which store the flags).
+ *
+ * DIRTY_ZNODE: znode is dirty
+ * COW_ZNODE: znode is being committed and a new instance of this znode has to
+ *            be created before changing this znode
+ * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is
+ *                 still in the commit list and the ongoing commit operation
+ *                 will commit it, and delete this znode after it is done
+ */
+enum {
+	DIRTY_ZNODE    = 0,
+	COW_ZNODE      = 1,
+	OBSOLETE_ZNODE = 2,
+};
+
+/*
+ * Commit states.
+ *
+ * COMMIT_RESTING: commit is not wanted
+ * COMMIT_BACKGROUND: background commit has been requested
+ * COMMIT_REQUIRED: commit is required
+ * COMMIT_RUNNING_BACKGROUND: background commit is running
+ * COMMIT_RUNNING_REQUIRED: commit is running and it is required
+ * COMMIT_BROKEN: commit failed
+ */
+enum {
+	COMMIT_RESTING = 0,
+	COMMIT_BACKGROUND,
+	COMMIT_REQUIRED,
+	COMMIT_RUNNING_BACKGROUND,
+	COMMIT_RUNNING_REQUIRED,
+	COMMIT_BROKEN,
+};
+
+/*
+ * 'ubifs_scan_a_node()' return values.
+ *
+ * SCANNED_GARBAGE:  scanned garbage
+ * SCANNED_EMPTY_SPACE: scanned empty space
+ * SCANNED_A_NODE: scanned a valid node
+ * SCANNED_A_CORRUPT_NODE: scanned a corrupted node
+ * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length
+ *
+ * Greater than zero means: 'scanned that number of padding bytes'
+ */
+enum {
+	SCANNED_GARBAGE        = 0,
+	SCANNED_EMPTY_SPACE    = -1,
+	SCANNED_A_NODE         = -2,
+	SCANNED_A_CORRUPT_NODE = -3,
+	SCANNED_A_BAD_PAD_NODE = -4,
+};
+
+/*
+ * LPT cnode flag bits.
+ *
+ * DIRTY_CNODE: cnode is dirty
+ * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted),
+ *                 so it can (and must) be freed when the commit is finished
+ * COW_CNODE: cnode is being committed and must be copied before writing
+ */
+enum {
+	DIRTY_CNODE    = 0,
+	OBSOLETE_CNODE = 1,
+	COW_CNODE      = 2,
+};
+
+/*
+ * Dirty flag bits (lpt_drty_flgs) for LPT special nodes.
+ *
+ * LTAB_DIRTY: ltab node is dirty
+ * LSAVE_DIRTY: lsave node is dirty
+ */
+enum {
+	LTAB_DIRTY  = 1,
+	LSAVE_DIRTY = 2,
+};
+
+/*
+ * Return codes used by the garbage collector.
+ * @LEB_FREED: the logical eraseblock was freed and is ready to use
+ * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit
+ * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes
+ */
+enum {
+	LEB_FREED,
+	LEB_FREED_IDX,
+	LEB_RETAINED,
+};
+
+/**
+ * struct ubifs_old_idx - index node obsoleted since last commit start.
+ * @rb: rb-tree node
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ */
+struct ubifs_old_idx {
+	struct rb_node rb;
+	int lnum;
+	int offs;
+};
+
+/* The below union makes it easier to deal with keys */
+union ubifs_key {
+	uint8_t u8[UBIFS_SK_LEN];
+	uint32_t u32[UBIFS_SK_LEN/4];
+	uint64_t u64[UBIFS_SK_LEN/8];
+	__le32 j32[UBIFS_SK_LEN/4];
+};
+
+/**
+ * struct ubifs_scan_node - UBIFS scanned node information.
+ * @list: list of scanned nodes
+ * @key: key of node scanned (if it has one)
+ * @sqnum: sequence number
+ * @type: type of node scanned
+ * @offs: offset with LEB of node scanned
+ * @len: length of node scanned
+ * @node: raw node
+ */
+struct ubifs_scan_node {
+	struct list_head list;
+	union ubifs_key key;
+	unsigned long long sqnum;
+	int type;
+	int offs;
+	int len;
+	void *node;
+};
+
+/**
+ * struct ubifs_scan_leb - UBIFS scanned LEB information.
+ * @lnum: logical eraseblock number
+ * @nodes_cnt: number of nodes scanned
+ * @nodes: list of struct ubifs_scan_node
+ * @endpt: end point (and therefore the start of empty space)
+ * @buf: buffer containing entire LEB scanned
+ */
+struct ubifs_scan_leb {
+	int lnum;
+	int nodes_cnt;
+	struct list_head nodes;
+	int endpt;
+	void *buf;
+};
+
+/**
+ * struct ubifs_gced_idx_leb - garbage-collected indexing LEB.
+ * @list: list
+ * @lnum: LEB number
+ * @unmap: OK to unmap this LEB
+ *
+ * This data structure is used to temporary store garbage-collected indexing
+ * LEBs - they are not released immediately, but only after the next commit.
+ * This is needed to guarantee recoverability.
+ */
+struct ubifs_gced_idx_leb {
+	struct list_head list;
+	int lnum;
+	int unmap;
+};
+
+/**
+ * struct ubifs_inode - UBIFS in-memory inode description.
+ * @vfs_inode: VFS inode description object
+ * @creat_sqnum: sequence number at time of creation
+ * @del_cmtno: commit number corresponding to the time the inode was deleted,
+ *             protected by @c->commit_sem;
+ * @xattr_size: summarized size of all extended attributes in bytes
+ * @xattr_cnt: count of extended attributes this inode has
+ * @xattr_names: sum of lengths of all extended attribute names belonging to
+ *               this inode
+ * @dirty: non-zero if the inode is dirty
+ * @xattr: non-zero if this is an extended attribute inode
+ * @bulk_read: non-zero if bulk-read should be used
+ * @ui_mutex: serializes inode write-back with the rest of VFS operations,
+ *            serializes "clean <-> dirty" state changes, serializes bulk-read,
+ *            protects @dirty, @bulk_read, @ui_size, and @xattr_size
+ * @ui_lock: protects @synced_i_size
+ * @synced_i_size: synchronized size of inode, i.e. the value of inode size
+ *                 currently stored on the flash; used only for regular file
+ *                 inodes
+ * @ui_size: inode size used by UBIFS when writing to flash
+ * @flags: inode flags (@UBIFS_COMPR_FL, etc)
+ * @compr_type: default compression type used for this inode
+ * @last_page_read: page number of last page read (for bulk read)
+ * @read_in_a_row: number of consecutive pages read in a row (for bulk read)
+ * @data_len: length of the data attached to the inode
+ * @data: inode's data
+ *
+ * @ui_mutex exists for two main reasons. At first it prevents inodes from
+ * being written back while UBIFS changing them, being in the middle of an VFS
+ * operation. This way UBIFS makes sure the inode fields are consistent. For
+ * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and
+ * write-back must not write any of them before we have finished.
+ *
+ * The second reason is budgeting - UBIFS has to budget all operations. If an
+ * operation is going to mark an inode dirty, it has to allocate budget for
+ * this. It cannot just mark it dirty because there is no guarantee there will
+ * be enough flash space to write the inode back later. This means UBIFS has
+ * to have full control over inode "clean <-> dirty" transitions (and pages
+ * actually). But unfortunately, VFS marks inodes dirty in many places, and it
+ * does not ask the file-system if it is allowed to do so (there is a notifier,
+ * but it is not enough), i.e., there is no mechanism to synchronize with this.
+ * So UBIFS has its own inode dirty flag and its own mutex to serialize
+ * "clean <-> dirty" transitions.
+ *
+ * The @synced_i_size field is used to make sure we never write pages which are
+ * beyond last synchronized inode size. See 'ubifs_writepage()' for more
+ * information.
+ *
+ * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
+ * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
+ * make sure @inode->i_size is always changed under @ui_mutex, because it
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would
+ * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields
+ * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one
+ * could consider to rework locking and base it on "shadow" fields.
+ */
+struct ubifs_inode {
+	struct inode vfs_inode;
+	unsigned long long creat_sqnum;
+	unsigned long long del_cmtno;
+	unsigned int xattr_size;
+	unsigned int xattr_cnt;
+	unsigned int xattr_names;
+	unsigned int dirty:1;
+	unsigned int xattr:1;
+	unsigned int bulk_read:1;
+	unsigned int compr_type:2;
+	struct mutex ui_mutex;
+	spinlock_t ui_lock;
+	loff_t synced_i_size;
+	loff_t ui_size;
+	int flags;
+	pgoff_t last_page_read;
+	pgoff_t read_in_a_row;
+	int data_len;
+	void *data;
+};
+
+/**
+ * struct ubifs_unclean_leb - records a LEB recovered under read-only mode.
+ * @list: list
+ * @lnum: LEB number of recovered LEB
+ * @endpt: offset where recovery ended
+ *
+ * This structure records a LEB identified during recovery that needs to be
+ * cleaned but was not because UBIFS was mounted read-only. The information
+ * is used to clean the LEB when remounting to read-write mode.
+ */
+struct ubifs_unclean_leb {
+	struct list_head list;
+	int lnum;
+	int endpt;
+};
+
+/*
+ * LEB properties flags.
+ *
+ * LPROPS_UNCAT: not categorized
+ * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
+ * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
+ * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
+ * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
+ * LPROPS_EMPTY: LEB is empty, not taken
+ * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
+ * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken
+ * LPROPS_CAT_MASK: mask for the LEB categories above
+ * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media)
+ * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash)
+ */
+enum {
+	LPROPS_UNCAT     =  0,
+	LPROPS_DIRTY     =  1,
+	LPROPS_DIRTY_IDX =  2,
+	LPROPS_FREE      =  3,
+	LPROPS_HEAP_CNT  =  3,
+	LPROPS_EMPTY     =  4,
+	LPROPS_FREEABLE  =  5,
+	LPROPS_FRDI_IDX  =  6,
+	LPROPS_CAT_MASK  = 15,
+	LPROPS_TAKEN     = 16,
+	LPROPS_INDEX     = 32,
+};
+
+/**
+ * struct ubifs_lprops - logical eraseblock properties.
+ * @free: amount of free space in bytes
+ * @dirty: amount of dirty space in bytes
+ * @flags: LEB properties flags (see above)
+ * @lnum: LEB number
+ * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE)
+ * @hpos: heap position in heap of same-category lprops (other categories)
+ */
+struct ubifs_lprops {
+	int free;
+	int dirty;
+	int flags;
+	int lnum;
+	union {
+		struct list_head list;
+		int hpos;
+	};
+};
+
+/**
+ * struct ubifs_lpt_lprops - LPT logical eraseblock properties.
+ * @free: amount of free space in bytes
+ * @dirty: amount of dirty space in bytes
+ * @tgc: trivial GC flag (1 => unmap after commit end)
+ * @cmt: commit flag (1 => reserved for commit)
+ */
+struct ubifs_lpt_lprops {
+	int free;
+	int dirty;
+	unsigned tgc:1;
+	unsigned cmt:1;
+};
+
+/**
+ * struct ubifs_lp_stats - statistics of eraseblocks in the main area.
+ * @empty_lebs: number of empty LEBs
+ * @taken_empty_lebs: number of taken LEBs
+ * @idx_lebs: number of indexing LEBs
+ * @total_free: total free space in bytes (includes all LEBs)
+ * @total_dirty: total dirty space in bytes (includes all LEBs)
+ * @total_used: total used space in bytes (does not include index LEBs)
+ * @total_dead: total dead space in bytes (does not include index LEBs)
+ * @total_dark: total dark space in bytes (does not include index LEBs)
+ *
+ * The @taken_empty_lebs field counts the LEBs that are in the transient state
+ * of having been "taken" for use but not yet written to. @taken_empty_lebs is
+ * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be
+ * used by itself (in which case 'unused_lebs' would be a better name). In the
+ * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained
+ * by GC, but unlike other empty LEBs that are "taken", it may not be written
+ * straight away (i.e. before the next commit start or unmount), so either
+ * @gc_lnum must be specially accounted for, or the current approach followed
+ * i.e. count it under @taken_empty_lebs.
+ *
+ * @empty_lebs includes @taken_empty_lebs.
+ *
+ * @total_used, @total_dead and @total_dark fields do not account indexing
+ * LEBs.
+ */
+struct ubifs_lp_stats {
+	int empty_lebs;
+	int taken_empty_lebs;
+	int idx_lebs;
+	long long total_free;
+	long long total_dirty;
+	long long total_used;
+	long long total_dead;
+	long long total_dark;
+};
+
+struct ubifs_nnode;
+
+/**
+ * struct ubifs_cnode - LEB Properties Tree common node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE)
+ * @iip: index in parent
+ * @level: level in the tree (zero for pnodes, greater than zero for nnodes)
+ * @num: node number
+ */
+struct ubifs_cnode {
+	struct ubifs_nnode *parent;
+	struct ubifs_cnode *cnext;
+	unsigned long flags;
+	int iip;
+	int level;
+	int num;
+};
+
+/**
+ * struct ubifs_pnode - LEB Properties Tree leaf node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE)
+ * @iip: index in parent
+ * @level: level in the tree (always zero for pnodes)
+ * @num: node number
+ * @lprops: LEB properties array
+ */
+struct ubifs_pnode {
+	struct ubifs_nnode *parent;
+	struct ubifs_cnode *cnext;
+	unsigned long flags;
+	int iip;
+	int level;
+	int num;
+	struct ubifs_lprops lprops[UBIFS_LPT_FANOUT];
+};
+
+/**
+ * struct ubifs_nbranch - LEB Properties Tree internal node branch.
+ * @lnum: LEB number of child
+ * @offs: offset of child
+ * @nnode: nnode child
+ * @pnode: pnode child
+ * @cnode: cnode child
+ */
+struct ubifs_nbranch {
+	int lnum;
+	int offs;
+	union {
+		struct ubifs_nnode *nnode;
+		struct ubifs_pnode *pnode;
+		struct ubifs_cnode *cnode;
+	};
+};
+
+/**
+ * struct ubifs_nnode - LEB Properties Tree internal node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE)
+ * @iip: index in parent
+ * @level: level in the tree (always greater than zero for nnodes)
+ * @num: node number
+ * @nbranch: branches to child nodes
+ */
+struct ubifs_nnode {
+	struct ubifs_nnode *parent;
+	struct ubifs_cnode *cnext;
+	unsigned long flags;
+	int iip;
+	int level;
+	int num;
+	struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT];
+};
+
+/**
+ * struct ubifs_lpt_heap - heap of categorized lprops.
+ * @arr: heap array
+ * @cnt: number in heap
+ * @max_cnt: maximum number allowed in heap
+ *
+ * There are %LPROPS_HEAP_CNT heaps.
+ */
+struct ubifs_lpt_heap {
+	struct ubifs_lprops **arr;
+	int cnt;
+	int max_cnt;
+};
+
+/*
+ * Return codes for LPT scan callback function.
+ *
+ * LPT_SCAN_CONTINUE: continue scanning
+ * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory
+ * LPT_SCAN_STOP: stop scanning
+ */
+enum {
+	LPT_SCAN_CONTINUE = 0,
+	LPT_SCAN_ADD = 1,
+	LPT_SCAN_STOP = 2,
+};
+
+struct ubifs_info;
+
+/* Callback used by the 'ubifs_lpt_scan_nolock()' function */
+typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
+				       const struct ubifs_lprops *lprops,
+				       int in_tree, void *data);
+
+/**
+ * struct ubifs_wbuf - UBIFS write-buffer.
+ * @c: UBIFS file-system description object
+ * @buf: write-buffer (of min. flash I/O unit size)
+ * @lnum: logical eraseblock number the write-buffer points to
+ * @offs: write-buffer offset in this logical eraseblock
+ * @avail: number of bytes available in the write-buffer
+ * @used:  number of used bytes in the write-buffer
+ * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
+ * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
+ *         up by 'mutex_lock_nested()).
+ * @sync_callback: write-buffer synchronization callback
+ * @io_mutex: serializes write-buffer I/O
+ * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
+ *        fields
+ * @softlimit: soft write-buffer timeout interval
+ * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit
+ *         and @softlimit + @delta)
+ * @timer: write-buffer timer
+ * @no_timer: non-zero if this write-buffer does not have a timer
+ * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing
+ * @next_ino: points to the next position of the following inode number
+ * @inodes: stores the inode numbers of the nodes which are in wbuf
+ *
+ * The write-buffer synchronization callback is called when the write-buffer is
+ * synchronized in order to notify how much space was wasted due to
+ * write-buffer padding and how much free space is left in the LEB.
+ *
+ * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under
+ * spin-lock or mutex because they are written under both mutex and spin-lock.
+ * @buf is appended to under mutex but overwritten under both mutex and
+ * spin-lock. Thus the data between @buf and @buf + @used can be read under
+ * spinlock.
+ */
+struct ubifs_wbuf {
+	struct ubifs_info *c;
+	void *buf;
+	int lnum;
+	int offs;
+	int avail;
+	int used;
+	int size;
+	int jhead;
+	int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
+	struct mutex io_mutex;
+	spinlock_t lock;
+	ktime_t softlimit;
+	unsigned long long delta;
+	struct hrtimer timer;
+	unsigned int no_timer:1;
+	unsigned int need_sync:1;
+	int next_ino;
+	ino_t *inodes;
+};
+
+/**
+ * struct ubifs_bud - bud logical eraseblock.
+ * @lnum: logical eraseblock number
+ * @start: where the (uncommitted) bud data starts
+ * @jhead: journal head number this bud belongs to
+ * @list: link in the list buds belonging to the same journal head
+ * @rb: link in the tree of all buds
+ */
+struct ubifs_bud {
+	int lnum;
+	int start;
+	int jhead;
+	struct list_head list;
+	struct rb_node rb;
+};
+
+/**
+ * struct ubifs_jhead - journal head.
+ * @wbuf: head's write-buffer
+ * @buds_list: list of bud LEBs belonging to this journal head
+ * @grouped: non-zero if UBIFS groups nodes when writing to this journal head
+ *
+ * Note, the @buds list is protected by the @c->buds_lock.
+ */
+struct ubifs_jhead {
+	struct ubifs_wbuf wbuf;
+	struct list_head buds_list;
+	unsigned int grouped:1;
+};
+
+/**
+ * struct ubifs_zbranch - key/coordinate/length branch stored in znodes.
+ * @key: key
+ * @znode: znode address in memory
+ * @lnum: LEB number of the target node (indexing node or data node)
+ * @offs: target node offset within @lnum
+ * @len: target node length
+ */
+struct ubifs_zbranch {
+	union ubifs_key key;
+	union {
+		struct ubifs_znode *znode;
+		void *leaf;
+	};
+	int lnum;
+	int offs;
+	int len;
+};
+
+/**
+ * struct ubifs_znode - in-memory representation of an indexing node.
+ * @parent: parent znode or NULL if it is the root
+ * @cnext: next znode to commit
+ * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE)
+ * @time: last access time (seconds)
+ * @level: level of the entry in the TNC tree
+ * @child_cnt: count of child znodes
+ * @iip: index in parent's zbranch array
+ * @alt: lower bound of key range has altered i.e. child inserted at slot 0
+ * @lnum: LEB number of the corresponding indexing node
+ * @offs: offset of the corresponding indexing node
+ * @len: length  of the corresponding indexing node
+ * @zbranch: array of znode branches (@c->fanout elements)
+ *
+ * Note! The @lnum, @offs, and @len fields are not really needed - we have them
+ * only for internal consistency check. They could be removed to save some RAM.
+ */
+struct ubifs_znode {
+	struct ubifs_znode *parent;
+	struct ubifs_znode *cnext;
+	unsigned long flags;
+	unsigned long time;
+	int level;
+	int child_cnt;
+	int iip;
+	int alt;
+	int lnum;
+	int offs;
+	int len;
+	struct ubifs_zbranch zbranch[];
+};
+
+/**
+ * struct bu_info - bulk-read information.
+ * @key: first data node key
+ * @zbranch: zbranches of data nodes to bulk read
+ * @buf: buffer to read into
+ * @buf_len: buffer length
+ * @gc_seq: GC sequence number to detect races with GC
+ * @cnt: number of data nodes for bulk read
+ * @blk_cnt: number of data blocks including holes
+ * @oef: end of file reached
+ */
+struct bu_info {
+	union ubifs_key key;
+	struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ];
+	void *buf;
+	int buf_len;
+	int gc_seq;
+	int cnt;
+	int blk_cnt;
+	int eof;
+};
+
+/**
+ * struct ubifs_node_range - node length range description data structure.
+ * @len: fixed node length
+ * @min_len: minimum possible node length
+ * @max_len: maximum possible node length
+ *
+ * If @max_len is %0, the node has fixed length @len.
+ */
+struct ubifs_node_range {
+	union {
+		int len;
+		int min_len;
+	};
+	int max_len;
+};
+
+/**
+ * struct ubifs_compressor - UBIFS compressor description structure.
+ * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc)
+ * @cc: cryptoapi compressor handle
+ * @comp_mutex: mutex used during compression
+ * @decomp_mutex: mutex used during decompression
+ * @name: compressor name
+ * @capi_name: cryptoapi compressor name
+ */
+struct ubifs_compressor {
+	int compr_type;
+	struct crypto_comp *cc;
+	struct mutex *comp_mutex;
+	struct mutex *decomp_mutex;
+	const char *name;
+	const char *capi_name;
+};
+
+/**
+ * struct ubifs_budget_req - budget requirements of an operation.
+ *
+ * @fast: non-zero if the budgeting should try to acquire budget quickly and
+ *        should not try to call write-back
+ * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
+ *               have to be re-calculated
+ * @new_page: non-zero if the operation adds a new page
+ * @dirtied_page: non-zero if the operation makes a page dirty
+ * @new_dent: non-zero if the operation adds a new directory entry
+ * @mod_dent: non-zero if the operation removes or modifies an existing
+ *            directory entry
+ * @new_ino: non-zero if the operation adds a new inode
+ * @new_ino_d: now much data newly created inode contains
+ * @dirtied_ino: how many inodes the operation makes dirty
+ * @dirtied_ino_d: now much data dirtied inode contains
+ * @idx_growth: how much the index will supposedly grow
+ * @data_growth: how much new data the operation will supposedly add
+ * @dd_growth: how much data that makes other data dirty the operation will
+ *             supposedly add
+ *
+ * @idx_growth, @data_growth and @dd_growth are not used in budget request. The
+ * budgeting subsystem caches index and data growth values there to avoid
+ * re-calculating them when the budget is released. However, if @idx_growth is
+ * %-1, it is calculated by the release function using other fields.
+ *
+ * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d
+ * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made
+ * dirty by the re-name operation.
+ *
+ * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to
+ * make sure the amount of inode data which contribute to @new_ino_d and
+ * @dirtied_ino_d fields are aligned.
+ */
+struct ubifs_budget_req {
+	unsigned int fast:1;
+	unsigned int recalculate:1;
+#ifndef UBIFS_DEBUG
+	unsigned int new_page:1;
+	unsigned int dirtied_page:1;
+	unsigned int new_dent:1;
+	unsigned int mod_dent:1;
+	unsigned int new_ino:1;
+	unsigned int new_ino_d:13;
+	unsigned int dirtied_ino:4;
+	unsigned int dirtied_ino_d:15;
+#else
+	/* Not bit-fields to check for overflows */
+	unsigned int new_page;
+	unsigned int dirtied_page;
+	unsigned int new_dent;
+	unsigned int mod_dent;
+	unsigned int new_ino;
+	unsigned int new_ino_d;
+	unsigned int dirtied_ino;
+	unsigned int dirtied_ino_d;
+#endif
+	int idx_growth;
+	int data_growth;
+	int dd_growth;
+};
+
+/**
+ * struct ubifs_orphan - stores the inode number of an orphan.
+ * @rb: rb-tree node of rb-tree of orphans sorted by inode number
+ * @list: list head of list of orphans in order added
+ * @new_list: list head of list of orphans added since the last commit
+ * @cnext: next orphan to commit
+ * @dnext: next orphan to delete
+ * @inum: inode number
+ * @new: %1 => added since the last commit, otherwise %0
+ * @cmt: %1 => commit pending, otherwise %0
+ * @del: %1 => delete pending, otherwise %0
+ */
+struct ubifs_orphan {
+	struct rb_node rb;
+	struct list_head list;
+	struct list_head new_list;
+	struct ubifs_orphan *cnext;
+	struct ubifs_orphan *dnext;
+	ino_t inum;
+	unsigned new:1;
+	unsigned cmt:1;
+	unsigned del:1;
+};
+
+/**
+ * struct ubifs_mount_opts - UBIFS-specific mount options information.
+ * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
+ * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable)
+ * @chk_data_crc: enable/disable CRC data checking when reading data nodes
+ *                (%0 default, %1 disable, %2 enable)
+ * @override_compr: override default compressor (%0 - do not override and use
+ *                  superblock compressor, %1 - override and use compressor
+ *                  specified in @compr_type)
+ * @compr_type: compressor type to override the superblock compressor with
+ *              (%UBIFS_COMPR_NONE, etc)
+ */
+struct ubifs_mount_opts {
+	unsigned int unmount_mode:2;
+	unsigned int bulk_read:2;
+	unsigned int chk_data_crc:2;
+	unsigned int override_compr:1;
+	unsigned int compr_type:2;
+};
+
+/**
+ * struct ubifs_budg_info - UBIFS budgeting information.
+ * @idx_growth: amount of bytes budgeted for index growth
+ * @data_growth: amount of bytes budgeted for cached data
+ * @dd_growth: amount of bytes budgeted for cached data that will make
+ *             other data dirty
+ * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but
+ *                   which still have to be taken into account because the index
+ *                   has not been committed so far
+ * @old_idx_sz: size of index on flash
+ * @min_idx_lebs: minimum number of LEBs required for the index
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ *           optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ *              pool is full
+ * @page_budget: budget for a page (constant, never changed after mount)
+ * @inode_budget: budget for an inode (constant, never changed after mount)
+ * @dent_budget: budget for a directory entry (constant, never changed after
+ *               mount)
+ */
+struct ubifs_budg_info {
+	long long idx_growth;
+	long long data_growth;
+	long long dd_growth;
+	long long uncommitted_idx;
+	unsigned long long old_idx_sz;
+	int min_idx_lebs;
+	unsigned int nospace:1;
+	unsigned int nospace_rp:1;
+	int page_budget;
+	int inode_budget;
+	int dent_budget;
+};
+
+struct ubifs_debug_info;
+
+/**
+ * struct ubifs_info - UBIFS file-system description data structure
+ * (per-superblock).
+ * @vfs_sb: VFS @struct super_block object
+ * @bdi: backing device info object to make VFS happy and disable read-ahead
+ *
+ * @highest_inum: highest used inode number
+ * @max_sqnum: current global sequence number
+ * @cmt_no: commit number of the last successfully completed commit, protected
+ *          by @commit_sem
+ * @cnt_lock: protects @highest_inum and @max_sqnum counters
+ * @fmt_version: UBIFS on-flash format version
+ * @ro_compat_version: R/O compatibility version
+ * @uuid: UUID from super block
+ *
+ * @lhead_lnum: log head logical eraseblock number
+ * @lhead_offs: log head offset
+ * @ltail_lnum: log tail logical eraseblock number (offset is always 0)
+ * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and
+ *             @bud_bytes
+ * @min_log_bytes: minimum required number of bytes in the log
+ * @cmt_bud_bytes: used during commit to temporarily amount of bytes in
+ *                 committed buds
+ *
+ * @buds: tree of all buds indexed by bud LEB number
+ * @bud_bytes: how many bytes of flash is used by buds
+ * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud
+ *             lists
+ * @jhead_cnt: count of journal heads
+ * @jheads: journal heads (head zero is base head)
+ * @max_bud_bytes: maximum number of bytes allowed in buds
+ * @bg_bud_bytes: number of bud bytes when background commit is initiated
+ * @old_buds: buds to be released after commit ends
+ * @max_bud_cnt: maximum number of buds
+ *
+ * @commit_sem: synchronizes committer with other processes
+ * @cmt_state: commit state
+ * @cs_lock: commit state lock
+ * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
+ *
+ * @big_lpt: flag that LPT is too big to write whole during commit
+ * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
+ * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
+ *                   recovery)
+ * @bulk_read: enable bulk-reads
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ * @rw_incompat: the media is not R/W compatible
+ *
+ * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
+ *             @calc_idx_sz
+ * @zroot: zbranch which points to the root index node and znode
+ * @cnext: next znode to commit
+ * @enext: next znode to commit to empty space
+ * @gap_lebs: array of LEBs used by the in-gaps commit method
+ * @cbuf: commit buffer
+ * @ileb_buf: buffer for commit in-the-gaps method
+ * @ileb_len: length of data in ileb_buf
+ * @ihead_lnum: LEB number of index head
+ * @ihead_offs: offset of index head
+ * @ilebs: pre-allocated index LEBs
+ * @ileb_cnt: number of pre-allocated index LEBs
+ * @ileb_nxt: next pre-allocated index LEBs
+ * @old_idx: tree of index nodes obsoleted since the last commit start
+ * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
+ *
+ * @mst_node: master node
+ * @mst_offs: offset of valid master node
+ *
+ * @max_bu_buf_len: maximum bulk-read buffer length
+ * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
+ * @bu: pre-allocated bulk-read information
+ *
+ * @write_reserve_mutex: protects @write_reserve_buf
+ * @write_reserve_buf: on the write path we allocate memory, which might
+ *                     sometimes be unavailable, in which case we use this
+ *                     write reserve buffer
+ *
+ * @log_lebs: number of logical eraseblocks in the log
+ * @log_bytes: log size in bytes
+ * @log_last: last LEB of the log
+ * @lpt_lebs: number of LEBs used for lprops table
+ * @lpt_first: first LEB of the lprops table area
+ * @lpt_last: last LEB of the lprops table area
+ * @orph_lebs: number of LEBs used for the orphan area
+ * @orph_first: first LEB of the orphan area
+ * @orph_last: last LEB of the orphan area
+ * @main_lebs: count of LEBs in the main area
+ * @main_first: first LEB of the main area
+ * @main_bytes: main area size in bytes
+ *
+ * @key_hash_type: type of the key hash
+ * @key_hash: direntry key hash function
+ * @key_fmt: key format
+ * @key_len: key length
+ * @fanout: fanout of the index tree (number of links per indexing node)
+ *
+ * @min_io_size: minimal input/output unit size
+ * @min_io_shift: number of bits in @min_io_size minus one
+ * @max_write_size: maximum amount of bytes the underlying flash can write at a
+ *                  time (MTD write buffer size)
+ * @max_write_shift: number of bits in @max_write_size minus one
+ * @leb_size: logical eraseblock size in bytes
+ * @leb_start: starting offset of logical eraseblocks within physical
+ *             eraseblocks
+ * @half_leb_size: half LEB size
+ * @idx_leb_size: how many bytes of an LEB are effectively available when it is
+ *                used to store indexing nodes (@leb_size - @max_idx_node_sz)
+ * @leb_cnt: count of logical eraseblocks
+ * @max_leb_cnt: maximum count of logical eraseblocks
+ * @old_leb_cnt: count of logical eraseblocks before re-size
+ * @ro_media: the underlying UBI volume is read-only
+ * @ro_mount: the file-system was mounted as read-only
+ * @ro_error: UBIFS switched to R/O mode because an error happened
+ *
+ * @dirty_pg_cnt: number of dirty pages (not used)
+ * @dirty_zn_cnt: number of dirty znodes
+ * @clean_zn_cnt: number of clean znodes
+ *
+ * @space_lock: protects @bi and @lst
+ * @lst: lprops statistics
+ * @bi: budgeting information
+ * @calc_idx_sz: temporary variable which is used to calculate new index size
+ *               (contains accurate new index size at end of TNC commit start)
+ *
+ * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
+ *                 I/O unit
+ * @mst_node_alsz: master node aligned size
+ * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
+ * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
+ * @max_inode_sz: maximum possible inode size in bytes
+ * @max_znode_sz: size of znode in bytes
+ *
+ * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
+ *                data nodes of maximum size - used in free space reporting
+ * @dead_wm: LEB dead space watermark
+ * @dark_wm: LEB dark space watermark
+ * @block_cnt: count of 4KiB blocks on the FS
+ *
+ * @ranges: UBIFS node length ranges
+ * @ubi: UBI volume descriptor
+ * @di: UBI device information
+ * @vi: UBI volume information
+ *
+ * @orph_tree: rb-tree of orphan inode numbers
+ * @orph_list: list of orphan inode numbers in order added
+ * @orph_new: list of orphan inode numbers added since last commit
+ * @orph_cnext: next orphan to commit
+ * @orph_dnext: next orphan to delete
+ * @orphan_lock: lock for orph_tree and orph_new
+ * @orph_buf: buffer for orphan nodes
+ * @new_orphans: number of orphans since last commit
+ * @cmt_orphans: number of orphans being committed
+ * @tot_orphans: number of orphans in the rb_tree
+ * @max_orphans: maximum number of orphans allowed
+ * @ohead_lnum: orphan head LEB number
+ * @ohead_offs: orphan head offset
+ * @no_orphs: non-zero if there are no orphans
+ *
+ * @bgt: UBIFS background thread
+ * @bgt_name: background thread name
+ * @need_bgt: if background thread should run
+ * @need_wbuf_sync: if write-buffers have to be synchronized
+ *
+ * @gc_lnum: LEB number used for garbage collection
+ * @sbuf: a buffer of LEB size used by GC and replay for scanning
+ * @idx_gc: list of index LEBs that have been garbage collected
+ * @idx_gc_cnt: number of elements on the idx_gc list
+ * @gc_seq: incremented for every non-index LEB garbage collected
+ * @gced_lnum: last non-index LEB that was garbage collected
+ *
+ * @infos_list: links all 'ubifs_info' objects
+ * @umount_mutex: serializes shrinker and un-mount
+ * @shrinker_run_no: shrinker run number
+ *
+ * @space_bits: number of bits needed to record free or dirty space
+ * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT
+ * @lpt_offs_bits: number of bits needed to record an offset in the LPT
+ * @lpt_spc_bits: number of bits needed to space in the LPT
+ * @pcnt_bits: number of bits needed to record pnode or nnode number
+ * @lnum_bits: number of bits needed to record LEB number
+ * @nnode_sz: size of on-flash nnode
+ * @pnode_sz: size of on-flash pnode
+ * @ltab_sz: size of on-flash LPT lprops table
+ * @lsave_sz: size of on-flash LPT save table
+ * @pnode_cnt: number of pnodes
+ * @nnode_cnt: number of nnodes
+ * @lpt_hght: height of the LPT
+ * @pnodes_have: number of pnodes in memory
+ *
+ * @lp_mutex: protects lprops table and all the other lprops-related fields
+ * @lpt_lnum: LEB number of the root nnode of the LPT
+ * @lpt_offs: offset of the root nnode of the LPT
+ * @nhead_lnum: LEB number of LPT head
+ * @nhead_offs: offset of LPT head
+ * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab
+ * @dirty_nn_cnt: number of dirty nnodes
+ * @dirty_pn_cnt: number of dirty pnodes
+ * @check_lpt_free: flag that indicates LPT GC may be needed
+ * @lpt_sz: LPT size
+ * @lpt_nod_buf: buffer for an on-flash nnode or pnode
+ * @lpt_buf: buffer of LEB size used by LPT
+ * @nroot: address in memory of the root nnode of the LPT
+ * @lpt_cnext: next LPT node to commit
+ * @lpt_heap: array of heaps of categorized lprops
+ * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at
+ *             previous commit start
+ * @uncat_list: list of un-categorized LEBs
+ * @empty_list: list of empty LEBs
+ * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
+ * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
+ * @freeable_cnt: number of freeable LEBs in @freeable_list
+ * @in_a_category_cnt: count of lprops which are in a certain category, which
+ *                     basically meants that they were loaded from the flash
+ *
+ * @ltab_lnum: LEB number of LPT's own lprops table
+ * @ltab_offs: offset of LPT's own lprops table
+ * @ltab: LPT's own lprops table
+ * @ltab_cmt: LPT's own lprops table (commit copy)
+ * @lsave_cnt: number of LEB numbers in LPT's save table
+ * @lsave_lnum: LEB number of LPT's save table
+ * @lsave_offs: offset of LPT's save table
+ * @lsave: LPT's save table
+ * @lscan_lnum: LEB number of last LPT scan
+ *
+ * @rp_size: size of the reserved pool in bytes
+ * @report_rp_size: size of the reserved pool reported to user-space
+ * @rp_uid: reserved pool user ID
+ * @rp_gid: reserved pool group ID
+ *
+ * @empty: %1 if the UBI device is empty
+ * @need_recovery: %1 if the file-system needs recovery
+ * @replaying: %1 during journal replay
+ * @mounting: %1 while mounting
+ * @probing: %1 while attempting to mount if MS_SILENT mount flag is set
+ * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
+ * @replay_list: temporary list used during journal replay
+ * @replay_buds: list of buds to replay
+ * @cs_sqnum: sequence number of first node in the log (commit start node)
+ * @replay_sqnum: sequence number of node currently being replayed
+ * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
+ *                    mode
+ * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
+ *                  FS to R/W mode
+ * @size_tree: inode size information for recovery
+ * @mount_opts: UBIFS-specific mount options
+ *
+ * @dbg: debugging-related information
+ */
+struct ubifs_info {
+	struct super_block *vfs_sb;
+	struct backing_dev_info bdi;
+
+	ino_t highest_inum;
+	unsigned long long max_sqnum;
+	unsigned long long cmt_no;
+	spinlock_t cnt_lock;
+	int fmt_version;
+	int ro_compat_version;
+	unsigned char uuid[16];
+
+	int lhead_lnum;
+	int lhead_offs;
+	int ltail_lnum;
+	struct mutex log_mutex;
+	int min_log_bytes;
+	long long cmt_bud_bytes;
+
+	struct rb_root buds;
+	long long bud_bytes;
+	spinlock_t buds_lock;
+	int jhead_cnt;
+	struct ubifs_jhead *jheads;
+	long long max_bud_bytes;
+	long long bg_bud_bytes;
+	struct list_head old_buds;
+	int max_bud_cnt;
+
+	struct rw_semaphore commit_sem;
+	int cmt_state;
+	spinlock_t cs_lock;
+	wait_queue_head_t cmt_wq;
+
+	unsigned int big_lpt:1;
+	unsigned int space_fixup:1;
+	unsigned int no_chk_data_crc:1;
+	unsigned int bulk_read:1;
+	unsigned int default_compr:2;
+	unsigned int rw_incompat:1;
+
+	struct mutex tnc_mutex;
+	struct ubifs_zbranch zroot;
+	struct ubifs_znode *cnext;
+	struct ubifs_znode *enext;
+	int *gap_lebs;
+	void *cbuf;
+	void *ileb_buf;
+	int ileb_len;
+	int ihead_lnum;
+	int ihead_offs;
+	int *ilebs;
+	int ileb_cnt;
+	int ileb_nxt;
+	struct rb_root old_idx;
+	int *bottom_up_buf;
+
+	struct ubifs_mst_node *mst_node;
+	int mst_offs;
+
+	int max_bu_buf_len;
+	struct mutex bu_mutex;
+	struct bu_info bu;
+
+	struct mutex write_reserve_mutex;
+	void *write_reserve_buf;
+
+	int log_lebs;
+	long long log_bytes;
+	int log_last;
+	int lpt_lebs;
+	int lpt_first;
+	int lpt_last;
+	int orph_lebs;
+	int orph_first;
+	int orph_last;
+	int main_lebs;
+	int main_first;
+	long long main_bytes;
+
+	uint8_t key_hash_type;
+	uint32_t (*key_hash)(const char *str, int len);
+	int key_fmt;
+	int key_len;
+	int fanout;
+
+	int min_io_size;
+	int min_io_shift;
+	int max_write_size;
+	int max_write_shift;
+	int leb_size;
+	int leb_start;
+	int half_leb_size;
+	int idx_leb_size;
+	int leb_cnt;
+	int max_leb_cnt;
+	int old_leb_cnt;
+	unsigned int ro_media:1;
+	unsigned int ro_mount:1;
+	unsigned int ro_error:1;
+
+	atomic_long_t dirty_pg_cnt;
+	atomic_long_t dirty_zn_cnt;
+	atomic_long_t clean_zn_cnt;
+
+	spinlock_t space_lock;
+	struct ubifs_lp_stats lst;
+	struct ubifs_budg_info bi;
+	unsigned long long calc_idx_sz;
+
+	int ref_node_alsz;
+	int mst_node_alsz;
+	int min_idx_node_sz;
+	int max_idx_node_sz;
+	long long max_inode_sz;
+	int max_znode_sz;
+
+	int leb_overhead;
+	int dead_wm;
+	int dark_wm;
+	int block_cnt;
+
+	struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT];
+	struct ubi_volume_desc *ubi;
+	struct ubi_device_info di;
+	struct ubi_volume_info vi;
+
+	struct rb_root orph_tree;
+	struct list_head orph_list;
+	struct list_head orph_new;
+	struct ubifs_orphan *orph_cnext;
+	struct ubifs_orphan *orph_dnext;
+	spinlock_t orphan_lock;
+	void *orph_buf;
+	int new_orphans;
+	int cmt_orphans;
+	int tot_orphans;
+	int max_orphans;
+	int ohead_lnum;
+	int ohead_offs;
+	int no_orphs;
+
+	struct task_struct *bgt;
+	char bgt_name[sizeof(BGT_NAME_PATTERN) + 9];
+	int need_bgt;
+	int need_wbuf_sync;
+
+	int gc_lnum;
+	void *sbuf;
+	struct list_head idx_gc;
+	int idx_gc_cnt;
+	int gc_seq;
+	int gced_lnum;
+
+	struct list_head infos_list;
+	struct mutex umount_mutex;
+	unsigned int shrinker_run_no;
+
+	int space_bits;
+	int lpt_lnum_bits;
+	int lpt_offs_bits;
+	int lpt_spc_bits;
+	int pcnt_bits;
+	int lnum_bits;
+	int nnode_sz;
+	int pnode_sz;
+	int ltab_sz;
+	int lsave_sz;
+	int pnode_cnt;
+	int nnode_cnt;
+	int lpt_hght;
+	int pnodes_have;
+
+	struct mutex lp_mutex;
+	int lpt_lnum;
+	int lpt_offs;
+	int nhead_lnum;
+	int nhead_offs;
+	int lpt_drty_flgs;
+	int dirty_nn_cnt;
+	int dirty_pn_cnt;
+	int check_lpt_free;
+	long long lpt_sz;
+	void *lpt_nod_buf;
+	void *lpt_buf;
+	struct ubifs_nnode *nroot;
+	struct ubifs_cnode *lpt_cnext;
+	struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT];
+	struct ubifs_lpt_heap dirty_idx;
+	struct list_head uncat_list;
+	struct list_head empty_list;
+	struct list_head freeable_list;
+	struct list_head frdi_idx_list;
+	int freeable_cnt;
+	int in_a_category_cnt;
+
+	int ltab_lnum;
+	int ltab_offs;
+	struct ubifs_lpt_lprops *ltab;
+	struct ubifs_lpt_lprops *ltab_cmt;
+	int lsave_cnt;
+	int lsave_lnum;
+	int lsave_offs;
+	int *lsave;
+	int lscan_lnum;
+
+	long long rp_size;
+	long long report_rp_size;
+	kuid_t rp_uid;
+	kgid_t rp_gid;
+
+	/* The below fields are used only during mounting and re-mounting */
+	unsigned int empty:1;
+	unsigned int need_recovery:1;
+	unsigned int replaying:1;
+	unsigned int mounting:1;
+	unsigned int remounting_rw:1;
+	unsigned int probing:1;
+	struct list_head replay_list;
+	struct list_head replay_buds;
+	unsigned long long cs_sqnum;
+	unsigned long long replay_sqnum;
+	struct list_head unclean_leb_list;
+	struct ubifs_mst_node *rcvrd_mst_node;
+	struct rb_root size_tree;
+	struct ubifs_mount_opts mount_opts;
+
+	struct ubifs_debug_info *dbg;
+};
+
+extern struct list_head ubifs_infos;
+extern spinlock_t ubifs_infos_lock;
+extern atomic_long_t ubifs_clean_zn_cnt;
+extern struct kmem_cache *ubifs_inode_slab;
+extern const struct super_operations ubifs_super_operations;
+extern const struct xattr_handler *ubifs_xattr_handlers[];
+extern const struct address_space_operations ubifs_file_address_operations;
+extern const struct file_operations ubifs_file_operations;
+extern const struct inode_operations ubifs_file_inode_operations;
+extern const struct file_operations ubifs_dir_operations;
+extern const struct inode_operations ubifs_dir_inode_operations;
+extern const struct inode_operations ubifs_symlink_inode_operations;
+extern struct backing_dev_info ubifs_backing_dev_info;
+extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+
+/* io.c */
+void ubifs_ro_mode(struct ubifs_info *c, int err);
+int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
+		   int len, int even_ebadmsg);
+int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
+		    int len);
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len);
+int ubifs_leb_unmap(struct ubifs_info *c, int lnum);
+int ubifs_leb_map(struct ubifs_info *c, int lnum);
+int ubifs_is_mapped(const struct ubifs_info *c, int lnum);
+int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs);
+int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf);
+int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
+		    int lnum, int offs);
+int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
+			 int lnum, int offs);
+int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
+		     int offs);
+int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
+		     int offs, int quiet, int must_chk_crc);
+void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
+void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
+int ubifs_io_init(struct ubifs_info *c);
+void ubifs_pad(const struct ubifs_info *c, void *buf, int pad);
+int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf);
+int ubifs_bg_wbufs_sync(struct ubifs_info *c);
+void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum);
+int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
+
+/* scan.c */
+struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
+				  int offs, void *sbuf, int quiet);
+void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
+int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
+		      int offs, int quiet);
+struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
+					int offs, void *sbuf);
+void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		    int lnum, int offs);
+int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		   void *buf, int offs);
+void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
+			      void *buf);
+
+/* log.c */
+void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud);
+void ubifs_create_buds_lists(struct ubifs_info *c);
+int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs);
+struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum);
+struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum);
+int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum);
+int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum);
+int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum);
+int ubifs_consolidate_log(struct ubifs_info *c);
+
+/* journal.c */
+int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
+		     const struct qstr *nm, const struct inode *inode,
+		     int deletion, int xent);
+int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
+			 const union ubifs_key *key, const void *buf, int len);
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
+int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
+int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
+		     const struct dentry *old_dentry,
+		     const struct inode *new_dir,
+		     const struct dentry *new_dentry, int sync);
+int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
+		       loff_t old_size, loff_t new_size);
+int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
+			   const struct inode *inode, const struct qstr *nm);
+int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1,
+			   const struct inode *inode2);
+
+/* budget.c */
+int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req);
+void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req);
+void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
+				      struct ubifs_inode *ui);
+int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode,
+			  struct ubifs_budget_req *req);
+void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
+				struct ubifs_budget_req *req);
+void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
+			 struct ubifs_budget_req *req);
+long long ubifs_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space_nolock(struct ubifs_info *c);
+int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
+void ubifs_convert_page_budget(struct ubifs_info *c);
+long long ubifs_reported_space(const struct ubifs_info *c, long long free);
+long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
+
+/* find.c */
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
+			  int squeeze);
+int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
+int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
+			 int min_space, int pick_free);
+int ubifs_find_dirty_idx_leb(struct ubifs_info *c);
+int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
+
+/* tnc.c */
+int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
+			struct ubifs_znode **zn, int *n);
+int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+			void *node, const struct qstr *nm);
+int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
+		     void *node, int *lnum, int *offs);
+int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
+		  int offs, int len);
+int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
+		      int old_lnum, int old_offs, int lnum, int offs, int len);
+int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
+		     int lnum, int offs, int len, const struct qstr *nm);
+int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
+int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
+			const struct qstr *nm);
+int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
+			   union ubifs_key *to_key);
+int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum);
+struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
+					   union ubifs_key *key,
+					   const struct qstr *nm);
+void ubifs_tnc_close(struct ubifs_info *c);
+int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
+		       int lnum, int offs, int is_idx);
+int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
+			 int lnum, int offs);
+/* Shared by tnc.c for tnc_commit.c */
+void destroy_old_idx(struct ubifs_info *c);
+int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
+		       int lnum, int offs);
+int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode);
+int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu);
+int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu);
+
+/* tnc_misc.c */
+struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
+					      struct ubifs_znode *znode);
+int ubifs_search_zbranch(const struct ubifs_info *c,
+			 const struct ubifs_znode *znode,
+			 const union ubifs_key *key, int *n);
+struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode);
+struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode);
+long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr);
+struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
+				     struct ubifs_zbranch *zbr,
+				     struct ubifs_znode *parent, int iip);
+int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			void *node);
+
+/* tnc_commit.c */
+int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+int ubifs_tnc_end_commit(struct ubifs_info *c);
+
+/* shrinker.c */
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc);
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc);
+
+/* commit.c */
+int ubifs_bg_thread(void *info);
+void ubifs_commit_required(struct ubifs_info *c);
+void ubifs_request_bg_commit(struct ubifs_info *c);
+int ubifs_run_commit(struct ubifs_info *c);
+void ubifs_recovery_commit(struct ubifs_info *c);
+int ubifs_gc_should_commit(struct ubifs_info *c);
+void ubifs_wait_for_commit(struct ubifs_info *c);
+
+/* master.c */
+int ubifs_read_master(struct ubifs_info *c);
+int ubifs_write_master(struct ubifs_info *c);
+
+/* sb.c */
+int ubifs_read_superblock(struct ubifs_info *c);
+struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
+int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
+int ubifs_fixup_free_space(struct ubifs_info *c);
+
+/* replay.c */
+int ubifs_validate_entry(struct ubifs_info *c,
+			 const struct ubifs_dent_node *dent);
+int ubifs_replay_journal(struct ubifs_info *c);
+
+/* gc.c */
+int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
+int ubifs_gc_start_commit(struct ubifs_info *c);
+int ubifs_gc_end_commit(struct ubifs_info *c);
+void ubifs_destroy_idx_gc(struct ubifs_info *c);
+int ubifs_get_idx_gc_leb(struct ubifs_info *c);
+int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
+
+/* orphan.c */
+int ubifs_add_orphan(struct ubifs_info *c, ino_t inum);
+void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
+int ubifs_orphan_start_commit(struct ubifs_info *c);
+int ubifs_orphan_end_commit(struct ubifs_info *c);
+int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
+int ubifs_clear_orphans(struct ubifs_info *c);
+
+/* lpt.c */
+int ubifs_calc_lpt_geom(struct ubifs_info *c);
+int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
+			  int *lpt_lebs, int *big_lpt);
+int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr);
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum);
+struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum);
+int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
+			  ubifs_lpt_scan_callback scan_cb, void *data);
+
+/* Shared by lpt.c for lpt_commit.c */
+void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave);
+void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
+		     struct ubifs_lpt_lprops *ltab);
+void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
+		      struct ubifs_pnode *pnode);
+void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
+		      struct ubifs_nnode *nnode);
+struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
+				    struct ubifs_nnode *parent, int iip);
+struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
+				    struct ubifs_nnode *parent, int iip);
+int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip);
+void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
+void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
+uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
+struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
+/* Needed only in debugging code in lpt_commit.c */
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
+		       struct ubifs_nnode *nnode);
+
+/* lpt_commit.c */
+int ubifs_lpt_start_commit(struct ubifs_info *c);
+int ubifs_lpt_end_commit(struct ubifs_info *c);
+int ubifs_lpt_post_commit(struct ubifs_info *c);
+void ubifs_lpt_free(struct ubifs_info *c, int wr_only);
+
+/* lprops.c */
+const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
+					   const struct ubifs_lprops *lp,
+					   int free, int dirty, int flags,
+					   int idx_gc_cnt);
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
+void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
+		      int cat);
+void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
+		       struct ubifs_lprops *new_lprops);
+void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops);
+int ubifs_categorize_lprops(const struct ubifs_info *c,
+			    const struct ubifs_lprops *lprops);
+int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+			int flags_set, int flags_clean, int idx_gc_cnt);
+int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+			int flags_set, int flags_clean);
+int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp);
+const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
+int ubifs_calc_dark(const struct ubifs_info *c, int spc);
+
+/* file.c */
+int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
+int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+
+/* dir.c */
+struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+			      umode_t mode);
+int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat);
+
+/* xattr.c */
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+		   const void *value, size_t size, int flags);
+ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
+		       size_t size);
+ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int ubifs_removexattr(struct dentry *dentry, const char *name);
+int ubifs_init_security(struct inode *dentry, struct inode *inode,
+			const struct qstr *qstr);
+
+/* super.c */
+struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
+
+/* recovery.c */
+int ubifs_recover_master_node(struct ubifs_info *c);
+int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
+struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
+					 int offs, void *sbuf, int jhead);
+struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
+					     int offs, void *sbuf);
+int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf);
+int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf);
+int ubifs_rcvry_gc_commit(struct ubifs_info *c);
+int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
+			     int deletion, loff_t new_size);
+int ubifs_recover_size(struct ubifs_info *c);
+void ubifs_destroy_size_tree(struct ubifs_info *c);
+
+/* ioctl.c */
+long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+void ubifs_set_inode_flags(struct inode *inode);
+#ifdef CONFIG_COMPAT
+long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+#endif
+
+/* compressor.c */
+int __init ubifs_compressors_init(void);
+void ubifs_compressors_exit(void);
+void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len,
+		    void *out_buf, int *out_len, int *compr_type);
+int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
+		     void *out, int *out_len, int compr_type);
+
+#include "debug.h"
+#include "misc.h"
+#include "key.h"
+
+#endif /* !__UBIFS_H__ */
diff --git a/kernel/fs/ubifs/xattr.c b/kernel/fs/ubifs/xattr.c
new file mode 100644
index 000000000..96f3448b6
--- /dev/null
+++ b/kernel/fs/ubifs/xattr.c
@@ -0,0 +1,666 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS extended attributes support.
+ *
+ * Extended attributes are implemented as regular inodes with attached data,
+ * which limits extended attribute size to UBIFS block size (4KiB). Names of
+ * extended attributes are described by extended attribute entries (xentries),
+ * which are almost identical to directory entries, but have different key type.
+ *
+ * In other words, the situation with extended attributes is very similar to
+ * directories. Indeed, any inode (but of course not xattr inodes) may have a
+ * number of associated xentries, just like directory inodes have associated
+ * directory entries. Extended attribute entries store the name of the extended
+ * attribute, the host inode number, and the extended attribute inode number.
+ * Similarly, direntries store the name, the parent and the target inode
+ * numbers. Thus, most of the common UBIFS mechanisms may be re-used for
+ * extended attributes.
+ *
+ * The number of extended attributes is not limited, but there is Linux
+ * limitation on the maximum possible size of the list of all extended
+ * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure
+ * the sum of all extended attribute names of the inode does not exceed that
+ * limit.
+ *
+ * Extended attributes are synchronous, which means they are written to the
+ * flash media synchronously and there is no write-back for extended attribute
+ * inodes. The extended attribute values are not stored in compressed form on
+ * the media.
+ *
+ * Since extended attributes are represented by regular inodes, they are cached
+ * in the VFS inode cache. The xentries are cached in the LNC cache (see
+ * tnc.c).
+ *
+ * ACL support is not implemented.
+ */
+
+#include "ubifs.h"
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+/*
+ * Limit the number of extended attributes per inode so that the total size
+ * (@xattr_size) is guaranteeded to fit in an 'unsigned int'.
+ */
+#define MAX_XATTRS_PER_INODE 65535
+
+/*
+ * Extended attribute type constants.
+ *
+ * USER_XATTR: user extended attribute ("user.*")
+ * TRUSTED_XATTR: trusted extended attribute ("trusted.*)
+ * SECURITY_XATTR: security extended attribute ("security.*")
+ */
+enum {
+	USER_XATTR,
+	TRUSTED_XATTR,
+	SECURITY_XATTR,
+};
+
+static const struct inode_operations empty_iops;
+static const struct file_operations empty_fops;
+
+/**
+ * create_xattr - create an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @nm: extended attribute name
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This is a helper function which creates an extended attribute of name @nm
+ * and value @value for inode @host. The host inode is also updated on flash
+ * because the ctime and extended attribute accounting data changes. This
+ * function returns zero in case of success and a negative error code in case
+ * of failure.
+ */
+static int create_xattr(struct ubifs_info *c, struct inode *host,
+			const struct qstr *nm, const void *value, int size)
+{
+	int err, names_len;
+	struct inode *inode;
+	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
+				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
+
+	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) {
+		ubifs_err(c, "inode %lu already has too many xattrs (%d), cannot create more",
+			  host->i_ino, host_ui->xattr_cnt);
+		return -ENOSPC;
+	}
+	/*
+	 * Linux limits the maximum size of the extended attribute names list
+	 * to %XATTR_LIST_MAX. This means we should not allow creating more
+	 * extended attributes if the name list becomes larger. This limitation
+	 * is artificial for UBIFS, though.
+	 */
+	names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
+	if (names_len > XATTR_LIST_MAX) {
+		ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
+			  host->i_ino, names_len, XATTR_LIST_MAX);
+		return -ENOSPC;
+	}
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	/* Re-define all operations to be "nothing" */
+	inode->i_mapping->a_ops = &empty_aops;
+	inode->i_op = &empty_iops;
+	inode->i_fop = &empty_fops;
+
+	inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
+	ui = ubifs_inode(inode);
+	ui->xattr = 1;
+	ui->flags |= UBIFS_XATTR_FL;
+	ui->data = kmemdup(value, size, GFP_NOFS);
+	if (!ui->data) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+	inode->i_size = ui->ui_size = size;
+	ui->data_len = size;
+
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_cnt += 1;
+	host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(size);
+	host_ui->xattr_names += nm->len;
+
+	err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	iput(inode);
+	return 0;
+
+out_cancel:
+	host_ui->xattr_cnt -= 1;
+	host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+	mutex_unlock(&host_ui->ui_mutex);
+out_free:
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * change_xattr - change an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This helper function changes the value of extended attribute @inode with new
+ * data from @value. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int change_xattr(struct ubifs_info *c, struct inode *host,
+			struct inode *inode, const void *value, int size)
+{
+	int err;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_budget_req req = { .dirtied_ino = 2,
+		.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
+
+	ubifs_assert(ui->data_len == inode->i_size);
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	kfree(ui->data);
+	ui->data = kmemdup(value, size, GFP_NOFS);
+	if (!ui->data) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+	inode->i_size = ui->ui_size = size;
+	ui->data_len = size;
+
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(size);
+
+	/*
+	 * It is important to write the host inode after the xattr inode
+	 * because if the host inode gets synchronized (via 'fsync()'), then
+	 * the extended attribute inode gets synchronized, because it goes
+	 * before the host inode in the write-buffer.
+	 */
+	err = ubifs_jnl_change_xattr(c, inode, host);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	return 0;
+
+out_cancel:
+	host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+	host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
+	mutex_unlock(&host_ui->ui_mutex);
+	make_bad_inode(inode);
+out_free:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * check_namespace - check extended attribute name-space.
+ * @nm: extended attribute name
+ *
+ * This function makes sure the extended attribute name belongs to one of the
+ * supported extended attribute name-spaces. Returns name-space index in case
+ * of success and a negative error code in case of failure.
+ */
+static int check_namespace(const struct qstr *nm)
+{
+	int type;
+
+	if (nm->len > UBIFS_MAX_NLEN)
+		return -ENAMETOOLONG;
+
+	if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
+		     XATTR_TRUSTED_PREFIX_LEN)) {
+		if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0')
+			return -EINVAL;
+		type = TRUSTED_XATTR;
+	} else if (!strncmp(nm->name, XATTR_USER_PREFIX,
+				      XATTR_USER_PREFIX_LEN)) {
+		if (nm->name[XATTR_USER_PREFIX_LEN] == '\0')
+			return -EINVAL;
+		type = USER_XATTR;
+	} else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
+				     XATTR_SECURITY_PREFIX_LEN)) {
+		if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0')
+			return -EINVAL;
+		type = SECURITY_XATTR;
+	} else
+		return -EOPNOTSUPP;
+
+	return type;
+}
+
+static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
+{
+	struct inode *inode;
+
+	inode = ubifs_iget(c->vfs_sb, inum);
+	if (IS_ERR(inode)) {
+		ubifs_err(c, "dead extended attribute entry, error %d",
+			  (int)PTR_ERR(inode));
+		return inode;
+	}
+	if (ubifs_inode(inode)->xattr)
+		return inode;
+	ubifs_err(c, "corrupt extended attribute entry");
+	iput(inode);
+	return ERR_PTR(-EINVAL);
+}
+
+static int setxattr(struct inode *host, const char *name, const void *value,
+		    size_t size, int flags)
+{
+	struct inode *inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = QSTR_INIT(name, strlen(name));
+	struct ubifs_dent_node *xent;
+	union ubifs_key key;
+	int err, type;
+
+	ubifs_assert(mutex_is_locked(&host->i_mutex));
+
+	if (size > UBIFS_MAX_INO_DATA)
+		return -ERANGE;
+
+	type = check_namespace(&nm);
+	if (type < 0)
+		return type;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	/*
+	 * The extended attribute entries are stored in LNC, so multiple
+	 * look-ups do not involve reading the flash.
+	 */
+	xent_key_init(c, &key, host->i_ino, &nm);
+	err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (err) {
+		if (err != -ENOENT)
+			goto out_free;
+
+		if (flags & XATTR_REPLACE)
+			/* We are asked not to create the xattr */
+			err = -ENODATA;
+		else
+			err = create_xattr(c, host, &nm, value, size);
+		goto out_free;
+	}
+
+	if (flags & XATTR_CREATE) {
+		/* We are asked not to replace the xattr */
+		err = -EEXIST;
+		goto out_free;
+	}
+
+	inode = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_free;
+	}
+
+	err = change_xattr(c, host, inode, value, size);
+	iput(inode);
+
+out_free:
+	kfree(xent);
+	return err;
+}
+
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+		   const void *value, size_t size, int flags)
+{
+	dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
+		name, d_inode(dentry)->i_ino, dentry, size);
+
+	return setxattr(d_inode(dentry), name, value, size, flags);
+}
+
+ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
+		       size_t size)
+{
+	struct inode *inode, *host = d_inode(dentry);
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = QSTR_INIT(name, strlen(name));
+	struct ubifs_inode *ui;
+	struct ubifs_dent_node *xent;
+	union ubifs_key key;
+	int err;
+
+	dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name,
+		host->i_ino, dentry, size);
+
+	err = check_namespace(&nm);
+	if (err < 0)
+		return err;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	xent_key_init(c, &key, host->i_ino, &nm);
+	err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (err) {
+		if (err == -ENOENT)
+			err = -ENODATA;
+		goto out_unlock;
+	}
+
+	inode = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_unlock;
+	}
+
+	ui = ubifs_inode(inode);
+	ubifs_assert(inode->i_size == ui->data_len);
+	ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
+
+	if (buf) {
+		/* If @buf is %NULL we are supposed to return the length */
+		if (ui->data_len > size) {
+			ubifs_err(c, "buffer size %zd, xattr len %d",
+				  size, ui->data_len);
+			err = -ERANGE;
+			goto out_iput;
+		}
+
+		memcpy(buf, ui->data, ui->data_len);
+	}
+	err = ui->data_len;
+
+out_iput:
+	iput(inode);
+out_unlock:
+	kfree(xent);
+	return err;
+}
+
+ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	union ubifs_key key;
+	struct inode *host = d_inode(dentry);
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_dent_node *xent, *pxent = NULL;
+	int err, len, written = 0;
+	struct qstr nm = { .name = NULL };
+
+	dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino,
+		dentry, size);
+
+	len = host_ui->xattr_names + host_ui->xattr_cnt;
+	if (!buffer)
+		/*
+		 * We should return the minimum buffer size which will fit a
+		 * null-terminated list of all the extended attribute names.
+		 */
+		return len;
+
+	if (len > size)
+		return -ERANGE;
+
+	lowest_xent_key(c, &key, host->i_ino);
+	while (1) {
+		int type;
+
+		xent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(xent)) {
+			err = PTR_ERR(xent);
+			break;
+		}
+
+		nm.name = xent->name;
+		nm.len = le16_to_cpu(xent->nlen);
+
+		type = check_namespace(&nm);
+		if (unlikely(type < 0)) {
+			err = type;
+			break;
+		}
+
+		/* Show trusted namespace only for "power" users */
+		if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) {
+			memcpy(buffer + written, nm.name, nm.len + 1);
+			written += nm.len + 1;
+		}
+
+		kfree(pxent);
+		pxent = xent;
+		key_read(c, &xent->key, &key);
+	}
+
+	kfree(pxent);
+	if (err != -ENOENT) {
+		ubifs_err(c, "cannot find next direntry, error %d", err);
+		return err;
+	}
+
+	ubifs_assert(written <= size);
+	return written;
+}
+
+static int remove_xattr(struct ubifs_info *c, struct inode *host,
+			struct inode *inode, const struct qstr *nm)
+{
+	int err;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1,
+				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
+
+	ubifs_assert(ui->data_len == inode->i_size);
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_cnt -= 1;
+	host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+	host_ui->xattr_names -= nm->len;
+
+	err = ubifs_jnl_delete_xattr(c, host, inode, nm);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	return 0;
+
+out_cancel:
+	host_ui->xattr_cnt += 1;
+	host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
+	mutex_unlock(&host_ui->ui_mutex);
+	ubifs_release_budget(c, &req);
+	make_bad_inode(inode);
+	return err;
+}
+
+int ubifs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode, *host = d_inode(dentry);
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = QSTR_INIT(name, strlen(name));
+	struct ubifs_dent_node *xent;
+	union ubifs_key key;
+	int err;
+
+	dbg_gen("xattr '%s', ino %lu ('%pd')", name,
+		host->i_ino, dentry);
+	ubifs_assert(mutex_is_locked(&host->i_mutex));
+
+	err = check_namespace(&nm);
+	if (err < 0)
+		return err;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	xent_key_init(c, &key, host->i_ino, &nm);
+	err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (err) {
+		if (err == -ENOENT)
+			err = -ENODATA;
+		goto out_free;
+	}
+
+	inode = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_free;
+	}
+
+	ubifs_assert(inode->i_nlink == 1);
+	clear_nlink(inode);
+	err = remove_xattr(c, host, inode, &nm);
+	if (err)
+		set_nlink(inode, 1);
+
+	/* If @i_nlink is 0, 'iput()' will delete the inode */
+	iput(inode);
+
+out_free:
+	kfree(xent);
+	return err;
+}
+
+static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
+				 const char *name, size_t name_len, int flags)
+{
+	const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+
+	return total_len;
+}
+
+static int security_getxattr(struct dentry *d, const char *name, void *buffer,
+		      size_t size, int flags)
+{
+	return ubifs_getxattr(d, name, buffer, size);
+}
+
+static int security_setxattr(struct dentry *d, const char *name,
+			     const void *value, size_t size, int flags,
+			     int handler_flags)
+{
+	return ubifs_setxattr(d, name, value, size, flags);
+}
+
+static const struct xattr_handler ubifs_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list   = security_listxattr,
+	.get    = security_getxattr,
+	.set    = security_setxattr,
+};
+
+const struct xattr_handler *ubifs_xattr_handlers[] = {
+	&ubifs_xattr_security_handler,
+	NULL,
+};
+
+static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
+		      void *fs_info)
+{
+	const struct xattr *xattr;
+	char *name;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+			       strlen(xattr->name) + 1, GFP_NOFS);
+		if (!name) {
+			err = -ENOMEM;
+			break;
+		}
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+		err = setxattr(inode, name, xattr->value, xattr->value_len, 0);
+		kfree(name);
+		if (err < 0)
+			break;
+	}
+
+	return err;
+}
+
+int ubifs_init_security(struct inode *dentry, struct inode *inode,
+			const struct qstr *qstr)
+{
+	int err;
+
+	mutex_lock(&inode->i_mutex);
+	err = security_inode_init_security(inode, dentry, qstr,
+					   &init_xattrs, 0);
+	mutex_unlock(&inode->i_mutex);
+
+	if (err) {
+		struct ubifs_info *c = dentry->i_sb->s_fs_info;
+		ubifs_err(c, "cannot initialize security for inode %lu, error %d",
+			  inode->i_ino, err);
+	}
+	return err;
+}
diff --git a/kernel/fs/udf/Kconfig b/kernel/fs/udf/Kconfig
new file mode 100644
index 000000000..c6e17a744
--- /dev/null
+++ b/kernel/fs/udf/Kconfig
@@ -0,0 +1,20 @@
+config UDF_FS
+	tristate "UDF file system support"
+	select CRC_ITU_T
+	help
+	  This is a file system used on some CD-ROMs and DVDs. Since the
+	  file system is supported by multiple operating systems and is more
+	  compatible with standard unix file systems, it is also suitable for
+	  removable USB disks. Say Y if you intend to mount DVD discs or CDRW's
+	  written in packet mode, or if you want to use UDF for removable USB
+	  disks. Please read <file:Documentation/filesystems/udf.txt>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called udf.
+
+	  If unsure, say N.
+
+config UDF_NLS
+	bool
+	default y
+	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
diff --git a/kernel/fs/udf/Makefile b/kernel/fs/udf/Makefile
new file mode 100644
index 000000000..eb880f66c
--- /dev/null
+++ b/kernel/fs/udf/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the linux udf-filesystem routines.
+#
+
+obj-$(CONFIG_UDF_FS) += udf.o
+
+udf-objs     := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
+		partition.o super.o truncate.o symlink.o \
+		directory.o misc.o udftime.o unicode.o
diff --git a/kernel/fs/udf/balloc.c b/kernel/fs/udf/balloc.c
new file mode 100644
index 000000000..6d6a96b4e
--- /dev/null
+++ b/kernel/fs/udf/balloc.c
@@ -0,0 +1,821 @@
+/*
+ * balloc.c
+ *
+ * PURPOSE
+ *	Block allocation handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1999-2001 Ben Fennema
+ *  (C) 1999 Stelias Computing Inc
+ *
+ * HISTORY
+ *
+ *  02/24/99 blf  Created.
+ *
+ */
+
+#include "udfdecl.h"
+
+#include <linux/bitops.h>
+
+#include "udf_i.h"
+#include "udf_sb.h"
+
+#define udf_clear_bit	__test_and_clear_bit_le
+#define udf_set_bit	__test_and_set_bit_le
+#define udf_test_bit	test_bit_le
+#define udf_find_next_one_bit	find_next_bit_le
+
+static int read_block_bitmap(struct super_block *sb,
+			     struct udf_bitmap *bitmap, unsigned int block,
+			     unsigned long bitmap_nr)
+{
+	struct buffer_head *bh = NULL;
+	int retval = 0;
+	struct kernel_lb_addr loc;
+
+	loc.logicalBlockNum = bitmap->s_extPosition;
+	loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
+
+	bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
+	if (!bh)
+		retval = -EIO;
+
+	bitmap->s_block_bitmap[bitmap_nr] = bh;
+	return retval;
+}
+
+static int __load_block_bitmap(struct super_block *sb,
+			       struct udf_bitmap *bitmap,
+			       unsigned int block_group)
+{
+	int retval = 0;
+	int nr_groups = bitmap->s_nr_groups;
+
+	if (block_group >= nr_groups) {
+		udf_debug("block_group (%d) > nr_groups (%d)\n",
+			  block_group, nr_groups);
+	}
+
+	if (bitmap->s_block_bitmap[block_group])
+		return block_group;
+
+	retval = read_block_bitmap(sb, bitmap, block_group, block_group);
+	if (retval < 0)
+		return retval;
+
+	return block_group;
+}
+
+static inline int load_block_bitmap(struct super_block *sb,
+				    struct udf_bitmap *bitmap,
+				    unsigned int block_group)
+{
+	int slot;
+
+	slot = __load_block_bitmap(sb, bitmap, block_group);
+
+	if (slot < 0)
+		return slot;
+
+	if (!bitmap->s_block_bitmap[slot])
+		return -EIO;
+
+	return slot;
+}
+
+static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct logicalVolIntegrityDesc *lvid;
+
+	if (!sbi->s_lvid_bh)
+		return;
+
+	lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
+	le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
+	udf_updated_lvid(sb);
+}
+
+static void udf_bitmap_free_blocks(struct super_block *sb,
+				   struct udf_bitmap *bitmap,
+				   struct kernel_lb_addr *bloc,
+				   uint32_t offset,
+				   uint32_t count)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct buffer_head *bh = NULL;
+	struct udf_part_map *partmap;
+	unsigned long block;
+	unsigned long block_group;
+	unsigned long bit;
+	unsigned long i;
+	int bitmap_nr;
+	unsigned long overflow;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+	if (bloc->logicalBlockNum + count < count ||
+	    (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
+		udf_debug("%d < %d || %d + %d > %d\n",
+			  bloc->logicalBlockNum, 0,
+			  bloc->logicalBlockNum, count,
+			  partmap->s_partition_len);
+		goto error_return;
+	}
+
+	block = bloc->logicalBlockNum + offset +
+		(sizeof(struct spaceBitmapDesc) << 3);
+
+	do {
+		overflow = 0;
+		block_group = block >> (sb->s_blocksize_bits + 3);
+		bit = block % (sb->s_blocksize << 3);
+
+		/*
+		* Check to see if we are freeing blocks across a group boundary.
+		*/
+		if (bit + count > (sb->s_blocksize << 3)) {
+			overflow = bit + count - (sb->s_blocksize << 3);
+			count -= overflow;
+		}
+		bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
+		if (bitmap_nr < 0)
+			goto error_return;
+
+		bh = bitmap->s_block_bitmap[bitmap_nr];
+		for (i = 0; i < count; i++) {
+			if (udf_set_bit(bit + i, bh->b_data)) {
+				udf_debug("bit %ld already set\n", bit + i);
+				udf_debug("byte=%2x\n",
+					  ((char *)bh->b_data)[(bit + i) >> 3]);
+			}
+		}
+		udf_add_free_space(sb, sbi->s_partition, count);
+		mark_buffer_dirty(bh);
+		if (overflow) {
+			block += count;
+			count = overflow;
+		}
+	} while (overflow);
+
+error_return:
+	mutex_unlock(&sbi->s_alloc_mutex);
+}
+
+static int udf_bitmap_prealloc_blocks(struct super_block *sb,
+				      struct udf_bitmap *bitmap,
+				      uint16_t partition, uint32_t first_block,
+				      uint32_t block_count)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	int alloc_count = 0;
+	int bit, block, block_group, group_start;
+	int nr_groups, bitmap_nr;
+	struct buffer_head *bh;
+	__u32 part_len;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	part_len = sbi->s_partmaps[partition].s_partition_len;
+	if (first_block >= part_len)
+		goto out;
+
+	if (first_block + block_count > part_len)
+		block_count = part_len - first_block;
+
+	do {
+		nr_groups = udf_compute_nr_groups(sb, partition);
+		block = first_block + (sizeof(struct spaceBitmapDesc) << 3);
+		block_group = block >> (sb->s_blocksize_bits + 3);
+		group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);
+
+		bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
+		if (bitmap_nr < 0)
+			goto out;
+		bh = bitmap->s_block_bitmap[bitmap_nr];
+
+		bit = block % (sb->s_blocksize << 3);
+
+		while (bit < (sb->s_blocksize << 3) && block_count > 0) {
+			if (!udf_clear_bit(bit, bh->b_data))
+				goto out;
+			block_count--;
+			alloc_count++;
+			bit++;
+			block++;
+		}
+		mark_buffer_dirty(bh);
+	} while (block_count > 0);
+
+out:
+	udf_add_free_space(sb, partition, -alloc_count);
+	mutex_unlock(&sbi->s_alloc_mutex);
+	return alloc_count;
+}
+
+static int udf_bitmap_new_block(struct super_block *sb,
+				struct udf_bitmap *bitmap, uint16_t partition,
+				uint32_t goal, int *err)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	int newbit, bit = 0, block, block_group, group_start;
+	int end_goal, nr_groups, bitmap_nr, i;
+	struct buffer_head *bh = NULL;
+	char *ptr;
+	int newblock = 0;
+
+	*err = -ENOSPC;
+	mutex_lock(&sbi->s_alloc_mutex);
+
+repeat:
+	if (goal >= sbi->s_partmaps[partition].s_partition_len)
+		goal = 0;
+
+	nr_groups = bitmap->s_nr_groups;
+	block = goal + (sizeof(struct spaceBitmapDesc) << 3);
+	block_group = block >> (sb->s_blocksize_bits + 3);
+	group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);
+
+	bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
+	if (bitmap_nr < 0)
+		goto error_return;
+	bh = bitmap->s_block_bitmap[bitmap_nr];
+	ptr = memscan((char *)bh->b_data + group_start, 0xFF,
+		      sb->s_blocksize - group_start);
+
+	if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) {
+		bit = block % (sb->s_blocksize << 3);
+		if (udf_test_bit(bit, bh->b_data))
+			goto got_block;
+
+		end_goal = (bit + 63) & ~63;
+		bit = udf_find_next_one_bit(bh->b_data, end_goal, bit);
+		if (bit < end_goal)
+			goto got_block;
+
+		ptr = memscan((char *)bh->b_data + (bit >> 3), 0xFF,
+			      sb->s_blocksize - ((bit + 7) >> 3));
+		newbit = (ptr - ((char *)bh->b_data)) << 3;
+		if (newbit < sb->s_blocksize << 3) {
+			bit = newbit;
+			goto search_back;
+		}
+
+		newbit = udf_find_next_one_bit(bh->b_data,
+					       sb->s_blocksize << 3, bit);
+		if (newbit < sb->s_blocksize << 3) {
+			bit = newbit;
+			goto got_block;
+		}
+	}
+
+	for (i = 0; i < (nr_groups * 2); i++) {
+		block_group++;
+		if (block_group >= nr_groups)
+			block_group = 0;
+		group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);
+
+		bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
+		if (bitmap_nr < 0)
+			goto error_return;
+		bh = bitmap->s_block_bitmap[bitmap_nr];
+		if (i < nr_groups) {
+			ptr = memscan((char *)bh->b_data + group_start, 0xFF,
+				      sb->s_blocksize - group_start);
+			if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) {
+				bit = (ptr - ((char *)bh->b_data)) << 3;
+				break;
+			}
+		} else {
+			bit = udf_find_next_one_bit(bh->b_data,
+						    sb->s_blocksize << 3,
+						    group_start << 3);
+			if (bit < sb->s_blocksize << 3)
+				break;
+		}
+	}
+	if (i >= (nr_groups * 2)) {
+		mutex_unlock(&sbi->s_alloc_mutex);
+		return newblock;
+	}
+	if (bit < sb->s_blocksize << 3)
+		goto search_back;
+	else
+		bit = udf_find_next_one_bit(bh->b_data, sb->s_blocksize << 3,
+					    group_start << 3);
+	if (bit >= sb->s_blocksize << 3) {
+		mutex_unlock(&sbi->s_alloc_mutex);
+		return 0;
+	}
+
+search_back:
+	i = 0;
+	while (i < 7 && bit > (group_start << 3) &&
+	       udf_test_bit(bit - 1, bh->b_data)) {
+		++i;
+		--bit;
+	}
+
+got_block:
+	newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
+		(sizeof(struct spaceBitmapDesc) << 3);
+
+	if (!udf_clear_bit(bit, bh->b_data)) {
+		udf_debug("bit already cleared for block %d\n", bit);
+		goto repeat;
+	}
+
+	mark_buffer_dirty(bh);
+
+	udf_add_free_space(sb, partition, -1);
+	mutex_unlock(&sbi->s_alloc_mutex);
+	*err = 0;
+	return newblock;
+
+error_return:
+	*err = -EIO;
+	mutex_unlock(&sbi->s_alloc_mutex);
+	return 0;
+}
+
+static void udf_table_free_blocks(struct super_block *sb,
+				  struct inode *table,
+				  struct kernel_lb_addr *bloc,
+				  uint32_t offset,
+				  uint32_t count)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *partmap;
+	uint32_t start, end;
+	uint32_t elen;
+	struct kernel_lb_addr eloc;
+	struct extent_position oepos, epos;
+	int8_t etype;
+	struct udf_inode_info *iinfo;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+	if (bloc->logicalBlockNum + count < count ||
+	    (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
+		udf_debug("%d < %d || %d + %d > %d\n",
+			  bloc->logicalBlockNum, 0,
+			  bloc->logicalBlockNum, count,
+			  partmap->s_partition_len);
+		goto error_return;
+	}
+
+	iinfo = UDF_I(table);
+	udf_add_free_space(sb, sbi->s_partition, count);
+
+	start = bloc->logicalBlockNum + offset;
+	end = bloc->logicalBlockNum + offset + count - 1;
+
+	epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
+	elen = 0;
+	epos.block = oepos.block = iinfo->i_location;
+	epos.bh = oepos.bh = NULL;
+
+	while (count &&
+	       (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) {
+		if (((eloc.logicalBlockNum +
+			(elen >> sb->s_blocksize_bits)) == start)) {
+			if ((0x3FFFFFFF - elen) <
+					(count << sb->s_blocksize_bits)) {
+				uint32_t tmp = ((0x3FFFFFFF - elen) >>
+							sb->s_blocksize_bits);
+				count -= tmp;
+				start += tmp;
+				elen = (etype << 30) |
+					(0x40000000 - sb->s_blocksize);
+			} else {
+				elen = (etype << 30) |
+					(elen +
+					(count << sb->s_blocksize_bits));
+				start += count;
+				count = 0;
+			}
+			udf_write_aext(table, &oepos, &eloc, elen, 1);
+		} else if (eloc.logicalBlockNum == (end + 1)) {
+			if ((0x3FFFFFFF - elen) <
+					(count << sb->s_blocksize_bits)) {
+				uint32_t tmp = ((0x3FFFFFFF - elen) >>
+						sb->s_blocksize_bits);
+				count -= tmp;
+				end -= tmp;
+				eloc.logicalBlockNum -= tmp;
+				elen = (etype << 30) |
+					(0x40000000 - sb->s_blocksize);
+			} else {
+				eloc.logicalBlockNum = start;
+				elen = (etype << 30) |
+					(elen +
+					(count << sb->s_blocksize_bits));
+				end -= count;
+				count = 0;
+			}
+			udf_write_aext(table, &oepos, &eloc, elen, 1);
+		}
+
+		if (epos.bh != oepos.bh) {
+			oepos.block = epos.block;
+			brelse(oepos.bh);
+			get_bh(epos.bh);
+			oepos.bh = epos.bh;
+			oepos.offset = 0;
+		} else {
+			oepos.offset = epos.offset;
+		}
+	}
+
+	if (count) {
+		/*
+		 * NOTE: we CANNOT use udf_add_aext here, as it can try to
+		 * allocate a new block, and since we hold the super block
+		 * lock already very bad things would happen :)
+		 *
+		 * We copy the behavior of udf_add_aext, but instead of
+		 * trying to allocate a new block close to the existing one,
+		 * we just steal a block from the extent we are trying to add.
+		 *
+		 * It would be nice if the blocks were close together, but it
+		 * isn't required.
+		 */
+
+		int adsize;
+		struct short_ad *sad = NULL;
+		struct long_ad *lad = NULL;
+		struct allocExtDesc *aed;
+
+		eloc.logicalBlockNum = start;
+		elen = EXT_RECORDED_ALLOCATED |
+			(count << sb->s_blocksize_bits);
+
+		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+			adsize = sizeof(struct short_ad);
+		else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+			adsize = sizeof(struct long_ad);
+		else {
+			brelse(oepos.bh);
+			brelse(epos.bh);
+			goto error_return;
+		}
+
+		if (epos.offset + (2 * adsize) > sb->s_blocksize) {
+			unsigned char *sptr, *dptr;
+			int loffset;
+
+			brelse(oepos.bh);
+			oepos = epos;
+
+			/* Steal a block from the extent being free'd */
+			epos.block.logicalBlockNum = eloc.logicalBlockNum;
+			eloc.logicalBlockNum++;
+			elen -= sb->s_blocksize;
+
+			epos.bh = udf_tread(sb,
+					udf_get_lb_pblock(sb, &epos.block, 0));
+			if (!epos.bh) {
+				brelse(oepos.bh);
+				goto error_return;
+			}
+			aed = (struct allocExtDesc *)(epos.bh->b_data);
+			aed->previousAllocExtLocation =
+				cpu_to_le32(oepos.block.logicalBlockNum);
+			if (epos.offset + adsize > sb->s_blocksize) {
+				loffset = epos.offset;
+				aed->lengthAllocDescs = cpu_to_le32(adsize);
+				sptr = iinfo->i_ext.i_data + epos.offset
+								- adsize;
+				dptr = epos.bh->b_data +
+					sizeof(struct allocExtDesc);
+				memcpy(dptr, sptr, adsize);
+				epos.offset = sizeof(struct allocExtDesc) +
+						adsize;
+			} else {
+				loffset = epos.offset + adsize;
+				aed->lengthAllocDescs = cpu_to_le32(0);
+				if (oepos.bh) {
+					sptr = oepos.bh->b_data + epos.offset;
+					aed = (struct allocExtDesc *)
+						oepos.bh->b_data;
+					le32_add_cpu(&aed->lengthAllocDescs,
+							adsize);
+				} else {
+					sptr = iinfo->i_ext.i_data +
+								epos.offset;
+					iinfo->i_lenAlloc += adsize;
+					mark_inode_dirty(table);
+				}
+				epos.offset = sizeof(struct allocExtDesc);
+			}
+			if (sbi->s_udfrev >= 0x0200)
+				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
+					    3, 1, epos.block.logicalBlockNum,
+					    sizeof(struct tag));
+			else
+				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
+					    2, 1, epos.block.logicalBlockNum,
+					    sizeof(struct tag));
+
+			switch (iinfo->i_alloc_type) {
+			case ICBTAG_FLAG_AD_SHORT:
+				sad = (struct short_ad *)sptr;
+				sad->extLength = cpu_to_le32(
+					EXT_NEXT_EXTENT_ALLOCDECS |
+					sb->s_blocksize);
+				sad->extPosition =
+					cpu_to_le32(epos.block.logicalBlockNum);
+				break;
+			case ICBTAG_FLAG_AD_LONG:
+				lad = (struct long_ad *)sptr;
+				lad->extLength = cpu_to_le32(
+					EXT_NEXT_EXTENT_ALLOCDECS |
+					sb->s_blocksize);
+				lad->extLocation =
+					cpu_to_lelb(epos.block);
+				break;
+			}
+			if (oepos.bh) {
+				udf_update_tag(oepos.bh->b_data, loffset);
+				mark_buffer_dirty(oepos.bh);
+			} else {
+				mark_inode_dirty(table);
+			}
+		}
+
+		/* It's possible that stealing the block emptied the extent */
+		if (elen) {
+			udf_write_aext(table, &epos, &eloc, elen, 1);
+
+			if (!epos.bh) {
+				iinfo->i_lenAlloc += adsize;
+				mark_inode_dirty(table);
+			} else {
+				aed = (struct allocExtDesc *)epos.bh->b_data;
+				le32_add_cpu(&aed->lengthAllocDescs, adsize);
+				udf_update_tag(epos.bh->b_data, epos.offset);
+				mark_buffer_dirty(epos.bh);
+			}
+		}
+	}
+
+	brelse(epos.bh);
+	brelse(oepos.bh);
+
+error_return:
+	mutex_unlock(&sbi->s_alloc_mutex);
+	return;
+}
+
+static int udf_table_prealloc_blocks(struct super_block *sb,
+				     struct inode *table, uint16_t partition,
+				     uint32_t first_block, uint32_t block_count)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	int alloc_count = 0;
+	uint32_t elen, adsize;
+	struct kernel_lb_addr eloc;
+	struct extent_position epos;
+	int8_t etype = -1;
+	struct udf_inode_info *iinfo;
+
+	if (first_block >= sbi->s_partmaps[partition].s_partition_len)
+		return 0;
+
+	iinfo = UDF_I(table);
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		return 0;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	epos.offset = sizeof(struct unallocSpaceEntry);
+	epos.block = iinfo->i_location;
+	epos.bh = NULL;
+	eloc.logicalBlockNum = 0xFFFFFFFF;
+
+	while (first_block != eloc.logicalBlockNum &&
+	       (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) {
+		udf_debug("eloc=%d, elen=%d, first_block=%d\n",
+			  eloc.logicalBlockNum, elen, first_block);
+		; /* empty loop body */
+	}
+
+	if (first_block == eloc.logicalBlockNum) {
+		epos.offset -= adsize;
+
+		alloc_count = (elen >> sb->s_blocksize_bits);
+		if (alloc_count > block_count) {
+			alloc_count = block_count;
+			eloc.logicalBlockNum += alloc_count;
+			elen -= (alloc_count << sb->s_blocksize_bits);
+			udf_write_aext(table, &epos, &eloc,
+					(etype << 30) | elen, 1);
+		} else
+			udf_delete_aext(table, epos, eloc,
+					(etype << 30) | elen);
+	} else {
+		alloc_count = 0;
+	}
+
+	brelse(epos.bh);
+
+	if (alloc_count)
+		udf_add_free_space(sb, partition, -alloc_count);
+	mutex_unlock(&sbi->s_alloc_mutex);
+	return alloc_count;
+}
+
+static int udf_table_new_block(struct super_block *sb,
+			       struct inode *table, uint16_t partition,
+			       uint32_t goal, int *err)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
+	uint32_t newblock = 0, adsize;
+	uint32_t elen, goal_elen = 0;
+	struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
+	struct extent_position epos, goal_epos;
+	int8_t etype;
+	struct udf_inode_info *iinfo = UDF_I(table);
+
+	*err = -ENOSPC;
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		return newblock;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	if (goal >= sbi->s_partmaps[partition].s_partition_len)
+		goal = 0;
+
+	/* We search for the closest matching block to goal. If we find
+	   a exact hit, we stop. Otherwise we keep going till we run out
+	   of extents. We store the buffer_head, bloc, and extoffset
+	   of the current closest match and use that when we are done.
+	 */
+	epos.offset = sizeof(struct unallocSpaceEntry);
+	epos.block = iinfo->i_location;
+	epos.bh = goal_epos.bh = NULL;
+
+	while (spread &&
+	       (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) {
+		if (goal >= eloc.logicalBlockNum) {
+			if (goal < eloc.logicalBlockNum +
+					(elen >> sb->s_blocksize_bits))
+				nspread = 0;
+			else
+				nspread = goal - eloc.logicalBlockNum -
+					(elen >> sb->s_blocksize_bits);
+		} else {
+			nspread = eloc.logicalBlockNum - goal;
+		}
+
+		if (nspread < spread) {
+			spread = nspread;
+			if (goal_epos.bh != epos.bh) {
+				brelse(goal_epos.bh);
+				goal_epos.bh = epos.bh;
+				get_bh(goal_epos.bh);
+			}
+			goal_epos.block = epos.block;
+			goal_epos.offset = epos.offset - adsize;
+			goal_eloc = eloc;
+			goal_elen = (etype << 30) | elen;
+		}
+	}
+
+	brelse(epos.bh);
+
+	if (spread == 0xFFFFFFFF) {
+		brelse(goal_epos.bh);
+		mutex_unlock(&sbi->s_alloc_mutex);
+		return 0;
+	}
+
+	/* Only allocate blocks from the beginning of the extent.
+	   That way, we only delete (empty) extents, never have to insert an
+	   extent because of splitting */
+	/* This works, but very poorly.... */
+
+	newblock = goal_eloc.logicalBlockNum;
+	goal_eloc.logicalBlockNum++;
+	goal_elen -= sb->s_blocksize;
+
+	if (goal_elen)
+		udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
+	else
+		udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
+	brelse(goal_epos.bh);
+
+	udf_add_free_space(sb, partition, -1);
+
+	mutex_unlock(&sbi->s_alloc_mutex);
+	*err = 0;
+	return newblock;
+}
+
+void udf_free_blocks(struct super_block *sb, struct inode *inode,
+		     struct kernel_lb_addr *bloc, uint32_t offset,
+		     uint32_t count)
+{
+	uint16_t partition = bloc->partitionReferenceNum;
+	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
+		udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap,
+				       bloc, offset, count);
+	} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
+		udf_table_free_blocks(sb, map->s_uspace.s_table,
+				      bloc, offset, count);
+	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
+		udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,
+				       bloc, offset, count);
+	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
+		udf_table_free_blocks(sb, map->s_fspace.s_table,
+				      bloc, offset, count);
+	}
+
+	if (inode) {
+		inode_sub_bytes(inode,
+				((sector_t)count) << sb->s_blocksize_bits);
+	}
+}
+
+inline int udf_prealloc_blocks(struct super_block *sb,
+			       struct inode *inode,
+			       uint16_t partition, uint32_t first_block,
+			       uint32_t block_count)
+{
+	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+	int allocated;
+
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
+		allocated = udf_bitmap_prealloc_blocks(sb,
+						       map->s_uspace.s_bitmap,
+						       partition, first_block,
+						       block_count);
+	else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
+		allocated = udf_table_prealloc_blocks(sb,
+						      map->s_uspace.s_table,
+						      partition, first_block,
+						      block_count);
+	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
+		allocated = udf_bitmap_prealloc_blocks(sb,
+						       map->s_fspace.s_bitmap,
+						       partition, first_block,
+						       block_count);
+	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
+		allocated = udf_table_prealloc_blocks(sb,
+						      map->s_fspace.s_table,
+						      partition, first_block,
+						      block_count);
+	else
+		return 0;
+
+	if (inode && allocated > 0)
+		inode_add_bytes(inode, allocated << sb->s_blocksize_bits);
+	return allocated;
+}
+
+inline int udf_new_block(struct super_block *sb,
+			 struct inode *inode,
+			 uint16_t partition, uint32_t goal, int *err)
+{
+	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+	int block;
+
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
+		block = udf_bitmap_new_block(sb,
+					     map->s_uspace.s_bitmap,
+					     partition, goal, err);
+	else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
+		block = udf_table_new_block(sb,
+					    map->s_uspace.s_table,
+					    partition, goal, err);
+	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
+		block = udf_bitmap_new_block(sb,
+					     map->s_fspace.s_bitmap,
+					     partition, goal, err);
+	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
+		block = udf_table_new_block(sb,
+					    map->s_fspace.s_table,
+					    partition, goal, err);
+	else {
+		*err = -EIO;
+		return 0;
+	}
+	if (inode && block)
+		inode_add_bytes(inode, sb->s_blocksize);
+	return block;
+}
diff --git a/kernel/fs/udf/dir.c b/kernel/fs/udf/dir.c
new file mode 100644
index 000000000..541a12b57
--- /dev/null
+++ b/kernel/fs/udf/dir.c
@@ -0,0 +1,199 @@
+/*
+ * dir.c
+ *
+ * PURPOSE
+ *  Directory handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998-2004 Ben Fennema
+ *
+ * HISTORY
+ *
+ *  10/05/98 dgb  Split directory operations into its own file
+ *                Implemented directory reads via do_udf_readdir
+ *  10/06/98      Made directory operations work!
+ *  11/17/98      Rewrote directory to support ICBTAG_FLAG_AD_LONG
+ *  11/25/98 blf  Rewrote directory handling (readdir+lookup) to support reading
+ *                across blocks.
+ *  12/12/98      Split out the lookup code to namei.c. bulk of directory
+ *                code now in directory.c:udf_fileident_read.
+ */
+
+#include "udfdecl.h"
+
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "udf_i.h"
+#include "udf_sb.h"
+
+
+static int udf_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *dir = file_inode(file);
+	struct udf_inode_info *iinfo = UDF_I(dir);
+	struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
+	struct fileIdentDesc *fi = NULL;
+	struct fileIdentDesc cfi;
+	int block, iblock;
+	loff_t nf_pos;
+	int flen;
+	unsigned char *fname = NULL;
+	unsigned char *nameptr;
+	uint16_t liu;
+	uint8_t lfi;
+	loff_t size = udf_ext0_offset(dir) + dir->i_size;
+	struct buffer_head *tmp, *bha[16];
+	struct kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
+	int i, num, ret = 0;
+	struct extent_position epos = { NULL, 0, {0, 0} };
+	struct super_block *sb = dir->i_sb;
+
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
+			return 0;
+		ctx->pos = 1;
+	}
+	nf_pos = (ctx->pos - 1) << 2;
+	if (nf_pos >= size)
+		goto out;
+
+	fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	if (!fname) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (nf_pos == 0)
+		nf_pos = udf_ext0_offset(dir);
+
+	fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1);
+	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits,
+		    &epos, &eloc, &elen, &offset)
+		    != (EXT_RECORDED_ALLOCATED >> 30)) {
+			ret = -ENOENT;
+			goto out;
+		}
+		block = udf_get_lb_pblock(sb, &eloc, offset);
+		if ((++offset << sb->s_blocksize_bits) < elen) {
+			if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+				epos.offset -= sizeof(struct short_ad);
+			else if (iinfo->i_alloc_type ==
+					ICBTAG_FLAG_AD_LONG)
+				epos.offset -= sizeof(struct long_ad);
+		} else {
+			offset = 0;
+		}
+
+		if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) {
+			ret = -EIO;
+			goto out;
+		}
+
+		if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) {
+			i = 16 >> (sb->s_blocksize_bits - 9);
+			if (i + offset > (elen >> sb->s_blocksize_bits))
+				i = (elen >> sb->s_blocksize_bits) - offset;
+			for (num = 0; i > 0; i--) {
+				block = udf_get_lb_pblock(sb, &eloc, offset + i);
+				tmp = udf_tgetblk(sb, block);
+				if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
+					bha[num++] = tmp;
+				else
+					brelse(tmp);
+			}
+			if (num) {
+				ll_rw_block(READA, num, bha);
+				for (i = 0; i < num; i++)
+					brelse(bha[i]);
+			}
+		}
+	}
+
+	while (nf_pos < size) {
+		struct kernel_lb_addr tloc;
+
+		ctx->pos = (nf_pos >> 2) + 1;
+
+		fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
+					&elen, &offset);
+		if (!fi)
+			goto out;
+
+		liu = le16_to_cpu(cfi.lengthOfImpUse);
+		lfi = cfi.lengthFileIdent;
+
+		if (fibh.sbh == fibh.ebh) {
+			nameptr = fi->fileIdent + liu;
+		} else {
+			int poffset;	/* Unpaded ending offset */
+
+			poffset = fibh.soffset + sizeof(struct fileIdentDesc) + liu + lfi;
+
+			if (poffset >= lfi) {
+				nameptr = (char *)(fibh.ebh->b_data + poffset - lfi);
+			} else {
+				nameptr = fname;
+				memcpy(nameptr, fi->fileIdent + liu,
+				       lfi - poffset);
+				memcpy(nameptr + lfi - poffset,
+				       fibh.ebh->b_data, poffset);
+			}
+		}
+
+		if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
+			if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
+				continue;
+		}
+
+		if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) {
+			if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
+				continue;
+		}
+
+		if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) {
+			if (!dir_emit_dotdot(file, ctx))
+				goto out;
+			continue;
+		}
+
+		flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
+		if (!flen)
+			continue;
+
+		tloc = lelb_to_cpu(cfi.icb.extLocation);
+		iblock = udf_get_lb_pblock(sb, &tloc, 0);
+		if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
+			goto out;
+	} /* end while */
+
+	ctx->pos = (nf_pos >> 2) + 1;
+
+out:
+	if (fibh.sbh != fibh.ebh)
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+	brelse(epos.bh);
+	kfree(fname);
+
+	return ret;
+}
+
+/* readdir and lookup functions */
+const struct file_operations udf_dir_operations = {
+	.llseek			= generic_file_llseek,
+	.read			= generic_read_dir,
+	.iterate		= udf_readdir,
+	.unlocked_ioctl		= udf_ioctl,
+	.fsync			= generic_file_fsync,
+};
diff --git a/kernel/fs/udf/directory.c b/kernel/fs/udf/directory.c
new file mode 100644
index 000000000..c763fda25
--- /dev/null
+++ b/kernel/fs/udf/directory.c
@@ -0,0 +1,240 @@
+/*
+ * directory.c
+ *
+ * PURPOSE
+ *	Directory related functions
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ */
+
+#include "udfdecl.h"
+#include "udf_i.h"
+
+#include <linux/fs.h>
+#include <linux/string.h>
+
+struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
+					 struct udf_fileident_bh *fibh,
+					 struct fileIdentDesc *cfi,
+					 struct extent_position *epos,
+					 struct kernel_lb_addr *eloc, uint32_t *elen,
+					 sector_t *offset)
+{
+	struct fileIdentDesc *fi;
+	int i, num, block;
+	struct buffer_head *tmp, *bha[16];
+	struct udf_inode_info *iinfo = UDF_I(dir);
+
+	fibh->soffset = fibh->eoffset;
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+		fi = udf_get_fileident(iinfo->i_ext.i_data -
+				       (iinfo->i_efe ?
+					sizeof(struct extendedFileEntry) :
+					sizeof(struct fileEntry)),
+				       dir->i_sb->s_blocksize,
+				       &(fibh->eoffset));
+		if (!fi)
+			return NULL;
+
+		*nf_pos += fibh->eoffset - fibh->soffset;
+
+		memcpy((uint8_t *)cfi, (uint8_t *)fi,
+		       sizeof(struct fileIdentDesc));
+
+		return fi;
+	}
+
+	if (fibh->eoffset == dir->i_sb->s_blocksize) {
+		int lextoffset = epos->offset;
+		unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits;
+
+		if (udf_next_aext(dir, epos, eloc, elen, 1) !=
+		    (EXT_RECORDED_ALLOCATED >> 30))
+			return NULL;
+
+		block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
+
+		(*offset)++;
+
+		if ((*offset << blocksize_bits) >= *elen)
+			*offset = 0;
+		else
+			epos->offset = lextoffset;
+
+		brelse(fibh->sbh);
+		fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
+		if (!fibh->sbh)
+			return NULL;
+		fibh->soffset = fibh->eoffset = 0;
+
+		if (!(*offset & ((16 >> (blocksize_bits - 9)) - 1))) {
+			i = 16 >> (blocksize_bits - 9);
+			if (i + *offset > (*elen >> blocksize_bits))
+				i = (*elen >> blocksize_bits)-*offset;
+			for (num = 0; i > 0; i--) {
+				block = udf_get_lb_pblock(dir->i_sb, eloc,
+							  *offset + i);
+				tmp = udf_tgetblk(dir->i_sb, block);
+				if (tmp && !buffer_uptodate(tmp) &&
+						!buffer_locked(tmp))
+					bha[num++] = tmp;
+				else
+					brelse(tmp);
+			}
+			if (num) {
+				ll_rw_block(READA, num, bha);
+				for (i = 0; i < num; i++)
+					brelse(bha[i]);
+			}
+		}
+	} else if (fibh->sbh != fibh->ebh) {
+		brelse(fibh->sbh);
+		fibh->sbh = fibh->ebh;
+	}
+
+	fi = udf_get_fileident(fibh->sbh->b_data, dir->i_sb->s_blocksize,
+			       &(fibh->eoffset));
+
+	if (!fi)
+		return NULL;
+
+	*nf_pos += fibh->eoffset - fibh->soffset;
+
+	if (fibh->eoffset <= dir->i_sb->s_blocksize) {
+		memcpy((uint8_t *)cfi, (uint8_t *)fi,
+		       sizeof(struct fileIdentDesc));
+	} else if (fibh->eoffset > dir->i_sb->s_blocksize) {
+		int lextoffset = epos->offset;
+
+		if (udf_next_aext(dir, epos, eloc, elen, 1) !=
+		    (EXT_RECORDED_ALLOCATED >> 30))
+			return NULL;
+
+		block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
+
+		(*offset)++;
+
+		if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen)
+			*offset = 0;
+		else
+			epos->offset = lextoffset;
+
+		fibh->soffset -= dir->i_sb->s_blocksize;
+		fibh->eoffset -= dir->i_sb->s_blocksize;
+
+		fibh->ebh = udf_tread(dir->i_sb, block);
+		if (!fibh->ebh)
+			return NULL;
+
+		if (sizeof(struct fileIdentDesc) > -fibh->soffset) {
+			int fi_len;
+
+			memcpy((uint8_t *)cfi, (uint8_t *)fi, -fibh->soffset);
+			memcpy((uint8_t *)cfi - fibh->soffset,
+			       fibh->ebh->b_data,
+			       sizeof(struct fileIdentDesc) + fibh->soffset);
+
+			fi_len = (sizeof(struct fileIdentDesc) +
+				  cfi->lengthFileIdent +
+				  le16_to_cpu(cfi->lengthOfImpUse) + 3) & ~3;
+
+			*nf_pos += fi_len - (fibh->eoffset - fibh->soffset);
+			fibh->eoffset = fibh->soffset + fi_len;
+		} else {
+			memcpy((uint8_t *)cfi, (uint8_t *)fi,
+			       sizeof(struct fileIdentDesc));
+		}
+	}
+	return fi;
+}
+
+struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
+{
+	struct fileIdentDesc *fi;
+	int lengthThisIdent;
+	uint8_t *ptr;
+	int padlen;
+
+	if ((!buffer) || (!offset)) {
+		udf_debug("invalidparms, buffer=%p, offset=%p\n",
+			  buffer, offset);
+		return NULL;
+	}
+
+	ptr = buffer;
+
+	if ((*offset > 0) && (*offset < bufsize))
+		ptr += *offset;
+	fi = (struct fileIdentDesc *)ptr;
+	if (fi->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) {
+		udf_debug("0x%x != TAG_IDENT_FID\n",
+			  le16_to_cpu(fi->descTag.tagIdent));
+		udf_debug("offset: %u sizeof: %lu bufsize: %u\n",
+			  *offset, (unsigned long)sizeof(struct fileIdentDesc),
+			  bufsize);
+		return NULL;
+	}
+	if ((*offset + sizeof(struct fileIdentDesc)) > bufsize)
+		lengthThisIdent = sizeof(struct fileIdentDesc);
+	else
+		lengthThisIdent = sizeof(struct fileIdentDesc) +
+			fi->lengthFileIdent + le16_to_cpu(fi->lengthOfImpUse);
+
+	/* we need to figure padding, too! */
+	padlen = lengthThisIdent % UDF_NAME_PAD;
+	if (padlen)
+		lengthThisIdent += (UDF_NAME_PAD - padlen);
+	*offset = *offset + lengthThisIdent;
+
+	return fi;
+}
+
+struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
+			      int inc)
+{
+	struct short_ad *sa;
+
+	if ((!ptr) || (!offset)) {
+		pr_err("%s: invalidparms\n", __func__);
+		return NULL;
+	}
+
+	if ((*offset + sizeof(struct short_ad)) > maxoffset)
+		return NULL;
+	else {
+		sa = (struct short_ad *)ptr;
+		if (sa->extLength == 0)
+			return NULL;
+	}
+
+	if (inc)
+		*offset += sizeof(struct short_ad);
+	return sa;
+}
+
+struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
+{
+	struct long_ad *la;
+
+	if ((!ptr) || (!offset)) {
+		pr_err("%s: invalidparms\n", __func__);
+		return NULL;
+	}
+
+	if ((*offset + sizeof(struct long_ad)) > maxoffset)
+		return NULL;
+	else {
+		la = (struct long_ad *)ptr;
+		if (la->extLength == 0)
+			return NULL;
+	}
+
+	if (inc)
+		*offset += sizeof(struct long_ad);
+	return la;
+}
diff --git a/kernel/fs/udf/ecma_167.h b/kernel/fs/udf/ecma_167.h
new file mode 100644
index 000000000..4792b771a
--- /dev/null
+++ b/kernel/fs/udf/ecma_167.h
@@ -0,0 +1,796 @@
+/*
+ * ecma_167.h
+ *
+ * This file is based on ECMA-167 3rd edition (June 1997)
+ * http://www.ecma.ch
+ *
+ * Copyright (c) 2001-2002  Ben Fennema <bfennema@falcon.csc.calpoly.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <linux/types.h>
+
+#ifndef _ECMA_167_H
+#define _ECMA_167_H 1
+
+/* Character set specification (ECMA 167r3 1/7.2.1) */
+struct charspec {
+	uint8_t		charSetType;
+	uint8_t		charSetInfo[63];
+} __attribute__ ((packed));
+
+/* Character Set Type (ECMA 167r3 1/7.2.1.1) */
+#define CHARSPEC_TYPE_CS0		0x00	/* (1/7.2.2) */
+#define CHARSPEC_TYPE_CS1		0x01	/* (1/7.2.3) */
+#define CHARSPEC_TYPE_CS2		0x02	/* (1/7.2.4) */
+#define CHARSPEC_TYPE_CS3		0x03	/* (1/7.2.5) */
+#define CHARSPEC_TYPE_CS4		0x04	/* (1/7.2.6) */
+#define CHARSPEC_TYPE_CS5		0x05	/* (1/7.2.7) */
+#define CHARSPEC_TYPE_CS6		0x06	/* (1/7.2.8) */
+#define CHARSPEC_TYPE_CS7		0x07	/* (1/7.2.9) */
+#define CHARSPEC_TYPE_CS8		0x08	/* (1/7.2.10) */
+
+typedef uint8_t		dstring;
+
+/* Timestamp (ECMA 167r3 1/7.3) */
+struct timestamp {
+	__le16		typeAndTimezone;
+	__le16		year;
+	uint8_t		month;
+	uint8_t		day;
+	uint8_t		hour;
+	uint8_t		minute;
+	uint8_t		second;
+	uint8_t		centiseconds;
+	uint8_t		hundredsOfMicroseconds;
+	uint8_t		microseconds;
+} __attribute__ ((packed));
+
+/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
+#define TIMESTAMP_TYPE_MASK		0xF000
+#define TIMESTAMP_TYPE_CUT		0x0000
+#define TIMESTAMP_TYPE_LOCAL		0x1000
+#define TIMESTAMP_TYPE_AGREEMENT	0x2000
+#define TIMESTAMP_TIMEZONE_MASK		0x0FFF
+
+/* Entity identifier (ECMA 167r3 1/7.4) */
+struct regid {
+	uint8_t		flags;
+	uint8_t		ident[23];
+	uint8_t		identSuffix[8];
+} __attribute__ ((packed));
+
+/* Flags (ECMA 167r3 1/7.4.1) */
+#define ENTITYID_FLAGS_DIRTY		0x00
+#define ENTITYID_FLAGS_PROTECTED	0x01
+
+/* Volume Structure Descriptor (ECMA 167r3 2/9.1) */
+#define VSD_STD_ID_LEN			5
+struct volStructDesc {
+	uint8_t		structType;
+	uint8_t		stdIdent[VSD_STD_ID_LEN];
+	uint8_t		structVersion;
+	uint8_t		structData[2041];
+} __attribute__ ((packed));
+
+/* Standard Identifier (EMCA 167r2 2/9.1.2) */
+#define VSD_STD_ID_NSR02		"NSR02"	/* (3/9.1) */
+
+/* Standard Identifier (ECMA 167r3 2/9.1.2) */
+#define VSD_STD_ID_BEA01		"BEA01"	/* (2/9.2) */
+#define VSD_STD_ID_BOOT2		"BOOT2"	/* (2/9.4) */
+#define VSD_STD_ID_CD001		"CD001"	/* (ECMA-119) */
+#define VSD_STD_ID_CDW02		"CDW02"	/* (ECMA-168) */
+#define VSD_STD_ID_NSR03		"NSR03"	/* (3/9.1) */
+#define VSD_STD_ID_TEA01		"TEA01"	/* (2/9.3) */
+
+/* Beginning Extended Area Descriptor (ECMA 167r3 2/9.2) */
+struct beginningExtendedAreaDesc {
+	uint8_t		structType;
+	uint8_t		stdIdent[VSD_STD_ID_LEN];
+	uint8_t		structVersion;
+	uint8_t		structData[2041];
+} __attribute__ ((packed));
+
+/* Terminating Extended Area Descriptor (ECMA 167r3 2/9.3) */
+struct terminatingExtendedAreaDesc {
+	uint8_t		structType;
+	uint8_t		stdIdent[VSD_STD_ID_LEN];
+	uint8_t		structVersion;
+	uint8_t		structData[2041];
+} __attribute__ ((packed));
+
+/* Boot Descriptor (ECMA 167r3 2/9.4) */
+struct bootDesc {
+	uint8_t			structType;
+	uint8_t			stdIdent[VSD_STD_ID_LEN];
+	uint8_t			structVersion;
+	uint8_t			reserved1;
+	struct regid		archType;
+	struct regid		bootIdent;
+	__le32			bootExtLocation;
+	__le32			bootExtLength;
+	__le64			loadAddress;
+	__le64			startAddress;
+	struct timestamp	descCreationDateAndTime;
+	__le16			flags;
+	uint8_t			reserved2[32];
+	uint8_t			bootUse[1906];
+} __attribute__ ((packed));
+
+/* Flags (ECMA 167r3 2/9.4.12) */
+#define BOOT_FLAGS_ERASE		0x01
+
+/* Extent Descriptor (ECMA 167r3 3/7.1) */
+struct extent_ad {
+	__le32		extLength;
+	__le32		extLocation;
+} __attribute__ ((packed));
+
+struct kernel_extent_ad {
+	uint32_t	extLength;
+	uint32_t	extLocation;
+};
+
+/* Descriptor Tag (ECMA 167r3 3/7.2) */
+struct tag {
+	__le16		tagIdent;
+	__le16		descVersion;
+	uint8_t		tagChecksum;
+	uint8_t		reserved;
+	__le16		tagSerialNum;
+	__le16		descCRC;
+	__le16		descCRCLength;
+	__le32		tagLocation;
+} __attribute__ ((packed));
+
+/* Tag Identifier (ECMA 167r3 3/7.2.1) */
+#define TAG_IDENT_PVD			0x0001
+#define TAG_IDENT_AVDP			0x0002
+#define TAG_IDENT_VDP			0x0003
+#define TAG_IDENT_IUVD			0x0004
+#define TAG_IDENT_PD			0x0005
+#define TAG_IDENT_LVD			0x0006
+#define TAG_IDENT_USD			0x0007
+#define TAG_IDENT_TD			0x0008
+#define TAG_IDENT_LVID			0x0009
+
+/* NSR Descriptor (ECMA 167r3 3/9.1) */
+struct NSRDesc {
+	uint8_t		structType;
+	uint8_t		stdIdent[VSD_STD_ID_LEN];
+	uint8_t		structVersion;
+	uint8_t		reserved;
+	uint8_t		structData[2040];
+} __attribute__ ((packed));
+
+/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
+struct primaryVolDesc {
+	struct tag		descTag;
+	__le32			volDescSeqNum;
+	__le32			primaryVolDescNum;
+	dstring			volIdent[32];
+	__le16			volSeqNum;
+	__le16			maxVolSeqNum;
+	__le16			interchangeLvl;
+	__le16			maxInterchangeLvl;
+	__le32			charSetList;
+	__le32			maxCharSetList;
+	dstring			volSetIdent[128];
+	struct charspec		descCharSet;
+	struct charspec		explanatoryCharSet;
+	struct extent_ad	volAbstract;
+	struct extent_ad	volCopyright;
+	struct regid		appIdent;
+	struct timestamp	recordingDateAndTime;
+	struct regid		impIdent;
+	uint8_t			impUse[64];
+	__le32			predecessorVolDescSeqLocation;
+	__le16			flags;
+	uint8_t			reserved[22];
+} __attribute__ ((packed));
+
+/* Flags (ECMA 167r3 3/10.1.21) */
+#define PVD_FLAGS_VSID_COMMON		0x0001
+
+/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
+struct anchorVolDescPtr {
+	struct tag		descTag;
+	struct extent_ad	mainVolDescSeqExt;
+	struct extent_ad	reserveVolDescSeqExt;
+	uint8_t	 		reserved[480];
+} __attribute__ ((packed));
+
+/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
+struct volDescPtr {
+	struct tag		descTag;
+	__le32			volDescSeqNum;
+	struct extent_ad	nextVolDescSeqExt;
+	uint8_t			reserved[484];
+} __attribute__ ((packed));
+
+/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
+struct impUseVolDesc {
+	struct tag	descTag;
+	__le32		volDescSeqNum;
+	struct regid	impIdent;
+	uint8_t		impUse[460];
+} __attribute__ ((packed));
+
+/* Partition Descriptor (ECMA 167r3 3/10.5) */
+struct partitionDesc {
+	struct tag descTag;
+	__le32 volDescSeqNum;
+	__le16 partitionFlags;
+	__le16 partitionNumber;
+	struct regid partitionContents;
+	uint8_t partitionContentsUse[128];
+	__le32 accessType;
+	__le32 partitionStartingLocation;
+	__le32 partitionLength;
+	struct regid impIdent;
+	uint8_t impUse[128];
+	uint8_t reserved[156];
+} __attribute__ ((packed));
+
+/* Partition Flags (ECMA 167r3 3/10.5.3) */
+#define PD_PARTITION_FLAGS_ALLOC	0x0001
+
+/* Partition Contents (ECMA 167r2 3/10.5.3) */
+#define PD_PARTITION_CONTENTS_NSR02	"+NSR02"
+
+/* Partition Contents (ECMA 167r3 3/10.5.5) */
+#define PD_PARTITION_CONTENTS_FDC01	"+FDC01"
+#define PD_PARTITION_CONTENTS_CD001	"+CD001"
+#define PD_PARTITION_CONTENTS_CDW02	"+CDW02"
+#define PD_PARTITION_CONTENTS_NSR03	"+NSR03"
+
+/* Access Type (ECMA 167r3 3/10.5.7) */
+#define PD_ACCESS_TYPE_NONE		0x00000000
+#define PD_ACCESS_TYPE_READ_ONLY	0x00000001
+#define PD_ACCESS_TYPE_WRITE_ONCE	0x00000002
+#define PD_ACCESS_TYPE_REWRITABLE	0x00000003
+#define PD_ACCESS_TYPE_OVERWRITABLE	0x00000004
+
+/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
+struct logicalVolDesc {
+	struct tag		descTag;
+	__le32			volDescSeqNum;
+	struct charspec		descCharSet;
+	dstring			logicalVolIdent[128];
+	__le32			logicalBlockSize;
+	struct regid		domainIdent;
+	uint8_t			logicalVolContentsUse[16];
+	__le32			mapTableLength;
+	__le32			numPartitionMaps;
+	struct regid		impIdent;
+	uint8_t			impUse[128];
+	struct extent_ad	integritySeqExt;
+	uint8_t			partitionMaps[0];
+} __attribute__ ((packed));
+
+/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
+struct genericPartitionMap {
+	uint8_t		partitionMapType;
+	uint8_t		partitionMapLength;
+	uint8_t		partitionMapping[0];
+} __attribute__ ((packed));
+
+/* Partition Map Type (ECMA 167r3 3/10.7.1.1) */
+#define GP_PARTITION_MAP_TYPE_UNDEF	0x00
+#define GP_PARTIITON_MAP_TYPE_1		0x01
+#define GP_PARTITION_MAP_TYPE_2		0x02
+
+/* Type 1 Partition Map (ECMA 167r3 3/10.7.2) */
+struct genericPartitionMap1 {
+	uint8_t		partitionMapType;
+	uint8_t		partitionMapLength;
+	__le16		volSeqNum;
+	__le16		partitionNum;
+} __attribute__ ((packed));
+
+/* Type 2 Partition Map (ECMA 167r3 3/10.7.3) */
+struct genericPartitionMap2 {
+	uint8_t		partitionMapType;
+	uint8_t		partitionMapLength;
+	uint8_t		partitionIdent[62];
+} __attribute__ ((packed));
+
+/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
+struct unallocSpaceDesc {
+	struct tag		descTag;
+	__le32			volDescSeqNum;
+	__le32			numAllocDescs;
+	struct extent_ad	allocDescs[0];
+} __attribute__ ((packed));
+
+/* Terminating Descriptor (ECMA 167r3 3/10.9) */
+struct terminatingDesc {
+	struct tag	descTag;
+	uint8_t		reserved[496];
+} __attribute__ ((packed));
+
+/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
+struct logicalVolIntegrityDesc {
+	struct tag		descTag;
+	struct timestamp	recordingDateAndTime;
+	__le32			integrityType;
+	struct extent_ad	nextIntegrityExt;
+	uint8_t			logicalVolContentsUse[32];
+	__le32			numOfPartitions;
+	__le32			lengthOfImpUse;
+	__le32			freeSpaceTable[0];
+	__le32			sizeTable[0];
+	uint8_t			impUse[0];
+} __attribute__ ((packed));
+
+/* Integrity Type (ECMA 167r3 3/10.10.3) */
+#define LVID_INTEGRITY_TYPE_OPEN	0x00000000
+#define LVID_INTEGRITY_TYPE_CLOSE	0x00000001
+
+/* Recorded Address (ECMA 167r3 4/7.1) */
+struct lb_addr {
+	__le32		logicalBlockNum;
+	__le16	 	partitionReferenceNum;
+} __attribute__ ((packed));
+
+/* ... and its in-core analog */
+struct kernel_lb_addr {
+	uint32_t		logicalBlockNum;
+	uint16_t	 	partitionReferenceNum;
+};
+
+/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
+struct short_ad {
+        __le32		extLength;
+        __le32		extPosition;
+} __attribute__ ((packed));
+
+/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
+struct long_ad {
+	__le32		extLength;
+	struct lb_addr	extLocation;
+	uint8_t		impUse[6];
+} __attribute__ ((packed));
+
+struct kernel_long_ad {
+	uint32_t		extLength;
+	struct kernel_lb_addr	extLocation;
+	uint8_t			impUse[6];
+};
+
+/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
+struct ext_ad {
+	__le32		extLength;
+	__le32		recordedLength;
+	__le32		informationLength;
+	struct lb_addr	extLocation;
+} __attribute__ ((packed));
+
+struct kernel_ext_ad {
+	uint32_t		extLength;
+	uint32_t		recordedLength;
+	uint32_t		informationLength;
+	struct kernel_lb_addr	extLocation;
+};
+
+/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
+
+/* Tag Identifier (ECMA 167r3 4/7.2.1) */
+#define TAG_IDENT_FSD			0x0100
+#define TAG_IDENT_FID			0x0101
+#define TAG_IDENT_AED			0x0102
+#define TAG_IDENT_IE			0x0103
+#define TAG_IDENT_TE			0x0104
+#define TAG_IDENT_FE			0x0105
+#define TAG_IDENT_EAHD			0x0106
+#define TAG_IDENT_USE			0x0107
+#define TAG_IDENT_SBD			0x0108
+#define TAG_IDENT_PIE			0x0109
+#define TAG_IDENT_EFE			0x010A
+
+/* File Set Descriptor (ECMA 167r3 4/14.1) */
+struct fileSetDesc {
+	struct tag		descTag;
+	struct timestamp	recordingDateAndTime;
+	__le16			interchangeLvl;
+	__le16			maxInterchangeLvl;
+	__le32			charSetList;
+	__le32			maxCharSetList;
+	__le32			fileSetNum;
+	__le32			fileSetDescNum;
+	struct charspec		logicalVolIdentCharSet;
+	dstring			logicalVolIdent[128];
+	struct charspec		fileSetCharSet;
+	dstring			fileSetIdent[32];
+	dstring			copyrightFileIdent[32];
+	dstring			abstractFileIdent[32];
+	struct long_ad		rootDirectoryICB;
+	struct regid		domainIdent;
+	struct long_ad		nextExt;
+	struct long_ad		streamDirectoryICB;
+	uint8_t			reserved[32];
+} __attribute__ ((packed));
+
+/* Partition Header Descriptor (ECMA 167r3 4/14.3) */
+struct partitionHeaderDesc {
+	struct short_ad	unallocSpaceTable;
+	struct short_ad	unallocSpaceBitmap;
+	struct short_ad	partitionIntegrityTable;
+	struct short_ad	freedSpaceTable;
+	struct short_ad	freedSpaceBitmap;
+	uint8_t		reserved[88];
+} __attribute__ ((packed));
+
+/* File Identifier Descriptor (ECMA 167r3 4/14.4) */
+struct fileIdentDesc {
+	struct tag	descTag;
+	__le16		fileVersionNum;
+	uint8_t		fileCharacteristics;
+	uint8_t		lengthFileIdent;
+	struct long_ad	icb;
+	__le16		lengthOfImpUse;
+	uint8_t		impUse[0];
+	uint8_t		fileIdent[0];
+	uint8_t		padding[0];
+} __attribute__ ((packed));
+
+/* File Characteristics (ECMA 167r3 4/14.4.3) */
+#define FID_FILE_CHAR_HIDDEN		0x01
+#define FID_FILE_CHAR_DIRECTORY		0x02
+#define FID_FILE_CHAR_DELETED		0x04
+#define FID_FILE_CHAR_PARENT		0x08
+#define FID_FILE_CHAR_METADATA		0x10
+
+/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
+struct allocExtDesc {
+	struct tag	descTag;
+	__le32		previousAllocExtLocation;
+	__le32		lengthAllocDescs;
+} __attribute__ ((packed));
+
+/* ICB Tag (ECMA 167r3 4/14.6) */
+struct icbtag {
+	__le32		priorRecordedNumDirectEntries;
+	__le16		strategyType;
+	__le16		strategyParameter;
+	__le16		numEntries;
+	uint8_t		reserved;
+	uint8_t		fileType;
+	struct lb_addr	parentICBLocation;
+	__le16		flags;
+} __attribute__ ((packed));
+
+/* Strategy Type (ECMA 167r3 4/14.6.2) */
+#define ICBTAG_STRATEGY_TYPE_UNDEF	0x0000
+#define ICBTAG_STRATEGY_TYPE_1		0x0001
+#define ICBTAG_STRATEGY_TYPE_2		0x0002
+#define ICBTAG_STRATEGY_TYPE_3		0x0003
+#define ICBTAG_STRATEGY_TYPE_4		0x0004
+
+/* File Type (ECMA 167r3 4/14.6.6) */
+#define ICBTAG_FILE_TYPE_UNDEF		0x00
+#define ICBTAG_FILE_TYPE_USE		0x01
+#define ICBTAG_FILE_TYPE_PIE		0x02
+#define ICBTAG_FILE_TYPE_IE		0x03
+#define ICBTAG_FILE_TYPE_DIRECTORY	0x04
+#define ICBTAG_FILE_TYPE_REGULAR	0x05
+#define ICBTAG_FILE_TYPE_BLOCK		0x06
+#define ICBTAG_FILE_TYPE_CHAR		0x07
+#define ICBTAG_FILE_TYPE_EA		0x08
+#define ICBTAG_FILE_TYPE_FIFO		0x09
+#define ICBTAG_FILE_TYPE_SOCKET		0x0A
+#define ICBTAG_FILE_TYPE_TE		0x0B
+#define ICBTAG_FILE_TYPE_SYMLINK	0x0C
+#define ICBTAG_FILE_TYPE_STREAMDIR	0x0D
+
+/* Flags (ECMA 167r3 4/14.6.8) */
+#define ICBTAG_FLAG_AD_MASK		0x0007
+#define ICBTAG_FLAG_AD_SHORT		0x0000
+#define ICBTAG_FLAG_AD_LONG		0x0001
+#define ICBTAG_FLAG_AD_EXTENDED		0x0002
+#define ICBTAG_FLAG_AD_IN_ICB		0x0003
+#define ICBTAG_FLAG_SORTED		0x0008
+#define ICBTAG_FLAG_NONRELOCATABLE	0x0010
+#define ICBTAG_FLAG_ARCHIVE		0x0020
+#define ICBTAG_FLAG_SETUID		0x0040
+#define ICBTAG_FLAG_SETGID		0x0080
+#define ICBTAG_FLAG_STICKY		0x0100
+#define ICBTAG_FLAG_CONTIGUOUS		0x0200
+#define ICBTAG_FLAG_SYSTEM		0x0400
+#define ICBTAG_FLAG_TRANSFORMED		0x0800
+#define ICBTAG_FLAG_MULTIVERSIONS	0x1000
+#define ICBTAG_FLAG_STREAM		0x2000
+
+/* Indirect Entry (ECMA 167r3 4/14.7) */
+struct indirectEntry {
+	struct tag	descTag;
+	struct icbtag	icbTag;
+	struct long_ad	indirectICB;
+} __attribute__ ((packed));
+
+/* Terminal Entry (ECMA 167r3 4/14.8) */
+struct terminalEntry {
+	struct tag	descTag;
+	struct icbtag	icbTag;
+} __attribute__ ((packed));
+
+/* File Entry (ECMA 167r3 4/14.9) */
+struct fileEntry {
+	struct tag		descTag;
+	struct icbtag		icbTag;
+	__le32			uid;
+	__le32			gid;
+	__le32			permissions;
+	__le16			fileLinkCount;
+	uint8_t			recordFormat;
+	uint8_t			recordDisplayAttr;
+	__le32			recordLength;
+	__le64			informationLength;
+	__le64			logicalBlocksRecorded;
+	struct timestamp	accessTime;
+	struct timestamp	modificationTime;
+	struct timestamp	attrTime;
+	__le32			checkpoint;
+	struct long_ad		extendedAttrICB;
+	struct regid		impIdent;
+	__le64			uniqueID;
+	__le32			lengthExtendedAttr;
+	__le32			lengthAllocDescs;
+	uint8_t			extendedAttr[0];
+	uint8_t			allocDescs[0];
+} __attribute__ ((packed));
+
+/* Permissions (ECMA 167r3 4/14.9.5) */
+#define FE_PERM_O_EXEC			0x00000001U
+#define FE_PERM_O_WRITE			0x00000002U
+#define FE_PERM_O_READ			0x00000004U
+#define FE_PERM_O_CHATTR		0x00000008U
+#define FE_PERM_O_DELETE		0x00000010U
+#define FE_PERM_G_EXEC			0x00000020U
+#define FE_PERM_G_WRITE			0x00000040U
+#define FE_PERM_G_READ			0x00000080U
+#define FE_PERM_G_CHATTR		0x00000100U
+#define FE_PERM_G_DELETE		0x00000200U
+#define FE_PERM_U_EXEC			0x00000400U
+#define FE_PERM_U_WRITE			0x00000800U
+#define FE_PERM_U_READ			0x00001000U
+#define FE_PERM_U_CHATTR		0x00002000U
+#define FE_PERM_U_DELETE		0x00004000U
+
+/* Record Format (ECMA 167r3 4/14.9.7) */
+#define FE_RECORD_FMT_UNDEF		0x00
+#define FE_RECORD_FMT_FIXED_PAD		0x01
+#define FE_RECORD_FMT_FIXED		0x02
+#define FE_RECORD_FMT_VARIABLE8		0x03
+#define FE_RECORD_FMT_VARIABLE16	0x04
+#define FE_RECORD_FMT_VARIABLE16_MSB	0x05
+#define FE_RECORD_FMT_VARIABLE32	0x06
+#define FE_RECORD_FMT_PRINT		0x07
+#define FE_RECORD_FMT_LF		0x08
+#define FE_RECORD_FMT_CR		0x09
+#define FE_RECORD_FMT_CRLF		0x0A
+#define FE_RECORD_FMT_LFCR		0x0B
+
+/* Record Display Attributes (ECMA 167r3 4/14.9.8) */
+#define FE_RECORD_DISPLAY_ATTR_UNDEF	0x00
+#define FE_RECORD_DISPLAY_ATTR_1	0x01
+#define FE_RECORD_DISPLAY_ATTR_2	0x02
+#define FE_RECORD_DISPLAY_ATTR_3	0x03
+
+/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
+struct extendedAttrHeaderDesc {
+	struct tag	descTag;
+	__le32		impAttrLocation;
+	__le32		appAttrLocation;
+} __attribute__ ((packed));
+
+/* Generic Format (ECMA 167r3 4/14.10.2) */
+struct genericFormat {
+	__le32		attrType;
+	uint8_t		attrSubtype;
+	uint8_t		reserved[3];
+	__le32		attrLength;
+	uint8_t		attrData[0];
+} __attribute__ ((packed));
+
+/* Character Set Information (ECMA 167r3 4/14.10.3) */
+struct charSetInfo {
+	__le32		attrType;
+	uint8_t		attrSubtype;
+	uint8_t		reserved[3];
+	__le32		attrLength;
+	__le32		escapeSeqLength;
+	uint8_t		charSetType;
+	uint8_t		escapeSeq[0];
+} __attribute__ ((packed));
+
+/* Alternate Permissions (ECMA 167r3 4/14.10.4) */
+struct altPerms {
+	__le32		attrType;
+	uint8_t		attrSubtype;
+	uint8_t		reserved[3];
+	__le32		attrLength;
+	__le16		ownerIdent;
+	__le16		groupIdent;
+	__le16		permission;
+} __attribute__ ((packed));
+
+/* File Times Extended Attribute (ECMA 167r3 4/14.10.5) */
+struct fileTimesExtAttr {
+	__le32		attrType;
+	uint8_t		attrSubtype;
+	uint8_t		reserved[3];
+	__le32		attrLength;
+	__le32		dataLength;
+	__le32		fileTimeExistence;
+	uint8_t		fileTimes;
+} __attribute__ ((packed));
+
+/* FileTimeExistence (ECMA 167r3 4/14.10.5.6) */
+#define FTE_CREATION			0x00000001
+#define FTE_DELETION			0x00000004
+#define FTE_EFFECTIVE			0x00000008
+#define FTE_BACKUP			0x00000002
+
+/* Information Times Extended Attribute (ECMA 167r3 4/14.10.6) */
+struct infoTimesExtAttr {
+	__le32		attrType;
+	uint8_t		attrSubtype;
+	uint8_t		reserved[3];
+	__le32		attrLength;
+	__le32		dataLength;
+	__le32		infoTimeExistence;
+	uint8_t		infoTimes[0];
+} __attribute__ ((packed));
+
+/* Device Specification (ECMA 167r3 4/14.10.7) */
+struct deviceSpec {
+	__le32		attrType;
+	uint8_t		attrSubtype;
+	uint8_t		reserved[3];
+	__le32		attrLength;
+	__le32		impUseLength;
+	__le32		majorDeviceIdent;
+	__le32		minorDeviceIdent;
+	uint8_t		impUse[0];
+} __attribute__ ((packed));
+
+/* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */
+struct impUseExtAttr {
+	__le32		attrType;
+	uint8_t		attrSubtype;
+	uint8_t		reserved[3];
+	__le32		attrLength;
+	__le32		impUseLength;
+	struct regid	impIdent;
+	uint8_t		impUse[0];
+} __attribute__ ((packed));
+
+/* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */
+struct appUseExtAttr {
+	__le32		attrType;
+	uint8_t		attrSubtype;
+	uint8_t		reserved[3];
+	__le32		attrLength;
+	__le32		appUseLength;
+	struct regid	appIdent;
+	uint8_t		appUse[0];
+} __attribute__ ((packed));
+
+#define EXTATTR_CHAR_SET		1
+#define EXTATTR_ALT_PERMS		3
+#define EXTATTR_FILE_TIMES		5
+#define EXTATTR_INFO_TIMES		6
+#define EXTATTR_DEV_SPEC		12
+#define EXTATTR_IMP_USE			2048
+#define EXTATTR_APP_USE			65536
+
+/* Unallocated Space Entry (ECMA 167r3 4/14.11) */
+struct unallocSpaceEntry {
+	struct tag	descTag;
+	struct icbtag	icbTag;
+	__le32		lengthAllocDescs;
+	uint8_t		allocDescs[0];
+} __attribute__ ((packed));
+
+/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
+struct spaceBitmapDesc {
+	struct tag	descTag;
+	__le32		numOfBits;
+	__le32		numOfBytes;
+	uint8_t		bitmap[0];
+} __attribute__ ((packed));
+
+/* Partition Integrity Entry (ECMA 167r3 4/14.13) */
+struct partitionIntegrityEntry {
+	struct tag		descTag;
+	struct icbtag		icbTag;
+	struct timestamp	recordingDateAndTime;
+	uint8_t			integrityType;
+	uint8_t			reserved[175];
+	struct regid		impIdent;
+	uint8_t			impUse[256];
+} __attribute__ ((packed));
+
+/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
+
+/* Extent Length (ECMA 167r3 4/14.14.1.1) */
+#define EXT_RECORDED_ALLOCATED		0x00000000
+#define EXT_NOT_RECORDED_ALLOCATED	0x40000000
+#define EXT_NOT_RECORDED_NOT_ALLOCATED	0x80000000
+#define EXT_NEXT_EXTENT_ALLOCDECS	0xC0000000
+
+/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
+
+/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
+
+/* Logical Volume Header Descriptor (ECMA 167r3 4/14.15) */
+struct logicalVolHeaderDesc {
+	__le64		uniqueID;
+	uint8_t		reserved[24];
+} __attribute__ ((packed));
+
+/* Path Component (ECMA 167r3 4/14.16.1) */
+struct pathComponent {
+	uint8_t		componentType;
+	uint8_t		lengthComponentIdent;
+	__le16		componentFileVersionNum;
+	dstring		componentIdent[0];
+} __attribute__ ((packed));
+
+/* File Entry (ECMA 167r3 4/14.17) */
+struct extendedFileEntry {
+	struct tag		descTag;
+	struct icbtag		icbTag;
+	__le32			uid;
+	__le32			gid;
+	__le32			permissions;
+	__le16			fileLinkCount;
+	uint8_t			recordFormat;
+	uint8_t			recordDisplayAttr;
+	__le32			recordLength;
+	__le64			informationLength;
+	__le64			objectSize;
+	__le64			logicalBlocksRecorded;
+	struct timestamp	accessTime;
+	struct timestamp	modificationTime;
+	struct timestamp	createTime;
+	struct timestamp	attrTime;
+	__le32			checkpoint;
+	__le32			reserved;
+	struct long_ad		extendedAttrICB;
+	struct long_ad		streamDirectoryICB;
+	struct regid		impIdent;
+	__le64			uniqueID;
+	__le32			lengthExtendedAttr;
+	__le32			lengthAllocDescs;
+	uint8_t			extendedAttr[0];
+	uint8_t			allocDescs[0];
+} __attribute__ ((packed));
+
+#endif /* _ECMA_167_H */
diff --git a/kernel/fs/udf/file.c b/kernel/fs/udf/file.c
new file mode 100644
index 000000000..7a95b8fed
--- /dev/null
+++ b/kernel/fs/udf/file.c
@@ -0,0 +1,273 @@
+/*
+ * file.c
+ *
+ * PURPOSE
+ *  File handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *  This file is distributed under the terms of the GNU General Public
+ *  License (GPL). Copies of the GPL can be obtained from:
+ *    ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *  Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998-1999 Dave Boynton
+ *  (C) 1998-2004 Ben Fennema
+ *  (C) 1999-2000 Stelias Computing Inc
+ *
+ * HISTORY
+ *
+ *  10/02/98 dgb  Attempt to integrate into udf.o
+ *  10/07/98      Switched to using generic_readpage, etc., like isofs
+ *                And it works!
+ *  12/06/98 blf  Added udf_file_read. uses generic_file_read for all cases but
+ *                ICBTAG_FLAG_AD_IN_ICB.
+ *  04/06/99      64 bit file handling on 32 bit systems taken from ext2 file.c
+ *  05/12/99      Preliminary file write support
+ */
+
+#include "udfdecl.h"
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/string.h> /* memset */
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+
+#include "udf_i.h"
+#include "udf_sb.h"
+
+static void __udf_adinicb_readpage(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	char *kaddr;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	kaddr = kmap(page);
+	memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
+	memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size);
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	kunmap(page);
+}
+
+static int udf_adinicb_readpage(struct file *file, struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+	__udf_adinicb_readpage(page);
+	unlock_page(page);
+
+	return 0;
+}
+
+static int udf_adinicb_writepage(struct page *page,
+				 struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	char *kaddr;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	BUG_ON(!PageLocked(page));
+
+	kaddr = kmap(page);
+	memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size);
+	mark_inode_dirty(inode);
+	SetPageUptodate(page);
+	kunmap(page);
+	unlock_page(page);
+
+	return 0;
+}
+
+static int udf_adinicb_write_begin(struct file *file,
+			struct address_space *mapping, loff_t pos,
+			unsigned len, unsigned flags, struct page **pagep,
+			void **fsdata)
+{
+	struct page *page;
+
+	if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE))
+		return -EIO;
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
+		__udf_adinicb_readpage(page);
+	return 0;
+}
+
+static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+				     loff_t offset)
+{
+	/* Fallback to buffered I/O. */
+	return 0;
+}
+
+const struct address_space_operations udf_adinicb_aops = {
+	.readpage	= udf_adinicb_readpage,
+	.writepage	= udf_adinicb_writepage,
+	.write_begin	= udf_adinicb_write_begin,
+	.write_end	= simple_write_end,
+	.direct_IO	= udf_adinicb_direct_IO,
+};
+
+static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t retval;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	int err;
+
+	mutex_lock(&inode->i_mutex);
+
+	retval = generic_write_checks(iocb, from);
+	if (retval <= 0)
+		goto out;
+
+	down_write(&iinfo->i_data_sem);
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+		loff_t end = iocb->ki_pos + iov_iter_count(from);
+
+		if (inode->i_sb->s_blocksize <
+				(udf_file_entry_alloc_offset(inode) + end)) {
+			err = udf_expand_file_adinicb(inode);
+			if (err) {
+				mutex_unlock(&inode->i_mutex);
+				udf_debug("udf_expand_adinicb: err=%d\n", err);
+				return err;
+			}
+		} else {
+			iinfo->i_lenAlloc = max(end, inode->i_size);
+			up_write(&iinfo->i_data_sem);
+		}
+	} else
+		up_write(&iinfo->i_data_sem);
+
+	retval = __generic_file_write_iter(iocb, from);
+out:
+	mutex_unlock(&inode->i_mutex);
+
+	if (retval > 0) {
+		ssize_t err;
+
+		mark_inode_dirty(inode);
+		err = generic_write_sync(file, iocb->ki_pos - retval, retval);
+		if (err < 0)
+			retval = err;
+	}
+
+	return retval;
+}
+
+long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	long old_block, new_block;
+	int result = -EINVAL;
+
+	if (inode_permission(inode, MAY_READ) != 0) {
+		udf_debug("no permission to access inode %lu\n", inode->i_ino);
+		result = -EPERM;
+		goto out;
+	}
+
+	if (!arg) {
+		udf_debug("invalid argument to udf_ioctl\n");
+		result = -EINVAL;
+		goto out;
+	}
+
+	switch (cmd) {
+	case UDF_GETVOLIDENT:
+		if (copy_to_user((char __user *)arg,
+				 UDF_SB(inode->i_sb)->s_volume_ident, 32))
+			result = -EFAULT;
+		else
+			result = 0;
+		goto out;
+	case UDF_RELOCATE_BLOCKS:
+		if (!capable(CAP_SYS_ADMIN)) {
+			result = -EPERM;
+			goto out;
+		}
+		if (get_user(old_block, (long __user *)arg)) {
+			result = -EFAULT;
+			goto out;
+		}
+		result = udf_relocate_blocks(inode->i_sb,
+						old_block, &new_block);
+		if (result == 0)
+			result = put_user(new_block, (long __user *)arg);
+		goto out;
+	case UDF_GETEASIZE:
+		result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
+		goto out;
+	case UDF_GETEABLOCK:
+		result = copy_to_user((char __user *)arg,
+				      UDF_I(inode)->i_ext.i_data,
+				      UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
+		goto out;
+	}
+
+out:
+	return result;
+}
+
+static int udf_release_file(struct inode *inode, struct file *filp)
+{
+	if (filp->f_mode & FMODE_WRITE &&
+	    atomic_read(&inode->i_writecount) == 1) {
+		/*
+		 * Grab i_mutex to avoid races with writes changing i_size
+		 * while we are running.
+		 */
+		mutex_lock(&inode->i_mutex);
+		down_write(&UDF_I(inode)->i_data_sem);
+		udf_discard_prealloc(inode);
+		udf_truncate_tail_extent(inode);
+		up_write(&UDF_I(inode)->i_data_sem);
+		mutex_unlock(&inode->i_mutex);
+	}
+	return 0;
+}
+
+const struct file_operations udf_file_operations = {
+	.read_iter		= generic_file_read_iter,
+	.unlocked_ioctl		= udf_ioctl,
+	.open			= generic_file_open,
+	.mmap			= generic_file_mmap,
+	.write_iter		= udf_file_write_iter,
+	.release		= udf_release_file,
+	.fsync			= generic_file_fsync,
+	.splice_read		= generic_file_splice_read,
+	.llseek			= generic_file_llseek,
+};
+
+static int udf_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = udf_setsize(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+const struct inode_operations udf_file_inode_operations = {
+	.setattr		= udf_setattr,
+};
diff --git a/kernel/fs/udf/ialloc.c b/kernel/fs/udf/ialloc.c
new file mode 100644
index 000000000..e77db621e
--- /dev/null
+++ b/kernel/fs/udf/ialloc.c
@@ -0,0 +1,133 @@
+/*
+ * ialloc.c
+ *
+ * PURPOSE
+ *	Inode allocation handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998-2001 Ben Fennema
+ *
+ * HISTORY
+ *
+ *  02/24/99 blf  Created.
+ *
+ */
+
+#include "udfdecl.h"
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "udf_i.h"
+#include "udf_sb.h"
+
+void udf_free_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
+
+	if (lvidiu) {
+		mutex_lock(&sbi->s_alloc_mutex);
+		if (S_ISDIR(inode->i_mode))
+			le32_add_cpu(&lvidiu->numDirs, -1);
+		else
+			le32_add_cpu(&lvidiu->numFiles, -1);
+		udf_updated_lvid(sb);
+		mutex_unlock(&sbi->s_alloc_mutex);
+	}
+
+	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
+}
+
+struct inode *udf_new_inode(struct inode *dir, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct inode *inode;
+	int block;
+	uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
+	struct udf_inode_info *iinfo;
+	struct udf_inode_info *dinfo = UDF_I(dir);
+	struct logicalVolIntegrityDescImpUse *lvidiu;
+	int err;
+
+	inode = new_inode(sb);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	iinfo = UDF_I(inode);
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
+		iinfo->i_efe = 1;
+		if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
+			sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
+		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+					    sizeof(struct extendedFileEntry),
+					    GFP_KERNEL);
+	} else {
+		iinfo->i_efe = 0;
+		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+					    sizeof(struct fileEntry),
+					    GFP_KERNEL);
+	}
+	if (!iinfo->i_ext.i_data) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = -ENOSPC;
+	block = udf_new_block(dir->i_sb, NULL,
+			      dinfo->i_location.partitionReferenceNum,
+			      start, &err);
+	if (err) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+
+	lvidiu = udf_sb_lvidiu(sb);
+	if (lvidiu) {
+		iinfo->i_unique = lvid_get_unique_id(sb);
+		inode->i_generation = iinfo->i_unique;
+		mutex_lock(&sbi->s_alloc_mutex);
+		if (S_ISDIR(mode))
+			le32_add_cpu(&lvidiu->numDirs, 1);
+		else
+			le32_add_cpu(&lvidiu->numFiles, 1);
+		udf_updated_lvid(sb);
+		mutex_unlock(&sbi->s_alloc_mutex);
+	}
+
+	inode_init_owner(inode, dir, mode);
+
+	iinfo->i_location.logicalBlockNum = block;
+	iinfo->i_location.partitionReferenceNum =
+				dinfo->i_location.partitionReferenceNum;
+	inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
+	inode->i_blocks = 0;
+	iinfo->i_lenEAttr = 0;
+	iinfo->i_lenAlloc = 0;
+	iinfo->i_use = 0;
+	iinfo->i_checkpoint = 1;
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
+		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
+	else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
+		iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
+	else
+		iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
+	inode->i_mtime = inode->i_atime = inode->i_ctime =
+		iinfo->i_crtime = current_fs_time(inode->i_sb);
+	if (unlikely(insert_inode_locked(inode) < 0)) {
+		make_bad_inode(inode);
+		iput(inode);
+		return ERR_PTR(-EIO);
+	}
+	mark_inode_dirty(inode);
+
+	return inode;
+}
diff --git a/kernel/fs/udf/inode.c b/kernel/fs/udf/inode.c
new file mode 100644
index 000000000..6afac3d56
--- /dev/null
+++ b/kernel/fs/udf/inode.c
@@ -0,0 +1,2291 @@
+/*
+ * inode.c
+ *
+ * PURPOSE
+ *  Inode handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *  This file is distributed under the terms of the GNU General Public
+ *  License (GPL). Copies of the GPL can be obtained from:
+ *    ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *  Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998 Dave Boynton
+ *  (C) 1998-2004 Ben Fennema
+ *  (C) 1999-2000 Stelias Computing Inc
+ *
+ * HISTORY
+ *
+ *  10/04/98 dgb  Added rudimentary directory functions
+ *  10/07/98      Fully working udf_block_map! It works!
+ *  11/25/98      bmap altered to better support extents
+ *  12/06/98 blf  partition support in udf_iget, udf_block_map
+ *                and udf_read_inode
+ *  12/12/98      rewrote udf_block_map to handle next extents and descs across
+ *                block boundaries (which is not actually allowed)
+ *  12/20/98      added support for strategy 4096
+ *  03/07/99      rewrote udf_block_map (again)
+ *                New funcs, inode_bmap, udf_next_aext
+ *  04/19/99      Support for writing device EA's for major/minor #
+ */
+
+#include "udfdecl.h"
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+#include <linux/crc-itu-t.h>
+#include <linux/mpage.h>
+#include <linux/uio.h>
+
+#include "udf_i.h"
+#include "udf_sb.h"
+
+MODULE_AUTHOR("Ben Fennema");
+MODULE_DESCRIPTION("Universal Disk Format Filesystem");
+MODULE_LICENSE("GPL");
+
+#define EXTENT_MERGE_SIZE 5
+
+static umode_t udf_convert_permissions(struct fileEntry *);
+static int udf_update_inode(struct inode *, int);
+static int udf_sync_inode(struct inode *inode);
+static int udf_alloc_i_data(struct inode *inode, size_t size);
+static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
+static int8_t udf_insert_aext(struct inode *, struct extent_position,
+			      struct kernel_lb_addr, uint32_t);
+static void udf_split_extents(struct inode *, int *, int, int,
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+static void udf_prealloc_extents(struct inode *, int, int,
+				 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+static void udf_merge_extents(struct inode *,
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+static void udf_update_extents(struct inode *,
+			       struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+			       struct extent_position *);
+static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
+
+static void __udf_clear_extent_cache(struct inode *inode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (iinfo->cached_extent.lstart != -1) {
+		brelse(iinfo->cached_extent.epos.bh);
+		iinfo->cached_extent.lstart = -1;
+	}
+}
+
+/* Invalidate extent cache */
+static void udf_clear_extent_cache(struct inode *inode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	spin_lock(&iinfo->i_extent_cache_lock);
+	__udf_clear_extent_cache(inode);
+	spin_unlock(&iinfo->i_extent_cache_lock);
+}
+
+/* Return contents of extent cache */
+static int udf_read_extent_cache(struct inode *inode, loff_t bcount,
+				 loff_t *lbcount, struct extent_position *pos)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	int ret = 0;
+
+	spin_lock(&iinfo->i_extent_cache_lock);
+	if ((iinfo->cached_extent.lstart <= bcount) &&
+	    (iinfo->cached_extent.lstart != -1)) {
+		/* Cache hit */
+		*lbcount = iinfo->cached_extent.lstart;
+		memcpy(pos, &iinfo->cached_extent.epos,
+		       sizeof(struct extent_position));
+		if (pos->bh)
+			get_bh(pos->bh);
+		ret = 1;
+	}
+	spin_unlock(&iinfo->i_extent_cache_lock);
+	return ret;
+}
+
+/* Add extent to extent cache */
+static void udf_update_extent_cache(struct inode *inode, loff_t estart,
+				    struct extent_position *pos, int next_epos)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	spin_lock(&iinfo->i_extent_cache_lock);
+	/* Invalidate previously cached extent */
+	__udf_clear_extent_cache(inode);
+	if (pos->bh)
+		get_bh(pos->bh);
+	memcpy(&iinfo->cached_extent.epos, pos,
+	       sizeof(struct extent_position));
+	iinfo->cached_extent.lstart = estart;
+	if (next_epos)
+		switch (iinfo->i_alloc_type) {
+		case ICBTAG_FLAG_AD_SHORT:
+			iinfo->cached_extent.epos.offset -=
+			sizeof(struct short_ad);
+			break;
+		case ICBTAG_FLAG_AD_LONG:
+			iinfo->cached_extent.epos.offset -=
+			sizeof(struct long_ad);
+		}
+	spin_unlock(&iinfo->i_extent_cache_lock);
+}
+
+void udf_evict_inode(struct inode *inode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	int want_delete = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		want_delete = 1;
+		udf_setsize(inode, 0);
+		udf_update_inode(inode, IS_SYNC(inode));
+	}
+	truncate_inode_pages_final(&inode->i_data);
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
+	    inode->i_size != iinfo->i_lenExtents) {
+		udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
+			 inode->i_ino, inode->i_mode,
+			 (unsigned long long)inode->i_size,
+			 (unsigned long long)iinfo->i_lenExtents);
+	}
+	kfree(iinfo->i_ext.i_data);
+	iinfo->i_ext.i_data = NULL;
+	udf_clear_extent_cache(inode);
+	if (want_delete) {
+		udf_free_inode(inode);
+	}
+}
+
+static void udf_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	loff_t isize = inode->i_size;
+
+	if (to > isize) {
+		truncate_pagecache(inode, isize);
+		if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+			down_write(&iinfo->i_data_sem);
+			udf_clear_extent_cache(inode);
+			udf_truncate_extents(inode);
+			up_write(&iinfo->i_data_sem);
+		}
+	}
+}
+
+static int udf_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, udf_get_block, wbc);
+}
+
+static int udf_writepages(struct address_space *mapping,
+			struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, udf_get_block);
+}
+
+static int udf_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, udf_get_block);
+}
+
+static int udf_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, udf_get_block);
+}
+
+static int udf_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
+	if (unlikely(ret))
+		udf_write_failed(mapping, pos + len);
+	return ret;
+}
+
+static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			     loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block);
+	if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE))
+		udf_write_failed(mapping, offset + count);
+	return ret;
+}
+
+static sector_t udf_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, udf_get_block);
+}
+
+const struct address_space_operations udf_aops = {
+	.readpage	= udf_readpage,
+	.readpages	= udf_readpages,
+	.writepage	= udf_writepage,
+	.writepages	= udf_writepages,
+	.write_begin	= udf_write_begin,
+	.write_end	= generic_write_end,
+	.direct_IO	= udf_direct_IO,
+	.bmap		= udf_bmap,
+};
+
+/*
+ * Expand file stored in ICB to a normal one-block-file
+ *
+ * This function requires i_data_sem for writing and releases it.
+ * This function requires i_mutex held
+ */
+int udf_expand_file_adinicb(struct inode *inode)
+{
+	struct page *page;
+	char *kaddr;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	int err;
+	struct writeback_control udf_wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = 1,
+	};
+
+	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+	if (!iinfo->i_lenAlloc) {
+		if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
+			iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
+		else
+			iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
+		/* from now on we have normal address_space methods */
+		inode->i_data.a_ops = &udf_aops;
+		up_write(&iinfo->i_data_sem);
+		mark_inode_dirty(inode);
+		return 0;
+	}
+	/*
+	 * Release i_data_sem so that we can lock a page - page lock ranks
+	 * above i_data_sem. i_mutex still protects us against file changes.
+	 */
+	up_write(&iinfo->i_data_sem);
+
+	page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	if (!PageUptodate(page)) {
+		kaddr = kmap(page);
+		memset(kaddr + iinfo->i_lenAlloc, 0x00,
+		       PAGE_CACHE_SIZE - iinfo->i_lenAlloc);
+		memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr,
+			iinfo->i_lenAlloc);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		kunmap(page);
+	}
+	down_write(&iinfo->i_data_sem);
+	memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00,
+	       iinfo->i_lenAlloc);
+	iinfo->i_lenAlloc = 0;
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
+		iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
+	else
+		iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
+	/* from now on we have normal address_space methods */
+	inode->i_data.a_ops = &udf_aops;
+	up_write(&iinfo->i_data_sem);
+	err = inode->i_data.a_ops->writepage(page, &udf_wbc);
+	if (err) {
+		/* Restore everything back so that we don't lose data... */
+		lock_page(page);
+		kaddr = kmap(page);
+		down_write(&iinfo->i_data_sem);
+		memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
+		       inode->i_size);
+		kunmap(page);
+		unlock_page(page);
+		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
+		inode->i_data.a_ops = &udf_adinicb_aops;
+		up_write(&iinfo->i_data_sem);
+	}
+	page_cache_release(page);
+	mark_inode_dirty(inode);
+
+	return err;
+}
+
+struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
+					   int *err)
+{
+	int newblock;
+	struct buffer_head *dbh = NULL;
+	struct kernel_lb_addr eloc;
+	uint8_t alloctype;
+	struct extent_position epos;
+
+	struct udf_fileident_bh sfibh, dfibh;
+	loff_t f_pos = udf_ext0_offset(inode);
+	int size = udf_ext0_offset(inode) + inode->i_size;
+	struct fileIdentDesc cfi, *sfi, *dfi;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
+		alloctype = ICBTAG_FLAG_AD_SHORT;
+	else
+		alloctype = ICBTAG_FLAG_AD_LONG;
+
+	if (!inode->i_size) {
+		iinfo->i_alloc_type = alloctype;
+		mark_inode_dirty(inode);
+		return NULL;
+	}
+
+	/* alloc block, and copy data to it */
+	*block = udf_new_block(inode->i_sb, inode,
+			       iinfo->i_location.partitionReferenceNum,
+			       iinfo->i_location.logicalBlockNum, err);
+	if (!(*block))
+		return NULL;
+	newblock = udf_get_pblock(inode->i_sb, *block,
+				  iinfo->i_location.partitionReferenceNum,
+				0);
+	if (!newblock)
+		return NULL;
+	dbh = udf_tgetblk(inode->i_sb, newblock);
+	if (!dbh)
+		return NULL;
+	lock_buffer(dbh);
+	memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize);
+	set_buffer_uptodate(dbh);
+	unlock_buffer(dbh);
+	mark_buffer_dirty_inode(dbh, inode);
+
+	sfibh.soffset = sfibh.eoffset =
+			f_pos & (inode->i_sb->s_blocksize - 1);
+	sfibh.sbh = sfibh.ebh = NULL;
+	dfibh.soffset = dfibh.eoffset = 0;
+	dfibh.sbh = dfibh.ebh = dbh;
+	while (f_pos < size) {
+		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
+		sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL,
+					 NULL, NULL, NULL);
+		if (!sfi) {
+			brelse(dbh);
+			return NULL;
+		}
+		iinfo->i_alloc_type = alloctype;
+		sfi->descTag.tagLocation = cpu_to_le32(*block);
+		dfibh.soffset = dfibh.eoffset;
+		dfibh.eoffset += (sfibh.eoffset - sfibh.soffset);
+		dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset);
+		if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse,
+				 sfi->fileIdent +
+					le16_to_cpu(sfi->lengthOfImpUse))) {
+			iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
+			brelse(dbh);
+			return NULL;
+		}
+	}
+	mark_buffer_dirty_inode(dbh, inode);
+
+	memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0,
+		iinfo->i_lenAlloc);
+	iinfo->i_lenAlloc = 0;
+	eloc.logicalBlockNum = *block;
+	eloc.partitionReferenceNum =
+				iinfo->i_location.partitionReferenceNum;
+	iinfo->i_lenExtents = inode->i_size;
+	epos.bh = NULL;
+	epos.block = iinfo->i_location;
+	epos.offset = udf_file_entry_alloc_offset(inode);
+	udf_add_aext(inode, &epos, &eloc, inode->i_size, 0);
+	/* UniqueID stuff */
+
+	brelse(epos.bh);
+	mark_inode_dirty(inode);
+	return dbh;
+}
+
+static int udf_get_block(struct inode *inode, sector_t block,
+			 struct buffer_head *bh_result, int create)
+{
+	int err, new;
+	sector_t phys = 0;
+	struct udf_inode_info *iinfo;
+
+	if (!create) {
+		phys = udf_block_map(inode, block);
+		if (phys)
+			map_bh(bh_result, inode->i_sb, phys);
+		return 0;
+	}
+
+	err = -EIO;
+	new = 0;
+	iinfo = UDF_I(inode);
+
+	down_write(&iinfo->i_data_sem);
+	if (block == iinfo->i_next_alloc_block + 1) {
+		iinfo->i_next_alloc_block++;
+		iinfo->i_next_alloc_goal++;
+	}
+
+	udf_clear_extent_cache(inode);
+	phys = inode_getblk(inode, block, &err, &new);
+	if (!phys)
+		goto abort;
+
+	if (new)
+		set_buffer_new(bh_result);
+	map_bh(bh_result, inode->i_sb, phys);
+
+abort:
+	up_write(&iinfo->i_data_sem);
+	return err;
+}
+
+static struct buffer_head *udf_getblk(struct inode *inode, long block,
+				      int create, int *err)
+{
+	struct buffer_head *bh;
+	struct buffer_head dummy;
+
+	dummy.b_state = 0;
+	dummy.b_blocknr = -1000;
+	*err = udf_get_block(inode, block, &dummy, create);
+	if (!*err && buffer_mapped(&dummy)) {
+		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+		if (buffer_new(&dummy)) {
+			lock_buffer(bh);
+			memset(bh->b_data, 0x00, inode->i_sb->s_blocksize);
+			set_buffer_uptodate(bh);
+			unlock_buffer(bh);
+			mark_buffer_dirty_inode(bh, inode);
+		}
+		return bh;
+	}
+
+	return NULL;
+}
+
+/* Extend the file by 'blocks' blocks, return the number of extents added */
+static int udf_do_extend_file(struct inode *inode,
+			      struct extent_position *last_pos,
+			      struct kernel_long_ad *last_ext,
+			      sector_t blocks)
+{
+	sector_t add;
+	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
+	struct super_block *sb = inode->i_sb;
+	struct kernel_lb_addr prealloc_loc = {};
+	int prealloc_len = 0;
+	struct udf_inode_info *iinfo;
+	int err;
+
+	/* The previous extent is fake and we should not extend by anything
+	 * - there's nothing to do... */
+	if (!blocks && fake)
+		return 0;
+
+	iinfo = UDF_I(inode);
+	/* Round the last extent up to a multiple of block size */
+	if (last_ext->extLength & (sb->s_blocksize - 1)) {
+		last_ext->extLength =
+			(last_ext->extLength & UDF_EXTENT_FLAG_MASK) |
+			(((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) +
+			  sb->s_blocksize - 1) & ~(sb->s_blocksize - 1));
+		iinfo->i_lenExtents =
+			(iinfo->i_lenExtents + sb->s_blocksize - 1) &
+			~(sb->s_blocksize - 1);
+	}
+
+	/* Last extent are just preallocated blocks? */
+	if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
+						EXT_NOT_RECORDED_ALLOCATED) {
+		/* Save the extent so that we can reattach it to the end */
+		prealloc_loc = last_ext->extLocation;
+		prealloc_len = last_ext->extLength;
+		/* Mark the extent as a hole */
+		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
+			(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
+		last_ext->extLocation.logicalBlockNum = 0;
+		last_ext->extLocation.partitionReferenceNum = 0;
+	}
+
+	/* Can we merge with the previous extent? */
+	if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
+					EXT_NOT_RECORDED_NOT_ALLOCATED) {
+		add = ((1 << 30) - sb->s_blocksize -
+			(last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) >>
+			sb->s_blocksize_bits;
+		if (add > blocks)
+			add = blocks;
+		blocks -= add;
+		last_ext->extLength += add << sb->s_blocksize_bits;
+	}
+
+	if (fake) {
+		udf_add_aext(inode, last_pos, &last_ext->extLocation,
+			     last_ext->extLength, 1);
+		count++;
+	} else
+		udf_write_aext(inode, last_pos, &last_ext->extLocation,
+				last_ext->extLength, 1);
+
+	/* Managed to do everything necessary? */
+	if (!blocks)
+		goto out;
+
+	/* All further extents will be NOT_RECORDED_NOT_ALLOCATED */
+	last_ext->extLocation.logicalBlockNum = 0;
+	last_ext->extLocation.partitionReferenceNum = 0;
+	add = (1 << (30-sb->s_blocksize_bits)) - 1;
+	last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
+				(add << sb->s_blocksize_bits);
+
+	/* Create enough extents to cover the whole hole */
+	while (blocks > add) {
+		blocks -= add;
+		err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
+				   last_ext->extLength, 1);
+		if (err)
+			return err;
+		count++;
+	}
+	if (blocks) {
+		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
+			(blocks << sb->s_blocksize_bits);
+		err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
+				   last_ext->extLength, 1);
+		if (err)
+			return err;
+		count++;
+	}
+
+out:
+	/* Do we have some preallocated blocks saved? */
+	if (prealloc_len) {
+		err = udf_add_aext(inode, last_pos, &prealloc_loc,
+				   prealloc_len, 1);
+		if (err)
+			return err;
+		last_ext->extLocation = prealloc_loc;
+		last_ext->extLength = prealloc_len;
+		count++;
+	}
+
+	/* last_pos should point to the last written extent... */
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		last_pos->offset -= sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		last_pos->offset -= sizeof(struct long_ad);
+	else
+		return -EIO;
+
+	return count;
+}
+
+static int udf_extend_file(struct inode *inode, loff_t newsize)
+{
+
+	struct extent_position epos;
+	struct kernel_lb_addr eloc;
+	uint32_t elen;
+	int8_t etype;
+	struct super_block *sb = inode->i_sb;
+	sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
+	int adsize;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct kernel_long_ad extent;
+	int err;
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		BUG();
+
+	etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
+
+	/* File has extent covering the new size (could happen when extending
+	 * inside a block)? */
+	if (etype != -1)
+		return 0;
+	if (newsize & (sb->s_blocksize - 1))
+		offset++;
+	/* Extended file just to the boundary of the last file block? */
+	if (offset == 0)
+		return 0;
+
+	/* Truncate is extending the file by 'offset' blocks */
+	if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
+	    (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
+		/* File has no extents at all or has empty last
+		 * indirect extent! Create a fake extent... */
+		extent.extLocation.logicalBlockNum = 0;
+		extent.extLocation.partitionReferenceNum = 0;
+		extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
+	} else {
+		epos.offset -= adsize;
+		etype = udf_next_aext(inode, &epos, &extent.extLocation,
+				      &extent.extLength, 0);
+		extent.extLength |= etype << 30;
+	}
+	err = udf_do_extend_file(inode, &epos, &extent, offset);
+	if (err < 0)
+		goto out;
+	err = 0;
+	iinfo->i_lenExtents = newsize;
+out:
+	brelse(epos.bh);
+	return err;
+}
+
+static sector_t inode_getblk(struct inode *inode, sector_t block,
+			     int *err, int *new)
+{
+	struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
+	struct extent_position prev_epos, cur_epos, next_epos;
+	int count = 0, startnum = 0, endnum = 0;
+	uint32_t elen = 0, tmpelen;
+	struct kernel_lb_addr eloc, tmpeloc;
+	int c = 1;
+	loff_t lbcount = 0, b_off = 0;
+	uint32_t newblocknum, newblock;
+	sector_t offset = 0;
+	int8_t etype;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	int goal = 0, pgoal = iinfo->i_location.logicalBlockNum;
+	int lastblock = 0;
+	bool isBeyondEOF;
+
+	*err = 0;
+	*new = 0;
+	prev_epos.offset = udf_file_entry_alloc_offset(inode);
+	prev_epos.block = iinfo->i_location;
+	prev_epos.bh = NULL;
+	cur_epos = next_epos = prev_epos;
+	b_off = (loff_t)block << inode->i_sb->s_blocksize_bits;
+
+	/* find the extent which contains the block we are looking for.
+	   alternate between laarr[0] and laarr[1] for locations of the
+	   current extent, and the previous extent */
+	do {
+		if (prev_epos.bh != cur_epos.bh) {
+			brelse(prev_epos.bh);
+			get_bh(cur_epos.bh);
+			prev_epos.bh = cur_epos.bh;
+		}
+		if (cur_epos.bh != next_epos.bh) {
+			brelse(cur_epos.bh);
+			get_bh(next_epos.bh);
+			cur_epos.bh = next_epos.bh;
+		}
+
+		lbcount += elen;
+
+		prev_epos.block = cur_epos.block;
+		cur_epos.block = next_epos.block;
+
+		prev_epos.offset = cur_epos.offset;
+		cur_epos.offset = next_epos.offset;
+
+		etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 1);
+		if (etype == -1)
+			break;
+
+		c = !c;
+
+		laarr[c].extLength = (etype << 30) | elen;
+		laarr[c].extLocation = eloc;
+
+		if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
+			pgoal = eloc.logicalBlockNum +
+				((elen + inode->i_sb->s_blocksize - 1) >>
+				 inode->i_sb->s_blocksize_bits);
+
+		count++;
+	} while (lbcount + elen <= b_off);
+
+	b_off -= lbcount;
+	offset = b_off >> inode->i_sb->s_blocksize_bits;
+	/*
+	 * Move prev_epos and cur_epos into indirect extent if we are at
+	 * the pointer to it
+	 */
+	udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, 0);
+	udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, 0);
+
+	/* if the extent is allocated and recorded, return the block
+	   if the extent is not a multiple of the blocksize, round up */
+
+	if (etype == (EXT_RECORDED_ALLOCATED >> 30)) {
+		if (elen & (inode->i_sb->s_blocksize - 1)) {
+			elen = EXT_RECORDED_ALLOCATED |
+				((elen + inode->i_sb->s_blocksize - 1) &
+				 ~(inode->i_sb->s_blocksize - 1));
+			udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
+		}
+		brelse(prev_epos.bh);
+		brelse(cur_epos.bh);
+		brelse(next_epos.bh);
+		newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
+		return newblock;
+	}
+
+	/* Are we beyond EOF? */
+	if (etype == -1) {
+		int ret;
+		isBeyondEOF = true;
+		if (count) {
+			if (c)
+				laarr[0] = laarr[1];
+			startnum = 1;
+		} else {
+			/* Create a fake extent when there's not one */
+			memset(&laarr[0].extLocation, 0x00,
+				sizeof(struct kernel_lb_addr));
+			laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
+			/* Will udf_do_extend_file() create real extent from
+			   a fake one? */
+			startnum = (offset > 0);
+		}
+		/* Create extents for the hole between EOF and offset */
+		ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
+		if (ret < 0) {
+			brelse(prev_epos.bh);
+			brelse(cur_epos.bh);
+			brelse(next_epos.bh);
+			*err = ret;
+			return 0;
+		}
+		c = 0;
+		offset = 0;
+		count += ret;
+		/* We are not covered by a preallocated extent? */
+		if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) !=
+						EXT_NOT_RECORDED_ALLOCATED) {
+			/* Is there any real extent? - otherwise we overwrite
+			 * the fake one... */
+			if (count)
+				c = !c;
+			laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
+				inode->i_sb->s_blocksize;
+			memset(&laarr[c].extLocation, 0x00,
+				sizeof(struct kernel_lb_addr));
+			count++;
+		}
+		endnum = c + 1;
+		lastblock = 1;
+	} else {
+		isBeyondEOF = false;
+		endnum = startnum = ((count > 2) ? 2 : count);
+
+		/* if the current extent is in position 0,
+		   swap it with the previous */
+		if (!c && count != 1) {
+			laarr[2] = laarr[0];
+			laarr[0] = laarr[1];
+			laarr[1] = laarr[2];
+			c = 1;
+		}
+
+		/* if the current block is located in an extent,
+		   read the next extent */
+		etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 0);
+		if (etype != -1) {
+			laarr[c + 1].extLength = (etype << 30) | elen;
+			laarr[c + 1].extLocation = eloc;
+			count++;
+			startnum++;
+			endnum++;
+		} else
+			lastblock = 1;
+	}
+
+	/* if the current extent is not recorded but allocated, get the
+	 * block in the extent corresponding to the requested block */
+	if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30))
+		newblocknum = laarr[c].extLocation.logicalBlockNum + offset;
+	else { /* otherwise, allocate a new block */
+		if (iinfo->i_next_alloc_block == block)
+			goal = iinfo->i_next_alloc_goal;
+
+		if (!goal) {
+			if (!(goal = pgoal)) /* XXX: what was intended here? */
+				goal = iinfo->i_location.logicalBlockNum + 1;
+		}
+
+		newblocknum = udf_new_block(inode->i_sb, inode,
+				iinfo->i_location.partitionReferenceNum,
+				goal, err);
+		if (!newblocknum) {
+			brelse(prev_epos.bh);
+			brelse(cur_epos.bh);
+			brelse(next_epos.bh);
+			*err = -ENOSPC;
+			return 0;
+		}
+		if (isBeyondEOF)
+			iinfo->i_lenExtents += inode->i_sb->s_blocksize;
+	}
+
+	/* if the extent the requsted block is located in contains multiple
+	 * blocks, split the extent into at most three extents. blocks prior
+	 * to requested block, requested block, and blocks after requested
+	 * block */
+	udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
+
+#ifdef UDF_PREALLOCATE
+	/* We preallocate blocks only for regular files. It also makes sense
+	 * for directories but there's a problem when to drop the
+	 * preallocation. We might use some delayed work for that but I feel
+	 * it's overengineering for a filesystem like UDF. */
+	if (S_ISREG(inode->i_mode))
+		udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
+#endif
+
+	/* merge any continuous blocks in laarr */
+	udf_merge_extents(inode, laarr, &endnum);
+
+	/* write back the new extents, inserting new extents if the new number
+	 * of extents is greater than the old number, and deleting extents if
+	 * the new number of extents is less than the old number */
+	udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
+
+	brelse(prev_epos.bh);
+	brelse(cur_epos.bh);
+	brelse(next_epos.bh);
+
+	newblock = udf_get_pblock(inode->i_sb, newblocknum,
+				iinfo->i_location.partitionReferenceNum, 0);
+	if (!newblock) {
+		*err = -EIO;
+		return 0;
+	}
+	*new = 1;
+	iinfo->i_next_alloc_block = block;
+	iinfo->i_next_alloc_goal = newblocknum;
+	inode->i_ctime = current_fs_time(inode->i_sb);
+
+	if (IS_SYNC(inode))
+		udf_sync_inode(inode);
+	else
+		mark_inode_dirty(inode);
+
+	return newblock;
+}
+
+static void udf_split_extents(struct inode *inode, int *c, int offset,
+			      int newblocknum,
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      int *endnum)
+{
+	unsigned long blocksize = inode->i_sb->s_blocksize;
+	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
+
+	if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) ||
+	    (laarr[*c].extLength >> 30) ==
+				(EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) {
+		int curr = *c;
+		int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) +
+			    blocksize - 1) >> blocksize_bits;
+		int8_t etype = (laarr[curr].extLength >> 30);
+
+		if (blen == 1)
+			;
+		else if (!offset || blen == offset + 1) {
+			laarr[curr + 2] = laarr[curr + 1];
+			laarr[curr + 1] = laarr[curr];
+		} else {
+			laarr[curr + 3] = laarr[curr + 1];
+			laarr[curr + 2] = laarr[curr + 1] = laarr[curr];
+		}
+
+		if (offset) {
+			if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
+				udf_free_blocks(inode->i_sb, inode,
+						&laarr[curr].extLocation,
+						0, offset);
+				laarr[curr].extLength =
+					EXT_NOT_RECORDED_NOT_ALLOCATED |
+					(offset << blocksize_bits);
+				laarr[curr].extLocation.logicalBlockNum = 0;
+				laarr[curr].extLocation.
+						partitionReferenceNum = 0;
+			} else
+				laarr[curr].extLength = (etype << 30) |
+					(offset << blocksize_bits);
+			curr++;
+			(*c)++;
+			(*endnum)++;
+		}
+
+		laarr[curr].extLocation.logicalBlockNum = newblocknum;
+		if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
+			laarr[curr].extLocation.partitionReferenceNum =
+				UDF_I(inode)->i_location.partitionReferenceNum;
+		laarr[curr].extLength = EXT_RECORDED_ALLOCATED |
+			blocksize;
+		curr++;
+
+		if (blen != offset + 1) {
+			if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30))
+				laarr[curr].extLocation.logicalBlockNum +=
+								offset + 1;
+			laarr[curr].extLength = (etype << 30) |
+				((blen - (offset + 1)) << blocksize_bits);
+			curr++;
+			(*endnum)++;
+		}
+	}
+}
+
+static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
+				 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+				 int *endnum)
+{
+	int start, length = 0, currlength = 0, i;
+
+	if (*endnum >= (c + 1)) {
+		if (!lastblock)
+			return;
+		else
+			start = c;
+	} else {
+		if ((laarr[c + 1].extLength >> 30) ==
+					(EXT_NOT_RECORDED_ALLOCATED >> 30)) {
+			start = c + 1;
+			length = currlength =
+				(((laarr[c + 1].extLength &
+					UDF_EXTENT_LENGTH_MASK) +
+				inode->i_sb->s_blocksize - 1) >>
+				inode->i_sb->s_blocksize_bits);
+		} else
+			start = c;
+	}
+
+	for (i = start + 1; i <= *endnum; i++) {
+		if (i == *endnum) {
+			if (lastblock)
+				length += UDF_DEFAULT_PREALLOC_BLOCKS;
+		} else if ((laarr[i].extLength >> 30) ==
+				(EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) {
+			length += (((laarr[i].extLength &
+						UDF_EXTENT_LENGTH_MASK) +
+				    inode->i_sb->s_blocksize - 1) >>
+				    inode->i_sb->s_blocksize_bits);
+		} else
+			break;
+	}
+
+	if (length) {
+		int next = laarr[start].extLocation.logicalBlockNum +
+			(((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) +
+			  inode->i_sb->s_blocksize - 1) >>
+			  inode->i_sb->s_blocksize_bits);
+		int numalloc = udf_prealloc_blocks(inode->i_sb, inode,
+				laarr[start].extLocation.partitionReferenceNum,
+				next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ?
+				length : UDF_DEFAULT_PREALLOC_BLOCKS) -
+				currlength);
+		if (numalloc) 	{
+			if (start == (c + 1))
+				laarr[start].extLength +=
+					(numalloc <<
+					 inode->i_sb->s_blocksize_bits);
+			else {
+				memmove(&laarr[c + 2], &laarr[c + 1],
+					sizeof(struct long_ad) * (*endnum - (c + 1)));
+				(*endnum)++;
+				laarr[c + 1].extLocation.logicalBlockNum = next;
+				laarr[c + 1].extLocation.partitionReferenceNum =
+					laarr[c].extLocation.
+							partitionReferenceNum;
+				laarr[c + 1].extLength =
+					EXT_NOT_RECORDED_ALLOCATED |
+					(numalloc <<
+					 inode->i_sb->s_blocksize_bits);
+				start = c + 1;
+			}
+
+			for (i = start + 1; numalloc && i < *endnum; i++) {
+				int elen = ((laarr[i].extLength &
+						UDF_EXTENT_LENGTH_MASK) +
+					    inode->i_sb->s_blocksize - 1) >>
+					    inode->i_sb->s_blocksize_bits;
+
+				if (elen > numalloc) {
+					laarr[i].extLength -=
+						(numalloc <<
+						 inode->i_sb->s_blocksize_bits);
+					numalloc = 0;
+				} else {
+					numalloc -= elen;
+					if (*endnum > (i + 1))
+						memmove(&laarr[i],
+							&laarr[i + 1],
+							sizeof(struct long_ad) *
+							(*endnum - (i + 1)));
+					i--;
+					(*endnum)--;
+				}
+			}
+			UDF_I(inode)->i_lenExtents +=
+				numalloc << inode->i_sb->s_blocksize_bits;
+		}
+	}
+}
+
+static void udf_merge_extents(struct inode *inode,
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      int *endnum)
+{
+	int i;
+	unsigned long blocksize = inode->i_sb->s_blocksize;
+	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
+
+	for (i = 0; i < (*endnum - 1); i++) {
+		struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
+		struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
+
+		if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
+			(((li->extLength >> 30) ==
+				(EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) ||
+			((lip1->extLocation.logicalBlockNum -
+			  li->extLocation.logicalBlockNum) ==
+			(((li->extLength & UDF_EXTENT_LENGTH_MASK) +
+			blocksize - 1) >> blocksize_bits)))) {
+
+			if (((li->extLength & UDF_EXTENT_LENGTH_MASK) +
+				(lip1->extLength & UDF_EXTENT_LENGTH_MASK) +
+				blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) {
+				lip1->extLength = (lip1->extLength -
+						  (li->extLength &
+						   UDF_EXTENT_LENGTH_MASK) +
+						   UDF_EXTENT_LENGTH_MASK) &
+							~(blocksize - 1);
+				li->extLength = (li->extLength &
+						 UDF_EXTENT_FLAG_MASK) +
+						(UDF_EXTENT_LENGTH_MASK + 1) -
+						blocksize;
+				lip1->extLocation.logicalBlockNum =
+					li->extLocation.logicalBlockNum +
+					((li->extLength &
+						UDF_EXTENT_LENGTH_MASK) >>
+						blocksize_bits);
+			} else {
+				li->extLength = lip1->extLength +
+					(((li->extLength &
+						UDF_EXTENT_LENGTH_MASK) +
+					 blocksize - 1) & ~(blocksize - 1));
+				if (*endnum > (i + 2))
+					memmove(&laarr[i + 1], &laarr[i + 2],
+						sizeof(struct long_ad) *
+						(*endnum - (i + 2)));
+				i--;
+				(*endnum)--;
+			}
+		} else if (((li->extLength >> 30) ==
+				(EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
+			   ((lip1->extLength >> 30) ==
+				(EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
+			udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
+					((li->extLength &
+					  UDF_EXTENT_LENGTH_MASK) +
+					 blocksize - 1) >> blocksize_bits);
+			li->extLocation.logicalBlockNum = 0;
+			li->extLocation.partitionReferenceNum = 0;
+
+			if (((li->extLength & UDF_EXTENT_LENGTH_MASK) +
+			     (lip1->extLength & UDF_EXTENT_LENGTH_MASK) +
+			     blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) {
+				lip1->extLength = (lip1->extLength -
+						   (li->extLength &
+						   UDF_EXTENT_LENGTH_MASK) +
+						   UDF_EXTENT_LENGTH_MASK) &
+						   ~(blocksize - 1);
+				li->extLength = (li->extLength &
+						 UDF_EXTENT_FLAG_MASK) +
+						(UDF_EXTENT_LENGTH_MASK + 1) -
+						blocksize;
+			} else {
+				li->extLength = lip1->extLength +
+					(((li->extLength &
+						UDF_EXTENT_LENGTH_MASK) +
+					  blocksize - 1) & ~(blocksize - 1));
+				if (*endnum > (i + 2))
+					memmove(&laarr[i + 1], &laarr[i + 2],
+						sizeof(struct long_ad) *
+						(*endnum - (i + 2)));
+				i--;
+				(*endnum)--;
+			}
+		} else if ((li->extLength >> 30) ==
+					(EXT_NOT_RECORDED_ALLOCATED >> 30)) {
+			udf_free_blocks(inode->i_sb, inode,
+					&li->extLocation, 0,
+					((li->extLength &
+						UDF_EXTENT_LENGTH_MASK) +
+					 blocksize - 1) >> blocksize_bits);
+			li->extLocation.logicalBlockNum = 0;
+			li->extLocation.partitionReferenceNum = 0;
+			li->extLength = (li->extLength &
+						UDF_EXTENT_LENGTH_MASK) |
+						EXT_NOT_RECORDED_NOT_ALLOCATED;
+		}
+	}
+}
+
+static void udf_update_extents(struct inode *inode,
+			       struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			       int startnum, int endnum,
+			       struct extent_position *epos)
+{
+	int start = 0, i;
+	struct kernel_lb_addr tmploc;
+	uint32_t tmplen;
+
+	if (startnum > endnum) {
+		for (i = 0; i < (startnum - endnum); i++)
+			udf_delete_aext(inode, *epos, laarr[i].extLocation,
+					laarr[i].extLength);
+	} else if (startnum < endnum) {
+		for (i = 0; i < (endnum - startnum); i++) {
+			udf_insert_aext(inode, *epos, laarr[i].extLocation,
+					laarr[i].extLength);
+			udf_next_aext(inode, epos, &laarr[i].extLocation,
+				      &laarr[i].extLength, 1);
+			start++;
+		}
+	}
+
+	for (i = start; i < endnum; i++) {
+		udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
+		udf_write_aext(inode, epos, &laarr[i].extLocation,
+			       laarr[i].extLength, 1);
+	}
+}
+
+struct buffer_head *udf_bread(struct inode *inode, int block,
+			      int create, int *err)
+{
+	struct buffer_head *bh = NULL;
+
+	bh = udf_getblk(inode, block, create, err);
+	if (!bh)
+		return NULL;
+
+	if (buffer_uptodate(bh))
+		return bh;
+
+	ll_rw_block(READ, 1, &bh);
+
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return bh;
+
+	brelse(bh);
+	*err = -EIO;
+	return NULL;
+}
+
+int udf_setsize(struct inode *inode, loff_t newsize)
+{
+	int err;
+	struct udf_inode_info *iinfo;
+	int bsize = 1 << inode->i_blkbits;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	      S_ISLNK(inode->i_mode)))
+		return -EINVAL;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	iinfo = UDF_I(inode);
+	if (newsize > inode->i_size) {
+		down_write(&iinfo->i_data_sem);
+		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+			if (bsize <
+			    (udf_file_entry_alloc_offset(inode) + newsize)) {
+				err = udf_expand_file_adinicb(inode);
+				if (err)
+					return err;
+				down_write(&iinfo->i_data_sem);
+			} else {
+				iinfo->i_lenAlloc = newsize;
+				goto set_size;
+			}
+		}
+		err = udf_extend_file(inode, newsize);
+		if (err) {
+			up_write(&iinfo->i_data_sem);
+			return err;
+		}
+set_size:
+		truncate_setsize(inode, newsize);
+		up_write(&iinfo->i_data_sem);
+	} else {
+		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+			down_write(&iinfo->i_data_sem);
+			udf_clear_extent_cache(inode);
+			memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
+			       0x00, bsize - newsize -
+			       udf_file_entry_alloc_offset(inode));
+			iinfo->i_lenAlloc = newsize;
+			truncate_setsize(inode, newsize);
+			up_write(&iinfo->i_data_sem);
+			goto update_time;
+		}
+		err = block_truncate_page(inode->i_mapping, newsize,
+					  udf_get_block);
+		if (err)
+			return err;
+		down_write(&iinfo->i_data_sem);
+		udf_clear_extent_cache(inode);
+		truncate_setsize(inode, newsize);
+		udf_truncate_extents(inode);
+		up_write(&iinfo->i_data_sem);
+	}
+update_time:
+	inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
+	if (IS_SYNC(inode))
+		udf_sync_inode(inode);
+	else
+		mark_inode_dirty(inode);
+	return 0;
+}
+
+/*
+ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
+static int udf_read_inode(struct inode *inode, bool hidden_inode)
+{
+	struct buffer_head *bh = NULL;
+	struct fileEntry *fe;
+	struct extendedFileEntry *efe;
+	uint16_t ident;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+	struct kernel_lb_addr *iloc = &iinfo->i_location;
+	unsigned int link_count;
+	unsigned int indirections = 0;
+	int bs = inode->i_sb->s_blocksize;
+	int ret = -EIO;
+
+reread:
+	if (iloc->logicalBlockNum >=
+	    sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
+		udf_debug("block=%d, partition=%d out of range\n",
+			  iloc->logicalBlockNum, iloc->partitionReferenceNum);
+		return -EIO;
+	}
+
+	/*
+	 * Set defaults, but the inode is still incomplete!
+	 * Note: get_new_inode() sets the following on a new inode:
+	 *      i_sb = sb
+	 *      i_no = ino
+	 *      i_flags = sb->s_flags
+	 *      i_state = 0
+	 * clean_inode(): zero fills and sets
+	 *      i_count = 1
+	 *      i_nlink = 1
+	 *      i_op = NULL;
+	 */
+	bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident);
+	if (!bh) {
+		udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
+		return -EIO;
+	}
+
+	if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
+	    ident != TAG_IDENT_USE) {
+		udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
+			inode->i_ino, ident);
+		goto out;
+	}
+
+	fe = (struct fileEntry *)bh->b_data;
+	efe = (struct extendedFileEntry *)bh->b_data;
+
+	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
+		struct buffer_head *ibh;
+
+		ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident);
+		if (ident == TAG_IDENT_IE && ibh) {
+			struct kernel_lb_addr loc;
+			struct indirectEntry *ie;
+
+			ie = (struct indirectEntry *)ibh->b_data;
+			loc = lelb_to_cpu(ie->indirectICB.extLocation);
+
+			if (ie->indirectICB.extLength) {
+				brelse(ibh);
+				memcpy(&iinfo->i_location, &loc,
+				       sizeof(struct kernel_lb_addr));
+				if (++indirections > UDF_MAX_ICB_NESTING) {
+					udf_err(inode->i_sb,
+						"too many ICBs in ICB hierarchy"
+						" (max %d supported)\n",
+						UDF_MAX_ICB_NESTING);
+					goto out;
+				}
+				brelse(bh);
+				goto reread;
+			}
+		}
+		brelse(ibh);
+	} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
+		udf_err(inode->i_sb, "unsupported strategy type: %d\n",
+			le16_to_cpu(fe->icbTag.strategyType));
+		goto out;
+	}
+	if (fe->icbTag.strategyType == cpu_to_le16(4))
+		iinfo->i_strat4096 = 0;
+	else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */
+		iinfo->i_strat4096 = 1;
+
+	iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) &
+							ICBTAG_FLAG_AD_MASK;
+	iinfo->i_unique = 0;
+	iinfo->i_lenEAttr = 0;
+	iinfo->i_lenExtents = 0;
+	iinfo->i_lenAlloc = 0;
+	iinfo->i_next_alloc_block = 0;
+	iinfo->i_next_alloc_goal = 0;
+	if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
+		iinfo->i_efe = 1;
+		iinfo->i_use = 0;
+		ret = udf_alloc_i_data(inode, bs -
+					sizeof(struct extendedFileEntry));
+		if (ret)
+			goto out;
+		memcpy(iinfo->i_ext.i_data,
+		       bh->b_data + sizeof(struct extendedFileEntry),
+		       bs - sizeof(struct extendedFileEntry));
+	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
+		iinfo->i_efe = 0;
+		iinfo->i_use = 0;
+		ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
+		if (ret)
+			goto out;
+		memcpy(iinfo->i_ext.i_data,
+		       bh->b_data + sizeof(struct fileEntry),
+		       bs - sizeof(struct fileEntry));
+	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
+		iinfo->i_efe = 0;
+		iinfo->i_use = 1;
+		iinfo->i_lenAlloc = le32_to_cpu(
+				((struct unallocSpaceEntry *)bh->b_data)->
+				 lengthAllocDescs);
+		ret = udf_alloc_i_data(inode, bs -
+					sizeof(struct unallocSpaceEntry));
+		if (ret)
+			goto out;
+		memcpy(iinfo->i_ext.i_data,
+		       bh->b_data + sizeof(struct unallocSpaceEntry),
+		       bs - sizeof(struct unallocSpaceEntry));
+		return 0;
+	}
+
+	ret = -EIO;
+	read_lock(&sbi->s_cred_lock);
+	i_uid_write(inode, le32_to_cpu(fe->uid));
+	if (!uid_valid(inode->i_uid) ||
+	    UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
+	    UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET))
+		inode->i_uid = UDF_SB(inode->i_sb)->s_uid;
+
+	i_gid_write(inode, le32_to_cpu(fe->gid));
+	if (!gid_valid(inode->i_gid) ||
+	    UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) ||
+	    UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
+		inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
+
+	if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
+			sbi->s_fmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_fmode;
+	else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
+			sbi->s_dmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_dmode;
+	else
+		inode->i_mode = udf_convert_permissions(fe);
+	inode->i_mode &= ~sbi->s_umask;
+	read_unlock(&sbi->s_cred_lock);
+
+	link_count = le16_to_cpu(fe->fileLinkCount);
+	if (!link_count) {
+		if (!hidden_inode) {
+			ret = -ESTALE;
+			goto out;
+		}
+		link_count = 1;
+	}
+	set_nlink(inode, link_count);
+
+	inode->i_size = le64_to_cpu(fe->informationLength);
+	iinfo->i_lenExtents = inode->i_size;
+
+	if (iinfo->i_efe == 0) {
+		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
+			(inode->i_sb->s_blocksize_bits - 9);
+
+		if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime))
+			inode->i_atime = sbi->s_record_time;
+
+		if (!udf_disk_stamp_to_time(&inode->i_mtime,
+					    fe->modificationTime))
+			inode->i_mtime = sbi->s_record_time;
+
+		if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime))
+			inode->i_ctime = sbi->s_record_time;
+
+		iinfo->i_unique = le64_to_cpu(fe->uniqueID);
+		iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
+		iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs);
+		iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint);
+	} else {
+		inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
+		    (inode->i_sb->s_blocksize_bits - 9);
+
+		if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime))
+			inode->i_atime = sbi->s_record_time;
+
+		if (!udf_disk_stamp_to_time(&inode->i_mtime,
+					    efe->modificationTime))
+			inode->i_mtime = sbi->s_record_time;
+
+		if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime))
+			iinfo->i_crtime = sbi->s_record_time;
+
+		if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime))
+			inode->i_ctime = sbi->s_record_time;
+
+		iinfo->i_unique = le64_to_cpu(efe->uniqueID);
+		iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
+		iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
+		iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
+	}
+	inode->i_generation = iinfo->i_unique;
+
+	/*
+	 * Sanity check length of allocation descriptors and extended attrs to
+	 * avoid integer overflows
+	 */
+	if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs)
+		goto out;
+	/* Now do exact checks */
+	if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs)
+		goto out;
+	/* Sanity checks for files in ICB so that we don't get confused later */
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+		/*
+		 * For file in ICB data is stored in allocation descriptor
+		 * so sizes should match
+		 */
+		if (iinfo->i_lenAlloc != inode->i_size)
+			goto out;
+		/* File in ICB has to fit in there... */
+		if (inode->i_size > bs - udf_file_entry_alloc_offset(inode))
+			goto out;
+	}
+
+	switch (fe->icbTag.fileType) {
+	case ICBTAG_FILE_TYPE_DIRECTORY:
+		inode->i_op = &udf_dir_inode_operations;
+		inode->i_fop = &udf_dir_operations;
+		inode->i_mode |= S_IFDIR;
+		inc_nlink(inode);
+		break;
+	case ICBTAG_FILE_TYPE_REALTIME:
+	case ICBTAG_FILE_TYPE_REGULAR:
+	case ICBTAG_FILE_TYPE_UNDEF:
+	case ICBTAG_FILE_TYPE_VAT20:
+		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+			inode->i_data.a_ops = &udf_adinicb_aops;
+		else
+			inode->i_data.a_ops = &udf_aops;
+		inode->i_op = &udf_file_inode_operations;
+		inode->i_fop = &udf_file_operations;
+		inode->i_mode |= S_IFREG;
+		break;
+	case ICBTAG_FILE_TYPE_BLOCK:
+		inode->i_mode |= S_IFBLK;
+		break;
+	case ICBTAG_FILE_TYPE_CHAR:
+		inode->i_mode |= S_IFCHR;
+		break;
+	case ICBTAG_FILE_TYPE_FIFO:
+		init_special_inode(inode, inode->i_mode | S_IFIFO, 0);
+		break;
+	case ICBTAG_FILE_TYPE_SOCKET:
+		init_special_inode(inode, inode->i_mode | S_IFSOCK, 0);
+		break;
+	case ICBTAG_FILE_TYPE_SYMLINK:
+		inode->i_data.a_ops = &udf_symlink_aops;
+		inode->i_op = &udf_symlink_inode_operations;
+		inode->i_mode = S_IFLNK | S_IRWXUGO;
+		break;
+	case ICBTAG_FILE_TYPE_MAIN:
+		udf_debug("METADATA FILE-----\n");
+		break;
+	case ICBTAG_FILE_TYPE_MIRROR:
+		udf_debug("METADATA MIRROR FILE-----\n");
+		break;
+	case ICBTAG_FILE_TYPE_BITMAP:
+		udf_debug("METADATA BITMAP FILE-----\n");
+		break;
+	default:
+		udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
+			inode->i_ino, fe->icbTag.fileType);
+		goto out;
+	}
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		struct deviceSpec *dsea =
+			(struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
+		if (dsea) {
+			init_special_inode(inode, inode->i_mode,
+				MKDEV(le32_to_cpu(dsea->majorDeviceIdent),
+				      le32_to_cpu(dsea->minorDeviceIdent)));
+			/* Developer ID ??? */
+		} else
+			goto out;
+	}
+	ret = 0;
+out:
+	brelse(bh);
+	return ret;
+}
+
+static int udf_alloc_i_data(struct inode *inode, size_t size)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL);
+
+	if (!iinfo->i_ext.i_data) {
+		udf_err(inode->i_sb, "(ino %ld) no free memory\n",
+			inode->i_ino);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static umode_t udf_convert_permissions(struct fileEntry *fe)
+{
+	umode_t mode;
+	uint32_t permissions;
+	uint32_t flags;
+
+	permissions = le32_to_cpu(fe->permissions);
+	flags = le16_to_cpu(fe->icbTag.flags);
+
+	mode =	((permissions) & S_IRWXO) |
+		((permissions >> 2) & S_IRWXG) |
+		((permissions >> 4) & S_IRWXU) |
+		((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) |
+		((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) |
+		((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0);
+
+	return mode;
+}
+
+int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
+static int udf_sync_inode(struct inode *inode)
+{
+	return udf_update_inode(inode, 1);
+}
+
+static int udf_update_inode(struct inode *inode, int do_sync)
+{
+	struct buffer_head *bh = NULL;
+	struct fileEntry *fe;
+	struct extendedFileEntry *efe;
+	uint64_t lb_recorded;
+	uint32_t udfperms;
+	uint16_t icbflags;
+	uint16_t crclen;
+	int err = 0;
+	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	bh = udf_tgetblk(inode->i_sb,
+			udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
+	if (!bh) {
+		udf_debug("getblk failure\n");
+		return -EIO;
+	}
+
+	lock_buffer(bh);
+	memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+	fe = (struct fileEntry *)bh->b_data;
+	efe = (struct extendedFileEntry *)bh->b_data;
+
+	if (iinfo->i_use) {
+		struct unallocSpaceEntry *use =
+			(struct unallocSpaceEntry *)bh->b_data;
+
+		use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
+		memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
+		       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
+					sizeof(struct unallocSpaceEntry));
+		use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
+		use->descTag.tagLocation =
+				cpu_to_le32(iinfo->i_location.logicalBlockNum);
+		crclen = sizeof(struct unallocSpaceEntry) +
+				iinfo->i_lenAlloc - sizeof(struct tag);
+		use->descTag.descCRCLength = cpu_to_le16(crclen);
+		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
+							   sizeof(struct tag),
+							   crclen));
+		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
+
+		goto out;
+	}
+
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
+		fe->uid = cpu_to_le32(-1);
+	else
+		fe->uid = cpu_to_le32(i_uid_read(inode));
+
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET))
+		fe->gid = cpu_to_le32(-1);
+	else
+		fe->gid = cpu_to_le32(i_gid_read(inode));
+
+	udfperms = ((inode->i_mode & S_IRWXO)) |
+		   ((inode->i_mode & S_IRWXG) << 2) |
+		   ((inode->i_mode & S_IRWXU) << 4);
+
+	udfperms |= (le32_to_cpu(fe->permissions) &
+		    (FE_PERM_O_DELETE | FE_PERM_O_CHATTR |
+		     FE_PERM_G_DELETE | FE_PERM_G_CHATTR |
+		     FE_PERM_U_DELETE | FE_PERM_U_CHATTR));
+	fe->permissions = cpu_to_le32(udfperms);
+
+	if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
+		fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
+	else
+		fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
+
+	fe->informationLength = cpu_to_le64(inode->i_size);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		struct regid *eid;
+		struct deviceSpec *dsea =
+			(struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
+		if (!dsea) {
+			dsea = (struct deviceSpec *)
+				udf_add_extendedattr(inode,
+						     sizeof(struct deviceSpec) +
+						     sizeof(struct regid), 12, 0x3);
+			dsea->attrType = cpu_to_le32(12);
+			dsea->attrSubtype = 1;
+			dsea->attrLength = cpu_to_le32(
+						sizeof(struct deviceSpec) +
+						sizeof(struct regid));
+			dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
+		}
+		eid = (struct regid *)dsea->impUse;
+		memset(eid, 0, sizeof(struct regid));
+		strcpy(eid->ident, UDF_ID_DEVELOPER);
+		eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
+		eid->identSuffix[1] = UDF_OS_ID_LINUX;
+		dsea->majorDeviceIdent = cpu_to_le32(imajor(inode));
+		dsea->minorDeviceIdent = cpu_to_le32(iminor(inode));
+	}
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		lb_recorded = 0; /* No extents => no blocks! */
+	else
+		lb_recorded =
+			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
+			(blocksize_bits - 9);
+
+	if (iinfo->i_efe == 0) {
+		memcpy(bh->b_data + sizeof(struct fileEntry),
+		       iinfo->i_ext.i_data,
+		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
+		fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
+
+		udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
+		udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
+		udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
+		memset(&(fe->impIdent), 0, sizeof(struct regid));
+		strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
+		fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
+		fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
+		fe->uniqueID = cpu_to_le64(iinfo->i_unique);
+		fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
+		fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
+		fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
+		fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE);
+		crclen = sizeof(struct fileEntry);
+	} else {
+		memcpy(bh->b_data + sizeof(struct extendedFileEntry),
+		       iinfo->i_ext.i_data,
+		       inode->i_sb->s_blocksize -
+					sizeof(struct extendedFileEntry));
+		efe->objectSize = cpu_to_le64(inode->i_size);
+		efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
+
+		if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec ||
+		    (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec &&
+		     iinfo->i_crtime.tv_nsec > inode->i_atime.tv_nsec))
+			iinfo->i_crtime = inode->i_atime;
+
+		if (iinfo->i_crtime.tv_sec > inode->i_mtime.tv_sec ||
+		    (iinfo->i_crtime.tv_sec == inode->i_mtime.tv_sec &&
+		     iinfo->i_crtime.tv_nsec > inode->i_mtime.tv_nsec))
+			iinfo->i_crtime = inode->i_mtime;
+
+		if (iinfo->i_crtime.tv_sec > inode->i_ctime.tv_sec ||
+		    (iinfo->i_crtime.tv_sec == inode->i_ctime.tv_sec &&
+		     iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec))
+			iinfo->i_crtime = inode->i_ctime;
+
+		udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
+		udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
+		udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
+		udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
+
+		memset(&(efe->impIdent), 0, sizeof(struct regid));
+		strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
+		efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
+		efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
+		efe->uniqueID = cpu_to_le64(iinfo->i_unique);
+		efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
+		efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
+		efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
+		efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE);
+		crclen = sizeof(struct extendedFileEntry);
+	}
+	if (iinfo->i_strat4096) {
+		fe->icbTag.strategyType = cpu_to_le16(4096);
+		fe->icbTag.strategyParameter = cpu_to_le16(1);
+		fe->icbTag.numEntries = cpu_to_le16(2);
+	} else {
+		fe->icbTag.strategyType = cpu_to_le16(4);
+		fe->icbTag.numEntries = cpu_to_le16(1);
+	}
+
+	if (S_ISDIR(inode->i_mode))
+		fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY;
+	else if (S_ISREG(inode->i_mode))
+		fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR;
+	else if (S_ISLNK(inode->i_mode))
+		fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK;
+	else if (S_ISBLK(inode->i_mode))
+		fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK;
+	else if (S_ISCHR(inode->i_mode))
+		fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR;
+	else if (S_ISFIFO(inode->i_mode))
+		fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO;
+	else if (S_ISSOCK(inode->i_mode))
+		fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET;
+
+	icbflags =	iinfo->i_alloc_type |
+			((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) |
+			((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) |
+			((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) |
+			(le16_to_cpu(fe->icbTag.flags) &
+				~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID |
+				ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY));
+
+	fe->icbTag.flags = cpu_to_le16(icbflags);
+	if (sbi->s_udfrev >= 0x0200)
+		fe->descTag.descVersion = cpu_to_le16(3);
+	else
+		fe->descTag.descVersion = cpu_to_le16(2);
+	fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
+	fe->descTag.tagLocation = cpu_to_le32(
+					iinfo->i_location.logicalBlockNum);
+	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
+	fe->descTag.descCRCLength = cpu_to_le16(crclen);
+	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
+						  crclen));
+	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
+
+out:
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	/* write the data blocks */
+	mark_buffer_dirty(bh);
+	if (do_sync) {
+		sync_dirty_buffer(bh);
+		if (buffer_write_io_error(bh)) {
+			udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n",
+				 inode->i_ino);
+			err = -EIO;
+		}
+	}
+	brelse(bh);
+
+	return err;
+}
+
+struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
+			 bool hidden_inode)
+{
+	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
+	struct inode *inode = iget_locked(sb, block);
+	int err;
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
+	err = udf_read_inode(inode, hidden_inode);
+	if (err < 0) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+	unlock_new_inode(inode);
+
+	return inode;
+}
+
+int udf_add_aext(struct inode *inode, struct extent_position *epos,
+		 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+	int adsize;
+	struct short_ad *sad = NULL;
+	struct long_ad *lad = NULL;
+	struct allocExtDesc *aed;
+	uint8_t *ptr;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (!epos->bh)
+		ptr = iinfo->i_ext.i_data + epos->offset -
+			udf_file_entry_alloc_offset(inode) +
+			iinfo->i_lenEAttr;
+	else
+		ptr = epos->bh->b_data + epos->offset;
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		return -EIO;
+
+	if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
+		unsigned char *sptr, *dptr;
+		struct buffer_head *nbh;
+		int err, loffset;
+		struct kernel_lb_addr obloc = epos->block;
+
+		epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
+						obloc.partitionReferenceNum,
+						obloc.logicalBlockNum, &err);
+		if (!epos->block.logicalBlockNum)
+			return -ENOSPC;
+		nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
+								 &epos->block,
+								 0));
+		if (!nbh)
+			return -EIO;
+		lock_buffer(nbh);
+		memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
+		set_buffer_uptodate(nbh);
+		unlock_buffer(nbh);
+		mark_buffer_dirty_inode(nbh, inode);
+
+		aed = (struct allocExtDesc *)(nbh->b_data);
+		if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
+			aed->previousAllocExtLocation =
+					cpu_to_le32(obloc.logicalBlockNum);
+		if (epos->offset + adsize > inode->i_sb->s_blocksize) {
+			loffset = epos->offset;
+			aed->lengthAllocDescs = cpu_to_le32(adsize);
+			sptr = ptr - adsize;
+			dptr = nbh->b_data + sizeof(struct allocExtDesc);
+			memcpy(dptr, sptr, adsize);
+			epos->offset = sizeof(struct allocExtDesc) + adsize;
+		} else {
+			loffset = epos->offset + adsize;
+			aed->lengthAllocDescs = cpu_to_le32(0);
+			sptr = ptr;
+			epos->offset = sizeof(struct allocExtDesc);
+
+			if (epos->bh) {
+				aed = (struct allocExtDesc *)epos->bh->b_data;
+				le32_add_cpu(&aed->lengthAllocDescs, adsize);
+			} else {
+				iinfo->i_lenAlloc += adsize;
+				mark_inode_dirty(inode);
+			}
+		}
+		if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
+			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
+				    epos->block.logicalBlockNum, sizeof(struct tag));
+		else
+			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
+				    epos->block.logicalBlockNum, sizeof(struct tag));
+		switch (iinfo->i_alloc_type) {
+		case ICBTAG_FLAG_AD_SHORT:
+			sad = (struct short_ad *)sptr;
+			sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
+						     inode->i_sb->s_blocksize);
+			sad->extPosition =
+				cpu_to_le32(epos->block.logicalBlockNum);
+			break;
+		case ICBTAG_FLAG_AD_LONG:
+			lad = (struct long_ad *)sptr;
+			lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
+						     inode->i_sb->s_blocksize);
+			lad->extLocation = cpu_to_lelb(epos->block);
+			memset(lad->impUse, 0x00, sizeof(lad->impUse));
+			break;
+		}
+		if (epos->bh) {
+			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
+			    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
+				udf_update_tag(epos->bh->b_data, loffset);
+			else
+				udf_update_tag(epos->bh->b_data,
+						sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(epos->bh, inode);
+			brelse(epos->bh);
+		} else {
+			mark_inode_dirty(inode);
+		}
+		epos->bh = nbh;
+	}
+
+	udf_write_aext(inode, epos, eloc, elen, inc);
+
+	if (!epos->bh) {
+		iinfo->i_lenAlloc += adsize;
+		mark_inode_dirty(inode);
+	} else {
+		aed = (struct allocExtDesc *)epos->bh->b_data;
+		le32_add_cpu(&aed->lengthAllocDescs, adsize);
+		if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
+				UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
+			udf_update_tag(epos->bh->b_data,
+					epos->offset + (inc ? 0 : adsize));
+		else
+			udf_update_tag(epos->bh->b_data,
+					sizeof(struct allocExtDesc));
+		mark_buffer_dirty_inode(epos->bh, inode);
+	}
+
+	return 0;
+}
+
+void udf_write_aext(struct inode *inode, struct extent_position *epos,
+		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+	int adsize;
+	uint8_t *ptr;
+	struct short_ad *sad;
+	struct long_ad *lad;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (!epos->bh)
+		ptr = iinfo->i_ext.i_data + epos->offset -
+			udf_file_entry_alloc_offset(inode) +
+			iinfo->i_lenEAttr;
+	else
+		ptr = epos->bh->b_data + epos->offset;
+
+	switch (iinfo->i_alloc_type) {
+	case ICBTAG_FLAG_AD_SHORT:
+		sad = (struct short_ad *)ptr;
+		sad->extLength = cpu_to_le32(elen);
+		sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
+		adsize = sizeof(struct short_ad);
+		break;
+	case ICBTAG_FLAG_AD_LONG:
+		lad = (struct long_ad *)ptr;
+		lad->extLength = cpu_to_le32(elen);
+		lad->extLocation = cpu_to_lelb(*eloc);
+		memset(lad->impUse, 0x00, sizeof(lad->impUse));
+		adsize = sizeof(struct long_ad);
+		break;
+	default:
+		return;
+	}
+
+	if (epos->bh) {
+		if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
+		    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) {
+			struct allocExtDesc *aed =
+				(struct allocExtDesc *)epos->bh->b_data;
+			udf_update_tag(epos->bh->b_data,
+				       le32_to_cpu(aed->lengthAllocDescs) +
+				       sizeof(struct allocExtDesc));
+		}
+		mark_buffer_dirty_inode(epos->bh, inode);
+	} else {
+		mark_inode_dirty(inode);
+	}
+
+	if (inc)
+		epos->offset += adsize;
+}
+
+int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
+		     struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
+{
+	int8_t etype;
+
+	while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) ==
+	       (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
+		int block;
+		epos->block = *eloc;
+		epos->offset = sizeof(struct allocExtDesc);
+		brelse(epos->bh);
+		block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
+		epos->bh = udf_tread(inode->i_sb, block);
+		if (!epos->bh) {
+			udf_debug("reading block %d failed!\n", block);
+			return -1;
+		}
+	}
+
+	return etype;
+}
+
+int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
+			struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
+{
+	int alen;
+	int8_t etype;
+	uint8_t *ptr;
+	struct short_ad *sad;
+	struct long_ad *lad;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (!epos->bh) {
+		if (!epos->offset)
+			epos->offset = udf_file_entry_alloc_offset(inode);
+		ptr = iinfo->i_ext.i_data + epos->offset -
+			udf_file_entry_alloc_offset(inode) +
+			iinfo->i_lenEAttr;
+		alen = udf_file_entry_alloc_offset(inode) +
+							iinfo->i_lenAlloc;
+	} else {
+		if (!epos->offset)
+			epos->offset = sizeof(struct allocExtDesc);
+		ptr = epos->bh->b_data + epos->offset;
+		alen = sizeof(struct allocExtDesc) +
+			le32_to_cpu(((struct allocExtDesc *)epos->bh->b_data)->
+							lengthAllocDescs);
+	}
+
+	switch (iinfo->i_alloc_type) {
+	case ICBTAG_FLAG_AD_SHORT:
+		sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc);
+		if (!sad)
+			return -1;
+		etype = le32_to_cpu(sad->extLength) >> 30;
+		eloc->logicalBlockNum = le32_to_cpu(sad->extPosition);
+		eloc->partitionReferenceNum =
+				iinfo->i_location.partitionReferenceNum;
+		*elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK;
+		break;
+	case ICBTAG_FLAG_AD_LONG:
+		lad = udf_get_filelongad(ptr, alen, &epos->offset, inc);
+		if (!lad)
+			return -1;
+		etype = le32_to_cpu(lad->extLength) >> 30;
+		*eloc = lelb_to_cpu(lad->extLocation);
+		*elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK;
+		break;
+	default:
+		udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type);
+		return -1;
+	}
+
+	return etype;
+}
+
+static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
+			      struct kernel_lb_addr neloc, uint32_t nelen)
+{
+	struct kernel_lb_addr oeloc;
+	uint32_t oelen;
+	int8_t etype;
+
+	if (epos.bh)
+		get_bh(epos.bh);
+
+	while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
+		udf_write_aext(inode, &epos, &neloc, nelen, 1);
+		neloc = oeloc;
+		nelen = (etype << 30) | oelen;
+	}
+	udf_add_aext(inode, &epos, &neloc, nelen, 1);
+	brelse(epos.bh);
+
+	return (nelen >> 30);
+}
+
+int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
+		       struct kernel_lb_addr eloc, uint32_t elen)
+{
+	struct extent_position oepos;
+	int adsize;
+	int8_t etype;
+	struct allocExtDesc *aed;
+	struct udf_inode_info *iinfo;
+
+	if (epos.bh) {
+		get_bh(epos.bh);
+		get_bh(epos.bh);
+	}
+
+	iinfo = UDF_I(inode);
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		adsize = 0;
+
+	oepos = epos;
+	if (udf_next_aext(inode, &epos, &eloc, &elen, 1) == -1)
+		return -1;
+
+	while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
+		udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
+		if (oepos.bh != epos.bh) {
+			oepos.block = epos.block;
+			brelse(oepos.bh);
+			get_bh(epos.bh);
+			oepos.bh = epos.bh;
+			oepos.offset = epos.offset - adsize;
+		}
+	}
+	memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
+	elen = 0;
+
+	if (epos.bh != oepos.bh) {
+		udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
+		if (!oepos.bh) {
+			iinfo->i_lenAlloc -= (adsize * 2);
+			mark_inode_dirty(inode);
+		} else {
+			aed = (struct allocExtDesc *)oepos.bh->b_data;
+			le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize));
+			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
+			    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
+				udf_update_tag(oepos.bh->b_data,
+						oepos.offset - (2 * adsize));
+			else
+				udf_update_tag(oepos.bh->b_data,
+						sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(oepos.bh, inode);
+		}
+	} else {
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
+		if (!oepos.bh) {
+			iinfo->i_lenAlloc -= adsize;
+			mark_inode_dirty(inode);
+		} else {
+			aed = (struct allocExtDesc *)oepos.bh->b_data;
+			le32_add_cpu(&aed->lengthAllocDescs, -adsize);
+			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
+			    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
+				udf_update_tag(oepos.bh->b_data,
+						epos.offset - adsize);
+			else
+				udf_update_tag(oepos.bh->b_data,
+						sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(oepos.bh, inode);
+		}
+	}
+
+	brelse(epos.bh);
+	brelse(oepos.bh);
+
+	return (elen >> 30);
+}
+
+int8_t inode_bmap(struct inode *inode, sector_t block,
+		  struct extent_position *pos, struct kernel_lb_addr *eloc,
+		  uint32_t *elen, sector_t *offset)
+{
+	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
+	loff_t lbcount = 0, bcount =
+	    (loff_t) block << blocksize_bits;
+	int8_t etype;
+	struct udf_inode_info *iinfo;
+
+	iinfo = UDF_I(inode);
+	if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) {
+		pos->offset = 0;
+		pos->block = iinfo->i_location;
+		pos->bh = NULL;
+	}
+	*elen = 0;
+	do {
+		etype = udf_next_aext(inode, pos, eloc, elen, 1);
+		if (etype == -1) {
+			*offset = (bcount - lbcount) >> blocksize_bits;
+			iinfo->i_lenExtents = lbcount;
+			return -1;
+		}
+		lbcount += *elen;
+	} while (lbcount <= bcount);
+	/* update extent cache */
+	udf_update_extent_cache(inode, lbcount - *elen, pos, 1);
+	*offset = (bcount + *elen - lbcount) >> blocksize_bits;
+
+	return etype;
+}
+
+long udf_block_map(struct inode *inode, sector_t block)
+{
+	struct kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
+	struct extent_position epos = {};
+	int ret;
+
+	down_read(&UDF_I(inode)->i_data_sem);
+
+	if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
+						(EXT_RECORDED_ALLOCATED >> 30))
+		ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
+	else
+		ret = 0;
+
+	up_read(&UDF_I(inode)->i_data_sem);
+	brelse(epos.bh);
+
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
+		return udf_fixed_to_variable(ret);
+	else
+		return ret;
+}
diff --git a/kernel/fs/udf/lowlevel.c b/kernel/fs/udf/lowlevel.c
new file mode 100644
index 000000000..6ad5a453a
--- /dev/null
+++ b/kernel/fs/udf/lowlevel.c
@@ -0,0 +1,67 @@
+/*
+ * lowlevel.c
+ *
+ * PURPOSE
+ *  Low Level Device Routines for the UDF filesystem
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1999-2001 Ben Fennema
+ *
+ * HISTORY
+ *
+ *  03/26/99 blf  Created.
+ */
+
+#include "udfdecl.h"
+
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/uaccess.h>
+
+#include "udf_sb.h"
+
+unsigned int udf_get_last_session(struct super_block *sb)
+{
+	struct cdrom_multisession ms_info;
+	unsigned int vol_desc_start;
+	struct block_device *bdev = sb->s_bdev;
+	int i;
+
+	vol_desc_start = 0;
+	ms_info.addr_format = CDROM_LBA;
+	i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
+
+	if (i == 0) {
+		udf_debug("XA disk: %s, vol_desc_start=%d\n",
+			  ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba);
+		if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
+			vol_desc_start = ms_info.addr.lba;
+	} else {
+		udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
+	}
+	return vol_desc_start;
+}
+
+unsigned long udf_get_last_block(struct super_block *sb)
+{
+	struct block_device *bdev = sb->s_bdev;
+	unsigned long lblock = 0;
+
+	/*
+	 * ioctl failed or returned obviously bogus value?
+	 * Try using the device size...
+	 */
+	if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) ||
+	    lblock == 0)
+		lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+
+	if (lblock)
+		return lblock - 1;
+	else
+		return 0;
+}
diff --git a/kernel/fs/udf/misc.c b/kernel/fs/udf/misc.c
new file mode 100644
index 000000000..71d1c25f3
--- /dev/null
+++ b/kernel/fs/udf/misc.c
@@ -0,0 +1,298 @@
+/*
+ * misc.c
+ *
+ * PURPOSE
+ *	Miscellaneous routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998 Dave Boynton
+ *  (C) 1998-2004 Ben Fennema
+ *  (C) 1999-2000 Stelias Computing Inc
+ *
+ * HISTORY
+ *
+ *  04/19/99 blf  partial support for reading/writing specific EA's
+ */
+
+#include "udfdecl.h"
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/crc-itu-t.h>
+
+#include "udf_i.h"
+#include "udf_sb.h"
+
+struct buffer_head *udf_tgetblk(struct super_block *sb, int block)
+{
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV))
+		return sb_getblk(sb, udf_fixed_to_variable(block));
+	else
+		return sb_getblk(sb, block);
+}
+
+struct buffer_head *udf_tread(struct super_block *sb, int block)
+{
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV))
+		return sb_bread(sb, udf_fixed_to_variable(block));
+	else
+		return sb_bread(sb, block);
+}
+
+struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
+					   uint32_t type, uint8_t loc)
+{
+	uint8_t *ea = NULL, *ad = NULL;
+	int offset;
+	uint16_t crclen;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	ea = iinfo->i_ext.i_data;
+	if (iinfo->i_lenEAttr) {
+		ad = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
+	} else {
+		ad = ea;
+		size += sizeof(struct extendedAttrHeaderDesc);
+	}
+
+	offset = inode->i_sb->s_blocksize - udf_file_entry_alloc_offset(inode) -
+		iinfo->i_lenAlloc;
+
+	/* TODO - Check for FreeEASpace */
+
+	if (loc & 0x01 && offset >= size) {
+		struct extendedAttrHeaderDesc *eahd;
+		eahd = (struct extendedAttrHeaderDesc *)ea;
+
+		if (iinfo->i_lenAlloc)
+			memmove(&ad[size], ad, iinfo->i_lenAlloc);
+
+		if (iinfo->i_lenEAttr) {
+			/* check checksum/crc */
+			if (eahd->descTag.tagIdent !=
+					cpu_to_le16(TAG_IDENT_EAHD) ||
+			    le32_to_cpu(eahd->descTag.tagLocation) !=
+					iinfo->i_location.logicalBlockNum)
+				return NULL;
+		} else {
+			struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+
+			size -= sizeof(struct extendedAttrHeaderDesc);
+			iinfo->i_lenEAttr +=
+				sizeof(struct extendedAttrHeaderDesc);
+			eahd->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EAHD);
+			if (sbi->s_udfrev >= 0x0200)
+				eahd->descTag.descVersion = cpu_to_le16(3);
+			else
+				eahd->descTag.descVersion = cpu_to_le16(2);
+			eahd->descTag.tagSerialNum =
+					cpu_to_le16(sbi->s_serial_number);
+			eahd->descTag.tagLocation = cpu_to_le32(
+					iinfo->i_location.logicalBlockNum);
+			eahd->impAttrLocation = cpu_to_le32(0xFFFFFFFF);
+			eahd->appAttrLocation = cpu_to_le32(0xFFFFFFFF);
+		}
+
+		offset = iinfo->i_lenEAttr;
+		if (type < 2048) {
+			if (le32_to_cpu(eahd->appAttrLocation) <
+					iinfo->i_lenEAttr) {
+				uint32_t aal =
+					le32_to_cpu(eahd->appAttrLocation);
+				memmove(&ea[offset - aal + size],
+					&ea[aal], offset - aal);
+				offset -= aal;
+				eahd->appAttrLocation =
+						cpu_to_le32(aal + size);
+			}
+			if (le32_to_cpu(eahd->impAttrLocation) <
+					iinfo->i_lenEAttr) {
+				uint32_t ial =
+					le32_to_cpu(eahd->impAttrLocation);
+				memmove(&ea[offset - ial + size],
+					&ea[ial], offset - ial);
+				offset -= ial;
+				eahd->impAttrLocation =
+						cpu_to_le32(ial + size);
+			}
+		} else if (type < 65536) {
+			if (le32_to_cpu(eahd->appAttrLocation) <
+					iinfo->i_lenEAttr) {
+				uint32_t aal =
+					le32_to_cpu(eahd->appAttrLocation);
+				memmove(&ea[offset - aal + size],
+					&ea[aal], offset - aal);
+				offset -= aal;
+				eahd->appAttrLocation =
+						cpu_to_le32(aal + size);
+			}
+		}
+		/* rewrite CRC + checksum of eahd */
+		crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
+		eahd->descTag.descCRCLength = cpu_to_le16(crclen);
+		eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
+						sizeof(struct tag), crclen));
+		eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
+		iinfo->i_lenEAttr += size;
+		return (struct genericFormat *)&ea[offset];
+	}
+	if (loc & 0x02)
+		;
+
+	return NULL;
+}
+
+struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
+					   uint8_t subtype)
+{
+	struct genericFormat *gaf;
+	uint8_t *ea = NULL;
+	uint32_t offset;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	ea = iinfo->i_ext.i_data;
+
+	if (iinfo->i_lenEAttr) {
+		struct extendedAttrHeaderDesc *eahd;
+		eahd = (struct extendedAttrHeaderDesc *)ea;
+
+		/* check checksum/crc */
+		if (eahd->descTag.tagIdent !=
+				cpu_to_le16(TAG_IDENT_EAHD) ||
+		    le32_to_cpu(eahd->descTag.tagLocation) !=
+				iinfo->i_location.logicalBlockNum)
+			return NULL;
+
+		if (type < 2048)
+			offset = sizeof(struct extendedAttrHeaderDesc);
+		else if (type < 65536)
+			offset = le32_to_cpu(eahd->impAttrLocation);
+		else
+			offset = le32_to_cpu(eahd->appAttrLocation);
+
+		while (offset < iinfo->i_lenEAttr) {
+			gaf = (struct genericFormat *)&ea[offset];
+			if (le32_to_cpu(gaf->attrType) == type &&
+					gaf->attrSubtype == subtype)
+				return gaf;
+			else
+				offset += le32_to_cpu(gaf->attrLength);
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * udf_read_tagged
+ *
+ * PURPOSE
+ *	Read the first block of a tagged descriptor.
+ *
+ * HISTORY
+ *	July 1, 1997 - Andrew E. Mileski
+ *	Written, tested, and released.
+ */
+struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
+				    uint32_t location, uint16_t *ident)
+{
+	struct tag *tag_p;
+	struct buffer_head *bh = NULL;
+	u8 checksum;
+
+	/* Read the block */
+	if (block == 0xFFFFFFFF)
+		return NULL;
+
+	bh = udf_tread(sb, block);
+	if (!bh) {
+		udf_err(sb, "read failed, block=%u, location=%d\n",
+			block, location);
+		return NULL;
+	}
+
+	tag_p = (struct tag *)(bh->b_data);
+
+	*ident = le16_to_cpu(tag_p->tagIdent);
+
+	if (location != le32_to_cpu(tag_p->tagLocation)) {
+		udf_debug("location mismatch block %u, tag %u != %u\n",
+			  block, le32_to_cpu(tag_p->tagLocation), location);
+		goto error_out;
+	}
+
+	/* Verify the tag checksum */
+	checksum = udf_tag_checksum(tag_p);
+	if (checksum != tag_p->tagChecksum) {
+		udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n",
+			block, checksum, tag_p->tagChecksum);
+		goto error_out;
+	}
+
+	/* Verify the tag version */
+	if (tag_p->descVersion != cpu_to_le16(0x0002U) &&
+	    tag_p->descVersion != cpu_to_le16(0x0003U)) {
+		udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n",
+			le16_to_cpu(tag_p->descVersion), block);
+		goto error_out;
+	}
+
+	/* Verify the descriptor CRC */
+	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
+	    le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
+					bh->b_data + sizeof(struct tag),
+					le16_to_cpu(tag_p->descCRCLength)))
+		return bh;
+
+	udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
+		  le16_to_cpu(tag_p->descCRC),
+		  le16_to_cpu(tag_p->descCRCLength));
+error_out:
+	brelse(bh);
+	return NULL;
+}
+
+struct buffer_head *udf_read_ptagged(struct super_block *sb,
+				     struct kernel_lb_addr *loc,
+				     uint32_t offset, uint16_t *ident)
+{
+	return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
+			       loc->logicalBlockNum + offset, ident);
+}
+
+void udf_update_tag(char *data, int length)
+{
+	struct tag *tptr = (struct tag *)data;
+	length -= sizeof(struct tag);
+
+	tptr->descCRCLength = cpu_to_le16(length);
+	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
+	tptr->tagChecksum = udf_tag_checksum(tptr);
+}
+
+void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
+		 uint32_t loc, int length)
+{
+	struct tag *tptr = (struct tag *)data;
+	tptr->tagIdent = cpu_to_le16(ident);
+	tptr->descVersion = cpu_to_le16(version);
+	tptr->tagSerialNum = cpu_to_le16(snum);
+	tptr->tagLocation = cpu_to_le32(loc);
+	udf_update_tag(data, length);
+}
+
+u8 udf_tag_checksum(const struct tag *t)
+{
+	u8 *data = (u8 *)t;
+	u8 checksum = 0;
+	int i;
+	for (i = 0; i < sizeof(struct tag); ++i)
+		if (i != 4) /* position of checksum */
+			checksum += data[i];
+	return checksum;
+}
diff --git a/kernel/fs/udf/namei.c b/kernel/fs/udf/namei.c
new file mode 100644
index 000000000..5c03f0dfb
--- /dev/null
+++ b/kernel/fs/udf/namei.c
@@ -0,0 +1,1300 @@
+/*
+ * namei.c
+ *
+ * PURPOSE
+ *      Inode name handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *      This file is distributed under the terms of the GNU General Public
+ *      License (GPL). Copies of the GPL can be obtained from:
+ *              ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *      Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998-2004 Ben Fennema
+ *  (C) 1999-2000 Stelias Computing Inc
+ *
+ * HISTORY
+ *
+ *  12/12/98 blf  Created. Split out the lookup code from dir.c
+ *  04/19/99 blf  link, mknod, symlink support
+ */
+
+#include "udfdecl.h"
+
+#include "udf_i.h"
+#include "udf_sb.h"
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/crc-itu-t.h>
+#include <linux/exportfs.h>
+
+static inline int udf_match(int len1, const unsigned char *name1, int len2,
+			    const unsigned char *name2)
+{
+	if (len1 != len2)
+		return 0;
+
+	return !memcmp(name1, name2, len1);
+}
+
+int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
+		 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
+		 uint8_t *impuse, uint8_t *fileident)
+{
+	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
+	uint16_t crc;
+	int offset;
+	uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
+	uint8_t lfi = cfi->lengthFileIdent;
+	int padlen = fibh->eoffset - fibh->soffset - liu - lfi -
+		sizeof(struct fileIdentDesc);
+	int adinicb = 0;
+
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		adinicb = 1;
+
+	offset = fibh->soffset + sizeof(struct fileIdentDesc);
+
+	if (impuse) {
+		if (adinicb || (offset + liu < 0)) {
+			memcpy((uint8_t *)sfi->impUse, impuse, liu);
+		} else if (offset >= 0) {
+			memcpy(fibh->ebh->b_data + offset, impuse, liu);
+		} else {
+			memcpy((uint8_t *)sfi->impUse, impuse, -offset);
+			memcpy(fibh->ebh->b_data, impuse - offset,
+				liu + offset);
+		}
+	}
+
+	offset += liu;
+
+	if (fileident) {
+		if (adinicb || (offset + lfi < 0)) {
+			memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi);
+		} else if (offset >= 0) {
+			memcpy(fibh->ebh->b_data + offset, fileident, lfi);
+		} else {
+			memcpy((uint8_t *)sfi->fileIdent + liu, fileident,
+				-offset);
+			memcpy(fibh->ebh->b_data, fileident - offset,
+				lfi + offset);
+		}
+	}
+
+	offset += lfi;
+
+	if (adinicb || (offset + padlen < 0)) {
+		memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen);
+	} else if (offset >= 0) {
+		memset(fibh->ebh->b_data + offset, 0x00, padlen);
+	} else {
+		memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset);
+		memset(fibh->ebh->b_data, 0x00, padlen + offset);
+	}
+
+	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
+		      sizeof(struct fileIdentDesc) - sizeof(struct tag));
+
+	if (fibh->sbh == fibh->ebh) {
+		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
+			      crclen + sizeof(struct tag) -
+			      sizeof(struct fileIdentDesc));
+	} else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
+		crc = crc_itu_t(crc, fibh->ebh->b_data +
+					sizeof(struct fileIdentDesc) +
+					fibh->soffset,
+			      crclen + sizeof(struct tag) -
+					sizeof(struct fileIdentDesc));
+	} else {
+		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
+			      -fibh->soffset - sizeof(struct fileIdentDesc));
+		crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset);
+	}
+
+	cfi->descTag.descCRC = cpu_to_le16(crc);
+	cfi->descTag.descCRCLength = cpu_to_le16(crclen);
+	cfi->descTag.tagChecksum = udf_tag_checksum(&cfi->descTag);
+
+	if (adinicb || (sizeof(struct fileIdentDesc) <= -fibh->soffset)) {
+		memcpy((uint8_t *)sfi, (uint8_t *)cfi,
+			sizeof(struct fileIdentDesc));
+	} else {
+		memcpy((uint8_t *)sfi, (uint8_t *)cfi, -fibh->soffset);
+		memcpy(fibh->ebh->b_data, (uint8_t *)cfi - fibh->soffset,
+		       sizeof(struct fileIdentDesc) + fibh->soffset);
+	}
+
+	if (adinicb) {
+		mark_inode_dirty(inode);
+	} else {
+		if (fibh->sbh != fibh->ebh)
+			mark_buffer_dirty_inode(fibh->ebh, inode);
+		mark_buffer_dirty_inode(fibh->sbh, inode);
+	}
+	return 0;
+}
+
+static struct fileIdentDesc *udf_find_entry(struct inode *dir,
+					    const struct qstr *child,
+					    struct udf_fileident_bh *fibh,
+					    struct fileIdentDesc *cfi)
+{
+	struct fileIdentDesc *fi = NULL;
+	loff_t f_pos;
+	int block, flen;
+	unsigned char *fname = NULL;
+	unsigned char *nameptr;
+	uint8_t lfi;
+	uint16_t liu;
+	loff_t size;
+	struct kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
+	struct extent_position epos = {};
+	struct udf_inode_info *dinfo = UDF_I(dir);
+	int isdotdot = child->len == 2 &&
+		child->name[0] == '.' && child->name[1] == '.';
+	struct super_block *sb = dir->i_sb;
+
+	size = udf_ext0_offset(dir) + dir->i_size;
+	f_pos = udf_ext0_offset(dir);
+
+	fibh->sbh = fibh->ebh = NULL;
+	fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1);
+	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos,
+		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
+			goto out_err;
+		block = udf_get_lb_pblock(sb, &eloc, offset);
+		if ((++offset << sb->s_blocksize_bits) < elen) {
+			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+				epos.offset -= sizeof(struct short_ad);
+			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+				epos.offset -= sizeof(struct long_ad);
+		} else
+			offset = 0;
+
+		fibh->sbh = fibh->ebh = udf_tread(sb, block);
+		if (!fibh->sbh)
+			goto out_err;
+	}
+
+	fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	if (!fname)
+		goto out_err;
+
+	while (f_pos < size) {
+		fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc,
+					&elen, &offset);
+		if (!fi)
+			goto out_err;
+
+		liu = le16_to_cpu(cfi->lengthOfImpUse);
+		lfi = cfi->lengthFileIdent;
+
+		if (fibh->sbh == fibh->ebh) {
+			nameptr = fi->fileIdent + liu;
+		} else {
+			int poffset;	/* Unpaded ending offset */
+
+			poffset = fibh->soffset + sizeof(struct fileIdentDesc) +
+					liu + lfi;
+
+			if (poffset >= lfi)
+				nameptr = (uint8_t *)(fibh->ebh->b_data +
+						      poffset - lfi);
+			else {
+				nameptr = fname;
+				memcpy(nameptr, fi->fileIdent + liu,
+					lfi - poffset);
+				memcpy(nameptr + lfi - poffset,
+					fibh->ebh->b_data, poffset);
+			}
+		}
+
+		if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
+			if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
+				continue;
+		}
+
+		if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) {
+			if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
+				continue;
+		}
+
+		if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
+		    isdotdot)
+			goto out_ok;
+
+		if (!lfi)
+			continue;
+
+		flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
+		if (flen && udf_match(flen, fname, child->len, child->name))
+			goto out_ok;
+	}
+
+out_err:
+	fi = NULL;
+	if (fibh->sbh != fibh->ebh)
+		brelse(fibh->ebh);
+	brelse(fibh->sbh);
+out_ok:
+	brelse(epos.bh);
+	kfree(fname);
+
+	return fi;
+}
+
+static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
+				 unsigned int flags)
+{
+	struct inode *inode = NULL;
+	struct fileIdentDesc cfi;
+	struct udf_fileident_bh fibh;
+
+	if (dentry->d_name.len > UDF_NAME_LEN - 2)
+		return ERR_PTR(-ENAMETOOLONG);
+
+#ifdef UDF_RECOVERY
+	/* temporary shorthand for specifying files by inode number */
+	if (!strncmp(dentry->d_name.name, ".B=", 3)) {
+		struct kernel_lb_addr lb = {
+			.logicalBlockNum = 0,
+			.partitionReferenceNum =
+				simple_strtoul(dentry->d_name.name + 3,
+						NULL, 0),
+		};
+		inode = udf_iget(dir->i_sb, lb);
+		if (IS_ERR(inode))
+			return inode;
+	} else
+#endif /* UDF_RECOVERY */
+
+	if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+		struct kernel_lb_addr loc;
+
+		if (fibh.sbh != fibh.ebh)
+			brelse(fibh.ebh);
+		brelse(fibh.sbh);
+
+		loc = lelb_to_cpu(cfi.icb.extLocation);
+		inode = udf_iget(dir->i_sb, &loc);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+
+	return d_splice_alias(inode, dentry);
+}
+
+static struct fileIdentDesc *udf_add_entry(struct inode *dir,
+					   struct dentry *dentry,
+					   struct udf_fileident_bh *fibh,
+					   struct fileIdentDesc *cfi, int *err)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fileIdentDesc *fi = NULL;
+	unsigned char *name = NULL;
+	int namelen;
+	loff_t f_pos;
+	loff_t size = udf_ext0_offset(dir) + dir->i_size;
+	int nfidlen;
+	uint8_t lfi;
+	uint16_t liu;
+	int block;
+	struct kernel_lb_addr eloc;
+	uint32_t elen = 0;
+	sector_t offset;
+	struct extent_position epos = {};
+	struct udf_inode_info *dinfo;
+
+	fibh->sbh = fibh->ebh = NULL;
+	name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	if (!name) {
+		*err = -ENOMEM;
+		goto out_err;
+	}
+
+	if (dentry) {
+		if (!dentry->d_name.len) {
+			*err = -EINVAL;
+			goto out_err;
+		}
+		namelen = udf_put_filename(sb, dentry->d_name.name, name,
+						 dentry->d_name.len);
+		if (!namelen) {
+			*err = -ENAMETOOLONG;
+			goto out_err;
+		}
+	} else {
+		namelen = 0;
+	}
+
+	nfidlen = (sizeof(struct fileIdentDesc) + namelen + 3) & ~3;
+
+	f_pos = udf_ext0_offset(dir);
+
+	fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
+	dinfo = UDF_I(dir);
+	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
+		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
+			block = udf_get_lb_pblock(dir->i_sb,
+					&dinfo->i_location, 0);
+			fibh->soffset = fibh->eoffset = sb->s_blocksize;
+			goto add;
+		}
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
+		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
+			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+				epos.offset -= sizeof(struct short_ad);
+			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+				epos.offset -= sizeof(struct long_ad);
+		} else
+			offset = 0;
+
+		fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
+		if (!fibh->sbh) {
+			*err = -EIO;
+			goto out_err;
+		}
+
+		block = dinfo->i_location.logicalBlockNum;
+	}
+
+	while (f_pos < size) {
+		fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc,
+					&elen, &offset);
+
+		if (!fi) {
+			*err = -EIO;
+			goto out_err;
+		}
+
+		liu = le16_to_cpu(cfi->lengthOfImpUse);
+		lfi = cfi->lengthFileIdent;
+
+		if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
+			if (((sizeof(struct fileIdentDesc) +
+					liu + lfi + 3) & ~3) == nfidlen) {
+				cfi->descTag.tagSerialNum = cpu_to_le16(1);
+				cfi->fileVersionNum = cpu_to_le16(1);
+				cfi->fileCharacteristics = 0;
+				cfi->lengthFileIdent = namelen;
+				cfi->lengthOfImpUse = cpu_to_le16(0);
+				if (!udf_write_fi(dir, cfi, fi, fibh, NULL,
+						  name))
+					goto out_ok;
+				else {
+					*err = -EIO;
+					goto out_err;
+				}
+			}
+		}
+	}
+
+add:
+	f_pos += nfidlen;
+
+	if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB &&
+	    sb->s_blocksize - fibh->eoffset < nfidlen) {
+		brelse(epos.bh);
+		epos.bh = NULL;
+		fibh->soffset -= udf_ext0_offset(dir);
+		fibh->eoffset -= udf_ext0_offset(dir);
+		f_pos -= udf_ext0_offset(dir);
+		if (fibh->sbh != fibh->ebh)
+			brelse(fibh->ebh);
+		brelse(fibh->sbh);
+		fibh->sbh = fibh->ebh =
+				udf_expand_dir_adinicb(dir, &block, err);
+		if (!fibh->sbh)
+			goto out_err;
+		epos.block = dinfo->i_location;
+		epos.offset = udf_file_entry_alloc_offset(dir);
+		/* Load extent udf_expand_dir_adinicb() has created */
+		udf_current_aext(dir, &epos, &eloc, &elen, 1);
+	}
+
+	/* Entry fits into current block? */
+	if (sb->s_blocksize - fibh->eoffset >= nfidlen) {
+		fibh->soffset = fibh->eoffset;
+		fibh->eoffset += nfidlen;
+		if (fibh->sbh != fibh->ebh) {
+			brelse(fibh->sbh);
+			fibh->sbh = fibh->ebh;
+		}
+
+		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+			block = dinfo->i_location.logicalBlockNum;
+			fi = (struct fileIdentDesc *)
+					(dinfo->i_ext.i_data +
+					 fibh->soffset -
+					 udf_ext0_offset(dir) +
+					 dinfo->i_lenEAttr);
+		} else {
+			block = eloc.logicalBlockNum +
+					((elen - 1) >>
+						dir->i_sb->s_blocksize_bits);
+			fi = (struct fileIdentDesc *)
+				(fibh->sbh->b_data + fibh->soffset);
+		}
+	} else {
+		/* Round up last extent in the file */
+		elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
+		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+			epos.offset -= sizeof(struct short_ad);
+		else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+			epos.offset -= sizeof(struct long_ad);
+		udf_write_aext(dir, &epos, &eloc, elen, 1);
+		dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize
+					- 1) & ~(sb->s_blocksize - 1);
+
+		fibh->soffset = fibh->eoffset - sb->s_blocksize;
+		fibh->eoffset += nfidlen - sb->s_blocksize;
+		if (fibh->sbh != fibh->ebh) {
+			brelse(fibh->sbh);
+			fibh->sbh = fibh->ebh;
+		}
+
+		block = eloc.logicalBlockNum + ((elen - 1) >>
+						dir->i_sb->s_blocksize_bits);
+		fibh->ebh = udf_bread(dir,
+				f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
+		if (!fibh->ebh)
+			goto out_err;
+		/* Extents could have been merged, invalidate our position */
+		brelse(epos.bh);
+		epos.bh = NULL;
+		epos.block = dinfo->i_location;
+		epos.offset = udf_file_entry_alloc_offset(dir);
+
+		if (!fibh->soffset) {
+			/* Find the freshly allocated block */
+			while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
+				(EXT_RECORDED_ALLOCATED >> 30))
+				;
+			block = eloc.logicalBlockNum + ((elen - 1) >>
+					dir->i_sb->s_blocksize_bits);
+			brelse(fibh->sbh);
+			fibh->sbh = fibh->ebh;
+			fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
+		} else {
+			fi = (struct fileIdentDesc *)
+				(fibh->sbh->b_data + sb->s_blocksize +
+					fibh->soffset);
+		}
+	}
+
+	memset(cfi, 0, sizeof(struct fileIdentDesc));
+	if (UDF_SB(sb)->s_udfrev >= 0x0200)
+		udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
+			    sizeof(struct tag));
+	else
+		udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
+			    sizeof(struct tag));
+	cfi->fileVersionNum = cpu_to_le16(1);
+	cfi->lengthFileIdent = namelen;
+	cfi->lengthOfImpUse = cpu_to_le16(0);
+	if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) {
+		dir->i_size += nfidlen;
+		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+			dinfo->i_lenAlloc += nfidlen;
+		else {
+			/* Find the last extent and truncate it to proper size */
+			while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
+				(EXT_RECORDED_ALLOCATED >> 30))
+				;
+			elen -= dinfo->i_lenExtents - dir->i_size;
+			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+				epos.offset -= sizeof(struct short_ad);
+			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+				epos.offset -= sizeof(struct long_ad);
+			udf_write_aext(dir, &epos, &eloc, elen, 1);
+			dinfo->i_lenExtents = dir->i_size;
+		}
+
+		mark_inode_dirty(dir);
+		goto out_ok;
+	} else {
+		*err = -EIO;
+		goto out_err;
+	}
+
+out_err:
+	fi = NULL;
+	if (fibh->sbh != fibh->ebh)
+		brelse(fibh->ebh);
+	brelse(fibh->sbh);
+out_ok:
+	brelse(epos.bh);
+	kfree(name);
+	return fi;
+}
+
+static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
+			    struct udf_fileident_bh *fibh,
+			    struct fileIdentDesc *cfi)
+{
+	cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
+
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
+		memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
+
+	return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
+}
+
+static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct udf_fileident_bh fibh;
+	struct fileIdentDesc cfi, *fi;
+	int err;
+
+	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
+	if (unlikely(!fi)) {
+		inode_dec_link_count(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		return err;
+	}
+	cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
+	cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
+	*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
+		cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
+	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
+	dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+	mark_inode_dirty(dir);
+	if (fibh.sbh != fibh.ebh)
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+
+	return 0;
+}
+
+static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl)
+{
+	struct inode *inode = udf_new_inode(dir, mode);
+
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		inode->i_data.a_ops = &udf_adinicb_aops;
+	else
+		inode->i_data.a_ops = &udf_aops;
+	inode->i_op = &udf_file_inode_operations;
+	inode->i_fop = &udf_file_operations;
+	mark_inode_dirty(inode);
+
+	return udf_add_nondir(dentry, inode);
+}
+
+static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = udf_new_inode(dir, mode);
+
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		inode->i_data.a_ops = &udf_adinicb_aops;
+	else
+		inode->i_data.a_ops = &udf_aops;
+	inode->i_op = &udf_file_inode_operations;
+	inode->i_fop = &udf_file_operations;
+	mark_inode_dirty(inode);
+	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+}
+
+static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		     dev_t rdev)
+{
+	struct inode *inode;
+
+	if (!old_valid_dev(rdev))
+		return -EINVAL;
+
+	inode = udf_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	init_special_inode(inode, mode, rdev);
+	return udf_add_nondir(dentry, inode);
+}
+
+static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	struct udf_fileident_bh fibh;
+	struct fileIdentDesc cfi, *fi;
+	int err;
+	struct udf_inode_info *dinfo = UDF_I(dir);
+	struct udf_inode_info *iinfo;
+
+	inode = udf_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	iinfo = UDF_I(inode);
+	inode->i_op = &udf_dir_inode_operations;
+	inode->i_fop = &udf_dir_operations;
+	fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
+	if (!fi) {
+		inode_dec_link_count(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		goto out;
+	}
+	set_nlink(inode, 2);
+	cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
+	cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location);
+	*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
+		cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL);
+	cfi.fileCharacteristics =
+			FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
+	udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
+	brelse(fibh.sbh);
+	mark_inode_dirty(inode);
+
+	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
+	if (!fi) {
+		clear_nlink(inode);
+		mark_inode_dirty(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		goto out;
+	}
+	cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
+	cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
+	*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
+		cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
+	cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY;
+	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
+	inc_nlink(dir);
+	dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+	mark_inode_dirty(dir);
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+	if (fibh.sbh != fibh.ebh)
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+	err = 0;
+
+out:
+	return err;
+}
+
+static int empty_dir(struct inode *dir)
+{
+	struct fileIdentDesc *fi, cfi;
+	struct udf_fileident_bh fibh;
+	loff_t f_pos;
+	loff_t size = udf_ext0_offset(dir) + dir->i_size;
+	int block;
+	struct kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
+	struct extent_position epos = {};
+	struct udf_inode_info *dinfo = UDF_I(dir);
+
+	f_pos = udf_ext0_offset(dir);
+	fibh.soffset = fibh.eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
+
+	if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		fibh.sbh = fibh.ebh = NULL;
+	else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
+			      &epos, &eloc, &elen, &offset) ==
+					(EXT_RECORDED_ALLOCATED >> 30)) {
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
+		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
+			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+				epos.offset -= sizeof(struct short_ad);
+			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+				epos.offset -= sizeof(struct long_ad);
+		} else
+			offset = 0;
+
+		fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block);
+		if (!fibh.sbh) {
+			brelse(epos.bh);
+			return 0;
+		}
+	} else {
+		brelse(epos.bh);
+		return 0;
+	}
+
+	while (f_pos < size) {
+		fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &epos, &eloc,
+					&elen, &offset);
+		if (!fi) {
+			if (fibh.sbh != fibh.ebh)
+				brelse(fibh.ebh);
+			brelse(fibh.sbh);
+			brelse(epos.bh);
+			return 0;
+		}
+
+		if (cfi.lengthFileIdent &&
+		    (cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) == 0) {
+			if (fibh.sbh != fibh.ebh)
+				brelse(fibh.ebh);
+			brelse(fibh.sbh);
+			brelse(epos.bh);
+			return 0;
+		}
+	}
+
+	if (fibh.sbh != fibh.ebh)
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+	brelse(epos.bh);
+
+	return 1;
+}
+
+static int udf_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode *inode = d_inode(dentry);
+	struct udf_fileident_bh fibh;
+	struct fileIdentDesc *fi, cfi;
+	struct kernel_lb_addr tloc;
+
+	retval = -ENOENT;
+	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
+	if (!fi)
+		goto out;
+
+	retval = -EIO;
+	tloc = lelb_to_cpu(cfi.icb.extLocation);
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
+		goto end_rmdir;
+	retval = -ENOTEMPTY;
+	if (!empty_dir(inode))
+		goto end_rmdir;
+	retval = udf_delete_entry(dir, fi, &fibh, &cfi);
+	if (retval)
+		goto end_rmdir;
+	if (inode->i_nlink != 2)
+		udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n",
+			 inode->i_nlink);
+	clear_nlink(inode);
+	inode->i_size = 0;
+	inode_dec_link_count(dir);
+	inode->i_ctime = dir->i_ctime = dir->i_mtime =
+						current_fs_time(dir->i_sb);
+	mark_inode_dirty(dir);
+
+end_rmdir:
+	if (fibh.sbh != fibh.ebh)
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+
+out:
+	return retval;
+}
+
+static int udf_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode *inode = d_inode(dentry);
+	struct udf_fileident_bh fibh;
+	struct fileIdentDesc *fi;
+	struct fileIdentDesc cfi;
+	struct kernel_lb_addr tloc;
+
+	retval = -ENOENT;
+	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
+	if (!fi)
+		goto out;
+
+	retval = -EIO;
+	tloc = lelb_to_cpu(cfi.icb.extLocation);
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
+		goto end_unlink;
+
+	if (!inode->i_nlink) {
+		udf_debug("Deleting nonexistent file (%lu), %d\n",
+			  inode->i_ino, inode->i_nlink);
+		set_nlink(inode, 1);
+	}
+	retval = udf_delete_entry(dir, fi, &fibh, &cfi);
+	if (retval)
+		goto end_unlink;
+	dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+	mark_inode_dirty(dir);
+	inode_dec_link_count(inode);
+	inode->i_ctime = dir->i_ctime;
+	retval = 0;
+
+end_unlink:
+	if (fibh.sbh != fibh.ebh)
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+
+out:
+	return retval;
+}
+
+static int udf_symlink(struct inode *dir, struct dentry *dentry,
+		       const char *symname)
+{
+	struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	struct pathComponent *pc;
+	const char *compstart;
+	struct extent_position epos = {};
+	int eoffset, elen = 0;
+	uint8_t *ea;
+	int err;
+	int block;
+	unsigned char *name = NULL;
+	int namelen;
+	struct udf_inode_info *iinfo;
+	struct super_block *sb = dir->i_sb;
+
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	iinfo = UDF_I(inode);
+	down_write(&iinfo->i_data_sem);
+	name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	if (!name) {
+		err = -ENOMEM;
+		goto out_no_entry;
+	}
+
+	inode->i_data.a_ops = &udf_symlink_aops;
+	inode->i_op = &udf_symlink_inode_operations;
+
+	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		struct kernel_lb_addr eloc;
+		uint32_t bsize;
+
+		block = udf_new_block(sb, inode,
+				iinfo->i_location.partitionReferenceNum,
+				iinfo->i_location.logicalBlockNum, &err);
+		if (!block)
+			goto out_no_entry;
+		epos.block = iinfo->i_location;
+		epos.offset = udf_file_entry_alloc_offset(inode);
+		epos.bh = NULL;
+		eloc.logicalBlockNum = block;
+		eloc.partitionReferenceNum =
+				iinfo->i_location.partitionReferenceNum;
+		bsize = sb->s_blocksize;
+		iinfo->i_lenExtents = bsize;
+		udf_add_aext(inode, &epos, &eloc, bsize, 0);
+		brelse(epos.bh);
+
+		block = udf_get_pblock(sb, block,
+				iinfo->i_location.partitionReferenceNum,
+				0);
+		epos.bh = udf_tgetblk(sb, block);
+		lock_buffer(epos.bh);
+		memset(epos.bh->b_data, 0x00, bsize);
+		set_buffer_uptodate(epos.bh);
+		unlock_buffer(epos.bh);
+		mark_buffer_dirty_inode(epos.bh, inode);
+		ea = epos.bh->b_data + udf_ext0_offset(inode);
+	} else
+		ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
+
+	eoffset = sb->s_blocksize - udf_ext0_offset(inode);
+	pc = (struct pathComponent *)ea;
+
+	if (*symname == '/') {
+		do {
+			symname++;
+		} while (*symname == '/');
+
+		pc->componentType = 1;
+		pc->lengthComponentIdent = 0;
+		pc->componentFileVersionNum = 0;
+		elen += sizeof(struct pathComponent);
+	}
+
+	err = -ENAMETOOLONG;
+
+	while (*symname) {
+		if (elen + sizeof(struct pathComponent) > eoffset)
+			goto out_no_entry;
+
+		pc = (struct pathComponent *)(ea + elen);
+
+		compstart = symname;
+
+		do {
+			symname++;
+		} while (*symname && *symname != '/');
+
+		pc->componentType = 5;
+		pc->lengthComponentIdent = 0;
+		pc->componentFileVersionNum = 0;
+		if (compstart[0] == '.') {
+			if ((symname - compstart) == 1)
+				pc->componentType = 4;
+			else if ((symname - compstart) == 2 &&
+					compstart[1] == '.')
+				pc->componentType = 3;
+		}
+
+		if (pc->componentType == 5) {
+			namelen = udf_put_filename(sb, compstart, name,
+						   symname - compstart);
+			if (!namelen)
+				goto out_no_entry;
+
+			if (elen + sizeof(struct pathComponent) + namelen >
+					eoffset)
+				goto out_no_entry;
+			else
+				pc->lengthComponentIdent = namelen;
+
+			memcpy(pc->componentIdent, name, namelen);
+		}
+
+		elen += sizeof(struct pathComponent) + pc->lengthComponentIdent;
+
+		if (*symname) {
+			do {
+				symname++;
+			} while (*symname == '/');
+		}
+	}
+
+	brelse(epos.bh);
+	inode->i_size = elen;
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		iinfo->i_lenAlloc = inode->i_size;
+	else
+		udf_truncate_tail_extent(inode);
+	mark_inode_dirty(inode);
+	up_write(&iinfo->i_data_sem);
+
+	err = udf_add_nondir(dentry, inode);
+out:
+	kfree(name);
+	return err;
+
+out_no_entry:
+	up_write(&iinfo->i_data_sem);
+	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	goto out;
+}
+
+static int udf_link(struct dentry *old_dentry, struct inode *dir,
+		    struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+	struct udf_fileident_bh fibh;
+	struct fileIdentDesc cfi, *fi;
+	int err;
+
+	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
+	if (!fi) {
+		return err;
+	}
+	cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
+	cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
+	if (UDF_SB(inode->i_sb)->s_lvid_bh) {
+		*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
+			cpu_to_le32(lvid_get_unique_id(inode->i_sb));
+	}
+	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
+	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		mark_inode_dirty(dir);
+
+	if (fibh.sbh != fibh.ebh)
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+	inc_nlink(inode);
+	inode->i_ctime = current_fs_time(inode->i_sb);
+	mark_inode_dirty(inode);
+	dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+	mark_inode_dirty(dir);
+	ihold(inode);
+	d_instantiate(dentry, inode);
+
+	return 0;
+}
+
+/* Anybody can rename anything with this: the permission checks are left to the
+ * higher-level routines.
+ */
+static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct udf_fileident_bh ofibh, nfibh;
+	struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL;
+	struct fileIdentDesc ocfi, ncfi;
+	struct buffer_head *dir_bh = NULL;
+	int retval = -ENOENT;
+	struct kernel_lb_addr tloc;
+	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
+
+	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
+	if (ofi) {
+		if (ofibh.sbh != ofibh.ebh)
+			brelse(ofibh.ebh);
+		brelse(ofibh.sbh);
+	}
+	tloc = lelb_to_cpu(ocfi.icb.extLocation);
+	if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
+	    != old_inode->i_ino)
+		goto end_rename;
+
+	nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
+	if (nfi) {
+		if (!new_inode) {
+			if (nfibh.sbh != nfibh.ebh)
+				brelse(nfibh.ebh);
+			brelse(nfibh.sbh);
+			nfi = NULL;
+		}
+	}
+	if (S_ISDIR(old_inode->i_mode)) {
+		int offset = udf_ext0_offset(old_inode);
+
+		if (new_inode) {
+			retval = -ENOTEMPTY;
+			if (!empty_dir(new_inode))
+				goto end_rename;
+		}
+		retval = -EIO;
+		if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+			dir_fi = udf_get_fileident(
+					old_iinfo->i_ext.i_data -
+					  (old_iinfo->i_efe ?
+					   sizeof(struct extendedFileEntry) :
+					   sizeof(struct fileEntry)),
+					old_inode->i_sb->s_blocksize, &offset);
+		} else {
+			dir_bh = udf_bread(old_inode, 0, 0, &retval);
+			if (!dir_bh)
+				goto end_rename;
+			dir_fi = udf_get_fileident(dir_bh->b_data,
+					old_inode->i_sb->s_blocksize, &offset);
+		}
+		if (!dir_fi)
+			goto end_rename;
+		tloc = lelb_to_cpu(dir_fi->icb.extLocation);
+		if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
+				old_dir->i_ino)
+			goto end_rename;
+	}
+	if (!nfi) {
+		nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi,
+				    &retval);
+		if (!nfi)
+			goto end_rename;
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old_inode->i_ctime = current_fs_time(old_inode->i_sb);
+	mark_inode_dirty(old_inode);
+
+	/*
+	 * ok, that's it
+	 */
+	ncfi.fileVersionNum = ocfi.fileVersionNum;
+	ncfi.fileCharacteristics = ocfi.fileCharacteristics;
+	memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
+	udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
+
+	/* The old fid may have moved - find it again */
+	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
+	udf_delete_entry(old_dir, ofi, &ofibh, &ocfi);
+
+	if (new_inode) {
+		new_inode->i_ctime = current_fs_time(new_inode->i_sb);
+		inode_dec_link_count(new_inode);
+	}
+	old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb);
+	new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb);
+	mark_inode_dirty(old_dir);
+	mark_inode_dirty(new_dir);
+
+	if (dir_fi) {
+		dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location);
+		udf_update_tag((char *)dir_fi,
+				(sizeof(struct fileIdentDesc) +
+				le16_to_cpu(dir_fi->lengthOfImpUse) + 3) & ~3);
+		if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+			mark_inode_dirty(old_inode);
+		else
+			mark_buffer_dirty_inode(dir_bh, old_inode);
+
+		inode_dec_link_count(old_dir);
+		if (new_inode)
+			inode_dec_link_count(new_inode);
+		else {
+			inc_nlink(new_dir);
+			mark_inode_dirty(new_dir);
+		}
+	}
+
+	if (ofi) {
+		if (ofibh.sbh != ofibh.ebh)
+			brelse(ofibh.ebh);
+		brelse(ofibh.sbh);
+	}
+
+	retval = 0;
+
+end_rename:
+	brelse(dir_bh);
+	if (nfi) {
+		if (nfibh.sbh != nfibh.ebh)
+			brelse(nfibh.ebh);
+		brelse(nfibh.sbh);
+	}
+
+	return retval;
+}
+
+static struct dentry *udf_get_parent(struct dentry *child)
+{
+	struct kernel_lb_addr tloc;
+	struct inode *inode = NULL;
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	struct fileIdentDesc cfi;
+	struct udf_fileident_bh fibh;
+
+	if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi))
+		return ERR_PTR(-EACCES);
+
+	if (fibh.sbh != fibh.ebh)
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+
+	tloc = lelb_to_cpu(cfi.icb.extLocation);
+	inode = udf_iget(d_inode(child)->i_sb, &tloc);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	return d_obtain_alias(inode);
+}
+
+
+static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
+					u16 partref, __u32 generation)
+{
+	struct inode *inode;
+	struct kernel_lb_addr loc;
+
+	if (block == 0)
+		return ERR_PTR(-ESTALE);
+
+	loc.logicalBlockNum = block;
+	loc.partitionReferenceNum = partref;
+	inode = udf_iget(sb, &loc);
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *udf_fh_to_dentry(struct super_block *sb,
+				       struct fid *fid, int fh_len, int fh_type)
+{
+	if ((fh_len != 3 && fh_len != 5) ||
+	    (fh_type != FILEID_UDF_WITH_PARENT &&
+	     fh_type != FILEID_UDF_WITHOUT_PARENT))
+		return NULL;
+
+	return udf_nfs_get_inode(sb, fid->udf.block, fid->udf.partref,
+			fid->udf.generation);
+}
+
+static struct dentry *udf_fh_to_parent(struct super_block *sb,
+				       struct fid *fid, int fh_len, int fh_type)
+{
+	if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT)
+		return NULL;
+
+	return udf_nfs_get_inode(sb, fid->udf.parent_block,
+				 fid->udf.parent_partref,
+				 fid->udf.parent_generation);
+}
+static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+			 struct inode *parent)
+{
+	int len = *lenp;
+	struct kernel_lb_addr location = UDF_I(inode)->i_location;
+	struct fid *fid = (struct fid *)fh;
+	int type = FILEID_UDF_WITHOUT_PARENT;
+
+	if (parent && (len < 5)) {
+		*lenp = 5;
+		return FILEID_INVALID;
+	} else if (len < 3) {
+		*lenp = 3;
+		return FILEID_INVALID;
+	}
+
+	*lenp = 3;
+	fid->udf.block = location.logicalBlockNum;
+	fid->udf.partref = location.partitionReferenceNum;
+	fid->udf.parent_partref = 0;
+	fid->udf.generation = inode->i_generation;
+
+	if (parent) {
+		location = UDF_I(parent)->i_location;
+		fid->udf.parent_block = location.logicalBlockNum;
+		fid->udf.parent_partref = location.partitionReferenceNum;
+		fid->udf.parent_generation = inode->i_generation;
+		*lenp = 5;
+		type = FILEID_UDF_WITH_PARENT;
+	}
+
+	return type;
+}
+
+const struct export_operations udf_export_ops = {
+	.encode_fh	= udf_encode_fh,
+	.fh_to_dentry   = udf_fh_to_dentry,
+	.fh_to_parent   = udf_fh_to_parent,
+	.get_parent     = udf_get_parent,
+};
+
+const struct inode_operations udf_dir_inode_operations = {
+	.lookup				= udf_lookup,
+	.create				= udf_create,
+	.link				= udf_link,
+	.unlink				= udf_unlink,
+	.symlink			= udf_symlink,
+	.mkdir				= udf_mkdir,
+	.rmdir				= udf_rmdir,
+	.mknod				= udf_mknod,
+	.rename				= udf_rename,
+	.tmpfile			= udf_tmpfile,
+};
+const struct inode_operations udf_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
diff --git a/kernel/fs/udf/osta_udf.h b/kernel/fs/udf/osta_udf.h
new file mode 100644
index 000000000..fbff74654
--- /dev/null
+++ b/kernel/fs/udf/osta_udf.h
@@ -0,0 +1,279 @@
+/*
+ * osta_udf.h
+ *
+ * This file is based on OSTA UDF(tm) 2.50 (April 30, 2003)
+ * http://www.osta.org
+ *
+ * Copyright (c) 2001-2004  Ben Fennema <bfennema@falcon.csc.calpoly.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU Public License ("GPL").
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "ecma_167.h"
+
+#ifndef _OSTA_UDF_H
+#define _OSTA_UDF_H 1
+
+/* OSTA CS0 Charspec (UDF 2.50 2.1.2) */
+#define UDF_CHAR_SET_TYPE		0
+#define UDF_CHAR_SET_INFO		"OSTA Compressed Unicode"
+
+/* Entity Identifier (UDF 2.50 2.1.5) */
+/* Identifiers (UDF 2.50 2.1.5.2) */
+#define UDF_ID_DEVELOPER		"*Linux UDFFS"
+#define	UDF_ID_COMPLIANT		"*OSTA UDF Compliant"
+#define UDF_ID_LV_INFO			"*UDF LV Info"
+#define UDF_ID_FREE_EA			"*UDF FreeEASpace"
+#define UDF_ID_FREE_APP_EA		"*UDF FreeAppEASpace"
+#define UDF_ID_DVD_CGMS			"*UDF DVD CGMS Info"
+#define UDF_ID_OS2_EA			"*UDF OS/2 EA"
+#define UDF_ID_OS2_EA_LENGTH		"*UDF OS/2 EALength"
+#define UDF_ID_MAC_VOLUME		"*UDF Mac VolumeInfo"
+#define UDF_ID_MAC_FINDER		"*UDF Mac FinderInfo"
+#define UDF_ID_MAC_UNIQUE		"*UDF Mac UniqueIDTable"
+#define UDF_ID_MAC_RESOURCE		"*UDF Mac ResourceFork"
+#define UDF_ID_VIRTUAL			"*UDF Virtual Partition"
+#define UDF_ID_SPARABLE			"*UDF Sparable Partition"
+#define UDF_ID_ALLOC			"*UDF Virtual Alloc Tbl"
+#define UDF_ID_SPARING			"*UDF Sparing Table"
+#define UDF_ID_METADATA			"*UDF Metadata Partition"
+
+/* Identifier Suffix (UDF 2.50 2.1.5.3) */
+#define IS_DF_HARD_WRITE_PROTECT	0x01
+#define IS_DF_SOFT_WRITE_PROTECT	0x02
+
+struct UDFIdentSuffix {
+	__le16		UDFRevision;
+	uint8_t		OSClass;
+	uint8_t		OSIdentifier;
+	uint8_t		reserved[4];
+} __attribute__ ((packed));
+
+struct impIdentSuffix {
+	uint8_t		OSClass;
+	uint8_t		OSIdentifier;
+	uint8_t		reserved[6];
+} __attribute__ ((packed));
+
+struct appIdentSuffix {
+	uint8_t		impUse[8];
+} __attribute__ ((packed));
+
+/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
+/* Implementation Use (UDF 2.50 2.2.6.4) */
+struct logicalVolIntegrityDescImpUse {
+	struct regid	impIdent;
+	__le32		numFiles;
+	__le32		numDirs;
+	__le16		minUDFReadRev;
+	__le16		minUDFWriteRev;
+	__le16		maxUDFWriteRev;
+	uint8_t		impUse[0];
+} __attribute__ ((packed));
+
+/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
+/* Implementation Use (UDF 2.50 2.2.7.2) */
+struct impUseVolDescImpUse {
+	struct charspec	LVICharset;
+	dstring		logicalVolIdent[128];
+	dstring		LVInfo1[36];
+	dstring		LVInfo2[36];
+	dstring		LVInfo3[36];
+	struct regid	impIdent;
+	uint8_t		impUse[128];
+} __attribute__ ((packed));
+
+struct udfPartitionMap2 {
+	uint8_t		partitionMapType;
+	uint8_t		partitionMapLength;
+	uint8_t		reserved1[2];
+	struct regid	partIdent;
+	__le16		volSeqNum;
+	__le16		partitionNum;
+} __attribute__ ((packed));
+
+/* Virtual Partition Map (UDF 2.50 2.2.8) */
+struct virtualPartitionMap {
+	uint8_t		partitionMapType;
+	uint8_t		partitionMapLength;
+	uint8_t		reserved1[2];
+	struct regid	partIdent;
+	__le16		volSeqNum;
+	__le16		partitionNum;
+	uint8_t		reserved2[24];
+} __attribute__ ((packed));
+
+/* Sparable Partition Map (UDF 2.50 2.2.9) */
+struct sparablePartitionMap {
+	uint8_t partitionMapType;
+	uint8_t partitionMapLength;
+	uint8_t reserved1[2];
+	struct regid partIdent;
+	__le16 volSeqNum;
+	__le16 partitionNum;
+	__le16 packetLength;
+	uint8_t numSparingTables;
+	uint8_t reserved2[1];
+	__le32 sizeSparingTable;
+	__le32 locSparingTable[4];
+} __attribute__ ((packed));
+
+/* Metadata Partition Map (UDF 2.4.0 2.2.10) */
+struct metadataPartitionMap {
+	uint8_t		partitionMapType;
+	uint8_t		partitionMapLength;
+	uint8_t		reserved1[2];
+	struct regid	partIdent;
+	__le16		volSeqNum;
+	__le16		partitionNum;
+	__le32		metadataFileLoc;
+	__le32		metadataMirrorFileLoc;
+	__le32		metadataBitmapFileLoc;
+	__le32		allocUnitSize;
+	__le16		alignUnitSize;
+	uint8_t		flags;
+	uint8_t		reserved2[5];
+} __attribute__ ((packed));
+
+/* Virtual Allocation Table (UDF 1.5 2.2.10) */
+struct virtualAllocationTable15 {
+	__le32		VirtualSector[0];
+	struct regid	vatIdent;
+	__le32		previousVATICBLoc;
+} __attribute__ ((packed));
+
+#define ICBTAG_FILE_TYPE_VAT15		0x00U
+
+/* Virtual Allocation Table (UDF 2.50 2.2.11) */
+struct virtualAllocationTable20 {
+	__le16		lengthHeader;
+	__le16		lengthImpUse;
+	dstring		logicalVolIdent[128];
+	__le32		previousVATICBLoc;
+	__le32		numFiles;
+	__le32		numDirs;
+	__le16		minReadRevision;
+	__le16		minWriteRevision;
+	__le16		maxWriteRevision;
+	__le16		reserved;
+	uint8_t		impUse[0];
+	__le32		vatEntry[0];
+} __attribute__ ((packed));
+
+#define ICBTAG_FILE_TYPE_VAT20		0xF8U
+
+/* Sparing Table (UDF 2.50 2.2.12) */
+struct sparingEntry {
+	__le32		origLocation;
+	__le32		mappedLocation;
+} __attribute__ ((packed));
+
+struct sparingTable {
+	struct tag	descTag;
+	struct regid	sparingIdent;
+	__le16		reallocationTableLen;
+	__le16		reserved;
+	__le32		sequenceNum;
+	struct sparingEntry
+			mapEntry[0];
+} __attribute__ ((packed));
+
+/* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */
+#define ICBTAG_FILE_TYPE_MAIN		0xFA
+#define ICBTAG_FILE_TYPE_MIRROR		0xFB
+#define ICBTAG_FILE_TYPE_BITMAP		0xFC
+
+/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
+struct allocDescImpUse {
+	__le16		flags;
+	uint8_t		impUse[4];
+} __attribute__ ((packed));
+
+#define AD_IU_EXT_ERASED		0x0001
+
+/* Real-Time Files (UDF 2.50 6.11) */
+#define ICBTAG_FILE_TYPE_REALTIME	0xF9U
+
+/* Implementation Use Extended Attribute (UDF 2.50 3.3.4.5) */
+/* FreeEASpace (UDF 2.50 3.3.4.5.1.1) */
+struct freeEaSpace {
+	__le16		headerChecksum;
+	uint8_t		freeEASpace[0];
+} __attribute__ ((packed));
+
+/* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */
+struct DVDCopyrightImpUse {
+	__le16		headerChecksum;
+	uint8_t		CGMSInfo;
+	uint8_t		dataType;
+	uint8_t		protectionSystemInfo[4];
+} __attribute__ ((packed));
+
+/* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */
+/* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */
+struct freeAppEASpace {
+	__le16		headerChecksum;
+	uint8_t		freeEASpace[0];
+} __attribute__ ((packed));
+
+/* UDF Defined System Stream (UDF 2.50 3.3.7) */
+#define UDF_ID_UNIQUE_ID		"*UDF Unique ID Mapping Data"
+#define UDF_ID_NON_ALLOC		"*UDF Non-Allocatable Space"
+#define UDF_ID_POWER_CAL		"*UDF Power Cal Table"
+#define UDF_ID_BACKUP			"*UDF Backup"
+
+/* Operating System Identifiers (UDF 2.50 6.3) */
+#define UDF_OS_CLASS_UNDEF		0x00U
+#define UDF_OS_CLASS_DOS		0x01U
+#define UDF_OS_CLASS_OS2		0x02U
+#define UDF_OS_CLASS_MAC		0x03U
+#define UDF_OS_CLASS_UNIX		0x04U
+#define UDF_OS_CLASS_WIN9X		0x05U
+#define UDF_OS_CLASS_WINNT		0x06U
+#define UDF_OS_CLASS_OS400		0x07U
+#define UDF_OS_CLASS_BEOS		0x08U
+#define UDF_OS_CLASS_WINCE		0x09U
+
+#define UDF_OS_ID_UNDEF			0x00U
+#define UDF_OS_ID_DOS			0x00U
+#define UDF_OS_ID_OS2			0x00U
+#define UDF_OS_ID_MAC			0x00U
+#define UDF_OS_ID_MAX_OSX		0x01U
+#define UDF_OS_ID_UNIX			0x00U
+#define UDF_OS_ID_AIX			0x01U
+#define UDF_OS_ID_SOLARIS		0x02U
+#define UDF_OS_ID_HPUX			0x03U
+#define UDF_OS_ID_IRIX			0x04U
+#define UDF_OS_ID_LINUX			0x05U
+#define UDF_OS_ID_MKLINUX		0x06U
+#define UDF_OS_ID_FREEBSD		0x07U
+#define UDF_OS_ID_WIN9X			0x00U
+#define UDF_OS_ID_WINNT			0x00U
+#define UDF_OS_ID_OS400			0x00U
+#define UDF_OS_ID_BEOS			0x00U
+#define UDF_OS_ID_WINCE			0x00U
+
+#endif /* _OSTA_UDF_H */
diff --git a/kernel/fs/udf/partition.c b/kernel/fs/udf/partition.c
new file mode 100644
index 000000000..5f861ed28
--- /dev/null
+++ b/kernel/fs/udf/partition.c
@@ -0,0 +1,338 @@
+/*
+ * partition.c
+ *
+ * PURPOSE
+ *      Partition handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *      This file is distributed under the terms of the GNU General Public
+ *      License (GPL). Copies of the GPL can be obtained from:
+ *              ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *      Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998-2001 Ben Fennema
+ *
+ * HISTORY
+ *
+ * 12/06/98 blf  Created file.
+ *
+ */
+
+#include "udfdecl.h"
+#include "udf_sb.h"
+#include "udf_i.h"
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/mutex.h>
+
+uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
+			uint16_t partition, uint32_t offset)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map;
+	if (partition >= sbi->s_partitions) {
+		udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n",
+			  block, partition, offset);
+		return 0xFFFFFFFF;
+	}
+	map = &sbi->s_partmaps[partition];
+	if (map->s_partition_func)
+		return map->s_partition_func(sb, block, partition, offset);
+	else
+		return map->s_partition_root + block + offset;
+}
+
+uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
+			       uint16_t partition, uint32_t offset)
+{
+	struct buffer_head *bh = NULL;
+	uint32_t newblock;
+	uint32_t index;
+	uint32_t loc;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map;
+	struct udf_virtual_data *vdata;
+	struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode);
+
+	map = &sbi->s_partmaps[partition];
+	vdata = &map->s_type_specific.s_virtual;
+
+	if (block > vdata->s_num_entries) {
+		udf_debug("Trying to access block beyond end of VAT (%d max %d)\n",
+			  block, vdata->s_num_entries);
+		return 0xFFFFFFFF;
+	}
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+		loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data +
+			vdata->s_start_offset))[block]);
+		goto translate;
+	}
+	index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
+	if (block >= index) {
+		block -= index;
+		newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t)));
+		index = block % (sb->s_blocksize / sizeof(uint32_t));
+	} else {
+		newblock = 0;
+		index = vdata->s_start_offset / sizeof(uint32_t) + block;
+	}
+
+	loc = udf_block_map(sbi->s_vat_inode, newblock);
+
+	bh = sb_bread(sb, loc);
+	if (!bh) {
+		udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%d,%d) VAT: %d[%d]\n",
+			  sb, block, partition, loc, index);
+		return 0xFFFFFFFF;
+	}
+
+	loc = le32_to_cpu(((__le32 *)bh->b_data)[index]);
+
+	brelse(bh);
+
+translate:
+	if (iinfo->i_location.partitionReferenceNum == partition) {
+		udf_debug("recursive call to udf_get_pblock!\n");
+		return 0xFFFFFFFF;
+	}
+
+	return udf_get_pblock(sb, loc,
+			      iinfo->i_location.partitionReferenceNum,
+			      offset);
+}
+
+inline uint32_t udf_get_pblock_virt20(struct super_block *sb, uint32_t block,
+				      uint16_t partition, uint32_t offset)
+{
+	return udf_get_pblock_virt15(sb, block, partition, offset);
+}
+
+uint32_t udf_get_pblock_spar15(struct super_block *sb, uint32_t block,
+			       uint16_t partition, uint32_t offset)
+{
+	int i;
+	struct sparingTable *st = NULL;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map;
+	uint32_t packet;
+	struct udf_sparing_data *sdata;
+
+	map = &sbi->s_partmaps[partition];
+	sdata = &map->s_type_specific.s_sparing;
+	packet = (block + offset) & ~(sdata->s_packet_len - 1);
+
+	for (i = 0; i < 4; i++) {
+		if (sdata->s_spar_map[i] != NULL) {
+			st = (struct sparingTable *)
+					sdata->s_spar_map[i]->b_data;
+			break;
+		}
+	}
+
+	if (st) {
+		for (i = 0; i < le16_to_cpu(st->reallocationTableLen); i++) {
+			struct sparingEntry *entry = &st->mapEntry[i];
+			u32 origLoc = le32_to_cpu(entry->origLocation);
+			if (origLoc >= 0xFFFFFFF0)
+				break;
+			else if (origLoc == packet)
+				return le32_to_cpu(entry->mappedLocation) +
+					((block + offset) &
+						(sdata->s_packet_len - 1));
+			else if (origLoc > packet)
+				break;
+		}
+	}
+
+	return map->s_partition_root + block + offset;
+}
+
+int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
+{
+	struct udf_sparing_data *sdata;
+	struct sparingTable *st = NULL;
+	struct sparingEntry mapEntry;
+	uint32_t packet;
+	int i, j, k, l;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	u16 reallocationTableLen;
+	struct buffer_head *bh;
+	int ret = 0;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	for (i = 0; i < sbi->s_partitions; i++) {
+		struct udf_part_map *map = &sbi->s_partmaps[i];
+		if (old_block > map->s_partition_root &&
+		    old_block < map->s_partition_root + map->s_partition_len) {
+			sdata = &map->s_type_specific.s_sparing;
+			packet = (old_block - map->s_partition_root) &
+						~(sdata->s_packet_len - 1);
+
+			for (j = 0; j < 4; j++)
+				if (sdata->s_spar_map[j] != NULL) {
+					st = (struct sparingTable *)
+						sdata->s_spar_map[j]->b_data;
+					break;
+				}
+
+			if (!st) {
+				ret = 1;
+				goto out;
+			}
+
+			reallocationTableLen =
+					le16_to_cpu(st->reallocationTableLen);
+			for (k = 0; k < reallocationTableLen; k++) {
+				struct sparingEntry *entry = &st->mapEntry[k];
+				u32 origLoc = le32_to_cpu(entry->origLocation);
+
+				if (origLoc == 0xFFFFFFFF) {
+					for (; j < 4; j++) {
+						int len;
+						bh = sdata->s_spar_map[j];
+						if (!bh)
+							continue;
+
+						st = (struct sparingTable *)
+								bh->b_data;
+						entry->origLocation =
+							cpu_to_le32(packet);
+						len =
+						  sizeof(struct sparingTable) +
+						  reallocationTableLen *
+						  sizeof(struct sparingEntry);
+						udf_update_tag((char *)st, len);
+						mark_buffer_dirty(bh);
+					}
+					*new_block = le32_to_cpu(
+							entry->mappedLocation) +
+						     ((old_block -
+							map->s_partition_root) &
+						     (sdata->s_packet_len - 1));
+					ret = 0;
+					goto out;
+				} else if (origLoc == packet) {
+					*new_block = le32_to_cpu(
+							entry->mappedLocation) +
+						     ((old_block -
+							map->s_partition_root) &
+						     (sdata->s_packet_len - 1));
+					ret = 0;
+					goto out;
+				} else if (origLoc > packet)
+					break;
+			}
+
+			for (l = k; l < reallocationTableLen; l++) {
+				struct sparingEntry *entry = &st->mapEntry[l];
+				u32 origLoc = le32_to_cpu(entry->origLocation);
+
+				if (origLoc != 0xFFFFFFFF)
+					continue;
+
+				for (; j < 4; j++) {
+					bh = sdata->s_spar_map[j];
+					if (!bh)
+						continue;
+
+					st = (struct sparingTable *)bh->b_data;
+					mapEntry = st->mapEntry[l];
+					mapEntry.origLocation =
+							cpu_to_le32(packet);
+					memmove(&st->mapEntry[k + 1],
+						&st->mapEntry[k],
+						(l - k) *
+						sizeof(struct sparingEntry));
+					st->mapEntry[k] = mapEntry;
+					udf_update_tag((char *)st,
+						sizeof(struct sparingTable) +
+						reallocationTableLen *
+						sizeof(struct sparingEntry));
+					mark_buffer_dirty(bh);
+				}
+				*new_block =
+					le32_to_cpu(
+					      st->mapEntry[k].mappedLocation) +
+					((old_block - map->s_partition_root) &
+					 (sdata->s_packet_len - 1));
+				ret = 0;
+				goto out;
+			}
+
+			ret = 1;
+			goto out;
+		} /* if old_block */
+	}
+
+	if (i == sbi->s_partitions) {
+		/* outside of partitions */
+		/* for now, fail =) */
+		ret = 1;
+	}
+
+out:
+	mutex_unlock(&sbi->s_alloc_mutex);
+	return ret;
+}
+
+static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
+					uint16_t partition, uint32_t offset)
+{
+	struct super_block *sb = inode->i_sb;
+	struct udf_part_map *map;
+	struct kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t ext_offset;
+	struct extent_position epos = {};
+	uint32_t phyblock;
+
+	if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) !=
+						(EXT_RECORDED_ALLOCATED >> 30))
+		phyblock = 0xFFFFFFFF;
+	else {
+		map = &UDF_SB(sb)->s_partmaps[partition];
+		/* map to sparable/physical partition desc */
+		phyblock = udf_get_pblock(sb, eloc.logicalBlockNum,
+			map->s_partition_num, ext_offset + offset);
+	}
+
+	brelse(epos.bh);
+	return phyblock;
+}
+
+uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
+				uint16_t partition, uint32_t offset)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map;
+	struct udf_meta_data *mdata;
+	uint32_t retblk;
+	struct inode *inode;
+
+	udf_debug("READING from METADATA\n");
+
+	map = &sbi->s_partmaps[partition];
+	mdata = &map->s_type_specific.s_metadata;
+	inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe;
+
+	/* We shouldn't mount such media... */
+	BUG_ON(!inode);
+	retblk = udf_try_read_meta(inode, block, partition, offset);
+	if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) {
+		udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n");
+		if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) {
+			mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
+				mdata->s_mirror_file_loc, map->s_partition_num);
+			mdata->s_flags |= MF_MIRROR_FE_LOADED;
+		}
+
+		inode = mdata->s_mirror_fe;
+		if (!inode)
+			return 0xFFFFFFFF;
+		retblk = udf_try_read_meta(inode, block, partition, offset);
+	}
+
+	return retblk;
+}
diff --git a/kernel/fs/udf/super.c b/kernel/fs/udf/super.c
new file mode 100644
index 000000000..6299f3419
--- /dev/null
+++ b/kernel/fs/udf/super.c
@@ -0,0 +1,2469 @@
+/*
+ * super.c
+ *
+ * PURPOSE
+ *  Super block routines for the OSTA-UDF(tm) filesystem.
+ *
+ * DESCRIPTION
+ *  OSTA-UDF(tm) = Optical Storage Technology Association
+ *  Universal Disk Format.
+ *
+ *  This code is based on version 2.00 of the UDF specification,
+ *  and revision 3 of the ECMA 167 standard [equivalent to ISO 13346].
+ *    http://www.osta.org/
+ *    http://www.ecma.ch/
+ *    http://www.iso.org/
+ *
+ * COPYRIGHT
+ *  This file is distributed under the terms of the GNU General Public
+ *  License (GPL). Copies of the GPL can be obtained from:
+ *    ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *  Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998 Dave Boynton
+ *  (C) 1998-2004 Ben Fennema
+ *  (C) 2000 Stelias Computing Inc
+ *
+ * HISTORY
+ *
+ *  09/24/98 dgb  changed to allow compiling outside of kernel, and
+ *                added some debugging.
+ *  10/01/98 dgb  updated to allow (some) possibility of compiling w/2.0.34
+ *  10/16/98      attempting some multi-session support
+ *  10/17/98      added freespace count for "df"
+ *  11/11/98 gr   added novrs option
+ *  11/26/98 dgb  added fileset,anchor mount options
+ *  12/06/98 blf  really hosed things royally. vat/sparing support. sequenced
+ *                vol descs. rewrote option handling based on isofs
+ *  12/20/98      find the free space bitmap (if it exists)
+ */
+
+#include "udfdecl.h"
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <linux/stat.h>
+#include <linux/cdrom.h>
+#include <linux/nls.h>
+#include <linux/vfs.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/bitmap.h>
+#include <linux/crc-itu-t.h>
+#include <linux/log2.h>
+#include <asm/byteorder.h>
+
+#include "udf_sb.h"
+#include "udf_i.h"
+
+#include <linux/init.h>
+#include <linux/uaccess.h>
+
+#define VDS_POS_PRIMARY_VOL_DESC	0
+#define VDS_POS_UNALLOC_SPACE_DESC	1
+#define VDS_POS_LOGICAL_VOL_DESC	2
+#define VDS_POS_PARTITION_DESC		3
+#define VDS_POS_IMP_USE_VOL_DESC	4
+#define VDS_POS_VOL_DESC_PTR		5
+#define VDS_POS_TERMINATING_DESC	6
+#define VDS_POS_LENGTH			7
+
+#define UDF_DEFAULT_BLOCKSIZE 2048
+
+#define VSD_FIRST_SECTOR_OFFSET		32768
+#define VSD_MAX_SECTOR_OFFSET		0x800000
+
+enum { UDF_MAX_LINKS = 0xffff };
+
+/* These are the "meat" - everything else is stuffing */
+static int udf_fill_super(struct super_block *, void *, int);
+static void udf_put_super(struct super_block *);
+static int udf_sync_fs(struct super_block *, int);
+static int udf_remount_fs(struct super_block *, int *, char *);
+static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
+static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
+			    struct kernel_lb_addr *);
+static void udf_load_fileset(struct super_block *, struct buffer_head *,
+			     struct kernel_lb_addr *);
+static void udf_open_lvid(struct super_block *);
+static void udf_close_lvid(struct super_block *);
+static unsigned int udf_count_free(struct super_block *);
+static int udf_statfs(struct dentry *, struct kstatfs *);
+static int udf_show_options(struct seq_file *, struct dentry *);
+
+struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
+{
+	struct logicalVolIntegrityDesc *lvid;
+	unsigned int partnum;
+	unsigned int offset;
+
+	if (!UDF_SB(sb)->s_lvid_bh)
+		return NULL;
+	lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data;
+	partnum = le32_to_cpu(lvid->numOfPartitions);
+	if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) -
+	     offsetof(struct logicalVolIntegrityDesc, impUse)) /
+	     (2 * sizeof(uint32_t)) < partnum) {
+		udf_err(sb, "Logical volume integrity descriptor corrupted "
+			"(numOfPartitions = %u)!\n", partnum);
+		return NULL;
+	}
+	/* The offset is to skip freeSpaceTable and sizeTable arrays */
+	offset = partnum * 2 * sizeof(uint32_t);
+	return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]);
+}
+
+/* UDF filesystem type */
+static struct dentry *udf_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+}
+
+static struct file_system_type udf_fstype = {
+	.owner		= THIS_MODULE,
+	.name		= "udf",
+	.mount		= udf_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("udf");
+
+static struct kmem_cache *udf_inode_cachep;
+
+static struct inode *udf_alloc_inode(struct super_block *sb)
+{
+	struct udf_inode_info *ei;
+	ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+
+	ei->i_unique = 0;
+	ei->i_lenExtents = 0;
+	ei->i_next_alloc_block = 0;
+	ei->i_next_alloc_goal = 0;
+	ei->i_strat4096 = 0;
+	init_rwsem(&ei->i_data_sem);
+	ei->cached_extent.lstart = -1;
+	spin_lock_init(&ei->i_extent_cache_lock);
+
+	return &ei->vfs_inode;
+}
+
+static void udf_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(udf_inode_cachep, UDF_I(inode));
+}
+
+static void udf_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, udf_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct udf_inode_info *ei = (struct udf_inode_info *)foo;
+
+	ei->i_ext.i_data = NULL;
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	udf_inode_cachep = kmem_cache_create("udf_inode_cache",
+					     sizeof(struct udf_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT |
+						 SLAB_MEM_SPREAD),
+					     init_once);
+	if (!udf_inode_cachep)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(udf_inode_cachep);
+}
+
+/* Superblock operations */
+static const struct super_operations udf_sb_ops = {
+	.alloc_inode	= udf_alloc_inode,
+	.destroy_inode	= udf_destroy_inode,
+	.write_inode	= udf_write_inode,
+	.evict_inode	= udf_evict_inode,
+	.put_super	= udf_put_super,
+	.sync_fs	= udf_sync_fs,
+	.statfs		= udf_statfs,
+	.remount_fs	= udf_remount_fs,
+	.show_options	= udf_show_options,
+};
+
+struct udf_options {
+	unsigned char novrs;
+	unsigned int blocksize;
+	unsigned int session;
+	unsigned int lastblock;
+	unsigned int anchor;
+	unsigned int volume;
+	unsigned short partition;
+	unsigned int fileset;
+	unsigned int rootdir;
+	unsigned int flags;
+	umode_t umask;
+	kgid_t gid;
+	kuid_t uid;
+	umode_t fmode;
+	umode_t dmode;
+	struct nls_table *nls_map;
+};
+
+static int __init init_udf_fs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&udf_fstype);
+	if (err)
+		goto out;
+
+	return 0;
+
+out:
+	destroy_inodecache();
+
+out1:
+	return err;
+}
+
+static void __exit exit_udf_fs(void)
+{
+	unregister_filesystem(&udf_fstype);
+	destroy_inodecache();
+}
+
+module_init(init_udf_fs)
+module_exit(exit_udf_fs)
+
+static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map),
+				  GFP_KERNEL);
+	if (!sbi->s_partmaps) {
+		udf_err(sb, "Unable to allocate space for %d partition maps\n",
+			count);
+		sbi->s_partitions = 0;
+		return -ENOMEM;
+	}
+
+	sbi->s_partitions = count;
+	return 0;
+}
+
+static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
+{
+	int i;
+	int nr_groups = bitmap->s_nr_groups;
+	int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
+						nr_groups);
+
+	for (i = 0; i < nr_groups; i++)
+		if (bitmap->s_block_bitmap[i])
+			brelse(bitmap->s_block_bitmap[i]);
+
+	if (size <= PAGE_SIZE)
+		kfree(bitmap);
+	else
+		vfree(bitmap);
+}
+
+static void udf_free_partition(struct udf_part_map *map)
+{
+	int i;
+	struct udf_meta_data *mdata;
+
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
+		iput(map->s_uspace.s_table);
+	if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
+		iput(map->s_fspace.s_table);
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
+		udf_sb_free_bitmap(map->s_uspace.s_bitmap);
+	if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
+		udf_sb_free_bitmap(map->s_fspace.s_bitmap);
+	if (map->s_partition_type == UDF_SPARABLE_MAP15)
+		for (i = 0; i < 4; i++)
+			brelse(map->s_type_specific.s_sparing.s_spar_map[i]);
+	else if (map->s_partition_type == UDF_METADATA_MAP25) {
+		mdata = &map->s_type_specific.s_metadata;
+		iput(mdata->s_metadata_fe);
+		mdata->s_metadata_fe = NULL;
+
+		iput(mdata->s_mirror_fe);
+		mdata->s_mirror_fe = NULL;
+
+		iput(mdata->s_bitmap_fe);
+		mdata->s_bitmap_fe = NULL;
+	}
+}
+
+static void udf_sb_free_partitions(struct super_block *sb)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	int i;
+	if (sbi->s_partmaps == NULL)
+		return;
+	for (i = 0; i < sbi->s_partitions; i++)
+		udf_free_partition(&sbi->s_partmaps[i]);
+	kfree(sbi->s_partmaps);
+	sbi->s_partmaps = NULL;
+}
+
+static int udf_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
+		seq_puts(seq, ",nostrict");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
+		seq_printf(seq, ",bs=%lu", sb->s_blocksize);
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
+		seq_puts(seq, ",unhide");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
+		seq_puts(seq, ",undelete");
+	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB))
+		seq_puts(seq, ",noadinicb");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_USE_SHORT_AD))
+		seq_puts(seq, ",shortad");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET))
+		seq_puts(seq, ",uid=forget");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_IGNORE))
+		seq_puts(seq, ",uid=ignore");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET))
+		seq_puts(seq, ",gid=forget");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE))
+		seq_puts(seq, ",gid=ignore");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
+		seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->s_uid));
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
+		seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->s_gid));
+	if (sbi->s_umask != 0)
+		seq_printf(seq, ",umask=%ho", sbi->s_umask);
+	if (sbi->s_fmode != UDF_INVALID_MODE)
+		seq_printf(seq, ",mode=%ho", sbi->s_fmode);
+	if (sbi->s_dmode != UDF_INVALID_MODE)
+		seq_printf(seq, ",dmode=%ho", sbi->s_dmode);
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
+		seq_printf(seq, ",session=%u", sbi->s_session);
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
+		seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
+	if (sbi->s_anchor != 0)
+		seq_printf(seq, ",anchor=%u", sbi->s_anchor);
+	/*
+	 * volume, partition, fileset and rootdir seem to be ignored
+	 * currently
+	 */
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
+		seq_puts(seq, ",utf8");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map)
+		seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset);
+
+	return 0;
+}
+
+/*
+ * udf_parse_options
+ *
+ * PURPOSE
+ *	Parse mount options.
+ *
+ * DESCRIPTION
+ *	The following mount options are supported:
+ *
+ *	gid=		Set the default group.
+ *	umask=		Set the default umask.
+ *	mode=		Set the default file permissions.
+ *	dmode=		Set the default directory permissions.
+ *	uid=		Set the default user.
+ *	bs=		Set the block size.
+ *	unhide		Show otherwise hidden files.
+ *	undelete	Show deleted files in lists.
+ *	adinicb		Embed data in the inode (default)
+ *	noadinicb	Don't embed data in the inode
+ *	shortad		Use short ad's
+ *	longad		Use long ad's (default)
+ *	nostrict	Unset strict conformance
+ *	iocharset=	Set the NLS character set
+ *
+ *	The remaining are for debugging and disaster recovery:
+ *
+ *	novrs		Skip volume sequence recognition
+ *
+ *	The following expect a offset from 0.
+ *
+ *	session=	Set the CDROM session (default= last session)
+ *	anchor=		Override standard anchor location. (default= 256)
+ *	volume=		Override the VolumeDesc location. (unused)
+ *	partition=	Override the PartitionDesc location. (unused)
+ *	lastblock=	Set the last block of the filesystem/
+ *
+ *	The following expect a offset from the partition root.
+ *
+ *	fileset=	Override the fileset block location. (unused)
+ *	rootdir=	Override the root directory location. (unused)
+ *		WARNING: overriding the rootdir to a non-directory may
+ *		yield highly unpredictable results.
+ *
+ * PRE-CONDITIONS
+ *	options		Pointer to mount options string.
+ *	uopts		Pointer to mount options variable.
+ *
+ * POST-CONDITIONS
+ *	<return>	1	Mount options parsed okay.
+ *	<return>	0	Error parsing mount options.
+ *
+ * HISTORY
+ *	July 1, 1997 - Andrew E. Mileski
+ *	Written, tested, and released.
+ */
+
+enum {
+	Opt_novrs, Opt_nostrict, Opt_bs, Opt_unhide, Opt_undelete,
+	Opt_noadinicb, Opt_adinicb, Opt_shortad, Opt_longad,
+	Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
+	Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
+	Opt_rootdir, Opt_utf8, Opt_iocharset,
+	Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
+	Opt_fmode, Opt_dmode
+};
+
+static const match_table_t tokens = {
+	{Opt_novrs,	"novrs"},
+	{Opt_nostrict,	"nostrict"},
+	{Opt_bs,	"bs=%u"},
+	{Opt_unhide,	"unhide"},
+	{Opt_undelete,	"undelete"},
+	{Opt_noadinicb,	"noadinicb"},
+	{Opt_adinicb,	"adinicb"},
+	{Opt_shortad,	"shortad"},
+	{Opt_longad,	"longad"},
+	{Opt_uforget,	"uid=forget"},
+	{Opt_uignore,	"uid=ignore"},
+	{Opt_gforget,	"gid=forget"},
+	{Opt_gignore,	"gid=ignore"},
+	{Opt_gid,	"gid=%u"},
+	{Opt_uid,	"uid=%u"},
+	{Opt_umask,	"umask=%o"},
+	{Opt_session,	"session=%u"},
+	{Opt_lastblock,	"lastblock=%u"},
+	{Opt_anchor,	"anchor=%u"},
+	{Opt_volume,	"volume=%u"},
+	{Opt_partition,	"partition=%u"},
+	{Opt_fileset,	"fileset=%u"},
+	{Opt_rootdir,	"rootdir=%u"},
+	{Opt_utf8,	"utf8"},
+	{Opt_iocharset,	"iocharset=%s"},
+	{Opt_fmode,     "mode=%o"},
+	{Opt_dmode,     "dmode=%o"},
+	{Opt_err,	NULL}
+};
+
+static int udf_parse_options(char *options, struct udf_options *uopt,
+			     bool remount)
+{
+	char *p;
+	int option;
+
+	uopt->novrs = 0;
+	uopt->partition = 0xFFFF;
+	uopt->session = 0xFFFFFFFF;
+	uopt->lastblock = 0;
+	uopt->anchor = 0;
+	uopt->volume = 0xFFFFFFFF;
+	uopt->rootdir = 0xFFFFFFFF;
+	uopt->fileset = 0xFFFFFFFF;
+	uopt->nls_map = NULL;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		int token;
+		unsigned n;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_novrs:
+			uopt->novrs = 1;
+			break;
+		case Opt_bs:
+			if (match_int(&args[0], &option))
+				return 0;
+			n = option;
+			if (n != 512 && n != 1024 && n != 2048 && n != 4096)
+				return 0;
+			uopt->blocksize = n;
+			uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
+			break;
+		case Opt_unhide:
+			uopt->flags |= (1 << UDF_FLAG_UNHIDE);
+			break;
+		case Opt_undelete:
+			uopt->flags |= (1 << UDF_FLAG_UNDELETE);
+			break;
+		case Opt_noadinicb:
+			uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB);
+			break;
+		case Opt_adinicb:
+			uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB);
+			break;
+		case Opt_shortad:
+			uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD);
+			break;
+		case Opt_longad:
+			uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
+			break;
+		case Opt_gid:
+			if (match_int(args, &option))
+				return 0;
+			uopt->gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(uopt->gid))
+				return 0;
+			uopt->flags |= (1 << UDF_FLAG_GID_SET);
+			break;
+		case Opt_uid:
+			if (match_int(args, &option))
+				return 0;
+			uopt->uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(uopt->uid))
+				return 0;
+			uopt->flags |= (1 << UDF_FLAG_UID_SET);
+			break;
+		case Opt_umask:
+			if (match_octal(args, &option))
+				return 0;
+			uopt->umask = option;
+			break;
+		case Opt_nostrict:
+			uopt->flags &= ~(1 << UDF_FLAG_STRICT);
+			break;
+		case Opt_session:
+			if (match_int(args, &option))
+				return 0;
+			uopt->session = option;
+			if (!remount)
+				uopt->flags |= (1 << UDF_FLAG_SESSION_SET);
+			break;
+		case Opt_lastblock:
+			if (match_int(args, &option))
+				return 0;
+			uopt->lastblock = option;
+			if (!remount)
+				uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET);
+			break;
+		case Opt_anchor:
+			if (match_int(args, &option))
+				return 0;
+			uopt->anchor = option;
+			break;
+		case Opt_volume:
+			if (match_int(args, &option))
+				return 0;
+			uopt->volume = option;
+			break;
+		case Opt_partition:
+			if (match_int(args, &option))
+				return 0;
+			uopt->partition = option;
+			break;
+		case Opt_fileset:
+			if (match_int(args, &option))
+				return 0;
+			uopt->fileset = option;
+			break;
+		case Opt_rootdir:
+			if (match_int(args, &option))
+				return 0;
+			uopt->rootdir = option;
+			break;
+		case Opt_utf8:
+			uopt->flags |= (1 << UDF_FLAG_UTF8);
+			break;
+#ifdef CONFIG_UDF_NLS
+		case Opt_iocharset:
+			uopt->nls_map = load_nls(args[0].from);
+			uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
+			break;
+#endif
+		case Opt_uignore:
+			uopt->flags |= (1 << UDF_FLAG_UID_IGNORE);
+			break;
+		case Opt_uforget:
+			uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
+			break;
+		case Opt_gignore:
+			uopt->flags |= (1 << UDF_FLAG_GID_IGNORE);
+			break;
+		case Opt_gforget:
+			uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
+			break;
+		case Opt_fmode:
+			if (match_octal(args, &option))
+				return 0;
+			uopt->fmode = option & 0777;
+			break;
+		case Opt_dmode:
+			if (match_octal(args, &option))
+				return 0;
+			uopt->dmode = option & 0777;
+			break;
+		default:
+			pr_err("bad mount option \"%s\" or missing value\n", p);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
+{
+	struct udf_options uopt;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	int error = 0;
+	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
+
+	sync_filesystem(sb);
+	if (lvidiu) {
+		int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
+		if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
+			return -EACCES;
+	}
+
+	uopt.flags = sbi->s_flags;
+	uopt.uid   = sbi->s_uid;
+	uopt.gid   = sbi->s_gid;
+	uopt.umask = sbi->s_umask;
+	uopt.fmode = sbi->s_fmode;
+	uopt.dmode = sbi->s_dmode;
+
+	if (!udf_parse_options(options, &uopt, true))
+		return -EINVAL;
+
+	write_lock(&sbi->s_cred_lock);
+	sbi->s_flags = uopt.flags;
+	sbi->s_uid   = uopt.uid;
+	sbi->s_gid   = uopt.gid;
+	sbi->s_umask = uopt.umask;
+	sbi->s_fmode = uopt.fmode;
+	sbi->s_dmode = uopt.dmode;
+	write_unlock(&sbi->s_cred_lock);
+
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out_unlock;
+
+	if (*flags & MS_RDONLY)
+		udf_close_lvid(sb);
+	else
+		udf_open_lvid(sb);
+
+out_unlock:
+	return error;
+}
+
+/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
+/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
+static loff_t udf_check_vsd(struct super_block *sb)
+{
+	struct volStructDesc *vsd = NULL;
+	loff_t sector = VSD_FIRST_SECTOR_OFFSET;
+	int sectorsize;
+	struct buffer_head *bh = NULL;
+	int nsr02 = 0;
+	int nsr03 = 0;
+	struct udf_sb_info *sbi;
+
+	sbi = UDF_SB(sb);
+	if (sb->s_blocksize < sizeof(struct volStructDesc))
+		sectorsize = sizeof(struct volStructDesc);
+	else
+		sectorsize = sb->s_blocksize;
+
+	sector += (sbi->s_session << sb->s_blocksize_bits);
+
+	udf_debug("Starting at sector %u (%ld byte sectors)\n",
+		  (unsigned int)(sector >> sb->s_blocksize_bits),
+		  sb->s_blocksize);
+	/* Process the sequence (if applicable). The hard limit on the sector
+	 * offset is arbitrary, hopefully large enough so that all valid UDF
+	 * filesystems will be recognised. There is no mention of an upper
+	 * bound to the size of the volume recognition area in the standard.
+	 *  The limit will prevent the code to read all the sectors of a
+	 * specially crafted image (like a bluray disc full of CD001 sectors),
+	 * potentially causing minutes or even hours of uninterruptible I/O
+	 * activity. This actually happened with uninitialised SSD partitions
+	 * (all 0xFF) before the check for the limit and all valid IDs were
+	 * added */
+	for (; !nsr02 && !nsr03 && sector < VSD_MAX_SECTOR_OFFSET;
+	     sector += sectorsize) {
+		/* Read a block */
+		bh = udf_tread(sb, sector >> sb->s_blocksize_bits);
+		if (!bh)
+			break;
+
+		/* Look for ISO  descriptors */
+		vsd = (struct volStructDesc *)(bh->b_data +
+					      (sector & (sb->s_blocksize - 1)));
+
+		if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
+				    VSD_STD_ID_LEN)) {
+			switch (vsd->structType) {
+			case 0:
+				udf_debug("ISO9660 Boot Record found\n");
+				break;
+			case 1:
+				udf_debug("ISO9660 Primary Volume Descriptor found\n");
+				break;
+			case 2:
+				udf_debug("ISO9660 Supplementary Volume Descriptor found\n");
+				break;
+			case 3:
+				udf_debug("ISO9660 Volume Partition Descriptor found\n");
+				break;
+			case 255:
+				udf_debug("ISO9660 Volume Descriptor Set Terminator found\n");
+				break;
+			default:
+				udf_debug("ISO9660 VRS (%u) found\n",
+					  vsd->structType);
+				break;
+			}
+		} else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BEA01,
+				    VSD_STD_ID_LEN))
+			; /* nothing */
+		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_TEA01,
+				    VSD_STD_ID_LEN)) {
+			brelse(bh);
+			break;
+		} else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR02,
+				    VSD_STD_ID_LEN))
+			nsr02 = sector;
+		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03,
+				    VSD_STD_ID_LEN))
+			nsr03 = sector;
+		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BOOT2,
+				    VSD_STD_ID_LEN))
+			; /* nothing */
+		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CDW02,
+				    VSD_STD_ID_LEN))
+			; /* nothing */
+		else {
+			/* invalid id : end of volume recognition area */
+			brelse(bh);
+			break;
+		}
+		brelse(bh);
+	}
+
+	if (nsr03)
+		return nsr03;
+	else if (nsr02)
+		return nsr02;
+	else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) ==
+			VSD_FIRST_SECTOR_OFFSET)
+		return -1;
+	else
+		return 0;
+}
+
+static int udf_find_fileset(struct super_block *sb,
+			    struct kernel_lb_addr *fileset,
+			    struct kernel_lb_addr *root)
+{
+	struct buffer_head *bh = NULL;
+	long lastblock;
+	uint16_t ident;
+	struct udf_sb_info *sbi;
+
+	if (fileset->logicalBlockNum != 0xFFFFFFFF ||
+	    fileset->partitionReferenceNum != 0xFFFF) {
+		bh = udf_read_ptagged(sb, fileset, 0, &ident);
+
+		if (!bh) {
+			return 1;
+		} else if (ident != TAG_IDENT_FSD) {
+			brelse(bh);
+			return 1;
+		}
+
+	}
+
+	sbi = UDF_SB(sb);
+	if (!bh) {
+		/* Search backwards through the partitions */
+		struct kernel_lb_addr newfileset;
+
+/* --> cvg: FIXME - is it reasonable? */
+		return 1;
+
+		for (newfileset.partitionReferenceNum = sbi->s_partitions - 1;
+		     (newfileset.partitionReferenceNum != 0xFFFF &&
+		      fileset->logicalBlockNum == 0xFFFFFFFF &&
+		      fileset->partitionReferenceNum == 0xFFFF);
+		     newfileset.partitionReferenceNum--) {
+			lastblock = sbi->s_partmaps
+					[newfileset.partitionReferenceNum]
+						.s_partition_len;
+			newfileset.logicalBlockNum = 0;
+
+			do {
+				bh = udf_read_ptagged(sb, &newfileset, 0,
+						      &ident);
+				if (!bh) {
+					newfileset.logicalBlockNum++;
+					continue;
+				}
+
+				switch (ident) {
+				case TAG_IDENT_SBD:
+				{
+					struct spaceBitmapDesc *sp;
+					sp = (struct spaceBitmapDesc *)
+								bh->b_data;
+					newfileset.logicalBlockNum += 1 +
+						((le32_to_cpu(sp->numOfBytes) +
+						  sizeof(struct spaceBitmapDesc)
+						  - 1) >> sb->s_blocksize_bits);
+					brelse(bh);
+					break;
+				}
+				case TAG_IDENT_FSD:
+					*fileset = newfileset;
+					break;
+				default:
+					newfileset.logicalBlockNum++;
+					brelse(bh);
+					bh = NULL;
+					break;
+				}
+			} while (newfileset.logicalBlockNum < lastblock &&
+				 fileset->logicalBlockNum == 0xFFFFFFFF &&
+				 fileset->partitionReferenceNum == 0xFFFF);
+		}
+	}
+
+	if ((fileset->logicalBlockNum != 0xFFFFFFFF ||
+	     fileset->partitionReferenceNum != 0xFFFF) && bh) {
+		udf_debug("Fileset at block=%d, partition=%d\n",
+			  fileset->logicalBlockNum,
+			  fileset->partitionReferenceNum);
+
+		sbi->s_partition = fileset->partitionReferenceNum;
+		udf_load_fileset(sb, bh, root);
+		brelse(bh);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Load primary Volume Descriptor Sequence
+ *
+ * Return <0 on error, 0 on success. -EAGAIN is special meaning next sequence
+ * should be tried.
+ */
+static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
+{
+	struct primaryVolDesc *pvoldesc;
+	struct ustr *instr, *outstr;
+	struct buffer_head *bh;
+	uint16_t ident;
+	int ret = -ENOMEM;
+
+	instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!instr)
+		return -ENOMEM;
+
+	outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!outstr)
+		goto out1;
+
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh) {
+		ret = -EAGAIN;
+		goto out2;
+	}
+
+	if (ident != TAG_IDENT_PVD) {
+		ret = -EIO;
+		goto out_bh;
+	}
+
+	pvoldesc = (struct primaryVolDesc *)bh->b_data;
+
+	if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
+			      pvoldesc->recordingDateAndTime)) {
+#ifdef UDFFS_DEBUG
+		struct timestamp *ts = &pvoldesc->recordingDateAndTime;
+		udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n",
+			  le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
+			  ts->minute, le16_to_cpu(ts->typeAndTimezone));
+#endif
+	}
+
+	if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
+		if (udf_CS0toUTF8(outstr, instr)) {
+			strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
+				outstr->u_len > 31 ? 31 : outstr->u_len);
+			udf_debug("volIdent[] = '%s'\n",
+				  UDF_SB(sb)->s_volume_ident);
+		}
+
+	if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
+		if (udf_CS0toUTF8(outstr, instr))
+			udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
+
+	ret = 0;
+out_bh:
+	brelse(bh);
+out2:
+	kfree(outstr);
+out1:
+	kfree(instr);
+	return ret;
+}
+
+struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
+					u32 meta_file_loc, u32 partition_num)
+{
+	struct kernel_lb_addr addr;
+	struct inode *metadata_fe;
+
+	addr.logicalBlockNum = meta_file_loc;
+	addr.partitionReferenceNum = partition_num;
+
+	metadata_fe = udf_iget_special(sb, &addr);
+
+	if (IS_ERR(metadata_fe)) {
+		udf_warn(sb, "metadata inode efe not found\n");
+		return metadata_fe;
+	}
+	if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
+		udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
+		iput(metadata_fe);
+		return ERR_PTR(-EIO);
+	}
+
+	return metadata_fe;
+}
+
+static int udf_load_metadata_files(struct super_block *sb, int partition)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map;
+	struct udf_meta_data *mdata;
+	struct kernel_lb_addr addr;
+	struct inode *fe;
+
+	map = &sbi->s_partmaps[partition];
+	mdata = &map->s_type_specific.s_metadata;
+
+	/* metadata address */
+	udf_debug("Metadata file location: block = %d part = %d\n",
+		  mdata->s_meta_file_loc, map->s_partition_num);
+
+	fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
+					 map->s_partition_num);
+	if (IS_ERR(fe)) {
+		/* mirror file entry */
+		udf_debug("Mirror metadata file location: block = %d part = %d\n",
+			  mdata->s_mirror_file_loc, map->s_partition_num);
+
+		fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc,
+						 map->s_partition_num);
+
+		if (IS_ERR(fe)) {
+			udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
+			return PTR_ERR(fe);
+		}
+		mdata->s_mirror_fe = fe;
+	} else
+		mdata->s_metadata_fe = fe;
+
+
+	/*
+	 * bitmap file entry
+	 * Note:
+	 * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102)
+	*/
+	if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) {
+		addr.logicalBlockNum = mdata->s_bitmap_file_loc;
+		addr.partitionReferenceNum = map->s_partition_num;
+
+		udf_debug("Bitmap file location: block = %d part = %d\n",
+			  addr.logicalBlockNum, addr.partitionReferenceNum);
+
+		fe = udf_iget_special(sb, &addr);
+		if (IS_ERR(fe)) {
+			if (sb->s_flags & MS_RDONLY)
+				udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
+			else {
+				udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
+				return PTR_ERR(fe);
+			}
+		} else
+			mdata->s_bitmap_fe = fe;
+	}
+
+	udf_debug("udf_load_metadata_files Ok\n");
+	return 0;
+}
+
+static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
+			     struct kernel_lb_addr *root)
+{
+	struct fileSetDesc *fset;
+
+	fset = (struct fileSetDesc *)bh->b_data;
+
+	*root = lelb_to_cpu(fset->rootDirectoryICB.extLocation);
+
+	UDF_SB(sb)->s_serial_number = le16_to_cpu(fset->descTag.tagSerialNum);
+
+	udf_debug("Rootdir at block=%d, partition=%d\n",
+		  root->logicalBlockNum, root->partitionReferenceNum);
+}
+
+int udf_compute_nr_groups(struct super_block *sb, u32 partition)
+{
+	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+	return DIV_ROUND_UP(map->s_partition_len +
+			    (sizeof(struct spaceBitmapDesc) << 3),
+			    sb->s_blocksize * 8);
+}
+
+static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
+{
+	struct udf_bitmap *bitmap;
+	int nr_groups;
+	int size;
+
+	nr_groups = udf_compute_nr_groups(sb, index);
+	size = sizeof(struct udf_bitmap) +
+		(sizeof(struct buffer_head *) * nr_groups);
+
+	if (size <= PAGE_SIZE)
+		bitmap = kzalloc(size, GFP_KERNEL);
+	else
+		bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
+
+	if (bitmap == NULL)
+		return NULL;
+
+	bitmap->s_nr_groups = nr_groups;
+	return bitmap;
+}
+
+static int udf_fill_partdesc_info(struct super_block *sb,
+		struct partitionDesc *p, int p_index)
+{
+	struct udf_part_map *map;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct partitionHeaderDesc *phd;
+
+	map = &sbi->s_partmaps[p_index];
+
+	map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */
+	map->s_partition_root = le32_to_cpu(p->partitionStartingLocation);
+
+	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
+		map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY;
+	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE))
+		map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE;
+	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE))
+		map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE;
+	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
+		map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
+
+	udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n",
+		  p_index, map->s_partition_type,
+		  map->s_partition_root, map->s_partition_len);
+
+	if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
+	    strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+		return 0;
+
+	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
+	if (phd->unallocSpaceTable.extLength) {
+		struct kernel_lb_addr loc = {
+			.logicalBlockNum = le32_to_cpu(
+				phd->unallocSpaceTable.extPosition),
+			.partitionReferenceNum = p_index,
+		};
+		struct inode *inode;
+
+		inode = udf_iget_special(sb, &loc);
+		if (IS_ERR(inode)) {
+			udf_debug("cannot load unallocSpaceTable (part %d)\n",
+				  p_index);
+			return PTR_ERR(inode);
+		}
+		map->s_uspace.s_table = inode;
+		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
+		udf_debug("unallocSpaceTable (part %d) @ %ld\n",
+			  p_index, map->s_uspace.s_table->i_ino);
+	}
+
+	if (phd->unallocSpaceBitmap.extLength) {
+		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
+		if (!bitmap)
+			return -ENOMEM;
+		map->s_uspace.s_bitmap = bitmap;
+		bitmap->s_extPosition = le32_to_cpu(
+				phd->unallocSpaceBitmap.extPosition);
+		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
+		udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
+			  p_index, bitmap->s_extPosition);
+	}
+
+	if (phd->partitionIntegrityTable.extLength)
+		udf_debug("partitionIntegrityTable (part %d)\n", p_index);
+
+	if (phd->freedSpaceTable.extLength) {
+		struct kernel_lb_addr loc = {
+			.logicalBlockNum = le32_to_cpu(
+				phd->freedSpaceTable.extPosition),
+			.partitionReferenceNum = p_index,
+		};
+		struct inode *inode;
+
+		inode = udf_iget_special(sb, &loc);
+		if (IS_ERR(inode)) {
+			udf_debug("cannot load freedSpaceTable (part %d)\n",
+				  p_index);
+			return PTR_ERR(inode);
+		}
+		map->s_fspace.s_table = inode;
+		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
+		udf_debug("freedSpaceTable (part %d) @ %ld\n",
+			  p_index, map->s_fspace.s_table->i_ino);
+	}
+
+	if (phd->freedSpaceBitmap.extLength) {
+		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
+		if (!bitmap)
+			return -ENOMEM;
+		map->s_fspace.s_bitmap = bitmap;
+		bitmap->s_extPosition = le32_to_cpu(
+				phd->freedSpaceBitmap.extPosition);
+		map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
+		udf_debug("freedSpaceBitmap (part %d) @ %d\n",
+			  p_index, bitmap->s_extPosition);
+	}
+	return 0;
+}
+
+static void udf_find_vat_block(struct super_block *sb, int p_index,
+			       int type1_index, sector_t start_block)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map = &sbi->s_partmaps[p_index];
+	sector_t vat_block;
+	struct kernel_lb_addr ino;
+	struct inode *inode;
+
+	/*
+	 * VAT file entry is in the last recorded block. Some broken disks have
+	 * it a few blocks before so try a bit harder...
+	 */
+	ino.partitionReferenceNum = type1_index;
+	for (vat_block = start_block;
+	     vat_block >= map->s_partition_root &&
+	     vat_block >= start_block - 3; vat_block--) {
+		ino.logicalBlockNum = vat_block - map->s_partition_root;
+		inode = udf_iget_special(sb, &ino);
+		if (!IS_ERR(inode)) {
+			sbi->s_vat_inode = inode;
+			break;
+		}
+	}
+}
+
+static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map = &sbi->s_partmaps[p_index];
+	struct buffer_head *bh = NULL;
+	struct udf_inode_info *vati;
+	uint32_t pos;
+	struct virtualAllocationTable20 *vat20;
+	sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+
+	udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
+	if (!sbi->s_vat_inode &&
+	    sbi->s_last_block != blocks - 1) {
+		pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n",
+			  (unsigned long)sbi->s_last_block,
+			  (unsigned long)blocks - 1);
+		udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
+	}
+	if (!sbi->s_vat_inode)
+		return -EIO;
+
+	if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
+		map->s_type_specific.s_virtual.s_start_offset = 0;
+		map->s_type_specific.s_virtual.s_num_entries =
+			(sbi->s_vat_inode->i_size - 36) >> 2;
+	} else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
+		vati = UDF_I(sbi->s_vat_inode);
+		if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+			pos = udf_block_map(sbi->s_vat_inode, 0);
+			bh = sb_bread(sb, pos);
+			if (!bh)
+				return -EIO;
+			vat20 = (struct virtualAllocationTable20 *)bh->b_data;
+		} else {
+			vat20 = (struct virtualAllocationTable20 *)
+							vati->i_ext.i_data;
+		}
+
+		map->s_type_specific.s_virtual.s_start_offset =
+			le16_to_cpu(vat20->lengthHeader);
+		map->s_type_specific.s_virtual.s_num_entries =
+			(sbi->s_vat_inode->i_size -
+				map->s_type_specific.s_virtual.
+					s_start_offset) >> 2;
+		brelse(bh);
+	}
+	return 0;
+}
+
+/*
+ * Load partition descriptor block
+ *
+ * Returns <0 on error, 0 on success, -EAGAIN is special - try next descriptor
+ * sequence.
+ */
+static int udf_load_partdesc(struct super_block *sb, sector_t block)
+{
+	struct buffer_head *bh;
+	struct partitionDesc *p;
+	struct udf_part_map *map;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	int i, type1_idx;
+	uint16_t partitionNumber;
+	uint16_t ident;
+	int ret;
+
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh)
+		return -EAGAIN;
+	if (ident != TAG_IDENT_PD) {
+		ret = 0;
+		goto out_bh;
+	}
+
+	p = (struct partitionDesc *)bh->b_data;
+	partitionNumber = le16_to_cpu(p->partitionNumber);
+
+	/* First scan for TYPE1, SPARABLE and METADATA partitions */
+	for (i = 0; i < sbi->s_partitions; i++) {
+		map = &sbi->s_partmaps[i];
+		udf_debug("Searching map: (%d == %d)\n",
+			  map->s_partition_num, partitionNumber);
+		if (map->s_partition_num == partitionNumber &&
+		    (map->s_partition_type == UDF_TYPE1_MAP15 ||
+		     map->s_partition_type == UDF_SPARABLE_MAP15))
+			break;
+	}
+
+	if (i >= sbi->s_partitions) {
+		udf_debug("Partition (%d) not found in partition map\n",
+			  partitionNumber);
+		ret = 0;
+		goto out_bh;
+	}
+
+	ret = udf_fill_partdesc_info(sb, p, i);
+	if (ret < 0)
+		goto out_bh;
+
+	/*
+	 * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and
+	 * PHYSICAL partitions are already set up
+	 */
+	type1_idx = i;
+#ifdef UDFFS_DEBUG
+	map = NULL; /* supress 'maybe used uninitialized' warning */
+#endif
+	for (i = 0; i < sbi->s_partitions; i++) {
+		map = &sbi->s_partmaps[i];
+
+		if (map->s_partition_num == partitionNumber &&
+		    (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
+		     map->s_partition_type == UDF_VIRTUAL_MAP20 ||
+		     map->s_partition_type == UDF_METADATA_MAP25))
+			break;
+	}
+
+	if (i >= sbi->s_partitions) {
+		ret = 0;
+		goto out_bh;
+	}
+
+	ret = udf_fill_partdesc_info(sb, p, i);
+	if (ret < 0)
+		goto out_bh;
+
+	if (map->s_partition_type == UDF_METADATA_MAP25) {
+		ret = udf_load_metadata_files(sb, i);
+		if (ret < 0) {
+			udf_err(sb, "error loading MetaData partition map %d\n",
+				i);
+			goto out_bh;
+		}
+	} else {
+		/*
+		 * If we have a partition with virtual map, we don't handle
+		 * writing to it (we overwrite blocks instead of relocating
+		 * them).
+		 */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			ret = -EACCES;
+			goto out_bh;
+		}
+		ret = udf_load_vat(sb, i, type1_idx);
+		if (ret < 0)
+			goto out_bh;
+	}
+	ret = 0;
+out_bh:
+	/* In case loading failed, we handle cleanup in udf_fill_super */
+	brelse(bh);
+	return ret;
+}
+
+static int udf_load_sparable_map(struct super_block *sb,
+				 struct udf_part_map *map,
+				 struct sparablePartitionMap *spm)
+{
+	uint32_t loc;
+	uint16_t ident;
+	struct sparingTable *st;
+	struct udf_sparing_data *sdata = &map->s_type_specific.s_sparing;
+	int i;
+	struct buffer_head *bh;
+
+	map->s_partition_type = UDF_SPARABLE_MAP15;
+	sdata->s_packet_len = le16_to_cpu(spm->packetLength);
+	if (!is_power_of_2(sdata->s_packet_len)) {
+		udf_err(sb, "error loading logical volume descriptor: "
+			"Invalid packet length %u\n",
+			(unsigned)sdata->s_packet_len);
+		return -EIO;
+	}
+	if (spm->numSparingTables > 4) {
+		udf_err(sb, "error loading logical volume descriptor: "
+			"Too many sparing tables (%d)\n",
+			(int)spm->numSparingTables);
+		return -EIO;
+	}
+
+	for (i = 0; i < spm->numSparingTables; i++) {
+		loc = le32_to_cpu(spm->locSparingTable[i]);
+		bh = udf_read_tagged(sb, loc, loc, &ident);
+		if (!bh)
+			continue;
+
+		st = (struct sparingTable *)bh->b_data;
+		if (ident != 0 ||
+		    strncmp(st->sparingIdent.ident, UDF_ID_SPARING,
+			    strlen(UDF_ID_SPARING)) ||
+		    sizeof(*st) + le16_to_cpu(st->reallocationTableLen) >
+							sb->s_blocksize) {
+			brelse(bh);
+			continue;
+		}
+
+		sdata->s_spar_map[i] = bh;
+	}
+	map->s_partition_func = udf_get_pblock_spar15;
+	return 0;
+}
+
+static int udf_load_logicalvol(struct super_block *sb, sector_t block,
+			       struct kernel_lb_addr *fileset)
+{
+	struct logicalVolDesc *lvd;
+	int i, offset;
+	uint8_t type;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct genericPartitionMap *gpm;
+	uint16_t ident;
+	struct buffer_head *bh;
+	unsigned int table_len;
+	int ret;
+
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh)
+		return -EAGAIN;
+	BUG_ON(ident != TAG_IDENT_LVD);
+	lvd = (struct logicalVolDesc *)bh->b_data;
+	table_len = le32_to_cpu(lvd->mapTableLength);
+	if (table_len > sb->s_blocksize - sizeof(*lvd)) {
+		udf_err(sb, "error loading logical volume descriptor: "
+			"Partition table too long (%u > %lu)\n", table_len,
+			sb->s_blocksize - sizeof(*lvd));
+		ret = -EIO;
+		goto out_bh;
+	}
+
+	ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
+	if (ret)
+		goto out_bh;
+
+	for (i = 0, offset = 0;
+	     i < sbi->s_partitions && offset < table_len;
+	     i++, offset += gpm->partitionMapLength) {
+		struct udf_part_map *map = &sbi->s_partmaps[i];
+		gpm = (struct genericPartitionMap *)
+				&(lvd->partitionMaps[offset]);
+		type = gpm->partitionMapType;
+		if (type == 1) {
+			struct genericPartitionMap1 *gpm1 =
+				(struct genericPartitionMap1 *)gpm;
+			map->s_partition_type = UDF_TYPE1_MAP15;
+			map->s_volumeseqnum = le16_to_cpu(gpm1->volSeqNum);
+			map->s_partition_num = le16_to_cpu(gpm1->partitionNum);
+			map->s_partition_func = NULL;
+		} else if (type == 2) {
+			struct udfPartitionMap2 *upm2 =
+						(struct udfPartitionMap2 *)gpm;
+			if (!strncmp(upm2->partIdent.ident, UDF_ID_VIRTUAL,
+						strlen(UDF_ID_VIRTUAL))) {
+				u16 suf =
+					le16_to_cpu(((__le16 *)upm2->partIdent.
+							identSuffix)[0]);
+				if (suf < 0x0200) {
+					map->s_partition_type =
+							UDF_VIRTUAL_MAP15;
+					map->s_partition_func =
+							udf_get_pblock_virt15;
+				} else {
+					map->s_partition_type =
+							UDF_VIRTUAL_MAP20;
+					map->s_partition_func =
+							udf_get_pblock_virt20;
+				}
+			} else if (!strncmp(upm2->partIdent.ident,
+						UDF_ID_SPARABLE,
+						strlen(UDF_ID_SPARABLE))) {
+				ret = udf_load_sparable_map(sb, map,
+					(struct sparablePartitionMap *)gpm);
+				if (ret < 0)
+					goto out_bh;
+			} else if (!strncmp(upm2->partIdent.ident,
+						UDF_ID_METADATA,
+						strlen(UDF_ID_METADATA))) {
+				struct udf_meta_data *mdata =
+					&map->s_type_specific.s_metadata;
+				struct metadataPartitionMap *mdm =
+						(struct metadataPartitionMap *)
+						&(lvd->partitionMaps[offset]);
+				udf_debug("Parsing Logical vol part %d type %d  id=%s\n",
+					  i, type, UDF_ID_METADATA);
+
+				map->s_partition_type = UDF_METADATA_MAP25;
+				map->s_partition_func = udf_get_pblock_meta25;
+
+				mdata->s_meta_file_loc   =
+					le32_to_cpu(mdm->metadataFileLoc);
+				mdata->s_mirror_file_loc =
+					le32_to_cpu(mdm->metadataMirrorFileLoc);
+				mdata->s_bitmap_file_loc =
+					le32_to_cpu(mdm->metadataBitmapFileLoc);
+				mdata->s_alloc_unit_size =
+					le32_to_cpu(mdm->allocUnitSize);
+				mdata->s_align_unit_size =
+					le16_to_cpu(mdm->alignUnitSize);
+				if (mdm->flags & 0x01)
+					mdata->s_flags |= MF_DUPLICATE_MD;
+
+				udf_debug("Metadata Ident suffix=0x%x\n",
+					  le16_to_cpu(*(__le16 *)
+						      mdm->partIdent.identSuffix));
+				udf_debug("Metadata part num=%d\n",
+					  le16_to_cpu(mdm->partitionNum));
+				udf_debug("Metadata part alloc unit size=%d\n",
+					  le32_to_cpu(mdm->allocUnitSize));
+				udf_debug("Metadata file loc=%d\n",
+					  le32_to_cpu(mdm->metadataFileLoc));
+				udf_debug("Mirror file loc=%d\n",
+					  le32_to_cpu(mdm->metadataMirrorFileLoc));
+				udf_debug("Bitmap file loc=%d\n",
+					  le32_to_cpu(mdm->metadataBitmapFileLoc));
+				udf_debug("Flags: %d %d\n",
+					  mdata->s_flags, mdm->flags);
+			} else {
+				udf_debug("Unknown ident: %s\n",
+					  upm2->partIdent.ident);
+				continue;
+			}
+			map->s_volumeseqnum = le16_to_cpu(upm2->volSeqNum);
+			map->s_partition_num = le16_to_cpu(upm2->partitionNum);
+		}
+		udf_debug("Partition (%d:%d) type %d on volume %d\n",
+			  i, map->s_partition_num, type, map->s_volumeseqnum);
+	}
+
+	if (fileset) {
+		struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
+
+		*fileset = lelb_to_cpu(la->extLocation);
+		udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n",
+			  fileset->logicalBlockNum,
+			  fileset->partitionReferenceNum);
+	}
+	if (lvd->integritySeqExt.extLength)
+		udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
+	ret = 0;
+out_bh:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * udf_load_logicalvolint
+ *
+ */
+static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
+{
+	struct buffer_head *bh = NULL;
+	uint16_t ident;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct logicalVolIntegrityDesc *lvid;
+
+	while (loc.extLength > 0 &&
+	       (bh = udf_read_tagged(sb, loc.extLocation,
+				     loc.extLocation, &ident)) &&
+	       ident == TAG_IDENT_LVID) {
+		sbi->s_lvid_bh = bh;
+		lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+
+		if (lvid->nextIntegrityExt.extLength)
+			udf_load_logicalvolint(sb,
+				leea_to_cpu(lvid->nextIntegrityExt));
+
+		if (sbi->s_lvid_bh != bh)
+			brelse(bh);
+		loc.extLength -= sb->s_blocksize;
+		loc.extLocation++;
+	}
+	if (sbi->s_lvid_bh != bh)
+		brelse(bh);
+}
+
+/*
+ * Process a main/reserve volume descriptor sequence.
+ *   @block		First block of first extent of the sequence.
+ *   @lastblock		Lastblock of first extent of the sequence.
+ *   @fileset		There we store extent containing root fileset
+ *
+ * Returns <0 on error, 0 on success. -EAGAIN is special - try next descriptor
+ * sequence
+ */
+static noinline int udf_process_sequence(
+		struct super_block *sb,
+		sector_t block, sector_t lastblock,
+		struct kernel_lb_addr *fileset)
+{
+	struct buffer_head *bh = NULL;
+	struct udf_vds_record vds[VDS_POS_LENGTH];
+	struct udf_vds_record *curr;
+	struct generic_desc *gd;
+	struct volDescPtr *vdp;
+	bool done = false;
+	uint32_t vdsn;
+	uint16_t ident;
+	long next_s = 0, next_e = 0;
+	int ret;
+
+	memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
+
+	/*
+	 * Read the main descriptor sequence and find which descriptors
+	 * are in it.
+	 */
+	for (; (!done && block <= lastblock); block++) {
+
+		bh = udf_read_tagged(sb, block, block, &ident);
+		if (!bh) {
+			udf_err(sb,
+				"Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
+				(unsigned long long)block);
+			return -EAGAIN;
+		}
+
+		/* Process each descriptor (ISO 13346 3/8.3-8.4) */
+		gd = (struct generic_desc *)bh->b_data;
+		vdsn = le32_to_cpu(gd->volDescSeqNum);
+		switch (ident) {
+		case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */
+			curr = &vds[VDS_POS_PRIMARY_VOL_DESC];
+			if (vdsn >= curr->volDescSeqNum) {
+				curr->volDescSeqNum = vdsn;
+				curr->block = block;
+			}
+			break;
+		case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */
+			curr = &vds[VDS_POS_VOL_DESC_PTR];
+			if (vdsn >= curr->volDescSeqNum) {
+				curr->volDescSeqNum = vdsn;
+				curr->block = block;
+
+				vdp = (struct volDescPtr *)bh->b_data;
+				next_s = le32_to_cpu(
+					vdp->nextVolDescSeqExt.extLocation);
+				next_e = le32_to_cpu(
+					vdp->nextVolDescSeqExt.extLength);
+				next_e = next_e >> sb->s_blocksize_bits;
+				next_e += next_s;
+			}
+			break;
+		case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */
+			curr = &vds[VDS_POS_IMP_USE_VOL_DESC];
+			if (vdsn >= curr->volDescSeqNum) {
+				curr->volDescSeqNum = vdsn;
+				curr->block = block;
+			}
+			break;
+		case TAG_IDENT_PD: /* ISO 13346 3/10.5 */
+			curr = &vds[VDS_POS_PARTITION_DESC];
+			if (!curr->block)
+				curr->block = block;
+			break;
+		case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */
+			curr = &vds[VDS_POS_LOGICAL_VOL_DESC];
+			if (vdsn >= curr->volDescSeqNum) {
+				curr->volDescSeqNum = vdsn;
+				curr->block = block;
+			}
+			break;
+		case TAG_IDENT_USD: /* ISO 13346 3/10.8 */
+			curr = &vds[VDS_POS_UNALLOC_SPACE_DESC];
+			if (vdsn >= curr->volDescSeqNum) {
+				curr->volDescSeqNum = vdsn;
+				curr->block = block;
+			}
+			break;
+		case TAG_IDENT_TD: /* ISO 13346 3/10.9 */
+			vds[VDS_POS_TERMINATING_DESC].block = block;
+			if (next_e) {
+				block = next_s;
+				lastblock = next_e;
+				next_s = next_e = 0;
+			} else
+				done = true;
+			break;
+		}
+		brelse(bh);
+	}
+	/*
+	 * Now read interesting descriptors again and process them
+	 * in a suitable order
+	 */
+	if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
+		udf_err(sb, "Primary Volume Descriptor not found!\n");
+		return -EAGAIN;
+	}
+	ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block);
+	if (ret < 0)
+		return ret;
+
+	if (vds[VDS_POS_LOGICAL_VOL_DESC].block) {
+		ret = udf_load_logicalvol(sb,
+					  vds[VDS_POS_LOGICAL_VOL_DESC].block,
+					  fileset);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (vds[VDS_POS_PARTITION_DESC].block) {
+		/*
+		 * We rescan the whole descriptor sequence to find
+		 * partition descriptor blocks and process them.
+		 */
+		for (block = vds[VDS_POS_PARTITION_DESC].block;
+		     block < vds[VDS_POS_TERMINATING_DESC].block;
+		     block++) {
+			ret = udf_load_partdesc(sb, block);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Load Volume Descriptor Sequence described by anchor in bh
+ *
+ * Returns <0 on error, 0 on success
+ */
+static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
+			     struct kernel_lb_addr *fileset)
+{
+	struct anchorVolDescPtr *anchor;
+	sector_t main_s, main_e, reserve_s, reserve_e;
+	int ret;
+
+	anchor = (struct anchorVolDescPtr *)bh->b_data;
+
+	/* Locate the main sequence */
+	main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
+	main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
+	main_e = main_e >> sb->s_blocksize_bits;
+	main_e += main_s;
+
+	/* Locate the reserve sequence */
+	reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
+	reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
+	reserve_e = reserve_e >> sb->s_blocksize_bits;
+	reserve_e += reserve_s;
+
+	/* Process the main & reserve sequences */
+	/* responsible for finding the PartitionDesc(s) */
+	ret = udf_process_sequence(sb, main_s, main_e, fileset);
+	if (ret != -EAGAIN)
+		return ret;
+	udf_sb_free_partitions(sb);
+	ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset);
+	if (ret < 0) {
+		udf_sb_free_partitions(sb);
+		/* No sequence was OK, return -EIO */
+		if (ret == -EAGAIN)
+			ret = -EIO;
+	}
+	return ret;
+}
+
+/*
+ * Check whether there is an anchor block in the given block and
+ * load Volume Descriptor Sequence if so.
+ *
+ * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor
+ * block
+ */
+static int udf_check_anchor_block(struct super_block *sb, sector_t block,
+				  struct kernel_lb_addr *fileset)
+{
+	struct buffer_head *bh;
+	uint16_t ident;
+	int ret;
+
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
+	    udf_fixed_to_variable(block) >=
+	    sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+		return -EAGAIN;
+
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh)
+		return -EAGAIN;
+	if (ident != TAG_IDENT_AVDP) {
+		brelse(bh);
+		return -EAGAIN;
+	}
+	ret = udf_load_sequence(sb, bh, fileset);
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Search for an anchor volume descriptor pointer.
+ *
+ * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set
+ * of anchors.
+ */
+static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
+			    struct kernel_lb_addr *fileset)
+{
+	sector_t last[6];
+	int i;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	int last_count = 0;
+	int ret;
+
+	/* First try user provided anchor */
+	if (sbi->s_anchor) {
+		ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset);
+		if (ret != -EAGAIN)
+			return ret;
+	}
+	/*
+	 * according to spec, anchor is in either:
+	 *     block 256
+	 *     lastblock-256
+	 *     lastblock
+	 *  however, if the disc isn't closed, it could be 512.
+	 */
+	ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset);
+	if (ret != -EAGAIN)
+		return ret;
+	/*
+	 * The trouble is which block is the last one. Drives often misreport
+	 * this so we try various possibilities.
+	 */
+	last[last_count++] = *lastblock;
+	if (*lastblock >= 1)
+		last[last_count++] = *lastblock - 1;
+	last[last_count++] = *lastblock + 1;
+	if (*lastblock >= 2)
+		last[last_count++] = *lastblock - 2;
+	if (*lastblock >= 150)
+		last[last_count++] = *lastblock - 150;
+	if (*lastblock >= 152)
+		last[last_count++] = *lastblock - 152;
+
+	for (i = 0; i < last_count; i++) {
+		if (last[i] >= sb->s_bdev->bd_inode->i_size >>
+				sb->s_blocksize_bits)
+			continue;
+		ret = udf_check_anchor_block(sb, last[i], fileset);
+		if (ret != -EAGAIN) {
+			if (!ret)
+				*lastblock = last[i];
+			return ret;
+		}
+		if (last[i] < 256)
+			continue;
+		ret = udf_check_anchor_block(sb, last[i] - 256, fileset);
+		if (ret != -EAGAIN) {
+			if (!ret)
+				*lastblock = last[i];
+			return ret;
+		}
+	}
+
+	/* Finally try block 512 in case media is open */
+	return udf_check_anchor_block(sb, sbi->s_session + 512, fileset);
+}
+
+/*
+ * Find an anchor volume descriptor and load Volume Descriptor Sequence from
+ * area specified by it. The function expects sbi->s_lastblock to be the last
+ * block on the media.
+ *
+ * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor
+ * was not found.
+ */
+static int udf_find_anchor(struct super_block *sb,
+			   struct kernel_lb_addr *fileset)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	sector_t lastblock = sbi->s_last_block;
+	int ret;
+
+	ret = udf_scan_anchors(sb, &lastblock, fileset);
+	if (ret != -EAGAIN)
+		goto out;
+
+	/* No anchor found? Try VARCONV conversion of block numbers */
+	UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+	lastblock = udf_variable_to_fixed(sbi->s_last_block);
+	/* Firstly, we try to not convert number of the last block */
+	ret = udf_scan_anchors(sb, &lastblock, fileset);
+	if (ret != -EAGAIN)
+		goto out;
+
+	lastblock = sbi->s_last_block;
+	/* Secondly, we try with converted number of the last block */
+	ret = udf_scan_anchors(sb, &lastblock, fileset);
+	if (ret < 0) {
+		/* VARCONV didn't help. Clear it. */
+		UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
+	}
+out:
+	if (ret == 0)
+		sbi->s_last_block = lastblock;
+	return ret;
+}
+
+/*
+ * Check Volume Structure Descriptor, find Anchor block and load Volume
+ * Descriptor Sequence.
+ *
+ * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor
+ * block was not found.
+ */
+static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
+			int silent, struct kernel_lb_addr *fileset)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	loff_t nsr_off;
+	int ret;
+
+	if (!sb_set_blocksize(sb, uopt->blocksize)) {
+		if (!silent)
+			udf_warn(sb, "Bad block size\n");
+		return -EINVAL;
+	}
+	sbi->s_last_block = uopt->lastblock;
+	if (!uopt->novrs) {
+		/* Check that it is NSR02 compliant */
+		nsr_off = udf_check_vsd(sb);
+		if (!nsr_off) {
+			if (!silent)
+				udf_warn(sb, "No VRS found\n");
+			return 0;
+		}
+		if (nsr_off == -1)
+			udf_debug("Failed to read sector at offset %d. "
+				  "Assuming open disc. Skipping validity "
+				  "check\n", VSD_FIRST_SECTOR_OFFSET);
+		if (!sbi->s_last_block)
+			sbi->s_last_block = udf_get_last_block(sb);
+	} else {
+		udf_debug("Validity check skipped because of novrs option\n");
+	}
+
+	/* Look for anchor block and load Volume Descriptor Sequence */
+	sbi->s_anchor = uopt->anchor;
+	ret = udf_find_anchor(sb, fileset);
+	if (ret < 0) {
+		if (!silent && ret == -EAGAIN)
+			udf_warn(sb, "No anchor found\n");
+		return ret;
+	}
+	return 0;
+}
+
+static void udf_open_lvid(struct super_block *sb)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct buffer_head *bh = sbi->s_lvid_bh;
+	struct logicalVolIntegrityDesc *lvid;
+	struct logicalVolIntegrityDescImpUse *lvidiu;
+
+	if (!bh)
+		return;
+	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+	lvidiu = udf_sb_lvidiu(sb);
+	if (!lvidiu)
+		return;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
+	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
+	udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
+				CURRENT_TIME);
+	lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
+
+	lvid->descTag.descCRC = cpu_to_le16(
+		crc_itu_t(0, (char *)lvid + sizeof(struct tag),
+			le16_to_cpu(lvid->descTag.descCRCLength)));
+
+	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+	mark_buffer_dirty(bh);
+	sbi->s_lvid_dirty = 0;
+	mutex_unlock(&sbi->s_alloc_mutex);
+	/* Make opening of filesystem visible on the media immediately */
+	sync_dirty_buffer(bh);
+}
+
+static void udf_close_lvid(struct super_block *sb)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct buffer_head *bh = sbi->s_lvid_bh;
+	struct logicalVolIntegrityDesc *lvid;
+	struct logicalVolIntegrityDescImpUse *lvidiu;
+
+	if (!bh)
+		return;
+	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+	lvidiu = udf_sb_lvidiu(sb);
+	if (!lvidiu)
+		return;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
+	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
+	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME);
+	if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
+		lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
+	if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
+		lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev);
+	if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev))
+		lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev);
+	lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
+
+	lvid->descTag.descCRC = cpu_to_le16(
+			crc_itu_t(0, (char *)lvid + sizeof(struct tag),
+				le16_to_cpu(lvid->descTag.descCRCLength)));
+
+	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+	/*
+	 * We set buffer uptodate unconditionally here to avoid spurious
+	 * warnings from mark_buffer_dirty() when previous EIO has marked
+	 * the buffer as !uptodate
+	 */
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	sbi->s_lvid_dirty = 0;
+	mutex_unlock(&sbi->s_alloc_mutex);
+	/* Make closing of filesystem visible on the media immediately */
+	sync_dirty_buffer(bh);
+}
+
+u64 lvid_get_unique_id(struct super_block *sb)
+{
+	struct buffer_head *bh;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct logicalVolIntegrityDesc *lvid;
+	struct logicalVolHeaderDesc *lvhd;
+	u64 uniqueID;
+	u64 ret;
+
+	bh = sbi->s_lvid_bh;
+	if (!bh)
+		return 0;
+
+	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+	lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
+	if (!(++uniqueID & 0xFFFFFFFF))
+		uniqueID += 16;
+	lvhd->uniqueID = cpu_to_le64(uniqueID);
+	mutex_unlock(&sbi->s_alloc_mutex);
+	mark_buffer_dirty(bh);
+
+	return ret;
+}
+
+static int udf_fill_super(struct super_block *sb, void *options, int silent)
+{
+	int ret = -EINVAL;
+	struct inode *inode = NULL;
+	struct udf_options uopt;
+	struct kernel_lb_addr rootdir, fileset;
+	struct udf_sb_info *sbi;
+
+	uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
+	uopt.uid = INVALID_UID;
+	uopt.gid = INVALID_GID;
+	uopt.umask = 0;
+	uopt.fmode = UDF_INVALID_MODE;
+	uopt.dmode = UDF_INVALID_MODE;
+
+	sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sb->s_fs_info = sbi;
+
+	mutex_init(&sbi->s_alloc_mutex);
+
+	if (!udf_parse_options((char *)options, &uopt, false))
+		goto parse_options_failure;
+
+	if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
+	    uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
+		udf_err(sb, "utf8 cannot be combined with iocharset\n");
+		goto parse_options_failure;
+	}
+#ifdef CONFIG_UDF_NLS
+	if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) {
+		uopt.nls_map = load_nls_default();
+		if (!uopt.nls_map)
+			uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP);
+		else
+			udf_debug("Using default NLS map\n");
+	}
+#endif
+	if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP)))
+		uopt.flags |= (1 << UDF_FLAG_UTF8);
+
+	fileset.logicalBlockNum = 0xFFFFFFFF;
+	fileset.partitionReferenceNum = 0xFFFF;
+
+	sbi->s_flags = uopt.flags;
+	sbi->s_uid = uopt.uid;
+	sbi->s_gid = uopt.gid;
+	sbi->s_umask = uopt.umask;
+	sbi->s_fmode = uopt.fmode;
+	sbi->s_dmode = uopt.dmode;
+	sbi->s_nls_map = uopt.nls_map;
+	rwlock_init(&sbi->s_cred_lock);
+
+	if (uopt.session == 0xFFFFFFFF)
+		sbi->s_session = udf_get_last_session(sb);
+	else
+		sbi->s_session = uopt.session;
+
+	udf_debug("Multi-session=%d\n", sbi->s_session);
+
+	/* Fill in the rest of the superblock */
+	sb->s_op = &udf_sb_ops;
+	sb->s_export_op = &udf_export_ops;
+
+	sb->s_magic = UDF_SUPER_MAGIC;
+	sb->s_time_gran = 1000;
+
+	if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+	} else {
+		uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
+		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
+			if (!silent)
+				pr_notice("Rescanning with blocksize %d\n",
+					  UDF_DEFAULT_BLOCKSIZE);
+			brelse(sbi->s_lvid_bh);
+			sbi->s_lvid_bh = NULL;
+			uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
+			ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		}
+	}
+	if (ret < 0) {
+		if (ret == -EAGAIN) {
+			udf_warn(sb, "No partition found (1)\n");
+			ret = -EINVAL;
+		}
+		goto error_out;
+	}
+
+	udf_debug("Lastblock=%d\n", sbi->s_last_block);
+
+	if (sbi->s_lvid_bh) {
+		struct logicalVolIntegrityDescImpUse *lvidiu =
+							udf_sb_lvidiu(sb);
+		uint16_t minUDFReadRev;
+		uint16_t minUDFWriteRev;
+
+		if (!lvidiu) {
+			ret = -EINVAL;
+			goto error_out;
+		}
+		minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev);
+		minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev);
+		if (minUDFReadRev > UDF_MAX_READ_VERSION) {
+			udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
+				minUDFReadRev,
+				UDF_MAX_READ_VERSION);
+			ret = -EINVAL;
+			goto error_out;
+		} else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION &&
+			   !(sb->s_flags & MS_RDONLY)) {
+			ret = -EACCES;
+			goto error_out;
+		}
+
+		sbi->s_udfrev = minUDFWriteRev;
+
+		if (minUDFReadRev >= UDF_VERS_USE_EXTENDED_FE)
+			UDF_SET_FLAG(sb, UDF_FLAG_USE_EXTENDED_FE);
+		if (minUDFReadRev >= UDF_VERS_USE_STREAMS)
+			UDF_SET_FLAG(sb, UDF_FLAG_USE_STREAMS);
+	}
+
+	if (!sbi->s_partitions) {
+		udf_warn(sb, "No partition found (2)\n");
+		ret = -EINVAL;
+		goto error_out;
+	}
+
+	if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
+			UDF_PART_FLAG_READ_ONLY &&
+	    !(sb->s_flags & MS_RDONLY)) {
+		ret = -EACCES;
+		goto error_out;
+	}
+
+	if (udf_find_fileset(sb, &fileset, &rootdir)) {
+		udf_warn(sb, "No fileset found\n");
+		ret = -EINVAL;
+		goto error_out;
+	}
+
+	if (!silent) {
+		struct timestamp ts;
+		udf_time_to_disk_stamp(&ts, sbi->s_record_time);
+		udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
+			 sbi->s_volume_ident,
+			 le16_to_cpu(ts.year), ts.month, ts.day,
+			 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
+	}
+	if (!(sb->s_flags & MS_RDONLY))
+		udf_open_lvid(sb);
+
+	/* Assign the root inode */
+	/* assign inodes by physical block number */
+	/* perhaps it's not extensible enough, but for now ... */
+	inode = udf_iget(sb, &rootdir);
+	if (IS_ERR(inode)) {
+		udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
+		       rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
+		ret = PTR_ERR(inode);
+		goto error_out;
+	}
+
+	/* Allocate a dentry for the root inode */
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root) {
+		udf_err(sb, "Couldn't allocate root dentry\n");
+		ret = -ENOMEM;
+		goto error_out;
+	}
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_max_links = UDF_MAX_LINKS;
+	return 0;
+
+error_out:
+	iput(sbi->s_vat_inode);
+parse_options_failure:
+#ifdef CONFIG_UDF_NLS
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+		unload_nls(sbi->s_nls_map);
+#endif
+	if (!(sb->s_flags & MS_RDONLY))
+		udf_close_lvid(sb);
+	brelse(sbi->s_lvid_bh);
+	udf_sb_free_partitions(sb);
+	kfree(sbi);
+	sb->s_fs_info = NULL;
+
+	return ret;
+}
+
+void _udf_err(struct super_block *sb, const char *function,
+	      const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf);
+
+	va_end(args);
+}
+
+void _udf_warn(struct super_block *sb, const char *function,
+	       const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf);
+
+	va_end(args);
+}
+
+static void udf_put_super(struct super_block *sb)
+{
+	struct udf_sb_info *sbi;
+
+	sbi = UDF_SB(sb);
+
+	iput(sbi->s_vat_inode);
+#ifdef CONFIG_UDF_NLS
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+		unload_nls(sbi->s_nls_map);
+#endif
+	if (!(sb->s_flags & MS_RDONLY))
+		udf_close_lvid(sb);
+	brelse(sbi->s_lvid_bh);
+	udf_sb_free_partitions(sb);
+	mutex_destroy(&sbi->s_alloc_mutex);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+}
+
+static int udf_sync_fs(struct super_block *sb, int wait)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	if (sbi->s_lvid_dirty) {
+		/*
+		 * Blockdevice will be synced later so we don't have to submit
+		 * the buffer for IO
+		 */
+		mark_buffer_dirty(sbi->s_lvid_bh);
+		sbi->s_lvid_dirty = 0;
+	}
+	mutex_unlock(&sbi->s_alloc_mutex);
+
+	return 0;
+}
+
+static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct logicalVolIntegrityDescImpUse *lvidiu;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	lvidiu = udf_sb_lvidiu(sb);
+	buf->f_type = UDF_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len;
+	buf->f_bfree = udf_count_free(sb);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = (lvidiu != NULL ? (le32_to_cpu(lvidiu->numFiles) +
+					  le32_to_cpu(lvidiu->numDirs)) : 0)
+			+ buf->f_bfree;
+	buf->f_ffree = buf->f_bfree;
+	buf->f_namelen = UDF_NAME_LEN - 2;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+static unsigned int udf_count_free_bitmap(struct super_block *sb,
+					  struct udf_bitmap *bitmap)
+{
+	struct buffer_head *bh = NULL;
+	unsigned int accum = 0;
+	int index;
+	int block = 0, newblock;
+	struct kernel_lb_addr loc;
+	uint32_t bytes;
+	uint8_t *ptr;
+	uint16_t ident;
+	struct spaceBitmapDesc *bm;
+
+	loc.logicalBlockNum = bitmap->s_extPosition;
+	loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
+	bh = udf_read_ptagged(sb, &loc, 0, &ident);
+
+	if (!bh) {
+		udf_err(sb, "udf_count_free failed\n");
+		goto out;
+	} else if (ident != TAG_IDENT_SBD) {
+		brelse(bh);
+		udf_err(sb, "udf_count_free failed\n");
+		goto out;
+	}
+
+	bm = (struct spaceBitmapDesc *)bh->b_data;
+	bytes = le32_to_cpu(bm->numOfBytes);
+	index = sizeof(struct spaceBitmapDesc); /* offset in first block only */
+	ptr = (uint8_t *)bh->b_data;
+
+	while (bytes > 0) {
+		u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index);
+		accum += bitmap_weight((const unsigned long *)(ptr + index),
+					cur_bytes * 8);
+		bytes -= cur_bytes;
+		if (bytes) {
+			brelse(bh);
+			newblock = udf_get_lb_pblock(sb, &loc, ++block);
+			bh = udf_tread(sb, newblock);
+			if (!bh) {
+				udf_debug("read failed\n");
+				goto out;
+			}
+			index = 0;
+			ptr = (uint8_t *)bh->b_data;
+		}
+	}
+	brelse(bh);
+out:
+	return accum;
+}
+
+static unsigned int udf_count_free_table(struct super_block *sb,
+					 struct inode *table)
+{
+	unsigned int accum = 0;
+	uint32_t elen;
+	struct kernel_lb_addr eloc;
+	int8_t etype;
+	struct extent_position epos;
+
+	mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
+	epos.block = UDF_I(table)->i_location;
+	epos.offset = sizeof(struct unallocSpaceEntry);
+	epos.bh = NULL;
+
+	while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1)
+		accum += (elen >> table->i_sb->s_blocksize_bits);
+
+	brelse(epos.bh);
+	mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
+
+	return accum;
+}
+
+static unsigned int udf_count_free(struct super_block *sb)
+{
+	unsigned int accum = 0;
+	struct udf_sb_info *sbi;
+	struct udf_part_map *map;
+
+	sbi = UDF_SB(sb);
+	if (sbi->s_lvid_bh) {
+		struct logicalVolIntegrityDesc *lvid =
+			(struct logicalVolIntegrityDesc *)
+			sbi->s_lvid_bh->b_data;
+		if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) {
+			accum = le32_to_cpu(
+					lvid->freeSpaceTable[sbi->s_partition]);
+			if (accum == 0xFFFFFFFF)
+				accum = 0;
+		}
+	}
+
+	if (accum)
+		return accum;
+
+	map = &sbi->s_partmaps[sbi->s_partition];
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
+		accum += udf_count_free_bitmap(sb,
+					       map->s_uspace.s_bitmap);
+	}
+	if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
+		accum += udf_count_free_bitmap(sb,
+					       map->s_fspace.s_bitmap);
+	}
+	if (accum)
+		return accum;
+
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
+		accum += udf_count_free_table(sb,
+					      map->s_uspace.s_table);
+	}
+	if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
+		accum += udf_count_free_table(sb,
+					      map->s_fspace.s_table);
+	}
+
+	return accum;
+}
diff --git a/kernel/fs/udf/symlink.c b/kernel/fs/udf/symlink.c
new file mode 100644
index 000000000..8dfbc4025
--- /dev/null
+++ b/kernel/fs/udf/symlink.c
@@ -0,0 +1,159 @@
+/*
+ * symlink.c
+ *
+ * PURPOSE
+ *	Symlink handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1998-2001 Ben Fennema
+ *  (C) 1999 Stelias Computing Inc
+ *
+ * HISTORY
+ *
+ *  04/16/99 blf  Created.
+ *
+ */
+
+#include "udfdecl.h"
+#include <linux/uaccess.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/pagemap.h>
+#include "udf_i.h"
+
+static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
+			  int fromlen, unsigned char *to, int tolen)
+{
+	struct pathComponent *pc;
+	int elen = 0;
+	int comp_len;
+	unsigned char *p = to;
+
+	/* Reserve one byte for terminating \0 */
+	tolen--;
+	while (elen < fromlen) {
+		pc = (struct pathComponent *)(from + elen);
+		elen += sizeof(struct pathComponent);
+		switch (pc->componentType) {
+		case 1:
+			/*
+			 * Symlink points to some place which should be agreed
+ 			 * upon between originator and receiver of the media. Ignore.
+			 */
+			if (pc->lengthComponentIdent > 0) {
+				elen += pc->lengthComponentIdent;
+				break;
+			}
+			/* Fall through */
+		case 2:
+			if (tolen == 0)
+				return -ENAMETOOLONG;
+			p = to;
+			*p++ = '/';
+			tolen--;
+			break;
+		case 3:
+			if (tolen < 3)
+				return -ENAMETOOLONG;
+			memcpy(p, "../", 3);
+			p += 3;
+			tolen -= 3;
+			break;
+		case 4:
+			if (tolen < 2)
+				return -ENAMETOOLONG;
+			memcpy(p, "./", 2);
+			p += 2;
+			tolen -= 2;
+			/* that would be . - just ignore */
+			break;
+		case 5:
+			elen += pc->lengthComponentIdent;
+			if (elen > fromlen)
+				return -EIO;
+			comp_len = udf_get_filename(sb, pc->componentIdent,
+						    pc->lengthComponentIdent,
+						    p, tolen);
+			p += comp_len;
+			tolen -= comp_len;
+			if (tolen == 0)
+				return -ENAMETOOLONG;
+			*p++ = '/';
+			tolen--;
+			break;
+		}
+	}
+	if (p > to + 1)
+		p[-1] = '\0';
+	else
+		p[0] = '\0';
+	return 0;
+}
+
+static int udf_symlink_filler(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *bh = NULL;
+	unsigned char *symlink;
+	int err;
+	unsigned char *p = kmap(page);
+	struct udf_inode_info *iinfo;
+	uint32_t pos;
+
+	/* We don't support symlinks longer than one block */
+	if (inode->i_size > inode->i_sb->s_blocksize) {
+		err = -ENAMETOOLONG;
+		goto out_unmap;
+	}
+
+	iinfo = UDF_I(inode);
+	pos = udf_block_map(inode, 0);
+
+	down_read(&iinfo->i_data_sem);
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+		symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
+	} else {
+		bh = sb_bread(inode->i_sb, pos);
+
+		if (!bh) {
+			err = -EIO;
+			goto out_unlock_inode;
+		}
+
+		symlink = bh->b_data;
+	}
+
+	err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE);
+	brelse(bh);
+	if (err)
+		goto out_unlock_inode;
+
+	up_read(&iinfo->i_data_sem);
+	SetPageUptodate(page);
+	kunmap(page);
+	unlock_page(page);
+	return 0;
+
+out_unlock_inode:
+	up_read(&iinfo->i_data_sem);
+	SetPageError(page);
+out_unmap:
+	kunmap(page);
+	unlock_page(page);
+	return err;
+}
+
+/*
+ * symlinks can't do much...
+ */
+const struct address_space_operations udf_symlink_aops = {
+	.readpage		= udf_symlink_filler,
+};
diff --git a/kernel/fs/udf/truncate.c b/kernel/fs/udf/truncate.c
new file mode 100644
index 000000000..42b8c5779
--- /dev/null
+++ b/kernel/fs/udf/truncate.c
@@ -0,0 +1,286 @@
+/*
+ * truncate.c
+ *
+ * PURPOSE
+ *	Truncate handling routines for the OSTA-UDF(tm) filesystem.
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ *
+ *  (C) 1999-2004 Ben Fennema
+ *  (C) 1999 Stelias Computing Inc
+ *
+ * HISTORY
+ *
+ *  02/24/99 blf  Created.
+ *
+ */
+
+#include "udfdecl.h"
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include "udf_i.h"
+#include "udf_sb.h"
+
+static void extent_trunc(struct inode *inode, struct extent_position *epos,
+			 struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
+			 uint32_t nelen)
+{
+	struct kernel_lb_addr neloc = {};
+	int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+
+	if (nelen) {
+		if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
+			udf_free_blocks(inode->i_sb, inode, eloc, 0,
+					last_block);
+			etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
+		} else
+			neloc = *eloc;
+		nelen = (etype << 30) | nelen;
+	}
+
+	if (elen != nelen) {
+		udf_write_aext(inode, epos, &neloc, nelen, 0);
+		if (last_block - first_block > 0) {
+			if (etype == (EXT_RECORDED_ALLOCATED >> 30))
+				mark_inode_dirty(inode);
+
+			if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
+				udf_free_blocks(inode->i_sb, inode, eloc,
+						first_block,
+						last_block - first_block);
+		}
+	}
+}
+
+/*
+ * Truncate the last extent to match i_size. This function assumes
+ * that preallocation extent is already truncated.
+ */
+void udf_truncate_tail_extent(struct inode *inode)
+{
+	struct extent_position epos = {};
+	struct kernel_lb_addr eloc;
+	uint32_t elen, nelen;
+	uint64_t lbcount = 0;
+	int8_t etype = -1, netype;
+	int adsize;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ||
+	    inode->i_size == iinfo->i_lenExtents)
+		return;
+	/* Are we going to delete the file anyway? */
+	if (inode->i_nlink == 0)
+		return;
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		BUG();
+
+	/* Find the last extent in the file */
+	while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
+		etype = netype;
+		lbcount += elen;
+		if (lbcount > inode->i_size) {
+			if (lbcount - inode->i_size >= inode->i_sb->s_blocksize)
+				udf_warn(inode->i_sb,
+					 "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n",
+					 (unsigned)inode->i_ino,
+					 (long long)inode->i_size,
+					 (long long)lbcount,
+					 (unsigned)eloc.logicalBlockNum,
+					 (unsigned)elen);
+			nelen = elen - (lbcount - inode->i_size);
+			epos.offset -= adsize;
+			extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
+			epos.offset += adsize;
+			if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
+				udf_err(inode->i_sb,
+					"Extent after EOF in inode %u\n",
+					(unsigned)inode->i_ino);
+			break;
+		}
+	}
+	/* This inode entry is in-memory only and thus we don't have to mark
+	 * the inode dirty */
+	iinfo->i_lenExtents = inode->i_size;
+	brelse(epos.bh);
+}
+
+void udf_discard_prealloc(struct inode *inode)
+{
+	struct extent_position epos = { NULL, 0, {0, 0} };
+	struct kernel_lb_addr eloc;
+	uint32_t elen;
+	uint64_t lbcount = 0;
+	int8_t etype = -1, netype;
+	int adsize;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ||
+	    inode->i_size == iinfo->i_lenExtents)
+		return;
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		adsize = 0;
+
+	epos.block = iinfo->i_location;
+
+	/* Find the last extent in the file */
+	while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
+		etype = netype;
+		lbcount += elen;
+	}
+	if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
+		epos.offset -= adsize;
+		lbcount -= elen;
+		extent_trunc(inode, &epos, &eloc, etype, elen, 0);
+		if (!epos.bh) {
+			iinfo->i_lenAlloc =
+				epos.offset -
+				udf_file_entry_alloc_offset(inode);
+			mark_inode_dirty(inode);
+		} else {
+			struct allocExtDesc *aed =
+				(struct allocExtDesc *)(epos.bh->b_data);
+			aed->lengthAllocDescs =
+				cpu_to_le32(epos.offset -
+					    sizeof(struct allocExtDesc));
+			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
+			    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
+				udf_update_tag(epos.bh->b_data, epos.offset);
+			else
+				udf_update_tag(epos.bh->b_data,
+					       sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(epos.bh, inode);
+		}
+	}
+	/* This inode entry is in-memory only and thus we don't have to mark
+	 * the inode dirty */
+	iinfo->i_lenExtents = lbcount;
+	brelse(epos.bh);
+}
+
+static void udf_update_alloc_ext_desc(struct inode *inode,
+				      struct extent_position *epos,
+				      u32 lenalloc)
+{
+	struct super_block *sb = inode->i_sb;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data);
+	int len = sizeof(struct allocExtDesc);
+
+	aed->lengthAllocDescs =	cpu_to_le32(lenalloc);
+	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201)
+		len += lenalloc;
+
+	udf_update_tag(epos->bh->b_data, len);
+	mark_buffer_dirty_inode(epos->bh, inode);
+}
+
+/*
+ * Truncate extents of inode to inode->i_size. This function can be used only
+ * for making file shorter. For making file longer, udf_extend_file() has to
+ * be used.
+ */
+void udf_truncate_extents(struct inode *inode)
+{
+	struct extent_position epos;
+	struct kernel_lb_addr eloc, neloc = {};
+	uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
+	int8_t etype;
+	struct super_block *sb = inode->i_sb;
+	sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset;
+	loff_t byte_offset;
+	int adsize;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		BUG();
+
+	etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
+	byte_offset = (offset << sb->s_blocksize_bits) +
+		(inode->i_size & (sb->s_blocksize - 1));
+	if (etype == -1) {
+		/* We should extend the file? */
+		WARN_ON(byte_offset);
+		return;
+	}
+	epos.offset -= adsize;
+	extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
+	epos.offset += adsize;
+	if (byte_offset)
+		lenalloc = epos.offset;
+	else
+		lenalloc = epos.offset - adsize;
+
+	if (!epos.bh)
+		lenalloc -= udf_file_entry_alloc_offset(inode);
+	else
+		lenalloc -= sizeof(struct allocExtDesc);
+
+	while ((etype = udf_current_aext(inode, &epos, &eloc,
+					 &elen, 0)) != -1) {
+		if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
+			udf_write_aext(inode, &epos, &neloc, nelen, 0);
+			if (indirect_ext_len) {
+				/* We managed to free all extents in the
+				 * indirect extent - free it too */
+				BUG_ON(!epos.bh);
+				udf_free_blocks(sb, NULL, &epos.block,
+						0, indirect_ext_len);
+			} else if (!epos.bh) {
+				iinfo->i_lenAlloc = lenalloc;
+				mark_inode_dirty(inode);
+			} else
+				udf_update_alloc_ext_desc(inode,
+						&epos, lenalloc);
+			brelse(epos.bh);
+			epos.offset = sizeof(struct allocExtDesc);
+			epos.block = eloc;
+			epos.bh = udf_tread(sb,
+					udf_get_lb_pblock(sb, &eloc, 0));
+			if (elen)
+				indirect_ext_len =
+					(elen + sb->s_blocksize - 1) >>
+					sb->s_blocksize_bits;
+			else
+				indirect_ext_len = 1;
+		} else {
+			extent_trunc(inode, &epos, &eloc, etype, elen, 0);
+			epos.offset += adsize;
+		}
+	}
+
+	if (indirect_ext_len) {
+		BUG_ON(!epos.bh);
+		udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len);
+	} else if (!epos.bh) {
+		iinfo->i_lenAlloc = lenalloc;
+		mark_inode_dirty(inode);
+	} else
+		udf_update_alloc_ext_desc(inode, &epos, lenalloc);
+	iinfo->i_lenExtents = inode->i_size;
+
+	brelse(epos.bh);
+}
diff --git a/kernel/fs/udf/udf_i.h b/kernel/fs/udf/udf_i.h
new file mode 100644
index 000000000..b5cd8ed2a
--- /dev/null
+++ b/kernel/fs/udf/udf_i.h
@@ -0,0 +1,62 @@
+#ifndef _UDF_I_H
+#define _UDF_I_H
+
+struct extent_position {
+	struct buffer_head *bh;
+	uint32_t offset;
+	struct kernel_lb_addr block;
+};
+
+struct udf_ext_cache {
+	/* Extent position */
+	struct extent_position epos;
+	/* Start logical offset in bytes */
+	loff_t lstart;
+};
+
+/*
+ * The i_data_sem and i_mutex serve for protection of allocation information
+ * of a regular files and symlinks. This includes all extents belonging to
+ * the file/symlink, a fact whether data are in-inode or in external data
+ * blocks, preallocation, goal block information... When extents are read,
+ * i_mutex or i_data_sem must be held (for reading is enough in case of
+ * i_data_sem). When extents are changed, i_data_sem must be held for writing
+ * and also i_mutex must be held.
+ *
+ * For directories i_mutex is used for all the necessary protection.
+ */
+
+struct udf_inode_info {
+	struct timespec		i_crtime;
+	/* Physical address of inode */
+	struct kernel_lb_addr		i_location;
+	__u64			i_unique;
+	__u32			i_lenEAttr;
+	__u32			i_lenAlloc;
+	__u64			i_lenExtents;
+	__u32			i_next_alloc_block;
+	__u32			i_next_alloc_goal;
+	__u32			i_checkpoint;
+	unsigned		i_alloc_type : 3;
+	unsigned		i_efe : 1;	/* extendedFileEntry */
+	unsigned		i_use : 1;	/* unallocSpaceEntry */
+	unsigned		i_strat4096 : 1;
+	unsigned		reserved : 26;
+	union {
+		struct short_ad	*i_sad;
+		struct long_ad		*i_lad;
+		__u8		*i_data;
+	} i_ext;
+	struct rw_semaphore	i_data_sem;
+	struct udf_ext_cache cached_extent;
+	/* Spinlock for protecting extent cache */
+	spinlock_t i_extent_cache_lock;
+	struct inode vfs_inode;
+};
+
+static inline struct udf_inode_info *UDF_I(struct inode *inode)
+{
+	return list_entry(inode, struct udf_inode_info, vfs_inode);
+}
+
+#endif /* _UDF_I_H) */
diff --git a/kernel/fs/udf/udf_sb.h b/kernel/fs/udf/udf_sb.h
new file mode 100644
index 000000000..1f32c7bd9
--- /dev/null
+++ b/kernel/fs/udf/udf_sb.h
@@ -0,0 +1,184 @@
+#ifndef __LINUX_UDF_SB_H
+#define __LINUX_UDF_SB_H
+
+#include <linux/mutex.h>
+#include <linux/bitops.h>
+
+/* Since UDF 2.01 is ISO 13346 based... */
+#define UDF_SUPER_MAGIC			0x15013346
+
+#define UDF_MAX_READ_VERSION		0x0250
+#define UDF_MAX_WRITE_VERSION		0x0201
+
+#define UDF_FLAG_USE_EXTENDED_FE	0
+#define UDF_VERS_USE_EXTENDED_FE	0x0200
+#define UDF_FLAG_USE_STREAMS		1
+#define UDF_VERS_USE_STREAMS		0x0200
+#define UDF_FLAG_USE_SHORT_AD		2
+#define UDF_FLAG_USE_AD_IN_ICB		3
+#define UDF_FLAG_USE_FILE_CTIME_EA	4
+#define UDF_FLAG_STRICT			5
+#define UDF_FLAG_UNDELETE		6
+#define UDF_FLAG_UNHIDE			7
+#define UDF_FLAG_VARCONV		8
+#define UDF_FLAG_NLS_MAP		9
+#define UDF_FLAG_UTF8			10
+#define UDF_FLAG_UID_FORGET     11    /* save -1 for uid to disk */
+#define UDF_FLAG_UID_IGNORE     12    /* use sb uid instead of on disk uid */
+#define UDF_FLAG_GID_FORGET     13
+#define UDF_FLAG_GID_IGNORE     14
+#define UDF_FLAG_UID_SET	15
+#define UDF_FLAG_GID_SET	16
+#define UDF_FLAG_SESSION_SET	17
+#define UDF_FLAG_LASTBLOCK_SET	18
+#define UDF_FLAG_BLOCKSIZE_SET	19
+
+#define UDF_PART_FLAG_UNALLOC_BITMAP	0x0001
+#define UDF_PART_FLAG_UNALLOC_TABLE	0x0002
+#define UDF_PART_FLAG_FREED_BITMAP	0x0004
+#define UDF_PART_FLAG_FREED_TABLE	0x0008
+#define UDF_PART_FLAG_READ_ONLY		0x0010
+#define UDF_PART_FLAG_WRITE_ONCE	0x0020
+#define UDF_PART_FLAG_REWRITABLE	0x0040
+#define UDF_PART_FLAG_OVERWRITABLE	0x0080
+
+#define UDF_MAX_BLOCK_LOADED	8
+
+#define UDF_TYPE1_MAP15			0x1511U
+#define UDF_VIRTUAL_MAP15		0x1512U
+#define UDF_VIRTUAL_MAP20		0x2012U
+#define UDF_SPARABLE_MAP15		0x1522U
+#define UDF_METADATA_MAP25		0x2511U
+
+#define UDF_INVALID_MODE		((umode_t)-1)
+
+#pragma pack(1) /* XXX(hch): Why?  This file just defines in-core structures */
+
+#define MF_DUPLICATE_MD		0x01
+#define MF_MIRROR_FE_LOADED	0x02
+
+struct udf_meta_data {
+	__u32	s_meta_file_loc;
+	__u32	s_mirror_file_loc;
+	__u32	s_bitmap_file_loc;
+	__u32	s_alloc_unit_size;
+	__u16	s_align_unit_size;
+	int	s_flags;
+	struct inode *s_metadata_fe;
+	struct inode *s_mirror_fe;
+	struct inode *s_bitmap_fe;
+};
+
+struct udf_sparing_data {
+	__u16	s_packet_len;
+	struct buffer_head *s_spar_map[4];
+};
+
+struct udf_virtual_data {
+	__u32	s_num_entries;
+	__u16	s_start_offset;
+};
+
+struct udf_bitmap {
+	__u32			s_extPosition;
+	int			s_nr_groups;
+	struct buffer_head 	*s_block_bitmap[0];
+};
+
+struct udf_part_map {
+	union {
+		struct udf_bitmap	*s_bitmap;
+		struct inode		*s_table;
+	} s_uspace;
+	union {
+		struct udf_bitmap	*s_bitmap;
+		struct inode		*s_table;
+	} s_fspace;
+	__u32	s_partition_root;
+	__u32	s_partition_len;
+	__u16	s_partition_type;
+	__u16	s_partition_num;
+	union {
+		struct udf_sparing_data s_sparing;
+		struct udf_virtual_data s_virtual;
+		struct udf_meta_data s_metadata;
+	} s_type_specific;
+	__u32	(*s_partition_func)(struct super_block *, __u32, __u16, __u32);
+	__u16	s_volumeseqnum;
+	__u16	s_partition_flags;
+};
+
+#pragma pack()
+
+struct udf_sb_info {
+	struct udf_part_map	*s_partmaps;
+	__u8			s_volume_ident[32];
+
+	/* Overall info */
+	__u16			s_partitions;
+	__u16			s_partition;
+
+	/* Sector headers */
+	__s32			s_session;
+	__u32			s_anchor;
+	__u32			s_last_block;
+
+	struct buffer_head	*s_lvid_bh;
+
+	/* Default permissions */
+	umode_t			s_umask;
+	kgid_t			s_gid;
+	kuid_t			s_uid;
+	umode_t			s_fmode;
+	umode_t			s_dmode;
+	/* Lock protecting consistency of above permission settings */
+	rwlock_t		s_cred_lock;
+
+	/* Root Info */
+	struct timespec		s_record_time;
+
+	/* Fileset Info */
+	__u16			s_serial_number;
+
+	/* highest UDF revision we have recorded to this media */
+	__u16			s_udfrev;
+
+	/* Miscellaneous flags */
+	unsigned long		s_flags;
+
+	/* Encoding info */
+	struct nls_table	*s_nls_map;
+
+	/* VAT inode */
+	struct inode		*s_vat_inode;
+
+	struct mutex		s_alloc_mutex;
+	/* Protected by s_alloc_mutex */
+	unsigned int		s_lvid_dirty;
+};
+
+static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb);
+
+int udf_compute_nr_groups(struct super_block *sb, u32 partition);
+
+static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
+{
+	return test_bit(flag, &UDF_SB(sb)->s_flags);
+}
+
+static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
+{
+	set_bit(flag, &UDF_SB(sb)->s_flags);
+}
+
+static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
+{
+	clear_bit(flag, &UDF_SB(sb)->s_flags);
+}
+
+#endif /* __LINUX_UDF_SB_H */
diff --git a/kernel/fs/udf/udfdecl.h b/kernel/fs/udf/udfdecl.h
new file mode 100644
index 000000000..47bb3f5ca
--- /dev/null
+++ b/kernel/fs/udf/udfdecl.h
@@ -0,0 +1,255 @@
+#ifndef __UDF_DECL_H
+#define __UDF_DECL_H
+
+#define pr_fmt(fmt) "UDF-fs: " fmt
+
+#include "ecma_167.h"
+#include "osta_udf.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/udf_fs_i.h>
+
+#include "udf_sb.h"
+#include "udfend.h"
+#include "udf_i.h"
+
+#define UDF_PREALLOCATE
+#define UDF_DEFAULT_PREALLOC_BLOCKS	8
+
+extern __printf(3, 4) void _udf_err(struct super_block *sb,
+		const char *function, const char *fmt, ...);
+#define udf_err(sb, fmt, ...)					\
+	_udf_err(sb, __func__, fmt, ##__VA_ARGS__)
+
+extern __printf(3, 4) void _udf_warn(struct super_block *sb,
+		const char *function, const char *fmt, ...);
+#define udf_warn(sb, fmt, ...)					\
+	_udf_warn(sb, __func__, fmt, ##__VA_ARGS__)
+
+#define udf_info(fmt, ...)					\
+	pr_info("INFO " fmt, ##__VA_ARGS__)
+
+#undef UDFFS_DEBUG
+
+#ifdef UDFFS_DEBUG
+#define udf_debug(fmt, ...)					\
+	printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt),		\
+	       __FILE__, __LINE__, __func__, ##__VA_ARGS__)
+#else
+#define udf_debug(fmt, ...)					\
+	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
+#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
+
+#define UDF_EXTENT_LENGTH_MASK	0x3FFFFFFF
+#define UDF_EXTENT_FLAG_MASK	0xC0000000
+
+#define UDF_NAME_PAD		4
+#define UDF_NAME_LEN		256
+#define UDF_PATH_LEN		1023
+
+static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	if (iinfo->i_use)
+		return sizeof(struct unallocSpaceEntry);
+	else if (iinfo->i_efe)
+		return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr;
+	else
+		return sizeof(struct fileEntry) + iinfo->i_lenEAttr;
+}
+
+static inline size_t udf_ext0_offset(struct inode *inode)
+{
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		return udf_file_entry_alloc_offset(inode);
+	else
+		return 0;
+}
+
+/* computes tag checksum */
+u8 udf_tag_checksum(const struct tag *t);
+
+struct dentry;
+struct inode;
+struct task_struct;
+struct buffer_head;
+struct super_block;
+
+extern const struct export_operations udf_export_ops;
+extern const struct inode_operations udf_dir_inode_operations;
+extern const struct file_operations udf_dir_operations;
+extern const struct inode_operations udf_file_inode_operations;
+extern const struct file_operations udf_file_operations;
+extern const struct inode_operations udf_symlink_inode_operations;
+extern const struct address_space_operations udf_aops;
+extern const struct address_space_operations udf_adinicb_aops;
+extern const struct address_space_operations udf_symlink_aops;
+
+struct udf_fileident_bh {
+	struct buffer_head *sbh;
+	struct buffer_head *ebh;
+	int soffset;
+	int eoffset;
+};
+
+struct udf_vds_record {
+	uint32_t block;
+	uint32_t volDescSeqNum;
+};
+
+struct generic_desc {
+	struct tag	descTag;
+	__le32		volDescSeqNum;
+};
+
+struct ustr {
+	uint8_t u_cmpID;
+	uint8_t u_name[UDF_NAME_LEN - 2];
+	uint8_t u_len;
+};
+
+
+/* super.c */
+
+static inline void udf_updated_lvid(struct super_block *sb)
+{
+	struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
+
+	BUG_ON(!bh);
+	WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
+		     bh->b_data)->integrityType !=
+		     cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
+	UDF_SB(sb)->s_lvid_dirty = 1;
+}
+extern u64 lvid_get_unique_id(struct super_block *sb);
+struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
+					u32 meta_file_loc, u32 partition_num);
+
+/* namei.c */
+extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
+			struct fileIdentDesc *, struct udf_fileident_bh *,
+			uint8_t *, uint8_t *);
+
+/* file.c */
+extern long udf_ioctl(struct file *, unsigned int, unsigned long);
+/* inode.c */
+extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *,
+				bool hidden_inode);
+static inline struct inode *udf_iget_special(struct super_block *sb,
+					     struct kernel_lb_addr *ino)
+{
+	return __udf_iget(sb, ino, true);
+}
+static inline struct inode *udf_iget(struct super_block *sb,
+				     struct kernel_lb_addr *ino)
+{
+	return __udf_iget(sb, ino, false);
+}
+extern int udf_expand_file_adinicb(struct inode *);
+extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
+extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
+extern int udf_setsize(struct inode *, loff_t);
+extern void udf_evict_inode(struct inode *);
+extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
+extern long udf_block_map(struct inode *, sector_t);
+extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
+			 struct kernel_lb_addr *, uint32_t *, sector_t *);
+extern int udf_add_aext(struct inode *, struct extent_position *,
+			struct kernel_lb_addr *, uint32_t, int);
+extern void udf_write_aext(struct inode *, struct extent_position *,
+			   struct kernel_lb_addr *, uint32_t, int);
+extern int8_t udf_delete_aext(struct inode *, struct extent_position,
+			      struct kernel_lb_addr, uint32_t);
+extern int8_t udf_next_aext(struct inode *, struct extent_position *,
+			    struct kernel_lb_addr *, uint32_t *, int);
+extern int8_t udf_current_aext(struct inode *, struct extent_position *,
+			       struct kernel_lb_addr *, uint32_t *, int);
+
+/* misc.c */
+extern struct buffer_head *udf_tgetblk(struct super_block *, int);
+extern struct buffer_head *udf_tread(struct super_block *, int);
+extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t,
+						  uint32_t, uint8_t);
+extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
+						  uint8_t);
+extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
+					   uint32_t, uint16_t *);
+extern struct buffer_head *udf_read_ptagged(struct super_block *,
+					    struct kernel_lb_addr *, uint32_t,
+					    uint16_t *);
+extern void udf_update_tag(char *, int);
+extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
+
+/* lowlevel.c */
+extern unsigned int udf_get_last_session(struct super_block *);
+extern unsigned long udf_get_last_block(struct super_block *);
+
+/* partition.c */
+extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t,
+			       uint32_t);
+extern uint32_t udf_get_pblock_virt15(struct super_block *, uint32_t, uint16_t,
+				      uint32_t);
+extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t,
+				      uint32_t);
+extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t,
+				      uint32_t);
+extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
+					  uint32_t);
+extern int udf_relocate_blocks(struct super_block *, long, long *);
+
+static inline uint32_t
+udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
+		  uint32_t offset)
+{
+	return udf_get_pblock(sb, loc->logicalBlockNum,
+			loc->partitionReferenceNum, offset);
+}
+
+/* unicode.c */
+extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *,
+			    int);
+extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
+			    int);
+extern int udf_build_ustr(struct ustr *, dstring *, int);
+extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
+
+/* ialloc.c */
+extern void udf_free_inode(struct inode *);
+extern struct inode *udf_new_inode(struct inode *, umode_t);
+
+/* truncate.c */
+extern void udf_truncate_tail_extent(struct inode *);
+extern void udf_discard_prealloc(struct inode *);
+extern void udf_truncate_extents(struct inode *);
+
+/* balloc.c */
+extern void udf_free_blocks(struct super_block *, struct inode *,
+			    struct kernel_lb_addr *, uint32_t, uint32_t);
+extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
+			       uint32_t, uint32_t);
+extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
+			 uint32_t, int *);
+
+/* directory.c */
+extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
+						struct udf_fileident_bh *,
+						struct fileIdentDesc *,
+						struct extent_position *,
+						struct kernel_lb_addr *, uint32_t *,
+						sector_t *);
+extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
+					       int *offset);
+extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
+extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
+
+/* udftime.c */
+extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
+						struct timestamp src);
+extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
+
+#endif				/* __UDF_DECL_H */
diff --git a/kernel/fs/udf/udfend.h b/kernel/fs/udf/udfend.h
new file mode 100644
index 000000000..6a9f3a9cc
--- /dev/null
+++ b/kernel/fs/udf/udfend.h
@@ -0,0 +1,77 @@
+#ifndef __UDF_ENDIAN_H
+#define __UDF_ENDIAN_H
+
+#include <asm/byteorder.h>
+#include <linux/string.h>
+
+static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
+{
+	struct kernel_lb_addr out;
+
+	out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
+	out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
+
+	return out;
+}
+
+static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
+{
+	struct lb_addr out;
+
+	out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
+	out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
+
+	return out;
+}
+
+static inline struct short_ad lesa_to_cpu(struct short_ad in)
+{
+	struct short_ad out;
+
+	out.extLength = le32_to_cpu(in.extLength);
+	out.extPosition = le32_to_cpu(in.extPosition);
+
+	return out;
+}
+
+static inline struct short_ad cpu_to_lesa(struct short_ad in)
+{
+	struct short_ad out;
+
+	out.extLength = cpu_to_le32(in.extLength);
+	out.extPosition = cpu_to_le32(in.extPosition);
+
+	return out;
+}
+
+static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
+{
+	struct kernel_long_ad out;
+
+	out.extLength = le32_to_cpu(in.extLength);
+	out.extLocation = lelb_to_cpu(in.extLocation);
+
+	return out;
+}
+
+static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
+{
+	struct long_ad out;
+
+	out.extLength = cpu_to_le32(in.extLength);
+	out.extLocation = cpu_to_lelb(in.extLocation);
+
+	return out;
+}
+
+static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
+{
+	struct kernel_extent_ad out;
+
+	out.extLength = le32_to_cpu(in.extLength);
+	out.extLocation = le32_to_cpu(in.extLocation);
+
+	return out;
+}
+
+#endif /* __UDF_ENDIAN_H */
diff --git a/kernel/fs/udf/udftime.c b/kernel/fs/udf/udftime.c
new file mode 100644
index 000000000..77c331f1a
--- /dev/null
+++ b/kernel/fs/udf/udftime.c
@@ -0,0 +1,170 @@
+/* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Paul Eggert (eggert@twinsun.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
+
+/*
+ * dgb 10/02/98: ripped this from glibc source to help convert timestamps
+ *               to unix time
+ *     10/04/98: added new table-based lookup after seeing how ugly
+ *               the gnu code is
+ * blf 09/27/99: ripped out all the old code and inserted new table from
+ *		 John Brockmeyer (without leap second corrections)
+ *		 rewrote udf_stamp_to_time and fixed timezone accounting in
+ *		 udf_time_to_stamp.
+ */
+
+/*
+ * We don't take into account leap seconds. This may be correct or incorrect.
+ * For more NIST information (especially dealing with leap seconds), see:
+ * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm
+ */
+
+#include "udfdecl.h"
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#define EPOCH_YEAR 1970
+
+#ifndef __isleap
+/* Nonzero if YEAR is a leap year (every 4 years,
+   except every 100th isn't, and every 400th is).  */
+#define	__isleap(year)	\
+  ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0))
+#endif
+
+/* How many days come before each month (0-12).  */
+static const unsigned short int __mon_yday[2][13] = {
+	/* Normal years.  */
+	{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
+	/* Leap years.  */
+	{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
+};
+
+#define MAX_YEAR_SECONDS	69
+#define SPD			0x15180	/*3600*24 */
+#define SPY(y, l, s)		(SPD * (365 * y + l) + s)
+
+static time_t year_seconds[MAX_YEAR_SECONDS] = {
+/*1970*/ SPY(0,   0, 0), SPY(1,   0, 0), SPY(2,   0, 0), SPY(3,   1, 0),
+/*1974*/ SPY(4,   1, 0), SPY(5,   1, 0), SPY(6,   1, 0), SPY(7,   2, 0),
+/*1978*/ SPY(8,   2, 0), SPY(9,   2, 0), SPY(10,  2, 0), SPY(11,  3, 0),
+/*1982*/ SPY(12,  3, 0), SPY(13,  3, 0), SPY(14,  3, 0), SPY(15,  4, 0),
+/*1986*/ SPY(16,  4, 0), SPY(17,  4, 0), SPY(18,  4, 0), SPY(19,  5, 0),
+/*1990*/ SPY(20,  5, 0), SPY(21,  5, 0), SPY(22,  5, 0), SPY(23,  6, 0),
+/*1994*/ SPY(24,  6, 0), SPY(25,  6, 0), SPY(26,  6, 0), SPY(27,  7, 0),
+/*1998*/ SPY(28,  7, 0), SPY(29,  7, 0), SPY(30,  7, 0), SPY(31,  8, 0),
+/*2002*/ SPY(32,  8, 0), SPY(33,  8, 0), SPY(34,  8, 0), SPY(35,  9, 0),
+/*2006*/ SPY(36,  9, 0), SPY(37,  9, 0), SPY(38,  9, 0), SPY(39, 10, 0),
+/*2010*/ SPY(40, 10, 0), SPY(41, 10, 0), SPY(42, 10, 0), SPY(43, 11, 0),
+/*2014*/ SPY(44, 11, 0), SPY(45, 11, 0), SPY(46, 11, 0), SPY(47, 12, 0),
+/*2018*/ SPY(48, 12, 0), SPY(49, 12, 0), SPY(50, 12, 0), SPY(51, 13, 0),
+/*2022*/ SPY(52, 13, 0), SPY(53, 13, 0), SPY(54, 13, 0), SPY(55, 14, 0),
+/*2026*/ SPY(56, 14, 0), SPY(57, 14, 0), SPY(58, 14, 0), SPY(59, 15, 0),
+/*2030*/ SPY(60, 15, 0), SPY(61, 15, 0), SPY(62, 15, 0), SPY(63, 16, 0),
+/*2034*/ SPY(64, 16, 0), SPY(65, 16, 0), SPY(66, 16, 0), SPY(67, 17, 0),
+/*2038*/ SPY(68, 17, 0)
+};
+
+#define SECS_PER_HOUR	(60 * 60)
+#define SECS_PER_DAY	(SECS_PER_HOUR * 24)
+
+struct timespec *
+udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
+{
+	int yday;
+	u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
+	u16 year = le16_to_cpu(src.year);
+	uint8_t type = typeAndTimezone >> 12;
+	int16_t offset;
+
+	if (type == 1) {
+		offset = typeAndTimezone << 4;
+		/* sign extent offset */
+		offset = (offset >> 4);
+		if (offset == -2047) /* unspecified offset */
+			offset = 0;
+	} else
+		offset = 0;
+
+	if ((year < EPOCH_YEAR) ||
+	    (year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) {
+		return NULL;
+	}
+	dest->tv_sec = year_seconds[year - EPOCH_YEAR];
+	dest->tv_sec -= offset * 60;
+
+	yday = ((__mon_yday[__isleap(year)][src.month - 1]) + src.day - 1);
+	dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second;
+	dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
+			src.hundredsOfMicroseconds * 100 + src.microseconds);
+	return dest;
+}
+
+struct timestamp *
+udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
+{
+	long int days, rem, y;
+	const unsigned short int *ip;
+	int16_t offset;
+
+	offset = -sys_tz.tz_minuteswest;
+
+	if (!dest)
+		return NULL;
+
+	dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF));
+
+	ts.tv_sec += offset * 60;
+	days = ts.tv_sec / SECS_PER_DAY;
+	rem = ts.tv_sec % SECS_PER_DAY;
+	dest->hour = rem / SECS_PER_HOUR;
+	rem %= SECS_PER_HOUR;
+	dest->minute = rem / 60;
+	dest->second = rem % 60;
+	y = 1970;
+
+#define DIV(a, b) ((a) / (b) - ((a) % (b) < 0))
+#define LEAPS_THRU_END_OF(y) (DIV (y, 4) - DIV (y, 100) + DIV (y, 400))
+
+	while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
+		long int yg = y + days / 365 - (days % 365 < 0);
+
+		/* Adjust DAYS and Y to match the guessed year.  */
+		days -= ((yg - y) * 365
+			 + LEAPS_THRU_END_OF(yg - 1)
+			 - LEAPS_THRU_END_OF(y - 1));
+		y = yg;
+	}
+	dest->year = cpu_to_le16(y);
+	ip = __mon_yday[__isleap(y)];
+	for (y = 11; days < (long int)ip[y]; --y)
+		continue;
+	days -= ip[y];
+	dest->month = y + 1;
+	dest->day = days + 1;
+
+	dest->centiseconds = ts.tv_nsec / 10000000;
+	dest->hundredsOfMicroseconds = (ts.tv_nsec / 1000 -
+					dest->centiseconds * 10000) / 100;
+	dest->microseconds = (ts.tv_nsec / 1000 - dest->centiseconds * 10000 -
+			      dest->hundredsOfMicroseconds * 100);
+	return dest;
+}
+
+/* EOF */
diff --git a/kernel/fs/udf/unicode.c b/kernel/fs/udf/unicode.c
new file mode 100644
index 000000000..b84fee372
--- /dev/null
+++ b/kernel/fs/udf/unicode.c
@@ -0,0 +1,496 @@
+/*
+ * unicode.c
+ *
+ * PURPOSE
+ *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
+ *      Also handles filename mangling
+ *
+ * DESCRIPTION
+ *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
+ *		http://www.osta.org/
+ *	UTF-8 is explained in the IETF RFC XXXX.
+ *		ftp://ftp.internic.net/rfc/rfcxxxx.txt
+ *
+ * COPYRIGHT
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from:
+ *		ftp://prep.ai.mit.edu/pub/gnu/GPL
+ *	Each contributing author retains all rights to their own work.
+ */
+
+#include "udfdecl.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>	/* for memset */
+#include <linux/nls.h>
+#include <linux/crc-itu-t.h>
+#include <linux/slab.h>
+
+#include "udf_sb.h"
+
+static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
+				  int);
+
+static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
+{
+	if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
+		return 0;
+
+	memset(dest, 0, sizeof(struct ustr));
+	memcpy(dest->u_name, src, strlen);
+	dest->u_cmpID = 0x08;
+	dest->u_len = strlen;
+
+	return strlen;
+}
+
+/*
+ * udf_build_ustr
+ */
+int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
+{
+	int usesize;
+
+	if (!dest || !ptr || !size)
+		return -1;
+	BUG_ON(size < 2);
+
+	usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
+	usesize = min(usesize, size - 2);
+	dest->u_cmpID = ptr[0];
+	dest->u_len = usesize;
+	memcpy(dest->u_name, ptr + 1, usesize);
+	memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
+
+	return 0;
+}
+
+/*
+ * udf_build_ustr_exact
+ */
+static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
+{
+	if ((!dest) || (!ptr) || (!exactsize))
+		return -1;
+
+	memset(dest, 0, sizeof(struct ustr));
+	dest->u_cmpID = ptr[0];
+	dest->u_len = exactsize - 1;
+	memcpy(dest->u_name, ptr + 1, exactsize - 1);
+
+	return 0;
+}
+
+/*
+ * udf_ocu_to_utf8
+ *
+ * PURPOSE
+ *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
+ *
+ * PRE-CONDITIONS
+ *	utf			Pointer to UTF-8 output buffer.
+ *	ocu			Pointer to OSTA Compressed Unicode input buffer
+ *				of size UDF_NAME_LEN bytes.
+ * 				both of type "struct ustr *"
+ *
+ * POST-CONDITIONS
+ *	<return>		Zero on success.
+ *
+ * HISTORY
+ *	November 12, 1997 - Andrew E. Mileski
+ *	Written, tested, and released.
+ */
+int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
+{
+	const uint8_t *ocu;
+	uint8_t cmp_id, ocu_len;
+	int i;
+
+	ocu_len = ocu_i->u_len;
+	if (ocu_len == 0) {
+		memset(utf_o, 0, sizeof(struct ustr));
+		return 0;
+	}
+
+	cmp_id = ocu_i->u_cmpID;
+	if (cmp_id != 8 && cmp_id != 16) {
+		memset(utf_o, 0, sizeof(struct ustr));
+		pr_err("unknown compression code (%d) stri=%s\n",
+		       cmp_id, ocu_i->u_name);
+		return 0;
+	}
+
+	ocu = ocu_i->u_name;
+	utf_o->u_len = 0;
+	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
+
+		/* Expand OSTA compressed Unicode to Unicode */
+		uint32_t c = ocu[i++];
+		if (cmp_id == 16)
+			c = (c << 8) | ocu[i++];
+
+		/* Compress Unicode to UTF-8 */
+		if (c < 0x80U)
+			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
+		else if (c < 0x800U) {
+			utf_o->u_name[utf_o->u_len++] =
+						(uint8_t)(0xc0 | (c >> 6));
+			utf_o->u_name[utf_o->u_len++] =
+						(uint8_t)(0x80 | (c & 0x3f));
+		} else {
+			utf_o->u_name[utf_o->u_len++] =
+						(uint8_t)(0xe0 | (c >> 12));
+			utf_o->u_name[utf_o->u_len++] =
+						(uint8_t)(0x80 |
+							  ((c >> 6) & 0x3f));
+			utf_o->u_name[utf_o->u_len++] =
+						(uint8_t)(0x80 | (c & 0x3f));
+		}
+	}
+	utf_o->u_cmpID = 8;
+
+	return utf_o->u_len;
+}
+
+/*
+ *
+ * udf_utf8_to_ocu
+ *
+ * PURPOSE
+ *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
+ *
+ * DESCRIPTION
+ *	This routine is only called by udf_lookup().
+ *
+ * PRE-CONDITIONS
+ *	ocu			Pointer to OSTA Compressed Unicode output
+ *				buffer of size UDF_NAME_LEN bytes.
+ *	utf			Pointer to UTF-8 input buffer.
+ *	utf_len			Length of UTF-8 input buffer in bytes.
+ *
+ * POST-CONDITIONS
+ *	<return>		Zero on success.
+ *
+ * HISTORY
+ *	November 12, 1997 - Andrew E. Mileski
+ *	Written, tested, and released.
+ */
+static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
+{
+	unsigned c, i, max_val, utf_char;
+	int utf_cnt, u_len;
+
+	memset(ocu, 0, sizeof(dstring) * length);
+	ocu[0] = 8;
+	max_val = 0xffU;
+
+try_again:
+	u_len = 0U;
+	utf_char = 0U;
+	utf_cnt = 0U;
+	for (i = 0U; i < utf->u_len; i++) {
+		c = (uint8_t)utf->u_name[i];
+
+		/* Complete a multi-byte UTF-8 character */
+		if (utf_cnt) {
+			utf_char = (utf_char << 6) | (c & 0x3fU);
+			if (--utf_cnt)
+				continue;
+		} else {
+			/* Check for a multi-byte UTF-8 character */
+			if (c & 0x80U) {
+				/* Start a multi-byte UTF-8 character */
+				if ((c & 0xe0U) == 0xc0U) {
+					utf_char = c & 0x1fU;
+					utf_cnt = 1;
+				} else if ((c & 0xf0U) == 0xe0U) {
+					utf_char = c & 0x0fU;
+					utf_cnt = 2;
+				} else if ((c & 0xf8U) == 0xf0U) {
+					utf_char = c & 0x07U;
+					utf_cnt = 3;
+				} else if ((c & 0xfcU) == 0xf8U) {
+					utf_char = c & 0x03U;
+					utf_cnt = 4;
+				} else if ((c & 0xfeU) == 0xfcU) {
+					utf_char = c & 0x01U;
+					utf_cnt = 5;
+				} else {
+					goto error_out;
+				}
+				continue;
+			} else {
+				/* Single byte UTF-8 character (most common) */
+				utf_char = c;
+			}
+		}
+
+		/* Choose no compression if necessary */
+		if (utf_char > max_val) {
+			if (max_val == 0xffU) {
+				max_val = 0xffffU;
+				ocu[0] = (uint8_t)0x10U;
+				goto try_again;
+			}
+			goto error_out;
+		}
+
+		if (max_val == 0xffffU)
+			ocu[++u_len] = (uint8_t)(utf_char >> 8);
+		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
+	}
+
+	if (utf_cnt) {
+error_out:
+		ocu[++u_len] = '?';
+		printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
+	}
+
+	ocu[length - 1] = (uint8_t)u_len + 1;
+
+	return u_len + 1;
+}
+
+static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
+			const struct ustr *ocu_i)
+{
+	const uint8_t *ocu;
+	uint8_t cmp_id, ocu_len;
+	int i, len;
+
+
+	ocu_len = ocu_i->u_len;
+	if (ocu_len == 0) {
+		memset(utf_o, 0, sizeof(struct ustr));
+		return 0;
+	}
+
+	cmp_id = ocu_i->u_cmpID;
+	if (cmp_id != 8 && cmp_id != 16) {
+		memset(utf_o, 0, sizeof(struct ustr));
+		pr_err("unknown compression code (%d) stri=%s\n",
+		       cmp_id, ocu_i->u_name);
+		return 0;
+	}
+
+	ocu = ocu_i->u_name;
+	utf_o->u_len = 0;
+	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
+		/* Expand OSTA compressed Unicode to Unicode */
+		uint32_t c = ocu[i++];
+		if (cmp_id == 16)
+			c = (c << 8) | ocu[i++];
+
+		len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
+				    UDF_NAME_LEN - utf_o->u_len);
+		/* Valid character? */
+		if (len >= 0)
+			utf_o->u_len += len;
+		else
+			utf_o->u_name[utf_o->u_len++] = '?';
+	}
+	utf_o->u_cmpID = 8;
+
+	return utf_o->u_len;
+}
+
+static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
+			int length)
+{
+	int len;
+	unsigned i, max_val;
+	uint16_t uni_char;
+	int u_len;
+
+	memset(ocu, 0, sizeof(dstring) * length);
+	ocu[0] = 8;
+	max_val = 0xffU;
+
+try_again:
+	u_len = 0U;
+	for (i = 0U; i < uni->u_len; i++) {
+		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
+		if (!len)
+			continue;
+		/* Invalid character, deal with it */
+		if (len < 0) {
+			len = 1;
+			uni_char = '?';
+		}
+
+		if (uni_char > max_val) {
+			max_val = 0xffffU;
+			ocu[0] = (uint8_t)0x10U;
+			goto try_again;
+		}
+
+		if (max_val == 0xffffU)
+			ocu[++u_len] = (uint8_t)(uni_char >> 8);
+		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
+		i += len - 1;
+	}
+
+	ocu[length - 1] = (uint8_t)u_len + 1;
+	return u_len + 1;
+}
+
+int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
+		     uint8_t *dname, int dlen)
+{
+	struct ustr *filename, *unifilename;
+	int len = 0;
+
+	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!filename)
+		return 0;
+
+	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!unifilename)
+		goto out1;
+
+	if (udf_build_ustr_exact(unifilename, sname, slen))
+		goto out2;
+
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
+		if (!udf_CS0toUTF8(filename, unifilename)) {
+			udf_debug("Failed in udf_get_filename: sname = %s\n",
+				  sname);
+			goto out2;
+		}
+	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
+		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
+				  unifilename)) {
+			udf_debug("Failed in udf_get_filename: sname = %s\n",
+				  sname);
+			goto out2;
+		}
+	} else
+		goto out2;
+
+	len = udf_translate_to_linux(dname, dlen,
+				     filename->u_name, filename->u_len,
+				     unifilename->u_name, unifilename->u_len);
+out2:
+	kfree(unifilename);
+out1:
+	kfree(filename);
+	return len;
+}
+
+int udf_put_filename(struct super_block *sb, const uint8_t *sname,
+		     uint8_t *dname, int flen)
+{
+	struct ustr unifilename;
+	int namelen;
+
+	if (!udf_char_to_ustr(&unifilename, sname, flen))
+		return 0;
+
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
+		namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
+		if (!namelen)
+			return 0;
+	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
+		namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
+					&unifilename, UDF_NAME_LEN);
+		if (!namelen)
+			return 0;
+	} else
+		return 0;
+
+	return namelen;
+}
+
+#define ILLEGAL_CHAR_MARK	'_'
+#define EXT_MARK		'.'
+#define CRC_MARK		'#'
+#define EXT_SIZE 		5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN			5
+
+static int udf_translate_to_linux(uint8_t *newName, int newLen,
+				  uint8_t *udfName, int udfLen,
+				  uint8_t *fidName, int fidNameLen)
+{
+	int index, newIndex = 0, needsCRC = 0;
+	int extIndex = 0, newExtIndex = 0, hasExt = 0;
+	unsigned short valueCRC;
+	uint8_t curr;
+
+	if (udfName[0] == '.' &&
+	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
+		needsCRC = 1;
+		newIndex = udfLen;
+		memcpy(newName, udfName, udfLen);
+	} else {
+		for (index = 0; index < udfLen; index++) {
+			curr = udfName[index];
+			if (curr == '/' || curr == 0) {
+				needsCRC = 1;
+				curr = ILLEGAL_CHAR_MARK;
+				while (index + 1 < udfLen &&
+						(udfName[index + 1] == '/' ||
+						 udfName[index + 1] == 0))
+					index++;
+			}
+			if (curr == EXT_MARK &&
+					(udfLen - index - 1) <= EXT_SIZE) {
+				if (udfLen == index + 1)
+					hasExt = 0;
+				else {
+					hasExt = 1;
+					extIndex = index;
+					newExtIndex = newIndex;
+				}
+			}
+			if (newIndex < newLen)
+				newName[newIndex++] = curr;
+			else
+				needsCRC = 1;
+		}
+	}
+	if (needsCRC) {
+		uint8_t ext[EXT_SIZE];
+		int localExtIndex = 0;
+
+		if (hasExt) {
+			int maxFilenameLen;
+			for (index = 0;
+			     index < EXT_SIZE && extIndex + index + 1 < udfLen;
+			     index++) {
+				curr = udfName[extIndex + index + 1];
+
+				if (curr == '/' || curr == 0) {
+					needsCRC = 1;
+					curr = ILLEGAL_CHAR_MARK;
+					while (extIndex + index + 2 < udfLen &&
+					      (index + 1 < EXT_SIZE &&
+						(udfName[extIndex + index + 2] == '/' ||
+						 udfName[extIndex + index + 2] == 0)))
+						index++;
+				}
+				ext[localExtIndex++] = curr;
+			}
+			maxFilenameLen = newLen - CRC_LEN - localExtIndex;
+			if (newIndex > maxFilenameLen)
+				newIndex = maxFilenameLen;
+			else
+				newIndex = newExtIndex;
+		} else if (newIndex > newLen - CRC_LEN)
+			newIndex = newLen - CRC_LEN;
+		newName[newIndex++] = CRC_MARK;
+		valueCRC = crc_itu_t(0, fidName, fidNameLen);
+		newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
+		newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
+		newName[newIndex++] = hex_asc_upper_hi(valueCRC);
+		newName[newIndex++] = hex_asc_upper_lo(valueCRC);
+
+		if (hasExt) {
+			newName[newIndex++] = EXT_MARK;
+			for (index = 0; index < localExtIndex; index++)
+				newName[newIndex++] = ext[index];
+		}
+	}
+
+	return newIndex;
+}
diff --git a/kernel/fs/ufs/Kconfig b/kernel/fs/ufs/Kconfig
new file mode 100644
index 000000000..0bf6e16f8
--- /dev/null
+++ b/kernel/fs/ufs/Kconfig
@@ -0,0 +1,43 @@
+config UFS_FS
+	tristate "UFS file system support (read only)"
+	depends on BLOCK
+	help
+	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
+	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
+	  Unixes can create and mount hard disk partitions and diskettes using
+	  this file system as well. Saying Y here will allow you to read from
+	  these partitions; if you also want to write to them, say Y to the
+	  experimental "UFS file system write support", below. Please read the
+	  file <file:Documentation/filesystems/ufs.txt> for more information.
+
+          The recently released UFS2 variant (used in FreeBSD 5.x) is
+          READ-ONLY supported.
+
+	  Note that this option is generally not needed for floppies, since a
+	  good portable way to transport files and directories between unixes
+	  (and even other operating systems) is given by the tar program ("man
+	  tar" or preferably "info tar").
+
+	  When accessing NeXTstep files, you may need to convert them from the
+	  NeXT character set to the Latin1 character set; use the program
+	  recode ("info recode") for this purpose.
+
+	  To compile the UFS file system support as a module, choose M here: the
+	  module will be called ufs.
+
+	  If you haven't heard about all of this before, it's safe to say N.
+
+config UFS_FS_WRITE
+	bool "UFS file system write support (DANGEROUS)"
+	depends on UFS_FS
+	help
+	  Say Y here if you want to try writing to UFS partitions. This is
+	  experimental, so you should back up your UFS partitions beforehand.
+
+config UFS_DEBUG
+	bool "UFS debugging"
+	depends on UFS_FS
+	help
+	  If you are experiencing any problems with the UFS filesystem, say
+	  Y here.  This will result in _many_ additional debugging messages to be
+	  written to the system log.
diff --git a/kernel/fs/ufs/Makefile b/kernel/fs/ufs/Makefile
new file mode 100644
index 000000000..4d0e02b02
--- /dev/null
+++ b/kernel/fs/ufs/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux ufs filesystem routines.
+#
+
+obj-$(CONFIG_UFS_FS) += ufs.o
+
+ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
+	    namei.o super.o symlink.o truncate.o util.o
+ccflags-$(CONFIG_UFS_DEBUG)    += -DDEBUG
diff --git a/kernel/fs/ufs/balloc.c b/kernel/fs/ufs/balloc.c
new file mode 100644
index 000000000..a7106eda5
--- /dev/null
+++ b/kernel/fs/ufs/balloc.c
@@ -0,0 +1,938 @@
+/*
+ *  linux/fs/ufs/balloc.c
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ *
+ * UFS2 write support Evgeniy Dushistov <dushistov@mail.ru>, 2007
+ */
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/capability.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "swab.h"
+#include "util.h"
+
+#define INVBLOCK ((u64)-1L)
+
+static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned);
+static u64 ufs_alloc_fragments(struct inode *, unsigned, u64, unsigned, int *);
+static u64 ufs_alloccg_block(struct inode *, struct ufs_cg_private_info *, u64, int *);
+static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, u64, unsigned);
+static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[];
+static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int);
+
+/*
+ * Free 'count' fragments from fragment number 'fragment'
+ */
+void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cg_private_info * ucpi;
+	struct ufs_cylinder_group * ucg;
+	unsigned cgno, bit, end_bit, bbase, blkmap, i;
+	u64 blkno;
+	
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+	
+	UFSD("ENTER, fragment %llu, count %u\n",
+	     (unsigned long long)fragment, count);
+	
+	if (ufs_fragnum(fragment) + count > uspi->s_fpg)
+		ufs_error (sb, "ufs_free_fragments", "internal error");
+
+	mutex_lock(&UFS_SB(sb)->s_lock);
+	
+	cgno = ufs_dtog(uspi, fragment);
+	bit = ufs_dtogd(uspi, fragment);
+	if (cgno >= uspi->s_ncg) {
+		ufs_panic (sb, "ufs_free_fragments", "freeing blocks are outside device");
+		goto failed;
+	}
+		
+	ucpi = ufs_load_cylinder (sb, cgno);
+	if (!ucpi) 
+		goto failed;
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
+	if (!ufs_cg_chkmagic(sb, ucg)) {
+		ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno);
+		goto failed;
+	}
+
+	end_bit = bit + count;
+	bbase = ufs_blknum (bit);
+	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
+	ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1);
+	for (i = bit; i < end_bit; i++) {
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i))
+			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i);
+		else 
+			ufs_error (sb, "ufs_free_fragments",
+				   "bit already cleared for fragment %u", i);
+	}
+	
+	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
+	uspi->cs_total.cs_nffree += count;
+	fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
+	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
+	ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
+
+	/*
+	 * Trying to reassemble free fragments into block
+	 */
+	blkno = ufs_fragstoblks (bbase);
+	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
+		fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
+		uspi->cs_total.cs_nffree -= uspi->s_fpb;
+		fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
+		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
+			ufs_clusteracct (sb, ucpi, blkno, 1);
+		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
+		uspi->cs_total.cs_nbfree++;
+		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
+		if (uspi->fs_magic != UFS2_MAGIC) {
+			unsigned cylno = ufs_cbtocylno (bbase);
+
+			fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
+						  ufs_cbtorpos(bbase)), 1);
+			fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
+		}
+	}
+	
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
+	ufs_mark_sb_dirty(sb);
+
+	mutex_unlock(&UFS_SB(sb)->s_lock);
+	UFSD("EXIT\n");
+	return;
+
+failed:
+	mutex_unlock(&UFS_SB(sb)->s_lock);
+	UFSD("EXIT (FAILED)\n");
+	return;
+}
+
+/*
+ * Free 'count' fragments from fragment number 'fragment' (free whole blocks)
+ */
+void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cg_private_info * ucpi;
+	struct ufs_cylinder_group * ucg;
+	unsigned overflow, cgno, bit, end_bit, i;
+	u64 blkno;
+	
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+
+	UFSD("ENTER, fragment %llu, count %u\n",
+	     (unsigned long long)fragment, count);
+	
+	if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) {
+		ufs_error (sb, "ufs_free_blocks", "internal error, "
+			   "fragment %llu, count %u\n",
+			   (unsigned long long)fragment, count);
+		goto failed;
+	}
+
+	mutex_lock(&UFS_SB(sb)->s_lock);
+	
+do_more:
+	overflow = 0;
+	cgno = ufs_dtog(uspi, fragment);
+	bit = ufs_dtogd(uspi, fragment);
+	if (cgno >= uspi->s_ncg) {
+		ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device");
+		goto failed_unlock;
+	}
+	end_bit = bit + count;
+	if (end_bit > uspi->s_fpg) {
+		overflow = bit + count - uspi->s_fpg;
+		count -= overflow;
+		end_bit -= overflow;
+	}
+
+	ucpi = ufs_load_cylinder (sb, cgno);
+	if (!ucpi) 
+		goto failed_unlock;
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
+	if (!ufs_cg_chkmagic(sb, ucg)) {
+		ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno);
+		goto failed_unlock;
+	}
+
+	for (i = bit; i < end_bit; i += uspi->s_fpb) {
+		blkno = ufs_fragstoblks(i);
+		if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
+			ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
+		}
+		ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
+		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
+			ufs_clusteracct (sb, ucpi, blkno, 1);
+
+		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
+		uspi->cs_total.cs_nbfree++;
+		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
+
+		if (uspi->fs_magic != UFS2_MAGIC) {
+			unsigned cylno = ufs_cbtocylno(i);
+
+			fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
+						  ufs_cbtorpos(i)), 1);
+			fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
+		}
+	}
+
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
+
+	if (overflow) {
+		fragment += count;
+		count = overflow;
+		goto do_more;
+	}
+
+	ufs_mark_sb_dirty(sb);
+	mutex_unlock(&UFS_SB(sb)->s_lock);
+	UFSD("EXIT\n");
+	return;
+
+failed_unlock:
+	mutex_unlock(&UFS_SB(sb)->s_lock);
+failed:
+	UFSD("EXIT (FAILED)\n");
+	return;
+}
+
+/*
+ * Modify inode page cache in such way:
+ * have - blocks with b_blocknr equal to oldb...oldb+count-1
+ * get - blocks with b_blocknr equal to newb...newb+count-1
+ * also we suppose that oldb...oldb+count-1 blocks
+ * situated at the end of file.
+ *
+ * We can come here from ufs_writepage or ufs_prepare_write,
+ * locked_page is argument of these functions, so we already lock it.
+ */
+static void ufs_change_blocknr(struct inode *inode, sector_t beg,
+			       unsigned int count, sector_t oldb,
+			       sector_t newb, struct page *locked_page)
+{
+	const unsigned blks_per_page =
+		1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	const unsigned mask = blks_per_page - 1;
+	struct address_space * const mapping = inode->i_mapping;
+	pgoff_t index, cur_index, last_index;
+	unsigned pos, j, lblock;
+	sector_t end, i;
+	struct page *page;
+	struct buffer_head *head, *bh;
+
+	UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n",
+	      inode->i_ino, count,
+	     (unsigned long long)oldb, (unsigned long long)newb);
+
+	BUG_ON(!locked_page);
+	BUG_ON(!PageLocked(locked_page));
+
+	cur_index = locked_page->index;
+	end = count + beg;
+	last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	for (i = beg; i < end; i = (i | mask) + 1) {
+		index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		if (likely(cur_index != index)) {
+			page = ufs_get_locked_page(mapping, index);
+			if (!page)/* it was truncated */
+				continue;
+			if (IS_ERR(page)) {/* or EIO */
+				ufs_error(inode->i_sb, __func__,
+					  "read of page %llu failed\n",
+					  (unsigned long long)index);
+				continue;
+			}
+		} else
+			page = locked_page;
+
+		head = page_buffers(page);
+		bh = head;
+		pos = i & mask;
+		for (j = 0; j < pos; ++j)
+			bh = bh->b_this_page;
+
+
+		if (unlikely(index == last_index))
+			lblock = end & mask;
+		else
+			lblock = blks_per_page;
+
+		do {
+			if (j >= lblock)
+				break;
+			pos = (i - beg) + j;
+
+			if (!buffer_mapped(bh))
+					map_bh(bh, inode->i_sb, oldb + pos);
+			if (!buffer_uptodate(bh)) {
+				ll_rw_block(READ, 1, &bh);
+				wait_on_buffer(bh);
+				if (!buffer_uptodate(bh)) {
+					ufs_error(inode->i_sb, __func__,
+						  "read of block failed\n");
+					break;
+				}
+			}
+
+			UFSD(" change from %llu to %llu, pos %u\n",
+			     (unsigned long long)(pos + oldb),
+			     (unsigned long long)(pos + newb), pos);
+
+			bh->b_blocknr = newb + pos;
+			unmap_underlying_metadata(bh->b_bdev,
+						  bh->b_blocknr);
+			mark_buffer_dirty(bh);
+			++j;
+			bh = bh->b_this_page;
+		} while (bh != head);
+
+		if (likely(cur_index != index))
+			ufs_put_locked_page(page);
+ 	}
+	UFSD("EXIT\n");
+}
+
+static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n,
+			    int sync)
+{
+	struct buffer_head *bh;
+	sector_t end = beg + n;
+
+	for (; beg < end; ++beg) {
+		bh = sb_getblk(inode->i_sb, beg);
+		lock_buffer(bh);
+		memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		if (IS_SYNC(inode) || sync)
+			sync_dirty_buffer(bh);
+		brelse(bh);
+	}
+}
+
+u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
+			   u64 goal, unsigned count, int *err,
+			   struct page *locked_page)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_super_block_first * usb1;
+	unsigned cgno, oldcount, newcount;
+	u64 tmp, request, result;
+	
+	UFSD("ENTER, ino %lu, fragment %llu, goal %llu, count %u\n",
+	     inode->i_ino, (unsigned long long)fragment,
+	     (unsigned long long)goal, count);
+	
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+	usb1 = ubh_get_usb_first(uspi);
+	*err = -ENOSPC;
+
+	mutex_lock(&UFS_SB(sb)->s_lock);
+	tmp = ufs_data_ptr_to_cpu(sb, p);
+
+	if (count + ufs_fragnum(fragment) > uspi->s_fpb) {
+		ufs_warning(sb, "ufs_new_fragments", "internal warning"
+			    " fragment %llu, count %u",
+			    (unsigned long long)fragment, count);
+		count = uspi->s_fpb - ufs_fragnum(fragment); 
+	}
+	oldcount = ufs_fragnum (fragment);
+	newcount = oldcount + count;
+
+	/*
+	 * Somebody else has just allocated our fragments
+	 */
+	if (oldcount) {
+		if (!tmp) {
+			ufs_error(sb, "ufs_new_fragments", "internal error, "
+				  "fragment %llu, tmp %llu\n",
+				  (unsigned long long)fragment,
+				  (unsigned long long)tmp);
+			mutex_unlock(&UFS_SB(sb)->s_lock);
+			return INVBLOCK;
+		}
+		if (fragment < UFS_I(inode)->i_lastfrag) {
+			UFSD("EXIT (ALREADY ALLOCATED)\n");
+			mutex_unlock(&UFS_SB(sb)->s_lock);
+			return 0;
+		}
+	}
+	else {
+		if (tmp) {
+			UFSD("EXIT (ALREADY ALLOCATED)\n");
+			mutex_unlock(&UFS_SB(sb)->s_lock);
+			return 0;
+		}
+	}
+
+	/*
+	 * There is not enough space for user on the device
+	 */
+	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		UFSD("EXIT (FAILED)\n");
+		return 0;
+	}
+
+	if (goal >= uspi->s_size) 
+		goal = 0;
+	if (goal == 0) 
+		cgno = ufs_inotocg (inode->i_ino);
+	else
+		cgno = ufs_dtog(uspi, goal);
+	 
+	/*
+	 * allocate new fragment
+	 */
+	if (oldcount == 0) {
+		result = ufs_alloc_fragments (inode, cgno, goal, count, err);
+		if (result) {
+			ufs_cpu_to_data_ptr(sb, p, result);
+			*err = 0;
+			UFS_I(inode)->i_lastfrag =
+				max(UFS_I(inode)->i_lastfrag, fragment + count);
+			ufs_clear_frags(inode, result + oldcount,
+					newcount - oldcount, locked_page != NULL);
+		}
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		UFSD("EXIT, result %llu\n", (unsigned long long)result);
+		return result;
+	}
+
+	/*
+	 * resize block
+	 */
+	result = ufs_add_fragments(inode, tmp, oldcount, newcount);
+	if (result) {
+		*err = 0;
+		UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
+						fragment + count);
+		ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
+				locked_page != NULL);
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		UFSD("EXIT, result %llu\n", (unsigned long long)result);
+		return result;
+	}
+
+	/*
+	 * allocate new block and move data
+	 */
+	switch (fs32_to_cpu(sb, usb1->fs_optim)) {
+	    case UFS_OPTSPACE:
+		request = newcount;
+		if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
+		    > uspi->s_dsize * uspi->s_minfree / (2 * 100))
+			break;
+		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
+		break;
+	    default:
+		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
+	
+	    case UFS_OPTTIME:
+		request = uspi->s_fpb;
+		if (uspi->cs_total.cs_nffree < uspi->s_dsize *
+		    (uspi->s_minfree - 2) / 100)
+			break;
+		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
+		break;
+	}
+	result = ufs_alloc_fragments (inode, cgno, goal, request, err);
+	if (result) {
+		ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
+				locked_page != NULL);
+		ufs_change_blocknr(inode, fragment - oldcount, oldcount,
+				   uspi->s_sbbase + tmp,
+				   uspi->s_sbbase + result, locked_page);
+		ufs_cpu_to_data_ptr(sb, p, result);
+		*err = 0;
+		UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
+						fragment + count);
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		if (newcount < request)
+			ufs_free_fragments (inode, result + newcount, request - newcount);
+		ufs_free_fragments (inode, tmp, oldcount);
+		UFSD("EXIT, result %llu\n", (unsigned long long)result);
+		return result;
+	}
+
+	mutex_unlock(&UFS_SB(sb)->s_lock);
+	UFSD("EXIT (FAILED)\n");
+	return 0;
+}		
+
+static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
+			     unsigned oldcount, unsigned newcount)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cg_private_info * ucpi;
+	struct ufs_cylinder_group * ucg;
+	unsigned cgno, fragno, fragoff, count, fragsize, i;
+	
+	UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
+	     (unsigned long long)fragment, oldcount, newcount);
+	
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+	count = newcount - oldcount;
+	
+	cgno = ufs_dtog(uspi, fragment);
+	if (fs32_to_cpu(sb, UFS_SB(sb)->fs_cs(cgno).cs_nffree) < count)
+		return 0;
+	if ((ufs_fragnum (fragment) + newcount) > uspi->s_fpb)
+		return 0;
+	ucpi = ufs_load_cylinder (sb, cgno);
+	if (!ucpi)
+		return 0;
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
+	if (!ufs_cg_chkmagic(sb, ucg)) {
+		ufs_panic (sb, "ufs_add_fragments",
+			"internal error, bad magic number on cg %u", cgno);
+		return 0;
+	}
+
+	fragno = ufs_dtogd(uspi, fragment);
+	fragoff = ufs_fragnum (fragno);
+	for (i = oldcount; i < newcount; i++)
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
+			return 0;
+	/*
+	 * Block can be extended
+	 */
+	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+	for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
+			break;
+	fragsize = i - oldcount;
+	if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize]))
+		ufs_panic (sb, "ufs_add_fragments",
+			"internal error or corrupted bitmap on cg %u", cgno);
+	fs32_sub(sb, &ucg->cg_frsum[fragsize], 1);
+	if (fragsize != count)
+		fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
+	for (i = oldcount; i < newcount; i++)
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
+
+	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
+	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
+	uspi->cs_total.cs_nffree -= count;
+	
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
+	ufs_mark_sb_dirty(sb);
+
+	UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
+	
+	return fragment;
+}
+
+#define UFS_TEST_FREE_SPACE_CG \
+	ucg = (struct ufs_cylinder_group *) UFS_SB(sb)->s_ucg[cgno]->b_data; \
+	if (fs32_to_cpu(sb, ucg->cg_cs.cs_nbfree)) \
+		goto cg_found; \
+	for (k = count; k < uspi->s_fpb; k++) \
+		if (fs32_to_cpu(sb, ucg->cg_frsum[k])) \
+			goto cg_found; 
+
+static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
+			       u64 goal, unsigned count, int *err)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cg_private_info * ucpi;
+	struct ufs_cylinder_group * ucg;
+	unsigned oldcg, i, j, k, allocsize;
+	u64 result;
+	
+	UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
+	     inode->i_ino, cgno, (unsigned long long)goal, count);
+
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+	oldcg = cgno;
+	
+	/*
+	 * 1. searching on preferred cylinder group
+	 */
+	UFS_TEST_FREE_SPACE_CG
+
+	/*
+	 * 2. quadratic rehash
+	 */
+	for (j = 1; j < uspi->s_ncg; j *= 2) {
+		cgno += j;
+		if (cgno >= uspi->s_ncg) 
+			cgno -= uspi->s_ncg;
+		UFS_TEST_FREE_SPACE_CG
+	}
+
+	/*
+	 * 3. brute force search
+	 * We start at i = 2 ( 0 is checked at 1.step, 1 at 2.step )
+	 */
+	cgno = (oldcg + 1) % uspi->s_ncg;
+	for (j = 2; j < uspi->s_ncg; j++) {
+		cgno++;
+		if (cgno >= uspi->s_ncg)
+			cgno = 0;
+		UFS_TEST_FREE_SPACE_CG
+	}
+	
+	UFSD("EXIT (FAILED)\n");
+	return 0;
+
+cg_found:
+	ucpi = ufs_load_cylinder (sb, cgno);
+	if (!ucpi)
+		return 0;
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
+	if (!ufs_cg_chkmagic(sb, ucg)) 
+		ufs_panic (sb, "ufs_alloc_fragments",
+			"internal error, bad magic number on cg %u", cgno);
+	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+
+	if (count == uspi->s_fpb) {
+		result = ufs_alloccg_block (inode, ucpi, goal, err);
+		if (result == INVBLOCK)
+			return 0;
+		goto succed;
+	}
+
+	for (allocsize = count; allocsize < uspi->s_fpb; allocsize++)
+		if (fs32_to_cpu(sb, ucg->cg_frsum[allocsize]) != 0)
+			break;
+	
+	if (allocsize == uspi->s_fpb) {
+		result = ufs_alloccg_block (inode, ucpi, goal, err);
+		if (result == INVBLOCK)
+			return 0;
+		goal = ufs_dtogd(uspi, result);
+		for (i = count; i < uspi->s_fpb; i++)
+			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
+		i = uspi->s_fpb - count;
+
+		fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
+		uspi->cs_total.cs_nffree += i;
+		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i);
+		fs32_add(sb, &ucg->cg_frsum[i], 1);
+		goto succed;
+	}
+
+	result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
+	if (result == INVBLOCK)
+		return 0;
+	for (i = 0; i < count; i++)
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
+	
+	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
+	uspi->cs_total.cs_nffree -= count;
+	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
+	fs32_sub(sb, &ucg->cg_frsum[allocsize], 1);
+
+	if (count != allocsize)
+		fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1);
+
+succed:
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
+	ufs_mark_sb_dirty(sb);
+
+	result += cgno * uspi->s_fpg;
+	UFSD("EXIT3, result %llu\n", (unsigned long long)result);
+	return result;
+}
+
+static u64 ufs_alloccg_block(struct inode *inode,
+			     struct ufs_cg_private_info *ucpi,
+			     u64 goal, int *err)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cylinder_group * ucg;
+	u64 result, blkno;
+
+	UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
+
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
+
+	if (goal == 0) {
+		goal = ucpi->c_rotor;
+		goto norot;
+	}
+	goal = ufs_blknum (goal);
+	goal = ufs_dtogd(uspi, goal);
+	
+	/*
+	 * If the requested block is available, use it.
+	 */
+	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
+		result = goal;
+		goto gotit;
+	}
+	
+norot:	
+	result = ufs_bitmap_search (sb, ucpi, goal, uspi->s_fpb);
+	if (result == INVBLOCK)
+		return INVBLOCK;
+	ucpi->c_rotor = result;
+gotit:
+	blkno = ufs_fragstoblks(result);
+	ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
+	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
+		ufs_clusteracct (sb, ucpi, blkno, -1);
+
+	fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
+	uspi->cs_total.cs_nbfree--;
+	fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
+
+	if (uspi->fs_magic != UFS2_MAGIC) {
+		unsigned cylno = ufs_cbtocylno((unsigned)result);
+
+		fs16_sub(sb, &ubh_cg_blks(ucpi, cylno,
+					  ufs_cbtorpos((unsigned)result)), 1);
+		fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
+	}
+	
+	UFSD("EXIT, result %llu\n", (unsigned long long)result);
+
+	return result;
+}
+
+static unsigned ubh_scanc(struct ufs_sb_private_info *uspi,
+			  struct ufs_buffer_head *ubh,
+			  unsigned begin, unsigned size,
+			  unsigned char *table, unsigned char mask)
+{
+	unsigned rest, offset;
+	unsigned char *cp;
+	
+
+	offset = begin & ~uspi->s_fmask;
+	begin >>= uspi->s_fshift;
+	for (;;) {
+		if ((offset + size) < uspi->s_fsize)
+			rest = size;
+		else
+			rest = uspi->s_fsize - offset;
+		size -= rest;
+		cp = ubh->bh[begin]->b_data + offset;
+		while ((table[*cp++] & mask) == 0 && --rest)
+			;
+		if (rest || !size)
+			break;
+		begin++;
+		offset = 0;
+	}
+	return (size + rest);
+}
+
+/*
+ * Find a block of the specified size in the specified cylinder group.
+ * @sp: pointer to super block
+ * @ucpi: pointer to cylinder group info
+ * @goal: near which block we want find new one
+ * @count: specified size
+ */
+static u64 ufs_bitmap_search(struct super_block *sb,
+			     struct ufs_cg_private_info *ucpi,
+			     u64 goal, unsigned count)
+{
+	/*
+	 * Bit patterns for identifying fragments in the block map
+	 * used as ((map & mask_arr) == want_arr)
+	 */
+	static const int mask_arr[9] = {
+		0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
+	};
+	static const int want_arr[9] = {
+		0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
+	};
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	unsigned start, length, loc;
+	unsigned pos, want, blockmap, mask, end;
+	u64 result;
+
+	UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx,
+	     (unsigned long long)goal, count);
+
+	if (goal)
+		start = ufs_dtogd(uspi, goal) >> 3;
+	else
+		start = ucpi->c_frotor >> 3;
+		
+	length = ((uspi->s_fpg + 7) >> 3) - start;
+	loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
+		(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
+		1 << (count - 1 + (uspi->s_fpb & 7))); 
+	if (loc == 0) {
+		length = start + 1;
+		loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length,
+				(uspi->s_fpb == 8) ? ufs_fragtable_8fpb :
+				ufs_fragtable_other,
+				1 << (count - 1 + (uspi->s_fpb & 7)));
+		if (loc == 0) {
+			ufs_error(sb, "ufs_bitmap_search",
+				  "bitmap corrupted on cg %u, start %u,"
+				  " length %u, count %u, freeoff %u\n",
+				  ucpi->c_cgx, start, length, count,
+				  ucpi->c_freeoff);
+			return INVBLOCK;
+		}
+		start = 0;
+	}
+	result = (start + length - loc) << 3;
+	ucpi->c_frotor = result;
+
+	/*
+	 * found the byte in the map
+	 */
+
+	for (end = result + 8; result < end; result += uspi->s_fpb) {
+		blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
+		blockmap <<= 1;
+		mask = mask_arr[count];
+		want = want_arr[count];
+		for (pos = 0; pos <= uspi->s_fpb - count; pos++) {
+			if ((blockmap & mask) == want) {
+				UFSD("EXIT, result %llu\n",
+				     (unsigned long long)result);
+				return result + pos;
+ 			}
+			mask <<= 1;
+			want <<= 1;
+ 		}
+ 	}
+
+	ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n",
+		  ucpi->c_cgx);
+	UFSD("EXIT (FAILED)\n");
+	return INVBLOCK;
+}
+
+static void ufs_clusteracct(struct super_block * sb,
+	struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt)
+{
+	struct ufs_sb_private_info * uspi;
+	int i, start, end, forw, back;
+	
+	uspi = UFS_SB(sb)->s_uspi;
+	if (uspi->s_contigsumsize <= 0)
+		return;
+
+	if (cnt > 0)
+		ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
+	else
+		ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
+
+	/*
+	 * Find the size of the cluster going forward.
+	 */
+	start = blkno + 1;
+	end = start + uspi->s_contigsumsize;
+	if ( end >= ucpi->c_nclusterblks)
+		end = ucpi->c_nclusterblks;
+	i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start);
+	if (i > end)
+		i = end;
+	forw = i - start;
+	
+	/*
+	 * Find the size of the cluster going backward.
+	 */
+	start = blkno - 1;
+	end = start - uspi->s_contigsumsize;
+	if (end < 0 ) 
+		end = -1;
+	i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end);
+	if ( i < end) 
+		i = end;
+	back = start - i;
+	
+	/*
+	 * Account for old cluster and the possibly new forward and
+	 * back clusters.
+	 */
+	i = back + forw + 1;
+	if (i > uspi->s_contigsumsize)
+		i = uspi->s_contigsumsize;
+	fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt);
+	if (back > 0)
+		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt);
+	if (forw > 0)
+		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt);
+}
+
+
+static unsigned char ufs_fragtable_8fpb[] = {
+	0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08,
+	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10,
+	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+	0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20,
+	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,	
+	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11,
+	0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0A,
+	0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x08, 0x09, 0x09, 0x0A, 0x10, 0x11, 0x20, 0x40,
+	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11,
+	0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21,
+	0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0A,
+	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0A, 0x12,
+	0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0C,
+	0x08, 0x09, 0x09, 0x0A, 0x09, 0x09, 0x0A, 0x0C, 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80,
+};
+
+static unsigned char ufs_fragtable_other[] = {
+	0x00, 0x16, 0x16, 0x2A, 0x16, 0x16, 0x26, 0x4E, 0x16, 0x16, 0x16, 0x3E, 0x2A, 0x3E, 0x4E, 0x8A,
+	0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E,
+	0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E,
+	0x2A, 0x3E, 0x3E, 0x2A, 0x3E, 0x3E, 0x2E, 0x6E, 0x3E, 0x3E, 0x3E, 0x3E, 0x2A, 0x3E, 0x6E, 0xAA,
+	0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E,
+	0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E,
+	0x26, 0x36, 0x36, 0x2E, 0x36, 0x36, 0x26, 0x6E, 0x36, 0x36, 0x36, 0x3E, 0x2E, 0x3E, 0x6E, 0xAE,
+	0x4E, 0x5E, 0x5E, 0x6E, 0x5E, 0x5E, 0x6E, 0x4E, 0x5E, 0x5E, 0x5E, 0x7E, 0x6E, 0x7E, 0x4E, 0xCE,
+	0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E,
+	0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E,
+	0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E,
+	0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0xBE,
+	0x2A, 0x3E, 0x3E, 0x2A, 0x3E, 0x3E, 0x2E, 0x6E, 0x3E, 0x3E, 0x3E, 0x3E, 0x2A, 0x3E, 0x6E, 0xAA,
+	0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E,	0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0xBE,
+	0x4E, 0x5E, 0x5E, 0x6E, 0x5E, 0x5E, 0x6E, 0x4E, 0x5E, 0x5E, 0x5E, 0x7E, 0x6E, 0x7E, 0x4E, 0xCE,
+	0x8A, 0x9E, 0x9E, 0xAA, 0x9E, 0x9E, 0xAE, 0xCE, 0x9E, 0x9E, 0x9E, 0xBE, 0xAA, 0xBE, 0xCE, 0x8A,
+};
diff --git a/kernel/fs/ufs/cylinder.c b/kernel/fs/ufs/cylinder.c
new file mode 100644
index 000000000..b4676322d
--- /dev/null
+++ b/kernel/fs/ufs/cylinder.c
@@ -0,0 +1,201 @@
+/*
+ *  linux/fs/ufs/cylinder.c
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ *
+ *  ext2 - inode (block) bitmap caching inspired
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+
+#include <asm/byteorder.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "swab.h"
+#include "util.h"
+
+/*
+ * Read cylinder group into cache. The memory space for ufs_cg_private_info
+ * structure is already allocated during ufs_read_super.
+ */
+static void ufs_read_cylinder (struct super_block * sb,
+	unsigned cgno, unsigned bitmap_nr)
+{
+	struct ufs_sb_info * sbi = UFS_SB(sb);
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cg_private_info * ucpi;
+	struct ufs_cylinder_group * ucg;
+	unsigned i, j;
+
+	UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr);
+	uspi = sbi->s_uspi;
+	ucpi = sbi->s_ucpi[bitmap_nr];
+	ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
+
+	UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno);
+	UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits;
+	/*
+	 * We have already the first fragment of cylinder group block in buffer
+	 */
+	UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
+	for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
+		if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
+			goto failed;
+	sbi->s_cgno[bitmap_nr] = cgno;
+			
+	ucpi->c_cgx	= fs32_to_cpu(sb, ucg->cg_cgx);
+	ucpi->c_ncyl	= fs16_to_cpu(sb, ucg->cg_ncyl);
+	ucpi->c_niblk	= fs16_to_cpu(sb, ucg->cg_niblk);
+	ucpi->c_ndblk	= fs32_to_cpu(sb, ucg->cg_ndblk);
+	ucpi->c_rotor	= fs32_to_cpu(sb, ucg->cg_rotor);
+	ucpi->c_frotor	= fs32_to_cpu(sb, ucg->cg_frotor);
+	ucpi->c_irotor	= fs32_to_cpu(sb, ucg->cg_irotor);
+	ucpi->c_btotoff	= fs32_to_cpu(sb, ucg->cg_btotoff);
+	ucpi->c_boff	= fs32_to_cpu(sb, ucg->cg_boff);
+	ucpi->c_iusedoff = fs32_to_cpu(sb, ucg->cg_iusedoff);
+	ucpi->c_freeoff	= fs32_to_cpu(sb, ucg->cg_freeoff);
+	ucpi->c_nextfreeoff = fs32_to_cpu(sb, ucg->cg_nextfreeoff);
+	ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff);
+	ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
+	ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
+	UFSD("EXIT\n");
+	return;	
+	
+failed:
+	for (j = 1; j < i; j++)
+		brelse (sbi->s_ucg[j]);
+	sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
+	ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno);
+}
+
+/*
+ * Remove cylinder group from cache, doesn't release memory
+ * allocated for cylinder group (this is done at ufs_put_super only).
+ */
+void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
+{
+	struct ufs_sb_info * sbi = UFS_SB(sb);
+	struct ufs_sb_private_info * uspi; 
+	struct ufs_cg_private_info * ucpi;
+	struct ufs_cylinder_group * ucg;
+	unsigned i;
+
+	UFSD("ENTER, bitmap_nr %u\n", bitmap_nr);
+
+	uspi = sbi->s_uspi;
+	if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) {
+		UFSD("EXIT\n");
+		return;
+	}
+	ucpi = sbi->s_ucpi[bitmap_nr];
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
+
+	if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) {
+		ufs_panic (sb, "ufs_put_cylinder", "internal error");
+		return;
+	}
+	/*
+	 * rotor is not so important data, so we put it to disk 
+	 * at the end of working with cylinder
+	 */
+	ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor);
+	ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor);
+	ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor);
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
+		brelse (UCPI_UBH(ucpi)->bh[i]);
+	}
+
+	sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
+	UFSD("EXIT\n");
+}
+
+/*
+ * Find cylinder group in cache and return it as pointer.
+ * If cylinder group is not in cache, we will load it from disk.
+ *
+ * The cache is managed by LRU algorithm. 
+ */
+struct ufs_cg_private_info * ufs_load_cylinder (
+	struct super_block * sb, unsigned cgno)
+{
+	struct ufs_sb_info * sbi = UFS_SB(sb);
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cg_private_info * ucpi;
+	unsigned cg, i, j;
+
+	UFSD("ENTER, cgno %u\n", cgno);
+
+	uspi = sbi->s_uspi;
+	if (cgno >= uspi->s_ncg) {
+		ufs_panic (sb, "ufs_load_cylinder", "internal error, high number of cg");
+		return NULL;
+	}
+	/*
+	 * Cylinder group number cg it in cache and it was last used
+	 */
+	if (sbi->s_cgno[0] == cgno) {
+		UFSD("EXIT\n");
+		return sbi->s_ucpi[0];
+	}
+	/*
+	 * Number of cylinder groups is not higher than UFS_MAX_GROUP_LOADED
+	 */
+	if (uspi->s_ncg <= UFS_MAX_GROUP_LOADED) {
+		if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) {
+			if (sbi->s_cgno[cgno] != cgno) {
+				ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache");
+				UFSD("EXIT (FAILED)\n");
+				return NULL;
+			}
+			else {
+				UFSD("EXIT\n");
+				return sbi->s_ucpi[cgno];
+			}
+		} else {
+			ufs_read_cylinder (sb, cgno, cgno);
+			UFSD("EXIT\n");
+			return sbi->s_ucpi[cgno];
+		}
+	}
+	/*
+	 * Cylinder group number cg is in cache but it was not last used, 
+	 * we will move to the first position
+	 */
+	for (i = 0; i < sbi->s_cg_loaded && sbi->s_cgno[i] != cgno; i++);
+	if (i < sbi->s_cg_loaded && sbi->s_cgno[i] == cgno) {
+		cg = sbi->s_cgno[i];
+		ucpi = sbi->s_ucpi[i];
+		for (j = i; j > 0; j--) {
+			sbi->s_cgno[j] = sbi->s_cgno[j-1];
+			sbi->s_ucpi[j] = sbi->s_ucpi[j-1];
+		}
+		sbi->s_cgno[0] = cg;
+		sbi->s_ucpi[0] = ucpi;
+	/*
+	 * Cylinder group number cg is not in cache, we will read it from disk
+	 * and put it to the first position
+	 */
+	} else {
+		if (sbi->s_cg_loaded < UFS_MAX_GROUP_LOADED)
+			sbi->s_cg_loaded++;
+		else
+			ufs_put_cylinder (sb, UFS_MAX_GROUP_LOADED-1);
+		ucpi = sbi->s_ucpi[sbi->s_cg_loaded - 1];
+		for (j = sbi->s_cg_loaded - 1; j > 0; j--) {
+			sbi->s_cgno[j] = sbi->s_cgno[j-1];
+			sbi->s_ucpi[j] = sbi->s_ucpi[j-1];
+		}
+		sbi->s_ucpi[0] = ucpi;
+		ufs_read_cylinder (sb, cgno, 0);
+	}
+	UFSD("EXIT\n");
+	return sbi->s_ucpi[0];
+}
diff --git a/kernel/fs/ufs/dir.c b/kernel/fs/ufs/dir.c
new file mode 100644
index 000000000..1bfe8cabf
--- /dev/null
+++ b/kernel/fs/ufs/dir.c
@@ -0,0 +1,662 @@
+/*
+ *  linux/fs/ufs/ufs_dir.c
+ *
+ * Copyright (C) 1996
+ * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu)
+ * Laboratory for Computer Science Research Computing Facility
+ * Rutgers, The State University of New Jersey
+ *
+ * swab support by Francois-Rene Rideau <fare@tunes.org> 19970406
+ *
+ * 4.4BSD (FreeBSD) support added on February 1st 1998 by
+ * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
+ * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
+ *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "swab.h"
+#include "util.h"
+
+/*
+ * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
+ *
+ * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller.
+ */
+static inline int ufs_match(struct super_block *sb, int len,
+		const unsigned char *name, struct ufs_dir_entry *de)
+{
+	if (len != ufs_get_de_namlen(sb, de))
+		return 0;
+	if (!de->d_ino)
+		return 0;
+	return !memcmp(name, de->d_name, len);
+}
+
+static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	int err = 0;
+
+	dir->i_version++;
+	block_write_end(NULL, mapping, pos, len, len, page, NULL);
+	if (pos+len > dir->i_size) {
+		i_size_write(dir, pos+len);
+		mark_inode_dirty(dir);
+	}
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+	return err;
+}
+
+static inline void ufs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+static inline unsigned long ufs_dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
+
+ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
+{
+	ino_t res = 0;
+	struct ufs_dir_entry *de;
+	struct page *page;
+	
+	de = ufs_find_entry(dir, qstr, &page);
+	if (de) {
+		res = fs32_to_cpu(dir->i_sb, de->d_ino);
+		ufs_put_page(page);
+	}
+	return res;
+}
+
+
+/* Releases the page */
+void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+		  struct page *page, struct inode *inode)
+{
+	loff_t pos = page_offset(page) +
+			(char *) de - (char *) page_address(page);
+	unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen);
+	int err;
+
+	lock_page(page);
+	err = ufs_prepare_chunk(page, pos, len);
+	BUG_ON(err);
+
+	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
+	ufs_set_de_type(dir->i_sb, de, inode->i_mode);
+
+	err = ufs_commit_chunk(page, pos, len);
+	ufs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+}
+
+
+static void ufs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	struct super_block *sb = dir->i_sb;
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1;
+	struct ufs_dir_entry *p;
+	char *error;
+
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & chunk_mask)
+			goto Ebadsize;
+		if (!limit)
+			goto out;
+	}
+	for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct ufs_dir_entry *)(kaddr + offs);
+		rec_len = fs16_to_cpu(sb, p->d_reclen);
+
+		if (rec_len < UFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p)))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~chunk_mask)
+			goto Espan;
+		if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
+						  UFS_SB(sb)->s_uspi->s_ncg))
+			goto Einumber;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+	/* Too bad, we had an error */
+
+Ebadsize:
+	ufs_error(sb, "ufs_check_page",
+		  "size of directory #%lu is not a multiple of chunk size",
+		  dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+Einumber:
+	error = "inode out of bounds";
+bad_entry:
+	ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
+		   "offset=%lu, rec_len=%d, name_len=%d",
+		   dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		   rec_len, ufs_get_de_namlen(sb, p));
+	goto fail;
+Eend:
+	p = (struct ufs_dir_entry *)(kaddr + offs);
+	ufs_error(sb, __func__,
+		   "entry in directory #%lu spans the page boundary"
+		   "offset=%lu",
+		   dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
+
+static struct page *ufs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageChecked(page))
+			ufs_check_page(page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	ufs_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned
+ufs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	unsigned last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
+}
+
+static inline struct ufs_dir_entry *
+ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p)
+{
+	return (struct ufs_dir_entry *)((char *)p +
+					fs16_to_cpu(sb, p->d_reclen));
+}
+
+struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
+{
+	struct page *page = ufs_get_page(dir, 0);
+	struct ufs_dir_entry *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = ufs_next_entry(dir->i_sb,
+				    (struct ufs_dir_entry *)page_address(page));
+		*p = page;
+	}
+	return de;
+}
+
+/*
+ *	ufs_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the page in which the entry was found, and the entry itself
+ * (as a parameter - res_dir). Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
+				     struct page **res_page)
+{
+	struct super_block *sb = dir->i_sb;
+	const unsigned char *name = qstr->name;
+	int namelen = qstr->len;
+	unsigned reclen = UFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = ufs_dir_pages(dir);
+	struct page *page = NULL;
+	struct ufs_inode_info *ui = UFS_I(dir);
+	struct ufs_dir_entry *de;
+
+	UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
+
+	if (npages == 0 || namelen > UFS_MAXNAMLEN)
+		goto out;
+
+	/* OFFSET_CACHE */
+	*res_page = NULL;
+
+	start = ui->i_dir_start_lookup;
+
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = ufs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct ufs_dir_entry *) kaddr;
+			kaddr += ufs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->d_reclen == 0) {
+					ufs_error(dir->i_sb, __func__,
+						  "zero-length directory entry");
+					ufs_put_page(page);
+					goto out;
+				}
+				if (ufs_match(sb, namelen, name, de))
+					goto found;
+				de = ufs_next_entry(sb, de);
+			}
+			ufs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	ui->i_dir_start_lookup = n;
+	return de;
+}
+
+/*
+ *	Parent is locked.
+ */
+int ufs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct super_block *sb = dir->i_sb;
+	unsigned reclen = UFS_DIR_REC_LEN(namelen);
+	const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize;
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct ufs_dir_entry *de;
+	unsigned long npages = ufs_dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	loff_t pos;
+	int err;
+
+	UFSD("ENTER, name %s, namelen %u\n", name, namelen);
+
+	/*
+	 * We take care of directory expansion in the same loop.
+	 * This code plays outside i_size, so it locks the page
+	 * to protect that region.
+	 */
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = ufs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + ufs_last_byte(dir, n);
+		de = (struct ufs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				/* We hit i_size */
+				name_len = 0;
+				rec_len = chunk_size;
+				de->d_reclen = cpu_to_fs16(sb, chunk_size);
+				de->d_ino = 0;
+				goto got_it;
+			}
+			if (de->d_reclen == 0) {
+				ufs_error(dir->i_sb, __func__,
+					  "zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (ufs_match(sb, namelen, name, de))
+				goto out_unlock;
+			name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de));
+			rec_len = fs16_to_cpu(sb, de->d_reclen);
+			if (!de->d_ino && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct ufs_dir_entry *) ((char *) de + rec_len);
+		}
+		unlock_page(page);
+		ufs_put_page(page);
+	}
+	BUG();
+	return -EINVAL;
+
+got_it:
+	pos = page_offset(page) +
+			(char*)de - (char*)page_address(page);
+	err = ufs_prepare_chunk(page, pos, rec_len);
+	if (err)
+		goto out_unlock;
+	if (de->d_ino) {
+		struct ufs_dir_entry *de1 =
+			(struct ufs_dir_entry *) ((char *) de + name_len);
+		de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len);
+		de->d_reclen = cpu_to_fs16(sb, name_len);
+
+		de = de1;
+	}
+
+	ufs_set_de_namlen(sb, de, namelen);
+	memcpy(de->d_name, name, namelen + 1);
+	de->d_ino = cpu_to_fs32(sb, inode->i_ino);
+	ufs_set_de_type(sb, de, inode->i_mode);
+
+	err = ufs_commit_chunk(page, pos, rec_len);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+
+	mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	ufs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+static inline unsigned
+ufs_validate_entry(struct super_block *sb, char *base,
+		   unsigned offset, unsigned mask)
+{
+	struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset);
+	struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask));
+	while ((char*)p < (char*)de) {
+		if (p->d_reclen == 0)
+			break;
+		p = ufs_next_entry(sb, p);
+	}
+	return (char *)p - base;
+}
+
+
+/*
+ * This is blatantly stolen from ext2fs
+ */
+static int
+ufs_readdir(struct file *file, struct dir_context *ctx)
+{
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = ufs_dir_pages(inode);
+	unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
+	int need_revalidate = file->f_version != inode->i_version;
+	unsigned flags = UFS_SB(sb)->s_flags;
+
+	UFSD("BEGIN\n");
+
+	if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
+		return 0;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct ufs_dir_entry *de;
+
+		struct page *page = ufs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			ufs_error(sb, __func__,
+				  "bad page in #%lu",
+				  inode->i_ino);
+			ctx->pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			file->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (struct ufs_dir_entry *)(kaddr+offset);
+		limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
+		for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
+			if (de->d_reclen == 0) {
+				ufs_error(sb, __func__,
+					"zero-length directory entry");
+				ufs_put_page(page);
+				return -EIO;
+			}
+			if (de->d_ino) {
+				unsigned char d_type = DT_UNKNOWN;
+
+				UFSD("filldir(%s,%u)\n", de->d_name,
+				      fs32_to_cpu(sb, de->d_ino));
+				UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
+
+				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
+					d_type = de->d_u.d_44.d_type;
+
+				if (!dir_emit(ctx, de->d_name,
+					       ufs_get_de_namlen(sb, de),
+					       fs32_to_cpu(sb, de->d_ino),
+					       d_type)) {
+					ufs_put_page(page);
+					return 0;
+				}
+			}
+			ctx->pos += fs16_to_cpu(sb, de->d_reclen);
+		}
+		ufs_put_page(page);
+	}
+	return 0;
+}
+
+
+/*
+ * ufs_delete_entry deletes a directory entry by merging it with the
+ * previous entry.
+ */
+int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
+		     struct page * page)
+{
+	struct super_block *sb = inode->i_sb;
+	char *kaddr = page_address(page);
+	unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
+	unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
+	loff_t pos;
+	struct ufs_dir_entry *pde = NULL;
+	struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
+	int err;
+
+	UFSD("ENTER\n");
+
+	UFSD("ino %u, reclen %u, namlen %u, name %s\n",
+	      fs32_to_cpu(sb, de->d_ino),
+	      fs16_to_cpu(sb, de->d_reclen),
+	      ufs_get_de_namlen(sb, de), de->d_name);
+
+	while ((char*)de < (char*)dir) {
+		if (de->d_reclen == 0) {
+			ufs_error(inode->i_sb, __func__,
+				  "zero-length directory entry");
+			err = -EIO;
+			goto out;
+		}
+		pde = de;
+		de = ufs_next_entry(sb, de);
+	}
+	if (pde)
+		from = (char*)pde - (char*)page_address(page);
+
+	pos = page_offset(page) + from;
+	lock_page(page);
+	err = ufs_prepare_chunk(page, pos, to - from);
+	BUG_ON(err);
+	if (pde)
+		pde->d_reclen = cpu_to_fs16(sb, to - from);
+	dir->d_ino = 0;
+	err = ufs_commit_chunk(page, pos, to - from);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+out:
+	ufs_put_page(page);
+	UFSD("EXIT\n");
+	return err;
+}
+
+int ufs_make_empty(struct inode * inode, struct inode *dir)
+{
+	struct super_block * sb = dir->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
+	const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize;
+	struct ufs_dir_entry * de;
+	char *base;
+	int err;
+
+	if (!page)
+		return -ENOMEM;
+
+	err = ufs_prepare_chunk(page, 0, chunk_size);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+	kmap(page);
+	base = (char*)page_address(page);
+	memset(base, 0, PAGE_CACHE_SIZE);
+
+	de = (struct ufs_dir_entry *) base;
+
+	de->d_ino = cpu_to_fs32(sb, inode->i_ino);
+	ufs_set_de_type(sb, de, inode->i_mode);
+	ufs_set_de_namlen(sb, de, 1);
+	de->d_reclen = cpu_to_fs16(sb, UFS_DIR_REC_LEN(1));
+	strcpy (de->d_name, ".");
+	de = (struct ufs_dir_entry *)
+		((char *)de + fs16_to_cpu(sb, de->d_reclen));
+	de->d_ino = cpu_to_fs32(sb, dir->i_ino);
+	ufs_set_de_type(sb, de, dir->i_mode);
+	de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1));
+	ufs_set_de_namlen(sb, de, 2);
+	strcpy (de->d_name, "..");
+	kunmap(page);
+
+	err = ufs_commit_chunk(page, 0, chunk_size);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int ufs_empty_dir(struct inode * inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct page *page = NULL;
+	unsigned long i, npages = ufs_dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct ufs_dir_entry *de;
+		page = ufs_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct ufs_dir_entry *)kaddr;
+		kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->d_reclen == 0) {
+				ufs_error(inode->i_sb, __func__,
+					"zero-length directory entry: "
+					"kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
+			}
+			if (de->d_ino) {
+				u16 namelen=ufs_get_de_namlen(sb, de);
+				/* check for . and .. */
+				if (de->d_name[0] != '.')
+					goto not_empty;
+				if (namelen > 2)
+					goto not_empty;
+				if (namelen < 2) {
+					if (inode->i_ino !=
+					    fs32_to_cpu(sb, de->d_ino))
+						goto not_empty;
+				} else if (de->d_name[1] != '.')
+					goto not_empty;
+			}
+			de = ufs_next_entry(sb, de);
+		}
+		ufs_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	ufs_put_page(page);
+	return 0;
+}
+
+const struct file_operations ufs_dir_operations = {
+	.read		= generic_read_dir,
+	.iterate	= ufs_readdir,
+	.fsync		= generic_file_fsync,
+	.llseek		= generic_file_llseek,
+};
diff --git a/kernel/fs/ufs/file.c b/kernel/fs/ufs/file.c
new file mode 100644
index 000000000..042ddbf11
--- /dev/null
+++ b/kernel/fs/ufs/file.c
@@ -0,0 +1,44 @@
+/*
+ *  linux/fs/ufs/file.c
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ *
+ *  from
+ *
+ *  linux/fs/ext2/file.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/file.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext2 fs regular file handling primitives
+ */
+
+#include <linux/fs.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+
+/*
+ * We have mostly NULL's here: the current defaults are ok for
+ * the ufs filesystem.
+ */
+ 
+const struct file_operations ufs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.mmap		= generic_file_mmap,
+	.open           = generic_file_open,
+	.fsync		= generic_file_fsync,
+	.splice_read	= generic_file_splice_read,
+};
diff --git a/kernel/fs/ufs/ialloc.c b/kernel/fs/ufs/ialloc.c
new file mode 100644
index 000000000..fd0203ce1
--- /dev/null
+++ b/kernel/fs/ufs/ialloc.c
@@ -0,0 +1,353 @@
+/*
+ *  linux/fs/ufs/ialloc.c
+ *
+ * Copyright (c) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ *
+ *  from
+ *
+ *  linux/fs/ext2/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  BSD ufs-inspired inode and directory allocation by 
+ *  Stephen Tweedie (sct@dcs.ed.ac.uk), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * UFS2 write support added by
+ * Evgeniy Dushistov <dushistov@mail.ru>, 2007
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "swab.h"
+#include "util.h"
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ *
+ * HOWEVER: we must make sure that we get no aliases,
+ * which means that we have to call "clear_inode()"
+ * _before_ we mark the inode not in use in the inode
+ * bitmaps. Otherwise a newly created file might use
+ * the same inode number (not actually the same pointer
+ * though), and then we'd have two inodes sharing the
+ * same inode number and space on the harddisk.
+ */
+void ufs_free_inode (struct inode * inode)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cg_private_info * ucpi;
+	struct ufs_cylinder_group * ucg;
+	int is_directory;
+	unsigned ino, cg, bit;
+	
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
+
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+	
+	ino = inode->i_ino;
+
+	mutex_lock(&UFS_SB(sb)->s_lock);
+
+	if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) {
+		ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino);
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		return;
+	}
+	
+	cg = ufs_inotocg (ino);
+	bit = ufs_inotocgoff (ino);
+	ucpi = ufs_load_cylinder (sb, cg);
+	if (!ucpi) {
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		return;
+	}
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
+	if (!ufs_cg_chkmagic(sb, ucg))
+		ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
+
+	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+
+	is_directory = S_ISDIR(inode->i_mode);
+
+	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
+		ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
+	else {
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
+		if (ino < ucpi->c_irotor)
+			ucpi->c_irotor = ino;
+		fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
+		uspi->cs_total.cs_nifree++;
+		fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1);
+
+		if (is_directory) {
+			fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1);
+			uspi->cs_total.cs_ndir--;
+			fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1);
+		}
+	}
+
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
+	
+	ufs_mark_sb_dirty(sb);
+	mutex_unlock(&UFS_SB(sb)->s_lock);
+	UFSD("EXIT\n");
+}
+
+/*
+ * Nullify new chunk of inodes,
+ * BSD people also set ui_gen field of inode
+ * during nullification, but we not care about
+ * that because of linux ufs do not support NFS
+ */
+static void ufs2_init_inodes_chunk(struct super_block *sb,
+				   struct ufs_cg_private_info *ucpi,
+				   struct ufs_cylinder_group *ucg)
+{
+	struct buffer_head *bh;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	sector_t beg = uspi->s_sbbase +
+		ufs_inotofsba(ucpi->c_cgx * uspi->s_ipg +
+			      fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk));
+	sector_t end = beg + uspi->s_fpb;
+
+	UFSD("ENTER cgno %d\n", ucpi->c_cgx);
+
+	for (; beg < end; ++beg) {
+		bh = sb_getblk(sb, beg);
+		lock_buffer(bh);
+		memset(bh->b_data, 0, sb->s_blocksize);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		if (sb->s_flags & MS_SYNCHRONOUS)
+			sync_dirty_buffer(bh);
+		brelse(bh);
+	}
+
+	fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
+	ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
+
+	UFSD("EXIT\n");
+}
+
+/*
+ * There are two policies for allocating an inode.  If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
+{
+	struct super_block * sb;
+	struct ufs_sb_info * sbi;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_cg_private_info * ucpi;
+	struct ufs_cylinder_group * ucg;
+	struct inode * inode;
+	unsigned cg, bit, i, j, start;
+	struct ufs_inode_info *ufsi;
+	int err = -ENOSPC;
+
+	UFSD("ENTER\n");
+	
+	/* Cannot create files in a deleted directory */
+	if (!dir || !dir->i_nlink)
+		return ERR_PTR(-EPERM);
+	sb = dir->i_sb;
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	ufsi = UFS_I(inode);
+	sbi = UFS_SB(sb);
+	uspi = sbi->s_uspi;
+
+	mutex_lock(&sbi->s_lock);
+
+	/*
+	 * Try to place the inode in its parent directory
+	 */
+	i = ufs_inotocg(dir->i_ino);
+	if (sbi->fs_cs(i).cs_nifree) {
+		cg = i;
+		goto cg_found;
+	}
+
+	/*
+	 * Use a quadratic hash to find a group with a free inode
+	 */
+	for ( j = 1; j < uspi->s_ncg; j <<= 1 ) {
+		i += j;
+		if (i >= uspi->s_ncg)
+			i -= uspi->s_ncg;
+		if (sbi->fs_cs(i).cs_nifree) {
+			cg = i;
+			goto cg_found;
+		}
+	}
+
+	/*
+	 * That failed: try linear search for a free inode
+	 */
+	i = ufs_inotocg(dir->i_ino) + 1;
+	for (j = 2; j < uspi->s_ncg; j++) {
+		i++;
+		if (i >= uspi->s_ncg)
+			i = 0;
+		if (sbi->fs_cs(i).cs_nifree) {
+			cg = i;
+			goto cg_found;
+		}
+	}
+
+	goto failed;
+
+cg_found:
+	ucpi = ufs_load_cylinder (sb, cg);
+	if (!ucpi) {
+		err = -EIO;
+		goto failed;
+	}
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
+	if (!ufs_cg_chkmagic(sb, ucg)) 
+		ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number");
+
+	start = ucpi->c_irotor;
+	bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start);
+	if (!(bit < uspi->s_ipg)) {
+		bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start);
+		if (!(bit < start)) {
+			ufs_error (sb, "ufs_new_inode",
+			    "cylinder group %u corrupted - error in inode bitmap\n", cg);
+			err = -EIO;
+			goto failed;
+		}
+	}
+	UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg);
+	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
+		ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
+	else {
+		ufs_panic (sb, "ufs_new_inode", "internal error");
+		err = -EIO;
+		goto failed;
+	}
+
+	if (uspi->fs_magic == UFS2_MAGIC) {
+		u32 initediblk = fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk);
+
+		if (bit + uspi->s_inopb > initediblk &&
+		    initediblk < fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_niblk))
+			ufs2_init_inodes_chunk(sb, ucpi, ucg);
+	}
+
+	fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1);
+	uspi->cs_total.cs_nifree--;
+	fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1);
+	
+	if (S_ISDIR(mode)) {
+		fs32_add(sb, &ucg->cg_cs.cs_ndir, 1);
+		uspi->cs_total.cs_ndir++;
+		fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
+	}
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
+	ufs_mark_sb_dirty(sb);
+
+	inode->i_ino = cg * uspi->s_ipg + bit;
+	inode_init_owner(inode, dir, mode);
+	inode->i_blocks = 0;
+	inode->i_generation = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	ufsi->i_flags = UFS_I(dir)->i_flags;
+	ufsi->i_lastfrag = 0;
+	ufsi->i_shadow = 0;
+	ufsi->i_osync = 0;
+	ufsi->i_oeftflag = 0;
+	ufsi->i_dir_start_lookup = 0;
+	memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
+	if (insert_inode_locked(inode) < 0) {
+		err = -EIO;
+		goto failed;
+	}
+	mark_inode_dirty(inode);
+
+	if (uspi->fs_magic == UFS2_MAGIC) {
+		struct buffer_head *bh;
+		struct ufs2_inode *ufs2_inode;
+
+		/*
+		 * setup birth date, we do it here because of there is no sense
+		 * to hold it in struct ufs_inode_info, and lose 64 bit
+		 */
+		bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
+		if (!bh) {
+			ufs_warning(sb, "ufs_read_inode",
+				    "unable to read inode %lu\n",
+				    inode->i_ino);
+			err = -EIO;
+			goto fail_remove_inode;
+		}
+		lock_buffer(bh);
+		ufs2_inode = (struct ufs2_inode *)bh->b_data;
+		ufs2_inode += ufs_inotofsbo(inode->i_ino);
+		ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec);
+		ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		if (sb->s_flags & MS_SYNCHRONOUS)
+			sync_dirty_buffer(bh);
+		brelse(bh);
+	}
+	mutex_unlock(&sbi->s_lock);
+
+	UFSD("allocating inode %lu\n", inode->i_ino);
+	UFSD("EXIT\n");
+	return inode;
+
+fail_remove_inode:
+	mutex_unlock(&sbi->s_lock);
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	UFSD("EXIT (FAILED): err %d\n", err);
+	return ERR_PTR(err);
+failed:
+	mutex_unlock(&sbi->s_lock);
+	make_bad_inode(inode);
+	iput (inode);
+	UFSD("EXIT (FAILED): err %d\n", err);
+	return ERR_PTR(err);
+}
diff --git a/kernel/fs/ufs/inode.c b/kernel/fs/ufs/inode.c
new file mode 100644
index 000000000..2d93ab07d
--- /dev/null
+++ b/kernel/fs/ufs/inode.c
@@ -0,0 +1,910 @@
+/*
+ *  linux/fs/ufs/inode.c
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ *
+ *  from
+ *
+ *  linux/fs/ext2/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Goal-directed block allocation by Stephen Tweedie (sct@dcs.ed.ac.uk), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <asm/uaccess.h>
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "swab.h"
+#include "util.h"
+
+static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
+
+static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
+{
+	struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
+	int ptrs = uspi->s_apb;
+	int ptrs_bits = uspi->s_apbshift;
+	const long direct_blocks = UFS_NDADDR,
+		indirect_blocks = ptrs,
+		double_blocks = (1 << (ptrs_bits * 2));
+	int n = 0;
+
+
+	UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
+	if (i_block < direct_blocks) {
+		offsets[n++] = i_block;
+	} else if ((i_block -= direct_blocks) < indirect_blocks) {
+		offsets[n++] = UFS_IND_BLOCK;
+		offsets[n++] = i_block;
+	} else if ((i_block -= indirect_blocks) < double_blocks) {
+		offsets[n++] = UFS_DIND_BLOCK;
+		offsets[n++] = i_block >> ptrs_bits;
+		offsets[n++] = i_block & (ptrs - 1);
+	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+		offsets[n++] = UFS_TIND_BLOCK;
+		offsets[n++] = i_block >> (ptrs_bits * 2);
+		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+		offsets[n++] = i_block & (ptrs - 1);
+	} else {
+		ufs_warning(inode->i_sb, "ufs_block_to_path", "block > big");
+	}
+	return n;
+}
+
+/*
+ * Returns the location of the fragment from
+ * the beginning of the filesystem.
+ */
+
+static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift;
+	int shift = uspi->s_apbshift-uspi->s_fpbshift;
+	sector_t offsets[4], *p;
+	int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets);
+	u64  ret = 0L;
+	__fs32 block;
+	__fs64 u2_block = 0L;
+	unsigned flags = UFS_SB(sb)->s_flags;
+	u64 temp = 0L;
+
+	UFSD(": frag = %llu  depth = %d\n", (unsigned long long)frag, depth);
+	UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",
+		uspi->s_fpbshift, uspi->s_apbmask,
+		(unsigned long long)mask);
+
+	if (depth == 0)
+		return 0;
+
+	p = offsets;
+
+	if (needs_lock)
+		lock_ufs(sb);
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
+		goto ufs2;
+
+	block = ufsi->i_u1.i_data[*p++];
+	if (!block)
+		goto out;
+	while (--depth) {
+		struct buffer_head *bh;
+		sector_t n = *p++;
+
+		bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift));
+		if (!bh)
+			goto out;
+		block = ((__fs32 *) bh->b_data)[n & mask];
+		brelse (bh);
+		if (!block)
+			goto out;
+	}
+	ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask));
+	goto out;
+ufs2:
+	u2_block = ufsi->i_u1.u2_i_data[*p++];
+	if (!u2_block)
+		goto out;
+
+
+	while (--depth) {
+		struct buffer_head *bh;
+		sector_t n = *p++;
+
+
+		temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block);
+		bh = sb_bread(sb, temp +(u64) (n>>shift));
+		if (!bh)
+			goto out;
+		u2_block = ((__fs64 *)bh->b_data)[n & mask];
+		brelse(bh);
+		if (!u2_block)
+			goto out;
+	}
+	temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block);
+	ret = temp + (u64) (frag & uspi->s_fpbmask);
+
+out:
+	if (needs_lock)
+		unlock_ufs(sb);
+	return ret;
+}
+
+/**
+ * ufs_inode_getfrag() - allocate new fragment(s)
+ * @inode: pointer to inode
+ * @fragment: number of `fragment' which hold pointer
+ *   to new allocated fragment(s)
+ * @new_fragment: number of new allocated fragment(s)
+ * @required: how many fragment(s) we require
+ * @err: we set it if something wrong
+ * @phys: pointer to where we save physical number of new allocated fragments,
+ *   NULL if we allocate not data(indirect blocks for example).
+ * @new: we set it if we allocate new block
+ * @locked_page: for ufs_new_fragments()
+ */
+static struct buffer_head *
+ufs_inode_getfrag(struct inode *inode, u64 fragment,
+		  sector_t new_fragment, unsigned int required, int *err,
+		  long *phys, int *new, struct page *locked_page)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct buffer_head * result;
+	unsigned blockoff, lastblockoff;
+	u64 tmp, goal, lastfrag, block, lastblock;
+	void *p, *p2;
+
+	UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, "
+	     "metadata %d\n", inode->i_ino, (unsigned long long)fragment,
+	     (unsigned long long)new_fragment, required, !phys);
+
+        /* TODO : to be done for write support
+        if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
+             goto ufs2;
+         */
+
+	block = ufs_fragstoblks (fragment);
+	blockoff = ufs_fragnum (fragment);
+	p = ufs_get_direct_data_ptr(uspi, ufsi, block);
+
+	goal = 0;
+
+repeat:
+	tmp = ufs_data_ptr_to_cpu(sb, p);
+
+	lastfrag = ufsi->i_lastfrag;
+	if (tmp && fragment < lastfrag) {
+		if (!phys) {
+			result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
+			if (tmp == ufs_data_ptr_to_cpu(sb, p)) {
+				UFSD("EXIT, result %llu\n",
+				     (unsigned long long)tmp + blockoff);
+				return result;
+			}
+			brelse (result);
+			goto repeat;
+		} else {
+			*phys = uspi->s_sbbase + tmp + blockoff;
+			return NULL;
+		}
+	}
+
+	lastblock = ufs_fragstoblks (lastfrag);
+	lastblockoff = ufs_fragnum (lastfrag);
+	/*
+	 * We will extend file into new block beyond last allocated block
+	 */
+	if (lastblock < block) {
+		/*
+		 * We must reallocate last allocated block
+		 */
+		if (lastblockoff) {
+			p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock);
+			tmp = ufs_new_fragments(inode, p2, lastfrag,
+						ufs_data_ptr_to_cpu(sb, p2),
+						uspi->s_fpb - lastblockoff,
+						err, locked_page);
+			if (!tmp) {
+				if (lastfrag != ufsi->i_lastfrag)
+					goto repeat;
+				else
+					return NULL;
+			}
+			lastfrag = ufsi->i_lastfrag;
+			
+		}
+		tmp = ufs_data_ptr_to_cpu(sb,
+					 ufs_get_direct_data_ptr(uspi, ufsi,
+								 lastblock));
+		if (tmp)
+			goal = tmp + uspi->s_fpb;
+		tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
+					 goal, required + blockoff,
+					 err,
+					 phys != NULL ? locked_page : NULL);
+	} else if (lastblock == block) {
+	/*
+	 * We will extend last allocated block
+	 */
+		tmp = ufs_new_fragments(inode, p, fragment -
+					(blockoff - lastblockoff),
+					ufs_data_ptr_to_cpu(sb, p),
+					required +  (blockoff - lastblockoff),
+					err, phys != NULL ? locked_page : NULL);
+	} else /* (lastblock > block) */ {
+	/*
+	 * We will allocate new block before last allocated block
+	 */
+		if (block) {
+			tmp = ufs_data_ptr_to_cpu(sb,
+						 ufs_get_direct_data_ptr(uspi, ufsi, block - 1));
+			if (tmp)
+				goal = tmp + uspi->s_fpb;
+		}
+		tmp = ufs_new_fragments(inode, p, fragment - blockoff,
+					goal, uspi->s_fpb, err,
+					phys != NULL ? locked_page : NULL);
+	}
+	if (!tmp) {
+		if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) ||
+		    (blockoff && lastfrag != ufsi->i_lastfrag))
+			goto repeat;
+		*err = -ENOSPC;
+		return NULL;
+	}
+
+	if (!phys) {
+		result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
+	} else {
+		*phys = uspi->s_sbbase + tmp + blockoff;
+		result = NULL;
+		*err = 0;
+		*new = 1;
+	}
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	if (IS_SYNC(inode))
+		ufs_sync_inode (inode);
+	mark_inode_dirty(inode);
+	UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff);
+	return result;
+
+     /* This part : To be implemented ....
+        Required only for writing, not required for READ-ONLY.
+ufs2:
+
+	u2_block = ufs_fragstoblks(fragment);
+	u2_blockoff = ufs_fragnum(fragment);
+	p = ufsi->i_u1.u2_i_data + block;
+	goal = 0;
+
+repeat2:
+	tmp = fs32_to_cpu(sb, *p);
+	lastfrag = ufsi->i_lastfrag;
+
+     */
+}
+
+/**
+ * ufs_inode_getblock() - allocate new block
+ * @inode: pointer to inode
+ * @bh: pointer to block which hold "pointer" to new allocated block
+ * @fragment: number of `fragment' which hold pointer
+ *   to new allocated block
+ * @new_fragment: number of new allocated fragment
+ *  (block will hold this fragment and also uspi->s_fpb-1)
+ * @err: see ufs_inode_getfrag()
+ * @phys: see ufs_inode_getfrag()
+ * @new: see ufs_inode_getfrag()
+ * @locked_page: see ufs_inode_getfrag()
+ */
+static struct buffer_head *
+ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
+		  u64 fragment, sector_t new_fragment, int *err,
+		  long *phys, int *new, struct page *locked_page)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct buffer_head * result;
+	unsigned blockoff;
+	u64 tmp, goal, block;
+	void *p;
+
+	block = ufs_fragstoblks (fragment);
+	blockoff = ufs_fragnum (fragment);
+
+	UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n",
+	     inode->i_ino, (unsigned long long)fragment,
+	     (unsigned long long)new_fragment, !phys);
+
+	result = NULL;
+	if (!bh)
+		goto out;
+	if (!buffer_uptodate(bh)) {
+		ll_rw_block (READ, 1, &bh);
+		wait_on_buffer (bh);
+		if (!buffer_uptodate(bh))
+			goto out;
+	}
+	if (uspi->fs_magic == UFS2_MAGIC)
+		p = (__fs64 *)bh->b_data + block;
+	else
+		p = (__fs32 *)bh->b_data + block;
+repeat:
+	tmp = ufs_data_ptr_to_cpu(sb, p);
+	if (tmp) {
+		if (!phys) {
+			result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
+			if (tmp == ufs_data_ptr_to_cpu(sb, p))
+				goto out;
+			brelse (result);
+			goto repeat;
+		} else {
+			*phys = uspi->s_sbbase + tmp + blockoff;
+			goto out;
+		}
+	}
+
+	if (block && (uspi->fs_magic == UFS2_MAGIC ?
+		      (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) :
+		      (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1]))))
+		goal = tmp + uspi->s_fpb;
+	else
+		goal = bh->b_blocknr + uspi->s_fpb;
+	tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
+				uspi->s_fpb, err, locked_page);
+	if (!tmp) {
+		if (ufs_data_ptr_to_cpu(sb, p))
+			goto repeat;
+		goto out;
+	}		
+
+
+	if (!phys) {
+		result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
+	} else {
+		*phys = uspi->s_sbbase + tmp + blockoff;
+		*new = 1;
+	}
+
+	mark_buffer_dirty(bh);
+	if (IS_SYNC(inode))
+		sync_dirty_buffer(bh);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+	UFSD("result %llu\n", (unsigned long long)tmp + blockoff);
+out:
+	brelse (bh);
+	UFSD("EXIT\n");
+	return result;
+}
+
+/**
+ * ufs_getfrag_block() - `get_block_t' function, interface between UFS and
+ * readpage, writepage and so on
+ */
+
+int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
+{
+	struct super_block * sb = inode->i_sb;
+	struct ufs_sb_info * sbi = UFS_SB(sb);
+	struct ufs_sb_private_info * uspi = sbi->s_uspi;
+	struct buffer_head * bh;
+	int ret, err, new;
+	unsigned long ptr,phys;
+	u64 phys64 = 0;
+	bool needs_lock = (sbi->mutex_owner != current);
+	
+	if (!create) {
+		phys64 = ufs_frag_map(inode, fragment, needs_lock);
+		UFSD("phys64 = %llu\n", (unsigned long long)phys64);
+		if (phys64)
+			map_bh(bh_result, sb, phys64);
+		return 0;
+	}
+
+        /* This code entered only while writing ....? */
+
+	err = -EIO;
+	new = 0;
+	ret = 0;
+	bh = NULL;
+
+	if (needs_lock)
+		lock_ufs(sb);
+
+	UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
+	if (fragment >
+	    ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb)
+	     << uspi->s_fpbshift))
+		goto abort_too_big;
+
+	err = 0;
+	ptr = fragment;
+	  
+	/*
+	 * ok, these macros clean the logic up a bit and make
+	 * it much more readable:
+	 */
+#define GET_INODE_DATABLOCK(x) \
+	ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\
+			  bh_result->b_page)
+#define GET_INODE_PTR(x) \
+	ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\
+			  bh_result->b_page)
+#define GET_INDIRECT_DATABLOCK(x) \
+	ufs_inode_getblock(inode, bh, x, fragment,	\
+			  &err, &phys, &new, bh_result->b_page)
+#define GET_INDIRECT_PTR(x) \
+	ufs_inode_getblock(inode, bh, x, fragment,	\
+			  &err, NULL, NULL, NULL)
+
+	if (ptr < UFS_NDIR_FRAGMENT) {
+		bh = GET_INODE_DATABLOCK(ptr);
+		goto out;
+	}
+	ptr -= UFS_NDIR_FRAGMENT;
+	if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) {
+		bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift));
+		goto get_indirect;
+	}
+	ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift);
+	if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) {
+		bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift));
+		goto get_double;
+	}
+	ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift);
+	bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift));
+	bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask);
+get_double:
+	bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask);
+get_indirect:
+	bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask);
+
+#undef GET_INODE_DATABLOCK
+#undef GET_INODE_PTR
+#undef GET_INDIRECT_DATABLOCK
+#undef GET_INDIRECT_PTR
+
+out:
+	if (err)
+		goto abort;
+	if (new)
+		set_buffer_new(bh_result);
+	map_bh(bh_result, sb, phys);
+abort:
+	if (needs_lock)
+		unlock_ufs(sb);
+
+	return err;
+
+abort_too_big:
+	ufs_warning(sb, "ufs_get_block", "block > big");
+	goto abort;
+}
+
+static int ufs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page,ufs_getfrag_block,wbc);
+}
+
+static int ufs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page,ufs_getfrag_block);
+}
+
+int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	return __block_write_begin(page, pos, len, ufs_getfrag_block);
+}
+
+static void ufs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size)
+		truncate_pagecache(inode, inode->i_size);
+}
+
+static int ufs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				ufs_getfrag_block);
+	if (unlikely(ret))
+		ufs_write_failed(mapping, pos + len);
+
+	return ret;
+}
+
+static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping,block,ufs_getfrag_block);
+}
+
+const struct address_space_operations ufs_aops = {
+	.readpage = ufs_readpage,
+	.writepage = ufs_writepage,
+	.write_begin = ufs_write_begin,
+	.write_end = generic_write_end,
+	.bmap = ufs_bmap
+};
+
+static void ufs_set_inode_ops(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &ufs_file_inode_operations;
+		inode->i_fop = &ufs_file_operations;
+		inode->i_mapping->a_ops = &ufs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &ufs_dir_inode_operations;
+		inode->i_fop = &ufs_dir_operations;
+		inode->i_mapping->a_ops = &ufs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (!inode->i_blocks)
+			inode->i_op = &ufs_fast_symlink_inode_operations;
+		else {
+			inode->i_op = &ufs_symlink_inode_operations;
+			inode->i_mapping->a_ops = &ufs_aops;
+		}
+	} else
+		init_special_inode(inode, inode->i_mode,
+				   ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
+}
+
+static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	umode_t mode;
+
+	/*
+	 * Copy data to the in-core inode.
+	 */
+	inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
+	set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink));
+	if (inode->i_nlink == 0) {
+		ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
+		return -1;
+	}
+	
+	/*
+	 * Linux now has 32-bit uid and gid, so we can support EFT.
+	 */
+	i_uid_write(inode, ufs_get_inode_uid(sb, ufs_inode));
+	i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode));
+
+	inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
+	inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
+	inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec);
+	inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+	inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
+	inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen);
+	ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
+	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
+	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
+
+	
+	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
+		memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
+	} else {
+		memcpy(ufsi->i_u1.i_symlink, ufs_inode->ui_u2.ui_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink) - 1);
+		ufsi->i_u1.i_symlink[sizeof(ufs_inode->ui_u2.ui_symlink) - 1] = 0;
+	}
+	return 0;
+}
+
+static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	umode_t mode;
+
+	UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
+	/*
+	 * Copy data to the in-core inode.
+	 */
+	inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
+	set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink));
+	if (inode->i_nlink == 0) {
+		ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
+		return -1;
+	}
+
+        /*
+         * Linux now has 32-bit uid and gid, so we can support EFT.
+         */
+	i_uid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_uid));
+	i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid));
+
+	inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
+	inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
+	inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime);
+	inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime);
+	inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec);
+	inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec);
+	inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec);
+	inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
+	inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen);
+	ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
+	/*
+	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
+	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
+	*/
+
+	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
+		memcpy(ufsi->i_u1.u2_i_data, &ufs2_inode->ui_u2.ui_addr,
+		       sizeof(ufs2_inode->ui_u2.ui_addr));
+	} else {
+		memcpy(ufsi->i_u1.i_symlink, ufs2_inode->ui_u2.ui_symlink,
+		       sizeof(ufs2_inode->ui_u2.ui_symlink) - 1);
+		ufsi->i_u1.i_symlink[sizeof(ufs2_inode->ui_u2.ui_symlink) - 1] = 0;
+	}
+	return 0;
+}
+
+struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct ufs_inode_info *ufsi;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct buffer_head * bh;
+	struct inode *inode;
+	int err;
+
+	UFSD("ENTER, ino %lu\n", ino);
+
+	if (ino < UFS_ROOTINO || ino > (uspi->s_ncg * uspi->s_ipg)) {
+		ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n",
+			    ino);
+		return ERR_PTR(-EIO);
+	}
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	ufsi = UFS_I(inode);
+
+	bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
+	if (!bh) {
+		ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n",
+			    inode->i_ino);
+		goto bad_inode;
+	}
+	if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+		struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
+
+		err = ufs2_read_inode(inode,
+				      ufs2_inode + ufs_inotofsbo(inode->i_ino));
+	} else {
+		struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
+
+		err = ufs1_read_inode(inode,
+				      ufs_inode + ufs_inotofsbo(inode->i_ino));
+	}
+
+	if (err)
+		goto bad_inode;
+	inode->i_version++;
+	ufsi->i_lastfrag =
+		(inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
+	ufsi->i_dir_start_lookup = 0;
+	ufsi->i_osync = 0;
+
+	ufs_set_inode_ops(inode);
+
+	brelse(bh);
+
+	UFSD("EXIT\n");
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	iget_failed(inode);
+	return ERR_PTR(-EIO);
+}
+
+static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
+{
+	struct super_block *sb = inode->i_sb;
+ 	struct ufs_inode_info *ufsi = UFS_I(inode);
+
+	ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
+	ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink);
+
+	ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode));
+	ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode));
+		
+	ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
+	ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
+	ufs_inode->ui_atime.tv_usec = 0;
+	ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec);
+	ufs_inode->ui_ctime.tv_usec = 0;
+	ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec);
+	ufs_inode->ui_mtime.tv_usec = 0;
+	ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks);
+	ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
+	ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation);
+
+	if ((UFS_SB(sb)->s_flags & UFS_UID_MASK) == UFS_UID_EFT) {
+		ufs_inode->ui_u3.ui_sun.ui_shadow = cpu_to_fs32(sb, ufsi->i_shadow);
+		ufs_inode->ui_u3.ui_sun.ui_oeftflag = cpu_to_fs32(sb, ufsi->i_oeftflag);
+	}
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		/* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */
+		ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.i_data[0];
+	} else if (inode->i_blocks) {
+		memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.i_data,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
+	}
+	else {
+		memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink));
+	}
+
+	if (!inode->i_nlink)
+		memset (ufs_inode, 0, sizeof(struct ufs_inode));
+}
+
+static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
+{
+	struct super_block *sb = inode->i_sb;
+ 	struct ufs_inode_info *ufsi = UFS_I(inode);
+
+	UFSD("ENTER\n");
+	ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
+	ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink);
+
+	ufs_inode->ui_uid = cpu_to_fs32(sb, i_uid_read(inode));
+	ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode));
+
+	ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
+	ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
+	ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec);
+	ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec);
+	ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec);
+	ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec);
+	ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec);
+
+	ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks);
+	ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
+	ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		/* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */
+		ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0];
+	} else if (inode->i_blocks) {
+		memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.u2_i_data,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
+	} else {
+		memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink));
+ 	}
+
+	if (!inode->i_nlink)
+		memset (ufs_inode, 0, sizeof(struct ufs2_inode));
+	UFSD("EXIT\n");
+}
+
+static int ufs_update_inode(struct inode * inode, int do_sync)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct buffer_head * bh;
+
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
+
+	if (inode->i_ino < UFS_ROOTINO ||
+	    inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
+		ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino);
+		return -1;
+	}
+
+	bh = sb_bread(sb, ufs_inotofsba(inode->i_ino));
+	if (!bh) {
+		ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino);
+		return -1;
+	}
+	if (uspi->fs_magic == UFS2_MAGIC) {
+		struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
+
+		ufs2_update_inode(inode,
+				  ufs2_inode + ufs_inotofsbo(inode->i_ino));
+	} else {
+		struct ufs_inode *ufs_inode = (struct ufs_inode *) bh->b_data;
+
+		ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
+	}
+		
+	mark_buffer_dirty(bh);
+	if (do_sync)
+		sync_dirty_buffer(bh);
+	brelse (bh);
+	
+	UFSD("EXIT\n");
+	return 0;
+}
+
+int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret;
+	lock_ufs(inode->i_sb);
+	ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+	unlock_ufs(inode->i_sb);
+	return ret;
+}
+
+int ufs_sync_inode (struct inode *inode)
+{
+	return ufs_update_inode (inode, 1);
+}
+
+void ufs_evict_inode(struct inode * inode)
+{
+	int want_delete = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode))
+		want_delete = 1;
+
+	truncate_inode_pages_final(&inode->i_data);
+	if (want_delete) {
+		loff_t old_i_size;
+		/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
+		lock_ufs(inode->i_sb);
+		mark_inode_dirty(inode);
+		ufs_update_inode(inode, IS_SYNC(inode));
+		old_i_size = inode->i_size;
+		inode->i_size = 0;
+		if (inode->i_blocks && ufs_truncate(inode, old_i_size))
+			ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
+		unlock_ufs(inode->i_sb);
+	}
+
+	invalidate_inode_buffers(inode);
+	clear_inode(inode);
+
+	if (want_delete) {
+		lock_ufs(inode->i_sb);
+		ufs_free_inode(inode);
+		unlock_ufs(inode->i_sb);
+	}
+}
diff --git a/kernel/fs/ufs/namei.c b/kernel/fs/ufs/namei.c
new file mode 100644
index 000000000..60ee32249
--- /dev/null
+++ b/kernel/fs/ufs/namei.c
@@ -0,0 +1,356 @@
+/*
+ * linux/fs/ufs/namei.c
+ *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ *
+ *  from
+ *
+ *  linux/fs/ext2/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "util.h"
+
+static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = ufs_add_link(dentry, inode);
+	if (!err) {
+		unlock_new_inode(inode);
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
+{
+	struct inode * inode = NULL;
+	ino_t ino;
+	
+	if (dentry->d_name.len > UFS_MAXNAMLEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	lock_ufs(dir->i_sb);
+	ino = ufs_inode_by_name(dir, &dentry->d_name);
+	if (ino)
+		inode = ufs_iget(dir->i_sb, ino);
+	unlock_ufs(dir->i_sb);
+	return d_splice_alias(inode, dentry);
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate(). 
+ */
+static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
+		bool excl)
+{
+	struct inode *inode;
+	int err;
+
+	UFSD("BEGIN\n");
+
+	inode = ufs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ufs_file_inode_operations;
+		inode->i_fop = &ufs_file_operations;
+		inode->i_mapping->a_ops = &ufs_aops;
+		mark_inode_dirty(inode);
+		lock_ufs(dir->i_sb);
+		err = ufs_add_nondir(dentry, inode);
+		unlock_ufs(dir->i_sb);
+	}
+	UFSD("END: err=%d\n", err);
+	return err;
+}
+
+static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	struct inode *inode;
+	int err;
+
+	if (!old_valid_dev(rdev))
+		return -EINVAL;
+
+	inode = ufs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, mode, rdev);
+		ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
+		mark_inode_dirty(inode);
+		lock_ufs(dir->i_sb);
+		err = ufs_add_nondir(dentry, inode);
+		unlock_ufs(dir->i_sb);
+	}
+	return err;
+}
+
+static int ufs_symlink (struct inode * dir, struct dentry * dentry,
+	const char * symname)
+{
+	struct super_block * sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned l = strlen(symname)+1;
+	struct inode * inode;
+
+	if (l > sb->s_blocksize)
+		goto out_notlocked;
+
+	lock_ufs(dir->i_sb);
+	inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
+		/* slow symlink */
+		inode->i_op = &ufs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &ufs_aops;
+		err = page_symlink(inode, symname, l);
+		if (err)
+			goto out_fail;
+	} else {
+		/* fast symlink */
+		inode->i_op = &ufs_fast_symlink_inode_operations;
+		memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l);
+		inode->i_size = l-1;
+	}
+	mark_inode_dirty(inode);
+
+	err = ufs_add_nondir(dentry, inode);
+out:
+	unlock_ufs(dir->i_sb);
+out_notlocked:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	goto out;
+}
+
+static int ufs_link (struct dentry * old_dentry, struct inode * dir,
+	struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+	int error;
+
+	lock_ufs(dir->i_sb);
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	inode_inc_link_count(inode);
+	ihold(inode);
+
+	error = ufs_add_link(dentry, inode);
+	if (error) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	} else
+		d_instantiate(dentry, inode);
+	unlock_ufs(dir->i_sb);
+	return error;
+}
+
+static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+{
+	struct inode * inode;
+	int err;
+
+	lock_ufs(dir->i_sb);
+	inode_inc_link_count(dir);
+
+	inode = ufs_new_inode(dir, S_IFDIR|mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	inode->i_op = &ufs_dir_inode_operations;
+	inode->i_fop = &ufs_dir_operations;
+	inode->i_mapping->a_ops = &ufs_aops;
+
+	inode_inc_link_count(inode);
+
+	err = ufs_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = ufs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+	unlock_ufs(dir->i_sb);
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
+	iput (inode);
+out_dir:
+	inode_dec_link_count(dir);
+	unlock_ufs(dir->i_sb);
+	goto out;
+}
+
+static int ufs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode * inode = d_inode(dentry);
+	struct ufs_dir_entry *de;
+	struct page *page;
+	int err = -ENOENT;
+
+	de = ufs_find_entry(dir, &dentry->d_name, &page);
+	if (!de)
+		goto out;
+
+	err = ufs_delete_entry(dir, de, page);
+	if (err)
+		goto out;
+
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+	err = 0;
+out:
+	return err;
+}
+
+static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
+{
+	struct inode * inode = d_inode(dentry);
+	int err= -ENOTEMPTY;
+
+	lock_ufs(dir->i_sb);
+	if (ufs_empty_dir (inode)) {
+		err = ufs_unlink(dir, dentry);
+		if (!err) {
+			inode->i_size = 0;
+			inode_dec_link_count(inode);
+			inode_dec_link_count(dir);
+		}
+	}
+	unlock_ufs(dir->i_sb);
+	return err;
+}
+
+static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = d_inode(old_dentry);
+	struct inode *new_inode = d_inode(new_dentry);
+	struct page *dir_page = NULL;
+	struct ufs_dir_entry * dir_de = NULL;
+	struct page *old_page;
+	struct ufs_dir_entry *old_de;
+	int err = -ENOENT;
+
+	old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = ufs_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page *new_page;
+		struct ufs_dir_entry *new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !ufs_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
+		if (!new_de)
+			goto out_dir;
+		ufs_set_link(new_dir, new_de, new_page, old_inode);
+		new_inode->i_ctime = CURRENT_TIME_SEC;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+	} else {
+		err = ufs_add_link(new_dentry, old_inode);
+		if (err)
+			goto out_dir;
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+ 	 * rename.
+	 */
+	old_inode->i_ctime = CURRENT_TIME_SEC;
+
+	ufs_delete_entry(old_dir, old_de, old_page);
+	mark_inode_dirty(old_inode);
+
+	if (dir_de) {
+		ufs_set_link(old_inode, dir_de, dir_page, new_dir);
+		inode_dec_link_count(old_dir);
+	}
+	return 0;
+
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	return err;
+}
+
+const struct inode_operations ufs_dir_inode_operations = {
+	.create		= ufs_create,
+	.lookup		= ufs_lookup,
+	.link		= ufs_link,
+	.unlink		= ufs_unlink,
+	.symlink	= ufs_symlink,
+	.mkdir		= ufs_mkdir,
+	.rmdir		= ufs_rmdir,
+	.mknod		= ufs_mknod,
+	.rename		= ufs_rename,
+};
diff --git a/kernel/fs/ufs/super.c b/kernel/fs/ufs/super.c
new file mode 100644
index 000000000..dc33f9416
--- /dev/null
+++ b/kernel/fs/ufs/super.c
@@ -0,0 +1,1524 @@
+/*
+ *  linux/fs/ufs/super.c
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ */
+
+/* Derived from
+ *
+ *  linux/fs/ext2/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+ 
+/*
+ * Inspired by
+ *
+ *  linux/fs/ufs/super.c
+ *
+ * Copyright (C) 1996
+ * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu)
+ * Laboratory for Computer Science Research Computing Facility
+ * Rutgers, The State University of New Jersey
+ *
+ * Copyright (C) 1996  Eddie C. Dost  (ecd@skynet.be)
+ *
+ * Kernel module support added on 96/04/26 by
+ * Stefan Reinauer <stepan@home.culture.mipt.ru>
+ *
+ * Module usage counts added on 96/04/29 by
+ * Gertjan van Wingerde <gwingerde@gmail.com>
+ *
+ * Clean swab support on 19970406 by
+ * Francois-Rene Rideau <fare@tunes.org>
+ *
+ * 4.4BSD (FreeBSD) support added on February 1st 1998 by
+ * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
+ * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
+ *
+ * NeXTstep support added on February 5th 1998 by
+ * Niels Kristian Bech Jensen <nkbj@image.dk>.
+ *
+ * write support Daniel Pirkl <daniel.pirkl@email.cz> 1998
+ * 
+ * HP/UX hfs filesystem support added by
+ * Martin K. Petersen <mkp@mkp.net>, August 1999
+ *
+ * UFS2 (of FreeBSD 5.x) support added by
+ * Niraj Kumar <niraj17@iitbombay.org>, Jan 2004
+ *
+ * UFS2 write support added by
+ * Evgeniy Dushistov <dushistov@mail.ru>, 2007
+ */
+
+#include <linux/exportfs.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+#include <stdarg.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/parser.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/log2.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "swab.h"
+#include "util.h"
+
+void lock_ufs(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+
+	mutex_lock(&sbi->mutex);
+	sbi->mutex_owner = current;
+}
+
+void unlock_ufs(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+
+	sbi->mutex_owner = NULL;
+	mutex_unlock(&sbi->mutex);
+}
+
+static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
+{
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct inode *inode;
+
+	if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg)
+		return ERR_PTR(-ESTALE);
+
+	inode = ufs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+				       int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
+}
+
+static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid,
+				       int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
+}
+
+static struct dentry *ufs_get_parent(struct dentry *child)
+{
+	struct qstr dot_dot = QSTR_INIT("..", 2);
+	ino_t ino;
+
+	ino = ufs_inode_by_name(d_inode(child), &dot_dot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+	return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino));
+}
+
+static const struct export_operations ufs_export_ops = {
+	.fh_to_dentry	= ufs_fh_to_dentry,
+	.fh_to_parent	= ufs_fh_to_parent,
+	.get_parent	= ufs_get_parent,
+};
+
+#ifdef CONFIG_UFS_DEBUG
+/*
+ * Print contents of ufs_super_block, useful for debugging
+ */
+static void ufs_print_super_stuff(struct super_block *sb,
+				  struct ufs_super_block_first *usb1,
+				  struct ufs_super_block_second *usb2,
+				  struct ufs_super_block_third *usb3)
+{
+	u32 magic = fs32_to_cpu(sb, usb3->fs_magic);
+
+	pr_debug("ufs_print_super_stuff\n");
+	pr_debug("  magic:     0x%x\n", magic);
+	if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) {
+		pr_debug("  fs_size:   %llu\n", (unsigned long long)
+			 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
+		pr_debug("  fs_dsize:  %llu\n", (unsigned long long)
+			 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
+		pr_debug("  bsize:         %u\n",
+			 fs32_to_cpu(sb, usb1->fs_bsize));
+		pr_debug("  fsize:         %u\n",
+			 fs32_to_cpu(sb, usb1->fs_fsize));
+		pr_debug("  fs_volname:  %s\n", usb2->fs_un.fs_u2.fs_volname);
+		pr_debug("  fs_sblockloc: %llu\n", (unsigned long long)
+			 fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
+		pr_debug("  cs_ndir(No of dirs):  %llu\n", (unsigned long long)
+			 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
+		pr_debug("  cs_nbfree(No of free blocks):  %llu\n",
+			 (unsigned long long)
+			 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
+		pr_info("  cs_nifree(Num of free inodes): %llu\n",
+			(unsigned long long)
+			fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree));
+		pr_info("  cs_nffree(Num of free frags): %llu\n",
+			(unsigned long long)
+			fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree));
+		pr_info("  fs_maxsymlinklen: %u\n",
+			fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen));
+	} else {
+		pr_debug(" sblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
+		pr_debug(" cblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
+		pr_debug(" iblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
+		pr_debug(" dblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
+		pr_debug(" cgoffset:    %u\n",
+			 fs32_to_cpu(sb, usb1->fs_cgoffset));
+		pr_debug(" ~cgmask:     0x%x\n",
+			 ~fs32_to_cpu(sb, usb1->fs_cgmask));
+		pr_debug(" size:        %u\n", fs32_to_cpu(sb, usb1->fs_size));
+		pr_debug(" dsize:       %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
+		pr_debug(" ncg:         %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
+		pr_debug(" bsize:       %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
+		pr_debug(" fsize:       %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
+		pr_debug(" frag:        %u\n", fs32_to_cpu(sb, usb1->fs_frag));
+		pr_debug(" fragshift:   %u\n",
+			 fs32_to_cpu(sb, usb1->fs_fragshift));
+		pr_debug(" ~fmask:      %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
+		pr_debug(" fshift:      %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
+		pr_debug(" sbsize:      %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
+		pr_debug(" spc:         %u\n", fs32_to_cpu(sb, usb1->fs_spc));
+		pr_debug(" cpg:         %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
+		pr_debug(" ipg:         %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
+		pr_debug(" fpg:         %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
+		pr_debug(" csaddr:      %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
+		pr_debug(" cssize:      %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
+		pr_debug(" cgsize:      %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
+		pr_debug(" fstodb:      %u\n",
+			 fs32_to_cpu(sb, usb1->fs_fsbtodb));
+		pr_debug(" nrpos:       %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
+		pr_debug(" ndir         %u\n",
+			 fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
+		pr_debug(" nifree       %u\n",
+			 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
+		pr_debug(" nbfree       %u\n",
+			 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
+		pr_debug(" nffree       %u\n",
+			 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
+	}
+	pr_debug("\n");
+}
+
+/*
+ * Print contents of ufs_cylinder_group, useful for debugging
+ */
+static void ufs_print_cylinder_stuff(struct super_block *sb,
+				     struct ufs_cylinder_group *cg)
+{
+	pr_debug("\nufs_print_cylinder_stuff\n");
+	pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
+	pr_debug("  magic:        %x\n", fs32_to_cpu(sb, cg->cg_magic));
+	pr_debug("  time:         %u\n", fs32_to_cpu(sb, cg->cg_time));
+	pr_debug("  cgx:          %u\n", fs32_to_cpu(sb, cg->cg_cgx));
+	pr_debug("  ncyl:         %u\n", fs16_to_cpu(sb, cg->cg_ncyl));
+	pr_debug("  niblk:        %u\n", fs16_to_cpu(sb, cg->cg_niblk));
+	pr_debug("  ndblk:        %u\n", fs32_to_cpu(sb, cg->cg_ndblk));
+	pr_debug("  cs_ndir:      %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir));
+	pr_debug("  cs_nbfree:    %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree));
+	pr_debug("  cs_nifree:    %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree));
+	pr_debug("  cs_nffree:    %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree));
+	pr_debug("  rotor:        %u\n", fs32_to_cpu(sb, cg->cg_rotor));
+	pr_debug("  frotor:       %u\n", fs32_to_cpu(sb, cg->cg_frotor));
+	pr_debug("  irotor:       %u\n", fs32_to_cpu(sb, cg->cg_irotor));
+	pr_debug("  frsum:        %u, %u, %u, %u, %u, %u, %u, %u\n",
+	    fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]),
+	    fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]),
+	    fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]),
+	    fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7]));
+	pr_debug("  btotoff:      %u\n", fs32_to_cpu(sb, cg->cg_btotoff));
+	pr_debug("  boff:         %u\n", fs32_to_cpu(sb, cg->cg_boff));
+	pr_debug("  iuseoff:      %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
+	pr_debug("  freeoff:      %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
+	pr_debug("  nextfreeoff:  %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
+	pr_debug("  clustersumoff %u\n",
+		 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
+	pr_debug("  clusteroff    %u\n",
+		 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
+	pr_debug("  nclusterblks  %u\n",
+		 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
+	pr_debug("\n");
+}
+#else
+#  define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/
+#  define ufs_print_cylinder_stuff(sb, cg) /**/
+#endif /* CONFIG_UFS_DEBUG */
+
+static const struct super_operations ufs_super_ops;
+
+void ufs_error (struct super_block * sb, const char * function,
+	const char * fmt, ...)
+{
+	struct ufs_sb_private_info * uspi;
+	struct ufs_super_block_first * usb1;
+	struct va_format vaf;
+	va_list args;
+
+	uspi = UFS_SB(sb)->s_uspi;
+	usb1 = ubh_get_usb_first(uspi);
+	
+	if (!(sb->s_flags & MS_RDONLY)) {
+		usb1->fs_clean = UFS_FSBAD;
+		ubh_mark_buffer_dirty(USPI_UBH(uspi));
+		ufs_mark_sb_dirty(sb);
+		sb->s_flags |= MS_RDONLY;
+	}
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) {
+	case UFS_MOUNT_ONERROR_PANIC:
+		panic("panic (device %s): %s: %pV\n",
+		      sb->s_id, function, &vaf);
+
+	case UFS_MOUNT_ONERROR_LOCK:
+	case UFS_MOUNT_ONERROR_UMOUNT:
+	case UFS_MOUNT_ONERROR_REPAIR:
+		pr_crit("error (device %s): %s: %pV\n",
+			sb->s_id, function, &vaf);
+	}
+	va_end(args);
+}
+
+void ufs_panic (struct super_block * sb, const char * function,
+	const char * fmt, ...)
+{
+	struct ufs_sb_private_info * uspi;
+	struct ufs_super_block_first * usb1;
+	struct va_format vaf;
+	va_list args;
+	
+	uspi = UFS_SB(sb)->s_uspi;
+	usb1 = ubh_get_usb_first(uspi);
+	
+	if (!(sb->s_flags & MS_RDONLY)) {
+		usb1->fs_clean = UFS_FSBAD;
+		ubh_mark_buffer_dirty(USPI_UBH(uspi));
+		ufs_mark_sb_dirty(sb);
+	}
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	sb->s_flags |= MS_RDONLY;
+	pr_crit("panic (device %s): %s: %pV\n",
+		sb->s_id, function, &vaf);
+	va_end(args);
+}
+
+void ufs_warning (struct super_block * sb, const char * function,
+	const char * fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_warn("(device %s): %s: %pV\n",
+		sb->s_id, function, &vaf);
+	va_end(args);
+}
+
+enum {
+       Opt_type_old = UFS_MOUNT_UFSTYPE_OLD,
+       Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86,
+       Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN,
+       Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS,
+       Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD,
+       Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2,
+       Opt_type_hp = UFS_MOUNT_UFSTYPE_HP,
+       Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD,
+       Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP,
+       Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP,
+       Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC,
+       Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK,
+       Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT,
+       Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR,
+       Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_type_old, "ufstype=old"},
+	{Opt_type_sunx86, "ufstype=sunx86"},
+	{Opt_type_sun, "ufstype=sun"},
+	{Opt_type_sunos, "ufstype=sunos"},
+	{Opt_type_44bsd, "ufstype=44bsd"},
+	{Opt_type_ufs2, "ufstype=ufs2"},
+	{Opt_type_ufs2, "ufstype=5xbsd"},
+	{Opt_type_hp, "ufstype=hp"},
+	{Opt_type_nextstepcd, "ufstype=nextstep-cd"},
+	{Opt_type_nextstep, "ufstype=nextstep"},
+	{Opt_type_openstep, "ufstype=openstep"},
+/*end of possible ufs types */
+	{Opt_onerror_panic, "onerror=panic"},
+	{Opt_onerror_lock, "onerror=lock"},
+	{Opt_onerror_umount, "onerror=umount"},
+	{Opt_onerror_repair, "onerror=repair"},
+	{Opt_err, NULL}
+};
+
+static int ufs_parse_options (char * options, unsigned * mount_options)
+{
+	char * p;
+	
+	UFSD("ENTER\n");
+	
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_type_old:
+			ufs_clear_opt (*mount_options, UFSTYPE);
+			ufs_set_opt (*mount_options, UFSTYPE_OLD);
+			break;
+		case Opt_type_sunx86:
+			ufs_clear_opt (*mount_options, UFSTYPE);
+			ufs_set_opt (*mount_options, UFSTYPE_SUNx86);
+			break;
+		case Opt_type_sun:
+			ufs_clear_opt (*mount_options, UFSTYPE);
+			ufs_set_opt (*mount_options, UFSTYPE_SUN);
+			break;
+		case Opt_type_sunos:
+			ufs_clear_opt(*mount_options, UFSTYPE);
+			ufs_set_opt(*mount_options, UFSTYPE_SUNOS);
+			break;
+		case Opt_type_44bsd:
+			ufs_clear_opt (*mount_options, UFSTYPE);
+			ufs_set_opt (*mount_options, UFSTYPE_44BSD);
+			break;
+		case Opt_type_ufs2:
+			ufs_clear_opt(*mount_options, UFSTYPE);
+			ufs_set_opt(*mount_options, UFSTYPE_UFS2);
+			break;
+		case Opt_type_hp:
+			ufs_clear_opt (*mount_options, UFSTYPE);
+			ufs_set_opt (*mount_options, UFSTYPE_HP);
+			break;
+		case Opt_type_nextstepcd:
+			ufs_clear_opt (*mount_options, UFSTYPE);
+			ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP_CD);
+			break;
+		case Opt_type_nextstep:
+			ufs_clear_opt (*mount_options, UFSTYPE);
+			ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP);
+			break;
+		case Opt_type_openstep:
+			ufs_clear_opt (*mount_options, UFSTYPE);
+			ufs_set_opt (*mount_options, UFSTYPE_OPENSTEP);
+			break;
+		case Opt_onerror_panic:
+			ufs_clear_opt (*mount_options, ONERROR);
+			ufs_set_opt (*mount_options, ONERROR_PANIC);
+			break;
+		case Opt_onerror_lock:
+			ufs_clear_opt (*mount_options, ONERROR);
+			ufs_set_opt (*mount_options, ONERROR_LOCK);
+			break;
+		case Opt_onerror_umount:
+			ufs_clear_opt (*mount_options, ONERROR);
+			ufs_set_opt (*mount_options, ONERROR_UMOUNT);
+			break;
+		case Opt_onerror_repair:
+			pr_err("Unable to do repair on error, will lock lock instead\n");
+			ufs_clear_opt (*mount_options, ONERROR);
+			ufs_set_opt (*mount_options, ONERROR_REPAIR);
+			break;
+		default:
+			pr_err("Invalid option: \"%s\" or missing value\n", p);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Different types of UFS hold fs_cstotal in different
+ * places, and use different data structure for it.
+ * To make things simpler we just copy fs_cstotal to ufs_sb_private_info
+ */
+static void ufs_setup_cstotal(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
+	unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
+
+	UFSD("ENTER, mtype=%u\n", mtype);
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+	     (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+	    mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+		/*we have statistic in different place, then usual*/
+		uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
+		uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree);
+		uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
+		uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
+	} else {
+		uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir);
+		uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree);
+		uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
+		uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
+	}
+	UFSD("EXIT\n");
+}
+
+/*
+ * Read on-disk structures associated with cylinder groups
+ */
+static int ufs_read_cylinder_structures(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	struct ufs_buffer_head * ubh;
+	unsigned char * base, * space;
+	unsigned size, blks, i;
+
+	UFSD("ENTER\n");
+
+	/*
+	 * Read cs structures from (usually) first data block
+	 * on the device. 
+	 */
+	size = uspi->s_cssize;
+	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
+	base = space = kmalloc(size, GFP_NOFS);
+	if (!base)
+		goto failed; 
+	sbi->s_csp = (struct ufs_csum *)space;
+	for (i = 0; i < blks; i += uspi->s_fpb) {
+		size = uspi->s_bsize;
+		if (i + uspi->s_fpb > blks)
+			size = (blks - i) * uspi->s_fsize;
+
+		ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
+		
+		if (!ubh)
+			goto failed;
+
+		ubh_ubhcpymem (space, ubh, size);
+
+		space += size;
+		ubh_brelse (ubh);
+		ubh = NULL;
+	}
+
+	/*
+	 * Read cylinder group (we read only first fragment from block
+	 * at this time) and prepare internal data structures for cg caching.
+	 */
+	if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS)))
+		goto failed;
+	for (i = 0; i < uspi->s_ncg; i++) 
+		sbi->s_ucg[i] = NULL;
+	for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
+		sbi->s_ucpi[i] = NULL;
+		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
+	}
+	for (i = 0; i < uspi->s_ncg; i++) {
+		UFSD("read cg %u\n", i);
+		if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i))))
+			goto failed;
+		if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
+			goto failed;
+
+		ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
+	}
+	for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
+		if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS)))
+			goto failed;
+		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
+	}
+	sbi->s_cg_loaded = 0;
+	UFSD("EXIT\n");
+	return 1;
+
+failed:
+	kfree (base);
+	if (sbi->s_ucg) {
+		for (i = 0; i < uspi->s_ncg; i++)
+			if (sbi->s_ucg[i])
+				brelse (sbi->s_ucg[i]);
+		kfree (sbi->s_ucg);
+		for (i = 0; i < UFS_MAX_GROUP_LOADED; i++)
+			kfree (sbi->s_ucpi[i]);
+	}
+	UFSD("EXIT (FAILED)\n");
+	return 0;
+}
+
+/*
+ * Sync our internal copy of fs_cstotal with disk
+ */
+static void ufs_put_cstotal(struct super_block *sb)
+{
+	unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
+
+	UFSD("ENTER\n");
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+	     (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+	    mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+		/*we have statistic in different place, then usual*/
+		usb2->fs_un.fs_u2.cs_ndir =
+			cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
+		usb2->fs_un.fs_u2.cs_nbfree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
+		usb3->fs_un1.fs_u2.cs_nifree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
+		usb3->fs_un1.fs_u2.cs_nffree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
+	} else {
+		usb1->fs_cstotal.cs_ndir =
+			cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
+		usb1->fs_cstotal.cs_nbfree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
+		usb1->fs_cstotal.cs_nifree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
+		usb1->fs_cstotal.cs_nffree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+	}
+	ubh_mark_buffer_dirty(USPI_UBH(uspi));
+	ufs_print_super_stuff(sb, usb1, usb2, usb3);
+	UFSD("EXIT\n");
+}
+
+/**
+ * ufs_put_super_internal() - put on-disk intrenal structures
+ * @sb: pointer to super_block structure
+ * Put on-disk structures associated with cylinder groups
+ * and write them back to disk, also update cs_total on disk
+ */
+static void ufs_put_super_internal(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	struct ufs_buffer_head * ubh;
+	unsigned char * base, * space;
+	unsigned blks, size, i;
+
+	
+	UFSD("ENTER\n");
+
+	ufs_put_cstotal(sb);
+	size = uspi->s_cssize;
+	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
+	base = space = (char*) sbi->s_csp;
+	for (i = 0; i < blks; i += uspi->s_fpb) {
+		size = uspi->s_bsize;
+		if (i + uspi->s_fpb > blks)
+			size = (blks - i) * uspi->s_fsize;
+
+		ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
+
+		ubh_memcpyubh (ubh, space, size);
+		space += size;
+		ubh_mark_buffer_uptodate (ubh, 1);
+		ubh_mark_buffer_dirty (ubh);
+		ubh_brelse (ubh);
+	}
+	for (i = 0; i < sbi->s_cg_loaded; i++) {
+		ufs_put_cylinder (sb, i);
+		kfree (sbi->s_ucpi[i]);
+	}
+	for (; i < UFS_MAX_GROUP_LOADED; i++) 
+		kfree (sbi->s_ucpi[i]);
+	for (i = 0; i < uspi->s_ncg; i++) 
+		brelse (sbi->s_ucg[i]);
+	kfree (sbi->s_ucg);
+	kfree (base);
+
+	UFSD("EXIT\n");
+}
+
+static int ufs_sync_fs(struct super_block *sb, int wait)
+{
+	struct ufs_sb_private_info * uspi;
+	struct ufs_super_block_first * usb1;
+	struct ufs_super_block_third * usb3;
+	unsigned flags;
+
+	lock_ufs(sb);
+	mutex_lock(&UFS_SB(sb)->s_lock);
+
+	UFSD("ENTER\n");
+
+	flags = UFS_SB(sb)->s_flags;
+	uspi = UFS_SB(sb)->s_uspi;
+	usb1 = ubh_get_usb_first(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+	if ((flags & UFS_ST_MASK) == UFS_ST_SUN  ||
+	    (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
+	    (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
+		ufs_set_fs_state(sb, usb1, usb3,
+				UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
+	ufs_put_cstotal(sb);
+
+	UFSD("EXIT\n");
+	mutex_unlock(&UFS_SB(sb)->s_lock);
+	unlock_ufs(sb);
+
+	return 0;
+}
+
+static void delayed_sync_fs(struct work_struct *work)
+{
+	struct ufs_sb_info *sbi;
+
+	sbi = container_of(work, struct ufs_sb_info, sync_work.work);
+
+	spin_lock(&sbi->work_lock);
+	sbi->work_queued = 0;
+	spin_unlock(&sbi->work_lock);
+
+	ufs_sync_fs(sbi->sb, 1);
+}
+
+void ufs_mark_sb_dirty(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	unsigned long delay;
+
+	spin_lock(&sbi->work_lock);
+	if (!sbi->work_queued) {
+		delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+		queue_delayed_work(system_long_wq, &sbi->sync_work, delay);
+		sbi->work_queued = 1;
+	}
+	spin_unlock(&sbi->work_lock);
+}
+
+static void ufs_put_super(struct super_block *sb)
+{
+	struct ufs_sb_info * sbi = UFS_SB(sb);
+
+	UFSD("ENTER\n");
+
+	if (!(sb->s_flags & MS_RDONLY))
+		ufs_put_super_internal(sb);
+	cancel_delayed_work_sync(&sbi->sync_work);
+
+	ubh_brelse_uspi (sbi->s_uspi);
+	kfree (sbi->s_uspi);
+	mutex_destroy(&sbi->mutex);
+	kfree (sbi);
+	sb->s_fs_info = NULL;
+	UFSD("EXIT\n");
+	return;
+}
+
+static int ufs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct ufs_sb_info * sbi;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_super_block_first * usb1;
+	struct ufs_super_block_second * usb2;
+	struct ufs_super_block_third * usb3;
+	struct ufs_buffer_head * ubh;	
+	struct inode *inode;
+	unsigned block_size, super_block_size;
+	unsigned flags;
+	unsigned super_block_offset;
+	unsigned maxsymlen;
+	int ret = -EINVAL;
+
+	uspi = NULL;
+	ubh = NULL;
+	flags = 0;
+	
+	UFSD("ENTER\n");
+
+#ifndef CONFIG_UFS_FS_WRITE
+	if (!(sb->s_flags & MS_RDONLY)) {
+		pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
+		return -EROFS;
+	}
+#endif
+		
+	sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		goto failed_nomem;
+	sb->s_fs_info = sbi;
+	sbi->sb = sb;
+
+	UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
+	
+	mutex_init(&sbi->mutex);
+	mutex_init(&sbi->s_lock);
+	spin_lock_init(&sbi->work_lock);
+	INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
+	/*
+	 * Set default mount options
+	 * Parse mount options
+	 */
+	sbi->s_mount_opt = 0;
+	ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK);
+	if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) {
+		pr_err("wrong mount options\n");
+		goto failed;
+	}
+	if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) {
+		if (!silent)
+			pr_err("You didn't specify the type of your ufs filesystem\n\n"
+			"mount -t ufs -o ufstype="
+			"sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n"
+			">>>WARNING<<< Wrong ufstype may corrupt your filesystem, "
+			"default is ufstype=old\n");
+		ufs_set_opt (sbi->s_mount_opt, UFSTYPE_OLD);
+	}
+
+	uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL);
+	sbi->s_uspi = uspi;
+	if (!uspi)
+		goto failed;
+	uspi->s_dirblksize = UFS_SECTOR_SIZE;
+	super_block_offset=UFS_SBLOCK;
+
+	/* Keep 2Gig file limit. Some UFS variants need to override 
+	   this but as I don't know which I'll let those in the know loosen
+	   the rules */
+	switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
+	case UFS_MOUNT_UFSTYPE_44BSD:
+		UFSD("ufstype=44bsd\n");
+		uspi->s_fsize = block_size = 512;
+		uspi->s_fmask = ~(512 - 1);
+		uspi->s_fshift = 9;
+		uspi->s_sbsize = super_block_size = 1536;
+		uspi->s_sbbase = 0;
+		flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
+		break;
+	case UFS_MOUNT_UFSTYPE_UFS2:
+		UFSD("ufstype=ufs2\n");
+		super_block_offset=SBLOCK_UFS2;
+		uspi->s_fsize = block_size = 512;
+		uspi->s_fmask = ~(512 - 1);
+		uspi->s_fshift = 9;
+		uspi->s_sbsize = super_block_size = 1536;
+		uspi->s_sbbase =  0;
+		flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
+		break;
+		
+	case UFS_MOUNT_UFSTYPE_SUN:
+		UFSD("ufstype=sun\n");
+		uspi->s_fsize = block_size = 1024;
+		uspi->s_fmask = ~(1024 - 1);
+		uspi->s_fshift = 10;
+		uspi->s_sbsize = super_block_size = 2048;
+		uspi->s_sbbase = 0;
+		uspi->s_maxsymlinklen = 0; /* Not supported on disk */
+		flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUN | UFS_CG_SUN;
+		break;
+
+	case UFS_MOUNT_UFSTYPE_SUNOS:
+		UFSD("ufstype=sunos\n");
+		uspi->s_fsize = block_size = 1024;
+		uspi->s_fmask = ~(1024 - 1);
+		uspi->s_fshift = 10;
+		uspi->s_sbsize = 2048;
+		super_block_size = 2048;
+		uspi->s_sbbase = 0;
+		uspi->s_maxsymlinklen = 0; /* Not supported on disk */
+		flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_SUNOS | UFS_CG_SUN;
+		break;
+
+	case UFS_MOUNT_UFSTYPE_SUNx86:
+		UFSD("ufstype=sunx86\n");
+		uspi->s_fsize = block_size = 1024;
+		uspi->s_fmask = ~(1024 - 1);
+		uspi->s_fshift = 10;
+		uspi->s_sbsize = super_block_size = 2048;
+		uspi->s_sbbase = 0;
+		uspi->s_maxsymlinklen = 0; /* Not supported on disk */
+		flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUNx86 | UFS_CG_SUN;
+		break;
+
+	case UFS_MOUNT_UFSTYPE_OLD:
+		UFSD("ufstype=old\n");
+		uspi->s_fsize = block_size = 1024;
+		uspi->s_fmask = ~(1024 - 1);
+		uspi->s_fshift = 10;
+		uspi->s_sbsize = super_block_size = 2048;
+		uspi->s_sbbase = 0;
+		flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!silent)
+				pr_info("ufstype=old is supported read-only\n");
+			sb->s_flags |= MS_RDONLY;
+		}
+		break;
+	
+	case UFS_MOUNT_UFSTYPE_NEXTSTEP:
+		UFSD("ufstype=nextstep\n");
+		uspi->s_fsize = block_size = 1024;
+		uspi->s_fmask = ~(1024 - 1);
+		uspi->s_fshift = 10;
+		uspi->s_sbsize = super_block_size = 2048;
+		uspi->s_sbbase = 0;
+		uspi->s_dirblksize = 1024;
+		flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!silent)
+				pr_info("ufstype=nextstep is supported read-only\n");
+			sb->s_flags |= MS_RDONLY;
+		}
+		break;
+	
+	case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
+		UFSD("ufstype=nextstep-cd\n");
+		uspi->s_fsize = block_size = 2048;
+		uspi->s_fmask = ~(2048 - 1);
+		uspi->s_fshift = 11;
+		uspi->s_sbsize = super_block_size = 2048;
+		uspi->s_sbbase = 0;
+		uspi->s_dirblksize = 1024;
+		flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!silent)
+				pr_info("ufstype=nextstep-cd is supported read-only\n");
+			sb->s_flags |= MS_RDONLY;
+		}
+		break;
+	
+	case UFS_MOUNT_UFSTYPE_OPENSTEP:
+		UFSD("ufstype=openstep\n");
+		uspi->s_fsize = block_size = 1024;
+		uspi->s_fmask = ~(1024 - 1);
+		uspi->s_fshift = 10;
+		uspi->s_sbsize = super_block_size = 2048;
+		uspi->s_sbbase = 0;
+		uspi->s_dirblksize = 1024;
+		flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!silent)
+				pr_info("ufstype=openstep is supported read-only\n");
+			sb->s_flags |= MS_RDONLY;
+		}
+		break;
+	
+	case UFS_MOUNT_UFSTYPE_HP:
+		UFSD("ufstype=hp\n");
+		uspi->s_fsize = block_size = 1024;
+		uspi->s_fmask = ~(1024 - 1);
+		uspi->s_fshift = 10;
+		uspi->s_sbsize = super_block_size = 2048;
+		uspi->s_sbbase = 0;
+		flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!silent)
+				pr_info("ufstype=hp is supported read-only\n");
+			sb->s_flags |= MS_RDONLY;
+ 		}
+ 		break;
+	default:
+		if (!silent)
+			pr_err("unknown ufstype\n");
+		goto failed;
+	}
+	
+again:	
+	if (!sb_set_blocksize(sb, block_size)) {
+		pr_err("failed to set blocksize\n");
+		goto failed;
+	}
+
+	/*
+	 * read ufs super block from device
+	 */
+
+	ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + super_block_offset/block_size, super_block_size);
+	
+	if (!ubh) 
+            goto failed;
+
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	/* Sort out mod used on SunOS 4.1.3 for fs_state */
+	uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat);
+	if (((flags & UFS_ST_MASK) == UFS_ST_SUNOS) &&
+	    (uspi->s_postblformat != UFS_42POSTBLFMT)) {
+		flags &= ~UFS_ST_MASK;
+		flags |=  UFS_ST_SUN;
+	}
+
+	/*
+	 * Check ufs magic number
+	 */
+	sbi->s_bytesex = BYTESEX_LE;
+	switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
+		case UFS_MAGIC:
+		case UFS_MAGIC_BW:
+		case UFS2_MAGIC:
+		case UFS_MAGIC_LFN:
+	        case UFS_MAGIC_FEA:
+	        case UFS_MAGIC_4GB:
+			goto magic_found;
+	}
+	sbi->s_bytesex = BYTESEX_BE;
+	switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
+		case UFS_MAGIC:
+		case UFS_MAGIC_BW:
+		case UFS2_MAGIC:
+		case UFS_MAGIC_LFN:
+	        case UFS_MAGIC_FEA:
+	        case UFS_MAGIC_4GB:
+			goto magic_found;
+	}
+
+	if ((((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP) 
+	  || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD) 
+	  || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_OPENSTEP)) 
+	  && uspi->s_sbbase < 256) {
+		ubh_brelse_uspi(uspi);
+		ubh = NULL;
+		uspi->s_sbbase += 8;
+		goto again;
+	}
+	if (!silent)
+		pr_err("%s(): bad magic number\n", __func__);
+	goto failed;
+
+magic_found:
+	/*
+	 * Check block and fragment sizes
+	 */
+	uspi->s_bsize = fs32_to_cpu(sb, usb1->fs_bsize);
+	uspi->s_fsize = fs32_to_cpu(sb, usb1->fs_fsize);
+	uspi->s_sbsize = fs32_to_cpu(sb, usb1->fs_sbsize);
+	uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
+	uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
+
+	if (!is_power_of_2(uspi->s_fsize)) {
+		pr_err("%s(): fragment size %u is not a power of 2\n",
+		       __func__, uspi->s_fsize);
+		goto failed;
+	}
+	if (uspi->s_fsize < 512) {
+		pr_err("%s(): fragment size %u is too small\n",
+		       __func__, uspi->s_fsize);
+		goto failed;
+	}
+	if (uspi->s_fsize > 4096) {
+		pr_err("%s(): fragment size %u is too large\n",
+		       __func__, uspi->s_fsize);
+		goto failed;
+	}
+	if (!is_power_of_2(uspi->s_bsize)) {
+		pr_err("%s(): block size %u is not a power of 2\n",
+		       __func__, uspi->s_bsize);
+		goto failed;
+	}
+	if (uspi->s_bsize < 4096) {
+		pr_err("%s(): block size %u is too small\n",
+		       __func__, uspi->s_bsize);
+		goto failed;
+	}
+	if (uspi->s_bsize / uspi->s_fsize > 8) {
+		pr_err("%s(): too many fragments per block (%u)\n",
+		       __func__, uspi->s_bsize / uspi->s_fsize);
+		goto failed;
+	}
+	if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) {
+		ubh_brelse_uspi(uspi);
+		ubh = NULL;
+		block_size = uspi->s_fsize;
+		super_block_size = uspi->s_sbsize;
+		UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size);
+		goto again;
+	}
+
+	sbi->s_flags = flags;/*after that line some functions use s_flags*/
+	ufs_print_super_stuff(sb, usb1, usb2, usb3);
+
+	/*
+	 * Check, if file system was correctly unmounted.
+	 * If not, make it read only.
+	 */
+	if (((flags & UFS_ST_MASK) == UFS_ST_44BSD) ||
+	  ((flags & UFS_ST_MASK) == UFS_ST_OLD) ||
+	  (((flags & UFS_ST_MASK) == UFS_ST_SUN ||
+	    (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
+	  (flags & UFS_ST_MASK) == UFS_ST_SUNx86) &&
+	  (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
+		switch(usb1->fs_clean) {
+		case UFS_FSCLEAN:
+			UFSD("fs is clean\n");
+			break;
+		case UFS_FSSTABLE:
+			UFSD("fs is stable\n");
+			break;
+		case UFS_FSLOG:
+			UFSD("fs is logging fs\n");
+			break;
+		case UFS_FSOSF1:
+			UFSD("fs is DEC OSF/1\n");
+			break;
+		case UFS_FSACTIVE:
+			pr_err("%s(): fs is active\n", __func__);
+			sb->s_flags |= MS_RDONLY;
+			break;
+		case UFS_FSBAD:
+			pr_err("%s(): fs is bad\n", __func__);
+			sb->s_flags |= MS_RDONLY;
+			break;
+		default:
+			pr_err("%s(): can't grok fs_clean 0x%x\n",
+			       __func__, usb1->fs_clean);
+			sb->s_flags |= MS_RDONLY;
+			break;
+		}
+	} else {
+		pr_err("%s(): fs needs fsck\n", __func__);
+		sb->s_flags |= MS_RDONLY;
+	}
+
+	/*
+	 * Read ufs_super_block into internal data structures
+	 */
+	sb->s_op = &ufs_super_ops;
+	sb->s_export_op = &ufs_export_ops;
+
+	sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
+
+	uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
+	uspi->s_cblkno = fs32_to_cpu(sb, usb1->fs_cblkno);
+	uspi->s_iblkno = fs32_to_cpu(sb, usb1->fs_iblkno);
+	uspi->s_dblkno = fs32_to_cpu(sb, usb1->fs_dblkno);
+	uspi->s_cgoffset = fs32_to_cpu(sb, usb1->fs_cgoffset);
+	uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
+
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+		uspi->s_u2_size  = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
+		uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+	} else {
+		uspi->s_size  =  fs32_to_cpu(sb, usb1->fs_size);
+		uspi->s_dsize =  fs32_to_cpu(sb, usb1->fs_dsize);
+	}
+
+	uspi->s_ncg = fs32_to_cpu(sb, usb1->fs_ncg);
+	/* s_bsize already set */
+	/* s_fsize already set */
+	uspi->s_fpb = fs32_to_cpu(sb, usb1->fs_frag);
+	uspi->s_minfree = fs32_to_cpu(sb, usb1->fs_minfree);
+	uspi->s_bmask = fs32_to_cpu(sb, usb1->fs_bmask);
+	uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
+	uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift);
+	uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
+	UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
+		uspi->s_fshift);
+	uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift);
+	uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb);
+	/* s_sbsize already set */
+	uspi->s_csmask = fs32_to_cpu(sb, usb1->fs_csmask);
+	uspi->s_csshift = fs32_to_cpu(sb, usb1->fs_csshift);
+	uspi->s_nindir = fs32_to_cpu(sb, usb1->fs_nindir);
+	uspi->s_inopb = fs32_to_cpu(sb, usb1->fs_inopb);
+	uspi->s_nspf = fs32_to_cpu(sb, usb1->fs_nspf);
+	uspi->s_npsect = ufs_get_fs_npsect(sb, usb1, usb3);
+	uspi->s_interleave = fs32_to_cpu(sb, usb1->fs_interleave);
+	uspi->s_trackskew = fs32_to_cpu(sb, usb1->fs_trackskew);
+
+	if (uspi->fs_magic == UFS2_MAGIC)
+		uspi->s_csaddr = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr);
+	else
+		uspi->s_csaddr = fs32_to_cpu(sb, usb1->fs_csaddr);
+
+	uspi->s_cssize = fs32_to_cpu(sb, usb1->fs_cssize);
+	uspi->s_cgsize = fs32_to_cpu(sb, usb1->fs_cgsize);
+	uspi->s_ntrak = fs32_to_cpu(sb, usb1->fs_ntrak);
+	uspi->s_nsect = fs32_to_cpu(sb, usb1->fs_nsect);
+	uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc);
+	uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg);
+	uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg);
+	uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc);
+	uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize);
+	uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3);
+	uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3);
+	uspi->s_nrpos = fs32_to_cpu(sb, usb3->fs_nrpos);
+	uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff);
+	uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff);
+
+	/*
+	 * Compute another frequently used values
+	 */
+	uspi->s_fpbmask = uspi->s_fpb - 1;
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
+		uspi->s_apbshift = uspi->s_bshift - 3;
+	else
+		uspi->s_apbshift = uspi->s_bshift - 2;
+
+	uspi->s_2apbshift = uspi->s_apbshift * 2;
+	uspi->s_3apbshift = uspi->s_apbshift * 3;
+	uspi->s_apb = 1 << uspi->s_apbshift;
+	uspi->s_2apb = 1 << uspi->s_2apbshift;
+	uspi->s_3apb = 1 << uspi->s_3apbshift;
+	uspi->s_apbmask = uspi->s_apb - 1;
+	uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS;
+	uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift;
+	uspi->s_inopf = uspi->s_inopb >> uspi->s_fpbshift;
+	uspi->s_bpf = uspi->s_fsize << 3;
+	uspi->s_bpfshift = uspi->s_fshift + 3;
+	uspi->s_bpfmask = uspi->s_bpf - 1;
+	if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD ||
+	    (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_UFS2)
+		uspi->s_maxsymlinklen =
+		    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
+
+	if (uspi->fs_magic == UFS2_MAGIC)
+		maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR);
+	else
+		maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR);
+	if (uspi->s_maxsymlinklen > maxsymlen) {
+		ufs_warning(sb, __func__, "ufs_read_super: excessive maximum "
+			    "fast symlink size (%u)\n", uspi->s_maxsymlinklen);
+		uspi->s_maxsymlinklen = maxsymlen;
+	}
+	sb->s_max_links = UFS_LINK_MAX;
+
+	inode = ufs_iget(sb, UFS_ROOTINO);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto failed;
+	}
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root) {
+		ret = -ENOMEM;
+		goto failed;
+	}
+
+	ufs_setup_cstotal(sb);
+	/*
+	 * Read cylinder group structures
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
+		if (!ufs_read_cylinder_structures(sb))
+			goto failed;
+
+	UFSD("EXIT\n");
+	return 0;
+
+failed:
+	mutex_destroy(&sbi->mutex);
+	if (ubh)
+		ubh_brelse_uspi (uspi);
+	kfree (uspi);
+	kfree(sbi);
+	sb->s_fs_info = NULL;
+	UFSD("EXIT (FAILED)\n");
+	return ret;
+
+failed_nomem:
+	UFSD("EXIT (NOMEM)\n");
+	return -ENOMEM;
+}
+
+static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
+{
+	struct ufs_sb_private_info * uspi;
+	struct ufs_super_block_first * usb1;
+	struct ufs_super_block_third * usb3;
+	unsigned new_mount_opt, ufstype;
+	unsigned flags;
+
+	sync_filesystem(sb);
+	lock_ufs(sb);
+	mutex_lock(&UFS_SB(sb)->s_lock);
+	uspi = UFS_SB(sb)->s_uspi;
+	flags = UFS_SB(sb)->s_flags;
+	usb1 = ubh_get_usb_first(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+	
+	/*
+	 * Allow the "check" option to be passed as a remount option.
+	 * It is not possible to change ufstype option during remount
+	 */
+	ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
+	new_mount_opt = 0;
+	ufs_set_opt (new_mount_opt, ONERROR_LOCK);
+	if (!ufs_parse_options (data, &new_mount_opt)) {
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		unlock_ufs(sb);
+		return -EINVAL;
+	}
+	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
+		new_mount_opt |= ufstype;
+	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
+		pr_err("ufstype can't be changed during remount\n");
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		unlock_ufs(sb);
+		return -EINVAL;
+	}
+
+	if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		UFS_SB(sb)->s_mount_opt = new_mount_opt;
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		unlock_ufs(sb);
+		return 0;
+	}
+	
+	/*
+	 * fs was mouted as rw, remounting ro
+	 */
+	if (*mount_flags & MS_RDONLY) {
+		ufs_put_super_internal(sb);
+		usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+		if ((flags & UFS_ST_MASK) == UFS_ST_SUN
+		  || (flags & UFS_ST_MASK) == UFS_ST_SUNOS
+		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 
+			ufs_set_fs_state(sb, usb1, usb3,
+				UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
+		ubh_mark_buffer_dirty (USPI_UBH(uspi));
+		sb->s_flags |= MS_RDONLY;
+	} else {
+	/*
+	 * fs was mounted as ro, remounting rw
+	 */
+#ifndef CONFIG_UFS_FS_WRITE
+		pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
+		mutex_unlock(&UFS_SB(sb)->s_lock);
+		unlock_ufs(sb);
+		return -EINVAL;
+#else
+		if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 
+		    ufstype != UFS_MOUNT_UFSTYPE_SUNOS &&
+		    ufstype != UFS_MOUNT_UFSTYPE_44BSD &&
+		    ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
+		    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
+			pr_err("this ufstype is read-only supported\n");
+			mutex_unlock(&UFS_SB(sb)->s_lock);
+			unlock_ufs(sb);
+			return -EINVAL;
+		}
+		if (!ufs_read_cylinder_structures(sb)) {
+			pr_err("failed during remounting\n");
+			mutex_unlock(&UFS_SB(sb)->s_lock);
+			unlock_ufs(sb);
+			return -EPERM;
+		}
+		sb->s_flags &= ~MS_RDONLY;
+#endif
+	}
+	UFS_SB(sb)->s_mount_opt = new_mount_opt;
+	mutex_unlock(&UFS_SB(sb)->s_lock);
+	unlock_ufs(sb);
+	return 0;
+}
+
+static int ufs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct ufs_sb_info *sbi = UFS_SB(root->d_sb);
+	unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
+	const struct match_token *tp = tokens;
+
+	while (tp->token != Opt_onerror_panic && tp->token != mval)
+		++tp;
+	BUG_ON(tp->token == Opt_onerror_panic);
+	seq_printf(seq, ",%s", tp->pattern);
+
+	mval = sbi->s_mount_opt & UFS_MOUNT_ONERROR;
+	while (tp->token != Opt_err && tp->token != mval)
+		++tp;
+	BUG_ON(tp->token == Opt_err);
+	seq_printf(seq, ",%s", tp->pattern);
+
+	return 0;
+}
+
+static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
+	unsigned  flags = UFS_SB(sb)->s_flags;
+	struct ufs_super_block_third *usb3;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	lock_ufs(sb);
+
+	usb3 = ubh_get_usb_third(uspi);
+	
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+		buf->f_type = UFS2_MAGIC;
+		buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+	} else {
+		buf->f_type = UFS_MAGIC;
+		buf->f_blocks = uspi->s_dsize;
+	}
+	buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+		uspi->cs_total.cs_nffree;
+	buf->f_ffree = uspi->cs_total.cs_nifree;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
+		? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
+	buf->f_files = uspi->s_ncg * uspi->s_ipg;
+	buf->f_namelen = UFS_MAXNAMLEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	unlock_ufs(sb);
+
+	return 0;
+}
+
+static struct kmem_cache * ufs_inode_cachep;
+
+static struct inode *ufs_alloc_inode(struct super_block *sb)
+{
+	struct ufs_inode_info *ei;
+
+	ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+
+	ei->vfs_inode.i_version = 1;
+	return &ei->vfs_inode;
+}
+
+static void ufs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
+}
+
+static void ufs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ufs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_inodecache(void)
+{
+	ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
+					     sizeof(struct ufs_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (ufs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ufs_inode_cachep);
+}
+
+static const struct super_operations ufs_super_ops = {
+	.alloc_inode	= ufs_alloc_inode,
+	.destroy_inode	= ufs_destroy_inode,
+	.write_inode	= ufs_write_inode,
+	.evict_inode	= ufs_evict_inode,
+	.put_super	= ufs_put_super,
+	.sync_fs	= ufs_sync_fs,
+	.statfs		= ufs_statfs,
+	.remount_fs	= ufs_remount,
+	.show_options   = ufs_show_options,
+};
+
+static struct dentry *ufs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
+}
+
+static struct file_system_type ufs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ufs",
+	.mount		= ufs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ufs");
+
+static int __init init_ufs_fs(void)
+{
+	int err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&ufs_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	return err;
+}
+
+static void __exit exit_ufs_fs(void)
+{
+	unregister_filesystem(&ufs_fs_type);
+	destroy_inodecache();
+}
+
+module_init(init_ufs_fs)
+module_exit(exit_ufs_fs)
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/ufs/swab.h b/kernel/fs/ufs/swab.h
new file mode 100644
index 000000000..8d974c4fd
--- /dev/null
+++ b/kernel/fs/ufs/swab.h
@@ -0,0 +1,115 @@
+/*
+ *  linux/fs/ufs/swab.h
+ *
+ * Copyright (C) 1997, 1998 Francois-Rene Rideau <fare@tunes.org>
+ * Copyright (C) 1998 Jakub Jelinek <jj@ultra.linux.cz>
+ * Copyright (C) 2001 Christoph Hellwig <hch@infradead.org>
+ */
+
+#ifndef _UFS_SWAB_H
+#define _UFS_SWAB_H
+
+/*
+ * Notes:
+ *    HERE WE ASSUME EITHER BIG OR LITTLE ENDIAN UFSes
+ *    in case there are ufs implementations that have strange bytesexes,
+ *    you'll need to modify code here as well as in ufs_super.c and ufs_fs.h
+ *    to support them.
+ */
+
+enum {
+	BYTESEX_LE,
+	BYTESEX_BE
+};
+
+static inline u64
+fs64_to_cpu(struct super_block *sbp, __fs64 n)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		return le64_to_cpu((__force __le64)n);
+	else
+		return be64_to_cpu((__force __be64)n);
+}
+
+static inline __fs64
+cpu_to_fs64(struct super_block *sbp, u64 n)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		return (__force __fs64)cpu_to_le64(n);
+	else
+		return (__force __fs64)cpu_to_be64(n);
+}
+
+static inline u32
+fs32_to_cpu(struct super_block *sbp, __fs32 n)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		return le32_to_cpu((__force __le32)n);
+	else
+		return be32_to_cpu((__force __be32)n);
+}
+
+static inline __fs32
+cpu_to_fs32(struct super_block *sbp, u32 n)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		return (__force __fs32)cpu_to_le32(n);
+	else
+		return (__force __fs32)cpu_to_be32(n);
+}
+
+static inline void
+fs32_add(struct super_block *sbp, __fs32 *n, int d)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		le32_add_cpu((__le32 *)n, d);
+	else
+		be32_add_cpu((__be32 *)n, d);
+}
+
+static inline void
+fs32_sub(struct super_block *sbp, __fs32 *n, int d)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		le32_add_cpu((__le32 *)n, -d);
+	else
+		be32_add_cpu((__be32 *)n, -d);
+}
+
+static inline u16
+fs16_to_cpu(struct super_block *sbp, __fs16 n)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		return le16_to_cpu((__force __le16)n);
+	else
+		return be16_to_cpu((__force __be16)n);
+}
+
+static inline __fs16
+cpu_to_fs16(struct super_block *sbp, u16 n)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		return (__force __fs16)cpu_to_le16(n);
+	else
+		return (__force __fs16)cpu_to_be16(n);
+}
+
+static inline void
+fs16_add(struct super_block *sbp, __fs16 *n, int d)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		le16_add_cpu((__le16 *)n, d);
+	else
+		be16_add_cpu((__be16 *)n, d);
+}
+
+static inline void
+fs16_sub(struct super_block *sbp, __fs16 *n, int d)
+{
+	if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE)
+		le16_add_cpu((__le16 *)n, -d);
+	else
+		be16_add_cpu((__be16 *)n, -d);
+}
+
+#endif /* _UFS_SWAB_H */
diff --git a/kernel/fs/ufs/symlink.c b/kernel/fs/ufs/symlink.c
new file mode 100644
index 000000000..5b537e2fd
--- /dev/null
+++ b/kernel/fs/ufs/symlink.c
@@ -0,0 +1,53 @@
+/*
+ *  linux/fs/ufs/symlink.c
+ *
+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@emai.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ *
+ *  from
+ *
+ *  linux/fs/ext2/symlink.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/symlink.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext2 symlink handling code
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+
+
+static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ufs_inode_info *p = UFS_I(d_inode(dentry));
+	nd_set_link(nd, (char*)p->i_u1.i_symlink);
+	return NULL;
+}
+
+const struct inode_operations ufs_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= ufs_follow_link,
+	.setattr	= ufs_setattr,
+};
+
+const struct inode_operations ufs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= ufs_setattr,
+};
diff --git a/kernel/fs/ufs/truncate.c b/kernel/fs/ufs/truncate.c
new file mode 100644
index 000000000..21154704c
--- /dev/null
+++ b/kernel/fs/ufs/truncate.c
@@ -0,0 +1,523 @@
+/*
+ *  linux/fs/ufs/truncate.c
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ *
+ *  from
+ *
+ *  linux/fs/ext2/truncate.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/truncate.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+/*
+ * Real random numbers for secure rm added 94/02/18
+ * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr>
+ */
+
+/*
+ * Adoptation to use page cache and UFS2 write support by
+ * Evgeniy Dushistov <dushistov@mail.ru>, 2006-2007
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/fcntl.h>
+#include <linux/time.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/sched.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "swab.h"
+#include "util.h"
+
+/*
+ * Secure deletion currently doesn't work. It interacts very badly
+ * with buffers shared with memory mappings, and for that reason
+ * can't be done in the truncate() routines. It should instead be
+ * done separately in "release()" before calling the truncate routines
+ * that will release the actual file blocks.
+ *
+ *		Linus
+ */
+
+#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
+#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
+
+
+static int ufs_trunc_direct(struct inode *inode)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	void *p;
+	u64 frag1, frag2, frag3, frag4, block1, block2;
+	unsigned frag_to_free, free_count;
+	unsigned i, tmp;
+	int retry;
+	
+	UFSD("ENTER: ino %lu\n", inode->i_ino);
+
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+	
+	frag_to_free = 0;
+	free_count = 0;
+	retry = 0;
+	
+	frag1 = DIRECT_FRAGMENT;
+	frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
+	frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
+	frag3 = frag4 & ~uspi->s_fpbmask;
+	block1 = block2 = 0;
+	if (frag2 > frag3) {
+		frag2 = frag4;
+		frag3 = frag4 = 0;
+	} else if (frag2 < frag3) {
+		block1 = ufs_fragstoblks (frag2);
+		block2 = ufs_fragstoblks (frag3);
+	}
+
+	UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
+	     " frag3 %llu, frag4 %llu\n", inode->i_ino,
+	     (unsigned long long)frag1, (unsigned long long)frag2,
+	     (unsigned long long)block1, (unsigned long long)block2,
+	     (unsigned long long)frag3, (unsigned long long)frag4);
+
+	if (frag1 >= frag2)
+		goto next1;		
+
+	/*
+	 * Free first free fragments
+	 */
+	p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1));
+	tmp = ufs_data_ptr_to_cpu(sb, p);
+	if (!tmp )
+		ufs_panic (sb, "ufs_trunc_direct", "internal error");
+	frag2 -= frag1;
+	frag1 = ufs_fragnum (frag1);
+
+	ufs_free_fragments(inode, tmp + frag1, frag2);
+	mark_inode_dirty(inode);
+	frag_to_free = tmp + frag1;
+
+next1:
+	/*
+	 * Free whole blocks
+	 */
+	for (i = block1 ; i < block2; i++) {
+		p = ufs_get_direct_data_ptr(uspi, ufsi, i);
+		tmp = ufs_data_ptr_to_cpu(sb, p);
+		if (!tmp)
+			continue;
+		ufs_data_ptr_clear(uspi, p);
+
+		if (free_count == 0) {
+			frag_to_free = tmp;
+			free_count = uspi->s_fpb;
+		} else if (free_count > 0 && frag_to_free == tmp - free_count)
+			free_count += uspi->s_fpb;
+		else {
+			ufs_free_blocks (inode, frag_to_free, free_count);
+			frag_to_free = tmp;
+			free_count = uspi->s_fpb;
+		}
+		mark_inode_dirty(inode);
+	}
+	
+	if (free_count > 0)
+		ufs_free_blocks (inode, frag_to_free, free_count);
+
+	if (frag3 >= frag4)
+		goto next3;
+
+	/*
+	 * Free last free fragments
+	 */
+	p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3));
+	tmp = ufs_data_ptr_to_cpu(sb, p);
+	if (!tmp )
+		ufs_panic(sb, "ufs_truncate_direct", "internal error");
+	frag4 = ufs_fragnum (frag4);
+	ufs_data_ptr_clear(uspi, p);
+
+	ufs_free_fragments (inode, tmp, frag4);
+	mark_inode_dirty(inode);
+ next3:
+
+	UFSD("EXIT: ino %lu\n", inode->i_ino);
+	return retry;
+}
+
+
+static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_buffer_head * ind_ubh;
+	void *ind;
+	u64 tmp, indirect_block, i, frag_to_free;
+	unsigned free_count;
+	int retry;
+
+	UFSD("ENTER: ino %lu, offset %llu, p: %p\n",
+	     inode->i_ino, (unsigned long long)offset, p);
+
+	BUG_ON(!p);
+		
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+
+	frag_to_free = 0;
+	free_count = 0;
+	retry = 0;
+	
+	tmp = ufs_data_ptr_to_cpu(sb, p);
+	if (!tmp)
+		return 0;
+	ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize);
+	if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
+		ubh_brelse (ind_ubh);
+		return 1;
+	}
+	if (!ind_ubh) {
+		ufs_data_ptr_clear(uspi, p);
+		return 0;
+	}
+
+	indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0;
+	for (i = indirect_block; i < uspi->s_apb; i++) {
+		ind = ubh_get_data_ptr(uspi, ind_ubh, i);
+		tmp = ufs_data_ptr_to_cpu(sb, ind);
+		if (!tmp)
+			continue;
+
+		ufs_data_ptr_clear(uspi, ind);
+		ubh_mark_buffer_dirty(ind_ubh);
+		if (free_count == 0) {
+			frag_to_free = tmp;
+			free_count = uspi->s_fpb;
+		} else if (free_count > 0 && frag_to_free == tmp - free_count)
+			free_count += uspi->s_fpb;
+		else {
+			ufs_free_blocks (inode, frag_to_free, free_count);
+			frag_to_free = tmp;
+			free_count = uspi->s_fpb;
+		}
+
+		mark_inode_dirty(inode);
+	}
+
+	if (free_count > 0) {
+		ufs_free_blocks (inode, frag_to_free, free_count);
+	}
+	for (i = 0; i < uspi->s_apb; i++)
+		if (!ufs_is_data_ptr_zero(uspi,
+					  ubh_get_data_ptr(uspi, ind_ubh, i)))
+			break;
+	if (i >= uspi->s_apb) {
+		tmp = ufs_data_ptr_to_cpu(sb, p);
+		ufs_data_ptr_clear(uspi, p);
+
+		ufs_free_blocks (inode, tmp, uspi->s_fpb);
+		mark_inode_dirty(inode);
+		ubh_bforget(ind_ubh);
+		ind_ubh = NULL;
+	}
+	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
+		ubh_sync_block(ind_ubh);
+	ubh_brelse (ind_ubh);
+	
+	UFSD("EXIT: ino %lu\n", inode->i_ino);
+	
+	return retry;
+}
+
+static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
+{
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct ufs_buffer_head *dind_bh;
+	u64 i, tmp, dindirect_block;
+	void *dind;
+	int retry = 0;
+	
+	UFSD("ENTER: ino %lu\n", inode->i_ino);
+	
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+
+	dindirect_block = (DIRECT_BLOCK > offset) 
+		? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0;
+	retry = 0;
+	
+	tmp = ufs_data_ptr_to_cpu(sb, p);
+	if (!tmp)
+		return 0;
+	dind_bh = ubh_bread(sb, tmp, uspi->s_bsize);
+	if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
+		ubh_brelse (dind_bh);
+		return 1;
+	}
+	if (!dind_bh) {
+		ufs_data_ptr_clear(uspi, p);
+		return 0;
+	}
+
+	for (i = dindirect_block ; i < uspi->s_apb ; i++) {
+		dind = ubh_get_data_ptr(uspi, dind_bh, i);
+		tmp = ufs_data_ptr_to_cpu(sb, dind);
+		if (!tmp)
+			continue;
+		retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind);
+		ubh_mark_buffer_dirty(dind_bh);
+	}
+
+	for (i = 0; i < uspi->s_apb; i++)
+		if (!ufs_is_data_ptr_zero(uspi,
+					  ubh_get_data_ptr(uspi, dind_bh, i)))
+			break;
+	if (i >= uspi->s_apb) {
+		tmp = ufs_data_ptr_to_cpu(sb, p);
+		ufs_data_ptr_clear(uspi, p);
+
+		ufs_free_blocks(inode, tmp, uspi->s_fpb);
+		mark_inode_dirty(inode);
+		ubh_bforget(dind_bh);
+		dind_bh = NULL;
+	}
+	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
+		ubh_sync_block(dind_bh);
+	ubh_brelse (dind_bh);
+	
+	UFSD("EXIT: ino %lu\n", inode->i_ino);
+	
+	return retry;
+}
+
+static int ufs_trunc_tindirect(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct ufs_buffer_head * tind_bh;
+	u64 tindirect_block, tmp, i;
+	void *tind, *p;
+	int retry;
+	
+	UFSD("ENTER: ino %lu\n", inode->i_ino);
+
+	retry = 0;
+	
+	tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb))
+		? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0;
+
+	p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK);
+	if (!(tmp = ufs_data_ptr_to_cpu(sb, p)))
+		return 0;
+	tind_bh = ubh_bread (sb, tmp, uspi->s_bsize);
+	if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
+		ubh_brelse (tind_bh);
+		return 1;
+	}
+	if (!tind_bh) {
+		ufs_data_ptr_clear(uspi, p);
+		return 0;
+	}
+
+	for (i = tindirect_block ; i < uspi->s_apb ; i++) {
+		tind = ubh_get_data_ptr(uspi, tind_bh, i);
+		retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + 
+			uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind);
+		ubh_mark_buffer_dirty(tind_bh);
+	}
+	for (i = 0; i < uspi->s_apb; i++)
+		if (!ufs_is_data_ptr_zero(uspi,
+					  ubh_get_data_ptr(uspi, tind_bh, i)))
+			break;
+	if (i >= uspi->s_apb) {
+		tmp = ufs_data_ptr_to_cpu(sb, p);
+		ufs_data_ptr_clear(uspi, p);
+
+		ufs_free_blocks(inode, tmp, uspi->s_fpb);
+		mark_inode_dirty(inode);
+		ubh_bforget(tind_bh);
+		tind_bh = NULL;
+	}
+	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
+		ubh_sync_block(tind_bh);
+	ubh_brelse (tind_bh);
+	
+	UFSD("EXIT: ino %lu\n", inode->i_ino);
+	return retry;
+}
+
+static int ufs_alloc_lastblock(struct inode *inode)
+{
+	int err = 0;
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	unsigned i, end;
+	sector_t lastfrag;
+	struct page *lastpage;
+	struct buffer_head *bh;
+	u64 phys64;
+
+	lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift;
+
+	if (!lastfrag)
+		goto out;
+
+	lastfrag--;
+
+	lastpage = ufs_get_locked_page(mapping, lastfrag >>
+				       (PAGE_CACHE_SHIFT - inode->i_blkbits));
+       if (IS_ERR(lastpage)) {
+               err = -EIO;
+               goto out;
+       }
+
+       end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
+       bh = page_buffers(lastpage);
+       for (i = 0; i < end; ++i)
+               bh = bh->b_this_page;
+
+
+       err = ufs_getfrag_block(inode, lastfrag, bh, 1);
+
+       if (unlikely(err))
+	       goto out_unlock;
+
+       if (buffer_new(bh)) {
+	       clear_buffer_new(bh);
+	       unmap_underlying_metadata(bh->b_bdev,
+					 bh->b_blocknr);
+	       /*
+		* we do not zeroize fragment, because of
+		* if it maped to hole, it already contains zeroes
+		*/
+	       set_buffer_uptodate(bh);
+	       mark_buffer_dirty(bh);
+	       set_page_dirty(lastpage);
+       }
+
+       if (lastfrag >= UFS_IND_FRAGMENT) {
+	       end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1;
+	       phys64 = bh->b_blocknr + 1;
+	       for (i = 0; i < end; ++i) {
+		       bh = sb_getblk(sb, i + phys64);
+		       lock_buffer(bh);
+		       memset(bh->b_data, 0, sb->s_blocksize);
+		       set_buffer_uptodate(bh);
+		       mark_buffer_dirty(bh);
+		       unlock_buffer(bh);
+		       sync_dirty_buffer(bh);
+		       brelse(bh);
+	       }
+       }
+out_unlock:
+       ufs_put_locked_page(lastpage);
+out:
+       return err;
+}
+
+int ufs_truncate(struct inode *inode, loff_t old_i_size)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	int retry, err = 0;
+	
+	UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n",
+	     inode->i_ino, (unsigned long long)i_size_read(inode),
+	     (unsigned long long)old_i_size);
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	      S_ISLNK(inode->i_mode)))
+		return -EINVAL;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	err = ufs_alloc_lastblock(inode);
+
+	if (err) {
+		i_size_write(inode, old_i_size);
+		goto out;
+	}
+
+	block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
+
+	while (1) {
+		retry = ufs_trunc_direct(inode);
+		retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
+					    ufs_get_direct_data_ptr(uspi, ufsi,
+								    UFS_IND_BLOCK));
+		retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb,
+					     ufs_get_direct_data_ptr(uspi, ufsi,
+								     UFS_DIND_BLOCK));
+		retry |= ufs_trunc_tindirect (inode);
+		if (!retry)
+			break;
+		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
+			ufs_sync_inode (inode);
+		yield();
+	}
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	ufsi->i_lastfrag = DIRECT_FRAGMENT;
+	mark_inode_dirty(inode);
+out:
+	UFSD("EXIT: err %d\n", err);
+	return err;
+}
+
+int ufs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	unsigned int ia_valid = attr->ia_valid;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+		loff_t old_i_size = inode->i_size;
+
+		/* XXX(truncate): truncate_setsize should be called last */
+		truncate_setsize(inode, attr->ia_size);
+
+		lock_ufs(inode->i_sb);
+		error = ufs_truncate(inode, old_i_size);
+		unlock_ufs(inode->i_sb);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+const struct inode_operations ufs_file_inode_operations = {
+	.setattr = ufs_setattr,
+};
diff --git a/kernel/fs/ufs/ufs.h b/kernel/fs/ufs/ufs.h
new file mode 100644
index 000000000..cf6368d42
--- /dev/null
+++ b/kernel/fs/ufs/ufs.h
@@ -0,0 +1,176 @@
+#ifndef _UFS_UFS_H
+#define _UFS_UFS_H 1
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define UFS_MAX_GROUP_LOADED 8
+#define UFS_CGNO_EMPTY ((unsigned)-1)
+
+struct ufs_sb_private_info;
+struct ufs_cg_private_info;
+struct ufs_csum;
+
+struct ufs_sb_info {
+	struct ufs_sb_private_info * s_uspi;
+	struct ufs_csum	* s_csp;
+	unsigned s_bytesex;
+	unsigned s_flags;
+	struct buffer_head ** s_ucg;
+	struct ufs_cg_private_info * s_ucpi[UFS_MAX_GROUP_LOADED];
+	unsigned s_cgno[UFS_MAX_GROUP_LOADED];
+	unsigned short s_cg_loaded;
+	unsigned s_mount_opt;
+	struct mutex mutex;
+	struct task_struct *mutex_owner;
+	struct super_block *sb;
+	int work_queued; /* non-zero if the delayed work is queued */
+	struct delayed_work sync_work; /* FS sync delayed work */
+	spinlock_t work_lock; /* protects sync_work and work_queued */
+	struct mutex s_lock;
+};
+
+struct ufs_inode_info {
+	union {
+		__fs32	i_data[15];
+		__u8	i_symlink[2 * 4 * 15];
+		__fs64	u2_i_data[15];
+	} i_u1;
+	__u32	i_flags;
+	__u32	i_shadow;
+	__u32	i_unused1;
+	__u32	i_unused2;
+	__u32	i_oeftflag;
+	__u16	i_osync;
+	__u64	i_lastfrag;
+	__u32   i_dir_start_lookup;
+	struct inode vfs_inode;
+};
+
+/* mount options */
+#define UFS_MOUNT_ONERROR		0x0000000F
+#define UFS_MOUNT_ONERROR_PANIC		0x00000001
+#define UFS_MOUNT_ONERROR_LOCK		0x00000002
+#define UFS_MOUNT_ONERROR_UMOUNT	0x00000004
+#define UFS_MOUNT_ONERROR_REPAIR	0x00000008
+
+#define UFS_MOUNT_UFSTYPE		0x0000FFF0
+#define UFS_MOUNT_UFSTYPE_OLD		0x00000010
+#define UFS_MOUNT_UFSTYPE_44BSD		0x00000020
+#define UFS_MOUNT_UFSTYPE_SUN		0x00000040
+#define UFS_MOUNT_UFSTYPE_NEXTSTEP	0x00000080
+#define UFS_MOUNT_UFSTYPE_NEXTSTEP_CD	0x00000100
+#define UFS_MOUNT_UFSTYPE_OPENSTEP	0x00000200
+#define UFS_MOUNT_UFSTYPE_SUNx86	0x00000400
+#define UFS_MOUNT_UFSTYPE_HP	        0x00000800
+#define UFS_MOUNT_UFSTYPE_UFS2		0x00001000
+#define UFS_MOUNT_UFSTYPE_SUNOS		0x00002000
+
+#define ufs_clear_opt(o,opt)	o &= ~UFS_MOUNT_##opt
+#define ufs_set_opt(o,opt)	o |= UFS_MOUNT_##opt
+#define ufs_test_opt(o,opt)	((o) & UFS_MOUNT_##opt)
+
+/*
+ * Debug code
+ */
+#ifdef CONFIG_UFS_DEBUG
+#	define UFSD(f, a...)	{					\
+		pr_debug("UFSD (%s, %d): %s:",				\
+			__FILE__, __LINE__, __func__);		\
+		pr_debug(f, ## a);					\
+	}
+#else
+#	define UFSD(f, a...)	/**/
+#endif
+
+/* balloc.c */
+extern void ufs_free_fragments (struct inode *, u64, unsigned);
+extern void ufs_free_blocks (struct inode *, u64, unsigned);
+extern u64 ufs_new_fragments(struct inode *, void *, u64, u64,
+			     unsigned, int *, struct page *);
+
+/* cylinder.c */
+extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned);
+extern void ufs_put_cylinder (struct super_block *, unsigned);
+
+/* dir.c */
+extern const struct inode_operations ufs_dir_inode_operations;
+extern int ufs_add_link (struct dentry *, struct inode *);
+extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
+extern int ufs_make_empty(struct inode *, struct inode *);
+extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
+extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
+extern int ufs_empty_dir (struct inode *);
+extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
+extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+			 struct page *page, struct inode *inode);
+
+/* file.c */
+extern const struct inode_operations ufs_file_inode_operations;
+extern const struct file_operations ufs_file_operations;
+extern const struct address_space_operations ufs_aops;
+
+/* ialloc.c */
+extern void ufs_free_inode (struct inode *inode);
+extern struct inode * ufs_new_inode (struct inode *, umode_t);
+
+/* inode.c */
+extern struct inode *ufs_iget(struct super_block *, unsigned long);
+extern int ufs_write_inode (struct inode *, struct writeback_control *);
+extern int ufs_sync_inode (struct inode *);
+extern void ufs_evict_inode (struct inode *);
+extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
+
+/* namei.c */
+extern const struct file_operations ufs_dir_operations;
+
+/* super.c */
+extern __printf(3, 4)
+void ufs_warning(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ufs_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ufs_panic(struct super_block *, const char *, const char *, ...);
+void ufs_mark_sb_dirty(struct super_block *sb);
+
+/* symlink.c */
+extern const struct inode_operations ufs_fast_symlink_inode_operations;
+extern const struct inode_operations ufs_symlink_inode_operations;
+
+/* truncate.c */
+extern int ufs_truncate (struct inode *, loff_t);
+extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
+
+static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct ufs_inode_info *UFS_I(struct inode *inode)
+{
+	return container_of(inode, struct ufs_inode_info, vfs_inode);
+}
+
+/*
+ * Give cylinder group number for a file system block.
+ * Give cylinder group block number for a file system block.
+ */
+/* #define	ufs_dtog(d)	((d) / uspi->s_fpg) */
+static inline u64 ufs_dtog(struct ufs_sb_private_info * uspi, u64 b)
+{
+	do_div(b, uspi->s_fpg);
+	return b;
+}
+/* #define	ufs_dtogd(d)	((d) % uspi->s_fpg) */
+static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
+{
+	return do_div(b, uspi->s_fpg);
+}
+
+extern void lock_ufs(struct super_block *sb);
+extern void unlock_ufs(struct super_block *sb);
+
+#endif /* _UFS_UFS_H */
diff --git a/kernel/fs/ufs/ufs_fs.h b/kernel/fs/ufs/ufs_fs.h
new file mode 100644
index 000000000..0cbd5d340
--- /dev/null
+++ b/kernel/fs/ufs/ufs_fs.h
@@ -0,0 +1,960 @@
+/*
+ *  linux/include/linux/ufs_fs.h
+ *
+ * Copyright (C) 1996
+ * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu)
+ * Laboratory for Computer Science Research Computing Facility
+ * Rutgers, The State University of New Jersey
+ *
+ * Clean swab support by Fare <fare@tunes.org>
+ * just hope no one is using NNUUXXI on __?64 structure elements
+ * 64-bit clean thanks to Maciej W. Rozycki <macro@ds2.pg.gda.pl>
+ *
+ * 4.4BSD (FreeBSD) support added on February 1st 1998 by
+ * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
+ * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
+ *
+ * NeXTstep support added on February 5th 1998 by
+ * Niels Kristian Bech Jensen <nkbj@image.dk>.
+ *
+ * Write support by Daniel Pirkl <daniel.pirkl@email.cz>
+ *
+ * HP/UX hfs filesystem support added by
+ * Martin K. Petersen <mkp@mkp.net>, August 1999
+ *
+ * UFS2 (of FreeBSD 5.x) support added by
+ * Niraj Kumar <niraj17@iitbombay.org>  , Jan 2004
+ *
+ */
+
+#ifndef __LINUX_UFS_FS_H
+#define __LINUX_UFS_FS_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/stat.h>
+#include <linux/fs.h>
+#include <linux/workqueue.h>
+
+#include <asm/div64.h>
+typedef __u64 __bitwise __fs64;
+typedef __u32 __bitwise __fs32;
+typedef __u16 __bitwise __fs16;
+
+#define UFS_BBLOCK 0
+#define UFS_BBSIZE 8192
+#define UFS_SBLOCK 8192
+#define UFS_SBSIZE 8192
+
+#define UFS_SECTOR_SIZE 512
+#define UFS_SECTOR_BITS 9
+#define UFS_MAGIC  0x00011954
+#define UFS_MAGIC_BW 0x0f242697
+#define UFS2_MAGIC 0x19540119
+#define UFS_CIGAM  0x54190100 /* byteswapped MAGIC */
+
+/* Copied from FreeBSD */
+/*
+ * Each disk drive contains some number of filesystems.
+ * A filesystem consists of a number of cylinder groups.
+ * Each cylinder group has inodes and data.
+ *
+ * A filesystem is described by its super-block, which in turn
+ * describes the cylinder groups.  The super-block is critical
+ * data and is replicated in each cylinder group to protect against
+ * catastrophic loss.  This is done at `newfs' time and the critical
+ * super-block data does not change, so the copies need not be
+ * referenced further unless disaster strikes.
+ *
+ * For filesystem fs, the offsets of the various blocks of interest
+ * are given in the super block as:
+ *      [fs->fs_sblkno]         Super-block
+ *      [fs->fs_cblkno]         Cylinder group block
+ *      [fs->fs_iblkno]         Inode blocks
+ *      [fs->fs_dblkno]         Data blocks
+ * The beginning of cylinder group cg in fs, is given by
+ * the ``cgbase(fs, cg)'' macro.
+ *
+ * Depending on the architecture and the media, the superblock may
+ * reside in any one of four places. For tiny media where every block
+ * counts, it is placed at the very front of the partition. Historically,
+ * UFS1 placed it 8K from the front to leave room for the disk label and
+ * a small bootstrap. For UFS2 it got moved to 64K from the front to leave
+ * room for the disk label and a bigger bootstrap, and for really piggy
+ * systems we check at 256K from the front if the first three fail. In
+ * all cases the size of the superblock will be SBLOCKSIZE. All values are
+ * given in byte-offset form, so they do not imply a sector size. The
+ * SBLOCKSEARCH specifies the order in which the locations should be searched.
+ */
+#define SBLOCK_FLOPPY        0
+#define SBLOCK_UFS1       8192
+#define SBLOCK_UFS2      65536
+#define SBLOCK_PIGGY    262144
+#define SBLOCKSIZE        8192
+#define SBLOCKSEARCH \
+        { SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 }
+
+
+/* HP specific MAGIC values */
+
+#define UFS_MAGIC_LFN   0x00095014 /* fs supports filenames > 14 chars */
+#define UFS_CIGAM_LFN   0x14500900 /* srahc 41 < semanelif stroppus sf */
+
+#define UFS_MAGIC_SEC   0x00612195 /* B1 security fs */
+#define UFS_CIGAM_SEC   0x95216100
+
+#define UFS_MAGIC_FEA   0x00195612 /* fs_featurebits supported */
+#define UFS_CIGAM_FEA   0x12561900
+
+#define UFS_MAGIC_4GB   0x05231994 /* fs > 4 GB && fs_featurebits */
+#define UFS_CIGAM_4GB   0x94192305
+
+/* Seems somebody at HP goofed here. B1 and lfs are both 0x2 !?! */
+#define UFS_FSF_LFN     0x00000001 /* long file names */
+#define UFS_FSF_B1      0x00000002 /* B1 security */
+#define UFS_FSF_LFS     0x00000002 /* large files */
+#define UFS_FSF_LUID    0x00000004 /* large UIDs */
+
+/* End of HP stuff */
+
+
+#define UFS_BSIZE	8192
+#define UFS_MINBSIZE	4096
+#define UFS_FSIZE	1024
+#define UFS_MAXFRAG	(UFS_BSIZE / UFS_FSIZE)
+
+#define UFS_NDADDR 12
+#define UFS_NINDIR 3
+
+#define UFS_IND_BLOCK	(UFS_NDADDR + 0)
+#define UFS_DIND_BLOCK	(UFS_NDADDR + 1)
+#define UFS_TIND_BLOCK	(UFS_NDADDR + 2)
+
+#define UFS_NDIR_FRAGMENT (UFS_NDADDR << uspi->s_fpbshift)
+#define UFS_IND_FRAGMENT (UFS_IND_BLOCK << uspi->s_fpbshift)
+#define UFS_DIND_FRAGMENT (UFS_DIND_BLOCK << uspi->s_fpbshift)
+#define UFS_TIND_FRAGMENT (UFS_TIND_BLOCK << uspi->s_fpbshift)
+
+#define UFS_ROOTINO 2
+#define UFS_FIRST_INO (UFS_ROOTINO + 1)
+
+#define UFS_USEEFT  ((__u16)65535)
+
+/* fs_clean values */
+#define UFS_FSOK      0x7c269d38
+#define UFS_FSACTIVE  ((__s8)0x00)
+#define UFS_FSCLEAN   ((__s8)0x01)
+#define UFS_FSSTABLE  ((__s8)0x02)
+#define UFS_FSOSF1    ((__s8)0x03)	/* is this correct for DEC OSF/1? */
+#define UFS_FSBAD     ((__s8)0xff)
+
+/* Solaris-specific fs_clean values */
+#define UFS_FSSUSPEND ((__s8)0xfe)	/* temporarily suspended */
+#define UFS_FSLOG     ((__s8)0xfd)	/* logging fs */
+#define UFS_FSFIX     ((__s8)0xfc)	/* being repaired while mounted */
+
+/* From here to next blank line, s_flags for ufs_sb_info */
+/* directory entry encoding */
+#define UFS_DE_MASK		0x00000010	/* mask for the following */
+#define UFS_DE_OLD		0x00000000
+#define UFS_DE_44BSD		0x00000010
+/* uid encoding */
+#define UFS_UID_MASK		0x00000060	/* mask for the following */
+#define UFS_UID_OLD		0x00000000
+#define UFS_UID_44BSD		0x00000020
+#define UFS_UID_EFT		0x00000040
+/* superblock state encoding */
+#define UFS_ST_MASK		0x00000700	/* mask for the following */
+#define UFS_ST_OLD		0x00000000
+#define UFS_ST_44BSD		0x00000100
+#define UFS_ST_SUN		0x00000200 /* Solaris */
+#define UFS_ST_SUNOS		0x00000300
+#define UFS_ST_SUNx86		0x00000400 /* Solaris x86 */
+/*cylinder group encoding */
+#define UFS_CG_MASK		0x00003000	/* mask for the following */
+#define UFS_CG_OLD		0x00000000
+#define UFS_CG_44BSD		0x00002000
+#define UFS_CG_SUN		0x00001000
+/* filesystem type encoding */
+#define UFS_TYPE_MASK		0x00010000	/* mask for the following */
+#define UFS_TYPE_UFS1		0x00000000
+#define UFS_TYPE_UFS2		0x00010000
+
+
+/* fs_inodefmt options */
+#define UFS_42INODEFMT	-1
+#define UFS_44INODEFMT	2
+
+/*
+ * MINFREE gives the minimum acceptable percentage of file system
+ * blocks which may be free. If the freelist drops below this level
+ * only the superuser may continue to allocate blocks. This may
+ * be set to 0 if no reserve of free blocks is deemed necessary,
+ * however throughput drops by fifty percent if the file system
+ * is run at between 95% and 100% full; thus the minimum default
+ * value of fs_minfree is 5%. However, to get good clustering
+ * performance, 10% is a better choice. hence we use 10% as our
+ * default value. With 10% free space, fragmentation is not a
+ * problem, so we choose to optimize for time.
+ */
+#define UFS_MINFREE         5
+#define UFS_DEFAULTOPT      UFS_OPTTIME
+
+/*
+ * Turn file system block numbers into disk block addresses.
+ * This maps file system blocks to device size blocks.
+ */
+#define ufs_fsbtodb(uspi, b)	((b) << (uspi)->s_fsbtodb)
+#define	ufs_dbtofsb(uspi, b)	((b) >> (uspi)->s_fsbtodb)
+
+/*
+ * Cylinder group macros to locate things in cylinder groups.
+ * They calc file system addresses of cylinder group data structures.
+ */
+#define	ufs_cgbase(c)	(uspi->s_fpg * (c))
+#define ufs_cgstart(c)	((uspi)->fs_magic == UFS2_MAGIC ?  ufs_cgbase(c) : \
+	(ufs_cgbase(c)  + uspi->s_cgoffset * ((c) & ~uspi->s_cgmask)))
+#define	ufs_cgsblock(c)	(ufs_cgstart(c) + uspi->s_sblkno)	/* super blk */
+#define	ufs_cgcmin(c)	(ufs_cgstart(c) + uspi->s_cblkno)	/* cg block */
+#define	ufs_cgimin(c)	(ufs_cgstart(c) + uspi->s_iblkno)	/* inode blk */
+#define	ufs_cgdmin(c)	(ufs_cgstart(c) + uspi->s_dblkno)	/* 1st data */
+
+/*
+ * Macros for handling inode numbers:
+ *     inode number to file system block offset.
+ *     inode number to cylinder group number.
+ *     inode number to file system block address.
+ */
+#define	ufs_inotocg(x)		((x) / uspi->s_ipg)
+#define	ufs_inotocgoff(x)	((x) % uspi->s_ipg)
+#define	ufs_inotofsba(x)	(((u64)ufs_cgimin(ufs_inotocg(x))) + ufs_inotocgoff(x) / uspi->s_inopf)
+#define	ufs_inotofsbo(x)	((x) % uspi->s_inopf)
+
+/*
+ * Compute the cylinder and rotational position of a cyl block addr.
+ */
+#define ufs_cbtocylno(bno) \
+	((bno) * uspi->s_nspf / uspi->s_spc)
+#define ufs_cbtorpos(bno)				      \
+	((UFS_SB(sb)->s_flags & UFS_CG_SUN) ?		      \
+	 (((((bno) * uspi->s_nspf % uspi->s_spc) %	      \
+	    uspi->s_nsect) *				      \
+	   uspi->s_nrpos) / uspi->s_nsect)		      \
+	 :						      \
+	((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
+	* uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
+	% uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
+	  * uspi->s_nrpos) / uspi->s_npsect))
+
+/*
+ * The following macros optimize certain frequently calculated
+ * quantities by using shifts and masks in place of divisions
+ * modulos and multiplications.
+ */
+#define ufs_blkoff(loc)		((loc) & uspi->s_qbmask)
+#define ufs_fragoff(loc)	((loc) & uspi->s_qfmask)
+#define ufs_lblktosize(blk)	((blk) << uspi->s_bshift)
+#define ufs_lblkno(loc)		((loc) >> uspi->s_bshift)
+#define ufs_numfrags(loc)	((loc) >> uspi->s_fshift)
+#define ufs_blkroundup(size)	(((size) + uspi->s_qbmask) & uspi->s_bmask)
+#define ufs_fragroundup(size)	(((size) + uspi->s_qfmask) & uspi->s_fmask)
+#define ufs_fragstoblks(frags)	((frags) >> uspi->s_fpbshift)
+#define ufs_blkstofrags(blks)	((blks) << uspi->s_fpbshift)
+#define ufs_fragnum(fsb)	((fsb) & uspi->s_fpbmask)
+#define ufs_blknum(fsb)		((fsb) & ~uspi->s_fpbmask)
+
+#define	UFS_MAXNAMLEN 255
+#define UFS_MAXMNTLEN 512
+#define UFS2_MAXMNTLEN 468
+#define UFS2_MAXVOLLEN 32
+#define UFS_MAXCSBUFS 31
+#define UFS_LINK_MAX 32000
+/*
+#define	UFS2_NOCSPTRS	((128 / sizeof(void *)) - 4)
+*/
+#define	UFS2_NOCSPTRS	28
+
+/*
+ * UFS_DIR_PAD defines the directory entries boundaries
+ * (must be a multiple of 4)
+ */
+#define UFS_DIR_PAD			4
+#define UFS_DIR_ROUND			(UFS_DIR_PAD - 1)
+#define UFS_DIR_REC_LEN(name_len)	(((name_len) + 1 + 8 + UFS_DIR_ROUND) & ~UFS_DIR_ROUND)
+
+struct ufs_timeval {
+	__fs32	tv_sec;
+	__fs32	tv_usec;
+};
+
+struct ufs_dir_entry {
+	__fs32  d_ino;			/* inode number of this entry */
+	__fs16  d_reclen;		/* length of this entry */
+	union {
+		__fs16	d_namlen;		/* actual length of d_name */
+		struct {
+			__u8	d_type;		/* file type */
+			__u8	d_namlen;	/* length of string in d_name */
+		} d_44;
+	} d_u;
+	__u8	d_name[UFS_MAXNAMLEN + 1];	/* file name */
+};
+
+struct ufs_csum {
+	__fs32	cs_ndir;	/* number of directories */
+	__fs32	cs_nbfree;	/* number of free blocks */
+	__fs32	cs_nifree;	/* number of free inodes */
+	__fs32	cs_nffree;	/* number of free frags */
+};
+struct ufs2_csum_total {
+	__fs64	cs_ndir;	/* number of directories */
+	__fs64	cs_nbfree;	/* number of free blocks */
+	__fs64	cs_nifree;	/* number of free inodes */
+	__fs64	cs_nffree;	/* number of free frags */
+	__fs64   cs_numclusters;	/* number of free clusters */
+	__fs64   cs_spare[3];	/* future expansion */
+};
+
+struct ufs_csum_core {
+	__u64	cs_ndir;	/* number of directories */
+	__u64	cs_nbfree;	/* number of free blocks */
+	__u64	cs_nifree;	/* number of free inodes */
+	__u64	cs_nffree;	/* number of free frags */
+	__u64   cs_numclusters;	/* number of free clusters */
+};
+
+/*
+ * File system flags
+ */
+#define UFS_UNCLEAN      0x01    /* file system not clean at mount (unused) */
+#define UFS_DOSOFTDEP    0x02    /* file system using soft dependencies */
+#define UFS_NEEDSFSCK    0x04    /* needs sync fsck (FreeBSD compat, unused) */
+#define UFS_INDEXDIRS    0x08    /* kernel supports indexed directories */
+#define UFS_ACLS         0x10    /* file system has ACLs enabled */
+#define UFS_MULTILABEL   0x20    /* file system is MAC multi-label */
+#define UFS_FLAGS_UPDATED 0x80   /* flags have been moved to new location */
+
+#if 0
+/*
+ * This is the actual superblock, as it is laid out on the disk.
+ * Do NOT use this structure, because of sizeof(ufs_super_block) > 512 and
+ * it may occupy several blocks, use
+ * struct ufs_super_block_(first,second,third) instead.
+ */
+struct ufs_super_block {
+	union {
+		struct {
+			__fs32	fs_link;	/* UNUSED */
+		} fs_42;
+		struct {
+			__fs32	fs_state;	/* file system state flag */
+		} fs_sun;
+	} fs_u0;
+	__fs32	fs_rlink;	/* UNUSED */
+	__fs32	fs_sblkno;	/* addr of super-block in filesys */
+	__fs32	fs_cblkno;	/* offset of cyl-block in filesys */
+	__fs32	fs_iblkno;	/* offset of inode-blocks in filesys */
+	__fs32	fs_dblkno;	/* offset of first data after cg */
+	__fs32	fs_cgoffset;	/* cylinder group offset in cylinder */
+	__fs32	fs_cgmask;	/* used to calc mod fs_ntrak */
+	__fs32	fs_time;	/* last time written -- time_t */
+	__fs32	fs_size;	/* number of blocks in fs */
+	__fs32	fs_dsize;	/* number of data blocks in fs */
+	__fs32	fs_ncg;		/* number of cylinder groups */
+	__fs32	fs_bsize;	/* size of basic blocks in fs */
+	__fs32	fs_fsize;	/* size of frag blocks in fs */
+	__fs32	fs_frag;	/* number of frags in a block in fs */
+/* these are configuration parameters */
+	__fs32	fs_minfree;	/* minimum percentage of free blocks */
+	__fs32	fs_rotdelay;	/* num of ms for optimal next block */
+	__fs32	fs_rps;		/* disk revolutions per second */
+/* these fields can be computed from the others */
+	__fs32	fs_bmask;	/* ``blkoff'' calc of blk offsets */
+	__fs32	fs_fmask;	/* ``fragoff'' calc of frag offsets */
+	__fs32	fs_bshift;	/* ``lblkno'' calc of logical blkno */
+	__fs32	fs_fshift;	/* ``numfrags'' calc number of frags */
+/* these are configuration parameters */
+	__fs32	fs_maxcontig;	/* max number of contiguous blks */
+	__fs32	fs_maxbpg;	/* max number of blks per cyl group */
+/* these fields can be computed from the others */
+	__fs32	fs_fragshift;	/* block to frag shift */
+	__fs32	fs_fsbtodb;	/* fsbtodb and dbtofsb shift constant */
+	__fs32	fs_sbsize;	/* actual size of super block */
+	__fs32	fs_csmask;	/* csum block offset */
+	__fs32	fs_csshift;	/* csum block number */
+	__fs32	fs_nindir;	/* value of NINDIR */
+	__fs32	fs_inopb;	/* value of INOPB */
+	__fs32	fs_nspf;	/* value of NSPF */
+/* yet another configuration parameter */
+	__fs32	fs_optim;	/* optimization preference, see below */
+/* these fields are derived from the hardware */
+	union {
+		struct {
+			__fs32	fs_npsect;	/* # sectors/track including spares */
+		} fs_sun;
+		struct {
+			__fs32	fs_state;	/* file system state time stamp */
+		} fs_sunx86;
+	} fs_u1;
+	__fs32	fs_interleave;	/* hardware sector interleave */
+	__fs32	fs_trackskew;	/* sector 0 skew, per track */
+/* a unique id for this filesystem (currently unused and unmaintained) */
+/* In 4.3 Tahoe this space is used by fs_headswitch and fs_trkseek */
+/* Neither of those fields is used in the Tahoe code right now but */
+/* there could be problems if they are.                            */
+	__fs32	fs_id[2];	/* file system id */
+/* sizes determined by number of cylinder groups and their sizes */
+	__fs32	fs_csaddr;	/* blk addr of cyl grp summary area */
+	__fs32	fs_cssize;	/* size of cyl grp summary area */
+	__fs32	fs_cgsize;	/* cylinder group size */
+/* these fields are derived from the hardware */
+	__fs32	fs_ntrak;	/* tracks per cylinder */
+	__fs32	fs_nsect;	/* sectors per track */
+	__fs32	fs_spc;		/* sectors per cylinder */
+/* this comes from the disk driver partitioning */
+	__fs32	fs_ncyl;	/* cylinders in file system */
+/* these fields can be computed from the others */
+	__fs32	fs_cpg;		/* cylinders per group */
+	__fs32	fs_ipg;		/* inodes per cylinder group */
+	__fs32	fs_fpg;		/* blocks per group * fs_frag */
+/* this data must be re-computed after crashes */
+	struct ufs_csum fs_cstotal;	/* cylinder summary information */
+/* these fields are cleared at mount time */
+	__s8	fs_fmod;	/* super block modified flag */
+	__s8	fs_clean;	/* file system is clean flag */
+	__s8	fs_ronly;	/* mounted read-only flag */
+	__s8	fs_flags;
+	union {
+		struct {
+			__s8	fs_fsmnt[UFS_MAXMNTLEN];/* name mounted on */
+			__fs32	fs_cgrotor;	/* last cg searched */
+			__fs32	fs_csp[UFS_MAXCSBUFS];/*list of fs_cs info buffers */
+			__fs32	fs_maxcluster;
+			__fs32	fs_cpc;		/* cyl per cycle in postbl */
+			__fs16	fs_opostbl[16][8]; /* old rotation block list head */
+		} fs_u1;
+		struct {
+			__s8  fs_fsmnt[UFS2_MAXMNTLEN];	/* name mounted on */
+			__u8   fs_volname[UFS2_MAXVOLLEN]; /* volume name */
+			__fs64  fs_swuid;		/* system-wide uid */
+			__fs32  fs_pad;	/* due to alignment of fs_swuid */
+			__fs32   fs_cgrotor;     /* last cg searched */
+			__fs32   fs_ocsp[UFS2_NOCSPTRS]; /*list of fs_cs info buffers */
+			__fs32   fs_contigdirs;/*# of contiguously allocated dirs */
+			__fs32   fs_csp;	/* cg summary info buffer for fs_cs */
+			__fs32   fs_maxcluster;
+			__fs32   fs_active;/* used by snapshots to track fs */
+			__fs32   fs_old_cpc;	/* cyl per cycle in postbl */
+			__fs32   fs_maxbsize;/*maximum blocking factor permitted */
+			__fs64   fs_sparecon64[17];/*old rotation block list head */
+			__fs64   fs_sblockloc; /* byte offset of standard superblock */
+			struct  ufs2_csum_total fs_cstotal;/*cylinder summary information*/
+			struct  ufs_timeval    fs_time;		/* last time written */
+			__fs64    fs_size;		/* number of blocks in fs */
+			__fs64    fs_dsize;	/* number of data blocks in fs */
+			__fs64   fs_csaddr;	/* blk addr of cyl grp summary area */
+			__fs64    fs_pendingblocks;/* blocks in process of being freed */
+			__fs32    fs_pendinginodes;/*inodes in process of being freed */
+		} fs_u2;
+	}  fs_u11;
+	union {
+		struct {
+			__fs32	fs_sparecon[53];/* reserved for future constants */
+			__fs32	fs_reclaim;
+			__fs32	fs_sparecon2[1];
+			__fs32	fs_state;	/* file system state time stamp */
+			__fs32	fs_qbmask[2];	/* ~usb_bmask */
+			__fs32	fs_qfmask[2];	/* ~usb_fmask */
+		} fs_sun;
+		struct {
+			__fs32	fs_sparecon[53];/* reserved for future constants */
+			__fs32	fs_reclaim;
+			__fs32	fs_sparecon2[1];
+			__fs32	fs_npsect;	/* # sectors/track including spares */
+			__fs32	fs_qbmask[2];	/* ~usb_bmask */
+			__fs32	fs_qfmask[2];	/* ~usb_fmask */
+		} fs_sunx86;
+		struct {
+			__fs32	fs_sparecon[50];/* reserved for future constants */
+			__fs32	fs_contigsumsize;/* size of cluster summary array */
+			__fs32	fs_maxsymlinklen;/* max length of an internal symlink */
+			__fs32	fs_inodefmt;	/* format of on-disk inodes */
+			__fs32	fs_maxfilesize[2];	/* max representable file size */
+			__fs32	fs_qbmask[2];	/* ~usb_bmask */
+			__fs32	fs_qfmask[2];	/* ~usb_fmask */
+			__fs32	fs_state;	/* file system state time stamp */
+		} fs_44;
+	} fs_u2;
+	__fs32	fs_postblformat;	/* format of positional layout tables */
+	__fs32	fs_nrpos;		/* number of rotational positions */
+	__fs32	fs_postbloff;		/* (__s16) rotation block list head */
+	__fs32	fs_rotbloff;		/* (__u8) blocks for each rotation */
+	__fs32	fs_magic;		/* magic number */
+	__u8	fs_space[1];		/* list of blocks for each rotation */
+};
+#endif/*struct ufs_super_block*/
+
+/*
+ * Preference for optimization.
+ */
+#define UFS_OPTTIME	0	/* minimize allocation time */
+#define UFS_OPTSPACE	1	/* minimize disk fragmentation */
+
+/*
+ * Rotational layout table format types
+ */
+#define UFS_42POSTBLFMT		-1	/* 4.2BSD rotational table format */
+#define UFS_DYNAMICPOSTBLFMT	1	/* dynamic rotational table format */
+
+/*
+ * Convert cylinder group to base address of its global summary info.
+ */
+#define fs_cs(indx) s_csp[(indx)]
+
+/*
+ * Cylinder group block for a file system.
+ *
+ * Writable fields in the cylinder group are protected by the associated
+ * super block lock fs->fs_lock.
+ */
+#define	CG_MAGIC	0x090255
+#define ufs_cg_chkmagic(sb, ucg) \
+	(fs32_to_cpu((sb), (ucg)->cg_magic) == CG_MAGIC)
+/*
+ * Macros for access to old cylinder group array structures
+ */
+#define ufs_ocg_blktot(sb, ucg)      fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_btot)
+#define ufs_ocg_blks(sb, ucg, cylno) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_b[cylno])
+#define ufs_ocg_inosused(sb, ucg)    fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_iused)
+#define ufs_ocg_blksfree(sb, ucg)    fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_free)
+#define ufs_ocg_chkmagic(sb, ucg) \
+	(fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_magic) == CG_MAGIC)
+
+/*
+ * size of this structure is 172 B
+ */
+struct	ufs_cylinder_group {
+	__fs32	cg_link;		/* linked list of cyl groups */
+	__fs32	cg_magic;		/* magic number */
+	__fs32	cg_time;		/* time last written */
+	__fs32	cg_cgx;			/* we are the cgx'th cylinder group */
+	__fs16	cg_ncyl;		/* number of cyl's this cg */
+	__fs16	cg_niblk;		/* number of inode blocks this cg */
+	__fs32	cg_ndblk;		/* number of data blocks this cg */
+	struct	ufs_csum cg_cs;		/* cylinder summary information */
+	__fs32	cg_rotor;		/* position of last used block */
+	__fs32	cg_frotor;		/* position of last used frag */
+	__fs32	cg_irotor;		/* position of last used inode */
+	__fs32	cg_frsum[UFS_MAXFRAG];	/* counts of available frags */
+	__fs32	cg_btotoff;		/* (__u32) block totals per cylinder */
+	__fs32	cg_boff;		/* (short) free block positions */
+	__fs32	cg_iusedoff;		/* (char) used inode map */
+	__fs32	cg_freeoff;		/* (u_char) free block map */
+	__fs32	cg_nextfreeoff;		/* (u_char) next available space */
+	union {
+		struct {
+			__fs32	cg_clustersumoff;	/* (u_int32) counts of avail clusters */
+			__fs32	cg_clusteroff;		/* (u_int8) free cluster map */
+			__fs32	cg_nclusterblks;	/* number of clusters this cg */
+			__fs32	cg_sparecon[13];	/* reserved for future use */
+		} cg_44;
+		struct {
+			__fs32	cg_clustersumoff;/* (u_int32) counts of avail clusters */
+			__fs32	cg_clusteroff;	/* (u_int8) free cluster map */
+			__fs32	cg_nclusterblks;/* number of clusters this cg */
+			__fs32   cg_niblk; /* number of inode blocks this cg */
+			__fs32   cg_initediblk;	/* last initialized inode */
+			__fs32   cg_sparecon32[3];/* reserved for future use */
+			__fs64   cg_time;	/* time last written */
+			__fs64	cg_sparecon[3];	/* reserved for future use */
+		} cg_u2;
+		__fs32	cg_sparecon[16];	/* reserved for future use */
+	} cg_u;
+	__u8	cg_space[1];		/* space for cylinder group maps */
+/* actually longer */
+};
+
+/* Historic Cylinder group info */
+struct ufs_old_cylinder_group {
+	__fs32	cg_link;		/* linked list of cyl groups */
+	__fs32	cg_rlink;		/* for incore cyl groups     */
+	__fs32	cg_time;		/* time last written */
+	__fs32	cg_cgx;			/* we are the cgx'th cylinder group */
+	__fs16	cg_ncyl;		/* number of cyl's this cg */
+	__fs16	cg_niblk;		/* number of inode blocks this cg */
+	__fs32	cg_ndblk;		/* number of data blocks this cg */
+	struct	ufs_csum cg_cs;		/* cylinder summary information */
+	__fs32	cg_rotor;		/* position of last used block */
+	__fs32	cg_frotor;		/* position of last used frag */
+	__fs32	cg_irotor;		/* position of last used inode */
+	__fs32	cg_frsum[8];		/* counts of available frags */
+	__fs32	cg_btot[32];		/* block totals per cylinder */
+	__fs16	cg_b[32][8];		/* positions of free blocks */
+	__u8	cg_iused[256];		/* used inode map */
+	__fs32	cg_magic;		/* magic number */
+	__u8	cg_free[1];		/* free block map */
+/* actually longer */
+};
+
+/*
+ * structure of an on-disk inode
+ */
+struct ufs_inode {
+	__fs16	ui_mode;		/*  0x0 */
+	__fs16	ui_nlink;		/*  0x2 */
+	union {
+		struct {
+			__fs16	ui_suid;	/*  0x4 */
+			__fs16	ui_sgid;	/*  0x6 */
+		} oldids;
+		__fs32	ui_inumber;		/*  0x4 lsf: inode number */
+		__fs32	ui_author;		/*  0x4 GNU HURD: author */
+	} ui_u1;
+	__fs64	ui_size;		/*  0x8 */
+	struct ufs_timeval ui_atime;	/* 0x10 access */
+	struct ufs_timeval ui_mtime;	/* 0x18 modification */
+	struct ufs_timeval ui_ctime;	/* 0x20 creation */
+	union {
+		struct {
+			__fs32	ui_db[UFS_NDADDR];/* 0x28 data blocks */
+			__fs32	ui_ib[UFS_NINDIR];/* 0x58 indirect blocks */
+		} ui_addr;
+		__u8	ui_symlink[4*(UFS_NDADDR+UFS_NINDIR)];/* 0x28 fast symlink */
+	} ui_u2;
+	__fs32	ui_flags;		/* 0x64 immutable, append-only... */
+	__fs32	ui_blocks;		/* 0x68 blocks in use */
+	__fs32	ui_gen;			/* 0x6c like ext2 i_version, for NFS support */
+	union {
+		struct {
+			__fs32	ui_shadow;	/* 0x70 shadow inode with security data */
+			__fs32	ui_uid;		/* 0x74 long EFT version of uid */
+			__fs32	ui_gid;		/* 0x78 long EFT version of gid */
+			__fs32	ui_oeftflag;	/* 0x7c reserved */
+		} ui_sun;
+		struct {
+			__fs32	ui_uid;		/* 0x70 File owner */
+			__fs32	ui_gid;		/* 0x74 File group */
+			__fs32	ui_spare[2];	/* 0x78 reserved */
+		} ui_44;
+		struct {
+			__fs32	ui_uid;		/* 0x70 */
+			__fs32	ui_gid;		/* 0x74 */
+			__fs16	ui_modeh;	/* 0x78 mode high bits */
+			__fs16	ui_spare;	/* 0x7A unused */
+			__fs32	ui_trans;	/* 0x7c filesystem translator */
+		} ui_hurd;
+	} ui_u3;
+};
+
+#define UFS_NXADDR  2            /* External addresses in inode. */
+struct ufs2_inode {
+	__fs16     ui_mode;        /*   0: IFMT, permissions; see below. */
+	__fs16     ui_nlink;       /*   2: File link count. */
+	__fs32     ui_uid;         /*   4: File owner. */
+	__fs32     ui_gid;         /*   8: File group. */
+	__fs32     ui_blksize;     /*  12: Inode blocksize. */
+	__fs64     ui_size;        /*  16: File byte count. */
+	__fs64     ui_blocks;      /*  24: Bytes actually held. */
+	__fs64   ui_atime;       /*  32: Last access time. */
+	__fs64   ui_mtime;       /*  40: Last modified time. */
+	__fs64   ui_ctime;       /*  48: Last inode change time. */
+	__fs64   ui_birthtime;   /*  56: Inode creation time. */
+	__fs32     ui_mtimensec;   /*  64: Last modified time. */
+	__fs32     ui_atimensec;   /*  68: Last access time. */
+	__fs32     ui_ctimensec;   /*  72: Last inode change time. */
+	__fs32     ui_birthnsec;   /*  76: Inode creation time. */
+	__fs32     ui_gen;         /*  80: Generation number. */
+	__fs32     ui_kernflags;   /*  84: Kernel flags. */
+	__fs32     ui_flags;       /*  88: Status flags (chflags). */
+	__fs32     ui_extsize;     /*  92: External attributes block. */
+	__fs64     ui_extb[UFS_NXADDR];/*  96: External attributes block. */
+	union {
+		struct {
+			__fs64     ui_db[UFS_NDADDR]; /* 112: Direct disk blocks. */
+			__fs64     ui_ib[UFS_NINDIR];/* 208: Indirect disk blocks.*/
+		} ui_addr;
+	__u8	ui_symlink[2*4*(UFS_NDADDR+UFS_NINDIR)];/* 0x28 fast symlink */
+	} ui_u2;
+	__fs64     ui_spare[3];    /* 232: Reserved; currently unused */
+};
+
+
+/* FreeBSD has these in sys/stat.h */
+/* ui_flags that can be set by a file owner */
+#define UFS_UF_SETTABLE   0x0000ffff
+#define UFS_UF_NODUMP     0x00000001  /* do not dump */
+#define UFS_UF_IMMUTABLE  0x00000002  /* immutable (can't "change") */
+#define UFS_UF_APPEND     0x00000004  /* append-only */
+#define UFS_UF_OPAQUE     0x00000008  /* directory is opaque (unionfs) */
+#define UFS_UF_NOUNLINK   0x00000010  /* can't be removed or renamed */
+/* ui_flags that only root can set */
+#define UFS_SF_SETTABLE   0xffff0000
+#define UFS_SF_ARCHIVED   0x00010000  /* archived */
+#define UFS_SF_IMMUTABLE  0x00020000  /* immutable (can't "change") */
+#define UFS_SF_APPEND     0x00040000  /* append-only */
+#define UFS_SF_NOUNLINK   0x00100000  /* can't be removed or renamed */
+
+/*
+ * This structure is used for reading disk structures larger
+ * than the size of fragment.
+ */
+struct ufs_buffer_head {
+	__u64 fragment;			/* first fragment */
+	__u64 count;				/* number of fragments */
+	struct buffer_head * bh[UFS_MAXFRAG];	/* buffers */
+};
+
+struct ufs_cg_private_info {
+	struct ufs_buffer_head c_ubh;
+	__u32	c_cgx;		/* number of cylidner group */
+	__u16	c_ncyl;		/* number of cyl's this cg */
+	__u16	c_niblk;	/* number of inode blocks this cg */
+	__u32	c_ndblk;	/* number of data blocks this cg */
+	__u32	c_rotor;	/* position of last used block */
+	__u32	c_frotor;	/* position of last used frag */
+	__u32	c_irotor;	/* position of last used inode */
+	__u32	c_btotoff;	/* (__u32) block totals per cylinder */
+	__u32	c_boff;		/* (short) free block positions */
+	__u32	c_iusedoff;	/* (char) used inode map */
+	__u32	c_freeoff;	/* (u_char) free block map */
+	__u32	c_nextfreeoff;	/* (u_char) next available space */
+	__u32	c_clustersumoff;/* (u_int32) counts of avail clusters */
+	__u32	c_clusteroff;	/* (u_int8) free cluster map */
+	__u32	c_nclusterblks;	/* number of clusters this cg */
+};
+
+
+struct ufs_sb_private_info {
+	struct ufs_buffer_head s_ubh; /* buffer containing super block */
+	struct ufs_csum_core cs_total;
+	__u32	s_sblkno;	/* offset of super-blocks in filesys */
+	__u32	s_cblkno;	/* offset of cg-block in filesys */
+	__u32	s_iblkno;	/* offset of inode-blocks in filesys */
+	__u32	s_dblkno;	/* offset of first data after cg */
+	__u32	s_cgoffset;	/* cylinder group offset in cylinder */
+	__u32	s_cgmask;	/* used to calc mod fs_ntrak */
+	__u32	s_size;		/* number of blocks (fragments) in fs */
+	__u32	s_dsize;	/* number of data blocks in fs */
+	__u64	s_u2_size;	/* ufs2: number of blocks (fragments) in fs */
+	__u64	s_u2_dsize;	/*ufs2:  number of data blocks in fs */
+	__u32	s_ncg;		/* number of cylinder groups */
+	__u32	s_bsize;	/* size of basic blocks */
+	__u32	s_fsize;	/* size of fragments */
+	__u32	s_fpb;		/* fragments per block */
+	__u32	s_minfree;	/* minimum percentage of free blocks */
+	__u32	s_bmask;	/* `blkoff'' calc of blk offsets */
+	__u32	s_fmask;	/* s_fsize mask */
+	__u32	s_bshift;	/* `lblkno'' calc of logical blkno */
+	__u32   s_fshift;	/* s_fsize shift */
+	__u32	s_fpbshift;	/* fragments per block shift */
+	__u32	s_fsbtodb;	/* fsbtodb and dbtofsb shift constant */
+	__u32	s_sbsize;	/* actual size of super block */
+	__u32   s_csmask;	/* csum block offset */
+	__u32	s_csshift;	/* csum block number */
+	__u32	s_nindir;	/* value of NINDIR */
+	__u32	s_inopb;	/* value of INOPB */
+	__u32	s_nspf;		/* value of NSPF */
+	__u32	s_npsect;	/* # sectors/track including spares */
+	__u32	s_interleave;	/* hardware sector interleave */
+	__u32	s_trackskew;	/* sector 0 skew, per track */
+	__u64	s_csaddr;	/* blk addr of cyl grp summary area */
+	__u32	s_cssize;	/* size of cyl grp summary area */
+	__u32	s_cgsize;	/* cylinder group size */
+	__u32	s_ntrak;	/* tracks per cylinder */
+	__u32	s_nsect;	/* sectors per track */
+	__u32	s_spc;		/* sectors per cylinder */
+	__u32	s_ipg;		/* inodes per cylinder group */
+	__u32	s_fpg;		/* fragments per group */
+	__u32	s_cpc;		/* cyl per cycle in postbl */
+	__s32	s_contigsumsize;/* size of cluster summary array, 44bsd */
+	__s64	s_qbmask;	/* ~usb_bmask */
+	__s64	s_qfmask;	/* ~usb_fmask */
+	__s32	s_postblformat;	/* format of positional layout tables */
+	__s32	s_nrpos;	/* number of rotational positions */
+        __s32	s_postbloff;	/* (__s16) rotation block list head */
+	__s32	s_rotbloff;	/* (__u8) blocks for each rotation */
+
+	__u32	s_fpbmask;	/* fragments per block mask */
+	__u32	s_apb;		/* address per block */
+	__u32	s_2apb;		/* address per block^2 */
+	__u32	s_3apb;		/* address per block^3 */
+	__u32	s_apbmask;	/* address per block mask */
+	__u32	s_apbshift;	/* address per block shift */
+	__u32	s_2apbshift;	/* address per block shift * 2 */
+	__u32	s_3apbshift;	/* address per block shift * 3 */
+	__u32	s_nspfshift;	/* number of sector per fragment shift */
+	__u32	s_nspb;		/* number of sector per block */
+	__u32	s_inopf;	/* inodes per fragment */
+	__u32	s_sbbase;	/* offset of NeXTstep superblock */
+	__u32	s_bpf;		/* bits per fragment */
+	__u32	s_bpfshift;	/* bits per fragment shift*/
+	__u32	s_bpfmask;	/* bits per fragment mask */
+
+	__u32	s_maxsymlinklen;/* upper limit on fast symlinks' size */
+	__s32	fs_magic;       /* filesystem magic */
+	unsigned int s_dirblksize;
+};
+
+/*
+ * Sizes of this structures are:
+ *	ufs_super_block_first	512
+ *	ufs_super_block_second	512
+ *	ufs_super_block_third	356
+ */
+struct ufs_super_block_first {
+	union {
+		struct {
+			__fs32	fs_link;	/* UNUSED */
+		} fs_42;
+		struct {
+			__fs32	fs_state;	/* file system state flag */
+		} fs_sun;
+	} fs_u0;
+	__fs32	fs_rlink;
+	__fs32	fs_sblkno;
+	__fs32	fs_cblkno;
+	__fs32	fs_iblkno;
+	__fs32	fs_dblkno;
+	__fs32	fs_cgoffset;
+	__fs32	fs_cgmask;
+	__fs32	fs_time;
+	__fs32	fs_size;
+	__fs32	fs_dsize;
+	__fs32	fs_ncg;
+	__fs32	fs_bsize;
+	__fs32	fs_fsize;
+	__fs32	fs_frag;
+	__fs32	fs_minfree;
+	__fs32	fs_rotdelay;
+	__fs32	fs_rps;
+	__fs32	fs_bmask;
+	__fs32	fs_fmask;
+	__fs32	fs_bshift;
+	__fs32	fs_fshift;
+	__fs32	fs_maxcontig;
+	__fs32	fs_maxbpg;
+	__fs32	fs_fragshift;
+	__fs32	fs_fsbtodb;
+	__fs32	fs_sbsize;
+	__fs32	fs_csmask;
+	__fs32	fs_csshift;
+	__fs32	fs_nindir;
+	__fs32	fs_inopb;
+	__fs32	fs_nspf;
+	__fs32	fs_optim;
+	union {
+		struct {
+			__fs32	fs_npsect;
+		} fs_sun;
+		struct {
+			__fs32	fs_state;
+		} fs_sunx86;
+	} fs_u1;
+	__fs32	fs_interleave;
+	__fs32	fs_trackskew;
+	__fs32	fs_id[2];
+	__fs32	fs_csaddr;
+	__fs32	fs_cssize;
+	__fs32	fs_cgsize;
+	__fs32	fs_ntrak;
+	__fs32	fs_nsect;
+	__fs32	fs_spc;
+	__fs32	fs_ncyl;
+	__fs32	fs_cpg;
+	__fs32	fs_ipg;
+	__fs32	fs_fpg;
+	struct ufs_csum fs_cstotal;
+	__s8	fs_fmod;
+	__s8	fs_clean;
+	__s8	fs_ronly;
+	__s8	fs_flags;
+	__s8	fs_fsmnt[UFS_MAXMNTLEN - 212];
+
+};
+
+struct ufs_super_block_second {
+	union {
+		struct {
+			__s8	fs_fsmnt[212];
+			__fs32	fs_cgrotor;
+			__fs32	fs_csp[UFS_MAXCSBUFS];
+			__fs32	fs_maxcluster;
+			__fs32	fs_cpc;
+			__fs16	fs_opostbl[82];
+		} fs_u1;
+		struct {
+			__s8  fs_fsmnt[UFS2_MAXMNTLEN - UFS_MAXMNTLEN + 212];
+			__u8   fs_volname[UFS2_MAXVOLLEN];
+			__fs64  fs_swuid;
+			__fs32  fs_pad;
+			__fs32   fs_cgrotor;
+			__fs32   fs_ocsp[UFS2_NOCSPTRS];
+			__fs32   fs_contigdirs;
+			__fs32   fs_csp;
+			__fs32   fs_maxcluster;
+			__fs32   fs_active;
+			__fs32   fs_old_cpc;
+			__fs32   fs_maxbsize;
+			__fs64   fs_sparecon64[17];
+			__fs64   fs_sblockloc;
+			__fs64	cs_ndir;
+			__fs64	cs_nbfree;
+		} fs_u2;
+	} fs_un;
+};
+
+struct ufs_super_block_third {
+	union {
+		struct {
+			__fs16	fs_opostbl[46];
+		} fs_u1;
+		struct {
+			__fs64	cs_nifree;	/* number of free inodes */
+			__fs64	cs_nffree;	/* number of free frags */
+			__fs64   cs_numclusters;	/* number of free clusters */
+			__fs64   cs_spare[3];	/* future expansion */
+			struct  ufs_timeval    fs_time;		/* last time written */
+			__fs64    fs_size;		/* number of blocks in fs */
+			__fs64    fs_dsize;	/* number of data blocks in fs */
+			__fs64   fs_csaddr;	/* blk addr of cyl grp summary area */
+			__fs64    fs_pendingblocks;/* blocks in process of being freed */
+			__fs32    fs_pendinginodes;/*inodes in process of being freed */
+		} __attribute__ ((packed)) fs_u2;
+	} fs_un1;
+	union {
+		struct {
+			__fs32	fs_sparecon[53];/* reserved for future constants */
+			__fs32	fs_reclaim;
+			__fs32	fs_sparecon2[1];
+			__fs32	fs_state;	/* file system state time stamp */
+			__fs32	fs_qbmask[2];	/* ~usb_bmask */
+			__fs32	fs_qfmask[2];	/* ~usb_fmask */
+		} fs_sun;
+		struct {
+			__fs32	fs_sparecon[53];/* reserved for future constants */
+			__fs32	fs_reclaim;
+			__fs32	fs_sparecon2[1];
+			__fs32	fs_npsect;	/* # sectors/track including spares */
+			__fs32	fs_qbmask[2];	/* ~usb_bmask */
+			__fs32	fs_qfmask[2];	/* ~usb_fmask */
+		} fs_sunx86;
+		struct {
+			__fs32	fs_sparecon[50];/* reserved for future constants */
+			__fs32	fs_contigsumsize;/* size of cluster summary array */
+			__fs32	fs_maxsymlinklen;/* max length of an internal symlink */
+			__fs32	fs_inodefmt;	/* format of on-disk inodes */
+			__fs32	fs_maxfilesize[2];	/* max representable file size */
+			__fs32	fs_qbmask[2];	/* ~usb_bmask */
+			__fs32	fs_qfmask[2];	/* ~usb_fmask */
+			__fs32	fs_state;	/* file system state time stamp */
+		} fs_44;
+	} fs_un2;
+	__fs32	fs_postblformat;
+	__fs32	fs_nrpos;
+	__fs32	fs_postbloff;
+	__fs32	fs_rotbloff;
+	__fs32	fs_magic;
+	__u8	fs_space[1];
+};
+
+#endif /* __LINUX_UFS_FS_H */
diff --git a/kernel/fs/ufs/util.c b/kernel/fs/ufs/util.c
new file mode 100644
index 000000000..b6c2f94e0
--- /dev/null
+++ b/kernel/fs/ufs/util.c
@@ -0,0 +1,282 @@
+/*
+ *  linux/fs/ufs/util.c
+ *
+ * Copyright (C) 1998
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ */
+ 
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+
+#include "ufs_fs.h"
+#include "ufs.h"
+#include "swab.h"
+#include "util.h"
+
+struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
+	struct super_block *sb, u64 fragment, u64 size)
+{
+	struct ufs_buffer_head * ubh;
+	unsigned i, j ;
+	u64  count = 0;
+	if (size & ~uspi->s_fmask)
+		return NULL;
+	count = size >> uspi->s_fshift;
+	if (count > UFS_MAXFRAG)
+		return NULL;
+	ubh = kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
+	if (!ubh)
+		return NULL;
+	ubh->fragment = fragment;
+	ubh->count = count;
+	for (i = 0; i < count; i++)
+		if (!(ubh->bh[i] = sb_bread(sb, fragment + i)))
+			goto failed;
+	for (; i < UFS_MAXFRAG; i++)
+		ubh->bh[i] = NULL;
+	return ubh;
+failed:
+	for (j = 0; j < i; j++)
+		brelse (ubh->bh[j]);
+	kfree(ubh);
+	return NULL;
+}
+
+struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi,
+	struct super_block *sb, u64 fragment, u64 size)
+{
+	unsigned i, j;
+	u64 count = 0;
+	if (size & ~uspi->s_fmask)
+		return NULL;
+	count = size >> uspi->s_fshift;
+	if (count <= 0 || count > UFS_MAXFRAG)
+		return NULL;
+	USPI_UBH(uspi)->fragment = fragment;
+	USPI_UBH(uspi)->count = count;
+	for (i = 0; i < count; i++)
+		if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i)))
+			goto failed;
+	for (; i < UFS_MAXFRAG; i++)
+		USPI_UBH(uspi)->bh[i] = NULL;
+	return USPI_UBH(uspi);
+failed:
+	for (j = 0; j < i; j++)
+		brelse (USPI_UBH(uspi)->bh[j]);
+	return NULL;
+}
+
+void ubh_brelse (struct ufs_buffer_head * ubh)
+{
+	unsigned i;
+	if (!ubh)
+		return;
+	for (i = 0; i < ubh->count; i++)
+		brelse (ubh->bh[i]);
+	kfree (ubh);
+}
+
+void ubh_brelse_uspi (struct ufs_sb_private_info * uspi)
+{
+	unsigned i;
+	if (!USPI_UBH(uspi))
+		return;
+	for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) {
+		brelse (USPI_UBH(uspi)->bh[i]);
+		USPI_UBH(uspi)->bh[i] = NULL;
+	}
+}
+
+void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh)
+{
+	unsigned i;
+	if (!ubh)
+		return;
+	for ( i = 0; i < ubh->count; i++ )
+		mark_buffer_dirty (ubh->bh[i]);
+}
+
+void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
+{
+	unsigned i;
+	if (!ubh)
+		return;
+	if (flag) {
+		for ( i = 0; i < ubh->count; i++ )
+			set_buffer_uptodate (ubh->bh[i]);
+	} else {
+		for ( i = 0; i < ubh->count; i++ )
+			clear_buffer_uptodate (ubh->bh[i]);
+	}
+}
+
+void ubh_sync_block(struct ufs_buffer_head *ubh)
+{
+	if (ubh) {
+		unsigned i;
+
+		for (i = 0; i < ubh->count; i++)
+			write_dirty_buffer(ubh->bh[i], WRITE);
+
+		for (i = 0; i < ubh->count; i++)
+			wait_on_buffer(ubh->bh[i]);
+	}
+}
+
+void ubh_bforget (struct ufs_buffer_head * ubh)
+{
+	unsigned i;
+	if (!ubh) 
+		return;
+	for ( i = 0; i < ubh->count; i++ ) if ( ubh->bh[i] ) 
+		bforget (ubh->bh[i]);
+}
+ 
+int ubh_buffer_dirty (struct ufs_buffer_head * ubh)
+{
+	unsigned i;
+	unsigned result = 0;
+	if (!ubh)
+		return 0;
+	for ( i = 0; i < ubh->count; i++ )
+		result |= buffer_dirty(ubh->bh[i]);
+	return result;
+}
+
+void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi, 
+	unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size)
+{
+	unsigned len, bhno;
+	if (size > (ubh->count << uspi->s_fshift))
+		size = ubh->count << uspi->s_fshift;
+	bhno = 0;
+	while (size) {
+		len = min_t(unsigned int, size, uspi->s_fsize);
+		memcpy (mem, ubh->bh[bhno]->b_data, len);
+		mem += uspi->s_fsize;
+		size -= len;
+		bhno++;
+	}
+}
+
+void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi, 
+	struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size)
+{
+	unsigned len, bhno;
+	if (size > (ubh->count << uspi->s_fshift))
+		size = ubh->count << uspi->s_fshift;
+	bhno = 0;
+	while (size) {
+		len = min_t(unsigned int, size, uspi->s_fsize);
+		memcpy (ubh->bh[bhno]->b_data, mem, len);
+		mem += uspi->s_fsize;
+		size -= len;
+		bhno++;
+	}
+}
+
+dev_t
+ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
+{
+	__u32 fs32;
+	dev_t dev;
+
+	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
+		fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]);
+	else
+		fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]);
+	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
+	case UFS_ST_SUNx86:
+	case UFS_ST_SUN:
+		if ((fs32 & 0xffff0000) == 0 ||
+		    (fs32 & 0xffff0000) == 0xffff0000)
+			dev = old_decode_dev(fs32 & 0x7fff);
+		else
+			dev = MKDEV(sysv_major(fs32), sysv_minor(fs32));
+		break;
+
+	default:
+		dev = old_decode_dev(fs32);
+		break;
+	}
+	return dev;
+}
+
+void
+ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev)
+{
+	__u32 fs32;
+
+	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
+	case UFS_ST_SUNx86:
+	case UFS_ST_SUN:
+		fs32 = sysv_encode_dev(dev);
+		if ((fs32 & 0xffff8000) == 0) {
+			fs32 = old_encode_dev(dev);
+		}
+		break;
+
+	default:
+		fs32 = old_encode_dev(dev);
+		break;
+	}
+	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
+		ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32);
+	else
+		ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32);
+}
+
+/**
+ * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist
+ * read it from disk.
+ * @mapping: the address_space to search
+ * @index: the page index
+ *
+ * Locates the desired pagecache page, if not exist we'll read it,
+ * locks it, increments its reference
+ * count and returns its address.
+ *
+ */
+
+struct page *ufs_get_locked_page(struct address_space *mapping,
+				 pgoff_t index)
+{
+	struct page *page;
+
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		page = read_mapping_page(mapping, index, NULL);
+
+		if (IS_ERR(page)) {
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "read_mapping_page error: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+			goto out;
+		}
+
+		lock_page(page);
+
+		if (unlikely(page->mapping == NULL)) {
+			/* Truncate got there first */
+			unlock_page(page);
+			page_cache_release(page);
+			page = NULL;
+			goto out;
+		}
+
+		if (!PageUptodate(page) || PageError(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "can not read page: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+
+			page = ERR_PTR(-EIO);
+		}
+	}
+out:
+	return page;
+}
diff --git a/kernel/fs/ufs/util.h b/kernel/fs/ufs/util.h
new file mode 100644
index 000000000..954175928
--- /dev/null
+++ b/kernel/fs/ufs/util.h
@@ -0,0 +1,592 @@
+/*
+ *  linux/fs/ufs/util.h
+ *
+ * Copyright (C) 1998 
+ * Daniel Pirkl <daniel.pirkl@email.cz>
+ * Charles University, Faculty of Mathematics and Physics
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include "swab.h"
+
+
+/*
+ * some useful macros
+ */
+#define in_range(b,first,len)	((b)>=(first)&&(b)<(first)+(len))
+
+/*
+ * functions used for retyping
+ */
+static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi)
+{
+	return &cpi->c_ubh;
+}
+static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
+{
+	return &spi->s_ubh;
+}
+
+
+
+/*
+ * macros used for accessing structures
+ */
+static inline s32
+ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
+		 struct ufs_super_block_third *usb3)
+{
+	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
+	case UFS_ST_SUNOS:
+		if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT)
+			return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state);
+		/* Fall Through to UFS_ST_SUN */
+	case UFS_ST_SUN:
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
+	case UFS_ST_SUNx86:
+		return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state);
+	case UFS_ST_44BSD:
+	default:
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state);
+	}
+}
+
+static inline void
+ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
+		 struct ufs_super_block_third *usb3, s32 value)
+{
+	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
+	case UFS_ST_SUNOS:
+		if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) {
+			usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value);
+			break;
+		}
+		/* Fall Through to UFS_ST_SUN */
+	case UFS_ST_SUN:
+		usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
+		break;
+	case UFS_ST_SUNx86:
+		usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value);
+		break;
+	case UFS_ST_44BSD:
+		usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value);
+		break;
+	}
+}
+
+static inline u32
+ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1,
+		  struct ufs_super_block_third *usb3)
+{
+	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect);
+	else
+		return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect);
+}
+
+static inline u64
+ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3)
+{
+	__fs64 tmp;
+
+	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
+	case UFS_ST_SUNOS:
+	case UFS_ST_SUN:
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1];
+		break;
+	case UFS_ST_SUNx86:
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1];
+		break;
+	case UFS_ST_44BSD:
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1];
+		break;
+	}
+
+	return fs64_to_cpu(sb, tmp);
+}
+
+static inline u64
+ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3)
+{
+	__fs64 tmp;
+
+	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
+	case UFS_ST_SUNOS:
+	case UFS_ST_SUN:
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1];
+		break;
+	case UFS_ST_SUNx86:
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1];
+		break;
+	case UFS_ST_44BSD:
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1];
+		break;
+	}
+
+	return fs64_to_cpu(sb, tmp);
+}
+
+static inline u16
+ufs_get_de_namlen(struct super_block *sb, struct ufs_dir_entry *de)
+{
+	if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD)
+		return fs16_to_cpu(sb, de->d_u.d_namlen);
+	else
+		return de->d_u.d_44.d_namlen; /* XXX this seems wrong */
+}
+
+static inline void
+ufs_set_de_namlen(struct super_block *sb, struct ufs_dir_entry *de, u16 value)
+{
+	if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD)
+		de->d_u.d_namlen = cpu_to_fs16(sb, value);
+	else
+		de->d_u.d_44.d_namlen = value; /* XXX this seems wrong */
+}
+
+static inline void
+ufs_set_de_type(struct super_block *sb, struct ufs_dir_entry *de, int mode)
+{
+	if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) != UFS_DE_44BSD)
+		return;
+
+	/*
+	 * TODO turn this into a table lookup
+	 */
+	switch (mode & S_IFMT) {
+	case S_IFSOCK:
+		de->d_u.d_44.d_type = DT_SOCK;
+		break;
+	case S_IFLNK:
+		de->d_u.d_44.d_type = DT_LNK;
+		break;
+	case S_IFREG:
+		de->d_u.d_44.d_type = DT_REG;
+		break;
+	case S_IFBLK:
+		de->d_u.d_44.d_type = DT_BLK;
+		break;
+	case S_IFDIR:
+		de->d_u.d_44.d_type = DT_DIR;
+		break;
+	case S_IFCHR:
+		de->d_u.d_44.d_type = DT_CHR;
+		break;
+	case S_IFIFO:
+		de->d_u.d_44.d_type = DT_FIFO;
+		break;
+	default:
+		de->d_u.d_44.d_type = DT_UNKNOWN;
+	}
+}
+
+static inline u32
+ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode)
+{
+	switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) {
+	case UFS_UID_44BSD:
+		return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_uid);
+	case UFS_UID_EFT:
+		if (inode->ui_u1.oldids.ui_suid == 0xFFFF)
+			return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_uid);
+		/* Fall through */
+	default:
+		return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_suid);
+	}
+}
+
+static inline void
+ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value)
+{
+	switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) {
+	case UFS_UID_44BSD:
+		inode->ui_u3.ui_44.ui_uid = cpu_to_fs32(sb, value);
+		inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value);
+		break;
+	case UFS_UID_EFT:
+		inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value);
+		if (value > 0xFFFF)
+			value = 0xFFFF;
+		/* Fall through */
+	default:
+		inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value);
+		break;
+	}
+}
+
+static inline u32
+ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode)
+{
+	switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) {
+	case UFS_UID_44BSD:
+		return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid);
+	case UFS_UID_EFT:
+		if (inode->ui_u1.oldids.ui_suid == 0xFFFF)
+			return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid);
+		/* Fall through */
+	default:
+		return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid);
+	}
+}
+
+static inline void
+ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value)
+{
+	switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) {
+	case UFS_UID_44BSD:
+		inode->ui_u3.ui_44.ui_gid = cpu_to_fs32(sb, value);
+		inode->ui_u1.oldids.ui_sgid =  cpu_to_fs16(sb, value);
+		break;
+	case UFS_UID_EFT:
+		inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value);
+		if (value > 0xFFFF)
+			value = 0xFFFF;
+		/* Fall through */
+	default:
+		inode->ui_u1.oldids.ui_sgid =  cpu_to_fs16(sb, value);
+		break;
+	}
+}
+
+extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *);
+extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t);
+extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len);
+
+/*
+ * These functions manipulate ufs buffers
+ */
+#define ubh_bread(sb,fragment,size) _ubh_bread_(uspi,sb,fragment,size)  
+extern struct ufs_buffer_head * _ubh_bread_(struct ufs_sb_private_info *, struct super_block *, u64 , u64);
+extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, struct super_block *, u64, u64);
+extern void ubh_brelse (struct ufs_buffer_head *);
+extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
+extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
+extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
+extern void ubh_sync_block(struct ufs_buffer_head *);
+extern void ubh_bforget (struct ufs_buffer_head *);
+extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
+#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
+extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned);
+#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size)
+extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
+
+/* This functions works with cache pages*/
+extern struct page *ufs_get_locked_page(struct address_space *mapping,
+					pgoff_t index);
+static inline void ufs_put_locked_page(struct page *page)
+{
+       unlock_page(page);
+       page_cache_release(page);
+}
+
+
+/*
+ * macros and inline function to get important structures from ufs_sb_private_info
+ */
+
+static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
+				   unsigned int offset)
+{
+	unsigned int index;
+	
+	index = offset >> uspi->s_fshift;
+	offset &= ~uspi->s_fmask;
+	return uspi->s_ubh.bh[index]->b_data + offset;
+}
+
+#define ubh_get_usb_first(uspi) \
+	((struct ufs_super_block_first *)get_usb_offset((uspi), 0))
+
+#define ubh_get_usb_second(uspi) \
+	((struct ufs_super_block_second *)get_usb_offset((uspi), UFS_SECTOR_SIZE))
+
+#define ubh_get_usb_third(uspi)	\
+	((struct ufs_super_block_third *)get_usb_offset((uspi), 2*UFS_SECTOR_SIZE))
+
+
+#define ubh_get_ucg(ubh) \
+	((struct ufs_cylinder_group *)((ubh)->bh[0]->b_data))
+
+
+/*
+ * Extract byte from ufs_buffer_head
+ * Extract the bits for a block from a map inside ufs_buffer_head
+ */
+#define ubh_get_addr8(ubh,begin) \
+	((u8*)(ubh)->bh[(begin) >> uspi->s_fshift]->b_data + \
+	((begin) & ~uspi->s_fmask))
+
+#define ubh_get_addr16(ubh,begin) \
+	(((__fs16*)((ubh)->bh[(begin) >> (uspi->s_fshift-1)]->b_data)) + \
+	((begin) & ((uspi->fsize>>1) - 1)))
+
+#define ubh_get_addr32(ubh,begin) \
+	(((__fs32*)((ubh)->bh[(begin) >> (uspi->s_fshift-2)]->b_data)) + \
+	((begin) & ((uspi->s_fsize>>2) - 1)))
+
+#define ubh_get_addr64(ubh,begin) \
+	(((__fs64*)((ubh)->bh[(begin) >> (uspi->s_fshift-3)]->b_data)) + \
+	((begin) & ((uspi->s_fsize>>3) - 1)))
+
+#define ubh_get_addr ubh_get_addr8
+
+static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi,
+				     struct ufs_buffer_head *ubh,
+				     u64 blk)
+{
+	if (uspi->fs_magic == UFS2_MAGIC)
+		return ubh_get_addr64(ubh, blk);
+	else
+		return ubh_get_addr32(ubh, blk);
+}
+
+#define ubh_blkmap(ubh,begin,bit) \
+	((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
+
+/*
+ * Determine the number of available frags given a
+ * percentage to hold in reserve.
+ */
+static inline u64
+ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
+{
+	return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+		uspi->cs_total.cs_nffree -
+		(uspi->s_dsize * (percentreserved) / 100);
+}
+
+/*
+ * Macros to access cylinder group array structures
+ */
+#define ubh_cg_blktot(ucpi,cylno) \
+	(*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2))))
+
+#define ubh_cg_blks(ucpi,cylno,rpos) \
+	(*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \
+	(ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 ))))
+
+/*
+ * Bitmap operations
+ * These functions work like classical bitmap operations.
+ * The difference is that we don't have the whole bitmap
+ * in one contiguous chunk of memory, but in several buffers.
+ * The parameters of each function are super_block, ufs_buffer_head and
+ * position of the beginning of the bitmap.
+ */
+#define ubh_setbit(ubh,begin,bit) \
+	(*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) |= (1 << ((bit) & 7)))
+
+#define ubh_clrbit(ubh,begin,bit) \
+	(*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) &= ~(1 << ((bit) & 7)))
+
+#define ubh_isset(ubh,begin,bit) \
+	(*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) & (1 << ((bit) & 7)))
+
+#define ubh_isclr(ubh,begin,bit) (!ubh_isset(ubh,begin,bit))
+
+#define ubh_find_first_zero_bit(ubh,begin,size) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,0)
+
+#define ubh_find_next_zero_bit(ubh,begin,size,offset) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,offset)
+static inline unsigned _ubh_find_next_zero_bit_(
+	struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh,
+	unsigned begin, unsigned size, unsigned offset)
+{
+	unsigned base, count, pos;
+
+	size -= offset;
+	begin <<= 3;
+	offset += begin;
+	base = offset >> uspi->s_bpfshift;
+	offset &= uspi->s_bpfmask;
+	for (;;) {
+		count = min_t(unsigned int, size + offset, uspi->s_bpf);
+		size -= count - offset;
+		pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset);
+		if (pos < count || !size)
+			break;
+		base++;
+		offset = 0;
+	}
+	return (base << uspi->s_bpfshift) + pos - begin;
+} 	
+
+static inline unsigned find_last_zero_bit (unsigned char * bitmap,
+	unsigned size, unsigned offset)
+{
+	unsigned bit, i;
+	unsigned char * mapp;
+	unsigned char map;
+
+	mapp = bitmap + (size >> 3);
+	map = *mapp--;
+	bit = 1 << (size & 7);
+	for (i = size; i > offset; i--) {
+		if ((map & bit) == 0)
+			break;
+		if ((i & 7) != 0) {
+			bit >>= 1;
+		} else {
+			map = *mapp--;
+			bit = 1 << 7;
+		}
+	}
+	return i;
+}
+
+#define ubh_find_last_zero_bit(ubh,begin,size,offset) _ubh_find_last_zero_bit_(uspi,ubh,begin,size,offset)
+static inline unsigned _ubh_find_last_zero_bit_(
+	struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh,
+	unsigned begin, unsigned start, unsigned end)
+{
+	unsigned base, count, pos, size;
+
+	size = start - end;
+	begin <<= 3;
+	start += begin;
+	base = start >> uspi->s_bpfshift;
+	start &= uspi->s_bpfmask;
+	for (;;) {
+		count = min_t(unsigned int,
+			    size + (uspi->s_bpf - start), uspi->s_bpf)
+			- (uspi->s_bpf - start);
+		size -= count;
+		pos = find_last_zero_bit (ubh->bh[base]->b_data,
+			start, start - count);
+		if (pos > start - count || !size)
+			break;
+		base--;
+		start = uspi->s_bpf;
+	}
+	return (base << uspi->s_bpfshift) + pos - begin;
+} 	
+
+#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block))
+
+#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block)
+static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi,
+	struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+{
+	switch (uspi->s_fpb) {
+	case 8:
+	    	return (*ubh_get_addr (ubh, begin + block) == 0xff);
+	case 4:
+		return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2)));
+	case 2:
+		return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1)));
+	case 1:
+		return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07)));
+	}
+	return 0;	
+}
+
+#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block)
+static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi,
+	struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+{
+	switch (uspi->s_fpb) {
+	case 8:
+	    	*ubh_get_addr (ubh, begin + block) = 0x00;
+	    	return; 
+	case 4:
+		*ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2));
+		return;
+	case 2:
+		*ubh_get_addr (ubh, begin + (block >> 2)) &= ~(0x03 << ((block & 0x03) << 1));
+		return;
+	case 1:
+		*ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07)));
+		return;
+	}
+}
+
+#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block)
+static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi,
+	struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+{
+	switch (uspi->s_fpb) {
+	case 8:
+	    	*ubh_get_addr(ubh, begin + block) = 0xff;
+	    	return;
+	case 4:
+		*ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2));
+		return;
+	case 2:
+		*ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1));
+		return;
+	case 1:
+		*ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07)));
+		return;
+	}
+}
+
+static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap,
+	__fs32 * fraglist, int cnt)
+{
+	struct ufs_sb_private_info * uspi;
+	unsigned fragsize, pos;
+	
+	uspi = UFS_SB(sb)->s_uspi;
+	
+	fragsize = 0;
+	for (pos = 0; pos < uspi->s_fpb; pos++) {
+		if (blockmap & (1 << pos)) {
+			fragsize++;
+		}
+		else if (fragsize > 0) {
+			fs32_add(sb, &fraglist[fragsize], cnt);
+			fragsize = 0;
+		}
+	}
+	if (fragsize > 0 && fragsize < uspi->s_fpb)
+		fs32_add(sb, &fraglist[fragsize], cnt);
+}
+
+static inline void *ufs_get_direct_data_ptr(struct ufs_sb_private_info *uspi,
+					    struct ufs_inode_info *ufsi,
+					    unsigned blk)
+{
+	BUG_ON(blk > UFS_TIND_BLOCK);
+	return uspi->fs_magic == UFS2_MAGIC ?
+		(void *)&ufsi->i_u1.u2_i_data[blk] :
+		(void *)&ufsi->i_u1.i_data[blk];
+}
+
+static inline u64 ufs_data_ptr_to_cpu(struct super_block *sb, void *p)
+{
+	return UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC ?
+		fs64_to_cpu(sb, *(__fs64 *)p) :
+		fs32_to_cpu(sb, *(__fs32 *)p);
+}
+
+static inline void ufs_cpu_to_data_ptr(struct super_block *sb, void *p, u64 val)
+{
+	if (UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC)
+		*(__fs64 *)p = cpu_to_fs64(sb, val);
+	else
+		*(__fs32 *)p = cpu_to_fs32(sb, val);
+}
+
+static inline void ufs_data_ptr_clear(struct ufs_sb_private_info *uspi,
+				      void *p)
+{
+	if (uspi->fs_magic == UFS2_MAGIC)
+		*(__fs64 *)p = 0;
+	else
+		*(__fs32 *)p = 0;
+}
+
+static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi,
+				       void *p)
+{
+	if (uspi->fs_magic == UFS2_MAGIC)
+		return *(__fs64 *)p == 0;
+	else
+		return *(__fs32 *)p == 0;
+}
diff --git a/kernel/fs/utimes.c b/kernel/fs/utimes.c
new file mode 100644
index 000000000..aa138d645
--- /dev/null
+++ b/kernel/fs/utimes.c
@@ -0,0 +1,235 @@
+#include <linux/compiler.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/linkage.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+#include <linux/stat.h>
+#include <linux/utime.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+#ifdef __ARCH_WANT_SYS_UTIME
+
+/*
+ * sys_utime() can be implemented in user-level using sys_utimes().
+ * Is this for backwards compatibility?  If so, why not move it
+ * into the appropriate arch directory (for those architectures that
+ * need it).
+ */
+
+/* If times==NULL, set access and modification to current time,
+ * must be owner or have write permission.
+ * Else, update from *times, must be owner or super user.
+ */
+SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
+{
+	struct timespec tv[2];
+
+	if (times) {
+		if (get_user(tv[0].tv_sec, &times->actime) ||
+		    get_user(tv[1].tv_sec, &times->modtime))
+			return -EFAULT;
+		tv[0].tv_nsec = 0;
+		tv[1].tv_nsec = 0;
+	}
+	return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0);
+}
+
+#endif
+
+static bool nsec_valid(long nsec)
+{
+	if (nsec == UTIME_OMIT || nsec == UTIME_NOW)
+		return true;
+
+	return nsec >= 0 && nsec <= 999999999;
+}
+
+static int utimes_common(struct path *path, struct timespec *times)
+{
+	int error;
+	struct iattr newattrs;
+	struct inode *inode = path->dentry->d_inode;
+	struct inode *delegated_inode = NULL;
+
+	error = mnt_want_write(path->mnt);
+	if (error)
+		goto out;
+
+	if (times && times[0].tv_nsec == UTIME_NOW &&
+		     times[1].tv_nsec == UTIME_NOW)
+		times = NULL;
+
+	newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
+	if (times) {
+		if (times[0].tv_nsec == UTIME_OMIT)
+			newattrs.ia_valid &= ~ATTR_ATIME;
+		else if (times[0].tv_nsec != UTIME_NOW) {
+			newattrs.ia_atime.tv_sec = times[0].tv_sec;
+			newattrs.ia_atime.tv_nsec = times[0].tv_nsec;
+			newattrs.ia_valid |= ATTR_ATIME_SET;
+		}
+
+		if (times[1].tv_nsec == UTIME_OMIT)
+			newattrs.ia_valid &= ~ATTR_MTIME;
+		else if (times[1].tv_nsec != UTIME_NOW) {
+			newattrs.ia_mtime.tv_sec = times[1].tv_sec;
+			newattrs.ia_mtime.tv_nsec = times[1].tv_nsec;
+			newattrs.ia_valid |= ATTR_MTIME_SET;
+		}
+		/*
+		 * Tell inode_change_ok(), that this is an explicit time
+		 * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET
+		 * were used.
+		 */
+		newattrs.ia_valid |= ATTR_TIMES_SET;
+	} else {
+		/*
+		 * If times is NULL (or both times are UTIME_NOW),
+		 * then we need to check permissions, because
+		 * inode_change_ok() won't do it.
+		 */
+		error = -EACCES;
+                if (IS_IMMUTABLE(inode))
+			goto mnt_drop_write_and_out;
+
+		if (!inode_owner_or_capable(inode)) {
+			error = inode_permission(inode, MAY_WRITE);
+			if (error)
+				goto mnt_drop_write_and_out;
+		}
+	}
+retry_deleg:
+	mutex_lock(&inode->i_mutex);
+	error = notify_change(path->dentry, &newattrs, &delegated_inode);
+	mutex_unlock(&inode->i_mutex);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+
+mnt_drop_write_and_out:
+	mnt_drop_write(path->mnt);
+out:
+	return error;
+}
+
+/*
+ * do_utimes - change times on filename or file descriptor
+ * @dfd: open file descriptor, -1 or AT_FDCWD
+ * @filename: path name or NULL
+ * @times: new times or NULL
+ * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment)
+ *
+ * If filename is NULL and dfd refers to an open file, then operate on
+ * the file.  Otherwise look up filename, possibly using dfd as a
+ * starting point.
+ *
+ * If times==NULL, set access and modification to current time,
+ * must be owner or have write permission.
+ * Else, update from *times, must be owner or super user.
+ */
+long do_utimes(int dfd, const char __user *filename, struct timespec *times,
+	       int flags)
+{
+	int error = -EINVAL;
+
+	if (times && (!nsec_valid(times[0].tv_nsec) ||
+		      !nsec_valid(times[1].tv_nsec))) {
+		goto out;
+	}
+
+	if (flags & ~AT_SYMLINK_NOFOLLOW)
+		goto out;
+
+	if (filename == NULL && dfd != AT_FDCWD) {
+		struct fd f;
+
+		if (flags & AT_SYMLINK_NOFOLLOW)
+			goto out;
+
+		f = fdget(dfd);
+		error = -EBADF;
+		if (!f.file)
+			goto out;
+
+		error = utimes_common(&f.file->f_path, times);
+		fdput(f);
+	} else {
+		struct path path;
+		int lookup_flags = 0;
+
+		if (!(flags & AT_SYMLINK_NOFOLLOW))
+			lookup_flags |= LOOKUP_FOLLOW;
+retry:
+		error = user_path_at(dfd, filename, lookup_flags, &path);
+		if (error)
+			goto out;
+
+		error = utimes_common(&path, times);
+		path_put(&path);
+		if (retry_estale(error, lookup_flags)) {
+			lookup_flags |= LOOKUP_REVAL;
+			goto retry;
+		}
+	}
+
+out:
+	return error;
+}
+
+SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
+		struct timespec __user *, utimes, int, flags)
+{
+	struct timespec tstimes[2];
+
+	if (utimes) {
+		if (copy_from_user(&tstimes, utimes, sizeof(tstimes)))
+			return -EFAULT;
+
+		/* Nothing to do, we must not even check the path.  */
+		if (tstimes[0].tv_nsec == UTIME_OMIT &&
+		    tstimes[1].tv_nsec == UTIME_OMIT)
+			return 0;
+	}
+
+	return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
+}
+
+SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
+		struct timeval __user *, utimes)
+{
+	struct timeval times[2];
+	struct timespec tstimes[2];
+
+	if (utimes) {
+		if (copy_from_user(&times, utimes, sizeof(times)))
+			return -EFAULT;
+
+		/* This test is needed to catch all invalid values.  If we
+		   would test only in do_utimes we would miss those invalid
+		   values truncated by the multiplication with 1000.  Note
+		   that we also catch UTIME_{NOW,OMIT} here which are only
+		   valid for utimensat.  */
+		if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 ||
+		    times[1].tv_usec >= 1000000 || times[1].tv_usec < 0)
+			return -EINVAL;
+
+		tstimes[0].tv_sec = times[0].tv_sec;
+		tstimes[0].tv_nsec = 1000 * times[0].tv_usec;
+		tstimes[1].tv_sec = times[1].tv_sec;
+		tstimes[1].tv_nsec = 1000 * times[1].tv_usec;
+	}
+
+	return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0);
+}
+
+SYSCALL_DEFINE2(utimes, char __user *, filename,
+		struct timeval __user *, utimes)
+{
+	return sys_futimesat(AT_FDCWD, filename, utimes);
+}
diff --git a/kernel/fs/xattr.c b/kernel/fs/xattr.c
new file mode 100644
index 000000000..4ef698549
--- /dev/null
+++ b/kernel/fs/xattr.c
@@ -0,0 +1,972 @@
+/*
+  File: fs/xattr.c
+
+  Extended attribute handling.
+
+  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
+  Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
+  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ */
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/xattr.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/evm.h>
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/fsnotify.h>
+#include <linux/audit.h>
+#include <linux/vmalloc.h>
+#include <linux/posix_acl_xattr.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Check permissions for extended attribute access.  This is a bit complicated
+ * because different namespaces have very different rules.
+ */
+static int
+xattr_permission(struct inode *inode, const char *name, int mask)
+{
+	/*
+	 * We can never set or remove an extended attribute on a read-only
+	 * filesystem  or on an immutable / append-only inode.
+	 */
+	if (mask & MAY_WRITE) {
+		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+			return -EPERM;
+	}
+
+	/*
+	 * No restriction for security.* and system.* from the VFS.  Decision
+	 * on these is left to the underlying filesystem / security module.
+	 */
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return 0;
+
+	/*
+	 * The trusted.* namespace can only be accessed by privileged users.
+	 */
+	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+		if (!capable(CAP_SYS_ADMIN))
+			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
+		return 0;
+	}
+
+	/*
+	 * In the user.* namespace, only regular files and directories can have
+	 * extended attributes. For sticky directories, only the owner and
+	 * privileged users can write attributes.
+	 */
+	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
+		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
+		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
+		    (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
+			return -EPERM;
+	}
+
+	return inode_permission(inode, mask);
+}
+
+/**
+ *  __vfs_setxattr_noperm - perform setxattr operation without performing
+ *  permission checks.
+ *
+ *  @dentry - object to perform setxattr on
+ *  @name - xattr name to set
+ *  @value - value to set @name to
+ *  @size - size of @value
+ *  @flags - flags to pass into filesystem operations
+ *
+ *  returns the result of the internal setxattr or setsecurity operations.
+ *
+ *  This function requires the caller to lock the inode's i_mutex before it
+ *  is executed. It also assumes that the caller will make the appropriate
+ *  permission checks.
+ */
+int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	int error = -EOPNOTSUPP;
+	int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
+				   XATTR_SECURITY_PREFIX_LEN);
+
+	if (issec)
+		inode->i_flags &= ~S_NOSEC;
+	if (inode->i_op->setxattr) {
+		error = inode->i_op->setxattr(dentry, name, value, size, flags);
+		if (!error) {
+			fsnotify_xattr(dentry);
+			security_inode_post_setxattr(dentry, name, value,
+						     size, flags);
+		}
+	} else if (issec) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+		error = security_inode_setsecurity(inode, suffix, value,
+						   size, flags);
+		if (!error)
+			fsnotify_xattr(dentry);
+	}
+
+	return error;
+}
+
+
+int
+vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = xattr_permission(inode, name, MAY_WRITE);
+	if (error)
+		return error;
+
+	mutex_lock(&inode->i_mutex);
+	error = security_inode_setxattr(dentry, name, value, size, flags);
+	if (error)
+		goto out;
+
+	error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_setxattr);
+
+ssize_t
+xattr_getsecurity(struct inode *inode, const char *name, void *value,
+			size_t size)
+{
+	void *buffer = NULL;
+	ssize_t len;
+
+	if (!value || !size) {
+		len = security_inode_getsecurity(inode, name, &buffer, false);
+		goto out_noalloc;
+	}
+
+	len = security_inode_getsecurity(inode, name, &buffer, true);
+	if (len < 0)
+		return len;
+	if (size < len) {
+		len = -ERANGE;
+		goto out;
+	}
+	memcpy(value, buffer, len);
+out:
+	security_release_secctx(buffer, len);
+out_noalloc:
+	return len;
+}
+EXPORT_SYMBOL_GPL(xattr_getsecurity);
+
+/*
+ * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
+ *
+ * Allocate memory, if not already allocated, or re-allocate correct size,
+ * before retrieving the extended attribute.
+ *
+ * Returns the result of alloc, if failed, or the getxattr operation.
+ */
+ssize_t
+vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
+		   size_t xattr_size, gfp_t flags)
+{
+	struct inode *inode = dentry->d_inode;
+	char *value = *xattr_value;
+	int error;
+
+	error = xattr_permission(inode, name, MAY_READ);
+	if (error)
+		return error;
+
+	if (!inode->i_op->getxattr)
+		return -EOPNOTSUPP;
+
+	error = inode->i_op->getxattr(dentry, name, NULL, 0);
+	if (error < 0)
+		return error;
+
+	if (!value || (error > xattr_size)) {
+		value = krealloc(*xattr_value, error + 1, flags);
+		if (!value)
+			return -ENOMEM;
+		memset(value, 0, error + 1);
+	}
+
+	error = inode->i_op->getxattr(dentry, name, value, error);
+	*xattr_value = value;
+	return error;
+}
+
+/* Compare an extended attribute value with the given value */
+int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
+		  const char *value, size_t size, gfp_t flags)
+{
+	char *xattr_value = NULL;
+	int rc;
+
+	rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
+	if (rc < 0)
+		return rc;
+
+	if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
+		rc = -EINVAL;
+	else
+		rc = 0;
+	kfree(xattr_value);
+	return rc;
+}
+
+ssize_t
+vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = xattr_permission(inode, name, MAY_READ);
+	if (error)
+		return error;
+
+	error = security_inode_getxattr(dentry, name);
+	if (error)
+		return error;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX,
+				XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+		int ret = xattr_getsecurity(inode, suffix, value, size);
+		/*
+		 * Only overwrite the return value if a security module
+		 * is actually active.
+		 */
+		if (ret == -EOPNOTSUPP)
+			goto nolsm;
+		return ret;
+	}
+nolsm:
+	if (inode->i_op->getxattr)
+		error = inode->i_op->getxattr(dentry, name, value, size);
+	else
+		error = -EOPNOTSUPP;
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_getxattr);
+
+ssize_t
+vfs_listxattr(struct dentry *d, char *list, size_t size)
+{
+	ssize_t error;
+
+	error = security_inode_listxattr(d);
+	if (error)
+		return error;
+	error = -EOPNOTSUPP;
+	if (d->d_inode->i_op->listxattr) {
+		error = d->d_inode->i_op->listxattr(d, list, size);
+	} else {
+		error = security_inode_listsecurity(d->d_inode, list, size);
+		if (size && error > size)
+			error = -ERANGE;
+	}
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_listxattr);
+
+int
+vfs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	if (!inode->i_op->removexattr)
+		return -EOPNOTSUPP;
+
+	error = xattr_permission(inode, name, MAY_WRITE);
+	if (error)
+		return error;
+
+	mutex_lock(&inode->i_mutex);
+	error = security_inode_removexattr(dentry, name);
+	if (error) {
+		mutex_unlock(&inode->i_mutex);
+		return error;
+	}
+
+	error = inode->i_op->removexattr(dentry, name);
+	mutex_unlock(&inode->i_mutex);
+
+	if (!error) {
+		fsnotify_xattr(dentry);
+		evm_inode_post_removexattr(dentry, name);
+	}
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_removexattr);
+
+
+/*
+ * Extended attribute SET operations
+ */
+static long
+setxattr(struct dentry *d, const char __user *name, const void __user *value,
+	 size_t size, int flags)
+{
+	int error;
+	void *kvalue = NULL;
+	void *vvalue = NULL;	/* If non-NULL, we used vmalloc() */
+	char kname[XATTR_NAME_MAX + 1];
+
+	if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
+		return -EINVAL;
+
+	error = strncpy_from_user(kname, name, sizeof(kname));
+	if (error == 0 || error == sizeof(kname))
+		error = -ERANGE;
+	if (error < 0)
+		return error;
+
+	if (size) {
+		if (size > XATTR_SIZE_MAX)
+			return -E2BIG;
+		kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
+		if (!kvalue) {
+			vvalue = vmalloc(size);
+			if (!vvalue)
+				return -ENOMEM;
+			kvalue = vvalue;
+		}
+		if (copy_from_user(kvalue, value, size)) {
+			error = -EFAULT;
+			goto out;
+		}
+		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
+		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
+			posix_acl_fix_xattr_from_user(kvalue, size);
+	}
+
+	error = vfs_setxattr(d, kname, kvalue, size, flags);
+out:
+	if (vvalue)
+		vfree(vvalue);
+	else
+		kfree(kvalue);
+	return error;
+}
+
+static int path_setxattr(const char __user *pathname,
+			 const char __user *name, const void __user *value,
+			 size_t size, int flags, unsigned int lookup_flags)
+{
+	struct path path;
+	int error;
+retry:
+	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+	if (error)
+		return error;
+	error = mnt_want_write(path.mnt);
+	if (!error) {
+		error = setxattr(path.dentry, name, value, size, flags);
+		mnt_drop_write(path.mnt);
+	}
+	path_put(&path);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
+		const char __user *, name, const void __user *, value,
+		size_t, size, int, flags)
+{
+	return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
+}
+
+SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
+		const char __user *, name, const void __user *, value,
+		size_t, size, int, flags)
+{
+	return path_setxattr(pathname, name, value, size, flags, 0);
+}
+
+SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
+		const void __user *,value, size_t, size, int, flags)
+{
+	struct fd f = fdget(fd);
+	int error = -EBADF;
+
+	if (!f.file)
+		return error;
+	audit_file(f.file);
+	error = mnt_want_write_file(f.file);
+	if (!error) {
+		error = setxattr(f.file->f_path.dentry, name, value, size, flags);
+		mnt_drop_write_file(f.file);
+	}
+	fdput(f);
+	return error;
+}
+
+/*
+ * Extended attribute GET operations
+ */
+static ssize_t
+getxattr(struct dentry *d, const char __user *name, void __user *value,
+	 size_t size)
+{
+	ssize_t error;
+	void *kvalue = NULL;
+	void *vvalue = NULL;
+	char kname[XATTR_NAME_MAX + 1];
+
+	error = strncpy_from_user(kname, name, sizeof(kname));
+	if (error == 0 || error == sizeof(kname))
+		error = -ERANGE;
+	if (error < 0)
+		return error;
+
+	if (size) {
+		if (size > XATTR_SIZE_MAX)
+			size = XATTR_SIZE_MAX;
+		kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+		if (!kvalue) {
+			vvalue = vmalloc(size);
+			if (!vvalue)
+				return -ENOMEM;
+			kvalue = vvalue;
+		}
+	}
+
+	error = vfs_getxattr(d, kname, kvalue, size);
+	if (error > 0) {
+		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
+		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
+			posix_acl_fix_xattr_to_user(kvalue, size);
+		if (size && copy_to_user(value, kvalue, error))
+			error = -EFAULT;
+	} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
+		/* The file system tried to returned a value bigger
+		   than XATTR_SIZE_MAX bytes. Not possible. */
+		error = -E2BIG;
+	}
+	if (vvalue)
+		vfree(vvalue);
+	else
+		kfree(kvalue);
+	return error;
+}
+
+static ssize_t path_getxattr(const char __user *pathname,
+			     const char __user *name, void __user *value,
+			     size_t size, unsigned int lookup_flags)
+{
+	struct path path;
+	ssize_t error;
+retry:
+	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+	if (error)
+		return error;
+	error = getxattr(path.dentry, name, value, size);
+	path_put(&path);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
+		const char __user *, name, void __user *, value, size_t, size)
+{
+	return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
+}
+
+SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
+		const char __user *, name, void __user *, value, size_t, size)
+{
+	return path_getxattr(pathname, name, value, size, 0);
+}
+
+SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
+		void __user *, value, size_t, size)
+{
+	struct fd f = fdget(fd);
+	ssize_t error = -EBADF;
+
+	if (!f.file)
+		return error;
+	audit_file(f.file);
+	error = getxattr(f.file->f_path.dentry, name, value, size);
+	fdput(f);
+	return error;
+}
+
+/*
+ * Extended attribute LIST operations
+ */
+static ssize_t
+listxattr(struct dentry *d, char __user *list, size_t size)
+{
+	ssize_t error;
+	char *klist = NULL;
+	char *vlist = NULL;	/* If non-NULL, we used vmalloc() */
+
+	if (size) {
+		if (size > XATTR_LIST_MAX)
+			size = XATTR_LIST_MAX;
+		klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
+		if (!klist) {
+			vlist = vmalloc(size);
+			if (!vlist)
+				return -ENOMEM;
+			klist = vlist;
+		}
+	}
+
+	error = vfs_listxattr(d, klist, size);
+	if (error > 0) {
+		if (size && copy_to_user(list, klist, error))
+			error = -EFAULT;
+	} else if (error == -ERANGE && size >= XATTR_LIST_MAX) {
+		/* The file system tried to returned a list bigger
+		   than XATTR_LIST_MAX bytes. Not possible. */
+		error = -E2BIG;
+	}
+	if (vlist)
+		vfree(vlist);
+	else
+		kfree(klist);
+	return error;
+}
+
+static ssize_t path_listxattr(const char __user *pathname, char __user *list,
+			      size_t size, unsigned int lookup_flags)
+{
+	struct path path;
+	ssize_t error;
+retry:
+	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+	if (error)
+		return error;
+	error = listxattr(path.dentry, list, size);
+	path_put(&path);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
+		size_t, size)
+{
+	return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
+}
+
+SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
+		size_t, size)
+{
+	return path_listxattr(pathname, list, size, 0);
+}
+
+SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
+{
+	struct fd f = fdget(fd);
+	ssize_t error = -EBADF;
+
+	if (!f.file)
+		return error;
+	audit_file(f.file);
+	error = listxattr(f.file->f_path.dentry, list, size);
+	fdput(f);
+	return error;
+}
+
+/*
+ * Extended attribute REMOVE operations
+ */
+static long
+removexattr(struct dentry *d, const char __user *name)
+{
+	int error;
+	char kname[XATTR_NAME_MAX + 1];
+
+	error = strncpy_from_user(kname, name, sizeof(kname));
+	if (error == 0 || error == sizeof(kname))
+		error = -ERANGE;
+	if (error < 0)
+		return error;
+
+	return vfs_removexattr(d, kname);
+}
+
+static int path_removexattr(const char __user *pathname,
+			    const char __user *name, unsigned int lookup_flags)
+{
+	struct path path;
+	int error;
+retry:
+	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+	if (error)
+		return error;
+	error = mnt_want_write(path.mnt);
+	if (!error) {
+		error = removexattr(path.dentry, name);
+		mnt_drop_write(path.mnt);
+	}
+	path_put(&path);
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+	return error;
+}
+
+SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
+		const char __user *, name)
+{
+	return path_removexattr(pathname, name, LOOKUP_FOLLOW);
+}
+
+SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
+		const char __user *, name)
+{
+	return path_removexattr(pathname, name, 0);
+}
+
+SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
+{
+	struct fd f = fdget(fd);
+	int error = -EBADF;
+
+	if (!f.file)
+		return error;
+	audit_file(f.file);
+	error = mnt_want_write_file(f.file);
+	if (!error) {
+		error = removexattr(f.file->f_path.dentry, name);
+		mnt_drop_write_file(f.file);
+	}
+	fdput(f);
+	return error;
+}
+
+
+static const char *
+strcmp_prefix(const char *a, const char *a_prefix)
+{
+	while (*a_prefix && *a == *a_prefix) {
+		a++;
+		a_prefix++;
+	}
+	return *a_prefix ? NULL : a;
+}
+
+/*
+ * In order to implement different sets of xattr operations for each xattr
+ * prefix with the generic xattr API, a filesystem should create a
+ * null-terminated array of struct xattr_handler (one for each prefix) and
+ * hang a pointer to it off of the s_xattr field of the superblock.
+ *
+ * The generic_fooxattr() functions will use this list to dispatch xattr
+ * operations to the correct xattr_handler.
+ */
+#define for_each_xattr_handler(handlers, handler)		\
+		for ((handler) = *(handlers)++;			\
+			(handler) != NULL;			\
+			(handler) = *(handlers)++)
+
+/*
+ * Find the xattr_handler with the matching prefix.
+ */
+static const struct xattr_handler *
+xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
+{
+	const struct xattr_handler *handler;
+
+	if (!*name)
+		return NULL;
+
+	for_each_xattr_handler(handlers, handler) {
+		const char *n = strcmp_prefix(*name, handler->prefix);
+		if (n) {
+			*name = n;
+			break;
+		}
+	}
+	return handler;
+}
+
+/*
+ * Find the handler for the prefix and dispatch its get() operation.
+ */
+ssize_t
+generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
+{
+	const struct xattr_handler *handler;
+
+	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
+	if (!handler)
+		return -EOPNOTSUPP;
+	return handler->get(dentry, name, buffer, size, handler->flags);
+}
+
+/*
+ * Combine the results of the list() operation from every xattr_handler in the
+ * list.
+ */
+ssize_t
+generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
+	unsigned int size = 0;
+
+	if (!buffer) {
+		for_each_xattr_handler(handlers, handler) {
+			size += handler->list(dentry, NULL, 0, NULL, 0,
+					      handler->flags);
+		}
+	} else {
+		char *buf = buffer;
+
+		for_each_xattr_handler(handlers, handler) {
+			size = handler->list(dentry, buf, buffer_size,
+					     NULL, 0, handler->flags);
+			if (size > buffer_size)
+				return -ERANGE;
+			buf += size;
+			buffer_size -= size;
+		}
+		size = buf - buffer;
+	}
+	return size;
+}
+
+/*
+ * Find the handler for the prefix and dispatch its set() operation.
+ */
+int
+generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
+{
+	const struct xattr_handler *handler;
+
+	if (size == 0)
+		value = "";  /* empty EA, do not remove */
+	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
+	if (!handler)
+		return -EOPNOTSUPP;
+	return handler->set(dentry, name, value, size, flags, handler->flags);
+}
+
+/*
+ * Find the handler for the prefix and dispatch its set() operation to remove
+ * any associated extended attribute.
+ */
+int
+generic_removexattr(struct dentry *dentry, const char *name)
+{
+	const struct xattr_handler *handler;
+
+	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
+	if (!handler)
+		return -EOPNOTSUPP;
+	return handler->set(dentry, name, NULL, 0,
+			    XATTR_REPLACE, handler->flags);
+}
+
+EXPORT_SYMBOL(generic_getxattr);
+EXPORT_SYMBOL(generic_listxattr);
+EXPORT_SYMBOL(generic_setxattr);
+EXPORT_SYMBOL(generic_removexattr);
+
+/*
+ * Allocate new xattr and copy in the value; but leave the name to callers.
+ */
+struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
+{
+	struct simple_xattr *new_xattr;
+	size_t len;
+
+	/* wrap around? */
+	len = sizeof(*new_xattr) + size;
+	if (len < sizeof(*new_xattr))
+		return NULL;
+
+	new_xattr = kmalloc(len, GFP_KERNEL);
+	if (!new_xattr)
+		return NULL;
+
+	new_xattr->size = size;
+	memcpy(new_xattr->value, value, size);
+	return new_xattr;
+}
+
+/*
+ * xattr GET operation for in-memory/pseudo filesystems
+ */
+int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
+		     void *buffer, size_t size)
+{
+	struct simple_xattr *xattr;
+	int ret = -ENODATA;
+
+	spin_lock(&xattrs->lock);
+	list_for_each_entry(xattr, &xattrs->head, list) {
+		if (strcmp(name, xattr->name))
+			continue;
+
+		ret = xattr->size;
+		if (buffer) {
+			if (size < xattr->size)
+				ret = -ERANGE;
+			else
+				memcpy(buffer, xattr->value, xattr->size);
+		}
+		break;
+	}
+	spin_unlock(&xattrs->lock);
+	return ret;
+}
+
+static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
+			      const void *value, size_t size, int flags)
+{
+	struct simple_xattr *xattr;
+	struct simple_xattr *new_xattr = NULL;
+	int err = 0;
+
+	/* value == NULL means remove */
+	if (value) {
+		new_xattr = simple_xattr_alloc(value, size);
+		if (!new_xattr)
+			return -ENOMEM;
+
+		new_xattr->name = kstrdup(name, GFP_KERNEL);
+		if (!new_xattr->name) {
+			kfree(new_xattr);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock(&xattrs->lock);
+	list_for_each_entry(xattr, &xattrs->head, list) {
+		if (!strcmp(name, xattr->name)) {
+			if (flags & XATTR_CREATE) {
+				xattr = new_xattr;
+				err = -EEXIST;
+			} else if (new_xattr) {
+				list_replace(&xattr->list, &new_xattr->list);
+			} else {
+				list_del(&xattr->list);
+			}
+			goto out;
+		}
+	}
+	if (flags & XATTR_REPLACE) {
+		xattr = new_xattr;
+		err = -ENODATA;
+	} else {
+		list_add(&new_xattr->list, &xattrs->head);
+		xattr = NULL;
+	}
+out:
+	spin_unlock(&xattrs->lock);
+	if (xattr) {
+		kfree(xattr->name);
+		kfree(xattr);
+	}
+	return err;
+
+}
+
+/**
+ * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
+ * @xattrs: target simple_xattr list
+ * @name: name of the new extended attribute
+ * @value: value of the new xattr. If %NULL, will remove the attribute
+ * @size: size of the new xattr
+ * @flags: %XATTR_{CREATE|REPLACE}
+ *
+ * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
+ * with -EEXIST.  If %XATTR_REPLACE is set, the xattr should exist;
+ * otherwise, fails with -ENODATA.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
+		     const void *value, size_t size, int flags)
+{
+	if (size == 0)
+		value = ""; /* empty EA, do not remove */
+	return __simple_xattr_set(xattrs, name, value, size, flags);
+}
+
+/*
+ * xattr REMOVE operation for in-memory/pseudo filesystems
+ */
+int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name)
+{
+	return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
+}
+
+static bool xattr_is_trusted(const char *name)
+{
+	return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+}
+
+/*
+ * xattr LIST operation for in-memory/pseudo filesystems
+ */
+ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
+			  size_t size)
+{
+	bool trusted = capable(CAP_SYS_ADMIN);
+	struct simple_xattr *xattr;
+	size_t used = 0;
+
+	spin_lock(&xattrs->lock);
+	list_for_each_entry(xattr, &xattrs->head, list) {
+		size_t len;
+
+		/* skip "trusted." attributes for unprivileged callers */
+		if (!trusted && xattr_is_trusted(xattr->name))
+			continue;
+
+		len = strlen(xattr->name) + 1;
+		used += len;
+		if (buffer) {
+			if (size < used) {
+				used = -ERANGE;
+				break;
+			}
+			memcpy(buffer, xattr->name, len);
+			buffer += len;
+		}
+	}
+	spin_unlock(&xattrs->lock);
+
+	return used;
+}
+
+/*
+ * Adds an extended attribute to the list
+ */
+void simple_xattr_list_add(struct simple_xattrs *xattrs,
+			   struct simple_xattr *new_xattr)
+{
+	spin_lock(&xattrs->lock);
+	list_add(&new_xattr->list, &xattrs->head);
+	spin_unlock(&xattrs->lock);
+}
diff --git a/kernel/fs/xfs/Kconfig b/kernel/fs/xfs/Kconfig
new file mode 100644
index 000000000..5d47b4df6
--- /dev/null
+++ b/kernel/fs/xfs/Kconfig
@@ -0,0 +1,97 @@
+config XFS_FS
+	tristate "XFS filesystem support"
+	depends on BLOCK
+	depends on (64BIT || LBDAF)
+	select EXPORTFS
+	select LIBCRC32C
+	help
+	  XFS is a high performance journaling filesystem which originated
+	  on the SGI IRIX platform.  It is completely multi-threaded, can
+	  support large files and large filesystems, extended attributes,
+	  variable block sizes, is extent based, and makes extensive use of
+	  Btrees (directories, extents, free space) to aid both performance
+	  and scalability.
+
+	  Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
+	  for complete details.  This implementation is on-disk compatible
+	  with the IRIX version of XFS.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called xfs.  Be aware, however, that if the file
+	  system of your root partition is compiled as a module, you'll need
+	  to use an initial ramdisk (initrd) to boot.
+
+config XFS_QUOTA
+	bool "XFS Quota support"
+	depends on XFS_FS
+	select QUOTACTL
+	help
+	  If you say Y here, you will be able to set limits for disk usage on
+	  a per user and/or a per group basis under XFS.  XFS considers quota
+	  information as filesystem metadata and uses journaling to provide a
+	  higher level guarantee of consistency.  The on-disk data format for
+	  quota is also compatible with the IRIX version of XFS, allowing a
+	  filesystem to be migrated between Linux and IRIX without any need
+	  for conversion.
+
+	  If unsure, say N.  More comprehensive documentation can be found in
+	  README.quota in the xfsprogs package.  XFS quota can be used either
+	  with or without the generic quota support enabled (CONFIG_QUOTA) -
+	  they are completely independent subsystems.
+
+config XFS_POSIX_ACL
+	bool "XFS POSIX ACL support"
+	depends on XFS_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
+config XFS_RT
+	bool "XFS Realtime subvolume support"
+	depends on XFS_FS
+	help
+	  If you say Y here you will be able to mount and use XFS filesystems
+	  which contain a realtime subvolume.  The realtime subvolume is a
+	  separate area of disk space where only file data is stored.  It was
+	  originally designed to provide deterministic data rates suitable
+	  for media streaming applications, but is also useful as a generic
+	  mechanism for ensuring data and metadata/log I/Os are completely
+	  separated.  Regular file I/Os are isolated to a separate device
+	  from all other requests, and this can be done quite transparently
+	  to applications via the inherit-realtime directory inode flag.
+
+	  See the xfs man page in section 5 for additional information.
+
+	  If unsure, say N.
+
+config XFS_WARN
+	bool "XFS Verbose Warnings"
+	depends on XFS_FS && !XFS_DEBUG
+	help
+	  Say Y here to get an XFS build with many additional warnings.
+	  It converts ASSERT checks to WARN, so will log any out-of-bounds
+	  conditions that occur that would otherwise be missed. It is much
+	  lighter weight than XFS_DEBUG and does not modify algorithms and will
+	  not cause the kernel to panic on non-fatal errors.
+
+	  However, similar to XFS_DEBUG, it is only advisable to use this if you
+	  are debugging a particular problem.
+
+config XFS_DEBUG
+	bool "XFS Debugging support"
+	depends on XFS_FS
+	help
+	  Say Y here to get an XFS build with many debugging features,
+	  including ASSERT checks, function wrappers around macros,
+	  and extra sanity-checking functions in various code paths.
+
+	  Note that the resulting code will be HUGE and SLOW, and probably
+	  not useful unless you are debugging a particular problem.
+
+	  Say N unless you are an XFS developer, or you play one on TV.
diff --git a/kernel/fs/xfs/Makefile b/kernel/fs/xfs/Makefile
new file mode 100644
index 000000000..df6828570
--- /dev/null
+++ b/kernel/fs/xfs/Makefile
@@ -0,0 +1,124 @@
+#
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+
+ccflags-y += -I$(src)			# needed for trace events
+ccflags-y += -I$(src)/libxfs
+
+ccflags-$(CONFIG_XFS_DEBUG) += -g
+
+obj-$(CONFIG_XFS_FS)		+= xfs.o
+
+# this one should be compiled first, as the tracing macros can easily blow up
+xfs-y				+= xfs_trace.o
+
+# build the libxfs code first
+xfs-y				+= $(addprefix libxfs/, \
+				   xfs_alloc.o \
+				   xfs_alloc_btree.o \
+				   xfs_attr.o \
+				   xfs_attr_leaf.o \
+				   xfs_attr_remote.o \
+				   xfs_bmap.o \
+				   xfs_bmap_btree.o \
+				   xfs_btree.o \
+				   xfs_da_btree.o \
+				   xfs_da_format.o \
+				   xfs_dir2.o \
+				   xfs_dir2_block.o \
+				   xfs_dir2_data.o \
+				   xfs_dir2_leaf.o \
+				   xfs_dir2_node.o \
+				   xfs_dir2_sf.o \
+				   xfs_dquot_buf.o \
+				   xfs_ialloc.o \
+				   xfs_ialloc_btree.o \
+				   xfs_inode_fork.o \
+				   xfs_inode_buf.o \
+				   xfs_log_rlimit.o \
+				   xfs_sb.o \
+				   xfs_symlink_remote.o \
+				   xfs_trans_resv.o \
+				   )
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT)		+= $(addprefix libxfs/, \
+				   xfs_rtbitmap.o \
+				   )
+
+# highlevel code
+xfs-y				+= xfs_aops.o \
+				   xfs_attr_inactive.o \
+				   xfs_attr_list.o \
+				   xfs_bit.o \
+				   xfs_bmap_util.o \
+				   xfs_buf.o \
+				   xfs_dir2_readdir.o \
+				   xfs_discard.o \
+				   xfs_error.o \
+				   xfs_export.o \
+				   xfs_extent_busy.o \
+				   xfs_file.o \
+				   xfs_filestream.o \
+				   xfs_fsops.o \
+				   xfs_globals.o \
+				   xfs_icache.o \
+				   xfs_ioctl.o \
+				   xfs_iomap.o \
+				   xfs_iops.o \
+				   xfs_inode.o \
+				   xfs_itable.o \
+				   xfs_message.o \
+				   xfs_mount.o \
+				   xfs_mru_cache.o \
+				   xfs_super.o \
+				   xfs_symlink.o \
+				   xfs_sysfs.o \
+				   xfs_trans.o \
+				   xfs_xattr.o \
+				   kmem.o \
+				   uuid.o
+
+# low-level transaction/log code
+xfs-y				+= xfs_log.o \
+				   xfs_log_cil.o \
+				   xfs_buf_item.o \
+				   xfs_extfree_item.o \
+				   xfs_icreate_item.o \
+				   xfs_inode_item.o \
+				   xfs_log_recover.o \
+				   xfs_trans_ail.o \
+				   xfs_trans_buf.o \
+				   xfs_trans_extfree.o \
+				   xfs_trans_inode.o \
+
+# optional features
+xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
+				   xfs_dquot_item.o \
+				   xfs_trans_dquot.o \
+				   xfs_qm_syscalls.o \
+				   xfs_qm_bhv.o \
+				   xfs_qm.o \
+				   xfs_quotaops.o
+
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
+
+xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
+xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
+xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
+xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
+xfs-$(CONFIG_NFSD_PNFS)		+= xfs_pnfs.o
diff --git a/kernel/fs/xfs/kmem.c b/kernel/fs/xfs/kmem.c
new file mode 100644
index 000000000..a7a3a63bb
--- /dev/null
+++ b/kernel/fs/xfs/kmem.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include "kmem.h"
+#include "xfs_message.h"
+
+/*
+ * Greedy allocation.  May fail and may return vmalloced memory.
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+	void		*ptr;
+	size_t		kmsize = maxsize;
+
+	while (!(ptr = vzalloc(kmsize))) {
+		if ((kmsize >>= 1) <= minsize)
+			kmsize = minsize;
+	}
+	if (ptr)
+		*size = kmsize;
+	return ptr;
+}
+
+void *
+kmem_alloc(size_t size, xfs_km_flags_t flags)
+{
+	int	retries = 0;
+	gfp_t	lflags = kmem_flags_convert(flags);
+	void	*ptr;
+
+	do {
+		ptr = kmalloc(size, lflags);
+		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+			return ptr;
+		if (!(++retries % 100))
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
+					__func__, lflags);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+	} while (1);
+}
+
+void *
+kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
+{
+	unsigned noio_flag = 0;
+	void	*ptr;
+	gfp_t	lflags;
+
+	ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
+	if (ptr)
+		return ptr;
+
+	/*
+	 * __vmalloc() will allocate data pages and auxillary structures (e.g.
+	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
+	 * here. Hence we need to tell memory reclaim that we are in such a
+	 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+	 * the filesystem here and potentially deadlocking.
+	 */
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		noio_flag = memalloc_noio_save();
+
+	lflags = kmem_flags_convert(flags);
+	ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		memalloc_noio_restore(noio_flag);
+
+	return ptr;
+}
+
+void *
+kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
+	     xfs_km_flags_t flags)
+{
+	void	*new;
+
+	new = kmem_alloc(newsize, flags);
+	if (ptr) {
+		if (new)
+			memcpy(new, ptr,
+				((oldsize < newsize) ? oldsize : newsize));
+		kmem_free(ptr);
+	}
+	return new;
+}
+
+void *
+kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+	int	retries = 0;
+	gfp_t	lflags = kmem_flags_convert(flags);
+	void	*ptr;
+
+	do {
+		ptr = kmem_cache_alloc(zone, lflags);
+		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+			return ptr;
+		if (!(++retries % 100))
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
+					__func__, lflags);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+	} while (1);
+}
diff --git a/kernel/fs/xfs/kmem.h b/kernel/fs/xfs/kmem.h
new file mode 100644
index 000000000..cc6b768fc
--- /dev/null
+++ b/kernel/fs/xfs/kmem.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+/*
+ * General memory allocation interfaces
+ */
+
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_SLEEP	((__force xfs_km_flags_t)0x0001u)
+#define KM_NOSLEEP	((__force xfs_km_flags_t)0x0002u)
+#define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
+#define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
+
+/*
+ * We use a special process flag to avoid recursive callbacks into
+ * the filesystem during transactions.  We will also issue our own
+ * warnings, so we explicitly skip any generic ones (silly of us).
+ */
+static inline gfp_t
+kmem_flags_convert(xfs_km_flags_t flags)
+{
+	gfp_t	lflags;
+
+	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+
+	if (flags & KM_NOSLEEP) {
+		lflags = GFP_ATOMIC | __GFP_NOWARN;
+	} else {
+		lflags = GFP_KERNEL | __GFP_NOWARN;
+		if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+			lflags &= ~__GFP_FS;
+	}
+
+	if (flags & KM_ZERO)
+		lflags |= __GFP_ZERO;
+
+	return lflags;
+}
+
+extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+static inline void  kmem_free(const void *ptr)
+{
+	kvfree(ptr);
+}
+
+
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+
+static inline void *
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
+{
+	return kmem_alloc(size, flags | KM_ZERO);
+}
+
+/*
+ * Zone interfaces
+ */
+
+#define KM_ZONE_HWALIGN	SLAB_HWCACHE_ALIGN
+#define KM_ZONE_RECLAIM	SLAB_RECLAIM_ACCOUNT
+#define KM_ZONE_SPREAD	SLAB_MEM_SPREAD
+
+#define kmem_zone	kmem_cache
+#define kmem_zone_t	struct kmem_cache
+
+static inline kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+	return kmem_cache_create(zone_name, size, 0, 0, NULL);
+}
+
+static inline kmem_zone_t *
+kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
+		     void (*construct)(void *))
+{
+	return kmem_cache_create(zone_name, size, 0, flags, construct);
+}
+
+static inline void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+	kmem_cache_free(zone, ptr);
+}
+
+static inline void
+kmem_zone_destroy(kmem_zone_t *zone)
+{
+	if (zone)
+		kmem_cache_destroy(zone);
+}
+
+extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
+
+static inline void *
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+	return kmem_zone_alloc(zone, flags | KM_ZERO);
+}
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_alloc.c b/kernel/fs/xfs/libxfs/xfs_alloc.c
new file mode 100644
index 000000000..516162be1
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_alloc.c
@@ -0,0 +1,2639 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+struct workqueue_struct *xfs_alloc_wq;
+
+#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+
+#define	XFSA_FIXUP_BNO_OK	1
+#define	XFSA_FIXUP_CNT_OK	2
+
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
+		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int				/* error */
+xfs_alloc_lookup_eq(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int				/* error */
+xfs_alloc_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int				/* error */
+xfs_alloc_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len)	/* length of extent */
+{
+	union xfs_btree_rec	rec;
+
+	rec.alloc.ar_startblock = cpu_to_be32(bno);
+	rec.alloc.ar_blockcount = cpu_to_be32(len);
+	return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat)	/* output: success/failure */
+{
+	union xfs_btree_rec	*rec;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (!error && *stat == 1) {
+		*bno = be32_to_cpu(rec->alloc.ar_startblock);
+		*len = be32_to_cpu(rec->alloc.ar_blockcount);
+	}
+	return error;
+}
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ */
+STATIC void
+xfs_alloc_compute_aligned(
+	xfs_alloc_arg_t	*args,		/* allocation argument structure */
+	xfs_agblock_t	foundbno,	/* starting block in found extent */
+	xfs_extlen_t	foundlen,	/* length in found extent */
+	xfs_agblock_t	*resbno,	/* result block number */
+	xfs_extlen_t	*reslen)	/* result length */
+{
+	xfs_agblock_t	bno;
+	xfs_extlen_t	len;
+
+	/* Trim busy sections out of found extent */
+	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+	if (args->alignment > 1 && len >= args->minlen) {
+		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
+		xfs_extlen_t	diff = aligned_bno - bno;
+
+		*resbno = aligned_bno;
+		*reslen = diff >= len ? 0 : len - diff;
+	} else {
+		*resbno = bno;
+		*reslen = len;
+	}
+}
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ */
+STATIC xfs_extlen_t			/* difference value (absolute) */
+xfs_alloc_compute_diff(
+	xfs_agblock_t	wantbno,	/* target starting block */
+	xfs_extlen_t	wantlen,	/* target length */
+	xfs_extlen_t	alignment,	/* target alignment */
+	char		userdata,	/* are we allocating data? */
+	xfs_agblock_t	freebno,	/* freespace's starting block */
+	xfs_extlen_t	freelen,	/* freespace's length */
+	xfs_agblock_t	*newbnop)	/* result: best start block from free */
+{
+	xfs_agblock_t	freeend;	/* end of freespace extent */
+	xfs_agblock_t	newbno1;	/* return block number */
+	xfs_agblock_t	newbno2;	/* other new block number */
+	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
+	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
+	xfs_agblock_t	wantend;	/* end of target extent */
+
+	ASSERT(freelen >= wantlen);
+	freeend = freebno + freelen;
+	wantend = wantbno + wantlen;
+	/*
+	 * We want to allocate from the start of a free extent if it is past
+	 * the desired block or if we are allocating user data and the free
+	 * extent is before desired block. The second case is there to allow
+	 * for contiguous allocation from the remaining free space if the file
+	 * grows in the short term.
+	 */
+	if (freebno >= wantbno || (userdata && freeend < wantend)) {
+		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else if (freeend >= wantend && alignment > 1) {
+		newbno1 = roundup(wantbno, alignment);
+		newbno2 = newbno1 - alignment;
+		if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+		else
+			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+		if (newbno2 < freebno)
+			newbno2 = NULLAGBLOCK;
+		else
+			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+			if (newlen1 < newlen2 ||
+			    (newlen1 == newlen2 &&
+			     XFS_ABSDIFF(newbno1, wantbno) >
+			     XFS_ABSDIFF(newbno2, wantbno)))
+				newbno1 = newbno2;
+		} else if (newbno2 != NULLAGBLOCK)
+			newbno1 = newbno2;
+	} else if (freeend >= wantend) {
+		newbno1 = wantbno;
+	} else if (alignment > 1) {
+		newbno1 = roundup(freeend - wantlen, alignment);
+		if (newbno1 > freeend - wantlen &&
+		    newbno1 - alignment >= freebno)
+			newbno1 -= alignment;
+		else if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else
+		newbno1 = freeend - wantlen;
+	*newbnop = newbno1;
+	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ */
+STATIC void
+xfs_alloc_fix_len(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_extlen_t	k;
+	xfs_extlen_t	rlen;
+
+	ASSERT(args->mod < args->prod);
+	rlen = args->len;
+	ASSERT(rlen >= args->minlen);
+	ASSERT(rlen <= args->maxlen);
+	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+	    (args->mod == 0 && rlen < args->prod))
+		return;
+	k = rlen % args->prod;
+	if (k == args->mod)
+		return;
+	if (k > args->mod)
+		rlen = rlen - (k - args->mod);
+	else
+		rlen = rlen - args->prod + (args->mod - k);
+	/* casts to (int) catch length underflows */
+	if ((int)rlen < (int)args->minlen)
+		return;
+	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+	ASSERT(rlen % args->prod == args->mod);
+	args->len = rlen;
+}
+
+/*
+ * Fix up length if there is too little space left in the a.g.
+ * Return 1 if ok, 0 if too little, should give up.
+ */
+STATIC int
+xfs_alloc_fix_minleft(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_agf_t	*agf;		/* a.g. freelist header */
+	int		diff;		/* free space difference */
+
+	if (args->minleft == 0)
+		return 1;
+	agf = XFS_BUF_TO_AGF(args->agbp);
+	diff = be32_to_cpu(agf->agf_freeblks)
+		- args->len - args->minleft;
+	if (diff >= 0)
+		return 1;
+	args->len += diff;		/* shrink the allocated space */
+	/* casts to (int) catch length underflows */
+	if ((int)args->len >= (int)args->minlen)
+		return 1;
+	args->agbno = NULLAGBLOCK;
+	return 0;
+}
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks.  The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ */
+STATIC int				/* error code */
+xfs_alloc_fixup_trees(
+	xfs_btree_cur_t	*cnt_cur,	/* cursor for by-size btree */
+	xfs_btree_cur_t	*bno_cur,	/* cursor for by-block btree */
+	xfs_agblock_t	fbno,		/* starting block of free extent */
+	xfs_extlen_t	flen,		/* length of free extent */
+	xfs_agblock_t	rbno,		/* starting block of returned extent */
+	xfs_extlen_t	rlen,		/* length of returned extent */
+	int		flags)		/* flags, XFSA_FIXUP_... */
+{
+	int		error;		/* error code */
+	int		i;		/* operation results */
+	xfs_agblock_t	nfbno1;		/* first new free startblock */
+	xfs_agblock_t	nfbno2;		/* second new free startblock */
+	xfs_extlen_t	nflen1=0;	/* first new free length */
+	xfs_extlen_t	nflen2=0;	/* second new free length */
+	struct xfs_mount *mp;
+
+	mp = cnt_cur->bc_mp;
+
+	/*
+	 * Look up the record in the by-size tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+	/*
+	 * Look up the record in the by-block tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+
+#ifdef DEBUG
+	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+		struct xfs_btree_block	*bnoblock;
+		struct xfs_btree_block	*cntblock;
+
+		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			bnoblock->bb_numrecs == cntblock->bb_numrecs);
+	}
+#endif
+
+	/*
+	 * Deal with all four cases: the allocated record is contained
+	 * within the freespace record, so we can have new freespace
+	 * at either (or both) end, or no freespace remaining.
+	 */
+	if (rbno == fbno && rlen == flen)
+		nfbno1 = nfbno2 = NULLAGBLOCK;
+	else if (rbno == fbno) {
+		nfbno1 = rbno + rlen;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else if (rbno + rlen == fbno + flen) {
+		nfbno1 = fbno;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else {
+		nfbno1 = fbno;
+		nflen1 = rbno - fbno;
+		nfbno2 = rbno + rlen;
+		nflen2 = (fbno + flen) - nfbno2;
+	}
+	/*
+	 * Delete the entry from the by-size btree.
+	 */
+	if ((error = xfs_btree_delete(cnt_cur, &i)))
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	/*
+	 * Add new by-size btree entry(s).
+	 */
+	if (nfbno1 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+		if ((error = xfs_btree_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+		if ((error = xfs_btree_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+	/*
+	 * Fix up the by-block btree entry(s).
+	 */
+	if (nfbno1 == NULLAGBLOCK) {
+		/*
+		 * No remaining freespace, just delete the by-block tree entry.
+		 */
+		if ((error = xfs_btree_delete(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	} else {
+		/*
+		 * Update the by-block entry to start later|be shorter.
+		 */
+		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+			return error;
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		/*
+		 * 2 resulting free entries, need to add one.
+		 */
+		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+		if ((error = xfs_btree_insert(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+	return 0;
+}
+
+static bool
+xfs_agfl_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
+	int		i;
+
+	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+		return false;
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+		if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
+		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+			return false;
+	}
+	return true;
+}
+
+static void
+xfs_agfl_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/*
+	 * There is no verification of non-crc AGFLs because mkfs does not
+	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
+	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
+	 * can't verify just those entries are valid.
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_agfl_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_agfl_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	/* no verification of non-crc AGFLs */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_agfl_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (bip)
+		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+	.verify_read = xfs_agfl_read_verify,
+	.verify_write = xfs_agfl_write_verify,
+};
+
+/*
+ * Read in the allocation group free block array.
+ */
+STATIC int				/* error */
+xfs_alloc_read_agfl(
+	xfs_mount_t	*mp,		/* mount point structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_buf_t	**bpp)		/* buffer for the ag free block array */
+{
+	xfs_buf_t	*bp;		/* return value */
+	int		error;
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_trans_read_buf(
+			mp, tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+	if (error)
+		return error;
+	xfs_buf_set_ref(bp, XFS_AGFL_REF);
+	*bpp = bp;
+	return 0;
+}
+
+STATIC int
+xfs_alloc_update_counters(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agbp,
+	long			len)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+
+	pag->pagf_freeblks += len;
+	be32_add_cpu(&agf->agf_freeblks, len);
+
+	xfs_trans_agblocks_delta(tp, len);
+	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+		     be32_to_cpu(agf->agf_length)))
+		return -EFSCORRUPTED;
+
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+	return 0;
+}
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent(
+	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
+{
+	int		error=0;
+
+	ASSERT(args->minlen > 0);
+	ASSERT(args->maxlen > 0);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->mod < args->prod);
+	ASSERT(args->alignment > 0);
+	/*
+	 * Branch to correct routine based on the type.
+	 */
+	args->wasfromfl = 0;
+	switch (args->type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+		error = xfs_alloc_ag_vextent_size(args);
+		break;
+	case XFS_ALLOCTYPE_NEAR_BNO:
+		error = xfs_alloc_ag_vextent_near(args);
+		break;
+	case XFS_ALLOCTYPE_THIS_BNO:
+		error = xfs_alloc_ag_vextent_exact(args);
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+
+	if (error || args->agbno == NULLAGBLOCK)
+		return error;
+
+	ASSERT(args->len >= args->minlen);
+	ASSERT(args->len <= args->maxlen);
+	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(args->agbno % args->alignment == 0);
+
+	if (!args->wasfromfl) {
+		error = xfs_alloc_update_counters(args->tp, args->pag,
+						  args->agbp,
+						  -((long)(args->len)));
+		if (error)
+			return error;
+
+		ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+					      args->agbno, args->len));
+	}
+
+	if (!args->isfl) {
+		xfs_trans_mod_sb(args->tp, args->wasdel ?
+				 XFS_TRANS_SB_RES_FDBLOCKS :
+				 XFS_TRANS_SB_FDBLOCKS,
+				 -((long)(args->len)));
+	}
+
+	XFS_STATS_INC(xs_allocx);
+	XFS_STATS_ADD(xs_allocb, args->len);
+	return error;
+}
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_exact(
+	xfs_alloc_arg_t	*args)	/* allocation argument structure */
+{
+	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
+	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
+	int		error;
+	xfs_agblock_t	fbno;	/* start block of found extent */
+	xfs_extlen_t	flen;	/* length of found extent */
+	xfs_agblock_t	tbno;	/* start block of trimmed extent */
+	xfs_extlen_t	tlen;	/* length of trimmed extent */
+	xfs_agblock_t	tend;	/* end block of trimmed extent */
+	int		i;	/* success/failure of operation */
+
+	ASSERT(args->alignment == 1);
+
+	/*
+	 * Allocate/initialize a cursor for the by-number freespace btree.
+	 */
+	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+					  args->agno, XFS_BTNUM_BNO);
+
+	/*
+	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+	 * Look for the closest free block <= bno, it must contain bno
+	 * if any free block does.
+	 */
+	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+	if (error)
+		goto error0;
+	if (!i)
+		goto not_found;
+
+	/*
+	 * Grab the freespace record.
+	 */
+	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+	ASSERT(fbno <= args->agbno);
+
+	/*
+	 * Check for overlapping busy extents.
+	 */
+	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+	/*
+	 * Give up if the start of the extent is busy, or the freespace isn't
+	 * long enough for the minimum request.
+	 */
+	if (tbno > args->agbno)
+		goto not_found;
+	if (tlen < args->minlen)
+		goto not_found;
+	tend = tbno + tlen;
+	if (tend < args->agbno + args->minlen)
+		goto not_found;
+
+	/*
+	 * End of extent will be smaller of the freespace end and the
+	 * maximal requested end.
+	 *
+	 * Fix the length according to mod and prod if given.
+	 */
+	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
+						- args->agbno;
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args))
+		goto not_found;
+
+	ASSERT(args->agbno + args->len <= tend);
+
+	/*
+	 * We are allocating agbno for args->len
+	 * Allocate/initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
+	ASSERT(args->agbno + args->len <=
+		be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+				      args->len, XFSA_FIXUP_BNO_OK);
+	if (error) {
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+		goto error0;
+	}
+
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+	args->wasfromfl = 0;
+	trace_xfs_alloc_exact_done(args);
+	return 0;
+
+not_found:
+	/* Didn't find it, return null. */
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	args->agbno = NULLAGBLOCK;
+	trace_xfs_alloc_exact_notfound(args);
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	trace_xfs_alloc_exact_error(args);
+	return error;
+}
+
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+	struct xfs_alloc_arg	*args,	/* allocation argument structure */
+	struct xfs_btree_cur	**gcur,	/* good cursor */
+	struct xfs_btree_cur	**scur,	/* searching cursor */
+	xfs_agblock_t		gdiff,	/* difference for search comparison */
+	xfs_agblock_t		*sbno,	/* extent found by search */
+	xfs_extlen_t		*slen,	/* extent length */
+	xfs_agblock_t		*sbnoa,	/* aligned extent found by search */
+	xfs_extlen_t		*slena,	/* aligned extent length */
+	int			dir)	/* 0 = search right, 1 = search left */
+{
+	xfs_agblock_t		new;
+	xfs_agblock_t		sdiff;
+	int			error;
+	int			i;
+
+	/* The good extent is perfect, no need to  search. */
+	if (!gdiff)
+		goto out_use_good;
+
+	/*
+	 * Look until we find a better one, run out of space or run off the end.
+	 */
+	do {
+		error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+
+		/*
+		 * The good extent is closer than this one.
+		 */
+		if (!dir) {
+			if (*sbnoa >= args->agbno + gdiff)
+				goto out_use_good;
+		} else {
+			if (*sbnoa <= args->agbno - gdiff)
+				goto out_use_good;
+		}
+
+		/*
+		 * Same distance, compare length and pick the best.
+		 */
+		if (*slena >= args->minlen) {
+			args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+			xfs_alloc_fix_len(args);
+
+			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+						       args->alignment,
+						       args->userdata, *sbnoa,
+						       *slena, &new);
+
+			/*
+			 * Choose closer size and invalidate other cursor.
+			 */
+			if (sdiff < gdiff)
+				goto out_use_search;
+			goto out_use_good;
+		}
+
+		if (!dir)
+			error = xfs_btree_increment(*scur, 0, &i);
+		else
+			error = xfs_btree_decrement(*scur, 0, &i);
+		if (error)
+			goto error0;
+	} while (i);
+
+out_use_good:
+	xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+	*scur = NULL;
+	return 0;
+
+out_use_search:
+	xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+	*gcur = NULL;
+	return 0;
+
+error0:
+	/* caller invalidates cursors */
+	return error;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int				/* error */
+xfs_alloc_ag_vextent_near(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_btree_cur_t	*bno_cur_gt;	/* cursor for bno btree, right side */
+	xfs_btree_cur_t	*bno_cur_lt;	/* cursor for bno btree, left side */
+	xfs_btree_cur_t	*cnt_cur;	/* cursor for count btree */
+	xfs_agblock_t	gtbno;		/* start bno of right side entry */
+	xfs_agblock_t	gtbnoa;		/* aligned ... */
+	xfs_extlen_t	gtdiff;		/* difference to right side entry */
+	xfs_extlen_t	gtlen;		/* length of right side entry */
+	xfs_extlen_t	gtlena;		/* aligned ... */
+	xfs_agblock_t	gtnew;		/* useful start bno of right side */
+	int		error;		/* error code */
+	int		i;		/* result code, temporary */
+	int		j;		/* result code, temporary */
+	xfs_agblock_t	ltbno;		/* start bno of left side entry */
+	xfs_agblock_t	ltbnoa;		/* aligned ... */
+	xfs_extlen_t	ltdiff;		/* difference to left side entry */
+	xfs_extlen_t	ltlen;		/* length of left side entry */
+	xfs_extlen_t	ltlena;		/* aligned ... */
+	xfs_agblock_t	ltnew;		/* useful start bno of left side */
+	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
+#ifdef DEBUG
+	/*
+	 * Randomly don't execute the first algorithm.
+	 */
+	int		dofirst;	/* set to do first algorithm */
+
+	dofirst = prandom_u32() & 1;
+#endif
+
+restart:
+	bno_cur_lt = NULL;
+	bno_cur_gt = NULL;
+	ltlen = 0;
+	gtlena = 0;
+	ltlena = 0;
+
+	/*
+	 * Get a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
+
+	/*
+	 * See if there are any free extents as big as maxlen.
+	 */
+	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
+		goto error0;
+	/*
+	 * If none, then pick up the last entry in the tree unless the
+	 * tree is empty.
+	 */
+	if (!i) {
+		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
+				&ltlen, &i)))
+			goto error0;
+		if (i == 0 || ltlen == 0) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_near_noentry(args);
+			return 0;
+		}
+		ASSERT(i == 1);
+	}
+	args->wasfromfl = 0;
+
+	/*
+	 * First algorithm.
+	 * If the requested extent is large wrt the freespaces available
+	 * in this a.g., then the cursor will be pointing to a btree entry
+	 * near the right edge of the tree.  If it's in the last btree leaf
+	 * block, then we just examine all the entries in that block
+	 * that are big enough, and pick the best one.
+	 * This is written as a while loop so we can break out of it,
+	 * but we never loop back to the top.
+	 */
+	while (xfs_btree_islastblock(cnt_cur, 0)) {
+		xfs_extlen_t	bdiff;
+		int		besti=0;
+		xfs_extlen_t	blen=0;
+		xfs_agblock_t	bnew=0;
+
+#ifdef DEBUG
+		if (dofirst)
+			break;
+#endif
+		/*
+		 * Start from the entry that lookup found, sequence through
+		 * all larger free blocks.  If we're actually pointing at a
+		 * record smaller than maxlen, go to the start of this block,
+		 * and skip all those smaller than minlen.
+		 */
+		if (ltlen || args->alignment > 1) {
+			cnt_cur->bc_ptrs[0] = 1;
+			do {
+				if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
+						&ltlen, &i)))
+					goto error0;
+				XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+				if (ltlen >= args->minlen)
+					break;
+				if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
+					goto error0;
+			} while (i);
+			ASSERT(ltlen >= args->minlen);
+			if (!i)
+				break;
+		}
+		i = cnt_cur->bc_ptrs[0];
+		for (j = 1, blen = 0, bdiff = 0;
+		     !error && j && (blen < args->maxlen || bdiff > 0);
+		     error = xfs_btree_increment(cnt_cur, 0, &j)) {
+			/*
+			 * For each entry, decide if it's better than
+			 * the previous best entry.
+			 */
+			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
+			if (ltlena < args->minlen)
+				continue;
+			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			ASSERT(args->len >= args->minlen);
+			if (args->len < blen)
+				continue;
+			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, args->userdata, ltbnoa,
+				ltlena, &ltnew);
+			if (ltnew != NULLAGBLOCK &&
+			    (args->len > blen || ltdiff < bdiff)) {
+				bdiff = ltdiff;
+				bnew = ltnew;
+				blen = args->len;
+				besti = cnt_cur->bc_ptrs[0];
+			}
+		}
+		/*
+		 * It didn't work.  We COULD be in a case where
+		 * there's a good record somewhere, so try again.
+		 */
+		if (blen == 0)
+			break;
+		/*
+		 * Point at the best entry, and retrieve it again.
+		 */
+		cnt_cur->bc_ptrs[0] = besti;
+		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+		args->len = blen;
+		if (!xfs_alloc_fix_minleft(args)) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_near_nominleft(args);
+			return 0;
+		}
+		blen = args->len;
+		/*
+		 * We are allocating starting at bnew for blen blocks.
+		 */
+		args->agbno = bnew;
+		ASSERT(bnew >= ltbno);
+		ASSERT(bnew + blen <= ltbno + ltlen);
+		/*
+		 * Set up a cursor for the by-bno tree.
+		 */
+		bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+			args->agbp, args->agno, XFS_BTNUM_BNO);
+		/*
+		 * Fix up the btree entries.
+		 */
+		if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
+				ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
+			goto error0;
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+
+		trace_xfs_alloc_near_first(args);
+		return 0;
+	}
+	/*
+	 * Second algorithm.
+	 * Search in the by-bno tree to the left and to the right
+	 * simultaneously, until in each case we find a space big enough,
+	 * or run into the edge of the tree.  When we run into the edge,
+	 * we deallocate that cursor.
+	 * If both searches succeed, we compare the two spaces and pick
+	 * the better one.
+	 * With alignment, it's possible for both to fail; the upper
+	 * level algorithm that picks allocation groups for allocations
+	 * is not supposed to do this.
+	 */
+	/*
+	 * Allocate and initialize the cursor for the leftward search.
+	 */
+	bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO);
+	/*
+	 * Lookup <= bno to find the leftward search's starting point.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * Didn't find anything; use this cursor for the rightward
+		 * search.
+		 */
+		bno_cur_gt = bno_cur_lt;
+		bno_cur_lt = NULL;
+	}
+	/*
+	 * Found something.  Duplicate the cursor for the rightward search.
+	 */
+	else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
+		goto error0;
+	/*
+	 * Increment the cursor, so we will point at the entry just right
+	 * of the leftward entry if any, or to the leftmost entry.
+	 */
+	if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * It failed, there are no rightward entries.
+		 */
+		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
+		bno_cur_gt = NULL;
+	}
+	/*
+	 * Loop going left with the leftward cursor, right with the
+	 * rightward cursor, until either both directions give up or
+	 * we find an entry at least as big as minlen.
+	 */
+	do {
+		if (bno_cur_lt) {
+			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
+			if (ltlena >= args->minlen)
+				break;
+			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
+				goto error0;
+			if (!i) {
+				xfs_btree_del_cursor(bno_cur_lt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_lt = NULL;
+			}
+		}
+		if (bno_cur_gt) {
+			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+			xfs_alloc_compute_aligned(args, gtbno, gtlen,
+						  &gtbnoa, &gtlena);
+			if (gtlena >= args->minlen)
+				break;
+			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+				goto error0;
+			if (!i) {
+				xfs_btree_del_cursor(bno_cur_gt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_gt = NULL;
+			}
+		}
+	} while (bno_cur_lt || bno_cur_gt);
+
+	/*
+	 * Got both cursors still active, need to find better entry.
+	 */
+	if (bno_cur_lt && bno_cur_gt) {
+		if (ltlena >= args->minlen) {
+			/*
+			 * Left side is good, look for a right side entry.
+			 */
+			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, args->userdata, ltbnoa,
+				ltlena, &ltnew);
+
+			error = xfs_alloc_find_best_extent(args,
+						&bno_cur_lt, &bno_cur_gt,
+						ltdiff, &gtbno, &gtlen,
+						&gtbnoa, &gtlena,
+						0 /* search right */);
+		} else {
+			ASSERT(gtlena >= args->minlen);
+
+			/*
+			 * Right side is good, look for a left side entry.
+			 */
+			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, args->userdata, gtbnoa,
+				gtlena, &gtnew);
+
+			error = xfs_alloc_find_best_extent(args,
+						&bno_cur_gt, &bno_cur_lt,
+						gtdiff, &ltbno, &ltlen,
+						&ltbnoa, &ltlena,
+						1 /* search left */);
+		}
+
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * If we couldn't get anything, give up.
+	 */
+	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+		if (!forced++) {
+			trace_xfs_alloc_near_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+		trace_xfs_alloc_size_neither(args);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+
+	/*
+	 * At this point we have selected a freespace entry, either to the
+	 * left or to the right.  If it's on the right, copy all the
+	 * useful variables to the "left" set so we only have one
+	 * copy of this code.
+	 */
+	if (bno_cur_gt) {
+		bno_cur_lt = bno_cur_gt;
+		bno_cur_gt = NULL;
+		ltbno = gtbno;
+		ltbnoa = gtbnoa;
+		ltlen = gtlen;
+		ltlena = gtlena;
+		j = 1;
+	} else
+		j = 0;
+
+	/*
+	 * Fix up the length and compute the useful address.
+	 */
+	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args)) {
+		trace_xfs_alloc_near_nominleft(args);
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		return 0;
+	}
+	rlen = args->len;
+	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+				     args->userdata, ltbnoa, ltlena, &ltnew);
+	ASSERT(ltnew >= ltbno);
+	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
+	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	args->agbno = ltnew;
+
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
+			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
+		goto error0;
+
+	if (j)
+		trace_xfs_alloc_near_greater(args);
+	else
+		trace_xfs_alloc_near_lesser(args);
+
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+	return 0;
+
+ error0:
+	trace_xfs_alloc_near_error(args);
+	if (cnt_cur != NULL)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	if (bno_cur_lt != NULL)
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
+	if (bno_cur_gt != NULL)
+		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int				/* error */
+xfs_alloc_ag_vextent_size(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_btree_cur_t	*bno_cur;	/* cursor for bno btree */
+	xfs_btree_cur_t	*cnt_cur;	/* cursor for cnt btree */
+	int		error;		/* error result */
+	xfs_agblock_t	fbno;		/* start of found freespace */
+	xfs_extlen_t	flen;		/* length of found freespace */
+	int		i;		/* temp status variable */
+	xfs_agblock_t	rbno;		/* returned block number */
+	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
+
+restart:
+	/*
+	 * Allocate and initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
+	bno_cur = NULL;
+
+	/*
+	 * Look for an entry >= maxlen+alignment-1 blocks.
+	 */
+	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+			args->maxlen + args->alignment - 1, &i)))
+		goto error0;
+
+	/*
+	 * If none or we have busy extents that we cannot allocate from, then
+	 * we have to settle for a smaller extent. In the case that there are
+	 * no large extents, this will return the last entry in the tree unless
+	 * the tree is empty. In the case that there are only busy large
+	 * extents, this will return the largest small extent unless there
+	 * are no smaller extents available.
+	 */
+	if (!i || forced > 1) {
+		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+						   &fbno, &flen, &i);
+		if (error)
+			goto error0;
+		if (i == 0 || flen == 0) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_size_noentry(args);
+			return 0;
+		}
+		ASSERT(i == 1);
+		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+	} else {
+		/*
+		 * Search for a non-busy extent that is large enough.
+		 * If we are at low space, don't check, or if we fall of
+		 * the end of the btree, turn off the busy check and
+		 * restart.
+		 */
+		for (;;) {
+			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
+
+			if (rlen >= args->maxlen)
+				break;
+
+			error = xfs_btree_increment(cnt_cur, 0, &i);
+			if (error)
+				goto error0;
+			if (i == 0) {
+				/*
+				 * Our only valid extents must have been busy.
+				 * Make it unbusy by forcing the log out and
+				 * retrying. If we've been here before, forcing
+				 * the log isn't making the extents available,
+				 * which means they have probably been freed in
+				 * this transaction.  In that case, we have to
+				 * give up on them and we'll attempt a minlen
+				 * allocation the next time around.
+				 */
+				xfs_btree_del_cursor(cnt_cur,
+						     XFS_BTREE_NOERROR);
+				trace_xfs_alloc_size_busy(args);
+				if (!forced++)
+					xfs_log_force(args->mp, XFS_LOG_SYNC);
+				goto restart;
+			}
+		}
+	}
+
+	/*
+	 * In the first case above, we got the last entry in the
+	 * by-size btree.  Now we check to see if the space hits maxlen
+	 * once aligned; if not, we search left for something better.
+	 * This can't happen in the second case above.
+	 */
+	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+	XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
+			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
+	if (rlen < args->maxlen) {
+		xfs_agblock_t	bestfbno;
+		xfs_extlen_t	bestflen;
+		xfs_agblock_t	bestrbno;
+		xfs_extlen_t	bestrlen;
+
+		bestrlen = rlen;
+		bestrbno = rbno;
+		bestflen = flen;
+		bestfbno = fbno;
+		for (;;) {
+			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
+				goto error0;
+			if (i == 0)
+				break;
+			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+					&i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+			if (flen < bestrlen)
+				break;
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
+			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+			XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
+				(rlen <= flen && rbno + rlen <= fbno + flen),
+				error0);
+			if (rlen > bestrlen) {
+				bestrlen = rlen;
+				bestrbno = rbno;
+				bestflen = flen;
+				bestfbno = fbno;
+				if (rlen == args->maxlen)
+					break;
+			}
+		}
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+				&i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+		rlen = bestrlen;
+		rbno = bestrbno;
+		flen = bestflen;
+		fbno = bestfbno;
+	}
+	args->wasfromfl = 0;
+	/*
+	 * Fix up the length.
+	 */
+	args->len = rlen;
+	if (rlen < args->minlen) {
+		if (!forced++) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_size_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+		goto out_nominleft;
+	}
+	xfs_alloc_fix_len(args);
+
+	if (!xfs_alloc_fix_minleft(args))
+		goto out_nominleft;
+	rlen = args->len;
+	XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
+	/*
+	 * Allocate and initialize a cursor for the by-block tree.
+	 */
+	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO);
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+			rbno, rlen, XFSA_FIXUP_CNT_OK)))
+		goto error0;
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	cnt_cur = bno_cur = NULL;
+	args->len = rlen;
+	args->agbno = rbno;
+	XFS_WANT_CORRUPTED_GOTO(args->mp,
+		args->agbno + args->len <=
+			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+		error0);
+	trace_xfs_alloc_size_done(args);
+	return 0;
+
+error0:
+	trace_xfs_alloc_size_error(args);
+	if (cnt_cur)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	if (bno_cur)
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	return error;
+
+out_nominleft:
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	trace_xfs_alloc_size_nominleft(args);
+	args->agbno = NULLAGBLOCK;
+	return 0;
+}
+
+/*
+ * Deal with the case where only small freespaces remain.
+ * Either return the contents of the last freespace record,
+ * or allocate space from the freelist if there is nothing in the tree.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_small(
+	xfs_alloc_arg_t	*args,	/* allocation argument structure */
+	xfs_btree_cur_t	*ccur,	/* by-size cursor */
+	xfs_agblock_t	*fbnop,	/* result block number */
+	xfs_extlen_t	*flenp,	/* result length */
+	int		*stat)	/* status: 0-freelist, 1-normal/none */
+{
+	int		error;
+	xfs_agblock_t	fbno;
+	xfs_extlen_t	flen;
+	int		i;
+
+	if ((error = xfs_btree_decrement(ccur, 0, &i)))
+		goto error0;
+	if (i) {
+		if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+	}
+	/*
+	 * Nothing in the btree, try the freelist.  Make sure
+	 * to respect minleft even when pulling from the
+	 * freelist.
+	 */
+	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+		 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
+		  > args->minleft)) {
+		error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+		if (error)
+			goto error0;
+		if (fbno != NULLAGBLOCK) {
+			xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+					     args->userdata);
+
+			if (args->userdata) {
+				xfs_buf_t	*bp;
+
+				bp = xfs_btree_get_bufs(args->mp, args->tp,
+					args->agno, fbno, 0);
+				xfs_trans_binval(args->tp, bp);
+			}
+			args->len = 1;
+			args->agbno = fbno;
+			XFS_WANT_CORRUPTED_GOTO(args->mp,
+				args->agbno + args->len <=
+				be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+				error0);
+			args->wasfromfl = 1;
+			trace_xfs_alloc_small_freelist(args);
+			*stat = 0;
+			return 0;
+		}
+		/*
+		 * Nothing in the freelist.
+		 */
+		else
+			flen = 0;
+	}
+	/*
+	 * Can't allocate from the freelist for some reason.
+	 */
+	else {
+		fbno = NULLAGBLOCK;
+		flen = 0;
+	}
+	/*
+	 * Can't do the allocation, give up.
+	 */
+	if (flen < args->minlen) {
+		args->agbno = NULLAGBLOCK;
+		trace_xfs_alloc_small_notenough(args);
+		flen = 0;
+	}
+	*fbnop = fbno;
+	*flenp = flen;
+	*stat = 1;
+	trace_xfs_alloc_small_done(args);
+	return 0;
+
+error0:
+	trace_xfs_alloc_small_error(args);
+	return error;
+}
+
+/*
+ * Free the extent starting at agno/bno for length.
+ */
+STATIC int			/* error */
+xfs_free_ag_extent(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*agbp,	/* buffer for a.g. freelist header */
+	xfs_agnumber_t	agno,	/* allocation group number */
+	xfs_agblock_t	bno,	/* starting block number */
+	xfs_extlen_t	len,	/* length of extent */
+	int		isfl)	/* set if is freelist blocks - no sb acctg */
+{
+	xfs_btree_cur_t	*bno_cur;	/* cursor for by-block btree */
+	xfs_btree_cur_t	*cnt_cur;	/* cursor for by-size btree */
+	int		error;		/* error return value */
+	xfs_agblock_t	gtbno;		/* start of right neighbor block */
+	xfs_extlen_t	gtlen;		/* length of right neighbor block */
+	int		haveleft;	/* have a left neighbor block */
+	int		haveright;	/* have a right neighbor block */
+	int		i;		/* temp, result code */
+	xfs_agblock_t	ltbno;		/* start of left neighbor block */
+	xfs_extlen_t	ltlen;		/* length of left neighbor block */
+	xfs_mount_t	*mp;		/* mount point struct for filesystem */
+	xfs_agblock_t	nbno;		/* new starting block of freespace */
+	xfs_extlen_t	nlen;		/* new length of freespace */
+	xfs_perag_t	*pag;		/* per allocation group data */
+
+	mp = tp->t_mountp;
+	/*
+	 * Allocate and initialize a cursor for the by-block btree.
+	 */
+	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+	cnt_cur = NULL;
+	/*
+	 * Look for a neighboring block on the left (lower block numbers)
+	 * that is contiguous with this space.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+		goto error0;
+	if (haveleft) {
+		/*
+		 * There is a block to our left.
+		 */
+		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		/*
+		 * It's not contiguous, though.
+		 */
+		if (ltbno + ltlen < bno)
+			haveleft = 0;
+		else {
+			/*
+			 * If this failure happens the request to free this
+			 * space was invalid, it's (partly) already free.
+			 * Very bad.
+			 */
+			XFS_WANT_CORRUPTED_GOTO(mp,
+						ltbno + ltlen <= bno, error0);
+		}
+	}
+	/*
+	 * Look for a neighboring block on the right (higher block numbers)
+	 * that is contiguous with this space.
+	 */
+	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
+		goto error0;
+	if (haveright) {
+		/*
+		 * There is a block to our right.
+		 */
+		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		/*
+		 * It's not contiguous, though.
+		 */
+		if (bno + len < gtbno)
+			haveright = 0;
+		else {
+			/*
+			 * If this failure happens the request to free this
+			 * space was invalid, it's (partly) already free.
+			 * Very bad.
+			 */
+			XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
+		}
+	}
+	/*
+	 * Now allocate and initialize a cursor for the by-size tree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
+	/*
+	 * Have both left and right contiguous neighbors.
+	 * Merge all three into a single free block.
+	 */
+	if (haveleft && haveright) {
+		/*
+		 * Delete the old by-size entry on the left.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		/*
+		 * Delete the old by-size entry on the right.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		/*
+		 * Delete the old by-block entry for the right block.
+		 */
+		if ((error = xfs_btree_delete(bno_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		/*
+		 * Move the by-block cursor back to the left neighbor.
+		 */
+		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+#ifdef DEBUG
+		/*
+		 * Check that this is the right record: delete didn't
+		 * mangle the cursor.
+		 */
+		{
+			xfs_agblock_t	xxbno;
+			xfs_extlen_t	xxlen;
+
+			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+					&i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(mp,
+				i == 1 && xxbno == ltbno && xxlen == ltlen,
+				error0);
+		}
+#endif
+		/*
+		 * Update remaining by-block entry to the new, joined block.
+		 */
+		nbno = ltbno;
+		nlen = len + ltlen + gtlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * Have only a left contiguous neighbor.
+	 * Merge it together with the new freespace.
+	 */
+	else if (haveleft) {
+		/*
+		 * Delete the old by-size entry on the left.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		/*
+		 * Back up the by-block cursor to the left neighbor, and
+		 * update its length.
+		 */
+		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		nbno = ltbno;
+		nlen = len + ltlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * Have only a right contiguous neighbor.
+	 * Merge it together with the new freespace.
+	 */
+	else if (haveright) {
+		/*
+		 * Delete the old by-size entry on the right.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		/*
+		 * Update the starting block and length of the right
+		 * neighbor in the by-block tree.
+		 */
+		nbno = bno;
+		nlen = len + gtlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * No contiguous neighbors.
+	 * Insert the new freespace into the by-block tree.
+	 */
+	else {
+		nbno = bno;
+		nlen = len;
+		if ((error = xfs_btree_insert(bno_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+	}
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	bno_cur = NULL;
+	/*
+	 * In all cases we need to insert the new freespace in the by-size tree.
+	 */
+	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
+	if ((error = xfs_btree_insert(cnt_cur, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	cnt_cur = NULL;
+
+	/*
+	 * Update the freespace totals in the ag and superblock.
+	 */
+	pag = xfs_perag_get(mp, agno);
+	error = xfs_alloc_update_counters(tp, pag, agbp, len);
+	xfs_perag_put(pag);
+	if (error)
+		goto error0;
+
+	if (!isfl)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+	XFS_STATS_INC(xs_freex);
+	XFS_STATS_ADD(xs_freeb, len);
+
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+
+	return 0;
+
+ error0:
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+	if (bno_cur)
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	if (cnt_cur)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
+	maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
+	minleafrecs = mp->m_alloc_mnr[0];
+	minnoderecs = mp->m_alloc_mnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_ag_maxlevels = level;
+}
+
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag)
+{
+	xfs_extlen_t		need, delta = 0;
+
+	need = XFS_MIN_FREELIST_PAG(pag, mp);
+	if (need > pag->pagf_flcount)
+		delta = need - pag->pagf_flcount;
+
+	if (pag->pagf_longest > delta)
+		return pag->pagf_longest - delta;
+	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ */
+STATIC int			/* error */
+xfs_alloc_fix_freelist(
+	xfs_alloc_arg_t	*args,	/* allocation argument structure */
+	int		flags)	/* XFS_ALLOC_FLAG_... */
+{
+	xfs_buf_t	*agbp;	/* agf buffer pointer */
+	xfs_agf_t	*agf;	/* a.g. freespace structure pointer */
+	xfs_buf_t	*agflbp;/* agfl buffer pointer */
+	xfs_agblock_t	bno;	/* freelist block */
+	xfs_extlen_t	delta;	/* new blocks needed in freelist */
+	int		error;	/* error result code */
+	xfs_extlen_t	longest;/* longest extent in allocation group */
+	xfs_mount_t	*mp;	/* file system mount point structure */
+	xfs_extlen_t	need;	/* total blocks needed in freelist */
+	xfs_perag_t	*pag;	/* per-ag information structure */
+	xfs_alloc_arg_t	targs;	/* local allocation arguments */
+	xfs_trans_t	*tp;	/* transaction pointer */
+
+	mp = args->mp;
+
+	pag = args->pag;
+	tp = args->tp;
+	if (!pag->pagf_init) {
+		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+				&agbp)))
+			return error;
+		if (!pag->pagf_init) {
+			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+			args->agbp = NULL;
+			return 0;
+		}
+	} else
+		agbp = NULL;
+
+	/*
+	 * If this is a metadata preferred pag and we are user data
+	 * then try somewhere else if we are not being asked to
+	 * try harder at this point
+	 */
+	if (pag->pagf_metadata && args->userdata &&
+	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+		args->agbp = NULL;
+		return 0;
+	}
+
+	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+		/*
+		 * If it looks like there isn't a long enough extent, or enough
+		 * total blocks, reject it.
+		 */
+		need = XFS_MIN_FREELIST_PAG(pag, mp);
+		longest = xfs_alloc_longest_free_extent(mp, pag);
+		if ((args->minlen + args->alignment + args->minalignslop - 1) >
+				longest ||
+		    ((int)(pag->pagf_freeblks + pag->pagf_flcount -
+			   need - args->total) < (int)args->minleft)) {
+			if (agbp)
+				xfs_trans_brelse(tp, agbp);
+			args->agbp = NULL;
+			return 0;
+		}
+	}
+
+	/*
+	 * Get the a.g. freespace buffer.
+	 * Can fail if we're not blocking on locks, and it's held.
+	 */
+	if (agbp == NULL) {
+		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+				&agbp)))
+			return error;
+		if (agbp == NULL) {
+			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+			args->agbp = NULL;
+			return 0;
+		}
+	}
+	/*
+	 * Figure out how many blocks we should have in the freelist.
+	 */
+	agf = XFS_BUF_TO_AGF(agbp);
+	need = XFS_MIN_FREELIST(agf, mp);
+	/*
+	 * If there isn't enough total or single-extent, reject it.
+	 */
+	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+		delta = need > be32_to_cpu(agf->agf_flcount) ?
+			(need - be32_to_cpu(agf->agf_flcount)) : 0;
+		longest = be32_to_cpu(agf->agf_longest);
+		longest = (longest > delta) ? (longest - delta) :
+			(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
+		if ((args->minlen + args->alignment + args->minalignslop - 1) >
+				longest ||
+		    ((int)(be32_to_cpu(agf->agf_freeblks) +
+		     be32_to_cpu(agf->agf_flcount) - need - args->total) <
+				(int)args->minleft)) {
+			xfs_trans_brelse(tp, agbp);
+			args->agbp = NULL;
+			return 0;
+		}
+	}
+	/*
+	 * Make the freelist shorter if it's too long.
+	 */
+	while (be32_to_cpu(agf->agf_flcount) > need) {
+		xfs_buf_t	*bp;
+
+		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+		if (error)
+			return error;
+		if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
+			return error;
+		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+		xfs_trans_binval(tp, bp);
+	}
+	/*
+	 * Initialize the args structure.
+	 */
+	memset(&targs, 0, sizeof(targs));
+	targs.tp = tp;
+	targs.mp = mp;
+	targs.agbp = agbp;
+	targs.agno = args->agno;
+	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+	targs.type = XFS_ALLOCTYPE_THIS_AG;
+	targs.pag = pag;
+	if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
+		return error;
+	/*
+	 * Make the freelist longer if it's too short.
+	 */
+	while (be32_to_cpu(agf->agf_flcount) < need) {
+		targs.agbno = 0;
+		targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
+		/*
+		 * Allocate as many blocks as possible at once.
+		 */
+		if ((error = xfs_alloc_ag_vextent(&targs))) {
+			xfs_trans_brelse(tp, agflbp);
+			return error;
+		}
+		/*
+		 * Stop if we run out.  Won't happen if callers are obeying
+		 * the restrictions correctly.  Can happen for free calls
+		 * on a completely full ag.
+		 */
+		if (targs.agbno == NULLAGBLOCK) {
+			if (flags & XFS_ALLOC_FLAG_FREEING)
+				break;
+			xfs_trans_brelse(tp, agflbp);
+			args->agbp = NULL;
+			return 0;
+		}
+		/*
+		 * Put each allocated block on the list.
+		 */
+		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+			error = xfs_alloc_put_freelist(tp, agbp,
+							agflbp, bno, 0);
+			if (error)
+				return error;
+		}
+	}
+	xfs_trans_brelse(tp, agflbp);
+	args->agbp = agbp;
+	return 0;
+}
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int				/* error */
+xfs_alloc_get_freelist(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
+	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
+	int		btreeblk) /* destination is a AGF btree */
+{
+	xfs_agf_t	*agf;	/* a.g. freespace structure */
+	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
+	xfs_agblock_t	bno;	/* block number returned */
+	__be32		*agfl_bno;
+	int		error;
+	int		logflags;
+	xfs_mount_t	*mp = tp->t_mountp;
+	xfs_perag_t	*pag;	/* per allocation group data */
+
+	/*
+	 * Freelist is empty, give up.
+	 */
+	agf = XFS_BUF_TO_AGF(agbp);
+	if (!agf->agf_flcount) {
+		*bnop = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * Read the array of free blocks.
+	 */
+	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+				    &agflbp);
+	if (error)
+		return error;
+
+
+	/*
+	 * Get the block number and update the data structures.
+	 */
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+	be32_add_cpu(&agf->agf_flfirst, 1);
+	xfs_trans_brelse(tp, agflbp);
+	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
+		agf->agf_flfirst = 0;
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	be32_add_cpu(&agf->agf_flcount, -1);
+	xfs_trans_agflist_delta(tp, -1);
+	pag->pagf_flcount--;
+	xfs_perag_put(pag);
+
+	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+	if (btreeblk) {
+		be32_add_cpu(&agf->agf_btreeblks, 1);
+		pag->pagf_btreeblks++;
+		logflags |= XFS_AGF_BTREEBLKS;
+	}
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
+	*bnop = bno;
+
+	return 0;
+}
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*bp,	/* buffer for a.g. freelist header */
+	int		fields)	/* mask of fields to be logged (XFS_AGF_...) */
+{
+	int	first;		/* first byte offset */
+	int	last;		/* last byte offset */
+	static const short	offsets[] = {
+		offsetof(xfs_agf_t, agf_magicnum),
+		offsetof(xfs_agf_t, agf_versionnum),
+		offsetof(xfs_agf_t, agf_seqno),
+		offsetof(xfs_agf_t, agf_length),
+		offsetof(xfs_agf_t, agf_roots[0]),
+		offsetof(xfs_agf_t, agf_levels[0]),
+		offsetof(xfs_agf_t, agf_flfirst),
+		offsetof(xfs_agf_t, agf_fllast),
+		offsetof(xfs_agf_t, agf_flcount),
+		offsetof(xfs_agf_t, agf_freeblks),
+		offsetof(xfs_agf_t, agf_longest),
+		offsetof(xfs_agf_t, agf_btreeblks),
+		offsetof(xfs_agf_t, agf_uuid),
+		sizeof(xfs_agf_t)
+	};
+
+	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
+	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
+	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
+}
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int					/* error */
+xfs_alloc_pagf_init(
+	xfs_mount_t		*mp,	/* file system mount structure */
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags)	/* XFS_ALLOC_FLAGS_... */
+{
+	xfs_buf_t		*bp;
+	int			error;
+
+	if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
+		return error;
+	if (bp)
+		xfs_trans_brelse(tp, bp);
+	return 0;
+}
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int					/* error */
+xfs_alloc_put_freelist(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
+	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
+	xfs_agblock_t		bno,	/* block being freed */
+	int			btreeblk) /* block came from a AGF btree */
+{
+	xfs_agf_t		*agf;	/* a.g. freespace structure */
+	__be32			*blockp;/* pointer to array entry */
+	int			error;
+	int			logflags;
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_perag_t		*pag;	/* per allocation group data */
+	__be32			*agfl_bno;
+	int			startoff;
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	mp = tp->t_mountp;
+
+	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
+			be32_to_cpu(agf->agf_seqno), &agflbp)))
+		return error;
+	be32_add_cpu(&agf->agf_fllast, 1);
+	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
+		agf->agf_fllast = 0;
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	be32_add_cpu(&agf->agf_flcount, 1);
+	xfs_trans_agflist_delta(tp, 1);
+	pag->pagf_flcount++;
+
+	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+	if (btreeblk) {
+		be32_add_cpu(&agf->agf_btreeblks, -1);
+		pag->pagf_btreeblks--;
+		logflags |= XFS_AGF_BTREEBLKS;
+	}
+	xfs_perag_put(pag);
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
+
+	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
+
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
+	*blockp = cpu_to_be32(bno);
+	startoff = (char *)blockp - (char *)agflbp->b_addr;
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
+
+	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+	xfs_trans_log_buf(tp, agflbp, startoff,
+			  startoff + sizeof(xfs_agblock_t) - 1);
+	return 0;
+}
+
+static bool
+xfs_agf_verify(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp)
+ {
+	struct xfs_agf	*agf = XFS_BUF_TO_AGF(bp);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+			return false;
+
+	if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+	      XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+	      be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+	      be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+	      be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+		return false;
+
+	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
+	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
+		return false;
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+		return false;
+
+	return true;;
+
+}
+
+static void
+xfs_agf_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+				XFS_ERRTAG_ALLOC_READ_AGF,
+				XFS_RANDOM_ALLOC_READ_AGF))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_agf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_agf_verify(mp, bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+	.verify_read = xfs_agf_read_verify,
+	.verify_write = xfs_agf_write_verify,
+};
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error */
+xfs_read_agf(
+	struct xfs_mount	*mp,	/* mount point structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags,	/* XFS_BUF_ */
+	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
+{
+	int		error;
+
+	trace_xfs_read_agf(mp, agno);
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_trans_read_buf(
+			mp, tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
+	if (error)
+		return error;
+	if (!*bpp)
+		return 0;
+
+	ASSERT(!(*bpp)->b_error);
+	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+	return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error */
+xfs_alloc_read_agf(
+	struct xfs_mount	*mp,	/* mount point structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags,	/* XFS_ALLOC_FLAG_... */
+	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
+{
+	struct xfs_agf		*agf;		/* ag freelist header */
+	struct xfs_perag	*pag;		/* per allocation group data */
+	int			error;
+
+	trace_xfs_alloc_read_agf(mp, agno);
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_read_agf(mp, tp, agno,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+			bpp);
+	if (error)
+		return error;
+	if (!*bpp)
+		return 0;
+	ASSERT(!(*bpp)->b_error);
+
+	agf = XFS_BUF_TO_AGF(*bpp);
+	pag = xfs_perag_get(mp, agno);
+	if (!pag->pagf_init) {
+		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
+		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+		pag->pagf_levels[XFS_BTNUM_BNOi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+		pag->pagf_levels[XFS_BTNUM_CNTi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+		spin_lock_init(&pag->pagb_lock);
+		pag->pagb_count = 0;
+		pag->pagb_tree = RB_ROOT;
+		pag->pagf_init = 1;
+	}
+#ifdef DEBUG
+	else if (!XFS_FORCED_SHUTDOWN(mp)) {
+		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
+		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
+		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
+		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
+		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+	}
+#endif
+	xfs_perag_put(pag);
+	return 0;
+}
+
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ */
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t	*args)	/* allocation argument structure */
+{
+	xfs_agblock_t	agsize;	/* allocation group size */
+	int		error;
+	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
+	xfs_extlen_t	minleft;/* minimum left value, temp copy */
+	xfs_mount_t	*mp;	/* mount structure pointer */
+	xfs_agnumber_t	sagno;	/* starting allocation group number */
+	xfs_alloctype_t	type;	/* input allocation type */
+	int		bump_rotor = 0;
+	int		no_min = 0;
+	xfs_agnumber_t	rotorstep = xfs_rotorstep; /* inode32 agf stepper */
+
+	mp = args->mp;
+	type = args->otype = args->type;
+	args->agbno = NULLAGBLOCK;
+	/*
+	 * Just fix this up, for the case where the last a.g. is shorter
+	 * (or there's only one a.g.) and the caller couldn't easily figure
+	 * that out (xfs_bmap_alloc).
+	 */
+	agsize = mp->m_sb.sb_agblocks;
+	if (args->maxlen > agsize)
+		args->maxlen = agsize;
+	if (args->alignment == 0)
+		args->alignment = 1;
+	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->minlen <= agsize);
+	ASSERT(args->mod < args->prod);
+	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+	    args->minlen > args->maxlen || args->minlen > agsize ||
+	    args->mod >= args->prod) {
+		args->fsbno = NULLFSBLOCK;
+		trace_xfs_alloc_vextent_badargs(args);
+		return 0;
+	}
+	minleft = args->minleft;
+
+	switch (type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+	case XFS_ALLOCTYPE_NEAR_BNO:
+	case XFS_ALLOCTYPE_THIS_BNO:
+		/*
+		 * These three force us into a single a.g.
+		 */
+		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+		args->pag = xfs_perag_get(mp, args->agno);
+		args->minleft = 0;
+		error = xfs_alloc_fix_freelist(args, 0);
+		args->minleft = minleft;
+		if (error) {
+			trace_xfs_alloc_vextent_nofix(args);
+			goto error0;
+		}
+		if (!args->agbp) {
+			trace_xfs_alloc_vextent_noagbp(args);
+			break;
+		}
+		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+		if ((error = xfs_alloc_ag_vextent(args)))
+			goto error0;
+		break;
+	case XFS_ALLOCTYPE_START_BNO:
+		/*
+		 * Try near allocation first, then anywhere-in-ag after
+		 * the first a.g. fails.
+		 */
+		if ((args->userdata  == XFS_ALLOC_INITIAL_USER_DATA) &&
+		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+			args->fsbno = XFS_AGB_TO_FSB(mp,
+					((mp->m_agfrotor / rotorstep) %
+					mp->m_sb.sb_agcount), 0);
+			bump_rotor = 1;
+		}
+		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+		args->type = XFS_ALLOCTYPE_NEAR_BNO;
+		/* FALLTHROUGH */
+	case XFS_ALLOCTYPE_ANY_AG:
+	case XFS_ALLOCTYPE_START_AG:
+	case XFS_ALLOCTYPE_FIRST_AG:
+		/*
+		 * Rotate through the allocation groups looking for a winner.
+		 */
+		if (type == XFS_ALLOCTYPE_ANY_AG) {
+			/*
+			 * Start with the last place we left off.
+			 */
+			args->agno = sagno = (mp->m_agfrotor / rotorstep) %
+					mp->m_sb.sb_agcount;
+			args->type = XFS_ALLOCTYPE_THIS_AG;
+			flags = XFS_ALLOC_FLAG_TRYLOCK;
+		} else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+			/*
+			 * Start with allocation group given by bno.
+			 */
+			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+			args->type = XFS_ALLOCTYPE_THIS_AG;
+			sagno = 0;
+			flags = 0;
+		} else {
+			if (type == XFS_ALLOCTYPE_START_AG)
+				args->type = XFS_ALLOCTYPE_THIS_AG;
+			/*
+			 * Start with the given allocation group.
+			 */
+			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+			flags = XFS_ALLOC_FLAG_TRYLOCK;
+		}
+		/*
+		 * Loop over allocation groups twice; first time with
+		 * trylock set, second time without.
+		 */
+		for (;;) {
+			args->pag = xfs_perag_get(mp, args->agno);
+			if (no_min) args->minleft = 0;
+			error = xfs_alloc_fix_freelist(args, flags);
+			args->minleft = minleft;
+			if (error) {
+				trace_xfs_alloc_vextent_nofix(args);
+				goto error0;
+			}
+			/*
+			 * If we get a buffer back then the allocation will fly.
+			 */
+			if (args->agbp) {
+				if ((error = xfs_alloc_ag_vextent(args)))
+					goto error0;
+				break;
+			}
+
+			trace_xfs_alloc_vextent_loopfailed(args);
+
+			/*
+			 * Didn't work, figure out the next iteration.
+			 */
+			if (args->agno == sagno &&
+			    type == XFS_ALLOCTYPE_START_BNO)
+				args->type = XFS_ALLOCTYPE_THIS_AG;
+			/*
+			* For the first allocation, we can try any AG to get
+			* space.  However, if we already have allocated a
+			* block, we don't want to try AGs whose number is below
+			* sagno. Otherwise, we may end up with out-of-order
+			* locking of AGF, which might cause deadlock.
+			*/
+			if (++(args->agno) == mp->m_sb.sb_agcount) {
+				if (args->firstblock != NULLFSBLOCK)
+					args->agno = sagno;
+				else
+					args->agno = 0;
+			}
+			/*
+			 * Reached the starting a.g., must either be done
+			 * or switch to non-trylock mode.
+			 */
+			if (args->agno == sagno) {
+				if (no_min == 1) {
+					args->agbno = NULLAGBLOCK;
+					trace_xfs_alloc_vextent_allfailed(args);
+					break;
+				}
+				if (flags == 0) {
+					no_min = 1;
+				} else {
+					flags = 0;
+					if (type == XFS_ALLOCTYPE_START_BNO) {
+						args->agbno = XFS_FSB_TO_AGBNO(mp,
+							args->fsbno);
+						args->type = XFS_ALLOCTYPE_NEAR_BNO;
+					}
+				}
+			}
+			xfs_perag_put(args->pag);
+		}
+		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
+			if (args->agno == sagno)
+				mp->m_agfrotor = (mp->m_agfrotor + 1) %
+					(mp->m_sb.sb_agcount * rotorstep);
+			else
+				mp->m_agfrotor = (args->agno * rotorstep + 1) %
+					(mp->m_sb.sb_agcount * rotorstep);
+		}
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+	if (args->agbno == NULLAGBLOCK)
+		args->fsbno = NULLFSBLOCK;
+	else {
+		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+		ASSERT(args->len >= args->minlen);
+		ASSERT(args->len <= args->maxlen);
+		ASSERT(args->agbno % args->alignment == 0);
+		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+			args->len);
+#endif
+	}
+	xfs_perag_put(args->pag);
+	return 0;
+error0:
+	xfs_perag_put(args->pag);
+	return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int				/* error */
+xfs_free_extent(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_fsblock_t	bno,	/* starting block number of extent */
+	xfs_extlen_t	len)	/* length of extent */
+{
+	xfs_alloc_arg_t	args;
+	int		error;
+
+	ASSERT(len != 0);
+	memset(&args, 0, sizeof(xfs_alloc_arg_t));
+	args.tp = tp;
+	args.mp = tp->t_mountp;
+
+	/*
+	 * validate that the block number is legal - the enables us to detect
+	 * and handle a silent filesystem corruption rather than crashing.
+	 */
+	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+	if (args.agno >= args.mp->m_sb.sb_agcount)
+		return -EFSCORRUPTED;
+
+	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+	if (args.agbno >= args.mp->m_sb.sb_agblocks)
+		return -EFSCORRUPTED;
+
+	args.pag = xfs_perag_get(args.mp, args.agno);
+	ASSERT(args.pag);
+
+	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+	if (error)
+		goto error0;
+
+	/* validate the extent size is legal now we have the agf locked */
+	if (args.agbno + len >
+			be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
+		error = -EFSCORRUPTED;
+		goto error0;
+	}
+
+	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+	if (!error)
+		xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
+error0:
+	xfs_perag_put(args.pag);
+	return error;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_alloc.h b/kernel/fs/xfs/libxfs/xfs_alloc.h
new file mode 100644
index 000000000..d1b4b6a5c
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_alloc.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ALLOC_H__
+#define	__XFS_ALLOC_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+
+extern struct workqueue_struct *xfs_alloc_wq;
+
+/*
+ * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
+ */
+#define XFS_ALLOCTYPE_ANY_AG	0x01	/* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG	0x02	/* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG	0x04	/* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG	0x08	/* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO	0x10	/* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO	0x20	/* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO	0x40	/* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
+
+#define XFS_ALLOC_TYPES \
+	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
+	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" }, \
+	{ XFS_ALLOCTYPE_START_AG,	"START_AG" }, \
+	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" }, \
+	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" }, \
+	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" }, \
+	{ XFS_ALLOCTYPE_THIS_BNO,	"THIS_BNO" }
+
+/*
+ * Flags for xfs_alloc_fix_freelist.
+ */
+#define	XFS_ALLOC_FLAG_TRYLOCK	0x00000001  /* use trylock for buffer locking */
+#define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
+ * btree requires 1 fsb, so we set the number of set-aside blocks
+ * to 4 + 4*agcount.
+ */
+#define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ *	- the AG superblock, AGF, AGI and AGFL
+ *	- the AGF (bno and cnt) and AGI btree root blocks
+ *	- 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp)	\
+	((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+
+
+/*
+ * Argument structure for xfs_alloc routines.
+ * This is turned into a structure to avoid having 20 arguments passed
+ * down several levels of the stack.
+ */
+typedef struct xfs_alloc_arg {
+	struct xfs_trans *tp;		/* transaction pointer */
+	struct xfs_mount *mp;		/* file system mount point */
+	struct xfs_buf	*agbp;		/* buffer for a.g. freelist header */
+	struct xfs_perag *pag;		/* per-ag struct for this agno */
+	xfs_fsblock_t	fsbno;		/* file system block number */
+	xfs_agnumber_t	agno;		/* allocation group number */
+	xfs_agblock_t	agbno;		/* allocation group-relative block # */
+	xfs_extlen_t	minlen;		/* minimum size of extent */
+	xfs_extlen_t	maxlen;		/* maximum size of extent */
+	xfs_extlen_t	mod;		/* mod value for extent size */
+	xfs_extlen_t	prod;		/* prod value for extent size */
+	xfs_extlen_t	minleft;	/* min blocks must be left after us */
+	xfs_extlen_t	total;		/* total blocks needed in xaction */
+	xfs_extlen_t	alignment;	/* align answer to multiple of this */
+	xfs_extlen_t	minalignslop;	/* slop for minlen+alignment calcs */
+	xfs_extlen_t	len;		/* output: actual size of extent */
+	xfs_alloctype_t	type;		/* allocation type XFS_ALLOCTYPE_... */
+	xfs_alloctype_t	otype;		/* original allocation type */
+	char		wasdel;		/* set if allocation was prev delayed */
+	char		wasfromfl;	/* set if allocation is from freelist */
+	char		isfl;		/* set if is freelist blocks - !acctg */
+	char		userdata;	/* set if this is user data */
+	xfs_fsblock_t	firstblock;	/* io first block allocated */
+} xfs_alloc_arg_t;
+
+/*
+ * Defines for userdata
+ */
+#define XFS_ALLOC_USERDATA		1	/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA	2	/* special case start of file */
+
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+		struct xfs_perag *pag);
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+	struct xfs_mount	*mp);	/* file system mount structure */
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int				/* error */
+xfs_alloc_get_freelist(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*agbp,	/* buffer containing the agf structure */
+	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
+	int		btreeblk); /* destination is a AGF btree */
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*bp,	/* buffer for a.g. freelist header */
+	int		fields);/* mask of fields to be logged (XFS_AGF_...) */
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int				/* error */
+xfs_alloc_pagf_init(
+	struct xfs_mount *mp,	/* file system mount structure */
+	struct xfs_trans *tp,	/* transaction pointer */
+	xfs_agnumber_t	agno,	/* allocation group number */
+	int		flags);	/* XFS_ALLOC_FLAGS_... */
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int				/* error */
+xfs_alloc_put_freelist(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*agbp,	/* buffer for a.g. freelist header */
+	struct xfs_buf	*agflbp,/* buffer for a.g. free block array */
+	xfs_agblock_t	bno,	/* block being freed */
+	int		btreeblk); /* owner was a AGF btree */
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error  */
+xfs_alloc_read_agf(
+	struct xfs_mount *mp,		/* mount point structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	int		flags,		/* XFS_ALLOC_FLAG_... */
+	struct xfs_buf	**bpp);		/* buffer for the ag freelist header */
+
+/*
+ * Allocate an extent (variable-size).
+ */
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t	*args);	/* allocation argument structure */
+
+/*
+ * Free an extent.
+ */
+int				/* error */
+xfs_free_extent(
+	struct xfs_trans *tp,	/* transaction pointer */
+	xfs_fsblock_t	bno,	/* starting block number of extent */
+	xfs_extlen_t	len);	/* length of extent */
+
+int					/* error */
+xfs_alloc_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat);	/* success/failure */
+
+int				/* error */
+xfs_alloc_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat);	/* success/failure */
+
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat);	/* output: success/failure */
+
+int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+
+#endif	/* __XFS_ALLOC_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_alloc_btree.c b/kernel/fs/xfs/libxfs/xfs_alloc_btree.c
new file mode 100644
index 000000000..59d521c09
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+
+
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agbp, cur->bc_private.a.agno,
+			cur->bc_btnum);
+}
+
+STATIC void
+xfs_allocbt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	int			btnum = cur->bc_btnum;
+	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
+
+	ASSERT(ptr->s != 0);
+
+	agf->agf_roots[btnum] = ptr->s;
+	be32_add_cpu(&agf->agf_levels[btnum], inc);
+	pag->pagf_levels[btnum] += inc;
+	xfs_perag_put(pag);
+
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	int			error;
+	xfs_agblock_t		bno;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	/* Allocate the new block from the freelist. If we can't, give up.  */
+	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+				       &bno, 1);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
+	}
+
+	if (bno == NULLAGBLOCK) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
+	xfs_trans_agbtree_delta(cur->bc_tp, 1);
+	new->s = cpu_to_be32(bno);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+}
+
+STATIC int
+xfs_allocbt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_agblock_t		bno;
+	int			error;
+
+	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+	if (error)
+		return error;
+
+	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+			      XFS_EXTENT_BUSY_SKIP_DISCARD);
+	xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+	xfs_trans_binval(cur->bc_tp, bp);
+	return 0;
+}
+
+/*
+ * Update the longest extent in the AGF
+ */
+STATIC void
+xfs_allocbt_update_lastrec(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	union xfs_btree_rec	*rec,
+	int			ptr,
+	int			reason)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag	*pag;
+	__be32			len;
+	int			numrecs;
+
+	ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+	switch (reason) {
+	case LASTREC_UPDATE:
+		/*
+		 * If this is the last leaf block and it's the last record,
+		 * then update the size of the longest extent in the AG.
+		 */
+		if (ptr != xfs_btree_get_numrecs(block))
+			return;
+		len = rec->alloc.ar_blockcount;
+		break;
+	case LASTREC_INSREC:
+		if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+		    be32_to_cpu(agf->agf_longest))
+			return;
+		len = rec->alloc.ar_blockcount;
+		break;
+	case LASTREC_DELREC:
+		numrecs = xfs_btree_get_numrecs(block);
+		if (ptr <= numrecs)
+			return;
+		ASSERT(ptr == numrecs + 1);
+
+		if (numrecs) {
+			xfs_alloc_rec_t *rrp;
+
+			rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+			len = rrp->ar_blockcount;
+		} else {
+			len = 0;
+		}
+
+		break;
+	default:
+		ASSERT(0);
+		return;
+	}
+
+	agf->agf_longest = len;
+	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag->pagf_longest = be32_to_cpu(len);
+	xfs_perag_put(pag);
+	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
+}
+
+STATIC int
+xfs_allocbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_alloc_mnr[level != 0];
+}
+
+STATIC int
+xfs_allocbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_alloc_mxr[level != 0];
+}
+
+STATIC void
+xfs_allocbt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(rec->alloc.ar_startblock != 0);
+
+	key->alloc.ar_startblock = rec->alloc.ar_startblock;
+	key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_key(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(key->alloc.ar_startblock != 0);
+
+	rec->alloc.ar_startblock = key->alloc.ar_startblock;
+	rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(cur->bc_rec.a.ar_startblock != 0);
+
+	rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+	rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+	ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
+
+	ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC __int64_t
+xfs_allocbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	xfs_alloc_rec_incore_t	*rec = &cur->bc_rec.a;
+	xfs_alloc_key_t		*kp = &key->alloc;
+	__int64_t		diff;
+
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+				rec->ar_startblock;
+	}
+
+	diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+	if (diff)
+		return diff;
+
+	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+static bool
+xfs_allocbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level;
+
+	/*
+	 * magic number and level verification
+	 *
+	 * During growfs operations, we can't verify the exact level or owner as
+	 * the perag is not fully initialised and hence not attached to the
+	 * buffer.  In this case, check against the maximum tree depth.
+	 *
+	 * Similarly, during log recovery we will have a perag structure
+	 * attached, but the agf information will not yet have been initialised
+	 * from the on disk AGF. Again, we can only check against maximum limits
+	 * in this case.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_ABTB_MAGIC):
+		if (pag && pag->pagf_init) {
+			if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
+				return false;
+		} else if (level >= mp->m_ag_maxlevels)
+			return false;
+		break;
+	case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_ABTC_MAGIC):
+		if (pag && pag->pagf_init) {
+			if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
+				return false;
+		} else if (level >= mp->m_ag_maxlevels)
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	/* numrecs verification */
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+		return false;
+
+	/* sibling pointer verification */
+	if (!block->bb_u.s.bb_leftsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+	if (!block->bb_u.s.bb_rightsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+
+	return true;
+}
+
+static void
+xfs_allocbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_allocbt_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
+	}
+}
+
+static void
+xfs_allocbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_allocbt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+	xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+	.verify_read = xfs_allocbt_read_verify,
+	.verify_write = xfs_allocbt_write_verify,
+};
+
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_allocbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return be32_to_cpu(k1->alloc.ar_startblock) <
+		       be32_to_cpu(k2->alloc.ar_startblock);
+	} else {
+		return be32_to_cpu(k1->alloc.ar_blockcount) <
+			be32_to_cpu(k2->alloc.ar_blockcount) ||
+			(k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+			 be32_to_cpu(k1->alloc.ar_startblock) <
+			 be32_to_cpu(k2->alloc.ar_startblock));
+	}
+}
+
+STATIC int
+xfs_allocbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return be32_to_cpu(r1->alloc.ar_startblock) +
+			be32_to_cpu(r1->alloc.ar_blockcount) <=
+			be32_to_cpu(r2->alloc.ar_startblock);
+	} else {
+		return be32_to_cpu(r1->alloc.ar_blockcount) <
+			be32_to_cpu(r2->alloc.ar_blockcount) ||
+			(r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+			 be32_to_cpu(r1->alloc.ar_startblock) <
+			 be32_to_cpu(r2->alloc.ar_startblock));
+	}
+}
+#endif	/* DEBUG */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+	.rec_len		= sizeof(xfs_alloc_rec_t),
+	.key_len		= sizeof(xfs_alloc_key_t),
+
+	.dup_cursor		= xfs_allocbt_dup_cursor,
+	.set_root		= xfs_allocbt_set_root,
+	.alloc_block		= xfs_allocbt_alloc_block,
+	.free_block		= xfs_allocbt_free_block,
+	.update_lastrec		= xfs_allocbt_update_lastrec,
+	.get_minrecs		= xfs_allocbt_get_minrecs,
+	.get_maxrecs		= xfs_allocbt_get_maxrecs,
+	.init_key_from_rec	= xfs_allocbt_init_key_from_rec,
+	.init_rec_from_key	= xfs_allocbt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
+	.key_diff		= xfs_allocbt_key_diff,
+	.buf_ops		= &xfs_allocbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_allocbt_keys_inorder,
+	.recs_inorder		= xfs_allocbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur *			/* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_buf		*agbp,		/* buffer for agf structure */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_btnum_t		btnum)		/* btree identifier */
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_btree_cur	*cur;
+
+	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_btnum = btnum;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	cur->bc_ops = &xfs_allocbt_ops;
+
+	if (btnum == XFS_BTNUM_CNT) {
+		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+		cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+	} else {
+		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+	}
+
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_private.a.agno = agno;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	return cur;
+}
+
+/*
+ * Calculate number of records in an alloc btree block.
+ */
+int
+xfs_allocbt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	int			leaf)
+{
+	blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+
+	if (leaf)
+		return blocklen / sizeof(xfs_alloc_rec_t);
+	return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_alloc_btree.h b/kernel/fs/xfs/libxfs/xfs_alloc_btree.h
new file mode 100644
index 000000000..45e189e7e
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ALLOC_BTREE_H__
+#define	__XFS_ALLOC_BTREE_H__
+
+/*
+ * Freespace on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+	((xfs_alloc_rec_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+	((xfs_alloc_key_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+	((xfs_alloc_ptr_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 (maxrecs) * sizeof(xfs_alloc_key_t) + \
+		 ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+		struct xfs_trans *, struct xfs_buf *,
+		xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+
+#endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_attr.c b/kernel/fs/xfs/libxfs/xfs_attr.c
new file mode 100644
index 000000000..0a472fbe0
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_attr.c
@@ -0,0 +1,1456 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_get(xfs_da_args_t *args);
+STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
+STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+
+
+STATIC int
+xfs_attr_args_init(
+	struct xfs_da_args	*args,
+	struct xfs_inode	*dp,
+	const unsigned char	*name,
+	int			flags)
+{
+
+	if (!name)
+		return -EINVAL;
+
+	memset(args, 0, sizeof(*args));
+	args->geo = dp->i_mount->m_attr_geo;
+	args->whichfork = XFS_ATTR_FORK;
+	args->dp = dp;
+	args->flags = flags;
+	args->name = name;
+	args->namelen = strlen((const char *)name);
+	if (args->namelen >= MAXNAMELEN)
+		return -EFAULT;		/* match IRIX behaviour */
+
+	args->hashval = xfs_da_hashname(args->name, args->namelen);
+	return 0;
+}
+
+int
+xfs_inode_hasattr(
+	struct xfs_inode	*ip)
+{
+	if (!XFS_IFORK_Q(ip) ||
+	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     ip->i_d.di_anextents == 0))
+		return 0;
+	return 1;
+}
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+int
+xfs_attr_get(
+	struct xfs_inode	*ip,
+	const unsigned char	*name,
+	unsigned char		*value,
+	int			*valuelenp,
+	int			flags)
+{
+	struct xfs_da_args	args;
+	uint			lock_mode;
+	int			error;
+
+	XFS_STATS_INC(xs_attr_get);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	if (!xfs_inode_hasattr(ip))
+		return -ENOATTR;
+
+	error = xfs_attr_args_init(&args, ip, name, flags);
+	if (error)
+		return error;
+
+	args.value = value;
+	args.valuelen = *valuelenp;
+
+	lock_mode = xfs_ilock_attr_map_shared(ip);
+	if (!xfs_inode_hasattr(ip))
+		error = -ENOATTR;
+	else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+		error = xfs_attr_shortform_getvalue(&args);
+	else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
+		error = xfs_attr_leaf_get(&args);
+	else
+		error = xfs_attr_node_get(&args);
+	xfs_iunlock(ip, lock_mode);
+
+	*valuelenp = args.valuelen;
+	return error == -EEXIST ? 0 : error;
+}
+
+/*
+ * Calculate how many blocks we need for the new attribute,
+ */
+STATIC int
+xfs_attr_calc_size(
+	struct xfs_da_args	*args,
+	int			*local)
+{
+	struct xfs_mount	*mp = args->dp->i_mount;
+	int			size;
+	int			nblks;
+
+	/*
+	 * Determine space new attribute will use, and if it would be
+	 * "local" or "remote" (note: local != inline).
+	 */
+	size = xfs_attr_leaf_newentsize(args, local);
+	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+	if (*local) {
+		if (size > (args->geo->blksize / 2)) {
+			/* Double split possible */
+			nblks *= 2;
+		}
+	} else {
+		/*
+		 * Out of line attribute, cannot double split, but
+		 * make room for the attribute value itself.
+		 */
+		uint	dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+		nblks += dblocks;
+		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+	}
+
+	return nblks;
+}
+
+int
+xfs_attr_set(
+	struct xfs_inode	*dp,
+	const unsigned char	*name,
+	unsigned char		*value,
+	int			valuelen,
+	int			flags)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_args	args;
+	struct xfs_bmap_free	flist;
+	struct xfs_trans_res	tres;
+	xfs_fsblock_t		firstblock;
+	int			rsvd = (flags & ATTR_ROOT) != 0;
+	int			error, err2, committed, local;
+
+	XFS_STATS_INC(xs_attr_set);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return -EIO;
+
+	error = xfs_attr_args_init(&args, dp, name, flags);
+	if (error)
+		return error;
+
+	args.value = value;
+	args.valuelen = valuelen;
+	args.firstblock = &firstblock;
+	args.flist = &flist;
+	args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+	args.total = xfs_attr_calc_size(&args, &local);
+
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
+
+	/*
+	 * If the inode doesn't have an attribute fork, add one.
+	 * (inode must not be locked when we call this routine)
+	 */
+	if (XFS_IFORK_Q(dp) == 0) {
+		int sf_size = sizeof(xfs_attr_sf_hdr_t) +
+			XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
+
+		error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+
+	/*
+	 * Root fork attributes can use reserved data blocks for this
+	 * operation if necessary
+	 */
+
+	if (rsvd)
+		args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+	tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+			 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+	tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+	error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
+	if (error) {
+		xfs_trans_cancel(args.trans, 0);
+		return error;
+	}
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
+				rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+				       XFS_QMOPT_RES_REGBLKS);
+	if (error) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+		return error;
+	}
+
+	xfs_trans_ijoin(args.trans, dp, 0);
+
+	/*
+	 * If the attribute list is non-existent or a shortform list,
+	 * upgrade it to a single-leaf-block attribute list.
+	 */
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+
+		/*
+		 * Build initial attribute list (if required).
+		 */
+		if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+			xfs_attr_shortform_create(&args);
+
+		/*
+		 * Try to add the attr to the attribute list in
+		 * the inode.
+		 */
+		error = xfs_attr_shortform_addname(&args);
+		if (error != -ENOSPC) {
+			/*
+			 * Commit the shortform mods, and we're done.
+			 * NOTE: this is also the error path (EEXIST, etc).
+			 */
+			ASSERT(args.trans != NULL);
+
+			/*
+			 * If this is a synchronous mount, make sure that
+			 * the transaction goes to disk before returning
+			 * to the user.
+			 */
+			if (mp->m_flags & XFS_MOUNT_WSYNC)
+				xfs_trans_set_sync(args.trans);
+
+			if (!error && (flags & ATTR_KERNOTIME) == 0) {
+				xfs_trans_ichgtime(args.trans, dp,
+							XFS_ICHGTIME_CHG);
+			}
+			err2 = xfs_trans_commit(args.trans,
+						 XFS_TRANS_RELEASE_LOG_RES);
+			xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+			return error ? error : err2;
+		}
+
+		/*
+		 * It won't fit in the shortform, transform to a leaf block.
+		 * GROT: another possible req'mt for a double-split btree op.
+		 */
+		xfs_bmap_init(args.flist, args.firstblock);
+		error = xfs_attr_shortform_to_leaf(&args);
+		if (!error) {
+			error = xfs_bmap_finish(&args.trans, args.flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args.trans = NULL;
+			xfs_bmap_cancel(&flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args.trans, dp, 0);
+
+		/*
+		 * Commit the leaf transformation.  We'll need another (linked)
+		 * transaction to add the new attribute to the leaf.
+		 */
+
+		error = xfs_trans_roll(&args.trans, dp);
+		if (error)
+			goto out;
+
+	}
+
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+		error = xfs_attr_leaf_addname(&args);
+	else
+		error = xfs_attr_node_addname(&args);
+	if (error)
+		goto out;
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(args.trans);
+
+	if ((flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	return error;
+
+out:
+	if (args.trans) {
+		xfs_trans_cancel(args.trans,
+			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	}
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
+int
+xfs_attr_remove(
+	struct xfs_inode	*dp,
+	const unsigned char	*name,
+	int			flags)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_args	args;
+	struct xfs_bmap_free	flist;
+	xfs_fsblock_t		firstblock;
+	int			error;
+
+	XFS_STATS_INC(xs_attr_remove);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return -EIO;
+
+	if (!xfs_inode_hasattr(dp))
+		return -ENOATTR;
+
+	error = xfs_attr_args_init(&args, dp, name, flags);
+	if (error)
+		return error;
+
+	args.firstblock = &firstblock;
+	args.flist = &flist;
+
+	/*
+	 * we have no control over the attribute names that userspace passes us
+	 * to remove, so we have to allow the name lookup prior to attribute
+	 * removal to fail.
+	 */
+	args.op_flags = XFS_DA_OP_OKNOENT;
+
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
+
+	/*
+	 * Root fork attributes can use reserved data blocks for this
+	 * operation if necessary
+	 */
+
+	if (flags & ATTR_ROOT)
+		args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+	error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+				  XFS_ATTRRM_SPACE_RES(mp), 0);
+	if (error) {
+		xfs_trans_cancel(args.trans, 0);
+		return error;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	/*
+	 * No need to make quota reservations here. We expect to release some
+	 * blocks not allocate in the common case.
+	 */
+	xfs_trans_ijoin(args.trans, dp, 0);
+
+	if (!xfs_inode_hasattr(dp)) {
+		error = -ENOATTR;
+	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+		error = xfs_attr_shortform_remove(&args);
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_removename(&args);
+	} else {
+		error = xfs_attr_node_removename(&args);
+	}
+
+	if (error)
+		goto out;
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(args.trans);
+
+	if ((flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	return error;
+
+out:
+	if (args.trans) {
+		xfs_trans_cancel(args.trans,
+			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	}
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+
+/*
+ * Add a name to the shortform attribute list structure
+ * This is the external routine.
+ */
+STATIC int
+xfs_attr_shortform_addname(xfs_da_args_t *args)
+{
+	int newsize, forkoff, retval;
+
+	trace_xfs_attr_sf_addname(args);
+
+	retval = xfs_attr_shortform_lookup(args);
+	if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+		return retval;
+	} else if (retval == -EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			return retval;
+		retval = xfs_attr_shortform_remove(args);
+		ASSERT(retval == 0);
+	}
+
+	if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
+	    args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+		return -ENOSPC;
+
+	newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
+	newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+
+	forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
+	if (!forkoff)
+		return -ENOSPC;
+
+	xfs_attr_shortform_add(args, forkoff);
+	return 0;
+}
+
+
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_addname(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	struct xfs_buf *bp;
+	int retval, error, committed, forkoff;
+
+	trace_xfs_attr_leaf_addname(args);
+
+	/*
+	 * Read the (only) block in the attribute list in.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	/*
+	 * Look up the given attribute in the leaf block.  Figure out if
+	 * the given flags produce an error or call for an atomic rename.
+	 */
+	retval = xfs_attr3_leaf_lookup_int(bp, args);
+	if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+		xfs_trans_brelse(args->trans, bp);
+		return retval;
+	} else if (retval == -EEXIST) {
+		if (args->flags & ATTR_CREATE) {	/* pure create op */
+			xfs_trans_brelse(args->trans, bp);
+			return retval;
+		}
+
+		trace_xfs_attr_leaf_replace(args);
+
+		/* save the attribute state for later removal*/
+		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
+	}
+
+	/*
+	 * Add the attribute to the leaf block, transitioning to a Btree
+	 * if required.
+	 */
+	retval = xfs_attr3_leaf_add(bp, args);
+	if (retval == -ENOSPC) {
+		/*
+		 * Promote the attribute list to the Btree format, then
+		 * Commit that transaction so that the node_addname() call
+		 * can manage its own transactions.
+		 */
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_attr3_leaf_to_node(args);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+
+		/*
+		 * Commit the current trans (including the inode) and start
+		 * a new one.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			return error;
+
+		/*
+		 * Fob the whole rest of the problem off on the Btree code.
+		 */
+		error = xfs_attr_node_addname(args);
+		return error;
+	}
+
+	/*
+	 * Commit the transaction that added the attr name so that
+	 * later routines can manage their own transactions.
+	 */
+	error = xfs_trans_roll(&args->trans, dp);
+	if (error)
+		return error;
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->op_flags & XFS_DA_OP_RENAME) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr3_leaf_flipflags(args);
+		if (error)
+			return error;
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return error;
+		}
+
+		/*
+		 * Read in the block containing the "old" attr, then
+		 * remove the "old" attr from that block (neat, huh!)
+		 */
+		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+					   -1, &bp);
+		if (error)
+			return error;
+
+		xfs_attr3_leaf_remove(bp, args);
+
+		/*
+		 * If the result is small enough, shrink it all into the inode.
+		 */
+		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+			xfs_bmap_init(args->flist, args->firstblock);
+			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				return error;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp, 0);
+		}
+
+		/*
+		 * Commit the remove and start the next trans in series.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr3_leaf_clearflag(args);
+	}
+	return error;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_removename(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	struct xfs_buf *bp;
+	int error, committed, forkoff;
+
+	trace_xfs_attr_leaf_removename(args);
+
+	/*
+	 * Remove the attribute.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_lookup_int(bp, args);
+	if (error == -ENOATTR) {
+		xfs_trans_brelse(args->trans, bp);
+		return error;
+	}
+
+	xfs_attr3_leaf_remove(bp, args);
+
+	/*
+	 * If the result is small enough, shrink it all into the inode.
+	 */
+	if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+		/* bp is gone due to xfs_da_shrink_inode */
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+	}
+	return 0;
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+	struct xfs_buf *bp;
+	int error;
+
+	trace_xfs_attr_leaf_get(args);
+
+	args->blkno = 0;
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_lookup_int(bp, args);
+	if (error != -EEXIST)  {
+		xfs_trans_brelse(args->trans, bp);
+		return error;
+	}
+	error = xfs_attr3_leaf_getvalue(bp, args);
+	xfs_trans_brelse(args->trans, bp);
+	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
+		error = xfs_attr_rmtval_get(args);
+	}
+	return error;
+}
+
+/*========================================================================
+ * External routines when attribute list size > geo->blksize
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ */
+STATIC int
+xfs_attr_node_addname(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	xfs_mount_t *mp;
+	int committed, retval, error;
+
+	trace_xfs_attr_node_addname(args);
+
+	/*
+	 * Fill in bucket of arguments/results/context to carry around.
+	 */
+	dp = args->dp;
+	mp = dp->i_mount;
+restart:
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = mp;
+
+	/*
+	 * Search to see if name already exists, and get back a pointer
+	 * to where it should go.
+	 */
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error)
+		goto out;
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+		goto out;
+	} else if (retval == -EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			goto out;
+
+		trace_xfs_attr_node_replace(args);
+
+		/* save the attribute state for later removal*/
+		args->op_flags |= XFS_DA_OP_RENAME;	/* atomic rename op */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
+	}
+
+	retval = xfs_attr3_leaf_add(blk->bp, state->args);
+	if (retval == -ENOSPC) {
+		if (state->path.active == 1) {
+			/*
+			 * Its really a single leaf node, but it had
+			 * out-of-line values so it looked like it *might*
+			 * have been a b-tree.
+			 */
+			xfs_da_state_free(state);
+			state = NULL;
+			xfs_bmap_init(args->flist, args->firstblock);
+			error = xfs_attr3_leaf_to_node(args);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp, 0);
+
+			/*
+			 * Commit the node conversion and start the next
+			 * trans in the chain.
+			 */
+			error = xfs_trans_roll(&args->trans, dp);
+			if (error)
+				goto out;
+
+			goto restart;
+		}
+
+		/*
+		 * Split as many Btree elements as required.
+		 * This code tracks the new and old attr's location
+		 * in the index/blkno/rmtblkno/rmtblkcnt fields and
+		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+		 */
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_da3_split(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+	} else {
+		/*
+		 * Addition succeeded, update Btree hashvals.
+		 */
+		xfs_da3_fixhashpath(state, &state->path);
+	}
+
+	/*
+	 * Kill the state structure, we're done with it and need to
+	 * allow the buffers to come back later.
+	 */
+	xfs_da_state_free(state);
+	state = NULL;
+
+	/*
+	 * Commit the leaf addition or btree split and start the next
+	 * trans in the chain.
+	 */
+	error = xfs_trans_roll(&args->trans, dp);
+	if (error)
+		goto out;
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->op_flags & XFS_DA_OP_RENAME) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr3_leaf_flipflags(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return error;
+		}
+
+		/*
+		 * Re-find the "old" attribute entry after any split ops.
+		 * The INCOMPLETE flag means that we will find the "old"
+		 * attr, not the "new" one.
+		 */
+		args->flags |= XFS_ATTR_INCOMPLETE;
+		state = xfs_da_state_alloc();
+		state->args = args;
+		state->mp = mp;
+		state->inleaf = 0;
+		error = xfs_da3_node_lookup_int(state, &retval);
+		if (error)
+			goto out;
+
+		/*
+		 * Remove the name and update the hashvals in the tree.
+		 */
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+		error = xfs_attr3_leaf_remove(blk->bp, args);
+		xfs_da3_fixhashpath(state, &state->path);
+
+		/*
+		 * Check to see if the tree needs to be collapsed.
+		 */
+		if (retval && (state->path.active > 1)) {
+			xfs_bmap_init(args->flist, args->firstblock);
+			error = xfs_da3_join(state);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp, 0);
+		}
+
+		/*
+		 * Commit and start the next trans in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			goto out;
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr3_leaf_clearflag(args);
+		if (error)
+			goto out;
+	}
+	retval = error = 0;
+
+out:
+	if (state)
+		xfs_da_state_free(state);
+	if (error)
+		return error;
+	return retval;
+}
+
+/*
+ * Remove a name from a B-tree attribute list.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_attr_node_removename(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	struct xfs_buf *bp;
+	int retval, error, committed, forkoff;
+
+	trace_xfs_attr_node_removename(args);
+
+	/*
+	 * Tie a string around our finger to remind us where we are.
+	 */
+	dp = args->dp;
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = dp->i_mount;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error || (retval != -EEXIST)) {
+		if (error == 0)
+			error = retval;
+		goto out;
+	}
+
+	/*
+	 * If there is an out-of-line value, de-allocate the blocks.
+	 * This is done before we remove the attribute so that we don't
+	 * overflow the maximum size of a transaction and/or hit a deadlock.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->bp != NULL);
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if (args->rmtblkno > 0) {
+		/*
+		 * Fill in disk block numbers in the state structure
+		 * so that we can get the buffers back after we commit
+		 * several transactions in the following calls.
+		 */
+		error = xfs_attr_fillstate(state);
+		if (error)
+			goto out;
+
+		/*
+		 * Mark the attribute as INCOMPLETE, then bunmapi() the
+		 * remote value.
+		 */
+		error = xfs_attr3_leaf_setflag(args);
+		if (error)
+			goto out;
+		error = xfs_attr_rmtval_remove(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Refill the state structure with buffers, the prior calls
+		 * released our buffers.
+		 */
+		error = xfs_attr_refillstate(state);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	retval = xfs_attr3_leaf_remove(blk->bp, args);
+	xfs_da3_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 */
+	if (retval && (state->path.active > 1)) {
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_da3_join(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+
+		/*
+		 * Commit the Btree join operation and start a new trans.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * If the result is small enough, push it all into the inode.
+	 */
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		/*
+		 * Have to get rid of the copy of this dabuf in the state.
+		 */
+		ASSERT(state->path.active == 1);
+		ASSERT(state->path.blk[0].bp);
+		state->path.blk[0].bp = NULL;
+
+		error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
+		if (error)
+			goto out;
+
+		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+			xfs_bmap_init(args->flist, args->firstblock);
+			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp, 0);
+		} else
+			xfs_trans_brelse(args->trans, bp);
+	}
+	error = 0;
+
+out:
+	xfs_da_state_free(state);
+	return error;
+}
+
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+STATIC int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level;
+
+	trace_xfs_attr_fillstate(state->args);
+
+	/*
+	 * Roll down the "path" in the state structure, storing the on-disk
+	 * block number for those buffers in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, storing the on-disk
+	 * block number for those buffers in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+STATIC int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level, error;
+
+	trace_xfs_attr_refillstate(state->args);
+
+	/*
+	 * Roll down the "path" in the state structure, storing the on-disk
+	 * block number for those buffers in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da3_node_read(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return error;
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, storing the on-disk
+	 * block number for those buffers in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da3_node_read(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return error;
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Look up a filename in a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, ie: both true Btree attr lists and for single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ */
+STATIC int
+xfs_attr_node_get(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int error, retval;
+	int i;
+
+	trace_xfs_attr_node_get(args);
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	} else if (retval == -EEXIST) {
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->bp != NULL);
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+		/*
+		 * Get the value, local or "remote"
+		 */
+		retval = xfs_attr3_leaf_getvalue(blk->bp, args);
+		if (!retval && (args->rmtblkno > 0)
+		    && !(args->flags & ATTR_KERNOVAL)) {
+			retval = xfs_attr_rmtval_get(args);
+		}
+	}
+
+	/*
+	 * If not in a transaction, we have to release all the buffers.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return retval;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_attr_leaf.c b/kernel/fs/xfs/libxfs/xfs_attr_leaf.c
new file mode 100644
index 000000000..e9d401ce9
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -0,0 +1,2773 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dir2.h"
+
+
+/*
+ * xfs_attr_leaf.c
+ *
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+				 xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+				   struct xfs_attr3_icleaf_hdr *ichdr,
+				   struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+				   struct xfs_attr3_icleaf_hdr *ichdr,
+				   struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
+						   xfs_da_state_blk_t *blk1,
+						   xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+			xfs_da_state_blk_t *leaf_blk_1,
+			struct xfs_attr3_icleaf_hdr *ichdr1,
+			xfs_da_state_blk_t *leaf_blk_2,
+			struct xfs_attr3_icleaf_hdr *ichdr2,
+			int *number_entries_in_blk1,
+			int *number_usedbytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+			struct xfs_attr_leafblock *src_leaf,
+			struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+			struct xfs_attr_leafblock *dst_leaf,
+			struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+			int move_count);
+STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+
+/*
+ * attr3 block 'firstused' conversion helpers.
+ *
+ * firstused refers to the offset of the first used byte of the nameval region
+ * of an attr leaf block. The region starts at the tail of the block and expands
+ * backwards towards the middle. As such, firstused is initialized to the block
+ * size for an empty leaf block and is reduced from there.
+ *
+ * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
+ * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
+ * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
+ * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
+ * the attr block size. The following helpers manage the conversion between the
+ * in-core and on-disk formats.
+ */
+
+static void
+xfs_attr3_leaf_firstused_from_disk(
+	struct xfs_da_geometry		*geo,
+	struct xfs_attr3_icleaf_hdr	*to,
+	struct xfs_attr_leafblock	*from)
+{
+	struct xfs_attr3_leaf_hdr	*hdr3;
+
+	if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+		hdr3 = (struct xfs_attr3_leaf_hdr *) from;
+		to->firstused = be16_to_cpu(hdr3->firstused);
+	} else {
+		to->firstused = be16_to_cpu(from->hdr.firstused);
+	}
+
+	/*
+	 * Convert from the magic fsb size value to actual blocksize. This
+	 * should only occur for empty blocks when the block size overflows
+	 * 16-bits.
+	 */
+	if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
+		ASSERT(!to->count && !to->usedbytes);
+		ASSERT(geo->blksize > USHRT_MAX);
+		to->firstused = geo->blksize;
+	}
+}
+
+static void
+xfs_attr3_leaf_firstused_to_disk(
+	struct xfs_da_geometry		*geo,
+	struct xfs_attr_leafblock	*to,
+	struct xfs_attr3_icleaf_hdr	*from)
+{
+	struct xfs_attr3_leaf_hdr	*hdr3;
+	uint32_t			firstused;
+
+	/* magic value should only be seen on disk */
+	ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
+
+	/*
+	 * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
+	 * value. This only overflows at the max supported value of 64k. Use the
+	 * magic on-disk value to represent block size in this case.
+	 */
+	firstused = from->firstused;
+	if (firstused > USHRT_MAX) {
+		ASSERT(from->firstused == geo->blksize);
+		firstused = XFS_ATTR3_LEAF_NULLOFF;
+	}
+
+	if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+		hdr3 = (struct xfs_attr3_leaf_hdr *) to;
+		hdr3->firstused = cpu_to_be16(firstused);
+	} else {
+		to->hdr.firstused = cpu_to_be16(firstused);
+	}
+}
+
+void
+xfs_attr3_leaf_hdr_from_disk(
+	struct xfs_da_geometry		*geo,
+	struct xfs_attr3_icleaf_hdr	*to,
+	struct xfs_attr_leafblock	*from)
+{
+	int	i;
+
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+	       from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+	if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+		to->back = be32_to_cpu(hdr3->info.hdr.back);
+		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+		to->count = be16_to_cpu(hdr3->count);
+		to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+		xfs_attr3_leaf_firstused_from_disk(geo, to, from);
+		to->holes = hdr3->holes;
+
+		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+			to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+			to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+		}
+		return;
+	}
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.count);
+	to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+	xfs_attr3_leaf_firstused_from_disk(geo, to, from);
+	to->holes = from->hdr.holes;
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+		to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+	}
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+	struct xfs_da_geometry		*geo,
+	struct xfs_attr_leafblock	*to,
+	struct xfs_attr3_icleaf_hdr	*from)
+{
+	int				i;
+
+	ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+	       from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+	if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+		hdr3->info.hdr.back = cpu_to_be32(from->back);
+		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+		hdr3->count = cpu_to_be16(from->count);
+		hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+		xfs_attr3_leaf_firstused_to_disk(geo, to, from);
+		hdr3->holes = from->holes;
+		hdr3->pad1 = 0;
+
+		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+			hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+			hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+		}
+		return;
+	}
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.count = cpu_to_be16(from->count);
+	to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+	xfs_attr3_leaf_firstused_to_disk(geo, to, from);
+	to->hdr.holes = from->holes;
+	to->hdr.pad1 = 0;
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+		to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+	}
+}
+
+static bool
+xfs_attr3_leaf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_attr_leafblock *leaf = bp->b_addr;
+	struct xfs_attr3_icleaf_hdr ichdr;
+
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
+			return false;
+
+		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
+			return false;
+	}
+	if (ichdr.count == 0)
+		return false;
+
+	/* XXX: need to range check rest of attr header values */
+	/* XXX: hash order check? */
+
+	return true;
+}
+
+static void
+xfs_attr3_leaf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_attr3_leaf_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_attr3_leaf_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_attr3_leaf_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+	.verify_read = xfs_attr3_leaf_read_verify,
+	.verify_write = xfs_attr3_leaf_write_verify,
+};
+
+int
+xfs_attr3_leaf_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+				XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+	return err;
+}
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+/*
+ * If namespace bits don't match return 0.
+ * If all match then return 1.
+ */
+STATIC int
+xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+{
+	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+}
+
+
+/*========================================================================
+ * External routines when attribute fork size < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Query whether the requested number of additional bytes of extended
+ * attribute space will be able to fit inline.
+ *
+ * Returns zero if not, else the di_forkoff fork offset to be used in the
+ * literal area for attribute data once the new bytes have been added.
+ *
+ * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
+ * special case for dev/uuid inodes, they have fixed size data forks.
+ */
+int
+xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
+{
+	int offset;
+	int minforkoff;	/* lower limit on valid forkoff locations */
+	int maxforkoff;	/* upper limit on valid forkoff locations */
+	int dsize;
+	xfs_mount_t *mp = dp->i_mount;
+
+	/* rounded down */
+	offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+
+	switch (dp->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+		return (offset >= minforkoff) ? minforkoff : 0;
+	case XFS_DINODE_FMT_UUID:
+		minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		return (offset >= minforkoff) ? minforkoff : 0;
+	}
+
+	/*
+	 * If the requested numbers of bytes is smaller or equal to the
+	 * current attribute fork size we can always proceed.
+	 *
+	 * Note that if_bytes in the data fork might actually be larger than
+	 * the current data fork size is due to delalloc extents. In that
+	 * case either the extent count will go down when they are converted
+	 * to real extents, or the delalloc conversion will take care of the
+	 * literal area rebalancing.
+	 */
+	if (bytes <= XFS_IFORK_ASIZE(dp))
+		return dp->i_d.di_forkoff;
+
+	/*
+	 * For attr2 we can try to move the forkoff if there is space in the
+	 * literal area, but for the old format we are done if there is no
+	 * space in the fixed attribute fork.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+		return 0;
+
+	dsize = dp->i_df.if_bytes;
+
+	switch (dp->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/*
+		 * If there is no attr fork and the data fork is extents, 
+		 * determine if creating the default attr fork will result
+		 * in the extents form migrating to btree. If so, the
+		 * minimum offset only needs to be the space required for
+		 * the btree root.
+		 */
+		if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+		    xfs_default_attroffset(dp))
+			dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		/*
+		 * If we have a data btree then keep forkoff if we have one,
+		 * otherwise we are adding a new attr, so then we set
+		 * minforkoff to where the btree root can finish so we have
+		 * plenty of room for attrs
+		 */
+		if (dp->i_d.di_forkoff) {
+			if (offset < dp->i_d.di_forkoff)
+				return 0;
+			return dp->i_d.di_forkoff;
+		}
+		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+		break;
+	}
+
+	/*
+	 * A data fork btree root must have space for at least
+	 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+	 */
+	minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+	minforkoff = roundup(minforkoff, 8) >> 3;
+
+	/* attr fork btree root can have at least this many key/ptr pairs */
+	maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+			XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	maxforkoff = maxforkoff >> 3;	/* rounded down */
+
+	if (offset >= maxforkoff)
+		return maxforkoff;
+	if (offset >= minforkoff)
+		return offset;
+	return 0;
+}
+
+/*
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2)
+ */
+STATIC void
+xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
+{
+	if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
+	    !(xfs_sb_version_hasattr2(&mp->m_sb))) {
+		spin_lock(&mp->m_sb_lock);
+		if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
+			xfs_sb_version_addattr2(&mp->m_sb);
+			spin_unlock(&mp->m_sb_lock);
+			xfs_log_sb(tp);
+		} else
+			spin_unlock(&mp->m_sb_lock);
+	}
+}
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ */
+void
+xfs_attr_shortform_create(xfs_da_args_t *args)
+{
+	xfs_attr_sf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_create(args);
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	ifp = dp->i_afp;
+	ASSERT(ifp != NULL);
+	ASSERT(ifp->if_bytes == 0);
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
+		ifp->if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+		ifp->if_flags |= XFS_IFINLINE;
+	} else {
+		ASSERT(ifp->if_flags & XFS_IFINLINE);
+	}
+	xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+	hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
+	hdr->count = 0;
+	hdr->totsize = cpu_to_be16(sizeof(*hdr));
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+void
+xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i, offset, size;
+	xfs_mount_t *mp;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_add(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	dp->i_d.di_forkoff = forkoff;
+
+	ifp = dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+#ifdef DEBUG
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+		ASSERT(0);
+#endif
+	}
+
+	offset = (char *)sfe - (char *)sf;
+	size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+	xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+
+	sfe->namelen = args->namelen;
+	sfe->valuelen = args->valuelen;
+	sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+	memcpy(sfe->nameval, args->name, args->namelen);
+	memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
+	sf->hdr.count++;
+	be16_add_cpu(&sf->hdr.totsize, size);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+	xfs_sbversion_add_attr2(mp, args->trans);
+}
+
+/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+void
+xfs_attr_fork_remove(
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp)
+{
+	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+	ip->i_d.di_forkoff = 0;
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+	ASSERT(ip->i_d.di_anextents == 0);
+	ASSERT(ip->i_afp == NULL);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Remove an attribute from the shortform attribute list structure.
+ */
+int
+xfs_attr_shortform_remove(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int base, size=0, end, totsize, i;
+	xfs_mount_t *mp;
+	xfs_inode_t *dp;
+
+	trace_xfs_attr_sf_remove(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	base = sizeof(xfs_attr_sf_hdr_t);
+	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	end = sf->hdr.count;
+	for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+					base += size, i++) {
+		size = XFS_ATTR_SF_ENTSIZE(sfe);
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
+			continue;
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+		break;
+	}
+	if (i == end)
+		return -ENOATTR;
+
+	/*
+	 * Fix up the attribute fork data, covering the hole
+	 */
+	end = base + size;
+	totsize = be16_to_cpu(sf->hdr.totsize);
+	if (end != totsize)
+		memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
+	sf->hdr.count--;
+	be16_add_cpu(&sf->hdr.totsize, -size);
+
+	/*
+	 * Fix up the start offset of the attribute fork
+	 */
+	totsize -= size;
+	if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
+	    (mp->m_flags & XFS_MOUNT_ATTR2) &&
+	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+	    !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+		xfs_attr_fork_remove(dp, args->trans);
+	} else {
+		xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+		dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+		ASSERT(dp->i_d.di_forkoff);
+		ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+				(args->op_flags & XFS_DA_OP_ADDNAME) ||
+				!(mp->m_flags & XFS_MOUNT_ATTR2) ||
+				dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+		xfs_trans_log_inode(args->trans, dp,
+					XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	}
+
+	xfs_sbversion_add_attr2(mp, args->trans);
+
+	return 0;
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_lookup(args);
+
+	ifp = args->dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count;
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+		return -EEXIST;
+	}
+	return -ENOATTR;
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i;
+
+	ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count;
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = sfe->valuelen;
+			return -EEXIST;
+		}
+		if (args->valuelen < sfe->valuelen) {
+			args->valuelen = sfe->valuelen;
+			return -ERANGE;
+		}
+		args->valuelen = sfe->valuelen;
+		memcpy(args->value, &sfe->nameval[args->namelen],
+						    args->valuelen);
+		return -EEXIST;
+	}
+	return -ENOATTR;
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ */
+int
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	xfs_da_args_t nargs;
+	char *tmpbuffer;
+	int error, i, size;
+	xfs_dablk_t blkno;
+	struct xfs_buf *bp;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_to_leaf(args);
+
+	dp = args->dp;
+	ifp = dp->i_afp;
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	size = be16_to_cpu(sf->hdr.totsize);
+	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	memcpy(tmpbuffer, ifp->if_u1.if_data, size);
+	sf = (xfs_attr_shortform_t *)tmpbuffer;
+
+	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+	xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+
+	bp = NULL;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		/*
+		 * If we hit an IO error middle of the transaction inside
+		 * grow_inode(), we may have inconsistent data. Bail out.
+		 */
+		if (error == -EIO)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
+		goto out;
+	}
+
+	ASSERT(blkno == 0);
+	error = xfs_attr3_leaf_create(args, blkno, &bp);
+	if (error) {
+		error = xfs_da_shrink_inode(args, 0, bp);
+		bp = NULL;
+		if (error)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
+		goto out;
+	}
+
+	memset((char *)&nargs, 0, sizeof(nargs));
+	nargs.dp = dp;
+	nargs.geo = args->geo;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count; i++) {
+		nargs.name = sfe->nameval;
+		nargs.namelen = sfe->namelen;
+		nargs.value = &sfe->nameval[nargs.namelen];
+		nargs.valuelen = sfe->valuelen;
+		nargs.hashval = xfs_da_hashname(sfe->nameval,
+						sfe->namelen);
+		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+		error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
+		ASSERT(error == -ENOATTR);
+		error = xfs_attr3_leaf_add(bp, &nargs);
+		ASSERT(error != -ENOSPC);
+		if (error)
+			goto out;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+	}
+	error = 0;
+
+out:
+	kmem_free(tmpbuffer);
+	return error;
+}
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ */
+int
+xfs_attr_shortform_allfit(
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	struct xfs_attr3_icleaf_hdr leafhdr;
+	int			bytes;
+	int			i;
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+	entry = xfs_attr3_leaf_entryp(leaf);
+
+	bytes = sizeof(struct xfs_attr_sf_hdr);
+	for (i = 0; i < leafhdr.count; entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* don't copy partial entries */
+		if (!(entry->flags & XFS_ATTR_LOCAL))
+			return 0;
+		name_loc = xfs_attr3_leaf_name_local(leaf, i);
+		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return 0;
+		if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return 0;
+		bytes += sizeof(struct xfs_attr_sf_entry) - 1
+				+ name_loc->namelen
+				+ be16_to_cpu(name_loc->valuelen);
+	}
+	if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
+	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+	    (bytes == sizeof(struct xfs_attr_sf_hdr)))
+		return -1;
+	return xfs_attr_shortform_bytesfit(dp, bytes);
+}
+
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ */
+int
+xfs_attr3_leaf_to_shortform(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args,
+	int			forkoff)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_da_args	nargs;
+	struct xfs_inode	*dp = args->dp;
+	char			*tmpbuffer;
+	int			error;
+	int			i;
+
+	trace_xfs_attr_leaf_to_sf(args);
+
+	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	if (!tmpbuffer)
+		return -ENOMEM;
+
+	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+
+	leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	entry = xfs_attr3_leaf_entryp(leaf);
+
+	/* XXX (dgc): buffer is about to be marked stale - why zero it? */
+	memset(bp->b_addr, 0, args->geo->blksize);
+
+	/*
+	 * Clean out the prior contents of the attribute list.
+	 */
+	error = xfs_da_shrink_inode(args, 0, bp);
+	if (error)
+		goto out;
+
+	if (forkoff == -1) {
+		ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
+		ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
+		xfs_attr_fork_remove(dp, args->trans);
+		goto out;
+	}
+
+	xfs_attr_shortform_create(args);
+
+	/*
+	 * Copy the attributes
+	 */
+	memset((char *)&nargs, 0, sizeof(nargs));
+	nargs.geo = args->geo;
+	nargs.dp = dp;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+	for (i = 0; i < ichdr.count; entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;	/* don't copy partial entries */
+		if (!entry->nameidx)
+			continue;
+		ASSERT(entry->flags & XFS_ATTR_LOCAL);
+		name_loc = xfs_attr3_leaf_name_local(leaf, i);
+		nargs.name = name_loc->nameval;
+		nargs.namelen = name_loc->namelen;
+		nargs.value = &name_loc->nameval[nargs.namelen];
+		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+		nargs.hashval = be32_to_cpu(entry->hashval);
+		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+		xfs_attr_shortform_add(&nargs, forkoff);
+	}
+	error = 0;
+
+out:
+	kmem_free(tmpbuffer);
+	return error;
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_attr3_leaf_to_node(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr icleafhdr;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr icnodehdr;
+	struct xfs_da_intnode	*node;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp1 = NULL;
+	struct xfs_buf		*bp2 = NULL;
+	xfs_dablk_t		blkno;
+	int			error;
+
+	trace_xfs_attr_leaf_to_node(args);
+
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		goto out;
+	error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
+	if (error)
+		goto out;
+
+	error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
+	if (error)
+		goto out;
+
+	/* copy leaf to new buffer, update identifiers */
+	xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
+	bp2->b_ops = bp1->b_ops;
+	memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+		hdr3->blkno = cpu_to_be64(bp2->b_bn);
+	}
+	xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	node = bp1->b_addr;
+	dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
+	btree = dp->d_ops->node_tree_p(node);
+
+	leaf = bp2->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+
+	/* both on-disk, don't endian-flip twice */
+	btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+	btree[0].before = cpu_to_be32(blkno);
+	icnodehdr.count = 1;
+	dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+	xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
+	error = 0;
+out:
+	return error;
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+STATIC int
+xfs_attr3_leaf_create(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		blkno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+
+	trace_xfs_attr_leaf_create(args);
+
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+					    XFS_ATTR_FORK);
+	if (error)
+		return error;
+	bp->b_ops = &xfs_attr3_leaf_buf_ops;
+	xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
+	leaf = bp->b_addr;
+	memset(leaf, 0, args->geo->blksize);
+
+	memset(&ichdr, 0, sizeof(ichdr));
+	ichdr.firstused = args->geo->blksize;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+		ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+		ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+	} else {
+		ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+		ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+	}
+	ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_attr3_leaf_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk)
+{
+	xfs_dablk_t blkno;
+	int error;
+
+	trace_xfs_attr_leaf_split(state->args);
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+	error = xfs_da_grow_inode(state->args, &blkno);
+	if (error)
+		return error;
+	error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
+	if (error)
+		return error;
+	newblk->blkno = blkno;
+	newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+	/*
+	 * Rebalance the entries across the two leaves.
+	 * NOTE: rebalance() currently depends on the 2nd block being empty.
+	 */
+	xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da3_blk_link(state, oldblk, newblk);
+	if (error)
+		return error;
+
+	/*
+	 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+	 * "new" attrs info.  Will need the "old" info to remove it later.
+	 *
+	 * Insert the "new" entry in the correct block.
+	 */
+	if (state->inleaf) {
+		trace_xfs_attr_leaf_add_old(state->args);
+		error = xfs_attr3_leaf_add(oldblk->bp, state->args);
+	} else {
+		trace_xfs_attr_leaf_add_new(state->args);
+		error = xfs_attr3_leaf_add(newblk->bp, state->args);
+	}
+
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+	return error;
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ */
+int
+xfs_attr3_leaf_add(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	int			tablesize;
+	int			entsize;
+	int			sum;
+	int			tmp;
+	int			i;
+
+	trace_xfs_attr_leaf_add(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	ASSERT(args->index >= 0 && args->index <= ichdr.count);
+	entsize = xfs_attr_leaf_newentsize(args, NULL);
+
+	/*
+	 * Search through freemap for first-fit on new name length.
+	 * (may need to figure in size of entry struct too)
+	 */
+	tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+		if (tablesize > ichdr.firstused) {
+			sum += ichdr.freemap[i].size;
+			continue;
+		}
+		if (!ichdr.freemap[i].size)
+			continue;	/* no space in this map */
+		tmp = entsize;
+		if (ichdr.freemap[i].base < ichdr.firstused)
+			tmp += sizeof(xfs_attr_leaf_entry_t);
+		if (ichdr.freemap[i].size >= tmp) {
+			tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+			goto out_log_hdr;
+		}
+		sum += ichdr.freemap[i].size;
+	}
+
+	/*
+	 * If there are no holes in the address space of the block,
+	 * and we don't have enough freespace, then compaction will do us
+	 * no good and we should just give up.
+	 */
+	if (!ichdr.holes && sum < entsize)
+		return -ENOSPC;
+
+	/*
+	 * Compact the entries to coalesce free space.
+	 * This may change the hdr->count via dropping INCOMPLETE entries.
+	 */
+	xfs_attr3_leaf_compact(args, &ichdr, bp);
+
+	/*
+	 * After compaction, the block is guaranteed to have only one
+	 * free region, in freemap[0].  If it is not big enough, give up.
+	 */
+	if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+		tmp = -ENOSPC;
+		goto out_log_hdr;
+	}
+
+	tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
+
+out_log_hdr:
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+				xfs_attr3_leaf_hdr_size(leaf)));
+	return tmp;
+}
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr3_leaf_add_work(
+	struct xfs_buf		*bp,
+	struct xfs_attr3_icleaf_hdr *ichdr,
+	struct xfs_da_args	*args,
+	int			mapindex)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_mount	*mp;
+	int			tmp;
+	int			i;
+
+	trace_xfs_attr_leaf_add_work(args);
+
+	leaf = bp->b_addr;
+	ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+	ASSERT(args->index >= 0 && args->index <= ichdr->count);
+
+	/*
+	 * Force open some space in the entry array and fill it in.
+	 */
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+	if (args->index < ichdr->count) {
+		tmp  = ichdr->count - args->index;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		memmove(entry + 1, entry, tmp);
+		xfs_trans_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+	}
+	ichdr->count++;
+
+	/*
+	 * Allocate space for the new string (at the end of the run).
+	 */
+	mp = args->trans->t_mountp;
+	ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
+	ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+	ASSERT(ichdr->freemap[mapindex].size >=
+		xfs_attr_leaf_newentsize(args, NULL));
+	ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
+	ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+	ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
+
+	entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+				     ichdr->freemap[mapindex].size);
+	entry->hashval = cpu_to_be32(args->hashval);
+	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
+	entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+	if (args->op_flags & XFS_DA_OP_RENAME) {
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		if ((args->blkno2 == args->blkno) &&
+		    (args->index2 <= args->index)) {
+			args->index2++;
+		}
+	}
+	xfs_trans_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	ASSERT((args->index == 0) ||
+	       (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
+	ASSERT((args->index == ichdr->count - 1) ||
+	       (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
+
+	/*
+	 * For "remote" attribute values, simply note that we need to
+	 * allocate space for the "remote" value.  We can't actually
+	 * allocate the extents in this transaction, and we can't decide
+	 * which blocks they should be as we might allocate more blocks
+	 * as part of this transaction (a split operation for example).
+	 */
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+		name_loc->namelen = args->namelen;
+		name_loc->valuelen = cpu_to_be16(args->valuelen);
+		memcpy((char *)name_loc->nameval, args->name, args->namelen);
+		memcpy((char *)&name_loc->nameval[args->namelen], args->value,
+				   be16_to_cpu(name_loc->valuelen));
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		name_rmt->namelen = args->namelen;
+		memcpy((char *)name_rmt->name, args->name, args->namelen);
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		/* just in case */
+		name_rmt->valuelen = 0;
+		name_rmt->valueblk = 0;
+		args->rmtblkno = 1;
+		args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+		args->rmtvaluelen = args->valuelen;
+	}
+	xfs_trans_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+				   xfs_attr_leaf_entsize(leaf, args->index)));
+
+	/*
+	 * Update the control info for this leaf node
+	 */
+	if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+		ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+	ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf));
+	tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		if (ichdr->freemap[i].base == tmp) {
+			ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+			ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
+		}
+	}
+	ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+	return 0;
+}
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ */
+STATIC void
+xfs_attr3_leaf_compact(
+	struct xfs_da_args	*args,
+	struct xfs_attr3_icleaf_hdr *ichdr_dst,
+	struct xfs_buf		*bp)
+{
+	struct xfs_attr_leafblock *leaf_src;
+	struct xfs_attr_leafblock *leaf_dst;
+	struct xfs_attr3_icleaf_hdr ichdr_src;
+	struct xfs_trans	*trans = args->trans;
+	char			*tmpbuffer;
+
+	trace_xfs_attr_leaf_compact(args);
+
+	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+	memset(bp->b_addr, 0, args->geo->blksize);
+	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+	leaf_dst = bp->b_addr;
+
+	/*
+	 * Copy the on-disk header back into the destination buffer to ensure
+	 * all the information in the header that is not part of the incore
+	 * header structure is preserved.
+	 */
+	memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+	/* Initialise the incore headers */
+	ichdr_src = *ichdr_dst;	/* struct copy */
+	ichdr_dst->firstused = args->geo->blksize;
+	ichdr_dst->usedbytes = 0;
+	ichdr_dst->count = 0;
+	ichdr_dst->holes = 0;
+	ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+	ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+						ichdr_dst->freemap[0].base;
+
+	/* write the header back to initialise the underlying buffer */
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst);
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate name/value pairs packed and in sequence.
+	 */
+	xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+				leaf_dst, ichdr_dst, 0, ichdr_src.count);
+	/*
+	 * this logs the entire buffer, but the caller must write the header
+	 * back to the buffer when it is finished modifying it.
+	 */
+	xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
+
+	kmem_free(tmpbuffer);
+}
+
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+	struct xfs_buf	*leaf1_bp,
+	struct xfs_attr3_icleaf_hdr *leaf1hdr,
+	struct xfs_buf	*leaf2_bp,
+	struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+	struct xfs_attr_leaf_entry *entries1;
+	struct xfs_attr_leaf_entry *entries2;
+
+	entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+	entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+	if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+	    ((be32_to_cpu(entries2[0].hashval) <
+	      be32_to_cpu(entries1[0].hashval)) ||
+	     (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+	      be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+		return 1;
+	}
+	return 0;
+}
+
+int
+xfs_attr_leaf_order(
+	struct xfs_buf	*leaf1_bp,
+	struct xfs_buf	*leaf2_bp)
+{
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+	struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
+
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
+	return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
+}
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block.  At present, all (one) callers pass in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block.  Those
+ * values are used in "atomic rename" operations on attributes.  Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr3_leaf_rebalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
+{
+	struct xfs_da_args	*args;
+	struct xfs_attr_leafblock *leaf1;
+	struct xfs_attr_leafblock *leaf2;
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+	struct xfs_attr_leaf_entry *entries1;
+	struct xfs_attr_leaf_entry *entries2;
+	int			count;
+	int			totallen;
+	int			max;
+	int			space;
+	int			swap;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+	leaf1 = blk1->bp->b_addr;
+	leaf2 = blk2->bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
+	ASSERT(ichdr2.count == 0);
+	args = state->args;
+
+	trace_xfs_attr_leaf_rebalance(args);
+
+	/*
+	 * Check ordering of blocks, reverse if it makes things simpler.
+	 *
+	 * NOTE: Given that all (current) callers pass in an empty
+	 * second block, this code should never set "swap".
+	 */
+	swap = 0;
+	if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+		struct xfs_da_state_blk	*tmp_blk;
+		struct xfs_attr3_icleaf_hdr tmp_ichdr;
+
+		tmp_blk = blk1;
+		blk1 = blk2;
+		blk2 = tmp_blk;
+
+		/* struct copies to swap them rather than reconverting */
+		tmp_ichdr = ichdr1;
+		ichdr1 = ichdr2;
+		ichdr2 = tmp_ichdr;
+
+		leaf1 = blk1->bp->b_addr;
+		leaf2 = blk2->bp->b_addr;
+		swap = 1;
+	}
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.  Then get
+	 * the direction to copy and the number of elements to move.
+	 *
+	 * "inleaf" is true if the new entry should be inserted into blk1.
+	 * If "swap" is also true, then reverse the sense of "inleaf".
+	 */
+	state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+						      blk2, &ichdr2,
+						      &count, &totallen);
+	if (swap)
+		state->inleaf = !state->inleaf;
+
+	/*
+	 * Move any entries required from leaf to leaf:
+	 */
+	if (count < ichdr1.count) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count = ichdr1.count - count;
+		space  = ichdr1.usedbytes - totallen;
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf2 is the destination, compact it if it looks tight.
+		 */
+		max  = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+		max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
+		if (space > max)
+			xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
+
+		/*
+		 * Move high entries from leaf1 to low end of leaf2.
+		 */
+		xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+				ichdr1.count - count, leaf2, &ichdr2, 0, count);
+
+	} else if (count > ichdr1.count) {
+		/*
+		 * I assert that since all callers pass in an empty
+		 * second buffer, this code should never execute.
+		 */
+		ASSERT(0);
+
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count -= ichdr1.count;
+		space  = totallen - ichdr1.usedbytes;
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf1 is the destination, compact it if it looks tight.
+		 */
+		max  = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+		max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
+		if (space > max)
+			xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
+
+		/*
+		 * Move low entries from leaf2 to high end of leaf1.
+		 */
+		xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+					ichdr1.count, count);
+	}
+
+	xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
+	xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
+	xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+	xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	entries1 = xfs_attr3_leaf_entryp(leaf1);
+	entries2 = xfs_attr3_leaf_entryp(leaf2);
+	blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+	blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 * NOTE: this code depends on the (current) situation that the
+	 * second block was originally empty.
+	 *
+	 * If the insertion point moved to the 2nd block, we must adjust
+	 * the index.  We must also track the entry just following the
+	 * new entry for use in an "atomic rename" operation, that entry
+	 * is always the "old" entry and the "new" entry is what we are
+	 * inserting.  The index/blkno fields refer to the "old" entry,
+	 * while the index2/blkno2 fields refer to the "new" entry.
+	 */
+	if (blk1->index > ichdr1.count) {
+		ASSERT(state->inleaf == 0);
+		blk2->index = blk1->index - ichdr1.count;
+		args->index = args->index2 = blk2->index;
+		args->blkno = args->blkno2 = blk2->blkno;
+	} else if (blk1->index == ichdr1.count) {
+		if (state->inleaf) {
+			args->index = blk1->index;
+			args->blkno = blk1->blkno;
+			args->index2 = 0;
+			args->blkno2 = blk2->blkno;
+		} else {
+			/*
+			 * On a double leaf split, the original attr location
+			 * is already stored in blkno2/index2, so don't
+			 * overwrite it overwise we corrupt the tree.
+			 */
+			blk2->index = blk1->index - ichdr1.count;
+			args->index = blk2->index;
+			args->blkno = blk2->blkno;
+			if (!state->extravalid) {
+				/*
+				 * set the new attr location to match the old
+				 * one and let the higher level split code
+				 * decide where in the leaf to place it.
+				 */
+				args->index2 = blk2->index;
+				args->blkno2 = blk2->blkno;
+			}
+		}
+	} else {
+		ASSERT(state->inleaf == 1);
+		args->index = args->index2 = blk1->index;
+		args->blkno = args->blkno2 = blk1->blkno;
+	}
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary?  With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr3_leaf_figure_balance(
+	struct xfs_da_state		*state,
+	struct xfs_da_state_blk		*blk1,
+	struct xfs_attr3_icleaf_hdr	*ichdr1,
+	struct xfs_da_state_blk		*blk2,
+	struct xfs_attr3_icleaf_hdr	*ichdr2,
+	int				*countarg,
+	int				*usedbytesarg)
+{
+	struct xfs_attr_leafblock	*leaf1 = blk1->bp->b_addr;
+	struct xfs_attr_leafblock	*leaf2 = blk2->bp->b_addr;
+	struct xfs_attr_leaf_entry	*entry;
+	int				count;
+	int				max;
+	int				index;
+	int				totallen = 0;
+	int				half;
+	int				lastdelta;
+	int				foundit = 0;
+	int				tmp;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.
+	 */
+	max = ichdr1->count + ichdr2->count;
+	half = (max + 1) * sizeof(*entry);
+	half += ichdr1->usedbytes + ichdr2->usedbytes +
+			xfs_attr_leaf_newentsize(state->args, NULL);
+	half /= 2;
+	lastdelta = state->args->geo->blksize;
+	entry = xfs_attr3_leaf_entryp(leaf1);
+	for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A)	(((A) < 0) ? -(A) : (A))
+		/*
+		 * The new entry is in the first block, account for it.
+		 */
+		if (count == blk1->index) {
+			tmp = totallen + sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args, NULL);
+			if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+				break;
+			lastdelta = XFS_ATTR_ABS(half - tmp);
+			totallen = tmp;
+			foundit = 1;
+		}
+
+		/*
+		 * Wrap around into the second block if necessary.
+		 */
+		if (count == ichdr1->count) {
+			leaf1 = leaf2;
+			entry = xfs_attr3_leaf_entryp(leaf1);
+			index = 0;
+		}
+
+		/*
+		 * Figure out if next leaf entry would be too much.
+		 */
+		tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+									index);
+		if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+			break;
+		lastdelta = XFS_ATTR_ABS(half - tmp);
+		totallen = tmp;
+#undef XFS_ATTR_ABS
+	}
+
+	/*
+	 * Calculate the number of usedbytes that will end up in lower block.
+	 * If new entry not in lower block, fix up the count.
+	 */
+	totallen -= count * sizeof(*entry);
+	if (foundit) {
+		totallen -= sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args, NULL);
+	}
+
+	*countarg = count;
+	*usedbytesarg = totallen;
+	return foundit;
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr3_leaf_toosmall(
+	struct xfs_da_state	*state,
+	int			*action)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		blkno;
+	int			bytes;
+	int			forward;
+	int			error;
+	int			retval;
+	int			i;
+
+	trace_xfs_attr_leaf_toosmall(state->args);
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	leaf = blk->bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
+	bytes = xfs_attr3_leaf_hdr_size(leaf) +
+		ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+		ichdr.usedbytes;
+	if (bytes > (state->args->geo->blksize >> 1)) {
+		*action = 0;	/* blk over 50%, don't try to join */
+		return 0;
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (ichdr.count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (ichdr.forw != 0);
+		memcpy(&state->altpath, &state->path, sizeof(state->path));
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return error;
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return 0;
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink an attribute list over time.
+	 */
+	/* start with smaller blk num */
+	forward = ichdr.forw < ichdr.back;
+	for (i = 0; i < 2; forward = !forward, i++) {
+		struct xfs_attr3_icleaf_hdr ichdr2;
+		if (forward)
+			blkno = ichdr.forw;
+		else
+			blkno = ichdr.back;
+		if (blkno == 0)
+			continue;
+		error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
+					blkno, -1, &bp);
+		if (error)
+			return error;
+
+		xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);
+
+		bytes = state->args->geo->blksize -
+			(state->args->geo->blksize >> 2) -
+			ichdr.usedbytes - ichdr2.usedbytes -
+			((ichdr.count + ichdr2.count) *
+					sizeof(xfs_attr_leaf_entry_t)) -
+			xfs_attr3_leaf_hdr_size(leaf);
+
+		xfs_trans_brelse(state->args->trans, bp);
+		if (bytes >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;
+		return 0;
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	memcpy(&state->altpath, &state->path, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da3_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return error;
+	if (retval) {
+		*action = 0;
+	} else {
+		*action = 1;
+	}
+	return 0;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ */
+int
+xfs_attr3_leaf_remove(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	int			before;
+	int			after;
+	int			smallest;
+	int			entsize;
+	int			tablesize;
+	int			tmp;
+	int			i;
+
+	trace_xfs_attr_leaf_remove(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+
+	ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+	ASSERT(args->index >= 0 && args->index < ichdr.count);
+	ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+					xfs_attr3_leaf_hdr_size(leaf));
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+	ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+	ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+	/*
+	 * Scan through free region table:
+	 *    check for adjacency of free'd entry with an existing one,
+	 *    find smallest free region in case we need to replace it,
+	 *    adjust any map that borders the entry table,
+	 */
+	tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+	tmp = ichdr.freemap[0].size;
+	before = after = -1;
+	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+	entsize = xfs_attr_leaf_entsize(leaf, args->index);
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+		ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+		if (ichdr.freemap[i].base == tablesize) {
+			ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+			ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
+		}
+
+		if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+				be16_to_cpu(entry->nameidx)) {
+			before = i;
+		} else if (ichdr.freemap[i].base ==
+				(be16_to_cpu(entry->nameidx) + entsize)) {
+			after = i;
+		} else if (ichdr.freemap[i].size < tmp) {
+			tmp = ichdr.freemap[i].size;
+			smallest = i;
+		}
+	}
+
+	/*
+	 * Coalesce adjacent freemap regions,
+	 * or replace the smallest region.
+	 */
+	if ((before >= 0) || (after >= 0)) {
+		if ((before >= 0) && (after >= 0)) {
+			ichdr.freemap[before].size += entsize;
+			ichdr.freemap[before].size += ichdr.freemap[after].size;
+			ichdr.freemap[after].base = 0;
+			ichdr.freemap[after].size = 0;
+		} else if (before >= 0) {
+			ichdr.freemap[before].size += entsize;
+		} else {
+			ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+			ichdr.freemap[after].size += entsize;
+		}
+	} else {
+		/*
+		 * Replace smallest region (if it is smaller than free'd entry)
+		 */
+		if (ichdr.freemap[smallest].size < entsize) {
+			ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+			ichdr.freemap[smallest].size = entsize;
+		}
+	}
+
+	/*
+	 * Did we remove the first entry?
+	 */
+	if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
+		smallest = 1;
+	else
+		smallest = 0;
+
+	/*
+	 * Compress the remaining entries and zero out the removed stuff.
+	 */
+	memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+	ichdr.usedbytes -= entsize;
+	xfs_trans_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+				   entsize));
+
+	tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+	memmove(entry, entry + 1, tmp);
+	ichdr.count--;
+	xfs_trans_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+	memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
+
+	/*
+	 * If we removed the first entry, re-find the first used byte
+	 * in the name area.  Note that if the entry was the "firstused",
+	 * then we don't have a "hole" in our block resulting from
+	 * removing the name.
+	 */
+	if (smallest) {
+		tmp = args->geo->blksize;
+		entry = xfs_attr3_leaf_entryp(leaf);
+		for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+			ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+			ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+			if (be16_to_cpu(entry->nameidx) < tmp)
+				tmp = be16_to_cpu(entry->nameidx);
+		}
+		ichdr.firstused = tmp;
+		ASSERT(ichdr.firstused != 0);
+	} else {
+		ichdr.holes = 1;	/* mark as needing compaction */
+	}
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+					  xfs_attr3_leaf_hdr_size(leaf)));
+
+	/*
+	 * Check if leaf is less than 50% full, caller may want to
+	 * "join" the leaf with a sibling if so.
+	 */
+	tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+	      ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+	return tmp < args->geo->magicpct; /* leaf is < 37% full */
+}
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ */
+void
+xfs_attr3_leaf_unbalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
+{
+	struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+	struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+	struct xfs_attr3_icleaf_hdr drophdr;
+	struct xfs_attr3_icleaf_hdr savehdr;
+	struct xfs_attr_leaf_entry *entry;
+
+	trace_xfs_attr_leaf_unbalance(state->args);
+
+	drop_leaf = drop_blk->bp->b_addr;
+	save_leaf = save_blk->bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
+	entry = xfs_attr3_leaf_entryp(drop_leaf);
+
+	/*
+	 * Save last hashval from dying block for later Btree fixup.
+	 */
+	drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
+
+	/*
+	 * Check if we need a temp buffer, or can we do it in place.
+	 * Note that we don't check "leaf" for holes because we will
+	 * always be dropping it, toosmall() decided that for us already.
+	 */
+	if (savehdr.holes == 0) {
+		/*
+		 * dest leaf has no holes, so we add there.  May need
+		 * to make some room in the entry array.
+		 */
+		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+					 drop_blk->bp, &drophdr)) {
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						save_leaf, &savehdr, 0,
+						drophdr.count);
+		} else {
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						save_leaf, &savehdr,
+						savehdr.count, drophdr.count);
+		}
+	} else {
+		/*
+		 * Destination has holes, so we make a temporary copy
+		 * of the leaf and add them both to that.
+		 */
+		struct xfs_attr_leafblock *tmp_leaf;
+		struct xfs_attr3_icleaf_hdr tmphdr;
+
+		tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+
+		/*
+		 * Copy the header into the temp leaf so that all the stuff
+		 * not in the incore header is present and gets copied back in
+		 * once we've moved all the entries.
+		 */
+		memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+		memset(&tmphdr, 0, sizeof(tmphdr));
+		tmphdr.magic = savehdr.magic;
+		tmphdr.forw = savehdr.forw;
+		tmphdr.back = savehdr.back;
+		tmphdr.firstused = state->args->geo->blksize;
+
+		/* write the header to the temp buffer to initialise it */
+		xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
+
+		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+					 drop_blk->bp, &drophdr)) {
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						tmp_leaf, &tmphdr, 0,
+						drophdr.count);
+			xfs_attr3_leaf_moveents(state->args,
+						save_leaf, &savehdr, 0,
+						tmp_leaf, &tmphdr, tmphdr.count,
+						savehdr.count);
+		} else {
+			xfs_attr3_leaf_moveents(state->args,
+						save_leaf, &savehdr, 0,
+						tmp_leaf, &tmphdr, 0,
+						savehdr.count);
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						tmp_leaf, &tmphdr, tmphdr.count,
+						drophdr.count);
+		}
+		memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+		savehdr = tmphdr; /* struct copy */
+		kmem_free(tmp_leaf);
+	}
+
+	xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
+	xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
+					   state->args->geo->blksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	entry = xfs_attr3_leaf_entryp(save_leaf);
+	save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ */
+int
+xfs_attr3_leaf_lookup_int(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	xfs_dahash_t		hashval;
+	int			probe;
+	int			span;
+
+	trace_xfs_attr_leaf_lookup(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+	ASSERT(ichdr.count < args->geo->blksize / 8);
+
+	/*
+	 * Binary search.  (note: small blocks will skip this loop)
+	 */
+	hashval = args->hashval;
+	probe = span = ichdr.count / 2;
+	for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
+		span /= 2;
+		if (be32_to_cpu(entry->hashval) < hashval)
+			probe += span;
+		else if (be32_to_cpu(entry->hashval) > hashval)
+			probe -= span;
+		else
+			break;
+	}
+	ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+	ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
+
+	/*
+	 * Since we may have duplicate hashval's, find the first matching
+	 * hashval in the leaf.
+	 */
+	while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
+		entry--;
+		probe--;
+	}
+	while (probe < ichdr.count &&
+	       be32_to_cpu(entry->hashval) < hashval) {
+		entry++;
+		probe++;
+	}
+	if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
+		args->index = probe;
+		return -ENOATTR;
+	}
+
+	/*
+	 * Duplicate keys may be present, so search all of them for a match.
+	 */
+	for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
+			entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+		/*
+		 * If we are looking for INCOMPLETE entries, show only those.
+		 * If we are looking for complete entries, show only those.
+		 */
+		if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+		    (entry->flags & XFS_ATTR_INCOMPLETE)) {
+			continue;
+		}
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			name_loc = xfs_attr3_leaf_name_local(leaf, probe);
+			if (name_loc->namelen != args->namelen)
+				continue;
+			if (memcmp(args->name, name_loc->nameval,
+							args->namelen) != 0)
+				continue;
+			if (!xfs_attr_namesp_match(args->flags, entry->flags))
+				continue;
+			args->index = probe;
+			return -EEXIST;
+		} else {
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
+			if (name_rmt->namelen != args->namelen)
+				continue;
+			if (memcmp(args->name, name_rmt->name,
+							args->namelen) != 0)
+				continue;
+			if (!xfs_attr_namesp_match(args->flags, entry->flags))
+				continue;
+			args->index = probe;
+			args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+			args->rmtblkcnt = xfs_attr3_rmt_blocks(
+							args->dp->i_mount,
+							args->rmtvaluelen);
+			return -EEXIST;
+		}
+	}
+	args->index = probe;
+	return -ENOATTR;
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ */
+int
+xfs_attr3_leaf_getvalue(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	int			valuelen;
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	ASSERT(ichdr.count < args->geo->blksize / 8);
+	ASSERT(args->index < ichdr.count);
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+		ASSERT(name_loc->namelen == args->namelen);
+		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
+		valuelen = be16_to_cpu(name_loc->valuelen);
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = valuelen;
+			return 0;
+		}
+		if (args->valuelen < valuelen) {
+			args->valuelen = valuelen;
+			return -ERANGE;
+		}
+		args->valuelen = valuelen;
+		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		ASSERT(name_rmt->namelen == args->namelen);
+		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+						       args->rmtvaluelen);
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = args->rmtvaluelen;
+			return 0;
+		}
+		if (args->valuelen < args->rmtvaluelen) {
+			args->valuelen = args->rmtvaluelen;
+			return -ERANGE;
+		}
+		args->valuelen = args->rmtvaluelen;
+	}
+	return 0;
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr3_leaf_moveents(
+	struct xfs_da_args		*args,
+	struct xfs_attr_leafblock	*leaf_s,
+	struct xfs_attr3_icleaf_hdr	*ichdr_s,
+	int				start_s,
+	struct xfs_attr_leafblock	*leaf_d,
+	struct xfs_attr3_icleaf_hdr	*ichdr_d,
+	int				start_d,
+	int				count)
+{
+	struct xfs_attr_leaf_entry	*entry_s;
+	struct xfs_attr_leaf_entry	*entry_d;
+	int				desti;
+	int				tmp;
+	int				i;
+
+	/*
+	 * Check for nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+	       ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+	ASSERT(ichdr_s->magic == ichdr_d->magic);
+	ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+	ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+					+ xfs_attr3_leaf_hdr_size(leaf_s));
+	ASSERT(ichdr_d->count < args->geo->blksize / 8);
+	ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+					+ xfs_attr3_leaf_hdr_size(leaf_d));
+
+	ASSERT(start_s < ichdr_s->count);
+	ASSERT(start_d <= ichdr_d->count);
+	ASSERT(count <= ichdr_s->count);
+
+
+	/*
+	 * Move the entries in the destination leaf up to make a hole?
+	 */
+	if (start_d < ichdr_d->count) {
+		tmp  = ichdr_d->count - start_d;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+		entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+		memmove(entry_d, entry_s, tmp);
+	}
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate attribute info packed and in sequence.
+	 */
+	entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+	entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+	desti = start_d;
+	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+		ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
+		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+		/*
+		 * Code to drop INCOMPLETE entries.  Difficult to use as we
+		 * may also need to change the insertion index.  Code turned
+		 * off for 6.2, should be revisited later.
+		 */
+		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+			ichdr_s->usedbytes -= tmp;
+			ichdr_s->count -= 1;
+			entry_d--;	/* to compensate for ++ in loop hdr */
+			desti--;
+			if ((start_s + i) < offset)
+				result++;	/* insertion index adjustment */
+		} else {
+#endif /* GROT */
+			ichdr_d->firstused -= tmp;
+			/* both on-disk, don't endian flip twice */
+			entry_d->hashval = entry_s->hashval;
+			entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
+			entry_d->flags = entry_s->flags;
+			ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+							<= args->geo->blksize);
+			memmove(xfs_attr3_leaf_name(leaf_d, desti),
+				xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
+			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+							<= args->geo->blksize);
+			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+			ichdr_s->usedbytes -= tmp;
+			ichdr_d->usedbytes += tmp;
+			ichdr_s->count -= 1;
+			ichdr_d->count += 1;
+			tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf_d);
+			ASSERT(ichdr_d->firstused >= tmp);
+#ifdef GROT
+		}
+#endif /* GROT */
+	}
+
+	/*
+	 * Zero out the entries we just copied.
+	 */
+	if (start_s == ichdr_s->count) {
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + args->geo->blksize));
+		memset(entry_s, 0, tmp);
+	} else {
+		/*
+		 * Move the remaining entries down to fill the hole,
+		 * then zero the entries at the top.
+		 */
+		tmp  = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+		entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+		memmove(entry_d, entry_s, tmp);
+
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + args->geo->blksize));
+		memset(entry_s, 0, tmp);
+	}
+
+	/*
+	 * Fill in the freemap information
+	 */
+	ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+	ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+	ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+	ichdr_d->freemap[1].base = 0;
+	ichdr_d->freemap[2].base = 0;
+	ichdr_d->freemap[1].size = 0;
+	ichdr_d->freemap[2].size = 0;
+	ichdr_s->holes = 1;	/* leaf may not be compact */
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(
+	struct xfs_buf	*bp,
+	int		*count)
+{
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
+	entries = xfs_attr3_leaf_entryp(bp->b_addr);
+	if (count)
+		*count = ichdr.count;
+	if (!ichdr.count)
+		return 0;
+	return be32_to_cpu(entries[ichdr.count - 1].hashval);
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute
+ * (whether local or remote only calculate bytes in this block).
+ */
+STATIC int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+	struct xfs_attr_leaf_entry *entries;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int size;
+
+	entries = xfs_attr3_leaf_entryp(leaf);
+	if (entries[index].flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, index);
+		size = xfs_attr_leaf_entsize_local(name_loc->namelen,
+						   be16_to_cpu(name_loc->valuelen));
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
+		size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
+	}
+	return size;
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute (whether local or remote only calculate bytes in this block).
+ * This routine decides as a side effect whether the attribute will be
+ * a "local" or a "remote" attribute.
+ */
+int
+xfs_attr_leaf_newentsize(
+	struct xfs_da_args	*args,
+	int			*local)
+{
+	int			size;
+
+	size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+	if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+		if (local)
+			*local = 1;
+		return size;
+	}
+	if (local)
+		*local = 0;
+	return xfs_attr_leaf_entsize_remote(args->namelen);
+}
+
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_clearflag(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp;
+	int			error;
+#ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr;
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen;
+	char *name;
+#endif /* DEBUG */
+
+	trace_xfs_attr_leaf_clearflag(args);
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	leaf = bp->b_addr;
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	ASSERT(args->index < ichdr.count);
+	ASSERT(args->index >= 0);
+
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+		namelen = name_loc->namelen;
+		name = (char *)name_loc->nameval;
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		namelen = name_rmt->namelen;
+		name = (char *)name_rmt->name;
+	}
+	ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
+	ASSERT(namelen == args->namelen);
+	ASSERT(memcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+	entry->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+	if (args->rmtblkno) {
+		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+		xfs_trans_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	return xfs_trans_roll(&args->trans, args->dp);
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_setflag(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp;
+	int error;
+#ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr;
+#endif
+
+	trace_xfs_attr_leaf_setflag(args);
+
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	leaf = bp->b_addr;
+#ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	ASSERT(args->index < ichdr.count);
+	ASSERT(args->index >= 0);
+#endif
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+	ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+	entry->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp,
+			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		name_rmt->valueblk = 0;
+		name_rmt->valuelen = 0;
+		xfs_trans_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	return xfs_trans_roll(&args->trans, args->dp);
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ */
+int
+xfs_attr3_leaf_flipflags(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf1;
+	struct xfs_attr_leafblock *leaf2;
+	struct xfs_attr_leaf_entry *entry1;
+	struct xfs_attr_leaf_entry *entry2;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp1;
+	struct xfs_buf		*bp2;
+	int error;
+#ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen1, namelen2;
+	char *name1, *name2;
+#endif /* DEBUG */
+
+	trace_xfs_attr_leaf_flipflags(args);
+
+	/*
+	 * Read the block containing the "old" attr
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+	if (error)
+		return error;
+
+	/*
+	 * Read the block containing the "new" attr, if it is different
+	 */
+	if (args->blkno2 != args->blkno) {
+		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
+					   -1, &bp2);
+		if (error)
+			return error;
+	} else {
+		bp2 = bp1;
+	}
+
+	leaf1 = bp1->b_addr;
+	entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
+
+	leaf2 = bp2->b_addr;
+	entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
+
+#ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
+	ASSERT(args->index < ichdr1.count);
+	ASSERT(args->index >= 0);
+
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
+	ASSERT(args->index2 < ichdr2.count);
+	ASSERT(args->index2 >= 0);
+
+	if (entry1->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
+		namelen1 = name_loc->namelen;
+		name1 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+		namelen1 = name_rmt->namelen;
+		name1 = (char *)name_rmt->name;
+	}
+	if (entry2->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
+		namelen2 = name_loc->namelen;
+		name2 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+		namelen2 = name_rmt->namelen;
+		name2 = (char *)name_rmt->name;
+	}
+	ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
+	ASSERT(namelen1 == namelen2);
+	ASSERT(memcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+	ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+	ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+	entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp1,
+			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+	if (args->rmtblkno) {
+		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+		xfs_trans_log_buf(args->trans, bp1,
+			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+	}
+
+	entry2->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp2,
+			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+		name_rmt->valueblk = 0;
+		name_rmt->valuelen = 0;
+		xfs_trans_log_buf(args->trans, bp2,
+			 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+	}
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_trans_roll(&args->trans, args->dp);
+
+	return error;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_attr_leaf.h b/kernel/fs/xfs/libxfs/xfs_attr_leaf.h
new file mode 100644
index 000000000..882c8d338
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define	__XFS_ATTR_LEAF_H__
+
+struct attrlist;
+struct attrlist_cursor_kern;
+struct xfs_attr_list_context;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Used to keep a list of "remote value" extents when unlinking an inode.
+ */
+typedef struct xfs_attr_inactive_list {
+	xfs_dablk_t	valueblk;	/* block number of value bytes */
+	int		valuelen;	/* number of bytes in value */
+} xfs_attr_inactive_list_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute fork size < XFS_LITINO(mp).
+ */
+void	xfs_attr_shortform_create(struct xfs_da_args *args);
+void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
+int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int	xfs_attr_shortform_remove(struct xfs_da_args *args);
+int	xfs_attr_shortform_list(struct xfs_attr_list_context *context);
+int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
+int	xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
+void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
+
+/*
+ * Internal routines when attribute fork size == XFS_LBSIZE(mp).
+ */
+int	xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int	xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
+				   struct xfs_da_args *args, int forkoff);
+int	xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int	xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int	xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_attr3_leaf_split(struct xfs_da_state *state,
+				   struct xfs_da_state_blk *oldblk,
+				   struct xfs_da_state_blk *newblk);
+int	xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
+					struct xfs_da_args *args);
+int	xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int	xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
+				 struct xfs_da_args *args);
+int	xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
+				    struct xfs_da_args *args);
+int	xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+				      struct xfs_attr_list_context *context);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void	xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
+				       struct xfs_da_state_blk *drop_blk,
+				       struct xfs_da_state_blk *save_blk);
+int	xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+
+/*
+ * Utility routines.
+ */
+xfs_dahash_t	xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
+int	xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
+				   struct xfs_buf *leaf2_bp);
+int	xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
+int	xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			struct xfs_buf **bpp);
+void	xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
+				     struct xfs_attr3_icleaf_hdr *to,
+				     struct xfs_attr_leafblock *from);
+void	xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
+				   struct xfs_attr_leafblock *to,
+				   struct xfs_attr3_icleaf_hdr *from);
+
+#endif	/* __XFS_ATTR_LEAF_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_attr_remote.c b/kernel/fs/xfs/libxfs/xfs_attr_remote.c
new file mode 100644
index 000000000..20de88d1b
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_attr_remote.c
@@ -0,0 +1,626 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+#define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
+
+/*
+ * Each contiguous block has a header, so it is not just a simple attribute
+ * length to FSB conversion.
+ */
+int
+xfs_attr3_rmt_blocks(
+	struct xfs_mount *mp,
+	int		attrlen)
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+		return (attrlen + buflen - 1) / buflen;
+	}
+	return XFS_B_TO_FSB(mp, attrlen);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking, the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+	void			*ptr,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (bno != be64_to_cpu(rmt->rm_blkno))
+		return false;
+	if (offset != be32_to_cpu(rmt->rm_offset))
+		return false;
+	if (size != be32_to_cpu(rmt->rm_bytes))
+		return false;
+	if (ino != be64_to_cpu(rmt->rm_owner))
+		return false;
+
+	/* ok */
+	return true;
+}
+
+static bool
+xfs_attr3_rmt_verify(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	int			fsbsize,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+		return false;
+	if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be64_to_cpu(rmt->rm_blkno) != bno)
+		return false;
+	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+		return false;
+	if (be32_to_cpu(rmt->rm_offset) +
+				be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+		return false;
+	if (rmt->rm_owner == 0)
+		return false;
+
+	return true;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	char		*ptr;
+	int		len;
+	xfs_daddr_t	bno;
+	int		blksize = mp->m_attr_geo->blksize;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	ptr = bp->b_addr;
+	bno = bp->b_bn;
+	len = BBTOB(bp->b_length);
+	ASSERT(len >= blksize);
+
+	while (len > 0) {
+		if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
+			xfs_buf_ioerror(bp, -EFSBADCRC);
+			break;
+		}
+		if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+			xfs_buf_ioerror(bp, -EFSCORRUPTED);
+			break;
+		}
+		len -= blksize;
+		ptr += blksize;
+		bno += BTOBB(blksize);
+	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+	else
+		ASSERT(len == 0);
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	char		*ptr;
+	int		len;
+	xfs_daddr_t	bno;
+	int		blksize = mp->m_attr_geo->blksize;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	ptr = bp->b_addr;
+	bno = bp->b_bn;
+	len = BBTOB(bp->b_length);
+	ASSERT(len >= blksize);
+
+	while (len > 0) {
+		if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+			xfs_buf_ioerror(bp, -EFSCORRUPTED);
+			xfs_verifier_error(bp);
+			return;
+		}
+		if (bip) {
+			struct xfs_attr3_rmt_hdr *rmt;
+
+			rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+			rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+		}
+		xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
+
+		len -= blksize;
+		ptr += blksize;
+		bno += BTOBB(blksize);
+	}
+	ASSERT(len == 0);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+	.verify_read = xfs_attr3_rmt_read_verify,
+	.verify_write = xfs_attr3_rmt_write_verify,
+};
+
+STATIC int
+xfs_attr3_rmt_hdr_set(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+	rmt->rm_offset = cpu_to_be32(offset);
+	rmt->rm_bytes = cpu_to_be32(size);
+	uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+	rmt->rm_owner = cpu_to_be64(ino);
+	rmt->rm_blkno = cpu_to_be64(bno);
+
+	return sizeof(struct xfs_attr3_rmt_hdr);
+}
+
+/*
+ * Helper functions to copy attribute data in and out of the one disk extents
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_ino_t	ino,
+	int		*offset,
+	int		*valuelen,
+	__uint8_t	**dst)
+{
+	char		*src = bp->b_addr;
+	xfs_daddr_t	bno = bp->b_bn;
+	int		len = BBTOB(bp->b_length);
+	int		blksize = mp->m_attr_geo->blksize;
+
+	ASSERT(len >= blksize);
+
+	while (len > 0 && *valuelen > 0) {
+		int hdr_size = 0;
+		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+		byte_cnt = min(*valuelen, byte_cnt);
+
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+						  byte_cnt, bno)) {
+				xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+					bno, *offset, byte_cnt, ino);
+				return -EFSCORRUPTED;
+			}
+			hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+		}
+
+		memcpy(*dst, src + hdr_size, byte_cnt);
+
+		/* roll buffer forwards */
+		len -= blksize;
+		src += blksize;
+		bno += BTOBB(blksize);
+
+		/* roll attribute data forwards */
+		*valuelen -= byte_cnt;
+		*dst += byte_cnt;
+		*offset += byte_cnt;
+	}
+	return 0;
+}
+
+STATIC void
+xfs_attr_rmtval_copyin(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_ino_t	ino,
+	int		*offset,
+	int		*valuelen,
+	__uint8_t	**src)
+{
+	char		*dst = bp->b_addr;
+	xfs_daddr_t	bno = bp->b_bn;
+	int		len = BBTOB(bp->b_length);
+	int		blksize = mp->m_attr_geo->blksize;
+
+	ASSERT(len >= blksize);
+
+	while (len > 0 && *valuelen > 0) {
+		int hdr_size;
+		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+		byte_cnt = min(*valuelen, byte_cnt);
+		hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+						 byte_cnt, bno);
+
+		memcpy(dst + hdr_size, *src, byte_cnt);
+
+		/*
+		 * If this is the last block, zero the remainder of it.
+		 * Check that we are actually the last block, too.
+		 */
+		if (byte_cnt + hdr_size < blksize) {
+			ASSERT(*valuelen - byte_cnt == 0);
+			ASSERT(len == blksize);
+			memset(dst + hdr_size + byte_cnt, 0,
+					blksize - hdr_size - byte_cnt);
+		}
+
+		/* roll buffer forwards */
+		len -= blksize;
+		dst += blksize;
+		bno += BTOBB(blksize);
+
+		/* roll attribute data forwards */
+		*valuelen -= byte_cnt;
+		*src += byte_cnt;
+		*offset += byte_cnt;
+	}
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+int
+xfs_attr_rmtval_get(
+	struct xfs_da_args	*args)
+{
+	struct xfs_bmbt_irec	map[ATTR_RMTVALUE_MAPSIZE];
+	struct xfs_mount	*mp = args->dp->i_mount;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		lblkno = args->rmtblkno;
+	__uint8_t		*dst = args->value;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			blkcnt = args->rmtblkcnt;
+	int			i;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_get(args);
+
+	ASSERT(!(args->flags & ATTR_KERNOVAL));
+	ASSERT(args->rmtvaluelen == args->valuelen);
+
+	valuelen = args->rmtvaluelen;
+	while (valuelen > 0) {
+		nmap = ATTR_RMTVALUE_MAPSIZE;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap >= 1);
+
+		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+			xfs_daddr_t	dblkno;
+			int		dblkcnt;
+
+			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+			       (map[i].br_startblock != HOLESTARTBLOCK));
+			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+			dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+						   dblkno, dblkcnt, 0, &bp,
+						   &xfs_attr3_rmt_buf_ops);
+			if (error)
+				return error;
+
+			error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+							&offset, &valuelen,
+							&dst);
+			xfs_buf_relse(bp);
+			if (error)
+				return error;
+
+			/* roll attribute extent map forwards */
+			lblkno += map[i].br_blockcount;
+			blkcnt -= map[i].br_blockcount;
+		}
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_bmbt_irec	map;
+	xfs_dablk_t		lblkno;
+	xfs_fileoff_t		lfileoff = 0;
+	__uint8_t		*src = args->value;
+	int			blkcnt;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_set(args);
+
+	/*
+	 * Find a "hole" in the attribute address space large enough for
+	 * us to drop the new attribute's value into. Because CRC enable
+	 * attributes have headers, we can't just do a straight byte to FSB
+	 * conversion and have to take the header space into account.
+	 */
+	blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+						   XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+	args->rmtblkcnt = blkcnt;
+
+	/*
+	 * Roll through the "value", allocating blocks on disk as required.
+	 */
+	while (blkcnt > 0) {
+		int	committed;
+
+		/*
+		 * Allocate a single extent, up to the size of the value.
+		 */
+		xfs_bmap_init(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+				  blkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				  args->firstblock, args->total, &map, &nmap,
+				  args->flist);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+
+		/*
+		 * Start the next trans in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Roll through the "value", copying the attribute value to the
+	 * already-allocated blocks.  Blocks are written synchronously
+	 * so that we can know they are all on disk before we turn off
+	 * the INCOMPLETE flag.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	valuelen = args->rmtvaluelen;
+	while (valuelen > 0) {
+		struct xfs_buf	*bp;
+		xfs_daddr_t	dblkno;
+		int		dblkcnt;
+
+		ASSERT(blkcnt > 0);
+
+		xfs_bmap_init(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, &map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+		if (!bp)
+			return -ENOMEM;
+		bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+		xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+				       &valuelen, &src);
+
+		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
+
+
+		/* roll attribute extent map forwards */
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+	struct xfs_da_args	*args)
+{
+	struct xfs_mount	*mp = args->dp->i_mount;
+	xfs_dablk_t		lblkno;
+	int			blkcnt;
+	int			error;
+	int			done;
+
+	trace_xfs_attr_rmtval_remove(args);
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's blocks.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	while (blkcnt > 0) {
+		struct xfs_bmbt_irec	map;
+		struct xfs_buf		*bp;
+		xfs_daddr_t		dblkno;
+		int			dblkcnt;
+		int			nmap;
+
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		nmap = 1;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		/*
+		 * If the "remote" value is in the cache, remove it.
+		 */
+		bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+		if (bp) {
+			xfs_buf_stale(bp);
+			xfs_buf_relse(bp);
+			bp = NULL;
+		}
+
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+	}
+
+	/*
+	 * Keep de-allocating extents until the remote-value region is gone.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	done = 0;
+	while (!done) {
+		int committed;
+
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				    1, args->firstblock, args->flist,
+				    &done);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, args->dp, 0);
+
+		/*
+		 * Close out trans and start the next one in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, args->dp);
+		if (error)
+			return error;
+	}
+	return 0;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_attr_remote.h b/kernel/fs/xfs/libxfs/xfs_attr_remote.h
new file mode 100644
index 000000000..5a9acfa15
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_attr_remote.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_REMOTE_H__
+#define	__XFS_ATTR_REMOTE_H__
+
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_rmtval_set(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_da_args *args);
+
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_attr_sf.h b/kernel/fs/xfs/libxfs/xfs_attr_sf.h
new file mode 100644
index 000000000..919756e3b
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_attr_sf.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define	__XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+
+/*
+ * Entries are packed toward the top as tight as possible.
+ */
+typedef struct xfs_attr_shortform {
+	struct xfs_attr_sf_hdr {	/* constant-structure header block */
+		__be16	totsize;	/* total bytes in shortform list */
+		__u8	count;	/* count of active entries */
+	} hdr;
+	struct xfs_attr_sf_entry {
+		__uint8_t namelen;	/* actual length of name (no NULL) */
+		__uint8_t valuelen;	/* actual length of value (no NULL) */
+		__uint8_t flags;	/* flags bits (see xfs_attr_leaf.h) */
+		__uint8_t nameval[1];	/* name & value bytes concatenated */
+	} list[1];			/* variable sized array */
+} xfs_attr_shortform_t;
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
+
+/*
+ * We generate this then sort it, attr_list() must return things in hash-order.
+ */
+typedef struct xfs_attr_sf_sort {
+	__uint8_t	entno;		/* entry number in original list */
+	__uint8_t	namelen;	/* length of name value (no null) */
+	__uint8_t	valuelen;	/* length of value */
+	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
+	xfs_dahash_t	hash;		/* this entry's hash value */
+	unsigned char	*name;		/* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
+	(((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
+#define XFS_ATTR_SF_ENTSIZE_MAX			/* max space for name&value */ \
+	((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+#define XFS_ATTR_SF_ENTSIZE(sfep)		/* space an entry uses */ \
+	((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
+#define XFS_ATTR_SF_NEXTENTRY(sfep)		/* next entry in struct */ \
+	((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
+#define XFS_ATTR_SF_TOTSIZE(dp)			/* total space in use */ \
+	(be16_to_cpu(((xfs_attr_shortform_t *)	\
+		((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+
+#endif	/* __XFS_ATTR_SF_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_bit.h b/kernel/fs/xfs/libxfs/xfs_bit.h
new file mode 100644
index 000000000..e1649c0d3
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_bit.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BIT_H__
+#define	__XFS_BIT_H__
+
+/*
+ * XFS bit manipulation routines.
+ */
+
+/*
+ * masks with n high/low bits set, 64-bit values
+ */
+static inline __uint64_t xfs_mask64hi(int n)
+{
+	return (__uint64_t)-1 << (64 - (n));
+}
+static inline __uint32_t xfs_mask32lo(int n)
+{
+	return ((__uint32_t)1 << (n)) - 1;
+}
+static inline __uint64_t xfs_mask64lo(int n)
+{
+	return ((__uint64_t)1 << (n)) - 1;
+}
+
+/* Get high bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_highbit32(__uint32_t v)
+{
+	return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(__uint64_t v)
+{
+	return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+	return ffs(v) - 1;
+}
+
+/* Get low bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_lowbit64(__uint64_t v)
+{
+	__uint32_t	w = (__uint32_t)v;
+	int		n = 0;
+
+	if (w) {	/* lower bits */
+		n = ffs(w);
+	} else {	/* upper bits */
+		w = (__uint32_t)(v >> 32);
+		if (w) {
+			n = ffs(w);
+			if (n)
+				n += 32;
+		}
+	}
+	return n - 1;
+}
+
+/* Return whether bitmap is empty (1 == empty) */
+extern int xfs_bitmap_empty(uint *map, uint size);
+
+/* Count continuous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+
+/* Find next set bit in map */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+
+#endif	/* __XFS_BIT_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap.c b/kernel/fs/xfs/libxfs/xfs_bmap.c
new file mode 100644
index 000000000..f1026e86d
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_bmap.c
@@ -0,0 +1,5945 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_filestream.h"
+
+
+kmem_zone_t		*xfs_bmap_free_item_zone;
+
+/*
+ * Miscellaneous helper functions
+ */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.  Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	int		whichfork)	/* data or attr fork */
+{
+	int		level;		/* btree level */
+	uint		maxblocks;	/* max blocks at this level */
+	uint		maxleafents;	/* max leaf entries possible */
+	int		maxrootrecs;	/* max records in root block */
+	int		minleafrecs;	/* min records in leaf block */
+	int		minnoderecs;	/* min records in node block */
+	int		sz;		/* root block size */
+
+	/*
+	 * The maximum number of extents in a file, hence the maximum
+	 * number of leaf entries, is controlled by the type of di_nextents
+	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+	 * (a signed 16-bit number, xfs_aextnum_t).
+	 *
+	 * Note that we can no longer assume that if we are in ATTR1 that
+	 * the fork offset of all the inodes will be
+	 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+	 * with ATTR2 and then mounted back with ATTR1, keeping the
+	 * di_forkoff's fixed but probably at various positions. Therefore,
+	 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+	 * of a minimum size available.
+	 */
+	if (whichfork == XFS_DATA_FORK) {
+		maxleafents = MAXEXTNUM;
+		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+	} else {
+		maxleafents = MAXAEXTNUM;
+		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	}
+	maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
+	minleafrecs = mp->m_bmap_dmnr[0];
+	minnoderecs = mp->m_bmap_dmnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++) {
+		if (maxblocks <= maxrootrecs)
+			maxblocks = 1;
+		else
+			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	}
+	mp->m_bm_maxlevels[whichfork] = level;
+}
+
+STATIC int				/* error */
+xfs_bmbt_lookup_eq(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int				/* error */
+xfs_bmbt_lookup_ge(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+		XFS_IFORK_NEXTENTS(ip, whichfork) >
+			XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+		XFS_IFORK_NEXTENTS(ip, whichfork) <=
+			XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	xfs_exntst_t		state)
+{
+	union xfs_btree_rec	rec;
+
+	xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+	return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_filblks_t	len)		/* delayed extent length */
+{
+	int		level;		/* btree level number */
+	int		maxrecs;	/* maximum record count at this level */
+	xfs_mount_t	*mp;		/* mount structure */
+	xfs_filblks_t	rval;		/* return value */
+
+	mp = ip->i_mount;
+	maxrecs = mp->m_bmap_dmxr[0];
+	for (level = 0, rval = 0;
+	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+	     level++) {
+		len += maxrecs - 1;
+		do_div(len, maxrecs);
+		rval += len;
+		if (len == 1)
+			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+				level - 1;
+		if (level == 0)
+			maxrecs = mp->m_bmap_dmxr[1];
+	}
+	return rval;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			offset;
+
+	if (mp->m_sb.sb_inodesize == 256) {
+		offset = XFS_LITINO(mp, ip->i_d.di_version) -
+				XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	} else {
+		offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+	}
+
+	ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+	return offset;
+}
+
+/*
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	if (whichfork == XFS_ATTR_FORK &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+		if (dfl_forkoff > ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = dfl_forkoff;
+	}
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+	struct xfs_btree_cur	*cur,
+	xfs_fsblock_t		bno)
+{
+	struct xfs_log_item_desc *lidp;
+	int			i;
+
+	if (!cur)
+		return NULL;
+
+	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+		if (!cur->bc_bufs[i])
+			break;
+		if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+			return cur->bc_bufs[i];
+	}
+
+	/* Chase down all the log items to see if the bp is there */
+	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+		struct xfs_buf_log_item	*bip;
+		bip = (struct xfs_buf_log_item *)lidp->lid_item;
+		if (bip->bli_item.li_type == XFS_LI_BUF &&
+		    XFS_BUF_ADDR(bip->bli_buf) == bno)
+			return bip->bli_buf;
+	}
+
+	return NULL;
+}
+
+STATIC void
+xfs_check_block(
+	struct xfs_btree_block	*block,
+	xfs_mount_t		*mp,
+	int			root,
+	short			sz)
+{
+	int			i, j, dmxr;
+	__be64			*pp, *thispa;	/* pointer to block address */
+	xfs_bmbt_key_t		*prevp, *keyp;
+
+	ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+	prevp = NULL;
+	for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+		dmxr = mp->m_bmap_dmxr[0];
+		keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+		if (prevp) {
+			ASSERT(be64_to_cpu(prevp->br_startoff) <
+			       be64_to_cpu(keyp->br_startoff));
+		}
+		prevp = keyp;
+
+		/*
+		 * Compare the block numbers to see if there are dups.
+		 */
+		if (root)
+			pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+		else
+			pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+		for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+			if (root)
+				thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+			else
+				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+			if (*thispa == *pp) {
+				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+					__func__, j, i,
+					(unsigned long long)be64_to_cpu(*thispa));
+				panic("%s: ptrs are equal in node\n",
+					__func__);
+			}
+		}
+	}
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+	xfs_btree_cur_t		*cur,	/* btree cursor or null */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_extnum_t		i=0, j;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
+	xfs_bmbt_rec_t		last = {0, 0}; /* last extent in prev block */
+	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
+	int			bp_release = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+		return;
+	}
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		/* See if buf is in cur first */
+		bp_release = 0;
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (!bp) {
+			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				goto error_norelse;
+		}
+		block = XFS_BUF_TO_BLOCK(bp);
+		if (level == 0)
+			break;
+
+		/*
+		 * Check this block for basic sanity (increasing keys and
+		 * no duplicate blocks).
+		 */
+
+		xfs_check_block(block, mp, 0, 0);
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+					XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+	}
+
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	i = 0;
+
+	/*
+	 * Loop over all leaf nodes checking that all extents are in the right order.
+	 */
+	for (;;) {
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+
+		num_recs = xfs_btree_get_numrecs(block);
+
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+		/*
+		 * Check all the extents to make sure they are OK.
+		 * If we had a previous block, the last entry should
+		 * conform with the first entry in this one.
+		 */
+
+		ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+		if (i) {
+			ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+			       xfs_bmbt_disk_get_blockcount(&last) <=
+			       xfs_bmbt_disk_get_startoff(ep));
+		}
+		for (j = 1; j < num_recs; j++) {
+			nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+			ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+			       xfs_bmbt_disk_get_blockcount(ep) <=
+			       xfs_bmbt_disk_get_startoff(nextp));
+			ep = nextp;
+		}
+
+		last = *ep;
+		i += num_recs;
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+
+		bp_release = 0;
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (!bp) {
+			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				goto error_norelse;
+		}
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	if (bp_release) {
+		bp_release = 0;
+		xfs_trans_brelse(NULL, bp);
+	}
+	return;
+
+error0:
+	xfs_warn(mp, "%s: at error0", __func__);
+	if (bp_release)
+		xfs_trans_brelse(NULL, bp);
+error_norelse:
+	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+		__func__, i);
+	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+	return;
+}
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ */
+void
+xfs_bmap_trace_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	cnt,		/* count of entries in the list */
+	int		whichfork,	/* data or attr fork */
+	unsigned long	caller_ip)
+{
+	xfs_extnum_t	idx;		/* extent record index */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		state = 0;
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	for (idx = 0; idx < cnt; idx++)
+		trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_bmbt_irec_t		*mval,
+	int			nmap,
+	int			ret_nmap)
+{
+	int			i;		/* index to map values */
+
+	ASSERT(ret_nmap <= nmap);
+
+	for (i = 0; i < ret_nmap; i++) {
+		ASSERT(mval[i].br_blockcount > 0);
+		if (!(flags & XFS_BMAPI_ENTIRE)) {
+			ASSERT(mval[i].br_startoff >= bno);
+			ASSERT(mval[i].br_blockcount <= len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+			       bno + len);
+		} else {
+			ASSERT(mval[i].br_startoff < bno + len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+			       bno);
+		}
+		ASSERT(i == 0 ||
+		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+		       mval[i].br_startoff);
+		ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+		       mval[i].br_startblock != HOLESTARTBLOCK);
+		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+		       mval[i].br_state == XFS_EXT_UNWRITTEN);
+	}
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)		do { } while (0)
+#define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+
+/*
+ * bmap free list manipulation functions
+ */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	xfs_mount_t		*mp)		/* mount point structure */
+{
+	xfs_bmap_free_item_t	*cur;		/* current (next) element */
+	xfs_bmap_free_item_t	*new;		/* new element */
+	xfs_bmap_free_item_t	*prev;		/* previous element */
+#ifdef DEBUG
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(len > 0);
+	ASSERT(len <= MAXEXTLEN);
+	ASSERT(!isnullstartblock(bno));
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(agbno < mp->m_sb.sb_agblocks);
+	ASSERT(len < mp->m_sb.sb_agblocks);
+	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+	ASSERT(xfs_bmap_free_item_zone != NULL);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new->xbfi_startblock = bno;
+	new->xbfi_blockcount = (xfs_extlen_t)len;
+	for (prev = NULL, cur = flist->xbf_first;
+	     cur != NULL;
+	     prev = cur, cur = cur->xbfi_next) {
+		if (cur->xbfi_startblock >= bno)
+			break;
+	}
+	if (prev)
+		prev->xbfi_next = new;
+	else
+		flist->xbf_first = new;
+	new->xbfi_next = cur;
+	flist->xbf_count++;
+}
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist,	/* free item list header */
+	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
+	xfs_bmap_free_item_t	*free)	/* list item to be freed */
+{
+	if (prev)
+		prev->xbfi_next = free->xbfi_next;
+	else
+		flist->xbf_first = free->xbfi_next;
+	flist->xbf_count--;
+	kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist)	/* list of bmap_free_items */
+{
+	xfs_bmap_free_item_t	*free;	/* free list item */
+	xfs_bmap_free_item_t	*next;
+
+	if (flist->xbf_count == 0)
+		return;
+	ASSERT(flist->xbf_first != NULL);
+	for (free = flist->xbf_first; free; free = next) {
+		next = free->xbfi_next;
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int				/* error */
+xfs_bmap_btree_to_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork)  /* data or attr fork */
+{
+	/* REFERENCED */
+	struct xfs_btree_block	*cblock;/* child btree block */
+	xfs_fsblock_t		cbno;	/* child block number */
+	xfs_buf_t		*cbp;	/* child block's buffer */
+	int			error;	/* error return value */
+	xfs_ifork_t		*ifp;	/* inode fork data */
+	xfs_mount_t		*mp;	/* mount point structure */
+	__be64			*pp;	/* ptr to block address */
+	struct xfs_btree_block	*rblock;/* root btree block */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+	rblock = ifp->if_broot;
+	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+	cbno = be64_to_cpu(*pp);
+	*logflagsp = 0;
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+		return error;
+#endif
+	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+				&xfs_bmbt_buf_ops);
+	if (error)
+		return error;
+	cblock = XFS_BUF_TO_BLOCK(cbp);
+	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+		return error;
+	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+	ip->i_d.di_nblocks--;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(tp, cbp);
+	if (cur->bc_bufs[0] == cbp)
+		cur->bc_bufs[0] = NULL;
+	xfs_iroot_realloc(ip, -1, whichfork);
+	ASSERT(ifp->if_broot == NULL);
+	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+	return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*ablock;	/* allocated (child) bt block */
+	xfs_buf_t		*abp;		/* buffer for ablock */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_rec_t		*arp;		/* child record pointer */
+	struct xfs_btree_block	*block;		/* btree root block */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		i, cnt;		/* extent record index */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_bmbt_key_t		*kp;		/* root block key pointer */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* number of file extents */
+	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+
+	/*
+	 * Make space in the inode incore.
+	 */
+	xfs_iroot_realloc(ip, 1, whichfork);
+	ifp->if_flags |= XFS_IFBROOT;
+
+	/*
+	 * Fill in the root.
+	 */
+	block = ifp->if_broot;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+
+	/*
+	 * Need a cursor.  Can't allocate until bb_level is filled in.
+	 */
+	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+	cur->bc_private.b.firstblock = *firstblock;
+	cur->bc_private.b.flist = flist;
+	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+	/*
+	 * Convert to a btree with two levels, one record in root.
+	 */
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = mp;
+	args.firstblock = *firstblock;
+	if (*firstblock == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+	} else if (flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = *firstblock;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.fsbno = *firstblock;
+	}
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = wasdel;
+	*logflagsp = 0;
+	if ((error = xfs_alloc_vextent(&args))) {
+		xfs_iroot_realloc(ip, -1, whichfork);
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	/*
+	 * Allocation can't fail, the space was reserved.
+	 */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(*firstblock == NULLFSBLOCK ||
+	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+	       (flist->xbf_low &&
+		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	ip->i_d.di_nblocks++;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	/*
+	 * Fill in the child block.
+	 */
+	abp->b_ops = &xfs_bmbt_buf_ops;
+	ablock = XFS_BUF_TO_BLOCK(abp);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS);
+
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (cnt = i = 0; i < nextents; i++) {
+		ep = xfs_iext_get_ext(ifp, i);
+		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+			arp->l0 = cpu_to_be64(ep->l0);
+			arp->l1 = cpu_to_be64(ep->l1);
+			arp++; cnt++;
+		}
+	}
+	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+	xfs_btree_set_numrecs(ablock, cnt);
+
+	/*
+	 * Fill in the root key and pointer.
+	 */
+	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+						be16_to_cpu(block->bb_level)));
+	*pp = cpu_to_be64(args.fsbno);
+
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+	ASSERT(*curp == NULL);
+	*curp = cur;
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+	return 0;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+void
+xfs_bmap_local_to_extents_empty(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	ASSERT(ifp->if_bytes == 0);
+	ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+
+	xfs_bmap_forkoff_reset(ip, whichfork);
+	ifp->if_flags &= ~XFS_IFINLINE;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+}
+
+
+STATIC int				/* error */
+xfs_bmap_local_to_extents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
+	xfs_extlen_t	total,		/* total blocks needed by transaction */
+	int		*logflagsp,	/* inode logging flags */
+	int		whichfork,
+	void		(*init_fn)(struct xfs_trans *tp,
+				   struct xfs_buf *bp,
+				   struct xfs_inode *ip,
+				   struct xfs_ifork *ifp))
+{
+	int		error = 0;
+	int		flags;		/* logging flags returned */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_alloc_arg_t	args;		/* allocation arguments */
+	xfs_buf_t	*bp;		/* buffer for extent block */
+	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
+
+	/*
+	 * We don't want to deal with the case of keeping inode data inline yet.
+	 * So sending the data fork of a regular inode is invalid.
+	 */
+	ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+
+	if (!ifp->if_bytes) {
+		xfs_bmap_local_to_extents_empty(ip, whichfork);
+		flags = XFS_ILOG_CORE;
+		goto done;
+	}
+
+	flags = 0;
+	error = 0;
+	ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
+								XFS_IFINLINE);
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = ip->i_mount;
+	args.firstblock = *firstblock;
+	/*
+	 * Allocate a block.  We know we need only one, since the
+	 * file currently fits in an inode.
+	 */
+	if (*firstblock == NULLFSBLOCK) {
+		args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	} else {
+		args.fsbno = *firstblock;
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	}
+	args.total = total;
+	args.minlen = args.maxlen = args.prod = 1;
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		goto done;
+
+	/* Can't fail, the space was reserved. */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(args.len == 1);
+	*firstblock = args.fsbno;
+	bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+	/*
+	 * Initialise the block and copy the data
+	 *
+	 * Note: init_fn must set the buffer log item type correctly!
+	 */
+	init_fn(tp, bp, ip, ifp);
+
+	/* account for the change in fork size and log everything */
+	xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+	xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+	xfs_bmap_local_to_extents_empty(ip, whichfork);
+	flags |= XFS_ILOG_CORE;
+
+	xfs_iext_add(ifp, 0, 1);
+	ep = xfs_iext_get_ext(ifp, 0);
+	xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+	trace_xfs_bmap_post_update(ip, 0,
+			whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+			_THIS_IP_);
+	XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+	ip->i_d.di_nblocks = 1;
+	xfs_trans_mod_dquot_byino(tp, ip,
+		XFS_TRANS_DQ_BCOUNT, 1L);
+	flags |= xfs_ilog_fext(whichfork);
+
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;		/* btree cursor */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* file system mount struct */
+	int			stat;		/* newroot status */
+
+	mp = ip->i_mount;
+	if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
+		*flags |= XFS_ILOG_DBROOT;
+	else {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.firstblock = *firstblock;
+		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
+			goto error0;
+		/* must be at least one entry */
+		XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
+		if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
+			goto error0;
+		if (stat == 0) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+			return -ENOSPC;
+		}
+		*firstblock = cur->bc_private.b.firstblock;
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	}
+	return 0;
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_extents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	int			error;		/* error return value */
+
+	if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+		return 0;
+	cur = NULL;
+	error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+		flags, XFS_DATA_FORK);
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formating, others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_local(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_da_args_t		dargs;		/* args for dir/attr code */
+
+	if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+		return 0;
+
+	if (S_ISDIR(ip->i_d.di_mode)) {
+		memset(&dargs, 0, sizeof(dargs));
+		dargs.geo = ip->i_mount->m_dir_geo;
+		dargs.dp = ip;
+		dargs.firstblock = firstblock;
+		dargs.flist = flist;
+		dargs.total = dargs.geo->fsbcount;
+		dargs.whichfork = XFS_DATA_FORK;
+		dargs.trans = tp;
+		return xfs_dir2_sf_to_block(&dargs);
+	}
+
+	if (S_ISLNK(ip->i_d.di_mode))
+		return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+						 flags, XFS_DATA_FORK,
+						 xfs_symlink_local_to_remote);
+
+	/* should only be called for types that support local format data */
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int						/* error code */
+xfs_bmap_add_attrfork(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			size,		/* space new attribute needs */
+	int			rsvd)		/* xact may use reserved blks */
+{
+	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
+	xfs_bmap_free_t		flist;		/* freed extent records */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int			blks;		/* space reservation */
+	int			version = 1;	/* superblock attr version */
+	int			committed;	/* xaction was committed */
+	int			logflags;	/* logging flags */
+	int			error;		/* error return value */
+	int			cancel_flags = 0;
+
+	ASSERT(XFS_IFORK_Q(ip) == 0);
+
+	mp = ip->i_mount;
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+	blks = XFS_ADDAFORK_SPACE_RES(mp);
+	if (rsvd)
+		tp->t_flags |= XFS_TRANS_RESERVE;
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+			XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		goto trans_cancel;
+	cancel_flags |= XFS_TRANS_ABORT;
+	if (XFS_IFORK_Q(ip))
+		goto trans_cancel;
+	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+		/*
+		 * For inodes coming from pre-6.2 filesystems.
+		 */
+		ASSERT(ip->i_d.di_aformat == 0);
+		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	}
+	ASSERT(ip->i_d.di_anextents == 0);
+
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_UUID:
+		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+		if (!ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+		else if (mp->m_flags & XFS_MOUNT_ATTR2)
+			version = 2;
+		break;
+	default:
+		ASSERT(0);
+		error = -EINVAL;
+		goto trans_cancel;
+	}
+
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp->if_flags = XFS_IFEXTENTS;
+	logflags = 0;
+	xfs_bmap_init(&flist, &firstblock);
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+			&flist, &logflags);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	default:
+		error = 0;
+		break;
+	}
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (error)
+		goto bmap_cancel;
+	if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+	   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+		bool log_sb = false;
+
+		spin_lock(&mp->m_sb_lock);
+		if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+			xfs_sb_version_addattr(&mp->m_sb);
+			log_sb = true;
+		}
+		if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+			xfs_sb_version_addattr2(&mp->m_sb);
+			log_sb = true;
+		}
+		spin_unlock(&mp->m_sb_lock);
+		if (log_sb)
+			xfs_log_sb(tp);
+	}
+
+	error = xfs_bmap_finish(&tp, &flist, &committed);
+	if (error)
+		goto bmap_cancel;
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+
+bmap_cancel:
+	xfs_bmap_cancel(&flist);
+trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Internal and external extent tree search functions.
+ */
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+int					/* error */
+xfs_bmap_read_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode */
+	int			whichfork) /* data or attr fork */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
+	xfs_extnum_t		i, j;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+	/* REFERENCED */
+	xfs_extnum_t		room;	/* number of entries there's room for */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+					XFS_EXTFMT_INODE(ip);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+		if (error)
+			return error;
+		block = XFS_BUF_TO_BLOCK(bp);
+		if (level == 0)
+			break;
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+			XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		xfs_trans_brelse(tp, bp);
+	}
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	i = 0;
+	/*
+	 * Loop over all leaf nodes.  Copy information to the extent records.
+	 */
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+		xfs_extnum_t	start;
+
+		num_recs = xfs_btree_get_numrecs(block);
+		if (unlikely(i + num_recs > room)) {
+			ASSERT(i + num_recs <= room);
+			xfs_warn(ip->i_mount,
+				"corrupt dinode %Lu, (btree extents).",
+				(unsigned long long) ip->i_ino);
+			XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+				XFS_ERRLEVEL_LOW, ip->i_mount, block);
+			goto error0;
+		}
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+		if (nextbno != NULLFSBLOCK)
+			xfs_btree_reada_bufl(mp, nextbno, 1,
+					     &xfs_bmbt_buf_ops);
+		/*
+		 * Copy records into the extent records.
+		 */
+		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+		start = i;
+		for (j = 0; j < num_recs; j++, i++, frp++) {
+			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+			trp->l0 = be64_to_cpu(frp->l0);
+			trp->l1 = be64_to_cpu(frp->l1);
+		}
+		if (exntf == XFS_EXTFMT_NOSTATE) {
+			/*
+			 * Check all attribute bmap btree records and
+			 * any "older" data bmap btree records for a
+			 * set bit in the "extent flag" position.
+			 */
+			if (unlikely(xfs_check_nostate_extents(ifp,
+					start, num_recs))) {
+				XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+						 XFS_ERRLEVEL_LOW,
+						 ip->i_mount);
+				goto error0;
+			}
+		}
+		xfs_trans_brelse(tp, bp);
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+		if (error)
+			return error;
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+	XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+	return 0;
+error0:
+	xfs_trans_brelse(tp, bp);
+	return -EFSCORRUPTED;
+}
+
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies
+ * past eof, *eofp will be set, and *prevp will contain the last
+ * entry (null if none).  Else, *lastxp will be set to the index
+ * of the found entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t	*gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t	*prevp)		/* out: previous extent entry found */
+{
+	xfs_bmbt_rec_host_t *ep;		/* extent record pointer */
+	xfs_extnum_t	lastx;		/* last extent index */
+
+	/*
+	 * Initialize the extent entry structure to catch access to
+	 * uninitialized br_startblock field.
+	 */
+	gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+	gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+	gotp->br_state = XFS_EXT_INVALID;
+	gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+	prevp->br_startoff = NULLFILEOFF;
+
+	ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
+	if (lastx > 0) {
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
+	}
+	if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+		xfs_bmbt_get_all(ep, gotp);
+		*eofp = 0;
+	} else {
+		if (lastx > 0) {
+			*gotp = *prevp;
+		}
+		*eofp = 1;
+		ep = NULL;
+	}
+	*lastxp = lastx;
+	return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
+xfs_bmap_search_extents(
+	xfs_inode_t     *ip,            /* incore inode pointer */
+	xfs_fileoff_t   bno,            /* block number searched for */
+	int             fork,      	/* data or attr fork */
+	int             *eofp,          /* out: end of file found */
+	xfs_extnum_t    *lastxp,        /* out: last extent index */
+	xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
+	xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
+
+	XFS_STATS_INC(xs_look_exlist);
+	ifp = XFS_IFORK_PTR(ip, fork);
+
+	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+				"Access to block zero in inode %llu "
+				"start_block: %llx start_off: %llx "
+				"blkcnt: %llx extent-state: %x lastx: %x",
+			(unsigned long long)ip->i_ino,
+			(unsigned long long)gotp->br_startblock,
+			(unsigned long long)gotp->br_startoff,
+			(unsigned long long)gotp->br_blockcount,
+			gotp->br_state, *lastxp);
+		*lastxp = NULLEXTNUM;
+		*eofp = 1;
+		return NULL;
+	}
+	return ep;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ */
+int						/* error */
+xfs_bmap_first_unused(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_extlen_t	len,			/* size of hole to find */
+	xfs_fileoff_t	*first_unused,		/* unused block */
+	int		whichfork)		/* data or attr fork */
+{
+	int		error;			/* error return value */
+	int		idx;			/* extent record index */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_fileoff_t	lastaddr;		/* last block number seen */
+	xfs_fileoff_t	lowest;			/* lowest useful block */
+	xfs_fileoff_t	max;			/* starting useful block */
+	xfs_fileoff_t	off;			/* offset for this block */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*first_unused = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	lowest = *first_unused;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+		off = xfs_bmbt_get_startoff(ep);
+		/*
+		 * See if the hole before this extent will work.
+		 */
+		if (off >= lowest + len && off - max >= len) {
+			*first_unused = max;
+			return 0;
+		}
+		lastaddr = off + xfs_bmbt_get_blockcount(ep);
+		max = XFS_FILEOFF_MAX(lastaddr, lowest);
+	}
+	*first_unused = max;
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block - 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_fileoff_t	bno;			/* input file offset */
+	int		eof;			/* hit end of file */
+	xfs_bmbt_rec_host_t *ep;		/* pointer to last extent */
+	int		error;			/* error return value */
+	xfs_bmbt_irec_t	got;			/* current extent value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	lastx;			/* last extent used */
+	xfs_bmbt_irec_t	prev;			/* previous extent value */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	       return -EIO;
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	bno = *last_block - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+		if (prev.br_startoff == NULLFILEOFF)
+			*last_block = 0;
+		else
+			*last_block = prev.br_startoff + prev.br_blockcount;
+	}
+	/*
+	 * Otherwise *last_block is already the right answer.
+	 */
+	return 0;
+}
+
+int
+xfs_bmap_last_extent(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*rec,
+	int			*is_empty)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	int			error;
+	int			nextents;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*is_empty = 1;
+		return 0;
+	}
+
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+	*is_empty = 0;
+	return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	bma->aeof = 0;
+	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+				     &is_empty);
+	if (error)
+		return error;
+
+	if (is_empty) {
+		bma->aeof = 1;
+		return 0;
+	}
+
+	/*
+	 * Check if we are allocation or past the last extent, or at least into
+	 * the last delayed allocated extent.
+	 */
+	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+		(bma->offset >= rec.br_startoff &&
+		 isnullstartblock(rec.br_startblock));
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*last_block,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	*last_block = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+		return 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+	       return -EIO;
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+	if (error || is_empty)
+		return error;
+
+	*last_block = rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int					/* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_host_t *ep;	/* ptr to fork's extent */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		rval;		/* return value */
+	xfs_bmbt_irec_t	s;		/* internal version of extent */
+
+#ifndef DEBUG
+	if (whichfork == XFS_DATA_FORK)
+		return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif	/* !DEBUG */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+		return 0;
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		return 0;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ep = xfs_iext_get_ext(ifp, 0);
+	xfs_bmbt_get_all(ep, &s);
+	rval = s.br_startoff == 0 && s.br_blockcount == 1;
+	if (rval && whichfork == XFS_DATA_FORK)
+		ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+	return rval;
+}
+
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+
+/*
+ * Convert a delayed allocation to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_delay_real(
+	struct xfs_bmalloca	*bma)
+{
+	struct xfs_bmbt_irec	*new = &bma->got;
+	int			diff;	/* temp value */
+	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
+	int			error;	/* error return value */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
+	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+	int			rval=0;	/* return value (logging flags) */
+	int			state = 0;/* state bits, accessed thru macros */
+	xfs_filblks_t		da_new; /* new count del alloc blocks used */
+	xfs_filblks_t		da_old; /* old count del alloc blocks used */
+	xfs_filblks_t		temp=0;	/* value for da_new calculations */
+	xfs_filblks_t		temp2=0;/* value for da_new calculations */
+	int			tmp_rval;	/* partial logging flags */
+	struct xfs_mount	*mp;
+
+	mp  = bma->tp ? bma->tp->t_mountp : NULL;
+	ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+
+	ASSERT(bma->idx >= 0);
+	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+	ASSERT(!bma->cur ||
+	       (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+	XFS_STATS_INC(xs_add_exlist);
+
+#define	LEFT		r[0]
+#define	RIGHT		r[1]
+#define	PREV		r[2]
+
+	/*
+	 * Set up a bunch of variables to make the tests simpler.
+	 */
+	ep = xfs_iext_get_ext(ifp, bma->idx);
+	xfs_bmbt_get_all(ep, &PREV);
+	new_endoff = new->br_startoff + new->br_blockcount;
+	ASSERT(PREV.br_startoff <= new->br_startoff);
+	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+	da_old = startblockval(PREV.br_startblock);
+	da_new = 0;
+
+	/*
+	 * Set flags determining what part of the previous delayed allocation
+	 * extent is being replaced by a real allocation.
+	 */
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 */
+	if (bma->idx > 0) {
+		state |= BMAP_LEFT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
+	}
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == new->br_state &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	/*
+	 * Check and set flags if this segment has a right neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 * Also check for all-three-contiguous being too large.
+	 */
+	if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
+
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
+	}
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    new->br_state == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
+	error = 0;
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		bma->idx--;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+			LEFT.br_blockcount + PREV.br_blockcount +
+			RIGHT.br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+		bma->ip->i_d.di_nextents--;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_btree_delete(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_btree_decrement(bma->cur, 0, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, LEFT.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		bma->idx--;
+
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+			LEFT.br_blockcount + PREV.br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount, LEFT.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount + RIGHT.br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
+					new->br_startblock,
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, PREV.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+		/*
+		 * Filling in the first part of a previous delayed allocation.
+		 * The left neighbor is contiguous.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
+			LEFT.br_blockcount + new->br_blockcount);
+		xfs_bmbt_set_startoff(ep,
+			PREV.br_startoff + new->br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+
+		temp = PREV.br_blockcount - new->br_blockcount;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					new->br_blockcount,
+					LEFT.br_state);
+			if (error)
+				goto done;
+		}
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		bma->idx--;
+		break;
+
+	case BMAP_LEFT_FILLING:
+		/*
+		 * Filling in the first part of a previous delayed allocation.
+		 * The left neighbor is not contiguous.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startoff(ep, new_endoff);
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist,
+					&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock) -
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		ep = xfs_iext_get_ext(ifp, bma->idx + 1);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+		break;
+
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Filling in the last part of a previous delayed allocation.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		temp = PREV.br_blockcount - new->br_blockcount;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
+			new->br_startoff, new->br_startblock,
+			new->br_blockcount + RIGHT.br_blockcount,
+			RIGHT.br_state);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_bmbt_update(bma->cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+					RIGHT.br_blockcount,
+					RIGHT.br_state);
+			if (error)
+				goto done;
+		}
+
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock));
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		bma->idx++;
+		break;
+
+	case BMAP_RIGHT_FILLING:
+		/*
+		 * Filling in the last part of a previous delayed allocation.
+		 * The right neighbor is not contiguous.
+		 */
+		temp = PREV.br_blockcount - new->br_blockcount;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur, 1,
+				&tmp_rval, XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock) -
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		ep = xfs_iext_get_ext(ifp, bma->idx);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		bma->idx++;
+		break;
+
+	case 0:
+		/*
+		 * Filling in the middle part of a previous delayed allocation.
+		 * Contiguity is impossible here.
+		 * This case is avoided almost all the time.
+		 *
+		 * We start with a delayed allocation:
+		 *
+		 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+		 *  PREV @ idx
+		 *
+	         * and we are allocating:
+		 *                     +rrrrrrrrrrrrrrrrr+
+		 *			      new
+		 *
+		 * and we set it up for insertion as:
+		 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+		 *                            new
+		 *  PREV @ idx          LEFT              RIGHT
+		 *                      inserted at idx + 1
+		 */
+		temp = new->br_startoff - PREV.br_startoff;
+		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
+		LEFT = *new;
+		RIGHT.br_state = PREV.br_state;
+		RIGHT.br_startblock = nullstartblock(
+				(int)xfs_bmap_worst_indlen(bma->ip, temp2));
+		RIGHT.br_startoff = new_endoff;
+		RIGHT.br_blockcount = temp2;
+		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+		xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist, &bma->cur,
+					1, &tmp_rval, XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		temp = xfs_bmap_worst_indlen(bma->ip, temp);
+		temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
+		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		if (diff > 0) {
+			error = xfs_mod_fdblocks(bma->ip->i_mount,
+						 -((int64_t)diff), false);
+			ASSERT(!error);
+			if (error)
+				goto done;
+		}
+
+		ep = xfs_iext_get_ext(ifp, bma->idx);
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
+			nullstartblock((int)temp2));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+
+		bma->idx++;
+		da_new = temp + temp2;
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
+		/*
+		 * These cases are all impossible.
+		 */
+		ASSERT(0);
+	}
+
+	/* convert to a btree if necessary */
+	if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(bma->cur == NULL);
+		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur,
+				da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+		bma->logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* adjust for changes in reserved delayed indirect blocks */
+	if (da_old || da_new) {
+		temp = da_new;
+		if (bma->cur)
+			temp += bma->cur->bc_private.b.allocated;
+		ASSERT(temp <= da_old);
+		if (temp < da_old)
+			xfs_mod_fdblocks(bma->ip->i_mount,
+					(int64_t)(da_old - temp), false);
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (bma->cur)
+		bma->cur->bc_private.b.allocated = 0;
+
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
+done:
+	bma->logflags |= rval;
+	return error;
+#undef	LEFT
+#undef	RIGHT
+#undef	PREV
+}
+
+/*
+ * Convert an unwritten allocation to a real allocation or vice versa.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_unwritten_real(
+	struct xfs_trans	*tp,
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
+	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
+	int			*logflagsp) /* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;	/* btree cursor */
+	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
+	int			error;	/* error return value */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
+	xfs_exntst_t		newext;	/* new extent state */
+	xfs_exntst_t		oldext;	/* old extent state */
+	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+	int			rval=0;	/* return value (logging flags) */
+	int			state = 0;/* state bits, accessed thru macros */
+	struct xfs_mount	*mp = tp->t_mountp;
+
+	*logflagsp = 0;
+
+	cur = *curp;
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+	ASSERT(*idx >= 0);
+	ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+
+	XFS_STATS_INC(xs_add_exlist);
+
+#define	LEFT		r[0]
+#define	RIGHT		r[1]
+#define	PREV		r[2]
+
+	/*
+	 * Set up a bunch of variables to make the tests simpler.
+	 */
+	error = 0;
+	ep = xfs_iext_get_ext(ifp, *idx);
+	xfs_bmbt_get_all(ep, &PREV);
+	newext = new->br_state;
+	oldext = (newext == XFS_EXT_UNWRITTEN) ?
+		XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+	ASSERT(PREV.br_state == oldext);
+	new_endoff = new->br_startoff + new->br_blockcount;
+	ASSERT(PREV.br_startoff <= new->br_startoff);
+	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+	/*
+	 * Set flags determining what part of the previous oldext allocation
+	 * extent is being replaced by a newext allocation.
+	 */
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 */
+	if (*idx > 0) {
+		state |= BMAP_LEFT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
+	}
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == newext &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	/*
+	 * Check and set flags if this segment has a right neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 * Also check for all-three-contiguous being too large.
+	 */
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
+	}
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    newext == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+			LEFT.br_blockcount + PREV.br_blockcount +
+			RIGHT.br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, *idx + 1, 2, state);
+		ip->i_d.di_nextents -= 2;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_btree_delete(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_btree_delete(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + PREV.br_blockcount +
+				RIGHT.br_blockcount, LEFT.br_state)))
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+			LEFT.br_blockcount + PREV.br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, *idx + 1, 1, state);
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_btree_delete(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + PREV.br_blockcount,
+				LEFT.br_state)))
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount + RIGHT.br_blockcount);
+		xfs_bmbt_set_state(ep, newext);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_btree_delete(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + RIGHT.br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_state(ep, newext);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock, new->br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is contiguous.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+			LEFT.br_blockcount + new->br_blockcount);
+		xfs_bmbt_set_startoff(ep,
+			PREV.br_startoff + new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep,
+			new->br_startblock + new->br_blockcount);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		--*idx;
+
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_bmbt_update(cur,
+				PREV.br_startoff + new->br_blockcount,
+				PREV.br_startblock + new->br_blockcount,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + new->br_blockcount,
+				LEFT.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING:
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is not contiguous.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
+		xfs_bmbt_set_startoff(ep, new_endoff);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmbt_set_startblock(ep,
+			new->br_startblock + new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_insert(ip, *idx, 1, new, state);
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_bmbt_update(cur,
+				PREV.br_startoff + new->br_blockcount,
+				PREV.br_startblock + new->br_blockcount,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			cur->bc_rec.b = *new;
+			if ((error = xfs_btree_insert(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+		break;
+
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+			new->br_startoff, new->br_startblock,
+			new->br_blockcount + RIGHT.br_blockcount, newext);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock,
+					PREV.br_blockcount, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+				PREV.br_startblock,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_btree_increment(cur, 0, &i)))
+				goto done;
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + RIGHT.br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case BMAP_RIGHT_FILLING:
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is not contiguous.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
+
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+				PREV.br_startblock,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_btree_insert(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+		break;
+
+	case 0:
+		/*
+		 * Setting the middle part of a previous oldext extent to
+		 * newext.  Contiguity is impossible here.
+		 * One extent becomes three extents.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep,
+			new->br_startoff - PREV.br_startoff);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		r[0] = *new;
+		r[1].br_startoff = new_endoff;
+		r[1].br_blockcount =
+			PREV.br_startoff + PREV.br_blockcount - new_endoff;
+		r[1].br_startblock = new->br_startblock + new->br_blockcount;
+		r[1].br_state = oldext;
+
+		++*idx;
+		xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
+		ip->i_d.di_nextents += 2;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			/* new right extent - oldext */
+			if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
+				r[1].br_startblock, r[1].br_blockcount,
+				r[1].br_state)))
+				goto done;
+			/* new left extent - oldext */
+			cur->bc_rec.b = PREV;
+			cur->bc_rec.b.br_blockcount =
+				new->br_startoff - PREV.br_startoff;
+			if ((error = xfs_btree_insert(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			/*
+			 * Reset the cursor to the position of the new extent
+			 * we are about to insert as we can't trust it after
+			 * the previous insert.
+			 */
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+			/* new middle extent - newext */
+			cur->bc_rec.b.br_state = new->br_state;
+			if ((error = xfs_btree_insert(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
+		/*
+		 * These cases are all impossible.
+		 */
+		ASSERT(0);
+	}
+
+	/* convert to a btree if necessary */
+	if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+				0, &tmp_logflags, XFS_DATA_FORK);
+		*logflagsp |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		*curp = cur;
+	}
+
+	xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
+done:
+	*logflagsp |= rval;
+	return error;
+#undef	LEFT
+#undef	RIGHT
+#undef	PREV
+}
+
+/*
+ * Convert a hole to a delayed allocation.
+ */
+STATIC void
+xfs_bmap_add_extent_hole_delay(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
+	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */
+{
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
+	xfs_filblks_t		newlen=0;	/* new indirect size */
+	xfs_filblks_t		oldlen=0;	/* old indirect size */
+	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			state;  /* state bits, accessed thru macros */
+	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
+
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	state = 0;
+	ASSERT(isnullstartblock(new->br_startblock));
+
+	/*
+	 * Check and set flags if this segment has a left neighbor
+	 */
+	if (*idx > 0) {
+		state |= BMAP_LEFT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+
+		if (isnullstartblock(left.br_startblock))
+			state |= BMAP_LEFT_DELAY;
+	}
+
+	/*
+	 * Check and set flags if the current (right) segment exists.
+	 * If it doesn't exist, we're converting the hole at end-of-file.
+	 */
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+		state |= BMAP_RIGHT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+
+		if (isnullstartblock(right.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
+	}
+
+	/*
+	 * Set contiguity flags on the left and right neighbors.
+	 * Don't let extents get too large, even if the pieces are contiguous.
+	 */
+	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+	    left.br_startoff + left.br_blockcount == new->br_startoff &&
+	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+	    new->br_startoff + new->br_blockcount == right.br_startoff &&
+	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+	    (!(state & BMAP_LEFT_CONTIG) ||
+	     (left.br_blockcount + new->br_blockcount +
+	      right.br_blockcount <= MAXEXTLEN)))
+		state |= BMAP_RIGHT_CONTIG;
+
+	/*
+	 * Switch out based on the contiguity flags.
+	 */
+	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with delayed allocations
+		 * on the left and on the right.
+		 * Merge all three into a single extent record.
+		 */
+		--*idx;
+		temp = left.br_blockcount + new->br_blockcount +
+			right.br_blockcount;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+		oldlen = startblockval(left.br_startblock) +
+			startblockval(new->br_startblock) +
+			startblockval(right.br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+			nullstartblock((int)newlen));
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, *idx + 1, 1, state);
+		break;
+
+	case BMAP_LEFT_CONTIG:
+		/*
+		 * New allocation is contiguous with a delayed allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		--*idx;
+		temp = left.br_blockcount + new->br_blockcount;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+		oldlen = startblockval(left.br_startblock) +
+			startblockval(new->br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+			nullstartblock((int)newlen));
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		break;
+
+	case BMAP_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with a delayed allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		temp = new->br_blockcount + right.br_blockcount;
+		oldlen = startblockval(new->br_startblock) +
+			startblockval(right.br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+			new->br_startoff,
+			nullstartblock((int)newlen), temp, right.br_state);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		break;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * delayed allocation.
+		 * Insert a new entry.
+		 */
+		oldlen = newlen = 0;
+		xfs_iext_insert(ip, *idx, 1, new, state);
+		break;
+	}
+	if (oldlen != newlen) {
+		ASSERT(oldlen > newlen);
+		xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
+				 false);
+		/*
+		 * Nothing to do for disk quota accounting here.
+		 */
+	}
+}
+
+/*
+ * Convert a hole to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_real(
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	*new = &bma->got;
+	int			error;	/* error return value */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
+	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			rval=0;	/* return value (logging flags) */
+	int			state;	/* state bits, accessed thru macros */
+	struct xfs_mount	*mp;
+
+	mp = bma->tp ? bma->tp->t_mountp : NULL;
+	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+
+	ASSERT(bma->idx >= 0);
+	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+	ASSERT(!bma->cur ||
+	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+	XFS_STATS_INC(xs_add_exlist);
+
+	state = 0;
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 */
+	if (bma->idx > 0) {
+		state |= BMAP_LEFT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+		if (isnullstartblock(left.br_startblock))
+			state |= BMAP_LEFT_DELAY;
+	}
+
+	/*
+	 * Check and set flags if this segment has a current value.
+	 * Not true if we're inserting into the "hole" at eof.
+	 */
+	if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+		state |= BMAP_RIGHT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+		if (isnullstartblock(right.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
+	}
+
+	/*
+	 * We're inserting a real allocation between "left" and "right".
+	 * Set the contiguity flags.  Don't let extents get too large.
+	 */
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    left.br_startoff + left.br_blockcount == new->br_startoff &&
+	    left.br_startblock + left.br_blockcount == new->br_startblock &&
+	    left.br_state == new->br_state &&
+	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new->br_startoff + new->br_blockcount == right.br_startoff &&
+	    new->br_startblock + new->br_blockcount == right.br_startblock &&
+	    new->br_state == right.br_state &&
+	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+	    (!(state & BMAP_LEFT_CONTIG) ||
+	     left.br_blockcount + new->br_blockcount +
+	     right.br_blockcount <= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
+	error = 0;
+	/*
+	 * Select which case we're in here, and implement it.
+	 */
+	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with real allocations on the
+		 * left and on the right.
+		 * Merge all three into a single extent record.
+		 */
+		--bma->idx;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+			left.br_blockcount + new->br_blockcount +
+			right.br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+
+		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+			XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
+		if (bma->cur == NULL) {
+			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+		} else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+					right.br_startblock, right.br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_btree_delete(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_btree_decrement(bma->cur, 0, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_bmbt_update(bma->cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+						new->br_blockcount +
+						right.br_blockcount,
+					left.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_CONTIG:
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		--bma->idx;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+			left.br_blockcount + new->br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		if (bma->cur == NULL) {
+			rval = xfs_ilog_fext(whichfork);
+		} else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+					left.br_startblock, left.br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_bmbt_update(bma->cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+						new->br_blockcount,
+					left.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+			new->br_startoff, new->br_startblock,
+			new->br_blockcount + right.br_blockcount,
+			right.br_state);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		if (bma->cur == NULL) {
+			rval = xfs_ilog_fext(whichfork);
+		} else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur,
+					right.br_startoff,
+					right.br_startblock,
+					right.br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			error = xfs_bmbt_update(bma->cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+						right.br_blockcount,
+					right.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * real allocation.
+		 * Insert a new entry.
+		 */
+		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+			XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
+		if (bma->cur == NULL) {
+			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+		} else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur,
+					new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+			bma->cur->bc_rec.b.br_state = new->br_state;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+		break;
+	}
+
+	/* convert to a btree if necessary */
+	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(bma->cur == NULL);
+		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur,
+				0, &tmp_logflags, whichfork);
+		bma->logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (bma->cur)
+		bma->cur->bc_private.b.allocated = 0;
+
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+done:
+	bma->logflags |= rval;
+	return error;
+}
+
+/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
+ * Adjust the size of the new extent based on di_extsize and rt extsize.
+ */
+int
+xfs_bmap_extsize_align(
+	xfs_mount_t	*mp,
+	xfs_bmbt_irec_t	*gotp,		/* next extent pointer */
+	xfs_bmbt_irec_t	*prevp,		/* previous extent pointer */
+	xfs_extlen_t	extsz,		/* align to this extent size */
+	int		rt,		/* is this a realtime inode? */
+	int		eof,		/* is extent at end-of-file? */
+	int		delay,		/* creating delalloc extent? */
+	int		convert,	/* overwriting unwritten extent? */
+	xfs_fileoff_t	*offp,		/* in/out: aligned offset */
+	xfs_extlen_t	*lenp)		/* in/out: aligned length */
+{
+	xfs_fileoff_t	orig_off;	/* original offset */
+	xfs_extlen_t	orig_alen;	/* original length */
+	xfs_fileoff_t	orig_end;	/* original off+len */
+	xfs_fileoff_t	nexto;		/* next file offset */
+	xfs_fileoff_t	prevo;		/* previous file offset */
+	xfs_fileoff_t	align_off;	/* temp for offset */
+	xfs_extlen_t	align_alen;	/* temp for length */
+	xfs_extlen_t	temp;		/* temp for calculations */
+
+	if (convert)
+		return 0;
+
+	orig_off = align_off = *offp;
+	orig_alen = align_alen = *lenp;
+	orig_end = orig_off + orig_alen;
+
+	/*
+	 * If this request overlaps an existing extent, then don't
+	 * attempt to perform any additional alignment.
+	 */
+	if (!delay && !eof &&
+	    (orig_off >= gotp->br_startoff) &&
+	    (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
+		return 0;
+	}
+
+	/*
+	 * If the file offset is unaligned vs. the extent size
+	 * we need to align it.  This will be possible unless
+	 * the file was previously written with a kernel that didn't
+	 * perform this alignment, or if a truncate shot us in the
+	 * foot.
+	 */
+	temp = do_mod(orig_off, extsz);
+	if (temp) {
+		align_alen += temp;
+		align_off -= temp;
+	}
+
+	/* Same adjustment for the end of the requested area. */
+	temp = (align_alen % extsz);
+	if (temp)
+		align_alen += extsz - temp;
+
+	/*
+	 * For large extent hint sizes, the aligned extent might be larger than
+	 * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
+	 * the length back under MAXEXTLEN. The outer allocation loops handle
+	 * short allocation just fine, so it is safe to do this. We only want to
+	 * do it when we are forced to, though, because it means more allocation
+	 * operations are required.
+	 */
+	while (align_alen > MAXEXTLEN)
+		align_alen -= extsz;
+	ASSERT(align_alen <= MAXEXTLEN);
+
+	/*
+	 * If the previous block overlaps with this proposed allocation
+	 * then move the start forward without adjusting the length.
+	 */
+	if (prevp->br_startoff != NULLFILEOFF) {
+		if (prevp->br_startblock == HOLESTARTBLOCK)
+			prevo = prevp->br_startoff;
+		else
+			prevo = prevp->br_startoff + prevp->br_blockcount;
+	} else
+		prevo = 0;
+	if (align_off != orig_off && align_off < prevo)
+		align_off = prevo;
+	/*
+	 * If the next block overlaps with this proposed allocation
+	 * then move the start back without adjusting the length,
+	 * but not before offset 0.
+	 * This may of course make the start overlap previous block,
+	 * and if we hit the offset 0 limit then the next block
+	 * can still overlap too.
+	 */
+	if (!eof && gotp->br_startoff != NULLFILEOFF) {
+		if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
+		    (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
+			nexto = gotp->br_startoff + gotp->br_blockcount;
+		else
+			nexto = gotp->br_startoff;
+	} else
+		nexto = NULLFILEOFF;
+	if (!eof &&
+	    align_off + align_alen != orig_end &&
+	    align_off + align_alen > nexto)
+		align_off = nexto > align_alen ? nexto - align_alen : 0;
+	/*
+	 * If we're now overlapping the next or previous extent that
+	 * means we can't fit an extsz piece in this hole.  Just move
+	 * the start forward to the first valid spot and set
+	 * the length so we hit the end.
+	 */
+	if (align_off != orig_off && align_off < prevo)
+		align_off = prevo;
+	if (align_off + align_alen != orig_end &&
+	    align_off + align_alen > nexto &&
+	    nexto != NULLFILEOFF) {
+		ASSERT(nexto > prevo);
+		align_alen = nexto - align_off;
+	}
+
+	/*
+	 * If realtime, and the result isn't a multiple of the realtime
+	 * extent size we need to remove blocks until it is.
+	 */
+	if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+		/*
+		 * We're not covering the original request, or
+		 * we won't be able to once we fix the length.
+		 */
+		if (orig_off < align_off ||
+		    orig_end > align_off + align_alen ||
+		    align_alen - temp < orig_alen)
+			return -EINVAL;
+		/*
+		 * Try to fix it by moving the start up.
+		 */
+		if (align_off + temp <= orig_off) {
+			align_alen -= temp;
+			align_off += temp;
+		}
+		/*
+		 * Try to fix it by moving the end in.
+		 */
+		else if (align_off + align_alen - temp >= orig_end)
+			align_alen -= temp;
+		/*
+		 * Set the start to the minimum then trim the length.
+		 */
+		else {
+			align_alen -= orig_off - align_off;
+			align_off = orig_off;
+			align_alen -= align_alen % mp->m_sb.sb_rextsize;
+		}
+		/*
+		 * Result doesn't cover the request, fail it.
+		 */
+		if (orig_off < align_off || orig_end > align_off + align_alen)
+			return -EINVAL;
+	} else {
+		ASSERT(orig_off >= align_off);
+		/* see MAXEXTLEN handling above */
+		ASSERT(orig_end <= align_off + align_alen ||
+		       align_alen + extsz > MAXEXTLEN);
+	}
+
+#ifdef DEBUG
+	if (!eof && gotp->br_startoff != NULLFILEOFF)
+		ASSERT(align_off + align_alen <= gotp->br_startoff);
+	if (prevp->br_startoff != NULLFILEOFF)
+		ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
+#endif
+
+	*lenp = align_alen;
+	*offp = align_off;
+	return 0;
+}
+
+#define XFS_ALLOC_GAP_UNITS	4
+
+void
+xfs_bmap_adjacent(
+	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
+{
+	xfs_fsblock_t	adjust;		/* adjustment to block numbers */
+	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
+	xfs_mount_t	*mp;		/* mount point structure */
+	int		nullfb;		/* true if ap->firstblock isn't set */
+	int		rt;		/* true if inode is realtime */
+
+#define	ISVALID(x,y)	\
+	(rt ? \
+		(x) < mp->m_sb.sb_rblocks : \
+		XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+		XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+		XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
+
+	mp = ap->ip->i_mount;
+	nullfb = *ap->firstblock == NULLFSBLOCK;
+	rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+	/*
+	 * If allocating at eof, and there's a previous real block,
+	 * try to use its last block as our starting point.
+	 */
+	if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+	    !isnullstartblock(ap->prev.br_startblock) &&
+	    ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+		    ap->prev.br_startblock)) {
+		ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
+		/*
+		 * Adjust for the gap between prevp and us.
+		 */
+		adjust = ap->offset -
+			(ap->prev.br_startoff + ap->prev.br_blockcount);
+		if (adjust &&
+		    ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+			ap->blkno += adjust;
+	}
+	/*
+	 * If not at eof, then compare the two neighbor blocks.
+	 * Figure out whether either one gives us a good starting point,
+	 * and pick the better one.
+	 */
+	else if (!ap->eof) {
+		xfs_fsblock_t	gotbno;		/* right side block number */
+		xfs_fsblock_t	gotdiff=0;	/* right side difference */
+		xfs_fsblock_t	prevbno;	/* left side block number */
+		xfs_fsblock_t	prevdiff=0;	/* left side difference */
+
+		/*
+		 * If there's a previous (left) block, select a requested
+		 * start block based on it.
+		 */
+		if (ap->prev.br_startoff != NULLFILEOFF &&
+		    !isnullstartblock(ap->prev.br_startblock) &&
+		    (prevbno = ap->prev.br_startblock +
+			       ap->prev.br_blockcount) &&
+		    ISVALID(prevbno, ap->prev.br_startblock)) {
+			/*
+			 * Calculate gap to end of previous block.
+			 */
+			adjust = prevdiff = ap->offset -
+				(ap->prev.br_startoff +
+				 ap->prev.br_blockcount);
+			/*
+			 * Figure the startblock based on the previous block's
+			 * end and the gap size.
+			 * Heuristic!
+			 * If the gap is large relative to the piece we're
+			 * allocating, or using it gives us an invalid block
+			 * number, then just use the end of the previous block.
+			 */
+			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+			    ISVALID(prevbno + prevdiff,
+				    ap->prev.br_startblock))
+				prevbno += adjust;
+			else
+				prevdiff += adjust;
+			/*
+			 * If the firstblock forbids it, can't use it,
+			 * must use default.
+			 */
+			if (!rt && !nullfb &&
+			    XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+				prevbno = NULLFSBLOCK;
+		}
+		/*
+		 * No previous block or can't follow it, just default.
+		 */
+		else
+			prevbno = NULLFSBLOCK;
+		/*
+		 * If there's a following (right) block, select a requested
+		 * start block based on it.
+		 */
+		if (!isnullstartblock(ap->got.br_startblock)) {
+			/*
+			 * Calculate gap to start of next block.
+			 */
+			adjust = gotdiff = ap->got.br_startoff - ap->offset;
+			/*
+			 * Figure the startblock based on the next block's
+			 * start and the gap size.
+			 */
+			gotbno = ap->got.br_startblock;
+			/*
+			 * Heuristic!
+			 * If the gap is large relative to the piece we're
+			 * allocating, or using it gives us an invalid block
+			 * number, then just use the start of the next block
+			 * offset by our length.
+			 */
+			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+			    ISVALID(gotbno - gotdiff, gotbno))
+				gotbno -= adjust;
+			else if (ISVALID(gotbno - ap->length, gotbno)) {
+				gotbno -= ap->length;
+				gotdiff += adjust - ap->length;
+			} else
+				gotdiff += adjust;
+			/*
+			 * If the firstblock forbids it, can't use it,
+			 * must use default.
+			 */
+			if (!rt && !nullfb &&
+			    XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+				gotbno = NULLFSBLOCK;
+		}
+		/*
+		 * No next block, just default.
+		 */
+		else
+			gotbno = NULLFSBLOCK;
+		/*
+		 * If both valid, pick the better one, else the only good
+		 * one, else ap->blkno is already set (to 0 or the inode block).
+		 */
+		if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+			ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
+		else if (prevbno != NULLFSBLOCK)
+			ap->blkno = prevbno;
+		else if (gotbno != NULLFSBLOCK)
+			ap->blkno = gotbno;
+	}
+#undef ISVALID
+}
+
+static int
+xfs_bmap_longest_free_extent(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		ag,
+	xfs_extlen_t		*blen,
+	int			*notinit)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_perag	*pag;
+	xfs_extlen_t		longest;
+	int			error = 0;
+
+	pag = xfs_perag_get(mp, ag);
+	if (!pag->pagf_init) {
+		error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+		if (error)
+			goto out;
+
+		if (!pag->pagf_init) {
+			*notinit = 1;
+			goto out;
+		}
+	}
+
+	longest = xfs_alloc_longest_free_extent(mp, pag);
+	if (*blen < longest)
+		*blen = longest;
+
+out:
+	xfs_perag_put(pag);
+	return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+	struct xfs_bmalloca	*ap,
+	struct xfs_alloc_arg	*args,
+	xfs_extlen_t		*blen,
+	int			notinit)
+{
+	if (notinit || *blen < ap->minlen) {
+		/*
+		 * Since we did a BUF_TRYLOCK above, it is possible that
+		 * there is space for this request.
+		 */
+		args->minlen = ap->minlen;
+	} else if (*blen < args->maxlen) {
+		/*
+		 * If the best seen length is less than the request length,
+		 * use the best as the minimum.
+		 */
+		args->minlen = *blen;
+	} else {
+		/*
+		 * Otherwise we've seen an extent as big as maxlen, use that
+		 * as the minimum.
+		 */
+		args->minlen = args->maxlen;
+	}
+}
+
+STATIC int
+xfs_bmap_btalloc_nullfb(
+	struct xfs_bmalloca	*ap,
+	struct xfs_alloc_arg	*args,
+	xfs_extlen_t		*blen)
+{
+	struct xfs_mount	*mp = ap->ip->i_mount;
+	xfs_agnumber_t		ag, startag;
+	int			notinit = 0;
+	int			error;
+
+	args->type = XFS_ALLOCTYPE_START_BNO;
+	args->total = ap->total;
+
+	startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+	if (startag == NULLAGNUMBER)
+		startag = ag = 0;
+
+	while (*blen < args->maxlen) {
+		error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+						     &notinit);
+		if (error)
+			return error;
+
+		if (++ag == mp->m_sb.sb_agcount)
+			ag = 0;
+		if (ag == startag)
+			break;
+	}
+
+	xfs_bmap_select_minlen(ap, args, blen, notinit);
+	return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+	struct xfs_bmalloca	*ap,
+	struct xfs_alloc_arg	*args,
+	xfs_extlen_t		*blen)
+{
+	struct xfs_mount	*mp = ap->ip->i_mount;
+	xfs_agnumber_t		ag;
+	int			notinit = 0;
+	int			error;
+
+	args->type = XFS_ALLOCTYPE_NEAR_BNO;
+	args->total = ap->total;
+
+	ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+	if (ag == NULLAGNUMBER)
+		ag = 0;
+
+	error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+	if (error)
+		return error;
+
+	if (*blen < args->maxlen) {
+		error = xfs_filestream_new_ag(ap, &ag);
+		if (error)
+			return error;
+
+		error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+						     &notinit);
+		if (error)
+			return error;
+
+	}
+
+	xfs_bmap_select_minlen(ap, args, blen, notinit);
+
+	/*
+	 * Set the failure fallback case to look in the selected AG as stream
+	 * may have moved.
+	 */
+	ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+	return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc(
+	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
+{
+	xfs_mount_t	*mp;		/* mount point structure */
+	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
+	xfs_extlen_t	align;		/* minimum allocation alignment */
+	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
+	xfs_agnumber_t	ag;
+	xfs_alloc_arg_t	args;
+	xfs_extlen_t	blen;
+	xfs_extlen_t	nextminlen = 0;
+	int		nullfb;		/* true if ap->firstblock isn't set */
+	int		isaligned;
+	int		tryagain;
+	int		error;
+	int		stripe_align;
+
+	ASSERT(ap->length);
+
+	mp = ap->ip->i_mount;
+
+	/* stripe alignment for allocation is determined by mount parameters */
+	stripe_align = 0;
+	if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+		stripe_align = mp->m_swidth;
+	else if (mp->m_dalign)
+		stripe_align = mp->m_dalign;
+
+	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+	if (unlikely(align)) {
+		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+						align, 0, ap->eof, 0, ap->conv,
+						&ap->offset, &ap->length);
+		ASSERT(!error);
+		ASSERT(ap->length);
+	}
+
+
+	nullfb = *ap->firstblock == NULLFSBLOCK;
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+	if (nullfb) {
+		if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+			ag = xfs_filestream_lookup_ag(ap->ip);
+			ag = (ag != NULLAGNUMBER) ? ag : 0;
+			ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
+		} else {
+			ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+		}
+	} else
+		ap->blkno = *ap->firstblock;
+
+	xfs_bmap_adjacent(ap);
+
+	/*
+	 * If allowed, use ap->blkno; otherwise must use firstblock since
+	 * it's in the right allocation group.
+	 */
+	if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
+		;
+	else
+		ap->blkno = *ap->firstblock;
+	/*
+	 * Normal allocation, done through xfs_alloc_vextent.
+	 */
+	tryagain = isaligned = 0;
+	memset(&args, 0, sizeof(args));
+	args.tp = ap->tp;
+	args.mp = mp;
+	args.fsbno = ap->blkno;
+
+	/* Trim the allocation back to the maximum an AG can fit. */
+	args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+	args.firstblock = *ap->firstblock;
+	blen = 0;
+	if (nullfb) {
+		/*
+		 * Search for an allocation group with a single extent large
+		 * enough for the request.  If one isn't found, then adjust
+		 * the minimum allocation size to the largest space found.
+		 */
+		if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+			error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+		else
+			error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+		if (error)
+			return error;
+	} else if (ap->flist->xbf_low) {
+		if (xfs_inode_is_filestream(ap->ip))
+			args.type = XFS_ALLOCTYPE_FIRST_AG;
+		else
+			args.type = XFS_ALLOCTYPE_START_BNO;
+		args.total = args.minlen = ap->minlen;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.total = ap->total;
+		args.minlen = ap->minlen;
+	}
+	/* apply extent size hints if obtained earlier */
+	if (unlikely(align)) {
+		args.prod = align;
+		if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
+			args.mod = (xfs_extlen_t)(args.prod - args.mod);
+	} else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+		args.prod = 1;
+		args.mod = 0;
+	} else {
+		args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+		if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
+			args.mod = (xfs_extlen_t)(args.prod - args.mod);
+	}
+	/*
+	 * If we are not low on available data blocks, and the
+	 * underlying logical volume manager is a stripe, and
+	 * the file offset is zero then try to allocate data
+	 * blocks on stripe unit boundary.
+	 * NOTE: ap->aeof is only set if the allocation length
+	 * is >= the stripe unit and the allocation offset is
+	 * at the end of file.
+	 */
+	if (!ap->flist->xbf_low && ap->aeof) {
+		if (!ap->offset) {
+			args.alignment = stripe_align;
+			atype = args.type;
+			isaligned = 1;
+			/*
+			 * Adjust for alignment
+			 */
+			if (blen > args.alignment && blen <= args.maxlen)
+				args.minlen = blen - args.alignment;
+			args.minalignslop = 0;
+		} else {
+			/*
+			 * First try an exact bno allocation.
+			 * If it fails then do a near or start bno
+			 * allocation with alignment turned on.
+			 */
+			atype = args.type;
+			tryagain = 1;
+			args.type = XFS_ALLOCTYPE_THIS_BNO;
+			args.alignment = 1;
+			/*
+			 * Compute the minlen+alignment for the
+			 * next case.  Set slop so that the value
+			 * of minlen+alignment+slop doesn't go up
+			 * between the calls.
+			 */
+			if (blen > stripe_align && blen <= args.maxlen)
+				nextminlen = blen - stripe_align;
+			else
+				nextminlen = args.minlen;
+			if (nextminlen + stripe_align > args.minlen + 1)
+				args.minalignslop =
+					nextminlen + stripe_align -
+					args.minlen - 1;
+			else
+				args.minalignslop = 0;
+		}
+	} else {
+		args.alignment = 1;
+		args.minalignslop = 0;
+	}
+	args.minleft = ap->minleft;
+	args.wasdel = ap->wasdel;
+	args.isfl = 0;
+	args.userdata = ap->userdata;
+	if ((error = xfs_alloc_vextent(&args)))
+		return error;
+	if (tryagain && args.fsbno == NULLFSBLOCK) {
+		/*
+		 * Exact allocation failed. Now try with alignment
+		 * turned on.
+		 */
+		args.type = atype;
+		args.fsbno = ap->blkno;
+		args.alignment = stripe_align;
+		args.minlen = nextminlen;
+		args.minalignslop = 0;
+		isaligned = 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+	if (isaligned && args.fsbno == NULLFSBLOCK) {
+		/*
+		 * allocation failed, so turn off alignment and
+		 * try again.
+		 */
+		args.type = atype;
+		args.fsbno = ap->blkno;
+		args.alignment = 0;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+	if (args.fsbno == NULLFSBLOCK && nullfb &&
+	    args.minlen > ap->minlen) {
+		args.minlen = ap->minlen;
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = ap->blkno;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+	if (args.fsbno == NULLFSBLOCK && nullfb) {
+		args.fsbno = 0;
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+		args.total = ap->minlen;
+		args.minleft = 0;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+		ap->flist->xbf_low = 1;
+	}
+	if (args.fsbno != NULLFSBLOCK) {
+		/*
+		 * check the allocation happened at the same or higher AG than
+		 * the first block that was allocated.
+		 */
+		ASSERT(*ap->firstblock == NULLFSBLOCK ||
+		       XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+		       XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+		       (ap->flist->xbf_low &&
+			XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+			XFS_FSB_TO_AGNO(mp, args.fsbno)));
+
+		ap->blkno = args.fsbno;
+		if (*ap->firstblock == NULLFSBLOCK)
+			*ap->firstblock = args.fsbno;
+		ASSERT(nullfb || fb_agno == args.agno ||
+		       (ap->flist->xbf_low && fb_agno < args.agno));
+		ap->length = args.len;
+		ap->ip->i_d.di_nblocks += args.len;
+		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+		if (ap->wasdel)
+			ap->ip->i_delayed_blks -= args.len;
+		/*
+		 * Adjust the disk quota also. This was reserved
+		 * earlier.
+		 */
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+			ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
+					XFS_TRANS_DQ_BCOUNT,
+			(long) args.len);
+	} else {
+		ap->blkno = NULLFSBLOCK;
+		ap->length = 0;
+	}
+	return 0;
+}
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int
+xfs_bmap_alloc(
+	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
+{
+	if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+		return xfs_bmap_rtalloc(ap);
+	return xfs_bmap_btalloc(ap);
+}
+
+/*
+ * Trim the returned map to the required bounds
+ */
+STATIC void
+xfs_bmapi_trim_map(
+	struct xfs_bmbt_irec	*mval,
+	struct xfs_bmbt_irec	*got,
+	xfs_fileoff_t		*bno,
+	xfs_filblks_t		len,
+	xfs_fileoff_t		obno,
+	xfs_fileoff_t		end,
+	int			n,
+	int			flags)
+{
+	if ((flags & XFS_BMAPI_ENTIRE) ||
+	    got->br_startoff + got->br_blockcount <= obno) {
+		*mval = *got;
+		if (isnullstartblock(got->br_startblock))
+			mval->br_startblock = DELAYSTARTBLOCK;
+		return;
+	}
+
+	if (obno > *bno)
+		*bno = obno;
+	ASSERT((*bno >= obno) || (n == 0));
+	ASSERT(*bno < end);
+	mval->br_startoff = *bno;
+	if (isnullstartblock(got->br_startblock))
+		mval->br_startblock = DELAYSTARTBLOCK;
+	else
+		mval->br_startblock = got->br_startblock +
+					(*bno - got->br_startoff);
+	/*
+	 * Return the minimum of what we got and what we asked for for
+	 * the length.  We can use the len variable here because it is
+	 * modified below and we could have been there before coming
+	 * here if the first part of the allocation didn't overlap what
+	 * was asked for.
+	 */
+	mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+			got->br_blockcount - (*bno - got->br_startoff));
+	mval->br_state = got->br_state;
+	ASSERT(mval->br_blockcount <= len);
+	return;
+}
+
+/*
+ * Update and validate the extent map to return
+ */
+STATIC void
+xfs_bmapi_update_map(
+	struct xfs_bmbt_irec	**map,
+	xfs_fileoff_t		*bno,
+	xfs_filblks_t		*len,
+	xfs_fileoff_t		obno,
+	xfs_fileoff_t		end,
+	int			*n,
+	int			flags)
+{
+	xfs_bmbt_irec_t	*mval = *map;
+
+	ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+	       ((mval->br_startoff + mval->br_blockcount) <= end));
+	ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
+	       (mval->br_startoff < obno));
+
+	*bno = mval->br_startoff + mval->br_blockcount;
+	*len = end - *bno;
+	if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+		/* update previous map with new information */
+		ASSERT(mval->br_startblock == mval[-1].br_startblock);
+		ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+		ASSERT(mval->br_state == mval[-1].br_state);
+		mval[-1].br_blockcount = mval->br_blockcount;
+		mval[-1].br_state = mval->br_state;
+	} else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock != DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock != HOLESTARTBLOCK &&
+		   mval->br_startblock == mval[-1].br_startblock +
+					  mval[-1].br_blockcount &&
+		   ((flags & XFS_BMAPI_IGSTATE) ||
+			mval[-1].br_state == mval->br_state)) {
+		ASSERT(mval->br_startoff ==
+		       mval[-1].br_startoff + mval[-1].br_blockcount);
+		mval[-1].br_blockcount += mval->br_blockcount;
+	} else if (*n > 0 &&
+		   mval->br_startblock == DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock == DELAYSTARTBLOCK &&
+		   mval->br_startoff ==
+		   mval[-1].br_startoff + mval[-1].br_blockcount) {
+		mval[-1].br_blockcount += mval->br_blockcount;
+		mval[-1].br_state = mval->br_state;
+	} else if (!((*n == 0) &&
+		     ((mval->br_startoff + mval->br_blockcount) <=
+		      obno))) {
+		mval++;
+		(*n)++;
+	}
+	*map = mval;
+}
+
+/*
+ * Map file blocks to filesystem blocks without allocation.
+ */
+int
+xfs_bmapi_read(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	struct xfs_bmbt_irec	*mval,
+	int			*nmap,
+	int			flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp;
+	struct xfs_bmbt_irec	got;
+	struct xfs_bmbt_irec	prev;
+	xfs_fileoff_t		obno;
+	xfs_fileoff_t		end;
+	xfs_extnum_t		lastx;
+	int			error;
+	int			eof;
+	int			n = 0;
+	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+
+	ASSERT(*nmap >= 1);
+	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
+			   XFS_BMAPI_IGSTATE)));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	XFS_STATS_INC(xs_blk_mapr);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+	end = bno + len;
+	obno = bno;
+
+	while (bno < end && n < *nmap) {
+		/* Reading past eof, act as though there's a hole up to end. */
+		if (eof)
+			got.br_startoff = end;
+		if (got.br_startoff > bno) {
+			/* Reading in a hole.  */
+			mval->br_startoff = bno;
+			mval->br_startblock = HOLESTARTBLOCK;
+			mval->br_blockcount =
+				XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+			mval->br_state = XFS_EXT_NORM;
+			bno += mval->br_blockcount;
+			len -= mval->br_blockcount;
+			mval++;
+			n++;
+			continue;
+		}
+
+		/* set up the extent map to return. */
+		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+		/* If we're done, stop now. */
+		if (bno >= end || n >= *nmap)
+			break;
+
+		/* Else go on to the next record. */
+		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+		else
+			eof = 1;
+	}
+	*nmap = n;
+	return 0;
+}
+
+STATIC int
+xfs_bmapi_reserve_delalloc(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		aoff,
+	xfs_filblks_t		len,
+	struct xfs_bmbt_irec	*got,
+	struct xfs_bmbt_irec	*prev,
+	xfs_extnum_t		*lastx,
+	int			eof)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	xfs_extlen_t		alen;
+	xfs_extlen_t		indlen;
+	char			rt = XFS_IS_REALTIME_INODE(ip);
+	xfs_extlen_t		extsz;
+	int			error;
+
+	alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+	if (!eof)
+		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+
+	/* Figure out the extent size, adjust alen */
+	extsz = xfs_get_extsz_hint(ip);
+	if (extsz) {
+		error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+					       1, 0, &aoff, &alen);
+		ASSERT(!error);
+	}
+
+	if (rt)
+		extsz = alen / mp->m_sb.sb_rextsize;
+
+	/*
+	 * Make a transaction-less quota reservation for delayed allocation
+	 * blocks.  This number gets adjusted later.  We return if we haven't
+	 * allocated blocks already inside this loop.
+	 */
+	error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
+			rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		return error;
+
+	/*
+	 * Split changing sb for alen and indlen since they could be coming
+	 * from different places.
+	 */
+	indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+	ASSERT(indlen > 0);
+
+	if (rt) {
+		error = xfs_mod_frextents(mp, -((int64_t)extsz));
+	} else {
+		error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
+	}
+
+	if (error)
+		goto out_unreserve_quota;
+
+	error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
+	if (error)
+		goto out_unreserve_blocks;
+
+
+	ip->i_delayed_blks += alen;
+
+	got->br_startoff = aoff;
+	got->br_startblock = nullstartblock(indlen);
+	got->br_blockcount = alen;
+	got->br_state = XFS_EXT_NORM;
+	xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+
+	/*
+	 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+	 * might have merged it into one of the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+
+	ASSERT(got->br_startoff <= aoff);
+	ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+	ASSERT(isnullstartblock(got->br_startblock));
+	ASSERT(got->br_state == XFS_EXT_NORM);
+	return 0;
+
+out_unreserve_blocks:
+	if (rt)
+		xfs_mod_frextents(mp, extsz);
+	else
+		xfs_mod_fdblocks(mp, alen, false);
+out_unreserve_quota:
+	if (XFS_IS_QUOTA_ON(mp))
+		xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
+				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+	return error;
+}
+
+/*
+ * Map file blocks to filesystem blocks, adding delayed allocations as needed.
+ */
+int
+xfs_bmapi_delay(
+	struct xfs_inode	*ip,	/* incore inode */
+	xfs_fileoff_t		bno,	/* starting file offs. mapped */
+	xfs_filblks_t		len,	/* length to map in file */
+	struct xfs_bmbt_irec	*mval,	/* output: map values */
+	int			*nmap,	/* i/o: mval size/count */
+	int			flags)	/* XFS_BMAPI_... */
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_bmbt_irec	got;	/* current file extent record */
+	struct xfs_bmbt_irec	prev;	/* previous file extent record */
+	xfs_fileoff_t		obno;	/* old block number (offset) */
+	xfs_fileoff_t		end;	/* end of mapped file region */
+	xfs_extnum_t		lastx;	/* last useful extent number */
+	int			eof;	/* we've hit the end of extents */
+	int			n = 0;	/* current extent index */
+	int			error = 0;
+
+	ASSERT(*nmap >= 1);
+	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+	ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	XFS_STATS_INC(xs_blk_mapw);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			return error;
+	}
+
+	xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
+	end = bno + len;
+	obno = bno;
+
+	while (bno < end && n < *nmap) {
+		if (eof || got.br_startoff > bno) {
+			error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
+							   &prev, &lastx, eof);
+			if (error) {
+				if (n == 0) {
+					*nmap = 0;
+					return error;
+				}
+				break;
+			}
+		}
+
+		/* set up the extent map to return. */
+		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+		/* If we're done, stop now. */
+		if (bno >= end || n >= *nmap)
+			break;
+
+		/* Else go on to the next record. */
+		prev = got;
+		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+		else
+			eof = 1;
+	}
+
+	*nmap = n;
+	return 0;
+}
+
+
+static int
+xfs_bmapi_allocate(
+	struct xfs_bmalloca	*bma)
+{
+	struct xfs_mount	*mp = bma->ip->i_mount;
+	int			whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	int			tmp_logflags = 0;
+	int			error;
+
+	ASSERT(bma->length > 0);
+
+	/*
+	 * For the wasdelay case, we could also just allocate the stuff asked
+	 * for in this bmap call but that wouldn't be as good.
+	 */
+	if (bma->wasdel) {
+		bma->length = (xfs_extlen_t)bma->got.br_blockcount;
+		bma->offset = bma->got.br_startoff;
+		if (bma->idx != NULLEXTNUM && bma->idx) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
+					 &bma->prev);
+		}
+	} else {
+		bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+		if (!bma->eof)
+			bma->length = XFS_FILBLKS_MIN(bma->length,
+					bma->got.br_startoff - bma->offset);
+	}
+
+	/*
+	 * Indicate if this is the first user data in the file, or just any
+	 * user data.
+	 */
+	if (!(bma->flags & XFS_BMAPI_METADATA)) {
+		bma->userdata = (bma->offset == 0) ?
+			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+	}
+
+	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+
+	/*
+	 * Only want to do the alignment at the eof if it is userdata and
+	 * allocation length is larger than a stripe unit.
+	 */
+	if (mp->m_dalign && bma->length >= mp->m_dalign &&
+	    !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+		error = xfs_bmap_isaeof(bma, whichfork);
+		if (error)
+			return error;
+	}
+
+	error = xfs_bmap_alloc(bma);
+	if (error)
+		return error;
+
+	if (bma->flist->xbf_low)
+		bma->minleft = 0;
+	if (bma->cur)
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+	if (bma->blkno == NULLFSBLOCK)
+		return 0;
+	if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+		bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+		bma->cur->bc_private.b.flist = bma->flist;
+	}
+	/*
+	 * Bump the number of extents we've allocated
+	 * in this call.
+	 */
+	bma->nallocs++;
+
+	if (bma->cur)
+		bma->cur->bc_private.b.flags =
+			bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+
+	bma->got.br_startoff = bma->offset;
+	bma->got.br_startblock = bma->blkno;
+	bma->got.br_blockcount = bma->length;
+	bma->got.br_state = XFS_EXT_NORM;
+
+	/*
+	 * A wasdelay extent has been initialized, so shouldn't be flagged
+	 * as unwritten.
+	 */
+	if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
+	    xfs_sb_version_hasextflgbit(&mp->m_sb))
+		bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+	if (bma->wasdel)
+		error = xfs_bmap_add_extent_delay_real(bma);
+	else
+		error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+
+	bma->logflags |= tmp_logflags;
+	if (error)
+		return error;
+
+	/*
+	 * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+	 * or xfs_bmap_add_extent_hole_real might have merged it into one of
+	 * the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+	ASSERT(bma->got.br_startoff <= bma->offset);
+	ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+	       bma->offset + bma->length);
+	ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+	       bma->got.br_state == XFS_EXT_UNWRITTEN);
+	return 0;
+}
+
+STATIC int
+xfs_bmapi_convert_unwritten(
+	struct xfs_bmalloca	*bma,
+	struct xfs_bmbt_irec	*mval,
+	xfs_filblks_t		len,
+	int			flags)
+{
+	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	int			tmp_logflags = 0;
+	int			error;
+
+	/* check if we need to do unwritten->real conversion */
+	if (mval->br_state == XFS_EXT_UNWRITTEN &&
+	    (flags & XFS_BMAPI_PREALLOC))
+		return 0;
+
+	/* check if we need to do real->unwritten conversion */
+	if (mval->br_state == XFS_EXT_NORM &&
+	    (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+		return 0;
+
+	/*
+	 * Modify (by adding) the state flag, if writing.
+	 */
+	ASSERT(mval->br_blockcount <= len);
+	if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+		bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+					bma->ip, whichfork);
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+		bma->cur->bc_private.b.flist = bma->flist;
+	}
+	mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+				? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+
+	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+			&bma->cur, mval, bma->firstblock, bma->flist,
+			&tmp_logflags);
+	bma->logflags |= tmp_logflags;
+	if (error)
+		return error;
+
+	/*
+	 * Update our extent pointer, given that
+	 * xfs_bmap_add_extent_unwritten_real might have merged it into one
+	 * of the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+	/*
+	 * We may have combined previously unwritten space with written space,
+	 * so generate another request.
+	 */
+	if (mval->br_blockcount < len)
+		return -EAGAIN;
+	return 0;
+}
+
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary.  Details behaviour is controlled by the flags
+ * parameter.  Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ *
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int
+xfs_bmapi_write(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting file offs. mapped */
+	xfs_filblks_t		len,		/* length to map in file */
+	int			flags,		/* XFS_BMAPI_... */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_extlen_t		total,		/* total blocks needed */
+	struct xfs_bmbt_irec	*mval,		/* output: map values */
+	int			*nmap,		/* i/o: mval size/count */
+	struct xfs_bmap_free	*flist)		/* i/o: list extents to free */
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp;
+	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
+	xfs_fileoff_t		end;		/* end of mapped file region */
+	int			eof;		/* after the end of extents */
+	int			error;		/* error return */
+	int			n;		/* current extent index */
+	xfs_fileoff_t		obno;		/* old block number (offset) */
+	int			whichfork;	/* data or attr fork */
+	char			inhole;		/* current location is hole in file */
+	char			wasdelay;	/* old extent was delayed */
+
+#ifdef DEBUG
+	xfs_fileoff_t		orig_bno;	/* original block number value */
+	int			orig_flags;	/* original flags arg value */
+	xfs_filblks_t		orig_len;	/* original value of len arg */
+	struct xfs_bmbt_irec	*orig_mval;	/* original value of mval */
+	int			orig_nmap;	/* original value of *nmap */
+
+	orig_bno = bno;
+	orig_len = len;
+	orig_flags = flags;
+	orig_mval = mval;
+	orig_nmap = *nmap;
+#endif
+	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+		XFS_ATTR_FORK : XFS_DATA_FORK;
+
+	ASSERT(*nmap >= 1);
+	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+	ASSERT(!(flags & XFS_BMAPI_IGSTATE));
+	ASSERT(tp != NULL);
+	ASSERT(len > 0);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	XFS_STATS_INC(xs_blk_mapw);
+
+	if (*firstblock == NULLFSBLOCK) {
+		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
+			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+		else
+			bma.minleft = 1;
+	} else {
+		bma.minleft = 0;
+	}
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			goto error0;
+	}
+
+	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
+				&bma.prev);
+	n = 0;
+	end = bno + len;
+	obno = bno;
+
+	bma.tp = tp;
+	bma.ip = ip;
+	bma.total = total;
+	bma.userdata = 0;
+	bma.flist = flist;
+	bma.firstblock = firstblock;
+
+	while (bno < end && n < *nmap) {
+		inhole = eof || bma.got.br_startoff > bno;
+		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+
+		/*
+		 * First, deal with the hole before the allocated space
+		 * that we found, if any.
+		 */
+		if (inhole || wasdelay) {
+			bma.eof = eof;
+			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+			bma.wasdel = wasdelay;
+			bma.offset = bno;
+			bma.flags = flags;
+
+			/*
+			 * There's a 32/64 bit type mismatch between the
+			 * allocation length request (which can be 64 bits in
+			 * length) and the bma length request, which is
+			 * xfs_extlen_t and therefore 32 bits. Hence we have to
+			 * check for 32-bit overflows and handle them here.
+			 */
+			if (len > (xfs_filblks_t)MAXEXTLEN)
+				bma.length = MAXEXTLEN;
+			else
+				bma.length = len;
+
+			ASSERT(len > 0);
+			ASSERT(bma.length > 0);
+			error = xfs_bmapi_allocate(&bma);
+			if (error)
+				goto error0;
+			if (bma.blkno == NULLFSBLOCK)
+				break;
+		}
+
+		/* Deal with the allocated space we found.  */
+		xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+							end, n, flags);
+
+		/* Execute unwritten extent conversion if necessary */
+		error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+		if (error == -EAGAIN)
+			continue;
+		if (error)
+			goto error0;
+
+		/* update the extent map to return */
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+		/*
+		 * If we're done, stop now.  Stop when we've allocated
+		 * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
+		 * the transaction may get too big.
+		 */
+		if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
+			break;
+
+		/* Else go on to the next record. */
+		bma.prev = bma.got;
+		if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
+					 &bma.got);
+		} else
+			eof = 1;
+	}
+	*nmap = n;
+
+	/*
+	 * Transform from btree to extents, give it cur.
+	 */
+	if (xfs_bmap_wants_extents(ip, whichfork)) {
+		int		tmp_logflags = 0;
+
+		ASSERT(bma.cur);
+		error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
+			&tmp_logflags, whichfork);
+		bma.logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_NEXTENTS(ip, whichfork) >
+		XFS_IFORK_MAXEXT(ip, whichfork));
+	error = 0;
+error0:
+	/*
+	 * Log everything.  Do this after conversion, there's no point in
+	 * logging the extent records if we've converted to btree format.
+	 */
+	if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		bma.logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
+		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+	/*
+	 * Log whatever the flags say, even if error.  Otherwise we might miss
+	 * detecting a case where the data is changed, there's an error,
+	 * and it's not logged so we don't shutdown when we should.
+	 */
+	if (bma.logflags)
+		xfs_trans_log_inode(tp, ip, bma.logflags);
+
+	if (bma.cur) {
+		if (!error) {
+			ASSERT(*firstblock == NULLFSBLOCK ||
+			       XFS_FSB_TO_AGNO(mp, *firstblock) ==
+			       XFS_FSB_TO_AGNO(mp,
+				       bma.cur->bc_private.b.firstblock) ||
+			       (flist->xbf_low &&
+				XFS_FSB_TO_AGNO(mp, *firstblock) <
+				XFS_FSB_TO_AGNO(mp,
+					bma.cur->bc_private.b.firstblock)));
+			*firstblock = bma.cur->bc_private.b.firstblock;
+		}
+		xfs_btree_del_cursor(bma.cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	if (!error)
+		xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+			orig_nmap, *nmap);
+	return error;
+}
+
+/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_del_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_trans_t		*tp,	/* current transaction pointer */
+	xfs_extnum_t		*idx,	/* extent number to update/delete */
+	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork) /* data or attr fork */
+{
+	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
+	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
+	xfs_fsblock_t		del_endblock=0;	/* first block past del */
+	xfs_fileoff_t		del_endoff;	/* first offset past del */
+	int			delay;	/* current block is delayed allocated */
+	int			do_fx;	/* free extent at end of routine */
+	xfs_bmbt_rec_host_t	*ep;	/* current extent entry pointer */
+	int			error;	/* error return value */
+	int			flags;	/* inode logging flags */
+	xfs_bmbt_irec_t		got;	/* current extent entry */
+	xfs_fileoff_t		got_endoff;	/* first offset past got */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_filblks_t		nblks;	/* quota/sb block count */
+	xfs_bmbt_irec_t		new;	/* new record to be inserted */
+	/* REFERENCED */
+	uint			qfield;	/* quota field to update */
+	xfs_filblks_t		temp;	/* for indirect length calculations */
+	xfs_filblks_t		temp2;	/* for indirect length calculations */
+	int			state = 0;
+
+	XFS_STATS_INC(xs_del_exlist);
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+		(uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(del->br_blockcount > 0);
+	ep = xfs_iext_get_ext(ifp, *idx);
+	xfs_bmbt_get_all(ep, &got);
+	ASSERT(got.br_startoff <= del->br_startoff);
+	del_endoff = del->br_startoff + del->br_blockcount;
+	got_endoff = got.br_startoff + got.br_blockcount;
+	ASSERT(got_endoff >= del_endoff);
+	delay = isnullstartblock(got.br_startblock);
+	ASSERT(isnullstartblock(del->br_startblock) == delay);
+	flags = 0;
+	qfield = 0;
+	error = 0;
+	/*
+	 * If deleting a real allocation, must free up the disk space.
+	 */
+	if (!delay) {
+		flags = XFS_ILOG_CORE;
+		/*
+		 * Realtime allocation.  Free it and record di_nblocks update.
+		 */
+		if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+			xfs_fsblock_t	bno;
+			xfs_filblks_t	len;
+
+			ASSERT(do_mod(del->br_blockcount,
+				      mp->m_sb.sb_rextsize) == 0);
+			ASSERT(do_mod(del->br_startblock,
+				      mp->m_sb.sb_rextsize) == 0);
+			bno = del->br_startblock;
+			len = del->br_blockcount;
+			do_div(bno, mp->m_sb.sb_rextsize);
+			do_div(len, mp->m_sb.sb_rextsize);
+			error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+			if (error)
+				goto done;
+			do_fx = 0;
+			nblks = len * mp->m_sb.sb_rextsize;
+			qfield = XFS_TRANS_DQ_RTBCOUNT;
+		}
+		/*
+		 * Ordinary allocation.
+		 */
+		else {
+			do_fx = 1;
+			nblks = del->br_blockcount;
+			qfield = XFS_TRANS_DQ_BCOUNT;
+		}
+		/*
+		 * Set up del_endblock and cur for later.
+		 */
+		del_endblock = del->br_startblock + del->br_blockcount;
+		if (cur) {
+			if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+					got.br_startblock, got.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		}
+		da_old = da_new = 0;
+	} else {
+		da_old = startblockval(got.br_startblock);
+		da_new = 0;
+		nblks = 0;
+		do_fx = 0;
+	}
+	/*
+	 * Set flag value to use in switch statement.
+	 * Left-contig is 2, right-contig is 1.
+	 */
+	switch (((got.br_startoff == del->br_startoff) << 1) |
+		(got_endoff == del_endoff)) {
+	case 3:
+		/*
+		 * Matches the whole extent.  Delete the entry.
+		 */
+		xfs_iext_remove(ip, *idx, 1,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+		--*idx;
+		if (delay)
+			break;
+
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+		flags |= XFS_ILOG_CORE;
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_btree_delete(cur, &i)))
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		break;
+
+	case 2:
+		/*
+		 * Deleting the first part of the extent.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_startoff(ep, del_endoff);
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		xfs_bmbt_set_startblock(ep, del_endblock);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 1:
+		/*
+		 * Deleting the last part of the extent.
+		 */
+		temp = got.br_blockcount - del->br_blockcount;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, got.br_startoff,
+				got.br_startblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 0:
+		/*
+		 * Deleting the middle of the extent.
+		 */
+		temp = del->br_startoff - got.br_startoff;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		new.br_startoff = del_endoff;
+		temp2 = got_endoff - del_endoff;
+		new.br_blockcount = temp2;
+		new.br_state = got.br_state;
+		if (!delay) {
+			new.br_startblock = del_endblock;
+			flags |= XFS_ILOG_CORE;
+			if (cur) {
+				if ((error = xfs_bmbt_update(cur,
+						got.br_startoff,
+						got.br_startblock, temp,
+						got.br_state)))
+					goto done;
+				if ((error = xfs_btree_increment(cur, 0, &i)))
+					goto done;
+				cur->bc_rec.b = new;
+				error = xfs_btree_insert(cur, &i);
+				if (error && error != -ENOSPC)
+					goto done;
+				/*
+				 * If get no-space back from btree insert,
+				 * it tried a split, and we have a zero
+				 * block reservation.
+				 * Fix up our state and return the error.
+				 */
+				if (error == -ENOSPC) {
+					/*
+					 * Reset the cursor, don't trust
+					 * it after any insert operation.
+					 */
+					if ((error = xfs_bmbt_lookup_eq(cur,
+							got.br_startoff,
+							got.br_startblock,
+							temp, &i)))
+						goto done;
+					XFS_WANT_CORRUPTED_GOTO(mp,
+								i == 1, done);
+					/*
+					 * Update the btree record back
+					 * to the original value.
+					 */
+					if ((error = xfs_bmbt_update(cur,
+							got.br_startoff,
+							got.br_startblock,
+							got.br_blockcount,
+							got.br_state)))
+						goto done;
+					/*
+					 * Reset the extent record back
+					 * to the original value.
+					 */
+					xfs_bmbt_set_blockcount(ep,
+						got.br_blockcount);
+					flags = 0;
+					error = -ENOSPC;
+					goto done;
+				}
+				XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+			} else
+				flags |= xfs_ilog_fext(whichfork);
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		} else {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			temp = xfs_bmap_worst_indlen(ip, temp);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			temp2 = xfs_bmap_worst_indlen(ip, temp2);
+			new.br_startblock = nullstartblock((int)temp2);
+			da_new = temp + temp2;
+			while (da_new > da_old) {
+				if (temp) {
+					temp--;
+					da_new--;
+					xfs_bmbt_set_startblock(ep,
+						nullstartblock((int)temp));
+				}
+				if (da_new == da_old)
+					break;
+				if (temp2) {
+					temp2--;
+					da_new--;
+					new.br_startblock =
+						nullstartblock((int)temp2);
+				}
+			}
+		}
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+		++*idx;
+		break;
+	}
+	/*
+	 * If we need to, add to list of extents to delete.
+	 */
+	if (do_fx)
+		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+			mp);
+	/*
+	 * Adjust inode # blocks in the file.
+	 */
+	if (nblks)
+		ip->i_d.di_nblocks -= nblks;
+	/*
+	 * Adjust quota data.
+	 */
+	if (qfield)
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+	/*
+	 * Account for change in delayed indirect blocks.
+	 * Nothing to do for disk quota accounting here.
+	 */
+	ASSERT(da_old >= da_new);
+	if (da_old > da_new)
+		xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value.  If not all extents in the block range can be removed then
+ * *done is set.
+ */
+int						/* error */
+xfs_bunmapi(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting offset to unmap */
+	xfs_filblks_t		len,		/* length to unmap in file */
+	int			flags,		/* misc flags */
+	xfs_extnum_t		nexts,		/* number of extents max */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*done)		/* set if not done yet */
+{
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_irec_t		del;		/* extent being deleted */
+	int			eof;		/* is deleting at eof */
+	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		extno;		/* extent number in list */
+	xfs_bmbt_irec_t		got;		/* current extent record */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	int			isrt;		/* freeing in rt area */
+	xfs_extnum_t		lastx;		/* last extent index used */
+	int			logflags;	/* transaction logging flags */
+	xfs_extlen_t		mod;		/* rt extent offset */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* number of file extents */
+	xfs_bmbt_irec_t		prev;		/* previous extent record */
+	xfs_fileoff_t		start;		/* first file offset deleted */
+	int			tmp_logflags;	/* partial logging flags */
+	int			wasdel;		/* was a delayed alloc extent */
+	int			whichfork;	/* data or attribute fork */
+	xfs_fsblock_t		sum;
+
+	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+
+	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+		XFS_ATTR_FORK : XFS_DATA_FORK;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (unlikely(
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+		XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
+				 ip->i_mount);
+		return -EFSCORRUPTED;
+	}
+	mp = ip->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(len > 0);
+	ASSERT(nexts >= 0);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*done = 1;
+		return 0;
+	}
+	XFS_STATS_INC(xs_blk_unmap);
+	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+	start = bno;
+	bno = start + len - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+
+	/*
+	 * Check to see if the given block number is past the end of the
+	 * file, back up to the last block if so...
+	 */
+	if (eof) {
+		ep = xfs_iext_get_ext(ifp, --lastx);
+		xfs_bmbt_get_all(ep, &got);
+		bno = got.br_startoff + got.br_blockcount - 1;
+	}
+	logflags = 0;
+	if (ifp->if_flags & XFS_IFBROOT) {
+		ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else
+		cur = NULL;
+
+	if (isrt) {
+		/*
+		 * Synchronize by locking the bitmap inode.
+		 */
+		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+	}
+
+	extno = 0;
+	while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+	       (nexts == 0 || extno < nexts)) {
+		/*
+		 * Is the found extent after a hole in which bno lives?
+		 * Just back up to the previous extent, if so.
+		 */
+		if (got.br_startoff > bno) {
+			if (--lastx < 0)
+				break;
+			ep = xfs_iext_get_ext(ifp, lastx);
+			xfs_bmbt_get_all(ep, &got);
+		}
+		/*
+		 * Is the last block of this extent before the range
+		 * we're supposed to delete?  If so, we're done.
+		 */
+		bno = XFS_FILEOFF_MIN(bno,
+			got.br_startoff + got.br_blockcount - 1);
+		if (bno < start)
+			break;
+		/*
+		 * Then deal with the (possibly delayed) allocated space
+		 * we found.
+		 */
+		ASSERT(ep != NULL);
+		del = got;
+		wasdel = isnullstartblock(del.br_startblock);
+		if (got.br_startoff < start) {
+			del.br_startoff = start;
+			del.br_blockcount -= start - got.br_startoff;
+			if (!wasdel)
+				del.br_startblock += start - got.br_startoff;
+		}
+		if (del.br_startoff + del.br_blockcount > bno + 1)
+			del.br_blockcount = bno + 1 - del.br_startoff;
+		sum = del.br_startblock + del.br_blockcount;
+		if (isrt &&
+		    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+			/*
+			 * Realtime extent not lined up at the end.
+			 * The extent could have been split into written
+			 * and unwritten pieces, or we could just be
+			 * unmapping part of it.  But we can't really
+			 * get rid of part of a realtime extent.
+			 */
+			if (del.br_state == XFS_EXT_UNWRITTEN ||
+			    !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+				/*
+				 * This piece is unwritten, or we're not
+				 * using unwritten extents.  Skip over it.
+				 */
+				ASSERT(bno >= mod);
+				bno -= mod > del.br_blockcount ?
+					del.br_blockcount : mod;
+				if (bno < got.br_startoff) {
+					if (--lastx >= 0)
+						xfs_bmbt_get_all(xfs_iext_get_ext(
+							ifp, lastx), &got);
+				}
+				continue;
+			}
+			/*
+			 * It's written, turn it unwritten.
+			 * This is better than zeroing it.
+			 */
+			ASSERT(del.br_state == XFS_EXT_NORM);
+			ASSERT(xfs_trans_get_block_res(tp) > 0);
+			/*
+			 * If this spans a realtime extent boundary,
+			 * chop it back to the start of the one we end at.
+			 */
+			if (del.br_blockcount > mod) {
+				del.br_startoff += del.br_blockcount - mod;
+				del.br_startblock += del.br_blockcount - mod;
+				del.br_blockcount = mod;
+			}
+			del.br_state = XFS_EXT_UNWRITTEN;
+			error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+					&lastx, &cur, &del, firstblock, flist,
+					&logflags);
+			if (error)
+				goto error0;
+			goto nodelete;
+		}
+		if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
+			/*
+			 * Realtime extent is lined up at the end but not
+			 * at the front.  We'll get rid of full extents if
+			 * we can.
+			 */
+			mod = mp->m_sb.sb_rextsize - mod;
+			if (del.br_blockcount > mod) {
+				del.br_blockcount -= mod;
+				del.br_startoff += mod;
+				del.br_startblock += mod;
+			} else if ((del.br_startoff == start &&
+				    (del.br_state == XFS_EXT_UNWRITTEN ||
+				     xfs_trans_get_block_res(tp) == 0)) ||
+				   !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+				/*
+				 * Can't make it unwritten.  There isn't
+				 * a full extent here so just skip it.
+				 */
+				ASSERT(bno >= del.br_blockcount);
+				bno -= del.br_blockcount;
+				if (got.br_startoff > bno) {
+					if (--lastx >= 0) {
+						ep = xfs_iext_get_ext(ifp,
+								      lastx);
+						xfs_bmbt_get_all(ep, &got);
+					}
+				}
+				continue;
+			} else if (del.br_state == XFS_EXT_UNWRITTEN) {
+				/*
+				 * This one is already unwritten.
+				 * It must have a written left neighbor.
+				 * Unwrite the killed part of that one and
+				 * try again.
+				 */
+				ASSERT(lastx > 0);
+				xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						lastx - 1), &prev);
+				ASSERT(prev.br_state == XFS_EXT_NORM);
+				ASSERT(!isnullstartblock(prev.br_startblock));
+				ASSERT(del.br_startblock ==
+				       prev.br_startblock + prev.br_blockcount);
+				if (prev.br_startoff < start) {
+					mod = start - prev.br_startoff;
+					prev.br_blockcount -= mod;
+					prev.br_startblock += mod;
+					prev.br_startoff = start;
+				}
+				prev.br_state = XFS_EXT_UNWRITTEN;
+				lastx--;
+				error = xfs_bmap_add_extent_unwritten_real(tp,
+						ip, &lastx, &cur, &prev,
+						firstblock, flist, &logflags);
+				if (error)
+					goto error0;
+				goto nodelete;
+			} else {
+				ASSERT(del.br_state == XFS_EXT_NORM);
+				del.br_state = XFS_EXT_UNWRITTEN;
+				error = xfs_bmap_add_extent_unwritten_real(tp,
+						ip, &lastx, &cur, &del,
+						firstblock, flist, &logflags);
+				if (error)
+					goto error0;
+				goto nodelete;
+			}
+		}
+		if (wasdel) {
+			ASSERT(startblockval(del.br_startblock) > 0);
+			/* Update realtime/data freespace, unreserve quota */
+			if (isrt) {
+				xfs_filblks_t rtexts;
+
+				rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
+				do_div(rtexts, mp->m_sb.sb_rextsize);
+				xfs_mod_frextents(mp, (int64_t)rtexts);
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
+					XFS_QMOPT_RES_RTBLKS);
+			} else {
+				xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
+						 false);
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
+					XFS_QMOPT_RES_REGBLKS);
+			}
+			ip->i_delayed_blks -= del.br_blockcount;
+			if (cur)
+				cur->bc_private.b.flags |=
+					XFS_BTCUR_BPRV_WASDEL;
+		} else if (cur)
+			cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
+		/*
+		 * If it's the case where the directory code is running
+		 * with no block reservation, and the deleted block is in
+		 * the middle of its extent, and the resulting insert
+		 * of an extent would cause transformation to btree format,
+		 * then reject it.  The calling code will then swap
+		 * blocks around instead.
+		 * We have to do this now, rather than waiting for the
+		 * conversion to btree format, since the transaction
+		 * will be dirty.
+		 */
+		if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
+		    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+		    XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+			XFS_IFORK_MAXEXT(ip, whichfork) &&
+		    del.br_startoff > got.br_startoff &&
+		    del.br_startoff + del.br_blockcount <
+		    got.br_startoff + got.br_blockcount) {
+			error = -ENOSPC;
+			goto error0;
+		}
+		error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+				&tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+		bno = del.br_startoff - 1;
+nodelete:
+		/*
+		 * If not done go on to the next (previous) record.
+		 */
+		if (bno != (xfs_fileoff_t)-1 && bno >= start) {
+			if (lastx >= 0) {
+				ep = xfs_iext_get_ext(ifp, lastx);
+				if (xfs_bmbt_get_startoff(ep) > bno) {
+					if (--lastx >= 0)
+						ep = xfs_iext_get_ext(ifp,
+								      lastx);
+				}
+				xfs_bmbt_get_all(ep, &got);
+			}
+			extno++;
+		}
+	}
+	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+
+	/*
+	 * Convert to a btree if necessary.
+	 */
+	if (xfs_bmap_needs_btree(ip, whichfork)) {
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+			&cur, 0, &tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	/*
+	 * transform from btree to extents, give it cur
+	 */
+	else if (xfs_bmap_wants_extents(ip, whichfork)) {
+		ASSERT(cur != NULL);
+		error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+			whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	/*
+	 * transform from extents to local?
+	 */
+	error = 0;
+error0:
+	/*
+	 * Log everything.  Do this after conversion, there's no point in
+	 * logging the extent records if we've converted to btree format.
+	 */
+	if ((logflags & xfs_ilog_fext(whichfork)) &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		logflags &= ~xfs_ilog_fbroot(whichfork);
+	/*
+	 * Log inode even in the error case, if the transaction
+	 * is dirty we'll need to shut down the filesystem.
+	 */
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (cur) {
+		if (!error) {
+			*firstblock = cur->bc_private.b.firstblock;
+			cur->bc_private.b.allocated = 0;
+		}
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
+/*
+ * Determine whether an extent shift can be accomplished by a merge with the
+ * extent that precedes the target hole of the shift.
+ */
+STATIC bool
+xfs_bmse_can_merge(
+	struct xfs_bmbt_irec	*left,	/* preceding extent */
+	struct xfs_bmbt_irec	*got,	/* current extent to shift */
+	xfs_fileoff_t		shift)	/* shift fsb */
+{
+	xfs_fileoff_t		startoff;
+
+	startoff = got->br_startoff - shift;
+
+	/*
+	 * The extent, once shifted, must be adjacent in-file and on-disk with
+	 * the preceding extent.
+	 */
+	if ((left->br_startoff + left->br_blockcount != startoff) ||
+	    (left->br_startblock + left->br_blockcount != got->br_startblock) ||
+	    (left->br_state != got->br_state) ||
+	    (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+		return false;
+
+	return true;
+}
+
+/*
+ * A bmap extent shift adjusts the file offset of an extent to fill a preceding
+ * hole in the file. If an extent shift would result in the extent being fully
+ * adjacent to the extent that currently precedes the hole, we can merge with
+ * the preceding extent rather than do the shift.
+ *
+ * This function assumes the caller has verified a shift-by-merge is possible
+ * with the provided extents via xfs_bmse_can_merge().
+ */
+STATIC int
+xfs_bmse_merge(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			shift,		/* shift fsb */
+	int				current_ext,	/* idx of gotp */
+	struct xfs_bmbt_rec_host	*gotp,		/* extent to shift */
+	struct xfs_bmbt_rec_host	*leftp,		/* preceding extent */
+	struct xfs_btree_cur		*cur,
+	int				*logflags)	/* output */
+{
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		left;
+	xfs_filblks_t			blockcount;
+	int				error, i;
+	struct xfs_mount		*mp = ip->i_mount;
+
+	xfs_bmbt_get_all(gotp, &got);
+	xfs_bmbt_get_all(leftp, &left);
+	blockcount = left.br_blockcount + got.br_blockcount;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_bmse_can_merge(&left, &got, shift));
+
+	/*
+	 * Merge the in-core extents. Note that the host record pointers and
+	 * current_ext index are invalid once the extent has been removed via
+	 * xfs_iext_remove().
+	 */
+	xfs_bmbt_set_blockcount(leftp, blockcount);
+	xfs_iext_remove(ip, current_ext, 1, 0);
+
+	/*
+	 * Update the on-disk extent count, the btree if necessary and log the
+	 * inode.
+	 */
+	XFS_IFORK_NEXT_SET(ip, whichfork,
+			   XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+	*logflags |= XFS_ILOG_CORE;
+	if (!cur) {
+		*logflags |= XFS_ILOG_DEXT;
+		return 0;
+	}
+
+	/* lookup and remove the extent to merge */
+	error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+				   got.br_blockcount, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+
+	error = xfs_btree_delete(cur, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+
+	/* lookup and update size of the previous extent */
+	error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
+				   left.br_blockcount, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+
+	left.br_blockcount = blockcount;
+
+	return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
+			       left.br_blockcount, left.br_state);
+}
+
+/*
+ * Shift a single extent.
+ */
+STATIC int
+xfs_bmse_shift_one(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			offset_shift_fsb,
+	int				*current_ext,
+	struct xfs_bmbt_rec_host	*gotp,
+	struct xfs_btree_cur		*cur,
+	int				*logflags,
+	enum shift_direction		direction)
+{
+	struct xfs_ifork		*ifp;
+	struct xfs_mount		*mp;
+	xfs_fileoff_t			startoff;
+	struct xfs_bmbt_rec_host	*adj_irecp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		adj_irec;
+	int				error;
+	int				i;
+	int				total_extents;
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+
+	xfs_bmbt_get_all(gotp, &got);
+
+	/* delalloc extents should be prevented by caller */
+	XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
+
+	if (direction == SHIFT_LEFT) {
+		startoff = got.br_startoff - offset_shift_fsb;
+
+		/*
+		 * Check for merge if we've got an extent to the left,
+		 * otherwise make sure there's enough room at the start
+		 * of the file for the shift.
+		 */
+		if (!*current_ext) {
+			if (got.br_startoff < offset_shift_fsb)
+				return -EINVAL;
+			goto update_current_ext;
+		}
+		/*
+		 * grab the left extent and check for a large
+		 * enough hole.
+		 */
+		adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1);
+		xfs_bmbt_get_all(adj_irecp, &adj_irec);
+
+		if (startoff <
+		    adj_irec.br_startoff + adj_irec.br_blockcount)
+			return -EINVAL;
+
+		/* check whether to merge the extent or shift it down */
+		if (xfs_bmse_can_merge(&adj_irec, &got,
+				       offset_shift_fsb)) {
+			return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
+					      *current_ext, gotp, adj_irecp,
+					      cur, logflags);
+		}
+	} else {
+		startoff = got.br_startoff + offset_shift_fsb;
+		/* nothing to move if this is the last extent */
+		if (*current_ext >= (total_extents - 1))
+			goto update_current_ext;
+		/*
+		 * If this is not the last extent in the file, make sure there
+		 * is enough room between current extent and next extent for
+		 * accommodating the shift.
+		 */
+		adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1);
+		xfs_bmbt_get_all(adj_irecp, &adj_irec);
+		if (startoff + got.br_blockcount > adj_irec.br_startoff)
+			return -EINVAL;
+		/*
+		 * Unlike a left shift (which involves a hole punch),
+		 * a right shift does not modify extent neighbors
+		 * in any way. We should never find mergeable extents
+		 * in this scenario. Check anyways and warn if we
+		 * encounter two extents that could be one.
+		 */
+		if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb))
+			WARN_ON_ONCE(1);
+	}
+	/*
+	 * Increment the extent index for the next iteration, update the start
+	 * offset of the in-core extent and update the btree if applicable.
+	 */
+update_current_ext:
+	if (direction == SHIFT_LEFT)
+		(*current_ext)++;
+	else
+		(*current_ext)--;
+	xfs_bmbt_set_startoff(gotp, startoff);
+	*logflags |= XFS_ILOG_CORE;
+	if (!cur) {
+		*logflags |= XFS_ILOG_DEXT;
+		return 0;
+	}
+
+	error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+				   got.br_blockcount, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+
+	got.br_startoff = startoff;
+	return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
+			       got.br_blockcount, got.br_state);
+}
+
+/*
+ * Shift extent records to the left/right to cover/create a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation is
+ * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the
+ * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
+ * is the length by which each extent is shifted. If there is no hole to shift
+ * the extents into, this will be considered invalid operation and we abort
+ * immediately.
+ */
+int
+xfs_bmap_shift_extents(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*next_fsb,
+	xfs_fileoff_t		offset_shift_fsb,
+	int			*done,
+	xfs_fileoff_t		stop_fsb,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_bmap_free	*flist,
+	enum shift_direction	direction,
+	int			num_exts)
+{
+	struct xfs_btree_cur		*cur = NULL;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec            got;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_extnum_t			nexts = 0;
+	xfs_extnum_t			current_ext;
+	xfs_extnum_t			total_extents;
+	xfs_extnum_t			stop_extent;
+	int				error = 0;
+	int				whichfork = XFS_DATA_FORK;
+	int				logflags = 0;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+				 XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
+	ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	}
+
+	/*
+	 * There may be delalloc extents in the data fork before the range we
+	 * are collapsing out, so we cannot use the count of real extents here.
+	 * Instead we have to calculate it from the incore fork.
+	 */
+	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	if (total_extents == 0) {
+		*done = 1;
+		goto del_cursor;
+	}
+
+	/*
+	 * In case of first right shift, we need to initialize next_fsb
+	 */
+	if (*next_fsb == NULLFSBLOCK) {
+		gotp = xfs_iext_get_ext(ifp, total_extents - 1);
+		xfs_bmbt_get_all(gotp, &got);
+		*next_fsb = got.br_startoff;
+		if (stop_fsb > *next_fsb) {
+			*done = 1;
+			goto del_cursor;
+		}
+	}
+
+	/* Lookup the extent index at which we have to stop */
+	if (direction == SHIFT_RIGHT) {
+		gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent);
+		/* Make stop_extent exclusive of shift range */
+		stop_extent--;
+	} else
+		stop_extent = total_extents;
+
+	/*
+	 * Look up the extent index for the fsb where we start shifting. We can
+	 * henceforth iterate with current_ext as extent list changes are locked
+	 * out via ilock.
+	 *
+	 * gotp can be null in 2 cases: 1) if there are no extents or 2)
+	 * *next_fsb lies in a hole beyond which there are no extents. Either
+	 * way, we are done.
+	 */
+	gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext);
+	if (!gotp) {
+		*done = 1;
+		goto del_cursor;
+	}
+
+	/* some sanity checking before we finally start shifting extents */
+	if ((direction == SHIFT_LEFT && current_ext >= stop_extent) ||
+	     (direction == SHIFT_RIGHT && current_ext <= stop_extent)) {
+		error = -EIO;
+		goto del_cursor;
+	}
+
+	while (nexts++ < num_exts) {
+		error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
+					   &current_ext, gotp, cur, &logflags,
+					   direction);
+		if (error)
+			goto del_cursor;
+		/*
+		 * If there was an extent merge during the shift, the extent
+		 * count can change. Update the total and grade the next record.
+		 */
+		if (direction == SHIFT_LEFT) {
+			total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+			stop_extent = total_extents;
+		}
+
+		if (current_ext == stop_extent) {
+			*done = 1;
+			*next_fsb = NULLFSBLOCK;
+			break;
+		}
+		gotp = xfs_iext_get_ext(ifp, current_ext);
+	}
+
+	if (!*done) {
+		xfs_bmbt_get_all(gotp, &got);
+		*next_fsb = got.br_startoff;
+	}
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+
+	return error;
+}
+
+/*
+ * Splits an extent into two extents at split_fsb block such that it is
+ * the first block of the current_ext. @current_ext is a target extent
+ * to be split. @split_fsb is a block where the extents is split.
+ * If split_fsb lies in a hole or the first block of extents, just return 0.
+ */
+STATIC int
+xfs_bmap_split_extent_at(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		split_fsb,
+	xfs_fsblock_t		*firstfsb,
+	struct xfs_bmap_free	*free_list)
+{
+	int				whichfork = XFS_DATA_FORK;
+	struct xfs_btree_cur		*cur = NULL;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		new; /* split extent */
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_fsblock_t			gotblkcnt; /* new block count for got */
+	xfs_extnum_t			current_ext;
+	int				error = 0;
+	int				logflags = 0;
+	int				i = 0;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
+				 XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * gotp can be null in 2 cases: 1) if there are no extents
+	 * or 2) split_fsb lies in a hole beyond which there are
+	 * no extents. Either way, we are done.
+	 */
+	gotp = xfs_iext_bno_to_ext(ifp, split_fsb, &current_ext);
+	if (!gotp)
+		return 0;
+
+	xfs_bmbt_get_all(gotp, &got);
+
+	/*
+	 * Check split_fsb lies in a hole or the start boundary offset
+	 * of the extent.
+	 */
+	if (got.br_startoff >= split_fsb)
+		return 0;
+
+	gotblkcnt = split_fsb - got.br_startoff;
+	new.br_startoff = split_fsb;
+	new.br_startblock = got.br_startblock + gotblkcnt;
+	new.br_blockcount = got.br_blockcount - gotblkcnt;
+	new.br_state = got.br_state;
+
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstfsb;
+		cur->bc_private.b.flist = free_list;
+		cur->bc_private.b.flags = 0;
+		error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+				got.br_startblock,
+				got.br_blockcount,
+				&i);
+		if (error)
+			goto del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+	}
+
+	xfs_bmbt_set_blockcount(gotp, gotblkcnt);
+	got.br_blockcount = gotblkcnt;
+
+	logflags = XFS_ILOG_CORE;
+	if (cur) {
+		error = xfs_bmbt_update(cur, got.br_startoff,
+				got.br_startblock,
+				got.br_blockcount,
+				got.br_state);
+		if (error)
+			goto del_cursor;
+	} else
+		logflags |= XFS_ILOG_DEXT;
+
+	/* Add new extent */
+	current_ext++;
+	xfs_iext_insert(ip, current_ext, 1, &new, 0);
+	XFS_IFORK_NEXT_SET(ip, whichfork,
+			   XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+
+	if (cur) {
+		error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
+				new.br_startblock, new.br_blockcount,
+				&i);
+		if (error)
+			goto del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
+		cur->bc_rec.b.br_state = new.br_state;
+
+		error = xfs_btree_insert(cur, &i);
+		if (error)
+			goto del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+	}
+
+	/*
+	 * Convert to a btree if necessary.
+	 */
+	if (xfs_bmap_needs_btree(ip, whichfork)) {
+		int tmp_logflags; /* partial log flag return val */
+
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
+				&cur, 0, &tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+	}
+
+del_cursor:
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur,
+				error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	return error;
+}
+
+int
+xfs_bmap_split_extent(
+	struct xfs_inode        *ip,
+	xfs_fileoff_t           split_fsb)
+{
+	struct xfs_mount        *mp = ip->i_mount;
+	struct xfs_trans        *tp;
+	struct xfs_bmap_free    free_list;
+	xfs_fsblock_t           firstfsb;
+	int                     committed;
+	int                     error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	xfs_bmap_init(&free_list, &firstfsb);
+
+	error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
+			&firstfsb, &free_list);
+	if (error)
+		goto out;
+
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto out;
+
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+
+out:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	return error;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap.h b/kernel/fs/xfs/libxfs/xfs_bmap.h
new file mode 100644
index 000000000..6aaa0c1c7
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_bmap.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_H__
+#define	__XFS_BMAP_H__
+
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern kmem_zone_t	*xfs_bmap_free_item_zone;
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
+	struct xfs_bmap_free	*flist;	/* bmap freelist */
+	struct xfs_trans	*tp;	/* transaction pointer */
+	struct xfs_inode	*ip;	/* incore inode pointer */
+	struct xfs_bmbt_irec	prev;	/* extent before the new one */
+	struct xfs_bmbt_irec	got;	/* extent after, or delayed */
+
+	xfs_fileoff_t		offset;	/* offset in file filling in */
+	xfs_extlen_t		length;	/* i/o length asked/allocated */
+	xfs_fsblock_t		blkno;	/* starting block of new extent */
+
+	struct xfs_btree_cur	*cur;	/* btree cursor */
+	xfs_extnum_t		idx;	/* current extent index */
+	int			nallocs;/* number of extents alloc'd */
+	int			logflags;/* flags for transaction logging */
+
+	xfs_extlen_t		total;	/* total blocks needed for xaction */
+	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
+	xfs_extlen_t		minleft; /* amount must be left after alloc */
+	bool			eof;	/* set if allocating past last extent */
+	bool			wasdel;	/* replacing a delayed allocation */
+	bool			userdata;/* set if is user data */
+	bool			aeof;	/* allocated space at eof */
+	bool			conv;	/* overwriting unwritten extents */
+	int			flags;
+};
+
+/*
+ * List of extents to be free "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+typedef struct xfs_bmap_free_item
+{
+	xfs_fsblock_t		xbfi_startblock;/* starting fs block number */
+	xfs_extlen_t		xbfi_blockcount;/* number of blocks in extent */
+	struct xfs_bmap_free_item *xbfi_next;	/* link to next entry */
+} xfs_bmap_free_item_t;
+
+/*
+ * Header for free extent list.
+ *
+ * xbf_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent.  In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs.  In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0.  If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
+ */
+typedef	struct xfs_bmap_free
+{
+	xfs_bmap_free_item_t	*xbf_first;	/* list of to-be-free extents */
+	int			xbf_count;	/* count of items on list */
+	int			xbf_low;	/* alloc in low mode */
+} xfs_bmap_free_t;
+
+#define	XFS_BMAP_MAX_NMAP	4
+
+/*
+ * Flags for xfs_bmapi_*
+ */
+#define XFS_BMAPI_ENTIRE	0x001	/* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA	0x002	/* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK	0x004	/* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
+					/* combine contig. space */
+#define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
+/*
+ * unwritten extent conversion - this needs write cache flushing and no additional
+ * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
+ * from written to unwritten, otherwise convert from unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT	0x040
+
+#define XFS_BMAPI_FLAGS \
+	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
+	{ XFS_BMAPI_METADATA,	"METADATA" }, \
+	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
+	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
+	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
+	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
+	{ XFS_BMAPI_CONVERT,	"CONVERT" }
+
+
+static inline int xfs_bmapi_aflag(int w)
+{
+	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
+}
+
+/*
+ * Special values for xfs_bmbt_irec_t br_startblock field.
+ */
+#define	DELAYSTARTBLOCK		((xfs_fsblock_t)-1LL)
+#define	HOLESTARTBLOCK		((xfs_fsblock_t)-2LL)
+
+static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
+{
+	((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
+		(flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
+}
+
+/*
+ * Flags for xfs_bmap_add_extent*.
+ */
+#define BMAP_LEFT_CONTIG	(1 << 0)
+#define BMAP_RIGHT_CONTIG	(1 << 1)
+#define BMAP_LEFT_FILLING	(1 << 2)
+#define BMAP_RIGHT_FILLING	(1 << 3)
+#define BMAP_LEFT_DELAY		(1 << 4)
+#define BMAP_RIGHT_DELAY	(1 << 5)
+#define BMAP_LEFT_VALID		(1 << 6)
+#define BMAP_RIGHT_VALID	(1 << 7)
+#define BMAP_ATTRFORK		(1 << 8)
+
+#define XFS_BMAP_EXT_FLAGS \
+	{ BMAP_LEFT_CONTIG,	"LC" }, \
+	{ BMAP_RIGHT_CONTIG,	"RC" }, \
+	{ BMAP_LEFT_FILLING,	"LF" }, \
+	{ BMAP_RIGHT_FILLING,	"RF" }, \
+	{ BMAP_ATTRFORK,	"ATTR" }
+
+
+/*
+ * This macro is used to determine how many extents will be shifted
+ * in one write transaction. We could require two splits,
+ * an extent move on the first and an extent merge on the second,
+ * So it is proper that one extent is shifted inside write transaction
+ * at a time.
+ */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS	1
+
+enum shift_direction {
+	SHIFT_LEFT = 0,
+	SHIFT_RIGHT,
+};
+
+#ifdef DEBUG
+void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
+		int whichfork, unsigned long caller_ip);
+#define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
+	xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
+#else
+#define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
+#endif
+
+int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
+		struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
+int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+			int *committed);
+void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int	xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t *last_block, int whichfork);
+int	xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+		int whichfork);
+int	xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
+int	xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		int whichfork);
+int	xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+		int *nmap, int flags);
+int	xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
+		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+		int *nmap, int flags);
+int	xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+		xfs_fsblock_t *firstblock, xfs_extlen_t total,
+		struct xfs_bmbt_irec *mval, int *nmap,
+		struct xfs_bmap_free *flist);
+int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+		struct xfs_bmap_free *flist, int *done);
+int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
+		xfs_extnum_t num);
+uint	xfs_default_attroffset(struct xfs_inode *ip);
+int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+		int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+		struct xfs_bmap_free *flist, enum shift_direction direction,
+		int num_exts);
+int	xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+
+#endif	/* __XFS_BMAP_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap_btree.c b/kernel/fs/xfs/libxfs/xfs_bmap_btree.c
new file mode 100644
index 000000000..2c44c8e50
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -0,0 +1,883 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+
+/*
+ * Determine the extent state.
+ */
+/* ARGSUSED */
+STATIC xfs_exntst_t
+xfs_extent_state(
+	xfs_filblks_t		blks,
+	int			extent_flag)
+{
+	if (extent_flag) {
+		ASSERT(blks != 0);	/* saved for DMIG */
+		return XFS_EXT_UNWRITTEN;
+	}
+	return XFS_EXT_NORM;
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+void
+xfs_bmdr_to_bmbt(
+	struct xfs_inode	*ip,
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen,
+	struct xfs_btree_block	*rblock,
+	int			rblocklen)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			dmxr;
+	xfs_bmbt_key_t		*fkp;
+	__be64			*fpp;
+	xfs_bmbt_key_t		*tkp;
+	__be64			*tpp;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+
+	rblock->bb_level = dblock->bb_level;
+	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+	rblock->bb_numrecs = dblock->bb_numrecs;
+	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+	fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+	tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+	dmxr = be16_to_cpu(dblock->bb_numrecs);
+	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+/*
+ * Convert a compressed bmap extent record to an uncompressed form.
+ * This code must be in sync with the routines xfs_bmbt_get_startoff,
+ * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
+ */
+STATIC void
+__xfs_bmbt_get_all(
+		__uint64_t l0,
+		__uint64_t l1,
+		xfs_bmbt_irec_t *s)
+{
+	int	ext_flag;
+	xfs_exntst_t st;
+
+	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
+	s->br_startoff = ((xfs_fileoff_t)l0 &
+			   xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+	s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
+			   (((xfs_fsblock_t)l1) >> 21);
+	s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
+	/* This is xfs_extent_state() in-line */
+	if (ext_flag) {
+		ASSERT(s->br_blockcount != 0);	/* saved for DMIG */
+		st = XFS_EXT_UNWRITTEN;
+	} else
+		st = XFS_EXT_NORM;
+	s->br_state = st;
+}
+
+void
+xfs_bmbt_get_all(
+	xfs_bmbt_rec_host_t *r,
+	xfs_bmbt_irec_t *s)
+{
+	__xfs_bmbt_get_all(r->l0, r->l1, s);
+}
+
+/*
+ * Extract the blockcount field from an in memory bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+	xfs_bmbt_rec_host_t	*r)
+{
+	return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startblock field from an in memory bmap extent record.
+ */
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+	xfs_bmbt_rec_host_t	*r)
+{
+	return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
+	       (((xfs_fsblock_t)r->l1) >> 21);
+}
+
+/*
+ * Extract the startoff field from an in memory bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+	xfs_bmbt_rec_host_t	*r)
+{
+	return ((xfs_fileoff_t)r->l0 &
+		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+xfs_exntst_t
+xfs_bmbt_get_state(
+	xfs_bmbt_rec_host_t	*r)
+{
+	int	ext_flag;
+
+	ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
+	return xfs_extent_state(xfs_bmbt_get_blockcount(r),
+				ext_flag);
+}
+
+/*
+ * Extract the blockcount field from an on disk bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+	xfs_bmbt_rec_t	*r)
+{
+	return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startoff field from a disk format bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+	xfs_bmbt_rec_t	*r)
+{
+	return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+
+/*
+ * Set all the fields in a bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_set_allf(
+	xfs_bmbt_rec_host_t	*r,
+	xfs_fileoff_t		startoff,
+	xfs_fsblock_t		startblock,
+	xfs_filblks_t		blockcount,
+	xfs_exntst_t		state)
+{
+	int		extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+
+	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+	r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+		((xfs_bmbt_rec_base_t)startoff << 9) |
+		((xfs_bmbt_rec_base_t)startblock >> 43);
+	r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+		((xfs_bmbt_rec_base_t)blockcount &
+		(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+void
+xfs_bmbt_set_all(
+	xfs_bmbt_rec_host_t *r,
+	xfs_bmbt_irec_t	*s)
+{
+	xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
+			     s->br_blockcount, s->br_state);
+}
+
+
+/*
+ * Set all the fields in a disk format bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_disk_set_allf(
+	xfs_bmbt_rec_t		*r,
+	xfs_fileoff_t		startoff,
+	xfs_fsblock_t		startblock,
+	xfs_filblks_t		blockcount,
+	xfs_exntst_t		state)
+{
+	int			extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+	r->l0 = cpu_to_be64(
+		((xfs_bmbt_rec_base_t)extent_flag << 63) |
+		 ((xfs_bmbt_rec_base_t)startoff << 9) |
+		 ((xfs_bmbt_rec_base_t)startblock >> 43));
+	r->l1 = cpu_to_be64(
+		((xfs_bmbt_rec_base_t)startblock << 21) |
+		 ((xfs_bmbt_rec_base_t)blockcount &
+		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+STATIC void
+xfs_bmbt_disk_set_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
+				  s->br_blockcount, s->br_state);
+}
+
+/*
+ * Set the blockcount field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_blockcount(
+	xfs_bmbt_rec_host_t *r,
+	xfs_filblks_t	v)
+{
+	ASSERT((v & xfs_mask64hi(43)) == 0);
+	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
+		  (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
+}
+
+/*
+ * Set the startblock field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startblock(
+	xfs_bmbt_rec_host_t *r,
+	xfs_fsblock_t	v)
+{
+	ASSERT((v & xfs_mask64hi(12)) == 0);
+	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
+		  (xfs_bmbt_rec_base_t)(v >> 43);
+	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
+		  (xfs_bmbt_rec_base_t)(v << 21);
+}
+
+/*
+ * Set the startoff field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startoff(
+	xfs_bmbt_rec_host_t *r,
+	xfs_fileoff_t	v)
+{
+	ASSERT((v & xfs_mask64hi(9)) == 0);
+	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
+		((xfs_bmbt_rec_base_t)v << 9) |
+		  (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+}
+
+/*
+ * Set the extent state field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_state(
+	xfs_bmbt_rec_host_t *r,
+	xfs_exntst_t	v)
+{
+	ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
+	if (v == XFS_EXT_NORM)
+		r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
+	else
+		r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_bmbt_to_bmdr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*rblock,
+	int			rblocklen,
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen)
+{
+	int			dmxr;
+	xfs_bmbt_key_t		*fkp;
+	__be64			*fpp;
+	xfs_bmbt_key_t		*tkp;
+	__be64			*tpp;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+		ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+		ASSERT(rblock->bb_u.l.bb_blkno ==
+		       cpu_to_be64(XFS_BUF_DADDR_NULL));
+	} else
+		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+	ASSERT(rblock->bb_level != 0);
+	dblock->bb_level = rblock->bb_level;
+	dblock->bb_numrecs = rblock->bb_numrecs;
+	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+	fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+	tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+	fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+	tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+	dmxr = be16_to_cpu(dblock->bb_numrecs);
+	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+/*
+ * Check extent records, which have just been read, for
+ * any bit in the extent flag field. ASSERT on debug
+ * kernels, as this condition should not occur.
+ * Return an error condition (1) if any flags found,
+ * otherwise return 0.
+ */
+
+int
+xfs_check_nostate_extents(
+	xfs_ifork_t		*ifp,
+	xfs_extnum_t		idx,
+	xfs_extnum_t		num)
+{
+	for (; num > 0; num--, idx++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+		if ((ep->l0 >>
+		     (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
+			ASSERT(0);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_btree_cur	*new;
+
+	new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+
+	/*
+	 * Copy the firstblock, flist, and flags values,
+	 * since init cursor doesn't get them.
+	 */
+	new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+	new->bc_private.b.flist = cur->bc_private.b.flist;
+	new->bc_private.b.flags = cur->bc_private.b.flags;
+
+	return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+	struct xfs_btree_cur	*src,
+	struct xfs_btree_cur	*dst)
+{
+	ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+	       (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+	ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+
+	dst->bc_private.b.allocated += src->bc_private.b.allocated;
+	dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+
+	src->bc_private.b.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	xfs_alloc_arg_t		args;		/* block allocation args */
+	int			error;		/* error return value */
+
+	memset(&args, 0, sizeof(args));
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	args.fsbno = cur->bc_private.b.firstblock;
+	args.firstblock = args.fsbno;
+
+	if (args.fsbno == NULLFSBLOCK) {
+		args.fsbno = be64_to_cpu(start->l);
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		/*
+		 * Make sure there is sufficient room left in the AG to
+		 * complete a full tree split for an extent insert.  If
+		 * we are converting the middle part of an extent then
+		 * we may need space for two tree splits.
+		 *
+		 * We are relying on the caller to make the correct block
+		 * reservation for this operation to succeed.  If the
+		 * reservation amount is insufficient then we may fail a
+		 * block allocation here and corrupt the filesystem.
+		 */
+		args.minleft = xfs_trans_get_block_res(args.tp);
+	} else if (cur->bc_private.b.flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	}
+
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+	if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+		error = -ENOSPC;
+		goto error0;
+	}
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		goto error0;
+
+	if (args.fsbno == NULLFSBLOCK && args.minleft) {
+		/*
+		 * Could not find an AG with enough free space to satisfy
+		 * a full btree split.  Try again without minleft and if
+		 * successful activate the lowspace algorithm.
+		 */
+		args.fsbno = 0;
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+		args.minleft = 0;
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			goto error0;
+		cur->bc_private.b.flist->xbf_low = 1;
+	}
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	cur->bc_private.b.ip->i_d.di_nblocks++;
+	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+	xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+			XFS_TRANS_DQ_BCOUNT, 1L);
+
+	new->l = cpu_to_be64(args.fsbno);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+ error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_inode	*ip = cur->bc_private.b.ip;
+	struct xfs_trans	*tp = cur->bc_tp;
+	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+
+	xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+	ip->i_d.di_nblocks--;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(tp, bp);
+	return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp;
+
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+				    cur->bc_private.b.whichfork);
+
+		return xfs_bmbt_maxrecs(cur->bc_mp,
+					ifp->if_broot_bytes, level == 0) / 2;
+	}
+
+	return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp;
+
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+				    cur->bc_private.b.whichfork);
+
+		return xfs_bmbt_maxrecs(cur->bc_mp,
+					ifp->if_broot_bytes, level == 0);
+	}
+
+	return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it.  After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level != cur->bc_nlevels - 1)
+		return cur->bc_mp->m_bmap_dmxr[level != 0];
+	return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	key->bmbt.br_startoff =
+		cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_key(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(key->bmbt.br_startoff != 0);
+
+	xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
+			       0, 0, XFS_EXT_NORM);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	ptr->l = 0;
+}
+
+STATIC __int64_t
+xfs_bmbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+				      cur->bc_rec.b.br_startoff;
+}
+
+static bool
+xfs_bmbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	unsigned int		level;
+
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
+			return false;
+		/*
+		 * XXX: need a better way of verifying the owner here. Right now
+		 * just make sure there has been one set.
+		 */
+		if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_BMAP_MAGIC):
+		break;
+	default:
+		return false;
+	}
+
+	/*
+	 * numrecs and level verification.
+	 *
+	 * We don't know what fork we belong to, so just verify that the level
+	 * is less than the maximum of the two. Later checks will be more
+	 * precise.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+		return false;
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+		return false;
+
+	/* sibling pointer verification */
+	if (!block->bb_u.l.bb_leftsib ||
+	    (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
+	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
+		return false;
+	if (!block->bb_u.l.bb_rightsib ||
+	    (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
+	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
+		return false;
+
+	return true;
+}
+
+static void
+xfs_bmbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_btree_lblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_bmbt_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
+	}
+}
+
+static void
+xfs_bmbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_bmbt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+	xfs_btree_lblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+	.verify_read = xfs_bmbt_read_verify,
+	.verify_write = xfs_bmbt_write_verify,
+};
+
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_bmbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return be64_to_cpu(k1->bmbt.br_startoff) <
+		be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+		xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+		xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+#endif	/* DEBUG */
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+	.rec_len		= sizeof(xfs_bmbt_rec_t),
+	.key_len		= sizeof(xfs_bmbt_key_t),
+
+	.dup_cursor		= xfs_bmbt_dup_cursor,
+	.update_cursor		= xfs_bmbt_update_cursor,
+	.alloc_block		= xfs_bmbt_alloc_block,
+	.free_block		= xfs_bmbt_free_block,
+	.get_maxrecs		= xfs_bmbt_get_maxrecs,
+	.get_minrecs		= xfs_bmbt_get_minrecs,
+	.get_dmaxrecs		= xfs_bmbt_get_dmaxrecs,
+	.init_key_from_rec	= xfs_bmbt_init_key_from_rec,
+	.init_rec_from_key	= xfs_bmbt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_bmbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_bmbt_init_ptr_from_cur,
+	.key_diff		= xfs_bmbt_key_diff,
+	.buf_ops		= &xfs_bmbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_bmbt_keys_inorder,
+	.recs_inorder		= xfs_bmbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur *				/* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* inode owning the btree */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_btree_cur	*cur;
+
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+	cur->bc_btnum = XFS_BTNUM_BMAP;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+	cur->bc_ops = &xfs_bmbt_ops;
+	cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+	cur->bc_private.b.ip = ip;
+	cur->bc_private.b.firstblock = NULLFSBLOCK;
+	cur->bc_private.b.flist = NULL;
+	cur->bc_private.b.allocated = 0;
+	cur->bc_private.b.flags = 0;
+	cur->bc_private.b.whichfork = whichfork;
+
+	return cur;
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	int			leaf)
+{
+	blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+
+	if (leaf)
+		return blocklen / sizeof(xfs_bmbt_rec_t);
+	return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+	int			blocklen,
+	int			leaf)
+{
+	blocklen -= sizeof(xfs_bmdr_block_t);
+
+	if (leaf)
+		return blocklen / sizeof(xfs_bmdr_rec_t);
+	return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
+
+/*
+ * Change the owner of a btree format fork fo the inode passed in. Change it to
+ * the owner of that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether is needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_ino_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	ASSERT(tp || buffer_list);
+	ASSERT(!(tp && buffer_list));
+	if (whichfork == XFS_DATA_FORK)
+		ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+	else
+		ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+	cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+	if (!cur)
+		return -ENOMEM;
+
+	error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	return error;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap_btree.h b/kernel/fs/xfs/libxfs/xfs_bmap_btree.h
new file mode 100644
index 000000000..819a8a4de
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_BTREE_H__
+#define __XFS_BMAP_BTREE_H__
+
+struct xfs_btree_cur;
+struct xfs_btree_block;
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Extent state and extent format macros.
+ */
+#define XFS_EXTFMT_INODE(x)	\
+	(xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
+		XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
+#define ISUNWRITTEN(x)	((x)->br_state == XFS_EXT_UNWRITTEN)
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
+
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+	((xfs_bmbt_rec_t *) \
+		((char *)(block) + \
+		 XFS_BMBT_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+	((xfs_bmbt_key_t *) \
+		((char *)(block) + \
+		 XFS_BMBT_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+	((xfs_bmbt_ptr_t *) \
+		((char *)(block) + \
+		 XFS_BMBT_BLOCK_LEN(mp) + \
+		 (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+		 ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+
+#define XFS_BMDR_REC_ADDR(block, index) \
+	((xfs_bmdr_rec_t *) \
+		((char *)(block) + \
+		 sizeof(struct xfs_bmdr_block) + \
+	         ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+
+#define XFS_BMDR_KEY_ADDR(block, index) \
+	((xfs_bmdr_key_t *) \
+		((char *)(block) + \
+		 sizeof(struct xfs_bmdr_block) + \
+		 ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+	((xfs_bmdr_ptr_t *) \
+		((char *)(block) + \
+		 sizeof(struct xfs_bmdr_block) + \
+		 (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+		 ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
+
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
+
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+	(int)(XFS_BMBT_BLOCK_LEN(mp) + \
+	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+	(XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMDR_SPACE_CALC(nrecs) \
+	(int)(sizeof(xfs_bmdr_block_t) + \
+	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+	(XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
+
+/*
+ * Maximum number of bmap btree levels.
+ */
+#define XFS_BM_MAXLEVELS(mp,w)		((mp)->m_bm_maxlevels[(w)])
+
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
+			struct xfs_btree_block *, int);
+extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
+extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
+extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
+extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
+
+extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
+extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
+
+extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
+			xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
+extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
+extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
+extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
+
+extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
+			xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+			xfs_bmdr_block_t *, int);
+
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+				 int whichfork, xfs_ino_t new_owner,
+				 struct list_head *buffer_list);
+
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+		struct xfs_trans *, struct xfs_inode *, int);
+
+#endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_btree.c b/kernel/fs/xfs/libxfs/xfs_btree.c
new file mode 100644
index 000000000..c72283dd8
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_btree.c
@@ -0,0 +1,4067 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_alloc.h"
+
+/*
+ * Cursor allocation zone.
+ */
+kmem_zone_t	*xfs_btree_cur_zone;
+
+/*
+ * Btree magic numbers.
+ */
+static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+	  XFS_FIBT_MAGIC },
+	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
+};
+#define xfs_btree_magic(cur) \
+	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
+
+
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* btree long form block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer for block, if any */
+{
+	int			lblock_ok = 1; /* block passes checks */
+	struct xfs_mount	*mp;	/* file system mount point */
+
+	mp = cur->bc_mp;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		lblock_ok = lblock_ok &&
+			uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+			block->bb_u.l.bb_blkno == cpu_to_be64(
+				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+	}
+
+	lblock_ok = lblock_ok &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+		be16_to_cpu(block->bb_level) == level &&
+		be16_to_cpu(block->bb_numrecs) <=
+			cur->bc_ops->get_maxrecs(cur, level) &&
+		block->bb_u.l.bb_leftsib &&
+		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+		block->bb_u.l.bb_rightsib &&
+		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
+	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
+		if (bp)
+			trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+	return 0;
+}
+
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* btree short form block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer containing block */
+{
+	struct xfs_mount	*mp;	/* file system mount point */
+	struct xfs_buf		*agbp;	/* buffer for ag. freespace struct */
+	struct xfs_agf		*agf;	/* ag. freespace structure */
+	xfs_agblock_t		agflen;	/* native ag. freespace length */
+	int			sblock_ok = 1; /* block passes checks */
+
+	mp = cur->bc_mp;
+	agbp = cur->bc_private.a.agbp;
+	agf = XFS_BUF_TO_AGF(agbp);
+	agflen = be32_to_cpu(agf->agf_length);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		sblock_ok = sblock_ok &&
+			uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+			block->bb_u.s.bb_blkno == cpu_to_be64(
+				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+	}
+
+	sblock_ok = sblock_ok &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+		be16_to_cpu(block->bb_level) == level &&
+		be16_to_cpu(block->bb_numrecs) <=
+			cur->bc_ops->get_maxrecs(cur, level) &&
+		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+		block->bb_u.s.bb_leftsib &&
+		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+		block->bb_u.s.bb_rightsib;
+
+	if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
+			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
+		if (bp)
+			trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+	return 0;
+}
+
+/*
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer containing block, if any */
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_check_lblock(cur, block, level, bp);
+	else
+		return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_fsblock_t		bno,	/* btree block disk address */
+	int			level)	/* btree block level */
+{
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
+		level > 0 &&
+		bno != NULLFSBLOCK &&
+		XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* btree block disk address */
+	int			level)	/* btree block level */
+{
+	xfs_agblock_t		agblocks = cur->bc_mp->m_sb.sb_agblocks;
+
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
+		level > 0 &&
+		bno != NULLAGBLOCK &&
+		bno != 0 &&
+		bno < agblocks);
+	return 0;
+}
+
+/*
+ * Check that block ptr is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	union xfs_btree_ptr	*ptr,	/* btree block disk address */
+	int			index,	/* offset from ptr to check */
+	int			level)	/* btree block level */
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		return xfs_btree_check_lptr(cur,
+				be64_to_cpu((&ptr->l)[index]), level);
+	} else {
+		return xfs_btree_check_sptr(cur,
+				be32_to_cpu((&ptr->s)[index]), level);
+	}
+}
+#endif
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
+ */
+void
+xfs_btree_lblock_calc_crc(
+	struct xfs_buf		*bp)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return;
+	if (bip)
+		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+	struct xfs_buf		*bp)
+{
+	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+
+	return true;
+}
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
+ */
+void
+xfs_btree_sblock_calc_crc(
+	struct xfs_buf		*bp)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return;
+	if (bip)
+		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+	struct xfs_buf		*bp)
+{
+	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+
+	return true;
+}
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+	xfs_btree_cur_t	*cur,		/* btree cursor */
+	int		error)		/* del because of error */
+{
+	int		i;		/* btree level */
+
+	/*
+	 * Clear the buffer pointers, and release the buffers.
+	 * If we're doing this in the face of an error, we
+	 * need to make sure to inspect all of the entries
+	 * in the bc_bufs array for buffers to be unlocked.
+	 * This is because some of the btree code works from
+	 * level n down to 0, and if we get an error along
+	 * the way we won't have initialized all the entries
+	 * down to 0.
+	 */
+	for (i = 0; i < cur->bc_nlevels; i++) {
+		if (cur->bc_bufs[i])
+			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+		else if (!error)
+			break;
+	}
+	/*
+	 * Can't free a bmap cursor without having dealt with the
+	 * allocated indirect blocks' accounting.
+	 */
+	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
+	       cur->bc_private.b.allocated == 0);
+	/*
+	 * Free the cursor.
+	 */
+	kmem_zone_free(xfs_btree_cur_zone, cur);
+}
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int					/* error */
+xfs_btree_dup_cursor(
+	xfs_btree_cur_t	*cur,		/* input cursor */
+	xfs_btree_cur_t	**ncur)		/* output cursor */
+{
+	xfs_buf_t	*bp;		/* btree block's buffer pointer */
+	int		error;		/* error return value */
+	int		i;		/* level number of btree block */
+	xfs_mount_t	*mp;		/* mount structure for filesystem */
+	xfs_btree_cur_t	*new;		/* new cursor value */
+	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
+
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+
+	/*
+	 * Allocate a new cursor like the old one.
+	 */
+	new = cur->bc_ops->dup_cursor(cur);
+
+	/*
+	 * Copy the record currently in the cursor.
+	 */
+	new->bc_rec = cur->bc_rec;
+
+	/*
+	 * For each level current, re-get the buffer and copy the ptr value.
+	 */
+	for (i = 0; i < new->bc_nlevels; i++) {
+		new->bc_ptrs[i] = cur->bc_ptrs[i];
+		new->bc_ra[i] = cur->bc_ra[i];
+		bp = cur->bc_bufs[i];
+		if (bp) {
+			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+						   XFS_BUF_ADDR(bp), mp->m_bsize,
+						   0, &bp,
+						   cur->bc_ops->buf_ops);
+			if (error) {
+				xfs_btree_del_cursor(new, error);
+				*ncur = NULL;
+				return error;
+			}
+		}
+		new->bc_bufs[i] = bp;
+	}
+	*ncur = new;
+	return 0;
+}
+
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * The leaf record start with a header then followed by records containing
+ * the values.  A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ *		+--------+-------+-------+-------+-------+-------+-------+
+ * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ *		+--------+-------+-------+-------+-------+-------+-------+
+ *
+ *		+--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ *		+--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers.  The record and key structures are defined by the btree instances
+ * and opaque to the btree core.  The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ */
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+			return XFS_BTREE_LBLOCK_CRC_LEN;
+		return XFS_BTREE_LBLOCK_LEN;
+	}
+	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+		return XFS_BTREE_SBLOCK_CRC_LEN;
+	return XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+		sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+	struct xfs_btree_cur	*cur,
+	int			n)
+{
+	return xfs_btree_block_len(cur) +
+		(n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+	struct xfs_btree_cur	*cur,
+	int			n)
+{
+	return xfs_btree_block_len(cur) +
+		(n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	int			level)
+{
+	return xfs_btree_block_len(cur) +
+		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+		(n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	return (union xfs_btree_rec *)
+		((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	return (union xfs_btree_key *)
+		((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	int			level = xfs_btree_get_level(block);
+
+	ASSERT(block->bb_level != 0);
+
+	return (union xfs_btree_ptr *)
+		((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+/*
+ * Get the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_ifork        *ifp;
+
+       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+       return (struct xfs_btree_block *)ifp->if_broot;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be an inode btree root or from a buffer.
+ */
+STATIC struct xfs_btree_block *		/* generic btree block pointer */
+xfs_btree_get_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree */
+	struct xfs_buf		**bpp)	/* buffer containing the block */
+{
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1)) {
+		*bpp = NULL;
+		return xfs_btree_get_iroot(cur);
+	}
+
+	*bpp = cur->bc_bufs[level];
+	return XFS_BUF_TO_BLOCK(*bpp);
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+xfs_buf_t *				/* buffer for fsbno */
+xfs_btree_get_bufl(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	fsbno,		/* file system block number */
+	uint		lock)		/* lock flags for get_buf */
+{
+	xfs_daddr_t		d;		/* real disk block address */
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+xfs_buf_t *				/* buffer for agno/agbno */
+xfs_btree_get_bufs(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* allocation group block number */
+	uint		lock)		/* lock flags for get_buf */
+{
+	xfs_daddr_t		d;		/* real disk block address */
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+}
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int					/* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to check */
+{
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
+	else
+		return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+STATIC int				/* success=1, failure=0 */
+xfs_btree_firstrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty, there is no such record.
+	 */
+	if (!block->bb_numrecs)
+		return 0;
+	/*
+	 * Set the ptr value to 1, that's the first record/key.
+	 */
+	cur->bc_ptrs[level] = 1;
+	return 1;
+}
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level.  Other levels are unaffected.
+ */
+STATIC int				/* success=1, failure=0 */
+xfs_btree_lastrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty, there is no such record.
+	 */
+	if (!block->bb_numrecs)
+		return 0;
+	/*
+	 * Set the ptr value to numrecs, that's the last record/key.
+	 */
+	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+	return 1;
+}
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+	__int64_t	fields,		/* bitmask of fields */
+	const short	*offsets,	/* table of field offsets */
+	int		nbits,		/* number of bits to inspect */
+	int		*first,		/* output: first byte offset */
+	int		*last)		/* output: last byte offset */
+{
+	int		i;		/* current bit number */
+	__int64_t	imask;		/* mask for current bit number */
+
+	ASSERT(fields != 0);
+	/*
+	 * Find the lowest bit, so the first byte offset.
+	 */
+	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+		if (imask & fields) {
+			*first = offsets[i];
+			break;
+		}
+	}
+	/*
+	 * Find the highest bit, so the last byte offset.
+	 */
+	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+		if (imask & fields) {
+			*last = offsets[i + 1] - 1;
+			break;
+		}
+	}
+}
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int
+xfs_btree_read_bufl(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	uint			lock,		/* lock flags for read_buf */
+	struct xfs_buf		**bpp,		/* buffer for fsbno */
+	int			refval,		/* ref count value for buffer */
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;		/* return value */
+	xfs_daddr_t		d;		/* real disk block address */
+	int			error;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+				   mp->m_bsize, lock, &bp, ops);
+	if (error)
+		return error;
+	if (bp)
+		xfs_buf_set_ref(bp, refval);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+	struct xfs_mount	*mp,		/* file system mount point */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	xfs_extlen_t		count,		/* count of filesystem blocks */
+	const struct xfs_buf_ops *ops)
+{
+	xfs_daddr_t		d;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+	struct xfs_mount	*mp,		/* file system mount point */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_agblock_t		agbno,		/* allocation group block number */
+	xfs_extlen_t		count,		/* count of filesystem blocks */
+	const struct xfs_buf_ops *ops)
+{
+	xfs_daddr_t		d;
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+STATIC int
+xfs_btree_readahead_lblock(
+	struct xfs_btree_cur	*cur,
+	int			lr,
+	struct xfs_btree_block	*block)
+{
+	int			rval = 0;
+	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+				     cur->bc_ops->buf_ops);
+		rval++;
+	}
+
+	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+				     cur->bc_ops->buf_ops);
+		rval++;
+	}
+
+	return rval;
+}
+
+STATIC int
+xfs_btree_readahead_sblock(
+	struct xfs_btree_cur	*cur,
+	int			lr,
+	struct xfs_btree_block *block)
+{
+	int			rval = 0;
+	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+
+	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				     left, 1, cur->bc_ops->buf_ops);
+		rval++;
+	}
+
+	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				     right, 1, cur->bc_ops->buf_ops);
+		rval++;
+	}
+
+	return rval;
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+STATIC int
+xfs_btree_readahead(
+	struct xfs_btree_cur	*cur,		/* btree cursor */
+	int			lev,		/* level in btree */
+	int			lr)		/* left/right bits */
+{
+	struct xfs_btree_block	*block;
+
+	/*
+	 * No readahead needed if we are at the root level and the
+	 * btree root is stored in the inode.
+	 */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (lev == cur->bc_nlevels - 1))
+		return 0;
+
+	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+		return 0;
+
+	cur->bc_ra[lev] |= lr;
+	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_readahead_lblock(cur, lr, block);
+	return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
+
+		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+	} else {
+		ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+		ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+		return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+					be32_to_cpu(ptr->s));
+	}
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	xfs_extlen_t		count)
+{
+	xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+			  xfs_btree_ptr_to_daddr(cur, ptr),
+			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+STATIC void
+xfs_btree_setbuf(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	xfs_buf_t		*bp)	/* new buffer to set */
+{
+	struct xfs_btree_block	*b;	/* btree block */
+
+	if (cur->bc_bufs[lev])
+		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
+	cur->bc_bufs[lev] = bp;
+	cur->bc_ra[lev] = 0;
+
+	b = XFS_BUF_TO_BLOCK(bp);
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
+			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
+			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+	} else {
+		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
+			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+	}
+}
+
+STATIC int
+xfs_btree_ptr_is_null(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return ptr->l == cpu_to_be64(NULLFSBLOCK);
+	else
+		return ptr->s == cpu_to_be32(NULLAGBLOCK);
+}
+
+STATIC void
+xfs_btree_set_ptr_null(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		ptr->l = cpu_to_be64(NULLFSBLOCK);
+	else
+		ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+STATIC void
+xfs_btree_get_sibling(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	union xfs_btree_ptr	*ptr,
+	int			lr)
+{
+	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (lr == XFS_BB_RIGHTSIB)
+			ptr->l = block->bb_u.l.bb_rightsib;
+		else
+			ptr->l = block->bb_u.l.bb_leftsib;
+	} else {
+		if (lr == XFS_BB_RIGHTSIB)
+			ptr->s = block->bb_u.s.bb_rightsib;
+		else
+			ptr->s = block->bb_u.s.bb_leftsib;
+	}
+}
+
+STATIC void
+xfs_btree_set_sibling(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	union xfs_btree_ptr	*ptr,
+	int			lr)
+{
+	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (lr == XFS_BB_RIGHTSIB)
+			block->bb_u.l.bb_rightsib = ptr->l;
+		else
+			block->bb_u.l.bb_leftsib = ptr->l;
+	} else {
+		if (lr == XFS_BB_RIGHTSIB)
+			block->bb_u.s.bb_rightsib = ptr->s;
+		else
+			block->bb_u.s.bb_leftsib = ptr->s;
+	}
+}
+
+void
+xfs_btree_init_block_int(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*buf,
+	xfs_daddr_t		blkno,
+	__u32			magic,
+	__u16			level,
+	__u16			numrecs,
+	__u64			owner,
+	unsigned int		flags)
+{
+	buf->bb_magic = cpu_to_be32(magic);
+	buf->bb_level = cpu_to_be16(level);
+	buf->bb_numrecs = cpu_to_be16(numrecs);
+
+	if (flags & XFS_BTREE_LONG_PTRS) {
+		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+		if (flags & XFS_BTREE_CRC_BLOCKS) {
+			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+			buf->bb_u.l.bb_pad = 0;
+			buf->bb_u.l.bb_lsn = 0;
+		}
+	} else {
+		/* owner is a 32 bit value on short blocks */
+		__u32 __owner = (__u32)owner;
+
+		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+		if (flags & XFS_BTREE_CRC_BLOCKS) {
+			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+			buf->bb_u.s.bb_lsn = 0;
+		}
+	}
+}
+
+void
+xfs_btree_init_block(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	__u32		magic,
+	__u16		level,
+	__u16		numrecs,
+	__u64		owner,
+	unsigned int	flags)
+{
+	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+				 magic, level, numrecs, owner, flags);
+}
+
+STATIC void
+xfs_btree_init_block_cur(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			level,
+	int			numrecs)
+{
+	__u64 owner;
+
+	/*
+	 * we can pull the owner from the cursor right now as the different
+	 * owners align directly with the pointer size of the btree. This may
+	 * change in future, but is safe for current users of the generic btree
+	 * code.
+	 */
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		owner = cur->bc_private.b.ip->i_ino;
+	else
+		owner = cur->bc_private.a.agno;
+
+	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+				 xfs_btree_magic(cur), level, numrecs,
+				 owner, cur->bc_flags);
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updates to this record.  The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	int			level)
+{
+	union xfs_btree_ptr	ptr;
+
+	if (level > 0)
+		return 0;
+	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+		return 0;
+
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+	if (!xfs_btree_ptr_is_null(cur, &ptr))
+		return 0;
+	return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+					XFS_BUF_ADDR(bp)));
+	else {
+		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
+					XFS_BUF_ADDR(bp)));
+	}
+}
+
+STATIC void
+xfs_btree_set_refs(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	switch (cur->bc_btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
+		break;
+	case XFS_BTNUM_INO:
+	case XFS_BTNUM_FINO:
+		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
+		break;
+	case XFS_BTNUM_BMAP:
+		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+		break;
+	default:
+		ASSERT(0);
+	}
+}
+
+STATIC int
+xfs_btree_get_buf_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			flags,
+	struct xfs_btree_block	**block,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	xfs_daddr_t		d;
+
+	/* need to sort out how callers deal with failures first */
+	ASSERT(!(flags & XBF_TRYLOCK));
+
+	d = xfs_btree_ptr_to_daddr(cur, ptr);
+	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
+				 mp->m_bsize, flags);
+
+	if (!*bpp)
+		return -ENOMEM;
+
+	(*bpp)->b_ops = cur->bc_ops->buf_ops;
+	*block = XFS_BUF_TO_BLOCK(*bpp);
+	return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			flags,
+	struct xfs_btree_block	**block,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	xfs_daddr_t		d;
+	int			error;
+
+	/* need to sort out how callers deal with failures first */
+	ASSERT(!(flags & XBF_TRYLOCK));
+
+	d = xfs_btree_ptr_to_daddr(cur, ptr);
+	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+				   mp->m_bsize, flags, bpp,
+				   cur->bc_ops->buf_ops);
+	if (error)
+		return error;
+
+	xfs_btree_set_refs(cur, *bpp);
+	*block = XFS_BUF_TO_BLOCK(*bpp);
+	return 0;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*dst_key,
+	union xfs_btree_key	*src_key,
+	int			numkeys)
+{
+	ASSERT(numkeys >= 0);
+	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*dst_rec,
+	union xfs_btree_rec	*src_rec,
+	int			numrecs)
+{
+	ASSERT(numrecs >= 0);
+	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*dst_ptr,
+	union xfs_btree_ptr	*src_ptr,
+	int			numptrs)
+{
+	ASSERT(numptrs >= 0);
+	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key,
+	int			dir,
+	int			numkeys)
+{
+	char			*dst_key;
+
+	ASSERT(numkeys >= 0);
+	ASSERT(dir == 1 || dir == -1);
+
+	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec,
+	int			dir,
+	int			numrecs)
+{
+	char			*dst_rec;
+
+	ASSERT(numrecs >= 0);
+	ASSERT(dir == 1 || dir == -1);
+
+	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			dir,
+	int			numptrs)
+{
+	char			*dst_ptr;
+
+	ASSERT(numptrs >= 0);
+	ASSERT(dir == 1 || dir == -1);
+
+	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			first,
+	int			last)
+{
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+	if (bp) {
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+		xfs_trans_log_buf(cur->bc_tp, bp,
+				  xfs_btree_key_offset(cur, first),
+				  xfs_btree_key_offset(cur, last + 1) - 1);
+	} else {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			first,
+	int			last)
+{
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+	xfs_trans_log_buf(cur->bc_tp, bp,
+			  xfs_btree_rec_offset(cur, first),
+			  xfs_btree_rec_offset(cur, last + 1) - 1);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_buf		*bp,	/* buffer containing btree block */
+	int			first,	/* index of first pointer to log */
+	int			last)	/* index of last pointer to log */
+{
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+	if (bp) {
+		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+		int			level = xfs_btree_get_level(block);
+
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+		xfs_trans_log_buf(cur->bc_tp, bp,
+				xfs_btree_ptr_offset(cur, first, level),
+				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+	} else {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_buf		*bp,	/* buffer containing btree block */
+	int			fields)	/* mask of fields: XFS_BB_... */
+{
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	static const short	soffsets[] = {	/* table of offsets (short) */
+		offsetof(struct xfs_btree_block, bb_magic),
+		offsetof(struct xfs_btree_block, bb_level),
+		offsetof(struct xfs_btree_block, bb_numrecs),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+		XFS_BTREE_SBLOCK_CRC_LEN
+	};
+	static const short	loffsets[] = {	/* table of offsets (long) */
+		offsetof(struct xfs_btree_block, bb_magic),
+		offsetof(struct xfs_btree_block, bb_level),
+		offsetof(struct xfs_btree_block, bb_numrecs),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+		XFS_BTREE_LBLOCK_CRC_LEN
+	};
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+	if (bp) {
+		int nbits;
+
+		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+			/*
+			 * We don't log the CRC when updating a btree
+			 * block but instead recreate it during log
+			 * recovery.  As the log buffers have checksums
+			 * of their own this is safe and avoids logging a crc
+			 * update in a lot of places.
+			 */
+			if (fields == XFS_BB_ALL_BITS)
+				fields = XFS_BB_ALL_BITS_CRC;
+			nbits = XFS_BB_NUM_BITS_CRC;
+		} else {
+			nbits = XFS_BB_NUM_BITS;
+		}
+		xfs_btree_offsets(fields,
+				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+					loffsets : soffsets,
+				  nbits, &first, &last);
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+	} else {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int						/* error */
+xfs_btree_increment(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	struct xfs_btree_block	*block;
+	union xfs_btree_ptr	ptr;
+	struct xfs_buf		*bp;
+	int			error;		/* error return value */
+	int			lev;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	ASSERT(level < cur->bc_nlevels);
+
+	/* Read-ahead to the right at this level. */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+	/* Get a pointer to the btree block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/* We're done if we remain in the block after the increment. */
+	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+		goto out1;
+
+	/* Fail if we just went off the right edge of the tree. */
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+	if (xfs_btree_ptr_is_null(cur, &ptr))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, increment);
+
+	/*
+	 * March up the tree incrementing pointers.
+	 * Stop when we don't go off the right edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+		error = xfs_btree_check_block(cur, block, lev, bp);
+		if (error)
+			goto error0;
+#endif
+
+		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+			break;
+
+		/* Read-ahead the right block for the next loop. */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+
+	/*
+	 * If we went off the root then we are either seriously
+	 * confused or have the tree root in an inode.
+	 */
+	if (lev == cur->bc_nlevels) {
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+			goto out0;
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+		goto error0;
+	}
+	ASSERT(lev < cur->bc_nlevels);
+
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+		union xfs_btree_ptr	*ptrp;
+
+		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+		--lev;
+		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+		if (error)
+			goto error0;
+
+		xfs_btree_setbuf(cur, lev, bp);
+		cur->bc_ptrs[lev] = 1;
+	}
+out1:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int						/* error */
+xfs_btree_decrement(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	struct xfs_btree_block	*block;
+	xfs_buf_t		*bp;
+	int			error;		/* error return value */
+	int			lev;
+	union xfs_btree_ptr	ptr;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	ASSERT(level < cur->bc_nlevels);
+
+	/* Read-ahead to the left at this level. */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+	/* We're done if we remain in the block after the decrement. */
+	if (--cur->bc_ptrs[level] > 0)
+		goto out1;
+
+	/* Get a pointer to the btree block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/* Fail if we just went off the left edge of the tree. */
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+	if (xfs_btree_ptr_is_null(cur, &ptr))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, decrement);
+
+	/*
+	 * March up the tree decrementing pointers.
+	 * Stop when we don't go off the left edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		/* Read-ahead the left block for the next loop. */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+
+	/*
+	 * If we went off the root then we are seriously confused.
+	 * or the root of the tree is in an inode.
+	 */
+	if (lev == cur->bc_nlevels) {
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+			goto out0;
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+		goto error0;
+	}
+	ASSERT(lev < cur->bc_nlevels);
+
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+		union xfs_btree_ptr	*ptrp;
+
+		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+		--lev;
+		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+		if (error)
+			goto error0;
+		xfs_btree_setbuf(cur, lev, bp);
+		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+	}
+out1:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+STATIC int
+xfs_btree_lookup_get_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in the btree */
+	union xfs_btree_ptr	*pp,	/* ptr to btree block */
+	struct xfs_btree_block	**blkp) /* return btree block */
+{
+	struct xfs_buf		*bp;	/* buffer pointer for btree block */
+	int			error = 0;
+
+	/* special case the root block if in an inode */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1)) {
+		*blkp = xfs_btree_get_iroot(cur);
+		return 0;
+	}
+
+	/*
+	 * If the old buffer at this level for the disk address we are
+	 * looking for re-use it.
+	 *
+	 * Otherwise throw it away and get a new one.
+	 */
+	bp = cur->bc_bufs[level];
+	if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+		*blkp = XFS_BUF_TO_BLOCK(bp);
+		return 0;
+	}
+
+	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
+	if (error)
+		return error;
+
+	xfs_btree_setbuf(cur, level, bp);
+	return 0;
+}
+
+/*
+ * Get current search key.  For level 0 we don't actually have a key
+ * structure so we make one up from the record.  For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			keyno,
+	struct xfs_btree_block	*block,
+	union xfs_btree_key	*kp)
+{
+	if (level == 0) {
+		cur->bc_ops->init_key_from_rec(kp,
+				xfs_btree_rec_addr(cur, keyno, block));
+		return kp;
+	}
+
+	return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * stat is set to 0 if can't find any such record, 1 for success.
+ */
+int					/* error */
+xfs_btree_lookup(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_lookup_t		dir,	/* <=, ==, or >= */
+	int			*stat)	/* success/failure */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	__int64_t		diff;	/* difference for the current key */
+	int			error;	/* error return value */
+	int			keyno;	/* current key number */
+	int			level;	/* level in the btree */
+	union xfs_btree_ptr	*pp;	/* ptr to btree block */
+	union xfs_btree_ptr	ptr;	/* ptr to btree block */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, dir);
+
+	XFS_BTREE_STATS_INC(cur, lookup);
+
+	block = NULL;
+	keyno = 0;
+
+	/* initialise start pointer from cursor */
+	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+	pp = &ptr;
+
+	/*
+	 * Iterate over each level in the btree, starting at the root.
+	 * For each level above the leaves, find the key we need, based
+	 * on the lookup record, then follow the corresponding block
+	 * pointer down to the next level.
+	 */
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		/* Get the block we need to do the lookup on. */
+		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+		if (error)
+			goto error0;
+
+		if (diff == 0) {
+			/*
+			 * If we already had a key match at a higher level, we
+			 * know we need to use the first entry in this block.
+			 */
+			keyno = 1;
+		} else {
+			/* Otherwise search this block. Do a binary search. */
+
+			int	high;	/* high entry number */
+			int	low;	/* low entry number */
+
+			/* Set low and high entry numbers, 1-based. */
+			low = 1;
+			high = xfs_btree_get_numrecs(block);
+			if (!high) {
+				/* Block is empty, must be an empty leaf. */
+				ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+				*stat = 0;
+				return 0;
+			}
+
+			/* Binary search the block. */
+			while (low <= high) {
+				union xfs_btree_key	key;
+				union xfs_btree_key	*kp;
+
+				XFS_BTREE_STATS_INC(cur, compare);
+
+				/* keyno is average of low and high. */
+				keyno = (low + high) >> 1;
+
+				/* Get current search key */
+				kp = xfs_lookup_get_search_key(cur, level,
+						keyno, block, &key);
+
+				/*
+				 * Compute difference to get next direction:
+				 *  - less than, move right
+				 *  - greater than, move left
+				 *  - equal, we're done
+				 */
+				diff = cur->bc_ops->key_diff(cur, kp);
+				if (diff < 0)
+					low = keyno + 1;
+				else if (diff > 0)
+					high = keyno - 1;
+				else
+					break;
+			}
+		}
+
+		/*
+		 * If there are more levels, set up for the next level
+		 * by getting the block number and filling in the cursor.
+		 */
+		if (level > 0) {
+			/*
+			 * If we moved left, need the previous key number,
+			 * unless there isn't one.
+			 */
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+			error = xfs_btree_check_ptr(cur, pp, 0, level);
+			if (error)
+				goto error0;
+#endif
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+
+	/* Done with the search. See if we need to adjust the results. */
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If ge search and we went off the end of the block, but it's
+		 * not the last block, we're in the wrong block.
+		 */
+		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+		if (dir == XFS_LOOKUP_GE &&
+		    keyno > xfs_btree_get_numrecs(block) &&
+		    !xfs_btree_ptr_is_null(cur, &ptr)) {
+			int	i;
+
+			cur->bc_ptrs[0] = keyno;
+			error = xfs_btree_increment(cur, 0, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+			XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+			*stat = 1;
+			return 0;
+		}
+	} else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+
+	/* Return if we succeeded or not. */
+	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+		*stat = 0;
+	else if (dir != XFS_LOOKUP_EQ || diff == 0)
+		*stat = 1;
+	else
+		*stat = 0;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*keyp,
+	int			level)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+	union xfs_btree_key	*kp;
+	int			ptr;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+	/*
+	 * Go up the tree from this level toward the root.
+	 * At each level, update the key value to the value input.
+	 * Stop when we reach a level where the cursor isn't pointing
+	 * at the first entry in the block.
+	 */
+	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+		int		error;
+#endif
+		block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+		error = xfs_btree_check_block(cur, block, level, bp);
+		if (error) {
+			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+			return error;
+		}
+#endif
+		ptr = cur->bc_ptrs[level];
+		kp = xfs_btree_key_addr(cur, ptr, block);
+		xfs_btree_copy_keys(cur, kp, keyp, 1);
+		xfs_btree_log_keys(cur, bp, ptr, ptr);
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+	int			error;
+	int			ptr;
+	union xfs_btree_rec	*rp;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGR(cur, rec);
+
+	/* Pick up the current block. */
+	block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, 0, bp);
+	if (error)
+		goto error0;
+#endif
+	/* Get the address of the rec to be updated. */
+	ptr = cur->bc_ptrs[0];
+	rp = xfs_btree_rec_addr(cur, ptr, block);
+
+	/* Fill in the new contents and log them. */
+	xfs_btree_copy_recs(cur, rp, rec, 1);
+	xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+	/*
+	 * If we are tracking the last record in the tree and
+	 * we are at the far right edge of the tree, update it.
+	 */
+	if (xfs_btree_is_lastrec(cur, block, 0)) {
+		cur->bc_ops->update_lastrec(cur, block, rec,
+					    ptr, LASTREC_UPDATE);
+	}
+
+	/* Updating first rec in leaf. Pass new key value up to our parent. */
+	if (ptr == 1) {
+		union xfs_btree_key	key;
+
+		cur->bc_ops->init_key_from_rec(&key, rec);
+		error = xfs_btree_updkey(cur, &key, 1);
+		if (error)
+			goto error0;
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int					/* error */
+xfs_btree_lshift(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	union xfs_btree_key	key;		/* btree key */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	int			lrecs;		/* left record count */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	int			rrecs;		/* right record count */
+	union xfs_btree_ptr	lptr;		/* left btree pointer */
+	union xfs_btree_key	*rkp = NULL;	/* right btree key */
+	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
+	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
+	int			error;		/* error return value */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    level == cur->bc_nlevels - 1)
+		goto out0;
+
+	/* Set up variables for this block as "right". */
+	right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, right, level, rbp);
+	if (error)
+		goto error0;
+#endif
+
+	/* If we've got no left sibling then we can't shift an entry left. */
+	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+	if (xfs_btree_ptr_is_null(cur, &lptr))
+		goto out0;
+
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] <= 1)
+		goto out0;
+
+	/* Set up the left neighbor as "left". */
+	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+	if (error)
+		goto error0;
+
+	/* If it's full, it can't take another entry. */
+	lrecs = xfs_btree_get_numrecs(left);
+	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+		goto out0;
+
+	rrecs = xfs_btree_get_numrecs(right);
+
+	/*
+	 * We add one entry to the left side and remove one for the right side.
+	 * Account for it here, the changes will be updated on disk and logged
+	 * later.
+	 */
+	lrecs++;
+	rrecs--;
+
+	XFS_BTREE_STATS_INC(cur, lshift);
+	XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+	/*
+	 * If non-leaf, copy a key and a ptr to the left block.
+	 * Log the changes to the left block.
+	 */
+	if (level > 0) {
+		/* It's a non-leaf.  Move keys and pointers. */
+		union xfs_btree_key	*lkp;	/* left btree key */
+		union xfs_btree_ptr	*lpp;	/* left address pointer */
+
+		lkp = xfs_btree_key_addr(cur, lrecs, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+
+		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+		error = xfs_btree_check_ptr(cur, rpp, 0, level);
+		if (error)
+			goto error0;
+#endif
+		xfs_btree_copy_keys(cur, lkp, rkp, 1);
+		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+		ASSERT(cur->bc_ops->keys_inorder(cur,
+			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+	} else {
+		/* It's a leaf.  Move records.  */
+		union xfs_btree_rec	*lrp;	/* left record pointer */
+
+		lrp = xfs_btree_rec_addr(cur, lrecs, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_copy_recs(cur, lrp, rrp, 1);
+		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+		ASSERT(cur->bc_ops->recs_inorder(cur,
+			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+	}
+
+	xfs_btree_set_numrecs(left, lrecs);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+	xfs_btree_set_numrecs(right, rrecs);
+	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+	/*
+	 * Slide the contents of right down one entry.
+	 */
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+	if (level > 0) {
+		/* It's a nonleaf. operate on keys and ptrs */
+#ifdef DEBUG
+		int			i;		/* loop index */
+
+		for (i = 0; i < rrecs; i++) {
+			error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+			if (error)
+				goto error0;
+		}
+#endif
+		xfs_btree_shift_keys(cur,
+				xfs_btree_key_addr(cur, 2, right),
+				-1, rrecs);
+		xfs_btree_shift_ptrs(cur,
+				xfs_btree_ptr_addr(cur, 2, right),
+				-1, rrecs);
+
+		xfs_btree_log_keys(cur, rbp, 1, rrecs);
+		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+	} else {
+		/* It's a leaf. operate on records */
+		xfs_btree_shift_recs(cur,
+			xfs_btree_rec_addr(cur, 2, right),
+			-1, rrecs);
+		xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+		/*
+		 * If it's the first record in the block, we'll need a key
+		 * structure to pass up to the next level (updkey).
+		 */
+		cur->bc_ops->init_key_from_rec(&key,
+			xfs_btree_rec_addr(cur, 1, right));
+		rkp = &key;
+	}
+
+	/* Update the parent key values of right. */
+	error = xfs_btree_updkey(cur, rkp, level + 1);
+	if (error)
+		goto error0;
+
+	/* Slide the cursor value left one. */
+	cur->bc_ptrs[level]--;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int					/* error */
+xfs_btree_rshift(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	union xfs_btree_key	key;		/* btree key */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
+	union xfs_btree_ptr	rptr;		/* right block pointer */
+	union xfs_btree_key	*rkp;		/* right btree key */
+	int			rrecs;		/* right record count */
+	int			lrecs;		/* left record count */
+	int			error;		/* error return value */
+	int			i;		/* loop counter */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1))
+		goto out0;
+
+	/* Set up variables for this block as "left". */
+	left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, left, level, lbp);
+	if (error)
+		goto error0;
+#endif
+
+	/* If we've got no right sibling then we can't shift an entry right. */
+	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+	if (xfs_btree_ptr_is_null(cur, &rptr))
+		goto out0;
+
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	lrecs = xfs_btree_get_numrecs(left);
+	if (cur->bc_ptrs[level] >= lrecs)
+		goto out0;
+
+	/* Set up the right neighbor as "right". */
+	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+	if (error)
+		goto error0;
+
+	/* If it's full, it can't take another entry. */
+	rrecs = xfs_btree_get_numrecs(right);
+	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, rshift);
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+	/*
+	 * Make a hole at the start of the right neighbor block, then
+	 * copy the last left block entry to the hole.
+	 */
+	if (level > 0) {
+		/* It's a nonleaf. make a hole in the keys and ptrs */
+		union xfs_btree_key	*lkp;
+		union xfs_btree_ptr	*lpp;
+		union xfs_btree_ptr	*rpp;
+
+		lkp = xfs_btree_key_addr(cur, lrecs, left);
+		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+		for (i = rrecs - 1; i >= 0; i--) {
+			error = xfs_btree_check_ptr(cur, rpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+
+		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+#ifdef DEBUG
+		error = xfs_btree_check_ptr(cur, lpp, 0, level);
+		if (error)
+			goto error0;
+#endif
+
+		/* Now put the new data in, and log it. */
+		xfs_btree_copy_keys(cur, rkp, lkp, 1);
+		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+			xfs_btree_key_addr(cur, 2, right)));
+	} else {
+		/* It's a leaf. make a hole in the records */
+		union xfs_btree_rec	*lrp;
+		union xfs_btree_rec	*rrp;
+
+		lrp = xfs_btree_rec_addr(cur, lrecs, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+		/* Now put the new data in, and log it. */
+		xfs_btree_copy_recs(cur, rrp, lrp, 1);
+		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+
+		cur->bc_ops->init_key_from_rec(&key, rrp);
+		rkp = &key;
+
+		ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+			xfs_btree_rec_addr(cur, 2, right)));
+	}
+
+	/*
+	 * Decrement and log left's numrecs, bump and log right's numrecs.
+	 */
+	xfs_btree_set_numrecs(left, --lrecs);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+	xfs_btree_set_numrecs(right, ++rrecs);
+	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+	/*
+	 * Using a temporary cursor, update the parent key values of the
+	 * block on the right.
+	 */
+	error = xfs_btree_dup_cursor(cur, &tcur);
+	if (error)
+		goto error0;
+	i = xfs_btree_lastrec(tcur, level);
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+
+	error = xfs_btree_increment(tcur, level, &i);
+	if (error)
+		goto error1;
+
+	error = xfs_btree_updkey(tcur, rkp, level + 1);
+	if (error)
+		goto error1;
+
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+
+error1:
+	XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int					/* error */
+__xfs_btree_split(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	union xfs_btree_ptr	*ptrp,
+	union xfs_btree_key	*key,
+	struct xfs_btree_cur	**curp,
+	int			*stat)		/* success/failure */
+{
+	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
+	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
+	struct xfs_btree_block	*rrblock;	/* right-right btree block */
+	int			lrecs;
+	int			rrecs;
+	int			src_index;
+	int			error;		/* error return value */
+#ifdef DEBUG
+	int			i;
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+
+	XFS_BTREE_STATS_INC(cur, split);
+
+	/* Set up left block (current one). */
+	left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, left, level, lbp);
+	if (error)
+		goto error0;
+#endif
+
+	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+	/* Allocate the new block. If we can't do it, we're toast. Give up. */
+	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+	if (error)
+		goto error0;
+	if (*stat == 0)
+		goto out0;
+	XFS_BTREE_STATS_INC(cur, alloc);
+
+	/* Set up the new block as "right". */
+	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+	if (error)
+		goto error0;
+
+	/* Fill in the btree header for the new right block. */
+	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
+
+	/*
+	 * Split the entries between the old and the new block evenly.
+	 * Make sure that if there's an odd number of entries now, that
+	 * each new block will have the same number of entries.
+	 */
+	lrecs = xfs_btree_get_numrecs(left);
+	rrecs = lrecs / 2;
+	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+		rrecs++;
+	src_index = (lrecs - rrecs + 1);
+
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+	/*
+	 * Copy btree block entries from the left block over to the
+	 * new block, the right. Update the right block and log the
+	 * changes.
+	 */
+	if (level > 0) {
+		/* It's a non-leaf.  Move keys and pointers. */
+		union xfs_btree_key	*lkp;	/* left btree key */
+		union xfs_btree_ptr	*lpp;	/* left address pointer */
+		union xfs_btree_key	*rkp;	/* right btree key */
+		union xfs_btree_ptr	*rpp;	/* right address pointer */
+
+		lkp = xfs_btree_key_addr(cur, src_index, left);
+		lpp = xfs_btree_ptr_addr(cur, src_index, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+		for (i = src_index; i < rrecs; i++) {
+			error = xfs_btree_check_ptr(cur, lpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+
+		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+		xfs_btree_log_keys(cur, rbp, 1, rrecs);
+		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+		/* Grab the keys to the entries moved to the right block */
+		xfs_btree_copy_keys(cur, key, rkp, 1);
+	} else {
+		/* It's a leaf.  Move records.  */
+		union xfs_btree_rec	*lrp;	/* left record pointer */
+		union xfs_btree_rec	*rrp;	/* right record pointer */
+
+		lrp = xfs_btree_rec_addr(cur, src_index, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+		xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+		cur->bc_ops->init_key_from_rec(key,
+			xfs_btree_rec_addr(cur, 1, right));
+	}
+
+
+	/*
+	 * Find the left block number by looking in the buffer.
+	 * Adjust numrecs, sibling pointers.
+	 */
+	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+	lrecs -= rrecs;
+	xfs_btree_set_numrecs(left, lrecs);
+	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+	/*
+	 * If there's a block to the new block's right, make that block
+	 * point back to right instead of to left.
+	 */
+	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+		error = xfs_btree_read_buf_block(cur, &rrptr,
+							0, &rrblock, &rrbp);
+		if (error)
+			goto error0;
+		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * If the cursor is really in the right block, move it there.
+	 * If it's just pointing past the last entry in left, then we'll
+	 * insert there, so don't change anything in that case.
+	 */
+	if (cur->bc_ptrs[level] > lrecs + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= lrecs;
+	}
+	/*
+	 * If there are more levels, we'll need another cursor which refers
+	 * the right block, no matter where this cursor was.
+	 */
+	if (level + 1 < cur->bc_nlevels) {
+		error = xfs_btree_dup_cursor(cur, curp);
+		if (error)
+			goto error0;
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*ptrp = rptr;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+struct xfs_btree_split_args {
+	struct xfs_btree_cur	*cur;
+	int			level;
+	union xfs_btree_ptr	*ptrp;
+	union xfs_btree_key	*key;
+	struct xfs_btree_cur	**curp;
+	int			*stat;		/* success/failure */
+	int			result;
+	bool			kswapd;	/* allocation in kswapd context */
+	struct completion	*done;
+	struct work_struct	work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+	struct work_struct	*work)
+{
+	struct xfs_btree_split_args	*args = container_of(work,
+						struct xfs_btree_split_args, work);
+	unsigned long		pflags;
+	unsigned long		new_pflags = PF_FSTRANS;
+
+	/*
+	 * we are in a transaction context here, but may also be doing work
+	 * in kswapd context, and hence we may need to inherit that state
+	 * temporarily to ensure that we don't block waiting for memory reclaim
+	 * in any way.
+	 */
+	if (args->kswapd)
+		new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+	current_set_flags_nested(&pflags, new_pflags);
+
+	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+					 args->key, args->curp, args->stat);
+	complete(args->done);
+
+	current_restore_flags_nested(&pflags, new_pflags);
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
+ */
+STATIC int					/* error */
+xfs_btree_split(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	union xfs_btree_ptr	*ptrp,
+	union xfs_btree_key	*key,
+	struct xfs_btree_cur	**curp,
+	int			*stat)		/* success/failure */
+{
+	struct xfs_btree_split_args	args;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	if (cur->bc_btnum != XFS_BTNUM_BMAP)
+		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+	args.cur = cur;
+	args.level = level;
+	args.ptrp = ptrp;
+	args.key = key;
+	args.curp = curp;
+	args.stat = stat;
+	args.done = &done;
+	args.kswapd = current_is_kswapd();
+	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+	queue_work(xfs_alloc_wq, &args.work);
+	wait_for_completion(&done);
+	destroy_work_on_stack(&args.work);
+	return args.result;
+}
+
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int						/* error */
+xfs_btree_new_iroot(
+	struct xfs_btree_cur	*cur,		/* btree cursor */
+	int			*logflags,	/* logging flags for inode */
+	int			*stat)		/* return status - 0 fail */
+{
+	struct xfs_buf		*cbp;		/* buffer for cblock */
+	struct xfs_btree_block	*block;		/* btree block */
+	struct xfs_btree_block	*cblock;	/* child btree block */
+	union xfs_btree_key	*ckp;		/* child key pointer */
+	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
+	union xfs_btree_key	*kp;		/* pointer to btree key */
+	union xfs_btree_ptr	*pp;		/* pointer to block addr */
+	union xfs_btree_ptr	nptr;		/* new block addr */
+	int			level;		/* btree level */
+	int			error;		/* error return code */
+#ifdef DEBUG
+	int			i;		/* loop counter */
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, newroot);
+
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+	level = cur->bc_nlevels - 1;
+
+	block = xfs_btree_get_iroot(cur);
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+
+	/* Allocate the new block. If we can't do it, we're toast. Give up. */
+	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+	if (error)
+		goto error0;
+	if (*stat == 0) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		return 0;
+	}
+	XFS_BTREE_STATS_INC(cur, alloc);
+
+	/* Copy the root into a real block. */
+	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+	if (error)
+		goto error0;
+
+	/*
+	 * we can't just memcpy() the root in for CRC enabled btree blocks.
+	 * In that case have to also ensure the blkno remains correct
+	 */
+	memcpy(cblock, block, xfs_btree_block_len(cur));
+	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+		else
+			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+	}
+
+	be16_add_cpu(&block->bb_level, 1);
+	xfs_btree_set_numrecs(block, 1);
+	cur->bc_nlevels++;
+	cur->bc_ptrs[level + 1] = 1;
+
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+		error = xfs_btree_check_ptr(cur, pp, i, level);
+		if (error)
+			goto error0;
+	}
+#endif
+	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+	error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+	if (error)
+		goto error0;
+#endif
+	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
+	xfs_iroot_realloc(cur->bc_private.b.ip,
+			  1 - xfs_btree_get_numrecs(cblock),
+			  cur->bc_private.b.whichfork);
+
+	xfs_btree_setbuf(cur, level, cbp);
+
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+	*logflags |=
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+	*stat = 1;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int				/* error */
+xfs_btree_new_root(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat)	/* success/failure */
+{
+	struct xfs_btree_block	*block;	/* one half of the old root block */
+	struct xfs_buf		*bp;	/* buffer containing block */
+	int			error;	/* error return value */
+	struct xfs_buf		*lbp;	/* left buffer pointer */
+	struct xfs_btree_block	*left;	/* left btree block */
+	struct xfs_buf		*nbp;	/* new (root) buffer */
+	struct xfs_btree_block	*new;	/* new (root) btree block */
+	int			nptr;	/* new value for key index, 1 or 2 */
+	struct xfs_buf		*rbp;	/* right buffer pointer */
+	struct xfs_btree_block	*right;	/* right btree block */
+	union xfs_btree_ptr	rptr;
+	union xfs_btree_ptr	lptr;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, newroot);
+
+	/* initialise our start point from the cursor */
+	cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+	/* Allocate the new block. If we can't do it, we're toast. Give up. */
+	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+	if (error)
+		goto error0;
+	if (*stat == 0)
+		goto out0;
+	XFS_BTREE_STATS_INC(cur, alloc);
+
+	/* Set up the new block. */
+	error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+	if (error)
+		goto error0;
+
+	/* Set the root in the holding structure  increasing the level by 1. */
+	cur->bc_ops->set_root(cur, &lptr, 1);
+
+	/*
+	 * At the previous root level there are now two blocks: the old root,
+	 * and the new block generated when it was split.  We don't know which
+	 * one the cursor is pointing at, so we set up variables "left" and
+	 * "right" for each case.
+	 */
+	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+	if (error)
+		goto error0;
+#endif
+
+	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+		/* Our block is left, pick up the right block. */
+		lbp = bp;
+		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+		left = block;
+		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+		if (error)
+			goto error0;
+		bp = rbp;
+		nptr = 1;
+	} else {
+		/* Our block is right, pick up the left block. */
+		rbp = bp;
+		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+		right = block;
+		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+		if (error)
+			goto error0;
+		bp = lbp;
+		nptr = 2;
+	}
+	/* Fill in the new block's btree header and log it. */
+	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
+	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+			!xfs_btree_ptr_is_null(cur, &rptr));
+
+	/* Fill in the key data in the new root. */
+	if (xfs_btree_get_level(left) > 0) {
+		xfs_btree_copy_keys(cur,
+				xfs_btree_key_addr(cur, 1, new),
+				xfs_btree_key_addr(cur, 1, left), 1);
+		xfs_btree_copy_keys(cur,
+				xfs_btree_key_addr(cur, 2, new),
+				xfs_btree_key_addr(cur, 1, right), 1);
+	} else {
+		cur->bc_ops->init_key_from_rec(
+				xfs_btree_key_addr(cur, 1, new),
+				xfs_btree_rec_addr(cur, 1, left));
+		cur->bc_ops->init_key_from_rec(
+				xfs_btree_key_addr(cur, 2, new),
+				xfs_btree_rec_addr(cur, 1, right));
+	}
+	xfs_btree_log_keys(cur, nbp, 1, 2);
+
+	/* Fill in the pointer data in the new root. */
+	xfs_btree_copy_ptrs(cur,
+		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+	xfs_btree_copy_ptrs(cur,
+		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+	xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+	/* Fix up the cursor. */
+	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+	cur->bc_ptrs[cur->bc_nlevels] = nptr;
+	cur->bc_nlevels++;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+}
+
+STATIC int
+xfs_btree_make_block_unfull(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* btree level */
+	int			numrecs,/* # of recs in block */
+	int			*oindex,/* old tree index */
+	int			*index,	/* new tree index */
+	union xfs_btree_ptr	*nptr,	/* new btree ptr */
+	struct xfs_btree_cur	**ncur,	/* new btree cursor */
+	union xfs_btree_rec	*nrec,	/* new record */
+	int			*stat)
+{
+	union xfs_btree_key	key;	/* new btree key value */
+	int			error = 0;
+
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    level == cur->bc_nlevels - 1) {
+	    	struct xfs_inode *ip = cur->bc_private.b.ip;
+
+		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+			/* A root block that can be made bigger. */
+			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+		} else {
+			/* A root block that needs replacing */
+			int	logflags = 0;
+
+			error = xfs_btree_new_iroot(cur, &logflags, stat);
+			if (error || *stat == 0)
+				return error;
+
+			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+		}
+
+		return 0;
+	}
+
+	/* First, try shifting an entry to the right neighbor. */
+	error = xfs_btree_rshift(cur, level, stat);
+	if (error || *stat)
+		return error;
+
+	/* Next, try shifting an entry to the left neighbor. */
+	error = xfs_btree_lshift(cur, level, stat);
+	if (error)
+		return error;
+
+	if (*stat) {
+		*oindex = *index = cur->bc_ptrs[level];
+		return 0;
+	}
+
+	/*
+	 * Next, try splitting the current block in half.
+	 *
+	 * If this works we have to re-set our variables because we
+	 * could be in a different block now.
+	 */
+	error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+	if (error || *stat == 0)
+		return error;
+
+
+	*index = cur->bc_ptrs[level];
+	cur->bc_ops->init_rec_from_key(&key, nrec);
+	return 0;
+}
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level to insert record at */
+	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
+	union xfs_btree_rec	*recp,	/* i/o: record data inserted */
+	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
+	int			*stat)	/* success/failure */
+{
+	struct xfs_btree_block	*block;	/* btree block */
+	struct xfs_buf		*bp;	/* buffer for block */
+	union xfs_btree_key	key;	/* btree key */
+	union xfs_btree_ptr	nptr;	/* new block ptr */
+	struct xfs_btree_cur	*ncur;	/* new btree cursor */
+	union xfs_btree_rec	nrec;	/* new record count */
+	int			optr;	/* old key/record index */
+	int			ptr;	/* key/record index */
+	int			numrecs;/* number of records */
+	int			error;	/* error return value */
+#ifdef DEBUG
+	int			i;
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+
+	ncur = NULL;
+
+	/*
+	 * If we have an external root pointer, and we've made it to the
+	 * root level, allocate a new root block and we're done.
+	 */
+	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level >= cur->bc_nlevels)) {
+		error = xfs_btree_new_root(cur, stat);
+		xfs_btree_set_ptr_null(cur, ptrp);
+
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		return error;
+	}
+
+	/* If we're off the left edge, return failure. */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	/* Make a key out of the record data to be inserted, and save it. */
+	cur->bc_ops->init_key_from_rec(&key, recp);
+
+	optr = ptr;
+
+	XFS_BTREE_STATS_INC(cur, insrec);
+
+	/* Get pointers to the btree buffer and block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+	numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+
+	/* Check that the new entry is being inserted in the right place. */
+	if (ptr <= numrecs) {
+		if (level == 0) {
+			ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+				xfs_btree_rec_addr(cur, ptr, block)));
+		} else {
+			ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+				xfs_btree_key_addr(cur, ptr, block)));
+		}
+	}
+#endif
+
+	/*
+	 * If the block is full, we can't insert the new entry until we
+	 * make the block un-full.
+	 */
+	xfs_btree_set_ptr_null(cur, &nptr);
+	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+		error = xfs_btree_make_block_unfull(cur, level, numrecs,
+					&optr, &ptr, &nptr, &ncur, &nrec, stat);
+		if (error || *stat == 0)
+			goto error0;
+	}
+
+	/*
+	 * The current block may have changed if the block was
+	 * previously full and we have just made space in it.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		return error;
+#endif
+
+	/*
+	 * At this point we know there's room for our new entry in the block
+	 * we're pointing at.
+	 */
+	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+	if (level > 0) {
+		/* It's a nonleaf. make a hole in the keys and ptrs */
+		union xfs_btree_key	*kp;
+		union xfs_btree_ptr	*pp;
+
+		kp = xfs_btree_key_addr(cur, ptr, block);
+		pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+#ifdef DEBUG
+		for (i = numrecs - ptr; i >= 0; i--) {
+			error = xfs_btree_check_ptr(cur, pp, i, level);
+			if (error)
+				return error;
+		}
+#endif
+
+		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+#ifdef DEBUG
+		error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+		if (error)
+			goto error0;
+#endif
+
+		/* Now put the new data in, bump numrecs and log it. */
+		xfs_btree_copy_keys(cur, kp, &key, 1);
+		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+		numrecs++;
+		xfs_btree_set_numrecs(block, numrecs);
+		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+		xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+		if (ptr < numrecs) {
+			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+				xfs_btree_key_addr(cur, ptr + 1, block)));
+		}
+#endif
+	} else {
+		/* It's a leaf. make a hole in the records */
+		union xfs_btree_rec             *rp;
+
+		rp = xfs_btree_rec_addr(cur, ptr, block);
+
+		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+		/* Now put the new data in, bump numrecs and log it. */
+		xfs_btree_copy_recs(cur, rp, recp, 1);
+		xfs_btree_set_numrecs(block, ++numrecs);
+		xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+		if (ptr < numrecs) {
+			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+				xfs_btree_rec_addr(cur, ptr + 1, block)));
+		}
+#endif
+	}
+
+	/* Log the new number of records in the btree header. */
+	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+	/* If we inserted at the start of a block, update the parents' keys. */
+	if (optr == 1) {
+		error = xfs_btree_updkey(cur, &key, level + 1);
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * If we are tracking the last record in the tree and
+	 * we are at the far right edge of the tree, update it.
+	 */
+	if (xfs_btree_is_lastrec(cur, block, level)) {
+		cur->bc_ops->update_lastrec(cur, block, recp,
+					    ptr, LASTREC_INSREC);
+	}
+
+	/*
+	 * Return the new block number, if any.
+	 * If there is one, give back a record value and a cursor too.
+	 */
+	*ptrp = nptr;
+	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+		*recp = nrec;
+		*curp = ncur;
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor.  All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+	struct xfs_btree_cur	*cur,
+	int			*stat)
+{
+	int			error;	/* error return value */
+	int			i;	/* result value, 0 for failure */
+	int			level;	/* current level number in btree */
+	union xfs_btree_ptr	nptr;	/* new block number (split result) */
+	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
+	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
+	union xfs_btree_rec	rec;	/* record to insert */
+
+	level = 0;
+	ncur = NULL;
+	pcur = cur;
+
+	xfs_btree_set_ptr_null(cur, &nptr);
+	cur->bc_ops->init_rec_from_cur(cur, &rec);
+
+	/*
+	 * Loop going up the tree, starting at the leaf level.
+	 * Stop when we don't get a split block, that must mean that
+	 * the insert is finished with this level.
+	 */
+	do {
+		/*
+		 * Insert nrec/nptr into this level of the tree.
+		 * Note if we fail, nptr will be null.
+		 */
+		error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+		if (error) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			goto error0;
+		}
+
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+		level++;
+
+		/*
+		 * See if the cursor we just used is trash.
+		 * Can't trash the caller's cursor, but otherwise we should
+		 * if ncur is a new cursor or we're about to be done.
+		 */
+		if (pcur != cur &&
+		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+			/* Save the state from the cursor before we trash it */
+			if (cur->bc_ops->update_cursor)
+				cur->bc_ops->update_cursor(pcur, cur);
+			cur->bc_nlevels = pcur->bc_nlevels;
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		/* If we got a new cursor, switch to it. */
+		if (ncur) {
+			pcur = ncur;
+			ncur = NULL;
+		}
+	} while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = i;
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot names comes from the fact that we're effectively
+ * killing the old root block.  But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ */
+STATIC int
+xfs_btree_kill_iroot(
+	struct xfs_btree_cur	*cur)
+{
+	int			whichfork = cur->bc_private.b.whichfork;
+	struct xfs_inode	*ip = cur->bc_private.b.ip;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_btree_block	*block;
+	struct xfs_btree_block	*cblock;
+	union xfs_btree_key	*kp;
+	union xfs_btree_key	*ckp;
+	union xfs_btree_ptr	*pp;
+	union xfs_btree_ptr	*cpp;
+	struct xfs_buf		*cbp;
+	int			level;
+	int			index;
+	int			numrecs;
+#ifdef DEBUG
+	union xfs_btree_ptr	ptr;
+	int			i;
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+	ASSERT(cur->bc_nlevels > 1);
+
+	/*
+	 * Don't deal with the root block needs to be a leaf case.
+	 * We're just going to turn the thing back into extents anyway.
+	 */
+	level = cur->bc_nlevels - 1;
+	if (level == 1)
+		goto out0;
+
+	/*
+	 * Give up if the root has multiple children.
+	 */
+	block = xfs_btree_get_iroot(cur);
+	if (xfs_btree_get_numrecs(block) != 1)
+		goto out0;
+
+	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+	numrecs = xfs_btree_get_numrecs(cblock);
+
+	/*
+	 * Only do this if the next level will fit.
+	 * Then the data must be copied up to the inode,
+	 * instead of freeing the root you free the next level.
+	 */
+	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+	if (index) {
+		xfs_iroot_realloc(cur->bc_private.b.ip, index,
+				  cur->bc_private.b.whichfork);
+		block = ifp->if_broot;
+	}
+
+	be16_add_cpu(&block->bb_numrecs, index);
+	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+	for (i = 0; i < numrecs; i++) {
+		int		error;
+
+		error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+		if (error) {
+			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+			return error;
+		}
+	}
+#endif
+	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+	cur->bc_ops->free_block(cur, cbp);
+	XFS_BTREE_STATS_INC(cur, free);
+
+	cur->bc_bufs[level - 1] = NULL;
+	be16_add_cpu(&block->bb_level, -1);
+	xfs_trans_log_inode(cur->bc_tp, ip,
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	cur->bc_nlevels--;
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+}
+
+/*
+ * Kill the current root node, and replace it with it's only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			level,
+	union xfs_btree_ptr	*newroot)
+{
+	int			error;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, killroot);
+
+	/*
+	 * Update the root pointer, decreasing the level by 1 and then
+	 * free the old root.
+	 */
+	cur->bc_ops->set_root(cur, newroot, -1);
+
+	error = cur->bc_ops->free_block(cur, bp);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
+	}
+
+	XFS_BTREE_STATS_INC(cur, free);
+
+	cur->bc_bufs[level] = NULL;
+	cur->bc_ra[level] = 0;
+	cur->bc_nlevels--;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+}
+
+STATIC int
+xfs_btree_dec_cursor(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)
+{
+	int			error;
+	int			i;
+
+	if (level > 0) {
+		error = xfs_btree_decrement(cur, level, &i);
+		if (error)
+			return error;
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int					/* error */
+xfs_btree_delrec(
+	struct xfs_btree_cur	*cur,		/* btree cursor */
+	int			level,		/* level removing record from */
+	int			*stat)		/* fail/done/go-on */
+{
+	struct xfs_btree_block	*block;		/* btree block */
+	union xfs_btree_ptr	cptr;		/* current block ptr */
+	struct xfs_buf		*bp;		/* buffer for block */
+	int			error;		/* error return value */
+	int			i;		/* loop counter */
+	union xfs_btree_key	key;		/* storage for keyp */
+	union xfs_btree_key	*keyp = &key;	/* passed to the next level */
+	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	int			lrecs = 0;	/* left record count */
+	int			ptr;		/* key/record index */
+	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	struct xfs_btree_block	*rrblock;	/* right-right btree block */
+	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
+	int			rrecs = 0;	/* right record count */
+	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
+	int			numrecs;	/* temporary numrec count */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	tcur = NULL;
+
+	/* Get the index of the entry being deleted, check for nothing there. */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	/* Get the buffer & block containing the record or key/ptr. */
+	block = xfs_btree_get_block(cur, level, &bp);
+	numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/* Fail if we're off the end of the block. */
+	if (ptr > numrecs) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	XFS_BTREE_STATS_INC(cur, delrec);
+	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+	/* Excise the entries being deleted. */
+	if (level > 0) {
+		/* It's a nonleaf. operate on keys and ptrs */
+		union xfs_btree_key	*lkp;
+		union xfs_btree_ptr	*lpp;
+
+		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+#ifdef DEBUG
+		for (i = 0; i < numrecs - ptr; i++) {
+			error = xfs_btree_check_ptr(cur, lpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+
+		if (ptr < numrecs) {
+			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+		}
+
+		/*
+		 * If it's the first record in the block, we'll need to pass a
+		 * key up to the next level (updkey).
+		 */
+		if (ptr == 1)
+			keyp = xfs_btree_key_addr(cur, 1, block);
+	} else {
+		/* It's a leaf. operate on records */
+		if (ptr < numrecs) {
+			xfs_btree_shift_recs(cur,
+				xfs_btree_rec_addr(cur, ptr + 1, block),
+				-1, numrecs - ptr);
+			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+		}
+
+		/*
+		 * If it's the first record in the block, we'll need a key
+		 * structure to pass up to the next level (updkey).
+		 */
+		if (ptr == 1) {
+			cur->bc_ops->init_key_from_rec(&key,
+					xfs_btree_rec_addr(cur, 1, block));
+			keyp = &key;
+		}
+	}
+
+	/*
+	 * Decrement and log the number of entries in the block.
+	 */
+	xfs_btree_set_numrecs(block, --numrecs);
+	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+	/*
+	 * If we are tracking the last record in the tree and
+	 * we are at the far right edge of the tree, update it.
+	 */
+	if (xfs_btree_is_lastrec(cur, block, level)) {
+		cur->bc_ops->update_lastrec(cur, block, NULL,
+					    ptr, LASTREC_DELREC);
+	}
+
+	/*
+	 * We're at the root level.  First, shrink the root block in-memory.
+	 * Try to get rid of the next level down.  If we can't then there's
+	 * nothing left to do.
+	 */
+	if (level == cur->bc_nlevels - 1) {
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+			xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+					  cur->bc_private.b.whichfork);
+
+			error = xfs_btree_kill_iroot(cur);
+			if (error)
+				goto error0;
+
+			error = xfs_btree_dec_cursor(cur, level, stat);
+			if (error)
+				goto error0;
+			*stat = 1;
+			return 0;
+		}
+
+		/*
+		 * If this is the root level, and there's only one entry left,
+		 * and it's NOT the leaf level, then we can get rid of this
+		 * level.
+		 */
+		if (numrecs == 1 && level > 0) {
+			union xfs_btree_ptr	*pp;
+			/*
+			 * pp is still set to the first pointer in the block.
+			 * Make it the new root of the btree.
+			 */
+			pp = xfs_btree_ptr_addr(cur, 1, block);
+			error = xfs_btree_kill_root(cur, bp, level, pp);
+			if (error)
+				goto error0;
+		} else if (level > 0) {
+			error = xfs_btree_dec_cursor(cur, level, stat);
+			if (error)
+				goto error0;
+		}
+		*stat = 1;
+		return 0;
+	}
+
+	/*
+	 * If we deleted the leftmost entry in the block, update the
+	 * key values above us in the tree.
+	 */
+	if (ptr == 1) {
+		error = xfs_btree_updkey(cur, keyp, level + 1);
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * If the number of records remaining in the block is at least
+	 * the minimum, we're done.
+	 */
+	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+		error = xfs_btree_dec_cursor(cur, level, stat);
+		if (error)
+			goto error0;
+		return 0;
+	}
+
+	/*
+	 * Otherwise, we have to move some records around to keep the
+	 * tree balanced.  Look at the left and right sibling blocks to
+	 * see if we can re-balance by moving only one record.
+	 */
+	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+		/*
+		 * One child of root, need to get a chance to copy its contents
+		 * into the root and delete it. Can't go up to next level,
+		 * there's nothing to delete there.
+		 */
+		if (xfs_btree_ptr_is_null(cur, &rptr) &&
+		    xfs_btree_ptr_is_null(cur, &lptr) &&
+		    level == cur->bc_nlevels - 2) {
+			error = xfs_btree_kill_iroot(cur);
+			if (!error)
+				error = xfs_btree_dec_cursor(cur, level, stat);
+			if (error)
+				goto error0;
+			return 0;
+		}
+	}
+
+	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+	       !xfs_btree_ptr_is_null(cur, &lptr));
+
+	/*
+	 * Duplicate the cursor so our btree manipulations here won't
+	 * disrupt the next level up.
+	 */
+	error = xfs_btree_dup_cursor(cur, &tcur);
+	if (error)
+		goto error0;
+
+	/*
+	 * If there's a right sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+		/*
+		 * Move the temp cursor to the last entry in the next block.
+		 * Actually any entry but the first would suffice.
+		 */
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+
+		error = xfs_btree_increment(tcur, level, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+
+		/* Grab a pointer to the block. */
+		right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+		error = xfs_btree_check_block(tcur, right, level, rbp);
+		if (error)
+			goto error0;
+#endif
+		/* Grab the current block number, for future use. */
+		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+		/*
+		 * If right block is full enough so that removing one entry
+		 * won't make it too empty, and left-shifting an entry out
+		 * of right to us works, we're done.
+		 */
+		if (xfs_btree_get_numrecs(right) - 1 >=
+		    cur->bc_ops->get_minrecs(tcur, level)) {
+			error = xfs_btree_lshift(tcur, level, &i);
+			if (error)
+				goto error0;
+			if (i) {
+				ASSERT(xfs_btree_get_numrecs(block) >=
+				       cur->bc_ops->get_minrecs(tcur, level));
+
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				tcur = NULL;
+
+				error = xfs_btree_dec_cursor(cur, level, stat);
+				if (error)
+					goto error0;
+				return 0;
+			}
+		}
+
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference, and fix up the temp cursor to point
+		 * to our block again (last record).
+		 */
+		rrecs = xfs_btree_get_numrecs(right);
+		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+			i = xfs_btree_firstrec(tcur, level);
+			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+
+			error = xfs_btree_decrement(tcur, level, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+		}
+	}
+
+	/*
+	 * If there's a left sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+		/*
+		 * Move the temp cursor to the first entry in the
+		 * previous block.
+		 */
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+
+		error = xfs_btree_decrement(tcur, level, &i);
+		if (error)
+			goto error0;
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+
+		/* Grab a pointer to the block. */
+		left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+		error = xfs_btree_check_block(cur, left, level, lbp);
+		if (error)
+			goto error0;
+#endif
+		/* Grab the current block number, for future use. */
+		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+		/*
+		 * If left block is full enough so that removing one entry
+		 * won't make it too empty, and right-shifting an entry out
+		 * of left to us works, we're done.
+		 */
+		if (xfs_btree_get_numrecs(left) - 1 >=
+		    cur->bc_ops->get_minrecs(tcur, level)) {
+			error = xfs_btree_rshift(tcur, level, &i);
+			if (error)
+				goto error0;
+			if (i) {
+				ASSERT(xfs_btree_get_numrecs(block) >=
+				       cur->bc_ops->get_minrecs(tcur, level));
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				tcur = NULL;
+				if (level == 0)
+					cur->bc_ptrs[0]++;
+				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+				*stat = 1;
+				return 0;
+			}
+		}
+
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference.
+		 */
+		lrecs = xfs_btree_get_numrecs(left);
+	}
+
+	/* Delete the temp cursor, we're done with it. */
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	tcur = NULL;
+
+	/* If here, we need to do a join to keep the tree balanced. */
+	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+	    lrecs + xfs_btree_get_numrecs(block) <=
+			cur->bc_ops->get_maxrecs(cur, level)) {
+		/*
+		 * Set "right" to be the starting block,
+		 * "left" to be the left neighbor.
+		 */
+		rptr = cptr;
+		right = block;
+		rbp = bp;
+		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+		if (error)
+			goto error0;
+
+	/*
+	 * If that won't work, see if we can join with the right neighbor block.
+	 */
+	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+		   rrecs + xfs_btree_get_numrecs(block) <=
+			cur->bc_ops->get_maxrecs(cur, level)) {
+		/*
+		 * Set "left" to be the starting block,
+		 * "right" to be the right neighbor.
+		 */
+		lptr = cptr;
+		left = block;
+		lbp = bp;
+		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+		if (error)
+			goto error0;
+
+	/*
+	 * Otherwise, we can't fix the imbalance.
+	 * Just return.  This is probably a logic error, but it's not fatal.
+	 */
+	} else {
+		error = xfs_btree_dec_cursor(cur, level, stat);
+		if (error)
+			goto error0;
+		return 0;
+	}
+
+	rrecs = xfs_btree_get_numrecs(right);
+	lrecs = xfs_btree_get_numrecs(left);
+
+	/*
+	 * We're now going to join "left" and "right" by moving all the stuff
+	 * in "right" to "left" and deleting "right".
+	 */
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+	if (level > 0) {
+		/* It's a non-leaf.  Move keys and pointers. */
+		union xfs_btree_key	*lkp;	/* left btree key */
+		union xfs_btree_ptr	*lpp;	/* left address pointer */
+		union xfs_btree_key	*rkp;	/* right btree key */
+		union xfs_btree_ptr	*rpp;	/* right address pointer */
+
+		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+		for (i = 1; i < rrecs; i++) {
+			error = xfs_btree_check_ptr(cur, rpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+	} else {
+		/* It's a leaf.  Move records.  */
+		union xfs_btree_rec	*lrp;	/* left record pointer */
+		union xfs_btree_rec	*rrp;	/* right record pointer */
+
+		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+	}
+
+	XFS_BTREE_STATS_INC(cur, join);
+
+	/*
+	 * Fix up the number of records and right block pointer in the
+	 * surviving block, and log it.
+	 */
+	xfs_btree_set_numrecs(left, lrecs + rrecs);
+	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+	/* If there is a right sibling, point it to the remaining block. */
+	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
+		if (error)
+			goto error0;
+		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+
+	/* Free the deleted block. */
+	error = cur->bc_ops->free_block(cur, rbp);
+	if (error)
+		goto error0;
+	XFS_BTREE_STATS_INC(cur, free);
+
+	/*
+	 * If we joined with the left neighbor, set the buffer in the
+	 * cursor to the left block, and fix up the index.
+	 */
+	if (bp != lbp) {
+		cur->bc_bufs[level] = lbp;
+		cur->bc_ptrs[level] += lrecs;
+		cur->bc_ra[level] = 0;
+	}
+	/*
+	 * If we joined with the right neighbor and there's a level above
+	 * us, increment the cursor at that level.
+	 */
+	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+		   (level + 1 < cur->bc_nlevels)) {
+		error = xfs_btree_increment(cur, level + 1, &i);
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * Readjust the ptr at this level if it's not a leaf, since it's
+	 * still pointing at the deletion point, which makes the cursor
+	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
+	 * We can't use decrement because it would change the next level up.
+	 */
+	if (level > 0)
+		cur->bc_ptrs[level]--;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	/* Return value means the next level up has something to do. */
+	*stat = 2;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	if (tcur)
+		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int					/* error */
+xfs_btree_delete(
+	struct xfs_btree_cur	*cur,
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	int			level;
+	int			i;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	/*
+	 * Go up the tree, starting at leaf level.
+	 *
+	 * If 2 is returned then a join was done; go to the next level.
+	 * Otherwise we are done.
+	 */
+	for (level = 0, i = 2; i == 2; level++) {
+		error = xfs_btree_delrec(cur, level, &i);
+		if (error)
+			goto error0;
+	}
+
+	if (i == 0) {
+		for (level = 1; level < cur->bc_nlevels; level++) {
+			if (cur->bc_ptrs[level] == 0) {
+				error = xfs_btree_decrement(cur, level, &i);
+				if (error)
+					goto error0;
+				break;
+			}
+		}
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = i;
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_btree_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	union xfs_btree_rec	**recp,	/* output: btree record */
+	int			*stat)	/* output: success/failure */
+{
+	struct xfs_btree_block	*block;	/* btree block */
+	struct xfs_buf		*bp;	/* buffer pointer */
+	int			ptr;	/* record number */
+#ifdef DEBUG
+	int			error;	/* error return value */
+#endif
+
+	ptr = cur->bc_ptrs[0];
+	block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, 0, bp);
+	if (error)
+		return error;
+#endif
+
+	/*
+	 * Off the right end or left end, return failure.
+	 */
+	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+		*stat = 0;
+		return 0;
+	}
+
+	/*
+	 * Point to the record and extract its data.
+	 */
+	*recp = xfs_btree_rec_addr(cur, ptr, block);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction.  If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	__uint64_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+	union xfs_btree_ptr     rptr;
+
+	/* do right sibling readahead */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+	/* modify the owner */
+	block = xfs_btree_get_block(cur, level, &bp);
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+	else
+		block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+	/*
+	 * If the block is a root block hosted in an inode, we might not have a
+	 * buffer pointer here and we shouldn't attempt to log the change as the
+	 * information is already held in the inode and discarded when the root
+	 * block is formatted into the on-disk inode fork. We still change it,
+	 * though, so everything is consistent in memory.
+	 */
+	if (bp) {
+		if (cur->bc_tp) {
+			xfs_trans_ordered_buf(cur->bc_tp, bp);
+			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+		} else {
+			xfs_buf_delwri_queue(bp, buffer_list);
+		}
+	} else {
+		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+		ASSERT(level == cur->bc_nlevels - 1);
+	}
+
+	/* now read rh sibling block for next iteration */
+	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+	if (xfs_btree_ptr_is_null(cur, &rptr))
+		return -ENOENT;
+
+	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+	struct xfs_btree_cur	*cur,
+	__uint64_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	union xfs_btree_ptr     lptr;
+	int			level;
+	struct xfs_btree_block	*block = NULL;
+	int			error = 0;
+
+	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+	/* for each level */
+	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+		/* grab the left hand block */
+		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+		if (error)
+			return error;
+
+		/* readahead the left most block for the next level down */
+		if (level > 0) {
+			union xfs_btree_ptr     *ptr;
+
+			ptr = xfs_btree_ptr_addr(cur, 1, block);
+			xfs_btree_readahead_ptr(cur, ptr, 1);
+
+			/* save for the next iteration of the loop */
+			lptr = *ptr;
+		}
+
+		/* for each buffer in the level */
+		do {
+			error = xfs_btree_block_change_owner(cur, level,
+							     new_owner,
+							     buffer_list);
+		} while (!error);
+
+		if (error != -ENOENT)
+			return error;
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_btree.h b/kernel/fs/xfs/libxfs/xfs_btree.h
new file mode 100644
index 000000000..8f18bab73
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_btree.h
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BTREE_H__
+#define	__XFS_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern kmem_zone_t	*xfs_btree_cur_zone;
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ */
+union xfs_btree_ptr {
+	__be32			s;	/* short form ptr */
+	__be64			l;	/* long form ptr */
+};
+
+union xfs_btree_key {
+	xfs_bmbt_key_t		bmbt;
+	xfs_bmdr_key_t		bmbr;	/* bmbt root block */
+	xfs_alloc_key_t		alloc;
+	xfs_inobt_key_t		inobt;
+};
+
+union xfs_btree_rec {
+	xfs_bmbt_rec_t		bmbt;
+	xfs_bmdr_rec_t		bmbr;	/* bmbt root block */
+	xfs_alloc_rec_t		alloc;
+	xfs_inobt_rec_t		inobt;
+};
+
+/*
+ * This nonsense is to make -wlint happy.
+ */
+#define	XFS_LOOKUP_EQ	((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define	XFS_LOOKUP_LE	((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define	XFS_LOOKUP_GE	((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define	XFS_BTNUM_BNO	((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define	XFS_BTNUM_CNT	((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define	XFS_BTNUM_BMAP	((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define	XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)
+#define	XFS_BTNUM_FINO	((xfs_btnum_t)XFS_BTNUM_FINOi)
+
+/*
+ * For logging record fields.
+ */
+#define	XFS_BB_MAGIC		(1 << 0)
+#define	XFS_BB_LEVEL		(1 << 1)
+#define	XFS_BB_NUMRECS		(1 << 2)
+#define	XFS_BB_LEFTSIB		(1 << 3)
+#define	XFS_BB_RIGHTSIB		(1 << 4)
+#define	XFS_BB_BLKNO		(1 << 5)
+#define	XFS_BB_LSN		(1 << 6)
+#define	XFS_BB_UUID		(1 << 7)
+#define	XFS_BB_OWNER		(1 << 8)
+#define	XFS_BB_NUM_BITS		5
+#define	XFS_BB_ALL_BITS		((1 << XFS_BB_NUM_BITS) - 1)
+#define	XFS_BB_NUM_BITS_CRC	9
+#define	XFS_BB_ALL_BITS_CRC	((1 << XFS_BB_NUM_BITS_CRC) - 1)
+
+/*
+ * Generic stats interface
+ */
+#define __XFS_BTREE_STATS_INC(type, stat) \
+	XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat)  \
+do {    \
+	switch (cur->bc_btnum) {  \
+	case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break;	\
+	case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break;	\
+	case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break;	\
+	case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break;	\
+	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break;	\
+	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;	\
+	}       \
+} while (0)
+
+#define __XFS_BTREE_STATS_ADD(type, stat, val) \
+	XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define XFS_BTREE_STATS_ADD(cur, stat, val)  \
+do {    \
+	switch (cur->bc_btnum) {  \
+	case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
+	case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
+	case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
+	case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
+	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;	\
+	}       \
+} while (0)
+
+#define	XFS_BTREE_MAXLEVELS	8	/* max of all btrees */
+
+struct xfs_btree_ops {
+	/* size of the key and record structures */
+	size_t	key_len;
+	size_t	rec_len;
+
+	/* cursor operations */
+	struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+	void	(*update_cursor)(struct xfs_btree_cur *src,
+				 struct xfs_btree_cur *dst);
+
+	/* update btree root pointer */
+	void	(*set_root)(struct xfs_btree_cur *cur,
+			    union xfs_btree_ptr *nptr, int level_change);
+
+	/* block allocation / freeing */
+	int	(*alloc_block)(struct xfs_btree_cur *cur,
+			       union xfs_btree_ptr *start_bno,
+			       union xfs_btree_ptr *new_bno,
+			       int *stat);
+	int	(*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+	/* update last record information */
+	void	(*update_lastrec)(struct xfs_btree_cur *cur,
+				  struct xfs_btree_block *block,
+				  union xfs_btree_rec *rec,
+				  int ptr, int reason);
+
+	/* records in block/level */
+	int	(*get_minrecs)(struct xfs_btree_cur *cur, int level);
+	int	(*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+	/* records on disk.  Matter for the root in inode case. */
+	int	(*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+	/* init values of btree structures */
+	void	(*init_key_from_rec)(union xfs_btree_key *key,
+				     union xfs_btree_rec *rec);
+	void	(*init_rec_from_key)(union xfs_btree_key *key,
+				     union xfs_btree_rec *rec);
+	void	(*init_rec_from_cur)(struct xfs_btree_cur *cur,
+				     union xfs_btree_rec *rec);
+	void	(*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+				     union xfs_btree_ptr *ptr);
+
+	/* difference between key value and cursor value */
+	__int64_t (*key_diff)(struct xfs_btree_cur *cur,
+			      union xfs_btree_key *key);
+
+	const struct xfs_buf_ops	*buf_ops;
+
+#if defined(DEBUG) || defined(XFS_WARN)
+	/* check that k1 is lower than k2 */
+	int	(*keys_inorder)(struct xfs_btree_cur *cur,
+				union xfs_btree_key *k1,
+				union xfs_btree_key *k2);
+
+	/* check that r1 is lower than r2 */
+	int	(*recs_inorder)(struct xfs_btree_cur *cur,
+				union xfs_btree_rec *r1,
+				union xfs_btree_rec *r2);
+#endif
+};
+
+/*
+ * Reasons for the update_lastrec method to be called.
+ */
+#define LASTREC_UPDATE	0
+#define LASTREC_INSREC	1
+#define LASTREC_DELREC	2
+
+
+/*
+ * Btree cursor structure.
+ * This collects all information needed by the btree code in one place.
+ */
+typedef struct xfs_btree_cur
+{
+	struct xfs_trans	*bc_tp;	/* transaction we're in, if any */
+	struct xfs_mount	*bc_mp;	/* file system mount struct */
+	const struct xfs_btree_ops *bc_ops;
+	uint			bc_flags; /* btree features - below */
+	union {
+		xfs_alloc_rec_incore_t	a;
+		xfs_bmbt_irec_t		b;
+		xfs_inobt_rec_incore_t	i;
+	}		bc_rec;		/* current insert/search record value */
+	struct xfs_buf	*bc_bufs[XFS_BTREE_MAXLEVELS];	/* buf ptr per level */
+	int		bc_ptrs[XFS_BTREE_MAXLEVELS];	/* key/record # */
+	__uint8_t	bc_ra[XFS_BTREE_MAXLEVELS];	/* readahead bits */
+#define	XFS_BTCUR_LEFTRA	1	/* left sibling has been read-ahead */
+#define	XFS_BTCUR_RIGHTRA	2	/* right sibling has been read-ahead */
+	__uint8_t	bc_nlevels;	/* number of levels in the tree */
+	__uint8_t	bc_blocklog;	/* log2(blocksize) of btree blocks */
+	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
+	union {
+		struct {			/* needed for BNO, CNT, INO */
+			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
+			xfs_agnumber_t	agno;	/* ag number */
+		} a;
+		struct {			/* needed for BMAP */
+			struct xfs_inode *ip;	/* pointer to our inode */
+			struct xfs_bmap_free *flist;	/* list to free after */
+			xfs_fsblock_t	firstblock;	/* 1st blk allocated */
+			int		allocated;	/* count of alloced */
+			short		forksize;	/* fork's inode space */
+			char		whichfork;	/* data or attr fork */
+			char		flags;		/* flags */
+#define	XFS_BTCUR_BPRV_WASDEL	1			/* was delayed */
+		} b;
+	}		bc_private;	/* per-btree type data */
+} xfs_btree_cur_t;
+
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS		(1<<0)	/* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE		(1<<1)	/* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
+
+
+#define	XFS_BTREE_NOERROR	0
+#define	XFS_BTREE_ERROR		1
+
+/*
+ * Convert from buffer to btree block header.
+ */
+#define	XFS_BUF_TO_BLOCK(bp)	((struct xfs_btree_block *)((bp)->b_addr))
+
+
+/*
+ * Check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp);	/* buffer containing block, if any */
+
+/*
+ * Check that (long) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_fsblock_t		ptr,	/* btree block disk address */
+	int			level);	/* btree block level */
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			error);	/* del because of error */
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int					/* error */
+xfs_btree_dup_cursor(
+	xfs_btree_cur_t		*cur,	/* input cursor */
+	xfs_btree_cur_t		**ncur);/* output cursor */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+struct xfs_buf *				/* buffer for fsbno */
+xfs_btree_get_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	uint			lock);	/* lock flags for get_buf */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+struct xfs_buf *				/* buffer for agno/agbno */
+xfs_btree_get_bufs(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	xfs_agblock_t		agbno,	/* allocation group block number */
+	uint			lock);	/* lock flags for get_buf */
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int					/* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level);	/* level to check */
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+	__int64_t		fields,	/* bitmask of fields */
+	const short		*offsets,/* table of field offsets */
+	int			nbits,	/* number of bits to inspect */
+	int			*first,	/* output: first byte offset */
+	int			*last);	/* output: last byte offset */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	uint			lock,	/* lock flags for read_buf */
+	struct xfs_buf		**bpp,	/* buffer for fsbno */
+	int			refval,	/* ref count value for buffer */
+	const struct xfs_buf_ops *ops);
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+void					/* error */
+xfs_btree_reada_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	xfs_extlen_t		count,	/* count of filesystem blocks */
+	const struct xfs_buf_ops *ops);
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+void					/* error */
+xfs_btree_reada_bufs(
+	struct xfs_mount	*mp,	/* file system mount point */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	xfs_agblock_t		agbno,	/* allocation group block number */
+	xfs_extlen_t		count,	/* count of filesystem blocks */
+	const struct xfs_buf_ops *ops);
+
+/*
+ * Initialise a new btree block header
+ */
+void
+xfs_btree_init_block(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	__u32		magic,
+	__u16		level,
+	__u16		numrecs,
+	__u64		owner,
+	unsigned int	flags);
+
+void
+xfs_btree_init_block_int(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*buf,
+	xfs_daddr_t		blkno,
+	__u32			magic,
+	__u16			level,
+	__u16			numrecs,
+	__u64			owner,
+	unsigned int		flags);
+
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+			   struct list_head *buffer_list);
+
+/*
+ * btree block CRC helpers
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
+
+/*
+ * Helpers.
+ */
+static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+{
+	return be16_to_cpu(block->bb_numrecs);
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+		__uint16_t numrecs)
+{
+	block->bb_numrecs = cpu_to_be16(numrecs);
+}
+
+static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+{
+	return be16_to_cpu(block->bb_level);
+}
+
+
+/*
+ * Min and max functions for extlen, agblock, fileoff, and filblks types.
+ */
+#define	XFS_EXTLEN_MIN(a,b)	min_t(xfs_extlen_t, (a), (b))
+#define	XFS_EXTLEN_MAX(a,b)	max_t(xfs_extlen_t, (a), (b))
+#define	XFS_AGBLOCK_MIN(a,b)	min_t(xfs_agblock_t, (a), (b))
+#define	XFS_AGBLOCK_MAX(a,b)	max_t(xfs_agblock_t, (a), (b))
+#define	XFS_FILEOFF_MIN(a,b)	min_t(xfs_fileoff_t, (a), (b))
+#define	XFS_FILEOFF_MAX(a,b)	max_t(xfs_fileoff_t, (a), (b))
+#define	XFS_FILBLKS_MIN(a,b)	min_t(xfs_filblks_t, (a), (b))
+#define	XFS_FILBLKS_MAX(a,b)	max_t(xfs_filblks_t, (a), (b))
+
+#define	XFS_FSB_SANITY_CHECK(mp,fsb)	\
+	(XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+		XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
+
+/*
+ * Trace hooks.  Currently not implemented as they need to be ported
+ * over to the generic tracing functionality, which is some effort.
+ *
+ * i,j = integer (32 bit)
+ * b = btree block buffer (xfs_buf_t)
+ * p = btree ptr
+ * r = btree record
+ * k = btree key
+ */
+#define	XFS_BTREE_TRACE_ARGBI(c, b, i)
+#define	XFS_BTREE_TRACE_ARGBII(c, b, i, j)
+#define	XFS_BTREE_TRACE_ARGI(c, i)
+#define	XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
+#define	XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
+#define	XFS_BTREE_TRACE_ARGIK(c, i, k)
+#define XFS_BTREE_TRACE_ARGR(c, r)
+#define	XFS_BTREE_TRACE_CURSOR(c, t)
+
+#endif	/* __XFS_BTREE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_cksum.h b/kernel/fs/xfs/libxfs/xfs_cksum.h
new file mode 100644
index 000000000..fad1676ad
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_cksum.h
@@ -0,0 +1,63 @@
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED	(~(__uint32_t)0)
+
+/*
+ * Calculate the intermediate checksum for a buffer that has the CRC field
+ * inside it.  The offset of the 32bit crc fields is passed as the
+ * cksum_offset parameter.
+ */
+static inline __uint32_t
+xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	__uint32_t zero = 0;
+	__uint32_t crc;
+
+	/* Calculate CRC up to the checksum. */
+	crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+	/* Skip checksum field */
+	crc = crc32c(crc, &zero, sizeof(__u32));
+
+	/* Calculate the rest of the CRC. */
+	return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
+		      length - (cksum_offset + sizeof(__be32)));
+}
+
+/*
+ * Convert the intermediate checksum to the final ondisk format.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns the
+ * result in host endian format. Hence we need to byte swap it back to LE format
+ * so that it is consistent on disk.
+ */
+static inline __le32
+xfs_end_cksum(__uint32_t crc)
+{
+	return ~cpu_to_le32(crc);
+}
+
+/*
+ * Helper to generate the checksum for a buffer.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+	*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+}
+
+/*
+ * Helper to verify the checksum for a buffer.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+	return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+}
+
+#endif /* _XFS_CKSUM_H */
diff --git a/kernel/fs/xfs/libxfs/xfs_da_btree.c b/kernel/fs/xfs/libxfs/xfs_da_btree.c
new file mode 100644
index 000000000..2385f8cd0
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_da_btree.c
@@ -0,0 +1,2660 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
+					    xfs_da_state_blk_t *existing_root,
+					    xfs_da_state_blk_t *new_child);
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
+					    xfs_da_state_blk_t *existing_blk,
+					    xfs_da_state_blk_t *split_blk,
+					    xfs_da_state_blk_t *blk_to_add,
+					    int treelevel,
+					    int *result);
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *node_blk_1,
+					 xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
+				   xfs_da_state_blk_t *old_node_blk,
+				   xfs_da_state_blk_t *new_node_blk);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
+					      xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *src_node_blk,
+					 xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC int	xfs_da3_blk_unlink(xfs_da_state_t *state,
+				  xfs_da_state_blk_t *drop_blk,
+				  xfs_da_state_blk_t *save_blk);
+
+
+kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+	int	i;
+
+	for (i = 0; i < state->altpath.active; i++)
+		state->altpath.blk[i].bp = NULL;
+	state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+	xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+	memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+	kmem_zone_free(xfs_da_state_zone, state);
+}
+
+static bool
+xfs_da3_node_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_da_intnode	*hdr = bp->b_addr;
+	struct xfs_da3_icnode_hdr ichdr;
+	const struct xfs_dir_ops *ops;
+
+	ops = xfs_dir_get_ops(mp, NULL);
+
+	ops->node_hdr_from_disk(&ichdr, hdr);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		if (ichdr.magic != XFS_DA3_NODE_MAGIC)
+			return false;
+
+		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (ichdr.magic != XFS_DA_NODE_MAGIC)
+			return false;
+	}
+	if (ichdr.level == 0)
+		return false;
+	if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+		return false;
+	if (ichdr.count == 0)
+		return false;
+
+	/*
+	 * we don't know if the node is for and attribute or directory tree,
+	 * so only fail if the count is outside both bounds
+	 */
+	if (ichdr.count > mp->m_dir_geo->node_ents &&
+	    ichdr.count > mp->m_attr_geo->node_ents)
+		return false;
+
+	/* XXX: hash order check? */
+
+	return true;
+}
+
+static void
+xfs_da3_node_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_da3_node_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_da3_node_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_da_blkinfo	*info = bp->b_addr;
+
+	switch (be16_to_cpu(info->magic)) {
+		case XFS_DA3_NODE_MAGIC:
+			if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+				xfs_buf_ioerror(bp, -EFSBADCRC);
+				break;
+			}
+			/* fall through */
+		case XFS_DA_NODE_MAGIC:
+			if (!xfs_da3_node_verify(bp)) {
+				xfs_buf_ioerror(bp, -EFSCORRUPTED);
+				break;
+			}
+			return;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			bp->b_ops = &xfs_attr3_leaf_buf_ops;
+			bp->b_ops->verify_read(bp);
+			return;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			bp->b_ops = &xfs_dir3_leafn_buf_ops;
+			bp->b_ops->verify_read(bp);
+			return;
+		default:
+			break;
+	}
+
+	/* corrupt block */
+	xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+	.verify_read = xfs_da3_node_read_verify,
+	.verify_write = xfs_da3_node_write_verify,
+};
+
+int
+xfs_da3_node_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp,
+	int			which_fork)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+					which_fork, &xfs_da3_node_buf_ops);
+	if (!err && tp) {
+		struct xfs_da_blkinfo	*info = (*bpp)->b_addr;
+		int			type;
+
+		switch (be16_to_cpu(info->magic)) {
+		case XFS_DA_NODE_MAGIC:
+		case XFS_DA3_NODE_MAGIC:
+			type = XFS_BLFT_DA_NODE_BUF;
+			break;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			type = XFS_BLFT_ATTR_LEAF_BUF;
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			type = XFS_BLFT_DIR_LEAFN_BUF;
+			break;
+		default:
+			type = 0;
+			ASSERT(0);
+			break;
+		}
+		xfs_trans_buf_set_type(tp, *bpp, type);
+	}
+	return err;
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int
+xfs_da3_node_create(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		blkno,
+	int			level,
+	struct xfs_buf		**bpp,
+	int			whichfork)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_da3_icnode_hdr ichdr = {0};
+	struct xfs_buf		*bp;
+	int			error;
+	struct xfs_inode	*dp = args->dp;
+
+	trace_xfs_da_node_create(args);
+	ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
+
+	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
+	if (error)
+		return error;
+	bp->b_ops = &xfs_da3_node_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+	node = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		ichdr.magic = XFS_DA3_NODE_MAGIC;
+		hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+		hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+		uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+	} else {
+		ichdr.magic = XFS_DA_NODE_MAGIC;
+	}
+	ichdr.level = level;
+
+	dp->d_ops->node_hdr_to_disk(node, &ichdr);
+	xfs_trans_log_buf(tp, bp,
+		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int							/* error */
+xfs_da3_split(
+	struct xfs_da_state	*state)
+{
+	struct xfs_da_state_blk	*oldblk;
+	struct xfs_da_state_blk	*newblk;
+	struct xfs_da_state_blk	*addblk;
+	struct xfs_da_intnode	*node;
+	struct xfs_buf		*bp;
+	int			max;
+	int			action = 0;
+	int			error;
+	int			i;
+
+	trace_xfs_da_split(state->args);
+
+	/*
+	 * Walk back up the tree splitting/inserting/adjusting as necessary.
+	 * If we need to insert and there isn't room, split the node, then
+	 * decide which fragment to insert the new block from below into.
+	 * Note that we may split the root this way, but we need more fixup.
+	 */
+	max = state->path.active - 1;
+	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+	addblk = &state->path.blk[max];		/* initial dummy value */
+	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+		oldblk = &state->path.blk[i];
+		newblk = &state->altpath.blk[i];
+
+		/*
+		 * If a leaf node then
+		 *     Allocate a new leaf node, then rebalance across them.
+		 * else if an intermediate node then
+		 *     We split on the last layer, must we split the node?
+		 */
+		switch (oldblk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+			error = xfs_attr3_leaf_split(state, oldblk, newblk);
+			if ((error != 0) && (error != -ENOSPC)) {
+				return error;	/* GROT: attr is inconsistent */
+			}
+			if (!error) {
+				addblk = newblk;
+				break;
+			}
+			/*
+			 * Entry wouldn't fit, split the leaf again.
+			 */
+			state->extravalid = 1;
+			if (state->inleaf) {
+				state->extraafter = 0;	/* before newblk */
+				trace_xfs_attr_leaf_split_before(state->args);
+				error = xfs_attr3_leaf_split(state, oldblk,
+							    &state->extrablk);
+			} else {
+				state->extraafter = 1;	/* after newblk */
+				trace_xfs_attr_leaf_split_after(state->args);
+				error = xfs_attr3_leaf_split(state, newblk,
+							    &state->extrablk);
+			}
+			if (error)
+				return error;	/* GROT: attr inconsistent */
+			addblk = newblk;
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			error = xfs_dir2_leafn_split(state, oldblk, newblk);
+			if (error)
+				return error;
+			addblk = newblk;
+			break;
+		case XFS_DA_NODE_MAGIC:
+			error = xfs_da3_node_split(state, oldblk, newblk, addblk,
+							 max - i, &action);
+			addblk->bp = NULL;
+			if (error)
+				return error;	/* GROT: dir is inconsistent */
+			/*
+			 * Record the newly split block for the next time thru?
+			 */
+			if (action)
+				addblk = newblk;
+			else
+				addblk = NULL;
+			break;
+		}
+
+		/*
+		 * Update the btree to show the new hashval for this child.
+		 */
+		xfs_da3_fixhashpath(state, &state->path);
+	}
+	if (!addblk)
+		return 0;
+
+	/*
+	 * Split the root node.
+	 */
+	ASSERT(state->path.active == 0);
+	oldblk = &state->path.blk[0];
+	error = xfs_da3_root_split(state, oldblk, addblk);
+	if (error) {
+		addblk->bp = NULL;
+		return error;	/* GROT: dir is inconsistent */
+	}
+
+	/*
+	 * Update pointers to the node which used to be block 0 and
+	 * just got bumped because of the addition of a new root node.
+	 * There might be three blocks involved if a double split occurred,
+	 * and the original block 0 could be at any position in the list.
+	 *
+	 * Note: the magic numbers and sibling pointers are in the same
+	 * physical place for both v2 and v3 headers (by design). Hence it
+	 * doesn't matter which version of the xfs_da_intnode structure we use
+	 * here as the result will be the same using either structure.
+	 */
+	node = oldblk->bp->b_addr;
+	if (node->hdr.info.forw) {
+		if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->b_addr;
+		node->hdr.info.back = cpu_to_be32(oldblk->blkno);
+		xfs_trans_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	node = oldblk->bp->b_addr;
+	if (node->hdr.info.back) {
+		if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->b_addr;
+		node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
+		xfs_trans_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	addblk->bp = NULL;
+	return 0;
+}
+
+/*
+ * Split the root.  We have to create a new root and point to the two
+ * parts (the split old root) that we just created.  Copy block zero to
+ * the EOF, extending the inode in process.
+ */
+STATIC int						/* error */
+xfs_da3_root_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da_intnode	*oldroot;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	struct xfs_inode	*dp;
+	struct xfs_trans	*tp;
+	struct xfs_dir2_leaf	*leaf;
+	xfs_dablk_t		blkno;
+	int			level;
+	int			error;
+	int			size;
+
+	trace_xfs_da_root_split(state->args);
+
+	/*
+	 * Copy the existing (incorrect) block from the root node position
+	 * to a free space somewhere.
+	 */
+	args = state->args;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		return error;
+
+	dp = args->dp;
+	tp = args->trans;
+	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+	if (error)
+		return error;
+	node = bp->b_addr;
+	oldroot = blk1->bp->b_addr;
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+		struct xfs_da3_icnode_hdr icnodehdr;
+
+		dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
+		btree = dp->d_ops->node_tree_p(oldroot);
+		size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
+		level = icnodehdr.level;
+
+		/*
+		 * we are about to copy oldroot to bp, so set up the type
+		 * of bp while we know exactly what it will be.
+		 */
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+	} else {
+		struct xfs_dir3_icleaf_hdr leafhdr;
+		struct xfs_dir2_leaf_entry *ents;
+
+		leaf = (xfs_dir2_leaf_t *)oldroot;
+		dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+		ents = dp->d_ops->leaf_ents_p(leaf);
+
+		ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+		       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+		size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+		level = 0;
+
+		/*
+		 * we are about to copy oldroot to bp, so set up the type
+		 * of bp while we know exactly what it will be.
+		 */
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+	}
+
+	/*
+	 * we can copy most of the information in the node from one block to
+	 * another, but for CRC enabled headers we have to make sure that the
+	 * block specific identifiers are kept intact. We update the buffer
+	 * directly for this.
+	 */
+	memcpy(node, oldroot, size);
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+		node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+	}
+	xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+	bp->b_ops = blk1->bp->b_ops;
+	xfs_trans_buf_copy_type(bp, blk1->bp);
+	blk1->bp = bp;
+	blk1->blkno = blkno;
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da3_node_create(args,
+		(args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
+		level + 1, &bp, args->whichfork);
+	if (error)
+		return error;
+
+	node = bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	btree = dp->d_ops->node_tree_p(node);
+	btree[0].hashval = cpu_to_be32(blk1->hashval);
+	btree[0].before = cpu_to_be32(blk1->blkno);
+	btree[1].hashval = cpu_to_be32(blk2->hashval);
+	btree[1].before = cpu_to_be32(blk2->blkno);
+	nodehdr.count = 2;
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+
+#ifdef DEBUG
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		ASSERT(blk1->blkno >= args->geo->leafblk &&
+		       blk1->blkno < args->geo->freeblk);
+		ASSERT(blk2->blkno >= args->geo->leafblk &&
+		       blk2->blkno < args->geo->freeblk);
+	}
+#endif
+
+	/* Header is already logged by xfs_da_node_create */
+	xfs_trans_log_buf(tp, bp,
+		XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
+
+	return 0;
+}
+
+/*
+ * Split the node, rebalance, then add the new entry.
+ */
+STATIC int						/* error */
+xfs_da3_node_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk,
+	struct xfs_da_state_blk	*addblk,
+	int			treelevel,
+	int			*result)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	xfs_dablk_t		blkno;
+	int			newcount;
+	int			error;
+	int			useextra;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_split(state->args);
+
+	node = oldblk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+
+	/*
+	 * With V2 dirs the extra block is data or freespace.
+	 */
+	useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
+	newcount = 1 + useextra;
+	/*
+	 * Do we have to split the node?
+	 */
+	if (nodehdr.count + newcount > state->args->geo->node_ents) {
+		/*
+		 * Allocate a new node, add to the doubly linked chain of
+		 * nodes, then move some of our excess entries into it.
+		 */
+		error = xfs_da_grow_inode(state->args, &blkno);
+		if (error)
+			return error;	/* GROT: dir is inconsistent */
+
+		error = xfs_da3_node_create(state->args, blkno, treelevel,
+					   &newblk->bp, state->args->whichfork);
+		if (error)
+			return error;	/* GROT: dir is inconsistent */
+		newblk->blkno = blkno;
+		newblk->magic = XFS_DA_NODE_MAGIC;
+		xfs_da3_node_rebalance(state, oldblk, newblk);
+		error = xfs_da3_blk_link(state, oldblk, newblk);
+		if (error)
+			return error;
+		*result = 1;
+	} else {
+		*result = 0;
+	}
+
+	/*
+	 * Insert the new entry(s) into the correct block
+	 * (updating last hashval in the process).
+	 *
+	 * xfs_da3_node_add() inserts BEFORE the given index,
+	 * and as a result of using node_lookup_int() we always
+	 * point to a valid entry (not after one), but a split
+	 * operation always results in a new block whose hashvals
+	 * FOLLOW the current block.
+	 *
+	 * If we had double-split op below us, then add the extra block too.
+	 */
+	node = oldblk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	if (oldblk->index <= nodehdr.count) {
+		oldblk->index++;
+		xfs_da3_node_add(state, oldblk, addblk);
+		if (useextra) {
+			if (state->extraafter)
+				oldblk->index++;
+			xfs_da3_node_add(state, oldblk, &state->extrablk);
+			state->extravalid = 0;
+		}
+	} else {
+		newblk->index++;
+		xfs_da3_node_add(state, newblk, addblk);
+		if (useextra) {
+			if (state->extraafter)
+				newblk->index++;
+			xfs_da3_node_add(state, newblk, &state->extrablk);
+			state->extravalid = 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ */
+STATIC void
+xfs_da3_node_rebalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
+{
+	struct xfs_da_intnode	*node1;
+	struct xfs_da_intnode	*node2;
+	struct xfs_da_intnode	*tmpnode;
+	struct xfs_da_node_entry *btree1;
+	struct xfs_da_node_entry *btree2;
+	struct xfs_da_node_entry *btree_s;
+	struct xfs_da_node_entry *btree_d;
+	struct xfs_da3_icnode_hdr nodehdr1;
+	struct xfs_da3_icnode_hdr nodehdr2;
+	struct xfs_trans	*tp;
+	int			count;
+	int			tmp;
+	int			swap = 0;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_rebalance(state->args);
+
+	node1 = blk1->bp->b_addr;
+	node2 = blk2->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+	dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+	btree1 = dp->d_ops->node_tree_p(node1);
+	btree2 = dp->d_ops->node_tree_p(node2);
+
+	/*
+	 * Figure out how many entries need to move, and in which direction.
+	 * Swap the nodes around if that makes it simpler.
+	 */
+	if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+	     (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+			be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
+		tmpnode = node1;
+		node1 = node2;
+		node2 = tmpnode;
+		dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+		dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+		btree1 = dp->d_ops->node_tree_p(node1);
+		btree2 = dp->d_ops->node_tree_p(node2);
+		swap = 1;
+	}
+
+	count = (nodehdr1.count - nodehdr2.count) / 2;
+	if (count == 0)
+		return;
+	tp = state->args->trans;
+	/*
+	 * Two cases: high-to-low and low-to-high.
+	 */
+	if (count > 0) {
+		/*
+		 * Move elements in node2 up to make a hole.
+		 */
+		tmp = nodehdr2.count;
+		if (tmp > 0) {
+			tmp *= (uint)sizeof(xfs_da_node_entry_t);
+			btree_s = &btree2[0];
+			btree_d = &btree2[count];
+			memmove(btree_d, btree_s, tmp);
+		}
+
+		/*
+		 * Move the req'd B-tree elements from high in node1 to
+		 * low in node2.
+		 */
+		nodehdr2.count += count;
+		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &btree1[nodehdr1.count - count];
+		btree_d = &btree2[0];
+		memcpy(btree_d, btree_s, tmp);
+		nodehdr1.count -= count;
+	} else {
+		/*
+		 * Move the req'd B-tree elements from low in node2 to
+		 * high in node1.
+		 */
+		count = -count;
+		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &btree2[0];
+		btree_d = &btree1[nodehdr1.count];
+		memcpy(btree_d, btree_s, tmp);
+		nodehdr1.count += count;
+
+		xfs_trans_log_buf(tp, blk1->bp,
+			XFS_DA_LOGRANGE(node1, btree_d, tmp));
+
+		/*
+		 * Move elements in node2 down to fill the hole.
+		 */
+		tmp  = nodehdr2.count - count;
+		tmp *= (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &btree2[count];
+		btree_d = &btree2[0];
+		memmove(btree_d, btree_s, tmp);
+		nodehdr2.count -= count;
+	}
+
+	/*
+	 * Log header of node 1 and all current bits of node 2.
+	 */
+	dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
+	xfs_trans_log_buf(tp, blk1->bp,
+		XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
+
+	dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
+	xfs_trans_log_buf(tp, blk2->bp,
+		XFS_DA_LOGRANGE(node2, &node2->hdr,
+				dp->d_ops->node_hdr_size +
+				(sizeof(btree2[0]) * nodehdr2.count)));
+
+	/*
+	 * Record the last hashval from each block for upward propagation.
+	 * (note: don't use the swapped node pointers)
+	 */
+	if (swap) {
+		node1 = blk1->bp->b_addr;
+		node2 = blk2->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+		dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+		btree1 = dp->d_ops->node_tree_p(node1);
+		btree2 = dp->d_ops->node_tree_p(node2);
+	}
+	blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+	blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 */
+	if (blk1->index >= nodehdr1.count) {
+		blk2->index = blk1->index - nodehdr1.count;
+		blk1->index = nodehdr1.count + 1;	/* make it invalid */
+	}
+}
+
+/*
+ * Add a new entry to an intermediate node.
+ */
+STATIC void
+xfs_da3_node_add(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_node_entry *btree;
+	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_add(state->args);
+
+	node = oldblk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	btree = dp->d_ops->node_tree_p(node);
+
+	ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
+	ASSERT(newblk->blkno != 0);
+	if (state->args->whichfork == XFS_DATA_FORK)
+		ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+		       newblk->blkno < state->args->geo->freeblk);
+
+	/*
+	 * We may need to make some room before we insert the new node.
+	 */
+	tmp = 0;
+	if (oldblk->index < nodehdr.count) {
+		tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+		memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
+	}
+	btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+	btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
+	xfs_trans_log_buf(state->args->trans, oldblk->bp,
+		XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+				tmp + sizeof(*btree)));
+
+	nodehdr.count += 1;
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+	xfs_trans_log_buf(state->args->trans, oldblk->bp,
+		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+	/*
+	 * Copy the last hash value from the oldblk to propagate upwards.
+	 */
+	oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ */
+int
+xfs_da3_join(
+	struct xfs_da_state	*state)
+{
+	struct xfs_da_state_blk	*drop_blk;
+	struct xfs_da_state_blk	*save_blk;
+	int			action = 0;
+	int			error;
+
+	trace_xfs_da_join(state->args);
+
+	drop_blk = &state->path.blk[ state->path.active-1 ];
+	save_blk = &state->altpath.blk[ state->path.active-1 ];
+	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+	       drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+	/*
+	 * Walk back up the tree joining/deallocating as necessary.
+	 * When we stop dropping blocks, break out.
+	 */
+	for (  ; state->path.active >= 2; drop_blk--, save_blk--,
+		 state->path.active--) {
+		/*
+		 * See if we can combine the block with a neighbor.
+		 *   (action == 0) => no options, just leave
+		 *   (action == 1) => coalesce, then unlink
+		 *   (action == 2) => block empty, unlink it
+		 */
+		switch (drop_blk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+			error = xfs_attr3_leaf_toosmall(state, &action);
+			if (error)
+				return error;
+			if (action == 0)
+				return 0;
+			xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			error = xfs_dir2_leafn_toosmall(state, &action);
+			if (error)
+				return error;
+			if (action == 0)
+				return 0;
+			xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+			break;
+		case XFS_DA_NODE_MAGIC:
+			/*
+			 * Remove the offending node, fixup hashvals,
+			 * check for a toosmall neighbor.
+			 */
+			xfs_da3_node_remove(state, drop_blk);
+			xfs_da3_fixhashpath(state, &state->path);
+			error = xfs_da3_node_toosmall(state, &action);
+			if (error)
+				return error;
+			if (action == 0)
+				return 0;
+			xfs_da3_node_unbalance(state, drop_blk, save_blk);
+			break;
+		}
+		xfs_da3_fixhashpath(state, &state->altpath);
+		error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
+		xfs_da_state_kill_altpath(state);
+		if (error)
+			return error;
+		error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+							 drop_blk->bp);
+		drop_blk->bp = NULL;
+		if (error)
+			return error;
+	}
+	/*
+	 * We joined all the way to the top.  If it turns out that
+	 * we only have one entry in the root, make the child block
+	 * the new root.
+	 */
+	xfs_da3_node_remove(state, drop_blk);
+	xfs_da3_fixhashpath(state, &state->path);
+	error = xfs_da3_root_join(state, &state->path.blk[0]);
+	return error;
+}
+
+#ifdef	DEBUG
+static void
+xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
+{
+	__be16	magic = blkinfo->magic;
+
+	if (level == 1) {
+		ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+		       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+		       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+		       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+	} else {
+		ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+		       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+	}
+	ASSERT(!blkinfo->forw);
+	ASSERT(!blkinfo->back);
+}
+#else	/* !DEBUG */
+#define	xfs_da_blkinfo_onlychild_validate(blkinfo, level)
+#endif	/* !DEBUG */
+
+/*
+ * We have only one entry in the root.  Copy the only remaining child of
+ * the old root to block 0 as the new root node.
+ */
+STATIC int
+xfs_da3_root_join(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*root_blk)
+{
+	struct xfs_da_intnode	*oldroot;
+	struct xfs_da_args	*args;
+	xfs_dablk_t		child;
+	struct xfs_buf		*bp;
+	struct xfs_da3_icnode_hdr oldroothdr;
+	struct xfs_da_node_entry *btree;
+	int			error;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_root_join(state->args);
+
+	ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+
+	args = state->args;
+	oldroot = root_blk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
+	ASSERT(oldroothdr.forw == 0);
+	ASSERT(oldroothdr.back == 0);
+
+	/*
+	 * If the root has more than one child, then don't do anything.
+	 */
+	if (oldroothdr.count > 1)
+		return 0;
+
+	/*
+	 * Read in the (only) child block, then copy those bytes into
+	 * the root block's buffer and free the original child block.
+	 */
+	btree = dp->d_ops->node_tree_p(oldroot);
+	child = be32_to_cpu(btree[0].before);
+	ASSERT(child != 0);
+	error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
+					     args->whichfork);
+	if (error)
+		return error;
+	xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
+
+	/*
+	 * This could be copying a leaf back into the root block in the case of
+	 * there only being a single leaf block left in the tree. Hence we have
+	 * to update the b_ops pointer as well to match the buffer type change
+	 * that could occur. For dir3 blocks we also need to update the block
+	 * number in the buffer header.
+	 */
+	memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
+	root_blk->bp->b_ops = bp->b_ops;
+	xfs_trans_buf_copy_type(root_blk->bp, bp);
+	if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+		struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+		da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+	}
+	xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+			  args->geo->blksize - 1);
+	error = xfs_da_shrink_inode(args, child, bp);
+	return error;
+}
+
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+STATIC int
+xfs_da3_node_toosmall(
+	struct xfs_da_state	*state,
+	int			*action)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*info;
+	xfs_dablk_t		blkno;
+	struct xfs_buf		*bp;
+	struct xfs_da3_icnode_hdr nodehdr;
+	int			count;
+	int			forward;
+	int			error;
+	int			retval;
+	int			i;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_toosmall(state->args);
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->b_addr;
+	node = (xfs_da_intnode_t *)info;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
+		*action = 0;	/* blk over 50%, don't try to join */
+		return 0;	/* blk over 50%, don't try to join */
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (nodehdr.count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (info->forw != 0);
+		memcpy(&state->altpath, &state->path, sizeof(state->path));
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return error;
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return 0;
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	count  = state->args->geo->node_ents;
+	count -= state->args->geo->node_ents >> 2;
+	count -= nodehdr.count;
+
+	/* start with smaller blk num */
+	forward = nodehdr.forw < nodehdr.back;
+	for (i = 0; i < 2; forward = !forward, i++) {
+		struct xfs_da3_icnode_hdr thdr;
+		if (forward)
+			blkno = nodehdr.forw;
+		else
+			blkno = nodehdr.back;
+		if (blkno == 0)
+			continue;
+		error = xfs_da3_node_read(state->args->trans, dp,
+					blkno, -1, &bp, state->args->whichfork);
+		if (error)
+			return error;
+
+		node = bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&thdr, node);
+		xfs_trans_brelse(state->args->trans, bp);
+
+		if (count - thdr.count >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;
+		return 0;
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	memcpy(&state->altpath, &state->path, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da3_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return error;
+	if (retval) {
+		*action = 0;
+		return 0;
+	}
+	*action = 1;
+	return 0;
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp,
+	int			*count)
+{
+	struct xfs_da_intnode	 *node;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+
+	node = bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	if (count)
+		*count = nodehdr.count;
+	if (!nodehdr.count)
+		return 0;
+	btree = dp->d_ops->node_tree_p(node);
+	return be32_to_cpu(btree[nodehdr.count - 1].hashval);
+}
+
+/*
+ * Walk back up the tree adjusting hash values as necessary,
+ * when we stop making changes, return.
+ */
+void
+xfs_da3_fixhashpath(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_path *path)
+{
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_node_entry *btree;
+	xfs_dahash_t		lasthash=0;
+	int			level;
+	int			count;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_fixhashpath(state->args);
+
+	level = path->active-1;
+	blk = &path->blk[ level ];
+	switch (blk->magic) {
+	case XFS_ATTR_LEAF_MAGIC:
+		lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	case XFS_DA_NODE_MAGIC:
+		lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	}
+	for (blk--, level--; level >= 0; blk--, level--) {
+		struct xfs_da3_icnode_hdr nodehdr;
+
+		node = blk->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+		btree = dp->d_ops->node_tree_p(node);
+		if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
+			break;
+		blk->hashval = lasthash;
+		btree[blk->index].hashval = cpu_to_be32(lasthash);
+		xfs_trans_log_buf(state->args->trans, blk->bp,
+				  XFS_DA_LOGRANGE(node, &btree[blk->index],
+						  sizeof(*btree)));
+
+		lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+	}
+}
+
+/*
+ * Remove an entry from an intermediate node.
+ */
+STATIC void
+xfs_da3_node_remove(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_node_entry *btree;
+	int			index;
+	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_remove(state->args);
+
+	node = drop_blk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	ASSERT(drop_blk->index < nodehdr.count);
+	ASSERT(drop_blk->index >= 0);
+
+	/*
+	 * Copy over the offending entry, or just zero it out.
+	 */
+	index = drop_blk->index;
+	btree = dp->d_ops->node_tree_p(node);
+	if (index < nodehdr.count - 1) {
+		tmp  = nodehdr.count - index - 1;
+		tmp *= (uint)sizeof(xfs_da_node_entry_t);
+		memmove(&btree[index], &btree[index + 1], tmp);
+		xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+		    XFS_DA_LOGRANGE(node, &btree[index], tmp));
+		index = nodehdr.count - 1;
+	}
+	memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
+	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+	    XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+	nodehdr.count -= 1;
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+	    XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+	/*
+	 * Copy the last hash value from the block to propagate upwards.
+	 */
+	drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
+}
+
+/*
+ * Unbalance the elements between two intermediate nodes,
+ * move all Btree elements from one node into another.
+ */
+STATIC void
+xfs_da3_node_unbalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
+{
+	struct xfs_da_intnode	*drop_node;
+	struct xfs_da_intnode	*save_node;
+	struct xfs_da_node_entry *drop_btree;
+	struct xfs_da_node_entry *save_btree;
+	struct xfs_da3_icnode_hdr drop_hdr;
+	struct xfs_da3_icnode_hdr save_hdr;
+	struct xfs_trans	*tp;
+	int			sindex;
+	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_unbalance(state->args);
+
+	drop_node = drop_blk->bp->b_addr;
+	save_node = save_blk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
+	dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
+	drop_btree = dp->d_ops->node_tree_p(drop_node);
+	save_btree = dp->d_ops->node_tree_p(save_node);
+	tp = state->args->trans;
+
+	/*
+	 * If the dying block has lower hashvals, then move all the
+	 * elements in the remaining block up to make a hole.
+	 */
+	if ((be32_to_cpu(drop_btree[0].hashval) <
+			be32_to_cpu(save_btree[0].hashval)) ||
+	    (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+			be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+		/* XXX: check this - is memmove dst correct? */
+		tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+		memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+		sindex = 0;
+		xfs_trans_log_buf(tp, save_blk->bp,
+			XFS_DA_LOGRANGE(save_node, &save_btree[0],
+				(save_hdr.count + drop_hdr.count) *
+						sizeof(xfs_da_node_entry_t)));
+	} else {
+		sindex = save_hdr.count;
+		xfs_trans_log_buf(tp, save_blk->bp,
+			XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+				drop_hdr.count * sizeof(xfs_da_node_entry_t)));
+	}
+
+	/*
+	 * Move all the B-tree elements from drop_blk to save_blk.
+	 */
+	tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+	memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+	save_hdr.count += drop_hdr.count;
+
+	dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
+	xfs_trans_log_buf(tp, save_blk->bp,
+		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+				dp->d_ops->node_hdr_size));
+
+	/*
+	 * Save the last hashval in the remaining block for upward propagation.
+	 */
+	save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashval's so for each entry in the current
+ * node that could contain the desired hashval, descend.  This is a
+ * pruned depth-first tree search.
+ */
+int							/* error */
+xfs_da3_node_lookup_int(
+	struct xfs_da_state	*state,
+	int			*result)
+{
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*curr;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_args	*args;
+	xfs_dablk_t		blkno;
+	xfs_dahash_t		hashval;
+	xfs_dahash_t		btreehashval;
+	int			probe;
+	int			span;
+	int			max;
+	int			error;
+	int			retval;
+	struct xfs_inode	*dp = state->args->dp;
+
+	args = state->args;
+
+	/*
+	 * Descend thru the B-tree searching each level for the right
+	 * node to use, until the right hashval is found.
+	 */
+	blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
+	for (blk = &state->path.blk[0], state->path.active = 1;
+			 state->path.active <= XFS_DA_NODE_MAXDEPTH;
+			 blk++, state->path.active++) {
+		/*
+		 * Read the next node down in the tree.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da3_node_read(args->trans, args->dp, blkno,
+					-1, &blk->bp, args->whichfork);
+		if (error) {
+			blk->blkno = 0;
+			state->path.active--;
+			return error;
+		}
+		curr = blk->bp->b_addr;
+		blk->magic = be16_to_cpu(curr->magic);
+
+		if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
+		    blk->magic == XFS_ATTR3_LEAF_MAGIC) {
+			blk->magic = XFS_ATTR_LEAF_MAGIC;
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+			break;
+		}
+
+		if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+		    blk->magic == XFS_DIR3_LEAFN_MAGIC) {
+			blk->magic = XFS_DIR2_LEAFN_MAGIC;
+			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+							       blk->bp, NULL);
+			break;
+		}
+
+		blk->magic = XFS_DA_NODE_MAGIC;
+
+
+		/*
+		 * Search an intermediate node for a match.
+		 */
+		node = blk->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+		btree = dp->d_ops->node_tree_p(node);
+
+		max = nodehdr.count;
+		blk->hashval = be32_to_cpu(btree[max - 1].hashval);
+
+		/*
+		 * Binary search.  (note: small blocks will skip loop)
+		 */
+		probe = span = max / 2;
+		hashval = args->hashval;
+		while (span > 4) {
+			span /= 2;
+			btreehashval = be32_to_cpu(btree[probe].hashval);
+			if (btreehashval < hashval)
+				probe += span;
+			else if (btreehashval > hashval)
+				probe -= span;
+			else
+				break;
+		}
+		ASSERT((probe >= 0) && (probe < max));
+		ASSERT((span <= 4) ||
+			(be32_to_cpu(btree[probe].hashval) == hashval));
+
+		/*
+		 * Since we may have duplicate hashval's, find the first
+		 * matching hashval in the node.
+		 */
+		while (probe > 0 &&
+		       be32_to_cpu(btree[probe].hashval) >= hashval) {
+			probe--;
+		}
+		while (probe < max &&
+		       be32_to_cpu(btree[probe].hashval) < hashval) {
+			probe++;
+		}
+
+		/*
+		 * Pick the right block to descend on.
+		 */
+		if (probe == max) {
+			blk->index = max - 1;
+			blkno = be32_to_cpu(btree[max - 1].before);
+		} else {
+			blk->index = probe;
+			blkno = be32_to_cpu(btree[probe].before);
+		}
+	}
+
+	/*
+	 * A leaf block that ends in the hashval that we are interested in
+	 * (final hashval == search hashval) means that the next block may
+	 * contain more entries with the same hashval, shift upward to the
+	 * next leaf and keep searching.
+	 */
+	for (;;) {
+		if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+			retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
+							&blk->index, state);
+		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+			retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
+			blk->index = args->index;
+			args->blkno = blk->blkno;
+		} else {
+			ASSERT(0);
+			return -EFSCORRUPTED;
+		}
+		if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
+		    (blk->hashval == args->hashval)) {
+			error = xfs_da3_path_shift(state, &state->path, 1, 1,
+							 &retval);
+			if (error)
+				return error;
+			if (retval == 0) {
+				continue;
+			} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+				/* path_shift() gives ENOENT */
+				retval = -ENOATTR;
+			}
+		}
+		break;
+	}
+	*result = retval;
+	return 0;
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da3_node_order(
+	struct xfs_inode *dp,
+	struct xfs_buf	*node1_bp,
+	struct xfs_buf	*node2_bp)
+{
+	struct xfs_da_intnode	*node1;
+	struct xfs_da_intnode	*node2;
+	struct xfs_da_node_entry *btree1;
+	struct xfs_da_node_entry *btree2;
+	struct xfs_da3_icnode_hdr node1hdr;
+	struct xfs_da3_icnode_hdr node2hdr;
+
+	node1 = node1_bp->b_addr;
+	node2 = node2_bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
+	dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
+	btree1 = dp->d_ops->node_tree_p(node1);
+	btree2 = dp->d_ops->node_tree_p(node2);
+
+	if (node1hdr.count > 0 && node2hdr.count > 0 &&
+	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+	     (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+	      be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Link a new block into a doubly linked list of blocks (of whatever type).
+ */
+int							/* error */
+xfs_da3_blk_link(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*old_blk,
+	struct xfs_da_state_blk	*new_blk)
+{
+	struct xfs_da_blkinfo	*old_info;
+	struct xfs_da_blkinfo	*new_info;
+	struct xfs_da_blkinfo	*tmp_info;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	int			before = 0;
+	int			error;
+	struct xfs_inode	*dp = state->args->dp;
+
+	/*
+	 * Set up environment.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	old_info = old_blk->bp->b_addr;
+	new_info = new_blk->bp->b_addr;
+	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+	       old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+	switch (old_blk->magic) {
+	case XFS_ATTR_LEAF_MAGIC:
+		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
+		break;
+	case XFS_DA_NODE_MAGIC:
+		before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
+		break;
+	}
+
+	/*
+	 * Link blocks in appropriate order.
+	 */
+	if (before) {
+		/*
+		 * Link new block in before existing block.
+		 */
+		trace_xfs_da_link_before(args);
+		new_info->forw = cpu_to_be32(old_blk->blkno);
+		new_info->back = old_info->back;
+		if (old_info->back) {
+			error = xfs_da3_node_read(args->trans, dp,
+						be32_to_cpu(old_info->back),
+						-1, &bp, args->whichfork);
+			if (error)
+				return error;
+			ASSERT(bp != NULL);
+			tmp_info = bp->b_addr;
+			ASSERT(tmp_info->magic == old_info->magic);
+			ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
+			tmp_info->forw = cpu_to_be32(new_blk->blkno);
+			xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+		}
+		old_info->back = cpu_to_be32(new_blk->blkno);
+	} else {
+		/*
+		 * Link new block in after existing block.
+		 */
+		trace_xfs_da_link_after(args);
+		new_info->forw = old_info->forw;
+		new_info->back = cpu_to_be32(old_blk->blkno);
+		if (old_info->forw) {
+			error = xfs_da3_node_read(args->trans, dp,
+						be32_to_cpu(old_info->forw),
+						-1, &bp, args->whichfork);
+			if (error)
+				return error;
+			ASSERT(bp != NULL);
+			tmp_info = bp->b_addr;
+			ASSERT(tmp_info->magic == old_info->magic);
+			ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
+			tmp_info->back = cpu_to_be32(new_blk->blkno);
+			xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+		}
+		old_info->forw = cpu_to_be32(new_blk->blkno);
+	}
+
+	xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+	xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+	return 0;
+}
+
+/*
+ * Unlink a block from a doubly linked list of blocks.
+ */
+STATIC int						/* error */
+xfs_da3_blk_unlink(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
+{
+	struct xfs_da_blkinfo	*drop_info;
+	struct xfs_da_blkinfo	*save_info;
+	struct xfs_da_blkinfo	*tmp_info;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	int			error;
+
+	/*
+	 * Set up environment.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	save_info = save_blk->bp->b_addr;
+	drop_info = drop_blk->bp->b_addr;
+	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+	       save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == drop_blk->magic);
+	ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
+	       (be32_to_cpu(save_info->back) == drop_blk->blkno));
+	ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
+	       (be32_to_cpu(drop_info->back) == save_blk->blkno));
+
+	/*
+	 * Unlink the leaf block from the doubly linked chain of leaves.
+	 */
+	if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+		trace_xfs_da_unlink_back(args);
+		save_info->back = drop_info->back;
+		if (drop_info->back) {
+			error = xfs_da3_node_read(args->trans, args->dp,
+						be32_to_cpu(drop_info->back),
+						-1, &bp, args->whichfork);
+			if (error)
+				return error;
+			ASSERT(bp != NULL);
+			tmp_info = bp->b_addr;
+			ASSERT(tmp_info->magic == save_info->magic);
+			ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
+			tmp_info->forw = cpu_to_be32(save_blk->blkno);
+			xfs_trans_log_buf(args->trans, bp, 0,
+						    sizeof(*tmp_info) - 1);
+		}
+	} else {
+		trace_xfs_da_unlink_forward(args);
+		save_info->forw = drop_info->forw;
+		if (drop_info->forw) {
+			error = xfs_da3_node_read(args->trans, args->dp,
+						be32_to_cpu(drop_info->forw),
+						-1, &bp, args->whichfork);
+			if (error)
+				return error;
+			ASSERT(bp != NULL);
+			tmp_info = bp->b_addr;
+			ASSERT(tmp_info->magic == save_info->magic);
+			ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
+			tmp_info->back = cpu_to_be32(save_blk->blkno);
+			xfs_trans_log_buf(args->trans, bp, 0,
+						    sizeof(*tmp_info) - 1);
+		}
+	}
+
+	xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+	return 0;
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ */
+int							/* error */
+xfs_da3_path_shift(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_path *path,
+	int			forward,
+	int			release,
+	int			*result)
+{
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*info;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_args	*args;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	xfs_dablk_t		blkno = 0;
+	int			level;
+	int			error;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_path_shift(state->args);
+
+	/*
+	 * Roll up the Btree looking for the first block where our
+	 * current index is not at the edge of the block.  Note that
+	 * we skip the bottom layer because we want the sibling block.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(path != NULL);
+	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	level = (path->active-1) - 1;	/* skip bottom layer in path */
+	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+		node = blk->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+		btree = dp->d_ops->node_tree_p(node);
+
+		if (forward && (blk->index < nodehdr.count - 1)) {
+			blk->index++;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		} else if (!forward && (blk->index > 0)) {
+			blk->index--;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		}
+	}
+	if (level < 0) {
+		*result = -ENOENT;	/* we're out of our tree */
+		ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+		return 0;
+	}
+
+	/*
+	 * Roll down the edge of the subtree until we reach the
+	 * same depth we were at originally.
+	 */
+	for (blk++, level++; level < path->active; blk++, level++) {
+		/*
+		 * Release the old block.
+		 * (if it's dirty, trans won't actually let go)
+		 */
+		if (release)
+			xfs_trans_brelse(args->trans, blk->bp);
+
+		/*
+		 * Read the next child block.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da3_node_read(args->trans, dp, blkno, -1,
+					&blk->bp, args->whichfork);
+		if (error)
+			return error;
+		info = blk->bp->b_addr;
+		ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+
+		/*
+		 * Note: we flatten the magic number to a single type so we
+		 * don't have to compare against crc/non-crc types elsewhere.
+		 */
+		switch (be16_to_cpu(info->magic)) {
+		case XFS_DA_NODE_MAGIC:
+		case XFS_DA3_NODE_MAGIC:
+			blk->magic = XFS_DA_NODE_MAGIC;
+			node = (xfs_da_intnode_t *)info;
+			dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+			btree = dp->d_ops->node_tree_p(node);
+			blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+			if (forward)
+				blk->index = 0;
+			else
+				blk->index = nodehdr.count - 1;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			blk->magic = XFS_ATTR_LEAF_MAGIC;
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			blk->magic = XFS_DIR2_LEAFN_MAGIC;
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+							       blk->bp, NULL);
+			break;
+		default:
+			ASSERT(0);
+			break;
+		}
+	}
+	*result = 0;
+	return 0;
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(const __uint8_t *name, int namelen)
+{
+	xfs_dahash_t hash;
+
+	/*
+	 * Do four characters at a time as long as we can.
+	 */
+	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
+		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+		       (name[3] << 0) ^ rol32(hash, 7 * 4);
+
+	/*
+	 * Now do the rest of the characters.
+	 */
+	switch (namelen) {
+	case 3:
+		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+		       rol32(hash, 7 * 3);
+	case 2:
+		return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
+	case 1:
+		return (name[0] << 0) ^ rol32(hash, 7 * 1);
+	default: /* case 0: */
+		return hash;
+	}
+}
+
+enum xfs_dacmp
+xfs_da_compname(
+	struct xfs_da_args *args,
+	const unsigned char *name,
+	int		len)
+{
+	return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+					XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+static xfs_dahash_t
+xfs_default_hashname(
+	struct xfs_name	*name)
+{
+	return xfs_da_hashname(name->name, name->len);
+}
+
+const struct xfs_nameops xfs_default_nameops = {
+	.hashname	= xfs_default_hashname,
+	.compname	= xfs_da_compname
+};
+
+int
+xfs_da_grow_inode_int(
+	struct xfs_da_args	*args,
+	xfs_fileoff_t		*bno,
+	int			count)
+{
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_inode	*dp = args->dp;
+	int			w = args->whichfork;
+	xfs_rfsblock_t		nblks = dp->i_d.di_nblocks;
+	struct xfs_bmbt_irec	map, *mapp;
+	int			nmap, error, got, i, mapi;
+
+	/*
+	 * Find a spot in the file space to put the new block.
+	 */
+	error = xfs_bmap_first_unused(tp, dp, count, bno, w);
+	if (error)
+		return error;
+
+	/*
+	 * Try mapping it in one filesystem block.
+	 */
+	nmap = 1;
+	ASSERT(args->firstblock != NULL);
+	error = xfs_bmapi_write(tp, dp, *bno, count,
+			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+			args->firstblock, args->total, &map, &nmap,
+			args->flist);
+	if (error)
+		return error;
+
+	ASSERT(nmap <= 1);
+	if (nmap == 1) {
+		mapp = &map;
+		mapi = 1;
+	} else if (nmap == 0 && count > 1) {
+		xfs_fileoff_t		b;
+		int			c;
+
+		/*
+		 * If we didn't get it and the block might work if fragmented,
+		 * try without the CONTIG flag.  Loop until we get it all.
+		 */
+		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		for (b = *bno, mapi = 0; b < *bno + count; ) {
+			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+			c = (int)(*bno + count - b);
+			error = xfs_bmapi_write(tp, dp, b, c,
+					xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+					args->firstblock, args->total,
+					&mapp[mapi], &nmap, args->flist);
+			if (error)
+				goto out_free_map;
+			if (nmap < 1)
+				break;
+			mapi += nmap;
+			b = mapp[mapi - 1].br_startoff +
+			    mapp[mapi - 1].br_blockcount;
+		}
+	} else {
+		mapi = 0;
+		mapp = NULL;
+	}
+
+	/*
+	 * Count the blocks we got, make sure it matches the total.
+	 */
+	for (i = 0, got = 0; i < mapi; i++)
+		got += mapp[i].br_blockcount;
+	if (got != count || mapp[0].br_startoff != *bno ||
+	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+	    *bno + count) {
+		error = -ENOSPC;
+		goto out_free_map;
+	}
+
+	/* account for newly allocated blocks in reserved blocks total */
+	args->total -= dp->i_d.di_nblocks - nblks;
+
+out_free_map:
+	if (mapp != &map)
+		kmem_free(mapp);
+	return error;
+}
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		*new_blkno)
+{
+	xfs_fileoff_t		bno;
+	int			error;
+
+	trace_xfs_da_grow_inode(args);
+
+	bno = args->geo->leafblk;
+	error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
+	if (!error)
+		*new_blkno = (xfs_dablk_t)bno;
+	return error;
+}
+
+/*
+ * Ick.  We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file.  The
+ * last block in the file can always be removed since it can't cause
+ * a bmap btree split to do that.
+ */
+STATIC int
+xfs_da3_swap_lastblock(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		*dead_blknop,
+	struct xfs_buf		**dead_bufp)
+{
+	struct xfs_da_blkinfo	*dead_info;
+	struct xfs_da_blkinfo	*sib_info;
+	struct xfs_da_intnode	*par_node;
+	struct xfs_da_intnode	*dead_node;
+	struct xfs_dir2_leaf	*dead_leaf2;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr par_hdr;
+	struct xfs_inode	*dp;
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp;
+	struct xfs_buf		*dead_buf;
+	struct xfs_buf		*last_buf;
+	struct xfs_buf		*sib_buf;
+	struct xfs_buf		*par_buf;
+	xfs_dahash_t		dead_hash;
+	xfs_fileoff_t		lastoff;
+	xfs_dablk_t		dead_blkno;
+	xfs_dablk_t		last_blkno;
+	xfs_dablk_t		sib_blkno;
+	xfs_dablk_t		par_blkno;
+	int			error;
+	int			w;
+	int			entno;
+	int			level;
+	int			dead_level;
+
+	trace_xfs_da_swap_lastblock(args);
+
+	dead_buf = *dead_bufp;
+	dead_blkno = *dead_blknop;
+	tp = args->trans;
+	dp = args->dp;
+	w = args->whichfork;
+	ASSERT(w == XFS_DATA_FORK);
+	mp = dp->i_mount;
+	lastoff = args->geo->freeblk;
+	error = xfs_bmap_last_before(tp, dp, &lastoff, w);
+	if (error)
+		return error;
+	if (unlikely(lastoff == 0)) {
+		XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
+				 mp);
+		return -EFSCORRUPTED;
+	}
+	/*
+	 * Read the last block in the btree space.
+	 */
+	last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+	error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
+	if (error)
+		return error;
+	/*
+	 * Copy the last block into the dead buffer and log it.
+	 */
+	memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+	xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
+	dead_info = dead_buf->b_addr;
+	/*
+	 * Get values from the moved block.
+	 */
+	if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	    dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_dir3_icleaf_hdr leafhdr;
+		struct xfs_dir2_leaf_entry *ents;
+
+		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+		dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+		ents = dp->d_ops->leaf_ents_p(dead_leaf2);
+		dead_level = 0;
+		dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
+	} else {
+		struct xfs_da3_icnode_hdr deadhdr;
+
+		dead_node = (xfs_da_intnode_t *)dead_info;
+		dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
+		btree = dp->d_ops->node_tree_p(dead_node);
+		dead_level = deadhdr.level;
+		dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
+	}
+	sib_buf = par_buf = NULL;
+	/*
+	 * If the moved block has a left sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+		if (error)
+			goto done;
+		sib_info = sib_buf->b_addr;
+		if (unlikely(
+		    be32_to_cpu(sib_info->forw) != last_blkno ||
+		    sib_info->magic != dead_info->magic)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		sib_info->forw = cpu_to_be32(dead_blkno);
+		xfs_trans_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+					sizeof(sib_info->forw)));
+		sib_buf = NULL;
+	}
+	/*
+	 * If the moved block has a right sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+		if (error)
+			goto done;
+		sib_info = sib_buf->b_addr;
+		if (unlikely(
+		       be32_to_cpu(sib_info->back) != last_blkno ||
+		       sib_info->magic != dead_info->magic)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		sib_info->back = cpu_to_be32(dead_blkno);
+		xfs_trans_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+					sizeof(sib_info->back)));
+		sib_buf = NULL;
+	}
+	par_blkno = args->geo->leafblk;
+	level = -1;
+	/*
+	 * Walk down the tree looking for the parent of the moved block.
+	 */
+	for (;;) {
+		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+		if (error)
+			goto done;
+		par_node = par_buf->b_addr;
+		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+		if (level >= 0 && level != par_hdr.level + 1) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		level = par_hdr.level;
+		btree = dp->d_ops->node_tree_p(par_node);
+		for (entno = 0;
+		     entno < par_hdr.count &&
+		     be32_to_cpu(btree[entno].hashval) < dead_hash;
+		     entno++)
+			continue;
+		if (entno == par_hdr.count) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		par_blkno = be32_to_cpu(btree[entno].before);
+		if (level == dead_level + 1)
+			break;
+		xfs_trans_brelse(tp, par_buf);
+		par_buf = NULL;
+	}
+	/*
+	 * We're in the right parent block.
+	 * Look for the right entry.
+	 */
+	for (;;) {
+		for (;
+		     entno < par_hdr.count &&
+		     be32_to_cpu(btree[entno].before) != last_blkno;
+		     entno++)
+			continue;
+		if (entno < par_hdr.count)
+			break;
+		par_blkno = par_hdr.forw;
+		xfs_trans_brelse(tp, par_buf);
+		par_buf = NULL;
+		if (unlikely(par_blkno == 0)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+		if (error)
+			goto done;
+		par_node = par_buf->b_addr;
+		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+		if (par_hdr.level != level) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		btree = dp->d_ops->node_tree_p(par_node);
+		entno = 0;
+	}
+	/*
+	 * Update the parent entry pointing to the moved block.
+	 */
+	btree[entno].before = cpu_to_be32(dead_blkno);
+	xfs_trans_log_buf(tp, par_buf,
+		XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+				sizeof(btree[entno].before)));
+	*dead_blknop = last_blkno;
+	*dead_bufp = last_buf;
+	return 0;
+done:
+	if (par_buf)
+		xfs_trans_brelse(tp, par_buf);
+	if (sib_buf)
+		xfs_trans_brelse(tp, sib_buf);
+	xfs_trans_brelse(tp, last_buf);
+	return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ */
+int
+xfs_da_shrink_inode(
+	xfs_da_args_t	*args,
+	xfs_dablk_t	dead_blkno,
+	struct xfs_buf	*dead_buf)
+{
+	xfs_inode_t *dp;
+	int done, error, w, count;
+	xfs_trans_t *tp;
+
+	trace_xfs_da_shrink_inode(args);
+
+	dp = args->dp;
+	w = args->whichfork;
+	tp = args->trans;
+	count = args->geo->fsbcount;
+	for (;;) {
+		/*
+		 * Remove extents.  If we get ENOSPC for a dir we have to move
+		 * the last block to the place we want to kill.
+		 */
+		error = xfs_bunmapi(tp, dp, dead_blkno, count,
+				    xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+				    0, args->firstblock, args->flist, &done);
+		if (error == -ENOSPC) {
+			if (w != XFS_DATA_FORK)
+				break;
+			error = xfs_da3_swap_lastblock(args, &dead_blkno,
+						      &dead_buf);
+			if (error)
+				break;
+		} else {
+			break;
+		}
+	}
+	xfs_trans_binval(tp, dead_buf);
+	return error;
+}
+
+/*
+ * See if the mapping(s) for this btree block are valid, i.e.
+ * don't contain holes, are logically contiguous, and cover the whole range.
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+	int		nmap,
+	xfs_bmbt_irec_t	*mapp,
+	xfs_dablk_t	bno,
+	int		count)
+{
+	int		i;
+	xfs_fileoff_t	off;
+
+	for (i = 0, off = bno; i < nmap; i++) {
+		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
+		    mapp[i].br_startblock == DELAYSTARTBLOCK) {
+			return 0;
+		}
+		if (off != mapp[i].br_startoff) {
+			return 0;
+		}
+		off += mapp[i].br_blockcount;
+	}
+	return off == bno + count;
+}
+
+/*
+ * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
+ *
+ * For the single map case, it is assumed that the caller has provided a pointer
+ * to a valid xfs_buf_map.  For the multiple map case, this function will
+ * allocate the xfs_buf_map to hold all the maps and replace the caller's single
+ * map pointer with the allocated map.
+ */
+static int
+xfs_buf_map_from_irec(
+	struct xfs_mount	*mp,
+	struct xfs_buf_map	**mapp,
+	int			*nmaps,
+	struct xfs_bmbt_irec	*irecs,
+	int			nirecs)
+{
+	struct xfs_buf_map	*map;
+	int			i;
+
+	ASSERT(*nmaps == 1);
+	ASSERT(nirecs >= 1);
+
+	if (nirecs > 1) {
+		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+				  KM_SLEEP | KM_NOFS);
+		if (!map)
+			return -ENOMEM;
+		*mapp = map;
+	}
+
+	*nmaps = nirecs;
+	map = *mapp;
+	for (i = 0; i < *nmaps; i++) {
+		ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
+		       irecs[i].br_startblock != HOLESTARTBLOCK);
+		map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
+		map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+	}
+	return 0;
+}
+
+/*
+ * Map the block we are given ready for reading. There are three possible return
+ * values:
+ *	-1 - will be returned if we land in a hole and mappedbno == -2 so the
+ *	     caller knows not to execute a subsequent read.
+ *	 0 - if we mapped the block successfully
+ *	>0 - positive error number if there was an error.
+ */
+static int
+xfs_dabuf_map(
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	int			whichfork,
+	struct xfs_buf_map	**map,
+	int			*nmaps)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	int			nfsb;
+	int			error = 0;
+	struct xfs_bmbt_irec	irec;
+	struct xfs_bmbt_irec	*irecs = &irec;
+	int			nirecs;
+
+	ASSERT(map && *map);
+	ASSERT(*nmaps == 1);
+
+	if (whichfork == XFS_DATA_FORK)
+		nfsb = mp->m_dir_geo->fsbcount;
+	else
+		nfsb = mp->m_attr_geo->fsbcount;
+
+	/*
+	 * Caller doesn't have a mapping.  -2 means don't complain
+	 * if we land in a hole.
+	 */
+	if (mappedbno == -1 || mappedbno == -2) {
+		/*
+		 * Optimize the one-block case.
+		 */
+		if (nfsb != 1)
+			irecs = kmem_zalloc(sizeof(irec) * nfsb,
+					    KM_SLEEP | KM_NOFS);
+
+		nirecs = nfsb;
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
+				       &nirecs, xfs_bmapi_aflag(whichfork));
+		if (error)
+			goto out;
+	} else {
+		irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+		irecs->br_startoff = (xfs_fileoff_t)bno;
+		irecs->br_blockcount = nfsb;
+		irecs->br_state = 0;
+		nirecs = 1;
+	}
+
+	if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
+		error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
+		if (unlikely(error == -EFSCORRUPTED)) {
+			if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+				int i;
+				xfs_alert(mp, "%s: bno %lld dir: inode %lld",
+					__func__, (long long)bno,
+					(long long)dp->i_ino);
+				for (i = 0; i < *nmaps; i++) {
+					xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
+						i,
+						(long long)irecs[i].br_startoff,
+						(long long)irecs[i].br_startblock,
+						(long long)irecs[i].br_blockcount,
+						irecs[i].br_state);
+				}
+			}
+			XFS_ERROR_REPORT("xfs_da_do_buf(1)",
+					 XFS_ERRLEVEL_LOW, mp);
+		}
+		goto out;
+	}
+	error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
+out:
+	if (irecs != &irec)
+		kmem_free(irecs);
+	return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ */
+int
+xfs_da_get_buf(
+	struct xfs_trans	*trans,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp,
+	int			whichfork)
+{
+	struct xfs_buf		*bp;
+	struct xfs_buf_map	map;
+	struct xfs_buf_map	*mapp;
+	int			nmap;
+	int			error;
+
+	*bpp = NULL;
+	mapp = &map;
+	nmap = 1;
+	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+				&mapp, &nmap);
+	if (error) {
+		/* mapping a hole is not an error, but we don't continue */
+		if (error == -1)
+			error = 0;
+		goto out_free;
+	}
+
+	bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
+				    mapp, nmap, 0);
+	error = bp ? bp->b_error : -EIO;
+	if (error) {
+		if (bp)
+			xfs_trans_brelse(trans, bp);
+		goto out_free;
+	}
+
+	*bpp = bp;
+
+out_free:
+	if (mapp != &map)
+		kmem_free(mapp);
+
+	return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ */
+int
+xfs_da_read_buf(
+	struct xfs_trans	*trans,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp,
+	int			whichfork,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+	struct xfs_buf_map	map;
+	struct xfs_buf_map	*mapp;
+	int			nmap;
+	int			error;
+
+	*bpp = NULL;
+	mapp = &map;
+	nmap = 1;
+	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+				&mapp, &nmap);
+	if (error) {
+		/* mapping a hole is not an error, but we don't continue */
+		if (error == -1)
+			error = 0;
+		goto out_free;
+	}
+
+	error = xfs_trans_read_buf_map(dp->i_mount, trans,
+					dp->i_mount->m_ddev_targp,
+					mapp, nmap, 0, &bp, ops);
+	if (error)
+		goto out_free;
+
+	if (whichfork == XFS_ATTR_FORK)
+		xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+	else
+		xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
+	*bpp = bp;
+out_free:
+	if (mapp != &map)
+		kmem_free(mapp);
+
+	return error;
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+xfs_daddr_t
+xfs_da_reada_buf(
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	int			whichfork,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf_map	map;
+	struct xfs_buf_map	*mapp;
+	int			nmap;
+	int			error;
+
+	mapp = &map;
+	nmap = 1;
+	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+				&mapp, &nmap);
+	if (error) {
+		/* mapping a hole is not an error, but we don't continue */
+		if (error == -1)
+			error = 0;
+		goto out_free;
+	}
+
+	mappedbno = mapp[0].bm_bn;
+	xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
+
+out_free:
+	if (mapp != &map)
+		kmem_free(mapp);
+
+	if (error)
+		return -1;
+	return mappedbno;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_da_btree.h b/kernel/fs/xfs/libxfs/xfs_da_btree.h
new file mode 100644
index 000000000..6e153e399
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_da_btree.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define	__XFS_DA_BTREE_H__
+
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_trans;
+struct zone;
+struct xfs_dir_ops;
+
+/*
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
+ */
+struct xfs_da_geometry {
+	int		blksize;	/* da block size in bytes */
+	int		fsbcount;	/* da block size in filesystem blocks */
+	uint8_t		fsblog;		/* log2 of _filesystem_ block size */
+	uint8_t		blklog;		/* log2 of da block size */
+	uint		node_ents;	/* # of entries in a danode */
+	int		magicpct;	/* 37% of block size in bytes */
+	xfs_dablk_t	datablk;	/* blockno of dir data v2 */
+	xfs_dablk_t	leafblk;	/* blockno of leaf data v2 */
+	xfs_dablk_t	freeblk;	/* blockno of free data v2 */
+};
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Search comparison results
+ */
+enum xfs_dacmp {
+	XFS_CMP_DIFFERENT,	/* names are completely different */
+	XFS_CMP_EXACT,		/* names are exactly the same */
+	XFS_CMP_CASE		/* names are same but differ in case */
+};
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+	struct xfs_da_geometry *geo;	/* da block geometry */
+	const __uint8_t	*name;		/* string (maybe not NULL terminated) */
+	int		namelen;	/* length of string (maybe no NULL) */
+	__uint8_t	filetype;	/* filetype of inode for directories */
+	__uint8_t	*value;		/* set of bytes (maybe contain NULLs) */
+	int		valuelen;	/* length of value */
+	int		flags;		/* argument flags (eg: ATTR_NOCREATE) */
+	xfs_dahash_t	hashval;	/* hash value of name */
+	xfs_ino_t	inumber;	/* input/output inode number */
+	struct xfs_inode *dp;		/* directory inode to manipulate */
+	xfs_fsblock_t	*firstblock;	/* ptr to firstblock for bmap calls */
+	struct xfs_bmap_free *flist;	/* ptr to freelist for bmap_finish */
+	struct xfs_trans *trans;	/* current trans (changes over time) */
+	xfs_extlen_t	total;		/* total blocks needed, for 1st bmap */
+	int		whichfork;	/* data or attribute fork */
+	xfs_dablk_t	blkno;		/* blkno of attr leaf of interest */
+	int		index;		/* index of attr of interest in blk */
+	xfs_dablk_t	rmtblkno;	/* remote attr value starting blkno */
+	int		rmtblkcnt;	/* remote attr value block count */
+	int		rmtvaluelen;	/* remote attr value length in bytes */
+	xfs_dablk_t	blkno2;		/* blkno of 2nd attr leaf of interest */
+	int		index2;		/* index of 2nd attr in blk */
+	xfs_dablk_t	rmtblkno2;	/* remote attr value starting blkno */
+	int		rmtblkcnt2;	/* remote attr value block count */
+	int		rmtvaluelen2;	/* remote attr value length in bytes */
+	int		op_flags;	/* operation flags */
+	enum xfs_dacmp	cmpresult;	/* name compare result for lookups */
+} xfs_da_args_t;
+
+/*
+ * Operation flags:
+ */
+#define XFS_DA_OP_JUSTCHECK	0x0001	/* check for ok with no space */
+#define XFS_DA_OP_RENAME	0x0002	/* this is an atomic rename op */
+#define XFS_DA_OP_ADDNAME	0x0004	/* this is an add operation */
+#define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
+
+#define XFS_DA_OP_FLAGS \
+	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
+	{ XFS_DA_OP_RENAME,	"RENAME" }, \
+	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
+	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
+	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes.  With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
+typedef struct xfs_da_state_blk {
+	struct xfs_buf	*bp;		/* buffer containing block */
+	xfs_dablk_t	blkno;		/* filesystem blkno of buffer */
+	xfs_daddr_t	disk_blkno;	/* on-disk blkno (in BBs) of buffer */
+	int		index;		/* relevant index into block */
+	xfs_dahash_t	hashval;	/* last hash value in block */
+	int		magic;		/* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+	int			active;		/* number of active levels */
+	xfs_da_state_blk_t	blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+	xfs_da_args_t		*args;		/* filename arguments */
+	struct xfs_mount	*mp;		/* filesystem mount point */
+	xfs_da_state_path_t	path;		/* search/split paths */
+	xfs_da_state_path_t	altpath;	/* alternate path for join */
+	unsigned char		inleaf;		/* insert into 1->lf, 0->splf */
+	unsigned char		extravalid;	/* T/F: extrablk is in use */
+	unsigned char		extraafter;	/* T/F: extrablk is after new */
+	xfs_da_state_blk_t	extrablk;	/* for double-splits on leaves */
+						/* for dirv2 extrablk is data */
+} xfs_da_state_t;
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR)	((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)	\
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+
+/*
+ * Name ops for directory and/or attr name operations
+ */
+struct xfs_nameops {
+	xfs_dahash_t	(*hashname)(struct xfs_name *);
+	enum xfs_dacmp	(*compname)(struct xfs_da_args *,
+					const unsigned char *, int);
+};
+
+
+/*========================================================================
+ * Function prototypes.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+			    int level, struct xfs_buf **bpp, int whichfork);
+int	xfs_da3_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_da3_join(xfs_da_state_t *state);
+void	xfs_da3_fixhashpath(struct xfs_da_state *state,
+			    struct xfs_da_state_path *path_to_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int	xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int	xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+					 int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int	xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+				       xfs_da_state_blk_t *new_blk);
+int	xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			 xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			 struct xfs_buf **bpp, int which_fork);
+
+/*
+ * Utility routines.
+ */
+int	xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int	xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
+			      int count);
+int	xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			      xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			      struct xfs_buf **bp, int whichfork);
+int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			       xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			       struct xfs_buf **bpp, int whichfork,
+			       const struct xfs_buf_ops *ops);
+xfs_daddr_t	xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+				xfs_daddr_t mapped_bno, int whichfork,
+				const struct xfs_buf_ops *ops);
+int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+					  struct xfs_buf *dead_buf);
+
+uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+				const unsigned char *name, int len);
+
+
+xfs_da_state_t *xfs_da_state_alloc(void);
+void xfs_da_state_free(xfs_da_state_t *state);
+
+extern struct kmem_zone *xfs_da_state_zone;
+extern const struct xfs_nameops xfs_default_nameops;
+
+#endif	/* __XFS_DA_BTREE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_da_format.c b/kernel/fs/xfs/libxfs/xfs_da_format.c
new file mode 100644
index 000000000..9d624a622
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_da_format.c
@@ -0,0 +1,908 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+
+/*
+ * Shortform directory ops
+ */
+static int
+xfs_dir2_sf_entsize(
+	struct xfs_dir2_sf_hdr	*hdr,
+	int			len)
+{
+	int count = sizeof(struct xfs_dir2_sf_entry);	/* namelen + offset */
+
+	count += len;					/* name */
+	count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+				sizeof(xfs_dir2_ino4_t); /* ino # */
+	return count;
+}
+
+static int
+xfs_dir3_sf_entsize(
+	struct xfs_dir2_sf_hdr	*hdr,
+	int			len)
+{
+	return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
+}
+
+
+/*
+ * For filetype enabled shortform directories, the file type field is stored at
+ * the end of the name.  Because it's only a single byte, endian conversion is
+ * not necessary. For non-filetype enable directories, the type is always
+ * unknown and we never store the value.
+ */
+static __uint8_t
+xfs_dir2_sfe_get_ftype(
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_sfe_put_ftype(
+	struct xfs_dir2_sf_entry *sfep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_sfe_get_ftype(
+	struct xfs_dir2_sf_entry *sfep)
+{
+	__uint8_t	ftype;
+
+	ftype = sfep->name[sfep->namelen];
+	if (ftype >= XFS_DIR3_FT_MAX)
+		return XFS_DIR3_FT_UNKNOWN;
+	return ftype;
+}
+
+static void
+xfs_dir3_sfe_put_ftype(
+	struct xfs_dir2_sf_entry *sfep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+	sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Inode numbers in short-form directories can come in two versions,
+ * either 4 bytes or 8 bytes wide.  These helpers deal with the
+ * two forms transparently by looking at the headers i8count field.
+ *
+ * For 64-bit inode number the most significant byte must be zero.
+ */
+static xfs_ino_t
+xfs_dir2_sf_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_dir2_inou_t		*from)
+{
+	if (hdr->i8count)
+		return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+	else
+		return get_unaligned_be32(&from->i4.i);
+}
+
+static void
+xfs_dir2_sf_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_dir2_inou_t		*to,
+	xfs_ino_t		ino)
+{
+	ASSERT((ino & 0xff00000000000000ULL) == 0);
+
+	if (hdr->i8count)
+		put_unaligned_be64(ino, &to->i8.i);
+	else
+		put_unaligned_be32(ino, &to->i4.i);
+}
+
+static xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+	struct xfs_dir2_sf_hdr	*hdr)
+{
+	return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+}
+
+static void
+xfs_dir2_sf_put_parent_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
+ */
+static xfs_ino_t
+xfs_dir2_sfe_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return xfs_dir2_sf_get_ino(hdr,
+				(xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+}
+
+static void
+xfs_dir2_sfe_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr,
+			    (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+}
+
+static xfs_ino_t
+xfs_dir3_sfe_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return xfs_dir2_sf_get_ino(hdr,
+			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+}
+
+static void
+xfs_dir3_sfe_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr,
+			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+}
+
+
+/*
+ * Directory data block operations
+ */
+
+/*
+ * For special situations, the dirent size ends up fixed because we always know
+ * what the size of the entry is. That's true for the "." and "..", and
+ * therefore we know that they are a fixed size and hence their offsets are
+ * constant, as is the first entry.
+ *
+ * Hence, this calculation is written as a macro to be able to be calculated at
+ * compile time and so certain offsets can be calculated directly in the
+ * structure initaliser via the macro. There are two macros - one for dirents
+ * with ftype and without so there are no unresolvable conditionals in the
+ * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
+ * of 2 and the compiler doesn't reject it (unlike roundup()).
+ */
+#define XFS_DIR2_DATA_ENTSIZE(n)					\
+	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) +	\
+		 sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
+
+#define XFS_DIR3_DATA_ENTSIZE(n)					\
+	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) +	\
+		 sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)),	\
+		XFS_DIR2_DATA_ALIGN)
+
+static int
+xfs_dir2_data_entsize(
+	int			n)
+{
+	return XFS_DIR2_DATA_ENTSIZE(n);
+}
+
+static int
+xfs_dir3_data_entsize(
+	int			n)
+{
+	return XFS_DIR3_DATA_ENTSIZE(n);
+}
+
+static __uint8_t
+xfs_dir2_data_get_ftype(
+	struct xfs_dir2_data_entry *dep)
+{
+	return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_data_put_ftype(
+	struct xfs_dir2_data_entry *dep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_data_get_ftype(
+	struct xfs_dir2_data_entry *dep)
+{
+	__uint8_t	ftype = dep->name[dep->namelen];
+
+	if (ftype >= XFS_DIR3_FT_MAX)
+		return XFS_DIR3_FT_UNKNOWN;
+	return ftype;
+}
+
+static void
+xfs_dir3_data_put_ftype(
+	struct xfs_dir2_data_entry *dep,
+	__uint8_t		type)
+{
+	ASSERT(type < XFS_DIR3_FT_MAX);
+	ASSERT(dep->namelen != 0);
+
+	dep->name[dep->namelen] = type;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static __be16 *
+xfs_dir2_data_entry_tag_p(
+	struct xfs_dir2_data_entry *dep)
+{
+	return (__be16 *)((char *)dep +
+		xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+static __be16 *
+xfs_dir3_data_entry_tag_p(
+	struct xfs_dir2_data_entry *dep)
+{
+	return (__be16 *)((char *)dep +
+		xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1) +
+				XFS_DIR2_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return hdr->bestfree;
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_unused *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_unused *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+
+/*
+ * Directory Leaf block operations
+ */
+static int
+xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+	return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
+		(uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+	return lp->__ents;
+}
+
+static int
+xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+	return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
+		(uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+	return ((struct xfs_dir3_leaf *)lp)->__ents;
+}
+
+static void
+xfs_dir2_leaf_hdr_from_disk(
+	struct xfs_dir3_icleaf_hdr	*to,
+	struct xfs_dir2_leaf		*from)
+{
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.count);
+	to->stale = be16_to_cpu(from->hdr.stale);
+
+	ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+	       to->magic == XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir2_leaf_hdr_to_disk(
+	struct xfs_dir2_leaf		*to,
+	struct xfs_dir3_icleaf_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+	       from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.count = cpu_to_be16(from->count);
+	to->hdr.stale = cpu_to_be16(from->stale);
+}
+
+static void
+xfs_dir3_leaf_hdr_from_disk(
+	struct xfs_dir3_icleaf_hdr	*to,
+	struct xfs_dir2_leaf		*from)
+{
+	struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+	to->back = be32_to_cpu(hdr3->info.hdr.back);
+	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+	to->count = be16_to_cpu(hdr3->count);
+	to->stale = be16_to_cpu(hdr3->stale);
+
+	ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+	       to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leaf_hdr_to_disk(
+	struct xfs_dir2_leaf		*to,
+	struct xfs_dir3_icleaf_hdr	*from)
+{
+	struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+	ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+	       from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+	hdr3->info.hdr.back = cpu_to_be32(from->back);
+	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+	hdr3->count = cpu_to_be16(from->count);
+	hdr3->stale = cpu_to_be16(from->stale);
+}
+
+
+/*
+ * Directory/Attribute Node block operations
+ */
+static struct xfs_da_node_entry *
+xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
+{
+	return dap->__btree;
+}
+
+static struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+	return ((struct xfs_da3_intnode *)dap)->__btree;
+}
+
+static void
+xfs_da2_node_hdr_from_disk(
+	struct xfs_da3_icnode_hdr	*to,
+	struct xfs_da_intnode		*from)
+{
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.__count);
+	to->level = be16_to_cpu(from->hdr.__level);
+}
+
+static void
+xfs_da2_node_hdr_to_disk(
+	struct xfs_da_intnode		*to,
+	struct xfs_da3_icnode_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.__count = cpu_to_be16(from->count);
+	to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static void
+xfs_da3_node_hdr_from_disk(
+	struct xfs_da3_icnode_hdr	*to,
+	struct xfs_da_intnode		*from)
+{
+	struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+	to->back = be32_to_cpu(hdr3->info.hdr.back);
+	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+	to->count = be16_to_cpu(hdr3->__count);
+	to->level = be16_to_cpu(hdr3->__level);
+}
+
+static void
+xfs_da3_node_hdr_to_disk(
+	struct xfs_da_intnode		*to,
+	struct xfs_da3_icnode_hdr	*from)
+{
+	struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+	ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+	hdr3->info.hdr.back = cpu_to_be32(from->back);
+	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+	hdr3->__count = cpu_to_be16(from->count);
+	hdr3->__level = cpu_to_be16(from->level);
+}
+
+
+/*
+ * Directory free space block operations
+ */
+static int
+xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
+{
+	return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
+		sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
+{
+	return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+			(db / xfs_dir2_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return db % xfs_dir2_free_max_bests(geo);
+}
+
+static int
+xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
+{
+	return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
+		sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
+{
+	return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+			(db / xfs_dir3_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return db % xfs_dir3_free_max_bests(geo);
+}
+
+static void
+xfs_dir2_free_hdr_from_disk(
+	struct xfs_dir3_icfree_hdr	*to,
+	struct xfs_dir2_free		*from)
+{
+	to->magic = be32_to_cpu(from->hdr.magic);
+	to->firstdb = be32_to_cpu(from->hdr.firstdb);
+	to->nvalid = be32_to_cpu(from->hdr.nvalid);
+	to->nused = be32_to_cpu(from->hdr.nused);
+	ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+	struct xfs_dir2_free		*to,
+	struct xfs_dir3_icfree_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+	to->hdr.magic = cpu_to_be32(from->magic);
+	to->hdr.firstdb = cpu_to_be32(from->firstdb);
+	to->hdr.nvalid = cpu_to_be32(from->nvalid);
+	to->hdr.nused = cpu_to_be32(from->nused);
+}
+
+static void
+xfs_dir3_free_hdr_from_disk(
+	struct xfs_dir3_icfree_hdr	*to,
+	struct xfs_dir2_free		*from)
+{
+	struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+	to->magic = be32_to_cpu(hdr3->hdr.magic);
+	to->firstdb = be32_to_cpu(hdr3->firstdb);
+	to->nvalid = be32_to_cpu(hdr3->nvalid);
+	to->nused = be32_to_cpu(hdr3->nused);
+
+	ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+	struct xfs_dir2_free		*to,
+	struct xfs_dir3_icfree_hdr	*from)
+{
+	struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+	ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+	hdr3->hdr.magic = cpu_to_be32(from->magic);
+	hdr3->firstdb = cpu_to_be32(from->firstdb);
+	hdr3->nvalid = cpu_to_be32(from->nvalid);
+	hdr3->nused = cpu_to_be32(from->nused);
+}
+
+static const struct xfs_dir_ops xfs_dir2_ops = {
+	.sf_entsize = xfs_dir2_sf_entsize,
+	.sf_nextentry = xfs_dir2_sf_nextentry,
+	.sf_get_ftype = xfs_dir2_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir2_sfe_put_ftype,
+	.sf_get_ino = xfs_dir2_sfe_get_ino,
+	.sf_put_ino = xfs_dir2_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir2_data_entsize,
+	.data_get_ftype = xfs_dir2_data_get_ftype,
+	.data_put_ftype = xfs_dir2_data_put_ftype,
+	.data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+	.data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1) +
+				XFS_DIR2_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir2_data_first_entry_p,
+	.data_entry_p = xfs_dir2_data_entry_p,
+	.data_unused_p = xfs_dir2_data_unused_p,
+
+	.leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+	.leaf_max_ents = xfs_dir2_max_leaf_ents,
+	.leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+	.node_tree_p = xfs_da2_node_tree_p,
+
+	.free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+	.free_max_bests = xfs_dir2_free_max_bests,
+	.free_bests_p = xfs_dir2_free_bests_p,
+	.db_to_fdb = xfs_dir2_db_to_fdb,
+	.db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
+	.sf_entsize = xfs_dir3_sf_entsize,
+	.sf_nextentry = xfs_dir3_sf_nextentry,
+	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir3_sfe_put_ftype,
+	.sf_get_ino = xfs_dir3_sfe_get_ino,
+	.sf_put_ino = xfs_dir3_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir3_data_entsize,
+	.data_get_ftype = xfs_dir3_data_get_ftype,
+	.data_put_ftype = xfs_dir3_data_put_ftype,
+	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+	.data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
+	.data_entry_p = xfs_dir2_data_entry_p,
+	.data_unused_p = xfs_dir2_data_unused_p,
+
+	.leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+	.leaf_max_ents = xfs_dir2_max_leaf_ents,
+	.leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+	.node_tree_p = xfs_da2_node_tree_p,
+
+	.free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+	.free_max_bests = xfs_dir2_free_max_bests,
+	.free_bests_p = xfs_dir2_free_bests_p,
+	.db_to_fdb = xfs_dir2_db_to_fdb,
+	.db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir3_ops = {
+	.sf_entsize = xfs_dir3_sf_entsize,
+	.sf_nextentry = xfs_dir3_sf_nextentry,
+	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir3_sfe_put_ftype,
+	.sf_get_ino = xfs_dir3_sfe_get_ino,
+	.sf_put_ino = xfs_dir3_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir3_data_entsize,
+	.data_get_ftype = xfs_dir3_data_get_ftype,
+	.data_put_ftype = xfs_dir3_data_put_ftype,
+	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir3_data_bestfree_p,
+
+	.data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
+
+	.data_dot_entry_p = xfs_dir3_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir3_data_first_entry_p,
+	.data_entry_p = xfs_dir3_data_entry_p,
+	.data_unused_p = xfs_dir3_data_unused_p,
+
+	.leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
+	.leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
+	.leaf_max_ents = xfs_dir3_max_leaf_ents,
+	.leaf_ents_p = xfs_dir3_leaf_ents_p,
+
+	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+	.node_tree_p = xfs_da3_node_tree_p,
+
+	.free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
+	.free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
+	.free_max_bests = xfs_dir3_free_max_bests,
+	.free_bests_p = xfs_dir3_free_bests_p,
+	.db_to_fdb = xfs_dir3_db_to_fdb,
+	.db_to_fdindex = xfs_dir3_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+	.node_tree_p = xfs_da2_node_tree_p,
+};
+
+static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
+	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+	.node_tree_p = xfs_da3_node_tree_p,
+};
+
+/*
+ * Return the ops structure according to the current config.  If we are passed
+ * an inode, then that overrides the default config we use which is based on
+ * feature bits.
+ */
+const struct xfs_dir_ops *
+xfs_dir_get_ops(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp)
+{
+	if (dp)
+		return dp->d_ops;
+	if (mp->m_dir_inode_ops)
+		return mp->m_dir_inode_ops;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return &xfs_dir3_ops;
+	if (xfs_sb_version_hasftype(&mp->m_sb))
+		return &xfs_dir2_ftype_ops;
+	return &xfs_dir2_ops;
+}
+
+const struct xfs_dir_ops *
+xfs_nondir_get_ops(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp)
+{
+	if (dp)
+		return dp->d_ops;
+	if (mp->m_nondir_inode_ops)
+		return mp->m_nondir_inode_ops;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return &xfs_dir3_nondir_ops;
+	return &xfs_dir2_nondir_ops;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_da_format.h b/kernel/fs/xfs/libxfs/xfs_da_format.h
new file mode 100644
index 000000000..74bcbabfa
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_da_format.h
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC	0xfebe	/* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC	0xfbee	/* magic number: attribute leaf blks */
+#define	XFS_DIR2_LEAF1_MAGIC	0xd2f1	/* magic number: v2 dirlf single blks */
+#define	XFS_DIR2_LEAFN_MAGIC	0xd2ff	/* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+	__be32		forw;			/* previous block in list */
+	__be32		back;			/* following block in list */
+	__be16		magic;			/* validity check on block */
+	__be16		pad;			/* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC	0x3ebe	/* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC	0x3bee	/* magic number: attribute leaf blks */
+#define	XFS_DIR3_LEAF1_MAGIC	0x3df1	/* magic number: v2 dirlf single blks */
+#define	XFS_DIR3_LEAFN_MAGIC	0x3dff	/* magic number: v2 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+	/*
+	 * the node link manipulation code relies on the fact that the first
+	 * element of this structure is the struct xfs_da_blkinfo so it can
+	 * ignore the differences in the rest of the structures.
+	 */
+	struct xfs_da_blkinfo	hdr;
+	__be32			crc;	/* CRC of block */
+	__be64			blkno;	/* first block of the buffer */
+	__be64			lsn;	/* sequence number of last write */
+	uuid_t			uuid;	/* filesystem we belong to */
+	__be64			owner;	/* inode that owns the block */
+};
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all match in the block, not just the first match found.
+ */
+#define	XFS_DA_NODE_MAXDEPTH	5	/* max depth of Btree */
+
+typedef struct xfs_da_node_hdr {
+	struct xfs_da_blkinfo	info;	/* block type, links, etc. */
+	__be16			__count; /* count of active entries */
+	__be16			__level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+	struct xfs_da3_blkinfo	info;	/* block type, links, etc. */
+	__be16			__count; /* count of active entries */
+	__be16			__level; /* level above leaves (leaf == 0) */
+	__be32			__pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF	(offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+	__be32	hashval;	/* hash value for this descendant */
+	__be32	before;		/* Btree block before this key */
+} xfs_da_node_entry_t;
+
+typedef struct xfs_da_intnode {
+	struct xfs_da_node_hdr	hdr;
+	struct xfs_da_node_entry __btree[];
+} xfs_da_intnode_t;
+
+struct xfs_da3_intnode {
+	struct xfs_da3_node_hdr	hdr;
+	struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+	__uint32_t	forw;
+	__uint32_t	back;
+	__uint16_t	magic;
+	__uint16_t	count;
+	__uint16_t	level;
+};
+
+/*
+ * Directory version 2.
+ *
+ * There are 4 possible formats:
+ *  - shortform - embedded into the inode
+ *  - single block - data with embedded leaf at the end
+ *  - multiple data blocks, single leaf+freeindex block
+ *  - data blocks, node and leaf blocks (btree), freeindex blocks
+ *
+ * Note: many node blocks structures and constants are shared with the attr
+ * code and defined in xfs_da_btree.h.
+ */
+
+#define	XFS_DIR2_BLOCK_MAGIC	0x58443242	/* XD2B: single block dirs */
+#define	XFS_DIR2_DATA_MAGIC	0x58443244	/* XD2D: multiblock dirs */
+#define	XFS_DIR2_FREE_MAGIC	0x58443246	/* XD2F: free index blocks */
+
+/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories.  The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 then they switch to the existing v2 code, or the format is v3
+ * they implement the v3 functionality. This means the existing dir2 is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible as doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ *	- a larger block header for CRC and identification purposes and so the
+ *	offsets of all the structures inside the blocks are different.
+ *
+ *	- new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define	XFS_DIR3_BLOCK_MAGIC	0x58444233	/* XDB3: single block dirs */
+#define	XFS_DIR3_DATA_MAGIC	0x58444433	/* XDD3: multiblock dirs */
+#define	XFS_DIR3_FREE_MAGIC	0x58444633	/* XDF3: free index blocks */
+
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN		0
+#define XFS_DIR3_FT_REG_FILE		1
+#define XFS_DIR3_FT_DIR			2
+#define XFS_DIR3_FT_CHRDEV		3
+#define XFS_DIR3_FT_BLKDEV		4
+#define XFS_DIR3_FT_FIFO		5
+#define XFS_DIR3_FT_SOCK		6
+#define XFS_DIR3_FT_SYMLINK		7
+#define XFS_DIR3_FT_WHT			8
+
+#define XFS_DIR3_FT_MAX			9
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef	__uint16_t	xfs_dir2_data_off_t;
+#define	NULLDATAOFF	0xffffU
+typedef uint		xfs_dir2_data_aoff_t;	/* argument form */
+
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only need 16 bits, this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef	__uint32_t	xfs_dir2_dataptr_t;
+#define	XFS_DIR2_MAX_DATAPTR	((xfs_dir2_dataptr_t)0xffffffff)
+#define	XFS_DIR2_NULL_DATAPTR	((xfs_dir2_dataptr_t)0)
+
+/*
+ * Byte offset in a directory.
+ */
+typedef	xfs_off_t	xfs_dir2_off_t;
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef	__uint32_t	xfs_dir2_db_t;
+
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef	struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+
+typedef union {
+	xfs_dir2_ino8_t	i8;
+	xfs_dir2_ino4_t	i4;
+} xfs_dir2_inou_t;
+#define	XFS_DIR2_MAX_SHORT_INUM	((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to fit into the
+ * literal area of the inode.  These "shortform" directories consist of a
+ * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
+ * structures.  Due the different inode number storage size and the variable
+ * length name field in the xfs_dir2_sf_entry all these structure are
+ * variable length, and the accessors in this file should be used to iterate
+ * over them.
+ */
+typedef struct xfs_dir2_sf_hdr {
+	__uint8_t		count;		/* count of entries */
+	__uint8_t		i8count;	/* count of 8-byte inode #s */
+	xfs_dir2_inou_t		parent;		/* parent dir inode number */
+} __arch_pack xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+	__u8			namelen;	/* actual name length */
+	xfs_dir2_sf_off_t	offset;		/* saved offset */
+	__u8			name[];		/* name, variable size */
+	/*
+	 * A single byte containing the file type field follows the inode
+	 * number for version 3 directory entries.
+	 *
+	 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
+	 * variable offset after the name.
+	 */
+} __arch_pack xfs_dir2_sf_entry_t;
+
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+	return sizeof(struct xfs_dir2_sf_hdr) -
+		(i8count == 0) *
+		(sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+	return get_unaligned_be16(&sfep->offset.i);
+}
+
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+	put_unaligned_be16(off, &sfep->offset.i);
+}
+
+static inline struct xfs_dir2_sf_entry *
+xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
+}
+
+/*
+ * Data block structures.
+ *
+ * A pure data block looks like the following drawing on disk:
+ *
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_hdr_t                             |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | ...                                             |
+ *    +-------------------------------------------------+
+ *    | unused space                                    |
+ *    +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ *
+ * In addition to the pure data blocks for the data and node formats,
+ * most structures are also used for the combined data/freespace "block"
+ * format below.
+ */
+
+#define	XFS_DIR2_DATA_ALIGN_LOG	3		/* i.e., 8 bytes */
+#define	XFS_DIR2_DATA_ALIGN	(1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define	XFS_DIR2_DATA_FREE_TAG	0xffff
+#define	XFS_DIR2_DATA_FD_COUNT	3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define	XFS_DIR2_SPACE_SIZE	(1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define	XFS_DIR2_DATA_SPACE	0
+#define	XFS_DIR2_DATA_OFFSET	(XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Describe a free area in the data block.
+ *
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+	__be16			offset;		/* start of freespace */
+	__be16			length;		/* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ *
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+	__be32			magic;		/* XFS_DIR2_DATA_MAGIC or */
+						/* XFS_DIR2_BLOCK_MAGIC */
+	xfs_dir2_data_free_t	bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures so we determine how to decode them just by the magic number.
+ */
+struct xfs_dir3_blk_hdr {
+	__be32			magic;	/* magic number */
+	__be32			crc;	/* CRC of block */
+	__be64			blkno;	/* first block of the buffer */
+	__be64			lsn;	/* sequence number of last write */
+	uuid_t			uuid;	/* filesystem we belong to */
+	__be64			owner;	/* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+	struct xfs_dir3_blk_hdr	hdr;
+	xfs_dir2_data_free_t	best_free[XFS_DIR2_DATA_FD_COUNT];
+	__be32			pad;	/* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+/*
+ * Active entry in a data block.
+ *
+ * Aligned to 8 bytes.  After the variable length name field there is a
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
+ */
+typedef struct xfs_dir2_data_entry {
+	__be64			inumber;	/* inode number */
+	__u8			namelen;	/* name length */
+	__u8			name[];		/* name bytes, no null */
+     /* __u8			filetype; */	/* type of inode we point to */
+     /*	__be16                  tag; */		/* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.
+ *
+ * Aligned to 8 bytes.  Tag appears as the last 2 bytes and must be accessed
+ * using xfs_dir2_data_unused_tag_p.
+ */
+typedef struct xfs_dir2_data_unused {
+	__be16			freetag;	/* XFS_DIR2_DATA_FREE_TAG */
+	__be16			length;		/* total free length */
+						/* variable offset */
+	__be16			tag;		/* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
+{
+	return (__be16 *)((char *)dup +
+			be16_to_cpu(dup->length) - sizeof(__be16));
+}
+
+/*
+ * Leaf block structures.
+ *
+ * A pure leaf block looks like the following drawing on disk:
+ *
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_hdr_t       |
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | ...                       |
+ *    +---------------------------+
+ *    | xfs_dir2_data_off_t       |
+ *    | xfs_dir2_data_off_t       |
+ *    | xfs_dir2_data_off_t       |
+ *    | ...                       |
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_tail_t      |
+ *    +---------------------------+
+ *
+ * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
+ * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
+ * for directories with separate leaf nodes and free space blocks
+ * (magic = XFS_DIR2_LEAFN_MAGIC).
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+/*
+ * Offset of the leaf/node space.  First block in this space
+ * is the btree root.
+ */
+#define	XFS_DIR2_LEAF_SPACE	1
+#define	XFS_DIR2_LEAF_OFFSET	(XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+	xfs_da_blkinfo_t	info;		/* header for da routines */
+	__be16			count;		/* count of entries */
+	__be16			stale;		/* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+struct xfs_dir3_leaf_hdr {
+	struct xfs_da3_blkinfo	info;		/* header for da routines */
+	__be16			count;		/* count of entries */
+	__be16			stale;		/* count of stale entries */
+	__be32			pad;		/* 64 bit alignment */
+};
+
+struct xfs_dir3_icleaf_hdr {
+	__uint32_t		forw;
+	__uint32_t		back;
+	__uint16_t		magic;
+	__uint16_t		count;
+	__uint16_t		stale;
+};
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+	__be32			hashval;	/* hash value of name */
+	__be32			address;	/* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+	__be32			bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ */
+typedef struct xfs_dir2_leaf {
+	xfs_dir2_leaf_hdr_t	hdr;			/* leaf header */
+	xfs_dir2_leaf_entry_t	__ents[];		/* entries */
+} xfs_dir2_leaf_t;
+
+struct xfs_dir3_leaf {
+	struct xfs_dir3_leaf_hdr	hdr;		/* leaf header */
+	struct xfs_dir2_leaf_entry	__ents[];	/* entries */
+};
+
+#define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
+{
+	return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
+}
+
+/*
+ * Free space block defintions for the node format.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define	XFS_DIR2_FREE_SPACE	2
+#define	XFS_DIR2_FREE_OFFSET	(XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+
+typedef	struct xfs_dir2_free_hdr {
+	__be32			magic;		/* XFS_DIR2_FREE_MAGIC */
+	__be32			firstdb;	/* db of first entry */
+	__be32			nvalid;		/* count of valid entries */
+	__be32			nused;		/* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+	xfs_dir2_free_hdr_t	hdr;		/* block header */
+	__be16			bests[];	/* best free counts */
+						/* unused entries are -1 */
+} xfs_dir2_free_t;
+
+struct xfs_dir3_free_hdr {
+	struct xfs_dir3_blk_hdr	hdr;
+	__be32			firstdb;	/* db of first entry */
+	__be32			nvalid;		/* count of valid entries */
+	__be32			nused;		/* count of used entries */
+	__be32			pad;		/* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+	struct xfs_dir3_free_hdr hdr;
+	__be16			bests[];	/* best free counts */
+						/* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF  offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * In core version of the free block header, abstracted away from on-disk format
+ * differences. Use this in the code, and convert to/from the disk version using
+ * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+ */
+struct xfs_dir3_icfree_hdr {
+	__uint32_t	magic;
+	__uint32_t	firstdb;
+	__uint32_t	nvalid;
+	__uint32_t	nused;
+
+};
+
+/*
+ * Single block format.
+ *
+ * The single block format looks like the following drawing on disk:
+ *
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_hdr_t                             |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t :
+ *    | ...                                             |
+ *    +-------------------------------------------------+
+ *    | unused space                                    |
+ *    +-------------------------------------------------+
+ *    | ...                                             |
+ *    | xfs_dir2_leaf_entry_t                           |
+ *    | xfs_dir2_leaf_entry_t                           |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_block_tail_t                           |
+ *    +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+typedef struct xfs_dir2_block_tail {
+	__be32		count;			/* count of leaf entries */
+	__be32		stale;			/* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+{
+	return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+}
+
+
+/*
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.  The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Struct leaf_entry's are packed from the top.  Name/values grow from the
+ * bottom but are not packed.  The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped.  When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again.  If we
+ * still don't have enough space, then we have to split the block.  The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string.  The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry.  The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry.  It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set.  We clear the
+ * bit when we have finished setting up the attribute.  We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE	3	/* how many freespace slots */
+
+typedef struct xfs_attr_leaf_map {	/* RLE map of free bytes */
+	__be16	base;			  /* base of free region */
+	__be16	size;			  /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr {	/* constant-structure header block */
+	xfs_da_blkinfo_t info;		/* block type, links, etc. */
+	__be16	count;			/* count of active leaf_entry's */
+	__be16	usedbytes;		/* num bytes of names/values stored */
+	__be16	firstused;		/* first used byte in name area */
+	__u8	holes;			/* != 0 if blk needs compaction */
+	__u8	pad1;
+	xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+					/* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry {	/* sorted on key, not name */
+	__be32	hashval;		/* hash value of name */
+	__be16	nameidx;		/* index into buffer of name/value */
+	__u8	flags;			/* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+	__u8	pad2;			/* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+	__be16	valuelen;		/* number of bytes in value */
+	__u8	namelen;		/* length of name bytes */
+	__u8	nameval[1];		/* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+	__be32	valueblk;		/* block number of value bytes */
+	__be32	valuelen;		/* number of bytes in value */
+	__u8	namelen;		/* length of name bytes */
+	__u8	name[1];		/* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+	xfs_attr_leaf_hdr_t	hdr;	/* constant-structure header block */
+	xfs_attr_leaf_entry_t	entries[1];	/* sorted on key, not name */
+	xfs_attr_leaf_name_local_t namelist;	/* grows from bottom of buf */
+	xfs_attr_leaf_name_remote_t valuelist;	/* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+	struct xfs_da3_blkinfo	info;
+	__be16			count;
+	__be16			usedbytes;
+	__be16			firstused;
+	__u8			holes;
+	__u8			pad1;
+	struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+	__be32			pad2;		/* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF	(offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+	struct xfs_attr3_leaf_hdr	hdr;
+	struct xfs_attr_leaf_entry	entries[1];
+
+	/*
+	 * The rest of the block contains the following structures after the
+	 * leaf entries, growing from the bottom up. The variables are never
+	 * referenced, the locations accessed purely from helper functions.
+	 *
+	 * struct xfs_attr_leaf_name_local
+	 * struct xfs_attr_leaf_name_remote
+	 */
+};
+
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+	__uint32_t	forw;
+	__uint32_t	back;
+	__uint16_t	magic;
+	__uint16_t	count;
+	__uint16_t	usedbytes;
+	/*
+	 * firstused is 32-bit here instead of 16-bit like the on-disk variant
+	 * to support maximum fsb size of 64k without overflow issues throughout
+	 * the attr code. Instead, the overflow condition is handled on
+	 * conversion to/from disk.
+	 */
+	__uint32_t	firstused;
+	__u8		holes;
+	struct {
+		__uint16_t	base;
+		__uint16_t	size;
+	} freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*
+ * Special value to represent fs block size in the leaf header firstused field.
+ * Only used when block size overflows the 2-bytes available on disk.
+ */
+#define XFS_ATTR3_LEAF_NULLOFF	0
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
+ */
+#define	XFS_ATTR_LOCAL_BIT	0	/* attr is stored locally */
+#define	XFS_ATTR_ROOT_BIT	1	/* limit access to trusted attrs */
+#define	XFS_ATTR_SECURE_BIT	2	/* limit access to secure attrs */
+#define	XFS_ATTR_INCOMPLETE_BIT	7	/* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL		(1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT		(1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE		(1 << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE	(1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK		(ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK	(XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags)	((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags)	((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x)	(((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+					 ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x)	(((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+					 ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define	XFS_ATTR_LEAF_NAME_ALIGN	((uint)sizeof(xfs_dablk_t))
+
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+	if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+		return sizeof(struct xfs_attr3_leaf_hdr);
+	return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+	if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+		return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+	return &leafp->entries[0];
+}
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+	struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+	return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+	return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+	return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
+		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+	return (((bsize) >> 1) + ((bsize) >> 2));
+}
+
+
+
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC	0x5841524d	/* XARM */
+
+struct xfs_attr3_rmt_hdr {
+	__be32	rm_magic;
+	__be32	rm_offset;
+	__be32	rm_bytes;
+	__be32	rm_crc;
+	uuid_t	rm_uuid;
+	__be64	rm_owner;
+	__be64	rm_blkno;
+	__be64	rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF	offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize)	\
+	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+			sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2.c b/kernel/fs/xfs/libxfs/xfs_dir2.c
new file mode 100644
index 000000000..a69fb3a1e
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2.c
@@ -0,0 +1,731 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+
+/*
+ * @mode, if set, indicates that the type field needs to be set up.
+ * This uses the transformation from file mode to DT_* as defined in linux/fs.h
+ * for file type specification. This will be propagated into the directory
+ * structure if appropriate for the given operation and filesystem config.
+ */
+const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
+	[0]			= XFS_DIR3_FT_UNKNOWN,
+	[S_IFREG >> S_SHIFT]    = XFS_DIR3_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]    = XFS_DIR3_FT_DIR,
+	[S_IFCHR >> S_SHIFT]    = XFS_DIR3_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]    = XFS_DIR3_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]    = XFS_DIR3_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]   = XFS_DIR3_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]    = XFS_DIR3_FT_SYMLINK,
+};
+
+/*
+ * ASCII case-insensitive (ie. A-Z) support for directories that was
+ * used in IRIX.
+ */
+STATIC xfs_dahash_t
+xfs_ascii_ci_hashname(
+	struct xfs_name	*name)
+{
+	xfs_dahash_t	hash;
+	int		i;
+
+	for (i = 0, hash = 0; i < name->len; i++)
+		hash = tolower(name->name[i]) ^ rol32(hash, 7);
+
+	return hash;
+}
+
+STATIC enum xfs_dacmp
+xfs_ascii_ci_compname(
+	struct xfs_da_args *args,
+	const unsigned char *name,
+	int		len)
+{
+	enum xfs_dacmp	result;
+	int		i;
+
+	if (args->namelen != len)
+		return XFS_CMP_DIFFERENT;
+
+	result = XFS_CMP_EXACT;
+	for (i = 0; i < len; i++) {
+		if (args->name[i] == name[i])
+			continue;
+		if (tolower(args->name[i]) != tolower(name[i]))
+			return XFS_CMP_DIFFERENT;
+		result = XFS_CMP_CASE;
+	}
+
+	return result;
+}
+
+static struct xfs_nameops xfs_ascii_ci_nameops = {
+	.hashname	= xfs_ascii_ci_hashname,
+	.compname	= xfs_ascii_ci_compname,
+};
+
+int
+xfs_da_mount(
+	struct xfs_mount	*mp)
+{
+	struct xfs_da_geometry	*dageo;
+	int			nodehdr_size;
+
+
+	ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+	ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
+	       XFS_MAX_BLOCKSIZE);
+
+	mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
+	mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
+
+	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
+	mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+				    KM_SLEEP | KM_MAYFAIL);
+	mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+				     KM_SLEEP | KM_MAYFAIL);
+	if (!mp->m_dir_geo || !mp->m_attr_geo) {
+		kmem_free(mp->m_dir_geo);
+		kmem_free(mp->m_attr_geo);
+		return -ENOMEM;
+	}
+
+	/* set up directory geometry */
+	dageo = mp->m_dir_geo;
+	dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+	dageo->fsblog = mp->m_sb.sb_blocklog;
+	dageo->blksize = 1 << dageo->blklog;
+	dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+
+	/*
+	 * Now we've set up the block conversion variables, we can calculate the
+	 * segment block constants using the geometry structure.
+	 */
+	dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+	dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+	dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+	dageo->node_ents = (dageo->blksize - nodehdr_size) /
+				(uint)sizeof(xfs_da_node_entry_t);
+	dageo->magicpct = (dageo->blksize * 37) / 100;
+
+	/* set up attribute geometry - single fsb only */
+	dageo = mp->m_attr_geo;
+	dageo->blklog = mp->m_sb.sb_blocklog;
+	dageo->fsblog = mp->m_sb.sb_blocklog;
+	dageo->blksize = 1 << dageo->blklog;
+	dageo->fsbcount = 1;
+	dageo->node_ents = (dageo->blksize - nodehdr_size) /
+				(uint)sizeof(xfs_da_node_entry_t);
+	dageo->magicpct = (dageo->blksize * 37) / 100;
+
+	if (xfs_sb_version_hasasciici(&mp->m_sb))
+		mp->m_dirnameops = &xfs_ascii_ci_nameops;
+	else
+		mp->m_dirnameops = &xfs_default_nameops;
+
+	return 0;
+}
+
+void
+xfs_da_unmount(
+	struct xfs_mount	*mp)
+{
+	kmem_free(mp->m_dir_geo);
+	kmem_free(mp->m_attr_geo);
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+int
+xfs_dir_isempty(
+	xfs_inode_t	*dp)
+{
+	xfs_dir2_sf_hdr_t	*sfp;
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	if (dp->i_d.di_size == 0)	/* might happen during shutdown. */
+		return 1;
+	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+		return 0;
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	return !sfp->count;
+}
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+	xfs_mount_t	*mp,
+	xfs_ino_t	ino)
+{
+	xfs_agblock_t	agblkno;
+	xfs_agino_t	agino;
+	xfs_agnumber_t	agno;
+	int		ino_ok;
+	int		ioff;
+
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agblkno = XFS_INO_TO_AGBNO(mp, ino);
+	ioff = XFS_INO_TO_OFFSET(mp, ino);
+	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+	ino_ok =
+		agno < mp->m_sb.sb_agcount &&
+		agblkno < mp->m_sb.sb_agblocks &&
+		agblkno != 0 &&
+		ioff < (1 << mp->m_sb.sb_inopblog) &&
+		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+	if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+			XFS_RANDOM_DIR_INO_VALIDATE))) {
+		xfs_warn(mp, "Invalid inode number 0x%Lx",
+				(unsigned long long) ino);
+		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+	return 0;
+}
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+int
+xfs_dir_init(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	xfs_inode_t	*pdp)
+{
+	struct xfs_da_args *args;
+	int		error;
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+	if (error)
+		return error;
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return -ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->dp = dp;
+	args->trans = tp;
+	error = xfs_dir2_sf_create(args, pdp->i_ino);
+	kmem_free(args);
+	return error;
+}
+
+/*
+ * Enter a name in a directory, or check for available space.
+ * If inum is 0, only the available space test is performed.
+ */
+int
+xfs_dir_createname(
+	xfs_trans_t		*tp,
+	xfs_inode_t		*dp,
+	struct xfs_name		*name,
+	xfs_ino_t		inum,		/* new entry inode number */
+	xfs_fsblock_t		*first,		/* bmap's firstblock */
+	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
+	xfs_extlen_t		total)		/* bmap's total block count */
+{
+	struct xfs_da_args	*args;
+	int			rval;
+	int			v;		/* type-checking value */
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	if (inum) {
+		rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+		if (rval)
+			return rval;
+		XFS_STATS_INC(xs_dir_create);
+	}
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return -ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->inumber = inum;
+	args->dp = dp;
+	args->firstblock = first;
+	args->flist = flist;
+	args->total = total;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+	args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+	if (!inum)
+		args->op_flags |= XFS_DA_OP_JUSTCHECK;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_addname(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_addname(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_addname(args);
+	else
+		rval = xfs_dir2_node_addname(args);
+
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * If doing a CI lookup and case-insensitive match, dup actual name into
+ * args.value. Return EEXIST for success (ie. name found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+	struct xfs_da_args *args,
+	const unsigned char *name,
+	int		len)
+{
+	if (args->cmpresult == XFS_CMP_DIFFERENT)
+		return -ENOENT;
+	if (args->cmpresult != XFS_CMP_CASE ||
+					!(args->op_flags & XFS_DA_OP_CILOOKUP))
+		return -EEXIST;
+
+	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+	if (!args->value)
+		return -ENOMEM;
+
+	memcpy(args->value, name, len);
+	args->valuelen = len;
+	return -EEXIST;
+}
+
+/*
+ * Lookup a name in a directory, give back the inode number.
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * to name, or ci_name->name is set to NULL for an exact match.
+ */
+
+int
+xfs_dir_lookup(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	struct xfs_name	*name,
+	xfs_ino_t	*inum,		/* out: inode number */
+	struct xfs_name *ci_name)	/* out: actual name if CI match */
+{
+	struct xfs_da_args *args;
+	int		rval;
+	int		v;		/* type-checking value */
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	XFS_STATS_INC(xs_dir_lookup);
+
+	/*
+	 * We need to use KM_NOFS here so that lockdep will not throw false
+	 * positive deadlock warnings on a non-transactional lookup path. It is
+	 * safe to recurse into inode recalim in that case, but lockdep can't
+	 * easily be taught about it. Hence KM_NOFS avoids having to add more
+	 * lockdep Doing this avoids having to add a bunch of lockdep class
+	 * annotations into the reclaim path for the ilock.
+	 */
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->dp = dp;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+	args->op_flags = XFS_DA_OP_OKNOENT;
+	if (ci_name)
+		args->op_flags |= XFS_DA_OP_CILOOKUP;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_lookup(args);
+		goto out_check_rval;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_lookup(args);
+		goto out_check_rval;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_lookup(args);
+	else
+		rval = xfs_dir2_node_lookup(args);
+
+out_check_rval:
+	if (rval == -EEXIST)
+		rval = 0;
+	if (!rval) {
+		*inum = args->inumber;
+		if (ci_name) {
+			ci_name->name = args->value;
+			ci_name->len = args->valuelen;
+		}
+	}
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+int
+xfs_dir_removename(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	struct xfs_name	*name,
+	xfs_ino_t	ino,
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	struct xfs_da_args *args;
+	int		rval;
+	int		v;		/* type-checking value */
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	XFS_STATS_INC(xs_dir_remove);
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return -ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->inumber = ino;
+	args->dp = dp;
+	args->firstblock = first;
+	args->flist = flist;
+	args->total = total;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_removename(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_removename(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_removename(args);
+	else
+		rval = xfs_dir2_node_removename(args);
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+int
+xfs_dir_replace(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	struct xfs_name	*name,		/* name of entry to replace */
+	xfs_ino_t	inum,		/* new inode number */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	struct xfs_da_args *args;
+	int		rval;
+	int		v;		/* type-checking value */
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+
+	rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+	if (rval)
+		return rval;
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return -ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->inumber = inum;
+	args->dp = dp;
+	args->firstblock = first;
+	args->flist = flist;
+	args->total = total;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_replace(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_replace(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_replace(args);
+	else
+		rval = xfs_dir2_node_replace(args);
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ */
+int
+xfs_dir_canenter(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	struct xfs_name	*name)		/* name of entry to add */
+{
+	return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0);
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ *
+ * This routine is for data and free blocks, not leaf/node blocks which are
+ * handled by xfs_da_grow_inode.
+ */
+int
+xfs_dir2_grow_inode(
+	struct xfs_da_args	*args,
+	int			space,	/* v2 dir's space XFS_DIR2_xxx_SPACE */
+	xfs_dir2_db_t		*dbp)	/* out: block number added */
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	xfs_fileoff_t		bno;	/* directory offset of new block */
+	int			count;	/* count of filesystem blocks */
+	int			error;
+
+	trace_xfs_dir2_grow_inode(args, space);
+
+	/*
+	 * Set lowest possible block in the space requested.
+	 */
+	bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+	count = args->geo->fsbcount;
+
+	error = xfs_da_grow_inode_int(args, &bno, count);
+	if (error)
+		return error;
+
+	*dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
+
+	/*
+	 * Update file's size if this is the data space and it grew.
+	 */
+	if (space == XFS_DIR2_DATA_SPACE) {
+		xfs_fsize_t	size;		/* directory file (data) size */
+
+		size = XFS_FSB_TO_B(mp, bno + count);
+		if (size > dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+}
+
+/*
+ * See if the directory is a single-block form directory.
+ */
+int
+xfs_dir2_isblock(
+	struct xfs_da_args	*args,
+	int			*vp)	/* out: 1 is block, 0 is not block */
+{
+	xfs_fileoff_t		last;	/* last file offset */
+	int			rval;
+
+	if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
+		return rval;
+	rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
+	ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
+	*vp = rval;
+	return 0;
+}
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int
+xfs_dir2_isleaf(
+	struct xfs_da_args	*args,
+	int			*vp)	/* out: 1 is block, 0 is not block */
+{
+	xfs_fileoff_t		last;	/* last file offset */
+	int			rval;
+
+	if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
+		return rval;
+	*vp = last == args->geo->leafblk + args->geo->fsbcount;
+	return 0;
+}
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+	xfs_da_args_t	*args,
+	xfs_dir2_db_t	db,
+	struct xfs_buf	*bp)
+{
+	xfs_fileoff_t	bno;		/* directory file offset */
+	xfs_dablk_t	da;		/* directory file offset */
+	int		done;		/* bunmap is finished */
+	xfs_inode_t	*dp;
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_trans_t	*tp;
+
+	trace_xfs_dir2_shrink_inode(args, db);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	da = xfs_dir2_db_to_da(args->geo, db);
+	/*
+	 * Unmap the fsblock(s).
+	 */
+	if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
+			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+			&done))) {
+		/*
+		 * ENOSPC actually can happen if we're in a removename with
+		 * no space reservation, and the resulting block removal
+		 * would cause a bmap btree split or conversion from extents
+		 * to btree.  This can only happen for un-fragmented
+		 * directory blocks, since you need to be punching out
+		 * the middle of an extent.
+		 * In this case we need to leave the block in the file,
+		 * and not binval it.
+		 * So the block has to be in a consistent empty state
+		 * and appropriately logged.
+		 * We don't free up the buffer, the caller can tell it
+		 * hasn't happened since it got an error back.
+		 */
+		return error;
+	}
+	ASSERT(done);
+	/*
+	 * Invalidate the buffer from the transaction.
+	 */
+	xfs_trans_binval(tp, bp);
+	/*
+	 * If it's not a data block, we're done.
+	 */
+	if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
+		return 0;
+	/*
+	 * If the block isn't the last one in the directory, we're done.
+	 */
+	if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
+		return 0;
+	bno = da;
+	if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+		/*
+		 * This can't really happen unless there's kernel corruption.
+		 */
+		return error;
+	}
+	if (db == args->geo->datablk)
+		ASSERT(bno == 0);
+	else
+		ASSERT(bno > 0);
+	/*
+	 * Set the size to the new last block.
+	 */
+	dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	return 0;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2.h b/kernel/fs/xfs/libxfs/xfs_dir2.h
new file mode 100644
index 000000000..e55353651
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2.h
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+
+struct xfs_bmap_free;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
+
+extern struct xfs_name	xfs_name_dotdot;
+
+/*
+ * directory filetype conversion tables.
+ */
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+
+/*
+ * directory operations vector for encode/decode routines
+ */
+struct xfs_dir_ops {
+	int	(*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
+	struct xfs_dir2_sf_entry *
+		(*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
+				struct xfs_dir2_sf_entry *sfep);
+	__uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+	void	(*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
+				__uint8_t ftype);
+	xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
+				struct xfs_dir2_sf_entry *sfep);
+	void	(*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
+			      struct xfs_dir2_sf_entry *sfep,
+			      xfs_ino_t ino);
+	xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
+	void	(*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
+				     xfs_ino_t ino);
+
+	int	(*data_entsize)(int len);
+	__uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+	void	(*data_put_ftype)(struct xfs_dir2_data_entry *dep,
+				__uint8_t ftype);
+	__be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+	struct xfs_dir2_data_free *
+		(*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
+
+	xfs_dir2_data_aoff_t data_dot_offset;
+	xfs_dir2_data_aoff_t data_dotdot_offset;
+	xfs_dir2_data_aoff_t data_first_offset;
+	size_t	data_entry_offset;
+
+	struct xfs_dir2_data_entry *
+		(*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_unused *
+		(*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
+
+	int	leaf_hdr_size;
+	void	(*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
+				    struct xfs_dir3_icleaf_hdr *from);
+	void	(*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
+				      struct xfs_dir2_leaf *from);
+	int	(*leaf_max_ents)(struct xfs_da_geometry *geo);
+	struct xfs_dir2_leaf_entry *
+		(*leaf_ents_p)(struct xfs_dir2_leaf *lp);
+
+	int	node_hdr_size;
+	void	(*node_hdr_to_disk)(struct xfs_da_intnode *to,
+				    struct xfs_da3_icnode_hdr *from);
+	void	(*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
+				      struct xfs_da_intnode *from);
+	struct xfs_da_node_entry *
+		(*node_tree_p)(struct xfs_da_intnode *dap);
+
+	int	free_hdr_size;
+	void	(*free_hdr_to_disk)(struct xfs_dir2_free *to,
+				    struct xfs_dir3_icfree_hdr *from);
+	void	(*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
+				      struct xfs_dir2_free *from);
+	int	(*free_max_bests)(struct xfs_da_geometry *geo);
+	__be16 * (*free_bests_p)(struct xfs_dir2_free *free);
+	xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
+				   xfs_dir2_db_t db);
+	int	(*db_to_fdindex)(struct xfs_da_geometry *geo,
+				 xfs_dir2_db_t db);
+};
+
+extern const struct xfs_dir_ops *
+	xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+extern const struct xfs_dir_ops *
+	xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+
+/*
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name, xfs_ino_t inum,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name, xfs_ino_t *inum,
+				struct xfs_name *ci_name);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name, xfs_ino_t ino,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name, xfs_ino_t inum,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name);
+
+/*
+ * Direct call from the bmap code, bypassing the generic directory layer.
+ */
+extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
+
+/*
+ * Interface routines used by userspace utilities
+ */
+extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+				struct xfs_buf *bp);
+
+extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
+		struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+		struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
+		struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+		struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+		struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
+		struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
+		xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+		int *needlogp, int *needscanp);
+
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+		struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+		struct xfs_dir2_data_unused *dup);
+
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+	return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr.  It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+	return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+	return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+	return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+	return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+	return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+			xfs_dir2_data_aoff_t o)
+{
+	return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+	return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+			   xfs_dir2_data_aoff_t o)
+{
+	return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+	return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+	return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+	return ((struct xfs_dir2_block_tail *)
+		((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+	return (struct xfs_dir2_leaf_tail *)
+		((char *)lp + geo->blksize -
+		  sizeof(struct xfs_dir2_leaf_tail));
+}
+
+#endif	/* __XFS_DIR2_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_block.c b/kernel/fs/xfs/libxfs/xfs_dir2_block.c
new file mode 100644
index 000000000..9354e190b
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_block.c
@@ -0,0 +1,1254 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
+				    int first, int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
+				     int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+	xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
+}
+
+static bool
+xfs_dir3_block_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+			return false;
+	}
+	if (__xfs_dir3_data_check(NULL, bp))
+		return false;
+	return true;
+}
+
+static void
+xfs_dir3_block_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_dir3_block_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_block_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_block_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+	.verify_read = xfs_dir3_block_read_verify,
+	.verify_write = xfs_dir3_block_write_verify,
+};
+
+int
+xfs_dir3_block_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+				XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+	return err;
+}
+
+static void
+xfs_dir3_block_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+	bp->b_ops = &xfs_dir3_block_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		memset(hdr3, 0, sizeof(*hdr3));
+		hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+		return;
+
+	}
+	hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+}
+
+static void
+xfs_dir2_block_need_space(
+	struct xfs_inode		*dp,
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_block_tail	*btp,
+	struct xfs_dir2_leaf_entry	*blp,
+	__be16				**tagpp,
+	struct xfs_dir2_data_unused	**dupp,
+	struct xfs_dir2_data_unused	**enddupp,
+	int				*compact,
+	int				len)
+{
+	struct xfs_dir2_data_free	*bf;
+	__be16				*tagp = NULL;
+	struct xfs_dir2_data_unused	*dup = NULL;
+	struct xfs_dir2_data_unused	*enddup = NULL;
+
+	*compact = 0;
+	bf = dp->d_ops->data_bestfree_p(hdr);
+
+	/*
+	 * If there are stale entries we'll use one for the leaf.
+	 */
+	if (btp->stale) {
+		if (be16_to_cpu(bf[0].length) >= len) {
+			/*
+			 * The biggest entry enough to avoid compaction.
+			 */
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)hdr + be16_to_cpu(bf[0].offset));
+			goto out;
+		}
+
+		/*
+		 * Will need to compact to make this work.
+		 * Tag just before the first leaf entry.
+		 */
+		*compact = 1;
+		tagp = (__be16 *)blp - 1;
+
+		/* Data object just before the first leaf entry.  */
+		dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+		/*
+		 * If it's not free then the data will go where the
+		 * leaf data starts now, if it works at all.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+			    (uint)sizeof(*blp) < len)
+				dup = NULL;
+		} else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+			dup = NULL;
+		else
+			dup = (xfs_dir2_data_unused_t *)blp;
+		goto out;
+	}
+
+	/*
+	 * no stale entries, so just use free space.
+	 * Tag just before the first leaf entry.
+	 */
+	tagp = (__be16 *)blp - 1;
+
+	/* Data object just before the first leaf entry.  */
+	enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+	/*
+	 * If it's not free then can't do this add without cleaning up:
+	 * the space before the first leaf entry needs to be free so it
+	 * can be expanded to hold the pointer to the new entry.
+	 */
+	if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+		/*
+		 * Check out the biggest freespace and see if it's the same one.
+		 */
+		dup = (xfs_dir2_data_unused_t *)
+		      ((char *)hdr + be16_to_cpu(bf[0].offset));
+		if (dup != enddup) {
+			/*
+			 * Not the same free entry, just check its length.
+			 */
+			if (be16_to_cpu(dup->length) < len)
+				dup = NULL;
+			goto out;
+		}
+
+		/*
+		 * It is the biggest freespace, can it hold the leaf too?
+		 */
+		if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+			/*
+			 * Yes, use the second-largest entry instead if it works.
+			 */
+			if (be16_to_cpu(bf[1].length) >= len)
+				dup = (xfs_dir2_data_unused_t *)
+				      ((char *)hdr + be16_to_cpu(bf[1].offset));
+			else
+				dup = NULL;
+		}
+	}
+out:
+	*tagpp = tagp;
+	*dupp = dup;
+	*enddupp = enddup;
+}
+
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+	struct xfs_da_args		*args,
+	struct xfs_buf			*bp,
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_block_tail	*btp,
+	struct xfs_dir2_leaf_entry	*blp,
+	int				*needlog,
+	int				*lfloghigh,
+	int				*lfloglow)
+{
+	int			fromidx;	/* source leaf index */
+	int			toidx;		/* target leaf index */
+	int			needscan = 0;
+	int			highstale;	/* high stale index */
+
+	fromidx = toidx = be32_to_cpu(btp->count) - 1;
+	highstale = *lfloghigh = -1;
+	for (; fromidx >= 0; fromidx--) {
+		if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+			if (highstale == -1)
+				highstale = toidx;
+			else {
+				if (*lfloghigh == -1)
+					*lfloghigh = toidx;
+				continue;
+			}
+		}
+		if (fromidx < toidx)
+			blp[toidx] = blp[fromidx];
+		toidx--;
+	}
+	*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+	*lfloghigh -= be32_to_cpu(btp->stale) - 1;
+	be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+	xfs_dir2_data_make_free(args, bp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+		(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+		needlog, &needscan);
+	btp->stale = cpu_to_be32(1);
+	/*
+	 * If we now need to rebuild the bestfree map, do so.
+	 * This needs to happen before the next call to use_free.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(args->dp, hdr, needlog);
+}
+
+/*
+ * Add an entry to a block directory.
+ */
+int						/* error */
+xfs_dir2_block_addname(
+	xfs_da_args_t		*args)		/* directory op arguments */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	int			compact;	/* need to compact leaf ents */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	int			error;		/* error return value */
+	xfs_dir2_data_unused_t	*enddup=NULL;	/* unused at end of data */
+	xfs_dahash_t		hash;		/* hash value of found entry */
+	int			high;		/* high index for binary srch */
+	int			highstale;	/* high stale index */
+	int			lfloghigh=0;	/* last final leaf to log */
+	int			lfloglow=0;	/* first final leaf to log */
+	int			len;		/* length of the new entry */
+	int			low;		/* low index for binary srch */
+	int			lowstale;	/* low stale index */
+	int			mid=0;		/* midpoint for binary srch */
+	int			needlog;	/* need to log header */
+	int			needscan;	/* need to rescan freespace */
+	__be16			*tagp;		/* pointer to tag value */
+	xfs_trans_t		*tp;		/* transaction structure */
+
+	trace_xfs_dir2_block_addname(args);
+
+	dp = args->dp;
+	tp = args->trans;
+
+	/* Read the (one and only) directory block into bp. */
+	error = xfs_dir3_block_read(tp, dp, &bp);
+	if (error)
+		return error;
+
+	len = dp->d_ops->data_entsize(args->namelen);
+
+	/*
+	 * Set up pointers to parts of the block.
+	 */
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+
+	/*
+	 * Find out if we can reuse stale entries or whether we need extra
+	 * space for entry and new leaf.
+	 */
+	xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
+				  &enddup, &compact, len);
+
+	/*
+	 * Done everything we need for a space check now.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+		xfs_trans_brelse(tp, bp);
+		if (!dup)
+			return -ENOSPC;
+		return 0;
+	}
+
+	/*
+	 * If we don't have space for the new entry & leaf ...
+	 */
+	if (!dup) {
+		/* Don't have a space reservation: return no-space.  */
+		if (args->total == 0)
+			return -ENOSPC;
+		/*
+		 * Convert to the next larger format.
+		 * Then add the new entry in that format.
+		 */
+		error = xfs_dir2_block_to_leaf(args, bp);
+		if (error)
+			return error;
+		return xfs_dir2_leaf_addname(args);
+	}
+
+	needlog = needscan = 0;
+
+	/*
+	 * If need to compact the leaf entries, do it now.
+	 */
+	if (compact) {
+		xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
+				      &lfloghigh, &lfloglow);
+		/* recalculate blp post-compaction */
+		blp = xfs_dir2_block_leaf_p(btp);
+	} else if (btp->stale) {
+		/*
+		 * Set leaf logging boundaries to impossible state.
+		 * For the no-stale case they're set explicitly.
+		 */
+		lfloglow = be32_to_cpu(btp->count);
+		lfloghigh = -1;
+	}
+
+	/*
+	 * Find the slot that's first lower than our hash value, -1 if none.
+	 */
+	for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
+		mid--;
+	}
+	/*
+	 * No stale entries, will use enddup space to hold new leaf.
+	 */
+	if (!btp->stale) {
+		/*
+		 * Mark the space needed for the new leaf entry, now in use.
+		 */
+		xfs_dir2_data_use_free(args, bp, enddup,
+			(xfs_dir2_data_aoff_t)
+			((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
+			 sizeof(*blp)),
+			(xfs_dir2_data_aoff_t)sizeof(*blp),
+			&needlog, &needscan);
+		/*
+		 * Update the tail (entry count).
+		 */
+		be32_add_cpu(&btp->count, 1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(dp, hdr, &needlog);
+			needscan = 0;
+		}
+		/*
+		 * Adjust pointer to the first leaf entry, we're about to move
+		 * the table up one to open up space for the new leaf entry.
+		 * Then adjust our index to match.
+		 */
+		blp--;
+		mid++;
+		if (mid)
+			memmove(blp, &blp[1], mid * sizeof(*blp));
+		lfloglow = 0;
+		lfloghigh = mid;
+	}
+	/*
+	 * Use a stale leaf for our new entry.
+	 */
+	else {
+		for (lowstale = mid;
+		     lowstale >= 0 &&
+			blp[lowstale].address !=
+			cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+		     lowstale--)
+			continue;
+		for (highstale = mid + 1;
+		     highstale < be32_to_cpu(btp->count) &&
+			blp[highstale].address !=
+			cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
+			(lowstale < 0 || mid - lowstale > highstale - mid);
+		     highstale++)
+			continue;
+		/*
+		 * Move entries toward the low-numbered stale entry.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == be32_to_cpu(btp->count) ||
+		     mid - lowstale <= highstale - mid)) {
+			if (mid - lowstale)
+				memmove(&blp[lowstale], &blp[lowstale + 1],
+					(mid - lowstale) * sizeof(*blp));
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(mid, lfloghigh);
+		}
+		/*
+		 * Move entries toward the high-numbered stale entry.
+		 */
+		else {
+			ASSERT(highstale < be32_to_cpu(btp->count));
+			mid++;
+			if (highstale - mid)
+				memmove(&blp[mid + 1], &blp[mid],
+					(highstale - mid) * sizeof(*blp));
+			lfloglow = MIN(mid, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		be32_add_cpu(&btp->stale, -1);
+	}
+	/*
+	 * Point to the new data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	/*
+	 * Fill in the leaf entry.
+	 */
+	blp[mid].hashval = cpu_to_be32(args->hashval);
+	blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+	/*
+	 * Mark space for the data entry used.
+	 */
+	xfs_dir2_data_use_free(args, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+		(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+	/*
+	 * Create the new data entry.
+	 */
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, args->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	/*
+	 * Clean up the bestfree array and log the header, tail, and entry.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, bp);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+	xfs_trans_t		*tp,		/* transaction structure */
+	struct xfs_buf		*bp,		/* block buffer */
+	int			first,		/* index of first logged leaf */
+	int			last)		/* index of last logged leaf */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+	xfs_dir2_leaf_entry_t	*blp;
+	xfs_dir2_block_tail_t	*btp;
+
+	btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
+		(uint)((char *)&blp[last + 1] - (char *)hdr - 1));
+}
+
+/*
+ * Log the block tail.
+ */
+static void
+xfs_dir2_block_log_tail(
+	xfs_trans_t		*tp,		/* transaction structure */
+	struct xfs_buf		*bp)		/* block buffer */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+	xfs_dir2_block_tail_t	*btp;
+
+	btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+	xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
+		(uint)((char *)(btp + 1) - (char *)hdr - 1));
+}
+
+/*
+ * Look up an entry in the block.  This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int						/* error */
+xfs_dir2_block_lookup(
+	xfs_da_args_t		*args)		/* dir lookup arguments */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* entry index */
+	int			error;		/* error return value */
+
+	trace_xfs_dir2_block_lookup(args);
+
+	/*
+	 * Get the buffer, look up the entry.
+	 * If not found (ENOENT) then return, have no buffer.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+		return error;
+	dp = args->dp;
+	hdr = bp->b_addr;
+	xfs_dir3_data_check(dp, bp);
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Get the offset from the leaf entry, to point to the data.
+	 */
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(blp[ent].address)));
+	/*
+	 * Fill in inode number, CI name if appropriate, release the block.
+	 */
+	args->inumber = be64_to_cpu(dep->inumber);
+	args->filetype = dp->d_ops->data_get_ftype(dep);
+	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+	xfs_trans_brelse(args->trans, bp);
+	return error;
+}
+
+/*
+ * Internal block lookup routine.
+ */
+static int					/* error */
+xfs_dir2_block_lookup_int(
+	xfs_da_args_t		*args,		/* dir lookup arguments */
+	struct xfs_buf		**bpp,		/* returned block buffer */
+	int			*entno)		/* returned entry number */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			error;		/* error return value */
+	xfs_dahash_t		hash;		/* found hash value */
+	int			high;		/* binary search high index */
+	int			low;		/* binary search low index */
+	int			mid;		/* binary search current idx */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	enum xfs_dacmp		cmp;		/* comparison result */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+
+	error = xfs_dir3_block_read(tp, dp, &bp);
+	if (error)
+		return error;
+
+	hdr = bp->b_addr;
+	xfs_dir3_data_check(dp, bp);
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Loop doing a binary search for our hash value.
+	 * Find our entry, ENOENT if it's not there.
+	 */
+	for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
+		ASSERT(low <= high);
+		mid = (low + high) >> 1;
+		if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+		if (low > high) {
+			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+			xfs_trans_brelse(tp, bp);
+			return -ENOENT;
+		}
+	}
+	/*
+	 * Back up to the first one with the right hash value.
+	 */
+	while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
+		mid--;
+	}
+	/*
+	 * Now loop forward through all the entries with the
+	 * right hash value looking for our name.
+	 */
+	do {
+		if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get pointer to the entry from the leaf.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+			((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
+		/*
+		 * Compare name and if it's an exact match, return the index
+		 * and buffer. If it's the first case-insensitive match, store
+		 * the index and buffer and continue looking for an exact match.
+		 */
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
+			*bpp = bp;
+			*entno = mid;
+			if (cmp == XFS_CMP_EXACT)
+				return 0;
+		}
+	} while (++mid < be32_to_cpu(btp->count) &&
+			be32_to_cpu(blp[mid].hashval) == hash);
+
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or replace).
+	 * If a case-insensitive match was found earlier, return success.
+	 */
+	if (args->cmpresult == XFS_CMP_CASE)
+		return 0;
+	/*
+	 * No match, release the buffer and return ENOENT.
+	 */
+	xfs_trans_brelse(tp, bp);
+	return -ENOENT;
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int						/* error */
+xfs_dir2_block_removename(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf pointer */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* block leaf entry index */
+	int			error;		/* error return value */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to fixup bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* shortform size */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	trace_xfs_dir2_block_removename(args);
+
+	/*
+	 * Look up the entry in the block.  Gets the buffer and entry index.
+	 * It will always be there, the vnodeops level does a lookup first.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Point to the data entry using the leaf entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(blp[ent].address)));
+	/*
+	 * Mark the data entry's space free.
+	 */
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(args, bp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * Fix up the block tail.
+	 */
+	be32_add_cpu(&btp->stale, 1);
+	xfs_dir2_block_log_tail(tp, bp);
+	/*
+	 * Remove the leaf entry by marking it stale.
+	 */
+	blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+	/*
+	 * Fix up bestfree, log the header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, bp);
+	xfs_dir3_data_check(dp, bp);
+	/*
+	 * See if the size as a shortform is good enough.
+	 */
+	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+	if (size > XFS_IFORK_DSIZE(dp))
+		return 0;
+
+	/*
+	 * If it works, do the conversion.
+	 */
+	return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ */
+int						/* error */
+xfs_dir2_block_replace(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* leaf entry index */
+	int			error;		/* error return value */
+
+	trace_xfs_dir2_block_replace(args);
+
+	/*
+	 * Lookup the entry in the directory.  Get buffer and entry index.
+	 * This will always succeed since the caller has already done a lookup.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Point to the data entry we need to change.
+	 */
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(blp[ent].address)));
+	ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
+	/*
+	 * Change the inode number to the new value.
+	 */
+	dep->inumber = cpu_to_be64(args->inumber);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int					/* sort order */
+xfs_dir2_block_sort(
+	const void			*a,	/* first leaf entry */
+	const void			*b)	/* second leaf entry */
+{
+	const xfs_dir2_leaf_entry_t	*la;	/* first leaf entry */
+	const xfs_dir2_leaf_entry_t	*lb;	/* second leaf entry */
+
+	la = a;
+	lb = b;
+	return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
+		(be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ */
+int						/* error */
+xfs_dir2_leaf_to_block(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp,		/* leaf buffer */
+	struct xfs_buf		*dbp)		/* data buffer */
+{
+	__be16			*bestsp;	/* leaf bests table */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	int			error;		/* error return value */
+	int			from;		/* leaf from index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to scan for bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* bytes used */
+	__be16			*tagp;		/* end of entry (tag) */
+	int			to;		/* block/leaf to index */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	trace_xfs_dir2_leaf_to_block(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+	/*
+	 * If there are data blocks other than the first one, take this
+	 * opportunity to remove trailing empty data blocks that may have
+	 * been left behind during no-space-reservation operations.
+	 * These will show up in the leaf bests table.
+	 */
+	while (dp->i_d.di_size > args->geo->blksize) {
+		int hdrsz;
+
+		hdrsz = dp->d_ops->data_entry_offset;
+		bestsp = xfs_dir2_leaf_bests_p(ltp);
+		if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
+					    args->geo->blksize - hdrsz) {
+			if ((error =
+			    xfs_dir2_leaf_trim_data(args, lbp,
+				    (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
+				return error;
+		} else
+			return 0;
+	}
+	/*
+	 * Read the data block if we don't already have it, give up if it fails.
+	 */
+	if (!dbp) {
+		error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
+		if (error)
+			return error;
+	}
+	hdr = dbp->b_addr;
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
+	/*
+	 * Size of the "leaf" area in the block.
+	 */
+	size = (uint)sizeof(xfs_dir2_block_tail_t) +
+	       (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
+	/*
+	 * Look at the last data entry.
+	 */
+	tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
+	dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+	/*
+	 * If it's not free or is too short we can't do it.
+	 */
+	if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+	    be16_to_cpu(dup->length) < size)
+		return 0;
+
+	/*
+	 * Start converting it to block form.
+	 */
+	xfs_dir3_block_init(mp, tp, dbp, dp);
+
+	needlog = 1;
+	needscan = 0;
+	/*
+	 * Use up the space at the end of the block (blp/btp).
+	 */
+	xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
+		&needlog, &needscan);
+	/*
+	 * Initialize the block tail.
+	 */
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
+	btp->stale = 0;
+	xfs_dir2_block_log_tail(tp, dbp);
+	/*
+	 * Initialize the block leaf area.  We compact out stale entries.
+	 */
+	lep = xfs_dir2_block_leaf_p(btp);
+	for (from = to = 0; from < leafhdr.count; from++) {
+		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			continue;
+		lep[to++] = ents[from];
+	}
+	ASSERT(to == be32_to_cpu(btp->count));
+	xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
+	/*
+	 * Scan the bestfree if we need it and log the data block header.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	/*
+	 * Pitch the old leaf block.
+	 */
+	error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
+	if (error)
+		return error;
+
+	/*
+	 * Now see if the resulting block can be shrunken to shortform.
+	 */
+	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+	if (size > XFS_IFORK_DSIZE(dp))
+		return 0;
+
+	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+}
+
+/*
+ * Convert the shortform directory to block form.
+ */
+int						/* error */
+xfs_dir2_sf_to_block(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_db_t		blkno;		/* dir-relative block # (0) */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			dummy;		/* trash */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	int			endoffset;	/* end of data objects */
+	int			error;		/* error return value */
+	int			i;		/* index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to scan block freespc */
+	int			newoffset;	/* offset from current entry */
+	int			offset;		/* target block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* sf entry pointer */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* old shortform header  */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform header  */
+	__be16			*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_name		name;
+	struct xfs_ifork	*ifp;
+
+	trace_xfs_dir2_sf_to_block(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	/*
+	 * Bomb out if the shortform directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		return -EIO;
+	}
+
+	oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
+
+	ASSERT(ifp->if_bytes == dp->i_d.di_size);
+	ASSERT(ifp->if_u1.if_data != NULL);
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+	ASSERT(dp->i_d.di_nextents == 0);
+
+	/*
+	 * Copy the directory into a temporary buffer.
+	 * Then pitch the incore inode data so we can make extents.
+	 */
+	sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+	memcpy(sfp, oldsfp, ifp->if_bytes);
+
+	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+	xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+	dp->i_d.di_size = 0;
+
+	/*
+	 * Add block 0 to the inode.
+	 */
+	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+	if (error) {
+		kmem_free(sfp);
+		return error;
+	}
+	/*
+	 * Initialize the data block, then convert it to block format.
+	 */
+	error = xfs_dir3_data_init(args, blkno, &bp);
+	if (error) {
+		kmem_free(sfp);
+		return error;
+	}
+	xfs_dir3_block_init(mp, tp, bp, dp);
+	hdr = bp->b_addr;
+
+	/*
+	 * Compute size of block "tail" area.
+	 */
+	i = (uint)sizeof(*btp) +
+	    (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+	/*
+	 * The whole thing is initialized to free by the init routine.
+	 * Say we're using the leaf and tail area.
+	 */
+	dup = dp->d_ops->data_unused_p(hdr);
+	needlog = needscan = 0;
+	xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+			       i, &needlog, &needscan);
+	ASSERT(needscan == 0);
+	/*
+	 * Fill in the tail.
+	 */
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	btp->count = cpu_to_be32(sfp->count + 2);	/* ., .. */
+	btp->stale = 0;
+	blp = xfs_dir2_block_leaf_p(btp);
+	endoffset = (uint)((char *)blp - (char *)hdr);
+	/*
+	 * Remove the freespace, we'll manage it.
+	 */
+	xfs_dir2_data_use_free(args, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+		be16_to_cpu(dup->length), &needlog, &needscan);
+	/*
+	 * Create entry for .
+	 */
+	dep = dp->d_ops->data_dot_entry_p(hdr);
+	dep->inumber = cpu_to_be64(dp->i_ino);
+	dep->namelen = 1;
+	dep->name[0] = '.';
+	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+	blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	/*
+	 * Create entry for ..
+	 */
+	dep = dp->d_ops->data_dotdot_entry_p(hdr);
+	dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
+	dep->namelen = 2;
+	dep->name[0] = dep->name[1] = '.';
+	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	offset = dp->d_ops->data_first_offset;
+	/*
+	 * Loop over existing entries, stuff them in.
+	 */
+	i = 0;
+	if (!sfp->count)
+		sfep = NULL;
+	else
+		sfep = xfs_dir2_sf_firstentry(sfp);
+	/*
+	 * Need to preserve the existing offset values in the sf directory.
+	 * Insert holes (unused entries) where necessary.
+	 */
+	while (offset < endoffset) {
+		/*
+		 * sfep is null when we reach the end of the list.
+		 */
+		if (sfep == NULL)
+			newoffset = endoffset;
+		else
+			newoffset = xfs_dir2_sf_get_offset(sfep);
+		/*
+		 * There should be a hole here, make one.
+		 */
+		if (offset < newoffset) {
+			dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+			dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+			dup->length = cpu_to_be16(newoffset - offset);
+			*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
+				((char *)dup - (char *)hdr));
+			xfs_dir2_data_log_unused(args, bp, dup);
+			xfs_dir2_data_freeinsert(hdr,
+						 dp->d_ops->data_bestfree_p(hdr),
+						 dup, &dummy);
+			offset += be16_to_cpu(dup->length);
+			continue;
+		}
+		/*
+		 * Copy a real entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
+		dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
+		dep->namelen = sfep->namelen;
+		dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
+		memcpy(dep->name, sfep->name, dep->namelen);
+		tagp = dp->d_ops->data_entry_tag_p(dep);
+		*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+		xfs_dir2_data_log_entry(args, bp, dep);
+		name.name = sfep->name;
+		name.len = sfep->namelen;
+		blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+							hashname(&name));
+		blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+						 (char *)dep - (char *)hdr));
+		offset = (int)((char *)(tagp + 1) - (char *)hdr);
+		if (++i == sfp->count)
+			sfep = NULL;
+		else
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+	}
+	/* Done with the temporary buffer */
+	kmem_free(sfp);
+	/*
+	 * Sort the leaf entries by hash value.
+	 */
+	xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
+	/*
+	 * Log the leaf entry area and tail.
+	 * Already logged the header in data_init, ignore needlog.
+	 */
+	ASSERT(needscan == 0);
+	xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_data.c b/kernel/fs/xfs/libxfs/xfs_dir2_data.c
new file mode 100644
index 000000000..de1ea16f5
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_data.c
@@ -0,0 +1,1049 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Return 0 is the buffer is good, otherwise an error.
+ */
+int
+__xfs_dir3_data_check(
+	struct xfs_inode	*dp,		/* incore inode pointer */
+	struct xfs_buf		*bp)		/* data block's buffer */
+{
+	xfs_dir2_dataptr_t	addr;		/* addr for leaf lookup */
+	xfs_dir2_data_free_t	*bf;		/* bestfree table */
+	xfs_dir2_block_tail_t	*btp=NULL;	/* block tail */
+	int			count;		/* count of entries found */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry */
+	char			*endp;		/* end of useful data */
+	int			freeseen;	/* mask of bestfrees seen */
+	xfs_dahash_t		hash;		/* hash of current name */
+	int			i;		/* leaf index */
+	int			lastfree;	/* last entry was unused */
+	xfs_dir2_leaf_entry_t	*lep=NULL;	/* block leaf entries */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*p;		/* current data position */
+	int			stale;		/* count of stale leaves */
+	struct xfs_name		name;
+	const struct xfs_dir_ops *ops;
+	struct xfs_da_geometry	*geo;
+
+	mp = bp->b_target->bt_mount;
+	geo = mp->m_dir_geo;
+
+	/*
+	 * We can be passed a null dp here from a verifier, so we need to go the
+	 * hard way to get them.
+	 */
+	ops = xfs_dir_get_ops(mp, dp);
+
+	hdr = bp->b_addr;
+	p = (char *)ops->data_entry_p(hdr);
+
+	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+		btp = xfs_dir2_block_tail_p(geo, hdr);
+		lep = xfs_dir2_block_leaf_p(btp);
+		endp = (char *)lep;
+
+		/*
+		 * The number of leaf entries is limited by the size of the
+		 * block and the amount of space used by the data entries.
+		 * We don't know how much space is used by the data entries yet,
+		 * so just ensure that the count falls somewhere inside the
+		 * block right now.
+		 */
+		XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) <
+			((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
+		break;
+	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+		endp = (char *)hdr + geo->blksize;
+		break;
+	default:
+		XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Account for zero bestfree entries.
+	 */
+	bf = ops->data_bestfree_p(hdr);
+	count = lastfree = freeseen = 0;
+	if (!bf[0].length) {
+		XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset);
+		freeseen |= 1 << 0;
+	}
+	if (!bf[1].length) {
+		XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset);
+		freeseen |= 1 << 1;
+	}
+	if (!bf[2].length) {
+		XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset);
+		freeseen |= 1 << 2;
+	}
+
+	XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >=
+						be16_to_cpu(bf[1].length));
+	XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >=
+						be16_to_cpu(bf[2].length));
+	/*
+	 * Loop over the data/unused entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's unused, look for the space in the bestfree table.
+		 * If we find it, account for that, else make sure it
+		 * doesn't need to be there.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0);
+			XFS_WANT_CORRUPTED_RETURN(mp,
+				be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+					       (char *)dup - (char *)hdr);
+			dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+			if (dfp) {
+				i = (int)(dfp - bf);
+				XFS_WANT_CORRUPTED_RETURN(mp,
+					(freeseen & (1 << i)) == 0);
+				freeseen |= 1 << i;
+			} else {
+				XFS_WANT_CORRUPTED_RETURN(mp,
+					be16_to_cpu(dup->length) <=
+						be16_to_cpu(bf[2].length));
+			}
+			p += be16_to_cpu(dup->length);
+			lastfree = 1;
+			continue;
+		}
+		/*
+		 * It's a real entry.  Validate the fields.
+		 * If this is a block directory then make sure it's
+		 * in the leaf section of the block.
+		 * The linear search is crude but this is DEBUG code.
+		 */
+		dep = (xfs_dir2_data_entry_t *)p;
+		XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0);
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
+					       (char *)dep - (char *)hdr);
+		XFS_WANT_CORRUPTED_RETURN(mp,
+				ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
+		count++;
+		lastfree = 0;
+		if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+			addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+						(xfs_dir2_data_aoff_t)
+						((char *)dep - (char *)hdr));
+			name.name = dep->name;
+			name.len = dep->namelen;
+			hash = mp->m_dirnameops->hashname(&name);
+			for (i = 0; i < be32_to_cpu(btp->count); i++) {
+				if (be32_to_cpu(lep[i].address) == addr &&
+				    be32_to_cpu(lep[i].hashval) == hash)
+					break;
+			}
+			XFS_WANT_CORRUPTED_RETURN(mp,
+						  i < be32_to_cpu(btp->count));
+		}
+		p += ops->data_entsize(dep->namelen);
+	}
+	/*
+	 * Need to have seen all the entries and all the bestfree slots.
+	 */
+	XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+		for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+			if (lep[i].address ==
+			    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+				stale++;
+			if (i > 0)
+				XFS_WANT_CORRUPTED_RETURN(mp,
+					be32_to_cpu(lep[i].hashval) >=
+						be32_to_cpu(lep[i - 1].hashval));
+		}
+		XFS_WANT_CORRUPTED_RETURN(mp, count ==
+			be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+		XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale));
+	}
+	return 0;
+}
+
+static bool
+xfs_dir3_data_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+			return false;
+	}
+	if (__xfs_dir3_data_check(NULL, bp))
+		return false;
+	return true;
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir3_data_reada_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+		bp->b_ops = &xfs_dir3_block_buf_ops;
+		bp->b_ops->verify_read(bp);
+		return;
+	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+		xfs_dir3_data_verify(bp);
+		return;
+	default:
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		break;
+	}
+}
+
+static void
+xfs_dir3_data_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+		 xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_dir3_data_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_data_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_data_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+	.verify_read = xfs_dir3_data_read_verify,
+	.verify_write = xfs_dir3_data_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+	.verify_read = xfs_dir3_data_reada_verify,
+	.verify_write = xfs_dir3_data_write_verify,
+};
+
+
+int
+xfs_dir3_data_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mapped_bno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+	return err;
+}
+
+int
+xfs_dir3_data_readahead(
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mapped_bno)
+{
+	return xfs_da_reada_buf(dp, bno, mapped_bno,
+				XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
+}
+
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry if any that corresponds to it.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+	struct xfs_dir2_data_hdr *hdr,		/* data block header */
+	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
+	struct xfs_dir2_data_unused *dup)	/* unused space */
+{
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_aoff_t	off;		/* offset value needed */
+#ifdef DEBUG
+	int			matched;	/* matched the value */
+	int			seenzero;	/* saw a 0 bestfree entry */
+#endif
+
+	off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+
+#ifdef DEBUG
+	/*
+	 * Validate some consistency in the bestfree table.
+	 * Check order, non-overlapping entries, and if we find the
+	 * one we're looking for it has to be exact.
+	 */
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+	for (dfp = &bf[0], seenzero = matched = 0;
+	     dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
+	     dfp++) {
+		if (!dfp->offset) {
+			ASSERT(!dfp->length);
+			seenzero = 1;
+			continue;
+		}
+		ASSERT(seenzero == 0);
+		if (be16_to_cpu(dfp->offset) == off) {
+			matched = 1;
+			ASSERT(dfp->length == dup->length);
+		} else if (off < be16_to_cpu(dfp->offset))
+			ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
+		else
+			ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
+		ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
+		if (dfp > &bf[0])
+			ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
+	}
+#endif
+	/*
+	 * If this is smaller than the smallest bestfree entry,
+	 * it can't be there since they're sorted.
+	 */
+	if (be16_to_cpu(dup->length) <
+	    be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+		return NULL;
+	/*
+	 * Look at the three bestfree entries for our guy.
+	 */
+	for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+		if (!dfp->offset)
+			return NULL;
+		if (be16_to_cpu(dfp->offset) == off)
+			return dfp;
+	}
+	/*
+	 * Didn't find it.  This only happens if there are duplicate lengths.
+	 */
+	return NULL;
+}
+
+/*
+ * Insert an unused-space entry into the bestfree table.
+ */
+xfs_dir2_data_free_t *				/* entry inserted */
+xfs_dir2_data_freeinsert(
+	struct xfs_dir2_data_hdr *hdr,		/* data block pointer */
+	struct xfs_dir2_data_free *dfp,		/* bestfree table pointer */
+	struct xfs_dir2_data_unused *dup,	/* unused space */
+	int			*loghead)	/* log the data header (out) */
+{
+	xfs_dir2_data_free_t	new;		/* new bestfree entry */
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	new.length = dup->length;
+	new.offset = cpu_to_be16((char *)dup - (char *)hdr);
+
+	/*
+	 * Insert at position 0, 1, or 2; or not at all.
+	 */
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
+		dfp[2] = dfp[1];
+		dfp[1] = dfp[0];
+		dfp[0] = new;
+		*loghead = 1;
+		return &dfp[0];
+	}
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
+		dfp[2] = dfp[1];
+		dfp[1] = new;
+		*loghead = 1;
+		return &dfp[1];
+	}
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
+		dfp[2] = new;
+		*loghead = 1;
+		return &dfp[2];
+	}
+	return NULL;
+}
+
+/*
+ * Remove a bestfree entry from the table.
+ */
+STATIC void
+xfs_dir2_data_freeremove(
+	struct xfs_dir2_data_hdr *hdr,		/* data block header */
+	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
+	struct xfs_dir2_data_free *dfp,		/* bestfree entry pointer */
+	int			*loghead)	/* out: log data header */
+{
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * It's the first entry, slide the next 2 up.
+	 */
+	if (dfp == &bf[0]) {
+		bf[0] = bf[1];
+		bf[1] = bf[2];
+	}
+	/*
+	 * It's the second entry, slide the 3rd entry up.
+	 */
+	else if (dfp == &bf[1])
+		bf[1] = bf[2];
+	/*
+	 * Must be the last entry.
+	 */
+	else
+		ASSERT(dfp == &bf[2]);
+	/*
+	 * Clear the 3rd entry, must be zero now.
+	 */
+	bf[2].length = 0;
+	bf[2].offset = 0;
+	*loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ */
+void
+xfs_dir2_data_freescan(
+	struct xfs_inode	*dp,
+	struct xfs_dir2_data_hdr *hdr,
+	int			*loghead)
+{
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* active data entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	struct xfs_dir2_data_free *bf;
+	char			*endp;		/* end of block's data */
+	char			*p;		/* current entry pointer */
+	struct xfs_da_geometry	*geo = dp->i_mount->m_dir_geo;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * Start by clearing the table.
+	 */
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
+	*loghead = 1;
+	/*
+	 * Set up pointers.
+	 */
+	p = (char *)dp->d_ops->data_entry_p(hdr);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+		btp = xfs_dir2_block_tail_p(geo, hdr);
+		endp = (char *)xfs_dir2_block_leaf_p(btp);
+	} else
+		endp = (char *)hdr + geo->blksize;
+	/*
+	 * Loop over the block's entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's a free entry, insert it.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			ASSERT((char *)dup - (char *)hdr ==
+			       be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+			xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
+			p += be16_to_cpu(dup->length);
+		}
+		/*
+		 * For active entries, check their tags and skip them.
+		 */
+		else {
+			dep = (xfs_dir2_data_entry_t *)p;
+			ASSERT((char *)dep - (char *)hdr ==
+			       be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+			p += dp->d_ops->data_entsize(dep->namelen);
+		}
+	}
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ */
+int						/* error */
+xfs_dir3_data_init(
+	xfs_da_args_t		*args,		/* directory operation args */
+	xfs_dir2_db_t		blkno,		/* logical dir block number */
+	struct xfs_buf		**bpp)		/* output block buffer */
+{
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	struct xfs_dir2_data_free *bf;
+	int			error;		/* error return value */
+	int			i;		/* bestfree index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int                     t;              /* temp */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Get the buffer set up for the block.
+	 */
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+			       -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+	bp->b_ops = &xfs_dir3_data_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
+
+	/*
+	 * Initialize the header.
+	 */
+	hdr = bp->b_addr;
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+		memset(hdr3, 0, sizeof(*hdr3));
+		hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+	} else
+		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
+	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+		bf[i].length = 0;
+		bf[i].offset = 0;
+	}
+
+	/*
+	 * Set up an unused entry for the block's body.
+	 */
+	dup = dp->d_ops->data_unused_p(hdr);
+	dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+
+	t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
+	bf[0].length = cpu_to_be16(t);
+	dup->length = cpu_to_be16(t);
+	*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+	/*
+	 * Log it and return it.
+	 */
+	xfs_dir2_data_log_header(args, bp);
+	xfs_dir2_data_log_unused(args, bp, dup);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Log an active data entry from the block.
+ */
+void
+xfs_dir2_data_log_entry(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_entry_t	*dep)		/* data entry pointer */
+{
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+		(uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
+		       (char *)hdr - 1));
+}
+
+/*
+ * Log a data block header.
+ */
+void
+xfs_dir2_data_log_header(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+#ifdef DEBUG
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
+
+	xfs_trans_log_buf(args->trans, bp, 0,
+			  args->dp->d_ops->data_entry_offset - 1);
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_unused_t	*dup)		/* data unused pointer */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * Log the first part of the unused entry.
+	 */
+	xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
+		(uint)((char *)&dup->length + sizeof(dup->length) -
+		       1 - (char *)hdr));
+	/*
+	 * Log the end (tag) of the unused entry.
+	 */
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
+		(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
+		       sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ */
+void
+xfs_dir2_data_make_free(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_aoff_t	offset,		/* starting byte offset */
+	xfs_dir2_data_aoff_t	len,		/* length in bytes */
+	int			*needlogp,	/* out: log header */
+	int			*needscanp)	/* out: regen bestfree */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block pointer */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	char			*endptr;	/* end of data area */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*postdup;	/* unused entry after us */
+	xfs_dir2_data_unused_t	*prevdup;	/* unused entry before us */
+	struct xfs_dir2_data_free *bf;
+
+	hdr = bp->b_addr;
+
+	/*
+	 * Figure out where the end of the data area is.
+	 */
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+		endptr = (char *)hdr + args->geo->blksize;
+	else {
+		xfs_dir2_block_tail_t	*btp;	/* block tail */
+
+		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+			hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+		btp = xfs_dir2_block_tail_p(args->geo, hdr);
+		endptr = (char *)xfs_dir2_block_leaf_p(btp);
+	}
+	/*
+	 * If this isn't the start of the block, then back up to
+	 * the previous entry and see if it's free.
+	 */
+	if (offset > args->dp->d_ops->data_entry_offset) {
+		__be16			*tagp;	/* tag just before us */
+
+		tagp = (__be16 *)((char *)hdr + offset) - 1;
+		prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+		if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+			prevdup = NULL;
+	} else
+		prevdup = NULL;
+	/*
+	 * If this isn't the end of the block, see if the entry after
+	 * us is free.
+	 */
+	if ((char *)hdr + offset + len < endptr) {
+		postdup =
+			(xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+			postdup = NULL;
+	} else
+		postdup = NULL;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * Previous and following entries are both free,
+	 * merge everything into a single free entry.
+	 */
+	bf = args->dp->d_ops->data_bestfree_p(hdr);
+	if (prevdup && postdup) {
+		xfs_dir2_data_free_t	*dfp2;	/* another bestfree pointer */
+
+		/*
+		 * See if prevdup and/or postdup are in bestfree table.
+		 */
+		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+		dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
+		/*
+		 * We need a rescan unless there are exactly 2 free entries
+		 * namely our two.  Then we know what's happening, otherwise
+		 * since the third bestfree is there, there might be more
+		 * entries.
+		 */
+		needscan = (bf[2].length != 0);
+		/*
+		 * Fix up the new big freespace.
+		 */
+		be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+		*xfs_dir2_data_unused_tag_p(prevdup) =
+			cpu_to_be16((char *)prevdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, prevdup);
+		if (!needscan) {
+			/*
+			 * Has to be the case that entries 0 and 1 are
+			 * dfp and dfp2 (don't know which is which), and
+			 * entry 2 is empty.
+			 * Remove entry 1 first then entry 0.
+			 */
+			ASSERT(dfp && dfp2);
+			if (dfp == &bf[1]) {
+				dfp = &bf[0];
+				ASSERT(dfp2 == dfp);
+				dfp2 = &bf[1];
+			}
+			xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			/*
+			 * Now insert the new entry.
+			 */
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+						       needlogp);
+			ASSERT(dfp == &bf[0]);
+			ASSERT(dfp->length == prevdup->length);
+			ASSERT(!dfp[1].length);
+			ASSERT(!dfp[2].length);
+		}
+	}
+	/*
+	 * The entry before us is free, merge with it.
+	 */
+	else if (prevdup) {
+		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+		be16_add_cpu(&prevdup->length, len);
+		*xfs_dir2_data_unused_tag_p(prevdup) =
+			cpu_to_be16((char *)prevdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, prevdup);
+		/*
+		 * If the previous entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else {
+			needscan = be16_to_cpu(prevdup->length) >
+				   be16_to_cpu(bf[2].length);
+		}
+	}
+	/*
+	 * The following entry is free, merge with it.
+	 */
+	else if (postdup) {
+		dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If the following entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else {
+			needscan = be16_to_cpu(newdup->length) >
+				   be16_to_cpu(bf[2].length);
+		}
+	}
+	/*
+	 * Neither neighbor is free.  Make a new entry.
+	 */
+	else {
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(len);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+	}
+	*needscanp = needscan;
+}
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+void
+xfs_dir2_data_use_free(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_unused_t	*dup,		/* unused entry */
+	xfs_dir2_data_aoff_t	offset,		/* starting offset to use */
+	xfs_dir2_data_aoff_t	len,		/* length to use */
+	int			*needlogp,	/* out: need to log header */
+	int			*needscanp)	/* out: need regen bestfree */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	int			matchback;	/* matches end of freespace */
+	int			matchfront;	/* matches start of freespace */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*newdup2;	/* another new unused entry */
+	int			oldlen;		/* old unused entry's length */
+	struct xfs_dir2_data_free *bf;
+
+	hdr = bp->b_addr;
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+	ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
+	ASSERT(offset >= (char *)dup - (char *)hdr);
+	ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
+	ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+	/*
+	 * Look up the entry in the bestfree table.
+	 */
+	oldlen = be16_to_cpu(dup->length);
+	bf = args->dp->d_ops->data_bestfree_p(hdr);
+	dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+	ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
+	/*
+	 * Check for alignment with front and back of the entry.
+	 */
+	matchfront = (char *)dup - (char *)hdr == offset;
+	matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * If we matched it exactly we just need to get rid of it from
+	 * the bestfree table.
+	 */
+	if (matchfront && matchback) {
+		if (dfp) {
+			needscan = (bf[2].offset != 0);
+			if (!needscan)
+				xfs_dir2_data_freeremove(hdr, bf, dfp,
+							 needlogp);
+		}
+	}
+	/*
+	 * We match the first part of the entry.
+	 * Make a new entry with the remaining freespace.
+	 */
+	else if (matchfront) {
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(oldlen - len);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+						       needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(dfp->length == newdup->length);
+			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+			/*
+			 * If we got inserted at the last slot,
+			 * that means we don't know if there was a better
+			 * choice for the last slot, or not.  Rescan.
+			 */
+			needscan = dfp == &bf[2];
+		}
+	}
+	/*
+	 * We match the last part of the entry.
+	 * Trim the allocated space off the tail of the entry.
+	 */
+	else if (matchback) {
+		newdup = dup;
+		newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+						       needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(dfp->length == newdup->length);
+			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+			/*
+			 * If we got inserted at the last slot,
+			 * that means we don't know if there was a better
+			 * choice for the last slot, or not.  Rescan.
+			 */
+			needscan = dfp == &bf[2];
+		}
+	}
+	/*
+	 * Poking out the middle of an entry.
+	 * Make two new entries.
+	 */
+	else {
+		newdup = dup;
+		newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+		*xfs_dir2_data_unused_tag_p(newdup2) =
+			cpu_to_be16((char *)newdup2 - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup2);
+		/*
+		 * If the old entry was in the table, we need to scan
+		 * if the 3rd entry was valid, since these entries
+		 * are smaller than the old one.
+		 * If we don't need to scan that means there were 1 or 2
+		 * entries in the table, and removing the old and adding
+		 * the 2 new will work.
+		 */
+		if (dfp) {
+			needscan = (bf[2].length != 0);
+			if (!needscan) {
+				xfs_dir2_data_freeremove(hdr, bf, dfp,
+							 needlogp);
+				xfs_dir2_data_freeinsert(hdr, bf, newdup,
+							 needlogp);
+				xfs_dir2_data_freeinsert(hdr, bf, newdup2,
+							 needlogp);
+			}
+		}
+	}
+	*needscanp = needscan;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c b/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000..106119955
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -0,0 +1,1819 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Local function declarations.
+ */
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+				    int *indexp, struct xfs_buf **dbpp);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+				    struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+				   struct xfs_buf *bp);
+
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+#ifdef DEBUG
+#define	xfs_dir3_leaf_check(dp, bp) \
+do { \
+	if (!xfs_dir3_leaf1_check((dp), (bp))) \
+		ASSERT(0); \
+} while (0);
+
+STATIC bool
+xfs_dir3_leaf1_check(
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+		return false;
+
+	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define	xfs_dir3_leaf_check(dp, bp)
+#endif
+
+bool
+xfs_dir3_leaf_check_int(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp,
+	struct xfs_dir3_icleaf_hdr *hdr,
+	struct xfs_dir2_leaf	*leaf)
+{
+	struct xfs_dir2_leaf_entry *ents;
+	xfs_dir2_leaf_tail_t	*ltp;
+	int			stale;
+	int			i;
+	const struct xfs_dir_ops *ops;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+
+	/*
+	 * we can be passed a null dp here from a verifier, so we need to go the
+	 * hard way to get them.
+	 */
+	ops = xfs_dir_get_ops(mp, dp);
+
+	if (!hdr) {
+		ops->leaf_hdr_from_disk(&leafhdr, leaf);
+		hdr = &leafhdr;
+	}
+
+	ents = ops->leaf_ents_p(leaf);
+	ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+	/*
+	 * XXX (dgc): This value is not restrictive enough.
+	 * Should factor in the size of the bests table as well.
+	 * We can deduce a value for that from di_size.
+	 */
+	if (hdr->count > ops->leaf_max_ents(geo))
+		return false;
+
+	/* Leaves and bests don't overlap in leaf format. */
+	if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+	     hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+	    (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+		return false;
+
+	/* Check hash value order, count stale entries.  */
+	for (i = stale = 0; i < hdr->count; i++) {
+		if (i + 1 < hdr->count) {
+			if (be32_to_cpu(ents[i].hashval) >
+					be32_to_cpu(ents[i + 1].hashval))
+				return false;
+		}
+		if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			stale++;
+	}
+	if (hdr->stale != stale)
+		return false;
+	return true;
+}
+
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
+static bool
+xfs_dir3_leaf_verify(
+	struct xfs_buf		*bp,
+	__uint16_t		magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+
+	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		__uint16_t		magic3;
+
+		magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
+							 : XFS_DIR3_LEAFN_MAGIC;
+
+		if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
+			return false;
+		if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (leaf->hdr.info.magic != cpu_to_be16(magic))
+			return false;
+	}
+
+	return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
+}
+
+static void
+__read_verify(
+	struct xfs_buf  *bp,
+	__uint16_t	magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_dir3_leaf_verify(bp, magic))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+__write_verify(
+	struct xfs_buf  *bp,
+	__uint16_t	magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_leaf_verify(bp, magic)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
+}
+
+static void
+xfs_dir3_leaf1_read_verify(
+	struct xfs_buf	*bp)
+{
+	__read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+
+static void
+xfs_dir3_leaf1_write_verify(
+	struct xfs_buf	*bp)
+{
+	__write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+
+static void
+xfs_dir3_leafn_read_verify(
+	struct xfs_buf	*bp)
+{
+	__read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leafn_write_verify(
+	struct xfs_buf	*bp)
+{
+	__write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+	.verify_read = xfs_dir3_leaf1_read_verify,
+	.verify_write = xfs_dir3_leaf1_write_verify,
+};
+
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+	.verify_read = xfs_dir3_leafn_read_verify,
+	.verify_write = xfs_dir3_leafn_write_verify,
+};
+
+static int
+xfs_dir3_leaf_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+	return err;
+}
+
+int
+xfs_dir3_leafn_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+	return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	xfs_ino_t		owner,
+	__uint16_t		type)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+
+	ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+		memset(leaf3, 0, sizeof(*leaf3));
+
+		leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+					 ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+					 : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+		leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+		leaf3->info.owner = cpu_to_be64(owner);
+		uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+	} else {
+		memset(leaf, 0, sizeof(*leaf));
+		leaf->hdr.info.magic = cpu_to_be16(type);
+	}
+
+	/*
+	 * If it's a leaf-format directory initialize the tail.
+	 * Caller is responsible for initialising the bests table.
+	 */
+	if (type == XFS_DIR2_LEAF1_MAGIC) {
+		struct xfs_dir2_leaf_tail *ltp;
+
+		ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+		ltp->bestcount = 0;
+		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+	} else {
+		bp->b_ops = &xfs_dir3_leafn_buf_ops;
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+	}
+}
+
+int
+xfs_dir3_leaf_get_buf(
+	xfs_da_args_t		*args,
+	xfs_dir2_db_t		bno,
+	struct xfs_buf		**bpp,
+	__uint16_t		magic)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+
+	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+	       bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+			       -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+	xfs_dir3_leaf_log_header(args, bp);
+	if (magic == XFS_DIR2_LEAF1_MAGIC)
+		xfs_dir3_leaf_log_tail(args, bp);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ */
+int						/* error */
+xfs_dir2_block_to_leaf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*dbp)		/* input block's buffer */
+{
+	__be16			*bestsp;	/* leaf's bestsp entries */
+	xfs_dablk_t		blkno;		/* leaf block's bno */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block's leaf entries */
+	xfs_dir2_block_tail_t	*btp;		/* block's tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	struct xfs_buf		*lbp;		/* leaf block's buffer */
+	xfs_dir2_db_t		ldb;		/* leaf block's bno */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf's tail */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to rescan bestfree */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	trace_xfs_dir2_block_to_leaf(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Add the leaf block to the inode.
+	 * This interface will only put blocks in the leaf/node range.
+	 * Since that's empty now, we'll get the root (block 0 in range).
+	 */
+	if ((error = xfs_da_grow_inode(args, &blkno))) {
+		return error;
+	}
+	ldb = xfs_dir2_da_to_db(args->geo, blkno);
+	ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
+	/*
+	 * Initialize the leaf block, get a buffer for it.
+	 */
+	error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+	if (error)
+		return error;
+
+	leaf = lbp->b_addr;
+	hdr = dbp->b_addr;
+	xfs_dir3_data_check(dp, dbp);
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	/*
+	 * Set the counts in the leaf header.
+	 */
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	leafhdr.count = be32_to_cpu(btp->count);
+	leafhdr.stale = be32_to_cpu(btp->stale);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+
+	/*
+	 * Could compact these but I think we always do the conversion
+	 * after squeezing out stale entries.
+	 */
+	memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
+	needscan = 0;
+	needlog = 1;
+	/*
+	 * Make the space formerly occupied by the leaf entries and block
+	 * tail be free.
+	 */
+	xfs_dir2_data_make_free(args, dbp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+		(xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
+				       (char *)blp),
+		&needlog, &needscan);
+	/*
+	 * Fix up the block header, make it a data block.
+	 */
+	dbp->b_ops = &xfs_dir3_data_buf_ops;
+	xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+	else
+		hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	/*
+	 * Set up leaf tail and bests table.
+	 */
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ltp->bestcount = cpu_to_be32(1);
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	bestsp[0] =  bf[0].length;
+	/*
+	 * Log the data header and leaf bests table.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	xfs_dir3_leaf_check(dp, lbp);
+	xfs_dir3_data_check(dp, dbp);
+	xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
+	return 0;
+}
+
+STATIC void
+xfs_dir3_leaf_find_stale(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
+	int			index,
+	int			*lowstale,
+	int			*highstale)
+{
+	/*
+	 * Find the first stale entry before our index, if any.
+	 */
+	for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
+		if (ents[*lowstale].address ==
+		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			break;
+	}
+
+	/*
+	 * Find the first stale entry at or after our index, if any.
+	 * Stop if the result would require moving more entries than using
+	 * lowstale.
+	 */
+	for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+		if (ents[*highstale].address ==
+		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			break;
+		if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
+			break;
+	}
+}
+
+struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
+	int			index,		/* leaf table position */
+	int			compact,	/* need to compact leaves */
+	int			lowstale,	/* index of prev stale leaf */
+	int			highstale,	/* index of next stale leaf */
+	int			*lfloglow,	/* low leaf logging index */
+	int			*lfloghigh)	/* high leaf logging index */
+{
+	if (!leafhdr->stale) {
+		xfs_dir2_leaf_entry_t	*lep;	/* leaf entry table pointer */
+
+		/*
+		 * Now we need to make room to insert the leaf entry.
+		 *
+		 * If there are no stale entries, just insert a hole at index.
+		 */
+		lep = &ents[index];
+		if (index < leafhdr->count)
+			memmove(lep + 1, lep,
+				(leafhdr->count - index) * sizeof(*lep));
+
+		/*
+		 * Record low and high logging indices for the leaf.
+		 */
+		*lfloglow = index;
+		*lfloghigh = leafhdr->count++;
+		return lep;
+	}
+
+	/*
+	 * There are stale entries.
+	 *
+	 * We will use one of them for the new entry.  It's probably not at
+	 * the right location, so we'll have to shift some up or down first.
+	 *
+	 * If we didn't compact before, we need to find the nearest stale
+	 * entries before and after our insertion point.
+	 */
+	if (compact == 0)
+		xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+					 &lowstale, &highstale);
+
+	/*
+	 * If the low one is better, use it.
+	 */
+	if (lowstale >= 0 &&
+	    (highstale == leafhdr->count ||
+	     index - lowstale - 1 < highstale - index)) {
+		ASSERT(index - lowstale - 1 >= 0);
+		ASSERT(ents[lowstale].address ==
+		       cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+		/*
+		 * Copy entries up to cover the stale entry and make room
+		 * for the new entry.
+		 */
+		if (index - lowstale - 1 > 0) {
+			memmove(&ents[lowstale], &ents[lowstale + 1],
+				(index - lowstale - 1) *
+					sizeof(xfs_dir2_leaf_entry_t));
+		}
+		*lfloglow = MIN(lowstale, *lfloglow);
+		*lfloghigh = MAX(index - 1, *lfloghigh);
+		leafhdr->stale--;
+		return &ents[index - 1];
+	}
+
+	/*
+	 * The high one is better, so use that one.
+	 */
+	ASSERT(highstale - index >= 0);
+	ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+	/*
+	 * Copy entries down to cover the stale entry and make room for the
+	 * new entry.
+	 */
+	if (highstale - index > 0) {
+		memmove(&ents[index + 1], &ents[index],
+			(highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
+	}
+	*lfloglow = MIN(index, *lfloglow);
+	*lfloghigh = MAX(highstale, *lfloghigh);
+	leafhdr->stale--;
+	return &ents[index];
+}
+
+/*
+ * Add an entry to a leaf form directory.
+ */
+int						/* error */
+xfs_dir2_leaf_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	__be16			*bestsp;	/* freespace table in leaf */
+	int			compact;	/* need to compact leaves */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* data unused entry */
+	int			error;		/* error return value */
+	int			grown;		/* allocated new data block */
+	int			highstale;	/* index of next stale leaf */
+	int			i;		/* temporary, index */
+	int			index;		/* leaf table position */
+	struct xfs_buf		*lbp;		/* leaf's buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			length;		/* length of new entry */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry table pointer */
+	int			lfloglow;	/* low leaf logging index */
+	int			lfloghigh;	/* high leaf logging index */
+	int			lowstale;	/* index of prev stale leaf */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
+	int			needbytes;	/* leaf block bytes needed */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data free */
+	__be16			*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	xfs_dir2_db_t		use_block;	/* data block number */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	trace_xfs_dir2_leaf_addname(args);
+
+	dp = args->dp;
+	tp = args->trans;
+
+	error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+	if (error)
+		return error;
+
+	/*
+	 * Look up the entry by hash value and name.
+	 * We know it's not there, our caller has already done a lookup.
+	 * So the index is of the entry to insert in front of.
+	 * But if there are dup hash values the index is of the first of those.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, lbp);
+	leaf = lbp->b_addr;
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	length = dp->d_ops->data_entsize(args->namelen);
+
+	/*
+	 * See if there are any entries with the same hash value
+	 * and space in their block for the new entry.
+	 * This is good because it puts multiple same-hash value entries
+	 * in a data block, improving the lookup of those entries.
+	 */
+	for (use_block = -1, lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     index++, lep++) {
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+		ASSERT(i < be32_to_cpu(ltp->bestcount));
+		ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
+		if (be16_to_cpu(bestsp[i]) >= length) {
+			use_block = i;
+			break;
+		}
+	}
+	/*
+	 * Didn't find a block yet, linear search all the data blocks.
+	 */
+	if (use_block == -1) {
+		for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
+			/*
+			 * Remember a block we see that's missing.
+			 */
+			if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
+			    use_block == -1)
+				use_block = i;
+			else if (be16_to_cpu(bestsp[i]) >= length) {
+				use_block = i;
+				break;
+			}
+		}
+	}
+	/*
+	 * How many bytes do we need in the leaf block?
+	 */
+	needbytes = 0;
+	if (!leafhdr.stale)
+		needbytes += sizeof(xfs_dir2_leaf_entry_t);
+	if (use_block == -1)
+		needbytes += sizeof(xfs_dir2_data_off_t);
+
+	/*
+	 * Now kill use_block if it refers to a missing block, so we
+	 * can use it as an indication of allocation needed.
+	 */
+	if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
+		use_block = -1;
+	/*
+	 * If we don't have enough free bytes but we can make enough
+	 * by compacting out stale entries, we'll do that.
+	 */
+	if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+	    leafhdr.stale > 1)
+		compact = 1;
+
+	/*
+	 * Otherwise if we don't have enough free bytes we need to
+	 * convert to node form.
+	 */
+	else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
+		/*
+		 * Just checking or no space reservation, give up.
+		 */
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+							args->total == 0) {
+			xfs_trans_brelse(tp, lbp);
+			return -ENOSPC;
+		}
+		/*
+		 * Convert to node form.
+		 */
+		error = xfs_dir2_leaf_to_node(args, lbp);
+		if (error)
+			return error;
+		/*
+		 * Then add the new entry.
+		 */
+		return xfs_dir2_node_addname(args);
+	}
+	/*
+	 * Otherwise it will fit without compaction.
+	 */
+	else
+		compact = 0;
+	/*
+	 * If just checking, then it will fit unless we needed to allocate
+	 * a new data block.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+		xfs_trans_brelse(tp, lbp);
+		return use_block == -1 ? -ENOSPC : 0;
+	}
+	/*
+	 * If no allocations are allowed, return now before we've
+	 * changed anything.
+	 */
+	if (args->total == 0 && use_block == -1) {
+		xfs_trans_brelse(tp, lbp);
+		return -ENOSPC;
+	}
+	/*
+	 * Need to compact the leaf entries, removing stale ones.
+	 * Leave one stale entry behind - the one closest to our
+	 * insertion index - and we'll shift that one to our insertion
+	 * point later.
+	 */
+	if (compact) {
+		xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+			&highstale, &lfloglow, &lfloghigh);
+	}
+	/*
+	 * There are stale entries, so we'll need log-low and log-high
+	 * impossibly bad values later.
+	 */
+	else if (leafhdr.stale) {
+		lfloglow = leafhdr.count;
+		lfloghigh = -1;
+	}
+	/*
+	 * If there was no data block space found, we need to allocate
+	 * a new one.
+	 */
+	if (use_block == -1) {
+		/*
+		 * Add the new data block.
+		 */
+		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+				&use_block))) {
+			xfs_trans_brelse(tp, lbp);
+			return error;
+		}
+		/*
+		 * Initialize the block.
+		 */
+		if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
+			xfs_trans_brelse(tp, lbp);
+			return error;
+		}
+		/*
+		 * If we're adding a new data block on the end we need to
+		 * extend the bests table.  Copy it up one entry.
+		 */
+		if (use_block >= be32_to_cpu(ltp->bestcount)) {
+			bestsp--;
+			memmove(&bestsp[0], &bestsp[1],
+				be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+			be32_add_cpu(&ltp->bestcount, 1);
+			xfs_dir3_leaf_log_tail(args, lbp);
+			xfs_dir3_leaf_log_bests(args, lbp, 0,
+						be32_to_cpu(ltp->bestcount) - 1);
+		}
+		/*
+		 * If we're filling in a previously empty block just log it.
+		 */
+		else
+			xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+		hdr = dbp->b_addr;
+		bf = dp->d_ops->data_bestfree_p(hdr);
+		bestsp[use_block] = bf[0].length;
+		grown = 1;
+	} else {
+		/*
+		 * Already had space in some data block.
+		 * Just read that one in.
+		 */
+		error = xfs_dir3_data_read(tp, dp,
+				   xfs_dir2_db_to_da(args->geo, use_block),
+				   -1, &dbp);
+		if (error) {
+			xfs_trans_brelse(tp, lbp);
+			return error;
+		}
+		hdr = dbp->b_addr;
+		bf = dp->d_ops->data_bestfree_p(hdr);
+		grown = 0;
+	}
+	/*
+	 * Point to the biggest freespace in our data block.
+	 */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)hdr + be16_to_cpu(bf[0].offset));
+	ASSERT(be16_to_cpu(dup->length) >= length);
+	needscan = needlog = 0;
+	/*
+	 * Mark the initial part of our freespace in use for the new entry.
+	 */
+	xfs_dir2_data_use_free(args, dbp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
+		&needlog, &needscan);
+	/*
+	 * Initialize our new entry (at last).
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, dep->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	/*
+	 * Need to scan fix up the bestfree table.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	/*
+	 * Need to log the data block's header.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	xfs_dir2_data_log_entry(args, dbp, dep);
+	/*
+	 * If the bests table needs to be changed, do it.
+	 * Log the change unless we've already done that.
+	 */
+	if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+		bestsp[use_block] = bf[0].length;
+		if (!grown)
+			xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+	}
+
+	lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+				       highstale, &lfloglow, &lfloghigh);
+
+	/*
+	 * Fill in the new leaf entry.
+	 */
+	lep->hashval = cpu_to_be32(args->hashval);
+	lep->address = cpu_to_be32(
+				xfs_dir2_db_off_to_dataptr(args->geo, use_block,
+				be16_to_cpu(*tagp)));
+	/*
+	 * Log the leaf fields and give up the buffers.
+	 */
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+	xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+	xfs_dir3_leaf_check(dp, lbp);
+	xfs_dir3_data_check(dp, dbp);
+	return 0;
+}
+
+/*
+ * Compact out any stale entries in the leaf.
+ * Log the header and changed leaf entries, if any.
+ */
+void
+xfs_dir3_leaf_compact(
+	xfs_da_args_t	*args,		/* operation arguments */
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_buf	*bp)		/* leaf buffer */
+{
+	int		from;		/* source leaf index */
+	xfs_dir2_leaf_t	*leaf;		/* leaf structure */
+	int		loglow;		/* first leaf entry to log */
+	int		to;		/* target leaf index */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_inode *dp = args->dp;
+
+	leaf = bp->b_addr;
+	if (!leafhdr->stale)
+		return;
+
+	/*
+	 * Compress out the stale entries in place.
+	 */
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			continue;
+		/*
+		 * Only actually copy the entries that are different.
+		 */
+		if (from > to) {
+			if (loglow == -1)
+				loglow = to;
+			ents[to] = ents[from];
+		}
+		to++;
+	}
+	/*
+	 * Update and log the header, log the leaf entries.
+	 */
+	ASSERT(leafhdr->stale == from - to);
+	leafhdr->count -= leafhdr->stale;
+	leafhdr->stale = 0;
+
+	dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+	xfs_dir3_leaf_log_header(args, bp);
+	if (loglow != -1)
+		xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
+}
+
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
+ */
+void
+xfs_dir3_leaf_compact_x1(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
+	int		*indexp,	/* insertion index */
+	int		*lowstalep,	/* out: stale entry before us */
+	int		*highstalep,	/* out: stale entry after us */
+	int		*lowlogp,	/* out: low log index */
+	int		*highlogp)	/* out: high log index */
+{
+	int		from;		/* source copy index */
+	int		highstale;	/* stale entry at/after index */
+	int		index;		/* insertion index */
+	int		keepstale;	/* source index of kept stale */
+	int		lowstale;	/* stale entry before index */
+	int		newindex=0;	/* new insertion index */
+	int		to;		/* destination copy index */
+
+	ASSERT(leafhdr->stale > 1);
+	index = *indexp;
+
+	xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
+
+	/*
+	 * Pick the better of lowstale and highstale.
+	 */
+	if (lowstale >= 0 &&
+	    (highstale == leafhdr->count ||
+	     index - lowstale <= highstale - index))
+		keepstale = lowstale;
+	else
+		keepstale = highstale;
+	/*
+	 * Copy the entries in place, removing all the stale entries
+	 * except keepstale.
+	 */
+	for (from = to = 0; from < leafhdr->count; from++) {
+		/*
+		 * Notice the new value of index.
+		 */
+		if (index == from)
+			newindex = to;
+		if (from != keepstale &&
+		    ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+			if (from == to)
+				*lowlogp = to;
+			continue;
+		}
+		/*
+		 * Record the new keepstale value for the insertion.
+		 */
+		if (from == keepstale)
+			lowstale = highstale = to;
+		/*
+		 * Copy only the entries that have moved.
+		 */
+		if (from > to)
+			ents[to] = ents[from];
+		to++;
+	}
+	ASSERT(from > to);
+	/*
+	 * If the insertion point was past the last entry,
+	 * set the new insertion point accordingly.
+	 */
+	if (index == from)
+		newindex = to;
+	*indexp = newindex;
+	/*
+	 * Adjust the leaf header values.
+	 */
+	leafhdr->count -= from - to;
+	leafhdr->stale = 1;
+	/*
+	 * Remember the low/high stale value only in the "right"
+	 * direction.
+	 */
+	if (lowstale >= newindex)
+		lowstale = -1;
+	else
+		highstale = leafhdr->count;
+	*highlogp = leafhdr->count - 1;
+	*lowstalep = lowstale;
+	*highstalep = highstale;
+}
+
+/*
+ * Log the bests entries indicated from a leaf1 block.
+ */
+static void
+xfs_dir3_leaf_log_bests(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,		/* leaf buffer */
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	__be16			*firstb;	/* pointer to first entry */
+	__be16			*lastb;		/* pointer to last entry */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	firstb = xfs_dir2_leaf_bests_p(ltp) + first;
+	lastb = xfs_dir2_leaf_bests_p(ltp) + last;
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)firstb - (char *)leaf),
+		(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
+}
+
+/*
+ * Log the leaf entries indicated from a leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_ents(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	int			first,
+	int			last)
+{
+	xfs_dir2_leaf_entry_t	*firstlep;	/* pointer to first entry */
+	xfs_dir2_leaf_entry_t	*lastlep;	/* pointer to last entry */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents;
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+	ents = args->dp->d_ops->leaf_ents_p(leaf);
+	firstlep = &ents[first];
+	lastlep = &ents[last];
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)firstlep - (char *)leaf),
+		(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
+}
+
+/*
+ * Log the header of the leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_header(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+	xfs_trans_log_buf(args->trans, bp,
+			  (uint)((char *)&leaf->hdr - (char *)leaf),
+			  args->dp->d_ops->leaf_hdr_size - 1);
+}
+
+/*
+ * Log the tail of the leaf1 block.
+ */
+STATIC void
+xfs_dir3_leaf_log_tail(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+		(uint)(args->geo->blksize - 1));
+}
+
+/*
+ * Look up the entry referred to by args in the leaf format directory.
+ * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
+ * is also used by the node-format code.
+ */
+int
+xfs_dir2_leaf_lookup(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* found entry index */
+	struct xfs_buf		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+
+	trace_xfs_dir2_leaf_lookup(args);
+
+	/*
+	 * Look up name in the leaf block, returning both buffers and index.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	tp = args->trans;
+	dp = args->dp;
+	xfs_dir3_leaf_check(dp, lbp);
+	leaf = lbp->b_addr;
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	/*
+	 * Get to the leaf entry and contained data entry address.
+	 */
+	lep = &ents[index];
+
+	/*
+	 * Point to the data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)dbp->b_addr +
+	       xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+	/*
+	 * Return the found inode number & CI name if appropriate
+	 */
+	args->inumber = be64_to_cpu(dep->inumber);
+	args->filetype = dp->d_ops->data_get_ftype(dep);
+	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+	xfs_trans_brelse(tp, dbp);
+	xfs_trans_brelse(tp, lbp);
+	return error;
+}
+
+/*
+ * Look up name/hash in the leaf block.
+ * Fill in indexp with the found index, and dbpp with the data buffer.
+ * If not found dbpp will be NULL, and ENOENT comes back.
+ * lbpp will always be filled in with the leaf buffer unless there's an error.
+ */
+static int					/* error */
+xfs_dir2_leaf_lookup_int(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		**lbpp,		/* out: leaf buffer */
+	int			*indexp,	/* out: index in leaf block */
+	struct xfs_buf		**dbpp)		/* out: data buffer */
+{
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	struct xfs_buf		*dbp = NULL;	/* data buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index in leaf block */
+	struct xfs_buf		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	xfs_dir2_db_t		cidb = -1;	/* case match data block no. */
+	enum xfs_dacmp		cmp;		/* name compare result */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+
+	error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+	if (error)
+		return error;
+
+	*lbpp = lbp;
+	leaf = lbp->b_addr;
+	xfs_dir3_leaf_check(dp, lbp);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	/*
+	 * Look for the first leaf entry with our hash value.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, lbp);
+	/*
+	 * Loop over all the entries with the right hash value
+	 * looking to match the name.
+	 */
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip over stale leaf entries.
+		 */
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get the new data block number.
+		 */
+		newdb = xfs_dir2_dataptr_to_db(args->geo,
+					       be32_to_cpu(lep->address));
+		/*
+		 * If it's not the same as the old data block number,
+		 * need to pitch the old one and read the new one.
+		 */
+		if (newdb != curdb) {
+			if (dbp)
+				xfs_trans_brelse(tp, dbp);
+			error = xfs_dir3_data_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, newdb),
+					   -1, &dbp);
+			if (error) {
+				xfs_trans_brelse(tp, lbp);
+				return error;
+			}
+			curdb = newdb;
+		}
+		/*
+		 * Point to the data entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(lep->address)));
+		/*
+		 * Compare name and if it's an exact match, return the index
+		 * and buffer. If it's the first case-insensitive match, store
+		 * the index and buffer and continue looking for an exact match.
+		 */
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
+			*indexp = index;
+			/* case exact match: return the current buffer. */
+			if (cmp == XFS_CMP_EXACT) {
+				*dbpp = dbp;
+				return 0;
+			}
+			cidb = curdb;
+		}
+	}
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or remove).
+	 * If a case-insensitive match was found earlier, re-read the
+	 * appropriate data block if required and return it.
+	 */
+	if (args->cmpresult == XFS_CMP_CASE) {
+		ASSERT(cidb != -1);
+		if (cidb != curdb) {
+			xfs_trans_brelse(tp, dbp);
+			error = xfs_dir3_data_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, cidb),
+					   -1, &dbp);
+			if (error) {
+				xfs_trans_brelse(tp, lbp);
+				return error;
+			}
+		}
+		*dbpp = dbp;
+		return 0;
+	}
+	/*
+	 * No match found, return -ENOENT.
+	 */
+	ASSERT(cidb == -1);
+	if (dbp)
+		xfs_trans_brelse(tp, dbp);
+	xfs_trans_brelse(tp, lbp);
+	return -ENOENT;
+}
+
+/*
+ * Remove an entry from a leaf format directory.
+ */
+int						/* error */
+xfs_dir2_leaf_removename(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	__be16			*bestsp;	/* leaf block best freespace */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_db_t		db;		/* data block number */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry structure */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_db_t		i;		/* temporary data block # */
+	int			index;		/* index into leaf entries */
+	struct xfs_buf		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_dir2_data_off_t	oldbest;	/* old value of best free */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	trace_xfs_dir2_leaf_removename(args);
+
+	/*
+	 * Lookup the leaf entry, get the leaf and data blocks read in.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	leaf = lbp->b_addr;
+	hdr = dbp->b_addr;
+	xfs_dir3_data_check(dp, dbp);
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	/*
+	 * Point to the leaf entry, use that to point to the data entry.
+	 */
+	lep = &ents[index];
+	db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+		xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+	needscan = needlog = 0;
+	oldbest = be16_to_cpu(bf[0].length);
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
+	/*
+	 * Mark the former data entry unused.
+	 */
+	xfs_dir2_data_make_free(args, dbp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * We just mark the leaf entry stale by putting a null in it.
+	 */
+	leafhdr.stale++;
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+
+	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir3_leaf_log_ents(args, lbp, index, index);
+
+	/*
+	 * Scan the freespace in the data block again if necessary,
+	 * log the data block header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	/*
+	 * If the longest freespace in the data block has changed,
+	 * put the new value in the bests table and log that.
+	 */
+	if (be16_to_cpu(bf[0].length) != oldbest) {
+		bestsp[db] = bf[0].length;
+		xfs_dir3_leaf_log_bests(args, lbp, db, db);
+	}
+	xfs_dir3_data_check(dp, dbp);
+	/*
+	 * If the data block is now empty then get rid of the data block.
+	 */
+	if (be16_to_cpu(bf[0].length) ==
+			args->geo->blksize - dp->d_ops->data_entry_offset) {
+		ASSERT(db != args->geo->datablk);
+		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+			/*
+			 * Nope, can't get rid of it because it caused
+			 * allocation of a bmap btree block to do so.
+			 * Just go on, returning success, leaving the
+			 * empty block in place.
+			 */
+			if (error == -ENOSPC && args->total == 0)
+				error = 0;
+			xfs_dir3_leaf_check(dp, lbp);
+			return error;
+		}
+		dbp = NULL;
+		/*
+		 * If this is the last data block then compact the
+		 * bests table by getting rid of entries.
+		 */
+		if (db == be32_to_cpu(ltp->bestcount) - 1) {
+			/*
+			 * Look for the last active entry (i).
+			 */
+			for (i = db - 1; i > 0; i--) {
+				if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
+					break;
+			}
+			/*
+			 * Copy the table down so inactive entries at the
+			 * end are removed.
+			 */
+			memmove(&bestsp[db - i], bestsp,
+				(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+			be32_add_cpu(&ltp->bestcount, -(db - i));
+			xfs_dir3_leaf_log_tail(args, lbp);
+			xfs_dir3_leaf_log_bests(args, lbp, 0,
+						be32_to_cpu(ltp->bestcount) - 1);
+		} else
+			bestsp[db] = cpu_to_be16(NULLDATAOFF);
+	}
+	/*
+	 * If the data block was not the first one, drop it.
+	 */
+	else if (db != args->geo->datablk)
+		dbp = NULL;
+
+	xfs_dir3_leaf_check(dp, lbp);
+	/*
+	 * See if we can convert to block form.
+	 */
+	return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int						/* error */
+xfs_dir2_leaf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index of leaf entry */
+	struct xfs_buf		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+
+	trace_xfs_dir2_leaf_replace(args);
+
+	/*
+	 * Look up the entry.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	leaf = lbp->b_addr;
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	/*
+	 * Point to the leaf entry, get data address from it.
+	 */
+	lep = &ents[index];
+	/*
+	 * Point to the data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)dbp->b_addr +
+	       xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+	ASSERT(args->inumber != be64_to_cpu(dep->inumber));
+	/*
+	 * Put the new inode number in, log it.
+	 */
+	dep->inumber = cpu_to_be64(args->inumber);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tp = args->trans;
+	xfs_dir2_data_log_entry(args, dbp, dep);
+	xfs_dir3_leaf_check(dp, lbp);
+	xfs_trans_brelse(tp, lbp);
+	return 0;
+}
+
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ */
+int						/* index value */
+xfs_dir2_leaf_search_hash(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp)		/* leaf buffer */
+{
+	xfs_dahash_t		hash=0;		/* hash from this entry */
+	xfs_dahash_t		hashwant;	/* hash value looking for */
+	int			high;		/* high leaf index */
+	int			low;		/* low leaf index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			mid=0;		/* current leaf index */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	leaf = lbp->b_addr;
+	ents = args->dp->d_ops->leaf_ents_p(leaf);
+	args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	/*
+	 * Note, the table cannot be empty, so we have to go through the loop.
+	 * Binary search the leaf entries looking for our hash value.
+	 */
+	for (lep = ents, low = 0, high = leafhdr.count - 1,
+		hashwant = args->hashval;
+	     low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
+			break;
+		if (hash < hashwant)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	/*
+	 * Found one, back up through all the equal hash values.
+	 */
+	if (hash == hashwant) {
+		while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
+			mid--;
+		}
+	}
+	/*
+	 * Need to point to an entry higher than ours.
+	 */
+	else if (hash < hashwant)
+		mid++;
+	return mid;
+}
+
+/*
+ * Trim off a trailing data block.  We know it's empty since the leaf
+ * freespace table says so.
+ */
+int						/* error */
+xfs_dir2_leaf_trim_data(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp,		/* leaf buffer */
+	xfs_dir2_db_t		db)		/* data block number */
+{
+	__be16			*bestsp;	/* leaf bests table */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Read the offending data block.  We need its buffer.
+	 */
+	error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
+				   -1, &dbp);
+	if (error)
+		return error;
+
+	leaf = lbp->b_addr;
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+#ifdef DEBUG
+{
+	struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+	struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+	ASSERT(be16_to_cpu(bf[0].length) ==
+	       args->geo->blksize - dp->d_ops->data_entry_offset);
+	ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
+}
+#endif
+
+	/*
+	 * Get rid of the data block.
+	 */
+	if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+		ASSERT(error != -ENOSPC);
+		xfs_trans_brelse(tp, dbp);
+		return error;
+	}
+	/*
+	 * Eliminate the last bests entry from the table.
+	 */
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	be32_add_cpu(&ltp->bestcount, -1);
+	memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+	xfs_dir3_leaf_log_tail(args, lbp);
+	xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+	return 0;
+}
+
+static inline size_t
+xfs_dir3_leaf_size(
+	struct xfs_dir3_icleaf_hdr	*hdr,
+	int				counts)
+{
+	int	entries;
+	int	hdrsize;
+
+	entries = hdr->count - hdr->stale;
+	if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+	    hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+		hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+	else
+		hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
+
+	return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+	               + counts * sizeof(xfs_dir2_data_off_t)
+		       + sizeof(xfs_dir2_leaf_tail_t);
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int						/* error */
+xfs_dir2_node_to_leaf(
+	xfs_da_state_t		*state)		/* directory operation state */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	struct xfs_buf		*fbp;		/* buffer for freespace block */
+	xfs_fileoff_t		fo;		/* freespace file offset */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	struct xfs_buf		*lbp;		/* buffer for leaf block */
+	xfs_dir2_leaf_tail_t	*ltp;		/* tail of leaf structure */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			rval;		/* successful free trim? */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir3_icfree_hdr freehdr;
+
+	/*
+	 * There's more than a leaf level in the btree, so there must
+	 * be multiple leafn blocks.  Give up.
+	 */
+	if (state->path.active > 1)
+		return 0;
+	args = state->args;
+
+	trace_xfs_dir2_node_to_leaf(args);
+
+	mp = state->mp;
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Get the last offset in the file.
+	 */
+	if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	fo -= args->geo->fsbcount;
+	/*
+	 * If there are freespace blocks other than the first one,
+	 * take this opportunity to remove trailing empty freespace blocks
+	 * that may have been left behind during no-space-reservation
+	 * operations.
+	 */
+	while (fo > args->geo->freeblk) {
+		if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+			return error;
+		}
+		if (rval)
+			fo -= args->geo->fsbcount;
+		else
+			return 0;
+	}
+	/*
+	 * Now find the block just before the freespace block.
+	 */
+	if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	/*
+	 * If it's not the single leaf block, give up.
+	 */
+	if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
+		return 0;
+	lbp = state->path.blk[0].bp;
+	leaf = lbp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
+	/*
+	 * Read the freespace block.
+	 */
+	error = xfs_dir2_free_read(tp, dp,  args->geo->freeblk, &fbp);
+	if (error)
+		return error;
+	free = fbp->b_addr;
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+	ASSERT(!freehdr.firstdb);
+
+	/*
+	 * Now see if the leafn and free data will fit in a leaf1.
+	 * If not, release the buffer and give up.
+	 */
+	if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
+		xfs_trans_brelse(tp, fbp);
+		return 0;
+	}
+
+	/*
+	 * If the leaf has any stale entries in it, compress them out.
+	 */
+	if (leafhdr.stale)
+		xfs_dir3_leaf_compact(args, &leafhdr, lbp);
+
+	lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+	leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+					? XFS_DIR2_LEAF1_MAGIC
+					: XFS_DIR3_LEAF1_MAGIC;
+
+	/*
+	 * Set up the leaf tail from the freespace block.
+	 */
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
+	/*
+	 * Set up the leaf bests table.
+	 */
+	memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
+		freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+	xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+	xfs_dir3_leaf_log_tail(args, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
+
+	/*
+	 * Get rid of the freespace block.
+	 */
+	error = xfs_dir2_shrink_inode(args,
+			xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+			fbp);
+	if (error) {
+		/*
+		 * This can't fail here because it can only happen when
+		 * punching out the middle of an extent, and this is an
+		 * isolated block.
+		 */
+		ASSERT(error != -ENOSPC);
+		return error;
+	}
+	fbp = NULL;
+	/*
+	 * Now see if we can convert the single-leaf directory
+	 * down to a block form directory.
+	 * This routine always kills the dabuf for the leaf, so
+	 * eliminate it from the path.
+	 */
+	error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+	state->path.blk[0].bp = NULL;
+	return error;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_node.c b/kernel/fs/xfs/libxfs/xfs_dir2_node.c
new file mode 100644
index 000000000..41b80d3d3
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_node.c
@@ -0,0 +1,2270 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Function declarations.
+ */
+static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
+			      int index);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+				     xfs_da_state_blk_t *blk1,
+				     xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
+				 int index, xfs_da_state_blk_t *dblk,
+				 int *rval);
+static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
+				     xfs_da_state_blk_t *fblk);
+
+/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+#define	xfs_dir3_leaf_check(dp, bp) \
+do { \
+	if (!xfs_dir3_leafn_check((dp), (bp))) \
+		ASSERT(0); \
+} while (0);
+
+static bool
+xfs_dir3_leafn_check(
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+		return false;
+
+	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define	xfs_dir3_leaf_check(dp, bp)
+#endif
+
+static bool
+xfs_dir3_free_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
+			return false;
+	}
+
+	/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+	return true;
+}
+
+static void
+xfs_dir3_free_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_dir3_free_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_free_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_free_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+	.verify_read = xfs_dir3_free_read_verify,
+	.verify_write = xfs_dir3_free_write_verify,
+};
+
+
+static int
+__xfs_dir3_free_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+
+	/* try read returns without an error or *bpp if it lands in a hole */
+	if (!err && tp && *bpp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+	return err;
+}
+
+int
+xfs_dir2_free_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+}
+
+static int
+xfs_dir3_free_get_buf(
+	xfs_da_args_t		*args,
+	xfs_dir2_db_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+	struct xfs_dir3_icfree_hdr hdr;
+
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
+				   -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+	bp->b_ops = &xfs_dir3_free_buf_ops;
+
+	/*
+	 * Initialize the new block to be empty, and remember
+	 * its first slot as our empty slot.
+	 */
+	memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+	memset(&hdr, 0, sizeof(hdr));
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+		hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+		hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+		hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+	} else
+		hdr.magic = XFS_DIR2_FREE_MAGIC;
+	dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Log entries from a freespace block.
+ */
+STATIC void
+xfs_dir2_free_log_bests(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	__be16			*bests;
+
+	free = bp->b_addr;
+	bests = args->dp->d_ops->free_bests_p(free);
+	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)&bests[first] - (char *)free),
+		(uint)((char *)&bests[last] - (char *)free +
+		       sizeof(bests[0]) - 1));
+}
+
+/*
+ * Log header from a freespace block.
+ */
+static void
+xfs_dir2_free_log_header(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+#ifdef DEBUG
+	xfs_dir2_free_t		*free;		/* freespace structure */
+
+	free = bp->b_addr;
+	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
+	xfs_trans_log_buf(args->trans, bp, 0,
+			  args->dp->d_ops->free_hdr_size - 1);
+}
+
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ */
+int						/* error */
+xfs_dir2_leaf_to_node(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp)		/* leaf buffer */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	struct xfs_buf		*fbp;		/* freespace buffer */
+	xfs_dir2_db_t		fdb;		/* freespace block number */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	__be16			*from;		/* pointer to freespace entry */
+	int			i;		/* leaf freespace index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	int			n;		/* count of live freespc ents */
+	xfs_dir2_data_off_t	off;		/* freespace entry value */
+	__be16			*to;		/* pointer to freespace entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icfree_hdr freehdr;
+
+	trace_xfs_dir2_leaf_to_node(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Add a freespace block to the directory.
+	 */
+	if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+		return error;
+	}
+	ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+	/*
+	 * Get the buffer for the new freespace block.
+	 */
+	error = xfs_dir3_free_get_buf(args, fdb, &fbp);
+	if (error)
+		return error;
+
+	free = fbp->b_addr;
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
+	leaf = lbp->b_addr;
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ASSERT(be32_to_cpu(ltp->bestcount) <=
+				(uint)dp->i_d.di_size / args->geo->blksize);
+
+	/*
+	 * Copy freespace entries from the leaf block to the new block.
+	 * Count active entries.
+	 */
+	from = xfs_dir2_leaf_bests_p(ltp);
+	to = dp->d_ops->free_bests_p(free);
+	for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+		if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
+			n++;
+		*to = cpu_to_be16(off);
+	}
+
+	/*
+	 * Now initialize the freespace block header.
+	 */
+	freehdr.nused = n;
+	freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+	dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+	xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+	xfs_dir2_free_log_header(args, fbp);
+
+	/*
+	 * Converting the leaf to a leafnode is just a matter of changing the
+	 * magic number and the ops. Do the change directly to the buffer as
+	 * it's less work (and less code) than decoding the header to host
+	 * format and back again.
+	 */
+	if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+	else
+		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+	lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+	xfs_dir3_leaf_log_header(args, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
+	return 0;
+}
+
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ */
+static int					/* error */
+xfs_dir2_leafn_add(
+	struct xfs_buf		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			index)		/* insertion pt for new entry */
+{
+	int			compact;	/* compacting stale leaves */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			highstale;	/* next stale entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			lfloghigh;	/* high leaf entry logging */
+	int			lfloglow;	/* low leaf entry logging */
+	int			lowstale;	/* previous stale entry */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
+
+	trace_xfs_dir2_leafn_add(args, index);
+
+	dp = args->dp;
+	leaf = bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	/*
+	 * Quick check just to make sure we are not going to index
+	 * into other peoples memory
+	 */
+	if (index < 0)
+		return -EFSCORRUPTED;
+
+	/*
+	 * If there are already the maximum number of leaf entries in
+	 * the block, if there are no stale entries it won't fit.
+	 * Caller will do a split.  If there are stale entries we'll do
+	 * a compact.
+	 */
+
+	if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
+		if (!leafhdr.stale)
+			return -ENOSPC;
+		compact = leafhdr.stale > 1;
+	} else
+		compact = 0;
+	ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+	ASSERT(index == leafhdr.count ||
+	       be32_to_cpu(ents[index].hashval) >= args->hashval);
+
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+		return 0;
+
+	/*
+	 * Compact out all but one stale leaf entry.  Leaves behind
+	 * the entry closest to index.
+	 */
+	if (compact)
+		xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+					 &highstale, &lfloglow, &lfloghigh);
+	else if (leafhdr.stale) {
+		/*
+		 * Set impossible logging indices for this case.
+		 */
+		lfloglow = leafhdr.count;
+		lfloghigh = -1;
+	}
+
+	/*
+	 * Insert the new entry, log everything.
+	 */
+	lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+				       highstale, &lfloglow, &lfloghigh);
+
+	lep->hashval = cpu_to_be32(args->hashval);
+	lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
+				args->blkno, args->index));
+
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, bp);
+	xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+	xfs_dir3_leaf_check(dp, bp);
+	return 0;
+}
+
+#ifdef DEBUG
+static void
+xfs_dir2_free_hdr_check(
+	struct xfs_inode *dp,
+	struct xfs_buf	*bp,
+	xfs_dir2_db_t	db)
+{
+	struct xfs_dir3_icfree_hdr hdr;
+
+	dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
+
+	ASSERT((hdr.firstdb %
+		dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
+	ASSERT(hdr.firstdb <= db);
+	ASSERT(db < hdr.firstdb + hdr.nvalid);
+}
+#else
+#define xfs_dir2_free_hdr_check(dp, bp, db)
+#endif	/* DEBUG */
+
+/*
+ * Return the last hash value in the leaf.
+ * Stale entries are ok.
+ */
+xfs_dahash_t					/* hash value */
+xfs_dir2_leafn_lasthash(
+	struct xfs_inode *dp,
+	struct xfs_buf	*bp,			/* leaf buffer */
+	int		*count)			/* count of entries in leaf */
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
+	if (count)
+		*count = leafhdr.count;
+	if (!leafhdr.count)
+		return 0;
+
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	return be32_to_cpu(ents[leafhdr.count - 1].hashval);
+}
+
+/*
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
+	struct xfs_buf		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	struct xfs_buf		*curbp = NULL;	/* current data/free buffer */
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	xfs_dir2_db_t		curfdb = -1;	/* current free block number */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			fi;		/* free entry index */
+	xfs_dir2_free_t		*free = NULL;	/* free block structure */
+	int			index;		/* leaf entry index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			length;		/* length of new data entry */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_dir2_db_t		newfdb;		/* new free block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	xfs_dir3_leaf_check(dp, bp);
+	ASSERT(leafhdr.count > 0);
+
+	/*
+	 * Look up the hash value in the leaf entries.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, bp);
+	/*
+	 * Do we have a buffer coming in?
+	 */
+	if (state->extravalid) {
+		/* If so, it's a free block buffer, get the block number. */
+		curbp = state->extrablk.bp;
+		curfdb = state->extrablk.blkno;
+		free = curbp->b_addr;
+		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+		       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+	}
+	length = dp->d_ops->data_entsize(args->namelen);
+	/*
+	 * Loop over leaf entries with the right hash value.
+	 */
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip stale leaf entries.
+		 */
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Pull the data block number from the entry.
+		 */
+		newdb = xfs_dir2_dataptr_to_db(args->geo,
+					       be32_to_cpu(lep->address));
+		/*
+		 * For addname, we're looking for a place to put the new entry.
+		 * We want to use a data block with an entry of equal
+		 * hash value to ours if there is one with room.
+		 *
+		 * If this block isn't the data block we already have
+		 * in hand, take a look at it.
+		 */
+		if (newdb != curdb) {
+			__be16 *bests;
+
+			curdb = newdb;
+			/*
+			 * Convert the data block to the free block
+			 * holding its freespace information.
+			 */
+			newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
+			/*
+			 * If it's not the one we have in hand, read it in.
+			 */
+			if (newfdb != curfdb) {
+				/*
+				 * If we had one before, drop it.
+				 */
+				if (curbp)
+					xfs_trans_brelse(tp, curbp);
+
+				error = xfs_dir2_free_read(tp, dp,
+						xfs_dir2_db_to_da(args->geo,
+								  newfdb),
+						&curbp);
+				if (error)
+					return error;
+				free = curbp->b_addr;
+
+				xfs_dir2_free_hdr_check(dp, curbp, curdb);
+			}
+			/*
+			 * Get the index for our entry.
+			 */
+			fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
+			/*
+			 * If it has room, return it.
+			 */
+			bests = dp->d_ops->free_bests_p(free);
+			if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
+				XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
+							XFS_ERRLEVEL_LOW, mp);
+				if (curfdb != newfdb)
+					xfs_trans_brelse(tp, curbp);
+				return -EFSCORRUPTED;
+			}
+			curfdb = newfdb;
+			if (be16_to_cpu(bests[fi]) >= length)
+				goto out;
+		}
+	}
+	/* Didn't find any space */
+	fi = -1;
+out:
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	if (curbp) {
+		/* Giving back a free block. */
+		state->extravalid = 1;
+		state->extrablk.bp = curbp;
+		state->extrablk.index = fi;
+		state->extrablk.blkno = curfdb;
+
+		/*
+		 * Important: this magic number is not in the buffer - it's for
+		 * buffer type information and therefore only the free/data type
+		 * matters here, not whether CRCs are enabled or not.
+		 */
+		state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+	} else {
+		state->extravalid = 0;
+	}
+	/*
+	 * Return the index, that will be the insertion point.
+	 */
+	*indexp = index;
+	return -ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+	struct xfs_buf		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	struct xfs_buf		*curbp = NULL;	/* current data/free buffer */
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			index;		/* leaf entry index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	enum xfs_dacmp		cmp;		/* comparison result */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	xfs_dir3_leaf_check(dp, bp);
+	ASSERT(leafhdr.count > 0);
+
+	/*
+	 * Look up the hash value in the leaf entries.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, bp);
+	/*
+	 * Do we have a buffer coming in?
+	 */
+	if (state->extravalid) {
+		curbp = state->extrablk.bp;
+		curdb = state->extrablk.blkno;
+	}
+	/*
+	 * Loop over leaf entries with the right hash value.
+	 */
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip stale leaf entries.
+		 */
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Pull the data block number from the entry.
+		 */
+		newdb = xfs_dir2_dataptr_to_db(args->geo,
+					       be32_to_cpu(lep->address));
+		/*
+		 * Not adding a new entry, so we really want to find
+		 * the name given to us.
+		 *
+		 * If it's a different data block, go get it.
+		 */
+		if (newdb != curdb) {
+			/*
+			 * If we had a block before that we aren't saving
+			 * for a CI name, drop it
+			 */
+			if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+						curdb != state->extrablk.blkno))
+				xfs_trans_brelse(tp, curbp);
+			/*
+			 * If needing the block that is saved with a CI match,
+			 * use it otherwise read in the new data block.
+			 */
+			if (args->cmpresult != XFS_CMP_DIFFERENT &&
+					newdb == state->extrablk.blkno) {
+				ASSERT(state->extravalid);
+				curbp = state->extrablk.bp;
+			} else {
+				error = xfs_dir3_data_read(tp, dp,
+						xfs_dir2_db_to_da(args->geo,
+								  newdb),
+						-1, &curbp);
+				if (error)
+					return error;
+			}
+			xfs_dir3_data_check(dp, curbp);
+			curdb = newdb;
+		}
+		/*
+		 * Point to the data entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(lep->address)));
+		/*
+		 * Compare the entry and if it's an exact match, return
+		 * EEXIST immediately. If it's the first case-insensitive
+		 * match, store the block & inode number and continue looking.
+		 */
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			/* If there is a CI match block, drop it */
+			if (args->cmpresult != XFS_CMP_DIFFERENT &&
+						curdb != state->extrablk.blkno)
+				xfs_trans_brelse(tp, state->extrablk.bp);
+			args->cmpresult = cmp;
+			args->inumber = be64_to_cpu(dep->inumber);
+			args->filetype = dp->d_ops->data_get_ftype(dep);
+			*indexp = index;
+			state->extravalid = 1;
+			state->extrablk.bp = curbp;
+			state->extrablk.blkno = curdb;
+			state->extrablk.index = (int)((char *)dep -
+							(char *)curbp->b_addr);
+			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			curbp->b_ops = &xfs_dir3_data_buf_ops;
+			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+			if (cmp == XFS_CMP_EXACT)
+				return -EEXIST;
+		}
+	}
+	ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
+	if (curbp) {
+		if (args->cmpresult == XFS_CMP_DIFFERENT) {
+			/* Giving back last used data block. */
+			state->extravalid = 1;
+			state->extrablk.bp = curbp;
+			state->extrablk.index = -1;
+			state->extrablk.blkno = curdb;
+			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			curbp->b_ops = &xfs_dir3_data_buf_ops;
+			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+		} else {
+			/* If the curbp is not the CI match block, drop it */
+			if (state->extrablk.bp != curbp)
+				xfs_trans_brelse(tp, curbp);
+		}
+	} else {
+		state->extravalid = 0;
+	}
+	*indexp = index;
+	return -ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+	struct xfs_buf		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	if (args->op_flags & XFS_DA_OP_ADDNAME)
+		return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+							state);
+	return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log entries and headers.  Stale entries are preserved.
+ */
+static void
+xfs_dir3_leafn_moveents(
+	xfs_da_args_t			*args,	/* operation arguments */
+	struct xfs_buf			*bp_s,	/* source */
+	struct xfs_dir3_icleaf_hdr	*shdr,
+	struct xfs_dir2_leaf_entry	*sents,
+	int				start_s,/* source leaf index */
+	struct xfs_buf			*bp_d,	/* destination */
+	struct xfs_dir3_icleaf_hdr	*dhdr,
+	struct xfs_dir2_leaf_entry	*dents,
+	int				start_d,/* destination leaf index */
+	int				count)	/* count of leaves to copy */
+{
+	int				stale;	/* count stale leaves copied */
+
+	trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
+
+	/*
+	 * Silently return if nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * If the destination index is not the end of the current
+	 * destination leaf entries, open up a hole in the destination
+	 * to hold the new entries.
+	 */
+	if (start_d < dhdr->count) {
+		memmove(&dents[start_d + count], &dents[start_d],
+			(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
+				       count + dhdr->count - 1);
+	}
+	/*
+	 * If the source has stale leaves, count the ones in the copy range
+	 * so we can update the header correctly.
+	 */
+	if (shdr->stale) {
+		int	i;			/* temp leaf index */
+
+		for (i = start_s, stale = 0; i < start_s + count; i++) {
+			if (sents[i].address ==
+					cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+				stale++;
+		}
+	} else
+		stale = 0;
+	/*
+	 * Copy the leaf entries from source to destination.
+	 */
+	memcpy(&dents[start_d], &sents[start_s],
+		count * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
+
+	/*
+	 * If there are source entries after the ones we copied,
+	 * delete the ones we copied by sliding the next ones down.
+	 */
+	if (start_s + count < shdr->count) {
+		memmove(&sents[start_s], &sents[start_s + count],
+			count * sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
+	}
+
+	/*
+	 * Update the headers and log them.
+	 */
+	shdr->count -= count;
+	shdr->stale -= stale;
+	dhdr->count += count;
+	dhdr->stale += stale;
+}
+
+/*
+ * Determine the sort order of two leaf blocks.
+ * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
+ */
+int						/* sort order */
+xfs_dir2_leafn_order(
+	struct xfs_inode	*dp,
+	struct xfs_buf		*leaf1_bp,		/* leaf1 buffer */
+	struct xfs_buf		*leaf2_bp)		/* leaf2 buffer */
+{
+	struct xfs_dir2_leaf	*leaf1 = leaf1_bp->b_addr;
+	struct xfs_dir2_leaf	*leaf2 = leaf2_bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents1;
+	struct xfs_dir2_leaf_entry *ents2;
+	struct xfs_dir3_icleaf_hdr hdr1;
+	struct xfs_dir3_icleaf_hdr hdr2;
+
+	dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+	dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+	ents1 = dp->d_ops->leaf_ents_p(leaf1);
+	ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+	if (hdr1.count > 0 && hdr2.count > 0 &&
+	    (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+	     be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+				be32_to_cpu(ents1[hdr1.count - 1].hashval)))
+		return 1;
+	return 0;
+}
+
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+	xfs_da_state_t		*state,		/* btree cursor */
+	xfs_da_state_blk_t	*blk1,		/* first btree block */
+	xfs_da_state_blk_t	*blk2)		/* second btree block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	int			count;		/* count (& direction) leaves */
+	int			isleft;		/* new goes in left leaf */
+	xfs_dir2_leaf_t		*leaf1;		/* first leaf structure */
+	xfs_dir2_leaf_t		*leaf2;		/* second leaf structure */
+	int			mid;		/* midpoint leaf index */
+#if defined(DEBUG) || defined(XFS_WARN)
+	int			oldstale;	/* old count of stale leaves */
+#endif
+	int			oldsum;		/* old total leaf count */
+	int			swap;		/* swapped leaf blocks */
+	struct xfs_dir2_leaf_entry *ents1;
+	struct xfs_dir2_leaf_entry *ents2;
+	struct xfs_dir3_icleaf_hdr hdr1;
+	struct xfs_dir3_icleaf_hdr hdr2;
+	struct xfs_inode	*dp = state->args->dp;
+
+	args = state->args;
+	/*
+	 * If the block order is wrong, swap the arguments.
+	 */
+	if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
+		xfs_da_state_blk_t	*tmp;	/* temp for block swap */
+
+		tmp = blk1;
+		blk1 = blk2;
+		blk2 = tmp;
+	}
+	leaf1 = blk1->bp->b_addr;
+	leaf2 = blk2->bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+	dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+	ents1 = dp->d_ops->leaf_ents_p(leaf1);
+	ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+	oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+	oldstale = hdr1.stale + hdr2.stale;
+#endif
+	mid = oldsum >> 1;
+
+	/*
+	 * If the old leaf count was odd then the new one will be even,
+	 * so we need to divide the new count evenly.
+	 */
+	if (oldsum & 1) {
+		xfs_dahash_t	midhash;	/* middle entry hash value */
+
+		if (mid >= hdr1.count)
+			midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
+		else
+			midhash = be32_to_cpu(ents1[mid].hashval);
+		isleft = args->hashval <= midhash;
+	}
+	/*
+	 * If the old count is even then the new count is odd, so there's
+	 * no preferred side for the new entry.
+	 * Pick the left one.
+	 */
+	else
+		isleft = 1;
+	/*
+	 * Calculate moved entry count.  Positive means left-to-right,
+	 * negative means right-to-left.  Then move the entries.
+	 */
+	count = hdr1.count - mid + (isleft == 0);
+	if (count > 0)
+		xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+					hdr1.count - count, blk2->bp,
+					&hdr2, ents2, 0, count);
+	else if (count < 0)
+		xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+					blk1->bp, &hdr1, ents1,
+					hdr1.count, count);
+
+	ASSERT(hdr1.count + hdr2.count == oldsum);
+	ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+	/* log the changes made when moving the entries */
+	dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
+	dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+	xfs_dir3_leaf_log_header(args, blk1->bp);
+	xfs_dir3_leaf_log_header(args, blk2->bp);
+
+	xfs_dir3_leaf_check(dp, blk1->bp);
+	xfs_dir3_leaf_check(dp, blk2->bp);
+
+	/*
+	 * Mark whether we're inserting into the old or new leaf.
+	 */
+	if (hdr1.count < hdr2.count)
+		state->inleaf = swap;
+	else if (hdr1.count > hdr2.count)
+		state->inleaf = !swap;
+	else
+		state->inleaf = swap ^ (blk1->index <= hdr1.count);
+	/*
+	 * Adjust the expected index for insertion.
+	 */
+	if (!state->inleaf)
+		blk2->index = blk1->index - hdr1.count;
+
+	/*
+	 * Finally sanity check just to make sure we are not returning a
+	 * negative index
+	 */
+	if (blk2->index < 0) {
+		state->inleaf = 1;
+		blk2->index = 0;
+		xfs_alert(dp->i_mount,
+	"%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
+			__func__, blk1->index);
+	}
+}
+
+static int
+xfs_dir3_data_block_free(
+	xfs_da_args_t		*args,
+	struct xfs_dir2_data_hdr *hdr,
+	struct xfs_dir2_free	*free,
+	xfs_dir2_db_t		fdb,
+	int			findex,
+	struct xfs_buf		*fbp,
+	int			longest)
+{
+	int			logfree = 0;
+	__be16			*bests;
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_inode	*dp = args->dp;
+
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
+	bests = dp->d_ops->free_bests_p(free);
+	if (hdr) {
+		/*
+		 * Data block is not empty, just set the free entry to the new
+		 * value.
+		 */
+		bests[findex] = cpu_to_be16(longest);
+		xfs_dir2_free_log_bests(args, fbp, findex, findex);
+		return 0;
+	}
+
+	/* One less used entry in the free table. */
+	freehdr.nused--;
+
+	/*
+	 * If this was the last entry in the table, we can trim the table size
+	 * back.  There might be other entries at the end referring to
+	 * non-existent data blocks, get those too.
+	 */
+	if (findex == freehdr.nvalid - 1) {
+		int	i;		/* free entry index */
+
+		for (i = findex - 1; i >= 0; i--) {
+			if (bests[i] != cpu_to_be16(NULLDATAOFF))
+				break;
+		}
+		freehdr.nvalid = i + 1;
+		logfree = 0;
+	} else {
+		/* Not the last entry, just punch it out.  */
+		bests[findex] = cpu_to_be16(NULLDATAOFF);
+		logfree = 1;
+	}
+
+	dp->d_ops->free_hdr_to_disk(free, &freehdr);
+	xfs_dir2_free_log_header(args, fbp);
+
+	/*
+	 * If there are no useful entries left in the block, get rid of the
+	 * block if we can.
+	 */
+	if (!freehdr.nused) {
+		int error;
+
+		error = xfs_dir2_shrink_inode(args, fdb, fbp);
+		if (error == 0) {
+			fbp = NULL;
+			logfree = 0;
+		} else if (error != -ENOSPC || args->total != 0)
+			return error;
+		/*
+		 * It's possible to get ENOSPC if there is no
+		 * space reservation.  In this case some one
+		 * else will eventually get rid of this block.
+		 */
+	}
+
+	/* Log the free entry that changed, unless we got rid of it.  */
+	if (logfree)
+		xfs_dir2_free_log_bests(args, fbp, findex, findex);
+	return 0;
+}
+
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ */
+static int					/* error */
+xfs_dir2_leafn_remove(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*bp,		/* leaf buffer */
+	int			index,		/* leaf entry index */
+	xfs_da_state_blk_t	*dblk,		/* data block */
+	int			*rval)		/* resulting block needs join */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_db_t		db;		/* data block number */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			longest;	/* longest data free entry */
+	int			off;		/* data block entry offset */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
+
+	trace_xfs_dir2_leafn_remove(args, index);
+
+	dp = args->dp;
+	tp = args->trans;
+	leaf = bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	/*
+	 * Point to the entry we're removing.
+	 */
+	lep = &ents[index];
+
+	/*
+	 * Extract the data block and offset from the entry.
+	 */
+	db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+	ASSERT(dblk->blkno == db);
+	off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
+	ASSERT(dblk->index == off);
+
+	/*
+	 * Kill the leaf entry by marking it stale.
+	 * Log the leaf block changes.
+	 */
+	leafhdr.stale++;
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, bp);
+
+	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir3_leaf_log_ents(args, bp, index, index);
+
+	/*
+	 * Make the data entry free.  Keep track of the longest freespace
+	 * in the data block in case it changes.
+	 */
+	dbp = dblk->bp;
+	hdr = dbp->b_addr;
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	longest = be16_to_cpu(bf[0].length);
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(args, dbp, off,
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * Rescan the data block freespaces for bestfree.
+	 * Log the data block header if needed.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	xfs_dir3_data_check(dp, dbp);
+	/*
+	 * If the longest data block freespace changes, need to update
+	 * the corresponding freeblock entry.
+	 */
+	if (longest < be16_to_cpu(bf[0].length)) {
+		int		error;		/* error return value */
+		struct xfs_buf	*fbp;		/* freeblock buffer */
+		xfs_dir2_db_t	fdb;		/* freeblock block number */
+		int		findex;		/* index in freeblock entries */
+		xfs_dir2_free_t	*free;		/* freeblock structure */
+
+		/*
+		 * Convert the data block number to a free block,
+		 * read in the free block.
+		 */
+		fdb = dp->d_ops->db_to_fdb(args->geo, db);
+		error = xfs_dir2_free_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, fdb),
+					   &fbp);
+		if (error)
+			return error;
+		free = fbp->b_addr;
+#ifdef DEBUG
+	{
+		struct xfs_dir3_icfree_hdr freehdr;
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+		ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
+			(fdb - xfs_dir2_byte_to_db(args->geo,
+						   XFS_DIR2_FREE_OFFSET)));
+	}
+#endif
+		/*
+		 * Calculate which entry we need to fix.
+		 */
+		findex = dp->d_ops->db_to_fdindex(args->geo, db);
+		longest = be16_to_cpu(bf[0].length);
+		/*
+		 * If the data block is now empty we can get rid of it
+		 * (usually).
+		 */
+		if (longest == args->geo->blksize -
+			       dp->d_ops->data_entry_offset) {
+			/*
+			 * Try to punch out the data block.
+			 */
+			error = xfs_dir2_shrink_inode(args, db, dbp);
+			if (error == 0) {
+				dblk->bp = NULL;
+				hdr = NULL;
+			}
+			/*
+			 * We can get ENOSPC if there's no space reservation.
+			 * In this case just drop the buffer and some one else
+			 * will eventually get rid of the empty block.
+			 */
+			else if (!(error == -ENOSPC && args->total == 0))
+				return error;
+		}
+		/*
+		 * If we got rid of the data block, we can eliminate that entry
+		 * in the free block.
+		 */
+		error = xfs_dir3_data_block_free(args, hdr, free,
+						 fdb, findex, fbp, longest);
+		if (error)
+			return error;
+	}
+
+	xfs_dir3_leaf_check(dp, bp);
+	/*
+	 * Return indication of whether this leaf block is empty enough
+	 * to justify trying to join it with a neighbor.
+	 */
+	*rval = (dp->d_ops->leaf_hdr_size +
+		 (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
+		args->geo->magicpct;
+	return 0;
+}
+
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ */
+int						/* error */
+xfs_dir2_leafn_split(
+	xfs_da_state_t		*state,		/* btree cursor */
+	xfs_da_state_blk_t	*oldblk,	/* original block */
+	xfs_da_state_blk_t	*newblk)	/* newly created block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_dablk_t		blkno;		/* new leaf block number */
+	int			error;		/* error return value */
+	struct xfs_inode	*dp;
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	args = state->args;
+	dp = args->dp;
+	ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Initialize the new leaf block.
+	 */
+	error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
+				      &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+	if (error)
+		return error;
+
+	newblk->blkno = blkno;
+	newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+	/*
+	 * Rebalance the entries across the two leaves, link the new
+	 * block into the leaves.
+	 */
+	xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+	error = xfs_da3_blk_link(state, oldblk, newblk);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Insert the new entry in the correct block.
+	 */
+	if (state->inleaf)
+		error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+	else
+		error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
+	newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+	xfs_dir3_leaf_check(dp, oldblk->bp);
+	xfs_dir3_leaf_check(dp, newblk->bp);
+	return error;
+}
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int						/* error */
+xfs_dir2_leafn_toosmall(
+	xfs_da_state_t		*state,		/* btree cursor */
+	int			*action)	/* resulting action to take */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	xfs_dablk_t		blkno;		/* leaf block number */
+	struct xfs_buf		*bp;		/* leaf buffer */
+	int			bytes;		/* bytes in use */
+	int			count;		/* leaf live entry count */
+	int			error;		/* error return value */
+	int			forward;	/* sibling block direction */
+	int			i;		/* sibling counter */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			rval;		/* result from path_shift */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_inode	*dp = state->args->dp;
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[state->path.active - 1];
+	leaf = blk->bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	xfs_dir3_leaf_check(dp, blk->bp);
+
+	count = leafhdr.count - leafhdr.stale;
+	bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
+	if (bytes > (state->args->geo->blksize >> 1)) {
+		/*
+		 * Blk over 50%, don't try to join.
+		 */
+		*action = 0;
+		return 0;
+	}
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (leafhdr.forw != 0);
+		memcpy(&state->altpath, &state->path, sizeof(state->path));
+		error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+			&rval);
+		if (error)
+			return error;
+		*action = rval ? 2 : 0;
+		return 0;
+	}
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	forward = leafhdr.forw < leafhdr.back;
+	for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
+		struct xfs_dir3_icleaf_hdr hdr2;
+
+		blkno = forward ? leafhdr.forw : leafhdr.back;
+		if (blkno == 0)
+			continue;
+		/*
+		 * Read the sibling leaf block.
+		 */
+		error = xfs_dir3_leafn_read(state->args->trans, dp,
+					    blkno, -1, &bp);
+		if (error)
+			return error;
+
+		/*
+		 * Count bytes in the two blocks combined.
+		 */
+		count = leafhdr.count - leafhdr.stale;
+		bytes = state->args->geo->blksize -
+			(state->args->geo->blksize >> 2);
+
+		leaf = bp->b_addr;
+		dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
+		ents = dp->d_ops->leaf_ents_p(leaf);
+		count += hdr2.count - hdr2.stale;
+		bytes -= count * sizeof(ents[0]);
+
+		/*
+		 * Fits with at least 25% to spare.
+		 */
+		if (bytes >= 0)
+			break;
+		xfs_trans_brelse(state->args->trans, bp);
+	}
+	/*
+	 * Didn't like either block, give up.
+	 */
+	if (i >= 2) {
+		*action = 0;
+		return 0;
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	memcpy(&state->altpath, &state->path, sizeof(state->path));
+	if (blkno < blk->blkno)
+		error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+			&rval);
+	else
+		error = xfs_da3_path_shift(state, &state->path, forward, 0,
+			&rval);
+	if (error) {
+		return error;
+	}
+	*action = rval ? 0 : 1;
+	return 0;
+}
+
+/*
+ * Move all the leaf entries from drop_blk to save_blk.
+ * This is done as part of a join operation.
+ */
+void
+xfs_dir2_leafn_unbalance(
+	xfs_da_state_t		*state,		/* cursor */
+	xfs_da_state_blk_t	*drop_blk,	/* dead block */
+	xfs_da_state_blk_t	*save_blk)	/* surviving block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_dir2_leaf_t		*drop_leaf;	/* dead leaf structure */
+	xfs_dir2_leaf_t		*save_leaf;	/* surviving leaf structure */
+	struct xfs_dir3_icleaf_hdr savehdr;
+	struct xfs_dir3_icleaf_hdr drophdr;
+	struct xfs_dir2_leaf_entry *sents;
+	struct xfs_dir2_leaf_entry *dents;
+	struct xfs_inode	*dp = state->args->dp;
+
+	args = state->args;
+	ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	drop_leaf = drop_blk->bp->b_addr;
+	save_leaf = save_blk->bp->b_addr;
+
+	dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
+	dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
+	sents = dp->d_ops->leaf_ents_p(save_leaf);
+	dents = dp->d_ops->leaf_ents_p(drop_leaf);
+
+	/*
+	 * If there are any stale leaf entries, take this opportunity
+	 * to purge them.
+	 */
+	if (drophdr.stale)
+		xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+	if (savehdr.stale)
+		xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
+	/*
+	 * Move the entries from drop to the appropriate end of save.
+	 */
+	drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
+	if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
+		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+					save_blk->bp, &savehdr, sents, 0,
+					drophdr.count);
+	else
+		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+					save_blk->bp, &savehdr, sents,
+					savehdr.count, drophdr.count);
+	save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+	/* log the changes made when moving the entries */
+	dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
+	dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
+	xfs_dir3_leaf_log_header(args, save_blk->bp);
+	xfs_dir3_leaf_log_header(args, drop_blk->bp);
+
+	xfs_dir3_leaf_check(dp, save_blk->bp);
+	xfs_dir3_leaf_check(dp, drop_blk->bp);
+}
+
+/*
+ * Top-level node form directory addname routine.
+ */
+int						/* error */
+xfs_dir2_node_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block for insert */
+	int			error;		/* error return value */
+	int			rval;		/* sub-return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	trace_xfs_dir2_node_addname(args);
+
+	/*
+	 * Allocate and initialize the state (btree cursor).
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	/*
+	 * Look up the name.  We're not supposed to find it, but
+	 * this gives us the insertion point.
+	 */
+	error = xfs_da3_node_lookup_int(state, &rval);
+	if (error)
+		rval = error;
+	if (rval != -ENOENT) {
+		goto done;
+	}
+	/*
+	 * Add the data entry to a data block.
+	 * Extravalid is set to a freeblock found by lookup.
+	 */
+	rval = xfs_dir2_node_addname_int(args,
+		state->extravalid ? &state->extrablk : NULL);
+	if (rval) {
+		goto done;
+	}
+	blk = &state->path.blk[state->path.active - 1];
+	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Add the new leaf entry.
+	 */
+	rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+	if (rval == 0) {
+		/*
+		 * It worked, fix the hash values up the btree.
+		 */
+		if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
+			xfs_da3_fixhashpath(state, &state->path);
+	} else {
+		/*
+		 * It didn't work, we need to split the leaf block.
+		 */
+		if (args->total == 0) {
+			ASSERT(rval == -ENOSPC);
+			goto done;
+		}
+		/*
+		 * Split the leaf block and insert the new entry.
+		 */
+		rval = xfs_da3_split(state);
+	}
+done:
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int					/* error */
+xfs_dir2_node_addname_int(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_da_state_blk_t	*fblk)		/* optional freespace block */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_db_t		dbno;		/* data block number */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* data unused entry pointer */
+	int			error;		/* error return value */
+	xfs_dir2_db_t		fbno;		/* freespace block number */
+	struct xfs_buf		*fbp;		/* freespace buffer */
+	int			findex;		/* freespace entry index */
+	xfs_dir2_free_t		*free=NULL;	/* freespace block structure */
+	xfs_dir2_db_t		ifbno;		/* initial freespace block no */
+	xfs_dir2_db_t		lastfbno=0;	/* highest freespace block no */
+	int			length;		/* length of the new entry */
+	int			logfree;	/* need to log free entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	__be16			*tagp;		/* data entry tag pointer */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	__be16			*bests;
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_dir2_data_free *bf;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	length = dp->d_ops->data_entsize(args->namelen);
+	/*
+	 * If we came in with a freespace block that means that lookup
+	 * found an entry with our hash value.  This is the freespace
+	 * block for that data entry.
+	 */
+	if (fblk) {
+		fbp = fblk->bp;
+		/*
+		 * Remember initial freespace block number.
+		 */
+		ifbno = fblk->blkno;
+		free = fbp->b_addr;
+		findex = fblk->index;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+		/*
+		 * This means the free entry showed that the data block had
+		 * space for our entry, so we remembered it.
+		 * Use that data block.
+		 */
+		if (findex >= 0) {
+			ASSERT(findex < freehdr.nvalid);
+			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+			ASSERT(be16_to_cpu(bests[findex]) >= length);
+			dbno = freehdr.firstdb + findex;
+		} else {
+			/*
+			 * The data block looked at didn't have enough room.
+			 * We'll start at the beginning of the freespace entries.
+			 */
+			dbno = -1;
+			findex = 0;
+		}
+	} else {
+		/*
+		 * Didn't come in with a freespace block, so no data block.
+		 */
+		ifbno = dbno = -1;
+		fbp = NULL;
+		findex = 0;
+	}
+
+	/*
+	 * If we don't have a data block yet, we're going to scan the
+	 * freespace blocks looking for one.  Figure out what the
+	 * highest freespace block number is.
+	 */
+	if (dbno == -1) {
+		xfs_fileoff_t	fo;		/* freespace block number */
+
+		if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
+			return error;
+		lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+		fbno = ifbno;
+	}
+	/*
+	 * While we haven't identified a data block, search the freeblock
+	 * data for a good data block.  If we find a null freeblock entry,
+	 * indicating a hole in the data blocks, remember that.
+	 */
+	while (dbno == -1) {
+		/*
+		 * If we don't have a freeblock in hand, get the next one.
+		 */
+		if (fbp == NULL) {
+			/*
+			 * Happens the first time through unless lookup gave
+			 * us a freespace block to start with.
+			 */
+			if (++fbno == 0)
+				fbno = xfs_dir2_byte_to_db(args->geo,
+							XFS_DIR2_FREE_OFFSET);
+			/*
+			 * If it's ifbno we already looked at it.
+			 */
+			if (fbno == ifbno)
+				fbno++;
+			/*
+			 * If it's off the end we're done.
+			 */
+			if (fbno >= lastfbno)
+				break;
+			/*
+			 * Read the block.  There can be holes in the
+			 * freespace blocks, so this might not succeed.
+			 * This should be really rare, so there's no reason
+			 * to avoid it.
+			 */
+			error = xfs_dir2_free_try_read(tp, dp,
+					xfs_dir2_db_to_da(args->geo, fbno),
+					&fbp);
+			if (error)
+				return error;
+			if (!fbp)
+				continue;
+			free = fbp->b_addr;
+			findex = 0;
+		}
+		/*
+		 * Look at the current free entry.  Is it good enough?
+		 *
+		 * The bests initialisation should be where the bufer is read in
+		 * the above branch. But gcc is too stupid to realise that bests
+		 * and the freehdr are actually initialised if they are placed
+		 * there, so we have to do it here to avoid warnings. Blech.
+		 */
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+		    be16_to_cpu(bests[findex]) >= length)
+			dbno = freehdr.firstdb + findex;
+		else {
+			/*
+			 * Are we done with the freeblock?
+			 */
+			if (++findex == freehdr.nvalid) {
+				/*
+				 * Drop the block.
+				 */
+				xfs_trans_brelse(tp, fbp);
+				fbp = NULL;
+				if (fblk && fblk->bp)
+					fblk->bp = NULL;
+			}
+		}
+	}
+	/*
+	 * If we don't have a data block, we need to allocate one and make
+	 * the freespace entries refer to it.
+	 */
+	if (unlikely(dbno == -1)) {
+		/*
+		 * Not allowed to allocate, return failure.
+		 */
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+			return -ENOSPC;
+
+		/*
+		 * Allocate and initialize the new data block.
+		 */
+		if (unlikely((error = xfs_dir2_grow_inode(args,
+							 XFS_DIR2_DATA_SPACE,
+							 &dbno)) ||
+		    (error = xfs_dir3_data_init(args, dbno, &dbp))))
+			return error;
+
+		/*
+		 * If (somehow) we have a freespace block, get rid of it.
+		 */
+		if (fbp)
+			xfs_trans_brelse(tp, fbp);
+		if (fblk && fblk->bp)
+			fblk->bp = NULL;
+
+		/*
+		 * Get the freespace block corresponding to the data block
+		 * that was just allocated.
+		 */
+		fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
+		error = xfs_dir2_free_try_read(tp, dp,
+				       xfs_dir2_db_to_da(args->geo, fbno),
+				       &fbp);
+		if (error)
+			return error;
+
+		/*
+		 * If there wasn't a freespace block, the read will
+		 * return a NULL fbp.  Allocate and initialize a new one.
+		 */
+		if (!fbp) {
+			error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+						    &fbno);
+			if (error)
+				return error;
+
+			if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
+				xfs_alert(mp,
+			"%s: dir ino %llu needed freesp block %lld for\n"
+			"  data block %lld, got %lld ifbno %llu lastfbno %d",
+					__func__, (unsigned long long)dp->i_ino,
+					(long long)dp->d_ops->db_to_fdb(
+								args->geo, dbno),
+					(long long)dbno, (long long)fbno,
+					(unsigned long long)ifbno, lastfbno);
+				if (fblk) {
+					xfs_alert(mp,
+				" fblk 0x%p blkno %llu index %d magic 0x%x",
+						fblk,
+						(unsigned long long)fblk->blkno,
+						fblk->index,
+						fblk->magic);
+				} else {
+					xfs_alert(mp, " ... fblk is NULL");
+				}
+				XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
+						 XFS_ERRLEVEL_LOW, mp);
+				return -EFSCORRUPTED;
+			}
+
+			/*
+			 * Get a buffer for the new block.
+			 */
+			error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+			if (error)
+				return error;
+			free = fbp->b_addr;
+			bests = dp->d_ops->free_bests_p(free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+			/*
+			 * Remember the first slot as our empty slot.
+			 */
+			freehdr.firstdb =
+				(fbno - xfs_dir2_byte_to_db(args->geo,
+							XFS_DIR2_FREE_OFFSET)) *
+					dp->d_ops->free_max_bests(args->geo);
+		} else {
+			free = fbp->b_addr;
+			bests = dp->d_ops->free_bests_p(free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
+		}
+
+		/*
+		 * Set the freespace block index from the data block number.
+		 */
+		findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
+		/*
+		 * If it's after the end of the current entries in the
+		 * freespace block, extend that table.
+		 */
+		if (findex >= freehdr.nvalid) {
+			ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
+			freehdr.nvalid = findex + 1;
+			/*
+			 * Tag new entry so nused will go up.
+			 */
+			bests[findex] = cpu_to_be16(NULLDATAOFF);
+		}
+		/*
+		 * If this entry was for an empty data block
+		 * (this should always be true) then update the header.
+		 */
+		if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
+			freehdr.nused++;
+			dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+			xfs_dir2_free_log_header(args, fbp);
+		}
+		/*
+		 * Update the real value in the table.
+		 * We haven't allocated the data entry yet so this will
+		 * change again.
+		 */
+		hdr = dbp->b_addr;
+		bf = dp->d_ops->data_bestfree_p(hdr);
+		bests[findex] = bf[0].length;
+		logfree = 1;
+	}
+	/*
+	 * We had a data block so we don't have to make a new one.
+	 */
+	else {
+		/*
+		 * If just checking, we succeeded.
+		 */
+		if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+			return 0;
+
+		/*
+		 * Read the data block in.
+		 */
+		error = xfs_dir3_data_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, dbno),
+					   -1, &dbp);
+		if (error)
+			return error;
+		hdr = dbp->b_addr;
+		bf = dp->d_ops->data_bestfree_p(hdr);
+		logfree = 0;
+	}
+	ASSERT(be16_to_cpu(bf[0].length) >= length);
+	/*
+	 * Point to the existing unused space.
+	 */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)hdr + be16_to_cpu(bf[0].offset));
+	needscan = needlog = 0;
+	/*
+	 * Mark the first part of the unused space, inuse for us.
+	 */
+	xfs_dir2_data_use_free(args, dbp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
+		&needlog, &needscan);
+	/*
+	 * Fill in the new entry and log it.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, dep->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, dbp, dep);
+	/*
+	 * Rescan the block for bestfree if needed.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	/*
+	 * Log the data block header if needed.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	/*
+	 * If the freespace entry is now wrong, update it.
+	 */
+	bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
+	if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+		bests[findex] = bf[0].length;
+		logfree = 1;
+	}
+	/*
+	 * Log the freespace entry if needed.
+	 */
+	if (logfree)
+		xfs_dir2_free_log_bests(args, fbp, findex, findex);
+	/*
+	 * Return the data block and offset in args, then drop the data block.
+	 */
+	args->blkno = (xfs_dablk_t)dbno;
+	args->index = be16_to_cpu(*tagp);
+	return 0;
+}
+
+/*
+ * Lookup an entry in a node-format directory.
+ * All the real work happens in xfs_da3_node_lookup_int.
+ * The only real output is the inode number of the entry.
+ */
+int						/* error */
+xfs_dir2_node_lookup(
+	xfs_da_args_t	*args)			/* operation arguments */
+{
+	int		error;			/* error return value */
+	int		i;			/* btree level */
+	int		rval;			/* operation return value */
+	xfs_da_state_t	*state;			/* btree cursor */
+
+	trace_xfs_dir2_node_lookup(args);
+
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	/*
+	 * Fill in the path to the entry in the cursor.
+	 */
+	error = xfs_da3_node_lookup_int(state, &rval);
+	if (error)
+		rval = error;
+	else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
+		/* If a CI match, dup the actual name and return -EEXIST */
+		xfs_dir2_data_entry_t	*dep;
+
+		dep = (xfs_dir2_data_entry_t *)
+			((char *)state->extrablk.bp->b_addr +
+						 state->extrablk.index);
+		rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+	}
+	/*
+	 * Release the btree blocks and leaf block.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	/*
+	 * Release the data block if we have it.
+	 */
+	if (state->extravalid && state->extrablk.bp) {
+		xfs_trans_brelse(args->trans, state->extrablk.bp);
+		state->extrablk.bp = NULL;
+	}
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Remove an entry from a node-format directory.
+ */
+int						/* error */
+xfs_dir2_node_removename(
+	struct xfs_da_args	*args)		/* operation arguments */
+{
+	struct xfs_da_state_blk	*blk;		/* leaf block */
+	int			error;		/* error return value */
+	int			rval;		/* operation return value */
+	struct xfs_da_state	*state;		/* btree cursor */
+
+	trace_xfs_dir2_node_removename(args);
+
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+
+	/* Look up the entry we're deleting, set up the cursor. */
+	error = xfs_da3_node_lookup_int(state, &rval);
+	if (error)
+		goto out_free;
+
+	/* Didn't find it, upper layer screwed up. */
+	if (rval != -EEXIST) {
+		error = rval;
+		goto out_free;
+	}
+
+	blk = &state->path.blk[state->path.active - 1];
+	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(state->extravalid);
+	/*
+	 * Remove the leaf and data entries.
+	 * Extrablk refers to the data block.
+	 */
+	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
+		&state->extrablk, &rval);
+	if (error)
+		goto out_free;
+	/*
+	 * Fix the hash values up the btree.
+	 */
+	xfs_da3_fixhashpath(state, &state->path);
+	/*
+	 * If we need to join leaf blocks, do it.
+	 */
+	if (rval && state->path.active > 1)
+		error = xfs_da3_join(state);
+	/*
+	 * If no errors so far, try conversion to leaf format.
+	 */
+	if (!error)
+		error = xfs_dir2_node_to_leaf(state);
+out_free:
+	xfs_da_state_free(state);
+	return error;
+}
+
+/*
+ * Replace an entry's inode number in a node-format directory.
+ */
+int						/* error */
+xfs_dir2_node_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_entry_t	*dep;		/* data entry changed */
+	int			error;		/* error return value */
+	int			i;		/* btree level */
+	xfs_ino_t		inum;		/* new inode number */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry being changed */
+	int			rval;		/* internal return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	trace_xfs_dir2_node_replace(args);
+
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	inum = args->inumber;
+	/*
+	 * Lookup the entry to change in the btree.
+	 */
+	error = xfs_da3_node_lookup_int(state, &rval);
+	if (error) {
+		rval = error;
+	}
+	/*
+	 * It should be found, since the vnodeops layer has looked it up
+	 * and locked it.  But paranoia is good.
+	 */
+	if (rval == -EEXIST) {
+		struct xfs_dir2_leaf_entry *ents;
+		/*
+		 * Find the leaf entry.
+		 */
+		blk = &state->path.blk[state->path.active - 1];
+		ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+		leaf = blk->bp->b_addr;
+		ents = args->dp->d_ops->leaf_ents_p(leaf);
+		lep = &ents[blk->index];
+		ASSERT(state->extravalid);
+		/*
+		 * Point to the data entry.
+		 */
+		hdr = state->extrablk.bp->b_addr;
+		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+		       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)hdr +
+		       xfs_dir2_dataptr_to_off(args->geo,
+					       be32_to_cpu(lep->address)));
+		ASSERT(inum != be64_to_cpu(dep->inumber));
+		/*
+		 * Fill in the new inode number and log the entry.
+		 */
+		dep->inumber = cpu_to_be64(inum);
+		args->dp->d_ops->data_put_ftype(dep, args->filetype);
+		xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
+		rval = 0;
+	}
+	/*
+	 * Didn't find it, and we're holding a data block.  Drop it.
+	 */
+	else if (state->extravalid) {
+		xfs_trans_brelse(args->trans, state->extrablk.bp);
+		state->extrablk.bp = NULL;
+	}
+	/*
+	 * Release all the buffers in the cursor.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Trim off a trailing empty freespace block.
+ * Return (in rvalp) 1 if we did it, 0 if not.
+ */
+int						/* error */
+xfs_dir2_node_trim_free(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_fileoff_t		fo,		/* free block number */
+	int			*rvalp)		/* out: did something */
+{
+	struct xfs_buf		*bp;		/* freespace buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icfree_hdr freehdr;
+
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Read the freespace block.
+	 */
+	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+	if (error)
+		return error;
+	/*
+	 * There can be holes in freespace.  If fo is a hole, there's
+	 * nothing to do.
+	 */
+	if (!bp)
+		return 0;
+	free = bp->b_addr;
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+	/*
+	 * If there are used entries, there's nothing to do.
+	 */
+	if (freehdr.nused > 0) {
+		xfs_trans_brelse(tp, bp);
+		*rvalp = 0;
+		return 0;
+	}
+	/*
+	 * Blow the block away.
+	 */
+	error = xfs_dir2_shrink_inode(args,
+			xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+	if (error) {
+		/*
+		 * Can't fail with ENOSPC since that only happens with no
+		 * space reservation, when breaking up an extent into two
+		 * pieces.  This is the last block of an extent.
+		 */
+		ASSERT(error != -ENOSPC);
+		xfs_trans_brelse(tp, bp);
+		return error;
+	}
+	/*
+	 * Return that we succeeded.
+	 */
+	*rvalp = 1;
+	return 0;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_priv.h b/kernel/fs/xfs/libxfs/xfs_dir2_priv.h
new file mode 100644
index 000000000..ef9f6ead9
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DIR2_PRIV_H__
+#define __XFS_DIR2_PRIV_H__
+
+struct dir_context;
+
+/* xfs_dir2.c */
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+				xfs_dir2_db_t *dbp);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+				const unsigned char *name, int len);
+
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			       struct xfs_buf **bpp);
+extern int xfs_dir2_block_addname(struct xfs_da_args *args);
+extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_block_removename(struct xfs_da_args *args);
+extern int xfs_dir2_block_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
+		struct xfs_buf *lbp, struct xfs_buf *dbp);
+
+/* xfs_dir2_data.c */
+#ifdef DEBUG
+#define	xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp);
+#else
+#define	xfs_dir3_data_check(dp,bp)
+#endif
+
+extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+		xfs_daddr_t mapped_bno);
+
+extern struct xfs_dir2_data_free *
+xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
+		struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+		int *loghead);
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+		struct xfs_buf **bpp);
+
+/* xfs_dir2_leaf.c */
+extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
+		struct xfs_buf *dbp);
+extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+		struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+		struct xfs_dir2_leaf_entry *ents, int *indexp,
+		int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+		struct xfs_buf **bpp, __uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+		struct xfs_buf *bp, int first, int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
+		struct xfs_buf *bp);
+extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+		struct xfs_buf *lbp);
+extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
+		struct xfs_buf *lbp, xfs_dir2_db_t db);
+extern struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+		struct xfs_dir2_leaf_entry *ents, int index, int compact,
+		int lowstale, int highstale, int *lfloglow, int *lfloghigh);
+extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
+		struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+
+/* xfs_dir2_node.c */
+extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
+		struct xfs_buf *lbp);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+		struct xfs_buf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
+		struct xfs_da_args *args, int *indexp,
+		struct xfs_da_state *state);
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
+		struct xfs_buf *leaf2_bp);
+extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
+	struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
+extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+		struct xfs_da_state_blk *drop_blk,
+		struct xfs_da_state_blk *save_blk);
+extern int xfs_dir2_node_addname(struct xfs_da_args *args);
+extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_node_removename(struct xfs_da_args *args);
+extern int xfs_dir2_node_replace(struct xfs_da_args *args);
+extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+		int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, struct xfs_buf **bpp);
+
+/* xfs_dir2_sf.c */
+extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
+		struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
+		int size, xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
+extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+		       size_t bufsize);
+
+#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_sf.c b/kernel/fs/xfs/libxfs/xfs_dir2_sf.c
new file mode 100644
index 000000000..974d62e67
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -0,0 +1,1142 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+				     xfs_dir2_sf_entry_t *sfep,
+				     xfs_dir2_data_aoff_t offset,
+				     int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+				     int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+				    xfs_dir2_sf_entry_t **sfepp,
+				    xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define	xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit it the
+ * space currently present in the inode.  If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int						/* size for sf form */
+xfs_dir2_block_sfsize(
+	xfs_inode_t		*dp,		/* incore inode pointer */
+	xfs_dir2_data_hdr_t	*hdr,		/* block directory data */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* output: header for sf form */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_leaf_entry_t	*blp;		/* leaf area of the block */
+	xfs_dir2_block_tail_t	*btp;		/* tail area of the block */
+	int			count;		/* shortform entry count */
+	xfs_dir2_data_entry_t	*dep;		/* data entry in the block */
+	int			i;		/* block entry index */
+	int			i8count;	/* count of big-inode entries */
+	int			isdot;		/* entry is "." */
+	int			isdotdot;	/* entry is ".." */
+	xfs_mount_t		*mp;		/* mount structure pointer */
+	int			namelen;	/* total name bytes */
+	xfs_ino_t		parent = 0;	/* parent inode number */
+	int			size=0;		/* total computed size */
+	int			has_ftype;
+	struct xfs_da_geometry	*geo;
+
+	mp = dp->i_mount;
+	geo = mp->m_dir_geo;
+
+	/*
+	 * if there is a filetype field, add the extra byte to the namelen
+	 * for each entry that we see.
+	 */
+	has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+
+	count = i8count = namelen = 0;
+	btp = xfs_dir2_block_tail_p(geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+
+	/*
+	 * Iterate over the block's data entries by using the leaf pointers.
+	 */
+	for (i = 0; i < be32_to_cpu(btp->count); i++) {
+		if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Calculate the pointer to the entry at hand.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+				xfs_dir2_dataptr_to_off(geo, addr));
+		/*
+		 * Detect . and .., so we can special-case them.
+		 * . is not included in sf directories.
+		 * .. is included by just the parent inode number.
+		 */
+		isdot = dep->namelen == 1 && dep->name[0] == '.';
+		isdotdot =
+			dep->namelen == 2 &&
+			dep->name[0] == '.' && dep->name[1] == '.';
+
+		if (!isdot)
+			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+
+		/* take into account the file type field */
+		if (!isdot && !isdotdot) {
+			count++;
+			namelen += dep->namelen + has_ftype;
+		} else if (isdotdot)
+			parent = be64_to_cpu(dep->inumber);
+		/*
+		 * Calculate the new size, see if we should give up yet.
+		 */
+		size = xfs_dir2_sf_hdr_size(i8count) +		/* header */
+		       count +					/* namelen */
+		       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+		       namelen +				/* name */
+		       (i8count ?				/* inumber */
+				(uint)sizeof(xfs_dir2_ino8_t) * count :
+				(uint)sizeof(xfs_dir2_ino4_t) * count);
+		if (size > XFS_IFORK_DSIZE(dp))
+			return size;		/* size value is a failure */
+	}
+	/*
+	 * Create the output header, if it worked.
+	 */
+	sfhp->count = count;
+	sfhp->i8count = i8count;
+	dp->d_ops->sf_put_parent_ino(sfhp, parent);
+	return size;
+}
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */
+int						/* error */
+xfs_dir2_block_to_sf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*bp,
+	int			size,		/* shortform directory size */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* shortform directory hdr */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data pointer */
+	char			*endptr;	/* end of data entries */
+	int			error;		/* error return value */
+	int			logflags;	/* inode logging flags */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*ptr;		/* current data pointer */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform directory header */
+	xfs_dir2_sf_hdr_t	*dst;		/* temporary data buffer */
+
+	trace_xfs_dir2_block_to_sf(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	/*
+	 * allocate a temporary destination buffer the size of the inode
+	 * to format the data into. Once we have formatted the data, we
+	 * can free the block and copy the formatted data into the inode literal
+	 * area.
+	 */
+	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+	hdr = bp->b_addr;
+
+	/*
+	 * Copy the header into the newly allocate local space.
+	 */
+	sfp = (xfs_dir2_sf_hdr_t *)dst;
+	memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
+
+	/*
+	 * Set up to loop over the block's entries.
+	 */
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	ptr = (char *)dp->d_ops->data_entry_p(hdr);
+	endptr = (char *)xfs_dir2_block_leaf_p(btp);
+	sfep = xfs_dir2_sf_firstentry(sfp);
+	/*
+	 * Loop over the active and unused entries.
+	 * Stop when we reach the leaf/tail portion of the block.
+	 */
+	while (ptr < endptr) {
+		/*
+		 * If it's unused, just skip over it.
+		 */
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			ptr += be16_to_cpu(dup->length);
+			continue;
+		}
+		dep = (xfs_dir2_data_entry_t *)ptr;
+		/*
+		 * Skip .
+		 */
+		if (dep->namelen == 1 && dep->name[0] == '.')
+			ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
+		/*
+		 * Skip .., but make sure the inode number is right.
+		 */
+		else if (dep->namelen == 2 &&
+			 dep->name[0] == '.' && dep->name[1] == '.')
+			ASSERT(be64_to_cpu(dep->inumber) ==
+			       dp->d_ops->sf_get_parent_ino(sfp));
+		/*
+		 * Normal entry, copy it into shortform.
+		 */
+		else {
+			sfep->namelen = dep->namelen;
+			xfs_dir2_sf_put_offset(sfep,
+				(xfs_dir2_data_aoff_t)
+				((char *)dep - (char *)hdr));
+			memcpy(sfep->name, dep->name, dep->namelen);
+			dp->d_ops->sf_put_ino(sfp, sfep,
+					      be64_to_cpu(dep->inumber));
+			dp->d_ops->sf_put_ftype(sfep,
+					dp->d_ops->data_get_ftype(dep));
+
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+		}
+		ptr += dp->d_ops->data_entsize(dep->namelen);
+	}
+	ASSERT((char *)sfep - (char *)sfp == size);
+
+	/* now we are done with the block, we can shrink the inode */
+	logflags = XFS_ILOG_CORE;
+	error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+	if (error) {
+		ASSERT(error != -ENOSPC);
+		goto out;
+	}
+
+	/*
+	 * The buffer is now unconditionally gone, whether
+	 * xfs_dir2_shrink_inode worked or not.
+	 *
+	 * Convert the inode to local format and copy the data in.
+	 */
+	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+	dp->i_df.if_flags |= XFS_IFINLINE;
+	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+	ASSERT(dp->i_df.if_bytes == 0);
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+
+	logflags |= XFS_ILOG_DDATA;
+	memcpy(dp->i_df.if_u1.if_data, dst, size);
+	dp->i_d.di_size = size;
+	xfs_dir2_sf_check(args);
+out:
+	xfs_trans_log_inode(args->trans, dp, logflags);
+	kmem_free(dst);
+	return error;
+}
+
+/*
+ * Add a name to a shortform directory.
+ * There are two algorithms, "easy" and "hard" which we decide on
+ * before changing anything.
+ * Convert to block form if necessary, if the new entry won't fit.
+ */
+int						/* error */
+xfs_dir2_sf_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			incr_isize;	/* total change in size */
+	int			new_isize;	/* di_size after adding name */
+	int			objchange;	/* changing to 8-byte inodes */
+	xfs_dir2_data_aoff_t	offset = 0;	/* offset for new entry */
+	int			pick;		/* which algorithm to use */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	xfs_dir2_sf_entry_t	*sfep = NULL;	/* shortform entry */
+
+	trace_xfs_dir2_sf_addname(args);
+
+	ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Make sure the shortform value has some of its header.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return -EIO;
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+	/*
+	 * Compute entry (and change in) size.
+	 */
+	incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
+	objchange = 0;
+
+	/*
+	 * Do we have to change to 8 byte inodes?
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+		/*
+		 * Yes, adjust the inode size.  old count + (parent + new)
+		 */
+		incr_isize +=
+			(sfp->count + 2) *
+			((uint)sizeof(xfs_dir2_ino8_t) -
+			 (uint)sizeof(xfs_dir2_ino4_t));
+		objchange = 1;
+	}
+
+	new_isize = (int)dp->i_d.di_size + incr_isize;
+	/*
+	 * Won't fit as shortform any more (due to size),
+	 * or the pick routine says it won't (due to offset values).
+	 */
+	if (new_isize > XFS_IFORK_DSIZE(dp) ||
+	    (pick =
+	     xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
+		/*
+		 * Just checking or no space reservation, it doesn't fit.
+		 */
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+			return -ENOSPC;
+		/*
+		 * Convert to block form then add the name.
+		 */
+		error = xfs_dir2_sf_to_block(args);
+		if (error)
+			return error;
+		return xfs_dir2_block_addname(args);
+	}
+	/*
+	 * Just checking, it fits.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+		return 0;
+	/*
+	 * Do it the easy way - just add it at the end.
+	 */
+	if (pick == 1)
+		xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+	/*
+	 * Do it the hard way - look for a place to insert the new entry.
+	 * Convert to 8 byte inode numbers first if necessary.
+	 */
+	else {
+		ASSERT(pick == 2);
+		if (objchange)
+			xfs_dir2_sf_toino8(args);
+		xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+	}
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Add the new entry the "easy" way.
+ * This is copying the old directory and adding the new entry at the end.
+ * Since it's sorted by "offset" we need room after the last offset
+ * that's already there, and then room to convert to a block directory.
+ * This is already checked by the pick routine.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dir2_sf_entry_t	*sfep,		/* pointer to new entry */
+	xfs_dir2_data_aoff_t	offset,		/* offset to use for new ent */
+	int			new_isize)	/* new directory size */
+{
+	int			byteoff;	/* byte offset in sf dir */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	byteoff = (int)((char *)sfep - (char *)sfp);
+	/*
+	 * Grow the in-inode space.
+	 */
+	xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
+			  XFS_DATA_FORK);
+	/*
+	 * Need to set up again due to realloc of the inode data.
+	 */
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
+	/*
+	 * Fill in the new entry.
+	 */
+	sfep->namelen = args->namelen;
+	xfs_dir2_sf_put_offset(sfep, offset);
+	memcpy(sfep->name, args->name, sfep->namelen);
+	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+	dp->d_ops->sf_put_ftype(sfep, args->filetype);
+
+	/*
+	 * Update the header and inode.
+	 */
+	sfp->count++;
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+		sfp->i8count++;
+	dp->i_d.di_size = new_isize;
+	xfs_dir2_sf_check(args);
+}
+
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary,
+ * in which case we need to leave the i8count at 1.
+ * Find a hole that the new entry will fit into, and copy
+ * the first part of the entries, the new entry, and the last part of
+ * the entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			objchange,	/* changing inode number size */
+	int			new_isize)	/* new directory size */
+{
+	int			add_datasize;	/* data size need for new ent */
+	char			*buf;		/* buffer for old */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			eof;		/* reached end of old dir */
+	int			nbytes;		/* temp for byte copies */
+	xfs_dir2_data_aoff_t	new_offset;	/* next offset value */
+	xfs_dir2_data_aoff_t	offset;		/* current offset value */
+	int			old_isize;	/* previous di_size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* entry in original dir */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* original shortform dir */
+	xfs_dir2_sf_entry_t	*sfep;		/* entry in new dir */
+	xfs_dir2_sf_hdr_t	*sfp;		/* new shortform dir */
+
+	/*
+	 * Copy the old directory to the stack buffer.
+	 */
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	old_isize = (int)dp->i_d.di_size;
+	buf = kmem_alloc(old_isize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+	memcpy(oldsfp, sfp, old_isize);
+	/*
+	 * Loop over the old directory finding the place we're going
+	 * to insert the new entry.
+	 * If it's going to end up at the end then oldsfep will point there.
+	 */
+	for (offset = dp->d_ops->data_first_offset,
+	      oldsfep = xfs_dir2_sf_firstentry(oldsfp),
+	      add_datasize = dp->d_ops->data_entsize(args->namelen),
+	      eof = (char *)oldsfep == &buf[old_isize];
+	     !eof;
+	     offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
+	      oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
+	      eof = (char *)oldsfep == &buf[old_isize]) {
+		new_offset = xfs_dir2_sf_get_offset(oldsfep);
+		if (offset + add_datasize <= new_offset)
+			break;
+	}
+	/*
+	 * Get rid of the old directory, then allocate space for
+	 * the new one.  We do this so xfs_idata_realloc won't copy
+	 * the data.
+	 */
+	xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+	/*
+	 * Reset the pointer since the buffer was reallocated.
+	 */
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Copy the first part of the directory, including the header.
+	 */
+	nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+	memcpy(sfp, oldsfp, nbytes);
+	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+	/*
+	 * Fill in the new entry, and update the header counts.
+	 */
+	sfep->namelen = args->namelen;
+	xfs_dir2_sf_put_offset(sfep, offset);
+	memcpy(sfep->name, args->name, sfep->namelen);
+	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+	dp->d_ops->sf_put_ftype(sfep, args->filetype);
+	sfp->count++;
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+		sfp->i8count++;
+	/*
+	 * If there's more left to copy, do that.
+	 */
+	if (!eof) {
+		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+		memcpy(sfep, oldsfep, old_isize - nbytes);
+	}
+	kmem_free(buf);
+	dp->i_d.di_size = new_isize;
+	xfs_dir2_sf_check(args);
+}
+
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ */
+/*ARGSUSED*/
+static int					/* pick result */
+xfs_dir2_sf_addname_pick(
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			objchange,	/* inode # size changes */
+	xfs_dir2_sf_entry_t	**sfepp,	/* out(1): new entry ptr */
+	xfs_dir2_data_aoff_t	*offsetp)	/* out(1): new offset */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			holefit;	/* found hole it will fit in */
+	int			i;		/* entry number */
+	xfs_dir2_data_aoff_t	offset;		/* data block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	int			size;		/* entry's data size */
+	int			used;		/* data bytes used */
+
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	size = dp->d_ops->data_entsize(args->namelen);
+	offset = dp->d_ops->data_first_offset;
+	sfep = xfs_dir2_sf_firstentry(sfp);
+	holefit = 0;
+	/*
+	 * Loop over sf entries.
+	 * Keep track of data offset and whether we've seen a place
+	 * to insert the new entry.
+	 */
+	for (i = 0; i < sfp->count; i++) {
+		if (!holefit)
+			holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
+		offset = xfs_dir2_sf_get_offset(sfep) +
+			 dp->d_ops->data_entsize(sfep->namelen);
+		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+	}
+	/*
+	 * Calculate data bytes used excluding the new entry, if this
+	 * was a data block (block form directory).
+	 */
+	used = offset +
+	       (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	       (uint)sizeof(xfs_dir2_block_tail_t);
+	/*
+	 * If it won't fit in a block form then we can't insert it,
+	 * we'll go back, convert to block, then try the insert and convert
+	 * to leaf.
+	 */
+	if (used + (holefit ? 0 : size) > args->geo->blksize)
+		return 0;
+	/*
+	 * If changing the inode number size, do it the hard way.
+	 */
+	if (objchange)
+		return 2;
+	/*
+	 * If it won't fit at the end then do it the hard way (use the hole).
+	 */
+	if (used + size > args->geo->blksize)
+		return 2;
+	/*
+	 * Do it the easy way.
+	 */
+	*sfepp = sfep;
+	*offsetp = offset;
+	return 1;
+}
+
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ */
+static void
+xfs_dir2_sf_check(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry number */
+	int			i8count;	/* number of big inode#s */
+	xfs_ino_t		ino;		/* entry inode number */
+	int			offset;		/* data offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform dir entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	offset = dp->d_ops->data_first_offset;
+	ino = dp->d_ops->sf_get_parent_ino(sfp);
+	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+	     i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+		ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
+		ino = dp->d_ops->sf_get_ino(sfp, sfep);
+		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+		offset =
+			xfs_dir2_sf_get_offset(sfep) +
+			dp->d_ops->data_entsize(sfep->namelen);
+		ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
+	}
+	ASSERT(i8count == sfp->i8count);
+	ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
+	ASSERT(offset +
+	       (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	       (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
+}
+#endif	/* DEBUG */
+
+/*
+ * Create a new (shortform) directory.
+ */
+int					/* error, always 0 */
+xfs_dir2_sf_create(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_ino_t	pino)		/* parent inode number */
+{
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		i8count;	/* parent inode is an 8-byte number */
+	xfs_dir2_sf_hdr_t *sfp;		/* shortform structure */
+	int		size;		/* directory size */
+
+	trace_xfs_dir2_sf_create(args);
+
+	dp = args->dp;
+
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_d.di_size == 0);
+	/*
+	 * If it's currently a zero-length extent file,
+	 * convert it to local format.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		dp->i_df.if_flags |= XFS_IFINLINE;
+	}
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	ASSERT(dp->i_df.if_bytes == 0);
+	i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+	size = xfs_dir2_sf_hdr_size(i8count);
+	/*
+	 * Make a buffer for the data.
+	 */
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	/*
+	 * Fill in the header,
+	 */
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	sfp->i8count = i8count;
+	/*
+	 * Now can put in the inode number, since i8count is set.
+	 */
+	dp->d_ops->sf_put_parent_ino(sfp, pino);
+	sfp->count = 0;
+	dp->i_d.di_size = size;
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns EEXIST if found, ENOENT if not found.
+ */
+int						/* error */
+xfs_dir2_sf_lookup(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	int			error;
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	enum xfs_dacmp		cmp;		/* comparison result */
+	xfs_dir2_sf_entry_t	*ci_sfep;	/* case-insens. entry */
+
+	trace_xfs_dir2_sf_lookup(args);
+
+	xfs_dir2_sf_check(args);
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bail out if the directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return -EIO;
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+	/*
+	 * Special case for .
+	 */
+	if (args->namelen == 1 && args->name[0] == '.') {
+		args->inumber = dp->i_ino;
+		args->cmpresult = XFS_CMP_EXACT;
+		args->filetype = XFS_DIR3_FT_DIR;
+		return -EEXIST;
+	}
+	/*
+	 * Special case for ..
+	 */
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
+		args->cmpresult = XFS_CMP_EXACT;
+		args->filetype = XFS_DIR3_FT_DIR;
+		return -EEXIST;
+	}
+	/*
+	 * Loop over all the entries trying to match ours.
+	 */
+	ci_sfep = NULL;
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+		/*
+		 * Compare name and if it's an exact match, return the inode
+		 * number. If it's the first case-insensitive match, store the
+		 * inode number and continue looking for an exact match.
+		 */
+		cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
+								sfep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
+			args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
+			args->filetype = dp->d_ops->sf_get_ftype(sfep);
+			if (cmp == XFS_CMP_EXACT)
+				return -EEXIST;
+			ci_sfep = sfep;
+		}
+	}
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or replace).
+	 * If a case-insensitive match was not found, return -ENOENT.
+	 */
+	if (!ci_sfep)
+		return -ENOENT;
+	/* otherwise process the CI match as required by the caller */
+	error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+	return error;
+}
+
+/*
+ * Remove an entry from a shortform directory.
+ */
+int						/* error */
+xfs_dir2_sf_removename(
+	xfs_da_args_t		*args)
+{
+	int			byteoff;	/* offset of removed entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			entsize;	/* this entry's size */
+	int			i;		/* shortform entry index */
+	int			newsize;	/* new inode size */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+
+	trace_xfs_dir2_sf_removename(args);
+
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	oldsize = (int)dp->i_d.di_size;
+	/*
+	 * Bail out if the directory is way too short.
+	 */
+	if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return -EIO;
+	}
+	ASSERT(dp->i_df.if_bytes == oldsize);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
+	/*
+	 * Loop over the old directory entries.
+	 * Find the one we're deleting.
+	 */
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+		if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+								XFS_CMP_EXACT) {
+			ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
+			       args->inumber);
+			break;
+		}
+	}
+	/*
+	 * Didn't find it.
+	 */
+	if (i == sfp->count)
+		return -ENOENT;
+	/*
+	 * Calculate sizes.
+	 */
+	byteoff = (int)((char *)sfep - (char *)sfp);
+	entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
+	newsize = oldsize - entsize;
+	/*
+	 * Copy the part if any after the removed entry, sliding it down.
+	 */
+	if (byteoff + entsize < oldsize)
+		memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
+			oldsize - (byteoff + entsize));
+	/*
+	 * Fix up the header and file size.
+	 */
+	sfp->count--;
+	dp->i_d.di_size = newsize;
+	/*
+	 * Reallocate, making it smaller.
+	 */
+	xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Are we changing inode number size?
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+		if (sfp->i8count == 1)
+			xfs_dir2_sf_toino4(args);
+		else
+			sfp->i8count--;
+	}
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Replace the inode number of an entry in a shortform directory.
+ */
+int						/* error */
+xfs_dir2_sf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	xfs_ino_t		ino=0;		/* entry old inode number */
+	int			i8elevated;	/* sf_toino8 set i8count=1 */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+
+	trace_xfs_dir2_sf_replace(args);
+
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bail out if the shortform directory is way too small.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return -EIO;
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+	/*
+	 * New inode number is large, and need to convert to 8-byte inodes.
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+		int	error;			/* error return value */
+		int	newsize;		/* new inode size */
+
+		newsize =
+			dp->i_df.if_bytes +
+			(sfp->count + 1) *
+			((uint)sizeof(xfs_dir2_ino8_t) -
+			 (uint)sizeof(xfs_dir2_ino4_t));
+		/*
+		 * Won't fit as shortform, convert to block then do replace.
+		 */
+		if (newsize > XFS_IFORK_DSIZE(dp)) {
+			error = xfs_dir2_sf_to_block(args);
+			if (error) {
+				return error;
+			}
+			return xfs_dir2_block_replace(args);
+		}
+		/*
+		 * Still fits, convert to 8-byte now.
+		 */
+		xfs_dir2_sf_toino8(args);
+		i8elevated = 1;
+		sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	} else
+		i8elevated = 0;
+
+	ASSERT(args->namelen != 1 || args->name[0] != '.');
+	/*
+	 * Replace ..'s entry.
+	 */
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		ino = dp->d_ops->sf_get_parent_ino(sfp);
+		ASSERT(args->inumber != ino);
+		dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
+	}
+	/*
+	 * Normal entry, look for the name.
+	 */
+	else {
+		for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+		     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+			if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+								XFS_CMP_EXACT) {
+				ino = dp->d_ops->sf_get_ino(sfp, sfep);
+				ASSERT(args->inumber != ino);
+				dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+				dp->d_ops->sf_put_ftype(sfep, args->filetype);
+				break;
+			}
+		}
+		/*
+		 * Didn't find it.
+		 */
+		if (i == sfp->count) {
+			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+			if (i8elevated)
+				xfs_dir2_sf_toino4(args);
+			return -ENOENT;
+		}
+	}
+	/*
+	 * See if the old number was large, the new number is small.
+	 */
+	if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+	    args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+		/*
+		 * And the old count was one, so need to convert to small.
+		 */
+		if (sfp->i8count == 1)
+			xfs_dir2_sf_toino4(args);
+		else
+			sfp->i8count--;
+	}
+	/*
+	 * See if the old number was small, the new number is large.
+	 */
+	if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+	    args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+		/*
+		 * add to the i8count unless we just converted to 8-byte
+		 * inodes (which does an implied i8count = 1)
+		 */
+		ASSERT(sfp->i8count != 0);
+		if (!i8elevated)
+			sfp->i8count++;
+	}
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ */
+static void
+xfs_dir2_sf_toino4(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	char			*buf;		/* old dir's buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	int			newsize;	/* new inode size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* old sf directory */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* new sf directory */
+
+	trace_xfs_dir2_sf_toino4(args);
+
+	dp = args->dp;
+
+	/*
+	 * Copy the old directory to the buffer.
+	 * Then nuke it from the inode, and add the new buffer to the inode.
+	 * Don't want xfs_idata_realloc copying the data here.
+	 */
+	oldsize = dp->i_df.if_bytes;
+	buf = kmem_alloc(oldsize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsfp->i8count == 1);
+	memcpy(buf, oldsfp, oldsize);
+	/*
+	 * Compute the new inode size.
+	 */
+	newsize =
+		oldsize -
+		(oldsfp->count + 1) *
+		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+	/*
+	 * Reset our pointers, the data has moved.
+	 */
+	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Fill in the new header.
+	 */
+	sfp->count = oldsfp->count;
+	sfp->i8count = 0;
+	dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+	/*
+	 * Copy the entries field by field.
+	 */
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+		    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+	     i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+		sfep->namelen = oldsfep->namelen;
+		sfep->offset = oldsfep->offset;
+		memcpy(sfep->name, oldsfep->name, sfep->namelen);
+		dp->d_ops->sf_put_ino(sfp, sfep,
+				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+		dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+	}
+	/*
+	 * Clean up the inode.
+	 */
+	kmem_free(buf);
+	dp->i_d.di_size = newsize;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+
+/*
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
+ */
+static void
+xfs_dir2_sf_toino8(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	char			*buf;		/* old dir's buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	int			newsize;	/* new inode size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* old sf directory */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* new sf directory */
+
+	trace_xfs_dir2_sf_toino8(args);
+
+	dp = args->dp;
+
+	/*
+	 * Copy the old directory to the buffer.
+	 * Then nuke it from the inode, and add the new buffer to the inode.
+	 * Don't want xfs_idata_realloc copying the data here.
+	 */
+	oldsize = dp->i_df.if_bytes;
+	buf = kmem_alloc(oldsize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsfp->i8count == 0);
+	memcpy(buf, oldsfp, oldsize);
+	/*
+	 * Compute the new inode size (nb: entry count + 1 for parent)
+	 */
+	newsize =
+		oldsize +
+		(oldsfp->count + 1) *
+		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+	/*
+	 * Reset our pointers, the data has moved.
+	 */
+	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Fill in the new header.
+	 */
+	sfp->count = oldsfp->count;
+	sfp->i8count = 1;
+	dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+	/*
+	 * Copy the entries field by field.
+	 */
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+		    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+	     i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+		sfep->namelen = oldsfep->namelen;
+		sfep->offset = oldsfep->offset;
+		memcpy(sfep->name, oldsfep->name, sfep->namelen);
+		dp->d_ops->sf_put_ino(sfp, sfep,
+				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+		dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+	}
+	/*
+	 * Clean up the inode.
+	 */
+	kmem_free(buf);
+	dp->i_d.di_size = newsize;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dquot_buf.c b/kernel/fs/xfs/libxfs/xfs_dquot_buf.c
new file mode 100644
index 000000000..6fbf2d853
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+
+int
+xfs_calc_dquots_per_chunk(
+	unsigned int		nbblks)	/* basic block units */
+{
+	unsigned int	ndquots;
+
+	ASSERT(nbblks > 0);
+	ndquots = BBTOB(nbblks);
+	do_div(ndquots, sizeof(xfs_dqblk_t));
+
+	return ndquots;
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ */
+int
+xfs_dqcheck(
+	struct xfs_mount *mp,
+	xfs_disk_dquot_t *ddq,
+	xfs_dqid_t	 id,
+	uint		 type,	  /* used only when IO_dorepair is true */
+	uint		 flags,
+	char		 *str)
+{
+	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
+	int		errs = 0;
+
+	/*
+	 * We can encounter an uninitialized dquot buffer for 2 reasons:
+	 * 1. If we crash while deleting the quotainode(s), and those blks got
+	 *    used for user data. This is because we take the path of regular
+	 *    file deletion; however, the size field of quotainodes is never
+	 *    updated, so all the tricks that we play in itruncate_finish
+	 *    don't quite matter.
+	 *
+	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
+	 *    But the allocation will be replayed so we'll end up with an
+	 *    uninitialized quota block.
+	 *
+	 * This is all fine; things are still consistent, and we haven't lost
+	 * any quota information. Just don't complain about bad dquot blks.
+	 */
+	if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+		errs++;
+	}
+	if (ddq->d_version != XFS_DQUOT_VERSION) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+			str, id, ddq->d_version, XFS_DQUOT_VERSION);
+		errs++;
+	}
+
+	if (ddq->d_flags != XFS_DQ_USER &&
+	    ddq->d_flags != XFS_DQ_PROJ &&
+	    ddq->d_flags != XFS_DQ_GROUP) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+			str, id, ddq->d_flags);
+		errs++;
+	}
+
+	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : ondisk-dquot 0x%p, ID mismatch: "
+			"0x%x expected, found id 0x%x",
+			str, ddq, id, be32_to_cpu(ddq->d_id));
+		errs++;
+	}
+
+	if (!errs && ddq->d_id) {
+		if (ddq->d_blk_softlimit &&
+		    be64_to_cpu(ddq->d_bcount) >
+				be64_to_cpu(ddq->d_blk_softlimit)) {
+			if (!ddq->d_btimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+		if (ddq->d_ino_softlimit &&
+		    be64_to_cpu(ddq->d_icount) >
+				be64_to_cpu(ddq->d_ino_softlimit)) {
+			if (!ddq->d_itimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+		if (ddq->d_rtb_softlimit &&
+		    be64_to_cpu(ddq->d_rtbcount) >
+				be64_to_cpu(ddq->d_rtb_softlimit)) {
+			if (!ddq->d_rtbtimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+	}
+
+	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+		return errs;
+
+	if (flags & XFS_QMOPT_DOWARN)
+		xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+
+	/*
+	 * Typically, a repair is only requested by quotacheck.
+	 */
+	ASSERT(id != -1);
+	ASSERT(flags & XFS_QMOPT_DQREPAIR);
+	memset(d, 0, sizeof(xfs_dqblk_t));
+
+	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+	d->dd_diskdq.d_flags = type;
+	d->dd_diskdq.d_id = cpu_to_be32(id);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+		xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF);
+	}
+
+	return errs;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	int			ndquots;
+	int			i;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return true;
+
+	/*
+	 * if we are in log recovery, the quota subsystem has not been
+	 * initialised so we have no quotainfo structure. In that case, we need
+	 * to manually calculate the number of dquots in the buffer.
+	 */
+	if (mp->m_quotainfo)
+		ndquots = mp->m_quotainfo->qi_dqperchunk;
+	else
+		ndquots = xfs_calc_dquots_per_chunk(
+					XFS_BB_TO_FSB(mp, bp->b_length));
+
+	for (i = 0; i < ndquots; i++, d++) {
+		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF))
+			return false;
+		if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+			return false;
+	}
+	return true;
+}
+
+STATIC bool
+xfs_dquot_buf_verify(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	xfs_dqid_t		id = 0;
+	int			ndquots;
+	int			i;
+
+	/*
+	 * if we are in log recovery, the quota subsystem has not been
+	 * initialised so we have no quotainfo structure. In that case, we need
+	 * to manually calculate the number of dquots in the buffer.
+	 */
+	if (mp->m_quotainfo)
+		ndquots = mp->m_quotainfo->qi_dqperchunk;
+	else
+		ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+	/*
+	 * On the first read of the buffer, verify that each dquot is valid.
+	 * We don't know what the id of the dquot is supposed to be, just that
+	 * they should be increasing monotonically within the buffer. If the
+	 * first id is corrupt, then it will fail on the second dquot in the
+	 * buffer so corruptions could point to the wrong dquot in this case.
+	 */
+	for (i = 0; i < ndquots; i++) {
+		struct xfs_disk_dquot	*ddq;
+		int			error;
+
+		ddq = &d[i].dd_diskdq;
+
+		if (i == 0)
+			id = be32_to_cpu(ddq->d_id);
+
+		error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+				       "xfs_dquot_buf_verify");
+		if (error)
+			return false;
+	}
+	return true;
+}
+
+static void
+xfs_dquot_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify_crc(mp, bp))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_dquot_buf_verify(mp, bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+static void
+xfs_dquot_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify(mp, bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+	.verify_read = xfs_dquot_buf_read_verify,
+	.verify_write = xfs_dquot_buf_write_verify,
+};
+
diff --git a/kernel/fs/xfs/libxfs/xfs_format.h b/kernel/fs/xfs/libxfs/xfs_format.h
new file mode 100644
index 000000000..4daaa6623
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_format.h
@@ -0,0 +1,1461 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for 
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, which log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+
+/*
+ * Super block
+ * Fits into a sector-sized buffer at address 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+#define	XFS_SB_MAGIC		0x58465342	/* 'XFSB' */
+#define	XFS_SB_VERSION_1	1		/* 5.3, 6.0.1, 6.1 */
+#define	XFS_SB_VERSION_2	2		/* 6.2 - attributes */
+#define	XFS_SB_VERSION_3	3		/* 6.2 - new inode version */
+#define	XFS_SB_VERSION_4	4		/* 6.2+ - bitmask version */
+#define	XFS_SB_VERSION_5	5		/* CRC enabled filesystem */
+#define	XFS_SB_VERSION_NUMBITS		0x000f
+#define	XFS_SB_VERSION_ALLFBITS		0xfff0
+#define	XFS_SB_VERSION_ATTRBIT		0x0010
+#define	XFS_SB_VERSION_NLINKBIT		0x0020
+#define	XFS_SB_VERSION_QUOTABIT		0x0040
+#define	XFS_SB_VERSION_ALIGNBIT		0x0080
+#define	XFS_SB_VERSION_DALIGNBIT	0x0100
+#define	XFS_SB_VERSION_SHAREDBIT	0x0200
+#define XFS_SB_VERSION_LOGV2BIT		0x0400
+#define XFS_SB_VERSION_SECTORBIT	0x0800
+#define	XFS_SB_VERSION_EXTFLGBIT	0x1000
+#define	XFS_SB_VERSION_DIRV2BIT		0x2000
+#define	XFS_SB_VERSION_BORGBIT		0x4000	/* ASCII only case-insens. */
+#define	XFS_SB_VERSION_MOREBITSBIT	0x8000
+
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, which nobody knows what it does and so is unsupported.
+ */
+#define	XFS_SB_VERSION_OKBITS		\
+	((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+		~XFS_SB_VERSION_SHAREDBIT)
+
+/*
+ * There are two words to hold XFS "feature" bits: the original
+ * word, sb_versionnum, and sb_features2.  Whenever a bit is set in
+ * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
+ *
+ * These defines represent bits in sb_features2.
+ */
+#define XFS_SB_VERSION2_RESERVED1BIT	0x00000001
+#define XFS_SB_VERSION2_LAZYSBCOUNTBIT	0x00000002	/* Superblk counters */
+#define XFS_SB_VERSION2_RESERVED4BIT	0x00000004
+#define XFS_SB_VERSION2_ATTR2BIT	0x00000008	/* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT	0x00000010	/* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT	0x00000080	/* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT		0x00000100	/* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE		0x00000200	/* inode type in dir */
+
+#define	XFS_SB_VERSION2_OKBITS		\
+	(XFS_SB_VERSION2_LAZYSBCOUNTBIT	| \
+	 XFS_SB_VERSION2_ATTR2BIT	| \
+	 XFS_SB_VERSION2_PROJID32BIT	| \
+	 XFS_SB_VERSION2_FTYPE)
+
+/*
+ * Superblock - in core version.  Must match the ondisk version below.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_sb {
+	__uint32_t	sb_magicnum;	/* magic number == XFS_SB_MAGIC */
+	__uint32_t	sb_blocksize;	/* logical block size, bytes */
+	xfs_rfsblock_t	sb_dblocks;	/* number of data blocks */
+	xfs_rfsblock_t	sb_rblocks;	/* number of realtime blocks */
+	xfs_rtblock_t	sb_rextents;	/* number of realtime extents */
+	uuid_t		sb_uuid;	/* file system unique id */
+	xfs_fsblock_t	sb_logstart;	/* starting block of log if internal */
+	xfs_ino_t	sb_rootino;	/* root inode number */
+	xfs_ino_t	sb_rbmino;	/* bitmap inode for realtime extents */
+	xfs_ino_t	sb_rsumino;	/* summary inode for rt bitmap */
+	xfs_agblock_t	sb_rextsize;	/* realtime extent size, blocks */
+	xfs_agblock_t	sb_agblocks;	/* size of an allocation group */
+	xfs_agnumber_t	sb_agcount;	/* number of allocation groups */
+	xfs_extlen_t	sb_rbmblocks;	/* number of rt bitmap blocks */
+	xfs_extlen_t	sb_logblocks;	/* number of log blocks */
+	__uint16_t	sb_versionnum;	/* header version == XFS_SB_VERSION */
+	__uint16_t	sb_sectsize;	/* volume sector size, bytes */
+	__uint16_t	sb_inodesize;	/* inode size, bytes */
+	__uint16_t	sb_inopblock;	/* inodes per block */
+	char		sb_fname[12];	/* file system name */
+	__uint8_t	sb_blocklog;	/* log2 of sb_blocksize */
+	__uint8_t	sb_sectlog;	/* log2 of sb_sectsize */
+	__uint8_t	sb_inodelog;	/* log2 of sb_inodesize */
+	__uint8_t	sb_inopblog;	/* log2 of sb_inopblock */
+	__uint8_t	sb_agblklog;	/* log2 of sb_agblocks (rounded up) */
+	__uint8_t	sb_rextslog;	/* log2 of sb_rextents */
+	__uint8_t	sb_inprogress;	/* mkfs is in progress, don't mount */
+	__uint8_t	sb_imax_pct;	/* max % of fs for inode space */
+					/* statistics */
+	/*
+	 * These fields must remain contiguous.  If you really
+	 * want to change their layout, make sure you fix the
+	 * code in xfs_trans_apply_sb_deltas().
+	 */
+	__uint64_t	sb_icount;	/* allocated inodes */
+	__uint64_t	sb_ifree;	/* free inodes */
+	__uint64_t	sb_fdblocks;	/* free data blocks */
+	__uint64_t	sb_frextents;	/* free realtime extents */
+	/*
+	 * End contiguous fields.
+	 */
+	xfs_ino_t	sb_uquotino;	/* user quota inode */
+	xfs_ino_t	sb_gquotino;	/* group quota inode */
+	__uint16_t	sb_qflags;	/* quota flags */
+	__uint8_t	sb_flags;	/* misc. flags */
+	__uint8_t	sb_shared_vn;	/* shared version number */
+	xfs_extlen_t	sb_inoalignmt;	/* inode chunk alignment, fsblocks */
+	__uint32_t	sb_unit;	/* stripe or raid unit */
+	__uint32_t	sb_width;	/* stripe or raid width */
+	__uint8_t	sb_dirblklog;	/* log2 of dir block size (fsbs) */
+	__uint8_t	sb_logsectlog;	/* log2 of the log sector size */
+	__uint16_t	sb_logsectsize;	/* sector size for the log, bytes */
+	__uint32_t	sb_logsunit;	/* stripe unit size for the log */
+	__uint32_t	sb_features2;	/* additional feature bits */
+
+	/*
+	 * bad features2 field as a result of failing to pad the sb structure to
+	 * 64 bits. Some machines will be using this field for features2 bits.
+	 * Easiest just to mark it bad and not use it for anything else.
+	 *
+	 * This is not kept up to date in memory; it is always overwritten by
+	 * the value in sb_features2 when formatting the incore superblock to
+	 * the disk buffer.
+	 */
+	__uint32_t	sb_bad_features2;
+
+	/* version 5 superblock fields start here */
+
+	/* feature masks */
+	__uint32_t	sb_features_compat;
+	__uint32_t	sb_features_ro_compat;
+	__uint32_t	sb_features_incompat;
+	__uint32_t	sb_features_log_incompat;
+
+	__uint32_t	sb_crc;		/* superblock crc */
+	__uint32_t	sb_pad;
+
+	xfs_ino_t	sb_pquotino;	/* project quota inode */
+	xfs_lsn_t	sb_lsn;		/* last write sequence */
+
+	/* must be padded to 64 bit alignment */
+} xfs_sb_t;
+
+#define XFS_SB_CRC_OFF		offsetof(struct xfs_sb, sb_crc)
+
+/*
+ * Superblock - on disk version.  Must match the in core version above.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_dsb {
+	__be32		sb_magicnum;	/* magic number == XFS_SB_MAGIC */
+	__be32		sb_blocksize;	/* logical block size, bytes */
+	__be64		sb_dblocks;	/* number of data blocks */
+	__be64		sb_rblocks;	/* number of realtime blocks */
+	__be64		sb_rextents;	/* number of realtime extents */
+	uuid_t		sb_uuid;	/* file system unique id */
+	__be64		sb_logstart;	/* starting block of log if internal */
+	__be64		sb_rootino;	/* root inode number */
+	__be64		sb_rbmino;	/* bitmap inode for realtime extents */
+	__be64		sb_rsumino;	/* summary inode for rt bitmap */
+	__be32		sb_rextsize;	/* realtime extent size, blocks */
+	__be32		sb_agblocks;	/* size of an allocation group */
+	__be32		sb_agcount;	/* number of allocation groups */
+	__be32		sb_rbmblocks;	/* number of rt bitmap blocks */
+	__be32		sb_logblocks;	/* number of log blocks */
+	__be16		sb_versionnum;	/* header version == XFS_SB_VERSION */
+	__be16		sb_sectsize;	/* volume sector size, bytes */
+	__be16		sb_inodesize;	/* inode size, bytes */
+	__be16		sb_inopblock;	/* inodes per block */
+	char		sb_fname[12];	/* file system name */
+	__u8		sb_blocklog;	/* log2 of sb_blocksize */
+	__u8		sb_sectlog;	/* log2 of sb_sectsize */
+	__u8		sb_inodelog;	/* log2 of sb_inodesize */
+	__u8		sb_inopblog;	/* log2 of sb_inopblock */
+	__u8		sb_agblklog;	/* log2 of sb_agblocks (rounded up) */
+	__u8		sb_rextslog;	/* log2 of sb_rextents */
+	__u8		sb_inprogress;	/* mkfs is in progress, don't mount */
+	__u8		sb_imax_pct;	/* max % of fs for inode space */
+					/* statistics */
+	/*
+	 * These fields must remain contiguous.  If you really
+	 * want to change their layout, make sure you fix the
+	 * code in xfs_trans_apply_sb_deltas().
+	 */
+	__be64		sb_icount;	/* allocated inodes */
+	__be64		sb_ifree;	/* free inodes */
+	__be64		sb_fdblocks;	/* free data blocks */
+	__be64		sb_frextents;	/* free realtime extents */
+	/*
+	 * End contiguous fields.
+	 */
+	__be64		sb_uquotino;	/* user quota inode */
+	__be64		sb_gquotino;	/* group quota inode */
+	__be16		sb_qflags;	/* quota flags */
+	__u8		sb_flags;	/* misc. flags */
+	__u8		sb_shared_vn;	/* shared version number */
+	__be32		sb_inoalignmt;	/* inode chunk alignment, fsblocks */
+	__be32		sb_unit;	/* stripe or raid unit */
+	__be32		sb_width;	/* stripe or raid width */
+	__u8		sb_dirblklog;	/* log2 of dir block size (fsbs) */
+	__u8		sb_logsectlog;	/* log2 of the log sector size */
+	__be16		sb_logsectsize;	/* sector size for the log, bytes */
+	__be32		sb_logsunit;	/* stripe unit size for the log */
+	__be32		sb_features2;	/* additional feature bits */
+	/*
+	 * bad features2 field as a result of failing to pad the sb
+	 * structure to 64 bits. Some machines will be using this field
+	 * for features2 bits. Easiest just to mark it bad and not use
+	 * it for anything else.
+	 */
+	__be32		sb_bad_features2;
+
+	/* version 5 superblock fields start here */
+
+	/* feature masks */
+	__be32		sb_features_compat;
+	__be32		sb_features_ro_compat;
+	__be32		sb_features_incompat;
+	__be32		sb_features_log_incompat;
+
+	__le32		sb_crc;		/* superblock crc */
+	__be32		sb_pad;
+
+	__be64		sb_pquotino;	/* project quota inode */
+	__be64		sb_lsn;		/* last write sequence */
+
+	/* must be padded to 64 bit alignment */
+} xfs_dsb_t;
+
+
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS		0x00	/* no flags set */
+#define XFS_SBF_READONLY	0x01	/* only read-only mounts allowed */
+
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN	0
+
+#define	XFS_SB_VERSION_NUM(sbp)	((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+
+/*
+ * The first XFS version we support is a v4 superblock with V2 directories.
+ */
+static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
+{
+	if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+		return false;
+
+	/* check for unknown features in the fs */
+	if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+	    ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+	     (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+		return false;
+
+	return true;
+}
+
+static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
+{
+	if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+		return true;
+	if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+		return xfs_sb_good_v4_features(sbp);
+	return false;
+}
+
+/*
+ * Detect a mismatched features2 field.  Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
+{
+	return sbp->sb_bad_features2 != sbp->sb_features2;
+}
+
+static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
+{
+	return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
+}
+
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
+{
+	sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
+}
+
+static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
+{
+	return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+}
+
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
+{
+	sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
+}
+
+static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
+{
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+		(sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
+}
+
+static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
+{
+	return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
+}
+
+static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+	       (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+}
+
+static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+	       (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+}
+
+static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
+{
+	return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+}
+
+static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
+{
+	return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
+}
+
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+	       (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+}
+
+/*
+ * sb_features2 bit version macros.
+ */
+static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
+{
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
+}
+
+static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
+{
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
+}
+
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
+{
+	sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+	sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+}
+
+static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
+{
+	sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+	if (!sbp->sb_features2)
+		sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
+}
+
+static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
+{
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
+}
+
+static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
+{
+	sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+	sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+}
+
+/*
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice or affect
+ * and so can mount read-write without issues.
+ *
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
+ *
+ * InCompat features are features which old kernels will not understand and so
+ * must not mount.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
+ */
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN	~XFS_SB_FEAT_COMPAT_ALL
+static inline bool
+xfs_sb_has_compat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	return (sbp->sb_features_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)		/* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+		(XFS_SB_FEAT_RO_COMPAT_FINOBT)
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
+static inline bool
+xfs_sb_has_ro_compat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	return (sbp->sb_features_ro_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_FTYPE	(1 << 0)	/* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+		(XFS_SB_FEAT_INCOMPAT_FTYPE)
+
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
+static inline bool
+xfs_sb_has_incompat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	return (sbp->sb_features_incompat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+static inline bool
+xfs_sb_has_incompat_log_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	return (sbp->sb_features_log_incompat & feature) != 0;
+}
+
+/*
+ * V5 superblock specific feature checks
+ */
+static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
+{
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+		xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
+	       (xfs_sb_version_hasmorebits(sbp) &&
+		 (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
+}
+
+static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
+{
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
+}
+
+/*
+ * end of superblock version macros
+ */
+
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+	return (ino == sbp->sb_uquotino ||
+		ino == sbp->sb_gquotino ||
+		ino == sbp->sb_pquotino);
+}
+
+#define XFS_SB_DADDR		((xfs_daddr_t)0) /* daddr in filesystem/ag */
+#define	XFS_SB_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+#define XFS_BUF_TO_SBP(bp)	((xfs_dsb_t *)((bp)->b_addr))
+
+#define	XFS_HDR_BLOCK(mp,d)	((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
+#define	XFS_DADDR_TO_FSB(mp,d)	XFS_AGB_TO_FSB(mp, \
+			xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
+#define	XFS_FSB_TO_DADDR(mp,fsbno)	XFS_AGB_TO_DADDR(mp, \
+			XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
+
+/*
+ * File system sector to basic block conversions.
+ */
+#define XFS_FSS_TO_BB(mp,sec)	((sec) << (mp)->m_sectbb_log)
+
+/*
+ * File system block to basic block conversions.
+ */
+#define	XFS_FSB_TO_BB(mp,fsbno)	((fsbno) << (mp)->m_blkbb_log)
+#define	XFS_BB_TO_FSB(mp,bb)	\
+	(((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define	XFS_BB_TO_FSBT(mp,bb)	((bb) >> (mp)->m_blkbb_log)
+
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno)	((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b)	\
+	((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b)	(((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_FSB_OFFSET(mp,b)	((b) & (mp)->m_blockmask)
+
+/*
+ * Allocation group header
+ *
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+#define	XFS_AGF_MAGIC	0x58414746	/* 'XAGF' */
+#define	XFS_AGI_MAGIC	0x58414749	/* 'XAGI' */
+#define	XFS_AGFL_MAGIC	0x5841464c	/* 'XAFL' */
+#define	XFS_AGF_VERSION	1
+#define	XFS_AGI_VERSION	1
+
+#define	XFS_AGF_GOOD_VERSION(v)	((v) == XFS_AGF_VERSION)
+#define	XFS_AGI_GOOD_VERSION(v)	((v) == XFS_AGI_VERSION)
+
+/*
+ * Btree number 0 is bno, 1 is cnt.  This value gives the size of the
+ * arrays below.
+ */
+#define	XFS_BTNUM_AGF	((int)XFS_BTNUM_CNTi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number.  Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf {
+	/*
+	 * Common allocation group header information
+	 */
+	__be32		agf_magicnum;	/* magic number == XFS_AGF_MAGIC */
+	__be32		agf_versionnum;	/* header version == XFS_AGF_VERSION */
+	__be32		agf_seqno;	/* sequence # starting from 0 */
+	__be32		agf_length;	/* size in blocks of a.g. */
+	/*
+	 * Freespace information
+	 */
+	__be32		agf_roots[XFS_BTNUM_AGF];	/* root blocks */
+	__be32		agf_spare0;	/* spare field */
+	__be32		agf_levels[XFS_BTNUM_AGF];	/* btree levels */
+	__be32		agf_spare1;	/* spare field */
+
+	__be32		agf_flfirst;	/* first freelist block's index */
+	__be32		agf_fllast;	/* last freelist block's index */
+	__be32		agf_flcount;	/* count of blocks in freelist */
+	__be32		agf_freeblks;	/* total free blocks */
+
+	__be32		agf_longest;	/* longest free space */
+	__be32		agf_btreeblks;	/* # of blocks held in AGF btrees */
+	uuid_t		agf_uuid;	/* uuid of filesystem */
+
+	/*
+	 * reserve some contiguous space for future logged fields before we add
+	 * the unlogged fields. This makes the range logging via flags and
+	 * structure offsets much simpler.
+	 */
+	__be64		agf_spare64[16];
+
+	/* unlogged fields, written during buffer writeback. */
+	__be64		agf_lsn;	/* last write sequence */
+	__be32		agf_crc;	/* crc of agf sector */
+	__be32		agf_spare2;
+
+	/* structure must be padded to 64 bit alignment */
+} xfs_agf_t;
+
+#define XFS_AGF_CRC_OFF		offsetof(struct xfs_agf, agf_crc)
+
+#define	XFS_AGF_MAGICNUM	0x00000001
+#define	XFS_AGF_VERSIONNUM	0x00000002
+#define	XFS_AGF_SEQNO		0x00000004
+#define	XFS_AGF_LENGTH		0x00000008
+#define	XFS_AGF_ROOTS		0x00000010
+#define	XFS_AGF_LEVELS		0x00000020
+#define	XFS_AGF_FLFIRST		0x00000040
+#define	XFS_AGF_FLLAST		0x00000080
+#define	XFS_AGF_FLCOUNT		0x00000100
+#define	XFS_AGF_FREEBLKS	0x00000200
+#define	XFS_AGF_LONGEST		0x00000400
+#define	XFS_AGF_BTREEBLKS	0x00000800
+#define	XFS_AGF_UUID		0x00001000
+#define	XFS_AGF_NUM_BITS	13
+#define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
+
+#define XFS_AGF_FLAGS \
+	{ XFS_AGF_MAGICNUM,	"MAGICNUM" }, \
+	{ XFS_AGF_VERSIONNUM,	"VERSIONNUM" }, \
+	{ XFS_AGF_SEQNO,	"SEQNO" }, \
+	{ XFS_AGF_LENGTH,	"LENGTH" }, \
+	{ XFS_AGF_ROOTS,	"ROOTS" }, \
+	{ XFS_AGF_LEVELS,	"LEVELS" }, \
+	{ XFS_AGF_FLFIRST,	"FLFIRST" }, \
+	{ XFS_AGF_FLLAST,	"FLLAST" }, \
+	{ XFS_AGF_FLCOUNT,	"FLCOUNT" }, \
+	{ XFS_AGF_FREEBLKS,	"FREEBLKS" }, \
+	{ XFS_AGF_LONGEST,	"LONGEST" }, \
+	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }, \
+	{ XFS_AGF_UUID,		"UUID" }
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR(mp)	((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
+#define	XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
+#define	XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)((bp)->b_addr))
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define	XFS_AGI_UNLINKED_BUCKETS	64
+
+typedef struct xfs_agi {
+	/*
+	 * Common allocation group header information
+	 */
+	__be32		agi_magicnum;	/* magic number == XFS_AGI_MAGIC */
+	__be32		agi_versionnum;	/* header version == XFS_AGI_VERSION */
+	__be32		agi_seqno;	/* sequence # starting from 0 */
+	__be32		agi_length;	/* size in blocks of a.g. */
+	/*
+	 * Inode information
+	 * Inodes are mapped by interpreting the inode number, so no
+	 * mapping data is needed here.
+	 */
+	__be32		agi_count;	/* count of allocated inodes */
+	__be32		agi_root;	/* root of inode btree */
+	__be32		agi_level;	/* levels in inode btree */
+	__be32		agi_freecount;	/* number of free inodes */
+
+	__be32		agi_newino;	/* new inode just allocated */
+	__be32		agi_dirino;	/* last directory inode chunk */
+	/*
+	 * Hash table of inodes which have been unlinked but are
+	 * still being referenced.
+	 */
+	__be32		agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+	/*
+	 * This marks the end of logging region 1 and start of logging region 2.
+	 */
+	uuid_t		agi_uuid;	/* uuid of filesystem */
+	__be32		agi_crc;	/* crc of agi sector */
+	__be32		agi_pad32;
+	__be64		agi_lsn;	/* last write sequence */
+
+	__be32		agi_free_root; /* root of the free inode btree */
+	__be32		agi_free_level;/* levels in free inode btree */
+
+	/* structure must be padded to 64 bit alignment */
+} xfs_agi_t;
+
+#define XFS_AGI_CRC_OFF		offsetof(struct xfs_agi, agi_crc)
+
+#define	XFS_AGI_MAGICNUM	(1 << 0)
+#define	XFS_AGI_VERSIONNUM	(1 << 1)
+#define	XFS_AGI_SEQNO		(1 << 2)
+#define	XFS_AGI_LENGTH		(1 << 3)
+#define	XFS_AGI_COUNT		(1 << 4)
+#define	XFS_AGI_ROOT		(1 << 5)
+#define	XFS_AGI_LEVEL		(1 << 6)
+#define	XFS_AGI_FREECOUNT	(1 << 7)
+#define	XFS_AGI_NEWINO		(1 << 8)
+#define	XFS_AGI_DIRINO		(1 << 9)
+#define	XFS_AGI_UNLINKED	(1 << 10)
+#define	XFS_AGI_NUM_BITS_R1	11	/* end of the 1st agi logging region */
+#define	XFS_AGI_ALL_BITS_R1	((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define	XFS_AGI_FREE_ROOT	(1 << 11)
+#define	XFS_AGI_FREE_LEVEL	(1 << 12)
+#define	XFS_AGI_NUM_BITS_R2	13
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR(mp)	((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
+#define	XFS_AGI_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
+#define	XFS_BUF_TO_AGI(bp)	((xfs_agi_t *)((bp)->b_addr))
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR(mp)	((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
+#define	XFS_AGFL_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
+#define	XFS_BUF_TO_AGFL(bp)	((xfs_agfl_t *)((bp)->b_addr))
+
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		&(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+		(__be32 *)(bp)->b_addr)
+
+/*
+ * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
+ */
+#define XFS_AGFL_SIZE(mp) \
+	(((mp)->m_sb.sb_sectsize - \
+	 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		sizeof(struct xfs_agfl) : 0)) / \
+	  sizeof(xfs_agblock_t))
+
+typedef struct xfs_agfl {
+	__be32		agfl_magicnum;
+	__be32		agfl_seqno;
+	uuid_t		agfl_uuid;
+	__be64		agfl_lsn;
+	__be32		agfl_crc;
+	__be32		agfl_bno[];	/* actually XFS_AGFL_SIZE(mp) */
+} xfs_agfl_t;
+
+#define XFS_AGFL_CRC_OFF	offsetof(struct xfs_agfl, agfl_crc)
+
+
+#define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)
+#define	XFS_MIN_FREELIST_RAW(bl,cl,mp)	\
+	(MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
+#define	XFS_MIN_FREELIST(a,mp)		\
+	(XFS_MIN_FREELIST_RAW(		\
+		be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
+		be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
+#define	XFS_MIN_FREELIST_PAG(pag,mp)	\
+	(XFS_MIN_FREELIST_RAW(		\
+		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+
+#define XFS_AGB_TO_FSB(mp,agno,agbno)	\
+	(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#define	XFS_FSB_TO_AGNO(mp,fsbno)	\
+	((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#define	XFS_FSB_TO_AGBNO(mp,fsbno)	\
+	((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
+#define	XFS_AGB_TO_DADDR(mp,agno,agbno)	\
+	((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
+		(xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
+#define	XFS_AG_DADDR(mp,agno,d)		(XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#define	XFS_AG_CHECK_DADDR(mp,d,len)	\
+	((len) == 1 ? \
+	    ASSERT((d) == XFS_SB_DADDR || \
+		   xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+	    ASSERT(xfs_daddr_to_agno(mp, d) == \
+		   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
+
+typedef struct xfs_timestamp {
+	__be32		t_sec;		/* timestamp seconds */
+	__be32		t_nsec;		/* timestamp nanoseconds */
+} xfs_timestamp_t;
+
+/*
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core", the inode is expanded to fill a
+ * variable size the leftover area split into a data and an attribute fork.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat.  To access the data and
+ * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
+ * below.
+ *
+ * There is a very similar struct icdinode in xfs_inode which matches the
+ * layout of the first 96 bytes of this structure, but is kept in native
+ * format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
+ */
+#define	XFS_DINODE_MAGIC		0x494e	/* 'IN' */
+#define XFS_DINODE_GOOD_VERSION(v)	((v) >= 1 && (v) <= 3)
+typedef struct xfs_dinode {
+	__be16		di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
+	__be16		di_mode;	/* mode and type of file */
+	__u8		di_version;	/* inode version */
+	__u8		di_format;	/* format of di_c data */
+	__be16		di_onlink;	/* old number of links to file */
+	__be32		di_uid;		/* owner's user id */
+	__be32		di_gid;		/* owner's group id */
+	__be32		di_nlink;	/* number of links to file */
+	__be16		di_projid_lo;	/* lower part of owner's project id */
+	__be16		di_projid_hi;	/* higher part owner's project id */
+	__u8		di_pad[6];	/* unused, zeroed space */
+	__be16		di_flushiter;	/* incremented on flush */
+	xfs_timestamp_t	di_atime;	/* time last accessed */
+	xfs_timestamp_t	di_mtime;	/* time last modified */
+	xfs_timestamp_t	di_ctime;	/* time created/inode modified */
+	__be64		di_size;	/* number of bytes in file */
+	__be64		di_nblocks;	/* # of direct & btree blocks used */
+	__be32		di_extsize;	/* basic/minimum extent size for file */
+	__be32		di_nextents;	/* number of extents in data fork */
+	__be16		di_anextents;	/* number of extents in attribute fork*/
+	__u8		di_forkoff;	/* attr fork offs, <<3 for 64b align */
+	__s8		di_aformat;	/* format of attr fork's data */
+	__be32		di_dmevmask;	/* DMIG event mask */
+	__be16		di_dmstate;	/* DMIG state info */
+	__be16		di_flags;	/* random flags, XFS_DIFLAG_... */
+	__be32		di_gen;		/* generation number */
+
+	/* di_next_unlinked is the only non-core field in the old dinode */
+	__be32		di_next_unlinked;/* agi unlinked list ptr */
+
+	/* start of the extended dinode, writable fields */
+	__le32		di_crc;		/* CRC of the inode */
+	__be64		di_changecount;	/* number of attribute changes */
+	__be64		di_lsn;		/* flush sequence */
+	__be64		di_flags2;	/* more random flags */
+	__u8		di_pad2[16];	/* more padding for future expansion */
+
+	/* fields only written to during inode creation */
+	xfs_timestamp_t	di_crtime;	/* time created */
+	__be64		di_ino;		/* inode number */
+	uuid_t		di_uuid;	/* UUID of the filesystem */
+
+	/* structure must be padded to 64 bit alignment */
+} xfs_dinode_t;
+
+#define XFS_DINODE_CRC_OFF	offsetof(struct xfs_dinode, di_crc)
+
+#define DI_MAX_FLUSH 0xffff
+
+/*
+ * Size of the core inode on disk.  Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+	if (version == 3)
+		return sizeof(struct xfs_dinode);
+	return offsetof(struct xfs_dinode, di_crc);
+}
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
+ */
+#define	XFS_MAXLINK		((1U << 31) - 1U)
+#define	XFS_MAXLINK_1		65535U
+
+/*
+ * Values for di_format
+ */
+typedef enum xfs_dinode_fmt {
+	XFS_DINODE_FMT_DEV,		/* xfs_dev_t */
+	XFS_DINODE_FMT_LOCAL,		/* bulk data */
+	XFS_DINODE_FMT_EXTENTS,		/* struct xfs_bmbt_rec */
+	XFS_DINODE_FMT_BTREE,		/* struct xfs_bmdr_block */
+	XFS_DINODE_FMT_UUID		/* uuid_t */
+} xfs_dinode_fmt_t;
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define	XFS_DINODE_MIN_LOG	8
+#define	XFS_DINODE_MAX_LOG	11
+#define	XFS_DINODE_MIN_SIZE	(1 << XFS_DINODE_MIN_LOG)
+#define	XFS_DINODE_MAX_SIZE	(1 << XFS_DINODE_MAX_LOG)
+
+/*
+ * Inode size for given fs.
+ */
+#define XFS_LITINO(mp, version) \
+	((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#define XFS_DFORK_Q(dip)		((dip)->di_forkoff != 0)
+#define XFS_DFORK_BOFF(dip)		((int)((dip)->di_forkoff << 3))
+
+#define XFS_DFORK_DSIZE(dip,mp) \
+	(XFS_DFORK_Q(dip) ? \
+		XFS_DFORK_BOFF(dip) : \
+		XFS_LITINO(mp, (dip)->di_version))
+#define XFS_DFORK_ASIZE(dip,mp) \
+	(XFS_DFORK_Q(dip) ? \
+		XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
+		0)
+#define XFS_DFORK_SIZE(dip,mp,w) \
+	((w) == XFS_DATA_FORK ? \
+		XFS_DFORK_DSIZE(dip, mp) : \
+		XFS_DFORK_ASIZE(dip, mp))
+
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+	((char *)dip + xfs_dinode_size(dip->di_version))
+#define XFS_DFORK_APTR(dip)	\
+	(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
+#define XFS_DFORK_PTR(dip,w)	\
+	((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+
+#define XFS_DFORK_FORMAT(dip,w) \
+	((w) == XFS_DATA_FORK ? \
+		(dip)->di_format : \
+		(dip)->di_aformat)
+#define XFS_DFORK_NEXTENTS(dip,w) \
+	((w) == XFS_DATA_FORK ? \
+		be32_to_cpu((dip)->di_nextents) : \
+		be16_to_cpu((dip)->di_anextents))
+
+/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+	return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
+
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+	*(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
+
+/*
+ * Values for di_flags
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_XFLAG_s.
+ */
+#define XFS_DIFLAG_REALTIME_BIT  0	/* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT  1	/* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT   2	/* for rtbitmap inode, new format */
+#define XFS_DIFLAG_IMMUTABLE_BIT 3	/* inode is immutable */
+#define XFS_DIFLAG_APPEND_BIT    4	/* inode is append-only */
+#define XFS_DIFLAG_SYNC_BIT      5	/* inode is written synchronously */
+#define XFS_DIFLAG_NOATIME_BIT   6	/* do not update atime */
+#define XFS_DIFLAG_NODUMP_BIT    7	/* do not dump */
+#define XFS_DIFLAG_RTINHERIT_BIT 8	/* create with realtime bit set */
+#define XFS_DIFLAG_PROJINHERIT_BIT   9	/* create with parents projid */
+#define XFS_DIFLAG_NOSYMLINKS_BIT   10	/* disallow symlink creation */
+#define XFS_DIFLAG_EXTSIZE_BIT      11	/* inode extent size allocator hint */
+#define XFS_DIFLAG_EXTSZINHERIT_BIT 12	/* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT     13	/* do not reorganize/defragment */
+#define XFS_DIFLAG_FILESTREAM_BIT   14  /* use filestream allocator */
+#define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_IMMUTABLE     (1 << XFS_DIFLAG_IMMUTABLE_BIT)
+#define XFS_DIFLAG_APPEND        (1 << XFS_DIFLAG_APPEND_BIT)
+#define XFS_DIFLAG_SYNC          (1 << XFS_DIFLAG_SYNC_BIT)
+#define XFS_DIFLAG_NOATIME       (1 << XFS_DIFLAG_NOATIME_BIT)
+#define XFS_DIFLAG_NODUMP        (1 << XFS_DIFLAG_NODUMP_BIT)
+#define XFS_DIFLAG_RTINHERIT     (1 << XFS_DIFLAG_RTINHERIT_BIT)
+#define XFS_DIFLAG_PROJINHERIT   (1 << XFS_DIFLAG_PROJINHERIT_BIT)
+#define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
+#define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
+#define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
+#define XFS_DIFLAG_FILESTREAM    (1 << XFS_DIFLAG_FILESTREAM_BIT)
+
+#define XFS_DIFLAG_ANY \
+	(XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
+	 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
+	 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
+	 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
+	 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
+
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * high agno_log-agblklog-inopblog bits - 0
+ */
+#define	XFS_INO_MASK(k)			(__uint32_t)((1ULL << (k)) - 1)
+#define	XFS_INO_OFFSET_BITS(mp)		(mp)->m_sb.sb_inopblog
+#define	XFS_INO_AGBNO_BITS(mp)		(mp)->m_sb.sb_agblklog
+#define	XFS_INO_AGINO_BITS(mp)		(mp)->m_agino_log
+#define	XFS_INO_AGNO_BITS(mp)		(mp)->m_agno_log
+#define	XFS_INO_BITS(mp)		\
+	XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
+#define	XFS_INO_TO_AGNO(mp,i)		\
+	((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+#define	XFS_INO_TO_AGINO(mp,i)		\
+	((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+#define	XFS_INO_TO_AGBNO(mp,i)		\
+	(((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+		XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+#define	XFS_INO_TO_OFFSET(mp,i)		\
+	((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define	XFS_INO_TO_FSB(mp,i)		\
+	XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+#define	XFS_AGINO_TO_INO(mp,a,i)	\
+	(((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+#define	XFS_AGINO_TO_AGBNO(mp,i)	((i) >> XFS_INO_OFFSET_BITS(mp))
+#define	XFS_AGINO_TO_OFFSET(mp,i)	\
+	((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define	XFS_OFFBNO_TO_AGINO(mp,b,o)	\
+	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+
+#define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
+#define	XFS_MAXINUMBER_32	((xfs_ino_t)((1ULL << 32) - 1ULL))
+
+/*
+ * RealTime Device format definitions
+ */
+
+/* Min and max rt extent sizes, specified in bytes */
+#define	XFS_MAX_RTEXTSIZE	(1024 * 1024 * 1024)	/* 1GB */
+#define	XFS_DFL_RTEXTSIZE	(64 * 1024)	        /* 64kB */
+#define	XFS_MIN_RTEXTSIZE	(4 * 1024)		/* 4kB */
+
+#define	XFS_BLOCKSIZE(mp)	((mp)->m_sb.sb_blocksize)
+#define	XFS_BLOCKMASK(mp)	((mp)->m_blockmask)
+#define	XFS_BLOCKWSIZE(mp)	((mp)->m_blockwsize)
+#define	XFS_BLOCKWMASK(mp)	((mp)->m_blockwmask)
+
+/*
+ * RT Summary and bit manipulation macros.
+ */
+#define	XFS_SUMOFFS(mp,ls,bb)	((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define	XFS_SUMOFFSTOBLOCK(mp,s)	\
+	(((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define	XFS_SUMPTR(mp,bp,so)	\
+	((xfs_suminfo_t *)((bp)->b_addr + \
+		(((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define	XFS_BITTOBLOCK(mp,bi)	((bi) >> (mp)->m_blkbit_log)
+#define	XFS_BLOCKTOBIT(mp,bb)	((bb) << (mp)->m_blkbit_log)
+#define	XFS_BITTOWORD(mp,bi)	\
+	((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define	XFS_RTMIN(a,b)	((a) < (b) ? (a) : (b))
+#define	XFS_RTMAX(a,b)	((a) > (b) ? (a) : (b))
+
+#define	XFS_RTLOBIT(w)	xfs_lowbit32(w)
+#define	XFS_RTHIBIT(w)	xfs_highbit32(w)
+
+#define	XFS_RTBLOCKLOG(b)	xfs_highbit64(b)
+
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC		0x4451		/* 'DQ' */
+#define XFS_DQUOT_VERSION	(u_int8_t)0x01	/* latest version number */
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ */
+typedef struct	xfs_disk_dquot {
+	__be16		d_magic;	/* dquot magic = XFS_DQUOT_MAGIC */
+	__u8		d_version;	/* dquot version */
+	__u8		d_flags;	/* XFS_DQ_USER/PROJ/GROUP */
+	__be32		d_id;		/* user,project,group id */
+	__be64		d_blk_hardlimit;/* absolute limit on disk blks */
+	__be64		d_blk_softlimit;/* preferred limit on disk blks */
+	__be64		d_ino_hardlimit;/* maximum # allocated inodes */
+	__be64		d_ino_softlimit;/* preferred inode limit */
+	__be64		d_bcount;	/* disk blocks owned by the user */
+	__be64		d_icount;	/* inodes owned by the user */
+	__be32		d_itimer;	/* zero if within inode limits if not,
+					   this is when we refuse service */
+	__be32		d_btimer;	/* similar to above; for disk blocks */
+	__be16		d_iwarns;	/* warnings issued wrt num inodes */
+	__be16		d_bwarns;	/* warnings issued wrt disk blocks */
+	__be32		d_pad0;		/* 64 bit align */
+	__be64		d_rtb_hardlimit;/* absolute limit on realtime blks */
+	__be64		d_rtb_softlimit;/* preferred limit on RT disk blks */
+	__be64		d_rtbcount;	/* realtime blocks owned */
+	__be32		d_rtbtimer;	/* similar to above; for RT disk blocks */
+	__be16		d_rtbwarns;	/* warnings issued wrt RT disk blocks */
+	__be16		d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+	xfs_disk_dquot_t  dd_diskdq;	/* portion that lives incore as well */
+	char		  dd_fill[4];	/* filling for posterity */
+
+	/*
+	 * These two are only present on filesystems with the CRC bits set.
+	 */
+	__be32		  dd_crc;	/* checksum */
+	__be64		  dd_lsn;	/* last modification in log */
+	uuid_t		  dd_uuid;	/* location information */
+} xfs_dqblk_t;
+
+#define XFS_DQUOT_CRC_OFF	offsetof(struct xfs_dqblk, dd_crc)
+
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC	0x58534c4d	/* XSLM */
+
+struct xfs_dsymlink_hdr {
+	__be32	sl_magic;
+	__be32	sl_offset;
+	__be32	sl_bytes;
+	__be32	sl_crc;
+	uuid_t	sl_uuid;
+	__be64	sl_owner;
+	__be64	sl_blkno;
+	__be64	sl_lsn;
+};
+
+#define XFS_SYMLINK_CRC_OFF	offsetof(struct xfs_dsymlink_hdr, sl_crc)
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)	\
+	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+			sizeof(struct xfs_dsymlink_hdr) : 0))
+
+
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno.  All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define	XFS_ABTB_MAGIC		0x41425442	/* 'ABTB' for bno tree */
+#define	XFS_ABTB_CRC_MAGIC	0x41423342	/* 'AB3B' */
+#define	XFS_ABTC_MAGIC		0x41425443	/* 'ABTC' for cnt tree */
+#define	XFS_ABTC_CRC_MAGIC	0x41423343	/* 'AB3C' */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+	__be32		ar_startblock;	/* starting block number */
+	__be32		ar_blockcount;	/* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+	xfs_agblock_t	ar_startblock;	/* starting block number */
+	xfs_extlen_t	ar_blockcount;	/* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define	XFS_BNO_BLOCK(mp)	((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define	XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define	XFS_IBT_MAGIC		0x49414254	/* 'IABT' */
+#define	XFS_IBT_CRC_MAGIC	0x49414233	/* 'IAB3' */
+#define	XFS_FIBT_MAGIC		0x46494254	/* 'FIBT' */
+#define	XFS_FIBT_CRC_MAGIC	0x46494233	/* 'FIB3' */
+
+typedef	__uint64_t	xfs_inofree_t;
+#define	XFS_INODES_PER_CHUNK		(NBBY * sizeof(xfs_inofree_t))
+#define	XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
+#define	XFS_INOBT_ALL_FREE		((xfs_inofree_t)-1)
+#define	XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+	return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec {
+	__be32		ir_startino;	/* starting inode number */
+	__be32		ir_freecount;	/* count of free inodes (set bits) */
+	__be64		ir_free;	/* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+	xfs_agino_t	ir_startino;	/* starting inode number */
+	__int32_t	ir_freecount;	/* count of free inodes (set bits) */
+	xfs_inofree_t	ir_free;	/* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+	__be32		ir_startino;	/* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define	XFS_IBT_BLOCK(mp)		((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define	XFS_FIBT_BLOCK(mp)		((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * The first data block of an AG depends on whether the filesystem was formatted
+ * with the finobt feature. If so, account for the finobt reserved root btree
+ * block.
+ */
+#define XFS_PREALLOC_BLOCKS(mp) \
+	(xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+	 XFS_FIBT_BLOCK(mp) + 1 : \
+	 XFS_IBT_BLOCK(mp) + 1)
+
+
+
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC		0x424d4150	/* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC	0x424d4133	/* 'BMA3' */
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+	__be16		bb_level;	/* 0 is a leaf */
+	__be16		bb_numrecs;	/* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ *  l0:63 is an extent flag (value 1 indicates non-normal).
+ *  l0:9-62 are startoff.
+ *  l0:0-8 and l1:21-63 are startblock.
+ *  l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN	1
+#define BMBT_STARTOFF_BITLEN	54
+#define BMBT_STARTBLOCK_BITLEN	52
+#define BMBT_BLOCKCOUNT_BITLEN	21
+
+typedef struct xfs_bmbt_rec {
+	__be64			l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef __uint64_t	xfs_bmbt_rec_base_t;	/* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+typedef struct xfs_bmbt_rec_host {
+	__uint64_t		l0, l1;
+} xfs_bmbt_rec_host_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS	17
+#define STARTBLOCKMASKBITS	(15 + 20)
+#define STARTBLOCKMASK		\
+	(((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+	return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+	ASSERT(k < (1 << STARTBLOCKVALBITS));
+	return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+	return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+	XFS_EXTFMT_NOSTATE = 0,
+	XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+	XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+	XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
+} xfs_exntst_t;
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+	xfs_fileoff_t	br_startoff;	/* starting file offset */
+	xfs_fsblock_t	br_startblock;	/* starting block number */
+	xfs_filblks_t	br_blockcount;	/* number of blocks */
+	xfs_exntst_t	br_state;	/* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+	__be64		br_startoff;	/* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees.  The first three fields are shared by both format, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below.  Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+struct xfs_btree_block {
+	__be32		bb_magic;	/* magic number for block type */
+	__be16		bb_level;	/* 0 is a leaf */
+	__be16		bb_numrecs;	/* current # of data records */
+	union {
+		struct {
+			__be32		bb_leftsib;
+			__be32		bb_rightsib;
+
+			__be64		bb_blkno;
+			__be64		bb_lsn;
+			uuid_t		bb_uuid;
+			__be32		bb_owner;
+			__le32		bb_crc;
+		} s;			/* short form pointers */
+		struct	{
+			__be64		bb_leftsib;
+			__be64		bb_rightsib;
+
+			__be64		bb_blkno;
+			__be64		bb_lsn;
+			uuid_t		bb_uuid;
+			__be64		bb_owner;
+			__le32		bb_crc;
+			__be32		bb_pad; /* padding for alignment */
+		} l;			/* long form pointers */
+	} bb_u;				/* rest */
+};
+
+#define XFS_BTREE_SBLOCK_LEN	16	/* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN	24	/* size of a long form block */
+
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN	(XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN	(XFS_BTREE_LBLOCK_LEN + 48)
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+	offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+	offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+
+/*
+ * On-disk XFS access control list structure.
+ */
+struct xfs_acl_entry {
+	__be32	ae_tag;
+	__be32	ae_id;
+	__be16	ae_perm;
+	__be16	ae_pad;		/* fill the implicit hole in the structure */
+};
+
+struct xfs_acl {
+	__be32			acl_cnt;
+	struct xfs_acl_entry	acl_entry[0];
+};
+
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp)	\
+	(xfs_sb_version_hascrc(&mp->m_sb) \
+		?  (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+						sizeof(struct xfs_acl_entry) \
+		: 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+	(sizeof(struct xfs_acl) + \
+		sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+
+/* On-disk XFS extended attribute names */
+#define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
+#define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
+
+#endif /* __XFS_FORMAT_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_fs.h b/kernel/fs/xfs/libxfs/xfs_fs.h
new file mode 100644
index 000000000..18dc721ca
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_fs.h
@@ -0,0 +1,576 @@
+/*
+ * Copyright (c) 1995-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_FS_H__
+#define __XFS_FS_H__
+
+/*
+ * SGI's XFS filesystem's major stuff (constants, structures)
+ */
+
+/*
+ * Direct I/O attribute record used with XFS_IOC_DIOINFO
+ * d_miniosz is the min xfer size, xfer size multiple and file seek offset
+ * alignment.
+ */
+#ifndef HAVE_DIOATTR
+struct dioattr {
+	__u32		d_mem;		/* data buffer memory alignment */
+	__u32		d_miniosz;	/* min xfer size		*/
+	__u32		d_maxiosz;	/* max xfer size		*/
+};
+#endif
+
+/*
+ * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
+ */
+#ifndef HAVE_FSXATTR
+struct fsxattr {
+	__u32		fsx_xflags;	/* xflags field value (get/set) */
+	__u32		fsx_extsize;	/* extsize field value (get/set)*/
+	__u32		fsx_nextents;	/* nextents field value (get)	*/
+	__u32		fsx_projid;	/* project identifier (get/set) */
+	unsigned char	fsx_pad[12];
+};
+#endif
+
+/*
+ * Flags for the bs_xflags/fsx_xflags field
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_DIFLAG_s.
+ */
+#define XFS_XFLAG_REALTIME	0x00000001	/* data in realtime volume */
+#define XFS_XFLAG_PREALLOC	0x00000002	/* preallocated file extents */
+#define XFS_XFLAG_IMMUTABLE	0x00000008	/* file cannot be modified */
+#define XFS_XFLAG_APPEND	0x00000010	/* all writes append */
+#define XFS_XFLAG_SYNC		0x00000020	/* all writes synchronous */
+#define XFS_XFLAG_NOATIME	0x00000040	/* do not update access time */
+#define XFS_XFLAG_NODUMP	0x00000080	/* do not include in backups */
+#define XFS_XFLAG_RTINHERIT	0x00000100	/* create with rt bit set */
+#define XFS_XFLAG_PROJINHERIT	0x00000200	/* create with parents projid */
+#define XFS_XFLAG_NOSYMLINKS	0x00000400	/* disallow symlink creation */
+#define XFS_XFLAG_EXTSIZE	0x00000800	/* extent size allocator hint */
+#define XFS_XFLAG_EXTSZINHERIT	0x00001000	/* inherit inode extent size */
+#define XFS_XFLAG_NODEFRAG	0x00002000  	/* do not defragment */
+#define XFS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
+#define XFS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
+
+/*
+ * Structure for XFS_IOC_GETBMAP.
+ * On input, fill in bmv_offset and bmv_length of the first structure
+ * to indicate the area of interest in the file, and bmv_entries with
+ * the number of array elements given back.  The first structure is
+ * updated on return to give the offset and length for the next call.
+ */
+#ifndef HAVE_GETBMAP
+struct getbmap {
+	__s64		bmv_offset;	/* file offset of segment in blocks */
+	__s64		bmv_block;	/* starting block (64-bit daddr_t)  */
+	__s64		bmv_length;	/* length of segment, blocks	    */
+	__s32		bmv_count;	/* # of entries in array incl. 1st  */
+	__s32		bmv_entries;	/* # of entries filled in (output)  */
+};
+#endif
+
+/*
+ *	Structure for XFS_IOC_GETBMAPX.	 Fields bmv_offset through bmv_entries
+ *	are used exactly as in the getbmap structure.  The getbmapx structure
+ *	has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field
+ *	is only used for the first structure.  It contains input flags
+ *	specifying XFS_IOC_GETBMAPX actions.  The bmv_oflags field is filled
+ *	in by the XFS_IOC_GETBMAPX command for each returned structure after
+ *	the first.
+ */
+#ifndef HAVE_GETBMAPX
+struct getbmapx {
+	__s64		bmv_offset;	/* file offset of segment in blocks */
+	__s64		bmv_block;	/* starting block (64-bit daddr_t)  */
+	__s64		bmv_length;	/* length of segment, blocks	    */
+	__s32		bmv_count;	/* # of entries in array incl. 1st  */
+	__s32		bmv_entries;	/* # of entries filled in (output). */
+	__s32		bmv_iflags;	/* input flags (1st structure)	    */
+	__s32		bmv_oflags;	/* output flags (after 1st structure)*/
+	__s32		bmv_unused1;	/* future use			    */
+	__s32		bmv_unused2;	/* future use			    */
+};
+#endif
+
+/*	bmv_iflags values - set by XFS_IOC_GETBMAPX caller.	*/
+#define BMV_IF_ATTRFORK		0x1	/* return attr fork rather than data */
+#define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event  */
+#define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
+#define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
+#define BMV_IF_VALID	\
+	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
+
+/*	bmv_oflags values - returned for each non-header segment */
+#define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
+#define BMV_OF_DELALLOC		0x2	/* segment = delayed allocation */
+#define BMV_OF_LAST		0x4	/* segment is the last in the file */
+
+/*
+ * Structure for XFS_IOC_FSSETDM.
+ * For use by backup and restore programs to set the XFS on-disk inode
+ * fields di_dmevmask and di_dmstate.  These must be set to exactly and
+ * only values previously obtained via xfs_bulkstat!  (Specifically the
+ * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
+ */
+#ifndef HAVE_FSDMIDATA
+struct fsdmidata {
+	__u32		fsd_dmevmask;	/* corresponds to di_dmevmask */
+	__u16		fsd_padding;
+	__u16		fsd_dmstate;	/* corresponds to di_dmstate  */
+};
+#endif
+
+/*
+ * File segment locking set data type for 64 bit access.
+ * Also used for all the RESV/FREE interfaces.
+ */
+typedef struct xfs_flock64 {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start;
+	__s64		l_len;		/* len == 0 means until end of file */
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area			    */
+} xfs_flock64_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V1
+ */
+typedef struct xfs_fsop_geom_v1 {
+	__u32		blocksize;	/* filesystem (data) block size */
+	__u32		rtextsize;	/* realtime extent size		*/
+	__u32		agblocks;	/* fsblocks in an AG		*/
+	__u32		agcount;	/* number of allocation groups	*/
+	__u32		logblocks;	/* fsblocks in the log		*/
+	__u32		sectsize;	/* (data) sector size, bytes	*/
+	__u32		inodesize;	/* inode size in bytes		*/
+	__u32		imaxpct;	/* max allowed inode space(%)	*/
+	__u64		datablocks;	/* fsblocks in data subvolume	*/
+	__u64		rtblocks;	/* fsblocks in realtime subvol	*/
+	__u64		rtextents;	/* rt extents in realtime subvol*/
+	__u64		logstart;	/* starting fsblock of the log	*/
+	unsigned char	uuid[16];	/* unique id of the filesystem	*/
+	__u32		sunit;		/* stripe unit, fsblocks	*/
+	__u32		swidth;		/* stripe width, fsblocks	*/
+	__s32		version;	/* structure version		*/
+	__u32		flags;		/* superblock version flags	*/
+	__u32		logsectsize;	/* log sector size, bytes	*/
+	__u32		rtsectsize;	/* realtime sector size, bytes	*/
+	__u32		dirblocksize;	/* directory block size, bytes	*/
+} xfs_fsop_geom_v1_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY
+ */
+typedef struct xfs_fsop_geom {
+	__u32		blocksize;	/* filesystem (data) block size */
+	__u32		rtextsize;	/* realtime extent size		*/
+	__u32		agblocks;	/* fsblocks in an AG		*/
+	__u32		agcount;	/* number of allocation groups	*/
+	__u32		logblocks;	/* fsblocks in the log		*/
+	__u32		sectsize;	/* (data) sector size, bytes	*/
+	__u32		inodesize;	/* inode size in bytes		*/
+	__u32		imaxpct;	/* max allowed inode space(%)	*/
+	__u64		datablocks;	/* fsblocks in data subvolume	*/
+	__u64		rtblocks;	/* fsblocks in realtime subvol	*/
+	__u64		rtextents;	/* rt extents in realtime subvol*/
+	__u64		logstart;	/* starting fsblock of the log	*/
+	unsigned char	uuid[16];	/* unique id of the filesystem	*/
+	__u32		sunit;		/* stripe unit, fsblocks	*/
+	__u32		swidth;		/* stripe width, fsblocks	*/
+	__s32		version;	/* structure version		*/
+	__u32		flags;		/* superblock version flags	*/
+	__u32		logsectsize;	/* log sector size, bytes	*/
+	__u32		rtsectsize;	/* realtime sector size, bytes	*/
+	__u32		dirblocksize;	/* directory block size, bytes	*/
+	__u32		logsunit;	/* log stripe unit, bytes */
+} xfs_fsop_geom_t;
+
+/* Output for XFS_FS_COUNTS */
+typedef struct xfs_fsop_counts {
+	__u64	freedata;	/* free data section blocks */
+	__u64	freertx;	/* free rt extents */
+	__u64	freeino;	/* free inodes */
+	__u64	allocino;	/* total allocated inodes */
+} xfs_fsop_counts_t;
+
+/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */
+typedef struct xfs_fsop_resblks {
+	__u64  resblks;
+	__u64  resblks_avail;
+} xfs_fsop_resblks_t;
+
+#define XFS_FSOP_GEOM_VERSION	0
+
+#define XFS_FSOP_GEOM_FLAGS_ATTR	0x0001	/* attributes in use	*/
+#define XFS_FSOP_GEOM_FLAGS_NLINK	0x0002	/* 32-bit nlink values	*/
+#define XFS_FSOP_GEOM_FLAGS_QUOTA	0x0004	/* quotas enabled	*/
+#define XFS_FSOP_GEOM_FLAGS_IALIGN	0x0008	/* inode alignment	*/
+#define XFS_FSOP_GEOM_FLAGS_DALIGN	0x0010	/* large data alignment */
+#define XFS_FSOP_GEOM_FLAGS_SHARED	0x0020	/* read-only shared	*/
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG	0x0040	/* special extent flag	*/
+#define XFS_FSOP_GEOM_FLAGS_DIRV2	0x0080	/* directory version 2	*/
+#define XFS_FSOP_GEOM_FLAGS_LOGV2	0x0100	/* log format version 2	*/
+#define XFS_FSOP_GEOM_FLAGS_SECTOR	0x0200	/* sector sizes >1BB	*/
+#define XFS_FSOP_GEOM_FLAGS_ATTR2	0x0400	/* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800	/* 32-bit project IDs	*/
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
+#define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB	0x8000	/* version 5 superblock */
+#define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* free inode btree */
+
+/*
+ * Minimum and maximum sizes need for growth checks.
+ *
+ * Block counts are in units of filesystem blocks, not basic blocks.
+ */
+#define XFS_MIN_AG_BLOCKS	64
+#define XFS_MIN_LOG_BLOCKS	512ULL
+#define XFS_MAX_LOG_BLOCKS	(1024 * 1024ULL)
+#define XFS_MIN_LOG_BYTES	(10 * 1024 * 1024ULL)
+
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+	((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
+
+/* Used for sanity checks on superblock */
+#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks)
+#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) *	\
+			 (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
+
+/*
+ * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
+ */
+typedef struct xfs_growfs_data {
+	__u64		newblocks;	/* new data subvol size, fsblocks */
+	__u32		imaxpct;	/* new inode space percentage limit */
+} xfs_growfs_data_t;
+
+typedef struct xfs_growfs_log {
+	__u32		newblocks;	/* new log size, fsblocks */
+	__u32		isint;		/* 1 if new log is internal */
+} xfs_growfs_log_t;
+
+typedef struct xfs_growfs_rt {
+	__u64		newblocks;	/* new realtime size, fsblocks */
+	__u32		extsize;	/* new realtime extent size, fsblocks */
+} xfs_growfs_rt_t;
+
+
+/*
+ * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
+ */
+typedef struct xfs_bstime {
+	time_t		tv_sec;		/* seconds		*/
+	__s32		tv_nsec;	/* and nanoseconds	*/
+} xfs_bstime_t;
+
+typedef struct xfs_bstat {
+	__u64		bs_ino;		/* inode number			*/
+	__u16		bs_mode;	/* type and mode		*/
+	__u16		bs_nlink;	/* number of links		*/
+	__u32		bs_uid;		/* user id			*/
+	__u32		bs_gid;		/* group id			*/
+	__u32		bs_rdev;	/* device value			*/
+	__s32		bs_blksize;	/* block size			*/
+	__s64		bs_size;	/* file size			*/
+	xfs_bstime_t	bs_atime;	/* access time			*/
+	xfs_bstime_t	bs_mtime;	/* modify time			*/
+	xfs_bstime_t	bs_ctime;	/* inode change time		*/
+	int64_t		bs_blocks;	/* number of blocks		*/
+	__u32		bs_xflags;	/* extended flags		*/
+	__s32		bs_extsize;	/* extent size			*/
+	__s32		bs_extents;	/* number of extents		*/
+	__u32		bs_gen;		/* generation count		*/
+	__u16		bs_projid_lo;	/* lower part of project id	*/
+#define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
+	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
+	__u16		bs_projid_hi;	/* higher part of project id	*/
+	unsigned char	bs_pad[10];	/* pad space, unused		*/
+	__u32		bs_dmevmask;	/* DMIG event mask		*/
+	__u16		bs_dmstate;	/* DMIG state info		*/
+	__u16		bs_aextents;	/* attribute number of extents	*/
+} xfs_bstat_t;
+
+/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was choosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline __uint32_t
+bstat_get_projid(struct xfs_bstat *bs)
+{
+	return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
+}
+
+/*
+ * The user-level BulkStat Request interface structure.
+ */
+typedef struct xfs_fsop_bulkreq {
+	__u64		__user *lastip;	/* last inode # pointer		*/
+	__s32		icount;		/* count of entries in buffer	*/
+	void		__user *ubuffer;/* user buffer for inode desc.	*/
+	__s32		__user *ocount;	/* output count pointer		*/
+} xfs_fsop_bulkreq_t;
+
+
+/*
+ * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
+ */
+typedef struct xfs_inogrp {
+	__u64		xi_startino;	/* starting inode number	*/
+	__s32		xi_alloccount;	/* # bits set in allocmask	*/
+	__u64		xi_allocmask;	/* mask of allocated inodes	*/
+} xfs_inogrp_t;
+
+
+/*
+ * Error injection.
+ */
+typedef struct xfs_error_injection {
+	__s32		fd;
+	__s32		errtag;
+} xfs_error_injection_t;
+
+
+/*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION		1
+struct xfs_fs_eofblocks {
+	__u32		eof_version;
+	__u32		eof_flags;
+	uid_t		eof_uid;
+	gid_t		eof_gid;
+	prid_t		eof_prid;
+	__u32		pad32;
+	__u64		eof_min_file_size;
+	__u64		pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC		(1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID		(1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID		(1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID		(1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE	(1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_UNION		(1 << 5) /* union filter algorithm;
+						  * kernel only, not included in
+						  * valid mask */
+#define XFS_EOF_FLAGS_VALID	\
+	(XFS_EOF_FLAGS_SYNC |	\
+	 XFS_EOF_FLAGS_UID |	\
+	 XFS_EOF_FLAGS_GID |	\
+	 XFS_EOF_FLAGS_PRID |	\
+	 XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
+ * The user-level Handle Request interface structure.
+ */
+typedef struct xfs_fsop_handlereq {
+	__u32		fd;		/* fd for FD_TO_HANDLE		*/
+	void		__user *path;	/* user pathname		*/
+	__u32		oflags;		/* open flags			*/
+	void		__user *ihandle;/* user supplied handle		*/
+	__u32		ihandlen;	/* user supplied length		*/
+	void		__user *ohandle;/* user buffer for handle	*/
+	__u32		__user *ohandlen;/* user buffer length		*/
+} xfs_fsop_handlereq_t;
+
+/*
+ * Compound structures for passing args through Handle Request interfaces
+ * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
+ *	     XFS_IOC_ATTRMULTI_BY_HANDLE
+ */
+
+typedef struct xfs_fsop_setdm_handlereq {
+	struct xfs_fsop_handlereq	hreq;	/* handle information	*/
+	struct fsdmidata		__user *data;	/* DMAPI data	*/
+} xfs_fsop_setdm_handlereq_t;
+
+typedef struct xfs_attrlist_cursor {
+	__u32		opaque[4];
+} xfs_attrlist_cursor_t;
+
+typedef struct xfs_fsop_attrlist_handlereq {
+	struct xfs_fsop_handlereq	hreq; /* handle interface structure */
+	struct xfs_attrlist_cursor	pos; /* opaque cookie, list offset */
+	__u32				flags;	/* which namespace to use */
+	__u32				buflen;	/* length of buffer supplied */
+	void				__user *buffer;	/* returned names */
+} xfs_fsop_attrlist_handlereq_t;
+
+typedef struct xfs_attr_multiop {
+	__u32		am_opcode;
+#define ATTR_OP_GET	1	/* return the indicated attr's value */
+#define ATTR_OP_SET	2	/* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE	3	/* remove the indicated attr */
+	__s32		am_error;
+	void		__user *am_attrname;
+	void		__user *am_attrvalue;
+	__u32		am_length;
+	__u32		am_flags;
+} xfs_attr_multiop_t;
+
+typedef struct xfs_fsop_attrmulti_handlereq {
+	struct xfs_fsop_handlereq	hreq; /* handle interface structure */
+	__u32				opcount;/* count of following multiop */
+	struct xfs_attr_multiop		__user *ops; /* attr_multi data */
+} xfs_fsop_attrmulti_handlereq_t;
+
+/*
+ * per machine unique filesystem identifier types.
+ */
+typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
+
+typedef struct xfs_fid {
+	__u16	fid_len;		/* length of remainder	*/
+	__u16	fid_pad;
+	__u32	fid_gen;		/* generation number	*/
+	__u64	fid_ino;		/* 64 bits inode number */
+} xfs_fid_t;
+
+typedef struct xfs_handle {
+	union {
+		__s64	    align;	/* force alignment of ha_fid	 */
+		xfs_fsid_t  _ha_fsid;	/* unique file system identifier */
+	} ha_u;
+	xfs_fid_t	ha_fid;		/* file system specific file ID	 */
+} xfs_handle_t;
+#define ha_fsid ha_u._ha_fsid
+
+#define XFS_HSIZE(handle)	(((char *) &(handle).ha_fid.fid_pad	 \
+				 - (char *) &(handle))			  \
+				 + (handle).ha_fid.fid_len)
+
+/*
+ * Structure passed to XFS_IOC_SWAPEXT
+ */
+typedef struct xfs_swapext
+{
+	__int64_t	sx_version;	/* version */
+#define XFS_SX_VERSION		0
+	__int64_t	sx_fdtarget;	/* fd of target file */
+	__int64_t	sx_fdtmp;	/* fd of tmp file */
+	xfs_off_t	sx_offset;	/* offset into file */
+	xfs_off_t	sx_length;	/* leng from offset */
+	char		sx_pad[16];	/* pad space, unused */
+	xfs_bstat_t	sx_stat;	/* stat of target b4 copy */
+} xfs_swapext_t;
+
+/*
+ * Flags for going down operation
+ */
+#define XFS_FSOP_GOING_FLAGS_DEFAULT		0x0	/* going down */
+#define XFS_FSOP_GOING_FLAGS_LOGFLUSH		0x1	/* flush log but not data */
+#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH		0x2	/* don't flush log nor data */
+
+/*
+ * ioctl commands that are used by Linux filesystems
+ */
+#define XFS_IOC_GETXFLAGS	FS_IOC_GETFLAGS
+#define XFS_IOC_SETXFLAGS	FS_IOC_SETFLAGS
+#define XFS_IOC_GETVERSION	FS_IOC_GETVERSION
+
+/*
+ * ioctl commands that replace IRIX fcntl()'s
+ * For 'documentation' purposed more than anything else,
+ * the "cmd #" field reflects the IRIX fcntl number.
+ */
+#define XFS_IOC_ALLOCSP		_IOW ('X', 10, struct xfs_flock64)
+#define XFS_IOC_FREESP		_IOW ('X', 11, struct xfs_flock64)
+#define XFS_IOC_DIOINFO		_IOR ('X', 30, struct dioattr)
+#define XFS_IOC_FSGETXATTR	_IOR ('X', 31, struct fsxattr)
+#define XFS_IOC_FSSETXATTR	_IOW ('X', 32, struct fsxattr)
+#define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)
+#define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)
+#define XFS_IOC_GETBMAP		_IOWR('X', 38, struct getbmap)
+#define XFS_IOC_FSSETDM		_IOW ('X', 39, struct fsdmidata)
+#define XFS_IOC_RESVSP		_IOW ('X', 40, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP	_IOW ('X', 41, struct xfs_flock64)
+#define XFS_IOC_RESVSP64	_IOW ('X', 42, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP64	_IOW ('X', 43, struct xfs_flock64)
+#define XFS_IOC_GETBMAPA	_IOWR('X', 44, struct getbmap)
+#define XFS_IOC_FSGETXATTRA	_IOR ('X', 45, struct fsxattr)
+/*	XFS_IOC_SETBIOSIZE ---- deprecated 46	   */
+/*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
+#define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_fs_eofblocks)
+
+/*
+ * ioctl commands that replace IRIX syssgi()'s
+ */
+#define XFS_IOC_FSGEOMETRY_V1	     _IOR ('X', 100, struct xfs_fsop_geom_v1)
+#define XFS_IOC_FSBULKSTAT	     _IOWR('X', 101, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE    _IOWR('X', 102, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS	     _IOWR('X', 103, struct xfs_fsop_bulkreq)
+#define XFS_IOC_PATH_TO_FSHANDLE     _IOWR('X', 104, struct xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE	     _IOWR('X', 105, struct xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE	     _IOWR('X', 106, struct xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE	     _IOWR('X', 107, struct xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE   _IOWR('X', 108, struct xfs_fsop_handlereq)
+#define XFS_IOC_SWAPEXT		     _IOWR('X', 109, struct xfs_swapext)
+#define XFS_IOC_FSGROWFSDATA	     _IOW ('X', 110, struct xfs_growfs_data)
+#define XFS_IOC_FSGROWFSLOG	     _IOW ('X', 111, struct xfs_growfs_log)
+#define XFS_IOC_FSGROWFSRT	     _IOW ('X', 112, struct xfs_growfs_rt)
+#define XFS_IOC_FSCOUNTS	     _IOR ('X', 113, struct xfs_fsop_counts)
+#define XFS_IOC_SET_RESBLKS	     _IOWR('X', 114, struct xfs_fsop_resblks)
+#define XFS_IOC_GET_RESBLKS	     _IOR ('X', 115, struct xfs_fsop_resblks)
+#define XFS_IOC_ERROR_INJECTION	     _IOW ('X', 116, struct xfs_error_injection)
+#define XFS_IOC_ERROR_CLEARALL	     _IOW ('X', 117, struct xfs_error_injection)
+/*	XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118	 */
+
+/*	XFS_IOC_FREEZE		  -- FIFREEZE   119	 */
+/*	XFS_IOC_THAW		  -- FITHAW     120	 */
+#ifndef FIFREEZE
+#define XFS_IOC_FREEZE		     _IOWR('X', 119, int)
+#define XFS_IOC_THAW		     _IOWR('X', 120, int)
+#endif
+
+#define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
+#define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
+#define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
+#define XFS_IOC_FSGEOMETRY	     _IOR ('X', 124, struct xfs_fsop_geom)
+#define XFS_IOC_GOINGDOWN	     _IOR ('X', 125, __uint32_t)
+/*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
+
+
+#ifndef HAVE_BBMACROS
+/*
+ * Block I/O parameterization.	A basic block (BB) is the lowest size of
+ * filesystem allocation, and must equal 512.  Length units given to bio
+ * routines are in BB's.
+ */
+#define BBSHIFT		9
+#define BBSIZE		(1<<BBSHIFT)
+#define BBMASK		(BBSIZE-1)
+#define BTOBB(bytes)	(((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
+#define BTOBBT(bytes)	((__u64)(bytes) >> BBSHIFT)
+#define BBTOB(bbs)	((bbs) << BBSHIFT)
+#endif
+
+#endif	/* __XFS_FS_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc.c b/kernel/fs/xfs/libxfs/xfs_ialloc.c
new file mode 100644
index 000000000..1c9e75521
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_ialloc.c
@@ -0,0 +1,2202 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_trace.h"
+
+
+/*
+ * Allocation group level functions.
+ */
+static inline int
+xfs_ialloc_cluster_alignment(
+	struct xfs_mount	*mp)
+{
+	if (xfs_sb_version_hasalign(&mp->m_sb) &&
+	    mp->m_sb.sb_inoalignmt >=
+			XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
+		return mp->m_sb.sb_inoalignmt;
+	return 1;
+}
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	xfs_lookup_t		dir,	/* <=, >=, == */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_freecount = 0;
+	cur->bc_rec.i.ir_free = 0;
+	return xfs_btree_lookup(cur, dir, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int				/* error */
+xfs_inobt_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_inobt_rec_incore_t	*irec)	/* btree record */
+{
+	union xfs_btree_rec	rec;
+
+	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+	rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+	return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_inobt_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_inobt_rec_incore_t	*irec,	/* btree record */
+	int			*stat)	/* output: success/failure */
+{
+	union xfs_btree_rec	*rec;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (!error && *stat == 1) {
+		irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+		irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+		irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+	}
+	return error;
+}
+
+/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+STATIC int
+xfs_inobt_insert_rec(
+	struct xfs_btree_cur	*cur,
+	__int32_t		freecount,
+	xfs_inofree_t		free,
+	int			*stat)
+{
+	cur->bc_rec.i.ir_freecount = freecount;
+	cur->bc_rec.i.ir_free = free;
+	return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agino_t		newino,
+	xfs_agino_t		newlen,
+	xfs_btnum_t		btnum)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
+	xfs_agino_t		thisino;
+	int			i;
+	int			error;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+	for (thisino = newino;
+	     thisino < newino + newlen;
+	     thisino += XFS_INODES_PER_CHUNK) {
+		error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+		if (error) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+			return error;
+		}
+		ASSERT(i == 0);
+
+		error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+					     XFS_INOBT_ALL_FREE, &i);
+		if (error) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+			return error;
+		}
+		ASSERT(i == 1);
+	}
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+	return 0;
+}
+
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+	struct xfs_btree_cur	*cur,
+	struct xfs_agi		*agi)
+{
+	if (cur->bc_nlevels == 1) {
+		xfs_inobt_rec_incore_t rec;
+		int		freecount = 0;
+		int		error;
+		int		i;
+
+		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+		if (error)
+			return error;
+
+		do {
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
+				return error;
+
+			if (i) {
+				freecount += rec.ir_freecount;
+				error = xfs_btree_increment(cur, 0, &i);
+				if (error)
+					return error;
+			}
+		} while (i == 1);
+
+		if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+			ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+	}
+	return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi)	0
+#endif
+
+/*
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
+ */
+int
+xfs_ialloc_inode_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct list_head	*buffer_list,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		length,
+	unsigned int		gen)
+{
+	struct xfs_buf		*fbuf;
+	struct xfs_dinode	*free;
+	int			nbufs, blks_per_cluster, inodes_per_cluster;
+	int			version;
+	int			i, j;
+	xfs_daddr_t		d;
+	xfs_ino_t		ino = 0;
+
+	/*
+	 * Loop over the new block(s), filling in the inodes.  For small block
+	 * sizes, manipulate the inodes in buffers  which are multiples of the
+	 * blocks size.
+	 */
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+	nbufs = length / blks_per_cluster;
+
+	/*
+	 * Figure out what version number to use in the inodes we create.  If
+	 * the superblock version has caught up to the one that supports the new
+	 * inode format, then use the new inode version.  Otherwise use the old
+	 * version so that old kernels will continue to be able to use the file
+	 * system.
+	 *
+	 * For v3 inodes, we also need to write the inode number into the inode,
+	 * so calculate the first inode number of the chunk here as
+	 * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+	 * across multiple filesystem blocks (such as a cluster) and so cannot
+	 * be used in the cluster buffer loop below.
+	 *
+	 * Further, because we are writing the inode directly into the buffer
+	 * and calculating a CRC on the entire inode, we have ot log the entire
+	 * inode so that the entire range the CRC covers is present in the log.
+	 * That means for v3 inode we log the entire buffer rather than just the
+	 * inode cores.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		version = 3;
+		ino = XFS_AGINO_TO_INO(mp, agno,
+				       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+		/*
+		 * log the initialisation that is about to take place as an
+		 * logical operation. This means the transaction does not
+		 * need to log the physical changes to the inode buffers as log
+		 * recovery will know what initialisation is actually needed.
+		 * Hence we only need to log the buffers as "ordered" buffers so
+		 * they track in the AIL as if they were physically logged.
+		 */
+		if (tp)
+			xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+					mp->m_sb.sb_inodesize, length, gen);
+	} else
+		version = 2;
+
+	for (j = 0; j < nbufs; j++) {
+		/*
+		 * Get the block.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					 mp->m_bsize * blks_per_cluster,
+					 XBF_UNMAPPED);
+		if (!fbuf)
+			return -ENOMEM;
+
+		/* Initialize the inode buffers and log them appropriately. */
+		fbuf->b_ops = &xfs_inode_buf_ops;
+		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+		for (i = 0; i < inodes_per_cluster; i++) {
+			int	ioffset = i << mp->m_sb.sb_inodelog;
+			uint	isize = xfs_dinode_size(version);
+
+			free = xfs_make_iptr(mp, fbuf, i);
+			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+			free->di_version = version;
+			free->di_gen = cpu_to_be32(gen);
+			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+
+			if (version == 3) {
+				free->di_ino = cpu_to_be64(ino);
+				ino++;
+				uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+				xfs_dinode_calc_crc(mp, free);
+			} else if (tp) {
+				/* just log the inode core */
+				xfs_trans_log_buf(tp, fbuf, ioffset,
+						  ioffset + isize - 1);
+			}
+		}
+
+		if (tp) {
+			/*
+			 * Mark the buffer as an inode allocation buffer so it
+			 * sticks in AIL at the point of this allocation
+			 * transaction. This ensures the they are on disk before
+			 * the tail of the log can be moved past this
+			 * transaction (i.e. by preventing relogging from moving
+			 * it forward in the log).
+			 */
+			xfs_trans_inode_alloc_buf(tp, fbuf);
+			if (version == 3) {
+				/*
+				 * Mark the buffer as ordered so that they are
+				 * not physically logged in the transaction but
+				 * still tracked in the AIL as part of the
+				 * transaction and pin the log appropriately.
+				 */
+				xfs_trans_ordered_buf(tp, fbuf);
+				xfs_trans_log_buf(tp, fbuf, 0,
+						  BBTOB(fbuf->b_length) - 1);
+			}
+		} else {
+			fbuf->b_flags |= XBF_DONE;
+			xfs_buf_delwri_queue(fbuf, buffer_list);
+			xfs_buf_relse(fbuf);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Allocate new inodes in the allocation group specified by agbp.
+ * Return 0 for success, else error code.
+ */
+STATIC int				/* error code or 0 */
+xfs_ialloc_ag_alloc(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*agbp,		/* alloc group buffer */
+	int		*alloc)
+{
+	xfs_agi_t	*agi;		/* allocation group header */
+	xfs_alloc_arg_t	args;		/* allocation argument structure */
+	xfs_agnumber_t	agno;
+	int		error;
+	xfs_agino_t	newino;		/* new first inode's number */
+	xfs_agino_t	newlen;		/* new number of inodes */
+	int		isaligned = 0;	/* inode allocation at stripe unit */
+					/* boundary */
+	struct xfs_perag *pag;
+
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = tp->t_mountp;
+
+	/*
+	 * Locking will ensure that we don't have two callers in here
+	 * at one time.
+	 */
+	newlen = args.mp->m_ialloc_inos;
+	if (args.mp->m_maxicount &&
+	    percpu_counter_read_positive(&args.mp->m_icount) + newlen >
+							args.mp->m_maxicount)
+		return -ENOSPC;
+	args.minlen = args.maxlen = args.mp->m_ialloc_blks;
+	/*
+	 * First try to allocate inodes contiguous with the last-allocated
+	 * chunk of inodes.  If the filesystem is striped, this will fill
+	 * an entire stripe unit with inodes.
+	 */
+	agi = XFS_BUF_TO_AGI(agbp);
+	newino = be32_to_cpu(agi->agi_newino);
+	agno = be32_to_cpu(agi->agi_seqno);
+	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+		     args.mp->m_ialloc_blks;
+	if (likely(newino != NULLAGINO &&
+		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.type = XFS_ALLOCTYPE_THIS_BNO;
+		args.prod = 1;
+
+		/*
+		 * We need to take into account alignment here to ensure that
+		 * we don't modify the free list if we fail to have an exact
+		 * block. If we don't have an exact match, and every oher
+		 * attempt allocation attempt fails, we'll end up cancelling
+		 * a dirty transaction and shutting down.
+		 *
+		 * For an exact allocation, alignment must be 1,
+		 * however we need to take cluster alignment into account when
+		 * fixing up the freelist. Use the minalignslop field to
+		 * indicate that extra blocks might be required for alignment,
+		 * but not to use them in the actual exact allocation.
+		 */
+		args.alignment = 1;
+		args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;
+
+		/* Allow space for the inode btree to split. */
+		args.minleft = args.mp->m_in_maxlevels - 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+
+		/*
+		 * This request might have dirtied the transaction if the AG can
+		 * satisfy the request, but the exact block was not available.
+		 * If the allocation did fail, subsequent requests will relax
+		 * the exact agbno requirement and increase the alignment
+		 * instead. It is critical that the total size of the request
+		 * (len + alignment + slop) does not increase from this point
+		 * on, so reset minalignslop to ensure it is not included in
+		 * subsequent requests.
+		 */
+		args.minalignslop = 0;
+	} else
+		args.fsbno = NULLFSBLOCK;
+
+	if (unlikely(args.fsbno == NULLFSBLOCK)) {
+		/*
+		 * Set the alignment for the allocation.
+		 * If stripe alignment is turned on then align at stripe unit
+		 * boundary.
+		 * If the cluster size is smaller than a filesystem block
+		 * then we're doing I/O for inodes in filesystem block size
+		 * pieces, so don't need alignment anyway.
+		 */
+		isaligned = 0;
+		if (args.mp->m_sinoalign) {
+			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+			args.alignment = args.mp->m_dalign;
+			isaligned = 1;
+		} else
+			args.alignment = xfs_ialloc_cluster_alignment(args.mp);
+		/*
+		 * Need to figure out where to allocate the inode blocks.
+		 * Ideally they should be spaced out through the a.g.
+		 * For now, just allocate blocks up front.
+		 */
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		/*
+		 * Allocate a fixed-size extent of inodes.
+		 */
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.prod = 1;
+		/*
+		 * Allow space for the inode btree to split.
+		 */
+		args.minleft = args.mp->m_in_maxlevels - 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+
+	/*
+	 * If stripe alignment is turned on, then try again with cluster
+	 * alignment.
+	 */
+	if (isaligned && args.fsbno == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.alignment = xfs_ialloc_cluster_alignment(args.mp);
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+
+	if (args.fsbno == NULLFSBLOCK) {
+		*alloc = 0;
+		return 0;
+	}
+	ASSERT(args.len == args.minlen);
+
+	/*
+	 * Stamp and write the inode buffers.
+	 *
+	 * Seed the new inode cluster with a random generation number. This
+	 * prevents short-term reuse of generation numbers if a chunk is
+	 * freed and then immediately reallocated. We use random numbers
+	 * rather than a linear progression to prevent the next generation
+	 * number from being easily guessable.
+	 */
+	error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+			args.len, prandom_u32());
+
+	if (error)
+		return error;
+	/*
+	 * Convert the results.
+	 */
+	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+	be32_add_cpu(&agi->agi_count, newlen);
+	be32_add_cpu(&agi->agi_freecount, newlen);
+	pag = xfs_perag_get(args.mp, agno);
+	pag->pagi_freecount += newlen;
+	xfs_perag_put(pag);
+	agi->agi_newino = cpu_to_be32(newino);
+
+	/*
+	 * Insert records describing the new inode chunk into the btrees.
+	 */
+	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+				 XFS_BTNUM_INO);
+	if (error)
+		return error;
+
+	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+					 XFS_BTNUM_FINO);
+		if (error)
+			return error;
+	}
+	/*
+	 * Log allocation group header fields
+	 */
+	xfs_ialloc_log_agi(tp, agbp,
+		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+	/*
+	 * Modify/log superblock values for inode count and inode free count.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+	*alloc = 1;
+	return 0;
+}
+
+STATIC xfs_agnumber_t
+xfs_ialloc_next_ag(
+	xfs_mount_t	*mp)
+{
+	xfs_agnumber_t	agno;
+
+	spin_lock(&mp->m_agirotor_lock);
+	agno = mp->m_agirotor;
+	if (++mp->m_agirotor >= mp->m_maxagi)
+		mp->m_agirotor = 0;
+	spin_unlock(&mp->m_agirotor_lock);
+
+	return agno;
+}
+
+/*
+ * Select an allocation group to look for a free inode in, based on the parent
+ * inode and the mode.  Return the allocation group buffer.
+ */
+STATIC xfs_agnumber_t
+xfs_ialloc_ag_select(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent directory inode number */
+	umode_t		mode,		/* bits set to indicate file type */
+	int		okalloc)	/* ok to allocate more space */
+{
+	xfs_agnumber_t	agcount;	/* number of ag's in the filesystem */
+	xfs_agnumber_t	agno;		/* current ag number */
+	int		flags;		/* alloc buffer locking flags */
+	xfs_extlen_t	ineed;		/* blocks needed for inode allocation */
+	xfs_extlen_t	longest = 0;	/* longest extent available */
+	xfs_mount_t	*mp;		/* mount point structure */
+	int		needspace;	/* file mode implies space allocated */
+	xfs_perag_t	*pag;		/* per allocation group data */
+	xfs_agnumber_t	pagno;		/* parent (starting) ag number */
+	int		error;
+
+	/*
+	 * Files of these types need at least one block if length > 0
+	 * (and they won't fit in the inode, but that's hard to figure out).
+	 */
+	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+	mp = tp->t_mountp;
+	agcount = mp->m_maxagi;
+	if (S_ISDIR(mode))
+		pagno = xfs_ialloc_next_ag(mp);
+	else {
+		pagno = XFS_INO_TO_AGNO(mp, parent);
+		if (pagno >= agcount)
+			pagno = 0;
+	}
+
+	ASSERT(pagno < agcount);
+
+	/*
+	 * Loop through allocation groups, looking for one with a little
+	 * free space in it.  Note we don't look for free inodes, exactly.
+	 * Instead, we include whether there is a need to allocate inodes
+	 * to mean that blocks must be allocated for them,
+	 * if none are currently free.
+	 */
+	agno = pagno;
+	flags = XFS_ALLOC_FLAG_TRYLOCK;
+	for (;;) {
+		pag = xfs_perag_get(mp, agno);
+		if (!pag->pagi_inodeok) {
+			xfs_ialloc_next_ag(mp);
+			goto nextag;
+		}
+
+		if (!pag->pagi_init) {
+			error = xfs_ialloc_pagi_init(mp, tp, agno);
+			if (error)
+				goto nextag;
+		}
+
+		if (pag->pagi_freecount) {
+			xfs_perag_put(pag);
+			return agno;
+		}
+
+		if (!okalloc)
+			goto nextag;
+
+		if (!pag->pagf_init) {
+			error = xfs_alloc_pagf_init(mp, tp, agno, flags);
+			if (error)
+				goto nextag;
+		}
+
+		/*
+		 * Check that there is enough free space for the file plus a
+		 * chunk of inodes if we need to allocate some. If this is the
+		 * first pass across the AGs, take into account the potential
+		 * space needed for alignment of inode chunks when checking the
+		 * longest contiguous free space in the AG - this prevents us
+		 * from getting ENOSPC because we have free space larger than
+		 * m_ialloc_blks but alignment constraints prevent us from using
+		 * it.
+		 *
+		 * If we can't find an AG with space for full alignment slack to
+		 * be taken into account, we must be near ENOSPC in all AGs.
+		 * Hence we don't include alignment for the second pass and so
+		 * if we fail allocation due to alignment issues then it is most
+		 * likely a real ENOSPC condition.
+		 */
+		ineed = mp->m_ialloc_blks;
+		if (flags && ineed > 1)
+			ineed += xfs_ialloc_cluster_alignment(mp);
+		longest = pag->pagf_longest;
+		if (!longest)
+			longest = pag->pagf_flcount > 0;
+
+		if (pag->pagf_freeblks >= needspace + ineed &&
+		    longest >= ineed) {
+			xfs_perag_put(pag);
+			return agno;
+		}
+nextag:
+		xfs_perag_put(pag);
+		/*
+		 * No point in iterating over the rest, if we're shutting
+		 * down.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp))
+			return NULLAGNUMBER;
+		agno++;
+		if (agno >= agcount)
+			agno = 0;
+		if (agno == pagno) {
+			if (flags == 0)
+				return NULLAGNUMBER;
+			flags = 0;
+		}
+	}
+}
+
+/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+	struct xfs_btree_cur	*cur,
+	xfs_inobt_rec_incore_t	*rec,
+	int			*done,
+	int			left)
+{
+	int                     error;
+	int			i;
+
+	if (left)
+		error = xfs_btree_decrement(cur, 0, &i);
+	else
+		error = xfs_btree_increment(cur, 0, &i);
+
+	if (error)
+		return error;
+	*done = !i;
+	if (i) {
+		error = xfs_inobt_get_rec(cur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+	}
+
+	return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+	struct xfs_btree_cur	*cur,
+	xfs_agino_t		agino,
+	xfs_inobt_rec_incore_t	*rec,
+	int			*done)
+{
+	int                     error;
+	int			i;
+
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		return error;
+	*done = !i;
+	if (i) {
+		error = xfs_inobt_get_rec(cur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate an inode using the inobt-only algorithm.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_ino_t		parent,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
+	xfs_agnumber_t		pagno = XFS_INO_TO_AGNO(mp, parent);
+	xfs_agino_t		pagino = XFS_INO_TO_AGINO(mp, parent);
+	struct xfs_perag	*pag;
+	struct xfs_btree_cur	*cur, *tcur;
+	struct xfs_inobt_rec_incore rec, trec;
+	xfs_ino_t		ino;
+	int			error;
+	int			offset;
+	int			i, j;
+
+	pag = xfs_perag_get(mp, agno);
+
+	ASSERT(pag->pagi_init);
+	ASSERT(pag->pagi_inodeok);
+	ASSERT(pag->pagi_freecount > 0);
+
+ restart_pagno:
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+	/*
+	 * If pagino is 0 (this is the root inode allocation) use newino.
+	 * This must work because we've just allocated some.
+	 */
+	if (!pagino)
+		pagino = be32_to_cpu(agi->agi_newino);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	/*
+	 * If in the same AG as the parent, try to get near the parent.
+	 */
+	if (pagno == agno) {
+		int		doneleft;	/* done, to the left */
+		int		doneright;	/* done, to the right */
+		int		searchdistance = 10;
+
+		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+
+		error = xfs_inobt_get_rec(cur, &rec, &j);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
+
+		if (rec.ir_freecount > 0) {
+			/*
+			 * Found a free inode in the same chunk
+			 * as the parent, done.
+			 */
+			goto alloc_inode;
+		}
+
+
+		/*
+		 * In the same AG as parent, but parent's chunk is full.
+		 */
+
+		/* duplicate the cursor, search left & right simultaneously */
+		error = xfs_btree_dup_cursor(cur, &tcur);
+		if (error)
+			goto error0;
+
+		/*
+		 * Skip to last blocks looked up if same parent inode.
+		 */
+		if (pagino != NULLAGINO &&
+		    pag->pagl_pagino == pagino &&
+		    pag->pagl_leftrec != NULLAGINO &&
+		    pag->pagl_rightrec != NULLAGINO) {
+			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+						   &trec, &doneleft);
+			if (error)
+				goto error1;
+
+			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+						   &rec, &doneright);
+			if (error)
+				goto error1;
+		} else {
+			/* search left with tcur, back up 1 record */
+			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+			if (error)
+				goto error1;
+
+			/* search right with cur, go forward 1 record. */
+			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+			if (error)
+				goto error1;
+		}
+
+		/*
+		 * Loop until we find an inode chunk with a free inode.
+		 */
+		while (!doneleft || !doneright) {
+			int	useleft;  /* using left inode chunk this time */
+
+			if (!--searchdistance) {
+				/*
+				 * Not in range - save last search
+				 * location and allocate a new inode
+				 */
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto newino;
+			}
+
+			/* figure out the closer block if both are valid. */
+			if (!doneleft && !doneright) {
+				useleft = pagino -
+				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+				  rec.ir_startino - pagino;
+			} else {
+				useleft = !doneleft;
+			}
+
+			/* free inodes to the left? */
+			if (useleft && trec.ir_freecount) {
+				rec = trec;
+				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+				cur = tcur;
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto alloc_inode;
+			}
+
+			/* free inodes to the right? */
+			if (!useleft && rec.ir_freecount) {
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto alloc_inode;
+			}
+
+			/* get next record to check */
+			if (useleft) {
+				error = xfs_ialloc_next_rec(tcur, &trec,
+								 &doneleft, 1);
+			} else {
+				error = xfs_ialloc_next_rec(cur, &rec,
+								 &doneright, 0);
+			}
+			if (error)
+				goto error1;
+		}
+
+		/*
+		 * We've reached the end of the btree. because
+		 * we are only searching a small chunk of the
+		 * btree each search, there is obviously free
+		 * inodes closer to the parent inode than we
+		 * are now. restart the search again.
+		 */
+		pag->pagl_pagino = NULLAGINO;
+		pag->pagl_leftrec = NULLAGINO;
+		pag->pagl_rightrec = NULLAGINO;
+		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		goto restart_pagno;
+	}
+
+	/*
+	 * In a different AG from the parent.
+	 * See if the most recently allocated block has any free.
+	 */
+newino:
+	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
+		if (error)
+			goto error0;
+
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, &rec, &j);
+			if (error)
+				goto error0;
+
+			if (j == 1 && rec.ir_freecount > 0) {
+				/*
+				 * The last chunk allocated in the group
+				 * still has a free inode.
+				 */
+				goto alloc_inode;
+			}
+		}
+	}
+
+	/*
+	 * None left in the last group, search the whole AG
+	 */
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+
+	for (;;) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (rec.ir_freecount > 0)
+			break;
+		error = xfs_btree_increment(cur, 0, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+	}
+
+alloc_inode:
+	offset = xfs_lowbit64(rec.ir_free);
+	ASSERT(offset >= 0);
+	ASSERT(offset < XFS_INODES_PER_CHUNK);
+	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+	error = xfs_inobt_update(cur, &rec);
+	if (error)
+		goto error0;
+	be32_add_cpu(&agi->agi_freecount, -1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	pag->pagi_freecount--;
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+	xfs_perag_put(pag);
+	*inop = ino;
+	return 0;
+error1:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+	xfs_agino_t			pagino,
+	struct xfs_btree_cur		**ocur,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	struct xfs_btree_cur		*lcur = *ocur;	/* left search cursor */
+	struct xfs_btree_cur		*rcur;	/* right search cursor */
+	struct xfs_inobt_rec_incore	rrec;
+	int				error;
+	int				i, j;
+
+	error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+	if (error)
+		return error;
+
+	if (i == 1) {
+		error = xfs_inobt_get_rec(lcur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
+
+		/*
+		 * See if we've landed in the parent inode record. The finobt
+		 * only tracks chunks with at least one free inode, so record
+		 * existence is enough.
+		 */
+		if (pagino >= rec->ir_startino &&
+		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+			return 0;
+	}
+
+	error = xfs_btree_dup_cursor(lcur, &rcur);
+	if (error)
+		return error;
+
+	error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+	if (error)
+		goto error_rcur;
+	if (j == 1) {
+		error = xfs_inobt_get_rec(rcur, &rrec, &j);
+		if (error)
+			goto error_rcur;
+		XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
+	}
+
+	XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
+	if (i == 1 && j == 1) {
+		/*
+		 * Both the left and right records are valid. Choose the closer
+		 * inode chunk to the target.
+		 */
+		if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+		    (rrec.ir_startino - pagino)) {
+			*rec = rrec;
+			xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+			*ocur = rcur;
+		} else {
+			xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+		}
+	} else if (j == 1) {
+		/* only the right record is valid */
+		*rec = rrec;
+		xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+		*ocur = rcur;
+	} else if (i == 1) {
+		/* only the left record is valid */
+		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+	}
+
+	return 0;
+
+error_rcur:
+	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+	struct xfs_agi			*agi,
+	struct xfs_btree_cur		*cur,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	int error;
+	int i;
+
+	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
+		if (error)
+			return error;
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, rec, &i);
+			if (error)
+				return error;
+			XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+			return 0;
+		}
+	}
+
+	/*
+	 * Find the first inode available in the AG.
+	 */
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+
+	error = xfs_inobt_get_rec(cur, rec, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+
+	return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+	struct xfs_btree_cur		*cur,	/* inobt cursor */
+	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
+	int				offset) /* inode offset */
+{
+	struct xfs_inobt_rec_incore	rec;
+	int				error;
+	int				i;
+
+	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
+				  (rec.ir_freecount == frec->ir_freecount));
+
+	return xfs_inobt_update(cur, &rec);
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+STATIC int
+xfs_dialloc_ag(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_ino_t		parent,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent);
+	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent);
+	struct xfs_perag		*pag;
+	struct xfs_btree_cur		*cur;	/* finobt cursor */
+	struct xfs_btree_cur		*icur;	/* inobt cursor */
+	struct xfs_inobt_rec_incore	rec;
+	xfs_ino_t			ino;
+	int				error;
+	int				offset;
+	int				i;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+	pag = xfs_perag_get(mp, agno);
+
+	/*
+	 * If pagino is 0 (this is the root inode allocation) use newino.
+	 * This must work because we've just allocated some.
+	 */
+	if (!pagino)
+		pagino = be32_to_cpu(agi->agi_newino);
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error_cur;
+
+	/*
+	 * The search algorithm depends on whether we're in the same AG as the
+	 * parent. If so, find the closest available inode to the parent. If
+	 * not, consider the agi hint or find the first free inode in the AG.
+	 */
+	if (agno == pagno)
+		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+	else
+		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+	if (error)
+		goto error_cur;
+
+	offset = xfs_lowbit64(rec.ir_free);
+	ASSERT(offset >= 0);
+	ASSERT(offset < XFS_INODES_PER_CHUNK);
+	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+	/*
+	 * Modify or remove the finobt record.
+	 */
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+	if (rec.ir_freecount)
+		error = xfs_inobt_update(cur, &rec);
+	else
+		error = xfs_btree_delete(cur, &i);
+	if (error)
+		goto error_cur;
+
+	/*
+	 * The finobt has now been updated appropriately. We haven't updated the
+	 * agi and superblock yet, so we can create an inobt cursor and validate
+	 * the original freecount. If all is well, make the equivalent update to
+	 * the inobt using the finobt record and offset information.
+	 */
+	icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+	error = xfs_check_agi_freecount(icur, agi);
+	if (error)
+		goto error_icur;
+
+	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+	if (error)
+		goto error_icur;
+
+	/*
+	 * Both trees have now been updated. We must update the perag and
+	 * superblock before we can check the freecount for each btree.
+	 */
+	be32_add_cpu(&agi->agi_freecount, -1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	pag->pagi_freecount--;
+
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+	error = xfs_check_agi_freecount(icur, agi);
+	if (error)
+		goto error_icur;
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error_icur;
+
+	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_perag_put(pag);
+	*inop = ino;
+	return 0;
+
+error_icur:
+	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Allocate an inode on disk.
+ *
+ * Mode is used to tell whether the new inode will need space, and whether it
+ * is a directory.
+ *
+ * This function is designed to be called twice if it has to do an allocation
+ * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
+ * If an inode is available without having to performn an allocation, an inode
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
+ * new transaction, and call xfs_dialloc() again, passing in the previous value
+ * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
+ * buffer is locked across the two calls, the second call is guaranteed to have
+ * a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the on-disk
+ * data structures are updated.  The inode itself is not read in, since doing so
+ * would break ordering constraints with xfs_reclaim.
+ */
+int
+xfs_dialloc(
+	struct xfs_trans	*tp,
+	xfs_ino_t		parent,
+	umode_t			mode,
+	int			okalloc,
+	struct xfs_buf		**IO_agbp,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*agbp;
+	xfs_agnumber_t		agno;
+	int			error;
+	int			ialloced;
+	int			noroom = 0;
+	xfs_agnumber_t		start_agno;
+	struct xfs_perag	*pag;
+
+	if (*IO_agbp) {
+		/*
+		 * If the caller passes in a pointer to the AGI buffer,
+		 * continue where we left off before.  In this case, we
+		 * know that the allocation group has free inodes.
+		 */
+		agbp = *IO_agbp;
+		goto out_alloc;
+	}
+
+	/*
+	 * We do not have an agbp, so select an initial allocation
+	 * group for inode allocation.
+	 */
+	start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+	if (start_agno == NULLAGNUMBER) {
+		*inop = NULLFSINO;
+		return 0;
+	}
+
+	/*
+	 * If we have already hit the ceiling of inode blocks then clear
+	 * okalloc so we scan all available agi structures for a free
+	 * inode.
+	 *
+	 * Read rough value of mp->m_icount by percpu_counter_read_positive,
+	 * which will sacrifice the preciseness but improve the performance.
+	 */
+	if (mp->m_maxicount &&
+	    percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
+							> mp->m_maxicount) {
+		noroom = 1;
+		okalloc = 0;
+	}
+
+	/*
+	 * Loop until we find an allocation group that either has free inodes
+	 * or in which we can allocate some inodes.  Iterate through the
+	 * allocation groups upward, wrapping at the end.
+	 */
+	agno = start_agno;
+	for (;;) {
+		pag = xfs_perag_get(mp, agno);
+		if (!pag->pagi_inodeok) {
+			xfs_ialloc_next_ag(mp);
+			goto nextag;
+		}
+
+		if (!pag->pagi_init) {
+			error = xfs_ialloc_pagi_init(mp, tp, agno);
+			if (error)
+				goto out_error;
+		}
+
+		/*
+		 * Do a first racy fast path check if this AG is usable.
+		 */
+		if (!pag->pagi_freecount && !okalloc)
+			goto nextag;
+
+		/*
+		 * Then read in the AGI buffer and recheck with the AGI buffer
+		 * lock held.
+		 */
+		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+		if (error)
+			goto out_error;
+
+		if (pag->pagi_freecount) {
+			xfs_perag_put(pag);
+			goto out_alloc;
+		}
+
+		if (!okalloc)
+			goto nextag_relse_buffer;
+
+
+		error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
+		if (error) {
+			xfs_trans_brelse(tp, agbp);
+
+			if (error != -ENOSPC)
+				goto out_error;
+
+			xfs_perag_put(pag);
+			*inop = NULLFSINO;
+			return 0;
+		}
+
+		if (ialloced) {
+			/*
+			 * We successfully allocated some inodes, return
+			 * the current context to the caller so that it
+			 * can commit the current transaction and call
+			 * us again where we left off.
+			 */
+			ASSERT(pag->pagi_freecount > 0);
+			xfs_perag_put(pag);
+
+			*IO_agbp = agbp;
+			*inop = NULLFSINO;
+			return 0;
+		}
+
+nextag_relse_buffer:
+		xfs_trans_brelse(tp, agbp);
+nextag:
+		xfs_perag_put(pag);
+		if (++agno == mp->m_sb.sb_agcount)
+			agno = 0;
+		if (agno == start_agno) {
+			*inop = NULLFSINO;
+			return noroom ? -ENOSPC : 0;
+		}
+	}
+
+out_alloc:
+	*IO_agbp = NULL;
+	return xfs_dialloc_ag(tp, agbp, parent, inop);
+out_error:
+	xfs_perag_put(pag);
+	return error;
+}
+
+STATIC int
+xfs_difree_inobt(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agino_t			agino,
+	struct xfs_bmap_free		*flist,
+	int				*deleted,
+	xfs_ino_t			*first_ino,
+	struct xfs_inobt_rec_incore	*orec)
+{
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	struct xfs_perag		*pag;
+	struct xfs_btree_cur		*cur;
+	struct xfs_inobt_rec_incore	rec;
+	int				ilen;
+	int				error;
+	int				i;
+	int				off;
+
+	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+	ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
+	/*
+	 * Initialize the cursor.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	/*
+	 * Look for the entry describing this inode.
+	 */
+	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+		xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+			__func__, error);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+			__func__, error);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+	/*
+	 * Get the offset in the inode chunk.
+	 */
+	off = agino - rec.ir_startino;
+	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+	ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
+	/*
+	 * Mark the inode free & increment the count.
+	 */
+	rec.ir_free |= XFS_INOBT_MASK(off);
+	rec.ir_freecount++;
+
+	/*
+	 * When an inode cluster is free, it becomes eligible for removal
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
+	    (rec.ir_freecount == mp->m_ialloc_inos)) {
+
+		*deleted = 1;
+		*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+
+		/*
+		 * Remove the inode cluster from the AGI B+Tree, adjust the
+		 * AGI and Superblock inode counts, and mark the disk space
+		 * to be freed when the transaction is committed.
+		 */
+		ilen = mp->m_ialloc_inos;
+		be32_add_cpu(&agi->agi_count, -ilen);
+		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
+		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount -= ilen - 1;
+		xfs_perag_put(pag);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
+
+		if ((error = xfs_btree_delete(cur, &i))) {
+			xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+				__func__, error);
+			goto error0;
+		}
+
+		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+				  XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+				  mp->m_ialloc_blks, flist, mp);
+	} else {
+		*deleted = 0;
+
+		error = xfs_inobt_update(cur, &rec);
+		if (error) {
+			xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+				__func__, error);
+			goto error0;
+		}
+
+		/* 
+		 * Change the inode free counts and log the ag/sb changes.
+		 */
+		be32_add_cpu(&agi->agi_freecount, 1);
+		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount++;
+		xfs_perag_put(pag);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+	}
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	*orec = rec;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agino_t			agino,
+	struct xfs_inobt_rec_incore	*ibtrec) /* inobt record */
+{
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	struct xfs_btree_cur		*cur;
+	struct xfs_inobt_rec_incore	rec;
+	int				offset = agino - ibtrec->ir_startino;
+	int				error;
+	int				i;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	if (i == 0) {
+		/*
+		 * If the record does not exist in the finobt, we must have just
+		 * freed an inode in a previously fully allocated chunk. If not,
+		 * something is out of sync.
+		 */
+		XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
+
+		error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+					     ibtrec->ir_free, &i);
+		if (error)
+			goto error;
+		ASSERT(i == 1);
+
+		goto out;
+	}
+
+	/*
+	 * Read and update the existing record. We could just copy the ibtrec
+	 * across here, but that would defeat the purpose of having redundant
+	 * metadata. By making the modifications independently, we can catch
+	 * corruptions that we wouldn't see if we just copied from one record
+	 * to another.
+	 */
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		goto error;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+	rec.ir_free |= XFS_INOBT_MASK(offset);
+	rec.ir_freecount++;
+
+	XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
+				(rec.ir_freecount == ibtrec->ir_freecount),
+				error);
+
+	/*
+	 * The content of inobt records should always match between the inobt
+	 * and finobt. The lifecycle of records in the finobt is different from
+	 * the inobt in that the finobt only tracks records with at least one
+	 * free inode. Hence, if all of the inodes are free and we aren't
+	 * keeping inode chunks permanently on disk, remove the record.
+	 * Otherwise, update the record with the new information.
+	 */
+	if (rec.ir_freecount == mp->m_ialloc_inos &&
+	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+		error = xfs_btree_delete(cur, &i);
+		if (error)
+			goto error;
+		ASSERT(i == 1);
+	} else {
+		error = xfs_inobt_update(cur, &rec);
+		if (error)
+			goto error;
+	}
+
+out:
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error;
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	xfs_ino_t		inode,		/* inode to be freed */
+	struct xfs_bmap_free	*flist,		/* extents to free */
+	int			*deleted,/* set if inode cluster was deleted */
+	xfs_ino_t		*first_ino)/* first inode in deleted cluster */
+{
+	/* REFERENCED */
+	xfs_agblock_t		agbno;	/* block number containing inode */
+	struct xfs_buf		*agbp;	/* buffer for allocation group header */
+	xfs_agino_t		agino;	/* allocation group inode number */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	int			error;	/* error return value */
+	struct xfs_mount	*mp;	/* mount structure for filesystem */
+	struct xfs_inobt_rec_incore rec;/* btree record */
+
+	mp = tp->t_mountp;
+
+	/*
+	 * Break up inode number into its components.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, inode);
+	if (agno >= mp->m_sb.sb_agcount)  {
+		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+			__func__, agno, mp->m_sb.sb_agcount);
+		ASSERT(0);
+		return -EINVAL;
+	}
+	agino = XFS_INO_TO_AGINO(mp, inode);
+	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
+		xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+			__func__, (unsigned long long)inode,
+			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+		ASSERT(0);
+		return -EINVAL;
+	}
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agbno >= mp->m_sb.sb_agblocks)  {
+		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+			__func__, agbno, mp->m_sb.sb_agblocks);
+		ASSERT(0);
+		return -EINVAL;
+	}
+	/*
+	 * Get the allocation group header.
+	 */
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+			__func__, error);
+		return error;
+	}
+
+	/*
+	 * Fix up the inode allocation btree.
+	 */
+	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+				 &rec);
+	if (error)
+		goto error0;
+
+	/*
+	 * Fix up the free inode btree.
+	 */
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+		if (error)
+			goto error0;
+	}
+
+	return 0;
+
+error0:
+	return error;
+}
+
+STATIC int
+xfs_imap_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		*chunk_agbno,
+	xfs_agblock_t		*offset_agbno,
+	int			flags)
+{
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	int			error;
+	int			i;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_alert(mp,
+			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+			__func__, error, agno);
+		return error;
+	}
+
+	/*
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+	if (!error) {
+		if (i)
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (!error && i == 0)
+			error = -EINVAL;
+	}
+
+	xfs_trans_brelse(tp, agbp);
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	if (error)
+		return error;
+
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + mp->m_ialloc_inos <= agino)
+		return -EINVAL;
+
+	/* for untrusted inodes check it is allocated first */
+	if ((flags & XFS_IGET_UNTRUSTED) &&
+	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+		return -EINVAL;
+
+	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+	*offset_agbno = agbno - *chunk_agbno;
+	return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+	xfs_mount_t	 *mp,	/* file system mount structure */
+	xfs_trans_t	 *tp,	/* transaction pointer */
+	xfs_ino_t	ino,	/* inode to locate */
+	struct xfs_imap	*imap,	/* location map structure */
+	uint		flags)	/* flags for inode btree lookup */
+{
+	xfs_agblock_t	agbno;	/* block number of inode in the alloc group */
+	xfs_agino_t	agino;	/* inode number within alloc group */
+	xfs_agnumber_t	agno;	/* allocation group number */
+	int		blks_per_cluster; /* num blocks per inode cluster */
+	xfs_agblock_t	chunk_agbno;	/* first block in inode chunk */
+	xfs_agblock_t	cluster_agbno;	/* first block in inode cluster */
+	int		error;	/* error code */
+	int		offset;	/* index of inode in its buffer */
+	xfs_agblock_t	offset_agbno;	/* blks from chunk start to inode */
+
+	ASSERT(ino != NULLFSINO);
+
+	/*
+	 * Split up the inode number into its parts.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
+	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+#ifdef DEBUG
+		/*
+		 * Don't output diagnostic information for untrusted inodes
+		 * as they can be invalid without implying corruption.
+		 */
+		if (flags & XFS_IGET_UNTRUSTED)
+			return -EINVAL;
+		if (agno >= mp->m_sb.sb_agcount) {
+			xfs_alert(mp,
+				"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+				__func__, agno, mp->m_sb.sb_agcount);
+		}
+		if (agbno >= mp->m_sb.sb_agblocks) {
+			xfs_alert(mp,
+		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+				__func__, (unsigned long long)agbno,
+				(unsigned long)mp->m_sb.sb_agblocks);
+		}
+		if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+			xfs_alert(mp,
+		"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+				__func__, ino,
+				XFS_AGINO_TO_INO(mp, agno, agino));
+		}
+		xfs_stack_trace();
+#endif /* DEBUG */
+		return -EINVAL;
+	}
+
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+
+	/*
+	 * For bulkstat and handle lookups, we have an untrusted inode number
+	 * that we have to verify is valid. We cannot do this just by reading
+	 * the inode buffer as it may have been unlinked and removed leaving
+	 * inodes in stale state on disk. Hence we have to do a btree lookup
+	 * in all cases where an untrusted inode number is passed.
+	 */
+	if (flags & XFS_IGET_UNTRUSTED) {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+		goto out_map;
+	}
+
+	/*
+	 * If the inode cluster size is the same as the blocksize or
+	 * smaller we get to the buffer by simple arithmetics.
+	 */
+	if (blks_per_cluster == 1) {
+		offset = XFS_INO_TO_OFFSET(mp, ino);
+		ASSERT(offset < mp->m_sb.sb_inopblock);
+
+		imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+		imap->im_len = XFS_FSB_TO_BB(mp, 1);
+		imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+		return 0;
+	}
+
+	/*
+	 * If the inode chunks are aligned then use simple maths to
+	 * find the location. Otherwise we have to do a btree
+	 * lookup to find the location.
+	 */
+	if (mp->m_inoalign_mask) {
+		offset_agbno = agbno & mp->m_inoalign_mask;
+		chunk_agbno = agbno - offset_agbno;
+	} else {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+	}
+
+out_map:
+	ASSERT(agbno >= chunk_agbno);
+	cluster_agbno = chunk_agbno +
+		((offset_agbno / blks_per_cluster) * blks_per_cluster);
+	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+		XFS_INO_TO_OFFSET(mp, ino);
+
+	imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+	imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+	imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+	/*
+	 * If the inode number maps to a block outside the bounds
+	 * of the file system then return NULL rather than calling
+	 * read_buf and panicing when we get an error from the
+	 * driver.
+	 */
+	if ((imap->im_blkno + imap->im_len) >
+	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+		xfs_alert(mp,
+	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+			__func__, (unsigned long long) imap->im_blkno,
+			(unsigned long long) imap->im_len,
+			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+	xfs_mount_t	*mp)		/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
+	maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
+		XFS_INODES_PER_CHUNK_LOG;
+	minleafrecs = mp->m_alloc_mnr[0];
+	minnoderecs = mp->m_alloc_mnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_in_maxlevels = level;
+}
+
+/*
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
+ */
+void
+xfs_ialloc_log_agi(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*bp,		/* allocation group header buffer */
+	int		fields)		/* bitmask of fields to log */
+{
+	int			first;		/* first byte number */
+	int			last;		/* last byte number */
+	static const short	offsets[] = {	/* field starting offsets */
+					/* keep in sync with bit definitions */
+		offsetof(xfs_agi_t, agi_magicnum),
+		offsetof(xfs_agi_t, agi_versionnum),
+		offsetof(xfs_agi_t, agi_seqno),
+		offsetof(xfs_agi_t, agi_length),
+		offsetof(xfs_agi_t, agi_count),
+		offsetof(xfs_agi_t, agi_root),
+		offsetof(xfs_agi_t, agi_level),
+		offsetof(xfs_agi_t, agi_freecount),
+		offsetof(xfs_agi_t, agi_newino),
+		offsetof(xfs_agi_t, agi_dirino),
+		offsetof(xfs_agi_t, agi_unlinked),
+		offsetof(xfs_agi_t, agi_free_root),
+		offsetof(xfs_agi_t, agi_free_level),
+		sizeof(xfs_agi_t)
+	};
+#ifdef DEBUG
+	xfs_agi_t		*agi;	/* allocation group header */
+
+	agi = XFS_BUF_TO_AGI(bp);
+	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+#endif
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+
+	/*
+	 * Compute byte offsets for the first and last fields in the first
+	 * region and log the agi buffer. This only logs up through
+	 * agi_unlinked.
+	 */
+	if (fields & XFS_AGI_ALL_BITS_R1) {
+		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+				  &first, &last);
+		xfs_trans_log_buf(tp, bp, first, last);
+	}
+
+	/*
+	 * Mask off the bits in the first region and calculate the first and
+	 * last field offsets for any bits in the second region.
+	 */
+	fields &= ~XFS_AGI_ALL_BITS_R1;
+	if (fields) {
+		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+				  &first, &last);
+		xfs_trans_log_buf(tp, bp, first, last);
+	}
+}
+
+#ifdef DEBUG
+STATIC void
+xfs_check_agi_unlinked(
+	struct xfs_agi		*agi)
+{
+	int			i;
+
+	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+		ASSERT(agi->agi_unlinked[i]);
+}
+#else
+#define xfs_check_agi_unlinked(agi)
+#endif
+
+static bool
+xfs_agi_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+			return false;
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+		return false;
+	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+		return false;
+
+	if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+		return false;
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	xfs_check_agi_unlinked(agi);
+	return true;
+}
+
+static void
+xfs_agi_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+				XFS_ERRTAG_IALLOC_READ_AGI,
+				XFS_RANDOM_IALLOC_READ_AGI))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_agi_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_agi_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+	.verify_read = xfs_agi_read_verify,
+	.verify_write = xfs_agi_write_verify,
+};
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int
+xfs_read_agi(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	struct xfs_buf		**bpp)	/* allocation group hdr buf */
+{
+	int			error;
+
+	trace_xfs_read_agi(mp, agno);
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
+	if (error)
+		return error;
+
+	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+	return 0;
+}
+
+int
+xfs_ialloc_read_agi(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	struct xfs_buf		**bpp)	/* allocation group hdr buf */
+{
+	struct xfs_agi		*agi;	/* allocation group header */
+	struct xfs_perag	*pag;	/* per allocation group data */
+	int			error;
+
+	trace_xfs_ialloc_read_agi(mp, agno);
+
+	error = xfs_read_agi(mp, tp, agno, bpp);
+	if (error)
+		return error;
+
+	agi = XFS_BUF_TO_AGI(*bpp);
+	pag = xfs_perag_get(mp, agno);
+	if (!pag->pagi_init) {
+		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+		pag->pagi_count = be32_to_cpu(agi->agi_count);
+		pag->pagi_init = 1;
+	}
+
+	/*
+	 * It's possible for these to be out of sync if
+	 * we are in the middle of a forced shutdown.
+	 */
+	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+		XFS_FORCED_SHUTDOWN(mp));
+	xfs_perag_put(pag);
+	return 0;
+}
+
+/*
+ * Read in the agi to initialise the per-ag data in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno)		/* allocation group number */
+{
+	xfs_buf_t	*bp = NULL;
+	int		error;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+	if (error)
+		return error;
+	if (bp)
+		xfs_trans_brelse(tp, bp);
+	return 0;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc.h b/kernel/fs/xfs/libxfs/xfs_ialloc.h
new file mode 100644
index 000000000..100007d56
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_ialloc.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IALLOC_H__
+#define	__XFS_IALLOC_H__
+
+struct xfs_buf;
+struct xfs_dinode;
+struct xfs_imap;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_btree_cur;
+
+/* Move inodes in clusters of this size */
+#define	XFS_INODE_BIG_CLUSTER_SIZE	8192
+
+/* Calculate and return the number of filesystem blocks per inode cluster */
+static inline int
+xfs_icluster_size_fsb(
+	struct xfs_mount	*mp)
+{
+	if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
+		return 1;
+	return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
+}
+
+/*
+ * Make an inode pointer out of the buffer/offset.
+ */
+static inline struct xfs_dinode *
+xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
+{
+	return (struct xfs_dinode *)
+		(xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
+}
+
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * To work within the constraint of one allocation per transaction,
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes.  If an inode is
+ * available without an allocation, agbp would be set to the current
+ * agbp and alloc_done set to false.
+ * If an allocation needed to be done, agbp would be set to the
+ * inode header of the allocation group and alloc_done set to true.
+ * The caller should then commit the current transaction and allocate a new
+ * transaction.  xfs_dialloc() should then be called again with
+ * the agbp value returned from the previous call.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated.  The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ *
+ * *agbp should be set to NULL on the first call, *alloc_done set to FALSE.
+ */
+int					/* error */
+xfs_dialloc(
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent inode (directory) */
+	umode_t		mode,		/* mode bits for new inode */
+	int		okalloc,	/* ok to allocate more space */
+	struct xfs_buf	**agbp,		/* buf for a.g. inode header */
+	xfs_ino_t	*inop);		/* inode number allocated */
+
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int					/* error */
+xfs_difree(
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	inode,		/* inode to be freed */
+	struct xfs_bmap_free *flist,	/* extents to free */
+	int		*deleted,	/* set if inode cluster was deleted */
+	xfs_ino_t	*first_ino);	/* first inode in deleted cluster */
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+	struct xfs_mount *mp,		/* file system mount structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	ino,		/* inode to locate */
+	struct xfs_imap	*imap,		/* location map structure */
+	uint		flags);		/* flags for inode btree lookup */
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+	struct xfs_mount *mp);		/* file system mount structure */
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+	struct xfs_trans *tp,		/* transaction pointer */
+	struct xfs_buf	*bp,		/* allocation group header buffer */
+	int		fields);	/* bitmask of fields to log */
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int					/* error */
+xfs_ialloc_read_agi(
+	struct xfs_mount *mp,		/* file system mount structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	struct xfs_buf	**bpp);		/* allocation group hdr buf */
+
+/*
+ * Read in the allocation group header to initialise the per-ag data
+ * in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+	struct xfs_mount *mp,		/* file system mount structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+        xfs_agnumber_t  agno);		/* allocation group number */
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+		xfs_lookup_t dir, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+		xfs_inobt_rec_incore_t *rec, int *stat);
+
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+			  struct list_head *buffer_list,
+			  xfs_agnumber_t agno, xfs_agblock_t agbno,
+			  xfs_agblock_t length, unsigned int gen);
+
+int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+		xfs_agnumber_t agno, struct xfs_buf **bpp);
+
+
+#endif	/* __XFS_IALLOC_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000..964c465ca
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+
+
+STATIC int
+xfs_inobt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_inobt_mnr[level != 0];
+}
+
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agbp, cur->bc_private.a.agno,
+			cur->bc_btnum);
+}
+
+STATIC void
+xfs_inobt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*nptr,
+	int			inc)	/* level change */
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+
+	agi->agi_root = nptr->s;
+	be32_add_cpu(&agi->agi_level, inc);
+	xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+}
+
+STATIC void
+xfs_finobt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*nptr,
+	int			inc)	/* level change */
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+
+	agi->agi_free_root = nptr->s;
+	be32_add_cpu(&agi->agi_free_level, inc);
+	xfs_ialloc_log_agi(cur->bc_tp, agbp,
+			   XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
+}
+
+STATIC int
+xfs_inobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	xfs_alloc_arg_t		args;		/* block allocation args */
+	int			error;		/* error return value */
+	xfs_agblock_t		sbno = be32_to_cpu(start->s);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	memset(&args, 0, sizeof(args));
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+	args.minlen = 1;
+	args.maxlen = 1;
+	args.prod = 1;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+
+	error = xfs_alloc_vextent(&args);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
+	}
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
+	new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
+	*stat = 1;
+	return 0;
+}
+
+STATIC int
+xfs_inobt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	xfs_fsblock_t		fsbno;
+	int			error;
+
+	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+	error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+	if (error)
+		return error;
+
+	xfs_trans_binval(cur->bc_tp, bp);
+	return error;
+}
+
+STATIC int
+xfs_inobt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_inobt_mxr[level != 0];
+}
+
+STATIC void
+xfs_inobt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	key->inobt.ir_startino = rec->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_rec_from_key(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	rec->inobt.ir_startino = key->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+	rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+	rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
+}
+
+/*
+ * initial value of ptr for lookup
+ */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+
+	ptr->s = agi->agi_root;
+}
+
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+	ptr->s = agi->agi_free_root;
+}
+
+STATIC __int64_t
+xfs_inobt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+			  cur->bc_rec.i.ir_startino;
+}
+
+static int
+xfs_inobt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level;
+
+	/*
+	 * During growfs operations, we can't verify the exact owner as the
+	 * perag is not fully initialised and hence not attached to the buffer.
+	 *
+	 * Similarly, during log recovery we will have a perag structure
+	 * attached, but the agi information will not yet have been initialised
+	 * from the on disk AGI. We don't currently use any of this information,
+	 * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+	 * ever do.
+	 */
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+	case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_IBT_MAGIC):
+	case cpu_to_be32(XFS_FIBT_MAGIC):
+		break;
+	default:
+		return 0;
+	}
+
+	/* numrecs and level verification */
+	level = be16_to_cpu(block->bb_level);
+	if (level >= mp->m_in_maxlevels)
+		return false;
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
+		return false;
+
+	/* sibling pointer verification */
+	if (!block->bb_u.s.bb_leftsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+	if (!block->bb_u.s.bb_rightsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+
+	return true;
+}
+
+static void
+xfs_inobt_read_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_inobt_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
+	}
+}
+
+static void
+xfs_inobt_write_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_inobt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+	xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+	.verify_read = xfs_inobt_read_verify,
+	.verify_write = xfs_inobt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_inobt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return be32_to_cpu(k1->inobt.ir_startino) <
+		be32_to_cpu(k2->inobt.ir_startino);
+}
+
+STATIC int
+xfs_inobt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+		be32_to_cpu(r2->inobt.ir_startino);
+}
+#endif	/* DEBUG */
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+	.rec_len		= sizeof(xfs_inobt_rec_t),
+	.key_len		= sizeof(xfs_inobt_key_t),
+
+	.dup_cursor		= xfs_inobt_dup_cursor,
+	.set_root		= xfs_inobt_set_root,
+	.alloc_block		= xfs_inobt_alloc_block,
+	.free_block		= xfs_inobt_free_block,
+	.get_minrecs		= xfs_inobt_get_minrecs,
+	.get_maxrecs		= xfs_inobt_get_maxrecs,
+	.init_key_from_rec	= xfs_inobt_init_key_from_rec,
+	.init_rec_from_key	= xfs_inobt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
+	.key_diff		= xfs_inobt_key_diff,
+	.buf_ops		= &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_inobt_keys_inorder,
+	.recs_inorder		= xfs_inobt_recs_inorder,
+#endif
+};
+
+static const struct xfs_btree_ops xfs_finobt_ops = {
+	.rec_len		= sizeof(xfs_inobt_rec_t),
+	.key_len		= sizeof(xfs_inobt_key_t),
+
+	.dup_cursor		= xfs_inobt_dup_cursor,
+	.set_root		= xfs_finobt_set_root,
+	.alloc_block		= xfs_inobt_alloc_block,
+	.free_block		= xfs_inobt_free_block,
+	.get_minrecs		= xfs_inobt_get_minrecs,
+	.get_maxrecs		= xfs_inobt_get_maxrecs,
+	.init_key_from_rec	= xfs_inobt_init_key_from_rec,
+	.init_rec_from_key	= xfs_inobt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_finobt_init_ptr_from_cur,
+	.key_diff		= xfs_inobt_key_diff,
+	.buf_ops		= &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_inobt_keys_inorder,
+	.recs_inorder		= xfs_inobt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new inode btree cursor.
+ */
+struct xfs_btree_cur *				/* new inode btree cursor */
+xfs_inobt_init_cursor(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_buf		*agbp,		/* buffer for agi structure */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_btnum_t		btnum)		/* ialloc or free ino btree */
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	struct xfs_btree_cur	*cur;
+
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_btnum = btnum;
+	if (btnum == XFS_BTNUM_INO) {
+		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+		cur->bc_ops = &xfs_inobt_ops;
+	} else {
+		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+		cur->bc_ops = &xfs_finobt_ops;
+	}
+
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_private.a.agno = agno;
+
+	return cur;
+}
+
+/*
+ * Calculate number of records in an inobt btree block.
+ */
+int
+xfs_inobt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	int			leaf)
+{
+	blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+
+	if (leaf)
+		return blocklen / sizeof(xfs_inobt_rec_t);
+	return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h
new file mode 100644
index 000000000..d7ebea72c
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IALLOC_BTREE_H__
+#define	__XFS_IALLOC_BTREE_H__
+
+/*
+ * Inode map on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_INOBT_BLOCK_LEN(mp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+	((xfs_inobt_rec_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+	((xfs_inobt_key_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+	((xfs_inobt_ptr_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 (maxrecs) * sizeof(xfs_inobt_key_t) + \
+		 ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
+		xfs_btnum_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+
+#endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_inode_buf.c b/kernel/fs/xfs/libxfs/xfs_inode_buf.c
new file mode 100644
index 000000000..002b6b3a1
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_inode_buf.c
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+
+/*
+ * Check that none of the inode's in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+	xfs_mount_t	*mp,
+	xfs_buf_t	*bp)
+{
+	int		i;
+	int		j;
+	xfs_dinode_t	*dip;
+
+	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+	for (i = 0; i < j; i++) {
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					i * mp->m_sb.sb_inodesize);
+		if (!dip->di_next_unlinked)  {
+			xfs_alert(mp,
+	"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
+				i, (long long)bp->b_bn);
+		}
+	}
+}
+#endif
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we don't want to mark it with an error,
+ * but we do want to clear the DONE status of the buffer so that a followup read
+ * will re-read it from disk. This will ensure that we don't get an unnecessary
+ * warnings during log recovery and we don't get unnecssary panics on debug
+ * kernels.
+ */
+static void
+xfs_inode_buf_verify(
+	struct xfs_buf	*bp,
+	bool		readahead)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	int		i;
+	int		ni;
+
+	/*
+	 * Validate the magic number and version of every inode in the buffer
+	 */
+	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+	for (i = 0; i < ni; i++) {
+		int		di_ok;
+		xfs_dinode_t	*dip;
+
+		dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+					(i << mp->m_sb.sb_inodelog));
+		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+			    XFS_DINODE_GOOD_VERSION(dip->di_version);
+		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+						XFS_ERRTAG_ITOBP_INOTOBP,
+						XFS_RANDOM_ITOBP_INOTOBP))) {
+			if (readahead) {
+				bp->b_flags &= ~XBF_DONE;
+				return;
+			}
+
+			xfs_buf_ioerror(bp, -EFSCORRUPTED);
+			xfs_verifier_error(bp);
+#ifdef DEBUG
+			xfs_alert(mp,
+				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
+				(unsigned long long)bp->b_bn, i,
+				be16_to_cpu(dip->di_magic));
+#endif
+		}
+	}
+	xfs_inobp_check(mp, bp);
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, false);
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, true);
+}
+
+static void
+xfs_inode_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, false);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+	.verify_read = xfs_inode_buf_read_verify,
+	.verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+	.verify_read = xfs_inode_buf_readahead_verify,
+	.verify_write = xfs_inode_buf_write_verify,
+};
+
+
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode.  It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
+ * pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and dipp are
+ * undefined.
+ */
+int
+xfs_imap_to_bp(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_imap		*imap,
+	struct xfs_dinode       **dipp,
+	struct xfs_buf		**bpp,
+	uint			buf_flags,
+	uint			iget_flags)
+{
+	struct xfs_buf		*bp;
+	int			error;
+
+	buf_flags |= XBF_UNMAPPED;
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+				   (int)imap->im_len, buf_flags, &bp,
+				   &xfs_inode_buf_ops);
+	if (error) {
+		if (error == -EAGAIN) {
+			ASSERT(buf_flags & XBF_TRYLOCK);
+			return error;
+		}
+
+		if (error == -EFSCORRUPTED &&
+		    (iget_flags & XFS_IGET_UNTRUSTED))
+			return -EINVAL;
+
+		xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+			__func__, error);
+		return error;
+	}
+
+	*bpp = bp;
+	*dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+	return 0;
+}
+
+void
+xfs_dinode_from_disk(
+	xfs_icdinode_t		*to,
+	xfs_dinode_t		*from)
+{
+	to->di_magic = be16_to_cpu(from->di_magic);
+	to->di_mode = be16_to_cpu(from->di_mode);
+	to->di_version = from ->di_version;
+	to->di_format = from->di_format;
+	to->di_onlink = be16_to_cpu(from->di_onlink);
+	to->di_uid = be32_to_cpu(from->di_uid);
+	to->di_gid = be32_to_cpu(from->di_gid);
+	to->di_nlink = be32_to_cpu(from->di_nlink);
+	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+	to->di_flushiter = be16_to_cpu(from->di_flushiter);
+	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+	to->di_size = be64_to_cpu(from->di_size);
+	to->di_nblocks = be64_to_cpu(from->di_nblocks);
+	to->di_extsize = be32_to_cpu(from->di_extsize);
+	to->di_nextents = be32_to_cpu(from->di_nextents);
+	to->di_anextents = be16_to_cpu(from->di_anextents);
+	to->di_forkoff = from->di_forkoff;
+	to->di_aformat	= from->di_aformat;
+	to->di_dmevmask	= be32_to_cpu(from->di_dmevmask);
+	to->di_dmstate	= be16_to_cpu(from->di_dmstate);
+	to->di_flags	= be16_to_cpu(from->di_flags);
+	to->di_gen	= be32_to_cpu(from->di_gen);
+
+	if (to->di_version == 3) {
+		to->di_changecount = be64_to_cpu(from->di_changecount);
+		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+		to->di_flags2 = be64_to_cpu(from->di_flags2);
+		to->di_ino = be64_to_cpu(from->di_ino);
+		to->di_lsn = be64_to_cpu(from->di_lsn);
+		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &from->di_uuid);
+	}
+}
+
+void
+xfs_dinode_to_disk(
+	xfs_dinode_t		*to,
+	xfs_icdinode_t		*from)
+{
+	to->di_magic = cpu_to_be16(from->di_magic);
+	to->di_mode = cpu_to_be16(from->di_mode);
+	to->di_version = from ->di_version;
+	to->di_format = from->di_format;
+	to->di_onlink = cpu_to_be16(from->di_onlink);
+	to->di_uid = cpu_to_be32(from->di_uid);
+	to->di_gid = cpu_to_be32(from->di_gid);
+	to->di_nlink = cpu_to_be32(from->di_nlink);
+	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+	to->di_size = cpu_to_be64(from->di_size);
+	to->di_nblocks = cpu_to_be64(from->di_nblocks);
+	to->di_extsize = cpu_to_be32(from->di_extsize);
+	to->di_nextents = cpu_to_be32(from->di_nextents);
+	to->di_anextents = cpu_to_be16(from->di_anextents);
+	to->di_forkoff = from->di_forkoff;
+	to->di_aformat = from->di_aformat;
+	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+	to->di_dmstate = cpu_to_be16(from->di_dmstate);
+	to->di_flags = cpu_to_be16(from->di_flags);
+	to->di_gen = cpu_to_be32(from->di_gen);
+
+	if (from->di_version == 3) {
+		to->di_changecount = cpu_to_be64(from->di_changecount);
+		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+		to->di_flags2 = cpu_to_be64(from->di_flags2);
+		to->di_ino = cpu_to_be64(from->di_ino);
+		to->di_lsn = cpu_to_be64(from->di_lsn);
+		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &from->di_uuid);
+		to->di_flushiter = 0;
+	} else {
+		to->di_flushiter = cpu_to_be16(from->di_flushiter);
+	}
+}
+
+static bool
+xfs_dinode_verify(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+		return false;
+
+	/* only version 3 or greater inodes are extensively verified here */
+	if (dip->di_version < 3)
+		return true;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			      XFS_DINODE_CRC_OFF))
+		return false;
+	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+		return false;
+	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	return true;
+}
+
+void
+xfs_dinode_calc_crc(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip)
+{
+	__uint32_t		crc;
+
+	if (dip->di_version < 3)
+		return;
+
+	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			      XFS_DINODE_CRC_OFF);
+	dip->di_crc = xfs_end_cksum(crc);
+}
+
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
+ */
+int
+xfs_iread(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		iget_flags)
+{
+	xfs_buf_t	*bp;
+	xfs_dinode_t	*dip;
+	int		error;
+
+	/*
+	 * Fill in the location information in the in-core inode.
+	 */
+	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+	if (error)
+		return error;
+
+	/* shortcut IO on inode allocation if possible */
+	if ((iget_flags & XFS_IGET_CREATE) &&
+	    xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+		/* initialise the on-disk inode core */
+		memset(&ip->i_d, 0, sizeof(ip->i_d));
+		ip->i_d.di_magic = XFS_DINODE_MAGIC;
+		ip->i_d.di_gen = prandom_u32();
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			ip->i_d.di_version = 3;
+			ip->i_d.di_ino = ip->i_ino;
+			uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+		} else
+			ip->i_d.di_version = 2;
+		return 0;
+	}
+
+	/*
+	 * Get pointers to the on-disk inode and the buffer containing it.
+	 */
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
+	if (error)
+		return error;
+
+	/* even unallocated inodes are verified */
+	if (!xfs_dinode_verify(mp, ip, dip)) {
+		xfs_alert(mp, "%s: validation failed for inode %lld failed",
+				__func__, ip->i_ino);
+
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+		error = -EFSCORRUPTED;
+		goto out_brelse;
+	}
+
+	/*
+	 * If the on-disk inode is already linked to a directory
+	 * entry, copy all of the inode into the in-core inode.
+	 * xfs_iformat_fork() handles copying in the inode format
+	 * specific information.
+	 * Otherwise, just get the truly permanent information.
+	 */
+	if (dip->di_mode) {
+		xfs_dinode_from_disk(&ip->i_d, dip);
+		error = xfs_iformat_fork(ip, dip);
+		if (error)  {
+#ifdef DEBUG
+			xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+				__func__, error);
+#endif /* DEBUG */
+			goto out_brelse;
+		}
+	} else {
+		/*
+		 * Partial initialisation of the in-core inode. Just the bits
+		 * that xfs_ialloc won't overwrite or relies on being correct.
+		 */
+		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+		ip->i_d.di_version = dip->di_version;
+		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+		if (dip->di_version == 3) {
+			ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+			uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+		}
+
+		/*
+		 * Make sure to pull in the mode here as well in
+		 * case the inode is released without being used.
+		 * This ensures that xfs_inactive() will see that
+		 * the inode is already free and not try to mess
+		 * with the uninitialized part of it.
+		 */
+		ip->i_d.di_mode = 0;
+	}
+
+	/*
+	 * Automatically convert version 1 inode formats in memory to version 2
+	 * inode format. If the inode is modified, it will get logged and
+	 * rewritten as a version 2 inode. We can do this because we set the
+	 * superblock feature bit for v2 inodes unconditionally during mount
+	 * and it means the reast of the code can assume the inode version is 2
+	 * or higher.
+	 */
+	if (ip->i_d.di_version == 1) {
+		ip->i_d.di_version = 2;
+		memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+		ip->i_d.di_nlink = ip->i_d.di_onlink;
+		ip->i_d.di_onlink = 0;
+		xfs_set_projid(ip, 0);
+	}
+
+	ip->i_delayed_blks = 0;
+
+	/*
+	 * Mark the buffer containing the inode as something to keep
+	 * around for a while.  This helps to keep recently accessed
+	 * meta-data in-core longer.
+	 */
+	xfs_buf_set_ref(bp, XFS_INO_REF);
+
+	/*
+	 * Use xfs_trans_brelse() to release the buffer containing the on-disk
+	 * inode, because it was acquired with xfs_trans_read_buf() in
+	 * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
+	 * will only release the buffer if it is not dirty within the
+	 * transaction.  It will be OK to release the buffer in this case,
+	 * because inodes on disk are never destroyed and we will be locking the
+	 * new in-core inode before putting it in the cache where other
+	 * processes can find it.  Thus we don't have to worry about the inode
+	 * being changed just because we released the buffer.
+	 */
+ out_brelse:
+	xfs_trans_brelse(tp, bp);
+	return error;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_inode_buf.h b/kernel/fs/xfs/libxfs/xfs_inode_buf.h
new file mode 100644
index 000000000..9308c47f2
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_inode_buf.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_INODE_BUF_H__
+#define	__XFS_INODE_BUF_H__
+
+struct xfs_inode;
+struct xfs_dinode;
+struct xfs_icdinode;
+
+/*
+ * Inode location information.  Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ */
+struct xfs_imap {
+	xfs_daddr_t	im_blkno;	/* starting BB of inode chunk */
+	ushort		im_len;		/* length in BBs of inode chunk */
+	ushort		im_boffset;	/* inode offset in block in bytes */
+};
+
+int	xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+		       struct xfs_imap *, struct xfs_dinode **,
+		       struct xfs_buf **, uint, uint);
+int	xfs_iread(struct xfs_mount *, struct xfs_trans *,
+		  struct xfs_inode *, uint);
+void	xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void	xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
+void	xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
+
+#if defined(DEBUG)
+void	xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define	xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+
+#endif	/* __XFS_INODE_BUF_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_inode_fork.c b/kernel/fs/xfs/libxfs/xfs_inode_fork.c
new file mode 100644
index 000000000..0defbd02f
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_inode_fork.c
@@ -0,0 +1,1902 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/log2.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_attr_sf.h"
+
+kmem_zone_t *xfs_ifork_zone;
+
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+#ifdef DEBUG
+/*
+ * Make sure that the extents in the given memory buffer
+ * are valid.
+ */
+void
+xfs_validate_extents(
+	xfs_ifork_t		*ifp,
+	int			nrecs,
+	xfs_exntfmt_t		fmt)
+{
+	xfs_bmbt_irec_t		irec;
+	xfs_bmbt_rec_host_t	rec;
+	int			i;
+
+	for (i = 0; i < nrecs; i++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+		rec.l0 = get_unaligned(&ep->l0);
+		rec.l1 = get_unaligned(&ep->l1);
+		xfs_bmbt_get_all(&rec, &irec);
+		if (fmt == XFS_EXTFMT_NOSTATE)
+			ASSERT(irec.br_state == XFS_EXT_NORM);
+	}
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ifp, nrecs, fmt)
+#endif /* DEBUG */
+
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode.  For fifos, devs, and sockets
+ * this means set if_rdev to the proper value.  For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers.  For a file in B-tree format, only the root is immediately
+ * brought in-core.  The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ */
+int
+xfs_iformat_fork(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip)
+{
+	xfs_attr_shortform_t	*atp;
+	int			size;
+	int			error = 0;
+	xfs_fsize_t             di_size;
+
+	if (unlikely(be32_to_cpu(dip->di_nextents) +
+		     be16_to_cpu(dip->di_anextents) >
+		     be64_to_cpu(dip->di_nblocks))) {
+		xfs_warn(ip->i_mount,
+			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
+			(unsigned long long)ip->i_ino,
+			(int)(be32_to_cpu(dip->di_nextents) +
+			      be16_to_cpu(dip->di_anextents)),
+			(unsigned long long)
+				be64_to_cpu(dip->di_nblocks));
+		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
+				     ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+
+	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+		xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
+			(unsigned long long)ip->i_ino,
+			dip->di_forkoff);
+		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
+				     ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+
+	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+		     !ip->i_mount->m_rtdev_targp)) {
+		xfs_warn(ip->i_mount,
+			"corrupt dinode %Lu, has realtime flag set.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
+			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
+					      ip->i_mount, dip);
+			return -EFSCORRUPTED;
+		}
+		ip->i_d.di_size = 0;
+		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+		break;
+
+	case S_IFREG:
+	case S_IFLNK:
+	case S_IFDIR:
+		switch (dip->di_format) {
+		case XFS_DINODE_FMT_LOCAL:
+			/*
+			 * no local regular files yet
+			 */
+			if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
+				xfs_warn(ip->i_mount,
+			"corrupt inode %Lu (local format for regular file).",
+					(unsigned long long) ip->i_ino);
+				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
+						     XFS_ERRLEVEL_LOW,
+						     ip->i_mount, dip);
+				return -EFSCORRUPTED;
+			}
+
+			di_size = be64_to_cpu(dip->di_size);
+			if (unlikely(di_size < 0 ||
+				     di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
+				xfs_warn(ip->i_mount,
+			"corrupt inode %Lu (bad size %Ld for local inode).",
+					(unsigned long long) ip->i_ino,
+					(long long) di_size);
+				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
+						     XFS_ERRLEVEL_LOW,
+						     ip->i_mount, dip);
+				return -EFSCORRUPTED;
+			}
+
+			size = (int)di_size;
+			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+			break;
+		case XFS_DINODE_FMT_EXTENTS:
+			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+			break;
+		case XFS_DINODE_FMT_BTREE:
+			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+			break;
+		default:
+			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
+					 ip->i_mount);
+			return -EFSCORRUPTED;
+		}
+		break;
+
+	default:
+		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
+		return -EFSCORRUPTED;
+	}
+	if (error) {
+		return error;
+	}
+	if (!XFS_DFORK_Q(dip))
+		return 0;
+
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+
+	switch (dip->di_aformat) {
+	case XFS_DINODE_FMT_LOCAL:
+		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+		size = be16_to_cpu(atp->hdr.totsize);
+
+		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+			xfs_warn(ip->i_mount,
+				"corrupt inode %Lu (bad attr fork size %Ld).",
+				(unsigned long long) ip->i_ino,
+				(long long) size);
+			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+					     XFS_ERRLEVEL_LOW,
+					     ip->i_mount, dip);
+			return -EFSCORRUPTED;
+		}
+
+		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+		break;
+	default:
+		error = -EFSCORRUPTED;
+		break;
+	}
+	if (error) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+	}
+	return error;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ * If it fits into if_inline_data, then copy
+ * it there, otherwise allocate a buffer for it
+ * and copy the data there.  Either way, set
+ * if_data to point at the data.
+ * If we allocate a buffer for the data, make
+ * sure that its size is a multiple of 4 and
+ * record the real size in i_real_bytes.
+ */
+STATIC int
+xfs_iformat_local(
+	xfs_inode_t	*ip,
+	xfs_dinode_t	*dip,
+	int		whichfork,
+	int		size)
+{
+	xfs_ifork_t	*ifp;
+	int		real_size;
+
+	/*
+	 * If the size is unreasonable, then something
+	 * is wrong and we just bail out rather than crash in
+	 * kmem_alloc() or memcpy() below.
+	 */
+	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+		xfs_warn(ip->i_mount,
+	"corrupt inode %Lu (bad size %d for local fork, size = %d).",
+			(unsigned long long) ip->i_ino, size,
+			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
+				     ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	real_size = 0;
+	if (size == 0)
+		ifp->if_u1.if_data = NULL;
+	else if (size <= sizeof(ifp->if_u2.if_inline_data))
+		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+	else {
+		real_size = roundup(size, 4);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+	}
+	ifp->if_bytes = size;
+	ifp->if_real_bytes = real_size;
+	if (size)
+		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
+	ifp->if_flags &= ~XFS_IFEXTENTS;
+	ifp->if_flags |= XFS_IFINLINE;
+	return 0;
+}
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it.  Either way, set if_extents
+ * to point at the extents.
+ */
+STATIC int
+xfs_iformat_extents(
+	xfs_inode_t	*ip,
+	xfs_dinode_t	*dip,
+	int		whichfork)
+{
+	xfs_bmbt_rec_t	*dp;
+	xfs_ifork_t	*ifp;
+	int		nex;
+	int		size;
+	int		i;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+	size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+
+	/*
+	 * If the number of extents is unreasonable, then something
+	 * is wrong and we just bail out rather than crash in
+	 * kmem_alloc() or memcpy() below.
+	 */
+	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
+			(unsigned long long) ip->i_ino, nex);
+		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
+				     ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+
+	ifp->if_real_bytes = 0;
+	if (nex == 0)
+		ifp->if_u1.if_extents = NULL;
+	else if (nex <= XFS_INLINE_EXTS)
+		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+	else
+		xfs_iext_add(ifp, 0, nex);
+
+	ifp->if_bytes = size;
+	if (size) {
+		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
+		for (i = 0; i < nex; i++, dp++) {
+			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+			ep->l0 = get_unaligned_be64(&dp->l0);
+			ep->l1 = get_unaligned_be64(&dp->l1);
+		}
+		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
+		if (whichfork != XFS_DATA_FORK ||
+			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
+				if (unlikely(xfs_check_nostate_extents(
+				    ifp, 0, nex))) {
+					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+							 XFS_ERRLEVEL_LOW,
+							 ip->i_mount);
+					return -EFSCORRUPTED;
+				}
+	}
+	ifp->if_flags |= XFS_IFEXTENTS;
+	return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it.  The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip,
+	int			whichfork)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_bmdr_block_t	*dfp;
+	xfs_ifork_t		*ifp;
+	/* REFERENCED */
+	int			nrecs;
+	int			size;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+	nrecs = be16_to_cpu(dfp->bb_numrecs);
+
+	/*
+	 * blow out if -- fork has less extents than can fit in
+	 * fork (fork shouldn't be a btree format), root btree
+	 * block has more records than can fit into the fork,
+	 * or the number of extents is greater than the number of
+	 * blocks.
+	 */
+	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+					XFS_IFORK_MAXEXT(ip, whichfork) ||
+		     XFS_BMDR_SPACE_CALC(nrecs) >
+					XFS_DFORK_SIZE(dip, mp, whichfork) ||
+		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+		xfs_warn(mp, "corrupt inode %Lu (btree).",
+					(unsigned long long) ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+					 mp, dip);
+		return -EFSCORRUPTED;
+	}
+
+	ifp->if_broot_bytes = size;
+	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+	ASSERT(ifp->if_broot != NULL);
+	/*
+	 * Copy and convert from the on-disk structure
+	 * to the in-memory structure.
+	 */
+	xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+			 ifp->if_broot, size);
+	ifp->if_flags &= ~XFS_IFEXTENTS;
+	ifp->if_flags |= XFS_IFBROOT;
+
+	return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	int		error;
+	xfs_ifork_t	*ifp;
+	xfs_extnum_t	nextents;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+				 ip->i_mount);
+		return -EFSCORRUPTED;
+	}
+	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	/*
+	 * We know that the size is valid (it's checked in iformat_btree)
+	 */
+	ifp->if_bytes = ifp->if_real_bytes = 0;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	xfs_iext_add(ifp, 0, nextents);
+	error = xfs_bmap_read_extents(tp, ip, whichfork);
+	if (error) {
+		xfs_iext_destroy(ifp);
+		ifp->if_flags &= ~XFS_IFEXTENTS;
+		return error;
+	}
+	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+	return 0;
+}
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff.  Move the records
+ * and pointers in if_broot to fit the new size.  When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller.  When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root.  If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated.  The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * ext_diff -- the change in the number of records, positive or negative,
+ *	 requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+	xfs_inode_t		*ip,
+	int			rec_diff,
+	int			whichfork)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			cur_max;
+	xfs_ifork_t		*ifp;
+	struct xfs_btree_block	*new_broot;
+	int			new_max;
+	size_t			new_size;
+	char			*np;
+	char			*op;
+
+	/*
+	 * Handle the degenerate case quietly.
+	 */
+	if (rec_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (rec_diff > 0) {
+		/*
+		 * If there wasn't any memory allocated before, just
+		 * allocate it now and get out.
+		 */
+		if (ifp->if_broot_bytes == 0) {
+			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+			ifp->if_broot_bytes = (int)new_size;
+			return;
+		}
+
+		/*
+		 * If there is already an existing if_broot, then we need
+		 * to realloc() it and shift the pointers to their new
+		 * location.  The records don't change location because
+		 * they are kept butted up against the btree block header.
+		 */
+		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+		new_max = cur_max + rec_diff;
+		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+				XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
+				KM_SLEEP | KM_NOFS);
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+						     ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+						     (int)new_size);
+		ifp->if_broot_bytes = (int)new_size;
+		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			XFS_IFORK_SIZE(ip, whichfork));
+		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
+		return;
+	}
+
+	/*
+	 * rec_diff is less than 0.  In this case, we are shrinking the
+	 * if_broot buffer.  It must already exist.  If we go to zero
+	 * records, just get rid of the root and clear the status bit.
+	 */
+	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+	new_max = cur_max + rec_diff;
+	ASSERT(new_max >= 0);
+	if (new_max > 0)
+		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+	else
+		new_size = 0;
+	if (new_size > 0) {
+		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+		/*
+		 * First copy over the btree block header.
+		 */
+		memcpy(new_broot, ifp->if_broot,
+			XFS_BMBT_BLOCK_LEN(ip->i_mount));
+	} else {
+		new_broot = NULL;
+		ifp->if_flags &= ~XFS_IFBROOT;
+	}
+
+	/*
+	 * Only copy the records and pointers if there are any.
+	 */
+	if (new_max > 0) {
+		/*
+		 * First copy the records.
+		 */
+		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+		/*
+		 * Then copy the pointers.
+		 */
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+						     ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+						     (int)new_size);
+		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
+	}
+	kmem_free(ifp->if_broot);
+	ifp->if_broot = new_broot;
+	ifp->if_broot_bytes = (int)new_size;
+	if (ifp->if_broot)
+		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			XFS_IFORK_SIZE(ip, whichfork));
+	return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased.  The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *	 requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+	xfs_inode_t	*ip,
+	int		byte_diff,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+	int		new_size;
+	int		real_size;
+
+	if (byte_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	new_size = (int)ifp->if_bytes + byte_diff;
+	ASSERT(new_size >= 0);
+
+	if (new_size == 0) {
+		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			kmem_free(ifp->if_u1.if_data);
+		}
+		ifp->if_u1.if_data = NULL;
+		real_size = 0;
+	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+		/*
+		 * If the valid extents/data can fit in if_inline_ext/data,
+		 * copy them from the malloc'd vector and free it.
+		 */
+		if (ifp->if_u1.if_data == NULL) {
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			ASSERT(ifp->if_real_bytes != 0);
+			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+			      new_size);
+			kmem_free(ifp->if_u1.if_data);
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		}
+		real_size = 0;
+	} else {
+		/*
+		 * Stuck with malloc/realloc.
+		 * For inline data, the underlying buffer must be
+		 * a multiple of 4 bytes in size so that it can be
+		 * logged and stay on word boundaries.  We enforce
+		 * that here.
+		 */
+		real_size = roundup(new_size, 4);
+		if (ifp->if_u1.if_data == NULL) {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+							KM_SLEEP | KM_NOFS);
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			/*
+			 * Only do the realloc if the underlying size
+			 * is really changing.
+			 */
+			if (ifp->if_real_bytes != real_size) {
+				ifp->if_u1.if_data =
+					kmem_realloc(ifp->if_u1.if_data,
+							real_size,
+							ifp->if_real_bytes,
+							KM_SLEEP | KM_NOFS);
+			}
+		} else {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+							KM_SLEEP | KM_NOFS);
+			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+				ifp->if_bytes);
+		}
+	}
+	ifp->if_real_bytes = real_size;
+	ifp->if_bytes = new_size;
+	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+
+void
+xfs_idestroy_fork(
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (ifp->if_broot != NULL) {
+		kmem_free(ifp->if_broot);
+		ifp->if_broot = NULL;
+	}
+
+	/*
+	 * If the format is local, then we can't have an extents
+	 * array so just look for an inline data array.  If we're
+	 * not local then we may or may not have an extents list,
+	 * so check and free it up if we do.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+		    (ifp->if_u1.if_data != NULL)) {
+			ASSERT(ifp->if_real_bytes != 0);
+			kmem_free(ifp->if_u1.if_data);
+			ifp->if_u1.if_data = NULL;
+			ifp->if_real_bytes = 0;
+		}
+	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+		   ((ifp->if_flags & XFS_IFEXTIREC) ||
+		    ((ifp->if_u1.if_extents != NULL) &&
+		     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
+		ASSERT(ifp->if_real_bytes != 0);
+		xfs_iext_destroy(ifp);
+	}
+	ASSERT(ifp->if_u1.if_extents == NULL ||
+	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+	ASSERT(ifp->if_real_bytes == 0);
+	if (whichfork == XFS_ATTR_FORK) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+	}
+}
+
+/*
+ * Convert in-core extents to on-disk form
+ *
+ * For either the data or attr fork in extent format, we need to endian convert
+ * the in-core extent as we place them into the on-disk inode.
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only copy on-disk extents
+ * here, so callers must always use the physical fork size to determine the
+ * size of the buffer passed to this routine.  We will return the size actually
+ * used.
+ */
+int
+xfs_iextents_copy(
+	xfs_inode_t		*ip,
+	xfs_bmbt_rec_t		*dp,
+	int			whichfork)
+{
+	int			copied;
+	int			i;
+	xfs_ifork_t		*ifp;
+	int			nrecs;
+	xfs_fsblock_t		start_block;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	ASSERT(ifp->if_bytes > 0);
+
+	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
+	ASSERT(nrecs > 0);
+
+	/*
+	 * There are some delayed allocation extents in the
+	 * inode, so copy the extents one at a time and skip
+	 * the delayed ones.  There must be at least one
+	 * non-delayed extent.
+	 */
+	copied = 0;
+	for (i = 0; i < nrecs; i++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+		start_block = xfs_bmbt_get_startblock(ep);
+		if (isnullstartblock(start_block)) {
+			/*
+			 * It's a delayed allocation extent, so skip it.
+			 */
+			continue;
+		}
+
+		/* Translate to on disk format */
+		put_unaligned_be64(ep->l0, &dp->l0);
+		put_unaligned_be64(ep->l1, &dp->l1);
+		dp++;
+		copied++;
+	}
+	ASSERT(copied != 0);
+	xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
+
+	return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip,
+	xfs_inode_log_item_t	*iip,
+	int			whichfork)
+{
+	char			*cp;
+	xfs_ifork_t		*ifp;
+	xfs_mount_t		*mp;
+	static const short	brootflag[2] =
+		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+	static const short	dataflag[2] =
+		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+	static const short	extflag[2] =
+		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+	if (!iip)
+		return;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/*
+	 * This can happen if we gave up in iformat in an error path,
+	 * for the attribute fork.
+	 */
+	if (!ifp) {
+		ASSERT(whichfork == XFS_ATTR_FORK);
+		return;
+	}
+	cp = XFS_DFORK_PTR(dip, whichfork);
+	mp = ip->i_mount;
+	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+	case XFS_DINODE_FMT_LOCAL:
+		if ((iip->ili_fields & dataflag[whichfork]) &&
+		    (ifp->if_bytes > 0)) {
+			ASSERT(ifp->if_u1.if_data != NULL);
+			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+		}
+		break;
+
+	case XFS_DINODE_FMT_EXTENTS:
+		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+		       !(iip->ili_fields & extflag[whichfork]));
+		if ((iip->ili_fields & extflag[whichfork]) &&
+		    (ifp->if_bytes > 0)) {
+			ASSERT(xfs_iext_get_ext(ifp, 0));
+			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+				whichfork);
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		if ((iip->ili_fields & brootflag[whichfork]) &&
+		    (ifp->if_broot_bytes > 0)) {
+			ASSERT(ifp->if_broot != NULL);
+			ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			        XFS_IFORK_SIZE(ip, whichfork));
+			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+				(xfs_bmdr_block_t *)cp,
+				XFS_DFORK_SIZE(dip, mp, whichfork));
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+		if (iip->ili_fields & XFS_ILOG_DEV) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+		}
+		break;
+
+	case XFS_DINODE_FMT_UUID:
+		if (iip->ili_fields & XFS_ILOG_UUID) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			memcpy(XFS_DFORK_DPTR(dip),
+			       &ip->i_df.if_u2.if_uuid,
+			       sizeof(uuid_t));
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/*
+ * Return a pointer to the extent record at file index idx.
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx)		/* index of target extent */
+{
+	ASSERT(idx >= 0);
+	ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
+	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
+		return ifp->if_u1.if_ext_irec->er_extbuf;
+	} else if (ifp->if_flags & XFS_IFEXTIREC) {
+		xfs_ext_irec_t	*erp;		/* irec pointer */
+		int		erp_idx = 0;	/* irec index */
+		xfs_extnum_t	page_idx = idx;	/* ext index in target list */
+
+		erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+		return &erp->er_extbuf[page_idx];
+	} else if (ifp->if_bytes) {
+		return &ifp->if_u1.if_extents[idx];
+	} else {
+		return NULL;
+	}
+}
+
+/*
+ * Insert new item(s) into the extent records for incore inode
+ * fork 'ifp'.  'count' new items are inserted at index 'idx'.
+ */
+void
+xfs_iext_insert(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting index of new items */
+	xfs_extnum_t	count,		/* number of inserted items */
+	xfs_bmbt_irec_t	*new,		/* items to insert */
+	int		state)		/* type of extent conversion */
+{
+	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_extnum_t	i;		/* extent record index */
+
+	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	xfs_iext_add(ifp, idx, count);
+	for (i = idx; i < idx + count; i++, new++)
+		xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased. The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ */
+void
+xfs_iext_add(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx,		/* index to begin adding exts */
+	int		ext_diff)	/* number of extents to add */
+{
+	int		byte_diff;	/* new bytes being added */
+	int		new_size;	/* size of extents after adding */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT((idx >= 0) && (idx <= nextents));
+	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+	new_size = ifp->if_bytes + byte_diff;
+	/*
+	 * If the new number of extents (nextents + ext_diff)
+	 * fits inside the inode, then continue to use the inline
+	 * extent buffer.
+	 */
+	if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+		if (idx < nextents) {
+			memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+				&ifp->if_u2.if_inline_ext[idx],
+				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
+			memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+		}
+		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+		ifp->if_real_bytes = 0;
+	}
+	/*
+	 * Otherwise use a linear (direct) extent list.
+	 * If the extents are currently inside the inode,
+	 * xfs_iext_realloc_direct will switch us from
+	 * inline to direct extent allocation mode.
+	 */
+	else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+		xfs_iext_realloc_direct(ifp, new_size);
+		if (idx < nextents) {
+			memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+				&ifp->if_u1.if_extents[idx],
+				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
+			memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+		}
+	}
+	/* Indirection array */
+	else {
+		xfs_ext_irec_t	*erp;
+		int		erp_idx = 0;
+		int		page_idx = idx;
+
+		ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+		if (ifp->if_flags & XFS_IFEXTIREC) {
+			erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+		} else {
+			xfs_iext_irec_init(ifp);
+			ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+			erp = ifp->if_u1.if_ext_irec;
+		}
+		/* Extents fit in target extent page */
+		if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+			if (page_idx < erp->er_extcount) {
+				memmove(&erp->er_extbuf[page_idx + ext_diff],
+					&erp->er_extbuf[page_idx],
+					(erp->er_extcount - page_idx) *
+					sizeof(xfs_bmbt_rec_t));
+				memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+			}
+			erp->er_extcount += ext_diff;
+			xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		}
+		/* Insert a new extent page */
+		else if (erp) {
+			xfs_iext_add_indirect_multi(ifp,
+				erp_idx, page_idx, ext_diff);
+		}
+		/*
+		 * If extent(s) are being appended to the last page in
+		 * the indirection array and the new extent(s) don't fit
+		 * in the page, then erp is NULL and erp_idx is set to
+		 * the next index needed in the indirection array.
+		 */
+		else {
+			uint	count = ext_diff;
+
+			while (count) {
+				erp = xfs_iext_irec_new(ifp, erp_idx);
+				erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+				count -= erp->er_extcount;
+				if (count)
+					erp_idx++;
+			}
+		}
+	}
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list. The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ *    |-------|   |-------|
+ *    |       |   |       |    idx - number of extents before idx
+ *    |  idx  |   | count |
+ *    |       |   |       |    count - number of extents being inserted at idx
+ *    |-------|   |-------|
+ *    | count |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_add_indirect_multi(
+	xfs_ifork_t	*ifp,			/* inode fork pointer */
+	int		erp_idx,		/* target extent irec index */
+	xfs_extnum_t	idx,			/* index within target list */
+	int		count)			/* new extents being added */
+{
+	int		byte_diff;		/* new bytes being added */
+	xfs_ext_irec_t	*erp;			/* pointer to irec entry */
+	xfs_extnum_t	ext_diff;		/* number of extents to add */
+	xfs_extnum_t	ext_cnt;		/* new extents still needed */
+	xfs_extnum_t	nex2;			/* extents after idx + count */
+	xfs_bmbt_rec_t	*nex2_ep = NULL;	/* temp list for nex2 extents */
+	int		nlists;			/* number of irec's (lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	erp = &ifp->if_u1.if_ext_irec[erp_idx];
+	nex2 = erp->er_extcount - idx;
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+	/*
+	 * Save second part of target extent list
+	 * (all extents past */
+	if (nex2) {
+		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+		erp->er_extcount -= nex2;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+		memset(&erp->er_extbuf[idx], 0, byte_diff);
+	}
+
+	/*
+	 * Add the new extents to the end of the target
+	 * list, then allocate new irec record(s) and
+	 * extent buffer(s) as needed to store the rest
+	 * of the new extents.
+	 */
+	ext_cnt = count;
+	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+	if (ext_diff) {
+		erp->er_extcount += ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		ext_cnt -= ext_diff;
+	}
+	while (ext_cnt) {
+		erp_idx++;
+		erp = xfs_iext_irec_new(ifp, erp_idx);
+		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+		erp->er_extcount = ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		ext_cnt -= ext_diff;
+	}
+
+	/* Add nex2 extents back to indirection array */
+	if (nex2) {
+		xfs_extnum_t	ext_avail;
+		int		i;
+
+		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+		i = 0;
+		/*
+		 * If nex2 extents fit in the current page, append
+		 * nex2_ep after the new extents.
+		 */
+		if (nex2 <= ext_avail) {
+			i = erp->er_extcount;
+		}
+		/*
+		 * Otherwise, check if space is available in the
+		 * next page.
+		 */
+		else if ((erp_idx < nlists - 1) &&
+			 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+			  ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+			erp_idx++;
+			erp++;
+			/* Create a hole for nex2 extents */
+			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+		}
+		/*
+		 * Final choice, create a new extent page for
+		 * nex2 extents.
+		 */
+		else {
+			erp_idx++;
+			erp = xfs_iext_irec_new(ifp, erp_idx);
+		}
+		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+		kmem_free(nex2_ep);
+		erp->er_extcount += nex2;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+	}
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be decreased. The ext_diff parameter stores the
+ * number of extents to be removed and the idx parameter contains
+ * the extent index where the extents will be removed from.
+ *
+ * If the amount of space needed has decreased below the linear
+ * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
+ * extent array.  Otherwise, use kmem_realloc() to adjust the
+ * size to what is needed.
+ */
+void
+xfs_iext_remove(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index to begin removing exts */
+	int		ext_diff,	/* number of extents to remove */
+	int		state)		/* type of extent conversion */
+{
+	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		new_size;	/* size of extents after removal */
+
+	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
+	ASSERT(ext_diff > 0);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
+
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+	} else if (ifp->if_flags & XFS_IFEXTIREC) {
+		xfs_iext_remove_indirect(ifp, idx, ext_diff);
+	} else if (ifp->if_real_bytes) {
+		xfs_iext_remove_direct(ifp, idx, ext_diff);
+	} else {
+		xfs_iext_remove_inline(ifp, idx, ext_diff);
+	}
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This removes ext_diff extents from the inline buffer, beginning
+ * at extent index idx.
+ */
+void
+xfs_iext_remove_inline(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx,		/* index to begin removing exts */
+	int		ext_diff)	/* number of extents to remove */
+{
+	int		nextents;	/* number of extents in file */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	ASSERT(idx < XFS_INLINE_EXTS);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(((nextents - ext_diff) > 0) &&
+		(nextents - ext_diff) < XFS_INLINE_EXTS);
+
+	if (idx + ext_diff < nextents) {
+		memmove(&ifp->if_u2.if_inline_ext[idx],
+			&ifp->if_u2.if_inline_ext[idx + ext_diff],
+			(nextents - (idx + ext_diff)) *
+			 sizeof(xfs_bmbt_rec_t));
+		memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
+			0, ext_diff * sizeof(xfs_bmbt_rec_t));
+	} else {
+		memset(&ifp->if_u2.if_inline_ext[idx], 0,
+			ext_diff * sizeof(xfs_bmbt_rec_t));
+	}
+}
+
+/*
+ * This removes ext_diff extents from a linear (direct) extent list,
+ * beginning at extent index idx. If the extents are being removed
+ * from the end of the list (ie. truncate) then we just need to re-
+ * allocate the list to remove the extra space. Otherwise, if the
+ * extents are being removed from the middle of the existing extent
+ * entries, then we first need to move the extent records beginning
+ * at idx + ext_diff up in the list to overwrite the records being
+ * removed, then remove the extra space via kmem_realloc.
+ */
+void
+xfs_iext_remove_direct(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx,		/* index to begin removing exts */
+	int		ext_diff)	/* number of extents to remove */
+{
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		new_size;	/* size of extents after removal */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	new_size = ifp->if_bytes -
+		(ext_diff * sizeof(xfs_bmbt_rec_t));
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+		return;
+	}
+	/* Move extents up in the list (if needed) */
+	if (idx + ext_diff < nextents) {
+		memmove(&ifp->if_u1.if_extents[idx],
+			&ifp->if_u1.if_extents[idx + ext_diff],
+			(nextents - (idx + ext_diff)) *
+			 sizeof(xfs_bmbt_rec_t));
+	}
+	memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+		0, ext_diff * sizeof(xfs_bmbt_rec_t));
+	/*
+	 * Reallocate the direct extent list. If the extents
+	 * will fit inside the inode then xfs_iext_realloc_direct
+	 * will switch from direct to inline extent allocation
+	 * mode for us.
+	 */
+	xfs_iext_realloc_direct(ifp, new_size);
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ *    |-------|   |-------|
+ *    | nex1  |   |       |    nex1 - number of extents before idx
+ *    |-------|   | count |
+ *    |       |   |       |    count - number of extents being removed at idx
+ *    | count |   |-------|
+ *    |       |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_remove_indirect(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx,		/* index to begin removing extents */
+	int		count)		/* number of extents to remove */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	int		erp_idx = 0;	/* indirection array index */
+	xfs_extnum_t	ext_cnt;	/* extents left to remove */
+	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
+	xfs_extnum_t	nex1;		/* number of extents before idx */
+	xfs_extnum_t	nex2;		/* extents after idx + count */
+	int		page_idx = idx;	/* index in target extent list */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
+	ASSERT(erp != NULL);
+	nex1 = page_idx;
+	ext_cnt = count;
+	while (ext_cnt) {
+		nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+		ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+		/*
+		 * Check for deletion of entire list;
+		 * xfs_iext_irec_remove() updates extent offsets.
+		 */
+		if (ext_diff == erp->er_extcount) {
+			xfs_iext_irec_remove(ifp, erp_idx);
+			ext_cnt -= ext_diff;
+			nex1 = 0;
+			if (ext_cnt) {
+				ASSERT(erp_idx < ifp->if_real_bytes /
+					XFS_IEXT_BUFSZ);
+				erp = &ifp->if_u1.if_ext_irec[erp_idx];
+				nex1 = 0;
+				continue;
+			} else {
+				break;
+			}
+		}
+		/* Move extents up (if needed) */
+		if (nex2) {
+			memmove(&erp->er_extbuf[nex1],
+				&erp->er_extbuf[nex1 + ext_diff],
+				nex2 * sizeof(xfs_bmbt_rec_t));
+		}
+		/* Zero out rest of page */
+		memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+			((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+		/* Update remaining counters */
+		erp->er_extcount -= ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+		ext_cnt -= ext_diff;
+		nex1 = 0;
+		erp_idx++;
+		erp++;
+	}
+	ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+	xfs_iext_irec_compact(ifp);
+}
+
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ */
+void
+xfs_iext_realloc_direct(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		new_size)	/* new size of extents after adding */
+{
+	int		rnew_size;	/* real new size of extents */
+
+	rnew_size = new_size;
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+		((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+		 (new_size != ifp->if_real_bytes)));
+
+	/* Free extent records */
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+	}
+	/* Resize direct extent list and zero any new bytes */
+	else if (ifp->if_real_bytes) {
+		/* Check if extents will fit inside the inode */
+		if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+			xfs_iext_direct_to_inline(ifp, new_size /
+				(uint)sizeof(xfs_bmbt_rec_t));
+			ifp->if_bytes = new_size;
+			return;
+		}
+		if (!is_power_of_2(new_size)){
+			rnew_size = roundup_pow_of_two(new_size);
+		}
+		if (rnew_size != ifp->if_real_bytes) {
+			ifp->if_u1.if_extents =
+				kmem_realloc(ifp->if_u1.if_extents,
+						rnew_size,
+						ifp->if_real_bytes, KM_NOFS);
+		}
+		if (rnew_size > ifp->if_real_bytes) {
+			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t)], 0,
+				rnew_size - ifp->if_real_bytes);
+		}
+	}
+	/* Switch from the inline extent buffer to a direct extent list */
+	else {
+		if (!is_power_of_2(new_size)) {
+			rnew_size = roundup_pow_of_two(new_size);
+		}
+		xfs_iext_inline_to_direct(ifp, rnew_size);
+	}
+	ifp->if_real_bytes = rnew_size;
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * Switch from linear (direct) extent records to inline buffer.
+ */
+void
+xfs_iext_direct_to_inline(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	nextents)	/* number of extents in file */
+{
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(nextents <= XFS_INLINE_EXTS);
+	/*
+	 * The inline buffer was zeroed when we switched
+	 * from inline to direct extent allocation mode,
+	 * so we don't need to clear it here.
+	 */
+	memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+		nextents * sizeof(xfs_bmbt_rec_t));
+	kmem_free(ifp->if_u1.if_extents);
+	ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+	ifp->if_real_bytes = 0;
+}
+
+/*
+ * Switch from inline buffer to linear (direct) extent records.
+ * new_size should already be rounded up to the next power of 2
+ * by the caller (when appropriate), so use new_size as it is.
+ * However, since new_size may be rounded up, we can't update
+ * if_bytes here. It is the caller's responsibility to update
+ * if_bytes upon return.
+ */
+void
+xfs_iext_inline_to_direct(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		new_size)	/* number of extents in file */
+{
+	ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
+	memset(ifp->if_u1.if_extents, 0, new_size);
+	if (ifp->if_bytes) {
+		memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
+			ifp->if_bytes);
+		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+			sizeof(xfs_bmbt_rec_t));
+	}
+	ifp->if_real_bytes = new_size;
+}
+
+/*
+ * Resize an extent indirection array to new_size bytes.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		new_size)	/* new indirection array size */
+{
+	int		nlists;		/* number of irec's (ex lists) */
+	int		size;		/* current indirection array size */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	size = nlists * sizeof(xfs_ext_irec_t);
+	ASSERT(ifp->if_real_bytes);
+	ASSERT((new_size >= 0) && (new_size != size));
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+	} else {
+		ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+			kmem_realloc(ifp->if_u1.if_ext_irec,
+				new_size, size, KM_NOFS);
+	}
+}
+
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+	 xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		size;		/* size of file extents */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(nextents <= XFS_LINEAR_EXTS);
+	size = nextents * sizeof(xfs_bmbt_rec_t);
+
+	xfs_iext_irec_compact_pages(ifp);
+	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+
+	ep = ifp->if_u1.if_ext_irec->er_extbuf;
+	kmem_free(ifp->if_u1.if_ext_irec);
+	ifp->if_flags &= ~XFS_IFEXTIREC;
+	ifp->if_u1.if_extents = ep;
+	ifp->if_bytes = size;
+	if (nextents < XFS_LINEAR_EXTS) {
+		xfs_iext_realloc_direct(ifp, size);
+	}
+}
+
+/*
+ * Free incore file extents.
+ */
+void
+xfs_iext_destroy(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		int	erp_idx;
+		int	nlists;
+
+		nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+		for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
+			xfs_iext_irec_remove(ifp, erp_idx);
+		}
+		ifp->if_flags &= ~XFS_IFEXTIREC;
+	} else if (ifp->if_real_bytes) {
+		kmem_free(ifp->if_u1.if_extents);
+	} else if (ifp->if_bytes) {
+		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+			sizeof(xfs_bmbt_rec_t));
+	}
+	ifp->if_u1.if_extents = NULL;
+	ifp->if_real_bytes = 0;
+	ifp->if_bytes = 0;
+}
+
+/*
+ * Return a pointer to the extent record for file system block bno.
+ */
+xfs_bmbt_rec_host_t *			/* pointer to found extent record */
+xfs_iext_bno_to_ext(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fileoff_t	bno,		/* block number to search for */
+	xfs_extnum_t	*idxp)		/* index of target extent */
+{
+	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
+	xfs_filblks_t	blockcount = 0;	/* number of blocks in extent */
+	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
+	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
+	int		high;		/* upper boundary in search */
+	xfs_extnum_t	idx = 0;	/* index of target extent */
+	int		low;		/* lower boundary in search */
+	xfs_extnum_t	nextents;	/* number of file extents */
+	xfs_fileoff_t	startoff = 0;	/* start offset of extent */
+
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*idxp = 0;
+		return NULL;
+	}
+	low = 0;
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		/* Find target extent list */
+		int	erp_idx = 0;
+		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+		base = erp->er_extbuf;
+		high = erp->er_extcount - 1;
+	} else {
+		base = ifp->if_u1.if_extents;
+		high = nextents - 1;
+	}
+	/* Binary search extent records */
+	while (low <= high) {
+		idx = (low + high) >> 1;
+		ep = base + idx;
+		startoff = xfs_bmbt_get_startoff(ep);
+		blockcount = xfs_bmbt_get_blockcount(ep);
+		if (bno < startoff) {
+			high = idx - 1;
+		} else if (bno >= startoff + blockcount) {
+			low = idx + 1;
+		} else {
+			/* Convert back to file-based extent index */
+			if (ifp->if_flags & XFS_IFEXTIREC) {
+				idx += erp->er_extoff;
+			}
+			*idxp = idx;
+			return ep;
+		}
+	}
+	/* Convert back to file-based extent index */
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		idx += erp->er_extoff;
+	}
+	if (bno >= startoff + blockcount) {
+		if (++idx == nextents) {
+			ep = NULL;
+		} else {
+			ep = xfs_iext_get_ext(ifp, idx);
+		}
+	}
+	*idxp = idx;
+	return ep;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ */
+xfs_ext_irec_t *			/* pointer to found extent record */
+xfs_iext_bno_to_irec(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fileoff_t	bno,		/* block number to search for */
+	int		*erp_idxp)	/* irec index of target ext list */
+{
+	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
+	xfs_ext_irec_t	*erp_next;	/* next indirection array entry */
+	int		erp_idx;	/* indirection array index */
+	int		nlists;		/* number of extent irec's (lists) */
+	int		high;		/* binary search upper limit */
+	int		low;		/* binary search lower limit */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp_idx = 0;
+	low = 0;
+	high = nlists - 1;
+	while (low <= high) {
+		erp_idx = (low + high) >> 1;
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+			high = erp_idx - 1;
+		} else if (erp_next && bno >=
+			   xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+			low = erp_idx + 1;
+		} else {
+			break;
+		}
+	}
+	*erp_idxp = erp_idx;
+	return erp;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp. Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
+	int		*erp_idxp,	/* pointer to target irec */
+	int		realloc)	/* new bytes were just added */
+{
+	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
+	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
+	int		erp_idx;	/* indirection array index */
+	int		nlists;		/* number of irec's (ex lists) */
+	int		high;		/* binary search upper limit */
+	int		low;		/* binary search lower limit */
+	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	ASSERT(page_idx >= 0);
+	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp_idx = 0;
+	low = 0;
+	high = nlists - 1;
+
+	/* Binary search extent irec's */
+	while (low <= high) {
+		erp_idx = (low + high) >> 1;
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		prev = erp_idx > 0 ? erp - 1 : NULL;
+		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+		     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+			high = erp_idx - 1;
+		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
+			   (page_idx == erp->er_extoff + erp->er_extcount &&
+			    !realloc)) {
+			low = erp_idx + 1;
+		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
+			   erp->er_extcount == XFS_LINEAR_EXTS) {
+			ASSERT(realloc);
+			page_idx = 0;
+			erp_idx++;
+			erp = erp_idx < nlists ? erp + 1 : NULL;
+			break;
+		} else {
+			page_idx -= erp->er_extoff;
+			break;
+		}
+	}
+	*idxp = page_idx;
+	*erp_idxp = erp_idx;
+	return erp;
+}
+
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ */
+void
+xfs_iext_irec_init(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(nextents <= XFS_LINEAR_EXTS);
+
+	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+
+	if (nextents == 0) {
+		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+	} else if (!ifp->if_real_bytes) {
+		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
+		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+	}
+	erp->er_extbuf = ifp->if_u1.if_extents;
+	erp->er_extcount = nextents;
+	erp->er_extoff = 0;
+
+	ifp->if_flags |= XFS_IFEXTIREC;
+	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+	ifp->if_u1.if_ext_irec = erp;
+
+	return;
+}
+
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx)	/* index for new irec */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+	/* Resize indirection array */
+	xfs_iext_realloc_indirect(ifp, ++nlists *
+				  sizeof(xfs_ext_irec_t));
+	/*
+	 * Move records down in the array so the
+	 * new page can use erp_idx.
+	 */
+	erp = ifp->if_u1.if_ext_irec;
+	for (i = nlists - 1; i > erp_idx; i--) {
+		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+	}
+	ASSERT(i == erp_idx);
+
+	/* Initialize new extent record */
+	erp = ifp->if_u1.if_ext_irec;
+	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+	erp[erp_idx].er_extcount = 0;
+	erp[erp_idx].er_extoff = erp_idx > 0 ?
+		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+	return (&erp[erp_idx]);
+}
+
+/*
+ * Remove a record from the indirection array.
+ */
+void
+xfs_iext_irec_remove(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx)	/* irec index to remove */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp = &ifp->if_u1.if_ext_irec[erp_idx];
+	if (erp->er_extbuf) {
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+			-erp->er_extcount);
+		kmem_free(erp->er_extbuf);
+	}
+	/* Compact extent records */
+	erp = ifp->if_u1.if_ext_irec;
+	for (i = erp_idx; i < nlists - 1; i++) {
+		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+	}
+	/*
+	 * Manually free the last extent record from the indirection
+	 * array.  A call to xfs_iext_realloc_indirect() with a size
+	 * of zero would result in a call to xfs_iext_destroy() which
+	 * would in turn call this function again, creating a nasty
+	 * infinite loop.
+	 */
+	if (--nlists) {
+		xfs_iext_realloc_indirect(ifp,
+			nlists * sizeof(xfs_ext_irec_t));
+	} else {
+		kmem_free(ifp->if_u1.if_ext_irec);
+	}
+	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+
+/*
+ * This is called to clean up large amounts of unused memory allocated
+ * by the indirection array.  Before compacting anything though, verify
+ * that the indirection array is still needed and switch back to the
+ * linear extent list (or even the inline buffer) if possible.  The
+ * compaction policy is as follows:
+ *
+ *    Full Compaction: Extents fit into a single page (or inline buffer)
+ * Partial Compaction: Extents occupy less than 50% of allocated space
+ *      No Compaction: Extents occupy at least 50% of allocated space
+ */
+void
+xfs_iext_irec_compact(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+	if (nextents == 0) {
+		xfs_iext_destroy(ifp);
+	} else if (nextents <= XFS_INLINE_EXTS) {
+		xfs_iext_indirect_to_direct(ifp);
+		xfs_iext_direct_to_inline(ifp, nextents);
+	} else if (nextents <= XFS_LINEAR_EXTS) {
+		xfs_iext_indirect_to_direct(ifp);
+	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
+		xfs_iext_irec_compact_pages(ifp);
+	}
+}
+
+/*
+ * Combine extents from neighboring extent pages.
+ */
+void
+xfs_iext_irec_compact_pages(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
+	int		erp_idx = 0;	/* indirection array index */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	while (erp_idx < nlists - 1) {
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		erp_next = erp + 1;
+		if (erp_next->er_extcount <=
+		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
+			memcpy(&erp->er_extbuf[erp->er_extcount],
+				erp_next->er_extbuf, erp_next->er_extcount *
+				sizeof(xfs_bmbt_rec_t));
+			erp->er_extcount += erp_next->er_extcount;
+			/*
+			 * Free page before removing extent record
+			 * so er_extoffs don't get modified in
+			 * xfs_iext_irec_remove.
+			 */
+			kmem_free(erp_next->er_extbuf);
+			erp_next->er_extbuf = NULL;
+			xfs_iext_irec_remove(ifp, erp_idx + 1);
+			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+		} else {
+			erp_idx++;
+		}
+	}
+}
+
+/*
+ * This is called to update the er_extoff field in the indirection
+ * array when extents have been added or removed from one of the
+ * extent lists. erp_idx contains the irec index to begin updating
+ * at and ext_diff contains the number of extents that were added
+ * or removed.
+ */
+void
+xfs_iext_irec_update_extoffs(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx,	/* irec index to update */
+	int		ext_diff)	/* number of new extents */
+{
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	for (i = erp_idx; i < nlists; i++) {
+		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
+	}
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_inode_fork.h b/kernel/fs/xfs/libxfs/xfs_inode_fork.h
new file mode 100644
index 000000000..7d3b1ed6d
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_inode_fork.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_INODE_FORK_H__
+#define	__XFS_INODE_FORK_H__
+
+struct xfs_inode_log_item;
+struct xfs_dinode;
+
+/*
+ * The following xfs_ext_irec_t struct introduces a second (top) level
+ * to the in-core extent allocation scheme. These structs are allocated
+ * in a contiguous block, creating an indirection array where each entry
+ * (irec) contains a pointer to a buffer of in-core extent records which
+ * it manages. Each extent buffer is 4k in size, since 4k is the system
+ * page size on Linux i386 and systems with larger page sizes don't seem
+ * to gain much, if anything, by using their native page size as the
+ * extent buffer size. Also, using 4k extent buffers everywhere provides
+ * a consistent interface for CXFS across different platforms.
+ *
+ * There is currently no limit on the number of irec's (extent lists)
+ * allowed, so heavily fragmented files may require an indirection array
+ * which spans multiple system pages of memory. The number of extents
+ * which would require this amount of contiguous memory is very large
+ * and should not cause problems in the foreseeable future. However,
+ * if the memory needed for the contiguous array ever becomes a problem,
+ * it is possible that a third level of indirection may be required.
+ */
+typedef struct xfs_ext_irec {
+	xfs_bmbt_rec_host_t *er_extbuf;	/* block of extent records */
+	xfs_extnum_t	er_extoff;	/* extent offset in file */
+	xfs_extnum_t	er_extcount;	/* number of extents in page/block */
+} xfs_ext_irec_t;
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+#define	XFS_IEXT_BUFSZ		4096
+#define	XFS_LINEAR_EXTS		(XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
+#define	XFS_INLINE_EXTS		2
+#define	XFS_INLINE_DATA		32
+typedef struct xfs_ifork {
+	int			if_bytes;	/* bytes in if_u1 */
+	int			if_real_bytes;	/* bytes allocated in if_u1 */
+	struct xfs_btree_block	*if_broot;	/* file's incore btree root */
+	short			if_broot_bytes;	/* bytes allocated for root */
+	unsigned char		if_flags;	/* per-fork flags */
+	union {
+		xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
+		xfs_ext_irec_t	*if_ext_irec;	/* irec map file exts */
+		char		*if_data;	/* inline file data */
+	} if_u1;
+	union {
+		xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
+						/* very small file extents */
+		char		if_inline_data[XFS_INLINE_DATA];
+						/* very small file data */
+		xfs_dev_t	if_rdev;	/* dev number if special */
+		uuid_t		if_uuid;	/* mount point value */
+	} if_u2;
+} xfs_ifork_t;
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define	XFS_IFINLINE	0x01	/* Inline data is read in */
+#define	XFS_IFEXTENTS	0x02	/* All extent pointers are read in */
+#define	XFS_IFBROOT	0x04	/* i_broot points to the bmap b-tree root */
+#define	XFS_IFEXTIREC	0x08	/* Indirection array of extent blocks */
+
+/*
+ * Fork handling.
+ */
+
+#define XFS_IFORK_Q(ip)			((ip)->i_d.di_forkoff != 0)
+#define XFS_IFORK_BOFF(ip)		((int)((ip)->i_d.di_forkoff << 3))
+
+#define XFS_IFORK_PTR(ip,w)		\
+	((w) == XFS_DATA_FORK ? \
+		&(ip)->i_df : \
+		(ip)->i_afp)
+#define XFS_IFORK_DSIZE(ip) \
+	(XFS_IFORK_Q(ip) ? \
+		XFS_IFORK_BOFF(ip) : \
+		XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+#define XFS_IFORK_ASIZE(ip) \
+	(XFS_IFORK_Q(ip) ? \
+		XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
+			XFS_IFORK_BOFF(ip) : \
+		0)
+#define XFS_IFORK_SIZE(ip,w) \
+	((w) == XFS_DATA_FORK ? \
+		XFS_IFORK_DSIZE(ip) : \
+		XFS_IFORK_ASIZE(ip))
+#define XFS_IFORK_FORMAT(ip,w) \
+	((w) == XFS_DATA_FORK ? \
+		(ip)->i_d.di_format : \
+		(ip)->i_d.di_aformat)
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+	((w) == XFS_DATA_FORK ? \
+		((ip)->i_d.di_format = (n)) : \
+		((ip)->i_d.di_aformat = (n)))
+#define XFS_IFORK_NEXTENTS(ip,w) \
+	((w) == XFS_DATA_FORK ? \
+		(ip)->i_d.di_nextents : \
+		(ip)->i_d.di_anextents)
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+	((w) == XFS_DATA_FORK ? \
+		((ip)->i_d.di_nextents = (n)) : \
+		((ip)->i_d.di_anextents = (n)))
+#define XFS_IFORK_MAXEXT(ip, w) \
+	(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+
+int		xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
+void		xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+				struct xfs_inode_log_item *, int);
+void		xfs_idestroy_fork(struct xfs_inode *, int);
+void		xfs_idata_realloc(struct xfs_inode *, int, int);
+void		xfs_iroot_realloc(struct xfs_inode *, int, int);
+int		xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int		xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
+				  int);
+
+struct xfs_bmbt_rec_host *
+		xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+void		xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
+				struct xfs_bmbt_irec *, int);
+void		xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
+void		xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
+					    xfs_extnum_t, int);
+void		xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
+void		xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
+void		xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
+void		xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
+void		xfs_iext_realloc_direct(struct xfs_ifork *, int);
+void		xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
+void		xfs_iext_inline_to_direct(struct xfs_ifork *, int);
+void		xfs_iext_destroy(struct xfs_ifork *);
+struct xfs_bmbt_rec_host *
+		xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+		xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+		xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
+				     int);
+void		xfs_iext_irec_init(struct xfs_ifork *);
+struct xfs_ext_irec *
+		xfs_iext_irec_new(struct xfs_ifork *, int);
+void		xfs_iext_irec_remove(struct xfs_ifork *, int);
+void		xfs_iext_irec_compact(struct xfs_ifork *);
+void		xfs_iext_irec_compact_pages(struct xfs_ifork *);
+void		xfs_iext_irec_compact_full(struct xfs_ifork *);
+void		xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
+
+extern struct kmem_zone	*xfs_ifork_zone;
+
+#endif	/* __XFS_INODE_FORK_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_log_format.h b/kernel/fs/xfs/libxfs/xfs_log_format.h
new file mode 100644
index 000000000..265314690
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_log_format.h
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_LOG_FORMAT_H__
+#define __XFS_LOG_FORMAT_H__
+
+struct xfs_mount;
+struct xfs_trans_res;
+
+/*
+ * On-disk Log Format definitions.
+ *
+ * This file contains all the on-disk format definitions used within the log. It
+ * includes the physical log structure itself, as well as all the log item
+ * format structures that are written into the log and intepreted by log
+ * recovery. We start with the physical log format definitions, and then work
+ * through all the log items definitions and everything they encode into the
+ * log.
+ */
+typedef __uint32_t xlog_tid_t;
+
+#define XLOG_MIN_ICLOGS		2
+#define XLOG_MAX_ICLOGS		8
+#define XLOG_HEADER_MAGIC_NUM	0xFEEDbabe	/* Invalid cycle number */
+#define XLOG_VERSION_1		1
+#define XLOG_VERSION_2		2		/* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS	(XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_MIN_RECORD_BSIZE	(16*1024)	/* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE	(32*1024)	/* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE	(256*1024)
+#define XLOG_HEADER_CYCLE_SIZE	(32*1024)	/* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT	14		/* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT	15		/* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT	18		/* 256k == 1 << 18 */
+#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
+                                 (log)->l_mp->m_sb.sb_logsunit)
+#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
+
+#define XLOG_HEADER_SIZE	512
+
+/* Minimum number of transactions that must fit in the log (defined by mkfs) */
+#define XFS_MIN_LOG_FACTOR	3
+
+#define XLOG_REC_SHIFT(log) \
+	BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+	BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/* get lsn fields */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+	return ((xfs_lsn_t)cycle << 32) | block;
+}
+
+static inline uint xlog_get_cycle(char *ptr)
+{
+	if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+		return be32_to_cpu(*((__be32 *)ptr + 1));
+	else
+		return be32_to_cpu(*(__be32 *)ptr);
+}
+
+/* Log Clients */
+#define XFS_TRANSACTION		0x69
+#define XFS_VOLUME		0x2
+#define XFS_LOG			0xaa
+
+#define XLOG_UNMOUNT_TYPE	0x556e	/* Un for Unmount */
+
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT		1
+#define XLOG_REG_TYPE_BCHUNK		2
+#define XLOG_REG_TYPE_EFI_FORMAT	3
+#define XLOG_REG_TYPE_EFD_FORMAT	4
+#define XLOG_REG_TYPE_IFORMAT		5
+#define XLOG_REG_TYPE_ICORE		6
+#define XLOG_REG_TYPE_IEXT		7
+#define XLOG_REG_TYPE_IBROOT		8
+#define XLOG_REG_TYPE_ILOCAL		9
+#define XLOG_REG_TYPE_IATTR_EXT		10
+#define XLOG_REG_TYPE_IATTR_BROOT	11
+#define XLOG_REG_TYPE_IATTR_LOCAL	12
+#define XLOG_REG_TYPE_QFORMAT		13
+#define XLOG_REG_TYPE_DQUOT		14
+#define XLOG_REG_TYPE_QUOTAOFF		15
+#define XLOG_REG_TYPE_LRHEADER		16
+#define XLOG_REG_TYPE_UNMOUNT		17
+#define XLOG_REG_TYPE_COMMIT		18
+#define XLOG_REG_TYPE_TRANSHDR		19
+#define XLOG_REG_TYPE_ICREATE		20
+#define XLOG_REG_TYPE_MAX		20
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS.  If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions.  Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS	0x01	/* Start a new transaction */
+#define XLOG_COMMIT_TRANS	0x02	/* Commit this transaction */
+#define XLOG_CONTINUE_TRANS	0x04	/* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS	0x08	/* Cont this trans into new region */
+#define XLOG_END_TRANS		0x10	/* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS	0x20	/* Unmount a filesystem transaction */
+
+
+typedef struct xlog_op_header {
+	__be32	   oh_tid;	/* transaction id of operation	:  4 b */
+	__be32	   oh_len;	/* bytes in data region		:  4 b */
+	__u8	   oh_clientid;	/* who sent me this		:  1 b */
+	__u8	   oh_flags;	/*				:  1 b */
+	__u16	   oh_res2;	/* 32 bit align			:  2 b */
+} xlog_op_header_t;
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN  0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE  3
+
+/* our fmt */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+
+typedef struct xlog_rec_header {
+	__be32	  h_magicno;	/* log record (LR) identifier		:  4 */
+	__be32	  h_cycle;	/* write cycle of log			:  4 */
+	__be32	  h_version;	/* LR version				:  4 */
+	__be32	  h_len;	/* len in bytes; should be 64-bit aligned: 4 */
+	__be64	  h_lsn;	/* lsn of this LR			:  8 */
+	__be64	  h_tail_lsn;	/* lsn of 1st LR w/ buffers not committed: 8 */
+	__le32	  h_crc;	/* crc of log record                    :  4 */
+	__be32	  h_prev_block; /* block number to previous LR		:  4 */
+	__be32	  h_num_logops;	/* number of log operations in this LR	:  4 */
+	__be32	  h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+	/* new fields */
+	__be32    h_fmt;        /* format of log record                 :  4 */
+	uuid_t	  h_fs_uuid;    /* uuid of FS                           : 16 */
+	__be32	  h_size;	/* iclog size				:  4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+	__be32	  xh_cycle;	/* write cycle of log			: 4 */
+	__be32	  xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*	: 256 */
+} xlog_rec_ext_header_t;
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+	xlog_rec_header_t	hic_header;
+	xlog_rec_ext_header_t	hic_xheader;
+	char			hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
+/* not an on-disk structure, but needed by log recovery in userspace */
+typedef struct xfs_log_iovec {
+	void		*i_addr;	/* beginning address of region */
+	int		i_len;		/* length in bytes of region */
+	uint		i_type;		/* type of region */
+} xfs_log_iovec_t;
+
+
+/*
+ * Transaction Header definitions.
+ *
+ * This is the structure written in the log at the head of every transaction. It
+ * identifies the type and id of the transaction, and contains the number of
+ * items logged by the transaction so we know how many to expect during
+ * recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+	uint		th_magic;		/* magic number */
+	uint		th_type;		/* transaction type */
+	__int32_t	th_tid;			/* transaction id (unused) */
+	uint		th_num_items;		/* num items logged by trans */
+} xfs_trans_header_t;
+
+#define	XFS_TRANS_HEADER_MAGIC	0x5452414e	/* TRAN */
+
+/*
+ * Log item types.
+ */
+#define	XFS_LI_EFI		0x1236
+#define	XFS_LI_EFD		0x1237
+#define	XFS_LI_IUNLINK		0x1238
+#define	XFS_LI_INODE		0x123b	/* aligned ino chunks, var-size ibufs */
+#define	XFS_LI_BUF		0x123c	/* v2 bufs, variable sized inode bufs */
+#define	XFS_LI_DQUOT		0x123d
+#define	XFS_LI_QUOTAOFF		0x123e
+#define	XFS_LI_ICREATE		0x123f
+
+#define XFS_LI_TYPE_DESC \
+	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
+	{ XFS_LI_EFD,		"XFS_LI_EFD" }, \
+	{ XFS_LI_IUNLINK,	"XFS_LI_IUNLINK" }, \
+	{ XFS_LI_INODE,		"XFS_LI_INODE" }, \
+	{ XFS_LI_BUF,		"XFS_LI_BUF" }, \
+	{ XFS_LI_DQUOT,		"XFS_LI_DQUOT" }, \
+	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }, \
+	{ XFS_LI_ICREATE,	"XFS_LI_ICREATE" }
+
+/*
+ * Inode Log Item Format definitions.
+ *
+ * This is the structure used to lay out an inode log item in the
+ * log.  The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field.  Changes to this structure
+ * must be added on to the end.
+ */
+typedef struct xfs_inode_log_format {
+	__uint16_t		ilf_type;	/* inode log item type */
+	__uint16_t		ilf_size;	/* size of this item */
+	__uint32_t		ilf_fields;	/* flags for fields logged */
+	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
+	__uint16_t		ilf_dsize;	/* size of data/ext/root */
+	__uint64_t		ilf_ino;	/* inode number */
+	union {
+		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
+	} ilf_u;
+	__int64_t		ilf_blkno;	/* blkno of inode buffer */
+	__int32_t		ilf_len;	/* len of inode buffer */
+	__int32_t		ilf_boffset;	/* off of inode in buffer */
+} xfs_inode_log_format_t;
+
+typedef struct xfs_inode_log_format_32 {
+	__uint16_t		ilf_type;	/* inode log item type */
+	__uint16_t		ilf_size;	/* size of this item */
+	__uint32_t		ilf_fields;	/* flags for fields logged */
+	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
+	__uint16_t		ilf_dsize;	/* size of data/ext/root */
+	__uint64_t		ilf_ino;	/* inode number */
+	union {
+		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
+	} ilf_u;
+	__int64_t		ilf_blkno;	/* blkno of inode buffer */
+	__int32_t		ilf_len;	/* len of inode buffer */
+	__int32_t		ilf_boffset;	/* off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+
+typedef struct xfs_inode_log_format_64 {
+	__uint16_t		ilf_type;	/* inode log item type */
+	__uint16_t		ilf_size;	/* size of this item */
+	__uint32_t		ilf_fields;	/* flags for fields logged */
+	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
+	__uint16_t		ilf_dsize;	/* size of data/ext/root */
+	__uint32_t		ilf_pad;	/* pad for 64 bit boundary */
+	__uint64_t		ilf_ino;	/* inode number */
+	union {
+		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
+	} ilf_u;
+	__int64_t		ilf_blkno;	/* blkno of inode buffer */
+	__int32_t		ilf_len;	/* len of inode buffer */
+	__int32_t		ilf_boffset;	/* off of inode in buffer */
+} xfs_inode_log_format_64_t;
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define	XFS_ILOG_CORE	0x001	/* log standard inode fields */
+#define	XFS_ILOG_DDATA	0x002	/* log i_df.if_data */
+#define	XFS_ILOG_DEXT	0x004	/* log i_df.if_extents */
+#define	XFS_ILOG_DBROOT	0x008	/* log i_df.i_broot */
+#define	XFS_ILOG_DEV	0x010	/* log the dev field */
+#define	XFS_ILOG_UUID	0x020	/* log the uuid field */
+#define	XFS_ILOG_ADATA	0x040	/* log i_af.if_data */
+#define	XFS_ILOG_AEXT	0x080	/* log i_af.if_extents */
+#define	XFS_ILOG_ABROOT	0x100	/* log i_af.i_broot */
+#define XFS_ILOG_DOWNER	0x200	/* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER	0x400	/* change the attr fork owner on replay */
+
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core.  Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely store in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP	0x4000
+
+#define	XFS_ILOG_NONCORE	(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+				 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+				 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+				 XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+				 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+
+#define	XFS_ILOG_DFORK		(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+				 XFS_ILOG_DBROOT)
+
+#define	XFS_ILOG_AFORK		(XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+				 XFS_ILOG_ABROOT)
+
+#define	XFS_ILOG_ALL		(XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+				 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+				 XFS_ILOG_DEV | XFS_ILOG_UUID | \
+				 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+				 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
+				 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+
+static inline int xfs_ilog_fbroot(int w)
+{
+	return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+static inline int xfs_ilog_fext(int w)
+{
+	return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+static inline int xfs_ilog_fdata(int w)
+{
+	return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
+/*
+ * Incore version of the on-disk inode core structures. We log this directly
+ * into the journal in host CPU format (for better or worse) and as such
+ * directly mirrors the xfs_dinode structure as it must contain all the same
+ * information.
+ */
+typedef struct xfs_ictimestamp {
+	__int32_t	t_sec;		/* timestamp seconds */
+	__int32_t	t_nsec;		/* timestamp nanoseconds */
+} xfs_ictimestamp_t;
+
+/*
+ * NOTE:  This structure must be kept identical to struct xfs_dinode
+ *	  except for the endianness annotations.
+ */
+typedef struct xfs_icdinode {
+	__uint16_t	di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
+	__uint16_t	di_mode;	/* mode and type of file */
+	__int8_t	di_version;	/* inode version */
+	__int8_t	di_format;	/* format of di_c data */
+	__uint16_t	di_onlink;	/* old number of links to file */
+	__uint32_t	di_uid;		/* owner's user id */
+	__uint32_t	di_gid;		/* owner's group id */
+	__uint32_t	di_nlink;	/* number of links to file */
+	__uint16_t	di_projid_lo;	/* lower part of owner's project id */
+	__uint16_t	di_projid_hi;	/* higher part of owner's project id */
+	__uint8_t	di_pad[6];	/* unused, zeroed space */
+	__uint16_t	di_flushiter;	/* incremented on flush */
+	xfs_ictimestamp_t di_atime;	/* time last accessed */
+	xfs_ictimestamp_t di_mtime;	/* time last modified */
+	xfs_ictimestamp_t di_ctime;	/* time created/inode modified */
+	xfs_fsize_t	di_size;	/* number of bytes in file */
+	xfs_rfsblock_t	di_nblocks;	/* # of direct & btree blocks used */
+	xfs_extlen_t	di_extsize;	/* basic/minimum extent size for file */
+	xfs_extnum_t	di_nextents;	/* number of extents in data fork */
+	xfs_aextnum_t	di_anextents;	/* number of extents in attribute fork*/
+	__uint8_t	di_forkoff;	/* attr fork offs, <<3 for 64b align */
+	__int8_t	di_aformat;	/* format of attr fork's data */
+	__uint32_t	di_dmevmask;	/* DMIG event mask */
+	__uint16_t	di_dmstate;	/* DMIG state info */
+	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
+	__uint32_t	di_gen;		/* generation number */
+
+	/* di_next_unlinked is the only non-core field in the old dinode */
+	xfs_agino_t	di_next_unlinked;/* agi unlinked list ptr */
+
+	/* start of the extended dinode, writable fields */
+	__uint32_t	di_crc;		/* CRC of the inode */
+	__uint64_t	di_changecount;	/* number of attribute changes */
+	xfs_lsn_t	di_lsn;		/* flush sequence */
+	__uint64_t	di_flags2;	/* more random flags */
+	__uint8_t	di_pad2[16];	/* more padding for future expansion */
+
+	/* fields only written to during inode creation */
+	xfs_ictimestamp_t di_crtime;	/* time created */
+	xfs_ino_t	di_ino;		/* inode number */
+	uuid_t		di_uuid;	/* UUID of the filesystem */
+
+	/* structure must be padded to 64 bit alignment */
+} xfs_icdinode_t;
+
+static inline uint xfs_icdinode_size(int version)
+{
+	if (version == 3)
+		return sizeof(struct xfs_icdinode);
+	return offsetof(struct xfs_icdinode, di_next_unlinked);
+}
+
+/*
+ * Buffer Log Format defintions
+ *
+ * These are the physical dirty bitmap defintions for the log format structure.
+ */
+#define	XFS_BLF_CHUNK		128
+#define	XFS_BLF_SHIFT		7
+#define	BIT_TO_WORD_SHIFT	5
+#define	NBWORD			(NBBY * sizeof(unsigned int))
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define	XFS_BLF_INODE_BUF	(1<<0)
+
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define	XFS_BLF_CANCEL		(1<<1)
+
+/*
+ * This flag indicates that the buffer contains on disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define	XFS_BLF_UDQUOT_BUF	(1<<2)
+#define XFS_BLF_PDQUOT_BUF	(1<<3)
+#define	XFS_BLF_GDQUOT_BUF	(1<<4)
+
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log.  The data map describes which 128 byte chunks of the buffer
+ * have been logged.
+ */
+#define XFS_BLF_DATAMAP_SIZE	((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+
+typedef struct xfs_buf_log_format {
+	unsigned short	blf_type;	/* buf log item type indicator */
+	unsigned short	blf_size;	/* size of this item */
+	ushort		blf_flags;	/* misc state */
+	ushort		blf_len;	/* number of blocks in this buf */
+	__int64_t	blf_blkno;	/* starting blkno of this buf */
+	unsigned int	blf_map_size;	/* used size of data bitmap in words */
+	unsigned int	blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
+
+/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS	5
+#define XFS_BLFT_SHIFT	11
+#define XFS_BLFT_MASK	(((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+	XFS_BLFT_UNKNOWN_BUF = 0,
+	XFS_BLFT_UDQUOT_BUF,
+	XFS_BLFT_PDQUOT_BUF,
+	XFS_BLFT_GDQUOT_BUF,
+	XFS_BLFT_BTREE_BUF,
+	XFS_BLFT_AGF_BUF,
+	XFS_BLFT_AGFL_BUF,
+	XFS_BLFT_AGI_BUF,
+	XFS_BLFT_DINO_BUF,
+	XFS_BLFT_SYMLINK_BUF,
+	XFS_BLFT_DIR_BLOCK_BUF,
+	XFS_BLFT_DIR_DATA_BUF,
+	XFS_BLFT_DIR_FREE_BUF,
+	XFS_BLFT_DIR_LEAF1_BUF,
+	XFS_BLFT_DIR_LEAFN_BUF,
+	XFS_BLFT_DA_NODE_BUF,
+	XFS_BLFT_ATTR_LEAF_BUF,
+	XFS_BLFT_ATTR_RMT_BUF,
+	XFS_BLFT_SB_BUF,
+	XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+	ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+	blf->blf_flags &= ~XFS_BLFT_MASK;
+	blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline __uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+	return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
+
+/*
+ * EFI/EFD log format definitions
+ */
+typedef struct xfs_extent {
+	xfs_fsblock_t	ext_start;
+	xfs_extlen_t	ext_len;
+} xfs_extent_t;
+
+/*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+typedef struct xfs_extent_32 {
+	__uint64_t	ext_start;
+	__uint32_t	ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+	__uint64_t	ext_start;
+	__uint32_t	ext_len;
+	__uint32_t	ext_pad;
+} xfs_extent_64_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log.  The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+	__uint16_t		efi_type;	/* efi log item type */
+	__uint16_t		efi_size;	/* size of this item */
+	__uint32_t		efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_t		efi_extents[1];	/* array of extents to free */
+} xfs_efi_log_format_t;
+
+typedef struct xfs_efi_log_format_32 {
+	__uint16_t		efi_type;	/* efi log item type */
+	__uint16_t		efi_size;	/* size of this item */
+	__uint32_t		efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_32_t		efi_extents[1];	/* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+typedef struct xfs_efi_log_format_64 {
+	__uint16_t		efi_type;	/* efi log item type */
+	__uint16_t		efi_size;	/* size of this item */
+	__uint32_t		efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_64_t		efi_extents[1];	/* array of extents to free */
+} xfs_efi_log_format_64_t;
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log.  The efd_extents array is a variable size array whose
+ * size is given by efd_nextents;
+ */
+typedef struct xfs_efd_log_format {
+	__uint16_t		efd_type;	/* efd log item type */
+	__uint16_t		efd_size;	/* size of this item */
+	__uint32_t		efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_t		efd_extents[1];	/* array of extents freed */
+} xfs_efd_log_format_t;
+
+typedef struct xfs_efd_log_format_32 {
+	__uint16_t		efd_type;	/* efd log item type */
+	__uint16_t		efd_size;	/* size of this item */
+	__uint32_t		efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_32_t		efd_extents[1];	/* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+typedef struct xfs_efd_log_format_64 {
+	__uint16_t		efd_type;	/* efd log item type */
+	__uint16_t		efd_size;	/* size of this item */
+	__uint32_t		efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_64_t		efd_extents[1];	/* array of extents freed */
+} xfs_efd_log_format_64_t;
+
+/*
+ * Dquot Log format definitions.
+ *
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+	__uint16_t		qlf_type;      /* dquot log item type */
+	__uint16_t		qlf_size;      /* size of this item */
+	xfs_dqid_t		qlf_id;	       /* usr/grp/proj id : 32 bits */
+	__int64_t		qlf_blkno;     /* blkno of dquot buffer */
+	__int32_t		qlf_len;       /* len of dquot buffer */
+	__uint32_t		qlf_boffset;   /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+	unsigned short		qf_type;	/* quotaoff log item type */
+	unsigned short		qf_size;	/* size of this item */
+	unsigned int		qf_flags;	/* USR and/or GRP */
+	char			qf_pad[12];	/* padding for future */
+} xfs_qoff_logformat_t;
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT	0x0001  /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD	0x0002  /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD	0x0004  /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT	0x0008  /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD	0x0010  /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD	0x0020  /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT	0x0040  /* group quota accounting ON */
+
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD	0x0080  /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD	0x0100  /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD	0x0200  /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD	0x0400  /* quotacheck run on project quotas */
+
+#define XFS_ALL_QUOTA_ACCT	\
+		(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD	\
+		(XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD	\
+		(XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_ALL	(XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+				 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+				 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+				 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+				 XFS_PQUOTA_CHKD)
+
+/*
+ * Inode create log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+	__uint16_t	icl_type;	/* type of log format structure */
+	__uint16_t	icl_size;	/* size of log format structure */
+	__be32		icl_ag;		/* ag being allocated in */
+	__be32		icl_agbno;	/* start block of inode range */
+	__be32		icl_count;	/* number of inodes to initialise */
+	__be32		icl_isize;	/* size of inodes */
+	__be32		icl_length;	/* length of extent to initialise */
+	__be32		icl_gen;	/* inode generation number to use */
+};
+
+#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_log_recover.h b/kernel/fs/xfs/libxfs/xfs_log_recover.h
new file mode 100644
index 000000000..1c55ccbb3
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_log_recover.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_LOG_RECOVER_H__
+#define __XFS_LOG_RECOVER_H__
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_RHASH_BITS  4
+#define XLOG_RHASH_SIZE	16
+#define XLOG_RHASH_SHIFT 2
+#define XLOG_RHASH(tid)	\
+	((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
+
+#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
+
+
+/*
+ * item headers are in ri_buf[0].  Additional buffers follow.
+ */
+typedef struct xlog_recover_item {
+	struct list_head	ri_list;
+	int			ri_type;
+	int			ri_cnt;	/* count of regions found */
+	int			ri_total;	/* total regions */
+	xfs_log_iovec_t		*ri_buf;	/* ptr to regions buffer */
+} xlog_recover_item_t;
+
+struct xlog_tid;
+typedef struct xlog_recover {
+	struct hlist_node	r_list;
+	xlog_tid_t		r_log_tid;	/* log's transaction id */
+	xfs_trans_header_t	r_theader;	/* trans header for partial */
+	int			r_state;	/* not needed */
+	xfs_lsn_t		r_lsn;		/* xact lsn */
+	struct list_head	r_itemq;	/* q for items */
+} xlog_recover_t;
+
+#define ITEM_TYPE(i)	(*(ushort *)(i)->ri_buf[0].i_addr)
+
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define	XLOG_BC_TABLE_SIZE	64
+
+#define	XLOG_RECOVER_PASS1	1
+#define	XLOG_RECOVER_PASS2	2
+
+#endif	/* __XFS_LOG_RECOVER_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_log_rlimit.c b/kernel/fs/xfs/libxfs/xfs_log_rlimit.c
new file mode 100644
index 000000000..c10597973
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_space.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_bmap_btree.h"
+
+/*
+ * Calculate the maximum length in bytes that would be required for a local
+ * attribute value as large attributes out of line are not logged.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+	struct xfs_mount	*mp)
+{
+	int			size;
+	int			nblks;
+
+	size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
+	       MAXNAMELEN - 1;
+	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+	nblks += XFS_B_TO_FSB(mp, size);
+	nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
+
+	return  M_RES(mp)->tr_attrsetm.tr_logres +
+		M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
+}
+
+/*
+ * Iterate over the log space reservation table to figure out and return
+ * the maximum one in terms of the pre-calculated values which were done
+ * at mount time.
+ */
+STATIC void
+xfs_log_get_max_trans_res(
+	struct xfs_mount	*mp,
+	struct xfs_trans_res	*max_resp)
+{
+	struct xfs_trans_res	*resp;
+	struct xfs_trans_res	*end_resp;
+	int			log_space = 0;
+	int			attr_space;
+
+	attr_space = xfs_log_calc_max_attrsetm_res(mp);
+
+	resp = (struct xfs_trans_res *)M_RES(mp);
+	end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
+	for (; resp < end_resp; resp++) {
+		int		tmp = resp->tr_logcount > 1 ?
+				      resp->tr_logres * resp->tr_logcount :
+				      resp->tr_logres;
+		if (log_space < tmp) {
+			log_space = tmp;
+			*max_resp = *resp;		/* struct copy */
+		}
+	}
+
+	if (attr_space > log_space) {
+		*max_resp = M_RES(mp)->tr_attrsetm;	/* struct copy */
+		max_resp->tr_logres = attr_space;
+	}
+}
+
+/*
+ * Calculate the minimum valid log size for the given superblock configuration.
+ * Used to calculate the minimum log size at mkfs time, and to determine if
+ * the log is large enough or not at mount time. Returns the minimum size in
+ * filesystem block size units.
+ */
+int
+xfs_log_calc_minimum_size(
+	struct xfs_mount	*mp)
+{
+	struct xfs_trans_res	tres = {0};
+	int			max_logres;
+	int			min_logblks = 0;
+	int			lsunit = 0;
+
+	xfs_log_get_max_trans_res(mp, &tres);
+
+	max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+	if (tres.tr_logcount > 1)
+		max_logres *= tres.tr_logcount;
+
+	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+		lsunit = BTOBB(mp->m_sb.sb_logsunit);
+
+	/*
+	 * Two factors should be taken into account for calculating the minimum
+	 * log space.
+	 * 1) The fundamental limitation is that no single transaction can be
+	 *    larger than half size of the log.
+	 *
+	 *    From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
+	 *    define, which is set to 3. That means we can definitely fit
+	 *    maximally sized 2 transactions in the log. We'll use this same
+	 *    value here.
+	 *
+	 * 2) If the lsunit option is specified, a transaction requires 2 LSU
+	 *    for the reservation because there are two log writes that can
+	 *    require padding - the transaction data and the commit record which
+	 *    are written separately and both can require padding to the LSU.
+	 *    Consider that we can have an active CIL reservation holding 2*LSU,
+	 *    but the CIL is not over a push threshold, in this case, if we
+	 *    don't have enough log space for at one new transaction, which
+	 *    includes another 2*LSU in the reservation, we will run into dead
+	 *    loop situation in log space grant procedure. i.e.
+	 *    xlog_grant_head_wait().
+	 *
+	 *    Hence the log size needs to be able to contain two maximally sized
+	 *    and padded transactions, which is (2 * (2 * LSU + maxlres)).
+	 *
+	 * Also, the log size should be a multiple of the log stripe unit, round
+	 * it up to lsunit boundary if lsunit is specified.
+	 */
+	if (lsunit) {
+		min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+			      2 * lsunit;
+	} else
+		min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+	min_logblks *= XFS_MIN_LOG_FACTOR;
+
+	return XFS_BB_TO_FSB(mp, min_logblks);
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_quota_defs.h b/kernel/fs/xfs/libxfs/xfs_quota_defs.h
new file mode 100644
index 000000000..1b0a08379
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_quota_defs.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QUOTA_DEFS_H__
+#define __XFS_QUOTA_DEFS_H__
+
+/*
+ * Quota definitions shared between user and kernel source trees.
+ */
+
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef __uint64_t	xfs_qcnt_t;
+typedef __uint16_t	xfs_qwarncnt_t;
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER		0x0001		/* a user quota */
+#define XFS_DQ_PROJ		0x0002		/* project quota */
+#define XFS_DQ_GROUP		0x0004		/* a group quota */
+#define XFS_DQ_DIRTY		0x0008		/* dquot is dirty */
+#define XFS_DQ_FREEING		0x0010		/* dquot is beeing torn down */
+
+#define XFS_DQ_ALLTYPES		(XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+
+#define XFS_DQ_FLAGS \
+	{ XFS_DQ_USER,		"USER" }, \
+	{ XFS_DQ_PROJ,		"PROJ" }, \
+	{ XFS_DQ_GROUP,		"GROUP" }, \
+	{ XFS_DQ_DIRTY,		"DIRTY" }, \
+	{ XFS_DQ_FREEING,	"FREEING" }
+
+/*
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chmod operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
+ */
+#define XFS_DQUOT_LOGRES(mp)	\
+	((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
+
+#define XFS_IS_QUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_PQUOTA_ENFD)
+
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE	0x1000  /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE	0x2000  /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE	0x4000  /* pquotas are being turned off */
+#define XFS_ALL_QUOTA_ACTIVE	\
+	(XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
+
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will be not be switched off as long as that inode lock is held.
+ */
+#define XFS_IS_QUOTA_ON(mp)	((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
+						   XFS_GQUOTA_ACTIVE | \
+						   XFS_PQUOTA_ACTIVE))
+#define XFS_IS_UQUOTA_ON(mp)	((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp)	((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+#define XFS_IS_PQUOTA_ON(mp)	((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_DQALLOC	0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA	0x0000004 /* user dquot requested */
+#define XFS_QMOPT_PQUOTA	0x0000008 /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES	0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION	0x0000040 /* change superblock version num */
+#define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
+#define XFS_QMOPT_DQREPAIR	0x0001000 /* repair dquot if damaged */
+#define XFS_QMOPT_GQUOTA	0x0002000 /* group dquot requested */
+#define XFS_QMOPT_ENOSPC	0x0004000 /* enospc instead of edquot (prj) */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS	0x0010000
+#define XFS_QMOPT_RES_RTBLKS	0x0020000
+#define XFS_QMOPT_BCOUNT	0x0040000
+#define XFS_QMOPT_ICOUNT	0x0080000
+#define XFS_QMOPT_RTBCOUNT	0x0100000
+#define XFS_QMOPT_DELBCOUNT	0x0200000
+#define XFS_QMOPT_DELRTBCOUNT	0x0400000
+#define XFS_QMOPT_RES_INOS	0x0800000
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT	0x1000000
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS	XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS	XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS	XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT	XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT	XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT	XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT	XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL	\
+		(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK	(XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
+		       xfs_dqid_t id, uint type, uint flags, char *str);
+extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
+
+#endif	/* __XFS_QUOTA_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_rtbitmap.c b/kernel/fs/xfs/libxfs/xfs_rtbitmap.c
new file mode 100644
index 000000000..9b59ffa1f
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_rtalloc.h"
+
+
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+int
+xfs_rtbuf_get(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	block,		/* block number in bitmap or summary */
+	int		issum,		/* is summary not bitmap */
+	xfs_buf_t	**bpp)		/* output: buffer for the block */
+{
+	xfs_buf_t	*bp;		/* block buffer, result */
+	xfs_inode_t	*ip;		/* bitmap or summary inode */
+	xfs_bmbt_irec_t	map;
+	int		nmap = 1;
+	int		error;		/* error value */
+
+	ip = issum ? mp->m_rsumip : mp->m_rbmip;
+
+	error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	ASSERT(map.br_startblock != NULLFSBLOCK);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
+				   mp->m_bsize, 0, &bp, NULL);
+	if (error)
+		return error;
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_back(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	firstbit;	/* first useful bit in the word */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = start - limit + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit < XFS_NBWORD - 1) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+		mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+			firstbit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = bit - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i = bit - firstbit + 1;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = bp->b_addr;
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the previous one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = bp->b_addr;
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if (len - i) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_NBWORD - (len - i);
+		mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start - i + 1;
+	return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_forw(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in the word */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = limit - start + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit) {
+		/*
+		 * Calculate last (rightmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Calculate mask for all the relevant bits in this word.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start + i - 1;
+	return 0;
+}
+
+/*
+ * Read and/or modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ *
+ * Summary information is returned in *sum if specified.
+ * If no delta is specified, returns summary only.
+ */
+int
+xfs_rtmodify_summary_int(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
+{
+	xfs_buf_t	*bp;		/* buffer for the summary block */
+	int		error;		/* error value */
+	xfs_fsblock_t	sb;		/* summary fsblock */
+	int		so;		/* index into the summary file */
+	xfs_suminfo_t	*sp;		/* pointer to returned data */
+
+	/*
+	 * Compute entry number in the summary file.
+	 */
+	so = XFS_SUMOFFS(mp, log, bbno);
+	/*
+	 * Compute the block number in the summary file.
+	 */
+	sb = XFS_SUMOFFSTOBLOCK(mp, so);
+	/*
+	 * If we have an old buffer, and the block number matches, use that.
+	 */
+	if (*rbpp && *rsb == sb)
+		bp = *rbpp;
+	/*
+	 * Otherwise we have to get the buffer.
+	 */
+	else {
+		/*
+		 * If there was an old one, get rid of it first.
+		 */
+		if (*rbpp)
+			xfs_trans_brelse(tp, *rbpp);
+		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+		if (error) {
+			return error;
+		}
+		/*
+		 * Remember this buffer and block for the next call.
+		 */
+		*rbpp = bp;
+		*rsb = sb;
+	}
+	/*
+	 * Point to the summary information, modify/log it, and/or copy it out.
+	 */
+	sp = XFS_SUMPTR(mp, bp, so);
+	if (delta) {
+		uint first = (uint)((char *)sp - (char *)bp->b_addr);
+
+		*sp += delta;
+		xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+	}
+	if (sum)
+		*sum = *sp;
+	return 0;
+}
+
+int
+xfs_rtmodify_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	return xfs_rtmodify_summary_int(mp, tp, log, bbno,
+					delta, rbpp, rsb, NULL);
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to modify */
+	xfs_extlen_t	len,		/* length of extent to modify */
+	int		val)		/* 1 for free, 0 for allocated */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtword_t	*first;		/* first used word in the buffer */
+	int		i;		/* current bit number rel. to start */
+	int		lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask o frelevant bits for value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block, and point to its data.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	first = b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zeroes; 1 (free) => all ones.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not changed and mask of relevant bits.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		i = lastbit - bit;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Set the word value correctly.
+		 */
+		*b = val;
+		i += XFS_NBWORD;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Compute a mask of relevant bits.
+		 */
+		bit = 0;
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		b++;
+	}
+	/*
+	 * Log any remaining changed bytes.
+	 */
+	if (b > first)
+		xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+			(uint)((char *)b - (char *)bufp - 1));
+	return 0;
+}
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to free */
+	xfs_extlen_t	len,		/* length to free */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the freed extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock;	/* first block freed > end */
+	xfs_rtblock_t	preblock;	/* first block freed < start */
+
+	end = start + len - 1;
+	/*
+	 * Modify the bitmap to mark this extent freed.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 1);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Assume we're freeing out of the middle of an allocated extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of allocated extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error)
+		return error;
+	/*
+	 * If there are blocks not being freed at the front of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being freed at the end of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Increment the summary information corresponding to the entire
+	 * (new) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+	return error;
+}
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		val,		/* 1 for free, 0 for allocated */
+	xfs_rtblock_t	*new,		/* out: first block not matching */
+	int		*stat)		/* out: 1 for matches, 0 for not */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zero's; 1 (free) => all one's.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not examined.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ val)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * Successful, return.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*new = start + i;
+	*stat = 1;
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int				/* error */
+xfs_rtcheck_alloc_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number of extent */
+	xfs_extlen_t	len)		/* length of extent */
+{
+	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
+	int		stat;
+	int		error;
+
+	error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+	if (error)
+		return error;
+	ASSERT(stat);
+	return 0;
+}
+#else
+#define xfs_rtcheck_alloc_range(m,t,b,l)	(0)
+#endif
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int					/* error */
+xfs_rtfree_extent(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to free */
+	xfs_extlen_t	len)		/* length of extent freed */
+{
+	int		error;		/* error value */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	xfs_fsblock_t	sb;		/* summary file block number */
+	xfs_buf_t	*sumbp = NULL;	/* summary file block buffer */
+
+	mp = tp->t_mountp;
+
+	ASSERT(mp->m_rbmip->i_itemp != NULL);
+	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+	error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+	if (error)
+		return error;
+
+	/*
+	 * Free the range of realtime blocks.
+	 */
+	error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Mark more blocks free in the superblock.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+	/*
+	 * If we've now freed all the blocks, reset the file sequence
+	 * number to 0.
+	 */
+	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+	    mp->m_sb.sb_rextents) {
+		if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+			mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+	}
+	return 0;
+}
+
diff --git a/kernel/fs/xfs/libxfs/xfs_sb.c b/kernel/fs/xfs/libxfs/xfs_sb.c
new file mode 100644
index 000000000..dc4bfc5d8
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_sb.c
@@ -0,0 +1,803 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+
+/*
+ * Physical superblock buffer manipulations. Shared with libxfs in userspace.
+ */
+
+/*
+ * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
+ */
+struct xfs_perag *
+xfs_perag_get(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_perag	*pag;
+	int			ref = 0;
+
+	rcu_read_lock();
+	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	if (pag) {
+		ASSERT(atomic_read(&pag->pag_ref) >= 0);
+		ref = atomic_inc_return(&pag->pag_ref);
+	}
+	rcu_read_unlock();
+	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+	return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		first,
+	int			tag)
+{
+	struct xfs_perag	*pag;
+	int			found;
+	int			ref;
+
+	rcu_read_lock();
+	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+					(void **)&pag, first, 1, tag);
+	if (found <= 0) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	ref = atomic_inc_return(&pag->pag_ref);
+	rcu_read_unlock();
+	trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+	return pag;
+}
+
+void
+xfs_perag_put(
+	struct xfs_perag	*pag)
+{
+	int	ref;
+
+	ASSERT(atomic_read(&pag->pag_ref) > 0);
+	ref = atomic_dec_return(&pag->pag_ref);
+	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * Check the validity of the SB found.
+ */
+STATIC int
+xfs_mount_validate_sb(
+	xfs_mount_t	*mp,
+	xfs_sb_t	*sbp,
+	bool		check_inprogress,
+	bool		check_version)
+{
+	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+		xfs_warn(mp, "bad magic number");
+		return -EWRONGFS;
+	}
+
+
+	if (!xfs_sb_good_version(sbp)) {
+		xfs_warn(mp, "bad version");
+		return -EWRONGFS;
+	}
+
+	/*
+	 * Version 5 superblock feature mask validation. Reject combinations the
+	 * kernel cannot support up front before checking anything else. For
+	 * write validation, we don't need to check feature masks.
+	 */
+	if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+		if (xfs_sb_has_compat_feature(sbp,
+					XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+			xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.\n"
+"Using a more recent kernel is recommended.",
+				(sbp->sb_features_compat &
+						XFS_SB_FEAT_COMPAT_UNKNOWN));
+		}
+
+		if (xfs_sb_has_ro_compat_feature(sbp,
+					XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+			xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+				(sbp->sb_features_ro_compat &
+						XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+			if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+				xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.\n"
+"Filesystem can only be safely mounted read only.");
+				return -EINVAL;
+			}
+		}
+		if (xfs_sb_has_incompat_feature(sbp,
+					XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+			xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.\n"
+"Filesystem can not be safely mounted by this kernel.",
+				(sbp->sb_features_incompat &
+						XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+			return -EINVAL;
+		}
+	}
+
+	if (xfs_sb_version_has_pquotino(sbp)) {
+		if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+			xfs_notice(mp,
+			   "Version 5 of Super block has XFS_OQUOTA bits.");
+			return -EFSCORRUPTED;
+		}
+	} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+				XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+			xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
+			return -EFSCORRUPTED;
+	}
+
+	if (unlikely(
+	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+		xfs_warn(mp,
+		"filesystem is marked as having an external log; "
+		"specify logdev on the mount command line.");
+		return -EINVAL;
+	}
+
+	if (unlikely(
+	    sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+		xfs_warn(mp,
+		"filesystem is marked as having an internal log; "
+		"do not specify logdev on the mount command line.");
+		return -EINVAL;
+	}
+
+	/*
+	 * More sanity checking.  Most of these were stolen directly from
+	 * xfs_repair.
+	 */
+	if (unlikely(
+	    sbp->sb_agcount <= 0					||
+	    sbp->sb_sectsize < XFS_MIN_SECTORSIZE			||
+	    sbp->sb_sectsize > XFS_MAX_SECTORSIZE			||
+	    sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG			||
+	    sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG			||
+	    sbp->sb_sectsize != (1 << sbp->sb_sectlog)			||
+	    sbp->sb_blocksize < XFS_MIN_BLOCKSIZE			||
+	    sbp->sb_blocksize > XFS_MAX_BLOCKSIZE			||
+	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
+	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
+	    sbp->sb_blocksize != (1 << sbp->sb_blocklog)		||
+	    sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG			||
+	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
+	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
+	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
+	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
+	    sbp->sb_inodesize != (1 << sbp->sb_inodelog)		||
+	    sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE			||
+	    sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
+	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
+	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
+	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
+	    (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)	||
+	    sbp->sb_dblocks == 0					||
+	    sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)			||
+	    sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp)			||
+	    sbp->sb_shared_vn != 0)) {
+		xfs_notice(mp, "SB sanity check failed");
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Until this is fixed only page-sized or smaller data blocks work.
+	 */
+	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+		xfs_warn(mp,
+		"File system with blocksize %d bytes. "
+		"Only pagesize (%ld) or less will currently work.",
+				sbp->sb_blocksize, PAGE_SIZE);
+		return -ENOSYS;
+	}
+
+	/*
+	 * Currently only very few inode sizes are supported.
+	 */
+	switch (sbp->sb_inodesize) {
+	case 256:
+	case 512:
+	case 1024:
+	case 2048:
+		break;
+	default:
+		xfs_warn(mp, "inode size of %d bytes not supported",
+				sbp->sb_inodesize);
+		return -ENOSYS;
+	}
+
+	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
+	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
+		xfs_warn(mp,
+		"file system too large to be mounted on this system.");
+		return -EFBIG;
+	}
+
+	if (check_inprogress && sbp->sb_inprogress) {
+		xfs_warn(mp, "Offline file system operation in progress!");
+		return -EFSCORRUPTED;
+	}
+	return 0;
+}
+
+void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+	/*
+	 * older mkfs doesn't initialize quota inodes to NULLFSINO. This
+	 * leads to in-core values having two different values for a quota
+	 * inode to be invalid: 0 and NULLFSINO. Change it to a single value
+	 * NULLFSINO.
+	 *
+	 * Note that this change affect only the in-core values. These
+	 * values are not written back to disk unless any quota information
+	 * is written to the disk. Even in that case, sb_pquotino field is
+	 * not written to disk unless the superblock supports pquotino.
+	 */
+	if (sbp->sb_uquotino == 0)
+		sbp->sb_uquotino = NULLFSINO;
+	if (sbp->sb_gquotino == 0)
+		sbp->sb_gquotino = NULLFSINO;
+	if (sbp->sb_pquotino == 0)
+		sbp->sb_pquotino = NULLFSINO;
+
+	/*
+	 * We need to do these manipilations only if we are working
+	 * with an older version of on-disk superblock.
+	 */
+	if (xfs_sb_version_has_pquotino(sbp))
+		return;
+
+	if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+		sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+					XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+	if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+		sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+					XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+	sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+
+	if (sbp->sb_qflags & XFS_PQUOTA_ACCT)  {
+		/*
+		 * In older version of superblock, on-disk superblock only
+		 * has sb_gquotino, and in-core superblock has both sb_gquotino
+		 * and sb_pquotino. But, only one of them is supported at any
+		 * point of time. So, if PQUOTA is set in disk superblock,
+		 * copy over sb_gquotino to sb_pquotino.
+		 */
+		sbp->sb_pquotino = sbp->sb_gquotino;
+		sbp->sb_gquotino = NULLFSINO;
+	}
+}
+
+static void
+__xfs_sb_from_disk(
+	struct xfs_sb	*to,
+	xfs_dsb_t	*from,
+	bool		convert_xquota)
+{
+	to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+	to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+	to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+	to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+	to->sb_rextents = be64_to_cpu(from->sb_rextents);
+	memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+	to->sb_logstart = be64_to_cpu(from->sb_logstart);
+	to->sb_rootino = be64_to_cpu(from->sb_rootino);
+	to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+	to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+	to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+	to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+	to->sb_agcount = be32_to_cpu(from->sb_agcount);
+	to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+	to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+	to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+	to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+	to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+	to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+	memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+	to->sb_blocklog = from->sb_blocklog;
+	to->sb_sectlog = from->sb_sectlog;
+	to->sb_inodelog = from->sb_inodelog;
+	to->sb_inopblog = from->sb_inopblog;
+	to->sb_agblklog = from->sb_agblklog;
+	to->sb_rextslog = from->sb_rextslog;
+	to->sb_inprogress = from->sb_inprogress;
+	to->sb_imax_pct = from->sb_imax_pct;
+	to->sb_icount = be64_to_cpu(from->sb_icount);
+	to->sb_ifree = be64_to_cpu(from->sb_ifree);
+	to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+	to->sb_frextents = be64_to_cpu(from->sb_frextents);
+	to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+	to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+	to->sb_qflags = be16_to_cpu(from->sb_qflags);
+	to->sb_flags = from->sb_flags;
+	to->sb_shared_vn = from->sb_shared_vn;
+	to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+	to->sb_unit = be32_to_cpu(from->sb_unit);
+	to->sb_width = be32_to_cpu(from->sb_width);
+	to->sb_dirblklog = from->sb_dirblklog;
+	to->sb_logsectlog = from->sb_logsectlog;
+	to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+	to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+	to->sb_features2 = be32_to_cpu(from->sb_features2);
+	to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+	to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+	to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+	to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+	to->sb_features_log_incompat =
+				be32_to_cpu(from->sb_features_log_incompat);
+	/* crc is only used on disk, not in memory; just init to 0 here. */
+	to->sb_crc = 0;
+	to->sb_pad = 0;
+	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+	to->sb_lsn = be64_to_cpu(from->sb_lsn);
+	/* Convert on-disk flags to in-memory flags? */
+	if (convert_xquota)
+		xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+	struct xfs_sb	*to,
+	xfs_dsb_t	*from)
+{
+	__xfs_sb_from_disk(to, from, true);
+}
+
+static void
+xfs_sb_quota_to_disk(
+	struct xfs_dsb	*to,
+	struct xfs_sb	*from)
+{
+	__uint16_t	qflags = from->sb_qflags;
+
+	to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
+	if (xfs_sb_version_has_pquotino(from)) {
+		to->sb_qflags = cpu_to_be16(from->sb_qflags);
+		to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+		to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
+		return;
+	}
+
+	/*
+	 * The in-core version of sb_qflags do not have XFS_OQUOTA_*
+	 * flags, whereas the on-disk version does.  So, convert incore
+	 * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
+	 */
+	qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+			XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+	if (from->sb_qflags &
+			(XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+		qflags |= XFS_OQUOTA_ENFD;
+	if (from->sb_qflags &
+			(XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+		qflags |= XFS_OQUOTA_CHKD;
+	to->sb_qflags = cpu_to_be16(qflags);
+
+	/*
+	 * GQUOTINO and PQUOTINO cannot be used together in versions
+	 * of superblock that do not have pquotino. from->sb_flags
+	 * tells us which quota is active and should be copied to
+	 * disk. If neither are active, we should NULL the inode.
+	 *
+	 * In all cases, the separate pquotino must remain 0 because it
+	 * it beyond the "end" of the valid non-pquotino superblock.
+	 */
+	if (from->sb_qflags & XFS_GQUOTA_ACCT)
+		to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+	else if (from->sb_qflags & XFS_PQUOTA_ACCT)
+		to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+	else {
+		/*
+		 * We can't rely on just the fields being logged to tell us
+		 * that it is safe to write NULLFSINO - we should only do that
+		 * if quotas are not actually enabled. Hence only write
+		 * NULLFSINO if both in-core quota inodes are NULL.
+		 */
+		if (from->sb_gquotino == NULLFSINO &&
+		    from->sb_pquotino == NULLFSINO)
+			to->sb_gquotino = cpu_to_be64(NULLFSINO);
+	}
+
+	to->sb_pquotino = 0;
+}
+
+void
+xfs_sb_to_disk(
+	struct xfs_dsb	*to,
+	struct xfs_sb	*from)
+{
+	xfs_sb_quota_to_disk(to, from);
+
+	to->sb_magicnum = cpu_to_be32(from->sb_magicnum);
+	to->sb_blocksize = cpu_to_be32(from->sb_blocksize);
+	to->sb_dblocks = cpu_to_be64(from->sb_dblocks);
+	to->sb_rblocks = cpu_to_be64(from->sb_rblocks);
+	to->sb_rextents = cpu_to_be64(from->sb_rextents);
+	memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+	to->sb_logstart = cpu_to_be64(from->sb_logstart);
+	to->sb_rootino = cpu_to_be64(from->sb_rootino);
+	to->sb_rbmino = cpu_to_be64(from->sb_rbmino);
+	to->sb_rsumino = cpu_to_be64(from->sb_rsumino);
+	to->sb_rextsize = cpu_to_be32(from->sb_rextsize);
+	to->sb_agblocks = cpu_to_be32(from->sb_agblocks);
+	to->sb_agcount = cpu_to_be32(from->sb_agcount);
+	to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks);
+	to->sb_logblocks = cpu_to_be32(from->sb_logblocks);
+	to->sb_versionnum = cpu_to_be16(from->sb_versionnum);
+	to->sb_sectsize = cpu_to_be16(from->sb_sectsize);
+	to->sb_inodesize = cpu_to_be16(from->sb_inodesize);
+	to->sb_inopblock = cpu_to_be16(from->sb_inopblock);
+	memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+	to->sb_blocklog = from->sb_blocklog;
+	to->sb_sectlog = from->sb_sectlog;
+	to->sb_inodelog = from->sb_inodelog;
+	to->sb_inopblog = from->sb_inopblog;
+	to->sb_agblklog = from->sb_agblklog;
+	to->sb_rextslog = from->sb_rextslog;
+	to->sb_inprogress = from->sb_inprogress;
+	to->sb_imax_pct = from->sb_imax_pct;
+	to->sb_icount = cpu_to_be64(from->sb_icount);
+	to->sb_ifree = cpu_to_be64(from->sb_ifree);
+	to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks);
+	to->sb_frextents = cpu_to_be64(from->sb_frextents);
+
+	to->sb_flags = from->sb_flags;
+	to->sb_shared_vn = from->sb_shared_vn;
+	to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt);
+	to->sb_unit = cpu_to_be32(from->sb_unit);
+	to->sb_width = cpu_to_be32(from->sb_width);
+	to->sb_dirblklog = from->sb_dirblklog;
+	to->sb_logsectlog = from->sb_logsectlog;
+	to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize);
+	to->sb_logsunit = cpu_to_be32(from->sb_logsunit);
+
+	/*
+	 * We need to ensure that bad_features2 always matches features2.
+	 * Hence we enforce that here rather than having to remember to do it
+	 * everywhere else that updates features2.
+	 */
+	from->sb_bad_features2 = from->sb_features2;
+	to->sb_features2 = cpu_to_be32(from->sb_features2);
+	to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
+
+	if (xfs_sb_version_hascrc(from)) {
+		to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
+		to->sb_features_ro_compat =
+				cpu_to_be32(from->sb_features_ro_compat);
+		to->sb_features_incompat =
+				cpu_to_be32(from->sb_features_incompat);
+		to->sb_features_log_incompat =
+				cpu_to_be32(from->sb_features_log_incompat);
+		to->sb_pad = 0;
+		to->sb_lsn = cpu_to_be64(from->sb_lsn);
+	}
+}
+
+static int
+xfs_sb_verify(
+	struct xfs_buf	*bp,
+	bool		check_version)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_sb	sb;
+
+	/*
+	 * Use call variant which doesn't convert quota flags from disk 
+	 * format, because xfs_mount_validate_sb checks the on-disk flags.
+	 */
+	__xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
+
+	/*
+	 * Only check the in progress field for the primary superblock as
+	 * mkfs.xfs doesn't clear it from secondary superblocks.
+	 */
+	return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+				     check_version);
+}
+
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid.  We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ *
+ * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
+ * last field in V4 secondary superblocks.  So for secondary superblocks,
+ * we are more forgiving, and ignore CRC failures if the primary doesn't
+ * indicate that the fs version is V5.
+ */
+static void
+xfs_sb_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_dsb	*dsb = XFS_BUF_TO_SBP(bp);
+	int		error;
+
+	/*
+	 * open code the version check to avoid needing to convert the entire
+	 * superblock from disk order just to check the version number
+	 */
+	if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+	    (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+						XFS_SB_VERSION_5) ||
+	     dsb->sb_crc != 0)) {
+
+		if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
+			/* Only fail bad secondaries on a known V5 filesystem */
+			if (bp->b_bn == XFS_SB_DADDR ||
+			    xfs_sb_version_hascrc(&mp->m_sb)) {
+				error = -EFSBADCRC;
+				goto out_error;
+			}
+		}
+	}
+	error = xfs_sb_verify(bp, true);
+
+out_error:
+	if (error) {
+		xfs_buf_ioerror(bp, error);
+		if (error == -EFSCORRUPTED || error == -EFSBADCRC)
+			xfs_verifier_error(bp);
+	}
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_dsb	*dsb = XFS_BUF_TO_SBP(bp);
+
+	if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
+		/* XFS filesystem, verify noisily! */
+		xfs_sb_read_verify(bp);
+		return;
+	}
+	/* quietly fail */
+	xfs_buf_ioerror(bp, -EWRONGFS);
+}
+
+static void
+xfs_sb_write_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	int			error;
+
+	error = xfs_sb_verify(bp, false);
+	if (error) {
+		xfs_buf_ioerror(bp, error);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+	.verify_read = xfs_sb_read_verify,
+	.verify_write = xfs_sb_write_verify,
+};
+
+const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+	.verify_read = xfs_sb_quiet_read_verify,
+	.verify_write = xfs_sb_write_verify,
+};
+
+/*
+ * xfs_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure
+ */
+void
+xfs_sb_mount_common(
+	struct xfs_mount *mp,
+	struct xfs_sb	*sbp)
+{
+	mp->m_agfrotor = mp->m_agirotor = 0;
+	spin_lock_init(&mp->m_agirotor_lock);
+	mp->m_maxagi = mp->m_sb.sb_agcount;
+	mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+	mp->m_blockmask = sbp->sb_blocksize - 1;
+	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+	mp->m_blockwmask = mp->m_blockwsize - 1;
+
+	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+	mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+	mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+	mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+	mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+	mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+	mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+	mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+	mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+	mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+	mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+
+	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+					sbp->sb_inopblock);
+	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+	struct xfs_mount *mp,
+	xfs_agnumber_t	agcount)
+{
+	xfs_agnumber_t	index;
+	xfs_perag_t	*pag;
+	xfs_sb_t	*sbp = &mp->m_sb;
+	uint64_t	ifree = 0;
+	uint64_t	ialloc = 0;
+	uint64_t	bfree = 0;
+	uint64_t	bfreelst = 0;
+	uint64_t	btree = 0;
+	int		error;
+
+	for (index = 0; index < agcount; index++) {
+		/*
+		 * read the agf, then the agi. This gets us
+		 * all the information we need and populates the
+		 * per-ag structures for us.
+		 */
+		error = xfs_alloc_pagf_init(mp, NULL, index, 0);
+		if (error)
+			return error;
+
+		error = xfs_ialloc_pagi_init(mp, NULL, index);
+		if (error)
+			return error;
+		pag = xfs_perag_get(mp, index);
+		ifree += pag->pagi_freecount;
+		ialloc += pag->pagi_count;
+		bfree += pag->pagf_freeblks;
+		bfreelst += pag->pagf_flcount;
+		btree += pag->pagf_btreeblks;
+		xfs_perag_put(pag);
+	}
+
+	/* Overwrite incore superblock counters with just-read data */
+	spin_lock(&mp->m_sb_lock);
+	sbp->sb_ifree = ifree;
+	sbp->sb_icount = ialloc;
+	sbp->sb_fdblocks = bfree + bfreelst + btree;
+	spin_unlock(&mp->m_sb_lock);
+
+	xfs_reinit_percpu_counters(mp);
+
+	return 0;
+}
+
+/*
+ * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
+ * into the superblock buffer to be logged.  It does not provide the higher
+ * level of locking that is needed to protect the in-core superblock from
+ * concurrent access.
+ */
+void
+xfs_log_sb(
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp = xfs_trans_getsb(tp, mp, 0);
+
+	mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+	mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+	mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
+	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
+}
+
+/*
+ * xfs_sync_sb
+ *
+ * Sync the superblock to disk.
+ *
+ * Note that the caller is responsible for checking the frozen state of the
+ * filesystem. This procedure uses the non-blocking transaction allocator and
+ * thus will allow modifications to a frozen fs. This is required because this
+ * code can be called during the process of freezing where use of the high-level
+ * allocator would deadlock.
+ */
+int
+xfs_sync_sb(
+	struct xfs_mount	*mp,
+	bool			wait)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_log_sb(tp);
+	if (wait)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_sb.h b/kernel/fs/xfs/libxfs/xfs_sb.h
new file mode 100644
index 000000000..b25bb9a34
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_sb.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SB_H__
+#define	__XFS_SB_H__
+
+/*
+ * perag get/put wrappers for ref counting
+ */
+extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
+extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
+					   int tag);
+extern void	xfs_perag_put(struct xfs_perag *pag);
+extern int	xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
+
+extern void	xfs_sb_calc_crc(struct xfs_buf *bp);
+extern void	xfs_log_sb(struct xfs_trans *tp);
+extern int	xfs_sync_sb(struct xfs_mount *mp, bool wait);
+extern void	xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
+extern void	xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
+extern void	xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
+extern void	xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+
+#endif	/* __XFS_SB_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_shared.h b/kernel/fs/xfs/libxfs/xfs_shared.h
new file mode 100644
index 000000000..8dda4b321
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_shared.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SHARED_H__
+#define __XFS_SHARED_H__
+
+/*
+ * Definitions shared between kernel and userspace that don't fit into any other
+ * header file that is shared with userspace.
+ */
+struct xfs_ifork;
+struct xfs_buf;
+struct xfs_buf_ops;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+
+/*
+ * Buffer verifier operations are widely used, including userspace tools
+ */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+/*
+ * Transaction types.  Used to distinguish types of buffers. These never reach
+ * the log.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE	1
+#define XFS_TRANS_SETATTR_SIZE		2
+#define XFS_TRANS_INACTIVE		3
+#define XFS_TRANS_CREATE		4
+#define XFS_TRANS_CREATE_TRUNC		5
+#define XFS_TRANS_TRUNCATE_FILE		6
+#define XFS_TRANS_REMOVE		7
+#define XFS_TRANS_LINK			8
+#define XFS_TRANS_RENAME		9
+#define XFS_TRANS_MKDIR			10
+#define XFS_TRANS_RMDIR			11
+#define XFS_TRANS_SYMLINK		12
+#define XFS_TRANS_SET_DMATTRS		13
+#define XFS_TRANS_GROWFS		14
+#define XFS_TRANS_STRAT_WRITE		15
+#define XFS_TRANS_DIOSTRAT		16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define	XFS_TRANS_WRITEID		18
+#define	XFS_TRANS_ADDAFORK		19
+#define	XFS_TRANS_ATTRINVAL		20
+#define	XFS_TRANS_ATRUNCATE		21
+#define	XFS_TRANS_ATTR_SET		22
+#define	XFS_TRANS_ATTR_RM		23
+#define	XFS_TRANS_ATTR_FLAG		24
+#define	XFS_TRANS_CLEAR_AGI_BUCKET	25
+#define XFS_TRANS_SB_CHANGE		26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1		27
+#define XFS_TRANS_DUMMY2		28
+#define XFS_TRANS_QM_QUOTAOFF		29
+#define XFS_TRANS_QM_DQALLOC		30
+#define XFS_TRANS_QM_SETQLIM		31
+#define XFS_TRANS_QM_DQCLUSTER		32
+#define XFS_TRANS_QM_QINOCREATE		33
+#define XFS_TRANS_QM_QUOTAOFF_END	34
+#define XFS_TRANS_FSYNC_TS		35
+#define	XFS_TRANS_GROWFSRT_ALLOC	36
+#define	XFS_TRANS_GROWFSRT_ZERO		37
+#define	XFS_TRANS_GROWFSRT_FREE		38
+#define	XFS_TRANS_SWAPEXT		39
+#define	XFS_TRANS_CHECKPOINT		40
+#define	XFS_TRANS_ICREATE		41
+#define	XFS_TRANS_CREATE_TMPFILE	42
+#define	XFS_TRANS_TYPE_MAX		43
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+#define XFS_TRANS_TYPES \
+	{ XFS_TRANS_SETATTR_NOT_SIZE,	"SETATTR_NOT_SIZE" }, \
+	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
+	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
+	{ XFS_TRANS_CREATE,		"CREATE" }, \
+	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
+	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
+	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
+	{ XFS_TRANS_LINK,		"LINK" }, \
+	{ XFS_TRANS_RENAME,		"RENAME" }, \
+	{ XFS_TRANS_MKDIR,		"MKDIR" }, \
+	{ XFS_TRANS_RMDIR,		"RMDIR" }, \
+	{ XFS_TRANS_SYMLINK,		"SYMLINK" }, \
+	{ XFS_TRANS_SET_DMATTRS,	"SET_DMATTRS" }, \
+	{ XFS_TRANS_GROWFS,		"GROWFS" }, \
+	{ XFS_TRANS_STRAT_WRITE,	"STRAT_WRITE" }, \
+	{ XFS_TRANS_DIOSTRAT,		"DIOSTRAT" }, \
+	{ XFS_TRANS_WRITEID,		"WRITEID" }, \
+	{ XFS_TRANS_ADDAFORK,		"ADDAFORK" }, \
+	{ XFS_TRANS_ATTRINVAL,		"ATTRINVAL" }, \
+	{ XFS_TRANS_ATRUNCATE,		"ATRUNCATE" }, \
+	{ XFS_TRANS_ATTR_SET,		"ATTR_SET" }, \
+	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
+	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
+	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
+	{ XFS_TRANS_SB_CHANGE,		"SBCHANGE" }, \
+	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
+	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
+	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
+	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
+	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
+	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
+	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
+	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
+	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
+	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
+	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
+	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
+	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
+	{ XFS_TRANS_ICREATE,		"ICREATE" }, \
+	{ XFS_TRANS_CREATE_TMPFILE,	"CREATE_TMPFILE" }, \
+	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction.  It points to the log item and keeps some
+ * flags to track the state of the log item.  It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+struct xfs_log_item_desc {
+	struct xfs_log_item	*lid_item;
+	struct list_head	lid_trans;
+	unsigned char		lid_flags;
+};
+
+#define XFS_LID_DIRTY		0x1
+
+/* log size calculation functions */
+int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int	xfs_log_calc_minimum_size(struct xfs_mount *);
+
+
+/*
+ * Values for t_flags.
+ */
+#define	XFS_TRANS_DIRTY		0x01	/* something needs to be logged */
+#define	XFS_TRANS_SB_DIRTY	0x02	/* superblock is modified */
+#define	XFS_TRANS_PERM_LOG_RES	0x04	/* xact took a permanent log res */
+#define	XFS_TRANS_SYNC		0x08	/* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE	0x20    /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT	0x40	/* Transaction has elevated writer
+					   count in superblock */
+/*
+ * Values for call flags parameter.
+ */
+#define	XFS_TRANS_RELEASE_LOG_RES	0x4
+#define	XFS_TRANS_ABORT			0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define	XFS_TRANS_SB_ICOUNT		0x00000001
+#define	XFS_TRANS_SB_IFREE		0x00000002
+#define	XFS_TRANS_SB_FDBLOCKS		0x00000004
+#define	XFS_TRANS_SB_RES_FDBLOCKS	0x00000008
+#define	XFS_TRANS_SB_FREXTENTS		0x00000010
+#define	XFS_TRANS_SB_RES_FREXTENTS	0x00000020
+#define	XFS_TRANS_SB_DBLOCKS		0x00000040
+#define	XFS_TRANS_SB_AGCOUNT		0x00000080
+#define	XFS_TRANS_SB_IMAXPCT		0x00000100
+#define	XFS_TRANS_SB_REXTSIZE		0x00000200
+#define	XFS_TRANS_SB_RBMBLOCKS		0x00000400
+#define	XFS_TRANS_SB_RBLOCKS		0x00000800
+#define	XFS_TRANS_SB_REXTENTS		0x00001000
+#define	XFS_TRANS_SB_REXTSLOG		0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer reference count
+ * values.  This determines how hard the buffer cache tries to hold onto the
+ * buffer.
+ */
+#define	XFS_AGF_REF		4
+#define	XFS_AGI_REF		4
+#define	XFS_AGFL_REF		3
+#define	XFS_INO_BTREE_REF	3
+#define	XFS_ALLOC_BTREE_REF	2
+#define	XFS_BMAP_BTREE_REF	2
+#define	XFS_DIR_BTREE_REF	2
+#define	XFS_INO_REF		2
+#define	XFS_ATTR_BTREE_REF	1
+#define	XFS_DQUOT_REF		1
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
+#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
+#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
+
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+			uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+			uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+				 struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+#endif /* __XFS_SHARED_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_symlink_remote.c b/kernel/fs/xfs/libxfs/xfs_symlink_remote.c
new file mode 100644
index 000000000..e7e26bd64
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+	struct xfs_mount *mp,
+	int		pathlen)
+{
+	int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+	return (pathlen + buflen - 1) / buflen;
+}
+
+int
+xfs_symlink_hdr_set(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+	dsl->sl_offset = cpu_to_be32(offset);
+	dsl->sl_bytes = cpu_to_be32(size);
+	uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+	dsl->sl_owner = cpu_to_be64(ino);
+	dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+	bp->b_ops = &xfs_symlink_buf_ops;
+
+	return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. the verifier does
+ * CRC, location and bounds checking, the unpacking function checks the path
+ * parameters and owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+	if (offset != be32_to_cpu(dsl->sl_offset))
+		return false;
+	if (size != be32_to_cpu(dsl->sl_bytes))
+		return false;
+	if (ino != be64_to_cpu(dsl->sl_owner))
+		return false;
+
+	/* ok */
+	return true;
+}
+
+static bool
+xfs_symlink_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+		return false;
+	if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+		return false;
+	if (be32_to_cpu(dsl->sl_offset) +
+				be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+		return false;
+	if (dsl->sl_owner == 0)
+		return false;
+
+	return true;
+}
+
+static void
+xfs_symlink_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_symlink_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_symlink_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_symlink_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (bip) {
+		struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+		dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	}
+	xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+	.verify_read = xfs_symlink_read_verify,
+	.verify_write = xfs_symlink_write_verify,
+};
+
+void
+xfs_symlink_local_to_remote(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*ip,
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	char			*buf;
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+		bp->b_ops = NULL;
+		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+		return;
+	}
+
+	/*
+	 * As this symlink fits in an inode literal area, it must also fit in
+	 * the smallest buffer the filesystem supports.
+	 */
+	ASSERT(BBTOB(bp->b_length) >=
+			ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+	bp->b_ops = &xfs_symlink_buf_ops;
+
+	buf = bp->b_addr;
+	buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+	memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_trans_resv.c b/kernel/fs/xfs/libxfs/xfs_trans_resv.c
new file mode 100644
index 000000000..68cb1e7bf
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_trans_resv.c
@@ -0,0 +1,878 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer.  Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+	return round_up(sizeof(struct xlog_op_header) +
+			sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate out transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction.  size is used to tell how many
+ * bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+	uint		nbufs,
+	uint		size)
+{
+	return nbufs * (size + xfs_buf_log_overhead());
+}
+
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, then
+ * amount of information we actually log is greater than the size of the inode
+ * on disk. Hence we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - 4 log op headers for object
+ *	- for the ilf, the inode core and 2 forks
+ * - inode log format object
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ *	- the btree data contained by both forks will fit into the inode size,
+ *	  hence when combined with the inode core above, we have a total of the
+ *	  actual inode size.
+ *	- the BMBT headers need to be accounted separately, as they are
+ *	  additional to the records and pointers that fit inside the inode
+ *	  forks.
+ */
+STATIC uint
+xfs_calc_inode_res(
+	struct xfs_mount	*mp,
+	uint			ninodes)
+{
+	return ninodes *
+		(4 * sizeof(struct xlog_op_header) +
+		 sizeof(struct xfs_inode_log_format) +
+		 mp->m_sb.sb_inodesize +
+		 2 * XFS_BMBT_BLOCK_LEN(mp));
+}
+
+/*
+ * The free inode btree is a conditional feature and the log reservation
+ * requirements differ slightly from that of the traditional inode allocation
+ * btree. The finobt tracks records for inode chunks with at least one free
+ * inode. A record can be removed from the tree for an inode allocation
+ * or free and thus the finobt reservation is unconditional across:
+ *
+ * 	- inode allocation
+ * 	- inode free
+ * 	- inode chunk allocation
+ *
+ * The 'modify' param indicates to include the record modification scenario. The
+ * 'alloc' param indicates to include the reservation for free space btree
+ * modifications on behalf of finobt modifications. This is required only for
+ * transactions that do not already account for free space btree modifications.
+ *
+ * the free inode btree: max depth * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the free inode btree entry: block size
+ */
+STATIC uint
+xfs_calc_finobt_res(
+	struct xfs_mount 	*mp,
+	int			alloc,
+	int			modify)
+{
+	uint res;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return 0;
+
+	res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
+	if (alloc)
+		res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 
+					XFS_FSB_TO_B(mp, 1));
+	if (modify)
+		res += (uint)XFS_FSB_TO_B(mp, 1);
+
+	return res;
+}
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+				      XFS_FSB_TO_B(mp, 1)) +
+		     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+				      XFS_FSB_TO_B(mp, 1)) +
+		    xfs_calc_buf_res(5, 0) +
+		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				     XFS_FSB_TO_B(mp, 1)) +
+		    xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+				     mp->m_in_maxlevels, 0)));
+}
+
+/*
+ * In renaming a files we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *	of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 4) +
+		     xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing an inode from unlinked list at first, we can modify:
+ *    the agi hash list and counters: sector size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+	struct xfs_mount        *mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+	       max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+}
+
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_iunlink_remove_reservation(mp) +
+		MAX((xfs_calc_inode_res(mp, 2) +
+		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For adding an inode to unlinked list we can modify:
+ *    the agi hash list: sector size
+ *    the unlinked inode: inode size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_iunlink_add_reservation(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the finobt (record modification and allocation btrees)
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 2) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		(uint)XFS_FSB_TO_B(mp, 1) +
+		xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_finobt_res(mp, 1, 1);
+}
+
+/*
+ * For create we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: mp->m_ialloc_blks * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_alloc(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		mp->m_sb.sb_sectsize +
+		xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX(xfs_calc_create_resv_alloc(mp),
+		    xfs_calc_create_resv_modify(mp));
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ *    the finobt (record insertion)
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		mp->m_sb.sb_sectsize +
+		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_finobt_res(mp, 0, 0);
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX(xfs_calc_icreate_resv_alloc(mp),
+		    xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return xfs_calc_icreate_reservation(mp);
+	return __xfs_calc_create_reservation(mp);
+
+}
+
+STATIC uint
+xfs_calc_create_tmpfile_reservation(
+	struct xfs_mount        *mp)
+{
+	uint	res = XFS_DQUOT_LOGRES(mp);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		res += xfs_calc_icreate_resv_alloc(mp);
+	else
+		res += xfs_calc_create_resv_alloc(mp);
+
+	return res + xfs_calc_iunlink_add_reservation(mp);
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_create_reservation(mp);
+}
+
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_create_reservation(mp) +
+	       xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ *    the finobt (record insertion, removal or modification)
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_iunlink_remove_reservation(mp) +
+		xfs_calc_buf_res(1, 0) +
+		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+				 mp->m_in_maxlevels, 0) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_finobt_res(mp, 0, 1);
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ *	superblock
+ *	agi and agf
+ *	allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *	superblock: sector size
+ *	agf of the ag from which the extent is allocated: sector size
+ *	bmap btree for bitmap/summary inode: max depth * blocksize
+ *	bitmap/summary inode: inode size
+ *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *	one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *	superblock: sector size
+ *	bitmap inode: inode size
+ *	summary inode: inode size
+ *	one bitmap block: blocksize
+ *	summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_inode_res(mp, 2) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+		xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *	inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *	inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ *	the inode being converted: inode size
+ *	agf block and superblock (for block allocation)
+ *	the new block (directory sized)
+ *	bmap blocks for the new directory block
+ *	allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
+		xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+	struct xfs_mount	*mp)
+{
+	return MAX((xfs_calc_inode_res(mp, 1) +
+		    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+				     XFS_FSB_TO_B(mp, 1))),
+		   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+				     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ *	the inode getting the attribute
+ *	the superblock for allocations
+ *	the agfs extents are allocated from
+ *	the attribute btree * max depth
+ *	the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime(see
+ * below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ * 	the superblock for allocations: sector size
+ *	the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate out the space unit for
+ * one block so that the caller could figure out the total space according
+ * to the attibute extent length in blocks by:
+ *	ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+				      XFS_FSB_TO_B(mp, 1)) +
+		     (uint)XFS_FSB_TO_B(mp,
+					XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ *	the write transaction log space for quota file extent allocation
+ *	the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_write_reservation(mp) +
+		xfs_calc_buf_res(1,
+			XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+/*
+ * Turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ *    the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+	struct xfs_mount	*mp)
+{
+	return sizeof(struct xfs_qoff_logitem) * 2 +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * End of turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+	struct xfs_mount	*mp)
+{
+	return sizeof(struct xfs_qoff_logitem) * 2;
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ *     the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+void
+xfs_trans_resv_calc(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	/*
+	 * The following transactions are logged in physical format and
+	 * require a permanent reservation on space.
+	 */
+	resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
+	resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+	resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
+	resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+	resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
+	resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+	resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
+	resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+	resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
+	resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+	resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
+	resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+	resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
+	resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+	resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_create_tmpfile.tr_logres =
+			xfs_calc_create_tmpfile_reservation(mp);
+	resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+	resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+	resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
+	resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
+	resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
+	resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
+	resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
+	resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
+	resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
+	resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
+	resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+	resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
+	resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
+	resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
+	resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+	resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
+	resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+	resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	/*
+	 * The following transactions are logged in logical format with
+	 * a default log count.
+	 */
+	resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
+	resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
+	resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_qm_equotaoff.tr_logres =
+		xfs_calc_qm_quotaoff_end_reservation(mp);
+	resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+	resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	/* The following transaction are logged in logical format */
+	resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+	resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+	resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+	resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+	resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+	resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+	resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+	resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_trans_resv.h b/kernel/fs/xfs/libxfs/xfs_trans_resv.h
new file mode 100644
index 000000000..2d5bdfce6
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_trans_resv.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_TRANS_RESV_H__
+#define	__XFS_TRANS_RESV_H__
+
+struct xfs_mount;
+
+/*
+ * structure for maintaining pre-calculated transaction reservations.
+ */
+struct xfs_trans_res {
+	uint	tr_logres;	/* log space unit in bytes per log ticket */
+	int	tr_logcount;	/* number of log operations per log ticket */
+	int	tr_logflags;	/* log flags, currently only used for indicating
+				 * a reservation request is permanent or not */
+};
+
+struct xfs_trans_resv {
+	struct xfs_trans_res	tr_write;	/* extent alloc trans */
+	struct xfs_trans_res	tr_itruncate;	/* truncate trans */
+	struct xfs_trans_res	tr_rename;	/* rename trans */
+	struct xfs_trans_res	tr_link;	/* link trans */
+	struct xfs_trans_res	tr_remove;	/* unlink trans */
+	struct xfs_trans_res	tr_symlink;	/* symlink trans */
+	struct xfs_trans_res	tr_create;	/* create trans */
+	struct xfs_trans_res	tr_create_tmpfile; /* create O_TMPFILE trans */
+	struct xfs_trans_res	tr_mkdir;	/* mkdir trans */
+	struct xfs_trans_res	tr_ifree;	/* inode free trans */
+	struct xfs_trans_res	tr_ichange;	/* inode update trans */
+	struct xfs_trans_res	tr_growdata;	/* fs data section grow trans */
+	struct xfs_trans_res	tr_addafork;	/* add inode attr fork trans */
+	struct xfs_trans_res	tr_writeid;	/* write setuid/setgid file */
+	struct xfs_trans_res	tr_attrinval;	/* attr fork buffer
+						 * invalidation */
+	struct xfs_trans_res	tr_attrsetm;	/* set/create an attribute at
+						 * mount time */
+	struct xfs_trans_res	tr_attrsetrt;	/* set/create an attribute at
+						 * runtime */
+	struct xfs_trans_res	tr_attrrm;	/* remove an attribute */
+	struct xfs_trans_res	tr_clearagi;	/* clear agi unlinked bucket */
+	struct xfs_trans_res	tr_growrtalloc;	/* grow realtime allocations */
+	struct xfs_trans_res	tr_growrtzero;	/* grow realtime zeroing */
+	struct xfs_trans_res	tr_growrtfree;	/* grow realtime freeing */
+	struct xfs_trans_res	tr_qm_setqlim;	/* adjust quota limits */
+	struct xfs_trans_res	tr_qm_dqalloc;	/* allocate quota on disk */
+	struct xfs_trans_res	tr_qm_quotaoff;	/* turn quota off */
+	struct xfs_trans_res	tr_qm_equotaoff;/* end of turn quota off */
+	struct xfs_trans_res	tr_sb;		/* modify superblock */
+	struct xfs_trans_res	tr_fsyncts;	/* update timestamps on fsync */
+};
+
+/* shorthand way of accessing reservation structure */
+#define M_RES(mp)	(&(mp)->m_resv)
+
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ */
+#define	XFS_ALLOCFREE_LOG_RES(mp,nx) \
+	((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define	XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+	((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define	XFS_DIROP_LOG_RES(mp)	\
+	(XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+	 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define	XFS_DIROP_LOG_COUNT(mp)	\
+	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * Various log count values.
+ */
+#define	XFS_DEFAULT_LOG_COUNT		1
+#define	XFS_DEFAULT_PERM_LOG_COUNT	2
+#define	XFS_ITRUNCATE_LOG_COUNT		2
+#define XFS_INACTIVE_LOG_COUNT		2
+#define	XFS_CREATE_LOG_COUNT		2
+#define	XFS_CREATE_TMPFILE_LOG_COUNT	2
+#define	XFS_MKDIR_LOG_COUNT		3
+#define	XFS_SYMLINK_LOG_COUNT		3
+#define	XFS_REMOVE_LOG_COUNT		2
+#define	XFS_LINK_LOG_COUNT		2
+#define	XFS_RENAME_LOG_COUNT		2
+#define	XFS_WRITE_LOG_COUNT		2
+#define	XFS_ADDAFORK_LOG_COUNT		2
+#define	XFS_ATTRINVAL_LOG_COUNT		1
+#define	XFS_ATTRSET_LOG_COUNT		3
+#define	XFS_ATTRRM_LOG_COUNT		3
+
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+
+#endif	/* __XFS_TRANS_RESV_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_trans_space.h b/kernel/fs/xfs/libxfs/xfs_trans_space.h
new file mode 100644
index 000000000..bf9c45793
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_trans_space.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_TRANS_SPACE_H__
+#define __XFS_TRANS_SPACE_H__
+
+/*
+ * Components of space reservations.
+ */
+#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \
+		(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+#define	XFS_EXTENTADD_SPACE_RES(mp,w)	(XFS_BM_MAXLEVELS(mp,w) - 1)
+#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
+	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+	  XFS_EXTENTADD_SPACE_RES(mp,w))
+#define	XFS_DAENTER_1B(mp,w)	\
+	((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
+#define	XFS_DAENTER_DBS(mp,w)	\
+	(XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
+#define	XFS_DAENTER_BLOCKS(mp,w)	\
+	(XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
+#define	XFS_DAENTER_BMAP1B(mp,w)	\
+	XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
+#define	XFS_DAENTER_BMAPS(mp,w)		\
+	(XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
+#define	XFS_DAENTER_SPACE_RES(mp,w)	\
+	(XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
+#define	XFS_DAREMOVE_SPACE_RES(mp,w)	XFS_DAENTER_BMAPS(mp,w)
+#define	XFS_DIRENTER_MAX_SPLIT(mp,nl)	1
+#define	XFS_DIRENTER_SPACE_RES(mp,nl)	\
+	(XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
+	 XFS_DIRENTER_MAX_SPLIT(mp,nl))
+#define	XFS_DIRREMOVE_SPACE_RES(mp)	\
+	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+#define	XFS_IALLOC_SPACE_RES(mp)	\
+	((mp)->m_ialloc_blks + \
+	 (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
+	  ((mp)->m_in_maxlevels - 1)))
+
+/*
+ * Space reservation values for various transactions.
+ */
+#define	XFS_ADDAFORK_SPACE_RES(mp)	\
+	((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+#define	XFS_ATTRRM_SPACE_RES(mp)	\
+	XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
+/* This macro is not used - see inline code in xfs_attr_set */
+#define	XFS_ATTRSET_SPACE_RES(mp, v)	\
+	(XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
+#define	XFS_CREATE_SPACE_RES(mp,nl)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define	XFS_DIOSTRAT_SPACE_RES(mp, v)	\
+	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
+#define	XFS_GROWFS_SPACE_RES(mp)	\
+	(2 * XFS_AG_MAXLEVELS(mp))
+#define	XFS_GROWFSRT_SPACE_RES(mp,b)	\
+	((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
+#define	XFS_LINK_SPACE_RES(mp,nl)	\
+	XFS_DIRENTER_SPACE_RES(mp,nl)
+#define	XFS_MKDIR_SPACE_RES(mp,nl)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define	XFS_QM_DQALLOC_SPACE_RES(mp)	\
+	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
+	 XFS_DQUOT_CLUSTER_SIZE_FSB)
+#define	XFS_QM_QINOCREATE_SPACE_RES(mp)	\
+	XFS_IALLOC_SPACE_RES(mp)
+#define	XFS_REMOVE_SPACE_RES(mp)	\
+	XFS_DIRREMOVE_SPACE_RES(mp)
+#define	XFS_RENAME_SPACE_RES(mp,nl)	\
+	(XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define	XFS_SYMLINK_SPACE_RES(mp,nl,b)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+#define XFS_IFREE_SPACE_RES(mp)		\
+	(xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0)
+
+
+#endif	/* __XFS_TRANS_SPACE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_types.h b/kernel/fs/xfs/libxfs/xfs_types.h
new file mode 100644
index 000000000..b79dc66b2
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_types.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_TYPES_H__
+#define	__XFS_TYPES_H__
+
+typedef __uint32_t	prid_t;		/* project ID */
+
+typedef __uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
+typedef	__uint32_t	xfs_agino_t;	/* inode # within allocation grp */
+typedef	__uint32_t	xfs_extlen_t;	/* extent length in blocks */
+typedef	__uint32_t	xfs_agnumber_t;	/* allocation group number */
+typedef __int32_t	xfs_extnum_t;	/* # of extents in a file */
+typedef __int16_t	xfs_aextnum_t;	/* # extents in an attribute fork */
+typedef	__int64_t	xfs_fsize_t;	/* bytes in a file */
+typedef __uint64_t	xfs_ufsize_t;	/* unsigned bytes in a file */
+
+typedef	__int32_t	xfs_suminfo_t;	/* type of bitmap summary info */
+typedef	__int32_t	xfs_rtword_t;	/* word type for bitmap manipulations */
+
+typedef	__int64_t	xfs_lsn_t;	/* log sequence number */
+typedef	__int32_t	xfs_tid_t;	/* transaction identifier */
+
+typedef	__uint32_t	xfs_dablk_t;	/* dir/attr block number (in file) */
+typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
+
+typedef	__uint64_t	xfs_fsblock_t;	/* blockno in filesystem (agno|agbno) */
+typedef __uint64_t	xfs_rfsblock_t;	/* blockno in filesystem (raw) */
+typedef __uint64_t	xfs_rtblock_t;	/* extent (block) in realtime area */
+typedef __uint64_t	xfs_fileoff_t;	/* block number in a file */
+typedef __uint64_t	xfs_filblks_t;	/* number of blocks in a file */
+
+typedef	__int64_t	xfs_srtblock_t;	/* signed version of xfs_rtblock_t */
+typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
+
+/*
+ * Null values for the types.
+ */
+#define	NULLFSBLOCK	((xfs_fsblock_t)-1)
+#define	NULLRFSBLOCK	((xfs_rfsblock_t)-1)
+#define	NULLRTBLOCK	((xfs_rtblock_t)-1)
+#define	NULLFILEOFF	((xfs_fileoff_t)-1)
+
+#define	NULLAGBLOCK	((xfs_agblock_t)-1)
+#define	NULLAGNUMBER	((xfs_agnumber_t)-1)
+#define	NULLEXTNUM	((xfs_extnum_t)-1)
+
+#define NULLCOMMITLSN	((xfs_lsn_t)-1)
+
+#define	NULLFSINO	((xfs_ino_t)-1)
+#define	NULLAGINO	((xfs_agino_t)-1)
+
+/*
+ * Max values for extlen, extnum, aextnum.
+ */
+#define	MAXEXTLEN	((xfs_extlen_t)0x001fffff)	/* 21 bits */
+#define	MAXEXTNUM	((xfs_extnum_t)0x7fffffff)	/* signed int */
+#define	MAXAEXTNUM	((xfs_aextnum_t)0x7fff)		/* signed short */
+
+/*
+ * Minimum and maximum blocksize and sectorsize.
+ * The blocksize upper limit is pretty much arbitrary.
+ * The sectorsize upper limit is due to sizeof(sb_sectsize).
+ */
+#define XFS_MIN_BLOCKSIZE_LOG	9	/* i.e. 512 bytes */
+#define XFS_MAX_BLOCKSIZE_LOG	16	/* i.e. 65536 bytes */
+#define XFS_MIN_BLOCKSIZE	(1 << XFS_MIN_BLOCKSIZE_LOG)
+#define XFS_MAX_BLOCKSIZE	(1 << XFS_MAX_BLOCKSIZE_LOG)
+#define XFS_MIN_SECTORSIZE_LOG	9	/* i.e. 512 bytes */
+#define XFS_MAX_SECTORSIZE_LOG	15	/* i.e. 32768 bytes */
+#define XFS_MIN_SECTORSIZE	(1 << XFS_MIN_SECTORSIZE_LOG)
+#define XFS_MAX_SECTORSIZE	(1 << XFS_MAX_SECTORSIZE_LOG)
+
+/*
+ * Inode fork identifiers.
+ */
+#define	XFS_DATA_FORK	0
+#define	XFS_ATTR_FORK	1
+
+/*
+ * Min numbers of data/attr fork btree root pointers.
+ */
+#define MINDBTPTRS	3
+#define MINABTPTRS	2
+
+/*
+ * MAXNAMELEN is the length (including the terminating null) of
+ * the longest permissible file (component) name.
+ */
+#define MAXNAMELEN	256
+
+typedef enum {
+	XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
+} xfs_lookup_t;
+
+typedef enum {
+	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
+	XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+} xfs_btnum_t;
+
+struct xfs_name {
+	const unsigned char	*name;
+	int			len;
+	int			type;
+};
+
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits..
+ */
+typedef __uint32_t	xfs_dqid_t;
+
+/*
+ * Constants for bit manipulations.
+ */
+#define	XFS_NBBYLOG	3		/* log2(NBBY) */
+#define	XFS_WORDLOG	2		/* log2(sizeof(xfs_rtword_t)) */
+#define	XFS_NBWORDLOG	(XFS_NBBYLOG + XFS_WORDLOG)
+#define	XFS_NBWORD	(1 << XFS_NBWORDLOG)
+#define	XFS_WORDMASK	((1 << XFS_WORDLOG) - 1)
+
+
+#endif	/* __XFS_TYPES_H__ */
diff --git a/kernel/fs/xfs/mrlock.h b/kernel/fs/xfs/mrlock.h
new file mode 100644
index 000000000..e3c92d19e
--- /dev/null
+++ b/kernel/fs/xfs/mrlock.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPPORT_MRLOCK_H__
+#define __XFS_SUPPORT_MRLOCK_H__
+
+#include <linux/rwsem.h>
+
+typedef struct {
+	struct rw_semaphore	mr_lock;
+#if defined(DEBUG) || defined(XFS_WARN)
+	int			mr_writer;
+#endif
+} mrlock_t;
+
+#if defined(DEBUG) || defined(XFS_WARN)
+#define mrinit(mrp, name)	\
+	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
+#else
+#define mrinit(mrp, name)	\
+	do { init_rwsem(&(mrp)->mr_lock); } while (0)
+#endif
+
+#define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
+#define mrfree(mrp)		do { } while (0)
+
+static inline void mraccess_nested(mrlock_t *mrp, int subclass)
+{
+	down_read_nested(&mrp->mr_lock, subclass);
+}
+
+static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
+{
+	down_write_nested(&mrp->mr_lock, subclass);
+#if defined(DEBUG) || defined(XFS_WARN)
+	mrp->mr_writer = 1;
+#endif
+}
+
+static inline int mrtryaccess(mrlock_t *mrp)
+{
+	return down_read_trylock(&mrp->mr_lock);
+}
+
+static inline int mrtryupdate(mrlock_t *mrp)
+{
+	if (!down_write_trylock(&mrp->mr_lock))
+		return 0;
+#if defined(DEBUG) || defined(XFS_WARN)
+	mrp->mr_writer = 1;
+#endif
+	return 1;
+}
+
+static inline void mrunlock_excl(mrlock_t *mrp)
+{
+#if defined(DEBUG) || defined(XFS_WARN)
+	mrp->mr_writer = 0;
+#endif
+	up_write(&mrp->mr_lock);
+}
+
+static inline void mrunlock_shared(mrlock_t *mrp)
+{
+	up_read(&mrp->mr_lock);
+}
+
+static inline void mrdemote(mrlock_t *mrp)
+{
+#if defined(DEBUG) || defined(XFS_WARN)
+	mrp->mr_writer = 0;
+#endif
+	downgrade_write(&mrp->mr_lock);
+}
+
+#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/kernel/fs/xfs/uuid.c b/kernel/fs/xfs/uuid.c
new file mode 100644
index 000000000..b83f76b6d
--- /dev/null
+++ b/kernel/fs/xfs/uuid.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <xfs.h>
+
+/* IRIX interpretation of an uuid_t */
+typedef struct {
+	__be32	uu_timelow;
+	__be16	uu_timemid;
+	__be16	uu_timehi;
+	__be16	uu_clockseq;
+	__be16	uu_node[3];
+} xfs_uu_t;
+
+/*
+ * uuid_getnodeuniq - obtain the node unique fields of a UUID.
+ *
+ * This is not in any way a standard or condoned UUID function;
+ * it just something that's needed for user-level file handles.
+ */
+void
+uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
+{
+	xfs_uu_t *uup = (xfs_uu_t *)uuid;
+
+	fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) |
+		   be16_to_cpu(uup->uu_timemid);
+	fsid[1] = be32_to_cpu(uup->uu_timelow);
+}
+
+int
+uuid_is_nil(uuid_t *uuid)
+{
+	int	i;
+	char	*cp = (char *)uuid;
+
+	if (uuid == NULL)
+		return 0;
+	/* implied check of version number here... */
+	for (i = 0; i < sizeof *uuid; i++)
+		if (*cp++) return 0;	/* not nil */
+	return 1;	/* is nil */
+}
+
+int
+uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
+{
+	return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
+}
diff --git a/kernel/fs/xfs/uuid.h b/kernel/fs/xfs/uuid.h
new file mode 100644
index 000000000..104db0f3b
--- /dev/null
+++ b/kernel/fs/xfs/uuid.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPPORT_UUID_H__
+#define __XFS_SUPPORT_UUID_H__
+
+typedef struct {
+	unsigned char	__u_bits[16];
+} uuid_t;
+
+extern int uuid_is_nil(uuid_t *uuid);
+extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
+extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
+
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+	memcpy(dst, src, sizeof(uuid_t));
+}
+
+#endif	/* __XFS_SUPPORT_UUID_H__ */
diff --git a/kernel/fs/xfs/xfs.h b/kernel/fs/xfs/xfs.h
new file mode 100644
index 000000000..a742c47f7
--- /dev/null
+++ b/kernel/fs/xfs/xfs.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_H__
+#define __XFS_H__
+
+#ifdef CONFIG_XFS_DEBUG
+#define STATIC
+#define DEBUG 1
+#define XFS_BUF_LOCK_TRACKING 1
+#endif
+
+#ifdef CONFIG_XFS_WARN
+#define XFS_WARN 1
+#endif
+
+
+#include "xfs_linux.h"
+
+#endif	/* __XFS_H__ */
diff --git a/kernel/fs/xfs/xfs_acl.c b/kernel/fs/xfs/xfs_acl.c
new file mode 100644
index 000000000..4b641676f
--- /dev/null
+++ b/kernel/fs/xfs/xfs_acl.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
+#include "xfs_trace.h"
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+
+/*
+ * Locking scheme:
+ *  - all ACL updates are protected by inode->i_mutex, which is taken before
+ *    calling into this file.
+ */
+
+STATIC struct posix_acl *
+xfs_acl_from_disk(
+	struct xfs_acl	*aclp,
+	int		max_entries)
+{
+	struct posix_acl_entry *acl_e;
+	struct posix_acl *acl;
+	struct xfs_acl_entry *ace;
+	unsigned int count, i;
+
+	count = be32_to_cpu(aclp->acl_cnt);
+	if (count > max_entries)
+		return ERR_PTR(-EFSCORRUPTED);
+
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < count; i++) {
+		acl_e = &acl->a_entries[i];
+		ace = &aclp->acl_entry[i];
+
+		/*
+		 * The tag is 32 bits on disk and 16 bits in core.
+		 *
+		 * Because every access to it goes through the core
+		 * format first this is not a problem.
+		 */
+		acl_e->e_tag = be32_to_cpu(ace->ae_tag);
+		acl_e->e_perm = be16_to_cpu(ace->ae_perm);
+
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+			break;
+		case ACL_GROUP:
+			acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			break;
+		default:
+			goto fail;
+		}
+	}
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+STATIC void
+xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
+{
+	const struct posix_acl_entry *acl_e;
+	struct xfs_acl_entry *ace;
+	int i;
+
+	aclp->acl_cnt = cpu_to_be32(acl->a_count);
+	for (i = 0; i < acl->a_count; i++) {
+		ace = &aclp->acl_entry[i];
+		acl_e = &acl->a_entries[i];
+
+		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+			break;
+		case ACL_GROUP:
+			ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+			break;
+		default:
+			ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+			break;
+		}
+
+		ace->ae_perm = cpu_to_be16(acl_e->e_perm);
+	}
+}
+
+struct posix_acl *
+xfs_get_acl(struct inode *inode, int type)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+	struct posix_acl *acl = NULL;
+	struct xfs_acl *xfs_acl;
+	unsigned char *ea_name;
+	int error;
+	int len;
+
+	trace_xfs_get_acl(ip);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		ea_name = SGI_ACL_FILE;
+		break;
+	case ACL_TYPE_DEFAULT:
+		ea_name = SGI_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+
+	/*
+	 * If we have a cached ACLs value just return it, not need to
+	 * go out to the disk.
+	 */
+	len = XFS_ACL_MAX_SIZE(ip->i_mount);
+	xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+	if (!xfs_acl)
+		return ERR_PTR(-ENOMEM);
+
+	error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+							&len, ATTR_ROOT);
+	if (error) {
+		/*
+		 * If the attribute doesn't exist make sure we have a negative
+		 * cache entry, for any other error assume it is transient and
+		 * leave the cache entry as ACL_NOT_CACHED.
+		 */
+		if (error == -ENOATTR)
+			goto out_update_cache;
+		goto out;
+	}
+
+	acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+	if (IS_ERR(acl))
+		goto out;
+
+out_update_cache:
+	set_cached_acl(inode, type, acl);
+out:
+	kmem_free(xfs_acl);
+	return acl;
+}
+
+STATIC int
+__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+	unsigned char *ea_name;
+	int error;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		ea_name = SGI_ACL_FILE;
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		ea_name = SGI_ACL_DEFAULT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		struct xfs_acl *xfs_acl;
+		int len = XFS_ACL_MAX_SIZE(ip->i_mount);
+
+		xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+		if (!xfs_acl)
+			return -ENOMEM;
+
+		xfs_acl_to_disk(xfs_acl, acl);
+
+		/* subtract away the unused acl entries */
+		len -= sizeof(struct xfs_acl_entry) *
+			 (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
+
+		error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+				len, ATTR_ROOT);
+
+		kmem_free(xfs_acl);
+	} else {
+		/*
+		 * A NULL ACL argument means we want to remove the ACL.
+		 */
+		error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+
+		/*
+		 * If the attribute didn't exist to start with that's fine.
+		 */
+		if (error == -ENOATTR)
+			error = 0;
+	}
+
+	if (!error)
+		set_cached_acl(inode, type, acl);
+	return error;
+}
+
+static int
+xfs_set_mode(struct inode *inode, umode_t mode)
+{
+	int error = 0;
+
+	if (mode != inode->i_mode) {
+		struct iattr iattr;
+
+		iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
+		iattr.ia_mode = mode;
+		iattr.ia_ctime = current_fs_time(inode->i_sb);
+
+		error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+	}
+
+	return error;
+}
+
+static int
+xfs_acl_exists(struct inode *inode, unsigned char *name)
+{
+	int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
+
+	return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
+			    ATTR_ROOT|ATTR_KERNOVAL) == 0);
+}
+
+int
+posix_acl_access_exists(struct inode *inode)
+{
+	return xfs_acl_exists(inode, SGI_ACL_FILE);
+}
+
+int
+posix_acl_default_exists(struct inode *inode)
+{
+	if (!S_ISDIR(inode->i_mode))
+		return 0;
+	return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
+}
+
+int
+xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int error = 0;
+
+	if (!acl)
+		goto set_acl;
+
+	error = -E2BIG;
+	if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
+		return error;
+
+	if (type == ACL_TYPE_ACCESS) {
+		umode_t mode = inode->i_mode;
+		error = posix_acl_equiv_mode(acl, &mode);
+
+		if (error <= 0) {
+			acl = NULL;
+
+			if (error < 0)
+				return error;
+		}
+
+		error = xfs_set_mode(inode, mode);
+		if (error)
+			return error;
+	}
+
+ set_acl:
+	return __xfs_set_acl(inode, type, acl);
+}
diff --git a/kernel/fs/xfs/xfs_acl.h b/kernel/fs/xfs/xfs_acl.h
new file mode 100644
index 000000000..3841b07f2
--- /dev/null
+++ b/kernel/fs/xfs/xfs_acl.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ACL_H__
+#define __XFS_ACL_H__
+
+struct inode;
+struct posix_acl;
+struct xfs_inode;
+
+#ifdef CONFIG_XFS_POSIX_ACL
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int posix_acl_access_exists(struct inode *inode);
+extern int posix_acl_default_exists(struct inode *inode);
+#else
+static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
+{
+	return NULL;
+}
+# define xfs_set_acl					NULL
+# define posix_acl_access_exists(inode)			0
+# define posix_acl_default_exists(inode)		0
+#endif /* CONFIG_XFS_POSIX_ACL */
+#endif	/* __XFS_ACL_H__ */
diff --git a/kernel/fs/xfs/xfs_aops.c b/kernel/fs/xfs/xfs_aops.c
new file mode 100644
index 000000000..a56960dd1
--- /dev/null
+++ b/kernel/fs/xfs/xfs_aops.c
@@ -0,0 +1,1931 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_iomap.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include <linux/gfp.h>
+#include <linux/mpage.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+void
+xfs_count_page_state(
+	struct page		*page,
+	int			*delalloc,
+	int			*unwritten)
+{
+	struct buffer_head	*bh, *head;
+
+	*delalloc = *unwritten = 0;
+
+	bh = head = page_buffers(page);
+	do {
+		if (buffer_unwritten(bh))
+			(*unwritten) = 1;
+		else if (buffer_delay(bh))
+			(*delalloc) = 1;
+	} while ((bh = bh->b_this_page) != head);
+}
+
+STATIC struct block_device *
+xfs_find_bdev_for_inode(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (XFS_IS_REALTIME_INODE(ip))
+		return mp->m_rtdev_targp->bt_bdev;
+	else
+		return mp->m_ddev_targp->bt_bdev;
+}
+
+/*
+ * We're now finished for good with this ioend structure.
+ * Update the page state via the associated buffer_heads,
+ * release holds on the inode and bio, and finally free
+ * up memory.  Do not use the ioend after this.
+ */
+STATIC void
+xfs_destroy_ioend(
+	xfs_ioend_t		*ioend)
+{
+	struct buffer_head	*bh, *next;
+
+	for (bh = ioend->io_buffer_head; bh; bh = next) {
+		next = bh->b_private;
+		bh->b_end_io(bh, !ioend->io_error);
+	}
+
+	mempool_free(ioend, xfs_ioend_pool);
+}
+
+/*
+ * Fast and loose check if this write could update the on-disk inode size.
+ */
+static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+{
+	return ioend->io_offset + ioend->io_size >
+		XFS_I(ioend->io_inode)->i_d.di_size;
+}
+
+STATIC int
+xfs_setfilesize_trans_alloc(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	ioend->io_append_trans = tp;
+
+	/*
+	 * We may pass freeze protection with a transaction.  So tell lockdep
+	 * we released it.
+	 */
+	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		      1, _THIS_IP_);
+	/*
+	 * We hand off the transaction to the completion thread now, so
+	 * clear the flag here.
+	 */
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	return 0;
+}
+
+/*
+ * Update on-disk file size now that data has been written to disk.
+ */
+STATIC int
+xfs_setfilesize(
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp,
+	xfs_off_t		offset,
+	size_t			size)
+{
+	xfs_fsize_t		isize;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	isize = xfs_new_eof(ip, offset + size);
+	if (!isize) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(tp, 0);
+		return 0;
+	}
+
+	trace_xfs_setfilesize(ip, offset, size);
+
+	ip->i_d.di_size = isize;
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	return xfs_trans_commit(tp, 0);
+}
+
+STATIC int
+xfs_setfilesize_ioend(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_trans	*tp = ioend->io_append_trans;
+
+	/*
+	 * The transaction may have been allocated in the I/O submission thread,
+	 * thus we need to mark ourselves as being in a transaction manually.
+	 * Similarly for freeze protection.
+	 */
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			   0, 1, _THIS_IP_);
+
+	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+}
+
+/*
+ * Schedule IO completion handling on the final put of an ioend.
+ *
+ * If there is no work to do we might as well call it a day and free the
+ * ioend right now.
+ */
+STATIC void
+xfs_finish_ioend(
+	struct xfs_ioend	*ioend)
+{
+	if (atomic_dec_and_test(&ioend->io_remaining)) {
+		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+
+		if (ioend->io_type == XFS_IO_UNWRITTEN)
+			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+		else if (ioend->io_append_trans)
+			queue_work(mp->m_data_workqueue, &ioend->io_work);
+		else
+			xfs_destroy_ioend(ioend);
+	}
+}
+
+/*
+ * IO write completion.
+ */
+STATIC void
+xfs_end_io(
+	struct work_struct *work)
+{
+	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	int		error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		ioend->io_error = -EIO;
+		goto done;
+	}
+	if (ioend->io_error)
+		goto done;
+
+	/*
+	 * For unwritten extents we need to issue transactions to convert a
+	 * range to normal written extens after the data I/O has finished.
+	 */
+	if (ioend->io_type == XFS_IO_UNWRITTEN) {
+		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+						  ioend->io_size);
+	} else if (ioend->io_append_trans) {
+		error = xfs_setfilesize_ioend(ioend);
+	} else {
+		ASSERT(!xfs_ioend_is_append(ioend));
+	}
+
+done:
+	if (error)
+		ioend->io_error = error;
+	xfs_destroy_ioend(ioend);
+}
+
+/*
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
+ */
+STATIC xfs_ioend_t *
+xfs_alloc_ioend(
+	struct inode		*inode,
+	unsigned int		type)
+{
+	xfs_ioend_t		*ioend;
+
+	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
+
+	/*
+	 * Set the count to 1 initially, which will prevent an I/O
+	 * completion callback from happening before we have started
+	 * all the I/O from calling the completion routine too early.
+	 */
+	atomic_set(&ioend->io_remaining, 1);
+	ioend->io_error = 0;
+	ioend->io_list = NULL;
+	ioend->io_type = type;
+	ioend->io_inode = inode;
+	ioend->io_buffer_head = NULL;
+	ioend->io_buffer_tail = NULL;
+	ioend->io_offset = 0;
+	ioend->io_size = 0;
+	ioend->io_append_trans = NULL;
+
+	INIT_WORK(&ioend->io_work, xfs_end_io);
+	return ioend;
+}
+
+STATIC int
+xfs_map_blocks(
+	struct inode		*inode,
+	loff_t			offset,
+	struct xfs_bmbt_irec	*imap,
+	int			type,
+	int			nonblocking)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			count = 1 << inode->i_blkbits;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			bmapi_flags = XFS_BMAPI_ENTIRE;
+	int			nimaps = 1;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (type == XFS_IO_UNWRITTEN)
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		if (nonblocking)
+			return -EAGAIN;
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+	}
+
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       (ip->i_df.if_flags & XFS_IFEXTENTS));
+	ASSERT(offset <= mp->m_super->s_maxbytes);
+
+	if (offset + count > mp->m_super->s_maxbytes)
+		count = mp->m_super->s_maxbytes - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				imap, &nimaps, bmapi_flags);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (error)
+		return error;
+
+	if (type == XFS_IO_DELALLOC &&
+	    (!nimaps || isnullstartblock(imap->br_startblock))) {
+		error = xfs_iomap_write_allocate(ip, offset, imap);
+		if (!error)
+			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+		return error;
+	}
+
+#ifdef DEBUG
+	if (type == XFS_IO_UNWRITTEN) {
+		ASSERT(nimaps);
+		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+	}
+#endif
+	if (nimaps)
+		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+	return 0;
+}
+
+STATIC int
+xfs_imap_valid(
+	struct inode		*inode,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	offset >>= inode->i_blkbits;
+
+	return offset >= imap->br_startoff &&
+		offset < imap->br_startoff + imap->br_blockcount;
+}
+
+/*
+ * BIO completion handler for buffered IO.
+ */
+STATIC void
+xfs_end_bio(
+	struct bio		*bio,
+	int			error)
+{
+	xfs_ioend_t		*ioend = bio->bi_private;
+
+	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
+	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
+
+	/* Toss bio and pass work off to an xfsdatad thread */
+	bio->bi_private = NULL;
+	bio->bi_end_io = NULL;
+	bio_put(bio);
+
+	xfs_finish_ioend(ioend);
+}
+
+STATIC void
+xfs_submit_ioend_bio(
+	struct writeback_control *wbc,
+	xfs_ioend_t		*ioend,
+	struct bio		*bio)
+{
+	atomic_inc(&ioend->io_remaining);
+	bio->bi_private = ioend;
+	bio->bi_end_io = xfs_end_bio;
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
+}
+
+STATIC struct bio *
+xfs_alloc_ioend_bio(
+	struct buffer_head	*bh)
+{
+	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
+	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);
+
+	ASSERT(bio->bi_private == NULL);
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	return bio;
+}
+
+STATIC void
+xfs_start_buffer_writeback(
+	struct buffer_head	*bh)
+{
+	ASSERT(buffer_mapped(bh));
+	ASSERT(buffer_locked(bh));
+	ASSERT(!buffer_delay(bh));
+	ASSERT(!buffer_unwritten(bh));
+
+	mark_buffer_async_write(bh);
+	set_buffer_uptodate(bh);
+	clear_buffer_dirty(bh);
+}
+
+STATIC void
+xfs_start_page_writeback(
+	struct page		*page,
+	int			clear_dirty,
+	int			buffers)
+{
+	ASSERT(PageLocked(page));
+	ASSERT(!PageWriteback(page));
+
+	/*
+	 * if the page was not fully cleaned, we need to ensure that the higher
+	 * layers come back to it correctly. That means we need to keep the page
+	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
+	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
+	 * write this page in this writeback sweep will be made.
+	 */
+	if (clear_dirty) {
+		clear_page_dirty_for_io(page);
+		set_page_writeback(page);
+	} else
+		set_page_writeback_keepwrite(page);
+
+	unlock_page(page);
+
+	/* If no buffers on the page are to be written, finish it here */
+	if (!buffers)
+		end_page_writeback(page);
+}
+
+static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+{
+	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+}
+
+/*
+ * Submit all of the bios for all of the ioends we have saved up, covering the
+ * initial writepage page and also any probed pages.
+ *
+ * Because we may have multiple ioends spanning a page, we need to start
+ * writeback on all the buffers before we submit them for I/O. If we mark the
+ * buffers as we got, then we can end up with a page that only has buffers
+ * marked async write and I/O complete on can occur before we mark the other
+ * buffers async write.
+ *
+ * The end result of this is that we trip a bug in end_page_writeback() because
+ * we call it twice for the one page as the code in end_buffer_async_write()
+ * assumes that all buffers on the page are started at the same time.
+ *
+ * The fix is two passes across the ioend list - one to start writeback on the
+ * buffer_heads, and then submit them for I/O on the second pass.
+ *
+ * If @fail is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked paged for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
+ */
+STATIC void
+xfs_submit_ioend(
+	struct writeback_control *wbc,
+	xfs_ioend_t		*ioend,
+	int			fail)
+{
+	xfs_ioend_t		*head = ioend;
+	xfs_ioend_t		*next;
+	struct buffer_head	*bh;
+	struct bio		*bio;
+	sector_t		lastblock = 0;
+
+	/* Pass 1 - start writeback */
+	do {
+		next = ioend->io_list;
+		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
+			xfs_start_buffer_writeback(bh);
+	} while ((ioend = next) != NULL);
+
+	/* Pass 2 - submit I/O */
+	ioend = head;
+	do {
+		next = ioend->io_list;
+		bio = NULL;
+
+		/*
+		 * If we are failing the IO now, just mark the ioend with an
+		 * error and finish it. This will run IO completion immediately
+		 * as there is only one reference to the ioend at this point in
+		 * time.
+		 */
+		if (fail) {
+			ioend->io_error = fail;
+			xfs_finish_ioend(ioend);
+			continue;
+		}
+
+		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+
+			if (!bio) {
+ retry:
+				bio = xfs_alloc_ioend_bio(bh);
+			} else if (bh->b_blocknr != lastblock + 1) {
+				xfs_submit_ioend_bio(wbc, ioend, bio);
+				goto retry;
+			}
+
+			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
+				xfs_submit_ioend_bio(wbc, ioend, bio);
+				goto retry;
+			}
+
+			lastblock = bh->b_blocknr;
+		}
+		if (bio)
+			xfs_submit_ioend_bio(wbc, ioend, bio);
+		xfs_finish_ioend(ioend);
+	} while ((ioend = next) != NULL);
+}
+
+/*
+ * Cancel submission of all buffer_heads so far in this endio.
+ * Toss the endio too.  Only ever called for the initial page
+ * in a writepage request, so only ever one page.
+ */
+STATIC void
+xfs_cancel_ioend(
+	xfs_ioend_t		*ioend)
+{
+	xfs_ioend_t		*next;
+	struct buffer_head	*bh, *next_bh;
+
+	do {
+		next = ioend->io_list;
+		bh = ioend->io_buffer_head;
+		do {
+			next_bh = bh->b_private;
+			clear_buffer_async_write(bh);
+			/*
+			 * The unwritten flag is cleared when added to the
+			 * ioend. We're not submitting for I/O so mark the
+			 * buffer unwritten again for next time around.
+			 */
+			if (ioend->io_type == XFS_IO_UNWRITTEN)
+				set_buffer_unwritten(bh);
+			unlock_buffer(bh);
+		} while ((bh = next_bh) != NULL);
+
+		mempool_free(ioend, xfs_ioend_pool);
+	} while ((ioend = next) != NULL);
+}
+
+/*
+ * Test to see if we've been building up a completion structure for
+ * earlier buffers -- if so, we try to append to this ioend if we
+ * can, otherwise we finish off any current ioend and start another.
+ * Return true if we've finished the given ioend.
+ */
+STATIC void
+xfs_add_to_ioend(
+	struct inode		*inode,
+	struct buffer_head	*bh,
+	xfs_off_t		offset,
+	unsigned int		type,
+	xfs_ioend_t		**result,
+	int			need_ioend)
+{
+	xfs_ioend_t		*ioend = *result;
+
+	if (!ioend || need_ioend || type != ioend->io_type) {
+		xfs_ioend_t	*previous = *result;
+
+		ioend = xfs_alloc_ioend(inode, type);
+		ioend->io_offset = offset;
+		ioend->io_buffer_head = bh;
+		ioend->io_buffer_tail = bh;
+		if (previous)
+			previous->io_list = ioend;
+		*result = ioend;
+	} else {
+		ioend->io_buffer_tail->b_private = bh;
+		ioend->io_buffer_tail = bh;
+	}
+
+	bh->b_private = NULL;
+	ioend->io_size += bh->b_size;
+}
+
+STATIC void
+xfs_map_buffer(
+	struct inode		*inode,
+	struct buffer_head	*bh,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	sector_t		bn;
+	struct xfs_mount	*m = XFS_I(inode)->i_mount;
+	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
+	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
+
+	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
+	      ((offset - iomap_offset) >> inode->i_blkbits);
+
+	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
+
+	bh->b_blocknr = bn;
+	set_buffer_mapped(bh);
+}
+
+STATIC void
+xfs_map_at_offset(
+	struct inode		*inode,
+	struct buffer_head	*bh,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+	xfs_map_buffer(inode, bh, imap, offset);
+	set_buffer_mapped(bh);
+	clear_buffer_delay(bh);
+	clear_buffer_unwritten(bh);
+}
+
+/*
+ * Test if a given page contains at least one buffer of a given @type.
+ * If @check_all_buffers is true, then we walk all the buffers in the page to
+ * try to find one of the type passed in. If it is not set, then the caller only
+ * needs to check the first buffer on the page for a match.
+ */
+STATIC bool
+xfs_check_page_type(
+	struct page		*page,
+	unsigned int		type,
+	bool			check_all_buffers)
+{
+	struct buffer_head	*bh;
+	struct buffer_head	*head;
+
+	if (PageWriteback(page))
+		return false;
+	if (!page->mapping)
+		return false;
+	if (!page_has_buffers(page))
+		return false;
+
+	bh = head = page_buffers(page);
+	do {
+		if (buffer_unwritten(bh)) {
+			if (type == XFS_IO_UNWRITTEN)
+				return true;
+		} else if (buffer_delay(bh)) {
+			if (type == XFS_IO_DELALLOC)
+				return true;
+		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
+			if (type == XFS_IO_OVERWRITE)
+				return true;
+		}
+
+		/* If we are only checking the first buffer, we are done now. */
+		if (!check_all_buffers)
+			break;
+	} while ((bh = bh->b_this_page) != head);
+
+	return false;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map. Write it out.
+ * except for the original page of a writepage, this is called on
+ * delalloc/unwritten pages only, for the original page it is possible
+ * that the page has no mapping at all.
+ */
+STATIC int
+xfs_convert_page(
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			tindex,
+	struct xfs_bmbt_irec	*imap,
+	xfs_ioend_t		**ioendp,
+	struct writeback_control *wbc)
+{
+	struct buffer_head	*bh, *head;
+	xfs_off_t		end_offset;
+	unsigned long		p_offset;
+	unsigned int		type;
+	int			len, page_dirty;
+	int			count = 0, done = 0, uptodate = 1;
+ 	xfs_off_t		offset = page_offset(page);
+
+	if (page->index != tindex)
+		goto fail;
+	if (!trylock_page(page))
+		goto fail;
+	if (PageWriteback(page))
+		goto fail_unlock_page;
+	if (page->mapping != inode->i_mapping)
+		goto fail_unlock_page;
+	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
+		goto fail_unlock_page;
+
+	/*
+	 * page_dirty is initially a count of buffers on the page before
+	 * EOF and is decremented as we move each into a cleanable state.
+	 *
+	 * Derivation:
+	 *
+	 * End offset is the highest offset that this page should represent.
+	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
+	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
+	 * hence give us the correct page_dirty count. On any other page,
+	 * it will be zero and in that case we need page_dirty to be the
+	 * count of buffers on the page.
+	 */
+	end_offset = min_t(unsigned long long,
+			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+			i_size_read(inode));
+
+	/*
+	 * If the current map does not span the entire page we are about to try
+	 * to write, then give up. The only way we can write a page that spans
+	 * multiple mappings in a single writeback iteration is via the
+	 * xfs_vm_writepage() function. Data integrity writeback requires the
+	 * entire page to be written in a single attempt, otherwise the part of
+	 * the page we don't write here doesn't get written as part of the data
+	 * integrity sync.
+	 *
+	 * For normal writeback, we also don't attempt to write partial pages
+	 * here as it simply means that write_cache_pages() will see it under
+	 * writeback and ignore the page until some point in the future, at
+	 * which time this will be the only page in the file that needs
+	 * writeback.  Hence for more optimal IO patterns, we should always
+	 * avoid partial page writeback due to multiple mappings on a page here.
+	 */
+	if (!xfs_imap_valid(inode, imap, end_offset))
+		goto fail_unlock_page;
+
+	len = 1 << inode->i_blkbits;
+	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
+					PAGE_CACHE_SIZE);
+	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
+	page_dirty = p_offset / len;
+
+	/*
+	 * The moment we find a buffer that doesn't match our current type
+	 * specification or can't be written, abort the loop and start
+	 * writeback. As per the above xfs_imap_valid() check, only
+	 * xfs_vm_writepage() can handle partial page writeback fully - we are
+	 * limited here to the buffers that are contiguous with the current
+	 * ioend, and hence a buffer we can't write breaks that contiguity and
+	 * we have to defer the rest of the IO to xfs_vm_writepage().
+	 */
+	bh = head = page_buffers(page);
+	do {
+		if (offset >= end_offset)
+			break;
+		if (!buffer_uptodate(bh))
+			uptodate = 0;
+		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
+			done = 1;
+			break;
+		}
+
+		if (buffer_unwritten(bh) || buffer_delay(bh) ||
+		    buffer_mapped(bh)) {
+			if (buffer_unwritten(bh))
+				type = XFS_IO_UNWRITTEN;
+			else if (buffer_delay(bh))
+				type = XFS_IO_DELALLOC;
+			else
+				type = XFS_IO_OVERWRITE;
+
+			/*
+			 * imap should always be valid because of the above
+			 * partial page end_offset check on the imap.
+			 */
+			ASSERT(xfs_imap_valid(inode, imap, offset));
+
+			lock_buffer(bh);
+			if (type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, type,
+					 ioendp, done);
+
+			page_dirty--;
+			count++;
+		} else {
+			done = 1;
+			break;
+		}
+	} while (offset += len, (bh = bh->b_this_page) != head);
+
+	if (uptodate && bh == head)
+		SetPageUptodate(page);
+
+	if (count) {
+		if (--wbc->nr_to_write <= 0 &&
+		    wbc->sync_mode == WB_SYNC_NONE)
+			done = 1;
+	}
+	xfs_start_page_writeback(page, !page_dirty, count);
+
+	return done;
+ fail_unlock_page:
+	unlock_page(page);
+ fail:
+	return 1;
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by mp and following the start page.
+ */
+STATIC void
+xfs_cluster_write(
+	struct inode		*inode,
+	pgoff_t			tindex,
+	struct xfs_bmbt_irec	*imap,
+	xfs_ioend_t		**ioendp,
+	struct writeback_control *wbc,
+	pgoff_t			tlast)
+{
+	struct pagevec		pvec;
+	int			done = 0, i;
+
+	pagevec_init(&pvec, 0);
+	while (!done && tindex <= tlast) {
+		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
+
+		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
+			break;
+
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
+					imap, ioendp, wbc);
+			if (done)
+				break;
+		}
+
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+STATIC void
+xfs_vm_invalidatepage(
+	struct page		*page,
+	unsigned int		offset,
+	unsigned int		length)
+{
+	trace_xfs_invalidatepage(page->mapping->host, page, offset,
+				 length);
+	block_invalidatepage(page, offset, length);
+}
+
+/*
+ * If the page has delalloc buffers on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
+ * is done on that same region - the delalloc extent is returned when none is
+ * supposed to be there.
+ *
+ * We prevent this by truncating away the delalloc regions on the page before
+ * invalidating it. Because they are delalloc, we can do this without needing a
+ * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
+ * truncation without a transaction as there is no space left for block
+ * reservation (typically why we see a ENOSPC in writeback).
+ *
+ * This is not a performance critical path, so for now just do the punching a
+ * buffer head at a time.
+ */
+STATIC void
+xfs_aops_discard_page(
+	struct page		*page)
+{
+	struct inode		*inode = page->mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct buffer_head	*bh, *head;
+	loff_t			offset = page_offset(page);
+
+	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
+		goto out_invalidate;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_invalidate;
+
+	xfs_alert(ip->i_mount,
+		"page discard on page %p, inode 0x%llx, offset %llu.",
+			page, ip->i_ino, offset);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	bh = head = page_buffers(page);
+	do {
+		int		error;
+		xfs_fileoff_t	start_fsb;
+
+		if (!buffer_delay(bh))
+			goto next_buffer;
+
+		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
+		if (error) {
+			/* something screwed, just bail */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_alert(ip->i_mount,
+			"page discard unable to remove delalloc mapping.");
+			}
+			break;
+		}
+next_buffer:
+		offset += 1 << inode->i_blkbits;
+
+	} while ((bh = bh->b_this_page) != head);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_invalidate:
+	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+	return;
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
+ */
+STATIC int
+xfs_vm_writepage(
+	struct page		*page,
+	struct writeback_control *wbc)
+{
+	struct inode		*inode = page->mapping->host;
+	struct buffer_head	*bh, *head;
+	struct xfs_bmbt_irec	imap;
+	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
+	loff_t			offset;
+	unsigned int		type;
+	__uint64_t              end_offset;
+	pgoff_t                 end_index, last_index;
+	ssize_t			len;
+	int			err, imap_valid = 0, uptodate = 1;
+	int			count = 0;
+	int			nonblocking = 0;
+
+	trace_xfs_writepage(inode, page, 0, 0);
+
+	ASSERT(page_has_buffers(page));
+
+	/*
+	 * Refuse to write the page out if we are called from reclaim context.
+	 *
+	 * This avoids stack overflows when called from deeply used stacks in
+	 * random callers for direct reclaim or memcg reclaim.  We explicitly
+	 * allow reclaim from kswapd as the stack usage there is relatively low.
+	 *
+	 * This should never happen except in the case of a VM regression so
+	 * warn about it.
+	 */
+	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+			PF_MEMALLOC))
+		goto redirty;
+
+	/*
+	 * Given that we do not allow direct reclaim to call us, we should
+	 * never be called while in a filesystem transaction.
+	 */
+	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+		goto redirty;
+
+	/* Is this page beyond the end of the file? */
+	offset = i_size_read(inode);
+	end_index = offset >> PAGE_CACHE_SHIFT;
+	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
+
+	/*
+	 * The page index is less than the end_index, adjust the end_offset
+	 * to the highest offset that this page should represent.
+	 * -----------------------------------------------------
+	 * |			file mapping	       | <EOF> |
+	 * -----------------------------------------------------
+	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
+	 * ^--------------------------------^----------|--------
+	 * |     desired writeback range    |      see else    |
+	 * ---------------------------------^------------------|
+	 */
+	if (page->index < end_index)
+		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+	else {
+		/*
+		 * Check whether the page to write out is beyond or straddles
+		 * i_size or not.
+		 * -------------------------------------------------------
+		 * |		file mapping		        | <EOF>  |
+		 * -------------------------------------------------------
+		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
+		 * ^--------------------------------^-----------|---------
+		 * |				    |      Straddles     |
+		 * ---------------------------------^-----------|--------|
+		 */
+		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+
+		/*
+		 * Skip the page if it is fully outside i_size, e.g. due to a
+		 * truncate operation that is in progress. We must redirty the
+		 * page so that reclaim stops reclaiming it. Otherwise
+		 * xfs_vm_releasepage() is called on it and gets confused.
+		 *
+		 * Note that the end_index is unsigned long, it would overflow
+		 * if the given offset is greater than 16TB on 32-bit system
+		 * and if we do check the page is fully outside i_size or not
+		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
+		 * will be evaluated to 0.  Hence this page will be redirtied
+		 * and be written out repeatedly which would result in an
+		 * infinite loop, the user program that perform this operation
+		 * will hang.  Instead, we can verify this situation by checking
+		 * if the page to write is totally beyond the i_size or if it's
+		 * offset is just equal to the EOF.
+		 */
+		if (page->index > end_index ||
+		    (page->index == end_index && offset_into_page == 0))
+			goto redirty;
+
+		/*
+		 * The page straddles i_size.  It must be zeroed out on each
+		 * and every writepage invocation because it may be mmapped.
+		 * "A file is mapped in multiples of the page size.  For a file
+		 * that is not a multiple of the page size, the remaining
+		 * memory is zeroed when mapped, and writes to that region are
+		 * not written out to the file."
+		 */
+		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+
+		/* Adjust the end_offset to the end of file */
+		end_offset = offset;
+	}
+
+	len = 1 << inode->i_blkbits;
+
+	bh = head = page_buffers(page);
+	offset = page_offset(page);
+	type = XFS_IO_OVERWRITE;
+
+	if (wbc->sync_mode == WB_SYNC_NONE)
+		nonblocking = 1;
+
+	do {
+		int new_ioend = 0;
+
+		if (offset >= end_offset)
+			break;
+		if (!buffer_uptodate(bh))
+			uptodate = 0;
+
+		/*
+		 * set_page_dirty dirties all buffers in a page, independent
+		 * of their state.  The dirty state however is entirely
+		 * meaningless for holes (!mapped && uptodate), so skip
+		 * buffers covering holes here.
+		 */
+		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+			imap_valid = 0;
+			continue;
+		}
+
+		if (buffer_unwritten(bh)) {
+			if (type != XFS_IO_UNWRITTEN) {
+				type = XFS_IO_UNWRITTEN;
+				imap_valid = 0;
+			}
+		} else if (buffer_delay(bh)) {
+			if (type != XFS_IO_DELALLOC) {
+				type = XFS_IO_DELALLOC;
+				imap_valid = 0;
+			}
+		} else if (buffer_uptodate(bh)) {
+			if (type != XFS_IO_OVERWRITE) {
+				type = XFS_IO_OVERWRITE;
+				imap_valid = 0;
+			}
+		} else {
+			if (PageUptodate(page))
+				ASSERT(buffer_mapped(bh));
+			/*
+			 * This buffer is not uptodate and will not be
+			 * written to disk.  Ensure that we will put any
+			 * subsequent writeable buffers into a new
+			 * ioend.
+			 */
+			imap_valid = 0;
+			continue;
+		}
+
+		if (imap_valid)
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		if (!imap_valid) {
+			/*
+			 * If we didn't have a valid mapping then we need to
+			 * put the new mapping into a separate ioend structure.
+			 * This ensures non-contiguous extents always have
+			 * separate ioends, which is particularly important
+			 * for unwritten extent conversion at I/O completion
+			 * time.
+			 */
+			new_ioend = 1;
+			err = xfs_map_blocks(inode, offset, &imap, type,
+					     nonblocking);
+			if (err)
+				goto error;
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		}
+		if (imap_valid) {
+			lock_buffer(bh);
+			if (type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+					 new_ioend);
+			count++;
+		}
+
+		if (!iohead)
+			iohead = ioend;
+
+	} while (offset += len, ((bh = bh->b_this_page) != head));
+
+	if (uptodate && bh == head)
+		SetPageUptodate(page);
+
+	xfs_start_page_writeback(page, 1, count);
+
+	/* if there is no IO to be submitted for this page, we are done */
+	if (!ioend)
+		return 0;
+
+	ASSERT(iohead);
+
+	/*
+	 * Any errors from this point onwards need tobe reported through the IO
+	 * completion path as we have marked the initial page as under writeback
+	 * and unlocked it.
+	 */
+	if (imap_valid) {
+		xfs_off_t		end_index;
+
+		end_index = imap.br_startoff + imap.br_blockcount;
+
+		/* to bytes */
+		end_index <<= inode->i_blkbits;
+
+		/* to pages */
+		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
+
+		/* check against file size */
+		if (end_index > last_index)
+			end_index = last_index;
+
+		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
+				  wbc, end_index);
+	}
+
+
+	/*
+	 * Reserve log space if we might write beyond the on-disk inode size.
+	 */
+	err = 0;
+	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		err = xfs_setfilesize_trans_alloc(ioend);
+
+	xfs_submit_ioend(wbc, iohead, err);
+
+	return 0;
+
+error:
+	if (iohead)
+		xfs_cancel_ioend(iohead);
+
+	if (err == -EAGAIN)
+		goto redirty;
+
+	xfs_aops_discard_page(page);
+	ClearPageUptodate(page);
+	unlock_page(page);
+	return err;
+
+redirty:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return 0;
+}
+
+STATIC int
+xfs_vm_writepages(
+	struct address_space	*mapping,
+	struct writeback_control *wbc)
+{
+	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	return generic_writepages(mapping, wbc);
+}
+
+/*
+ * Called to move a page into cleanable state - and from there
+ * to be released. The page should already be clean. We always
+ * have buffer heads in this call.
+ *
+ * Returns 1 if the page is ok to release, 0 otherwise.
+ */
+STATIC int
+xfs_vm_releasepage(
+	struct page		*page,
+	gfp_t			gfp_mask)
+{
+	int			delalloc, unwritten;
+
+	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+
+	xfs_count_page_state(page, &delalloc, &unwritten);
+
+	if (WARN_ON_ONCE(delalloc))
+		return 0;
+	if (WARN_ON_ONCE(unwritten))
+		return 0;
+
+	return try_to_free_buffers(page);
+}
+
+/*
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types. But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) i) the ioend spans the entire region of unwritten mappings; or
+ *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. hence we don't need
+ * reference counting here.
+ */
+static void
+xfs_map_direct(
+	struct inode		*inode,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	struct xfs_ioend	*ioend;
+	xfs_off_t		size = bh_result->b_size;
+	int			type;
+
+	if (ISUNWRITTEN(imap))
+		type = XFS_IO_UNWRITTEN;
+	else
+		type = XFS_IO_OVERWRITE;
+
+	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+	if (bh_result->b_private) {
+		ioend = bh_result->b_private;
+		ASSERT(ioend->io_size > 0);
+		ASSERT(offset >= ioend->io_offset);
+		if (offset + size > ioend->io_offset + ioend->io_size)
+			ioend->io_size = offset - ioend->io_offset + size;
+
+		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+			ioend->io_type = XFS_IO_UNWRITTEN;
+
+		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+					      ioend->io_size, ioend->io_type,
+					      imap);
+	} else if (type == XFS_IO_UNWRITTEN ||
+		   offset + size > i_size_read(inode)) {
+		ioend = xfs_alloc_ioend(inode, type);
+		ioend->io_offset = offset;
+		ioend->io_size = size;
+
+		bh_result->b_private = ioend;
+		set_buffer_defer_completion(bh_result);
+
+		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+					   imap);
+	} else {
+		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+					    imap);
+	}
+}
+
+/*
+ * If this is O_DIRECT or the mpage code calling tell them how large the mapping
+ * is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the mapping
+ * for blocks beyond EOF must be marked new so that sub block regions can be
+ * correctly zeroed. We can't do this for mappings within EOF unless the mapping
+ * was just allocated or is unwritten, otherwise the callers would overwrite
+ * existing data with zeros. Hence we have to split the mapping into a range up
+ * to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset,
+	ssize_t			size)
+{
+	xfs_off_t		mapping_size;
+
+	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+	mapping_size <<= inode->i_blkbits;
+
+	ASSERT(mapping_size > 0);
+	if (mapping_size > size)
+		mapping_size = size;
+	if (offset < i_size_read(inode) &&
+	    offset + mapping_size >= i_size_read(inode)) {
+		/* limit mapping to block that spans EOF */
+		mapping_size = roundup_64(i_size_read(inode) - offset,
+					  1 << inode->i_blkbits);
+	}
+	if (mapping_size > LONG_MAX)
+		mapping_size = LONG_MAX;
+
+	bh_result->b_size = mapping_size;
+}
+
+STATIC int
+__xfs_get_blocks(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create,
+	int			direct)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			lockmode = 0;
+	struct xfs_bmbt_irec	imap;
+	int			nimaps = 1;
+	xfs_off_t		offset;
+	ssize_t			size;
+	int			new = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	offset = (xfs_off_t)iblock << inode->i_blkbits;
+	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
+	size = bh_result->b_size;
+
+	if (!create && direct && offset >= i_size_read(inode))
+		return 0;
+
+	/*
+	 * Direct I/O is usually done on preallocated files, so try getting
+	 * a block mapping without an exclusive lock first.  For buffered
+	 * writes we already have the exclusive iolock anyway, so avoiding
+	 * a lock roundtrip here by taking the ilock exclusive from the
+	 * beginning is a useful micro optimization.
+	 */
+	if (create && !direct) {
+		lockmode = XFS_ILOCK_EXCL;
+		xfs_ilock(ip, lockmode);
+	} else {
+		lockmode = xfs_ilock_data_map_shared(ip);
+	}
+
+	ASSERT(offset <= mp->m_super->s_maxbytes);
+	if (offset + size > mp->m_super->s_maxbytes)
+		size = mp->m_super->s_maxbytes - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				&imap, &nimaps, XFS_BMAPI_ENTIRE);
+	if (error)
+		goto out_unlock;
+
+	if (create &&
+	    (!nimaps ||
+	     (imap.br_startblock == HOLESTARTBLOCK ||
+	      imap.br_startblock == DELAYSTARTBLOCK))) {
+		if (direct || xfs_get_extsz_hint(ip)) {
+			/*
+			 * Drop the ilock in preparation for starting the block
+			 * allocation transaction.  It will be retaken
+			 * exclusively inside xfs_iomap_write_direct for the
+			 * actual allocation.
+			 */
+			xfs_iunlock(ip, lockmode);
+			error = xfs_iomap_write_direct(ip, offset, size,
+						       &imap, nimaps);
+			if (error)
+				return error;
+			new = 1;
+		} else {
+			/*
+			 * Delalloc reservations do not require a transaction,
+			 * we can go on without dropping the lock here. If we
+			 * are allocating a new delalloc block, make sure that
+			 * we set the new flag so that we mark the buffer new so
+			 * that we know that it is newly allocated if the write
+			 * fails.
+			 */
+			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+				new = 1;
+			error = xfs_iomap_write_delay(ip, offset, size, &imap);
+			if (error)
+				goto out_unlock;
+
+			xfs_iunlock(ip, lockmode);
+		}
+		trace_xfs_get_blocks_alloc(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_DELALLOC, &imap);
+	} else if (nimaps) {
+		trace_xfs_get_blocks_found(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_OVERWRITE, &imap);
+		xfs_iunlock(ip, lockmode);
+	} else {
+		trace_xfs_get_blocks_notfound(ip, offset, size);
+		goto out_unlock;
+	}
+
+	/* trim mapping down to size requested */
+	if (direct || size > (1 << inode->i_blkbits))
+		xfs_map_trim_size(inode, iblock, bh_result,
+				  &imap, offset, size);
+
+	/*
+	 * For unwritten extents do not report a disk address in the buffered
+	 * read case (treat as if we're reading into a hole).
+	 */
+	if (imap.br_startblock != HOLESTARTBLOCK &&
+	    imap.br_startblock != DELAYSTARTBLOCK &&
+	    (create || !ISUNWRITTEN(&imap))) {
+		xfs_map_buffer(inode, bh_result, &imap, offset);
+		if (ISUNWRITTEN(&imap))
+			set_buffer_unwritten(bh_result);
+		/* direct IO needs special help */
+		if (create && direct)
+			xfs_map_direct(inode, bh_result, &imap, offset);
+	}
+
+	/*
+	 * If this is a realtime file, data may be on a different device.
+	 * to that pointed to from the buffer_head b_bdev currently.
+	 */
+	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
+
+	/*
+	 * If we previously allocated a block out beyond eof and we are now
+	 * coming back to use it then we will need to flag it as new even if it
+	 * has a disk address.
+	 *
+	 * With sub-block writes into unwritten extents we also need to mark
+	 * the buffer as new so that the unwritten parts of the buffer gets
+	 * correctly zeroed.
+	 */
+	if (create &&
+	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+	     (offset >= i_size_read(inode)) ||
+	     (new || ISUNWRITTEN(&imap))))
+		set_buffer_new(bh_result);
+
+	if (imap.br_startblock == DELAYSTARTBLOCK) {
+		BUG_ON(direct);
+		if (create) {
+			set_buffer_uptodate(bh_result);
+			set_buffer_mapped(bh_result);
+			set_buffer_delay(bh_result);
+		}
+	}
+
+	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, lockmode);
+	return error;
+}
+
+int
+xfs_get_blocks(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+}
+
+STATIC int
+xfs_get_blocks_direct(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+}
+
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ioend	*ioend = private;
+
+	trace_xfs_gbmap_direct_endio(ip, offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto out_end_io;
+
+	/*
+	 * dio completion end_io functions are only called on writes if more
+	 * than 0 bytes was written.
+	 */
+	ASSERT(size > 0);
+
+	/*
+	 * The ioend only maps whole blocks, while the IO may be sector aligned.
+	 * Hence the ioend offset/size may not match the IO offset/size exactly.
+	 * Because we don't map overwrites within EOF into the ioend, the offset
+	 * may not match, but only if the endio spans EOF.  Either way, write
+	 * the IO sizes into the ioend so that completion processing does the
+	 * right thing.
+	 */
+	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+	ioend->io_size = size;
+	ioend->io_offset = offset;
+
+	/*
+	 * The ioend tells us whether we are doing unwritten extent conversion
+	 * or an append transaction that updates the on-disk file size. These
+	 * cases are the only cases where we should *potentially* be needing
+	 * to update the VFS inode size.
+	 *
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (offset + size > i_size_read(inode))
+		i_size_write(inode, offset + size);
+	spin_unlock(&ip->i_flags_lock);
+
+	/*
+	 * If we are doing an append IO that needs to update the EOF on disk,
+	 * do the transaction reserve now so we can use common end io
+	 * processing. Stashing the error (if there is one) in the ioend will
+	 * result in the ioend processing passing on the error if it is
+	 * possible as we can't return it from here.
+	 */
+	if (ioend->io_type == XFS_IO_OVERWRITE)
+		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
+
+out_end_io:
+	xfs_end_io(&ioend->io_work);
+	return;
+}
+
+STATIC ssize_t
+xfs_vm_direct_IO(
+	struct kiocb		*iocb,
+	struct iov_iter		*iter,
+	loff_t			offset)
+{
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
+	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
+
+	if (iov_iter_rw(iter) == WRITE) {
+		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+					    xfs_get_blocks_direct,
+					    xfs_end_io_direct_write, NULL,
+					    DIO_ASYNC_EXTEND);
+	}
+	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+				    xfs_get_blocks_direct, NULL, NULL, 0);
+}
+
+/*
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have made it to disk yet
+ * as the page is still locked at this point.
+ */
+STATIC void
+xfs_vm_kill_delalloc_range(
+	struct inode		*inode,
+	loff_t			start,
+	loff_t			end)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error;
+
+	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
+	if (end_fsb <= start_fsb)
+		return;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+						end_fsb - start_fsb);
+	if (error) {
+		/* something screwed, just bail */
+		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+			xfs_alert(ip->i_mount,
+		"xfs_vm_write_failed: unable to clean up ino %lld",
+					ip->i_ino);
+		}
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
+STATIC void
+xfs_vm_write_failed(
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			pos,
+	unsigned		len)
+{
+	loff_t			block_offset;
+	loff_t			block_start;
+	loff_t			block_end;
+	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
+	loff_t			to = from + len;
+	struct buffer_head	*bh, *head;
+
+	/*
+	 * The request pos offset might be 32 or 64 bit, this is all fine
+	 * on 64-bit platform.  However, for 64-bit pos request on 32-bit
+	 * platform, the high 32-bit will be masked off if we evaluate the
+	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
+	 * 0xfffff000 as an unsigned long, hence the result is incorrect
+	 * which could cause the following ASSERT failed in most cases.
+	 * In order to avoid this, we can evaluate the block_offset of the
+	 * start of the page by using shifts rather than masks the mismatch
+	 * problem.
+	 */
+	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+
+	ASSERT(block_offset + from == pos);
+
+	head = page_buffers(page);
+	block_start = 0;
+	for (bh = head; bh != head || !block_start;
+	     bh = bh->b_this_page, block_start = block_end,
+				   block_offset += bh->b_size) {
+		block_end = block_start + bh->b_size;
+
+		/* skip buffers before the write */
+		if (block_end <= from)
+			continue;
+
+		/* if the buffer is after the write, we're done */
+		if (block_start >= to)
+			break;
+
+		if (!buffer_delay(bh))
+			continue;
+
+		if (!buffer_new(bh) && block_offset < i_size_read(inode))
+			continue;
+
+		xfs_vm_kill_delalloc_range(inode, block_offset,
+					   block_offset + bh->b_size);
+
+		/*
+		 * This buffer does not contain data anymore. make sure anyone
+		 * who finds it knows that for certain.
+		 */
+		clear_buffer_delay(bh);
+		clear_buffer_uptodate(bh);
+		clear_buffer_mapped(bh);
+		clear_buffer_new(bh);
+		clear_buffer_dirty(bh);
+	}
+
+}
+
+/*
+ * This used to call block_write_begin(), but it unlocks and releases the page
+ * on error, and we need that page to be able to punch stale delalloc blocks out
+ * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
+ * the appropriate point.
+ */
+STATIC int
+xfs_vm_write_begin(
+	struct file		*file,
+	struct address_space	*mapping,
+	loff_t			pos,
+	unsigned		len,
+	unsigned		flags,
+	struct page		**pagep,
+	void			**fsdata)
+{
+	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
+	struct page		*page;
+	int			status;
+
+	ASSERT(len <= PAGE_CACHE_SIZE);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	status = __block_write_begin(page, pos, len, xfs_get_blocks);
+	if (unlikely(status)) {
+		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
+
+		xfs_vm_write_failed(inode, page, pos, len);
+		unlock_page(page);
+
+		/*
+		 * If the write is beyond EOF, we only want to kill blocks
+		 * allocated in this write, not blocks that were previously
+		 * written successfully.
+		 */
+		if (pos + len > isize) {
+			ssize_t start = max_t(ssize_t, pos, isize);
+
+			truncate_pagecache_range(inode, start, pos + len);
+		}
+
+		page_cache_release(page);
+		page = NULL;
+	}
+
+	*pagep = page;
+	return status;
+}
+
+/*
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
+ */
+STATIC int
+xfs_vm_write_end(
+	struct file		*file,
+	struct address_space	*mapping,
+	loff_t			pos,
+	unsigned		len,
+	unsigned		copied,
+	struct page		*page,
+	void			*fsdata)
+{
+	int			ret;
+
+	ASSERT(len <= PAGE_CACHE_SIZE);
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (unlikely(ret < len)) {
+		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
+		loff_t		to = pos + len;
+
+		if (to > isize) {
+			/* only kill blocks in this write beyond EOF */
+			if (pos > isize)
+				isize = pos;
+			xfs_vm_kill_delalloc_range(inode, isize, to);
+			truncate_pagecache_range(inode, isize, to);
+		}
+	}
+	return ret;
+}
+
+STATIC sector_t
+xfs_vm_bmap(
+	struct address_space	*mapping,
+	sector_t		block)
+{
+	struct inode		*inode = (struct inode *)mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	trace_xfs_vm_bmap(XFS_I(inode));
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	filemap_write_and_wait(mapping);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	return generic_block_bmap(mapping, block, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpage(
+	struct file		*unused,
+	struct page		*page)
+{
+	return mpage_readpage(page, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpages(
+	struct file		*unused,
+	struct address_space	*mapping,
+	struct list_head	*pages,
+	unsigned		nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
+}
+
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+	struct page		*page)
+{
+	struct address_space	*mapping = page->mapping;
+	struct inode		*inode = mapping->host;
+	loff_t			end_offset;
+	loff_t			offset;
+	int			newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	end_offset = i_size_read(inode);
+	offset = page_offset(page);
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			if (offset < end_offset)
+				set_buffer_dirty(bh);
+			bh = bh->b_this_page;
+			offset += 1 << inode->i_blkbits;
+		} while (bh != head);
+	}
+	newly_dirty = !TestSetPageDirty(page);
+	spin_unlock(&mapping->private_lock);
+
+	if (newly_dirty) {
+		/* sigh - __set_page_dirty() is static, so copy it here, too */
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (page->mapping) {	/* Race with truncate? */
+			WARN_ON_ONCE(!PageUptodate(page));
+			account_page_dirtied(page, mapping);
+			radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	}
+	return newly_dirty;
+}
+
+const struct address_space_operations xfs_address_space_operations = {
+	.readpage		= xfs_vm_readpage,
+	.readpages		= xfs_vm_readpages,
+	.writepage		= xfs_vm_writepage,
+	.writepages		= xfs_vm_writepages,
+	.set_page_dirty		= xfs_vm_set_page_dirty,
+	.releasepage		= xfs_vm_releasepage,
+	.invalidatepage		= xfs_vm_invalidatepage,
+	.write_begin		= xfs_vm_write_begin,
+	.write_end		= xfs_vm_write_end,
+	.bmap			= xfs_vm_bmap,
+	.direct_IO		= xfs_vm_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
diff --git a/kernel/fs/xfs/xfs_aops.h b/kernel/fs/xfs/xfs_aops.h
new file mode 100644
index 000000000..ac644e013
--- /dev/null
+++ b/kernel/fs/xfs/xfs_aops.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2005-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_AOPS_H__
+#define __XFS_AOPS_H__
+
+extern mempool_t *xfs_ioend_pool;
+
+/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+	XFS_IO_DELALLOC,	/* covers delalloc region */
+	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
+	XFS_IO_OVERWRITE,	/* covers already allocated extent */
+};
+
+#define XFS_IO_TYPES \
+	{ XFS_IO_DELALLOC,		"delalloc" }, \
+	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
+	{ XFS_IO_OVERWRITE,		"overwrite" }
+
+/*
+ * xfs_ioend struct manages large extent writes for XFS.
+ * It can manage several multi-page bio's at once.
+ */
+typedef struct xfs_ioend {
+	struct xfs_ioend	*io_list;	/* next ioend in chain */
+	unsigned int		io_type;	/* delalloc / unwritten */
+	int			io_error;	/* I/O error code */
+	atomic_t		io_remaining;	/* hold count */
+	struct inode		*io_inode;	/* file being written to */
+	struct buffer_head	*io_buffer_head;/* buffer linked list head */
+	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
+	size_t			io_size;	/* size of the extent */
+	xfs_off_t		io_offset;	/* offset in the file */
+	struct work_struct	io_work;	/* xfsdatad work queue */
+	struct xfs_trans	*io_append_trans;/* xact. for size update */
+} xfs_ioend_t;
+
+extern const struct address_space_operations xfs_address_space_operations;
+extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+extern void xfs_count_page_state(struct page *, int *, int *);
+
+#endif /* __XFS_AOPS_H__ */
diff --git a/kernel/fs/xfs/xfs_attr.h b/kernel/fs/xfs/xfs_attr.h
new file mode 100644
index 000000000..dd4824589
--- /dev/null
+++ b/kernel/fs/xfs/xfs_attr.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_H__
+#define	__XFS_ATTR_H__
+
+struct xfs_inode;
+struct xfs_da_args;
+struct xfs_attr_list_context;
+
+/*
+ * Large attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.
+ * The internal links in the Btree are logical block offsets into the file.
+ *
+ * Small attribute lists use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+/*========================================================================
+ * External interfaces
+ *========================================================================*/
+
+
+#define ATTR_DONTFOLLOW	0x0001	/* -- unused, from IRIX -- */
+#define ATTR_ROOT	0x0002	/* use attrs in root (trusted) namespace */
+#define ATTR_TRUST	0x0004	/* -- unused, from IRIX -- */
+#define ATTR_SECURE	0x0008	/* use attrs in security namespace */
+#define ATTR_CREATE	0x0010	/* pure create: fail if attr already exists */
+#define ATTR_REPLACE	0x0020	/* pure set: fail if attr does not exist */
+
+#define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
+#define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
+
+#define XFS_ATTR_FLAGS \
+	{ ATTR_DONTFOLLOW, 	"DONTFOLLOW" }, \
+	{ ATTR_ROOT,		"ROOT" }, \
+	{ ATTR_TRUST,		"TRUST" }, \
+	{ ATTR_SECURE,		"SECURE" }, \
+	{ ATTR_CREATE,		"CREATE" }, \
+	{ ATTR_REPLACE,		"REPLACE" }, \
+	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
+	{ ATTR_KERNOVAL,	"KERNOVAL" }
+
+/*
+ * The maximum size (into the kernel or returned from the kernel) of an
+ * attribute value or the buffer used for an attr_list() call.  Larger
+ * sizes will result in an ERANGE return code.
+ */
+#define	ATTR_MAX_VALUELEN	(64*1024)	/* max length of a value */
+
+/*
+ * Define how lists of attribute names are returned to the user from
+ * the attr_list() call.  A large, 32bit aligned, buffer is passed in
+ * along with its size.  We put an array of offsets at the top that each
+ * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
+ */
+typedef struct attrlist {
+	__s32	al_count;	/* number of entries in attrlist */
+	__s32	al_more;	/* T/F: more attrs (do call again) */
+	__s32	al_offset[1];	/* byte offsets of attrs [var-sized] */
+} attrlist_t;
+
+/*
+ * Show the interesting info about one attribute.  This is what the
+ * al_offset[i] entry points to.
+ */
+typedef struct attrlist_ent {	/* data from attr_list() */
+	__u32	a_valuelen;	/* number bytes in value of attr */
+	char	a_name[1];	/* attr name (NULL terminated) */
+} attrlist_ent_t;
+
+/*
+ * Given a pointer to the (char*) buffer containing the attr_list() result,
+ * and an index, return a pointer to the indicated attribute in the buffer.
+ */
+#define	ATTR_ENTRY(buffer, index)		\
+	((attrlist_ent_t *)			\
+	 &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
+
+/*
+ * Kernel-internal version of the attrlist cursor.
+ */
+typedef struct attrlist_cursor_kern {
+	__u32	hashval;	/* hash value of next entry to add */
+	__u32	blkno;		/* block containing entry (suggestion) */
+	__u32	offset;		/* offset in list of equal-hashvals */
+	__u16	pad1;		/* padding to match user-level */
+	__u8	pad2;		/* padding to match user-level */
+	__u8	initted;	/* T/F: cursor has been initialized */
+} attrlist_cursor_kern_t;
+
+
+/*========================================================================
+ * Structure used to pass context around among the routines.
+ *========================================================================*/
+
+
+typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+			      unsigned char *, int, int, unsigned char *);
+
+typedef struct xfs_attr_list_context {
+	struct xfs_inode		*dp;		/* inode */
+	struct attrlist_cursor_kern	*cursor;	/* position in list */
+	char				*alist;		/* output buffer */
+	int				seen_enough;	/* T/F: seen enough of list? */
+	ssize_t				count;		/* num used entries */
+	int				dupcnt;		/* count dup hashvals seen */
+	int				bufsize;	/* total buffer size */
+	int				firstu;		/* first used byte in buffer */
+	int				flags;		/* from VOP call */
+	int				resynch;	/* T/F: resynch with cursor */
+	int				put_value;	/* T/F: need value for listent */
+	put_listent_func_t		put_listent;	/* list output fmt function */
+	int				index;		/* index into output buffer */
+} xfs_attr_list_context_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Overall external interface routines.
+ */
+int xfs_attr_inactive(struct xfs_inode *dp);
+int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_inode_hasattr(struct xfs_inode *ip);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+		 unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+		 unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
+		  int flags, struct attrlist_cursor_kern *cursor);
+
+
+#endif	/* __XFS_ATTR_H__ */
diff --git a/kernel/fs/xfs/xfs_attr_inactive.c b/kernel/fs/xfs/xfs_attr_inactive.c
new file mode 100644
index 000000000..3fbf167cf
--- /dev/null
+++ b/kernel/fs/xfs/xfs_attr_inactive.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_dir2.h"
+
+/*
+ * Look at all the extents for this logical region,
+ * invalidate any buffers that are incore/in transactions.
+ */
+STATIC int
+xfs_attr3_leaf_freextent(
+	struct xfs_trans	**trans,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		blkno,
+	int			blkcnt)
+{
+	struct xfs_bmbt_irec	map;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		tblkno;
+	xfs_daddr_t		dblkno;
+	int			tblkcnt;
+	int			dblkcnt;
+	int			nmap;
+	int			error;
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's
+	 * blocks.
+	 */
+	tblkno = blkno;
+	tblkcnt = blkcnt;
+	while (tblkcnt > 0) {
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		nmap = 1;
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+				       &map, &nmap, XFS_BMAPI_ATTRFORK);
+		if (error) {
+			return error;
+		}
+		ASSERT(nmap == 1);
+		ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+		/*
+		 * If it's a hole, these are already unmapped
+		 * so there's nothing to invalidate.
+		 */
+		if (map.br_startblock != HOLESTARTBLOCK) {
+
+			dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+						  map.br_startblock);
+			dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+						map.br_blockcount);
+			bp = xfs_trans_get_buf(*trans,
+					dp->i_mount->m_ddev_targp,
+					dblkno, dblkcnt, 0);
+			if (!bp)
+				return -ENOMEM;
+			xfs_trans_binval(*trans, bp);
+			/*
+			 * Roll to next transaction.
+			 */
+			error = xfs_trans_roll(trans, dp);
+			if (error)
+				return error;
+		}
+
+		tblkno += map.br_blockcount;
+		tblkcnt -= map.br_blockcount;
+	}
+
+	return 0;
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+STATIC int
+xfs_attr3_leaf_inactive(
+	struct xfs_trans	**trans,
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_attr_inactive_list *list;
+	struct xfs_attr_inactive_list *lp;
+	int			error;
+	int			count;
+	int			size;
+	int			tmp;
+	int			i;
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+
+	/*
+	 * Count the number of "remote" value extents.
+	 */
+	count = 0;
+	entry = xfs_attr3_leaf_entryp(leaf);
+	for (i = 0; i < ichdr.count; entry++, i++) {
+		if (be16_to_cpu(entry->nameidx) &&
+		    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+			if (name_rmt->valueblk)
+				count++;
+		}
+	}
+
+	/*
+	 * If there are no "remote" values, we're done.
+	 */
+	if (count == 0) {
+		xfs_trans_brelse(*trans, bp);
+		return 0;
+	}
+
+	/*
+	 * Allocate storage for a list of all the "remote" value extents.
+	 */
+	size = count * sizeof(xfs_attr_inactive_list_t);
+	list = kmem_alloc(size, KM_SLEEP);
+
+	/*
+	 * Identify each of the "remote" value extents.
+	 */
+	lp = list;
+	entry = xfs_attr3_leaf_entryp(leaf);
+	for (i = 0; i < ichdr.count; entry++, i++) {
+		if (be16_to_cpu(entry->nameidx) &&
+		    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+			if (name_rmt->valueblk) {
+				lp->valueblk = be32_to_cpu(name_rmt->valueblk);
+				lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
+						    be32_to_cpu(name_rmt->valuelen));
+				lp++;
+			}
+		}
+	}
+	xfs_trans_brelse(*trans, bp);	/* unlock for trans. in freextent() */
+
+	/*
+	 * Invalidate each of the "remote" value extents.
+	 */
+	error = 0;
+	for (lp = list, i = 0; i < count; i++, lp++) {
+		tmp = xfs_attr3_leaf_freextent(trans, dp,
+				lp->valueblk, lp->valuelen);
+
+		if (error == 0)
+			error = tmp;	/* save only the 1st errno */
+	}
+
+	kmem_free(list);
+	return error;
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+STATIC int
+xfs_attr3_node_inactive(
+	struct xfs_trans **trans,
+	struct xfs_inode *dp,
+	struct xfs_buf	*bp,
+	int		level)
+{
+	xfs_da_blkinfo_t *info;
+	xfs_da_intnode_t *node;
+	xfs_dablk_t child_fsb;
+	xfs_daddr_t parent_blkno, child_blkno;
+	int error, i;
+	struct xfs_buf *child_bp;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr ichdr;
+
+	/*
+	 * Since this code is recursive (gasp!) we must protect ourselves.
+	 */
+	if (level > XFS_DA_NODE_MAXDEPTH) {
+		xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
+		return -EIO;
+	}
+
+	node = bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&ichdr, node);
+	parent_blkno = bp->b_bn;
+	if (!ichdr.count) {
+		xfs_trans_brelse(*trans, bp);
+		return 0;
+	}
+	btree = dp->d_ops->node_tree_p(node);
+	child_fsb = be32_to_cpu(btree[0].before);
+	xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
+
+	/*
+	 * If this is the node level just above the leaves, simply loop
+	 * over the leaves removing all of them.  If this is higher up
+	 * in the tree, recurse downward.
+	 */
+	for (i = 0; i < ichdr.count; i++) {
+		/*
+		 * Read the subsidiary block to see what we have to work with.
+		 * Don't do this in a transaction.  This is a depth-first
+		 * traversal of the tree so we may deal with many blocks
+		 * before we come back to this one.
+		 */
+		error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
+						XFS_ATTR_FORK);
+		if (error)
+			return error;
+		if (child_bp) {
+						/* save for re-read later */
+			child_blkno = XFS_BUF_ADDR(child_bp);
+
+			/*
+			 * Invalidate the subtree, however we have to.
+			 */
+			info = child_bp->b_addr;
+			switch (info->magic) {
+			case cpu_to_be16(XFS_DA_NODE_MAGIC):
+			case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+				error = xfs_attr3_node_inactive(trans, dp,
+							child_bp, level + 1);
+				break;
+			case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+			case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+				error = xfs_attr3_leaf_inactive(trans, dp,
+							child_bp);
+				break;
+			default:
+				error = -EIO;
+				xfs_trans_brelse(*trans, child_bp);
+				break;
+			}
+			if (error)
+				return error;
+
+			/*
+			 * Remove the subsidiary block from the cache
+			 * and from the log.
+			 */
+			error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+				&child_bp, XFS_ATTR_FORK);
+			if (error)
+				return error;
+			xfs_trans_binval(*trans, child_bp);
+		}
+
+		/*
+		 * If we're not done, re-read the parent to get the next
+		 * child block number.
+		 */
+		if (i + 1 < ichdr.count) {
+			error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
+						 &bp, XFS_ATTR_FORK);
+			if (error)
+				return error;
+			child_fsb = be32_to_cpu(btree[i + 1].before);
+			xfs_trans_brelse(*trans, bp);
+		}
+		/*
+		 * Atomically commit the whole invalidate stuff.
+		 */
+		error = xfs_trans_roll(trans, dp);
+		if (error)
+			return  error;
+	}
+
+	return 0;
+}
+
+/*
+ * Indiscriminately delete the entire attribute fork
+ *
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr3_root_inactive(
+	struct xfs_trans	**trans,
+	struct xfs_inode	*dp)
+{
+	struct xfs_da_blkinfo	*info;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		blkno;
+	int			error;
+
+	/*
+	 * Read block 0 to see what we have to work with.
+	 * We only get here if we have extents, since we remove
+	 * the extents in reverse order the extent containing
+	 * block 0 must still be there.
+	 */
+	error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+	if (error)
+		return error;
+	blkno = bp->b_bn;
+
+	/*
+	 * Invalidate the tree, even if the "tree" is only a single leaf block.
+	 * This is a depth-first traversal!
+	 */
+	info = bp->b_addr;
+	switch (info->magic) {
+	case cpu_to_be16(XFS_DA_NODE_MAGIC):
+	case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+		error = xfs_attr3_node_inactive(trans, dp, bp, 1);
+		break;
+	case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+	case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+		error = xfs_attr3_leaf_inactive(trans, dp, bp);
+		break;
+	default:
+		error = -EIO;
+		xfs_trans_brelse(*trans, bp);
+		break;
+	}
+	if (error)
+		return error;
+
+	/*
+	 * Invalidate the incore copy of the root block.
+	 */
+	error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+	if (error)
+		return error;
+	xfs_trans_binval(*trans, bp);	/* remove from cache */
+	/*
+	 * Commit the invalidate and start the next transaction.
+	 */
+	error = xfs_trans_roll(trans, dp);
+
+	return error;
+}
+
+/*
+ * xfs_attr_inactive kills all traces of an attribute fork on an inode. It
+ * removes both the on-disk and in-memory inode fork. Note that this also has to
+ * handle the condition of inodes without attributes but with an attribute fork
+ * configured, so we can't use xfs_inode_hasattr() here.
+ *
+ * The in-memory attribute fork is removed even on error.
+ */
+int
+xfs_attr_inactive(
+	struct xfs_inode	*dp)
+{
+	struct xfs_trans	*trans;
+	struct xfs_mount	*mp;
+	int			cancel_flags = 0;
+	int			lock_mode = XFS_ILOCK_SHARED;
+	int			error = 0;
+
+	mp = dp->i_mount;
+	ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+	xfs_ilock(dp, lock_mode);
+	if (!XFS_IFORK_Q(dp))
+		goto out_destroy_fork;
+	xfs_iunlock(dp, lock_mode);
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	lock_mode = 0;
+	trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
+	error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+	if (error)
+		goto out_cancel;
+
+	lock_mode = XFS_ILOCK_EXCL;
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT;
+	xfs_ilock(dp, lock_mode);
+
+	if (!XFS_IFORK_Q(dp))
+		goto out_cancel;
+
+	/*
+	 * No need to make quota reservations here. We expect to release some
+	 * blocks, not allocate, in the common case.
+	 */
+	xfs_trans_ijoin(trans, dp, 0);
+
+	/* invalidate and truncate the attribute fork extents */
+	if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
+		error = xfs_attr3_root_inactive(&trans, dp);
+		if (error)
+			goto out_cancel;
+
+		error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
+		if (error)
+			goto out_cancel;
+	}
+
+	/* Reset the attribute fork - this also destroys the in-core fork */
+	xfs_attr_fork_remove(dp, trans);
+
+	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(dp, lock_mode);
+	return error;
+
+out_cancel:
+	xfs_trans_cancel(trans, cancel_flags);
+out_destroy_fork:
+	/* kill the in-core attr fork before we drop the inode lock */
+	if (dp->i_afp)
+		xfs_idestroy_fork(dp, XFS_ATTR_FORK);
+	if (lock_mode)
+		xfs_iunlock(dp, lock_mode);
+	return error;
+}
diff --git a/kernel/fs/xfs/xfs_attr_list.c b/kernel/fs/xfs/xfs_attr_list.c
new file mode 100644
index 000000000..65fb37a18
--- /dev/null
+++ b/kernel/fs/xfs/xfs_attr_list.c
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dir2.h"
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+	xfs_attr_sf_sort_t *sa, *sb;
+
+	sa = (xfs_attr_sf_sort_t *)a;
+	sb = (xfs_attr_sf_sort_t *)b;
+	if (sa->hash < sb->hash) {
+		return -1;
+	} else if (sa->hash > sb->hash) {
+		return 1;
+	} else {
+		return sa->entno - sb->entno;
+	}
+}
+
+#define XFS_ISRESET_CURSOR(cursor) \
+	(!((cursor)->initted) && !((cursor)->hashval) && \
+	 !((cursor)->blkno) && !((cursor)->offset))
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then we
+ * we have to calculate each entries' hashvalue and sort them before
+ * we can begin returning them to the user.
+ */
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_sf_sort_t *sbuf, *sbp;
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	xfs_inode_t *dp;
+	int sbsize, nsbuf, count, i;
+	int error;
+
+	ASSERT(context != NULL);
+	dp = context->dp;
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_afp != NULL);
+	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+	ASSERT(sf != NULL);
+	if (!sf->hdr.count)
+		return 0;
+	cursor = context->cursor;
+	ASSERT(cursor != NULL);
+
+	trace_xfs_attr_list_sf(context);
+
+	/*
+	 * If the buffer is large enough and the cursor is at the start,
+	 * do not bother with sorting since we will return everything in
+	 * one buffer and another call using the cursor won't need to be
+	 * made.
+	 * Note the generous fudge factor of 16 overhead bytes per entry.
+	 * If bufsize is zero then put_listent must be a search function
+	 * and can just scan through what we have.
+	 */
+	if (context->bufsize == 0 ||
+	    (XFS_ISRESET_CURSOR(cursor) &&
+             (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+			error = context->put_listent(context,
+					   sfe->flags,
+					   sfe->nameval,
+					   (int)sfe->namelen,
+					   (int)sfe->valuelen,
+					   &sfe->nameval[sfe->namelen]);
+
+			/*
+			 * Either search callback finished early or
+			 * didn't fit it all in the buffer after all.
+			 */
+			if (context->seen_enough)
+				break;
+
+			if (error)
+				return error;
+			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		}
+		trace_xfs_attr_list_sf_all(context);
+		return 0;
+	}
+
+	/* do no more for a search callback */
+	if (context->bufsize == 0)
+		return 0;
+
+	/*
+	 * It didn't all fit, so we have to sort everything on hashval.
+	 */
+	sbsize = sf->hdr.count * sizeof(*sbuf);
+	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+
+	/*
+	 * Scan the attribute list for the rest of the entries, storing
+	 * the relevant info from only those that match into a buffer.
+	 */
+	nsbuf = 0;
+	for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+		if (unlikely(
+		    ((char *)sfe < (char *)sf) ||
+		    ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+			XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
+					     XFS_ERRLEVEL_LOW,
+					     context->dp->i_mount, sfe);
+			kmem_free(sbuf);
+			return -EFSCORRUPTED;
+		}
+
+		sbp->entno = i;
+		sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+		sbp->name = sfe->nameval;
+		sbp->namelen = sfe->namelen;
+		/* These are bytes, and both on-disk, don't endian-flip */
+		sbp->valuelen = sfe->valuelen;
+		sbp->flags = sfe->flags;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		sbp++;
+		nsbuf++;
+	}
+
+	/*
+	 * Sort the entries on hash then entno.
+	 */
+	xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+	/*
+	 * Re-find our place IN THE SORTED LIST.
+	 */
+	count = 0;
+	cursor->initted = 1;
+	cursor->blkno = 0;
+	for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+		if (sbp->hash == cursor->hashval) {
+			if (cursor->offset == count) {
+				break;
+			}
+			count++;
+		} else if (sbp->hash > cursor->hashval) {
+			break;
+		}
+	}
+	if (i == nsbuf) {
+		kmem_free(sbuf);
+		return 0;
+	}
+
+	/*
+	 * Loop putting entries into the user buffer.
+	 */
+	for ( ; i < nsbuf; i++, sbp++) {
+		if (cursor->hashval != sbp->hash) {
+			cursor->hashval = sbp->hash;
+			cursor->offset = 0;
+		}
+		error = context->put_listent(context,
+					sbp->flags,
+					sbp->name,
+					sbp->namelen,
+					sbp->valuelen,
+					&sbp->name[sbp->namelen]);
+		if (error)
+			return error;
+		if (context->seen_enough)
+			break;
+		cursor->offset++;
+	}
+
+	kmem_free(sbuf);
+	return 0;
+}
+
+STATIC int
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	struct xfs_attr3_icleaf_hdr leafhdr;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_node_entry *btree;
+	int error, i;
+	struct xfs_buf *bp;
+	struct xfs_inode	*dp = context->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+
+	trace_xfs_attr_node_list(context);
+
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	/*
+	 * Do all sorts of validation on the passed-in cursor structure.
+	 * If anything is amiss, ignore the cursor and look up the hashval
+	 * starting from the btree root.
+	 */
+	bp = NULL;
+	if (cursor->blkno > 0) {
+		error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
+					      &bp, XFS_ATTR_FORK);
+		if ((error != 0) && (error != -EFSCORRUPTED))
+			return error;
+		if (bp) {
+			struct xfs_attr_leaf_entry *entries;
+
+			node = bp->b_addr;
+			switch (be16_to_cpu(node->hdr.info.magic)) {
+			case XFS_DA_NODE_MAGIC:
+			case XFS_DA3_NODE_MAGIC:
+				trace_xfs_attr_list_wrong_blk(context);
+				xfs_trans_brelse(NULL, bp);
+				bp = NULL;
+				break;
+			case XFS_ATTR_LEAF_MAGIC:
+			case XFS_ATTR3_LEAF_MAGIC:
+				leaf = bp->b_addr;
+				xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
+							     &leafhdr, leaf);
+				entries = xfs_attr3_leaf_entryp(leaf);
+				if (cursor->hashval > be32_to_cpu(
+						entries[leafhdr.count - 1].hashval)) {
+					trace_xfs_attr_list_wrong_blk(context);
+					xfs_trans_brelse(NULL, bp);
+					bp = NULL;
+				} else if (cursor->hashval <= be32_to_cpu(
+						entries[0].hashval)) {
+					trace_xfs_attr_list_wrong_blk(context);
+					xfs_trans_brelse(NULL, bp);
+					bp = NULL;
+				}
+				break;
+			default:
+				trace_xfs_attr_list_wrong_blk(context);
+				xfs_trans_brelse(NULL, bp);
+				bp = NULL;
+			}
+		}
+	}
+
+	/*
+	 * We did not find what we expected given the cursor's contents,
+	 * so we start from the top and work down based on the hash value.
+	 * Note that start of node block is same as start of leaf block.
+	 */
+	if (bp == NULL) {
+		cursor->blkno = 0;
+		for (;;) {
+			__uint16_t magic;
+
+			error = xfs_da3_node_read(NULL, dp,
+						      cursor->blkno, -1, &bp,
+						      XFS_ATTR_FORK);
+			if (error)
+				return error;
+			node = bp->b_addr;
+			magic = be16_to_cpu(node->hdr.info.magic);
+			if (magic == XFS_ATTR_LEAF_MAGIC ||
+			    magic == XFS_ATTR3_LEAF_MAGIC)
+				break;
+			if (magic != XFS_DA_NODE_MAGIC &&
+			    magic != XFS_DA3_NODE_MAGIC) {
+				XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
+						     XFS_ERRLEVEL_LOW,
+						     context->dp->i_mount,
+						     node);
+				xfs_trans_brelse(NULL, bp);
+				return -EFSCORRUPTED;
+			}
+
+			dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+			btree = dp->d_ops->node_tree_p(node);
+			for (i = 0; i < nodehdr.count; btree++, i++) {
+				if (cursor->hashval
+						<= be32_to_cpu(btree->hashval)) {
+					cursor->blkno = be32_to_cpu(btree->before);
+					trace_xfs_attr_list_node_descend(context,
+									 btree);
+					break;
+				}
+			}
+			if (i == nodehdr.count) {
+				xfs_trans_brelse(NULL, bp);
+				return 0;
+			}
+			xfs_trans_brelse(NULL, bp);
+		}
+	}
+	ASSERT(bp != NULL);
+
+	/*
+	 * Roll upward through the blocks, processing each leaf block in
+	 * order.  As long as there is space in the result buffer, keep
+	 * adding the information.
+	 */
+	for (;;) {
+		leaf = bp->b_addr;
+		error = xfs_attr3_leaf_list_int(bp, context);
+		if (error) {
+			xfs_trans_brelse(NULL, bp);
+			return error;
+		}
+		xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+		if (context->seen_enough || leafhdr.forw == 0)
+			break;
+		cursor->blkno = leafhdr.forw;
+		xfs_trans_brelse(NULL, bp);
+		error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp);
+		if (error)
+			return error;
+	}
+	xfs_trans_brelse(NULL, bp);
+	return 0;
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ */
+int
+xfs_attr3_leaf_list_int(
+	struct xfs_buf			*bp,
+	struct xfs_attr_list_context	*context)
+{
+	struct attrlist_cursor_kern	*cursor;
+	struct xfs_attr_leafblock	*leaf;
+	struct xfs_attr3_icleaf_hdr	ichdr;
+	struct xfs_attr_leaf_entry	*entries;
+	struct xfs_attr_leaf_entry	*entry;
+	int				retval;
+	int				i;
+	struct xfs_mount		*mp = context->dp->i_mount;
+
+	trace_xfs_attr_list_leaf(context);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	/*
+	 * Re-find our place in the leaf block if this is a new syscall.
+	 */
+	if (context->resynch) {
+		entry = &entries[0];
+		for (i = 0; i < ichdr.count; entry++, i++) {
+			if (be32_to_cpu(entry->hashval) == cursor->hashval) {
+				if (cursor->offset == context->dupcnt) {
+					context->dupcnt = 0;
+					break;
+				}
+				context->dupcnt++;
+			} else if (be32_to_cpu(entry->hashval) >
+					cursor->hashval) {
+				context->dupcnt = 0;
+				break;
+			}
+		}
+		if (i == ichdr.count) {
+			trace_xfs_attr_list_notfound(context);
+			return 0;
+		}
+	} else {
+		entry = &entries[0];
+		i = 0;
+	}
+	context->resynch = 0;
+
+	/*
+	 * We have found our place, start copying out the new attributes.
+	 */
+	retval = 0;
+	for (; i < ichdr.count; entry++, i++) {
+		if (be32_to_cpu(entry->hashval) != cursor->hashval) {
+			cursor->hashval = be32_to_cpu(entry->hashval);
+			cursor->offset = 0;
+		}
+
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* skip incomplete entries */
+
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			xfs_attr_leaf_name_local_t *name_loc =
+				xfs_attr3_leaf_name_local(leaf, i);
+
+			retval = context->put_listent(context,
+						entry->flags,
+						name_loc->nameval,
+						(int)name_loc->namelen,
+						be16_to_cpu(name_loc->valuelen),
+						&name_loc->nameval[name_loc->namelen]);
+			if (retval)
+				return retval;
+		} else {
+			xfs_attr_leaf_name_remote_t *name_rmt =
+				xfs_attr3_leaf_name_remote(leaf, i);
+
+			int valuelen = be32_to_cpu(name_rmt->valuelen);
+
+			if (context->put_value) {
+				xfs_da_args_t args;
+
+				memset((char *)&args, 0, sizeof(args));
+				args.geo = context->dp->i_mount->m_attr_geo;
+				args.dp = context->dp;
+				args.whichfork = XFS_ATTR_FORK;
+				args.valuelen = valuelen;
+				args.rmtvaluelen = valuelen;
+				args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
+				args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
+				args.rmtblkcnt = xfs_attr3_rmt_blocks(
+							args.dp->i_mount, valuelen);
+				retval = xfs_attr_rmtval_get(&args);
+				if (retval)
+					return retval;
+				retval = context->put_listent(context,
+						entry->flags,
+						name_rmt->name,
+						(int)name_rmt->namelen,
+						valuelen,
+						args.value);
+				kmem_free(args.value);
+			} else {
+				retval = context->put_listent(context,
+						entry->flags,
+						name_rmt->name,
+						(int)name_rmt->namelen,
+						valuelen,
+						NULL);
+			}
+			if (retval)
+				return retval;
+		}
+		if (context->seen_enough)
+			break;
+		cursor->offset++;
+	}
+	trace_xfs_attr_list_leaf_end(context);
+	return retval;
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+	int error;
+	struct xfs_buf *bp;
+
+	trace_xfs_attr_leaf_list(context);
+
+	context->cursor->blkno = 0;
+	error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_list_int(bp, context);
+	xfs_trans_brelse(NULL, bp);
+	return error;
+}
+
+int
+xfs_attr_list_int(
+	xfs_attr_list_context_t *context)
+{
+	int error;
+	xfs_inode_t *dp = context->dp;
+	uint		lock_mode;
+
+	XFS_STATS_INC(xs_attr_list);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return -EIO;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	lock_mode = xfs_ilock_attr_map_shared(dp);
+	if (!xfs_inode_hasattr(dp)) {
+		error = 0;
+	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_attr_shortform_list(context);
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_list(context);
+	} else {
+		error = xfs_attr_node_list(context);
+	}
+	xfs_iunlock(dp, lock_mode);
+	return error;
+}
+
+#define	ATTR_ENTBASESIZE		/* minimum bytes used by an attr */ \
+	(((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define	ATTR_ENTSIZE(namelen)		/* actual bytes used by an attr */ \
+	((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+	 & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+STATIC int
+xfs_attr_put_listent(
+	xfs_attr_list_context_t *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
+{
+	struct attrlist *alist = (struct attrlist *)context->alist;
+	attrlist_ent_t *aep;
+	int arraytop;
+
+	ASSERT(!(context->flags & ATTR_KERNOVAL));
+	ASSERT(context->count >= 0);
+	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+	ASSERT(context->firstu >= sizeof(*alist));
+	ASSERT(context->firstu <= context->bufsize);
+
+	/*
+	 * Only list entries in the right namespace.
+	 */
+	if (((context->flags & ATTR_SECURE) == 0) !=
+	    ((flags & XFS_ATTR_SECURE) == 0))
+		return 0;
+	if (((context->flags & ATTR_ROOT) == 0) !=
+	    ((flags & XFS_ATTR_ROOT) == 0))
+		return 0;
+
+	arraytop = sizeof(*alist) +
+			context->count * sizeof(alist->al_offset[0]);
+	context->firstu -= ATTR_ENTSIZE(namelen);
+	if (context->firstu < arraytop) {
+		trace_xfs_attr_list_full(context);
+		alist->al_more = 1;
+		context->seen_enough = 1;
+		return 1;
+	}
+
+	aep = (attrlist_ent_t *)&context->alist[context->firstu];
+	aep->a_valuelen = valuelen;
+	memcpy(aep->a_name, name, namelen);
+	aep->a_name[namelen] = 0;
+	alist->al_offset[context->count++] = context->firstu;
+	alist->al_count = context->count;
+	trace_xfs_attr_list_add(context);
+	return 0;
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths.  Positive return value follows the XFS
+ * convention of being an error, zero or negative return code
+ * is the length of the buffer returned (negated), indicating
+ * success.
+ */
+int
+xfs_attr_list(
+	xfs_inode_t	*dp,
+	char		*buffer,
+	int		bufsize,
+	int		flags,
+	attrlist_cursor_kern_t *cursor)
+{
+	xfs_attr_list_context_t context;
+	struct attrlist *alist;
+	int error;
+
+	/*
+	 * Validate the cursor.
+	 */
+	if (cursor->pad1 || cursor->pad2)
+		return -EINVAL;
+	if ((cursor->initted == 0) &&
+	    (cursor->hashval || cursor->blkno || cursor->offset))
+		return -EINVAL;
+
+	/*
+	 * Check for a properly aligned buffer.
+	 */
+	if (((long)buffer) & (sizeof(int)-1))
+		return -EFAULT;
+	if (flags & ATTR_KERNOVAL)
+		bufsize = 0;
+
+	/*
+	 * Initialize the output buffer.
+	 */
+	memset(&context, 0, sizeof(context));
+	context.dp = dp;
+	context.cursor = cursor;
+	context.resynch = 1;
+	context.flags = flags;
+	context.alist = buffer;
+	context.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
+	context.firstu = context.bufsize;
+	context.put_listent = xfs_attr_put_listent;
+
+	alist = (struct attrlist *)context.alist;
+	alist->al_count = 0;
+	alist->al_more = 0;
+	alist->al_offset[0] = context.bufsize;
+
+	error = xfs_attr_list_int(&context);
+	ASSERT(error <= 0);
+	return error;
+}
diff --git a/kernel/fs/xfs/xfs_bit.c b/kernel/fs/xfs/xfs_bit.c
new file mode 100644
index 000000000..0e8885a59
--- /dev/null
+++ b/kernel/fs/xfs/xfs_bit.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_log_format.h"
+#include "xfs_bit.h"
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+/*
+ * Return whether bitmap is empty.
+ * Size is number of words in the bitmap, which is padded to word boundary
+ * Returns 1 for empty, 0 for non-empty.
+ */
+int
+xfs_bitmap_empty(uint *map, uint size)
+{
+	uint i;
+	uint ret = 0;
+
+	for (i = 0; i < size; i++) {
+		ret |= map[i];
+	}
+
+	return (ret == 0);
+}
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint	size, uint start_bit)
+{
+	uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+	uint result = 0;
+	uint tmp;
+
+	size <<= BIT_TO_WORD_SHIFT;
+
+	ASSERT(start_bit < size);
+	size -= start_bit & ~(NBWORD - 1);
+	start_bit &= (NBWORD - 1);
+	if (start_bit) {
+		tmp = *p++;
+		/* set to one first offset bits prior to start */
+		tmp |= (~0U >> (NBWORD-start_bit));
+		if (tmp != ~0U)
+			goto found;
+		result += NBWORD;
+		size -= NBWORD;
+	}
+	while (size) {
+		if ((tmp = *p++) != ~0U)
+			goto found;
+		result += NBWORD;
+		size -= NBWORD;
+	}
+	return result - start_bit;
+found:
+	return result + ffz(tmp) - start_bit;
+}
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there.  It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */
+int xfs_next_bit(uint *map, uint size, uint start_bit)
+{
+	uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+	uint result = start_bit & ~(NBWORD - 1);
+	uint tmp;
+
+	size <<= BIT_TO_WORD_SHIFT;
+
+	if (start_bit >= size)
+		return -1;
+	size -= result;
+	start_bit &= (NBWORD - 1);
+	if (start_bit) {
+		tmp = *p++;
+		/* set to zero first offset bits prior to start */
+		tmp &= (~0U << start_bit);
+		if (tmp != 0U)
+			goto found;
+		result += NBWORD;
+		size -= NBWORD;
+	}
+	while (size) {
+		if ((tmp = *p++) != 0U)
+			goto found;
+		result += NBWORD;
+		size -= NBWORD;
+	}
+	return -1;
+found:
+	return result + ffs(tmp) - 1;
+}
diff --git a/kernel/fs/xfs/xfs_bmap_util.c b/kernel/fs/xfs/xfs_bmap_util.c
new file mode 100644
index 000000000..a52bbd3ab
--- /dev/null
+++ b/kernel/fs/xfs/xfs_bmap_util.c
@@ -0,0 +1,1920 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+
+/* Kernel only BMAP related definitions and functions */
+
+/*
+ * Convert the given file system block to a disk block.  We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+	return (XFS_IS_REALTIME_INODE(ip) ? \
+		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.  We never free any extents in
+ * the first transaction.
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+int						/* error */
+xfs_bmap_finish(
+	xfs_trans_t		**tp,		/* transaction pointer addr */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*committed)	/* xact committed or not */
+{
+	xfs_efd_log_item_t	*efd;		/* extent free data */
+	xfs_efi_log_item_t	*efi;		/* extent free intention */
+	int			error;		/* error return value */
+	xfs_bmap_free_item_t	*free;		/* free extent item */
+	struct xfs_trans_res	tres;		/* new log reservation */
+	xfs_mount_t		*mp;		/* filesystem mount structure */
+	xfs_bmap_free_item_t	*next;		/* next item on free list */
+	xfs_trans_t		*ntp;		/* new transaction pointer */
+
+	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+	if (flist->xbf_count == 0) {
+		*committed = 0;
+		return 0;
+	}
+	ntp = *tp;
+	efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+	for (free = flist->xbf_first; free; free = free->xbfi_next)
+		xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+			free->xbfi_blockcount);
+
+	tres.tr_logres = ntp->t_log_res;
+	tres.tr_logcount = ntp->t_log_count;
+	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+	ntp = xfs_trans_dup(*tp);
+	error = xfs_trans_commit(*tp, 0);
+	*tp = ntp;
+	*committed = 1;
+	/*
+	 * We have a new transaction, so we should return committed=1,
+	 * even though we're returning an error.
+	 */
+	if (error)
+		return error;
+
+	/*
+	 * transaction commit worked ok so we can drop the extra ticket
+	 * reference that we gained in xfs_trans_dup()
+	 */
+	xfs_log_ticket_put(ntp->t_ticket);
+
+	error = xfs_trans_reserve(ntp, &tres, 0, 0);
+	if (error)
+		return error;
+	efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+	for (free = flist->xbf_first; free != NULL; free = next) {
+		next = free->xbfi_next;
+		if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+				free->xbfi_blockcount))) {
+			/*
+			 * The bmap free list will be cleaned up at a
+			 * higher level.  The EFI will be canceled when
+			 * this transaction is aborted.
+			 * Need to force shutdown here to make sure it
+			 * happens, since this transaction may not be
+			 * dirty yet.
+			 */
+			mp = ntp->t_mountp;
+			if (!XFS_FORCED_SHUTDOWN(mp))
+				xfs_force_shutdown(mp,
+						   (error == -EFSCORRUPTED) ?
+						   SHUTDOWN_CORRUPT_INCORE :
+						   SHUTDOWN_META_IO_ERROR);
+			return error;
+		}
+		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+			free->xbfi_blockcount);
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	return 0;
+}
+
+int
+xfs_bmap_rtalloc(
+	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
+{
+	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
+	int		error;		/* error return value */
+	xfs_mount_t	*mp;		/* mount point structure */
+	xfs_extlen_t	prod = 0;	/* product factor for allocators */
+	xfs_extlen_t	ralen = 0;	/* realtime allocation length */
+	xfs_extlen_t	align;		/* minimum allocation alignment */
+	xfs_rtblock_t	rtb;
+
+	mp = ap->ip->i_mount;
+	align = xfs_get_extsz_hint(ap->ip);
+	prod = align / mp->m_sb.sb_rextsize;
+	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+					align, 1, ap->eof, 0,
+					ap->conv, &ap->offset, &ap->length);
+	if (error)
+		return error;
+	ASSERT(ap->length);
+	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+
+	/*
+	 * If the offset & length are not perfectly aligned
+	 * then kill prod, it will just get us in trouble.
+	 */
+	if (do_mod(ap->offset, align) || ap->length % align)
+		prod = 1;
+	/*
+	 * Set ralen to be the actual requested length in rtextents.
+	 */
+	ralen = ap->length / mp->m_sb.sb_rextsize;
+	/*
+	 * If the old value was close enough to MAXEXTLEN that
+	 * we rounded up to it, cut it back so it's valid again.
+	 * Note that if it's a really large request (bigger than
+	 * MAXEXTLEN), we don't hear about that number, and can't
+	 * adjust the starting point to match it.
+	 */
+	if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+		ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+
+	/*
+	 * Lock out other modifications to the RT bitmap inode.
+	 */
+	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+	/*
+	 * If it's an allocation to an empty file at offset 0,
+	 * pick an extent that will space things out in the rt area.
+	 */
+	if (ap->eof && ap->offset == 0) {
+		xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+
+		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+		if (error)
+			return error;
+		ap->blkno = rtx * mp->m_sb.sb_rextsize;
+	} else {
+		ap->blkno = 0;
+	}
+
+	xfs_bmap_adjacent(ap);
+
+	/*
+	 * Realtime allocation, done through xfs_rtallocate_extent.
+	 */
+	atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+	do_div(ap->blkno, mp->m_sb.sb_rextsize);
+	rtb = ap->blkno;
+	ap->length = ralen;
+	if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
+				&ralen, atype, ap->wasdel, prod, &rtb)))
+		return error;
+	if (rtb == NULLFSBLOCK && prod > 1 &&
+	    (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
+					   ap->length, &ralen, atype,
+					   ap->wasdel, 1, &rtb)))
+		return error;
+	ap->blkno = rtb;
+	if (ap->blkno != NULLFSBLOCK) {
+		ap->blkno *= mp->m_sb.sb_rextsize;
+		ralen *= mp->m_sb.sb_rextsize;
+		ap->length = ralen;
+		ap->ip->i_d.di_nblocks += ralen;
+		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+		if (ap->wasdel)
+			ap->ip->i_delayed_blks -= ralen;
+		/*
+		 * Adjust the disk quota also. This was reserved
+		 * earlier.
+		 */
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+	} else {
+		ap->length = 0;
+	}
+	return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.  All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		endoff,
+	int			whichfork,
+	int			*eof)
+{
+	struct xfs_bmbt_irec	rec;
+	int			error;
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+	if (error || *eof)
+		return error;
+
+	*eof = endoff >= rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Count leaf blocks given a range of extent records.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+	xfs_ifork_t		*ifp,
+	xfs_extnum_t		idx,
+	int			numrecs,
+	int			*count)
+{
+	int		b;
+
+	for (b = 0; b < numrecs; b++) {
+		xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
+		*count += xfs_bmbt_get_blockcount(frp);
+	}
+}
+
+/*
+ * Count leaf blocks given a range of extent records originally
+ * in btree format.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	int			numrecs,
+	int			*count)
+{
+	int		b;
+	xfs_bmbt_rec_t	*frp;
+
+	for (b = 1; b <= numrecs; b++) {
+		frp = XFS_BMBT_REC_ADDR(mp, block, b);
+		*count += xfs_bmbt_disk_get_blockcount(frp);
+	}
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
+ */
+STATIC int                                     /* error */
+xfs_bmap_count_tree(
+	xfs_mount_t     *mp,            /* file system mount point */
+	xfs_trans_t     *tp,            /* transaction pointer */
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fsblock_t   blockno,	/* file system block number */
+	int             levelin,	/* level in btree */
+	int		*count)		/* Count of blocks */
+{
+	int			error;
+	xfs_buf_t		*bp, *nbp;
+	int			level = levelin;
+	__be64			*pp;
+	xfs_fsblock_t           bno = blockno;
+	xfs_fsblock_t		nextbno;
+	struct xfs_btree_block	*block, *nextblock;
+	int			numrecs;
+
+	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+	if (error)
+		return error;
+	*count += 1;
+	block = XFS_BUF_TO_BLOCK(bp);
+
+	if (--level) {
+		/* Not at node above leaves, count this level of nodes */
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+		while (nextbno != NULLFSBLOCK) {
+			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				return error;
+			*count += 1;
+			nextblock = XFS_BUF_TO_BLOCK(nbp);
+			nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+			xfs_trans_brelse(tp, nbp);
+		}
+
+		/* Dive to the next level */
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		if (unlikely((error =
+		     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
+			xfs_trans_brelse(tp, bp);
+			XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+					 XFS_ERRLEVEL_LOW, mp);
+			return -EFSCORRUPTED;
+		}
+		xfs_trans_brelse(tp, bp);
+	} else {
+		/* count all level 1 nodes and their leaves */
+		for (;;) {
+			nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+			numrecs = be16_to_cpu(block->bb_numrecs);
+			xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+			xfs_trans_brelse(tp, bp);
+			if (nextbno == NULLFSBLOCK)
+				break;
+			bno = nextbno;
+			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				return error;
+			*count += 1;
+			block = XFS_BUF_TO_BLOCK(bp);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int						/* error */
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	int			*count)		/* out: count of blocks */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+		xfs_bmap_count_leaves(ifp, 0,
+			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+			count);
+		return 0;
+	}
+
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	block = ifp->if_broot;
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
+		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+				 mp);
+		return -EFSCORRUPTED;
+	}
+
+	return 0;
+}
+
+/*
+ * returns 1 for success, 0 if we failed to map the extent.
+ */
+STATIC int
+xfs_getbmapx_fix_eof_hole(
+	xfs_inode_t		*ip,		/* xfs incore inode pointer */
+	struct getbmapx		*out,		/* output structure */
+	int			prealloced,	/* this is a file with
+						 * preallocated data space */
+	__int64_t		end,		/* last block requested */
+	xfs_fsblock_t		startblock)
+{
+	__int64_t		fixlen;
+	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_extnum_t		lastx;		/* last extent pointer */
+	xfs_fileoff_t		fileblock;
+
+	if (startblock == HOLESTARTBLOCK) {
+		mp = ip->i_mount;
+		out->bmv_block = -1;
+		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+		fixlen -= out->bmv_offset;
+		if (prealloced && out->bmv_offset + out->bmv_length == end) {
+			/* Came to hole at EOF. Trim it. */
+			if (fixlen <= 0)
+				return 0;
+			out->bmv_length = fixlen;
+		}
+	} else {
+		if (startblock == DELAYSTARTBLOCK)
+			out->bmv_block = -2;
+		else
+			out->bmv_block = xfs_fsb_to_db(ip, startblock);
+		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+			out->bmv_oflags |= BMV_OF_LAST;
+	}
+
+	return 1;
+}
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
+ */
+int						/* error code */
+xfs_getbmap(
+	xfs_inode_t		*ip,
+	struct getbmapx		*bmv,		/* user bmap structure */
+	xfs_bmap_format_t	formatter,	/* format to user */
+	void			*arg)		/* formatter arg */
+{
+	__int64_t		bmvend;		/* last block requested */
+	int			error = 0;	/* return value */
+	__int64_t		fixlen;		/* length for -1 case */
+	int			i;		/* extent number */
+	int			lock;		/* lock state */
+	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			nex;		/* # of user extents can do */
+	int			nexleft;	/* # of user extents left */
+	int			subnex;		/* # of bmapi's can do */
+	int			nmap;		/* number of map entries */
+	struct getbmapx		*out;		/* output structure */
+	int			whichfork;	/* data or attr fork */
+	int			prealloced;	/* this is a file with
+						 * preallocated data space */
+	int			iflags;		/* interface flags */
+	int			bmapi_flags;	/* flags for xfs_bmapi */
+	int			cur_ext = 0;
+
+	mp = ip->i_mount;
+	iflags = bmv->bmv_iflags;
+	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
+
+	if (whichfork == XFS_ATTR_FORK) {
+		if (XFS_IFORK_Q(ip)) {
+			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+				return -EINVAL;
+		} else if (unlikely(
+			   ip->i_d.di_aformat != 0 &&
+			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
+			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
+					 ip->i_mount);
+			return -EFSCORRUPTED;
+		}
+
+		prealloced = 0;
+		fixlen = 1LL << 32;
+	} else {
+		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+			return -EINVAL;
+
+		if (xfs_get_extsz_hint(ip) ||
+		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
+			prealloced = 1;
+			fixlen = mp->m_super->s_maxbytes;
+		} else {
+			prealloced = 0;
+			fixlen = XFS_ISIZE(ip);
+		}
+	}
+
+	if (bmv->bmv_length == -1) {
+		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+		bmv->bmv_length =
+			max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+	} else if (bmv->bmv_length == 0) {
+		bmv->bmv_entries = 0;
+		return 0;
+	} else if (bmv->bmv_length < 0) {
+		return -EINVAL;
+	}
+
+	nex = bmv->bmv_count - 1;
+	if (nex <= 0)
+		return -EINVAL;
+	bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+
+	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+		return -ENOMEM;
+	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
+	if (!out)
+		return -ENOMEM;
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	if (whichfork == XFS_DATA_FORK) {
+		if (!(iflags & BMV_IF_DELALLOC) &&
+		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
+			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+			if (error)
+				goto out_unlock_iolock;
+
+			/*
+			 * Even after flushing the inode, there can still be
+			 * delalloc blocks on the inode beyond EOF due to
+			 * speculative preallocation.  These are not removed
+			 * until the release function is called or the inode
+			 * is inactivated.  Hence we cannot assert here that
+			 * ip->i_delayed_blks == 0.
+			 */
+		}
+
+		lock = xfs_ilock_data_map_shared(ip);
+	} else {
+		lock = xfs_ilock_attr_map_shared(ip);
+	}
+
+	/*
+	 * Don't let nex be bigger than the number of extents
+	 * we can have assuming alternating holes and real extents.
+	 */
+	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+	bmapi_flags = xfs_bmapi_aflag(whichfork);
+	if (!(iflags & BMV_IF_PREALLOC))
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+	/*
+	 * Allocate enough space to handle "subnex" maps at a time.
+	 */
+	error = -ENOMEM;
+	subnex = 16;
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
+	if (!map)
+		goto out_unlock_ilock;
+
+	bmv->bmv_entries = 0;
+
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+		error = 0;
+		goto out_free_map;
+	}
+
+	nexleft = nex;
+
+	do {
+		nmap = (nexleft > subnex) ? subnex : nexleft;
+		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
+				       map, &nmap, bmapi_flags);
+		if (error)
+			goto out_free_map;
+		ASSERT(nmap <= subnex);
+
+		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+			out[cur_ext].bmv_oflags = 0;
+			if (map[i].br_state == XFS_EXT_UNWRITTEN)
+				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
+			else if (map[i].br_startblock == DELAYSTARTBLOCK)
+				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+			out[cur_ext].bmv_offset =
+				XFS_FSB_TO_BB(mp, map[i].br_startoff);
+			out[cur_ext].bmv_length =
+				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			out[cur_ext].bmv_unused1 = 0;
+			out[cur_ext].bmv_unused2 = 0;
+
+			/*
+			 * delayed allocation extents that start beyond EOF can
+			 * occur due to speculative EOF allocation when the
+			 * delalloc extent is larger than the largest freespace
+			 * extent at conversion time. These extents cannot be
+			 * converted by data writeback, so can exist here even
+			 * if we are not supposed to be finding delalloc
+			 * extents.
+			 */
+			if (map[i].br_startblock == DELAYSTARTBLOCK &&
+			    map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
+                        if (map[i].br_startblock == HOLESTARTBLOCK &&
+			    whichfork == XFS_ATTR_FORK) {
+				/* came to the end of attribute fork */
+				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+				goto out_free_map;
+			}
+
+			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+					prealloced, bmvend,
+					map[i].br_startblock))
+				goto out_free_map;
+
+			bmv->bmv_offset =
+				out[cur_ext].bmv_offset +
+				out[cur_ext].bmv_length;
+			bmv->bmv_length =
+				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+			/*
+			 * In case we don't want to return the hole,
+			 * don't increase cur_ext so that we can reuse
+			 * it in the next loop.
+			 */
+			if ((iflags & BMV_IF_NO_HOLES) &&
+			    map[i].br_startblock == HOLESTARTBLOCK) {
+				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+				continue;
+			}
+
+			nexleft--;
+			bmv->bmv_entries++;
+			cur_ext++;
+		}
+	} while (nmap && nexleft && bmv->bmv_length);
+
+ out_free_map:
+	kmem_free(map);
+ out_unlock_ilock:
+	xfs_iunlock(ip, lock);
+ out_unlock_iolock:
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	for (i = 0; i < cur_ext; i++) {
+		int full = 0;	/* user array is full */
+
+		/* format results & advance arg */
+		error = formatter(&arg, &out[i], &full);
+		if (error || full)
+			break;
+	}
+
+	kmem_free(out);
+	return error;
+}
+
+/*
+ * dead simple method of punching delalyed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		start_fsb,
+	xfs_fileoff_t		length)
+{
+	xfs_fileoff_t		remaining = length;
+	int			error = 0;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	do {
+		int		done;
+		xfs_bmbt_irec_t	imap;
+		int		nimaps = 1;
+		xfs_fsblock_t	firstblock;
+		xfs_bmap_free_t flist;
+
+		/*
+		 * Map the range first and check that it is a delalloc extent
+		 * before trying to unmap the range. Otherwise we will be
+		 * trying to remove a real extent (which requires a
+		 * transaction) or a hole, which is probably a bad idea...
+		 */
+		error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+				       XFS_BMAPI_ENTIRE);
+
+		if (error) {
+			/* something screwed, just bail */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_alert(ip->i_mount,
+			"Failed delalloc mapping lookup ino %lld fsb %lld.",
+						ip->i_ino, start_fsb);
+			}
+			break;
+		}
+		if (!nimaps) {
+			/* nothing there */
+			goto next_block;
+		}
+		if (imap.br_startblock != DELAYSTARTBLOCK) {
+			/* been converted, ignore */
+			goto next_block;
+		}
+		WARN_ON(imap.br_blockcount == 0);
+
+		/*
+		 * Note: while we initialise the firstblock/flist pair, they
+		 * should never be used because blocks should never be
+		 * allocated or freed for a delalloc extent and hence we need
+		 * don't cancel or finish them after the xfs_bunmapi() call.
+		 */
+		xfs_bmap_init(&flist, &firstblock);
+		error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+					&flist, &done);
+		if (error)
+			break;
+
+		ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+		start_fsb++;
+		remaining--;
+	} while(remaining > 0);
+
+	return error;
+}
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+	/* prealloc/delalloc exists only on regular files */
+	if (!S_ISREG(ip->i_d.di_mode))
+		return false;
+
+	/*
+	 * Zero sized files with no cached pages and delalloc blocks will not
+	 * have speculative prealloc/delalloc blocks to remove.
+	 */
+	if (VFS_I(ip)->i_size == 0 &&
+	    VFS_I(ip)->i_mapping->nrpages == 0 &&
+	    ip->i_delayed_blks == 0)
+		return false;
+
+	/* If we haven't read in the extent list, then don't do it now. */
+	if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+		return false;
+
+	/*
+	 * Do not free real preallocated or append-only files unless the file
+	 * has delalloc blocks and we are forced to remove them.
+	 */
+	if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+		if (!force || ip->i_delayed_blks == 0)
+			return false;
+
+	return true;
+}
+
+/*
+ * This is called by xfs_inactive to free any blocks beyond eof
+ * when the link count isn't zero and by xfs_dm_punch_hole() when
+ * punching a hole to EOF.
+ */
+int
+xfs_free_eofblocks(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	bool		need_iolock)
+{
+	xfs_trans_t	*tp;
+	int		error;
+	xfs_fileoff_t	end_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_filblks_t	map_len;
+	int		nimaps;
+	xfs_bmbt_irec_t	imap;
+
+	/*
+	 * Figure out if there are any blocks beyond the end
+	 * of the file.  If not, then there is nothing to do.
+	 */
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+	if (last_fsb <= end_fsb)
+		return 0;
+	map_len = last_fsb - end_fsb;
+
+	nimaps = 1;
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!error && (nimaps != 0) &&
+	    (imap.br_startblock != HOLESTARTBLOCK ||
+	     ip->i_delayed_blks)) {
+		/*
+		 * Attach the dquots to the inode up front.
+		 */
+		error = xfs_qm_dqattach(ip, 0);
+		if (error)
+			return error;
+
+		/*
+		 * There are blocks after the end of file.
+		 * Free them up now by truncating the file to
+		 * its current size.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+		if (need_iolock) {
+			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+				xfs_trans_cancel(tp, 0);
+				return -EAGAIN;
+			}
+		}
+
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+		if (error) {
+			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			if (need_iolock)
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return error;
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, 0);
+
+		/*
+		 * Do not update the on-disk file size.  If we update the
+		 * on-disk file size and then the system crashes before the
+		 * contents of the file are flushed to disk then the files
+		 * may be full of holes (ie NULL files bug).
+		 */
+		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+					      XFS_ISIZE(ip));
+		if (error) {
+			/*
+			 * If we get an error at this point we simply don't
+			 * bother truncating the file.
+			 */
+			xfs_trans_cancel(tp,
+					 (XFS_TRANS_RELEASE_LOG_RES |
+					  XFS_TRANS_ABORT));
+		} else {
+			error = xfs_trans_commit(tp,
+						XFS_TRANS_RELEASE_LOG_RES);
+			if (!error)
+				xfs_inode_clear_eofblocks_tag(ip);
+		}
+
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (need_iolock)
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	}
+	return error;
+}
+
+int
+xfs_alloc_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len,
+	int			alloc_type)
+{
+	xfs_mount_t		*mp = ip->i_mount;
+	xfs_off_t		count;
+	xfs_filblks_t		allocated_fsb;
+	xfs_filblks_t		allocatesize_fsb;
+	xfs_extlen_t		extsz, temp;
+	xfs_fileoff_t		startoffset_fsb;
+	xfs_fsblock_t		firstfsb;
+	int			nimaps;
+	int			quota_flag;
+	int			rt;
+	xfs_trans_t		*tp;
+	xfs_bmbt_irec_t		imaps[1], *imapp;
+	xfs_bmap_free_t		free_list;
+	uint			qblocks, resblks, resrtextents;
+	int			committed;
+	int			error;
+
+	trace_xfs_alloc_file_space(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return error;
+
+	if (len <= 0)
+		return -EINVAL;
+
+	rt = XFS_IS_REALTIME_INODE(ip);
+	extsz = xfs_get_extsz_hint(ip);
+
+	count = len;
+	imapp = &imaps[0];
+	nimaps = 1;
+	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
+	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+
+	/*
+	 * Allocate file space until done or until there is an error
+	 */
+	while (allocatesize_fsb && !error) {
+		xfs_fileoff_t	s, e;
+
+		/*
+		 * Determine space reservations for data/realtime.
+		 */
+		if (unlikely(extsz)) {
+			s = startoffset_fsb;
+			do_div(s, extsz);
+			s *= extsz;
+			e = startoffset_fsb + allocatesize_fsb;
+			if ((temp = do_mod(startoffset_fsb, extsz)))
+				e += temp;
+			if ((temp = do_mod(e, extsz)))
+				e += extsz - temp;
+		} else {
+			s = 0;
+			e = allocatesize_fsb;
+		}
+
+		/*
+		 * The transaction reservation is limited to a 32-bit block
+		 * count, hence we need to limit the number of blocks we are
+		 * trying to reserve to avoid an overflow. We can't allocate
+		 * more than @nimaps extents, and an extent is limited on disk
+		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+		 */
+		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+		if (unlikely(rt)) {
+			resrtextents = qblocks = resblks;
+			resrtextents /= mp->m_sb.sb_rextsize;
+			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+			quota_flag = XFS_QMOPT_RES_RTBLKS;
+		} else {
+			resrtextents = 0;
+			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+			quota_flag = XFS_QMOPT_RES_REGBLKS;
+		}
+
+		/*
+		 * Allocate and setup the transaction.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+					  resblks, resrtextents);
+		/*
+		 * Check for running out of space
+		 */
+		if (error) {
+			/*
+			 * Free the transaction structure.
+			 */
+			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+						      0, quota_flag);
+		if (error)
+			goto error1;
+
+		xfs_trans_ijoin(tp, ip, 0);
+
+		xfs_bmap_init(&free_list, &firstfsb);
+		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+					allocatesize_fsb, alloc_type, &firstfsb,
+					0, imapp, &nimaps, &free_list);
+		if (error) {
+			goto error0;
+		}
+
+		/*
+		 * Complete the transaction
+		 */
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error) {
+			goto error0;
+		}
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (error) {
+			break;
+		}
+
+		allocated_fsb = imapp->br_blockcount;
+
+		if (nimaps == 0) {
+			error = -ENOSPC;
+			break;
+		}
+
+		startoffset_fsb += allocated_fsb;
+		allocatesize_fsb -= allocated_fsb;
+	}
+
+	return error;
+
+error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+	xfs_bmap_cancel(&free_list);
+	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+
+error1:	/* Just cancel transaction */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Zero file bytes between startoff and endoff inclusive.
+ * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
+ */
+STATIC int
+xfs_zero_remaining_bytes(
+	xfs_inode_t		*ip,
+	xfs_off_t		startoff,
+	xfs_off_t		endoff)
+{
+	xfs_bmbt_irec_t		imap;
+	xfs_fileoff_t		offset_fsb;
+	xfs_off_t		lastoffset;
+	xfs_off_t		offset;
+	xfs_buf_t		*bp;
+	xfs_mount_t		*mp = ip->i_mount;
+	int			nimap;
+	int			error = 0;
+
+	/*
+	 * Avoid doing I/O beyond eof - it's not necessary
+	 * since nothing can read beyond eof.  The space will
+	 * be zeroed when the file is extended anyway.
+	 */
+	if (startoff >= XFS_ISIZE(ip))
+		return 0;
+
+	if (endoff > XFS_ISIZE(ip))
+		endoff = XFS_ISIZE(ip);
+
+	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+		uint lock_mode;
+
+		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+		nimap = 1;
+
+		lock_mode = xfs_ilock_data_map_shared(ip);
+		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
+		xfs_iunlock(ip, lock_mode);
+
+		if (error || nimap < 1)
+			break;
+		ASSERT(imap.br_blockcount >= 1);
+		ASSERT(imap.br_startoff == offset_fsb);
+		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
+		if (lastoffset > endoff)
+			lastoffset = endoff;
+		if (imap.br_startblock == HOLESTARTBLOCK)
+			continue;
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+		if (imap.br_state == XFS_EXT_UNWRITTEN)
+			continue;
+
+		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp,
+				xfs_fsb_to_db(ip, imap.br_startblock),
+				BTOBB(mp->m_sb.sb_blocksize),
+				0, &bp, NULL);
+		if (error)
+			return error;
+
+		memset(bp->b_addr +
+				(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+		       0, lastoffset - offset + 1);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
+	}
+	return error;
+}
+
+int
+xfs_free_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	int			committed;
+	int			done;
+	xfs_fileoff_t		endoffset_fsb;
+	int			error;
+	xfs_fsblock_t		firstfsb;
+	xfs_bmap_free_t		free_list;
+	xfs_bmbt_irec_t		imap;
+	xfs_off_t		ioffset;
+	xfs_off_t		iendoffset;
+	xfs_extlen_t		mod=0;
+	xfs_mount_t		*mp;
+	int			nimap;
+	uint			resblks;
+	xfs_off_t		rounding;
+	int			rt;
+	xfs_fileoff_t		startoffset_fsb;
+	xfs_trans_t		*tp;
+
+	mp = ip->i_mount;
+
+	trace_xfs_free_file_space(ip);
+
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return error;
+
+	error = 0;
+	if (len <= 0)	/* if nothing being freed */
+		return error;
+	rt = XFS_IS_REALTIME_INODE(ip);
+	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
+	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+
+	/* wait for the completion of any pending DIOs */
+	inode_dio_wait(VFS_I(ip));
+
+	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+	ioffset = round_down(offset, rounding);
+	iendoffset = round_up(offset + len, rounding) - 1;
+	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
+					     iendoffset);
+	if (error)
+		goto out;
+	truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
+
+	/*
+	 * Need to zero the stuff we're not freeing, on disk.
+	 * If it's a realtime file & can't use unwritten extents then we
+	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
+	 * will take care of it for us.
+	 */
+	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+		nimap = 1;
+		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+					&imap, &nimap, 0);
+		if (error)
+			goto out;
+		ASSERT(nimap == 0 || nimap == 1);
+		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+			xfs_daddr_t	block;
+
+			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+			block = imap.br_startblock;
+			mod = do_div(block, mp->m_sb.sb_rextsize);
+			if (mod)
+				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+		}
+		nimap = 1;
+		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
+					&imap, &nimap, 0);
+		if (error)
+			goto out;
+		ASSERT(nimap == 0 || nimap == 1);
+		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+			mod++;
+			if (mod && (mod != mp->m_sb.sb_rextsize))
+				endoffset_fsb -= mod;
+		}
+	}
+	if ((done = (endoffset_fsb <= startoffset_fsb)))
+		/*
+		 * One contiguous piece to clear
+		 */
+		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
+	else {
+		/*
+		 * Some full blocks, possibly two pieces to clear
+		 */
+		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
+			error = xfs_zero_remaining_bytes(ip, offset,
+				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
+		if (!error &&
+		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
+			error = xfs_zero_remaining_bytes(ip,
+				XFS_FSB_TO_B(mp, endoffset_fsb),
+				offset + len - 1);
+	}
+
+	/*
+	 * free file space until done or until there is an error
+	 */
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+	while (!error && !done) {
+
+		/*
+		 * allocate and setup the transaction. Allow this
+		 * transaction to dip into the reserve blocks to ensure
+		 * the freeing of the space succeeds at ENOSPC.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+
+		/*
+		 * check for running out of space
+		 */
+		if (error) {
+			/*
+			 * Free the transaction structure.
+			 */
+			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_trans_reserve_quota(tp, mp,
+				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
+				resblks, 0, XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto error1;
+
+		xfs_trans_ijoin(tp, ip, 0);
+
+		/*
+		 * issue the bunmapi() call to free the blocks
+		 */
+		xfs_bmap_init(&free_list, &firstfsb);
+		error = xfs_bunmapi(tp, ip, startoffset_fsb,
+				  endoffset_fsb - startoffset_fsb,
+				  0, 2, &firstfsb, &free_list, &done);
+		if (error) {
+			goto error0;
+		}
+
+		/*
+		 * complete the transaction
+		 */
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error) {
+			goto error0;
+		}
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+ out:
+	return error;
+
+ error0:
+	xfs_bmap_cancel(&free_list);
+ error1:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	goto out;
+}
+
+/*
+ * Preallocate and zero a range of a file. This mechanism has the allocation
+ * semantics of fallocate and in addition converts data in the range to zeroes.
+ */
+int
+xfs_zero_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			blksize;
+	int			error;
+
+	trace_xfs_zero_file_space(ip);
+
+	blksize = 1 << mp->m_sb.sb_blocklog;
+
+	/*
+	 * Punch a hole and prealloc the range. We use hole punch rather than
+	 * unwritten extent conversion for two reasons:
+	 *
+	 * 1.) Hole punch handles partial block zeroing for us.
+	 *
+	 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
+	 * by virtue of the hole punch.
+	 */
+	error = xfs_free_file_space(ip, offset, len);
+	if (error)
+		goto out;
+
+	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+				     round_up(offset + len, blksize) -
+				     round_down(offset, blksize),
+				     XFS_BMAPI_PREALLOC);
+out:
+	return error;
+
+}
+
+/*
+ * @next_fsb will keep track of the extent currently undergoing shift.
+ * @stop_fsb will keep track of the extent at which we have to stop.
+ * If we are shifting left, we will start with block (offset + len) and
+ * shift each extent till last extent.
+ * If we are shifting right, we will start with last extent inside file space
+ * and continue until we reach the block corresponding to offset.
+ */
+static int
+xfs_shift_file_space(
+	struct xfs_inode        *ip,
+	xfs_off_t               offset,
+	xfs_off_t               len,
+	enum shift_direction	direction)
+{
+	int			done = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	int			committed;
+	xfs_fileoff_t		stop_fsb;
+	xfs_fileoff_t		next_fsb;
+	xfs_fileoff_t		shift_fsb;
+
+	ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
+
+	if (direction == SHIFT_LEFT) {
+		next_fsb = XFS_B_TO_FSB(mp, offset + len);
+		stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
+	} else {
+		/*
+		 * If right shift, delegate the work of initialization of
+		 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
+		 */
+		next_fsb = NULLFSBLOCK;
+		stop_fsb = XFS_B_TO_FSB(mp, offset);
+	}
+
+	shift_fsb = XFS_B_TO_FSB(mp, len);
+
+	/*
+	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
+	 * into the accessible region of the file.
+	 */
+	if (xfs_can_free_eofblocks(ip, true)) {
+		error = xfs_free_eofblocks(mp, ip, false);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Writeback and invalidate cache for the remainder of the file as we're
+	 * about to shift down every extent from offset to EOF.
+	 */
+	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+					     offset, -1);
+	if (error)
+		return error;
+	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					offset >> PAGE_CACHE_SHIFT, -1);
+	if (error)
+		return error;
+
+	/*
+	 * The extent shiting code works on extent granularity. So, if
+	 * stop_fsb is not the starting block of extent, we need to split
+	 * the extent at stop_fsb.
+	 */
+	if (direction == SHIFT_RIGHT) {
+		error = xfs_bmap_split_extent(ip, stop_fsb);
+		if (error)
+			return error;
+	}
+
+	while (!error && !done) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		/*
+		 * We would need to reserve permanent block for transaction.
+		 * This will come into picture when after shifting extent into
+		 * hole we found that adjacent extents can be merged which
+		 * may lead to freeing of a block during record update.
+		 */
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+		if (error) {
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+				ip->i_gdquot, ip->i_pdquot,
+				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+				XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto out;
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+		xfs_bmap_init(&free_list, &first_block);
+
+		/*
+		 * We are using the write transaction in which max 2 bmbt
+		 * updates are allowed
+		 */
+		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
+				&done, stop_fsb, &first_block, &free_list,
+				direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
+		if (error)
+			goto out;
+
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error)
+			goto out;
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	}
+
+	return error;
+
+out:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	return error;
+}
+
+/*
+ * xfs_collapse_file_space()
+ *	This routine frees disk space and shift extent for the given file.
+ *	The first thing we do is to free data blocks in the specified range
+ *	by calling xfs_free_file_space(). It would also sync dirty data
+ *	and invalidate page cache over the region on which collapse range
+ *	is working. And Shift extent records to the left to cover a hole.
+ * RETURNS:
+ *	0 on success
+ *	errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	int error;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	trace_xfs_collapse_file_space(ip);
+
+	error = xfs_free_file_space(ip, offset, len);
+	if (error)
+		return error;
+
+	return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
+}
+
+/*
+ * xfs_insert_file_space()
+ *	This routine create hole space by shifting extents for the given file.
+ *	The first thing we do is to sync dirty data and invalidate page cache
+ *	over the region on which insert range is working. And split an extent
+ *	to two extents at given offset by calling xfs_bmap_split_extent.
+ *	And shift all extent records which are laying between [offset,
+ *	last allocated extent] to the right to reserve hole range.
+ * RETURNS:
+ *	0 on success
+ *	errno on error
+ */
+int
+xfs_insert_file_space(
+	struct xfs_inode	*ip,
+	loff_t			offset,
+	loff_t			len)
+{
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	trace_xfs_insert_file_space(ip);
+
+	return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
+}
+
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6.  If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip)	/* tmp inode */
+{
+
+	/* Should never get a local format */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		return -EINVAL;
+
+	/*
+	 * if the target inode has less extents that then temporary inode then
+	 * why did userspace call us?
+	 */
+	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+		return -EINVAL;
+
+	/*
+	 * if the target inode is in extent form and the temp inode is in btree
+	 * form then we will end up with the target inode in the wrong format
+	 * as we already know there are less extents in the temp inode.
+	 */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+		return -EINVAL;
+
+	/* Check temp in extent form to max in target */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+		return -EINVAL;
+
+	/* Check target in extent form to max in temp */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+		return -EINVAL;
+
+	/*
+	 * If we are in a btree format, check that the temp root block will fit
+	 * in the target and that it has enough extents to be in btree format
+	 * in the target.
+	 *
+	 * Note that we have to be careful to allow btree->extent conversions
+	 * (a common defrag case) which will occur when the temp inode is in
+	 * extent format...
+	 */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		if (XFS_IFORK_BOFF(ip) &&
+		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+			return -EINVAL;
+		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+			return -EINVAL;
+	}
+
+	/* Reciprocal target->temp btree format checks */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		if (XFS_IFORK_BOFF(tip) &&
+		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+			return -EINVAL;
+		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+xfs_swap_extent_flush(
+	struct xfs_inode	*ip)
+{
+	int	error;
+
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		return error;
+	truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+	/* Verify O_DIRECT for ftmp */
+	if (VFS_I(ip)->i_mapping->nrpages)
+		return -EINVAL;
+	return 0;
+}
+
+int
+xfs_swap_extents(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip,	/* tmp inode */
+	xfs_swapext_t	*sxp)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_trans_t	*tp;
+	xfs_bstat_t	*sbp = &sxp->sx_stat;
+	xfs_ifork_t	*tempifp, *ifp, *tifp;
+	int		src_log_flags, target_log_flags;
+	int		error = 0;
+	int		aforkblks = 0;
+	int		taforkblks = 0;
+	__uint64_t	tmp;
+	int		lock_flags;
+
+	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+	if (!tempifp) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * Lock the inodes against other IO, page faults and truncate to
+	 * begin with.  Then we can ensure the inodes are flushed and have no
+	 * page cache safely. Once we have done this we can take the ilocks and
+	 * do the rest of the checks.
+	 */
+	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
+
+	/* Verify that both files have the same format */
+	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Verify both files are either real-time or non-realtime */
+	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	error = xfs_swap_extent_flush(ip);
+	if (error)
+		goto out_unlock;
+	error = xfs_swap_extent_flush(tip);
+	if (error)
+		goto out_unlock;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_unlock;
+	}
+
+	/*
+	 * Lock and join the inodes to the tansaction so that transaction commit
+	 * or cancel will unlock the inodes from this point onwards.
+	 */
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+	lock_flags |= XFS_ILOCK_EXCL;
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ijoin(tp, tip, lock_flags);
+
+
+	/* Verify all data are being swapped */
+	if (sxp->sx_offset != 0 ||
+	    sxp->sx_length != ip->i_d.di_size ||
+	    sxp->sx_length != tip->i_d.di_size) {
+		error = -EFAULT;
+		goto out_trans_cancel;
+	}
+
+	trace_xfs_swap_extent_before(ip, 0);
+	trace_xfs_swap_extent_before(tip, 1);
+
+	/* check inode formats now that data is flushed */
+	error = xfs_swap_extents_check_format(ip, tip);
+	if (error) {
+		xfs_notice(mp,
+		    "%s: inode 0x%llx format is incompatible for exchanging.",
+				__func__, ip->i_ino);
+		goto out_trans_cancel;
+	}
+
+	/*
+	 * Compare the current change & modify times with that
+	 * passed in.  If they differ, we abort this swap.
+	 * This is the mechanism used to ensure the calling
+	 * process that the file was not changed out from
+	 * under it.
+	 */
+	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+		error = -EBUSY;
+		goto out_trans_cancel;
+	}
+	/*
+	 * Count the number of extended attribute blocks
+	 */
+	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+		if (error)
+			goto out_trans_cancel;
+	}
+	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+			&taforkblks);
+		if (error)
+			goto out_trans_cancel;
+	}
+
+	/*
+	 * Before we've swapped the forks, lets set the owners of the forks
+	 * appropriately. We have to do this as we are demand paging the btree
+	 * buffers, and so the validation done on read will expect the owner
+	 * field to be correctly set. Once we change the owners, we can swap the
+	 * inode forks.
+	 *
+	 * Note the trickiness in setting the log flags - we set the owner log
+	 * flag on the opposite inode (i.e. the inode we are setting the new
+	 * owner to be) because once we swap the forks and log that, log
+	 * recovery is going to see the fork as owned by the swapped inode,
+	 * not the pre-swapped inodes.
+	 */
+	src_log_flags = XFS_ILOG_CORE;
+	target_log_flags = XFS_ILOG_CORE;
+	if (ip->i_d.di_version == 3 &&
+	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		target_log_flags |= XFS_ILOG_DOWNER;
+		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+					      tip->i_ino, NULL);
+		if (error)
+			goto out_trans_cancel;
+	}
+
+	if (tip->i_d.di_version == 3 &&
+	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		src_log_flags |= XFS_ILOG_DOWNER;
+		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+					      ip->i_ino, NULL);
+		if (error)
+			goto out_trans_cancel;
+	}
+
+	/*
+	 * Swap the data forks of the inodes
+	 */
+	ifp = &ip->i_df;
+	tifp = &tip->i_df;
+	*tempifp = *ifp;	/* struct copy */
+	*ifp = *tifp;		/* struct copy */
+	*tifp = *tempifp;	/* struct copy */
+
+	/*
+	 * Fix the on-disk inode values
+	 */
+	tmp = (__uint64_t)ip->i_d.di_nblocks;
+	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+	tmp = (__uint64_t) ip->i_d.di_nextents;
+	ip->i_d.di_nextents = tip->i_d.di_nextents;
+	tip->i_d.di_nextents = tmp;
+
+	tmp = (__uint64_t) ip->i_d.di_format;
+	ip->i_d.di_format = tip->i_d.di_format;
+	tip->i_d.di_format = tmp;
+
+	/*
+	 * The extents in the source inode could still contain speculative
+	 * preallocation beyond EOF (e.g. the file is open but not modified
+	 * while defrag is in progress). In that case, we need to copy over the
+	 * number of delalloc blocks the data fork in the source inode is
+	 * tracking beyond EOF so that when the fork is truncated away when the
+	 * temporary inode is unlinked we don't underrun the i_delayed_blks
+	 * counter on that inode.
+	 */
+	ASSERT(tip->i_delayed_blks == 0);
+	tip->i_delayed_blks = ip->i_delayed_blks;
+	ip->i_delayed_blks = 0;
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/* If the extents fit in the inode, fix the
+		 * pointer.  Otherwise it's already NULL or
+		 * pointing to the extent.
+		 */
+		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+			ifp->if_u1.if_extents =
+				ifp->if_u2.if_inline_ext;
+		}
+		src_log_flags |= XFS_ILOG_DEXT;
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		ASSERT(ip->i_d.di_version < 3 ||
+		       (src_log_flags & XFS_ILOG_DOWNER));
+		src_log_flags |= XFS_ILOG_DBROOT;
+		break;
+	}
+
+	switch (tip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/* If the extents fit in the inode, fix the
+		 * pointer.  Otherwise it's already NULL or
+		 * pointing to the extent.
+		 */
+		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+			tifp->if_u1.if_extents =
+				tifp->if_u2.if_inline_ext;
+		}
+		target_log_flags |= XFS_ILOG_DEXT;
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		target_log_flags |= XFS_ILOG_DBROOT;
+		ASSERT(tip->i_d.di_version < 3 ||
+		       (target_log_flags & XFS_ILOG_DOWNER));
+		break;
+	}
+
+	xfs_trans_log_inode(tp, ip,  src_log_flags);
+	xfs_trans_log_inode(tp, tip, target_log_flags);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	error = xfs_trans_commit(tp, 0);
+
+	trace_xfs_swap_extent_after(ip, 0);
+	trace_xfs_swap_extent_after(tip, 1);
+out:
+	kmem_free(tempifp);
+	return error;
+
+out_unlock:
+	xfs_iunlock(ip, lock_flags);
+	xfs_iunlock(tip, lock_flags);
+	goto out;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp, 0);
+	goto out;
+}
diff --git a/kernel/fs/xfs/xfs_bmap_util.h b/kernel/fs/xfs/xfs_bmap_util.h
new file mode 100644
index 000000000..af97d9a1d
--- /dev/null
+++ b/kernel/fs/xfs/xfs_bmap_util.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_UTIL_H__
+#define	__XFS_BMAP_UTIL_H__
+
+/* Kernel only BMAP related definitions and functions */
+
+struct xfs_bmbt_irec;
+struct xfs_bmap_free_item;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_bmalloca;
+
+int	xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+int	xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+		     int whichfork, int *eof);
+int	xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+			      int whichfork, int *count);
+int	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+		xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+
+/* bmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+int	xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+		xfs_bmap_format_t formatter, void *arg);
+
+/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
+void	xfs_bmap_del_free(struct xfs_bmap_free *flist,
+			  struct xfs_bmap_free_item *prev,
+			  struct xfs_bmap_free_item *free);
+int	xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
+			       struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
+			       int rt, int eof, int delay, int convert,
+			       xfs_fileoff_t *offp, xfs_extlen_t *lenp);
+void	xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+			     int whichfork, struct xfs_bmbt_irec *rec,
+			     int *is_empty);
+
+/* preallocation and hole punch interface */
+int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
+			     xfs_off_t len, int alloc_type);
+int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
+			    xfs_off_t len);
+int	xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
+			    xfs_off_t len);
+int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
+				xfs_off_t len);
+int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
+				xfs_off_t len);
+
+/* EOF block manipulation functions */
+bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+int	xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
+			   bool need_iolock);
+
+int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+			 struct xfs_swapext *sx);
+
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
+#endif	/* __XFS_BMAP_UTIL_H__ */
diff --git a/kernel/fs/xfs/xfs_buf.c b/kernel/fs/xfs/xfs_buf.c
new file mode 100644
index 000000000..1790b00be
--- /dev/null
+++ b/kernel/fs/xfs/xfs_buf.c
@@ -0,0 +1,1901 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/bio.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/percpu.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+#include <linux/kthread.h>
+#include <linux/migrate.h>
+#include <linux/backing-dev.h>
+#include <linux/freezer.h>
+
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+
+static kmem_zone_t *xfs_buf_zone;
+
+#ifdef XFS_BUF_LOCK_TRACKING
+# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
+# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
+# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
+#else
+# define XB_SET_OWNER(bp)	do { } while (0)
+# define XB_CLEAR_OWNER(bp)	do { } while (0)
+# define XB_GET_OWNER(bp)	do { } while (0)
+#endif
+
+#define xb_to_gfp(flags) \
+	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
+
+
+static inline int
+xfs_buf_is_vmapped(
+	struct xfs_buf	*bp)
+{
+	/*
+	 * Return true if the buffer is vmapped.
+	 *
+	 * b_addr is null if the buffer is not mapped, but the code is clever
+	 * enough to know it doesn't have to map a single page, so the check has
+	 * to be both for b_addr and bp->b_page_count > 1.
+	 */
+	return bp->b_addr && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+	struct xfs_buf	*bp)
+{
+	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+	struct xfs_buf	*bp)
+{
+	ASSERT(xfs_buf_islocked(bp));
+
+	bp->b_flags |= XBF_STALE;
+
+	/*
+	 * Clear the delwri status so that a delwri queue walker will not
+	 * flush this buffer to disk now that it is stale. The delwri queue has
+	 * a reference to the buffer, so this is safe to do.
+	 */
+	bp->b_flags &= ~_XBF_DELWRI_Q;
+
+	spin_lock(&bp->b_lock);
+	atomic_set(&bp->b_lru_ref, 0);
+	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
+	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+		atomic_dec(&bp->b_hold);
+
+	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	spin_unlock(&bp->b_lock);
+}
+
+static int
+xfs_buf_get_maps(
+	struct xfs_buf		*bp,
+	int			map_count)
+{
+	ASSERT(bp->b_maps == NULL);
+	bp->b_map_count = map_count;
+
+	if (map_count == 1) {
+		bp->b_maps = &bp->__b_map;
+		return 0;
+	}
+
+	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
+				KM_NOFS);
+	if (!bp->b_maps)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ *	Frees b_pages if it was allocated.
+ */
+static void
+xfs_buf_free_maps(
+	struct xfs_buf	*bp)
+{
+	if (bp->b_maps != &bp->__b_map) {
+		kmem_free(bp->b_maps);
+		bp->b_maps = NULL;
+	}
+}
+
+struct xfs_buf *
+_xfs_buf_alloc(
+	struct xfs_buftarg	*target,
+	struct xfs_buf_map	*map,
+	int			nmaps,
+	xfs_buf_flags_t		flags)
+{
+	struct xfs_buf		*bp;
+	int			error;
+	int			i;
+
+	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
+	if (unlikely(!bp))
+		return NULL;
+
+	/*
+	 * We don't want certain flags to appear in b_flags unless they are
+	 * specifically set by later operations on the buffer.
+	 */
+	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
+
+	atomic_set(&bp->b_hold, 1);
+	atomic_set(&bp->b_lru_ref, 1);
+	init_completion(&bp->b_iowait);
+	INIT_LIST_HEAD(&bp->b_lru);
+	INIT_LIST_HEAD(&bp->b_list);
+	RB_CLEAR_NODE(&bp->b_rbnode);
+	sema_init(&bp->b_sema, 0); /* held, no waiters */
+	spin_lock_init(&bp->b_lock);
+	XB_SET_OWNER(bp);
+	bp->b_target = target;
+	bp->b_flags = flags;
+
+	/*
+	 * Set length and io_length to the same value initially.
+	 * I/O routines should use io_length, which will be the same in
+	 * most cases but may be reset (e.g. XFS recovery).
+	 */
+	error = xfs_buf_get_maps(bp, nmaps);
+	if (error)  {
+		kmem_zone_free(xfs_buf_zone, bp);
+		return NULL;
+	}
+
+	bp->b_bn = map[0].bm_bn;
+	bp->b_length = 0;
+	for (i = 0; i < nmaps; i++) {
+		bp->b_maps[i].bm_bn = map[i].bm_bn;
+		bp->b_maps[i].bm_len = map[i].bm_len;
+		bp->b_length += map[i].bm_len;
+	}
+	bp->b_io_length = bp->b_length;
+
+	atomic_set(&bp->b_pin_count, 0);
+	init_waitqueue_head(&bp->b_waiters);
+
+	XFS_STATS_INC(xb_create);
+	trace_xfs_buf_init(bp, _RET_IP_);
+
+	return bp;
+}
+
+/*
+ *	Allocate a page array capable of holding a specified number
+ *	of pages, and point the page buf at it.
+ */
+STATIC int
+_xfs_buf_get_pages(
+	xfs_buf_t		*bp,
+	int			page_count)
+{
+	/* Make sure that we have a page list */
+	if (bp->b_pages == NULL) {
+		bp->b_page_count = page_count;
+		if (page_count <= XB_PAGES) {
+			bp->b_pages = bp->b_page_array;
+		} else {
+			bp->b_pages = kmem_alloc(sizeof(struct page *) *
+						 page_count, KM_NOFS);
+			if (bp->b_pages == NULL)
+				return -ENOMEM;
+		}
+		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
+	}
+	return 0;
+}
+
+/*
+ *	Frees b_pages if it was allocated.
+ */
+STATIC void
+_xfs_buf_free_pages(
+	xfs_buf_t	*bp)
+{
+	if (bp->b_pages != bp->b_page_array) {
+		kmem_free(bp->b_pages);
+		bp->b_pages = NULL;
+	}
+}
+
+/*
+ *	Releases the specified buffer.
+ *
+ * 	The modification state of any associated pages is left unchanged.
+ * 	The buffer must not be on any hash - use xfs_buf_rele instead for
+ * 	hashed and refcounted buffers
+ */
+void
+xfs_buf_free(
+	xfs_buf_t		*bp)
+{
+	trace_xfs_buf_free(bp, _RET_IP_);
+
+	ASSERT(list_empty(&bp->b_lru));
+
+	if (bp->b_flags & _XBF_PAGES) {
+		uint		i;
+
+		if (xfs_buf_is_vmapped(bp))
+			vm_unmap_ram(bp->b_addr - bp->b_offset,
+					bp->b_page_count);
+
+		for (i = 0; i < bp->b_page_count; i++) {
+			struct page	*page = bp->b_pages[i];
+
+			__free_page(page);
+		}
+	} else if (bp->b_flags & _XBF_KMEM)
+		kmem_free(bp->b_addr);
+	_xfs_buf_free_pages(bp);
+	xfs_buf_free_maps(bp);
+	kmem_zone_free(xfs_buf_zone, bp);
+}
+
+/*
+ * Allocates all the pages for buffer in question and builds it's page list.
+ */
+STATIC int
+xfs_buf_allocate_memory(
+	xfs_buf_t		*bp,
+	uint			flags)
+{
+	size_t			size;
+	size_t			nbytes, offset;
+	gfp_t			gfp_mask = xb_to_gfp(flags);
+	unsigned short		page_count, i;
+	xfs_off_t		start, end;
+	int			error;
+
+	/*
+	 * for buffers that are contained within a single page, just allocate
+	 * the memory from the heap - there's no need for the complexity of
+	 * page arrays to keep allocation down to order 0.
+	 */
+	size = BBTOB(bp->b_length);
+	if (size < PAGE_SIZE) {
+		bp->b_addr = kmem_alloc(size, KM_NOFS);
+		if (!bp->b_addr) {
+			/* low memory - use alloc_page loop instead */
+			goto use_alloc_page;
+		}
+
+		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
+		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
+			/* b_addr spans two pages - use alloc_page instead */
+			kmem_free(bp->b_addr);
+			bp->b_addr = NULL;
+			goto use_alloc_page;
+		}
+		bp->b_offset = offset_in_page(bp->b_addr);
+		bp->b_pages = bp->b_page_array;
+		bp->b_pages[0] = virt_to_page(bp->b_addr);
+		bp->b_page_count = 1;
+		bp->b_flags |= _XBF_KMEM;
+		return 0;
+	}
+
+use_alloc_page:
+	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
+	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
+								>> PAGE_SHIFT;
+	page_count = end - start;
+	error = _xfs_buf_get_pages(bp, page_count);
+	if (unlikely(error))
+		return error;
+
+	offset = bp->b_offset;
+	bp->b_flags |= _XBF_PAGES;
+
+	for (i = 0; i < bp->b_page_count; i++) {
+		struct page	*page;
+		uint		retries = 0;
+retry:
+		page = alloc_page(gfp_mask);
+		if (unlikely(page == NULL)) {
+			if (flags & XBF_READ_AHEAD) {
+				bp->b_page_count = i;
+				error = -ENOMEM;
+				goto out_free_pages;
+			}
+
+			/*
+			 * This could deadlock.
+			 *
+			 * But until all the XFS lowlevel code is revamped to
+			 * handle buffer allocation failures we can't do much.
+			 */
+			if (!(++retries % 100))
+				xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
+					__func__, gfp_mask);
+
+			XFS_STATS_INC(xb_page_retries);
+			congestion_wait(BLK_RW_ASYNC, HZ/50);
+			goto retry;
+		}
+
+		XFS_STATS_INC(xb_page_found);
+
+		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
+		size -= nbytes;
+		bp->b_pages[i] = page;
+		offset = 0;
+	}
+	return 0;
+
+out_free_pages:
+	for (i = 0; i < bp->b_page_count; i++)
+		__free_page(bp->b_pages[i]);
+	return error;
+}
+
+/*
+ *	Map buffer into kernel address-space if necessary.
+ */
+STATIC int
+_xfs_buf_map_pages(
+	xfs_buf_t		*bp,
+	uint			flags)
+{
+	ASSERT(bp->b_flags & _XBF_PAGES);
+	if (bp->b_page_count == 1) {
+		/* A single page buffer is always mappable */
+		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
+	} else if (flags & XBF_UNMAPPED) {
+		bp->b_addr = NULL;
+	} else {
+		int retried = 0;
+		unsigned noio_flag;
+
+		/*
+		 * vm_map_ram() will allocate auxillary structures (e.g.
+		 * pagetables) with GFP_KERNEL, yet we are likely to be under
+		 * GFP_NOFS context here. Hence we need to tell memory reclaim
+		 * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+		 * memory reclaim re-entering the filesystem here and
+		 * potentially deadlocking.
+		 */
+		noio_flag = memalloc_noio_save();
+		do {
+			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+						-1, PAGE_KERNEL);
+			if (bp->b_addr)
+				break;
+			vm_unmap_aliases();
+		} while (retried++ <= 1);
+		memalloc_noio_restore(noio_flag);
+
+		if (!bp->b_addr)
+			return -ENOMEM;
+		bp->b_addr += bp->b_offset;
+	}
+
+	return 0;
+}
+
+/*
+ *	Finding and Reading Buffers
+ */
+
+/*
+ *	Look up, and creates if absent, a lockable buffer for
+ *	a given range of an inode.  The buffer is returned
+ *	locked.	No I/O is implied by this call.
+ */
+xfs_buf_t *
+_xfs_buf_find(
+	struct xfs_buftarg	*btp,
+	struct xfs_buf_map	*map,
+	int			nmaps,
+	xfs_buf_flags_t		flags,
+	xfs_buf_t		*new_bp)
+{
+	size_t			numbytes;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	xfs_buf_t		*bp;
+	xfs_daddr_t		blkno = map[0].bm_bn;
+	xfs_daddr_t		eofs;
+	int			numblks = 0;
+	int			i;
+
+	for (i = 0; i < nmaps; i++)
+		numblks += map[i].bm_len;
+	numbytes = BBTOB(numblks);
+
+	/* Check for IOs smaller than the sector size / not sector aligned */
+	ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+
+	/*
+	 * Corrupted block numbers can get through to here, unfortunately, so we
+	 * have to check that the buffer falls within the filesystem bounds.
+	 */
+	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
+	if (blkno < 0 || blkno >= eofs) {
+		/*
+		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
+		 * but none of the higher level infrastructure supports
+		 * returning a specific error on buffer lookup failures.
+		 */
+		xfs_alert(btp->bt_mount,
+			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
+			  __func__, blkno, eofs);
+		WARN_ON(1);
+		return NULL;
+	}
+
+	/* get tree root */
+	pag = xfs_perag_get(btp->bt_mount,
+				xfs_daddr_to_agno(btp->bt_mount, blkno));
+
+	/* walk tree */
+	spin_lock(&pag->pag_buf_lock);
+	rbp = &pag->pag_buf_tree.rb_node;
+	parent = NULL;
+	bp = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+		if (blkno < bp->b_bn)
+			rbp = &(*rbp)->rb_left;
+		else if (blkno > bp->b_bn)
+			rbp = &(*rbp)->rb_right;
+		else {
+			/*
+			 * found a block number match. If the range doesn't
+			 * match, the only way this is allowed is if the buffer
+			 * in the cache is stale and the transaction that made
+			 * it stale has not yet committed. i.e. we are
+			 * reallocating a busy extent. Skip this buffer and
+			 * continue searching to the right for an exact match.
+			 */
+			if (bp->b_length != numblks) {
+				ASSERT(bp->b_flags & XBF_STALE);
+				rbp = &(*rbp)->rb_right;
+				continue;
+			}
+			atomic_inc(&bp->b_hold);
+			goto found;
+		}
+	}
+
+	/* No match found */
+	if (new_bp) {
+		rb_link_node(&new_bp->b_rbnode, parent, rbp);
+		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+		/* the buffer keeps the perag reference until it is freed */
+		new_bp->b_pag = pag;
+		spin_unlock(&pag->pag_buf_lock);
+	} else {
+		XFS_STATS_INC(xb_miss_locked);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
+	}
+	return new_bp;
+
+found:
+	spin_unlock(&pag->pag_buf_lock);
+	xfs_perag_put(pag);
+
+	if (!xfs_buf_trylock(bp)) {
+		if (flags & XBF_TRYLOCK) {
+			xfs_buf_rele(bp);
+			XFS_STATS_INC(xb_busy_locked);
+			return NULL;
+		}
+		xfs_buf_lock(bp);
+		XFS_STATS_INC(xb_get_locked_waited);
+	}
+
+	/*
+	 * if the buffer is stale, clear all the external state associated with
+	 * it. We need to keep flags such as how we allocated the buffer memory
+	 * intact here.
+	 */
+	if (bp->b_flags & XBF_STALE) {
+		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+		ASSERT(bp->b_iodone == NULL);
+		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+		bp->b_ops = NULL;
+	}
+
+	trace_xfs_buf_find(bp, flags, _RET_IP_);
+	XFS_STATS_INC(xb_get_locked);
+	return bp;
+}
+
+/*
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
+ */
+struct xfs_buf *
+xfs_buf_get_map(
+	struct xfs_buftarg	*target,
+	struct xfs_buf_map	*map,
+	int			nmaps,
+	xfs_buf_flags_t		flags)
+{
+	struct xfs_buf		*bp;
+	struct xfs_buf		*new_bp;
+	int			error = 0;
+
+	bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
+	if (likely(bp))
+		goto found;
+
+	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
+	if (unlikely(!new_bp))
+		return NULL;
+
+	error = xfs_buf_allocate_memory(new_bp, flags);
+	if (error) {
+		xfs_buf_free(new_bp);
+		return NULL;
+	}
+
+	bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
+	if (!bp) {
+		xfs_buf_free(new_bp);
+		return NULL;
+	}
+
+	if (bp != new_bp)
+		xfs_buf_free(new_bp);
+
+found:
+	if (!bp->b_addr) {
+		error = _xfs_buf_map_pages(bp, flags);
+		if (unlikely(error)) {
+			xfs_warn(target->bt_mount,
+				"%s: failed to map pagesn", __func__);
+			xfs_buf_relse(bp);
+			return NULL;
+		}
+	}
+
+	XFS_STATS_INC(xb_get);
+	trace_xfs_buf_get(bp, flags, _RET_IP_);
+	return bp;
+}
+
+STATIC int
+_xfs_buf_read(
+	xfs_buf_t		*bp,
+	xfs_buf_flags_t		flags)
+{
+	ASSERT(!(flags & XBF_WRITE));
+	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
+
+	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
+	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
+
+	if (flags & XBF_ASYNC) {
+		xfs_buf_submit(bp);
+		return 0;
+	}
+	return xfs_buf_submit_wait(bp);
+}
+
+xfs_buf_t *
+xfs_buf_read_map(
+	struct xfs_buftarg	*target,
+	struct xfs_buf_map	*map,
+	int			nmaps,
+	xfs_buf_flags_t		flags,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+
+	flags |= XBF_READ;
+
+	bp = xfs_buf_get_map(target, map, nmaps, flags);
+	if (bp) {
+		trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+		if (!XFS_BUF_ISDONE(bp)) {
+			XFS_STATS_INC(xb_get_read);
+			bp->b_ops = ops;
+			_xfs_buf_read(bp, flags);
+		} else if (flags & XBF_ASYNC) {
+			/*
+			 * Read ahead call which is already satisfied,
+			 * drop the buffer
+			 */
+			xfs_buf_relse(bp);
+			return NULL;
+		} else {
+			/* We do not want read in the flags */
+			bp->b_flags &= ~XBF_READ;
+		}
+	}
+
+	return bp;
+}
+
+/*
+ *	If we are not low on memory then do the readahead in a deadlock
+ *	safe manner.
+ */
+void
+xfs_buf_readahead_map(
+	struct xfs_buftarg	*target,
+	struct xfs_buf_map	*map,
+	int			nmaps,
+	const struct xfs_buf_ops *ops)
+{
+	if (bdi_read_congested(target->bt_bdi))
+		return;
+
+	xfs_buf_read_map(target, map, nmaps,
+		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
+}
+
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+int
+xfs_buf_read_uncached(
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		daddr,
+	size_t			numblks,
+	int			flags,
+	struct xfs_buf		**bpp,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+
+	*bpp = NULL;
+
+	bp = xfs_buf_get_uncached(target, numblks, flags);
+	if (!bp)
+		return -ENOMEM;
+
+	/* set up the buffer for a read IO */
+	ASSERT(bp->b_map_count == 1);
+	bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
+	bp->b_maps[0].bm_bn = daddr;
+	bp->b_flags |= XBF_READ;
+	bp->b_ops = ops;
+
+	xfs_buf_submit_wait(bp);
+	if (bp->b_error) {
+		int	error = bp->b_error;
+		xfs_buf_relse(bp);
+		return error;
+	}
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Return a buffer allocated as an empty buffer and associated to external
+ * memory via xfs_buf_associate_memory() back to it's empty state.
+ */
+void
+xfs_buf_set_empty(
+	struct xfs_buf		*bp,
+	size_t			numblks)
+{
+	if (bp->b_pages)
+		_xfs_buf_free_pages(bp);
+
+	bp->b_pages = NULL;
+	bp->b_page_count = 0;
+	bp->b_addr = NULL;
+	bp->b_length = numblks;
+	bp->b_io_length = numblks;
+
+	ASSERT(bp->b_map_count == 1);
+	bp->b_bn = XFS_BUF_DADDR_NULL;
+	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
+	bp->b_maps[0].bm_len = bp->b_length;
+}
+
+static inline struct page *
+mem_to_page(
+	void			*addr)
+{
+	if ((!is_vmalloc_addr(addr))) {
+		return virt_to_page(addr);
+	} else {
+		return vmalloc_to_page(addr);
+	}
+}
+
+int
+xfs_buf_associate_memory(
+	xfs_buf_t		*bp,
+	void			*mem,
+	size_t			len)
+{
+	int			rval;
+	int			i = 0;
+	unsigned long		pageaddr;
+	unsigned long		offset;
+	size_t			buflen;
+	int			page_count;
+
+	pageaddr = (unsigned long)mem & PAGE_MASK;
+	offset = (unsigned long)mem - pageaddr;
+	buflen = PAGE_ALIGN(len + offset);
+	page_count = buflen >> PAGE_SHIFT;
+
+	/* Free any previous set of page pointers */
+	if (bp->b_pages)
+		_xfs_buf_free_pages(bp);
+
+	bp->b_pages = NULL;
+	bp->b_addr = mem;
+
+	rval = _xfs_buf_get_pages(bp, page_count);
+	if (rval)
+		return rval;
+
+	bp->b_offset = offset;
+
+	for (i = 0; i < bp->b_page_count; i++) {
+		bp->b_pages[i] = mem_to_page((void *)pageaddr);
+		pageaddr += PAGE_SIZE;
+	}
+
+	bp->b_io_length = BTOBB(len);
+	bp->b_length = BTOBB(buflen);
+
+	return 0;
+}
+
+xfs_buf_t *
+xfs_buf_get_uncached(
+	struct xfs_buftarg	*target,
+	size_t			numblks,
+	int			flags)
+{
+	unsigned long		page_count;
+	int			error, i;
+	struct xfs_buf		*bp;
+	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
+
+	bp = _xfs_buf_alloc(target, &map, 1, 0);
+	if (unlikely(bp == NULL))
+		goto fail;
+
+	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
+	error = _xfs_buf_get_pages(bp, page_count);
+	if (error)
+		goto fail_free_buf;
+
+	for (i = 0; i < page_count; i++) {
+		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
+		if (!bp->b_pages[i])
+			goto fail_free_mem;
+	}
+	bp->b_flags |= _XBF_PAGES;
+
+	error = _xfs_buf_map_pages(bp, 0);
+	if (unlikely(error)) {
+		xfs_warn(target->bt_mount,
+			"%s: failed to map pages", __func__);
+		goto fail_free_mem;
+	}
+
+	trace_xfs_buf_get_uncached(bp, _RET_IP_);
+	return bp;
+
+ fail_free_mem:
+	while (--i >= 0)
+		__free_page(bp->b_pages[i]);
+	_xfs_buf_free_pages(bp);
+ fail_free_buf:
+	xfs_buf_free_maps(bp);
+	kmem_zone_free(xfs_buf_zone, bp);
+ fail:
+	return NULL;
+}
+
+/*
+ *	Increment reference count on buffer, to hold the buffer concurrently
+ *	with another thread which may release (free) the buffer asynchronously.
+ *	Must hold the buffer already to call this function.
+ */
+void
+xfs_buf_hold(
+	xfs_buf_t		*bp)
+{
+	trace_xfs_buf_hold(bp, _RET_IP_);
+	atomic_inc(&bp->b_hold);
+}
+
+/*
+ *	Releases a hold on the specified buffer.  If the
+ *	the hold count is 1, calls xfs_buf_free.
+ */
+void
+xfs_buf_rele(
+	xfs_buf_t		*bp)
+{
+	struct xfs_perag	*pag = bp->b_pag;
+
+	trace_xfs_buf_rele(bp, _RET_IP_);
+
+	if (!pag) {
+		ASSERT(list_empty(&bp->b_lru));
+		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
+		if (atomic_dec_and_test(&bp->b_hold))
+			xfs_buf_free(bp);
+		return;
+	}
+
+	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
+	ASSERT(atomic_read(&bp->b_hold) > 0);
+	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
+		spin_lock(&bp->b_lock);
+		if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+			/*
+			 * If the buffer is added to the LRU take a new
+			 * reference to the buffer for the LRU and clear the
+			 * (now stale) dispose list state flag
+			 */
+			if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+				bp->b_state &= ~XFS_BSTATE_DISPOSE;
+				atomic_inc(&bp->b_hold);
+			}
+			spin_unlock(&bp->b_lock);
+			spin_unlock(&pag->pag_buf_lock);
+		} else {
+			/*
+			 * most of the time buffers will already be removed from
+			 * the LRU, so optimise that case by checking for the
+			 * XFS_BSTATE_DISPOSE flag indicating the last list the
+			 * buffer was on was the disposal list
+			 */
+			if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+				list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+			} else {
+				ASSERT(list_empty(&bp->b_lru));
+			}
+			spin_unlock(&bp->b_lock);
+
+			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+			spin_unlock(&pag->pag_buf_lock);
+			xfs_perag_put(pag);
+			xfs_buf_free(bp);
+		}
+	}
+}
+
+
+/*
+ *	Lock a buffer object, if it is not already locked.
+ *
+ *	If we come across a stale, pinned, locked buffer, we know that we are
+ *	being asked to lock a buffer that has been reallocated. Because it is
+ *	pinned, we know that the log has not been pushed to disk and hence it
+ *	will still be locked.  Rather than continuing to have trylock attempts
+ *	fail until someone else pushes the log, push it ourselves before
+ *	returning.  This means that the xfsaild will not get stuck trying
+ *	to push on stale inode buffers.
+ */
+int
+xfs_buf_trylock(
+	struct xfs_buf		*bp)
+{
+	int			locked;
+
+	locked = down_trylock(&bp->b_sema) == 0;
+	if (locked)
+		XB_SET_OWNER(bp);
+
+	trace_xfs_buf_trylock(bp, _RET_IP_);
+	return locked;
+}
+
+/*
+ *	Lock a buffer object.
+ *
+ *	If we come across a stale, pinned, locked buffer, we know that we
+ *	are being asked to lock a buffer that has been reallocated. Because
+ *	it is pinned, we know that the log has not been pushed to disk and
+ *	hence it will still be locked. Rather than sleeping until someone
+ *	else pushes the log, push it ourselves before trying to get the lock.
+ */
+void
+xfs_buf_lock(
+	struct xfs_buf		*bp)
+{
+	trace_xfs_buf_lock(bp, _RET_IP_);
+
+	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+		xfs_log_force(bp->b_target->bt_mount, 0);
+	down(&bp->b_sema);
+	XB_SET_OWNER(bp);
+
+	trace_xfs_buf_lock_done(bp, _RET_IP_);
+}
+
+void
+xfs_buf_unlock(
+	struct xfs_buf		*bp)
+{
+	XB_CLEAR_OWNER(bp);
+	up(&bp->b_sema);
+
+	trace_xfs_buf_unlock(bp, _RET_IP_);
+}
+
+STATIC void
+xfs_buf_wait_unpin(
+	xfs_buf_t		*bp)
+{
+	DECLARE_WAITQUEUE	(wait, current);
+
+	if (atomic_read(&bp->b_pin_count) == 0)
+		return;
+
+	add_wait_queue(&bp->b_waiters, &wait);
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&bp->b_pin_count) == 0)
+			break;
+		io_schedule();
+	}
+	remove_wait_queue(&bp->b_waiters, &wait);
+	set_current_state(TASK_RUNNING);
+}
+
+/*
+ *	Buffer Utility Routines
+ */
+
+void
+xfs_buf_ioend(
+	struct xfs_buf	*bp)
+{
+	bool		read = bp->b_flags & XBF_READ;
+
+	trace_xfs_buf_iodone(bp, _RET_IP_);
+
+	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+
+	/*
+	 * Pull in IO completion errors now. We are guaranteed to be running
+	 * single threaded, so we don't need the lock to read b_io_error.
+	 */
+	if (!bp->b_error && bp->b_io_error)
+		xfs_buf_ioerror(bp, bp->b_io_error);
+
+	/* Only validate buffers that were read without errors */
+	if (read && !bp->b_error && bp->b_ops) {
+		ASSERT(!bp->b_iodone);
+		bp->b_ops->verify_read(bp);
+	}
+
+	if (!bp->b_error)
+		bp->b_flags |= XBF_DONE;
+
+	if (bp->b_iodone)
+		(*(bp->b_iodone))(bp);
+	else if (bp->b_flags & XBF_ASYNC)
+		xfs_buf_relse(bp);
+	else
+		complete(&bp->b_iowait);
+}
+
+static void
+xfs_buf_ioend_work(
+	struct work_struct	*work)
+{
+	struct xfs_buf		*bp =
+		container_of(work, xfs_buf_t, b_ioend_work);
+
+	xfs_buf_ioend(bp);
+}
+
+void
+xfs_buf_ioend_async(
+	struct xfs_buf	*bp)
+{
+	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
+	queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
+}
+
+void
+xfs_buf_ioerror(
+	xfs_buf_t		*bp,
+	int			error)
+{
+	ASSERT(error <= 0 && error >= -1000);
+	bp->b_error = error;
+	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
+}
+
+void
+xfs_buf_ioerror_alert(
+	struct xfs_buf		*bp,
+	const char		*func)
+{
+	xfs_alert(bp->b_target->bt_mount,
+"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
+		(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
+}
+
+int
+xfs_bwrite(
+	struct xfs_buf		*bp)
+{
+	int			error;
+
+	ASSERT(xfs_buf_islocked(bp));
+
+	bp->b_flags |= XBF_WRITE;
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+			 XBF_WRITE_FAIL | XBF_DONE);
+
+	error = xfs_buf_submit_wait(bp);
+	if (error) {
+		xfs_force_shutdown(bp->b_target->bt_mount,
+				   SHUTDOWN_META_IO_ERROR);
+	}
+	return error;
+}
+
+STATIC void
+xfs_buf_bio_end_io(
+	struct bio		*bio,
+	int			error)
+{
+	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
+
+	/*
+	 * don't overwrite existing errors - otherwise we can lose errors on
+	 * buffers that require multiple bios to complete.
+	 */
+	if (error) {
+		spin_lock(&bp->b_lock);
+		if (!bp->b_io_error)
+			bp->b_io_error = error;
+		spin_unlock(&bp->b_lock);
+	}
+
+	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend_async(bp);
+	bio_put(bio);
+}
+
+static void
+xfs_buf_ioapply_map(
+	struct xfs_buf	*bp,
+	int		map,
+	int		*buf_offset,
+	int		*count,
+	int		rw)
+{
+	int		page_index;
+	int		total_nr_pages = bp->b_page_count;
+	int		nr_pages;
+	struct bio	*bio;
+	sector_t	sector =  bp->b_maps[map].bm_bn;
+	int		size;
+	int		offset;
+
+	total_nr_pages = bp->b_page_count;
+
+	/* skip the pages in the buffer before the start offset */
+	page_index = 0;
+	offset = *buf_offset;
+	while (offset >= PAGE_SIZE) {
+		page_index++;
+		offset -= PAGE_SIZE;
+	}
+
+	/*
+	 * Limit the IO size to the length of the current vector, and update the
+	 * remaining IO count for the next time around.
+	 */
+	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
+	*count -= size;
+	*buf_offset += size;
+
+next_chunk:
+	atomic_inc(&bp->b_io_remaining);
+	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
+	if (nr_pages > total_nr_pages)
+		nr_pages = total_nr_pages;
+
+	bio = bio_alloc(GFP_NOIO, nr_pages);
+	bio->bi_bdev = bp->b_target->bt_bdev;
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_end_io = xfs_buf_bio_end_io;
+	bio->bi_private = bp;
+
+
+	for (; size && nr_pages; nr_pages--, page_index++) {
+		int	rbytes, nbytes = PAGE_SIZE - offset;
+
+		if (nbytes > size)
+			nbytes = size;
+
+		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
+				      offset);
+		if (rbytes < nbytes)
+			break;
+
+		offset = 0;
+		sector += BTOBB(nbytes);
+		size -= nbytes;
+		total_nr_pages--;
+	}
+
+	if (likely(bio->bi_iter.bi_size)) {
+		if (xfs_buf_is_vmapped(bp)) {
+			flush_kernel_vmap_range(bp->b_addr,
+						xfs_buf_vmap_len(bp));
+		}
+		submit_bio(rw, bio);
+		if (size)
+			goto next_chunk;
+	} else {
+		/*
+		 * This is guaranteed not to be the last io reference count
+		 * because the caller (xfs_buf_submit) holds a count itself.
+		 */
+		atomic_dec(&bp->b_io_remaining);
+		xfs_buf_ioerror(bp, -EIO);
+		bio_put(bio);
+	}
+
+}
+
+STATIC void
+_xfs_buf_ioapply(
+	struct xfs_buf	*bp)
+{
+	struct blk_plug	plug;
+	int		rw;
+	int		offset;
+	int		size;
+	int		i;
+
+	/*
+	 * Make sure we capture only current IO errors rather than stale errors
+	 * left over from previous use of the buffer (e.g. failed readahead).
+	 */
+	bp->b_error = 0;
+
+	/*
+	 * Initialize the I/O completion workqueue if we haven't yet or the
+	 * submitter has not opted to specify a custom one.
+	 */
+	if (!bp->b_ioend_wq)
+		bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
+
+	if (bp->b_flags & XBF_WRITE) {
+		if (bp->b_flags & XBF_SYNCIO)
+			rw = WRITE_SYNC;
+		else
+			rw = WRITE;
+		if (bp->b_flags & XBF_FUA)
+			rw |= REQ_FUA;
+		if (bp->b_flags & XBF_FLUSH)
+			rw |= REQ_FLUSH;
+
+		/*
+		 * Run the write verifier callback function if it exists. If
+		 * this function fails it will mark the buffer with an error and
+		 * the IO should not be dispatched.
+		 */
+		if (bp->b_ops) {
+			bp->b_ops->verify_write(bp);
+			if (bp->b_error) {
+				xfs_force_shutdown(bp->b_target->bt_mount,
+						   SHUTDOWN_CORRUPT_INCORE);
+				return;
+			}
+		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
+			struct xfs_mount *mp = bp->b_target->bt_mount;
+
+			/*
+			 * non-crc filesystems don't attach verifiers during
+			 * log recovery, so don't warn for such filesystems.
+			 */
+			if (xfs_sb_version_hascrc(&mp->m_sb)) {
+				xfs_warn(mp,
+					"%s: no ops on block 0x%llx/0x%x",
+					__func__, bp->b_bn, bp->b_length);
+				xfs_hex_dump(bp->b_addr, 64);
+				dump_stack();
+			}
+		}
+	} else if (bp->b_flags & XBF_READ_AHEAD) {
+		rw = READA;
+	} else {
+		rw = READ;
+	}
+
+	/* we only use the buffer cache for meta-data */
+	rw |= REQ_META;
+
+	/*
+	 * Walk all the vectors issuing IO on them. Set up the initial offset
+	 * into the buffer and the desired IO size before we start -
+	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
+	 * subsequent call.
+	 */
+	offset = bp->b_offset;
+	size = BBTOB(bp->b_io_length);
+	blk_start_plug(&plug);
+	for (i = 0; i < bp->b_map_count; i++) {
+		xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
+		if (bp->b_error)
+			break;
+		if (size <= 0)
+			break;	/* all done */
+	}
+	blk_finish_plug(&plug);
+}
+
+/*
+ * Asynchronous IO submission path. This transfers the buffer lock ownership and
+ * the current reference to the IO. It is not safe to reference the buffer after
+ * a call to this function unless the caller holds an additional reference
+ * itself.
+ */
+void
+xfs_buf_submit(
+	struct xfs_buf	*bp)
+{
+	trace_xfs_buf_submit(bp, _RET_IP_);
+
+	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+	ASSERT(bp->b_flags & XBF_ASYNC);
+
+	/* on shutdown we stale and complete the buffer immediately */
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		xfs_buf_ioerror(bp, -EIO);
+		bp->b_flags &= ~XBF_DONE;
+		xfs_buf_stale(bp);
+		xfs_buf_ioend(bp);
+		return;
+	}
+
+	if (bp->b_flags & XBF_WRITE)
+		xfs_buf_wait_unpin(bp);
+
+	/* clear the internal error state to avoid spurious errors */
+	bp->b_io_error = 0;
+
+	/*
+	 * The caller's reference is released during I/O completion.
+	 * This occurs some time after the last b_io_remaining reference is
+	 * released, so after we drop our Io reference we have to have some
+	 * other reference to ensure the buffer doesn't go away from underneath
+	 * us. Take a direct reference to ensure we have safe access to the
+	 * buffer until we are finished with it.
+	 */
+	xfs_buf_hold(bp);
+
+	/*
+	 * Set the count to 1 initially, this will stop an I/O completion
+	 * callout which happens before we have started all the I/O from calling
+	 * xfs_buf_ioend too early.
+	 */
+	atomic_set(&bp->b_io_remaining, 1);
+	_xfs_buf_ioapply(bp);
+
+	/*
+	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
+	 * reference we took above. If we drop it to zero, run completion so
+	 * that we don't return to the caller with completion still pending.
+	 */
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+		if (bp->b_error)
+			xfs_buf_ioend(bp);
+		else
+			xfs_buf_ioend_async(bp);
+	}
+
+	xfs_buf_rele(bp);
+	/* Note: it is not safe to reference bp now we've dropped our ref */
+}
+
+/*
+ * Synchronous buffer IO submission path, read or write.
+ */
+int
+xfs_buf_submit_wait(
+	struct xfs_buf	*bp)
+{
+	int		error;
+
+	trace_xfs_buf_submit_wait(bp, _RET_IP_);
+
+	ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
+
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		xfs_buf_ioerror(bp, -EIO);
+		xfs_buf_stale(bp);
+		bp->b_flags &= ~XBF_DONE;
+		return -EIO;
+	}
+
+	if (bp->b_flags & XBF_WRITE)
+		xfs_buf_wait_unpin(bp);
+
+	/* clear the internal error state to avoid spurious errors */
+	bp->b_io_error = 0;
+
+	/*
+	 * For synchronous IO, the IO does not inherit the submitters reference
+	 * count, nor the buffer lock. Hence we cannot release the reference we
+	 * are about to take until we've waited for all IO completion to occur,
+	 * including any xfs_buf_ioend_async() work that may be pending.
+	 */
+	xfs_buf_hold(bp);
+
+	/*
+	 * Set the count to 1 initially, this will stop an I/O completion
+	 * callout which happens before we have started all the I/O from calling
+	 * xfs_buf_ioend too early.
+	 */
+	atomic_set(&bp->b_io_remaining, 1);
+	_xfs_buf_ioapply(bp);
+
+	/*
+	 * make sure we run completion synchronously if it raced with us and is
+	 * already complete.
+	 */
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend(bp);
+
+	/* wait for completion before gathering the error from the buffer */
+	trace_xfs_buf_iowait(bp, _RET_IP_);
+	wait_for_completion(&bp->b_iowait);
+	trace_xfs_buf_iowait_done(bp, _RET_IP_);
+	error = bp->b_error;
+
+	/*
+	 * all done now, we can release the hold that keeps the buffer
+	 * referenced for the entire IO.
+	 */
+	xfs_buf_rele(bp);
+	return error;
+}
+
+xfs_caddr_t
+xfs_buf_offset(
+	xfs_buf_t		*bp,
+	size_t			offset)
+{
+	struct page		*page;
+
+	if (bp->b_addr)
+		return bp->b_addr + offset;
+
+	offset += bp->b_offset;
+	page = bp->b_pages[offset >> PAGE_SHIFT];
+	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+}
+
+/*
+ *	Move data into or out of a buffer.
+ */
+void
+xfs_buf_iomove(
+	xfs_buf_t		*bp,	/* buffer to process		*/
+	size_t			boff,	/* starting buffer offset	*/
+	size_t			bsize,	/* length to copy		*/
+	void			*data,	/* data address			*/
+	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
+{
+	size_t			bend;
+
+	bend = boff + bsize;
+	while (boff < bend) {
+		struct page	*page;
+		int		page_index, page_offset, csize;
+
+		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
+		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
+		page = bp->b_pages[page_index];
+		csize = min_t(size_t, PAGE_SIZE - page_offset,
+				      BBTOB(bp->b_io_length) - boff);
+
+		ASSERT((csize + page_offset) <= PAGE_SIZE);
+
+		switch (mode) {
+		case XBRW_ZERO:
+			memset(page_address(page) + page_offset, 0, csize);
+			break;
+		case XBRW_READ:
+			memcpy(data, page_address(page) + page_offset, csize);
+			break;
+		case XBRW_WRITE:
+			memcpy(page_address(page) + page_offset, data, csize);
+		}
+
+		boff += csize;
+		data += csize;
+	}
+}
+
+/*
+ *	Handling of buffer targets (buftargs).
+ */
+
+/*
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
+ */
+static enum lru_status
+xfs_buftarg_wait_rele(
+	struct list_head	*item,
+	struct list_lru_one	*lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+
+{
+	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head	*dispose = arg;
+
+	if (atomic_read(&bp->b_hold) > 1) {
+		/* need to wait, so skip it this pass */
+		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+		return LRU_SKIP;
+	}
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+
+	/*
+	 * clear the LRU reference count so the buffer doesn't get
+	 * ignored in xfs_buf_rele().
+	 */
+	atomic_set(&bp->b_lru_ref, 0);
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_lru_isolate_move(lru, item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
+}
+
+void
+xfs_wait_buftarg(
+	struct xfs_buftarg	*btp)
+{
+	LIST_HEAD(dispose);
+	int loop = 0;
+
+	/* loop until there is nothing left on the lru list. */
+	while (list_lru_count(&btp->bt_lru)) {
+		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+			      &dispose, LONG_MAX);
+
+		while (!list_empty(&dispose)) {
+			struct xfs_buf *bp;
+			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+			list_del_init(&bp->b_lru);
+			if (bp->b_flags & XBF_WRITE_FAIL) {
+				xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+					(long long)bp->b_bn);
+			}
+			xfs_buf_rele(bp);
+		}
+		if (loop++ != 0)
+			delay(100);
+	}
+}
+
+static enum lru_status
+xfs_buftarg_isolate(
+	struct list_head	*item,
+	struct list_lru_one	*lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+{
+	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head	*dispose = arg;
+
+	/*
+	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	/*
+	 * Decrement the b_lru_ref count unless the value is already
+	 * zero. If the value is already zero, we need to reclaim the
+	 * buffer, otherwise it gets another trip through the LRU.
+	 */
+	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+		spin_unlock(&bp->b_lock);
+		return LRU_ROTATE;
+	}
+
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_lru_isolate_move(lru, item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
+}
+
+static unsigned long
+xfs_buftarg_shrink_scan(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_buftarg	*btp = container_of(shrink,
+					struct xfs_buftarg, bt_shrinker);
+	LIST_HEAD(dispose);
+	unsigned long		freed;
+
+	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+				     xfs_buftarg_isolate, &dispose);
+
+	while (!list_empty(&dispose)) {
+		struct xfs_buf *bp;
+		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+		list_del_init(&bp->b_lru);
+		xfs_buf_rele(bp);
+	}
+
+	return freed;
+}
+
+static unsigned long
+xfs_buftarg_shrink_count(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_buftarg	*btp = container_of(shrink,
+					struct xfs_buftarg, bt_shrinker);
+	return list_lru_shrink_count(&btp->bt_lru, sc);
+}
+
+void
+xfs_free_buftarg(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*btp)
+{
+	unregister_shrinker(&btp->bt_shrinker);
+	list_lru_destroy(&btp->bt_lru);
+
+	if (mp->m_flags & XFS_MOUNT_BARRIER)
+		xfs_blkdev_issue_flush(btp);
+
+	kmem_free(btp);
+}
+
+int
+xfs_setsize_buftarg(
+	xfs_buftarg_t		*btp,
+	unsigned int		sectorsize)
+{
+	/* Set up metadata sector size info */
+	btp->bt_meta_sectorsize = sectorsize;
+	btp->bt_meta_sectormask = sectorsize - 1;
+
+	if (set_blocksize(btp->bt_bdev, sectorsize)) {
+		char name[BDEVNAME_SIZE];
+
+		bdevname(btp->bt_bdev, name);
+
+		xfs_warn(btp->bt_mount,
+			"Cannot set_blocksize to %u on device %s",
+			sectorsize, name);
+		return -EINVAL;
+	}
+
+	/* Set up device logical sector size mask */
+	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
+
+	return 0;
+}
+
+/*
+ * When allocating the initial buffer target we have not yet
+ * read in the superblock, so don't know what sized sectors
+ * are being used at this early stage.  Play safe.
+ */
+STATIC int
+xfs_setsize_buftarg_early(
+	xfs_buftarg_t		*btp,
+	struct block_device	*bdev)
+{
+	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
+}
+
+xfs_buftarg_t *
+xfs_alloc_buftarg(
+	struct xfs_mount	*mp,
+	struct block_device	*bdev)
+{
+	xfs_buftarg_t		*btp;
+
+	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
+
+	btp->bt_mount = mp;
+	btp->bt_dev =  bdev->bd_dev;
+	btp->bt_bdev = bdev;
+	btp->bt_bdi = blk_get_backing_dev_info(bdev);
+
+	if (xfs_setsize_buftarg_early(btp, bdev))
+		goto error;
+
+	if (list_lru_init(&btp->bt_lru))
+		goto error;
+
+	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
+	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
+	register_shrinker(&btp->bt_shrinker);
+	return btp;
+
+error:
+	kmem_free(btp);
+	return NULL;
+}
+
+/*
+ * Add a buffer to the delayed write list.
+ *
+ * This queues a buffer for writeout if it hasn't already been.  Note that
+ * neither this routine nor the buffer list submission functions perform
+ * any internal synchronization.  It is expected that the lists are thread-local
+ * to the callers.
+ *
+ * Returns true if we queued up the buffer, or false if it already had
+ * been on the buffer list.
+ */
+bool
+xfs_buf_delwri_queue(
+	struct xfs_buf		*bp,
+	struct list_head	*list)
+{
+	ASSERT(xfs_buf_islocked(bp));
+	ASSERT(!(bp->b_flags & XBF_READ));
+
+	/*
+	 * If the buffer is already marked delwri it already is queued up
+	 * by someone else for imediate writeout.  Just ignore it in that
+	 * case.
+	 */
+	if (bp->b_flags & _XBF_DELWRI_Q) {
+		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
+		return false;
+	}
+
+	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
+
+	/*
+	 * If a buffer gets written out synchronously or marked stale while it
+	 * is on a delwri list we lazily remove it. To do this, the other party
+	 * clears the  _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
+	 * It remains referenced and on the list.  In a rare corner case it
+	 * might get readded to a delwri list after the synchronous writeout, in
+	 * which case we need just need to re-add the flag here.
+	 */
+	bp->b_flags |= _XBF_DELWRI_Q;
+	if (list_empty(&bp->b_list)) {
+		atomic_inc(&bp->b_hold);
+		list_add_tail(&bp->b_list, list);
+	}
+
+	return true;
+}
+
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+	void		*priv,
+	struct list_head *a,
+	struct list_head *b)
+{
+	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
+	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
+	xfs_daddr_t		diff;
+
+	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+static int
+__xfs_buf_delwri_submit(
+	struct list_head	*buffer_list,
+	struct list_head	*io_list,
+	bool			wait)
+{
+	struct blk_plug		plug;
+	struct xfs_buf		*bp, *n;
+	int			pinned = 0;
+
+	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+		if (!wait) {
+			if (xfs_buf_ispinned(bp)) {
+				pinned++;
+				continue;
+			}
+			if (!xfs_buf_trylock(bp))
+				continue;
+		} else {
+			xfs_buf_lock(bp);
+		}
+
+		/*
+		 * Someone else might have written the buffer synchronously or
+		 * marked it stale in the meantime.  In that case only the
+		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
+		 * reference and remove it from the list here.
+		 */
+		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+			list_del_init(&bp->b_list);
+			xfs_buf_relse(bp);
+			continue;
+		}
+
+		list_move_tail(&bp->b_list, io_list);
+		trace_xfs_buf_delwri_split(bp, _RET_IP_);
+	}
+
+	list_sort(NULL, io_list, xfs_buf_cmp);
+
+	blk_start_plug(&plug);
+	list_for_each_entry_safe(bp, n, io_list, b_list) {
+		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+
+		/*
+		 * we do all Io submission async. This means if we need to wait
+		 * for IO completion we need to take an extra reference so the
+		 * buffer is still valid on the other side.
+		 */
+		if (wait)
+			xfs_buf_hold(bp);
+		else
+			list_del_init(&bp->b_list);
+
+		xfs_buf_submit(bp);
+	}
+	blk_finish_plug(&plug);
+
+	return pinned;
+}
+
+/*
+ * Write out a buffer list asynchronously.
+ *
+ * This will take the @buffer_list, write all non-locked and non-pinned buffers
+ * out and not wait for I/O completion on any of the buffers.  This interface
+ * is only safely useable for callers that can track I/O completion by higher
+ * level means, e.g. AIL pushing as the @buffer_list is consumed in this
+ * function.
+ */
+int
+xfs_buf_delwri_submit_nowait(
+	struct list_head	*buffer_list)
+{
+	LIST_HEAD		(io_list);
+	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+}
+
+/*
+ * Write out a buffer list synchronously.
+ *
+ * This will take the @buffer_list, write all buffers out and wait for I/O
+ * completion on all of the buffers. @buffer_list is consumed by the function,
+ * so callers must have some other way of tracking buffers if they require such
+ * functionality.
+ */
+int
+xfs_buf_delwri_submit(
+	struct list_head	*buffer_list)
+{
+	LIST_HEAD		(io_list);
+	int			error = 0, error2;
+	struct xfs_buf		*bp;
+
+	__xfs_buf_delwri_submit(buffer_list, &io_list, true);
+
+	/* Wait for IO to complete. */
+	while (!list_empty(&io_list)) {
+		bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+
+		list_del_init(&bp->b_list);
+
+		/* locking the buffer will wait for async IO completion. */
+		xfs_buf_lock(bp);
+		error2 = bp->b_error;
+		xfs_buf_relse(bp);
+		if (!error)
+			error = error2;
+	}
+
+	return error;
+}
+
+int __init
+xfs_buf_init(void)
+{
+	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
+						KM_ZONE_HWALIGN, NULL);
+	if (!xfs_buf_zone)
+		goto out;
+
+	return 0;
+
+ out:
+	return -ENOMEM;
+}
+
+void
+xfs_buf_terminate(void)
+{
+	kmem_zone_destroy(xfs_buf_zone);
+}
diff --git a/kernel/fs/xfs/xfs_buf.h b/kernel/fs/xfs/xfs_buf.h
new file mode 100644
index 000000000..75ff5d5a7
--- /dev/null
+++ b/kernel/fs/xfs/xfs_buf.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+#include <linux/list_lru.h>
+
+/*
+ *	Base types
+ */
+
+#define XFS_BUF_DADDR_NULL	((xfs_daddr_t) (-1LL))
+
+typedef enum {
+	XBRW_READ = 1,			/* transfer into target memory */
+	XBRW_WRITE = 2,			/* transfer from target memory */
+	XBRW_ZERO = 3,			/* Zero target memory */
+} xfs_buf_rw_t;
+
+#define XBF_READ	 (1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE	 (1 << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD	 (1 << 2) /* asynchronous read-ahead */
+#define XBF_ASYNC	 (1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE	 (1 << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL	 (1 << 24)/* async writes have failed on this buffer */
+
+/* I/O hints for the BIO layer */
+#define XBF_SYNCIO	 (1 << 10)/* treat this buffer as synchronous I/O */
+#define XBF_FUA		 (1 << 11)/* force cache write through mode */
+#define XBF_FLUSH	 (1 << 12)/* flush the disk cache before a write */
+
+/* flags used only as arguments to access routines */
+#define XBF_TRYLOCK	 (1 << 16)/* lock requested, but do not wait */
+#define XBF_UNMAPPED	 (1 << 17)/* do not map the buffer */
+
+/* flags used only internally */
+#define _XBF_PAGES	 (1 << 20)/* backed by refcounted pages */
+#define _XBF_KMEM	 (1 << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q	 (1 << 22)/* buffer on a delwri queue */
+#define _XBF_COMPOUND	 (1 << 23)/* compound buffer */
+
+typedef unsigned int xfs_buf_flags_t;
+
+#define XFS_BUF_FLAGS \
+	{ XBF_READ,		"READ" }, \
+	{ XBF_WRITE,		"WRITE" }, \
+	{ XBF_READ_AHEAD,	"READ_AHEAD" }, \
+	{ XBF_ASYNC,		"ASYNC" }, \
+	{ XBF_DONE,		"DONE" }, \
+	{ XBF_STALE,		"STALE" }, \
+	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
+	{ XBF_SYNCIO,		"SYNCIO" }, \
+	{ XBF_FUA,		"FUA" }, \
+	{ XBF_FLUSH,		"FLUSH" }, \
+	{ XBF_TRYLOCK,		"TRYLOCK" },	/* should never be set */\
+	{ XBF_UNMAPPED,		"UNMAPPED" },	/* ditto */\
+	{ _XBF_PAGES,		"PAGES" }, \
+	{ _XBF_KMEM,		"KMEM" }, \
+	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
+	{ _XBF_COMPOUND,	"COMPOUND" }
+
+
+/*
+ * Internal state flags.
+ */
+#define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
+
+/*
+ * The xfs_buftarg contains 2 notions of "sector size" -
+ *
+ * 1) The metadata sector size, which is the minimum unit and
+ *    alignment of IO which will be performed by metadata operations.
+ * 2) The device logical sector size
+ *
+ * The first is specified at mkfs time, and is stored on-disk in the
+ * superblock's sb_sectsize.
+ *
+ * The latter is derived from the underlying device, and controls direct IO
+ * alignment constraints.
+ */
+typedef struct xfs_buftarg {
+	dev_t			bt_dev;
+	struct block_device	*bt_bdev;
+	struct backing_dev_info	*bt_bdi;
+	struct xfs_mount	*bt_mount;
+	unsigned int		bt_meta_sectorsize;
+	size_t			bt_meta_sectormask;
+	size_t			bt_logical_sectorsize;
+	size_t			bt_logical_sectormask;
+
+	/* LRU control structures */
+	struct shrinker		bt_shrinker;
+	struct list_lru		bt_lru;
+} xfs_buftarg_t;
+
+struct xfs_buf;
+typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
+
+
+#define XB_PAGES	2
+
+struct xfs_buf_map {
+	xfs_daddr_t		bm_bn;	/* block number for I/O */
+	int			bm_len;	/* size of I/O */
+};
+
+#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
+	struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
+
+struct xfs_buf_ops {
+	void (*verify_read)(struct xfs_buf *);
+	void (*verify_write)(struct xfs_buf *);
+};
+
+typedef struct xfs_buf {
+	/*
+	 * first cacheline holds all the fields needed for an uncontended cache
+	 * hit to be fully processed. The semaphore straddles the cacheline
+	 * boundary, but the counter and lock sits on the first cacheline,
+	 * which is the only bit that is touched if we hit the semaphore
+	 * fast-path on locking.
+	 */
+	struct rb_node		b_rbnode;	/* rbtree node */
+	xfs_daddr_t		b_bn;		/* block number of buffer */
+	int			b_length;	/* size of buffer in BBs */
+	atomic_t		b_hold;		/* reference count */
+	atomic_t		b_lru_ref;	/* lru reclaim ref count */
+	xfs_buf_flags_t		b_flags;	/* status flags */
+	struct semaphore	b_sema;		/* semaphore for lockables */
+
+	/*
+	 * concurrent access to b_lru and b_lru_flags are protected by
+	 * bt_lru_lock and not by b_sema
+	 */
+	struct list_head	b_lru;		/* lru list */
+	spinlock_t		b_lock;		/* internal state lock */
+	unsigned int		b_state;	/* internal state flags */
+	int			b_io_error;	/* internal IO error state */
+	wait_queue_head_t	b_waiters;	/* unpin waiters */
+	struct list_head	b_list;
+	struct xfs_perag	*b_pag;		/* contains rbtree root */
+	xfs_buftarg_t		*b_target;	/* buffer target (device) */
+	void			*b_addr;	/* virtual address of buffer */
+	struct work_struct	b_ioend_work;
+	struct workqueue_struct	*b_ioend_wq;	/* I/O completion wq */
+	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
+	struct completion	b_iowait;	/* queue for I/O waiters */
+	void			*b_fspriv;
+	struct xfs_trans	*b_transp;
+	struct page		**b_pages;	/* array of page pointers */
+	struct page		*b_page_array[XB_PAGES]; /* inline pages */
+	struct xfs_buf_map	*b_maps;	/* compound buffer map */
+	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
+	int			b_map_count;
+	int			b_io_length;	/* IO size in BBs */
+	atomic_t		b_pin_count;	/* pin count */
+	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
+	unsigned int		b_page_count;	/* size of page array */
+	unsigned int		b_offset;	/* page offset in first page */
+	int			b_error;	/* error code on I/O */
+	const struct xfs_buf_ops	*b_ops;
+
+#ifdef XFS_BUF_LOCK_TRACKING
+	int			b_last_holder;
+#endif
+} xfs_buf_t;
+
+/* Finding and Reading Buffers */
+struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
+			      struct xfs_buf_map *map, int nmaps,
+			      xfs_buf_flags_t flags, struct xfs_buf *new_bp);
+
+static inline struct xfs_buf *
+xfs_incore(
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	xfs_buf_flags_t		flags)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return _xfs_buf_find(target, &map, 1, flags, NULL);
+}
+
+struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
+			       struct xfs_buf_map *map, int nmaps,
+			       xfs_buf_flags_t flags);
+
+static inline struct xfs_buf *
+xfs_buf_alloc(
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	xfs_buf_flags_t		flags)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return _xfs_buf_alloc(target, &map, 1, flags);
+}
+
+struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
+			       struct xfs_buf_map *map, int nmaps,
+			       xfs_buf_flags_t flags);
+struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
+			       struct xfs_buf_map *map, int nmaps,
+			       xfs_buf_flags_t flags,
+			       const struct xfs_buf_ops *ops);
+void xfs_buf_readahead_map(struct xfs_buftarg *target,
+			       struct xfs_buf_map *map, int nmaps,
+			       const struct xfs_buf_ops *ops);
+
+static inline struct xfs_buf *
+xfs_buf_get(
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	xfs_buf_flags_t		flags)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return xfs_buf_get_map(target, &map, 1, flags);
+}
+
+static inline struct xfs_buf *
+xfs_buf_read(
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	xfs_buf_flags_t		flags,
+	const struct xfs_buf_ops *ops)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return xfs_buf_read_map(target, &map, 1, flags, ops);
+}
+
+static inline void
+xfs_buf_readahead(
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	const struct xfs_buf_ops *ops)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return xfs_buf_readahead_map(target, &map, 1, ops);
+}
+
+struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
+void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
+int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
+
+struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+				int flags);
+int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
+			  size_t numblks, int flags, struct xfs_buf **bpp,
+			  const struct xfs_buf_ops *ops);
+void xfs_buf_hold(struct xfs_buf *bp);
+
+/* Releasing Buffers */
+extern void xfs_buf_free(xfs_buf_t *);
+extern void xfs_buf_rele(xfs_buf_t *);
+
+/* Locking and Unlocking Buffers */
+extern int xfs_buf_trylock(xfs_buf_t *);
+extern void xfs_buf_lock(xfs_buf_t *);
+extern void xfs_buf_unlock(xfs_buf_t *);
+#define xfs_buf_islocked(bp) \
+	((bp)->b_sema.count <= 0)
+
+/* Buffer Read and Write Routines */
+extern int xfs_bwrite(struct xfs_buf *bp);
+extern void xfs_buf_ioend(struct xfs_buf *bp);
+extern void xfs_buf_ioerror(xfs_buf_t *, int);
+extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
+extern void xfs_buf_submit(struct xfs_buf *bp);
+extern int xfs_buf_submit_wait(struct xfs_buf *bp);
+extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
+				xfs_buf_rw_t);
+#define xfs_buf_zero(bp, off, len) \
+	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
+
+/* Buffer Utility Routines */
+extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+
+/* Delayed Write Buffer Routines */
+extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+extern int xfs_buf_delwri_submit(struct list_head *);
+extern int xfs_buf_delwri_submit_nowait(struct list_head *);
+
+/* Buffer Daemon Setup Routines */
+extern int xfs_buf_init(void);
+extern void xfs_buf_terminate(void);
+
+#define XFS_BUF_ZEROFLAGS(bp) \
+	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
+			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
+			    XBF_WRITE_FAIL))
+
+void xfs_buf_stale(struct xfs_buf *bp);
+#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
+#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XBF_STALE)
+
+#define XFS_BUF_DONE(bp)	((bp)->b_flags |= XBF_DONE)
+#define XFS_BUF_UNDONE(bp)	((bp)->b_flags &= ~XBF_DONE)
+#define XFS_BUF_ISDONE(bp)	((bp)->b_flags & XBF_DONE)
+
+#define XFS_BUF_ASYNC(bp)	((bp)->b_flags |= XBF_ASYNC)
+#define XFS_BUF_UNASYNC(bp)	((bp)->b_flags &= ~XBF_ASYNC)
+#define XFS_BUF_ISASYNC(bp)	((bp)->b_flags & XBF_ASYNC)
+
+#define XFS_BUF_READ(bp)	((bp)->b_flags |= XBF_READ)
+#define XFS_BUF_UNREAD(bp)	((bp)->b_flags &= ~XBF_READ)
+#define XFS_BUF_ISREAD(bp)	((bp)->b_flags & XBF_READ)
+
+#define XFS_BUF_WRITE(bp)	((bp)->b_flags |= XBF_WRITE)
+#define XFS_BUF_UNWRITE(bp)	((bp)->b_flags &= ~XBF_WRITE)
+#define XFS_BUF_ISWRITE(bp)	((bp)->b_flags & XBF_WRITE)
+
+/*
+ * These macros use the IO block map rather than b_bn. b_bn is now really
+ * just for the buffer cache index for cached buffers. As IO does not use b_bn
+ * anymore, uncached buffers do not use b_bn at all and hence must modify the IO
+ * map directly. Uncached buffers are not allowed to be discontiguous, so this
+ * is safe to do.
+ *
+ * In future, uncached buffers will pass the block number directly to the io
+ * request function and hence these macros will go away at that point.
+ */
+#define XFS_BUF_ADDR(bp)		((bp)->b_maps[0].bm_bn)
+#define XFS_BUF_SET_ADDR(bp, bno)	((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
+
+static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
+{
+	atomic_set(&bp->b_lru_ref, lru_ref);
+}
+
+static inline int xfs_buf_ispinned(struct xfs_buf *bp)
+{
+	return atomic_read(&bp->b_pin_count);
+}
+
+static inline void xfs_buf_relse(xfs_buf_t *bp)
+{
+	xfs_buf_unlock(bp);
+	xfs_buf_rele(bp);
+}
+
+static inline int
+xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+	return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+				cksum_offset);
+}
+
+static inline void
+xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 cksum_offset);
+}
+
+/*
+ *	Handling of buftargs.
+ */
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+			struct block_device *);
+extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
+extern void xfs_wait_buftarg(xfs_buftarg_t *);
+extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
+
+#define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
+#define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
+
+#endif	/* __XFS_BUF_H__ */
diff --git a/kernel/fs/xfs/xfs_buf_item.c b/kernel/fs/xfs/xfs_buf_item.c
new file mode 100644
index 000000000..092d652bc
--- /dev/null
+++ b/kernel/fs/xfs/xfs_buf_item.c
@@ -0,0 +1,1155 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+
+
+kmem_zone_t	*xfs_buf_item_zone;
+
+static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_buf_log_item, bli_item);
+}
+
+STATIC void	xfs_buf_do_callbacks(struct xfs_buf *bp);
+
+static inline int
+xfs_buf_log_format_size(
+	struct xfs_buf_log_format *blfp)
+{
+	return offsetof(struct xfs_buf_log_format, blf_data_map) +
+			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+}
+
+/*
+ * This returns the number of log iovecs needed to log the
+ * given buf log item.
+ *
+ * It calculates this as 1 iovec for the buf log format structure
+ * and 1 for each stretch of non-contiguous chunks to be logged.
+ * Contiguous chunks are logged in a single iovec.
+ *
+ * If the XFS_BLI_STALE flag has been set, then log nothing.
+ */
+STATIC void
+xfs_buf_item_size_segment(
+	struct xfs_buf_log_item	*bip,
+	struct xfs_buf_log_format *blfp,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_buf		*bp = bip->bli_buf;
+	int			next_bit;
+	int			last_bit;
+
+	last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+	if (last_bit == -1)
+		return;
+
+	/*
+	 * initial count for a dirty buffer is 2 vectors - the format structure
+	 * and the first dirty region.
+	 */
+	*nvecs += 2;
+	*nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;
+
+	while (last_bit != -1) {
+		/*
+		 * This takes the bit number to start looking from and
+		 * returns the next set bit from there.  It returns -1
+		 * if there are no more bits set or the start bit is
+		 * beyond the end of the bitmap.
+		 */
+		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+					last_bit + 1);
+		/*
+		 * If we run out of bits, leave the loop,
+		 * else if we find a new set of bits bump the number of vecs,
+		 * else keep scanning the current set of bits.
+		 */
+		if (next_bit == -1) {
+			break;
+		} else if (next_bit != last_bit + 1) {
+			last_bit = next_bit;
+			(*nvecs)++;
+		} else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
+			   (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
+			    XFS_BLF_CHUNK)) {
+			last_bit = next_bit;
+			(*nvecs)++;
+		} else {
+			last_bit++;
+		}
+		*nbytes += XFS_BLF_CHUNK;
+	}
+}
+
+/*
+ * This returns the number of log iovecs needed to log the given buf log item.
+ *
+ * It calculates this as 1 iovec for the buf log format structure and 1 for each
+ * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
+ * in a single iovec.
+ *
+ * Discontiguous buffers need a format structure per region that that is being
+ * logged. This makes the changes in the buffer appear to log recovery as though
+ * they came from separate buffers, just like would occur if multiple buffers
+ * were used instead of a single discontiguous buffer. This enables
+ * discontiguous buffers to be in-memory constructs, completely transparent to
+ * what ends up on disk.
+ *
+ * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
+ * format structures.
+ */
+STATIC void
+xfs_buf_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	int			i;
+
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * The buffer is stale, so all we need to log
+		 * is the buf log format structure with the
+		 * cancel flag in it.
+		 */
+		trace_xfs_buf_item_size_stale(bip);
+		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+		*nvecs += bip->bli_format_count;
+		for (i = 0; i < bip->bli_format_count; i++) {
+			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
+		}
+		return;
+	}
+
+	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+
+	if (bip->bli_flags & XFS_BLI_ORDERED) {
+		/*
+		 * The buffer has been logged just to order it.
+		 * It is not being included in the transaction
+		 * commit, so no vectors are used at all.
+		 */
+		trace_xfs_buf_item_size_ordered(bip);
+		*nvecs = XFS_LOG_VEC_ORDERED;
+		return;
+	}
+
+	/*
+	 * the vector count is based on the number of buffer vectors we have
+	 * dirty bits in. This will only be greater than one when we have a
+	 * compound buffer with more than one segment dirty. Hence for compound
+	 * buffers we need to track which segment the dirty bits correspond to,
+	 * and when we move from one segment to the next increment the vector
+	 * count for the extra buf log format structure that will need to be
+	 * written.
+	 */
+	for (i = 0; i < bip->bli_format_count; i++) {
+		xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
+					  nvecs, nbytes);
+	}
+	trace_xfs_buf_item_size(bip);
+}
+
+static inline void
+xfs_buf_item_copy_iovec(
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp,
+	struct xfs_buf		*bp,
+	uint			offset,
+	int			first_bit,
+	uint			nbits)
+{
+	offset += first_bit * XFS_BLF_CHUNK;
+	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
+			xfs_buf_offset(bp, offset),
+			nbits * XFS_BLF_CHUNK);
+}
+
+static inline bool
+xfs_buf_item_straddle(
+	struct xfs_buf		*bp,
+	uint			offset,
+	int			next_bit,
+	int			last_bit)
+{
+	return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
+		(xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
+		 XFS_BLF_CHUNK);
+}
+
+static void
+xfs_buf_item_format_segment(
+	struct xfs_buf_log_item	*bip,
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp,
+	uint			offset,
+	struct xfs_buf_log_format *blfp)
+{
+	struct xfs_buf	*bp = bip->bli_buf;
+	uint		base_size;
+	int		first_bit;
+	int		last_bit;
+	int		next_bit;
+	uint		nbits;
+
+	/* copy the flags across from the base format item */
+	blfp->blf_flags = bip->__bli_format.blf_flags;
+
+	/*
+	 * Base size is the actual size of the ondisk structure - it reflects
+	 * the actual size of the dirty bitmap rather than the size of the in
+	 * memory structure.
+	 */
+	base_size = xfs_buf_log_format_size(blfp);
+
+	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
+		/*
+		 * If the map is not be dirty in the transaction, mark
+		 * the size as zero and do not advance the vector pointer.
+		 */
+		return;
+	}
+
+	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
+	blfp->blf_size = 1;
+
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * The buffer is stale, so all we need to log
+		 * is the buf log format structure with the
+		 * cancel flag in it.
+		 */
+		trace_xfs_buf_item_format_stale(bip);
+		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
+		return;
+	}
+
+
+	/*
+	 * Fill in an iovec for each set of contiguous chunks.
+	 */
+	last_bit = first_bit;
+	nbits = 1;
+	for (;;) {
+		/*
+		 * This takes the bit number to start looking from and
+		 * returns the next set bit from there.  It returns -1
+		 * if there are no more bits set or the start bit is
+		 * beyond the end of the bitmap.
+		 */
+		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+					(uint)last_bit + 1);
+		/*
+		 * If we run out of bits fill in the last iovec and get out of
+		 * the loop.  Else if we start a new set of bits then fill in
+		 * the iovec for the series we were looking at and start
+		 * counting the bits in the new one.  Else we're still in the
+		 * same set of bits so just keep counting and scanning.
+		 */
+		if (next_bit == -1) {
+			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+						first_bit, nbits);
+			blfp->blf_size++;
+			break;
+		} else if (next_bit != last_bit + 1 ||
+		           xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
+			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+						first_bit, nbits);
+			blfp->blf_size++;
+			first_bit = next_bit;
+			last_bit = next_bit;
+			nbits = 1;
+		} else {
+			last_bit++;
+			nbits++;
+		}
+	}
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given log buf item.  It fills the first entry with a buf log
+ * format structure, and the rest point to contiguous chunks
+ * within the buffer.
+ */
+STATIC void
+xfs_buf_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf		*bp = bip->bli_buf;
+	struct xfs_log_iovec	*vecp = NULL;
+	uint			offset = 0;
+	int			i;
+
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+	       (bip->bli_flags & XFS_BLI_STALE));
+	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
+
+
+	/*
+	 * If it is an inode buffer, transfer the in-memory state to the
+	 * format flags and clear the in-memory state.
+	 *
+	 * For buffer based inode allocation, we do not transfer
+	 * this state if the inode buffer allocation has not yet been committed
+	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
+	 * correct replay of the inode allocation.
+	 *
+	 * For icreate item based inode allocation, the buffers aren't written
+	 * to the journal during allocation, and hence we should always tag the
+	 * buffer as an inode buffer so that the correct unlinked list replay
+	 * occurs during recovery.
+	 */
+	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+		if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+		      xfs_log_item_in_current_chkpt(lip)))
+			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+	}
+
+	if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+							XFS_BLI_ORDERED) {
+		/*
+		 * The buffer has been logged just to order it.  It is not being
+		 * included in the transaction commit, so don't format it.
+		 */
+		trace_xfs_buf_item_format_ordered(bip);
+		return;
+	}
+
+	for (i = 0; i < bip->bli_format_count; i++) {
+		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+					    &bip->bli_formats[i]);
+		offset += bp->b_maps[i].bm_len;
+	}
+
+	/*
+	 * Check to make sure everything is consistent.
+	 */
+	trace_xfs_buf_item_format(bip);
+}
+
+/*
+ * This is called to pin the buffer associated with the buf log item in memory
+ * so it cannot be written out.
+ *
+ * We also always take a reference to the buffer log item here so that the bli
+ * is held while the item is pinned in memory. This means that we can
+ * unconditionally drop the reference count a transaction holds when the
+ * transaction is completed.
+ */
+STATIC void
+xfs_buf_item_pin(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+	       (bip->bli_flags & XFS_BLI_ORDERED) ||
+	       (bip->bli_flags & XFS_BLI_STALE));
+
+	trace_xfs_buf_item_pin(bip);
+
+	atomic_inc(&bip->bli_refcount);
+	atomic_inc(&bip->bli_buf->b_pin_count);
+}
+
+/*
+ * This is called to unpin the buffer associated with the buf log
+ * item which was previously pinned with a call to xfs_buf_item_pin().
+ *
+ * Also drop the reference to the buf item for the current transaction.
+ * If the XFS_BLI_STALE flag is set and we are the last reference,
+ * then free up the buf log item and unlock the buffer.
+ *
+ * If the remove flag is set we are called from uncommit in the
+ * forced-shutdown path.  If that is true and the reference count on
+ * the log item is going to drop to zero we need to free the item's
+ * descriptor in the transaction.
+ */
+STATIC void
+xfs_buf_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	xfs_buf_t	*bp = bip->bli_buf;
+	struct xfs_ail	*ailp = lip->li_ailp;
+	int		stale = bip->bli_flags & XFS_BLI_STALE;
+	int		freed;
+
+	ASSERT(bp->b_fspriv == bip);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	trace_xfs_buf_item_unpin(bip);
+
+	freed = atomic_dec_and_test(&bip->bli_refcount);
+
+	if (atomic_dec_and_test(&bp->b_pin_count))
+		wake_up_all(&bp->b_waiters);
+
+	if (freed && stale) {
+		ASSERT(bip->bli_flags & XFS_BLI_STALE);
+		ASSERT(xfs_buf_islocked(bp));
+		ASSERT(XFS_BUF_ISSTALE(bp));
+		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+
+		trace_xfs_buf_item_unpin_stale(bip);
+
+		if (remove) {
+			/*
+			 * If we are in a transaction context, we have to
+			 * remove the log item from the transaction as we are
+			 * about to release our reference to the buffer.  If we
+			 * don't, the unlock that occurs later in
+			 * xfs_trans_uncommit() will try to reference the
+			 * buffer which we no longer have a hold on.
+			 */
+			if (lip->li_desc)
+				xfs_trans_del_item(lip);
+
+			/*
+			 * Since the transaction no longer refers to the buffer,
+			 * the buffer should no longer refer to the transaction.
+			 */
+			bp->b_transp = NULL;
+		}
+
+		/*
+		 * If we get called here because of an IO error, we may
+		 * or may not have the item on the AIL. xfs_trans_ail_delete()
+		 * will take care of that situation.
+		 * xfs_trans_ail_delete() drops the AIL lock.
+		 */
+		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
+			xfs_buf_do_callbacks(bp);
+			bp->b_fspriv = NULL;
+			bp->b_iodone = NULL;
+		} else {
+			spin_lock(&ailp->xa_lock);
+			xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
+			xfs_buf_item_relse(bp);
+			ASSERT(bp->b_fspriv == NULL);
+		}
+		xfs_buf_relse(bp);
+	} else if (freed && remove) {
+		/*
+		 * There are currently two references to the buffer - the active
+		 * LRU reference and the buf log item. What we are about to do
+		 * here - simulate a failed IO completion - requires 3
+		 * references.
+		 *
+		 * The LRU reference is removed by the xfs_buf_stale() call. The
+		 * buf item reference is removed by the xfs_buf_iodone()
+		 * callback that is run by xfs_buf_do_callbacks() during ioend
+		 * processing (via the bp->b_iodone callback), and then finally
+		 * the ioend processing will drop the IO reference if the buffer
+		 * is marked XBF_ASYNC.
+		 *
+		 * Hence we need to take an additional reference here so that IO
+		 * completion processing doesn't free the buffer prematurely.
+		 */
+		xfs_buf_lock(bp);
+		xfs_buf_hold(bp);
+		bp->b_flags |= XBF_ASYNC;
+		xfs_buf_ioerror(bp, -EIO);
+		XFS_BUF_UNDONE(bp);
+		xfs_buf_stale(bp);
+		xfs_buf_ioend(bp);
+	}
+}
+
+/*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
+ * seconds so as to not spam logs too much on repeated detection of the same
+ * buffer being bad..
+ */
+
+static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+
+STATIC uint
+xfs_buf_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf		*bp = bip->bli_buf;
+	uint			rval = XFS_ITEM_SUCCESS;
+
+	if (xfs_buf_ispinned(bp))
+		return XFS_ITEM_PINNED;
+	if (!xfs_buf_trylock(bp)) {
+		/*
+		 * If we have just raced with a buffer being pinned and it has
+		 * been marked stale, we could end up stalling until someone else
+		 * issues a log force to unpin the stale buffer. Check for the
+		 * race condition here so xfsaild recognizes the buffer is pinned
+		 * and queues a log force to move it along.
+		 */
+		if (xfs_buf_ispinned(bp))
+			return XFS_ITEM_PINNED;
+		return XFS_ITEM_LOCKED;
+	}
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+
+	trace_xfs_buf_item_push(bip);
+
+	/* has a previous flush failed due to IO errors? */
+	if ((bp->b_flags & XBF_WRITE_FAIL) &&
+	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
+		xfs_warn(bp->b_target->bt_mount,
+"Failing async write on buffer block 0x%llx. Retrying async write.",
+			 (long long)bp->b_bn);
+	}
+
+	if (!xfs_buf_delwri_queue(bp, buffer_list))
+		rval = XFS_ITEM_FLUSHING;
+	xfs_buf_unlock(bp);
+	return rval;
+}
+
+/*
+ * Release the buffer associated with the buf log item.  If there is no dirty
+ * logged data associated with the buffer recorded in the buf log item, then
+ * free the buf log item and remove the reference to it in the buffer.
+ *
+ * This call ignores the recursion count.  It is only called when the buffer
+ * should REALLY be unlocked, regardless of the recursion count.
+ *
+ * We unconditionally drop the transaction's reference to the log item. If the
+ * item was logged, then another reference was taken when it was pinned, so we
+ * can safely drop the transaction reference now.  This also allows us to avoid
+ * potential races with the unpin code freeing the bli by not referencing the
+ * bli after we've dropped the reference count.
+ *
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
+ * if necessary but do not unlock the buffer.  This is for support of
+ * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
+ * free the item.
+ */
+STATIC void
+xfs_buf_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf		*bp = bip->bli_buf;
+	bool			clean;
+	bool			aborted;
+	int			flags;
+
+	/* Clear the buffer's association with this transaction. */
+	bp->b_transp = NULL;
+
+	/*
+	 * If this is a transaction abort, don't return early.  Instead, allow
+	 * the brelse to happen.  Normally it would be done for stale
+	 * (cancelled) buffers at unpin time, but we'll never go through the
+	 * pin/unpin cycle if we abort inside commit.
+	 */
+	aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
+	/*
+	 * Before possibly freeing the buf item, copy the per-transaction state
+	 * so we can reference it safely later after clearing it from the
+	 * buffer log item.
+	 */
+	flags = bip->bli_flags;
+	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
+
+	/*
+	 * If the buf item is marked stale, then don't do anything.  We'll
+	 * unlock the buffer and free the buf item when the buffer is unpinned
+	 * for the last time.
+	 */
+	if (flags & XFS_BLI_STALE) {
+		trace_xfs_buf_item_unlock_stale(bip);
+		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+		if (!aborted) {
+			atomic_dec(&bip->bli_refcount);
+			return;
+		}
+	}
+
+	trace_xfs_buf_item_unlock(bip);
+
+	/*
+	 * If the buf item isn't tracking any data, free it, otherwise drop the
+	 * reference we hold to it. If we are aborting the transaction, this may
+	 * be the only reference to the buf item, so we free it anyway
+	 * regardless of whether it is dirty or not. A dirty abort implies a
+	 * shutdown, anyway.
+	 *
+	 * Ordered buffers are dirty but may have no recorded changes, so ensure
+	 * we only release clean items here.
+	 */
+	clean = (flags & XFS_BLI_DIRTY) ? false : true;
+	if (clean) {
+		int i;
+		for (i = 0; i < bip->bli_format_count; i++) {
+			if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+				     bip->bli_formats[i].blf_map_size)) {
+				clean = false;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * Clean buffers, by definition, cannot be in the AIL. However, aborted
+	 * buffers may be dirty and hence in the AIL. Therefore if we are
+	 * aborting a buffer and we've just taken the last refernce away, we
+	 * have to check if it is in the AIL before freeing it. We need to free
+	 * it in this case, because an aborted transaction has already shut the
+	 * filesystem down and this is the last chance we will have to do so.
+	 */
+	if (atomic_dec_and_test(&bip->bli_refcount)) {
+		if (clean)
+			xfs_buf_item_relse(bp);
+		else if (aborted) {
+			ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+			if (lip->li_flags & XFS_LI_IN_AIL) {
+				spin_lock(&lip->li_ailp->xa_lock);
+				xfs_trans_ail_delete(lip->li_ailp, lip,
+						     SHUTDOWN_LOG_IO_ERROR);
+			}
+			xfs_buf_item_relse(bp);
+		}
+	}
+
+	if (!(flags & XFS_BLI_HOLD))
+		xfs_buf_relse(bp);
+}
+
+/*
+ * This is called to find out where the oldest active copy of the
+ * buf log item in the on disk log resides now that the last log
+ * write of it completed at the given lsn.
+ * We always re-log all the dirty data in a buffer, so usually the
+ * latest copy in the on disk log is the only one that matters.  For
+ * those cases we simply return the given lsn.
+ *
+ * The one exception to this is for buffers full of newly allocated
+ * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
+ * flag set, indicating that only the di_next_unlinked fields from the
+ * inodes in the buffers will be replayed during recovery.  If the
+ * original newly allocated inode images have not yet been flushed
+ * when the buffer is so relogged, then we need to make sure that we
+ * keep the old images in the 'active' portion of the log.  We do this
+ * by returning the original lsn of that transaction here rather than
+ * the current one.
+ */
+STATIC xfs_lsn_t
+xfs_buf_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+
+	trace_xfs_buf_item_committed(bip);
+
+	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
+		return lip->li_lsn;
+	return lsn;
+}
+
+STATIC void
+xfs_buf_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		commit_lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static const struct xfs_item_ops xfs_buf_item_ops = {
+	.iop_size	= xfs_buf_item_size,
+	.iop_format	= xfs_buf_item_format,
+	.iop_pin	= xfs_buf_item_pin,
+	.iop_unpin	= xfs_buf_item_unpin,
+	.iop_unlock	= xfs_buf_item_unlock,
+	.iop_committed	= xfs_buf_item_committed,
+	.iop_push	= xfs_buf_item_push,
+	.iop_committing = xfs_buf_item_committing
+};
+
+STATIC int
+xfs_buf_item_get_format(
+	struct xfs_buf_log_item	*bip,
+	int			count)
+{
+	ASSERT(bip->bli_formats == NULL);
+	bip->bli_format_count = count;
+
+	if (count == 1) {
+		bip->bli_formats = &bip->__bli_format;
+		return 0;
+	}
+
+	bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
+				KM_SLEEP);
+	if (!bip->bli_formats)
+		return -ENOMEM;
+	return 0;
+}
+
+STATIC void
+xfs_buf_item_free_format(
+	struct xfs_buf_log_item	*bip)
+{
+	if (bip->bli_formats != &bip->__bli_format) {
+		kmem_free(bip->bli_formats);
+		bip->bli_formats = NULL;
+	}
+}
+
+/*
+ * Allocate a new buf log item to go with the given buffer.
+ * Set the buffer's b_fsprivate field to point to the new
+ * buf log item.  If there are other item's attached to the
+ * buffer (see xfs_buf_attach_iodone() below), then put the
+ * buf log item at the front.
+ */
+void
+xfs_buf_item_init(
+	xfs_buf_t	*bp,
+	xfs_mount_t	*mp)
+{
+	xfs_log_item_t		*lip = bp->b_fspriv;
+	xfs_buf_log_item_t	*bip;
+	int			chunks;
+	int			map_size;
+	int			error;
+	int			i;
+
+	/*
+	 * Check to see if there is already a buf log item for
+	 * this buffer.  If there is, it is guaranteed to be
+	 * the first.  If we do already have one, there is
+	 * nothing to do here so return.
+	 */
+	ASSERT(bp->b_target->bt_mount == mp);
+	if (lip != NULL && lip->li_type == XFS_LI_BUF)
+		return;
+
+	bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
+	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
+	bip->bli_buf = bp;
+	xfs_buf_hold(bp);
+
+	/*
+	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
+	 * can be divided into. Make sure not to truncate any pieces.
+	 * map_size is the size of the bitmap needed to describe the
+	 * chunks of the buffer.
+	 *
+	 * Discontiguous buffer support follows the layout of the underlying
+	 * buffer. This makes the implementation as simple as possible.
+	 */
+	error = xfs_buf_item_get_format(bip, bp->b_map_count);
+	ASSERT(error == 0);
+
+	for (i = 0; i < bip->bli_format_count; i++) {
+		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
+				      XFS_BLF_CHUNK);
+		map_size = DIV_ROUND_UP(chunks, NBWORD);
+
+		bip->bli_formats[i].blf_type = XFS_LI_BUF;
+		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
+		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
+		bip->bli_formats[i].blf_map_size = map_size;
+	}
+
+	/*
+	 * Put the buf item into the list of items attached to the
+	 * buffer at the front.
+	 */
+	if (bp->b_fspriv)
+		bip->bli_item.li_bio_list = bp->b_fspriv;
+	bp->b_fspriv = bip;
+}
+
+
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ */
+static void
+xfs_buf_item_log_segment(
+	uint			first,
+	uint			last,
+	uint			*map)
+{
+	uint		first_bit;
+	uint		last_bit;
+	uint		bits_to_set;
+	uint		bits_set;
+	uint		word_num;
+	uint		*wordp;
+	uint		bit;
+	uint		end_bit;
+	uint		mask;
+
+	/*
+	 * Convert byte offsets to bit numbers.
+	 */
+	first_bit = first >> XFS_BLF_SHIFT;
+	last_bit = last >> XFS_BLF_SHIFT;
+
+	/*
+	 * Calculate the total number of bits to be set.
+	 */
+	bits_to_set = last_bit - first_bit + 1;
+
+	/*
+	 * Get a pointer to the first word in the bitmap
+	 * to set a bit in.
+	 */
+	word_num = first_bit >> BIT_TO_WORD_SHIFT;
+	wordp = &map[word_num];
+
+	/*
+	 * Calculate the starting bit in the first word.
+	 */
+	bit = first_bit & (uint)(NBWORD - 1);
+
+	/*
+	 * First set any bits in the first word of our range.
+	 * If it starts at bit 0 of the word, it will be
+	 * set below rather than here.  That is what the variable
+	 * bit tells us. The variable bits_set tracks the number
+	 * of bits that have been set so far.  End_bit is the number
+	 * of the last bit to be set in this word plus one.
+	 */
+	if (bit) {
+		end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
+		mask = ((1 << (end_bit - bit)) - 1) << bit;
+		*wordp |= mask;
+		wordp++;
+		bits_set = end_bit - bit;
+	} else {
+		bits_set = 0;
+	}
+
+	/*
+	 * Now set bits a whole word at a time that are between
+	 * first_bit and last_bit.
+	 */
+	while ((bits_to_set - bits_set) >= NBWORD) {
+		*wordp |= 0xffffffff;
+		bits_set += NBWORD;
+		wordp++;
+	}
+
+	/*
+	 * Finally, set any bits left to be set in one last partial word.
+	 */
+	end_bit = bits_to_set - bits_set;
+	if (end_bit) {
+		mask = (1 << end_bit) - 1;
+		*wordp |= mask;
+	}
+}
+
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ */
+void
+xfs_buf_item_log(
+	xfs_buf_log_item_t	*bip,
+	uint			first,
+	uint			last)
+{
+	int			i;
+	uint			start;
+	uint			end;
+	struct xfs_buf		*bp = bip->bli_buf;
+
+	/*
+	 * walk each buffer segment and mark them dirty appropriately.
+	 */
+	start = 0;
+	for (i = 0; i < bip->bli_format_count; i++) {
+		if (start > last)
+			break;
+		end = start + BBTOB(bp->b_maps[i].bm_len);
+		if (first > end) {
+			start += BBTOB(bp->b_maps[i].bm_len);
+			continue;
+		}
+		if (first < start)
+			first = start;
+		if (end > last)
+			end = last;
+
+		xfs_buf_item_log_segment(first, end,
+					 &bip->bli_formats[i].blf_data_map[0]);
+
+		start += bp->b_maps[i].bm_len;
+	}
+}
+
+
+/*
+ * Return 1 if the buffer has been logged or ordered in a transaction (at any
+ * point, not just the current transaction) and 0 if not.
+ */
+uint
+xfs_buf_item_dirty(
+	xfs_buf_log_item_t	*bip)
+{
+	return (bip->bli_flags & XFS_BLI_DIRTY);
+}
+
+STATIC void
+xfs_buf_item_free(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_item_free_format(bip);
+	kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
+/*
+ * This is called when the buf log item is no longer needed.  It should
+ * free the buf log item associated with the given buffer and clear
+ * the buffer's pointer to the buf log item.  If there are no more
+ * items in the list, clear the b_iodone field of the buffer (see
+ * xfs_buf_attach_iodone() below).
+ */
+void
+xfs_buf_item_relse(
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+
+	trace_xfs_buf_item_relse(bp, _RET_IP_);
+	ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+
+	bp->b_fspriv = bip->bli_item.li_bio_list;
+	if (bp->b_fspriv == NULL)
+		bp->b_iodone = NULL;
+
+	xfs_buf_rele(bp);
+	xfs_buf_item_free(bip);
+}
+
+
+/*
+ * Add the given log item with its callback to the list of callbacks
+ * to be called when the buffer's I/O completes.  If it is not set
+ * already, set the buffer's b_iodone() routine to be
+ * xfs_buf_iodone_callbacks() and link the log item into the list of
+ * items rooted at b_fsprivate.  Items are always added as the second
+ * entry in the list if there is a first, because the buf item code
+ * assumes that the buf log item is first.
+ */
+void
+xfs_buf_attach_iodone(
+	xfs_buf_t	*bp,
+	void		(*cb)(xfs_buf_t *, xfs_log_item_t *),
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*head_lip;
+
+	ASSERT(xfs_buf_islocked(bp));
+
+	lip->li_cb = cb;
+	head_lip = bp->b_fspriv;
+	if (head_lip) {
+		lip->li_bio_list = head_lip->li_bio_list;
+		head_lip->li_bio_list = lip;
+	} else {
+		bp->b_fspriv = lip;
+	}
+
+	ASSERT(bp->b_iodone == NULL ||
+	       bp->b_iodone == xfs_buf_iodone_callbacks);
+	bp->b_iodone = xfs_buf_iodone_callbacks;
+}
+
+/*
+ * We can have many callbacks on a buffer. Running the callbacks individually
+ * can cause a lot of contention on the AIL lock, so we allow for a single
+ * callback to be able to scan the remaining lip->li_bio_list for other items
+ * of the same type and callback to be processed in the first call.
+ *
+ * As a result, the loop walking the callback list below will also modify the
+ * list. it removes the first item from the list and then runs the callback.
+ * The loop then restarts from the new head of the list. This allows the
+ * callback to scan and modify the list attached to the buffer and we don't
+ * have to care about maintaining a next item pointer.
+ */
+STATIC void
+xfs_buf_do_callbacks(
+	struct xfs_buf		*bp)
+{
+	struct xfs_log_item	*lip;
+
+	while ((lip = bp->b_fspriv) != NULL) {
+		bp->b_fspriv = lip->li_bio_list;
+		ASSERT(lip->li_cb != NULL);
+		/*
+		 * Clear the next pointer so we don't have any
+		 * confusion if the item is added to another buf.
+		 * Don't touch the log item after calling its
+		 * callback, because it could have freed itself.
+		 */
+		lip->li_bio_list = NULL;
+		lip->li_cb(bp, lip);
+	}
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks
+ * attached to them by xfs_buf_attach_iodone().  It should remove each
+ * log item from the buffer's list and call the callback of each in turn.
+ * When done, the buffer's fsprivate field is set to NULL and the buffer
+ * is unlocked with a call to iodone().
+ */
+void
+xfs_buf_iodone_callbacks(
+	struct xfs_buf		*bp)
+{
+	struct xfs_log_item	*lip = bp->b_fspriv;
+	struct xfs_mount	*mp = lip->li_mountp;
+	static ulong		lasttime;
+	static xfs_buftarg_t	*lasttarg;
+
+	if (likely(!bp->b_error))
+		goto do_callbacks;
+
+	/*
+	 * If we've already decided to shutdown the filesystem because of
+	 * I/O errors, there's no point in giving this a retry.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		xfs_buf_stale(bp);
+		XFS_BUF_DONE(bp);
+		trace_xfs_buf_item_iodone(bp, _RET_IP_);
+		goto do_callbacks;
+	}
+
+	if (bp->b_target != lasttarg ||
+	    time_after(jiffies, (lasttime + 5*HZ))) {
+		lasttime = jiffies;
+		xfs_buf_ioerror_alert(bp, __func__);
+	}
+	lasttarg = bp->b_target;
+
+	/*
+	 * If the write was asynchronous then no one will be looking for the
+	 * error.  Clear the error state and write the buffer out again.
+	 *
+	 * XXX: This helps against transient write errors, but we need to find
+	 * a way to shut the filesystem down if the writes keep failing.
+	 *
+	 * In practice we'll shut the filesystem down soon as non-transient
+	 * errors tend to affect the whole device and a failing log write
+	 * will make us give up.  But we really ought to do better here.
+	 */
+	if (XFS_BUF_ISASYNC(bp)) {
+		ASSERT(bp->b_iodone != NULL);
+
+		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+
+		xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
+
+		if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+			bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+				       XBF_DONE | XBF_WRITE_FAIL;
+			xfs_buf_submit(bp);
+		} else {
+			xfs_buf_relse(bp);
+		}
+
+		return;
+	}
+
+	/*
+	 * If the write of the buffer was synchronous, we want to make
+	 * sure to return the error to the caller of xfs_bwrite().
+	 */
+	xfs_buf_stale(bp);
+	XFS_BUF_DONE(bp);
+
+	trace_xfs_buf_error_relse(bp, _RET_IP_);
+
+do_callbacks:
+	xfs_buf_do_callbacks(bp);
+	bp->b_fspriv = NULL;
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp);
+}
+
+/*
+ * This is the iodone() function for buffers which have been
+ * logged.  It is called when they are eventually flushed out.
+ * It should remove the buf item from the AIL, and free the buf item.
+ * It is called by xfs_buf_iodone_callbacks() above which will take
+ * care of cleaning up the buffer itself.
+ */
+void
+xfs_buf_iodone(
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
+{
+	struct xfs_ail		*ailp = lip->li_ailp;
+
+	ASSERT(BUF_ITEM(lip)->bli_buf == bp);
+
+	xfs_buf_rele(bp);
+
+	/*
+	 * If we are forcibly shutting down, this may well be
+	 * off the AIL already. That's because we simulate the
+	 * log-committed callbacks to unpin these buffers. Or we may never
+	 * have put this item on AIL because of the transaction was
+	 * aborted forcibly. xfs_trans_ail_delete() takes care of these.
+	 *
+	 * Either way, AIL is useless if we're forcing a shutdown.
+	 */
+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+	xfs_buf_item_free(BUF_ITEM(lip));
+}
diff --git a/kernel/fs/xfs/xfs_buf_item.h b/kernel/fs/xfs/xfs_buf_item.h
new file mode 100644
index 000000000..3f3455a41
--- /dev/null
+++ b/kernel/fs/xfs/xfs_buf_item.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_BUF_ITEM_H__
+#define	__XFS_BUF_ITEM_H__
+
+/* kernel only definitions */
+
+/* buf log item flags */
+#define	XFS_BLI_HOLD		0x01
+#define	XFS_BLI_DIRTY		0x02
+#define	XFS_BLI_STALE		0x04
+#define	XFS_BLI_LOGGED		0x08
+#define	XFS_BLI_INODE_ALLOC_BUF	0x10
+#define XFS_BLI_STALE_INODE	0x20
+#define	XFS_BLI_INODE_BUF	0x40
+#define	XFS_BLI_ORDERED		0x80
+
+#define XFS_BLI_FLAGS \
+	{ XFS_BLI_HOLD,		"HOLD" }, \
+	{ XFS_BLI_DIRTY,	"DIRTY" }, \
+	{ XFS_BLI_STALE,	"STALE" }, \
+	{ XFS_BLI_LOGGED,	"LOGGED" }, \
+	{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
+	{ XFS_BLI_STALE_INODE,	"STALE_INODE" }, \
+	{ XFS_BLI_INODE_BUF,	"INODE_BUF" }, \
+	{ XFS_BLI_ORDERED,	"ORDERED" }
+
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_buf_log_item;
+
+/*
+ * This is the in core log item structure used to track information
+ * needed to log buffers.  It tracks how many times the lock has been
+ * locked, and which 128 byte chunks of the buffer are dirty.
+ */
+typedef struct xfs_buf_log_item {
+	xfs_log_item_t		bli_item;	/* common item structure */
+	struct xfs_buf		*bli_buf;	/* real buffer pointer */
+	unsigned int		bli_flags;	/* misc flags */
+	unsigned int		bli_recur;	/* lock recursion count */
+	atomic_t		bli_refcount;	/* cnt of tp refs */
+	int			bli_format_count;	/* count of headers */
+	struct xfs_buf_log_format *bli_formats;	/* array of in-log header ptrs */
+	struct xfs_buf_log_format __bli_format;	/* embedded in-log header */
+} xfs_buf_log_item_t;
+
+void	xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void	xfs_buf_item_relse(struct xfs_buf *);
+void	xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
+uint	xfs_buf_item_dirty(xfs_buf_log_item_t *);
+void	xfs_buf_attach_iodone(struct xfs_buf *,
+			      void(*)(struct xfs_buf *, xfs_log_item_t *),
+			      xfs_log_item_t *);
+void	xfs_buf_iodone_callbacks(struct xfs_buf *);
+void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
+
+extern kmem_zone_t	*xfs_buf_item_zone;
+
+#endif	/* __XFS_BUF_ITEM_H__ */
diff --git a/kernel/fs/xfs/xfs_dir2_readdir.c b/kernel/fs/xfs/xfs_dir2_readdir.c
new file mode 100644
index 000000000..098cd78fe
--- /dev/null
+++ b/kernel/fs/xfs/xfs_dir2_readdir.c
@@ -0,0 +1,681 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+
+/*
+ * Directory file type support functions
+ */
+static unsigned char xfs_dir3_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
+	DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
+};
+
+static unsigned char
+xfs_dir3_get_dtype(
+	struct xfs_mount	*mp,
+	__uint8_t		filetype)
+{
+	if (!xfs_sb_version_hasftype(&mp->m_sb))
+		return DT_UNKNOWN;
+
+	if (filetype >= XFS_DIR3_FT_MAX)
+		return DT_UNKNOWN;
+
+	return xfs_dir3_filetype_table[filetype];
+}
+
+STATIC int
+xfs_dir2_sf_getdents(
+	struct xfs_da_args	*args,
+	struct dir_context	*ctx)
+{
+	int			i;		/* shortform entry number */
+	struct xfs_inode	*dp = args->dp;	/* incore directory inode */
+	xfs_dir2_dataptr_t	off;		/* current entry's offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	xfs_dir2_dataptr_t	dot_offset;
+	xfs_dir2_dataptr_t	dotdot_offset;
+	xfs_ino_t		ino;
+	struct xfs_da_geometry	*geo = args->geo;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Give up if the directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return -EIO;
+	}
+
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+	/*
+	 * If the block number in the offset is out of range, we're done.
+	 */
+	if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
+		return 0;
+
+	/*
+	 * Precalculate offsets for . and .. as we will always need them.
+	 *
+	 * XXX(hch): the second argument is sometimes 0 and sometimes
+	 * geo->datablk
+	 */
+	dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+						dp->d_ops->data_dot_offset);
+	dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+						dp->d_ops->data_dotdot_offset);
+
+	/*
+	 * Put . entry unless we're starting past it.
+	 */
+	if (ctx->pos <= dot_offset) {
+		ctx->pos = dot_offset & 0x7fffffff;
+		if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
+			return 0;
+	}
+
+	/*
+	 * Put .. entry unless we're starting past it.
+	 */
+	if (ctx->pos <= dotdot_offset) {
+		ino = dp->d_ops->sf_get_parent_ino(sfp);
+		ctx->pos = dotdot_offset & 0x7fffffff;
+		if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
+			return 0;
+	}
+
+	/*
+	 * Loop while there are more entries and put'ing works.
+	 */
+	sfep = xfs_dir2_sf_firstentry(sfp);
+	for (i = 0; i < sfp->count; i++) {
+		__uint8_t filetype;
+
+		off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+				xfs_dir2_sf_get_offset(sfep));
+
+		if (ctx->pos > off) {
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+			continue;
+		}
+
+		ino = dp->d_ops->sf_get_ino(sfp, sfep);
+		filetype = dp->d_ops->sf_get_ftype(sfep);
+		ctx->pos = off & 0x7fffffff;
+		if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
+			    xfs_dir3_get_dtype(dp->i_mount, filetype)))
+			return 0;
+		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+	}
+
+	ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+								0x7fffffff;
+	return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+STATIC int
+xfs_dir2_block_getdents(
+	struct xfs_da_args	*args,
+	struct dir_context	*ctx)
+{
+	struct xfs_inode	*dp = args->dp;	/* incore directory inode */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	struct xfs_buf		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	char			*endptr;	/* end of the data entries */
+	int			error;		/* error return value */
+	char			*ptr;		/* current data entry */
+	int			wantoff;	/* starting block offset */
+	xfs_off_t		cook;
+	struct xfs_da_geometry	*geo = args->geo;
+
+	/*
+	 * If the block number in the offset is out of range, we're done.
+	 */
+	if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
+		return 0;
+
+	error = xfs_dir3_block_read(NULL, dp, &bp);
+	if (error)
+		return error;
+
+	/*
+	 * Extract the byte offset we start at from the seek pointer.
+	 * We'll skip entries before this.
+	 */
+	wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
+	hdr = bp->b_addr;
+	xfs_dir3_data_check(dp, bp);
+	/*
+	 * Set up values for the loop.
+	 */
+	btp = xfs_dir2_block_tail_p(geo, hdr);
+	ptr = (char *)dp->d_ops->data_entry_p(hdr);
+	endptr = (char *)xfs_dir2_block_leaf_p(btp);
+
+	/*
+	 * Loop over the data portion of the block.
+	 * Each object is a real entry (dep) or an unused one (dup).
+	 */
+	while (ptr < endptr) {
+		__uint8_t filetype;
+
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		/*
+		 * Unused, skip it.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			ptr += be16_to_cpu(dup->length);
+			continue;
+		}
+
+		dep = (xfs_dir2_data_entry_t *)ptr;
+
+		/*
+		 * Bump pointer for the next iteration.
+		 */
+		ptr += dp->d_ops->data_entsize(dep->namelen);
+		/*
+		 * The entry is before the desired starting point, skip it.
+		 */
+		if ((char *)dep - (char *)hdr < wantoff)
+			continue;
+
+		cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+					    (char *)dep - (char *)hdr);
+
+		ctx->pos = cook & 0x7fffffff;
+		filetype = dp->d_ops->data_get_ftype(dep);
+		/*
+		 * If it didn't fit, set the final offset to here & return.
+		 */
+		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+			    be64_to_cpu(dep->inumber),
+			    xfs_dir3_get_dtype(dp->i_mount, filetype))) {
+			xfs_trans_brelse(NULL, bp);
+			return 0;
+		}
+	}
+
+	/*
+	 * Reached the end of the block.
+	 * Set the offset to a non-existent block 1 and return.
+	 */
+	ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+								0x7fffffff;
+	xfs_trans_brelse(NULL, bp);
+	return 0;
+}
+
+struct xfs_dir2_leaf_map_info {
+	xfs_extlen_t	map_blocks;	/* number of fsbs in map */
+	xfs_dablk_t	map_off;	/* last mapped file offset */
+	int		map_size;	/* total entries in *map */
+	int		map_valid;	/* valid entries in *map */
+	int		nmap;		/* mappings to ask xfs_bmapi */
+	xfs_dir2_db_t	curdb;		/* db for current block */
+	int		ra_current;	/* number of read-ahead blks */
+	int		ra_index;	/* *map index for read-ahead */
+	int		ra_offset;	/* map entry offset for ra */
+	int		ra_want;	/* readahead count wanted */
+	struct xfs_bmbt_irec map[];	/* map vector for blocks */
+};
+
+STATIC int
+xfs_dir2_leaf_readbuf(
+	struct xfs_da_args	*args,
+	size_t			bufsize,
+	struct xfs_dir2_leaf_map_info *mip,
+	xfs_dir2_off_t		*curoff,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_buf		*bp = *bpp;
+	struct xfs_bmbt_irec	*map = mip->map;
+	struct blk_plug		plug;
+	int			error = 0;
+	int			length;
+	int			i;
+	int			j;
+	struct xfs_da_geometry	*geo = args->geo;
+
+	/*
+	 * If we have a buffer, we need to release it and
+	 * take it out of the mapping.
+	 */
+
+	if (bp) {
+		xfs_trans_brelse(NULL, bp);
+		bp = NULL;
+		mip->map_blocks -= geo->fsbcount;
+		/*
+		 * Loop to get rid of the extents for the
+		 * directory block.
+		 */
+		for (i = geo->fsbcount; i > 0; ) {
+			j = min_t(int, map->br_blockcount, i);
+			map->br_blockcount -= j;
+			map->br_startblock += j;
+			map->br_startoff += j;
+			/*
+			 * If mapping is done, pitch it from
+			 * the table.
+			 */
+			if (!map->br_blockcount && --mip->map_valid)
+				memmove(&map[0], &map[1],
+					sizeof(map[0]) * mip->map_valid);
+			i -= j;
+		}
+	}
+
+	/*
+	 * Recalculate the readahead blocks wanted.
+	 */
+	mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
+	ASSERT(mip->ra_want >= 0);
+
+	/*
+	 * If we don't have as many as we want, and we haven't
+	 * run out of data blocks, get some more mappings.
+	 */
+	if (1 + mip->ra_want > mip->map_blocks &&
+	    mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
+		/*
+		 * Get more bmaps, fill in after the ones
+		 * we already have in the table.
+		 */
+		mip->nmap = mip->map_size - mip->map_valid;
+		error = xfs_bmapi_read(dp, mip->map_off,
+				xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
+								mip->map_off,
+				&map[mip->map_valid], &mip->nmap, 0);
+
+		/*
+		 * Don't know if we should ignore this or try to return an
+		 * error.  The trouble with returning errors is that readdir
+		 * will just stop without actually passing the error through.
+		 */
+		if (error)
+			goto out;	/* XXX */
+
+		/*
+		 * If we got all the mappings we asked for, set the final map
+		 * offset based on the last bmap value received.  Otherwise,
+		 * we've reached the end.
+		 */
+		if (mip->nmap == mip->map_size - mip->map_valid) {
+			i = mip->map_valid + mip->nmap - 1;
+			mip->map_off = map[i].br_startoff + map[i].br_blockcount;
+		} else
+			mip->map_off = xfs_dir2_byte_to_da(geo,
+							XFS_DIR2_LEAF_OFFSET);
+
+		/*
+		 * Look for holes in the mapping, and eliminate them.  Count up
+		 * the valid blocks.
+		 */
+		for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
+			if (map[i].br_startblock == HOLESTARTBLOCK) {
+				mip->nmap--;
+				length = mip->map_valid + mip->nmap - i;
+				if (length)
+					memmove(&map[i], &map[i + 1],
+						sizeof(map[i]) * length);
+			} else {
+				mip->map_blocks += map[i].br_blockcount;
+				i++;
+			}
+		}
+		mip->map_valid += mip->nmap;
+	}
+
+	/*
+	 * No valid mappings, so no more data blocks.
+	 */
+	if (!mip->map_valid) {
+		*curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
+		goto out;
+	}
+
+	/*
+	 * Read the directory block starting at the first mapping.
+	 */
+	mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
+	error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
+			map->br_blockcount >= geo->fsbcount ?
+			    XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
+			    -1, &bp);
+	/*
+	 * Should just skip over the data block instead of giving up.
+	 */
+	if (error)
+		goto out;	/* XXX */
+
+	/*
+	 * Adjust the current amount of read-ahead: we just read a block that
+	 * was previously ra.
+	 */
+	if (mip->ra_current)
+		mip->ra_current -= geo->fsbcount;
+
+	/*
+	 * Do we need more readahead?
+	 */
+	blk_start_plug(&plug);
+	for (mip->ra_index = mip->ra_offset = i = 0;
+	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
+	     i += geo->fsbcount) {
+		ASSERT(mip->ra_index < mip->map_valid);
+		/*
+		 * Read-ahead a contiguous directory block.
+		 */
+		if (i > mip->ra_current &&
+		    map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+			xfs_dir3_data_readahead(dp,
+				map[mip->ra_index].br_startoff + mip->ra_offset,
+				XFS_FSB_TO_DADDR(dp->i_mount,
+					map[mip->ra_index].br_startblock +
+							mip->ra_offset));
+			mip->ra_current = i;
+		}
+
+		/*
+		 * Read-ahead a non-contiguous directory block.  This doesn't
+		 * use our mapping, but this is a very rare case.
+		 */
+		else if (i > mip->ra_current) {
+			xfs_dir3_data_readahead(dp,
+					map[mip->ra_index].br_startoff +
+							mip->ra_offset, -1);
+			mip->ra_current = i;
+		}
+
+		/*
+		 * Advance offset through the mapping table.
+		 */
+		for (j = 0; j < geo->fsbcount; j += length ) {
+			/*
+			 * The rest of this extent but not more than a dir
+			 * block.
+			 */
+			length = min_t(int, geo->fsbcount,
+					map[mip->ra_index].br_blockcount -
+							mip->ra_offset);
+			mip->ra_offset += length;
+
+			/*
+			 * Advance to the next mapping if this one is used up.
+			 */
+			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
+				mip->ra_offset = 0;
+				mip->ra_index++;
+			}
+		}
+	}
+	blk_finish_plug(&plug);
+
+out:
+	*bpp = bp;
+	return error;
+}
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+STATIC int
+xfs_dir2_leaf_getdents(
+	struct xfs_da_args	*args,
+	struct dir_context	*ctx,
+	size_t			bufsize)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_buf		*bp = NULL;	/* data block buffer */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry */
+	int			error = 0;	/* error return value */
+	int			length;		/* temporary length value */
+	int			byteoff;	/* offset in current block */
+	xfs_dir2_off_t		curoff;		/* current overall offset */
+	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
+	char			*ptr = NULL;	/* pointer to current data */
+	struct xfs_dir2_leaf_map_info *map_info;
+	struct xfs_da_geometry	*geo = args->geo;
+
+	/*
+	 * If the offset is at or past the largest allowed value,
+	 * give up right away.
+	 */
+	if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
+		return 0;
+
+	/*
+	 * Set up to bmap a number of blocks based on the caller's
+	 * buffer size, the directory block size, and the filesystem
+	 * block size.
+	 */
+	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
+	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
+				(length * sizeof(struct xfs_bmbt_irec)),
+			       KM_SLEEP | KM_NOFS);
+	map_info->map_size = length;
+
+	/*
+	 * Inside the loop we keep the main offset value as a byte offset
+	 * in the directory file.
+	 */
+	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
+
+	/*
+	 * Force this conversion through db so we truncate the offset
+	 * down to get the start of the data block.
+	 */
+	map_info->map_off = xfs_dir2_db_to_da(geo,
+					      xfs_dir2_byte_to_db(geo, curoff));
+
+	/*
+	 * Loop over directory entries until we reach the end offset.
+	 * Get more blocks and readahead as necessary.
+	 */
+	while (curoff < XFS_DIR2_LEAF_OFFSET) {
+		__uint8_t filetype;
+
+		/*
+		 * If we have no buffer, or we're off the end of the
+		 * current buffer, need to get another one.
+		 */
+		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+
+			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
+						      &curoff, &bp);
+			if (error || !map_info->map_valid)
+				break;
+
+			/*
+			 * Having done a read, we need to set a new offset.
+			 */
+			newoff = xfs_dir2_db_off_to_byte(geo,
+							 map_info->curdb, 0);
+			/*
+			 * Start of the current block.
+			 */
+			if (curoff < newoff)
+				curoff = newoff;
+			/*
+			 * Make sure we're in the right block.
+			 */
+			else if (curoff > newoff)
+				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
+				       map_info->curdb);
+			hdr = bp->b_addr;
+			xfs_dir3_data_check(dp, bp);
+			/*
+			 * Find our position in the block.
+			 */
+			ptr = (char *)dp->d_ops->data_entry_p(hdr);
+			byteoff = xfs_dir2_byte_to_off(geo, curoff);
+			/*
+			 * Skip past the header.
+			 */
+			if (byteoff == 0)
+				curoff += dp->d_ops->data_entry_offset;
+			/*
+			 * Skip past entries until we reach our offset.
+			 */
+			else {
+				while ((char *)ptr - (char *)hdr < byteoff) {
+					dup = (xfs_dir2_data_unused_t *)ptr;
+
+					if (be16_to_cpu(dup->freetag)
+						  == XFS_DIR2_DATA_FREE_TAG) {
+
+						length = be16_to_cpu(dup->length);
+						ptr += length;
+						continue;
+					}
+					dep = (xfs_dir2_data_entry_t *)ptr;
+					length =
+					   dp->d_ops->data_entsize(dep->namelen);
+					ptr += length;
+				}
+				/*
+				 * Now set our real offset.
+				 */
+				curoff =
+					xfs_dir2_db_off_to_byte(geo,
+					    xfs_dir2_byte_to_db(geo, curoff),
+					    (char *)ptr - (char *)hdr);
+				if (ptr >= (char *)hdr + geo->blksize) {
+					continue;
+				}
+			}
+		}
+		/*
+		 * We have a pointer to an entry.
+		 * Is it a live one?
+		 */
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		/*
+		 * No, it's unused, skip over it.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			length = be16_to_cpu(dup->length);
+			ptr += length;
+			curoff += length;
+			continue;
+		}
+
+		dep = (xfs_dir2_data_entry_t *)ptr;
+		length = dp->d_ops->data_entsize(dep->namelen);
+		filetype = dp->d_ops->data_get_ftype(dep);
+
+		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+			    be64_to_cpu(dep->inumber),
+			    xfs_dir3_get_dtype(dp->i_mount, filetype)))
+			break;
+
+		/*
+		 * Advance to next entry in the block.
+		 */
+		ptr += length;
+		curoff += length;
+		/* bufsize may have just been a guess; don't go negative */
+		bufsize = bufsize > length ? bufsize - length : 0;
+	}
+
+	/*
+	 * All done.  Set output offset value to current offset.
+	 */
+	if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR))
+		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+	else
+		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+	kmem_free(map_info);
+	if (bp)
+		xfs_trans_brelse(NULL, bp);
+	return error;
+}
+
+/*
+ * Read a directory.
+ */
+int
+xfs_readdir(
+	struct xfs_inode	*dp,
+	struct dir_context	*ctx,
+	size_t			bufsize)
+{
+	struct xfs_da_args	args = { NULL };
+	int			rval;
+	int			v;
+	uint			lock_mode;
+
+	trace_xfs_readdir(dp);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return -EIO;
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	XFS_STATS_INC(xs_dir_getdents);
+
+	args.dp = dp;
+	args.geo = dp->i_mount->m_dir_geo;
+
+	lock_mode = xfs_ilock_data_map_shared(dp);
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_getdents(&args, ctx);
+	else if ((rval = xfs_dir2_isblock(&args, &v)))
+		;
+	else if (v)
+		rval = xfs_dir2_block_getdents(&args, ctx);
+	else
+		rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
+	xfs_iunlock(dp, lock_mode);
+
+	return rval;
+}
diff --git a/kernel/fs/xfs/xfs_discard.c b/kernel/fs/xfs/xfs_discard.c
new file mode 100644
index 000000000..e85a9519a
--- /dev/null
+++ b/kernel/fs/xfs/xfs_discard.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+
+STATIC int
+xfs_trim_extents(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_daddr_t		start,
+	xfs_daddr_t		end,
+	xfs_daddr_t		minlen,
+	__uint64_t		*blocks_trimmed)
+{
+	struct block_device	*bdev = mp->m_ddev_targp->bt_bdev;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	struct xfs_perag	*pag;
+	int			error;
+	int			i;
+
+	pag = xfs_perag_get(mp, agno);
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error || !agbp)
+		goto out_put_perag;
+
+	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+	/*
+	 * Force out the log.  This means any transactions that might have freed
+	 * space before we took the AGF buffer lock are now on disk, and the
+	 * volatile disk cache is flushed.
+	 */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * Look up the longest btree in the AGF and start with it.
+	 */
+	error = xfs_alloc_lookup_ge(cur, 0,
+			    be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
+	if (error)
+		goto out_del_cursor;
+
+	/*
+	 * Loop until we are done with all extents that are large
+	 * enough to be worth discarding.
+	 */
+	while (i) {
+		xfs_agblock_t	fbno;
+		xfs_extlen_t	flen;
+		xfs_daddr_t	dbno;
+		xfs_extlen_t	dlen;
+
+		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+		if (error)
+			goto out_del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);
+		ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
+
+		/*
+		 * use daddr format for all range/len calculations as that is
+		 * the format the range/len variables are supplied in by
+		 * userspace.
+		 */
+		dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
+		dlen = XFS_FSB_TO_BB(mp, flen);
+
+		/*
+		 * Too small?  Give up.
+		 */
+		if (dlen < minlen) {
+			trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+			goto out_del_cursor;
+		}
+
+		/*
+		 * If the extent is entirely outside of the range we are
+		 * supposed to discard skip it.  Do not bother to trim
+		 * down partially overlapping ranges for now.
+		 */
+		if (dbno + dlen < start || dbno > end) {
+			trace_xfs_discard_exclude(mp, agno, fbno, flen);
+			goto next_extent;
+		}
+
+		/*
+		 * If any blocks in the range are still busy, skip the
+		 * discard and try again the next time.
+		 */
+		if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
+			trace_xfs_discard_busy(mp, agno, fbno, flen);
+			goto next_extent;
+		}
+
+		trace_xfs_discard_extent(mp, agno, fbno, flen);
+		error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+		if (error)
+			goto out_del_cursor;
+		*blocks_trimmed += flen;
+
+next_extent:
+		error = xfs_btree_decrement(cur, 0, &i);
+		if (error)
+			goto out_del_cursor;
+	}
+
+out_del_cursor:
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+out_put_perag:
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * trim a range of the filesystem.
+ *
+ * Note: the parameters passed from userspace are byte ranges into the
+ * filesystem which does not match to the format we use for filesystem block
+ * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
+ * is a linear address range. Hence we need to use DADDR based conversions and
+ * comparisons for determining the correct offset and regions to trim.
+ */
+int
+xfs_ioc_trim(
+	struct xfs_mount		*mp,
+	struct fstrim_range __user	*urange)
+{
+	struct request_queue	*q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
+	unsigned int		granularity = q->limits.discard_granularity;
+	struct fstrim_range	range;
+	xfs_daddr_t		start, end, minlen;
+	xfs_agnumber_t		start_agno, end_agno, agno;
+	__uint64_t		blocks_trimmed = 0;
+	int			error, last_error = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+	if (copy_from_user(&range, urange, sizeof(range)))
+		return -EFAULT;
+
+	/*
+	 * Truncating down the len isn't actually quite correct, but using
+	 * BBTOB would mean we trivially get overflows for values
+	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
+	 * used by the fstrim application.  In the end it really doesn't
+	 * matter as trimming blocks is an advisory interface.
+	 */
+	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
+	    range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
+	    range.len < mp->m_sb.sb_blocksize)
+		return -EINVAL;
+
+	start = BTOBB(range.start);
+	end = start + BTOBBT(range.len) - 1;
+	minlen = BTOBB(max_t(u64, granularity, range.minlen));
+
+	if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
+		end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
+
+	start_agno = xfs_daddr_to_agno(mp, start);
+	end_agno = xfs_daddr_to_agno(mp, end);
+
+	for (agno = start_agno; agno <= end_agno; agno++) {
+		error = xfs_trim_extents(mp, agno, start, end, minlen,
+					  &blocks_trimmed);
+		if (error)
+			last_error = error;
+	}
+
+	if (last_error)
+		return last_error;
+
+	range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+	if (copy_to_user(urange, &range, sizeof(range)))
+		return -EFAULT;
+	return 0;
+}
+
+int
+xfs_discard_extents(
+	struct xfs_mount	*mp,
+	struct list_head	*list)
+{
+	struct xfs_extent_busy	*busyp;
+	int			error = 0;
+
+	list_for_each_entry(busyp, list, list) {
+		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+					 busyp->length);
+
+		error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+				XFS_FSB_TO_BB(mp, busyp->length),
+				GFP_NOFS, 0);
+		if (error && error != -EOPNOTSUPP) {
+			xfs_info(mp,
+	 "discard failed for extent [0x%llu,%u], error %d",
+				 (unsigned long long)busyp->bno,
+				 busyp->length,
+				 error);
+			return error;
+		}
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/xfs/xfs_discard.h b/kernel/fs/xfs/xfs_discard.h
new file mode 100644
index 000000000..344879aea
--- /dev/null
+++ b/kernel/fs/xfs/xfs_discard.h
@@ -0,0 +1,10 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+struct list_head;
+
+extern int	xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+extern int	xfs_discard_extents(struct xfs_mount *, struct list_head *);
+
+#endif /* XFS_DISCARD_H */
diff --git a/kernel/fs/xfs/xfs_dquot.c b/kernel/fs/xfs/xfs_dquot.c
new file mode 100644
index 000000000..02c01bbbc
--- /dev/null
+++ b/kernel/fs/xfs/xfs_dquot.c
@@ -0,0 +1,1104 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
+
+/*
+ * Lock order:
+ *
+ * ip->i_lock
+ *   qi->qi_tree_lock
+ *     dquot->q_qlock (xfs_dqlock() and friends)
+ *       dquot->q_flush (xfs_dqflock() and friends)
+ *       qi->qi_lru_lock
+ *
+ * If two dquots need to be locked the order is user before group/project,
+ * otherwise by the lowest id first, see xfs_dqlock2.
+ */
+
+#ifdef DEBUG
+xfs_buftarg_t *xfs_dqerror_target;
+int xfs_do_dqerror;
+int xfs_dqreq_num;
+int xfs_dqerror_mod = 33;
+#endif
+
+struct kmem_zone		*xfs_qm_dqtrxzone;
+static struct kmem_zone		*xfs_qm_dqzone;
+
+static struct lock_class_key xfs_dquot_group_class;
+static struct lock_class_key xfs_dquot_project_class;
+
+/*
+ * This is called to free all the memory associated with a dquot
+ */
+void
+xfs_qm_dqdestroy(
+	xfs_dquot_t	*dqp)
+{
+	ASSERT(list_empty(&dqp->q_lru));
+
+	mutex_destroy(&dqp->q_qlock);
+	kmem_zone_free(xfs_qm_dqzone, dqp);
+
+	XFS_STATS_DEC(xs_qm_dquot);
+}
+
+/*
+ * If default limits are in force, push them into the dquot now.
+ * We overwrite the dquot limits only if they are zero and this
+ * is not the root dquot.
+ */
+void
+xfs_qm_adjust_dqlimits(
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*dq)
+{
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_disk_dquot	*d = &dq->q_core;
+	int			prealloc = 0;
+
+	ASSERT(d->d_id);
+
+	if (q->qi_bsoftlimit && !d->d_blk_softlimit) {
+		d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
+		prealloc = 1;
+	}
+	if (q->qi_bhardlimit && !d->d_blk_hardlimit) {
+		d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+		prealloc = 1;
+	}
+	if (q->qi_isoftlimit && !d->d_ino_softlimit)
+		d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
+	if (q->qi_ihardlimit && !d->d_ino_hardlimit)
+		d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
+	if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
+		d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
+	if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
+		d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+
+	if (prealloc)
+		xfs_dquot_set_prealloc_limits(dq);
+}
+
+/*
+ * Check the limits and timers of a dquot and start or reset timers
+ * if necessary.
+ * This gets called even when quota enforcement is OFF, which makes our
+ * life a little less complicated. (We just don't reject any quota
+ * reservations in that case, when enforcement is off).
+ * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
+ * enforcement's off.
+ * In contrast, warnings are a little different in that they don't
+ * 'automatically' get started when limits get exceeded.  They do
+ * get reset to zero, however, when we find the count to be under
+ * the soft limit (they are only ever set non-zero via userspace).
+ */
+void
+xfs_qm_adjust_dqtimers(
+	xfs_mount_t		*mp,
+	xfs_disk_dquot_t	*d)
+{
+	ASSERT(d->d_id);
+
+#ifdef DEBUG
+	if (d->d_blk_hardlimit)
+		ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
+		       be64_to_cpu(d->d_blk_hardlimit));
+	if (d->d_ino_hardlimit)
+		ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
+		       be64_to_cpu(d->d_ino_hardlimit));
+	if (d->d_rtb_hardlimit)
+		ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
+		       be64_to_cpu(d->d_rtb_hardlimit));
+#endif
+
+	if (!d->d_btimer) {
+		if ((d->d_blk_softlimit &&
+		     (be64_to_cpu(d->d_bcount) >
+		      be64_to_cpu(d->d_blk_softlimit))) ||
+		    (d->d_blk_hardlimit &&
+		     (be64_to_cpu(d->d_bcount) >
+		      be64_to_cpu(d->d_blk_hardlimit)))) {
+			d->d_btimer = cpu_to_be32(get_seconds() +
+					mp->m_quotainfo->qi_btimelimit);
+		} else {
+			d->d_bwarns = 0;
+		}
+	} else {
+		if ((!d->d_blk_softlimit ||
+		     (be64_to_cpu(d->d_bcount) <=
+		      be64_to_cpu(d->d_blk_softlimit))) &&
+		    (!d->d_blk_hardlimit ||
+		    (be64_to_cpu(d->d_bcount) <=
+		     be64_to_cpu(d->d_blk_hardlimit)))) {
+			d->d_btimer = 0;
+		}
+	}
+
+	if (!d->d_itimer) {
+		if ((d->d_ino_softlimit &&
+		     (be64_to_cpu(d->d_icount) >
+		      be64_to_cpu(d->d_ino_softlimit))) ||
+		    (d->d_ino_hardlimit &&
+		     (be64_to_cpu(d->d_icount) >
+		      be64_to_cpu(d->d_ino_hardlimit)))) {
+			d->d_itimer = cpu_to_be32(get_seconds() +
+					mp->m_quotainfo->qi_itimelimit);
+		} else {
+			d->d_iwarns = 0;
+		}
+	} else {
+		if ((!d->d_ino_softlimit ||
+		     (be64_to_cpu(d->d_icount) <=
+		      be64_to_cpu(d->d_ino_softlimit)))  &&
+		    (!d->d_ino_hardlimit ||
+		     (be64_to_cpu(d->d_icount) <=
+		      be64_to_cpu(d->d_ino_hardlimit)))) {
+			d->d_itimer = 0;
+		}
+	}
+
+	if (!d->d_rtbtimer) {
+		if ((d->d_rtb_softlimit &&
+		     (be64_to_cpu(d->d_rtbcount) >
+		      be64_to_cpu(d->d_rtb_softlimit))) ||
+		    (d->d_rtb_hardlimit &&
+		     (be64_to_cpu(d->d_rtbcount) >
+		      be64_to_cpu(d->d_rtb_hardlimit)))) {
+			d->d_rtbtimer = cpu_to_be32(get_seconds() +
+					mp->m_quotainfo->qi_rtbtimelimit);
+		} else {
+			d->d_rtbwarns = 0;
+		}
+	} else {
+		if ((!d->d_rtb_softlimit ||
+		     (be64_to_cpu(d->d_rtbcount) <=
+		      be64_to_cpu(d->d_rtb_softlimit))) &&
+		    (!d->d_rtb_hardlimit ||
+		     (be64_to_cpu(d->d_rtbcount) <=
+		      be64_to_cpu(d->d_rtb_hardlimit)))) {
+			d->d_rtbtimer = 0;
+		}
+	}
+}
+
+/*
+ * initialize a buffer full of dquots and log the whole thing
+ */
+STATIC void
+xfs_qm_init_dquot_blk(
+	xfs_trans_t	*tp,
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,
+	uint		type,
+	xfs_buf_t	*bp)
+{
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	xfs_dqblk_t	*d;
+	int		curid, i;
+
+	ASSERT(tp);
+	ASSERT(xfs_buf_islocked(bp));
+
+	d = bp->b_addr;
+
+	/*
+	 * ID of the first dquot in the block - id's are zero based.
+	 */
+	curid = id - (id % q->qi_dqperchunk);
+	ASSERT(curid >= 0);
+	memset(d, 0, BBTOB(q->qi_dqchunklen));
+	for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
+		d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+		d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+		d->dd_diskdq.d_id = cpu_to_be32(curid);
+		d->dd_diskdq.d_flags = type;
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+			xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+					 XFS_DQUOT_CRC_OFF);
+		}
+	}
+
+	xfs_trans_dquot_buf(tp, bp,
+			    (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
+			    ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
+			     XFS_BLF_GDQUOT_BUF)));
+	xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
+}
+
+/*
+ * Initialize the dynamic speculative preallocation thresholds. The lo/hi
+ * watermarks correspond to the soft and hard limits by default. If a soft limit
+ * is not specified, we use 95% of the hard limit.
+ */
+void
+xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
+{
+	__uint64_t space;
+
+	dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+	dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+	if (!dqp->q_prealloc_lo_wmark) {
+		dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
+		do_div(dqp->q_prealloc_lo_wmark, 100);
+		dqp->q_prealloc_lo_wmark *= 95;
+	}
+
+	space = dqp->q_prealloc_hi_wmark;
+
+	do_div(space, 100);
+	dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
+	dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
+	dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
+}
+
+/*
+ * Allocate a block and fill it with dquots.
+ * This is called when the bmapi finds a hole.
+ */
+STATIC int
+xfs_qm_dqalloc(
+	xfs_trans_t	**tpp,
+	xfs_mount_t	*mp,
+	xfs_dquot_t	*dqp,
+	xfs_inode_t	*quotip,
+	xfs_fileoff_t	offset_fsb,
+	xfs_buf_t	**O_bpp)
+{
+	xfs_fsblock_t	firstblock;
+	xfs_bmap_free_t flist;
+	xfs_bmbt_irec_t map;
+	int		nmaps, error, committed;
+	xfs_buf_t	*bp;
+	xfs_trans_t	*tp = *tpp;
+
+	ASSERT(tp != NULL);
+
+	trace_xfs_dqalloc(dqp);
+
+	/*
+	 * Initialize the bmap freelist prior to calling bmapi code.
+	 */
+	xfs_bmap_init(&flist, &firstblock);
+	xfs_ilock(quotip, XFS_ILOCK_EXCL);
+	/*
+	 * Return if this type of quotas is turned off while we didn't
+	 * have an inode lock
+	 */
+	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+		return -ESRCH;
+	}
+
+	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
+	nmaps = 1;
+	error = xfs_bmapi_write(tp, quotip, offset_fsb,
+				XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+				&firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
+				&map, &nmaps, &flist);
+	if (error)
+		goto error0;
+	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
+	ASSERT(nmaps == 1);
+	ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+	       (map.br_startblock != HOLESTARTBLOCK));
+
+	/*
+	 * Keep track of the blkno to save a lookup later
+	 */
+	dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+	/* now we can just get the buffer (there's nothing to read yet) */
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+			       dqp->q_blkno,
+			       mp->m_quotainfo->qi_dqchunklen,
+			       0);
+	if (!bp) {
+		error = -ENOMEM;
+		goto error1;
+	}
+	bp->b_ops = &xfs_dquot_buf_ops;
+
+	/*
+	 * Make a chunk of dquots out of this buffer and log
+	 * the entire thing.
+	 */
+	xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
+			      dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+
+	/*
+	 * xfs_bmap_finish() may commit the current transaction and
+	 * start a second transaction if the freelist is not empty.
+	 *
+	 * Since we still want to modify this buffer, we need to
+	 * ensure that the buffer is not released on commit of
+	 * the first transaction and ensure the buffer is added to the
+	 * second transaction.
+	 *
+	 * If there is only one transaction then don't stop the buffer
+	 * from being released when it commits later on.
+	 */
+
+	xfs_trans_bhold(tp, bp);
+
+	if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
+		goto error1;
+	}
+
+	if (committed) {
+		tp = *tpp;
+		xfs_trans_bjoin(tp, bp);
+	} else {
+		xfs_trans_bhold_release(tp, bp);
+	}
+
+	*O_bpp = bp;
+	return 0;
+
+      error1:
+	xfs_bmap_cancel(&flist);
+      error0:
+	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
+STATIC int
+xfs_qm_dqrepair(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp,
+	xfs_dqid_t		firstid,
+	struct xfs_buf		**bpp)
+{
+	int			error;
+	struct xfs_disk_dquot	*ddq;
+	struct xfs_dqblk	*d;
+	int			i;
+
+	/*
+	 * Read the buffer without verification so we get the corrupted
+	 * buffer returned to us. make sure we verify it on write, though.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
+				   mp->m_quotainfo->qi_dqchunklen,
+				   0, bpp, NULL);
+
+	if (error) {
+		ASSERT(*bpp == NULL);
+		return error;
+	}
+	(*bpp)->b_ops = &xfs_dquot_buf_ops;
+
+	ASSERT(xfs_buf_islocked(*bpp));
+	d = (struct xfs_dqblk *)(*bpp)->b_addr;
+
+	/* Do the actual repair of dquots in this buffer */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		ddq = &d[i].dd_diskdq;
+		error = xfs_dqcheck(mp, ddq, firstid + i,
+				       dqp->dq_flags & XFS_DQ_ALLTYPES,
+				       XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
+		if (error) {
+			/* repair failed, we're screwed */
+			xfs_trans_brelse(tp, *bpp);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Maps a dquot to the buffer containing its on-disk version.
+ * This returns a ptr to the buffer containing the on-disk dquot
+ * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ */
+STATIC int
+xfs_qm_dqtobp(
+	xfs_trans_t		**tpp,
+	xfs_dquot_t		*dqp,
+	xfs_disk_dquot_t	**O_ddpp,
+	xfs_buf_t		**O_bpp,
+	uint			flags)
+{
+	struct xfs_bmbt_irec	map;
+	int			nmaps = 1, error;
+	struct xfs_buf		*bp;
+	struct xfs_inode	*quotip = xfs_dq_to_quota_inode(dqp);
+	struct xfs_mount	*mp = dqp->q_mount;
+	xfs_dqid_t		id = be32_to_cpu(dqp->q_core.d_id);
+	struct xfs_trans	*tp = (tpp ? *tpp : NULL);
+	uint			lock_mode;
+
+	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+
+	lock_mode = xfs_ilock_data_map_shared(quotip);
+	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+		/*
+		 * Return if this type of quotas is turned off while we
+		 * didn't have the quota inode lock.
+		 */
+		xfs_iunlock(quotip, lock_mode);
+		return -ESRCH;
+	}
+
+	/*
+	 * Find the block map; no allocations yet
+	 */
+	error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
+			       XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
+
+	xfs_iunlock(quotip, lock_mode);
+	if (error)
+		return error;
+
+	ASSERT(nmaps == 1);
+	ASSERT(map.br_blockcount == 1);
+
+	/*
+	 * Offset of dquot in the (fixed sized) dquot chunk.
+	 */
+	dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+		sizeof(xfs_dqblk_t);
+
+	ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+	if (map.br_startblock == HOLESTARTBLOCK) {
+		/*
+		 * We don't allocate unless we're asked to
+		 */
+		if (!(flags & XFS_QMOPT_DQALLOC))
+			return -ENOENT;
+
+		ASSERT(tp);
+		error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
+					dqp->q_fileoffset, &bp);
+		if (error)
+			return error;
+		tp = *tpp;
+	} else {
+		trace_xfs_dqtobp_read(dqp);
+
+		/*
+		 * store the blkno etc so that we don't have to do the
+		 * mapping all the time
+		 */
+		dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+					   dqp->q_blkno,
+					   mp->m_quotainfo->qi_dqchunklen,
+					   0, &bp, &xfs_dquot_buf_ops);
+
+		if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+			xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
+						mp->m_quotainfo->qi_dqperchunk;
+			ASSERT(bp == NULL);
+			error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
+		}
+
+		if (error) {
+			ASSERT(bp == NULL);
+			return error;
+		}
+	}
+
+	ASSERT(xfs_buf_islocked(bp));
+	*O_bpp = bp;
+	*O_ddpp = bp->b_addr + dqp->q_bufoffset;
+
+	return 0;
+}
+
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately.
+ *
+ * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
+ */
+int
+xfs_qm_dqread(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	uint			flags,
+	struct xfs_dquot	**O_dqpp)
+{
+	struct xfs_dquot	*dqp;
+	struct xfs_disk_dquot	*ddqp;
+	struct xfs_buf		*bp;
+	struct xfs_trans	*tp = NULL;
+	int			error;
+	int			cancelflags = 0;
+
+
+	dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
+
+	dqp->dq_flags = type;
+	dqp->q_core.d_id = cpu_to_be32(id);
+	dqp->q_mount = mp;
+	INIT_LIST_HEAD(&dqp->q_lru);
+	mutex_init(&dqp->q_qlock);
+	init_waitqueue_head(&dqp->q_pinwait);
+
+	/*
+	 * Because we want to use a counting completion, complete
+	 * the flush completion once to allow a single access to
+	 * the flush completion without blocking.
+	 */
+	init_completion(&dqp->q_flush);
+	complete(&dqp->q_flush);
+
+	/*
+	 * Make sure group quotas have a different lock class than user
+	 * quotas.
+	 */
+	switch (type) {
+	case XFS_DQ_USER:
+		/* uses the default lock class */
+		break;
+	case XFS_DQ_GROUP:
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
+		break;
+	case XFS_DQ_PROJ:
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	XFS_STATS_INC(xs_qm_dquot);
+
+	trace_xfs_dqread(dqp);
+
+	if (flags & XFS_QMOPT_DQALLOC) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
+					  XFS_QM_DQALLOC_SPACE_RES(mp), 0);
+		if (error)
+			goto error1;
+		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+	}
+
+	/*
+	 * get a pointer to the on-disk dquot and the buffer containing it
+	 * dqp already knows its own type (GROUP/USER).
+	 */
+	error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags);
+	if (error) {
+		/*
+		 * This can happen if quotas got turned off (ESRCH),
+		 * or if the dquot didn't exist on disk and we ask to
+		 * allocate (ENOENT).
+		 */
+		trace_xfs_dqread_fail(dqp);
+		cancelflags |= XFS_TRANS_ABORT;
+		goto error1;
+	}
+
+	/* copy everything from disk dquot to the incore dquot */
+	memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
+	xfs_qm_dquot_logitem_init(dqp);
+
+	/*
+	 * Reservation counters are defined as reservation plus current usage
+	 * to avoid having to add every time.
+	 */
+	dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
+	dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
+	dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+
+	/* initialize the dquot speculative prealloc thresholds */
+	xfs_dquot_set_prealloc_limits(dqp);
+
+	/* Mark the buf so that this will stay incore a little longer */
+	xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+
+	/*
+	 * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
+	 * So we need to release with xfs_trans_brelse().
+	 * The strategy here is identical to that of inodes; we lock
+	 * the dquot in xfs_qm_dqget() before making it accessible to
+	 * others. This is because dquots, like inodes, need a good level of
+	 * concurrency, and we don't want to take locks on the entire buffers
+	 * for dquot accesses.
+	 * Note also that the dquot buffer may even be dirty at this point, if
+	 * this particular dquot was repaired. We still aren't afraid to
+	 * brelse it because we have the changes incore.
+	 */
+	ASSERT(xfs_buf_islocked(bp));
+	xfs_trans_brelse(tp, bp);
+
+	if (tp) {
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		if (error)
+			goto error0;
+	}
+
+	*O_dqpp = dqp;
+	return error;
+
+error1:
+	if (tp)
+		xfs_trans_cancel(tp, cancelflags);
+error0:
+	xfs_qm_dqdestroy(dqp);
+	*O_dqpp = NULL;
+	return error;
+}
+
+/*
+ * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
+ * a locked dquot, doing an allocation (if requested) as needed.
+ * When both an inode and an id are given, the inode's id takes precedence.
+ * That is, if the id changes while we don't hold the ilock inside this
+ * function, the new dquot is returned, not necessarily the one requested
+ * in the id argument.
+ */
+int
+xfs_qm_dqget(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,	  /* locked inode (optional) */
+	xfs_dqid_t	id,	  /* uid/projid/gid depending on type */
+	uint		type,	  /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
+	uint		flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
+	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
+{
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
+	struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+	struct xfs_dquot	*dqp;
+	int			error;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
+	    (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
+	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
+		return -ESRCH;
+	}
+
+#ifdef DEBUG
+	if (xfs_do_dqerror) {
+		if ((xfs_dqerror_target == mp->m_ddev_targp) &&
+		    (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
+			xfs_debug(mp, "Returning error in dqget");
+			return -EIO;
+		}
+	}
+
+	ASSERT(type == XFS_DQ_USER ||
+	       type == XFS_DQ_PROJ ||
+	       type == XFS_DQ_GROUP);
+	if (ip) {
+		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+		ASSERT(xfs_inode_dquot(ip, type) == NULL);
+	}
+#endif
+
+restart:
+	mutex_lock(&qi->qi_tree_lock);
+	dqp = radix_tree_lookup(tree, id);
+	if (dqp) {
+		xfs_dqlock(dqp);
+		if (dqp->dq_flags & XFS_DQ_FREEING) {
+			xfs_dqunlock(dqp);
+			mutex_unlock(&qi->qi_tree_lock);
+			trace_xfs_dqget_freeing(dqp);
+			delay(1);
+			goto restart;
+		}
+
+		dqp->q_nrefs++;
+		mutex_unlock(&qi->qi_tree_lock);
+
+		trace_xfs_dqget_hit(dqp);
+		XFS_STATS_INC(xs_qm_dqcachehits);
+		*O_dqpp = dqp;
+		return 0;
+	}
+	mutex_unlock(&qi->qi_tree_lock);
+	XFS_STATS_INC(xs_qm_dqcachemisses);
+
+	/*
+	 * Dquot cache miss. We don't want to keep the inode lock across
+	 * a (potential) disk read. Also we don't want to deal with the lock
+	 * ordering between quotainode and this inode. OTOH, dropping the inode
+	 * lock here means dealing with a chown that can happen before
+	 * we re-acquire the lock.
+	 */
+	if (ip)
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	error = xfs_qm_dqread(mp, id, type, flags, &dqp);
+
+	if (ip)
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (error)
+		return error;
+
+	if (ip) {
+		/*
+		 * A dquot could be attached to this inode by now, since
+		 * we had dropped the ilock.
+		 */
+		if (xfs_this_quota_on(mp, type)) {
+			struct xfs_dquot	*dqp1;
+
+			dqp1 = xfs_inode_dquot(ip, type);
+			if (dqp1) {
+				xfs_qm_dqdestroy(dqp);
+				dqp = dqp1;
+				xfs_dqlock(dqp);
+				goto dqret;
+			}
+		} else {
+			/* inode stays locked on return */
+			xfs_qm_dqdestroy(dqp);
+			return -ESRCH;
+		}
+	}
+
+	mutex_lock(&qi->qi_tree_lock);
+	error = radix_tree_insert(tree, id, dqp);
+	if (unlikely(error)) {
+		WARN_ON(error != -EEXIST);
+
+		/*
+		 * Duplicate found. Just throw away the new dquot and start
+		 * over.
+		 */
+		mutex_unlock(&qi->qi_tree_lock);
+		trace_xfs_dqget_dup(dqp);
+		xfs_qm_dqdestroy(dqp);
+		XFS_STATS_INC(xs_qm_dquot_dups);
+		goto restart;
+	}
+
+	/*
+	 * We return a locked dquot to the caller, with a reference taken
+	 */
+	xfs_dqlock(dqp);
+	dqp->q_nrefs = 1;
+
+	qi->qi_dquots++;
+	mutex_unlock(&qi->qi_tree_lock);
+
+ dqret:
+	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	trace_xfs_dqget_miss(dqp);
+	*O_dqpp = dqp;
+	return 0;
+}
+
+/*
+ * Release a reference to the dquot (decrement ref-count) and unlock it.
+ *
+ * If there is a group quota attached to this dquot, carefully release that
+ * too without tripping over deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+	struct xfs_dquot	*dqp)
+{
+	ASSERT(dqp->q_nrefs > 0);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	trace_xfs_dqput(dqp);
+
+	if (--dqp->q_nrefs == 0) {
+		struct xfs_quotainfo	*qi = dqp->q_mount->m_quotainfo;
+		trace_xfs_dqput_free(dqp);
+
+		if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+			XFS_STATS_INC(xs_qm_dquot_unused);
+	}
+	xfs_dqunlock(dqp);
+}
+
+/*
+ * Release a dquot. Flush it if dirty, then dqput() it.
+ * dquot must not be locked.
+ */
+void
+xfs_qm_dqrele(
+	xfs_dquot_t	*dqp)
+{
+	if (!dqp)
+		return;
+
+	trace_xfs_dqrele(dqp);
+
+	xfs_dqlock(dqp);
+	/*
+	 * We don't care to flush it if the dquot is dirty here.
+	 * That will create stutters that we want to avoid.
+	 * Instead we do a delayed write when we try to reclaim
+	 * a dirty dquot. Also xfs_sync will take part of the burden...
+	 */
+	xfs_qm_dqput(dqp);
+}
+
+/*
+ * This is the dquot flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk.  It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes..
+ */
+STATIC void
+xfs_qm_dqflush_done(
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
+{
+	xfs_dq_logitem_t	*qip = (struct xfs_dq_logitem *)lip;
+	xfs_dquot_t		*dqp = qip->qli_dquot;
+	struct xfs_ail		*ailp = lip->li_ailp;
+
+	/*
+	 * We only want to pull the item from the AIL if its
+	 * location in the log has not changed since we started the flush.
+	 * Thus, we only bother if the dquot's lsn has
+	 * not changed. First we check the lsn outside the lock
+	 * since it's cheaper, and then we recheck while
+	 * holding the lock before removing the dquot from the AIL.
+	 */
+	if ((lip->li_flags & XFS_LI_IN_AIL) &&
+	    lip->li_lsn == qip->qli_flush_lsn) {
+
+		/* xfs_trans_ail_delete() drops the AIL lock. */
+		spin_lock(&ailp->xa_lock);
+		if (lip->li_lsn == qip->qli_flush_lsn)
+			xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+		else
+			spin_unlock(&ailp->xa_lock);
+	}
+
+	/*
+	 * Release the dq's flush lock since we're done with it.
+	 */
+	xfs_dqfunlock(dqp);
+}
+
+/*
+ * Write a modified dquot to disk.
+ * The dquot must be locked and the flush lock too taken by caller.
+ * The flush lock will not be unlocked until the dquot reaches the disk,
+ * but the dquot is free to be unlocked and modified by the caller
+ * in the interim. Dquot is still locked on return. This behavior is
+ * identical to that of inodes.
+ */
+int
+xfs_qm_dqflush(
+	struct xfs_dquot	*dqp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = dqp->q_mount;
+	struct xfs_buf		*bp;
+	struct xfs_disk_dquot	*ddqp;
+	int			error;
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	ASSERT(!completion_done(&dqp->q_flush));
+
+	trace_xfs_dqflush(dqp);
+
+	*bpp = NULL;
+
+	xfs_qm_dqunpin_wait(dqp);
+
+	/*
+	 * This may have been unpinned because the filesystem is shutting
+	 * down forcibly. If that's the case we must not write this dquot
+	 * to disk, because the log record didn't make it to disk.
+	 *
+	 * We also have to remove the log item from the AIL in this case,
+	 * as we wait for an emptry AIL as part of the unmount process.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		struct xfs_log_item	*lip = &dqp->q_logitem.qli_item;
+		dqp->dq_flags &= ~XFS_DQ_DIRTY;
+
+		spin_lock(&mp->m_ail->xa_lock);
+		if (lip->li_flags & XFS_LI_IN_AIL)
+			xfs_trans_ail_delete(mp->m_ail, lip,
+					     SHUTDOWN_CORRUPT_INCORE);
+		else
+			spin_unlock(&mp->m_ail->xa_lock);
+		error = -EIO;
+		goto out_unlock;
+	}
+
+	/*
+	 * Get the buffer containing the on-disk dquot
+	 */
+	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+				   mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+				   &xfs_dquot_buf_ops);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Calculate the location of the dquot inside the buffer.
+	 */
+	ddqp = bp->b_addr + dqp->q_bufoffset;
+
+	/*
+	 * A simple sanity check in case we got a corrupted dquot..
+	 */
+	error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+			   XFS_QMOPT_DOWARN, "dqflush (incore copy)");
+	if (error) {
+		xfs_buf_relse(bp);
+		xfs_dqfunlock(dqp);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		return -EIO;
+	}
+
+	/* This is the only portion of data that needs to persist */
+	memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+
+	/*
+	 * Clear the dirty field and remember the flush lsn for later use.
+	 */
+	dqp->dq_flags &= ~XFS_DQ_DIRTY;
+
+	xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
+					&dqp->q_logitem.qli_item.li_lsn);
+
+	/*
+	 * copy the lsn into the on-disk dquot now while we have the in memory
+	 * dquot here. This can't be done later in the write verifier as we
+	 * can't get access to the log item at that point in time.
+	 *
+	 * We also calculate the CRC here so that the on-disk dquot in the
+	 * buffer always has a valid CRC. This ensures there is no possibility
+	 * of a dquot without an up-to-date CRC getting to disk.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
+
+		dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+		xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF);
+	}
+
+	/*
+	 * Attach an iodone routine so that we can remove this dquot from the
+	 * AIL and release the flush lock once the dquot is synced to disk.
+	 */
+	xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
+				  &dqp->q_logitem.qli_item);
+
+	/*
+	 * If the buffer is pinned then push on the log so we won't
+	 * get stuck waiting in the write for too long.
+	 */
+	if (xfs_buf_ispinned(bp)) {
+		trace_xfs_dqflush_force(dqp);
+		xfs_log_force(mp, 0);
+	}
+
+	trace_xfs_dqflush_done(dqp);
+	*bpp = bp;
+	return 0;
+
+out_unlock:
+	xfs_dqfunlock(dqp);
+	return -EIO;
+}
+
+/*
+ * Lock two xfs_dquot structures.
+ *
+ * To avoid deadlocks we always lock the quota structure with
+ * the lowerd id first.
+ */
+void
+xfs_dqlock2(
+	xfs_dquot_t	*d1,
+	xfs_dquot_t	*d2)
+{
+	if (d1 && d2) {
+		ASSERT(d1 != d2);
+		if (be32_to_cpu(d1->q_core.d_id) >
+		    be32_to_cpu(d2->q_core.d_id)) {
+			mutex_lock(&d2->q_qlock);
+			mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
+		} else {
+			mutex_lock(&d1->q_qlock);
+			mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
+		}
+	} else if (d1) {
+		mutex_lock(&d1->q_qlock);
+	} else if (d2) {
+		mutex_lock(&d2->q_qlock);
+	}
+}
+
+int __init
+xfs_qm_init(void)
+{
+	xfs_qm_dqzone =
+		kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+	if (!xfs_qm_dqzone)
+		goto out;
+
+	xfs_qm_dqtrxzone =
+		kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+	if (!xfs_qm_dqtrxzone)
+		goto out_free_dqzone;
+
+	return 0;
+
+out_free_dqzone:
+	kmem_zone_destroy(xfs_qm_dqzone);
+out:
+	return -ENOMEM;
+}
+
+void
+xfs_qm_exit(void)
+{
+	kmem_zone_destroy(xfs_qm_dqtrxzone);
+	kmem_zone_destroy(xfs_qm_dqzone);
+}
diff --git a/kernel/fs/xfs/xfs_dquot.h b/kernel/fs/xfs/xfs_dquot.h
new file mode 100644
index 000000000..2f536f33c
--- /dev/null
+++ b/kernel/fs/xfs/xfs_dquot.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DQUOT_H__
+#define __XFS_DQUOT_H__
+
+/*
+ * Dquots are structures that hold quota information about a user or a group,
+ * much like inodes are for files. In fact, dquots share many characteristics
+ * with inodes. However, dquots can also be a centralized resource, relative
+ * to a collection of inodes. In this respect, dquots share some characteristics
+ * of the superblock.
+ * XFS dquots exploit both those in its algorithms. They make every attempt
+ * to not be a bottleneck when quotas are on and have minimal impact, if any,
+ * when quotas are off.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+
+enum {
+	XFS_QLOWSP_1_PCNT = 0,
+	XFS_QLOWSP_3_PCNT,
+	XFS_QLOWSP_5_PCNT,
+	XFS_QLOWSP_MAX
+};
+
+/*
+ * The incore dquot structure
+ */
+typedef struct xfs_dquot {
+	uint		 dq_flags;	/* various flags (XFS_DQ_*) */
+	struct list_head q_lru;		/* global free list of dquots */
+	struct xfs_mount*q_mount;	/* filesystem this relates to */
+	struct xfs_trans*q_transp;	/* trans this belongs to currently */
+	uint		 q_nrefs;	/* # active refs from inodes */
+	xfs_daddr_t	 q_blkno;	/* blkno of dquot buffer */
+	int		 q_bufoffset;	/* off of dq in buffer (# dquots) */
+	xfs_fileoff_t	 q_fileoffset;	/* offset in quotas file */
+
+	xfs_disk_dquot_t q_core;	/* actual usage & quotas */
+	xfs_dq_logitem_t q_logitem;	/* dquot log item */
+	xfs_qcnt_t	 q_res_bcount;	/* total regular nblks used+reserved */
+	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
+	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
+	xfs_qcnt_t	 q_prealloc_lo_wmark;/* prealloc throttle wmark */
+	xfs_qcnt_t	 q_prealloc_hi_wmark;/* prealloc disabled wmark */
+	int64_t		 q_low_space[XFS_QLOWSP_MAX];
+	struct mutex	 q_qlock;	/* quota lock */
+	struct completion q_flush;	/* flush completion queue */
+	atomic_t          q_pincount;	/* dquot pin count */
+	wait_queue_head_t q_pinwait;	/* dquot pinning wait queue */
+} xfs_dquot_t;
+
+/*
+ * Lock hierarchy for q_qlock:
+ *	XFS_QLOCK_NORMAL is the implicit default,
+ * 	XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ */
+enum {
+	XFS_QLOCK_NORMAL = 0,
+	XFS_QLOCK_NESTED,
+};
+
+/*
+ * Manage the q_flush completion queue embedded in the dquot.  This completion
+ * queue synchronizes processes attempting to flush the in-core dquot back to
+ * disk.
+ */
+static inline void xfs_dqflock(xfs_dquot_t *dqp)
+{
+	wait_for_completion(&dqp->q_flush);
+}
+
+static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
+{
+	return try_wait_for_completion(&dqp->q_flush);
+}
+
+static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+{
+	complete(&dqp->q_flush);
+}
+
+static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp)
+{
+	return mutex_trylock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqlock(struct xfs_dquot *dqp)
+{
+	mutex_lock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqunlock(struct xfs_dquot *dqp)
+{
+	mutex_unlock(&dqp->q_qlock);
+}
+
+static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+{
+	switch (type & XFS_DQ_ALLTYPES) {
+	case XFS_DQ_USER:
+		return XFS_IS_UQUOTA_ON(mp);
+	case XFS_DQ_GROUP:
+		return XFS_IS_GQUOTA_ON(mp);
+	case XFS_DQ_PROJ:
+		return XFS_IS_PQUOTA_ON(mp);
+	default:
+		return 0;
+	}
+}
+
+static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+{
+	switch (type & XFS_DQ_ALLTYPES) {
+	case XFS_DQ_USER:
+		return ip->i_udquot;
+	case XFS_DQ_GROUP:
+		return ip->i_gdquot;
+	case XFS_DQ_PROJ:
+		return ip->i_pdquot;
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * Check whether a dquot is under low free space conditions. We assume the quota
+ * is enabled and enforced.
+ */
+static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
+{
+	int64_t freesp;
+
+	freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
+	if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
+		return true;
+
+	return false;
+}
+
+#define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
+#define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
+#define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
+#define XFS_QM_ISPDQ(dqp)	((dqp)->dq_flags & XFS_DQ_PROJ)
+#define XFS_QM_ISGDQ(dqp)	((dqp)->dq_flags & XFS_DQ_GROUP)
+
+extern int		xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
+					uint, struct xfs_dquot	**);
+extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
+extern int		xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
+extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
+extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
+					xfs_disk_dquot_t *);
+extern void		xfs_qm_adjust_dqlimits(struct xfs_mount *,
+					       struct xfs_dquot *);
+extern int		xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
+					xfs_dqid_t, uint, uint, xfs_dquot_t **);
+extern void		xfs_qm_dqput(xfs_dquot_t *);
+
+extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+
+extern void		xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+
+static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
+{
+	xfs_dqlock(dqp);
+	dqp->q_nrefs++;
+	xfs_dqunlock(dqp);
+	return dqp;
+}
+
+#endif /* __XFS_DQUOT_H__ */
diff --git a/kernel/fs/xfs/xfs_dquot_item.c b/kernel/fs/xfs/xfs_dquot_item.c
new file mode 100644
index 000000000..814cff94e
--- /dev/null
+++ b/kernel/fs/xfs/xfs_dquot_item.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_log.h"
+
+static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_dq_logitem, qli_item);
+}
+
+/*
+ * returns the number of iovecs needed to log the given dquot item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 2;
+	*nbytes += sizeof(struct xfs_dq_logformat) +
+		   sizeof(struct xfs_disk_dquot);
+}
+
+/*
+ * fills in the vector of log iovecs for the given dquot log item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+	struct xfs_dq_logformat	*qlf;
+
+	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
+	qlf->qlf_type = XFS_LI_DQUOT;
+	qlf->qlf_size = 2;
+	qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+	qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
+	qlf->qlf_len = 1;
+	qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
+	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
+			&qlip->qli_dquot->q_core,
+			sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Increment the pin count of the given dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pin(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	atomic_inc(&dqp->q_pincount);
+}
+
+/*
+ * Decrement the pin count of the given dquot, and wake up
+ * anyone in xfs_dqwait_unpin() if the count goes to 0.	 The
+ * dquot must have been previously pinned with a call to
+ * xfs_qm_dquot_logitem_pin().
+ */
+STATIC void
+xfs_qm_dquot_logitem_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+	ASSERT(atomic_read(&dqp->q_pincount) > 0);
+	if (atomic_dec_and_test(&dqp->q_pincount))
+		wake_up(&dqp->q_pinwait);
+}
+
+STATIC xfs_lsn_t
+xfs_qm_dquot_logitem_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	/*
+	 * We always re-log the entire dquot when it becomes dirty,
+	 * so, the latest copy _is_ the only one that matters.
+	 */
+	return lsn;
+}
+
+/*
+ * This is called to wait for the given dquot to be unpinned.
+ * Most of these pin/unpin routines are plagiarized from inode code.
+ */
+void
+xfs_qm_dqunpin_wait(
+	struct xfs_dquot	*dqp)
+{
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (atomic_read(&dqp->q_pincount) == 0)
+		return;
+
+	/*
+	 * Give the log a push so we don't wait here too long.
+	 */
+	xfs_log_force(dqp->q_mount, 0);
+	wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
+}
+
+STATIC uint
+xfs_qm_dquot_logitem_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list) __releases(&lip->li_ailp->xa_lock)
+					      __acquires(&lip->li_ailp->xa_lock)
+{
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
+	struct xfs_buf		*bp = NULL;
+	uint			rval = XFS_ITEM_SUCCESS;
+	int			error;
+
+	if (atomic_read(&dqp->q_pincount) > 0)
+		return XFS_ITEM_PINNED;
+
+	if (!xfs_dqlock_nowait(dqp))
+		return XFS_ITEM_LOCKED;
+
+	/*
+	 * Re-check the pincount now that we stabilized the value by
+	 * taking the quota lock.
+	 */
+	if (atomic_read(&dqp->q_pincount) > 0) {
+		rval = XFS_ITEM_PINNED;
+		goto out_unlock;
+	}
+
+	/*
+	 * Someone else is already flushing the dquot.  Nothing we can do
+	 * here but wait for the flush to finish and remove the item from
+	 * the AIL.
+	 */
+	if (!xfs_dqflock_nowait(dqp)) {
+		rval = XFS_ITEM_FLUSHING;
+		goto out_unlock;
+	}
+
+	spin_unlock(&lip->li_ailp->xa_lock);
+
+	error = xfs_qm_dqflush(dqp, &bp);
+	if (error) {
+		xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
+			__func__, error, dqp);
+	} else {
+		if (!xfs_buf_delwri_queue(bp, buffer_list))
+			rval = XFS_ITEM_FLUSHING;
+		xfs_buf_relse(bp);
+	}
+
+	spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+	xfs_dqunlock(dqp);
+	return rval;
+}
+
+/*
+ * Unlock the dquot associated with the log item.
+ * Clear the fields of the dquot and dquot log item that
+ * are specific to the current transaction.  If the
+ * hold flags is set, do not unlock the dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	/*
+	 * Clear the transaction pointer in the dquot
+	 */
+	dqp->q_transp = NULL;
+
+	/*
+	 * dquots are never 'held' from getting unlocked at the end of
+	 * a transaction.  Their locking and unlocking is hidden inside the
+	 * transaction layer, within trans_commit. Hence, no LI_HOLD flag
+	 * for the logitem.
+	 */
+	xfs_dqunlock(dqp);
+}
+
+/*
+ * this needs to stamp an lsn into the dquot, I think.
+ * rpc's that look at user dquot's would then have to
+ * push on the dependency recorded in the dquot
+ */
+STATIC void
+xfs_qm_dquot_logitem_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector for dquots
+ */
+static const struct xfs_item_ops xfs_dquot_item_ops = {
+	.iop_size	= xfs_qm_dquot_logitem_size,
+	.iop_format	= xfs_qm_dquot_logitem_format,
+	.iop_pin	= xfs_qm_dquot_logitem_pin,
+	.iop_unpin	= xfs_qm_dquot_logitem_unpin,
+	.iop_unlock	= xfs_qm_dquot_logitem_unlock,
+	.iop_committed	= xfs_qm_dquot_logitem_committed,
+	.iop_push	= xfs_qm_dquot_logitem_push,
+	.iop_committing = xfs_qm_dquot_logitem_committing
+};
+
+/*
+ * Initialize the dquot log item for a newly allocated dquot.
+ * The dquot isn't locked at this point, but it isn't on any of the lists
+ * either, so we don't care.
+ */
+void
+xfs_qm_dquot_logitem_init(
+	struct xfs_dquot	*dqp)
+{
+	struct xfs_dq_logitem	*lp = &dqp->q_logitem;
+
+	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+					&xfs_dquot_item_ops);
+	lp->qli_dquot = dqp;
+}
+
+/*------------------  QUOTAOFF LOG ITEMS  -------------------*/
+
+static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_qoff_logitem, qql_item);
+}
+
+
+/*
+ * This returns the number of iovecs needed to log the given quotaoff item.
+ * We only need 1 iovec for an quotaoff item.  It just logs the
+ * quotaoff_log_format structure.
+ */
+STATIC void
+xfs_qm_qoff_logitem_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += sizeof(struct xfs_qoff_logitem);
+}
+
+STATIC void
+xfs_qm_qoff_logitem_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_qoff_logitem	*qflip = QOFF_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+	struct xfs_qoff_logformat *qlf;
+
+	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
+	qlf->qf_type = XFS_LI_QUOTAOFF;
+	qlf->qf_size = 1;
+	qlf->qf_flags = qflip->qql_flags;
+	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
+}
+
+/*
+ * Pinning has no meaning for an quotaoff item, so just return.
+ */
+STATIC void
+xfs_qm_qoff_logitem_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an quotaoff item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_qm_qoff_logitem_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+}
+
+/*
+ * There isn't much you can do to push a quotaoff item.  It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_qm_qoff_logitem_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Quotaoff items have no locking or pushing, so return failure
+ * so that the caller doesn't bother with us.
+ */
+STATIC void
+xfs_qm_qoff_logitem_unlock(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * The quotaoff-start-item is logged only once and cannot be moved in the log,
+ * so simply return the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_qm_qoff_logitem_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	return lsn;
+}
+
+STATIC xfs_lsn_t
+xfs_qm_qoffend_logitem_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_qoff_logitem	*qfe = QOFF_ITEM(lip);
+	struct xfs_qoff_logitem	*qfs = qfe->qql_start_lip;
+	struct xfs_ail		*ailp = qfs->qql_item.li_ailp;
+
+	/*
+	 * Delete the qoff-start logitem from the AIL.
+	 * xfs_trans_ail_delete() drops the AIL lock.
+	 */
+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+
+	kmem_free(qfs);
+	kmem_free(qfe);
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * XXX rcc - don't know quite what to do with this.  I think we can
+ * just ignore it.  The only time that isn't the case is if we allow
+ * the client to somehow see that quotas have been turned off in which
+ * we can't allow that to get back until the quotaoff hits the disk.
+ * So how would that happen?  Also, do we need different routines for
+ * quotaoff start and quotaoff end?  I suspect the answer is yes but
+ * to be sure, I need to look at the recovery code and see how quota off
+ * recovery is handled (do we roll forward or back or do something else).
+ * If we roll forwards or backwards, then we need two separate routines,
+ * one that does nothing and one that stamps in the lsn that matters
+ * (truly makes the quotaoff irrevocable).  If we do something else,
+ * then maybe we don't need two.
+ */
+STATIC void
+xfs_qm_qoff_logitem_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		commit_lsn)
+{
+}
+
+static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+	.iop_size	= xfs_qm_qoff_logitem_size,
+	.iop_format	= xfs_qm_qoff_logitem_format,
+	.iop_pin	= xfs_qm_qoff_logitem_pin,
+	.iop_unpin	= xfs_qm_qoff_logitem_unpin,
+	.iop_unlock	= xfs_qm_qoff_logitem_unlock,
+	.iop_committed	= xfs_qm_qoffend_logitem_committed,
+	.iop_push	= xfs_qm_qoff_logitem_push,
+	.iop_committing = xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * This is the ops vector shared by all quotaoff-start log items.
+ */
+static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+	.iop_size	= xfs_qm_qoff_logitem_size,
+	.iop_format	= xfs_qm_qoff_logitem_format,
+	.iop_pin	= xfs_qm_qoff_logitem_pin,
+	.iop_unpin	= xfs_qm_qoff_logitem_unpin,
+	.iop_unlock	= xfs_qm_qoff_logitem_unlock,
+	.iop_committed	= xfs_qm_qoff_logitem_committed,
+	.iop_push	= xfs_qm_qoff_logitem_push,
+	.iop_committing = xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * Allocate and initialize an quotaoff item of the correct quota type(s).
+ */
+struct xfs_qoff_logitem *
+xfs_qm_qoff_logitem_init(
+	struct xfs_mount	*mp,
+	struct xfs_qoff_logitem	*start,
+	uint			flags)
+{
+	struct xfs_qoff_logitem	*qf;
+
+	qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
+
+	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
+			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
+	qf->qql_item.li_mountp = mp;
+	qf->qql_start_lip = start;
+	qf->qql_flags = flags;
+	return qf;
+}
diff --git a/kernel/fs/xfs/xfs_dquot_item.h b/kernel/fs/xfs/xfs_dquot_item.h
new file mode 100644
index 000000000..502e94646
--- /dev/null
+++ b/kernel/fs/xfs/xfs_dquot_item.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DQUOT_ITEM_H__
+#define __XFS_DQUOT_ITEM_H__
+
+struct xfs_dquot;
+struct xfs_trans;
+struct xfs_mount;
+struct xfs_qoff_logitem;
+
+typedef struct xfs_dq_logitem {
+	xfs_log_item_t		 qli_item;	   /* common portion */
+	struct xfs_dquot	*qli_dquot;	   /* dquot ptr */
+	xfs_lsn_t		 qli_flush_lsn;	   /* lsn at last flush */
+} xfs_dq_logitem_t;
+
+typedef struct xfs_qoff_logitem {
+	xfs_log_item_t		 qql_item;	/* common portion */
+	struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
+	unsigned int		qql_flags;
+} xfs_qoff_logitem_t;
+
+
+extern void		   xfs_qm_dquot_logitem_init(struct xfs_dquot *);
+extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
+					struct xfs_qoff_logitem *, uint);
+extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
+					struct xfs_qoff_logitem *, uint);
+extern void		   xfs_trans_log_quotaoff_item(struct xfs_trans *,
+					struct xfs_qoff_logitem *);
+
+#endif	/* __XFS_DQUOT_ITEM_H__ */
diff --git a/kernel/fs/xfs/xfs_error.c b/kernel/fs/xfs/xfs_error.c
new file mode 100644
index 000000000..338e50bbf
--- /dev/null
+++ b/kernel/fs/xfs/xfs_error.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_fs.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+
+#ifdef DEBUG
+
+int	xfs_etest[XFS_NUM_INJECT_ERROR];
+int64_t	xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
+char *	xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+int	xfs_error_test_active;
+
+int
+xfs_error_test(int error_tag, int *fsidp, char *expression,
+	       int line, char *file, unsigned long randfactor)
+{
+	int i;
+	int64_t fsid;
+
+	if (prandom_u32() % randfactor)
+		return 0;
+
+	memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
+			xfs_warn(NULL,
+	"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
+				expression, file, line, xfs_etest_fsname[i]);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+int
+xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+{
+	int i;
+	int len;
+	int64_t fsid;
+
+	memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
+			xfs_warn(mp, "error tag #%d on", error_tag);
+			return 0;
+		}
+	}
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest[i] == 0) {
+			xfs_warn(mp, "Turned on XFS error tag #%d",
+				error_tag);
+			xfs_etest[i] = error_tag;
+			xfs_etest_fsid[i] = fsid;
+			len = strlen(mp->m_fsname);
+			xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
+			strcpy(xfs_etest_fsname[i], mp->m_fsname);
+			xfs_error_test_active++;
+			return 0;
+		}
+	}
+
+	xfs_warn(mp, "error tag overflow, too many turned on");
+
+	return 1;
+}
+
+int
+xfs_errortag_clearall(xfs_mount_t *mp, int loud)
+{
+	int64_t fsid;
+	int cleared = 0;
+	int i;
+
+	memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
+
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+		if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
+		     xfs_etest[i] != 0) {
+			cleared = 1;
+			xfs_warn(mp, "Clearing XFS error tag #%d",
+				xfs_etest[i]);
+			xfs_etest[i] = 0;
+			xfs_etest_fsid[i] = 0LL;
+			kmem_free(xfs_etest_fsname[i]);
+			xfs_etest_fsname[i] = NULL;
+			xfs_error_test_active--;
+		}
+	}
+
+	if (loud || cleared)
+		xfs_warn(mp, "Cleared all XFS error tags for filesystem");
+
+	return 0;
+}
+#endif /* DEBUG */
+
+void
+xfs_error_report(
+	const char		*tag,
+	int			level,
+	struct xfs_mount	*mp,
+	const char		*filename,
+	int			linenum,
+	inst_t			*ra)
+{
+	if (level <= xfs_error_level) {
+		xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
+		"Internal error %s at line %d of file %s.  Caller %pS",
+			    tag, linenum, filename, ra);
+
+		xfs_stack_trace();
+	}
+}
+
+void
+xfs_corruption_error(
+	const char		*tag,
+	int			level,
+	struct xfs_mount	*mp,
+	void			*p,
+	const char		*filename,
+	int			linenum,
+	inst_t			*ra)
+{
+	if (level <= xfs_error_level)
+		xfs_hex_dump(p, 64);
+	xfs_error_report(tag, level, mp, filename, linenum, ra);
+	xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
+}
+
+/*
+ * Warnings specifically for verifier errors.  Differentiate CRC vs. invalid
+ * values, and omit the stack trace unless the error level is tuned high.
+ */
+void
+xfs_verifier_error(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+		  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
+		  __return_address, bp->b_bn);
+
+	xfs_alert(mp, "Unmount and run xfs_repair");
+
+	if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+		xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:");
+		xfs_hex_dump(xfs_buf_offset(bp, 0), 64);
+	}
+
+	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+		xfs_stack_trace();
+}
diff --git a/kernel/fs/xfs/xfs_error.h b/kernel/fs/xfs/xfs_error.h
new file mode 100644
index 000000000..c0394ed12
--- /dev/null
+++ b/kernel/fs/xfs/xfs_error.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_ERROR_H__
+#define	__XFS_ERROR_H__
+
+struct xfs_mount;
+
+extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+			const char *filename, int linenum, inst_t *ra);
+extern void xfs_corruption_error(const char *tag, int level,
+			struct xfs_mount *mp, void *p, const char *filename,
+			int linenum, inst_t *ra);
+extern void xfs_verifier_error(struct xfs_buf *bp);
+
+#define	XFS_ERROR_REPORT(e, lvl, mp)	\
+	xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
+#define	XFS_CORRUPTION_ERROR(e, lvl, mp, mem)	\
+	xfs_corruption_error(e, lvl, mp, mem, \
+			     __FILE__, __LINE__, __return_address)
+
+#define XFS_ERRLEVEL_OFF	0
+#define XFS_ERRLEVEL_LOW	1
+#define XFS_ERRLEVEL_HIGH	5
+
+/*
+ * Macros to set EFSCORRUPTED & return/branch.
+ */
+#define	XFS_WANT_CORRUPTED_GOTO(mp, x, l)	\
+	{ \
+		int fs_is_ok = (x); \
+		ASSERT(fs_is_ok); \
+		if (unlikely(!fs_is_ok)) { \
+			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
+					 XFS_ERRLEVEL_LOW, mp); \
+			error = -EFSCORRUPTED; \
+			goto l; \
+		} \
+	}
+
+#define	XFS_WANT_CORRUPTED_RETURN(mp, x)	\
+	{ \
+		int fs_is_ok = (x); \
+		ASSERT(fs_is_ok); \
+		if (unlikely(!fs_is_ok)) { \
+			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
+					 XFS_ERRLEVEL_LOW, mp); \
+			return -EFSCORRUPTED; \
+		} \
+	}
+
+/*
+ * error injection tags - the labels can be anything you want
+ * but each tag should have its own unique number
+ */
+
+#define XFS_ERRTAG_NOERROR				0
+#define XFS_ERRTAG_IFLUSH_1				1
+#define XFS_ERRTAG_IFLUSH_2				2
+#define XFS_ERRTAG_IFLUSH_3				3
+#define XFS_ERRTAG_IFLUSH_4				4
+#define XFS_ERRTAG_IFLUSH_5				5
+#define XFS_ERRTAG_IFLUSH_6				6
+#define	XFS_ERRTAG_DA_READ_BUF				7
+#define	XFS_ERRTAG_BTREE_CHECK_LBLOCK			8
+#define	XFS_ERRTAG_BTREE_CHECK_SBLOCK			9
+#define	XFS_ERRTAG_ALLOC_READ_AGF			10
+#define	XFS_ERRTAG_IALLOC_READ_AGI			11
+#define	XFS_ERRTAG_ITOBP_INOTOBP			12
+#define	XFS_ERRTAG_IUNLINK				13
+#define	XFS_ERRTAG_IUNLINK_REMOVE			14
+#define	XFS_ERRTAG_DIR_INO_VALIDATE			15
+#define XFS_ERRTAG_BULKSTAT_READ_CHUNK			16
+#define XFS_ERRTAG_IODONE_IOERR				17
+#define XFS_ERRTAG_STRATREAD_IOERR			18
+#define XFS_ERRTAG_STRATCMPL_IOERR			19
+#define XFS_ERRTAG_DIOWRITE_IOERR			20
+#define XFS_ERRTAG_BMAPIFORMAT				21
+#define XFS_ERRTAG_MAX					22
+
+/*
+ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
+ */
+#define XFS_RANDOM_DEFAULT				100
+#define XFS_RANDOM_IFLUSH_1				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_2				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_3				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_4				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_5				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_6				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DA_READ_BUF				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BTREE_CHECK_LBLOCK			(XFS_RANDOM_DEFAULT/4)
+#define XFS_RANDOM_BTREE_CHECK_SBLOCK			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ALLOC_READ_AGF			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IALLOC_READ_AGI			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ITOBP_INOTOBP			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK_REMOVE			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DIR_INO_VALIDATE			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BULKSTAT_READ_CHUNK			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IODONE_IOERR				(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATREAD_IOERR			(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATCMPL_IOERR			(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_DIOWRITE_IOERR			(XFS_RANDOM_DEFAULT/10)
+#define	XFS_RANDOM_BMAPIFORMAT				XFS_RANDOM_DEFAULT
+
+#ifdef DEBUG
+extern int xfs_error_test_active;
+extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
+
+#define	XFS_NUM_INJECT_ERROR				10
+#define XFS_TEST_ERROR(expr, mp, tag, rf)		\
+	((expr) || (xfs_error_test_active && \
+	 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
+			(rf))))
+
+extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
+#else
+#define XFS_TEST_ERROR(expr, mp, tag, rf)	(expr)
+#define xfs_errortag_add(tag, mp)		(ENOSYS)
+#define xfs_errortag_clearall(mp, loud)		(ENOSYS)
+#endif /* DEBUG */
+
+/*
+ * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
+ *			a panic by setting xfs_panic_mask in a sysctl.
+ */
+#define		XFS_NO_PTAG			0
+#define		XFS_PTAG_IFLUSH			0x00000001
+#define		XFS_PTAG_LOGRES			0x00000002
+#define		XFS_PTAG_AILDELETE		0x00000004
+#define		XFS_PTAG_ERROR_REPORT		0x00000008
+#define		XFS_PTAG_SHUTDOWN_CORRUPT	0x00000010
+#define		XFS_PTAG_SHUTDOWN_IOERROR	0x00000020
+#define		XFS_PTAG_SHUTDOWN_LOGERROR	0x00000040
+#define		XFS_PTAG_FSBLOCK_ZERO		0x00000080
+
+#endif	/* __XFS_ERROR_H__ */
diff --git a/kernel/fs/xfs/xfs_export.c b/kernel/fs/xfs/xfs_export.c
new file mode 100644
index 000000000..652cd3c5b
--- /dev/null
+++ b/kernel/fs/xfs/xfs_export.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_export.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_pnfs.h"
+
+/*
+ * Note that we only accept fileids which are long enough rather than allow
+ * the parent generation number to default to zero.  XFS considers zero a
+ * valid generation number not an invalid/wildcard value.
+ */
+static int xfs_fileid_length(int fileid_type)
+{
+	switch (fileid_type) {
+	case FILEID_INO32_GEN:
+		return 2;
+	case FILEID_INO32_GEN_PARENT:
+		return 4;
+	case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+		return 3;
+	case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+		return 6;
+	}
+	return FILEID_INVALID;
+}
+
+STATIC int
+xfs_fs_encode_fh(
+	struct inode	*inode,
+	__u32		*fh,
+	int		*max_len,
+	struct inode	*parent)
+{
+	struct fid		*fid = (struct fid *)fh;
+	struct xfs_fid64	*fid64 = (struct xfs_fid64 *)fh;
+	int			fileid_type;
+	int			len;
+
+	/* Directories don't need their parent encoded, they have ".." */
+	if (!parent)
+		fileid_type = FILEID_INO32_GEN;
+	else
+		fileid_type = FILEID_INO32_GEN_PARENT;
+
+	/*
+	 * If the the filesystem may contain 64bit inode numbers, we need
+	 * to use larger file handles that can represent them.
+	 *
+	 * While we only allocate inodes that do not fit into 32 bits any
+	 * large enough filesystem may contain them, thus the slightly
+	 * confusing looking conditional below.
+	 */
+	if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+	    (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
+		fileid_type |= XFS_FILEID_TYPE_64FLAG;
+
+	/*
+	 * Only encode if there is enough space given.  In practice
+	 * this means we can't export a filesystem with 64bit inodes
+	 * over NFSv2 with the subtree_check export option; the other
+	 * seven combinations work.  The real answer is "don't use v2".
+	 */
+	len = xfs_fileid_length(fileid_type);
+	if (*max_len < len) {
+		*max_len = len;
+		return FILEID_INVALID;
+	}
+	*max_len = len;
+
+	switch (fileid_type) {
+	case FILEID_INO32_GEN_PARENT:
+		fid->i32.parent_ino = XFS_I(parent)->i_ino;
+		fid->i32.parent_gen = parent->i_generation;
+		/*FALLTHRU*/
+	case FILEID_INO32_GEN:
+		fid->i32.ino = XFS_I(inode)->i_ino;
+		fid->i32.gen = inode->i_generation;
+		break;
+	case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+		fid64->parent_ino = XFS_I(parent)->i_ino;
+		fid64->parent_gen = parent->i_generation;
+		/*FALLTHRU*/
+	case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+		fid64->ino = XFS_I(inode)->i_ino;
+		fid64->gen = inode->i_generation;
+		break;
+	}
+
+	return fileid_type;
+}
+
+STATIC struct inode *
+xfs_nfs_get_inode(
+	struct super_block	*sb,
+	u64			ino,
+	u32			generation)
+ {
+ 	xfs_mount_t		*mp = XFS_M(sb);
+	xfs_inode_t		*ip;
+	int			error;
+
+	/*
+	 * NFS can sometimes send requests for ino 0.  Fail them gracefully.
+	 */
+	if (ino == 0)
+		return ERR_PTR(-ESTALE);
+
+	/*
+	 * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
+	 * fine and not an indication of a corrupted filesystem as clients can
+	 * send invalid file handles and we have to handle it gracefully..
+	 */
+	error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
+	if (error) {
+		/*
+		 * EINVAL means the inode cluster doesn't exist anymore.
+		 * This implies the filehandle is stale, so we should
+		 * translate it here.
+		 * We don't use ESTALE directly down the chain to not
+		 * confuse applications using bulkstat that expect EINVAL.
+		 */
+		if (error == -EINVAL || error == -ENOENT)
+			error = -ESTALE;
+		return ERR_PTR(error);
+	}
+
+	if (ip->i_d.di_gen != generation) {
+		IRELE(ip);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return VFS_I(ip);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		 int fh_len, int fileid_type)
+{
+	struct xfs_fid64	*fid64 = (struct xfs_fid64 *)fid;
+	struct inode		*inode = NULL;
+
+	if (fh_len < xfs_fileid_length(fileid_type))
+		return NULL;
+
+	switch (fileid_type) {
+	case FILEID_INO32_GEN_PARENT:
+	case FILEID_INO32_GEN:
+		inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);
+		break;
+	case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+	case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+		inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen);
+		break;
+	}
+
+	return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		 int fh_len, int fileid_type)
+{
+	struct xfs_fid64	*fid64 = (struct xfs_fid64 *)fid;
+	struct inode		*inode = NULL;
+
+	if (fh_len < xfs_fileid_length(fileid_type))
+		return NULL;
+
+	switch (fileid_type) {
+	case FILEID_INO32_GEN_PARENT:
+		inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
+					      fid->i32.parent_gen);
+		break;
+	case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+		inode = xfs_nfs_get_inode(sb, fid64->parent_ino,
+					      fid64->parent_gen);
+		break;
+	}
+
+	return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_get_parent(
+	struct dentry		*child)
+{
+	int			error;
+	struct xfs_inode	*cip;
+
+	error = xfs_lookup(XFS_I(d_inode(child)), &xfs_name_dotdot, &cip, NULL);
+	if (unlikely(error))
+		return ERR_PTR(error);
+
+	return d_obtain_alias(VFS_I(cip));
+}
+
+STATIC int
+xfs_fs_nfs_commit_metadata(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_lsn_t		lsn = 0;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (xfs_ipincount(ip))
+		lsn = ip->i_itemp->ili_last_lsn;
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!lsn)
+		return 0;
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+}
+
+const struct export_operations xfs_export_operations = {
+	.encode_fh		= xfs_fs_encode_fh,
+	.fh_to_dentry		= xfs_fs_fh_to_dentry,
+	.fh_to_parent		= xfs_fs_fh_to_parent,
+	.get_parent		= xfs_fs_get_parent,
+	.commit_metadata	= xfs_fs_nfs_commit_metadata,
+#ifdef CONFIG_NFSD_PNFS
+	.get_uuid		= xfs_fs_get_uuid,
+	.map_blocks		= xfs_fs_map_blocks,
+	.commit_blocks		= xfs_fs_commit_blocks,
+#endif
+};
diff --git a/kernel/fs/xfs/xfs_export.h b/kernel/fs/xfs/xfs_export.h
new file mode 100644
index 000000000..3272b6ae7
--- /dev/null
+++ b/kernel/fs/xfs/xfs_export.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_EXPORT_H__
+#define __XFS_EXPORT_H__
+
+/*
+ * Common defines for code related to exporting XFS filesystems over NFS.
+ *
+ * The NFS fileid goes out on the wire as an array of
+ * 32bit unsigned ints in host order.  There are 5 possible
+ * formats.
+ *
+ * (1)	fileid_type=0x00
+ *	(no fileid data; handled by the generic code)
+ *
+ * (2)	fileid_type=0x01
+ *	inode-num
+ *	generation
+ *
+ * (3)	fileid_type=0x02
+ *	inode-num
+ *	generation
+ *	parent-inode-num
+ *	parent-generation
+ *
+ * (4)	fileid_type=0x81
+ *	inode-num-lo32
+ *	inode-num-hi32
+ *	generation
+ *
+ * (5)	fileid_type=0x82
+ *	inode-num-lo32
+ *	inode-num-hi32
+ *	generation
+ *	parent-inode-num-lo32
+ *	parent-inode-num-hi32
+ *	parent-generation
+ *
+ * Note, the NFS filehandle also includes an fsid portion which
+ * may have an inode number in it.  That number is hardcoded to
+ * 32bits and there is no way for XFS to intercept it.  In
+ * practice this means when exporting an XFS filesystem with 64bit
+ * inodes you should either export the mountpoint (rather than
+ * a subdirectory) or use the "fsid" export option.
+ */
+
+struct xfs_fid64 {
+	u64 ino;
+	u32 gen;
+	u64 parent_ino;
+	u32 parent_gen;
+} __attribute__((packed));
+
+/* This flag goes on the wire.  Don't play with it. */
+#define XFS_FILEID_TYPE_64FLAG	0x80	/* NFS fileid has 64bit inodes */
+
+#endif	/* __XFS_EXPORT_H__ */
diff --git a/kernel/fs/xfs/xfs_extent_busy.c b/kernel/fs/xfs/xfs_extent_busy.c
new file mode 100644
index 000000000..c263e0792
--- /dev/null
+++ b/kernel/fs/xfs/xfs_extent_busy.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+
+void
+xfs_extent_busy_insert(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	unsigned int		flags)
+{
+	struct xfs_extent_busy	*new;
+	struct xfs_extent_busy	*busyp;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent = NULL;
+
+	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
+	if (!new) {
+		/*
+		 * No Memory!  Since it is now not possible to track the free
+		 * block, make this a synchronous transaction to insure that
+		 * the block is not reused before this transaction commits.
+		 */
+		trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
+		xfs_trans_set_sync(tp);
+		return;
+	}
+
+	new->agno = agno;
+	new->bno = bno;
+	new->length = len;
+	INIT_LIST_HEAD(&new->list);
+	new->flags = flags;
+
+	/* trace before insert to be able to see failed inserts */
+	trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);
+
+	pag = xfs_perag_get(tp->t_mountp, new->agno);
+	spin_lock(&pag->pagb_lock);
+	rbp = &pag->pagb_tree.rb_node;
+	while (*rbp) {
+		parent = *rbp;
+		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
+
+		if (new->bno < busyp->bno) {
+			rbp = &(*rbp)->rb_left;
+			ASSERT(new->bno + new->length <= busyp->bno);
+		} else if (new->bno > busyp->bno) {
+			rbp = &(*rbp)->rb_right;
+			ASSERT(bno >= busyp->bno + busyp->length);
+		} else {
+			ASSERT(0);
+		}
+	}
+
+	rb_link_node(&new->rb_node, parent, rbp);
+	rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+	list_add(&new->list, &tp->t_busy);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+}
+
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate.  You need to be holding the busy extent tree lock when calling
+ * xfs_extent_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
+ */
+int
+xfs_extent_busy_search(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len)
+{
+	struct xfs_perag	*pag;
+	struct rb_node		*rbp;
+	struct xfs_extent_busy	*busyp;
+	int			match = 0;
+
+	pag = xfs_perag_get(mp, agno);
+	spin_lock(&pag->pagb_lock);
+
+	rbp = pag->pagb_tree.rb_node;
+
+	/* find closest start bno overlap */
+	while (rbp) {
+		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
+		if (bno < busyp->bno) {
+			/* may overlap, but exact start block is lower */
+			if (bno + len > busyp->bno)
+				match = -1;
+			rbp = rbp->rb_left;
+		} else if (bno > busyp->bno) {
+			/* may overlap, but exact start block is higher */
+			if (bno < busyp->bno + busyp->length)
+				match = -1;
+			rbp = rbp->rb_right;
+		} else {
+			/* bno matches busyp, length determines exact match */
+			match = (busyp->length == len) ? 1 : -1;
+			break;
+		}
+	}
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+	return match;
+}
+
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent.  If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation.  We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_extent_busy_update_extent(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	struct xfs_extent_busy	*busyp,
+	xfs_agblock_t		fbno,
+	xfs_extlen_t		flen,
+	bool			userdata) __releases(&pag->pagb_lock)
+					  __acquires(&pag->pagb_lock)
+{
+	xfs_agblock_t		fend = fbno + flen;
+	xfs_agblock_t		bbno = busyp->bno;
+	xfs_agblock_t		bend = bbno + busyp->length;
+
+	/*
+	 * This extent is currently being discarded.  Give the thread
+	 * performing the discard a chance to mark the extent unbusy
+	 * and retry.
+	 */
+	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
+		spin_unlock(&pag->pagb_lock);
+		delay(1);
+		spin_lock(&pag->pagb_lock);
+		return false;
+	}
+
+	/*
+	 * If there is a busy extent overlapping a user allocation, we have
+	 * no choice but to force the log and retry the search.
+	 *
+	 * Fortunately this does not happen during normal operation, but
+	 * only if the filesystem is very low on space and has to dip into
+	 * the AGFL for normal allocations.
+	 */
+	if (userdata)
+		goto out_force_log;
+
+	if (bbno < fbno && bend > fend) {
+		/*
+		 * Case 1:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +---------+
+		 *        fbno   fend
+		 */
+
+		/*
+		 * We would have to split the busy extent to be able to track
+		 * it correct, which we cannot do because we would have to
+		 * modify the list of busy extents attached to the transaction
+		 * or CIL context, which is immutable.
+		 *
+		 * Force out the log to clear the busy extent and retry the
+		 * search.
+		 */
+		goto out_force_log;
+	} else if (bbno >= fbno && bend <= fend) {
+		/*
+		 * Case 2:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *    +-----------------+
+		 *    fbno           fend
+		 *
+		 * Case 3:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *    +--------------------------+
+		 *    fbno                    fend
+		 *
+		 * Case 4:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +--------------------------+
+		 *    fbno                    fend
+		 *
+		 * Case 5:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +-----------------------------------+
+		 *    fbno                             fend
+		 *
+		 */
+
+		/*
+		 * The busy extent is fully covered by the extent we are
+		 * allocating, and can simply be removed from the rbtree.
+		 * However we cannot remove it from the immutable list
+		 * tracking busy extents in the transaction or CIL context,
+		 * so set the length to zero to mark it invalid.
+		 *
+		 * We also need to restart the busy extent search from the
+		 * tree root, because erasing the node can rearrange the
+		 * tree topology.
+		 */
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+		busyp->length = 0;
+		return false;
+	} else if (fend < bend) {
+		/*
+		 * Case 6:
+		 *              bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *             +---------+
+		 *             fbno   fend
+		 *
+		 * Case 7:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +------------------+
+		 *    fbno            fend
+		 *
+		 */
+		busyp->bno = fend;
+	} else if (bbno < fbno) {
+		/*
+		 * Case 8:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +-------------+
+		 *        fbno       fend
+		 *
+		 * Case 9:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +----------------------+
+		 *        fbno                fend
+		 */
+		busyp->length = fbno - busyp->bno;
+	} else {
+		ASSERT(0);
+	}
+
+	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
+	return true;
+
+out_force_log:
+	spin_unlock(&pag->pagb_lock);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
+	spin_lock(&pag->pagb_lock);
+	return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
+void
+xfs_extent_busy_reuse(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		fbno,
+	xfs_extlen_t		flen,
+	bool			userdata)
+{
+	struct xfs_perag	*pag;
+	struct rb_node		*rbp;
+
+	ASSERT(flen > 0);
+
+	pag = xfs_perag_get(mp, agno);
+	spin_lock(&pag->pagb_lock);
+restart:
+	rbp = pag->pagb_tree.rb_node;
+	while (rbp) {
+		struct xfs_extent_busy *busyp =
+			rb_entry(rbp, struct xfs_extent_busy, rb_node);
+		xfs_agblock_t	bbno = busyp->bno;
+		xfs_agblock_t	bend = bbno + busyp->length;
+
+		if (fbno + flen <= bbno) {
+			rbp = rbp->rb_left;
+			continue;
+		} else if (fbno >= bend) {
+			rbp = rbp->rb_right;
+			continue;
+		}
+
+		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
+						  userdata))
+			goto restart;
+	}
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy.  If *rlen is smaller than
+ * args->minlen no suitable extent could be found, and the higher level
+ * code needs to force out the log and retry the allocation.
+ */
+void
+xfs_extent_busy_trim(
+	struct xfs_alloc_arg	*args,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	xfs_agblock_t		*rbno,
+	xfs_extlen_t		*rlen)
+{
+	xfs_agblock_t		fbno;
+	xfs_extlen_t		flen;
+	struct rb_node		*rbp;
+
+	ASSERT(len > 0);
+
+	spin_lock(&args->pag->pagb_lock);
+restart:
+	fbno = bno;
+	flen = len;
+	rbp = args->pag->pagb_tree.rb_node;
+	while (rbp && flen >= args->minlen) {
+		struct xfs_extent_busy *busyp =
+			rb_entry(rbp, struct xfs_extent_busy, rb_node);
+		xfs_agblock_t	fend = fbno + flen;
+		xfs_agblock_t	bbno = busyp->bno;
+		xfs_agblock_t	bend = bbno + busyp->length;
+
+		if (fend <= bbno) {
+			rbp = rbp->rb_left;
+			continue;
+		} else if (fbno >= bend) {
+			rbp = rbp->rb_right;
+			continue;
+		}
+
+		/*
+		 * If this is a metadata allocation, try to reuse the busy
+		 * extent instead of trimming the allocation.
+		 */
+		if (!args->userdata &&
+		    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
+			if (!xfs_extent_busy_update_extent(args->mp, args->pag,
+							  busyp, fbno, flen,
+							  false))
+				goto restart;
+			continue;
+		}
+
+		if (bbno <= fbno) {
+			/* start overlap */
+
+			/*
+			 * Case 1:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +---------+
+			 *        fbno   fend
+			 *
+			 * Case 2:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +-------------+
+			 *    fbno       fend
+			 *
+			 * Case 3:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +-------------+
+			 *        fbno       fend
+			 *
+			 * Case 4:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +-----------------+
+			 *    fbno           fend
+			 *
+			 * No unbusy region in extent, return failure.
+			 */
+			if (fend <= bend)
+				goto fail;
+
+			/*
+			 * Case 5:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +----------------------+
+			 *        fbno                fend
+			 *
+			 * Case 6:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +--------------------------+
+			 *    fbno                    fend
+			 *
+			 * Needs to be trimmed to:
+			 *                       +-------+
+			 *                       fbno fend
+			 */
+			fbno = bend;
+		} else if (bend >= fend) {
+			/* end overlap */
+
+			/*
+			 * Case 7:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +------------------+
+			 *    fbno            fend
+			 *
+			 * Case 8:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +--------------------------+
+			 *    fbno                    fend
+			 *
+			 * Needs to be trimmed to:
+			 *    +-------+
+			 *    fbno fend
+			 */
+			fend = bbno;
+		} else {
+			/* middle overlap */
+
+			/*
+			 * Case 9:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +-----------------------------------+
+			 *    fbno                             fend
+			 *
+			 * Can be trimmed to:
+			 *    +-------+        OR         +-------+
+			 *    fbno fend                   fbno fend
+			 *
+			 * Backward allocation leads to significant
+			 * fragmentation of directories, which degrades
+			 * directory performance, therefore we always want to
+			 * choose the option that produces forward allocation
+			 * patterns.
+			 * Preferring the lower bno extent will make the next
+			 * request use "fend" as the start of the next
+			 * allocation;  if the segment is no longer busy at
+			 * that point, we'll get a contiguous allocation, but
+			 * even if it is still busy, we will get a forward
+			 * allocation.
+			 * We try to avoid choosing the segment at "bend",
+			 * because that can lead to the next allocation
+			 * taking the segment at "fbno", which would be a
+			 * backward allocation.  We only use the segment at
+			 * "fbno" if it is much larger than the current
+			 * requested size, because in that case there's a
+			 * good chance subsequent allocations will be
+			 * contiguous.
+			 */
+			if (bbno - fbno >= args->maxlen) {
+				/* left candidate fits perfect */
+				fend = bbno;
+			} else if (fend - bend >= args->maxlen * 4) {
+				/* right candidate has enough free space */
+				fbno = bend;
+			} else if (bbno - fbno >= args->minlen) {
+				/* left candidate fits minimum requirement */
+				fend = bbno;
+			} else {
+				goto fail;
+			}
+		}
+
+		flen = fend - fbno;
+	}
+	spin_unlock(&args->pag->pagb_lock);
+
+	if (fbno != bno || flen != len) {
+		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
+					  fbno, flen);
+	}
+	*rbno = fbno;
+	*rlen = flen;
+	return;
+fail:
+	/*
+	 * Return a zero extent length as failure indications.  All callers
+	 * re-check if the trimmed extent satisfies the minlen requirement.
+	 */
+	spin_unlock(&args->pag->pagb_lock);
+	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+	*rbno = fbno;
+	*rlen = 0;
+}
+
+STATIC void
+xfs_extent_busy_clear_one(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	struct xfs_extent_busy	*busyp)
+{
+	if (busyp->length) {
+		trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
+						busyp->length);
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+	}
+
+	list_del_init(&busyp->list);
+	kmem_free(busyp);
+}
+
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ */
+void
+xfs_extent_busy_clear(
+	struct xfs_mount	*mp,
+	struct list_head	*list,
+	bool			do_discard)
+{
+	struct xfs_extent_busy	*busyp, *n;
+	struct xfs_perag	*pag = NULL;
+	xfs_agnumber_t		agno = NULLAGNUMBER;
+
+	list_for_each_entry_safe(busyp, n, list, list) {
+		if (busyp->agno != agno) {
+			if (pag) {
+				spin_unlock(&pag->pagb_lock);
+				xfs_perag_put(pag);
+			}
+			pag = xfs_perag_get(mp, busyp->agno);
+			spin_lock(&pag->pagb_lock);
+			agno = busyp->agno;
+		}
+
+		if (do_discard && busyp->length &&
+		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
+			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
+		else
+			xfs_extent_busy_clear_one(mp, pag, busyp);
+	}
+
+	if (pag) {
+		spin_unlock(&pag->pagb_lock);
+		xfs_perag_put(pag);
+	}
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ */
+int
+xfs_extent_busy_ag_cmp(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	return container_of(a, struct xfs_extent_busy, list)->agno -
+		container_of(b, struct xfs_extent_busy, list)->agno;
+}
diff --git a/kernel/fs/xfs/xfs_extent_busy.h b/kernel/fs/xfs/xfs_extent_busy.h
new file mode 100644
index 000000000..bfff284d2
--- /dev/null
+++ b/kernel/fs/xfs/xfs_extent_busy.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_EXTENT_BUSY_H__
+#define	__XFS_EXTENT_BUSY_H__
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_alloc_arg;
+
+/*
+ * Busy block/extent entry.  Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_extent_busy_insert() for details.
+ */
+struct xfs_extent_busy {
+	struct rb_node	rb_node;	/* ag by-bno indexed search tree */
+	struct list_head list;		/* transaction busy extent list */
+	xfs_agnumber_t	agno;
+	xfs_agblock_t	bno;
+	xfs_extlen_t	length;
+	unsigned int	flags;
+#define XFS_EXTENT_BUSY_DISCARDED	0x01	/* undergoing a discard op. */
+#define XFS_EXTENT_BUSY_SKIP_DISCARD	0x02	/* do not discard */
+};
+
+void
+xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+	xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+
+void
+xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
+	bool do_discard);
+
+int
+xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+	xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+	xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+void
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
+	xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
+
+int
+xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+
+static inline void xfs_extent_busy_sort(struct list_head *list)
+{
+	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
+}
+
+#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/kernel/fs/xfs/xfs_extfree_item.c b/kernel/fs/xfs/xfs_extfree_item.c
new file mode 100644
index 000000000..cb7fe64cd
--- /dev/null
+++ b/kernel/fs/xfs/xfs_extfree_item.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_log.h"
+
+
+kmem_zone_t	*xfs_efi_zone;
+kmem_zone_t	*xfs_efd_zone;
+
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
+
+void
+xfs_efi_item_free(
+	struct xfs_efi_log_item	*efip)
+{
+	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
+		kmem_free(efip);
+	else
+		kmem_zone_free(xfs_efi_zone, efip);
+}
+
+/*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
+ */
+STATIC void
+__xfs_efi_release(
+	struct xfs_efi_log_item	*efip)
+{
+	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
+
+	if (atomic_dec_and_test(&efip->efi_refcount)) {
+		spin_lock(&ailp->xa_lock);
+		/* xfs_trans_ail_delete() drops the AIL lock. */
+		xfs_trans_ail_delete(ailp, &efip->efi_item,
+				     SHUTDOWN_LOG_IO_ERROR);
+		xfs_efi_item_free(efip);
+	}
+}
+
+/*
+ * This returns the number of iovecs needed to log the given efi item.
+ * We only need 1 iovec for an efi item.  It just logs the efi_log_format
+ * structure.
+ */
+static inline int
+xfs_efi_item_sizeof(
+	struct xfs_efi_log_item *efip)
+{
+	return sizeof(struct xfs_efi_log_format) +
+	       (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
+xfs_efi_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efi log item. We use only 1 iovec, and we point that
+ * at the efi_log_format structure embedded in the efi item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efi item have been filled.
+ */
+STATIC void
+xfs_efi_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	ASSERT(atomic_read(&efip->efi_next_extent) ==
+				efip->efi_format.efi_nextents);
+
+	efip->efi_format.efi_type = XFS_LI_EFI;
+	efip->efi_format.efi_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
+			&efip->efi_format,
+			xfs_efi_item_sizeof(efip));
+}
+
+
+/*
+ * Pinning has no meaning for an efi item, so just return.
+ */
+STATIC void
+xfs_efi_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * While EFIs cannot really be pinned, the unpin operation is the last place at
+ * which the EFI is manipulated during a transaction.  If we are being asked to
+ * remove the EFI it's because the transaction has been cancelled and by
+ * definition that means the EFI cannot be in the AIL so remove it from the
+ * transaction and free it.  Otherwise coordinate with xfs_efi_release()
+ * to determine who gets to free the EFI.
+ */
+STATIC void
+xfs_efi_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+
+	if (remove) {
+		ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
+		if (lip->li_desc)
+			xfs_trans_del_item(lip);
+		xfs_efi_item_free(efip);
+		return;
+	}
+	__xfs_efi_release(efip);
+}
+
+/*
+ * Efi items have no locking or pushing.  However, since EFIs are pulled from
+ * the AIL when their corresponding EFDs are committed to disk, their situation
+ * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log.  This should help in getting the EFI out of
+ * the AIL.
+ */
+STATIC uint
+xfs_efi_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+STATIC void
+xfs_efi_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efi_item_free(EFI_ITEM(lip));
+}
+
+/*
+ * The EFI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_efi_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	return lsn;
+}
+
+/*
+ * The EFI dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_efi_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all efi log items.
+ */
+static const struct xfs_item_ops xfs_efi_item_ops = {
+	.iop_size	= xfs_efi_item_size,
+	.iop_format	= xfs_efi_item_format,
+	.iop_pin	= xfs_efi_item_pin,
+	.iop_unpin	= xfs_efi_item_unpin,
+	.iop_unlock	= xfs_efi_item_unlock,
+	.iop_committed	= xfs_efi_item_committed,
+	.iop_push	= xfs_efi_item_push,
+	.iop_committing = xfs_efi_item_committing
+};
+
+
+/*
+ * Allocate and initialize an efi item with the given number of extents.
+ */
+struct xfs_efi_log_item *
+xfs_efi_init(
+	struct xfs_mount	*mp,
+	uint			nextents)
+
+{
+	struct xfs_efi_log_item	*efip;
+	uint			size;
+
+	ASSERT(nextents > 0);
+	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
+		size = (uint)(sizeof(xfs_efi_log_item_t) +
+			((nextents - 1) * sizeof(xfs_extent_t)));
+		efip = kmem_zalloc(size, KM_SLEEP);
+	} else {
+		efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
+	}
+
+	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
+	efip->efi_format.efi_nextents = nextents;
+	efip->efi_format.efi_id = (__psint_t)(void*)efip;
+	atomic_set(&efip->efi_next_extent, 0);
+	atomic_set(&efip->efi_refcount, 2);
+
+	return efip;
+}
+
+/*
+ * Copy an EFI format buffer from the given buf, and into the destination
+ * EFI format structure.
+ * The given buffer can be in 32 bit or 64 bit form (which has different padding),
+ * one of which will be the native format for this kernel.
+ * It will handle the conversion of formats if necessary.
+ */
+int
+xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+{
+	xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
+	uint i;
+	uint len = sizeof(xfs_efi_log_format_t) + 
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);  
+	uint len32 = sizeof(xfs_efi_log_format_32_t) + 
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);  
+	uint len64 = sizeof(xfs_efi_log_format_64_t) + 
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);  
+
+	if (buf->i_len == len) {
+		memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+		return 0;
+	} else if (buf->i_len == len32) {
+		xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
+
+		dst_efi_fmt->efi_type     = src_efi_fmt_32->efi_type;
+		dst_efi_fmt->efi_size     = src_efi_fmt_32->efi_size;
+		dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents;
+		dst_efi_fmt->efi_id       = src_efi_fmt_32->efi_id;
+		for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+			dst_efi_fmt->efi_extents[i].ext_start =
+				src_efi_fmt_32->efi_extents[i].ext_start;
+			dst_efi_fmt->efi_extents[i].ext_len =
+				src_efi_fmt_32->efi_extents[i].ext_len;
+		}
+		return 0;
+	} else if (buf->i_len == len64) {
+		xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
+
+		dst_efi_fmt->efi_type     = src_efi_fmt_64->efi_type;
+		dst_efi_fmt->efi_size     = src_efi_fmt_64->efi_size;
+		dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents;
+		dst_efi_fmt->efi_id       = src_efi_fmt_64->efi_id;
+		for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+			dst_efi_fmt->efi_extents[i].ext_start =
+				src_efi_fmt_64->efi_extents[i].ext_start;
+			dst_efi_fmt->efi_extents[i].ext_len =
+				src_efi_fmt_64->efi_extents[i].ext_len;
+		}
+		return 0;
+	}
+	return -EFSCORRUPTED;
+}
+
+/*
+ * This is called by the efd item code below to release references to the given
+ * efi item.  Each efd calls this with the number of extents that it has
+ * logged, and when the sum of these reaches the total number of extents logged
+ * by this efi item we can free the efi item.
+ */
+void
+xfs_efi_release(xfs_efi_log_item_t	*efip,
+		uint			nextents)
+{
+	ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
+	if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
+		/* recovery needs us to drop the EFI reference, too */
+		if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
+			__xfs_efi_release(efip);
+
+		__xfs_efi_release(efip);
+		/* efip may now have been freed, do not reference it again. */
+	}
+}
+
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
+
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
+		kmem_free(efdp);
+	else
+		kmem_zone_free(xfs_efd_zone, efdp);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given efd item.
+ * We only need 1 iovec for an efd item.  It just logs the efd_log_format
+ * structure.
+ */
+static inline int
+xfs_efd_item_sizeof(
+	struct xfs_efd_log_item *efdp)
+{
+	return sizeof(xfs_efd_log_format_t) +
+	       (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
+xfs_efd_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efd log item. We use only 1 iovec, and we point that
+ * at the efd_log_format structure embedded in the efd item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efd item have been filled.
+ */
+STATIC void
+xfs_efd_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
+
+	efdp->efd_format.efd_type = XFS_LI_EFD;
+	efdp->efd_format.efd_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
+			&efdp->efd_format,
+			xfs_efd_item_sizeof(efdp));
+}
+
+/*
+ * Pinning has no meaning for an efd item, so just return.
+ */
+STATIC void
+xfs_efd_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an efd item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_efd_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on an efd item.  It is simply stuck
+ * waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_efd_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+STATIC void
+xfs_efd_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efd_item_free(EFD_ITEM(lip));
+}
+
+/*
+ * When the efd item is committed to disk, all we need to do
+ * is delete our reference to our partner efi item and then
+ * free ourselves.  Since we're freeing ourselves we must
+ * return -1 to keep the transaction code from further referencing
+ * this item.
+ */
+STATIC xfs_lsn_t
+xfs_efd_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+
+	/*
+	 * If we got a log I/O error, it's always the case that the LR with the
+	 * EFI got unpinned and freed before the EFD got aborted.
+	 */
+	if (!(lip->li_flags & XFS_LI_ABORTED))
+		xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
+
+	xfs_efd_item_free(efdp);
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * The EFD dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_efd_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all efd log items.
+ */
+static const struct xfs_item_ops xfs_efd_item_ops = {
+	.iop_size	= xfs_efd_item_size,
+	.iop_format	= xfs_efd_item_format,
+	.iop_pin	= xfs_efd_item_pin,
+	.iop_unpin	= xfs_efd_item_unpin,
+	.iop_unlock	= xfs_efd_item_unlock,
+	.iop_committed	= xfs_efd_item_committed,
+	.iop_push	= xfs_efd_item_push,
+	.iop_committing = xfs_efd_item_committing
+};
+
+/*
+ * Allocate and initialize an efd item with the given number of extents.
+ */
+struct xfs_efd_log_item *
+xfs_efd_init(
+	struct xfs_mount	*mp,
+	struct xfs_efi_log_item	*efip,
+	uint			nextents)
+
+{
+	struct xfs_efd_log_item	*efdp;
+	uint			size;
+
+	ASSERT(nextents > 0);
+	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
+		size = (uint)(sizeof(xfs_efd_log_item_t) +
+			((nextents - 1) * sizeof(xfs_extent_t)));
+		efdp = kmem_zalloc(size, KM_SLEEP);
+	} else {
+		efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
+	}
+
+	xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
+	efdp->efd_efip = efip;
+	efdp->efd_format.efd_nextents = nextents;
+	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+	return efdp;
+}
diff --git a/kernel/fs/xfs/xfs_extfree_item.h b/kernel/fs/xfs/xfs_extfree_item.h
new file mode 100644
index 000000000..0ffbce32d
--- /dev/null
+++ b/kernel/fs/xfs/xfs_extfree_item.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_EXTFREE_ITEM_H__
+#define	__XFS_EXTFREE_ITEM_H__
+
+/* kernel only EFI/EFD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define	XFS_EFI_MAX_FAST_EXTENTS	16
+
+/*
+ * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define	XFS_EFI_RECOVERED	1
+
+/*
+ * This is the "extent free intention" log item.  It is used to log the fact
+ * that some extents need to be free.  It is used in conjunction with the
+ * "extent free done" log item described below.
+ *
+ * The EFI is reference counted so that it is not freed prior to both the EFI
+ * and EFD being committed and unpinned. This ensures that when the last
+ * reference goes away the EFI will always be in the AIL as it has been
+ * unpinned, regardless of whether the EFD is processed before or after the EFI.
+ */
+typedef struct xfs_efi_log_item {
+	xfs_log_item_t		efi_item;
+	atomic_t		efi_refcount;
+	atomic_t		efi_next_extent;
+	unsigned long		efi_flags;	/* misc flags */
+	xfs_efi_log_format_t	efi_format;
+} xfs_efi_log_item_t;
+
+/*
+ * This is the "extent free done" log item.  It is used to log
+ * the fact that some extents earlier mentioned in an efi item
+ * have been freed.
+ */
+typedef struct xfs_efd_log_item {
+	xfs_log_item_t		efd_item;
+	xfs_efi_log_item_t	*efd_efip;
+	uint			efd_next_extent;
+	xfs_efd_log_format_t	efd_format;
+} xfs_efd_log_item_t;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define	XFS_EFD_MAX_FAST_EXTENTS	16
+
+extern struct kmem_zone	*xfs_efi_zone;
+extern struct kmem_zone	*xfs_efd_zone;
+
+xfs_efi_log_item_t	*xfs_efi_init(struct xfs_mount *, uint);
+xfs_efd_log_item_t	*xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
+				      uint);
+int			xfs_efi_copy_format(xfs_log_iovec_t *buf,
+					    xfs_efi_log_format_t *dst_efi_fmt);
+void			xfs_efi_item_free(xfs_efi_log_item_t *);
+
+#endif	/* __XFS_EXTFREE_ITEM_H__ */
diff --git a/kernel/fs/xfs/xfs_file.c b/kernel/fs/xfs/xfs_file.c
new file mode 100644
index 000000000..3b7591224
--- /dev/null
+++ b/kernel/fs/xfs/xfs_file.c
@@ -0,0 +1,1534 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_icache.h"
+#include "xfs_pnfs.h"
+
+#include <linux/dcache.h>
+#include <linux/falloc.h>
+#include <linux/pagevec.h>
+
+static const struct vm_operations_struct xfs_file_vm_ops;
+
+/*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+	struct xfs_inode	*ip,
+	int			type)
+{
+	if (type & XFS_IOLOCK_EXCL)
+		mutex_lock(&VFS_I(ip)->i_mutex);
+	xfs_ilock(ip, type);
+}
+
+static inline void
+xfs_rw_iunlock(
+	struct xfs_inode	*ip,
+	int			type)
+{
+	xfs_iunlock(ip, type);
+	if (type & XFS_IOLOCK_EXCL)
+		mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+static inline void
+xfs_rw_ilock_demote(
+	struct xfs_inode	*ip,
+	int			type)
+{
+	xfs_ilock_demote(ip, type);
+	if (type & XFS_IOLOCK_EXCL)
+		mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+/*
+ *	xfs_iozero
+ *
+ *	xfs_iozero clears the specified range of buffer supplied,
+ *	and marks all the affected blocks as valid and modified.  If
+ *	an affected block is not allocated, it will be allocated.  If
+ *	an affected block is not completely overwritten, and is not
+ *	valid before the operation, it will be read from disk before
+ *	being partially zeroed.
+ */
+int
+xfs_iozero(
+	struct xfs_inode	*ip,	/* inode			*/
+	loff_t			pos,	/* offset in file		*/
+	size_t			count)	/* size of data to zero		*/
+{
+	struct page		*page;
+	struct address_space	*mapping;
+	int			status;
+
+	mapping = VFS_I(ip)->i_mapping;
+	do {
+		unsigned offset, bytes;
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		status = pagecache_write_begin(NULL, mapping, pos, bytes,
+					AOP_FLAG_UNINTERRUPTIBLE,
+					&page, &fsdata);
+		if (status)
+			break;
+
+		zero_user(page, offset, bytes);
+
+		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+					page, fsdata);
+		WARN_ON(status <= 0); /* can't return less than zero! */
+		pos += bytes;
+		count -= bytes;
+		status = 0;
+	} while (count);
+
+	return status;
+}
+
+int
+xfs_update_prealloc_flags(
+	struct xfs_inode	*ip,
+	enum xfs_prealloc_flags	flags)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
+		ip->i_d.di_mode &= ~S_ISUID;
+		if (ip->i_d.di_mode & S_IXGRP)
+			ip->i_d.di_mode &= ~S_ISGID;
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
+
+	if (flags & XFS_PREALLOC_SET)
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+	if (flags & XFS_PREALLOC_CLEAR)
+		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	if (flags & XFS_PREALLOC_SYNC)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * Fsync operations on directories are much simpler than on regular files,
+ * as there is no file data to flush, and thus also no need for explicit
+ * cache flush operations, and there are no non-transaction metadata updates
+ * on directories either.
+ */
+STATIC int
+xfs_dir_fsync(
+	struct file		*file,
+	loff_t			start,
+	loff_t			end,
+	int			datasync)
+{
+	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_lsn_t		lsn = 0;
+
+	trace_xfs_dir_fsync(ip);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (xfs_ipincount(ip))
+		lsn = ip->i_itemp->ili_last_lsn;
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!lsn)
+		return 0;
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+}
+
+STATIC int
+xfs_file_fsync(
+	struct file		*file,
+	loff_t			start,
+	loff_t			end,
+	int			datasync)
+{
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error = 0;
+	int			log_flushed = 0;
+	xfs_lsn_t		lsn = 0;
+
+	trace_xfs_file_fsync(ip);
+
+	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (error)
+		return error;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+	if (mp->m_flags & XFS_MOUNT_BARRIER) {
+		/*
+		 * If we have an RT and/or log subvolume we need to make sure
+		 * to flush the write cache the device used for file data
+		 * first.  This is to ensure newly written file data make
+		 * it to disk before logging the new inode size in case of
+		 * an extending write.
+		 */
+		if (XFS_IS_REALTIME_INODE(ip))
+			xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+		else if (mp->m_logdev_targp != mp->m_ddev_targp)
+			xfs_blkdev_issue_flush(mp->m_ddev_targp);
+	}
+
+	/*
+	 * All metadata updates are logged, which means that we just have
+	 * to flush the log up to the latest LSN that touched the inode.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (xfs_ipincount(ip)) {
+		if (!datasync ||
+		    (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+			lsn = ip->i_itemp->ili_last_lsn;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (lsn)
+		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+
+	/*
+	 * If we only have a single device, and the log force about was
+	 * a no-op we might have to flush the data device cache here.
+	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
+	 * an already allocated file and thus do not have any metadata to
+	 * commit.
+	 */
+	if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
+	    mp->m_logdev_targp == mp->m_ddev_targp &&
+	    !XFS_IS_REALTIME_INODE(ip) &&
+	    !log_flushed)
+		xfs_blkdev_issue_flush(mp->m_ddev_targp);
+
+	return error;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct file		*file = iocb->ki_filp;
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	size_t			size = iov_iter_count(to);
+	ssize_t			ret = 0;
+	int			ioflags = 0;
+	xfs_fsize_t		n;
+	loff_t			pos = iocb->ki_pos;
+
+	XFS_STATS_INC(xs_read_calls);
+
+	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+		ioflags |= XFS_IO_ISDIRECT;
+	if (file->f_mode & FMODE_NOCMTIME)
+		ioflags |= XFS_IO_INVIS;
+
+	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+		xfs_buftarg_t	*target =
+			XFS_IS_REALTIME_INODE(ip) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp;
+		/* DIO must be aligned to device logical sector size */
+		if ((pos | size) & target->bt_logical_sectormask) {
+			if (pos == i_size_read(inode))
+				return 0;
+			return -EINVAL;
+		}
+	}
+
+	n = mp->m_super->s_maxbytes - pos;
+	if (n <= 0 || size == 0)
+		return 0;
+
+	if (n < size)
+		size = n;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/*
+	 * Locking is a bit tricky here. If we take an exclusive lock
+	 * for direct IO, we effectively serialise all new concurrent
+	 * read IO to this file and block it behind IO that is currently in
+	 * progress because IO in progress holds the IO lock shared. We only
+	 * need to hold the lock exclusive to blow away the page cache, so
+	 * only take lock exclusively if the page cache needs invalidation.
+	 * This allows the normal direct IO case of no page cache pages to
+	 * proceeed concurrently without serialisation.
+	 */
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
+		if (inode->i_mapping->nrpages) {
+			ret = filemap_write_and_wait_range(
+							VFS_I(ip)->i_mapping,
+							pos, pos + size - 1);
+			if (ret) {
+				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+				return ret;
+			}
+
+			/*
+			 * Invalidate whole pages. This can return an error if
+			 * we fail to invalidate a page, but this should never
+			 * happen on XFS. Warn if it does fail.
+			 */
+			ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + size - 1) >> PAGE_CACHE_SHIFT);
+			WARN_ON_ONCE(ret);
+			ret = 0;
+		}
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+	}
+
+	trace_xfs_file_read(ip, size, pos, ioflags);
+
+	ret = generic_file_read_iter(iocb, to);
+	if (ret > 0)
+		XFS_STATS_ADD(xs_read_bytes, ret);
+
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_splice_read(
+	struct file		*infilp,
+	loff_t			*ppos,
+	struct pipe_inode_info	*pipe,
+	size_t			count,
+	unsigned int		flags)
+{
+	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
+	int			ioflags = 0;
+	ssize_t			ret;
+
+	XFS_STATS_INC(xs_read_calls);
+
+	if (infilp->f_mode & FMODE_NOCMTIME)
+		ioflags |= XFS_IO_INVIS;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+
+	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+
+	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+	if (ret > 0)
+		XFS_STATS_ADD(xs_read_bytes, ret);
+
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last block of the
+ * file that is beyond the EOF.  We do this since the size is being increased
+ * without writing anything to that block and we don't want to read the
+ * garbage on the disk.
+ */
+STATIC int				/* error (positive) */
+xfs_zero_last_block(
+	struct xfs_inode	*ip,
+	xfs_fsize_t		offset,
+	xfs_fsize_t		isize,
+	bool			*did_zeroing)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		last_fsb = XFS_B_TO_FSBT(mp, isize);
+	int			zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+	int			zero_len;
+	int			nimaps = 1;
+	int			error = 0;
+	struct xfs_bmbt_irec	imap;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		return error;
+
+	ASSERT(nimaps > 0);
+
+	/*
+	 * If the block underlying isize is just a hole, then there
+	 * is nothing to zero.
+	 */
+	if (imap.br_startblock == HOLESTARTBLOCK)
+		return 0;
+
+	zero_len = mp->m_sb.sb_blocksize - zero_offset;
+	if (isize + zero_len > offset)
+		zero_len = offset - isize;
+	*did_zeroing = true;
+	return xfs_iozero(ip, isize, zero_len);
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new, larger EOF.
+ *
+ * This handles the normal case of zeroing the remainder of the last block in
+ * the file and the unusual case of zeroing blocks out beyond the size of the
+ * file.  This second case only happens with fixed size extents and when the
+ * system crashes before the inode size was updated but after blocks were
+ * allocated.
+ *
+ * Expects the iolock to be held exclusive, and will take the ilock internally.
+ */
+int					/* error (positive) */
+xfs_zero_eof(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,		/* starting I/O offset */
+	xfs_fsize_t		isize,		/* current inode size */
+	bool			*did_zeroing)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		start_zero_fsb;
+	xfs_fileoff_t		end_zero_fsb;
+	xfs_fileoff_t		zero_count_fsb;
+	xfs_fileoff_t		last_fsb;
+	xfs_fileoff_t		zero_off;
+	xfs_fsize_t		zero_len;
+	int			nimaps;
+	int			error = 0;
+	struct xfs_bmbt_irec	imap;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(offset > isize);
+
+	/*
+	 * First handle zeroing the block on which isize resides.
+	 *
+	 * We only zero a part of that block so it is handled specially.
+	 */
+	if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
+		error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Calculate the range between the new size and the old where blocks
+	 * needing to be zeroed may exist.
+	 *
+	 * To get the block where the last byte in the file currently resides,
+	 * we need to subtract one from the size and truncate back to a block
+	 * boundary.  We subtract 1 in case the size is exactly on a block
+	 * boundary.
+	 */
+	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+	if (last_fsb == end_zero_fsb) {
+		/*
+		 * The size was only incremented on its last block.
+		 * We took care of that above, so just return.
+		 */
+		return 0;
+	}
+
+	ASSERT(start_zero_fsb <= end_zero_fsb);
+	while (start_zero_fsb <= end_zero_fsb) {
+		nimaps = 1;
+		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
+					  &imap, &nimaps, 0);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (error)
+			return error;
+
+		ASSERT(nimaps > 0);
+
+		if (imap.br_state == XFS_EXT_UNWRITTEN ||
+		    imap.br_startblock == HOLESTARTBLOCK) {
+			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+			continue;
+		}
+
+		/*
+		 * There are blocks we need to zero.
+		 */
+		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
+		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
+
+		if ((zero_off + zero_len) > offset)
+			zero_len = offset - zero_off;
+
+		error = xfs_iozero(ip, zero_off, zero_len);
+		if (error)
+			return error;
+
+		*did_zeroing = true;
+		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+	}
+
+	return 0;
+}
+
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Called with the iolocked held either shared and exclusive according to
+ * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
+ */
+STATIC ssize_t
+xfs_file_aio_write_checks(
+	struct kiocb		*iocb,
+	struct iov_iter		*from,
+	int			*iolock)
+{
+	struct file		*file = iocb->ki_filp;
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	ssize_t			error = 0;
+	size_t			count = iov_iter_count(from);
+
+restart:
+	error = generic_write_checks(iocb, from);
+	if (error <= 0)
+		return error;
+
+	error = xfs_break_layouts(inode, iolock, true);
+	if (error)
+		return error;
+
+	/*
+	 * If the offset is beyond the size of the file, we need to zero any
+	 * blocks that fall between the existing EOF and the start of this
+	 * write.  If zeroing is needed and we are currently holding the
+	 * iolock shared, we need to update it to exclusive which implies
+	 * having to redo all checks before.
+	 *
+	 * We need to serialise against EOF updates that occur in IO
+	 * completions here. We want to make sure that nobody is changing the
+	 * size while we do this check until we have placed an IO barrier (i.e.
+	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+	 * The spinlock effectively forms a memory barrier once we have the
+	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+	 * and hence be able to correctly determine if we need to run zeroing.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (iocb->ki_pos > i_size_read(inode)) {
+		bool	zero = false;
+
+		spin_unlock(&ip->i_flags_lock);
+		if (*iolock == XFS_IOLOCK_SHARED) {
+			xfs_rw_iunlock(ip, *iolock);
+			*iolock = XFS_IOLOCK_EXCL;
+			xfs_rw_ilock(ip, *iolock);
+			iov_iter_reexpand(from, count);
+
+			/*
+			 * We now have an IO submission barrier in place, but
+			 * AIO can do EOF updates during IO completion and hence
+			 * we now need to wait for all of them to drain. Non-AIO
+			 * DIO will have drained before we are given the
+			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+			 * no-op.
+			 */
+			inode_dio_wait(inode);
+			goto restart;
+		}
+		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
+		if (error)
+			return error;
+	} else
+		spin_unlock(&ip->i_flags_lock);
+
+	/*
+	 * Updating the timestamps will grab the ilock again from
+	 * xfs_fs_dirty_inode, so we have to call it after dropping the
+	 * lock above.  Eventually we should look into a way to avoid
+	 * the pointless lock roundtrip.
+	 */
+	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+		error = file_update_time(file);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If we're writing the file then make sure to clear the setuid and
+	 * setgid bits if the process is not being run by root.  This keeps
+	 * people from modifying setuid and setgid binaries.
+	 */
+	return file_remove_suid(file);
+}
+
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By separating it from the buffered write path we remove all the tricky to
+ * follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. inode_dio_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	int			unaligned_io = 0;
+	int			iolock;
+	size_t			count = iov_iter_count(from);
+	loff_t			pos = iocb->ki_pos;
+	loff_t			end;
+	struct iov_iter		data;
+	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp;
+
+	/* DIO must be aligned to device logical sector size */
+	if ((pos | count) & target->bt_logical_sectormask)
+		return -EINVAL;
+
+	/* "unaligned" here means not aligned to a filesystem block */
+	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+		unaligned_io = 1;
+
+	/*
+	 * We don't need to take an exclusive lock unless there page cache needs
+	 * to be invalidated or unaligned IO is being executed. We don't need to
+	 * consider the EOF extension case here because
+	 * xfs_file_aio_write_checks() will relock the inode as necessary for
+	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 */
+	if (unaligned_io || mapping->nrpages)
+		iolock = XFS_IOLOCK_EXCL;
+	else
+		iolock = XFS_IOLOCK_SHARED;
+	xfs_rw_ilock(ip, iolock);
+
+	/*
+	 * Recheck if there are cached pages that need invalidate after we got
+	 * the iolock to protect against other threads adding new pages while
+	 * we were waiting for the iolock.
+	 */
+	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+		xfs_rw_iunlock(ip, iolock);
+		iolock = XFS_IOLOCK_EXCL;
+		xfs_rw_ilock(ip, iolock);
+	}
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+	count = iov_iter_count(from);
+	pos = iocb->ki_pos;
+	end = pos + count - 1;
+
+	if (mapping->nrpages) {
+		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+						   pos, end);
+		if (ret)
+			goto out;
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					end >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
+	}
+
+	/*
+	 * If we are doing unaligned IO, wait for all other IO to drain,
+	 * otherwise demote the lock if we had to flush cached pages
+	 */
+	if (unaligned_io)
+		inode_dio_wait(inode);
+	else if (iolock == XFS_IOLOCK_EXCL) {
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+
+	data = *from;
+	ret = mapping->a_ops->direct_IO(iocb, &data, pos);
+
+	/* see generic_file_direct_write() for why this is necessary */
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT,
+					      end >> PAGE_CACHE_SHIFT);
+	}
+
+	if (ret > 0) {
+		pos += ret;
+		iov_iter_advance(from, ret);
+		iocb->ki_pos = pos;
+	}
+out:
+	xfs_rw_iunlock(ip, iolock);
+
+	/* No fallback to buffered IO on errors for XFS. */
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	ssize_t			ret;
+	int			enospc = 0;
+	int			iolock = XFS_IOLOCK_EXCL;
+
+	xfs_rw_ilock(ip, iolock);
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = inode_to_bdi(inode);
+
+write_retry:
+	trace_xfs_file_buffered_write(ip, iov_iter_count(from),
+				      iocb->ki_pos, 0);
+	ret = generic_perform_write(file, from, iocb->ki_pos);
+	if (likely(ret >= 0))
+		iocb->ki_pos += ret;
+
+	/*
+	 * If we hit a space limit, try to free up some lingering preallocated
+	 * space before returning an error. In the case of ENOSPC, first try to
+	 * write back all dirty inodes to free up some of the excess reserved
+	 * metadata space. This reduces the chances that the eofblocks scan
+	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
+	 * also behaves as a filter to prevent too many eofblocks scans from
+	 * running at the same time.
+	 */
+	if (ret == -EDQUOT && !enospc) {
+		enospc = xfs_inode_free_quota_eofblocks(ip);
+		if (enospc)
+			goto write_retry;
+	} else if (ret == -ENOSPC && !enospc) {
+		struct xfs_eofblocks eofb = {0};
+
+		enospc = 1;
+		xfs_flush_inodes(ip->i_mount);
+		eofb.eof_scan_owner = ip->i_ino; /* for locking */
+		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+		goto write_retry;
+	}
+
+	current->backing_dev_info = NULL;
+out:
+	xfs_rw_iunlock(ip, iolock);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_write_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	ssize_t			ret;
+	size_t			ocount = iov_iter_count(from);
+
+	XFS_STATS_INC(xs_write_calls);
+
+	if (ocount == 0)
+		return 0;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+		ret = xfs_file_dio_aio_write(iocb, from);
+	else
+		ret = xfs_file_buffered_aio_write(iocb, from);
+
+	if (ret > 0) {
+		ssize_t err;
+
+		XFS_STATS_ADD(xs_write_bytes, ret);
+
+		/* Handle various SYNC-type writes */
+		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
+
+#define	XFS_FALLOC_FL_SUPPORTED						\
+		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
+		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
+		 FALLOC_FL_INSERT_RANGE)
+
+STATIC long
+xfs_file_fallocate(
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len)
+{
+	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
+	long			error;
+	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
+	loff_t			new_size = 0;
+	bool			do_file_insert = 0;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
+		return -EOPNOTSUPP;
+
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock, false);
+	if (error)
+		goto out_unlock;
+
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	iolock |= XFS_MMAPLOCK_EXCL;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		error = xfs_free_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
+	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+		if (offset & blksize_mask || len & blksize_mask) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		/*
+		 * There is no need to overlap collapse range with EOF,
+		 * in which case it is effectively a truncate operation
+		 */
+		if (offset + len >= i_size_read(inode)) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		new_size = i_size_read(inode) - len;
+
+		error = xfs_collapse_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
+	} else if (mode & FALLOC_FL_INSERT_RANGE) {
+		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+		new_size = i_size_read(inode) + len;
+		if (offset & blksize_mask || len & blksize_mask) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* check the new inode size does not wrap through zero */
+		if (new_size > inode->i_sb->s_maxbytes) {
+			error = -EFBIG;
+			goto out_unlock;
+		}
+
+		/* Offset should be less than i_size */
+		if (offset >= i_size_read(inode)) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+		do_file_insert = 1;
+	} else {
+		flags |= XFS_PREALLOC_SET;
+
+		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		    offset + len > i_size_read(inode)) {
+			new_size = offset + len;
+			error = inode_newsize_ok(inode, new_size);
+			if (error)
+				goto out_unlock;
+		}
+
+		if (mode & FALLOC_FL_ZERO_RANGE)
+			error = xfs_zero_file_space(ip, offset, len);
+		else
+			error = xfs_alloc_file_space(ip, offset, len,
+						     XFS_BMAPI_PREALLOC);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (file->f_flags & O_DSYNC)
+		flags |= XFS_PREALLOC_SYNC;
+
+	error = xfs_update_prealloc_flags(ip, flags);
+	if (error)
+		goto out_unlock;
+
+	/* Change file size if needed */
+	if (new_size) {
+		struct iattr iattr;
+
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = new_size;
+		error = xfs_setattr_size(ip, &iattr);
+		if (error)
+			goto out_unlock;
+	}
+
+	/*
+	 * Perform hole insertion now that the file size has been
+	 * updated so that if we crash during the operation we don't
+	 * leave shifted extents past EOF and hence losing access to
+	 * the data that is contained within them.
+	 */
+	if (do_file_insert)
+		error = xfs_insert_file_space(ip, offset, len);
+
+out_unlock:
+	xfs_iunlock(ip, iolock);
+	return error;
+}
+
+
+STATIC int
+xfs_file_open(
+	struct inode	*inode,
+	struct file	*file)
+{
+	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+		return -EFBIG;
+	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+		return -EIO;
+	return 0;
+}
+
+STATIC int
+xfs_dir_open(
+	struct inode	*inode,
+	struct file	*file)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+	int		mode;
+	int		error;
+
+	error = xfs_file_open(inode, file);
+	if (error)
+		return error;
+
+	/*
+	 * If there are any blocks, read-ahead block 0 as we're almost
+	 * certain to have the next operation be a read there.
+	 */
+	mode = xfs_ilock_data_map_shared(ip);
+	if (ip->i_d.di_nextents > 0)
+		xfs_dir3_data_readahead(ip, 0, -1);
+	xfs_iunlock(ip, mode);
+	return 0;
+}
+
+STATIC int
+xfs_file_release(
+	struct inode	*inode,
+	struct file	*filp)
+{
+	return xfs_release(XFS_I(inode));
+}
+
+STATIC int
+xfs_file_readdir(
+	struct file	*file,
+	struct dir_context *ctx)
+{
+	struct inode	*inode = file_inode(file);
+	xfs_inode_t	*ip = XFS_I(inode);
+	size_t		bufsize;
+
+	/*
+	 * The Linux API doesn't pass down the total size of the buffer
+	 * we read into down to the filesystem.  With the filldir concept
+	 * it's not needed for correct information, but the XFS dir2 leaf
+	 * code wants an estimate of the buffer size to calculate it's
+	 * readahead window and size the buffers used for mapping to
+	 * physical blocks.
+	 *
+	 * Try to give it an estimate that's good enough, maybe at some
+	 * point we can change the ->readdir prototype to include the
+	 * buffer size.  For now we use the current glibc buffer size.
+	 */
+	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+
+	return xfs_readdir(ip, ctx, bufsize);
+}
+
+STATIC int
+xfs_file_mmap(
+	struct file	*filp,
+	struct vm_area_struct *vma)
+{
+	vma->vm_ops = &xfs_file_vm_ops;
+
+	file_accessed(filp);
+	return 0;
+}
+
+/*
+ * This type is designed to indicate the type of offset we would like
+ * to search from page cache for xfs_seek_hole_data().
+ */
+enum {
+	HOLE_OFF = 0,
+	DATA_OFF,
+};
+
+/*
+ * Lookup the desired type of offset from the given page.
+ *
+ * On success, return true and the offset argument will point to the
+ * start of the region that was found.  Otherwise this function will
+ * return false and keep the offset argument unchanged.
+ */
+STATIC bool
+xfs_lookup_buffer_offset(
+	struct page		*page,
+	loff_t			*offset,
+	unsigned int		type)
+{
+	loff_t			lastoff = page_offset(page);
+	bool			found = false;
+	struct buffer_head	*bh, *head;
+
+	bh = head = page_buffers(page);
+	do {
+		/*
+		 * Unwritten extents that have data in the page
+		 * cache covering them can be identified by the
+		 * BH_Unwritten state flag.  Pages with multiple
+		 * buffers might have a mix of holes, data and
+		 * unwritten extents - any buffer with valid
+		 * data in it should have BH_Uptodate flag set
+		 * on it.
+		 */
+		if (buffer_unwritten(bh) ||
+		    buffer_uptodate(bh)) {
+			if (type == DATA_OFF)
+				found = true;
+		} else {
+			if (type == HOLE_OFF)
+				found = true;
+		}
+
+		if (found) {
+			*offset = lastoff;
+			break;
+		}
+		lastoff += bh->b_size;
+	} while ((bh = bh->b_this_page) != head);
+
+	return found;
+}
+
+/*
+ * This routine is called to find out and return a data or hole offset
+ * from the page cache for unwritten extents according to the desired
+ * type for xfs_seek_hole_data().
+ *
+ * The argument offset is used to tell where we start to search from the
+ * page cache.  Map is used to figure out the end points of the range to
+ * lookup pages.
+ *
+ * Return true if the desired type of offset was found, and the argument
+ * offset is filled with that address.  Otherwise, return false and keep
+ * offset unchanged.
+ */
+STATIC bool
+xfs_find_get_desired_pgoff(
+	struct inode		*inode,
+	struct xfs_bmbt_irec	*map,
+	unsigned int		type,
+	loff_t			*offset)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct pagevec		pvec;
+	pgoff_t			index;
+	pgoff_t			end;
+	loff_t			endoff;
+	loff_t			startoff = *offset;
+	loff_t			lastoff = startoff;
+	bool			found = false;
+
+	pagevec_init(&pvec, 0);
+
+	index = startoff >> PAGE_CACHE_SHIFT;
+	endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
+	end = endoff >> PAGE_CACHE_SHIFT;
+	do {
+		int		want;
+		unsigned	nr_pages;
+		unsigned int	i;
+
+		want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+					  want);
+		/*
+		 * No page mapped into given range.  If we are searching holes
+		 * and if this is the first time we got into the loop, it means
+		 * that the given offset is landed in a hole, return it.
+		 *
+		 * If we have already stepped through some block buffers to find
+		 * holes but they all contains data.  In this case, the last
+		 * offset is already updated and pointed to the end of the last
+		 * mapped page, if it does not reach the endpoint to search,
+		 * that means there should be a hole between them.
+		 */
+		if (nr_pages == 0) {
+			/* Data search found nothing */
+			if (type == DATA_OFF)
+				break;
+
+			ASSERT(type == HOLE_OFF);
+			if (lastoff == startoff || lastoff < endoff) {
+				found = true;
+				*offset = lastoff;
+			}
+			break;
+		}
+
+		/*
+		 * At lease we found one page.  If this is the first time we
+		 * step into the loop, and if the first page index offset is
+		 * greater than the given search offset, a hole was found.
+		 */
+		if (type == HOLE_OFF && lastoff == startoff &&
+		    lastoff < page_offset(pvec.pages[0])) {
+			found = true;
+			break;
+		}
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page	*page = pvec.pages[i];
+			loff_t		b_offset;
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL),
+			 * or even swizzled back from swapper_space to tmpfs
+			 * file mapping. However, page->index will not change
+			 * because we have a reference on the page.
+			 *
+			 * Searching done if the page index is out of range.
+			 * If the current offset is not reaches the end of
+			 * the specified search range, there should be a hole
+			 * between them.
+			 */
+			if (page->index > end) {
+				if (type == HOLE_OFF && lastoff < endoff) {
+					*offset = lastoff;
+					found = true;
+				}
+				goto out;
+			}
+
+			lock_page(page);
+			/*
+			 * Page truncated or invalidated(page->mapping == NULL).
+			 * We can freely skip it and proceed to check the next
+			 * page.
+			 */
+			if (unlikely(page->mapping != inode->i_mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (!page_has_buffers(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			found = xfs_lookup_buffer_offset(page, &b_offset, type);
+			if (found) {
+				/*
+				 * The found offset may be less than the start
+				 * point to search if this is the first time to
+				 * come here.
+				 */
+				*offset = max_t(loff_t, startoff, b_offset);
+				unlock_page(page);
+				goto out;
+			}
+
+			/*
+			 * We either searching data but nothing was found, or
+			 * searching hole but found a data buffer.  In either
+			 * case, probably the next page contains the desired
+			 * things, update the last offset to it so.
+			 */
+			lastoff = page_offset(page) + PAGE_SIZE;
+			unlock_page(page);
+		}
+
+		/*
+		 * The number of returned pages less than our desired, search
+		 * done.  In this case, nothing was found for searching data,
+		 * but we found a hole behind the last offset.
+		 */
+		if (nr_pages < want) {
+			if (type == HOLE_OFF) {
+				*offset = lastoff;
+				found = true;
+			}
+			break;
+		}
+
+		index = pvec.pages[i - 1]->index + 1;
+		pagevec_release(&pvec);
+	} while (index <= end);
+
+out:
+	pagevec_release(&pvec);
+	return found;
+}
+
+STATIC loff_t
+xfs_seek_hole_data(
+	struct file		*file,
+	loff_t			start,
+	int			whence)
+{
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	loff_t			uninitialized_var(offset);
+	xfs_fsize_t		isize;
+	xfs_fileoff_t		fsbno;
+	xfs_filblks_t		end;
+	uint			lock;
+	int			error;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	lock = xfs_ilock_data_map_shared(ip);
+
+	isize = i_size_read(inode);
+	if (start >= isize) {
+		error = -ENXIO;
+		goto out_unlock;
+	}
+
+	/*
+	 * Try to read extents from the first block indicated
+	 * by fsbno to the end block of the file.
+	 */
+	fsbno = XFS_B_TO_FSBT(mp, start);
+	end = XFS_B_TO_FSB(mp, isize);
+
+	for (;;) {
+		struct xfs_bmbt_irec	map[2];
+		int			nmap = 2;
+		unsigned int		i;
+
+		error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+				       XFS_BMAPI_ENTIRE);
+		if (error)
+			goto out_unlock;
+
+		/* No extents at given offset, must be beyond EOF */
+		if (nmap == 0) {
+			error = -ENXIO;
+			goto out_unlock;
+		}
+
+		for (i = 0; i < nmap; i++) {
+			offset = max_t(loff_t, start,
+				       XFS_FSB_TO_B(mp, map[i].br_startoff));
+
+			/* Landed in the hole we wanted? */
+			if (whence == SEEK_HOLE &&
+			    map[i].br_startblock == HOLESTARTBLOCK)
+				goto out;
+
+			/* Landed in the data extent we wanted? */
+			if (whence == SEEK_DATA &&
+			    (map[i].br_startblock == DELAYSTARTBLOCK ||
+			     (map[i].br_state == XFS_EXT_NORM &&
+			      !isnullstartblock(map[i].br_startblock))))
+				goto out;
+
+			/*
+			 * Landed in an unwritten extent, try to search
+			 * for hole or data from page cache.
+			 */
+			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+				if (xfs_find_get_desired_pgoff(inode, &map[i],
+				      whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
+							&offset))
+					goto out;
+			}
+		}
+
+		/*
+		 * We only received one extent out of the two requested. This
+		 * means we've hit EOF and didn't find what we are looking for.
+		 */
+		if (nmap == 1) {
+			/*
+			 * If we were looking for a hole, set offset to
+			 * the end of the file (i.e., there is an implicit
+			 * hole at the end of any file).
+		 	 */
+			if (whence == SEEK_HOLE) {
+				offset = isize;
+				break;
+			}
+			/*
+			 * If we were looking for data, it's nowhere to be found
+			 */
+			ASSERT(whence == SEEK_DATA);
+			error = -ENXIO;
+			goto out_unlock;
+		}
+
+		ASSERT(i > 1);
+
+		/*
+		 * Nothing was found, proceed to the next round of search
+		 * if the next reading offset is not at or beyond EOF.
+		 */
+		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
+		start = XFS_FSB_TO_B(mp, fsbno);
+		if (start >= isize) {
+			if (whence == SEEK_HOLE) {
+				offset = isize;
+				break;
+			}
+			ASSERT(whence == SEEK_DATA);
+			error = -ENXIO;
+			goto out_unlock;
+		}
+	}
+
+out:
+	/*
+	 * If at this point we have found the hole we wanted, the returned
+	 * offset may be bigger than the file size as it may be aligned to
+	 * page boundary for unwritten extents.  We need to deal with this
+	 * situation in particular.
+	 */
+	if (whence == SEEK_HOLE)
+		offset = min_t(loff_t, offset, isize);
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out_unlock:
+	xfs_iunlock(ip, lock);
+
+	if (error)
+		return error;
+	return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+	struct file	*file,
+	loff_t		offset,
+	int		whence)
+{
+	switch (whence) {
+	case SEEK_END:
+	case SEEK_CUR:
+	case SEEK_SET:
+		return generic_file_llseek(file, offset, whence);
+	case SEEK_HOLE:
+	case SEEK_DATA:
+		return xfs_seek_hole_data(file, offset, whence);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_sem (MM)
+ *   i_mmap_lock (XFS - truncate serialisation)
+ *     page_lock (MM)
+ *       i_lock (XFS - extent map serialisation)
+ */
+STATIC int
+xfs_filemap_fault(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_fault(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = filemap_fault(vma, vmf);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
+ */
+STATIC int
+xfs_filemap_page_mkwrite(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_page_mkwrite(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
+const struct file_operations xfs_file_operations = {
+	.llseek		= xfs_file_llseek,
+	.read_iter	= xfs_file_read_iter,
+	.write_iter	= xfs_file_write_iter,
+	.splice_read	= xfs_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.unlocked_ioctl	= xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= xfs_file_compat_ioctl,
+#endif
+	.mmap		= xfs_file_mmap,
+	.open		= xfs_file_open,
+	.release	= xfs_file_release,
+	.fsync		= xfs_file_fsync,
+	.fallocate	= xfs_file_fallocate,
+};
+
+const struct file_operations xfs_dir_file_operations = {
+	.open		= xfs_dir_open,
+	.read		= generic_read_dir,
+	.iterate	= xfs_file_readdir,
+	.llseek		= generic_file_llseek,
+	.unlocked_ioctl	= xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= xfs_file_compat_ioctl,
+#endif
+	.fsync		= xfs_dir_fsync,
+};
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+	.fault		= xfs_filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
+};
diff --git a/kernel/fs/xfs/xfs_filestream.c b/kernel/fs/xfs/xfs_filestream.c
new file mode 100644
index 000000000..da82f1cb4
--- /dev/null
+++ b/kernel/fs/xfs/xfs_filestream.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * Copyright (c) 2014 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_alloc.h"
+#include "xfs_mru_cache.h"
+#include "xfs_filestream.h"
+#include "xfs_trace.h"
+
+struct xfs_fstrm_item {
+	struct xfs_mru_cache_elem	mru;
+	struct xfs_inode		*ip;
+	xfs_agnumber_t			ag; /* AG in use for this directory */
+};
+
+enum xfs_fstrm_alloc {
+	XFS_PICK_USERDATA = 1,
+	XFS_PICK_LOWSPACE = 2,
+};
+
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters.  These counters allow xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all.  The race condition this addresses is the following:
+ *
+ *  - The work queue scheduler fires and pulls a filestream directory cache
+ *    element off the LRU end of the cache for deletion, then gets pre-empted.
+ *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ *    remaining items from the cache and reallocates the mount point's per-ag
+ *    array, resetting all the counters to zero.
+ *  - The work queue thread resumes and calls the free function for the element
+ *    it started cleaning up earlier.  In the process it decrements the
+ *    filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following macros should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+int
+xfs_filestream_peek_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_read(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_inc_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, agno);
+	atomic_dec(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+}
+
+static void
+xfs_fstrm_free_func(
+	struct xfs_mru_cache_elem *mru)
+{
+	struct xfs_fstrm_item	*item =
+		container_of(mru, struct xfs_fstrm_item, mru);
+
+	xfs_filestream_put_ag(item->ip->i_mount, item->ag);
+
+	trace_xfs_filestream_free(item->ip, item->ag);
+
+	kmem_free(item);
+}
+
+/*
+ * Scan the AGs starting at startag looking for an AG that isn't in use and has
+ * at least minlen blocks free.
+ */
+static int
+xfs_filestream_pick_ag(
+	struct xfs_inode	*ip,
+	xfs_agnumber_t		startag,
+	xfs_agnumber_t		*agp,
+	int			flags,
+	xfs_extlen_t		minlen)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_fstrm_item	*item;
+	struct xfs_perag	*pag;
+	xfs_extlen_t		longest, free = 0, minfree, maxfree = 0;
+	xfs_agnumber_t		ag, max_ag = NULLAGNUMBER;
+	int			err, trylock, nscan;
+
+	ASSERT(S_ISDIR(ip->i_d.di_mode));
+
+	/* 2% of an AG's blocks must be free for it to be chosen. */
+	minfree = mp->m_sb.sb_agblocks / 50;
+
+	ag = startag;
+	*agp = NULLAGNUMBER;
+
+	/* For the first pass, don't sleep trying to init the per-AG. */
+	trylock = XFS_ALLOC_FLAG_TRYLOCK;
+
+	for (nscan = 0; 1; nscan++) {
+		trace_xfs_filestream_scan(ip, ag);
+
+		pag = xfs_perag_get(mp, ag);
+
+		if (!pag->pagf_init) {
+			err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+			if (err && !trylock) {
+				xfs_perag_put(pag);
+				return err;
+			}
+		}
+
+		/* Might fail sometimes during the 1st pass with trylock set. */
+		if (!pag->pagf_init)
+			goto next_ag;
+
+		/* Keep track of the AG with the most free blocks. */
+		if (pag->pagf_freeblks > maxfree) {
+			maxfree = pag->pagf_freeblks;
+			max_ag = ag;
+		}
+
+		/*
+		 * The AG reference count does two things: it enforces mutual
+		 * exclusion when examining the suitability of an AG in this
+		 * loop, and it guards against two filestreams being established
+		 * in the same AG as each other.
+		 */
+		if (xfs_filestream_get_ag(mp, ag) > 1) {
+			xfs_filestream_put_ag(mp, ag);
+			goto next_ag;
+		}
+
+		longest = xfs_alloc_longest_free_extent(mp, pag);
+		if (((minlen && longest >= minlen) ||
+		     (!minlen && pag->pagf_freeblks >= minfree)) &&
+		    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
+		     (flags & XFS_PICK_LOWSPACE))) {
+
+			/* Break out, retaining the reference on the AG. */
+			free = pag->pagf_freeblks;
+			xfs_perag_put(pag);
+			*agp = ag;
+			break;
+		}
+
+		/* Drop the reference on this AG, it's not usable. */
+		xfs_filestream_put_ag(mp, ag);
+next_ag:
+		xfs_perag_put(pag);
+		/* Move to the next AG, wrapping to AG 0 if necessary. */
+		if (++ag >= mp->m_sb.sb_agcount)
+			ag = 0;
+
+		/* If a full pass of the AGs hasn't been done yet, continue. */
+		if (ag != startag)
+			continue;
+
+		/* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
+		if (trylock != 0) {
+			trylock = 0;
+			continue;
+		}
+
+		/* Finally, if lowspace wasn't set, set it for the 3rd pass. */
+		if (!(flags & XFS_PICK_LOWSPACE)) {
+			flags |= XFS_PICK_LOWSPACE;
+			continue;
+		}
+
+		/*
+		 * Take the AG with the most free space, regardless of whether
+		 * it's already in use by another filestream.
+		 */
+		if (max_ag != NULLAGNUMBER) {
+			xfs_filestream_get_ag(mp, max_ag);
+			free = maxfree;
+			*agp = max_ag;
+			break;
+		}
+
+		/* take AG 0 if none matched */
+		trace_xfs_filestream_pick(ip, *agp, free, nscan);
+		*agp = 0;
+		return 0;
+	}
+
+	trace_xfs_filestream_pick(ip, *agp, free, nscan);
+
+	if (*agp == NULLAGNUMBER)
+		return 0;
+
+	err = -ENOMEM;
+	item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+	if (!item)
+		goto out_put_ag;
+
+	item->ag = *agp;
+	item->ip = ip;
+
+	err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
+	if (err) {
+		if (err == -EEXIST)
+			err = 0;
+		goto out_free_item;
+	}
+
+	return 0;
+
+out_free_item:
+	kmem_free(item);
+out_put_ag:
+	xfs_filestream_put_ag(mp, *agp);
+	return err;
+}
+
+static struct xfs_inode *
+xfs_filestream_get_parent(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip), *dir = NULL;
+	struct dentry		*dentry, *parent;
+
+	dentry = d_find_alias(inode);
+	if (!dentry)
+		goto out;
+
+	parent = dget_parent(dentry);
+	if (!parent)
+		goto out_dput;
+
+	dir = igrab(d_inode(parent));
+	dput(parent);
+
+out_dput:
+	dput(dentry);
+out:
+	return dir ? XFS_I(dir) : NULL;
+}
+
+/*
+ * Find the right allocation group for a file, either by finding an
+ * existing file stream or creating a new one.
+ *
+ * Returns NULLAGNUMBER in case of an error.
+ */
+xfs_agnumber_t
+xfs_filestream_lookup_ag(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_inode	*pip = NULL;
+	xfs_agnumber_t		startag, ag = NULLAGNUMBER;
+	struct xfs_mru_cache_elem *mru;
+
+	ASSERT(S_ISREG(ip->i_d.di_mode));
+
+	pip = xfs_filestream_get_parent(ip);
+	if (!pip)
+		return NULLAGNUMBER;
+
+	mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
+	if (mru) {
+		ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
+		xfs_mru_cache_done(mp->m_filestream);
+
+		trace_xfs_filestream_lookup(ip, ag);
+		goto out;
+	}
+
+	/*
+	 * Set the starting AG using the rotor for inode32, otherwise
+	 * use the directory inode's AG.
+	 */
+	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+		xfs_agnumber_t	 rotorstep = xfs_rotorstep;
+		startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
+		mp->m_agfrotor = (mp->m_agfrotor + 1) %
+		                 (mp->m_sb.sb_agcount * rotorstep);
+	} else
+		startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
+
+	if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0))
+		ag = NULLAGNUMBER;
+out:
+	IRELE(pip);
+	return ag;
+}
+
+/*
+ * Pick a new allocation group for the current file and its file stream.
+ *
+ * This is called when the allocator can't find a suitable extent in the
+ * current AG, and we have to move the stream into a new AG with more space.
+ */
+int
+xfs_filestream_new_ag(
+	struct xfs_bmalloca	*ap,
+	xfs_agnumber_t		*agp)
+{
+	struct xfs_inode	*ip = ap->ip, *pip;
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_extlen_t		minlen = ap->length;
+	xfs_agnumber_t		startag = 0;
+	int			flags, err = 0;
+	struct xfs_mru_cache_elem *mru;
+
+	*agp = NULLAGNUMBER;
+
+	pip = xfs_filestream_get_parent(ip);
+	if (!pip)
+		goto exit;
+
+	mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
+	if (mru) {
+		struct xfs_fstrm_item *item =
+			container_of(mru, struct xfs_fstrm_item, mru);
+		startag = (item->ag + 1) % mp->m_sb.sb_agcount;
+	}
+
+	flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
+	        (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
+
+	err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
+
+	/*
+	 * Only free the item here so we skip over the old AG earlier.
+	 */
+	if (mru)
+		xfs_fstrm_free_func(mru);
+
+	IRELE(pip);
+exit:
+	if (*agp == NULLAGNUMBER)
+		*agp = 0;
+	return err;
+}
+
+void
+xfs_filestream_deassociate(
+	struct xfs_inode	*ip)
+{
+	xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino);
+}
+
+int
+xfs_filestream_mount(
+	xfs_mount_t	*mp)
+{
+	/*
+	 * The filestream timer tunable is currently fixed within the range of
+	 * one second to four minutes, with five seconds being the default.  The
+	 * group count is somewhat arbitrary, but it'd be nice to adhere to the
+	 * timer tunable to within about 10 percent.  This requires at least 10
+	 * groups.
+	 */
+	return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10,
+				    10, xfs_fstrm_free_func);
+}
+
+void
+xfs_filestream_unmount(
+	xfs_mount_t	*mp)
+{
+	xfs_mru_cache_destroy(mp->m_filestream);
+}
diff --git a/kernel/fs/xfs/xfs_filestream.h b/kernel/fs/xfs/xfs_filestream.h
new file mode 100644
index 000000000..2ef43406e
--- /dev/null
+++ b/kernel/fs/xfs/xfs_filestream.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_FILESTREAM_H__
+#define __XFS_FILESTREAM_H__
+
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_bmalloca;
+
+int xfs_filestream_mount(struct xfs_mount *mp);
+void xfs_filestream_unmount(struct xfs_mount *mp);
+void xfs_filestream_deassociate(struct xfs_inode *ip);
+xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
+int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
+int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno);
+
+static inline int
+xfs_inode_is_filestream(
+	struct xfs_inode	*ip)
+{
+	return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
+		(ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
+}
+
+#endif /* __XFS_FILESTREAM_H__ */
diff --git a/kernel/fs/xfs/xfs_fsops.c b/kernel/fs/xfs/xfs_fsops.c
new file mode 100644
index 000000000..cb7e8a29d
--- /dev/null
+++ b/kernel/fs/xfs/xfs_fsops.c
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_fsops.h"
+#include "xfs_itable.h"
+#include "xfs_trans_space.h"
+#include "xfs_rtalloc.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_filestream.h"
+
+/*
+ * File system operations
+ */
+
+int
+xfs_fs_geometry(
+	xfs_mount_t		*mp,
+	xfs_fsop_geom_t		*geo,
+	int			new_version)
+{
+
+	memset(geo, 0, sizeof(*geo));
+
+	geo->blocksize = mp->m_sb.sb_blocksize;
+	geo->rtextsize = mp->m_sb.sb_rextsize;
+	geo->agblocks = mp->m_sb.sb_agblocks;
+	geo->agcount = mp->m_sb.sb_agcount;
+	geo->logblocks = mp->m_sb.sb_logblocks;
+	geo->sectsize = mp->m_sb.sb_sectsize;
+	geo->inodesize = mp->m_sb.sb_inodesize;
+	geo->imaxpct = mp->m_sb.sb_imax_pct;
+	geo->datablocks = mp->m_sb.sb_dblocks;
+	geo->rtblocks = mp->m_sb.sb_rblocks;
+	geo->rtextents = mp->m_sb.sb_rextents;
+	geo->logstart = mp->m_sb.sb_logstart;
+	ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid));
+	memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid));
+	if (new_version >= 2) {
+		geo->sunit = mp->m_sb.sb_unit;
+		geo->swidth = mp->m_sb.sb_width;
+	}
+	if (new_version >= 3) {
+		geo->version = XFS_FSOP_GEOM_VERSION;
+		geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
+			     XFS_FSOP_GEOM_FLAGS_DIRV2 |
+			(xfs_sb_version_hasattr(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
+			(xfs_sb_version_hasquota(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
+			(xfs_sb_version_hasalign(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
+			(xfs_sb_version_hasdalign(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
+			(xfs_sb_version_hasextflgbit(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
+			(xfs_sb_version_hassector(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
+			(xfs_sb_version_hasasciici(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) |
+			(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
+			(xfs_sb_version_hasattr2(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+			(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
+			(xfs_sb_version_hascrc(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_V5SB : 0) |
+			(xfs_sb_version_hasftype(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
+			(xfs_sb_version_hasfinobt(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
+				mp->m_sb.sb_logsectsize : BBSIZE;
+		geo->rtsectsize = mp->m_sb.sb_blocksize;
+		geo->dirblocksize = mp->m_dir_geo->blksize;
+	}
+	if (new_version >= 4) {
+		geo->flags |=
+			(xfs_sb_version_haslogv2(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
+		geo->logsunit = mp->m_sb.sb_logsunit;
+	}
+	return 0;
+}
+
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	int			flags,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+	if (!bp)
+		return NULL;
+
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	bp->b_bn = blkno;
+	bp->b_maps[0].bm_bn = blkno;
+	bp->b_ops = ops;
+
+	return bp;
+}
+
+static int
+xfs_growfs_data_private(
+	xfs_mount_t		*mp,		/* mount point for filesystem */
+	xfs_growfs_data_t	*in)		/* growfs data input struct */
+{
+	xfs_agf_t		*agf;
+	struct xfs_agfl		*agfl;
+	xfs_agi_t		*agi;
+	xfs_agnumber_t		agno;
+	xfs_extlen_t		agsize;
+	xfs_extlen_t		tmpsize;
+	xfs_alloc_rec_t		*arec;
+	xfs_buf_t		*bp;
+	int			bucket;
+	int			dpct;
+	int			error, saved_error = 0;
+	xfs_agnumber_t		nagcount;
+	xfs_agnumber_t		nagimax = 0;
+	xfs_rfsblock_t		nb, nb_mod;
+	xfs_rfsblock_t		new;
+	xfs_rfsblock_t		nfree;
+	xfs_agnumber_t		oagcount;
+	int			pct;
+	xfs_trans_t		*tp;
+
+	nb = in->newblocks;
+	pct = in->imaxpct;
+	if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+		return -EINVAL;
+	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
+		return error;
+	dpct = pct - mp->m_sb.sb_imax_pct;
+	error = xfs_buf_read_uncached(mp->m_ddev_targp,
+				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
+		return error;
+	xfs_buf_relse(bp);
+
+	new = nb;	/* use new as a temporary here */
+	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
+	nagcount = new + (nb_mod != 0);
+	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
+		nagcount--;
+		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
+		if (nb < mp->m_sb.sb_dblocks)
+			return -EINVAL;
+	}
+	new = nb - mp->m_sb.sb_dblocks;
+	oagcount = mp->m_sb.sb_agcount;
+
+	/* allocate the new per-ag structures */
+	if (nagcount > oagcount) {
+		error = xfs_initialize_perag(mp, nagcount, &nagimax);
+		if (error)
+			return error;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
+	tp->t_flags |= XFS_TRANS_RESERVE;
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+				  XFS_GROWFS_SPACE_RES(mp), 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	/*
+	 * Write new AG headers to disk. Non-transactional, but written
+	 * synchronously so they are completed prior to the growfs transaction
+	 * being logged.
+	 */
+	nfree = 0;
+	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+		__be32	*agfl_bno;
+
+		/*
+		 * AG freespace header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agf_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		agf = XFS_BUF_TO_AGF(bp);
+		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+		agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+		agf->agf_seqno = cpu_to_be32(agno);
+		if (agno == nagcount - 1)
+			agsize =
+				nb -
+				(agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+		else
+			agsize = mp->m_sb.sb_agblocks;
+		agf->agf_length = cpu_to_be32(agsize);
+		agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+		agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+		agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+		agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+		agf->agf_flfirst = 0;
+		agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+		agf->agf_flcount = 0;
+		tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
+		agf->agf_freeblks = cpu_to_be32(tmpsize);
+		agf->agf_longest = cpu_to_be32(tmpsize);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * AG freelist header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agfl_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		agfl = XFS_BUF_TO_AGFL(bp);
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+			agfl->agfl_seqno = cpu_to_be32(agno);
+			uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+		}
+
+		agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+			agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * AG inode header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agi_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		agi = XFS_BUF_TO_AGI(bp);
+		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+		agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+		agi->agi_seqno = cpu_to_be32(agno);
+		agi->agi_length = cpu_to_be32(agsize);
+		agi->agi_count = 0;
+		agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+		agi->agi_level = cpu_to_be32(1);
+		agi->agi_freecount = 0;
+		agi->agi_newino = cpu_to_be32(NULLAGINO);
+		agi->agi_dirino = cpu_to_be32(NULLAGINO);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+		if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+			agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+			agi->agi_free_level = cpu_to_be32(1);
+		}
+		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+			agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * BNO btree root block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
+
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
+						agno, 0);
+
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+		arec->ar_blockcount = cpu_to_be32(
+			agsize - be32_to_cpu(arec->ar_startblock));
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * CNT btree root block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
+						agno, 0);
+
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+		arec->ar_blockcount = cpu_to_be32(
+			agsize - be32_to_cpu(arec->ar_startblock));
+		nfree += be32_to_cpu(arec->ar_blockcount);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * INO btree root block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_inobt_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
+						agno, 0);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * FINO btree root block
+		 */
+		if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+			bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_inobt_buf_ops);
+			if (!bp) {
+				error = -ENOMEM;
+				goto error0;
+			}
+
+			if (xfs_sb_version_hascrc(&mp->m_sb))
+				xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC,
+						     0, 0, agno,
+						     XFS_BTREE_CRC_BLOCKS);
+			else
+				xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
+						     0, agno, 0);
+
+			error = xfs_bwrite(bp);
+			xfs_buf_relse(bp);
+			if (error)
+				goto error0;
+		}
+
+	}
+	xfs_trans_agblocks_delta(tp, nfree);
+	/*
+	 * There are new blocks in the old last a.g.
+	 */
+	if (new) {
+		/*
+		 * Change the agi length.
+		 */
+		error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+		if (error) {
+			goto error0;
+		}
+		ASSERT(bp);
+		agi = XFS_BUF_TO_AGI(bp);
+		be32_add_cpu(&agi->agi_length, new);
+		ASSERT(nagcount == oagcount ||
+		       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
+		xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+		/*
+		 * Change agf length.
+		 */
+		error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
+		if (error) {
+			goto error0;
+		}
+		ASSERT(bp);
+		agf = XFS_BUF_TO_AGF(bp);
+		be32_add_cpu(&agf->agf_length, new);
+		ASSERT(be32_to_cpu(agf->agf_length) ==
+		       be32_to_cpu(agi->agi_length));
+
+		xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+		/*
+		 * Free the new space.
+		 */
+		error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
+			be32_to_cpu(agf->agf_length) - new), new);
+		if (error) {
+			goto error0;
+		}
+	}
+
+	/*
+	 * Update changed superblock fields transactionally. These are not
+	 * seen by the rest of the world until the transaction commit applies
+	 * them atomically to the superblock.
+	 */
+	if (nagcount > oagcount)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
+	if (nb > mp->m_sb.sb_dblocks)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
+				 nb - mp->m_sb.sb_dblocks);
+	if (nfree)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
+	if (dpct)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+	if (error)
+		return error;
+
+	/* New allocation groups fully initialized, so update mount struct */
+	if (nagimax)
+		mp->m_maxagi = nagimax;
+	if (mp->m_sb.sb_imax_pct) {
+		__uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+		do_div(icount, 100);
+		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+	} else
+		mp->m_maxicount = 0;
+	xfs_set_low_space_thresholds(mp);
+
+	/* update secondary superblocks. */
+	for (agno = 1; agno < nagcount; agno++) {
+		error = 0;
+		/*
+		 * new secondary superblocks need to be zeroed, not read from
+		 * disk as the contents of the new area we are growing into is
+		 * completely unknown.
+		 */
+		if (agno < oagcount) {
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0, &bp,
+				  &xfs_sb_buf_ops);
+		} else {
+			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0);
+			if (bp) {
+				bp->b_ops = &xfs_sb_buf_ops;
+				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+			} else
+				error = -ENOMEM;
+		}
+
+		/*
+		 * If we get an error reading or writing alternate superblocks,
+		 * continue.  xfs_repair chooses the "best" superblock based
+		 * on most matches; if we break early, we'll leave more
+		 * superblocks un-updated than updated, and xfs_repair may
+		 * pick them over the properly-updated primary.
+		 */
+		if (error) {
+			xfs_warn(mp,
+		"error %d reading secondary superblock for ag %d",
+				error, agno);
+			saved_error = error;
+			continue;
+		}
+		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error) {
+			xfs_warn(mp,
+		"write error %d updating secondary superblock for ag %d",
+				error, agno);
+			saved_error = error;
+			continue;
+		}
+	}
+	return saved_error ? saved_error : error;
+
+ error0:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	return error;
+}
+
+static int
+xfs_growfs_log_private(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_growfs_log_t	*in)	/* growfs log input struct */
+{
+	xfs_extlen_t		nb;
+
+	nb = in->newblocks;
+	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
+		return -EINVAL;
+	if (nb == mp->m_sb.sb_logblocks &&
+	    in->isint == (mp->m_sb.sb_logstart != 0))
+		return -EINVAL;
+	/*
+	 * Moving the log is hard, need new interfaces to sync
+	 * the log first, hold off all activity while moving it.
+	 * Can have shorter or longer log in the same space,
+	 * or transform internal to external log or vice versa.
+	 */
+	return -ENOSYS;
+}
+
+/*
+ * protected versions of growfs function acquire and release locks on the mount
+ * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
+ * XFS_IOC_FSGROWFSRT
+ */
+
+
+int
+xfs_growfs_data(
+	xfs_mount_t		*mp,
+	xfs_growfs_data_t	*in)
+{
+	int error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!mutex_trylock(&mp->m_growlock))
+		return -EWOULDBLOCK;
+	error = xfs_growfs_data_private(mp, in);
+	/*
+	 * Increment the generation unconditionally, the error could be from
+	 * updating the secondary superblocks, in which case the new size
+	 * is live already.
+	 */
+	mp->m_generation++;
+	mutex_unlock(&mp->m_growlock);
+	return error;
+}
+
+int
+xfs_growfs_log(
+	xfs_mount_t		*mp,
+	xfs_growfs_log_t	*in)
+{
+	int error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!mutex_trylock(&mp->m_growlock))
+		return -EWOULDBLOCK;
+	error = xfs_growfs_log_private(mp, in);
+	mutex_unlock(&mp->m_growlock);
+	return error;
+}
+
+/*
+ * exported through ioctl XFS_IOC_FSCOUNTS
+ */
+
+int
+xfs_fs_counts(
+	xfs_mount_t		*mp,
+	xfs_fsop_counts_t	*cnt)
+{
+	cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
+	cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
+	cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
+							XFS_ALLOC_SET_ASIDE(mp);
+
+	spin_lock(&mp->m_sb_lock);
+	cnt->freertx = mp->m_sb.sb_frextents;
+	spin_unlock(&mp->m_sb_lock);
+	return 0;
+}
+
+/*
+ * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
+ *
+ * xfs_reserve_blocks is called to set m_resblks
+ * in the in-core mount table. The number of unused reserved blocks
+ * is kept in m_resblks_avail.
+ *
+ * Reserve the requested number of blocks if available. Otherwise return
+ * as many as possible to satisfy the request. The actual number
+ * reserved are returned in outval
+ *
+ * A null inval pointer indicates that only the current reserved blocks
+ * available  should  be returned no settings are changed.
+ */
+
+int
+xfs_reserve_blocks(
+	xfs_mount_t             *mp,
+	__uint64_t              *inval,
+	xfs_fsop_resblks_t      *outval)
+{
+	__int64_t		lcounter, delta, fdblks_delta;
+	__uint64_t		request;
+
+	/* If inval is null, report current values and return */
+	if (inval == (__uint64_t *)NULL) {
+		if (!outval)
+			return -EINVAL;
+		outval->resblks = mp->m_resblks;
+		outval->resblks_avail = mp->m_resblks_avail;
+		return 0;
+	}
+
+	request = *inval;
+
+	/*
+	 * With per-cpu counters, this becomes an interesting
+	 * problem. we needto work out if we are freeing or allocation
+	 * blocks first, then we can do the modification as necessary.
+	 *
+	 * We do this under the m_sb_lock so that if we are near
+	 * ENOSPC, we will hold out any changes while we work out
+	 * what to do. This means that the amount of free space can
+	 * change while we do this, so we need to retry if we end up
+	 * trying to reserve more space than is available.
+	 */
+retry:
+	spin_lock(&mp->m_sb_lock);
+
+	/*
+	 * If our previous reservation was larger than the current value,
+	 * then move any unused blocks back to the free pool.
+	 */
+	fdblks_delta = 0;
+	if (mp->m_resblks > request) {
+		lcounter = mp->m_resblks_avail - request;
+		if (lcounter  > 0) {		/* release unused blocks */
+			fdblks_delta = lcounter;
+			mp->m_resblks_avail -= lcounter;
+		}
+		mp->m_resblks = request;
+	} else {
+		__int64_t	free;
+
+		free = percpu_counter_sum(&mp->m_fdblocks) -
+							XFS_ALLOC_SET_ASIDE(mp);
+		if (!free)
+			goto out; /* ENOSPC and fdblks_delta = 0 */
+
+		delta = request - mp->m_resblks;
+		lcounter = free - delta;
+		if (lcounter < 0) {
+			/* We can't satisfy the request, just get what we can */
+			mp->m_resblks += free;
+			mp->m_resblks_avail += free;
+			fdblks_delta = -free;
+		} else {
+			fdblks_delta = -delta;
+			mp->m_resblks = request;
+			mp->m_resblks_avail += delta;
+		}
+	}
+out:
+	if (outval) {
+		outval->resblks = mp->m_resblks;
+		outval->resblks_avail = mp->m_resblks_avail;
+	}
+	spin_unlock(&mp->m_sb_lock);
+
+	if (fdblks_delta) {
+		/*
+		 * If we are putting blocks back here, m_resblks_avail is
+		 * already at its max so this will put it in the free pool.
+		 *
+		 * If we need space, we'll either succeed in getting it
+		 * from the free block count or we'll get an enospc. If
+		 * we get a ENOSPC, it means things changed while we were
+		 * calculating fdblks_delta and so we should try again to
+		 * see if there is anything left to reserve.
+		 *
+		 * Don't set the reserved flag here - we don't want to reserve
+		 * the extra reserve blocks from the reserve.....
+		 */
+		int error;
+		error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+		if (error == -ENOSPC)
+			goto retry;
+	}
+	return 0;
+}
+
+int
+xfs_fs_goingdown(
+	xfs_mount_t	*mp,
+	__uint32_t	inflags)
+{
+	switch (inflags) {
+	case XFS_FSOP_GOING_FLAGS_DEFAULT: {
+		struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
+
+		if (sb && !IS_ERR(sb)) {
+			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+			thaw_bdev(sb->s_bdev, sb);
+		}
+
+		break;
+	}
+	case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
+		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+		break;
+	case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
+		xfs_force_shutdown(mp,
+				SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Force a shutdown of the filesystem instantly while keeping the filesystem
+ * consistent. We don't do an unmount here; just shutdown the shop, make sure
+ * that absolutely nothing persistent happens to this filesystem after this
+ * point.
+ */
+void
+xfs_do_force_shutdown(
+	xfs_mount_t	*mp,
+	int		flags,
+	char		*fname,
+	int		lnnum)
+{
+	int		logerror;
+
+	logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+
+	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+		xfs_notice(mp,
+	"%s(0x%x) called from line %d of file %s.  Return address = 0x%p",
+			__func__, flags, lnnum, fname, __return_address);
+	}
+	/*
+	 * No need to duplicate efforts.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
+		return;
+
+	/*
+	 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
+	 * queue up anybody new on the log reservations, and wakes up
+	 * everybody who's sleeping on log reservations to tell them
+	 * the bad news.
+	 */
+	if (xfs_log_force_umount(mp, logerror))
+		return;
+
+	if (flags & SHUTDOWN_CORRUPT_INCORE) {
+		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
+    "Corruption of in-memory data detected.  Shutting down filesystem");
+		if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
+			xfs_stack_trace();
+	} else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+		if (logerror) {
+			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+		"Log I/O Error Detected.  Shutting down filesystem");
+		} else if (flags & SHUTDOWN_DEVICE_REQ) {
+			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+		"All device paths lost.  Shutting down filesystem");
+		} else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+		"I/O Error Detected. Shutting down filesystem");
+		}
+	}
+	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+		xfs_alert(mp,
+	"Please umount the filesystem and rectify the problem(s)");
+	}
+}
diff --git a/kernel/fs/xfs/xfs_fsops.h b/kernel/fs/xfs/xfs_fsops.h
new file mode 100644
index 000000000..1b6a98b66
--- /dev/null
+++ b/kernel/fs/xfs/xfs_fsops.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_FSOPS_H__
+#define	__XFS_FSOPS_H__
+
+extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion);
+extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
+extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
+extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
+extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
+				xfs_fsop_resblks_t *outval);
+extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
+extern int xfs_fs_log_dummy(struct xfs_mount *mp);
+
+#endif	/* __XFS_FSOPS_H__ */
diff --git a/kernel/fs/xfs/xfs_globals.c b/kernel/fs/xfs/xfs_globals.c
new file mode 100644
index 000000000..4d41b2412
--- /dev/null
+++ b/kernel/fs/xfs/xfs_globals.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_sysctl.h"
+
+/*
+ * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
+ * other XFS code uses these values.  Times are measured in centisecs (i.e.
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
+ */
+xfs_param_t xfs_params = {
+			  /*	MIN		DFLT		MAX	*/
+	.sgid_inherit	= {	0,		0,		1	},
+	.symlink_mode	= {	0,		0,		1	},
+	.panic_mask	= {	0,		0,		255	},
+	.error_level	= {	0,		3,		11	},
+	.syncd_timer	= {	1*100,		30*100,		7200*100},
+	.stats_clear	= {	0,		0,		1	},
+	.inherit_sync	= {	0,		1,		1	},
+	.inherit_nodump	= {	0,		1,		1	},
+	.inherit_noatim = {	0,		1,		1	},
+	.xfs_buf_timer	= {	100/2,		1*100,		30*100	},
+	.xfs_buf_age	= {	1*100,		15*100,		7200*100},
+	.inherit_nosym	= {	0,		0,		1	},
+	.rotorstep	= {	1,		1,		255	},
+	.inherit_nodfrg	= {	0,		1,		1	},
+	.fstrm_timer	= {	1,		30*100,		3600*100},
+	.eofb_timer	= {	1,		300,		3600*24},
+};
+
+struct xfs_globals xfs_globals = {
+	.log_recovery_delay	=	0,	/* no delay by default */
+};
diff --git a/kernel/fs/xfs/xfs_icache.c b/kernel/fs/xfs/xfs_icache.c
new file mode 100644
index 000000000..76a9f2783
--- /dev/null
+++ b/kernel/fs/xfs/xfs_icache.c
@@ -0,0 +1,1418 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+				struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+struct xfs_inode *
+xfs_inode_alloc(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct xfs_inode	*ip;
+
+	/*
+	 * if this didn't occur in transactions, we could use
+	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+	 * code up to do this anyway.
+	 */
+	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	if (!ip)
+		return NULL;
+	if (inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
+
+	XFS_STATS_INC(vn_active);
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+	ASSERT(ip->i_ino == 0);
+
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+	/* initialise the xfs inode */
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+	ip->i_afp = NULL;
+	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+	ip->i_flags = 0;
+	ip->i_delayed_blks = 0;
+	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+	return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+	struct rcu_head		*head)
+{
+	struct inode		*inode = container_of(head, struct inode, i_rcu);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
+void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+	if (ip->i_itemp) {
+		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!xfs_isiflocked(ip));
+	XFS_STATS_DEC(vn_active);
+
+	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found it the cache
+ */
+static int
+xfs_iget_cache_hit(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
+	int			flags,
+	int			lock_flags) __releases(RCU)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error;
+
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = -EAGAIN;
+		goto out_error;
+	}
+
+
+	/*
+	 * If we are racing with another cache hit that is currently
+	 * instantiating this inode or currently recycling it out of
+	 * reclaimabe state, wait for the initialisation to complete
+	 * before continuing.
+	 *
+	 * XXX(hch): eventually we should do something equivalent to
+	 *	     wait_on_inode to wait for these flags to be cleared
+	 *	     instead of polling for it.
+	 */
+	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = -EAGAIN;
+		goto out_error;
+	}
+
+	/*
+	 * If lookup is racing with unlink return an error immediately.
+	 */
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = -ENOENT;
+		goto out_error;
+	}
+
+	/*
+	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+	 * Need to carefully get it back into useable state.
+	 */
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
+		trace_xfs_iget_reclaim(ip);
+
+		/*
+		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+		 * from stomping over us while we recycle the inode.  We can't
+		 * clear the radix tree reclaimable tag yet as it requires
+		 * pag_ici_lock to be held exclusive.
+		 */
+		ip->i_flags |= XFS_IRECLAIM;
+
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+
+		error = inode_init_always(mp->m_super, inode);
+		if (error) {
+			/*
+			 * Re-initializing the inode failed, and we are in deep
+			 * trouble.  Try to re-add it to the reclaim list.
+			 */
+			rcu_read_lock();
+			spin_lock(&ip->i_flags_lock);
+
+			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+			trace_xfs_iget_reclaim_fail(ip);
+			goto out_error;
+		}
+
+		spin_lock(&pag->pag_ici_lock);
+		spin_lock(&ip->i_flags_lock);
+
+		/*
+		 * Clear the per-lifetime state in the inode as we are now
+		 * effectively a new inode and need to return to the initial
+		 * state before reuse occurs.
+		 */
+		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+		ip->i_flags |= XFS_INEW;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+		inode->i_state = I_NEW;
+
+		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+		spin_unlock(&ip->i_flags_lock);
+		spin_unlock(&pag->pag_ici_lock);
+	} else {
+		/* If the VFS inode is being torn down, pause and try again. */
+		if (!igrab(inode)) {
+			trace_xfs_iget_skip(ip);
+			error = -EAGAIN;
+			goto out_error;
+		}
+
+		/* We've got a live one. */
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+		trace_xfs_iget_hit(ip);
+	}
+
+	if (lock_flags != 0)
+		xfs_ilock(ip, lock_flags);
+
+	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+	XFS_STATS_INC(xs_ig_found);
+
+	return 0;
+
+out_error:
+	spin_unlock(&ip->i_flags_lock);
+	rcu_read_unlock();
+	return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	struct xfs_inode	**ipp,
+	int			flags,
+	int			lock_flags)
+{
+	struct xfs_inode	*ip;
+	int			error;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
+	int			iflags;
+
+	ip = xfs_inode_alloc(mp, ino);
+	if (!ip)
+		return -ENOMEM;
+
+	error = xfs_iread(mp, tp, ip, flags);
+	if (error)
+		goto out_destroy;
+
+	trace_xfs_iget_miss(ip);
+
+	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+		error = -ENOENT;
+		goto out_destroy;
+	}
+
+	/*
+	 * Preload the radix tree so we can insert safely under the
+	 * write spinlock. Note that we cannot sleep inside the preload
+	 * region. Since we can be called from transaction context, don't
+	 * recurse into the file system.
+	 */
+	if (radix_tree_preload(GFP_NOFS)) {
+		error = -EAGAIN;
+		goto out_destroy;
+	}
+
+	/*
+	 * Because the inode hasn't been added to the radix-tree yet it can't
+	 * be found by another thread, so we can do the non-sleeping lock here.
+	 */
+	if (lock_flags) {
+		if (!xfs_ilock_nowait(ip, lock_flags))
+			BUG();
+	}
+
+	/*
+	 * These values must be set before inserting the inode into the radix
+	 * tree as the moment it is inserted a concurrent lookup (allowed by the
+	 * RCU locking mechanism) can find it and that lookup must see that this
+	 * is an inode currently under construction (i.e. that XFS_INEW is set).
+	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+	 * memory barrier that ensures this detection works correctly at lookup
+	 * time.
+	 */
+	iflags = XFS_INEW;
+	if (flags & XFS_IGET_DONTCACHE)
+		iflags |= XFS_IDONTCACHE;
+	ip->i_udquot = NULL;
+	ip->i_gdquot = NULL;
+	ip->i_pdquot = NULL;
+	xfs_iflags_set(ip, iflags);
+
+	/* insert the new inode */
+	spin_lock(&pag->pag_ici_lock);
+	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+	if (unlikely(error)) {
+		WARN_ON(error != -EEXIST);
+		XFS_STATS_INC(xs_ig_dup);
+		error = -EAGAIN;
+		goto out_preload_end;
+	}
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+
+	*ipp = ip;
+	return 0;
+
+out_preload_end:
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+out_destroy:
+	__destroy_inode(VFS_I(ip));
+	xfs_inode_free(ip);
+	return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		flags,
+	uint		lock_flags,
+	xfs_inode_t	**ipp)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
+
+	/*
+	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+	 * doesn't get freed while it's being referenced during a
+	 * radix tree traversal here.  It assumes this function
+	 * aqcuires only the ILOCK (and therefore it has no need to
+	 * involve the IOLOCK in this synchronization).
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+	/* reject inode numbers outside existing AGs */
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+		return -EINVAL;
+
+	/* get the perag structure and ensure that it's inode capable */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+	agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+	error = 0;
+	rcu_read_lock();
+	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+	if (ip) {
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	} else {
+		rcu_read_unlock();
+		XFS_STATS_INC(xs_ig_missed);
+
+		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+							flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	}
+	xfs_perag_put(pag);
+
+	*ipp = ip;
+
+	/*
+	 * If we have a real type for an on-disk inode, we can setup the inode
+	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
+	 */
+	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+		xfs_setup_existing_inode(ip);
+	return 0;
+
+out_error_or_again:
+	if (error == -EAGAIN) {
+		delay(1);
+		goto again;
+	}
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH	32
+
+STATIC int
+xfs_inode_ag_walk_grab(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway.  If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
+	/* nothing to sync during shutdown */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EFSCORRUPTED;
+
+	/* If we can't grab the inode, it must on it's way to reclaim. */
+	if (!igrab(inode))
+		return -ENOENT;
+
+	/* inode is valid */
+	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return -ENOENT;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	int			(*execute)(struct xfs_inode *ip, int flags,
+					   void *args),
+	int			flags,
+	void			*args,
+	int			tag)
+{
+	uint32_t		first_index;
+	int			last_error = 0;
+	int			skipped;
+	int			done;
+	int			nr_found;
+
+restart:
+	done = 0;
+	skipped = 0;
+	first_index = 0;
+	nr_found = 0;
+	do {
+		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+		int		error = 0;
+		int		i;
+
+		rcu_read_lock();
+
+		if (tag == -1)
+			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH);
+		else
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **) batch, first_index,
+					XFS_LOOKUP_BATCH, tag);
+
+		if (!nr_found) {
+			rcu_read_unlock();
+			break;
+		}
+
+		/*
+		 * Grab the inodes before we drop the lock. if we found
+		 * nothing, nr == 0 and the loop will be skipped.
+		 */
+		for (i = 0; i < nr_found; i++) {
+			struct xfs_inode *ip = batch[i];
+
+			if (done || xfs_inode_ag_walk_grab(ip))
+				batch[i] = NULL;
+
+			/*
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that lead
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
+			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
+			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+				done = 1;
+		}
+
+		/* unlock now we've grabbed the inodes. */
+		rcu_read_unlock();
+
+		for (i = 0; i < nr_found; i++) {
+			if (!batch[i])
+				continue;
+			error = execute(batch[i], flags, args);
+			IRELE(batch[i]);
+			if (error == -EAGAIN) {
+				skipped++;
+				continue;
+			}
+			if (error && last_error != -EFSCORRUPTED)
+				last_error = error;
+		}
+
+		/* bail out if the filesystem is corrupted.  */
+		if (error == -EFSCORRUPTED)
+			break;
+
+		cond_resched();
+
+	} while (nr_found && !done);
+
+	if (skipped) {
+		delay(1);
+		goto restart;
+	}
+	return last_error;
+}
+
+/*
+ * Background scanning to trim post-EOF preallocated space. This is queued
+ * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
+ */
+STATIC void
+xfs_queue_eofblocks(
+	struct xfs_mount *mp)
+{
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
+		queue_delayed_work(mp->m_eofblocks_workqueue,
+				   &mp->m_eofblocks_work,
+				   msecs_to_jiffies(xfs_eofb_secs * 1000));
+	rcu_read_unlock();
+}
+
+void
+xfs_eofblocks_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+				struct xfs_mount, m_eofblocks_work);
+	xfs_icache_free_eofblocks(mp, NULL);
+	xfs_queue_eofblocks(mp);
+}
+
+int
+xfs_inode_ag_iterator(
+	struct xfs_mount	*mp,
+	int			(*execute)(struct xfs_inode *ip, int flags,
+					   void *args),
+	int			flags,
+	void			*args)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+
+	ag = 0;
+	while ((pag = xfs_perag_get(mp, ag))) {
+		ag = pag->pag_agno + 1;
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
+		xfs_perag_put(pag);
+		if (error) {
+			last_error = error;
+			if (error == -EFSCORRUPTED)
+				break;
+		}
+	}
+	return last_error;
+}
+
+int
+xfs_inode_ag_iterator_tag(
+	struct xfs_mount	*mp,
+	int			(*execute)(struct xfs_inode *ip, int flags,
+					   void *args),
+	int			flags,
+	void			*args,
+	int			tag)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+
+	ag = 0;
+	while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
+		ag = pag->pag_agno + 1;
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+		xfs_perag_put(pag);
+		if (error) {
+			last_error = error;
+			if (error == -EFSCORRUPTED)
+				break;
+		}
+	}
+	return last_error;
+}
+
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+	struct xfs_mount        *mp)
+{
+
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_reclaim_work_queue(mp);
+}
+
+static void
+__xfs_inode_set_reclaim_tag(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	radix_tree_tag_set(&pag->pag_ici_root,
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			   XFS_ICI_RECLAIM_TAG);
+
+	if (!pag->pag_ici_reclaimable) {
+		/* propagate the reclaim tag up into the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_reclaim_work_queue(ip->i_mount);
+
+		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+	pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+	__xfs_inode_set_reclaim_tag(pag, ip);
+	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
+STATIC void
+__xfs_inode_clear_reclaim(
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	pag->pag_ici_reclaimable--;
+	if (!pag->pag_ici_reclaimable) {
+		/* clear the reclaim tag from the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+}
+
+STATIC void
+__xfs_inode_clear_reclaim_tag(
+	xfs_mount_t	*mp,
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	radix_tree_tag_clear(&pag->pag_ici_root,
+			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	__xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
+
+	/*
+	 * If we are asked for non-blocking operation, do unlocked checks to
+	 * see if the inode already is being flushed or in reclaim to avoid
+	 * lock traffic.
+	 */
+	if ((flags & SYNC_TRYLOCK) &&
+	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
+		return 1;
+
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check the
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
+		spin_unlock(&ip->i_flags_lock);
+		return 1;
+	}
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	return 0;
+}
+
+/*
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
+ *
+ *	inode state	     iflush ret		required action
+ *      ---------------      ----------         ---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, async		-		requeue
+ *	dirty, sync		0		reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean.
+ *
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting.  For background relaim, we only
+ * bother to reclaim clean inodes anyway.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned, async	=> requeue
+ *	pinned, sync	=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty, async	=> requeue
+ *	dirty, sync	=> flush, wait and reclaim
+ */
+STATIC int
+xfs_reclaim_inode(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	int			sync_mode)
+{
+	struct xfs_buf		*bp = NULL;
+	int			error;
+
+restart:
+	error = 0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		xfs_iflush_abort(ip, false);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out_ifunlock;
+		xfs_iunpin_wait(ip);
+	}
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/*
+	 * Never flush out dirty data during non-blocking reclaim, as it would
+	 * just contend with AIL pushing trying to do the same job.
+	 */
+	if (!(sync_mode & SYNC_WAIT))
+		goto out_ifunlock;
+
+	/*
+	 * Now we have an inode that needs flushing.
+	 *
+	 * Note that xfs_iflush will never block on the inode buffer lock, as
+	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
+	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
+	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
+	 * result in an ABBA deadlock with xfs_ifree_cluster().
+	 *
+	 * As xfs_ifree_cluser() must gather all inodes that are active in the
+	 * cache to mark them stale, if we hit this case we don't actually want
+	 * to do IO here - we want the inode marked stale so we can simply
+	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
+	 * inode, back off and try again.  Hopefully the next pass through will
+	 * see the stale flag set on the inode.
+	 */
+	error = xfs_iflush(ip, &bp);
+	if (error == -EAGAIN) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		/* backoff longer than in xfs_ifree_cluster */
+		delay(2);
+		goto restart;
+	}
+
+	if (!error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+	}
+
+	xfs_iflock(ip);
+reclaim:
+	xfs_ifunlock(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	XFS_STATS_INC(xs_ig_reclaims);
+	/*
+	 * Remove the inode from the per-AG radix tree.
+	 *
+	 * Because radix_tree_delete won't complain even if the item was never
+	 * added to the tree assert that it's been there before to catch
+	 * problems with the inode life time early on.
+	 */
+	spin_lock(&pag->pag_ici_lock);
+	if (!radix_tree_delete(&pag->pag_ici_root,
+				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+		ASSERT(0);
+	__xfs_inode_clear_reclaim(pag, ip);
+	spin_unlock(&pag->pag_ici_lock);
+
+	/*
+	 * Here we do an (almost) spurious inode lock in order to coordinate
+	 * with inode cache radix tree lookups.  This is because the lookup
+	 * can reference the inodes in the cache without taking references.
+	 *
+	 * We make that OK here by ensuring that we wait until the inode is
+	 * unlocked after the lookup before we go ahead and free it.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_qm_dqdetach(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	xfs_inode_free(ip);
+	return error;
+
+out_ifunlock:
+	xfs_ifunlock(ip);
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return -EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and the reclaim work never goes back to
+	 * the idle state. Instead, return 0 to let the next scheduled
+	 * background reclaim attempt to reclaim the inode again.
+	 */
+	return 0;
+}
+
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+STATIC int
+xfs_reclaim_inodes_ag(
+	struct xfs_mount	*mp,
+	int			flags,
+	int			*nr_to_scan)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+	int			trylock = flags & SYNC_TRYLOCK;
+	int			skipped;
+
+restart:
+	ag = 0;
+	skipped = 0;
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		unsigned long	first_index = 0;
+		int		done = 0;
+		int		nr_found = 0;
+
+		ag = pag->pag_agno + 1;
+
+		if (trylock) {
+			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+				skipped++;
+				xfs_perag_put(pag);
+				continue;
+			}
+			first_index = pag->pag_ici_reclaim_cursor;
+		} else
+			mutex_lock(&pag->pag_ici_reclaim_lock);
+
+		do {
+			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+			int	i;
+
+			rcu_read_lock();
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH,
+					XFS_ICI_RECLAIM_TAG);
+			if (!nr_found) {
+				done = 1;
+				rcu_read_unlock();
+				break;
+			}
+
+			/*
+			 * Grab the inodes before we drop the lock. if we found
+			 * nothing, nr == 0 and the loop will be skipped.
+			 */
+			for (i = 0; i < nr_found; i++) {
+				struct xfs_inode *ip = batch[i];
+
+				if (done || xfs_reclaim_inode_grab(ip, flags))
+					batch[i] = NULL;
+
+				/*
+				 * Update the index for the next lookup. Catch
+				 * overflows into the next AG range which can
+				 * occur if we have inodes in the last block of
+				 * the AG and we are currently pointing to the
+				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that lead us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
+				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+								pag->pag_agno)
+					continue;
+				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+					done = 1;
+			}
+
+			/* unlock now we've grabbed the inodes. */
+			rcu_read_unlock();
+
+			for (i = 0; i < nr_found; i++) {
+				if (!batch[i])
+					continue;
+				error = xfs_reclaim_inode(batch[i], pag, flags);
+				if (error && last_error != -EFSCORRUPTED)
+					last_error = error;
+			}
+
+			*nr_to_scan -= XFS_LOOKUP_BATCH;
+
+			cond_resched();
+
+		} while (nr_found && !done && *nr_to_scan > 0);
+
+		if (trylock && !done)
+			pag->pag_ici_reclaim_cursor = first_index;
+		else
+			pag->pag_ici_reclaim_cursor = 0;
+		mutex_unlock(&pag->pag_ici_reclaim_lock);
+		xfs_perag_put(pag);
+	}
+
+	/*
+	 * if we skipped any AG, and we still have scan count remaining, do
+	 * another pass this time using blocking reclaim semantics (i.e
+	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
+	 * ensure that when we get more reclaimers than AGs we block rather
+	 * than spin trying to execute reclaim.
+	 */
+	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
+		trylock = 0;
+		goto restart;
+	}
+	return last_error;
+}
+
+int
+xfs_reclaim_inodes(
+	xfs_mount_t	*mp,
+	int		mode)
+{
+	int		nr_to_scan = INT_MAX;
+
+	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+}
+
+/*
+ * Scan a certain number of inodes for reclaim.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ */
+long
+xfs_reclaim_inodes_nr(
+	struct xfs_mount	*mp,
+	int			nr_to_scan)
+{
+	/* kick background reclaimer and push the AIL */
+	xfs_reclaim_work_queue(mp);
+	xfs_ail_push_all(mp->m_ail);
+
+	return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+}
+
+/*
+ * Return the number of reclaimable inodes in the filesystem for
+ * the shrinker to determine how much to reclaim.
+ */
+int
+xfs_reclaim_inodes_count(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		ag = 0;
+	int			reclaimable = 0;
+
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		ag = pag->pag_agno + 1;
+		reclaimable += pag->pag_ici_reclaimable;
+		xfs_perag_put(pag);
+	}
+	return reclaimable;
+}
+
+STATIC int
+xfs_inode_match_id(
+	struct xfs_inode	*ip,
+	struct xfs_eofblocks	*eofb)
+{
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
+		return 0;
+
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
+		return 0;
+
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
+	    xfs_get_projid(ip) != eofb->eof_prid)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * A union-based inode filtering algorithm. Process the inode if any of the
+ * criteria match. This is for global/internal scans only.
+ */
+STATIC int
+xfs_inode_match_id_union(
+	struct xfs_inode	*ip,
+	struct xfs_eofblocks	*eofb)
+{
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+	    uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
+		return 1;
+
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+	    gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
+		return 1;
+
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
+	    xfs_get_projid(ip) == eofb->eof_prid)
+		return 1;
+
+	return 0;
+}
+
+STATIC int
+xfs_inode_free_eofblocks(
+	struct xfs_inode	*ip,
+	int			flags,
+	void			*args)
+{
+	int ret;
+	struct xfs_eofblocks *eofb = args;
+	bool need_iolock = true;
+	int match;
+
+	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+
+	if (!xfs_can_free_eofblocks(ip, false)) {
+		/* inode could be preallocated or append-only */
+		trace_xfs_inode_free_eofblocks_invalid(ip);
+		xfs_inode_clear_eofblocks_tag(ip);
+		return 0;
+	}
+
+	/*
+	 * If the mapping is dirty the operation can block and wait for some
+	 * time. Unless we are waiting, skip it.
+	 */
+	if (!(flags & SYNC_WAIT) &&
+	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+		return 0;
+
+	if (eofb) {
+		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+			match = xfs_inode_match_id_union(ip, eofb);
+		else
+			match = xfs_inode_match_id(ip, eofb);
+		if (!match)
+			return 0;
+
+		/* skip the inode if the file size is too small */
+		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
+			return 0;
+
+		/*
+		 * A scan owner implies we already hold the iolock. Skip it in
+		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+		 * the possibility of EAGAIN being returned.
+		 */
+		if (eofb->eof_scan_owner == ip->i_ino)
+			need_iolock = false;
+	}
+
+	ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
+
+	/* don't revisit the inode if we're not waiting */
+	if (ret == -EAGAIN && !(flags & SYNC_WAIT))
+		ret = 0;
+
+	return ret;
+}
+
+int
+xfs_icache_free_eofblocks(
+	struct xfs_mount	*mp,
+	struct xfs_eofblocks	*eofb)
+{
+	int flags = SYNC_TRYLOCK;
+
+	if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
+		flags = SYNC_WAIT;
+
+	return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
+					 eofb, XFS_ICI_EOFBLOCKS_TAG);
+}
+
+/*
+ * Run eofblocks scans on the quotas applicable to the inode. For inodes with
+ * multiple quotas, we don't know exactly which quota caused an allocation
+ * failure. We make a best effort by including each quota under low free space
+ * conditions (less than 1% free space) in the scan.
+ */
+int
+xfs_inode_free_quota_eofblocks(
+	struct xfs_inode *ip)
+{
+	int scan = 0;
+	struct xfs_eofblocks eofb = {0};
+	struct xfs_dquot *dq;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+	/*
+	 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
+	 * can repeatedly trylock on the inode we're currently processing. We
+	 * run a sync scan to increase effectiveness and use the union filter to
+	 * cover all applicable quotas in a single scan.
+	 */
+	eofb.eof_scan_owner = ip->i_ino;
+	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
+
+	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
+		dq = xfs_inode_dquot(ip, XFS_DQ_USER);
+		if (dq && xfs_dquot_lowsp(dq)) {
+			eofb.eof_uid = VFS_I(ip)->i_uid;
+			eofb.eof_flags |= XFS_EOF_FLAGS_UID;
+			scan = 1;
+		}
+	}
+
+	if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
+		dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
+		if (dq && xfs_dquot_lowsp(dq)) {
+			eofb.eof_gid = VFS_I(ip)->i_gid;
+			eofb.eof_flags |= XFS_EOF_FLAGS_GID;
+			scan = 1;
+		}
+	}
+
+	if (scan)
+		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+
+	return scan;
+}
+
+void
+xfs_inode_set_eofblocks_tag(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
+	int tagged;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	trace_xfs_inode_set_eofblocks_tag(ip);
+
+	tagged = radix_tree_tagged(&pag->pag_ici_root,
+				   XFS_ICI_EOFBLOCKS_TAG);
+	radix_tree_tag_set(&pag->pag_ici_root,
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			   XFS_ICI_EOFBLOCKS_TAG);
+	if (!tagged) {
+		/* propagate the eofblocks tag up into the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				   XFS_ICI_EOFBLOCKS_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* kick off background trimming */
+		xfs_queue_eofblocks(ip->i_mount);
+
+		trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
+					      -1, _RET_IP_);
+	}
+
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
+void
+xfs_inode_clear_eofblocks_tag(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	trace_xfs_inode_clear_eofblocks_tag(ip);
+
+	radix_tree_tag_clear(&pag->pag_ici_root,
+			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			     XFS_ICI_EOFBLOCKS_TAG);
+	if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+		/* clear the eofblocks tag from the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				     XFS_ICI_EOFBLOCKS_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
+					       -1, _RET_IP_);
+	}
+
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
diff --git a/kernel/fs/xfs/xfs_icache.h b/kernel/fs/xfs/xfs_icache.h
new file mode 100644
index 000000000..62f1f91c3
--- /dev/null
+++ b/kernel/fs/xfs/xfs_icache.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+struct xfs_eofblocks {
+	__u32		eof_flags;
+	kuid_t		eof_uid;
+	kgid_t		eof_gid;
+	prid_t		eof_prid;
+	__u64		eof_min_file_size;
+	xfs_ino_t	eof_scan_owner;
+};
+
+#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
+#define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
+
+/*
+ * tags for inode radix tree
+ */
+#define XFS_ICI_NO_TAG		(-1)	/* special flag for an untagged lookup
+					   in xfs_inode_ag_iterator */
+#define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG	1	/* inode has blocks beyond EOF */
+
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE		0x1
+#define XFS_IGET_UNTRUSTED	0x2
+#define XFS_IGET_DONTCACHE	0x4
+
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+	     uint flags, uint lock_flags, xfs_inode_t **ipp);
+
+/* recovery needs direct inode allocation capability */
+struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
+void xfs_inode_free(struct xfs_inode *ip);
+
+void xfs_reclaim_worker(struct work_struct *work);
+
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+int xfs_reclaim_inodes_count(struct xfs_mount *mp);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+
+void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
+void xfs_eofblocks_worker(struct work_struct *);
+
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+	int (*execute)(struct xfs_inode *ip, int flags, void *args),
+	int flags, void *args);
+int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
+	int (*execute)(struct xfs_inode *ip, int flags, void *args),
+	int flags, void *args, int tag);
+
+static inline int
+xfs_fs_eofblocks_from_user(
+	struct xfs_fs_eofblocks		*src,
+	struct xfs_eofblocks		*dst)
+{
+	if (src->eof_version != XFS_EOFBLOCKS_VERSION)
+		return -EINVAL;
+
+	if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
+		return -EINVAL;
+
+	if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
+	    memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+		return -EINVAL;
+
+	dst->eof_flags = src->eof_flags;
+	dst->eof_prid = src->eof_prid;
+	dst->eof_min_file_size = src->eof_min_file_size;
+	dst->eof_scan_owner = NULLFSINO;
+
+	dst->eof_uid = INVALID_UID;
+	if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+		dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
+		if (!uid_valid(dst->eof_uid))
+			return -EINVAL;
+	}
+
+	dst->eof_gid = INVALID_GID;
+	if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+		dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
+		if (!gid_valid(dst->eof_gid))
+			return -EINVAL;
+	}
+	return 0;
+}
+
+#endif
diff --git a/kernel/fs/xfs/xfs_icreate_item.c b/kernel/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000..d45ca72af
--- /dev/null
+++ b/kernel/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_icreate_item.h"
+#include "xfs_log.h"
+
+kmem_zone_t	*xfs_icreate_zone;		/* inode create item zone */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC void
+xfs_icreate_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += sizeof(struct xfs_icreate_log);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
+			&icp->ic_format,
+			sizeof(struct xfs_icreate_log));
+}
+
+
+/* Pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+
+/* pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+}
+
+STATIC void
+xfs_icreate_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
+
+	if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+		kmem_zone_free(xfs_icreate_zone, icp);
+	return;
+}
+
+/*
+ * Because we have ordered buffers being tracked in the AIL for the inode
+ * creation, we don't need the create item after this. Hence we can free
+ * the log item and return -1 to tell the caller we're done with the item.
+ */
+STATIC xfs_lsn_t
+xfs_icreate_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
+
+	kmem_zone_free(xfs_icreate_zone, icp);
+	return (xfs_lsn_t)-1;
+}
+
+/* item can never get into the AIL */
+STATIC uint
+xfs_icreate_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	ASSERT(0);
+	return XFS_ITEM_SUCCESS;
+}
+
+/* Ordered buffers do the dependency tracking here, so this does nothing. */
+STATIC void
+xfs_icreate_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_icreate_item_ops = {
+	.iop_size	= xfs_icreate_item_size,
+	.iop_format	= xfs_icreate_item_format,
+	.iop_pin	= xfs_icreate_item_pin,
+	.iop_unpin	= xfs_icreate_item_unpin,
+	.iop_push	= xfs_icreate_item_push,
+	.iop_unlock	= xfs_icreate_item_unlock,
+	.iop_committed	= xfs_icreate_item_committed,
+	.iop_committing = xfs_icreate_item_committing,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	unsigned int		count,
+	unsigned int		inode_size,
+	xfs_agblock_t		length,
+	unsigned int		generation)
+{
+	struct xfs_icreate_item	*icp;
+
+	icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+
+	xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+			  &xfs_icreate_item_ops);
+
+	icp->ic_format.icl_type = XFS_LI_ICREATE;
+	icp->ic_format.icl_size = 1;	/* single vector */
+	icp->ic_format.icl_ag = cpu_to_be32(agno);
+	icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+	icp->ic_format.icl_count = cpu_to_be32(count);
+	icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+	icp->ic_format.icl_length = cpu_to_be32(length);
+	icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+	xfs_trans_add_item(tp, &icp->ic_item);
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
diff --git a/kernel/fs/xfs/xfs_icreate_item.h b/kernel/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000..59e89f87c
--- /dev/null
+++ b/kernel/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2008-2010, Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H	1
+
+/* in memory log item structure */
+struct xfs_icreate_item {
+	struct xfs_log_item	ic_item;
+	struct xfs_icreate_log	ic_format;
+};
+
+extern kmem_zone_t *xfs_icreate_zone;	/* inode create item zone */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+			xfs_agblock_t agbno, unsigned int count,
+			unsigned int inode_size, xfs_agblock_t length,
+			unsigned int generation);
+
+#endif	/* XFS_ICREATE_ITEM_H */
diff --git a/kernel/fs/xfs/xfs_inode.c b/kernel/fs/xfs/xfs_inode.c
new file mode 100644
index 000000000..539a85fdd
--- /dev/null
+++ b/kernel/fs/xfs/xfs_inode.c
@@ -0,0 +1,3606 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/log2.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_filestream.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
+
+kmem_zone_t *xfs_inode_zone;
+
+/*
+ * Used in xfs_itruncate_extents().  This is the maximum number of extents
+ * freed from a file in a single transaction.
+ */
+#define	XFS_ITRUNC_MAX_EXTENTS	2
+
+STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
+
+STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
+
+/*
+ * helper function to extract extent size hint from inode
+ */
+xfs_extlen_t
+xfs_get_extsz_hint(
+	struct xfs_inode	*ip)
+{
+	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
+		return ip->i_d.di_extsize;
+	if (XFS_IS_REALTIME_INODE(ip))
+		return ip->i_mount->m_sb.sb_rextsize;
+	return 0;
+}
+
+/*
+ * These two are wrapper routines around the xfs_ilock() routine used to
+ * centralize some grungy code.  They are used in places that wish to lock the
+ * inode solely for reading the extents.  The reason these places can't just
+ * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
+ * bringing in of the extents from disk for a file in b-tree format.  If the
+ * inode is in b-tree format, then we need to lock the inode exclusively until
+ * the extents are read in.  Locking it exclusively all the time would limit
+ * our parallelism unnecessarily, though.  What we do instead is check to see
+ * if the extents have been read in yet, and only lock the inode exclusively
+ * if they have not.
+ *
+ * The functions return a value which should be given to the corresponding
+ * xfs_iunlock() call.
+ */
+uint
+xfs_ilock_data_map_shared(
+	struct xfs_inode	*ip)
+{
+	uint			lock_mode = XFS_ILOCK_SHARED;
+
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+	    (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
+		lock_mode = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, lock_mode);
+	return lock_mode;
+}
+
+uint
+xfs_ilock_attr_map_shared(
+	struct xfs_inode	*ip)
+{
+	uint			lock_mode = XFS_ILOCK_SHARED;
+
+	if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
+	    (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
+		lock_mode = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, lock_mode);
+	return lock_mode;
+}
+
+/*
+ * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and
+ * the i_lock.  This routine allows various combinations of the locks to be
+ * obtained.
+ *
+ * The 3 locks should always be ordered so that the IO lock is obtained first,
+ * the mmap lock second and the ilock last in order to prevent deadlock.
+ *
+ * Basic locking order:
+ *
+ * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ *
+ * mmap_sem locking order:
+ *
+ * i_iolock -> page lock -> mmap_sem
+ * mmap_sem -> i_mmap_lock -> page_lock
+ *
+ * The difference in mmap_sem locking order mean that we cannot hold the
+ * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * in get_user_pages() to map the user pages into the kernel address space for
+ * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * page faults already hold the mmap_sem.
+ *
+ * Hence to serialise fully against both syscall and mmap based IO, we need to
+ * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * taken in places where we need to invalidate the page cache in a race
+ * free manner (e.g. truncate, hole punch and other extent manipulation
+ * functions).
+ */
+void
+xfs_ilock(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_ILOCK_SHARED)
+		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep.  It returns 1 if it gets
+ * the requested locks and 0 otherwise.  If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       to be locked.  See the comment for xfs_ilock() for a list
+ *	 of valid values.
+ */
+int
+xfs_ilock_nowait(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_iolock))
+			goto out;
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_iolock))
+			goto out;
+	}
+
+	if (lock_flags & XFS_MMAPLOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_mmaplock))
+			goto out_undo_iolock;
+	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_mmaplock))
+			goto out_undo_iolock;
+	}
+
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_lock))
+			goto out_undo_mmaplock;
+	} else if (lock_flags & XFS_ILOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_lock))
+			goto out_undo_mmaplock;
+	}
+	return 1;
+
+out_undo_mmaplock:
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrunlock_excl(&ip->i_mmaplock);
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mrunlock_shared(&ip->i_mmaplock);
+out_undo_iolock:
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrunlock_excl(&ip->i_iolock);
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mrunlock_shared(&ip->i_iolock);
+out:
+	return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       to be unlocked.  See the comment for xfs_ilock() for a list
+ *	 of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+	ASSERT(lock_flags != 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrunlock_excl(&ip->i_iolock);
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mrunlock_shared(&ip->i_iolock);
+
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrunlock_excl(&ip->i_mmaplock);
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mrunlock_shared(&ip->i_mmaplock);
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrunlock_excl(&ip->i_lock);
+	else if (lock_flags & XFS_ILOCK_SHARED)
+		mrunlock_shared(&ip->i_lock);
+
+	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * give up write locks.  the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags &
+		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrdemote(&ip->i_lock);
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrdemote(&ip->i_mmaplock);
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrdemote(&ip->i_iolock);
+
+	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int
+xfs_isilocked(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+		if (!(lock_flags & XFS_ILOCK_SHARED))
+			return !!ip->i_lock.mr_writer;
+		return rwsem_is_locked(&ip->i_lock.mr_lock);
+	}
+
+	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
+		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
+			return !!ip->i_mmaplock.mr_writer;
+		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+	}
+
+	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+		if (!(lock_flags & XFS_IOLOCK_SHARED))
+			return !!ip->i_iolock.mr_writer;
+		return rwsem_is_locked(&ip->i_iolock.mr_lock);
+	}
+
+	ASSERT(0);
+	return 0;
+}
+#endif
+
+#ifdef DEBUG
+int xfs_locked_n;
+int xfs_small_retries;
+int xfs_middle_retries;
+int xfs_lots_retries;
+int xfs_lock_delays;
+#endif
+
+/*
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
+ * value. This shouldn't be called for page fault locking, but we also need to
+ * ensure we don't overrun the number of lockdep subclasses for the iolock or
+ * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
+ */
+static inline int
+xfs_lock_inumorder(int lock_mode, int subclass)
+{
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+		ASSERT(subclass + XFS_LOCK_INUMORDER <
+			(1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
+		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+	}
+
+	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
+		ASSERT(subclass + XFS_LOCK_INUMORDER <
+			(1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
+		lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
+							XFS_MMAPLOCK_SHIFT;
+	}
+
+	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
+		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
+
+	return lock_mode;
+}
+
+/*
+ * The following routine will lock n inodes in exclusive mode.  We assume the
+ * caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock is in the AIL and we
+ * start waiting for another inode that is locked by a thread in a long running
+ * transaction (such as truncate). This can result in deadlock since the long
+ * running trans might need to wait for the inode we just locked in order to
+ * push the tail and free space in the log.
+ */
+void
+xfs_lock_inodes(
+	xfs_inode_t	**ips,
+	int		inodes,
+	uint		lock_mode)
+{
+	int		attempts = 0, i, j, try_lock;
+	xfs_log_item_t	*lp;
+
+	/* currently supports between 2 and 5 inodes */
+	ASSERT(ips && inodes >= 2 && inodes <= 5);
+
+	try_lock = 0;
+	i = 0;
+again:
+	for (; i < inodes; i++) {
+		ASSERT(ips[i]);
+
+		if (i && (ips[i] == ips[i - 1]))	/* Already locked */
+			continue;
+
+		/*
+		 * If try_lock is not set yet, make sure all locked inodes are
+		 * not in the AIL.  If any are, set try_lock to be used later.
+		 */
+		if (!try_lock) {
+			for (j = (i - 1); j >= 0 && !try_lock; j--) {
+				lp = (xfs_log_item_t *)ips[j]->i_itemp;
+				if (lp && (lp->li_flags & XFS_LI_IN_AIL))
+					try_lock++;
+			}
+		}
+
+		/*
+		 * If any of the previous locks we have locked is in the AIL,
+		 * we must TRY to get the second and subsequent locks. If
+		 * we can't get any, we must release all we have
+		 * and try again.
+		 */
+		if (!try_lock) {
+			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
+			continue;
+		}
+
+		/* try_lock means we have an inode locked that is in the AIL. */
+		ASSERT(i != 0);
+		if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
+			continue;
+
+		/*
+		 * Unlock all previous guys and try again.  xfs_iunlock will try
+		 * to push the tail if the inode is in the AIL.
+		 */
+		attempts++;
+		for (j = i - 1; j >= 0; j--) {
+			/*
+			 * Check to see if we've already unlocked this one.  Not
+			 * the first one going back, and the inode ptr is the
+			 * same.
+			 */
+			if (j != (i - 1) && ips[j] == ips[j + 1])
+				continue;
+
+			xfs_iunlock(ips[j], lock_mode);
+		}
+
+		if ((attempts % 5) == 0) {
+			delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+			xfs_lock_delays++;
+#endif
+		}
+		i = 0;
+		try_lock = 0;
+		goto again;
+	}
+
+#ifdef DEBUG
+	if (attempts) {
+		if (attempts < 5) xfs_small_retries++;
+		else if (attempts < 100) xfs_middle_retries++;
+		else xfs_lots_retries++;
+	} else {
+		xfs_locked_n++;
+	}
+#endif
+}
+
+/*
+ * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
+ */
+void
+xfs_lock_two_inodes(
+	xfs_inode_t		*ip0,
+	xfs_inode_t		*ip1,
+	uint			lock_mode)
+{
+	xfs_inode_t		*temp;
+	int			attempts = 0;
+	xfs_log_item_t		*lp;
+
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+		ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+	} else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+
+	ASSERT(ip0->i_ino != ip1->i_ino);
+
+	if (ip0->i_ino > ip1->i_ino) {
+		temp = ip0;
+		ip0 = ip1;
+		ip1 = temp;
+	}
+
+ again:
+	xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
+
+	/*
+	 * If the first lock we have locked is in the AIL, we must TRY to get
+	 * the second lock. If we can't get it, we must release the first one
+	 * and try again.
+	 */
+	lp = (xfs_log_item_t *)ip0->i_itemp;
+	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+			xfs_iunlock(ip0, lock_mode);
+			if ((++attempts % 5) == 0)
+				delay(1); /* Don't just spin the CPU */
+			goto again;
+		}
+	} else {
+		xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
+	}
+}
+
+
+void
+__xfs_iflock(
+	struct xfs_inode	*ip)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+	do {
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (xfs_isiflocked(ip))
+			io_schedule();
+	} while (!xfs_iflock_nowait(ip));
+
+	finish_wait(wq, &wait.wait);
+}
+
+STATIC uint
+_xfs_dic2xflags(
+	__uint16_t		di_flags)
+{
+	uint			flags = 0;
+
+	if (di_flags & XFS_DIFLAG_ANY) {
+		if (di_flags & XFS_DIFLAG_REALTIME)
+			flags |= XFS_XFLAG_REALTIME;
+		if (di_flags & XFS_DIFLAG_PREALLOC)
+			flags |= XFS_XFLAG_PREALLOC;
+		if (di_flags & XFS_DIFLAG_IMMUTABLE)
+			flags |= XFS_XFLAG_IMMUTABLE;
+		if (di_flags & XFS_DIFLAG_APPEND)
+			flags |= XFS_XFLAG_APPEND;
+		if (di_flags & XFS_DIFLAG_SYNC)
+			flags |= XFS_XFLAG_SYNC;
+		if (di_flags & XFS_DIFLAG_NOATIME)
+			flags |= XFS_XFLAG_NOATIME;
+		if (di_flags & XFS_DIFLAG_NODUMP)
+			flags |= XFS_XFLAG_NODUMP;
+		if (di_flags & XFS_DIFLAG_RTINHERIT)
+			flags |= XFS_XFLAG_RTINHERIT;
+		if (di_flags & XFS_DIFLAG_PROJINHERIT)
+			flags |= XFS_XFLAG_PROJINHERIT;
+		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
+			flags |= XFS_XFLAG_NOSYMLINKS;
+		if (di_flags & XFS_DIFLAG_EXTSIZE)
+			flags |= XFS_XFLAG_EXTSIZE;
+		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
+			flags |= XFS_XFLAG_EXTSZINHERIT;
+		if (di_flags & XFS_DIFLAG_NODEFRAG)
+			flags |= XFS_XFLAG_NODEFRAG;
+		if (di_flags & XFS_DIFLAG_FILESTREAM)
+			flags |= XFS_XFLAG_FILESTREAM;
+	}
+
+	return flags;
+}
+
+uint
+xfs_ip2xflags(
+	xfs_inode_t		*ip)
+{
+	xfs_icdinode_t		*dic = &ip->i_d;
+
+	return _xfs_dic2xflags(dic->di_flags) |
+				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
+}
+
+uint
+xfs_dic2xflags(
+	xfs_dinode_t		*dip)
+{
+	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
+				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
+}
+
+/*
+ * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to a the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
+ */
+int
+xfs_lookup(
+	xfs_inode_t		*dp,
+	struct xfs_name		*name,
+	xfs_inode_t		**ipp,
+	struct xfs_name		*ci_name)
+{
+	xfs_ino_t		inum;
+	int			error;
+	uint			lock_mode;
+
+	trace_xfs_lookup(dp, name);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return -EIO;
+
+	lock_mode = xfs_ilock_data_map_shared(dp);
+	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
+	xfs_iunlock(dp, lock_mode);
+
+	if (error)
+		goto out;
+
+	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
+	if (error)
+		goto out_free_name;
+
+	return 0;
+
+out_free_name:
+	if (ci_name)
+		kmem_free(ci_name->name);
+out:
+	*ipp = NULL;
+	return error;
+}
+
+/*
+ * Allocate an inode on disk and return a copy of its in-core version.
+ * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
+ * appropriately within the inode.  The uid and gid for the inode are
+ * set according to the contents of the given cred structure.
+ *
+ * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
+ * has a free inode available, call xfs_iget() to obtain the in-core
+ * version of the allocated inode.  Finally, fill in the inode and
+ * log its initial contents.  In this case, ialloc_context would be
+ * set to NULL.
+ *
+ * If xfs_dialloc() does not have an available inode, it will replenish
+ * its supply by doing an allocation. Since we can only do one
+ * allocation within a transaction without deadlocks, we must commit
+ * the current transaction before returning the inode itself.
+ * In this case, therefore, we will set ialloc_context and return.
+ * The caller should then commit the current transaction, start a new
+ * transaction, and call xfs_ialloc() again to actually get the inode.
+ *
+ * To ensure that some other process does not grab the inode that
+ * was allocated during the first call to xfs_ialloc(), this routine
+ * also returns the [locked] bp pointing to the head of the freelist
+ * as ialloc_context.  The caller should hold this buffer across
+ * the commit and pass it back into this routine on the second call.
+ *
+ * If we are allocating quota inodes, we do not have a parent inode
+ * to attach to or associate with (i.e. pip == NULL) because they
+ * are not linked into the directory structure - they are attached
+ * directly to the superblock - and so have no parent.
+ */
+int
+xfs_ialloc(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*pip,
+	umode_t		mode,
+	xfs_nlink_t	nlink,
+	xfs_dev_t	rdev,
+	prid_t		prid,
+	int		okalloc,
+	xfs_buf_t	**ialloc_context,
+	xfs_inode_t	**ipp)
+{
+	struct xfs_mount *mp = tp->t_mountp;
+	xfs_ino_t	ino;
+	xfs_inode_t	*ip;
+	uint		flags;
+	int		error;
+	struct timespec	tv;
+
+	/*
+	 * Call the space management code to pick
+	 * the on-disk inode to be allocated.
+	 */
+	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
+			    ialloc_context, &ino);
+	if (error)
+		return error;
+	if (*ialloc_context || ino == NULLFSINO) {
+		*ipp = NULL;
+		return 0;
+	}
+	ASSERT(*ialloc_context == NULL);
+
+	/*
+	 * Get the in-core inode with the lock held exclusively.
+	 * This is because we're setting fields here we need
+	 * to prevent others from looking at until we're done.
+	 */
+	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
+			 XFS_ILOCK_EXCL, &ip);
+	if (error)
+		return error;
+	ASSERT(ip != NULL);
+
+	/*
+	 * We always convert v1 inodes to v2 now - we only support filesystems
+	 * with >= v2 inode capability, so there is no reason for ever leaving
+	 * an inode in v1 format.
+	 */
+	if (ip->i_d.di_version == 1)
+		ip->i_d.di_version = 2;
+
+	ip->i_d.di_mode = mode;
+	ip->i_d.di_onlink = 0;
+	ip->i_d.di_nlink = nlink;
+	ASSERT(ip->i_d.di_nlink == nlink);
+	ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
+	ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
+	xfs_set_projid(ip, prid);
+	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+
+	if (pip && XFS_INHERIT_GID(pip)) {
+		ip->i_d.di_gid = pip->i_d.di_gid;
+		if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
+			ip->i_d.di_mode |= S_ISGID;
+		}
+	}
+
+	/*
+	 * If the group ID of the new file does not match the effective group
+	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
+	 * (and only if the irix_sgid_inherit compatibility variable is set).
+	 */
+	if ((irix_sgid_inherit) &&
+	    (ip->i_d.di_mode & S_ISGID) &&
+	    (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
+		ip->i_d.di_mode &= ~S_ISGID;
+	}
+
+	ip->i_d.di_size = 0;
+	ip->i_d.di_nextents = 0;
+	ASSERT(ip->i_d.di_nblocks == 0);
+
+	tv = current_fs_time(mp->m_super);
+	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
+	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+	ip->i_d.di_atime = ip->i_d.di_mtime;
+	ip->i_d.di_ctime = ip->i_d.di_mtime;
+
+	/*
+	 * di_gen will have been taken care of in xfs_iread.
+	 */
+	ip->i_d.di_extsize = 0;
+	ip->i_d.di_dmevmask = 0;
+	ip->i_d.di_dmstate = 0;
+	ip->i_d.di_flags = 0;
+
+	if (ip->i_d.di_version == 3) {
+		ASSERT(ip->i_d.di_ino == ino);
+		ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+		ip->i_d.di_crc = 0;
+		ip->i_d.di_changecount = 1;
+		ip->i_d.di_lsn = 0;
+		ip->i_d.di_flags2 = 0;
+		memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
+		ip->i_d.di_crtime = ip->i_d.di_mtime;
+	}
+
+
+	flags = XFS_ILOG_CORE;
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
+		ip->i_df.if_u2.if_rdev = rdev;
+		ip->i_df.if_flags = 0;
+		flags |= XFS_ILOG_DEV;
+		break;
+	case S_IFREG:
+	case S_IFDIR:
+		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
+			uint	di_flags = 0;
+
+			if (S_ISDIR(mode)) {
+				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
+					di_flags |= XFS_DIFLAG_RTINHERIT;
+				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+					ip->i_d.di_extsize = pip->i_d.di_extsize;
+				}
+				if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+					di_flags |= XFS_DIFLAG_PROJINHERIT;
+			} else if (S_ISREG(mode)) {
+				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
+					di_flags |= XFS_DIFLAG_REALTIME;
+				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+					di_flags |= XFS_DIFLAG_EXTSIZE;
+					ip->i_d.di_extsize = pip->i_d.di_extsize;
+				}
+			}
+			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
+			    xfs_inherit_noatime)
+				di_flags |= XFS_DIFLAG_NOATIME;
+			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
+			    xfs_inherit_nodump)
+				di_flags |= XFS_DIFLAG_NODUMP;
+			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
+			    xfs_inherit_sync)
+				di_flags |= XFS_DIFLAG_SYNC;
+			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
+			    xfs_inherit_nosymlinks)
+				di_flags |= XFS_DIFLAG_NOSYMLINKS;
+			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+			    xfs_inherit_nodefrag)
+				di_flags |= XFS_DIFLAG_NODEFRAG;
+			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
+				di_flags |= XFS_DIFLAG_FILESTREAM;
+			ip->i_d.di_flags |= di_flags;
+		}
+		/* FALLTHROUGH */
+	case S_IFLNK:
+		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+		ip->i_df.if_flags = XFS_IFEXTENTS;
+		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
+		ip->i_df.if_u1.if_extents = NULL;
+		break;
+	default:
+		ASSERT(0);
+	}
+	/*
+	 * Attribute fork settings for new inode.
+	 */
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	ip->i_d.di_anextents = 0;
+
+	/*
+	 * Log the new values stuffed into the inode.
+	 */
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, flags);
+
+	/* now that we have an i_mode we can setup the inode structure */
+	xfs_setup_inode(ip);
+
+	*ipp = ip;
+	return 0;
+}
+
+/*
+ * Allocates a new inode from disk and return a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
+ *
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
+ *
+ */
+int
+xfs_dir_ialloc(
+	xfs_trans_t	**tpp,		/* input: current transaction;
+					   output: may be a new transaction. */
+	xfs_inode_t	*dp,		/* directory within whose allocate
+					   the inode. */
+	umode_t		mode,
+	xfs_nlink_t	nlink,
+	xfs_dev_t	rdev,
+	prid_t		prid,		/* project id */
+	int		okalloc,	/* ok to allocate new space */
+	xfs_inode_t	**ipp,		/* pointer to inode; it will be
+					   locked. */
+	int		*committed)
+
+{
+	xfs_trans_t	*tp;
+	xfs_trans_t	*ntp;
+	xfs_inode_t	*ip;
+	xfs_buf_t	*ialloc_context = NULL;
+	int		code;
+	void		*dqinfo;
+	uint		tflags;
+
+	tp = *tpp;
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+	/*
+	 * xfs_ialloc will return a pointer to an incore inode if
+	 * the Space Manager has an available inode on the free
+	 * list. Otherwise, it will do an allocation and replenish
+	 * the freelist.  Since we can only do one allocation per
+	 * transaction without deadlocks, we will need to commit the
+	 * current transaction and start a new one.  We will then
+	 * need to call xfs_ialloc again to get the inode.
+	 *
+	 * If xfs_ialloc did an allocation to replenish the freelist,
+	 * it returns the bp containing the head of the freelist as
+	 * ialloc_context. We will hold a lock on it across the
+	 * transaction commit so that no other process can steal
+	 * the inode(s) that we've just allocated.
+	 */
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
+			  &ialloc_context, &ip);
+
+	/*
+	 * Return an error if we were unable to allocate a new inode.
+	 * This should only happen if we run out of space on disk or
+	 * encounter a disk error.
+	 */
+	if (code) {
+		*ipp = NULL;
+		return code;
+	}
+	if (!ialloc_context && !ip) {
+		*ipp = NULL;
+		return -ENOSPC;
+	}
+
+	/*
+	 * If the AGI buffer is non-NULL, then we were unable to get an
+	 * inode in one operation.  We need to commit the current
+	 * transaction and call xfs_ialloc() again.  It is guaranteed
+	 * to succeed the second time.
+	 */
+	if (ialloc_context) {
+		struct xfs_trans_res tres;
+
+		/*
+		 * Normally, xfs_trans_commit releases all the locks.
+		 * We call bhold to hang on to the ialloc_context across
+		 * the commit.  Holding this buffer prevents any other
+		 * processes from doing any allocations in this
+		 * allocation group.
+		 */
+		xfs_trans_bhold(tp, ialloc_context);
+		/*
+		 * Save the log reservation so we can use
+		 * them in the next transaction.
+		 */
+		tres.tr_logres = xfs_trans_get_log_res(tp);
+		tres.tr_logcount = xfs_trans_get_log_count(tp);
+
+		/*
+		 * We want the quota changes to be associated with the next
+		 * transaction, NOT this one. So, detach the dqinfo from this
+		 * and attach it to the next transaction.
+		 */
+		dqinfo = NULL;
+		tflags = 0;
+		if (tp->t_dqinfo) {
+			dqinfo = (void *)tp->t_dqinfo;
+			tp->t_dqinfo = NULL;
+			tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
+			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
+		}
+
+		ntp = xfs_trans_dup(tp);
+		code = xfs_trans_commit(tp, 0);
+		tp = ntp;
+		if (committed != NULL) {
+			*committed = 1;
+		}
+		/*
+		 * If we get an error during the commit processing,
+		 * release the buffer that is still held and return
+		 * to the caller.
+		 */
+		if (code) {
+			xfs_buf_relse(ialloc_context);
+			if (dqinfo) {
+				tp->t_dqinfo = dqinfo;
+				xfs_trans_free_dqinfo(tp);
+			}
+			*tpp = ntp;
+			*ipp = NULL;
+			return code;
+		}
+
+		/*
+		 * transaction commit worked ok so we can drop the extra ticket
+		 * reference that we gained in xfs_trans_dup()
+		 */
+		xfs_log_ticket_put(tp->t_ticket);
+		tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+		code = xfs_trans_reserve(tp, &tres, 0, 0);
+
+		/*
+		 * Re-attach the quota info that we detached from prev trx.
+		 */
+		if (dqinfo) {
+			tp->t_dqinfo = dqinfo;
+			tp->t_flags |= tflags;
+		}
+
+		if (code) {
+			xfs_buf_relse(ialloc_context);
+			*tpp = ntp;
+			*ipp = NULL;
+			return code;
+		}
+		xfs_trans_bjoin(tp, ialloc_context);
+
+		/*
+		 * Call ialloc again. Since we've locked out all
+		 * other allocations in this allocation group,
+		 * this call should always succeed.
+		 */
+		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
+				  okalloc, &ialloc_context, &ip);
+
+		/*
+		 * If we get an error at this point, return to the caller
+		 * so that the current transaction can be aborted.
+		 */
+		if (code) {
+			*tpp = tp;
+			*ipp = NULL;
+			return code;
+		}
+		ASSERT(!ialloc_context && ip);
+
+	} else {
+		if (committed != NULL)
+			*committed = 0;
+	}
+
+	*ipp = ip;
+	*tpp = tp;
+
+	return 0;
+}
+
+/*
+ * Decrement the link count on an inode & log the change.
+ * If this causes the link count to go to zero, initiate the
+ * logging activity required to truncate a file.
+ */
+int				/* error */
+xfs_droplink(
+	xfs_trans_t *tp,
+	xfs_inode_t *ip)
+{
+	int	error;
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+	ASSERT (ip->i_d.di_nlink > 0);
+	ip->i_d.di_nlink--;
+	drop_nlink(VFS_I(ip));
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	error = 0;
+	if (ip->i_d.di_nlink == 0) {
+		/*
+		 * We're dropping the last link to this file.
+		 * Move the on-disk inode to the AGI unlinked list.
+		 * From xfs_inactive() we will pull the inode from
+		 * the list and free it.
+		 */
+		error = xfs_iunlink(tp, ip);
+	}
+	return error;
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+int
+xfs_bumplink(
+	xfs_trans_t *tp,
+	xfs_inode_t *ip)
+{
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+	ASSERT(ip->i_d.di_version > 1);
+	ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
+	ip->i_d.di_nlink++;
+	inc_nlink(VFS_I(ip));
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return 0;
+}
+
+int
+xfs_create(
+	xfs_inode_t		*dp,
+	struct xfs_name		*name,
+	umode_t			mode,
+	xfs_dev_t		rdev,
+	xfs_inode_t		**ipp)
+{
+	int			is_dir = S_ISDIR(mode);
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_inode	*ip = NULL;
+	struct xfs_trans	*tp = NULL;
+	int			error;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	bool                    unlock_dp_on_error = false;
+	uint			cancel_flags;
+	int			committed;
+	prid_t			prid;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_trans_res	*tres;
+	uint			resblks;
+
+	trace_xfs_create(dp, name);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	prid = xfs_get_initial_prid(dp);
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+					xfs_kgid_to_gid(current_fsgid()), prid,
+					XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+					&udqp, &gdqp, &pdqp);
+	if (error)
+		return error;
+
+	if (is_dir) {
+		rdev = 0;
+		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+		tres = &M_RES(mp)->tr_mkdir;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+	} else {
+		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+		tres = &M_RES(mp)->tr_create;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+	}
+
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+	/*
+	 * Initially assume that the file does not exist and
+	 * reserve the resources for that case.  If that is not
+	 * the case we'll drop the one we have and get a more
+	 * appropriate transaction later.
+	 */
+	error = xfs_trans_reserve(tp, tres, resblks, 0);
+	if (error == -ENOSPC) {
+		/* flush outstanding delalloc blocks and retry */
+		xfs_flush_inodes(mp);
+		error = xfs_trans_reserve(tp, tres, resblks, 0);
+	}
+	if (error == -ENOSPC) {
+		/* No space at all so try a "no-allocation" reservation */
+		resblks = 0;
+		error = xfs_trans_reserve(tp, tres, 0, 0);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto out_trans_cancel;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+	unlock_dp_on_error = true;
+
+	xfs_bmap_init(&free_list, &first_block);
+
+	/*
+	 * Reserve disk quota and the inode.
+	 */
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+						pdqp, resblks, 1, 0);
+	if (error)
+		goto out_trans_cancel;
+
+	if (!resblks) {
+		error = xfs_dir_canenter(tp, dp, name);
+		if (error)
+			goto out_trans_cancel;
+	}
+
+	/*
+	 * A newly created regular or special file just has one directory
+	 * entry pointing to them, but a directory also the "." entry
+	 * pointing to itself.
+	 */
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
+			       prid, resblks > 0, &ip, &committed);
+	if (error) {
+		if (error == -ENOSPC)
+			goto out_trans_cancel;
+		goto out_trans_abort;
+	}
+
+	/*
+	 * Now we join the directory inode to the transaction.  We do not do it
+	 * earlier because xfs_dir_ialloc might commit the previous transaction
+	 * (and release all the locks).  An error from here on will result in
+	 * the transaction cancel unlocking dp so don't do it explicitly in the
+	 * error path.
+	 */
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	unlock_dp_on_error = false;
+
+	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+					&first_block, &free_list, resblks ?
+					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	if (error) {
+		ASSERT(error != -ENOSPC);
+		goto out_trans_abort;
+	}
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	if (is_dir) {
+		error = xfs_dir_init(tp, ip, dp);
+		if (error)
+			goto out_bmap_cancel;
+
+		error = xfs_bumplink(tp, dp);
+		if (error)
+			goto out_bmap_cancel;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * create transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+		xfs_trans_set_sync(tp);
+
+	/*
+	 * Attach the dquot(s) to the inodes and modify them incore.
+	 * These ids of the inode couldn't have changed since the new
+	 * inode has been locked ever since it was created.
+	 */
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto out_bmap_cancel;
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		goto out_release_inode;
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	*ipp = ip;
+	return 0;
+
+ out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+ out_trans_abort:
+	cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+	/*
+	 * Wait until after the current transaction is aborted to finish the
+	 * setup of the inode and release the inode.  This prevents recursive
+	 * transactions and deadlocks from xfs_inactive.
+	 */
+	if (ip) {
+		xfs_finish_inode_setup(ip);
+		IRELE(ip);
+	}
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	if (unlock_dp_on_error)
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return error;
+}
+
+int
+xfs_create_tmpfile(
+	struct xfs_inode	*dp,
+	struct dentry		*dentry,
+	umode_t			mode,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_inode	*ip = NULL;
+	struct xfs_trans	*tp = NULL;
+	int			error;
+	uint			cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	prid_t                  prid;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_trans_res	*tres;
+	uint			resblks;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	prid = xfs_get_initial_prid(dp);
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+				xfs_kgid_to_gid(current_fsgid()), prid,
+				XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+				&udqp, &gdqp, &pdqp);
+	if (error)
+		return error;
+
+	resblks = XFS_IALLOC_SPACE_RES(mp);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
+
+	tres = &M_RES(mp)->tr_create_tmpfile;
+	error = xfs_trans_reserve(tp, tres, resblks, 0);
+	if (error == -ENOSPC) {
+		/* No space at all so try a "no-allocation" reservation */
+		resblks = 0;
+		error = xfs_trans_reserve(tp, tres, 0, 0);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto out_trans_cancel;
+	}
+
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+						pdqp, resblks, 1, 0);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
+				prid, resblks > 0, &ip, NULL);
+	if (error) {
+		if (error == -ENOSPC)
+			goto out_trans_cancel;
+		goto out_trans_abort;
+	}
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	/*
+	 * Attach the dquot(s) to the inodes and modify them incore.
+	 * These ids of the inode couldn't have changed since the new
+	 * inode has been locked ever since it was created.
+	 */
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+	ip->i_d.di_nlink--;
+	error = xfs_iunlink(tp, ip);
+	if (error)
+		goto out_trans_abort;
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		goto out_release_inode;
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	*ipp = ip;
+	return 0;
+
+ out_trans_abort:
+	cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+	/*
+	 * Wait until after the current transaction is aborted to finish the
+	 * setup of the inode and release the inode.  This prevents recursive
+	 * transactions and deadlocks from xfs_inactive.
+	 */
+	if (ip) {
+		xfs_finish_inode_setup(ip);
+		IRELE(ip);
+	}
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	return error;
+}
+
+int
+xfs_link(
+	xfs_inode_t		*tdp,
+	xfs_inode_t		*sip,
+	struct xfs_name		*target_name)
+{
+	xfs_mount_t		*mp = tdp->i_mount;
+	xfs_trans_t		*tp;
+	int			error;
+	xfs_bmap_free_t         free_list;
+	xfs_fsblock_t           first_block;
+	int			cancel_flags;
+	int			committed;
+	int			resblks;
+
+	trace_xfs_link(tdp, target_name);
+
+	ASSERT(!S_ISDIR(sip->i_d.di_mode));
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	error = xfs_qm_dqattach(sip, 0);
+	if (error)
+		goto std_return;
+
+	error = xfs_qm_dqattach(tdp, 0);
+	if (error)
+		goto std_return;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+	if (error == -ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto error_return;
+	}
+
+	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+
+	/*
+	 * If we are using project inheritance, we only allow hard link
+	 * creation in our tree when the project IDs are the same; else
+	 * the tree quota mechanism could be circumvented.
+	 */
+	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+		error = -EXDEV;
+		goto error_return;
+	}
+
+	if (!resblks) {
+		error = xfs_dir_canenter(tp, tdp, target_name);
+		if (error)
+			goto error_return;
+	}
+
+	xfs_bmap_init(&free_list, &first_block);
+
+	if (sip->i_d.di_nlink == 0) {
+		error = xfs_iunlink_remove(tp, sip);
+		if (error)
+			goto abort_return;
+	}
+
+	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+					&first_block, &free_list, resblks);
+	if (error)
+		goto abort_return;
+	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+	error = xfs_bumplink(tp, sip);
+	if (error)
+		goto abort_return;
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * link transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish (&tp, &free_list, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		goto abort_return;
+	}
+
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+	return error;
+}
+
+/*
+ * Free up the underlying blocks past new_size.  The new size must be smaller
+ * than the current size.  This routine can be used both for the attribute and
+ * data fork, and does not modify the inode size, which is left to the caller.
+ *
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here.  Some transaction will be
+ * returned to the caller to be committed.  The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction.  On return the inode
+ * will be "held" within the returned transaction.  This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not.  We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
+ */
+int
+xfs_itruncate_extents(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_fsize_t		new_size)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp = *tpp;
+	struct xfs_trans	*ntp;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	xfs_fileoff_t		first_unmap_block;
+	xfs_fileoff_t		last_block;
+	xfs_filblks_t		unmap_len;
+	int			committed;
+	int			error = 0;
+	int			done = 0;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
+	       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(new_size <= XFS_ISIZE(ip));
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ip->i_itemp->ili_lock_flags == 0);
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+
+	trace_xfs_itruncate_extents_start(ip, new_size);
+
+	/*
+	 * Since it is possible for space to become allocated beyond
+	 * the end of the file (in a crash where the space is allocated
+	 * but the inode size is not yet updated), simply remove any
+	 * blocks which show up between the new EOF and the maximum
+	 * possible file size.  If the first block to be removed is
+	 * beyond the maximum file size (ie it is the same as last_block),
+	 * then there is nothing to do.
+	 */
+	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+	last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+	if (first_unmap_block == last_block)
+		return 0;
+
+	ASSERT(first_unmap_block < last_block);
+	unmap_len = last_block - first_unmap_block + 1;
+	while (!done) {
+		xfs_bmap_init(&free_list, &first_block);
+		error = xfs_bunmapi(tp, ip,
+				    first_unmap_block, unmap_len,
+				    xfs_bmapi_aflag(whichfork),
+				    XFS_ITRUNC_MAX_EXTENTS,
+				    &first_block, &free_list,
+				    &done);
+		if (error)
+			goto out_bmap_cancel;
+
+		/*
+		 * Duplicate the transaction that has the permanent
+		 * reservation and commit the old transaction.
+		 */
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (committed)
+			xfs_trans_ijoin(tp, ip, 0);
+		if (error)
+			goto out_bmap_cancel;
+
+		if (committed) {
+			/*
+			 * Mark the inode dirty so it will be logged and
+			 * moved forward in the log as part of every commit.
+			 */
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		}
+
+		ntp = xfs_trans_dup(tp);
+		error = xfs_trans_commit(tp, 0);
+		tp = ntp;
+
+		xfs_trans_ijoin(tp, ip, 0);
+
+		if (error)
+			goto out;
+
+		/*
+		 * Transaction commit worked ok so we can drop the extra ticket
+		 * reference that we gained in xfs_trans_dup()
+		 */
+		xfs_log_ticket_put(tp->t_ticket);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * Always re-log the inode so that our permanent transaction can keep
+	 * on rolling it forward in the log.
+	 */
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	trace_xfs_itruncate_extents_end(ip, new_size);
+
+out:
+	*tpp = tp;
+	return error;
+out_bmap_cancel:
+	/*
+	 * If the bunmapi call encounters an error, return to the caller where
+	 * the transaction can be properly aborted.  We just need to make sure
+	 * we're not holding any resources that we were not when we came in.
+	 */
+	xfs_bmap_cancel(&free_list);
+	goto out;
+}
+
+int
+xfs_release(
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	int		error;
+
+	if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+		return 0;
+
+	/* If this is a read-only mount, don't do this (would generate I/O) */
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return 0;
+
+	if (!XFS_FORCED_SHUTDOWN(mp)) {
+		int truncated;
+
+		/*
+		 * If we previously truncated this file and removed old data
+		 * in the process, we want to initiate "early" writeout on
+		 * the last close.  This is an attempt to combat the notorious
+		 * NULL files problem which is particularly noticeable from a
+		 * truncate down, buffered (re-)write (delalloc), followed by
+		 * a crash.  What we are effectively doing here is
+		 * significantly reducing the time window where we'd otherwise
+		 * be exposed to that problem.
+		 */
+		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
+		if (truncated) {
+			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
+			if (ip->i_delayed_blks > 0) {
+				error = filemap_flush(VFS_I(ip)->i_mapping);
+				if (error)
+					return error;
+			}
+		}
+	}
+
+	if (ip->i_d.di_nlink == 0)
+		return 0;
+
+	if (xfs_can_free_eofblocks(ip, false)) {
+
+		/*
+		 * If we can't get the iolock just skip truncating the blocks
+		 * past EOF because we could deadlock with the mmap_sem
+		 * otherwise.  We'll get another chance to drop them once the
+		 * last reference to the inode is dropped, so we'll never leak
+		 * blocks permanently.
+		 *
+		 * Further, check if the inode is being opened, written and
+		 * closed frequently and we have delayed allocation blocks
+		 * outstanding (e.g. streaming writes from the NFS server),
+		 * truncating the blocks past EOF will cause fragmentation to
+		 * occur.
+		 *
+		 * In this case don't do the truncation, either, but we have to
+		 * be careful how we detect this case. Blocks beyond EOF show
+		 * up as i_delayed_blks even when the inode is clean, so we
+		 * need to truncate them away first before checking for a dirty
+		 * release. Hence on the first dirty close we will still remove
+		 * the speculative allocation, but after that we will leave it
+		 * in place.
+		 */
+		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+			return 0;
+
+		error = xfs_free_eofblocks(mp, ip, true);
+		if (error && error != -EAGAIN)
+			return error;
+
+		/* delalloc blocks after truncation means it really is dirty */
+		if (ip->i_delayed_blks)
+			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+	}
+	return 0;
+}
+
+/*
+ * xfs_inactive_truncate
+ *
+ * Called to perform a truncate when an inode becomes unlinked.
+ */
+STATIC int
+xfs_inactive_truncate(
+	struct xfs_inode *ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/*
+	 * Log the inode size first to prevent stale data exposure in the event
+	 * of a system crash before the truncate completes. See the related
+	 * comment in xfs_setattr_size() for details.
+	 */
+	ip->i_d.di_size = 0;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+	if (error)
+		goto error_trans_cancel;
+
+	ASSERT(ip->i_d.di_nextents == 0);
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		goto error_unlock;
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+
+error_trans_cancel:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * xfs_inactive_ifree()
+ *
+ * Perform the inode free when an inode is unlinked.
+ */
+STATIC int
+xfs_inactive_ifree(
+	struct xfs_inode *ip)
+{
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	int			committed;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+	/*
+	 * The ifree transaction might need to allocate blocks for record
+	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
+	 * allow ifree to dip into the reserved block pool if necessary.
+	 *
+	 * Freeing large sets of inodes generally means freeing inode chunks,
+	 * directory and file data blocks, so this should be relatively safe.
+	 * Only under severe circumstances should it be possible to free enough
+	 * inodes to exhaust the reserve block pool via finobt expansion while
+	 * at the same time not creating free space in the filesystem.
+	 *
+	 * Send a warning if the reservation does happen to fail, as the inode
+	 * now remains allocated and sits on the unlinked list until the fs is
+	 * repaired.
+	 */
+	tp->t_flags |= XFS_TRANS_RESERVE;
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
+				  XFS_IFREE_SPACE_RES(mp), 0);
+	if (error) {
+		if (error == -ENOSPC) {
+			xfs_warn_ratelimited(mp,
+			"Failed to remove inode(s) from unlinked list. "
+			"Please free space, unmount and run xfs_repair.");
+		} else {
+			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		}
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	xfs_bmap_init(&free_list, &first_block);
+	error = xfs_ifree(tp, ip, &free_list);
+	if (error) {
+		/*
+		 * If we fail to free the inode, shut down.  The cancel
+		 * might do that, we need to make sure.  Otherwise the
+		 * inode might be lost for a long time or forever.
+		 */
+		if (!XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_notice(mp, "%s: xfs_ifree returned error %d",
+				__func__, error);
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+		}
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return error;
+	}
+
+	/*
+	 * Credit the quota account(s). The inode is gone.
+	 */
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+	/*
+	 * Just ignore errors at this point.  There is nothing we can
+	 * do except to try to keep going. Make sure it's not a silent
+	 * error.
+	 */
+	error = xfs_bmap_finish(&tp,  &free_list, &committed);
+	if (error)
+		xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+			__func__, error);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+			__func__, error);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode reference count for the vnode
+ * goes to zero.  If the file has been unlinked, then it must
+ * now be truncated.  Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+void
+xfs_inactive(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount	*mp;
+	int			error;
+	int			truncate = 0;
+
+	/*
+	 * If the inode is already free, then there can be nothing
+	 * to clean up here.
+	 */
+	if (ip->i_d.di_mode == 0) {
+		ASSERT(ip->i_df.if_real_bytes == 0);
+		ASSERT(ip->i_df.if_broot_bytes == 0);
+		return;
+	}
+
+	mp = ip->i_mount;
+
+	/* If this is a read-only mount, don't do this (would generate I/O) */
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return;
+
+	if (ip->i_d.di_nlink != 0) {
+		/*
+		 * force is true because we are evicting an inode from the
+		 * cache. Post-eof blocks must be freed, lest we end up with
+		 * broken free space accounting.
+		 */
+		if (xfs_can_free_eofblocks(ip, true))
+			xfs_free_eofblocks(mp, ip, false);
+
+		return;
+	}
+
+	if (S_ISREG(ip->i_d.di_mode) &&
+	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
+	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+		truncate = 1;
+
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return;
+
+	if (S_ISLNK(ip->i_d.di_mode))
+		error = xfs_inactive_symlink(ip);
+	else if (truncate)
+		error = xfs_inactive_truncate(ip);
+	if (error)
+		return;
+
+	/*
+	 * If there are attributes associated with the file then blow them away
+	 * now.  The code calls a routine that recursively deconstructs the
+	 * attribute fork. If also blows away the in-core attribute fork.
+	 */
+	if (XFS_IFORK_Q(ip)) {
+		error = xfs_attr_inactive(ip);
+		if (error)
+			return;
+	}
+
+	ASSERT(!ip->i_afp);
+	ASSERT(ip->i_d.di_anextents == 0);
+	ASSERT(ip->i_d.di_forkoff == 0);
+
+	/*
+	 * Free the inode.
+	 */
+	error = xfs_inactive_ifree(ip);
+	if (error)
+		return;
+
+	/*
+	 * Release the dquots held by inode, if any.
+	 */
+	xfs_qm_dqdetach(ip);
+}
+
+/*
+ * This is called when the inode's link count goes to 0.
+ * We place the on-disk inode on a list in the AGI.  It
+ * will be pulled from this list when the inode is freed.
+ */
+int
+xfs_iunlink(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	xfs_agi_t	*agi;
+	xfs_dinode_t	*dip;
+	xfs_buf_t	*agibp;
+	xfs_buf_t	*ibp;
+	xfs_agino_t	agino;
+	short		bucket_index;
+	int		offset;
+	int		error;
+
+	ASSERT(ip->i_d.di_nlink == 0);
+	ASSERT(ip->i_d.di_mode != 0);
+
+	mp = tp->t_mountp;
+
+	/*
+	 * Get the agi buffer first.  It ensures lock ordering
+	 * on the list.
+	 */
+	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+	if (error)
+		return error;
+	agi = XFS_BUF_TO_AGI(agibp);
+
+	/*
+	 * Get the index into the agi hash table for the
+	 * list this inode will go on.
+	 */
+	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	ASSERT(agino != 0);
+	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	ASSERT(agi->agi_unlinked[bucket_index]);
+	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+
+	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
+		/*
+		 * There is already another inode in the bucket we need
+		 * to add ourselves to.  Add us at the front of the list.
+		 * Here we put the head pointer into our next pointer,
+		 * and then we fall through to point the head at us.
+		 */
+		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+				       0, 0);
+		if (error)
+			return error;
+
+		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
+		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
+		offset = ip->i_imap.im_boffset +
+			offsetof(xfs_dinode_t, di_next_unlinked);
+
+		/* need to recalc the inode CRC if appropriate */
+		xfs_dinode_calc_crc(mp, dip);
+
+		xfs_trans_inode_buf(tp, ibp);
+		xfs_trans_log_buf(tp, ibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+		xfs_inobp_check(mp, ibp);
+	}
+
+	/*
+	 * Point the bucket head pointer at the inode being inserted.
+	 */
+	ASSERT(agino != 0);
+	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
+	offset = offsetof(xfs_agi_t, agi_unlinked) +
+		(sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
+	xfs_trans_log_buf(tp, agibp, offset,
+			  (offset + sizeof(xfs_agino_t) - 1));
+	return 0;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+STATIC int
+xfs_iunlink_remove(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	xfs_ino_t	next_ino;
+	xfs_mount_t	*mp;
+	xfs_agi_t	*agi;
+	xfs_dinode_t	*dip;
+	xfs_buf_t	*agibp;
+	xfs_buf_t	*ibp;
+	xfs_agnumber_t	agno;
+	xfs_agino_t	agino;
+	xfs_agino_t	next_agino;
+	xfs_buf_t	*last_ibp;
+	xfs_dinode_t	*last_dip = NULL;
+	short		bucket_index;
+	int		offset, last_offset = 0;
+	int		error;
+
+	mp = tp->t_mountp;
+	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+
+	/*
+	 * Get the agi buffer first.  It ensures lock ordering
+	 * on the list.
+	 */
+	error = xfs_read_agi(mp, tp, agno, &agibp);
+	if (error)
+		return error;
+
+	agi = XFS_BUF_TO_AGI(agibp);
+
+	/*
+	 * Get the index into the agi hash table for the
+	 * list this inode will go on.
+	 */
+	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	ASSERT(agino != 0);
+	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
+	ASSERT(agi->agi_unlinked[bucket_index]);
+
+	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
+		/*
+		 * We're at the head of the list.  Get the inode's on-disk
+		 * buffer to see if there is anyone after us on the list.
+		 * Only modify our next pointer if it is not already NULLAGINO.
+		 * This saves us the overhead of dealing with the buffer when
+		 * there is no need to change it.
+		 */
+		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+				       0, 0);
+		if (error) {
+			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+				__func__, error);
+			return error;
+		}
+		next_agino = be32_to_cpu(dip->di_next_unlinked);
+		ASSERT(next_agino != 0);
+		if (next_agino != NULLAGINO) {
+			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+			offset = ip->i_imap.im_boffset +
+				offsetof(xfs_dinode_t, di_next_unlinked);
+
+			/* need to recalc the inode CRC if appropriate */
+			xfs_dinode_calc_crc(mp, dip);
+
+			xfs_trans_inode_buf(tp, ibp);
+			xfs_trans_log_buf(tp, ibp, offset,
+					  (offset + sizeof(xfs_agino_t) - 1));
+			xfs_inobp_check(mp, ibp);
+		} else {
+			xfs_trans_brelse(tp, ibp);
+		}
+		/*
+		 * Point the bucket head pointer at the next inode.
+		 */
+		ASSERT(next_agino != 0);
+		ASSERT(next_agino != agino);
+		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
+		offset = offsetof(xfs_agi_t, agi_unlinked) +
+			(sizeof(xfs_agino_t) * bucket_index);
+		xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
+		xfs_trans_log_buf(tp, agibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+	} else {
+		/*
+		 * We need to search the list for the inode being freed.
+		 */
+		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+		last_ibp = NULL;
+		while (next_agino != agino) {
+			struct xfs_imap	imap;
+
+			if (last_ibp)
+				xfs_trans_brelse(tp, last_ibp);
+
+			imap.im_blkno = 0;
+			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+
+			error = xfs_imap(mp, tp, next_ino, &imap, 0);
+			if (error) {
+				xfs_warn(mp,
+	"%s: xfs_imap returned error %d.",
+					 __func__, error);
+				return error;
+			}
+
+			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
+					       &last_ibp, 0, 0);
+			if (error) {
+				xfs_warn(mp,
+	"%s: xfs_imap_to_bp returned error %d.",
+					__func__, error);
+				return error;
+			}
+
+			last_offset = imap.im_boffset;
+			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
+			ASSERT(next_agino != NULLAGINO);
+			ASSERT(next_agino != 0);
+		}
+
+		/*
+		 * Now last_ibp points to the buffer previous to us on the
+		 * unlinked list.  Pull us from the list.
+		 */
+		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+				       0, 0);
+		if (error) {
+			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
+				__func__, error);
+			return error;
+		}
+		next_agino = be32_to_cpu(dip->di_next_unlinked);
+		ASSERT(next_agino != 0);
+		ASSERT(next_agino != agino);
+		if (next_agino != NULLAGINO) {
+			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+			offset = ip->i_imap.im_boffset +
+				offsetof(xfs_dinode_t, di_next_unlinked);
+
+			/* need to recalc the inode CRC if appropriate */
+			xfs_dinode_calc_crc(mp, dip);
+
+			xfs_trans_inode_buf(tp, ibp);
+			xfs_trans_log_buf(tp, ibp, offset,
+					  (offset + sizeof(xfs_agino_t) - 1));
+			xfs_inobp_check(mp, ibp);
+		} else {
+			xfs_trans_brelse(tp, ibp);
+		}
+		/*
+		 * Point the previous inode on the list to the next inode.
+		 */
+		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
+		ASSERT(next_agino != 0);
+		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+
+		/* need to recalc the inode CRC if appropriate */
+		xfs_dinode_calc_crc(mp, last_dip);
+
+		xfs_trans_inode_buf(tp, last_ibp);
+		xfs_trans_log_buf(tp, last_ibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+		xfs_inobp_check(mp, last_ibp);
+	}
+	return 0;
+}
+
+/*
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
+STATIC int
+xfs_ifree_cluster(
+	xfs_inode_t	*free_ip,
+	xfs_trans_t	*tp,
+	xfs_ino_t	inum)
+{
+	xfs_mount_t		*mp = free_ip->i_mount;
+	int			blks_per_cluster;
+	int			inodes_per_cluster;
+	int			nbufs;
+	int			i, j;
+	xfs_daddr_t		blkno;
+	xfs_buf_t		*bp;
+	xfs_inode_t		*ip;
+	xfs_inode_log_item_t	*iip;
+	xfs_log_item_t		*lip;
+	struct xfs_perag	*pag;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+	nbufs = mp->m_ialloc_blks / blks_per_cluster;
+
+	for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
+					 XFS_INO_TO_AGBNO(mp, inum));
+
+		/*
+		 * We obtain and lock the backing buffer first in the process
+		 * here, as we have to ensure that any dirty inode that we
+		 * can't get the flush lock on is attached to the buffer.
+		 * If we scan the in-memory inodes first, then buffer IO can
+		 * complete before we get a lock on it, and hence we may fail
+		 * to mark all the active inodes on the buffer stale.
+		 */
+		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+					mp->m_bsize * blks_per_cluster,
+					XBF_UNMAPPED);
+
+		if (!bp)
+			return -ENOMEM;
+
+		/*
+		 * This buffer may not have been correctly initialised as we
+		 * didn't read it from disk. That's not important because we are
+		 * only using to mark the buffer as stale in the log, and to
+		 * attach stale cached inodes on it. That means it will never be
+		 * dispatched for IO. If it is, we want to know about it, and we
+		 * want it to fail. We can acheive this by adding a write
+		 * verifier to the buffer.
+		 */
+		 bp->b_ops = &xfs_inode_buf_ops;
+
+		/*
+		 * Walk the inodes already attached to the buffer and mark them
+		 * stale. These will all have the flush locks held, so an
+		 * in-memory inode walk can't lock them. By marking them all
+		 * stale first, we will not attempt to lock them in the loop
+		 * below as the XFS_ISTALE flag will be set.
+		 */
+		lip = bp->b_fspriv;
+		while (lip) {
+			if (lip->li_type == XFS_LI_INODE) {
+				iip = (xfs_inode_log_item_t *)lip;
+				ASSERT(iip->ili_logged == 1);
+				lip->li_cb = xfs_istale_done;
+				xfs_trans_ail_copy_lsn(mp->m_ail,
+							&iip->ili_flush_lsn,
+							&iip->ili_item.li_lsn);
+				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+			}
+			lip = lip->li_bio_list;
+		}
+
+
+		/*
+		 * For each inode in memory attempt to add it to the inode
+		 * buffer and set it up for being staled on buffer IO
+		 * completion.  This is safe as we've locked out tail pushing
+		 * and flushing by locking the buffer.
+		 *
+		 * We have already marked every inode that was part of a
+		 * transaction stale above, which means there is no point in
+		 * even trying to lock them.
+		 */
+		for (i = 0; i < inodes_per_cluster; i++) {
+retry:
+			rcu_read_lock();
+			ip = radix_tree_lookup(&pag->pag_ici_root,
+					XFS_INO_TO_AGINO(mp, (inum + i)));
+
+			/* Inode not in memory, nothing to do */
+			if (!ip) {
+				rcu_read_unlock();
+				continue;
+			}
+
+			/*
+			 * because this is an RCU protected lookup, we could
+			 * find a recently freed or even reallocated inode
+			 * during the lookup. We need to check under the
+			 * i_flags_lock for a valid inode here. Skip it if it
+			 * is not valid, the wrong inode or stale.
+			 */
+			spin_lock(&ip->i_flags_lock);
+			if (ip->i_ino != inum + i ||
+			    __xfs_iflags_test(ip, XFS_ISTALE)) {
+				spin_unlock(&ip->i_flags_lock);
+				rcu_read_unlock();
+				continue;
+			}
+			spin_unlock(&ip->i_flags_lock);
+
+			/*
+			 * Don't try to lock/unlock the current inode, but we
+			 * _cannot_ skip the other inodes that we did not find
+			 * in the list attached to the buffer and are not
+			 * already marked stale. If we can't lock it, back off
+			 * and retry.
+			 */
+			if (ip != free_ip &&
+			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+				rcu_read_unlock();
+				delay(1);
+				goto retry;
+			}
+			rcu_read_unlock();
+
+			xfs_iflock(ip);
+			xfs_iflags_set(ip, XFS_ISTALE);
+
+			/*
+			 * we don't need to attach clean inodes or those only
+			 * with unlogged changes (which we throw away, anyway).
+			 */
+			iip = ip->i_itemp;
+			if (!iip || xfs_inode_clean(ip)) {
+				ASSERT(ip != free_ip);
+				xfs_ifunlock(ip);
+				xfs_iunlock(ip, XFS_ILOCK_EXCL);
+				continue;
+			}
+
+			iip->ili_last_fields = iip->ili_fields;
+			iip->ili_fields = 0;
+			iip->ili_logged = 1;
+			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+						&iip->ili_item.li_lsn);
+
+			xfs_buf_attach_iodone(bp, xfs_istale_done,
+						  &iip->ili_item);
+
+			if (ip != free_ip)
+				xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		}
+
+		xfs_trans_stale_inode_buf(tp, bp);
+		xfs_trans_binval(tp, bp);
+	}
+
+	xfs_perag_put(pag);
+	return 0;
+}
+
+/*
+ * This is called to return an inode to the inode free list.
+ * The inode should already be truncated to 0 length and have
+ * no pages associated with it.  This routine also assumes that
+ * the inode is already a part of the transaction.
+ *
+ * The on-disk copy of the inode will have been added to the list
+ * of unlinked inodes in the AGI. We need to remove the inode from
+ * that list atomically with respect to freeing it here.
+ */
+int
+xfs_ifree(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_bmap_free_t	*flist)
+{
+	int			error;
+	int			delete;
+	xfs_ino_t		first_ino;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(ip->i_d.di_nlink == 0);
+	ASSERT(ip->i_d.di_nextents == 0);
+	ASSERT(ip->i_d.di_anextents == 0);
+	ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
+	ASSERT(ip->i_d.di_nblocks == 0);
+
+	/*
+	 * Pull the on-disk inode from the AGI unlinked list.
+	 */
+	error = xfs_iunlink_remove(tp, ip);
+	if (error)
+		return error;
+
+	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
+	if (error)
+		return error;
+
+	ip->i_d.di_mode = 0;		/* mark incore inode as free */
+	ip->i_d.di_flags = 0;
+	ip->i_d.di_dmevmask = 0;
+	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
+	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	/*
+	 * Bump the generation count so no one will be confused
+	 * by reincarnations of this inode.
+	 */
+	ip->i_d.di_gen++;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	if (delete)
+		error = xfs_ifree_cluster(ip, tp, first_ino);
+
+	return error;
+}
+
+/*
+ * This is called to unpin an inode.  The caller must have the inode locked
+ * in at least shared mode so that the buffer cannot be subsequently pinned
+ * once someone is waiting for it to be unpinned.
+ */
+static void
+xfs_iunpin(
+	struct xfs_inode	*ip)
+{
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+
+	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+
+	/* Give the log a push to start the unpinning I/O */
+	xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
+
+}
+
+static void
+__xfs_iunpin_wait(
+	struct xfs_inode	*ip)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+	xfs_iunpin(ip);
+
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (xfs_ipincount(ip))
+			io_schedule();
+	} while (xfs_ipincount(ip));
+	finish_wait(wq, &wait.wait);
+}
+
+void
+xfs_iunpin_wait(
+	struct xfs_inode	*ip)
+{
+	if (xfs_ipincount(ip))
+		__xfs_iunpin_wait(ip);
+}
+
+/*
+ * Removing an inode from the namespace involves removing the directory entry
+ * and dropping the link count on the inode. Removing the directory entry can
+ * result in locking an AGF (directory blocks were freed) and removing a link
+ * count can result in placing the inode on an unlinked list which results in
+ * locking an AGI.
+ *
+ * The big problem here is that we have an ordering constraint on AGF and AGI
+ * locking - inode allocation locks the AGI, then can allocate a new extent for
+ * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
+ * removes the inode from the unlinked list, requiring that we lock the AGI
+ * first, and then freeing the inode can result in an inode chunk being freed
+ * and hence freeing disk space requiring that we lock an AGF.
+ *
+ * Hence the ordering that is imposed by other parts of the code is AGI before
+ * AGF. This means we cannot remove the directory entry before we drop the inode
+ * reference count and put it on the unlinked list as this results in a lock
+ * order of AGF then AGI, and this can deadlock against inode allocation and
+ * freeing. Therefore we must drop the link counts before we remove the
+ * directory entry.
+ *
+ * This is still safe from a transactional point of view - it is not until we
+ * get to xfs_bmap_finish() that we have the possibility of multiple
+ * transactions in this operation. Hence as long as we remove the directory
+ * entry and drop the link count in the first transaction of the remove
+ * operation, there are no transactional constraints on the ordering here.
+ */
+int
+xfs_remove(
+	xfs_inode_t             *dp,
+	struct xfs_name		*name,
+	xfs_inode_t		*ip)
+{
+	xfs_mount_t		*mp = dp->i_mount;
+	xfs_trans_t             *tp = NULL;
+	int			is_dir = S_ISDIR(ip->i_d.di_mode);
+	int                     error = 0;
+	xfs_bmap_free_t         free_list;
+	xfs_fsblock_t           first_block;
+	int			cancel_flags;
+	int			committed;
+	uint			resblks;
+
+	trace_xfs_remove(dp, name);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		goto std_return;
+
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		goto std_return;
+
+	if (is_dir)
+		tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+	else
+		tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+	/*
+	 * We try to get the real space reservation first,
+	 * allowing for directory btree deletion(s) implying
+	 * possible bmap insert(s).  If we can't get the space
+	 * reservation then we use 0 instead, and avoid the bmap
+	 * btree insert(s) in the directory code by, if the bmap
+	 * insert tries to happen, instead trimming the LAST
+	 * block from the directory.
+	 */
+	resblks = XFS_REMOVE_SPACE_RES(mp);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+	if (error == -ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+	}
+	if (error) {
+		ASSERT(error != -ENOSPC);
+		cancel_flags = 0;
+		goto out_trans_cancel;
+	}
+
+	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * If we're removing a directory perform some additional validation.
+	 */
+	cancel_flags |= XFS_TRANS_ABORT;
+	if (is_dir) {
+		ASSERT(ip->i_d.di_nlink >= 2);
+		if (ip->i_d.di_nlink != 2) {
+			error = -ENOTEMPTY;
+			goto out_trans_cancel;
+		}
+		if (!xfs_dir_isempty(ip)) {
+			error = -ENOTEMPTY;
+			goto out_trans_cancel;
+		}
+
+		/* Drop the link from ip's "..".  */
+		error = xfs_droplink(tp, dp);
+		if (error)
+			goto out_trans_cancel;
+
+		/* Drop the "." link from ip to self.  */
+		error = xfs_droplink(tp, ip);
+		if (error)
+			goto out_trans_cancel;
+	} else {
+		/*
+		 * When removing a non-directory we need to log the parent
+		 * inode here.  For a directory this is done implicitly
+		 * by the xfs_droplink call for the ".." entry.
+		 */
+		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	}
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/* Drop the link from dp to ip. */
+	error = xfs_droplink(tp, ip);
+	if (error)
+		goto out_trans_cancel;
+
+	xfs_bmap_init(&free_list, &first_block);
+	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+					&first_block, &free_list, resblks);
+	if (error) {
+		ASSERT(error != -ENOENT);
+		goto out_bmap_cancel;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * remove transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+		xfs_trans_set_sync(tp);
+
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto out_bmap_cancel;
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		goto std_return;
+
+	if (is_dir && xfs_inode_is_filestream(ip))
+		xfs_filestream_deassociate(ip);
+
+	return 0;
+
+ out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+ out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+	return error;
+}
+
+/*
+ * Enter all inodes for a rename transaction into a sorted array.
+ */
+#define __XFS_SORT_INODES	5
+STATIC void
+xfs_sort_for_rename(
+	struct xfs_inode	*dp1,	/* in: old (source) directory inode */
+	struct xfs_inode	*dp2,	/* in: new (target) directory inode */
+	struct xfs_inode	*ip1,	/* in: inode of old entry */
+	struct xfs_inode	*ip2,	/* in: inode of new entry */
+	struct xfs_inode	*wip,	/* in: whiteout inode */
+	struct xfs_inode	**i_tab,/* out: sorted array of inodes */
+	int			*num_inodes)  /* in/out: inodes in array */
+{
+	int			i, j;
+
+	ASSERT(*num_inodes == __XFS_SORT_INODES);
+	memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
+
+	/*
+	 * i_tab contains a list of pointers to inodes.  We initialize
+	 * the table here & we'll sort it.  We will then use it to
+	 * order the acquisition of the inode locks.
+	 *
+	 * Note that the table may contain duplicates.  e.g., dp1 == dp2.
+	 */
+	i = 0;
+	i_tab[i++] = dp1;
+	i_tab[i++] = dp2;
+	i_tab[i++] = ip1;
+	if (ip2)
+		i_tab[i++] = ip2;
+	if (wip)
+		i_tab[i++] = wip;
+	*num_inodes = i;
+
+	/*
+	 * Sort the elements via bubble sort.  (Remember, there are at
+	 * most 5 elements to sort, so this is adequate.)
+	 */
+	for (i = 0; i < *num_inodes; i++) {
+		for (j = 1; j < *num_inodes; j++) {
+			if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+				struct xfs_inode *temp = i_tab[j];
+				i_tab[j] = i_tab[j-1];
+				i_tab[j-1] = temp;
+			}
+		}
+	}
+}
+
+static int
+xfs_finish_rename(
+	struct xfs_trans	*tp,
+	struct xfs_bmap_free	*free_list)
+{
+	int			committed = 0;
+	int			error;
+
+	/*
+	 * If this is a synchronous mount, make sure that the rename transaction
+	 * goes to disk before returning to the user.
+	 */
+	if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+		xfs_trans_set_sync(tp);
+
+	error = xfs_bmap_finish(&tp, free_list, &committed);
+	if (error) {
+		xfs_bmap_cancel(free_list);
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+		return error;
+	}
+
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+}
+
+/*
+ * xfs_cross_rename()
+ *
+ * responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall
+ */
+STATIC int
+xfs_cross_rename(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp1,
+	struct xfs_name		*name1,
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*dp2,
+	struct xfs_name		*name2,
+	struct xfs_inode	*ip2,
+	struct xfs_bmap_free	*free_list,
+	xfs_fsblock_t		*first_block,
+	int			spaceres)
+{
+	int		error = 0;
+	int		ip1_flags = 0;
+	int		ip2_flags = 0;
+	int		dp2_flags = 0;
+
+	/* Swap inode number for dirent in first parent */
+	error = xfs_dir_replace(tp, dp1, name1,
+				ip2->i_ino,
+				first_block, free_list, spaceres);
+	if (error)
+		goto out_trans_abort;
+
+	/* Swap inode number for dirent in second parent */
+	error = xfs_dir_replace(tp, dp2, name2,
+				ip1->i_ino,
+				first_block, free_list, spaceres);
+	if (error)
+		goto out_trans_abort;
+
+	/*
+	 * If we're renaming one or more directories across different parents,
+	 * update the respective ".." entries (and link counts) to match the new
+	 * parents.
+	 */
+	if (dp1 != dp2) {
+		dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
+		if (S_ISDIR(ip2->i_d.di_mode)) {
+			error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
+						dp1->i_ino, first_block,
+						free_list, spaceres);
+			if (error)
+				goto out_trans_abort;
+
+			/* transfer ip2 ".." reference to dp1 */
+			if (!S_ISDIR(ip1->i_d.di_mode)) {
+				error = xfs_droplink(tp, dp2);
+				if (error)
+					goto out_trans_abort;
+				error = xfs_bumplink(tp, dp1);
+				if (error)
+					goto out_trans_abort;
+			}
+
+			/*
+			 * Although ip1 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones), will properly
+			 * notify the change
+			 */
+			ip1_flags |= XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+		}
+
+		if (S_ISDIR(ip1->i_d.di_mode)) {
+			error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
+						dp2->i_ino, first_block,
+						free_list, spaceres);
+			if (error)
+				goto out_trans_abort;
+
+			/* transfer ip1 ".." reference to dp2 */
+			if (!S_ISDIR(ip2->i_d.di_mode)) {
+				error = xfs_droplink(tp, dp1);
+				if (error)
+					goto out_trans_abort;
+				error = xfs_bumplink(tp, dp2);
+				if (error)
+					goto out_trans_abort;
+			}
+
+			/*
+			 * Although ip2 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones), will properly
+			 * notify the change
+			 */
+			ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_CHG;
+		}
+	}
+
+	if (ip1_flags) {
+		xfs_trans_ichgtime(tp, ip1, ip1_flags);
+		xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+	}
+	if (ip2_flags) {
+		xfs_trans_ichgtime(tp, ip2, ip2_flags);
+		xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+	}
+	if (dp2_flags) {
+		xfs_trans_ichgtime(tp, dp2, dp2_flags);
+		xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
+	}
+	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+	return xfs_finish_rename(tp, free_list);
+
+out_trans_abort:
+	xfs_bmap_cancel(free_list);
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	return error;
+}
+
+/*
+ * xfs_rename_alloc_whiteout()
+ *
+ * Return a referenced, unlinked, unlocked inode that that can be used as a
+ * whiteout in a rename transaction. We use a tmpfile inode here so that if we
+ * crash between allocating the inode and linking it into the rename transaction
+ * recovery will free the inode and we won't leak it.
+ */
+static int
+xfs_rename_alloc_whiteout(
+	struct xfs_inode	*dp,
+	struct xfs_inode	**wip)
+{
+	struct xfs_inode	*tmpfile;
+	int			error;
+
+	error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
+	if (error)
+		return error;
+
+	/*
+	 * Prepare the tmpfile inode as if it were created through the VFS.
+	 * Otherwise, the link increment paths will complain about nlink 0->1.
+	 * Drop the link count as done by d_tmpfile(), complete the inode setup
+	 * and flag it as linkable.
+	 */
+	drop_nlink(VFS_I(tmpfile));
+	xfs_finish_inode_setup(tmpfile);
+	VFS_I(tmpfile)->i_state |= I_LINKABLE;
+
+	*wip = tmpfile;
+	return 0;
+}
+
+/*
+ * xfs_rename
+ */
+int
+xfs_rename(
+	struct xfs_inode	*src_dp,
+	struct xfs_name		*src_name,
+	struct xfs_inode	*src_ip,
+	struct xfs_inode	*target_dp,
+	struct xfs_name		*target_name,
+	struct xfs_inode	*target_ip,
+	unsigned int		flags)
+{
+	struct xfs_mount	*mp = src_dp->i_mount;
+	struct xfs_trans	*tp;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	struct xfs_inode	*wip = NULL;		/* whiteout inode */
+	struct xfs_inode	*inodes[__XFS_SORT_INODES];
+	int			num_inodes = __XFS_SORT_INODES;
+	bool			new_parent = (src_dp != target_dp);
+	bool			src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+	int			cancel_flags = 0;
+	int			spaceres;
+	int			error;
+
+	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
+
+	if ((flags & RENAME_EXCHANGE) && !target_ip)
+		return -EINVAL;
+
+	/*
+	 * If we are doing a whiteout operation, allocate the whiteout inode
+	 * we will be placing at the target and ensure the type is set
+	 * appropriately.
+	 */
+	if (flags & RENAME_WHITEOUT) {
+		ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
+		error = xfs_rename_alloc_whiteout(target_dp, &wip);
+		if (error)
+			return error;
+
+		/* setup target dirent info as whiteout */
+		src_name->type = XFS_DIR3_FT_CHRDEV;
+	}
+
+	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
+				inodes, &num_inodes);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
+	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+	if (error == -ENOSPC) {
+		spaceres = 0;
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+	}
+	if (error)
+		goto out_trans_cancel;
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+	/*
+	 * Attach the dquots to the inodes
+	 */
+	error = xfs_qm_vop_rename_dqattach(inodes);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * Lock all the participating inodes. Depending upon whether
+	 * the target_name exists in the target directory, and
+	 * whether the target directory is the same as the source
+	 * directory, we can lock from 2 to 4 inodes.
+	 */
+	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
+
+	/*
+	 * Join all the inodes to the transaction. From this point on,
+	 * we can rely on either trans_commit or trans_cancel to unlock
+	 * them.
+	 */
+	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+	if (new_parent)
+		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+	if (target_ip)
+		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+	if (wip)
+		xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
+
+	/*
+	 * If we are using project inheritance, we only allow renames
+	 * into our tree when the project IDs are the same; else the
+	 * tree quota mechanism would be circumvented.
+	 */
+	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+		     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+		error = -EXDEV;
+		goto out_trans_cancel;
+	}
+
+	xfs_bmap_init(&free_list, &first_block);
+
+	/* RENAME_EXCHANGE is unique from here on. */
+	if (flags & RENAME_EXCHANGE)
+		return xfs_cross_rename(tp, src_dp, src_name, src_ip,
+					target_dp, target_name, target_ip,
+					&free_list, &first_block, spaceres);
+
+	/*
+	 * Set up the target.
+	 */
+	if (target_ip == NULL) {
+		/*
+		 * If there's no space reservation, check the entry will
+		 * fit before actually inserting it.
+		 */
+		if (!spaceres) {
+			error = xfs_dir_canenter(tp, target_dp, target_name);
+			if (error)
+				goto out_trans_cancel;
+		}
+		/*
+		 * If target does not exist and the rename crosses
+		 * directories, adjust the target directory link count
+		 * to account for the ".." reference from the new entry.
+		 */
+		error = xfs_dir_createname(tp, target_dp, target_name,
+						src_ip->i_ino, &first_block,
+						&free_list, spaceres);
+		if (error == -ENOSPC)
+			goto out_bmap_cancel;
+		if (error)
+			goto out_trans_abort;
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		if (new_parent && src_is_directory) {
+			error = xfs_bumplink(tp, target_dp);
+			if (error)
+				goto out_trans_abort;
+		}
+	} else { /* target_ip != NULL */
+		/*
+		 * If target exists and it's a directory, check that both
+		 * target and source are directories and that target can be
+		 * destroyed, or that neither is a directory.
+		 */
+		if (S_ISDIR(target_ip->i_d.di_mode)) {
+			/*
+			 * Make sure target dir is empty.
+			 */
+			if (!(xfs_dir_isempty(target_ip)) ||
+			    (target_ip->i_d.di_nlink > 2)) {
+				error = -EEXIST;
+				goto out_trans_cancel;
+			}
+		}
+
+		/*
+		 * Link the source inode under the target name.
+		 * If the source inode is a directory and we are moving
+		 * it across directories, its ".." entry will be
+		 * inconsistent until we replace that down below.
+		 *
+		 * In case there is already an entry with the same
+		 * name at the destination directory, remove it first.
+		 */
+		error = xfs_dir_replace(tp, target_dp, target_name,
+					src_ip->i_ino,
+					&first_block, &free_list, spaceres);
+		if (error)
+			goto out_trans_abort;
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		/*
+		 * Decrement the link count on the target since the target
+		 * dir no longer points to it.
+		 */
+		error = xfs_droplink(tp, target_ip);
+		if (error)
+			goto out_trans_abort;
+
+		if (src_is_directory) {
+			/*
+			 * Drop the link from the old "." entry.
+			 */
+			error = xfs_droplink(tp, target_ip);
+			if (error)
+				goto out_trans_abort;
+		}
+	} /* target_ip != NULL */
+
+	/*
+	 * Remove the source.
+	 */
+	if (new_parent && src_is_directory) {
+		/*
+		 * Rewrite the ".." entry to point to the new
+		 * directory.
+		 */
+		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+					target_dp->i_ino,
+					&first_block, &free_list, spaceres);
+		ASSERT(error != -EEXIST);
+		if (error)
+			goto out_trans_abort;
+	}
+
+	/*
+	 * We always want to hit the ctime on the source inode.
+	 *
+	 * This isn't strictly required by the standards since the source
+	 * inode isn't really being changed, but old unix file systems did
+	 * it and some incremental backup programs won't work without it.
+	 */
+	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+	/*
+	 * Adjust the link count on src_dp.  This is necessary when
+	 * renaming a directory, either within one parent when
+	 * the target existed, or across two parent directories.
+	 */
+	if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+		/*
+		 * Decrement link count on src_directory since the
+		 * entry that's moved no longer points to it.
+		 */
+		error = xfs_droplink(tp, src_dp);
+		if (error)
+			goto out_trans_abort;
+	}
+
+	/*
+	 * For whiteouts, we only need to update the source dirent with the
+	 * inode number of the whiteout inode rather than removing it
+	 * altogether.
+	 */
+	if (wip) {
+		error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
+					&first_block, &free_list, spaceres);
+	} else
+		error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+					   &first_block, &free_list, spaceres);
+	if (error)
+		goto out_trans_abort;
+
+	/*
+	 * For whiteouts, we need to bump the link count on the whiteout inode.
+	 * This means that failures all the way up to this point leave the inode
+	 * on the unlinked list and so cleanup is a simple matter of dropping
+	 * the remaining reference to it. If we fail here after bumping the link
+	 * count, we're shutting down the filesystem so we'll never see the
+	 * intermediate state on disk.
+	 */
+	if (wip) {
+		ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
+		error = xfs_bumplink(tp, wip);
+		if (error)
+			goto out_trans_abort;
+		error = xfs_iunlink_remove(tp, wip);
+		if (error)
+			goto out_trans_abort;
+		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
+
+		/*
+		 * Now we have a real link, clear the "I'm a tmpfile" state
+		 * flag from the inode so it doesn't accidentally get misused in
+		 * future.
+		 */
+		VFS_I(wip)->i_state &= ~I_LINKABLE;
+	}
+
+	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+	if (new_parent)
+		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+
+	error = xfs_finish_rename(tp, &free_list);
+	if (wip)
+		IRELE(wip);
+	return error;
+
+out_trans_abort:
+	cancel_flags |= XFS_TRANS_ABORT;
+out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+	if (wip)
+		IRELE(wip);
+	return error;
+}
+
+STATIC int
+xfs_iflush_cluster(
+	xfs_inode_t	*ip,
+	xfs_buf_t	*bp)
+{
+	xfs_mount_t		*mp = ip->i_mount;
+	struct xfs_perag	*pag;
+	unsigned long		first_index, mask;
+	unsigned long		inodes_per_cluster;
+	int			ilist_size;
+	xfs_inode_t		**ilist;
+	xfs_inode_t		*iq;
+	int			nr_found;
+	int			clcount = 0;
+	int			bufwasdelwri;
+	int			i;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+	inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+	ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+	ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
+	if (!ilist)
+		goto out_put;
+
+	mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
+	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
+	rcu_read_lock();
+	/* really need a gang lookup range call here */
+	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+					first_index, inodes_per_cluster);
+	if (nr_found == 0)
+		goto out_free;
+
+	for (i = 0; i < nr_found; i++) {
+		iq = ilist[i];
+		if (iq == ip)
+			continue;
+
+		/*
+		 * because this is an RCU protected lookup, we could find a
+		 * recently freed or even reallocated inode during the lookup.
+		 * We need to check under the i_flags_lock for a valid inode
+		 * here. Skip it if it is not valid or the wrong inode.
+		 */
+		spin_lock(&ip->i_flags_lock);
+		if (!ip->i_ino ||
+		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
+		}
+		spin_unlock(&ip->i_flags_lock);
+
+		/*
+		 * Do an un-protected check to see if the inode is dirty and
+		 * is a candidate for flushing.  These checks will be repeated
+		 * later after the appropriate locks are acquired.
+		 */
+		if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+			continue;
+
+		/*
+		 * Try to get locks.  If any are unavailable or it is pinned,
+		 * then this inode cannot be flushed and is skipped.
+		 */
+
+		if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+			continue;
+		if (!xfs_iflock_nowait(iq)) {
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+			continue;
+		}
+		if (xfs_ipincount(iq)) {
+			xfs_ifunlock(iq);
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+			continue;
+		}
+
+		/*
+		 * arriving here means that this inode can be flushed.  First
+		 * re-check that it's dirty before flushing.
+		 */
+		if (!xfs_inode_clean(iq)) {
+			int	error;
+			error = xfs_iflush_int(iq, bp);
+			if (error) {
+				xfs_iunlock(iq, XFS_ILOCK_SHARED);
+				goto cluster_corrupt_out;
+			}
+			clcount++;
+		} else {
+			xfs_ifunlock(iq);
+		}
+		xfs_iunlock(iq, XFS_ILOCK_SHARED);
+	}
+
+	if (clcount) {
+		XFS_STATS_INC(xs_icluster_flushcnt);
+		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+	}
+
+out_free:
+	rcu_read_unlock();
+	kmem_free(ilist);
+out_put:
+	xfs_perag_put(pag);
+	return 0;
+
+
+cluster_corrupt_out:
+	/*
+	 * Corruption detected in the clustering loop.  Invalidate the
+	 * inode buffer and shut down the filesystem.
+	 */
+	rcu_read_unlock();
+	/*
+	 * Clean up the buffer.  If it was delwri, just release it --
+	 * brelse can handle it with no problems.  If not, shut down the
+	 * filesystem before releasing the buffer.
+	 */
+	bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
+	if (bufwasdelwri)
+		xfs_buf_relse(bp);
+
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+
+	if (!bufwasdelwri) {
+		/*
+		 * Just like incore_relse: if we have b_iodone functions,
+		 * mark the buffer as an error and call them.  Otherwise
+		 * mark it as stale and brelse.
+		 */
+		if (bp->b_iodone) {
+			XFS_BUF_UNDONE(bp);
+			xfs_buf_stale(bp);
+			xfs_buf_ioerror(bp, -EIO);
+			xfs_buf_ioend(bp);
+		} else {
+			xfs_buf_stale(bp);
+			xfs_buf_relse(bp);
+		}
+	}
+
+	/*
+	 * Unlocks the flush lock
+	 */
+	xfs_iflush_abort(iq, false);
+	kmem_free(ilist);
+	xfs_perag_put(pag);
+	return -EFSCORRUPTED;
+}
+
+/*
+ * Flush dirty inode metadata into the backing buffer.
+ *
+ * The caller must have the inode lock and the inode flush lock held.  The
+ * inode lock will still be held upon return to the caller, and the inode
+ * flush lock will be released after the inode has reached the disk.
+ *
+ * The caller must write out the buffer returned in *bpp and release it.
+ */
+int
+xfs_iflush(
+	struct xfs_inode	*ip,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_buf		*bp;
+	struct xfs_dinode	*dip;
+	int			error;
+
+	XFS_STATS_INC(xs_iflush_count);
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	ASSERT(xfs_isiflocked(ip));
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
+
+	*bpp = NULL;
+
+	xfs_iunpin_wait(ip);
+
+	/*
+	 * For stale inodes we cannot rely on the backing buffer remaining
+	 * stale in cache for the remaining life of the stale inode and so
+	 * xfs_imap_to_bp() below may give us a buffer that no longer contains
+	 * inodes below. We have to check this after ensuring the inode is
+	 * unpinned so that it is safe to reclaim the stale inode after the
+	 * flush call.
+	 */
+	if (xfs_iflags_test(ip, XFS_ISTALE)) {
+		xfs_ifunlock(ip);
+		return 0;
+	}
+
+	/*
+	 * This may have been unpinned because the filesystem is shutting
+	 * down forcibly. If that's the case we must not write this inode
+	 * to disk, because the log record didn't make it to disk.
+	 *
+	 * We also have to remove the log item from the AIL in this case,
+	 * as we wait for an empty AIL as part of the unmount process.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = -EIO;
+		goto abort_out;
+	}
+
+	/*
+	 * Get the buffer containing the on-disk inode.
+	 */
+	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
+			       0);
+	if (error || !bp) {
+		xfs_ifunlock(ip);
+		return error;
+	}
+
+	/*
+	 * First flush out the inode that xfs_iflush was called with.
+	 */
+	error = xfs_iflush_int(ip, bp);
+	if (error)
+		goto corrupt_out;
+
+	/*
+	 * If the buffer is pinned then push on the log now so we won't
+	 * get stuck waiting in the write for too long.
+	 */
+	if (xfs_buf_ispinned(bp))
+		xfs_log_force(mp, 0);
+
+	/*
+	 * inode clustering:
+	 * see if other inodes can be gathered into this write
+	 */
+	error = xfs_iflush_cluster(ip, bp);
+	if (error)
+		goto cluster_corrupt_out;
+
+	*bpp = bp;
+	return 0;
+
+corrupt_out:
+	xfs_buf_relse(bp);
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+cluster_corrupt_out:
+	error = -EFSCORRUPTED;
+abort_out:
+	/*
+	 * Unlocks the flush lock
+	 */
+	xfs_iflush_abort(ip, false);
+	return error;
+}
+
+STATIC int
+xfs_iflush_int(
+	struct xfs_inode	*ip,
+	struct xfs_buf		*bp)
+{
+	struct xfs_inode_log_item *iip = ip->i_itemp;
+	struct xfs_dinode	*dip;
+	struct xfs_mount	*mp = ip->i_mount;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	ASSERT(xfs_isiflocked(ip));
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
+	ASSERT(iip != NULL && iip->ili_fields != 0);
+	ASSERT(ip->i_d.di_version > 1);
+
+	/* set *dip = inode's place in the buffer */
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+
+	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
+			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
+		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+			"%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
+			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
+		goto corrupt_out;
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
+				mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
+		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+			"%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
+			__func__, ip->i_ino, ip, ip->i_d.di_magic);
+		goto corrupt_out;
+	}
+	if (S_ISREG(ip->i_d.di_mode)) {
+		if (XFS_TEST_ERROR(
+		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
+		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
+			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+				"%s: Bad regular inode %Lu, ptr 0x%p",
+				__func__, ip->i_ino, ip);
+			goto corrupt_out;
+		}
+	} else if (S_ISDIR(ip->i_d.di_mode)) {
+		if (XFS_TEST_ERROR(
+		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
+		    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
+			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+				"%s: Bad directory inode %Lu, ptr 0x%p",
+				__func__, ip->i_ino, ip);
+			goto corrupt_out;
+		}
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
+				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
+				XFS_RANDOM_IFLUSH_5)) {
+		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+			"%s: detected corrupt incore inode %Lu, "
+			"total extents = %d, nblocks = %Ld, ptr 0x%p",
+			__func__, ip->i_ino,
+			ip->i_d.di_nextents + ip->i_d.di_anextents,
+			ip->i_d.di_nblocks, ip);
+		goto corrupt_out;
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
+				mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
+		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+			"%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
+			__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
+		goto corrupt_out;
+	}
+
+	/*
+	 * Inode item log recovery for v2 inodes are dependent on the
+	 * di_flushiter count for correct sequencing. We bump the flush
+	 * iteration count so we can detect flushes which postdate a log record
+	 * during recovery. This is redundant as we now log every change and
+	 * hence this can't happen but we need to still do it to ensure
+	 * backwards compatibility with old kernels that predate logging all
+	 * inode changes.
+	 */
+	if (ip->i_d.di_version < 3)
+		ip->i_d.di_flushiter++;
+
+	/*
+	 * Copy the dirty parts of the inode into the on-disk
+	 * inode.  We always copy out the core of the inode,
+	 * because if the inode is dirty at all the core must
+	 * be.
+	 */
+	xfs_dinode_to_disk(dip, &ip->i_d);
+
+	/* Wrap, we never let the log put out DI_MAX_FLUSH */
+	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
+		ip->i_d.di_flushiter = 0;
+
+	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
+	if (XFS_IFORK_Q(ip))
+		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * We've recorded everything logged in the inode, so we'd like to clear
+	 * the ili_fields bits so we don't log and flush things unnecessarily.
+	 * However, we can't stop logging all this information until the data
+	 * we've copied into the disk buffer is written to disk.  If we did we
+	 * might overwrite the copy of the inode in the log with all the data
+	 * after re-logging only part of it, and in the face of a crash we
+	 * wouldn't have all the data we need to recover.
+	 *
+	 * What we do is move the bits to the ili_last_fields field.  When
+	 * logging the inode, these bits are moved back to the ili_fields field.
+	 * In the xfs_iflush_done() routine we clear ili_last_fields, since we
+	 * know that the information those bits represent is permanently on
+	 * disk.  As long as the flush completes before the inode is logged
+	 * again, then both ili_fields and ili_last_fields will be cleared.
+	 *
+	 * We can play with the ili_fields bits here, because the inode lock
+	 * must be held exclusively in order to set bits there and the flush
+	 * lock protects the ili_last_fields bits.  Set ili_logged so the flush
+	 * done routine can tell whether or not to look in the AIL.  Also, store
+	 * the current LSN of the inode so that we can tell whether the item has
+	 * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
+	 * need the AIL lock, because it is a 64 bit value that cannot be read
+	 * atomically.
+	 */
+	iip->ili_last_fields = iip->ili_fields;
+	iip->ili_fields = 0;
+	iip->ili_logged = 1;
+
+	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+				&iip->ili_item.li_lsn);
+
+	/*
+	 * Attach the function xfs_iflush_done to the inode's
+	 * buffer.  This will remove the inode from the AIL
+	 * and unlock the inode's flush lock when the inode is
+	 * completely written to disk.
+	 */
+	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+
+	/* update the lsn in the on disk inode if required */
+	if (ip->i_d.di_version == 3)
+		dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
+
+	/* generate the checksum. */
+	xfs_dinode_calc_crc(mp, dip);
+
+	ASSERT(bp->b_fspriv != NULL);
+	ASSERT(bp->b_iodone != NULL);
+	return 0;
+
+corrupt_out:
+	return -EFSCORRUPTED;
+}
diff --git a/kernel/fs/xfs/xfs_inode.h b/kernel/fs/xfs/xfs_inode.h
new file mode 100644
index 000000000..8f22d2036
--- /dev/null
+++ b/kernel/fs/xfs/xfs_inode.h
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_INODE_H__
+#define	__XFS_INODE_H__
+
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+
+/*
+ * Kernel only inode definitions
+ */
+struct xfs_dinode;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_bmbt_irec;
+struct xfs_inode_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot;
+
+typedef struct xfs_inode {
+	/* Inode linking and identification information. */
+	struct xfs_mount	*i_mount;	/* fs mount struct ptr */
+	struct xfs_dquot	*i_udquot;	/* user dquot */
+	struct xfs_dquot	*i_gdquot;	/* group dquot */
+	struct xfs_dquot	*i_pdquot;	/* project dquot */
+
+	/* Inode location stuff */
+	xfs_ino_t		i_ino;		/* inode number (agno/agino)*/
+	struct xfs_imap		i_imap;		/* location for xfs_imap() */
+
+	/* Extent information. */
+	xfs_ifork_t		*i_afp;		/* attribute fork pointer */
+	xfs_ifork_t		i_df;		/* data fork */
+
+	/* operations vectors */
+	const struct xfs_dir_ops *d_ops;		/* directory ops vector */
+
+	/* Transaction and locking information. */
+	struct xfs_inode_log_item *i_itemp;	/* logging information */
+	mrlock_t		i_lock;		/* inode lock */
+	mrlock_t		i_iolock;	/* inode IO lock */
+	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
+	atomic_t		i_pincount;	/* inode pin count */
+	spinlock_t		i_flags_lock;	/* inode i_flags lock */
+	/* Miscellaneous state. */
+	unsigned long		i_flags;	/* see defined flags below */
+	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
+
+	xfs_icdinode_t		i_d;		/* most of ondisk inode */
+
+	/* VFS inode */
+	struct inode		i_vnode;	/* embedded VFS inode */
+} xfs_inode_t;
+
+/* Convert from vfs inode to xfs inode */
+static inline struct xfs_inode *XFS_I(struct inode *inode)
+{
+	return container_of(inode, struct xfs_inode, i_vnode);
+}
+
+/* convert from xfs inode to vfs inode */
+static inline struct inode *VFS_I(struct xfs_inode *ip)
+{
+	return &ip->i_vnode;
+}
+
+/*
+ * For regular files we only update the on-disk filesize when actually
+ * writing data back to disk.  Until then only the copy in the VFS inode
+ * is uptodate.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+	if (S_ISREG(ip->i_d.di_mode))
+		return i_size_read(VFS_I(ip));
+	return ip->i_d.di_size;
+}
+
+/*
+ * If this I/O goes past the on-disk inode size update it unless it would
+ * be past the current in-core inode size.
+ */
+static inline xfs_fsize_t
+xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
+{
+	xfs_fsize_t i_size = i_size_read(VFS_I(ip));
+
+	if (new_size > i_size || new_size < 0)
+		new_size = i_size;
+	return new_size > ip->i_d.di_size ? new_size : 0;
+}
+
+/*
+ * i_flags helper functions
+ */
+static inline void
+__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+{
+	ip->i_flags |= flags;
+}
+
+static inline void
+xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+{
+	spin_lock(&ip->i_flags_lock);
+	__xfs_iflags_set(ip, flags);
+	spin_unlock(&ip->i_flags_lock);
+}
+
+static inline void
+xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags)
+{
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags &= ~flags;
+	spin_unlock(&ip->i_flags_lock);
+}
+
+static inline int
+__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+{
+	return (ip->i_flags & flags);
+}
+
+static inline int
+xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+{
+	int ret;
+	spin_lock(&ip->i_flags_lock);
+	ret = __xfs_iflags_test(ip, flags);
+	spin_unlock(&ip->i_flags_lock);
+	return ret;
+}
+
+static inline int
+xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
+{
+	int ret;
+
+	spin_lock(&ip->i_flags_lock);
+	ret = ip->i_flags & flags;
+	if (ret)
+		ip->i_flags &= ~flags;
+	spin_unlock(&ip->i_flags_lock);
+	return ret;
+}
+
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+	int ret;
+
+	spin_lock(&ip->i_flags_lock);
+	ret = ip->i_flags & flags;
+	if (!ret)
+		ip->i_flags |= flags;
+	spin_unlock(&ip->i_flags_lock);
+	return ret;
+}
+
+/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was chosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline prid_t
+xfs_get_projid(struct xfs_inode *ip)
+{
+	return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
+}
+
+static inline void
+xfs_set_projid(struct xfs_inode *ip,
+		prid_t projid)
+{
+	ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
+	ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
+}
+
+static inline prid_t
+xfs_get_initial_prid(struct xfs_inode *dp)
+{
+	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+		return xfs_get_projid(dp);
+
+	return XFS_PROJID_DEFAULT;
+}
+
+/*
+ * In-core inode flags.
+ */
+#define XFS_IRECLAIM		(1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE		(1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE	(1 << 2) /* inode can be reclaimed */
+#define XFS_INEW		(1 << 3) /* inode has just been allocated */
+#define XFS_ITRUNCATED		(1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE	(1 << 6) /* dirty release already seen */
+#define __XFS_IFLOCK_BIT	7	 /* inode is being flushed right now */
+#define XFS_IFLOCK		(1 << __XFS_IFLOCK_BIT)
+#define __XFS_IPINNED_BIT	8	 /* wakeup key for zero pin count */
+#define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
+#define XFS_IDONTCACHE		(1 << 9) /* don't cache the inode long term */
+
+/*
+ * Per-lifetime flags need to be reset when re-using a reclaimable inode during
+ * inode lookup. This prevents unintended behaviour on the new inode from
+ * ocurring.
+ */
+#define XFS_IRECLAIM_RESET_FLAGS	\
+	(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
+	 XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
+
+/*
+ * Synchronize processes attempting to flush the in-core inode back to disk.
+ */
+
+extern void __xfs_iflock(struct xfs_inode *ip);
+
+static inline int xfs_iflock_nowait(struct xfs_inode *ip)
+{
+	return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
+}
+
+static inline void xfs_iflock(struct xfs_inode *ip)
+{
+	if (!xfs_iflock_nowait(ip))
+		__xfs_iflock(ip);
+}
+
+static inline void xfs_ifunlock(struct xfs_inode *ip)
+{
+	xfs_iflags_clear(ip, XFS_IFLOCK);
+	smp_mb();
+	wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
+}
+
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+	return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
+/*
+ * Flags for inode locking.
+ * Bit ranges:	1<<1  - 1<<16-1 -- iolock/ilock modes (bitfield)
+ *		1<<16 - 1<<32-1 -- lockdep annotation (integers)
+ */
+#define	XFS_IOLOCK_EXCL		(1<<0)
+#define	XFS_IOLOCK_SHARED	(1<<1)
+#define	XFS_ILOCK_EXCL		(1<<2)
+#define	XFS_ILOCK_SHARED	(1<<3)
+#define	XFS_MMAPLOCK_EXCL	(1<<4)
+#define	XFS_MMAPLOCK_SHARED	(1<<5)
+
+#define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
+				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
+				| XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
+
+#define XFS_LOCK_FLAGS \
+	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
+	{ XFS_IOLOCK_SHARED,	"IOLOCK_SHARED" }, \
+	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
+	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
+	{ XFS_MMAPLOCK_EXCL,	"MMAPLOCK_EXCL" }, \
+	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }
+
+
+/*
+ * Flags for lockdep annotations.
+ *
+ * XFS_LOCK_PARENT - for directory operations that require locking a
+ * parent directory inode and a child entry inode.  The parent gets locked
+ * with this flag so it gets a lockdep subclass of 1 and the child entry
+ * lock will have a lockdep subclass of 0.
+ *
+ * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
+ * inodes do not participate in the normal lock order, and thus have their
+ * own subclasses.
+ *
+ * XFS_LOCK_INUMORDER - for locking several inodes at the some time
+ * with xfs_lock_inodes().  This flag is used as the starting subclass
+ * and each subsequent lock acquired will increment the subclass by one.
+ * So the first lock acquired will have a lockdep subclass of 4, the
+ * second lock will have a lockdep subclass of 5, and so on. It is
+ * the responsibility of the class builder to shift this to the correct
+ * portion of the lock_mode lockdep mask.
+ */
+#define XFS_LOCK_PARENT		1
+#define XFS_LOCK_RTBITMAP	2
+#define XFS_LOCK_RTSUM		3
+#define XFS_LOCK_INUMORDER	4
+
+#define XFS_IOLOCK_SHIFT	16
+#define	XFS_IOLOCK_PARENT	(XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
+
+#define XFS_MMAPLOCK_SHIFT	20
+
+#define XFS_ILOCK_SHIFT		24
+#define	XFS_ILOCK_PARENT	(XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
+#define	XFS_ILOCK_RTBITMAP	(XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
+#define	XFS_ILOCK_RTSUM		(XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
+
+#define XFS_IOLOCK_DEP_MASK	0x000f0000
+#define XFS_MMAPLOCK_DEP_MASK	0x00f00000
+#define XFS_ILOCK_DEP_MASK	0xff000000
+#define XFS_LOCK_DEP_MASK	(XFS_IOLOCK_DEP_MASK | \
+				 XFS_MMAPLOCK_DEP_MASK | \
+				 XFS_ILOCK_DEP_MASK)
+
+#define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) \
+					>> XFS_IOLOCK_SHIFT)
+#define XFS_MMAPLOCK_DEP(flags)	(((flags) & XFS_MMAPLOCK_DEP_MASK) \
+					>> XFS_MMAPLOCK_SHIFT)
+#define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) \
+					>> XFS_ILOCK_SHIFT)
+
+/*
+ * For multiple groups support: if S_ISGID bit is set in the parent
+ * directory, group of new file is set to that of the parent, and
+ * new subdirectory gets S_ISGID bit from parent.
+ */
+#define XFS_INHERIT_GID(pip)	\
+	(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
+	 ((pip)->i_d.di_mode & S_ISGID))
+
+int		xfs_release(struct xfs_inode *ip);
+void		xfs_inactive(struct xfs_inode *ip);
+int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+			   struct xfs_inode **ipp, struct xfs_name *ci_name);
+int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
+			   umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
+int		xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
+			   umode_t mode, struct xfs_inode **ipp);
+int		xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+			   struct xfs_inode *ip);
+int		xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
+			 struct xfs_name *target_name);
+int		xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+			   struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+			   struct xfs_name *target_name,
+			   struct xfs_inode *target_ip, unsigned int flags);
+
+void		xfs_ilock(xfs_inode_t *, uint);
+int		xfs_ilock_nowait(xfs_inode_t *, uint);
+void		xfs_iunlock(xfs_inode_t *, uint);
+void		xfs_ilock_demote(xfs_inode_t *, uint);
+int		xfs_isilocked(xfs_inode_t *, uint);
+uint		xfs_ilock_data_map_shared(struct xfs_inode *);
+uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
+int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
+			   xfs_nlink_t, xfs_dev_t, prid_t, int,
+			   struct xfs_buf **, xfs_inode_t **);
+
+uint		xfs_ip2xflags(struct xfs_inode *);
+uint		xfs_dic2xflags(struct xfs_dinode *);
+int		xfs_ifree(struct xfs_trans *, xfs_inode_t *,
+			   struct xfs_bmap_free *);
+int		xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
+				      int, xfs_fsize_t);
+int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
+
+void		xfs_iext_realloc(xfs_inode_t *, int, int);
+
+void		xfs_iunpin_wait(xfs_inode_t *);
+#define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
+
+int		xfs_iflush(struct xfs_inode *, struct xfs_buf **);
+void		xfs_lock_inodes(xfs_inode_t **, int, uint);
+void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
+
+xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
+
+int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
+			       xfs_nlink_t, xfs_dev_t, prid_t, int,
+			       struct xfs_inode **, int *);
+int		xfs_droplink(struct xfs_trans *, struct xfs_inode *);
+int		xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
+
+/* from xfs_file.c */
+enum xfs_prealloc_flags {
+	XFS_PREALLOC_SET	= (1 << 1),
+	XFS_PREALLOC_CLEAR	= (1 << 2),
+	XFS_PREALLOC_SYNC	= (1 << 3),
+	XFS_PREALLOC_INVISIBLE	= (1 << 4),
+};
+
+int	xfs_update_prealloc_flags(struct xfs_inode *ip,
+				  enum xfs_prealloc_flags flags);
+int	xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
+		     xfs_fsize_t isize, bool *did_zeroing);
+int	xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+
+
+/* from xfs_iops.c */
+/*
+ * When setting up a newly allocated inode, we need to call
+ * xfs_finish_inode_setup() once the inode is fully instantiated at
+ * the VFS level to prevent the rest of the world seeing the inode
+ * before we've completed instantiation. Otherwise we can do it
+ * the moment the inode lookup is complete.
+ */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
+{
+	xfs_iflags_clear(ip, XFS_INEW);
+	barrier();
+	unlock_new_inode(VFS_I(ip));
+}
+
+static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
+{
+	xfs_setup_inode(ip);
+	xfs_finish_inode_setup(ip);
+}
+
+#define IHOLD(ip) \
+do { \
+	ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
+	ihold(VFS_I(ip)); \
+	trace_xfs_ihold(ip, _THIS_IP_); \
+} while (0)
+
+#define IRELE(ip) \
+do { \
+	trace_xfs_irele(ip, _THIS_IP_); \
+	iput(VFS_I(ip)); \
+} while (0)
+
+extern struct kmem_zone	*xfs_inode_zone;
+
+/*
+ * Flags for read/write calls
+ */
+#define XFS_IO_ISDIRECT	0x00001		/* bypass page cache */
+#define XFS_IO_INVIS	0x00002		/* don't update inode timestamps */
+
+#define XFS_IO_FLAGS \
+	{ XFS_IO_ISDIRECT,	"DIRECT" }, \
+	{ XFS_IO_INVIS,		"INVIS"}
+
+#endif	/* __XFS_INODE_H__ */
diff --git a/kernel/fs/xfs/xfs_inode_item.c b/kernel/fs/xfs/xfs_inode_item.c
new file mode 100644
index 000000000..bf13a5a7e
--- /dev/null
+++ b/kernel/fs/xfs/xfs_inode_item.c
@@ -0,0 +1,789 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+
+
+kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
+
+static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_inode_log_item, ili_item);
+}
+
+STATIC void
+xfs_inode_item_data_fork_size(
+	struct xfs_inode_log_item *iip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_inode	*ip = iip->ili_inode;
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+		    ip->i_d.di_nextents > 0 &&
+		    ip->i_df.if_bytes > 0) {
+			/* worst case, doesn't subtract delalloc extents */
+			*nbytes += XFS_IFORK_DSIZE(ip);
+			*nvecs += 1;
+		}
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+		    ip->i_df.if_broot_bytes > 0) {
+			*nbytes += ip->i_df.if_broot_bytes;
+			*nvecs += 1;
+		}
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+		    ip->i_df.if_bytes > 0) {
+			*nbytes += roundup(ip->i_df.if_bytes, 4);
+			*nvecs += 1;
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+	case XFS_DINODE_FMT_UUID:
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+STATIC void
+xfs_inode_item_attr_fork_size(
+	struct xfs_inode_log_item *iip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_inode	*ip = iip->ili_inode;
+
+	switch (ip->i_d.di_aformat) {
+	case XFS_DINODE_FMT_EXTENTS:
+		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+		    ip->i_d.di_anextents > 0 &&
+		    ip->i_afp->if_bytes > 0) {
+			/* worst case, doesn't subtract unused space */
+			*nbytes += XFS_IFORK_ASIZE(ip);
+			*nvecs += 1;
+		}
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+		    ip->i_afp->if_broot_bytes > 0) {
+			*nbytes += ip->i_afp->if_broot_bytes;
+			*nvecs += 1;
+		}
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+		    ip->i_afp->if_bytes > 0) {
+			*nbytes += roundup(ip->i_afp->if_bytes, 4);
+			*nvecs += 1;
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We need one iovec for the inode log format structure, one for the
+ * inode core, and possibly one for the inode data/extents/b-tree root
+ * and one for the inode attribute data/extents/b-tree root.
+ */
+STATIC void
+xfs_inode_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+
+	*nvecs += 2;
+	*nbytes += sizeof(struct xfs_inode_log_format) +
+		   xfs_icdinode_size(ip->i_d.di_version);
+
+	xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
+	if (XFS_IFORK_Q(ip))
+		xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
+}
+
+STATIC void
+xfs_inode_item_format_data_fork(
+	struct xfs_inode_log_item *iip,
+	struct xfs_inode_log_format *ilf,
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp)
+{
+	struct xfs_inode	*ip = iip->ili_inode;
+	size_t			data_bytes;
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		iip->ili_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+		    ip->i_d.di_nextents > 0 &&
+		    ip->i_df.if_bytes > 0) {
+			struct xfs_bmbt_rec *p;
+
+			ASSERT(ip->i_df.if_u1.if_extents != NULL);
+			ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
+
+			p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
+			data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
+			xlog_finish_iovec(lv, *vecp, data_bytes);
+
+			ASSERT(data_bytes <= ip->i_df.if_bytes);
+
+			ilf->ilf_dsize = data_bytes;
+			ilf->ilf_size++;
+		} else {
+			iip->ili_fields &= ~XFS_ILOG_DEXT;
+		}
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		iip->ili_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+		    ip->i_df.if_broot_bytes > 0) {
+			ASSERT(ip->i_df.if_broot != NULL);
+			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
+					ip->i_df.if_broot,
+					ip->i_df.if_broot_bytes);
+			ilf->ilf_dsize = ip->i_df.if_broot_bytes;
+			ilf->ilf_size++;
+		} else {
+			ASSERT(!(iip->ili_fields &
+				 XFS_ILOG_DBROOT));
+			iip->ili_fields &= ~XFS_ILOG_DBROOT;
+		}
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+		iip->ili_fields &=
+			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+		    ip->i_df.if_bytes > 0) {
+			/*
+			 * Round i_bytes up to a word boundary.
+			 * The underlying memory is guaranteed to
+			 * to be there by xfs_idata_realloc().
+			 */
+			data_bytes = roundup(ip->i_df.if_bytes, 4);
+			ASSERT(ip->i_df.if_real_bytes == 0 ||
+			       ip->i_df.if_real_bytes == data_bytes);
+			ASSERT(ip->i_df.if_u1.if_data != NULL);
+			ASSERT(ip->i_d.di_size > 0);
+			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
+					ip->i_df.if_u1.if_data, data_bytes);
+			ilf->ilf_dsize = (unsigned)data_bytes;
+			ilf->ilf_size++;
+		} else {
+			iip->ili_fields &= ~XFS_ILOG_DDATA;
+		}
+		break;
+	case XFS_DINODE_FMT_DEV:
+		iip->ili_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEXT | XFS_ILOG_UUID);
+		if (iip->ili_fields & XFS_ILOG_DEV)
+			ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
+		break;
+	case XFS_DINODE_FMT_UUID:
+		iip->ili_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEXT | XFS_ILOG_DEV);
+		if (iip->ili_fields & XFS_ILOG_UUID)
+			ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+STATIC void
+xfs_inode_item_format_attr_fork(
+	struct xfs_inode_log_item *iip,
+	struct xfs_inode_log_format *ilf,
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp)
+{
+	struct xfs_inode	*ip = iip->ili_inode;
+	size_t			data_bytes;
+
+	switch (ip->i_d.di_aformat) {
+	case XFS_DINODE_FMT_EXTENTS:
+		iip->ili_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+
+		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+		    ip->i_d.di_anextents > 0 &&
+		    ip->i_afp->if_bytes > 0) {
+			struct xfs_bmbt_rec *p;
+
+			ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
+				ip->i_d.di_anextents);
+			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+
+			p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
+			data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
+			xlog_finish_iovec(lv, *vecp, data_bytes);
+
+			ilf->ilf_asize = data_bytes;
+			ilf->ilf_size++;
+		} else {
+			iip->ili_fields &= ~XFS_ILOG_AEXT;
+		}
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		iip->ili_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
+
+		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+		    ip->i_afp->if_broot_bytes > 0) {
+			ASSERT(ip->i_afp->if_broot != NULL);
+
+			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
+					ip->i_afp->if_broot,
+					ip->i_afp->if_broot_bytes);
+			ilf->ilf_asize = ip->i_afp->if_broot_bytes;
+			ilf->ilf_size++;
+		} else {
+			iip->ili_fields &= ~XFS_ILOG_ABROOT;
+		}
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+		iip->ili_fields &=
+			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+
+		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+		    ip->i_afp->if_bytes > 0) {
+			/*
+			 * Round i_bytes up to a word boundary.
+			 * The underlying memory is guaranteed to
+			 * to be there by xfs_idata_realloc().
+			 */
+			data_bytes = roundup(ip->i_afp->if_bytes, 4);
+			ASSERT(ip->i_afp->if_real_bytes == 0 ||
+			       ip->i_afp->if_real_bytes == data_bytes);
+			ASSERT(ip->i_afp->if_u1.if_data != NULL);
+			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
+					ip->i_afp->if_u1.if_data,
+					data_bytes);
+			ilf->ilf_asize = (unsigned)data_bytes;
+			ilf->ilf_size++;
+		} else {
+			iip->ili_fields &= ~XFS_ILOG_ADATA;
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the given inode
+ * log item.  It fills the first item with an inode log format structure,
+ * the second with the on-disk inode structure, and a possible third and/or
+ * fourth with the inode data/extents/b-tree root and inode attributes
+ * data/extents/b-tree root.
+ */
+STATIC void
+xfs_inode_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	struct xfs_inode_log_format *ilf;
+	struct xfs_log_iovec	*vecp = NULL;
+
+	ASSERT(ip->i_d.di_version > 1);
+
+	ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
+	ilf->ilf_type = XFS_LI_INODE;
+	ilf->ilf_ino = ip->i_ino;
+	ilf->ilf_blkno = ip->i_imap.im_blkno;
+	ilf->ilf_len = ip->i_imap.im_len;
+	ilf->ilf_boffset = ip->i_imap.im_boffset;
+	ilf->ilf_fields = XFS_ILOG_CORE;
+	ilf->ilf_size = 2; /* format + core */
+	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
+			&ip->i_d,
+			xfs_icdinode_size(ip->i_d.di_version));
+
+	xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
+	if (XFS_IFORK_Q(ip)) {
+		xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
+	} else {
+		iip->ili_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+	}
+
+	/* update the format with the exact fields we actually logged */
+	ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
+}
+
+/*
+ * This is called to pin the inode associated with the inode log
+ * item in memory so it cannot be written out.
+ */
+STATIC void
+xfs_inode_item_pin(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	trace_xfs_inode_pin(ip, _RET_IP_);
+	atomic_inc(&ip->i_pincount);
+}
+
+
+/*
+ * This is called to unpin the inode associated with the inode log
+ * item which was previously pinned with a call to xfs_inode_item_pin().
+ *
+ * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
+ */
+STATIC void
+xfs_inode_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
+
+	trace_xfs_inode_unpin(ip, _RET_IP_);
+	ASSERT(atomic_read(&ip->i_pincount) > 0);
+	if (atomic_dec_and_test(&ip->i_pincount))
+		wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+}
+
+STATIC uint
+xfs_inode_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	struct xfs_buf		*bp = NULL;
+	uint			rval = XFS_ITEM_SUCCESS;
+	int			error;
+
+	if (xfs_ipincount(ip) > 0)
+		return XFS_ITEM_PINNED;
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+		return XFS_ITEM_LOCKED;
+
+	/*
+	 * Re-check the pincount now that we stabilized the value by
+	 * taking the ilock.
+	 */
+	if (xfs_ipincount(ip) > 0) {
+		rval = XFS_ITEM_PINNED;
+		goto out_unlock;
+	}
+
+	/*
+	 * Stale inode items should force out the iclog.
+	 */
+	if (ip->i_flags & XFS_ISTALE) {
+		rval = XFS_ITEM_PINNED;
+		goto out_unlock;
+	}
+
+	/*
+	 * Someone else is already flushing the inode.  Nothing we can do
+	 * here but wait for the flush to finish and remove the item from
+	 * the AIL.
+	 */
+	if (!xfs_iflock_nowait(ip)) {
+		rval = XFS_ITEM_FLUSHING;
+		goto out_unlock;
+	}
+
+	ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
+	ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
+
+	spin_unlock(&lip->li_ailp->xa_lock);
+
+	error = xfs_iflush(ip, &bp);
+	if (!error) {
+		if (!xfs_buf_delwri_queue(bp, buffer_list))
+			rval = XFS_ITEM_FLUSHING;
+		xfs_buf_relse(bp);
+	}
+
+	spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return rval;
+}
+
+/*
+ * Unlock the inode associated with the inode log item.
+ * Clear the fields of the inode and inode log item that
+ * are specific to the current transaction.  If the
+ * hold flags is set, do not unlock the inode.
+ */
+STATIC void
+xfs_inode_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	unsigned short		lock_flags;
+
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	lock_flags = iip->ili_lock_flags;
+	iip->ili_lock_flags = 0;
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+}
+
+/*
+ * This is called to find out where the oldest active copy of the inode log
+ * item in the on disk log resides now that the last log write of it completed
+ * at the given lsn.  Since we always re-log all dirty data in an inode, the
+ * latest copy in the on disk log is the only one that matters.  Therefore,
+ * simply return the given lsn.
+ *
+ * If the inode has been marked stale because the cluster is being freed, we
+ * don't want to (re-)insert this inode into the AIL. There is a race condition
+ * where the cluster buffer may be unpinned before the inode is inserted into
+ * the AIL during transaction committed processing. If the buffer is unpinned
+ * before the inode item has been committed and inserted, then it is possible
+ * for the buffer to be written and IO completes before the inode is inserted
+ * into the AIL. In that case, we'd be inserting a clean, stale inode into the
+ * AIL which will never get removed. It will, however, get reclaimed which
+ * triggers an assert in xfs_inode_free() complaining about freein an inode
+ * still in the AIL.
+ *
+ * To avoid this, just unpin the inode directly and return a LSN of -1 so the
+ * transaction committed code knows that it does not need to do any further
+ * processing on the item.
+ */
+STATIC xfs_lsn_t
+xfs_inode_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+
+	if (xfs_iflags_test(ip, XFS_ISTALE)) {
+		xfs_inode_item_unpin(lip, 0);
+		return -1;
+	}
+	return lsn;
+}
+
+/*
+ * XXX rcc - this one really has to do something.  Probably needs
+ * to stamp in a new field in the incore inode.
+ */
+STATIC void
+xfs_inode_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	INODE_ITEM(lip)->ili_last_lsn = lsn;
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static const struct xfs_item_ops xfs_inode_item_ops = {
+	.iop_size	= xfs_inode_item_size,
+	.iop_format	= xfs_inode_item_format,
+	.iop_pin	= xfs_inode_item_pin,
+	.iop_unpin	= xfs_inode_item_unpin,
+	.iop_unlock	= xfs_inode_item_unlock,
+	.iop_committed	= xfs_inode_item_committed,
+	.iop_push	= xfs_inode_item_push,
+	.iop_committing = xfs_inode_item_committing
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ */
+void
+xfs_inode_item_init(
+	struct xfs_inode	*ip,
+	struct xfs_mount	*mp)
+{
+	struct xfs_inode_log_item *iip;
+
+	ASSERT(ip->i_itemp == NULL);
+	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
+
+	iip->ili_inode = ip;
+	xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
+						&xfs_inode_item_ops);
+}
+
+/*
+ * Free the inode log item and any memory hanging off of it.
+ */
+void
+xfs_inode_item_destroy(
+	xfs_inode_t	*ip)
+{
+	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
+}
+
+
+/*
+ * This is the inode flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the inode is
+ * flushed to disk.  It is responsible for removing the inode item
+ * from the AIL if it has not been re-logged, and unlocking the inode's
+ * flush lock.
+ *
+ * To reduce AIL lock traffic as much as possible, we scan the buffer log item
+ * list for other inodes that will run this function. We remove them from the
+ * buffer list so we can process all the inode IO completions in one AIL lock
+ * traversal.
+ */
+void
+xfs_iflush_done(
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
+{
+	struct xfs_inode_log_item *iip;
+	struct xfs_log_item	*blip;
+	struct xfs_log_item	*next;
+	struct xfs_log_item	*prev;
+	struct xfs_ail		*ailp = lip->li_ailp;
+	int			need_ail = 0;
+
+	/*
+	 * Scan the buffer IO completions for other inodes being completed and
+	 * attach them to the current inode log item.
+	 */
+	blip = bp->b_fspriv;
+	prev = NULL;
+	while (blip != NULL) {
+		if (blip->li_cb != xfs_iflush_done) {
+			prev = blip;
+			blip = blip->li_bio_list;
+			continue;
+		}
+
+		/* remove from list */
+		next = blip->li_bio_list;
+		if (!prev) {
+			bp->b_fspriv = next;
+		} else {
+			prev->li_bio_list = next;
+		}
+
+		/* add to current list */
+		blip->li_bio_list = lip->li_bio_list;
+		lip->li_bio_list = blip;
+
+		/*
+		 * while we have the item, do the unlocked check for needing
+		 * the AIL lock.
+		 */
+		iip = INODE_ITEM(blip);
+		if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+			need_ail++;
+
+		blip = next;
+	}
+
+	/* make sure we capture the state of the initial inode. */
+	iip = INODE_ITEM(lip);
+	if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+		need_ail++;
+
+	/*
+	 * We only want to pull the item from the AIL if it is
+	 * actually there and its location in the log has not
+	 * changed since we started the flush.  Thus, we only bother
+	 * if the ili_logged flag is set and the inode's lsn has not
+	 * changed.  First we check the lsn outside
+	 * the lock since it's cheaper, and then we recheck while
+	 * holding the lock before removing the inode from the AIL.
+	 */
+	if (need_ail) {
+		struct xfs_log_item *log_items[need_ail];
+		int i = 0;
+		spin_lock(&ailp->xa_lock);
+		for (blip = lip; blip; blip = blip->li_bio_list) {
+			iip = INODE_ITEM(blip);
+			if (iip->ili_logged &&
+			    blip->li_lsn == iip->ili_flush_lsn) {
+				log_items[i++] = blip;
+			}
+			ASSERT(i <= need_ail);
+		}
+		/* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+		xfs_trans_ail_delete_bulk(ailp, log_items, i,
+					  SHUTDOWN_CORRUPT_INCORE);
+	}
+
+
+	/*
+	 * clean up and unlock the flush lock now we are done. We can clear the
+	 * ili_last_fields bits now that we know that the data corresponding to
+	 * them is safely on disk.
+	 */
+	for (blip = lip; blip; blip = next) {
+		next = blip->li_bio_list;
+		blip->li_bio_list = NULL;
+
+		iip = INODE_ITEM(blip);
+		iip->ili_logged = 0;
+		iip->ili_last_fields = 0;
+		xfs_ifunlock(iip->ili_inode);
+	}
+}
+
+/*
+ * This is the inode flushing abort routine.  It is called from xfs_iflush when
+ * the filesystem is shutting down to clean up the inode state.  It is
+ * responsible for removing the inode item from the AIL if it has not been
+ * re-logged, and unlocking the inode's flush lock.
+ */
+void
+xfs_iflush_abort(
+	xfs_inode_t		*ip,
+	bool			stale)
+{
+	xfs_inode_log_item_t	*iip = ip->i_itemp;
+
+	if (iip) {
+		struct xfs_ail	*ailp = iip->ili_item.li_ailp;
+		if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+			spin_lock(&ailp->xa_lock);
+			if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+				/* xfs_trans_ail_delete() drops the AIL lock. */
+				xfs_trans_ail_delete(ailp, &iip->ili_item,
+						stale ?
+						     SHUTDOWN_LOG_IO_ERROR :
+						     SHUTDOWN_CORRUPT_INCORE);
+			} else
+				spin_unlock(&ailp->xa_lock);
+		}
+		iip->ili_logged = 0;
+		/*
+		 * Clear the ili_last_fields bits now that we know that the
+		 * data corresponding to them is safely on disk.
+		 */
+		iip->ili_last_fields = 0;
+		/*
+		 * Clear the inode logging fields so no more flushes are
+		 * attempted.
+		 */
+		iip->ili_fields = 0;
+	}
+	/*
+	 * Release the inode's flush lock since we're done with it.
+	 */
+	xfs_ifunlock(ip);
+}
+
+void
+xfs_istale_done(
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
+{
+	xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true);
+}
+
+/*
+ * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
+ * (which can have different field alignments) to the native version
+ */
+int
+xfs_inode_item_format_convert(
+	xfs_log_iovec_t		*buf,
+	xfs_inode_log_format_t	*in_f)
+{
+	if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
+		xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
+
+		in_f->ilf_type = in_f32->ilf_type;
+		in_f->ilf_size = in_f32->ilf_size;
+		in_f->ilf_fields = in_f32->ilf_fields;
+		in_f->ilf_asize = in_f32->ilf_asize;
+		in_f->ilf_dsize = in_f32->ilf_dsize;
+		in_f->ilf_ino = in_f32->ilf_ino;
+		/* copy biggest field of ilf_u */
+		memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+		       in_f32->ilf_u.ilfu_uuid.__u_bits,
+		       sizeof(uuid_t));
+		in_f->ilf_blkno = in_f32->ilf_blkno;
+		in_f->ilf_len = in_f32->ilf_len;
+		in_f->ilf_boffset = in_f32->ilf_boffset;
+		return 0;
+	} else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
+		xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
+
+		in_f->ilf_type = in_f64->ilf_type;
+		in_f->ilf_size = in_f64->ilf_size;
+		in_f->ilf_fields = in_f64->ilf_fields;
+		in_f->ilf_asize = in_f64->ilf_asize;
+		in_f->ilf_dsize = in_f64->ilf_dsize;
+		in_f->ilf_ino = in_f64->ilf_ino;
+		/* copy biggest field of ilf_u */
+		memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+		       in_f64->ilf_u.ilfu_uuid.__u_bits,
+		       sizeof(uuid_t));
+		in_f->ilf_blkno = in_f64->ilf_blkno;
+		in_f->ilf_len = in_f64->ilf_len;
+		in_f->ilf_boffset = in_f64->ilf_boffset;
+		return 0;
+	}
+	return -EFSCORRUPTED;
+}
diff --git a/kernel/fs/xfs/xfs_inode_item.h b/kernel/fs/xfs/xfs_inode_item.h
new file mode 100644
index 000000000..488d81254
--- /dev/null
+++ b/kernel/fs/xfs/xfs_inode_item.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_INODE_ITEM_H__
+#define	__XFS_INODE_ITEM_H__
+
+/* kernel only definitions */
+
+struct xfs_buf;
+struct xfs_bmbt_rec;
+struct xfs_inode;
+struct xfs_mount;
+
+typedef struct xfs_inode_log_item {
+	xfs_log_item_t		ili_item;	   /* common portion */
+	struct xfs_inode	*ili_inode;	   /* inode ptr */
+	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
+	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
+	unsigned short		ili_lock_flags;	   /* lock flags */
+	unsigned short		ili_logged;	   /* flushed logged data */
+	unsigned int		ili_last_fields;   /* fields when flushed */
+	unsigned int		ili_fields;	   /* fields to be logged */
+} xfs_inode_log_item_t;
+
+static inline int xfs_inode_clean(xfs_inode_t *ip)
+{
+	return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
+}
+
+extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
+extern void xfs_inode_item_destroy(struct xfs_inode *);
+extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_iflush_abort(struct xfs_inode *, bool);
+extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
+					 xfs_inode_log_format_t *);
+
+extern struct kmem_zone	*xfs_ili_zone;
+
+#endif	/* __XFS_INODE_ITEM_H__ */
diff --git a/kernel/fs/xfs/xfs_ioctl.c b/kernel/fs/xfs/xfs_ioctl.c
new file mode 100644
index 000000000..87f67c6b6
--- /dev/null
+++ b/kernel/fs/xfs/xfs_ioctl.c
@@ -0,0 +1,1806 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_ioctl.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_fsops.h"
+#include "xfs_discard.h"
+#include "xfs_quota.h"
+#include "xfs_export.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_trans.h"
+#include "xfs_pnfs.h"
+
+#include <linux/capability.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/exportfs.h>
+
+/*
+ * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ *    returns fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ *    returns full handle for a FD opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ *    returns full handle for a path
+ */
+int
+xfs_find_handle(
+	unsigned int		cmd,
+	xfs_fsop_handlereq_t	*hreq)
+{
+	int			hsize;
+	xfs_handle_t		handle;
+	struct inode		*inode;
+	struct fd		f = {NULL};
+	struct path		path;
+	int			error;
+	struct xfs_inode	*ip;
+
+	if (cmd == XFS_IOC_FD_TO_HANDLE) {
+		f = fdget(hreq->fd);
+		if (!f.file)
+			return -EBADF;
+		inode = file_inode(f.file);
+	} else {
+		error = user_lpath((const char __user *)hreq->path, &path);
+		if (error)
+			return error;
+		inode = d_inode(path.dentry);
+	}
+	ip = XFS_I(inode);
+
+	/*
+	 * We can only generate handles for inodes residing on a XFS filesystem,
+	 * and only for regular files, directories or symbolic links.
+	 */
+	error = -EINVAL;
+	if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+		goto out_put;
+
+	error = -EBADF;
+	if (!S_ISREG(inode->i_mode) &&
+	    !S_ISDIR(inode->i_mode) &&
+	    !S_ISLNK(inode->i_mode))
+		goto out_put;
+
+
+	memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+	if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
+		/*
+		 * This handle only contains an fsid, zero the rest.
+		 */
+		memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
+		hsize = sizeof(xfs_fsid_t);
+	} else {
+		handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
+					sizeof(handle.ha_fid.fid_len);
+		handle.ha_fid.fid_pad = 0;
+		handle.ha_fid.fid_gen = ip->i_d.di_gen;
+		handle.ha_fid.fid_ino = ip->i_ino;
+
+		hsize = XFS_HSIZE(handle);
+	}
+
+	error = -EFAULT;
+	if (copy_to_user(hreq->ohandle, &handle, hsize) ||
+	    copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+		goto out_put;
+
+	error = 0;
+
+ out_put:
+	if (cmd == XFS_IOC_FD_TO_HANDLE)
+		fdput(f);
+	else
+		path_put(&path);
+	return error;
+}
+
+/*
+ * No need to do permission checks on the various pathname components
+ * as the handle operations are privileged.
+ */
+STATIC int
+xfs_handle_acceptable(
+	void			*context,
+	struct dentry		*dentry)
+{
+	return 1;
+}
+
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+	struct file		*parfilp,
+	void __user		*uhandle,
+	u32			hlen)
+{
+	xfs_handle_t		handle;
+	struct xfs_fid64	fid;
+
+	/*
+	 * Only allow handle opens under a directory.
+	 */
+	if (!S_ISDIR(file_inode(parfilp)->i_mode))
+		return ERR_PTR(-ENOTDIR);
+
+	if (hlen != sizeof(xfs_handle_t))
+		return ERR_PTR(-EINVAL);
+	if (copy_from_user(&handle, uhandle, hlen))
+		return ERR_PTR(-EFAULT);
+	if (handle.ha_fid.fid_len !=
+	    sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
+		return ERR_PTR(-EINVAL);
+
+	memset(&fid, 0, sizeof(struct fid));
+	fid.ino = handle.ha_fid.fid_ino;
+	fid.gen = handle.ha_fid.fid_gen;
+
+	return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
+			FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+			xfs_handle_acceptable, NULL);
+}
+
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
+{
+	return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
+}
+
+int
+xfs_open_by_handle(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
+{
+	const struct cred	*cred = current_cred();
+	int			error;
+	int			fd;
+	int			permflag;
+	struct file		*filp;
+	struct inode		*inode;
+	struct dentry		*dentry;
+	fmode_t			fmode;
+	struct path		path;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	inode = d_inode(dentry);
+
+	/* Restrict xfs_open_by_handle to directories & regular files. */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+		error = -EPERM;
+		goto out_dput;
+	}
+
+#if BITS_PER_LONG != 32
+	hreq->oflags |= O_LARGEFILE;
+#endif
+
+	permflag = hreq->oflags;
+	fmode = OPEN_FMODE(permflag);
+	if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
+	    (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
+		error = -EPERM;
+		goto out_dput;
+	}
+
+	if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+		error = -EACCES;
+		goto out_dput;
+	}
+
+	/* Can't write directories. */
+	if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
+		error = -EISDIR;
+		goto out_dput;
+	}
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0) {
+		error = fd;
+		goto out_dput;
+	}
+
+	path.mnt = parfilp->f_path.mnt;
+	path.dentry = dentry;
+	filp = dentry_open(&path, hreq->oflags, cred);
+	dput(dentry);
+	if (IS_ERR(filp)) {
+		put_unused_fd(fd);
+		return PTR_ERR(filp);
+	}
+
+	if (S_ISREG(inode->i_mode)) {
+		filp->f_flags |= O_NOATIME;
+		filp->f_mode |= FMODE_NOCMTIME;
+	}
+
+	fd_install(fd, filp);
+	return fd;
+
+ out_dput:
+	dput(dentry);
+	return error;
+}
+
+int
+xfs_readlink_by_handle(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
+{
+	struct dentry		*dentry;
+	__u32			olen;
+	void			*link;
+	int			error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	/* Restrict this handle operation to symlinks only. */
+	if (!d_is_symlink(dentry)) {
+		error = -EINVAL;
+		goto out_dput;
+	}
+
+	if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
+		error = -EFAULT;
+		goto out_dput;
+	}
+
+	link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+	if (!link) {
+		error = -ENOMEM;
+		goto out_dput;
+	}
+
+	error = xfs_readlink(XFS_I(d_inode(dentry)), link);
+	if (error)
+		goto out_kfree;
+	error = readlink_copy(hreq->ohandle, olen, link);
+	if (error)
+		goto out_kfree;
+
+ out_kfree:
+	kfree(link);
+ out_dput:
+	dput(dentry);
+	return error;
+}
+
+int
+xfs_set_dmattrs(
+	xfs_inode_t     *ip,
+	u_int		evmask,
+	u_int16_t	state)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_trans_t	*tp;
+	int		error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	ip->i_d.di_dmevmask = evmask;
+	ip->i_d.di_dmstate  = state;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	error = xfs_trans_commit(tp, 0);
+
+	return error;
+}
+
+STATIC int
+xfs_fssetdm_by_handle(
+	struct file		*parfilp,
+	void			__user *arg)
+{
+	int			error;
+	struct fsdmidata	fsd;
+	xfs_fsop_setdm_handlereq_t dmhreq;
+	struct dentry		*dentry;
+
+	if (!capable(CAP_MKNOD))
+		return -EPERM;
+	if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
+		return -EFAULT;
+
+	error = mnt_want_write_file(parfilp);
+	if (error)
+		return error;
+
+	dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+	if (IS_ERR(dentry)) {
+		mnt_drop_write_file(parfilp);
+		return PTR_ERR(dentry);
+	}
+
+	if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) {
+		error = -EPERM;
+		goto out;
+	}
+
+	if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
+		error = -EFAULT;
+		goto out;
+	}
+
+	error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask,
+				 fsd.fsd_dmstate);
+
+ out:
+	mnt_drop_write_file(parfilp);
+	dput(dentry);
+	return error;
+}
+
+STATIC int
+xfs_attrlist_by_handle(
+	struct file		*parfilp,
+	void			__user *arg)
+{
+	int			error = -ENOMEM;
+	attrlist_cursor_kern_t	*cursor;
+	xfs_fsop_attrlist_handlereq_t al_hreq;
+	struct dentry		*dentry;
+	char			*kbuf;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
+		return -EFAULT;
+	if (al_hreq.buflen < sizeof(struct attrlist) ||
+	    al_hreq.buflen > XATTR_LIST_MAX)
+		return -EINVAL;
+
+	/*
+	 * Reject flags, only allow namespaces.
+	 */
+	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+		return -EINVAL;
+
+	dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	if (!kbuf)
+		goto out_dput;
+
+	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+	error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
+					al_hreq.flags, cursor);
+	if (error)
+		goto out_kfree;
+
+	if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
+		error = -EFAULT;
+
+out_kfree:
+	kmem_free(kbuf);
+out_dput:
+	dput(dentry);
+	return error;
+}
+
+int
+xfs_attrmulti_attr_get(
+	struct inode		*inode,
+	unsigned char		*name,
+	unsigned char		__user *ubuf,
+	__uint32_t		*len,
+	__uint32_t		flags)
+{
+	unsigned char		*kbuf;
+	int			error = -EFAULT;
+
+	if (*len > XATTR_SIZE_MAX)
+		return -EINVAL;
+	kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+	if (!kbuf)
+		return -ENOMEM;
+
+	error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
+	if (error)
+		goto out_kfree;
+
+	if (copy_to_user(ubuf, kbuf, *len))
+		error = -EFAULT;
+
+out_kfree:
+	kmem_free(kbuf);
+	return error;
+}
+
+int
+xfs_attrmulti_attr_set(
+	struct inode		*inode,
+	unsigned char		*name,
+	const unsigned char	__user *ubuf,
+	__uint32_t		len,
+	__uint32_t		flags)
+{
+	unsigned char		*kbuf;
+
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	if (len > XATTR_SIZE_MAX)
+		return -EINVAL;
+
+	kbuf = memdup_user(ubuf, len);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+}
+
+int
+xfs_attrmulti_attr_remove(
+	struct inode		*inode,
+	unsigned char		*name,
+	__uint32_t		flags)
+{
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	return xfs_attr_remove(XFS_I(inode), name, flags);
+}
+
+STATIC int
+xfs_attrmulti_by_handle(
+	struct file		*parfilp,
+	void			__user *arg)
+{
+	int			error;
+	xfs_attr_multiop_t	*ops;
+	xfs_fsop_attrmulti_handlereq_t am_hreq;
+	struct dentry		*dentry;
+	unsigned int		i, size;
+	unsigned char		*attr_name;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
+		return -EFAULT;
+
+	/* overflow check */
+	if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
+		return -E2BIG;
+
+	dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	error = -E2BIG;
+	size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
+	if (!size || size > 16 * PAGE_SIZE)
+		goto out_dput;
+
+	ops = memdup_user(am_hreq.ops, size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
+		goto out_dput;
+	}
+
+	error = -ENOMEM;
+	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
+	if (!attr_name)
+		goto out_kfree_ops;
+
+	error = 0;
+	for (i = 0; i < am_hreq.opcount; i++) {
+		ops[i].am_error = strncpy_from_user((char *)attr_name,
+				ops[i].am_attrname, MAXNAMELEN);
+		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
+			error = -ERANGE;
+		if (ops[i].am_error < 0)
+			break;
+
+		switch (ops[i].am_opcode) {
+		case ATTR_OP_GET:
+			ops[i].am_error = xfs_attrmulti_attr_get(
+					d_inode(dentry), attr_name,
+					ops[i].am_attrvalue, &ops[i].am_length,
+					ops[i].am_flags);
+			break;
+		case ATTR_OP_SET:
+			ops[i].am_error = mnt_want_write_file(parfilp);
+			if (ops[i].am_error)
+				break;
+			ops[i].am_error = xfs_attrmulti_attr_set(
+					d_inode(dentry), attr_name,
+					ops[i].am_attrvalue, ops[i].am_length,
+					ops[i].am_flags);
+			mnt_drop_write_file(parfilp);
+			break;
+		case ATTR_OP_REMOVE:
+			ops[i].am_error = mnt_want_write_file(parfilp);
+			if (ops[i].am_error)
+				break;
+			ops[i].am_error = xfs_attrmulti_attr_remove(
+					d_inode(dentry), attr_name,
+					ops[i].am_flags);
+			mnt_drop_write_file(parfilp);
+			break;
+		default:
+			ops[i].am_error = -EINVAL;
+		}
+	}
+
+	if (copy_to_user(am_hreq.ops, ops, size))
+		error = -EFAULT;
+
+	kfree(attr_name);
+ out_kfree_ops:
+	kfree(ops);
+ out_dput:
+	dput(dentry);
+	return error;
+}
+
+int
+xfs_ioc_space(
+	struct xfs_inode	*ip,
+	struct inode		*inode,
+	struct file		*filp,
+	int			ioflags,
+	unsigned int		cmd,
+	xfs_flock64_t		*bf)
+{
+	struct iattr		iattr;
+	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
+	int			error;
+
+	/*
+	 * Only allow the sys admin to reserve space unless
+	 * unwritten extents are enabled.
+	 */
+	if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
+		return -EPERM;
+
+	if (!(filp->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (filp->f_flags & O_DSYNC)
+		flags |= XFS_PREALLOC_SYNC;
+	if (ioflags & XFS_IO_INVIS)
+		flags |= XFS_PREALLOC_INVISIBLE;
+
+	error = mnt_want_write_file(filp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock, false);
+	if (error)
+		goto out_unlock;
+
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	iolock |= XFS_MMAPLOCK_EXCL;
+
+	switch (bf->l_whence) {
+	case 0: /*SEEK_SET*/
+		break;
+	case 1: /*SEEK_CUR*/
+		bf->l_start += filp->f_pos;
+		break;
+	case 2: /*SEEK_END*/
+		bf->l_start += XFS_ISIZE(ip);
+		break;
+	default:
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * length of <= 0 for resv/unresv/zero is invalid.  length for
+	 * alloc/free is ignored completely and we have no idea what userspace
+	 * might have set it to, so set it to zero to allow range
+	 * checks to pass.
+	 */
+	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		if (bf->l_len <= 0) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+		break;
+	default:
+		bf->l_len = 0;
+		break;
+	}
+
+	if (bf->l_start < 0 ||
+	    bf->l_start > inode->i_sb->s_maxbytes ||
+	    bf->l_start + bf->l_len < 0 ||
+	    bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		flags |= XFS_PREALLOC_SET;
+		error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
+		break;
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+		flags |= XFS_PREALLOC_SET;
+		error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
+						XFS_BMAPI_PREALLOC);
+		break;
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		error = xfs_free_file_space(ip, bf->l_start, bf->l_len);
+		break;
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_FREESP64:
+		flags |= XFS_PREALLOC_CLEAR;
+		if (bf->l_start > XFS_ISIZE(ip)) {
+			error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
+					bf->l_start - XFS_ISIZE(ip), 0);
+			if (error)
+				goto out_unlock;
+		}
+
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = bf->l_start;
+
+		error = xfs_setattr_size(ip, &iattr);
+		break;
+	default:
+		ASSERT(0);
+		error = -EINVAL;
+	}
+
+	if (error)
+		goto out_unlock;
+
+	error = xfs_update_prealloc_flags(ip, flags);
+
+out_unlock:
+	xfs_iunlock(ip, iolock);
+	mnt_drop_write_file(filp);
+	return error;
+}
+
+STATIC int
+xfs_ioc_bulkstat(
+	xfs_mount_t		*mp,
+	unsigned int		cmd,
+	void			__user *arg)
+{
+	xfs_fsop_bulkreq_t	bulkreq;
+	int			count;	/* # of records returned */
+	xfs_ino_t		inlast;	/* last inode number */
+	int			done;
+	int			error;
+
+	/* done = 1 if there are more stats to get and if bulkstat */
+	/* should be called again (unused here, but used in dmapi) */
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
+		return -EFAULT;
+
+	if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+		return -EFAULT;
+
+	if ((count = bulkreq.icount) <= 0)
+		return -EINVAL;
+
+	if (bulkreq.ubuffer == NULL)
+		return -EINVAL;
+
+	if (cmd == XFS_IOC_FSINUMBERS)
+		error = xfs_inumbers(mp, &inlast, &count,
+					bulkreq.ubuffer, xfs_inumbers_fmt);
+	else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
+		error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
+					sizeof(xfs_bstat_t), NULL, &done);
+	else	/* XFS_IOC_FSBULKSTAT */
+		error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
+				     sizeof(xfs_bstat_t), bulkreq.ubuffer,
+				     &done);
+
+	if (error)
+		return error;
+
+	if (bulkreq.ocount != NULL) {
+		if (copy_to_user(bulkreq.lastip, &inlast,
+						sizeof(xfs_ino_t)))
+			return -EFAULT;
+
+		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+STATIC int
+xfs_ioc_fsgeometry_v1(
+	xfs_mount_t		*mp,
+	void			__user *arg)
+{
+	xfs_fsop_geom_t         fsgeo;
+	int			error;
+
+	error = xfs_fs_geometry(mp, &fsgeo, 3);
+	if (error)
+		return error;
+
+	/*
+	 * Caller should have passed an argument of type
+	 * xfs_fsop_geom_v1_t.  This is a proper subset of the
+	 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
+	 */
+	if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC int
+xfs_ioc_fsgeometry(
+	xfs_mount_t		*mp,
+	void			__user *arg)
+{
+	xfs_fsop_geom_t		fsgeo;
+	int			error;
+
+	error = xfs_fs_geometry(mp, &fsgeo, 4);
+	if (error)
+		return error;
+
+	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Linux extended inode flags interface.
+ */
+
+STATIC unsigned int
+xfs_merge_ioc_xflags(
+	unsigned int	flags,
+	unsigned int	start)
+{
+	unsigned int	xflags = start;
+
+	if (flags & FS_IMMUTABLE_FL)
+		xflags |= XFS_XFLAG_IMMUTABLE;
+	else
+		xflags &= ~XFS_XFLAG_IMMUTABLE;
+	if (flags & FS_APPEND_FL)
+		xflags |= XFS_XFLAG_APPEND;
+	else
+		xflags &= ~XFS_XFLAG_APPEND;
+	if (flags & FS_SYNC_FL)
+		xflags |= XFS_XFLAG_SYNC;
+	else
+		xflags &= ~XFS_XFLAG_SYNC;
+	if (flags & FS_NOATIME_FL)
+		xflags |= XFS_XFLAG_NOATIME;
+	else
+		xflags &= ~XFS_XFLAG_NOATIME;
+	if (flags & FS_NODUMP_FL)
+		xflags |= XFS_XFLAG_NODUMP;
+	else
+		xflags &= ~XFS_XFLAG_NODUMP;
+
+	return xflags;
+}
+
+STATIC unsigned int
+xfs_di2lxflags(
+	__uint16_t	di_flags)
+{
+	unsigned int	flags = 0;
+
+	if (di_flags & XFS_DIFLAG_IMMUTABLE)
+		flags |= FS_IMMUTABLE_FL;
+	if (di_flags & XFS_DIFLAG_APPEND)
+		flags |= FS_APPEND_FL;
+	if (di_flags & XFS_DIFLAG_SYNC)
+		flags |= FS_SYNC_FL;
+	if (di_flags & XFS_DIFLAG_NOATIME)
+		flags |= FS_NOATIME_FL;
+	if (di_flags & XFS_DIFLAG_NODUMP)
+		flags |= FS_NODUMP_FL;
+	return flags;
+}
+
+STATIC int
+xfs_ioc_fsgetxattr(
+	xfs_inode_t		*ip,
+	int			attr,
+	void			__user *arg)
+{
+	struct fsxattr		fa;
+
+	memset(&fa, 0, sizeof(struct fsxattr));
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	fa.fsx_xflags = xfs_ip2xflags(ip);
+	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+	fa.fsx_projid = xfs_get_projid(ip);
+
+	if (attr) {
+		if (ip->i_afp) {
+			if (ip->i_afp->if_flags & XFS_IFEXTENTS)
+				fa.fsx_nextents = ip->i_afp->if_bytes /
+							sizeof(xfs_bmbt_rec_t);
+			else
+				fa.fsx_nextents = ip->i_d.di_anextents;
+		} else
+			fa.fsx_nextents = 0;
+	} else {
+		if (ip->i_df.if_flags & XFS_IFEXTENTS)
+			fa.fsx_nextents = ip->i_df.if_bytes /
+						sizeof(xfs_bmbt_rec_t);
+		else
+			fa.fsx_nextents = ip->i_d.di_nextents;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (copy_to_user(arg, &fa, sizeof(fa)))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC void
+xfs_set_diflags(
+	struct xfs_inode	*ip,
+	unsigned int		xflags)
+{
+	unsigned int		di_flags;
+
+	/* can't set PREALLOC this way, just preserve it */
+	di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+	if (xflags & XFS_XFLAG_IMMUTABLE)
+		di_flags |= XFS_DIFLAG_IMMUTABLE;
+	if (xflags & XFS_XFLAG_APPEND)
+		di_flags |= XFS_DIFLAG_APPEND;
+	if (xflags & XFS_XFLAG_SYNC)
+		di_flags |= XFS_DIFLAG_SYNC;
+	if (xflags & XFS_XFLAG_NOATIME)
+		di_flags |= XFS_DIFLAG_NOATIME;
+	if (xflags & XFS_XFLAG_NODUMP)
+		di_flags |= XFS_DIFLAG_NODUMP;
+	if (xflags & XFS_XFLAG_NODEFRAG)
+		di_flags |= XFS_DIFLAG_NODEFRAG;
+	if (xflags & XFS_XFLAG_FILESTREAM)
+		di_flags |= XFS_DIFLAG_FILESTREAM;
+	if (S_ISDIR(ip->i_d.di_mode)) {
+		if (xflags & XFS_XFLAG_RTINHERIT)
+			di_flags |= XFS_DIFLAG_RTINHERIT;
+		if (xflags & XFS_XFLAG_NOSYMLINKS)
+			di_flags |= XFS_DIFLAG_NOSYMLINKS;
+		if (xflags & XFS_XFLAG_EXTSZINHERIT)
+			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+		if (xflags & XFS_XFLAG_PROJINHERIT)
+			di_flags |= XFS_DIFLAG_PROJINHERIT;
+	} else if (S_ISREG(ip->i_d.di_mode)) {
+		if (xflags & XFS_XFLAG_REALTIME)
+			di_flags |= XFS_DIFLAG_REALTIME;
+		if (xflags & XFS_XFLAG_EXTSIZE)
+			di_flags |= XFS_DIFLAG_EXTSIZE;
+	}
+
+	ip->i_d.di_flags = di_flags;
+}
+
+STATIC void
+xfs_diflags_to_linux(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+	unsigned int		xflags = xfs_ip2xflags(ip);
+
+	if (xflags & XFS_XFLAG_IMMUTABLE)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+	if (xflags & XFS_XFLAG_APPEND)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+	if (xflags & XFS_XFLAG_SYNC)
+		inode->i_flags |= S_SYNC;
+	else
+		inode->i_flags &= ~S_SYNC;
+	if (xflags & XFS_XFLAG_NOATIME)
+		inode->i_flags |= S_NOATIME;
+	else
+		inode->i_flags &= ~S_NOATIME;
+}
+
+static int
+xfs_ioctl_setattr_xflags(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	/* Can't change realtime flag if any extents are allocated. */
+	if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+	    XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
+		return -EINVAL;
+
+	/* If realtime flag is set then must have realtime device */
+	if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
+		if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
+		    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
+			return -EINVAL;
+	}
+
+	/*
+	 * Can't modify an immutable/append-only file unless
+	 * we have appropriate permission.
+	 */
+	if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
+	     (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	xfs_set_diflags(ip, fa->fsx_xflags);
+	xfs_diflags_to_linux(ip);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	XFS_STATS_INC(xs_ig_attrchg);
+	return 0;
+}
+
+/*
+ * Set up the transaction structure for the setattr operation, checking that we
+ * have permission to do so. On success, return a clean transaction and the
+ * inode locked exclusively ready for further operation specific checks. On
+ * failure, return an error without modifying or locking the inode.
+ */
+static struct xfs_trans *
+xfs_ioctl_setattr_get_trans(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return ERR_PTR(-EROFS);
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return ERR_PTR(-EIO);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error)
+		goto out_cancel;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * CAP_FOWNER overrides the following restrictions:
+	 *
+	 * The user ID of the calling process must be equal to the file owner
+	 * ID, except in cases where the CAP_FSETID capability is applicable.
+	 */
+	if (!inode_owner_or_capable(VFS_I(ip))) {
+		error = -EPERM;
+		goto out_cancel;
+	}
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	return tp;
+
+out_cancel:
+	xfs_trans_cancel(tp, 0);
+	return ERR_PTR(error);
+}
+
+/*
+ * extent size hint validation is somewhat cumbersome. Rules are:
+ *
+ * 1. extent size hint is only valid for directories and regular files
+ * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
+ * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 4. can only be changed on regular files if no extents are allocated
+ * 5. can be changed on directories at any time
+ * 6. extsize hint of 0 turns off hints, clears inode flags.
+ * 7. Extent size must be a multiple of the appropriate block size.
+ * 8. for non-realtime files, the extent size hint must be limited
+ *    to half the AG size to avoid alignment extending the extent beyond the
+ *    limits of the AG.
+ */
+static int
+xfs_ioctl_setattr_check_extsize(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
+	    !S_ISDIR(ip->i_d.di_mode))
+		return -EINVAL;
+
+	if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
+	    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
+		return -EINVAL;
+
+	if (fa->fsx_extsize != 0) {
+		xfs_extlen_t    size;
+		xfs_fsblock_t   extsize_fsb;
+
+		extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+		if (extsize_fsb > MAXEXTLEN)
+			return -EINVAL;
+
+		if (XFS_IS_REALTIME_INODE(ip) ||
+		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+			size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+		} else {
+			size = mp->m_sb.sb_blocksize;
+			if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
+				return -EINVAL;
+		}
+
+		if (fa->fsx_extsize % size)
+			return -EINVAL;
+	} else
+		fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
+
+	return 0;
+}
+
+static int
+xfs_ioctl_setattr_check_projid(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	/* Disallow 32bit project ids if projid32bit feature is not enabled. */
+	if (fa->fsx_projid > (__uint16_t)-1 &&
+	    !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+		return -EINVAL;
+
+	/*
+	 * Project Quota ID state is only allowed to change from within the init
+	 * namespace. Enforce that restriction only if we are trying to change
+	 * the quota ID state. Everything else is allowed in user namespaces.
+	 */
+	if (current_user_ns() == &init_user_ns)
+		return 0;
+
+	if (xfs_get_projid(ip) != fa->fsx_projid)
+		return -EINVAL;
+	if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
+	    (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
+		return -EINVAL;
+
+	return 0;
+}
+
+STATIC int
+xfs_ioctl_setattr(
+	xfs_inode_t		*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_dquot	*olddquot = NULL;
+	int			code;
+
+	trace_xfs_ioctl_setattr(ip);
+
+	code = xfs_ioctl_setattr_check_projid(ip, fa);
+	if (code)
+		return code;
+
+	/*
+	 * If disk quotas is on, we make sure that the dquots do exist on disk,
+	 * before we start any other transactions. Trying to do this later
+	 * is messy. We don't care to take a readlock to look at the ids
+	 * in inode here, because we can't hold it across the trans_reserve.
+	 * If the IDs do change before we take the ilock, we're covered
+	 * because the i_*dquot fields will get updated anyway.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
+					 ip->i_d.di_gid, fa->fsx_projid,
+					 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
+		if (code)
+			return code;
+	}
+
+	tp = xfs_ioctl_setattr_get_trans(ip);
+	if (IS_ERR(tp)) {
+		code = PTR_ERR(tp);
+		goto error_free_dquots;
+	}
+
+
+	if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
+	    xfs_get_projid(ip) != fa->fsx_projid) {
+		code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
+				capable(CAP_FOWNER) ?  XFS_QMOPT_FORCE_RES : 0);
+		if (code)	/* out of quota */
+			goto error_trans_cancel;
+	}
+
+	code = xfs_ioctl_setattr_check_extsize(ip, fa);
+	if (code)
+		goto error_trans_cancel;
+
+	code = xfs_ioctl_setattr_xflags(tp, ip, fa);
+	if (code)
+		goto error_trans_cancel;
+
+	/*
+	 * Change file ownership.  Must be the owner or privileged.  CAP_FSETID
+	 * overrides the following restrictions:
+	 *
+	 * The set-user-ID and set-group-ID bits of a file will be cleared upon
+	 * successful return from chown()
+	 */
+
+	if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+	    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+		ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+	/* Change the ownerships and register project quota modifications */
+	if (xfs_get_projid(ip) != fa->fsx_projid) {
+		if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+			olddquot = xfs_qm_vop_chown(tp, ip,
+						&ip->i_pdquot, pdqp);
+		}
+		ASSERT(ip->i_d.di_version > 1);
+		xfs_set_projid(ip, fa->fsx_projid);
+	}
+
+	/*
+	 * Only set the extent size hint if we've already determined that the
+	 * extent size hint should be set on the inode. If no extent size flags
+	 * are set on the inode then unconditionally clear the extent size hint.
+	 */
+	if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
+		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+	else
+		ip->i_d.di_extsize = 0;
+
+	code = xfs_trans_commit(tp, 0);
+
+	/*
+	 * Release any dquot(s) the inode had kept before chown.
+	 */
+	xfs_qm_dqrele(olddquot);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(pdqp);
+
+	return code;
+
+error_trans_cancel:
+	xfs_trans_cancel(tp, 0);
+error_free_dquots:
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(pdqp);
+	return code;
+}
+
+STATIC int
+xfs_ioc_fssetxattr(
+	xfs_inode_t		*ip,
+	struct file		*filp,
+	void			__user *arg)
+{
+	struct fsxattr		fa;
+	int error;
+
+	if (copy_from_user(&fa, arg, sizeof(fa)))
+		return -EFAULT;
+
+	error = mnt_want_write_file(filp);
+	if (error)
+		return error;
+	error = xfs_ioctl_setattr(ip, &fa);
+	mnt_drop_write_file(filp);
+	return error;
+}
+
+STATIC int
+xfs_ioc_getxflags(
+	xfs_inode_t		*ip,
+	void			__user *arg)
+{
+	unsigned int		flags;
+
+	flags = xfs_di2lxflags(ip->i_d.di_flags);
+	if (copy_to_user(arg, &flags, sizeof(flags)))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC int
+xfs_ioc_setxflags(
+	struct xfs_inode	*ip,
+	struct file		*filp,
+	void			__user *arg)
+{
+	struct xfs_trans	*tp;
+	struct fsxattr		fa;
+	unsigned int		flags;
+	int			error;
+
+	if (copy_from_user(&flags, arg, sizeof(flags)))
+		return -EFAULT;
+
+	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+		      FS_NOATIME_FL | FS_NODUMP_FL | \
+		      FS_SYNC_FL))
+		return -EOPNOTSUPP;
+
+	fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
+
+	error = mnt_want_write_file(filp);
+	if (error)
+		return error;
+
+	tp = xfs_ioctl_setattr_get_trans(ip);
+	if (IS_ERR(tp)) {
+		error = PTR_ERR(tp);
+		goto out_drop_write;
+	}
+
+	error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_drop_write;
+	}
+
+	error = xfs_trans_commit(tp, 0);
+out_drop_write:
+	mnt_drop_write_file(filp);
+	return error;
+}
+
+STATIC int
+xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
+{
+	struct getbmap __user	*base = (struct getbmap __user *)*ap;
+
+	/* copy only getbmap portion (not getbmapx) */
+	if (copy_to_user(base, bmv, sizeof(struct getbmap)))
+		return -EFAULT;
+
+	*ap += sizeof(struct getbmap);
+	return 0;
+}
+
+STATIC int
+xfs_ioc_getbmap(
+	struct xfs_inode	*ip,
+	int			ioflags,
+	unsigned int		cmd,
+	void			__user *arg)
+{
+	struct getbmapx		bmx;
+	int			error;
+
+	if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
+		return -EFAULT;
+
+	if (bmx.bmv_count < 2)
+		return -EINVAL;
+
+	bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
+	if (ioflags & XFS_IO_INVIS)
+		bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
+
+	error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+			    (__force struct getbmap *)arg+1);
+	if (error)
+		return error;
+
+	/* copy back header - only size of getbmap */
+	if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC int
+xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
+{
+	struct getbmapx __user	*base = (struct getbmapx __user *)*ap;
+
+	if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
+		return -EFAULT;
+
+	*ap += sizeof(struct getbmapx);
+	return 0;
+}
+
+STATIC int
+xfs_ioc_getbmapx(
+	struct xfs_inode	*ip,
+	void			__user *arg)
+{
+	struct getbmapx		bmx;
+	int			error;
+
+	if (copy_from_user(&bmx, arg, sizeof(bmx)))
+		return -EFAULT;
+
+	if (bmx.bmv_count < 2)
+		return -EINVAL;
+
+	if (bmx.bmv_iflags & (~BMV_IF_VALID))
+		return -EINVAL;
+
+	error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
+			    (__force struct getbmapx *)arg+1);
+	if (error)
+		return error;
+
+	/* copy back header */
+	if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int
+xfs_ioc_swapext(
+	xfs_swapext_t	*sxp)
+{
+	xfs_inode_t     *ip, *tip;
+	struct fd	f, tmp;
+	int		error = 0;
+
+	/* Pull information for the target fd */
+	f = fdget((int)sxp->sx_fdtarget);
+	if (!f.file) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (!(f.file->f_mode & FMODE_WRITE) ||
+	    !(f.file->f_mode & FMODE_READ) ||
+	    (f.file->f_flags & O_APPEND)) {
+		error = -EBADF;
+		goto out_put_file;
+	}
+
+	tmp = fdget((int)sxp->sx_fdtmp);
+	if (!tmp.file) {
+		error = -EINVAL;
+		goto out_put_file;
+	}
+
+	if (!(tmp.file->f_mode & FMODE_WRITE) ||
+	    !(tmp.file->f_mode & FMODE_READ) ||
+	    (tmp.file->f_flags & O_APPEND)) {
+		error = -EBADF;
+		goto out_put_tmp_file;
+	}
+
+	if (IS_SWAPFILE(file_inode(f.file)) ||
+	    IS_SWAPFILE(file_inode(tmp.file))) {
+		error = -EINVAL;
+		goto out_put_tmp_file;
+	}
+
+	ip = XFS_I(file_inode(f.file));
+	tip = XFS_I(file_inode(tmp.file));
+
+	if (ip->i_mount != tip->i_mount) {
+		error = -EINVAL;
+		goto out_put_tmp_file;
+	}
+
+	if (ip->i_ino == tip->i_ino) {
+		error = -EINVAL;
+		goto out_put_tmp_file;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		error = -EIO;
+		goto out_put_tmp_file;
+	}
+
+	error = xfs_swap_extents(ip, tip, sxp);
+
+ out_put_tmp_file:
+	fdput(tmp);
+ out_put_file:
+	fdput(f);
+ out:
+	return error;
+}
+
+/*
+ * Note: some of the ioctl's return positive numbers as a
+ * byte count indicating success, such as readlink_by_handle.
+ * So we don't "sign flip" like most other routines.  This means
+ * true errors need to be returned as a negative value.
+ */
+long
+xfs_file_ioctl(
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		p)
+{
+	struct inode		*inode = file_inode(filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	void			__user *arg = (void __user *)p;
+	int			ioflags = 0;
+	int			error;
+
+	if (filp->f_mode & FMODE_NOCMTIME)
+		ioflags |= XFS_IO_INVIS;
+
+	trace_xfs_file_ioctl(ip);
+
+	switch (cmd) {
+	case FITRIM:
+		return xfs_ioc_trim(mp, arg);
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP64:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP64:
+	case XFS_IOC_ZERO_RANGE: {
+		xfs_flock64_t		bf;
+
+		if (copy_from_user(&bf, arg, sizeof(bf)))
+			return -EFAULT;
+		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+	}
+	case XFS_IOC_DIOINFO: {
+		struct dioattr	da;
+		xfs_buftarg_t	*target =
+			XFS_IS_REALTIME_INODE(ip) ?
+			mp->m_rtdev_targp : mp->m_ddev_targp;
+
+		da.d_mem =  da.d_miniosz = target->bt_logical_sectorsize;
+		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
+
+		if (copy_to_user(arg, &da, sizeof(da)))
+			return -EFAULT;
+		return 0;
+	}
+
+	case XFS_IOC_FSBULKSTAT_SINGLE:
+	case XFS_IOC_FSBULKSTAT:
+	case XFS_IOC_FSINUMBERS:
+		return xfs_ioc_bulkstat(mp, cmd, arg);
+
+	case XFS_IOC_FSGEOMETRY_V1:
+		return xfs_ioc_fsgeometry_v1(mp, arg);
+
+	case XFS_IOC_FSGEOMETRY:
+		return xfs_ioc_fsgeometry(mp, arg);
+
+	case XFS_IOC_GETVERSION:
+		return put_user(inode->i_generation, (int __user *)arg);
+
+	case XFS_IOC_FSGETXATTR:
+		return xfs_ioc_fsgetxattr(ip, 0, arg);
+	case XFS_IOC_FSGETXATTRA:
+		return xfs_ioc_fsgetxattr(ip, 1, arg);
+	case XFS_IOC_FSSETXATTR:
+		return xfs_ioc_fssetxattr(ip, filp, arg);
+	case XFS_IOC_GETXFLAGS:
+		return xfs_ioc_getxflags(ip, arg);
+	case XFS_IOC_SETXFLAGS:
+		return xfs_ioc_setxflags(ip, filp, arg);
+
+	case XFS_IOC_FSSETDM: {
+		struct fsdmidata	dmi;
+
+		if (copy_from_user(&dmi, arg, sizeof(dmi)))
+			return -EFAULT;
+
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+
+		error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
+				dmi.fsd_dmstate);
+		mnt_drop_write_file(filp);
+		return error;
+	}
+
+	case XFS_IOC_GETBMAP:
+	case XFS_IOC_GETBMAPA:
+		return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+
+	case XFS_IOC_GETBMAPX:
+		return xfs_ioc_getbmapx(ip, arg);
+
+	case XFS_IOC_FD_TO_HANDLE:
+	case XFS_IOC_PATH_TO_HANDLE:
+	case XFS_IOC_PATH_TO_FSHANDLE: {
+		xfs_fsop_handlereq_t	hreq;
+
+		if (copy_from_user(&hreq, arg, sizeof(hreq)))
+			return -EFAULT;
+		return xfs_find_handle(cmd, &hreq);
+	}
+	case XFS_IOC_OPEN_BY_HANDLE: {
+		xfs_fsop_handlereq_t	hreq;
+
+		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+			return -EFAULT;
+		return xfs_open_by_handle(filp, &hreq);
+	}
+	case XFS_IOC_FSSETDM_BY_HANDLE:
+		return xfs_fssetdm_by_handle(filp, arg);
+
+	case XFS_IOC_READLINK_BY_HANDLE: {
+		xfs_fsop_handlereq_t	hreq;
+
+		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+			return -EFAULT;
+		return xfs_readlink_by_handle(filp, &hreq);
+	}
+	case XFS_IOC_ATTRLIST_BY_HANDLE:
+		return xfs_attrlist_by_handle(filp, arg);
+
+	case XFS_IOC_ATTRMULTI_BY_HANDLE:
+		return xfs_attrmulti_by_handle(filp, arg);
+
+	case XFS_IOC_SWAPEXT: {
+		struct xfs_swapext	sxp;
+
+		if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
+			return -EFAULT;
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+		error = xfs_ioc_swapext(&sxp);
+		mnt_drop_write_file(filp);
+		return error;
+	}
+
+	case XFS_IOC_FSCOUNTS: {
+		xfs_fsop_counts_t out;
+
+		error = xfs_fs_counts(mp, &out);
+		if (error)
+			return error;
+
+		if (copy_to_user(arg, &out, sizeof(out)))
+			return -EFAULT;
+		return 0;
+	}
+
+	case XFS_IOC_SET_RESBLKS: {
+		xfs_fsop_resblks_t inout;
+		__uint64_t	   in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (mp->m_flags & XFS_MOUNT_RDONLY)
+			return -EROFS;
+
+		if (copy_from_user(&inout, arg, sizeof(inout)))
+			return -EFAULT;
+
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+
+		/* input parameter is passed in resblks field of structure */
+		in = inout.resblks;
+		error = xfs_reserve_blocks(mp, &in, &inout);
+		mnt_drop_write_file(filp);
+		if (error)
+			return error;
+
+		if (copy_to_user(arg, &inout, sizeof(inout)))
+			return -EFAULT;
+		return 0;
+	}
+
+	case XFS_IOC_GET_RESBLKS: {
+		xfs_fsop_resblks_t out;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		error = xfs_reserve_blocks(mp, NULL, &out);
+		if (error)
+			return error;
+
+		if (copy_to_user(arg, &out, sizeof(out)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSDATA: {
+		xfs_growfs_data_t in;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -EFAULT;
+
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+		error = xfs_growfs_data(mp, &in);
+		mnt_drop_write_file(filp);
+		return error;
+	}
+
+	case XFS_IOC_FSGROWFSLOG: {
+		xfs_growfs_log_t in;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -EFAULT;
+
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+		error = xfs_growfs_log(mp, &in);
+		mnt_drop_write_file(filp);
+		return error;
+	}
+
+	case XFS_IOC_FSGROWFSRT: {
+		xfs_growfs_rt_t in;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -EFAULT;
+
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+		error = xfs_growfs_rt(mp, &in);
+		mnt_drop_write_file(filp);
+		return error;
+	}
+
+	case XFS_IOC_GOINGDOWN: {
+		__uint32_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (get_user(in, (__uint32_t __user *)arg))
+			return -EFAULT;
+
+		return xfs_fs_goingdown(mp, in);
+	}
+
+	case XFS_IOC_ERROR_INJECTION: {
+		xfs_error_injection_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -EFAULT;
+
+		return xfs_errortag_add(in.errtag, mp);
+	}
+
+	case XFS_IOC_ERROR_CLEARALL:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		return xfs_errortag_clearall(mp, 1);
+
+	case XFS_IOC_FREE_EOFBLOCKS: {
+		struct xfs_fs_eofblocks eofb;
+		struct xfs_eofblocks keofb;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (mp->m_flags & XFS_MOUNT_RDONLY)
+			return -EROFS;
+
+		if (copy_from_user(&eofb, arg, sizeof(eofb)))
+			return -EFAULT;
+
+		error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
+		if (error)
+			return error;
+
+		return xfs_icache_free_eofblocks(mp, &keofb);
+	}
+
+	default:
+		return -ENOTTY;
+	}
+}
diff --git a/kernel/fs/xfs/xfs_ioctl.h b/kernel/fs/xfs/xfs_ioctl.h
new file mode 100644
index 000000000..77c02c790
--- /dev/null
+++ b/kernel/fs/xfs/xfs_ioctl.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IOCTL_H__
+#define __XFS_IOCTL_H__
+
+extern int
+xfs_ioc_space(
+	struct xfs_inode	*ip,
+	struct inode		*inode,
+	struct file		*filp,
+	int			ioflags,
+	unsigned int		cmd,
+	xfs_flock64_t		*bf);
+
+int
+xfs_ioc_swapext(
+	xfs_swapext_t	*sxp);
+
+extern int
+xfs_find_handle(
+	unsigned int		cmd,
+	xfs_fsop_handlereq_t	*hreq);
+
+extern int
+xfs_open_by_handle(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq);
+
+extern int
+xfs_readlink_by_handle(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq);
+
+extern int
+xfs_attrmulti_attr_get(
+	struct inode		*inode,
+	unsigned char		*name,
+	unsigned char		__user *ubuf,
+	__uint32_t		*len,
+	__uint32_t		flags);
+
+extern int
+xfs_attrmulti_attr_set(
+	struct inode		*inode,
+	unsigned char		*name,
+	const unsigned char	__user *ubuf,
+	__uint32_t		len,
+	__uint32_t		flags);
+
+extern int
+xfs_attrmulti_attr_remove(
+	struct inode		*inode,
+	unsigned char		*name,
+	__uint32_t		flags);
+
+extern struct dentry *
+xfs_handle_to_dentry(
+	struct file		*parfilp,
+	void __user		*uhandle,
+	u32			hlen);
+
+extern long
+xfs_file_ioctl(
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		p);
+
+extern long
+xfs_file_compat_ioctl(
+	struct file		*file,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+extern int
+xfs_set_dmattrs(
+	struct xfs_inode	*ip,
+	u_int			evmask,
+	u_int16_t		state);
+
+#endif
diff --git a/kernel/fs/xfs/xfs_ioctl32.c b/kernel/fs/xfs/xfs_ioctl32.c
new file mode 100644
index 000000000..b88bdc85d
--- /dev/null
+++ b/kernel/fs/xfs/xfs_ioctl32.c
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/compat.h>
+#include <linux/ioctl.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_fsops.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_attr.h"
+#include "xfs_ioctl.h"
+#include "xfs_ioctl32.h"
+#include "xfs_trace.h"
+
+#define  _NATIVE_IOC(cmd, type) \
+	  _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
+
+#ifdef BROKEN_X86_ALIGNMENT
+STATIC int
+xfs_compat_flock64_copyin(
+	xfs_flock64_t		*bf,
+	compat_xfs_flock64_t	__user *arg32)
+{
+	if (get_user(bf->l_type,	&arg32->l_type) ||
+	    get_user(bf->l_whence,	&arg32->l_whence) ||
+	    get_user(bf->l_start,	&arg32->l_start) ||
+	    get_user(bf->l_len,		&arg32->l_len) ||
+	    get_user(bf->l_sysid,	&arg32->l_sysid) ||
+	    get_user(bf->l_pid,		&arg32->l_pid) ||
+	    copy_from_user(bf->l_pad,	&arg32->l_pad,	4*sizeof(u32)))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC int
+xfs_compat_ioc_fsgeometry_v1(
+	struct xfs_mount	  *mp,
+	compat_xfs_fsop_geom_v1_t __user *arg32)
+{
+	xfs_fsop_geom_t		  fsgeo;
+	int			  error;
+
+	error = xfs_fs_geometry(mp, &fsgeo, 3);
+	if (error)
+		return error;
+	/* The 32-bit variant simply has some padding at the end */
+	if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC int
+xfs_compat_growfs_data_copyin(
+	struct xfs_growfs_data	 *in,
+	compat_xfs_growfs_data_t __user *arg32)
+{
+	if (get_user(in->newblocks, &arg32->newblocks) ||
+	    get_user(in->imaxpct,   &arg32->imaxpct))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC int
+xfs_compat_growfs_rt_copyin(
+	struct xfs_growfs_rt	 *in,
+	compat_xfs_growfs_rt_t	__user *arg32)
+{
+	if (get_user(in->newblocks, &arg32->newblocks) ||
+	    get_user(in->extsize,   &arg32->extsize))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC int
+xfs_inumbers_fmt_compat(
+	void			__user *ubuffer,
+	const struct xfs_inogrp	*buffer,
+	long			count,
+	long			*written)
+{
+	compat_xfs_inogrp_t	__user *p32 = ubuffer;
+	long			i;
+
+	for (i = 0; i < count; i++) {
+		if (put_user(buffer[i].xi_startino,   &p32[i].xi_startino) ||
+		    put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
+		    put_user(buffer[i].xi_allocmask,  &p32[i].xi_allocmask))
+			return -EFAULT;
+	}
+	*written = count * sizeof(*p32);
+	return 0;
+}
+
+#else
+#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
+#endif	/* BROKEN_X86_ALIGNMENT */
+
+STATIC int
+xfs_ioctl32_bstime_copyin(
+	xfs_bstime_t		*bstime,
+	compat_xfs_bstime_t	__user *bstime32)
+{
+	compat_time_t		sec32;	/* tv_sec differs on 64 vs. 32 */
+
+	if (get_user(sec32,		&bstime32->tv_sec)	||
+	    get_user(bstime->tv_nsec,	&bstime32->tv_nsec))
+		return -EFAULT;
+	bstime->tv_sec = sec32;
+	return 0;
+}
+
+/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
+STATIC int
+xfs_ioctl32_bstat_copyin(
+	xfs_bstat_t		*bstat,
+	compat_xfs_bstat_t	__user *bstat32)
+{
+	if (get_user(bstat->bs_ino,	&bstat32->bs_ino)	||
+	    get_user(bstat->bs_mode,	&bstat32->bs_mode)	||
+	    get_user(bstat->bs_nlink,	&bstat32->bs_nlink)	||
+	    get_user(bstat->bs_uid,	&bstat32->bs_uid)	||
+	    get_user(bstat->bs_gid,	&bstat32->bs_gid)	||
+	    get_user(bstat->bs_rdev,	&bstat32->bs_rdev)	||
+	    get_user(bstat->bs_blksize,	&bstat32->bs_blksize)	||
+	    get_user(bstat->bs_size,	&bstat32->bs_size)	||
+	    xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
+	    xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
+	    xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
+	    get_user(bstat->bs_blocks,	&bstat32->bs_size)	||
+	    get_user(bstat->bs_xflags,	&bstat32->bs_size)	||
+	    get_user(bstat->bs_extsize,	&bstat32->bs_extsize)	||
+	    get_user(bstat->bs_extents,	&bstat32->bs_extents)	||
+	    get_user(bstat->bs_gen,	&bstat32->bs_gen)	||
+	    get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
+	    get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
+	    get_user(bstat->bs_forkoff,	&bstat32->bs_forkoff)	||
+	    get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask)	||
+	    get_user(bstat->bs_dmstate,	&bstat32->bs_dmstate)	||
+	    get_user(bstat->bs_aextents, &bstat32->bs_aextents))
+		return -EFAULT;
+	return 0;
+}
+
+/* XFS_IOC_FSBULKSTAT and friends */
+
+STATIC int
+xfs_bstime_store_compat(
+	compat_xfs_bstime_t	__user *p32,
+	const xfs_bstime_t	*p)
+{
+	__s32			sec32;
+
+	sec32 = p->tv_sec;
+	if (put_user(sec32, &p32->tv_sec) ||
+	    put_user(p->tv_nsec, &p32->tv_nsec))
+		return -EFAULT;
+	return 0;
+}
+
+/* Return 0 on success or positive error (to xfs_bulkstat()) */
+STATIC int
+xfs_bulkstat_one_fmt_compat(
+	void			__user *ubuffer,
+	int			ubsize,
+	int			*ubused,
+	const xfs_bstat_t	*buffer)
+{
+	compat_xfs_bstat_t	__user *p32 = ubuffer;
+
+	if (ubsize < sizeof(*p32))
+		return -ENOMEM;
+
+	if (put_user(buffer->bs_ino,	  &p32->bs_ino)		||
+	    put_user(buffer->bs_mode,	  &p32->bs_mode)	||
+	    put_user(buffer->bs_nlink,	  &p32->bs_nlink)	||
+	    put_user(buffer->bs_uid,	  &p32->bs_uid)		||
+	    put_user(buffer->bs_gid,	  &p32->bs_gid)		||
+	    put_user(buffer->bs_rdev,	  &p32->bs_rdev)	||
+	    put_user(buffer->bs_blksize,  &p32->bs_blksize)	||
+	    put_user(buffer->bs_size,	  &p32->bs_size)	||
+	    xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
+	    xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
+	    xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
+	    put_user(buffer->bs_blocks,	  &p32->bs_blocks)	||
+	    put_user(buffer->bs_xflags,	  &p32->bs_xflags)	||
+	    put_user(buffer->bs_extsize,  &p32->bs_extsize)	||
+	    put_user(buffer->bs_extents,  &p32->bs_extents)	||
+	    put_user(buffer->bs_gen,	  &p32->bs_gen)		||
+	    put_user(buffer->bs_projid,	  &p32->bs_projid)	||
+	    put_user(buffer->bs_projid_hi,	&p32->bs_projid_hi)	||
+	    put_user(buffer->bs_forkoff,  &p32->bs_forkoff)	||
+	    put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)	||
+	    put_user(buffer->bs_dmstate,  &p32->bs_dmstate)	||
+	    put_user(buffer->bs_aextents, &p32->bs_aextents))
+		return -EFAULT;
+	if (ubused)
+		*ubused = sizeof(*p32);
+	return 0;
+}
+
+STATIC int
+xfs_bulkstat_one_compat(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		__user *buffer,	/* buffer to place output in */
+	int		ubsize,		/* size of buffer */
+	int		*ubused,	/* bytes used by me */
+	int		*stat)		/* BULKSTAT_RV_... */
+{
+	return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+				    xfs_bulkstat_one_fmt_compat,
+				    ubused, stat);
+}
+
+/* copied from xfs_ioctl.c */
+STATIC int
+xfs_compat_ioc_bulkstat(
+	xfs_mount_t		  *mp,
+	unsigned int		  cmd,
+	compat_xfs_fsop_bulkreq_t __user *p32)
+{
+	u32			addr;
+	xfs_fsop_bulkreq_t	bulkreq;
+	int			count;	/* # of records returned */
+	xfs_ino_t		inlast;	/* last inode number */
+	int			done;
+	int			error;
+
+	/* done = 1 if there are more stats to get and if bulkstat */
+	/* should be called again (unused here, but used in dmapi) */
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (get_user(addr, &p32->lastip))
+		return -EFAULT;
+	bulkreq.lastip = compat_ptr(addr);
+	if (get_user(bulkreq.icount, &p32->icount) ||
+	    get_user(addr, &p32->ubuffer))
+		return -EFAULT;
+	bulkreq.ubuffer = compat_ptr(addr);
+	if (get_user(addr, &p32->ocount))
+		return -EFAULT;
+	bulkreq.ocount = compat_ptr(addr);
+
+	if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+		return -EFAULT;
+
+	if ((count = bulkreq.icount) <= 0)
+		return -EINVAL;
+
+	if (bulkreq.ubuffer == NULL)
+		return -EINVAL;
+
+	if (cmd == XFS_IOC_FSINUMBERS_32) {
+		error = xfs_inumbers(mp, &inlast, &count,
+				bulkreq.ubuffer, xfs_inumbers_fmt_compat);
+	} else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
+		int res;
+
+		error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
+				sizeof(compat_xfs_bstat_t), NULL, &res);
+	} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
+		error = xfs_bulkstat(mp, &inlast, &count,
+			xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
+			bulkreq.ubuffer, &done);
+	} else
+		error = -EINVAL;
+	if (error)
+		return error;
+
+	if (bulkreq.ocount != NULL) {
+		if (copy_to_user(bulkreq.lastip, &inlast,
+						sizeof(xfs_ino_t)))
+			return -EFAULT;
+
+		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+STATIC int
+xfs_compat_handlereq_copyin(
+	xfs_fsop_handlereq_t		*hreq,
+	compat_xfs_fsop_handlereq_t	__user *arg32)
+{
+	compat_xfs_fsop_handlereq_t	hreq32;
+
+	if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
+		return -EFAULT;
+
+	hreq->fd = hreq32.fd;
+	hreq->path = compat_ptr(hreq32.path);
+	hreq->oflags = hreq32.oflags;
+	hreq->ihandle = compat_ptr(hreq32.ihandle);
+	hreq->ihandlen = hreq32.ihandlen;
+	hreq->ohandle = compat_ptr(hreq32.ohandle);
+	hreq->ohandlen = compat_ptr(hreq32.ohandlen);
+
+	return 0;
+}
+
+STATIC struct dentry *
+xfs_compat_handlereq_to_dentry(
+	struct file		*parfilp,
+	compat_xfs_fsop_handlereq_t *hreq)
+{
+	return xfs_handle_to_dentry(parfilp,
+			compat_ptr(hreq->ihandle), hreq->ihandlen);
+}
+
+STATIC int
+xfs_compat_attrlist_by_handle(
+	struct file		*parfilp,
+	void			__user *arg)
+{
+	int			error;
+	attrlist_cursor_kern_t	*cursor;
+	compat_xfs_fsop_attrlist_handlereq_t al_hreq;
+	struct dentry		*dentry;
+	char			*kbuf;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&al_hreq, arg,
+			   sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
+		return -EFAULT;
+	if (al_hreq.buflen < sizeof(struct attrlist) ||
+	    al_hreq.buflen > XATTR_LIST_MAX)
+		return -EINVAL;
+
+	/*
+	 * Reject flags, only allow namespaces.
+	 */
+	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+		return -EINVAL;
+
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	error = -ENOMEM;
+	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	if (!kbuf)
+		goto out_dput;
+
+	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+	error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
+					al_hreq.flags, cursor);
+	if (error)
+		goto out_kfree;
+
+	if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
+		error = -EFAULT;
+
+out_kfree:
+	kmem_free(kbuf);
+out_dput:
+	dput(dentry);
+	return error;
+}
+
+STATIC int
+xfs_compat_attrmulti_by_handle(
+	struct file				*parfilp,
+	void					__user *arg)
+{
+	int					error;
+	compat_xfs_attr_multiop_t		*ops;
+	compat_xfs_fsop_attrmulti_handlereq_t	am_hreq;
+	struct dentry				*dentry;
+	unsigned int				i, size;
+	unsigned char				*attr_name;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&am_hreq, arg,
+			   sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
+		return -EFAULT;
+
+	/* overflow check */
+	if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
+		return -E2BIG;
+
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	error = -E2BIG;
+	size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
+	if (!size || size > 16 * PAGE_SIZE)
+		goto out_dput;
+
+	ops = memdup_user(compat_ptr(am_hreq.ops), size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
+		goto out_dput;
+	}
+
+	error = -ENOMEM;
+	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
+	if (!attr_name)
+		goto out_kfree_ops;
+
+	error = 0;
+	for (i = 0; i < am_hreq.opcount; i++) {
+		ops[i].am_error = strncpy_from_user((char *)attr_name,
+				compat_ptr(ops[i].am_attrname),
+				MAXNAMELEN);
+		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
+			error = -ERANGE;
+		if (ops[i].am_error < 0)
+			break;
+
+		switch (ops[i].am_opcode) {
+		case ATTR_OP_GET:
+			ops[i].am_error = xfs_attrmulti_attr_get(
+					d_inode(dentry), attr_name,
+					compat_ptr(ops[i].am_attrvalue),
+					&ops[i].am_length, ops[i].am_flags);
+			break;
+		case ATTR_OP_SET:
+			ops[i].am_error = mnt_want_write_file(parfilp);
+			if (ops[i].am_error)
+				break;
+			ops[i].am_error = xfs_attrmulti_attr_set(
+					d_inode(dentry), attr_name,
+					compat_ptr(ops[i].am_attrvalue),
+					ops[i].am_length, ops[i].am_flags);
+			mnt_drop_write_file(parfilp);
+			break;
+		case ATTR_OP_REMOVE:
+			ops[i].am_error = mnt_want_write_file(parfilp);
+			if (ops[i].am_error)
+				break;
+			ops[i].am_error = xfs_attrmulti_attr_remove(
+					d_inode(dentry), attr_name,
+					ops[i].am_flags);
+			mnt_drop_write_file(parfilp);
+			break;
+		default:
+			ops[i].am_error = -EINVAL;
+		}
+	}
+
+	if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
+		error = -EFAULT;
+
+	kfree(attr_name);
+ out_kfree_ops:
+	kfree(ops);
+ out_dput:
+	dput(dentry);
+	return error;
+}
+
+STATIC int
+xfs_compat_fssetdm_by_handle(
+	struct file		*parfilp,
+	void			__user *arg)
+{
+	int			error;
+	struct fsdmidata	fsd;
+	compat_xfs_fsop_setdm_handlereq_t dmhreq;
+	struct dentry		*dentry;
+
+	if (!capable(CAP_MKNOD))
+		return -EPERM;
+	if (copy_from_user(&dmhreq, arg,
+			   sizeof(compat_xfs_fsop_setdm_handlereq_t)))
+		return -EFAULT;
+
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) {
+		error = -EPERM;
+		goto out;
+	}
+
+	if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
+		error = -EFAULT;
+		goto out;
+	}
+
+	error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask,
+				 fsd.fsd_dmstate);
+
+out:
+	dput(dentry);
+	return error;
+}
+
+long
+xfs_file_compat_ioctl(
+	struct file		*filp,
+	unsigned		cmd,
+	unsigned long		p)
+{
+	struct inode		*inode = file_inode(filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	void			__user *arg = (void __user *)p;
+	int			ioflags = 0;
+	int			error;
+
+	if (filp->f_mode & FMODE_NOCMTIME)
+		ioflags |= XFS_IO_INVIS;
+
+	trace_xfs_file_compat_ioctl(ip);
+
+	switch (cmd) {
+	/* No size or alignment issues on any arch */
+	case XFS_IOC_DIOINFO:
+	case XFS_IOC_FSGEOMETRY:
+	case XFS_IOC_FSGETXATTR:
+	case XFS_IOC_FSSETXATTR:
+	case XFS_IOC_FSGETXATTRA:
+	case XFS_IOC_FSSETDM:
+	case XFS_IOC_GETBMAP:
+	case XFS_IOC_GETBMAPA:
+	case XFS_IOC_GETBMAPX:
+	case XFS_IOC_FSCOUNTS:
+	case XFS_IOC_SET_RESBLKS:
+	case XFS_IOC_GET_RESBLKS:
+	case XFS_IOC_FSGROWFSLOG:
+	case XFS_IOC_GOINGDOWN:
+	case XFS_IOC_ERROR_INJECTION:
+	case XFS_IOC_ERROR_CLEARALL:
+		return xfs_file_ioctl(filp, cmd, p);
+#ifndef BROKEN_X86_ALIGNMENT
+	/* These are handled fine if no alignment issues */
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP64:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP64:
+	case XFS_IOC_FSGEOMETRY_V1:
+	case XFS_IOC_FSGROWFSDATA:
+	case XFS_IOC_FSGROWFSRT:
+	case XFS_IOC_ZERO_RANGE:
+		return xfs_file_ioctl(filp, cmd, p);
+#else
+	case XFS_IOC_ALLOCSP_32:
+	case XFS_IOC_FREESP_32:
+	case XFS_IOC_ALLOCSP64_32:
+	case XFS_IOC_FREESP64_32:
+	case XFS_IOC_RESVSP_32:
+	case XFS_IOC_UNRESVSP_32:
+	case XFS_IOC_RESVSP64_32:
+	case XFS_IOC_UNRESVSP64_32:
+	case XFS_IOC_ZERO_RANGE_32: {
+		struct xfs_flock64	bf;
+
+		if (xfs_compat_flock64_copyin(&bf, arg))
+			return -EFAULT;
+		cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
+		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+	}
+	case XFS_IOC_FSGEOMETRY_V1_32:
+		return xfs_compat_ioc_fsgeometry_v1(mp, arg);
+	case XFS_IOC_FSGROWFSDATA_32: {
+		struct xfs_growfs_data	in;
+
+		if (xfs_compat_growfs_data_copyin(&in, arg))
+			return -EFAULT;
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+		error = xfs_growfs_data(mp, &in);
+		mnt_drop_write_file(filp);
+		return error;
+	}
+	case XFS_IOC_FSGROWFSRT_32: {
+		struct xfs_growfs_rt	in;
+
+		if (xfs_compat_growfs_rt_copyin(&in, arg))
+			return -EFAULT;
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+		error = xfs_growfs_rt(mp, &in);
+		mnt_drop_write_file(filp);
+		return error;
+	}
+#endif
+	/* long changes size, but xfs only copiese out 32 bits */
+	case XFS_IOC_GETXFLAGS_32:
+	case XFS_IOC_SETXFLAGS_32:
+	case XFS_IOC_GETVERSION_32:
+		cmd = _NATIVE_IOC(cmd, long);
+		return xfs_file_ioctl(filp, cmd, p);
+	case XFS_IOC_SWAPEXT_32: {
+		struct xfs_swapext	  sxp;
+		struct compat_xfs_swapext __user *sxu = arg;
+
+		/* Bulk copy in up to the sx_stat field, then copy bstat */
+		if (copy_from_user(&sxp, sxu,
+				   offsetof(struct xfs_swapext, sx_stat)) ||
+		    xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
+			return -EFAULT;
+		error = mnt_want_write_file(filp);
+		if (error)
+			return error;
+		error = xfs_ioc_swapext(&sxp);
+		mnt_drop_write_file(filp);
+		return error;
+	}
+	case XFS_IOC_FSBULKSTAT_32:
+	case XFS_IOC_FSBULKSTAT_SINGLE_32:
+	case XFS_IOC_FSINUMBERS_32:
+		return xfs_compat_ioc_bulkstat(mp, cmd, arg);
+	case XFS_IOC_FD_TO_HANDLE_32:
+	case XFS_IOC_PATH_TO_HANDLE_32:
+	case XFS_IOC_PATH_TO_FSHANDLE_32: {
+		struct xfs_fsop_handlereq	hreq;
+
+		if (xfs_compat_handlereq_copyin(&hreq, arg))
+			return -EFAULT;
+		cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
+		return xfs_find_handle(cmd, &hreq);
+	}
+	case XFS_IOC_OPEN_BY_HANDLE_32: {
+		struct xfs_fsop_handlereq	hreq;
+
+		if (xfs_compat_handlereq_copyin(&hreq, arg))
+			return -EFAULT;
+		return xfs_open_by_handle(filp, &hreq);
+	}
+	case XFS_IOC_READLINK_BY_HANDLE_32: {
+		struct xfs_fsop_handlereq	hreq;
+
+		if (xfs_compat_handlereq_copyin(&hreq, arg))
+			return -EFAULT;
+		return xfs_readlink_by_handle(filp, &hreq);
+	}
+	case XFS_IOC_ATTRLIST_BY_HANDLE_32:
+		return xfs_compat_attrlist_by_handle(filp, arg);
+	case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
+		return xfs_compat_attrmulti_by_handle(filp, arg);
+	case XFS_IOC_FSSETDM_BY_HANDLE_32:
+		return xfs_compat_fssetdm_by_handle(filp, arg);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
diff --git a/kernel/fs/xfs/xfs_ioctl32.h b/kernel/fs/xfs/xfs_ioctl32.h
new file mode 100644
index 000000000..b1bb45444
--- /dev/null
+++ b/kernel/fs/xfs/xfs_ioctl32.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IOCTL32_H__
+#define __XFS_IOCTL32_H__
+
+#include <linux/compat.h>
+
+/*
+ * on 32-bit arches, ioctl argument structures may have different sizes
+ * and/or alignment.  We define compat structures which match the
+ * 32-bit sizes/alignments here, and their associated ioctl numbers.
+ *
+ * xfs_ioctl32.c contains routines to copy these structures in and out.
+ */
+
+/* stock kernel-level ioctls we support */
+#define XFS_IOC_GETXFLAGS_32	FS_IOC32_GETFLAGS
+#define XFS_IOC_SETXFLAGS_32	FS_IOC32_SETFLAGS
+#define XFS_IOC_GETVERSION_32	FS_IOC32_GETVERSION
+
+/*
+ * On intel, even if sizes match, alignment and/or padding may differ.
+ */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#define BROKEN_X86_ALIGNMENT
+#define __compat_packed __attribute__((packed))
+#else
+#define __compat_packed
+#endif
+
+typedef struct compat_xfs_bstime {
+	compat_time_t	tv_sec;		/* seconds		*/
+	__s32		tv_nsec;	/* and nanoseconds	*/
+} compat_xfs_bstime_t;
+
+typedef struct compat_xfs_bstat {
+	__u64		bs_ino;		/* inode number			*/
+	__u16		bs_mode;	/* type and mode		*/
+	__u16		bs_nlink;	/* number of links		*/
+	__u32		bs_uid;		/* user id			*/
+	__u32		bs_gid;		/* group id			*/
+	__u32		bs_rdev;	/* device value			*/
+	__s32		bs_blksize;	/* block size			*/
+	__s64		bs_size;	/* file size			*/
+	compat_xfs_bstime_t bs_atime;	/* access time			*/
+	compat_xfs_bstime_t bs_mtime;	/* modify time			*/
+	compat_xfs_bstime_t bs_ctime;	/* inode change time		*/
+	int64_t		bs_blocks;	/* number of blocks		*/
+	__u32		bs_xflags;	/* extended flags		*/
+	__s32		bs_extsize;	/* extent size			*/
+	__s32		bs_extents;	/* number of extents		*/
+	__u32		bs_gen;		/* generation count		*/
+	__u16		bs_projid_lo;	/* lower part of project id	*/
+#define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
+	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
+	__u16		bs_projid_hi;	/* high part of project id	*/
+	unsigned char	bs_pad[10];	/* pad space, unused		*/
+	__u32		bs_dmevmask;	/* DMIG event mask		*/
+	__u16		bs_dmstate;	/* DMIG state info		*/
+	__u16		bs_aextents;	/* attribute number of extents	*/
+} __compat_packed compat_xfs_bstat_t;
+
+typedef struct compat_xfs_fsop_bulkreq {
+	compat_uptr_t	lastip;		/* last inode # pointer		*/
+	__s32		icount;		/* count of entries in buffer	*/
+	compat_uptr_t	ubuffer;	/* user buffer for inode desc.	*/
+	compat_uptr_t	ocount;		/* output count pointer		*/
+} compat_xfs_fsop_bulkreq_t;
+
+#define XFS_IOC_FSBULKSTAT_32 \
+	_IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
+	_IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS_32 \
+	_IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
+
+typedef struct compat_xfs_fsop_handlereq {
+	__u32		fd;		/* fd for FD_TO_HANDLE		*/
+	compat_uptr_t	path;		/* user pathname		*/
+	__u32		oflags;		/* open flags			*/
+	compat_uptr_t	ihandle;	/* user supplied handle		*/
+	__u32		ihandlen;	/* user supplied length		*/
+	compat_uptr_t	ohandle;	/* user buffer for handle	*/
+	compat_uptr_t	ohandlen;	/* user buffer length		*/
+} compat_xfs_fsop_handlereq_t;
+
+#define XFS_IOC_PATH_TO_FSHANDLE_32 \
+	_IOWR('X', 104, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE_32 \
+	_IOWR('X', 105, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE_32 \
+	_IOWR('X', 106, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE_32 \
+	_IOWR('X', 107, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE_32 \
+	_IOWR('X', 108, struct compat_xfs_fsop_handlereq)
+
+/* The bstat field in the swapext struct needs translation */
+typedef struct compat_xfs_swapext {
+	__int64_t		sx_version;	/* version */
+	__int64_t		sx_fdtarget;	/* fd of target file */
+	__int64_t		sx_fdtmp;	/* fd of tmp file */
+	xfs_off_t		sx_offset;	/* offset into file */
+	xfs_off_t		sx_length;	/* leng from offset */
+	char			sx_pad[16];	/* pad space, unused */
+	compat_xfs_bstat_t	sx_stat;	/* stat of target b4 copy */
+} __compat_packed compat_xfs_swapext_t;
+
+#define XFS_IOC_SWAPEXT_32	_IOWR('X', 109, struct compat_xfs_swapext)
+
+typedef struct compat_xfs_fsop_attrlist_handlereq {
+	struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+	struct xfs_attrlist_cursor	pos; /* opaque cookie, list offset */
+	__u32				flags;	/* which namespace to use */
+	__u32				buflen;	/* length of buffer supplied */
+	compat_uptr_t			buffer;	/* returned names */
+} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
+
+/* Note: actually this is read/write */
+#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
+	_IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
+
+/* am_opcodes defined in xfs_fs.h */
+typedef struct compat_xfs_attr_multiop {
+	__u32		am_opcode;
+	__s32		am_error;
+	compat_uptr_t	am_attrname;
+	compat_uptr_t	am_attrvalue;
+	__u32		am_length;
+	__u32		am_flags;
+} compat_xfs_attr_multiop_t;
+
+typedef struct compat_xfs_fsop_attrmulti_handlereq {
+	struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+	__u32				opcount;/* count of following multiop */
+	/* ptr to compat_xfs_attr_multiop */
+	compat_uptr_t			ops; /* attr_multi data */
+} compat_xfs_fsop_attrmulti_handlereq_t;
+
+#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
+	_IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
+
+typedef struct compat_xfs_fsop_setdm_handlereq {
+	struct compat_xfs_fsop_handlereq hreq;	/* handle information   */
+	/* ptr to struct fsdmidata */
+	compat_uptr_t			data;	/* DMAPI data   */
+} compat_xfs_fsop_setdm_handlereq_t;
+
+#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
+	_IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
+
+#ifdef BROKEN_X86_ALIGNMENT
+/* on ia32 l_start is on a 32-bit boundary */
+typedef struct compat_xfs_flock64 {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start	__attribute__((packed));
+			/* len == 0 means until end of file */
+	__s64		l_len __attribute__((packed));
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area */
+} compat_xfs_flock64_t;
+
+#define XFS_IOC_ALLOCSP_32	_IOW('X', 10, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP_32	_IOW('X', 11, struct compat_xfs_flock64)
+#define XFS_IOC_ALLOCSP64_32	_IOW('X', 36, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP64_32	_IOW('X', 37, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP_32	_IOW('X', 40, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP_32	_IOW('X', 41, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP64_32	_IOW('X', 42, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP64_32	_IOW('X', 43, struct compat_xfs_flock64)
+#define XFS_IOC_ZERO_RANGE_32	_IOW('X', 57, struct compat_xfs_flock64)
+
+typedef struct compat_xfs_fsop_geom_v1 {
+	__u32		blocksize;	/* filesystem (data) block size */
+	__u32		rtextsize;	/* realtime extent size		*/
+	__u32		agblocks;	/* fsblocks in an AG		*/
+	__u32		agcount;	/* number of allocation groups	*/
+	__u32		logblocks;	/* fsblocks in the log		*/
+	__u32		sectsize;	/* (data) sector size, bytes	*/
+	__u32		inodesize;	/* inode size in bytes		*/
+	__u32		imaxpct;	/* max allowed inode space(%)	*/
+	__u64		datablocks;	/* fsblocks in data subvolume	*/
+	__u64		rtblocks;	/* fsblocks in realtime subvol	*/
+	__u64		rtextents;	/* rt extents in realtime subvol*/
+	__u64		logstart;	/* starting fsblock of the log	*/
+	unsigned char	uuid[16];	/* unique id of the filesystem	*/
+	__u32		sunit;		/* stripe unit, fsblocks	*/
+	__u32		swidth;		/* stripe width, fsblocks	*/
+	__s32		version;	/* structure version		*/
+	__u32		flags;		/* superblock version flags	*/
+	__u32		logsectsize;	/* log sector size, bytes	*/
+	__u32		rtsectsize;	/* realtime sector size, bytes	*/
+	__u32		dirblocksize;	/* directory block size, bytes	*/
+} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
+
+#define XFS_IOC_FSGEOMETRY_V1_32  \
+	_IOR('X', 100, struct compat_xfs_fsop_geom_v1)
+
+typedef struct compat_xfs_inogrp {
+	__u64		xi_startino;	/* starting inode number	*/
+	__s32		xi_alloccount;	/* # bits set in allocmask	*/
+	__u64		xi_allocmask;	/* mask of allocated inodes	*/
+} __attribute__((packed)) compat_xfs_inogrp_t;
+
+/* These growfs input structures have padding on the end, so must translate */
+typedef struct compat_xfs_growfs_data {
+	__u64		newblocks;	/* new data subvol size, fsblocks */
+	__u32		imaxpct;	/* new inode space percentage limit */
+} __attribute__((packed)) compat_xfs_growfs_data_t;
+
+typedef struct compat_xfs_growfs_rt {
+	__u64		newblocks;	/* new realtime size, fsblocks */
+	__u32		extsize;	/* new realtime extent size, fsblocks */
+} __attribute__((packed)) compat_xfs_growfs_rt_t;
+
+#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
+#define XFS_IOC_FSGROWFSRT_32   _IOW('X', 112, struct compat_xfs_growfs_rt)
+
+#endif /* BROKEN_X86_ALIGNMENT */
+
+#endif /* __XFS_IOCTL32_H__ */
diff --git a/kernel/fs/xfs/xfs_iomap.c b/kernel/fs/xfs/xfs_iomap.c
new file mode 100644
index 000000000..38e633bad
--- /dev/null
+++ b/kernel/fs/xfs/xfs_iomap.c
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_iomap.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+
+
+#define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
+						<< mp->m_writeio_log)
+#define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP
+
+STATIC int
+xfs_iomap_eof_align_last_fsb(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	xfs_extlen_t	extsize,
+	xfs_fileoff_t	*last_fsb)
+{
+	xfs_extlen_t	align = 0;
+	int		eof, error;
+
+	if (!XFS_IS_REALTIME_INODE(ip)) {
+		/*
+		 * Round up the allocation request to a stripe unit
+		 * (m_dalign) boundary if the file size is >= stripe unit
+		 * size, and we are allocating past the allocation eof.
+		 *
+		 * If mounted with the "-o swalloc" option the alignment is
+		 * increased from the strip unit size to the stripe width.
+		 */
+		if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+			align = mp->m_swidth;
+		else if (mp->m_dalign)
+			align = mp->m_dalign;
+
+		if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
+			align = 0;
+	}
+
+	/*
+	 * Always round up the allocation request to an extent boundary
+	 * (when file on a real-time subvolume or has di_extsize hint).
+	 */
+	if (extsize) {
+		if (align)
+			align = roundup_64(align, extsize);
+		else
+			align = extsize;
+	}
+
+	if (align) {
+		xfs_fileoff_t	new_last_fsb = roundup_64(*last_fsb, align);
+		error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
+		if (error)
+			return error;
+		if (eof)
+			*last_fsb = new_last_fsb;
+	}
+	return 0;
+}
+
+STATIC int
+xfs_alert_fsblock_zero(
+	xfs_inode_t	*ip,
+	xfs_bmbt_irec_t	*imap)
+{
+	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+			"Access to block zero in inode %llu "
+			"start_block: %llx start_off: %llx "
+			"blkcnt: %llx extent-state: %x",
+		(unsigned long long)ip->i_ino,
+		(unsigned long long)imap->br_startblock,
+		(unsigned long long)imap->br_startoff,
+		(unsigned long long)imap->br_blockcount,
+		imap->br_state);
+	return -EFSCORRUPTED;
+}
+
+int
+xfs_iomap_write_direct(
+	xfs_inode_t	*ip,
+	xfs_off_t	offset,
+	size_t		count,
+	xfs_bmbt_irec_t *imap,
+	int		nmaps)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_filblks_t	count_fsb, resaligned;
+	xfs_fsblock_t	firstfsb;
+	xfs_extlen_t	extsz, temp;
+	int		nimaps;
+	int		quota_flag;
+	int		rt;
+	xfs_trans_t	*tp;
+	xfs_bmap_free_t free_list;
+	uint		qblocks, resblks, resrtextents;
+	int		committed;
+	int		error;
+
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return error;
+
+	rt = XFS_IS_REALTIME_INODE(ip);
+	extsz = xfs_get_extsz_hint(ip);
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	if ((offset + count) > XFS_ISIZE(ip)) {
+		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+		if (error)
+			return error;
+	} else {
+		if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
+			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
+					imap->br_blockcount +
+					imap->br_startoff);
+	}
+	count_fsb = last_fsb - offset_fsb;
+	ASSERT(count_fsb > 0);
+
+	resaligned = count_fsb;
+	if (unlikely(extsz)) {
+		if ((temp = do_mod(offset_fsb, extsz)))
+			resaligned += temp;
+		if ((temp = do_mod(resaligned, extsz)))
+			resaligned += extsz - temp;
+	}
+
+	if (unlikely(rt)) {
+		resrtextents = qblocks = resaligned;
+		resrtextents /= mp->m_sb.sb_rextsize;
+		resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+		quota_flag = XFS_QMOPT_RES_RTBLKS;
+	} else {
+		resrtextents = 0;
+		resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+		quota_flag = XFS_QMOPT_RES_REGBLKS;
+	}
+
+	/*
+	 * Allocate and setup the transaction
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+				  resblks, resrtextents);
+	/*
+	 * Check for running out of space, note: need lock to return
+	 */
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+	if (error)
+		goto out_trans_cancel;
+
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/*
+	 * From this point onwards we overwrite the imap pointer that the
+	 * caller gave to us.
+	 */
+	xfs_bmap_init(&free_list, &firstfsb);
+	nimaps = 1;
+	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+				XFS_BMAPI_PREALLOC, &firstfsb, 0,
+				imap, &nimaps, &free_list);
+	if (error)
+		goto out_bmap_cancel;
+
+	/*
+	 * Complete the transaction
+	 */
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto out_bmap_cancel;
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Copy any maps to caller's array and return any error.
+	 */
+	if (nimaps == 0) {
+		error = -ENOSPC;
+		goto out_unlock;
+	}
+
+	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+		error = xfs_alert_fsblock_zero(ip, imap);
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+
+out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+out_trans_cancel:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	goto out_unlock;
+}
+
+/*
+ * If the caller is doing a write at the end of the file, then extend the
+ * allocation out to the file system's write iosize.  We clean up any extra
+ * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it it not needed.
+ */
+STATIC int
+xfs_iomap_eof_want_preallocate(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	xfs_off_t	offset,
+	size_t		count,
+	xfs_bmbt_irec_t *imap,
+	int		nimaps,
+	int		*prealloc)
+{
+	xfs_fileoff_t   start_fsb;
+	xfs_filblks_t   count_fsb;
+	int		n, error, imaps;
+	int		found_delalloc = 0;
+
+	*prealloc = 0;
+	if (offset + count <= XFS_ISIZE(ip))
+		return 0;
+
+	/*
+	 * If the file is smaller than the minimum prealloc and we are using
+	 * dynamic preallocation, don't do any preallocation at all as it is
+	 * likely this is the only write to the file that is going to be done.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+	    XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
+		return 0;
+
+	/*
+	 * If there are any real blocks past eof, then don't
+	 * do any speculative allocation.
+	 */
+	start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
+	count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+	while (count_fsb > 0) {
+		imaps = nimaps;
+		error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
+				       0);
+		if (error)
+			return error;
+		for (n = 0; n < imaps; n++) {
+			if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
+			    (imap[n].br_startblock != DELAYSTARTBLOCK))
+				return 0;
+			start_fsb += imap[n].br_blockcount;
+			count_fsb -= imap[n].br_blockcount;
+
+			if (imap[n].br_startblock == DELAYSTARTBLOCK)
+				found_delalloc = 1;
+		}
+	}
+	if (!found_delalloc)
+		*prealloc = 1;
+	return 0;
+}
+
+/*
+ * Determine the initial size of the preallocation. We are beyond the current
+ * EOF here, but we need to take into account whether this is a sparse write or
+ * an extending write when determining the preallocation size.  Hence we need to
+ * look up the extent that ends at the current write offset and use the result
+ * to determine the preallocation size.
+ *
+ * If the extent is a hole, then preallocation is essentially disabled.
+ * Otherwise we take the size of the preceeding data extent as the basis for the
+ * preallocation size. If the size of the extent is greater than half the
+ * maximum extent length, then use the current offset as the basis. This ensures
+ * that for large files the preallocation size always extends to MAXEXTLEN
+ * rather than falling short due to things like stripe unit/width alignment of
+ * real extents.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_eof_prealloc_initial_size(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_bmbt_irec_t		*imap,
+	int			nimaps)
+{
+	xfs_fileoff_t   start_fsb;
+	int		imaps = 1;
+	int		error;
+
+	ASSERT(nimaps >= imaps);
+
+	/* if we are using a specific prealloc size, return now */
+	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+		return 0;
+
+	/* If the file is small, then use the minimum prealloc */
+	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
+		return 0;
+
+	/*
+	 * As we write multiple pages, the offset will always align to the
+	 * start of a page and hence point to a hole at EOF. i.e. if the size is
+	 * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
+	 * will return FSB 1. Hence if there are blocks in the file, we want to
+	 * point to the block prior to the EOF block and not the hole that maps
+	 * directly at @offset.
+	 */
+	start_fsb = XFS_B_TO_FSB(mp, offset);
+	if (start_fsb)
+		start_fsb--;
+	error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
+	if (error)
+		return 0;
+
+	ASSERT(imaps == 1);
+	if (imap[0].br_startblock == HOLESTARTBLOCK)
+		return 0;
+	if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
+		return imap[0].br_blockcount << 1;
+	return XFS_B_TO_FSB(mp, offset);
+}
+
+STATIC bool
+xfs_quota_need_throttle(
+	struct xfs_inode *ip,
+	int type,
+	xfs_fsblock_t alloc_blocks)
+{
+	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+	if (!dq || !xfs_this_quota_on(ip->i_mount, type))
+		return false;
+
+	/* no hi watermark, no throttle */
+	if (!dq->q_prealloc_hi_wmark)
+		return false;
+
+	/* under the lo watermark, no throttle */
+	if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+		return false;
+
+	return true;
+}
+
+STATIC void
+xfs_quota_calc_throttle(
+	struct xfs_inode *ip,
+	int type,
+	xfs_fsblock_t *qblocks,
+	int *qshift,
+	int64_t	*qfreesp)
+{
+	int64_t freesp;
+	int shift = 0;
+	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+	/* no dq, or over hi wmark, squash the prealloc completely */
+	if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+		*qblocks = 0;
+		*qfreesp = 0;
+		return;
+	}
+
+	freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+	if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
+		shift = 2;
+		if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
+			shift += 2;
+		if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
+			shift += 2;
+	}
+
+	if (freesp < *qfreesp)
+		*qfreesp = freesp;
+
+	/* only overwrite the throttle values if we are more aggressive */
+	if ((freesp >> shift) < (*qblocks >> *qshift)) {
+		*qblocks = freesp;
+		*qshift = shift;
+	}
+}
+
+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum prealocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	struct xfs_bmbt_irec	*imap,
+	int			nimaps)
+{
+	xfs_fsblock_t		alloc_blocks = 0;
+	int			shift = 0;
+	int64_t			freesp;
+	xfs_fsblock_t		qblocks;
+	int			qshift = 0;
+
+	alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
+							   imap, nimaps);
+	if (!alloc_blocks)
+		goto check_writeio;
+	qblocks = alloc_blocks;
+
+	/*
+	 * MAXEXTLEN is not a power of two value but we round the prealloc down
+	 * to the nearest power of two value after throttling. To prevent the
+	 * round down from unconditionally reducing the maximum supported prealloc
+	 * size, we round up first, apply appropriate throttling, round down and
+	 * cap the value to MAXEXTLEN.
+	 */
+	alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+				       alloc_blocks);
+
+	freesp = percpu_counter_read_positive(&mp->m_fdblocks);
+	if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+		shift = 2;
+		if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+			shift++;
+		if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+			shift++;
+		if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+			shift++;
+		if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+			shift++;
+	}
+
+	/*
+	 * Check each quota to cap the prealloc size, provide a shift value to
+	 * throttle with and adjust amount of available space.
+	 */
+	if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
+					&freesp);
+	if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
+					&freesp);
+	if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
+					&freesp);
+
+	/*
+	 * The final prealloc size is set to the minimum of free space available
+	 * in each of the quotas and the overall filesystem.
+	 *
+	 * The shift throttle value is set to the maximum value as determined by
+	 * the global low free space values and per-quota low free space values.
+	 */
+	alloc_blocks = MIN(alloc_blocks, qblocks);
+	shift = MAX(shift, qshift);
+
+	if (shift)
+		alloc_blocks >>= shift;
+	/*
+	 * rounddown_pow_of_two() returns an undefined result if we pass in
+	 * alloc_blocks = 0.
+	 */
+	if (alloc_blocks)
+		alloc_blocks = rounddown_pow_of_two(alloc_blocks);
+	if (alloc_blocks > MAXEXTLEN)
+		alloc_blocks = MAXEXTLEN;
+
+	/*
+	 * If we are still trying to allocate more space than is
+	 * available, squash the prealloc hard. This can happen if we
+	 * have a large file on a small filesystem and the above
+	 * lowspace thresholds are smaller than MAXEXTLEN.
+	 */
+	while (alloc_blocks && alloc_blocks >= freesp)
+		alloc_blocks >>= 4;
+
+check_writeio:
+	if (alloc_blocks < mp->m_writeio_blocks)
+		alloc_blocks = mp->m_writeio_blocks;
+
+	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
+				      mp->m_writeio_blocks);
+
+	return alloc_blocks;
+}
+
+int
+xfs_iomap_write_delay(
+	xfs_inode_t	*ip,
+	xfs_off_t	offset,
+	size_t		count,
+	xfs_bmbt_irec_t *ret_imap)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_off_t	aligned_offset;
+	xfs_fileoff_t	ioalign;
+	xfs_extlen_t	extsz;
+	int		nimaps;
+	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
+	int		prealloc;
+	int		error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	/*
+	 * Make sure that the dquots are there. This doesn't hold
+	 * the ilock across a disk read.
+	 */
+	error = xfs_qm_dqattach_locked(ip, 0);
+	if (error)
+		return error;
+
+	extsz = xfs_get_extsz_hint(ip);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
+				imap, XFS_WRITE_IMAPS, &prealloc);
+	if (error)
+		return error;
+
+retry:
+	if (prealloc) {
+		xfs_fsblock_t	alloc_blocks;
+
+		alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
+						       XFS_WRITE_IMAPS);
+
+		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
+		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+		last_fsb = ioalign + alloc_blocks;
+	} else {
+		last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	}
+
+	if (prealloc || extsz) {
+		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Make sure preallocation does not create extents beyond the range we
+	 * actually support in this filesystem.
+	 */
+	if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
+		last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+
+	ASSERT(last_fsb > offset_fsb);
+
+	nimaps = XFS_WRITE_IMAPS;
+	error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
+				imap, &nimaps, XFS_BMAPI_ENTIRE);
+	switch (error) {
+	case 0:
+	case -ENOSPC:
+	case -EDQUOT:
+		break;
+	default:
+		return error;
+	}
+
+	/*
+	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
+	 * without EOF preallocation.
+	 */
+	if (nimaps == 0) {
+		trace_xfs_delalloc_enospc(ip, offset, count);
+		if (prealloc) {
+			prealloc = 0;
+			error = 0;
+			goto retry;
+		}
+		return error ? error : -ENOSPC;
+	}
+
+	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
+		return xfs_alert_fsblock_zero(ip, &imap[0]);
+
+	/*
+	 * Tag the inode as speculatively preallocated so we can reclaim this
+	 * space on demand, if necessary.
+	 */
+	if (prealloc)
+		xfs_inode_set_eofblocks_tag(ip);
+
+	*ret_imap = imap[0];
+	return 0;
+}
+
+/*
+ * Pass in a delayed allocate extent, convert it to real extents;
+ * return to the caller the extent we create which maps on top of
+ * the originating callers request.
+ *
+ * Called without a lock on the inode.
+ *
+ * We no longer bother to look at the incoming map - all we have to
+ * guarantee is that whatever we allocate fills the required range.
+ */
+int
+xfs_iomap_write_allocate(
+	xfs_inode_t	*ip,
+	xfs_off_t	offset,
+	xfs_bmbt_irec_t *imap)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_fileoff_t	offset_fsb, last_block;
+	xfs_fileoff_t	end_fsb, map_start_fsb;
+	xfs_fsblock_t	first_block;
+	xfs_bmap_free_t	free_list;
+	xfs_filblks_t	count_fsb;
+	xfs_trans_t	*tp;
+	int		nimaps, committed;
+	int		error = 0;
+	int		nres;
+
+	/*
+	 * Make sure that the dquots are there.
+	 */
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return error;
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	count_fsb = imap->br_blockcount;
+	map_start_fsb = imap->br_startoff;
+
+	XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+
+	while (count_fsb != 0) {
+		/*
+		 * Set up a transaction with which to allocate the
+		 * backing store for the file.  Do allocations in a
+		 * loop until we get some space in the range we are
+		 * interested in.  The other space that might be allocated
+		 * is in the delayed allocation extent on which we sit
+		 * but before our buffer starts.
+		 */
+
+		nimaps = 0;
+		while (nimaps == 0) {
+			tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+			tp->t_flags |= XFS_TRANS_RESERVE;
+			nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+			error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+						  nres, 0);
+			if (error) {
+				xfs_trans_cancel(tp, 0);
+				return error;
+			}
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_ijoin(tp, ip, 0);
+
+			xfs_bmap_init(&free_list, &first_block);
+
+			/*
+			 * it is possible that the extents have changed since
+			 * we did the read call as we dropped the ilock for a
+			 * while. We have to be careful about truncates or hole
+			 * punchs here - we are not allowed to allocate
+			 * non-delalloc blocks here.
+			 *
+			 * The only protection against truncation is the pages
+			 * for the range we are being asked to convert are
+			 * locked and hence a truncate will block on them
+			 * first.
+			 *
+			 * As a result, if we go beyond the range we really
+			 * need and hit an delalloc extent boundary followed by
+			 * a hole while we have excess blocks in the map, we
+			 * will fill the hole incorrectly and overrun the
+			 * transaction reservation.
+			 *
+			 * Using a single map prevents this as we are forced to
+			 * check each map we look for overlap with the desired
+			 * range and abort as soon as we find it. Also, given
+			 * that we only return a single map, having one beyond
+			 * what we can return is probably a bit silly.
+			 *
+			 * We also need to check that we don't go beyond EOF;
+			 * this is a truncate optimisation as a truncate sets
+			 * the new file size before block on the pages we
+			 * currently have locked under writeback. Because they
+			 * are about to be tossed, we don't need to write them
+			 * back....
+			 */
+			nimaps = 1;
+			end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+			error = xfs_bmap_last_offset(ip, &last_block,
+							XFS_DATA_FORK);
+			if (error)
+				goto trans_cancel;
+
+			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
+			if ((map_start_fsb + count_fsb) > last_block) {
+				count_fsb = last_block - map_start_fsb;
+				if (count_fsb == 0) {
+					error = -EAGAIN;
+					goto trans_cancel;
+				}
+			}
+
+			/*
+			 * From this point onwards we overwrite the imap
+			 * pointer that the caller gave to us.
+			 */
+			error = xfs_bmapi_write(tp, ip, map_start_fsb,
+						count_fsb, 0,
+						&first_block, 1,
+						imap, &nimaps, &free_list);
+			if (error)
+				goto trans_cancel;
+
+			error = xfs_bmap_finish(&tp, &free_list, &committed);
+			if (error)
+				goto trans_cancel;
+
+			error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+			if (error)
+				goto error0;
+
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		}
+
+		/*
+		 * See if we were able to allocate an extent that
+		 * covers at least part of the callers request
+		 */
+		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+			return xfs_alert_fsblock_zero(ip, imap);
+
+		if ((offset_fsb >= imap->br_startoff) &&
+		    (offset_fsb < (imap->br_startoff +
+				   imap->br_blockcount))) {
+			XFS_STATS_INC(xs_xstrat_quick);
+			return 0;
+		}
+
+		/*
+		 * So far we have not mapped the requested part of the
+		 * file, just surrounding data, try again.
+		 */
+		count_fsb -= imap->br_blockcount;
+		map_start_fsb = imap->br_startoff + imap->br_blockcount;
+	}
+
+trans_cancel:
+	xfs_bmap_cancel(&free_list);
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error0:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+int
+xfs_iomap_write_unwritten(
+	xfs_inode_t	*ip,
+	xfs_off_t	offset,
+	xfs_off_t	count)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_fileoff_t	offset_fsb;
+	xfs_filblks_t	count_fsb;
+	xfs_filblks_t	numblks_fsb;
+	xfs_fsblock_t	firstfsb;
+	int		nimaps;
+	xfs_trans_t	*tp;
+	xfs_bmbt_irec_t imap;
+	xfs_bmap_free_t free_list;
+	xfs_fsize_t	i_size;
+	uint		resblks;
+	int		committed;
+	int		error;
+
+	trace_xfs_unwritten_convert(ip, offset, count);
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
+
+	/*
+	 * Reserve enough blocks in this transaction for two complete extent
+	 * btree splits.  We may be converting the middle part of an unwritten
+	 * extent and in this case we will insert two new extents in the btree
+	 * each of which could cause a full split.
+	 *
+	 * This reservation amount will be used in the first call to
+	 * xfs_bmbt_split() to select an AG with enough space to satisfy the
+	 * rest of the operation.
+	 */
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+
+	do {
+		/*
+		 * set up a transaction to convert the range of extents
+		 * from unwritten to real. Do allocations in a loop until
+		 * we have covered the range passed in.
+		 *
+		 * Note that we open code the transaction allocation here
+		 * to pass KM_NOFS--we can't risk to recursing back into
+		 * the filesystem here as we might be asked to write out
+		 * the same inode that we complete here and might deadlock
+		 * on the iolock.
+		 */
+		sb_start_intwrite(mp->m_super);
+		tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
+		tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+					  resblks, 0);
+		if (error) {
+			xfs_trans_cancel(tp, 0);
+			return error;
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, 0);
+
+		/*
+		 * Modify the unwritten extent state of the buffer.
+		 */
+		xfs_bmap_init(&free_list, &firstfsb);
+		nimaps = 1;
+		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+				  XFS_BMAPI_CONVERT, &firstfsb,
+				  1, &imap, &nimaps, &free_list);
+		if (error)
+			goto error_on_bmapi_transaction;
+
+		/*
+		 * Log the updated inode size as we go.  We have to be careful
+		 * to only log it up to the actual write offset if it is
+		 * halfway into a block.
+		 */
+		i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+		if (i_size > offset + count)
+			i_size = offset + count;
+
+		i_size = xfs_new_eof(ip, i_size);
+		if (i_size) {
+			ip->i_d.di_size = i_size;
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		}
+
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error)
+			goto error_on_bmapi_transaction;
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (error)
+			return error;
+
+		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
+			return xfs_alert_fsblock_zero(ip, &imap);
+
+		if ((numblks_fsb = imap.br_blockcount) == 0) {
+			/*
+			 * The numblks_fsb value should always get
+			 * smaller, otherwise the loop is stuck.
+			 */
+			ASSERT(imap.br_blockcount);
+			break;
+		}
+		offset_fsb += numblks_fsb;
+		count_fsb -= numblks_fsb;
+	} while (count_fsb > 0);
+
+	return 0;
+
+error_on_bmapi_transaction:
+	xfs_bmap_cancel(&free_list);
+	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
diff --git a/kernel/fs/xfs/xfs_iomap.h b/kernel/fs/xfs/xfs_iomap.h
new file mode 100644
index 000000000..8688e663d
--- /dev/null
+++ b/kernel/fs/xfs/xfs_iomap.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IOMAP_H__
+#define __XFS_IOMAP_H__
+
+struct xfs_inode;
+struct xfs_bmbt_irec;
+
+int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+			struct xfs_bmbt_irec *, int);
+int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+			struct xfs_bmbt_irec *);
+int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
+			struct xfs_bmbt_irec *);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+
+#endif /* __XFS_IOMAP_H__*/
diff --git a/kernel/fs/xfs/xfs_iops.c b/kernel/fs/xfs/xfs_iops.c
new file mode 100644
index 000000000..f4cd7204e
--- /dev/null
+++ b/kernel/fs/xfs/xfs_iops.c
@@ -0,0 +1,1305 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_acl.h"
+#include "xfs_quota.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_trans_space.h"
+#include "xfs_pnfs.h"
+
+#include <linux/capability.h>
+#include <linux/xattr.h>
+#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/security.h>
+#include <linux/fiemap.h>
+#include <linux/slab.h>
+
+/*
+ * Directories have different lock order w.r.t. mmap_sem compared to regular
+ * files. This is due to readdir potentially triggering page faults on a user
+ * buffer inside filldir(), and this happens with the ilock on the directory
+ * held. For regular files, the lock order is the other way around - the
+ * mmap_sem is taken during the page fault, and then we lock the ilock to do
+ * block mapping. Hence we need a different class for the directory ilock so
+ * that lockdep can tell them apart.
+ */
+static struct lock_class_key xfs_nondir_ilock_class;
+static struct lock_class_key xfs_dir_ilock_class;
+
+static int
+xfs_initxattrs(
+	struct inode		*inode,
+	const struct xattr	*xattr_array,
+	void			*fs_info)
+{
+	const struct xattr	*xattr;
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			error = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		error = xfs_attr_set(ip, xattr->name, xattr->value,
+				      xattr->value_len, ATTR_SECURE);
+		if (error < 0)
+			break;
+	}
+	return error;
+}
+
+/*
+ * Hook in SELinux.  This is not quite correct yet, what we really need
+ * here (as we do for default ACLs) is a mechanism by which creation of
+ * these attrs can be journalled at inode creation time (along with the
+ * inode, of course, such that log replay can't cause these to be lost).
+ */
+
+STATIC int
+xfs_init_security(
+	struct inode	*inode,
+	struct inode	*dir,
+	const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					     &xfs_initxattrs, NULL);
+}
+
+static void
+xfs_dentry_to_name(
+	struct xfs_name	*namep,
+	struct dentry	*dentry,
+	int		mode)
+{
+	namep->name = dentry->d_name.name;
+	namep->len = dentry->d_name.len;
+	namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];
+}
+
+STATIC void
+xfs_cleanup_inode(
+	struct inode	*dir,
+	struct inode	*inode,
+	struct dentry	*dentry)
+{
+	struct xfs_name	teardown;
+
+	/* Oh, the horror.
+	 * If we can't add the ACL or we fail in
+	 * xfs_init_security we must back out.
+	 * ENOSPC can hit here, among other things.
+	 */
+	xfs_dentry_to_name(&teardown, dentry, 0);
+
+	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
+}
+
+STATIC int
+xfs_generic_create(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	umode_t		mode,
+	dev_t		rdev,
+	bool		tmpfile)	/* unnamed file */
+{
+	struct inode	*inode;
+	struct xfs_inode *ip = NULL;
+	struct posix_acl *default_acl, *acl;
+	struct xfs_name	name;
+	int		error;
+
+	/*
+	 * Irix uses Missed'em'V split, but doesn't want to see
+	 * the upper 5 bits of (14bit) major.
+	 */
+	if (S_ISCHR(mode) || S_ISBLK(mode)) {
+		if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+			return -EINVAL;
+		rdev = sysv_encode_dev(rdev);
+	} else {
+		rdev = 0;
+	}
+
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error)
+		return error;
+
+	if (!tmpfile) {
+		xfs_dentry_to_name(&name, dentry, mode);
+		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+	} else {
+		error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
+	}
+	if (unlikely(error))
+		goto out_free_acl;
+
+	inode = VFS_I(ip);
+
+	error = xfs_init_security(inode, dir, &dentry->d_name);
+	if (unlikely(error))
+		goto out_cleanup_inode;
+
+#ifdef CONFIG_XFS_POSIX_ACL
+	if (default_acl) {
+		error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		if (error)
+			goto out_cleanup_inode;
+	}
+	if (acl) {
+		error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		if (error)
+			goto out_cleanup_inode;
+	}
+#endif
+
+	if (tmpfile)
+		d_tmpfile(dentry, inode);
+	else
+		d_instantiate(dentry, inode);
+
+	xfs_finish_inode_setup(ip);
+
+ out_free_acl:
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
+	return error;
+
+ out_cleanup_inode:
+	xfs_finish_inode_setup(ip);
+	if (!tmpfile)
+		xfs_cleanup_inode(dir, inode, dentry);
+	iput(inode);
+	goto out_free_acl;
+}
+
+STATIC int
+xfs_vn_mknod(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	umode_t		mode,
+	dev_t		rdev)
+{
+	return xfs_generic_create(dir, dentry, mode, rdev, false);
+}
+
+STATIC int
+xfs_vn_create(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	umode_t		mode,
+	bool		flags)
+{
+	return xfs_vn_mknod(dir, dentry, mode, 0);
+}
+
+STATIC int
+xfs_vn_mkdir(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	umode_t		mode)
+{
+	return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
+}
+
+STATIC struct dentry *
+xfs_vn_lookup(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	unsigned int flags)
+{
+	struct xfs_inode *cip;
+	struct xfs_name	name;
+	int		error;
+
+	if (dentry->d_name.len >= MAXNAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	xfs_dentry_to_name(&name, dentry, 0);
+	error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
+	if (unlikely(error)) {
+		if (unlikely(error != -ENOENT))
+			return ERR_PTR(error);
+		d_add(dentry, NULL);
+		return NULL;
+	}
+
+	return d_splice_alias(VFS_I(cip), dentry);
+}
+
+STATIC struct dentry *
+xfs_vn_ci_lookup(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	unsigned int flags)
+{
+	struct xfs_inode *ip;
+	struct xfs_name	xname;
+	struct xfs_name ci_name;
+	struct qstr	dname;
+	int		error;
+
+	if (dentry->d_name.len >= MAXNAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	xfs_dentry_to_name(&xname, dentry, 0);
+	error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
+	if (unlikely(error)) {
+		if (unlikely(error != -ENOENT))
+			return ERR_PTR(error);
+		/*
+		 * call d_add(dentry, NULL) here when d_drop_negative_children
+		 * is called in xfs_vn_mknod (ie. allow negative dentries
+		 * with CI filesystems).
+		 */
+		return NULL;
+	}
+
+	/* if exact match, just splice and exit */
+	if (!ci_name.name)
+		return d_splice_alias(VFS_I(ip), dentry);
+
+	/* else case-insensitive match... */
+	dname.name = ci_name.name;
+	dname.len = ci_name.len;
+	dentry = d_add_ci(dentry, VFS_I(ip), &dname);
+	kmem_free(ci_name.name);
+	return dentry;
+}
+
+STATIC int
+xfs_vn_link(
+	struct dentry	*old_dentry,
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	struct inode	*inode = d_inode(old_dentry);
+	struct xfs_name	name;
+	int		error;
+
+	xfs_dentry_to_name(&name, dentry, inode->i_mode);
+
+	error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
+	if (unlikely(error))
+		return error;
+
+	ihold(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+}
+
+STATIC int
+xfs_vn_unlink(
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	struct xfs_name	name;
+	int		error;
+
+	xfs_dentry_to_name(&name, dentry, 0);
+
+	error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry)));
+	if (error)
+		return error;
+
+	/*
+	 * With unlink, the VFS makes the dentry "negative": no inode,
+	 * but still hashed. This is incompatible with case-insensitive
+	 * mode, so invalidate (unhash) the dentry in CI-mode.
+	 */
+	if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb))
+		d_invalidate(dentry);
+	return 0;
+}
+
+STATIC int
+xfs_vn_symlink(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	const char	*symname)
+{
+	struct inode	*inode;
+	struct xfs_inode *cip = NULL;
+	struct xfs_name	name;
+	int		error;
+	umode_t		mode;
+
+	mode = S_IFLNK |
+		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
+	xfs_dentry_to_name(&name, dentry, mode);
+
+	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
+	if (unlikely(error))
+		goto out;
+
+	inode = VFS_I(cip);
+
+	error = xfs_init_security(inode, dir, &dentry->d_name);
+	if (unlikely(error))
+		goto out_cleanup_inode;
+
+	d_instantiate(dentry, inode);
+	xfs_finish_inode_setup(cip);
+	return 0;
+
+ out_cleanup_inode:
+	xfs_finish_inode_setup(cip);
+	xfs_cleanup_inode(dir, inode, dentry);
+	iput(inode);
+ out:
+	return error;
+}
+
+STATIC int
+xfs_vn_rename(
+	struct inode	*odir,
+	struct dentry	*odentry,
+	struct inode	*ndir,
+	struct dentry	*ndentry,
+	unsigned int	flags)
+{
+	struct inode	*new_inode = d_inode(ndentry);
+	int		omode = 0;
+	struct xfs_name	oname;
+	struct xfs_name	nname;
+
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+		return -EINVAL;
+
+	/* if we are exchanging files, we need to set i_mode of both files */
+	if (flags & RENAME_EXCHANGE)
+		omode = d_inode(ndentry)->i_mode;
+
+	xfs_dentry_to_name(&oname, odentry, omode);
+	xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode);
+
+	return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)),
+			  XFS_I(ndir), &nname,
+			  new_inode ? XFS_I(new_inode) : NULL, flags);
+}
+
+/*
+ * careful here - this function can get called recursively, so
+ * we need to be very careful about how much stack we use.
+ * uio is kmalloced for this reason...
+ */
+STATIC void *
+xfs_vn_follow_link(
+	struct dentry		*dentry,
+	struct nameidata	*nd)
+{
+	char			*link;
+	int			error = -ENOMEM;
+
+	link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+	if (!link)
+		goto out_err;
+
+	error = xfs_readlink(XFS_I(d_inode(dentry)), link);
+	if (unlikely(error))
+		goto out_kfree;
+
+	nd_set_link(nd, link);
+	return NULL;
+
+ out_kfree:
+	kfree(link);
+ out_err:
+	nd_set_link(nd, ERR_PTR(error));
+	return NULL;
+}
+
+STATIC int
+xfs_vn_getattr(
+	struct vfsmount		*mnt,
+	struct dentry		*dentry,
+	struct kstat		*stat)
+{
+	struct inode		*inode = d_inode(dentry);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+
+	trace_xfs_getattr(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	stat->size = XFS_ISIZE(ip);
+	stat->dev = inode->i_sb->s_dev;
+	stat->mode = ip->i_d.di_mode;
+	stat->nlink = ip->i_d.di_nlink;
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
+	stat->ino = ip->i_ino;
+	stat->atime = inode->i_atime;
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode->i_ctime;
+	stat->blocks =
+		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
+
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		stat->blksize = BLKDEV_IOSIZE;
+		stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+				   sysv_minor(ip->i_df.if_u2.if_rdev));
+		break;
+	default:
+		if (XFS_IS_REALTIME_INODE(ip)) {
+			/*
+			 * If the file blocks are being allocated from a
+			 * realtime volume, then return the inode's realtime
+			 * extent size or the realtime volume's extent size.
+			 */
+			stat->blksize =
+				xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
+		} else
+			stat->blksize = xfs_preferred_iosize(mp);
+		stat->rdev = 0;
+		break;
+	}
+
+	return 0;
+}
+
+static void
+xfs_setattr_mode(
+	struct xfs_inode	*ip,
+	struct iattr		*iattr)
+{
+	struct inode		*inode = VFS_I(ip);
+	umode_t			mode = iattr->ia_mode;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	ip->i_d.di_mode &= S_IFMT;
+	ip->i_d.di_mode |= mode & ~S_IFMT;
+
+	inode->i_mode &= S_IFMT;
+	inode->i_mode |= mode & ~S_IFMT;
+}
+
+void
+xfs_setattr_time(
+	struct xfs_inode	*ip,
+	struct iattr		*iattr)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (iattr->ia_valid & ATTR_ATIME) {
+		inode->i_atime = iattr->ia_atime;
+		ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+		ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+	}
+	if (iattr->ia_valid & ATTR_CTIME) {
+		inode->i_ctime = iattr->ia_ctime;
+		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+		ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
+	}
+	if (iattr->ia_valid & ATTR_MTIME) {
+		inode->i_mtime = iattr->ia_mtime;
+		ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+		ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+	}
+}
+
+int
+xfs_setattr_nonsize(
+	struct xfs_inode	*ip,
+	struct iattr		*iattr,
+	int			flags)
+{
+	xfs_mount_t		*mp = ip->i_mount;
+	struct inode		*inode = VFS_I(ip);
+	int			mask = iattr->ia_valid;
+	xfs_trans_t		*tp;
+	int			error;
+	kuid_t			uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
+	kgid_t			gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
+	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
+	struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL;
+
+	trace_xfs_setattr(ip);
+
+	/* If acls are being inherited, we already have this checked */
+	if (!(flags & XFS_ATTR_NOACL)) {
+		if (mp->m_flags & XFS_MOUNT_RDONLY)
+			return -EROFS;
+
+		if (XFS_FORCED_SHUTDOWN(mp))
+			return -EIO;
+
+		error = inode_change_ok(inode, iattr);
+		if (error)
+			return error;
+	}
+
+	ASSERT((mask & ATTR_SIZE) == 0);
+
+	/*
+	 * If disk quotas is on, we make sure that the dquots do exist on disk,
+	 * before we start any other transactions. Trying to do this later
+	 * is messy. We don't care to take a readlock to look at the ids
+	 * in inode here, because we can't hold it across the trans_reserve.
+	 * If the IDs do change before we take the ilock, we're covered
+	 * because the i_*dquot fields will get updated anyway.
+	 */
+	if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
+		uint	qflags = 0;
+
+		if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
+			uid = iattr->ia_uid;
+			qflags |= XFS_QMOPT_UQUOTA;
+		} else {
+			uid = inode->i_uid;
+		}
+		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
+			gid = iattr->ia_gid;
+			qflags |= XFS_QMOPT_GQUOTA;
+		}  else {
+			gid = inode->i_gid;
+		}
+
+		/*
+		 * We take a reference when we initialize udqp and gdqp,
+		 * so it is important that we never blindly double trip on
+		 * the same variable. See xfs_create() for an example.
+		 */
+		ASSERT(udqp == NULL);
+		ASSERT(gdqp == NULL);
+		error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
+					   xfs_kgid_to_gid(gid),
+					   xfs_get_projid(ip),
+					   qflags, &udqp, &gdqp, NULL);
+		if (error)
+			return error;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error)
+		goto out_dqrele;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Change file ownership.  Must be the owner or privileged.
+	 */
+	if (mask & (ATTR_UID|ATTR_GID)) {
+		/*
+		 * These IDs could have changed since we last looked at them.
+		 * But, we're assured that if the ownership did change
+		 * while we didn't have the inode locked, inode's dquot(s)
+		 * would have changed also.
+		 */
+		iuid = inode->i_uid;
+		igid = inode->i_gid;
+		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
+		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
+
+		/*
+		 * Do a quota reservation only if uid/gid is actually
+		 * going to change.
+		 */
+		if (XFS_IS_QUOTA_RUNNING(mp) &&
+		    ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
+		     (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
+			ASSERT(tp);
+			error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
+						NULL, capable(CAP_FOWNER) ?
+						XFS_QMOPT_FORCE_RES : 0);
+			if (error)	/* out of quota */
+				goto out_trans_cancel;
+		}
+	}
+
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/*
+	 * Change file ownership.  Must be the owner or privileged.
+	 */
+	if (mask & (ATTR_UID|ATTR_GID)) {
+		/*
+		 * CAP_FSETID overrides the following restrictions:
+		 *
+		 * The set-user-ID and set-group-ID bits of a file will be
+		 * cleared upon successful return from chown()
+		 */
+		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+		    !capable(CAP_FSETID))
+			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+		/*
+		 * Change the ownerships and register quota modifications
+		 * in the transaction.
+		 */
+		if (!uid_eq(iuid, uid)) {
+			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
+				ASSERT(mask & ATTR_UID);
+				ASSERT(udqp);
+				olddquot1 = xfs_qm_vop_chown(tp, ip,
+							&ip->i_udquot, udqp);
+			}
+			ip->i_d.di_uid = xfs_kuid_to_uid(uid);
+			inode->i_uid = uid;
+		}
+		if (!gid_eq(igid, gid)) {
+			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
+				ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) ||
+				       !XFS_IS_PQUOTA_ON(mp));
+				ASSERT(mask & ATTR_GID);
+				ASSERT(gdqp);
+				olddquot2 = xfs_qm_vop_chown(tp, ip,
+							&ip->i_gdquot, gdqp);
+			}
+			ip->i_d.di_gid = xfs_kgid_to_gid(gid);
+			inode->i_gid = gid;
+		}
+	}
+
+	if (mask & ATTR_MODE)
+		xfs_setattr_mode(ip, iattr);
+	if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+		xfs_setattr_time(ip, iattr);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	XFS_STATS_INC(xs_ig_attrchg);
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Release any dquot(s) the inode had kept before chown.
+	 */
+	xfs_qm_dqrele(olddquot1);
+	xfs_qm_dqrele(olddquot2);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+
+	if (error)
+		return error;
+
+	/*
+	 * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
+	 * 	     update.  We could avoid this with linked transactions
+	 * 	     and passing down the transaction pointer all the way
+	 *	     to attr_set.  No previous user of the generic
+	 * 	     Posix ACL code seems to care about this issue either.
+	 */
+	if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
+		error = posix_acl_chmod(inode, inode->i_mode);
+		if (error)
+			return error;
+	}
+
+	return 0;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp, 0);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_dqrele:
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	return error;
+}
+
+/*
+ * Truncate file.  Must have write permission and not be a directory.
+ */
+int
+xfs_setattr_size(
+	struct xfs_inode	*ip,
+	struct iattr		*iattr)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct inode		*inode = VFS_I(ip);
+	xfs_off_t		oldsize, newsize;
+	struct xfs_trans	*tp;
+	int			error;
+	uint			lock_flags = 0;
+	uint			commit_flags = 0;
+	bool			did_zeroing = false;
+
+	trace_xfs_setattr(ip);
+
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return -EROFS;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+	ASSERT(S_ISREG(ip->i_d.di_mode));
+	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+
+	oldsize = inode->i_size;
+	newsize = iattr->ia_size;
+
+	/*
+	 * Short circuit the truncate case for zero length files.
+	 */
+	if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
+		if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
+			return 0;
+
+		/*
+		 * Use the regular setattr path to update the timestamps.
+		 */
+		iattr->ia_valid &= ~ATTR_SIZE;
+		return xfs_setattr_nonsize(ip, iattr, 0);
+	}
+
+	/*
+	 * Make sure that the dquots are attached to the inode.
+	 */
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return error;
+
+	/*
+	 * File data changes must be complete before we start the transaction to
+	 * modify the inode.  This needs to be done before joining the inode to
+	 * the transaction because the inode cannot be unlocked once it is a
+	 * part of the transaction.
+	 *
+	 * Start with zeroing any data block beyond EOF that we may expose on
+	 * file extension.
+	 */
+	if (newsize > oldsize) {
+		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * We are going to log the inode size change in this transaction so
+	 * any previous writes that are beyond the on disk EOF and the new
+	 * EOF that have not been written out need to be written here.  If we
+	 * do not write the data out, we expose ourselves to the null files
+	 * problem. Note that this includes any block zeroing we did above;
+	 * otherwise those blocks may not be zeroed after a crash.
+	 */
+	if (newsize > ip->i_d.di_size &&
+	    (oldsize != ip->i_d.di_size || did_zeroing)) {
+		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+						      ip->i_d.di_size, newsize);
+		if (error)
+			return error;
+	}
+
+	/* Now wait for all direct I/O to complete. */
+	inode_dio_wait(inode);
+
+	/*
+	 * We've already locked out new page faults, so now we can safely remove
+	 * pages from the page cache knowing they won't get refaulted until we
+	 * drop the XFS_MMAP_EXCL lock after the extent manipulations are
+	 * complete. The truncate_setsize() call also cleans partial EOF page
+	 * PTEs on extending truncates and hence ensures sub-page block size
+	 * filesystems are correctly handled, too.
+	 *
+	 * We have to do all the page cache truncate work outside the
+	 * transaction context as the "lock" order is page lock->log space
+	 * reservation as defined by extent allocation in the writeback path.
+	 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+	 * having already truncated the in-memory version of the file (i.e. made
+	 * user visible changes). There's not much we can do about this, except
+	 * to hope that the caller sees ENOMEM and retries the truncate
+	 * operation.
+	 */
+	error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+	if (error)
+		return error;
+	truncate_setsize(inode, newsize);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	if (error)
+		goto out_trans_cancel;
+
+	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+	lock_flags |= XFS_ILOCK_EXCL;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/*
+	 * Only change the c/mtime if we are changing the size or we are
+	 * explicitly asked to change it.  This handles the semantic difference
+	 * between truncate() and ftruncate() as implemented in the VFS.
+	 *
+	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+	 * special case where we need to update the times despite not having
+	 * these flags set.  For all other operations the VFS set these flags
+	 * explicitly if it wants a timestamp update.
+	 */
+	if (newsize != oldsize &&
+	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
+		iattr->ia_ctime = iattr->ia_mtime =
+			current_fs_time(inode->i_sb);
+		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+	}
+
+	/*
+	 * The first thing we do is set the size to new_size permanently on
+	 * disk.  This way we don't have to worry about anyone ever being able
+	 * to look at the data being freed even in the face of a crash.
+	 * What we're getting around here is the case where we free a block, it
+	 * is allocated to another file, it is written to, and then we crash.
+	 * If the new data gets written to the file but the log buffers
+	 * containing the free and reallocation don't, then we'd end up with
+	 * garbage in the blocks being freed.  As long as we make the new size
+	 * permanent before actually freeing any blocks it doesn't matter if
+	 * they get written to.
+	 */
+	ip->i_d.di_size = newsize;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	if (newsize <= oldsize) {
+		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
+		if (error)
+			goto out_trans_abort;
+
+		/*
+		 * Truncated "down", so we're removing references to old data
+		 * here - if we delay flushing for a long time, we expose
+		 * ourselves unduly to the notorious NULL files problem.  So,
+		 * we mark this inode and flush it when the file is closed,
+		 * and do not wait the usual (long) time for writeout.
+		 */
+		xfs_iflags_set(ip, XFS_ITRUNCATED);
+
+		/* A truncate down always removes post-EOF blocks. */
+		xfs_inode_clear_eofblocks_tag(ip);
+	}
+
+	if (iattr->ia_valid & ATTR_MODE)
+		xfs_setattr_mode(ip, iattr);
+	if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+		xfs_setattr_time(ip, iattr);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	XFS_STATS_INC(xs_ig_attrchg);
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+out_unlock:
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+	return error;
+
+out_trans_abort:
+	commit_flags |= XFS_TRANS_ABORT;
+out_trans_cancel:
+	xfs_trans_cancel(tp, commit_flags);
+	goto out_unlock;
+}
+
+STATIC int
+xfs_vn_setattr(
+	struct dentry		*dentry,
+	struct iattr		*iattr)
+{
+	struct xfs_inode	*ip = XFS_I(d_inode(dentry));
+	int			error;
+
+	if (iattr->ia_valid & ATTR_SIZE) {
+		uint		iolock = XFS_IOLOCK_EXCL;
+
+		xfs_ilock(ip, iolock);
+		error = xfs_break_layouts(d_inode(dentry), &iolock, true);
+		if (!error) {
+			xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+			iolock |= XFS_MMAPLOCK_EXCL;
+
+			error = xfs_setattr_size(ip, iattr);
+		}
+		xfs_iunlock(ip, iolock);
+	} else {
+		error = xfs_setattr_nonsize(ip, iattr, 0);
+	}
+
+	return error;
+}
+
+STATIC int
+xfs_vn_update_time(
+	struct inode		*inode,
+	struct timespec		*now,
+	int			flags)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	trace_xfs_update_time(ip);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (flags & S_CTIME) {
+		inode->i_ctime = *now;
+		ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec;
+		ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec;
+	}
+	if (flags & S_MTIME) {
+		inode->i_mtime = *now;
+		ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec;
+		ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec;
+	}
+	if (flags & S_ATIME) {
+		inode->i_atime = *now;
+		ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec;
+		ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec;
+	}
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
+	return xfs_trans_commit(tp, 0);
+}
+
+#define XFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+/*
+ * Call fiemap helper to fill in user data.
+ * Returns positive errors to xfs_getbmap.
+ */
+STATIC int
+xfs_fiemap_format(
+	void			**arg,
+	struct getbmapx		*bmv,
+	int			*full)
+{
+	int			error;
+	struct fiemap_extent_info *fieinfo = *arg;
+	u32			fiemap_flags = 0;
+	u64			logical, physical, length;
+
+	/* Do nothing for a hole */
+	if (bmv->bmv_block == -1LL)
+		return 0;
+
+	logical = BBTOB(bmv->bmv_offset);
+	physical = BBTOB(bmv->bmv_block);
+	length = BBTOB(bmv->bmv_length);
+
+	if (bmv->bmv_oflags & BMV_OF_PREALLOC)
+		fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
+	else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
+		fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
+				 FIEMAP_EXTENT_UNKNOWN);
+		physical = 0;   /* no block yet */
+	}
+	if (bmv->bmv_oflags & BMV_OF_LAST)
+		fiemap_flags |= FIEMAP_EXTENT_LAST;
+
+	error = fiemap_fill_next_extent(fieinfo, logical, physical,
+					length, fiemap_flags);
+	if (error > 0) {
+		error = 0;
+		*full = 1;	/* user array now full */
+	}
+
+	return error;
+}
+
+STATIC int
+xfs_vn_fiemap(
+	struct inode		*inode,
+	struct fiemap_extent_info *fieinfo,
+	u64			start,
+	u64			length)
+{
+	xfs_inode_t		*ip = XFS_I(inode);
+	struct getbmapx		bm;
+	int			error;
+
+	error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
+	if (error)
+		return error;
+
+	/* Set up bmap header for xfs internal routine */
+	bm.bmv_offset = BTOBBT(start);
+	/* Special case for whole file */
+	if (length == FIEMAP_MAX_OFFSET)
+		bm.bmv_length = -1LL;
+	else
+		bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
+
+	/* We add one because in getbmap world count includes the header */
+	bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
+					fieinfo->fi_extents_max + 1;
+	bm.bmv_count = min_t(__s32, bm.bmv_count,
+			     (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
+	bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
+	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
+		bm.bmv_iflags |= BMV_IF_ATTRFORK;
+	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
+		bm.bmv_iflags |= BMV_IF_DELALLOC;
+
+	error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
+	if (error)
+		return error;
+
+	return 0;
+}
+
+STATIC int
+xfs_vn_tmpfile(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	umode_t		mode)
+{
+	return xfs_generic_create(dir, dentry, mode, 0, true);
+}
+
+static const struct inode_operations xfs_inode_operations = {
+	.get_acl		= xfs_get_acl,
+	.set_acl		= xfs_set_acl,
+	.getattr		= xfs_vn_getattr,
+	.setattr		= xfs_vn_setattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
+	.listxattr		= xfs_vn_listxattr,
+	.fiemap			= xfs_vn_fiemap,
+	.update_time		= xfs_vn_update_time,
+};
+
+static const struct inode_operations xfs_dir_inode_operations = {
+	.create			= xfs_vn_create,
+	.lookup			= xfs_vn_lookup,
+	.link			= xfs_vn_link,
+	.unlink			= xfs_vn_unlink,
+	.symlink		= xfs_vn_symlink,
+	.mkdir			= xfs_vn_mkdir,
+	/*
+	 * Yes, XFS uses the same method for rmdir and unlink.
+	 *
+	 * There are some subtile differences deeper in the code,
+	 * but we use S_ISDIR to check for those.
+	 */
+	.rmdir			= xfs_vn_unlink,
+	.mknod			= xfs_vn_mknod,
+	.rename2		= xfs_vn_rename,
+	.get_acl		= xfs_get_acl,
+	.set_acl		= xfs_set_acl,
+	.getattr		= xfs_vn_getattr,
+	.setattr		= xfs_vn_setattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
+	.listxattr		= xfs_vn_listxattr,
+	.update_time		= xfs_vn_update_time,
+	.tmpfile		= xfs_vn_tmpfile,
+};
+
+static const struct inode_operations xfs_dir_ci_inode_operations = {
+	.create			= xfs_vn_create,
+	.lookup			= xfs_vn_ci_lookup,
+	.link			= xfs_vn_link,
+	.unlink			= xfs_vn_unlink,
+	.symlink		= xfs_vn_symlink,
+	.mkdir			= xfs_vn_mkdir,
+	/*
+	 * Yes, XFS uses the same method for rmdir and unlink.
+	 *
+	 * There are some subtile differences deeper in the code,
+	 * but we use S_ISDIR to check for those.
+	 */
+	.rmdir			= xfs_vn_unlink,
+	.mknod			= xfs_vn_mknod,
+	.rename2		= xfs_vn_rename,
+	.get_acl		= xfs_get_acl,
+	.set_acl		= xfs_set_acl,
+	.getattr		= xfs_vn_getattr,
+	.setattr		= xfs_vn_setattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
+	.listxattr		= xfs_vn_listxattr,
+	.update_time		= xfs_vn_update_time,
+	.tmpfile		= xfs_vn_tmpfile,
+};
+
+static const struct inode_operations xfs_symlink_inode_operations = {
+	.readlink		= generic_readlink,
+	.follow_link		= xfs_vn_follow_link,
+	.put_link		= kfree_put_link,
+	.getattr		= xfs_vn_getattr,
+	.setattr		= xfs_vn_setattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
+	.listxattr		= xfs_vn_listxattr,
+	.update_time		= xfs_vn_update_time,
+};
+
+STATIC void
+xfs_diflags_to_iflags(
+	struct inode		*inode,
+	struct xfs_inode	*ip)
+{
+	if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+	if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+	if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+		inode->i_flags |= S_SYNC;
+	else
+		inode->i_flags &= ~S_SYNC;
+	if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+		inode->i_flags |= S_NOATIME;
+	else
+		inode->i_flags &= ~S_NOATIME;
+}
+
+/*
+ * Initialize the Linux inode and set up the operation vectors.
+ *
+ * When reading existing inodes from disk this is called directly from xfs_iget,
+ * when creating a new inode it is called from xfs_ialloc after setting up the
+ * inode. These callers have different criteria for clearing XFS_INEW, so leave
+ * it up to the caller to deal with unlocking the inode appropriately.
+ */
+void
+xfs_setup_inode(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = &ip->i_vnode;
+	gfp_t			gfp_mask;
+
+	inode->i_ino = ip->i_ino;
+	inode->i_state = I_NEW;
+
+	inode_sb_list_add(inode);
+	/* make the inode look hashed for the writeback code */
+	hlist_add_fake(&inode->i_hash);
+
+	inode->i_mode	= ip->i_d.di_mode;
+	set_nlink(inode, ip->i_d.di_nlink);
+	inode->i_uid    = xfs_uid_to_kuid(ip->i_d.di_uid);
+	inode->i_gid    = xfs_gid_to_kgid(ip->i_d.di_gid);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		inode->i_rdev =
+			MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+			      sysv_minor(ip->i_df.if_u2.if_rdev));
+		break;
+	default:
+		inode->i_rdev = 0;
+		break;
+	}
+
+	inode->i_generation = ip->i_d.di_gen;
+	i_size_write(inode, ip->i_d.di_size);
+	inode->i_atime.tv_sec	= ip->i_d.di_atime.t_sec;
+	inode->i_atime.tv_nsec	= ip->i_d.di_atime.t_nsec;
+	inode->i_mtime.tv_sec	= ip->i_d.di_mtime.t_sec;
+	inode->i_mtime.tv_nsec	= ip->i_d.di_mtime.t_nsec;
+	inode->i_ctime.tv_sec	= ip->i_d.di_ctime.t_sec;
+	inode->i_ctime.tv_nsec	= ip->i_d.di_ctime.t_nsec;
+	xfs_diflags_to_iflags(inode, ip);
+
+	ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+	lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &xfs_inode_operations;
+		inode->i_fop = &xfs_file_operations;
+		inode->i_mapping->a_ops = &xfs_address_space_operations;
+		break;
+	case S_IFDIR:
+		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+		if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+			inode->i_op = &xfs_dir_ci_inode_operations;
+		else
+			inode->i_op = &xfs_dir_inode_operations;
+		inode->i_fop = &xfs_dir_file_operations;
+		ip->d_ops = ip->i_mount->m_dir_inode_ops;
+		break;
+	case S_IFLNK:
+		inode->i_op = &xfs_symlink_inode_operations;
+		if (!(ip->i_df.if_flags & XFS_IFINLINE))
+			inode->i_mapping->a_ops = &xfs_address_space_operations;
+		break;
+	default:
+		inode->i_op = &xfs_inode_operations;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	}
+
+	/*
+	 * Ensure all page cache allocations are done from GFP_NOFS context to
+	 * prevent direct reclaim recursion back into the filesystem and blowing
+	 * stacks or deadlocking.
+	 */
+	gfp_mask = mapping_gfp_mask(inode->i_mapping);
+	mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
+
+	/*
+	 * If there is no attribute fork no ACL can exist on this inode,
+	 * and it can't have any file capabilities attached to it either.
+	 */
+	if (!XFS_IFORK_Q(ip)) {
+		inode_has_no_xattr(inode);
+		cache_no_acl(inode);
+	}
+}
diff --git a/kernel/fs/xfs/xfs_iops.h b/kernel/fs/xfs/xfs_iops.h
new file mode 100644
index 000000000..a0f84abb0
--- /dev/null
+++ b/kernel/fs/xfs/xfs_iops.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IOPS_H__
+#define __XFS_IOPS_H__
+
+struct xfs_inode;
+
+extern const struct file_operations xfs_file_operations;
+extern const struct file_operations xfs_dir_file_operations;
+
+extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
+
+/*
+ * Internal setattr interfaces.
+ */
+#define XFS_ATTR_NOACL		0x01	/* Don't call posix_acl_chmod */
+
+extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
+extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
+			       int flags);
+extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
+
+#endif /* __XFS_IOPS_H__ */
diff --git a/kernel/fs/xfs/xfs_itable.c b/kernel/fs/xfs/xfs_itable.c
new file mode 100644
index 000000000..80429891d
--- /dev/null
+++ b/kernel/fs/xfs/xfs_itable.c
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+
+STATIC int
+xfs_internal_inum(
+	xfs_mount_t	*mp,
+	xfs_ino_t	ino)
+{
+	return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
+		(xfs_sb_version_hasquota(&mp->m_sb) &&
+		 xfs_is_quota_inode(&mp->m_sb, ino)));
+}
+
+/*
+ * Return stat information for one inode.
+ * Return 0 if ok, else errno.
+ */
+int
+xfs_bulkstat_one_int(
+	struct xfs_mount	*mp,		/* mount point for filesystem */
+	xfs_ino_t		ino,		/* inode to get data for */
+	void __user		*buffer,	/* buffer to place output in */
+	int			ubsize,		/* size of buffer */
+	bulkstat_one_fmt_pf	formatter,	/* formatter, copy to user */
+	int			*ubused,	/* bytes used by me */
+	int			*stat)		/* BULKSTAT_RV_... */
+{
+	struct xfs_icdinode	*dic;		/* dinode core info pointer */
+	struct xfs_inode	*ip;		/* incore inode pointer */
+	struct xfs_bstat	*buf;		/* return buffer */
+	int			error = 0;	/* error value */
+
+	*stat = BULKSTAT_RV_NOTHING;
+
+	if (!buffer || xfs_internal_inum(mp, ino))
+		return -EINVAL;
+
+	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+	if (!buf)
+		return -ENOMEM;
+
+	error = xfs_iget(mp, NULL, ino,
+			 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
+			 XFS_ILOCK_SHARED, &ip);
+	if (error)
+		goto out_free;
+
+	ASSERT(ip != NULL);
+	ASSERT(ip->i_imap.im_blkno != 0);
+
+	dic = &ip->i_d;
+
+	/* xfs_iget returns the following without needing
+	 * further change.
+	 */
+	buf->bs_nlink = dic->di_nlink;
+	buf->bs_projid_lo = dic->di_projid_lo;
+	buf->bs_projid_hi = dic->di_projid_hi;
+	buf->bs_ino = ino;
+	buf->bs_mode = dic->di_mode;
+	buf->bs_uid = dic->di_uid;
+	buf->bs_gid = dic->di_gid;
+	buf->bs_size = dic->di_size;
+	buf->bs_atime.tv_sec = dic->di_atime.t_sec;
+	buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
+	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
+	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
+	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
+	buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+	buf->bs_xflags = xfs_ip2xflags(ip);
+	buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
+	buf->bs_extents = dic->di_nextents;
+	buf->bs_gen = dic->di_gen;
+	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
+	buf->bs_dmevmask = dic->di_dmevmask;
+	buf->bs_dmstate = dic->di_dmstate;
+	buf->bs_aextents = dic->di_anextents;
+	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+
+	switch (dic->di_format) {
+	case XFS_DINODE_FMT_DEV:
+		buf->bs_rdev = ip->i_df.if_u2.if_rdev;
+		buf->bs_blksize = BLKDEV_IOSIZE;
+		buf->bs_blocks = 0;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_UUID:
+		buf->bs_rdev = 0;
+		buf->bs_blksize = mp->m_sb.sb_blocksize;
+		buf->bs_blocks = 0;
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		buf->bs_rdev = 0;
+		buf->bs_blksize = mp->m_sb.sb_blocksize;
+		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
+		break;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	IRELE(ip);
+
+	error = formatter(buffer, ubsize, ubused, buf);
+	if (!error)
+		*stat = BULKSTAT_RV_DIDONE;
+
+ out_free:
+	kmem_free(buf);
+	return error;
+}
+
+/* Return 0 on success or positive error */
+STATIC int
+xfs_bulkstat_one_fmt(
+	void			__user *ubuffer,
+	int			ubsize,
+	int			*ubused,
+	const xfs_bstat_t	*buffer)
+{
+	if (ubsize < sizeof(*buffer))
+		return -ENOMEM;
+	if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
+		return -EFAULT;
+	if (ubused)
+		*ubused = sizeof(*buffer);
+	return 0;
+}
+
+int
+xfs_bulkstat_one(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		__user *buffer,	/* buffer to place output in */
+	int		ubsize,		/* size of buffer */
+	int		*ubused,	/* bytes used by me */
+	int		*stat)		/* BULKSTAT_RV_... */
+{
+	return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+				    xfs_bulkstat_one_fmt, ubused, stat);
+}
+
+/*
+ * Loop over all clusters in a chunk for a given incore inode allocation btree
+ * record.  Do a readahead if there are any allocated inodes in that cluster.
+ */
+STATIC void
+xfs_bulkstat_ichunk_ra(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	xfs_agblock_t			agbno;
+	struct blk_plug			plug;
+	int				blks_per_cluster;
+	int				inodes_per_cluster;
+	int				i;	/* inode chunk index */
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+
+	blk_start_plug(&plug);
+	for (i = 0; i < XFS_INODES_PER_CHUNK;
+	     i += inodes_per_cluster, agbno += blks_per_cluster) {
+		if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
+			xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
+					     &xfs_inode_buf_ops);
+		}
+	}
+	blk_finish_plug(&plug);
+}
+
+/*
+ * Lookup the inode chunk that the given inode lives in and then get the record
+ * if we found the chunk.  If the inode was not the last in the chunk and there
+ * are some left allocated, update the data for the pointed-to record as well as
+ * return the count of grabbed inodes.
+ */
+STATIC int
+xfs_bulkstat_grab_ichunk(
+	struct xfs_btree_cur		*cur,	/* btree cursor */
+	xfs_agino_t			agino,	/* starting inode of chunk */
+	int				*icount,/* return # of inodes grabbed */
+	struct xfs_inobt_rec_incore	*irec)	/* btree record */
+{
+	int				idx;	/* index into inode chunk */
+	int				stat;
+	int				error = 0;
+
+	/* Lookup the inode chunk that this inode lives in */
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
+	if (error)
+		return error;
+	if (!stat) {
+		*icount = 0;
+		return error;
+	}
+
+	/* Get the record, should always work */
+	error = xfs_inobt_get_rec(cur, irec, &stat);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
+
+	/* Check if the record contains the inode in request */
+	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
+		*icount = 0;
+		return 0;
+	}
+
+	idx = agino - irec->ir_startino + 1;
+	if (idx < XFS_INODES_PER_CHUNK &&
+	    (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
+		int	i;
+
+		/* We got a right chunk with some left inodes allocated at it.
+		 * Grab the chunk record.  Mark all the uninteresting inodes
+		 * free -- because they're before our start point.
+		 */
+		for (i = 0; i < idx; i++) {
+			if (XFS_INOBT_MASK(i) & ~irec->ir_free)
+				irec->ir_freecount++;
+		}
+
+		irec->ir_free |= xfs_inobt_maskn(0, idx);
+		*icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+	}
+
+	return 0;
+}
+
+#define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
+
+struct xfs_bulkstat_agichunk {
+	char		__user **ac_ubuffer;/* pointer into user's buffer */
+	int		ac_ubleft;	/* bytes left in user's buffer */
+	int		ac_ubelem;	/* spaces used in user's buffer */
+};
+
+/*
+ * Process inodes in chunk with a pointer to a formatter function
+ * that will iget the inode and fill in the appropriate structure.
+ */
+static int
+xfs_bulkstat_ag_ichunk(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irbp,
+	bulkstat_one_pf			formatter,
+	size_t				statstruct_size,
+	struct xfs_bulkstat_agichunk	*acp,
+	xfs_agino_t			*last_agino)
+{
+	char				__user **ubufp = acp->ac_ubuffer;
+	int				chunkidx;
+	int				error = 0;
+	xfs_agino_t			agino = irbp->ir_startino;
+
+	for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
+	     chunkidx++, agino++) {
+		int		fmterror;
+		int		ubused;
+
+		/* inode won't fit in buffer, we are done */
+		if (acp->ac_ubleft < statstruct_size)
+			break;
+
+		/* Skip if this inode is free */
+		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
+			continue;
+
+		/* Get the inode and fill in a single buffer */
+		ubused = statstruct_size;
+		error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
+				  *ubufp, acp->ac_ubleft, &ubused, &fmterror);
+
+		if (fmterror == BULKSTAT_RV_GIVEUP ||
+		    (error && error != -ENOENT && error != -EINVAL)) {
+			acp->ac_ubleft = 0;
+			ASSERT(error);
+			break;
+		}
+
+		/* be careful not to leak error if at end of chunk */
+		if (fmterror == BULKSTAT_RV_NOTHING || error) {
+			error = 0;
+			continue;
+		}
+
+		*ubufp += ubused;
+		acp->ac_ubleft -= ubused;
+		acp->ac_ubelem++;
+	}
+
+	/*
+	 * Post-update *last_agino. At this point, agino will always point one
+	 * inode past the last inode we processed successfully. Hence we
+	 * substract that inode when setting the *last_agino cursor so that we
+	 * return the correct cookie to userspace. On the next bulkstat call,
+	 * the inode under the lastino cookie will be skipped as we have already
+	 * processed it here.
+	 */
+	*last_agino = agino - 1;
+
+	return error;
+}
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+int					/* error status */
+xfs_bulkstat(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_ino_t		*lastinop, /* last inode returned */
+	int			*ubcountp, /* size of buffer/count returned */
+	bulkstat_one_pf		formatter, /* func that'd fill a single buf */
+	size_t			statstruct_size, /* sizeof struct filling */
+	char			__user *ubuffer, /* buffer with inode stats */
+	int			*done)	/* 1 if there are more stats to get */
+{
+	xfs_buf_t		*agbp;	/* agi header buffer */
+	xfs_agino_t		agino;	/* inode # in allocation group */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
+	size_t			irbsize; /* size of irec buffer in bytes */
+	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
+	int			nirbuf;	/* size of irbuf */
+	int			ubcount; /* size of user's buffer */
+	struct xfs_bulkstat_agichunk ac;
+	int			error = 0;
+
+	/*
+	 * Get the last inode value, see if there's nothing to do.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, *lastinop);
+	agino = XFS_INO_TO_AGINO(mp, *lastinop);
+	if (agno >= mp->m_sb.sb_agcount ||
+	    *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
+		*done = 1;
+		*ubcountp = 0;
+		return 0;
+	}
+
+	ubcount = *ubcountp; /* statstruct's */
+	ac.ac_ubuffer = &ubuffer;
+	ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
+	ac.ac_ubelem = 0;
+
+	*ubcountp = 0;
+	*done = 0;
+
+	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
+	if (!irbuf)
+		return -ENOMEM;
+
+	nirbuf = irbsize / sizeof(*irbuf);
+
+	/*
+	 * Loop over the allocation groups, starting from the last
+	 * inode returned; 0 means start of the allocation group.
+	 */
+	while (agno < mp->m_sb.sb_agcount) {
+		struct xfs_inobt_rec_incore	*irbp = irbuf;
+		struct xfs_inobt_rec_incore	*irbufend = irbuf + nirbuf;
+		bool				end_of_ag = false;
+		int				icount = 0;
+		int				stat;
+
+		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+		if (error)
+			break;
+		/*
+		 * Allocate and initialize a btree cursor for ialloc btree.
+		 */
+		cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
+					    XFS_BTNUM_INO);
+		if (agino > 0) {
+			/*
+			 * In the middle of an allocation group, we need to get
+			 * the remainder of the chunk we're in.
+			 */
+			struct xfs_inobt_rec_incore	r;
+
+			error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
+			if (error)
+				goto del_cursor;
+			if (icount) {
+				irbp->ir_startino = r.ir_startino;
+				irbp->ir_freecount = r.ir_freecount;
+				irbp->ir_free = r.ir_free;
+				irbp++;
+			}
+			/* Increment to the next record */
+			error = xfs_btree_increment(cur, 0, &stat);
+		} else {
+			/* Start of ag.  Lookup the first inode chunk */
+			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
+		}
+		if (error || stat == 0) {
+			end_of_ag = true;
+			goto del_cursor;
+		}
+
+		/*
+		 * Loop through inode btree records in this ag,
+		 * until we run out of inodes or space in the buffer.
+		 */
+		while (irbp < irbufend && icount < ubcount) {
+			struct xfs_inobt_rec_incore	r;
+
+			error = xfs_inobt_get_rec(cur, &r, &stat);
+			if (error || stat == 0) {
+				end_of_ag = true;
+				goto del_cursor;
+			}
+
+			/*
+			 * If this chunk has any allocated inodes, save it.
+			 * Also start read-ahead now for this chunk.
+			 */
+			if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+				xfs_bulkstat_ichunk_ra(mp, agno, &r);
+				irbp->ir_startino = r.ir_startino;
+				irbp->ir_freecount = r.ir_freecount;
+				irbp->ir_free = r.ir_free;
+				irbp++;
+				icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
+			}
+			error = xfs_btree_increment(cur, 0, &stat);
+			if (error || stat == 0) {
+				end_of_ag = true;
+				goto del_cursor;
+			}
+			cond_resched();
+		}
+
+		/*
+		 * Drop the btree buffers and the agi buffer as we can't hold any
+		 * of the locks these represent when calling iget. If there is a
+		 * pending error, then we are done.
+		 */
+del_cursor:
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		xfs_buf_relse(agbp);
+		if (error)
+			break;
+		/*
+		 * Now format all the good inodes into the user's buffer. The
+		 * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
+		 * for the next loop iteration.
+		 */
+		irbufend = irbp;
+		for (irbp = irbuf;
+		     irbp < irbufend && ac.ac_ubleft >= statstruct_size;
+		     irbp++) {
+			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
+					formatter, statstruct_size, &ac,
+					&agino);
+			if (error)
+				break;
+
+			cond_resched();
+		}
+
+		/*
+		 * If we've run out of space or had a formatting error, we
+		 * are now done
+		 */
+		if (ac.ac_ubleft < statstruct_size || error)
+			break;
+
+		if (end_of_ag) {
+			agno++;
+			agino = 0;
+		}
+	}
+	/*
+	 * Done, we're either out of filesystem or space to put the data.
+	 */
+	kmem_free(irbuf);
+	*ubcountp = ac.ac_ubelem;
+
+	/*
+	 * We found some inodes, so clear the error status and return them.
+	 * The lastino pointer will point directly at the inode that triggered
+	 * any error that occurred, so on the next call the error will be
+	 * triggered again and propagated to userspace as there will be no
+	 * formatted inodes in the buffer.
+	 */
+	if (ac.ac_ubelem)
+		error = 0;
+
+	/*
+	 * If we ran out of filesystem, lastino will point off the end of
+	 * the filesystem so the next call will return immediately.
+	 */
+	*lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
+	if (agno >= mp->m_sb.sb_agcount)
+		*done = 1;
+
+	return error;
+}
+
+int
+xfs_inumbers_fmt(
+	void			__user *ubuffer, /* buffer to write to */
+	const struct xfs_inogrp	*buffer,	/* buffer to read from */
+	long			count,		/* # of elements to read */
+	long			*written)	/* # of bytes written */
+{
+	if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer)))
+		return -EFAULT;
+	*written = count * sizeof(*buffer);
+	return 0;
+}
+
+/*
+ * Return inode number table for the filesystem.
+ */
+int					/* error status */
+xfs_inumbers(
+	struct xfs_mount	*mp,/* mount point for filesystem */
+	xfs_ino_t		*lastino,/* last inode returned */
+	int			*count,/* size of buffer/count returned */
+	void			__user *ubuffer,/* buffer with inode descriptions */
+	inumbers_fmt_pf		formatter)
+{
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, *lastino);
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, *lastino);
+	struct xfs_btree_cur	*cur = NULL;
+	struct xfs_buf		*agbp = NULL;
+	struct xfs_inogrp	*buffer;
+	int			bcount;
+	int			left = *count;
+	int			bufidx = 0;
+	int			error = 0;
+
+	*count = 0;
+	if (agno >= mp->m_sb.sb_agcount ||
+	    *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
+		return error;
+
+	bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
+	buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
+	do {
+		struct xfs_inobt_rec_incore	r;
+		int				stat;
+
+		if (!agbp) {
+			error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+			if (error)
+				break;
+
+			cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
+						    XFS_BTNUM_INO);
+			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+						 &stat);
+			if (error)
+				break;
+			if (!stat)
+				goto next_ag;
+		}
+
+		error = xfs_inobt_get_rec(cur, &r, &stat);
+		if (error)
+			break;
+		if (!stat)
+			goto next_ag;
+
+		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+		buffer[bufidx].xi_startino =
+			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+		buffer[bufidx].xi_alloccount =
+			XFS_INODES_PER_CHUNK - r.ir_freecount;
+		buffer[bufidx].xi_allocmask = ~r.ir_free;
+		if (++bufidx == bcount) {
+			long	written;
+
+			error = formatter(ubuffer, buffer, bufidx, &written);
+			if (error)
+				break;
+			ubuffer += written;
+			*count += bufidx;
+			bufidx = 0;
+		}
+		if (!--left)
+			break;
+
+		error = xfs_btree_increment(cur, 0, &stat);
+		if (error)
+			break;
+		if (stat)
+			continue;
+
+next_ag:
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		cur = NULL;
+		xfs_buf_relse(agbp);
+		agbp = NULL;
+		agino = 0;
+		agno++;
+	} while (agno < mp->m_sb.sb_agcount);
+
+	if (!error) {
+		if (bufidx) {
+			long	written;
+
+			error = formatter(ubuffer, buffer, bufidx, &written);
+			if (!error)
+				*count += bufidx;
+		}
+		*lastino = XFS_AGINO_TO_INO(mp, agno, agino);
+	}
+
+	kmem_free(buffer);
+	if (cur)
+		xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
+					   XFS_BTREE_NOERROR));
+	if (agbp)
+		xfs_buf_relse(agbp);
+
+	return error;
+}
diff --git a/kernel/fs/xfs/xfs_itable.h b/kernel/fs/xfs/xfs_itable.h
new file mode 100644
index 000000000..6ea8b3912
--- /dev/null
+++ b/kernel/fs/xfs/xfs_itable.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ITABLE_H__
+#define	__XFS_ITABLE_H__
+
+/*
+ * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
+ * structures (by the dmi library). This is a pointer to a formatter function
+ * that will iget the inode and fill in the appropriate structure.
+ * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c
+ */
+typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
+			       xfs_ino_t	ino,
+			       void		__user *buffer,
+			       int		ubsize,
+			       int		*ubused,
+			       int		*stat);
+
+/*
+ * Values for stat return value.
+ */
+#define BULKSTAT_RV_NOTHING	0
+#define BULKSTAT_RV_DIDONE	1
+#define BULKSTAT_RV_GIVEUP	2
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+int					/* error status */
+xfs_bulkstat(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_ino_t	*lastino,	/* last inode returned */
+	int		*count,		/* size of buffer/count returned */
+	bulkstat_one_pf formatter,	/* func that'd fill a single buf */
+	size_t		statstruct_size,/* sizeof struct that we're filling */
+	char		__user *ubuffer,/* buffer with inode stats */
+	int		*done);		/* 1 if there are more stats to get */
+
+typedef int (*bulkstat_one_fmt_pf)(  /* used size in bytes or negative error */
+	void			__user *ubuffer, /* buffer to write to */
+	int			ubsize,		 /* remaining user buffer sz */
+	int			*ubused,	 /* bytes used by formatter */
+	const xfs_bstat_t	*buffer);        /* buffer to read from */
+
+int
+xfs_bulkstat_one_int(
+	xfs_mount_t		*mp,
+	xfs_ino_t		ino,
+	void			__user *buffer,
+	int			ubsize,
+	bulkstat_one_fmt_pf	formatter,
+	int			*ubused,
+	int			*stat);
+
+int
+xfs_bulkstat_one(
+	xfs_mount_t		*mp,
+	xfs_ino_t		ino,
+	void			__user *buffer,
+	int			ubsize,
+	int			*ubused,
+	int			*stat);
+
+typedef int (*inumbers_fmt_pf)(
+	void			__user *ubuffer, /* buffer to write to */
+	const xfs_inogrp_t	*buffer,	/* buffer to read from */
+	long			count,		/* # of elements to read */
+	long			*written);	/* # of bytes written */
+
+int
+xfs_inumbers_fmt(
+	void			__user *ubuffer, /* buffer to write to */
+	const xfs_inogrp_t	*buffer,	/* buffer to read from */
+	long			count,		/* # of elements to read */
+	long			*written);	/* # of bytes written */
+
+int					/* error status */
+xfs_inumbers(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_ino_t		*last,	/* last inode returned */
+	int			*count,	/* size of buffer/count returned */
+	void			__user *buffer, /* buffer with inode info */
+	inumbers_fmt_pf		formatter);
+
+#endif	/* __XFS_ITABLE_H__ */
diff --git a/kernel/fs/xfs/xfs_linux.h b/kernel/fs/xfs/xfs_linux.h
new file mode 100644
index 000000000..7c7842c85
--- /dev/null
+++ b/kernel/fs/xfs/xfs_linux.h
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_LINUX__
+#define __XFS_LINUX__
+
+#include <linux/types.h>
+
+/*
+ * Kernel specific type declarations for XFS
+ */
+typedef signed char		__int8_t;
+typedef unsigned char		__uint8_t;
+typedef signed short int	__int16_t;
+typedef unsigned short int	__uint16_t;
+typedef signed int		__int32_t;
+typedef unsigned int		__uint32_t;
+typedef signed long long int	__int64_t;
+typedef unsigned long long int	__uint64_t;
+
+typedef __uint32_t		inst_t;		/* an instruction */
+
+typedef __s64			xfs_off_t;	/* <file offset> type */
+typedef unsigned long long	xfs_ino_t;	/* <inode> type */
+typedef __s64			xfs_daddr_t;	/* <disk address> type */
+typedef char *			xfs_caddr_t;	/* <core address> type */
+typedef __u32			xfs_dev_t;
+typedef __u32			xfs_nlink_t;
+
+/* __psint_t is the same size as a pointer */
+#if (BITS_PER_LONG == 32)
+typedef __int32_t __psint_t;
+typedef __uint32_t __psunsigned_t;
+#elif (BITS_PER_LONG == 64)
+typedef __int64_t __psint_t;
+typedef __uint64_t __psunsigned_t;
+#else
+#error BITS_PER_LONG must be 32 or 64
+#endif
+
+#include "xfs_types.h"
+
+#include "kmem.h"
+#include "mrlock.h"
+#include "uuid.h"
+
+#include <linux/semaphore.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/major.h>
+#include <linux/pagemap.h>
+#include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/sort.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/delay.h>
+#include <linux/log2.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/ctype.h>
+#include <linux/writeback.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/list_sort.h>
+#include <linux/ratelimit.h>
+
+#include <asm/page.h>
+#include <asm/div64.h>
+#include <asm/param.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#include "xfs_fs.h"
+#include "xfs_stats.h"
+#include "xfs_sysctl.h"
+#include "xfs_iops.h"
+#include "xfs_aops.h"
+#include "xfs_super.h"
+#include "xfs_cksum.h"
+#include "xfs_buf.h"
+#include "xfs_message.h"
+
+#ifdef __BIG_ENDIAN
+#define XFS_NATIVE_HOST 1
+#else
+#undef XFS_NATIVE_HOST
+#endif
+
+#define irix_sgid_inherit	xfs_params.sgid_inherit.val
+#define irix_symlink_mode	xfs_params.symlink_mode.val
+#define xfs_panic_mask		xfs_params.panic_mask.val
+#define xfs_error_level		xfs_params.error_level.val
+#define xfs_syncd_centisecs	xfs_params.syncd_timer.val
+#define xfs_stats_clear		xfs_params.stats_clear.val
+#define xfs_inherit_sync	xfs_params.inherit_sync.val
+#define xfs_inherit_nodump	xfs_params.inherit_nodump.val
+#define xfs_inherit_noatime	xfs_params.inherit_noatim.val
+#define xfs_inherit_nosymlinks	xfs_params.inherit_nosym.val
+#define xfs_rotorstep		xfs_params.rotorstep.val
+#define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
+#define xfs_fstrm_centisecs	xfs_params.fstrm_timer.val
+#define xfs_eofb_secs		xfs_params.eofb_timer.val
+
+#define current_cpu()		(raw_smp_processor_id())
+#define current_pid()		(current->pid)
+#define current_test_flags(f)	(current->flags & (f))
+#define current_set_flags_nested(sp, f)		\
+		(*(sp) = current->flags, current->flags |= (f))
+#define current_clear_flags_nested(sp, f)	\
+		(*(sp) = current->flags, current->flags &= ~(f))
+#define current_restore_flags_nested(sp, f)	\
+		(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
+
+#define spinlock_destroy(lock)
+
+#define NBBY		8		/* number of bits per byte */
+
+/*
+ * Size of block device i/o is parameterized here.
+ * Currently the system supports page-sized i/o.
+ */
+#define	BLKDEV_IOSHIFT		PAGE_CACHE_SHIFT
+#define	BLKDEV_IOSIZE		(1<<BLKDEV_IOSHIFT)
+/* number of BB's per block device block */
+#define	BLKDEV_BB		BTOBB(BLKDEV_IOSIZE)
+
+#define ENOATTR		ENODATA		/* Attribute not found */
+#define EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
+#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
+#define EFSBADCRC	EBADMSG		/* Bad CRC detected */
+
+#define SYNCHRONIZE()	barrier()
+#define __return_address __builtin_return_address(0)
+
+#define XFS_PROJID_DEFAULT	0
+#define MAXPATHLEN	1024
+
+#define MIN(a,b)	(min(a,b))
+#define MAX(a,b)	(max(a,b))
+#define howmany(x, y)	(((x)+((y)-1))/(y))
+
+static inline void delay(long ticks)
+{
+	schedule_timeout_uninterruptible(ticks);
+}
+
+/*
+ * XFS wrapper structure for sysfs support. It depends on external data
+ * structures and is embedded in various internal data structures to implement
+ * the XFS sysfs object heirarchy. Define it here for broad access throughout
+ * the codebase.
+ */
+struct xfs_kobj {
+	struct kobject		kobject;
+	struct completion	complete;
+};
+
+/* Kernel uid/gid conversion. These are used to convert to/from the on disk
+ * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
+ * The conversion here is type only, the value will remain the same since we
+ * are converting to the init_user_ns. The uid is later mapped to a particular
+ * user namespace value when crossing the kernel/user boundary.
+ */
+static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
+{
+	return from_kuid(&init_user_ns, uid);
+}
+
+static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
+{
+	return make_kuid(&init_user_ns, uid);
+}
+
+static inline __uint32_t xfs_kgid_to_gid(kgid_t gid)
+{
+	return from_kgid(&init_user_ns, gid);
+}
+
+static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
+{
+	return make_kgid(&init_user_ns, gid);
+}
+
+/*
+ * Various platform dependent calls that don't fit anywhere else
+ */
+#define xfs_sort(a,n,s,fn)	sort(a,n,s,fn,NULL)
+#define xfs_stack_trace()	dump_stack()
+
+
+/* Move the kernel do_div definition off to one side */
+
+#if defined __i386__
+/* For ia32 we need to pull some tricks to get past various versions
+ * of the compiler which do not like us using do_div in the middle
+ * of large functions.
+ */
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+	__u32	mod;
+
+	switch (n) {
+		case 4:
+			mod = *(__u32 *)a % b;
+			*(__u32 *)a = *(__u32 *)a / b;
+			return mod;
+		case 8:
+			{
+			unsigned long __upper, __low, __high, __mod;
+			__u64	c = *(__u64 *)a;
+			__upper = __high = c >> 32;
+			__low = c;
+			if (__high) {
+				__upper = __high % (b);
+				__high = __high / (b);
+			}
+			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+			asm("":"=A" (c):"a" (__low),"d" (__high));
+			*(__u64 *)a = c;
+			return __mod;
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+	switch (n) {
+		case 4:
+			return *(__u32 *)a % b;
+		case 8:
+			{
+			unsigned long __upper, __low, __high, __mod;
+			__u64	c = *(__u64 *)a;
+			__upper = __high = c >> 32;
+			__low = c;
+			if (__high) {
+				__upper = __high % (b);
+				__high = __high / (b);
+			}
+			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+			asm("":"=A" (c):"a" (__low),"d" (__high));
+			return __mod;
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+#else
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+	__u32	mod;
+
+	switch (n) {
+		case 4:
+			mod = *(__u32 *)a % b;
+			*(__u32 *)a = *(__u32 *)a / b;
+			return mod;
+		case 8:
+			mod = do_div(*(__u64 *)a, b);
+			return mod;
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+	switch (n) {
+		case 4:
+			return *(__u32 *)a % b;
+		case 8:
+			{
+			__u64	c = *(__u64 *)a;
+			return do_div(c, b);
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+#endif
+
+#undef do_div
+#define do_div(a, b)	xfs_do_div(&(a), (b), sizeof(a))
+#define do_mod(a, b)	xfs_do_mod(&(a), (b), sizeof(a))
+
+static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
+{
+	x += y - 1;
+	do_div(x, y);
+	return x * y;
+}
+
+static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
+{
+	x += y - 1;
+	do_div(x, y);
+	return x;
+}
+
+/* ARM old ABI has some weird alignment/padding */
+#if defined(__arm__) && !defined(__ARM_EABI__)
+#define __arch_pack __attribute__((packed))
+#else
+#define __arch_pack
+#endif
+
+#define ASSERT_ALWAYS(expr)	\
+	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+#ifdef DEBUG
+#define ASSERT(expr)	\
+	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+#ifndef STATIC
+# define STATIC noinline
+#endif
+
+#else	/* !DEBUG */
+
+#ifdef XFS_WARN
+
+#define ASSERT(expr)	\
+	(unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
+
+#ifndef STATIC
+# define STATIC static noinline
+#endif
+
+#else	/* !DEBUG && !XFS_WARN */
+
+#define ASSERT(expr)	((void)0)
+
+#ifndef STATIC
+# define STATIC static noinline
+#endif
+
+#endif /* XFS_WARN */
+#endif /* DEBUG */
+
+#ifdef CONFIG_XFS_RT
+#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+#else
+#define XFS_IS_REALTIME_INODE(ip) (0)
+#endif
+
+#endif /* __XFS_LINUX__ */
diff --git a/kernel/fs/xfs/xfs_log.c b/kernel/fs/xfs/xfs_log.c
new file mode 100644
index 000000000..bcc7cfabb
--- /dev/null
+++ b/kernel/fs/xfs/xfs_log.c
@@ -0,0 +1,4007 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_cksum.h"
+#include "xfs_sysfs.h"
+#include "xfs_sb.h"
+
+kmem_zone_t	*xfs_log_ticket_zone;
+
+/* Local miscellaneous function prototypes */
+STATIC int
+xlog_commit_record(
+	struct xlog		*log,
+	struct xlog_ticket	*ticket,
+	struct xlog_in_core	**iclog,
+	xfs_lsn_t		*commitlsnp);
+
+STATIC struct xlog *
+xlog_alloc_log(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*log_target,
+	xfs_daddr_t		blk_offset,
+	int			num_bblks);
+STATIC int
+xlog_space_left(
+	struct xlog		*log,
+	atomic64_t		*head);
+STATIC int
+xlog_sync(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog);
+STATIC void
+xlog_dealloc_log(
+	struct xlog		*log);
+
+/* local state machine functions */
+STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
+STATIC void
+xlog_state_do_callback(
+	struct xlog		*log,
+	int			aborted,
+	struct xlog_in_core	*iclog);
+STATIC int
+xlog_state_get_iclog_space(
+	struct xlog		*log,
+	int			len,
+	struct xlog_in_core	**iclog,
+	struct xlog_ticket	*ticket,
+	int			*continued_write,
+	int			*logoffsetp);
+STATIC int
+xlog_state_release_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog);
+STATIC void
+xlog_state_switch_iclogs(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	int			eventual_size);
+STATIC void
+xlog_state_want_sync(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog);
+
+STATIC void
+xlog_grant_push_ail(
+	struct xlog		*log,
+	int			need_bytes);
+STATIC void
+xlog_regrant_reserve_log_space(
+	struct xlog		*log,
+	struct xlog_ticket	*ticket);
+STATIC void
+xlog_ungrant_log_space(
+	struct xlog		*log,
+	struct xlog_ticket	*ticket);
+
+#if defined(DEBUG)
+STATIC void
+xlog_verify_dest_ptr(
+	struct xlog		*log,
+	char			*ptr);
+STATIC void
+xlog_verify_grant_tail(
+	struct xlog *log);
+STATIC void
+xlog_verify_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	int			count,
+	bool                    syncing);
+STATIC void
+xlog_verify_tail_lsn(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	xfs_lsn_t		tail_lsn);
+#else
+#define xlog_verify_dest_ptr(a,b)
+#define xlog_verify_grant_tail(a)
+#define xlog_verify_iclog(a,b,c,d)
+#define xlog_verify_tail_lsn(a,b,c)
+#endif
+
+STATIC int
+xlog_iclogs_empty(
+	struct xlog		*log);
+
+static void
+xlog_grant_sub_space(
+	struct xlog		*log,
+	atomic64_t		*head,
+	int			bytes)
+{
+	int64_t	head_val = atomic64_read(head);
+	int64_t new, old;
+
+	do {
+		int	cycle, space;
+
+		xlog_crack_grant_head_val(head_val, &cycle, &space);
+
+		space -= bytes;
+		if (space < 0) {
+			space += log->l_logsize;
+			cycle--;
+		}
+
+		old = head_val;
+		new = xlog_assign_grant_head_val(cycle, space);
+		head_val = atomic64_cmpxchg(head, old, new);
+	} while (head_val != old);
+}
+
+static void
+xlog_grant_add_space(
+	struct xlog		*log,
+	atomic64_t		*head,
+	int			bytes)
+{
+	int64_t	head_val = atomic64_read(head);
+	int64_t new, old;
+
+	do {
+		int		tmp;
+		int		cycle, space;
+
+		xlog_crack_grant_head_val(head_val, &cycle, &space);
+
+		tmp = log->l_logsize - space;
+		if (tmp > bytes)
+			space += bytes;
+		else {
+			space = bytes - tmp;
+			cycle++;
+		}
+
+		old = head_val;
+		new = xlog_assign_grant_head_val(cycle, space);
+		head_val = atomic64_cmpxchg(head, old, new);
+	} while (head_val != old);
+}
+
+STATIC void
+xlog_grant_head_init(
+	struct xlog_grant_head	*head)
+{
+	xlog_assign_grant_head(&head->grant, 1, 0);
+	INIT_LIST_HEAD(&head->waiters);
+	spin_lock_init(&head->lock);
+}
+
+STATIC void
+xlog_grant_head_wake_all(
+	struct xlog_grant_head	*head)
+{
+	struct xlog_ticket	*tic;
+
+	spin_lock(&head->lock);
+	list_for_each_entry(tic, &head->waiters, t_queue)
+		wake_up_process(tic->t_task);
+	spin_unlock(&head->lock);
+}
+
+static inline int
+xlog_ticket_reservation(
+	struct xlog		*log,
+	struct xlog_grant_head	*head,
+	struct xlog_ticket	*tic)
+{
+	if (head == &log->l_write_head) {
+		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+		return tic->t_unit_res;
+	} else {
+		if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+			return tic->t_unit_res * tic->t_cnt;
+		else
+			return tic->t_unit_res;
+	}
+}
+
+STATIC bool
+xlog_grant_head_wake(
+	struct xlog		*log,
+	struct xlog_grant_head	*head,
+	int			*free_bytes)
+{
+	struct xlog_ticket	*tic;
+	int			need_bytes;
+
+	list_for_each_entry(tic, &head->waiters, t_queue) {
+		need_bytes = xlog_ticket_reservation(log, head, tic);
+		if (*free_bytes < need_bytes)
+			return false;
+
+		*free_bytes -= need_bytes;
+		trace_xfs_log_grant_wake_up(log, tic);
+		wake_up_process(tic->t_task);
+	}
+
+	return true;
+}
+
+STATIC int
+xlog_grant_head_wait(
+	struct xlog		*log,
+	struct xlog_grant_head	*head,
+	struct xlog_ticket	*tic,
+	int			need_bytes) __releases(&head->lock)
+					    __acquires(&head->lock)
+{
+	list_add_tail(&tic->t_queue, &head->waiters);
+
+	do {
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+		xlog_grant_push_ail(log, need_bytes);
+
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock(&head->lock);
+
+		XFS_STATS_INC(xs_sleep_logspace);
+
+		trace_xfs_log_grant_sleep(log, tic);
+		schedule();
+		trace_xfs_log_grant_wake(log, tic);
+
+		spin_lock(&head->lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+	} while (xlog_space_left(log, &head->grant) < need_bytes);
+
+	list_del_init(&tic->t_queue);
+	return 0;
+shutdown:
+	list_del_init(&tic->t_queue);
+	return -EIO;
+}
+
+/*
+ * Atomically get the log space required for a log ticket.
+ *
+ * Once a ticket gets put onto head->waiters, it will only return after the
+ * needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off head->waiters under head->lock, we
+ * only need to take that lock if we are going to add the ticket to the queue
+ * and sleep. We can avoid taking the lock if the ticket was never added to
+ * head->waiters because the t_queue list head will be empty and we hold the
+ * only reference to it so it can safely be checked unlocked.
+ */
+STATIC int
+xlog_grant_head_check(
+	struct xlog		*log,
+	struct xlog_grant_head	*head,
+	struct xlog_ticket	*tic,
+	int			*need_bytes)
+{
+	int			free_bytes;
+	int			error = 0;
+
+	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+
+	/*
+	 * If there are other waiters on the queue then give them a chance at
+	 * logspace before us.  Wake up the first waiters, if we do not wake
+	 * up all the waiters then go to sleep waiting for more free space,
+	 * otherwise try to get some space for this transaction.
+	 */
+	*need_bytes = xlog_ticket_reservation(log, head, tic);
+	free_bytes = xlog_space_left(log, &head->grant);
+	if (!list_empty_careful(&head->waiters)) {
+		spin_lock(&head->lock);
+		if (!xlog_grant_head_wake(log, head, &free_bytes) ||
+		    free_bytes < *need_bytes) {
+			error = xlog_grant_head_wait(log, head, tic,
+						     *need_bytes);
+		}
+		spin_unlock(&head->lock);
+	} else if (free_bytes < *need_bytes) {
+		spin_lock(&head->lock);
+		error = xlog_grant_head_wait(log, head, tic, *need_bytes);
+		spin_unlock(&head->lock);
+	}
+
+	return error;
+}
+
+static void
+xlog_tic_reset_res(xlog_ticket_t *tic)
+{
+	tic->t_res_num = 0;
+	tic->t_res_arr_sum = 0;
+	tic->t_res_num_ophdrs = 0;
+}
+
+static void
+xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
+{
+	if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
+		/* add to overflow and start again */
+		tic->t_res_o_flow += tic->t_res_arr_sum;
+		tic->t_res_num = 0;
+		tic->t_res_arr_sum = 0;
+	}
+
+	tic->t_res_arr[tic->t_res_num].r_len = len;
+	tic->t_res_arr[tic->t_res_num].r_type = type;
+	tic->t_res_arr_sum += len;
+	tic->t_res_num++;
+}
+
+/*
+ * Replenish the byte reservation required by moving the grant write head.
+ */
+int
+xfs_log_regrant(
+	struct xfs_mount	*mp,
+	struct xlog_ticket	*tic)
+{
+	struct xlog		*log = mp->m_log;
+	int			need_bytes;
+	int			error = 0;
+
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return -EIO;
+
+	XFS_STATS_INC(xs_try_logspace);
+
+	/*
+	 * This is a new transaction on the ticket, so we need to change the
+	 * transaction ID so that the next transaction has a different TID in
+	 * the log. Just add one to the existing tid so that we can see chains
+	 * of rolling transactions in the log easily.
+	 */
+	tic->t_tid++;
+
+	xlog_grant_push_ail(log, tic->t_unit_res);
+
+	tic->t_curr_res = tic->t_unit_res;
+	xlog_tic_reset_res(tic);
+
+	if (tic->t_cnt > 0)
+		return 0;
+
+	trace_xfs_log_regrant(log, tic);
+
+	error = xlog_grant_head_check(log, &log->l_write_head, tic,
+				      &need_bytes);
+	if (error)
+		goto out_error;
+
+	xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+	trace_xfs_log_regrant_exit(log, tic);
+	xlog_verify_grant_tail(log);
+	return 0;
+
+out_error:
+	/*
+	 * If we are failing, make sure the ticket doesn't have any current
+	 * reservations.  We don't want to add this back when the ticket/
+	 * transaction gets cancelled.
+	 */
+	tic->t_curr_res = 0;
+	tic->t_cnt = 0;	/* ungrant will give back unit_res * t_cnt. */
+	return error;
+}
+
+/*
+ * Reserve log space and return a ticket corresponding the reservation.
+ *
+ * Each reservation is going to reserve extra space for a log record header.
+ * When writes happen to the on-disk log, we don't subtract the length of the
+ * log record header from any reservation.  By wasting space in each
+ * reservation, we prevent over allocation problems.
+ */
+int
+xfs_log_reserve(
+	struct xfs_mount	*mp,
+	int		 	unit_bytes,
+	int		 	cnt,
+	struct xlog_ticket	**ticp,
+	__uint8_t	 	client,
+	bool			permanent,
+	uint		 	t_type)
+{
+	struct xlog		*log = mp->m_log;
+	struct xlog_ticket	*tic;
+	int			need_bytes;
+	int			error = 0;
+
+	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
+
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return -EIO;
+
+	XFS_STATS_INC(xs_try_logspace);
+
+	ASSERT(*ticp == NULL);
+	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
+				KM_SLEEP | KM_MAYFAIL);
+	if (!tic)
+		return -ENOMEM;
+
+	tic->t_trans_type = t_type;
+	*ticp = tic;
+
+	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
+					    : tic->t_unit_res);
+
+	trace_xfs_log_reserve(log, tic);
+
+	error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
+				      &need_bytes);
+	if (error)
+		goto out_error;
+
+	xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
+	xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+	trace_xfs_log_reserve_exit(log, tic);
+	xlog_verify_grant_tail(log);
+	return 0;
+
+out_error:
+	/*
+	 * If we are failing, make sure the ticket doesn't have any current
+	 * reservations.  We don't want to add this back when the ticket/
+	 * transaction gets cancelled.
+	 */
+	tic->t_curr_res = 0;
+	tic->t_cnt = 0;	/* ungrant will give back unit_res * t_cnt. */
+	return error;
+}
+
+
+/*
+ * NOTES:
+ *
+ *	1. currblock field gets updated at startup and after in-core logs
+ *		marked as with WANT_SYNC.
+ */
+
+/*
+ * This routine is called when a user of a log manager ticket is done with
+ * the reservation.  If the ticket was ever used, then a commit record for
+ * the associated transaction is written out as a log operation header with
+ * no data.  The flag XLOG_TIC_INITED is set when the first write occurs with
+ * a given ticket.  If the ticket was one with a permanent reservation, then
+ * a few operations are done differently.  Permanent reservation tickets by
+ * default don't release the reservation.  They just commit the current
+ * transaction with the belief that the reservation is still needed.  A flag
+ * must be passed in before permanent reservations are actually released.
+ * When these type of tickets are not released, they need to be set into
+ * the inited state again.  By doing this, a start record will be written
+ * out when the next write occurs.
+ */
+xfs_lsn_t
+xfs_log_done(
+	struct xfs_mount	*mp,
+	struct xlog_ticket	*ticket,
+	struct xlog_in_core	**iclog,
+	uint			flags)
+{
+	struct xlog		*log = mp->m_log;
+	xfs_lsn_t		lsn = 0;
+
+	if (XLOG_FORCED_SHUTDOWN(log) ||
+	    /*
+	     * If nothing was ever written, don't write out commit record.
+	     * If we get an error, just continue and give back the log ticket.
+	     */
+	    (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
+	     (xlog_commit_record(log, ticket, iclog, &lsn)))) {
+		lsn = (xfs_lsn_t) -1;
+		if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
+			flags |= XFS_LOG_REL_PERM_RESERV;
+		}
+	}
+
+
+	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
+	    (flags & XFS_LOG_REL_PERM_RESERV)) {
+		trace_xfs_log_done_nonperm(log, ticket);
+
+		/*
+		 * Release ticket if not permanent reservation or a specific
+		 * request has been made to release a permanent reservation.
+		 */
+		xlog_ungrant_log_space(log, ticket);
+		xfs_log_ticket_put(ticket);
+	} else {
+		trace_xfs_log_done_perm(log, ticket);
+
+		xlog_regrant_reserve_log_space(log, ticket);
+		/* If this ticket was a permanent reservation and we aren't
+		 * trying to release it, reset the inited flags; so next time
+		 * we write, a start record will be written out.
+		 */
+		ticket->t_flags |= XLOG_TIC_INITED;
+	}
+
+	return lsn;
+}
+
+/*
+ * Attaches a new iclog I/O completion callback routine during
+ * transaction commit.  If the log is in error state, a non-zero
+ * return code is handed back and the caller is responsible for
+ * executing the callback at an appropriate time.
+ */
+int
+xfs_log_notify(
+	struct xfs_mount	*mp,
+	struct xlog_in_core	*iclog,
+	xfs_log_callback_t	*cb)
+{
+	int	abortflg;
+
+	spin_lock(&iclog->ic_callback_lock);
+	abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
+	if (!abortflg) {
+		ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
+			      (iclog->ic_state == XLOG_STATE_WANT_SYNC));
+		cb->cb_next = NULL;
+		*(iclog->ic_callback_tail) = cb;
+		iclog->ic_callback_tail = &(cb->cb_next);
+	}
+	spin_unlock(&iclog->ic_callback_lock);
+	return abortflg;
+}
+
+int
+xfs_log_release_iclog(
+	struct xfs_mount	*mp,
+	struct xlog_in_core	*iclog)
+{
+	if (xlog_state_release_iclog(mp->m_log, iclog)) {
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Mount a log filesystem
+ *
+ * mp		- ubiquitous xfs mount point structure
+ * log_target	- buftarg of on-disk log device
+ * blk_offset	- Start block # where block size is 512 bytes (BBSIZE)
+ * num_bblocks	- Number of BBSIZE blocks in on-disk log
+ *
+ * Return error or zero.
+ */
+int
+xfs_log_mount(
+	xfs_mount_t	*mp,
+	xfs_buftarg_t	*log_target,
+	xfs_daddr_t	blk_offset,
+	int		num_bblks)
+{
+	int		error = 0;
+	int		min_logfsbs;
+
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+		xfs_notice(mp, "Mounting V%d Filesystem",
+			   XFS_SB_VERSION_NUM(&mp->m_sb));
+	} else {
+		xfs_notice(mp,
+"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
+			   XFS_SB_VERSION_NUM(&mp->m_sb));
+		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+	}
+
+	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+	if (IS_ERR(mp->m_log)) {
+		error = PTR_ERR(mp->m_log);
+		goto out;
+	}
+
+	/*
+	 * Validate the given log space and drop a critical message via syslog
+	 * if the log size is too small that would lead to some unexpected
+	 * situations in transaction log space reservation stage.
+	 *
+	 * Note: we can't just reject the mount if the validation fails.  This
+	 * would mean that people would have to downgrade their kernel just to
+	 * remedy the situation as there is no way to grow the log (short of
+	 * black magic surgery with xfs_db).
+	 *
+	 * We can, however, reject mounts for CRC format filesystems, as the
+	 * mkfs binary being used to make the filesystem should never create a
+	 * filesystem with a log that is too small.
+	 */
+	min_logfsbs = xfs_log_calc_minimum_size(mp);
+
+	if (mp->m_sb.sb_logblocks < min_logfsbs) {
+		xfs_warn(mp,
+		"Log size %d blocks too small, minimum size is %d blocks",
+			 mp->m_sb.sb_logblocks, min_logfsbs);
+		error = -EINVAL;
+	} else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
+		xfs_warn(mp,
+		"Log size %d blocks too large, maximum size is %lld blocks",
+			 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
+		error = -EINVAL;
+	} else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
+		xfs_warn(mp,
+		"log size %lld bytes too large, maximum size is %lld bytes",
+			 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
+			 XFS_MAX_LOG_BYTES);
+		error = -EINVAL;
+	}
+	if (error) {
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
+			ASSERT(0);
+			goto out_free_log;
+		}
+		xfs_crit(mp,
+"Log size out of supported range. Continuing onwards, but if log hangs are\n"
+"experienced then please report this message in the bug report.");
+	}
+
+	/*
+	 * Initialize the AIL now we have a log.
+	 */
+	error = xfs_trans_ail_init(mp);
+	if (error) {
+		xfs_warn(mp, "AIL initialisation failed: error %d", error);
+		goto out_free_log;
+	}
+	mp->m_log->l_ailp = mp->m_ail;
+
+	/*
+	 * skip log recovery on a norecovery mount.  pretend it all
+	 * just worked.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+		int	readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
+
+		if (readonly)
+			mp->m_flags &= ~XFS_MOUNT_RDONLY;
+
+		error = xlog_recover(mp->m_log);
+
+		if (readonly)
+			mp->m_flags |= XFS_MOUNT_RDONLY;
+		if (error) {
+			xfs_warn(mp, "log mount/recovery failed: error %d",
+				error);
+			goto out_destroy_ail;
+		}
+	}
+
+	error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
+			       "log");
+	if (error)
+		goto out_destroy_ail;
+
+	/* Normal transactions can now occur */
+	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+
+	/*
+	 * Now the log has been fully initialised and we know were our
+	 * space grant counters are, we can initialise the permanent ticket
+	 * needed for delayed logging to work.
+	 */
+	xlog_cil_init_post_recovery(mp->m_log);
+
+	return 0;
+
+out_destroy_ail:
+	xfs_trans_ail_destroy(mp);
+out_free_log:
+	xlog_dealloc_log(mp->m_log);
+out:
+	return error;
+}
+
+/*
+ * Finish the recovery of the file system.  This is separate from the
+ * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
+ * in the root and real-time bitmap inodes between calling xfs_log_mount() and
+ * here.
+ *
+ * If we finish recovery successfully, start the background log work. If we are
+ * not doing recovery, then we have a RO filesystem and we don't need to start
+ * it.
+ */
+int
+xfs_log_mount_finish(xfs_mount_t *mp)
+{
+	int	error = 0;
+
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+		error = xlog_recover_finish(mp->m_log);
+		if (!error)
+			xfs_log_work_queue(mp);
+	} else {
+		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+	}
+
+
+	return error;
+}
+
+/*
+ * Final log writes as part of unmount.
+ *
+ * Mark the filesystem clean as unmount happens.  Note that during relocation
+ * this routine needs to be executed as part of source-bag while the
+ * deallocation must not be done until source-end.
+ */
+
+/*
+ * Unmount record used to have a string "Unmount filesystem--" in the
+ * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
+ * We just write the magic number now since that particular field isn't
+ * currently architecture converted and "Unmount" is a bit foo.
+ * As far as I know, there weren't any dependencies on the old behaviour.
+ */
+
+int
+xfs_log_unmount_write(xfs_mount_t *mp)
+{
+	struct xlog	 *log = mp->m_log;
+	xlog_in_core_t	 *iclog;
+#ifdef DEBUG
+	xlog_in_core_t	 *first_iclog;
+#endif
+	xlog_ticket_t	*tic = NULL;
+	xfs_lsn_t	 lsn;
+	int		 error;
+
+	/*
+	 * Don't write out unmount record on read-only mounts.
+	 * Or, if we are doing a forced umount (typically because of IO errors).
+	 */
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return 0;
+
+	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+	ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
+
+#ifdef DEBUG
+	first_iclog = iclog = log->l_iclog;
+	do {
+		if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
+			ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
+			ASSERT(iclog->ic_offset == 0);
+		}
+		iclog = iclog->ic_next;
+	} while (iclog != first_iclog);
+#endif
+	if (! (XLOG_FORCED_SHUTDOWN(log))) {
+		error = xfs_log_reserve(mp, 600, 1, &tic,
+					XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
+		if (!error) {
+			/* the data section must be 32 bit size aligned */
+			struct {
+			    __uint16_t magic;
+			    __uint16_t pad1;
+			    __uint32_t pad2; /* may as well make it 64 bits */
+			} magic = {
+				.magic = XLOG_UNMOUNT_TYPE,
+			};
+			struct xfs_log_iovec reg = {
+				.i_addr = &magic,
+				.i_len = sizeof(magic),
+				.i_type = XLOG_REG_TYPE_UNMOUNT,
+			};
+			struct xfs_log_vec vec = {
+				.lv_niovecs = 1,
+				.lv_iovecp = &reg,
+			};
+
+			/* remove inited flag, and account for space used */
+			tic->t_flags = 0;
+			tic->t_curr_res -= sizeof(magic);
+			error = xlog_write(log, &vec, tic, &lsn,
+					   NULL, XLOG_UNMOUNT_TRANS);
+			/*
+			 * At this point, we're umounting anyway,
+			 * so there's no point in transitioning log state
+			 * to IOERROR. Just continue...
+			 */
+		}
+
+		if (error)
+			xfs_alert(mp, "%s: unmount record failed", __func__);
+
+
+		spin_lock(&log->l_icloglock);
+		iclog = log->l_iclog;
+		atomic_inc(&iclog->ic_refcnt);
+		xlog_state_want_sync(log, iclog);
+		spin_unlock(&log->l_icloglock);
+		error = xlog_state_release_iclog(log, iclog);
+
+		spin_lock(&log->l_icloglock);
+		if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
+		      iclog->ic_state == XLOG_STATE_DIRTY)) {
+			if (!XLOG_FORCED_SHUTDOWN(log)) {
+				xlog_wait(&iclog->ic_force_wait,
+							&log->l_icloglock);
+			} else {
+				spin_unlock(&log->l_icloglock);
+			}
+		} else {
+			spin_unlock(&log->l_icloglock);
+		}
+		if (tic) {
+			trace_xfs_log_umount_write(log, tic);
+			xlog_ungrant_log_space(log, tic);
+			xfs_log_ticket_put(tic);
+		}
+	} else {
+		/*
+		 * We're already in forced_shutdown mode, couldn't
+		 * even attempt to write out the unmount transaction.
+		 *
+		 * Go through the motions of sync'ing and releasing
+		 * the iclog, even though no I/O will actually happen,
+		 * we need to wait for other log I/Os that may already
+		 * be in progress.  Do this as a separate section of
+		 * code so we'll know if we ever get stuck here that
+		 * we're in this odd situation of trying to unmount
+		 * a file system that went into forced_shutdown as
+		 * the result of an unmount..
+		 */
+		spin_lock(&log->l_icloglock);
+		iclog = log->l_iclog;
+		atomic_inc(&iclog->ic_refcnt);
+
+		xlog_state_want_sync(log, iclog);
+		spin_unlock(&log->l_icloglock);
+		error =  xlog_state_release_iclog(log, iclog);
+
+		spin_lock(&log->l_icloglock);
+
+		if ( ! (   iclog->ic_state == XLOG_STATE_ACTIVE
+			|| iclog->ic_state == XLOG_STATE_DIRTY
+			|| iclog->ic_state == XLOG_STATE_IOERROR) ) {
+
+				xlog_wait(&iclog->ic_force_wait,
+							&log->l_icloglock);
+		} else {
+			spin_unlock(&log->l_icloglock);
+		}
+	}
+
+	return error;
+}	/* xfs_log_unmount_write */
+
+/*
+ * Empty the log for unmount/freeze.
+ *
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can write the unmount
+ * record.
+ */
+void
+xfs_log_quiesce(
+	struct xfs_mount	*mp)
+{
+	cancel_delayed_work_sync(&mp->m_log->l_work);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+	 * will push it, xfs_wait_buftarg() will not wait for it. Further,
+	 * xfs_buf_iowait() cannot be used because it was pushed with the
+	 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+	 * the IO to complete.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+	xfs_wait_buftarg(mp->m_ddev_targp);
+	xfs_buf_lock(mp->m_sb_bp);
+	xfs_buf_unlock(mp->m_sb_bp);
+
+	xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
+ *
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
+ */
+void
+xfs_log_unmount(
+	struct xfs_mount	*mp)
+{
+	xfs_log_quiesce(mp);
+
+	xfs_trans_ail_destroy(mp);
+
+	xfs_sysfs_del(&mp->m_log->l_kobj);
+
+	xlog_dealloc_log(mp->m_log);
+}
+
+void
+xfs_log_item_init(
+	struct xfs_mount	*mp,
+	struct xfs_log_item	*item,
+	int			type,
+	const struct xfs_item_ops *ops)
+{
+	item->li_mountp = mp;
+	item->li_ailp = mp->m_ail;
+	item->li_type = type;
+	item->li_ops = ops;
+	item->li_lv = NULL;
+
+	INIT_LIST_HEAD(&item->li_ail);
+	INIT_LIST_HEAD(&item->li_cil);
+}
+
+/*
+ * Wake up processes waiting for log space after we have moved the log tail.
+ */
+void
+xfs_log_space_wake(
+	struct xfs_mount	*mp)
+{
+	struct xlog		*log = mp->m_log;
+	int			free_bytes;
+
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return;
+
+	if (!list_empty_careful(&log->l_write_head.waiters)) {
+		ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+
+		spin_lock(&log->l_write_head.lock);
+		free_bytes = xlog_space_left(log, &log->l_write_head.grant);
+		xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
+		spin_unlock(&log->l_write_head.lock);
+	}
+
+	if (!list_empty_careful(&log->l_reserve_head.waiters)) {
+		ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+
+		spin_lock(&log->l_reserve_head.lock);
+		free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
+		xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
+		spin_unlock(&log->l_reserve_head.lock);
+	}
+}
+
+/*
+ * Determine if we have a transaction that has gone to disk that needs to be
+ * covered. To begin the transition to the idle state firstly the log needs to
+ * be idle. That means the CIL, the AIL and the iclogs needs to be empty before
+ * we start attempting to cover the log.
+ *
+ * Only if we are then in a state where covering is needed, the caller is
+ * informed that dummy transactions are required to move the log into the idle
+ * state.
+ *
+ * If there are any items in the AIl or CIL, then we do not want to attempt to
+ * cover the log as we may be in a situation where there isn't log space
+ * available to run a dummy transaction and this can lead to deadlocks when the
+ * tail of the log is pinned by an item that is modified in the CIL.  Hence
+ * there's no point in running a dummy transaction at this point because we
+ * can't start trying to idle the log until both the CIL and AIL are empty.
+ */
+int
+xfs_log_need_covered(xfs_mount_t *mp)
+{
+	struct xlog	*log = mp->m_log;
+	int		needed = 0;
+
+	if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
+		return 0;
+
+	if (!xlog_cil_empty(log))
+		return 0;
+
+	spin_lock(&log->l_icloglock);
+	switch (log->l_covered_state) {
+	case XLOG_STATE_COVER_DONE:
+	case XLOG_STATE_COVER_DONE2:
+	case XLOG_STATE_COVER_IDLE:
+		break;
+	case XLOG_STATE_COVER_NEED:
+	case XLOG_STATE_COVER_NEED2:
+		if (xfs_ail_min_lsn(log->l_ailp))
+			break;
+		if (!xlog_iclogs_empty(log))
+			break;
+
+		needed = 1;
+		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+			log->l_covered_state = XLOG_STATE_COVER_DONE;
+		else
+			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+		break;
+	default:
+		needed = 1;
+		break;
+	}
+	spin_unlock(&log->l_icloglock);
+	return needed;
+}
+
+/*
+ * We may be holding the log iclog lock upon entering this routine.
+ */
+xfs_lsn_t
+xlog_assign_tail_lsn_locked(
+	struct xfs_mount	*mp)
+{
+	struct xlog		*log = mp->m_log;
+	struct xfs_log_item	*lip;
+	xfs_lsn_t		tail_lsn;
+
+	assert_spin_locked(&mp->m_ail->xa_lock);
+
+	/*
+	 * To make sure we always have a valid LSN for the log tail we keep
+	 * track of the last LSN which was committed in log->l_last_sync_lsn,
+	 * and use that when the AIL was empty.
+	 */
+	lip = xfs_ail_min(mp->m_ail);
+	if (lip)
+		tail_lsn = lip->li_lsn;
+	else
+		tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+	trace_xfs_log_assign_tail_lsn(log, tail_lsn);
+	atomic64_set(&log->l_tail_lsn, tail_lsn);
+	return tail_lsn;
+}
+
+xfs_lsn_t
+xlog_assign_tail_lsn(
+	struct xfs_mount	*mp)
+{
+	xfs_lsn_t		tail_lsn;
+
+	spin_lock(&mp->m_ail->xa_lock);
+	tail_lsn = xlog_assign_tail_lsn_locked(mp);
+	spin_unlock(&mp->m_ail->xa_lock);
+
+	return tail_lsn;
+}
+
+/*
+ * Return the space in the log between the tail and the head.  The head
+ * is passed in the cycle/bytes formal parms.  In the special case where
+ * the reserve head has wrapped passed the tail, this calculation is no
+ * longer valid.  In this case, just return 0 which means there is no space
+ * in the log.  This works for all places where this function is called
+ * with the reserve head.  Of course, if the write head were to ever
+ * wrap the tail, we should blow up.  Rather than catch this case here,
+ * we depend on other ASSERTions in other parts of the code.   XXXmiken
+ *
+ * This code also handles the case where the reservation head is behind
+ * the tail.  The details of this case are described below, but the end
+ * result is that we return the size of the log as the amount of space left.
+ */
+STATIC int
+xlog_space_left(
+	struct xlog	*log,
+	atomic64_t	*head)
+{
+	int		free_bytes;
+	int		tail_bytes;
+	int		tail_cycle;
+	int		head_cycle;
+	int		head_bytes;
+
+	xlog_crack_grant_head(head, &head_cycle, &head_bytes);
+	xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
+	tail_bytes = BBTOB(tail_bytes);
+	if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
+		free_bytes = log->l_logsize - (head_bytes - tail_bytes);
+	else if (tail_cycle + 1 < head_cycle)
+		return 0;
+	else if (tail_cycle < head_cycle) {
+		ASSERT(tail_cycle == (head_cycle - 1));
+		free_bytes = tail_bytes - head_bytes;
+	} else {
+		/*
+		 * The reservation head is behind the tail.
+		 * In this case we just want to return the size of the
+		 * log as the amount of space left.
+		 */
+		xfs_alert(log->l_mp,
+			"xlog_space_left: head behind tail\n"
+			"  tail_cycle = %d, tail_bytes = %d\n"
+			"  GH   cycle = %d, GH   bytes = %d",
+			tail_cycle, tail_bytes, head_cycle, head_bytes);
+		ASSERT(0);
+		free_bytes = log->l_logsize;
+	}
+	return free_bytes;
+}
+
+
+/*
+ * Log function which is called when an io completes.
+ *
+ * The log manager needs its own routine, in order to control what
+ * happens with the buffer after the write completes.
+ */
+void
+xlog_iodone(xfs_buf_t *bp)
+{
+	struct xlog_in_core	*iclog = bp->b_fspriv;
+	struct xlog		*l = iclog->ic_log;
+	int			aborted = 0;
+
+	/*
+	 * Race to shutdown the filesystem if we see an error.
+	 */
+	if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
+			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
+		xfs_buf_ioerror_alert(bp, __func__);
+		xfs_buf_stale(bp);
+		xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
+		/*
+		 * This flag will be propagated to the trans-committed
+		 * callback routines to let them know that the log-commit
+		 * didn't succeed.
+		 */
+		aborted = XFS_LI_ABORTED;
+	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		aborted = XFS_LI_ABORTED;
+	}
+
+	/* log I/O is always issued ASYNC */
+	ASSERT(XFS_BUF_ISASYNC(bp));
+	xlog_state_done_syncing(iclog, aborted);
+
+	/*
+	 * drop the buffer lock now that we are done. Nothing references
+	 * the buffer after this, so an unmount waiting on this lock can now
+	 * tear it down safely. As such, it is unsafe to reference the buffer
+	 * (bp) after the unlock as we could race with it being freed.
+	 */
+	xfs_buf_unlock(bp);
+}
+
+/*
+ * Return size of each in-core log record buffer.
+ *
+ * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
+ *
+ * If the filesystem blocksize is too large, we may need to choose a
+ * larger size since the directory code currently logs entire blocks.
+ */
+
+STATIC void
+xlog_get_iclog_buffer_size(
+	struct xfs_mount	*mp,
+	struct xlog		*log)
+{
+	int size;
+	int xhdrs;
+
+	if (mp->m_logbufs <= 0)
+		log->l_iclog_bufs = XLOG_MAX_ICLOGS;
+	else
+		log->l_iclog_bufs = mp->m_logbufs;
+
+	/*
+	 * Buffer size passed in from mount system call.
+	 */
+	if (mp->m_logbsize > 0) {
+		size = log->l_iclog_size = mp->m_logbsize;
+		log->l_iclog_size_log = 0;
+		while (size != 1) {
+			log->l_iclog_size_log++;
+			size >>= 1;
+		}
+
+		if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+			/* # headers = size / 32k
+			 * one header holds cycles from 32k of data
+			 */
+
+			xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
+			if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
+				xhdrs++;
+			log->l_iclog_hsize = xhdrs << BBSHIFT;
+			log->l_iclog_heads = xhdrs;
+		} else {
+			ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
+			log->l_iclog_hsize = BBSIZE;
+			log->l_iclog_heads = 1;
+		}
+		goto done;
+	}
+
+	/* All machines use 32kB buffers by default. */
+	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
+	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
+
+	/* the default log size is 16k or 32k which is one header sector */
+	log->l_iclog_hsize = BBSIZE;
+	log->l_iclog_heads = 1;
+
+done:
+	/* are we being asked to make the sizes selected above visible? */
+	if (mp->m_logbufs == 0)
+		mp->m_logbufs = log->l_iclog_bufs;
+	if (mp->m_logbsize == 0)
+		mp->m_logbsize = log->l_iclog_size;
+}	/* xlog_get_iclog_buffer_size */
+
+
+void
+xfs_log_work_queue(
+	struct xfs_mount        *mp)
+{
+	queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+				msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+void
+xfs_log_worker(
+	struct work_struct	*work)
+{
+	struct xlog		*log = container_of(to_delayed_work(work),
+						struct xlog, l_work);
+	struct xfs_mount	*mp = log->l_mp;
+
+	/* dgc: errors ignored - not fatal and nowhere to report them */
+	if (xfs_log_need_covered(mp)) {
+		/*
+		 * Dump a transaction into the log that contains no real change.
+		 * This is needed to stamp the current tail LSN into the log
+		 * during the covering operation.
+		 *
+		 * We cannot use an inode here for this - that will push dirty
+		 * state back up into the VFS and then periodic inode flushing
+		 * will prevent log covering from making progress. Hence we
+		 * synchronously log the superblock instead to ensure the
+		 * superblock is immediately unpinned and can be written back.
+		 */
+		xfs_sync_sb(mp, true);
+	} else
+		xfs_log_force(mp, 0);
+
+	/* start pushing all the metadata that is currently dirty */
+	xfs_ail_push_all(mp->m_ail);
+
+	/* queue us up again */
+	xfs_log_work_queue(mp);
+}
+
+/*
+ * This routine initializes some of the log structure for a given mount point.
+ * Its primary purpose is to fill in enough, so recovery can occur.  However,
+ * some other stuff may be filled in too.
+ */
+STATIC struct xlog *
+xlog_alloc_log(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*log_target,
+	xfs_daddr_t		blk_offset,
+	int			num_bblks)
+{
+	struct xlog		*log;
+	xlog_rec_header_t	*head;
+	xlog_in_core_t		**iclogp;
+	xlog_in_core_t		*iclog, *prev_iclog=NULL;
+	xfs_buf_t		*bp;
+	int			i;
+	int			error = -ENOMEM;
+	uint			log2_size = 0;
+
+	log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
+	if (!log) {
+		xfs_warn(mp, "Log allocation failed: No memory!");
+		goto out;
+	}
+
+	log->l_mp	   = mp;
+	log->l_targ	   = log_target;
+	log->l_logsize     = BBTOB(num_bblks);
+	log->l_logBBstart  = blk_offset;
+	log->l_logBBsize   = num_bblks;
+	log->l_covered_state = XLOG_STATE_COVER_IDLE;
+	log->l_flags	   |= XLOG_ACTIVE_RECOVERY;
+	INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
+
+	log->l_prev_block  = -1;
+	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
+	xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
+	xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
+	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
+
+	xlog_grant_head_init(&log->l_reserve_head);
+	xlog_grant_head_init(&log->l_write_head);
+
+	error = -EFSCORRUPTED;
+	if (xfs_sb_version_hassector(&mp->m_sb)) {
+	        log2_size = mp->m_sb.sb_logsectlog;
+		if (log2_size < BBSHIFT) {
+			xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
+				log2_size, BBSHIFT);
+			goto out_free_log;
+		}
+
+	        log2_size -= BBSHIFT;
+		if (log2_size > mp->m_sectbb_log) {
+			xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
+				log2_size, mp->m_sectbb_log);
+			goto out_free_log;
+		}
+
+		/* for larger sector sizes, must have v2 or external log */
+		if (log2_size && log->l_logBBstart > 0 &&
+			    !xfs_sb_version_haslogv2(&mp->m_sb)) {
+			xfs_warn(mp,
+		"log sector size (0x%x) invalid for configuration.",
+				log2_size);
+			goto out_free_log;
+		}
+	}
+	log->l_sectBBsize = 1 << log2_size;
+
+	xlog_get_iclog_buffer_size(mp, log);
+
+	/*
+	 * Use a NULL block for the extra log buffer used during splits so that
+	 * it will trigger errors if we ever try to do IO on it without first
+	 * having set it up properly.
+	 */
+	error = -ENOMEM;
+	bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
+			   BTOBB(log->l_iclog_size), 0);
+	if (!bp)
+		goto out_free_log;
+
+	/*
+	 * The iclogbuf buffer locks are held over IO but we are not going to do
+	 * IO yet.  Hence unlock the buffer so that the log IO path can grab it
+	 * when appropriately.
+	 */
+	ASSERT(xfs_buf_islocked(bp));
+	xfs_buf_unlock(bp);
+
+	/* use high priority wq for log I/O completion */
+	bp->b_ioend_wq = mp->m_log_workqueue;
+	bp->b_iodone = xlog_iodone;
+	log->l_xbuf = bp;
+
+	spin_lock_init(&log->l_icloglock);
+	init_waitqueue_head(&log->l_flush_wait);
+
+	iclogp = &log->l_iclog;
+	/*
+	 * The amount of memory to allocate for the iclog structure is
+	 * rather funky due to the way the structure is defined.  It is
+	 * done this way so that we can use different sizes for machines
+	 * with different amounts of memory.  See the definition of
+	 * xlog_in_core_t in xfs_log_priv.h for details.
+	 */
+	ASSERT(log->l_iclog_size >= 4096);
+	for (i=0; i < log->l_iclog_bufs; i++) {
+		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
+		if (!*iclogp)
+			goto out_free_iclog;
+
+		iclog = *iclogp;
+		iclog->ic_prev = prev_iclog;
+		prev_iclog = iclog;
+
+		bp = xfs_buf_get_uncached(mp->m_logdev_targp,
+						BTOBB(log->l_iclog_size), 0);
+		if (!bp)
+			goto out_free_iclog;
+
+		ASSERT(xfs_buf_islocked(bp));
+		xfs_buf_unlock(bp);
+
+		/* use high priority wq for log I/O completion */
+		bp->b_ioend_wq = mp->m_log_workqueue;
+		bp->b_iodone = xlog_iodone;
+		iclog->ic_bp = bp;
+		iclog->ic_data = bp->b_addr;
+#ifdef DEBUG
+		log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+#endif
+		head = &iclog->ic_header;
+		memset(head, 0, sizeof(xlog_rec_header_t));
+		head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
+		head->h_version = cpu_to_be32(
+			xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
+		head->h_size = cpu_to_be32(log->l_iclog_size);
+		/* new fields */
+		head->h_fmt = cpu_to_be32(XLOG_FMT);
+		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+
+		iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
+		iclog->ic_state = XLOG_STATE_ACTIVE;
+		iclog->ic_log = log;
+		atomic_set(&iclog->ic_refcnt, 0);
+		spin_lock_init(&iclog->ic_callback_lock);
+		iclog->ic_callback_tail = &(iclog->ic_callback);
+		iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
+
+		init_waitqueue_head(&iclog->ic_force_wait);
+		init_waitqueue_head(&iclog->ic_write_wait);
+
+		iclogp = &iclog->ic_next;
+	}
+	*iclogp = log->l_iclog;			/* complete ring */
+	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
+
+	error = xlog_cil_init(log);
+	if (error)
+		goto out_free_iclog;
+	return log;
+
+out_free_iclog:
+	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
+		prev_iclog = iclog->ic_next;
+		if (iclog->ic_bp)
+			xfs_buf_free(iclog->ic_bp);
+		kmem_free(iclog);
+	}
+	spinlock_destroy(&log->l_icloglock);
+	xfs_buf_free(log->l_xbuf);
+out_free_log:
+	kmem_free(log);
+out:
+	return ERR_PTR(error);
+}	/* xlog_alloc_log */
+
+
+/*
+ * Write out the commit record of a transaction associated with the given
+ * ticket.  Return the lsn of the commit record.
+ */
+STATIC int
+xlog_commit_record(
+	struct xlog		*log,
+	struct xlog_ticket	*ticket,
+	struct xlog_in_core	**iclog,
+	xfs_lsn_t		*commitlsnp)
+{
+	struct xfs_mount *mp = log->l_mp;
+	int	error;
+	struct xfs_log_iovec reg = {
+		.i_addr = NULL,
+		.i_len = 0,
+		.i_type = XLOG_REG_TYPE_COMMIT,
+	};
+	struct xfs_log_vec vec = {
+		.lv_niovecs = 1,
+		.lv_iovecp = &reg,
+	};
+
+	ASSERT_ALWAYS(iclog);
+	error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
+					XLOG_COMMIT_TRANS);
+	if (error)
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+	return error;
+}
+
+/*
+ * Push on the buffer cache code if we ever use more than 75% of the on-disk
+ * log space.  This code pushes on the lsn which would supposedly free up
+ * the 25% which we want to leave free.  We may need to adopt a policy which
+ * pushes on an lsn which is further along in the log once we reach the high
+ * water mark.  In this manner, we would be creating a low water mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+	struct xlog	*log,
+	int		need_bytes)
+{
+	xfs_lsn_t	threshold_lsn = 0;
+	xfs_lsn_t	last_sync_lsn;
+	int		free_blocks;
+	int		free_bytes;
+	int		threshold_block;
+	int		threshold_cycle;
+	int		free_threshold;
+
+	ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+
+	free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
+	free_blocks = BTOBBT(free_bytes);
+
+	/*
+	 * Set the threshold for the minimum number of free blocks in the
+	 * log to the maximum of what the caller needs, one quarter of the
+	 * log, and 256 blocks.
+	 */
+	free_threshold = BTOBB(need_bytes);
+	free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
+	free_threshold = MAX(free_threshold, 256);
+	if (free_blocks >= free_threshold)
+		return;
+
+	xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
+						&threshold_block);
+	threshold_block += free_threshold;
+	if (threshold_block >= log->l_logBBsize) {
+		threshold_block -= log->l_logBBsize;
+		threshold_cycle += 1;
+	}
+	threshold_lsn = xlog_assign_lsn(threshold_cycle,
+					threshold_block);
+	/*
+	 * Don't pass in an lsn greater than the lsn of the last
+	 * log record known to be on disk. Use a snapshot of the last sync lsn
+	 * so that it doesn't change between the compare and the set.
+	 */
+	last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+	if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
+		threshold_lsn = last_sync_lsn;
+
+	/*
+	 * Get the transaction layer to kick the dirty buffers out to
+	 * disk asynchronously. No point in trying to do this if
+	 * the filesystem is shutting down.
+	 */
+	if (!XLOG_FORCED_SHUTDOWN(log))
+		xfs_ail_push(log->l_ailp, threshold_lsn);
+}
+
+/*
+ * Stamp cycle number in every block
+ */
+STATIC void
+xlog_pack_data(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	int			roundoff)
+{
+	int			i, j, k;
+	int			size = iclog->ic_offset + roundoff;
+	__be32			cycle_lsn;
+	xfs_caddr_t		dp;
+
+	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+
+	dp = iclog->ic_datap;
+	for (i = 0; i < BTOBB(size); i++) {
+		if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
+			break;
+		iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+		*(__be32 *)dp = cycle_lsn;
+		dp += BBSIZE;
+	}
+
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		xlog_in_core_2_t *xhdr = iclog->ic_data;
+
+		for ( ; i < BTOBB(size); i++) {
+			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
+			*(__be32 *)dp = cycle_lsn;
+			dp += BBSIZE;
+		}
+
+		for (i = 1; i < log->l_iclog_heads; i++)
+			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+	}
+}
+
+/*
+ * Calculate the checksum for a log buffer.
+ *
+ * This is a little more complicated than it should be because the various
+ * headers and the actual data are non-contiguous.
+ */
+__le32
+xlog_cksum(
+	struct xlog		*log,
+	struct xlog_rec_header	*rhead,
+	char			*dp,
+	int			size)
+{
+	__uint32_t		crc;
+
+	/* first generate the crc for the record header ... */
+	crc = xfs_start_cksum((char *)rhead,
+			      sizeof(struct xlog_rec_header),
+			      offsetof(struct xlog_rec_header, h_crc));
+
+	/* ... then for additional cycle data for v2 logs ... */
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
+		int		i;
+
+		for (i = 1; i < log->l_iclog_heads; i++) {
+			crc = crc32c(crc, &xhdr[i].hic_xheader,
+				     sizeof(struct xlog_rec_ext_header));
+		}
+	}
+
+	/* ... and finally for the payload */
+	crc = crc32c(crc, dp, size);
+
+	return xfs_end_cksum(crc);
+}
+
+/*
+ * The bdstrat callback function for log bufs. This gives us a central
+ * place to trap bufs in case we get hit by a log I/O error and need to
+ * shutdown. Actually, in practice, even when we didn't get a log error,
+ * we transition the iclogs to IOERROR state *after* flushing all existing
+ * iclogs to disk. This is because we don't want anymore new transactions to be
+ * started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need to
+ * ensure that completes before tearing down the iclogbufs. Hence we need to
+ * hold the buffer lock across the log IO to acheive that.
+ */
+STATIC int
+xlog_bdstrat(
+	struct xfs_buf		*bp)
+{
+	struct xlog_in_core	*iclog = bp->b_fspriv;
+
+	xfs_buf_lock(bp);
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		xfs_buf_ioerror(bp, -EIO);
+		xfs_buf_stale(bp);
+		xfs_buf_ioend(bp);
+		/*
+		 * It would seem logical to return EIO here, but we rely on
+		 * the log state machine to propagate I/O errors instead of
+		 * doing it here. Similarly, IO completion will unlock the
+		 * buffer, so we don't do it here.
+		 */
+		return 0;
+	}
+
+	xfs_buf_submit(bp);
+	return 0;
+}
+
+/*
+ * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 
+ * fashion.  Previously, we should have moved the current iclog
+ * ptr in the log to point to the next available iclog.  This allows further
+ * write to continue while this code syncs out an iclog ready to go.
+ * Before an in-core log can be written out, the data section must be scanned
+ * to save away the 1st word of each BBSIZE block into the header.  We replace
+ * it with the current cycle count.  Each BBSIZE block is tagged with the
+ * cycle count because there in an implicit assumption that drives will
+ * guarantee that entire 512 byte blocks get written at once.  In other words,
+ * we can't have part of a 512 byte block written and part not written.  By
+ * tagging each block, we will know which blocks are valid when recovering
+ * after an unclean shutdown.
+ *
+ * This routine is single threaded on the iclog.  No other thread can be in
+ * this routine with the same iclog.  Changing contents of iclog can there-
+ * fore be done without grabbing the state machine lock.  Updating the global
+ * log will require grabbing the lock though.
+ *
+ * The entire log manager uses a logical block numbering scheme.  Only
+ * log_sync (and then only bwrite()) know about the fact that the log may
+ * not start with block zero on a given device.  The log block start offset
+ * is added immediately before calling bwrite().
+ */
+
+STATIC int
+xlog_sync(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog)
+{
+	xfs_buf_t	*bp;
+	int		i;
+	uint		count;		/* byte count of bwrite */
+	uint		count_init;	/* initial count before roundup */
+	int		roundoff;       /* roundoff to BB or stripe */
+	int		split = 0;	/* split write into two regions */
+	int		error;
+	int		v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
+	int		size;
+
+	XFS_STATS_INC(xs_log_writes);
+	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+
+	/* Add for LR header */
+	count_init = log->l_iclog_hsize + iclog->ic_offset;
+
+	/* Round out the log write size */
+	if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
+		/* we have a v2 stripe unit to use */
+		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
+	} else {
+		count = BBTOB(BTOBB(count_init));
+	}
+	roundoff = count - count_init;
+	ASSERT(roundoff >= 0);
+	ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && 
+                roundoff < log->l_mp->m_sb.sb_logsunit)
+		|| 
+		(log->l_mp->m_sb.sb_logsunit <= 1 && 
+		 roundoff < BBTOB(1)));
+
+	/* move grant heads by roundoff in sync */
+	xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+	xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+
+	/* put cycle number in every block */
+	xlog_pack_data(log, iclog, roundoff); 
+
+	/* real byte length */
+	size = iclog->ic_offset;
+	if (v2)
+		size += roundoff;
+	iclog->ic_header.h_len = cpu_to_be32(size);
+
+	bp = iclog->ic_bp;
+	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
+
+	XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
+
+	/* Do we need to split this write into 2 parts? */
+	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+		char		*dptr;
+
+		split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
+		count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
+		iclog->ic_bwritecnt = 2;
+
+		/*
+		 * Bump the cycle numbers at the start of each block in the
+		 * part of the iclog that ends up in the buffer that gets
+		 * written to the start of the log.
+		 *
+		 * Watch out for the header magic number case, though.
+		 */
+		dptr = (char *)&iclog->ic_header + count;
+		for (i = 0; i < split; i += BBSIZE) {
+			__uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
+			if (++cycle == XLOG_HEADER_MAGIC_NUM)
+				cycle++;
+			*(__be32 *)dptr = cpu_to_be32(cycle);
+
+			dptr += BBSIZE;
+		}
+	} else {
+		iclog->ic_bwritecnt = 1;
+	}
+
+	/* calculcate the checksum */
+	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
+					    iclog->ic_datap, size);
+
+	bp->b_io_length = BTOBB(count);
+	bp->b_fspriv = iclog;
+	XFS_BUF_ZEROFLAGS(bp);
+	XFS_BUF_ASYNC(bp);
+	bp->b_flags |= XBF_SYNCIO;
+
+	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
+		bp->b_flags |= XBF_FUA;
+
+		/*
+		 * Flush the data device before flushing the log to make
+		 * sure all meta data written back from the AIL actually made
+		 * it to disk before stamping the new log tail LSN into the
+		 * log buffer.  For an external log we need to issue the
+		 * flush explicitly, and unfortunately synchronously here;
+		 * for an internal log we can simply use the block layer
+		 * state machine for preflushes.
+		 */
+		if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+			xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
+		else
+			bp->b_flags |= XBF_FLUSH;
+	}
+
+	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
+	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
+
+	xlog_verify_iclog(log, iclog, count, true);
+
+	/* account for log which doesn't start at block #0 */
+	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+	/*
+	 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
+	 * is shutting down.
+	 */
+	XFS_BUF_WRITE(bp);
+
+	error = xlog_bdstrat(bp);
+	if (error) {
+		xfs_buf_ioerror_alert(bp, "xlog_sync");
+		return error;
+	}
+	if (split) {
+		bp = iclog->ic_log->l_xbuf;
+		XFS_BUF_SET_ADDR(bp, 0);	     /* logical 0 */
+		xfs_buf_associate_memory(bp,
+				(char *)&iclog->ic_header + count, split);
+		bp->b_fspriv = iclog;
+		XFS_BUF_ZEROFLAGS(bp);
+		XFS_BUF_ASYNC(bp);
+		bp->b_flags |= XBF_SYNCIO;
+		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+			bp->b_flags |= XBF_FUA;
+
+		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
+		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
+
+		/* account for internal log which doesn't start at block #0 */
+		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+		XFS_BUF_WRITE(bp);
+		error = xlog_bdstrat(bp);
+		if (error) {
+			xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
+			return error;
+		}
+	}
+	return 0;
+}	/* xlog_sync */
+
+/*
+ * Deallocate a log structure
+ */
+STATIC void
+xlog_dealloc_log(
+	struct xlog	*log)
+{
+	xlog_in_core_t	*iclog, *next_iclog;
+	int		i;
+
+	xlog_cil_destroy(log);
+
+	/*
+	 * Cycle all the iclogbuf locks to make sure all log IO completion
+	 * is done before we tear down these buffers.
+	 */
+	iclog = log->l_iclog;
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		xfs_buf_lock(iclog->ic_bp);
+		xfs_buf_unlock(iclog->ic_bp);
+		iclog = iclog->ic_next;
+	}
+
+	/*
+	 * Always need to ensure that the extra buffer does not point to memory
+	 * owned by another log buffer before we free it. Also, cycle the lock
+	 * first to ensure we've completed IO on it.
+	 */
+	xfs_buf_lock(log->l_xbuf);
+	xfs_buf_unlock(log->l_xbuf);
+	xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
+	xfs_buf_free(log->l_xbuf);
+
+	iclog = log->l_iclog;
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		xfs_buf_free(iclog->ic_bp);
+		next_iclog = iclog->ic_next;
+		kmem_free(iclog);
+		iclog = next_iclog;
+	}
+	spinlock_destroy(&log->l_icloglock);
+
+	log->l_mp->m_log = NULL;
+	kmem_free(log);
+}	/* xlog_dealloc_log */
+
+/*
+ * Update counters atomically now that memcpy is done.
+ */
+/* ARGSUSED */
+static inline void
+xlog_state_finish_copy(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	int			record_cnt,
+	int			copy_bytes)
+{
+	spin_lock(&log->l_icloglock);
+
+	be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
+	iclog->ic_offset += copy_bytes;
+
+	spin_unlock(&log->l_icloglock);
+}	/* xlog_state_finish_copy */
+
+
+
+
+/*
+ * print out info relating to regions written which consume
+ * the reservation
+ */
+void
+xlog_print_tic_res(
+	struct xfs_mount	*mp,
+	struct xlog_ticket	*ticket)
+{
+	uint i;
+	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
+
+	/* match with XLOG_REG_TYPE_* in xfs_log.h */
+	static char *res_type_str[XLOG_REG_TYPE_MAX] = {
+	    "bformat",
+	    "bchunk",
+	    "efi_format",
+	    "efd_format",
+	    "iformat",
+	    "icore",
+	    "iext",
+	    "ibroot",
+	    "ilocal",
+	    "iattr_ext",
+	    "iattr_broot",
+	    "iattr_local",
+	    "qformat",
+	    "dquot",
+	    "quotaoff",
+	    "LR header",
+	    "unmount",
+	    "commit",
+	    "trans header"
+	};
+	static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
+	    "SETATTR_NOT_SIZE",
+	    "SETATTR_SIZE",
+	    "INACTIVE",
+	    "CREATE",
+	    "CREATE_TRUNC",
+	    "TRUNCATE_FILE",
+	    "REMOVE",
+	    "LINK",
+	    "RENAME",
+	    "MKDIR",
+	    "RMDIR",
+	    "SYMLINK",
+	    "SET_DMATTRS",
+	    "GROWFS",
+	    "STRAT_WRITE",
+	    "DIOSTRAT",
+	    "WRITE_SYNC",
+	    "WRITEID",
+	    "ADDAFORK",
+	    "ATTRINVAL",
+	    "ATRUNCATE",
+	    "ATTR_SET",
+	    "ATTR_RM",
+	    "ATTR_FLAG",
+	    "CLEAR_AGI_BUCKET",
+	    "QM_SBCHANGE",
+	    "DUMMY1",
+	    "DUMMY2",
+	    "QM_QUOTAOFF",
+	    "QM_DQALLOC",
+	    "QM_SETQLIM",
+	    "QM_DQCLUSTER",
+	    "QM_QINOCREATE",
+	    "QM_QUOTAOFF_END",
+	    "SB_UNIT",
+	    "FSYNC_TS",
+	    "GROWFSRT_ALLOC",
+	    "GROWFSRT_ZERO",
+	    "GROWFSRT_FREE",
+	    "SWAPEXT"
+	};
+
+	xfs_warn(mp,
+		"xlog_write: reservation summary:\n"
+		"  trans type  = %s (%u)\n"
+		"  unit res    = %d bytes\n"
+		"  current res = %d bytes\n"
+		"  total reg   = %u bytes (o/flow = %u bytes)\n"
+		"  ophdrs      = %u (ophdr space = %u bytes)\n"
+		"  ophdr + reg = %u bytes\n"
+		"  num regions = %u",
+		((ticket->t_trans_type <= 0 ||
+		  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+		  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+		ticket->t_trans_type,
+		ticket->t_unit_res,
+		ticket->t_curr_res,
+		ticket->t_res_arr_sum, ticket->t_res_o_flow,
+		ticket->t_res_num_ophdrs, ophdr_spc,
+		ticket->t_res_arr_sum +
+		ticket->t_res_o_flow + ophdr_spc,
+		ticket->t_res_num);
+
+	for (i = 0; i < ticket->t_res_num; i++) {
+		uint r_type = ticket->t_res_arr[i].r_type;
+		xfs_warn(mp, "region[%u]: %s - %u bytes", i,
+			    ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
+			    "bad-rtype" : res_type_str[r_type-1]),
+			    ticket->t_res_arr[i].r_len);
+	}
+
+	xfs_alert_tag(mp, XFS_PTAG_LOGRES,
+		"xlog_write: reservation ran out. Need to up reservation");
+	xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+}
+
+/*
+ * Calculate the potential space needed by the log vector.  Each region gets
+ * its own xlog_op_header_t and may need to be double word aligned.
+ */
+static int
+xlog_write_calc_vec_length(
+	struct xlog_ticket	*ticket,
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+	int			headers = 0;
+	int			len = 0;
+	int			i;
+
+	/* acct for start rec of xact */
+	if (ticket->t_flags & XLOG_TIC_INITED)
+		headers++;
+
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		/* we don't write ordered log vectors */
+		if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
+			continue;
+
+		headers += lv->lv_niovecs;
+
+		for (i = 0; i < lv->lv_niovecs; i++) {
+			struct xfs_log_iovec	*vecp = &lv->lv_iovecp[i];
+
+			len += vecp->i_len;
+			xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
+		}
+	}
+
+	ticket->t_res_num_ophdrs += headers;
+	len += headers * sizeof(struct xlog_op_header);
+
+	return len;
+}
+
+/*
+ * If first write for transaction, insert start record  We can't be trying to
+ * commit if we are inited.  We can't have any "partial_copy" if we are inited.
+ */
+static int
+xlog_write_start_rec(
+	struct xlog_op_header	*ophdr,
+	struct xlog_ticket	*ticket)
+{
+	if (!(ticket->t_flags & XLOG_TIC_INITED))
+		return 0;
+
+	ophdr->oh_tid	= cpu_to_be32(ticket->t_tid);
+	ophdr->oh_clientid = ticket->t_clientid;
+	ophdr->oh_len = 0;
+	ophdr->oh_flags = XLOG_START_TRANS;
+	ophdr->oh_res2 = 0;
+
+	ticket->t_flags &= ~XLOG_TIC_INITED;
+
+	return sizeof(struct xlog_op_header);
+}
+
+static xlog_op_header_t *
+xlog_write_setup_ophdr(
+	struct xlog		*log,
+	struct xlog_op_header	*ophdr,
+	struct xlog_ticket	*ticket,
+	uint			flags)
+{
+	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+	ophdr->oh_clientid = ticket->t_clientid;
+	ophdr->oh_res2 = 0;
+
+	/* are we copying a commit or unmount record? */
+	ophdr->oh_flags = flags;
+
+	/*
+	 * We've seen logs corrupted with bad transaction client ids.  This
+	 * makes sure that XFS doesn't generate them on.  Turn this into an EIO
+	 * and shut down the filesystem.
+	 */
+	switch (ophdr->oh_clientid)  {
+	case XFS_TRANSACTION:
+	case XFS_VOLUME:
+	case XFS_LOG:
+		break;
+	default:
+		xfs_warn(log->l_mp,
+			"Bad XFS transaction clientid 0x%x in ticket 0x%p",
+			ophdr->oh_clientid, ticket);
+		return NULL;
+	}
+
+	return ophdr;
+}
+
+/*
+ * Set up the parameters of the region copy into the log. This has
+ * to handle region write split across multiple log buffers - this
+ * state is kept external to this function so that this code can
+ * be written in an obvious, self documenting manner.
+ */
+static int
+xlog_write_setup_copy(
+	struct xlog_ticket	*ticket,
+	struct xlog_op_header	*ophdr,
+	int			space_available,
+	int			space_required,
+	int			*copy_off,
+	int			*copy_len,
+	int			*last_was_partial_copy,
+	int			*bytes_consumed)
+{
+	int			still_to_copy;
+
+	still_to_copy = space_required - *bytes_consumed;
+	*copy_off = *bytes_consumed;
+
+	if (still_to_copy <= space_available) {
+		/* write of region completes here */
+		*copy_len = still_to_copy;
+		ophdr->oh_len = cpu_to_be32(*copy_len);
+		if (*last_was_partial_copy)
+			ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
+		*last_was_partial_copy = 0;
+		*bytes_consumed = 0;
+		return 0;
+	}
+
+	/* partial write of region, needs extra log op header reservation */
+	*copy_len = space_available;
+	ophdr->oh_len = cpu_to_be32(*copy_len);
+	ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+	if (*last_was_partial_copy)
+		ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
+	*bytes_consumed += *copy_len;
+	(*last_was_partial_copy)++;
+
+	/* account for new log op header */
+	ticket->t_curr_res -= sizeof(struct xlog_op_header);
+	ticket->t_res_num_ophdrs++;
+
+	return sizeof(struct xlog_op_header);
+}
+
+static int
+xlog_write_copy_finish(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	uint			flags,
+	int			*record_cnt,
+	int			*data_cnt,
+	int			*partial_copy,
+	int			*partial_copy_len,
+	int			log_offset,
+	struct xlog_in_core	**commit_iclog)
+{
+	if (*partial_copy) {
+		/*
+		 * This iclog has already been marked WANT_SYNC by
+		 * xlog_state_get_iclog_space.
+		 */
+		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+		*record_cnt = 0;
+		*data_cnt = 0;
+		return xlog_state_release_iclog(log, iclog);
+	}
+
+	*partial_copy = 0;
+	*partial_copy_len = 0;
+
+	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
+		/* no more space in this iclog - push it. */
+		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+		*record_cnt = 0;
+		*data_cnt = 0;
+
+		spin_lock(&log->l_icloglock);
+		xlog_state_want_sync(log, iclog);
+		spin_unlock(&log->l_icloglock);
+
+		if (!commit_iclog)
+			return xlog_state_release_iclog(log, iclog);
+		ASSERT(flags & XLOG_COMMIT_TRANS);
+		*commit_iclog = iclog;
+	}
+
+	return 0;
+}
+
+/*
+ * Write some region out to in-core log
+ *
+ * This will be called when writing externally provided regions or when
+ * writing out a commit record for a given transaction.
+ *
+ * General algorithm:
+ *	1. Find total length of this write.  This may include adding to the
+ *		lengths passed in.
+ *	2. Check whether we violate the tickets reservation.
+ *	3. While writing to this iclog
+ *	    A. Reserve as much space in this iclog as can get
+ *	    B. If this is first write, save away start lsn
+ *	    C. While writing this region:
+ *		1. If first write of transaction, write start record
+ *		2. Write log operation header (header per region)
+ *		3. Find out if we can fit entire region into this iclog
+ *		4. Potentially, verify destination memcpy ptr
+ *		5. Memcpy (partial) region
+ *		6. If partial copy, release iclog; otherwise, continue
+ *			copying more regions into current iclog
+ *	4. Mark want sync bit (in simulation mode)
+ *	5. Release iclog for potential flush to on-disk log.
+ *
+ * ERRORS:
+ * 1.	Panic if reservation is overrun.  This should never happen since
+ *	reservation amounts are generated internal to the filesystem.
+ * NOTES:
+ * 1. Tickets are single threaded data structures.
+ * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
+ *	syncing routine.  When a single log_write region needs to span
+ *	multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
+ *	on all log operation writes which don't contain the end of the
+ *	region.  The XLOG_END_TRANS bit is used for the in-core log
+ *	operation which contains the end of the continued log_write region.
+ * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
+ *	we don't really know exactly how much space will be used.  As a result,
+ *	we don't update ic_offset until the end when we know exactly how many
+ *	bytes have been written out.
+ */
+int
+xlog_write(
+	struct xlog		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn,
+	struct xlog_in_core	**commit_iclog,
+	uint			flags)
+{
+	struct xlog_in_core	*iclog = NULL;
+	struct xfs_log_iovec	*vecp;
+	struct xfs_log_vec	*lv;
+	int			len;
+	int			index;
+	int			partial_copy = 0;
+	int			partial_copy_len = 0;
+	int			contwr = 0;
+	int			record_cnt = 0;
+	int			data_cnt = 0;
+	int			error;
+
+	*start_lsn = 0;
+
+	len = xlog_write_calc_vec_length(ticket, log_vector);
+
+	/*
+	 * Region headers and bytes are already accounted for.
+	 * We only need to take into account start records and
+	 * split regions in this function.
+	 */
+	if (ticket->t_flags & XLOG_TIC_INITED)
+		ticket->t_curr_res -= sizeof(xlog_op_header_t);
+
+	/*
+	 * Commit record headers need to be accounted for. These
+	 * come in as separate writes so are easy to detect.
+	 */
+	if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+		ticket->t_curr_res -= sizeof(xlog_op_header_t);
+
+	if (ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, ticket);
+
+	index = 0;
+	lv = log_vector;
+	vecp = lv->lv_iovecp;
+	while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+		void		*ptr;
+		int		log_offset;
+
+		error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+						   &contwr, &log_offset);
+		if (error)
+			return error;
+
+		ASSERT(log_offset <= iclog->ic_size - 1);
+		ptr = iclog->ic_datap + log_offset;
+
+		/* start_lsn is the first lsn written to. That's all we need. */
+		if (!*start_lsn)
+			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+
+		/*
+		 * This loop writes out as many regions as can fit in the amount
+		 * of space which was allocated by xlog_state_get_iclog_space().
+		 */
+		while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+			struct xfs_log_iovec	*reg;
+			struct xlog_op_header	*ophdr;
+			int			start_rec_copy;
+			int			copy_len;
+			int			copy_off;
+			bool			ordered = false;
+
+			/* ordered log vectors have no regions to write */
+			if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+				ASSERT(lv->lv_niovecs == 0);
+				ordered = true;
+				goto next_lv;
+			}
+
+			reg = &vecp[index];
+			ASSERT(reg->i_len % sizeof(__int32_t) == 0);
+			ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
+
+			start_rec_copy = xlog_write_start_rec(ptr, ticket);
+			if (start_rec_copy) {
+				record_cnt++;
+				xlog_write_adv_cnt(&ptr, &len, &log_offset,
+						   start_rec_copy);
+			}
+
+			ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
+			if (!ophdr)
+				return -EIO;
+
+			xlog_write_adv_cnt(&ptr, &len, &log_offset,
+					   sizeof(struct xlog_op_header));
+
+			len += xlog_write_setup_copy(ticket, ophdr,
+						     iclog->ic_size-log_offset,
+						     reg->i_len,
+						     &copy_off, &copy_len,
+						     &partial_copy,
+						     &partial_copy_len);
+			xlog_verify_dest_ptr(log, ptr);
+
+			/* copy region */
+			ASSERT(copy_len >= 0);
+			memcpy(ptr, reg->i_addr + copy_off, copy_len);
+			xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
+
+			copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+			record_cnt++;
+			data_cnt += contwr ? copy_len : 0;
+
+			error = xlog_write_copy_finish(log, iclog, flags,
+						       &record_cnt, &data_cnt,
+						       &partial_copy,
+						       &partial_copy_len,
+						       log_offset,
+						       commit_iclog);
+			if (error)
+				return error;
+
+			/*
+			 * if we had a partial copy, we need to get more iclog
+			 * space but we don't want to increment the region
+			 * index because there is still more is this region to
+			 * write.
+			 *
+			 * If we completed writing this region, and we flushed
+			 * the iclog (indicated by resetting of the record
+			 * count), then we also need to get more log space. If
+			 * this was the last record, though, we are done and
+			 * can just return.
+			 */
+			if (partial_copy)
+				break;
+
+			if (++index == lv->lv_niovecs) {
+next_lv:
+				lv = lv->lv_next;
+				index = 0;
+				if (lv)
+					vecp = lv->lv_iovecp;
+			}
+			if (record_cnt == 0 && ordered == false) {
+				if (!lv)
+					return 0;
+				break;
+			}
+		}
+	}
+
+	ASSERT(len == 0);
+
+	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+	if (!commit_iclog)
+		return xlog_state_release_iclog(log, iclog);
+
+	ASSERT(flags & XLOG_COMMIT_TRANS);
+	*commit_iclog = iclog;
+	return 0;
+}
+
+
+/*****************************************************************************
+ *
+ *		State Machine functions
+ *
+ *****************************************************************************
+ */
+
+/* Clean iclogs starting from the head.  This ordering must be
+ * maintained, so an iclog doesn't become ACTIVE beyond one that
+ * is SYNCING.  This is also required to maintain the notion that we use
+ * a ordered wait queue to hold off would be writers to the log when every
+ * iclog is trying to sync to disk.
+ *
+ * State Change: DIRTY -> ACTIVE
+ */
+STATIC void
+xlog_state_clean_log(
+	struct xlog *log)
+{
+	xlog_in_core_t	*iclog;
+	int changed = 0;
+
+	iclog = log->l_iclog;
+	do {
+		if (iclog->ic_state == XLOG_STATE_DIRTY) {
+			iclog->ic_state	= XLOG_STATE_ACTIVE;
+			iclog->ic_offset       = 0;
+			ASSERT(iclog->ic_callback == NULL);
+			/*
+			 * If the number of ops in this iclog indicate it just
+			 * contains the dummy transaction, we can
+			 * change state into IDLE (the second time around).
+			 * Otherwise we should change the state into
+			 * NEED a dummy.
+			 * We don't need to cover the dummy.
+			 */
+			if (!changed &&
+			   (be32_to_cpu(iclog->ic_header.h_num_logops) ==
+			   		XLOG_COVER_OPS)) {
+				changed = 1;
+			} else {
+				/*
+				 * We have two dirty iclogs so start over
+				 * This could also be num of ops indicates
+				 * this is not the dummy going out.
+				 */
+				changed = 2;
+			}
+			iclog->ic_header.h_num_logops = 0;
+			memset(iclog->ic_header.h_cycle_data, 0,
+			      sizeof(iclog->ic_header.h_cycle_data));
+			iclog->ic_header.h_lsn = 0;
+		} else if (iclog->ic_state == XLOG_STATE_ACTIVE)
+			/* do nothing */;
+		else
+			break;	/* stop cleaning */
+		iclog = iclog->ic_next;
+	} while (iclog != log->l_iclog);
+
+	/* log is locked when we are called */
+	/*
+	 * Change state for the dummy log recording.
+	 * We usually go to NEED. But we go to NEED2 if the changed indicates
+	 * we are done writing the dummy record.
+	 * If we are done with the second dummy recored (DONE2), then
+	 * we go to IDLE.
+	 */
+	if (changed) {
+		switch (log->l_covered_state) {
+		case XLOG_STATE_COVER_IDLE:
+		case XLOG_STATE_COVER_NEED:
+		case XLOG_STATE_COVER_NEED2:
+			log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		case XLOG_STATE_COVER_DONE:
+			if (changed == 1)
+				log->l_covered_state = XLOG_STATE_COVER_NEED2;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		case XLOG_STATE_COVER_DONE2:
+			if (changed == 1)
+				log->l_covered_state = XLOG_STATE_COVER_IDLE;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		default:
+			ASSERT(0);
+		}
+	}
+}	/* xlog_state_clean_log */
+
+STATIC xfs_lsn_t
+xlog_get_lowest_lsn(
+	struct xlog	*log)
+{
+	xlog_in_core_t  *lsn_log;
+	xfs_lsn_t	lowest_lsn, lsn;
+
+	lsn_log = log->l_iclog;
+	lowest_lsn = 0;
+	do {
+	    if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
+		lsn = be64_to_cpu(lsn_log->ic_header.h_lsn);
+		if ((lsn && !lowest_lsn) ||
+		    (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
+			lowest_lsn = lsn;
+		}
+	    }
+	    lsn_log = lsn_log->ic_next;
+	} while (lsn_log != log->l_iclog);
+	return lowest_lsn;
+}
+
+
+STATIC void
+xlog_state_do_callback(
+	struct xlog		*log,
+	int			aborted,
+	struct xlog_in_core	*ciclog)
+{
+	xlog_in_core_t	   *iclog;
+	xlog_in_core_t	   *first_iclog;	/* used to know when we've
+						 * processed all iclogs once */
+	xfs_log_callback_t *cb, *cb_next;
+	int		   flushcnt = 0;
+	xfs_lsn_t	   lowest_lsn;
+	int		   ioerrors;	/* counter: iclogs with errors */
+	int		   loopdidcallbacks; /* flag: inner loop did callbacks*/
+	int		   funcdidcallbacks; /* flag: function did callbacks */
+	int		   repeats;	/* for issuing console warnings if
+					 * looping too many times */
+	int		   wake = 0;
+
+	spin_lock(&log->l_icloglock);
+	first_iclog = iclog = log->l_iclog;
+	ioerrors = 0;
+	funcdidcallbacks = 0;
+	repeats = 0;
+
+	do {
+		/*
+		 * Scan all iclogs starting with the one pointed to by the
+		 * log.  Reset this starting point each time the log is
+		 * unlocked (during callbacks).
+		 *
+		 * Keep looping through iclogs until one full pass is made
+		 * without running any callbacks.
+		 */
+		first_iclog = log->l_iclog;
+		iclog = log->l_iclog;
+		loopdidcallbacks = 0;
+		repeats++;
+
+		do {
+
+			/* skip all iclogs in the ACTIVE & DIRTY states */
+			if (iclog->ic_state &
+			    (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+				iclog = iclog->ic_next;
+				continue;
+			}
+
+			/*
+			 * Between marking a filesystem SHUTDOWN and stopping
+			 * the log, we do flush all iclogs to disk (if there
+			 * wasn't a log I/O error). So, we do want things to
+			 * go smoothly in case of just a SHUTDOWN  w/o a
+			 * LOG_IO_ERROR.
+			 */
+			if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
+				/*
+				 * Can only perform callbacks in order.  Since
+				 * this iclog is not in the DONE_SYNC/
+				 * DO_CALLBACK state, we skip the rest and
+				 * just try to clean up.  If we set our iclog
+				 * to DO_CALLBACK, we will not process it when
+				 * we retry since a previous iclog is in the
+				 * CALLBACK and the state cannot change since
+				 * we are holding the l_icloglock.
+				 */
+				if (!(iclog->ic_state &
+					(XLOG_STATE_DONE_SYNC |
+						 XLOG_STATE_DO_CALLBACK))) {
+					if (ciclog && (ciclog->ic_state ==
+							XLOG_STATE_DONE_SYNC)) {
+						ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
+					}
+					break;
+				}
+				/*
+				 * We now have an iclog that is in either the
+				 * DO_CALLBACK or DONE_SYNC states. The other
+				 * states (WANT_SYNC, SYNCING, or CALLBACK were
+				 * caught by the above if and are going to
+				 * clean (i.e. we aren't doing their callbacks)
+				 * see the above if.
+				 */
+
+				/*
+				 * We will do one more check here to see if we
+				 * have chased our tail around.
+				 */
+
+				lowest_lsn = xlog_get_lowest_lsn(log);
+				if (lowest_lsn &&
+				    XFS_LSN_CMP(lowest_lsn,
+						be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
+					iclog = iclog->ic_next;
+					continue; /* Leave this iclog for
+						   * another thread */
+				}
+
+				iclog->ic_state = XLOG_STATE_CALLBACK;
+
+
+				/*
+				 * Completion of a iclog IO does not imply that
+				 * a transaction has completed, as transactions
+				 * can be large enough to span many iclogs. We
+				 * cannot change the tail of the log half way
+				 * through a transaction as this may be the only
+				 * transaction in the log and moving th etail to
+				 * point to the middle of it will prevent
+				 * recovery from finding the start of the
+				 * transaction. Hence we should only update the
+				 * last_sync_lsn if this iclog contains
+				 * transaction completion callbacks on it.
+				 *
+				 * We have to do this before we drop the
+				 * icloglock to ensure we are the only one that
+				 * can update it.
+				 */
+				ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+					be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
+				if (iclog->ic_callback)
+					atomic64_set(&log->l_last_sync_lsn,
+						be64_to_cpu(iclog->ic_header.h_lsn));
+
+			} else
+				ioerrors++;
+
+			spin_unlock(&log->l_icloglock);
+
+			/*
+			 * Keep processing entries in the callback list until
+			 * we come around and it is empty.  We need to
+			 * atomically see that the list is empty and change the
+			 * state to DIRTY so that we don't miss any more
+			 * callbacks being added.
+			 */
+			spin_lock(&iclog->ic_callback_lock);
+			cb = iclog->ic_callback;
+			while (cb) {
+				iclog->ic_callback_tail = &(iclog->ic_callback);
+				iclog->ic_callback = NULL;
+				spin_unlock(&iclog->ic_callback_lock);
+
+				/* perform callbacks in the order given */
+				for (; cb; cb = cb_next) {
+					cb_next = cb->cb_next;
+					cb->cb_func(cb->cb_arg, aborted);
+				}
+				spin_lock(&iclog->ic_callback_lock);
+				cb = iclog->ic_callback;
+			}
+
+			loopdidcallbacks++;
+			funcdidcallbacks++;
+
+			spin_lock(&log->l_icloglock);
+			ASSERT(iclog->ic_callback == NULL);
+			spin_unlock(&iclog->ic_callback_lock);
+			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
+				iclog->ic_state = XLOG_STATE_DIRTY;
+
+			/*
+			 * Transition from DIRTY to ACTIVE if applicable.
+			 * NOP if STATE_IOERROR.
+			 */
+			xlog_state_clean_log(log);
+
+			/* wake up threads waiting in xfs_log_force() */
+			wake_up_all(&iclog->ic_force_wait);
+
+			iclog = iclog->ic_next;
+		} while (first_iclog != iclog);
+
+		if (repeats > 5000) {
+			flushcnt += repeats;
+			repeats = 0;
+			xfs_warn(log->l_mp,
+				"%s: possible infinite loop (%d iterations)",
+				__func__, flushcnt);
+		}
+	} while (!ioerrors && loopdidcallbacks);
+
+	/*
+	 * make one last gasp attempt to see if iclogs are being left in
+	 * limbo..
+	 */
+#ifdef DEBUG
+	if (funcdidcallbacks) {
+		first_iclog = iclog = log->l_iclog;
+		do {
+			ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
+			/*
+			 * Terminate the loop if iclogs are found in states
+			 * which will cause other threads to clean up iclogs.
+			 *
+			 * SYNCING - i/o completion will go through logs
+			 * DONE_SYNC - interrupt thread should be waiting for
+			 *              l_icloglock
+			 * IOERROR - give up hope all ye who enter here
+			 */
+			if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+			    iclog->ic_state == XLOG_STATE_SYNCING ||
+			    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
+			    iclog->ic_state == XLOG_STATE_IOERROR )
+				break;
+			iclog = iclog->ic_next;
+		} while (first_iclog != iclog);
+	}
+#endif
+
+	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+		wake = 1;
+	spin_unlock(&log->l_icloglock);
+
+	if (wake)
+		wake_up_all(&log->l_flush_wait);
+}
+
+
+/*
+ * Finish transitioning this iclog to the dirty state.
+ *
+ * Make sure that we completely execute this routine only when this is
+ * the last call to the iclog.  There is a good chance that iclog flushes,
+ * when we reach the end of the physical log, get turned into 2 separate
+ * calls to bwrite.  Hence, one iclog flush could generate two calls to this
+ * routine.  By using the reference count bwritecnt, we guarantee that only
+ * the second completion goes through.
+ *
+ * Callbacks could take time, so they are done outside the scope of the
+ * global state machine log lock.
+ */
+STATIC void
+xlog_state_done_syncing(
+	xlog_in_core_t	*iclog,
+	int		aborted)
+{
+	struct xlog	   *log = iclog->ic_log;
+
+	spin_lock(&log->l_icloglock);
+
+	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
+	       iclog->ic_state == XLOG_STATE_IOERROR);
+	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+	ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
+
+
+	/*
+	 * If we got an error, either on the first buffer, or in the case of
+	 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
+	 * and none should ever be attempted to be written to disk
+	 * again.
+	 */
+	if (iclog->ic_state != XLOG_STATE_IOERROR) {
+		if (--iclog->ic_bwritecnt == 1) {
+			spin_unlock(&log->l_icloglock);
+			return;
+		}
+		iclog->ic_state = XLOG_STATE_DONE_SYNC;
+	}
+
+	/*
+	 * Someone could be sleeping prior to writing out the next
+	 * iclog buffer, we wake them all, one will get to do the
+	 * I/O, the others get to wait for the result.
+	 */
+	wake_up_all(&iclog->ic_write_wait);
+	spin_unlock(&log->l_icloglock);
+	xlog_state_do_callback(log, aborted, iclog);	/* also cleans log */
+}	/* xlog_state_done_syncing */
+
+
+/*
+ * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
+ * sleep.  We wait on the flush queue on the head iclog as that should be
+ * the first iclog to complete flushing. Hence if all iclogs are syncing,
+ * we will wait here and all new writes will sleep until a sync completes.
+ *
+ * The in-core logs are used in a circular fashion. They are not used
+ * out-of-order even when an iclog past the head is free.
+ *
+ * return:
+ *	* log_offset where xlog_write() can start writing into the in-core
+ *		log's data space.
+ *	* in-core log pointer to which xlog_write() should write.
+ *	* boolean indicating this is a continued write to an in-core log.
+ *		If this is the last write, then the in-core log's offset field
+ *		needs to be incremented, depending on the amount of data which
+ *		is copied.
+ */
+STATIC int
+xlog_state_get_iclog_space(
+	struct xlog		*log,
+	int			len,
+	struct xlog_in_core	**iclogp,
+	struct xlog_ticket	*ticket,
+	int			*continued_write,
+	int			*logoffsetp)
+{
+	int		  log_offset;
+	xlog_rec_header_t *head;
+	xlog_in_core_t	  *iclog;
+	int		  error;
+
+restart:
+	spin_lock(&log->l_icloglock);
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		spin_unlock(&log->l_icloglock);
+		return -EIO;
+	}
+
+	iclog = log->l_iclog;
+	if (iclog->ic_state != XLOG_STATE_ACTIVE) {
+		XFS_STATS_INC(xs_log_noiclogs);
+
+		/* Wait for log writes to have flushed */
+		xlog_wait(&log->l_flush_wait, &log->l_icloglock);
+		goto restart;
+	}
+
+	head = &iclog->ic_header;
+
+	atomic_inc(&iclog->ic_refcnt);	/* prevents sync */
+	log_offset = iclog->ic_offset;
+
+	/* On the 1st write to an iclog, figure out lsn.  This works
+	 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
+	 * committing to.  If the offset is set, that's how many blocks
+	 * must be written.
+	 */
+	if (log_offset == 0) {
+		ticket->t_curr_res -= log->l_iclog_hsize;
+		xlog_tic_add_region(ticket,
+				    log->l_iclog_hsize,
+				    XLOG_REG_TYPE_LRHEADER);
+		head->h_cycle = cpu_to_be32(log->l_curr_cycle);
+		head->h_lsn = cpu_to_be64(
+			xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
+		ASSERT(log->l_curr_block >= 0);
+	}
+
+	/* If there is enough room to write everything, then do it.  Otherwise,
+	 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
+	 * bit is on, so this will get flushed out.  Don't update ic_offset
+	 * until you know exactly how many bytes get copied.  Therefore, wait
+	 * until later to update ic_offset.
+	 *
+	 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
+	 * can fit into remaining data section.
+	 */
+	if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+
+		/*
+		 * If I'm the only one writing to this iclog, sync it to disk.
+		 * We need to do an atomic compare and decrement here to avoid
+		 * racing with concurrent atomic_dec_and_lock() calls in
+		 * xlog_state_release_iclog() when there is more than one
+		 * reference to the iclog.
+		 */
+		if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
+			/* we are the only one */
+			spin_unlock(&log->l_icloglock);
+			error = xlog_state_release_iclog(log, iclog);
+			if (error)
+				return error;
+		} else {
+			spin_unlock(&log->l_icloglock);
+		}
+		goto restart;
+	}
+
+	/* Do we have enough room to write the full amount in the remainder
+	 * of this iclog?  Or must we continue a write on the next iclog and
+	 * mark this iclog as completely taken?  In the case where we switch
+	 * iclogs (to mark it taken), this particular iclog will release/sync
+	 * to disk in xlog_write().
+	 */
+	if (len <= iclog->ic_size - iclog->ic_offset) {
+		*continued_write = 0;
+		iclog->ic_offset += len;
+	} else {
+		*continued_write = 1;
+		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+	}
+	*iclogp = iclog;
+
+	ASSERT(iclog->ic_offset <= iclog->ic_size);
+	spin_unlock(&log->l_icloglock);
+
+	*logoffsetp = log_offset;
+	return 0;
+}	/* xlog_state_get_iclog_space */
+
+/* The first cnt-1 times through here we don't need to
+ * move the grant write head because the permanent
+ * reservation has reserved cnt times the unit amount.
+ * Release part of current permanent unit reservation and
+ * reset current reservation to be one units worth.  Also
+ * move grant reservation head forward.
+ */
+STATIC void
+xlog_regrant_reserve_log_space(
+	struct xlog		*log,
+	struct xlog_ticket	*ticket)
+{
+	trace_xfs_log_regrant_reserve_enter(log, ticket);
+
+	if (ticket->t_cnt > 0)
+		ticket->t_cnt--;
+
+	xlog_grant_sub_space(log, &log->l_reserve_head.grant,
+					ticket->t_curr_res);
+	xlog_grant_sub_space(log, &log->l_write_head.grant,
+					ticket->t_curr_res);
+	ticket->t_curr_res = ticket->t_unit_res;
+	xlog_tic_reset_res(ticket);
+
+	trace_xfs_log_regrant_reserve_sub(log, ticket);
+
+	/* just return if we still have some of the pre-reserved space */
+	if (ticket->t_cnt > 0)
+		return;
+
+	xlog_grant_add_space(log, &log->l_reserve_head.grant,
+					ticket->t_unit_res);
+
+	trace_xfs_log_regrant_reserve_exit(log, ticket);
+
+	ticket->t_curr_res = ticket->t_unit_res;
+	xlog_tic_reset_res(ticket);
+}	/* xlog_regrant_reserve_log_space */
+
+
+/*
+ * Give back the space left from a reservation.
+ *
+ * All the information we need to make a correct determination of space left
+ * is present.  For non-permanent reservations, things are quite easy.  The
+ * count should have been decremented to zero.  We only need to deal with the
+ * space remaining in the current reservation part of the ticket.  If the
+ * ticket contains a permanent reservation, there may be left over space which
+ * needs to be released.  A count of N means that N-1 refills of the current
+ * reservation can be done before we need to ask for more space.  The first
+ * one goes to fill up the first current reservation.  Once we run out of
+ * space, the count will stay at zero and the only space remaining will be
+ * in the current reservation field.
+ */
+STATIC void
+xlog_ungrant_log_space(
+	struct xlog		*log,
+	struct xlog_ticket	*ticket)
+{
+	int	bytes;
+
+	if (ticket->t_cnt > 0)
+		ticket->t_cnt--;
+
+	trace_xfs_log_ungrant_enter(log, ticket);
+	trace_xfs_log_ungrant_sub(log, ticket);
+
+	/*
+	 * If this is a permanent reservation ticket, we may be able to free
+	 * up more space based on the remaining count.
+	 */
+	bytes = ticket->t_curr_res;
+	if (ticket->t_cnt > 0) {
+		ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
+		bytes += ticket->t_unit_res*ticket->t_cnt;
+	}
+
+	xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
+	xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
+
+	trace_xfs_log_ungrant_exit(log, ticket);
+
+	xfs_log_space_wake(log->l_mp);
+}
+
+/*
+ * Flush iclog to disk if this is the last reference to the given iclog and
+ * the WANT_SYNC bit is set.
+ *
+ * When this function is entered, the iclog is not necessarily in the
+ * WANT_SYNC state.  It may be sitting around waiting to get filled.
+ *
+ *
+ */
+STATIC int
+xlog_state_release_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog)
+{
+	int		sync = 0;	/* do we sync? */
+
+	if (iclog->ic_state & XLOG_STATE_IOERROR)
+		return -EIO;
+
+	ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
+	if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
+		return 0;
+
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		spin_unlock(&log->l_icloglock);
+		return -EIO;
+	}
+	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
+	       iclog->ic_state == XLOG_STATE_WANT_SYNC);
+
+	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+		/* update tail before writing to iclog */
+		xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+		sync++;
+		iclog->ic_state = XLOG_STATE_SYNCING;
+		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+		xlog_verify_tail_lsn(log, iclog, tail_lsn);
+		/* cycle incremented when incrementing curr_block */
+	}
+	spin_unlock(&log->l_icloglock);
+
+	/*
+	 * We let the log lock go, so it's possible that we hit a log I/O
+	 * error or some other SHUTDOWN condition that marks the iclog
+	 * as XLOG_STATE_IOERROR before the bwrite. However, we know that
+	 * this iclog has consistent data, so we ignore IOERROR
+	 * flags after this point.
+	 */
+	if (sync)
+		return xlog_sync(log, iclog);
+	return 0;
+}	/* xlog_state_release_iclog */
+
+
+/*
+ * This routine will mark the current iclog in the ring as WANT_SYNC
+ * and move the current iclog pointer to the next iclog in the ring.
+ * When this routine is called from xlog_state_get_iclog_space(), the
+ * exact size of the iclog has not yet been determined.  All we know is
+ * that every data block.  We have run out of space in this log record.
+ */
+STATIC void
+xlog_state_switch_iclogs(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	int			eventual_size)
+{
+	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+	if (!eventual_size)
+		eventual_size = iclog->ic_offset;
+	iclog->ic_state = XLOG_STATE_WANT_SYNC;
+	iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
+	log->l_prev_block = log->l_curr_block;
+	log->l_prev_cycle = log->l_curr_cycle;
+
+	/* roll log?: ic_offset changed later */
+	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
+
+	/* Round up to next log-sunit */
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
+	    log->l_mp->m_sb.sb_logsunit > 1) {
+		__uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
+		log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
+	}
+
+	if (log->l_curr_block >= log->l_logBBsize) {
+		log->l_curr_cycle++;
+		if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
+			log->l_curr_cycle++;
+		log->l_curr_block -= log->l_logBBsize;
+		ASSERT(log->l_curr_block >= 0);
+	}
+	ASSERT(iclog == log->l_iclog);
+	log->l_iclog = iclog->ic_next;
+}	/* xlog_state_switch_iclogs */
+
+/*
+ * Write out all data in the in-core log as of this exact moment in time.
+ *
+ * Data may be written to the in-core log during this call.  However,
+ * we don't guarantee this data will be written out.  A change from past
+ * implementation means this routine will *not* write out zero length LRs.
+ *
+ * Basically, we try and perform an intelligent scan of the in-core logs.
+ * If we determine there is no flushable data, we just return.  There is no
+ * flushable data if:
+ *
+ *	1. the current iclog is active and has no data; the previous iclog
+ *		is in the active or dirty state.
+ *	2. the current iclog is drity, and the previous iclog is in the
+ *		active or dirty state.
+ *
+ * We may sleep if:
+ *
+ *	1. the current iclog is not in the active nor dirty state.
+ *	2. the current iclog dirty, and the previous iclog is not in the
+ *		active nor dirty state.
+ *	3. the current iclog is active, and there is another thread writing
+ *		to this particular iclog.
+ *	4. a) the current iclog is active and has no other writers
+ *	   b) when we return from flushing out this iclog, it is still
+ *		not in the active nor dirty state.
+ */
+int
+_xfs_log_force(
+	struct xfs_mount	*mp,
+	uint			flags,
+	int			*log_flushed)
+{
+	struct xlog		*log = mp->m_log;
+	struct xlog_in_core	*iclog;
+	xfs_lsn_t		lsn;
+
+	XFS_STATS_INC(xs_log_force);
+
+	xlog_cil_force(log);
+
+	spin_lock(&log->l_icloglock);
+
+	iclog = log->l_iclog;
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		spin_unlock(&log->l_icloglock);
+		return -EIO;
+	}
+
+	/* If the head iclog is not active nor dirty, we just attach
+	 * ourselves to the head and go to sleep.
+	 */
+	if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+	    iclog->ic_state == XLOG_STATE_DIRTY) {
+		/*
+		 * If the head is dirty or (active and empty), then
+		 * we need to look at the previous iclog.  If the previous
+		 * iclog is active or dirty we are done.  There is nothing
+		 * to sync out.  Otherwise, we attach ourselves to the
+		 * previous iclog and go to sleep.
+		 */
+		if (iclog->ic_state == XLOG_STATE_DIRTY ||
+		    (atomic_read(&iclog->ic_refcnt) == 0
+		     && iclog->ic_offset == 0)) {
+			iclog = iclog->ic_prev;
+			if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+			    iclog->ic_state == XLOG_STATE_DIRTY)
+				goto no_sleep;
+			else
+				goto maybe_sleep;
+		} else {
+			if (atomic_read(&iclog->ic_refcnt) == 0) {
+				/* We are the only one with access to this
+				 * iclog.  Flush it out now.  There should
+				 * be a roundoff of zero to show that someone
+				 * has already taken care of the roundoff from
+				 * the previous sync.
+				 */
+				atomic_inc(&iclog->ic_refcnt);
+				lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+				xlog_state_switch_iclogs(log, iclog, 0);
+				spin_unlock(&log->l_icloglock);
+
+				if (xlog_state_release_iclog(log, iclog))
+					return -EIO;
+
+				if (log_flushed)
+					*log_flushed = 1;
+				spin_lock(&log->l_icloglock);
+				if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
+				    iclog->ic_state != XLOG_STATE_DIRTY)
+					goto maybe_sleep;
+				else
+					goto no_sleep;
+			} else {
+				/* Someone else is writing to this iclog.
+				 * Use its call to flush out the data.  However,
+				 * the other thread may not force out this LR,
+				 * so we mark it WANT_SYNC.
+				 */
+				xlog_state_switch_iclogs(log, iclog, 0);
+				goto maybe_sleep;
+			}
+		}
+	}
+
+	/* By the time we come around again, the iclog could've been filled
+	 * which would give it another lsn.  If we have a new lsn, just
+	 * return because the relevant data has been flushed.
+	 */
+maybe_sleep:
+	if (flags & XFS_LOG_SYNC) {
+		/*
+		 * We must check if we're shutting down here, before
+		 * we wait, while we're holding the l_icloglock.
+		 * Then we check again after waking up, in case our
+		 * sleep was disturbed by a bad news.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR) {
+			spin_unlock(&log->l_icloglock);
+			return -EIO;
+		}
+		XFS_STATS_INC(xs_log_force_sleep);
+		xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+		/*
+		 * No need to grab the log lock here since we're
+		 * only deciding whether or not to return EIO
+		 * and the memory read should be atomic.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR)
+			return -EIO;
+		if (log_flushed)
+			*log_flushed = 1;
+	} else {
+
+no_sleep:
+		spin_unlock(&log->l_icloglock);
+	}
+	return 0;
+}
+
+/*
+ * Wrapper for _xfs_log_force(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int	error;
+
+	trace_xfs_log_force(mp, 0);
+	error = _xfs_log_force(mp, flags, NULL);
+	if (error)
+		xfs_warn(mp, "%s: error %d returned.", __func__, error);
+}
+
+/*
+ * Force the in-core log to disk for a specific LSN.
+ *
+ * Find in-core log with lsn.
+ *	If it is in the DIRTY state, just return.
+ *	If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ *		state and go to sleep or return.
+ *	If it is in any other state, go to sleep or return.
+ *
+ * Synchronous forces are implemented with a signal variable. All callers
+ * to force a given lsn to disk will wait on a the sv attached to the
+ * specific in-core log.  When given in-core log finally completes its
+ * write to disk, that thread will wake up all threads waiting on the
+ * sv.
+ */
+int
+_xfs_log_force_lsn(
+	struct xfs_mount	*mp,
+	xfs_lsn_t		lsn,
+	uint			flags,
+	int			*log_flushed)
+{
+	struct xlog		*log = mp->m_log;
+	struct xlog_in_core	*iclog;
+	int			already_slept = 0;
+
+	ASSERT(lsn != 0);
+
+	XFS_STATS_INC(xs_log_force);
+
+	lsn = xlog_cil_force_lsn(log, lsn);
+	if (lsn == NULLCOMMITLSN)
+		return 0;
+
+try_again:
+	spin_lock(&log->l_icloglock);
+	iclog = log->l_iclog;
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		spin_unlock(&log->l_icloglock);
+		return -EIO;
+	}
+
+	do {
+		if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+			iclog = iclog->ic_next;
+			continue;
+		}
+
+		if (iclog->ic_state == XLOG_STATE_DIRTY) {
+			spin_unlock(&log->l_icloglock);
+			return 0;
+		}
+
+		if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+			/*
+			 * We sleep here if we haven't already slept (e.g.
+			 * this is the first time we've looked at the correct
+			 * iclog buf) and the buffer before us is going to
+			 * be sync'ed. The reason for this is that if we
+			 * are doing sync transactions here, by waiting for
+			 * the previous I/O to complete, we can allow a few
+			 * more transactions into this iclog before we close
+			 * it down.
+			 *
+			 * Otherwise, we mark the buffer WANT_SYNC, and bump
+			 * up the refcnt so we can release the log (which
+			 * drops the ref count).  The state switch keeps new
+			 * transaction commits from using this buffer.  When
+			 * the current commits finish writing into the buffer,
+			 * the refcount will drop to zero and the buffer will
+			 * go out then.
+			 */
+			if (!already_slept &&
+			    (iclog->ic_prev->ic_state &
+			     (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
+				ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
+
+				XFS_STATS_INC(xs_log_force_sleep);
+
+				xlog_wait(&iclog->ic_prev->ic_write_wait,
+							&log->l_icloglock);
+				if (log_flushed)
+					*log_flushed = 1;
+				already_slept = 1;
+				goto try_again;
+			}
+			atomic_inc(&iclog->ic_refcnt);
+			xlog_state_switch_iclogs(log, iclog, 0);
+			spin_unlock(&log->l_icloglock);
+			if (xlog_state_release_iclog(log, iclog))
+				return -EIO;
+			if (log_flushed)
+				*log_flushed = 1;
+			spin_lock(&log->l_icloglock);
+		}
+
+		if ((flags & XFS_LOG_SYNC) && /* sleep */
+		    !(iclog->ic_state &
+		      (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
+			/*
+			 * Don't wait on completion if we know that we've
+			 * gotten a log write error.
+			 */
+			if (iclog->ic_state & XLOG_STATE_IOERROR) {
+				spin_unlock(&log->l_icloglock);
+				return -EIO;
+			}
+			XFS_STATS_INC(xs_log_force_sleep);
+			xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+			/*
+			 * No need to grab the log lock here since we're
+			 * only deciding whether or not to return EIO
+			 * and the memory read should be atomic.
+			 */
+			if (iclog->ic_state & XLOG_STATE_IOERROR)
+				return -EIO;
+
+			if (log_flushed)
+				*log_flushed = 1;
+		} else {		/* just return */
+			spin_unlock(&log->l_icloglock);
+		}
+
+		return 0;
+	} while (iclog != log->l_iclog);
+
+	spin_unlock(&log->l_icloglock);
+	return 0;
+}
+
+/*
+ * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force_lsn(
+	xfs_mount_t	*mp,
+	xfs_lsn_t	lsn,
+	uint		flags)
+{
+	int	error;
+
+	trace_xfs_log_force(mp, lsn);
+	error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
+	if (error)
+		xfs_warn(mp, "%s: error %d returned.", __func__, error);
+}
+
+/*
+ * Called when we want to mark the current iclog as being ready to sync to
+ * disk.
+ */
+STATIC void
+xlog_state_want_sync(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog)
+{
+	assert_spin_locked(&log->l_icloglock);
+
+	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+		xlog_state_switch_iclogs(log, iclog, 0);
+	} else {
+		ASSERT(iclog->ic_state &
+			(XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
+	}
+}
+
+
+/*****************************************************************************
+ *
+ *		TICKET functions
+ *
+ *****************************************************************************
+ */
+
+/*
+ * Free a used ticket when its refcount falls to zero.
+ */
+void
+xfs_log_ticket_put(
+	xlog_ticket_t	*ticket)
+{
+	ASSERT(atomic_read(&ticket->t_ref) > 0);
+	if (atomic_dec_and_test(&ticket->t_ref))
+		kmem_zone_free(xfs_log_ticket_zone, ticket);
+}
+
+xlog_ticket_t *
+xfs_log_ticket_get(
+	xlog_ticket_t	*ticket)
+{
+	ASSERT(atomic_read(&ticket->t_ref) > 0);
+	atomic_inc(&ticket->t_ref);
+	return ticket;
+}
+
+/*
+ * Figure out the total log space unit (in bytes) that would be
+ * required for a log ticket.
+ */
+int
+xfs_log_calc_unit_res(
+	struct xfs_mount	*mp,
+	int			unit_bytes)
+{
+	struct xlog		*log = mp->m_log;
+	int			iclog_space;
+	uint			num_headers;
+
+	/*
+	 * Permanent reservations have up to 'cnt'-1 active log operations
+	 * in the log.  A unit in this case is the amount of space for one
+	 * of these log operations.  Normal reservations have a cnt of 1
+	 * and their unit amount is the total amount of space required.
+	 *
+	 * The following lines of code account for non-transaction data
+	 * which occupy space in the on-disk log.
+	 *
+	 * Normal form of a transaction is:
+	 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
+	 * and then there are LR hdrs, split-recs and roundoff at end of syncs.
+	 *
+	 * We need to account for all the leadup data and trailer data
+	 * around the transaction data.
+	 * And then we need to account for the worst case in terms of using
+	 * more space.
+	 * The worst case will happen if:
+	 * - the placement of the transaction happens to be such that the
+	 *   roundoff is at its maximum
+	 * - the transaction data is synced before the commit record is synced
+	 *   i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
+	 *   Therefore the commit record is in its own Log Record.
+	 *   This can happen as the commit record is called with its
+	 *   own region to xlog_write().
+	 *   This then means that in the worst case, roundoff can happen for
+	 *   the commit-rec as well.
+	 *   The commit-rec is smaller than padding in this scenario and so it is
+	 *   not added separately.
+	 */
+
+	/* for trans header */
+	unit_bytes += sizeof(xlog_op_header_t);
+	unit_bytes += sizeof(xfs_trans_header_t);
+
+	/* for start-rec */
+	unit_bytes += sizeof(xlog_op_header_t);
+
+	/*
+	 * for LR headers - the space for data in an iclog is the size minus
+	 * the space used for the headers. If we use the iclog size, then we
+	 * undercalculate the number of headers required.
+	 *
+	 * Furthermore - the addition of op headers for split-recs might
+	 * increase the space required enough to require more log and op
+	 * headers, so take that into account too.
+	 *
+	 * IMPORTANT: This reservation makes the assumption that if this
+	 * transaction is the first in an iclog and hence has the LR headers
+	 * accounted to it, then the remaining space in the iclog is
+	 * exclusively for this transaction.  i.e. if the transaction is larger
+	 * than the iclog, it will be the only thing in that iclog.
+	 * Fundamentally, this means we must pass the entire log vector to
+	 * xlog_write to guarantee this.
+	 */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	num_headers = howmany(unit_bytes, iclog_space);
+
+	/* for split-recs - ophdrs added when data split over LRs */
+	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+
+	/* add extra header reservations if we overrun */
+	while (!num_headers ||
+	       howmany(unit_bytes, iclog_space) > num_headers) {
+		unit_bytes += sizeof(xlog_op_header_t);
+		num_headers++;
+	}
+	unit_bytes += log->l_iclog_hsize * num_headers;
+
+	/* for commit-rec LR header - note: padding will subsume the ophdr */
+	unit_bytes += log->l_iclog_hsize;
+
+	/* for roundoff padding for transaction data and one for commit record */
+	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
+		/* log su roundoff */
+		unit_bytes += 2 * mp->m_sb.sb_logsunit;
+	} else {
+		/* BB roundoff */
+		unit_bytes += 2 * BBSIZE;
+        }
+
+	return unit_bytes;
+}
+
+/*
+ * Allocate and initialise a new log ticket.
+ */
+struct xlog_ticket *
+xlog_ticket_alloc(
+	struct xlog		*log,
+	int			unit_bytes,
+	int			cnt,
+	char			client,
+	bool			permanent,
+	xfs_km_flags_t		alloc_flags)
+{
+	struct xlog_ticket	*tic;
+	int			unit_res;
+
+	tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
+	if (!tic)
+		return NULL;
+
+	unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+
+	atomic_set(&tic->t_ref, 1);
+	tic->t_task		= current;
+	INIT_LIST_HEAD(&tic->t_queue);
+	tic->t_unit_res		= unit_res;
+	tic->t_curr_res		= unit_res;
+	tic->t_cnt		= cnt;
+	tic->t_ocnt		= cnt;
+	tic->t_tid		= prandom_u32();
+	tic->t_clientid		= client;
+	tic->t_flags		= XLOG_TIC_INITED;
+	tic->t_trans_type	= 0;
+	if (permanent)
+		tic->t_flags |= XLOG_TIC_PERM_RESERV;
+
+	xlog_tic_reset_res(tic);
+
+	return tic;
+}
+
+
+/******************************************************************************
+ *
+ *		Log debug routines
+ *
+ ******************************************************************************
+ */
+#if defined(DEBUG)
+/*
+ * Make sure that the destination ptr is within the valid data region of
+ * one of the iclogs.  This uses backup pointers stored in a different
+ * part of the log in case we trash the log structure.
+ */
+void
+xlog_verify_dest_ptr(
+	struct xlog	*log,
+	char		*ptr)
+{
+	int i;
+	int good_ptr = 0;
+
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		if (ptr >= log->l_iclog_bak[i] &&
+		    ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
+			good_ptr++;
+	}
+
+	if (!good_ptr)
+		xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
+}
+
+/*
+ * Check to make sure the grant write head didn't just over lap the tail.  If
+ * the cycles are the same, we can't be overlapping.  Otherwise, make sure that
+ * the cycles differ by exactly one and check the byte count.
+ *
+ * This check is run unlocked, so can give false positives. Rather than assert
+ * on failures, use a warn-once flag and a panic tag to allow the admin to
+ * determine if they want to panic the machine when such an error occurs. For
+ * debug kernels this will have the same effect as using an assert but, unlinke
+ * an assert, it can be turned off at runtime.
+ */
+STATIC void
+xlog_verify_grant_tail(
+	struct xlog	*log)
+{
+	int		tail_cycle, tail_blocks;
+	int		cycle, space;
+
+	xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
+	xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
+	if (tail_cycle != cycle) {
+		if (cycle - 1 != tail_cycle &&
+		    !(log->l_flags & XLOG_TAIL_WARN)) {
+			xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
+				"%s: cycle - 1 != tail_cycle", __func__);
+			log->l_flags |= XLOG_TAIL_WARN;
+		}
+
+		if (space > BBTOB(tail_blocks) &&
+		    !(log->l_flags & XLOG_TAIL_WARN)) {
+			xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
+				"%s: space > BBTOB(tail_blocks)", __func__);
+			log->l_flags |= XLOG_TAIL_WARN;
+		}
+	}
+}
+
+/* check if it will fit */
+STATIC void
+xlog_verify_tail_lsn(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	xfs_lsn_t		tail_lsn)
+{
+    int blocks;
+
+    if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
+	blocks =
+	    log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
+	if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
+		xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
+    } else {
+	ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
+
+	if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
+		xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
+
+	blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
+	if (blocks < BTOBB(iclog->ic_offset) + 1)
+		xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
+    }
+}	/* xlog_verify_tail_lsn */
+
+/*
+ * Perform a number of checks on the iclog before writing to disk.
+ *
+ * 1. Make sure the iclogs are still circular
+ * 2. Make sure we have a good magic number
+ * 3. Make sure we don't have magic numbers in the data
+ * 4. Check fields of each log operation header for:
+ *	A. Valid client identifier
+ *	B. tid ptr value falls in valid ptr space (user space code)
+ *	C. Length in log record header is correct according to the
+ *		individual operation headers within record.
+ * 5. When a bwrite will occur within 5 blocks of the front of the physical
+ *	log, check the preceding blocks of the physical log to make sure all
+ *	the cycle numbers agree with the current cycle number.
+ */
+STATIC void
+xlog_verify_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	int			count,
+	bool                    syncing)
+{
+	xlog_op_header_t	*ophead;
+	xlog_in_core_t		*icptr;
+	xlog_in_core_2_t	*xhdr;
+	xfs_caddr_t		ptr;
+	xfs_caddr_t		base_ptr;
+	__psint_t		field_offset;
+	__uint8_t		clientid;
+	int			len, i, j, k, op_len;
+	int			idx;
+
+	/* check validity of iclog pointers */
+	spin_lock(&log->l_icloglock);
+	icptr = log->l_iclog;
+	for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
+		ASSERT(icptr);
+
+	if (icptr != log->l_iclog)
+		xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
+	spin_unlock(&log->l_icloglock);
+
+	/* check log magic numbers */
+	if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
+		xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
+
+	ptr = (xfs_caddr_t) &iclog->ic_header;
+	for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
+	     ptr += BBSIZE) {
+		if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
+			xfs_emerg(log->l_mp, "%s: unexpected magic num",
+				__func__);
+	}
+
+	/* check fields */
+	len = be32_to_cpu(iclog->ic_header.h_num_logops);
+	ptr = iclog->ic_datap;
+	base_ptr = ptr;
+	ophead = (xlog_op_header_t *)ptr;
+	xhdr = iclog->ic_data;
+	for (i = 0; i < len; i++) {
+		ophead = (xlog_op_header_t *)ptr;
+
+		/* clientid is only 1 byte */
+		field_offset = (__psint_t)
+			       ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+		if (!syncing || (field_offset & 0x1ff)) {
+			clientid = ophead->oh_clientid;
+		} else {
+			idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				clientid = xlog_get_client_id(
+					xhdr[j].hic_xheader.xh_cycle_data[k]);
+			} else {
+				clientid = xlog_get_client_id(
+					iclog->ic_header.h_cycle_data[idx]);
+			}
+		}
+		if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+			xfs_warn(log->l_mp,
+				"%s: invalid clientid %d op 0x%p offset 0x%lx",
+				__func__, clientid, ophead,
+				(unsigned long)field_offset);
+
+		/* check length */
+		field_offset = (__psint_t)
+			       ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+		if (!syncing || (field_offset & 0x1ff)) {
+			op_len = be32_to_cpu(ophead->oh_len);
+		} else {
+			idx = BTOBBT((__psint_t)&ophead->oh_len -
+				    (__psint_t)iclog->ic_datap);
+			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
+			} else {
+				op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
+			}
+		}
+		ptr += sizeof(xlog_op_header_t) + op_len;
+	}
+}	/* xlog_verify_iclog */
+#endif
+
+/*
+ * Mark all iclogs IOERROR. l_icloglock is held by the caller.
+ */
+STATIC int
+xlog_state_ioerror(
+	struct xlog	*log)
+{
+	xlog_in_core_t	*iclog, *ic;
+
+	iclog = log->l_iclog;
+	if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
+		/*
+		 * Mark all the incore logs IOERROR.
+		 * From now on, no log flushes will result.
+		 */
+		ic = iclog;
+		do {
+			ic->ic_state = XLOG_STATE_IOERROR;
+			ic = ic->ic_next;
+		} while (ic != iclog);
+		return 0;
+	}
+	/*
+	 * Return non-zero, if state transition has already happened.
+	 */
+	return 1;
+}
+
+/*
+ * This is called from xfs_force_shutdown, when we're forcibly
+ * shutting down the filesystem, typically because of an IO error.
+ * Our main objectives here are to make sure that:
+ *	a. if !logerror, flush the logs to disk. Anything modified
+ *	   after this is ignored.
+ *	b. the filesystem gets marked 'SHUTDOWN' for all interested
+ *	   parties to find out, 'atomically'.
+ *	c. those who're sleeping on log reservations, pinned objects and
+ *	    other resources get woken up, and be told the bad news.
+ *	d. nothing new gets queued up after (b) and (c) are done.
+ *
+ * Note: for the !logerror case we need to flush the regions held in memory out
+ * to disk first. This needs to be done before the log is marked as shutdown,
+ * otherwise the iclog writes will fail.
+ */
+int
+xfs_log_force_umount(
+	struct xfs_mount	*mp,
+	int			logerror)
+{
+	struct xlog	*log;
+	int		retval;
+
+	log = mp->m_log;
+
+	/*
+	 * If this happens during log recovery, don't worry about
+	 * locking; the log isn't open for business yet.
+	 */
+	if (!log ||
+	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
+		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
+		if (mp->m_sb_bp)
+			XFS_BUF_DONE(mp->m_sb_bp);
+		return 0;
+	}
+
+	/*
+	 * Somebody could've already done the hard work for us.
+	 * No need to get locks for this.
+	 */
+	if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
+		ASSERT(XLOG_FORCED_SHUTDOWN(log));
+		return 1;
+	}
+
+	/*
+	 * Flush all the completed transactions to disk before marking the log
+	 * being shut down. We need to do it in this order to ensure that
+	 * completed operations are safely on disk before we shut down, and that
+	 * we don't have to issue any buffer IO after the shutdown flags are set
+	 * to guarantee this.
+	 */
+	if (!logerror)
+		_xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+
+	/*
+	 * mark the filesystem and the as in a shutdown state and wake
+	 * everybody up to tell them the bad news.
+	 */
+	spin_lock(&log->l_icloglock);
+	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
+	if (mp->m_sb_bp)
+		XFS_BUF_DONE(mp->m_sb_bp);
+
+	/*
+	 * Mark the log and the iclogs with IO error flags to prevent any
+	 * further log IO from being issued or completed.
+	 */
+	log->l_flags |= XLOG_IO_ERROR;
+	retval = xlog_state_ioerror(log);
+	spin_unlock(&log->l_icloglock);
+
+	/*
+	 * We don't want anybody waiting for log reservations after this. That
+	 * means we have to wake up everybody queued up on reserveq as well as
+	 * writeq.  In addition, we make sure in xlog_{re}grant_log_space that
+	 * we don't enqueue anything once the SHUTDOWN flag is set, and this
+	 * action is protected by the grant locks.
+	 */
+	xlog_grant_head_wake_all(&log->l_reserve_head);
+	xlog_grant_head_wake_all(&log->l_write_head);
+
+	/*
+	 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
+	 * as if the log writes were completed. The abort handling in the log
+	 * item committed callback functions will do this again under lock to
+	 * avoid races.
+	 */
+	wake_up_all(&log->l_cilp->xc_commit_wait);
+	xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
+
+#ifdef XFSERRORDEBUG
+	{
+		xlog_in_core_t	*iclog;
+
+		spin_lock(&log->l_icloglock);
+		iclog = log->l_iclog;
+		do {
+			ASSERT(iclog->ic_callback == 0);
+			iclog = iclog->ic_next;
+		} while (iclog != log->l_iclog);
+		spin_unlock(&log->l_icloglock);
+	}
+#endif
+	/* return non-zero if log IOERROR transition had already happened */
+	return retval;
+}
+
+STATIC int
+xlog_iclogs_empty(
+	struct xlog	*log)
+{
+	xlog_in_core_t	*iclog;
+
+	iclog = log->l_iclog;
+	do {
+		/* endianness does not matter here, zero is zero in
+		 * any language.
+		 */
+		if (iclog->ic_header.h_num_logops)
+			return 0;
+		iclog = iclog->ic_next;
+	} while (iclog != log->l_iclog);
+	return 1;
+}
+
diff --git a/kernel/fs/xfs/xfs_log.h b/kernel/fs/xfs/xfs_log.h
new file mode 100644
index 000000000..84e0deb95
--- /dev/null
+++ b/kernel/fs/xfs/xfs_log.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_LOG_H__
+#define __XFS_LOG_H__
+
+struct xfs_log_vec {
+	struct xfs_log_vec	*lv_next;	/* next lv in build list */
+	int			lv_niovecs;	/* number of iovecs in lv */
+	struct xfs_log_iovec	*lv_iovecp;	/* iovec array */
+	struct xfs_log_item	*lv_item;	/* owner */
+	char			*lv_buf;	/* formatted buffer */
+	int			lv_bytes;	/* accounted space in buffer */
+	int			lv_buf_len;	/* aligned size of buffer */
+	int			lv_size;	/* size of allocated lv */
+};
+
+#define XFS_LOG_VEC_ORDERED	(-1)
+
+static inline void *
+xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+		uint type)
+{
+	struct xfs_log_iovec *vec = *vecp;
+
+	if (vec) {
+		ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+		vec++;
+	} else {
+		vec = &lv->lv_iovecp[0];
+	}
+
+	vec->i_type = type;
+	vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+	ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+
+	*vecp = vec;
+	return vec->i_addr;
+}
+
+/*
+ * We need to make sure the next buffer is naturally aligned for the biggest
+ * basic data type we put into it.  We already accounted for this padding when
+ * sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ */
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
+{
+	lv->lv_buf_len += round_up(len, sizeof(uint64_t));
+	lv->lv_bytes += len;
+	vec->i_len = len;
+}
+
+static inline void *
+xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+		uint type, void *data, int len)
+{
+	void *buf;
+
+	buf = xlog_prepare_iovec(lv, vecp, type);
+	memcpy(buf, data, len);
+	xlog_finish_iovec(lv, *vecp, len);
+	return buf;
+}
+
+/*
+ * Structure used to pass callback function and the function's argument
+ * to the log manager.
+ */
+typedef struct xfs_log_callback {
+	struct xfs_log_callback	*cb_next;
+	void			(*cb_func)(void *, int);
+	void			*cb_arg;
+} xfs_log_callback_t;
+
+/*
+ * By comparing each component, we don't have to worry about extra
+ * endian issues in treating two 32 bit numbers as one 64 bit number
+ */
+static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
+{
+	if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
+		return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
+
+	if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2))
+		return (BLOCK_LSN(lsn1)<BLOCK_LSN(lsn2))? -999 : 999;
+
+	return 0;
+}
+
+#define	XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
+
+/*
+ * Macros, structures, prototypes for interface to the log manager.
+ */
+
+/*
+ * Flags to xfs_log_done()
+ */
+#define XFS_LOG_REL_PERM_RESERV	0x1
+
+/*
+ * Flags to xfs_log_force()
+ *
+ *	XFS_LOG_SYNC:	Synchronous force in-core log to disk
+ */
+#define XFS_LOG_SYNC		0x1
+
+/* Log manager interfaces */
+struct xfs_mount;
+struct xlog_in_core;
+struct xlog_ticket;
+struct xfs_log_item;
+struct xfs_item_ops;
+struct xfs_trans;
+struct xfs_log_callback;
+
+xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
+		       struct xlog_ticket *ticket,
+		       struct xlog_in_core **iclog,
+		       uint		flags);
+int	  _xfs_log_force(struct xfs_mount *mp,
+			 uint		flags,
+			 int		*log_forced);
+void	  xfs_log_force(struct xfs_mount	*mp,
+			uint			flags);
+int	  _xfs_log_force_lsn(struct xfs_mount *mp,
+			     xfs_lsn_t		lsn,
+			     uint		flags,
+			     int		*log_forced);
+void	  xfs_log_force_lsn(struct xfs_mount	*mp,
+			    xfs_lsn_t		lsn,
+			    uint		flags);
+int	  xfs_log_mount(struct xfs_mount	*mp,
+			struct xfs_buftarg	*log_target,
+			xfs_daddr_t		start_block,
+			int		 	num_bblocks);
+int	  xfs_log_mount_finish(struct xfs_mount *mp);
+xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
+xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
+void	  xfs_log_space_wake(struct xfs_mount *mp);
+int	  xfs_log_notify(struct xfs_mount	*mp,
+			 struct xlog_in_core	*iclog,
+			 struct xfs_log_callback *callback_entry);
+int	  xfs_log_release_iclog(struct xfs_mount *mp,
+			 struct xlog_in_core	 *iclog);
+int	  xfs_log_reserve(struct xfs_mount *mp,
+			  int		   length,
+			  int		   count,
+			  struct xlog_ticket **ticket,
+			  __uint8_t	   clientid,
+			  bool		   permanent,
+			  uint		   t_type);
+int	  xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+int	  xfs_log_unmount_write(struct xfs_mount *mp);
+void      xfs_log_unmount(struct xfs_mount *mp);
+int	  xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+int	  xfs_log_need_covered(struct xfs_mount *mp);
+
+void	  xlog_iodone(struct xfs_buf *);
+
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
+void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
+
+void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+				xfs_lsn_t *commit_lsn, int flags);
+bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+
+void	xfs_log_work_queue(struct xfs_mount *mp);
+void	xfs_log_worker(struct work_struct *work);
+void	xfs_log_quiesce(struct xfs_mount *mp);
+
+#endif	/* __XFS_LOG_H__ */
diff --git a/kernel/fs/xfs/xfs_log_cil.c b/kernel/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000..45cc0ce18
--- /dev/null
+++ b/kernel/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,998 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_discard.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+
+/*
+ * Allocate a new ticket. Failing to get a new ticket makes it really hard to
+ * recover, so we don't allow failure here. Also, we allocate in a context that
+ * we don't want to be issuing transactions from, so we need to tell the
+ * allocation code this as well.
+ *
+ * We don't reserve any space for the ticket - we are going to steal whatever
+ * space we require from transactions as they commit. To ensure we reserve all
+ * the space required, we need to set the current reservation of the ticket to
+ * zero so that we know to steal the initial transaction overhead from the
+ * first transaction commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+	struct xlog	*log)
+{
+	struct xlog_ticket *tic;
+
+	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
+				KM_SLEEP|KM_NOFS);
+	tic->t_trans_type = XFS_TRANS_CHECKPOINT;
+
+	/*
+	 * set the current reservation to zero so we know to steal the basic
+	 * transaction overhead reservation from the first transaction commit.
+	 */
+	tic->t_curr_res = 0;
+	return tic;
+}
+
+/*
+ * After the first stage of log recovery is done, we know where the head and
+ * tail of the log are. We need this log initialisation done before we can
+ * initialise the first CIL checkpoint context.
+ *
+ * Here we allocate a log ticket to track space usage during a CIL push.  This
+ * ticket is passed to xlog_write() directly so that we don't slowly leak log
+ * space by failing to account for space used by log headers and additional
+ * region headers for split regions.
+ */
+void
+xlog_cil_init_post_recovery(
+	struct xlog	*log)
+{
+	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
+	log->l_cilp->xc_ctx->sequence = 1;
+}
+
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+	struct xlog		*log,
+	struct xfs_log_vec	*lv,
+	struct xfs_log_vec	*old_lv,
+	int			*diff_len,
+	int			*diff_iovecs)
+{
+	/* Account for the new LV being passed in */
+	if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+		*diff_len += lv->lv_bytes;
+		*diff_iovecs += lv->lv_niovecs;
+	}
+
+	/*
+	 * If there is no old LV, this is the first time we've seen the item in
+	 * this CIL context and so we need to pin it. If we are replacing the
+	 * old_lv, then remove the space it accounts for and free it.
+	 */
+	if (!old_lv)
+		lv->lv_item->li_ops->iop_pin(lv->lv_item);
+	else if (old_lv != lv) {
+		ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
+
+		*diff_len -= old_lv->lv_bytes;
+		*diff_iovecs -= old_lv->lv_niovecs;
+		kmem_free(old_lv);
+	}
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	/*
+	 * If this is the first time the item is being committed to the
+	 * CIL, store the sequence number on the log item so we can
+	 * tell in future commits whether this is the first checkpoint
+	 * the item is being committed into.
+	 */
+	if (!lv->lv_item->li_seq)
+		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
+/*
+ * Format log item into a flat buffers
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffer is then attached to the log item are then inserted into the
+ * Committed Item List for tracking until the next checkpoint is written out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer.  Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundares without needing a change in the format of the
+ * item/region encapsulation.
+ *
+ * Hence what we need to do now is change the rewrite the vector array to point
+ * to the copied region inside the buffer we just allocated. This allows us to
+ * format the regions into the iclog as though they are being formatted
+ * directly out of the objects themselves.
+ */
+static void
+xlog_cil_insert_format_items(
+	struct xlog		*log,
+	struct xfs_trans	*tp,
+	int			*diff_len,
+	int			*diff_iovecs)
+{
+	struct xfs_log_item_desc *lidp;
+
+
+	/* Bail out if we didn't find a log item.  */
+	if (list_empty(&tp->t_items)) {
+		ASSERT(0);
+		return;
+	}
+
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		struct xfs_log_item *lip = lidp->lid_item;
+		struct xfs_log_vec *lv;
+		struct xfs_log_vec *old_lv;
+		int	niovecs = 0;
+		int	nbytes = 0;
+		int	buf_size;
+		bool	ordered = false;
+
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+			continue;
+
+		/* get number of vecs and size of data to be stored */
+		lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+		/* Skip items that do not have any vectors for writing */
+		if (!niovecs)
+			continue;
+
+		/*
+		 * Ordered items need to be tracked but we do not wish to write
+		 * them. We need a logvec to track the object, but we do not
+		 * need an iovec or buffer to be allocated for copying data.
+		 */
+		if (niovecs == XFS_LOG_VEC_ORDERED) {
+			ordered = true;
+			niovecs = 0;
+			nbytes = 0;
+		}
+
+		/*
+		 * We 64-bit align the length of each iovec so that the start
+		 * of the next one is naturally aligned.  We'll need to
+		 * account for that slack space here. Then round nbytes up
+		 * to 64-bit alignment so that the initial buffer alignment is
+		 * easy to calculate and verify.
+		 */
+		nbytes += niovecs * sizeof(uint64_t);
+		nbytes = round_up(nbytes, sizeof(uint64_t));
+
+		/* grab the old item if it exists for reservation accounting */
+		old_lv = lip->li_lv;
+
+		/*
+		 * The data buffer needs to start 64-bit aligned, so round up
+		 * that space to ensure we can align it appropriately and not
+		 * overrun the buffer.
+		 */
+		buf_size = nbytes +
+			   round_up((sizeof(struct xfs_log_vec) +
+				     niovecs * sizeof(struct xfs_log_iovec)),
+				    sizeof(uint64_t));
+
+		/* compare to existing item size */
+		if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+			/* same or smaller, optimise common overwrite case */
+			lv = lip->li_lv;
+			lv->lv_next = NULL;
+
+			if (ordered)
+				goto insert;
+
+			/*
+			 * set the item up as though it is a new insertion so
+			 * that the space reservation accounting is correct.
+			 */
+			*diff_iovecs -= lv->lv_niovecs;
+			*diff_len -= lv->lv_bytes;
+		} else {
+			/* allocate new data chunk */
+			lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+			lv->lv_item = lip;
+			lv->lv_size = buf_size;
+			if (ordered) {
+				/* track as an ordered logvec */
+				ASSERT(lip->li_lv == NULL);
+				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+				goto insert;
+			}
+			lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+		}
+
+		/* Ensure the lv is set up according to ->iop_size */
+		lv->lv_niovecs = niovecs;
+
+		/* The allocated data region lies beyond the iovec region */
+		lv->lv_buf_len = 0;
+		lv->lv_bytes = 0;
+		lv->lv_buf = (char *)lv + buf_size - nbytes;
+		ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
+
+		lip->li_ops->iop_format(lip, lv);
+insert:
+		ASSERT(lv->lv_buf_len <= nbytes);
+		xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+	}
+}
+
+/*
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we added to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ */
+static void
+xlog_cil_insert_items(
+	struct xlog		*log,
+	struct xfs_trans	*tp)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	struct xfs_log_item_desc *lidp;
+	int			len = 0;
+	int			diff_iovecs = 0;
+	int			iclog_space;
+
+	ASSERT(tp);
+
+	/*
+	 * We can do this safely because the context can't checkpoint until we
+	 * are done so it doesn't matter exactly how we update the CIL.
+	 */
+	xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+
+	/*
+	 * Now (re-)position everything modified at the tail of the CIL.
+	 * We do this here so we only need to take the CIL lock once during
+	 * the transaction commit.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		struct xfs_log_item	*lip = lidp->lid_item;
+
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+			continue;
+
+		list_move_tail(&lip->li_cil, &cil->xc_cil);
+	}
+
+	/* account for space used by new iovec headers  */
+	len += diff_iovecs * sizeof(xlog_op_header_t);
+	ctx->nvecs += diff_iovecs;
+
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy))
+		list_splice_init(&tp->t_busy, &ctx->busy_extents);
+
+	/*
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
+	 */
+	if (ctx->ticket->t_curr_res == 0) {
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		tp->t_ticket->t_curr_res -= hdrs;
+		ASSERT(tp->t_ticket->t_curr_res >= len);
+	}
+	tp->t_ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
+}
+
+static void
+xlog_cil_free_logvec(
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
+		kmem_free(lv);
+		lv = next;
+	}
+}
+
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ */
+static void
+xlog_cil_committed(
+	void	*args,
+	int	abort)
+{
+	struct xfs_cil_ctx	*ctx = args;
+	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;
+
+	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+					ctx->start_lsn, abort);
+
+	xfs_extent_busy_sort(&ctx->busy_extents);
+	xfs_extent_busy_clear(mp, &ctx->busy_extents,
+			     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
+
+	/*
+	 * If we are aborting the commit, wake up anyone waiting on the
+	 * committing list.  If we don't, then a shutdown we can leave processes
+	 * waiting in xlog_cil_force_lsn() waiting on a sequence commit that
+	 * will never happen because we aborted it.
+	 */
+	spin_lock(&ctx->cil->xc_push_lock);
+	if (abort)
+		wake_up_all(&ctx->cil->xc_commit_wait);
+	list_del(&ctx->committing);
+	spin_unlock(&ctx->cil->xc_push_lock);
+
+	xlog_cil_free_logvec(ctx->lv_chain);
+
+	if (!list_empty(&ctx->busy_extents)) {
+		ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+
+		xfs_discard_extents(mp, &ctx->busy_extents);
+		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
+	}
+
+	kmem_free(ctx);
+}
+
+/*
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
+ * is a background flush and so we can chose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allows log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
+ */
+STATIC int
+xlog_cil_push(
+	struct xlog		*log)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*lv;
+	struct xfs_cil_ctx	*ctx;
+	struct xfs_cil_ctx	*new_ctx;
+	struct xlog_in_core	*commit_iclog;
+	struct xlog_ticket	*tic;
+	int			num_iovecs;
+	int			error = 0;
+	struct xfs_trans_header thdr;
+	struct xfs_log_iovec	lhdr;
+	struct xfs_log_vec	lvhdr = { NULL };
+	xfs_lsn_t		commit_lsn;
+	xfs_lsn_t		push_seq;
+
+	if (!cil)
+		return 0;
+
+	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+	new_ctx->ticket = xlog_cil_ticket_alloc(log);
+
+	down_write(&cil->xc_ctx_lock);
+	ctx = cil->xc_ctx;
+
+	spin_lock(&cil->xc_push_lock);
+	push_seq = cil->xc_push_seq;
+	ASSERT(push_seq <= ctx->sequence);
+
+	/*
+	 * Check if we've anything to push. If there is nothing, then we don't
+	 * move on to a new sequence number and so we have to be able to push
+	 * this sequence again later.
+	 */
+	if (list_empty(&cil->xc_cil)) {
+		cil->xc_push_seq = 0;
+		spin_unlock(&cil->xc_push_lock);
+		goto out_skip;
+	}
+
+
+	/* check for a previously pushed seqeunce */
+	if (push_seq < cil->xc_ctx->sequence) {
+		spin_unlock(&cil->xc_push_lock);
+		goto out_skip;
+	}
+
+	/*
+	 * We are now going to push this context, so add it to the committing
+	 * list before we do anything else. This ensures that anyone waiting on
+	 * this push can easily detect the difference between a "push in
+	 * progress" and "CIL is empty, nothing to do".
+	 *
+	 * IOWs, a wait loop can now check for:
+	 *	the current sequence not being found on the committing list;
+	 *	an empty CIL; and
+	 *	an unchanged sequence number
+	 * to detect a push that had nothing to do and therefore does not need
+	 * waiting on. If the CIL is not empty, we get put on the committing
+	 * list before emptying the CIL and bumping the sequence number. Hence
+	 * an empty CIL and an unchanged sequence number means we jumped out
+	 * above after doing nothing.
+	 *
+	 * Hence the waiter will either find the commit sequence on the
+	 * committing list or the sequence number will be unchanged and the CIL
+	 * still dirty. In that latter case, the push has not yet started, and
+	 * so the waiter will have to continue trying to check the CIL
+	 * committing list until it is found. In extreme cases of delay, the
+	 * sequence may fully commit between the attempts the wait makes to wait
+	 * on the commit sequence.
+	 */
+	list_add(&ctx->committing, &cil->xc_committing);
+	spin_unlock(&cil->xc_push_lock);
+
+	/*
+	 * pull all the log vectors off the items in the CIL, and
+	 * remove the items from the CIL. We don't need the CIL lock
+	 * here because it's only needed on the transaction commit
+	 * side which is currently locked out by the flush lock.
+	 */
+	lv = NULL;
+	num_iovecs = 0;
+	while (!list_empty(&cil->xc_cil)) {
+		struct xfs_log_item	*item;
+
+		item = list_first_entry(&cil->xc_cil,
+					struct xfs_log_item, li_cil);
+		list_del_init(&item->li_cil);
+		if (!ctx->lv_chain)
+			ctx->lv_chain = item->li_lv;
+		else
+			lv->lv_next = item->li_lv;
+		lv = item->li_lv;
+		item->li_lv = NULL;
+		num_iovecs += lv->lv_niovecs;
+	}
+
+	/*
+	 * initialise the new context and attach it to the CIL. Then attach
+	 * the current context to the CIL committing lsit so it can be found
+	 * during log forces to extract the commit lsn of the sequence that
+	 * needs to be forced.
+	 */
+	INIT_LIST_HEAD(&new_ctx->committing);
+	INIT_LIST_HEAD(&new_ctx->busy_extents);
+	new_ctx->sequence = ctx->sequence + 1;
+	new_ctx->cil = cil;
+	cil->xc_ctx = new_ctx;
+
+	/*
+	 * The switch is now done, so we can drop the context lock and move out
+	 * of a shared context. We can't just go straight to the commit record,
+	 * though - we need to synchronise with previous and future commits so
+	 * that the commit records are correctly ordered in the log to ensure
+	 * that we process items during log IO completion in the correct order.
+	 *
+	 * For example, if we get an EFI in one checkpoint and the EFD in the
+	 * next (e.g. due to log forces), we do not want the checkpoint with
+	 * the EFD to be committed before the checkpoint with the EFI.  Hence
+	 * we must strictly order the commit records of the checkpoints so
+	 * that: a) the checkpoint callbacks are attached to the iclogs in the
+	 * correct order; and b) the checkpoints are replayed in correct order
+	 * in log recovery.
+	 *
+	 * Hence we need to add this context to the committing context list so
+	 * that higher sequences will wait for us to write out a commit record
+	 * before they do.
+	 *
+	 * xfs_log_force_lsn requires us to mirror the new sequence into the cil
+	 * structure atomically with the addition of this sequence to the
+	 * committing list. This also ensures that we can do unlocked checks
+	 * against the current sequence in log forces without risking
+	 * deferencing a freed context pointer.
+	 */
+	spin_lock(&cil->xc_push_lock);
+	cil->xc_current_sequence = new_ctx->sequence;
+	spin_unlock(&cil->xc_push_lock);
+	up_write(&cil->xc_ctx_lock);
+
+	/*
+	 * Build a checkpoint transaction header and write it to the log to
+	 * begin the transaction. We need to account for the space used by the
+	 * transaction header here as it is not accounted for in xlog_write().
+	 *
+	 * The LSN we need to pass to the log items on transaction commit is
+	 * the LSN reported by the first log vector write. If we use the commit
+	 * record lsn then we can move the tail beyond the grant write head.
+	 */
+	tic = ctx->ticket;
+	thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+	thdr.th_type = XFS_TRANS_CHECKPOINT;
+	thdr.th_tid = tic->t_tid;
+	thdr.th_num_items = num_iovecs;
+	lhdr.i_addr = &thdr;
+	lhdr.i_len = sizeof(xfs_trans_header_t);
+	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
+	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
+
+	lvhdr.lv_niovecs = 1;
+	lvhdr.lv_iovecp = &lhdr;
+	lvhdr.lv_next = ctx->lv_chain;
+
+	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+	if (error)
+		goto out_abort_free_ticket;
+
+	/*
+	 * now that we've written the checkpoint into the log, strictly
+	 * order the commit records so replay will get them in the right order.
+	 */
+restart:
+	spin_lock(&cil->xc_push_lock);
+	list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
+		/*
+		 * Avoid getting stuck in this loop because we were woken by the
+		 * shutdown, but then went back to sleep once already in the
+		 * shutdown state.
+		 */
+		if (XLOG_FORCED_SHUTDOWN(log)) {
+			spin_unlock(&cil->xc_push_lock);
+			goto out_abort_free_ticket;
+		}
+
+		/*
+		 * Higher sequences will wait for this one so skip them.
+		 * Don't wait for our own sequence, either.
+		 */
+		if (new_ctx->sequence >= ctx->sequence)
+			continue;
+		if (!new_ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
+			goto restart;
+		}
+	}
+	spin_unlock(&cil->xc_push_lock);
+
+	/* xfs_log_done always frees the ticket on error. */
+	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+	if (commit_lsn == -1)
+		goto out_abort;
+
+	/* attach all the transactions w/ busy extents to iclog */
+	ctx->log_cb.cb_func = xlog_cil_committed;
+	ctx->log_cb.cb_arg = ctx;
+	error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now the checkpoint commit is complete and we've attached the
+	 * callbacks to the iclog we can assign the commit LSN to the context
+	 * and wake up anyone who is waiting for the commit to complete.
+	 */
+	spin_lock(&cil->xc_push_lock);
+	ctx->commit_lsn = commit_lsn;
+	wake_up_all(&cil->xc_commit_wait);
+	spin_unlock(&cil->xc_push_lock);
+
+	/* release the hounds! */
+	return xfs_log_release_iclog(log->l_mp, commit_iclog);
+
+out_skip:
+	up_write(&cil->xc_ctx_lock);
+	xfs_log_ticket_put(new_ctx->ticket);
+	kmem_free(new_ctx);
+	return 0;
+
+out_abort_free_ticket:
+	xfs_log_ticket_put(tic);
+out_abort:
+	xlog_cil_committed(ctx, XFS_LI_ABORTED);
+	return -EIO;
+}
+
+static void
+xlog_cil_push_work(
+	struct work_struct	*work)
+{
+	struct xfs_cil		*cil = container_of(work, struct xfs_cil,
+							xc_push_work);
+	xlog_cil_push(cil->xc_log);
+}
+
+/*
+ * We need to push CIL every so often so we don't cache more than we can fit in
+ * the log. The limit really is that a checkpoint can't be more than half the
+ * log (the current checkpoint is not allowed to overwrite the previous
+ * checkpoint), but commit latency and memory usage limit this to a smaller
+ * size.
+ */
+static void
+xlog_cil_push_background(
+	struct xlog	*log)
+{
+	struct xfs_cil	*cil = log->l_cilp;
+
+	/*
+	 * The cil won't be empty because we are called while holding the
+	 * context lock so whatever we added to the CIL will still be there
+	 */
+	ASSERT(!list_empty(&cil->xc_cil));
+
+	/*
+	 * don't do a background push if we haven't used up all the
+	 * space available yet.
+	 */
+	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		return;
+
+	spin_lock(&cil->xc_push_lock);
+	if (cil->xc_push_seq < cil->xc_current_sequence) {
+		cil->xc_push_seq = cil->xc_current_sequence;
+		queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+	}
+	spin_unlock(&cil->xc_push_lock);
+
+}
+
+/*
+ * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
+ * number that is passed. When it returns, the work will be queued for
+ * @push_seq, but it won't be completed. The caller is expected to do any
+ * waiting for push_seq to complete if it is required.
+ */
+static void
+xlog_cil_push_now(
+	struct xlog	*log,
+	xfs_lsn_t	push_seq)
+{
+	struct xfs_cil	*cil = log->l_cilp;
+
+	if (!cil)
+		return;
+
+	ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
+
+	/* start on any pending background push to minimise wait time on it */
+	flush_work(&cil->xc_push_work);
+
+	/*
+	 * If the CIL is empty or we've already pushed the sequence then
+	 * there's no work we need to do.
+	 */
+	spin_lock(&cil->xc_push_lock);
+	if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
+		spin_unlock(&cil->xc_push_lock);
+		return;
+	}
+
+	cil->xc_push_seq = push_seq;
+	queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+	spin_unlock(&cil->xc_push_lock);
+}
+
+bool
+xlog_cil_empty(
+	struct xlog	*log)
+{
+	struct xfs_cil	*cil = log->l_cilp;
+	bool		empty = false;
+
+	spin_lock(&cil->xc_push_lock);
+	if (list_empty(&cil->xc_cil))
+		empty = true;
+	spin_unlock(&cil->xc_push_lock);
+	return empty;
+}
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+void
+xfs_log_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct xlog		*log = mp->m_log;
+	struct xfs_cil		*cil = log->l_cilp;
+	int			log_flags = 0;
+
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
+	/* lock out background commit */
+	down_read(&cil->xc_ctx_lock);
+
+	xlog_cil_insert_items(log, tp);
+
+	/* check we didn't blow the reservation */
+	if (tp->t_ticket->t_curr_res < 0)
+		xlog_print_tic_res(mp, tp->t_ticket);
+
+	tp->t_commit_lsn = cil->xc_ctx->sequence;
+	if (commit_lsn)
+		*commit_lsn = tp->t_commit_lsn;
+
+	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/*
+	 * Once all the items of the transaction have been copied to the CIL,
+	 * the items can be unlocked and freed.
+	 *
+	 * This needs to be done before we drop the CIL context lock because we
+	 * have to update state in the log items and unlock them before they go
+	 * to disk. If we don't, then the CIL checkpoint can race with us and
+	 * we can run checkpoint completion before we've updated and unlocked
+	 * the log items. This affects (at least) processing of stale buffers,
+	 * inodes and EFIs.
+	 */
+	xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
+
+	xlog_cil_push_background(log);
+
+	up_read(&cil->xc_ctx_lock);
+}
+
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence
+ * number given. Hence the only time we will trigger a push here is
+ * if the push sequence is the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if a
+ * iclog flush is necessary following this call.
+ */
+xfs_lsn_t
+xlog_cil_force_lsn(
+	struct xlog	*log,
+	xfs_lsn_t	sequence)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx;
+	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;
+
+	ASSERT(sequence <= cil->xc_current_sequence);
+
+	/*
+	 * check to see if we need to force out the current context.
+	 * xlog_cil_push() handles racing pushes for the same sequence,
+	 * so no need to deal with it here.
+	 */
+restart:
+	xlog_cil_push_now(log, sequence);
+
+	/*
+	 * See if we can find a previous sequence still committing.
+	 * We need to wait for all previous sequence commits to complete
+	 * before allowing the force of push_seq to go ahead. Hence block
+	 * on commits for those as well.
+	 */
+	spin_lock(&cil->xc_push_lock);
+	list_for_each_entry(ctx, &cil->xc_committing, committing) {
+		/*
+		 * Avoid getting stuck in this loop because we were woken by the
+		 * shutdown, but then went back to sleep once already in the
+		 * shutdown state.
+		 */
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto out_shutdown;
+		if (ctx->sequence > sequence)
+			continue;
+		if (!ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
+			goto restart;
+		}
+		if (ctx->sequence != sequence)
+			continue;
+		/* found it! */
+		commit_lsn = ctx->commit_lsn;
+	}
+
+	/*
+	 * The call to xlog_cil_push_now() executes the push in the background.
+	 * Hence by the time we have got here it our sequence may not have been
+	 * pushed yet. This is true if the current sequence still matches the
+	 * push sequence after the above wait loop and the CIL still contains
+	 * dirty objects. This is guaranteed by the push code first adding the
+	 * context to the committing list before emptying the CIL.
+	 *
+	 * Hence if we don't find the context in the committing list and the
+	 * current sequence number is unchanged then the CIL contents are
+	 * significant.  If the CIL is empty, if means there was nothing to push
+	 * and that means there is nothing to wait for. If the CIL is not empty,
+	 * it means we haven't yet started the push, because if it had started
+	 * we would have found the context on the committing list.
+	 */
+	if (sequence == cil->xc_current_sequence &&
+	    !list_empty(&cil->xc_cil)) {
+		spin_unlock(&cil->xc_push_lock);
+		goto restart;
+	}
+
+	spin_unlock(&cil->xc_push_lock);
+	return commit_lsn;
+
+	/*
+	 * We detected a shutdown in progress. We need to trigger the log force
+	 * to pass through it's iclog state machine error handling, even though
+	 * we are already in a shutdown state. Hence we can't return
+	 * NULLCOMMITLSN here as that has special meaning to log forces (i.e.
+	 * LSN is already stable), so we return a zero LSN instead.
+	 */
+out_shutdown:
+	spin_unlock(&cil->xc_push_lock);
+	return 0;
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+bool
+xfs_log_item_in_current_chkpt(
+	struct xfs_log_item *lip)
+{
+	struct xfs_cil_ctx *ctx;
+
+	if (list_empty(&lip->li_cil))
+		return false;
+
+	ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+
+	/*
+	 * li_seq is written on the first commit of a log item to record the
+	 * first checkpoint it is written to. Hence if it is different to the
+	 * current sequence, we're in a new checkpoint.
+	 */
+	if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+		return false;
+	return true;
+}
+
+/*
+ * Perform initial CIL structure initialisation.
+ */
+int
+xlog_cil_init(
+	struct xlog	*log)
+{
+	struct xfs_cil	*cil;
+	struct xfs_cil_ctx *ctx;
+
+	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+	if (!cil)
+		return -ENOMEM;
+
+	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+	if (!ctx) {
+		kmem_free(cil);
+		return -ENOMEM;
+	}
+
+	INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
+	INIT_LIST_HEAD(&cil->xc_cil);
+	INIT_LIST_HEAD(&cil->xc_committing);
+	spin_lock_init(&cil->xc_cil_lock);
+	spin_lock_init(&cil->xc_push_lock);
+	init_rwsem(&cil->xc_ctx_lock);
+	init_waitqueue_head(&cil->xc_commit_wait);
+
+	INIT_LIST_HEAD(&ctx->committing);
+	INIT_LIST_HEAD(&ctx->busy_extents);
+	ctx->sequence = 1;
+	ctx->cil = cil;
+	cil->xc_ctx = ctx;
+	cil->xc_current_sequence = ctx->sequence;
+
+	cil->xc_log = log;
+	log->l_cilp = cil;
+	return 0;
+}
+
+void
+xlog_cil_destroy(
+	struct xlog	*log)
+{
+	if (log->l_cilp->xc_ctx) {
+		if (log->l_cilp->xc_ctx->ticket)
+			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+		kmem_free(log->l_cilp->xc_ctx);
+	}
+
+	ASSERT(list_empty(&log->l_cilp->xc_cil));
+	kmem_free(log->l_cilp);
+}
+
diff --git a/kernel/fs/xfs/xfs_log_priv.h b/kernel/fs/xfs/xfs_log_priv.h
new file mode 100644
index 000000000..db7cbdeb2
--- /dev/null
+++ b/kernel/fs/xfs/xfs_log_priv.h
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_LOG_PRIV_H__
+#define __XFS_LOG_PRIV_H__
+
+struct xfs_buf;
+struct xlog;
+struct xlog_ticket;
+struct xfs_mount;
+struct xfs_log_callback;
+
+/*
+ * Flags for log structure
+ */
+#define XLOG_ACTIVE_RECOVERY	0x2	/* in the middle of recovery */
+#define	XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
+#define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
+					   shutdown */
+#define XLOG_TAIL_WARN		0x10	/* log tail verify warning issued */
+
+/*
+ * get client id from packed copy.
+ *
+ * this hack is here because the xlog_pack code copies four bytes
+ * of xlog_op_header containing the fields oh_clientid, oh_flags
+ * and oh_res2 into the packed copy.
+ *
+ * later on this four byte chunk is treated as an int and the
+ * client id is pulled out.
+ *
+ * this has endian issues, of course.
+ */
+static inline uint xlog_get_client_id(__be32 i)
+{
+	return be32_to_cpu(i) >> 24;
+}
+
+/*
+ * In core log state
+ */
+#define XLOG_STATE_ACTIVE    0x0001 /* Current IC log being written to */
+#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
+#define XLOG_STATE_SYNCING   0x0004 /* This IC log is syncing */
+#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
+#define XLOG_STATE_DO_CALLBACK \
+			     0x0010 /* Process callback functions */
+#define XLOG_STATE_CALLBACK  0x0020 /* Callback functions now */
+#define XLOG_STATE_DIRTY     0x0040 /* Dirty IC log, not ready for ACTIVE status*/
+#define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
+#define XLOG_STATE_ALL	     0x7FFF /* All possible valid flags */
+#define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
+
+/*
+ * Flags to log ticket
+ */
+#define XLOG_TIC_INITED		0x1	/* has been initialized */
+#define XLOG_TIC_PERM_RESERV	0x2	/* permanent reservation */
+
+#define XLOG_TIC_FLAGS \
+	{ XLOG_TIC_INITED,	"XLOG_TIC_INITED" }, \
+	{ XLOG_TIC_PERM_RESERV,	"XLOG_TIC_PERM_RESERV" }
+
+/*
+ * Below are states for covering allocation transactions.
+ * By covering, we mean changing the h_tail_lsn in the last on-disk
+ * log write such that no allocation transactions will be re-done during
+ * recovery after a system crash. Recovery starts at the last on-disk
+ * log write.
+ *
+ * These states are used to insert dummy log entries to cover
+ * space allocation transactions which can undo non-transactional changes
+ * after a crash. Writes to a file with space
+ * already allocated do not result in any transactions. Allocations
+ * might include space beyond the EOF. So if we just push the EOF a
+ * little, the last transaction for the file could contain the wrong
+ * size. If there is no file system activity, after an allocation
+ * transaction, and the system crashes, the allocation transaction
+ * will get replayed and the file will be truncated. This could
+ * be hours/days/... after the allocation occurred.
+ *
+ * The fix for this is to do two dummy transactions when the
+ * system is idle. We need two dummy transaction because the h_tail_lsn
+ * in the log record header needs to point beyond the last possible
+ * non-dummy transaction. The first dummy changes the h_tail_lsn to
+ * the first transaction before the dummy. The second dummy causes
+ * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
+ *
+ * These dummy transactions get committed when everything
+ * is idle (after there has been some activity).
+ *
+ * There are 5 states used to control this.
+ *
+ *  IDLE -- no logging has been done on the file system or
+ *		we are done covering previous transactions.
+ *  NEED -- logging has occurred and we need a dummy transaction
+ *		when the log becomes idle.
+ *  DONE -- we were in the NEED state and have committed a dummy
+ *		transaction.
+ *  NEED2 -- we detected that a dummy transaction has gone to the
+ *		on disk log with no other transactions.
+ *  DONE2 -- we committed a dummy transaction when in the NEED2 state.
+ *
+ * There are two places where we switch states:
+ *
+ * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
+ *	We commit the dummy transaction and switch to DONE or DONE2,
+ *	respectively. In all other states, we don't do anything.
+ *
+ * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
+ *
+ *	No matter what state we are in, if this isn't the dummy
+ *	transaction going out, the next state is NEED.
+ *	So, if we aren't in the DONE or DONE2 states, the next state
+ *	is NEED. We can't be finishing a write of the dummy record
+ *	unless it was committed and the state switched to DONE or DONE2.
+ *
+ *	If we are in the DONE state and this was a write of the
+ *		dummy transaction, we move to NEED2.
+ *
+ *	If we are in the DONE2 state and this was a write of the
+ *		dummy transaction, we move to IDLE.
+ *
+ *
+ * Writing only one dummy transaction can get appended to
+ * one file space allocation. When this happens, the log recovery
+ * code replays the space allocation and a file could be truncated.
+ * This is why we have the NEED2 and DONE2 states before going idle.
+ */
+
+#define XLOG_STATE_COVER_IDLE	0
+#define XLOG_STATE_COVER_NEED	1
+#define XLOG_STATE_COVER_DONE	2
+#define XLOG_STATE_COVER_NEED2	3
+#define XLOG_STATE_COVER_DONE2	4
+
+#define XLOG_COVER_OPS		5
+
+/* Ticket reservation region accounting */ 
+#define XLOG_TIC_LEN_MAX	15
+
+/*
+ * Reservation region
+ * As would be stored in xfs_log_iovec but without the i_addr which
+ * we don't care about.
+ */
+typedef struct xlog_res {
+	uint	r_len;	/* region length		:4 */
+	uint	r_type;	/* region's transaction type	:4 */
+} xlog_res_t;
+
+typedef struct xlog_ticket {
+	struct list_head   t_queue;	 /* reserve/write queue */
+	struct task_struct *t_task;	 /* task that owns this ticket */
+	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4  */
+	atomic_t	   t_ref;	 /* ticket reference count       : 4  */
+	int		   t_curr_res;	 /* current reservation in bytes : 4  */
+	int		   t_unit_res;	 /* unit reservation in bytes    : 4  */
+	char		   t_ocnt;	 /* original count		 : 1  */
+	char		   t_cnt;	 /* current count		 : 1  */
+	char		   t_clientid;	 /* who does this belong to;	 : 1  */
+	char		   t_flags;	 /* properties of reservation	 : 1  */
+	uint		   t_trans_type; /* transaction type             : 4  */
+
+        /* reservation array fields */
+	uint		   t_res_num;                    /* num in array : 4 */
+	uint		   t_res_num_ophdrs;		 /* num op hdrs  : 4 */
+	uint		   t_res_arr_sum;		 /* array sum    : 4 */
+	uint		   t_res_o_flow;		 /* sum overflow : 4 */
+	xlog_res_t	   t_res_arr[XLOG_TIC_LEN_MAX];  /* array of res : 8 * 15 */ 
+} xlog_ticket_t;
+
+/*
+ * - A log record header is 512 bytes.  There is plenty of room to grow the
+ *	xlog_rec_header_t into the reserved space.
+ * - ic_data follows, so a write to disk can start at the beginning of
+ *	the iclog.
+ * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
+ * - ic_next is the pointer to the next iclog in the ring.
+ * - ic_bp is a pointer to the buffer used to write this incore log to disk.
+ * - ic_log is a pointer back to the global log structure.
+ * - ic_callback is a linked list of callback function/argument pairs to be
+ *	called after an iclog finishes writing.
+ * - ic_size is the full size of the header plus data.
+ * - ic_offset is the current number of bytes written to in this iclog.
+ * - ic_refcnt is bumped when someone is writing to the log.
+ * - ic_state is the state of the iclog.
+ *
+ * Because of cacheline contention on large machines, we need to separate
+ * various resources onto different cachelines. To start with, make the
+ * structure cacheline aligned. The following fields can be contended on
+ * by independent processes:
+ *
+ *	- ic_callback_*
+ *	- ic_refcnt
+ *	- fields protected by the global l_icloglock
+ *
+ * so we need to ensure that these fields are located in separate cachelines.
+ * We'll put all the read-only and l_icloglock fields in the first cacheline,
+ * and move everything else out to subsequent cachelines.
+ */
+typedef struct xlog_in_core {
+	wait_queue_head_t	ic_force_wait;
+	wait_queue_head_t	ic_write_wait;
+	struct xlog_in_core	*ic_next;
+	struct xlog_in_core	*ic_prev;
+	struct xfs_buf		*ic_bp;
+	struct xlog		*ic_log;
+	int			ic_size;
+	int			ic_offset;
+	int			ic_bwritecnt;
+	unsigned short		ic_state;
+	char			*ic_datap;	/* pointer to iclog data */
+
+	/* Callback structures need their own cacheline */
+	spinlock_t		ic_callback_lock ____cacheline_aligned_in_smp;
+	struct xfs_log_callback	*ic_callback;
+	struct xfs_log_callback	**ic_callback_tail;
+
+	/* reference counts need their own cacheline */
+	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
+	xlog_in_core_2_t	*ic_data;
+#define ic_header	ic_data->hic_header
+} xlog_in_core_t;
+
+/*
+ * The CIL context is used to aggregate per-transaction details as well be
+ * passed to the iclog for checkpoint post-commit processing.  After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;
+
+struct xfs_cil_ctx {
+	struct xfs_cil		*cil;
+	xfs_lsn_t		sequence;	/* chkpt sequence # */
+	xfs_lsn_t		start_lsn;	/* first LSN of chkpt commit */
+	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
+	struct xlog_ticket	*ticket;	/* chkpt ticket */
+	int			nvecs;		/* number of regions */
+	int			space_used;	/* aggregate size of regions */
+	struct list_head	busy_extents;	/* busy extents in chkpt */
+	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
+	struct xfs_log_callback	log_cb;		/* completion callback hook. */
+	struct list_head	committing;	/* ctx committing list */
+};
+
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have a the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+	struct xlog		*xc_log;
+	struct list_head	xc_cil;
+	spinlock_t		xc_cil_lock;
+
+	struct rw_semaphore	xc_ctx_lock ____cacheline_aligned_in_smp;
+	struct xfs_cil_ctx	*xc_ctx;
+
+	spinlock_t		xc_push_lock ____cacheline_aligned_in_smp;
+	xfs_lsn_t		xc_push_seq;
+	struct list_head	xc_committing;
+	wait_queue_head_t	xc_commit_wait;
+	xfs_lsn_t		xc_current_sequence;
+	struct work_struct	xc_push_work;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
+ * Whatever we choose, we have to make sure we can get a reservation for the
+ * log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, but it is not too large for
+ * the log or induces too much latency when writing out through the iclogs. We
+ * track both space consumed and the number of vectors in the checkpoint
+ * context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers.  The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can effectively make up arbitrary limits for
+ * the checkpoint size so long as they don't violate any other size rules.
+ * Recovery imposes a rule that no transaction exceed half the log, so we are
+ * limited by that.  Furthermore, the log transaction reservation subsystem
+ * tries to keep 25% of the log free, so we need to keep below that limit or we
+ * risk running out of free log space to start any new transactions.
+ *
+ * In order to keep background CIL push efficient, we will set a lower
+ * threshold at which background pushing is attempted without blocking current
+ * transaction commits.  A separate, higher bound defines when CIL pushes are
+ * enforced to ensure we stay within our maximum checkpoint size bounds.
+ * threshold, yet give us plenty of space for aggregation on large logs.
+ */
+#define XLOG_CIL_SPACE_LIMIT(log)	(log->l_logsize >> 3)
+
+/*
+ * ticket grant locks, queues and accounting have their own cachlines
+ * as these are quite hot and can be operated on concurrently.
+ */
+struct xlog_grant_head {
+	spinlock_t		lock ____cacheline_aligned_in_smp;
+	struct list_head	waiters;
+	atomic64_t		grant;
+};
+
+/*
+ * The reservation head lsn is not made up of a cycle number and block number.
+ * Instead, it uses a cycle number and byte number.  Logs don't expect to
+ * overflow 31 bits worth of byte offset, so using a byte number will mean
+ * that round off problems won't occur when releasing partial reservations.
+ */
+struct xlog {
+	/* The following fields don't need locking */
+	struct xfs_mount	*l_mp;	        /* mount point */
+	struct xfs_ail		*l_ailp;	/* AIL log is working with */
+	struct xfs_cil		*l_cilp;	/* CIL log is working with */
+	struct xfs_buf		*l_xbuf;        /* extra buffer for log
+						 * wrapping */
+	struct xfs_buftarg	*l_targ;        /* buftarg of log */
+	struct delayed_work	l_work;		/* background flush work */
+	uint			l_flags;
+	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
+	struct list_head	*l_buf_cancel_table;
+	int			l_iclog_hsize;  /* size of iclog header */
+	int			l_iclog_heads;  /* # of iclog header sectors */
+	uint			l_sectBBsize;   /* sector size in BBs (2^n) */
+	int			l_iclog_size;	/* size of log in bytes */
+	int			l_iclog_size_log; /* log power size of log */
+	int			l_iclog_bufs;	/* number of iclog buffers */
+	xfs_daddr_t		l_logBBstart;   /* start block of log */
+	int			l_logsize;      /* size of log in bytes */
+	int			l_logBBsize;    /* size of log in BB chunks */
+
+	/* The following block of fields are changed while holding icloglock */
+	wait_queue_head_t	l_flush_wait ____cacheline_aligned_in_smp;
+						/* waiting for iclog flush */
+	int			l_covered_state;/* state of "covering disk
+						 * log entries" */
+	xlog_in_core_t		*l_iclog;       /* head log queue	*/
+	spinlock_t		l_icloglock;    /* grab to change iclog state */
+	int			l_curr_cycle;   /* Cycle number of log writes */
+	int			l_prev_cycle;   /* Cycle number before last
+						 * block increment */
+	int			l_curr_block;   /* current logical log block */
+	int			l_prev_block;   /* previous logical log block */
+
+	/*
+	 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
+	 * read without needing to hold specific locks. To avoid operations
+	 * contending with other hot objects, place each of them on a separate
+	 * cacheline.
+	 */
+	/* lsn of last LR on disk */
+	atomic64_t		l_last_sync_lsn ____cacheline_aligned_in_smp;
+	/* lsn of 1st LR with unflushed * buffers */
+	atomic64_t		l_tail_lsn ____cacheline_aligned_in_smp;
+
+	struct xlog_grant_head	l_reserve_head;
+	struct xlog_grant_head	l_write_head;
+
+	struct xfs_kobj		l_kobj;
+
+	/* The following field are used for debugging; need to hold icloglock */
+#ifdef DEBUG
+	char			*l_iclog_bak[XLOG_MAX_ICLOGS];
+#endif
+
+};
+
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+	((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
+#define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
+
+/* common routines */
+extern int
+xlog_recover(
+	struct xlog		*log);
+extern int
+xlog_recover_finish(
+	struct xlog		*log);
+
+extern __le32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+			    char *dp, int size);
+
+extern kmem_zone_t *xfs_log_ticket_zone;
+struct xlog_ticket *
+xlog_ticket_alloc(
+	struct xlog	*log,
+	int		unit_bytes,
+	int		count,
+	char		client,
+	bool		permanent,
+	xfs_km_flags_t	alloc_flags);
+
+
+static inline void
+xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
+{
+	*ptr += bytes;
+	*len -= bytes;
+	*off += bytes;
+}
+
+void	xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
+int
+xlog_write(
+	struct xlog		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*tic,
+	xfs_lsn_t		*start_lsn,
+	struct xlog_in_core	**commit_iclog,
+	uint			flags);
+
+/*
+ * When we crack an atomic LSN, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from. This should always
+ * be used to sample and crack LSNs that are stored and updated in atomic
+ * variables.
+ */
+static inline void
+xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
+{
+	xfs_lsn_t val = atomic64_read(lsn);
+
+	*cycle = CYCLE_LSN(val);
+	*block = BLOCK_LSN(val);
+}
+
+/*
+ * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */
+static inline void
+xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
+{
+	atomic64_set(lsn, xlog_assign_lsn(cycle, block));
+}
+
+/*
+ * When we crack the grant head, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from.
+ */
+static inline void
+xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
+{
+	*cycle = val >> 32;
+	*space = val & 0xffffffff;
+}
+
+static inline void
+xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
+{
+	xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
+}
+
+static inline int64_t
+xlog_assign_grant_head_val(int cycle, int space)
+{
+	return ((int64_t)cycle << 32) | space;
+}
+
+static inline void
+xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
+{
+	atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
+}
+
+/*
+ * Committed Item List interfaces
+ */
+int	xlog_cil_init(struct xlog *log);
+void	xlog_cil_init_post_recovery(struct xlog *log);
+void	xlog_cil_destroy(struct xlog *log);
+bool	xlog_cil_empty(struct xlog *log);
+
+/*
+ * CIL force routines
+ */
+xfs_lsn_t
+xlog_cil_force_lsn(
+	struct xlog *log,
+	xfs_lsn_t sequence);
+
+static inline void
+xlog_cil_force(struct xlog *log)
+{
+	xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
+
+/*
+ * Unmount record type is used as a pseudo transaction type for the ticket.
+ * It's value must be outside the range of XFS_TRANS_* values.
+ */
+#define XLOG_UNMOUNT_REC_TYPE	(-1U)
+
+/*
+ * Wrapper function for waiting on a wait queue serialised against wakeups
+ * by a spinlock. This matches the semantics of all the wait queues used in the
+ * log code.
+ */
+static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue_exclusive(wq, &wait);
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(lock);
+	schedule();
+	remove_wait_queue(wq, &wait);
+}
+
+#endif	/* __XFS_LOG_PRIV_H__ */
diff --git a/kernel/fs/xfs/xfs_log_recover.c b/kernel/fs/xfs/xfs_log_recover.c
new file mode 100644
index 000000000..4f5784f85
--- /dev/null
+++ b/kernel/fs/xfs/xfs_log_recover.c
@@ -0,0 +1,4651 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+
+#define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
+
+STATIC int
+xlog_find_zeroed(
+	struct xlog	*,
+	xfs_daddr_t	*);
+STATIC int
+xlog_clear_stale_blocks(
+	struct xlog	*,
+	xfs_lsn_t);
+#if defined(DEBUG)
+STATIC void
+xlog_recover_check_summary(
+	struct xlog *);
+#else
+#define	xlog_recover_check_summary(log)
+#endif
+
+/*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+	xfs_daddr_t		bc_blkno;
+	uint			bc_len;
+	int			bc_refcount;
+	struct list_head	bc_list;
+};
+
+/*
+ * Sector aligned buffer routines for buffer create/read/write/access
+ */
+
+/*
+ * Verify the given count of basic blocks is valid number of blocks
+ * to specify for an operation involving the given XFS log buffer.
+ * Returns nonzero if the count is valid, 0 otherwise.
+ */
+
+static inline int
+xlog_buf_bbcount_valid(
+	struct xlog	*log,
+	int		bbcount)
+{
+	return bbcount > 0 && bbcount <= log->l_logBBsize;
+}
+
+/*
+ * Allocate a buffer to hold log data.  The buffer needs to be able
+ * to map to a range of nbblks basic blocks at any valid (basic
+ * block) offset within the log.
+ */
+STATIC xfs_buf_t *
+xlog_get_bp(
+	struct xlog	*log,
+	int		nbblks)
+{
+	struct xfs_buf	*bp;
+
+	if (!xlog_buf_bbcount_valid(log, nbblks)) {
+		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
+			nbblks);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
+		return NULL;
+	}
+
+	/*
+	 * We do log I/O in units of log sectors (a power-of-2
+	 * multiple of the basic block size), so we round up the
+	 * requested size to accommodate the basic blocks required
+	 * for complete log sectors.
+	 *
+	 * In addition, the buffer may be used for a non-sector-
+	 * aligned block offset, in which case an I/O of the
+	 * requested size could extend beyond the end of the
+	 * buffer.  If the requested size is only 1 basic block it
+	 * will never straddle a sector boundary, so this won't be
+	 * an issue.  Nor will this be a problem if the log I/O is
+	 * done in basic blocks (sector size 1).  But otherwise we
+	 * extend the buffer by one extra log sector to ensure
+	 * there's space to accommodate this possibility.
+	 */
+	if (nbblks > 1 && log->l_sectBBsize > 1)
+		nbblks += log->l_sectBBsize;
+	nbblks = round_up(nbblks, log->l_sectBBsize);
+
+	bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
+	if (bp)
+		xfs_buf_unlock(bp);
+	return bp;
+}
+
+STATIC void
+xlog_put_bp(
+	xfs_buf_t	*bp)
+{
+	xfs_buf_free(bp);
+}
+
+/*
+ * Return the address of the start of the given block number's data
+ * in a log buffer.  The buffer covers a log sector-aligned region.
+ */
+STATIC xfs_caddr_t
+xlog_align(
+	struct xlog	*log,
+	xfs_daddr_t	blk_no,
+	int		nbblks,
+	struct xfs_buf	*bp)
+{
+	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
+
+	ASSERT(offset + nbblks <= bp->b_length);
+	return bp->b_addr + BBTOB(offset);
+}
+
+
+/*
+ * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
+ */
+STATIC int
+xlog_bread_noalign(
+	struct xlog	*log,
+	xfs_daddr_t	blk_no,
+	int		nbblks,
+	struct xfs_buf	*bp)
+{
+	int		error;
+
+	if (!xlog_buf_bbcount_valid(log, nbblks)) {
+		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
+			nbblks);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+
+	blk_no = round_down(blk_no, log->l_sectBBsize);
+	nbblks = round_up(nbblks, log->l_sectBBsize);
+
+	ASSERT(nbblks > 0);
+	ASSERT(nbblks <= bp->b_length);
+
+	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
+	XFS_BUF_READ(bp);
+	bp->b_io_length = nbblks;
+	bp->b_error = 0;
+
+	error = xfs_buf_submit_wait(bp);
+	if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
+		xfs_buf_ioerror_alert(bp, __func__);
+	return error;
+}
+
+STATIC int
+xlog_bread(
+	struct xlog	*log,
+	xfs_daddr_t	blk_no,
+	int		nbblks,
+	struct xfs_buf	*bp,
+	xfs_caddr_t	*offset)
+{
+	int		error;
+
+	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+	if (error)
+		return error;
+
+	*offset = xlog_align(log, blk_no, nbblks, bp);
+	return 0;
+}
+
+/*
+ * Read at an offset into the buffer. Returns with the buffer in it's original
+ * state regardless of the result of the read.
+ */
+STATIC int
+xlog_bread_offset(
+	struct xlog	*log,
+	xfs_daddr_t	blk_no,		/* block to read from */
+	int		nbblks,		/* blocks to read */
+	struct xfs_buf	*bp,
+	xfs_caddr_t	offset)
+{
+	xfs_caddr_t	orig_offset = bp->b_addr;
+	int		orig_len = BBTOB(bp->b_length);
+	int		error, error2;
+
+	error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
+	if (error)
+		return error;
+
+	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+
+	/* must reset buffer pointer even on error */
+	error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
+	if (error)
+		return error;
+	return error2;
+}
+
+/*
+ * Write out the buffer at the given block for the given number of blocks.
+ * The buffer is kept locked across the write and is returned locked.
+ * This can only be used for synchronous log writes.
+ */
+STATIC int
+xlog_bwrite(
+	struct xlog	*log,
+	xfs_daddr_t	blk_no,
+	int		nbblks,
+	struct xfs_buf	*bp)
+{
+	int		error;
+
+	if (!xlog_buf_bbcount_valid(log, nbblks)) {
+		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
+			nbblks);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+
+	blk_no = round_down(blk_no, log->l_sectBBsize);
+	nbblks = round_up(nbblks, log->l_sectBBsize);
+
+	ASSERT(nbblks > 0);
+	ASSERT(nbblks <= bp->b_length);
+
+	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
+	XFS_BUF_ZEROFLAGS(bp);
+	xfs_buf_hold(bp);
+	xfs_buf_lock(bp);
+	bp->b_io_length = nbblks;
+	bp->b_error = 0;
+
+	error = xfs_bwrite(bp);
+	if (error)
+		xfs_buf_ioerror_alert(bp, __func__);
+	xfs_buf_relse(bp);
+	return error;
+}
+
+#ifdef DEBUG
+/*
+ * dump debug superblock and log record information
+ */
+STATIC void
+xlog_header_check_dump(
+	xfs_mount_t		*mp,
+	xlog_rec_header_t	*head)
+{
+	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
+		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
+	xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
+		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
+}
+#else
+#define xlog_header_check_dump(mp, head)
+#endif
+
+/*
+ * check log record header for recovery
+ */
+STATIC int
+xlog_header_check_recover(
+	xfs_mount_t		*mp,
+	xlog_rec_header_t	*head)
+{
+	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
+
+	/*
+	 * IRIX doesn't write the h_fmt field and leaves it zeroed
+	 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
+	 * a dirty log created in IRIX.
+	 */
+	if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
+		xfs_warn(mp,
+	"dirty log written in incompatible format - can't recover");
+		xlog_header_check_dump(mp, head);
+		XFS_ERROR_REPORT("xlog_header_check_recover(1)",
+				 XFS_ERRLEVEL_HIGH, mp);
+		return -EFSCORRUPTED;
+	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
+		xfs_warn(mp,
+	"dirty log entry has mismatched uuid - can't recover");
+		xlog_header_check_dump(mp, head);
+		XFS_ERROR_REPORT("xlog_header_check_recover(2)",
+				 XFS_ERRLEVEL_HIGH, mp);
+		return -EFSCORRUPTED;
+	}
+	return 0;
+}
+
+/*
+ * read the head block of the log and check the header
+ */
+STATIC int
+xlog_header_check_mount(
+	xfs_mount_t		*mp,
+	xlog_rec_header_t	*head)
+{
+	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
+
+	if (uuid_is_nil(&head->h_fs_uuid)) {
+		/*
+		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
+		 * h_fs_uuid is nil, we assume this log was last mounted
+		 * by IRIX and continue.
+		 */
+		xfs_warn(mp, "nil uuid in log - IRIX style log");
+	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
+		xfs_warn(mp, "log has mismatched uuid - can't recover");
+		xlog_header_check_dump(mp, head);
+		XFS_ERROR_REPORT("xlog_header_check_mount",
+				 XFS_ERRLEVEL_HIGH, mp);
+		return -EFSCORRUPTED;
+	}
+	return 0;
+}
+
+STATIC void
+xlog_recover_iodone(
+	struct xfs_buf	*bp)
+{
+	if (bp->b_error) {
+		/*
+		 * We're not going to bother about retrying
+		 * this during recovery. One strike!
+		 */
+		if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			xfs_force_shutdown(bp->b_target->bt_mount,
+						SHUTDOWN_META_IO_ERROR);
+		}
+	}
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp);
+}
+
+/*
+ * This routine finds (to an approximation) the first block in the physical
+ * log which contains the given cycle.  It uses a binary search algorithm.
+ * Note that the algorithm can not be perfect because the disk will not
+ * necessarily be perfect.
+ */
+STATIC int
+xlog_find_cycle_start(
+	struct xlog	*log,
+	struct xfs_buf	*bp,
+	xfs_daddr_t	first_blk,
+	xfs_daddr_t	*last_blk,
+	uint		cycle)
+{
+	xfs_caddr_t	offset;
+	xfs_daddr_t	mid_blk;
+	xfs_daddr_t	end_blk;
+	uint		mid_cycle;
+	int		error;
+
+	end_blk = *last_blk;
+	mid_blk = BLK_AVG(first_blk, end_blk);
+	while (mid_blk != first_blk && mid_blk != end_blk) {
+		error = xlog_bread(log, mid_blk, 1, bp, &offset);
+		if (error)
+			return error;
+		mid_cycle = xlog_get_cycle(offset);
+		if (mid_cycle == cycle)
+			end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
+		else
+			first_blk = mid_blk; /* first_half_cycle == mid_cycle */
+		mid_blk = BLK_AVG(first_blk, end_blk);
+	}
+	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
+	       (mid_blk == end_blk && mid_blk-1 == first_blk));
+
+	*last_blk = end_blk;
+
+	return 0;
+}
+
+/*
+ * Check that a range of blocks does not contain stop_on_cycle_no.
+ * Fill in *new_blk with the block offset where such a block is
+ * found, or with -1 (an invalid block number) if there is no such
+ * block in the range.  The scan needs to occur from front to back
+ * and the pointer into the region must be updated since a later
+ * routine will need to perform another test.
+ */
+STATIC int
+xlog_find_verify_cycle(
+	struct xlog	*log,
+	xfs_daddr_t	start_blk,
+	int		nbblks,
+	uint		stop_on_cycle_no,
+	xfs_daddr_t	*new_blk)
+{
+	xfs_daddr_t	i, j;
+	uint		cycle;
+	xfs_buf_t	*bp;
+	xfs_daddr_t	bufblks;
+	xfs_caddr_t	buf = NULL;
+	int		error = 0;
+
+	/*
+	 * Greedily allocate a buffer big enough to handle the full
+	 * range of basic blocks we'll be examining.  If that fails,
+	 * try a smaller size.  We need to be able to read at least
+	 * a log sector, or we're out of luck.
+	 */
+	bufblks = 1 << ffs(nbblks);
+	while (bufblks > log->l_logBBsize)
+		bufblks >>= 1;
+	while (!(bp = xlog_get_bp(log, bufblks))) {
+		bufblks >>= 1;
+		if (bufblks < log->l_sectBBsize)
+			return -ENOMEM;
+	}
+
+	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
+		int	bcount;
+
+		bcount = min(bufblks, (start_blk + nbblks - i));
+
+		error = xlog_bread(log, i, bcount, bp, &buf);
+		if (error)
+			goto out;
+
+		for (j = 0; j < bcount; j++) {
+			cycle = xlog_get_cycle(buf);
+			if (cycle == stop_on_cycle_no) {
+				*new_blk = i+j;
+				goto out;
+			}
+
+			buf += BBSIZE;
+		}
+	}
+
+	*new_blk = -1;
+
+out:
+	xlog_put_bp(bp);
+	return error;
+}
+
+/*
+ * Potentially backup over partial log record write.
+ *
+ * In the typical case, last_blk is the number of the block directly after
+ * a good log record.  Therefore, we subtract one to get the block number
+ * of the last block in the given buffer.  extra_bblks contains the number
+ * of blocks we would have read on a previous read.  This happens when the
+ * last log record is split over the end of the physical log.
+ *
+ * extra_bblks is the number of blocks potentially verified on a previous
+ * call to this routine.
+ */
+STATIC int
+xlog_find_verify_log_record(
+	struct xlog		*log,
+	xfs_daddr_t		start_blk,
+	xfs_daddr_t		*last_blk,
+	int			extra_bblks)
+{
+	xfs_daddr_t		i;
+	xfs_buf_t		*bp;
+	xfs_caddr_t		offset = NULL;
+	xlog_rec_header_t	*head = NULL;
+	int			error = 0;
+	int			smallmem = 0;
+	int			num_blks = *last_blk - start_blk;
+	int			xhdrs;
+
+	ASSERT(start_blk != 0 || *last_blk != start_blk);
+
+	if (!(bp = xlog_get_bp(log, num_blks))) {
+		if (!(bp = xlog_get_bp(log, 1)))
+			return -ENOMEM;
+		smallmem = 1;
+	} else {
+		error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+		if (error)
+			goto out;
+		offset += ((num_blks - 1) << BBSHIFT);
+	}
+
+	for (i = (*last_blk) - 1; i >= 0; i--) {
+		if (i < start_blk) {
+			/* valid log record not found */
+			xfs_warn(log->l_mp,
+		"Log inconsistent (didn't find previous header)");
+			ASSERT(0);
+			error = -EIO;
+			goto out;
+		}
+
+		if (smallmem) {
+			error = xlog_bread(log, i, 1, bp, &offset);
+			if (error)
+				goto out;
+		}
+
+		head = (xlog_rec_header_t *)offset;
+
+		if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
+			break;
+
+		if (!smallmem)
+			offset -= BBSIZE;
+	}
+
+	/*
+	 * We hit the beginning of the physical log & still no header.  Return
+	 * to caller.  If caller can handle a return of -1, then this routine
+	 * will be called again for the end of the physical log.
+	 */
+	if (i == -1) {
+		error = 1;
+		goto out;
+	}
+
+	/*
+	 * We have the final block of the good log (the first block
+	 * of the log record _before_ the head. So we check the uuid.
+	 */
+	if ((error = xlog_header_check_mount(log->l_mp, head)))
+		goto out;
+
+	/*
+	 * We may have found a log record header before we expected one.
+	 * last_blk will be the 1st block # with a given cycle #.  We may end
+	 * up reading an entire log record.  In this case, we don't want to
+	 * reset last_blk.  Only when last_blk points in the middle of a log
+	 * record do we update last_blk.
+	 */
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		uint	h_size = be32_to_cpu(head->h_size);
+
+		xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
+		if (h_size % XLOG_HEADER_CYCLE_SIZE)
+			xhdrs++;
+	} else {
+		xhdrs = 1;
+	}
+
+	if (*last_blk - i + extra_bblks !=
+	    BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
+		*last_blk = i;
+
+out:
+	xlog_put_bp(bp);
+	return error;
+}
+
+/*
+ * Head is defined to be the point of the log where the next log write
+ * could go.  This means that incomplete LR writes at the end are
+ * eliminated when calculating the head.  We aren't guaranteed that previous
+ * LR have complete transactions.  We only know that a cycle number of
+ * current cycle number -1 won't be present in the log if we start writing
+ * from our current block number.
+ *
+ * last_blk contains the block number of the first block with a given
+ * cycle number.
+ *
+ * Return: zero if normal, non-zero if error.
+ */
+STATIC int
+xlog_find_head(
+	struct xlog	*log,
+	xfs_daddr_t	*return_head_blk)
+{
+	xfs_buf_t	*bp;
+	xfs_caddr_t	offset;
+	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
+	int		num_scan_bblks;
+	uint		first_half_cycle, last_half_cycle;
+	uint		stop_on_cycle;
+	int		error, log_bbnum = log->l_logBBsize;
+
+	/* Is the end of the log device zeroed? */
+	error = xlog_find_zeroed(log, &first_blk);
+	if (error < 0) {
+		xfs_warn(log->l_mp, "empty log check failed");
+		return error;
+	}
+	if (error == 1) {
+		*return_head_blk = first_blk;
+
+		/* Is the whole lot zeroed? */
+		if (!first_blk) {
+			/* Linux XFS shouldn't generate totally zeroed logs -
+			 * mkfs etc write a dummy unmount record to a fresh
+			 * log so we can store the uuid in there
+			 */
+			xfs_warn(log->l_mp, "totally zeroed log");
+		}
+
+		return 0;
+	}
+
+	first_blk = 0;			/* get cycle # of 1st block */
+	bp = xlog_get_bp(log, 1);
+	if (!bp)
+		return -ENOMEM;
+
+	error = xlog_bread(log, 0, 1, bp, &offset);
+	if (error)
+		goto bp_err;
+
+	first_half_cycle = xlog_get_cycle(offset);
+
+	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
+	error = xlog_bread(log, last_blk, 1, bp, &offset);
+	if (error)
+		goto bp_err;
+
+	last_half_cycle = xlog_get_cycle(offset);
+	ASSERT(last_half_cycle != 0);
+
+	/*
+	 * If the 1st half cycle number is equal to the last half cycle number,
+	 * then the entire log is stamped with the same cycle number.  In this
+	 * case, head_blk can't be set to zero (which makes sense).  The below
+	 * math doesn't work out properly with head_blk equal to zero.  Instead,
+	 * we set it to log_bbnum which is an invalid block number, but this
+	 * value makes the math correct.  If head_blk doesn't changed through
+	 * all the tests below, *head_blk is set to zero at the very end rather
+	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
+	 * in a circular file.
+	 */
+	if (first_half_cycle == last_half_cycle) {
+		/*
+		 * In this case we believe that the entire log should have
+		 * cycle number last_half_cycle.  We need to scan backwards
+		 * from the end verifying that there are no holes still
+		 * containing last_half_cycle - 1.  If we find such a hole,
+		 * then the start of that hole will be the new head.  The
+		 * simple case looks like
+		 *        x | x ... | x - 1 | x
+		 * Another case that fits this picture would be
+		 *        x | x + 1 | x ... | x
+		 * In this case the head really is somewhere at the end of the
+		 * log, as one of the latest writes at the beginning was
+		 * incomplete.
+		 * One more case is
+		 *        x | x + 1 | x ... | x - 1 | x
+		 * This is really the combination of the above two cases, and
+		 * the head has to end up at the start of the x-1 hole at the
+		 * end of the log.
+		 *
+		 * In the 256k log case, we will read from the beginning to the
+		 * end of the log and search for cycle numbers equal to x-1.
+		 * We don't worry about the x+1 blocks that we encounter,
+		 * because we know that they cannot be the head since the log
+		 * started with x.
+		 */
+		head_blk = log_bbnum;
+		stop_on_cycle = last_half_cycle - 1;
+	} else {
+		/*
+		 * In this case we want to find the first block with cycle
+		 * number matching last_half_cycle.  We expect the log to be
+		 * some variation on
+		 *        x + 1 ... | x ... | x
+		 * The first block with cycle number x (last_half_cycle) will
+		 * be where the new head belongs.  First we do a binary search
+		 * for the first occurrence of last_half_cycle.  The binary
+		 * search may not be totally accurate, so then we scan back
+		 * from there looking for occurrences of last_half_cycle before
+		 * us.  If that backwards scan wraps around the beginning of
+		 * the log, then we look for occurrences of last_half_cycle - 1
+		 * at the end of the log.  The cases we're looking for look
+		 * like
+		 *                               v binary search stopped here
+		 *        x + 1 ... | x | x + 1 | x ... | x
+		 *                   ^ but we want to locate this spot
+		 * or
+		 *        <---------> less than scan distance
+		 *        x + 1 ... | x ... | x - 1 | x
+		 *                           ^ we want to locate this spot
+		 */
+		stop_on_cycle = last_half_cycle;
+		if ((error = xlog_find_cycle_start(log, bp, first_blk,
+						&head_blk, last_half_cycle)))
+			goto bp_err;
+	}
+
+	/*
+	 * Now validate the answer.  Scan back some number of maximum possible
+	 * blocks and make sure each one has the expected cycle number.  The
+	 * maximum is determined by the total possible amount of buffering
+	 * in the in-core log.  The following number can be made tighter if
+	 * we actually look at the block size of the filesystem.
+	 */
+	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+	if (head_blk >= num_scan_bblks) {
+		/*
+		 * We are guaranteed that the entire check can be performed
+		 * in one buffer.
+		 */
+		start_blk = head_blk - num_scan_bblks;
+		if ((error = xlog_find_verify_cycle(log,
+						start_blk, num_scan_bblks,
+						stop_on_cycle, &new_blk)))
+			goto bp_err;
+		if (new_blk != -1)
+			head_blk = new_blk;
+	} else {		/* need to read 2 parts of log */
+		/*
+		 * We are going to scan backwards in the log in two parts.
+		 * First we scan the physical end of the log.  In this part
+		 * of the log, we are looking for blocks with cycle number
+		 * last_half_cycle - 1.
+		 * If we find one, then we know that the log starts there, as
+		 * we've found a hole that didn't get written in going around
+		 * the end of the physical log.  The simple case for this is
+		 *        x + 1 ... | x ... | x - 1 | x
+		 *        <---------> less than scan distance
+		 * If all of the blocks at the end of the log have cycle number
+		 * last_half_cycle, then we check the blocks at the start of
+		 * the log looking for occurrences of last_half_cycle.  If we
+		 * find one, then our current estimate for the location of the
+		 * first occurrence of last_half_cycle is wrong and we move
+		 * back to the hole we've found.  This case looks like
+		 *        x + 1 ... | x | x + 1 | x ...
+		 *                               ^ binary search stopped here
+		 * Another case we need to handle that only occurs in 256k
+		 * logs is
+		 *        x + 1 ... | x ... | x+1 | x ...
+		 *                   ^ binary search stops here
+		 * In a 256k log, the scan at the end of the log will see the
+		 * x + 1 blocks.  We need to skip past those since that is
+		 * certainly not the head of the log.  By searching for
+		 * last_half_cycle-1 we accomplish that.
+		 */
+		ASSERT(head_blk <= INT_MAX &&
+			(xfs_daddr_t) num_scan_bblks >= head_blk);
+		start_blk = log_bbnum - (num_scan_bblks - head_blk);
+		if ((error = xlog_find_verify_cycle(log, start_blk,
+					num_scan_bblks - (int)head_blk,
+					(stop_on_cycle - 1), &new_blk)))
+			goto bp_err;
+		if (new_blk != -1) {
+			head_blk = new_blk;
+			goto validate_head;
+		}
+
+		/*
+		 * Scan beginning of log now.  The last part of the physical
+		 * log is good.  This scan needs to verify that it doesn't find
+		 * the last_half_cycle.
+		 */
+		start_blk = 0;
+		ASSERT(head_blk <= INT_MAX);
+		if ((error = xlog_find_verify_cycle(log,
+					start_blk, (int)head_blk,
+					stop_on_cycle, &new_blk)))
+			goto bp_err;
+		if (new_blk != -1)
+			head_blk = new_blk;
+	}
+
+validate_head:
+	/*
+	 * Now we need to make sure head_blk is not pointing to a block in
+	 * the middle of a log record.
+	 */
+	num_scan_bblks = XLOG_REC_SHIFT(log);
+	if (head_blk >= num_scan_bblks) {
+		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
+
+		/* start ptr at last block ptr before head_blk */
+		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+		if (error == 1)
+			error = -EIO;
+		if (error)
+			goto bp_err;
+	} else {
+		start_blk = 0;
+		ASSERT(head_blk <= INT_MAX);
+		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+		if (error < 0)
+			goto bp_err;
+		if (error == 1) {
+			/* We hit the beginning of the log during our search */
+			start_blk = log_bbnum - (num_scan_bblks - head_blk);
+			new_blk = log_bbnum;
+			ASSERT(start_blk <= INT_MAX &&
+				(xfs_daddr_t) log_bbnum-start_blk >= 0);
+			ASSERT(head_blk <= INT_MAX);
+			error = xlog_find_verify_log_record(log, start_blk,
+							&new_blk, (int)head_blk);
+			if (error == 1)
+				error = -EIO;
+			if (error)
+				goto bp_err;
+			if (new_blk != log_bbnum)
+				head_blk = new_blk;
+		} else if (error)
+			goto bp_err;
+	}
+
+	xlog_put_bp(bp);
+	if (head_blk == log_bbnum)
+		*return_head_blk = 0;
+	else
+		*return_head_blk = head_blk;
+	/*
+	 * When returning here, we have a good block number.  Bad block
+	 * means that during a previous crash, we didn't have a clean break
+	 * from cycle number N to cycle number N-1.  In this case, we need
+	 * to find the first block with cycle number N-1.
+	 */
+	return 0;
+
+ bp_err:
+	xlog_put_bp(bp);
+
+	if (error)
+		xfs_warn(log->l_mp, "failed to find log head");
+	return error;
+}
+
+/*
+ * Find the sync block number or the tail of the log.
+ *
+ * This will be the block number of the last record to have its
+ * associated buffers synced to disk.  Every log record header has
+ * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
+ * to get a sync block number.  The only concern is to figure out which
+ * log record header to believe.
+ *
+ * The following algorithm uses the log record header with the largest
+ * lsn.  The entire log record does not need to be valid.  We only care
+ * that the header is valid.
+ *
+ * We could speed up search by using current head_blk buffer, but it is not
+ * available.
+ */
+STATIC int
+xlog_find_tail(
+	struct xlog		*log,
+	xfs_daddr_t		*head_blk,
+	xfs_daddr_t		*tail_blk)
+{
+	xlog_rec_header_t	*rhead;
+	xlog_op_header_t	*op_head;
+	xfs_caddr_t		offset = NULL;
+	xfs_buf_t		*bp;
+	int			error, i, found;
+	xfs_daddr_t		umount_data_blk;
+	xfs_daddr_t		after_umount_blk;
+	xfs_lsn_t		tail_lsn;
+	int			hblks;
+
+	found = 0;
+
+	/*
+	 * Find previous log record
+	 */
+	if ((error = xlog_find_head(log, head_blk)))
+		return error;
+
+	bp = xlog_get_bp(log, 1);
+	if (!bp)
+		return -ENOMEM;
+	if (*head_blk == 0) {				/* special case */
+		error = xlog_bread(log, 0, 1, bp, &offset);
+		if (error)
+			goto done;
+
+		if (xlog_get_cycle(offset) == 0) {
+			*tail_blk = 0;
+			/* leave all other log inited values alone */
+			goto done;
+		}
+	}
+
+	/*
+	 * Search backwards looking for log record header block
+	 */
+	ASSERT(*head_blk < INT_MAX);
+	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
+		error = xlog_bread(log, i, 1, bp, &offset);
+		if (error)
+			goto done;
+
+		if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+			found = 1;
+			break;
+		}
+	}
+	/*
+	 * If we haven't found the log record header block, start looking
+	 * again from the end of the physical log.  XXXmiken: There should be
+	 * a check here to make sure we didn't search more than N blocks in
+	 * the previous code.
+	 */
+	if (!found) {
+		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
+			error = xlog_bread(log, i, 1, bp, &offset);
+			if (error)
+				goto done;
+
+			if (*(__be32 *)offset ==
+			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+				found = 2;
+				break;
+			}
+		}
+	}
+	if (!found) {
+		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+		xlog_put_bp(bp);
+		ASSERT(0);
+		return -EIO;
+	}
+
+	/* find blk_no of tail of log */
+	rhead = (xlog_rec_header_t *)offset;
+	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+
+	/*
+	 * Reset log values according to the state of the log when we
+	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
+	 * one because the next write starts a new cycle rather than
+	 * continuing the cycle of the last good log record.  At this
+	 * point we have guaranteed that all partial log records have been
+	 * accounted for.  Therefore, we know that the last good log record
+	 * written was complete and ended exactly on the end boundary
+	 * of the physical log.
+	 */
+	log->l_prev_block = i;
+	log->l_curr_block = (int)*head_blk;
+	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+	if (found == 2)
+		log->l_curr_cycle++;
+	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+					BBTOB(log->l_curr_block));
+	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+					BBTOB(log->l_curr_block));
+
+	/*
+	 * Look for unmount record.  If we find it, then we know there
+	 * was a clean unmount.  Since 'i' could be the last block in
+	 * the physical log, we convert to a log block before comparing
+	 * to the head_blk.
+	 *
+	 * Save the current tail lsn to use to pass to
+	 * xlog_clear_stale_blocks() below.  We won't want to clear the
+	 * unmount record if there is one, so we pass the lsn of the
+	 * unmount record rather than the block after it.
+	 */
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		int	h_size = be32_to_cpu(rhead->h_size);
+		int	h_version = be32_to_cpu(rhead->h_version);
+
+		if ((h_version & XLOG_VERSION_2) &&
+		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+			if (h_size % XLOG_HEADER_CYCLE_SIZE)
+				hblks++;
+		} else {
+			hblks = 1;
+		}
+	} else {
+		hblks = 1;
+	}
+	after_umount_blk = (i + hblks + (int)
+		BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
+	tail_lsn = atomic64_read(&log->l_tail_lsn);
+	if (*head_blk == after_umount_blk &&
+	    be32_to_cpu(rhead->h_num_logops) == 1) {
+		umount_data_blk = (i + hblks) % log->l_logBBsize;
+		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+		if (error)
+			goto done;
+
+		op_head = (xlog_op_header_t *)offset;
+		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+			/*
+			 * Set tail and last sync so that newly written
+			 * log records will point recovery to after the
+			 * current unmount record.
+			 */
+			xlog_assign_atomic_lsn(&log->l_tail_lsn,
+					log->l_curr_cycle, after_umount_blk);
+			xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+					log->l_curr_cycle, after_umount_blk);
+			*tail_blk = after_umount_blk;
+
+			/*
+			 * Note that the unmount was clean. If the unmount
+			 * was not clean, we need to know this to rebuild the
+			 * superblock counters from the perag headers if we
+			 * have a filesystem using non-persistent counters.
+			 */
+			log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+		}
+	}
+
+	/*
+	 * Make sure that there are no blocks in front of the head
+	 * with the same cycle number as the head.  This can happen
+	 * because we allow multiple outstanding log writes concurrently,
+	 * and the later writes might make it out before earlier ones.
+	 *
+	 * We use the lsn from before modifying it so that we'll never
+	 * overwrite the unmount record after a clean unmount.
+	 *
+	 * Do this only if we are going to recover the filesystem
+	 *
+	 * NOTE: This used to say "if (!readonly)"
+	 * However on Linux, we can & do recover a read-only filesystem.
+	 * We only skip recovery if NORECOVERY is specified on mount,
+	 * in which case we would not be here.
+	 *
+	 * But... if the -device- itself is readonly, just skip this.
+	 * We can't recover this device anyway, so it won't matter.
+	 */
+	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
+		error = xlog_clear_stale_blocks(log, tail_lsn);
+
+done:
+	xlog_put_bp(bp);
+
+	if (error)
+		xfs_warn(log->l_mp, "failed to locate log tail");
+	return error;
+}
+
+/*
+ * Is the log zeroed at all?
+ *
+ * The last binary search should be changed to perform an X block read
+ * once X becomes small enough.  You can then search linearly through
+ * the X blocks.  This will cut down on the number of reads we need to do.
+ *
+ * If the log is partially zeroed, this routine will pass back the blkno
+ * of the first block with cycle number 0.  It won't have a complete LR
+ * preceding it.
+ *
+ * Return:
+ *	0  => the log is completely written to
+ *	1 => use *blk_no as the first block of the log
+ *	<0 => error has occurred
+ */
+STATIC int
+xlog_find_zeroed(
+	struct xlog	*log,
+	xfs_daddr_t	*blk_no)
+{
+	xfs_buf_t	*bp;
+	xfs_caddr_t	offset;
+	uint	        first_cycle, last_cycle;
+	xfs_daddr_t	new_blk, last_blk, start_blk;
+	xfs_daddr_t     num_scan_bblks;
+	int	        error, log_bbnum = log->l_logBBsize;
+
+	*blk_no = 0;
+
+	/* check totally zeroed log */
+	bp = xlog_get_bp(log, 1);
+	if (!bp)
+		return -ENOMEM;
+	error = xlog_bread(log, 0, 1, bp, &offset);
+	if (error)
+		goto bp_err;
+
+	first_cycle = xlog_get_cycle(offset);
+	if (first_cycle == 0) {		/* completely zeroed log */
+		*blk_no = 0;
+		xlog_put_bp(bp);
+		return 1;
+	}
+
+	/* check partially zeroed log */
+	error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+	if (error)
+		goto bp_err;
+
+	last_cycle = xlog_get_cycle(offset);
+	if (last_cycle != 0) {		/* log completely written to */
+		xlog_put_bp(bp);
+		return 0;
+	} else if (first_cycle != 1) {
+		/*
+		 * If the cycle of the last block is zero, the cycle of
+		 * the first block must be 1. If it's not, maybe we're
+		 * not looking at a log... Bail out.
+		 */
+		xfs_warn(log->l_mp,
+			"Log inconsistent or not a log (last==0, first!=1)");
+		error = -EINVAL;
+		goto bp_err;
+	}
+
+	/* we have a partially zeroed log */
+	last_blk = log_bbnum-1;
+	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
+		goto bp_err;
+
+	/*
+	 * Validate the answer.  Because there is no way to guarantee that
+	 * the entire log is made up of log records which are the same size,
+	 * we scan over the defined maximum blocks.  At this point, the maximum
+	 * is not chosen to mean anything special.   XXXmiken
+	 */
+	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+	ASSERT(num_scan_bblks <= INT_MAX);
+
+	if (last_blk < num_scan_bblks)
+		num_scan_bblks = last_blk;
+	start_blk = last_blk - num_scan_bblks;
+
+	/*
+	 * We search for any instances of cycle number 0 that occur before
+	 * our current estimate of the head.  What we're trying to detect is
+	 *        1 ... | 0 | 1 | 0...
+	 *                       ^ binary search ends here
+	 */
+	if ((error = xlog_find_verify_cycle(log, start_blk,
+					 (int)num_scan_bblks, 0, &new_blk)))
+		goto bp_err;
+	if (new_blk != -1)
+		last_blk = new_blk;
+
+	/*
+	 * Potentially backup over partial log record write.  We don't need
+	 * to search the end of the log because we know it is zero.
+	 */
+	error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
+	if (error == 1)
+		error = -EIO;
+	if (error)
+		goto bp_err;
+
+	*blk_no = last_blk;
+bp_err:
+	xlog_put_bp(bp);
+	if (error)
+		return error;
+	return 1;
+}
+
+/*
+ * These are simple subroutines used by xlog_clear_stale_blocks() below
+ * to initialize a buffer full of empty log record headers and write
+ * them into the log.
+ */
+STATIC void
+xlog_add_record(
+	struct xlog		*log,
+	xfs_caddr_t		buf,
+	int			cycle,
+	int			block,
+	int			tail_cycle,
+	int			tail_block)
+{
+	xlog_rec_header_t	*recp = (xlog_rec_header_t *)buf;
+
+	memset(buf, 0, BBSIZE);
+	recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
+	recp->h_cycle = cpu_to_be32(cycle);
+	recp->h_version = cpu_to_be32(
+			xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
+	recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
+	recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
+	recp->h_fmt = cpu_to_be32(XLOG_FMT);
+	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
+}
+
+STATIC int
+xlog_write_log_records(
+	struct xlog	*log,
+	int		cycle,
+	int		start_block,
+	int		blocks,
+	int		tail_cycle,
+	int		tail_block)
+{
+	xfs_caddr_t	offset;
+	xfs_buf_t	*bp;
+	int		balign, ealign;
+	int		sectbb = log->l_sectBBsize;
+	int		end_block = start_block + blocks;
+	int		bufblks;
+	int		error = 0;
+	int		i, j = 0;
+
+	/*
+	 * Greedily allocate a buffer big enough to handle the full
+	 * range of basic blocks to be written.  If that fails, try
+	 * a smaller size.  We need to be able to write at least a
+	 * log sector, or we're out of luck.
+	 */
+	bufblks = 1 << ffs(blocks);
+	while (bufblks > log->l_logBBsize)
+		bufblks >>= 1;
+	while (!(bp = xlog_get_bp(log, bufblks))) {
+		bufblks >>= 1;
+		if (bufblks < sectbb)
+			return -ENOMEM;
+	}
+
+	/* We may need to do a read at the start to fill in part of
+	 * the buffer in the starting sector not covered by the first
+	 * write below.
+	 */
+	balign = round_down(start_block, sectbb);
+	if (balign != start_block) {
+		error = xlog_bread_noalign(log, start_block, 1, bp);
+		if (error)
+			goto out_put_bp;
+
+		j = start_block - balign;
+	}
+
+	for (i = start_block; i < end_block; i += bufblks) {
+		int		bcount, endcount;
+
+		bcount = min(bufblks, end_block - start_block);
+		endcount = bcount - j;
+
+		/* We may need to do a read at the end to fill in part of
+		 * the buffer in the final sector not covered by the write.
+		 * If this is the same sector as the above read, skip it.
+		 */
+		ealign = round_down(end_block, sectbb);
+		if (j == 0 && (start_block + endcount > ealign)) {
+			offset = bp->b_addr + BBTOB(ealign - start_block);
+			error = xlog_bread_offset(log, ealign, sectbb,
+							bp, offset);
+			if (error)
+				break;
+
+		}
+
+		offset = xlog_align(log, start_block, endcount, bp);
+		for (; j < endcount; j++) {
+			xlog_add_record(log, offset, cycle, i+j,
+					tail_cycle, tail_block);
+			offset += BBSIZE;
+		}
+		error = xlog_bwrite(log, start_block, endcount, bp);
+		if (error)
+			break;
+		start_block += endcount;
+		j = 0;
+	}
+
+ out_put_bp:
+	xlog_put_bp(bp);
+	return error;
+}
+
+/*
+ * This routine is called to blow away any incomplete log writes out
+ * in front of the log head.  We do this so that we won't become confused
+ * if we come up, write only a little bit more, and then crash again.
+ * If we leave the partial log records out there, this situation could
+ * cause us to think those partial writes are valid blocks since they
+ * have the current cycle number.  We get rid of them by overwriting them
+ * with empty log records with the old cycle number rather than the
+ * current one.
+ *
+ * The tail lsn is passed in rather than taken from
+ * the log so that we will not write over the unmount record after a
+ * clean unmount in a 512 block log.  Doing so would leave the log without
+ * any valid log records in it until a new one was written.  If we crashed
+ * during that time we would not be able to recover.
+ */
+STATIC int
+xlog_clear_stale_blocks(
+	struct xlog	*log,
+	xfs_lsn_t	tail_lsn)
+{
+	int		tail_cycle, head_cycle;
+	int		tail_block, head_block;
+	int		tail_distance, max_distance;
+	int		distance;
+	int		error;
+
+	tail_cycle = CYCLE_LSN(tail_lsn);
+	tail_block = BLOCK_LSN(tail_lsn);
+	head_cycle = log->l_curr_cycle;
+	head_block = log->l_curr_block;
+
+	/*
+	 * Figure out the distance between the new head of the log
+	 * and the tail.  We want to write over any blocks beyond the
+	 * head that we may have written just before the crash, but
+	 * we don't want to overwrite the tail of the log.
+	 */
+	if (head_cycle == tail_cycle) {
+		/*
+		 * The tail is behind the head in the physical log,
+		 * so the distance from the head to the tail is the
+		 * distance from the head to the end of the log plus
+		 * the distance from the beginning of the log to the
+		 * tail.
+		 */
+		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
+			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
+					 XFS_ERRLEVEL_LOW, log->l_mp);
+			return -EFSCORRUPTED;
+		}
+		tail_distance = tail_block + (log->l_logBBsize - head_block);
+	} else {
+		/*
+		 * The head is behind the tail in the physical log,
+		 * so the distance from the head to the tail is just
+		 * the tail block minus the head block.
+		 */
+		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
+			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
+					 XFS_ERRLEVEL_LOW, log->l_mp);
+			return -EFSCORRUPTED;
+		}
+		tail_distance = tail_block - head_block;
+	}
+
+	/*
+	 * If the head is right up against the tail, we can't clear
+	 * anything.
+	 */
+	if (tail_distance <= 0) {
+		ASSERT(tail_distance == 0);
+		return 0;
+	}
+
+	max_distance = XLOG_TOTAL_REC_SHIFT(log);
+	/*
+	 * Take the smaller of the maximum amount of outstanding I/O
+	 * we could have and the distance to the tail to clear out.
+	 * We take the smaller so that we don't overwrite the tail and
+	 * we don't waste all day writing from the head to the tail
+	 * for no reason.
+	 */
+	max_distance = MIN(max_distance, tail_distance);
+
+	if ((head_block + max_distance) <= log->l_logBBsize) {
+		/*
+		 * We can stomp all the blocks we need to without
+		 * wrapping around the end of the log.  Just do it
+		 * in a single write.  Use the cycle number of the
+		 * current cycle minus one so that the log will look like:
+		 *     n ... | n - 1 ...
+		 */
+		error = xlog_write_log_records(log, (head_cycle - 1),
+				head_block, max_distance, tail_cycle,
+				tail_block);
+		if (error)
+			return error;
+	} else {
+		/*
+		 * We need to wrap around the end of the physical log in
+		 * order to clear all the blocks.  Do it in two separate
+		 * I/Os.  The first write should be from the head to the
+		 * end of the physical log, and it should use the current
+		 * cycle number minus one just like above.
+		 */
+		distance = log->l_logBBsize - head_block;
+		error = xlog_write_log_records(log, (head_cycle - 1),
+				head_block, distance, tail_cycle,
+				tail_block);
+
+		if (error)
+			return error;
+
+		/*
+		 * Now write the blocks at the start of the physical log.
+		 * This writes the remainder of the blocks we want to clear.
+		 * It uses the current cycle number since we're now on the
+		 * same cycle as the head so that we get:
+		 *    n ... n ... | n - 1 ...
+		 *    ^^^^^ blocks we're writing
+		 */
+		distance = max_distance - (log->l_logBBsize - head_block);
+		error = xlog_write_log_records(log, head_cycle, 0, distance,
+				tail_cycle, tail_block);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/******************************************************************************
+ *
+ *		Log recover routines
+ *
+ ******************************************************************************
+ */
+
+/*
+ * Sort the log items in the transaction.
+ *
+ * The ordering constraints are defined by the inode allocation and unlink
+ * behaviour. The rules are:
+ *
+ *	1. Every item is only logged once in a given transaction. Hence it
+ *	   represents the last logged state of the item. Hence ordering is
+ *	   dependent on the order in which operations need to be performed so
+ *	   required initial conditions are always met.
+ *
+ *	2. Cancelled buffers are recorded in pass 1 in a separate table and
+ *	   there's nothing to replay from them so we can simply cull them
+ *	   from the transaction. However, we can't do that until after we've
+ *	   replayed all the other items because they may be dependent on the
+ *	   cancelled buffer and replaying the cancelled buffer can remove it
+ *	   form the cancelled buffer table. Hence they have tobe done last.
+ *
+ *	3. Inode allocation buffers must be replayed before inode items that
+ *	   read the buffer and replay changes into it. For filesystems using the
+ *	   ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ *	   treated the same as inode allocation buffers as they create and
+ *	   initialise the buffers directly.
+ *
+ *	4. Inode unlink buffers must be replayed after inode items are replayed.
+ *	   This ensures that inodes are completely flushed to the inode buffer
+ *	   in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ *	- buffer_list for all buffers except cancelled/inode unlink buffers
+ *	- item_list for all non-buffer items
+ *	- inode_buffer_list for inode unlink buffers
+ *	- cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
+ */
+STATIC int
+xlog_recover_reorder_trans(
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	int			pass)
+{
+	xlog_recover_item_t	*item, *n;
+	int			error = 0;
+	LIST_HEAD(sort_list);
+	LIST_HEAD(cancel_list);
+	LIST_HEAD(buffer_list);
+	LIST_HEAD(inode_buffer_list);
+	LIST_HEAD(inode_list);
+
+	list_splice_init(&trans->r_itemq, &sort_list);
+	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
+		xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
+
+		switch (ITEM_TYPE(item)) {
+		case XFS_LI_ICREATE:
+			list_move_tail(&item->ri_list, &buffer_list);
+			break;
+		case XFS_LI_BUF:
+			if (buf_f->blf_flags & XFS_BLF_CANCEL) {
+				trace_xfs_log_recover_item_reorder_head(log,
+							trans, item, pass);
+				list_move(&item->ri_list, &cancel_list);
+				break;
+			}
+			if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+				list_move(&item->ri_list, &inode_buffer_list);
+				break;
+			}
+			list_move_tail(&item->ri_list, &buffer_list);
+			break;
+		case XFS_LI_INODE:
+		case XFS_LI_DQUOT:
+		case XFS_LI_QUOTAOFF:
+		case XFS_LI_EFD:
+		case XFS_LI_EFI:
+			trace_xfs_log_recover_item_reorder_tail(log,
+							trans, item, pass);
+			list_move_tail(&item->ri_list, &inode_list);
+			break;
+		default:
+			xfs_warn(log->l_mp,
+				"%s: unrecognized type of log operation",
+				__func__);
+			ASSERT(0);
+			/*
+			 * return the remaining items back to the transaction
+			 * item list so they can be freed in caller.
+			 */
+			if (!list_empty(&sort_list))
+				list_splice_init(&sort_list, &trans->r_itemq);
+			error = -EIO;
+			goto out;
+		}
+	}
+out:
+	ASSERT(list_empty(&sort_list));
+	if (!list_empty(&buffer_list))
+		list_splice(&buffer_list, &trans->r_itemq);
+	if (!list_empty(&inode_list))
+		list_splice_tail(&inode_list, &trans->r_itemq);
+	if (!list_empty(&inode_buffer_list))
+		list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+	if (!list_empty(&cancel_list))
+		list_splice_tail(&cancel_list, &trans->r_itemq);
+	return error;
+}
+
+/*
+ * Build up the table of buf cancel records so that we don't replay
+ * cancelled data in the second pass.  For buffer records that are
+ * not cancel records, there is nothing to do here so we just return.
+ *
+ * If we get a cancel record which is already in the table, this indicates
+ * that the buffer was cancelled multiple times.  In order to ensure
+ * that during pass 2 we keep the record in the table until we reach its
+ * last occurrence in the log, we keep a reference count in the cancel
+ * record in the table to tell us how many times we expect to see this
+ * record during the second pass.
+ */
+STATIC int
+xlog_recover_buffer_pass1(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
+	struct list_head	*bucket;
+	struct xfs_buf_cancel	*bcp;
+
+	/*
+	 * If this isn't a cancel buffer item, then just return.
+	 */
+	if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+		trace_xfs_log_recover_buf_not_cancel(log, buf_f);
+		return 0;
+	}
+
+	/*
+	 * Insert an xfs_buf_cancel record into the hash table of them.
+	 * If there is already an identical record, bump its reference count.
+	 */
+	bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
+	list_for_each_entry(bcp, bucket, bc_list) {
+		if (bcp->bc_blkno == buf_f->blf_blkno &&
+		    bcp->bc_len == buf_f->blf_len) {
+			bcp->bc_refcount++;
+			trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
+			return 0;
+		}
+	}
+
+	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+	bcp->bc_blkno = buf_f->blf_blkno;
+	bcp->bc_len = buf_f->blf_len;
+	bcp->bc_refcount = 1;
+	list_add_tail(&bcp->bc_list, bucket);
+
+	trace_xfs_log_recover_buf_cancel_add(log, buf_f);
+	return 0;
+}
+
+/*
+ * Check to see whether the buffer being recovered has a corresponding
+ * entry in the buffer cancel record table. If it is, return the cancel
+ * buffer structure to the caller.
+ */
+STATIC struct xfs_buf_cancel *
+xlog_peek_buffer_cancelled(
+	struct xlog		*log,
+	xfs_daddr_t		blkno,
+	uint			len,
+	ushort			flags)
+{
+	struct list_head	*bucket;
+	struct xfs_buf_cancel	*bcp;
+
+	if (!log->l_buf_cancel_table) {
+		/* empty table means no cancelled buffers in the log */
+		ASSERT(!(flags & XFS_BLF_CANCEL));
+		return NULL;
+	}
+
+	bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
+	list_for_each_entry(bcp, bucket, bc_list) {
+		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
+			return bcp;
+	}
+
+	/*
+	 * We didn't find a corresponding entry in the table, so return 0 so
+	 * that the buffer is NOT cancelled.
+	 */
+	ASSERT(!(flags & XFS_BLF_CANCEL));
+	return NULL;
+}
+
+/*
+ * If the buffer is being cancelled then return 1 so that it will be cancelled,
+ * otherwise return 0.  If the buffer is actually a buffer cancel item
+ * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
+ * table and remove it from the table if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its last
+ * occurrence in the log so that if the same buffer is re-used again after its
+ * last cancellation we actually replay the changes made at that point.
+ */
+STATIC int
+xlog_check_buffer_cancelled(
+	struct xlog		*log,
+	xfs_daddr_t		blkno,
+	uint			len,
+	ushort			flags)
+{
+	struct xfs_buf_cancel	*bcp;
+
+	bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
+	if (!bcp)
+		return 0;
+
+	/*
+	 * We've go a match, so return 1 so that the recovery of this buffer
+	 * is cancelled.  If this buffer is actually a buffer cancel log
+	 * item, then decrement the refcount on the one in the table and
+	 * remove it if this is the last reference.
+	 */
+	if (flags & XFS_BLF_CANCEL) {
+		if (--bcp->bc_refcount == 0) {
+			list_del(&bcp->bc_list);
+			kmem_free(bcp);
+		}
+	}
+	return 1;
+}
+
+/*
+ * Perform recovery for a buffer full of inodes.  In these buffers, the only
+ * data which should be recovered is that which corresponds to the
+ * di_next_unlinked pointers in the on disk inode structures.  The rest of the
+ * data for the inodes is always logged through the inodes themselves rather
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
+ *
+ * The only time when buffers full of inodes are fully recovered is when the
+ * buffer is full of newly allocated inodes.  In this case the buffer will
+ * not be marked as an inode buffer and so will be sent to
+ * xlog_recover_do_reg_buffer() below during recovery.
+ */
+STATIC int
+xlog_recover_do_inode_buffer(
+	struct xfs_mount	*mp,
+	xlog_recover_item_t	*item,
+	struct xfs_buf		*bp,
+	xfs_buf_log_format_t	*buf_f)
+{
+	int			i;
+	int			item_index = 0;
+	int			bit = 0;
+	int			nbits = 0;
+	int			reg_buf_offset = 0;
+	int			reg_buf_bytes = 0;
+	int			next_unlinked_offset;
+	int			inodes_per_buf;
+	xfs_agino_t		*logged_nextp;
+	xfs_agino_t		*buffer_nextp;
+
+	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+
+	/*
+	 * Post recovery validation only works properly on CRC enabled
+	 * filesystems.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		bp->b_ops = &xfs_inode_buf_ops;
+
+	inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
+	for (i = 0; i < inodes_per_buf; i++) {
+		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
+			offsetof(xfs_dinode_t, di_next_unlinked);
+
+		while (next_unlinked_offset >=
+		       (reg_buf_offset + reg_buf_bytes)) {
+			/*
+			 * The next di_next_unlinked field is beyond
+			 * the current logged region.  Find the next
+			 * logged region that contains or is beyond
+			 * the current di_next_unlinked field.
+			 */
+			bit += nbits;
+			bit = xfs_next_bit(buf_f->blf_data_map,
+					   buf_f->blf_map_size, bit);
+
+			/*
+			 * If there are no more logged regions in the
+			 * buffer, then we're done.
+			 */
+			if (bit == -1)
+				return 0;
+
+			nbits = xfs_contig_bits(buf_f->blf_data_map,
+						buf_f->blf_map_size, bit);
+			ASSERT(nbits > 0);
+			reg_buf_offset = bit << XFS_BLF_SHIFT;
+			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
+			item_index++;
+		}
+
+		/*
+		 * If the current logged region starts after the current
+		 * di_next_unlinked field, then move on to the next
+		 * di_next_unlinked field.
+		 */
+		if (next_unlinked_offset < reg_buf_offset)
+			continue;
+
+		ASSERT(item->ri_buf[item_index].i_addr != NULL);
+		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
+		ASSERT((reg_buf_offset + reg_buf_bytes) <=
+							BBTOB(bp->b_io_length));
+
+		/*
+		 * The current logged region contains a copy of the
+		 * current di_next_unlinked field.  Extract its value
+		 * and copy it to the buffer copy.
+		 */
+		logged_nextp = item->ri_buf[item_index].i_addr +
+				next_unlinked_offset - reg_buf_offset;
+		if (unlikely(*logged_nextp == 0)) {
+			xfs_alert(mp,
+		"Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
+		"Trying to replay bad (0) inode di_next_unlinked field.",
+				item, bp);
+			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
+					 XFS_ERRLEVEL_LOW, mp);
+			return -EFSCORRUPTED;
+		}
+
+		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
+					      next_unlinked_offset);
+		*buffer_nextp = *logged_nextp;
+
+		/*
+		 * If necessary, recalculate the CRC in the on-disk inode. We
+		 * have to leave the inode in a consistent state for whoever
+		 * reads it next....
+		 */
+		xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
+	}
+
+	return 0;
+}
+
+/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
+ * extract the LSN of the existing object in the buffer based on it's current
+ * magic number.  If we don't recognise the magic number in the buffer, then
+ * return a LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's LSN and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	__uint32_t		magic32;
+	__uint16_t		magic16;
+	__uint16_t		magicda;
+	void			*blk = bp->b_addr;
+	uuid_t			*uuid;
+	xfs_lsn_t		lsn = -1;
+
+	/* v4 filesystems always recover immediately */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		goto recover_immediately;
+
+	magic32 = be32_to_cpu(*(__be32 *)blk);
+	switch (magic32) {
+	case XFS_ABTB_CRC_MAGIC:
+	case XFS_ABTC_CRC_MAGIC:
+	case XFS_ABTB_MAGIC:
+	case XFS_ABTC_MAGIC:
+	case XFS_IBT_CRC_MAGIC:
+	case XFS_IBT_MAGIC: {
+		struct xfs_btree_block *btb = blk;
+
+		lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+		uuid = &btb->bb_u.s.bb_uuid;
+		break;
+	}
+	case XFS_BMAP_CRC_MAGIC:
+	case XFS_BMAP_MAGIC: {
+		struct xfs_btree_block *btb = blk;
+
+		lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+		uuid = &btb->bb_u.l.bb_uuid;
+		break;
+	}
+	case XFS_AGF_MAGIC:
+		lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+		uuid = &((struct xfs_agf *)blk)->agf_uuid;
+		break;
+	case XFS_AGFL_MAGIC:
+		lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+		uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+		break;
+	case XFS_AGI_MAGIC:
+		lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+		uuid = &((struct xfs_agi *)blk)->agi_uuid;
+		break;
+	case XFS_SYMLINK_MAGIC:
+		lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+		uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+		break;
+	case XFS_DIR3_BLOCK_MAGIC:
+	case XFS_DIR3_DATA_MAGIC:
+	case XFS_DIR3_FREE_MAGIC:
+		lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+		uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+		break;
+	case XFS_ATTR3_RMT_MAGIC:
+		lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+		uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
+		break;
+	case XFS_SB_MAGIC:
+		lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+		uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+		break;
+	default:
+		break;
+	}
+
+	if (lsn != (xfs_lsn_t)-1) {
+		if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+			goto recover_immediately;
+		return lsn;
+	}
+
+	magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+	switch (magicda) {
+	case XFS_DIR3_LEAF1_MAGIC:
+	case XFS_DIR3_LEAFN_MAGIC:
+	case XFS_DA3_NODE_MAGIC:
+		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+		break;
+	default:
+		break;
+	}
+
+	if (lsn != (xfs_lsn_t)-1) {
+		if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+			goto recover_immediately;
+		return lsn;
+	}
+
+	/*
+	 * We do individual object checks on dquot and inode buffers as they
+	 * have their own individual LSN records. Also, we could have a stale
+	 * buffer here, so we have to at least recognise these buffer types.
+	 *
+	 * A notd complexity here is inode unlinked list processing - it logs
+	 * the inode directly in the buffer, but we don't know which inodes have
+	 * been modified, and there is no global buffer LSN. Hence we need to
+	 * recover all inode buffer types immediately. This problem will be
+	 * fixed by logical logging of the unlinked list modifications.
+	 */
+	magic16 = be16_to_cpu(*(__be16 *)blk);
+	switch (magic16) {
+	case XFS_DQUOT_MAGIC:
+	case XFS_DINODE_MAGIC:
+		goto recover_immediately;
+	default:
+		break;
+	}
+
+	/* unknown buffer contents, recover immediately */
+
+recover_immediately:
+	return (xfs_lsn_t)-1;
+
+}
+
+/*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to them for writeback. Magic numbers are in a
+ * few places:
+ *	the first 16 bits of the buffer (inode buffer, dquot buffer),
+ *	the first 32 bits of the buffer (most blocks),
+ *	inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recover_validate_buf_type(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	xfs_buf_log_format_t	*buf_f)
+{
+	struct xfs_da_blkinfo	*info = bp->b_addr;
+	__uint32_t		magic32;
+	__uint16_t		magic16;
+	__uint16_t		magicda;
+
+	/*
+	 * We can only do post recovery validation on items on CRC enabled
+	 * fielsystems as we need to know when the buffer was written to be able
+	 * to determine if we should have replayed the item. If we replay old
+	 * metadata over a newer buffer, then it will enter a temporarily
+	 * inconsistent state resulting in verification failures. Hence for now
+	 * just avoid the verification stage for non-crc filesystems
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+	magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+	magicda = be16_to_cpu(info->magic);
+	switch (xfs_blft_from_flags(buf_f)) {
+	case XFS_BLFT_BTREE_BUF:
+		switch (magic32) {
+		case XFS_ABTB_CRC_MAGIC:
+		case XFS_ABTC_CRC_MAGIC:
+		case XFS_ABTB_MAGIC:
+		case XFS_ABTC_MAGIC:
+			bp->b_ops = &xfs_allocbt_buf_ops;
+			break;
+		case XFS_IBT_CRC_MAGIC:
+		case XFS_FIBT_CRC_MAGIC:
+		case XFS_IBT_MAGIC:
+		case XFS_FIBT_MAGIC:
+			bp->b_ops = &xfs_inobt_buf_ops;
+			break;
+		case XFS_BMAP_CRC_MAGIC:
+		case XFS_BMAP_MAGIC:
+			bp->b_ops = &xfs_bmbt_buf_ops;
+			break;
+		default:
+			xfs_warn(mp, "Bad btree block magic!");
+			ASSERT(0);
+			break;
+		}
+		break;
+	case XFS_BLFT_AGF_BUF:
+		if (magic32 != XFS_AGF_MAGIC) {
+			xfs_warn(mp, "Bad AGF block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_agf_buf_ops;
+		break;
+	case XFS_BLFT_AGFL_BUF:
+		if (magic32 != XFS_AGFL_MAGIC) {
+			xfs_warn(mp, "Bad AGFL block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_agfl_buf_ops;
+		break;
+	case XFS_BLFT_AGI_BUF:
+		if (magic32 != XFS_AGI_MAGIC) {
+			xfs_warn(mp, "Bad AGI block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_agi_buf_ops;
+		break;
+	case XFS_BLFT_UDQUOT_BUF:
+	case XFS_BLFT_PDQUOT_BUF:
+	case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+		if (magic16 != XFS_DQUOT_MAGIC) {
+			xfs_warn(mp, "Bad DQUOT block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dquot_buf_ops;
+#else
+		xfs_alert(mp,
+	"Trying to recover dquots without QUOTA support built in!");
+		ASSERT(0);
+#endif
+		break;
+	case XFS_BLFT_DINO_BUF:
+		if (magic16 != XFS_DINODE_MAGIC) {
+			xfs_warn(mp, "Bad INODE block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_inode_buf_ops;
+		break;
+	case XFS_BLFT_SYMLINK_BUF:
+		if (magic32 != XFS_SYMLINK_MAGIC) {
+			xfs_warn(mp, "Bad symlink block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_symlink_buf_ops;
+		break;
+	case XFS_BLFT_DIR_BLOCK_BUF:
+		if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+		    magic32 != XFS_DIR3_BLOCK_MAGIC) {
+			xfs_warn(mp, "Bad dir block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_block_buf_ops;
+		break;
+	case XFS_BLFT_DIR_DATA_BUF:
+		if (magic32 != XFS_DIR2_DATA_MAGIC &&
+		    magic32 != XFS_DIR3_DATA_MAGIC) {
+			xfs_warn(mp, "Bad dir data magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_data_buf_ops;
+		break;
+	case XFS_BLFT_DIR_FREE_BUF:
+		if (magic32 != XFS_DIR2_FREE_MAGIC &&
+		    magic32 != XFS_DIR3_FREE_MAGIC) {
+			xfs_warn(mp, "Bad dir3 free magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_free_buf_ops;
+		break;
+	case XFS_BLFT_DIR_LEAF1_BUF:
+		if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+		    magicda != XFS_DIR3_LEAF1_MAGIC) {
+			xfs_warn(mp, "Bad dir leaf1 magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+		break;
+	case XFS_BLFT_DIR_LEAFN_BUF:
+		if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+		    magicda != XFS_DIR3_LEAFN_MAGIC) {
+			xfs_warn(mp, "Bad dir leafn magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_leafn_buf_ops;
+		break;
+	case XFS_BLFT_DA_NODE_BUF:
+		if (magicda != XFS_DA_NODE_MAGIC &&
+		    magicda != XFS_DA3_NODE_MAGIC) {
+			xfs_warn(mp, "Bad da node magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_da3_node_buf_ops;
+		break;
+	case XFS_BLFT_ATTR_LEAF_BUF:
+		if (magicda != XFS_ATTR_LEAF_MAGIC &&
+		    magicda != XFS_ATTR3_LEAF_MAGIC) {
+			xfs_warn(mp, "Bad attr leaf magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_attr3_leaf_buf_ops;
+		break;
+	case XFS_BLFT_ATTR_RMT_BUF:
+		if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+			xfs_warn(mp, "Bad attr remote magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_attr3_rmt_buf_ops;
+		break;
+	case XFS_BLFT_SB_BUF:
+		if (magic32 != XFS_SB_MAGIC) {
+			xfs_warn(mp, "Bad SB block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_sb_buf_ops;
+		break;
+	default:
+		xfs_warn(mp, "Unknown buffer type %d!",
+			 xfs_blft_from_flags(buf_f));
+		break;
+	}
+}
+
+/*
+ * Perform a 'normal' buffer recovery.  Each logged region of the
+ * buffer should be copied over the corresponding region in the
+ * given buffer.  The bitmap in the buf log format structure indicates
+ * where to place the logged data.
+ */
+STATIC void
+xlog_recover_do_reg_buffer(
+	struct xfs_mount	*mp,
+	xlog_recover_item_t	*item,
+	struct xfs_buf		*bp,
+	xfs_buf_log_format_t	*buf_f)
+{
+	int			i;
+	int			bit;
+	int			nbits;
+	int                     error;
+
+	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+
+	bit = 0;
+	i = 1;  /* 0 is the buf format structure */
+	while (1) {
+		bit = xfs_next_bit(buf_f->blf_data_map,
+				   buf_f->blf_map_size, bit);
+		if (bit == -1)
+			break;
+		nbits = xfs_contig_bits(buf_f->blf_data_map,
+					buf_f->blf_map_size, bit);
+		ASSERT(nbits > 0);
+		ASSERT(item->ri_buf[i].i_addr != NULL);
+		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
+		ASSERT(BBTOB(bp->b_io_length) >=
+		       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
+
+		/*
+		 * The dirty regions logged in the buffer, even though
+		 * contiguous, may span multiple chunks. This is because the
+		 * dirty region may span a physical page boundary in a buffer
+		 * and hence be split into two separate vectors for writing into
+		 * the log. Hence we need to trim nbits back to the length of
+		 * the current region being copied out of the log.
+		 */
+		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+
+		/*
+		 * Do a sanity check if this is a dquot buffer. Just checking
+		 * the first dquot in the buffer should do. XXXThis is
+		 * probably a good thing to do for other buf types also.
+		 */
+		error = 0;
+		if (buf_f->blf_flags &
+		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+			if (item->ri_buf[i].i_addr == NULL) {
+				xfs_alert(mp,
+					"XFS: NULL dquot in %s.", __func__);
+				goto next;
+			}
+			if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
+				xfs_alert(mp,
+					"XFS: dquot too small (%d) in %s.",
+					item->ri_buf[i].i_len, __func__);
+				goto next;
+			}
+			error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
+					       -1, 0, XFS_QMOPT_DOWARN,
+					       "dquot_buf_recover");
+			if (error)
+				goto next;
+		}
+
+		memcpy(xfs_buf_offset(bp,
+			(uint)bit << XFS_BLF_SHIFT),	/* dest */
+			item->ri_buf[i].i_addr,		/* source */
+			nbits<<XFS_BLF_SHIFT);		/* length */
+ next:
+		i++;
+		bit += nbits;
+	}
+
+	/* Shouldn't be any more regions */
+	ASSERT(i == item->ri_total);
+
+	xlog_recover_validate_buf_type(mp, bp, buf_f);
+}
+
+/*
+ * Perform a dquot buffer recovery.
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
+ * (ie. USR or GRP), then just toss this buffer away; don't recover it.
+ * Else, treat it as a regular buffer and do recovery.
+ *
+ * Return false if the buffer was tossed and true if we recovered the buffer to
+ * indicate to the caller if the buffer needs writing.
+ */
+STATIC bool
+xlog_recover_do_dquot_buffer(
+	struct xfs_mount		*mp,
+	struct xlog			*log,
+	struct xlog_recover_item	*item,
+	struct xfs_buf			*bp,
+	struct xfs_buf_log_format	*buf_f)
+{
+	uint			type;
+
+	trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
+
+	/*
+	 * Filesystems are required to send in quota flags at mount time.
+	 */
+	if (!mp->m_qflags)
+		return false;
+
+	type = 0;
+	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
+		type |= XFS_DQ_USER;
+	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
+		type |= XFS_DQ_PROJ;
+	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
+		type |= XFS_DQ_GROUP;
+	/*
+	 * This type of quotas was turned off, so ignore this buffer
+	 */
+	if (log->l_quotaoffs_flag & type)
+		return false;
+
+	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	return true;
+}
+
+/*
+ * This routine replays a modification made to a buffer at runtime.
+ * There are actually two types of buffer, regular and inode, which
+ * are handled differently.  Inode buffers are handled differently
+ * in that we only recover a specific set of data from them, namely
+ * the inode di_next_unlinked fields.  This is because all other inode
+ * data is actually logged via inode records and any data we replay
+ * here which overlaps that may be stale.
+ *
+ * When meta-data buffers are freed at run time we log a buffer item
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
+ * of the buffer in the log should not be replayed at recovery time.
+ * This is so that if the blocks covered by the buffer are reused for
+ * file data before we crash we don't end up replaying old, freed
+ * meta-data into a user's file.
+ *
+ * To handle the cancellation of buffer log items, we make two passes
+ * over the log during recovery.  During the first we build a table of
+ * those buffers which have been cancelled, and during the second we
+ * only replay those buffers which do not have corresponding cancel
+ * records in the table.  See xlog_recover_buffer_pass[1,2] above
+ * for more details on the implementation of the table of cancel records.
+ */
+STATIC int
+xlog_recover_buffer_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			current_lsn)
+{
+	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
+	xfs_mount_t		*mp = log->l_mp;
+	xfs_buf_t		*bp;
+	int			error;
+	uint			buf_flags;
+	xfs_lsn_t		lsn;
+
+	/*
+	 * In this pass we only want to recover all the buffers which have
+	 * not been cancelled and are not cancellation buffers themselves.
+	 */
+	if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
+			buf_f->blf_len, buf_f->blf_flags)) {
+		trace_xfs_log_recover_buf_cancel(log, buf_f);
+		return 0;
+	}
+
+	trace_xfs_log_recover_buf_recover(log, buf_f);
+
+	buf_flags = 0;
+	if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+		buf_flags |= XBF_UNMAPPED;
+
+	bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+			  buf_flags, NULL);
+	if (!bp)
+		return -ENOMEM;
+	error = bp->b_error;
+	if (error) {
+		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
+		goto out_release;
+	}
+
+	/*
+	 * Recover the buffer only if we get an LSN from it and it's less than
+	 * the lsn of the transaction we are replaying.
+	 *
+	 * Note that we have to be extremely careful of readahead here.
+	 * Readahead does not attach verfiers to the buffers so if we don't
+	 * actually do any replay after readahead because of the LSN we found
+	 * in the buffer if more recent than that current transaction then we
+	 * need to attach the verifier directly. Failure to do so can lead to
+	 * future recovery actions (e.g. EFI and unlinked list recovery) can
+	 * operate on the buffers and they won't get the verifier attached. This
+	 * can lead to blocks on disk having the correct content but a stale
+	 * CRC.
+	 *
+	 * It is safe to assume these clean buffers are currently up to date.
+	 * If the buffer is dirtied by a later transaction being replayed, then
+	 * the verifier will be reset to match whatever recover turns that
+	 * buffer into.
+	 */
+	lsn = xlog_recover_get_buf_lsn(mp, bp);
+	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+		xlog_recover_validate_buf_type(mp, bp, buf_f);
+		goto out_release;
+	}
+
+	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+		if (error)
+			goto out_release;
+	} else if (buf_f->blf_flags &
+		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+		bool	dirty;
+
+		dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+		if (!dirty)
+			goto out_release;
+	} else {
+		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	}
+
+	/*
+	 * Perform delayed write on the buffer.  Asynchronous writes will be
+	 * slower when taking into account all the buffers to be flushed.
+	 *
+	 * Also make sure that only inode buffers with good sizes stay in
+	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
+	 * or mp->m_inode_cluster_size bytes, whichever is bigger.  The inode
+	 * buffers in the log can be a different size if the log was generated
+	 * by an older kernel using unclustered inode buffers or a newer kernel
+	 * running with a different inode cluster size.  Regardless, if the
+	 * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+	 * for *our* value of mp->m_inode_cluster_size, then we need to keep
+	 * the buffer out of the buffer cache so that the buffer won't
+	 * overlap with future reads of those inodes.
+	 */
+	if (XFS_DINODE_MAGIC ==
+	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
+	    (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
+			(__uint32_t)log->l_mp->m_inode_cluster_size))) {
+		xfs_buf_stale(bp);
+		error = xfs_bwrite(bp);
+	} else {
+		ASSERT(bp->b_target->bt_mount == mp);
+		bp->b_iodone = xlog_recover_iodone;
+		xfs_buf_delwri_queue(bp, buffer_list);
+	}
+
+out_release:
+	xfs_buf_relse(bp);
+	return error;
+}
+
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one.  This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here but need to
+ * gather all the buffers we modify for writeback so we pass the buffer_list
+ * instead for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip,
+	struct xfs_inode_log_format *in_f,
+	struct list_head	*buffer_list)
+{
+	struct xfs_inode	*ip;
+	int			error;
+
+	ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+	ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+	if (!ip)
+		return -ENOMEM;
+
+	/* instantiate the inode */
+	xfs_dinode_from_disk(&ip->i_d, dip);
+	ASSERT(ip->i_d.di_version >= 3);
+
+	error = xfs_iformat_fork(ip, dip);
+	if (error)
+		goto out_free_ip;
+
+
+	if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+		ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+		error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+					      ip->i_ino, buffer_list);
+		if (error)
+			goto out_free_ip;
+	}
+
+	if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+		ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+		error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+					      ip->i_ino, buffer_list);
+		if (error)
+			goto out_free_ip;
+	}
+
+out_free_ip:
+	xfs_inode_free(ip);
+	return error;
+}
+
+STATIC int
+xlog_recover_inode_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			current_lsn)
+{
+	xfs_inode_log_format_t	*in_f;
+	xfs_mount_t		*mp = log->l_mp;
+	xfs_buf_t		*bp;
+	xfs_dinode_t		*dip;
+	int			len;
+	xfs_caddr_t		src;
+	xfs_caddr_t		dest;
+	int			error;
+	int			attr_index;
+	uint			fields;
+	xfs_icdinode_t		*dicp;
+	uint			isize;
+	int			need_free = 0;
+
+	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
+		in_f = item->ri_buf[0].i_addr;
+	} else {
+		in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
+		need_free = 1;
+		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
+		if (error)
+			goto error;
+	}
+
+	/*
+	 * Inode buffers can be freed, look out for it,
+	 * and do not replay the inode.
+	 */
+	if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
+					in_f->ilf_len, 0)) {
+		error = 0;
+		trace_xfs_log_recover_inode_cancel(log, in_f);
+		goto error;
+	}
+	trace_xfs_log_recover_inode_recover(log, in_f);
+
+	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
+			  &xfs_inode_buf_ops);
+	if (!bp) {
+		error = -ENOMEM;
+		goto error;
+	}
+	error = bp->b_error;
+	if (error) {
+		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
+		goto out_release;
+	}
+	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
+
+	/*
+	 * Make sure the place we're flushing out to really looks
+	 * like an inode!
+	 */
+	if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+		xfs_alert(mp,
+	"%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
+			__func__, dip, bp, in_f->ilf_ino);
+		XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
+				 XFS_ERRLEVEL_LOW, mp);
+		error = -EFSCORRUPTED;
+		goto out_release;
+	}
+	dicp = item->ri_buf[1].i_addr;
+	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
+		xfs_alert(mp,
+			"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
+			__func__, item, in_f->ilf_ino);
+		XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
+				 XFS_ERRLEVEL_LOW, mp);
+		error = -EFSCORRUPTED;
+		goto out_release;
+	}
+
+	/*
+	 * If the inode has an LSN in it, recover the inode only if it's less
+	 * than the lsn of the transaction we are replaying. Note: we still
+	 * need to replay an owner change even though the inode is more recent
+	 * than the transaction as there is no guarantee that all the btree
+	 * blocks are more recent than this transaction, too.
+	 */
+	if (dip->di_version >= 3) {
+		xfs_lsn_t	lsn = be64_to_cpu(dip->di_lsn);
+
+		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+			trace_xfs_log_recover_inode_skip(log, in_f);
+			error = 0;
+			goto out_owner_change;
+		}
+	}
+
+	/*
+	 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+	 * are transactional and if ordering is necessary we can determine that
+	 * more accurately by the LSN field in the V3 inode core. Don't trust
+	 * the inode versions we might be changing them here - use the
+	 * superblock flag to determine whether we need to look at di_flushiter
+	 * to skip replay when the on disk inode is newer than the log one
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+	    dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+		/*
+		 * Deal with the wrap case, DI_MAX_FLUSH is less
+		 * than smaller numbers
+		 */
+		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+		    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+			/* do nothing */
+		} else {
+			trace_xfs_log_recover_inode_skip(log, in_f);
+			error = 0;
+			goto out_release;
+		}
+	}
+
+	/* Take the opportunity to reset the flush iteration count */
+	dicp->di_flushiter = 0;
+
+	if (unlikely(S_ISREG(dicp->di_mode))) {
+		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
+			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
+					 XFS_ERRLEVEL_LOW, mp, dicp);
+			xfs_alert(mp,
+		"%s: Bad regular inode log record, rec ptr 0x%p, "
+		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				__func__, item, dip, bp, in_f->ilf_ino);
+			error = -EFSCORRUPTED;
+			goto out_release;
+		}
+	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
+		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
+		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
+			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
+					     XFS_ERRLEVEL_LOW, mp, dicp);
+			xfs_alert(mp,
+		"%s: Bad dir inode log record, rec ptr 0x%p, "
+		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				__func__, item, dip, bp, in_f->ilf_ino);
+			error = -EFSCORRUPTED;
+			goto out_release;
+		}
+	}
+	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
+		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
+				     XFS_ERRLEVEL_LOW, mp, dicp);
+		xfs_alert(mp,
+	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+	"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
+			__func__, item, dip, bp, in_f->ilf_ino,
+			dicp->di_nextents + dicp->di_anextents,
+			dicp->di_nblocks);
+		error = -EFSCORRUPTED;
+		goto out_release;
+	}
+	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
+		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
+				     XFS_ERRLEVEL_LOW, mp, dicp);
+		xfs_alert(mp,
+	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
+			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
+		error = -EFSCORRUPTED;
+		goto out_release;
+	}
+	isize = xfs_icdinode_size(dicp->di_version);
+	if (unlikely(item->ri_buf[1].i_len > isize)) {
+		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
+				     XFS_ERRLEVEL_LOW, mp, dicp);
+		xfs_alert(mp,
+			"%s: Bad inode log record length %d, rec ptr 0x%p",
+			__func__, item->ri_buf[1].i_len, item);
+		error = -EFSCORRUPTED;
+		goto out_release;
+	}
+
+	/* The core is in in-core format */
+	xfs_dinode_to_disk(dip, dicp);
+
+	/* the rest is in on-disk format */
+	if (item->ri_buf[1].i_len > isize) {
+		memcpy((char *)dip + isize,
+			item->ri_buf[1].i_addr + isize,
+			item->ri_buf[1].i_len - isize);
+	}
+
+	fields = in_f->ilf_fields;
+	switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
+	case XFS_ILOG_DEV:
+		xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
+		break;
+	case XFS_ILOG_UUID:
+		memcpy(XFS_DFORK_DPTR(dip),
+		       &in_f->ilf_u.ilfu_uuid,
+		       sizeof(uuid_t));
+		break;
+	}
+
+	if (in_f->ilf_size == 2)
+		goto out_owner_change;
+	len = item->ri_buf[2].i_len;
+	src = item->ri_buf[2].i_addr;
+	ASSERT(in_f->ilf_size <= 4);
+	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
+	ASSERT(!(fields & XFS_ILOG_DFORK) ||
+	       (len == in_f->ilf_dsize));
+
+	switch (fields & XFS_ILOG_DFORK) {
+	case XFS_ILOG_DDATA:
+	case XFS_ILOG_DEXT:
+		memcpy(XFS_DFORK_DPTR(dip), src, len);
+		break;
+
+	case XFS_ILOG_DBROOT:
+		xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
+				 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
+				 XFS_DFORK_DSIZE(dip, mp));
+		break;
+
+	default:
+		/*
+		 * There are no data fork flags set.
+		 */
+		ASSERT((fields & XFS_ILOG_DFORK) == 0);
+		break;
+	}
+
+	/*
+	 * If we logged any attribute data, recover it.  There may or
+	 * may not have been any other non-core data logged in this
+	 * transaction.
+	 */
+	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
+		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
+			attr_index = 3;
+		} else {
+			attr_index = 2;
+		}
+		len = item->ri_buf[attr_index].i_len;
+		src = item->ri_buf[attr_index].i_addr;
+		ASSERT(len == in_f->ilf_asize);
+
+		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
+		case XFS_ILOG_ADATA:
+		case XFS_ILOG_AEXT:
+			dest = XFS_DFORK_APTR(dip);
+			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
+			memcpy(dest, src, len);
+			break;
+
+		case XFS_ILOG_ABROOT:
+			dest = XFS_DFORK_APTR(dip);
+			xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
+					 len, (xfs_bmdr_block_t*)dest,
+					 XFS_DFORK_ASIZE(dip, mp));
+			break;
+
+		default:
+			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
+			ASSERT(0);
+			error = -EIO;
+			goto out_release;
+		}
+	}
+
+out_owner_change:
+	if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+		error = xfs_recover_inode_owner_change(mp, dip, in_f,
+						       buffer_list);
+	/* re-generate the checksum. */
+	xfs_dinode_calc_crc(log->l_mp, dip);
+
+	ASSERT(bp->b_target->bt_mount == mp);
+	bp->b_iodone = xlog_recover_iodone;
+	xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+	xfs_buf_relse(bp);
+error:
+	if (need_free)
+		kmem_free(in_f);
+	return error;
+}
+
+/*
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog
+ * structure, so that we know not to do any dquot item or dquot buffer recovery,
+ * of that type.
+ */
+STATIC int
+xlog_recover_quotaoff_pass1(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	xfs_qoff_logformat_t	*qoff_f = item->ri_buf[0].i_addr;
+	ASSERT(qoff_f);
+
+	/*
+	 * The logitem format's flag tells us if this was user quotaoff,
+	 * group/project quotaoff or both.
+	 */
+	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
+		log->l_quotaoffs_flag |= XFS_DQ_USER;
+	if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
+		log->l_quotaoffs_flag |= XFS_DQ_PROJ;
+	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
+		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
+
+	return 0;
+}
+
+/*
+ * Recover a dquot record
+ */
+STATIC int
+xlog_recover_dquot_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			current_lsn)
+{
+	xfs_mount_t		*mp = log->l_mp;
+	xfs_buf_t		*bp;
+	struct xfs_disk_dquot	*ddq, *recddq;
+	int			error;
+	xfs_dq_logformat_t	*dq_f;
+	uint			type;
+
+
+	/*
+	 * Filesystems are required to send in quota flags at mount time.
+	 */
+	if (mp->m_qflags == 0)
+		return 0;
+
+	recddq = item->ri_buf[1].i_addr;
+	if (recddq == NULL) {
+		xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
+		return -EIO;
+	}
+	if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
+		xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
+			item->ri_buf[1].i_len, __func__);
+		return -EIO;
+	}
+
+	/*
+	 * This type of quotas was turned off, so ignore this record.
+	 */
+	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+	ASSERT(type);
+	if (log->l_quotaoffs_flag & type)
+		return 0;
+
+	/*
+	 * At this point we know that quota was _not_ turned off.
+	 * Since the mount flags are not indicating to us otherwise, this
+	 * must mean that quota is on, and the dquot needs to be replayed.
+	 * Remember that we may not have fully recovered the superblock yet,
+	 * so we can't do the usual trick of looking at the SB quota bits.
+	 *
+	 * The other possibility, of course, is that the quota subsystem was
+	 * removed since the last mount - ENOSYS.
+	 */
+	dq_f = item->ri_buf[0].i_addr;
+	ASSERT(dq_f);
+	error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+			   "xlog_recover_dquot_pass2 (log copy)");
+	if (error)
+		return -EIO;
+	ASSERT(dq_f->qlf_len == 1);
+
+	/*
+	 * At this point we are assuming that the dquots have been allocated
+	 * and hence the buffer has valid dquots stamped in it. It should,
+	 * therefore, pass verifier validation. If the dquot is bad, then the
+	 * we'll return an error here, so we don't need to specifically check
+	 * the dquot in the buffer after the verifier has run.
+	 */
+	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
+				   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+				   &xfs_dquot_buf_ops);
+	if (error)
+		return error;
+
+	ASSERT(bp);
+	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+
+	/*
+	 * If the dquot has an LSN in it, recover the dquot only if it's less
+	 * than the lsn of the transaction we are replaying.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+		xfs_lsn_t	lsn = be64_to_cpu(dqb->dd_lsn);
+
+		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+			goto out_release;
+		}
+	}
+
+	memcpy(ddq, recddq, item->ri_buf[1].i_len);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF);
+	}
+
+	ASSERT(dq_f->qlf_size == 2);
+	ASSERT(bp->b_target->bt_mount == mp);
+	bp->b_iodone = xlog_recover_iodone;
+	xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+	xfs_buf_relse(bp);
+	return 0;
+}
+
+/*
+ * This routine is called to create an in-core extent free intent
+ * item from the efi format structure which was logged on disk.
+ * It allocates an in-core efi, copies the extents from the format
+ * structure into it, and adds the efi to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_efi_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	int			error;
+	xfs_mount_t		*mp = log->l_mp;
+	xfs_efi_log_item_t	*efip;
+	xfs_efi_log_format_t	*efi_formatp;
+
+	efi_formatp = item->ri_buf[0].i_addr;
+
+	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+	if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
+					 &(efip->efi_format)))) {
+		xfs_efi_item_free(efip);
+		return error;
+	}
+	atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
+
+	spin_lock(&log->l_ailp->xa_lock);
+	/*
+	 * xfs_trans_ail_update() drops the AIL lock.
+	 */
+	xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
+	return 0;
+}
+
+
+/*
+ * This routine is called when an efd format structure is found in
+ * a committed transaction in the log.  It's purpose is to cancel
+ * the corresponding efi if it was still in the log.  To do this
+ * it searches the AIL for the efi with an id equal to that in the
+ * efd format structure.  If we find it, we remove the efi from the
+ * AIL and free it.
+ */
+STATIC int
+xlog_recover_efd_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	xfs_efd_log_format_t	*efd_formatp;
+	xfs_efi_log_item_t	*efip = NULL;
+	xfs_log_item_t		*lip;
+	__uint64_t		efi_id;
+	struct xfs_ail_cursor	cur;
+	struct xfs_ail		*ailp = log->l_ailp;
+
+	efd_formatp = item->ri_buf[0].i_addr;
+	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
+		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
+	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
+		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
+	efi_id = efd_formatp->efd_efi_id;
+
+	/*
+	 * Search for the efi with the id in the efd format structure
+	 * in the AIL.
+	 */
+	spin_lock(&ailp->xa_lock);
+	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+	while (lip != NULL) {
+		if (lip->li_type == XFS_LI_EFI) {
+			efip = (xfs_efi_log_item_t *)lip;
+			if (efip->efi_format.efi_id == efi_id) {
+				/*
+				 * xfs_trans_ail_delete() drops the
+				 * AIL lock.
+				 */
+				xfs_trans_ail_delete(ailp, lip,
+						     SHUTDOWN_CORRUPT_INCORE);
+				xfs_efi_item_free(efip);
+				spin_lock(&ailp->xa_lock);
+				break;
+			}
+		}
+		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+	}
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+
+	return 0;
+}
+
+/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log.  It's purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be intialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_do_icreate_pass2(
+	struct xlog		*log,
+	struct list_head	*buffer_list,
+	xlog_recover_item_t	*item)
+{
+	struct xfs_mount	*mp = log->l_mp;
+	struct xfs_icreate_log	*icl;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	unsigned int		count;
+	unsigned int		isize;
+	xfs_agblock_t		length;
+
+	icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+	if (icl->icl_type != XFS_LI_ICREATE) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+		return -EINVAL;
+	}
+
+	if (icl->icl_size != 1) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+		return -EINVAL;
+	}
+
+	agno = be32_to_cpu(icl->icl_ag);
+	if (agno >= mp->m_sb.sb_agcount) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+		return -EINVAL;
+	}
+	agbno = be32_to_cpu(icl->icl_agbno);
+	if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+		return -EINVAL;
+	}
+	isize = be32_to_cpu(icl->icl_isize);
+	if (isize != mp->m_sb.sb_inodesize) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+		return -EINVAL;
+	}
+	count = be32_to_cpu(icl->icl_count);
+	if (!count) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+		return -EINVAL;
+	}
+	length = be32_to_cpu(icl->icl_length);
+	if (!length || length >= mp->m_sb.sb_agblocks) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+		return -EINVAL;
+	}
+
+	/* existing allocation is fixed value */
+	ASSERT(count == mp->m_ialloc_inos);
+	ASSERT(length == mp->m_ialloc_blks);
+	if (count != mp->m_ialloc_inos ||
+	     length != mp->m_ialloc_blks) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+		return -EINVAL;
+	}
+
+	/*
+	 * Inode buffers can be freed. Do not replay the inode initialisation as
+	 * we could be overwriting something written after this inode buffer was
+	 * cancelled.
+	 *
+	 * XXX: we need to iterate all buffers and only init those that are not
+	 * cancelled. I think that a more fine grained factoring of
+	 * xfs_ialloc_inode_init may be appropriate here to enable this to be
+	 * done easily.
+	 */
+	if (xlog_check_buffer_cancelled(log,
+			XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+		return 0;
+
+	xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+					be32_to_cpu(icl->icl_gen));
+	return 0;
+}
+
+STATIC void
+xlog_recover_buffer_ra_pass2(
+	struct xlog                     *log,
+	struct xlog_recover_item        *item)
+{
+	struct xfs_buf_log_format	*buf_f = item->ri_buf[0].i_addr;
+	struct xfs_mount		*mp = log->l_mp;
+
+	if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
+			buf_f->blf_len, buf_f->blf_flags)) {
+		return;
+	}
+
+	xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
+				buf_f->blf_len, NULL);
+}
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+	struct xlog                     *log,
+	struct xlog_recover_item        *item)
+{
+	struct xfs_inode_log_format	ilf_buf;
+	struct xfs_inode_log_format	*ilfp;
+	struct xfs_mount		*mp = log->l_mp;
+	int			error;
+
+	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+		ilfp = item->ri_buf[0].i_addr;
+	} else {
+		ilfp = &ilf_buf;
+		memset(ilfp, 0, sizeof(*ilfp));
+		error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
+		if (error)
+			return;
+	}
+
+	if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
+		return;
+
+	xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
+				ilfp->ilf_len, &xfs_inode_buf_ra_ops);
+}
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	struct xfs_mount	*mp = log->l_mp;
+	struct xfs_disk_dquot	*recddq;
+	struct xfs_dq_logformat	*dq_f;
+	uint			type;
+
+
+	if (mp->m_qflags == 0)
+		return;
+
+	recddq = item->ri_buf[1].i_addr;
+	if (recddq == NULL)
+		return;
+	if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+		return;
+
+	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+	ASSERT(type);
+	if (log->l_quotaoffs_flag & type)
+		return;
+
+	dq_f = item->ri_buf[0].i_addr;
+	ASSERT(dq_f);
+	ASSERT(dq_f->qlf_len == 1);
+
+	xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
+			  XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+}
+
+STATIC void
+xlog_recover_ra_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	switch (ITEM_TYPE(item)) {
+	case XFS_LI_BUF:
+		xlog_recover_buffer_ra_pass2(log, item);
+		break;
+	case XFS_LI_INODE:
+		xlog_recover_inode_ra_pass2(log, item);
+		break;
+	case XFS_LI_DQUOT:
+		xlog_recover_dquot_ra_pass2(log, item);
+		break;
+	case XFS_LI_EFI:
+	case XFS_LI_EFD:
+	case XFS_LI_QUOTAOFF:
+	default:
+		break;
+	}
+}
+
+STATIC int
+xlog_recover_commit_pass1(
+	struct xlog			*log,
+	struct xlog_recover		*trans,
+	struct xlog_recover_item	*item)
+{
+	trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
+
+	switch (ITEM_TYPE(item)) {
+	case XFS_LI_BUF:
+		return xlog_recover_buffer_pass1(log, item);
+	case XFS_LI_QUOTAOFF:
+		return xlog_recover_quotaoff_pass1(log, item);
+	case XFS_LI_INODE:
+	case XFS_LI_EFI:
+	case XFS_LI_EFD:
+	case XFS_LI_DQUOT:
+	case XFS_LI_ICREATE:
+		/* nothing to do in pass 1 */
+		return 0;
+	default:
+		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
+			__func__, ITEM_TYPE(item));
+		ASSERT(0);
+		return -EIO;
+	}
+}
+
+STATIC int
+xlog_recover_commit_pass2(
+	struct xlog			*log,
+	struct xlog_recover		*trans,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item)
+{
+	trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
+
+	switch (ITEM_TYPE(item)) {
+	case XFS_LI_BUF:
+		return xlog_recover_buffer_pass2(log, buffer_list, item,
+						 trans->r_lsn);
+	case XFS_LI_INODE:
+		return xlog_recover_inode_pass2(log, buffer_list, item,
+						 trans->r_lsn);
+	case XFS_LI_EFI:
+		return xlog_recover_efi_pass2(log, item, trans->r_lsn);
+	case XFS_LI_EFD:
+		return xlog_recover_efd_pass2(log, item);
+	case XFS_LI_DQUOT:
+		return xlog_recover_dquot_pass2(log, buffer_list, item,
+						trans->r_lsn);
+	case XFS_LI_ICREATE:
+		return xlog_recover_do_icreate_pass2(log, buffer_list, item);
+	case XFS_LI_QUOTAOFF:
+		/* nothing to do in pass2 */
+		return 0;
+	default:
+		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
+			__func__, ITEM_TYPE(item));
+		ASSERT(0);
+		return -EIO;
+	}
+}
+
+STATIC int
+xlog_recover_items_pass2(
+	struct xlog                     *log,
+	struct xlog_recover             *trans,
+	struct list_head                *buffer_list,
+	struct list_head                *item_list)
+{
+	struct xlog_recover_item	*item;
+	int				error = 0;
+
+	list_for_each_entry(item, item_list, ri_list) {
+		error = xlog_recover_commit_pass2(log, trans,
+					  buffer_list, item);
+		if (error)
+			return error;
+	}
+
+	return error;
+}
+
+/*
+ * Perform the transaction.
+ *
+ * If the transaction modifies a buffer or inode, do it now.  Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
+STATIC int
+xlog_recover_commit_trans(
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	int			pass)
+{
+	int				error = 0;
+	int				error2;
+	int				items_queued = 0;
+	struct xlog_recover_item	*item;
+	struct xlog_recover_item	*next;
+	LIST_HEAD			(buffer_list);
+	LIST_HEAD			(ra_list);
+	LIST_HEAD			(done_list);
+
+	#define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
+
+	hlist_del(&trans->r_list);
+
+	error = xlog_recover_reorder_trans(log, trans, pass);
+	if (error)
+		return error;
+
+	list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
+		switch (pass) {
+		case XLOG_RECOVER_PASS1:
+			error = xlog_recover_commit_pass1(log, trans, item);
+			break;
+		case XLOG_RECOVER_PASS2:
+			xlog_recover_ra_pass2(log, item);
+			list_move_tail(&item->ri_list, &ra_list);
+			items_queued++;
+			if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+				error = xlog_recover_items_pass2(log, trans,
+						&buffer_list, &ra_list);
+				list_splice_tail_init(&ra_list, &done_list);
+				items_queued = 0;
+			}
+
+			break;
+		default:
+			ASSERT(0);
+		}
+
+		if (error)
+			goto out;
+	}
+
+out:
+	if (!list_empty(&ra_list)) {
+		if (!error)
+			error = xlog_recover_items_pass2(log, trans,
+					&buffer_list, &ra_list);
+		list_splice_tail_init(&ra_list, &done_list);
+	}
+
+	if (!list_empty(&done_list))
+		list_splice_init(&done_list, &trans->r_itemq);
+
+	error2 = xfs_buf_delwri_submit(&buffer_list);
+	return error ? error : error2;
+}
+
+STATIC void
+xlog_recover_add_item(
+	struct list_head	*head)
+{
+	xlog_recover_item_t	*item;
+
+	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
+	INIT_LIST_HEAD(&item->ri_list);
+	list_add_tail(&item->ri_list, head);
+}
+
+STATIC int
+xlog_recover_add_to_cont_trans(
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	xfs_caddr_t		dp,
+	int			len)
+{
+	xlog_recover_item_t	*item;
+	xfs_caddr_t		ptr, old_ptr;
+	int			old_len;
+
+	if (list_empty(&trans->r_itemq)) {
+		/* finish copying rest of trans header */
+		xlog_recover_add_item(&trans->r_itemq);
+		ptr = (xfs_caddr_t) &trans->r_theader +
+				sizeof(xfs_trans_header_t) - len;
+		memcpy(ptr, dp, len);
+		return 0;
+	}
+	/* take the tail entry */
+	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+
+	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
+	old_len = item->ri_buf[item->ri_cnt-1].i_len;
+
+	ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+	memcpy(&ptr[old_len], dp, len);
+	item->ri_buf[item->ri_cnt-1].i_len += len;
+	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
+	return 0;
+}
+
+/*
+ * The next region to add is the start of a new region.  It could be
+ * a whole region or it could be the first part of a new region.  Because
+ * of this, the assumption here is that the type and size fields of all
+ * format structures fit into the first 32 bits of the structure.
+ *
+ * This works because all regions must be 32 bit aligned.  Therefore, we
+ * either have both fields or we have neither field.  In the case we have
+ * neither field, the data part of the region is zero length.  We only have
+ * a log_op_header and can throw away the header since a new one will appear
+ * later.  If we have at least 4 bytes, then we can determine how many regions
+ * will appear in the current log item.
+ */
+STATIC int
+xlog_recover_add_to_trans(
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	xfs_caddr_t		dp,
+	int			len)
+{
+	xfs_inode_log_format_t	*in_f;			/* any will do */
+	xlog_recover_item_t	*item;
+	xfs_caddr_t		ptr;
+
+	if (!len)
+		return 0;
+	if (list_empty(&trans->r_itemq)) {
+		/* we need to catch log corruptions here */
+		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+			xfs_warn(log->l_mp, "%s: bad header magic number",
+				__func__);
+			ASSERT(0);
+			return -EIO;
+		}
+		if (len == sizeof(xfs_trans_header_t))
+			xlog_recover_add_item(&trans->r_itemq);
+		memcpy(&trans->r_theader, dp, len);
+		return 0;
+	}
+
+	ptr = kmem_alloc(len, KM_SLEEP);
+	memcpy(ptr, dp, len);
+	in_f = (xfs_inode_log_format_t *)ptr;
+
+	/* take the tail entry */
+	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+	if (item->ri_total != 0 &&
+	     item->ri_total == item->ri_cnt) {
+		/* tail item is in use, get a new one */
+		xlog_recover_add_item(&trans->r_itemq);
+		item = list_entry(trans->r_itemq.prev,
+					xlog_recover_item_t, ri_list);
+	}
+
+	if (item->ri_total == 0) {		/* first region to be added */
+		if (in_f->ilf_size == 0 ||
+		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
+			xfs_warn(log->l_mp,
+		"bad number of regions (%d) in inode log format",
+				  in_f->ilf_size);
+			ASSERT(0);
+			kmem_free(ptr);
+			return -EIO;
+		}
+
+		item->ri_total = in_f->ilf_size;
+		item->ri_buf =
+			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+				    KM_SLEEP);
+	}
+	ASSERT(item->ri_total > item->ri_cnt);
+	/* Description region is ri_buf[0] */
+	item->ri_buf[item->ri_cnt].i_addr = ptr;
+	item->ri_buf[item->ri_cnt].i_len  = len;
+	item->ri_cnt++;
+	trace_xfs_log_recover_item_add(log, trans, item, 0);
+	return 0;
+}
+
+/*
+ * Free up any resources allocated by the transaction
+ *
+ * Remember that EFIs, EFDs, and IUNLINKs are handled later.
+ */
+STATIC void
+xlog_recover_free_trans(
+	struct xlog_recover	*trans)
+{
+	xlog_recover_item_t	*item, *n;
+	int			i;
+
+	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+		/* Free the regions in the item. */
+		list_del(&item->ri_list);
+		for (i = 0; i < item->ri_cnt; i++)
+			kmem_free(item->ri_buf[i].i_addr);
+		/* Free the item itself */
+		kmem_free(item->ri_buf);
+		kmem_free(item);
+	}
+	/* Free the transaction recover structure */
+	kmem_free(trans);
+}
+
+/*
+ * On error or completion, trans is freed.
+ */
+STATIC int
+xlog_recovery_process_trans(
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	xfs_caddr_t		dp,
+	unsigned int		len,
+	unsigned int		flags,
+	int			pass)
+{
+	int			error = 0;
+	bool			freeit = false;
+
+	/* mask off ophdr transaction container flags */
+	flags &= ~XLOG_END_TRANS;
+	if (flags & XLOG_WAS_CONT_TRANS)
+		flags &= ~XLOG_CONTINUE_TRANS;
+
+	/*
+	 * Callees must not free the trans structure. We'll decide if we need to
+	 * free it or not based on the operation being done and it's result.
+	 */
+	switch (flags) {
+	/* expected flag values */
+	case 0:
+	case XLOG_CONTINUE_TRANS:
+		error = xlog_recover_add_to_trans(log, trans, dp, len);
+		break;
+	case XLOG_WAS_CONT_TRANS:
+		error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
+		break;
+	case XLOG_COMMIT_TRANS:
+		error = xlog_recover_commit_trans(log, trans, pass);
+		/* success or fail, we are now done with this transaction. */
+		freeit = true;
+		break;
+
+	/* unexpected flag values */
+	case XLOG_UNMOUNT_TRANS:
+		/* just skip trans */
+		xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
+		freeit = true;
+		break;
+	case XLOG_START_TRANS:
+	default:
+		xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
+		ASSERT(0);
+		error = -EIO;
+		break;
+	}
+	if (error || freeit)
+		xlog_recover_free_trans(trans);
+	return error;
+}
+
+/*
+ * Lookup the transaction recovery structure associated with the ID in the
+ * current ophdr. If the transaction doesn't exist and the start flag is set in
+ * the ophdr, then allocate a new transaction for future ID matches to find.
+ * Either way, return what we found during the lookup - an existing transaction
+ * or nothing.
+ */
+STATIC struct xlog_recover *
+xlog_recover_ophdr_to_trans(
+	struct hlist_head	rhash[],
+	struct xlog_rec_header	*rhead,
+	struct xlog_op_header	*ohead)
+{
+	struct xlog_recover	*trans;
+	xlog_tid_t		tid;
+	struct hlist_head	*rhp;
+
+	tid = be32_to_cpu(ohead->oh_tid);
+	rhp = &rhash[XLOG_RHASH(tid)];
+	hlist_for_each_entry(trans, rhp, r_list) {
+		if (trans->r_log_tid == tid)
+			return trans;
+	}
+
+	/*
+	 * skip over non-start transaction headers - we could be
+	 * processing slack space before the next transaction starts
+	 */
+	if (!(ohead->oh_flags & XLOG_START_TRANS))
+		return NULL;
+
+	ASSERT(be32_to_cpu(ohead->oh_len) == 0);
+
+	/*
+	 * This is a new transaction so allocate a new recovery container to
+	 * hold the recovery ops that will follow.
+	 */
+	trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
+	trans->r_log_tid = tid;
+	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
+	INIT_LIST_HEAD(&trans->r_itemq);
+	INIT_HLIST_NODE(&trans->r_list);
+	hlist_add_head(&trans->r_list, rhp);
+
+	/*
+	 * Nothing more to do for this ophdr. Items to be added to this new
+	 * transaction will be in subsequent ophdr containers.
+	 */
+	return NULL;
+}
+
+STATIC int
+xlog_recover_process_ophdr(
+	struct xlog		*log,
+	struct hlist_head	rhash[],
+	struct xlog_rec_header	*rhead,
+	struct xlog_op_header	*ohead,
+	xfs_caddr_t		dp,
+	xfs_caddr_t		end,
+	int			pass)
+{
+	struct xlog_recover	*trans;
+	unsigned int		len;
+
+	/* Do we understand who wrote this op? */
+	if (ohead->oh_clientid != XFS_TRANSACTION &&
+	    ohead->oh_clientid != XFS_LOG) {
+		xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
+			__func__, ohead->oh_clientid);
+		ASSERT(0);
+		return -EIO;
+	}
+
+	/*
+	 * Check the ophdr contains all the data it is supposed to contain.
+	 */
+	len = be32_to_cpu(ohead->oh_len);
+	if (dp + len > end) {
+		xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
+		WARN_ON(1);
+		return -EIO;
+	}
+
+	trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
+	if (!trans) {
+		/* nothing to do, so skip over this ophdr */
+		return 0;
+	}
+
+	return xlog_recovery_process_trans(log, trans, dp, len,
+					   ohead->oh_flags, pass);
+}
+
+/*
+ * There are two valid states of the r_state field.  0 indicates that the
+ * transaction structure is in a normal state.  We have either seen the
+ * start of the transaction or the last operation we added was not a partial
+ * operation.  If the last operation we added to the transaction was a
+ * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
+ *
+ * NOTE: skip LRs with 0 data length.
+ */
+STATIC int
+xlog_recover_process_data(
+	struct xlog		*log,
+	struct hlist_head	rhash[],
+	struct xlog_rec_header	*rhead,
+	xfs_caddr_t		dp,
+	int			pass)
+{
+	struct xlog_op_header	*ohead;
+	xfs_caddr_t		end;
+	int			num_logops;
+	int			error;
+
+	end = dp + be32_to_cpu(rhead->h_len);
+	num_logops = be32_to_cpu(rhead->h_num_logops);
+
+	/* check the log format matches our own - else we can't recover */
+	if (xlog_header_check_recover(log->l_mp, rhead))
+		return -EIO;
+
+	while ((dp < end) && num_logops) {
+
+		ohead = (struct xlog_op_header *)dp;
+		dp += sizeof(*ohead);
+		ASSERT(dp <= end);
+
+		/* errors will abort recovery */
+		error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
+						    dp, end, pass);
+		if (error)
+			return error;
+
+		dp += be32_to_cpu(ohead->oh_len);
+		num_logops--;
+	}
+	return 0;
+}
+
+/*
+ * Process an extent free intent item that was recovered from
+ * the log.  We need to free the extents that it describes.
+ */
+STATIC int
+xlog_recover_process_efi(
+	xfs_mount_t		*mp,
+	xfs_efi_log_item_t	*efip)
+{
+	xfs_efd_log_item_t	*efdp;
+	xfs_trans_t		*tp;
+	int			i;
+	int			error = 0;
+	xfs_extent_t		*extp;
+	xfs_fsblock_t		startblock_fsb;
+
+	ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
+
+	/*
+	 * First check the validity of the extents described by the
+	 * EFI.  If any are bad, then assume that all are bad and
+	 * just toss the EFI.
+	 */
+	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+		extp = &(efip->efi_format.efi_extents[i]);
+		startblock_fsb = XFS_BB_TO_FSB(mp,
+				   XFS_FSB_TO_DADDR(mp, extp->ext_start));
+		if ((startblock_fsb == 0) ||
+		    (extp->ext_len == 0) ||
+		    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
+		    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
+			/*
+			 * This will pull the EFI from the AIL and
+			 * free the memory associated with it.
+			 */
+			set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+			xfs_efi_release(efip, efip->efi_format.efi_nextents);
+			return -EIO;
+		}
+	}
+
+	tp = xfs_trans_alloc(mp, 0);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	if (error)
+		goto abort_error;
+	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+
+	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+		extp = &(efip->efi_format.efi_extents[i]);
+		error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+		if (error)
+			goto abort_error;
+		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
+					 extp->ext_len);
+	}
+
+	set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+	error = xfs_trans_commit(tp, 0);
+	return error;
+
+abort_error:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	return error;
+}
+
+/*
+ * When this is called, all of the EFIs which did not have
+ * corresponding EFDs should be in the AIL.  What we do now
+ * is free the extents associated with each one.
+ *
+ * Since we process the EFIs in normal transactions, they
+ * will be removed at some point after the commit.  This prevents
+ * us from just walking down the list processing each one.
+ * We'll use a flag in the EFI to skip those that we've already
+ * processed and use the AIL iteration mechanism's generation
+ * count to try to speed this up at least a bit.
+ *
+ * When we start, we know that the EFIs are the only things in
+ * the AIL.  As we process them, however, other items are added
+ * to the AIL.  Since everything added to the AIL must come after
+ * everything already in the AIL, we stop processing as soon as
+ * we see something other than an EFI in the AIL.
+ */
+STATIC int
+xlog_recover_process_efis(
+	struct xlog	*log)
+{
+	xfs_log_item_t		*lip;
+	xfs_efi_log_item_t	*efip;
+	int			error = 0;
+	struct xfs_ail_cursor	cur;
+	struct xfs_ail		*ailp;
+
+	ailp = log->l_ailp;
+	spin_lock(&ailp->xa_lock);
+	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+	while (lip != NULL) {
+		/*
+		 * We're done when we see something other than an EFI.
+		 * There should be no EFIs left in the AIL now.
+		 */
+		if (lip->li_type != XFS_LI_EFI) {
+#ifdef DEBUG
+			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+				ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
+			break;
+		}
+
+		/*
+		 * Skip EFIs that we've already processed.
+		 */
+		efip = (xfs_efi_log_item_t *)lip;
+		if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
+			lip = xfs_trans_ail_cursor_next(ailp, &cur);
+			continue;
+		}
+
+		spin_unlock(&ailp->xa_lock);
+		error = xlog_recover_process_efi(log->l_mp, efip);
+		spin_lock(&ailp->xa_lock);
+		if (error)
+			goto out;
+		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+	}
+out:
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+	return error;
+}
+
+/*
+ * This routine performs a transaction to null out a bad inode pointer
+ * in an agi unlinked inode hash bucket.
+ */
+STATIC void
+xlog_recover_clear_agi_bucket(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno,
+	int		bucket)
+{
+	xfs_trans_t	*tp;
+	xfs_agi_t	*agi;
+	xfs_buf_t	*agibp;
+	int		offset;
+	int		error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
+	if (error)
+		goto out_abort;
+
+	error = xfs_read_agi(mp, tp, agno, &agibp);
+	if (error)
+		goto out_abort;
+
+	agi = XFS_BUF_TO_AGI(agibp);
+	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+	offset = offsetof(xfs_agi_t, agi_unlinked) +
+		 (sizeof(xfs_agino_t) * bucket);
+	xfs_trans_log_buf(tp, agibp, offset,
+			  (offset + sizeof(xfs_agino_t) - 1));
+
+	error = xfs_trans_commit(tp, 0);
+	if (error)
+		goto out_error;
+	return;
+
+out_abort:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+out_error:
+	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
+	return;
+}
+
+STATIC xfs_agino_t
+xlog_recover_process_one_iunlink(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	xfs_agino_t			agino,
+	int				bucket)
+{
+	struct xfs_buf			*ibp;
+	struct xfs_dinode		*dip;
+	struct xfs_inode		*ip;
+	xfs_ino_t			ino;
+	int				error;
+
+	ino = XFS_AGINO_TO_INO(mp, agno, agino);
+	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+	if (error)
+		goto fail;
+
+	/*
+	 * Get the on disk inode to find the next inode in the bucket.
+	 */
+	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
+	if (error)
+		goto fail_iput;
+
+	ASSERT(ip->i_d.di_nlink == 0);
+	ASSERT(ip->i_d.di_mode != 0);
+
+	/* setup for the next pass */
+	agino = be32_to_cpu(dip->di_next_unlinked);
+	xfs_buf_relse(ibp);
+
+	/*
+	 * Prevent any DMAPI event from being sent when the reference on
+	 * the inode is dropped.
+	 */
+	ip->i_d.di_dmevmask = 0;
+
+	IRELE(ip);
+	return agino;
+
+ fail_iput:
+	IRELE(ip);
+ fail:
+	/*
+	 * We can't read in the inode this bucket points to, or this inode
+	 * is messed up.  Just ditch this bucket of inodes.  We will lose
+	 * some inodes and space, but at least we won't hang.
+	 *
+	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
+	 * clear the inode pointer in the bucket.
+	 */
+	xlog_recover_clear_agi_bucket(mp, agno, bucket);
+	return NULLAGINO;
+}
+
+/*
+ * xlog_iunlink_recover
+ *
+ * This is called during recovery to process any inodes which
+ * we unlinked but not freed when the system crashed.  These
+ * inodes will be on the lists in the AGI blocks.  What we do
+ * here is scan all the AGIs and fully truncate and free any
+ * inodes found on the lists.  Each inode is removed from the
+ * lists when it has been fully truncated and is freed.  The
+ * freeing of the inode and its removal from the list must be
+ * atomic.
+ */
+STATIC void
+xlog_recover_process_iunlinks(
+	struct xlog	*log)
+{
+	xfs_mount_t	*mp;
+	xfs_agnumber_t	agno;
+	xfs_agi_t	*agi;
+	xfs_buf_t	*agibp;
+	xfs_agino_t	agino;
+	int		bucket;
+	int		error;
+	uint		mp_dmevmask;
+
+	mp = log->l_mp;
+
+	/*
+	 * Prevent any DMAPI event from being sent while in this function.
+	 */
+	mp_dmevmask = mp->m_dmevmask;
+	mp->m_dmevmask = 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		/*
+		 * Find the agi for this ag.
+		 */
+		error = xfs_read_agi(mp, NULL, agno, &agibp);
+		if (error) {
+			/*
+			 * AGI is b0rked. Don't process it.
+			 *
+			 * We should probably mark the filesystem as corrupt
+			 * after we've recovered all the ag's we can....
+			 */
+			continue;
+		}
+		/*
+		 * Unlock the buffer so that it can be acquired in the normal
+		 * course of the transaction to truncate and free each inode.
+		 * Because we are not racing with anyone else here for the AGI
+		 * buffer, we don't even need to hold it locked to read the
+		 * initial unlinked bucket entries out of the buffer. We keep
+		 * buffer reference though, so that it stays pinned in memory
+		 * while we need the buffer.
+		 */
+		agi = XFS_BUF_TO_AGI(agibp);
+		xfs_buf_unlock(agibp);
+
+		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+			while (agino != NULLAGINO) {
+				agino = xlog_recover_process_one_iunlink(mp,
+							agno, agino, bucket);
+			}
+		}
+		xfs_buf_rele(agibp);
+	}
+
+	mp->m_dmevmask = mp_dmevmask;
+}
+
+/*
+ * Upack the log buffer data and crc check it. If the check fails, issue a
+ * warning if and only if the CRC in the header is non-zero. This makes the
+ * check an advisory warning, and the zero CRC check will prevent failure
+ * warnings from being emitted when upgrading the kernel from one that does not
+ * add CRCs by default.
+ *
+ * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
+ * corruption failure
+ */
+STATIC int
+xlog_unpack_data_crc(
+	struct xlog_rec_header	*rhead,
+	xfs_caddr_t		dp,
+	struct xlog		*log)
+{
+	__le32			crc;
+
+	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+	if (crc != rhead->h_crc) {
+		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+			xfs_alert(log->l_mp,
+		"log record CRC mismatch: found 0x%x, expected 0x%x.",
+					le32_to_cpu(rhead->h_crc),
+					le32_to_cpu(crc));
+			xfs_hex_dump(dp, 32);
+		}
+
+		/*
+		 * If we've detected a log record corruption, then we can't
+		 * recover past this point. Abort recovery if we are enforcing
+		 * CRC protection by punting an error back up the stack.
+		 */
+		if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+			return -EFSCORRUPTED;
+	}
+
+	return 0;
+}
+
+STATIC int
+xlog_unpack_data(
+	struct xlog_rec_header	*rhead,
+	xfs_caddr_t		dp,
+	struct xlog		*log)
+{
+	int			i, j, k;
+	int			error;
+
+	error = xlog_unpack_data_crc(rhead, dp, log);
+	if (error)
+		return error;
+
+	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
+		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
+		dp += BBSIZE;
+	}
+
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
+		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
+			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+			dp += BBSIZE;
+		}
+	}
+
+	return 0;
+}
+
+STATIC int
+xlog_valid_rec_header(
+	struct xlog		*log,
+	struct xlog_rec_header	*rhead,
+	xfs_daddr_t		blkno)
+{
+	int			hlen;
+
+	if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
+		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
+				XFS_ERRLEVEL_LOW, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+	if (unlikely(
+	    (!rhead->h_version ||
+	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
+		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
+			__func__, be32_to_cpu(rhead->h_version));
+		return -EIO;
+	}
+
+	/* LR body must have data or it wouldn't have been written */
+	hlen = be32_to_cpu(rhead->h_len);
+	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
+		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
+				XFS_ERRLEVEL_LOW, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+	if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
+		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
+				XFS_ERRLEVEL_LOW, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+	return 0;
+}
+
+/*
+ * Read the log from tail to head and process the log records found.
+ * Handle the two cases where the tail and head are in the same cycle
+ * and where the active portion of the log wraps around the end of
+ * the physical log separately.  The pass parameter is passed through
+ * to the routines called to process the data and is not looked at
+ * here.
+ */
+STATIC int
+xlog_do_recovery_pass(
+	struct xlog		*log,
+	xfs_daddr_t		head_blk,
+	xfs_daddr_t		tail_blk,
+	int			pass)
+{
+	xlog_rec_header_t	*rhead;
+	xfs_daddr_t		blk_no;
+	xfs_caddr_t		offset;
+	xfs_buf_t		*hbp, *dbp;
+	int			error = 0, h_size;
+	int			bblks, split_bblks;
+	int			hblks, split_hblks, wrapped_hblks;
+	struct hlist_head	rhash[XLOG_RHASH_SIZE];
+
+	ASSERT(head_blk != tail_blk);
+
+	/*
+	 * Read the header of the tail block and get the iclog buffer size from
+	 * h_size.  Use this to tell how many sectors make up the log header.
+	 */
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		/*
+		 * When using variable length iclogs, read first sector of
+		 * iclog header and extract the header size from it.  Get a
+		 * new hbp that is the correct size.
+		 */
+		hbp = xlog_get_bp(log, 1);
+		if (!hbp)
+			return -ENOMEM;
+
+		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
+		if (error)
+			goto bread_err1;
+
+		rhead = (xlog_rec_header_t *)offset;
+		error = xlog_valid_rec_header(log, rhead, tail_blk);
+		if (error)
+			goto bread_err1;
+		h_size = be32_to_cpu(rhead->h_size);
+		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
+		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+			if (h_size % XLOG_HEADER_CYCLE_SIZE)
+				hblks++;
+			xlog_put_bp(hbp);
+			hbp = xlog_get_bp(log, hblks);
+		} else {
+			hblks = 1;
+		}
+	} else {
+		ASSERT(log->l_sectBBsize == 1);
+		hblks = 1;
+		hbp = xlog_get_bp(log, 1);
+		h_size = XLOG_BIG_RECORD_BSIZE;
+	}
+
+	if (!hbp)
+		return -ENOMEM;
+	dbp = xlog_get_bp(log, BTOBB(h_size));
+	if (!dbp) {
+		xlog_put_bp(hbp);
+		return -ENOMEM;
+	}
+
+	memset(rhash, 0, sizeof(rhash));
+	blk_no = tail_blk;
+	if (tail_blk > head_blk) {
+		/*
+		 * Perform recovery around the end of the physical log.
+		 * When the head is not on the same cycle number as the tail,
+		 * we can't do a sequential recovery.
+		 */
+		while (blk_no < log->l_logBBsize) {
+			/*
+			 * Check for header wrapping around physical end-of-log
+			 */
+			offset = hbp->b_addr;
+			split_hblks = 0;
+			wrapped_hblks = 0;
+			if (blk_no + hblks <= log->l_logBBsize) {
+				/* Read header in one read */
+				error = xlog_bread(log, blk_no, hblks, hbp,
+						   &offset);
+				if (error)
+					goto bread_err2;
+			} else {
+				/* This LR is split across physical log end */
+				if (blk_no != log->l_logBBsize) {
+					/* some data before physical log end */
+					ASSERT(blk_no <= INT_MAX);
+					split_hblks = log->l_logBBsize - (int)blk_no;
+					ASSERT(split_hblks > 0);
+					error = xlog_bread(log, blk_no,
+							   split_hblks, hbp,
+							   &offset);
+					if (error)
+						goto bread_err2;
+				}
+
+				/*
+				 * Note: this black magic still works with
+				 * large sector sizes (non-512) only because:
+				 * - we increased the buffer size originally
+				 *   by 1 sector giving us enough extra space
+				 *   for the second read;
+				 * - the log start is guaranteed to be sector
+				 *   aligned;
+				 * - we read the log end (LR header start)
+				 *   _first_, then the log start (LR header end)
+				 *   - order is important.
+				 */
+				wrapped_hblks = hblks - split_hblks;
+				error = xlog_bread_offset(log, 0,
+						wrapped_hblks, hbp,
+						offset + BBTOB(split_hblks));
+				if (error)
+					goto bread_err2;
+			}
+			rhead = (xlog_rec_header_t *)offset;
+			error = xlog_valid_rec_header(log, rhead,
+						split_hblks ? blk_no : 0);
+			if (error)
+				goto bread_err2;
+
+			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+			blk_no += hblks;
+
+			/* Read in data for log record */
+			if (blk_no + bblks <= log->l_logBBsize) {
+				error = xlog_bread(log, blk_no, bblks, dbp,
+						   &offset);
+				if (error)
+					goto bread_err2;
+			} else {
+				/* This log record is split across the
+				 * physical end of log */
+				offset = dbp->b_addr;
+				split_bblks = 0;
+				if (blk_no != log->l_logBBsize) {
+					/* some data is before the physical
+					 * end of log */
+					ASSERT(!wrapped_hblks);
+					ASSERT(blk_no <= INT_MAX);
+					split_bblks =
+						log->l_logBBsize - (int)blk_no;
+					ASSERT(split_bblks > 0);
+					error = xlog_bread(log, blk_no,
+							split_bblks, dbp,
+							&offset);
+					if (error)
+						goto bread_err2;
+				}
+
+				/*
+				 * Note: this black magic still works with
+				 * large sector sizes (non-512) only because:
+				 * - we increased the buffer size originally
+				 *   by 1 sector giving us enough extra space
+				 *   for the second read;
+				 * - the log start is guaranteed to be sector
+				 *   aligned;
+				 * - we read the log end (LR header start)
+				 *   _first_, then the log start (LR header end)
+				 *   - order is important.
+				 */
+				error = xlog_bread_offset(log, 0,
+						bblks - split_bblks, dbp,
+						offset + BBTOB(split_bblks));
+				if (error)
+					goto bread_err2;
+			}
+
+			error = xlog_unpack_data(rhead, offset, log);
+			if (error)
+				goto bread_err2;
+
+			error = xlog_recover_process_data(log, rhash,
+							rhead, offset, pass);
+			if (error)
+				goto bread_err2;
+			blk_no += bblks;
+		}
+
+		ASSERT(blk_no >= log->l_logBBsize);
+		blk_no -= log->l_logBBsize;
+	}
+
+	/* read first part of physical log */
+	while (blk_no < head_blk) {
+		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+		if (error)
+			goto bread_err2;
+
+		rhead = (xlog_rec_header_t *)offset;
+		error = xlog_valid_rec_header(log, rhead, blk_no);
+		if (error)
+			goto bread_err2;
+
+		/* blocks in data section */
+		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+		error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+				   &offset);
+		if (error)
+			goto bread_err2;
+
+		error = xlog_unpack_data(rhead, offset, log);
+		if (error)
+			goto bread_err2;
+
+		error = xlog_recover_process_data(log, rhash,
+						rhead, offset, pass);
+		if (error)
+			goto bread_err2;
+		blk_no += bblks + hblks;
+	}
+
+ bread_err2:
+	xlog_put_bp(dbp);
+ bread_err1:
+	xlog_put_bp(hbp);
+	return error;
+}
+
+/*
+ * Do the recovery of the log.  We actually do this in two phases.
+ * The two passes are necessary in order to implement the function
+ * of cancelling a record written into the log.  The first pass
+ * determines those things which have been cancelled, and the
+ * second pass replays log items normally except for those which
+ * have been cancelled.  The handling of the replay and cancellations
+ * takes place in the log item type specific routines.
+ *
+ * The table of items which have cancel records in the log is allocated
+ * and freed at this level, since only here do we know when all of
+ * the log recovery has been completed.
+ */
+STATIC int
+xlog_do_log_recovery(
+	struct xlog	*log,
+	xfs_daddr_t	head_blk,
+	xfs_daddr_t	tail_blk)
+{
+	int		error, i;
+
+	ASSERT(head_blk != tail_blk);
+
+	/*
+	 * First do a pass to find all of the cancelled buf log items.
+	 * Store them in the buf_cancel_table for use in the second pass.
+	 */
+	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
+						 sizeof(struct list_head),
+						 KM_SLEEP);
+	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
+	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+				      XLOG_RECOVER_PASS1);
+	if (error != 0) {
+		kmem_free(log->l_buf_cancel_table);
+		log->l_buf_cancel_table = NULL;
+		return error;
+	}
+	/*
+	 * Then do a second pass to actually recover the items in the log.
+	 * When it is complete free the table of buf cancel items.
+	 */
+	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+				      XLOG_RECOVER_PASS2);
+#ifdef DEBUG
+	if (!error) {
+		int	i;
+
+		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+			ASSERT(list_empty(&log->l_buf_cancel_table[i]));
+	}
+#endif	/* DEBUG */
+
+	kmem_free(log->l_buf_cancel_table);
+	log->l_buf_cancel_table = NULL;
+
+	return error;
+}
+
+/*
+ * Do the actual recovery
+ */
+STATIC int
+xlog_do_recover(
+	struct xlog	*log,
+	xfs_daddr_t	head_blk,
+	xfs_daddr_t	tail_blk)
+{
+	int		error;
+	xfs_buf_t	*bp;
+	xfs_sb_t	*sbp;
+
+	/*
+	 * First replay the images in the log.
+	 */
+	error = xlog_do_log_recovery(log, head_blk, tail_blk);
+	if (error)
+		return error;
+
+	/*
+	 * If IO errors happened during recovery, bail out.
+	 */
+	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		return -EIO;
+	}
+
+	/*
+	 * We now update the tail_lsn since much of the recovery has completed
+	 * and there may be space available to use.  If there were no extent
+	 * or iunlinks, we can free up the entire log and set the tail_lsn to
+	 * be the last_sync_lsn.  This was set in xlog_find_tail to be the
+	 * lsn of the last known good LR on disk.  If there are extent frees
+	 * or iunlinks they will have some entries in the AIL; so we look at
+	 * the AIL to determine how to set the tail_lsn.
+	 */
+	xlog_assign_tail_lsn(log->l_mp);
+
+	/*
+	 * Now that we've finished replaying all buffer and inode
+	 * updates, re-read in the superblock and reverify it.
+	 */
+	bp = xfs_getsb(log->l_mp, 0);
+	XFS_BUF_UNDONE(bp);
+	ASSERT(!(XFS_BUF_ISWRITE(bp)));
+	XFS_BUF_READ(bp);
+	XFS_BUF_UNASYNC(bp);
+	bp->b_ops = &xfs_sb_buf_ops;
+
+	error = xfs_buf_submit_wait(bp);
+	if (error) {
+		if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			ASSERT(0);
+		}
+		xfs_buf_relse(bp);
+		return error;
+	}
+
+	/* Convert superblock from on-disk format */
+	sbp = &log->l_mp->m_sb;
+	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
+	ASSERT(xfs_sb_good_version(sbp));
+	xfs_reinit_percpu_counters(log->l_mp);
+
+	xfs_buf_relse(bp);
+
+
+	xlog_recover_check_summary(log);
+
+	/* Normal transactions can now occur */
+	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+	return 0;
+}
+
+/*
+ * Perform recovery and re-initialize some log variables in xlog_find_tail.
+ *
+ * Return error or zero.
+ */
+int
+xlog_recover(
+	struct xlog	*log)
+{
+	xfs_daddr_t	head_blk, tail_blk;
+	int		error;
+
+	/* find the tail of the log */
+	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+		return error;
+
+	if (tail_blk != head_blk) {
+		/* There used to be a comment here:
+		 *
+		 * disallow recovery on read-only mounts.  note -- mount
+		 * checks for ENOSPC and turns it into an intelligent
+		 * error message.
+		 * ...but this is no longer true.  Now, unless you specify
+		 * NORECOVERY (in which case this function would never be
+		 * called), we just go ahead and recover.  We do this all
+		 * under the vfs layer, so we can get away with it unless
+		 * the device itself is read-only, in which case we fail.
+		 */
+		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
+			return error;
+		}
+
+		/*
+		 * Version 5 superblock log feature mask validation. We know the
+		 * log is dirty so check if there are any unknown log features
+		 * in what we need to recover. If there are unknown features
+		 * (e.g. unsupported transactions, then simply reject the
+		 * attempt at recovery before touching anything.
+		 */
+		if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
+					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+			xfs_warn(log->l_mp,
+"Superblock has unknown incompatible log features (0x%x) enabled.\n"
+"The log can not be fully and/or safely recovered by this kernel.\n"
+"Please recover the log on a kernel that supports the unknown features.",
+				(log->l_mp->m_sb.sb_features_log_incompat &
+					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+			return -EINVAL;
+		}
+
+		/*
+		 * Delay log recovery if the debug hook is set. This is debug
+		 * instrumention to coordinate simulation of I/O failures with
+		 * log recovery.
+		 */
+		if (xfs_globals.log_recovery_delay) {
+			xfs_notice(log->l_mp,
+				"Delaying log recovery for %d seconds.",
+				xfs_globals.log_recovery_delay);
+			msleep(xfs_globals.log_recovery_delay * 1000);
+		}
+
+		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
+				log->l_mp->m_logname ? log->l_mp->m_logname
+						     : "internal");
+
+		error = xlog_do_recover(log, head_blk, tail_blk);
+		log->l_flags |= XLOG_RECOVERY_NEEDED;
+	}
+	return error;
+}
+
+/*
+ * In the first part of recovery we replay inodes and buffers and build
+ * up the list of extent free items which need to be processed.  Here
+ * we process the extent free items and clean up the on disk unlinked
+ * inode lists.  This is separated from the first part of recovery so
+ * that the root and real-time bitmap inodes can be read in from disk in
+ * between the two stages.  This is necessary so that we can free space
+ * in the real-time portion of the file system.
+ */
+int
+xlog_recover_finish(
+	struct xlog	*log)
+{
+	/*
+	 * Now we're ready to do the transactions needed for the
+	 * rest of recovery.  Start with completing all the extent
+	 * free intent records and then process the unlinked inode
+	 * lists.  At this point, we essentially run in normal mode
+	 * except that we're still performing recovery actions
+	 * rather than accepting new requests.
+	 */
+	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
+		int	error;
+		error = xlog_recover_process_efis(log);
+		if (error) {
+			xfs_alert(log->l_mp, "Failed to recover EFIs");
+			return error;
+		}
+		/*
+		 * Sync the log to get all the EFIs out of the AIL.
+		 * This isn't absolutely necessary, but it helps in
+		 * case the unlink transactions would have problems
+		 * pushing the EFIs out of the way.
+		 */
+		xfs_log_force(log->l_mp, XFS_LOG_SYNC);
+
+		xlog_recover_process_iunlinks(log);
+
+		xlog_recover_check_summary(log);
+
+		xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
+				log->l_mp->m_logname ? log->l_mp->m_logname
+						     : "internal");
+		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
+	} else {
+		xfs_info(log->l_mp, "Ending clean mount");
+	}
+	return 0;
+}
+
+
+#if defined(DEBUG)
+/*
+ * Read all of the agf and agi counters and check that they
+ * are consistent with the superblock counters.
+ */
+void
+xlog_recover_check_summary(
+	struct xlog	*log)
+{
+	xfs_mount_t	*mp;
+	xfs_agf_t	*agfp;
+	xfs_buf_t	*agfbp;
+	xfs_buf_t	*agibp;
+	xfs_agnumber_t	agno;
+	__uint64_t	freeblks;
+	__uint64_t	itotal;
+	__uint64_t	ifree;
+	int		error;
+
+	mp = log->l_mp;
+
+	freeblks = 0LL;
+	itotal = 0LL;
+	ifree = 0LL;
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
+		if (error) {
+			xfs_alert(mp, "%s agf read failed agno %d error %d",
+						__func__, agno, error);
+		} else {
+			agfp = XFS_BUF_TO_AGF(agfbp);
+			freeblks += be32_to_cpu(agfp->agf_freeblks) +
+				    be32_to_cpu(agfp->agf_flcount);
+			xfs_buf_relse(agfbp);
+		}
+
+		error = xfs_read_agi(mp, NULL, agno, &agibp);
+		if (error) {
+			xfs_alert(mp, "%s agi read failed agno %d error %d",
+						__func__, agno, error);
+		} else {
+			struct xfs_agi	*agi = XFS_BUF_TO_AGI(agibp);
+
+			itotal += be32_to_cpu(agi->agi_count);
+			ifree += be32_to_cpu(agi->agi_freecount);
+			xfs_buf_relse(agibp);
+		}
+	}
+}
+#endif /* DEBUG */
diff --git a/kernel/fs/xfs/xfs_message.c b/kernel/fs/xfs/xfs_message.c
new file mode 100644
index 000000000..d8b67547a
--- /dev/null
+++ b/kernel/fs/xfs/xfs_message.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2011 Red Hat, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+
+/*
+ * XFS logging functions
+ */
+static void
+__xfs_printk(
+	const char		*level,
+	const struct xfs_mount	*mp,
+	struct va_format	*vaf)
+{
+	if (mp && mp->m_fsname) {
+		printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+		return;
+	}
+	printk("%sXFS: %pV\n", level, vaf);
+}
+
+#define define_xfs_printk_level(func, kern_level)		\
+void func(const struct xfs_mount *mp, const char *fmt, ...)	\
+{								\
+	struct va_format	vaf;				\
+	va_list			args;				\
+								\
+	va_start(args, fmt);					\
+								\
+	vaf.fmt = fmt;						\
+	vaf.va = &args;						\
+								\
+	__xfs_printk(kern_level, mp, &vaf);			\
+	va_end(args);						\
+}								\
+
+define_xfs_printk_level(xfs_emerg, KERN_EMERG);
+define_xfs_printk_level(xfs_alert, KERN_ALERT);
+define_xfs_printk_level(xfs_crit, KERN_CRIT);
+define_xfs_printk_level(xfs_err, KERN_ERR);
+define_xfs_printk_level(xfs_warn, KERN_WARNING);
+define_xfs_printk_level(xfs_notice, KERN_NOTICE);
+define_xfs_printk_level(xfs_info, KERN_INFO);
+#ifdef DEBUG
+define_xfs_printk_level(xfs_debug, KERN_DEBUG);
+#endif
+
+void
+xfs_alert_tag(
+	const struct xfs_mount	*mp,
+	int			panic_tag,
+	const char		*fmt, ...)
+{
+	struct va_format	vaf;
+	va_list			args;
+	int			do_panic = 0;
+
+	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+		xfs_alert(mp, "Transforming an alert into a BUG.");
+		do_panic = 1;
+	}
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	__xfs_printk(KERN_ALERT, mp, &vaf);
+	va_end(args);
+
+	BUG_ON(do_panic);
+}
+
+void
+asswarn(char *expr, char *file, int line)
+{
+	xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d",
+		expr, file, line);
+	WARN_ON(1);
+}
+
+void
+assfail(char *expr, char *file, int line)
+{
+	xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
+		expr, file, line);
+	BUG();
+}
+
+void
+xfs_hex_dump(void *p, int length)
+{
+	print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
+}
diff --git a/kernel/fs/xfs/xfs_message.h b/kernel/fs/xfs/xfs_message.h
new file mode 100644
index 000000000..854011557
--- /dev/null
+++ b/kernel/fs/xfs/xfs_message.h
@@ -0,0 +1,64 @@
+#ifndef __XFS_MESSAGE_H
+#define __XFS_MESSAGE_H 1
+
+struct xfs_mount;
+
+extern __printf(2, 3)
+void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(3, 4)
+void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
+
+#ifdef DEBUG
+extern __printf(2, 3)
+void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
+#else
+static inline __printf(2, 3)
+void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+{
+}
+#endif
+
+#define xfs_printk_ratelimited(func, dev, fmt, ...)		\
+do {									\
+	static DEFINE_RATELIMIT_STATE(_rs,				\
+				      DEFAULT_RATELIMIT_INTERVAL,	\
+				      DEFAULT_RATELIMIT_BURST);		\
+	if (__ratelimit(&_rs))						\
+		func(dev, fmt, ##__VA_ARGS__);			\
+} while (0)
+
+#define xfs_emerg_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
+#define xfs_alert_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__)
+#define xfs_crit_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__)
+#define xfs_err_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__)
+#define xfs_debug_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+
+extern void assfail(char *expr, char *f, int l);
+extern void asswarn(char *expr, char *f, int l);
+
+extern void xfs_hex_dump(void *p, int length);
+
+#endif	/* __XFS_MESSAGE_H */
diff --git a/kernel/fs/xfs/xfs_mount.c b/kernel/fs/xfs/xfs_mount.c
new file mode 100644
index 000000000..6f23fbdfb
--- /dev/null
+++ b/kernel/fs/xfs/xfs_mount.c
@@ -0,0 +1,1283 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_fsops.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_sysfs.h"
+
+
+static DEFINE_MUTEX(xfs_uuid_table_mutex);
+static int xfs_uuid_table_size;
+static uuid_t *xfs_uuid_table;
+
+/*
+ * See if the UUID is unique among mounted XFS filesystems.
+ * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
+ */
+STATIC int
+xfs_uuid_mount(
+	struct xfs_mount	*mp)
+{
+	uuid_t			*uuid = &mp->m_sb.sb_uuid;
+	int			hole, i;
+
+	if (mp->m_flags & XFS_MOUNT_NOUUID)
+		return 0;
+
+	if (uuid_is_nil(uuid)) {
+		xfs_warn(mp, "Filesystem has nil UUID - can't mount");
+		return -EINVAL;
+	}
+
+	mutex_lock(&xfs_uuid_table_mutex);
+	for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
+		if (uuid_is_nil(&xfs_uuid_table[i])) {
+			hole = i;
+			continue;
+		}
+		if (uuid_equal(uuid, &xfs_uuid_table[i]))
+			goto out_duplicate;
+	}
+
+	if (hole < 0) {
+		xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
+			xfs_uuid_table_size  * sizeof(*xfs_uuid_table),
+			KM_SLEEP);
+		hole = xfs_uuid_table_size++;
+	}
+	xfs_uuid_table[hole] = *uuid;
+	mutex_unlock(&xfs_uuid_table_mutex);
+
+	return 0;
+
+ out_duplicate:
+	mutex_unlock(&xfs_uuid_table_mutex);
+	xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
+	return -EINVAL;
+}
+
+STATIC void
+xfs_uuid_unmount(
+	struct xfs_mount	*mp)
+{
+	uuid_t			*uuid = &mp->m_sb.sb_uuid;
+	int			i;
+
+	if (mp->m_flags & XFS_MOUNT_NOUUID)
+		return;
+
+	mutex_lock(&xfs_uuid_table_mutex);
+	for (i = 0; i < xfs_uuid_table_size; i++) {
+		if (uuid_is_nil(&xfs_uuid_table[i]))
+			continue;
+		if (!uuid_equal(uuid, &xfs_uuid_table[i]))
+			continue;
+		memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
+		break;
+	}
+	ASSERT(i < xfs_uuid_table_size);
+	mutex_unlock(&xfs_uuid_table_mutex);
+}
+
+
+STATIC void
+__xfs_free_perag(
+	struct rcu_head	*head)
+{
+	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+	ASSERT(atomic_read(&pag->pag_ref) == 0);
+	kmem_free(pag);
+}
+
+/*
+ * Free up the per-ag resources associated with the mount structure.
+ */
+STATIC void
+xfs_free_perag(
+	xfs_mount_t	*mp)
+{
+	xfs_agnumber_t	agno;
+	struct xfs_perag *pag;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		spin_lock(&mp->m_perag_lock);
+		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		spin_unlock(&mp->m_perag_lock);
+		ASSERT(pag);
+		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		call_rcu(&pag->rcu_head, __xfs_free_perag);
+	}
+}
+
+/*
+ * Check size of device based on the (data/realtime) block count.
+ * Note: this check is used by the growfs code as well as mount.
+ */
+int
+xfs_sb_validate_fsb_count(
+	xfs_sb_t	*sbp,
+	__uint64_t	nblocks)
+{
+	ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
+	ASSERT(sbp->sb_blocklog >= BBSHIFT);
+
+	/* Limited by ULONG_MAX of page cache index */
+	if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+		return -EFBIG;
+	return 0;
+}
+
+int
+xfs_initialize_perag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agcount,
+	xfs_agnumber_t	*maxagi)
+{
+	xfs_agnumber_t	index;
+	xfs_agnumber_t	first_initialised = 0;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
+	xfs_ino_t	ino;
+	xfs_sb_t	*sbp = &mp->m_sb;
+	int		error = -ENOMEM;
+
+	/*
+	 * Walk the current per-ag tree so we don't try to initialise AGs
+	 * that already exist (growfs case). Allocate and insert all the
+	 * AGs we don't find ready for initialisation.
+	 */
+	for (index = 0; index < agcount; index++) {
+		pag = xfs_perag_get(mp, index);
+		if (pag) {
+			xfs_perag_put(pag);
+			continue;
+		}
+		if (!first_initialised)
+			first_initialised = index;
+
+		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+		if (!pag)
+			goto out_unwind;
+		pag->pag_agno = index;
+		pag->pag_mount = mp;
+		spin_lock_init(&pag->pag_ici_lock);
+		mutex_init(&pag->pag_ici_reclaim_lock);
+		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+		spin_lock_init(&pag->pag_buf_lock);
+		pag->pag_buf_tree = RB_ROOT;
+
+		if (radix_tree_preload(GFP_NOFS))
+			goto out_unwind;
+
+		spin_lock(&mp->m_perag_lock);
+		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+			BUG();
+			spin_unlock(&mp->m_perag_lock);
+			radix_tree_preload_end();
+			error = -EEXIST;
+			goto out_unwind;
+		}
+		spin_unlock(&mp->m_perag_lock);
+		radix_tree_preload_end();
+	}
+
+	/*
+	 * If we mount with the inode64 option, or no inode overflows
+	 * the legacy 32-bit address space clear the inode32 option.
+	 */
+	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
+		mp->m_flags |= XFS_MOUNT_32BITINODES;
+	else
+		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
+
+	if (mp->m_flags & XFS_MOUNT_32BITINODES)
+		index = xfs_set_inode32(mp, agcount);
+	else
+		index = xfs_set_inode64(mp, agcount);
+
+	if (maxagi)
+		*maxagi = index;
+	return 0;
+
+out_unwind:
+	kmem_free(pag);
+	for (; index > first_initialised; index--) {
+		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		kmem_free(pag);
+	}
+	return error;
+}
+
+/*
+ * xfs_readsb
+ *
+ * Does the initial read of the superblock.
+ */
+int
+xfs_readsb(
+	struct xfs_mount *mp,
+	int		flags)
+{
+	unsigned int	sector_size;
+	struct xfs_buf	*bp;
+	struct xfs_sb	*sbp = &mp->m_sb;
+	int		error;
+	int		loud = !(flags & XFS_MFSI_QUIET);
+	const struct xfs_buf_ops *buf_ops;
+
+	ASSERT(mp->m_sb_bp == NULL);
+	ASSERT(mp->m_ddev_targp != NULL);
+
+	/*
+	 * For the initial read, we must guess at the sector
+	 * size based on the block device.  It's enough to
+	 * get the sb_sectsize out of the superblock and
+	 * then reread with the proper length.
+	 * We don't verify it yet, because it may not be complete.
+	 */
+	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+	buf_ops = NULL;
+
+	/*
+	 * Allocate a (locked) buffer to hold the superblock.
+	 * This will be kept around at all times to optimize
+	 * access to the superblock.
+	 */
+reread:
+	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
+				   BTOBB(sector_size), 0, &bp, buf_ops);
+	if (error) {
+		if (loud)
+			xfs_warn(mp, "SB validate failed with error %d.", error);
+		/* bad CRC means corrupted metadata */
+		if (error == -EFSBADCRC)
+			error = -EFSCORRUPTED;
+		return error;
+	}
+
+	/*
+	 * Initialize the mount structure from the superblock.
+	 */
+	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+
+	/*
+	 * If we haven't validated the superblock, do so now before we try
+	 * to check the sector size and reread the superblock appropriately.
+	 */
+	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+		if (loud)
+			xfs_warn(mp, "Invalid superblock magic number");
+		error = -EINVAL;
+		goto release_buf;
+	}
+
+	/*
+	 * We must be able to do sector-sized and sector-aligned IO.
+	 */
+	if (sector_size > sbp->sb_sectsize) {
+		if (loud)
+			xfs_warn(mp, "device supports %u byte sectors (not %u)",
+				sector_size, sbp->sb_sectsize);
+		error = -ENOSYS;
+		goto release_buf;
+	}
+
+	if (buf_ops == NULL) {
+		/*
+		 * Re-read the superblock so the buffer is correctly sized,
+		 * and properly verified.
+		 */
+		xfs_buf_relse(bp);
+		sector_size = sbp->sb_sectsize;
+		buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
+		goto reread;
+	}
+
+	xfs_reinit_percpu_counters(mp);
+
+	/* no need to be quiet anymore, so reset the buf ops */
+	bp->b_ops = &xfs_sb_buf_ops;
+
+	mp->m_sb_bp = bp;
+	xfs_buf_unlock(bp);
+	return 0;
+
+release_buf:
+	xfs_buf_relse(bp);
+	return error;
+}
+
+/*
+ * Update alignment values based on mount options and sb values
+ */
+STATIC int
+xfs_update_alignment(xfs_mount_t *mp)
+{
+	xfs_sb_t	*sbp = &(mp->m_sb);
+
+	if (mp->m_dalign) {
+		/*
+		 * If stripe unit and stripe width are not multiples
+		 * of the fs blocksize turn off alignment.
+		 */
+		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+			xfs_warn(mp,
+		"alignment check failed: sunit/swidth vs. blocksize(%d)",
+				sbp->sb_blocksize);
+			return -EINVAL;
+		} else {
+			/*
+			 * Convert the stripe unit and width to FSBs.
+			 */
+			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
+				xfs_warn(mp,
+			"alignment check failed: sunit/swidth vs. agsize(%d)",
+					 sbp->sb_agblocks);
+				return -EINVAL;
+			} else if (mp->m_dalign) {
+				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+			} else {
+				xfs_warn(mp,
+			"alignment check failed: sunit(%d) less than bsize(%d)",
+					 mp->m_dalign, sbp->sb_blocksize);
+				return -EINVAL;
+			}
+		}
+
+		/*
+		 * Update superblock with new values
+		 * and log changes
+		 */
+		if (xfs_sb_version_hasdalign(sbp)) {
+			if (sbp->sb_unit != mp->m_dalign) {
+				sbp->sb_unit = mp->m_dalign;
+				mp->m_update_sb = true;
+			}
+			if (sbp->sb_width != mp->m_swidth) {
+				sbp->sb_width = mp->m_swidth;
+				mp->m_update_sb = true;
+			}
+		} else {
+			xfs_warn(mp,
+	"cannot change alignment: superblock does not support data alignment");
+			return -EINVAL;
+		}
+	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
+		    xfs_sb_version_hasdalign(&mp->m_sb)) {
+			mp->m_dalign = sbp->sb_unit;
+			mp->m_swidth = sbp->sb_width;
+	}
+
+	return 0;
+}
+
+/*
+ * Set the maximum inode count for this filesystem
+ */
+STATIC void
+xfs_set_maxicount(xfs_mount_t *mp)
+{
+	xfs_sb_t	*sbp = &(mp->m_sb);
+	__uint64_t	icount;
+
+	if (sbp->sb_imax_pct) {
+		/*
+		 * Make sure the maximum inode count is a multiple
+		 * of the units we allocate inodes in.
+		 */
+		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+		do_div(icount, 100);
+		do_div(icount, mp->m_ialloc_blks);
+		mp->m_maxicount = (icount * mp->m_ialloc_blks)  <<
+				   sbp->sb_inopblog;
+	} else {
+		mp->m_maxicount = 0;
+	}
+}
+
+/*
+ * Set the default minimum read and write sizes unless
+ * already specified in a mount option.
+ * We use smaller I/O sizes when the file system
+ * is being used for NFS service (wsync mount option).
+ */
+STATIC void
+xfs_set_rw_sizes(xfs_mount_t *mp)
+{
+	xfs_sb_t	*sbp = &(mp->m_sb);
+	int		readio_log, writeio_log;
+
+	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+		if (mp->m_flags & XFS_MOUNT_WSYNC) {
+			readio_log = XFS_WSYNC_READIO_LOG;
+			writeio_log = XFS_WSYNC_WRITEIO_LOG;
+		} else {
+			readio_log = XFS_READIO_LOG_LARGE;
+			writeio_log = XFS_WRITEIO_LOG_LARGE;
+		}
+	} else {
+		readio_log = mp->m_readio_log;
+		writeio_log = mp->m_writeio_log;
+	}
+
+	if (sbp->sb_blocklog > readio_log) {
+		mp->m_readio_log = sbp->sb_blocklog;
+	} else {
+		mp->m_readio_log = readio_log;
+	}
+	mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
+	if (sbp->sb_blocklog > writeio_log) {
+		mp->m_writeio_log = sbp->sb_blocklog;
+	} else {
+		mp->m_writeio_log = writeio_log;
+	}
+	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
+}
+
+/*
+ * precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+	struct xfs_mount	*mp)
+{
+	int i;
+
+	for (i = 0; i < XFS_LOWSP_MAX; i++) {
+		__uint64_t space = mp->m_sb.sb_dblocks;
+
+		do_div(space, 100);
+		mp->m_low_space[i] = space * (i + 1);
+	}
+}
+
+
+/*
+ * Set whether we're using inode alignment.
+ */
+STATIC void
+xfs_set_inoalignment(xfs_mount_t *mp)
+{
+	if (xfs_sb_version_hasalign(&mp->m_sb) &&
+	    mp->m_sb.sb_inoalignmt >=
+	    XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
+		mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
+	else
+		mp->m_inoalign_mask = 0;
+	/*
+	 * If we are using stripe alignment, check whether
+	 * the stripe unit is a multiple of the inode alignment
+	 */
+	if (mp->m_dalign && mp->m_inoalign_mask &&
+	    !(mp->m_dalign & mp->m_inoalign_mask))
+		mp->m_sinoalign = mp->m_dalign;
+	else
+		mp->m_sinoalign = 0;
+}
+
+/*
+ * Check that the data (and log if separate) is an ok size.
+ */
+STATIC int
+xfs_check_sizes(
+	struct xfs_mount *mp)
+{
+	struct xfs_buf	*bp;
+	xfs_daddr_t	d;
+	int		error;
+
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
+		xfs_warn(mp, "filesystem size mismatch detected");
+		return -EFBIG;
+	}
+	error = xfs_buf_read_uncached(mp->m_ddev_targp,
+					d - XFS_FSS_TO_BB(mp, 1),
+					XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
+		xfs_warn(mp, "last sector read failed");
+		return error;
+	}
+	xfs_buf_relse(bp);
+
+	if (mp->m_logdev_targp == mp->m_ddev_targp)
+		return 0;
+
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
+		xfs_warn(mp, "log size mismatch detected");
+		return -EFBIG;
+	}
+	error = xfs_buf_read_uncached(mp->m_logdev_targp,
+					d - XFS_FSB_TO_BB(mp, 1),
+					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
+		xfs_warn(mp, "log device read failed");
+		return error;
+	}
+	xfs_buf_relse(bp);
+	return 0;
+}
+
+/*
+ * Clear the quotaflags in memory and in the superblock.
+ */
+int
+xfs_mount_reset_sbqflags(
+	struct xfs_mount	*mp)
+{
+	mp->m_qflags = 0;
+
+	/* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
+	if (mp->m_sb.sb_qflags == 0)
+		return 0;
+	spin_lock(&mp->m_sb_lock);
+	mp->m_sb.sb_qflags = 0;
+	spin_unlock(&mp->m_sb_lock);
+
+	if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
+		return 0;
+
+	return xfs_sync_sb(mp, false);
+}
+
+__uint64_t
+xfs_default_resblks(xfs_mount_t *mp)
+{
+	__uint64_t resblks;
+
+	/*
+	 * We default to 5% or 8192 fsbs of space reserved, whichever is
+	 * smaller.  This is intended to cover concurrent allocation
+	 * transactions when we initially hit enospc. These each require a 4
+	 * block reservation. Hence by default we cover roughly 2000 concurrent
+	 * allocation reservations.
+	 */
+	resblks = mp->m_sb.sb_dblocks;
+	do_div(resblks, 20);
+	resblks = min_t(__uint64_t, resblks, 8192);
+	return resblks;
+}
+
+/*
+ * This function does the following on an initial mount of a file system:
+ *	- reads the superblock from disk and init the mount struct
+ *	- if we're a 32-bit kernel, do a size check on the superblock
+ *		so we don't mount terabyte filesystems
+ *	- init mount struct realtime fields
+ *	- allocate inode hash table for fs
+ *	- init directory manager
+ *	- perform recovery and init the log manager
+ */
+int
+xfs_mountfs(
+	xfs_mount_t	*mp)
+{
+	xfs_sb_t	*sbp = &(mp->m_sb);
+	xfs_inode_t	*rip;
+	__uint64_t	resblks;
+	uint		quotamount = 0;
+	uint		quotaflags = 0;
+	int		error = 0;
+
+	xfs_sb_mount_common(mp, sbp);
+
+	/*
+	 * Check for a mismatched features2 values.  Older kernels read & wrote
+	 * into the wrong sb offset for sb_features2 on some platforms due to
+	 * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
+	 * which made older superblock reading/writing routines swap it as a
+	 * 64-bit value.
+	 *
+	 * For backwards compatibility, we make both slots equal.
+	 *
+	 * If we detect a mismatched field, we OR the set bits into the existing
+	 * features2 field in case it has already been modified; we don't want
+	 * to lose any features.  We then update the bad location with the ORed
+	 * value so that older kernels will see any features2 flags. The
+	 * superblock writeback code ensures the new sb_features2 is copied to
+	 * sb_bad_features2 before it is logged or written to disk.
+	 */
+	if (xfs_sb_has_mismatched_features2(sbp)) {
+		xfs_warn(mp, "correcting sb_features alignment problem");
+		sbp->sb_features2 |= sbp->sb_bad_features2;
+		mp->m_update_sb = true;
+
+		/*
+		 * Re-check for ATTR2 in case it was found in bad_features2
+		 * slot.
+		 */
+		if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+		   !(mp->m_flags & XFS_MOUNT_NOATTR2))
+			mp->m_flags |= XFS_MOUNT_ATTR2;
+	}
+
+	if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+	   (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+		xfs_sb_version_removeattr2(&mp->m_sb);
+		mp->m_update_sb = true;
+
+		/* update sb_versionnum for the clearing of the morebits */
+		if (!sbp->sb_features2)
+			mp->m_update_sb = true;
+	}
+
+	/* always use v2 inodes by default now */
+	if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
+		mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+		mp->m_update_sb = true;
+	}
+
+	/*
+	 * Check if sb_agblocks is aligned at stripe boundary
+	 * If sb_agblocks is NOT aligned turn off m_dalign since
+	 * allocator alignment is within an ag, therefore ag has
+	 * to be aligned at stripe boundary.
+	 */
+	error = xfs_update_alignment(mp);
+	if (error)
+		goto out;
+
+	xfs_alloc_compute_maxlevels(mp);
+	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
+	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
+	xfs_ialloc_compute_maxlevels(mp);
+
+	xfs_set_maxicount(mp);
+
+	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
+	if (error)
+		goto out;
+
+	error = xfs_uuid_mount(mp);
+	if (error)
+		goto out_remove_sysfs;
+
+	/*
+	 * Set the minimum read and write sizes
+	 */
+	xfs_set_rw_sizes(mp);
+
+	/* set the low space thresholds for dynamic preallocation */
+	xfs_set_low_space_thresholds(mp);
+
+	/*
+	 * Set the inode cluster size.
+	 * This may still be overridden by the file system
+	 * block size if it is larger than the chosen cluster size.
+	 *
+	 * For v5 filesystems, scale the cluster size with the inode size to
+	 * keep a constant ratio of inode per cluster buffer, but only if mkfs
+	 * has set the inode alignment value appropriately for larger cluster
+	 * sizes.
+	 */
+	mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		int	new_size = mp->m_inode_cluster_size;
+
+		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+			mp->m_inode_cluster_size = new_size;
+	}
+
+	/*
+	 * Set inode alignment fields
+	 */
+	xfs_set_inoalignment(mp);
+
+	/*
+	 * Check that the data (and log if separate) is an ok size.
+	 */
+	error = xfs_check_sizes(mp);
+	if (error)
+		goto out_remove_uuid;
+
+	/*
+	 * Initialize realtime fields in the mount structure
+	 */
+	error = xfs_rtmount_init(mp);
+	if (error) {
+		xfs_warn(mp, "RT mount failed");
+		goto out_remove_uuid;
+	}
+
+	/*
+	 *  Copies the low order bits of the timestamp and the randomly
+	 *  set "sequence" number out of a UUID.
+	 */
+	uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
+
+	mp->m_dmevmask = 0;	/* not persistent; set after each mount */
+
+	error = xfs_da_mount(mp);
+	if (error) {
+		xfs_warn(mp, "Failed dir/attr init: %d", error);
+		goto out_remove_uuid;
+	}
+
+	/*
+	 * Initialize the precomputed transaction reservations values.
+	 */
+	xfs_trans_init(mp);
+
+	/*
+	 * Allocate and initialize the per-ag data.
+	 */
+	spin_lock_init(&mp->m_perag_lock);
+	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	if (error) {
+		xfs_warn(mp, "Failed per-ag init: %d", error);
+		goto out_free_dir;
+	}
+
+	if (!sbp->sb_logblocks) {
+		xfs_warn(mp, "no log defined");
+		XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
+		error = -EFSCORRUPTED;
+		goto out_free_perag;
+	}
+
+	/*
+	 * log's mount-time initialization. Perform 1st part recovery if needed
+	 */
+	error = xfs_log_mount(mp, mp->m_logdev_targp,
+			      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+			      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+	if (error) {
+		xfs_warn(mp, "log mount failed");
+		goto out_fail_wait;
+	}
+
+	/*
+	 * Now the log is mounted, we know if it was an unclean shutdown or
+	 * not. If it was, with the first phase of recovery has completed, we
+	 * have consistent AG blocks on disk. We have not recovered EFIs yet,
+	 * but they are recovered transactionally in the second recovery phase
+	 * later.
+	 *
+	 * Hence we can safely re-initialise incore superblock counters from
+	 * the per-ag data. These may not be correct if the filesystem was not
+	 * cleanly unmounted, so we need to wait for recovery to finish before
+	 * doing this.
+	 *
+	 * If the filesystem was cleanly unmounted, then we can trust the
+	 * values in the superblock to be correct and we don't need to do
+	 * anything here.
+	 *
+	 * If we are currently making the filesystem, the initialisation will
+	 * fail as the perag data is in an undefined state.
+	 */
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+	    !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
+	     !mp->m_sb.sb_inprogress) {
+		error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
+		if (error)
+			goto out_log_dealloc;
+	}
+
+	/*
+	 * Get and sanity-check the root inode.
+	 * Save the pointer to it in the mount structure.
+	 */
+	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
+	if (error) {
+		xfs_warn(mp, "failed to read root inode");
+		goto out_log_dealloc;
+	}
+
+	ASSERT(rip != NULL);
+
+	if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
+		xfs_warn(mp, "corrupted root inode %llu: not a directory",
+			(unsigned long long)rip->i_ino);
+		xfs_iunlock(rip, XFS_ILOCK_EXCL);
+		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
+				 mp);
+		error = -EFSCORRUPTED;
+		goto out_rele_rip;
+	}
+	mp->m_rootip = rip;	/* save it */
+
+	xfs_iunlock(rip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Initialize realtime inode pointers in the mount structure
+	 */
+	error = xfs_rtmount_inodes(mp);
+	if (error) {
+		/*
+		 * Free up the root inode.
+		 */
+		xfs_warn(mp, "failed to read RT inodes");
+		goto out_rele_rip;
+	}
+
+	/*
+	 * If this is a read-only mount defer the superblock updates until
+	 * the next remount into writeable mode.  Otherwise we would never
+	 * perform the update e.g. for the root filesystem.
+	 */
+	if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		error = xfs_sync_sb(mp, false);
+		if (error) {
+			xfs_warn(mp, "failed to write sb changes");
+			goto out_rtunmount;
+		}
+	}
+
+	/*
+	 * Initialise the XFS quota management subsystem for this mount
+	 */
+	if (XFS_IS_QUOTA_RUNNING(mp)) {
+		error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
+		if (error)
+			goto out_rtunmount;
+	} else {
+		ASSERT(!XFS_IS_QUOTA_ON(mp));
+
+		/*
+		 * If a file system had quotas running earlier, but decided to
+		 * mount without -o uquota/pquota/gquota options, revoke the
+		 * quotachecked license.
+		 */
+		if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
+			xfs_notice(mp, "resetting quota flags");
+			error = xfs_mount_reset_sbqflags(mp);
+			if (error)
+				goto out_rtunmount;
+		}
+	}
+
+	/*
+	 * Finish recovering the file system.  This part needed to be
+	 * delayed until after the root and real-time bitmap inodes
+	 * were consistently read in.
+	 */
+	error = xfs_log_mount_finish(mp);
+	if (error) {
+		xfs_warn(mp, "log mount finish failed");
+		goto out_rtunmount;
+	}
+
+	/*
+	 * Complete the quota initialisation, post-log-replay component.
+	 */
+	if (quotamount) {
+		ASSERT(mp->m_qflags == 0);
+		mp->m_qflags = quotaflags;
+
+		xfs_qm_mount_quotas(mp);
+	}
+
+	/*
+	 * Now we are mounted, reserve a small amount of unused space for
+	 * privileged transactions. This is needed so that transaction
+	 * space required for critical operations can dip into this pool
+	 * when at ENOSPC. This is needed for operations like create with
+	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
+	 * are not allowed to use this reserved space.
+	 *
+	 * This may drive us straight to ENOSPC on mount, but that implies
+	 * we were already there on the last unmount. Warn if this occurs.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		resblks = xfs_default_resblks(mp);
+		error = xfs_reserve_blocks(mp, &resblks, NULL);
+		if (error)
+			xfs_warn(mp,
+	"Unable to allocate reserve blocks. Continuing without reserve pool.");
+	}
+
+	return 0;
+
+ out_rtunmount:
+	xfs_rtunmount_inodes(mp);
+ out_rele_rip:
+	IRELE(rip);
+ out_log_dealloc:
+	xfs_log_unmount(mp);
+ out_fail_wait:
+	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
+		xfs_wait_buftarg(mp->m_logdev_targp);
+	xfs_wait_buftarg(mp->m_ddev_targp);
+ out_free_perag:
+	xfs_free_perag(mp);
+ out_free_dir:
+	xfs_da_unmount(mp);
+ out_remove_uuid:
+	xfs_uuid_unmount(mp);
+ out_remove_sysfs:
+	xfs_sysfs_del(&mp->m_kobj);
+ out:
+	return error;
+}
+
+/*
+ * This flushes out the inodes,dquots and the superblock, unmounts the
+ * log and makes sure that incore structures are freed.
+ */
+void
+xfs_unmountfs(
+	struct xfs_mount	*mp)
+{
+	__uint64_t		resblks;
+	int			error;
+
+	cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
+	xfs_qm_unmount_quotas(mp);
+	xfs_rtunmount_inodes(mp);
+	IRELE(mp->m_rootip);
+
+	/*
+	 * We can potentially deadlock here if we have an inode cluster
+	 * that has been freed has its buffer still pinned in memory because
+	 * the transaction is still sitting in a iclog. The stale inodes
+	 * on that buffer will have their flush locks held until the
+	 * transaction hits the disk and the callbacks run. the inode
+	 * flush takes the flush lock unconditionally and with nothing to
+	 * push out the iclog we will never get that unlocked. hence we
+	 * need to force the log first.
+	 */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * Flush all pending changes from the AIL.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+
+	/*
+	 * And reclaim all inodes.  At this point there should be no dirty
+	 * inodes and none should be pinned or locked, but use synchronous
+	 * reclaim just to be sure. We can stop background inode reclaim
+	 * here as well if it is still running.
+	 */
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+	xfs_qm_unmount(mp);
+
+	/*
+	 * Unreserve any blocks we have so that when we unmount we don't account
+	 * the reserved free space as used. This is really only necessary for
+	 * lazy superblock counting because it trusts the incore superblock
+	 * counters to be absolutely correct on clean unmount.
+	 *
+	 * We don't bother correcting this elsewhere for lazy superblock
+	 * counting because on mount of an unclean filesystem we reconstruct the
+	 * correct counter value and this is irrelevant.
+	 *
+	 * For non-lazy counter filesystems, this doesn't matter at all because
+	 * we only every apply deltas to the superblock and hence the incore
+	 * value does not matter....
+	 */
+	resblks = 0;
+	error = xfs_reserve_blocks(mp, &resblks, NULL);
+	if (error)
+		xfs_warn(mp, "Unable to free reserved block pool. "
+				"Freespace may not be correct on next mount.");
+
+	error = xfs_log_sbcount(mp);
+	if (error)
+		xfs_warn(mp, "Unable to update superblock counters. "
+				"Freespace may not be correct on next mount.");
+
+	xfs_log_unmount(mp);
+	xfs_da_unmount(mp);
+	xfs_uuid_unmount(mp);
+
+#if defined(DEBUG)
+	xfs_errortag_clearall(mp, 0);
+#endif
+	xfs_free_perag(mp);
+
+	xfs_sysfs_del(&mp->m_kobj);
+}
+
+/*
+ * Determine whether modifications can proceed. The caller specifies the minimum
+ * freeze level for which modifications should not be allowed. This allows
+ * certain operations to proceed while the freeze sequence is in progress, if
+ * necessary.
+ */
+bool
+xfs_fs_writable(
+	struct xfs_mount	*mp,
+	int			level)
+{
+	ASSERT(level > SB_UNFROZEN);
+	if ((mp->m_super->s_writers.frozen >= level) ||
+	    XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
+		return false;
+
+	return true;
+}
+
+/*
+ * xfs_log_sbcount
+ *
+ * Sync the superblock counters to disk.
+ *
+ * Note this code can be called during the process of freezing, so we use the
+ * transaction allocator that does not block when the transaction subsystem is
+ * in its frozen state.
+ */
+int
+xfs_log_sbcount(xfs_mount_t *mp)
+{
+	/* allow this to proceed during the freeze sequence... */
+	if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
+		return 0;
+
+	/*
+	 * we don't need to do this if we are updating the superblock
+	 * counters on every modification.
+	 */
+	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
+		return 0;
+
+	return xfs_sync_sb(mp, true);
+}
+
+/*
+ * Deltas for the inode count are +/-64, hence we use a large batch size
+ * of 128 so we don't need to take the counter lock on every update.
+ */
+#define XFS_ICOUNT_BATCH	128
+int
+xfs_mod_icount(
+	struct xfs_mount	*mp,
+	int64_t			delta)
+{
+	__percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
+	if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
+		ASSERT(0);
+		percpu_counter_add(&mp->m_icount, -delta);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int
+xfs_mod_ifree(
+	struct xfs_mount	*mp,
+	int64_t			delta)
+{
+	percpu_counter_add(&mp->m_ifree, delta);
+	if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
+		ASSERT(0);
+		percpu_counter_add(&mp->m_ifree, -delta);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Deltas for the block count can vary from 1 to very large, but lock contention
+ * only occurs on frequent small block count updates such as in the delayed
+ * allocation path for buffered writes (page a time updates). Hence we set
+ * a large batch count (1024) to minimise global counter updates except when
+ * we get near to ENOSPC and we have to be very accurate with our updates.
+ */
+#define XFS_FDBLOCKS_BATCH	1024
+int
+xfs_mod_fdblocks(
+	struct xfs_mount	*mp,
+	int64_t			delta,
+	bool			rsvd)
+{
+	int64_t			lcounter;
+	long long		res_used;
+	s32			batch;
+
+	if (delta > 0) {
+		/*
+		 * If the reserve pool is depleted, put blocks back into it
+		 * first. Most of the time the pool is full.
+		 */
+		if (likely(mp->m_resblks == mp->m_resblks_avail)) {
+			percpu_counter_add(&mp->m_fdblocks, delta);
+			return 0;
+		}
+
+		spin_lock(&mp->m_sb_lock);
+		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
+
+		if (res_used > delta) {
+			mp->m_resblks_avail += delta;
+		} else {
+			delta -= res_used;
+			mp->m_resblks_avail = mp->m_resblks;
+			percpu_counter_add(&mp->m_fdblocks, delta);
+		}
+		spin_unlock(&mp->m_sb_lock);
+		return 0;
+	}
+
+	/*
+	 * Taking blocks away, need to be more accurate the closer we
+	 * are to zero.
+	 *
+	 * If the counter has a value of less than 2 * max batch size,
+	 * then make everything serialise as we are real close to
+	 * ENOSPC.
+	 */
+	if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+				     XFS_FDBLOCKS_BATCH) < 0)
+		batch = 1;
+	else
+		batch = XFS_FDBLOCKS_BATCH;
+
+	__percpu_counter_add(&mp->m_fdblocks, delta, batch);
+	if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp),
+				     XFS_FDBLOCKS_BATCH) >= 0) {
+		/* we had space! */
+		return 0;
+	}
+
+	/*
+	 * lock up the sb for dipping into reserves before releasing the space
+	 * that took us to ENOSPC.
+	 */
+	spin_lock(&mp->m_sb_lock);
+	percpu_counter_add(&mp->m_fdblocks, -delta);
+	if (!rsvd)
+		goto fdblocks_enospc;
+
+	lcounter = (long long)mp->m_resblks_avail + delta;
+	if (lcounter >= 0) {
+		mp->m_resblks_avail = lcounter;
+		spin_unlock(&mp->m_sb_lock);
+		return 0;
+	}
+	printk_once(KERN_WARNING
+		"Filesystem \"%s\": reserve blocks depleted! "
+		"Consider increasing reserve pool size.",
+		mp->m_fsname);
+fdblocks_enospc:
+	spin_unlock(&mp->m_sb_lock);
+	return -ENOSPC;
+}
+
+int
+xfs_mod_frextents(
+	struct xfs_mount	*mp,
+	int64_t			delta)
+{
+	int64_t			lcounter;
+	int			ret = 0;
+
+	spin_lock(&mp->m_sb_lock);
+	lcounter = mp->m_sb.sb_frextents + delta;
+	if (lcounter < 0)
+		ret = -ENOSPC;
+	else
+		mp->m_sb.sb_frextents = lcounter;
+	spin_unlock(&mp->m_sb_lock);
+	return ret;
+}
+
+/*
+ * xfs_getsb() is called to obtain the buffer for the superblock.
+ * The buffer is returned locked and read in from disk.
+ * The buffer should be released with a call to xfs_brelse().
+ *
+ * If the flags parameter is BUF_TRYLOCK, then we'll only return
+ * the superblock buffer if it can be locked without sleeping.
+ * If it can't then we'll return NULL.
+ */
+struct xfs_buf *
+xfs_getsb(
+	struct xfs_mount	*mp,
+	int			flags)
+{
+	struct xfs_buf		*bp = mp->m_sb_bp;
+
+	if (!xfs_buf_trylock(bp)) {
+		if (flags & XBF_TRYLOCK)
+			return NULL;
+		xfs_buf_lock(bp);
+	}
+
+	xfs_buf_hold(bp);
+	ASSERT(XFS_BUF_ISDONE(bp));
+	return bp;
+}
+
+/*
+ * Used to free the superblock along various error paths.
+ */
+void
+xfs_freesb(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*bp = mp->m_sb_bp;
+
+	xfs_buf_lock(bp);
+	mp->m_sb_bp = NULL;
+	xfs_buf_relse(bp);
+}
+
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+	struct xfs_mount	*mp,
+	char			*message)
+{
+	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
+	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+		xfs_notice(mp, "%s required on read-only device.", message);
+		xfs_notice(mp, "write access unavailable, cannot proceed.");
+		return -EROFS;
+	}
+	return 0;
+}
diff --git a/kernel/fs/xfs/xfs_mount.h b/kernel/fs/xfs/xfs_mount.h
new file mode 100644
index 000000000..8c995a2cc
--- /dev/null
+++ b/kernel/fs/xfs/xfs_mount.h
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_MOUNT_H__
+#define	__XFS_MOUNT_H__
+
+struct xlog;
+struct xfs_inode;
+struct xfs_mru_cache;
+struct xfs_nameops;
+struct xfs_ail;
+struct xfs_quotainfo;
+struct xfs_dir_ops;
+struct xfs_da_geometry;
+
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+	XFS_LOWSP_1_PCNT = 0,
+	XFS_LOWSP_2_PCNT,
+	XFS_LOWSP_3_PCNT,
+	XFS_LOWSP_4_PCNT,
+	XFS_LOWSP_5_PCNT,
+	XFS_LOWSP_MAX,
+};
+
+typedef struct xfs_mount {
+	struct super_block	*m_super;
+	xfs_tid_t		m_tid;		/* next unused tid for fs */
+	struct xfs_ail		*m_ail;		/* fs active log item list */
+
+	struct xfs_sb		m_sb;		/* copy of fs superblock */
+	spinlock_t		m_sb_lock;	/* sb counter lock */
+	struct percpu_counter	m_icount;	/* allocated inodes counter */
+	struct percpu_counter	m_ifree;	/* free inodes counter */
+	struct percpu_counter	m_fdblocks;	/* free block counter */
+
+	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
+	char			*m_fsname;	/* filesystem name */
+	int			m_fsname_len;	/* strlen of fs name */
+	char			*m_rtname;	/* realtime device name */
+	char			*m_logname;	/* external log device name */
+	int			m_bsize;	/* fs logical block size */
+	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
+	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
+	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
+	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
+	uint			m_readio_log;	/* min read size log bytes */
+	uint			m_readio_blocks; /* min read size blocks */
+	uint			m_writeio_log;	/* min write size log bytes */
+	uint			m_writeio_blocks; /* min write size blocks */
+	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
+	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
+	struct xlog		*m_log;		/* log specific stuff */
+	int			m_logbufs;	/* number of log buffers */
+	int			m_logbsize;	/* size of each log buffer */
+	uint			m_rsumlevels;	/* rt summary levels */
+	uint			m_rsumsize;	/* size of rt summary, bytes */
+	struct xfs_inode	*m_rbmip;	/* pointer to bitmap inode */
+	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
+	struct xfs_inode	*m_rootip;	/* pointer to root directory */
+	struct xfs_quotainfo	*m_quotainfo;	/* disk quota information */
+	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
+	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
+	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
+	__uint8_t		m_blkbit_log;	/* blocklog + NBBY */
+	__uint8_t		m_blkbb_log;	/* blocklog - BBSHIFT */
+	__uint8_t		m_agno_log;	/* log #ag's */
+	__uint8_t		m_agino_log;	/* #bits for agino in inum */
+	uint			m_inode_cluster_size;/* min inode buf size */
+	uint			m_blockmask;	/* sb_blocksize-1 */
+	uint			m_blockwsize;	/* sb_blocksize in words */
+	uint			m_blockwmask;	/* blockwsize-1 */
+	uint			m_alloc_mxr[2];	/* max alloc btree records */
+	uint			m_alloc_mnr[2];	/* min alloc btree records */
+	uint			m_bmap_dmxr[2];	/* max bmap btree records */
+	uint			m_bmap_dmnr[2];	/* min bmap btree records */
+	uint			m_inobt_mxr[2];	/* max inobt btree records */
+	uint			m_inobt_mnr[2];	/* min inobt btree records */
+	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
+	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+	uint			m_in_maxlevels;	/* max inobt btree levels. */
+	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
+	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
+	struct mutex		m_growlock;	/* growfs mutex */
+	int			m_fixedfsid[2];	/* unchanged for life of FS */
+	uint			m_dmevmask;	/* DMI events for this FS */
+	__uint64_t		m_flags;	/* global mount flags */
+	int			m_ialloc_inos;	/* inodes in inode allocation */
+	int			m_ialloc_blks;	/* blocks in inode allocation */
+	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
+	uint			m_qflags;	/* quota status flags */
+	struct xfs_trans_resv	m_resv;		/* precomputed res values */
+	__uint64_t		m_maxicount;	/* maximum inode count */
+	__uint64_t		m_resblks;	/* total reserved blocks */
+	__uint64_t		m_resblks_avail;/* available reserved blocks */
+	__uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
+	int			m_dalign;	/* stripe unit */
+	int			m_swidth;	/* stripe width */
+	int			m_sinoalign;	/* stripe unit inode alignment */
+	__uint8_t		m_sectbb_log;	/* sectlog - BBSHIFT */
+	const struct xfs_nameops *m_dirnameops;	/* vector of dir name ops */
+	const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
+	const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
+	uint			m_chsize;	/* size of next field */
+	atomic_t		m_active_trans;	/* number trans frozen */
+	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
+	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
+	struct delayed_work	m_eofblocks_work; /* background eof blocks
+						     trimming */
+	bool			m_update_sb;	/* sb needs update in mount */
+	int64_t			m_low_space[XFS_LOWSP_MAX];
+						/* low free space thresholds */
+	struct xfs_kobj		m_kobj;
+
+	struct workqueue_struct *m_buf_workqueue;
+	struct workqueue_struct	*m_data_workqueue;
+	struct workqueue_struct	*m_unwritten_workqueue;
+	struct workqueue_struct	*m_cil_workqueue;
+	struct workqueue_struct	*m_reclaim_workqueue;
+	struct workqueue_struct	*m_log_workqueue;
+	struct workqueue_struct *m_eofblocks_workqueue;
+
+	/*
+	 * Generation of the filesysyem layout.  This is incremented by each
+	 * growfs, and used by the pNFS server to ensure the client updates
+	 * its view of the block device once it gets a layout that might
+	 * reference the newly added blocks.  Does not need to be persistent
+	 * as long as we only allow file system size increments, but if we
+	 * ever support shrinks it would have to be persisted in addition
+	 * to various other kinds of pain inflicted on the pNFS server.
+	 */
+	__uint32_t		m_generation;
+} xfs_mount_t;
+
+/*
+ * Flags for m_flags.
+ */
+#define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
+						   must be synchronous except
+						   for space allocations */
+#define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
+#define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
+						   operations, typically for
+						   disk errors in metadata */
+#define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
+#define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
+						   allocations */
+#define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
+#define XFS_MOUNT_GRPID		(1ULL << 9)	/* group-ID assigned from directory */
+#define XFS_MOUNT_NORECOVERY	(1ULL << 10)	/* no recovery - dirty fs */
+#define XFS_MOUNT_DFLT_IOSIZE	(1ULL << 12)	/* set default i/o size */
+#define XFS_MOUNT_32BITINODES	(1ULL << 14)	/* do not create inodes above
+						 * 32 bits in size */
+#define XFS_MOUNT_SMALL_INUMS	(1ULL << 15)	/* users wants 32bit inodes */
+#define XFS_MOUNT_NOUUID	(1ULL << 16)	/* ignore uuid during mount */
+#define XFS_MOUNT_BARRIER	(1ULL << 17)
+#define XFS_MOUNT_IKEEP		(1ULL << 18)	/* keep empty inode clusters*/
+#define XFS_MOUNT_SWALLOC	(1ULL << 19)	/* turn on stripe width
+						 * allocation */
+#define XFS_MOUNT_RDONLY	(1ULL << 20)	/* read-only fs */
+#define XFS_MOUNT_DIRSYNC	(1ULL << 21)	/* synchronous directory ops */
+#define XFS_MOUNT_COMPAT_IOSIZE	(1ULL << 22)	/* don't report large preferred
+						 * I/O size in stat() */
+#define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
+						   allocator */
+#define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
+
+
+/*
+ * Default minimum read and write sizes.
+ */
+#define XFS_READIO_LOG_LARGE	16
+#define XFS_WRITEIO_LOG_LARGE	16
+
+/*
+ * Max and min values for mount-option defined I/O
+ * preallocation sizes.
+ */
+#define XFS_MAX_IO_LOG		30	/* 1G */
+#define XFS_MIN_IO_LOG		PAGE_SHIFT
+
+/*
+ * Synchronous read and write sizes.  This should be
+ * better for NFSv2 wsync filesystems.
+ */
+#define	XFS_WSYNC_READIO_LOG	15	/* 32k */
+#define	XFS_WSYNC_WRITEIO_LOG	14	/* 16k */
+
+/*
+ * Allow large block sizes to be reported to userspace programs if the
+ * "largeio" mount option is used.
+ *
+ * If compatibility mode is specified, simply return the basic unit of caching
+ * so that we don't get inefficient read/modify/write I/O from user apps.
+ * Otherwise....
+ *
+ * If the underlying volume is a stripe, then return the stripe width in bytes
+ * as the recommended I/O size. It is not a stripe and we've set a default
+ * buffered I/O size, return that, otherwise return the compat default.
+ */
+static inline unsigned long
+xfs_preferred_iosize(xfs_mount_t *mp)
+{
+	if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
+		return PAGE_CACHE_SIZE;
+	return (mp->m_swidth ?
+		(mp->m_swidth << mp->m_sb.sb_blocklog) :
+		((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
+			(1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) :
+			PAGE_CACHE_SIZE));
+}
+
+#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp)	\
+				((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
+#define XFS_FORCED_SHUTDOWN(mp)	((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
+void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+		int lnnum);
+#define xfs_force_shutdown(m,f)	\
+	xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
+
+#define SHUTDOWN_META_IO_ERROR	0x0001	/* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR	0x0002	/* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT	0x0004	/* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE	0x0008	/* corrupt in-memory data structures */
+#define SHUTDOWN_REMOTE_REQ	0x0010	/* shutdown came from remote cell */
+#define SHUTDOWN_DEVICE_REQ	0x0020	/* failed all paths to the device */
+
+/*
+ * Flags for xfs_mountfs
+ */
+#define XFS_MFSI_QUIET		0x40	/* Be silent if mount errors found */
+
+static inline xfs_agnumber_t
+xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
+{
+	xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+	do_div(ld, mp->m_sb.sb_agblocks);
+	return (xfs_agnumber_t) ld;
+}
+
+static inline xfs_agblock_t
+xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
+{
+	xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+	return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection.
+ */
+typedef struct xfs_perag {
+	struct xfs_mount *pag_mount;	/* owner filesystem */
+	xfs_agnumber_t	pag_agno;	/* AG this structure belongs to */
+	atomic_t	pag_ref;	/* perag reference count */
+	char		pagf_init;	/* this agf's entry is initialized */
+	char		pagi_init;	/* this agi's entry is initialized */
+	char		pagf_metadata;	/* the agf is preferred to be metadata */
+	char		pagi_inodeok;	/* The agi is ok for inodes */
+	__uint8_t	pagf_levels[XFS_BTNUM_AGF];
+					/* # of levels in bno & cnt btree */
+	__uint32_t	pagf_flcount;	/* count of blocks in freelist */
+	xfs_extlen_t	pagf_freeblks;	/* total free blocks */
+	xfs_extlen_t	pagf_longest;	/* longest free space */
+	__uint32_t	pagf_btreeblks;	/* # of blocks held in AGF btrees */
+	xfs_agino_t	pagi_freecount;	/* number of free inodes */
+	xfs_agino_t	pagi_count;	/* number of allocated inodes */
+
+	/*
+	 * Inode allocation search lookup optimisation.
+	 * If the pagino matches, the search for new inodes
+	 * doesn't need to search the near ones again straight away
+	 */
+	xfs_agino_t	pagl_pagino;
+	xfs_agino_t	pagl_leftrec;
+	xfs_agino_t	pagl_rightrec;
+	spinlock_t	pagb_lock;	/* lock for pagb_tree */
+	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
+
+	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
+
+	spinlock_t	pag_ici_lock;	/* incore inode cache lock */
+	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
+	int		pag_ici_reclaimable;	/* reclaimable inodes */
+	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
+	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
+
+	/* buffer cache index */
+	spinlock_t	pag_buf_lock;	/* lock for pag_buf_tree */
+	struct rb_root	pag_buf_tree;	/* ordered tree of active buffers */
+
+	/* for rcu-safe freeing */
+	struct rcu_head	rcu_head;
+	int		pagb_count;	/* pagb slots in use */
+} xfs_perag_t;
+
+extern int	xfs_log_sbcount(xfs_mount_t *);
+extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
+extern int	xfs_mountfs(xfs_mount_t *mp);
+extern int	xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
+				     xfs_agnumber_t *maxagi);
+extern void	xfs_unmountfs(xfs_mount_t *);
+
+extern int	xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
+extern int	xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
+extern int	xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
+				 bool reserved);
+extern int	xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+
+extern int	xfs_mount_log_sb(xfs_mount_t *);
+extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
+extern int	xfs_readsb(xfs_mount_t *, int);
+extern void	xfs_freesb(xfs_mount_t *);
+extern bool	xfs_fs_writable(struct xfs_mount *mp, int level);
+extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
+
+extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);
+
+extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
+
+#endif	/* __XFS_MOUNT_H__ */
diff --git a/kernel/fs/xfs/xfs_mru_cache.c b/kernel/fs/xfs/xfs_mru_cache.c
new file mode 100644
index 000000000..f8a674d7f
--- /dev/null
+++ b/kernel/fs/xfs/xfs_mru_cache.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_mru_cache.h"
+
+/*
+ * The MRU Cache data structure consists of a data store, an array of lists and
+ * a lock to protect its internal state.  At initialisation time, the client
+ * supplies an element lifetime in milliseconds and a group count, as well as a
+ * function pointer to call when deleting elements.  A data structure for
+ * queueing up work in the form of timed callbacks is also included.
+ *
+ * The group count controls how many lists are created, and thereby how finely
+ * the elements are grouped in time.  When reaping occurs, all the elements in
+ * all the lists whose time has expired are deleted.
+ *
+ * To give an example of how this works in practice, consider a client that
+ * initialises an MRU Cache with a lifetime of ten seconds and a group count of
+ * five.  Five internal lists will be created, each representing a two second
+ * period in time.  When the first element is added, time zero for the data
+ * structure is initialised to the current time.
+ *
+ * All the elements added in the first two seconds are appended to the first
+ * list.  Elements added in the third second go into the second list, and so on.
+ * If an element is accessed at any point, it is removed from its list and
+ * inserted at the head of the current most-recently-used list.
+ *
+ * The reaper function will have nothing to do until at least twelve seconds
+ * have elapsed since the first element was added.  The reason for this is that
+ * if it were called at t=11s, there could be elements in the first list that
+ * have only been inactive for nine seconds, so it still does nothing.  If it is
+ * called anywhere between t=12 and t=14 seconds, it will delete all the
+ * elements that remain in the first list.  It's therefore possible for elements
+ * to remain in the data store even after they've been inactive for up to
+ * (t + t/g) seconds, where t is the inactive element lifetime and g is the
+ * number of groups.
+ *
+ * The above example assumes that the reaper function gets called at least once
+ * every (t/g) seconds.  If it is called less frequently, unused elements will
+ * accumulate in the reap list until the reaper function is eventually called.
+ * The current implementation uses work queue callbacks to carefully time the
+ * reaper function calls, so this should happen rarely, if at all.
+ *
+ * From a design perspective, the primary reason for the choice of a list array
+ * representing discrete time intervals is that it's only practical to reap
+ * expired elements in groups of some appreciable size.  This automatically
+ * introduces a granularity to element lifetimes, so there's no point storing an
+ * individual timeout with each element that specifies a more precise reap time.
+ * The bonus is a saving of sizeof(long) bytes of memory per element stored.
+ *
+ * The elements could have been stored in just one list, but an array of
+ * counters or pointers would need to be maintained to allow them to be divided
+ * up into discrete time groups.  More critically, the process of touching or
+ * removing an element would involve walking large portions of the entire list,
+ * which would have a detrimental effect on performance.  The additional memory
+ * requirement for the array of list heads is minimal.
+ *
+ * When an element is touched or deleted, it needs to be removed from its
+ * current list.  Doubly linked lists are used to make the list maintenance
+ * portion of these operations O(1).  Since reaper timing can be imprecise,
+ * inserts and lookups can occur when there are no free lists available.  When
+ * this happens, all the elements on the LRU list need to be migrated to the end
+ * of the reap list.  To keep the list maintenance portion of these operations
+ * O(1) also, list tails need to be accessible without walking the entire list.
+ * This is the reason why doubly linked list heads are used.
+ */
+
+/*
+ * An MRU Cache is a dynamic data structure that stores its elements in a way
+ * that allows efficient lookups, but also groups them into discrete time
+ * intervals based on insertion time.  This allows elements to be efficiently
+ * and automatically reaped after a fixed period of inactivity.
+ *
+ * When a client data pointer is stored in the MRU Cache it needs to be added to
+ * both the data store and to one of the lists.  It must also be possible to
+ * access each of these entries via the other, i.e. to:
+ *
+ *    a) Walk a list, removing the corresponding data store entry for each item.
+ *    b) Look up a data store entry, then access its list entry directly.
+ *
+ * To achieve both of these goals, each entry must contain both a list entry and
+ * a key, in addition to the user's data pointer.  Note that it's not a good
+ * idea to have the client embed one of these structures at the top of their own
+ * data structure, because inserting the same item more than once would most
+ * likely result in a loop in one of the lists.  That's a sure-fire recipe for
+ * an infinite loop in the code.
+ */
+struct xfs_mru_cache {
+	struct radix_tree_root	store;     /* Core storage data structure.  */
+	struct list_head	*lists;    /* Array of lists, one per grp.  */
+	struct list_head	reap_list; /* Elements overdue for reaping. */
+	spinlock_t		lock;      /* Lock to protect this struct.  */
+	unsigned int		grp_count; /* Number of discrete groups.    */
+	unsigned int		grp_time;  /* Time period spanned by grps.  */
+	unsigned int		lru_grp;   /* Group containing time zero.   */
+	unsigned long		time_zero; /* Time first element was added. */
+	xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
+	struct delayed_work	work;      /* Workqueue data for reaping.   */
+	unsigned int		queued;	   /* work has been queued */
+};
+
+static struct workqueue_struct	*xfs_mru_reap_wq;
+
+/*
+ * When inserting, destroying or reaping, it's first necessary to update the
+ * lists relative to a particular time.  In the case of destroying, that time
+ * will be well in the future to ensure that all items are moved to the reap
+ * list.  In all other cases though, the time will be the current time.
+ *
+ * This function enters a loop, moving the contents of the LRU list to the reap
+ * list again and again until either a) the lists are all empty, or b) time zero
+ * has been advanced sufficiently to be within the immediate element lifetime.
+ *
+ * Case a) above is detected by counting how many groups are migrated and
+ * stopping when they've all been moved.  Case b) is detected by monitoring the
+ * time_zero field, which is updated as each group is migrated.
+ *
+ * The return value is the earliest time that more migration could be needed, or
+ * zero if there's no need to schedule more work because the lists are empty.
+ */
+STATIC unsigned long
+_xfs_mru_cache_migrate(
+	struct xfs_mru_cache	*mru,
+	unsigned long		now)
+{
+	unsigned int		grp;
+	unsigned int		migrated = 0;
+	struct list_head	*lru_list;
+
+	/* Nothing to do if the data store is empty. */
+	if (!mru->time_zero)
+		return 0;
+
+	/* While time zero is older than the time spanned by all the lists. */
+	while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {
+
+		/*
+		 * If the LRU list isn't empty, migrate its elements to the tail
+		 * of the reap list.
+		 */
+		lru_list = mru->lists + mru->lru_grp;
+		if (!list_empty(lru_list))
+			list_splice_init(lru_list, mru->reap_list.prev);
+
+		/*
+		 * Advance the LRU group number, freeing the old LRU list to
+		 * become the new MRU list; advance time zero accordingly.
+		 */
+		mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
+		mru->time_zero += mru->grp_time;
+
+		/*
+		 * If reaping is so far behind that all the elements on all the
+		 * lists have been migrated to the reap list, it's now empty.
+		 */
+		if (++migrated == mru->grp_count) {
+			mru->lru_grp = 0;
+			mru->time_zero = 0;
+			return 0;
+		}
+	}
+
+	/* Find the first non-empty list from the LRU end. */
+	for (grp = 0; grp < mru->grp_count; grp++) {
+
+		/* Check the grp'th list from the LRU end. */
+		lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
+		if (!list_empty(lru_list))
+			return mru->time_zero +
+			       (mru->grp_count + grp) * mru->grp_time;
+	}
+
+	/* All the lists must be empty. */
+	mru->lru_grp = 0;
+	mru->time_zero = 0;
+	return 0;
+}
+
+/*
+ * When inserting or doing a lookup, an element needs to be inserted into the
+ * MRU list.  The lists must be migrated first to ensure that they're
+ * up-to-date, otherwise the new element could be given a shorter lifetime in
+ * the cache than it should.
+ */
+STATIC void
+_xfs_mru_cache_list_insert(
+	struct xfs_mru_cache	*mru,
+	struct xfs_mru_cache_elem *elem)
+{
+	unsigned int		grp = 0;
+	unsigned long		now = jiffies;
+
+	/*
+	 * If the data store is empty, initialise time zero, leave grp set to
+	 * zero and start the work queue timer if necessary.  Otherwise, set grp
+	 * to the number of group times that have elapsed since time zero.
+	 */
+	if (!_xfs_mru_cache_migrate(mru, now)) {
+		mru->time_zero = now;
+		if (!mru->queued) {
+			mru->queued = 1;
+			queue_delayed_work(xfs_mru_reap_wq, &mru->work,
+			                   mru->grp_count * mru->grp_time);
+		}
+	} else {
+		grp = (now - mru->time_zero) / mru->grp_time;
+		grp = (mru->lru_grp + grp) % mru->grp_count;
+	}
+
+	/* Insert the element at the tail of the corresponding list. */
+	list_add_tail(&elem->list_node, mru->lists + grp);
+}
+
+/*
+ * When destroying or reaping, all the elements that were migrated to the reap
+ * list need to be deleted.  For each element this involves removing it from the
+ * data store, removing it from the reap list, calling the client's free
+ * function and deleting the element from the element zone.
+ *
+ * We get called holding the mru->lock, which we drop and then reacquire.
+ * Sparse need special help with this to tell it we know what we are doing.
+ */
+STATIC void
+_xfs_mru_cache_clear_reap_list(
+	struct xfs_mru_cache	*mru)
+		__releases(mru->lock) __acquires(mru->lock)
+{
+	struct xfs_mru_cache_elem *elem, *next;
+	struct list_head	tmp;
+
+	INIT_LIST_HEAD(&tmp);
+	list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {
+
+		/* Remove the element from the data store. */
+		radix_tree_delete(&mru->store, elem->key);
+
+		/*
+		 * remove to temp list so it can be freed without
+		 * needing to hold the lock
+		 */
+		list_move(&elem->list_node, &tmp);
+	}
+	spin_unlock(&mru->lock);
+
+	list_for_each_entry_safe(elem, next, &tmp, list_node) {
+		list_del_init(&elem->list_node);
+		mru->free_func(elem);
+	}
+
+	spin_lock(&mru->lock);
+}
+
+/*
+ * We fire the reap timer every group expiry interval so
+ * we always have a reaper ready to run. This makes shutdown
+ * and flushing of the reaper easy to do. Hence we need to
+ * keep when the next reap must occur so we can determine
+ * at each interval whether there is anything we need to do.
+ */
+STATIC void
+_xfs_mru_cache_reap(
+	struct work_struct	*work)
+{
+	struct xfs_mru_cache	*mru =
+		container_of(work, struct xfs_mru_cache, work.work);
+	unsigned long		now, next;
+
+	ASSERT(mru && mru->lists);
+	if (!mru || !mru->lists)
+		return;
+
+	spin_lock(&mru->lock);
+	next = _xfs_mru_cache_migrate(mru, jiffies);
+	_xfs_mru_cache_clear_reap_list(mru);
+
+	mru->queued = next;
+	if ((mru->queued > 0)) {
+		now = jiffies;
+		if (next <= now)
+			next = 0;
+		else
+			next -= now;
+		queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
+	}
+
+	spin_unlock(&mru->lock);
+}
+
+int
+xfs_mru_cache_init(void)
+{
+	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
+				WQ_MEM_RECLAIM|WQ_FREEZABLE, 1);
+	if (!xfs_mru_reap_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+xfs_mru_cache_uninit(void)
+{
+	destroy_workqueue(xfs_mru_reap_wq);
+}
+
+/*
+ * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
+ * with the address of the pointer, a lifetime value in milliseconds, a group
+ * count and a free function to use when deleting elements.  This function
+ * returns 0 if the initialisation was successful.
+ */
+int
+xfs_mru_cache_create(
+	struct xfs_mru_cache	**mrup,
+	unsigned int		lifetime_ms,
+	unsigned int		grp_count,
+	xfs_mru_cache_free_func_t free_func)
+{
+	struct xfs_mru_cache	*mru = NULL;
+	int			err = 0, grp;
+	unsigned int		grp_time;
+
+	if (mrup)
+		*mrup = NULL;
+
+	if (!mrup || !grp_count || !lifetime_ms || !free_func)
+		return -EINVAL;
+
+	if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
+		return -EINVAL;
+
+	if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
+		return -ENOMEM;
+
+	/* An extra list is needed to avoid reaping up to a grp_time early. */
+	mru->grp_count = grp_count + 1;
+	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
+
+	if (!mru->lists) {
+		err = -ENOMEM;
+		goto exit;
+	}
+
+	for (grp = 0; grp < mru->grp_count; grp++)
+		INIT_LIST_HEAD(mru->lists + grp);
+
+	/*
+	 * We use GFP_KERNEL radix tree preload and do inserts under a
+	 * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
+	 */
+	INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
+	INIT_LIST_HEAD(&mru->reap_list);
+	spin_lock_init(&mru->lock);
+	INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);
+
+	mru->grp_time  = grp_time;
+	mru->free_func = free_func;
+
+	*mrup = mru;
+
+exit:
+	if (err && mru && mru->lists)
+		kmem_free(mru->lists);
+	if (err && mru)
+		kmem_free(mru);
+
+	return err;
+}
+
+/*
+ * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
+ * free functions as they're deleted.  When this function returns, the caller is
+ * guaranteed that all the free functions for all the elements have finished
+ * executing and the reaper is not running.
+ */
+static void
+xfs_mru_cache_flush(
+	struct xfs_mru_cache	*mru)
+{
+	if (!mru || !mru->lists)
+		return;
+
+	spin_lock(&mru->lock);
+	if (mru->queued) {
+		spin_unlock(&mru->lock);
+		cancel_delayed_work_sync(&mru->work);
+		spin_lock(&mru->lock);
+	}
+
+	_xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
+	_xfs_mru_cache_clear_reap_list(mru);
+
+	spin_unlock(&mru->lock);
+}
+
+void
+xfs_mru_cache_destroy(
+	struct xfs_mru_cache	*mru)
+{
+	if (!mru || !mru->lists)
+		return;
+
+	xfs_mru_cache_flush(mru);
+
+	kmem_free(mru->lists);
+	kmem_free(mru);
+}
+
+/*
+ * To insert an element, call xfs_mru_cache_insert() with the data store, the
+ * element's key and the client data pointer.  This function returns 0 on
+ * success or ENOMEM if memory for the data element couldn't be allocated.
+ */
+int
+xfs_mru_cache_insert(
+	struct xfs_mru_cache	*mru,
+	unsigned long		key,
+	struct xfs_mru_cache_elem *elem)
+{
+	int			error;
+
+	ASSERT(mru && mru->lists);
+	if (!mru || !mru->lists)
+		return -EINVAL;
+
+	if (radix_tree_preload(GFP_NOFS))
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&elem->list_node);
+	elem->key = key;
+
+	spin_lock(&mru->lock);
+	error = radix_tree_insert(&mru->store, key, elem);
+	radix_tree_preload_end();
+	if (!error)
+		_xfs_mru_cache_list_insert(mru, elem);
+	spin_unlock(&mru->lock);
+
+	return error;
+}
+
+/*
+ * To remove an element without calling the free function, call
+ * xfs_mru_cache_remove() with the data store and the element's key.  On success
+ * the client data pointer for the removed element is returned, otherwise this
+ * function will return a NULL pointer.
+ */
+struct xfs_mru_cache_elem *
+xfs_mru_cache_remove(
+	struct xfs_mru_cache	*mru,
+	unsigned long		key)
+{
+	struct xfs_mru_cache_elem *elem;
+
+	ASSERT(mru && mru->lists);
+	if (!mru || !mru->lists)
+		return NULL;
+
+	spin_lock(&mru->lock);
+	elem = radix_tree_delete(&mru->store, key);
+	if (elem)
+		list_del(&elem->list_node);
+	spin_unlock(&mru->lock);
+
+	return elem;
+}
+
+/*
+ * To remove and element and call the free function, call xfs_mru_cache_delete()
+ * with the data store and the element's key.
+ */
+void
+xfs_mru_cache_delete(
+	struct xfs_mru_cache	*mru,
+	unsigned long		key)
+{
+	struct xfs_mru_cache_elem *elem;
+
+	elem = xfs_mru_cache_remove(mru, key);
+	if (elem)
+		mru->free_func(elem);
+}
+
+/*
+ * To look up an element using its key, call xfs_mru_cache_lookup() with the
+ * data store and the element's key.  If found, the element will be moved to the
+ * head of the MRU list to indicate that it's been touched.
+ *
+ * The internal data structures are protected by a spinlock that is STILL HELD
+ * when this function returns.  Call xfs_mru_cache_done() to release it.  Note
+ * that it is not safe to call any function that might sleep in the interim.
+ *
+ * The implementation could have used reference counting to avoid this
+ * restriction, but since most clients simply want to get, set or test a member
+ * of the returned data structure, the extra per-element memory isn't warranted.
+ *
+ * If the element isn't found, this function returns NULL and the spinlock is
+ * released.  xfs_mru_cache_done() should NOT be called when this occurs.
+ *
+ * Because sparse isn't smart enough to know about conditional lock return
+ * status, we need to help it get it right by annotating the path that does
+ * not release the lock.
+ */
+struct xfs_mru_cache_elem *
+xfs_mru_cache_lookup(
+	struct xfs_mru_cache	*mru,
+	unsigned long		key)
+{
+	struct xfs_mru_cache_elem *elem;
+
+	ASSERT(mru && mru->lists);
+	if (!mru || !mru->lists)
+		return NULL;
+
+	spin_lock(&mru->lock);
+	elem = radix_tree_lookup(&mru->store, key);
+	if (elem) {
+		list_del(&elem->list_node);
+		_xfs_mru_cache_list_insert(mru, elem);
+		__release(mru_lock); /* help sparse not be stupid */
+	} else
+		spin_unlock(&mru->lock);
+
+	return elem;
+}
+
+/*
+ * To release the internal data structure spinlock after having performed an
+ * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
+ * with the data store pointer.
+ */
+void
+xfs_mru_cache_done(
+	struct xfs_mru_cache	*mru)
+		__releases(mru->lock)
+{
+	spin_unlock(&mru->lock);
+}
diff --git a/kernel/fs/xfs/xfs_mru_cache.h b/kernel/fs/xfs/xfs_mru_cache.h
new file mode 100644
index 000000000..fb5245ba5
--- /dev/null
+++ b/kernel/fs/xfs/xfs_mru_cache.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_MRU_CACHE_H__
+#define __XFS_MRU_CACHE_H__
+
+struct xfs_mru_cache;
+
+struct xfs_mru_cache_elem {
+	struct list_head list_node;
+	unsigned long	key;
+};
+
+/* Function pointer type for callback to free a client's data pointer. */
+typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem);
+
+int xfs_mru_cache_init(void);
+void xfs_mru_cache_uninit(void);
+int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
+			     unsigned int grp_count,
+			     xfs_mru_cache_free_func_t free_func);
+void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
+int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
+		struct xfs_mru_cache_elem *elem);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
+void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
+void xfs_mru_cache_done(struct xfs_mru_cache *mru);
+
+#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/kernel/fs/xfs/xfs_pnfs.c b/kernel/fs/xfs/xfs_pnfs.c
new file mode 100644
index 000000000..981a657ec
--- /dev/null
+++ b/kernel/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_iomap.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_pnfs.h"
+
+/*
+ * Ensure that we do not have any outstanding pNFS layouts that can be used by
+ * clients to directly read from or write to this inode.  This must be called
+ * before every operation that can remove blocks from the extent map.
+ * Additionally we call it during the write operation, where aren't concerned
+ * about exposing unallocated blocks but just want to provide basic
+ * synchronization between a local writer and pNFS clients.  mmap writes would
+ * also benefit from this sort of synchronization, but due to the tricky locking
+ * rules in the page fault path we don't bother.
+ */
+int
+xfs_break_layouts(
+	struct inode		*inode,
+	uint			*iolock,
+	bool			with_imutex)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+
+	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
+		xfs_iunlock(ip, *iolock);
+		if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
+			mutex_unlock(&inode->i_mutex);
+		error = break_layout(inode, true);
+		*iolock = XFS_IOLOCK_EXCL;
+		if (with_imutex)
+			mutex_lock(&inode->i_mutex);
+		xfs_ilock(ip, *iolock);
+	}
+
+	return error;
+}
+
+/*
+ * Get a unique ID including its location so that the client can identify
+ * the exported device.
+ */
+int
+xfs_fs_get_uuid(
+	struct super_block	*sb,
+	u8			*buf,
+	u32			*len,
+	u64			*offset)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	printk_once(KERN_NOTICE
+"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
+		mp->m_fsname);
+
+	if (*len < sizeof(uuid_t))
+		return -EINVAL;
+
+	memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+	*len = sizeof(uuid_t);
+	*offset = offsetof(struct xfs_dsb, sb_uuid);
+	return 0;
+}
+
+static void
+xfs_bmbt_to_iomap(
+	struct xfs_inode	*ip,
+	struct iomap		*iomap,
+	struct xfs_bmbt_irec	*imap)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (imap->br_startblock == HOLESTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_HOLE;
+	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_DELALLOC;
+	} else {
+		iomap->blkno =
+			XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
+		if (imap->br_state == XFS_EXT_UNWRITTEN)
+			iomap->type = IOMAP_UNWRITTEN;
+		else
+			iomap->type = IOMAP_MAPPED;
+	}
+	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+}
+
+/*
+ * Get a layout for the pNFS client.
+ */
+int
+xfs_fs_map_blocks(
+	struct inode		*inode,
+	loff_t			offset,
+	u64			length,
+	struct iomap		*iomap,
+	bool			write,
+	u32			*device_generation)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	loff_t			limit;
+	int			bmapi_flags = XFS_BMAPI_ENTIRE;
+	int			nimaps = 1;
+	uint			lock_flags;
+	int			error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/*
+	 * We can't export inodes residing on the realtime device.  The realtime
+	 * device doesn't have a UUID to identify it, so the client has no way
+	 * to find it.
+	 */
+	if (XFS_IS_REALTIME_INODE(ip))
+		return -ENXIO;
+
+	/*
+	 * Lock out any other I/O before we flush and invalidate the pagecache,
+	 * and then hand out a layout to the remote system.  This is very
+	 * similar to direct I/O, except that the synchronization is much more
+	 * complicated.  See the comment near xfs_break_layouts for a detailed
+	 * explanation.
+	 */
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	error = -EINVAL;
+	limit = mp->m_super->s_maxbytes;
+	if (!write)
+		limit = max(limit, round_up(i_size_read(inode),
+				     inode->i_sb->s_blocksize));
+	if (offset > limit)
+		goto out_unlock;
+	if (offset > limit - length)
+		length = limit - offset;
+
+	error = filemap_write_and_wait(inode->i_mapping);
+	if (error)
+		goto out_unlock;
+	error = invalidate_inode_pages2(inode->i_mapping);
+	if (WARN_ON_ONCE(error))
+		return error;
+
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	lock_flags = xfs_ilock_data_map_shared(ip);
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				&imap, &nimaps, bmapi_flags);
+	xfs_iunlock(ip, lock_flags);
+
+	if (error)
+		goto out_unlock;
+
+	if (write) {
+		enum xfs_prealloc_flags	flags = 0;
+
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+		if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+			error = xfs_iomap_write_direct(ip, offset, length,
+						       &imap, nimaps);
+			if (error)
+				goto out_unlock;
+
+			/*
+			 * Ensure the next transaction is committed
+			 * synchronously so that the blocks allocated and
+			 * handed out to the client are guaranteed to be
+			 * present even after a server crash.
+			 */
+			flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
+		}
+
+		error = xfs_update_prealloc_flags(ip, flags);
+		if (error)
+			goto out_unlock;
+	}
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+	xfs_bmbt_to_iomap(ip, iomap, &imap);
+	*device_generation = mp->m_generation;
+	return error;
+out_unlock:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * Ensure the size update falls into a valid allocated block.
+ */
+static int
+xfs_pnfs_validate_isize(
+	struct xfs_inode	*ip,
+	xfs_off_t		isize)
+{
+	struct xfs_bmbt_irec	imap;
+	int			nimaps = 1;
+	int			error = 0;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
+				&imap, &nimaps, 0);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	if (error)
+		return error;
+
+	if (imap.br_startblock == HOLESTARTBLOCK ||
+	    imap.br_startblock == DELAYSTARTBLOCK ||
+	    imap.br_state == XFS_EXT_UNWRITTEN)
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Make sure the blocks described by maps are stable on disk.  This includes
+ * converting any unwritten extents, flushing the disk cache and updating the
+ * time stamps.
+ *
+ * Note that we rely on the caller to always send us a timestamp update so that
+ * we always commit a transaction here.  If that stops being true we will have
+ * to manually flush the cache here similar to what the fsync code path does
+ * for datasyncs on files that have no dirty metadata.
+ */
+int
+xfs_fs_commit_blocks(
+	struct inode		*inode,
+	struct iomap		*maps,
+	int			nr_maps,
+	struct iattr		*iattr)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	bool			update_isize = false;
+	int			error, i;
+	loff_t			size;
+
+	ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	size = i_size_read(inode);
+	if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
+		update_isize = true;
+		size = iattr->ia_size;
+	}
+
+	for (i = 0; i < nr_maps; i++) {
+		u64 start, length, end;
+
+		start = maps[i].offset;
+		if (start > size)
+			continue;
+
+		end = start + maps[i].length;
+		if (end > size)
+			end = size;
+
+		length = end - start;
+		if (!length)
+			continue;
+	
+		/*
+		 * Make sure reads through the pagecache see the new data.
+		 */
+		error = invalidate_inode_pages2_range(inode->i_mapping,
+					start >> PAGE_CACHE_SHIFT,
+					(end - 1) >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(error);
+
+		error = xfs_iomap_write_unwritten(ip, start, length);
+		if (error)
+			goto out_drop_iolock;
+	}
+
+	if (update_isize) {
+		error = xfs_pnfs_validate_isize(ip, size);
+		if (error)
+			goto out_drop_iolock;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_drop_iolock;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	xfs_setattr_time(ip, iattr);
+	if (update_isize) {
+		i_size_write(inode, iattr->ia_size);
+		ip->i_d.di_size = iattr->ia_size;
+	}
+
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+
+out_drop_iolock:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+}
diff --git a/kernel/fs/xfs/xfs_pnfs.h b/kernel/fs/xfs/xfs_pnfs.h
new file mode 100644
index 000000000..8147ac108
--- /dev/null
+++ b/kernel/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,19 @@
+#ifndef _XFS_PNFS_H
+#define _XFS_PNFS_H 1
+
+#ifdef CONFIG_NFSD_PNFS
+int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
+		struct iomap *iomap, bool write, u32 *device_generation);
+int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
+		struct iattr *iattr);
+
+int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
+#else
+static inline int
+xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
+{
+	return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _XFS_PNFS_H */
diff --git a/kernel/fs/xfs/xfs_qm.c b/kernel/fs/xfs/xfs_qm.c
new file mode 100644
index 000000000..5538468c7
--- /dev/null
+++ b/kernel/fs/xfs/xfs_qm.c
@@ -0,0 +1,1939 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_quota.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_cksum.h"
+
+/*
+ * The global quota manager. There is only one of these for the entire
+ * system, _not_ one per file system. XQM keeps track of the overall
+ * quota functionality, including maintaining the freelist and hash
+ * tables of dquots.
+ */
+STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
+STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
+
+
+STATIC void	xfs_qm_dqfree_one(struct xfs_dquot *dqp);
+/*
+ * We use the batch lookup interface to iterate over the dquots as it
+ * currently is the only interface into the radix tree code that allows
+ * fuzzy lookups instead of exact matches.  Holding the lock over multiple
+ * operations is fine as all callers are used either during mount/umount
+ * or quotaoff.
+ */
+#define XFS_DQ_LOOKUP_BATCH	32
+
+STATIC int
+xfs_qm_dquot_walk(
+	struct xfs_mount	*mp,
+	int			type,
+	int			(*execute)(struct xfs_dquot *dqp, void *data),
+	void			*data)
+{
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
+	struct radix_tree_root	*tree = xfs_dquot_tree(qi, type);
+	uint32_t		next_index;
+	int			last_error = 0;
+	int			skipped;
+	int			nr_found;
+
+restart:
+	skipped = 0;
+	next_index = 0;
+	nr_found = 0;
+
+	while (1) {
+		struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
+		int		error = 0;
+		int		i;
+
+		mutex_lock(&qi->qi_tree_lock);
+		nr_found = radix_tree_gang_lookup(tree, (void **)batch,
+					next_index, XFS_DQ_LOOKUP_BATCH);
+		if (!nr_found) {
+			mutex_unlock(&qi->qi_tree_lock);
+			break;
+		}
+
+		for (i = 0; i < nr_found; i++) {
+			struct xfs_dquot *dqp = batch[i];
+
+			next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
+
+			error = execute(batch[i], data);
+			if (error == -EAGAIN) {
+				skipped++;
+				continue;
+			}
+			if (error && last_error != -EFSCORRUPTED)
+				last_error = error;
+		}
+
+		mutex_unlock(&qi->qi_tree_lock);
+
+		/* bail out if the filesystem is corrupted.  */
+		if (last_error == -EFSCORRUPTED) {
+			skipped = 0;
+			break;
+		}
+	}
+
+	if (skipped) {
+		delay(1);
+		goto restart;
+	}
+
+	return last_error;
+}
+
+
+/*
+ * Purge a dquot from all tracking data structures and free it.
+ */
+STATIC int
+xfs_qm_dqpurge(
+	struct xfs_dquot	*dqp,
+	void			*data)
+{
+	struct xfs_mount	*mp = dqp->q_mount;
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
+
+	xfs_dqlock(dqp);
+	if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
+		xfs_dqunlock(dqp);
+		return -EAGAIN;
+	}
+
+	dqp->dq_flags |= XFS_DQ_FREEING;
+
+	xfs_dqflock(dqp);
+
+	/*
+	 * If we are turning this type of quotas off, we don't care
+	 * about the dirty metadata sitting in this dquot. OTOH, if
+	 * we're unmounting, we do care, so we flush it and wait.
+	 */
+	if (XFS_DQ_IS_DIRTY(dqp)) {
+		struct xfs_buf	*bp = NULL;
+		int		error;
+
+		/*
+		 * We don't care about getting disk errors here. We need
+		 * to purge this dquot anyway, so we go ahead regardless.
+		 */
+		error = xfs_qm_dqflush(dqp, &bp);
+		if (error) {
+			xfs_warn(mp, "%s: dquot %p flush failed",
+				__func__, dqp);
+		} else {
+			error = xfs_bwrite(bp);
+			xfs_buf_relse(bp);
+		}
+		xfs_dqflock(dqp);
+	}
+
+	ASSERT(atomic_read(&dqp->q_pincount) == 0);
+	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+
+	xfs_dqfunlock(dqp);
+	xfs_dqunlock(dqp);
+
+	radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
+			  be32_to_cpu(dqp->q_core.d_id));
+	qi->qi_dquots--;
+
+	/*
+	 * We move dquots to the freelist as soon as their reference count
+	 * hits zero, so it really should be on the freelist here.
+	 */
+	ASSERT(!list_empty(&dqp->q_lru));
+	list_lru_del(&qi->qi_lru, &dqp->q_lru);
+	XFS_STATS_DEC(xs_qm_dquot_unused);
+
+	xfs_qm_dqdestroy(dqp);
+	return 0;
+}
+
+/*
+ * Purge the dquot cache.
+ */
+void
+xfs_qm_dqpurge_all(
+	struct xfs_mount	*mp,
+	uint			flags)
+{
+	if (flags & XFS_QMOPT_UQUOTA)
+		xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
+	if (flags & XFS_QMOPT_GQUOTA)
+		xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
+	if (flags & XFS_QMOPT_PQUOTA)
+		xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
+}
+
+/*
+ * Just destroy the quotainfo structure.
+ */
+void
+xfs_qm_unmount(
+	struct xfs_mount	*mp)
+{
+	if (mp->m_quotainfo) {
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+		xfs_qm_destroy_quotainfo(mp);
+	}
+}
+
+/*
+ * Called from the vfsops layer.
+ */
+void
+xfs_qm_unmount_quotas(
+	xfs_mount_t	*mp)
+{
+	/*
+	 * Release the dquots that root inode, et al might be holding,
+	 * before we flush quotas and blow away the quotainfo structure.
+	 */
+	ASSERT(mp->m_rootip);
+	xfs_qm_dqdetach(mp->m_rootip);
+	if (mp->m_rbmip)
+		xfs_qm_dqdetach(mp->m_rbmip);
+	if (mp->m_rsumip)
+		xfs_qm_dqdetach(mp->m_rsumip);
+
+	/*
+	 * Release the quota inodes.
+	 */
+	if (mp->m_quotainfo) {
+		if (mp->m_quotainfo->qi_uquotaip) {
+			IRELE(mp->m_quotainfo->qi_uquotaip);
+			mp->m_quotainfo->qi_uquotaip = NULL;
+		}
+		if (mp->m_quotainfo->qi_gquotaip) {
+			IRELE(mp->m_quotainfo->qi_gquotaip);
+			mp->m_quotainfo->qi_gquotaip = NULL;
+		}
+		if (mp->m_quotainfo->qi_pquotaip) {
+			IRELE(mp->m_quotainfo->qi_pquotaip);
+			mp->m_quotainfo->qi_pquotaip = NULL;
+		}
+	}
+}
+
+STATIC int
+xfs_qm_dqattach_one(
+	xfs_inode_t	*ip,
+	xfs_dqid_t	id,
+	uint		type,
+	uint		doalloc,
+	xfs_dquot_t	**IO_idqpp)
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	error = 0;
+
+	/*
+	 * See if we already have it in the inode itself. IO_idqpp is &i_udquot
+	 * or &i_gdquot. This made the code look weird, but made the logic a lot
+	 * simpler.
+	 */
+	dqp = *IO_idqpp;
+	if (dqp) {
+		trace_xfs_dqattach_found(dqp);
+		return 0;
+	}
+
+	/*
+	 * Find the dquot from somewhere. This bumps the reference count of
+	 * dquot and returns it locked.  This can return ENOENT if dquot didn't
+	 * exist on disk and we didn't ask it to allocate; ESRCH if quotas got
+	 * turned off suddenly.
+	 */
+	error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+			     doalloc | XFS_QMOPT_DOWARN, &dqp);
+	if (error)
+		return error;
+
+	trace_xfs_dqattach_get(dqp);
+
+	/*
+	 * dqget may have dropped and re-acquired the ilock, but it guarantees
+	 * that the dquot returned is the one that should go in the inode.
+	 */
+	*IO_idqpp = dqp;
+	xfs_dqunlock(dqp);
+	return 0;
+}
+
+static bool
+xfs_qm_need_dqattach(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return false;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return false;
+	if (!XFS_NOT_DQATTACHED(mp, ip))
+		return false;
+	if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
+		return false;
+	return true;
+}
+
+/*
+ * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
+ * into account.
+ * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * Inode may get unlocked and relocked in here, and the caller must deal with
+ * the consequences.
+ */
+int
+xfs_qm_dqattach_locked(
+	xfs_inode_t	*ip,
+	uint		flags)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	int		error = 0;
+
+	if (!xfs_qm_need_dqattach(ip))
+		return 0;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
+		error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
+						flags & XFS_QMOPT_DQALLOC,
+						&ip->i_udquot);
+		if (error)
+			goto done;
+		ASSERT(ip->i_udquot);
+	}
+
+	if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
+		error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+						flags & XFS_QMOPT_DQALLOC,
+						&ip->i_gdquot);
+		if (error)
+			goto done;
+		ASSERT(ip->i_gdquot);
+	}
+
+	if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
+		error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+						flags & XFS_QMOPT_DQALLOC,
+						&ip->i_pdquot);
+		if (error)
+			goto done;
+		ASSERT(ip->i_pdquot);
+	}
+
+done:
+	/*
+	 * Don't worry about the dquots that we may have attached before any
+	 * error - they'll get detached later if it has not already been done.
+	 */
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	return error;
+}
+
+int
+xfs_qm_dqattach(
+	struct xfs_inode	*ip,
+	uint			flags)
+{
+	int			error;
+
+	if (!xfs_qm_need_dqattach(ip))
+		return 0;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_qm_dqattach_locked(ip, flags);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
+/*
+ * Release dquots (and their references) if any.
+ * The inode should be locked EXCL except when this's called by
+ * xfs_ireclaim.
+ */
+void
+xfs_qm_dqdetach(
+	xfs_inode_t	*ip)
+{
+	if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
+		return;
+
+	trace_xfs_dquot_dqdetach(ip);
+
+	ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
+	if (ip->i_udquot) {
+		xfs_qm_dqrele(ip->i_udquot);
+		ip->i_udquot = NULL;
+	}
+	if (ip->i_gdquot) {
+		xfs_qm_dqrele(ip->i_gdquot);
+		ip->i_gdquot = NULL;
+	}
+	if (ip->i_pdquot) {
+		xfs_qm_dqrele(ip->i_pdquot);
+		ip->i_pdquot = NULL;
+	}
+}
+
+struct xfs_qm_isolate {
+	struct list_head	buffers;
+	struct list_head	dispose;
+};
+
+static enum lru_status
+xfs_qm_dquot_isolate(
+	struct list_head	*item,
+	struct list_lru_one	*lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+		__releases(lru_lock) __acquires(lru_lock)
+{
+	struct xfs_dquot	*dqp = container_of(item,
+						struct xfs_dquot, q_lru);
+	struct xfs_qm_isolate	*isol = arg;
+
+	if (!xfs_dqlock_nowait(dqp))
+		goto out_miss_busy;
+
+	/*
+	 * This dquot has acquired a reference in the meantime remove it from
+	 * the freelist and try again.
+	 */
+	if (dqp->q_nrefs) {
+		xfs_dqunlock(dqp);
+		XFS_STATS_INC(xs_qm_dqwants);
+
+		trace_xfs_dqreclaim_want(dqp);
+		list_lru_isolate(lru, &dqp->q_lru);
+		XFS_STATS_DEC(xs_qm_dquot_unused);
+		return LRU_REMOVED;
+	}
+
+	/*
+	 * If the dquot is dirty, flush it. If it's already being flushed, just
+	 * skip it so there is time for the IO to complete before we try to
+	 * reclaim it again on the next LRU pass.
+	 */
+	if (!xfs_dqflock_nowait(dqp)) {
+		xfs_dqunlock(dqp);
+		goto out_miss_busy;
+	}
+
+	if (XFS_DQ_IS_DIRTY(dqp)) {
+		struct xfs_buf	*bp = NULL;
+		int		error;
+
+		trace_xfs_dqreclaim_dirty(dqp);
+
+		/* we have to drop the LRU lock to flush the dquot */
+		spin_unlock(lru_lock);
+
+		error = xfs_qm_dqflush(dqp, &bp);
+		if (error) {
+			xfs_warn(dqp->q_mount, "%s: dquot %p flush failed",
+				 __func__, dqp);
+			goto out_unlock_dirty;
+		}
+
+		xfs_buf_delwri_queue(bp, &isol->buffers);
+		xfs_buf_relse(bp);
+		goto out_unlock_dirty;
+	}
+	xfs_dqfunlock(dqp);
+
+	/*
+	 * Prevent lookups now that we are past the point of no return.
+	 */
+	dqp->dq_flags |= XFS_DQ_FREEING;
+	xfs_dqunlock(dqp);
+
+	ASSERT(dqp->q_nrefs == 0);
+	list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
+	XFS_STATS_DEC(xs_qm_dquot_unused);
+	trace_xfs_dqreclaim_done(dqp);
+	XFS_STATS_INC(xs_qm_dqreclaims);
+	return LRU_REMOVED;
+
+out_miss_busy:
+	trace_xfs_dqreclaim_busy(dqp);
+	XFS_STATS_INC(xs_qm_dqreclaim_misses);
+	return LRU_SKIP;
+
+out_unlock_dirty:
+	trace_xfs_dqreclaim_busy(dqp);
+	XFS_STATS_INC(xs_qm_dqreclaim_misses);
+	xfs_dqunlock(dqp);
+	spin_lock(lru_lock);
+	return LRU_RETRY;
+}
+
+static unsigned long
+xfs_qm_shrink_scan(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_quotainfo	*qi = container_of(shrink,
+					struct xfs_quotainfo, qi_shrinker);
+	struct xfs_qm_isolate	isol;
+	unsigned long		freed;
+	int			error;
+
+	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+		return 0;
+
+	INIT_LIST_HEAD(&isol.buffers);
+	INIT_LIST_HEAD(&isol.dispose);
+
+	freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+				     xfs_qm_dquot_isolate, &isol);
+
+	error = xfs_buf_delwri_submit(&isol.buffers);
+	if (error)
+		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
+	while (!list_empty(&isol.dispose)) {
+		struct xfs_dquot	*dqp;
+
+		dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
+		list_del_init(&dqp->q_lru);
+		xfs_qm_dqfree_one(dqp);
+	}
+
+	return freed;
+}
+
+static unsigned long
+xfs_qm_shrink_count(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_quotainfo	*qi = container_of(shrink,
+					struct xfs_quotainfo, qi_shrinker);
+
+	return list_lru_shrink_count(&qi->qi_lru, sc);
+}
+
+/*
+ * This initializes all the quota information that's kept in the
+ * mount structure
+ */
+STATIC int
+xfs_qm_init_quotainfo(
+	xfs_mount_t	*mp)
+{
+	xfs_quotainfo_t *qinf;
+	int		error;
+	xfs_dquot_t	*dqp;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+
+	error = list_lru_init(&qinf->qi_lru);
+	if (error)
+		goto out_free_qinf;
+
+	/*
+	 * See if quotainodes are setup, and if not, allocate them,
+	 * and change the superblock accordingly.
+	 */
+	error = xfs_qm_init_quotainos(mp);
+	if (error)
+		goto out_free_lru;
+
+	INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
+	INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+	INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
+	mutex_init(&qinf->qi_tree_lock);
+
+	/* mutex used to serialize quotaoffs */
+	mutex_init(&qinf->qi_quotaofflock);
+
+	/* Precalc some constants */
+	qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+	qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
+
+	mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
+
+	/*
+	 * We try to get the limits from the superuser's limits fields.
+	 * This is quite hacky, but it is standard quota practice.
+	 *
+	 * We look at the USR dquot with id == 0 first, but if user quotas
+	 * are not enabled we goto the GRP dquot with id == 0.
+	 * We don't really care to keep separate default limits for user
+	 * and group quotas, at least not at this point.
+	 *
+	 * Since we may not have done a quotacheck by this point, just read
+	 * the dquot without attaching it to any hashtables or lists.
+	 */
+	error = xfs_qm_dqread(mp, 0,
+			XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
+			 (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
+			  XFS_DQ_PROJ),
+			XFS_QMOPT_DOWARN, &dqp);
+	if (!error) {
+		xfs_disk_dquot_t	*ddqp = &dqp->q_core;
+
+		/*
+		 * The warnings and timers set the grace period given to
+		 * a user or group before he or she can not perform any
+		 * more writing. If it is zero, a default is used.
+		 */
+		qinf->qi_btimelimit = ddqp->d_btimer ?
+			be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
+		qinf->qi_itimelimit = ddqp->d_itimer ?
+			be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
+		qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
+			be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
+		qinf->qi_bwarnlimit = ddqp->d_bwarns ?
+			be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
+		qinf->qi_iwarnlimit = ddqp->d_iwarns ?
+			be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
+		qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
+			be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
+		qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+		qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+		qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+		qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+		qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+		qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+
+		xfs_qm_dqdestroy(dqp);
+	} else {
+		qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
+		qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
+		qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
+		qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
+		qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
+		qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+	}
+
+	qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
+	qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
+	qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
+	register_shrinker(&qinf->qi_shrinker);
+	return 0;
+
+out_free_lru:
+	list_lru_destroy(&qinf->qi_lru);
+out_free_qinf:
+	kmem_free(qinf);
+	mp->m_quotainfo = NULL;
+	return error;
+}
+
+
+/*
+ * Gets called when unmounting a filesystem or when all quotas get
+ * turned off.
+ * This purges the quota inodes, destroys locks and frees itself.
+ */
+void
+xfs_qm_destroy_quotainfo(
+	xfs_mount_t	*mp)
+{
+	xfs_quotainfo_t *qi;
+
+	qi = mp->m_quotainfo;
+	ASSERT(qi != NULL);
+
+	unregister_shrinker(&qi->qi_shrinker);
+	list_lru_destroy(&qi->qi_lru);
+
+	if (qi->qi_uquotaip) {
+		IRELE(qi->qi_uquotaip);
+		qi->qi_uquotaip = NULL; /* paranoia */
+	}
+	if (qi->qi_gquotaip) {
+		IRELE(qi->qi_gquotaip);
+		qi->qi_gquotaip = NULL;
+	}
+	if (qi->qi_pquotaip) {
+		IRELE(qi->qi_pquotaip);
+		qi->qi_pquotaip = NULL;
+	}
+	mutex_destroy(&qi->qi_quotaofflock);
+	kmem_free(qi);
+	mp->m_quotainfo = NULL;
+}
+
+/*
+ * Create an inode and return with a reference already taken, but unlocked
+ * This is how we create quota inodes
+ */
+STATIC int
+xfs_qm_qino_alloc(
+	xfs_mount_t	*mp,
+	xfs_inode_t	**ip,
+	uint		flags)
+{
+	xfs_trans_t	*tp;
+	int		error;
+	int		committed;
+	bool		need_alloc = true;
+
+	*ip = NULL;
+	/*
+	 * With superblock that doesn't have separate pquotino, we
+	 * share an inode between gquota and pquota. If the on-disk
+	 * superblock has GQUOTA and the filesystem is now mounted
+	 * with PQUOTA, just use sb_gquotino for sb_pquotino and
+	 * vice-versa.
+	 */
+	if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
+			(flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
+		xfs_ino_t ino = NULLFSINO;
+
+		if ((flags & XFS_QMOPT_PQUOTA) &&
+			     (mp->m_sb.sb_gquotino != NULLFSINO)) {
+			ino = mp->m_sb.sb_gquotino;
+			ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
+		} else if ((flags & XFS_QMOPT_GQUOTA) &&
+			     (mp->m_sb.sb_pquotino != NULLFSINO)) {
+			ino = mp->m_sb.sb_pquotino;
+			ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
+		}
+		if (ino != NULLFSINO) {
+			error = xfs_iget(mp, NULL, ino, 0, 0, ip);
+			if (error)
+				return error;
+			mp->m_sb.sb_gquotino = NULLFSINO;
+			mp->m_sb.sb_pquotino = NULLFSINO;
+			need_alloc = false;
+		}
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
+				  XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	if (need_alloc) {
+		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
+								&committed);
+		if (error) {
+			xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+					 XFS_TRANS_ABORT);
+			return error;
+		}
+	}
+
+	/*
+	 * Make the changes in the superblock, and log those too.
+	 * sbfields arg may contain fields other than *QUOTINO;
+	 * VERSIONNUM for example.
+	 */
+	spin_lock(&mp->m_sb_lock);
+	if (flags & XFS_QMOPT_SBVERSION) {
+		ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
+
+		xfs_sb_version_addquota(&mp->m_sb);
+		mp->m_sb.sb_uquotino = NULLFSINO;
+		mp->m_sb.sb_gquotino = NULLFSINO;
+		mp->m_sb.sb_pquotino = NULLFSINO;
+
+		/* qflags will get updated fully _after_ quotacheck */
+		mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
+	}
+	if (flags & XFS_QMOPT_UQUOTA)
+		mp->m_sb.sb_uquotino = (*ip)->i_ino;
+	else if (flags & XFS_QMOPT_GQUOTA)
+		mp->m_sb.sb_gquotino = (*ip)->i_ino;
+	else
+		mp->m_sb.sb_pquotino = (*ip)->i_ino;
+	spin_unlock(&mp->m_sb_lock);
+	xfs_log_sb(tp);
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_alert(mp, "%s failed (error %d)!", __func__, error);
+	}
+	if (need_alloc)
+		xfs_finish_inode_setup(*ip);
+	return error;
+}
+
+
+STATIC void
+xfs_qm_reset_dqcounts(
+	xfs_mount_t	*mp,
+	xfs_buf_t	*bp,
+	xfs_dqid_t	id,
+	uint		type)
+{
+	struct xfs_dqblk	*dqb;
+	int			j;
+
+	trace_xfs_reset_dqcounts(bp, _RET_IP_);
+
+	/*
+	 * Reset all counters and timers. They'll be
+	 * started afresh by xfs_qm_quotacheck.
+	 */
+#ifdef DEBUG
+	j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+	do_div(j, sizeof(xfs_dqblk_t));
+	ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
+#endif
+	dqb = bp->b_addr;
+	for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+		struct xfs_disk_dquot	*ddq;
+
+		ddq = (struct xfs_disk_dquot *)&dqb[j];
+
+		/*
+		 * Do a sanity check, and if needed, repair the dqblk. Don't
+		 * output any warnings because it's perfectly possible to
+		 * find uninitialised dquot blks. See comment in xfs_dqcheck.
+		 */
+		xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+			    "xfs_quotacheck");
+		/*
+		 * Reset type in case we are reusing group quota file for
+		 * project quotas or vice versa
+		 */
+		ddq->d_flags = type;
+		ddq->d_bcount = 0;
+		ddq->d_icount = 0;
+		ddq->d_rtbcount = 0;
+		ddq->d_btimer = 0;
+		ddq->d_itimer = 0;
+		ddq->d_rtbtimer = 0;
+		ddq->d_bwarns = 0;
+		ddq->d_iwarns = 0;
+		ddq->d_rtbwarns = 0;
+
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			xfs_update_cksum((char *)&dqb[j],
+					 sizeof(struct xfs_dqblk),
+					 XFS_DQUOT_CRC_OFF);
+		}
+	}
+}
+
+STATIC int
+xfs_qm_dqiter_bufs(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		firstid,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		blkcnt,
+	uint			flags,
+	struct list_head	*buffer_list)
+{
+	struct xfs_buf		*bp;
+	int			error;
+	int			type;
+
+	ASSERT(blkcnt > 0);
+	type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
+		(flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
+	error = 0;
+
+	/*
+	 * Blkcnt arg can be a very big number, and might even be
+	 * larger than the log itself. So, we have to break it up into
+	 * manageable-sized transactions.
+	 * Note that we don't start a permanent transaction here; we might
+	 * not be able to get a log reservation for the whole thing up front,
+	 * and we don't really care to either, because we just discard
+	 * everything if we were to crash in the middle of this loop.
+	 */
+	while (blkcnt--) {
+		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+			      XFS_FSB_TO_DADDR(mp, bno),
+			      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+			      &xfs_dquot_buf_ops);
+
+		/*
+		 * CRC and validation errors will return a EFSCORRUPTED here. If
+		 * this occurs, re-read without CRC validation so that we can
+		 * repair the damage via xfs_qm_reset_dqcounts(). This process
+		 * will leave a trace in the log indicating corruption has
+		 * been detected.
+		 */
+		if (error == -EFSCORRUPTED) {
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+				      XFS_FSB_TO_DADDR(mp, bno),
+				      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+				      NULL);
+		}
+
+		if (error)
+			break;
+
+		/*
+		 * A corrupt buffer might not have a verifier attached, so
+		 * make sure we have the correct one attached before writeback
+		 * occurs.
+		 */
+		bp->b_ops = &xfs_dquot_buf_ops;
+		xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+		xfs_buf_delwri_queue(bp, buffer_list);
+		xfs_buf_relse(bp);
+
+		/* goto the next block. */
+		bno++;
+		firstid += mp->m_quotainfo->qi_dqperchunk;
+	}
+
+	return error;
+}
+
+/*
+ * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
+ * caller supplied function for every chunk of dquots that we find.
+ */
+STATIC int
+xfs_qm_dqiterate(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*qip,
+	uint			flags,
+	struct list_head	*buffer_list)
+{
+	struct xfs_bmbt_irec	*map;
+	int			i, nmaps;	/* number of map entries */
+	int			error;		/* return value */
+	xfs_fileoff_t		lblkno;
+	xfs_filblks_t		maxlblkcnt;
+	xfs_dqid_t		firstid;
+	xfs_fsblock_t		rablkno;
+	xfs_filblks_t		rablkcnt;
+
+	error = 0;
+	/*
+	 * This looks racy, but we can't keep an inode lock across a
+	 * trans_reserve. But, this gets called during quotacheck, and that
+	 * happens only at mount time which is single threaded.
+	 */
+	if (qip->i_d.di_nblocks == 0)
+		return 0;
+
+	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+
+	lblkno = 0;
+	maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+	do {
+		uint		lock_mode;
+
+		nmaps = XFS_DQITER_MAP_SIZE;
+		/*
+		 * We aren't changing the inode itself. Just changing
+		 * some of its data. No new blocks are added here, and
+		 * the inode is never added to the transaction.
+		 */
+		lock_mode = xfs_ilock_data_map_shared(qip);
+		error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
+				       map, &nmaps, 0);
+		xfs_iunlock(qip, lock_mode);
+		if (error)
+			break;
+
+		ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
+		for (i = 0; i < nmaps; i++) {
+			ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+			ASSERT(map[i].br_blockcount);
+
+
+			lblkno += map[i].br_blockcount;
+
+			if (map[i].br_startblock == HOLESTARTBLOCK)
+				continue;
+
+			firstid = (xfs_dqid_t) map[i].br_startoff *
+				mp->m_quotainfo->qi_dqperchunk;
+			/*
+			 * Do a read-ahead on the next extent.
+			 */
+			if ((i+1 < nmaps) &&
+			    (map[i+1].br_startblock != HOLESTARTBLOCK)) {
+				rablkcnt =  map[i+1].br_blockcount;
+				rablkno = map[i+1].br_startblock;
+				while (rablkcnt--) {
+					xfs_buf_readahead(mp->m_ddev_targp,
+					       XFS_FSB_TO_DADDR(mp, rablkno),
+					       mp->m_quotainfo->qi_dqchunklen,
+					       &xfs_dquot_buf_ops);
+					rablkno++;
+				}
+			}
+			/*
+			 * Iterate thru all the blks in the extent and
+			 * reset the counters of all the dquots inside them.
+			 */
+			error = xfs_qm_dqiter_bufs(mp, firstid,
+						   map[i].br_startblock,
+						   map[i].br_blockcount,
+						   flags, buffer_list);
+			if (error)
+				goto out;
+		}
+	} while (nmaps > 0);
+
+out:
+	kmem_free(map);
+	return error;
+}
+
+/*
+ * Called by dqusage_adjust in doing a quotacheck.
+ *
+ * Given the inode, and a dquot id this updates both the incore dqout as well
+ * as the buffer copy. This is so that once the quotacheck is done, we can
+ * just log all the buffers, as opposed to logging numerous updates to
+ * individual dquots.
+ */
+STATIC int
+xfs_qm_quotacheck_dqadjust(
+	struct xfs_inode	*ip,
+	xfs_dqid_t		id,
+	uint			type,
+	xfs_qcnt_t		nblks,
+	xfs_qcnt_t		rtblks)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_dquot	*dqp;
+	int			error;
+
+	error = xfs_qm_dqget(mp, ip, id, type,
+			     XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
+	if (error) {
+		/*
+		 * Shouldn't be able to turn off quotas here.
+		 */
+		ASSERT(error != -ESRCH);
+		ASSERT(error != -ENOENT);
+		return error;
+	}
+
+	trace_xfs_dqadjust(dqp);
+
+	/*
+	 * Adjust the inode count and the block count to reflect this inode's
+	 * resource usage.
+	 */
+	be64_add_cpu(&dqp->q_core.d_icount, 1);
+	dqp->q_res_icount++;
+	if (nblks) {
+		be64_add_cpu(&dqp->q_core.d_bcount, nblks);
+		dqp->q_res_bcount += nblks;
+	}
+	if (rtblks) {
+		be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
+		dqp->q_res_rtbcount += rtblks;
+	}
+
+	/*
+	 * Set default limits, adjust timers (since we changed usages)
+	 *
+	 * There are no timers for the default values set in the root dquot.
+	 */
+	if (dqp->q_core.d_id) {
+		xfs_qm_adjust_dqlimits(mp, dqp);
+		xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
+	}
+
+	dqp->dq_flags |= XFS_DQ_DIRTY;
+	xfs_qm_dqput(dqp);
+	return 0;
+}
+
+STATIC int
+xfs_qm_get_rtblks(
+	xfs_inode_t	*ip,
+	xfs_qcnt_t	*O_rtblks)
+{
+	xfs_filblks_t	rtblks;			/* total rt blks */
+	xfs_extnum_t	idx;			/* extent record index */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+	int		error;
+
+	ASSERT(XFS_IS_REALTIME_INODE(ip));
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
+			return error;
+	}
+	rtblks = 0;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (idx = 0; idx < nextents; idx++)
+		rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
+	*O_rtblks = (xfs_qcnt_t)rtblks;
+	return 0;
+}
+
+/*
+ * callback routine supplied to bulkstat(). Given an inumber, find its
+ * dquots and update them to account for resources taken by that inode.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqusage_adjust(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		__user *buffer,	/* not used */
+	int		ubsize,		/* not used */
+	int		*ubused,	/* not used */
+	int		*res)		/* result code value */
+{
+	xfs_inode_t	*ip;
+	xfs_qcnt_t	nblks, rtblks = 0;
+	int		error;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * rootino must have its resources accounted for, not so with the quota
+	 * inodes.
+	 */
+	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
+		*res = BULKSTAT_RV_NOTHING;
+		return -EINVAL;
+	}
+
+	/*
+	 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
+	 * interface expects the inode to be exclusively locked because that's
+	 * the case in all other instances. It's OK that we do this because
+	 * quotacheck is done only at mount time.
+	 */
+	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+	if (error) {
+		*res = BULKSTAT_RV_NOTHING;
+		return error;
+	}
+
+	ASSERT(ip->i_delayed_blks == 0);
+
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		/*
+		 * Walk thru the extent list and count the realtime blocks.
+		 */
+		error = xfs_qm_get_rtblks(ip, &rtblks);
+		if (error)
+			goto error0;
+	}
+
+	nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+
+	/*
+	 * Add the (disk blocks and inode) resources occupied by this
+	 * inode to its dquots. We do this adjustment in the incore dquot,
+	 * and also copy the changes to its buffer.
+	 * We don't care about putting these changes in a transaction
+	 * envelope because if we crash in the middle of a 'quotacheck'
+	 * we have to start from the beginning anyway.
+	 * Once we're done, we'll log all the dquot bufs.
+	 *
+	 * The *QUOTA_ON checks below may look pretty racy, but quotachecks
+	 * and quotaoffs don't race. (Quotachecks happen at mount time only).
+	 */
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
+						   XFS_DQ_USER, nblks, rtblks);
+		if (error)
+			goto error0;
+	}
+
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
+						   XFS_DQ_GROUP, nblks, rtblks);
+		if (error)
+			goto error0;
+	}
+
+	if (XFS_IS_PQUOTA_ON(mp)) {
+		error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
+						   XFS_DQ_PROJ, nblks, rtblks);
+		if (error)
+			goto error0;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
+	*res = BULKSTAT_RV_DIDONE;
+	return 0;
+
+error0:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
+	*res = BULKSTAT_RV_GIVEUP;
+	return error;
+}
+
+STATIC int
+xfs_qm_flush_one(
+	struct xfs_dquot	*dqp,
+	void			*data)
+{
+	struct list_head	*buffer_list = data;
+	struct xfs_buf		*bp = NULL;
+	int			error = 0;
+
+	xfs_dqlock(dqp);
+	if (dqp->dq_flags & XFS_DQ_FREEING)
+		goto out_unlock;
+	if (!XFS_DQ_IS_DIRTY(dqp))
+		goto out_unlock;
+
+	xfs_dqflock(dqp);
+	error = xfs_qm_dqflush(dqp, &bp);
+	if (error)
+		goto out_unlock;
+
+	xfs_buf_delwri_queue(bp, buffer_list);
+	xfs_buf_relse(bp);
+out_unlock:
+	xfs_dqunlock(dqp);
+	return error;
+}
+
+/*
+ * Walk thru all the filesystem inodes and construct a consistent view
+ * of the disk quota world. If the quotacheck fails, disable quotas.
+ */
+STATIC int
+xfs_qm_quotacheck(
+	xfs_mount_t	*mp)
+{
+	int			done, count, error, error2;
+	xfs_ino_t		lastino;
+	size_t			structsz;
+	uint			flags;
+	LIST_HEAD		(buffer_list);
+	struct xfs_inode	*uip = mp->m_quotainfo->qi_uquotaip;
+	struct xfs_inode	*gip = mp->m_quotainfo->qi_gquotaip;
+	struct xfs_inode	*pip = mp->m_quotainfo->qi_pquotaip;
+
+	count = INT_MAX;
+	structsz = 1;
+	lastino = 0;
+	flags = 0;
+
+	ASSERT(uip || gip || pip);
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	xfs_notice(mp, "Quotacheck needed: Please wait.");
+
+	/*
+	 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
+	 * their counters to zero. We need a clean slate.
+	 * We don't log our changes till later.
+	 */
+	if (uip) {
+		error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
+					 &buffer_list);
+		if (error)
+			goto error_return;
+		flags |= XFS_UQUOTA_CHKD;
+	}
+
+	if (gip) {
+		error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
+					 &buffer_list);
+		if (error)
+			goto error_return;
+		flags |= XFS_GQUOTA_CHKD;
+	}
+
+	if (pip) {
+		error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
+					 &buffer_list);
+		if (error)
+			goto error_return;
+		flags |= XFS_PQUOTA_CHKD;
+	}
+
+	do {
+		/*
+		 * Iterate thru all the inodes in the file system,
+		 * adjusting the corresponding dquot counters in core.
+		 */
+		error = xfs_bulkstat(mp, &lastino, &count,
+				     xfs_qm_dqusage_adjust,
+				     structsz, NULL, &done);
+		if (error)
+			break;
+
+	} while (!done);
+
+	/*
+	 * We've made all the changes that we need to make incore.  Flush them
+	 * down to disk buffers if everything was updated successfully.
+	 */
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
+					  &buffer_list);
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
+					   &buffer_list);
+		if (!error)
+			error = error2;
+	}
+	if (XFS_IS_PQUOTA_ON(mp)) {
+		error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
+					   &buffer_list);
+		if (!error)
+			error = error2;
+	}
+
+	error2 = xfs_buf_delwri_submit(&buffer_list);
+	if (!error)
+		error = error2;
+
+	/*
+	 * We can get this error if we couldn't do a dquot allocation inside
+	 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
+	 * dirty dquots that might be cached, we just want to get rid of them
+	 * and turn quotaoff. The dquots won't be attached to any of the inodes
+	 * at this point (because we intentionally didn't in dqget_noattach).
+	 */
+	if (error) {
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+		goto error_return;
+	}
+
+	/*
+	 * If one type of quotas is off, then it will lose its
+	 * quotachecked status, since we won't be doing accounting for
+	 * that type anymore.
+	 */
+	mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
+	mp->m_qflags |= flags;
+
+ error_return:
+	while (!list_empty(&buffer_list)) {
+		struct xfs_buf *bp =
+			list_first_entry(&buffer_list, struct xfs_buf, b_list);
+		list_del_init(&bp->b_list);
+		xfs_buf_relse(bp);
+	}
+
+	if (error) {
+		xfs_warn(mp,
+	"Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
+			error);
+		/*
+		 * We must turn off quotas.
+		 */
+		ASSERT(mp->m_quotainfo != NULL);
+		xfs_qm_destroy_quotainfo(mp);
+		if (xfs_mount_reset_sbqflags(mp)) {
+			xfs_warn(mp,
+				"Quotacheck: Failed to reset quota flags.");
+		}
+	} else
+		xfs_notice(mp, "Quotacheck: Done.");
+	return error;
+}
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo.  This is also responsible for
+ * running a quotacheck as necessary.  We are guaranteed that the superblock
+ * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to inidicate success or failure at all.
+ */
+void
+xfs_qm_mount_quotas(
+	struct xfs_mount	*mp)
+{
+	int			error = 0;
+	uint			sbf;
+
+	/*
+	 * If quotas on realtime volumes is not supported, we disable
+	 * quotas immediately.
+	 */
+	if (mp->m_sb.sb_rextents) {
+		xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * Allocate the quotainfo structure inside the mount struct, and
+	 * create quotainode(s), and change/rev superblock if necessary.
+	 */
+	error = xfs_qm_init_quotainfo(mp);
+	if (error) {
+		/*
+		 * We must turn off quotas.
+		 */
+		ASSERT(mp->m_quotainfo == NULL);
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+	/*
+	 * If any of the quotas are not consistent, do a quotacheck.
+	 */
+	if (XFS_QM_NEED_QUOTACHECK(mp)) {
+		error = xfs_qm_quotacheck(mp);
+		if (error) {
+			/* Quotacheck failed and disabled quotas. */
+			return;
+		}
+	}
+	/*
+	 * If one type of quotas is off, then it will lose its
+	 * quotachecked status, since we won't be doing accounting for
+	 * that type anymore.
+	 */
+	if (!XFS_IS_UQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+	if (!XFS_IS_GQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+	if (!XFS_IS_PQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_PQUOTA_CHKD;
+
+ write_changes:
+	/*
+	 * We actually don't have to acquire the m_sb_lock at all.
+	 * This can only be called from mount, and that's single threaded. XXX
+	 */
+	spin_lock(&mp->m_sb_lock);
+	sbf = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+	spin_unlock(&mp->m_sb_lock);
+
+	if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+		if (xfs_sync_sb(mp, false)) {
+			/*
+			 * We could only have been turning quotas off.
+			 * We aren't in very good shape actually because
+			 * the incore structures are convinced that quotas are
+			 * off, but the on disk superblock doesn't know that !
+			 */
+			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+			xfs_alert(mp, "%s: Superblock update failed!",
+				__func__);
+		}
+	}
+
+	if (error) {
+		xfs_warn(mp, "Failed to initialize disk quotas.");
+		return;
+	}
+}
+
+/*
+ * This is called after the superblock has been read in and we're ready to
+ * iget the quota inodes.
+ */
+STATIC int
+xfs_qm_init_quotainos(
+	xfs_mount_t	*mp)
+{
+	struct xfs_inode	*uip = NULL;
+	struct xfs_inode	*gip = NULL;
+	struct xfs_inode	*pip = NULL;
+	int			error;
+	uint			flags = 0;
+
+	ASSERT(mp->m_quotainfo);
+
+	/*
+	 * Get the uquota and gquota inodes
+	 */
+	if (xfs_sb_version_hasquota(&mp->m_sb)) {
+		if (XFS_IS_UQUOTA_ON(mp) &&
+		    mp->m_sb.sb_uquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_uquotino > 0);
+			error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+					     0, 0, &uip);
+			if (error)
+				return error;
+		}
+		if (XFS_IS_GQUOTA_ON(mp) &&
+		    mp->m_sb.sb_gquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_gquotino > 0);
+			error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+					     0, 0, &gip);
+			if (error)
+				goto error_rele;
+		}
+		if (XFS_IS_PQUOTA_ON(mp) &&
+		    mp->m_sb.sb_pquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_pquotino > 0);
+			error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+					     0, 0, &pip);
+			if (error)
+				goto error_rele;
+		}
+	} else {
+		flags |= XFS_QMOPT_SBVERSION;
+	}
+
+	/*
+	 * Create the three inodes, if they don't exist already. The changes
+	 * made above will get added to a transaction and logged in one of
+	 * the qino_alloc calls below.  If the device is readonly,
+	 * temporarily switch to read-write to do this.
+	 */
+	if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
+		error = xfs_qm_qino_alloc(mp, &uip,
+					      flags | XFS_QMOPT_UQUOTA);
+		if (error)
+			goto error_rele;
+
+		flags &= ~XFS_QMOPT_SBVERSION;
+	}
+	if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
+		error = xfs_qm_qino_alloc(mp, &gip,
+					  flags | XFS_QMOPT_GQUOTA);
+		if (error)
+			goto error_rele;
+
+		flags &= ~XFS_QMOPT_SBVERSION;
+	}
+	if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
+		error = xfs_qm_qino_alloc(mp, &pip,
+					  flags | XFS_QMOPT_PQUOTA);
+		if (error)
+			goto error_rele;
+	}
+
+	mp->m_quotainfo->qi_uquotaip = uip;
+	mp->m_quotainfo->qi_gquotaip = gip;
+	mp->m_quotainfo->qi_pquotaip = pip;
+
+	return 0;
+
+error_rele:
+	if (uip)
+		IRELE(uip);
+	if (gip)
+		IRELE(gip);
+	if (pip)
+		IRELE(pip);
+	return error;
+}
+
+STATIC void
+xfs_qm_dqfree_one(
+	struct xfs_dquot	*dqp)
+{
+	struct xfs_mount	*mp = dqp->q_mount;
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
+
+	mutex_lock(&qi->qi_tree_lock);
+	radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
+			  be32_to_cpu(dqp->q_core.d_id));
+
+	qi->qi_dquots--;
+	mutex_unlock(&qi->qi_tree_lock);
+
+	xfs_qm_dqdestroy(dqp);
+}
+
+/* --------------- utility functions for vnodeops ---------------- */
+
+
+/*
+ * Given an inode, a uid, gid and prid make sure that we have
+ * allocated relevant dquot(s) on disk, and that we won't exceed inode
+ * quotas by creating this file.
+ * This also attaches dquot(s) to the given inode after locking it,
+ * and returns the dquots corresponding to the uid and/or gid.
+ *
+ * in	: inode (unlocked)
+ * out	: udquot, gdquot with references taken and unlocked
+ */
+int
+xfs_qm_vop_dqalloc(
+	struct xfs_inode	*ip,
+	xfs_dqid_t		uid,
+	xfs_dqid_t		gid,
+	prid_t			prid,
+	uint			flags,
+	struct xfs_dquot	**O_udqpp,
+	struct xfs_dquot	**O_gdqpp,
+	struct xfs_dquot	**O_pdqpp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_dquot	*uq = NULL;
+	struct xfs_dquot	*gq = NULL;
+	struct xfs_dquot	*pq = NULL;
+	int			error;
+	uint			lockflags;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return 0;
+
+	lockflags = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, lockflags);
+
+	if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
+		gid = ip->i_d.di_gid;
+
+	/*
+	 * Attach the dquot(s) to this inode, doing a dquot allocation
+	 * if necessary. The dquot(s) will not be locked.
+	 */
+	if (XFS_NOT_DQATTACHED(mp, ip)) {
+		error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
+		if (error) {
+			xfs_iunlock(ip, lockflags);
+			return error;
+		}
+	}
+
+	if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
+		if (ip->i_d.di_uid != uid) {
+			/*
+			 * What we need is the dquot that has this uid, and
+			 * if we send the inode to dqget, the uid of the inode
+			 * takes priority over what's sent in the uid argument.
+			 * We must unlock inode here before calling dqget if
+			 * we're not sending the inode, because otherwise
+			 * we'll deadlock by doing trans_reserve while
+			 * holding ilock.
+			 */
+			xfs_iunlock(ip, lockflags);
+			error = xfs_qm_dqget(mp, NULL, uid,
+						 XFS_DQ_USER,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &uq);
+			if (error) {
+				ASSERT(error != -ENOENT);
+				return error;
+			}
+			/*
+			 * Get the ilock in the right order.
+			 */
+			xfs_dqunlock(uq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			/*
+			 * Take an extra reference, because we'll return
+			 * this to caller
+			 */
+			ASSERT(ip->i_udquot);
+			uq = xfs_qm_dqhold(ip->i_udquot);
+		}
+	}
+	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
+		if (ip->i_d.di_gid != gid) {
+			xfs_iunlock(ip, lockflags);
+			error = xfs_qm_dqget(mp, NULL, gid,
+						 XFS_DQ_GROUP,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &gq);
+			if (error) {
+				ASSERT(error != -ENOENT);
+				goto error_rele;
+			}
+			xfs_dqunlock(gq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			ASSERT(ip->i_gdquot);
+			gq = xfs_qm_dqhold(ip->i_gdquot);
+		}
+	}
+	if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+		if (xfs_get_projid(ip) != prid) {
+			xfs_iunlock(ip, lockflags);
+			error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+						 XFS_DQ_PROJ,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &pq);
+			if (error) {
+				ASSERT(error != -ENOENT);
+				goto error_rele;
+			}
+			xfs_dqunlock(pq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			ASSERT(ip->i_pdquot);
+			pq = xfs_qm_dqhold(ip->i_pdquot);
+		}
+	}
+	if (uq)
+		trace_xfs_dquot_dqalloc(ip);
+
+	xfs_iunlock(ip, lockflags);
+	if (O_udqpp)
+		*O_udqpp = uq;
+	else
+		xfs_qm_dqrele(uq);
+	if (O_gdqpp)
+		*O_gdqpp = gq;
+	else
+		xfs_qm_dqrele(gq);
+	if (O_pdqpp)
+		*O_pdqpp = pq;
+	else
+		xfs_qm_dqrele(pq);
+	return 0;
+
+error_rele:
+	xfs_qm_dqrele(gq);
+	xfs_qm_dqrele(uq);
+	return error;
+}
+
+/*
+ * Actually transfer ownership, and do dquot modifications.
+ * These were already reserved.
+ */
+xfs_dquot_t *
+xfs_qm_vop_chown(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dquot_t	**IO_olddq,
+	xfs_dquot_t	*newdq)
+{
+	xfs_dquot_t	*prevdq;
+	uint		bfield = XFS_IS_REALTIME_INODE(ip) ?
+				 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
+
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+
+	/* old dquot */
+	prevdq = *IO_olddq;
+	ASSERT(prevdq);
+	ASSERT(prevdq != newdq);
+
+	xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
+	xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+
+	/* the sparkling new dquot */
+	xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
+	xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+
+	/*
+	 * Take an extra reference, because the inode is going to keep
+	 * this dquot pointer even after the trans_commit.
+	 */
+	*IO_olddq = xfs_qm_dqhold(newdq);
+
+	return prevdq;
+}
+
+/*
+ * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
+ */
+int
+xfs_qm_vop_chown_reserve(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	struct xfs_dquot	*pdqp,
+	uint			flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			delblks, blkflags, prjflags = 0;
+	struct xfs_dquot	*udq_unres = NULL;
+	struct xfs_dquot	*gdq_unres = NULL;
+	struct xfs_dquot	*pdq_unres = NULL;
+	struct xfs_dquot	*udq_delblks = NULL;
+	struct xfs_dquot	*gdq_delblks = NULL;
+	struct xfs_dquot	*pdq_delblks = NULL;
+	int			error;
+
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	delblks = ip->i_delayed_blks;
+	blkflags = XFS_IS_REALTIME_INODE(ip) ?
+			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
+
+	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
+	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
+		udq_delblks = udqp;
+		/*
+		 * If there are delayed allocation blocks, then we have to
+		 * unreserve those from the old dquot, and add them to the
+		 * new dquot.
+		 */
+		if (delblks) {
+			ASSERT(ip->i_udquot);
+			udq_unres = ip->i_udquot;
+		}
+	}
+	if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
+	    ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
+		gdq_delblks = gdqp;
+		if (delblks) {
+			ASSERT(ip->i_gdquot);
+			gdq_unres = ip->i_gdquot;
+		}
+	}
+
+	if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
+	    xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
+		prjflags = XFS_QMOPT_ENOSPC;
+		pdq_delblks = pdqp;
+		if (delblks) {
+			ASSERT(ip->i_pdquot);
+			pdq_unres = ip->i_pdquot;
+		}
+	}
+
+	error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+				udq_delblks, gdq_delblks, pdq_delblks,
+				ip->i_d.di_nblocks, 1,
+				flags | blkflags | prjflags);
+	if (error)
+		return error;
+
+	/*
+	 * Do the delayed blks reservations/unreservations now. Since, these
+	 * are done without the help of a transaction, if a reservation fails
+	 * its previous reservations won't be automatically undone by trans
+	 * code. So, we have to do it manually here.
+	 */
+	if (delblks) {
+		/*
+		 * Do the reservations first. Unreservation can't fail.
+		 */
+		ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
+		ASSERT(udq_unres || gdq_unres || pdq_unres);
+		error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+			    udq_delblks, gdq_delblks, pdq_delblks,
+			    (xfs_qcnt_t)delblks, 0,
+			    flags | blkflags | prjflags);
+		if (error)
+			return error;
+		xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+				udq_unres, gdq_unres, pdq_unres,
+				-((xfs_qcnt_t)delblks), 0, blkflags);
+	}
+
+	return 0;
+}
+
+int
+xfs_qm_vop_rename_dqattach(
+	struct xfs_inode	**i_tab)
+{
+	struct xfs_mount	*mp = i_tab[0]->i_mount;
+	int			i;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return 0;
+
+	for (i = 0; (i < 4 && i_tab[i]); i++) {
+		struct xfs_inode	*ip = i_tab[i];
+		int			error;
+
+		/*
+		 * Watch out for duplicate entries in the table.
+		 */
+		if (i == 0 || ip != i_tab[i-1]) {
+			if (XFS_NOT_DQATTACHED(mp, ip)) {
+				error = xfs_qm_dqattach(ip, 0);
+				if (error)
+					return error;
+			}
+		}
+	}
+	return 0;
+}
+
+void
+xfs_qm_vop_create_dqattach(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	struct xfs_dquot	*pdqp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	if (udqp && XFS_IS_UQUOTA_ON(mp)) {
+		ASSERT(ip->i_udquot == NULL);
+		ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+
+		ip->i_udquot = xfs_qm_dqhold(udqp);
+		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
+	}
+	if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
+		ASSERT(ip->i_gdquot == NULL);
+		ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
+		ip->i_gdquot = xfs_qm_dqhold(gdqp);
+		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
+	}
+	if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
+		ASSERT(ip->i_pdquot == NULL);
+		ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
+
+		ip->i_pdquot = xfs_qm_dqhold(pdqp);
+		xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
+	}
+}
+
diff --git a/kernel/fs/xfs/xfs_qm.h b/kernel/fs/xfs/xfs_qm.h
new file mode 100644
index 000000000..996a04064
--- /dev/null
+++ b/kernel/fs/xfs/xfs_qm.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QM_H__
+#define __XFS_QM_H__
+
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+
+struct xfs_inode;
+
+extern struct kmem_zone	*xfs_qm_dqtrxzone;
+
+/*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE	10
+
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+	!dqp->q_core.d_blk_hardlimit && \
+	!dqp->q_core.d_blk_softlimit && \
+	!dqp->q_core.d_rtb_hardlimit && \
+	!dqp->q_core.d_rtb_softlimit && \
+	!dqp->q_core.d_ino_hardlimit && \
+	!dqp->q_core.d_ino_softlimit && \
+	!dqp->q_core.d_bcount && \
+	!dqp->q_core.d_rtbcount && \
+	!dqp->q_core.d_icount)
+
+/*
+ * This defines the unit of allocation of dquots.
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ * XXXsup However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB	(xfs_filblks_t)1
+
+/*
+ * Various quota information for individual filesystems.
+ * The mount structure keeps a pointer to this.
+ */
+typedef struct xfs_quotainfo {
+	struct radix_tree_root qi_uquota_tree;
+	struct radix_tree_root qi_gquota_tree;
+	struct radix_tree_root qi_pquota_tree;
+	struct mutex qi_tree_lock;
+	struct xfs_inode	*qi_uquotaip;	/* user quota inode */
+	struct xfs_inode	*qi_gquotaip;	/* group quota inode */
+	struct xfs_inode	*qi_pquotaip;	/* project quota inode */
+	struct list_lru	 qi_lru;
+	int		 qi_dquots;
+	time_t		 qi_btimelimit;	 /* limit for blks timer */
+	time_t		 qi_itimelimit;	 /* limit for inodes timer */
+	time_t		 qi_rtbtimelimit;/* limit for rt blks timer */
+	xfs_qwarncnt_t	 qi_bwarnlimit;	 /* limit for blks warnings */
+	xfs_qwarncnt_t	 qi_iwarnlimit;	 /* limit for inodes warnings */
+	xfs_qwarncnt_t	 qi_rtbwarnlimit;/* limit for rt blks warnings */
+	struct mutex	 qi_quotaofflock;/* to serialize quotaoff */
+	xfs_filblks_t	 qi_dqchunklen;	 /* # BBs in a chunk of dqs */
+	uint		 qi_dqperchunk;	 /* # ondisk dqs in above chunk */
+	xfs_qcnt_t	 qi_bhardlimit;	 /* default data blk hard limit */
+	xfs_qcnt_t	 qi_bsoftlimit;	 /* default data blk soft limit */
+	xfs_qcnt_t	 qi_ihardlimit;	 /* default inode count hard limit */
+	xfs_qcnt_t	 qi_isoftlimit;	 /* default inode count soft limit */
+	xfs_qcnt_t	 qi_rtbhardlimit;/* default realtime blk hard limit */
+	xfs_qcnt_t	 qi_rtbsoftlimit;/* default realtime blk soft limit */
+	struct shrinker  qi_shrinker;
+} xfs_quotainfo_t;
+
+static inline struct radix_tree_root *
+xfs_dquot_tree(
+	struct xfs_quotainfo	*qi,
+	int			type)
+{
+	switch (type) {
+	case XFS_DQ_USER:
+		return &qi->qi_uquota_tree;
+	case XFS_DQ_GROUP:
+		return &qi->qi_gquota_tree;
+	case XFS_DQ_PROJ:
+		return &qi->qi_pquota_tree;
+	default:
+		ASSERT(0);
+	}
+	return NULL;
+}
+
+static inline struct xfs_inode *
+xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+{
+	switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+	case XFS_DQ_USER:
+		return dqp->q_mount->m_quotainfo->qi_uquotaip;
+	case XFS_DQ_GROUP:
+		return dqp->q_mount->m_quotainfo->qi_gquotaip;
+	case XFS_DQ_PROJ:
+		return dqp->q_mount->m_quotainfo->qi_pquotaip;
+	default:
+		ASSERT(0);
+	}
+	return NULL;
+}
+
+extern void	xfs_trans_mod_dquot(struct xfs_trans *,
+					struct xfs_dquot *, uint, long);
+extern int	xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+			struct xfs_mount *, struct xfs_dquot *,
+			struct xfs_dquot *, struct xfs_dquot *,
+			long, long, uint);
+extern void	xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
+extern void	xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
+
+/*
+ * We keep the usr, grp, and prj dquots separately so that locking will be
+ * easier to do at commit time. All transactions that we know of at this point
+ * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
+ */
+enum {
+	XFS_QM_TRANS_USR = 0,
+	XFS_QM_TRANS_GRP,
+	XFS_QM_TRANS_PRJ,
+	XFS_QM_TRANS_DQTYPES
+};
+#define XFS_QM_TRANS_MAXDQS		2
+struct xfs_dquot_acct {
+	struct xfs_dqtrx	dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
+};
+
+/*
+ * Users are allowed to have a usage exceeding their softlimit for
+ * a period this long.
+ */
+#define XFS_QM_BTIMELIMIT	(7 * 24*60*60)          /* 1 week */
+#define XFS_QM_RTBTIMELIMIT	(7 * 24*60*60)          /* 1 week */
+#define XFS_QM_ITIMELIMIT	(7 * 24*60*60)          /* 1 week */
+
+#define XFS_QM_BWARNLIMIT	5
+#define XFS_QM_IWARNLIMIT	5
+#define XFS_QM_RTBWARNLIMIT	5
+
+extern void		xfs_qm_destroy_quotainfo(struct xfs_mount *);
+
+/* dquot stuff */
+extern void		xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void		xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
+
+/* quota ops */
+extern int		xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
+extern int		xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+					uint, struct qc_dqblk *);
+extern int		xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
+					struct qc_dqblk *);
+extern int		xfs_qm_scall_quotaon(struct xfs_mount *, uint);
+extern int		xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
+
+#endif /* __XFS_QM_H__ */
diff --git a/kernel/fs/xfs/xfs_qm_bhv.c b/kernel/fs/xfs/xfs_qm_bhv.c
new file mode 100644
index 000000000..3e52d5de7
--- /dev/null
+++ b/kernel/fs/xfs/xfs_qm_bhv.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+
+
+STATIC void
+xfs_fill_statvfs_from_dquot(
+	struct kstatfs		*statp,
+	struct xfs_dquot	*dqp)
+{
+	__uint64_t		limit;
+
+	limit = dqp->q_core.d_blk_softlimit ?
+		be64_to_cpu(dqp->q_core.d_blk_softlimit) :
+		be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+	if (limit && statp->f_blocks > limit) {
+		statp->f_blocks = limit;
+		statp->f_bfree = statp->f_bavail =
+			(statp->f_blocks > dqp->q_res_bcount) ?
+			 (statp->f_blocks - dqp->q_res_bcount) : 0;
+	}
+
+	limit = dqp->q_core.d_ino_softlimit ?
+		be64_to_cpu(dqp->q_core.d_ino_softlimit) :
+		be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+	if (limit && statp->f_files > limit) {
+		statp->f_files = limit;
+		statp->f_ffree =
+			(statp->f_files > dqp->q_res_icount) ?
+			 (statp->f_ffree - dqp->q_res_icount) : 0;
+	}
+}
+
+
+/*
+ * Directory tree accounting is implemented using project quotas, where
+ * the project identifier is inherited from parent directories.
+ * A statvfs (df, etc.) of a directory that is using project quota should
+ * return a statvfs of the project, not the entire filesystem.
+ * This makes such trees appear as if they are filesystems in themselves.
+ */
+void
+xfs_qm_statvfs(
+	xfs_inode_t		*ip,
+	struct kstatfs		*statp)
+{
+	xfs_mount_t		*mp = ip->i_mount;
+	xfs_dquot_t		*dqp;
+
+	if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
+		xfs_fill_statvfs_from_dquot(statp, dqp);
+		xfs_qm_dqput(dqp);
+	}
+}
+
+int
+xfs_qm_newmount(
+	xfs_mount_t	*mp,
+	uint		*needquotamount,
+	uint		*quotaflags)
+{
+	uint		quotaondisk;
+	uint		uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
+
+	quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
+				(mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
+
+	if (quotaondisk) {
+		uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
+		pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT;
+		gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
+	}
+
+	/*
+	 * If the device itself is read-only, we can't allow
+	 * the user to change the state of quota on the mount -
+	 * this would generate a transaction on the ro device,
+	 * which would lead to an I/O error and shutdown
+	 */
+
+	if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
+	    (!uquotaondisk &&  XFS_IS_UQUOTA_ON(mp)) ||
+	     (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
+	    (!gquotaondisk &&  XFS_IS_GQUOTA_ON(mp)) ||
+	     (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
+	    (!pquotaondisk &&  XFS_IS_PQUOTA_ON(mp)))  &&
+	    xfs_dev_is_read_only(mp, "changing quota state")) {
+		xfs_warn(mp, "please mount with%s%s%s%s.",
+			(!quotaondisk ? "out quota" : ""),
+			(uquotaondisk ? " usrquota" : ""),
+			(gquotaondisk ? " grpquota" : ""),
+			(pquotaondisk ? " prjquota" : ""));
+		return -EPERM;
+	}
+
+	if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
+		/*
+		 * Call mount_quotas at this point only if we won't have to do
+		 * a quotacheck.
+		 */
+		if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
+			/*
+			 * If an error occurred, qm_mount_quotas code
+			 * has already disabled quotas. So, just finish
+			 * mounting, and get on with the boring life
+			 * without disk quotas.
+			 */
+			xfs_qm_mount_quotas(mp);
+		} else {
+			/*
+			 * Clear the quota flags, but remember them. This
+			 * is so that the quota code doesn't get invoked
+			 * before we're ready. This can happen when an
+			 * inode goes inactive and wants to free blocks,
+			 * or via xfs_log_mount_finish.
+			 */
+			*needquotamount = true;
+			*quotaflags = mp->m_qflags;
+			mp->m_qflags = 0;
+		}
+	}
+
+	return 0;
+}
diff --git a/kernel/fs/xfs/xfs_qm_syscalls.c b/kernel/fs/xfs/xfs_qm_syscalls.c
new file mode 100644
index 000000000..9a25c9275
--- /dev/null
+++ b/kernel/fs/xfs/xfs_qm_syscalls.c
@@ -0,0 +1,770 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/capability.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+
+STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
+STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
+					uint);
+
+/*
+ * Turn off quota accounting and/or enforcement for all udquots and/or
+ * gdquots. Called only at unmount time.
+ *
+ * This assumes that there are no dquots of this file system cached
+ * incore, and modifies the ondisk dquot directly. Therefore, for example,
+ * it is an error to call this twice, without purging the cache.
+ */
+int
+xfs_qm_scall_quotaoff(
+	xfs_mount_t		*mp,
+	uint			flags)
+{
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	uint			dqtype;
+	int			error;
+	uint			inactivate_flags;
+	xfs_qoff_logitem_t	*qoffstart;
+
+	/*
+	 * No file system can have quotas enabled on disk but not in core.
+	 * Note that quota utilities (like quotaoff) _expect_
+	 * errno == -EEXIST here.
+	 */
+	if ((mp->m_qflags & flags) == 0)
+		return -EEXIST;
+	error = 0;
+
+	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+
+	/*
+	 * We don't want to deal with two quotaoffs messing up each other,
+	 * so we're going to serialize it. quotaoff isn't exactly a performance
+	 * critical thing.
+	 * If quotaoff, then we must be dealing with the root filesystem.
+	 */
+	ASSERT(q);
+	mutex_lock(&q->qi_quotaofflock);
+
+	/*
+	 * If we're just turning off quota enforcement, change mp and go.
+	 */
+	if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
+		mp->m_qflags &= ~(flags);
+
+		spin_lock(&mp->m_sb_lock);
+		mp->m_sb.sb_qflags = mp->m_qflags;
+		spin_unlock(&mp->m_sb_lock);
+		mutex_unlock(&q->qi_quotaofflock);
+
+		/* XXX what to do if error ? Revert back to old vals incore ? */
+		return xfs_sync_sb(mp, false);
+	}
+
+	dqtype = 0;
+	inactivate_flags = 0;
+	/*
+	 * If accounting is off, we must turn enforcement off, clear the
+	 * quota 'CHKD' certificate to make it known that we have to
+	 * do a quotacheck the next time this quota is turned on.
+	 */
+	if (flags & XFS_UQUOTA_ACCT) {
+		dqtype |= XFS_QMOPT_UQUOTA;
+		flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
+		inactivate_flags |= XFS_UQUOTA_ACTIVE;
+	}
+	if (flags & XFS_GQUOTA_ACCT) {
+		dqtype |= XFS_QMOPT_GQUOTA;
+		flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
+		inactivate_flags |= XFS_GQUOTA_ACTIVE;
+	}
+	if (flags & XFS_PQUOTA_ACCT) {
+		dqtype |= XFS_QMOPT_PQUOTA;
+		flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
+		inactivate_flags |= XFS_PQUOTA_ACTIVE;
+	}
+
+	/*
+	 * Nothing to do?  Don't complain. This happens when we're just
+	 * turning off quota enforcement.
+	 */
+	if ((mp->m_qflags & flags) == 0)
+		goto out_unlock;
+
+	/*
+	 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
+	 * and synchronously. If we fail to write, we should abort the
+	 * operation as it cannot be recovered safely if we crash.
+	 */
+	error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
+	 * to take care of the race between dqget and quotaoff. We don't take
+	 * any special locks to reset these bits. All processes need to check
+	 * these bits *after* taking inode lock(s) to see if the particular
+	 * quota type is in the process of being turned off. If *ACTIVE, it is
+	 * guaranteed that all dquot structures and all quotainode ptrs will all
+	 * stay valid as long as that inode is kept locked.
+	 *
+	 * There is no turning back after this.
+	 */
+	mp->m_qflags &= ~inactivate_flags;
+
+	/*
+	 * Give back all the dquot reference(s) held by inodes.
+	 * Here we go thru every single incore inode in this file system, and
+	 * do a dqrele on the i_udquot/i_gdquot that it may have.
+	 * Essentially, as long as somebody has an inode locked, this guarantees
+	 * that quotas will not be turned off. This is handy because in a
+	 * transaction once we lock the inode(s) and check for quotaon, we can
+	 * depend on the quota inodes (and other things) being valid as long as
+	 * we keep the lock(s).
+	 */
+	xfs_qm_dqrele_all_inodes(mp, flags);
+
+	/*
+	 * Next we make the changes in the quota flag in the mount struct.
+	 * This isn't protected by a particular lock directly, because we
+	 * don't want to take a mrlock every time we depend on quotas being on.
+	 */
+	mp->m_qflags &= ~flags;
+
+	/*
+	 * Go through all the dquots of this file system and purge them,
+	 * according to what was turned off.
+	 */
+	xfs_qm_dqpurge_all(mp, dqtype);
+
+	/*
+	 * Transactions that had started before ACTIVE state bit was cleared
+	 * could have logged many dquots, so they'd have higher LSNs than
+	 * the first QUOTAOFF log record does. If we happen to crash when
+	 * the tail of the log has gone past the QUOTAOFF record, but
+	 * before the last dquot modification, those dquots __will__
+	 * recover, and that's not good.
+	 *
+	 * So, we have QUOTAOFF start and end logitems; the start
+	 * logitem won't get overwritten until the end logitem appears...
+	 */
+	error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+	if (error) {
+		/* We're screwed now. Shutdown is the only option. */
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		goto out_unlock;
+	}
+
+	/*
+	 * If all quotas are completely turned off, close shop.
+	 */
+	if (mp->m_qflags == 0) {
+		mutex_unlock(&q->qi_quotaofflock);
+		xfs_qm_destroy_quotainfo(mp);
+		return 0;
+	}
+
+	/*
+	 * Release our quotainode references if we don't need them anymore.
+	 */
+	if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
+		IRELE(q->qi_uquotaip);
+		q->qi_uquotaip = NULL;
+	}
+	if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
+		IRELE(q->qi_gquotaip);
+		q->qi_gquotaip = NULL;
+	}
+	if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
+		IRELE(q->qi_pquotaip);
+		q->qi_pquotaip = NULL;
+	}
+
+out_unlock:
+	mutex_unlock(&q->qi_quotaofflock);
+	return error;
+}
+
+STATIC int
+xfs_qm_scall_trunc_qfile(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct xfs_inode	*ip;
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (ino == NULLFSINO)
+		return 0;
+
+	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		goto out_put;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	ip->i_d.di_size = 0;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+	if (error) {
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+				     XFS_TRANS_ABORT);
+		goto out_unlock;
+	}
+
+	ASSERT(ip->i_d.di_nextents == 0);
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_put:
+	IRELE(ip);
+	return error;
+}
+
+int
+xfs_qm_scall_trunc_qfiles(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int		error = -EINVAL;
+
+	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
+	    (flags & ~XFS_DQ_ALLTYPES)) {
+		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
+			__func__, flags, mp->m_qflags);
+		return -EINVAL;
+	}
+
+	if (flags & XFS_DQ_USER) {
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
+		if (error)
+			return error;
+	}
+	if (flags & XFS_DQ_GROUP) {
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
+	if (flags & XFS_DQ_PROJ)
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
+
+	return error;
+}
+
+/*
+ * Switch on (a given) quota enforcement for a filesystem.  This takes
+ * effect immediately.
+ * (Switching on quota accounting must be done at mount time.)
+ */
+int
+xfs_qm_scall_quotaon(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int		error;
+	uint		qf;
+
+	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+	/*
+	 * Switching on quota accounting must be done at mount time.
+	 */
+	flags &= ~(XFS_ALL_QUOTA_ACCT);
+
+	if (flags == 0) {
+		xfs_debug(mp, "%s: zero flags, m_qflags=%x",
+			__func__, mp->m_qflags);
+		return -EINVAL;
+	}
+
+	/*
+	 * Can't enforce without accounting. We check the superblock
+	 * qflags here instead of m_qflags because rootfs can have
+	 * quota acct on ondisk without m_qflags' knowing.
+	 */
+	if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+	     (flags & XFS_UQUOTA_ENFD)) ||
+	    ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+	     (flags & XFS_GQUOTA_ENFD)) ||
+	    ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+	     (flags & XFS_PQUOTA_ENFD))) {
+		xfs_debug(mp,
+			"%s: Can't enforce without acct, flags=%x sbflags=%x",
+			__func__, flags, mp->m_sb.sb_qflags);
+		return -EINVAL;
+	}
+	/*
+	 * If everything's up to-date incore, then don't waste time.
+	 */
+	if ((mp->m_qflags & flags) == flags)
+		return -EEXIST;
+
+	/*
+	 * Change sb_qflags on disk but not incore mp->qflags
+	 * if this is the root filesystem.
+	 */
+	spin_lock(&mp->m_sb_lock);
+	qf = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = qf | flags;
+	spin_unlock(&mp->m_sb_lock);
+
+	/*
+	 * There's nothing to change if it's the same.
+	 */
+	if ((qf & flags) == flags)
+		return -EEXIST;
+
+	error = xfs_sync_sb(mp, false);
+	if (error)
+		return error;
+	/*
+	 * If we aren't trying to switch on quota enforcement, we are done.
+	 */
+	if  (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
+	     (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
+	     ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
+	     (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
+	     ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
+	     (mp->m_qflags & XFS_GQUOTA_ACCT)))
+		return 0;
+
+	if (! XFS_IS_QUOTA_RUNNING(mp))
+		return -ESRCH;
+
+	/*
+	 * Switch on quota enforcement in core.
+	 */
+	mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+	mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
+	mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
+
+	return 0;
+}
+
+#define XFS_QC_MASK \
+	(QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
+
+/*
+ * Adjust quota limits, and start/stop timers accordingly.
+ */
+int
+xfs_qm_scall_setqlim(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	struct qc_dqblk		*newlim)
+{
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_disk_dquot	*ddq;
+	struct xfs_dquot	*dqp;
+	struct xfs_trans	*tp;
+	int			error;
+	xfs_qcnt_t		hard, soft;
+
+	if (newlim->d_fieldmask & ~XFS_QC_MASK)
+		return -EINVAL;
+	if ((newlim->d_fieldmask & XFS_QC_MASK) == 0)
+		return 0;
+
+	/*
+	 * We don't want to race with a quotaoff so take the quotaoff lock.
+	 * We don't hold an inode lock, so there's nothing else to stop
+	 * a quotaoff from happening.
+	 */
+	mutex_lock(&q->qi_quotaofflock);
+
+	/*
+	 * Get the dquot (locked) before we start, as we need to do a
+	 * transaction to allocate it if it doesn't exist. Once we have the
+	 * dquot, unlock it so we can start the next transaction safely. We hold
+	 * a reference to the dquot, so it's safe to do this unlock/lock without
+	 * it being reclaimed in the mean time.
+	 */
+	error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
+	if (error) {
+		ASSERT(error != -ENOENT);
+		goto out_unlock;
+	}
+	xfs_dqunlock(dqp);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_rele;
+	}
+
+	xfs_dqlock(dqp);
+	xfs_trans_dqjoin(tp, dqp);
+	ddq = &dqp->q_core;
+
+	/*
+	 * Make sure that hardlimits are >= soft limits before changing.
+	 */
+	hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
+		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
+			be64_to_cpu(ddq->d_blk_hardlimit);
+	soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
+		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
+			be64_to_cpu(ddq->d_blk_softlimit);
+	if (hard == 0 || hard >= soft) {
+		ddq->d_blk_hardlimit = cpu_to_be64(hard);
+		ddq->d_blk_softlimit = cpu_to_be64(soft);
+		xfs_dquot_set_prealloc_limits(dqp);
+		if (id == 0) {
+			q->qi_bhardlimit = hard;
+			q->qi_bsoftlimit = soft;
+		}
+	} else {
+		xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
+	}
+	hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
+		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
+			be64_to_cpu(ddq->d_rtb_hardlimit);
+	soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
+		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
+			be64_to_cpu(ddq->d_rtb_softlimit);
+	if (hard == 0 || hard >= soft) {
+		ddq->d_rtb_hardlimit = cpu_to_be64(hard);
+		ddq->d_rtb_softlimit = cpu_to_be64(soft);
+		if (id == 0) {
+			q->qi_rtbhardlimit = hard;
+			q->qi_rtbsoftlimit = soft;
+		}
+	} else {
+		xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
+	}
+
+	hard = (newlim->d_fieldmask & QC_INO_HARD) ?
+		(xfs_qcnt_t) newlim->d_ino_hardlimit :
+			be64_to_cpu(ddq->d_ino_hardlimit);
+	soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
+		(xfs_qcnt_t) newlim->d_ino_softlimit :
+			be64_to_cpu(ddq->d_ino_softlimit);
+	if (hard == 0 || hard >= soft) {
+		ddq->d_ino_hardlimit = cpu_to_be64(hard);
+		ddq->d_ino_softlimit = cpu_to_be64(soft);
+		if (id == 0) {
+			q->qi_ihardlimit = hard;
+			q->qi_isoftlimit = soft;
+		}
+	} else {
+		xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
+	}
+
+	/*
+	 * Update warnings counter(s) if requested
+	 */
+	if (newlim->d_fieldmask & QC_SPC_WARNS)
+		ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
+	if (newlim->d_fieldmask & QC_INO_WARNS)
+		ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
+	if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
+		ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
+
+	if (id == 0) {
+		/*
+		 * Timelimits for the super user set the relative time
+		 * the other users can be over quota for this file system.
+		 * If it is zero a default is used.  Ditto for the default
+		 * soft and hard limit values (already done, above), and
+		 * for warnings.
+		 */
+		if (newlim->d_fieldmask & QC_SPC_TIMER) {
+			q->qi_btimelimit = newlim->d_spc_timer;
+			ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
+		}
+		if (newlim->d_fieldmask & QC_INO_TIMER) {
+			q->qi_itimelimit = newlim->d_ino_timer;
+			ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
+		}
+		if (newlim->d_fieldmask & QC_RT_SPC_TIMER) {
+			q->qi_rtbtimelimit = newlim->d_rt_spc_timer;
+			ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
+		}
+		if (newlim->d_fieldmask & QC_SPC_WARNS)
+			q->qi_bwarnlimit = newlim->d_spc_warns;
+		if (newlim->d_fieldmask & QC_INO_WARNS)
+			q->qi_iwarnlimit = newlim->d_ino_warns;
+		if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
+			q->qi_rtbwarnlimit = newlim->d_rt_spc_warns;
+	} else {
+		/*
+		 * If the user is now over quota, start the timelimit.
+		 * The user will not be 'warned'.
+		 * Note that we keep the timers ticking, whether enforcement
+		 * is on or off. We don't really want to bother with iterating
+		 * over all ondisk dquots and turning the timers on/off.
+		 */
+		xfs_qm_adjust_dqtimers(mp, ddq);
+	}
+	dqp->dq_flags |= XFS_DQ_DIRTY;
+	xfs_trans_log_dquot(tp, dqp);
+
+	error = xfs_trans_commit(tp, 0);
+
+out_rele:
+	xfs_qm_dqrele(dqp);
+out_unlock:
+	mutex_unlock(&q->qi_quotaofflock);
+	return error;
+}
+
+STATIC int
+xfs_qm_log_quotaoff_end(
+	xfs_mount_t		*mp,
+	xfs_qoff_logitem_t	*startqoff,
+	uint			flags)
+{
+	xfs_trans_t		*tp;
+	int			error;
+	xfs_qoff_logitem_t	*qoffi;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
+
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	qoffi = xfs_trans_get_qoff_item(tp, startqoff,
+					flags & XFS_ALL_QUOTA_ACCT);
+	xfs_trans_log_quotaoff_item(tp, qoffi);
+
+	/*
+	 * We have to make sure that the transaction is secure on disk before we
+	 * return and actually stop quota accounting. So, make it synchronous.
+	 * We don't care about quotoff's performance.
+	 */
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+	return error;
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff(
+	xfs_mount_t	       *mp,
+	xfs_qoff_logitem_t     **qoffstartp,
+	uint		       flags)
+{
+	xfs_trans_t	       *tp;
+	int			error;
+	xfs_qoff_logitem_t     *qoffi;
+
+	*qoffstartp = NULL;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out;
+	}
+
+	qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
+	xfs_trans_log_quotaoff_item(tp, qoffi);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
+	spin_unlock(&mp->m_sb_lock);
+
+	xfs_log_sb(tp);
+
+	/*
+	 * We have to make sure that the transaction is secure on disk before we
+	 * return and actually stop quota accounting. So, make it synchronous.
+	 * We don't care about quotoff's performance.
+	 */
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+	if (error)
+		goto out;
+
+	*qoffstartp = qoffi;
+out:
+	return error;
+}
+
+
+int
+xfs_qm_scall_getquota(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	struct qc_dqblk		*dst)
+{
+	struct xfs_dquot	*dqp;
+	int			error;
+
+	/*
+	 * Try to get the dquot. We don't want it allocated on disk, so
+	 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
+	 * exist, we'll get ENOENT back.
+	 */
+	error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+	if (error)
+		return error;
+
+	/*
+	 * If everything's NULL, this dquot doesn't quite exist as far as
+	 * our utility programs are concerned.
+	 */
+	if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+		error = -ENOENT;
+		goto out_put;
+	}
+
+	memset(dst, 0, sizeof(*dst));
+	dst->d_spc_hardlimit =
+		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
+	dst->d_spc_softlimit =
+		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
+	dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+	dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
+	dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
+	dst->d_ino_count = dqp->q_res_icount;
+	dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
+	dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
+	dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
+	dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
+	dst->d_rt_spc_hardlimit =
+		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
+	dst->d_rt_spc_softlimit =
+		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
+	dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
+	dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+	dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+
+	/*
+	 * Internally, we don't reset all the timers when quota enforcement
+	 * gets turned off. No need to confuse the user level code,
+	 * so return zeroes in that case.
+	 */
+	if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
+	     dqp->q_core.d_flags == XFS_DQ_USER) ||
+	    (!XFS_IS_GQUOTA_ENFORCED(mp) &&
+	     dqp->q_core.d_flags == XFS_DQ_GROUP) ||
+	    (!XFS_IS_PQUOTA_ENFORCED(mp) &&
+	     dqp->q_core.d_flags == XFS_DQ_PROJ)) {
+		dst->d_spc_timer = 0;
+		dst->d_ino_timer = 0;
+		dst->d_rt_spc_timer = 0;
+	}
+
+#ifdef DEBUG
+	if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
+	     (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
+	     (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
+	    id != 0) {
+		if ((dst->d_space > dst->d_spc_softlimit) &&
+		    (dst->d_spc_softlimit > 0)) {
+			ASSERT(dst->d_spc_timer != 0);
+		}
+		if ((dst->d_ino_count > dst->d_ino_softlimit) &&
+		    (dst->d_ino_softlimit > 0)) {
+			ASSERT(dst->d_ino_timer != 0);
+		}
+	}
+#endif
+out_put:
+	xfs_qm_dqput(dqp);
+	return error;
+}
+
+
+STATIC int
+xfs_dqrele_inode(
+	struct xfs_inode	*ip,
+	int			flags,
+	void			*args)
+{
+	/* skip quota inodes */
+	if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
+	    ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
+	    ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
+		ASSERT(ip->i_udquot == NULL);
+		ASSERT(ip->i_gdquot == NULL);
+		ASSERT(ip->i_pdquot == NULL);
+		return 0;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
+		xfs_qm_dqrele(ip->i_udquot);
+		ip->i_udquot = NULL;
+	}
+	if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+		xfs_qm_dqrele(ip->i_gdquot);
+		ip->i_gdquot = NULL;
+	}
+	if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
+		xfs_qm_dqrele(ip->i_pdquot);
+		ip->i_pdquot = NULL;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+
+/*
+ * Go thru all the inodes in the file system, releasing their dquots.
+ *
+ * Note that the mount structure gets modified to indicate that quotas are off
+ * AFTER this, in the case of quotaoff.
+ */
+void
+xfs_qm_dqrele_all_inodes(
+	struct xfs_mount *mp,
+	uint		 flags)
+{
+	ASSERT(mp->m_quotainfo);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
+}
diff --git a/kernel/fs/xfs/xfs_quota.h b/kernel/fs/xfs/xfs_quota.h
new file mode 100644
index 000000000..5376dd406
--- /dev/null
+++ b/kernel/fs/xfs/xfs_quota.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QUOTA_H__
+#define __XFS_QUOTA_H__
+
+#include "xfs_quota_defs.h"
+
+/*
+ * Kernel only quota definitions and functions
+ */
+
+struct xfs_trans;
+
+/*
+ * This check is done typically without holding the inode lock;
+ * that may seem racy, but it is harmless in the context that it is used.
+ * The inode cannot go inactive as long a reference is kept, and
+ * therefore if dquot(s) were attached, they'll stay consistent.
+ * If, for example, the ownership of the inode changes while
+ * we didn't have the inode locked, the appropriate dquot(s) will be
+ * attached atomically.
+ */
+#define XFS_NOT_DQATTACHED(mp, ip) \
+	((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \
+	 (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
+	 (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
+
+#define XFS_QM_NEED_QUOTACHECK(mp) \
+	((XFS_IS_UQUOTA_ON(mp) && \
+		(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
+	 (XFS_IS_GQUOTA_ON(mp) && \
+		(mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
+	 (XFS_IS_PQUOTA_ON(mp) && \
+		(mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
+
+/*
+ * The structure kept inside the xfs_trans_t keep track of dquot changes
+ * within a transaction and apply them later.
+ */
+typedef struct xfs_dqtrx {
+	struct xfs_dquot *qt_dquot;	  /* the dquot this refers to */
+	ulong		qt_blk_res;	  /* blks reserved on a dquot */
+	ulong		qt_blk_res_used;  /* blks used from the reservation */
+	ulong		qt_ino_res;	  /* inode reserved on a dquot */
+	ulong		qt_ino_res_used;  /* inodes used from the reservation */
+	long		qt_bcount_delta;  /* dquot blk count changes */
+	long		qt_delbcnt_delta; /* delayed dquot blk count changes */
+	long		qt_icount_delta;  /* dquot inode count changes */
+	ulong		qt_rtblk_res;	  /* # blks reserved on a dquot */
+	ulong		qt_rtblk_res_used;/* # blks used from reservation */
+	long		qt_rtbcount_delta;/* dquot realtime blk changes */
+	long		qt_delrtb_delta;  /* delayed RT blk count changes */
+} xfs_dqtrx_t;
+
+#ifdef CONFIG_XFS_QUOTA
+extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
+extern void xfs_trans_free_dqinfo(struct xfs_trans *);
+extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
+		uint, long);
+extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
+extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
+		struct xfs_inode *, long, long, uint);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+		struct xfs_mount *, struct xfs_dquot *,
+		struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
+
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+		prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
+		struct xfs_dquot **);
+extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
+		struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
+extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
+extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
+		struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
+extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
+		struct xfs_dquot *, struct xfs_dquot *,
+		struct xfs_dquot *, uint);
+extern int xfs_qm_dqattach(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
+extern void xfs_qm_dqdetach(struct xfs_inode *);
+extern void xfs_qm_dqrele(struct xfs_dquot *);
+extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
+extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
+extern void xfs_qm_mount_quotas(struct xfs_mount *);
+extern void xfs_qm_unmount(struct xfs_mount *);
+extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+
+#else
+static inline int
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+		prid_t prid, uint flags, struct xfs_dquot **udqp,
+		struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
+{
+	*udqp = NULL;
+	*gdqp = NULL;
+	*pdqp = NULL;
+	return 0;
+}
+#define xfs_trans_dup_dqinfo(tp, tp2)
+#define xfs_trans_free_dqinfo(tp)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
+#define xfs_trans_apply_dquot_deltas(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp)
+static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
+		struct xfs_inode *ip, long nblks, long ninos, uint flags)
+{
+	return 0;
+}
+static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
+		struct xfs_mount *mp, struct xfs_dquot *udqp,
+		struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+		long nblks, long nions, uint flags)
+{
+	return 0;
+}
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
+#define xfs_qm_vop_rename_dqattach(it)					(0)
+#define xfs_qm_vop_chown(tp, ip, old, new)				(NULL)
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl)			(0)
+#define xfs_qm_dqattach(ip, fl)						(0)
+#define xfs_qm_dqattach_locked(ip, fl)					(0)
+#define xfs_qm_dqdetach(ip)
+#define xfs_qm_dqrele(d)
+#define xfs_qm_statvfs(ip, s)
+#define xfs_qm_newmount(mp, a, b)					(0)
+#define xfs_qm_mount_quotas(mp)
+#define xfs_qm_unmount(mp)
+#define xfs_qm_unmount_quotas(mp)
+#endif /* CONFIG_XFS_QUOTA */
+
+#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
+	xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
+	xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
+				f | XFS_QMOPT_RES_REGBLKS)
+
+extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
+
+#endif	/* __XFS_QUOTA_H__ */
diff --git a/kernel/fs/xfs/xfs_quotaops.c b/kernel/fs/xfs/xfs_quotaops.c
new file mode 100644
index 000000000..7795e0d01
--- /dev/null
+++ b/kernel/fs/xfs/xfs_quotaops.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_qm.h"
+#include <linux/quota.h>
+
+
+static void
+xfs_qm_fill_state(
+	struct qc_type_state	*tstate,
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	xfs_ino_t		ino)
+{
+	struct xfs_quotainfo *q = mp->m_quotainfo;
+	bool tempqip = false;
+
+	tstate->ino = ino;
+	if (!ip && ino == NULLFSINO)
+		return;
+	if (!ip) {
+		if (xfs_iget(mp, NULL, ino, 0, 0, &ip))
+			return;
+		tempqip = true;
+	}
+	tstate->flags |= QCI_SYSFILE;
+	tstate->blocks = ip->i_d.di_nblocks;
+	tstate->nextents = ip->i_d.di_nextents;
+	tstate->spc_timelimit = q->qi_btimelimit;
+	tstate->ino_timelimit = q->qi_itimelimit;
+	tstate->rt_spc_timelimit = q->qi_rtbtimelimit;
+	tstate->spc_warnlimit = q->qi_bwarnlimit;
+	tstate->ino_warnlimit = q->qi_iwarnlimit;
+	tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit;
+	if (tempqip)
+		IRELE(ip);
+}
+
+/*
+ * Return quota status information, such as enforcements, quota file inode
+ * numbers etc.
+ */
+static int
+xfs_fs_get_quota_state(
+	struct super_block	*sb,
+	struct qc_state		*state)
+{
+	struct xfs_mount *mp = XFS_M(sb);
+	struct xfs_quotainfo *q = mp->m_quotainfo;
+
+	memset(state, 0, sizeof(*state));
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return 0;
+	state->s_incoredqs = q->qi_dquots;
+	if (XFS_IS_UQUOTA_RUNNING(mp))
+		state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED;
+	if (XFS_IS_UQUOTA_ENFORCED(mp))
+		state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
+	if (XFS_IS_GQUOTA_RUNNING(mp))
+		state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED;
+	if (XFS_IS_GQUOTA_ENFORCED(mp))
+		state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
+	if (XFS_IS_PQUOTA_RUNNING(mp))
+		state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED;
+	if (XFS_IS_PQUOTA_ENFORCED(mp))
+		state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED;
+
+	xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip,
+			  mp->m_sb.sb_uquotino);
+	xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip,
+			  mp->m_sb.sb_gquotino);
+	xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip,
+			  mp->m_sb.sb_pquotino);
+	return 0;
+}
+
+STATIC int
+xfs_quota_type(int type)
+{
+	switch (type) {
+	case USRQUOTA:
+		return XFS_DQ_USER;
+	case GRPQUOTA:
+		return XFS_DQ_GROUP;
+	default:
+		return XFS_DQ_PROJ;
+	}
+}
+
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK)
+
+/*
+ * Adjust quota timers & warnings
+ */
+static int
+xfs_fs_set_info(
+	struct super_block	*sb,
+	int			type,
+	struct qc_info		*info)
+{
+	struct xfs_mount *mp = XFS_M(sb);
+	struct qc_dqblk newlim;
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+	if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK)
+		return -EINVAL;
+	if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0)
+		return 0;
+
+	newlim.d_fieldmask = info->i_fieldmask;
+	newlim.d_spc_timer = info->i_spc_timelimit;
+	newlim.d_ino_timer = info->i_ino_timelimit;
+	newlim.d_rt_spc_timer = info->i_rt_spc_timelimit;
+	newlim.d_ino_warns = info->i_ino_warnlimit;
+	newlim.d_spc_warns = info->i_spc_warnlimit;
+	newlim.d_rt_spc_warns = info->i_rt_spc_warnlimit;
+
+	return xfs_qm_scall_setqlim(mp, 0, xfs_quota_type(type), &newlim);
+}
+
+static unsigned int
+xfs_quota_flags(unsigned int uflags)
+{
+	unsigned int flags = 0;
+
+	if (uflags & FS_QUOTA_UDQ_ACCT)
+		flags |= XFS_UQUOTA_ACCT;
+	if (uflags & FS_QUOTA_PDQ_ACCT)
+		flags |= XFS_PQUOTA_ACCT;
+	if (uflags & FS_QUOTA_GDQ_ACCT)
+		flags |= XFS_GQUOTA_ACCT;
+	if (uflags & FS_QUOTA_UDQ_ENFD)
+		flags |= XFS_UQUOTA_ENFD;
+	if (uflags & FS_QUOTA_GDQ_ENFD)
+		flags |= XFS_GQUOTA_ENFD;
+	if (uflags & FS_QUOTA_PDQ_ENFD)
+		flags |= XFS_PQUOTA_ENFD;
+
+	return flags;
+}
+
+STATIC int
+xfs_quota_enable(
+	struct super_block	*sb,
+	unsigned int		uflags)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+
+	return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
+}
+
+STATIC int
+xfs_quota_disable(
+	struct super_block	*sb,
+	unsigned int		uflags)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -EINVAL;
+
+	return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
+}
+
+STATIC int
+xfs_fs_rm_xquota(
+	struct super_block	*sb,
+	unsigned int		uflags)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+	unsigned int		flags = 0;
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	if (XFS_IS_QUOTA_ON(mp))
+		return -EINVAL;
+
+	if (uflags & FS_USER_QUOTA)
+		flags |= XFS_DQ_USER;
+	if (uflags & FS_GROUP_QUOTA)
+		flags |= XFS_DQ_GROUP;
+	if (uflags & FS_PROJ_QUOTA)
+		flags |= XFS_DQ_PROJ;
+
+	return xfs_qm_scall_trunc_qfiles(mp, flags);
+}
+
+STATIC int
+xfs_fs_get_dqblk(
+	struct super_block	*sb,
+	struct kqid		qid,
+	struct qc_dqblk		*qdq)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+
+	return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
+				      xfs_quota_type(qid.type), qdq);
+}
+
+STATIC int
+xfs_fs_set_dqblk(
+	struct super_block	*sb,
+	struct kqid		qid,
+	struct qc_dqblk		*qdq)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+
+	return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
+				     xfs_quota_type(qid.type), qdq);
+}
+
+const struct quotactl_ops xfs_quotactl_operations = {
+	.get_state		= xfs_fs_get_quota_state,
+	.set_info		= xfs_fs_set_info,
+	.quota_enable		= xfs_quota_enable,
+	.quota_disable		= xfs_quota_disable,
+	.rm_xquota		= xfs_fs_rm_xquota,
+	.get_dqblk		= xfs_fs_get_dqblk,
+	.set_dqblk		= xfs_fs_set_dqblk,
+};
diff --git a/kernel/fs/xfs/xfs_rtalloc.c b/kernel/fs/xfs/xfs_rtalloc.c
new file mode 100644
index 000000000..f2079b691
--- /dev/null
+++ b/kernel/fs/xfs/xfs_rtalloc.c
@@ -0,0 +1,1302 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_rtalloc.h"
+
+
+/*
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+static int
+xfs_rtget_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
+{
+	return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
+}
+
+/*
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
+ */
+STATIC int				/* error */
+xfs_rtany_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		low,		/* low log2 extent size */
+	int		high,		/* high log2 extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	int		*stat)		/* out: any good extents here? */
+{
+	int		error;		/* error value */
+	int		log;		/* loop counter, log2 of ext. size */
+	xfs_suminfo_t	sum;		/* summary data */
+
+	/*
+	 * Loop over logs of extent sizes.  Order is irrelevant.
+	 */
+	for (log = low; log <= high; log++) {
+		/*
+		 * Get one summary datum.
+		 */
+		error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+		if (error) {
+			return error;
+		}
+		/*
+		 * If there are any, return success.
+		 */
+		if (sum) {
+			*stat = 1;
+			return 0;
+		}
+	}
+	/*
+	 * Found nothing, return failure.
+	 */
+	*stat = 0;
+	return 0;
+}
+
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ */
+STATIC int				/* error */
+xfs_rtcopy_summary(
+	xfs_mount_t	*omp,		/* old file system mount point */
+	xfs_mount_t	*nmp,		/* new file system mount point */
+	xfs_trans_t	*tp)		/* transaction pointer */
+{
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* summary buffer */
+	int		error;		/* error return value */
+	int		log;		/* summary level number (log length) */
+	xfs_suminfo_t	sum;		/* summary data */
+	xfs_fsblock_t	sumbno;		/* summary block number */
+
+	bp = NULL;
+	for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+		for (bbno = omp->m_sb.sb_rbmblocks - 1;
+		     (xfs_srtblock_t)bbno >= 0;
+		     bbno--) {
+			error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+				&sumbno, &sum);
+			if (error)
+				return error;
+			if (sum == 0)
+				continue;
+			error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			ASSERT(sum > 0);
+		}
+	}
+	return 0;
+}
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int				/* error */
+xfs_rtallocate_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* start block to allocate */
+	xfs_extlen_t	len,		/* length to allocate */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the allocated extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock = 0;	/* first block allocated > end */
+	xfs_rtblock_t	preblock = 0;	/* first block allocated < start */
+
+	end = start + len - 1;
+	/*
+	 * Assume we're allocating out of the middle of a free extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of free extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Decrement the summary information corresponding to the entire
+	 * (old) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If there are blocks not being allocated at the front of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being allocated at the end of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Modify the bitmap to mark this extent allocated.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 0);
+	return error;
+}
+
+/*
+ * Attempt to allocate an extent minlen<=len<=maxlen starting from
+ * bitmap block bbno.  If we don't get maxlen then use prod to trim
+ * the length, if given.  Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_block(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_rtblock_t	*nextp,		/* out: next block to try */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	xfs_rtblock_t	besti;		/* best rtblock found so far */
+	xfs_rtblock_t	bestlen;	/* best length found so far */
+	xfs_rtblock_t	end;		/* last rtblock in chunk */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current rtblock trying */
+	xfs_rtblock_t	next;		/* next rtblock to try */
+	int		stat;		/* status from internal calls */
+
+	/*
+	 * Loop over all the extents starting in this bitmap block,
+	 * looking for one that's long enough.
+	 */
+	for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
+		end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+	     i <= end;
+	     i++) {
+		/*
+		 * See if there's a free extent of maxlen starting at i.
+		 * If it's not so then next will contain the first non-free.
+		 */
+		error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+		if (error) {
+			return error;
+		}
+		if (stat) {
+			/*
+			 * i for maxlen is all free, allocate and return that.
+			 */
+			error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
+				rsb);
+			if (error) {
+				return error;
+			}
+			*len = maxlen;
+			*rtblock = i;
+			return 0;
+		}
+		/*
+		 * In the case where we have a variable-sized allocation
+		 * request, figure out how big this free piece is,
+		 * and if it's big enough for the minimum, and the best
+		 * so far, remember it.
+		 */
+		if (minlen < maxlen) {
+			xfs_rtblock_t	thislen;	/* this extent size */
+
+			thislen = next - i;
+			if (thislen >= minlen && thislen > bestlen) {
+				besti = i;
+				bestlen = thislen;
+			}
+		}
+		/*
+		 * If not done yet, find the start of the next free space.
+		 */
+		if (next < end) {
+			error = xfs_rtfind_forw(mp, tp, next, end, &i);
+			if (error) {
+				return error;
+			}
+		} else
+			break;
+	}
+	/*
+	 * Searched the whole thing & didn't find a maxlen free extent.
+	 */
+	if (minlen < maxlen && besti != -1) {
+		xfs_extlen_t	p;	/* amount to trim length by */
+
+		/*
+		 * If size should be a multiple of prod, make that so.
+		 */
+		if (prod > 1 && (p = do_mod(bestlen, prod)))
+			bestlen -= p;
+		/*
+		 * Allocate besti for bestlen & return that.
+		 */
+		error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+		*len = bestlen;
+		*rtblock = besti;
+		return 0;
+	}
+	/*
+	 * Allocation failed.  Set *nextp to the next block to try.
+	 */
+	*nextp = next;
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting at block
+ * bno.  If we don't get maxlen then use prod to trim the length, if given.
+ * Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_exact(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		error;		/* error value */
+	xfs_extlen_t	i;		/* extent length trimmed due to prod */
+	int		isfree;		/* extent is free */
+	xfs_rtblock_t	next;		/* next block to try (dummy) */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * Check if the range in question (for maxlen) is free.
+	 */
+	error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
+	if (error) {
+		return error;
+	}
+	if (isfree) {
+		/*
+		 * If it is, allocate it and return success.
+		 */
+		error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+		*len = maxlen;
+		*rtblock = bno;
+		return 0;
+	}
+	/*
+	 * If not, allocate what there is, if it's at least minlen.
+	 */
+	maxlen = next - bno;
+	if (maxlen < minlen) {
+		/*
+		 * Failed, return failure status.
+		 */
+		*rtblock = NULLRTBLOCK;
+		return 0;
+	}
+	/*
+	 * Trim off tail of extent, if prod is specified.
+	 */
+	if (prod > 1 && (i = maxlen % prod)) {
+		maxlen -= i;
+		if (maxlen < minlen) {
+			/*
+			 * Now we can't do it, return failure status.
+			 */
+			*rtblock = NULLRTBLOCK;
+			return 0;
+		}
+	}
+	/*
+	 * Allocate what we can and return it.
+	 */
+	error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+	if (error) {
+		return error;
+	}
+	*len = maxlen;
+	*rtblock = bno;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting as near
+ * to bno as possible.  If we don't get maxlen then use prod to trim
+ * the length, if given.  The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_near(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		any;		/* any useful extents from summary */
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	int		error;		/* error value */
+	int		i;		/* bitmap block offset (loop control) */
+	int		j;		/* secondary loop control */
+	int		log2len;	/* log2 of minlen */
+	xfs_rtblock_t	n;		/* next block to try */
+	xfs_rtblock_t	r;		/* result block */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * If the block number given is off the end, silently set it to
+	 * the last block.
+	 */
+	if (bno >= mp->m_sb.sb_rextents)
+		bno = mp->m_sb.sb_rextents - 1;
+	/*
+	 * Try the exact allocation first.
+	 */
+	error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
+		rbpp, rsb, prod, &r);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If the exact allocation worked, return that.
+	 */
+	if (r != NULLRTBLOCK) {
+		*rtblock = r;
+		return 0;
+	}
+	bbno = XFS_BITTOBLOCK(mp, bno);
+	i = 0;
+	ASSERT(minlen != 0);
+	log2len = xfs_highbit32(minlen);
+	/*
+	 * Loop over all bitmap blocks (bbno + i is current block).
+	 */
+	for (;;) {
+		/*
+		 * Get summary information of extents of all useful levels
+		 * starting in this bitmap block.
+		 */
+		error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
+			bbno + i, rbpp, rsb, &any);
+		if (error) {
+			return error;
+		}
+		/*
+		 * If there are any useful extents starting here, try
+		 * allocating one.
+		 */
+		if (any) {
+			/*
+			 * On the positive side of the starting location.
+			 */
+			if (i >= 0) {
+				/*
+				 * Try to allocate an extent starting in
+				 * this block.
+				 */
+				error = xfs_rtallocate_extent_block(mp, tp,
+					bbno + i, minlen, maxlen, len, &n, rbpp,
+					rsb, prod, &r);
+				if (error) {
+					return error;
+				}
+				/*
+				 * If it worked, return it.
+				 */
+				if (r != NULLRTBLOCK) {
+					*rtblock = r;
+					return 0;
+				}
+			}
+			/*
+			 * On the negative side of the starting location.
+			 */
+			else {		/* i < 0 */
+				/*
+				 * Loop backwards through the bitmap blocks from
+				 * the starting point-1 up to where we are now.
+				 * There should be an extent which ends in this
+				 * bitmap block and is long enough.
+				 */
+				for (j = -1; j > i; j--) {
+					/*
+					 * Grab the summary information for
+					 * this bitmap block.
+					 */
+					error = xfs_rtany_summary(mp, tp,
+						log2len, mp->m_rsumlevels - 1,
+						bbno + j, rbpp, rsb, &any);
+					if (error) {
+						return error;
+					}
+					/*
+					 * If there's no extent given in the
+					 * summary that means the extent we
+					 * found must carry over from an
+					 * earlier block.  If there is an
+					 * extent given, we've already tried
+					 * that allocation, don't do it again.
+					 */
+					if (any)
+						continue;
+					error = xfs_rtallocate_extent_block(mp,
+						tp, bbno + j, minlen, maxlen,
+						len, &n, rbpp, rsb, prod, &r);
+					if (error) {
+						return error;
+					}
+					/*
+					 * If it works, return the extent.
+					 */
+					if (r != NULLRTBLOCK) {
+						*rtblock = r;
+						return 0;
+					}
+				}
+				/*
+				 * There weren't intervening bitmap blocks
+				 * with a long enough extent, or the
+				 * allocation didn't work for some reason
+				 * (i.e. it's a little * too short).
+				 * Try to allocate from the summary block
+				 * that we found.
+				 */
+				error = xfs_rtallocate_extent_block(mp, tp,
+					bbno + i, minlen, maxlen, len, &n, rbpp,
+					rsb, prod, &r);
+				if (error) {
+					return error;
+				}
+				/*
+				 * If it works, return the extent.
+				 */
+				if (r != NULLRTBLOCK) {
+					*rtblock = r;
+					return 0;
+				}
+			}
+		}
+		/*
+		 * Loop control.  If we were on the positive side, and there's
+		 * still more blocks on the negative side, go there.
+		 */
+		if (i > 0 && (int)bbno - i >= 0)
+			i = -i;
+		/*
+		 * If positive, and no more negative, but there are more
+		 * positive, go there.
+		 */
+		else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1)
+			i++;
+		/*
+		 * If negative or 0 (just started), and there are positive
+		 * blocks to go, go there.  The 0 case moves to block 1.
+		 */
+		else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1)
+			i = 1 - i;
+		/*
+		 * If negative or 0 and there are more negative blocks,
+		 * go there.
+		 */
+		else if (i <= 0 && (int)bbno + i > 0)
+			i--;
+		/*
+		 * Must be done.  Return failure.
+		 */
+		else
+			break;
+	}
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, with no position
+ * specified.  If we don't get maxlen then use prod to trim
+ * the length, if given.  The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_size(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		error;		/* error value */
+	int		i;		/* bitmap block number */
+	int		l;		/* level number (loop control) */
+	xfs_rtblock_t	n;		/* next block to be tried */
+	xfs_rtblock_t	r;		/* result block number */
+	xfs_suminfo_t	sum;		/* summary information for extents */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	ASSERT(maxlen != 0);
+
+	/*
+	 * Loop over all the levels starting with maxlen.
+	 * At each level, look at all the bitmap blocks, to see if there
+	 * are extents starting there that are long enough (>= maxlen).
+	 * Note, only on the initial level can the allocation fail if
+	 * the summary says there's an extent.
+	 */
+	for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
+		/*
+		 * Loop over all the bitmap blocks.
+		 */
+		for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+			/*
+			 * Get the summary for this level/block.
+			 */
+			error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+				&sum);
+			if (error) {
+				return error;
+			}
+			/*
+			 * Nothing there, on to the next block.
+			 */
+			if (!sum)
+				continue;
+			/*
+			 * Try allocating the extent.
+			 */
+			error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
+				maxlen, len, &n, rbpp, rsb, prod, &r);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If it worked, return that.
+			 */
+			if (r != NULLRTBLOCK) {
+				*rtblock = r;
+				return 0;
+			}
+			/*
+			 * If the "next block to try" returned from the
+			 * allocator is beyond the next bitmap block,
+			 * skip to that bitmap block.
+			 */
+			if (XFS_BITTOBLOCK(mp, n) > i + 1)
+				i = XFS_BITTOBLOCK(mp, n) - 1;
+		}
+	}
+	/*
+	 * Didn't find any maxlen blocks.  Try smaller ones, unless
+	 * we're asking for a fixed size extent.
+	 */
+	if (minlen > --maxlen) {
+		*rtblock = NULLRTBLOCK;
+		return 0;
+	}
+	ASSERT(minlen != 0);
+	ASSERT(maxlen != 0);
+
+	/*
+	 * Loop over sizes, from maxlen down to minlen.
+	 * This time, when we do the allocations, allow smaller ones
+	 * to succeed.
+	 */
+	for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
+		/*
+		 * Loop over all the bitmap blocks, try an allocation
+		 * starting in that block.
+		 */
+		for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+			/*
+			 * Get the summary information for this level/block.
+			 */
+			error =	xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+						  &sum);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If nothing there, go on to next.
+			 */
+			if (!sum)
+				continue;
+			/*
+			 * Try the allocation.  Make sure the specified
+			 * minlen/maxlen are in the possible range for
+			 * this summary level.
+			 */
+			error = xfs_rtallocate_extent_block(mp, tp, i,
+					XFS_RTMAX(minlen, 1 << l),
+					XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
+					len, &n, rbpp, rsb, prod, &r);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If it worked, return that extent.
+			 */
+			if (r != NULLRTBLOCK) {
+				*rtblock = r;
+				return 0;
+			}
+			/*
+			 * If the "next block to try" returned from the
+			 * allocator is beyond the next bitmap block,
+			 * skip to that bitmap block.
+			 */
+			if (XFS_BITTOBLOCK(mp, n) > i + 1)
+				i = XFS_BITTOBLOCK(mp, n) - 1;
+		}
+	}
+	/*
+	 * Got nothing, return failure.
+	 */
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ */
+STATIC int				/* error */
+xfs_growfs_rt_alloc(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_extlen_t	oblocks,	/* old count of blocks */
+	xfs_extlen_t	nblocks,	/* new count of blocks */
+	xfs_inode_t	*ip)		/* inode (bitmap/summary) */
+{
+	xfs_fileoff_t	bno;		/* block number in file */
+	xfs_buf_t	*bp;		/* temporary buffer for zeroing */
+	int		committed;	/* transaction committed flag */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;		/* error return value */
+	xfs_fsblock_t	firstblock;	/* first block allocated in xaction */
+	xfs_bmap_free_t	flist;		/* list of freed blocks */
+	xfs_fsblock_t	fsbno;		/* filesystem block for bno */
+	xfs_bmbt_irec_t	map;		/* block map output */
+	int		nmap;		/* number of block maps */
+	int		resblks;	/* space reservation */
+
+	/*
+	 * Allocate space to the file, as necessary.
+	 */
+	while (oblocks < nblocks) {
+		int		cancelflags = 0;
+		xfs_trans_t	*tp;
+
+		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
+		resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
+		/*
+		 * Reserve space & log for one extent added to the file.
+		 */
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
+					  resblks, 0);
+		if (error)
+			goto error_cancel;
+		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+		/*
+		 * Lock the inode.
+		 */
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+		xfs_bmap_init(&flist, &firstblock);
+		/*
+		 * Allocate blocks to the bitmap file.
+		 */
+		nmap = 1;
+		cancelflags |= XFS_TRANS_ABORT;
+		error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
+					XFS_BMAPI_METADATA, &firstblock,
+					resblks, &map, &nmap, &flist);
+		if (!error && nmap < 1)
+			error = -ENOSPC;
+		if (error)
+			goto error_cancel;
+		/*
+		 * Free any blocks freed up in the transaction, then commit.
+		 */
+		error = xfs_bmap_finish(&tp, &flist, &committed);
+		if (error)
+			goto error_cancel;
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		if (error)
+			goto error;
+		/*
+		 * Now we need to clear the allocated blocks.
+		 * Do this one block per transaction, to keep it simple.
+		 */
+		cancelflags = 0;
+		for (bno = map.br_startoff, fsbno = map.br_startblock;
+		     bno < map.br_startoff + map.br_blockcount;
+		     bno++, fsbno++) {
+			tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
+			/*
+			 * Reserve log for one block zeroing.
+			 */
+			error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
+						  0, 0);
+			if (error)
+				goto error_cancel;
+			/*
+			 * Lock the bitmap inode.
+			 */
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+			/*
+			 * Get a buffer for the block.
+			 */
+			d = XFS_FSB_TO_DADDR(mp, fsbno);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+				mp->m_bsize, 0);
+			if (bp == NULL) {
+				error = -EIO;
+error_cancel:
+				xfs_trans_cancel(tp, cancelflags);
+				goto error;
+			}
+			memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
+			xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
+			/*
+			 * Commit the transaction.
+			 */
+			error = xfs_trans_commit(tp, 0);
+			if (error)
+				goto error;
+		}
+		/*
+		 * Go on to the next extent, if any.
+		 */
+		oblocks = map.br_startoff + map.br_blockcount;
+	}
+	return 0;
+
+error:
+	return error;
+}
+
+/*
+ * Visible (exported) functions.
+ */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_growfs_rt_t	*in)		/* growfs rt input struct */
+{
+	xfs_rtblock_t	bmbno;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* temporary buffer */
+	int		error;		/* error return value */
+	xfs_mount_t	*nmp;		/* new (fake) mount structure */
+	xfs_rfsblock_t	nrblocks;	/* new number of realtime blocks */
+	xfs_extlen_t	nrbmblocks;	/* new number of rt bitmap blocks */
+	xfs_rtblock_t	nrextents;	/* new number of realtime extents */
+	uint8_t		nrextslog;	/* new log2 of sb_rextents */
+	xfs_extlen_t	nrsumblocks;	/* new number of summary blocks */
+	uint		nrsumlevels;	/* new rt summary levels */
+	uint		nrsumsize;	/* new size of rt summary, bytes */
+	xfs_sb_t	*nsbp;		/* new superblock */
+	xfs_extlen_t	rbmblocks;	/* current number of rt bitmap blocks */
+	xfs_extlen_t	rsumblocks;	/* current number of rt summary blks */
+	xfs_sb_t	*sbp;		/* old superblock */
+	xfs_fsblock_t	sumbno;		/* summary block number */
+
+	sbp = &mp->m_sb;
+	/*
+	 * Initial error checking.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
+	    (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
+	    (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
+		return -EINVAL;
+	if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
+		return error;
+	/*
+	 * Read in the last block of the device, make sure it exists.
+	 */
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp,
+				XFS_FSB_TO_BB(mp, nrblocks - 1),
+				XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
+		return error;
+	xfs_buf_relse(bp);
+
+	/*
+	 * Calculate new parameters.  These are the final values to be reached.
+	 */
+	nrextents = nrblocks;
+	do_div(nrextents, in->extsize);
+	nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
+	nrextslog = xfs_highbit32(nrextents);
+	nrsumlevels = nrextslog + 1;
+	nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
+	nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+	nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+	/*
+	 * New summary size can't be more than half the size of
+	 * the log.  This prevents us from getting a log overflow,
+	 * since we'll log basically the whole summary file at once.
+	 */
+	if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
+		return -EINVAL;
+	/*
+	 * Get the old block counts for bitmap and summary inodes.
+	 * These can't change since other growfs callers are locked out.
+	 */
+	rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size);
+	rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size);
+	/*
+	 * Allocate space to the bitmap and summary files, as necessary.
+	 */
+	error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
+	if (error)
+		return error;
+	error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
+	if (error)
+		return error;
+	/*
+	 * Allocate a new (fake) mount/sb.
+	 */
+	nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
+	/*
+	 * Loop over the bitmap blocks.
+	 * We will do everything one bitmap block at a time.
+	 * Skip the current block if it is exactly full.
+	 * This also deals with the case where there were no rtextents before.
+	 */
+	for (bmbno = sbp->sb_rbmblocks -
+		     ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
+	     bmbno < nrbmblocks;
+	     bmbno++) {
+		xfs_trans_t	*tp;
+		int		cancelflags = 0;
+
+		*nmp = *mp;
+		nsbp = &nmp->m_sb;
+		/*
+		 * Calculate new sb and mount fields for this round.
+		 */
+		nsbp->sb_rextsize = in->extsize;
+		nsbp->sb_rbmblocks = bmbno + 1;
+		nsbp->sb_rblocks =
+			XFS_RTMIN(nrblocks,
+				  nsbp->sb_rbmblocks * NBBY *
+				  nsbp->sb_blocksize * nsbp->sb_rextsize);
+		nsbp->sb_rextents = nsbp->sb_rblocks;
+		do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+		ASSERT(nsbp->sb_rextents != 0);
+		nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+		nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
+		nrsumsize =
+			(uint)sizeof(xfs_suminfo_t) * nrsumlevels *
+			nsbp->sb_rbmblocks;
+		nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+		nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+		/*
+		 * Start a transaction, get the log reservation.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
+					  0, 0);
+		if (error)
+			goto error_cancel;
+		/*
+		 * Lock out other callers by grabbing the bitmap inode lock.
+		 */
+		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+		/*
+		 * Update the bitmap inode's size.
+		 */
+		mp->m_rbmip->i_d.di_size =
+			nsbp->sb_rbmblocks * nsbp->sb_blocksize;
+		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+		cancelflags |= XFS_TRANS_ABORT;
+		/*
+		 * Get the summary inode into the transaction.
+		 */
+		xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+		/*
+		 * Update the summary inode's size.
+		 */
+		mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
+		xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
+		/*
+		 * Copy summary data from old to new sizes.
+		 * Do this when the real size (not block-aligned) changes.
+		 */
+		if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
+		    mp->m_rsumlevels != nmp->m_rsumlevels) {
+			error = xfs_rtcopy_summary(mp, nmp, tp);
+			if (error)
+				goto error_cancel;
+		}
+		/*
+		 * Update superblock fields.
+		 */
+		if (nsbp->sb_rextsize != sbp->sb_rextsize)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
+				nsbp->sb_rextsize - sbp->sb_rextsize);
+		if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+				nsbp->sb_rbmblocks - sbp->sb_rbmblocks);
+		if (nsbp->sb_rblocks != sbp->sb_rblocks)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
+				nsbp->sb_rblocks - sbp->sb_rblocks);
+		if (nsbp->sb_rextents != sbp->sb_rextents)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
+				nsbp->sb_rextents - sbp->sb_rextents);
+		if (nsbp->sb_rextslog != sbp->sb_rextslog)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+				nsbp->sb_rextslog - sbp->sb_rextslog);
+		/*
+		 * Free new extent.
+		 */
+		bp = NULL;
+		error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
+			nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+		if (error) {
+error_cancel:
+			xfs_trans_cancel(tp, cancelflags);
+			break;
+		}
+		/*
+		 * Mark more blocks free in the superblock.
+		 */
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
+			nsbp->sb_rextents - sbp->sb_rextents);
+		/*
+		 * Update mp values into the real mp structure.
+		 */
+		mp->m_rsumlevels = nrsumlevels;
+		mp->m_rsumsize = nrsumsize;
+
+		error = xfs_trans_commit(tp, 0);
+		if (error)
+			break;
+	}
+
+	/*
+	 * Free the fake mp structure.
+	 */
+	kmem_free(nmp);
+
+	return error;
+}
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters.  The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int					/* error */
+xfs_rtallocate_extent(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_alloctype_t	type,		/* allocation type XFS_ALLOCTYPE... */
+	int		wasdel,		/* was a delayed allocation extent */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	xfs_mount_t	*mp = tp->t_mountp;
+	int		error;		/* error value */
+	xfs_rtblock_t	r;		/* result allocated block */
+	xfs_fsblock_t	sb;		/* summary file block number */
+	xfs_buf_t	*sumbp;		/* summary file block buffer */
+
+	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+	ASSERT(minlen > 0 && minlen <= maxlen);
+
+	/*
+	 * If prod is set then figure out what to do to minlen and maxlen.
+	 */
+	if (prod > 1) {
+		xfs_extlen_t	i;
+
+		if ((i = maxlen % prod))
+			maxlen -= i;
+		if ((i = minlen % prod))
+			minlen += prod - i;
+		if (maxlen < minlen) {
+			*rtblock = NULLRTBLOCK;
+			return 0;
+		}
+	}
+
+	sumbp = NULL;
+	/*
+	 * Allocate by size, or near another block, or exactly at some block.
+	 */
+	switch (type) {
+	case XFS_ALLOCTYPE_ANY_AG:
+		error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
+				&sumbp,	&sb, prod, &r);
+		break;
+	case XFS_ALLOCTYPE_NEAR_BNO:
+		error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
+				len, &sumbp, &sb, prod, &r);
+		break;
+	case XFS_ALLOCTYPE_THIS_BNO:
+		error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
+				len, &sumbp, &sb, prod, &r);
+		break;
+	default:
+		error = -EIO;
+		ASSERT(0);
+	}
+	if (error)
+		return error;
+
+	/*
+	 * If it worked, update the superblock.
+	 */
+	if (r != NULLRTBLOCK) {
+		long	slen = (long)*len;
+
+		ASSERT(*len >= minlen && *len <= maxlen);
+		if (wasdel)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
+		else
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
+	}
+	*rtblock = r;
+	return 0;
+}
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int				/* error */
+xfs_rtmount_init(
+	struct xfs_mount	*mp)	/* file system mount structure */
+{
+	struct xfs_buf		*bp;	/* buffer for last block of subvolume */
+	struct xfs_sb		*sbp;	/* filesystem superblock copy in mount */
+	xfs_daddr_t		d;	/* address of last block of subvolume */
+	int			error;
+
+	sbp = &mp->m_sb;
+	if (sbp->sb_rblocks == 0)
+		return 0;
+	if (mp->m_rtdev_targp == NULL) {
+		xfs_warn(mp,
+	"Filesystem has a realtime volume, use rtdev=device option");
+		return -ENODEV;
+	}
+	mp->m_rsumlevels = sbp->sb_rextslog + 1;
+	mp->m_rsumsize =
+		(uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
+		sbp->sb_rbmblocks;
+	mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+	mp->m_rbmip = mp->m_rsumip = NULL;
+	/*
+	 * Check that the realtime section is an ok size.
+	 */
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
+		xfs_warn(mp, "realtime mount -- %llu != %llu",
+			(unsigned long long) XFS_BB_TO_FSB(mp, d),
+			(unsigned long long) mp->m_sb.sb_rblocks);
+		return -EFBIG;
+	}
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp,
+					d - XFS_FSB_TO_BB(mp, 1),
+					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
+		xfs_warn(mp, "realtime device size check failed");
+		return error;
+	}
+	xfs_buf_relse(bp);
+	return 0;
+}
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int					/* error */
+xfs_rtmount_inodes(
+	xfs_mount_t	*mp)		/* file system mount structure */
+{
+	int		error;		/* error return value */
+	xfs_sb_t	*sbp;
+
+	sbp = &mp->m_sb;
+	if (sbp->sb_rbmino == NULLFSINO)
+		return 0;
+	error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
+	if (error)
+		return error;
+	ASSERT(mp->m_rbmip != NULL);
+	ASSERT(sbp->sb_rsumino != NULLFSINO);
+	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
+	if (error) {
+		IRELE(mp->m_rbmip);
+		return error;
+	}
+	ASSERT(mp->m_rsumip != NULL);
+	return 0;
+}
+
+void
+xfs_rtunmount_inodes(
+	struct xfs_mount	*mp)
+{
+	if (mp->m_rbmip)
+		IRELE(mp->m_rbmip);
+	if (mp->m_rsumip)
+		IRELE(mp->m_rsumip);
+}
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int					/* error */
+xfs_rtpick_extent(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_extlen_t	len,		/* allocation length (rtextents) */
+	xfs_rtblock_t	*pick)		/* result rt extent */
+{
+	xfs_rtblock_t	b;		/* result block */
+	int		log2;		/* log of sequence number */
+	__uint64_t	resid;		/* residual after log removed */
+	__uint64_t	seq;		/* sequence number of file creation */
+	__uint64_t	*seqp;		/* pointer to seqno in inode */
+
+	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+	seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
+	if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
+		mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*seqp = 0;
+	}
+	seq = *seqp;
+	if ((log2 = xfs_highbit64(seq)) == -1)
+		b = 0;
+	else {
+		resid = seq - (1ULL << log2);
+		b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >>
+		    (log2 + 1);
+		if (b >= mp->m_sb.sb_rextents)
+			b = do_mod(b, mp->m_sb.sb_rextents);
+		if (b + len > mp->m_sb.sb_rextents)
+			b = mp->m_sb.sb_rextents - len;
+	}
+	*seqp = seq + 1;
+	xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+	*pick = b;
+	return 0;
+}
diff --git a/kernel/fs/xfs/xfs_rtalloc.h b/kernel/fs/xfs/xfs_rtalloc.h
new file mode 100644
index 000000000..76c0a4a9b
--- /dev/null
+++ b/kernel/fs/xfs/xfs_rtalloc.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_RTALLOC_H__
+#define	__XFS_RTALLOC_H__
+
+/* kernel only definitions and functions */
+
+struct xfs_mount;
+struct xfs_trans;
+
+#ifdef CONFIG_XFS_RT
+/*
+ * Function prototypes for exported functions.
+ */
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters.  The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int					/* error */
+xfs_rtallocate_extent(
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_rtblock_t		bno,	/* starting block number to allocate */
+	xfs_extlen_t		minlen,	/* minimum length to allocate */
+	xfs_extlen_t		maxlen,	/* maximum length to allocate */
+	xfs_extlen_t		*len,	/* out: actual length allocated */
+	xfs_alloctype_t		type,	/* allocation type XFS_ALLOCTYPE... */
+	int			wasdel,	/* was a delayed allocation extent */
+	xfs_extlen_t		prod,	/* extent product factor */
+	xfs_rtblock_t		*rtblock); /* out: start block allocated */
+
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int					/* error */
+xfs_rtfree_extent(
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_rtblock_t		bno,	/* starting block number to free */
+	xfs_extlen_t		len);	/* length of extent freed */
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int					/* error */
+xfs_rtmount_init(
+	struct xfs_mount	*mp);	/* file system mount structure */
+void
+xfs_rtunmount_inodes(
+	struct xfs_mount	*mp);
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int					/* error */
+xfs_rtmount_inodes(
+	struct xfs_mount	*mp);	/* file system mount structure */
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int					/* error */
+xfs_rtpick_extent(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_extlen_t		len,	/* allocation length (rtextents) */
+	xfs_rtblock_t		*pick);	/* result rt extent */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	xfs_growfs_rt_t		*in);	/* user supplied growfs struct */
+
+/*
+ * From xfs_rtbitmap.c
+ */
+int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
+		  xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
+int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
+		      xfs_rtblock_t start, xfs_extlen_t len, int val,
+		      xfs_rtblock_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
+		    xfs_rtblock_t start, xfs_rtblock_t limit,
+		    xfs_rtblock_t *rtblock);
+int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
+		    xfs_rtblock_t start, xfs_rtblock_t limit,
+		    xfs_rtblock_t *rtblock);
+int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
+		       xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
+			     int log, xfs_rtblock_t bbno, int delta,
+			     xfs_buf_t **rbpp, xfs_fsblock_t *rsb,
+			     xfs_suminfo_t *sum);
+int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
+			 xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
+			 xfs_fsblock_t *rsb);
+int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
+		     xfs_rtblock_t start, xfs_extlen_t len,
+		     struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
+
+
+#else
+# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb)  (ENOSYS)
+# define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
+# define xfs_rtpick_extent(m,t,l,rb)                    (ENOSYS)
+# define xfs_growfs_rt(mp,in)                           (ENOSYS)
+static inline int		/* error */
+xfs_rtmount_init(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	if (mp->m_sb.sb_rblocks == 0)
+		return 0;
+
+	xfs_warn(mp, "Not built with CONFIG_XFS_RT");
+	return -ENOSYS;
+}
+# define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtunmount_inodes(m)
+#endif	/* CONFIG_XFS_RT */
+
+#endif	/* __XFS_RTALLOC_H__ */
diff --git a/kernel/fs/xfs/xfs_stats.c b/kernel/fs/xfs/xfs_stats.c
new file mode 100644
index 000000000..f2240383d
--- /dev/null
+++ b/kernel/fs/xfs/xfs_stats.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include <linux/proc_fs.h>
+
+DEFINE_PER_CPU(struct xfsstats, xfsstats);
+
+static int counter_val(int idx)
+{
+	int val = 0, cpu;
+
+	for_each_possible_cpu(cpu)
+		val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+	return val;
+}
+
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
+{
+	int		i, j;
+	__uint64_t	xs_xstrat_bytes = 0;
+	__uint64_t	xs_write_bytes = 0;
+	__uint64_t	xs_read_bytes = 0;
+
+	static const struct xstats_entry {
+		char	*desc;
+		int	endpoint;
+	} xstats[] = {
+		{ "extent_alloc",	XFSSTAT_END_EXTENT_ALLOC	},
+		{ "abt",		XFSSTAT_END_ALLOC_BTREE		},
+		{ "blk_map",		XFSSTAT_END_BLOCK_MAPPING	},
+		{ "bmbt",		XFSSTAT_END_BLOCK_MAP_BTREE	},
+		{ "dir",		XFSSTAT_END_DIRECTORY_OPS	},
+		{ "trans",		XFSSTAT_END_TRANSACTIONS	},
+		{ "ig",			XFSSTAT_END_INODE_OPS		},
+		{ "log",		XFSSTAT_END_LOG_OPS		},
+		{ "push_ail",		XFSSTAT_END_TAIL_PUSHING	},
+		{ "xstrat",		XFSSTAT_END_WRITE_CONVERT	},
+		{ "rw",			XFSSTAT_END_READ_WRITE_OPS	},
+		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
+		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
+		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
+		{ "buf",		XFSSTAT_END_BUF			},
+		{ "abtb2",		XFSSTAT_END_ABTB_V2		},
+		{ "abtc2",		XFSSTAT_END_ABTC_V2		},
+		{ "bmbt2",		XFSSTAT_END_BMBT_V2		},
+		{ "ibt2",		XFSSTAT_END_IBT_V2		},
+		{ "fibt2",		XFSSTAT_END_FIBT_V2		},
+		/* we print both series of quota information together */
+		{ "qm",			XFSSTAT_END_QM			},
+	};
+
+	/* Loop over all stats groups */
+	for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
+		seq_printf(m, "%s", xstats[i].desc);
+		/* inner loop does each group */
+		for (; j < xstats[i].endpoint; j++)
+			seq_printf(m, " %u", counter_val(j));
+		seq_putc(m, '\n');
+	}
+	/* extra precision counters */
+	for_each_possible_cpu(i) {
+		xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
+		xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
+		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+	}
+
+	seq_printf(m, "xpc %Lu %Lu %Lu\n",
+			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
+	seq_printf(m, "debug %u\n",
+#if defined(DEBUG)
+		1);
+#else
+		0);
+#endif
+	return 0;
+}
+
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xfs_stat_proc_show, NULL);
+}
+
+static const struct file_operations xfs_stat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xfs_stat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* legacy quota interfaces */
+#ifdef CONFIG_XFS_QUOTA
+static int xqm_proc_show(struct seq_file *m, void *v)
+{
+	/* maximum; incore; ratio free to inuse; freelist */
+	seq_printf(m, "%d\t%d\t%d\t%u\n",
+			0,
+			counter_val(XFSSTAT_END_XQMSTAT),
+			0,
+			counter_val(XFSSTAT_END_XQMSTAT + 1));
+	return 0;
+}
+
+static int xqm_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xqm_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* legacy quota stats interface no 2 */
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
+	int j;
+
+	seq_printf(m, "qm");
+	for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+		seq_printf(m, " %u", counter_val(j));
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xqmstat_proc_show, NULL);
+}
+
+static const struct file_operations xqmstat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xqmstat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif /* CONFIG_XFS_QUOTA */
+
+int
+xfs_init_procfs(void)
+{
+	if (!proc_mkdir("fs/xfs", NULL))
+		goto out;
+
+	if (!proc_create("fs/xfs/stat", 0, NULL,
+			 &xfs_stat_proc_fops))
+		goto out_remove_xfs_dir;
+#ifdef CONFIG_XFS_QUOTA
+	if (!proc_create("fs/xfs/xqmstat", 0, NULL,
+			 &xqmstat_proc_fops))
+		goto out_remove_stat_file;
+	if (!proc_create("fs/xfs/xqm", 0, NULL,
+			 &xqm_proc_fops))
+		goto out_remove_xqmstat_file;
+#endif
+	return 0;
+
+#ifdef CONFIG_XFS_QUOTA
+ out_remove_xqmstat_file:
+	remove_proc_entry("fs/xfs/xqmstat", NULL);
+ out_remove_stat_file:
+	remove_proc_entry("fs/xfs/stat", NULL);
+#endif
+ out_remove_xfs_dir:
+	remove_proc_entry("fs/xfs", NULL);
+ out:
+	return -ENOMEM;
+}
+
+void
+xfs_cleanup_procfs(void)
+{
+#ifdef CONFIG_XFS_QUOTA
+	remove_proc_entry("fs/xfs/xqm", NULL);
+	remove_proc_entry("fs/xfs/xqmstat", NULL);
+#endif
+	remove_proc_entry("fs/xfs/stat", NULL);
+	remove_proc_entry("fs/xfs", NULL);
+}
diff --git a/kernel/fs/xfs/xfs_stats.h b/kernel/fs/xfs/xfs_stats.h
new file mode 100644
index 000000000..c8f238b82
--- /dev/null
+++ b/kernel/fs/xfs/xfs_stats.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_STATS_H__
+#define __XFS_STATS_H__
+
+
+#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
+
+#include <linux/percpu.h>
+
+/*
+ * XFS global statistics
+ */
+struct xfsstats {
+# define XFSSTAT_END_EXTENT_ALLOC	4
+	__uint32_t		xs_allocx;
+	__uint32_t		xs_allocb;
+	__uint32_t		xs_freex;
+	__uint32_t		xs_freeb;
+# define XFSSTAT_END_ALLOC_BTREE	(XFSSTAT_END_EXTENT_ALLOC+4)
+	__uint32_t		xs_abt_lookup;
+	__uint32_t		xs_abt_compare;
+	__uint32_t		xs_abt_insrec;
+	__uint32_t		xs_abt_delrec;
+# define XFSSTAT_END_BLOCK_MAPPING	(XFSSTAT_END_ALLOC_BTREE+7)
+	__uint32_t		xs_blk_mapr;
+	__uint32_t		xs_blk_mapw;
+	__uint32_t		xs_blk_unmap;
+	__uint32_t		xs_add_exlist;
+	__uint32_t		xs_del_exlist;
+	__uint32_t		xs_look_exlist;
+	__uint32_t		xs_cmp_exlist;
+# define XFSSTAT_END_BLOCK_MAP_BTREE	(XFSSTAT_END_BLOCK_MAPPING+4)
+	__uint32_t		xs_bmbt_lookup;
+	__uint32_t		xs_bmbt_compare;
+	__uint32_t		xs_bmbt_insrec;
+	__uint32_t		xs_bmbt_delrec;
+# define XFSSTAT_END_DIRECTORY_OPS	(XFSSTAT_END_BLOCK_MAP_BTREE+4)
+	__uint32_t		xs_dir_lookup;
+	__uint32_t		xs_dir_create;
+	__uint32_t		xs_dir_remove;
+	__uint32_t		xs_dir_getdents;
+# define XFSSTAT_END_TRANSACTIONS	(XFSSTAT_END_DIRECTORY_OPS+3)
+	__uint32_t		xs_trans_sync;
+	__uint32_t		xs_trans_async;
+	__uint32_t		xs_trans_empty;
+# define XFSSTAT_END_INODE_OPS		(XFSSTAT_END_TRANSACTIONS+7)
+	__uint32_t		xs_ig_attempts;
+	__uint32_t		xs_ig_found;
+	__uint32_t		xs_ig_frecycle;
+	__uint32_t		xs_ig_missed;
+	__uint32_t		xs_ig_dup;
+	__uint32_t		xs_ig_reclaims;
+	__uint32_t		xs_ig_attrchg;
+# define XFSSTAT_END_LOG_OPS		(XFSSTAT_END_INODE_OPS+5)
+	__uint32_t		xs_log_writes;
+	__uint32_t		xs_log_blocks;
+	__uint32_t		xs_log_noiclogs;
+	__uint32_t		xs_log_force;
+	__uint32_t		xs_log_force_sleep;
+# define XFSSTAT_END_TAIL_PUSHING	(XFSSTAT_END_LOG_OPS+10)
+	__uint32_t		xs_try_logspace;
+	__uint32_t		xs_sleep_logspace;
+	__uint32_t		xs_push_ail;
+	__uint32_t		xs_push_ail_success;
+	__uint32_t		xs_push_ail_pushbuf;
+	__uint32_t		xs_push_ail_pinned;
+	__uint32_t		xs_push_ail_locked;
+	__uint32_t		xs_push_ail_flushing;
+	__uint32_t		xs_push_ail_restarts;
+	__uint32_t		xs_push_ail_flush;
+# define XFSSTAT_END_WRITE_CONVERT	(XFSSTAT_END_TAIL_PUSHING+2)
+	__uint32_t		xs_xstrat_quick;
+	__uint32_t		xs_xstrat_split;
+# define XFSSTAT_END_READ_WRITE_OPS	(XFSSTAT_END_WRITE_CONVERT+2)
+	__uint32_t		xs_write_calls;
+	__uint32_t		xs_read_calls;
+# define XFSSTAT_END_ATTRIBUTE_OPS	(XFSSTAT_END_READ_WRITE_OPS+4)
+	__uint32_t		xs_attr_get;
+	__uint32_t		xs_attr_set;
+	__uint32_t		xs_attr_remove;
+	__uint32_t		xs_attr_list;
+# define XFSSTAT_END_INODE_CLUSTER	(XFSSTAT_END_ATTRIBUTE_OPS+3)
+	__uint32_t		xs_iflush_count;
+	__uint32_t		xs_icluster_flushcnt;
+	__uint32_t		xs_icluster_flushinode;
+# define XFSSTAT_END_VNODE_OPS		(XFSSTAT_END_INODE_CLUSTER+8)
+	__uint32_t		vn_active;	/* # vnodes not on free lists */
+	__uint32_t		vn_alloc;	/* # times vn_alloc called */
+	__uint32_t		vn_get;		/* # times vn_get called */
+	__uint32_t		vn_hold;	/* # times vn_hold called */
+	__uint32_t		vn_rele;	/* # times vn_rele called */
+	__uint32_t		vn_reclaim;	/* # times vn_reclaim called */
+	__uint32_t		vn_remove;	/* # times vn_remove called */
+	__uint32_t		vn_free;	/* # times vn_free called */
+#define XFSSTAT_END_BUF			(XFSSTAT_END_VNODE_OPS+9)
+	__uint32_t		xb_get;
+	__uint32_t		xb_create;
+	__uint32_t		xb_get_locked;
+	__uint32_t		xb_get_locked_waited;
+	__uint32_t		xb_busy_locked;
+	__uint32_t		xb_miss_locked;
+	__uint32_t		xb_page_retries;
+	__uint32_t		xb_page_found;
+	__uint32_t		xb_get_read;
+/* Version 2 btree counters */
+#define XFSSTAT_END_ABTB_V2		(XFSSTAT_END_BUF+15)
+	__uint32_t		xs_abtb_2_lookup;
+	__uint32_t		xs_abtb_2_compare;
+	__uint32_t		xs_abtb_2_insrec;
+	__uint32_t		xs_abtb_2_delrec;
+	__uint32_t		xs_abtb_2_newroot;
+	__uint32_t		xs_abtb_2_killroot;
+	__uint32_t		xs_abtb_2_increment;
+	__uint32_t		xs_abtb_2_decrement;
+	__uint32_t		xs_abtb_2_lshift;
+	__uint32_t		xs_abtb_2_rshift;
+	__uint32_t		xs_abtb_2_split;
+	__uint32_t		xs_abtb_2_join;
+	__uint32_t		xs_abtb_2_alloc;
+	__uint32_t		xs_abtb_2_free;
+	__uint32_t		xs_abtb_2_moves;
+#define XFSSTAT_END_ABTC_V2		(XFSSTAT_END_ABTB_V2+15)
+	__uint32_t		xs_abtc_2_lookup;
+	__uint32_t		xs_abtc_2_compare;
+	__uint32_t		xs_abtc_2_insrec;
+	__uint32_t		xs_abtc_2_delrec;
+	__uint32_t		xs_abtc_2_newroot;
+	__uint32_t		xs_abtc_2_killroot;
+	__uint32_t		xs_abtc_2_increment;
+	__uint32_t		xs_abtc_2_decrement;
+	__uint32_t		xs_abtc_2_lshift;
+	__uint32_t		xs_abtc_2_rshift;
+	__uint32_t		xs_abtc_2_split;
+	__uint32_t		xs_abtc_2_join;
+	__uint32_t		xs_abtc_2_alloc;
+	__uint32_t		xs_abtc_2_free;
+	__uint32_t		xs_abtc_2_moves;
+#define XFSSTAT_END_BMBT_V2		(XFSSTAT_END_ABTC_V2+15)
+	__uint32_t		xs_bmbt_2_lookup;
+	__uint32_t		xs_bmbt_2_compare;
+	__uint32_t		xs_bmbt_2_insrec;
+	__uint32_t		xs_bmbt_2_delrec;
+	__uint32_t		xs_bmbt_2_newroot;
+	__uint32_t		xs_bmbt_2_killroot;
+	__uint32_t		xs_bmbt_2_increment;
+	__uint32_t		xs_bmbt_2_decrement;
+	__uint32_t		xs_bmbt_2_lshift;
+	__uint32_t		xs_bmbt_2_rshift;
+	__uint32_t		xs_bmbt_2_split;
+	__uint32_t		xs_bmbt_2_join;
+	__uint32_t		xs_bmbt_2_alloc;
+	__uint32_t		xs_bmbt_2_free;
+	__uint32_t		xs_bmbt_2_moves;
+#define XFSSTAT_END_IBT_V2		(XFSSTAT_END_BMBT_V2+15)
+	__uint32_t		xs_ibt_2_lookup;
+	__uint32_t		xs_ibt_2_compare;
+	__uint32_t		xs_ibt_2_insrec;
+	__uint32_t		xs_ibt_2_delrec;
+	__uint32_t		xs_ibt_2_newroot;
+	__uint32_t		xs_ibt_2_killroot;
+	__uint32_t		xs_ibt_2_increment;
+	__uint32_t		xs_ibt_2_decrement;
+	__uint32_t		xs_ibt_2_lshift;
+	__uint32_t		xs_ibt_2_rshift;
+	__uint32_t		xs_ibt_2_split;
+	__uint32_t		xs_ibt_2_join;
+	__uint32_t		xs_ibt_2_alloc;
+	__uint32_t		xs_ibt_2_free;
+	__uint32_t		xs_ibt_2_moves;
+#define XFSSTAT_END_FIBT_V2		(XFSSTAT_END_IBT_V2+15)
+	__uint32_t		xs_fibt_2_lookup;
+	__uint32_t		xs_fibt_2_compare;
+	__uint32_t		xs_fibt_2_insrec;
+	__uint32_t		xs_fibt_2_delrec;
+	__uint32_t		xs_fibt_2_newroot;
+	__uint32_t		xs_fibt_2_killroot;
+	__uint32_t		xs_fibt_2_increment;
+	__uint32_t		xs_fibt_2_decrement;
+	__uint32_t		xs_fibt_2_lshift;
+	__uint32_t		xs_fibt_2_rshift;
+	__uint32_t		xs_fibt_2_split;
+	__uint32_t		xs_fibt_2_join;
+	__uint32_t		xs_fibt_2_alloc;
+	__uint32_t		xs_fibt_2_free;
+	__uint32_t		xs_fibt_2_moves;
+#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_FIBT_V2+6)
+	__uint32_t		xs_qm_dqreclaims;
+	__uint32_t		xs_qm_dqreclaim_misses;
+	__uint32_t		xs_qm_dquot_dups;
+	__uint32_t		xs_qm_dqcachemisses;
+	__uint32_t		xs_qm_dqcachehits;
+	__uint32_t		xs_qm_dqwants;
+#define XFSSTAT_END_QM			(XFSSTAT_END_XQMSTAT+2)
+	__uint32_t		xs_qm_dquot;
+	__uint32_t		xs_qm_dquot_unused;
+/* Extra precision counters */
+	__uint64_t		xs_xstrat_bytes;
+	__uint64_t		xs_write_bytes;
+	__uint64_t		xs_read_bytes;
+};
+
+DECLARE_PER_CPU(struct xfsstats, xfsstats);
+
+/*
+ * We don't disable preempt, not too worried about poking the
+ * wrong CPU's stat for now (also aggregated before reporting).
+ */
+#define XFS_STATS_INC(v)	(per_cpu(xfsstats, current_cpu()).v++)
+#define XFS_STATS_DEC(v)	(per_cpu(xfsstats, current_cpu()).v--)
+#define XFS_STATS_ADD(v, inc)	(per_cpu(xfsstats, current_cpu()).v += (inc))
+
+extern int xfs_init_procfs(void);
+extern void xfs_cleanup_procfs(void);
+
+
+#else	/* !CONFIG_PROC_FS */
+
+# define XFS_STATS_INC(count)
+# define XFS_STATS_DEC(count)
+# define XFS_STATS_ADD(count, inc)
+
+static inline int xfs_init_procfs(void)
+{
+	return 0;
+}
+
+static inline void xfs_cleanup_procfs(void)
+{
+}
+
+#endif	/* !CONFIG_PROC_FS */
+
+#endif /* __XFS_STATS_H__ */
diff --git a/kernel/fs/xfs/xfs_super.c b/kernel/fs/xfs/xfs_super.c
new file mode 100644
index 000000000..858e1e62b
--- /dev/null
+++ b/kernel/fs/xfs/xfs_super.c
@@ -0,0 +1,1889 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_fsops.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_extfree_item.h"
+#include "xfs_mru_cache.h"
+#include "xfs_inode_item.h"
+#include "xfs_icache.h"
+#include "xfs_trace.h"
+#include "xfs_icreate_item.h"
+#include "xfs_filestream.h"
+#include "xfs_quota.h"
+#include "xfs_sysfs.h"
+
+#include <linux/namei.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/mempool.h>
+#include <linux/writeback.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/parser.h>
+
+static const struct super_operations xfs_super_operations;
+static kmem_zone_t *xfs_ioend_zone;
+mempool_t *xfs_ioend_pool;
+
+static struct kset *xfs_kset;		/* top-level xfs sysfs dir */
+#ifdef DEBUG
+static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
+#endif
+
+#define MNTOPT_LOGBUFS	"logbufs"	/* number of XFS log buffers */
+#define MNTOPT_LOGBSIZE	"logbsize"	/* size of XFS log buffers */
+#define MNTOPT_LOGDEV	"logdev"	/* log device */
+#define MNTOPT_RTDEV	"rtdev"		/* realtime I/O device */
+#define MNTOPT_BIOSIZE	"biosize"	/* log2 of preferred buffered io size */
+#define MNTOPT_WSYNC	"wsync"		/* safe-mode nfs compatible mount */
+#define MNTOPT_NOALIGN	"noalign"	/* turn off stripe alignment */
+#define MNTOPT_SWALLOC	"swalloc"	/* turn on stripe width allocation */
+#define MNTOPT_SUNIT	"sunit"		/* data volume stripe unit */
+#define MNTOPT_SWIDTH	"swidth"	/* data volume stripe width */
+#define MNTOPT_NOUUID	"nouuid"	/* ignore filesystem UUID */
+#define MNTOPT_MTPT	"mtpt"		/* filesystem mount point */
+#define MNTOPT_GRPID	"grpid"		/* group-ID from parent directory */
+#define MNTOPT_NOGRPID	"nogrpid"	/* group-ID from current process */
+#define MNTOPT_BSDGROUPS    "bsdgroups"    /* group-ID from parent directory */
+#define MNTOPT_SYSVGROUPS   "sysvgroups"   /* group-ID from current process */
+#define MNTOPT_ALLOCSIZE    "allocsize"    /* preferred allocation size */
+#define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
+#define MNTOPT_BARRIER	"barrier"	/* use writer barriers for log write and
+					 * unwritten extent conversion */
+#define MNTOPT_NOBARRIER "nobarrier"	/* .. disable */
+#define MNTOPT_64BITINODE   "inode64"	/* inodes can be allocated anywhere */
+#define MNTOPT_32BITINODE   "inode32"	/* inode allocation limited to
+					 * XFS_MAXINUMBER_32 */
+#define MNTOPT_IKEEP	"ikeep"		/* do not free empty inode clusters */
+#define MNTOPT_NOIKEEP	"noikeep"	/* free empty inode clusters */
+#define MNTOPT_LARGEIO	   "largeio"	/* report large I/O sizes in stat() */
+#define MNTOPT_NOLARGEIO   "nolargeio"	/* do not report large I/O sizes
+					 * in stat(). */
+#define MNTOPT_ATTR2	"attr2"		/* do use attr2 attribute format */
+#define MNTOPT_NOATTR2	"noattr2"	/* do not use attr2 attribute format */
+#define MNTOPT_FILESTREAM  "filestreams" /* use filestreams allocator */
+#define MNTOPT_QUOTA	"quota"		/* disk quotas (user) */
+#define MNTOPT_NOQUOTA	"noquota"	/* no quotas */
+#define MNTOPT_USRQUOTA	"usrquota"	/* user quota enabled */
+#define MNTOPT_GRPQUOTA	"grpquota"	/* group quota enabled */
+#define MNTOPT_PRJQUOTA	"prjquota"	/* project quota enabled */
+#define MNTOPT_UQUOTA	"uquota"	/* user quota (IRIX variant) */
+#define MNTOPT_GQUOTA	"gquota"	/* group quota (IRIX variant) */
+#define MNTOPT_PQUOTA	"pquota"	/* project quota (IRIX variant) */
+#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
+#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
+#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
+#define MNTOPT_QUOTANOENF  "qnoenforce"	/* same as uqnoenforce */
+#define MNTOPT_DISCARD	   "discard"	/* Discard unused blocks */
+#define MNTOPT_NODISCARD   "nodiscard"	/* Do not discard unused blocks */
+
+/*
+ * Table driven mount option parser.
+ *
+ * Currently only used for remount, but it will be used for mount
+ * in the future, too.
+ */
+enum {
+	Opt_barrier,
+	Opt_nobarrier,
+	Opt_inode64,
+	Opt_inode32,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
+	{Opt_inode64, "inode64"},
+	{Opt_inode32, "inode32"},
+	{Opt_err, NULL}
+};
+
+
+STATIC unsigned long
+suffix_kstrtoint(char *s, unsigned int base, int *res)
+{
+	int	last, shift_left_factor = 0, _res;
+	char	*value = s;
+
+	last = strlen(value) - 1;
+	if (value[last] == 'K' || value[last] == 'k') {
+		shift_left_factor = 10;
+		value[last] = '\0';
+	}
+	if (value[last] == 'M' || value[last] == 'm') {
+		shift_left_factor = 20;
+		value[last] = '\0';
+	}
+	if (value[last] == 'G' || value[last] == 'g') {
+		shift_left_factor = 30;
+		value[last] = '\0';
+	}
+
+	if (kstrtoint(s, base, &_res))
+		return -EINVAL;
+	*res = _res << shift_left_factor;
+	return 0;
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock has _not_ yet been read in.
+ *
+ * Note that this function leaks the various device name allocations on
+ * failure.  The caller takes care of them.
+ */
+STATIC int
+xfs_parseargs(
+	struct xfs_mount	*mp,
+	char			*options)
+{
+	struct super_block	*sb = mp->m_super;
+	char			*this_char, *value;
+	int			dsunit = 0;
+	int			dswidth = 0;
+	int			iosize = 0;
+	__uint8_t		iosizelog = 0;
+
+	/*
+	 * set up the mount name first so all the errors will refer to the
+	 * correct device.
+	 */
+	mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
+	if (!mp->m_fsname)
+		return -ENOMEM;
+	mp->m_fsname_len = strlen(mp->m_fsname) + 1;
+
+	/*
+	 * Copy binary VFS mount flags we are interested in.
+	 */
+	if (sb->s_flags & MS_RDONLY)
+		mp->m_flags |= XFS_MOUNT_RDONLY;
+	if (sb->s_flags & MS_DIRSYNC)
+		mp->m_flags |= XFS_MOUNT_DIRSYNC;
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		mp->m_flags |= XFS_MOUNT_WSYNC;
+
+	/*
+	 * Set some default flags that could be cleared by the mount option
+	 * parsing.
+	 */
+	mp->m_flags |= XFS_MOUNT_BARRIER;
+	mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+
+	/*
+	 * These can be overridden by the mount option parsing.
+	 */
+	mp->m_logbufs = -1;
+	mp->m_logbsize = -1;
+
+	if (!options)
+		goto done;
+
+	while ((this_char = strsep(&options, ",")) != NULL) {
+		if (!*this_char)
+			continue;
+		if ((value = strchr(this_char, '=')) != NULL)
+			*value++ = 0;
+
+		if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
+			if (!value || !*value) {
+				xfs_warn(mp, "%s option requires an argument",
+					this_char);
+				return -EINVAL;
+			}
+			if (kstrtoint(value, 10, &mp->m_logbufs))
+				return -EINVAL;
+		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
+			if (!value || !*value) {
+				xfs_warn(mp, "%s option requires an argument",
+					this_char);
+				return -EINVAL;
+			}
+			if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
+				return -EINVAL;
+		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
+			if (!value || !*value) {
+				xfs_warn(mp, "%s option requires an argument",
+					this_char);
+				return -EINVAL;
+			}
+			mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+			if (!mp->m_logname)
+				return -ENOMEM;
+		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
+			xfs_warn(mp, "%s option not allowed on this system",
+				this_char);
+			return -EINVAL;
+		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
+			if (!value || !*value) {
+				xfs_warn(mp, "%s option requires an argument",
+					this_char);
+				return -EINVAL;
+			}
+			mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+			if (!mp->m_rtname)
+				return -ENOMEM;
+		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
+			if (!value || !*value) {
+				xfs_warn(mp, "%s option requires an argument",
+					this_char);
+				return -EINVAL;
+			}
+			if (kstrtoint(value, 10, &iosize))
+				return -EINVAL;
+			iosizelog = ffs(iosize) - 1;
+		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
+			if (!value || !*value) {
+				xfs_warn(mp, "%s option requires an argument",
+					this_char);
+				return -EINVAL;
+			}
+			if (suffix_kstrtoint(value, 10, &iosize))
+				return -EINVAL;
+			iosizelog = ffs(iosize) - 1;
+		} else if (!strcmp(this_char, MNTOPT_GRPID) ||
+			   !strcmp(this_char, MNTOPT_BSDGROUPS)) {
+			mp->m_flags |= XFS_MOUNT_GRPID;
+		} else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
+			   !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
+			mp->m_flags &= ~XFS_MOUNT_GRPID;
+		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+			mp->m_flags |= XFS_MOUNT_WSYNC;
+		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+			mp->m_flags |= XFS_MOUNT_NORECOVERY;
+		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+			mp->m_flags |= XFS_MOUNT_NOALIGN;
+		} else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
+			mp->m_flags |= XFS_MOUNT_SWALLOC;
+		} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
+			if (!value || !*value) {
+				xfs_warn(mp, "%s option requires an argument",
+					this_char);
+				return -EINVAL;
+			}
+			if (kstrtoint(value, 10, &dsunit))
+				return -EINVAL;
+		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
+			if (!value || !*value) {
+				xfs_warn(mp, "%s option requires an argument",
+					this_char);
+				return -EINVAL;
+			}
+			if (kstrtoint(value, 10, &dswidth))
+				return -EINVAL;
+		} else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
+			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+		} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
+			mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+		} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
+			mp->m_flags |= XFS_MOUNT_NOUUID;
+		} else if (!strcmp(this_char, MNTOPT_BARRIER)) {
+			mp->m_flags |= XFS_MOUNT_BARRIER;
+		} else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
+			mp->m_flags &= ~XFS_MOUNT_BARRIER;
+		} else if (!strcmp(this_char, MNTOPT_IKEEP)) {
+			mp->m_flags |= XFS_MOUNT_IKEEP;
+		} else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
+			mp->m_flags &= ~XFS_MOUNT_IKEEP;
+		} else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
+			mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
+		} else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
+			mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+		} else if (!strcmp(this_char, MNTOPT_ATTR2)) {
+			mp->m_flags |= XFS_MOUNT_ATTR2;
+		} else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
+			mp->m_flags &= ~XFS_MOUNT_ATTR2;
+			mp->m_flags |= XFS_MOUNT_NOATTR2;
+		} else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
+			mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+		} else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
+			mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+			mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+			mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
+		} else if (!strcmp(this_char, MNTOPT_QUOTA) ||
+			   !strcmp(this_char, MNTOPT_UQUOTA) ||
+			   !strcmp(this_char, MNTOPT_USRQUOTA)) {
+			mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+					 XFS_UQUOTA_ENFD);
+		} else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
+			   !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
+			mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+			mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+		} else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
+			   !strcmp(this_char, MNTOPT_PRJQUOTA)) {
+			mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+					 XFS_PQUOTA_ENFD);
+		} else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
+			mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+			mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+		} else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
+			   !strcmp(this_char, MNTOPT_GRPQUOTA)) {
+			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+					 XFS_GQUOTA_ENFD);
+		} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
+			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+			mp->m_qflags &= ~XFS_GQUOTA_ENFD;
+		} else if (!strcmp(this_char, MNTOPT_DISCARD)) {
+			mp->m_flags |= XFS_MOUNT_DISCARD;
+		} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
+			mp->m_flags &= ~XFS_MOUNT_DISCARD;
+		} else {
+			xfs_warn(mp, "unknown mount option [%s].", this_char);
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * no recovery flag requires a read-only mount
+	 */
+	if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
+	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		xfs_warn(mp, "no-recovery mounts must be read-only.");
+		return -EINVAL;
+	}
+
+	if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
+		xfs_warn(mp,
+	"sunit and swidth options incompatible with the noalign option");
+		return -EINVAL;
+	}
+
+#ifndef CONFIG_XFS_QUOTA
+	if (XFS_IS_QUOTA_RUNNING(mp)) {
+		xfs_warn(mp, "quota support not available in this kernel.");
+		return -EINVAL;
+	}
+#endif
+
+	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
+		xfs_warn(mp, "sunit and swidth must be specified together");
+		return -EINVAL;
+	}
+
+	if (dsunit && (dswidth % dsunit != 0)) {
+		xfs_warn(mp,
+	"stripe width (%d) must be a multiple of the stripe unit (%d)",
+			dswidth, dsunit);
+		return -EINVAL;
+	}
+
+done:
+	if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+		/*
+		 * At this point the superblock has not been read
+		 * in, therefore we do not know the block size.
+		 * Before the mount call ends we will convert
+		 * these to FSBs.
+		 */
+		mp->m_dalign = dsunit;
+		mp->m_swidth = dswidth;
+	}
+
+	if (mp->m_logbufs != -1 &&
+	    mp->m_logbufs != 0 &&
+	    (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+	     mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+		xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
+			mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+		return -EINVAL;
+	}
+	if (mp->m_logbsize != -1 &&
+	    mp->m_logbsize !=  0 &&
+	    (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+	     mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+	     !is_power_of_2(mp->m_logbsize))) {
+		xfs_warn(mp,
+			"invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+			mp->m_logbsize);
+		return -EINVAL;
+	}
+
+	if (iosizelog) {
+		if (iosizelog > XFS_MAX_IO_LOG ||
+		    iosizelog < XFS_MIN_IO_LOG) {
+			xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
+				iosizelog, XFS_MIN_IO_LOG,
+				XFS_MAX_IO_LOG);
+			return -EINVAL;
+		}
+
+		mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+		mp->m_readio_log = iosizelog;
+		mp->m_writeio_log = iosizelog;
+	}
+
+	return 0;
+}
+
+struct proc_xfs_info {
+	int	flag;
+	char	*str;
+};
+
+STATIC int
+xfs_showargs(
+	struct xfs_mount	*mp,
+	struct seq_file		*m)
+{
+	static struct proc_xfs_info xfs_info_set[] = {
+		/* the few simple ones we can get from the mount struct */
+		{ XFS_MOUNT_IKEEP,		"," MNTOPT_IKEEP },
+		{ XFS_MOUNT_WSYNC,		"," MNTOPT_WSYNC },
+		{ XFS_MOUNT_NOALIGN,		"," MNTOPT_NOALIGN },
+		{ XFS_MOUNT_SWALLOC,		"," MNTOPT_SWALLOC },
+		{ XFS_MOUNT_NOUUID,		"," MNTOPT_NOUUID },
+		{ XFS_MOUNT_NORECOVERY,		"," MNTOPT_NORECOVERY },
+		{ XFS_MOUNT_ATTR2,		"," MNTOPT_ATTR2 },
+		{ XFS_MOUNT_FILESTREAMS,	"," MNTOPT_FILESTREAM },
+		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
+		{ XFS_MOUNT_DISCARD,		"," MNTOPT_DISCARD },
+		{ XFS_MOUNT_SMALL_INUMS,	"," MNTOPT_32BITINODE },
+		{ 0, NULL }
+	};
+	static struct proc_xfs_info xfs_info_unset[] = {
+		/* the few simple ones we can get from the mount struct */
+		{ XFS_MOUNT_COMPAT_IOSIZE,	"," MNTOPT_LARGEIO },
+		{ XFS_MOUNT_BARRIER,		"," MNTOPT_NOBARRIER },
+		{ XFS_MOUNT_SMALL_INUMS,	"," MNTOPT_64BITINODE },
+		{ 0, NULL }
+	};
+	struct proc_xfs_info	*xfs_infop;
+
+	for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
+		if (mp->m_flags & xfs_infop->flag)
+			seq_puts(m, xfs_infop->str);
+	}
+	for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) {
+		if (!(mp->m_flags & xfs_infop->flag))
+			seq_puts(m, xfs_infop->str);
+	}
+
+	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+		seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
+				(int)(1 << mp->m_writeio_log) >> 10);
+
+	if (mp->m_logbufs > 0)
+		seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
+	if (mp->m_logbsize > 0)
+		seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
+
+	if (mp->m_logname)
+		seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+	if (mp->m_rtname)
+		seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+
+	if (mp->m_dalign > 0)
+		seq_printf(m, "," MNTOPT_SUNIT "=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_dalign));
+	if (mp->m_swidth > 0)
+		seq_printf(m, "," MNTOPT_SWIDTH "=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
+
+	if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
+		seq_puts(m, "," MNTOPT_USRQUOTA);
+	else if (mp->m_qflags & XFS_UQUOTA_ACCT)
+		seq_puts(m, "," MNTOPT_UQUOTANOENF);
+
+	if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_PQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_PRJQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_PQUOTANOENF);
+	}
+	if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_GQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_GRPQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_GQUOTANOENF);
+	}
+
+	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
+		seq_puts(m, "," MNTOPT_NOQUOTA);
+
+	return 0;
+}
+__uint64_t
+xfs_max_file_offset(
+	unsigned int		blockshift)
+{
+	unsigned int		pagefactor = 1;
+	unsigned int		bitshift = BITS_PER_LONG - 1;
+
+	/* Figure out maximum filesize, on Linux this can depend on
+	 * the filesystem blocksize (on 32 bit platforms).
+	 * __block_write_begin does this in an [unsigned] long...
+	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
+	 * So, for page sized blocks (4K on 32 bit platforms),
+	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
+	 *      (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+	 * but for smaller blocksizes it is less (bbits = log2 bsize).
+	 * Note1: get_block_t takes a long (implicit cast from above)
+	 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
+	 * can optionally convert the [unsigned] long from above into
+	 * an [unsigned] long long.
+	 */
+
+#if BITS_PER_LONG == 32
+# if defined(CONFIG_LBDAF)
+	ASSERT(sizeof(sector_t) == 8);
+	pagefactor = PAGE_CACHE_SIZE;
+	bitshift = BITS_PER_LONG;
+# else
+	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+# endif
+#endif
+
+	return (((__uint64_t)pagefactor) << bitshift) - 1;
+}
+
+/*
+ * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
+ * because in the growfs case, mp->m_sb.sb_agcount is not updated
+ * yet to the potentially higher ag count.
+ */
+xfs_agnumber_t
+xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
+{
+	xfs_agnumber_t	index = 0;
+	xfs_agnumber_t	maxagi = 0;
+	xfs_sb_t	*sbp = &mp->m_sb;
+	xfs_agnumber_t	max_metadata;
+	xfs_agino_t	agino;
+	xfs_ino_t	ino;
+	xfs_perag_t	*pag;
+
+	/* Calculate how much should be reserved for inodes to meet
+	 * the max inode percentage.
+	 */
+	if (mp->m_maxicount) {
+		__uint64_t	icount;
+
+		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+		do_div(icount, 100);
+		icount += sbp->sb_agblocks - 1;
+		do_div(icount, sbp->sb_agblocks);
+		max_metadata = icount;
+	} else {
+		max_metadata = agcount;
+	}
+
+	agino =	XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+
+	for (index = 0; index < agcount; index++) {
+		ino = XFS_AGINO_TO_INO(mp, index, agino);
+
+		if (ino > XFS_MAXINUMBER_32) {
+			pag = xfs_perag_get(mp, index);
+			pag->pagi_inodeok = 0;
+			pag->pagf_metadata = 0;
+			xfs_perag_put(pag);
+			continue;
+		}
+
+		pag = xfs_perag_get(mp, index);
+		pag->pagi_inodeok = 1;
+		maxagi++;
+		if (index < max_metadata)
+			pag->pagf_metadata = 1;
+		xfs_perag_put(pag);
+	}
+	mp->m_flags |= (XFS_MOUNT_32BITINODES |
+			XFS_MOUNT_SMALL_INUMS);
+
+	return maxagi;
+}
+
+xfs_agnumber_t
+xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
+{
+	xfs_agnumber_t index = 0;
+
+	for (index = 0; index < agcount; index++) {
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, index);
+		pag->pagi_inodeok = 1;
+		pag->pagf_metadata = 0;
+		xfs_perag_put(pag);
+	}
+
+	/* There is no need for lock protection on m_flags,
+	 * the rw_semaphore of the VFS superblock is locked
+	 * during mount/umount/remount operations, so this is
+	 * enough to avoid concurency on the m_flags field
+	 */
+	mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
+			 XFS_MOUNT_SMALL_INUMS);
+	return index;
+}
+
+STATIC int
+xfs_blkdev_get(
+	xfs_mount_t		*mp,
+	const char		*name,
+	struct block_device	**bdevp)
+{
+	int			error = 0;
+
+	*bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				    mp);
+	if (IS_ERR(*bdevp)) {
+		error = PTR_ERR(*bdevp);
+		xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
+	}
+
+	return error;
+}
+
+STATIC void
+xfs_blkdev_put(
+	struct block_device	*bdev)
+{
+	if (bdev)
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+void
+xfs_blkdev_issue_flush(
+	xfs_buftarg_t		*buftarg)
+{
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL);
+}
+
+STATIC void
+xfs_close_devices(
+	struct xfs_mount	*mp)
+{
+	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+		struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+		xfs_free_buftarg(mp, mp->m_logdev_targp);
+		xfs_blkdev_put(logdev);
+	}
+	if (mp->m_rtdev_targp) {
+		struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+		xfs_free_buftarg(mp, mp->m_rtdev_targp);
+		xfs_blkdev_put(rtdev);
+	}
+	xfs_free_buftarg(mp, mp->m_ddev_targp);
+}
+
+/*
+ * The file system configurations are:
+ *	(1) device (partition) with data and internal log
+ *	(2) logical volume with data and log subvolumes.
+ *	(3) logical volume with data, log, and realtime subvolumes.
+ *
+ * We only have to handle opening the log and realtime volumes here if
+ * they are present.  The data subvolume has already been opened by
+ * get_sb_bdev() and is stored in sb->s_bdev.
+ */
+STATIC int
+xfs_open_devices(
+	struct xfs_mount	*mp)
+{
+	struct block_device	*ddev = mp->m_super->s_bdev;
+	struct block_device	*logdev = NULL, *rtdev = NULL;
+	int			error;
+
+	/*
+	 * Open real time and log devices - order is important.
+	 */
+	if (mp->m_logname) {
+		error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+		if (error)
+			goto out;
+	}
+
+	if (mp->m_rtname) {
+		error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+		if (error)
+			goto out_close_logdev;
+
+		if (rtdev == ddev || rtdev == logdev) {
+			xfs_warn(mp,
+	"Cannot mount filesystem with identical rtdev and ddev/logdev.");
+			error = -EINVAL;
+			goto out_close_rtdev;
+		}
+	}
+
+	/*
+	 * Setup xfs_mount buffer target pointers
+	 */
+	error = -ENOMEM;
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+	if (!mp->m_ddev_targp)
+		goto out_close_rtdev;
+
+	if (rtdev) {
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+		if (!mp->m_rtdev_targp)
+			goto out_free_ddev_targ;
+	}
+
+	if (logdev && logdev != ddev) {
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+		if (!mp->m_logdev_targp)
+			goto out_free_rtdev_targ;
+	} else {
+		mp->m_logdev_targp = mp->m_ddev_targp;
+	}
+
+	return 0;
+
+ out_free_rtdev_targ:
+	if (mp->m_rtdev_targp)
+		xfs_free_buftarg(mp, mp->m_rtdev_targp);
+ out_free_ddev_targ:
+	xfs_free_buftarg(mp, mp->m_ddev_targp);
+ out_close_rtdev:
+	xfs_blkdev_put(rtdev);
+ out_close_logdev:
+	if (logdev && logdev != ddev)
+		xfs_blkdev_put(logdev);
+ out:
+	return error;
+}
+
+/*
+ * Setup xfs_mount buffer target pointers based on superblock
+ */
+STATIC int
+xfs_setup_devices(
+	struct xfs_mount	*mp)
+{
+	int			error;
+
+	error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
+	if (error)
+		return error;
+
+	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+		unsigned int	log_sector_size = BBSIZE;
+
+		if (xfs_sb_version_hassector(&mp->m_sb))
+			log_sector_size = mp->m_sb.sb_logsectsize;
+		error = xfs_setsize_buftarg(mp->m_logdev_targp,
+					    log_sector_size);
+		if (error)
+			return error;
+	}
+	if (mp->m_rtdev_targp) {
+		error = xfs_setsize_buftarg(mp->m_rtdev_targp,
+					    mp->m_sb.sb_sectsize);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+STATIC int
+xfs_init_mount_workqueues(
+	struct xfs_mount	*mp)
+{
+	mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname);
+	if (!mp->m_buf_workqueue)
+		goto out;
+
+	mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+	if (!mp->m_data_workqueue)
+		goto out_destroy_buf;
+
+	mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+	if (!mp->m_unwritten_workqueue)
+		goto out_destroy_data_iodone_queue;
+
+	mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+	if (!mp->m_cil_workqueue)
+		goto out_destroy_unwritten;
+
+	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+			WQ_FREEZABLE, 0, mp->m_fsname);
+	if (!mp->m_reclaim_workqueue)
+		goto out_destroy_cil;
+
+	mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
+			WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
+	if (!mp->m_log_workqueue)
+		goto out_destroy_reclaim;
+
+	mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
+			WQ_FREEZABLE, 0, mp->m_fsname);
+	if (!mp->m_eofblocks_workqueue)
+		goto out_destroy_log;
+
+	return 0;
+
+out_destroy_log:
+	destroy_workqueue(mp->m_log_workqueue);
+out_destroy_reclaim:
+	destroy_workqueue(mp->m_reclaim_workqueue);
+out_destroy_cil:
+	destroy_workqueue(mp->m_cil_workqueue);
+out_destroy_unwritten:
+	destroy_workqueue(mp->m_unwritten_workqueue);
+out_destroy_data_iodone_queue:
+	destroy_workqueue(mp->m_data_workqueue);
+out_destroy_buf:
+	destroy_workqueue(mp->m_buf_workqueue);
+out:
+	return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_mount_workqueues(
+	struct xfs_mount	*mp)
+{
+	destroy_workqueue(mp->m_eofblocks_workqueue);
+	destroy_workqueue(mp->m_log_workqueue);
+	destroy_workqueue(mp->m_reclaim_workqueue);
+	destroy_workqueue(mp->m_cil_workqueue);
+	destroy_workqueue(mp->m_data_workqueue);
+	destroy_workqueue(mp->m_unwritten_workqueue);
+	destroy_workqueue(mp->m_buf_workqueue);
+}
+
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ */
+void
+xfs_flush_inodes(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		sync_inodes_sb(sb);
+		up_read(&sb->s_umount);
+	}
+}
+
+/* Catch misguided souls that try to use this interface on XFS */
+STATIC struct inode *
+xfs_fs_alloc_inode(
+	struct super_block	*sb)
+{
+	BUG();
+	return NULL;
+}
+
+/*
+ * Now that the generic code is guaranteed not to be accessing
+ * the linux inode, we can reclaim the inode.
+ */
+STATIC void
+xfs_fs_destroy_inode(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	trace_xfs_destroy_inode(ip);
+
+	XFS_STATS_INC(vn_reclaim);
+
+	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+	/*
+	 * We should never get here with one of the reclaim flags already set.
+	 */
+	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+	/*
+	 * We always use background reclaim here because even if the
+	 * inode is clean, it still may be under IO and hence we have
+	 * to take the flush lock. The background reclaim path handles
+	 * this more efficiently than we can here, so simply let background
+	 * reclaim tear down all inodes.
+	 */
+	xfs_inode_set_reclaim_tag(ip);
+}
+
+/*
+ * Slab object creation initialisation for the XFS inode.
+ * This covers only the idempotent fields in the XFS inode;
+ * all other fields need to be initialised on allocation
+ * from the slab. This avoids the need to repeatedly initialise
+ * fields in the xfs inode that left in the initialise state
+ * when freeing the inode.
+ */
+STATIC void
+xfs_fs_inode_init_once(
+	void			*inode)
+{
+	struct xfs_inode	*ip = inode;
+
+	memset(ip, 0, sizeof(struct xfs_inode));
+
+	/* vfs inode */
+	inode_init_once(VFS_I(ip));
+
+	/* xfs inode */
+	atomic_set(&ip->i_pincount, 0);
+	spin_lock_init(&ip->i_flags_lock);
+
+	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+		     "xfsino", ip->i_ino);
+	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+		     "xfsino", ip->i_ino);
+}
+
+STATIC void
+xfs_fs_evict_inode(
+	struct inode		*inode)
+{
+	xfs_inode_t		*ip = XFS_I(inode);
+
+	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+
+	trace_xfs_evict_inode(ip);
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	XFS_STATS_INC(vn_rele);
+	XFS_STATS_INC(vn_remove);
+
+	xfs_inactive(ip);
+}
+
+/*
+ * We do an unlocked check for XFS_IDONTCACHE here because we are already
+ * serialised against cache hits here via the inode->i_lock and igrab() in
+ * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be
+ * racing with us, and it avoids needing to grab a spinlock here for every inode
+ * we drop the final reference on.
+ */
+STATIC int
+xfs_fs_drop_inode(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
+}
+
+STATIC void
+xfs_free_fsname(
+	struct xfs_mount	*mp)
+{
+	kfree(mp->m_fsname);
+	kfree(mp->m_rtname);
+	kfree(mp->m_logname);
+}
+
+STATIC int
+xfs_fs_sync_fs(
+	struct super_block	*sb,
+	int			wait)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	/*
+	 * Doing anything during the async pass would be counterproductive.
+	 */
+	if (!wait)
+		return 0;
+
+	xfs_log_force(mp, XFS_LOG_SYNC);
+	if (laptop_mode) {
+		/*
+		 * The disk must be active because we're syncing.
+		 * We schedule log work now (now that the disk is
+		 * active) instead of later (when it might not be).
+		 */
+		flush_delayed_work(&mp->m_log->l_work);
+	}
+
+	return 0;
+}
+
+STATIC int
+xfs_fs_statfs(
+	struct dentry		*dentry,
+	struct kstatfs		*statp)
+{
+	struct xfs_mount	*mp = XFS_M(dentry->d_sb);
+	xfs_sb_t		*sbp = &mp->m_sb;
+	struct xfs_inode	*ip = XFS_I(d_inode(dentry));
+	__uint64_t		fakeinos, id;
+	__uint64_t		icount;
+	__uint64_t		ifree;
+	__uint64_t		fdblocks;
+	xfs_extlen_t		lsize;
+	__int64_t		ffree;
+
+	statp->f_type = XFS_SB_MAGIC;
+	statp->f_namelen = MAXNAMELEN - 1;
+
+	id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
+	statp->f_fsid.val[0] = (u32)id;
+	statp->f_fsid.val[1] = (u32)(id >> 32);
+
+	icount = percpu_counter_sum(&mp->m_icount);
+	ifree = percpu_counter_sum(&mp->m_ifree);
+	fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
+	spin_lock(&mp->m_sb_lock);
+	statp->f_bsize = sbp->sb_blocksize;
+	lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
+	statp->f_blocks = sbp->sb_dblocks - lsize;
+	spin_unlock(&mp->m_sb_lock);
+
+	statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+	statp->f_bavail = statp->f_bfree;
+
+	fakeinos = statp->f_bfree << sbp->sb_inopblog;
+	statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
+	if (mp->m_maxicount)
+		statp->f_files = min_t(typeof(statp->f_files),
+					statp->f_files,
+					mp->m_maxicount);
+
+	/* If sb_icount overshot maxicount, report actual allocation */
+	statp->f_files = max_t(typeof(statp->f_files),
+					statp->f_files,
+					sbp->sb_icount);
+
+	/* make sure statp->f_ffree does not underflow */
+	ffree = statp->f_files - (icount - ifree);
+	statp->f_ffree = max_t(__int64_t, ffree, 0);
+
+
+	if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+	    ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+			      (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
+		xfs_qm_statvfs(ip, statp);
+	return 0;
+}
+
+STATIC void
+xfs_save_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks = 0;
+
+	mp->m_resblks_save = mp->m_resblks;
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC void
+xfs_restore_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks;
+
+	if (mp->m_resblks_save) {
+		resblks = mp->m_resblks_save;
+		mp->m_resblks_save = 0;
+	} else
+		resblks = xfs_default_resblks(mp);
+
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+/*
+ * Trigger writeback of all the dirty metadata in the file system.
+ *
+ * This ensures that the metadata is written to their location on disk rather
+ * than just existing in transactions in the log. This means after a quiesce
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
+ *
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
+ */
+static void
+xfs_quiesce_attr(
+	struct xfs_mount	*mp)
+{
+	int	error = 0;
+
+	/* wait for all modifications to complete */
+	while (atomic_read(&mp->m_active_trans) > 0)
+		delay(100);
+
+	/* force the log to unpin objects from the now complete transactions */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/* reclaim inodes to do any IO before the freeze completes */
+	xfs_reclaim_inodes(mp, 0);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+	/* Push the superblock and write an unmount record */
+	error = xfs_log_sbcount(mp);
+	if (error)
+		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+				"Frozen image may not be consistent.");
+	/*
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
+	 */
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+	xfs_log_quiesce(mp);
+}
+
+STATIC int
+xfs_fs_remount(
+	struct super_block	*sb,
+	int			*flags,
+	char			*options)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+	xfs_sb_t		*sbp = &mp->m_sb;
+	substring_t		args[MAX_OPT_ARGS];
+	char			*p;
+	int			error;
+
+	sync_filesystem(sb);
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_barrier:
+			mp->m_flags |= XFS_MOUNT_BARRIER;
+			break;
+		case Opt_nobarrier:
+			mp->m_flags &= ~XFS_MOUNT_BARRIER;
+			break;
+		case Opt_inode64:
+			mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
+			break;
+		case Opt_inode32:
+			mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
+			break;
+		default:
+			/*
+			 * Logically we would return an error here to prevent
+			 * users from believing they might have changed
+			 * mount options using remount which can't be changed.
+			 *
+			 * But unfortunately mount(8) adds all options from
+			 * mtab and fstab to the mount arguments in some cases
+			 * so we can't blindly reject options, but have to
+			 * check for each specified option if it actually
+			 * differs from the currently set option and only
+			 * reject it if that's the case.
+			 *
+			 * Until that is implemented we return success for
+			 * every remount request, and silently ignore all
+			 * options that we can't actually change.
+			 */
+#if 0
+			xfs_info(mp,
+		"mount option \"%s\" not supported for remount", p);
+			return -EINVAL;
+#else
+			break;
+#endif
+		}
+	}
+
+	/* ro -> rw */
+	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+		if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+			xfs_warn(mp,
+		"ro->rw transition prohibited on norecovery mount");
+			return -EINVAL;
+		}
+
+		mp->m_flags &= ~XFS_MOUNT_RDONLY;
+
+		/*
+		 * If this is the first remount to writeable state we
+		 * might have some superblock changes to update.
+		 */
+		if (mp->m_update_sb) {
+			error = xfs_sync_sb(mp, false);
+			if (error) {
+				xfs_warn(mp, "failed to write sb changes");
+				return error;
+			}
+			mp->m_update_sb = false;
+		}
+
+		/*
+		 * Fill out the reserve pool if it is empty. Use the stashed
+		 * value if it is non-zero, otherwise go with the default.
+		 */
+		xfs_restore_resvblks(mp);
+		xfs_log_work_queue(mp);
+	}
+
+	/* rw -> ro */
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/*
+		 * Before we sync the metadata, we need to free up the reserve
+		 * block pool so that the used block count in the superblock on
+		 * disk is correct at the end of the remount. Stash the current
+		 * reserve pool size so that if we get remounted rw, we can
+		 * return it to the same size.
+		 */
+		xfs_save_resvblks(mp);
+		xfs_quiesce_attr(mp);
+		mp->m_flags |= XFS_MOUNT_RDONLY;
+	}
+
+	return 0;
+}
+
+/*
+ * Second stage of a freeze. The data is already frozen so we only
+ * need to take care of the metadata. Once that's done sync the superblock
+ * to the log to dirty it in case of a crash while frozen. This ensures that we
+ * will recover the unlinked inode lists on the next mount.
+ */
+STATIC int
+xfs_fs_freeze(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	xfs_save_resvblks(mp);
+	xfs_quiesce_attr(mp);
+	return xfs_sync_sb(mp, true);
+}
+
+STATIC int
+xfs_fs_unfreeze(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	xfs_restore_resvblks(mp);
+	xfs_log_work_queue(mp);
+	return 0;
+}
+
+STATIC int
+xfs_fs_show_options(
+	struct seq_file		*m,
+	struct dentry		*root)
+{
+	return xfs_showargs(XFS_M(root->d_sb), m);
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock _has_ now been read in.
+ */
+STATIC int
+xfs_finish_flags(
+	struct xfs_mount	*mp)
+{
+	int			ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
+
+	/* Fail a mount where the logbuf is smaller than the log stripe */
+	if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+		if (mp->m_logbsize <= 0 &&
+		    mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
+			mp->m_logbsize = mp->m_sb.sb_logsunit;
+		} else if (mp->m_logbsize > 0 &&
+			   mp->m_logbsize < mp->m_sb.sb_logsunit) {
+			xfs_warn(mp,
+		"logbuf size must be greater than or equal to log stripe size");
+			return -EINVAL;
+		}
+	} else {
+		/* Fail a mount if the logbuf is larger than 32K */
+		if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
+			xfs_warn(mp,
+		"logbuf size for version 1 logs must be 16K or 32K");
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * V5 filesystems always use attr2 format for attributes.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+		xfs_warn(mp,
+"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
+			MNTOPT_NOATTR2, MNTOPT_ATTR2);
+		return -EINVAL;
+	}
+
+	/*
+	 * mkfs'ed attr2 will turn on attr2 mount unless explicitly
+	 * told by noattr2 to turn it off
+	 */
+	if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+	    !(mp->m_flags & XFS_MOUNT_NOATTR2))
+		mp->m_flags |= XFS_MOUNT_ATTR2;
+
+	/*
+	 * prohibit r/w mounts of read-only filesystems
+	 */
+	if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
+		xfs_warn(mp,
+			"cannot mount a read-only filesystem as read-write");
+		return -EROFS;
+	}
+
+	if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+	    (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
+	    !xfs_sb_version_has_pquotino(&mp->m_sb)) {
+		xfs_warn(mp,
+		  "Super block does not support project and group quota together");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+xfs_init_percpu_counters(
+	struct xfs_mount	*mp)
+{
+	int		error;
+
+	error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
+	if (error)
+		return -ENOMEM;
+
+	error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL);
+	if (error)
+		goto free_icount;
+
+	error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
+	if (error)
+		goto free_ifree;
+
+	return 0;
+
+free_ifree:
+	percpu_counter_destroy(&mp->m_ifree);
+free_icount:
+	percpu_counter_destroy(&mp->m_icount);
+	return -ENOMEM;
+}
+
+void
+xfs_reinit_percpu_counters(
+	struct xfs_mount	*mp)
+{
+	percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
+	percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
+	percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+}
+
+static void
+xfs_destroy_percpu_counters(
+	struct xfs_mount	*mp)
+{
+	percpu_counter_destroy(&mp->m_icount);
+	percpu_counter_destroy(&mp->m_ifree);
+	percpu_counter_destroy(&mp->m_fdblocks);
+}
+
+STATIC int
+xfs_fs_fill_super(
+	struct super_block	*sb,
+	void			*data,
+	int			silent)
+{
+	struct inode		*root;
+	struct xfs_mount	*mp = NULL;
+	int			flags = 0, error = -ENOMEM;
+
+	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
+	if (!mp)
+		goto out;
+
+	spin_lock_init(&mp->m_sb_lock);
+	mutex_init(&mp->m_growlock);
+	atomic_set(&mp->m_active_trans, 0);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+	mp->m_kobj.kobject.kset = xfs_kset;
+
+	mp->m_super = sb;
+	sb->s_fs_info = mp;
+
+	error = xfs_parseargs(mp, (char *)data);
+	if (error)
+		goto out_free_fsname;
+
+	sb_min_blocksize(sb, BBSIZE);
+	sb->s_xattr = xfs_xattr_handlers;
+	sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
+	sb->s_qcop = &xfs_quotactl_operations;
+	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
+#endif
+	sb->s_op = &xfs_super_operations;
+
+	if (silent)
+		flags |= XFS_MFSI_QUIET;
+
+	error = xfs_open_devices(mp);
+	if (error)
+		goto out_free_fsname;
+
+	error = xfs_init_mount_workqueues(mp);
+	if (error)
+		goto out_close_devices;
+
+	error = xfs_init_percpu_counters(mp);
+	if (error)
+		goto out_destroy_workqueues;
+
+	error = xfs_readsb(mp, flags);
+	if (error)
+		goto out_destroy_counters;
+
+	error = xfs_finish_flags(mp);
+	if (error)
+		goto out_free_sb;
+
+	error = xfs_setup_devices(mp);
+	if (error)
+		goto out_free_sb;
+
+	error = xfs_filestream_mount(mp);
+	if (error)
+		goto out_free_sb;
+
+	/*
+	 * we must configure the block size in the superblock before we run the
+	 * full mount process as the mount process can lookup and cache inodes.
+	 */
+	sb->s_magic = XFS_SB_MAGIC;
+	sb->s_blocksize = mp->m_sb.sb_blocksize;
+	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
+	sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
+	sb->s_max_links = XFS_MAXLINK;
+	sb->s_time_gran = 1;
+	set_posix_acl_flag(sb);
+
+	/* version 5 superblocks support inode version counters. */
+	if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+		sb->s_flags |= MS_I_VERSION;
+
+	error = xfs_mountfs(mp);
+	if (error)
+		goto out_filestream_unmount;
+
+	root = igrab(VFS_I(mp->m_rootip));
+	if (!root) {
+		error = -ENOENT;
+		goto out_unmount;
+	}
+	sb->s_root = d_make_root(root);
+	if (!sb->s_root) {
+		error = -ENOMEM;
+		goto out_unmount;
+	}
+
+	return 0;
+
+ out_filestream_unmount:
+	xfs_filestream_unmount(mp);
+ out_free_sb:
+	xfs_freesb(mp);
+ out_destroy_counters:
+	xfs_destroy_percpu_counters(mp);
+out_destroy_workqueues:
+	xfs_destroy_mount_workqueues(mp);
+ out_close_devices:
+	xfs_close_devices(mp);
+ out_free_fsname:
+	xfs_free_fsname(mp);
+	kfree(mp);
+ out:
+	return error;
+
+ out_unmount:
+	xfs_filestream_unmount(mp);
+	xfs_unmountfs(mp);
+	goto out_free_sb;
+}
+
+STATIC void
+xfs_fs_put_super(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	xfs_notice(mp, "Unmounting Filesystem");
+	xfs_filestream_unmount(mp);
+	xfs_unmountfs(mp);
+
+	xfs_freesb(mp);
+	xfs_destroy_percpu_counters(mp);
+	xfs_destroy_mount_workqueues(mp);
+	xfs_close_devices(mp);
+	xfs_free_fsname(mp);
+	kfree(mp);
+}
+
+STATIC struct dentry *
+xfs_fs_mount(
+	struct file_system_type	*fs_type,
+	int			flags,
+	const char		*dev_name,
+	void			*data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+}
+
+static long
+xfs_fs_nr_cached_objects(
+	struct super_block	*sb,
+	struct shrink_control	*sc)
+{
+	return xfs_reclaim_inodes_count(XFS_M(sb));
+}
+
+static long
+xfs_fs_free_cached_objects(
+	struct super_block	*sb,
+	struct shrink_control	*sc)
+{
+	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
+}
+
+static const struct super_operations xfs_super_operations = {
+	.alloc_inode		= xfs_fs_alloc_inode,
+	.destroy_inode		= xfs_fs_destroy_inode,
+	.evict_inode		= xfs_fs_evict_inode,
+	.drop_inode		= xfs_fs_drop_inode,
+	.put_super		= xfs_fs_put_super,
+	.sync_fs		= xfs_fs_sync_fs,
+	.freeze_fs		= xfs_fs_freeze,
+	.unfreeze_fs		= xfs_fs_unfreeze,
+	.statfs			= xfs_fs_statfs,
+	.remount_fs		= xfs_fs_remount,
+	.show_options		= xfs_fs_show_options,
+	.nr_cached_objects	= xfs_fs_nr_cached_objects,
+	.free_cached_objects	= xfs_fs_free_cached_objects,
+};
+
+static struct file_system_type xfs_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= "xfs",
+	.mount			= xfs_fs_mount,
+	.kill_sb		= kill_block_super,
+	.fs_flags		= FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("xfs");
+
+STATIC int __init
+xfs_init_zones(void)
+{
+
+	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
+	if (!xfs_ioend_zone)
+		goto out;
+
+	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
+						  xfs_ioend_zone);
+	if (!xfs_ioend_pool)
+		goto out_destroy_ioend_zone;
+
+	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+						"xfs_log_ticket");
+	if (!xfs_log_ticket_zone)
+		goto out_destroy_ioend_pool;
+
+	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
+						"xfs_bmap_free_item");
+	if (!xfs_bmap_free_item_zone)
+		goto out_destroy_log_ticket_zone;
+
+	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
+						"xfs_btree_cur");
+	if (!xfs_btree_cur_zone)
+		goto out_destroy_bmap_free_item_zone;
+
+	xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
+						"xfs_da_state");
+	if (!xfs_da_state_zone)
+		goto out_destroy_btree_cur_zone;
+
+	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+	if (!xfs_ifork_zone)
+		goto out_destroy_da_state_zone;
+
+	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+	if (!xfs_trans_zone)
+		goto out_destroy_ifork_zone;
+
+	xfs_log_item_desc_zone =
+		kmem_zone_init(sizeof(struct xfs_log_item_desc),
+			       "xfs_log_item_desc");
+	if (!xfs_log_item_desc_zone)
+		goto out_destroy_trans_zone;
+
+	/*
+	 * The size of the zone allocated buf log item is the maximum
+	 * size possible under XFS.  This wastes a little bit of memory,
+	 * but it is much faster.
+	 */
+	xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item),
+					   "xfs_buf_item");
+	if (!xfs_buf_item_zone)
+		goto out_destroy_log_item_desc_zone;
+
+	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+			((XFS_EFD_MAX_FAST_EXTENTS - 1) *
+				 sizeof(xfs_extent_t))), "xfs_efd_item");
+	if (!xfs_efd_zone)
+		goto out_destroy_buf_item_zone;
+
+	xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+			((XFS_EFI_MAX_FAST_EXTENTS - 1) *
+				sizeof(xfs_extent_t))), "xfs_efi_item");
+	if (!xfs_efi_zone)
+		goto out_destroy_efd_zone;
+
+	xfs_inode_zone =
+		kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
+			KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
+			xfs_fs_inode_init_once);
+	if (!xfs_inode_zone)
+		goto out_destroy_efi_zone;
+
+	xfs_ili_zone =
+		kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
+					KM_ZONE_SPREAD, NULL);
+	if (!xfs_ili_zone)
+		goto out_destroy_inode_zone;
+	xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
+					"xfs_icr");
+	if (!xfs_icreate_zone)
+		goto out_destroy_ili_zone;
+
+	return 0;
+
+ out_destroy_ili_zone:
+	kmem_zone_destroy(xfs_ili_zone);
+ out_destroy_inode_zone:
+	kmem_zone_destroy(xfs_inode_zone);
+ out_destroy_efi_zone:
+	kmem_zone_destroy(xfs_efi_zone);
+ out_destroy_efd_zone:
+	kmem_zone_destroy(xfs_efd_zone);
+ out_destroy_buf_item_zone:
+	kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_log_item_desc_zone:
+	kmem_zone_destroy(xfs_log_item_desc_zone);
+ out_destroy_trans_zone:
+	kmem_zone_destroy(xfs_trans_zone);
+ out_destroy_ifork_zone:
+	kmem_zone_destroy(xfs_ifork_zone);
+ out_destroy_da_state_zone:
+	kmem_zone_destroy(xfs_da_state_zone);
+ out_destroy_btree_cur_zone:
+	kmem_zone_destroy(xfs_btree_cur_zone);
+ out_destroy_bmap_free_item_zone:
+	kmem_zone_destroy(xfs_bmap_free_item_zone);
+ out_destroy_log_ticket_zone:
+	kmem_zone_destroy(xfs_log_ticket_zone);
+ out_destroy_ioend_pool:
+	mempool_destroy(xfs_ioend_pool);
+ out_destroy_ioend_zone:
+	kmem_zone_destroy(xfs_ioend_zone);
+ out:
+	return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_zones(void)
+{
+	/*
+	 * Make sure all delayed rcu free are flushed before we
+	 * destroy caches.
+	 */
+	rcu_barrier();
+	kmem_zone_destroy(xfs_icreate_zone);
+	kmem_zone_destroy(xfs_ili_zone);
+	kmem_zone_destroy(xfs_inode_zone);
+	kmem_zone_destroy(xfs_efi_zone);
+	kmem_zone_destroy(xfs_efd_zone);
+	kmem_zone_destroy(xfs_buf_item_zone);
+	kmem_zone_destroy(xfs_log_item_desc_zone);
+	kmem_zone_destroy(xfs_trans_zone);
+	kmem_zone_destroy(xfs_ifork_zone);
+	kmem_zone_destroy(xfs_da_state_zone);
+	kmem_zone_destroy(xfs_btree_cur_zone);
+	kmem_zone_destroy(xfs_bmap_free_item_zone);
+	kmem_zone_destroy(xfs_log_ticket_zone);
+	mempool_destroy(xfs_ioend_pool);
+	kmem_zone_destroy(xfs_ioend_zone);
+
+}
+
+STATIC int __init
+xfs_init_workqueues(void)
+{
+	/*
+	 * The allocation workqueue can be used in memory reclaim situations
+	 * (writepage path), and parallelism is only limited by the number of
+	 * AGs in all the filesystems mounted. Hence use the default large
+	 * max_active value for this workqueue.
+	 */
+	xfs_alloc_wq = alloc_workqueue("xfsalloc",
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
+	if (!xfs_alloc_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+STATIC void
+xfs_destroy_workqueues(void)
+{
+	destroy_workqueue(xfs_alloc_wq);
+}
+
+STATIC int __init
+init_xfs_fs(void)
+{
+	int			error;
+
+	printk(KERN_INFO XFS_VERSION_STRING " with "
+			 XFS_BUILD_OPTIONS " enabled\n");
+
+	xfs_dir_startup();
+
+	error = xfs_init_zones();
+	if (error)
+		goto out;
+
+	error = xfs_init_workqueues();
+	if (error)
+		goto out_destroy_zones;
+
+	error = xfs_mru_cache_init();
+	if (error)
+		goto out_destroy_wq;
+
+	error = xfs_buf_init();
+	if (error)
+		goto out_mru_cache_uninit;
+
+	error = xfs_init_procfs();
+	if (error)
+		goto out_buf_terminate;
+
+	error = xfs_sysctl_register();
+	if (error)
+		goto out_cleanup_procfs;
+
+	xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
+	if (!xfs_kset) {
+		error = -ENOMEM;
+		goto out_sysctl_unregister;;
+	}
+
+#ifdef DEBUG
+	xfs_dbg_kobj.kobject.kset = xfs_kset;
+	error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
+	if (error)
+		goto out_kset_unregister;
+#endif
+
+	error = xfs_qm_init();
+	if (error)
+		goto out_remove_kobj;
+
+	error = register_filesystem(&xfs_fs_type);
+	if (error)
+		goto out_qm_exit;
+	return 0;
+
+ out_qm_exit:
+	xfs_qm_exit();
+ out_remove_kobj:
+#ifdef DEBUG
+	xfs_sysfs_del(&xfs_dbg_kobj);
+ out_kset_unregister:
+#endif
+	kset_unregister(xfs_kset);
+ out_sysctl_unregister:
+	xfs_sysctl_unregister();
+ out_cleanup_procfs:
+	xfs_cleanup_procfs();
+ out_buf_terminate:
+	xfs_buf_terminate();
+ out_mru_cache_uninit:
+	xfs_mru_cache_uninit();
+ out_destroy_wq:
+	xfs_destroy_workqueues();
+ out_destroy_zones:
+	xfs_destroy_zones();
+ out:
+	return error;
+}
+
+STATIC void __exit
+exit_xfs_fs(void)
+{
+	xfs_qm_exit();
+	unregister_filesystem(&xfs_fs_type);
+#ifdef DEBUG
+	xfs_sysfs_del(&xfs_dbg_kobj);
+#endif
+	kset_unregister(xfs_kset);
+	xfs_sysctl_unregister();
+	xfs_cleanup_procfs();
+	xfs_buf_terminate();
+	xfs_mru_cache_uninit();
+	xfs_destroy_workqueues();
+	xfs_destroy_zones();
+}
+
+module_init(init_xfs_fs);
+module_exit(exit_xfs_fs);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
+MODULE_LICENSE("GPL");
diff --git a/kernel/fs/xfs/xfs_super.h b/kernel/fs/xfs/xfs_super.h
new file mode 100644
index 000000000..499058fea
--- /dev/null
+++ b/kernel/fs/xfs/xfs_super.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPER_H__
+#define __XFS_SUPER_H__
+
+#include <linux/exportfs.h>
+
+#ifdef CONFIG_XFS_QUOTA
+extern int xfs_qm_init(void);
+extern void xfs_qm_exit(void);
+#else
+# define xfs_qm_init()	(0)
+# define xfs_qm_exit()	do { } while (0)
+#endif
+
+#ifdef CONFIG_XFS_POSIX_ACL
+# define XFS_ACL_STRING		"ACLs, "
+# define set_posix_acl_flag(sb)	((sb)->s_flags |= MS_POSIXACL)
+#else
+# define XFS_ACL_STRING
+# define set_posix_acl_flag(sb)	do { } while (0)
+#endif
+
+#define XFS_SECURITY_STRING	"security attributes, "
+
+#ifdef CONFIG_XFS_RT
+# define XFS_REALTIME_STRING	"realtime, "
+#else
+# define XFS_REALTIME_STRING
+#endif
+
+#ifdef DEBUG
+# define XFS_DBG_STRING		"debug"
+#else
+# define XFS_DBG_STRING		"no debug"
+#endif
+
+#define XFS_VERSION_STRING	"SGI XFS"
+#define XFS_BUILD_OPTIONS	XFS_ACL_STRING \
+				XFS_SECURITY_STRING \
+				XFS_REALTIME_STRING \
+				XFS_DBG_STRING /* DBG must be last */
+
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_buftarg;
+struct block_device;
+
+extern __uint64_t xfs_max_file_offset(unsigned int);
+
+extern void xfs_flush_inodes(struct xfs_mount *mp);
+extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
+extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
+extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
+
+extern const struct export_operations xfs_export_operations;
+extern const struct xattr_handler *xfs_xattr_handlers[];
+extern const struct quotactl_ops xfs_quotactl_operations;
+
+extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
+
+#define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
+
+#endif	/* __XFS_SUPER_H__ */
diff --git a/kernel/fs/xfs/xfs_symlink.c b/kernel/fs/xfs/xfs_symlink.c
new file mode 100644
index 000000000..3df411ead
--- /dev/null
+++ b/kernel/fs/xfs/xfs_symlink.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+
+/* ----- Kernel only functions below ----- */
+STATIC int
+xfs_readlink_bmap(
+	struct xfs_inode	*ip,
+	char			*link)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	struct xfs_buf		*bp;
+	xfs_daddr_t		d;
+	char			*cur_chunk;
+	int			pathlen = ip->i_d.di_size;
+	int			nmaps = XFS_SYMLINK_MAPS;
+	int			byte_cnt;
+	int			n;
+	int			error = 0;
+	int			fsblocks = 0;
+	int			offset;
+
+	fsblocks = xfs_symlink_blocks(mp, pathlen);
+	error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+	if (error)
+		goto out;
+
+	offset = 0;
+	for (n = 0; n < nmaps; n++) {
+		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+				  &xfs_symlink_buf_ops);
+		if (!bp)
+			return -ENOMEM;
+		error = bp->b_error;
+		if (error) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			xfs_buf_relse(bp);
+
+			/* bad CRC means corrupted metadata */
+			if (error == -EFSBADCRC)
+				error = -EFSCORRUPTED;
+			goto out;
+		}
+		byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+		if (pathlen < byte_cnt)
+			byte_cnt = pathlen;
+
+		cur_chunk = bp->b_addr;
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
+							byte_cnt, bp)) {
+				error = -EFSCORRUPTED;
+				xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
+					offset, byte_cnt, ip->i_ino);
+				xfs_buf_relse(bp);
+				goto out;
+
+			}
+
+			cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+		}
+
+		memcpy(link + offset, bp->b_addr, byte_cnt);
+
+		pathlen -= byte_cnt;
+		offset += byte_cnt;
+
+		xfs_buf_relse(bp);
+	}
+	ASSERT(pathlen == 0);
+
+	link[ip->i_d.di_size] = '\0';
+	error = 0;
+
+ out:
+	return error;
+}
+
+int
+xfs_readlink(
+	struct xfs_inode *ip,
+	char		*link)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	xfs_fsize_t	pathlen;
+	int		error = 0;
+
+	trace_xfs_readlink(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	pathlen = ip->i_d.di_size;
+	if (!pathlen)
+		goto out;
+
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+			 __func__, (unsigned long long) ip->i_ino,
+			 (long long) pathlen);
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+
+	if (ip->i_df.if_flags & XFS_IFINLINE) {
+		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
+		link[pathlen] = '\0';
+	} else {
+		error = xfs_readlink_bmap(ip, link);
+	}
+
+ out:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+int
+xfs_symlink(
+	struct xfs_inode	*dp,
+	struct xfs_name		*link_name,
+	const char		*target_path,
+	umode_t			mode,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_trans	*tp = NULL;
+	struct xfs_inode	*ip = NULL;
+	int			error = 0;
+	int			pathlen;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	bool                    unlock_dp_on_error = false;
+	uint			cancel_flags;
+	int			committed;
+	xfs_fileoff_t		first_fsb;
+	xfs_filblks_t		fs_blocks;
+	int			nmaps;
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	xfs_daddr_t		d;
+	const char		*cur_chunk;
+	int			byte_cnt;
+	int			n;
+	xfs_buf_t		*bp;
+	prid_t			prid;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	uint			resblks;
+
+	*ipp = NULL;
+
+	trace_xfs_symlink(dp, link_name);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/*
+	 * Check component lengths of the target path name.
+	 */
+	pathlen = strlen(target_path);
+	if (pathlen >= MAXPATHLEN)      /* total string too long */
+		return -ENAMETOOLONG;
+
+	udqp = gdqp = NULL;
+	prid = xfs_get_initial_prid(dp);
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	error = xfs_qm_vop_dqalloc(dp,
+			xfs_kuid_to_uid(current_fsuid()),
+			xfs_kgid_to_gid(current_fsgid()), prid,
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+			&udqp, &gdqp, &pdqp);
+	if (error)
+		return error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * The symlink will fit into the inode data fork?
+	 * There can't be any attributes so we get the whole variable part.
+	 */
+	if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+		fs_blocks = 0;
+	else
+		fs_blocks = xfs_symlink_blocks(mp, pathlen);
+	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
+	if (error == -ENOSPC && fs_blocks == 0) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto out_trans_cancel;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+	unlock_dp_on_error = true;
+
+	/*
+	 * Check whether the directory allows new symlinks or not.
+	 */
+	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
+		error = -EPERM;
+		goto out_trans_cancel;
+	}
+
+	/*
+	 * Reserve disk quota : blocks and inode.
+	 */
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+						pdqp, resblks, 1, 0);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * Check for ability to enter directory entry, if no space reserved.
+	 */
+	if (!resblks) {
+		error = xfs_dir_canenter(tp, dp, link_name);
+		if (error)
+			goto out_trans_cancel;
+	}
+	/*
+	 * Initialize the bmap freelist prior to calling either
+	 * bmapi or the directory create code.
+	 */
+	xfs_bmap_init(&free_list, &first_block);
+
+	/*
+	 * Allocate an inode for the symlink.
+	 */
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * Now we join the directory inode to the transaction.  We do not do it
+	 * earlier because xfs_dir_ialloc might commit the previous transaction
+	 * (and release all the locks).  An error from here on will result in
+	 * the transaction cancel unlocking dp so don't do it explicitly in the
+	 * error path.
+	 */
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	unlock_dp_on_error = false;
+
+	/*
+	 * Also attach the dquot(s) to it, if applicable.
+	 */
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+	if (resblks)
+		resblks -= XFS_IALLOC_SPACE_RES(mp);
+	/*
+	 * If the symlink will fit into the inode, write it inline.
+	 */
+	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
+		memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
+		ip->i_d.di_size = pathlen;
+
+		/*
+		 * The inode was initially created in extent format.
+		 */
+		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+		ip->i_df.if_flags |= XFS_IFINLINE;
+
+		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+
+	} else {
+		int	offset;
+
+		first_fsb = 0;
+		nmaps = XFS_SYMLINK_MAPS;
+
+		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+				  XFS_BMAPI_METADATA, &first_block, resblks,
+				  mval, &nmaps, &free_list);
+		if (error)
+			goto out_bmap_cancel;
+
+		if (resblks)
+			resblks -= fs_blocks;
+		ip->i_d.di_size = pathlen;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+		cur_chunk = target_path;
+		offset = 0;
+		for (n = 0; n < nmaps; n++) {
+			char	*buf;
+
+			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					       BTOBB(byte_cnt), 0);
+			if (!bp) {
+				error = -ENOMEM;
+				goto out_bmap_cancel;
+			}
+			bp->b_ops = &xfs_symlink_buf_ops;
+
+			byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+			byte_cnt = min(byte_cnt, pathlen);
+
+			buf = bp->b_addr;
+			buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
+						   byte_cnt, bp);
+
+			memcpy(buf, cur_chunk, byte_cnt);
+
+			cur_chunk += byte_cnt;
+			pathlen -= byte_cnt;
+			offset += byte_cnt;
+
+			xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+			xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+							(char *)bp->b_addr);
+		}
+		ASSERT(pathlen == 0);
+	}
+
+	/*
+	 * Create the directory entry for the symlink.
+	 */
+	error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
+					&first_block, &free_list, resblks);
+	if (error)
+		goto out_bmap_cancel;
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * symlink transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto out_bmap_cancel;
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		goto out_release_inode;
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	*ipp = ip;
+	return 0;
+
+out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+out_release_inode:
+	/*
+	 * Wait until after the current transaction is aborted to finish the
+	 * setup of the inode and release the inode.  This prevents recursive
+	 * transactions and deadlocks from xfs_inactive.
+	 */
+	if (ip) {
+		xfs_finish_inode_setup(ip);
+		IRELE(ip);
+	}
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	if (unlock_dp_on_error)
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Free a symlink that has blocks associated with it.
+ */
+STATIC int
+xfs_inactive_symlink_rmt(
+	struct xfs_inode *ip)
+{
+	xfs_buf_t	*bp;
+	int		committed;
+	int		done;
+	int		error;
+	xfs_fsblock_t	first_block;
+	xfs_bmap_free_t	free_list;
+	int		i;
+	xfs_mount_t	*mp;
+	xfs_bmbt_irec_t	mval[XFS_SYMLINK_MAPS];
+	int		nmaps;
+	int		size;
+	xfs_trans_t	*tp;
+
+	mp = ip->i_mount;
+	ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
+	/*
+	 * We're freeing a symlink that has some
+	 * blocks allocated to it.  Free the
+	 * blocks here.  We know that we've got
+	 * either 1 or 2 extents and that we can
+	 * free them all in one bunmapi call.
+	 */
+	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/*
+	 * Lock the inode, fix the size, and join it to the transaction.
+	 * Hold it so in the normal path, we still have it locked for
+	 * the second transaction.  In the error paths we need it
+	 * held so the cancel won't rele it, see below.
+	 */
+	size = (int)ip->i_d.di_size;
+	ip->i_d.di_size = 0;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	/*
+	 * Find the block(s) so we can inval and unmap them.
+	 */
+	done = 0;
+	xfs_bmap_init(&free_list, &first_block);
+	nmaps = ARRAY_SIZE(mval);
+	error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
+				mval, &nmaps, 0);
+	if (error)
+		goto error_trans_cancel;
+	/*
+	 * Invalidate the block(s). No validation is done.
+	 */
+	for (i = 0; i < nmaps; i++) {
+		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error_bmap_cancel;
+		}
+		xfs_trans_binval(tp, bp);
+	}
+	/*
+	 * Unmap the dead block(s) to the free_list.
+	 */
+	error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+			    &first_block, &free_list, &done);
+	if (error)
+		goto error_bmap_cancel;
+	ASSERT(done);
+	/*
+	 * Commit the first transaction.  This logs the EFI and the inode.
+	 */
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto error_bmap_cancel;
+	/*
+	 * The transaction must have been committed, since there were
+	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
+	 * The new tp has the extent freeing and EFDs.
+	 */
+	ASSERT(committed);
+	/*
+	 * The first xact was committed, so add the inode to the new one.
+	 * Mark it dirty so it will be logged and moved forward in the log as
+	 * part of every commit.
+	 */
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	/*
+	 * Commit the transaction containing extent freeing and EFDs.
+	 */
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		goto error_unlock;
+	}
+
+	/*
+	 * Remove the memory for extent descriptions (just bookkeeping).
+	 */
+	if (ip->i_df.if_bytes)
+		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
+	ASSERT(ip->i_df.if_bytes == 0);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+
+error_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+error_trans_cancel:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * xfs_inactive_symlink - free a symlink
+ */
+int
+xfs_inactive_symlink(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			pathlen;
+
+	trace_xfs_inactive_symlink(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Zero length symlinks _can_ exist.
+	 */
+	pathlen = (int)ip->i_d.di_size;
+	if (!pathlen) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return 0;
+	}
+
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
+			 __func__, (unsigned long long)ip->i_ino, pathlen);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	if (ip->i_df.if_flags & XFS_IFINLINE) {
+		if (ip->i_df.if_bytes > 0) 
+			xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
+					  XFS_DATA_FORK);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		ASSERT(ip->i_df.if_bytes == 0);
+		return 0;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/* remove the remote symlink */
+	return xfs_inactive_symlink_rmt(ip);
+}
diff --git a/kernel/fs/xfs/xfs_symlink.h b/kernel/fs/xfs/xfs_symlink.h
new file mode 100644
index 000000000..e75245d09
--- /dev/null
+++ b/kernel/fs/xfs/xfs_symlink.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2012 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SYMLINK_H
+#define __XFS_SYMLINK_H 1
+
+/* Kernel only symlink defintions */
+
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
+		const char *target_path, umode_t mode, struct xfs_inode **ipp);
+int xfs_readlink(struct xfs_inode *ip, char *link);
+int xfs_inactive_symlink(struct xfs_inode *ip);
+
+#endif /* __XFS_SYMLINK_H */
diff --git a/kernel/fs/xfs/xfs_sysctl.c b/kernel/fs/xfs/xfs_sysctl.c
new file mode 100644
index 000000000..a0c8067ce
--- /dev/null
+++ b/kernel/fs/xfs/xfs_sysctl.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include "xfs_error.h"
+
+static struct ctl_table_header *xfs_table_header;
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+xfs_stats_clear_proc_handler(
+	struct ctl_table	*ctl,
+	int			write,
+	void			__user *buffer,
+	size_t			*lenp,
+	loff_t			*ppos)
+{
+	int		c, ret, *valp = ctl->data;
+	__uint32_t	vn_active;
+
+	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+
+	if (!ret && write && *valp) {
+		xfs_notice(NULL, "Clearing xfsstats");
+		for_each_possible_cpu(c) {
+			preempt_disable();
+			/* save vn_active, it's a universal truth! */
+			vn_active = per_cpu(xfsstats, c).vn_active;
+			memset(&per_cpu(xfsstats, c), 0,
+			       sizeof(struct xfsstats));
+			per_cpu(xfsstats, c).vn_active = vn_active;
+			preempt_enable();
+		}
+		xfs_stats_clear = 0;
+	}
+
+	return ret;
+}
+
+STATIC int
+xfs_panic_mask_proc_handler(
+	struct ctl_table	*ctl,
+	int			write,
+	void			__user *buffer,
+	size_t			*lenp,
+	loff_t			*ppos)
+{
+	int		ret, *valp = ctl->data;
+
+	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+	if (!ret && write) {
+		xfs_panic_mask = *valp;
+#ifdef DEBUG
+		xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+	}
+	return ret;
+}
+#endif /* CONFIG_PROC_FS */
+
+static struct ctl_table xfs_table[] = {
+	{
+		.procname	= "irix_sgid_inherit",
+		.data		= &xfs_params.sgid_inherit.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.sgid_inherit.min,
+		.extra2		= &xfs_params.sgid_inherit.max
+	},
+	{
+		.procname	= "irix_symlink_mode",
+		.data		= &xfs_params.symlink_mode.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.symlink_mode.min,
+		.extra2		= &xfs_params.symlink_mode.max
+	},
+	{
+		.procname	= "panic_mask",
+		.data		= &xfs_params.panic_mask.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= xfs_panic_mask_proc_handler,
+		.extra1		= &xfs_params.panic_mask.min,
+		.extra2		= &xfs_params.panic_mask.max
+	},
+
+	{
+		.procname	= "error_level",
+		.data		= &xfs_params.error_level.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.error_level.min,
+		.extra2		= &xfs_params.error_level.max
+	},
+	{
+		.procname	= "xfssyncd_centisecs",
+		.data		= &xfs_params.syncd_timer.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.syncd_timer.min,
+		.extra2		= &xfs_params.syncd_timer.max
+	},
+	{
+		.procname	= "inherit_sync",
+		.data		= &xfs_params.inherit_sync.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.inherit_sync.min,
+		.extra2		= &xfs_params.inherit_sync.max
+	},
+	{
+		.procname	= "inherit_nodump",
+		.data		= &xfs_params.inherit_nodump.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.inherit_nodump.min,
+		.extra2		= &xfs_params.inherit_nodump.max
+	},
+	{
+		.procname	= "inherit_noatime",
+		.data		= &xfs_params.inherit_noatim.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.inherit_noatim.min,
+		.extra2		= &xfs_params.inherit_noatim.max
+	},
+	{
+		.procname	= "inherit_nosymlinks",
+		.data		= &xfs_params.inherit_nosym.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.inherit_nosym.min,
+		.extra2		= &xfs_params.inherit_nosym.max
+	},
+	{
+		.procname	= "rotorstep",
+		.data		= &xfs_params.rotorstep.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.rotorstep.min,
+		.extra2		= &xfs_params.rotorstep.max
+	},
+	{
+		.procname	= "inherit_nodefrag",
+		.data		= &xfs_params.inherit_nodfrg.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.inherit_nodfrg.min,
+		.extra2		= &xfs_params.inherit_nodfrg.max
+	},
+	{
+		.procname	= "filestream_centisecs",
+		.data		= &xfs_params.fstrm_timer.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.fstrm_timer.min,
+		.extra2		= &xfs_params.fstrm_timer.max,
+	},
+	{
+		.procname	= "speculative_prealloc_lifetime",
+		.data		= &xfs_params.eofb_timer.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.eofb_timer.min,
+		.extra2		= &xfs_params.eofb_timer.max,
+	},
+	/* please keep this the last entry */
+#ifdef CONFIG_PROC_FS
+	{
+		.procname	= "stats_clear",
+		.data		= &xfs_params.stats_clear.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= xfs_stats_clear_proc_handler,
+		.extra1		= &xfs_params.stats_clear.min,
+		.extra2		= &xfs_params.stats_clear.max
+	},
+#endif /* CONFIG_PROC_FS */
+
+	{}
+};
+
+static struct ctl_table xfs_dir_table[] = {
+	{
+		.procname	= "xfs",
+		.mode		= 0555,
+		.child		= xfs_table
+	},
+	{}
+};
+
+static struct ctl_table xfs_root_table[] = {
+	{
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= xfs_dir_table
+	},
+	{}
+};
+
+int
+xfs_sysctl_register(void)
+{
+	xfs_table_header = register_sysctl_table(xfs_root_table);
+	if (!xfs_table_header)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+xfs_sysctl_unregister(void)
+{
+	unregister_sysctl_table(xfs_table_header);
+}
diff --git a/kernel/fs/xfs/xfs_sysctl.h b/kernel/fs/xfs/xfs_sysctl.h
new file mode 100644
index 000000000..ffef45375
--- /dev/null
+++ b/kernel/fs/xfs/xfs_sysctl.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SYSCTL_H__
+#define __XFS_SYSCTL_H__
+
+#include <linux/sysctl.h>
+
+/*
+ * Tunable xfs parameters
+ */
+
+typedef struct xfs_sysctl_val {
+	int min;
+	int val;
+	int max;
+} xfs_sysctl_val_t;
+
+typedef struct xfs_param {
+	xfs_sysctl_val_t sgid_inherit;	/* Inherit S_ISGID if process' GID is
+					 * not a member of parent dir GID. */
+	xfs_sysctl_val_t symlink_mode;	/* Link creat mode affected by umask */
+	xfs_sysctl_val_t panic_mask;	/* bitmask to cause panic on errors. */
+	xfs_sysctl_val_t error_level;	/* Degree of reporting for problems  */
+	xfs_sysctl_val_t syncd_timer;	/* Interval between xfssyncd wakeups */
+	xfs_sysctl_val_t stats_clear;	/* Reset all XFS statistics to zero. */
+	xfs_sysctl_val_t inherit_sync;	/* Inherit the "sync" inode flag. */
+	xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
+	xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
+	xfs_sysctl_val_t xfs_buf_timer;	/* Interval between xfsbufd wakeups. */
+	xfs_sysctl_val_t xfs_buf_age;	/* Metadata buffer age before flush. */
+	xfs_sysctl_val_t inherit_nosym;	/* Inherit the "nosymlinks" flag. */
+	xfs_sysctl_val_t rotorstep;	/* inode32 AG rotoring control knob */
+	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
+	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
+	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
+} xfs_param_t;
+
+/*
+ * xfs_error_level:
+ *
+ * How much error reporting will be done when internal problems are
+ * encountered.  These problems normally return an EFSCORRUPTED to their
+ * caller, with no other information reported.
+ *
+ * 0	No error reports
+ * 1	Report EFSCORRUPTED errors that will cause a filesystem shutdown
+ * 5	Report all EFSCORRUPTED errors (all of the above errors, plus any
+ *	additional errors that are known to not cause shutdowns)
+ *
+ * xfs_panic_mask bit 0x8 turns the error reports into panics
+ */
+
+enum {
+	/* XFS_REFCACHE_SIZE = 1 */
+	/* XFS_REFCACHE_PURGE = 2 */
+	/* XFS_RESTRICT_CHOWN = 3 */
+	XFS_SGID_INHERIT = 4,
+	XFS_SYMLINK_MODE = 5,
+	XFS_PANIC_MASK = 6,
+	XFS_ERRLEVEL = 7,
+	XFS_SYNCD_TIMER = 8,
+	/* XFS_PROBE_DMAPI = 9 */
+	/* XFS_PROBE_IOOPS = 10 */
+	/* XFS_PROBE_QUOTA = 11 */
+	XFS_STATS_CLEAR = 12,
+	XFS_INHERIT_SYNC = 13,
+	XFS_INHERIT_NODUMP = 14,
+	XFS_INHERIT_NOATIME = 15,
+	XFS_BUF_TIMER = 16,
+	XFS_BUF_AGE = 17,
+	/* XFS_IO_BYPASS = 18 */
+	XFS_INHERIT_NOSYM = 19,
+	XFS_ROTORSTEP = 20,
+	XFS_INHERIT_NODFRG = 21,
+	XFS_FILESTREAM_TIMER = 22,
+};
+
+extern xfs_param_t	xfs_params;
+
+struct xfs_globals {
+	int	log_recovery_delay;	/* log recovery delay (secs) */
+};
+extern struct xfs_globals	xfs_globals;
+
+#ifdef CONFIG_SYSCTL
+extern int xfs_sysctl_register(void);
+extern void xfs_sysctl_unregister(void);
+#else
+# define xfs_sysctl_register()		(0)
+# define xfs_sysctl_unregister()	do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+#endif /* __XFS_SYSCTL_H__ */
diff --git a/kernel/fs/xfs/xfs_sysfs.c b/kernel/fs/xfs/xfs_sysfs.c
new file mode 100644
index 000000000..aa0367085
--- /dev/null
+++ b/kernel/fs/xfs/xfs_sysfs.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_sysfs.h"
+#include "xfs_log_format.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+
+struct xfs_sysfs_attr {
+	struct attribute attr;
+	ssize_t (*show)(char *buf, void *data);
+	ssize_t (*store)(const char *buf, size_t count, void *data);
+};
+
+static inline struct xfs_sysfs_attr *
+to_attr(struct attribute *attr)
+{
+	return container_of(attr, struct xfs_sysfs_attr, attr);
+}
+
+#define XFS_SYSFS_ATTR_RW(name) \
+	static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
+#define XFS_SYSFS_ATTR_RO(name) \
+	static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+
+#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
+
+/*
+ * xfs_mount kobject. This currently has no attributes and thus no need for show
+ * and store helpers. The mp kobject serves as the per-mount parent object that
+ * is identified by the fsname under sysfs.
+ */
+
+struct kobj_type xfs_mp_ktype = {
+	.release = xfs_sysfs_release,
+};
+
+#ifdef DEBUG
+/* debug */
+
+STATIC ssize_t
+log_recovery_delay_store(
+	const char	*buf,
+	size_t		count,
+	void		*data)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < 0 || val > 60)
+		return -EINVAL;
+
+	xfs_globals.log_recovery_delay = val;
+
+	return count;
+}
+
+STATIC ssize_t
+log_recovery_delay_show(
+	char	*buf,
+	void	*data)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
+}
+XFS_SYSFS_ATTR_RW(log_recovery_delay);
+
+static struct attribute *xfs_dbg_attrs[] = {
+	ATTR_LIST(log_recovery_delay),
+	NULL,
+};
+
+STATIC ssize_t
+xfs_dbg_show(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	char			*buf)
+{
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+}
+
+STATIC ssize_t
+xfs_dbg_store(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	const char		*buf,
+	size_t			count)
+{
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+}
+
+static struct sysfs_ops xfs_dbg_ops = {
+	.show = xfs_dbg_show,
+	.store = xfs_dbg_store,
+};
+
+struct kobj_type xfs_dbg_ktype = {
+	.release = xfs_sysfs_release,
+	.sysfs_ops = &xfs_dbg_ops,
+	.default_attrs = xfs_dbg_attrs,
+};
+
+#endif /* DEBUG */
+
+/* xlog */
+
+STATIC ssize_t
+log_head_lsn_show(
+	char	*buf,
+	void	*data)
+{
+	struct xlog *log = data;
+	int cycle;
+	int block;
+
+	spin_lock(&log->l_icloglock);
+	cycle = log->l_curr_cycle;
+	block = log->l_curr_block;
+	spin_unlock(&log->l_icloglock);
+
+	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_head_lsn);
+
+STATIC ssize_t
+log_tail_lsn_show(
+	char	*buf,
+	void	*data)
+{
+	struct xlog *log = data;
+	int cycle;
+	int block;
+
+	xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
+	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_tail_lsn);
+
+STATIC ssize_t
+reserve_grant_head_show(
+	char	*buf,
+	void	*data)
+{
+	struct xlog *log = data;
+	int cycle;
+	int bytes;
+
+	xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
+	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(reserve_grant_head);
+
+STATIC ssize_t
+write_grant_head_show(
+	char	*buf,
+	void	*data)
+{
+	struct xlog *log = data;
+	int cycle;
+	int bytes;
+
+	xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
+	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(write_grant_head);
+
+static struct attribute *xfs_log_attrs[] = {
+	ATTR_LIST(log_head_lsn),
+	ATTR_LIST(log_tail_lsn),
+	ATTR_LIST(reserve_grant_head),
+	ATTR_LIST(write_grant_head),
+	NULL,
+};
+
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+	return container_of(kobj, struct xlog, l_kobj);
+}
+
+STATIC ssize_t
+xfs_log_show(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	char			*buf)
+{
+	struct xlog *log = to_xlog(kobject);
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
+}
+
+STATIC ssize_t
+xfs_log_store(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	const char		*buf,
+	size_t			count)
+{
+	struct xlog *log = to_xlog(kobject);
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
+}
+
+static struct sysfs_ops xfs_log_ops = {
+	.show = xfs_log_show,
+	.store = xfs_log_store,
+};
+
+struct kobj_type xfs_log_ktype = {
+	.release = xfs_sysfs_release,
+	.sysfs_ops = &xfs_log_ops,
+	.default_attrs = xfs_log_attrs,
+};
diff --git a/kernel/fs/xfs/xfs_sysfs.h b/kernel/fs/xfs/xfs_sysfs.h
new file mode 100644
index 000000000..240eee35f
--- /dev/null
+++ b/kernel/fs/xfs/xfs_sysfs.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __XFS_SYSFS_H__
+#define __XFS_SYSFS_H__
+
+extern struct kobj_type xfs_mp_ktype;	/* xfs_mount */
+extern struct kobj_type xfs_dbg_ktype;	/* debug */
+extern struct kobj_type xfs_log_ktype;	/* xlog */
+
+static inline struct xfs_kobj *
+to_kobj(struct kobject *kobject)
+{
+	return container_of(kobject, struct xfs_kobj, kobject);
+}
+
+static inline void
+xfs_sysfs_release(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+	complete(&kobj->complete);
+}
+
+static inline int
+xfs_sysfs_init(
+	struct xfs_kobj		*kobj,
+	struct kobj_type	*ktype,
+	struct xfs_kobj		*parent_kobj,
+	const char		*name)
+{
+	init_completion(&kobj->complete);
+	return kobject_init_and_add(&kobj->kobject, ktype,
+				    &parent_kobj->kobject, "%s", name);
+}
+
+static inline void
+xfs_sysfs_del(
+	struct xfs_kobj	*kobj)
+{
+	kobject_del(&kobj->kobject);
+	kobject_put(&kobj->kobject);
+	wait_for_completion(&kobj->complete);
+}
+
+#endif	/* __XFS_SYSFS_H__ */
diff --git a/kernel/fs/xfs/xfs_trace.c b/kernel/fs/xfs/xfs_trace.c
new file mode 100644
index 000000000..13a029806
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trace.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_da_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_quota.h"
+#include "xfs_iomap.h"
+#include "xfs_aops.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_filestream.h"
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "xfs_trace.h"
diff --git a/kernel/fs/xfs/xfs_trace.h b/kernel/fs/xfs/xfs_trace.h
new file mode 100644
index 000000000..615781bf4
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trace.h
@@ -0,0 +1,2083 @@
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs
+
+#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_H
+
+#include <linux/tracepoint.h>
+
+struct xfs_agf;
+struct xfs_alloc_arg;
+struct xfs_attr_list_context;
+struct xfs_buf_log_item;
+struct xfs_da_args;
+struct xfs_da_node_entry;
+struct xfs_dquot;
+struct xfs_log_item;
+struct xlog;
+struct xlog_ticket;
+struct xlog_recover;
+struct xlog_recover_item;
+struct xfs_buf_log_format;
+struct xfs_inode_log_format;
+struct xfs_bmbt_irec;
+
+DECLARE_EVENT_CLASS(xfs_attr_list_class,
+	TP_PROTO(struct xfs_attr_list_context *ctx),
+	TP_ARGS(ctx),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(u32, hashval)
+		__field(u32, blkno)
+		__field(u32, offset)
+		__field(void *, alist)
+		__field(int, bufsize)
+		__field(int, count)
+		__field(int, firstu)
+		__field(int, dupcnt)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+		__entry->ino = ctx->dp->i_ino;
+		__entry->hashval = ctx->cursor->hashval;
+		__entry->blkno = ctx->cursor->blkno;
+		__entry->offset = ctx->cursor->offset;
+		__entry->alist = ctx->alist;
+		__entry->bufsize = ctx->bufsize;
+		__entry->count = ctx->count;
+		__entry->firstu = ctx->firstu;
+		__entry->flags = ctx->flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+		  "alist 0x%p size %u count %u firstu %u flags %d %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		   __entry->ino,
+		   __entry->hashval,
+		   __entry->blkno,
+		   __entry->offset,
+		   __entry->dupcnt,
+		   __entry->alist,
+		   __entry->bufsize,
+		   __entry->count,
+		   __entry->firstu,
+		   __entry->flags,
+		   __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
+	)
+)
+
+#define DEFINE_ATTR_LIST_EVENT(name) \
+DEFINE_EVENT(xfs_attr_list_class, name, \
+	TP_PROTO(struct xfs_attr_list_context *ctx), \
+	TP_ARGS(ctx))
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
+
+DECLARE_EVENT_CLASS(xfs_perag_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, agno, refcount, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, refcount)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->refcount = refcount;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno %u refcount %d caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->refcount,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name)	\
+DEFINE_EVENT(xfs_perag_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,	\
+		 unsigned long caller_ip),					\
+	TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
+
+DECLARE_EVENT_CLASS(xfs_ag_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
+	TP_ARGS(mp, agno),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+	),
+	TP_printk("dev %d:%d agno %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno)
+);
+#define DEFINE_AG_EVENT(name)	\
+DEFINE_EVENT(xfs_ag_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),	\
+	TP_ARGS(mp, agno))
+
+DEFINE_AG_EVENT(xfs_read_agf);
+DEFINE_AG_EVENT(xfs_alloc_read_agf);
+DEFINE_AG_EVENT(xfs_read_agi);
+DEFINE_AG_EVENT(xfs_ialloc_read_agi);
+
+TRACE_EVENT(xfs_attr_list_node_descend,
+	TP_PROTO(struct xfs_attr_list_context *ctx,
+		 struct xfs_da_node_entry *btree),
+	TP_ARGS(ctx, btree),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(u32, hashval)
+		__field(u32, blkno)
+		__field(u32, offset)
+		__field(void *, alist)
+		__field(int, bufsize)
+		__field(int, count)
+		__field(int, firstu)
+		__field(int, dupcnt)
+		__field(int, flags)
+		__field(u32, bt_hashval)
+		__field(u32, bt_before)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+		__entry->ino = ctx->dp->i_ino;
+		__entry->hashval = ctx->cursor->hashval;
+		__entry->blkno = ctx->cursor->blkno;
+		__entry->offset = ctx->cursor->offset;
+		__entry->alist = ctx->alist;
+		__entry->bufsize = ctx->bufsize;
+		__entry->count = ctx->count;
+		__entry->firstu = ctx->firstu;
+		__entry->flags = ctx->flags;
+		__entry->bt_hashval = be32_to_cpu(btree->hashval);
+		__entry->bt_before = be32_to_cpu(btree->before);
+	),
+	TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+		  "alist 0x%p size %u count %u firstu %u flags %d %s "
+		  "node hashval %u, node before %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		   __entry->ino,
+		   __entry->hashval,
+		   __entry->blkno,
+		   __entry->offset,
+		   __entry->dupcnt,
+		   __entry->alist,
+		   __entry->bufsize,
+		   __entry->count,
+		   __entry->firstu,
+		   __entry->flags,
+		   __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+		   __entry->bt_hashval,
+		   __entry->bt_before)
+);
+
+TRACE_EVENT(xfs_iext_insert,
+	TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
+		 struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
+	TP_ARGS(ip, idx, r, state, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_extnum_t, idx)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+		__field(xfs_exntst_t, state)
+		__field(int, bmap_state)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->idx = idx;
+		__entry->startoff = r->br_startoff;
+		__entry->startblock = r->br_startblock;
+		__entry->blockcount = r->br_blockcount;
+		__entry->state = r->br_state;
+		__entry->bmap_state = state;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+		  "offset %lld block %lld count %lld flag %d caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+		  (long)__entry->idx,
+		  __entry->startoff,
+		  (__int64_t)__entry->startblock,
+		  __entry->blockcount,
+		  __entry->state,
+		  (char *)__entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_bmap_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
+		 unsigned long caller_ip),
+	TP_ARGS(ip, idx, state, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_extnum_t, idx)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+		__field(xfs_exntst_t, state)
+		__field(int, bmap_state)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ?
+						ip->i_afp : &ip->i_df;
+		struct xfs_bmbt_irec	r;
+
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->idx = idx;
+		__entry->startoff = r.br_startoff;
+		__entry->startblock = r.br_startblock;
+		__entry->blockcount = r.br_blockcount;
+		__entry->state = r.br_state;
+		__entry->bmap_state = state;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+		  "offset %lld block %lld count %lld flag %d caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+		  (long)__entry->idx,
+		  __entry->startoff,
+		  (__int64_t)__entry->startblock,
+		  __entry->blockcount,
+		  __entry->state,
+		  (char *)__entry->caller_ip)
+)
+
+#define DEFINE_BMAP_EVENT(name) \
+DEFINE_EVENT(xfs_bmap_class, name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
+		 unsigned long caller_ip), \
+	TP_ARGS(ip, idx, state, caller_ip))
+DEFINE_BMAP_EVENT(xfs_iext_remove);
+DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
+DEFINE_BMAP_EVENT(xfs_bmap_post_update);
+DEFINE_BMAP_EVENT(xfs_extlist);
+
+DECLARE_EVENT_CLASS(xfs_buf_class,
+	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
+	TP_ARGS(bp, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, bno)
+		__field(int, nblks)
+		__field(int, hold)
+		__field(int, pincount)
+		__field(unsigned, lockval)
+		__field(unsigned, flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = bp->b_target->bt_dev;
+		__entry->bno = bp->b_bn;
+		__entry->nblks = bp->b_length;
+		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->pincount = atomic_read(&bp->b_pin_count);
+		__entry->lockval = bp->b_sema.count;
+		__entry->flags = bp->b_flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
+		  "lock %d flags %s caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->bno,
+		  __entry->nblks,
+		  __entry->hold,
+		  __entry->pincount,
+		  __entry->lockval,
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+		  (void *)__entry->caller_ip)
+)
+
+#define DEFINE_BUF_EVENT(name) \
+DEFINE_EVENT(xfs_buf_class, name, \
+	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
+	TP_ARGS(bp, caller_ip))
+DEFINE_BUF_EVENT(xfs_buf_init);
+DEFINE_BUF_EVENT(xfs_buf_free);
+DEFINE_BUF_EVENT(xfs_buf_hold);
+DEFINE_BUF_EVENT(xfs_buf_rele);
+DEFINE_BUF_EVENT(xfs_buf_iodone);
+DEFINE_BUF_EVENT(xfs_buf_submit);
+DEFINE_BUF_EVENT(xfs_buf_submit_wait);
+DEFINE_BUF_EVENT(xfs_buf_bawrite);
+DEFINE_BUF_EVENT(xfs_buf_lock);
+DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_trylock);
+DEFINE_BUF_EVENT(xfs_buf_unlock);
+DEFINE_BUF_EVENT(xfs_buf_iowait);
+DEFINE_BUF_EVENT(xfs_buf_iowait_done);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
+DEFINE_BUF_EVENT(xfs_buf_delwri_split);
+DEFINE_BUF_EVENT(xfs_buf_get_uncached);
+DEFINE_BUF_EVENT(xfs_bdstrat_shut);
+DEFINE_BUF_EVENT(xfs_buf_item_relse);
+DEFINE_BUF_EVENT(xfs_buf_item_iodone);
+DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
+
+/* not really buffer traces, but the buf provides useful information */
+DEFINE_BUF_EVENT(xfs_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_reset_dqcounts);
+DEFINE_BUF_EVENT(xfs_inode_item_push);
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(xfs_buf_flags_class,
+	TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
+	TP_ARGS(bp, flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, bno)
+		__field(size_t, buffer_length)
+		__field(int, hold)
+		__field(int, pincount)
+		__field(unsigned, lockval)
+		__field(unsigned, flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = bp->b_target->bt_dev;
+		__entry->bno = bp->b_bn;
+		__entry->buffer_length = BBTOB(bp->b_length);
+		__entry->flags = flags;
+		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->pincount = atomic_read(&bp->b_pin_count);
+		__entry->lockval = bp->b_sema.count;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+		  "lock %d flags %s caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->bno,
+		  __entry->buffer_length,
+		  __entry->hold,
+		  __entry->pincount,
+		  __entry->lockval,
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+		  (void *)__entry->caller_ip)
+)
+
+#define DEFINE_BUF_FLAGS_EVENT(name) \
+DEFINE_EVENT(xfs_buf_flags_class, name, \
+	TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
+	TP_ARGS(bp, flags, caller_ip))
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
+
+TRACE_EVENT(xfs_buf_ioerror,
+	TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip),
+	TP_ARGS(bp, error, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, bno)
+		__field(size_t, buffer_length)
+		__field(unsigned, flags)
+		__field(int, hold)
+		__field(int, pincount)
+		__field(unsigned, lockval)
+		__field(int, error)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = bp->b_target->bt_dev;
+		__entry->bno = bp->b_bn;
+		__entry->buffer_length = BBTOB(bp->b_length);
+		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->pincount = atomic_read(&bp->b_pin_count);
+		__entry->lockval = bp->b_sema.count;
+		__entry->error = error;
+		__entry->flags = bp->b_flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+		  "lock %d error %d flags %s caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->bno,
+		  __entry->buffer_length,
+		  __entry->hold,
+		  __entry->pincount,
+		  __entry->lockval,
+		  __entry->error,
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+		  (void *)__entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_buf_item_class,
+	TP_PROTO(struct xfs_buf_log_item *bip),
+	TP_ARGS(bip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, buf_bno)
+		__field(size_t, buf_len)
+		__field(int, buf_hold)
+		__field(int, buf_pincount)
+		__field(int, buf_lockval)
+		__field(unsigned, buf_flags)
+		__field(unsigned, bli_recur)
+		__field(int, bli_refcount)
+		__field(unsigned, bli_flags)
+		__field(void *, li_desc)
+		__field(unsigned, li_flags)
+	),
+	TP_fast_assign(
+		__entry->dev = bip->bli_buf->b_target->bt_dev;
+		__entry->bli_flags = bip->bli_flags;
+		__entry->bli_recur = bip->bli_recur;
+		__entry->bli_refcount = atomic_read(&bip->bli_refcount);
+		__entry->buf_bno = bip->bli_buf->b_bn;
+		__entry->buf_len = BBTOB(bip->bli_buf->b_length);
+		__entry->buf_flags = bip->bli_buf->b_flags;
+		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
+		__entry->buf_lockval = bip->bli_buf->b_sema.count;
+		__entry->li_desc = bip->bli_item.li_desc;
+		__entry->li_flags = bip->bli_item.li_flags;
+	),
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+		  "lock %d flags %s recur %d refcount %d bliflags %s "
+		  "lidesc 0x%p liflags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->buf_bno,
+		  __entry->buf_len,
+		  __entry->buf_hold,
+		  __entry->buf_pincount,
+		  __entry->buf_lockval,
+		  __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
+		  __entry->bli_recur,
+		  __entry->bli_refcount,
+		  __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
+		  __entry->li_desc,
+		  __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_BUF_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_buf_item_class, name, \
+	TP_PROTO(struct xfs_buf_log_item *bip), \
+	TP_ARGS(bip))
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
+
+DECLARE_EVENT_CLASS(xfs_filestream_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
+	TP_ARGS(ip, agno),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_agnumber_t, agno)
+		__field(int, streams)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->agno = agno;
+		__entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+	),
+	TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->agno,
+		  __entry->streams)
+)
+#define DEFINE_FILESTREAM_EVENT(name) \
+DEFINE_EVENT(xfs_filestream_class, name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \
+	TP_ARGS(ip, agno))
+DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
+
+TRACE_EVENT(xfs_filestream_pick,
+	TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno,
+		 xfs_extlen_t free, int nscan),
+	TP_ARGS(ip, agno, free, nscan),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_agnumber_t, agno)
+		__field(int, streams)
+		__field(xfs_extlen_t, free)
+		__field(int, nscan)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->agno = agno;
+		__entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+		__entry->free = free;
+		__entry->nscan = nscan;
+	),
+	TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->agno,
+		  __entry->streams,
+		  __entry->free,
+		  __entry->nscan)
+);
+
+DECLARE_EVENT_CLASS(xfs_lock_class,
+	TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
+		 unsigned long caller_ip),
+	TP_ARGS(ip,  lock_flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, lock_flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lock_flags = lock_flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
+		  (void *)__entry->caller_ip)
+)
+
+#define DEFINE_LOCK_EVENT(name) \
+DEFINE_EVENT(xfs_lock_class, name, \
+	TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
+		 unsigned long caller_ip), \
+	TP_ARGS(ip,  lock_flags, caller_ip))
+DEFINE_LOCK_EVENT(xfs_ilock);
+DEFINE_LOCK_EVENT(xfs_ilock_nowait);
+DEFINE_LOCK_EVENT(xfs_ilock_demote);
+DEFINE_LOCK_EVENT(xfs_iunlock);
+
+DECLARE_EVENT_CLASS(xfs_inode_class,
+	TP_PROTO(struct xfs_inode *ip),
+	TP_ARGS(ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+	),
+	TP_printk("dev %d:%d ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino)
+)
+
+#define DEFINE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_inode_class, name, \
+	TP_PROTO(struct xfs_inode *ip), \
+	TP_ARGS(ip))
+DEFINE_INODE_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_reclaim);
+DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
+
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_inactive_symlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
+DEFINE_INODE_EVENT(xfs_collapse_file_space);
+DEFINE_INODE_EVENT(xfs_insert_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_get_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_dir_fsync);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_evict_inode);
+DEFINE_INODE_EVENT(xfs_update_time);
+
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+
+DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
+
+DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+
+DECLARE_EVENT_CLASS(xfs_iref_class,
+	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
+	TP_ARGS(ip, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, count)
+		__field(int, pincount)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->count = atomic_read(&VFS_I(ip)->i_count);
+		__entry->pincount = atomic_read(&ip->i_pincount);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->count,
+		  __entry->pincount,
+		  (char *)__entry->caller_ip)
+)
+
+TRACE_EVENT(xfs_iomap_prealloc_size,
+	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift,
+		 unsigned int writeio_blocks),
+	TP_ARGS(ip, blocks, shift, writeio_blocks),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsblock_t, blocks)
+		__field(int, shift)
+		__field(unsigned int, writeio_blocks)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->blocks = blocks;
+		__entry->shift = shift;
+		__entry->writeio_blocks = writeio_blocks;
+	),
+	TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d "
+		  "m_writeio_blocks %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
+		  __entry->blocks, __entry->shift, __entry->writeio_blocks)
+)
+
+#define DEFINE_IREF_EVENT(name) \
+DEFINE_EVENT(xfs_iref_class, name, \
+	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
+	TP_ARGS(ip, caller_ip))
+DEFINE_IREF_EVENT(xfs_ihold);
+DEFINE_IREF_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+	TP_ARGS(dp, name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dp_ino)
+		__field(int, namelen)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(dp)->i_sb->s_dev;
+		__entry->dp_ino = dp->i_ino;
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d dp ino 0x%llx name %.*s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dp_ino,
+		  __entry->namelen,
+		  __get_str(name))
+)
+
+#define DEFINE_NAMESPACE_EVENT(name) \
+DEFINE_EVENT(xfs_namespace_class, name, \
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+	TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+
+TRACE_EVENT(xfs_rename,
+	TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+		 struct xfs_name *src_name, struct xfs_name *target_name),
+	TP_ARGS(src_dp, target_dp, src_name, target_name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_dp_ino)
+		__field(xfs_ino_t, target_dp_ino)
+		__field(int, src_namelen)
+		__field(int, target_namelen)
+		__dynamic_array(char, src_name, src_name->len)
+		__dynamic_array(char, target_name, target_name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+		__entry->src_dp_ino = src_dp->i_ino;
+		__entry->target_dp_ino = target_dp->i_ino;
+		__entry->src_namelen = src_name->len;
+		__entry->target_namelen = target_name->len;
+		memcpy(__get_str(src_name), src_name->name, src_name->len);
+		memcpy(__get_str(target_name), target_name->name,
+			target_name->len);
+	),
+	TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+		  " src name %.*s target name %.*s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->src_dp_ino,
+		  __entry->target_dp_ino,
+		  __entry->src_namelen,
+		  __get_str(src_name),
+		  __entry->target_namelen,
+		  __get_str(target_name))
+)
+
+DECLARE_EVENT_CLASS(xfs_dquot_class,
+	TP_PROTO(struct xfs_dquot *dqp),
+	TP_ARGS(dqp),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u32, id)
+		__field(unsigned, flags)
+		__field(unsigned, nrefs)
+		__field(unsigned long long, res_bcount)
+		__field(unsigned long long, bcount)
+		__field(unsigned long long, icount)
+		__field(unsigned long long, blk_hardlimit)
+		__field(unsigned long long, blk_softlimit)
+		__field(unsigned long long, ino_hardlimit)
+		__field(unsigned long long, ino_softlimit)
+	), \
+	TP_fast_assign(
+		__entry->dev = dqp->q_mount->m_super->s_dev;
+		__entry->id = be32_to_cpu(dqp->q_core.d_id);
+		__entry->flags = dqp->dq_flags;
+		__entry->nrefs = dqp->q_nrefs;
+		__entry->res_bcount = dqp->q_res_bcount;
+		__entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
+		__entry->icount = be64_to_cpu(dqp->q_core.d_icount);
+		__entry->blk_hardlimit =
+			be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+		__entry->blk_softlimit =
+			be64_to_cpu(dqp->q_core.d_blk_softlimit);
+		__entry->ino_hardlimit =
+			be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+		__entry->ino_softlimit =
+			be64_to_cpu(dqp->q_core.d_ino_softlimit);
+	),
+	TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
+		  "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+		  "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->id,
+		  __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
+		  __entry->nrefs,
+		  __entry->res_bcount,
+		  __entry->bcount,
+		  __entry->blk_hardlimit,
+		  __entry->blk_softlimit,
+		  __entry->icount,
+		  __entry->ino_hardlimit,
+		  __entry->ino_softlimit)
+)
+
+#define DEFINE_DQUOT_EVENT(name) \
+DEFINE_EVENT(xfs_dquot_class, name, \
+	TP_PROTO(struct xfs_dquot *dqp), \
+	TP_ARGS(dqp))
+DEFINE_DQUOT_EVENT(xfs_dqadjust);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
+DEFINE_DQUOT_EVENT(xfs_dqattach_found);
+DEFINE_DQUOT_EVENT(xfs_dqattach_get);
+DEFINE_DQUOT_EVENT(xfs_dqalloc);
+DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
+DEFINE_DQUOT_EVENT(xfs_dqread);
+DEFINE_DQUOT_EVENT(xfs_dqread_fail);
+DEFINE_DQUOT_EVENT(xfs_dqget_hit);
+DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
+DEFINE_DQUOT_EVENT(xfs_dqget_dup);
+DEFINE_DQUOT_EVENT(xfs_dqput);
+DEFINE_DQUOT_EVENT(xfs_dqput_wait);
+DEFINE_DQUOT_EVENT(xfs_dqput_free);
+DEFINE_DQUOT_EVENT(xfs_dqrele);
+DEFINE_DQUOT_EVENT(xfs_dqflush);
+DEFINE_DQUOT_EVENT(xfs_dqflush_force);
+DEFINE_DQUOT_EVENT(xfs_dqflush_done);
+
+DECLARE_EVENT_CLASS(xfs_loggrant_class,
+	TP_PROTO(struct xlog *log, struct xlog_ticket *tic),
+	TP_ARGS(log, tic),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned, trans_type)
+		__field(char, ocnt)
+		__field(char, cnt)
+		__field(int, curr_res)
+		__field(int, unit_res)
+		__field(unsigned int, flags)
+		__field(int, reserveq)
+		__field(int, writeq)
+		__field(int, grant_reserve_cycle)
+		__field(int, grant_reserve_bytes)
+		__field(int, grant_write_cycle)
+		__field(int, grant_write_bytes)
+		__field(int, curr_cycle)
+		__field(int, curr_block)
+		__field(xfs_lsn_t, tail_lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->trans_type = tic->t_trans_type;
+		__entry->ocnt = tic->t_ocnt;
+		__entry->cnt = tic->t_cnt;
+		__entry->curr_res = tic->t_curr_res;
+		__entry->unit_res = tic->t_unit_res;
+		__entry->flags = tic->t_flags;
+		__entry->reserveq = list_empty(&log->l_reserve_head.waiters);
+		__entry->writeq = list_empty(&log->l_write_head.waiters);
+		xlog_crack_grant_head(&log->l_reserve_head.grant,
+				&__entry->grant_reserve_cycle,
+				&__entry->grant_reserve_bytes);
+		xlog_crack_grant_head(&log->l_write_head.grant,
+				&__entry->grant_write_cycle,
+				&__entry->grant_write_bytes);
+		__entry->curr_cycle = log->l_curr_cycle;
+		__entry->curr_block = log->l_curr_block;
+		__entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
+	),
+	TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+		  "t_unit_res %u t_flags %s reserveq %s "
+		  "writeq %s grant_reserve_cycle %d "
+		  "grant_reserve_bytes %d grant_write_cycle %d "
+		  "grant_write_bytes %d curr_cycle %d curr_block %d "
+		  "tail_cycle %d tail_block %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
+		  __entry->ocnt,
+		  __entry->cnt,
+		  __entry->curr_res,
+		  __entry->unit_res,
+		  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
+		  __entry->reserveq ? "empty" : "active",
+		  __entry->writeq ? "empty" : "active",
+		  __entry->grant_reserve_cycle,
+		  __entry->grant_reserve_bytes,
+		  __entry->grant_write_cycle,
+		  __entry->grant_write_bytes,
+		  __entry->curr_cycle,
+		  __entry->curr_block,
+		  CYCLE_LSN(__entry->tail_lsn),
+		  BLOCK_LSN(__entry->tail_lsn)
+	)
+)
+
+#define DEFINE_LOGGRANT_EVENT(name) \
+DEFINE_EVENT(xfs_loggrant_class, name, \
+	TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
+	TP_ARGS(log, tic))
+DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
+DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
+DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+
+DECLARE_EVENT_CLASS(xfs_log_item_class,
+	TP_PROTO(struct xfs_log_item *lip),
+	TP_ARGS(lip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(void *, lip)
+		__field(uint, type)
+		__field(uint, flags)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = lip->li_mountp->m_super->s_dev;
+		__entry->lip = lip;
+		__entry->type = lip->li_type;
+		__entry->flags = lip->li_flags;
+		__entry->lsn = lip->li_lsn;
+	),
+	TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->lip,
+		  CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn),
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+TRACE_EVENT(xfs_log_force,
+	TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
+	TP_ARGS(mp, lsn),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->lsn = lsn;
+	),
+	TP_printk("dev %d:%d lsn 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->lsn)
+)
+
+#define DEFINE_LOG_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_log_item_class, name, \
+	TP_PROTO(struct xfs_log_item *lip), \
+	TP_ARGS(lip))
+DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+
+DECLARE_EVENT_CLASS(xfs_ail_class,
+	TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
+	TP_ARGS(lip, old_lsn, new_lsn),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(void *, lip)
+		__field(uint, type)
+		__field(uint, flags)
+		__field(xfs_lsn_t, old_lsn)
+		__field(xfs_lsn_t, new_lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = lip->li_mountp->m_super->s_dev;
+		__entry->lip = lip;
+		__entry->type = lip->li_type;
+		__entry->flags = lip->li_flags;
+		__entry->old_lsn = old_lsn;
+		__entry->new_lsn = new_lsn;
+	),
+	TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->lip,
+		  CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+		  CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_AIL_EVENT(name) \
+DEFINE_EVENT(xfs_ail_class, name, \
+	TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \
+	TP_ARGS(lip, old_lsn, new_lsn))
+DEFINE_AIL_EVENT(xfs_ail_insert);
+DEFINE_AIL_EVENT(xfs_ail_move);
+DEFINE_AIL_EVENT(xfs_ail_delete);
+
+TRACE_EVENT(xfs_log_assign_tail_lsn,
+	TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn),
+	TP_ARGS(log, new_lsn),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_lsn_t, new_lsn)
+		__field(xfs_lsn_t, old_lsn)
+		__field(xfs_lsn_t, last_sync_lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->new_lsn = new_lsn;
+		__entry->old_lsn = atomic64_read(&log->l_tail_lsn);
+		__entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+	),
+	TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+		  CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+		  CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn))
+)
+
+DECLARE_EVENT_CLASS(xfs_file_class,
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+	TP_ARGS(ip, count, offset, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
+		  "offset 0x%llx count 0x%zx ioflags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+)
+
+#define DEFINE_RW_EVENT(name)		\
+DEFINE_EVENT(xfs_file_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),	\
+	TP_ARGS(ip, count, offset, flags))
+DEFINE_RW_EVENT(xfs_file_read);
+DEFINE_RW_EVENT(xfs_file_buffered_write);
+DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_splice_read);
+
+DECLARE_EVENT_CLASS(xfs_page_class,
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+		 unsigned int len),
+	TP_ARGS(inode, page, off, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(pgoff_t, pgoff)
+		__field(loff_t, size)
+		__field(unsigned long, offset)
+		__field(unsigned int, length)
+		__field(int, delalloc)
+		__field(int, unwritten)
+	),
+	TP_fast_assign(
+		int delalloc = -1, unwritten = -1;
+
+		if (page_has_buffers(page))
+			xfs_count_page_state(page, &delalloc, &unwritten);
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = XFS_I(inode)->i_ino;
+		__entry->pgoff = page_offset(page);
+		__entry->size = i_size_read(inode);
+		__entry->offset = off;
+		__entry->length = len;
+		__entry->delalloc = delalloc;
+		__entry->unwritten = unwritten;
+	),
+	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+		  "length %x delalloc %d unwritten %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->pgoff,
+		  __entry->size,
+		  __entry->offset,
+		  __entry->length,
+		  __entry->delalloc,
+		  __entry->unwritten)
+)
+
+#define DEFINE_PAGE_EVENT(name)		\
+DEFINE_EVENT(xfs_page_class, name,	\
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+		 unsigned int len),	\
+	TP_ARGS(inode, page, off, len))
+DEFINE_PAGE_EVENT(xfs_writepage);
+DEFINE_PAGE_EVENT(xfs_releasepage);
+DEFINE_PAGE_EVENT(xfs_invalidatepage);
+
+DECLARE_EVENT_CLASS(xfs_imap_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+		 int type, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, offset, count, type, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, type)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->type = type;
+		__entry->startoff = irec ? irec->br_startoff : 0;
+		__entry->startblock = irec ? irec->br_startblock : 0;
+		__entry->blockcount = irec ? irec->br_blockcount : 0;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+		  "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_symbolic(__entry->type, XFS_IO_TYPES),
+		  __entry->startoff,
+		  (__int64_t)__entry->startblock,
+		  __entry->blockcount)
+)
+
+#define DEFINE_IOMAP_EVENT(name)	\
+DEFINE_EVENT(xfs_imap_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,	\
+		 int type, struct xfs_bmbt_irec *irec),		\
+	TP_ARGS(ip, offset, count, type, irec))
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
+
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+	TP_ARGS(ip, offset, count),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, isize)
+		__field(loff_t, disize)
+		__field(loff_t, offset)
+		__field(size_t, count)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->isize = VFS_I(ip)->i_size;
+		__entry->disize = ip->i_d.di_size;
+		__entry->offset = offset;
+		__entry->count = count;
+	),
+	TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
+		  "offset 0x%llx count %zd",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->isize,
+		  __entry->disize,
+		  __entry->offset,
+		  __entry->count)
+);
+
+#define DEFINE_SIMPLE_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_simple_io_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),	\
+	TP_ARGS(ip, offset, count))
+DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
+DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
+DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+
+DECLARE_EVENT_CLASS(xfs_itrunc_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
+	TP_ARGS(ip, new_size),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fsize_t, new_size)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = new_size;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size)
+)
+
+#define DEFINE_ITRUNC_EVENT(name) \
+DEFINE_EVENT(xfs_itrunc_class, name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
+	TP_ARGS(ip, new_size))
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
+
+TRACE_EVENT(xfs_pagecache_inval,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
+	TP_ARGS(ip, start, finish),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_off_t, start)
+		__field(xfs_off_t, finish)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->start = start;
+		__entry->finish = finish;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->start,
+		  __entry->finish)
+);
+
+TRACE_EVENT(xfs_bunmap,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
+		 int flags, unsigned long caller_ip),
+	TP_ARGS(ip, bno, len, flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fileoff_t, bno)
+		__field(xfs_filblks_t, len)
+		__field(unsigned long, caller_ip)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->bno = bno;
+		__entry->len = len;
+		__entry->caller_ip = caller_ip;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
+		  "flags %s caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->bno,
+		  __entry->len,
+		  __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
+		  (void *)__entry->caller_ip)
+
+);
+
+DECLARE_EVENT_CLASS(xfs_extent_busy_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, agbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len)
+);
+#define DEFINE_BUSY_EVENT(name) \
+DEFINE_EVENT(xfs_extent_busy_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_extlen_t len), \
+	TP_ARGS(mp, agno, agbno, len))
+DEFINE_BUSY_EVENT(xfs_extent_busy);
+DEFINE_BUSY_EVENT(xfs_extent_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_extent_busy_force);
+DEFINE_BUSY_EVENT(xfs_extent_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_extent_busy_clear);
+
+TRACE_EVENT(xfs_extent_busy_trim,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len,
+		 xfs_agblock_t tbno, xfs_extlen_t tlen),
+	TP_ARGS(mp, agno, agbno, len, tbno, tlen),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(xfs_agblock_t, tbno)
+		__field(xfs_extlen_t, tlen)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->tbno = tbno;
+		__entry->tlen = tlen;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->tbno,
+		  __entry->tlen)
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+	TP_PROTO(struct xfs_trans *trans),
+	TP_ARGS(trans),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->lsn = trans->t_commit_lsn;
+	),
+	TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
+		  __entry->lsn)
+);
+
+TRACE_EVENT(xfs_agf,
+	TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, agf, flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, flags)
+		__field(__u32, length)
+		__field(__u32, bno_root)
+		__field(__u32, cnt_root)
+		__field(__u32, bno_level)
+		__field(__u32, cnt_level)
+		__field(__u32, flfirst)
+		__field(__u32, fllast)
+		__field(__u32, flcount)
+		__field(__u32, freeblks)
+		__field(__u32, longest)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = be32_to_cpu(agf->agf_seqno),
+		__entry->flags = flags;
+		__entry->length = be32_to_cpu(agf->agf_length),
+		__entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
+		__entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
+		__entry->bno_level =
+				be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
+		__entry->cnt_level =
+				be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+		__entry->flfirst = be32_to_cpu(agf->agf_flfirst),
+		__entry->fllast = be32_to_cpu(agf->agf_fllast),
+		__entry->flcount = be32_to_cpu(agf->agf_flcount),
+		__entry->freeblks = be32_to_cpu(agf->agf_freeblks),
+		__entry->longest = be32_to_cpu(agf->agf_longest);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
+		  "levels b %u c %u flfirst %u fllast %u flcount %u "
+		  "freeblks %u longest %u caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
+		  __entry->length,
+		  __entry->bno_root,
+		  __entry->cnt_root,
+		  __entry->bno_level,
+		  __entry->cnt_level,
+		  __entry->flfirst,
+		  __entry->fllast,
+		  __entry->flcount,
+		  __entry->freeblks,
+		  __entry->longest,
+		  (void *)__entry->caller_ip)
+);
+
+TRACE_EVENT(xfs_free_extent,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+		 xfs_extlen_t len, bool isfl, int haveleft, int haveright),
+	TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(int, isfl)
+		__field(int, haveleft)
+		__field(int, haveright)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->isfl = isfl;
+		__entry->haveleft = haveleft;
+		__entry->haveright = haveright;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->isfl,
+		  __entry->haveleft ?
+			(__entry->haveright ? "both" : "left") :
+			(__entry->haveright ? "right" : "none"))
+
+);
+
+DECLARE_EVENT_CLASS(xfs_alloc_class,
+	TP_PROTO(struct xfs_alloc_arg *args),
+	TP_ARGS(args),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, minlen)
+		__field(xfs_extlen_t, maxlen)
+		__field(xfs_extlen_t, mod)
+		__field(xfs_extlen_t, prod)
+		__field(xfs_extlen_t, minleft)
+		__field(xfs_extlen_t, total)
+		__field(xfs_extlen_t, alignment)
+		__field(xfs_extlen_t, minalignslop)
+		__field(xfs_extlen_t, len)
+		__field(short, type)
+		__field(short, otype)
+		__field(char, wasdel)
+		__field(char, wasfromfl)
+		__field(char, isfl)
+		__field(char, userdata)
+		__field(xfs_fsblock_t, firstblock)
+	),
+	TP_fast_assign(
+		__entry->dev = args->mp->m_super->s_dev;
+		__entry->agno = args->agno;
+		__entry->agbno = args->agbno;
+		__entry->minlen = args->minlen;
+		__entry->maxlen = args->maxlen;
+		__entry->mod = args->mod;
+		__entry->prod = args->prod;
+		__entry->minleft = args->minleft;
+		__entry->total = args->total;
+		__entry->alignment = args->alignment;
+		__entry->minalignslop = args->minalignslop;
+		__entry->len = args->len;
+		__entry->type = args->type;
+		__entry->otype = args->otype;
+		__entry->wasdel = args->wasdel;
+		__entry->wasfromfl = args->wasfromfl;
+		__entry->isfl = args->isfl;
+		__entry->userdata = args->userdata;
+		__entry->firstblock = args->firstblock;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
+		  "prod %u minleft %u total %u alignment %u minalignslop %u "
+		  "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
+		  "userdata %d firstblock 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->minlen,
+		  __entry->maxlen,
+		  __entry->mod,
+		  __entry->prod,
+		  __entry->minleft,
+		  __entry->total,
+		  __entry->alignment,
+		  __entry->minalignslop,
+		  __entry->len,
+		  __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
+		  __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
+		  __entry->wasdel,
+		  __entry->wasfromfl,
+		  __entry->isfl,
+		  __entry->userdata,
+		  (unsigned long long)__entry->firstblock)
+)
+
+#define DEFINE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_alloc_class, name, \
+	TP_PROTO(struct xfs_alloc_arg *args), \
+	TP_ARGS(args))
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
+
+DECLARE_EVENT_CLASS(xfs_da_class,
+	TP_PROTO(struct xfs_da_args *args),
+	TP_ARGS(args),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__dynamic_array(char, name, args->namelen)
+		__field(int, namelen)
+		__field(xfs_dahash_t, hashval)
+		__field(xfs_ino_t, inumber)
+		__field(int, op_flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+		__entry->ino = args->dp->i_ino;
+		if (args->namelen)
+			memcpy(__get_str(name), args->name, args->namelen);
+		__entry->namelen = args->namelen;
+		__entry->hashval = args->hashval;
+		__entry->inumber = args->inumber;
+		__entry->op_flags = args->op_flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
+		  "inumber 0x%llx op_flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->namelen,
+		  __entry->namelen ? __get_str(name) : NULL,
+		  __entry->namelen,
+		  __entry->hashval,
+		  __entry->inumber,
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
+#define DEFINE_DIR2_EVENT(name) \
+DEFINE_EVENT(xfs_da_class, name, \
+	TP_PROTO(struct xfs_da_args *args), \
+	TP_ARGS(args))
+DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
+DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
+
+DECLARE_EVENT_CLASS(xfs_attr_class,
+	TP_PROTO(struct xfs_da_args *args),
+	TP_ARGS(args),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__dynamic_array(char, name, args->namelen)
+		__field(int, namelen)
+		__field(int, valuelen)
+		__field(xfs_dahash_t, hashval)
+		__field(int, op_flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+		__entry->ino = args->dp->i_ino;
+		if (args->namelen)
+			memcpy(__get_str(name), args->name, args->namelen);
+		__entry->namelen = args->namelen;
+		__entry->valuelen = args->valuelen;
+		__entry->hashval = args->hashval;
+		__entry->op_flags = args->op_flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
+		  "hashval 0x%x op_flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->namelen,
+		  __entry->namelen ? __get_str(name) : NULL,
+		  __entry->namelen,
+		  __entry->valuelen,
+		  __entry->hashval,
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
+#define DEFINE_ATTR_EVENT(name) \
+DEFINE_EVENT(xfs_attr_class, name, \
+	TP_PROTO(struct xfs_da_args *args), \
+	TP_ARGS(args))
+DEFINE_ATTR_EVENT(xfs_attr_sf_add);
+DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
+DEFINE_ATTR_EVENT(xfs_attr_sf_create);
+DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
+DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
+DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
+
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
+
+DEFINE_ATTR_EVENT(xfs_attr_node_addname);
+DEFINE_ATTR_EVENT(xfs_attr_node_get);
+DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_node_replace);
+DEFINE_ATTR_EVENT(xfs_attr_node_removename);
+
+DEFINE_ATTR_EVENT(xfs_attr_fillstate);
+DEFINE_ATTR_EVENT(xfs_attr_refillstate);
+
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
+
+#define DEFINE_DA_EVENT(name) \
+DEFINE_EVENT(xfs_da_class, name, \
+	TP_PROTO(struct xfs_da_args *args), \
+	TP_ARGS(args))
+DEFINE_DA_EVENT(xfs_da_split);
+DEFINE_DA_EVENT(xfs_da_join);
+DEFINE_DA_EVENT(xfs_da_link_before);
+DEFINE_DA_EVENT(xfs_da_link_after);
+DEFINE_DA_EVENT(xfs_da_unlink_back);
+DEFINE_DA_EVENT(xfs_da_unlink_forward);
+DEFINE_DA_EVENT(xfs_da_root_split);
+DEFINE_DA_EVENT(xfs_da_root_join);
+DEFINE_DA_EVENT(xfs_da_node_add);
+DEFINE_DA_EVENT(xfs_da_node_create);
+DEFINE_DA_EVENT(xfs_da_node_split);
+DEFINE_DA_EVENT(xfs_da_node_remove);
+DEFINE_DA_EVENT(xfs_da_node_rebalance);
+DEFINE_DA_EVENT(xfs_da_node_unbalance);
+DEFINE_DA_EVENT(xfs_da_node_toosmall);
+DEFINE_DA_EVENT(xfs_da_swap_lastblock);
+DEFINE_DA_EVENT(xfs_da_grow_inode);
+DEFINE_DA_EVENT(xfs_da_shrink_inode);
+DEFINE_DA_EVENT(xfs_da_fixhashpath);
+DEFINE_DA_EVENT(xfs_da_path_shift);
+
+DECLARE_EVENT_CLASS(xfs_dir2_space_class,
+	TP_PROTO(struct xfs_da_args *args, int idx),
+	TP_ARGS(args, idx),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, op_flags)
+		__field(int, idx)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+		__entry->ino = args->dp->i_ino;
+		__entry->op_flags = args->op_flags;
+		__entry->idx = idx;
+	),
+	TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+		  __entry->idx)
+)
+
+#define DEFINE_DIR2_SPACE_EVENT(name) \
+DEFINE_EVENT(xfs_dir2_space_class, name, \
+	TP_PROTO(struct xfs_da_args *args, int idx), \
+	TP_ARGS(args, idx))
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
+
+TRACE_EVENT(xfs_dir2_leafn_moveents,
+	TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
+	TP_ARGS(args, src_idx, dst_idx, count),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, op_flags)
+		__field(int, src_idx)
+		__field(int, dst_idx)
+		__field(int, count)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+		__entry->ino = args->dp->i_ino;
+		__entry->op_flags = args->op_flags;
+		__entry->src_idx = src_idx;
+		__entry->dst_idx = dst_idx;
+		__entry->count = count;
+	),
+	TP_printk("dev %d:%d ino 0x%llx op_flags %s "
+		  "src_idx %d dst_idx %d count %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+		  __entry->src_idx,
+		  __entry->dst_idx,
+		  __entry->count)
+);
+
+#define XFS_SWAPEXT_INODES \
+	{ 0,	"target" }, \
+	{ 1,	"temp" }
+
+#define XFS_INODE_FORMAT_STR \
+	{ 0,	"invalid" }, \
+	{ 1,	"local" }, \
+	{ 2,	"extent" }, \
+	{ 3,	"btree" }
+
+DECLARE_EVENT_CLASS(xfs_swap_extent_class,
+	TP_PROTO(struct xfs_inode *ip, int which),
+	TP_ARGS(ip, which),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(int, which)
+		__field(xfs_ino_t, ino)
+		__field(int, format)
+		__field(int, nex)
+		__field(int, broot_size)
+		__field(int, fork_off)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->which = which;
+		__entry->ino = ip->i_ino;
+		__entry->format = ip->i_d.di_format;
+		__entry->nex = ip->i_d.di_nextents;
+		__entry->broot_size = ip->i_df.if_broot_bytes;
+		__entry->fork_off = XFS_IFORK_BOFF(ip);
+	),
+	TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+		  "broot size %d, fork offset %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
+		  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+		  __entry->nex,
+		  __entry->broot_size,
+		  __entry->fork_off)
+)
+
+#define DEFINE_SWAPEXT_EVENT(name) \
+DEFINE_EVENT(xfs_swap_extent_class, name, \
+	TP_PROTO(struct xfs_inode *ip, int which), \
+	TP_ARGS(ip, which))
+
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
+	TP_PROTO(struct xlog *log, struct xlog_recover *trans,
+		struct xlog_recover_item *item, int pass),
+	TP_ARGS(log, trans, item, pass),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, item)
+		__field(xlog_tid_t, tid)
+		__field(int, type)
+		__field(int, pass)
+		__field(int, count)
+		__field(int, total)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->item = (unsigned long)item;
+		__entry->tid = trans->r_log_tid;
+		__entry->type = ITEM_TYPE(item);
+		__entry->pass = pass;
+		__entry->count = item->ri_cnt;
+		__entry->total = item->ri_total;
+	),
+	TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
+		  "item region count/total %d/%d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tid,
+		  __entry->pass,
+		  (void *)__entry->item,
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __entry->count,
+		  __entry->total)
+)
+
+#define DEFINE_LOG_RECOVER_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_item_class, name, \
+	TP_PROTO(struct xlog *log, struct xlog_recover *trans, \
+		struct xlog_recover_item *item, int pass), \
+	TP_ARGS(log, trans, item, pass))
+
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
+	TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f),
+	TP_ARGS(log, buf_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(__int64_t, blkno)
+		__field(unsigned short, len)
+		__field(unsigned short, flags)
+		__field(unsigned short, size)
+		__field(unsigned int, map_size)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->blkno = buf_f->blf_blkno;
+		__entry->len = buf_f->blf_len;
+		__entry->flags = buf_f->blf_flags;
+		__entry->size = buf_f->blf_size;
+		__entry->map_size = buf_f->blf_map_size;
+	),
+	TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+			"map_size %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->blkno,
+		  __entry->len,
+		  __entry->flags,
+		  __entry->size,
+		  __entry->map_size)
+)
+
+#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
+	TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \
+	TP_ARGS(log, buf_f))
+
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
+	TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f),
+	TP_ARGS(log, in_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned short, size)
+		__field(int, fields)
+		__field(unsigned short, asize)
+		__field(unsigned short, dsize)
+		__field(__int64_t, blkno)
+		__field(int, len)
+		__field(int, boffset)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->ino = in_f->ilf_ino;
+		__entry->size = in_f->ilf_size;
+		__entry->fields = in_f->ilf_fields;
+		__entry->asize = in_f->ilf_asize;
+		__entry->dsize = in_f->ilf_dsize;
+		__entry->blkno = in_f->ilf_blkno;
+		__entry->len = in_f->ilf_len;
+		__entry->boffset = in_f->ilf_boffset;
+	),
+	TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
+			"dsize %d, blkno 0x%llx, len %d, boffset %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->fields,
+		  __entry->asize,
+		  __entry->dsize,
+		  __entry->blkno,
+		  __entry->len,
+		  __entry->boffset)
+)
+#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
+	TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \
+	TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+
+DECLARE_EVENT_CLASS(xfs_discard_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, agbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len)
+)
+
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_extlen_t len), \
+	TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
+
+#endif /* _TRACE_XFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE xfs_trace
+#include <trace/define_trace.h>
diff --git a/kernel/fs/xfs/xfs_trans.c b/kernel/fs/xfs/xfs_trans.c
new file mode 100644
index 000000000..220ef2c90
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans.c
@@ -0,0 +1,1105 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_extent_busy.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+kmem_zone_t	*xfs_trans_zone;
+kmem_zone_t	*xfs_log_item_desc_zone;
+
+/*
+ * Initialize the precomputed transaction reservation values
+ * in the mount structure.
+ */
+void
+xfs_trans_init(
+	struct xfs_mount	*mp)
+{
+	xfs_trans_resv_calc(mp, M_RES(mp));
+}
+
+/*
+ * This routine is called to allocate a transaction structure.
+ * The type parameter indicates the type of the transaction.  These
+ * are enumerated in xfs_trans.h.
+ *
+ * Dynamically allocate the transaction structure from the transaction
+ * zone, initialize it, and return it to the caller.
+ */
+xfs_trans_t *
+xfs_trans_alloc(
+	xfs_mount_t	*mp,
+	uint		type)
+{
+	xfs_trans_t     *tp;
+
+	sb_start_intwrite(mp->m_super);
+	tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
+	tp->t_flags |= XFS_TRANS_FREEZE_PROT;
+	return tp;
+}
+
+xfs_trans_t *
+_xfs_trans_alloc(
+	xfs_mount_t	*mp,
+	uint		type,
+	xfs_km_flags_t	memflags)
+{
+	xfs_trans_t	*tp;
+
+	WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+	atomic_inc(&mp->m_active_trans);
+
+	tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
+	tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+	tp->t_type = type;
+	tp->t_mountp = mp;
+	INIT_LIST_HEAD(&tp->t_items);
+	INIT_LIST_HEAD(&tp->t_busy);
+	return tp;
+}
+
+/*
+ * Free the transaction structure.  If there is more clean up
+ * to do when the structure is freed, add it here.
+ */
+STATIC void
+xfs_trans_free(
+	struct xfs_trans	*tp)
+{
+	xfs_extent_busy_sort(&tp->t_busy);
+	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
+
+	atomic_dec(&tp->t_mountp->m_active_trans);
+	if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+		sb_end_intwrite(tp->t_mountp->m_super);
+	xfs_trans_free_dqinfo(tp);
+	kmem_zone_free(xfs_trans_zone, tp);
+}
+
+/*
+ * This is called to create a new transaction which will share the
+ * permanent log reservation of the given transaction.  The remaining
+ * unused block and rt extent reservations are also inherited.  This
+ * implies that the original transaction is no longer allowed to allocate
+ * blocks.  Locks and log items, however, are no inherited.  They must
+ * be added to the new transaction explicitly.
+ */
+xfs_trans_t *
+xfs_trans_dup(
+	xfs_trans_t	*tp)
+{
+	xfs_trans_t	*ntp;
+
+	ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+
+	/*
+	 * Initialize the new transaction structure.
+	 */
+	ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
+	ntp->t_type = tp->t_type;
+	ntp->t_mountp = tp->t_mountp;
+	INIT_LIST_HEAD(&ntp->t_items);
+	INIT_LIST_HEAD(&ntp->t_busy);
+
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+	ASSERT(tp->t_ticket != NULL);
+
+	ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
+		       (tp->t_flags & XFS_TRANS_RESERVE) |
+		       (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+	/* We gave our writer reference to the new transaction */
+	tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
+	ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
+	ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
+	tp->t_blk_res = tp->t_blk_res_used;
+	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
+	tp->t_rtx_res = tp->t_rtx_res_used;
+	ntp->t_pflags = tp->t_pflags;
+
+	xfs_trans_dup_dqinfo(tp, ntp);
+
+	atomic_inc(&tp->t_mountp->m_active_trans);
+	return ntp;
+}
+
+/*
+ * This is called to reserve free disk blocks and log space for the
+ * given transaction.  This must be done before allocating any resources
+ * within the transaction.
+ *
+ * This will return ENOSPC if there are not enough blocks available.
+ * It will sleep waiting for available log space.
+ * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which
+ * is used by long running transactions.  If any one of the reservations
+ * fails then they will all be backed out.
+ *
+ * This does not do quota reservations. That typically is done by the
+ * caller afterwards.
+ */
+int
+xfs_trans_reserve(
+	struct xfs_trans	*tp,
+	struct xfs_trans_res	*resp,
+	uint			blocks,
+	uint			rtextents)
+{
+	int		error = 0;
+	bool		rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+	/* Mark this thread as being in a transaction */
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	/*
+	 * Attempt to reserve the needed disk blocks by decrementing
+	 * the number needed from the number available.  This will
+	 * fail if the count would go below zero.
+	 */
+	if (blocks > 0) {
+		error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+		if (error != 0) {
+			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+			return -ENOSPC;
+		}
+		tp->t_blk_res += blocks;
+	}
+
+	/*
+	 * Reserve the log space needed for this transaction.
+	 */
+	if (resp->tr_logres > 0) {
+		bool	permanent = false;
+
+		ASSERT(tp->t_log_res == 0 ||
+		       tp->t_log_res == resp->tr_logres);
+		ASSERT(tp->t_log_count == 0 ||
+		       tp->t_log_count == resp->tr_logcount);
+
+		if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
+			tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+			permanent = true;
+		} else {
+			ASSERT(tp->t_ticket == NULL);
+			ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+		}
+
+		if (tp->t_ticket != NULL) {
+			ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
+			error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+		} else {
+			error = xfs_log_reserve(tp->t_mountp,
+						resp->tr_logres,
+						resp->tr_logcount,
+						&tp->t_ticket, XFS_TRANSACTION,
+						permanent, tp->t_type);
+		}
+
+		if (error)
+			goto undo_blocks;
+
+		tp->t_log_res = resp->tr_logres;
+		tp->t_log_count = resp->tr_logcount;
+	}
+
+	/*
+	 * Attempt to reserve the needed realtime extents by decrementing
+	 * the number needed from the number available.  This will
+	 * fail if the count would go below zero.
+	 */
+	if (rtextents > 0) {
+		error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
+		if (error) {
+			error = -ENOSPC;
+			goto undo_log;
+		}
+		tp->t_rtx_res += rtextents;
+	}
+
+	return 0;
+
+	/*
+	 * Error cases jump to one of these labels to undo any
+	 * reservations which have already been performed.
+	 */
+undo_log:
+	if (resp->tr_logres > 0) {
+		int		log_flags;
+
+		if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
+			log_flags = XFS_LOG_REL_PERM_RESERV;
+		} else {
+			log_flags = 0;
+		}
+		xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+		tp->t_ticket = NULL;
+		tp->t_log_res = 0;
+		tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
+	}
+
+undo_blocks:
+	if (blocks > 0) {
+		xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+		tp->t_blk_res = 0;
+	}
+
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	return error;
+}
+
+/*
+ * Record the indicated change to the given field for application
+ * to the file system's superblock when the transaction commits.
+ * For now, just store the change in the transaction structure.
+ *
+ * Mark the transaction structure to indicate that the superblock
+ * needs to be updated before committing.
+ *
+ * Because we may not be keeping track of allocated/free inodes and
+ * used filesystem blocks in the superblock, we do not mark the
+ * superblock dirty in this transaction if we modify these fields.
+ * We still need to update the transaction deltas so that they get
+ * applied to the incore superblock, but we don't want them to
+ * cause the superblock to get locked and logged if these are the
+ * only fields in the superblock that the transaction modifies.
+ */
+void
+xfs_trans_mod_sb(
+	xfs_trans_t	*tp,
+	uint		field,
+	int64_t		delta)
+{
+	uint32_t	flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY);
+	xfs_mount_t	*mp = tp->t_mountp;
+
+	switch (field) {
+	case XFS_TRANS_SB_ICOUNT:
+		tp->t_icount_delta += delta;
+		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+			flags &= ~XFS_TRANS_SB_DIRTY;
+		break;
+	case XFS_TRANS_SB_IFREE:
+		tp->t_ifree_delta += delta;
+		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+			flags &= ~XFS_TRANS_SB_DIRTY;
+		break;
+	case XFS_TRANS_SB_FDBLOCKS:
+		/*
+		 * Track the number of blocks allocated in the
+		 * transaction.  Make sure it does not exceed the
+		 * number reserved.
+		 */
+		if (delta < 0) {
+			tp->t_blk_res_used += (uint)-delta;
+			ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
+		}
+		tp->t_fdblocks_delta += delta;
+		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+			flags &= ~XFS_TRANS_SB_DIRTY;
+		break;
+	case XFS_TRANS_SB_RES_FDBLOCKS:
+		/*
+		 * The allocation has already been applied to the
+		 * in-core superblock's counter.  This should only
+		 * be applied to the on-disk superblock.
+		 */
+		ASSERT(delta < 0);
+		tp->t_res_fdblocks_delta += delta;
+		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+			flags &= ~XFS_TRANS_SB_DIRTY;
+		break;
+	case XFS_TRANS_SB_FREXTENTS:
+		/*
+		 * Track the number of blocks allocated in the
+		 * transaction.  Make sure it does not exceed the
+		 * number reserved.
+		 */
+		if (delta < 0) {
+			tp->t_rtx_res_used += (uint)-delta;
+			ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
+		}
+		tp->t_frextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_RES_FREXTENTS:
+		/*
+		 * The allocation has already been applied to the
+		 * in-core superblock's counter.  This should only
+		 * be applied to the on-disk superblock.
+		 */
+		ASSERT(delta < 0);
+		tp->t_res_frextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_DBLOCKS:
+		ASSERT(delta > 0);
+		tp->t_dblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_AGCOUNT:
+		ASSERT(delta > 0);
+		tp->t_agcount_delta += delta;
+		break;
+	case XFS_TRANS_SB_IMAXPCT:
+		tp->t_imaxpct_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTSIZE:
+		tp->t_rextsize_delta += delta;
+		break;
+	case XFS_TRANS_SB_RBMBLOCKS:
+		tp->t_rbmblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_RBLOCKS:
+		tp->t_rblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTENTS:
+		tp->t_rextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTSLOG:
+		tp->t_rextslog_delta += delta;
+		break;
+	default:
+		ASSERT(0);
+		return;
+	}
+
+	tp->t_flags |= flags;
+}
+
+/*
+ * xfs_trans_apply_sb_deltas() is called from the commit code
+ * to bring the superblock buffer into the current transaction
+ * and modify it as requested by earlier calls to xfs_trans_mod_sb().
+ *
+ * For now we just look at each field allowed to change and change
+ * it if necessary.
+ */
+STATIC void
+xfs_trans_apply_sb_deltas(
+	xfs_trans_t	*tp)
+{
+	xfs_dsb_t	*sbp;
+	xfs_buf_t	*bp;
+	int		whole = 0;
+
+	bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+	sbp = XFS_BUF_TO_SBP(bp);
+
+	/*
+	 * Check that superblock mods match the mods made to AGF counters.
+	 */
+	ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
+	       (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
+		tp->t_ag_btree_delta));
+
+	/*
+	 * Only update the superblock counters if we are logging them
+	 */
+	if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) {
+		if (tp->t_icount_delta)
+			be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
+		if (tp->t_ifree_delta)
+			be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta);
+		if (tp->t_fdblocks_delta)
+			be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta);
+		if (tp->t_res_fdblocks_delta)
+			be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
+	}
+
+	if (tp->t_frextents_delta)
+		be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
+	if (tp->t_res_frextents_delta)
+		be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+
+	if (tp->t_dblocks_delta) {
+		be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_agcount_delta) {
+		be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta);
+		whole = 1;
+	}
+	if (tp->t_imaxpct_delta) {
+		sbp->sb_imax_pct += tp->t_imaxpct_delta;
+		whole = 1;
+	}
+	if (tp->t_rextsize_delta) {
+		be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta);
+		whole = 1;
+	}
+	if (tp->t_rbmblocks_delta) {
+		be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_rblocks_delta) {
+		be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_rextents_delta) {
+		be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta);
+		whole = 1;
+	}
+	if (tp->t_rextslog_delta) {
+		sbp->sb_rextslog += tp->t_rextslog_delta;
+		whole = 1;
+	}
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+	if (whole)
+		/*
+		 * Log the whole thing, the fields are noncontiguous.
+		 */
+		xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1);
+	else
+		/*
+		 * Since all the modifiable fields are contiguous, we
+		 * can get away with this.
+		 */
+		xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
+				  offsetof(xfs_dsb_t, sb_frextents) +
+				  sizeof(sbp->sb_frextents) - 1);
+}
+
+STATIC int
+xfs_sb_mod8(
+	uint8_t			*field,
+	int8_t			delta)
+{
+	int8_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+STATIC int
+xfs_sb_mod32(
+	uint32_t		*field,
+	int32_t			delta)
+{
+	int32_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+STATIC int
+xfs_sb_mod64(
+	uint64_t		*field,
+	int64_t			delta)
+{
+	int64_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+/*
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
+ * and apply superblock counter changes to the in-core superblock.  The
+ * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT
+ * applied to the in-core superblock.  The idea is that that has already been
+ * done.
+ *
+ * If we are not logging superblock counters, then the inode allocated/free and
+ * used block counts are not updated in the on disk superblock. In this case,
+ * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
+ * still need to update the incore superblock with the changes.
+ */
+void
+xfs_trans_unreserve_and_mod_sb(
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	bool			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	int64_t			blkdelta = 0;
+	int64_t			rtxdelta = 0;
+	int64_t			idelta = 0;
+	int64_t			ifreedelta = 0;
+	int			error;
+
+	/* calculate deltas */
+	if (tp->t_blk_res > 0)
+		blkdelta = tp->t_blk_res;
+	if ((tp->t_fdblocks_delta != 0) &&
+	    (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+	     (tp->t_flags & XFS_TRANS_SB_DIRTY)))
+	        blkdelta += tp->t_fdblocks_delta;
+
+	if (tp->t_rtx_res > 0)
+		rtxdelta = tp->t_rtx_res;
+	if ((tp->t_frextents_delta != 0) &&
+	    (tp->t_flags & XFS_TRANS_SB_DIRTY))
+		rtxdelta += tp->t_frextents_delta;
+
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+	     (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+		idelta = tp->t_icount_delta;
+		ifreedelta = tp->t_ifree_delta;
+	}
+
+	/* apply the per-cpu counters */
+	if (blkdelta) {
+		error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
+		if (error)
+			goto out;
+	}
+
+	if (idelta) {
+		error = xfs_mod_icount(mp, idelta);
+		if (error)
+			goto out_undo_fdblocks;
+	}
+
+	if (ifreedelta) {
+		error = xfs_mod_ifree(mp, ifreedelta);
+		if (error)
+			goto out_undo_icount;
+	}
+
+	if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+		return;
+
+	/* apply remaining deltas */
+	spin_lock(&mp->m_sb_lock);
+	if (rtxdelta) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
+		if (error)
+			goto out_undo_ifree;
+	}
+
+	if (tp->t_dblocks_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
+		if (error)
+			goto out_undo_frextents;
+	}
+	if (tp->t_agcount_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
+		if (error)
+			goto out_undo_dblocks;
+	}
+	if (tp->t_imaxpct_delta != 0) {
+		error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
+		if (error)
+			goto out_undo_agcount;
+	}
+	if (tp->t_rextsize_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
+				     tp->t_rextsize_delta);
+		if (error)
+			goto out_undo_imaxpct;
+	}
+	if (tp->t_rbmblocks_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
+				     tp->t_rbmblocks_delta);
+		if (error)
+			goto out_undo_rextsize;
+	}
+	if (tp->t_rblocks_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
+		if (error)
+			goto out_undo_rbmblocks;
+	}
+	if (tp->t_rextents_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
+				     tp->t_rextents_delta);
+		if (error)
+			goto out_undo_rblocks;
+	}
+	if (tp->t_rextslog_delta != 0) {
+		error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
+				     tp->t_rextslog_delta);
+		if (error)
+			goto out_undo_rextents;
+	}
+	spin_unlock(&mp->m_sb_lock);
+	return;
+
+out_undo_rextents:
+	if (tp->t_rextents_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
+out_undo_rblocks:
+	if (tp->t_rblocks_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
+out_undo_rbmblocks:
+	if (tp->t_rbmblocks_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
+out_undo_rextsize:
+	if (tp->t_rextsize_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
+out_undo_imaxpct:
+	if (tp->t_rextsize_delta)
+		xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
+out_undo_agcount:
+	if (tp->t_agcount_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
+out_undo_dblocks:
+	if (tp->t_dblocks_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
+out_undo_frextents:
+	if (rtxdelta)
+		xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
+out_undo_ifree:
+	spin_unlock(&mp->m_sb_lock);
+	if (ifreedelta)
+		xfs_mod_ifree(mp, -ifreedelta);
+out_undo_icount:
+	if (idelta)
+		xfs_mod_icount(mp, -idelta);
+out_undo_fdblocks:
+	if (blkdelta)
+		xfs_mod_fdblocks(mp, -blkdelta, rsvd);
+out:
+	ASSERT(error == 0);
+	return;
+}
+
+/*
+ * Add the given log item to the transaction's list of log items.
+ *
+ * The log item will now point to its new descriptor with its li_desc field.
+ */
+void
+xfs_trans_add_item(
+	struct xfs_trans	*tp,
+	struct xfs_log_item	*lip)
+{
+	struct xfs_log_item_desc *lidp;
+
+	ASSERT(lip->li_mountp == tp->t_mountp);
+	ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
+
+	lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
+
+	lidp->lid_item = lip;
+	lidp->lid_flags = 0;
+	list_add_tail(&lidp->lid_trans, &tp->t_items);
+
+	lip->li_desc = lidp;
+}
+
+STATIC void
+xfs_trans_free_item_desc(
+	struct xfs_log_item_desc *lidp)
+{
+	list_del_init(&lidp->lid_trans);
+	kmem_zone_free(xfs_log_item_desc_zone, lidp);
+}
+
+/*
+ * Unlink and free the given descriptor.
+ */
+void
+xfs_trans_del_item(
+	struct xfs_log_item	*lip)
+{
+	xfs_trans_free_item_desc(lip->li_desc);
+	lip->li_desc = NULL;
+}
+
+/*
+ * Unlock all of the items of a transaction and free all the descriptors
+ * of that transaction.
+ */
+void
+xfs_trans_free_items(
+	struct xfs_trans	*tp,
+	xfs_lsn_t		commit_lsn,
+	int			flags)
+{
+	struct xfs_log_item_desc *lidp, *next;
+
+	list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+		struct xfs_log_item	*lip = lidp->lid_item;
+
+		lip->li_desc = NULL;
+
+		if (commit_lsn != NULLCOMMITLSN)
+			lip->li_ops->iop_committing(lip, commit_lsn);
+		if (flags & XFS_TRANS_ABORT)
+			lip->li_flags |= XFS_LI_ABORTED;
+		lip->li_ops->iop_unlock(lip);
+
+		xfs_trans_free_item_desc(lidp);
+	}
+}
+
+static inline void
+xfs_log_item_batch_insert(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
+	struct xfs_log_item	**log_items,
+	int			nr_items,
+	xfs_lsn_t		commit_lsn)
+{
+	int	i;
+
+	spin_lock(&ailp->xa_lock);
+	/* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+	xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
+
+	for (i = 0; i < nr_items; i++) {
+		struct xfs_log_item *lip = log_items[i];
+
+		lip->li_ops->iop_unpin(lip, 0);
+	}
+}
+
+/*
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
+ * items to insert into the AIL. This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through iop_commited and iop_unlock, which
+ * means that checkpoint commit abort handling is treated exactly the same
+ * as an iclog write error even though we haven't started any IO yet. Hence in
+ * this case all we need to do is iop_committed processing, followed by an
+ * iop_unpin(aborted) call.
+ *
+ * The AIL cursor is used to optimise the insert process. If commit_lsn is not
+ * at the end of the AIL, the insert cursor avoids the need to walk
+ * the AIL to find the insertion point on every xfs_log_item_batch_insert()
+ * call. This saves a lot of needless list walking and is a net win, even
+ * though it slightly increases that amount of AIL lock traffic to set it up
+ * and tear it down.
+ */
+void
+xfs_trans_committed_bulk(
+	struct xfs_ail		*ailp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		commit_lsn,
+	int			aborted)
+{
+#define LOG_ITEM_BATCH_SIZE	32
+	struct xfs_log_item	*log_items[LOG_ITEM_BATCH_SIZE];
+	struct xfs_log_vec	*lv;
+	struct xfs_ail_cursor	cur;
+	int			i = 0;
+
+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn);
+	spin_unlock(&ailp->xa_lock);
+
+	/* unpin all the log items */
+	for (lv = log_vector; lv; lv = lv->lv_next ) {
+		struct xfs_log_item	*lip = lv->lv_item;
+		xfs_lsn_t		item_lsn;
+
+		if (aborted)
+			lip->li_flags |= XFS_LI_ABORTED;
+		item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+
+		/* item_lsn of -1 means the item needs no further processing */
+		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+			continue;
+
+		/*
+		 * if we are aborting the operation, no point in inserting the
+		 * object into the AIL as we are in a shutdown situation.
+		 */
+		if (aborted) {
+			ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
+			lip->li_ops->iop_unpin(lip, 1);
+			continue;
+		}
+
+		if (item_lsn != commit_lsn) {
+
+			/*
+			 * Not a bulk update option due to unusual item_lsn.
+			 * Push into AIL immediately, rechecking the lsn once
+			 * we have the ail lock. Then unpin the item. This does
+			 * not affect the AIL cursor the bulk insert path is
+			 * using.
+			 */
+			spin_lock(&ailp->xa_lock);
+			if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+				xfs_trans_ail_update(ailp, lip, item_lsn);
+			else
+				spin_unlock(&ailp->xa_lock);
+			lip->li_ops->iop_unpin(lip, 0);
+			continue;
+		}
+
+		/* Item is a candidate for bulk AIL insert.  */
+		log_items[i++] = lv->lv_item;
+		if (i >= LOG_ITEM_BATCH_SIZE) {
+			xfs_log_item_batch_insert(ailp, &cur, log_items,
+					LOG_ITEM_BATCH_SIZE, commit_lsn);
+			i = 0;
+		}
+	}
+
+	/* make sure we insert the remainder! */
+	if (i)
+		xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
+
+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+}
+
+/*
+ * Commit the given transaction to the log.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
+ */
+int
+xfs_trans_commit(
+	struct xfs_trans	*tp,
+	uint			flags)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_lsn_t		commit_lsn = -1;
+	int			error = 0;
+	int			log_flags = 0;
+	int			sync = tp->t_flags & XFS_TRANS_SYNC;
+
+	/*
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
+	 */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	}
+
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+	if (!(tp->t_flags & XFS_TRANS_DIRTY))
+		goto out_unreserve;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = -EIO;
+		goto out_unreserve;
+	}
+
+	ASSERT(tp->t_ticket != NULL);
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+		xfs_trans_apply_sb_deltas(tp);
+	xfs_trans_apply_dquot_deltas(tp);
+
+	xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
+
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free(tp);
+
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.
+	 */
+	if (sync) {
+		error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
+		XFS_STATS_INC(xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xs_trans_async);
+	}
+
+	return error;
+
+out_unreserve:
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/*
+	 * It is indeed possible for the transaction to be not dirty but
+	 * the dqinfo portion to be.  All that means is that we have some
+	 * (non-persistent) quota reservations that need to be unreserved.
+	 */
+	xfs_trans_unreserve_and_mod_dquots(tp);
+	if (tp->t_ticket) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+		if (commit_lsn == -1 && !error)
+			error = -EIO;
+	}
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+	xfs_trans_free(tp);
+
+	XFS_STATS_INC(xs_trans_empty);
+	return error;
+}
+
+/*
+ * Unlock all of the transaction's items and free the transaction.
+ * The transaction must not have modified any of its items, because
+ * there is no way to restore them to their previous state.
+ *
+ * If the transaction has made a log reservation, make sure to release
+ * it as well.
+ */
+void
+xfs_trans_cancel(
+	xfs_trans_t		*tp,
+	int			flags)
+{
+	int			log_flags;
+	xfs_mount_t		*mp = tp->t_mountp;
+
+	/*
+	 * See if the caller is being too lazy to figure out if
+	 * the transaction really needs an abort.
+	 */
+	if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
+		flags &= ~XFS_TRANS_ABORT;
+	/*
+	 * See if the caller is relying on us to shut down the
+	 * filesystem.  This happens in paths where we detect
+	 * corruption and decide to give up.
+	 */
+	if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
+		XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+	}
+#ifdef DEBUG
+	if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+		struct xfs_log_item_desc *lidp;
+
+		list_for_each_entry(lidp, &tp->t_items, lid_trans)
+			ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
+	}
+#endif
+	xfs_trans_unreserve_and_mod_sb(tp);
+	xfs_trans_unreserve_and_mod_dquots(tp);
+
+	if (tp->t_ticket) {
+		if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+			ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+			log_flags = XFS_LOG_REL_PERM_RESERV;
+		} else {
+			log_flags = 0;
+		}
+		xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	}
+
+	/* mark this thread as no longer being in a transaction */
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+	xfs_trans_free(tp);
+}
+
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to
+ * the next: permanent transactions are only flushed out when
+ * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * as possible to let chunks of it go to the log. So we commit the
+ * chunk we've been working on and get a new transaction to continue.
+ */
+int
+xfs_trans_roll(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_trans	*trans;
+	struct xfs_trans_res	tres;
+	int			error;
+
+	/*
+	 * Ensure that the inode is always logged.
+	 */
+	trans = *tpp;
+	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+
+	/*
+	 * Copy the critical parameters from one trans to the next.
+	 */
+	tres.tr_logres = trans->t_log_res;
+	tres.tr_logcount = trans->t_log_count;
+	*tpp = xfs_trans_dup(trans);
+
+	/*
+	 * Commit the current transaction.
+	 * If this commit failed, then it'd just unlock those items that
+	 * are not marked ihold. That also means that a filesystem shutdown
+	 * is in progress. The caller takes the responsibility to cancel
+	 * the duplicate transaction that gets returned.
+	 */
+	error = xfs_trans_commit(trans, 0);
+	if (error)
+		return error;
+
+	trans = *tpp;
+
+	/*
+	 * transaction commit worked ok so we can drop the extra ticket
+	 * reference that we gained in xfs_trans_dup()
+	 */
+	xfs_log_ticket_put(trans->t_ticket);
+
+
+	/*
+	 * Reserve space in the log for th next transaction.
+	 * This also pushes items in the "AIL", the list of logged items,
+	 * out to disk if they are taking up space at the tail of the log
+	 * that we want to use.  This requires that either nothing be locked
+	 * across this call, or that anything that is locked be logged in
+	 * the prior and the next transactions.
+	 */
+	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+	error = xfs_trans_reserve(trans, &tres, 0, 0);
+	/*
+	 *  Ensure that the inode is in the new transaction and locked.
+	 */
+	if (error)
+		return error;
+
+	xfs_trans_ijoin(trans, dp, 0);
+	return 0;
+}
diff --git a/kernel/fs/xfs/xfs_trans.h b/kernel/fs/xfs/xfs_trans.h
new file mode 100644
index 000000000..b5bc1ab3c
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_TRANS_H__
+#define	__XFS_TRANS_H__
+
+/* kernel only transaction subsystem defines */
+
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_trans_res;
+struct xfs_dquot_acct;
+struct xfs_busy_extent;
+
+typedef struct xfs_log_item {
+	struct list_head		li_ail;		/* AIL pointers */
+	xfs_lsn_t			li_lsn;		/* last on-disk lsn */
+	struct xfs_log_item_desc	*li_desc;	/* ptr to current desc*/
+	struct xfs_mount		*li_mountp;	/* ptr to fs mount */
+	struct xfs_ail			*li_ailp;	/* ptr to AIL */
+	uint				li_type;	/* item type */
+	uint				li_flags;	/* misc flags */
+	struct xfs_log_item		*li_bio_list;	/* buffer item list */
+	void				(*li_cb)(struct xfs_buf *,
+						 struct xfs_log_item *);
+							/* buffer item iodone */
+							/* callback func */
+	const struct xfs_item_ops	*li_ops;	/* function list */
+
+	/* delayed logging */
+	struct list_head		li_cil;		/* CIL pointers */
+	struct xfs_log_vec		*li_lv;		/* active log vector */
+	xfs_lsn_t			li_seq;		/* CIL commit seq */
+} xfs_log_item_t;
+
+#define	XFS_LI_IN_AIL	0x1
+#define XFS_LI_ABORTED	0x2
+
+#define XFS_LI_FLAGS \
+	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
+	{ XFS_LI_ABORTED,	"ABORTED" }
+
+struct xfs_item_ops {
+	void (*iop_size)(xfs_log_item_t *, int *, int *);
+	void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
+	void (*iop_pin)(xfs_log_item_t *);
+	void (*iop_unpin)(xfs_log_item_t *, int remove);
+	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+	void (*iop_unlock)(xfs_log_item_t *);
+	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+};
+
+void	xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
+			  int type, const struct xfs_item_ops *ops);
+
+/*
+ * Return values for the iop_push() routines.
+ */
+#define XFS_ITEM_SUCCESS	0
+#define XFS_ITEM_PINNED		1
+#define XFS_ITEM_LOCKED		2
+#define XFS_ITEM_FLUSHING	3
+
+
+/*
+ * This is the structure maintained for every active transaction.
+ */
+typedef struct xfs_trans {
+	unsigned int		t_magic;	/* magic number */
+	unsigned int		t_type;		/* transaction type */
+	unsigned int		t_log_res;	/* amt of log space resvd */
+	unsigned int		t_log_count;	/* count for perm log res */
+	unsigned int		t_blk_res;	/* # of blocks resvd */
+	unsigned int		t_blk_res_used;	/* # of resvd blocks used */
+	unsigned int		t_rtx_res;	/* # of rt extents resvd */
+	unsigned int		t_rtx_res_used;	/* # of resvd rt extents used */
+	struct xlog_ticket	*t_ticket;	/* log mgr ticket */
+	xfs_lsn_t		t_lsn;		/* log seq num of start of
+						 * transaction. */
+	xfs_lsn_t		t_commit_lsn;	/* log seq num of end of
+						 * transaction. */
+	struct xfs_mount	*t_mountp;	/* ptr to fs mount struct */
+	struct xfs_dquot_acct   *t_dqinfo;	/* acctg info for dquots */
+	unsigned int		t_flags;	/* misc flags */
+	int64_t			t_icount_delta;	/* superblock icount change */
+	int64_t			t_ifree_delta;	/* superblock ifree change */
+	int64_t			t_fdblocks_delta; /* superblock fdblocks chg */
+	int64_t			t_res_fdblocks_delta; /* on-disk only chg */
+	int64_t			t_frextents_delta;/* superblock freextents chg*/
+	int64_t			t_res_frextents_delta; /* on-disk only chg */
+#if defined(DEBUG) || defined(XFS_WARN)
+	int64_t			t_ag_freeblks_delta; /* debugging counter */
+	int64_t			t_ag_flist_delta; /* debugging counter */
+	int64_t			t_ag_btree_delta; /* debugging counter */
+#endif
+	int64_t			t_dblocks_delta;/* superblock dblocks change */
+	int64_t			t_agcount_delta;/* superblock agcount change */
+	int64_t			t_imaxpct_delta;/* superblock imaxpct change */
+	int64_t			t_rextsize_delta;/* superblock rextsize chg */
+	int64_t			t_rbmblocks_delta;/* superblock rbmblocks chg */
+	int64_t			t_rblocks_delta;/* superblock rblocks change */
+	int64_t			t_rextents_delta;/* superblocks rextents chg */
+	int64_t			t_rextslog_delta;/* superblocks rextslog chg */
+	struct list_head	t_items;	/* log item descriptors */
+	struct list_head	t_busy;		/* list of busy extents */
+	unsigned long		t_pflags;	/* saved process flags state */
+} xfs_trans_t;
+
+/*
+ * XFS transaction mechanism exported interfaces that are
+ * actually macros.
+ */
+#define	xfs_trans_get_log_res(tp)	((tp)->t_log_res)
+#define	xfs_trans_get_log_count(tp)	((tp)->t_log_count)
+#define	xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
+#define	xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
+
+#if defined(DEBUG) || defined(XFS_WARN)
+#define	xfs_trans_agblocks_delta(tp, d)	((tp)->t_ag_freeblks_delta += (int64_t)d)
+#define	xfs_trans_agflist_delta(tp, d)	((tp)->t_ag_flist_delta += (int64_t)d)
+#define	xfs_trans_agbtree_delta(tp, d)	((tp)->t_ag_btree_delta += (int64_t)d)
+#else
+#define	xfs_trans_agblocks_delta(tp, d)
+#define	xfs_trans_agflist_delta(tp, d)
+#define	xfs_trans_agbtree_delta(tp, d)
+#endif
+
+/*
+ * XFS transaction mechanism exported interfaces.
+ */
+xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
+xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
+int		xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
+				  uint, uint);
+void		xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
+
+struct xfs_buf	*xfs_trans_get_buf_map(struct xfs_trans *tp,
+				       struct xfs_buftarg *target,
+				       struct xfs_buf_map *map, int nmaps,
+				       uint flags);
+
+static inline struct xfs_buf *
+xfs_trans_get_buf(
+	struct xfs_trans	*tp,
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	int			numblks,
+	uint			flags)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
+}
+
+int		xfs_trans_read_buf_map(struct xfs_mount *mp,
+				       struct xfs_trans *tp,
+				       struct xfs_buftarg *target,
+				       struct xfs_buf_map *map, int nmaps,
+				       xfs_buf_flags_t flags,
+				       struct xfs_buf **bpp,
+				       const struct xfs_buf_ops *ops);
+
+static inline int
+xfs_trans_read_buf(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	int			numblks,
+	xfs_buf_flags_t		flags,
+	struct xfs_buf		**bpp,
+	const struct xfs_buf_ops *ops)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
+				      flags, bpp, ops);
+}
+
+struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+
+void		xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
+void		xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
+void		xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+struct xfs_efi_log_item	*xfs_trans_get_efi(xfs_trans_t *, uint);
+void		xfs_efi_release(struct xfs_efi_log_item *, uint);
+void		xfs_trans_log_efi_extent(xfs_trans_t *,
+					 struct xfs_efi_log_item *,
+					 xfs_fsblock_t,
+					 xfs_extlen_t);
+struct xfs_efd_log_item	*xfs_trans_get_efd(xfs_trans_t *,
+				  struct xfs_efi_log_item *,
+				  uint);
+void		xfs_trans_log_efd_extent(xfs_trans_t *,
+					 struct xfs_efd_log_item *,
+					 xfs_fsblock_t,
+					 xfs_extlen_t);
+int		xfs_trans_commit(xfs_trans_t *, uint flags);
+int		xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
+void		xfs_trans_cancel(xfs_trans_t *, int);
+int		xfs_trans_ail_init(struct xfs_mount *);
+void		xfs_trans_ail_destroy(struct xfs_mount *);
+
+void		xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
+				       enum xfs_blft);
+void		xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
+					struct xfs_buf *src_bp);
+
+extern kmem_zone_t	*xfs_trans_zone;
+extern kmem_zone_t	*xfs_log_item_desc_zone;
+
+#endif	/* __XFS_TRANS_H__ */
diff --git a/kernel/fs/xfs/xfs_trans_ail.c b/kernel/fs/xfs/xfs_trans_ail.c
new file mode 100644
index 000000000..573aefb5a
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_ail.c
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_log.h"
+
+#ifdef DEBUG
+/*
+ * Check that the list is sorted as it should be.
+ */
+STATIC void
+xfs_ail_check(
+	struct xfs_ail	*ailp,
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*prev_lip;
+
+	if (list_empty(&ailp->xa_ail))
+		return;
+
+	/*
+	 * Check the next and previous entries are valid.
+	 */
+	ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
+	prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+	if (&prev_lip->li_ail != &ailp->xa_ail)
+		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
+
+	prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
+	if (&prev_lip->li_ail != &ailp->xa_ail)
+		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
+
+
+}
+#else /* !DEBUG */
+#define	xfs_ail_check(a,l)
+#endif /* DEBUG */
+
+/*
+ * Return a pointer to the last item in the AIL.  If the AIL is empty, then
+ * return NULL.
+ */
+static xfs_log_item_t *
+xfs_ail_max(
+	struct xfs_ail  *ailp)
+{
+	if (list_empty(&ailp->xa_ail))
+		return NULL;
+
+	return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
+}
+
+/*
+ * Return a pointer to the item which follows the given item in the AIL.  If
+ * the given item is the last item in the list, then return NULL.
+ */
+static xfs_log_item_t *
+xfs_ail_next(
+	struct xfs_ail  *ailp,
+	xfs_log_item_t  *lip)
+{
+	if (lip->li_ail.next == &ailp->xa_ail)
+		return NULL;
+
+	return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
+}
+
+/*
+ * This is called by the log manager code to determine the LSN of the tail of
+ * the log.  This is exactly the LSN of the first item in the AIL.  If the AIL
+ * is empty, then this function returns 0.
+ *
+ * We need the AIL lock in order to get a coherent read of the lsn of the last
+ * item in the AIL.
+ */
+xfs_lsn_t
+xfs_ail_min_lsn(
+	struct xfs_ail	*ailp)
+{
+	xfs_lsn_t	lsn = 0;
+	xfs_log_item_t	*lip;
+
+	spin_lock(&ailp->xa_lock);
+	lip = xfs_ail_min(ailp);
+	if (lip)
+		lsn = lip->li_lsn;
+	spin_unlock(&ailp->xa_lock);
+
+	return lsn;
+}
+
+/*
+ * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
+ */
+static xfs_lsn_t
+xfs_ail_max_lsn(
+	struct xfs_ail  *ailp)
+{
+	xfs_lsn_t       lsn = 0;
+	xfs_log_item_t  *lip;
+
+	spin_lock(&ailp->xa_lock);
+	lip = xfs_ail_max(ailp);
+	if (lip)
+		lsn = lip->li_lsn;
+	spin_unlock(&ailp->xa_lock);
+
+	return lsn;
+}
+
+/*
+ * The cursor keeps track of where our current traversal is up to by tracking
+ * the next item in the list for us. However, for this to be safe, removing an
+ * object from the AIL needs to invalidate any cursor that points to it. hence
+ * the traversal cursor needs to be linked to the struct xfs_ail so that
+ * deletion can search all the active cursors for invalidation.
+ */
+STATIC void
+xfs_trans_ail_cursor_init(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur)
+{
+	cur->item = NULL;
+	list_add_tail(&cur->list, &ailp->xa_cursors);
+}
+
+/*
+ * Get the next item in the traversal and advance the cursor.  If the cursor
+ * was invalidated (indicated by a lip of 1), restart the traversal.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_next(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur)
+{
+	struct xfs_log_item	*lip = cur->item;
+
+	if ((__psint_t)lip & 1)
+		lip = xfs_ail_min(ailp);
+	if (lip)
+		cur->item = xfs_ail_next(ailp, lip);
+	return lip;
+}
+
+/*
+ * When the traversal is complete, we need to remove the cursor from the list
+ * of traversing cursors.
+ */
+void
+xfs_trans_ail_cursor_done(
+	struct xfs_ail_cursor	*cur)
+{
+	cur->item = NULL;
+	list_del_init(&cur->list);
+}
+
+/*
+ * Invalidate any cursor that is pointing to this item. This is called when an
+ * item is removed from the AIL. Any cursor pointing to this object is now
+ * invalid and the traversal needs to be terminated so it doesn't reference a
+ * freed object. We set the low bit of the cursor item pointer so we can
+ * distinguish between an invalidation and the end of the list when getting the
+ * next item from the cursor.
+ */
+STATIC void
+xfs_trans_ail_cursor_clear(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip)
+{
+	struct xfs_ail_cursor	*cur;
+
+	list_for_each_entry(cur, &ailp->xa_cursors, list) {
+		if (cur->item == lip)
+			cur->item = (struct xfs_log_item *)
+					((__psint_t)cur->item | 1);
+	}
+}
+
+/*
+ * Find the first item in the AIL with the given @lsn by searching in ascending
+ * LSN order and initialise the cursor to point to the next item for a
+ * ascending traversal.  Pass a @lsn of zero to initialise the cursor to the
+ * first item in the AIL. Returns NULL if the list is empty.
+ */
+xfs_log_item_t *
+xfs_trans_ail_cursor_first(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
+	xfs_lsn_t		lsn)
+{
+	xfs_log_item_t		*lip;
+
+	xfs_trans_ail_cursor_init(ailp, cur);
+
+	if (lsn == 0) {
+		lip = xfs_ail_min(ailp);
+		goto out;
+	}
+
+	list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
+		if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
+			goto out;
+	}
+	return NULL;
+
+out:
+	if (lip)
+		cur->item = xfs_ail_next(ailp, lip);
+	return lip;
+}
+
+static struct xfs_log_item *
+__xfs_trans_ail_cursor_last(
+	struct xfs_ail		*ailp,
+	xfs_lsn_t		lsn)
+{
+	xfs_log_item_t		*lip;
+
+	list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) {
+		if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0)
+			return lip;
+	}
+	return NULL;
+}
+
+/*
+ * Find the last item in the AIL with the given @lsn by searching in descending
+ * LSN order and initialise the cursor to point to that item.  If there is no
+ * item with the value of @lsn, then it sets the cursor to the last item with an
+ * LSN lower than @lsn.  Returns NULL if the list is empty.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_last(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
+	xfs_lsn_t		lsn)
+{
+	xfs_trans_ail_cursor_init(ailp, cur);
+	cur->item = __xfs_trans_ail_cursor_last(ailp, lsn);
+	return cur->item;
+}
+
+/*
+ * Splice the log item list into the AIL at the given LSN. We splice to the
+ * tail of the given LSN to maintain insert order for push traversals. The
+ * cursor is optional, allowing repeated updates to the same LSN to avoid
+ * repeated traversals.  This should not be called with an empty list.
+ */
+static void
+xfs_ail_splice(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
+	struct list_head	*list,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_log_item	*lip;
+
+	ASSERT(!list_empty(list));
+
+	/*
+	 * Use the cursor to determine the insertion point if one is
+	 * provided.  If not, or if the one we got is not valid,
+	 * find the place in the AIL where the items belong.
+	 */
+	lip = cur ? cur->item : NULL;
+	if (!lip || (__psint_t) lip & 1)
+		lip = __xfs_trans_ail_cursor_last(ailp, lsn);
+
+	/*
+	 * If a cursor is provided, we know we're processing the AIL
+	 * in lsn order, and future items to be spliced in will
+	 * follow the last one being inserted now.  Update the
+	 * cursor to point to that last item, now while we have a
+	 * reliable pointer to it.
+	 */
+	if (cur)
+		cur->item = list_entry(list->prev, struct xfs_log_item, li_ail);
+
+	/*
+	 * Finally perform the splice.  Unless the AIL was empty,
+	 * lip points to the item in the AIL _after_ which the new
+	 * items should go.  If lip is null the AIL was empty, so
+	 * the new items go at the head of the AIL.
+	 */
+	if (lip)
+		list_splice(list, &lip->li_ail);
+	else
+		list_splice(list, &ailp->xa_ail);
+}
+
+/*
+ * Delete the given item from the AIL.  Return a pointer to the item.
+ */
+static void
+xfs_ail_delete(
+	struct xfs_ail  *ailp,
+	xfs_log_item_t  *lip)
+{
+	xfs_ail_check(ailp, lip);
+	list_del(&lip->li_ail);
+	xfs_trans_ail_cursor_clear(ailp, lip);
+}
+
+static long
+xfsaild_push(
+	struct xfs_ail		*ailp)
+{
+	xfs_mount_t		*mp = ailp->xa_mount;
+	struct xfs_ail_cursor	cur;
+	xfs_log_item_t		*lip;
+	xfs_lsn_t		lsn;
+	xfs_lsn_t		target;
+	long			tout;
+	int			stuck = 0;
+	int			flushing = 0;
+	int			count = 0;
+
+	/*
+	 * If we encountered pinned items or did not finish writing out all
+	 * buffers the last time we ran, force the log first and wait for it
+	 * before pushing again.
+	 */
+	if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 &&
+	    (!list_empty_careful(&ailp->xa_buf_list) ||
+	     xfs_ail_min_lsn(ailp))) {
+		ailp->xa_log_flush = 0;
+
+		XFS_STATS_INC(xs_push_ail_flush);
+		xfs_log_force(mp, XFS_LOG_SYNC);
+	}
+
+	spin_lock(&ailp->xa_lock);
+
+	/* barrier matches the xa_target update in xfs_ail_push() */
+	smp_rmb();
+	target = ailp->xa_target;
+	ailp->xa_target_prev = target;
+
+	lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
+	if (!lip) {
+		/*
+		 * If the AIL is empty or our push has reached the end we are
+		 * done now.
+		 */
+		xfs_trans_ail_cursor_done(&cur);
+		spin_unlock(&ailp->xa_lock);
+		goto out_done;
+	}
+
+	XFS_STATS_INC(xs_push_ail);
+
+	lsn = lip->li_lsn;
+	while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
+		int	lock_result;
+
+		/*
+		 * Note that iop_push may unlock and reacquire the AIL lock.  We
+		 * rely on the AIL cursor implementation to be able to deal with
+		 * the dropped lock.
+		 */
+		lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
+		switch (lock_result) {
+		case XFS_ITEM_SUCCESS:
+			XFS_STATS_INC(xs_push_ail_success);
+			trace_xfs_ail_push(lip);
+
+			ailp->xa_last_pushed_lsn = lsn;
+			break;
+
+		case XFS_ITEM_FLUSHING:
+			/*
+			 * The item or its backing buffer is already beeing
+			 * flushed.  The typical reason for that is that an
+			 * inode buffer is locked because we already pushed the
+			 * updates to it as part of inode clustering.
+			 *
+			 * We do not want to to stop flushing just because lots
+			 * of items are already beeing flushed, but we need to
+			 * re-try the flushing relatively soon if most of the
+			 * AIL is beeing flushed.
+			 */
+			XFS_STATS_INC(xs_push_ail_flushing);
+			trace_xfs_ail_flushing(lip);
+
+			flushing++;
+			ailp->xa_last_pushed_lsn = lsn;
+			break;
+
+		case XFS_ITEM_PINNED:
+			XFS_STATS_INC(xs_push_ail_pinned);
+			trace_xfs_ail_pinned(lip);
+
+			stuck++;
+			ailp->xa_log_flush++;
+			break;
+		case XFS_ITEM_LOCKED:
+			XFS_STATS_INC(xs_push_ail_locked);
+			trace_xfs_ail_locked(lip);
+
+			stuck++;
+			break;
+		default:
+			ASSERT(0);
+			break;
+		}
+
+		count++;
+
+		/*
+		 * Are there too many items we can't do anything with?
+		 *
+		 * If we we are skipping too many items because we can't flush
+		 * them or they are already being flushed, we back off and
+		 * given them time to complete whatever operation is being
+		 * done. i.e. remove pressure from the AIL while we can't make
+		 * progress so traversals don't slow down further inserts and
+		 * removals to/from the AIL.
+		 *
+		 * The value of 100 is an arbitrary magic number based on
+		 * observation.
+		 */
+		if (stuck > 100)
+			break;
+
+		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+		if (lip == NULL)
+			break;
+		lsn = lip->li_lsn;
+	}
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+
+	if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
+		ailp->xa_log_flush++;
+
+	if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
+out_done:
+		/*
+		 * We reached the target or the AIL is empty, so wait a bit
+		 * longer for I/O to complete and remove pushed items from the
+		 * AIL before we start the next scan from the start of the AIL.
+		 */
+		tout = 50;
+		ailp->xa_last_pushed_lsn = 0;
+	} else if (((stuck + flushing) * 100) / count > 90) {
+		/*
+		 * Either there is a lot of contention on the AIL or we are
+		 * stuck due to operations in progress. "Stuck" in this case
+		 * is defined as >90% of the items we tried to push were stuck.
+		 *
+		 * Backoff a bit more to allow some I/O to complete before
+		 * restarting from the start of the AIL. This prevents us from
+		 * spinning on the same items, and if they are pinned will all
+		 * the restart to issue a log force to unpin the stuck items.
+		 */
+		tout = 20;
+		ailp->xa_last_pushed_lsn = 0;
+	} else {
+		/*
+		 * Assume we have more work to do in a short while.
+		 */
+		tout = 10;
+	}
+
+	return tout;
+}
+
+static int
+xfsaild(
+	void		*data)
+{
+	struct xfs_ail	*ailp = data;
+	long		tout = 0;	/* milliseconds */
+
+	current->flags |= PF_MEMALLOC;
+
+	while (!kthread_should_stop()) {
+		if (tout && tout <= 20)
+			__set_current_state(TASK_KILLABLE);
+		else
+			__set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_lock(&ailp->xa_lock);
+
+		/*
+		 * Idle if the AIL is empty and we are not racing with a target
+		 * update. We check the AIL after we set the task to a sleep
+		 * state to guarantee that we either catch an xa_target update
+		 * or that a wake_up resets the state to TASK_RUNNING.
+		 * Otherwise, we run the risk of sleeping indefinitely.
+		 *
+		 * The barrier matches the xa_target update in xfs_ail_push().
+		 */
+		smp_rmb();
+		if (!xfs_ail_min(ailp) &&
+		    ailp->xa_target == ailp->xa_target_prev) {
+			spin_unlock(&ailp->xa_lock);
+			schedule();
+			tout = 0;
+			continue;
+		}
+		spin_unlock(&ailp->xa_lock);
+
+		if (tout)
+			schedule_timeout(msecs_to_jiffies(tout));
+
+		__set_current_state(TASK_RUNNING);
+
+		try_to_freeze();
+
+		tout = xfsaild_push(ailp);
+	}
+
+	return 0;
+}
+
+/*
+ * This routine is called to move the tail of the AIL forward.  It does this by
+ * trying to flush items in the AIL whose lsns are below the given
+ * threshold_lsn.
+ *
+ * The push is run asynchronously in a workqueue, which means the caller needs
+ * to handle waiting on the async flush for space to become available.
+ * We don't want to interrupt any push that is in progress, hence we only queue
+ * work if we set the pushing bit approriately.
+ *
+ * We do this unlocked - we only need to know whether there is anything in the
+ * AIL at the time we are called. We don't need to access the contents of
+ * any of the objects, so the lock is not needed.
+ */
+void
+xfs_ail_push(
+	struct xfs_ail	*ailp,
+	xfs_lsn_t	threshold_lsn)
+{
+	xfs_log_item_t	*lip;
+
+	lip = xfs_ail_min(ailp);
+	if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
+	    XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
+		return;
+
+	/*
+	 * Ensure that the new target is noticed in push code before it clears
+	 * the XFS_AIL_PUSHING_BIT.
+	 */
+	smp_wmb();
+	xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
+	smp_wmb();
+
+	wake_up_process(ailp->xa_task);
+}
+
+/*
+ * Push out all items in the AIL immediately
+ */
+void
+xfs_ail_push_all(
+	struct xfs_ail  *ailp)
+{
+	xfs_lsn_t       threshold_lsn = xfs_ail_max_lsn(ailp);
+
+	if (threshold_lsn)
+		xfs_ail_push(ailp, threshold_lsn);
+}
+
+/*
+ * Push out all items in the AIL immediately and wait until the AIL is empty.
+ */
+void
+xfs_ail_push_all_sync(
+	struct xfs_ail  *ailp)
+{
+	struct xfs_log_item	*lip;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&ailp->xa_lock);
+	while ((lip = xfs_ail_max(ailp)) != NULL) {
+		prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE);
+		ailp->xa_target = lip->li_lsn;
+		wake_up_process(ailp->xa_task);
+		spin_unlock(&ailp->xa_lock);
+		schedule();
+		spin_lock(&ailp->xa_lock);
+	}
+	spin_unlock(&ailp->xa_lock);
+
+	finish_wait(&ailp->xa_empty, &wait);
+}
+
+/*
+ * xfs_trans_ail_update - bulk AIL insertion operation.
+ *
+ * @xfs_trans_ail_update takes an array of log items that all need to be
+ * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
+ * be added.  Otherwise, it will be repositioned  by removing it and re-adding
+ * it to the AIL. If we move the first item in the AIL, update the log tail to
+ * match the new minimum LSN in the AIL.
+ *
+ * This function takes the AIL lock once to execute the update operations on
+ * all the items in the array, and as such should not be called with the AIL
+ * lock held. As a result, once we have the AIL lock, we need to check each log
+ * item LSN to confirm it needs to be moved forward in the AIL.
+ *
+ * To optimise the insert operation, we delete all the items from the AIL in
+ * the first pass, moving them into a temporary list, then splice the temporary
+ * list into the correct position in the AIL. This avoids needing to do an
+ * insert operation on every item.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
+ */
+void
+xfs_trans_ail_update_bulk(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
+	struct xfs_log_item	**log_items,
+	int			nr_items,
+	xfs_lsn_t		lsn) __releases(ailp->xa_lock)
+{
+	xfs_log_item_t		*mlip;
+	int			mlip_changed = 0;
+	int			i;
+	LIST_HEAD(tmp);
+
+	ASSERT(nr_items > 0);		/* Not required, but true. */
+	mlip = xfs_ail_min(ailp);
+
+	for (i = 0; i < nr_items; i++) {
+		struct xfs_log_item *lip = log_items[i];
+		if (lip->li_flags & XFS_LI_IN_AIL) {
+			/* check if we really need to move the item */
+			if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
+				continue;
+
+			trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+			xfs_ail_delete(ailp, lip);
+			if (mlip == lip)
+				mlip_changed = 1;
+		} else {
+			lip->li_flags |= XFS_LI_IN_AIL;
+			trace_xfs_ail_insert(lip, 0, lsn);
+		}
+		lip->li_lsn = lsn;
+		list_add(&lip->li_ail, &tmp);
+	}
+
+	if (!list_empty(&tmp))
+		xfs_ail_splice(ailp, cur, &tmp, lsn);
+
+	if (mlip_changed) {
+		if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+			xlog_assign_tail_lsn_locked(ailp->xa_mount);
+		spin_unlock(&ailp->xa_lock);
+
+		xfs_log_space_wake(ailp->xa_mount);
+	} else {
+		spin_unlock(&ailp->xa_lock);
+	}
+}
+
+/*
+ * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
+ *
+ * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
+ * removed from the AIL. The caller is already holding the AIL lock, and done
+ * all the checks necessary to ensure the items passed in via @log_items are
+ * ready for deletion. This includes checking that the items are in the AIL.
+ *
+ * For each log item to be removed, unlink it  from the AIL, clear the IN_AIL
+ * flag from the item and reset the item's lsn to 0. If we remove the first
+ * item in the AIL, update the log tail to match the new minimum LSN in the
+ * AIL.
+ *
+ * This function will not drop the AIL lock until all items are removed from
+ * the AIL to minimise the amount of lock traffic on the AIL. This does not
+ * greatly increase the AIL hold time, but does significantly reduce the amount
+ * of traffic on the lock, especially during IO completion.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
+ */
+void
+xfs_trans_ail_delete_bulk(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	**log_items,
+	int			nr_items,
+	int			shutdown_type) __releases(ailp->xa_lock)
+{
+	xfs_log_item_t		*mlip;
+	int			mlip_changed = 0;
+	int			i;
+
+	mlip = xfs_ail_min(ailp);
+
+	for (i = 0; i < nr_items; i++) {
+		struct xfs_log_item *lip = log_items[i];
+		if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+			struct xfs_mount	*mp = ailp->xa_mount;
+
+			spin_unlock(&ailp->xa_lock);
+			if (!XFS_FORCED_SHUTDOWN(mp)) {
+				xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+		"%s: attempting to delete a log item that is not in the AIL",
+						__func__);
+				xfs_force_shutdown(mp, shutdown_type);
+			}
+			return;
+		}
+
+		trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+		xfs_ail_delete(ailp, lip);
+		lip->li_flags &= ~XFS_LI_IN_AIL;
+		lip->li_lsn = 0;
+		if (mlip == lip)
+			mlip_changed = 1;
+	}
+
+	if (mlip_changed) {
+		if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+			xlog_assign_tail_lsn_locked(ailp->xa_mount);
+		if (list_empty(&ailp->xa_ail))
+			wake_up_all(&ailp->xa_empty);
+		spin_unlock(&ailp->xa_lock);
+
+		xfs_log_space_wake(ailp->xa_mount);
+	} else {
+		spin_unlock(&ailp->xa_lock);
+	}
+}
+
+int
+xfs_trans_ail_init(
+	xfs_mount_t	*mp)
+{
+	struct xfs_ail	*ailp;
+
+	ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+	if (!ailp)
+		return -ENOMEM;
+
+	ailp->xa_mount = mp;
+	INIT_LIST_HEAD(&ailp->xa_ail);
+	INIT_LIST_HEAD(&ailp->xa_cursors);
+	spin_lock_init(&ailp->xa_lock);
+	INIT_LIST_HEAD(&ailp->xa_buf_list);
+	init_waitqueue_head(&ailp->xa_empty);
+
+	ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+			ailp->xa_mount->m_fsname);
+	if (IS_ERR(ailp->xa_task))
+		goto out_free_ailp;
+
+	mp->m_ail = ailp;
+	return 0;
+
+out_free_ailp:
+	kmem_free(ailp);
+	return -ENOMEM;
+}
+
+void
+xfs_trans_ail_destroy(
+	xfs_mount_t	*mp)
+{
+	struct xfs_ail	*ailp = mp->m_ail;
+
+	kthread_stop(ailp->xa_task);
+	kmem_free(ailp);
+}
diff --git a/kernel/fs/xfs/xfs_trans_buf.c b/kernel/fs/xfs/xfs_trans_buf.c
new file mode 100644
index 000000000..757984128
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_buf.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.
+ */
+STATIC struct xfs_buf *
+xfs_trans_buf_item_match(
+	struct xfs_trans	*tp,
+	struct xfs_buftarg	*target,
+	struct xfs_buf_map	*map,
+	int			nmaps)
+{
+	struct xfs_log_item_desc *lidp;
+	struct xfs_buf_log_item	*blip;
+	int			len = 0;
+	int			i;
+
+	for (i = 0; i < nmaps; i++)
+		len += map[i].bm_len;
+
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		blip = (struct xfs_buf_log_item *)lidp->lid_item;
+		if (blip->bli_item.li_type == XFS_LI_BUF &&
+		    blip->bli_buf->b_target == target &&
+		    XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn &&
+		    blip->bli_buf->b_length == len) {
+			ASSERT(blip->bli_buf->b_map_count == nmaps);
+			return blip->bli_buf;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Add the locked buffer to the transaction.
+ *
+ * The buffer must be locked, and it cannot be associated with any
+ * transaction.
+ *
+ * If the buffer does not yet have a buf log item associated with it,
+ * then allocate one for it.  Then add the buf item to the transaction.
+ */
+STATIC void
+_xfs_trans_bjoin(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	int			reset_recur)
+{
+	struct xfs_buf_log_item	*bip;
+
+	ASSERT(bp->b_transp == NULL);
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+	bip = bp->b_fspriv;
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	if (reset_recur)
+		bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &bip->bli_item);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * in xfs_trans_get_buf() and friends above.
+	 */
+	bp->b_transp = tp;
+
+}
+
+void
+xfs_trans_bjoin(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	_xfs_trans_bjoin(tp, bp, 0);
+	trace_xfs_trans_bjoin(bp->b_fspriv);
+}
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction.  If it is already locked
+ * within the transaction, just increment its lock recursion count
+ * and return a pointer to it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * get_buf() call.
+ */
+struct xfs_buf *
+xfs_trans_get_buf_map(
+	struct xfs_trans	*tp,
+	struct xfs_buftarg	*target,
+	struct xfs_buf_map	*map,
+	int			nmaps,
+	xfs_buf_flags_t		flags)
+{
+	xfs_buf_t		*bp;
+	xfs_buf_log_item_t	*bip;
+
+	if (!tp)
+		return xfs_buf_get_map(target, map, nmaps, flags);
+
+	/*
+	 * If we find the buffer in the cache with this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  In this case we just increment the lock
+	 * recursion count and return the buffer to the caller.
+	 */
+	bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
+	if (bp != NULL) {
+		ASSERT(xfs_buf_islocked(bp));
+		if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+			xfs_buf_stale(bp);
+			XFS_BUF_DONE(bp);
+		}
+
+		ASSERT(bp->b_transp == tp);
+		bip = bp->b_fspriv;
+		ASSERT(bip != NULL);
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		bip->bli_recur++;
+		trace_xfs_trans_get_buf_recur(bip);
+		return bp;
+	}
+
+	bp = xfs_buf_get_map(target, map, nmaps, flags);
+	if (bp == NULL) {
+		return NULL;
+	}
+
+	ASSERT(!bp->b_error);
+
+	_xfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_get_buf(bp->b_fspriv);
+	return bp;
+}
+
+/*
+ * Get and lock the superblock buffer of this file system for the
+ * given transaction.
+ *
+ * We don't need to use incore_match() here, because the superblock
+ * buffer is a private buffer which we keep a pointer to in the
+ * mount structure.
+ */
+xfs_buf_t *
+xfs_trans_getsb(xfs_trans_t	*tp,
+		struct xfs_mount *mp,
+		int		flags)
+{
+	xfs_buf_t		*bp;
+	xfs_buf_log_item_t	*bip;
+
+	/*
+	 * Default to just trying to lock the superblock buffer
+	 * if tp is NULL.
+	 */
+	if (tp == NULL)
+		return xfs_getsb(mp, flags);
+
+	/*
+	 * If the superblock buffer already has this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  In this case we just increment the lock
+	 * recursion count and return the buffer to the caller.
+	 */
+	bp = mp->m_sb_bp;
+	if (bp->b_transp == tp) {
+		bip = bp->b_fspriv;
+		ASSERT(bip != NULL);
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		bip->bli_recur++;
+		trace_xfs_trans_getsb_recur(bip);
+		return bp;
+	}
+
+	bp = xfs_getsb(mp, flags);
+	if (bp == NULL)
+		return NULL;
+
+	_xfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_getsb(bp->b_fspriv);
+	return bp;
+}
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction.  If it has not yet been
+ * read in, read it from disk. If it is already locked
+ * within the transaction and already read in, just increment its
+ * lock recursion count and return a pointer to it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * read_buf() call.
+ */
+int
+xfs_trans_read_buf_map(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buftarg	*target,
+	struct xfs_buf_map	*map,
+	int			nmaps,
+	xfs_buf_flags_t		flags,
+	struct xfs_buf		**bpp,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp = NULL;
+	struct xfs_buf_log_item	*bip;
+	int			error;
+
+	*bpp = NULL;
+	/*
+	 * If we find the buffer in the cache with this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  If it is already read in we just increment
+	 * the lock recursion count and return the buffer to the caller.
+	 * If the buffer is not yet read in, then we read it in, increment
+	 * the lock recursion count, and return it to the caller.
+	 */
+	if (tp)
+		bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
+	if (bp) {
+		ASSERT(xfs_buf_islocked(bp));
+		ASSERT(bp->b_transp == tp);
+		ASSERT(bp->b_fspriv != NULL);
+		ASSERT(!bp->b_error);
+		ASSERT(bp->b_flags & XBF_DONE);
+
+		/*
+		 * We never locked this buf ourselves, so we shouldn't
+		 * brelse it either. Just get out.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
+			return -EIO;
+		}
+
+		bip = bp->b_fspriv;
+		bip->bli_recur++;
+
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		trace_xfs_trans_read_buf_recur(bip);
+		*bpp = bp;
+		return 0;
+	}
+
+	bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
+	if (!bp) {
+		if (!(flags & XBF_TRYLOCK))
+			return -ENOMEM;
+		return tp ? 0 : -EAGAIN;
+	}
+
+	/*
+	 * If we've had a read error, then the contents of the buffer are
+	 * invalid and should not be used. To ensure that a followup read tries
+	 * to pull the buffer from disk again, we clear the XBF_DONE flag and
+	 * mark the buffer stale. This ensures that anyone who has a current
+	 * reference to the buffer will interpret it's contents correctly and
+	 * future cache lookups will also treat it as an empty, uninitialised
+	 * buffer.
+	 */
+	if (bp->b_error) {
+		error = bp->b_error;
+		if (!XFS_FORCED_SHUTDOWN(mp))
+			xfs_buf_ioerror_alert(bp, __func__);
+		bp->b_flags &= ~XBF_DONE;
+		xfs_buf_stale(bp);
+
+		if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
+			xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
+		xfs_buf_relse(bp);
+
+		/* bad CRC means corrupted metadata */
+		if (error == -EFSBADCRC)
+			error = -EFSCORRUPTED;
+		return error;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		xfs_buf_relse(bp);
+		trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
+		return -EIO;
+	}
+
+	if (tp) {
+		_xfs_trans_bjoin(tp, bp, 1);
+		trace_xfs_trans_read_buf(bp->b_fspriv);
+	}
+	*bpp = bp;
+	return 0;
+
+}
+
+/*
+ * Release the buffer bp which was previously acquired with one of the
+ * xfs_trans_... buffer allocation routines if the buffer has not
+ * been modified within this transaction.  If the buffer is modified
+ * within this transaction, do decrement the recursion count but do
+ * not release the buffer even if the count goes to 0.  If the buffer is not
+ * modified within the transaction, decrement the recursion count and
+ * release the buffer if the recursion count goes to 0.
+ *
+ * If the buffer is to be released and it was not modified before
+ * this transaction began, then free the buf_log_item associated with it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * brelse() call.
+ */
+void
+xfs_trans_brelse(xfs_trans_t	*tp,
+		 xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	/*
+	 * Default to a normal brelse() call if the tp is NULL.
+	 */
+	if (tp == NULL) {
+		ASSERT(bp->b_transp == NULL);
+		xfs_buf_relse(bp);
+		return;
+	}
+
+	ASSERT(bp->b_transp == tp);
+	bip = bp->b_fspriv;
+	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	trace_xfs_trans_brelse(bip);
+
+	/*
+	 * If the release is just for a recursive lock,
+	 * then decrement the count and return.
+	 */
+	if (bip->bli_recur > 0) {
+		bip->bli_recur--;
+		return;
+	}
+
+	/*
+	 * If the buffer is dirty within this transaction, we can't
+	 * release it until we commit.
+	 */
+	if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
+		return;
+
+	/*
+	 * If the buffer has been invalidated, then we can't release
+	 * it until the transaction commits to disk unless it is re-dirtied
+	 * as part of this transaction.  This prevents us from pulling
+	 * the item from the AIL before we should.
+	 */
+	if (bip->bli_flags & XFS_BLI_STALE)
+		return;
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+
+	/*
+	 * Free up the log item descriptor tracking the released item.
+	 */
+	xfs_trans_del_item(&bip->bli_item);
+
+	/*
+	 * Clear the hold flag in the buf log item if it is set.
+	 * We wouldn't want the next user of the buffer to
+	 * get confused.
+	 */
+	if (bip->bli_flags & XFS_BLI_HOLD) {
+		bip->bli_flags &= ~XFS_BLI_HOLD;
+	}
+
+	/*
+	 * Drop our reference to the buf log item.
+	 */
+	atomic_dec(&bip->bli_refcount);
+
+	/*
+	 * If the buf item is not tracking data in the log, then
+	 * we must free it before releasing the buffer back to the
+	 * free pool.  Before releasing the buffer to the free pool,
+	 * clear the transaction pointer in b_fsprivate2 to dissolve
+	 * its relation to this transaction.
+	 */
+	if (!xfs_buf_item_dirty(bip)) {
+/***
+		ASSERT(bp->b_pincount == 0);
+***/
+		ASSERT(atomic_read(&bip->bli_refcount) == 0);
+		ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+		ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
+		xfs_buf_item_relse(bp);
+	}
+
+	bp->b_transp = NULL;
+	xfs_buf_relse(bp);
+}
+
+/*
+ * Mark the buffer as not needing to be unlocked when the buf item's
+ * iop_unlock() routine is called.  The buffer must already be locked
+ * and associated with the given transaction.
+ */
+/* ARGSUSED */
+void
+xfs_trans_bhold(xfs_trans_t	*tp,
+		xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_flags |= XFS_BLI_HOLD;
+	trace_xfs_trans_bhold(bip);
+}
+
+/*
+ * Cancel the previous buffer hold request made on this buffer
+ * for this transaction.
+ */
+void
+xfs_trans_bhold_release(xfs_trans_t	*tp,
+			xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
+
+	bip->bli_flags &= ~XFS_BLI_HOLD;
+	trace_xfs_trans_bhold_release(bip);
+}
+
+/*
+ * This is called to mark bytes first through last inclusive of the given
+ * buffer as needing to be logged when the transaction is committed.
+ * The buffer must already be associated with the given transaction.
+ *
+ * First and last are numbers relative to the beginning of this buffer,
+ * so the first byte in the buffer is numbered 0 regardless of the
+ * value of b_blkno.
+ */
+void
+xfs_trans_log_buf(xfs_trans_t	*tp,
+		  xfs_buf_t	*bp,
+		  uint		first,
+		  uint		last)
+{
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(first <= last && last < BBTOB(bp->b_length));
+	ASSERT(bp->b_iodone == NULL ||
+	       bp->b_iodone == xfs_buf_iodone_callbacks);
+
+	/*
+	 * Mark the buffer as needing to be written out eventually,
+	 * and set its iodone function to remove the buffer's buf log
+	 * item from the AIL and free it when the buffer is flushed
+	 * to disk.  See xfs_buf_attach_iodone() for more details
+	 * on li_cb and xfs_buf_iodone_callbacks().
+	 * If we end up aborting this transaction, we trap this buffer
+	 * inside the b_bdstrat callback so that this won't get written to
+	 * disk.
+	 */
+	XFS_BUF_DONE(bp);
+
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	bp->b_iodone = xfs_buf_iodone_callbacks;
+	bip->bli_item.li_cb = xfs_buf_iodone;
+
+	trace_xfs_trans_log_buf(bip);
+
+	/*
+	 * If we invalidated the buffer within this transaction, then
+	 * cancel the invalidation now that we're dirtying the buffer
+	 * again.  There are no races with the code in xfs_buf_item_unpin(),
+	 * because we have a reference to the buffer this entire time.
+	 */
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		bip->bli_flags &= ~XFS_BLI_STALE;
+		ASSERT(XFS_BUF_ISSTALE(bp));
+		XFS_BUF_UNSTALE(bp);
+		bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
+	}
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * If we have an ordered buffer we are not logging any dirty range but
+	 * it still needs to be marked dirty and that it has been logged.
+	 */
+	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+	if (!(bip->bli_flags & XFS_BLI_ORDERED))
+		xfs_buf_item_log(bip, first, last);
+}
+
+
+/*
+ * Invalidate a buffer that is being used within a transaction.
+ *
+ * Typically this is because the blocks in the buffer are being freed, so we
+ * need to prevent it from being written out when we're done.  Allowing it
+ * to be written again might overwrite data in the free blocks if they are
+ * reallocated to a file.
+ *
+ * We prevent the buffer from being written out by marking it stale.  We can't
+ * get rid of the buf log item at this point because the buffer may still be
+ * pinned by another transaction.  If that is the case, then we'll wait until
+ * the buffer is committed to disk for the last time (we can tell by the ref
+ * count) and free it in xfs_buf_item_unpin().  Until that happens we will
+ * keep the buffer locked so that the buffer and buf log item are not reused.
+ *
+ * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log
+ * the buf item.  This will be used at recovery time to determine that copies
+ * of the buffer in the log before this should not be replayed.
+ *
+ * We mark the item descriptor and the transaction dirty so that we'll hold
+ * the buffer until after the commit.
+ *
+ * Since we're invalidating the buffer, we also clear the state about which
+ * parts of the buffer have been logged.  We also clear the flag indicating
+ * that this is an inode buffer since the data in the buffer will no longer
+ * be valid.
+ *
+ * We set the stale bit in the buffer as well since we're getting rid of it.
+ */
+void
+xfs_trans_binval(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+	int			i;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	trace_xfs_trans_binval(bip);
+
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * If the buffer is already invalidated, then
+		 * just return.
+		 */
+		ASSERT(XFS_BUF_ISSTALE(bp));
+		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
+		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
+		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
+		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+		ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
+		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
+		return;
+	}
+
+	xfs_buf_stale(bp);
+
+	bip->bli_flags |= XFS_BLI_STALE;
+	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+	bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+	bip->__bli_format.blf_flags |= XFS_BLF_CANCEL;
+	bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK;
+	for (i = 0; i < bip->bli_format_count; i++) {
+		memset(bip->bli_formats[i].blf_data_map, 0,
+		       (bip->bli_formats[i].blf_map_size * sizeof(uint)));
+	}
+	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	tp->t_flags |= XFS_TRANS_DIRTY;
+}
+
+/*
+ * This call is used to indicate that the buffer contains on-disk inodes which
+ * must be handled specially during recovery.  They require special handling
+ * because only the di_next_unlinked from the inodes in the buffer should be
+ * recovered.  The rest of the data in the buffer is logged via the inodes
+ * themselves.
+ *
+ * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
+ */
+void
+xfs_trans_inode_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_flags |= XFS_BLI_INODE_BUF;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * This call is used to indicate that the buffer is going to
+ * be staled and was an inode buffer. This means it gets
+ * special processing during unpin - where any inodes
+ * associated with the buffer should be removed from ail.
+ * There is also special processing during recovery,
+ * any replay of the inodes in the buffer needs to be
+ * prevented as the buffer may have been reused.
+ */
+void
+xfs_trans_stale_inode_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_flags |= XFS_BLI_STALE_INODE;
+	bip->bli_item.li_cb = xfs_buf_iodone;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * Mark the buffer as being one which contains newly allocated
+ * inodes.  We need to make sure that even if this buffer is
+ * relogged as an 'inode buf' we still recover all of the inode
+ * images in the face of a crash.  This works in coordination with
+ * xfs_buf_item_committed() to ensure that the buffer remains in the
+ * AIL at its original location even after it has been relogged.
+ */
+/* ARGSUSED */
+void
+xfs_trans_inode_alloc_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it was. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_flags |= XFS_BLI_ORDERED;
+	trace_xfs_buf_item_ordered(bip);
+}
+
+/*
+ * Set the type of the buffer for log recovery so that it can correctly identify
+ * and hence attach the correct buffer ops to the buffer after replay.
+ */
+void
+xfs_trans_buf_set_type(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	enum xfs_blft		type)
+{
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!tp)
+		return;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	xfs_blft_to_flags(&bip->__bli_format, type);
+}
+
+void
+xfs_trans_buf_copy_type(
+	struct xfs_buf		*dst_bp,
+	struct xfs_buf		*src_bp)
+{
+	struct xfs_buf_log_item	*sbip = src_bp->b_fspriv;
+	struct xfs_buf_log_item	*dbip = dst_bp->b_fspriv;
+	enum xfs_blft		type;
+
+	type = xfs_blft_from_flags(&sbip->__bli_format);
+	xfs_blft_to_flags(&dbip->__bli_format, type);
+}
+
+/*
+ * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
+ * dquots. However, unlike in inode buffer recovery, dquot buffers get
+ * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag).
+ * The only thing that makes dquot buffers different from regular
+ * buffers is that we must not replay dquot bufs when recovering
+ * if a _corresponding_ quotaoff has happened. We also have to distinguish
+ * between usr dquot bufs and grp dquot bufs, because usr and grp quotas
+ * can be turned off independently.
+ */
+/* ARGSUSED */
+void
+xfs_trans_dquot_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp,
+	uint		type)
+{
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
+	       type == XFS_BLF_PDQUOT_BUF ||
+	       type == XFS_BLF_GDQUOT_BUF);
+
+	bip->__bli_format.blf_flags |= type;
+
+	switch (type) {
+	case XFS_BLF_UDQUOT_BUF:
+		type = XFS_BLFT_UDQUOT_BUF;
+		break;
+	case XFS_BLF_PDQUOT_BUF:
+		type = XFS_BLFT_PDQUOT_BUF;
+		break;
+	case XFS_BLF_GDQUOT_BUF:
+		type = XFS_BLFT_GDQUOT_BUF;
+		break;
+	default:
+		type = XFS_BLFT_UNKNOWN_BUF;
+		break;
+	}
+
+	xfs_trans_buf_set_type(tp, bp, type);
+}
diff --git a/kernel/fs/xfs/xfs_trans_dquot.c b/kernel/fs/xfs/xfs_trans_dquot.c
new file mode 100644
index 000000000..76a16df55
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_dquot.c
@@ -0,0 +1,887 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+
+STATIC void	xfs_trans_alloc_dqinfo(xfs_trans_t *);
+
+/*
+ * Add the locked dquot to the transaction.
+ * The dquot must be locked, and it cannot be associated with any
+ * transaction.
+ */
+void
+xfs_trans_dqjoin(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	ASSERT(dqp->q_transp != tp);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	ASSERT(dqp->q_logitem.qli_dquot == dqp);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
+
+	/*
+	 * Initialize d_transp so we can later determine if this dquot is
+	 * associated with this transaction.
+	 */
+	dqp->q_transp = tp;
+}
+
+
+/*
+ * This is called to mark the dquot as needing
+ * to be logged when the transaction is committed.  The dquot must
+ * already be associated with the given transaction.
+ * Note that it marks the entire transaction as dirty. In the ordinary
+ * case, this gets called via xfs_trans_commit, after the transaction
+ * is already dirty. However, there's nothing stop this from getting
+ * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY
+ * flag.
+ */
+void
+xfs_trans_log_dquot(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	ASSERT(dqp->q_transp == tp);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
+
+/*
+ * Carry forward whatever is left of the quota blk reservation to
+ * the spanky new transaction
+ */
+void
+xfs_trans_dup_dqinfo(
+	xfs_trans_t	*otp,
+	xfs_trans_t	*ntp)
+{
+	xfs_dqtrx_t	*oq, *nq;
+	int		i,j;
+	xfs_dqtrx_t	*oqa, *nqa;
+
+	if (!otp->t_dqinfo)
+		return;
+
+	xfs_trans_alloc_dqinfo(ntp);
+
+	/*
+	 * Because the quota blk reservation is carried forward,
+	 * it is also necessary to carry forward the DQ_DIRTY flag.
+	 */
+	if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+		ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
+
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		oqa = otp->t_dqinfo->dqs[j];
+		nqa = ntp->t_dqinfo->dqs[j];
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			if (oqa[i].qt_dquot == NULL)
+				break;
+			oq = &oqa[i];
+			nq = &nqa[i];
+
+			nq->qt_dquot = oq->qt_dquot;
+			nq->qt_bcount_delta = nq->qt_icount_delta = 0;
+			nq->qt_rtbcount_delta = 0;
+
+			/*
+			 * Transfer whatever is left of the reservations.
+			 */
+			nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
+			oq->qt_blk_res = oq->qt_blk_res_used;
+
+			nq->qt_rtblk_res = oq->qt_rtblk_res -
+				oq->qt_rtblk_res_used;
+			oq->qt_rtblk_res = oq->qt_rtblk_res_used;
+
+			nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
+			oq->qt_ino_res = oq->qt_ino_res_used;
+
+		}
+	}
+}
+
+/*
+ * Wrap around mod_dquot to account for both user and group quotas.
+ */
+void
+xfs_trans_mod_dquot_byino(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		field,
+	long		delta)
+{
+	xfs_mount_t	*mp = tp->t_mountp;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) ||
+	    !XFS_IS_QUOTA_ON(mp) ||
+	    xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
+		return;
+
+	if (tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+
+	if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
+		(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+	if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
+		(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+	if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
+		(void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
+}
+
+STATIC struct xfs_dqtrx *
+xfs_trans_get_dqtrx(
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp)
+{
+	int			i;
+	struct xfs_dqtrx	*qa;
+
+	if (XFS_QM_ISUDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+	else if (XFS_QM_ISGDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
+	else if (XFS_QM_ISPDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
+	else
+		return NULL;
+
+	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+		if (qa[i].qt_dquot == NULL ||
+		    qa[i].qt_dquot == dqp)
+			return &qa[i];
+	}
+
+	return NULL;
+}
+
+/*
+ * Make the changes in the transaction structure.
+ * The moral equivalent to xfs_trans_mod_sb().
+ * We don't touch any fields in the dquot, so we don't care
+ * if it's locked or not (most of the time it won't be).
+ */
+void
+xfs_trans_mod_dquot(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp,
+	uint		field,
+	long		delta)
+{
+	xfs_dqtrx_t	*qtrx;
+
+	ASSERT(tp);
+	ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+	qtrx = NULL;
+
+	if (tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+	/*
+	 * Find either the first free slot or the slot that belongs
+	 * to this dquot.
+	 */
+	qtrx = xfs_trans_get_dqtrx(tp, dqp);
+	ASSERT(qtrx);
+	if (qtrx->qt_dquot == NULL)
+		qtrx->qt_dquot = dqp;
+
+	switch (field) {
+
+		/*
+		 * regular disk blk reservation
+		 */
+	      case XFS_TRANS_DQ_RES_BLKS:
+		qtrx->qt_blk_res += (ulong)delta;
+		break;
+
+		/*
+		 * inode reservation
+		 */
+	      case XFS_TRANS_DQ_RES_INOS:
+		qtrx->qt_ino_res += (ulong)delta;
+		break;
+
+		/*
+		 * disk blocks used.
+		 */
+	      case XFS_TRANS_DQ_BCOUNT:
+		if (qtrx->qt_blk_res && delta > 0) {
+			qtrx->qt_blk_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
+		}
+		qtrx->qt_bcount_delta += delta;
+		break;
+
+	      case XFS_TRANS_DQ_DELBCOUNT:
+		qtrx->qt_delbcnt_delta += delta;
+		break;
+
+		/*
+		 * Inode Count
+		 */
+	      case XFS_TRANS_DQ_ICOUNT:
+		if (qtrx->qt_ino_res && delta > 0) {
+			qtrx->qt_ino_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+		}
+		qtrx->qt_icount_delta += delta;
+		break;
+
+		/*
+		 * rtblk reservation
+		 */
+	      case XFS_TRANS_DQ_RES_RTBLKS:
+		qtrx->qt_rtblk_res += (ulong)delta;
+		break;
+
+		/*
+		 * rtblk count
+		 */
+	      case XFS_TRANS_DQ_RTBCOUNT:
+		if (qtrx->qt_rtblk_res && delta > 0) {
+			qtrx->qt_rtblk_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
+		}
+		qtrx->qt_rtbcount_delta += delta;
+		break;
+
+	      case XFS_TRANS_DQ_DELRTBCOUNT:
+		qtrx->qt_delrtb_delta += delta;
+		break;
+
+	      default:
+		ASSERT(0);
+	}
+	tp->t_flags |= XFS_TRANS_DQ_DIRTY;
+}
+
+
+/*
+ * Given an array of dqtrx structures, lock all the dquots associated and join
+ * them to the transaction, provided they have been modified.  We know that the
+ * highest number of dquots of one type - usr, grp and prj - involved in a
+ * transaction is 3 so we don't need to make this very generic.
+ */
+STATIC void
+xfs_trans_dqlockedjoin(
+	xfs_trans_t	*tp,
+	xfs_dqtrx_t	*q)
+{
+	ASSERT(q[0].qt_dquot != NULL);
+	if (q[1].qt_dquot == NULL) {
+		xfs_dqlock(q[0].qt_dquot);
+		xfs_trans_dqjoin(tp, q[0].qt_dquot);
+	} else {
+		ASSERT(XFS_QM_TRANS_MAXDQS == 2);
+		xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
+		xfs_trans_dqjoin(tp, q[0].qt_dquot);
+		xfs_trans_dqjoin(tp, q[1].qt_dquot);
+	}
+}
+
+
+/*
+ * Called by xfs_trans_commit() and similar in spirit to
+ * xfs_trans_apply_sb_deltas().
+ * Go thru all the dquots belonging to this transaction and modify the
+ * INCORE dquot to reflect the actual usages.
+ * Unreserve just the reservations done by this transaction.
+ * dquot is still left locked at exit.
+ */
+void
+xfs_trans_apply_dquot_deltas(
+	struct xfs_trans	*tp)
+{
+	int			i, j;
+	struct xfs_dquot	*dqp;
+	struct xfs_dqtrx	*qtrx, *qa;
+	struct xfs_disk_dquot	*d;
+	long			totalbdelta;
+	long			totalrtbdelta;
+
+	if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
+		return;
+
+	ASSERT(tp->t_dqinfo);
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		qa = tp->t_dqinfo->dqs[j];
+		if (qa[0].qt_dquot == NULL)
+			continue;
+
+		/*
+		 * Lock all of the dquots and join them to the transaction.
+		 */
+		xfs_trans_dqlockedjoin(tp, qa);
+
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			qtrx = &qa[i];
+			/*
+			 * The array of dquots is filled
+			 * sequentially, not sparsely.
+			 */
+			if ((dqp = qtrx->qt_dquot) == NULL)
+				break;
+
+			ASSERT(XFS_DQ_IS_LOCKED(dqp));
+			ASSERT(dqp->q_transp == tp);
+
+			/*
+			 * adjust the actual number of blocks used
+			 */
+			d = &dqp->q_core;
+
+			/*
+			 * The issue here is - sometimes we don't make a blkquota
+			 * reservation intentionally to be fair to users
+			 * (when the amount is small). On the other hand,
+			 * delayed allocs do make reservations, but that's
+			 * outside of a transaction, so we have no
+			 * idea how much was really reserved.
+			 * So, here we've accumulated delayed allocation blks and
+			 * non-delay blks. The assumption is that the
+			 * delayed ones are always reserved (outside of a
+			 * transaction), and the others may or may not have
+			 * quota reservations.
+			 */
+			totalbdelta = qtrx->qt_bcount_delta +
+				qtrx->qt_delbcnt_delta;
+			totalrtbdelta = qtrx->qt_rtbcount_delta +
+				qtrx->qt_delrtb_delta;
+#ifdef DEBUG
+			if (totalbdelta < 0)
+				ASSERT(be64_to_cpu(d->d_bcount) >=
+				       -totalbdelta);
+
+			if (totalrtbdelta < 0)
+				ASSERT(be64_to_cpu(d->d_rtbcount) >=
+				       -totalrtbdelta);
+
+			if (qtrx->qt_icount_delta < 0)
+				ASSERT(be64_to_cpu(d->d_icount) >=
+				       -qtrx->qt_icount_delta);
+#endif
+			if (totalbdelta)
+				be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
+
+			if (qtrx->qt_icount_delta)
+				be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
+
+			if (totalrtbdelta)
+				be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
+
+			/*
+			 * Get any default limits in use.
+			 * Start/reset the timer(s) if needed.
+			 */
+			if (d->d_id) {
+				xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
+				xfs_qm_adjust_dqtimers(tp->t_mountp, d);
+			}
+
+			dqp->dq_flags |= XFS_DQ_DIRTY;
+			/*
+			 * add this to the list of items to get logged
+			 */
+			xfs_trans_log_dquot(tp, dqp);
+			/*
+			 * Take off what's left of the original reservation.
+			 * In case of delayed allocations, there's no
+			 * reservation that a transaction structure knows of.
+			 */
+			if (qtrx->qt_blk_res != 0) {
+				if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
+					if (qtrx->qt_blk_res >
+					    qtrx->qt_blk_res_used)
+						dqp->q_res_bcount -= (xfs_qcnt_t)
+							(qtrx->qt_blk_res -
+							 qtrx->qt_blk_res_used);
+					else
+						dqp->q_res_bcount -= (xfs_qcnt_t)
+							(qtrx->qt_blk_res_used -
+							 qtrx->qt_blk_res);
+				}
+			} else {
+				/*
+				 * These blks were never reserved, either inside
+				 * a transaction or outside one (in a delayed
+				 * allocation). Also, this isn't always a
+				 * negative number since we sometimes
+				 * deliberately skip quota reservations.
+				 */
+				if (qtrx->qt_bcount_delta) {
+					dqp->q_res_bcount +=
+					      (xfs_qcnt_t)qtrx->qt_bcount_delta;
+				}
+			}
+			/*
+			 * Adjust the RT reservation.
+			 */
+			if (qtrx->qt_rtblk_res != 0) {
+				if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
+					if (qtrx->qt_rtblk_res >
+					    qtrx->qt_rtblk_res_used)
+					       dqp->q_res_rtbcount -= (xfs_qcnt_t)
+						       (qtrx->qt_rtblk_res -
+							qtrx->qt_rtblk_res_used);
+					else
+					       dqp->q_res_rtbcount -= (xfs_qcnt_t)
+						       (qtrx->qt_rtblk_res_used -
+							qtrx->qt_rtblk_res);
+				}
+			} else {
+				if (qtrx->qt_rtbcount_delta)
+					dqp->q_res_rtbcount +=
+					    (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
+			}
+
+			/*
+			 * Adjust the inode reservation.
+			 */
+			if (qtrx->qt_ino_res != 0) {
+				ASSERT(qtrx->qt_ino_res >=
+				       qtrx->qt_ino_res_used);
+				if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
+					dqp->q_res_icount -= (xfs_qcnt_t)
+						(qtrx->qt_ino_res -
+						 qtrx->qt_ino_res_used);
+			} else {
+				if (qtrx->qt_icount_delta)
+					dqp->q_res_icount +=
+					    (xfs_qcnt_t)qtrx->qt_icount_delta;
+			}
+
+			ASSERT(dqp->q_res_bcount >=
+				be64_to_cpu(dqp->q_core.d_bcount));
+			ASSERT(dqp->q_res_icount >=
+				be64_to_cpu(dqp->q_core.d_icount));
+			ASSERT(dqp->q_res_rtbcount >=
+				be64_to_cpu(dqp->q_core.d_rtbcount));
+		}
+	}
+}
+
+/*
+ * Release the reservations, and adjust the dquots accordingly.
+ * This is called only when the transaction is being aborted. If by
+ * any chance we have done dquot modifications incore (ie. deltas) already,
+ * we simply throw those away, since that's the expected behavior
+ * when a transaction is curtailed without a commit.
+ */
+void
+xfs_trans_unreserve_and_mod_dquots(
+	xfs_trans_t		*tp)
+{
+	int			i, j;
+	xfs_dquot_t		*dqp;
+	xfs_dqtrx_t		*qtrx, *qa;
+	bool                    locked;
+
+	if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
+		return;
+
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		qa = tp->t_dqinfo->dqs[j];
+
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			qtrx = &qa[i];
+			/*
+			 * We assume that the array of dquots is filled
+			 * sequentially, not sparsely.
+			 */
+			if ((dqp = qtrx->qt_dquot) == NULL)
+				break;
+			/*
+			 * Unreserve the original reservation. We don't care
+			 * about the number of blocks used field, or deltas.
+			 * Also we don't bother to zero the fields.
+			 */
+			locked = false;
+			if (qtrx->qt_blk_res) {
+				xfs_dqlock(dqp);
+				locked = true;
+				dqp->q_res_bcount -=
+					(xfs_qcnt_t)qtrx->qt_blk_res;
+			}
+			if (qtrx->qt_ino_res) {
+				if (!locked) {
+					xfs_dqlock(dqp);
+					locked = true;
+				}
+				dqp->q_res_icount -=
+					(xfs_qcnt_t)qtrx->qt_ino_res;
+			}
+
+			if (qtrx->qt_rtblk_res) {
+				if (!locked) {
+					xfs_dqlock(dqp);
+					locked = true;
+				}
+				dqp->q_res_rtbcount -=
+					(xfs_qcnt_t)qtrx->qt_rtblk_res;
+			}
+			if (locked)
+				xfs_dqunlock(dqp);
+
+		}
+	}
+}
+
+STATIC void
+xfs_quota_warn(
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*dqp,
+	int			type)
+{
+	/* no warnings for project quotas - we just return ENOSPC later */
+	if (dqp->dq_flags & XFS_DQ_PROJ)
+		return;
+	quota_send_warning(make_kqid(&init_user_ns,
+				     (dqp->dq_flags & XFS_DQ_USER) ?
+				     USRQUOTA : GRPQUOTA,
+				     be32_to_cpu(dqp->q_core.d_id)),
+			   mp->m_super->s_dev, type);
+}
+
+/*
+ * This reserves disk blocks and inodes against a dquot.
+ * Flags indicate if the dquot is to be locked here and also
+ * if the blk reservation is for RT or regular blocks.
+ * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
+ */
+STATIC int
+xfs_trans_dqresv(
+	xfs_trans_t	*tp,
+	xfs_mount_t	*mp,
+	xfs_dquot_t	*dqp,
+	long		nblks,
+	long		ninos,
+	uint		flags)
+{
+	xfs_qcnt_t	hardlimit;
+	xfs_qcnt_t	softlimit;
+	time_t		timer;
+	xfs_qwarncnt_t	warns;
+	xfs_qwarncnt_t	warnlimit;
+	xfs_qcnt_t	total_count;
+	xfs_qcnt_t	*resbcountp;
+	xfs_quotainfo_t	*q = mp->m_quotainfo;
+
+
+	xfs_dqlock(dqp);
+
+	if (flags & XFS_TRANS_DQ_RES_BLKS) {
+		hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+		if (!hardlimit)
+			hardlimit = q->qi_bhardlimit;
+		softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+		if (!softlimit)
+			softlimit = q->qi_bsoftlimit;
+		timer = be32_to_cpu(dqp->q_core.d_btimer);
+		warns = be16_to_cpu(dqp->q_core.d_bwarns);
+		warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
+		resbcountp = &dqp->q_res_bcount;
+	} else {
+		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
+		hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
+		if (!hardlimit)
+			hardlimit = q->qi_rtbhardlimit;
+		softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
+		if (!softlimit)
+			softlimit = q->qi_rtbsoftlimit;
+		timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+		warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+		warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
+		resbcountp = &dqp->q_res_rtbcount;
+	}
+
+	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
+	    dqp->q_core.d_id &&
+	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
+	     (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+	     (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
+		if (nblks > 0) {
+			/*
+			 * dquot is locked already. See if we'd go over the
+			 * hardlimit or exceed the timelimit if we allocate
+			 * nblks.
+			 */
+			total_count = *resbcountp + nblks;
+			if (hardlimit && total_count > hardlimit) {
+				xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
+				goto error_return;
+			}
+			if (softlimit && total_count > softlimit) {
+				if ((timer != 0 && get_seconds() > timer) ||
+				    (warns != 0 && warns >= warnlimit)) {
+					xfs_quota_warn(mp, dqp,
+						       QUOTA_NL_BSOFTLONGWARN);
+					goto error_return;
+				}
+
+				xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
+			}
+		}
+		if (ninos > 0) {
+			total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
+			timer = be32_to_cpu(dqp->q_core.d_itimer);
+			warns = be16_to_cpu(dqp->q_core.d_iwarns);
+			warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
+			hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+			if (!hardlimit)
+				hardlimit = q->qi_ihardlimit;
+			softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
+			if (!softlimit)
+				softlimit = q->qi_isoftlimit;
+
+			if (hardlimit && total_count > hardlimit) {
+				xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
+				goto error_return;
+			}
+			if (softlimit && total_count > softlimit) {
+				if  ((timer != 0 && get_seconds() > timer) ||
+				     (warns != 0 && warns >= warnlimit)) {
+					xfs_quota_warn(mp, dqp,
+						       QUOTA_NL_ISOFTLONGWARN);
+					goto error_return;
+				}
+				xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
+			}
+		}
+	}
+
+	/*
+	 * Change the reservation, but not the actual usage.
+	 * Note that q_res_bcount = q_core.d_bcount + resv
+	 */
+	(*resbcountp) += (xfs_qcnt_t)nblks;
+	if (ninos != 0)
+		dqp->q_res_icount += (xfs_qcnt_t)ninos;
+
+	/*
+	 * note the reservation amt in the trans struct too,
+	 * so that the transaction knows how much was reserved by
+	 * it against this particular dquot.
+	 * We don't do this when we are reserving for a delayed allocation,
+	 * because we don't have the luxury of a transaction envelope then.
+	 */
+	if (tp) {
+		ASSERT(tp->t_dqinfo);
+		ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+		if (nblks != 0)
+			xfs_trans_mod_dquot(tp, dqp,
+					    flags & XFS_QMOPT_RESBLK_MASK,
+					    nblks);
+		if (ninos != 0)
+			xfs_trans_mod_dquot(tp, dqp,
+					    XFS_TRANS_DQ_RES_INOS,
+					    ninos);
+	}
+	ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
+	ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
+	ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+
+	xfs_dqunlock(dqp);
+	return 0;
+
+error_return:
+	xfs_dqunlock(dqp);
+	if (flags & XFS_QMOPT_ENOSPC)
+		return -ENOSPC;
+	return -EDQUOT;
+}
+
+
+/*
+ * Given dquot(s), make disk block and/or inode reservations against them.
+ * The fact that this does the reservation against user, group and
+ * project quotas is important, because this follows a all-or-nothing
+ * approach.
+ *
+ * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ *	   XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT.  Used by pquota.
+ *	   XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
+ *	   XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
+ * dquots are unlocked on return, if they were not locked by caller.
+ */
+int
+xfs_trans_reserve_quota_bydquots(
+	struct xfs_trans	*tp,
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	struct xfs_dquot	*pdqp,
+	long			nblks,
+	long			ninos,
+	uint			flags)
+{
+	int		error;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return 0;
+
+	if (tp && tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+
+	ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+
+	if (udqp) {
+		error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
+					(flags & ~XFS_QMOPT_ENOSPC));
+		if (error)
+			return error;
+	}
+
+	if (gdqp) {
+		error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
+		if (error)
+			goto unwind_usr;
+	}
+
+	if (pdqp) {
+		error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
+		if (error)
+			goto unwind_grp;
+	}
+
+	/*
+	 * Didn't change anything critical, so, no need to log
+	 */
+	return 0;
+
+unwind_grp:
+	flags |= XFS_QMOPT_FORCE_RES;
+	if (gdqp)
+		xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
+unwind_usr:
+	flags |= XFS_QMOPT_FORCE_RES;
+	if (udqp)
+		xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+	return error;
+}
+
+
+/*
+ * Lock the dquot and change the reservation if we can.
+ * This doesn't change the actual usage, just the reservation.
+ * The inode sent in is locked.
+ */
+int
+xfs_trans_reserve_quota_nblks(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	long			nblks,
+	long			ninos,
+	uint			flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return 0;
+	if (XFS_IS_PQUOTA_ON(mp))
+		flags |= XFS_QMOPT_ENOSPC;
+
+	ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+				XFS_TRANS_DQ_RES_RTBLKS ||
+	       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+				XFS_TRANS_DQ_RES_BLKS);
+
+	/*
+	 * Reserve nblks against these dquots, with trans as the mediator.
+	 */
+	return xfs_trans_reserve_quota_bydquots(tp, mp,
+						ip->i_udquot, ip->i_gdquot,
+						ip->i_pdquot,
+						nblks, ninos, flags);
+}
+
+/*
+ * This routine is called to allocate a quotaoff log item.
+ */
+xfs_qoff_logitem_t *
+xfs_trans_get_qoff_item(
+	xfs_trans_t		*tp,
+	xfs_qoff_logitem_t	*startqoff,
+	uint			flags)
+{
+	xfs_qoff_logitem_t	*q;
+
+	ASSERT(tp != NULL);
+
+	q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
+	ASSERT(q != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &q->qql_item);
+	return q;
+}
+
+
+/*
+ * This is called to mark the quotaoff logitem as needing
+ * to be logged when the transaction is committed.  The logitem must
+ * already be associated with the given transaction.
+ */
+void
+xfs_trans_log_quotaoff_item(
+	xfs_trans_t		*tp,
+	xfs_qoff_logitem_t	*qlp)
+{
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
+
+STATIC void
+xfs_trans_alloc_dqinfo(
+	xfs_trans_t	*tp)
+{
+	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
+}
+
+void
+xfs_trans_free_dqinfo(
+	xfs_trans_t	*tp)
+{
+	if (!tp->t_dqinfo)
+		return;
+	kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
+	tp->t_dqinfo = NULL;
+}
diff --git a/kernel/fs/xfs/xfs_trans_extfree.c b/kernel/fs/xfs/xfs_trans_extfree.c
new file mode 100644
index 000000000..284397dd7
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_extfree.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_extfree_item.h"
+
+/*
+ * This routine is called to allocate an "extent free intention"
+ * log item that will hold nextents worth of extents.  The
+ * caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efi_log_item_t *
+xfs_trans_get_efi(xfs_trans_t	*tp,
+		  uint		nextents)
+{
+	xfs_efi_log_item_t	*efip;
+
+	ASSERT(tp != NULL);
+	ASSERT(nextents > 0);
+
+	efip = xfs_efi_init(tp->t_mountp, nextents);
+	ASSERT(efip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &efip->efi_item);
+	return efip;
+}
+
+/*
+ * This routine is called to indicate that the described
+ * extent is to be logged as needing to be freed.  It should
+ * be called once for each extent to be freed.
+ */
+void
+xfs_trans_log_efi_extent(xfs_trans_t		*tp,
+			 xfs_efi_log_item_t	*efip,
+			 xfs_fsblock_t		start_block,
+			 xfs_extlen_t		ext_len)
+{
+	uint			next_extent;
+	xfs_extent_t		*extp;
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+	ASSERT(next_extent < efip->efi_format.efi_nextents);
+	extp = &(efip->efi_format.efi_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+}
+
+
+/*
+ * This routine is called to allocate an "extent free done"
+ * log item that will hold nextents worth of extents.  The
+ * caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efd_log_item_t *
+xfs_trans_get_efd(xfs_trans_t		*tp,
+		  xfs_efi_log_item_t	*efip,
+		  uint			nextents)
+{
+	xfs_efd_log_item_t	*efdp;
+
+	ASSERT(tp != NULL);
+	ASSERT(nextents > 0);
+
+	efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
+	ASSERT(efdp != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &efdp->efd_item);
+	return efdp;
+}
+
+/*
+ * This routine is called to indicate that the described
+ * extent is to be logged as having been freed.  It should
+ * be called once for each extent freed.
+ */
+void
+xfs_trans_log_efd_extent(xfs_trans_t		*tp,
+			 xfs_efd_log_item_t	*efdp,
+			 xfs_fsblock_t		start_block,
+			 xfs_extlen_t		ext_len)
+{
+	uint			next_extent;
+	xfs_extent_t		*extp;
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	next_extent = efdp->efd_next_extent;
+	ASSERT(next_extent < efdp->efd_format.efd_nextents);
+	extp = &(efdp->efd_format.efd_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+	efdp->efd_next_extent++;
+}
diff --git a/kernel/fs/xfs/xfs_trans_inode.c b/kernel/fs/xfs/xfs_trans_inode.c
new file mode 100644
index 000000000..17280cd71
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_inode.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+
+/*
+ * Add a locked inode to the transaction.
+ *
+ * The inode must be locked, and it cannot be associated with any transaction.
+ * If lock_flags is non-zero the inode will be unlocked on transaction commit.
+ */
+void
+xfs_trans_ijoin(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	uint			lock_flags)
+{
+	xfs_inode_log_item_t	*iip;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	if (ip->i_itemp == NULL)
+		xfs_inode_item_init(ip, ip->i_mount);
+	iip = ip->i_itemp;
+
+	ASSERT(iip->ili_lock_flags == 0);
+	iip->ili_lock_flags = lock_flags;
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &iip->ili_item);
+}
+
+/*
+ * Transactional inode timestamp update. Requires the inode to be locked and
+ * joined to the transaction supplied. Relies on the transaction subsystem to
+ * track dirty state and update/writeback the inode accordingly.
+ */
+void
+xfs_trans_ichgtime(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct timespec		tv;
+
+	ASSERT(tp);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	tv = current_fs_time(inode->i_sb);
+
+	if ((flags & XFS_ICHGTIME_MOD) &&
+	    !timespec_equal(&inode->i_mtime, &tv)) {
+		inode->i_mtime = tv;
+		ip->i_d.di_mtime.t_sec = tv.tv_sec;
+		ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
+	}
+	if ((flags & XFS_ICHGTIME_CHG) &&
+	    !timespec_equal(&inode->i_ctime, &tv)) {
+		inode->i_ctime = tv;
+		ip->i_d.di_ctime.t_sec = tv.tv_sec;
+		ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
+	}
+}
+
+/*
+ * This is called to mark the fields indicated in fieldmask as needing
+ * to be logged when the transaction is committed.  The inode must
+ * already be associated with the given transaction.
+ *
+ * The values for fieldmask are defined in xfs_inode_item.h.  We always
+ * log all of the core inode if any of it has changed, and we always log
+ * all of the inline data/extents/b-tree root if any of them has changed.
+ */
+void
+xfs_trans_log_inode(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		flags)
+{
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	/*
+	 * First time we log the inode in a transaction, bump the inode change
+	 * counter if it is configured for this to occur. We don't use
+	 * inode_inc_version() because there is no need for extra locking around
+	 * i_version as we already hold the inode locked exclusively for
+	 * metadata modification.
+	 */
+	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+	    IS_I_VERSION(VFS_I(ip))) {
+		ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
+		flags |= XFS_ILOG_CORE;
+	}
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * Always OR in the bits from the ili_last_fields field.
+	 * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
+	 * routines in the eventual clearing of the ili_fields bits.
+	 * See the big comment in xfs_iflush() for an explanation of
+	 * this coordination mechanism.
+	 */
+	flags |= ip->i_itemp->ili_last_fields;
+	ip->i_itemp->ili_fields |= flags;
+}
diff --git a/kernel/fs/xfs/xfs_trans_priv.h b/kernel/fs/xfs/xfs_trans_priv.h
new file mode 100644
index 000000000..bd1281862
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_priv.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_TRANS_PRIV_H__
+#define	__XFS_TRANS_PRIV_H__
+
+struct xfs_log_item;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
+
+
+void	xfs_trans_init(struct xfs_mount *);
+void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
+void	xfs_trans_del_item(struct xfs_log_item *);
+void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+				int flags);
+void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
+
+void	xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+				xfs_lsn_t commit_lsn, int aborted);
+/*
+ * AIL traversal cursor.
+ *
+ * Rather than using a generation number for detecting changes in the ail, use
+ * a cursor that is protected by the ail lock. The aild cursor exists in the
+ * struct xfs_ail, but other traversals can declare it on the stack and link it
+ * to the ail list.
+ *
+ * When an object is deleted from or moved int the AIL, the cursor list is
+ * searched to see if the object is a designated cursor item. If it is, it is
+ * deleted from the cursor so that the next time the cursor is used traversal
+ * will return to the start.
+ *
+ * This means a traversal colliding with a removal will cause a restart of the
+ * list scan, rather than any insertion or deletion anywhere in the list. The
+ * low bit of the item pointer is set if the cursor has been invalidated so
+ * that we can tell the difference between invalidation and reaching the end
+ * of the list to trigger traversal restarts.
+ */
+struct xfs_ail_cursor {
+	struct list_head	list;
+	struct xfs_log_item	*item;
+};
+
+/*
+ * Private AIL structures.
+ *
+ * Eventually we need to drive the locking in here as well.
+ */
+struct xfs_ail {
+	struct xfs_mount	*xa_mount;
+	struct task_struct	*xa_task;
+	struct list_head	xa_ail;
+	xfs_lsn_t		xa_target;
+	xfs_lsn_t		xa_target_prev;
+	struct list_head	xa_cursors;
+	spinlock_t		xa_lock;
+	xfs_lsn_t		xa_last_pushed_lsn;
+	int			xa_log_flush;
+	struct list_head	xa_buf_list;
+	wait_queue_head_t	xa_empty;
+};
+
+/*
+ * From xfs_trans_ail.c
+ */
+void	xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
+				struct xfs_ail_cursor *cur,
+				struct xfs_log_item **log_items, int nr_items,
+				xfs_lsn_t lsn) __releases(ailp->xa_lock);
+/*
+ * Return a pointer to the first item in the AIL.  If the AIL is empty, then
+ * return NULL.
+ */
+static inline struct xfs_log_item *
+xfs_ail_min(
+	struct xfs_ail  *ailp)
+{
+	return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
+					li_ail);
+}
+
+static inline void
+xfs_trans_ail_update(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn) __releases(ailp->xa_lock)
+{
+	xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
+}
+
+void	xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+				struct xfs_log_item **log_items, int nr_items,
+				int shutdown_type)
+				__releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_delete(
+	struct xfs_ail	*ailp,
+	xfs_log_item_t	*lip,
+	int		shutdown_type) __releases(ailp->xa_lock)
+{
+	xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
+}
+
+void			xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
+void			xfs_ail_push_all(struct xfs_ail *);
+void			xfs_ail_push_all_sync(struct xfs_ail *);
+struct xfs_log_item	*xfs_ail_min(struct xfs_ail  *ailp);
+xfs_lsn_t		xfs_ail_min_lsn(struct xfs_ail *ailp);
+
+struct xfs_log_item *	xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
+					struct xfs_ail_cursor *cur,
+					xfs_lsn_t lsn);
+struct xfs_log_item *	xfs_trans_ail_cursor_last(struct xfs_ail *ailp,
+					struct xfs_ail_cursor *cur,
+					xfs_lsn_t lsn);
+struct xfs_log_item *	xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
+					struct xfs_ail_cursor *cur);
+void			xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur);
+
+#if BITS_PER_LONG != 64
+static inline void
+xfs_trans_ail_copy_lsn(
+	struct xfs_ail	*ailp,
+	xfs_lsn_t	*dst,
+	xfs_lsn_t	*src)
+{
+	ASSERT(sizeof(xfs_lsn_t) == 8);	/* don't lock if it shrinks */
+	spin_lock(&ailp->xa_lock);
+	*dst = *src;
+	spin_unlock(&ailp->xa_lock);
+}
+#else
+static inline void
+xfs_trans_ail_copy_lsn(
+	struct xfs_ail	*ailp,
+	xfs_lsn_t	*dst,
+	xfs_lsn_t	*src)
+{
+	ASSERT(sizeof(xfs_lsn_t) == 8);
+	*dst = *src;
+}
+#endif
+#endif	/* __XFS_TRANS_PRIV_H__ */
diff --git a/kernel/fs/xfs/xfs_xattr.c b/kernel/fs/xfs/xfs_xattr.c
new file mode 100644
index 000000000..c03681518
--- /dev/null
+++ b/kernel/fs/xfs/xfs_xattr.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2008 Christoph Hellwig.
+ * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_acl.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+
+static int
+xfs_xattr_get(struct dentry *dentry, const char *name,
+		void *value, size_t size, int xflags)
+{
+	struct xfs_inode *ip = XFS_I(d_inode(dentry));
+	int error, asize = size;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (!size) {
+		xflags |= ATTR_KERNOVAL;
+		value = NULL;
+	}
+
+	error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+	if (error)
+		return error;
+	return asize;
+}
+
+static int
+xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags, int xflags)
+{
+	struct xfs_inode *ip = XFS_I(d_inode(dentry));
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (flags & XATTR_CREATE)
+		xflags |= ATTR_CREATE;
+	if (flags & XATTR_REPLACE)
+		xflags |= ATTR_REPLACE;
+
+	if (!value)
+		return xfs_attr_remove(ip, (unsigned char *)name, xflags);
+	return xfs_attr_set(ip, (unsigned char *)name,
+				(void *)value, size, xflags);
+}
+
+static const struct xattr_handler xfs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.flags	= 0, /* no flags implies user namespace */
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.flags	= ATTR_ROOT,
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.flags	= ATTR_SECURE,
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
+};
+
+const struct xattr_handler *xfs_xattr_handlers[] = {
+	&xfs_xattr_user_handler,
+	&xfs_xattr_trusted_handler,
+	&xfs_xattr_security_handler,
+#ifdef CONFIG_XFS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	NULL
+};
+
+static unsigned int xfs_xattr_prefix_len(int flags)
+{
+	if (flags & XFS_ATTR_SECURE)
+		return sizeof("security");
+	else if (flags & XFS_ATTR_ROOT)
+		return sizeof("trusted");
+	else
+		return sizeof("user");
+}
+
+static const char *xfs_xattr_prefix(int flags)
+{
+	if (flags & XFS_ATTR_SECURE)
+		return xfs_xattr_security_handler.prefix;
+	else if (flags & XFS_ATTR_ROOT)
+		return xfs_xattr_trusted_handler.prefix;
+	else
+		return xfs_xattr_user_handler.prefix;
+}
+
+static int
+xfs_xattr_put_listent(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
+{
+	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
+	char *offset;
+	int arraytop;
+
+	ASSERT(context->count >= 0);
+
+	/*
+	 * Only show root namespace entries if we are actually allowed to
+	 * see them.
+	 */
+	if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
+		return 0;
+
+	arraytop = context->count + prefix_len + namelen + 1;
+	if (arraytop > context->firstu) {
+		context->count = -1;	/* insufficient space */
+		return 1;
+	}
+	offset = (char *)context->alist + context->count;
+	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+	offset += prefix_len;
+	strncpy(offset, (char *)name, namelen);			/* real name */
+	offset += namelen;
+	*offset = '\0';
+	context->count += prefix_len + namelen + 1;
+	return 0;
+}
+
+static int
+xfs_xattr_put_listent_sizes(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
+{
+	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
+	return 0;
+}
+
+static int
+list_one_attr(const char *name, const size_t len, void *data,
+		size_t size, ssize_t *result)
+{
+	char *p = data + *result;
+
+	*result += len;
+	if (!size)
+		return 0;
+	if (*result > size)
+		return -ERANGE;
+
+	strcpy(p, name);
+	return 0;
+}
+
+ssize_t
+xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+	struct xfs_attr_list_context context;
+	struct attrlist_cursor_kern cursor = { 0 };
+	struct inode		*inode = d_inode(dentry);
+	int			error;
+
+	/*
+	 * First read the regular on-disk attributes.
+	 */
+	memset(&context, 0, sizeof(context));
+	context.dp = XFS_I(inode);
+	context.cursor = &cursor;
+	context.resynch = 1;
+	context.alist = data;
+	context.bufsize = size;
+	context.firstu = context.bufsize;
+
+	if (size)
+		context.put_listent = xfs_xattr_put_listent;
+	else
+		context.put_listent = xfs_xattr_put_listent_sizes;
+
+	xfs_attr_list_int(&context);
+	if (context.count < 0)
+		return -ERANGE;
+
+	/*
+	 * Then add the two synthetic ACL attributes.
+	 */
+	if (posix_acl_access_exists(inode)) {
+		error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
+				strlen(POSIX_ACL_XATTR_ACCESS) + 1,
+				data, size, &context.count);
+		if (error)
+			return error;
+	}
+
+	if (posix_acl_default_exists(inode)) {
+		error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
+				strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
+				data, size, &context.count);
+		if (error)
+			return error;
+	}
+
+	return context.count;
+}
-- 
cgit 1.2.3-korg